From 849369d6c66d3054688672f97d31fceb8e8230fb Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 25 Dec 2015 04:40:36 +0000
Subject: initial_commit

---
 fs/9p/Kconfig | 34 +
 fs/9p/Makefile | 17 +
 fs/9p/acl.c | 407 +
 fs/9p/acl.h | 49 +
 fs/9p/cache.c | 415 +
 fs/9p/cache.h | 139 +
 fs/9p/fid.c | 309 +
 fs/9p/fid.h | 50 +
 fs/9p/v9fs.c | 624 ++
 fs/9p/v9fs.h | 227 +
 fs/9p/v9fs_vfs.h | 87 +
 fs/9p/vfs_addr.c | 353 +
 fs/9p/vfs_dentry.c | 147 +
 fs/9p/vfs_dir.c | 318 +
 fs/9p/vfs_file.c | 771 ++
 fs/9p/vfs_inode.c | 1487 ++++
 fs/9p/vfs_inode_dotl.c | 1003 +++
 fs/9p/vfs_super.c | 368 +
 fs/9p/xattr.c | 172 +
 fs/9p/xattr.h | 33 +
 fs/9p/xattr_user.c | 80 +
 fs/Kconfig | 277 +
 fs/Kconfig.binfmt | 163 +
 fs/Makefile | 129 +
 fs/adfs/Kconfig | 27 +
 fs/adfs/Makefile | 7 +
 fs/adfs/adfs.h | 204 +
 fs/adfs/dir.c | 296 +
 fs/adfs/dir_f.c | 486 ++
 fs/adfs/dir_f.h | 65 +
 fs/adfs/dir_fplus.c | 265 +
 fs/adfs/dir_fplus.h | 45 +
 fs/adfs/file.c | 37 +
 fs/adfs/inode.c | 366 +
 fs/adfs/map.c | 290 +
 fs/adfs/super.c | 543 ++
 fs/affs/Changes | 343 +
 fs/affs/Kconfig | 21 +
 fs/affs/Makefile | 9 +
 fs/affs/affs.h | 305 +
 fs/affs/amigaffs.c | 515 ++
 fs/affs/bitmap.c | 390 +
 fs/affs/dir.c | 156 +
 fs/affs/file.c | 936 +++
 fs/affs/inode.c | 413 +
 fs/affs/namei.c | 458 ++
 fs/affs/super.c | 626 ++
 fs/affs/symlink.c | 81 +
 fs/afs/Kconfig | 30 +
 fs/afs/Makefile | 32 +
 fs/afs/afs.h | 177 +
 fs/afs/afs_cm.h | 33 +
 fs/afs/afs_fs.h | 56 +
 fs/afs/afs_vl.h | 84 +
 fs/afs/cache.c | 402 +
 fs/afs/callback.c | 477 ++
 fs/afs/cell.c | 460 ++
 fs/afs/cmservice.c | 585 ++
 fs/afs/dir.c | 1184 +++
 fs/afs/file.c | 378 +
 fs/afs/flock.c | 588 ++
 fs/afs/fsclient.c | 1905 +++++
 fs/afs/inode.c | 498 ++
 fs/afs/internal.h | 890 +++
 fs/afs/main.c | 184 +
 fs/afs/misc.c | 75 +
 fs/afs/mntpt.c | 284 +
 fs/afs/netdevices.c | 68 +
 fs/afs/proc.c | 744 ++
 fs/afs/rxrpc.c | 859 ++
 fs/afs/security.c | 363 +
 fs/afs/server.c | 328 +
 fs/afs/super.c | 550 +
 fs/afs/vlclient.c | 219 +
 fs/afs/vlocation.c | 726 ++
 fs/afs/vnode.c | 1025 +++
 fs/afs/volume.c | 401 +
 fs/afs/write.c | 754 ++
 fs/aio.c | 1795 +++++
 fs/anon_inodes.c | 244 +
 fs/attr.c | 252 +
 fs/autofs4/Kconfig | 20 +
 fs/autofs4/Makefile | 7 +
 fs/autofs4/autofs_i.h | 350 +
 fs/autofs4/dev-ioctl.c | 762 ++
 fs/autofs4/expire.c | 588 +
 fs/autofs4/init.c | 51 +
 fs/autofs4/inode.c | 353 +
 fs/autofs4/root.c | 895 +++
 fs/autofs4/symlink.c | 24 +
 fs/autofs4/waitq.c | 552 +
 fs/bad_inode.c | 360 +
 fs/befs/ChangeLog | 417 +
 fs/befs/Kconfig | 26 +
 fs/befs/Makefile | 7 +
 fs/befs/TODO | 14 +
 fs/befs/befs.h | 156 +
 fs/befs/befs_fs_types.h | 255 +
 fs/befs/btree.c | 787 ++
 fs/befs/btree.h | 13 +
 fs/befs/datastream.c | 526 ++
 fs/befs/datastream.h | 19 +
 fs/befs/debug.c | 282 +
 fs/befs/endian.h | 125 +
 fs/befs/inode.c | 52 +
 fs/befs/inode.h | 8 +
 fs/befs/io.c | 83 +
 fs/befs/io.h | 9 +
 fs/befs/linuxvfs.c | 979 +++
 fs/befs/super.c | 112 +
 fs/befs/super.h | 8 +
 fs/bfs/Kconfig | 19 +
 fs/bfs/Makefile | 7 +
 fs/bfs/bfs.h | 59 +
 fs/bfs/dir.c | 374 +
 fs/bfs/file.c | 194 +
 fs/bfs/inode.c | 496 ++
 fs/binfmt_aout.c | 467 ++
 fs/binfmt_elf.c | 2092 +++++
 fs/binfmt_elf_fdpic.c | 1875 +++++
 fs/binfmt_em86.c | 113 +
 fs/binfmt_flat.c | 960 +++
 fs/binfmt_misc.c | 746 +
 fs/binfmt_script.c | 118 +
 fs/binfmt_som.c | 304 +
 fs/bio-integrity.c | 807 ++
 fs/bio.c | 1692 ++++
 fs/block_dev.c | 1671 ++++
 fs/btrfs/Kconfig | 33 +
 fs/btrfs/Makefile | 10 +
 fs/btrfs/acl.c | 335 +
 fs/btrfs/async-thread.c | 718 ++
 fs/btrfs/async-thread.h | 119 +
 fs/btrfs/btrfs_inode.h | 187 +
 fs/btrfs/compat.h | 7 +
 fs/btrfs/compression.c | 1029 +++
 fs/btrfs/compression.h | 83 +
 fs/btrfs/ctree.c | 4402
+++++++++++ fs/btrfs/ctree.h | 2683 +++++++ fs/btrfs/delayed-inode.c | 1773 +++++ fs/btrfs/delayed-inode.h | 145 + fs/btrfs/delayed-ref.c | 711 ++ fs/btrfs/delayed-ref.h | 205 + fs/btrfs/dir-item.c | 453 ++ fs/btrfs/disk-io.c | 3096 ++++++++ fs/btrfs/disk-io.h | 97 + fs/btrfs/export.c | 317 + fs/btrfs/export.h | 19 + fs/btrfs/extent-tree.c | 7347 +++++++++++++++++ fs/btrfs/extent_io.c | 3890 +++++++++ fs/btrfs/extent_io.h | 300 + fs/btrfs/extent_map.c | 418 + fs/btrfs/extent_map.h | 66 + fs/btrfs/file-item.c | 869 ++ fs/btrfs/file.c | 1683 ++++ fs/btrfs/free-space-cache.c | 2693 +++++++ fs/btrfs/free-space-cache.h | 113 + fs/btrfs/hash.h | 27 + fs/btrfs/inode-item.c | 234 + fs/btrfs/inode-map.c | 545 ++ fs/btrfs/inode-map.h | 13 + fs/btrfs/inode.c | 7459 ++++++++++++++++++ fs/btrfs/ioctl.c | 2914 +++++++ fs/btrfs/ioctl.h | 251 + fs/btrfs/locking.c | 208 + fs/btrfs/locking.h | 29 + fs/btrfs/lzo.c | 427 + fs/btrfs/ordered-data.c | 983 +++ fs/btrfs/ordered-data.h | 179 + fs/btrfs/orphan.c | 91 + fs/btrfs/print-tree.c | 338 + fs/btrfs/print-tree.h | 23 + fs/btrfs/ref-cache.c | 68 + fs/btrfs/ref-cache.h | 52 + fs/btrfs/relocation.c | 4413 +++++++++++ fs/btrfs/root-tree.c | 450 ++ fs/btrfs/scrub.c | 1395 ++++ fs/btrfs/struct-funcs.c | 139 + fs/btrfs/super.c | 1305 +++ fs/btrfs/sysfs.c | 46 + fs/btrfs/transaction.c | 1463 ++++ fs/btrfs/transaction.h | 117 + fs/btrfs/tree-defrag.c | 145 + fs/btrfs/tree-log.c | 3339 ++++++++ fs/btrfs/tree-log.h | 52 + fs/btrfs/version.h | 4 + fs/btrfs/volumes.c | 3718 +++++++++ fs/btrfs/volumes.h | 218 + fs/btrfs/xattr.c | 395 + fs/btrfs/xattr.h | 43 + fs/btrfs/zlib.c | 399 + fs/buffer.c | 3326 ++++++++ fs/cachefiles/Kconfig | 39 + fs/cachefiles/Makefile | 18 + fs/cachefiles/bind.c | 283 + fs/cachefiles/daemon.c | 752 ++ fs/cachefiles/interface.c | 470 ++ fs/cachefiles/internal.h | 354 + fs/cachefiles/key.c | 159 + fs/cachefiles/main.c | 106 + fs/cachefiles/namei.c | 988 +++ fs/cachefiles/proc.c | 134 + fs/cachefiles/rdwr.c | 984 +++ fs/cachefiles/security.c | 120 + fs/cachefiles/xattr.c | 292 + fs/ceph/Kconfig | 18 + fs/ceph/Makefile | 11 + fs/ceph/addr.c | 1181 +++ fs/ceph/caps.c | 3087 ++++++++ fs/ceph/ceph_frag.c | 22 + fs/ceph/debugfs.c | 273 + fs/ceph/dir.c | 1275 +++ fs/ceph/export.c | 249 + fs/ceph/file.c | 828 ++ fs/ceph/inode.c | 1842 +++++ fs/ceph/ioctl.c | 255 + fs/ceph/ioctl.h | 44 + fs/ceph/locks.c | 286 + fs/ceph/mds_client.c | 3454 ++++++++ fs/ceph/mds_client.h | 379 + fs/ceph/mdsmap.c | 179 + fs/ceph/snap.c | 916 +++ fs/ceph/strings.c | 117 + fs/ceph/super.c | 921 +++ fs/ceph/super.h | 833 ++ fs/ceph/xattr.c | 854 ++ fs/char_dev.c | 575 ++ fs/cifs/AUTHORS | 55 + fs/cifs/CHANGES | 1065 +++ fs/cifs/Kconfig | 161 + fs/cifs/Makefile | 17 + fs/cifs/README | 748 ++ fs/cifs/TODO | 129 + fs/cifs/asn1.c | 666 ++ fs/cifs/cache.c | 333 + fs/cifs/cifs_debug.c | 782 ++ fs/cifs/cifs_debug.h | 96 + fs/cifs/cifs_dfs_ref.c | 370 + fs/cifs/cifs_fs_sb.h | 65 + fs/cifs/cifs_spnego.c | 175 + fs/cifs/cifs_spnego.h | 47 + fs/cifs/cifs_unicode.c | 332 + fs/cifs/cifs_unicode.h | 383 + fs/cifs/cifs_uniupr.h | 253 + fs/cifs/cifsacl.c | 1142 +++ fs/cifs/cifsacl.h | 103 + fs/cifs/cifsencrypt.c | 762 ++ fs/cifs/cifsfs.c | 1215 +++ fs/cifs/cifsfs.h | 133 + fs/cifs/cifsglob.h | 951 +++ fs/cifs/cifspdu.h | 2641 +++++++ fs/cifs/cifsproto.h | 460 ++ fs/cifs/cifssmb.c | 6088 ++++++++++++++ fs/cifs/connect.c | 3719 +++++++++ fs/cifs/dir.c | 742 ++ fs/cifs/dns_resolve.c | 98 + fs/cifs/dns_resolve.h | 30 + fs/cifs/export.c | 67 + fs/cifs/file.c | 2471 ++++++ fs/cifs/fscache.c | 235 + 
fs/cifs/fscache.h | 136 + fs/cifs/inode.c | 2272 ++++++ fs/cifs/ioctl.c | 102 + fs/cifs/link.c | 593 ++ fs/cifs/misc.c | 685 ++ fs/cifs/netmisc.c | 1008 +++ fs/cifs/nterr.c | 687 ++ fs/cifs/nterr.h | 561 ++ fs/cifs/ntlmssp.h | 128 + fs/cifs/readdir.c | 887 +++ fs/cifs/rfc1002pdu.h | 74 + fs/cifs/sess.c | 952 +++ fs/cifs/smbencrypt.c | 401 + fs/cifs/smberr.h | 184 + fs/cifs/smbfsctl.h | 84 + fs/cifs/transport.c | 931 +++ fs/cifs/xattr.c | 420 + fs/coda/Kconfig | 21 + fs/coda/Makefile | 12 + fs/coda/cache.c | 122 + fs/coda/cnode.c | 174 + fs/coda/coda_cache.h | 22 + fs/coda/coda_fs_i.h | 58 + fs/coda/coda_int.h | 20 + fs/coda/coda_linux.c | 192 + fs/coda/coda_linux.h | 101 + fs/coda/dir.c | 650 ++ fs/coda/file.c | 234 + fs/coda/inode.c | 329 + fs/coda/pioctl.c | 90 + fs/coda/psdev.c | 430 + fs/coda/symlink.c | 50 + fs/coda/sysctl.c | 73 + fs/coda/upcall.c | 883 +++ fs/compat.c | 2061 +++++ fs/compat_binfmt_elf.c | 133 + fs/compat_ioctl.c | 1656 ++++ fs/configfs/Kconfig | 11 + fs/configfs/Makefile | 7 + fs/configfs/configfs_internal.h | 161 + fs/configfs/dir.c | 1766 +++++ fs/configfs/file.c | 344 + fs/configfs/inode.c | 297 + fs/configfs/item.c | 216 + fs/configfs/mount.c | 187 + fs/configfs/symlink.c | 320 + fs/cramfs/Kconfig | 19 + fs/cramfs/Makefile | 7 + fs/cramfs/README | 168 + fs/cramfs/inode.c | 601 ++ fs/cramfs/uncompress.c | 77 + fs/dcache.c | 3124 ++++++++ fs/dcookies.c | 345 + fs/debugfs/Makefile | 4 + fs/debugfs/file.c | 527 ++ fs/debugfs/inode.c | 544 ++ fs/devpts/Makefile | 7 + fs/devpts/inode.c | 572 ++ fs/direct-io.c | 1256 +++ fs/dlm/Kconfig | 16 + fs/dlm/Makefile | 21 + fs/dlm/ast.c | 346 + fs/dlm/ast.h | 31 + fs/dlm/config.c | 1010 +++ fs/dlm/config.h | 47 + fs/dlm/debug_fs.c | 731 ++ fs/dlm/dir.c | 441 ++ fs/dlm/dir.h | 30 + fs/dlm/dlm_internal.h | 616 ++ fs/dlm/lock.c | 5115 ++++++++++++ fs/dlm/lock.h | 70 + fs/dlm/lockspace.c | 844 ++ fs/dlm/lockspace.h | 26 + fs/dlm/lowcomms.c | 1572 ++++ fs/dlm/lowcomms.h | 25 + fs/dlm/lvb_table.h | 18 + fs/dlm/main.c | 95 + fs/dlm/member.c | 390 + fs/dlm/member.h | 25 + fs/dlm/memory.c | 92 + fs/dlm/memory.h | 27 + fs/dlm/midcomms.c | 137 + fs/dlm/midcomms.h | 21 + fs/dlm/netlink.c | 139 + fs/dlm/plock.c | 503 ++ fs/dlm/rcom.c | 511 ++ fs/dlm/rcom.h | 25 + fs/dlm/recover.c | 789 ++ fs/dlm/recover.h | 34 + fs/dlm/recoverd.c | 323 + fs/dlm/recoverd.h | 24 + fs/dlm/requestqueue.c | 188 + fs/dlm/requestqueue.h | 22 + fs/dlm/user.c | 1011 +++ fs/dlm/user.h | 19 + fs/dlm/util.c | 154 + fs/dlm/util.h | 22 + fs/drop_caches.c | 67 + fs/ecryptfs/Kconfig | 14 + fs/ecryptfs/Makefile | 7 + fs/ecryptfs/crypto.c | 2238 ++++++ fs/ecryptfs/debug.c | 121 + fs/ecryptfs/dentry.c | 105 + fs/ecryptfs/ecryptfs_kernel.h | 771 ++ fs/ecryptfs/file.c | 379 + fs/ecryptfs/inode.c | 1228 +++ fs/ecryptfs/keystore.c | 2530 ++++++ fs/ecryptfs/kthread.c | 197 + fs/ecryptfs/main.c | 867 ++ fs/ecryptfs/messaging.c | 578 ++ fs/ecryptfs/miscdev.c | 547 ++ fs/ecryptfs/mmap.c | 566 ++ fs/ecryptfs/read_write.c | 357 + fs/ecryptfs/super.c | 181 + fs/efs/Kconfig | 14 + fs/efs/Makefile | 7 + fs/efs/dir.c | 110 + fs/efs/efs.h | 140 + fs/efs/file.c | 60 + fs/efs/inode.c | 313 + fs/efs/namei.c | 118 + fs/efs/super.c | 359 + fs/efs/symlink.c | 54 + fs/eventfd.c | 439 ++ fs/eventpoll.c | 1818 +++++ fs/exec.c | 2247 ++++++ fs/exofs/BUGS | 3 + fs/exofs/Kbuild | 16 + fs/exofs/Kconfig | 13 + fs/exofs/common.h | 262 + fs/exofs/dir.c | 673 ++ fs/exofs/exofs.h | 308 + fs/exofs/file.c | 77 + fs/exofs/inode.c | 1374 ++++ fs/exofs/ios.c | 803 ++ fs/exofs/namei.c | 336 + fs/exofs/pnfs.h | 45 
+ fs/exofs/super.c | 1001 +++ fs/exofs/symlink.c | 55 + fs/exportfs/Makefile | 6 + fs/exportfs/expfs.c | 497 ++ fs/ext2/Kconfig | 55 + fs/ext2/Makefile | 13 + fs/ext2/acl.c | 445 ++ fs/ext2/acl.h | 78 + fs/ext2/balloc.c | 1555 ++++ fs/ext2/dir.c | 733 ++ fs/ext2/ext2.h | 182 + fs/ext2/file.c | 107 + fs/ext2/ialloc.c | 674 ++ fs/ext2/inode.c | 1552 ++++ fs/ext2/ioctl.c | 178 + fs/ext2/namei.c | 427 + fs/ext2/super.c | 1525 ++++ fs/ext2/symlink.c | 54 + fs/ext2/xattr.c | 1031 +++ fs/ext2/xattr.h | 127 + fs/ext2/xattr_security.c | 76 + fs/ext2/xattr_trusted.c | 58 + fs/ext2/xattr_user.c | 62 + fs/ext2/xip.c | 92 + fs/ext2/xip.h | 26 + fs/ext3/Kconfig | 89 + fs/ext3/Makefile | 12 + fs/ext3/acl.c | 481 ++ fs/ext3/acl.h | 77 + fs/ext3/balloc.c | 2156 +++++ fs/ext3/bitmap.c | 32 + fs/ext3/dir.c | 519 ++ fs/ext3/ext3_jbd.c | 59 + fs/ext3/file.c | 85 + fs/ext3/fsync.c | 95 + fs/ext3/hash.c | 208 + fs/ext3/ialloc.c | 768 ++ fs/ext3/inode.c | 3516 +++++++++ fs/ext3/ioctl.c | 352 + fs/ext3/namei.c | 2550 ++++++ fs/ext3/namei.h | 8 + fs/ext3/resize.c | 1120 +++ fs/ext3/super.c | 3084 ++++++++ fs/ext3/symlink.c | 56 + fs/ext3/xattr.c | 1336 ++++ fs/ext3/xattr.h | 138 + fs/ext3/xattr_security.c | 78 + fs/ext3/xattr_trusted.c | 59 + fs/ext3/xattr_user.c | 62 + fs/ext4/Kconfig | 85 + fs/ext4/Makefile | 14 + fs/ext4/acl.c | 479 ++ fs/ext4/acl.h | 77 + fs/ext4/balloc.c | 622 ++ fs/ext4/bitmap.c | 31 + fs/ext4/block_validity.c | 248 + fs/ext4/dir.c | 542 ++ fs/ext4/ext4.h | 2236 ++++++ fs/ext4/ext4_extents.h | 294 + fs/ext4/ext4_jbd2.c | 152 + fs/ext4/ext4_jbd2.h | 327 + fs/ext4/extents.c | 4364 +++++++++++ fs/ext4/file.c | 286 + fs/ext4/fsync.c | 225 + fs/ext4/hash.c | 208 + fs/ext4/ialloc.c | 1338 ++++ fs/ext4/inode.c | 5957 ++++++++++++++ fs/ext4/ioctl.c | 442 ++ fs/ext4/mballoc.c | 4958 ++++++++++++ fs/ext4/mballoc.h | 222 + fs/ext4/migrate.c | 634 ++ fs/ext4/mmp.c | 351 + fs/ext4/move_extent.c | 1424 ++++ fs/ext4/namei.c | 2612 ++++++ fs/ext4/page-io.c | 447 ++ fs/ext4/resize.c | 1076 +++ fs/ext4/super.c | 5027 ++++++++++++ fs/ext4/symlink.c | 56 + fs/ext4/xattr.c | 1612 ++++ fs/ext4/xattr.h | 155 + fs/ext4/xattr_security.c | 78 + fs/ext4/xattr_trusted.c | 59 + fs/ext4/xattr_user.c | 62 + fs/fat/Kconfig | 100 + fs/fat/Makefile | 11 + fs/fat/cache.c | 351 + fs/fat/dir.c | 1371 ++++ fs/fat/fat.h | 351 + fs/fat/fatent.c | 685 ++ fs/fat/file.c | 446 ++ fs/fat/inode.c | 1619 ++++ fs/fat/misc.c | 278 + fs/fat/namei_msdos.c | 702 ++ fs/fat/namei_vfat.c | 1110 +++ fs/fcntl.c | 853 ++ fs/fhandle.c | 266 + fs/fifo.c | 154 + fs/file.c | 486 ++ fs/file_table.c | 554 ++ fs/filesystems.c | 287 + fs/freevxfs/Kconfig | 16 + fs/freevxfs/Makefile | 8 + fs/freevxfs/vxfs.h | 263 + fs/freevxfs/vxfs_bmap.c | 281 + fs/freevxfs/vxfs_dir.h | 92 + fs/freevxfs/vxfs_extern.h | 81 + fs/freevxfs/vxfs_fshead.c | 203 + fs/freevxfs/vxfs_fshead.h | 67 + fs/freevxfs/vxfs_immed.c | 115 + fs/freevxfs/vxfs_inode.c | 361 + fs/freevxfs/vxfs_inode.h | 180 + fs/freevxfs/vxfs_lookup.c | 322 + fs/freevxfs/vxfs_olt.c | 130 + fs/freevxfs/vxfs_olt.h | 145 + fs/freevxfs/vxfs_subr.c | 183 + fs/freevxfs/vxfs_super.c | 287 + fs/fs-writeback.c | 1373 ++++ fs/fs_struct.c | 201 + fs/fscache/Kconfig | 61 + fs/fscache/Makefile | 20 + fs/fscache/cache.c | 420 + fs/fscache/cookie.c | 514 ++ fs/fscache/fsdef.c | 144 + fs/fscache/histogram.c | 109 + fs/fscache/internal.h | 438 ++ fs/fscache/main.c | 218 + fs/fscache/netfs.c | 103 + fs/fscache/object-list.c | 432 + fs/fscache/object.c | 892 +++ fs/fscache/operation.c | 463 ++ fs/fscache/page.c | 996 +++ 
fs/fscache/proc.c | 81 + fs/fscache/stats.c | 280 + fs/fuse/Kconfig | 15 + fs/fuse/Makefile | 8 + fs/fuse/control.c | 359 + fs/fuse/cuse.c | 620 ++ fs/fuse/dev.c | 2049 +++++ fs/fuse/dir.c | 1638 ++++ fs/fuse/file.c | 2186 ++++++ fs/fuse/fuse_i.h | 769 ++ fs/fuse/inode.c | 1263 +++ fs/generic_acl.c | 225 + fs/gfs2/Kconfig | 38 + fs/gfs2/Makefile | 10 + fs/gfs2/acl.c | 354 + fs/gfs2/acl.h | 24 + fs/gfs2/aops.c | 1182 +++ fs/gfs2/bmap.c | 1297 +++ fs/gfs2/bmap.h | 59 + fs/gfs2/dentry.c | 141 + fs/gfs2/dir.c | 2006 +++++ fs/gfs2/dir.h | 68 + fs/gfs2/export.c | 206 + fs/gfs2/file.c | 1112 +++ fs/gfs2/gfs2.h | 26 + fs/gfs2/glock.c | 1870 +++++ fs/gfs2/glock.h | 244 + fs/gfs2/glops.c | 605 ++ fs/gfs2/glops.h | 28 + fs/gfs2/incore.h | 686 ++ fs/gfs2/inode.c | 1892 +++++ fs/gfs2/inode.h | 143 + fs/gfs2/lock_dlm.c | 235 + fs/gfs2/log.c | 972 +++ fs/gfs2/log.h | 69 + fs/gfs2/lops.c | 795 ++ fs/gfs2/lops.h | 113 + fs/gfs2/main.c | 215 + fs/gfs2/meta_io.c | 459 ++ fs/gfs2/meta_io.h | 82 + fs/gfs2/ops_fstype.c | 1407 ++++ fs/gfs2/quota.c | 1644 ++++ fs/gfs2/quota.h | 59 + fs/gfs2/recovery.c | 610 ++ fs/gfs2/recovery.h | 36 + fs/gfs2/rgrp.c | 1885 +++++ fs/gfs2/rgrp.h | 78 + fs/gfs2/super.c | 1585 ++++ fs/gfs2/super.h | 60 + fs/gfs2/sys.c | 653 ++ fs/gfs2/sys.h | 23 + fs/gfs2/trace_gfs2.h | 438 ++ fs/gfs2/trans.c | 192 + fs/gfs2/trans.h | 47 + fs/gfs2/util.c | 285 + fs/gfs2/util.h | 172 + fs/gfs2/xattr.c | 1549 ++++ fs/gfs2/xattr.h | 67 + fs/hfs/Kconfig | 12 + fs/hfs/Makefile | 10 + fs/hfs/attr.c | 121 + fs/hfs/bfind.c | 222 + fs/hfs/bitmap.c | 243 + fs/hfs/bnode.c | 478 ++ fs/hfs/brec.c | 519 ++ fs/hfs/btree.c | 366 + fs/hfs/btree.h | 168 + fs/hfs/catalog.c | 356 + fs/hfs/dir.c | 316 + fs/hfs/extent.c | 525 ++ fs/hfs/hfs.h | 289 + fs/hfs/hfs_fs.h | 276 + fs/hfs/inode.c | 673 ++ fs/hfs/mdb.c | 354 + fs/hfs/part_tbl.c | 117 + fs/hfs/string.c | 116 + fs/hfs/super.c | 493 ++ fs/hfs/sysdep.c | 45 + fs/hfs/trans.c | 150 + fs/hfsplus/Kconfig | 13 + fs/hfsplus/Makefile | 9 + fs/hfsplus/bfind.c | 224 + fs/hfsplus/bitmap.c | 235 + fs/hfsplus/bnode.c | 656 ++ fs/hfsplus/brec.c | 516 ++ fs/hfsplus/btree.c | 377 + fs/hfsplus/catalog.c | 425 + fs/hfsplus/dir.c | 516 ++ fs/hfsplus/extents.c | 561 ++ fs/hfsplus/hfsplus_fs.h | 463 ++ fs/hfsplus/hfsplus_raw.h | 337 + fs/hfsplus/inode.c | 617 ++ fs/hfsplus/ioctl.c | 221 + fs/hfsplus/options.c | 230 + fs/hfsplus/part_tbl.c | 157 + fs/hfsplus/super.c | 593 ++ fs/hfsplus/tables.c | 3245 ++++++++ fs/hfsplus/unicode.c | 452 ++ fs/hfsplus/wrapper.c | 283 + fs/hostfs/Makefile | 11 + fs/hostfs/hostfs.h | 96 + fs/hostfs/hostfs_kern.c | 1004 +++ fs/hostfs/hostfs_user.c | 387 + fs/hpfs/Kconfig | 14 + fs/hpfs/Makefile | 8 + fs/hpfs/alloc.c | 424 + fs/hpfs/anode.c | 491 ++ fs/hpfs/buffer.c | 168 + fs/hpfs/dentry.c | 64 + fs/hpfs/dir.c | 322 + fs/hpfs/dnode.c | 1084 +++ fs/hpfs/ea.c | 367 + fs/hpfs/file.c | 166 + fs/hpfs/hpfs.h | 568 ++ fs/hpfs/hpfs_fn.h | 360 + fs/hpfs/inode.c | 308 + fs/hpfs/map.c | 287 + fs/hpfs/name.c | 112 + fs/hpfs/namei.c | 628 ++ fs/hpfs/super.c | 712 ++ fs/hppfs/Makefile | 6 + fs/hppfs/hppfs.c | 773 ++ fs/hugetlbfs/Makefile | 7 + fs/hugetlbfs/inode.c | 1033 +++ fs/inode.c | 1697 ++++ fs/internal.h | 137 + fs/ioctl.c | 623 ++ fs/ioprio.c | 250 + fs/isofs/Kconfig | 39 + fs/isofs/Makefile | 10 + fs/isofs/compress.c | 380 + fs/isofs/dir.c | 288 + fs/isofs/export.c | 196 + fs/isofs/inode.c | 1579 ++++ fs/isofs/isofs.h | 184 + fs/isofs/joliet.c | 69 + fs/isofs/namei.c | 196 + fs/isofs/rock.c | 778 ++ fs/isofs/rock.h | 122 + fs/isofs/util.c | 80 + fs/isofs/zisofs.h 
| 21 + fs/jbd/Kconfig | 30 + fs/jbd/Makefile | 7 + fs/jbd/checkpoint.c | 757 ++ fs/jbd/commit.c | 953 +++ fs/jbd/journal.c | 2081 +++++ fs/jbd/recovery.c | 587 ++ fs/jbd/revoke.c | 706 ++ fs/jbd/transaction.c | 2198 ++++++ fs/jbd2/Kconfig | 33 + fs/jbd2/Makefile | 7 + fs/jbd2/checkpoint.c | 798 ++ fs/jbd2/commit.c | 1048 +++ fs/jbd2/journal.c | 2471 ++++++ fs/jbd2/recovery.c | 738 ++ fs/jbd2/revoke.c | 714 ++ fs/jbd2/transaction.c | 2227 ++++++ fs/jffs2/Kconfig | 188 + fs/jffs2/LICENCE | 30 + fs/jffs2/Makefile | 21 + fs/jffs2/README.Locking | 172 + fs/jffs2/TODO | 37 + fs/jffs2/acl.c | 440 ++ fs/jffs2/acl.h | 44 + fs/jffs2/background.c | 159 + fs/jffs2/build.c | 392 + fs/jffs2/compr.c | 360 + fs/jffs2/compr.h | 103 + fs/jffs2/compr_lzo.c | 111 + fs/jffs2/compr_rtime.c | 130 + fs/jffs2/compr_rubin.c | 455 ++ fs/jffs2/compr_zlib.c | 218 + fs/jffs2/debug.c | 860 ++ fs/jffs2/debug.h | 291 + fs/jffs2/dir.c | 873 +++ fs/jffs2/erase.c | 502 ++ fs/jffs2/file.c | 321 + fs/jffs2/fs.c | 748 ++ fs/jffs2/gc.c | 1326 ++++ fs/jffs2/ioctl.c | 22 + fs/jffs2/jffs2_fs_i.h | 56 + fs/jffs2/jffs2_fs_sb.h | 147 + fs/jffs2/malloc.c | 318 + fs/jffs2/nodelist.c | 771 ++ fs/jffs2/nodelist.h | 480 ++ fs/jffs2/nodemgmt.c | 787 ++ fs/jffs2/os-linux.h | 204 + fs/jffs2/read.c | 216 + fs/jffs2/readinode.c | 1461 ++++ fs/jffs2/scan.c | 1148 +++ fs/jffs2/security.c | 86 + fs/jffs2/summary.c | 869 ++ fs/jffs2/summary.h | 213 + fs/jffs2/super.c | 316 + fs/jffs2/symlink.c | 64 + fs/jffs2/wbuf.c | 1310 ++++ fs/jffs2/write.c | 707 ++ fs/jffs2/writev.c | 79 + fs/jffs2/xattr.c | 1330 ++++ fs/jffs2/xattr.h | 131 + fs/jffs2/xattr_trusted.c | 55 + fs/jffs2/xattr_user.c | 55 + fs/jfs/Kconfig | 50 + fs/jfs/Makefile | 16 + fs/jfs/acl.c | 211 + fs/jfs/endian24.h | 49 + fs/jfs/file.c | 154 + fs/jfs/inode.c | 414 + fs/jfs/ioctl.c | 148 + fs/jfs/jfs_acl.h | 41 + fs/jfs/jfs_btree.h | 172 + fs/jfs/jfs_debug.c | 109 + fs/jfs/jfs_debug.h | 122 + fs/jfs/jfs_dinode.h | 176 + fs/jfs/jfs_dmap.c | 3992 ++++++++++ fs/jfs/jfs_dmap.h | 314 + fs/jfs/jfs_dtree.c | 4567 +++++++++++ fs/jfs/jfs_dtree.h | 269 + fs/jfs/jfs_extent.c | 651 ++ fs/jfs/jfs_extent.h | 31 + fs/jfs/jfs_filsys.h | 282 + fs/jfs/jfs_imap.c | 3187 ++++++++ fs/jfs/jfs_imap.h | 175 + fs/jfs/jfs_incore.h | 224 + fs/jfs/jfs_inode.c | 166 + fs/jfs/jfs_inode.h | 53 + fs/jfs/jfs_lock.h | 52 + fs/jfs/jfs_logmgr.c | 2529 ++++++ fs/jfs/jfs_logmgr.h | 513 ++ fs/jfs/jfs_metapage.c | 843 ++ fs/jfs/jfs_metapage.h | 155 + fs/jfs/jfs_mount.c | 507 ++ fs/jfs/jfs_superblock.h | 121 + fs/jfs/jfs_txnmgr.c | 3101 ++++++++ fs/jfs/jfs_txnmgr.h | 311 + fs/jfs/jfs_types.h | 159 + fs/jfs/jfs_umount.c | 168 + fs/jfs/jfs_unicode.c | 138 + fs/jfs/jfs_unicode.h | 156 + fs/jfs/jfs_uniupr.c | 134 + fs/jfs/jfs_xattr.h | 75 + fs/jfs/jfs_xtree.c | 3905 +++++++++ fs/jfs/jfs_xtree.h | 132 + fs/jfs/namei.c | 1639 ++++ fs/jfs/resize.c | 543 ++ fs/jfs/super.c | 901 +++ fs/jfs/symlink.c | 52 + fs/jfs/xattr.c | 1128 +++ fs/libfs.c | 997 +++ fs/lockd/Makefile | 10 + fs/lockd/clnt4xdr.c | 605 ++ fs/lockd/clntlock.c | 279 + fs/lockd/clntproc.c | 848 ++ fs/lockd/clntxdr.c | 627 ++ fs/lockd/grace.c | 59 + fs/lockd/host.c | 649 ++ fs/lockd/mon.c | 555 ++ fs/lockd/svc.c | 568 ++ fs/lockd/svc4proc.c | 503 ++ fs/lockd/svclock.c | 966 +++ fs/lockd/svcproc.c | 544 ++ fs/lockd/svcshare.c | 106 + fs/lockd/svcsubs.c | 446 ++ fs/lockd/xdr.c | 343 + fs/lockd/xdr4.c | 334 + fs/locks.c | 2341 ++++++ fs/logfs/Kconfig | 17 + fs/logfs/Makefile | 13 + fs/logfs/compr.c | 95 + fs/logfs/dev_bdev.c | 342 + fs/logfs/dev_mtd.c | 278 + fs/logfs/dir.c | 825 
++ fs/logfs/file.c | 275 + fs/logfs/gc.c | 732 ++ fs/logfs/inode.c | 405 + fs/logfs/journal.c | 895 +++ fs/logfs/logfs.h | 734 ++ fs/logfs/logfs_abi.h | 629 ++ fs/logfs/readwrite.c | 2277 ++++++ fs/logfs/segment.c | 932 +++ fs/logfs/super.c | 671 ++ fs/mbcache.c | 620 ++ fs/minix/Kconfig | 25 + fs/minix/Makefile | 7 + fs/minix/bitmap.c | 279 + fs/minix/dir.c | 477 ++ fs/minix/file.c | 51 + fs/minix/inode.c | 661 ++ fs/minix/itree_common.c | 364 + fs/minix/itree_v1.c | 67 + fs/minix/itree_v2.c | 75 + fs/minix/minix.h | 165 + fs/minix/namei.c | 269 + fs/mpage.c | 718 ++ fs/namei.c | 3402 ++++++++ fs/namespace.c | 2730 +++++++ fs/ncpfs/Kconfig | 108 + fs/ncpfs/Makefile | 16 + fs/ncpfs/dir.c | 1280 +++ fs/ncpfs/file.c | 297 + fs/ncpfs/getopt.c | 74 + fs/ncpfs/getopt.h | 16 + fs/ncpfs/inode.c | 1072 +++ fs/ncpfs/ioctl.c | 918 +++ fs/ncpfs/mmap.c | 128 + fs/ncpfs/ncp_fs.h | 98 + fs/ncpfs/ncp_fs_i.h | 29 + fs/ncpfs/ncp_fs_sb.h | 176 + fs/ncpfs/ncplib_kernel.c | 1323 ++++ fs/ncpfs/ncplib_kernel.h | 254 + fs/ncpfs/ncpsign_kernel.c | 127 + fs/ncpfs/ncpsign_kernel.h | 26 + fs/ncpfs/sock.c | 880 +++ fs/ncpfs/symlink.c | 181 + fs/nfs/Kconfig | 144 + fs/nfs/Makefile | 25 + fs/nfs/cache_lib.c | 141 + fs/nfs/cache_lib.h | 27 + fs/nfs/callback.c | 403 + fs/nfs/callback.h | 213 + fs/nfs/callback_proc.c | 561 ++ fs/nfs/callback_xdr.c | 989 +++ fs/nfs/client.c | 2004 +++++ fs/nfs/delegation.c | 716 ++ fs/nfs/delegation.h | 80 + fs/nfs/dir.c | 2350 ++++++ fs/nfs/direct.c | 1039 +++ fs/nfs/dns_resolve.c | 372 + fs/nfs/dns_resolve.h | 26 + fs/nfs/file.c | 921 +++ fs/nfs/fscache-index.c | 337 + fs/nfs/fscache.c | 535 ++ fs/nfs/fscache.h | 222 + fs/nfs/getroot.c | 267 + fs/nfs/idmap.c | 779 ++ fs/nfs/inode.c | 1656 ++++ fs/nfs/internal.h | 456 ++ fs/nfs/iostat.h | 71 + fs/nfs/mount_clnt.c | 518 ++ fs/nfs/namespace.c | 371 + fs/nfs/nfs2xdr.c | 1157 +++ fs/nfs/nfs3acl.c | 444 ++ fs/nfs/nfs3proc.c | 890 +++ fs/nfs/nfs3xdr.c | 2498 ++++++ fs/nfs/nfs4_fs.h | 383 + fs/nfs/nfs4filelayout.c | 914 +++ fs/nfs/nfs4filelayout.h | 105 + fs/nfs/nfs4filelayoutdev.c | 636 ++ fs/nfs/nfs4namespace.c | 265 + fs/nfs/nfs4proc.c | 6114 +++++++++++++++ fs/nfs/nfs4renewd.c | 130 + fs/nfs/nfs4state.c | 1766 +++++ fs/nfs/nfs4xdr.c | 6679 ++++++++++++++++ fs/nfs/nfsroot.c | 309 + fs/nfs/objlayout/Kbuild | 5 + fs/nfs/objlayout/objio_osd.c | 1056 +++ fs/nfs/objlayout/objlayout.c | 716 ++ fs/nfs/objlayout/objlayout.h | 187 + fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 415 + fs/nfs/pagelist.c | 453 ++ fs/nfs/pnfs.c | 1319 ++++ fs/nfs/pnfs.h | 427 + fs/nfs/pnfs_dev.c | 277 + fs/nfs/proc.c | 746 ++ fs/nfs/read.c | 704 ++ fs/nfs/super.c | 3099 ++++++++ fs/nfs/symlink.c | 78 + fs/nfs/sysctl.c | 92 + fs/nfs/unlink.c | 580 ++ fs/nfs/write.c | 1732 ++++ fs/nfs_common/Makefile | 7 + fs/nfs_common/nfsacl.c | 286 + fs/nfsctl.c | 100 + fs/nfsd/Kconfig | 94 + fs/nfsd/Makefile | 13 + fs/nfsd/acl.h | 59 + fs/nfsd/auth.c | 95 + fs/nfsd/auth.h | 22 + fs/nfsd/cache.h | 83 + fs/nfsd/export.c | 1693 ++++ fs/nfsd/idmap.h | 62 + fs/nfsd/lockd.c | 79 + fs/nfsd/nfs2acl.c | 356 + fs/nfsd/nfs3acl.c | 267 + fs/nfsd/nfs3proc.c | 896 +++ fs/nfsd/nfs3xdr.c | 1112 +++ fs/nfsd/nfs4acl.c | 838 ++ fs/nfsd/nfs4callback.c | 1026 +++ fs/nfsd/nfs4idmap.c | 587 ++ fs/nfsd/nfs4proc.c | 1472 ++++ fs/nfsd/nfs4recover.c | 402 + fs/nfsd/nfs4state.c | 4485 +++++++++++ fs/nfsd/nfs4xdr.c | 3408 ++++++++ fs/nfsd/nfscache.c | 322 + fs/nfsd/nfsctl.c | 1527 ++++ fs/nfsd/nfsd.h | 340 + fs/nfsd/nfsfh.c | 693 ++ fs/nfsd/nfsfh.h | 206 + fs/nfsd/nfsproc.c | 755 ++ fs/nfsd/nfssvc.c | 663 ++ 
fs/nfsd/nfsxdr.c | 545 ++ fs/nfsd/state.h | 492 ++ fs/nfsd/stats.c | 104 + fs/nfsd/vfs.c | 2273 ++++++ fs/nfsd/vfs.h | 109 + fs/nfsd/xdr.h | 173 + fs/nfsd/xdr3.h | 344 + fs/nfsd/xdr4.h | 573 ++ fs/nilfs2/Kconfig | 25 + fs/nilfs2/Makefile | 5 + fs/nilfs2/alloc.c | 721 ++ fs/nilfs2/alloc.h | 100 + fs/nilfs2/bmap.c | 567 ++ fs/nilfs2/bmap.h | 270 + fs/nilfs2/btnode.c | 297 + fs/nilfs2/btnode.h | 53 + fs/nilfs2/btree.c | 2310 ++++++ fs/nilfs2/btree.h | 77 + fs/nilfs2/cpfile.c | 965 +++ fs/nilfs2/cpfile.h | 46 + fs/nilfs2/dat.c | 511 ++ fs/nilfs2/dat.h | 59 + fs/nilfs2/dir.c | 688 ++ fs/nilfs2/direct.c | 369 + fs/nilfs2/direct.h | 51 + fs/nilfs2/export.h | 17 + fs/nilfs2/file.c | 159 + fs/nilfs2/gcinode.c | 196 + fs/nilfs2/ifile.c | 201 + fs/nilfs2/ifile.h | 56 + fs/nilfs2/inode.c | 1064 +++ fs/nilfs2/ioctl.c | 863 ++ fs/nilfs2/mdt.c | 589 ++ fs/nilfs2/mdt.h | 110 + fs/nilfs2/namei.c | 594 ++ fs/nilfs2/nilfs.h | 326 + fs/nilfs2/page.c | 551 ++ fs/nilfs2/page.h | 80 + fs/nilfs2/recovery.c | 963 +++ fs/nilfs2/segbuf.c | 536 ++ fs/nilfs2/segbuf.h | 184 + fs/nilfs2/segment.c | 2707 +++++++ fs/nilfs2/segment.h | 246 + fs/nilfs2/sufile.c | 921 +++ fs/nilfs2/sufile.h | 144 + fs/nilfs2/super.c | 1462 ++++ fs/nilfs2/the_nilfs.c | 784 ++ fs/nilfs2/the_nilfs.h | 356 + fs/nls/Kconfig | 464 ++ fs/nls/Makefile | 44 + fs/nls/nls_ascii.c | 167 + fs/nls/nls_base.c | 524 ++ fs/nls/nls_cp1250.c | 347 + fs/nls/nls_cp1251.c | 302 + fs/nls/nls_cp1255.c | 385 + fs/nls/nls_cp437.c | 388 + fs/nls/nls_cp737.c | 351 + fs/nls/nls_cp775.c | 320 + fs/nls/nls_cp850.c | 316 + fs/nls/nls_cp852.c | 338 + fs/nls/nls_cp855.c | 300 + fs/nls/nls_cp857.c | 302 + fs/nls/nls_cp860.c | 365 + fs/nls/nls_cp861.c | 388 + fs/nls/nls_cp862.c | 422 + fs/nls/nls_cp863.c | 382 + fs/nls/nls_cp864.c | 408 + fs/nls/nls_cp865.c | 388 + fs/nls/nls_cp866.c | 306 + fs/nls/nls_cp869.c | 316 + fs/nls/nls_cp874.c | 276 + fs/nls/nls_cp932.c | 7934 +++++++++++++++++++ fs/nls/nls_cp936.c | 11112 ++++++++++++++++++++++++++ fs/nls/nls_cp949.c | 13947 +++++++++++++++++++++++++++++++++ fs/nls/nls_cp950.c | 9483 ++++++++++++++++++++++ fs/nls/nls_euc-jp.c | 581 ++ fs/nls/nls_iso8859-1.c | 258 + fs/nls/nls_iso8859-13.c | 286 + fs/nls/nls_iso8859-14.c | 342 + fs/nls/nls_iso8859-15.c | 308 + fs/nls/nls_iso8859-2.c | 309 + fs/nls/nls_iso8859-3.c | 309 + fs/nls/nls_iso8859-4.c | 309 + fs/nls/nls_iso8859-5.c | 273 + fs/nls/nls_iso8859-6.c | 264 + fs/nls/nls_iso8859-7.c | 318 + fs/nls/nls_iso8859-9.c | 273 + fs/nls/nls_koi8-r.c | 324 + fs/nls/nls_koi8-ru.c | 83 + fs/nls/nls_koi8-u.c | 331 + fs/nls/nls_utf8.c | 68 + fs/no-block.c | 23 + fs/notify/Kconfig | 6 + fs/notify/Makefile | 6 + fs/notify/dnotify/Kconfig | 11 + fs/notify/dnotify/Makefile | 1 + fs/notify/dnotify/dnotify.c | 411 + fs/notify/fanotify/Kconfig | 26 + fs/notify/fanotify/Makefile | 1 + fs/notify/fanotify/fanotify.c | 228 + fs/notify/fanotify/fanotify_user.c | 891 +++ fs/notify/fsnotify.c | 310 + fs/notify/fsnotify.h | 47 + fs/notify/group.c | 104 + fs/notify/inode_mark.c | 315 + fs/notify/inotify/Kconfig | 17 + fs/notify/inotify/Makefile | 1 + fs/notify/inotify/inotify.h | 21 + fs/notify/inotify/inotify_fsnotify.c | 222 + fs/notify/inotify/inotify_user.c | 867 ++ fs/notify/mark.c | 372 + fs/notify/notification.c | 464 ++ fs/notify/vfsmount_mark.c | 190 + fs/ntfs/Kconfig | 78 + fs/ntfs/Makefile | 14 + fs/ntfs/aops.c | 1638 ++++ fs/ntfs/aops.h | 107 + fs/ntfs/attrib.c | 2615 ++++++ fs/ntfs/attrib.h | 116 + fs/ntfs/bitmap.c | 193 + fs/ntfs/bitmap.h | 118 + fs/ntfs/collate.c | 124 + fs/ntfs/collate.h | 50 + 
fs/ntfs/compress.c | 969 +++ fs/ntfs/debug.c | 183 + fs/ntfs/debug.h | 63 + fs/ntfs/dir.c | 1575 ++++ fs/ntfs/dir.h | 48 + fs/ntfs/endian.h | 93 + fs/ntfs/file.c | 2227 ++++++ fs/ntfs/index.c | 454 ++ fs/ntfs/index.h | 148 + fs/ntfs/inode.c | 3112 ++++++++ fs/ntfs/inode.h | 321 + fs/ntfs/layout.h | 2435 ++++++ fs/ntfs/lcnalloc.c | 1014 +++ fs/ntfs/lcnalloc.h | 145 + fs/ntfs/logfile.c | 863 ++ fs/ntfs/logfile.h | 309 + fs/ntfs/malloc.h | 96 + fs/ntfs/mft.c | 2917 +++++++ fs/ntfs/mft.h | 124 + fs/ntfs/mst.c | 203 + fs/ntfs/namei.c | 405 + fs/ntfs/ntfs.h | 164 + fs/ntfs/quota.c | 117 + fs/ntfs/quota.h | 35 + fs/ntfs/runlist.c | 1907 +++++ fs/ntfs/runlist.h | 102 + fs/ntfs/super.c | 3206 ++++++++ fs/ntfs/sysctl.c | 83 + fs/ntfs/sysctl.h | 41 + fs/ntfs/time.h | 100 + fs/ntfs/types.h | 69 + fs/ntfs/unistr.c | 398 + fs/ntfs/upcase.c | 87 + fs/ntfs/usnjrnl.c | 84 + fs/ntfs/usnjrnl.h | 205 + fs/ntfs/volume.h | 177 + fs/ocfs2/Kconfig | 76 + fs/ocfs2/Makefile | 54 + fs/ocfs2/acl.c | 535 ++ fs/ocfs2/acl.h | 36 + fs/ocfs2/alloc.c | 7352 +++++++++++++++++ fs/ocfs2/alloc.h | 324 + fs/ocfs2/aops.c | 2048 +++++ fs/ocfs2/aops.h | 94 + fs/ocfs2/blockcheck.c | 645 ++ fs/ocfs2/blockcheck.h | 107 + fs/ocfs2/buffer_head_io.c | 429 + fs/ocfs2/buffer_head_io.h | 77 + fs/ocfs2/cluster/Makefile | 4 + fs/ocfs2/cluster/heartbeat.c | 2628 +++++++ fs/ocfs2/cluster/heartbeat.h | 89 + fs/ocfs2/cluster/masklog.c | 155 + fs/ocfs2/cluster/masklog.h | 219 + fs/ocfs2/cluster/netdebug.c | 543 ++ fs/ocfs2/cluster/nodemanager.c | 989 +++ fs/ocfs2/cluster/nodemanager.h | 88 + fs/ocfs2/cluster/ocfs2_heartbeat.h | 38 + fs/ocfs2/cluster/ocfs2_nodemanager.h | 45 + fs/ocfs2/cluster/quorum.c | 331 + fs/ocfs2/cluster/quorum.h | 36 + fs/ocfs2/cluster/sys.c | 82 + fs/ocfs2/cluster/sys.h | 33 + fs/ocfs2/cluster/tcp.c | 2143 +++++ fs/ocfs2/cluster/tcp.h | 152 + fs/ocfs2/cluster/tcp_internal.h | 242 + fs/ocfs2/cluster/ver.c | 42 + fs/ocfs2/cluster/ver.h | 31 + fs/ocfs2/dcache.c | 537 ++ fs/ocfs2/dcache.h | 69 + fs/ocfs2/dir.c | 4555 +++++++++++ fs/ocfs2/dir.h | 117 + fs/ocfs2/dlm/Makefile | 7 + fs/ocfs2/dlm/dlmapi.h | 220 + fs/ocfs2/dlm/dlmast.c | 502 ++ fs/ocfs2/dlm/dlmcommon.h | 1181 +++ fs/ocfs2/dlm/dlmconvert.c | 548 ++ fs/ocfs2/dlm/dlmconvert.h | 35 + fs/ocfs2/dlm/dlmdebug.c | 1017 +++ fs/ocfs2/dlm/dlmdebug.h | 81 + fs/ocfs2/dlm/dlmdomain.c | 2397 ++++++ fs/ocfs2/dlm/dlmdomain.h | 36 + fs/ocfs2/dlm/dlmlock.c | 767 ++ fs/ocfs2/dlm/dlmmaster.c | 3413 ++++++++ fs/ocfs2/dlm/dlmrecovery.c | 2886 +++++++ fs/ocfs2/dlm/dlmthread.c | 762 ++ fs/ocfs2/dlm/dlmunlock.c | 692 ++ fs/ocfs2/dlm/dlmver.c | 42 + fs/ocfs2/dlm/dlmver.h | 31 + fs/ocfs2/dlmfs/Makefile | 5 + fs/ocfs2/dlmfs/dlmfs.c | 725 ++ fs/ocfs2/dlmfs/dlmfsver.c | 42 + fs/ocfs2/dlmfs/dlmfsver.h | 31 + fs/ocfs2/dlmfs/userdlm.c | 688 ++ fs/ocfs2/dlmfs/userdlm.h | 113 + fs/ocfs2/dlmglue.c | 4033 ++++++++++ fs/ocfs2/dlmglue.h | 172 + fs/ocfs2/export.c | 276 + fs/ocfs2/export.h | 33 + fs/ocfs2/extent_map.c | 902 +++ fs/ocfs2/extent_map.h | 90 + fs/ocfs2/file.c | 2688 +++++++ fs/ocfs2/file.h | 76 + fs/ocfs2/heartbeat.c | 134 + fs/ocfs2/heartbeat.h | 45 + fs/ocfs2/inode.c | 1447 ++++ fs/ocfs2/inode.h | 180 + fs/ocfs2/ioctl.c | 1031 +++ fs/ocfs2/ioctl.h | 16 + fs/ocfs2/journal.c | 2192 ++++++ fs/ocfs2/journal.h | 619 ++ fs/ocfs2/localalloc.c | 1307 +++ fs/ocfs2/localalloc.h | 62 + fs/ocfs2/locks.c | 139 + fs/ocfs2/locks.h | 32 + fs/ocfs2/mmap.c | 197 + fs/ocfs2/mmap.h | 6 + fs/ocfs2/move_extents.c | 1152 +++ fs/ocfs2/move_extents.h | 22 + fs/ocfs2/namei.c | 2501 ++++++ fs/ocfs2/namei.h | 45 + 
fs/ocfs2/ocfs1_fs_compat.h | 109 + fs/ocfs2/ocfs2.h | 853 ++ fs/ocfs2/ocfs2_fs.h | 1640 ++++ fs/ocfs2/ocfs2_ioctl.h | 242 + fs/ocfs2/ocfs2_lockid.h | 128 + fs/ocfs2/ocfs2_lockingver.h | 32 + fs/ocfs2/ocfs2_trace.h | 2764 +++++++ fs/ocfs2/quota.h | 117 + fs/ocfs2/quota_global.c | 922 +++ fs/ocfs2/quota_local.c | 1316 ++++ fs/ocfs2/refcounttree.c | 4514 +++++++++++ fs/ocfs2/refcounttree.h | 118 + fs/ocfs2/reservations.c | 839 ++ fs/ocfs2/reservations.h | 159 + fs/ocfs2/resize.c | 589 ++ fs/ocfs2/resize.h | 32 + fs/ocfs2/slot_map.c | 538 ++ fs/ocfs2/slot_map.h | 44 + fs/ocfs2/stack_o2cb.c | 393 + fs/ocfs2/stack_user.c | 908 +++ fs/ocfs2/stackglue.c | 726 ++ fs/ocfs2/stackglue.h | 292 + fs/ocfs2/suballoc.c | 2908 +++++++ fs/ocfs2/suballoc.h | 221 + fs/ocfs2/super.c | 2651 +++++++ fs/ocfs2/super.h | 55 + fs/ocfs2/symlink.c | 173 + fs/ocfs2/symlink.h | 42 + fs/ocfs2/sysfile.c | 180 + fs/ocfs2/sysfile.h | 33 + fs/ocfs2/uptodate.c | 638 ++ fs/ocfs2/uptodate.h | 84 + fs/ocfs2/ver.c | 43 + fs/ocfs2/ver.h | 31 + fs/ocfs2/xattr.c | 7388 +++++++++++++++++ fs/ocfs2/xattr.h | 100 + fs/omfs/Kconfig | 13 + fs/omfs/Makefile | 4 + fs/omfs/bitmap.c | 193 + fs/omfs/dir.c | 475 ++ fs/omfs/file.c | 378 + fs/omfs/inode.c | 585 ++ fs/omfs/omfs.h | 68 + fs/omfs/omfs_fs.h | 81 + fs/open.c | 1161 +++ fs/openpromfs/Makefile | 7 + fs/openpromfs/inode.c | 473 ++ fs/partitions/Kconfig | 251 + fs/partitions/Makefile | 20 + fs/partitions/acorn.c | 556 ++ fs/partitions/acorn.h | 14 + fs/partitions/amiga.c | 139 + fs/partitions/amiga.h | 6 + fs/partitions/atari.c | 149 + fs/partitions/atari.h | 34 + fs/partitions/check.c | 730 ++ fs/partitions/check.h | 49 + fs/partitions/efi.c | 675 ++ fs/partitions/efi.h | 134 + fs/partitions/ibm.c | 275 + fs/partitions/ibm.h | 1 + fs/partitions/karma.c | 57 + fs/partitions/karma.h | 8 + fs/partitions/ldm.c | 1568 ++++ fs/partitions/ldm.h | 215 + fs/partitions/mac.c | 134 + fs/partitions/mac.h | 44 + fs/partitions/msdos.c | 552 ++ fs/partitions/msdos.h | 8 + fs/partitions/osf.c | 86 + fs/partitions/osf.h | 7 + fs/partitions/sgi.c | 82 + fs/partitions/sgi.h | 8 + fs/partitions/sun.c | 122 + fs/partitions/sun.h | 8 + fs/partitions/sysv68.c | 95 + fs/partitions/sysv68.h | 1 + fs/partitions/ultrix.c | 48 + fs/partitions/ultrix.h | 5 + fs/pipe.c | 1326 ++++ fs/pnode.c | 371 + fs/pnode.h | 39 + fs/posix_acl.c | 388 + fs/proc/Kconfig | 69 + fs/proc/Makefile | 30 + fs/proc/array.c | 546 ++ fs/proc/base.c | 3429 ++++++++ fs/proc/cmdline.c | 29 + fs/proc/consoles.c | 114 + fs/proc/cpuinfo.c | 24 + fs/proc/devices.c | 70 + fs/proc/generic.c | 853 ++ fs/proc/inode.c | 497 ++ fs/proc/internal.h | 147 + fs/proc/interrupts.c | 53 + fs/proc/kcore.c | 635 ++ fs/proc/kmsg.c | 64 + fs/proc/loadavg.c | 45 + fs/proc/meminfo.c | 194 + fs/proc/mmu.c | 60 + fs/proc/namespaces.c | 201 + fs/proc/nommu.c | 136 + fs/proc/page.c | 212 + fs/proc/proc_devtree.c | 241 + fs/proc/proc_net.c | 241 + fs/proc/proc_sysctl.c | 436 ++ fs/proc/proc_tty.c | 189 + fs/proc/root.c | 213 + fs/proc/softirqs.c | 44 + fs/proc/stat.c | 170 + fs/proc/task_mmu.c | 1125 +++ fs/proc/task_nommu.c | 271 + fs/proc/uptime.c | 53 + fs/proc/version.c | 34 + fs/proc/vmcore.c | 701 ++ fs/pstore/Kconfig | 13 + fs/pstore/Makefile | 7 + fs/pstore/inode.c | 311 + fs/pstore/internal.h | 6 + fs/pstore/platform.c | 207 + fs/qnx4/Kconfig | 14 + fs/qnx4/Makefile | 7 + fs/qnx4/README | 9 + fs/qnx4/bitmap.c | 58 + fs/qnx4/dir.c | 85 + fs/qnx4/inode.c | 504 ++ fs/qnx4/namei.c | 132 + fs/qnx4/qnx4.h | 49 + fs/quota/Kconfig | 74 + fs/quota/Makefile | 7 + 
fs/quota/compat.c | 118 + fs/quota/dquot.c | 2661 +++++++ fs/quota/netlink.c | 96 + fs/quota/quota.c | 375 + fs/quota/quota_tree.c | 659 ++ fs/quota/quota_tree.h | 25 + fs/quota/quota_v1.c | 233 + fs/quota/quota_v2.c | 336 + fs/quota/quotaio_v1.h | 33 + fs/quota/quotaio_v2.h | 73 + fs/ramfs/Makefile | 9 + fs/ramfs/file-mmu.c | 55 + fs/ramfs/file-nommu.c | 266 + fs/ramfs/inode.c | 315 + fs/ramfs/internal.h | 14 + fs/read_write.c | 948 +++ fs/read_write.h | 14 + fs/readdir.c | 311 + fs/reiserfs/Kconfig | 88 + fs/reiserfs/Makefile | 38 + fs/reiserfs/README | 161 + fs/reiserfs/bitmap.c | 1300 +++ fs/reiserfs/dir.c | 310 + fs/reiserfs/do_balan.c | 2074 +++++ fs/reiserfs/file.c | 315 + fs/reiserfs/fix_node.c | 2593 ++++++ fs/reiserfs/hashes.c | 182 + fs/reiserfs/ibalance.c | 1089 +++ fs/reiserfs/inode.c | 3225 ++++++++ fs/reiserfs/ioctl.c | 226 + fs/reiserfs/item_ops.c | 756 ++ fs/reiserfs/journal.c | 4319 ++++++++++ fs/reiserfs/lbalance.c | 1311 ++++ fs/reiserfs/lock.c | 97 + fs/reiserfs/namei.c | 1562 ++++ fs/reiserfs/objectid.c | 203 + fs/reiserfs/prints.c | 768 ++ fs/reiserfs/procfs.c | 576 ++ fs/reiserfs/resize.c | 212 + fs/reiserfs/stree.c | 2120 +++++ fs/reiserfs/super.c | 2271 ++++++ fs/reiserfs/tail_conversion.c | 280 + fs/reiserfs/xattr.c | 1051 +++ fs/reiserfs/xattr_acl.c | 531 ++ fs/reiserfs/xattr_security.c | 120 + fs/reiserfs/xattr_trusted.c | 56 + fs/reiserfs/xattr_user.c | 52 + fs/romfs/Kconfig | 62 + fs/romfs/Makefile | 12 + fs/romfs/internal.h | 47 + fs/romfs/mmap-nommu.c | 79 + fs/romfs/storage.c | 293 + fs/romfs/super.c | 662 ++ fs/select.c | 999 +++ fs/seq_file.c | 821 ++ fs/signalfd.c | 295 + fs/splice.c | 2047 +++++ fs/squashfs/Kconfig | 89 + fs/squashfs/Makefile | 10 + fs/squashfs/block.c | 210 + fs/squashfs/cache.c | 428 + fs/squashfs/decompressor.c | 110 + fs/squashfs/decompressor.h | 59 + fs/squashfs/dir.c | 244 + fs/squashfs/export.c | 163 + fs/squashfs/file.c | 501 ++ fs/squashfs/fragment.c | 99 + fs/squashfs/id.c | 102 + fs/squashfs/inode.c | 425 + fs/squashfs/lzo_wrapper.c | 135 + fs/squashfs/namei.c | 257 + fs/squashfs/squashfs.h | 102 + fs/squashfs/squashfs_fs.h | 459 ++ fs/squashfs/squashfs_fs_i.h | 54 + fs/squashfs/squashfs_fs_sb.h | 79 + fs/squashfs/super.c | 497 ++ fs/squashfs/symlink.c | 127 + fs/squashfs/xattr.c | 324 + fs/squashfs/xattr.h | 47 + fs/squashfs/xattr_id.c | 95 + fs/squashfs/xz_wrapper.c | 180 + fs/squashfs/zlib_wrapper.c | 149 + fs/stack.c | 79 + fs/stat.c | 473 ++ fs/statfs.c | 229 + fs/super.c | 1061 +++ fs/sync.c | 402 + fs/sysfs/Kconfig | 23 + fs/sysfs/Makefile | 6 + fs/sysfs/bin.c | 506 ++ fs/sysfs/dir.c | 964 +++ fs/sysfs/file.c | 747 ++ fs/sysfs/group.c | 207 + fs/sysfs/inode.c | 367 + fs/sysfs/mount.c | 201 + fs/sysfs/symlink.c | 307 + fs/sysfs/sysfs.h | 231 + fs/sysv/Kconfig | 36 + fs/sysv/Makefile | 8 + fs/sysv/balloc.c | 239 + fs/sysv/dir.c | 377 + fs/sysv/file.c | 58 + fs/sysv/ialloc.c | 234 + fs/sysv/inode.c | 381 + fs/sysv/itree.c | 494 ++ fs/sysv/namei.c | 301 + fs/sysv/super.c | 590 ++ fs/sysv/symlink.c | 20 + fs/sysv/sysv.h | 248 + fs/timerfd.c | 369 + fs/ubifs/Kconfig | 60 + fs/ubifs/Makefile | 9 + fs/ubifs/budget.c | 732 ++ fs/ubifs/commit.c | 738 ++ fs/ubifs/compress.c | 251 + fs/ubifs/debug.c | 2933 +++++++ fs/ubifs/debug.h | 445 ++ fs/ubifs/dir.c | 1206 +++ fs/ubifs/file.c | 1593 ++++ fs/ubifs/find.c | 977 +++ fs/ubifs/gc.c | 985 +++ fs/ubifs/io.c | 1054 +++ fs/ubifs/ioctl.c | 205 + fs/ubifs/journal.c | 1466 ++++ fs/ubifs/key.h | 548 ++ fs/ubifs/log.c | 773 ++ fs/ubifs/lprops.c | 1319 ++++ fs/ubifs/lpt.c | 2277 ++++++ 
fs/ubifs/lpt_commit.c | 2050 +++++ fs/ubifs/master.c | 396 + fs/ubifs/misc.h | 360 + fs/ubifs/orphan.c | 972 +++ fs/ubifs/recovery.c | 1567 ++++ fs/ubifs/replay.c | 1080 +++ fs/ubifs/sb.c | 803 ++ fs/ubifs/scan.c | 380 + fs/ubifs/shrinker.c | 325 + fs/ubifs/super.c | 2321 ++++++ fs/ubifs/tnc.c | 3349 ++++++++ fs/ubifs/tnc_commit.c | 1103 +++ fs/ubifs/tnc_misc.c | 494 ++ fs/ubifs/ubifs-media.h | 784 ++ fs/ubifs/ubifs.h | 1777 +++++ fs/ubifs/xattr.c | 572 ++ fs/udf/Kconfig | 18 + fs/udf/Makefile | 9 + fs/udf/balloc.c | 815 ++ fs/udf/dir.c | 210 + fs/udf/directory.c | 241 + fs/udf/ecma_167.h | 796 ++ fs/udf/file.c | 249 + fs/udf/ialloc.c | 132 + fs/udf/inode.c | 2166 +++++ fs/udf/lowlevel.c | 67 + fs/udf/misc.c | 296 + fs/udf/namei.c | 1339 ++++ fs/udf/osta_udf.h | 279 + fs/udf/partition.c | 334 + fs/udf/super.c | 2324 ++++++ fs/udf/symlink.c | 119 + fs/udf/truncate.c | 289 + fs/udf/udf_i.h | 45 + fs/udf/udf_sb.h | 182 + fs/udf/udfdecl.h | 241 + fs/udf/udfend.h | 77 + fs/udf/udftime.c | 171 + fs/udf/unicode.c | 493 ++ fs/ufs/Kconfig | 43 + fs/ufs/Makefile | 8 + fs/ufs/balloc.c | 953 +++ fs/ufs/cylinder.c | 201 + fs/ufs/dir.c | 666 ++ fs/ufs/file.c | 46 + fs/ufs/ialloc.c | 354 + fs/ufs/inode.c | 906 +++ fs/ufs/namei.c | 360 + fs/ufs/super.c | 1511 ++++ fs/ufs/swab.h | 115 + fs/ufs/symlink.c | 53 + fs/ufs/truncate.c | 523 ++ fs/ufs/ufs.h | 161 + fs/ufs/ufs_fs.h | 959 +++ fs/ufs/util.c | 283 + fs/ufs/util.h | 592 ++ fs/utimes.c | 224 + fs/xattr.c | 698 ++ fs/xattr_acl.c | 98 + fs/xfs/Kconfig | 82 + fs/xfs/Makefile | 111 + fs/xfs/linux-2.6/kmem.c | 132 + fs/xfs/linux-2.6/kmem.h | 124 + fs/xfs/linux-2.6/mrlock.h | 90 + fs/xfs/linux-2.6/time.h | 36 + fs/xfs/linux-2.6/xfs_acl.c | 464 ++ fs/xfs/linux-2.6/xfs_aops.c | 1506 ++++ fs/xfs/linux-2.6/xfs_aops.h | 68 + fs/xfs/linux-2.6/xfs_buf.c | 1899 +++++ fs/xfs/linux-2.6/xfs_buf.h | 351 + fs/xfs/linux-2.6/xfs_discard.c | 222 + fs/xfs/linux-2.6/xfs_discard.h | 10 + fs/xfs/linux-2.6/xfs_export.c | 250 + fs/xfs/linux-2.6/xfs_export.h | 72 + fs/xfs/linux-2.6/xfs_file.c | 1114 +++ fs/xfs/linux-2.6/xfs_fs_subr.c | 96 + fs/xfs/linux-2.6/xfs_globals.c | 43 + fs/xfs/linux-2.6/xfs_ioctl.c | 1556 ++++ fs/xfs/linux-2.6/xfs_ioctl.h | 85 + fs/xfs/linux-2.6/xfs_ioctl32.c | 672 ++ fs/xfs/linux-2.6/xfs_ioctl32.h | 237 + fs/xfs/linux-2.6/xfs_iops.c | 778 ++ fs/xfs/linux-2.6/xfs_iops.h | 30 + fs/xfs/linux-2.6/xfs_linux.h | 307 + fs/xfs/linux-2.6/xfs_message.c | 108 + fs/xfs/linux-2.6/xfs_message.h | 39 + fs/xfs/linux-2.6/xfs_quotaops.c | 139 + fs/xfs/linux-2.6/xfs_stats.c | 122 + fs/xfs/linux-2.6/xfs_stats.h | 223 + fs/xfs/linux-2.6/xfs_super.c | 1720 ++++ fs/xfs/linux-2.6/xfs_super.h | 87 + fs/xfs/linux-2.6/xfs_sync.c | 1132 +++ fs/xfs/linux-2.6/xfs_sync.h | 62 + fs/xfs/linux-2.6/xfs_sysctl.c | 252 + fs/xfs/linux-2.6/xfs_sysctl.h | 102 + fs/xfs/linux-2.6/xfs_trace.c | 56 + fs/xfs/linux-2.6/xfs_trace.h | 1776 +++++ fs/xfs/linux-2.6/xfs_vnode.h | 64 + fs/xfs/linux-2.6/xfs_xattr.c | 241 + fs/xfs/quota/xfs_dquot.c | 1496 ++++ fs/xfs/quota/xfs_dquot.h | 143 + fs/xfs/quota/xfs_dquot_item.c | 533 ++ fs/xfs/quota/xfs_dquot_item.h | 48 + fs/xfs/quota/xfs_qm.c | 2462 ++++++ fs/xfs/quota/xfs_qm.h | 172 + fs/xfs/quota/xfs_qm_bhv.c | 176 + fs/xfs/quota/xfs_qm_stats.c | 105 + fs/xfs/quota/xfs_qm_stats.h | 53 + fs/xfs/quota/xfs_qm_syscalls.c | 1259 +++ fs/xfs/quota/xfs_quota_priv.h | 53 + fs/xfs/quota/xfs_trans_dquot.c | 895 +++ fs/xfs/support/uuid.c | 63 + fs/xfs/support/uuid.h | 29 + fs/xfs/xfs.h | 29 + fs/xfs/xfs_acl.h | 62 + fs/xfs/xfs_ag.h | 289 + fs/xfs/xfs_alloc.c | 3047 +++++++ 
fs/xfs/xfs_alloc.h | 253 + fs/xfs/xfs_alloc_btree.c | 458 ++ fs/xfs/xfs_alloc_btree.h | 110 + fs/xfs/xfs_arch.h | 136 + fs/xfs/xfs_attr.c | 2230 ++++++ fs/xfs/xfs_attr.h | 146 + fs/xfs/xfs_attr_leaf.c | 2979 +++++++ fs/xfs/xfs_attr_leaf.h | 265 + fs/xfs/xfs_attr_sf.h | 70 + fs/xfs/xfs_bit.c | 120 + fs/xfs/xfs_bit.h | 84 + fs/xfs/xfs_bmap.c | 6139 +++++++++++++++ fs/xfs/xfs_bmap.h | 402 + fs/xfs/xfs_bmap_btree.c | 919 +++ fs/xfs/xfs_bmap_btree.h | 240 + fs/xfs/xfs_btree.c | 3679 +++++++++ fs/xfs/xfs_btree.h | 455 ++ fs/xfs/xfs_btree_trace.c | 249 + fs/xfs/xfs_btree_trace.h | 99 + fs/xfs/xfs_buf_item.c | 1064 +++ fs/xfs/xfs_buf_item.h | 130 + fs/xfs/xfs_da_btree.c | 2463 ++++++ fs/xfs/xfs_da_btree.h | 281 + fs/xfs/xfs_dfrag.c | 457 ++ fs/xfs/xfs_dfrag.h | 53 + fs/xfs/xfs_dinode.h | 216 + fs/xfs/xfs_dir2.c | 764 ++ fs/xfs/xfs_dir2.h | 106 + fs/xfs/xfs_dir2_block.c | 1233 +++ fs/xfs/xfs_dir2_block.h | 92 + fs/xfs/xfs_dir2_data.c | 833 ++ fs/xfs/xfs_dir2_data.h | 184 + fs/xfs/xfs_dir2_leaf.c | 1881 +++++ fs/xfs/xfs_dir2_leaf.h | 253 + fs/xfs/xfs_dir2_node.c | 2076 +++++ fs/xfs/xfs_dir2_node.h | 100 + fs/xfs/xfs_dir2_sf.c | 1267 +++ fs/xfs/xfs_dir2_sf.h | 171 + fs/xfs/xfs_error.c | 185 + fs/xfs/xfs_error.h | 161 + fs/xfs/xfs_extfree_item.c | 520 ++ fs/xfs/xfs_extfree_item.h | 161 + fs/xfs/xfs_filestream.c | 824 ++ fs/xfs/xfs_filestream.h | 74 + fs/xfs/xfs_fs.h | 501 ++ fs/xfs/xfs_fsops.c | 671 ++ fs/xfs/xfs_fsops.h | 30 + fs/xfs/xfs_ialloc.c | 1563 ++++ fs/xfs/xfs_ialloc.h | 164 + fs/xfs/xfs_ialloc_btree.c | 346 + fs/xfs/xfs_ialloc_btree.h | 112 + fs/xfs/xfs_iget.c | 727 ++ fs/xfs/xfs_inode.c | 4145 ++++++++++ fs/xfs/xfs_inode.h | 600 ++ fs/xfs/xfs_inode_item.c | 1060 +++ fs/xfs/xfs_inode_item.h | 166 + fs/xfs/xfs_inum.h | 80 + fs/xfs/xfs_iomap.c | 748 ++ fs/xfs/xfs_iomap.h | 32 + fs/xfs/xfs_itable.c | 736 ++ fs/xfs/xfs_itable.h | 106 + fs/xfs/xfs_log.c | 3767 +++++++++ fs/xfs/xfs_log.h | 198 + fs/xfs/xfs_log_cil.c | 804 ++ fs/xfs/xfs_log_priv.h | 668 ++ fs/xfs/xfs_log_recover.c | 3843 +++++++++ fs/xfs/xfs_log_recover.h | 66 + fs/xfs/xfs_mount.c | 2585 ++++++ fs/xfs/xfs_mount.h | 402 + fs/xfs/xfs_mru_cache.c | 576 ++ fs/xfs/xfs_mru_cache.h | 53 + fs/xfs/xfs_quota.h | 390 + fs/xfs/xfs_rename.c | 358 + fs/xfs/xfs_rtalloc.c | 2325 ++++++ fs/xfs/xfs_rtalloc.h | 166 + fs/xfs/xfs_rw.c | 175 + fs/xfs/xfs_rw.h | 49 + fs/xfs/xfs_sb.h | 543 ++ fs/xfs/xfs_trans.c | 2026 +++++ fs/xfs/xfs_trans.h | 506 ++ fs/xfs/xfs_trans_ail.c | 890 +++ fs/xfs/xfs_trans_buf.c | 879 +++ fs/xfs/xfs_trans_extfree.c | 134 + fs/xfs/xfs_trans_inode.c | 190 + fs/xfs/xfs_trans_priv.h | 143 + fs/xfs/xfs_trans_space.h | 86 + fs/xfs/xfs_types.h | 156 + fs/xfs/xfs_utils.c | 321 + fs/xfs/xfs_utils.h | 27 + fs/xfs/xfs_vnodeops.c | 2852 +++++++ fs/xfs/xfs_vnodeops.h | 63 + fs/yaffs2/Kconfig | 161 + fs/yaffs2/Makefile | 17 + fs/yaffs2/yaffs_allocator.c | 396 + fs/yaffs2/yaffs_allocator.h | 30 + fs/yaffs2/yaffs_attribs.c | 124 + fs/yaffs2/yaffs_attribs.h | 28 + fs/yaffs2/yaffs_bitmap.c | 98 + fs/yaffs2/yaffs_bitmap.h | 33 + fs/yaffs2/yaffs_checkptrw.c | 415 + fs/yaffs2/yaffs_checkptrw.h | 33 + fs/yaffs2/yaffs_ecc.c | 298 + fs/yaffs2/yaffs_ecc.h | 44 + fs/yaffs2/yaffs_getblockinfo.h | 35 + fs/yaffs2/yaffs_guts.c | 5164 ++++++++++++ fs/yaffs2/yaffs_guts.h | 915 +++ fs/yaffs2/yaffs_linux.h | 41 + fs/yaffs2/yaffs_mtdif.c | 54 + fs/yaffs2/yaffs_mtdif.h | 23 + fs/yaffs2/yaffs_mtdif1.c | 330 + fs/yaffs2/yaffs_mtdif1.h | 29 + fs/yaffs2/yaffs_mtdif2.c | 225 + fs/yaffs2/yaffs_mtdif2.h | 29 + fs/yaffs2/yaffs_nameval.c | 201 + fs/yaffs2/yaffs_nameval.h 
| 28 + fs/yaffs2/yaffs_nand.c | 127 + fs/yaffs2/yaffs_nand.h | 38 + fs/yaffs2/yaffs_packedtags1.c | 53 + fs/yaffs2/yaffs_packedtags1.h | 39 + fs/yaffs2/yaffs_packedtags2.c | 196 + fs/yaffs2/yaffs_packedtags2.h | 47 + fs/yaffs2/yaffs_tagscompat.c | 422 + fs/yaffs2/yaffs_tagscompat.h | 36 + fs/yaffs2/yaffs_tagsvalidity.c | 27 + fs/yaffs2/yaffs_tagsvalidity.h | 23 + fs/yaffs2/yaffs_trace.h | 57 + fs/yaffs2/yaffs_verify.c | 535 ++ fs/yaffs2/yaffs_verify.h | 43 + fs/yaffs2/yaffs_vfs.c | 2792 +++++++ fs/yaffs2/yaffs_yaffs1.c | 433 + fs/yaffs2/yaffs_yaffs1.h | 22 + fs/yaffs2/yaffs_yaffs2.c | 1598 ++++ fs/yaffs2/yaffs_yaffs2.h | 39 + fs/yaffs2/yportenv.h | 70 + 1680 files changed, 1011662 insertions(+) create mode 100644 fs/9p/Kconfig create mode 100644 fs/9p/Makefile create mode 100644 fs/9p/acl.c create mode 100644 fs/9p/acl.h create mode 100644 fs/9p/cache.c create mode 100644 fs/9p/cache.h create mode 100644 fs/9p/fid.c create mode 100644 fs/9p/fid.h create mode 100644 fs/9p/v9fs.c create mode 100644 fs/9p/v9fs.h create mode 100644 fs/9p/v9fs_vfs.h create mode 100644 fs/9p/vfs_addr.c create mode 100644 fs/9p/vfs_dentry.c create mode 100644 fs/9p/vfs_dir.c create mode 100644 fs/9p/vfs_file.c create mode 100644 fs/9p/vfs_inode.c create mode 100644 fs/9p/vfs_inode_dotl.c create mode 100644 fs/9p/vfs_super.c create mode 100644 fs/9p/xattr.c create mode 100644 fs/9p/xattr.h create mode 100644 fs/9p/xattr_user.c create mode 100644 fs/Kconfig create mode 100644 fs/Kconfig.binfmt create mode 100644 fs/Makefile create mode 100644 fs/adfs/Kconfig create mode 100644 fs/adfs/Makefile create mode 100644 fs/adfs/adfs.h create mode 100644 fs/adfs/dir.c create mode 100644 fs/adfs/dir_f.c create mode 100644 fs/adfs/dir_f.h create mode 100644 fs/adfs/dir_fplus.c create mode 100644 fs/adfs/dir_fplus.h create mode 100644 fs/adfs/file.c create mode 100644 fs/adfs/inode.c create mode 100644 fs/adfs/map.c create mode 100644 fs/adfs/super.c create mode 100644 fs/affs/Changes create mode 100644 fs/affs/Kconfig create mode 100644 fs/affs/Makefile create mode 100644 fs/affs/affs.h create mode 100644 fs/affs/amigaffs.c create mode 100644 fs/affs/bitmap.c create mode 100644 fs/affs/dir.c create mode 100644 fs/affs/file.c create mode 100644 fs/affs/inode.c create mode 100644 fs/affs/namei.c create mode 100644 fs/affs/super.c create mode 100644 fs/affs/symlink.c create mode 100644 fs/afs/Kconfig create mode 100644 fs/afs/Makefile create mode 100644 fs/afs/afs.h create mode 100644 fs/afs/afs_cm.h create mode 100644 fs/afs/afs_fs.h create mode 100644 fs/afs/afs_vl.h create mode 100644 fs/afs/cache.c create mode 100644 fs/afs/callback.c create mode 100644 fs/afs/cell.c create mode 100644 fs/afs/cmservice.c create mode 100644 fs/afs/dir.c create mode 100644 fs/afs/file.c create mode 100644 fs/afs/flock.c create mode 100644 fs/afs/fsclient.c create mode 100644 fs/afs/inode.c create mode 100644 fs/afs/internal.h create mode 100644 fs/afs/main.c create mode 100644 fs/afs/misc.c create mode 100644 fs/afs/mntpt.c create mode 100644 fs/afs/netdevices.c create mode 100644 fs/afs/proc.c create mode 100644 fs/afs/rxrpc.c create mode 100644 fs/afs/security.c create mode 100644 fs/afs/server.c create mode 100644 fs/afs/super.c create mode 100644 fs/afs/vlclient.c create mode 100644 fs/afs/vlocation.c create mode 100644 fs/afs/vnode.c create mode 100644 fs/afs/volume.c create mode 100644 fs/afs/write.c create mode 100644 fs/aio.c create mode 100644 fs/anon_inodes.c create mode 100644 fs/attr.c create mode 100644 fs/autofs4/Kconfig create 
mode 100644 fs/autofs4/Makefile create mode 100644 fs/autofs4/autofs_i.h create mode 100644 fs/autofs4/dev-ioctl.c create mode 100644 fs/autofs4/expire.c create mode 100644 fs/autofs4/init.c create mode 100644 fs/autofs4/inode.c create mode 100644 fs/autofs4/root.c create mode 100644 fs/autofs4/symlink.c create mode 100644 fs/autofs4/waitq.c create mode 100644 fs/bad_inode.c create mode 100644 fs/befs/ChangeLog create mode 100644 fs/befs/Kconfig create mode 100644 fs/befs/Makefile create mode 100644 fs/befs/TODO create mode 100644 fs/befs/befs.h create mode 100644 fs/befs/befs_fs_types.h create mode 100644 fs/befs/btree.c create mode 100644 fs/befs/btree.h create mode 100644 fs/befs/datastream.c create mode 100644 fs/befs/datastream.h create mode 100644 fs/befs/debug.c create mode 100644 fs/befs/endian.h create mode 100644 fs/befs/inode.c create mode 100644 fs/befs/inode.h create mode 100644 fs/befs/io.c create mode 100644 fs/befs/io.h create mode 100644 fs/befs/linuxvfs.c create mode 100644 fs/befs/super.c create mode 100644 fs/befs/super.h create mode 100644 fs/bfs/Kconfig create mode 100644 fs/bfs/Makefile create mode 100644 fs/bfs/bfs.h create mode 100644 fs/bfs/dir.c create mode 100644 fs/bfs/file.c create mode 100644 fs/bfs/inode.c create mode 100644 fs/binfmt_aout.c create mode 100644 fs/binfmt_elf.c create mode 100644 fs/binfmt_elf_fdpic.c create mode 100644 fs/binfmt_em86.c create mode 100644 fs/binfmt_flat.c create mode 100644 fs/binfmt_misc.c create mode 100644 fs/binfmt_script.c create mode 100644 fs/binfmt_som.c create mode 100644 fs/bio-integrity.c create mode 100644 fs/bio.c create mode 100644 fs/block_dev.c create mode 100644 fs/btrfs/Kconfig create mode 100644 fs/btrfs/Makefile create mode 100644 fs/btrfs/acl.c create mode 100644 fs/btrfs/async-thread.c create mode 100644 fs/btrfs/async-thread.h create mode 100644 fs/btrfs/btrfs_inode.h create mode 100644 fs/btrfs/compat.h create mode 100644 fs/btrfs/compression.c create mode 100644 fs/btrfs/compression.h create mode 100644 fs/btrfs/ctree.c create mode 100644 fs/btrfs/ctree.h create mode 100644 fs/btrfs/delayed-inode.c create mode 100644 fs/btrfs/delayed-inode.h create mode 100644 fs/btrfs/delayed-ref.c create mode 100644 fs/btrfs/delayed-ref.h create mode 100644 fs/btrfs/dir-item.c create mode 100644 fs/btrfs/disk-io.c create mode 100644 fs/btrfs/disk-io.h create mode 100644 fs/btrfs/export.c create mode 100644 fs/btrfs/export.h create mode 100644 fs/btrfs/extent-tree.c create mode 100644 fs/btrfs/extent_io.c create mode 100644 fs/btrfs/extent_io.h create mode 100644 fs/btrfs/extent_map.c create mode 100644 fs/btrfs/extent_map.h create mode 100644 fs/btrfs/file-item.c create mode 100644 fs/btrfs/file.c create mode 100644 fs/btrfs/free-space-cache.c create mode 100644 fs/btrfs/free-space-cache.h create mode 100644 fs/btrfs/hash.h create mode 100644 fs/btrfs/inode-item.c create mode 100644 fs/btrfs/inode-map.c create mode 100644 fs/btrfs/inode-map.h create mode 100644 fs/btrfs/inode.c create mode 100644 fs/btrfs/ioctl.c create mode 100644 fs/btrfs/ioctl.h create mode 100644 fs/btrfs/locking.c create mode 100644 fs/btrfs/locking.h create mode 100644 fs/btrfs/lzo.c create mode 100644 fs/btrfs/ordered-data.c create mode 100644 fs/btrfs/ordered-data.h create mode 100644 fs/btrfs/orphan.c create mode 100644 fs/btrfs/print-tree.c create mode 100644 fs/btrfs/print-tree.h create mode 100644 fs/btrfs/ref-cache.c create mode 100644 fs/btrfs/ref-cache.h create mode 100644 fs/btrfs/relocation.c create mode 100644 fs/btrfs/root-tree.c 
create mode 100644 fs/btrfs/scrub.c create mode 100644 fs/btrfs/struct-funcs.c create mode 100644 fs/btrfs/super.c create mode 100644 fs/btrfs/sysfs.c create mode 100644 fs/btrfs/transaction.c create mode 100644 fs/btrfs/transaction.h create mode 100644 fs/btrfs/tree-defrag.c create mode 100644 fs/btrfs/tree-log.c create mode 100644 fs/btrfs/tree-log.h create mode 100644 fs/btrfs/version.h create mode 100644 fs/btrfs/volumes.c create mode 100644 fs/btrfs/volumes.h create mode 100644 fs/btrfs/xattr.c create mode 100644 fs/btrfs/xattr.h create mode 100644 fs/btrfs/zlib.c create mode 100644 fs/buffer.c create mode 100644 fs/cachefiles/Kconfig create mode 100644 fs/cachefiles/Makefile create mode 100644 fs/cachefiles/bind.c create mode 100644 fs/cachefiles/daemon.c create mode 100644 fs/cachefiles/interface.c create mode 100644 fs/cachefiles/internal.h create mode 100644 fs/cachefiles/key.c create mode 100644 fs/cachefiles/main.c create mode 100644 fs/cachefiles/namei.c create mode 100644 fs/cachefiles/proc.c create mode 100644 fs/cachefiles/rdwr.c create mode 100644 fs/cachefiles/security.c create mode 100644 fs/cachefiles/xattr.c create mode 100644 fs/ceph/Kconfig create mode 100644 fs/ceph/Makefile create mode 100644 fs/ceph/addr.c create mode 100644 fs/ceph/caps.c create mode 100644 fs/ceph/ceph_frag.c create mode 100644 fs/ceph/debugfs.c create mode 100644 fs/ceph/dir.c create mode 100644 fs/ceph/export.c create mode 100644 fs/ceph/file.c create mode 100644 fs/ceph/inode.c create mode 100644 fs/ceph/ioctl.c create mode 100644 fs/ceph/ioctl.h create mode 100644 fs/ceph/locks.c create mode 100644 fs/ceph/mds_client.c create mode 100644 fs/ceph/mds_client.h create mode 100644 fs/ceph/mdsmap.c create mode 100644 fs/ceph/snap.c create mode 100644 fs/ceph/strings.c create mode 100644 fs/ceph/super.c create mode 100644 fs/ceph/super.h create mode 100644 fs/ceph/xattr.c create mode 100644 fs/char_dev.c create mode 100644 fs/cifs/AUTHORS create mode 100644 fs/cifs/CHANGES create mode 100644 fs/cifs/Kconfig create mode 100644 fs/cifs/Makefile create mode 100644 fs/cifs/README create mode 100644 fs/cifs/TODO create mode 100644 fs/cifs/asn1.c create mode 100644 fs/cifs/cache.c create mode 100644 fs/cifs/cifs_debug.c create mode 100644 fs/cifs/cifs_debug.h create mode 100644 fs/cifs/cifs_dfs_ref.c create mode 100644 fs/cifs/cifs_fs_sb.h create mode 100644 fs/cifs/cifs_spnego.c create mode 100644 fs/cifs/cifs_spnego.h create mode 100644 fs/cifs/cifs_unicode.c create mode 100644 fs/cifs/cifs_unicode.h create mode 100644 fs/cifs/cifs_uniupr.h create mode 100644 fs/cifs/cifsacl.c create mode 100644 fs/cifs/cifsacl.h create mode 100644 fs/cifs/cifsencrypt.c create mode 100644 fs/cifs/cifsfs.c create mode 100644 fs/cifs/cifsfs.h create mode 100644 fs/cifs/cifsglob.h create mode 100644 fs/cifs/cifspdu.h create mode 100644 fs/cifs/cifsproto.h create mode 100644 fs/cifs/cifssmb.c create mode 100644 fs/cifs/connect.c create mode 100644 fs/cifs/dir.c create mode 100644 fs/cifs/dns_resolve.c create mode 100644 fs/cifs/dns_resolve.h create mode 100644 fs/cifs/export.c create mode 100644 fs/cifs/file.c create mode 100644 fs/cifs/fscache.c create mode 100644 fs/cifs/fscache.h create mode 100644 fs/cifs/inode.c create mode 100644 fs/cifs/ioctl.c create mode 100644 fs/cifs/link.c create mode 100644 fs/cifs/misc.c create mode 100644 fs/cifs/netmisc.c create mode 100644 fs/cifs/nterr.c create mode 100644 fs/cifs/nterr.h create mode 100644 fs/cifs/ntlmssp.h create mode 100644 fs/cifs/readdir.c create mode 100644 
fs/cifs/rfc1002pdu.h create mode 100644 fs/cifs/sess.c create mode 100644 fs/cifs/smbencrypt.c create mode 100644 fs/cifs/smberr.h create mode 100644 fs/cifs/smbfsctl.h create mode 100644 fs/cifs/transport.c create mode 100644 fs/cifs/xattr.c create mode 100644 fs/coda/Kconfig create mode 100644 fs/coda/Makefile create mode 100644 fs/coda/cache.c create mode 100644 fs/coda/cnode.c create mode 100644 fs/coda/coda_cache.h create mode 100644 fs/coda/coda_fs_i.h create mode 100644 fs/coda/coda_int.h create mode 100644 fs/coda/coda_linux.c create mode 100644 fs/coda/coda_linux.h create mode 100644 fs/coda/dir.c create mode 100644 fs/coda/file.c create mode 100644 fs/coda/inode.c create mode 100644 fs/coda/pioctl.c create mode 100644 fs/coda/psdev.c create mode 100644 fs/coda/symlink.c create mode 100644 fs/coda/sysctl.c create mode 100644 fs/coda/upcall.c create mode 100644 fs/compat.c create mode 100644 fs/compat_binfmt_elf.c create mode 100644 fs/compat_ioctl.c create mode 100644 fs/configfs/Kconfig create mode 100644 fs/configfs/Makefile create mode 100644 fs/configfs/configfs_internal.h create mode 100644 fs/configfs/dir.c create mode 100644 fs/configfs/file.c create mode 100644 fs/configfs/inode.c create mode 100644 fs/configfs/item.c create mode 100644 fs/configfs/mount.c create mode 100644 fs/configfs/symlink.c create mode 100644 fs/cramfs/Kconfig create mode 100644 fs/cramfs/Makefile create mode 100644 fs/cramfs/README create mode 100644 fs/cramfs/inode.c create mode 100644 fs/cramfs/uncompress.c create mode 100644 fs/dcache.c create mode 100644 fs/dcookies.c create mode 100644 fs/debugfs/Makefile create mode 100644 fs/debugfs/file.c create mode 100644 fs/debugfs/inode.c create mode 100644 fs/devpts/Makefile create mode 100644 fs/devpts/inode.c create mode 100644 fs/direct-io.c create mode 100644 fs/dlm/Kconfig create mode 100644 fs/dlm/Makefile create mode 100644 fs/dlm/ast.c create mode 100644 fs/dlm/ast.h create mode 100644 fs/dlm/config.c create mode 100644 fs/dlm/config.h create mode 100644 fs/dlm/debug_fs.c create mode 100644 fs/dlm/dir.c create mode 100644 fs/dlm/dir.h create mode 100644 fs/dlm/dlm_internal.h create mode 100644 fs/dlm/lock.c create mode 100644 fs/dlm/lock.h create mode 100644 fs/dlm/lockspace.c create mode 100644 fs/dlm/lockspace.h create mode 100644 fs/dlm/lowcomms.c create mode 100644 fs/dlm/lowcomms.h create mode 100644 fs/dlm/lvb_table.h create mode 100644 fs/dlm/main.c create mode 100644 fs/dlm/member.c create mode 100644 fs/dlm/member.h create mode 100644 fs/dlm/memory.c create mode 100644 fs/dlm/memory.h create mode 100644 fs/dlm/midcomms.c create mode 100644 fs/dlm/midcomms.h create mode 100644 fs/dlm/netlink.c create mode 100644 fs/dlm/plock.c create mode 100644 fs/dlm/rcom.c create mode 100644 fs/dlm/rcom.h create mode 100644 fs/dlm/recover.c create mode 100644 fs/dlm/recover.h create mode 100644 fs/dlm/recoverd.c create mode 100644 fs/dlm/recoverd.h create mode 100644 fs/dlm/requestqueue.c create mode 100644 fs/dlm/requestqueue.h create mode 100644 fs/dlm/user.c create mode 100644 fs/dlm/user.h create mode 100644 fs/dlm/util.c create mode 100644 fs/dlm/util.h create mode 100644 fs/drop_caches.c create mode 100644 fs/ecryptfs/Kconfig create mode 100644 fs/ecryptfs/Makefile create mode 100644 fs/ecryptfs/crypto.c create mode 100644 fs/ecryptfs/debug.c create mode 100644 fs/ecryptfs/dentry.c create mode 100644 fs/ecryptfs/ecryptfs_kernel.h create mode 100644 fs/ecryptfs/file.c create mode 100644 fs/ecryptfs/inode.c create mode 100644 
fs/ecryptfs/keystore.c create mode 100644 fs/ecryptfs/kthread.c create mode 100644 fs/ecryptfs/main.c create mode 100644 fs/ecryptfs/messaging.c create mode 100644 fs/ecryptfs/miscdev.c create mode 100644 fs/ecryptfs/mmap.c create mode 100644 fs/ecryptfs/read_write.c create mode 100644 fs/ecryptfs/super.c create mode 100644 fs/efs/Kconfig create mode 100644 fs/efs/Makefile create mode 100644 fs/efs/dir.c create mode 100644 fs/efs/efs.h create mode 100644 fs/efs/file.c create mode 100644 fs/efs/inode.c create mode 100644 fs/efs/namei.c create mode 100644 fs/efs/super.c create mode 100644 fs/efs/symlink.c create mode 100644 fs/eventfd.c create mode 100644 fs/eventpoll.c create mode 100644 fs/exec.c create mode 100644 fs/exofs/BUGS create mode 100644 fs/exofs/Kbuild create mode 100644 fs/exofs/Kconfig create mode 100644 fs/exofs/common.h create mode 100644 fs/exofs/dir.c create mode 100644 fs/exofs/exofs.h create mode 100644 fs/exofs/file.c create mode 100644 fs/exofs/inode.c create mode 100644 fs/exofs/ios.c create mode 100644 fs/exofs/namei.c create mode 100644 fs/exofs/pnfs.h create mode 100644 fs/exofs/super.c create mode 100644 fs/exofs/symlink.c create mode 100644 fs/exportfs/Makefile create mode 100644 fs/exportfs/expfs.c create mode 100644 fs/ext2/Kconfig create mode 100644 fs/ext2/Makefile create mode 100644 fs/ext2/acl.c create mode 100644 fs/ext2/acl.h create mode 100644 fs/ext2/balloc.c create mode 100644 fs/ext2/dir.c create mode 100644 fs/ext2/ext2.h create mode 100644 fs/ext2/file.c create mode 100644 fs/ext2/ialloc.c create mode 100644 fs/ext2/inode.c create mode 100644 fs/ext2/ioctl.c create mode 100644 fs/ext2/namei.c create mode 100644 fs/ext2/super.c create mode 100644 fs/ext2/symlink.c create mode 100644 fs/ext2/xattr.c create mode 100644 fs/ext2/xattr.h create mode 100644 fs/ext2/xattr_security.c create mode 100644 fs/ext2/xattr_trusted.c create mode 100644 fs/ext2/xattr_user.c create mode 100644 fs/ext2/xip.c create mode 100644 fs/ext2/xip.h create mode 100644 fs/ext3/Kconfig create mode 100644 fs/ext3/Makefile create mode 100644 fs/ext3/acl.c create mode 100644 fs/ext3/acl.h create mode 100644 fs/ext3/balloc.c create mode 100644 fs/ext3/bitmap.c create mode 100644 fs/ext3/dir.c create mode 100644 fs/ext3/ext3_jbd.c create mode 100644 fs/ext3/file.c create mode 100644 fs/ext3/fsync.c create mode 100644 fs/ext3/hash.c create mode 100644 fs/ext3/ialloc.c create mode 100644 fs/ext3/inode.c create mode 100644 fs/ext3/ioctl.c create mode 100644 fs/ext3/namei.c create mode 100644 fs/ext3/namei.h create mode 100644 fs/ext3/resize.c create mode 100644 fs/ext3/super.c create mode 100644 fs/ext3/symlink.c create mode 100644 fs/ext3/xattr.c create mode 100644 fs/ext3/xattr.h create mode 100644 fs/ext3/xattr_security.c create mode 100644 fs/ext3/xattr_trusted.c create mode 100644 fs/ext3/xattr_user.c create mode 100644 fs/ext4/Kconfig create mode 100644 fs/ext4/Makefile create mode 100644 fs/ext4/acl.c create mode 100644 fs/ext4/acl.h create mode 100644 fs/ext4/balloc.c create mode 100644 fs/ext4/bitmap.c create mode 100644 fs/ext4/block_validity.c create mode 100644 fs/ext4/dir.c create mode 100644 fs/ext4/ext4.h create mode 100644 fs/ext4/ext4_extents.h create mode 100644 fs/ext4/ext4_jbd2.c create mode 100644 fs/ext4/ext4_jbd2.h create mode 100644 fs/ext4/extents.c create mode 100644 fs/ext4/file.c create mode 100644 fs/ext4/fsync.c create mode 100644 fs/ext4/hash.c create mode 100644 fs/ext4/ialloc.c create mode 100644 fs/ext4/inode.c create mode 100644 fs/ext4/ioctl.c create 
mode 100644 fs/ext4/mballoc.c create mode 100644 fs/ext4/mballoc.h create mode 100644 fs/ext4/migrate.c create mode 100644 fs/ext4/mmp.c create mode 100644 fs/ext4/move_extent.c create mode 100644 fs/ext4/namei.c create mode 100644 fs/ext4/page-io.c create mode 100644 fs/ext4/resize.c create mode 100644 fs/ext4/super.c create mode 100644 fs/ext4/symlink.c create mode 100644 fs/ext4/xattr.c create mode 100644 fs/ext4/xattr.h create mode 100644 fs/ext4/xattr_security.c create mode 100644 fs/ext4/xattr_trusted.c create mode 100644 fs/ext4/xattr_user.c create mode 100644 fs/fat/Kconfig create mode 100644 fs/fat/Makefile create mode 100644 fs/fat/cache.c create mode 100644 fs/fat/dir.c create mode 100644 fs/fat/fat.h create mode 100644 fs/fat/fatent.c create mode 100644 fs/fat/file.c create mode 100644 fs/fat/inode.c create mode 100644 fs/fat/misc.c create mode 100644 fs/fat/namei_msdos.c create mode 100644 fs/fat/namei_vfat.c create mode 100644 fs/fcntl.c create mode 100644 fs/fhandle.c create mode 100644 fs/fifo.c create mode 100644 fs/file.c create mode 100644 fs/file_table.c create mode 100644 fs/filesystems.c create mode 100644 fs/freevxfs/Kconfig create mode 100644 fs/freevxfs/Makefile create mode 100644 fs/freevxfs/vxfs.h create mode 100644 fs/freevxfs/vxfs_bmap.c create mode 100644 fs/freevxfs/vxfs_dir.h create mode 100644 fs/freevxfs/vxfs_extern.h create mode 100644 fs/freevxfs/vxfs_fshead.c create mode 100644 fs/freevxfs/vxfs_fshead.h create mode 100644 fs/freevxfs/vxfs_immed.c create mode 100644 fs/freevxfs/vxfs_inode.c create mode 100644 fs/freevxfs/vxfs_inode.h create mode 100644 fs/freevxfs/vxfs_lookup.c create mode 100644 fs/freevxfs/vxfs_olt.c create mode 100644 fs/freevxfs/vxfs_olt.h create mode 100644 fs/freevxfs/vxfs_subr.c create mode 100644 fs/freevxfs/vxfs_super.c create mode 100644 fs/fs-writeback.c create mode 100644 fs/fs_struct.c create mode 100644 fs/fscache/Kconfig create mode 100644 fs/fscache/Makefile create mode 100644 fs/fscache/cache.c create mode 100644 fs/fscache/cookie.c create mode 100644 fs/fscache/fsdef.c create mode 100644 fs/fscache/histogram.c create mode 100644 fs/fscache/internal.h create mode 100644 fs/fscache/main.c create mode 100644 fs/fscache/netfs.c create mode 100644 fs/fscache/object-list.c create mode 100644 fs/fscache/object.c create mode 100644 fs/fscache/operation.c create mode 100644 fs/fscache/page.c create mode 100644 fs/fscache/proc.c create mode 100644 fs/fscache/stats.c create mode 100644 fs/fuse/Kconfig create mode 100644 fs/fuse/Makefile create mode 100644 fs/fuse/control.c create mode 100644 fs/fuse/cuse.c create mode 100644 fs/fuse/dev.c create mode 100644 fs/fuse/dir.c create mode 100644 fs/fuse/file.c create mode 100644 fs/fuse/fuse_i.h create mode 100644 fs/fuse/inode.c create mode 100644 fs/generic_acl.c create mode 100644 fs/gfs2/Kconfig create mode 100644 fs/gfs2/Makefile create mode 100644 fs/gfs2/acl.c create mode 100644 fs/gfs2/acl.h create mode 100644 fs/gfs2/aops.c create mode 100644 fs/gfs2/bmap.c create mode 100644 fs/gfs2/bmap.h create mode 100644 fs/gfs2/dentry.c create mode 100644 fs/gfs2/dir.c create mode 100644 fs/gfs2/dir.h create mode 100644 fs/gfs2/export.c create mode 100644 fs/gfs2/file.c create mode 100644 fs/gfs2/gfs2.h create mode 100644 fs/gfs2/glock.c create mode 100644 fs/gfs2/glock.h create mode 100644 fs/gfs2/glops.c create mode 100644 fs/gfs2/glops.h create mode 100644 fs/gfs2/incore.h create mode 100644 fs/gfs2/inode.c create mode 100644 fs/gfs2/inode.h create mode 100644 fs/gfs2/lock_dlm.c create 
mode 100644 fs/gfs2/log.c create mode 100644 fs/gfs2/log.h create mode 100644 fs/gfs2/lops.c create mode 100644 fs/gfs2/lops.h create mode 100644 fs/gfs2/main.c create mode 100644 fs/gfs2/meta_io.c create mode 100644 fs/gfs2/meta_io.h create mode 100644 fs/gfs2/ops_fstype.c create mode 100644 fs/gfs2/quota.c create mode 100644 fs/gfs2/quota.h create mode 100644 fs/gfs2/recovery.c create mode 100644 fs/gfs2/recovery.h create mode 100644 fs/gfs2/rgrp.c create mode 100644 fs/gfs2/rgrp.h create mode 100644 fs/gfs2/super.c create mode 100644 fs/gfs2/super.h create mode 100644 fs/gfs2/sys.c create mode 100644 fs/gfs2/sys.h create mode 100644 fs/gfs2/trace_gfs2.h create mode 100644 fs/gfs2/trans.c create mode 100644 fs/gfs2/trans.h create mode 100644 fs/gfs2/util.c create mode 100644 fs/gfs2/util.h create mode 100644 fs/gfs2/xattr.c create mode 100644 fs/gfs2/xattr.h create mode 100644 fs/hfs/Kconfig create mode 100644 fs/hfs/Makefile create mode 100644 fs/hfs/attr.c create mode 100644 fs/hfs/bfind.c create mode 100644 fs/hfs/bitmap.c create mode 100644 fs/hfs/bnode.c create mode 100644 fs/hfs/brec.c create mode 100644 fs/hfs/btree.c create mode 100644 fs/hfs/btree.h create mode 100644 fs/hfs/catalog.c create mode 100644 fs/hfs/dir.c create mode 100644 fs/hfs/extent.c create mode 100644 fs/hfs/hfs.h create mode 100644 fs/hfs/hfs_fs.h create mode 100644 fs/hfs/inode.c create mode 100644 fs/hfs/mdb.c create mode 100644 fs/hfs/part_tbl.c create mode 100644 fs/hfs/string.c create mode 100644 fs/hfs/super.c create mode 100644 fs/hfs/sysdep.c create mode 100644 fs/hfs/trans.c create mode 100644 fs/hfsplus/Kconfig create mode 100644 fs/hfsplus/Makefile create mode 100644 fs/hfsplus/bfind.c create mode 100644 fs/hfsplus/bitmap.c create mode 100644 fs/hfsplus/bnode.c create mode 100644 fs/hfsplus/brec.c create mode 100644 fs/hfsplus/btree.c create mode 100644 fs/hfsplus/catalog.c create mode 100644 fs/hfsplus/dir.c create mode 100644 fs/hfsplus/extents.c create mode 100644 fs/hfsplus/hfsplus_fs.h create mode 100644 fs/hfsplus/hfsplus_raw.h create mode 100644 fs/hfsplus/inode.c create mode 100644 fs/hfsplus/ioctl.c create mode 100644 fs/hfsplus/options.c create mode 100644 fs/hfsplus/part_tbl.c create mode 100644 fs/hfsplus/super.c create mode 100644 fs/hfsplus/tables.c create mode 100644 fs/hfsplus/unicode.c create mode 100644 fs/hfsplus/wrapper.c create mode 100644 fs/hostfs/Makefile create mode 100644 fs/hostfs/hostfs.h create mode 100644 fs/hostfs/hostfs_kern.c create mode 100644 fs/hostfs/hostfs_user.c create mode 100644 fs/hpfs/Kconfig create mode 100644 fs/hpfs/Makefile create mode 100644 fs/hpfs/alloc.c create mode 100644 fs/hpfs/anode.c create mode 100644 fs/hpfs/buffer.c create mode 100644 fs/hpfs/dentry.c create mode 100644 fs/hpfs/dir.c create mode 100644 fs/hpfs/dnode.c create mode 100644 fs/hpfs/ea.c create mode 100644 fs/hpfs/file.c create mode 100644 fs/hpfs/hpfs.h create mode 100644 fs/hpfs/hpfs_fn.h create mode 100644 fs/hpfs/inode.c create mode 100644 fs/hpfs/map.c create mode 100644 fs/hpfs/name.c create mode 100644 fs/hpfs/namei.c create mode 100644 fs/hpfs/super.c create mode 100644 fs/hppfs/Makefile create mode 100644 fs/hppfs/hppfs.c create mode 100644 fs/hugetlbfs/Makefile create mode 100644 fs/hugetlbfs/inode.c create mode 100644 fs/inode.c create mode 100644 fs/internal.h create mode 100644 fs/ioctl.c create mode 100644 fs/ioprio.c create mode 100644 fs/isofs/Kconfig create mode 100644 fs/isofs/Makefile create mode 100644 fs/isofs/compress.c create mode 100644 fs/isofs/dir.c 
create mode 100644 fs/isofs/export.c create mode 100644 fs/isofs/inode.c create mode 100644 fs/isofs/isofs.h create mode 100644 fs/isofs/joliet.c create mode 100644 fs/isofs/namei.c create mode 100644 fs/isofs/rock.c create mode 100644 fs/isofs/rock.h create mode 100644 fs/isofs/util.c create mode 100644 fs/isofs/zisofs.h create mode 100644 fs/jbd/Kconfig create mode 100644 fs/jbd/Makefile create mode 100644 fs/jbd/checkpoint.c create mode 100644 fs/jbd/commit.c create mode 100644 fs/jbd/journal.c create mode 100644 fs/jbd/recovery.c create mode 100644 fs/jbd/revoke.c create mode 100644 fs/jbd/transaction.c create mode 100644 fs/jbd2/Kconfig create mode 100644 fs/jbd2/Makefile create mode 100644 fs/jbd2/checkpoint.c create mode 100644 fs/jbd2/commit.c create mode 100644 fs/jbd2/journal.c create mode 100644 fs/jbd2/recovery.c create mode 100644 fs/jbd2/revoke.c create mode 100644 fs/jbd2/transaction.c create mode 100644 fs/jffs2/Kconfig create mode 100644 fs/jffs2/LICENCE create mode 100644 fs/jffs2/Makefile create mode 100644 fs/jffs2/README.Locking create mode 100644 fs/jffs2/TODO create mode 100644 fs/jffs2/acl.c create mode 100644 fs/jffs2/acl.h create mode 100644 fs/jffs2/background.c create mode 100644 fs/jffs2/build.c create mode 100644 fs/jffs2/compr.c create mode 100644 fs/jffs2/compr.h create mode 100644 fs/jffs2/compr_lzo.c create mode 100644 fs/jffs2/compr_rtime.c create mode 100644 fs/jffs2/compr_rubin.c create mode 100644 fs/jffs2/compr_zlib.c create mode 100644 fs/jffs2/debug.c create mode 100644 fs/jffs2/debug.h create mode 100644 fs/jffs2/dir.c create mode 100644 fs/jffs2/erase.c create mode 100644 fs/jffs2/file.c create mode 100644 fs/jffs2/fs.c create mode 100644 fs/jffs2/gc.c create mode 100644 fs/jffs2/ioctl.c create mode 100644 fs/jffs2/jffs2_fs_i.h create mode 100644 fs/jffs2/jffs2_fs_sb.h create mode 100644 fs/jffs2/malloc.c create mode 100644 fs/jffs2/nodelist.c create mode 100644 fs/jffs2/nodelist.h create mode 100644 fs/jffs2/nodemgmt.c create mode 100644 fs/jffs2/os-linux.h create mode 100644 fs/jffs2/read.c create mode 100644 fs/jffs2/readinode.c create mode 100644 fs/jffs2/scan.c create mode 100644 fs/jffs2/security.c create mode 100644 fs/jffs2/summary.c create mode 100644 fs/jffs2/summary.h create mode 100644 fs/jffs2/super.c create mode 100644 fs/jffs2/symlink.c create mode 100644 fs/jffs2/wbuf.c create mode 100644 fs/jffs2/write.c create mode 100644 fs/jffs2/writev.c create mode 100644 fs/jffs2/xattr.c create mode 100644 fs/jffs2/xattr.h create mode 100644 fs/jffs2/xattr_trusted.c create mode 100644 fs/jffs2/xattr_user.c create mode 100644 fs/jfs/Kconfig create mode 100644 fs/jfs/Makefile create mode 100644 fs/jfs/acl.c create mode 100644 fs/jfs/endian24.h create mode 100644 fs/jfs/file.c create mode 100644 fs/jfs/inode.c create mode 100644 fs/jfs/ioctl.c create mode 100644 fs/jfs/jfs_acl.h create mode 100644 fs/jfs/jfs_btree.h create mode 100644 fs/jfs/jfs_debug.c create mode 100644 fs/jfs/jfs_debug.h create mode 100644 fs/jfs/jfs_dinode.h create mode 100644 fs/jfs/jfs_dmap.c create mode 100644 fs/jfs/jfs_dmap.h create mode 100644 fs/jfs/jfs_dtree.c create mode 100644 fs/jfs/jfs_dtree.h create mode 100644 fs/jfs/jfs_extent.c create mode 100644 fs/jfs/jfs_extent.h create mode 100644 fs/jfs/jfs_filsys.h create mode 100644 fs/jfs/jfs_imap.c create mode 100644 fs/jfs/jfs_imap.h create mode 100644 fs/jfs/jfs_incore.h create mode 100644 fs/jfs/jfs_inode.c create mode 100644 fs/jfs/jfs_inode.h create mode 100644 fs/jfs/jfs_lock.h create mode 100644 
fs/jfs/jfs_logmgr.c create mode 100644 fs/jfs/jfs_logmgr.h create mode 100644 fs/jfs/jfs_metapage.c create mode 100644 fs/jfs/jfs_metapage.h create mode 100644 fs/jfs/jfs_mount.c create mode 100644 fs/jfs/jfs_superblock.h create mode 100644 fs/jfs/jfs_txnmgr.c create mode 100644 fs/jfs/jfs_txnmgr.h create mode 100644 fs/jfs/jfs_types.h create mode 100644 fs/jfs/jfs_umount.c create mode 100644 fs/jfs/jfs_unicode.c create mode 100644 fs/jfs/jfs_unicode.h create mode 100644 fs/jfs/jfs_uniupr.c create mode 100644 fs/jfs/jfs_xattr.h create mode 100644 fs/jfs/jfs_xtree.c create mode 100644 fs/jfs/jfs_xtree.h create mode 100644 fs/jfs/namei.c create mode 100644 fs/jfs/resize.c create mode 100644 fs/jfs/super.c create mode 100644 fs/jfs/symlink.c create mode 100644 fs/jfs/xattr.c create mode 100644 fs/libfs.c create mode 100644 fs/lockd/Makefile create mode 100644 fs/lockd/clnt4xdr.c create mode 100644 fs/lockd/clntlock.c create mode 100644 fs/lockd/clntproc.c create mode 100644 fs/lockd/clntxdr.c create mode 100644 fs/lockd/grace.c create mode 100644 fs/lockd/host.c create mode 100644 fs/lockd/mon.c create mode 100644 fs/lockd/svc.c create mode 100644 fs/lockd/svc4proc.c create mode 100644 fs/lockd/svclock.c create mode 100644 fs/lockd/svcproc.c create mode 100644 fs/lockd/svcshare.c create mode 100644 fs/lockd/svcsubs.c create mode 100644 fs/lockd/xdr.c create mode 100644 fs/lockd/xdr4.c create mode 100644 fs/locks.c create mode 100644 fs/logfs/Kconfig create mode 100644 fs/logfs/Makefile create mode 100644 fs/logfs/compr.c create mode 100644 fs/logfs/dev_bdev.c create mode 100644 fs/logfs/dev_mtd.c create mode 100644 fs/logfs/dir.c create mode 100644 fs/logfs/file.c create mode 100644 fs/logfs/gc.c create mode 100644 fs/logfs/inode.c create mode 100644 fs/logfs/journal.c create mode 100644 fs/logfs/logfs.h create mode 100644 fs/logfs/logfs_abi.h create mode 100644 fs/logfs/readwrite.c create mode 100644 fs/logfs/segment.c create mode 100644 fs/logfs/super.c create mode 100644 fs/mbcache.c create mode 100644 fs/minix/Kconfig create mode 100644 fs/minix/Makefile create mode 100644 fs/minix/bitmap.c create mode 100644 fs/minix/dir.c create mode 100644 fs/minix/file.c create mode 100644 fs/minix/inode.c create mode 100644 fs/minix/itree_common.c create mode 100644 fs/minix/itree_v1.c create mode 100644 fs/minix/itree_v2.c create mode 100644 fs/minix/minix.h create mode 100644 fs/minix/namei.c create mode 100644 fs/mpage.c create mode 100644 fs/namei.c create mode 100644 fs/namespace.c create mode 100644 fs/ncpfs/Kconfig create mode 100644 fs/ncpfs/Makefile create mode 100644 fs/ncpfs/dir.c create mode 100644 fs/ncpfs/file.c create mode 100644 fs/ncpfs/getopt.c create mode 100644 fs/ncpfs/getopt.h create mode 100644 fs/ncpfs/inode.c create mode 100644 fs/ncpfs/ioctl.c create mode 100644 fs/ncpfs/mmap.c create mode 100644 fs/ncpfs/ncp_fs.h create mode 100644 fs/ncpfs/ncp_fs_i.h create mode 100644 fs/ncpfs/ncp_fs_sb.h create mode 100644 fs/ncpfs/ncplib_kernel.c create mode 100644 fs/ncpfs/ncplib_kernel.h create mode 100644 fs/ncpfs/ncpsign_kernel.c create mode 100644 fs/ncpfs/ncpsign_kernel.h create mode 100644 fs/ncpfs/sock.c create mode 100644 fs/ncpfs/symlink.c create mode 100644 fs/nfs/Kconfig create mode 100644 fs/nfs/Makefile create mode 100644 fs/nfs/cache_lib.c create mode 100644 fs/nfs/cache_lib.h create mode 100644 fs/nfs/callback.c create mode 100644 fs/nfs/callback.h create mode 100644 fs/nfs/callback_proc.c create mode 100644 fs/nfs/callback_xdr.c create mode 100644 fs/nfs/client.c create 
mode 100644 fs/nfs/delegation.c create mode 100644 fs/nfs/delegation.h create mode 100644 fs/nfs/dir.c create mode 100644 fs/nfs/direct.c create mode 100644 fs/nfs/dns_resolve.c create mode 100644 fs/nfs/dns_resolve.h create mode 100644 fs/nfs/file.c create mode 100644 fs/nfs/fscache-index.c create mode 100644 fs/nfs/fscache.c create mode 100644 fs/nfs/fscache.h create mode 100644 fs/nfs/getroot.c create mode 100644 fs/nfs/idmap.c create mode 100644 fs/nfs/inode.c create mode 100644 fs/nfs/internal.h create mode 100644 fs/nfs/iostat.h create mode 100644 fs/nfs/mount_clnt.c create mode 100644 fs/nfs/namespace.c create mode 100644 fs/nfs/nfs2xdr.c create mode 100644 fs/nfs/nfs3acl.c create mode 100644 fs/nfs/nfs3proc.c create mode 100644 fs/nfs/nfs3xdr.c create mode 100644 fs/nfs/nfs4_fs.h create mode 100644 fs/nfs/nfs4filelayout.c create mode 100644 fs/nfs/nfs4filelayout.h create mode 100644 fs/nfs/nfs4filelayoutdev.c create mode 100644 fs/nfs/nfs4namespace.c create mode 100644 fs/nfs/nfs4proc.c create mode 100644 fs/nfs/nfs4renewd.c create mode 100644 fs/nfs/nfs4state.c create mode 100644 fs/nfs/nfs4xdr.c create mode 100644 fs/nfs/nfsroot.c create mode 100644 fs/nfs/objlayout/Kbuild create mode 100644 fs/nfs/objlayout/objio_osd.c create mode 100644 fs/nfs/objlayout/objlayout.c create mode 100644 fs/nfs/objlayout/objlayout.h create mode 100644 fs/nfs/objlayout/pnfs_osd_xdr_cli.c create mode 100644 fs/nfs/pagelist.c create mode 100644 fs/nfs/pnfs.c create mode 100644 fs/nfs/pnfs.h create mode 100644 fs/nfs/pnfs_dev.c create mode 100644 fs/nfs/proc.c create mode 100644 fs/nfs/read.c create mode 100644 fs/nfs/super.c create mode 100644 fs/nfs/symlink.c create mode 100644 fs/nfs/sysctl.c create mode 100644 fs/nfs/unlink.c create mode 100644 fs/nfs/write.c create mode 100644 fs/nfs_common/Makefile create mode 100644 fs/nfs_common/nfsacl.c create mode 100644 fs/nfsctl.c create mode 100644 fs/nfsd/Kconfig create mode 100644 fs/nfsd/Makefile create mode 100644 fs/nfsd/acl.h create mode 100644 fs/nfsd/auth.c create mode 100644 fs/nfsd/auth.h create mode 100644 fs/nfsd/cache.h create mode 100644 fs/nfsd/export.c create mode 100644 fs/nfsd/idmap.h create mode 100644 fs/nfsd/lockd.c create mode 100644 fs/nfsd/nfs2acl.c create mode 100644 fs/nfsd/nfs3acl.c create mode 100644 fs/nfsd/nfs3proc.c create mode 100644 fs/nfsd/nfs3xdr.c create mode 100644 fs/nfsd/nfs4acl.c create mode 100644 fs/nfsd/nfs4callback.c create mode 100644 fs/nfsd/nfs4idmap.c create mode 100644 fs/nfsd/nfs4proc.c create mode 100644 fs/nfsd/nfs4recover.c create mode 100644 fs/nfsd/nfs4state.c create mode 100644 fs/nfsd/nfs4xdr.c create mode 100644 fs/nfsd/nfscache.c create mode 100644 fs/nfsd/nfsctl.c create mode 100644 fs/nfsd/nfsd.h create mode 100644 fs/nfsd/nfsfh.c create mode 100644 fs/nfsd/nfsfh.h create mode 100644 fs/nfsd/nfsproc.c create mode 100644 fs/nfsd/nfssvc.c create mode 100644 fs/nfsd/nfsxdr.c create mode 100644 fs/nfsd/state.h create mode 100644 fs/nfsd/stats.c create mode 100644 fs/nfsd/vfs.c create mode 100644 fs/nfsd/vfs.h create mode 100644 fs/nfsd/xdr.h create mode 100644 fs/nfsd/xdr3.h create mode 100644 fs/nfsd/xdr4.h create mode 100644 fs/nilfs2/Kconfig create mode 100644 fs/nilfs2/Makefile create mode 100644 fs/nilfs2/alloc.c create mode 100644 fs/nilfs2/alloc.h create mode 100644 fs/nilfs2/bmap.c create mode 100644 fs/nilfs2/bmap.h create mode 100644 fs/nilfs2/btnode.c create mode 100644 fs/nilfs2/btnode.h create mode 100644 fs/nilfs2/btree.c create mode 100644 fs/nilfs2/btree.h create mode 100644 
fs/nilfs2/cpfile.c create mode 100644 fs/nilfs2/cpfile.h create mode 100644 fs/nilfs2/dat.c create mode 100644 fs/nilfs2/dat.h create mode 100644 fs/nilfs2/dir.c create mode 100644 fs/nilfs2/direct.c create mode 100644 fs/nilfs2/direct.h create mode 100644 fs/nilfs2/export.h create mode 100644 fs/nilfs2/file.c create mode 100644 fs/nilfs2/gcinode.c create mode 100644 fs/nilfs2/ifile.c create mode 100644 fs/nilfs2/ifile.h create mode 100644 fs/nilfs2/inode.c create mode 100644 fs/nilfs2/ioctl.c create mode 100644 fs/nilfs2/mdt.c create mode 100644 fs/nilfs2/mdt.h create mode 100644 fs/nilfs2/namei.c create mode 100644 fs/nilfs2/nilfs.h create mode 100644 fs/nilfs2/page.c create mode 100644 fs/nilfs2/page.h create mode 100644 fs/nilfs2/recovery.c create mode 100644 fs/nilfs2/segbuf.c create mode 100644 fs/nilfs2/segbuf.h create mode 100644 fs/nilfs2/segment.c create mode 100644 fs/nilfs2/segment.h create mode 100644 fs/nilfs2/sufile.c create mode 100644 fs/nilfs2/sufile.h create mode 100644 fs/nilfs2/super.c create mode 100644 fs/nilfs2/the_nilfs.c create mode 100644 fs/nilfs2/the_nilfs.h create mode 100644 fs/nls/Kconfig create mode 100644 fs/nls/Makefile create mode 100644 fs/nls/nls_ascii.c create mode 100644 fs/nls/nls_base.c create mode 100644 fs/nls/nls_cp1250.c create mode 100644 fs/nls/nls_cp1251.c create mode 100644 fs/nls/nls_cp1255.c create mode 100644 fs/nls/nls_cp437.c create mode 100644 fs/nls/nls_cp737.c create mode 100644 fs/nls/nls_cp775.c create mode 100644 fs/nls/nls_cp850.c create mode 100644 fs/nls/nls_cp852.c create mode 100644 fs/nls/nls_cp855.c create mode 100644 fs/nls/nls_cp857.c create mode 100644 fs/nls/nls_cp860.c create mode 100644 fs/nls/nls_cp861.c create mode 100644 fs/nls/nls_cp862.c create mode 100644 fs/nls/nls_cp863.c create mode 100644 fs/nls/nls_cp864.c create mode 100644 fs/nls/nls_cp865.c create mode 100644 fs/nls/nls_cp866.c create mode 100644 fs/nls/nls_cp869.c create mode 100644 fs/nls/nls_cp874.c create mode 100644 fs/nls/nls_cp932.c create mode 100644 fs/nls/nls_cp936.c create mode 100644 fs/nls/nls_cp949.c create mode 100644 fs/nls/nls_cp950.c create mode 100644 fs/nls/nls_euc-jp.c create mode 100644 fs/nls/nls_iso8859-1.c create mode 100644 fs/nls/nls_iso8859-13.c create mode 100644 fs/nls/nls_iso8859-14.c create mode 100644 fs/nls/nls_iso8859-15.c create mode 100644 fs/nls/nls_iso8859-2.c create mode 100644 fs/nls/nls_iso8859-3.c create mode 100644 fs/nls/nls_iso8859-4.c create mode 100644 fs/nls/nls_iso8859-5.c create mode 100644 fs/nls/nls_iso8859-6.c create mode 100644 fs/nls/nls_iso8859-7.c create mode 100644 fs/nls/nls_iso8859-9.c create mode 100644 fs/nls/nls_koi8-r.c create mode 100644 fs/nls/nls_koi8-ru.c create mode 100644 fs/nls/nls_koi8-u.c create mode 100644 fs/nls/nls_utf8.c create mode 100644 fs/no-block.c create mode 100644 fs/notify/Kconfig create mode 100644 fs/notify/Makefile create mode 100644 fs/notify/dnotify/Kconfig create mode 100644 fs/notify/dnotify/Makefile create mode 100644 fs/notify/dnotify/dnotify.c create mode 100644 fs/notify/fanotify/Kconfig create mode 100644 fs/notify/fanotify/Makefile create mode 100644 fs/notify/fanotify/fanotify.c create mode 100644 fs/notify/fanotify/fanotify_user.c create mode 100644 fs/notify/fsnotify.c create mode 100644 fs/notify/fsnotify.h create mode 100644 fs/notify/group.c create mode 100644 fs/notify/inode_mark.c create mode 100644 fs/notify/inotify/Kconfig create mode 100644 fs/notify/inotify/Makefile create mode 100644 fs/notify/inotify/inotify.h create mode 100644 
fs/notify/inotify/inotify_fsnotify.c create mode 100644 fs/notify/inotify/inotify_user.c create mode 100644 fs/notify/mark.c create mode 100644 fs/notify/notification.c create mode 100644 fs/notify/vfsmount_mark.c create mode 100644 fs/ntfs/Kconfig create mode 100644 fs/ntfs/Makefile create mode 100644 fs/ntfs/aops.c create mode 100644 fs/ntfs/aops.h create mode 100644 fs/ntfs/attrib.c create mode 100644 fs/ntfs/attrib.h create mode 100644 fs/ntfs/bitmap.c create mode 100644 fs/ntfs/bitmap.h create mode 100644 fs/ntfs/collate.c create mode 100644 fs/ntfs/collate.h create mode 100644 fs/ntfs/compress.c create mode 100644 fs/ntfs/debug.c create mode 100644 fs/ntfs/debug.h create mode 100644 fs/ntfs/dir.c create mode 100644 fs/ntfs/dir.h create mode 100644 fs/ntfs/endian.h create mode 100644 fs/ntfs/file.c create mode 100644 fs/ntfs/index.c create mode 100644 fs/ntfs/index.h create mode 100644 fs/ntfs/inode.c create mode 100644 fs/ntfs/inode.h create mode 100644 fs/ntfs/layout.h create mode 100644 fs/ntfs/lcnalloc.c create mode 100644 fs/ntfs/lcnalloc.h create mode 100644 fs/ntfs/logfile.c create mode 100644 fs/ntfs/logfile.h create mode 100644 fs/ntfs/malloc.h create mode 100644 fs/ntfs/mft.c create mode 100644 fs/ntfs/mft.h create mode 100644 fs/ntfs/mst.c create mode 100644 fs/ntfs/namei.c create mode 100644 fs/ntfs/ntfs.h create mode 100644 fs/ntfs/quota.c create mode 100644 fs/ntfs/quota.h create mode 100644 fs/ntfs/runlist.c create mode 100644 fs/ntfs/runlist.h create mode 100644 fs/ntfs/super.c create mode 100644 fs/ntfs/sysctl.c create mode 100644 fs/ntfs/sysctl.h create mode 100644 fs/ntfs/time.h create mode 100644 fs/ntfs/types.h create mode 100644 fs/ntfs/unistr.c create mode 100644 fs/ntfs/upcase.c create mode 100644 fs/ntfs/usnjrnl.c create mode 100644 fs/ntfs/usnjrnl.h create mode 100644 fs/ntfs/volume.h create mode 100644 fs/ocfs2/Kconfig create mode 100644 fs/ocfs2/Makefile create mode 100644 fs/ocfs2/acl.c create mode 100644 fs/ocfs2/acl.h create mode 100644 fs/ocfs2/alloc.c create mode 100644 fs/ocfs2/alloc.h create mode 100644 fs/ocfs2/aops.c create mode 100644 fs/ocfs2/aops.h create mode 100644 fs/ocfs2/blockcheck.c create mode 100644 fs/ocfs2/blockcheck.h create mode 100644 fs/ocfs2/buffer_head_io.c create mode 100644 fs/ocfs2/buffer_head_io.h create mode 100644 fs/ocfs2/cluster/Makefile create mode 100644 fs/ocfs2/cluster/heartbeat.c create mode 100644 fs/ocfs2/cluster/heartbeat.h create mode 100644 fs/ocfs2/cluster/masklog.c create mode 100644 fs/ocfs2/cluster/masklog.h create mode 100644 fs/ocfs2/cluster/netdebug.c create mode 100644 fs/ocfs2/cluster/nodemanager.c create mode 100644 fs/ocfs2/cluster/nodemanager.h create mode 100644 fs/ocfs2/cluster/ocfs2_heartbeat.h create mode 100644 fs/ocfs2/cluster/ocfs2_nodemanager.h create mode 100644 fs/ocfs2/cluster/quorum.c create mode 100644 fs/ocfs2/cluster/quorum.h create mode 100644 fs/ocfs2/cluster/sys.c create mode 100644 fs/ocfs2/cluster/sys.h create mode 100644 fs/ocfs2/cluster/tcp.c create mode 100644 fs/ocfs2/cluster/tcp.h create mode 100644 fs/ocfs2/cluster/tcp_internal.h create mode 100644 fs/ocfs2/cluster/ver.c create mode 100644 fs/ocfs2/cluster/ver.h create mode 100644 fs/ocfs2/dcache.c create mode 100644 fs/ocfs2/dcache.h create mode 100644 fs/ocfs2/dir.c create mode 100644 fs/ocfs2/dir.h create mode 100644 fs/ocfs2/dlm/Makefile create mode 100644 fs/ocfs2/dlm/dlmapi.h create mode 100644 fs/ocfs2/dlm/dlmast.c create mode 100644 fs/ocfs2/dlm/dlmcommon.h create mode 100644 fs/ocfs2/dlm/dlmconvert.c create mode 
100644 fs/ocfs2/dlm/dlmconvert.h create mode 100644 fs/ocfs2/dlm/dlmdebug.c create mode 100644 fs/ocfs2/dlm/dlmdebug.h create mode 100644 fs/ocfs2/dlm/dlmdomain.c create mode 100644 fs/ocfs2/dlm/dlmdomain.h create mode 100644 fs/ocfs2/dlm/dlmlock.c create mode 100644 fs/ocfs2/dlm/dlmmaster.c create mode 100644 fs/ocfs2/dlm/dlmrecovery.c create mode 100644 fs/ocfs2/dlm/dlmthread.c create mode 100644 fs/ocfs2/dlm/dlmunlock.c create mode 100644 fs/ocfs2/dlm/dlmver.c create mode 100644 fs/ocfs2/dlm/dlmver.h create mode 100644 fs/ocfs2/dlmfs/Makefile create mode 100644 fs/ocfs2/dlmfs/dlmfs.c create mode 100644 fs/ocfs2/dlmfs/dlmfsver.c create mode 100644 fs/ocfs2/dlmfs/dlmfsver.h create mode 100644 fs/ocfs2/dlmfs/userdlm.c create mode 100644 fs/ocfs2/dlmfs/userdlm.h create mode 100644 fs/ocfs2/dlmglue.c create mode 100644 fs/ocfs2/dlmglue.h create mode 100644 fs/ocfs2/export.c create mode 100644 fs/ocfs2/export.h create mode 100644 fs/ocfs2/extent_map.c create mode 100644 fs/ocfs2/extent_map.h create mode 100644 fs/ocfs2/file.c create mode 100644 fs/ocfs2/file.h create mode 100644 fs/ocfs2/heartbeat.c create mode 100644 fs/ocfs2/heartbeat.h create mode 100644 fs/ocfs2/inode.c create mode 100644 fs/ocfs2/inode.h create mode 100644 fs/ocfs2/ioctl.c create mode 100644 fs/ocfs2/ioctl.h create mode 100644 fs/ocfs2/journal.c create mode 100644 fs/ocfs2/journal.h create mode 100644 fs/ocfs2/localalloc.c create mode 100644 fs/ocfs2/localalloc.h create mode 100644 fs/ocfs2/locks.c create mode 100644 fs/ocfs2/locks.h create mode 100644 fs/ocfs2/mmap.c create mode 100644 fs/ocfs2/mmap.h create mode 100644 fs/ocfs2/move_extents.c create mode 100644 fs/ocfs2/move_extents.h create mode 100644 fs/ocfs2/namei.c create mode 100644 fs/ocfs2/namei.h create mode 100644 fs/ocfs2/ocfs1_fs_compat.h create mode 100644 fs/ocfs2/ocfs2.h create mode 100644 fs/ocfs2/ocfs2_fs.h create mode 100644 fs/ocfs2/ocfs2_ioctl.h create mode 100644 fs/ocfs2/ocfs2_lockid.h create mode 100644 fs/ocfs2/ocfs2_lockingver.h create mode 100644 fs/ocfs2/ocfs2_trace.h create mode 100644 fs/ocfs2/quota.h create mode 100644 fs/ocfs2/quota_global.c create mode 100644 fs/ocfs2/quota_local.c create mode 100644 fs/ocfs2/refcounttree.c create mode 100644 fs/ocfs2/refcounttree.h create mode 100644 fs/ocfs2/reservations.c create mode 100644 fs/ocfs2/reservations.h create mode 100644 fs/ocfs2/resize.c create mode 100644 fs/ocfs2/resize.h create mode 100644 fs/ocfs2/slot_map.c create mode 100644 fs/ocfs2/slot_map.h create mode 100644 fs/ocfs2/stack_o2cb.c create mode 100644 fs/ocfs2/stack_user.c create mode 100644 fs/ocfs2/stackglue.c create mode 100644 fs/ocfs2/stackglue.h create mode 100644 fs/ocfs2/suballoc.c create mode 100644 fs/ocfs2/suballoc.h create mode 100644 fs/ocfs2/super.c create mode 100644 fs/ocfs2/super.h create mode 100644 fs/ocfs2/symlink.c create mode 100644 fs/ocfs2/symlink.h create mode 100644 fs/ocfs2/sysfile.c create mode 100644 fs/ocfs2/sysfile.h create mode 100644 fs/ocfs2/uptodate.c create mode 100644 fs/ocfs2/uptodate.h create mode 100644 fs/ocfs2/ver.c create mode 100644 fs/ocfs2/ver.h create mode 100644 fs/ocfs2/xattr.c create mode 100644 fs/ocfs2/xattr.h create mode 100644 fs/omfs/Kconfig create mode 100644 fs/omfs/Makefile create mode 100644 fs/omfs/bitmap.c create mode 100644 fs/omfs/dir.c create mode 100644 fs/omfs/file.c create mode 100644 fs/omfs/inode.c create mode 100644 fs/omfs/omfs.h create mode 100644 fs/omfs/omfs_fs.h create mode 100644 fs/open.c create mode 100644 fs/openpromfs/Makefile create mode 100644 
fs/openpromfs/inode.c create mode 100644 fs/partitions/Kconfig create mode 100644 fs/partitions/Makefile create mode 100644 fs/partitions/acorn.c create mode 100644 fs/partitions/acorn.h create mode 100644 fs/partitions/amiga.c create mode 100644 fs/partitions/amiga.h create mode 100644 fs/partitions/atari.c create mode 100644 fs/partitions/atari.h create mode 100644 fs/partitions/check.c create mode 100644 fs/partitions/check.h create mode 100644 fs/partitions/efi.c create mode 100644 fs/partitions/efi.h create mode 100644 fs/partitions/ibm.c create mode 100644 fs/partitions/ibm.h create mode 100644 fs/partitions/karma.c create mode 100644 fs/partitions/karma.h create mode 100644 fs/partitions/ldm.c create mode 100644 fs/partitions/ldm.h create mode 100644 fs/partitions/mac.c create mode 100644 fs/partitions/mac.h create mode 100644 fs/partitions/msdos.c create mode 100644 fs/partitions/msdos.h create mode 100644 fs/partitions/osf.c create mode 100644 fs/partitions/osf.h create mode 100644 fs/partitions/sgi.c create mode 100644 fs/partitions/sgi.h create mode 100644 fs/partitions/sun.c create mode 100644 fs/partitions/sun.h create mode 100644 fs/partitions/sysv68.c create mode 100644 fs/partitions/sysv68.h create mode 100644 fs/partitions/ultrix.c create mode 100644 fs/partitions/ultrix.h create mode 100644 fs/pipe.c create mode 100644 fs/pnode.c create mode 100644 fs/pnode.h create mode 100644 fs/posix_acl.c create mode 100644 fs/proc/Kconfig create mode 100644 fs/proc/Makefile create mode 100644 fs/proc/array.c create mode 100644 fs/proc/base.c create mode 100644 fs/proc/cmdline.c create mode 100644 fs/proc/consoles.c create mode 100644 fs/proc/cpuinfo.c create mode 100644 fs/proc/devices.c create mode 100644 fs/proc/generic.c create mode 100644 fs/proc/inode.c create mode 100644 fs/proc/internal.h create mode 100644 fs/proc/interrupts.c create mode 100644 fs/proc/kcore.c create mode 100644 fs/proc/kmsg.c create mode 100644 fs/proc/loadavg.c create mode 100644 fs/proc/meminfo.c create mode 100644 fs/proc/mmu.c create mode 100644 fs/proc/namespaces.c create mode 100644 fs/proc/nommu.c create mode 100644 fs/proc/page.c create mode 100644 fs/proc/proc_devtree.c create mode 100644 fs/proc/proc_net.c create mode 100644 fs/proc/proc_sysctl.c create mode 100644 fs/proc/proc_tty.c create mode 100644 fs/proc/root.c create mode 100644 fs/proc/softirqs.c create mode 100644 fs/proc/stat.c create mode 100644 fs/proc/task_mmu.c create mode 100644 fs/proc/task_nommu.c create mode 100644 fs/proc/uptime.c create mode 100644 fs/proc/version.c create mode 100644 fs/proc/vmcore.c create mode 100644 fs/pstore/Kconfig create mode 100644 fs/pstore/Makefile create mode 100644 fs/pstore/inode.c create mode 100644 fs/pstore/internal.h create mode 100644 fs/pstore/platform.c create mode 100644 fs/qnx4/Kconfig create mode 100644 fs/qnx4/Makefile create mode 100644 fs/qnx4/README create mode 100644 fs/qnx4/bitmap.c create mode 100644 fs/qnx4/dir.c create mode 100644 fs/qnx4/inode.c create mode 100644 fs/qnx4/namei.c create mode 100644 fs/qnx4/qnx4.h create mode 100644 fs/quota/Kconfig create mode 100644 fs/quota/Makefile create mode 100644 fs/quota/compat.c create mode 100644 fs/quota/dquot.c create mode 100644 fs/quota/netlink.c create mode 100644 fs/quota/quota.c create mode 100644 fs/quota/quota_tree.c create mode 100644 fs/quota/quota_tree.h create mode 100644 fs/quota/quota_v1.c create mode 100644 fs/quota/quota_v2.c create mode 100644 fs/quota/quotaio_v1.h create mode 100644 fs/quota/quotaio_v2.h create mode 
100644 fs/ramfs/Makefile create mode 100644 fs/ramfs/file-mmu.c create mode 100644 fs/ramfs/file-nommu.c create mode 100644 fs/ramfs/inode.c create mode 100644 fs/ramfs/internal.h create mode 100644 fs/read_write.c create mode 100644 fs/read_write.h create mode 100644 fs/readdir.c create mode 100644 fs/reiserfs/Kconfig create mode 100644 fs/reiserfs/Makefile create mode 100644 fs/reiserfs/README create mode 100644 fs/reiserfs/bitmap.c create mode 100644 fs/reiserfs/dir.c create mode 100644 fs/reiserfs/do_balan.c create mode 100644 fs/reiserfs/file.c create mode 100644 fs/reiserfs/fix_node.c create mode 100644 fs/reiserfs/hashes.c create mode 100644 fs/reiserfs/ibalance.c create mode 100644 fs/reiserfs/inode.c create mode 100644 fs/reiserfs/ioctl.c create mode 100644 fs/reiserfs/item_ops.c create mode 100644 fs/reiserfs/journal.c create mode 100644 fs/reiserfs/lbalance.c create mode 100644 fs/reiserfs/lock.c create mode 100644 fs/reiserfs/namei.c create mode 100644 fs/reiserfs/objectid.c create mode 100644 fs/reiserfs/prints.c create mode 100644 fs/reiserfs/procfs.c create mode 100644 fs/reiserfs/resize.c create mode 100644 fs/reiserfs/stree.c create mode 100644 fs/reiserfs/super.c create mode 100644 fs/reiserfs/tail_conversion.c create mode 100644 fs/reiserfs/xattr.c create mode 100644 fs/reiserfs/xattr_acl.c create mode 100644 fs/reiserfs/xattr_security.c create mode 100644 fs/reiserfs/xattr_trusted.c create mode 100644 fs/reiserfs/xattr_user.c create mode 100644 fs/romfs/Kconfig create mode 100644 fs/romfs/Makefile create mode 100644 fs/romfs/internal.h create mode 100644 fs/romfs/mmap-nommu.c create mode 100644 fs/romfs/storage.c create mode 100644 fs/romfs/super.c create mode 100644 fs/select.c create mode 100644 fs/seq_file.c create mode 100644 fs/signalfd.c create mode 100644 fs/splice.c create mode 100644 fs/squashfs/Kconfig create mode 100644 fs/squashfs/Makefile create mode 100644 fs/squashfs/block.c create mode 100644 fs/squashfs/cache.c create mode 100644 fs/squashfs/decompressor.c create mode 100644 fs/squashfs/decompressor.h create mode 100644 fs/squashfs/dir.c create mode 100644 fs/squashfs/export.c create mode 100644 fs/squashfs/file.c create mode 100644 fs/squashfs/fragment.c create mode 100644 fs/squashfs/id.c create mode 100644 fs/squashfs/inode.c create mode 100644 fs/squashfs/lzo_wrapper.c create mode 100644 fs/squashfs/namei.c create mode 100644 fs/squashfs/squashfs.h create mode 100644 fs/squashfs/squashfs_fs.h create mode 100644 fs/squashfs/squashfs_fs_i.h create mode 100644 fs/squashfs/squashfs_fs_sb.h create mode 100644 fs/squashfs/super.c create mode 100644 fs/squashfs/symlink.c create mode 100644 fs/squashfs/xattr.c create mode 100644 fs/squashfs/xattr.h create mode 100644 fs/squashfs/xattr_id.c create mode 100644 fs/squashfs/xz_wrapper.c create mode 100644 fs/squashfs/zlib_wrapper.c create mode 100644 fs/stack.c create mode 100644 fs/stat.c create mode 100644 fs/statfs.c create mode 100644 fs/super.c create mode 100644 fs/sync.c create mode 100644 fs/sysfs/Kconfig create mode 100644 fs/sysfs/Makefile create mode 100644 fs/sysfs/bin.c create mode 100644 fs/sysfs/dir.c create mode 100644 fs/sysfs/file.c create mode 100644 fs/sysfs/group.c create mode 100644 fs/sysfs/inode.c create mode 100644 fs/sysfs/mount.c create mode 100644 fs/sysfs/symlink.c create mode 100644 fs/sysfs/sysfs.h create mode 100644 fs/sysv/Kconfig create mode 100644 fs/sysv/Makefile create mode 100644 fs/sysv/balloc.c create mode 100644 fs/sysv/dir.c create mode 100644 fs/sysv/file.c create mode 
100644 fs/sysv/ialloc.c create mode 100644 fs/sysv/inode.c create mode 100644 fs/sysv/itree.c create mode 100644 fs/sysv/namei.c create mode 100644 fs/sysv/super.c create mode 100644 fs/sysv/symlink.c create mode 100644 fs/sysv/sysv.h create mode 100644 fs/timerfd.c create mode 100644 fs/ubifs/Kconfig create mode 100644 fs/ubifs/Makefile create mode 100644 fs/ubifs/budget.c create mode 100644 fs/ubifs/commit.c create mode 100644 fs/ubifs/compress.c create mode 100644 fs/ubifs/debug.c create mode 100644 fs/ubifs/debug.h create mode 100644 fs/ubifs/dir.c create mode 100644 fs/ubifs/file.c create mode 100644 fs/ubifs/find.c create mode 100644 fs/ubifs/gc.c create mode 100644 fs/ubifs/io.c create mode 100644 fs/ubifs/ioctl.c create mode 100644 fs/ubifs/journal.c create mode 100644 fs/ubifs/key.h create mode 100644 fs/ubifs/log.c create mode 100644 fs/ubifs/lprops.c create mode 100644 fs/ubifs/lpt.c create mode 100644 fs/ubifs/lpt_commit.c create mode 100644 fs/ubifs/master.c create mode 100644 fs/ubifs/misc.h create mode 100644 fs/ubifs/orphan.c create mode 100644 fs/ubifs/recovery.c create mode 100644 fs/ubifs/replay.c create mode 100644 fs/ubifs/sb.c create mode 100644 fs/ubifs/scan.c create mode 100644 fs/ubifs/shrinker.c create mode 100644 fs/ubifs/super.c create mode 100644 fs/ubifs/tnc.c create mode 100644 fs/ubifs/tnc_commit.c create mode 100644 fs/ubifs/tnc_misc.c create mode 100644 fs/ubifs/ubifs-media.h create mode 100644 fs/ubifs/ubifs.h create mode 100644 fs/ubifs/xattr.c create mode 100644 fs/udf/Kconfig create mode 100644 fs/udf/Makefile create mode 100644 fs/udf/balloc.c create mode 100644 fs/udf/dir.c create mode 100644 fs/udf/directory.c create mode 100644 fs/udf/ecma_167.h create mode 100644 fs/udf/file.c create mode 100644 fs/udf/ialloc.c create mode 100644 fs/udf/inode.c create mode 100644 fs/udf/lowlevel.c create mode 100644 fs/udf/misc.c create mode 100644 fs/udf/namei.c create mode 100644 fs/udf/osta_udf.h create mode 100644 fs/udf/partition.c create mode 100644 fs/udf/super.c create mode 100644 fs/udf/symlink.c create mode 100644 fs/udf/truncate.c create mode 100644 fs/udf/udf_i.h create mode 100644 fs/udf/udf_sb.h create mode 100644 fs/udf/udfdecl.h create mode 100644 fs/udf/udfend.h create mode 100644 fs/udf/udftime.c create mode 100644 fs/udf/unicode.c create mode 100644 fs/ufs/Kconfig create mode 100644 fs/ufs/Makefile create mode 100644 fs/ufs/balloc.c create mode 100644 fs/ufs/cylinder.c create mode 100644 fs/ufs/dir.c create mode 100644 fs/ufs/file.c create mode 100644 fs/ufs/ialloc.c create mode 100644 fs/ufs/inode.c create mode 100644 fs/ufs/namei.c create mode 100644 fs/ufs/super.c create mode 100644 fs/ufs/swab.h create mode 100644 fs/ufs/symlink.c create mode 100644 fs/ufs/truncate.c create mode 100644 fs/ufs/ufs.h create mode 100644 fs/ufs/ufs_fs.h create mode 100644 fs/ufs/util.c create mode 100644 fs/ufs/util.h create mode 100644 fs/utimes.c create mode 100644 fs/xattr.c create mode 100644 fs/xattr_acl.c create mode 100644 fs/xfs/Kconfig create mode 100644 fs/xfs/Makefile create mode 100644 fs/xfs/linux-2.6/kmem.c create mode 100644 fs/xfs/linux-2.6/kmem.h create mode 100644 fs/xfs/linux-2.6/mrlock.h create mode 100644 fs/xfs/linux-2.6/time.h create mode 100644 fs/xfs/linux-2.6/xfs_acl.c create mode 100644 fs/xfs/linux-2.6/xfs_aops.c create mode 100644 fs/xfs/linux-2.6/xfs_aops.h create mode 100644 fs/xfs/linux-2.6/xfs_buf.c create mode 100644 fs/xfs/linux-2.6/xfs_buf.h create mode 100644 fs/xfs/linux-2.6/xfs_discard.c create mode 100644 
fs/xfs/linux-2.6/xfs_discard.h create mode 100644 fs/xfs/linux-2.6/xfs_export.c create mode 100644 fs/xfs/linux-2.6/xfs_export.h create mode 100644 fs/xfs/linux-2.6/xfs_file.c create mode 100644 fs/xfs/linux-2.6/xfs_fs_subr.c create mode 100644 fs/xfs/linux-2.6/xfs_globals.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl.h create mode 100644 fs/xfs/linux-2.6/xfs_ioctl32.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl32.h create mode 100644 fs/xfs/linux-2.6/xfs_iops.c create mode 100644 fs/xfs/linux-2.6/xfs_iops.h create mode 100644 fs/xfs/linux-2.6/xfs_linux.h create mode 100644 fs/xfs/linux-2.6/xfs_message.c create mode 100644 fs/xfs/linux-2.6/xfs_message.h create mode 100644 fs/xfs/linux-2.6/xfs_quotaops.c create mode 100644 fs/xfs/linux-2.6/xfs_stats.c create mode 100644 fs/xfs/linux-2.6/xfs_stats.h create mode 100644 fs/xfs/linux-2.6/xfs_super.c create mode 100644 fs/xfs/linux-2.6/xfs_super.h create mode 100644 fs/xfs/linux-2.6/xfs_sync.c create mode 100644 fs/xfs/linux-2.6/xfs_sync.h create mode 100644 fs/xfs/linux-2.6/xfs_sysctl.c create mode 100644 fs/xfs/linux-2.6/xfs_sysctl.h create mode 100644 fs/xfs/linux-2.6/xfs_trace.c create mode 100644 fs/xfs/linux-2.6/xfs_trace.h create mode 100644 fs/xfs/linux-2.6/xfs_vnode.h create mode 100644 fs/xfs/linux-2.6/xfs_xattr.c create mode 100644 fs/xfs/quota/xfs_dquot.c create mode 100644 fs/xfs/quota/xfs_dquot.h create mode 100644 fs/xfs/quota/xfs_dquot_item.c create mode 100644 fs/xfs/quota/xfs_dquot_item.h create mode 100644 fs/xfs/quota/xfs_qm.c create mode 100644 fs/xfs/quota/xfs_qm.h create mode 100644 fs/xfs/quota/xfs_qm_bhv.c create mode 100644 fs/xfs/quota/xfs_qm_stats.c create mode 100644 fs/xfs/quota/xfs_qm_stats.h create mode 100644 fs/xfs/quota/xfs_qm_syscalls.c create mode 100644 fs/xfs/quota/xfs_quota_priv.h create mode 100644 fs/xfs/quota/xfs_trans_dquot.c create mode 100644 fs/xfs/support/uuid.c create mode 100644 fs/xfs/support/uuid.h create mode 100644 fs/xfs/xfs.h create mode 100644 fs/xfs/xfs_acl.h create mode 100644 fs/xfs/xfs_ag.h create mode 100644 fs/xfs/xfs_alloc.c create mode 100644 fs/xfs/xfs_alloc.h create mode 100644 fs/xfs/xfs_alloc_btree.c create mode 100644 fs/xfs/xfs_alloc_btree.h create mode 100644 fs/xfs/xfs_arch.h create mode 100644 fs/xfs/xfs_attr.c create mode 100644 fs/xfs/xfs_attr.h create mode 100644 fs/xfs/xfs_attr_leaf.c create mode 100644 fs/xfs/xfs_attr_leaf.h create mode 100644 fs/xfs/xfs_attr_sf.h create mode 100644 fs/xfs/xfs_bit.c create mode 100644 fs/xfs/xfs_bit.h create mode 100644 fs/xfs/xfs_bmap.c create mode 100644 fs/xfs/xfs_bmap.h create mode 100644 fs/xfs/xfs_bmap_btree.c create mode 100644 fs/xfs/xfs_bmap_btree.h create mode 100644 fs/xfs/xfs_btree.c create mode 100644 fs/xfs/xfs_btree.h create mode 100644 fs/xfs/xfs_btree_trace.c create mode 100644 fs/xfs/xfs_btree_trace.h create mode 100644 fs/xfs/xfs_buf_item.c create mode 100644 fs/xfs/xfs_buf_item.h create mode 100644 fs/xfs/xfs_da_btree.c create mode 100644 fs/xfs/xfs_da_btree.h create mode 100644 fs/xfs/xfs_dfrag.c create mode 100644 fs/xfs/xfs_dfrag.h create mode 100644 fs/xfs/xfs_dinode.h create mode 100644 fs/xfs/xfs_dir2.c create mode 100644 fs/xfs/xfs_dir2.h create mode 100644 fs/xfs/xfs_dir2_block.c create mode 100644 fs/xfs/xfs_dir2_block.h create mode 100644 fs/xfs/xfs_dir2_data.c create mode 100644 fs/xfs/xfs_dir2_data.h create mode 100644 fs/xfs/xfs_dir2_leaf.c create mode 100644 fs/xfs/xfs_dir2_leaf.h create mode 100644 fs/xfs/xfs_dir2_node.c create mode 100644 
fs/xfs/xfs_dir2_node.h create mode 100644 fs/xfs/xfs_dir2_sf.c create mode 100644 fs/xfs/xfs_dir2_sf.h create mode 100644 fs/xfs/xfs_error.c create mode 100644 fs/xfs/xfs_error.h create mode 100644 fs/xfs/xfs_extfree_item.c create mode 100644 fs/xfs/xfs_extfree_item.h create mode 100644 fs/xfs/xfs_filestream.c create mode 100644 fs/xfs/xfs_filestream.h create mode 100644 fs/xfs/xfs_fs.h create mode 100644 fs/xfs/xfs_fsops.c create mode 100644 fs/xfs/xfs_fsops.h create mode 100644 fs/xfs/xfs_ialloc.c create mode 100644 fs/xfs/xfs_ialloc.h create mode 100644 fs/xfs/xfs_ialloc_btree.c create mode 100644 fs/xfs/xfs_ialloc_btree.h create mode 100644 fs/xfs/xfs_iget.c create mode 100644 fs/xfs/xfs_inode.c create mode 100644 fs/xfs/xfs_inode.h create mode 100644 fs/xfs/xfs_inode_item.c create mode 100644 fs/xfs/xfs_inode_item.h create mode 100644 fs/xfs/xfs_inum.h create mode 100644 fs/xfs/xfs_iomap.c create mode 100644 fs/xfs/xfs_iomap.h create mode 100644 fs/xfs/xfs_itable.c create mode 100644 fs/xfs/xfs_itable.h create mode 100644 fs/xfs/xfs_log.c create mode 100644 fs/xfs/xfs_log.h create mode 100644 fs/xfs/xfs_log_cil.c create mode 100644 fs/xfs/xfs_log_priv.h create mode 100644 fs/xfs/xfs_log_recover.c create mode 100644 fs/xfs/xfs_log_recover.h create mode 100644 fs/xfs/xfs_mount.c create mode 100644 fs/xfs/xfs_mount.h create mode 100644 fs/xfs/xfs_mru_cache.c create mode 100644 fs/xfs/xfs_mru_cache.h create mode 100644 fs/xfs/xfs_quota.h create mode 100644 fs/xfs/xfs_rename.c create mode 100644 fs/xfs/xfs_rtalloc.c create mode 100644 fs/xfs/xfs_rtalloc.h create mode 100644 fs/xfs/xfs_rw.c create mode 100644 fs/xfs/xfs_rw.h create mode 100644 fs/xfs/xfs_sb.h create mode 100644 fs/xfs/xfs_trans.c create mode 100644 fs/xfs/xfs_trans.h create mode 100644 fs/xfs/xfs_trans_ail.c create mode 100644 fs/xfs/xfs_trans_buf.c create mode 100644 fs/xfs/xfs_trans_extfree.c create mode 100644 fs/xfs/xfs_trans_inode.c create mode 100644 fs/xfs/xfs_trans_priv.h create mode 100644 fs/xfs/xfs_trans_space.h create mode 100644 fs/xfs/xfs_types.h create mode 100644 fs/xfs/xfs_utils.c create mode 100644 fs/xfs/xfs_utils.h create mode 100644 fs/xfs/xfs_vnodeops.c create mode 100644 fs/xfs/xfs_vnodeops.h create mode 100644 fs/yaffs2/Kconfig create mode 100644 fs/yaffs2/Makefile create mode 100644 fs/yaffs2/yaffs_allocator.c create mode 100644 fs/yaffs2/yaffs_allocator.h create mode 100644 fs/yaffs2/yaffs_attribs.c create mode 100644 fs/yaffs2/yaffs_attribs.h create mode 100644 fs/yaffs2/yaffs_bitmap.c create mode 100644 fs/yaffs2/yaffs_bitmap.h create mode 100644 fs/yaffs2/yaffs_checkptrw.c create mode 100644 fs/yaffs2/yaffs_checkptrw.h create mode 100644 fs/yaffs2/yaffs_ecc.c create mode 100644 fs/yaffs2/yaffs_ecc.h create mode 100644 fs/yaffs2/yaffs_getblockinfo.h create mode 100644 fs/yaffs2/yaffs_guts.c create mode 100644 fs/yaffs2/yaffs_guts.h create mode 100644 fs/yaffs2/yaffs_linux.h create mode 100644 fs/yaffs2/yaffs_mtdif.c create mode 100644 fs/yaffs2/yaffs_mtdif.h create mode 100644 fs/yaffs2/yaffs_mtdif1.c create mode 100644 fs/yaffs2/yaffs_mtdif1.h create mode 100644 fs/yaffs2/yaffs_mtdif2.c create mode 100644 fs/yaffs2/yaffs_mtdif2.h create mode 100644 fs/yaffs2/yaffs_nameval.c create mode 100644 fs/yaffs2/yaffs_nameval.h create mode 100644 fs/yaffs2/yaffs_nand.c create mode 100644 fs/yaffs2/yaffs_nand.h create mode 100644 fs/yaffs2/yaffs_packedtags1.c create mode 100644 fs/yaffs2/yaffs_packedtags1.h create mode 100644 fs/yaffs2/yaffs_packedtags2.c create mode 100644 fs/yaffs2/yaffs_packedtags2.h 
create mode 100644 fs/yaffs2/yaffs_tagscompat.c
create mode 100644 fs/yaffs2/yaffs_tagscompat.h
create mode 100644 fs/yaffs2/yaffs_tagsvalidity.c
create mode 100644 fs/yaffs2/yaffs_tagsvalidity.h
create mode 100644 fs/yaffs2/yaffs_trace.h
create mode 100644 fs/yaffs2/yaffs_verify.c
create mode 100644 fs/yaffs2/yaffs_verify.h
create mode 100644 fs/yaffs2/yaffs_vfs.c
create mode 100644 fs/yaffs2/yaffs_yaffs1.c
create mode 100644 fs/yaffs2/yaffs_yaffs1.h
create mode 100644 fs/yaffs2/yaffs_yaffs2.c
create mode 100644 fs/yaffs2/yaffs_yaffs2.h
create mode 100644 fs/yaffs2/yportenv.h

(limited to 'fs')

diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 00000000..0a93dc1c
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,34 @@
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	depends on INET && NET_9P
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See for more information.
+
+	  If unsure, say N.
+
+if 9P_FS
+config 9P_FSCACHE
+	bool "Enable 9P client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for 9p clients using FS-Cache
+
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website .
+
+	  If you don't know what Access Control Lists are, say N
+
+endif

diff --git a/fs/9p/Makefile b/fs/9p/Makefile
new file mode 100644
index 00000000..ab8c1278
--- /dev/null
+++ b/fs/9p/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_9P_FS) := 9p.o
+
+9p-objs := \
+	vfs_super.o \
+	vfs_inode.o \
+	vfs_inode_dotl.o \
+	vfs_addr.o \
+	vfs_file.o \
+	vfs_dir.o \
+	vfs_dentry.o \
+	v9fs.o \
+	fid.o \
+	xattr.o \
+	xattr_user.o
+
+9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 00000000..4a866cd0
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+	    ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+	} else
+		retval = -EIO;
+
+	if (!IS_ERR(dacl))
+		posix_acl_release(dacl);
+
+	if (!IS_ERR(pacl))
+		posix_acl_release(pacl);
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p always caches the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
+{
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+	    ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
+		/*
+		 * On access = client and acl = on mode get the acl
+		 * values from the server
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+
+	if (!acl)
+		return 0;
+
+	/* Send a setxattr request to the server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	if (dentry) {
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl);
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl);
+	}
+	posix_acl_release(*dpacl);
+	posix_acl_release(*pacl);
+	*dpacl = *pacl = NULL;
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = posix_acl_dup(acl);
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+		else
+			posix_acl_release(clone);
+	}
+	*modep = mode;
+	return 0;
+cleanup:
+	return retval;
+
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			       const void *value, size_t size,
+			       int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
We leave it to the server to validate + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_set_acl(dentry, name, + value, size, flags, type); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value) { + /* update the cached acl value */ + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + retval = posix_acl_valid(acl); + if (retval) + goto err_out; + } + } else + acl = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + retval = posix_acl_equiv_mode(acl, &mode); + if (retval < 0) + goto err_out; + else { + struct iattr iattr; + if (retval == 0) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. + */ + acl = NULL; + value = NULL; + size = 0; + } + /* Updte the mode bits */ + iattr.ia_mode = ((mode & S_IALLUGO) | + (inode->i_mode & ~S_IALLUGO)); + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? + */ + v9fs_vfs_setattr_dotl(dentry, &iattr); + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? -EINVAL : 0; + goto err_out; + } + break; + default: + BUG(); + } + retval = v9fs_xattr_set(dentry, name, value, size, flags); + if (!retval) + set_cached_acl(inode, type, acl); +err_out: + posix_acl_release(acl); + return retval; +} + +const struct xattr_handler v9fs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; + +const struct xattr_handler v9fs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; diff --git a/fs/9p/acl.h b/fs/9p/acl.h new file mode 100644 index 00000000..c47ea9cf --- /dev/null +++ b/fs/9p/acl.h @@ -0,0 +1,49 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ +#ifndef FS_9P_ACL_H +#define FS_9P_ACL_H + +#ifdef CONFIG_9P_FS_POSIX_ACL +extern int v9fs_get_acl(struct inode *, struct p9_fid *); +extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); +extern int v9fs_acl_chmod(struct dentry *); +extern int v9fs_set_create_acl(struct dentry *, + struct posix_acl **, struct posix_acl **); +extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl); +#else +#define v9fs_check_acl NULL +static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) +{ + return 0; +} +static inline int v9fs_acl_chmod(struct dentry *dentry) +{ + return 0; +} +static inline int v9fs_set_create_acl(struct dentry *dentry, + struct posix_acl **dpacl, + struct posix_acl **pacl) +{ + return 0; +} +static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, + struct posix_acl **pacl) +{ + return 0; +} + +#endif +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/cache.c b/fs/9p/cache.c new file mode 100644 index 00000000..945aa5f0 --- /dev/null +++ b/fs/9p/cache.c @@ -0,0 +1,415 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "cache.h" + +#define CACHETAG_LEN 11 + +struct fscache_netfs v9fs_cache_netfs = { + .name = "9p", + .version = 0, +}; + +/** + * v9fs_random_cachetag - Generate a random tag to be associated + * with a new cache session. + * + * The value of jiffies is used for a fairly randomly cache tag. + */ + +static +int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +{ + v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); + if (!v9ses->cachetag) + return -ENOMEM; + + return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); +} + +static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct v9fs_session_info *v9ses; + uint16_t klen = 0; + + v9ses = (struct v9fs_session_info *)cookie_netfs_data; + P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, + buffer, bufmax); + + if (v9ses->cachetag) + klen = strlen(v9ses->cachetag); + + if (klen > bufmax) + return 0; + + memcpy(buffer, v9ses->cachetag, klen); + P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + return klen; +} + +const struct fscache_cookie_def v9fs_cache_session_index_def = { + .name = "9P.session", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = v9fs_cache_session_get_key, +}; + +void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) +{ + /* If no cache session tag was specified, we generate a random one. 
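For example (value assumed), the generated tag is simply jiffies printed as a decimal string, such as "4294937296"; a fixed tag can instead be supplied with the cachetag= mount option.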
*/ + if (!v9ses->cachetag) + v9fs_random_cachetag(v9ses); + + v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, + &v9fs_cache_session_index_def, + v9ses); + P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, + v9ses->fscache); +} + +void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) +{ + P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, + v9ses->fscache); + fscache_relinquish_cookie(v9ses->fscache, 0); + v9ses->fscache = NULL; +} + + +static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, + v9inode->qid.path); + return sizeof(v9inode->qid.path); +} + +static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + *size = i_size_read(&v9inode->vfs_inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, + *size); +} + +static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, + v9inode->qid.version); + return sizeof(v9inode->qid.version); +} + +static enum +fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + + if (buflen != sizeof(v9inode->qid.version)) + return FSCACHE_CHECKAUX_OBSOLETE; + + if (memcmp(buffer, &v9inode->qid.version, + sizeof(v9inode->qid.version))) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) +{ + struct v9fs_inode *v9inode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def v9fs_cache_inode_index_def = { + .name = "9p.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = v9fs_cache_inode_get_key, + .get_attr = v9fs_cache_inode_get_attr, + .get_aux = v9fs_cache_inode_get_aux, + .check_aux = v9fs_cache_inode_check_aux, + .now_uncached = v9fs_cache_inode_now_uncached, +}; + +void v9fs_cache_inode_get_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + + if (!S_ISREG(inode->i_mode)) + return; + + v9inode = V9FS_I(inode); + if (v9inode->fscache) + return; + + v9ses = v9fs_inode2v9ses(inode); + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + v9inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, + v9inode->fscache); +} + +void v9fs_cache_inode_put_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, + v9inode->fscache); + + 
fscache_relinquish_cookie(v9inode->fscache, 0); + v9inode->fscache = NULL; +} + +void v9fs_cache_inode_flush_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, + v9inode->fscache); + + fscache_relinquish_cookie(v9inode->fscache, 1); + v9inode->fscache = NULL; +} + +void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_fid *fid; + + if (!v9inode->fscache) + return; + + spin_lock(&v9inode->fscache_lock); + fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + v9fs_cache_inode_flush_cookie(inode); + else + v9fs_cache_inode_get_cookie(inode); + + spin_unlock(&v9inode->fscache_lock); +} + +void v9fs_cache_inode_reset_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct v9fs_session_info *v9ses; + struct fscache_cookie *old; + + if (!v9inode->fscache) + return; + + old = v9inode->fscache; + + spin_lock(&v9inode->fscache_lock); + fscache_relinquish_cookie(v9inode->fscache, 1); + + v9ses = v9fs_inode2v9ses(inode); + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + v9inode); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", + inode, old, v9inode->fscache); + + spin_unlock(&v9inode->fscache_lock); +} + +int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + + BUG_ON(!v9inode->fscache); + + return fscache_maybe_release_page(v9inode->fscache, page, gfp); +} + +void __v9fs_fscache_invalidate_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + + BUG_ON(!v9inode->fscache); + + if (PageFsCache(page)) { + fscache_wait_on_page_write(v9inode->fscache, page); + BUG_ON(!PageLocked(page)); + fscache_uncache_page(v9inode->fscache, page); + } +} + +static void v9fs_vfs_readpage_complete(struct page *page, void *data, + int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +/** + * __v9fs_readpage_from_fscache - read a page from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. + */ + +int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (!v9inode->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(v9inode->fscache, + page, + v9fs_vfs_readpage_complete, + NULL, + GFP_KERNEL); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + return 1; + case 0: + P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + return ret; + default: + P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + return ret; + } +} + +/** + * __v9fs_readpages_from_fscache - read multiple pages from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. 
+ */ + +int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + if (!v9inode->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(v9inode->fscache, + mapping, pages, nr_pages, + v9fs_vfs_readpage_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + return 1; + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + return ret; + default: + P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + return ret; + } +} + +/** + * __v9fs_readpage_to_fscache - write a page to the cache + * + */ + +void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); + P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + if (ret != 0) + v9fs_uncache_page(inode, page); +} + +/* + * wait for a page to complete writing to the cache + */ +void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +{ + const struct v9fs_inode *v9inode = V9FS_I(inode); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (PageFsCache(page)) + fscache_wait_on_page_write(v9inode->fscache, page); +} diff --git a/fs/9p/cache.h b/fs/9p/cache.h new file mode 100644 index 00000000..40cc54ce --- /dev/null +++ b/fs/9p/cache.h @@ -0,0 +1,139 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _9P_CACHE_H +#ifdef CONFIG_9P_FSCACHE +#include +#include + +extern struct fscache_netfs v9fs_cache_netfs; +extern const struct fscache_cookie_def v9fs_cache_session_index_def; +extern const struct fscache_cookie_def v9fs_cache_inode_index_def; + +extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); +extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); + +extern void v9fs_cache_inode_get_cookie(struct inode *inode); +extern void v9fs_cache_inode_put_cookie(struct inode *inode); +extern void v9fs_cache_inode_flush_cookie(struct inode *inode); +extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); +extern void v9fs_cache_inode_reset_cookie(struct inode *inode); + +extern int __v9fs_cache_register(void); +extern void __v9fs_cache_unregister(void); + +extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp); +extern void __v9fs_fscache_invalidate_page(struct page *page); +extern int __v9fs_readpage_from_fscache(struct inode *inode, + struct page *page); +extern int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); +extern void __v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page); + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) +{ + return __v9fs_fscache_release_page(page, gfp); +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) +{ + __v9fs_fscache_invalidate_page(page); +} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return __v9fs_readpage_from_fscache(inode, page); +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return __v9fs_readpages_from_fscache(inode, mapping, pages, + nr_pages); +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __v9fs_readpage_to_fscache(inode, page); +} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + fscache_uncache_page(v9inode->fscache, page); + BUG_ON(PageFsCache(page)); +} + +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return __v9fs_fscache_wait_on_page_write(inode, page); +} + +#else /* CONFIG_9P_FSCACHE */ + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) { + return 1; +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) {} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{} + +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return; +} + 
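+
+/*
+ * Note (illustrative): with CONFIG_9P_FSCACHE disabled the wrappers above
+ * compile to trivial no-ops, so callers in vfs_addr.c can invoke
+ * v9fs_readpage_from_fscache() unconditionally and simply get -ENOBUFS
+ * back, with no #ifdef needed at the call site.
+ */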
+#endif /* CONFIG_9P_FSCACHE */ +#endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c new file mode 100644 index 00000000..85b67ffa --- /dev/null +++ b/fs/9p/fid.c @@ -0,0 +1,309 @@ +/* + * V9FS FID Management + * + * Copyright (C) 2007 by Latchesar Ionkov + * Copyright (C) 2005, 2006 by Eric Van Hensbergen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * v9fs_fid_add - add a fid to a dentry + * @dentry: dentry that the fid is being added to + * @fid: fid to add + * + */ + +int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +{ + struct v9fs_dentry *dent; + + P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_name.name); + + dent = dentry->d_fsdata; + if (!dent) { + dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL); + if (!dent) + return -ENOMEM; + + spin_lock_init(&dent->lock); + INIT_LIST_HEAD(&dent->fidlist); + dentry->d_fsdata = dent; + } + + spin_lock(&dent->lock); + list_add(&fid->dlist, &dent->fidlist); + spin_unlock(&dent->lock); + + return 0; +} + +/** + * v9fs_fid_find - retrieve a fid that belongs to the specified uid + * @dentry: dentry to look for fid in + * @uid: return fid that belongs to the specified user + * @any: if non-zero, return any fid associated with the dentry + * + */ + +static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) +{ + struct v9fs_dentry *dent; + struct p9_fid *fid, *ret; + + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", + dentry->d_name.name, dentry, uid, any); + dent = (struct v9fs_dentry *) dentry->d_fsdata; + ret = NULL; + if (dent) { + spin_lock(&dent->lock); + list_for_each_entry(fid, &dent->fidlist, dlist) { + if (any || fid->uid == uid) { + ret = fid; + break; + } + } + spin_unlock(&dent->lock); + } + + return ret; +} + +/* + * We need to hold v9ses->rename_sem as long as we hold references + * to returned path array. Array element contain pointers to + * dentry names. 
+ */ +static int build_path_from_dentry(struct v9fs_session_info *v9ses, + struct dentry *dentry, char ***names) +{ + int n = 0, i; + char **wnames; + struct dentry *ds; + + for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) + n++; + + wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + if (!wnames) + goto err_out; + + for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) + wnames[i] = (char *)ds->d_name.name; + + *names = wnames; + return n; +err_out: + return -ENOMEM; +} + +static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, + uid_t uid, int any) +{ + struct dentry *ds; + char **wnames, *uname; + int i, n, l, clone, access; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *old_fid = NULL; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; + fid = v9fs_fid_find(dentry, uid, any); + if (fid) + return fid; + /* + * We don't have a matching fid. To do a TWALK we need the + * parent fid. We need to prevent rename when we want to + * look at the parent. + */ + down_read(&v9ses->rename_sem); + ds = dentry->d_parent; + fid = v9fs_fid_find(ds, uid, any); + if (fid) { + /* Found the parent fid; do a lookup with that */ + fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + goto fid_out; + } + up_read(&v9ses->rename_sem); + + /* start from the root and try to do a lookup */ + fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!fid) { + /* the user is not attached to the fs yet */ + if (access == V9FS_ACCESS_SINGLE) + return ERR_PTR(-EPERM); + + if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) + uname = NULL; + else + uname = v9ses->uname; + + fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, + v9ses->aname); + if (IS_ERR(fid)) + return fid; + + v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are the root ourselves, just return that */ + if (dentry->d_sb->s_root == dentry) + return fid; + /* + * Do a multipath walk with the attached root. + * When walking the parent we need to make sure we + * don't have a parallel rename happening. + */ + down_read(&v9ses->rename_sem); + n = build_path_from_dentry(v9ses, dentry, &wnames); + if (n < 0) { + fid = ERR_PTR(n); + goto err_out; + } + clone = 1; + i = 0; + while (i < n) { + l = min(n - i, P9_MAXWELEM); + /* + * We need to hold the rename lock when doing a multipath + * walk to ensure none of the path components change. + */ + fid = p9_client_walk(fid, l, &wnames[i], clone); + if (IS_ERR(fid)) { + if (old_fid) { + /* + * If we fail, clunk the fids that map to intermediate + * path components, not the last component + * of the path. + */ + p9_client_clunk(old_fid); + } + kfree(wnames); + goto err_out; + } + old_fid = fid; + i += l; + clone = 0; + } + kfree(wnames); +fid_out: + if (!IS_ERR(fid)) + v9fs_fid_add(dentry, fid); +err_out: + up_read(&v9ses->rename_sem); + return fid; +} + +/** + * v9fs_fid_lookup - look up a fid, try to walk if not found + * @dentry: dentry to look for fid in + * + * Look for a fid in the specified dentry for the current user. + * If no fid is found, try to create one by walking from a fid of the parent + * dentry (if it has one), or the root dentry. If the user hasn't accessed + * the fs yet, attach now and walk from the root.
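+ *
+ * For illustration (path assumed): with only the root fid attached, a lookup
+ * of /usr/share reduces to a single multi-component walk,
+ *	fid = p9_client_walk(root_fid, 2, (char *[]){"usr", "share"}, 1);
+ * and longer paths are walked in chunks of at most P9_MAXWELEM components.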
+ */ + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +{ + uid_t uid; + int any, access; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; + switch (access) { + case V9FS_ACCESS_SINGLE: + case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: + uid = current_fsuid(); + any = 0; + break; + + case V9FS_ACCESS_ANY: + uid = v9ses->uid; + any = 1; + break; + + default: + uid = ~0; + any = 0; + break; + } + return v9fs_fid_lookup_with_uid(dentry, uid, any); +} + +struct p9_fid *v9fs_fid_clone(struct dentry *dentry) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup_with_uid(dentry, uid, 0); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) +{ + int err; + struct p9_fid *fid; + + fid = v9fs_fid_clone_with_uid(dentry, 0); + if (IS_ERR(fid)) + goto error_out; + /* + * writeback fid will only be used to write back the + * dirty pages. We always request for the open fid in read-write + * mode so that a partial page write which result in page + * read can work. + */ + err = p9_client_open(fid, O_RDWR); + if (err < 0) { + p9_client_clunk(fid); + fid = ERR_PTR(err); + goto error_out; + } +error_out: + return fid; +} diff --git a/fs/9p/fid.h b/fs/9p/fid.h new file mode 100644 index 00000000..bb0b6e7f --- /dev/null +++ b/fs/9p/fid.h @@ -0,0 +1,50 @@ +/* + * V9FS FID Management + * + * Copyright (C) 2005 by Eric Van Hensbergen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_FID_H +#define FS_9P_FID_H +#include + +/** + * struct v9fs_dentry - 9p private data stored in dentry d_fsdata + * @lock: protects the fidlist + * @fidlist: list of FIDs currently associated with this dentry + * + * This structure defines the 9p private data associated with + * a particular dentry. In particular, this private data is used + * to lookup which 9P FID handle should be used for a particular VFS + * operation. FID handles are associated with dentries instead of + * inodes in order to more closely map functionality to the Plan 9 + * expected behavior for FID reclaimation and tracking. 
+ * + * See Also: Mapping FIDs to Linux VFS model in + * Design and Implementation of the Linux 9P File System documentation + */ +struct v9fs_dentry { + spinlock_t lock; /* protect fidlist */ + struct list_head fidlist; +}; + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); +struct p9_fid *v9fs_fid_clone(struct dentry *dentry); +int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); +#endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c new file mode 100644 index 00000000..ef966188 --- /dev/null +++ b/fs/9p/v9fs.c @@ -0,0 +1,624 @@ +/* + * linux/fs/9p/v9fs.c + * + * This file contains functions assisting in mapping VFS to 9P2000 + * + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "cache.h" + +static DEFINE_SPINLOCK(v9fs_sessionlist_lock); +static LIST_HEAD(v9fs_sessionlist); +struct kmem_cache *v9fs_inode_cache; + +/* + * Option Parsing (code inspired by NFS code) + * NOTE: each transport will parse its own options + */ + +enum { + /* Options that take integer arguments */ + Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, + /* String options */ + Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag, + /* Options that take no arguments */ + Opt_nodevmap, + /* Cache options */ + Opt_cache_loose, Opt_fscache, + /* Access options */ + Opt_access, Opt_posixacl, + /* Error token */ + Opt_err +}; + +static const match_table_t tokens = { + {Opt_debug, "debug=%x"}, + {Opt_dfltuid, "dfltuid=%u"}, + {Opt_dfltgid, "dfltgid=%u"}, + {Opt_afid, "afid=%u"}, + {Opt_uname, "uname=%s"}, + {Opt_remotename, "aname=%s"}, + {Opt_nodevmap, "nodevmap"}, + {Opt_cache, "cache=%s"}, + {Opt_cache_loose, "loose"}, + {Opt_fscache, "fscache"}, + {Opt_cachetag, "cachetag=%s"}, + {Opt_access, "access=%s"}, + {Opt_posixacl, "posixacl"}, + {Opt_err, NULL} +}; + +/* Interpret mount options for cache mode */ +static int get_cache_mode(char *s) +{ + int version = -EINVAL; + + if (!strcmp(s, "loose")) { + version = CACHE_LOOSE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + } else if (!strcmp(s, "fscache")) { + version = CACHE_FSCACHE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "none")) { + version = CACHE_NONE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + } else + printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + return version; +} + +/** + * v9fs_parse_options - parse mount options into session structure + * @v9ses: existing v9fs session information + * + * Return 0 upon success, -ERRNO upon failure. 
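+ *
+ * An illustrative (assumed) option string handled here would be
+ *	"cache=loose,access=user,posixacl,dfltuid=1000,dfltgid=1000"
+ * while transport-level options such as trans= are consumed by the 9p
+ * client rather than by this parser.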
+ */ + +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) +{ + char *options, *tmp_options; + substring_t args[MAX_OPT_ARGS]; + char *p; + int option = 0; + char *s, *e; + int ret = 0; + + /* setup defaults */ + v9ses->afid = ~0; + v9ses->debug = 0; + v9ses->cache = CACHE_NONE; +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = NULL; +#endif + + if (!opts) + return 0; + + tmp_options = kstrdup(opts, GFP_KERNEL); + if (!tmp_options) { + ret = -ENOMEM; + goto fail_option_alloc; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + token = match_token(p, tokens, args); + if (token < Opt_uname) { + int r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + } + switch (token) { + case Opt_debug: + v9ses->debug = option; +#ifdef CONFIG_NET_9P_DEBUG + p9_debug_level = option; +#endif + break; + + case Opt_dfltuid: + v9ses->dfltuid = option; + break; + case Opt_dfltgid: + v9ses->dfltgid = option; + break; + case Opt_afid: + v9ses->afid = option; + break; + case Opt_uname: + match_strlcpy(v9ses->uname, &args[0], PATH_MAX); + break; + case Opt_remotename: + match_strlcpy(v9ses->aname, &args[0], PATH_MAX); + break; + case Opt_nodevmap: + v9ses->nodev = 1; + break; + case Opt_cache_loose: + v9ses->cache = CACHE_LOOSE; + break; + case Opt_fscache: + v9ses->cache = CACHE_FSCACHE; + break; + case Opt_cachetag: +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = match_strdup(&args[0]); +#endif + break; + case Opt_cache: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + P9_DPRINTK(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); + goto free_and_return; + } + ret = get_cache_mode(s); + if (ret == -EINVAL) { + kfree(s); + goto free_and_return; + } + + v9ses->cache = ret; + kfree(s); + break; + + case Opt_access: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + P9_DPRINTK(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); + goto free_and_return; + } + + v9ses->flags &= ~V9FS_ACCESS_MASK; + if (strcmp(s, "user") == 0) + v9ses->flags |= V9FS_ACCESS_USER; + else if (strcmp(s, "any") == 0) + v9ses->flags |= V9FS_ACCESS_ANY; + else if (strcmp(s, "client") == 0) { + v9ses->flags |= V9FS_ACCESS_CLIENT; + } else { + v9ses->flags |= V9FS_ACCESS_SINGLE; + v9ses->uid = simple_strtoul(s, &e, 10); + if (*e != '\0') { + ret = -EINVAL; + printk(KERN_INFO "9p: Unknown access " + "argument %s.\n", s); + kfree(s); + goto free_and_return; + } + } + + kfree(s); + break; + + case Opt_posixacl: +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_POSIX_ACL; +#else + P9_DPRINTK(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. 
" + "Ignoring posixacl option\n"); +#endif + break; + + default: + continue; + } + } + +free_and_return: + kfree(tmp_options); +fail_option_alloc: + return ret; +} + +/** + * v9fs_session_init - initialize session + * @v9ses: session information structure + * @dev_name: device being mounted + * @data: options + * + */ + +struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, + const char *dev_name, char *data) +{ + int retval = -EINVAL; + struct p9_fid *fid; + int rc; + + v9ses->uname = __getname(); + if (!v9ses->uname) + return ERR_PTR(-ENOMEM); + + v9ses->aname = __getname(); + if (!v9ses->aname) { + __putname(v9ses->uname); + return ERR_PTR(-ENOMEM); + } + init_rwsem(&v9ses->rename_sem); + + rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + if (rc) { + __putname(v9ses->aname); + __putname(v9ses->uname); + return ERR_PTR(rc); + } + + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); + + strcpy(v9ses->uname, V9FS_DEFUSER); + strcpy(v9ses->aname, V9FS_DEFANAME); + v9ses->uid = ~0; + v9ses->dfltuid = V9FS_DEFUID; + v9ses->dfltgid = V9FS_DEFGID; + + v9ses->clnt = p9_client_create(dev_name, data); + if (IS_ERR(v9ses->clnt)) { + retval = PTR_ERR(v9ses->clnt); + v9ses->clnt = NULL; + P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + goto error; + } + + v9ses->flags = V9FS_ACCESS_USER; + + if (p9_is_proto_dotl(v9ses->clnt)) { + v9ses->flags = V9FS_ACCESS_CLIENT; + v9ses->flags |= V9FS_PROTO_2000L; + } else if (p9_is_proto_dotu(v9ses->clnt)) { + v9ses->flags |= V9FS_PROTO_2000U; + } + + rc = v9fs_parse_options(v9ses, data); + if (rc < 0) { + retval = rc; + goto error; + } + + v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; + + if (!v9fs_proto_dotl(v9ses) && + ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACCESS_CLIENT only for dotl. + * Fall back to ACCESS_USER + */ + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_USER; + } + /*FIXME !! */ + /* for legacy mode, fall back to V9FS_ACCESS_ANY */ + if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && + ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { + + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_ANY; + v9ses->uid = ~0; + } + if (!v9fs_proto_dotl(v9ses) || + !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACL checks on clinet only if the protocol is + * 9P2000.L and access is V9FS_ACCESS_CLIENT. 
+ */ + v9ses->flags &= ~V9FS_ACL_MASK; + } + + fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, + v9ses->aname); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + goto error; + } + + if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) + fid->uid = v9ses->uid; + else + fid->uid = ~0; + +#ifdef CONFIG_9P_FSCACHE + /* register the session for caching */ + v9fs_cache_session_get_cookie(v9ses); +#endif + + return fid; + +error: + bdi_destroy(&v9ses->bdi); + return ERR_PTR(retval); +} + +/** + * v9fs_session_close - shutdown a session + * @v9ses: session information structure + * + */ + +void v9fs_session_close(struct v9fs_session_info *v9ses) +{ + if (v9ses->clnt) { + p9_client_destroy(v9ses->clnt); + v9ses->clnt = NULL; + } + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->fscache) { + v9fs_cache_session_put_cookie(v9ses); + kfree(v9ses->cachetag); + } +#endif + __putname(v9ses->uname); + __putname(v9ses->aname); + + bdi_destroy(&v9ses->bdi); + + spin_lock(&v9fs_sessionlist_lock); + list_del(&v9ses->slist); + spin_unlock(&v9fs_sessionlist_lock); +} + +/** + * v9fs_session_cancel - terminate a session + * @v9ses: session to terminate + * + * mark transport as disconnected and cancel all pending requests. + */ + +void v9fs_session_cancel(struct v9fs_session_info *v9ses) { + P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_client_disconnect(v9ses->clnt); +} + +/** + * v9fs_session_begin_cancel - Begin terminate of a session + * @v9ses: session to terminate + * + * After this call we don't allow any request other than clunk. + */ + +void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) +{ + P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_client_begin_disconnect(v9ses->clnt); +} + +extern int v9fs_error_init(void); + +static struct kobject *v9fs_kobj; + +#ifdef CONFIG_9P_FSCACHE +/** + * caches_show - list caches associated with a session + * + * Returns the size of buffer written. 
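+ *
+ * With the "9p" kobject registered under fs_kobj in v9fs_sysfs_init(), this
+ * attribute is expected to show up as /sys/fs/9p/caches (path assumed),
+ * listing one cachetag per line for the fscache-enabled sessions.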
+ */ + +static ssize_t caches_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t n = 0, count = 0, limit = PAGE_SIZE; + struct v9fs_session_info *v9ses; + + spin_lock(&v9fs_sessionlist_lock); + list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { + if (v9ses->cachetag) { + n = snprintf(buf, limit, "%s\n", v9ses->cachetag); + if (n < 0) { + count = n; + break; + } + + count += n; + limit -= n; + } + } + + spin_unlock(&v9fs_sessionlist_lock); + return count; +} + +static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); +#endif /* CONFIG_9P_FSCACHE */ + +static struct attribute *v9fs_attrs[] = { +#ifdef CONFIG_9P_FSCACHE + &v9fs_attr_cache.attr, +#endif + NULL, +}; + +static struct attribute_group v9fs_attr_group = { + .attrs = v9fs_attrs, +}; + +/** + * v9fs_sysfs_init - Initialize the v9fs sysfs interface + * + */ + +static int v9fs_sysfs_init(void) +{ + v9fs_kobj = kobject_create_and_add("9p", fs_kobj); + if (!v9fs_kobj) + return -ENOMEM; + + if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { + kobject_put(v9fs_kobj); + return -ENOMEM; + } + + return 0; +} + +/** + * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface + * + */ + +static void v9fs_sysfs_cleanup(void) +{ + sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); + kobject_put(v9fs_kobj); +} + +static void v9fs_inode_init_once(void *foo) +{ + struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; +#endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); + inode_init_once(&v9inode->vfs_inode); +} + +/** + * v9fs_init_inode_cache - initialize a cache for 9P + * Returns 0 on success. + */ +static int v9fs_init_inode_cache(void) +{ + v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", + sizeof(struct v9fs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + v9fs_inode_init_once); + if (!v9fs_inode_cache) + return -ENOMEM; + + return 0; +} + +/** + * v9fs_destroy_inode_cache - destroy the cache of 9P inode + * + */ +static void v9fs_destroy_inode_cache(void) +{ + kmem_cache_destroy(v9fs_inode_cache); +} + +static int v9fs_cache_register(void) +{ + int ret; + ret = v9fs_init_inode_cache(); + if (ret < 0) + return ret; +#ifdef CONFIG_9P_FSCACHE + return fscache_register_netfs(&v9fs_cache_netfs); +#else + return ret; +#endif +} + +static void v9fs_cache_unregister(void) +{ + v9fs_destroy_inode_cache(); +#ifdef CONFIG_9P_FSCACHE + fscache_unregister_netfs(&v9fs_cache_netfs); +#endif +} + +/** + * init_v9fs - Initialize module + * + */ + +static int __init init_v9fs(void) +{ + int err; + printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); + /* TODO: Setup list of registered trasnport modules */ + err = register_filesystem(&v9fs_fs_type); + if (err < 0) { + printk(KERN_ERR "Failed to register filesystem\n"); + return err; + } + + err = v9fs_cache_register(); + if (err < 0) { + printk(KERN_ERR "Failed to register v9fs for caching\n"); + goto out_fs_unreg; + } + + err = v9fs_sysfs_init(); + if (err < 0) { + printk(KERN_ERR "Failed to register with sysfs\n"); + goto out_sysfs_cleanup; + } + + return 0; + +out_sysfs_cleanup: + v9fs_sysfs_cleanup(); + +out_fs_unreg: + unregister_filesystem(&v9fs_fs_type); + + return err; +} + +/** + * exit_v9fs - shutdown module + * + */ + +static void __exit exit_v9fs(void) +{ + v9fs_sysfs_cleanup(); + v9fs_cache_unregister(); + unregister_filesystem(&v9fs_fs_type); +} + +module_init(init_v9fs) +module_exit(exit_v9fs) + +MODULE_AUTHOR("Latchesar Ionkov "); +MODULE_AUTHOR("Eric Van 
Hensbergen "); +MODULE_AUTHOR("Ron Minnich "); +MODULE_LICENSE("GPL"); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h new file mode 100644 index 00000000..e78956cb --- /dev/null +++ b/fs/9p/v9fs.h @@ -0,0 +1,227 @@ +/* + * V9FS definitions. + * + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_V9FS_H +#define FS_9P_V9FS_H + +#include + +/** + * enum p9_session_flags - option flags for each 9P session + * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions + * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy + * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) + * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. + * @V9FS_ACCESS_ANY: use a single attach for all users + * @V9FS_ACCESS_MASK: bit mask of different ACCESS options + * @V9FS_POSIX_ACL: POSIX ACLs are enforced + * + * Session flags reflect options selected by users at mount time + */ +#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ + V9FS_ACCESS_USER | \ + V9FS_ACCESS_CLIENT) +#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY +#define V9FS_ACL_MASK V9FS_POSIX_ACL + +enum p9_session_flags { + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2000L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20 +}; + +/* possible values of ->cache */ +/** + * enum p9_cache_modes - user specified cache preferences + * @CACHE_NONE: do not cache data, dentries, or directory contents (default) + * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency + * + * eventually support loose, tight, time, session, default always none + */ + +enum p9_cache_modes { + CACHE_NONE, + CACHE_LOOSE, + CACHE_FSCACHE, +}; + +/** + * struct v9fs_session_info - per-instance session information + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_modes + * @cachetag: the tag of the cache associated with this session + * @fscache: session cookie associated with FS-Cache + * @options: copy of options string given by user + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @maxdata: maximum data to be sent/recvd per protocol message + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @clnt: reference to 9P network client instantiated for this session + * @slist: reference to list of registered 9p sessions + * + * This structure holds state for each session instance established during + * a sys_mount() . 
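+ *
+ * For illustration (server address and options assumed), a mount such as
+ *	mount -t 9p -o trans=tcp,access=user,cache=loose 192.168.0.2 /mnt/9p
+ * creates exactly one of these structures.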
+ * + * Bugs: there seems to be a lot of state which could be condensed and/or + * removed. + */ + +struct v9fs_session_info { + /* options */ + unsigned char flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; + struct fscache_cookie *fscache; +#endif + + char *uname; /* user name to mount as */ + char *aname; /* name of remote hierarchy being mounted */ + unsigned int maxdata; /* max data for client interface */ + unsigned int dfltuid; /* default uid/muid for legacy support */ + unsigned int dfltgid; /* default gid for legacy support */ + u32 uid; /* if ACCESS_SINGLE, the uid that has access */ + struct p9_client *clnt; /* 9p client */ + struct list_head slist; /* list of sessions registered with v9fs */ + struct backing_dev_info bdi; + struct rw_semaphore rename_sem; +}; + +/* cache_validity flags */ +#define V9FS_INO_INVALID_ATTR 0x01 + +struct v9fs_inode { +#ifdef CONFIG_9P_FSCACHE + spinlock_t fscache_lock; + struct fscache_cookie *fscache; +#endif + struct p9_qid qid; + unsigned int cache_validity; + struct p9_fid *writeback_fid; + struct mutex v_mutex; + struct inode vfs_inode; +}; + +static inline struct v9fs_inode *V9FS_I(const struct inode *inode) +{ + return container_of(inode, struct v9fs_inode, vfs_inode); +} + +struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, + char *); +extern void v9fs_session_close(struct v9fs_session_info *v9ses); +extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nameidata); +extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); +extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *p); +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); +extern const struct inode_operations v9fs_dir_inode_operations_dotl; +extern const struct inode_operations v9fs_file_inode_operations_dotl; +extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); + +/* other default globals */ +#define V9FS_PORT 564 +#define V9FS_DEFUSER "nobody" +#define V9FS_DEFANAME "" +#define V9FS_DEFUID (-2) +#define V9FS_DEFGID (-2) + +static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) +{ + return (inode->i_sb->s_fs_info); +} + +static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) +{ + return dentry->d_sb->s_fs_info; +} + +static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000U; +} + +static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000L; +} + +/** + * v9fs_get_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct 
p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 0); +} + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); +} + +#endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h new file mode 100644 index 00000000..f9a28eab --- /dev/null +++ b/fs/9p/v9fs_vfs.h @@ -0,0 +1,87 @@ +/* + * V9FS VFS extensions. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_V9FS_VFS_H +#define FS_9P_V9FS_VFS_H + +/* plan9 semantics are that created files are implicitly opened. + * But linux semantics are that you call create, then open. + * the plan9 approach is superior as it provides an atomic + * open. + * we track the create fid here. When the file is opened, if fidopen is + * non-zero, we use the fid and can skip some steps. + * there may be a better way to do this, but I don't know it. + * one BAD way is to clunk the fid on create, then open it again: + * you lose the atomicity of file open + */ + +/* special case: + * unlink calls remove, which is an implicit clunk. So we have to track + * that kind of thing so that we don't try to clunk a dead fid. 
+ */ +#define P9_LOCK_TIMEOUT (30*HZ) + +extern struct file_system_type v9fs_fs_type; +extern const struct address_space_operations v9fs_addr_operations; +extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_file_operations_dotl; +extern const struct file_operations v9fs_dir_operations; +extern const struct file_operations v9fs_dir_operations_dotl; +extern const struct dentry_operations v9fs_dentry_operations; +extern const struct dentry_operations v9fs_cached_dentry_operations; +extern const struct file_operations v9fs_cached_file_operations; +extern const struct file_operations v9fs_cached_file_operations_dotl; +extern struct kmem_cache *v9fs_inode_cache; + +struct inode *v9fs_alloc_inode(struct super_block *sb); +void v9fs_destroy_inode(struct inode *inode); +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode, dev_t); +void v9fs_evict_inode(struct inode *inode); +ino_t v9fs_qid2ino(struct p9_qid *qid); +void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +int v9fs_dir_release(struct inode *inode, struct file *filp); +int v9fs_file_open(struct inode *inode, struct file *file); +void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); +int v9fs_uflags2omode(int uflags, int extended); + +ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); +ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64); +void v9fs_blank_wstat(struct p9_wstat *wstat); +int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); +int v9fs_file_fsync_dotl(struct file *filp, int datasync); +ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *, + const char __user *, size_t, loff_t *, int); +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); +static inline void v9fs_invalidate_inode_attr(struct inode *inode) +{ + struct v9fs_inode *v9inode; + v9inode = V9FS_I(inode); + v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; + return; +} + +int v9fs_open_to_dotl_flags(int flags); +#endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c new file mode 100644 index 00000000..2524e4cb --- /dev/null +++ b/fs/9p/vfs_addr.c @@ -0,0 +1,353 @@ +/* + * linux/fs/9p/vfs_addr.c + * + * This file contians vfs address (mmap) ops for 9P2000. + * + * Copyright (C) 2005 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "cache.h" +#include "fid.h" + +/** + * v9fs_fid_readpage - read an entire page in from 9P + * + * @fid: fid being read + * @page: structure to page + * + */ +static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) +{ + int retval; + loff_t offset; + char *buffer; + struct inode *inode; + + inode = page->mapping->host; + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + BUG_ON(!PageLocked(page)); + + retval = v9fs_readpage_from_fscache(inode, page); + if (retval == 0) + return retval; + + buffer = kmap(page); + offset = page_offset(page); + + retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset); + if (retval < 0) { + v9fs_uncache_page(inode, page); + goto done; + } + + memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval); + flush_dcache_page(page); + SetPageUptodate(page); + + v9fs_readpage_to_fscache(inode, page); + retval = 0; + +done: + kunmap(page); + unlock_page(page); + return retval; +} + +/** + * v9fs_vfs_readpage - read an entire page in from 9P + * + * @filp: file being read + * @page: structure to page + * + */ + +static int v9fs_vfs_readpage(struct file *filp, struct page *page) +{ + return v9fs_fid_readpage(filp->private_data, page); +} + +/** + * v9fs_vfs_readpages - read a set of pages from 9P + * + * @filp: file being read + * @mapping: the address space + * @pages: list of pages to read + * @nr_pages: count of pages to read + * + */ + +static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret = 0; + struct inode *inode; + + inode = mapping->host; + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + + ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); + if (ret == 0) + return ret; + + ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); + P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + return ret; +} + +/** + * v9fs_release_page - release the private state associated with a page + * + * Returns 1 if the page can be released, false otherwise. 
+ */ + +static int v9fs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + return v9fs_fscache_release_page(page, gfp); +} + +/** + * v9fs_invalidate_page - Invalidate a page completely or partially + * + * @page: structure to page + * @offset: offset in the page + */ + +static void v9fs_invalidate_page(struct page *page, unsigned long offset) +{ + /* + * If called with zero offset, we should release + * the private state assocated with the page + */ + if (offset == 0) + v9fs_fscache_invalidate_page(page); +} + +static int v9fs_vfs_writepage_locked(struct page *page) +{ + char *buffer; + int retval, len; + loff_t offset, size; + mm_segment_t old_fs; + struct v9fs_inode *v9inode; + struct inode *inode = page->mapping->host; + + v9inode = V9FS_I(inode); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + set_page_writeback(page); + + buffer = kmap(page); + offset = page_offset(page); + + old_fs = get_fs(); + set_fs(get_ds()); + /* We should have writeback_fid always set */ + BUG_ON(!v9inode->writeback_fid); + + retval = v9fs_file_write_internal(inode, + v9inode->writeback_fid, + (__force const char __user *)buffer, + len, &offset, 0); + if (retval > 0) + retval = 0; + + set_fs(old_fs); + kunmap(page); + end_page_writeback(page); + return retval; +} + +static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int retval; + + retval = v9fs_vfs_writepage_locked(page); + if (retval < 0) { + if (retval == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + retval = 0; + } else { + SetPageError(page); + mapping_set_error(page->mapping, retval); + } + } else + retval = 0; + + unlock_page(page); + return retval; +} + +/** + * v9fs_launder_page - Writeback a dirty page + * Returns 0 on success. + */ + +static int v9fs_launder_page(struct page *page) +{ + int retval; + struct inode *inode = page->mapping->host; + + v9fs_fscache_wait_on_page_write(inode, page); + if (clear_page_dirty_for_io(page)) { + retval = v9fs_vfs_writepage_locked(page); + if (retval) + return retval; + } + return 0; +} + +/** + * v9fs_direct_IO - 9P address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of v9fs_direct_IO() in the address space ops vector + * allowes open() O_DIRECT flags which would have failed otherwise. + * + * In the non-cached mode, we shunt off direct read and write requests before + * the VFS gets them, so this method should never be called. + * + * Direct IO is not 'yet' supported in the cached mode. Hence when + * this routine is called through generic_file_aio_read(), the read/write fails + * with an error. 
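+ *
+ * Illustrative effect for applications: open(path, O_RDWR | O_DIRECT)
+ * succeeds, but in cached mode the subsequent read()/write() is routed
+ * through here and fails with -EINVAL.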
+ * + */ +static ssize_t +v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + /* + * FIXME + * Now that we do caching with cache mode enabled, We need + * to support direct IO + */ + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " + "off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long) pos, nr_segs); + + return -EINVAL; +} + +static int v9fs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int retval = 0; + struct page *page; + struct v9fs_inode *v9inode; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = mapping->host; + + v9inode = V9FS_I(inode); +start: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + retval = -ENOMEM; + goto out; + } + BUG_ON(!v9inode->writeback_fid); + if (PageUptodate(page)) + goto out; + + if (len == PAGE_CACHE_SIZE) + goto out; + + retval = v9fs_fid_readpage(v9inode->writeback_fid, page); + page_cache_release(page); + if (!retval) + goto start; +out: + *pagep = page; + return retval; +} + +static int v9fs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + + if (unlikely(copied < len)) { + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) { + inode_add_bytes(inode, last_pos - inode->i_size); + i_size_write(inode, last_pos); + } + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + + +const struct address_space_operations v9fs_addr_operations = { + .readpage = v9fs_vfs_readpage, + .readpages = v9fs_vfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = v9fs_vfs_writepage, + .write_begin = v9fs_write_begin, + .write_end = v9fs_write_end, + .releasepage = v9fs_release_page, + .invalidatepage = v9fs_invalidate_page, + .launder_page = v9fs_launder_page, + .direct_IO = v9fs_direct_IO, +}; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c new file mode 100644 index 00000000..e022890c --- /dev/null +++ b/fs/9p/vfs_dentry.c @@ -0,0 +1,147 @@ +/* + * linux/fs/9p/vfs_dentry.c + * + * This file contians vfs dentry ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * v9fs_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question + * + * By returning 1 here we should remove cacheing of unused + * dentry components. + * + */ + +static int v9fs_dentry_delete(const struct dentry *dentry) +{ + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, + dentry); + + return 1; +} + +/** + * v9fs_cached_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question + * + */ +static int v9fs_cached_dentry_delete(const struct dentry *dentry) +{ + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); + + /* Don't cache negative dentries */ + if (!dentry->d_inode) + return 1; + return 0; +} + +/** + * v9fs_dentry_release - called when dentry is going to be freed + * @dentry: dentry that is being release + * + */ + +static void v9fs_dentry_release(struct dentry *dentry) +{ + struct v9fs_dentry *dent; + struct p9_fid *temp, *current_fid; + + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, + dentry); + dent = dentry->d_fsdata; + if (dent) { + list_for_each_entry_safe(current_fid, temp, &dent->fidlist, + dlist) { + p9_client_clunk(current_fid); + } + + kfree(dent); + dentry->d_fsdata = NULL; + } +} + +static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct p9_fid *fid; + struct inode *inode; + struct v9fs_inode *v9inode; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!inode) + goto out_valid; + + v9inode = V9FS_I(inode); + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + int retval; + struct v9fs_session_info *v9ses; + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + retval = v9fs_refresh_inode_dotl(fid, inode); + else + retval = v9fs_refresh_inode(fid, inode); + if (retval == -ENOENT) + return 0; + if (retval < 0) + return retval; + } +out_valid: + return 1; +} + +const struct dentry_operations v9fs_cached_dentry_operations = { + .d_revalidate = v9fs_lookup_revalidate, + .d_delete = v9fs_cached_dentry_delete, + .d_release = v9fs_dentry_release, +}; + +const struct dentry_operations v9fs_dentry_operations = { + .d_delete = v9fs_dentry_delete, + .d_release = v9fs_dentry_release, +}; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c new file mode 100644 index 00000000..9c2bdda5 --- /dev/null +++ b/fs/9p/vfs_dir.c @@ -0,0 +1,318 @@ +/* + * linux/fs/9p/vfs_dir.c + * + * This file contains vfs directory ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * struct p9_rdir - readdir accounting + * @mutex: mutex protecting readdir + * @head: start offset of current dirread buffer + * @tail: end offset of current dirread buffer + * @buf: dirread buffer + * + * private structure for keeping track of readdir + * allocated on demand + */ + +struct p9_rdir { + struct mutex mutex; + int head; + int tail; + uint8_t *buf; +}; + +/** + * dt_type - return file type + * @mistat: mistat structure + * + */ + +static inline int dt_type(struct p9_wstat *mistat) +{ + unsigned long perm = mistat->mode; + int rettype = DT_REG; + + if (perm & P9_DMDIR) + rettype = DT_DIR; + if (perm & P9_DMSYMLINK) + rettype = DT_LNK; + + return rettype; +} + +static void p9stat_init(struct p9_wstat *stbuf) +{ + stbuf->name = NULL; + stbuf->uid = NULL; + stbuf->gid = NULL; + stbuf->muid = NULL; + stbuf->extension = NULL; +} + +/** + * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir + * @filp: opened file structure + * @buflen: Length in bytes of buffer to allocate + * + */ + +static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +{ + struct p9_rdir *rdir; + struct p9_fid *fid; + int err = 0; + + fid = filp->private_data; + if (!fid->rdir) { + rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + + if (rdir == NULL) { + err = -ENOMEM; + goto exit; + } + spin_lock(&filp->f_dentry->d_lock); + if (!fid->rdir) { + rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); + mutex_init(&rdir->mutex); + rdir->head = rdir->tail = 0; + fid->rdir = (void *) rdir; + rdir = NULL; + } + spin_unlock(&filp->f_dentry->d_lock); + kfree(rdir); + } +exit: + return err; +} + +/** + * v9fs_dir_readdir - read a directory + * @filp: opened file structure + * @dirent: directory structure ??? + * @filldir: function to populate directory structure ??? 
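+ * + * Reads entries from the server into the per-fid rdir buffer and + * passes them to @filldir until the buffer or the directory is + * exhausted.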
+ * + */ + +static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + int over; + struct p9_wstat st; + int err = 0; + struct p9_fid *fid; + int buflen; + int reclen = 0; + struct p9_rdir *rdir; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_IOHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + while (err == 0) { + if (rdir->tail == rdir->head) { + err = v9fs_file_readn(filp, rdir->buf, NULL, + buflen, filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + while (rdir->head < rdir->tail) { + p9stat_init(&st); + err = p9stat_read(rdir->buf + rdir->head, + rdir->tail - rdir->head, &st, + fid->clnt->proto_version); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + p9stat_free(&st); + goto unlock_and_exit; + } + reclen = st.size+2; + + over = filldir(dirent, st.name, strlen(st.name), + filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); + + p9stat_free(&st); + + if (over) { + err = 0; + goto unlock_and_exit; + } + rdir->head += reclen; + filp->f_pos += reclen; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + +/** + * v9fs_dir_readdir_dotl - read a directory + * @filp: opened file structure + * @dirent: buffer to fill dirent structures + * @filldir: function to populate dirent structures + * + */ +static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, + filldir_t filldir) +{ + int over; + int err = 0; + struct p9_fid *fid; + int buflen; + struct p9_rdir *rdir; + struct p9_dirent curdirent; + u64 oldoffset = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_READDIRHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + + while (err == 0) { + if (rdir->tail == rdir->head) { + err = p9_client_readdir(fid, rdir->buf, buflen, + filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + + err = p9dirent_read(rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent, + fid->clnt->proto_version); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + goto unlock_and_exit; + } + + /* d_off in dirent structure tracks the offset into + * the next dirent in the dir. However, filldir() + * expects offset into the current dirent. Hence + * while calling filldir send the offset from the + * previous dirent structure. 
+ */ + over = filldir(dirent, curdirent.d_name, + strlen(curdirent.d_name), + oldoffset, v9fs_qid2ino(&curdirent.qid), + curdirent.d_type); + oldoffset = curdirent.d_off; + + if (over) { + err = 0; + goto unlock_and_exit; + } + + filp->f_pos = curdirent.d_off; + rdir->head += err; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + + +/** + * v9fs_dir_release - close a directory + * @inode: inode of the directory + * @filp: file pointer to a directory + * + */ + +int v9fs_dir_release(struct inode *inode, struct file *filp) +{ + struct p9_fid *fid; + + fid = filp->private_data; + P9_DPRINTK(P9_DEBUG_VFS, + "v9fs_dir_release: inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); + if (fid) + p9_client_clunk(fid); + return 0; +} + +const struct file_operations v9fs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir, + .open = v9fs_file_open, + .release = v9fs_dir_release, +}; + +const struct file_operations v9fs_dir_operations_dotl = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir_dotl, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c new file mode 100644 index 00000000..9d6e1685 --- /dev/null +++ b/fs/9p/vfs_file.c @@ -0,0 +1,771 @@ +/* + * linux/fs/9p/vfs_file.c + * + * This file contians vfs file ops for 9P2000. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" + +static const struct vm_operations_struct v9fs_file_vm_ops; + +/** + * v9fs_file_open - open a file (or directory) + * @inode: inode to be opened + * @file: file being opened + * + */ + +int v9fs_file_open(struct inode *inode, struct file *file) +{ + int err; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + int omode; + + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + v9inode = V9FS_I(inode); + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + omode = v9fs_open_to_dotl_flags(file->f_flags); + else + omode = v9fs_uflags2omode(file->f_flags, + v9fs_proto_dotu(v9ses)); + fid = file->private_data; + if (!fid) { + fid = v9fs_fid_clone(file->f_path.dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + err = p9_client_open(fid, omode); + if (err < 0) { + p9_client_clunk(fid); + return err; + } + if (file->f_flags & O_TRUNC) { + i_size_write(inode, 0); + inode->i_blocks = 0; + } + if ((file->f_flags & O_APPEND) && + (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) + generic_file_llseek(file, 0, SEEK_END); + } + + file->private_data = fid; + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((file->f_flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
+ */ + fid = v9fs_writeback_fid(file->f_path.dentry); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + goto out_error; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(inode, file); +#endif + return 0; +out_error: + p9_client_clunk(file->private_data); + file->private_data = NULL; + return err; +} + +/** + * v9fs_file_lock - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + * Bugs: this looks like a local only lock, we should extend into 9P + * by using open exclusive + */ + +static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + int res = 0; + struct inode *inode = filp->f_path.dentry->d_inode; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + return res; +} + +static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct p9_flock flock; + struct p9_fid *fid; + uint8_t status; + int res = 0; + unsigned char fl_type; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + if ((fl->fl_flags & FL_POSIX) != FL_POSIX) + BUG(); + + res = posix_lock_file_wait(filp, fl); + if (res < 0) + goto out; + + /* convert posix lock to p9 tlock args */ + memset(&flock, 0, sizeof(flock)); + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } + flock.start = fl->fl_start; + if (fl->fl_end == OFFSET_MAX) + flock.length = 0; + else + flock.length = fl->fl_end - fl->fl_start + 1; + flock.proc_id = fl->fl_pid; + flock.client_id = utsname()->nodename; + if (IS_SETLKW(cmd)) + flock.flags = P9_LOCK_FLAGS_BLOCK; + + /* + * if its a blocked request and we get P9_LOCK_BLOCKED as the status + * for lock request, keep on trying + */ + for (;;) { + res = p9_client_lock_dotl(fid, &flock, &status); + if (res < 0) + break; + + if (status != P9_LOCK_BLOCKED) + break; + if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) + break; + schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + } + + /* map 9p status to VFS status */ + switch (status) { + case P9_LOCK_SUCCESS: + res = 0; + break; + case P9_LOCK_BLOCKED: + res = -EAGAIN; + break; + case P9_LOCK_ERROR: + case P9_LOCK_GRACE: + res = -ENOLCK; + break; + default: + BUG(); + } + + /* + * incase server returned error for lock request, revert + * it locally + */ + if (res < 0 && fl->fl_type != F_UNLCK) { + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + res = posix_lock_file_wait(filp, fl); + fl->fl_type = fl_type; + } +out: + return res; +} + +static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) +{ + struct p9_getlock glock; + struct p9_fid *fid; + int res = 0; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + posix_test_lock(filp, fl); + /* + * if we have a conflicting lock locally, no need to validate + * with server + */ + if (fl->fl_type != F_UNLCK) + return res; + + /* convert posix lock to p9 tgetlock args */ + memset(&glock, 0, sizeof(glock)); + glock.type = P9_LOCK_TYPE_UNLCK; + glock.start = fl->fl_start; + if (fl->fl_end == 
OFFSET_MAX) + glock.length = 0; + else + glock.length = fl->fl_end - fl->fl_start + 1; + glock.proc_id = fl->fl_pid; + glock.client_id = utsname()->nodename; + + res = p9_client_getlock_dotl(fid, &glock); + if (res < 0) + return res; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { + fl->fl_start = glock.start; + if (glock.length == 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = glock.start + glock.length - 1; + fl->fl_pid = glock.proc_id; + } + return res; +} + +/** + * v9fs_file_lock_dotl - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = -ENOLCK; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, + cmd, fl, filp->f_path.dentry->d_name.name); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else if (IS_GETLK(cmd)) + ret = v9fs_file_getlock(filp, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_file_flock_dotl - lock a file + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_flock_dotl(struct file *filp, int cmd, + struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = -ENOLCK; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, + cmd, fl, filp->f_path.dentry->d_name.name); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (!(fl->fl_flags & FL_FLOCK)) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + /* Convert flock to posix lock */ + fl->fl_owner = (fl_owner_t)filp; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_flags |= FL_POSIX; + fl->fl_flags ^= FL_FLOCK; + + if (IS_SETLK(cmd) | IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_fid_readn - read from a fid + * @fid: fid to read + * @data: data buffer to read data into + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +ssize_t +v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, + u64 offset) +{ + int n, total, size; + + P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, + (long long unsigned) offset, count); + n = 0; + total = 0; + size = fid->iounit ? 
fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + do { + n = p9_client_read(fid, data, udata, offset, count); + if (n <= 0) + break; + + if (data) + data += n; + if (udata) + udata += n; + + offset += n; + count -= n; + total += n; + } while (count > 0 && n == size); + + if (n < 0) + total = n; + + return total; +} + +/** + * v9fs_file_readn - read from a file + * @filp: file pointer to read + * @data: data buffer to read data into + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +ssize_t +v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, + u64 offset) +{ + return v9fs_fid_readn(filp->private_data, data, udata, count, offset); +} + +/** + * v9fs_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ + +static ssize_t +v9fs_file_read(struct file *filp, char __user *udata, size_t count, + loff_t * offset) +{ + int ret; + struct p9_fid *fid; + size_t size; + + P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); + fid = filp->private_data; + + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + if (count > size) + ret = v9fs_file_readn(filp, NULL, udata, count, *offset); + else + ret = p9_client_read(fid, NULL, udata, *offset, count); + + if (ret > 0) + *offset += ret; + + return ret; +} + +ssize_t +v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, + const char __user *data, size_t count, + loff_t *offset, int invalidate) +{ + int n; + loff_t i_size; + size_t total = 0; + struct p9_client *clnt; + loff_t origin = *offset; + unsigned long pg_start, pg_end; + + P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, + (int)count, (int)*offset); + + clnt = fid->clnt; + do { + n = p9_client_write(fid, NULL, data+total, origin+total, count); + if (n <= 0) + break; + count -= n; + total += n; + } while (count > 0); + + if (invalidate && (total > 0)) { + pg_start = origin >> PAGE_CACHE_SHIFT; + pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + *offset += total; + i_size = i_size_read(inode); + if (*offset > i_size) { + inode_add_bytes(inode, *offset - i_size); + i_size_write(inode, *offset); + } + } + if (n < 0) + return n; + + return total; +} + +/** + * v9fs_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + ssize_t retval = 0; + loff_t origin = *offset; + + + retval = generic_write_checks(filp, &origin, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode, + filp->private_data, + data, count, &origin, 1); + /* update offset on successful write */ + if (retval > 0) + *offset = origin; +out: + return retval; +} + + +static int v9fs_file_fsync(struct file *filp, int datasync) +{ + struct p9_fid *fid; + struct p9_wstat wstat; + int retval; + + P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + + fid = filp->private_data; + v9fs_blank_wstat(&wstat); + + retval = p9_client_wstat(fid, 
&wstat); + return retval; +} + +int v9fs_file_fsync_dotl(struct file *filp, int datasync) +{ + struct p9_fid *fid; + int retval; + + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", + filp, datasync); + + fid = filp->private_data; + + retval = p9_client_fsync(fid, datasync); + return retval; +} + +static int +v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int retval; + + retval = generic_file_mmap(file, vma); + if (!retval) + vma->vm_ops = &v9fs_file_vm_ops; + + return retval; +} + +static int +v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct v9fs_inode *v9inode; + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct inode *inode = filp->f_path.dentry->d_inode; + + + P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); + + v9inode = V9FS_I(inode); + /* make sure the cache has finished storing the page */ + v9fs_fscache_wait_on_page_write(inode, page); + BUG_ON(!v9inode->writeback_fid); + lock_page(page); + if (page->mapping != inode->i_mapping) + goto out_unlock; + + return VM_FAULT_LOCKED; +out_unlock: + unlock_page(page); + return VM_FAULT_NOPAGE; +} + +static ssize_t +v9fs_direct_read(struct file *filp, char __user *udata, size_t count, + loff_t *offsetp) +{ + loff_t size, offset; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + size = i_size_read(inode); + if (offset < size) + filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + + return v9fs_file_read(filp, udata, count, offsetp); +} + +/** + * v9fs_cached_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + if (filp->f_flags & O_DIRECT) + return v9fs_direct_read(filp, data, count, offset); + return do_sync_read(filp, data, count, offset); +} + +static ssize_t +v9fs_direct_write(struct file *filp, const char __user * data, + size_t count, loff_t *offsetp) +{ + loff_t offset; + ssize_t retval; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + if (retval) + goto err_out; + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that if we fail + * here we fall back to buffered write + */ + if (mapping->nrpages) { + pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT; + pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + + retval = invalidate_inode_pages2_range(mapping, + pg_start, pg_end); + /* + * If a page can not be invalidated, fall back + * to buffered write. 
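+ * Only -EBUSY (page still in use) falls back to the buffered + * path; any other error aborts the direct write.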
+ */ + if (retval) { + if (retval == -EBUSY) + goto buff_write; + goto err_out; + } + } + retval = v9fs_file_write(filp, data, count, offsetp); +err_out: + mutex_unlock(&inode->i_mutex); + return retval; + +buff_write: + mutex_unlock(&inode->i_mutex); + return do_sync_write(filp, data, count, offsetp); +} + +/** + * v9fs_cached_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_cached_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + + if (filp->f_flags & O_DIRECT) + return v9fs_direct_write(filp, data, count, offset); + return do_sync_write(filp, data, count, offset); +} + +static const struct vm_operations_struct v9fs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, +}; + + +const struct file_operations v9fs_cached_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_cached_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; + +const struct file_operations v9fs_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c new file mode 100644 index 00000000..c72e20cd --- /dev/null +++ b/fs/9p/vfs_inode.c @@ -0,0 +1,1487 @@ +/* + * linux/fs/9p/vfs_inode.c + * + * This file contains vfs inode ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static const struct inode_operations v9fs_dir_inode_operations; +static const struct inode_operations v9fs_dir_inode_operations_dotu; +static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations; + +/** + * unixmode2p9mode - convert unix mode bits to plan 9 + * @v9ses: v9fs session information + * @mode: mode to convert + * + */ + +static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) +{ + int res; + res = mode & 0777; + if (S_ISDIR(mode)) + res |= P9_DMDIR; + if (v9fs_proto_dotu(v9ses)) { + if (S_ISLNK(mode)) + res |= P9_DMSYMLINK; + if (v9ses->nodev == 0) { + if (S_ISSOCK(mode)) + res |= P9_DMSOCKET; + if (S_ISFIFO(mode)) + res |= P9_DMNAMEDPIPE; + if (S_ISBLK(mode)) + res |= P9_DMDEVICE; + if (S_ISCHR(mode)) + res |= P9_DMDEVICE; + } + + if ((mode & S_ISUID) == S_ISUID) + res |= P9_DMSETUID; + if ((mode & S_ISGID) == S_ISGID) + res |= P9_DMSETGID; + if ((mode & S_ISVTX) == S_ISVTX) + res |= P9_DMSETVTX; + if ((mode & P9_DMLINK)) + res |= P9_DMLINK; + } + + return res; +} + +/** + * p9mode2unixmode- convert plan9 mode bits to unix mode bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. 
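+ * + * Returns the unix mode bits; for device files *rdev is filled in + * from the "b major minor" / "c major minor" extension string.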
+ * + */ +static int p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + *rdev = 0; + + if ((mode & P9_DMDIR) == P9_DMDIR) + res |= S_IFDIR; + else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses))) + res |= S_IFLNK; + else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) + res |= S_IFSOCK; + else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) + res |= S_IFIFO; + else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strncpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %u %u", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c %s\n", type, + stat->extension); + }; + *rdev = MKDEV(major, minor); + } else + res |= S_IFREG; + + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } + return res; +} + +/** + * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits + * @uflags: flags to convert + * @extended: if .u extensions are active + */ + +int v9fs_uflags2omode(int uflags, int extended) +{ + int ret; + + ret = 0; + switch (uflags&3) { + default: + case O_RDONLY: + ret = P9_OREAD; + break; + + case O_WRONLY: + ret = P9_OWRITE; + break; + + case O_RDWR: + ret = P9_ORDWR; + break; + } + + if (uflags & O_TRUNC) + ret |= P9_OTRUNC; + + if (extended) { + if (uflags & O_EXCL) + ret |= P9_OEXCL; + + if (uflags & O_APPEND) + ret |= P9_OAPPEND; + } + + return ret; +} + +/** + * v9fs_blank_wstat - helper function to setup a 9P stat structure + * @wstat: structure to initialize + * + */ + +void +v9fs_blank_wstat(struct p9_wstat *wstat) +{ + wstat->type = ~0; + wstat->dev = ~0; + wstat->qid.type = ~0; + wstat->qid.version = ~0; + *((long long *)&wstat->qid.path) = ~0; + wstat->mode = ~0; + wstat->atime = ~0; + wstat->mtime = ~0; + wstat->length = ~0; + wstat->name = NULL; + wstat->uid = NULL; + wstat->gid = NULL; + wstat->muid = NULL; + wstat->n_uid = ~0; + wstat->n_gid = ~0; + wstat->n_muid = ~0; + wstat->extension = NULL; +} + +/** + * v9fs_alloc_inode - helper function to allocate an inode + * + */ +struct inode *v9fs_alloc_inode(struct super_block *sb) +{ + struct v9fs_inode *v9inode; + v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache, + GFP_KERNEL); + if (!v9inode) + return NULL; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + spin_lock_init(&v9inode->fscache_lock); +#endif + v9inode->writeback_fid = NULL; + v9inode->cache_validity = 0; + mutex_init(&v9inode->v_mutex); + return &v9inode->vfs_inode; +} + +/** + * v9fs_destroy_inode - destroy an inode + * + */ + +static void v9fs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); +} + +void v9fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, v9fs_i_callback); +} + +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode, dev_t rdev) +{ + int err = 0; + + inode_init_owner(inode, NULL, mode); + inode->i_blocks = 0; + inode->i_rdev = rdev; + inode->i_atime = inode->i_mtime = 
inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else if (v9fs_proto_dotu(v9ses)) { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } else { + P9_DPRINTK(P9_DEBUG_ERROR, + "special files without extended mode\n"); + err = -EINVAL; + goto error; + } + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + if (v9ses->cache) + inode->i_fop = + &v9fs_cached_file_operations_dotl; + else + inode->i_fop = &v9fs_file_operations_dotl; + } else { + inode->i_op = &v9fs_file_inode_operations; + if (v9ses->cache) + inode->i_fop = &v9fs_cached_file_operations; + else + inode->i_fop = &v9fs_file_operations; + } + + break; + case S_IFLNK: + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " + "legacy protocol.\n"); + err = -EINVAL; + goto error; + } + + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_symlink_inode_operations_dotl; + else + inode->i_op = &v9fs_symlink_inode_operations; + + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotl; + else if (v9fs_proto_dotu(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotu; + else + inode->i_op = &v9fs_dir_inode_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_fop = &v9fs_dir_operations_dotl; + else + inode->i_fop = &v9fs_dir_operations; + + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; + } +error: + return err; + +} + +/** + * v9fs_get_inode - helper function to setup an inode + * @sb: superblock + * @mode: mode to setup inode with + * + */ + +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) +{ + int err; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + + inode = new_inode(sb); + if (!inode) { + P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + return ERR_PTR(-ENOMEM); + } + err = v9fs_init_inode(v9ses, inode, mode, rdev); + if (err) { + iput(inode); + return ERR_PTR(err); + } + return inode; +} + +/* +static struct v9fs_fid* +v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) +{ + int err; + int nfid; + struct v9fs_fid *ret; + struct v9fs_fcall *fcall; + + nfid = v9fs_get_idpool(&v9ses->fidpool); + if (nfid < 0) { + eprintk(KERN_WARNING, "no free fids available\n"); + return ERR_PTR(-ENOSPC); + } + + err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, + &fcall); + + if (err < 0) { + if (fcall && fcall->id == RWALK) + goto clunk_fid; + + PRINT_FCALL_ERROR("walk error", fcall); + v9fs_put_idpool(nfid, &v9ses->fidpool); + goto error; + } + + kfree(fcall); + fcall = NULL; + ret = v9fs_fid_create(v9ses, nfid); + if (!ret) { + err = -ENOMEM; + goto clunk_fid; + } + + err = v9fs_fid_insert(ret, dentry); + if (err < 0) { + v9fs_fid_destroy(ret); + goto clunk_fid; + } + + return ret; + +clunk_fid: + v9fs_t_clunk(v9ses, nfid); + +error: + kfree(fcall); + return ERR_PTR(err); +} +*/ + + +/** + * v9fs_clear_inode - release an inode + * @inode: inode to release + * + */ 
+void v9fs_evict_inode(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + truncate_inode_pages(inode->i_mapping, 0); + end_writeback(inode); + filemap_fdatawrite(inode->i_mapping); + +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_put_cookie(inode); +#endif + /* clunk the fid stashed in writeback_fid */ + if (v9inode->writeback_fid) { + p9_client_clunk(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; + } +} + +static int v9fs_test_inode(struct inode *inode, void *data) +{ + int umode; + dev_t rdev; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st, &rdev); + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st, + int new) +{ + dev_t rdev; + int retval, umode; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; + + i_ino = v9fs_qid2ino(qid); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. 
+ */ + inode->i_ino = i_ino; + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); + if (retval) + goto error; + + v9fs_stat2inode(st, inode, sb); +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_get_cookie(inode); +#endif + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st, new); + p9stat_free(st); + kfree(st); + return inode; +} + +/** + * v9fs_remove - helper function to remove files and directories + * @dir: directory inode that is being deleted + * @file: dentry that is being deleted + * @rmdir: removing a directory + * + */ + +static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) +{ + int retval; + struct p9_fid *v9fid; + struct inode *file_inode; + + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, + rmdir); + + file_inode = file->d_inode; + v9fid = v9fs_fid_clone(file); + if (IS_ERR(v9fid)) + return PTR_ERR(v9fid); + + retval = p9_client_remove(v9fid); + if (!retval) { + /* + * directories on unlink should have zero + * link count + */ + if (rmdir) { + clear_nlink(file_inode); + drop_nlink(dir); + } else + drop_nlink(file_inode); + + v9fs_invalidate_inode_attr(file_inode); + v9fs_invalidate_inode_attr(dir); + } + return retval; +} + +/** + * v9fs_create - Create a file + * @v9ses: session information + * @dir: directory that dentry is being created in + * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. 
+ * @perm: create permissions + * @mode: open mode + * + */ +static struct p9_fid * +v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, + struct dentry *dentry, char *extension, u32 perm, u8 mode) +{ + int err; + char *name; + struct p9_fid *dfid, *ofid, *fid; + struct inode *inode; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + + err = 0; + ofid = NULL; + fid = NULL; + name = (char *) dentry->d_name.name; + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return ERR_PTR(err); + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return ERR_PTR(err); + } + + err = p9_client_fcreate(ofid, name, perm, mode, extension); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); + goto error; + } + + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to the dentry */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + return ofid; +error: + if (ofid) + p9_client_clunk(ofid); + + if (fid) + p9_client_clunk(fid); + + return ERR_PTR(err); +} + +/** + * v9fs_vfs_create - VFS hook to create files + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err; + u32 perm; + int flags; + struct file *filp; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; + + err = 0; + fid = NULL; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + v9fs_invalidate_inode_attr(dir); + /* if we are opening a file, assign the open fid to the file */ + if (nd && nd->flags & LOOKUP_OPEN) { + v9inode = V9FS_I(dentry->d_inode); + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
+ */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto error; + } + + filp->private_data = fid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(dentry->d_inode, filp); +#endif + } else + p9_client_clunk(fid); + + return 0; + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + u32 perm; + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode | S_IFDIR); + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_OREAD); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + } else { + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); + } + + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode + * @dir: inode that is being walked from + * @dentry: dentry that is being walked to? + * @nameidata: path data + * + */ + +struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nameidata) +{ + struct dentry *res; + struct super_block *sb; + struct v9fs_session_info *v9ses; + struct p9_fid *dfid, *fid; + struct inode *inode; + char *name; + int result = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + dir, dentry->d_name.name, dentry, nameidata); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + sb = dir->i_sb; + v9ses = v9fs_inode2v9ses(dir); + /* We can walk d_parent because we hold the dir->i_mutex */ + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) + return ERR_CAST(dfid); + + name = (char *) dentry->d_name.name; + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + result = PTR_ERR(fid); + if (result == -ENOENT) { + inode = NULL; + goto inst_out; + } + + return ERR_PTR(result); + } + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + result = PTR_ERR(inode); + inode = NULL; + goto error; + } + result = v9fs_fid_add(dentry, fid); + if (result < 0) + goto error_iput; +inst_out: + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. 
+ */ + res = d_materialise_unique(dentry, inode); + if (!IS_ERR(res)) + return res; + result = PTR_ERR(res); +error_iput: + iput(inode); +error: + p9_client_clunk(fid); + + return ERR_PTR(result); +} + +/** + * v9fs_vfs_unlink - VFS unlink hook to delete an inode + * @i: inode that is being unlinked + * @d: dentry that is being unlinked + * + */ + +int v9fs_vfs_unlink(struct inode *i, struct dentry *d) +{ + return v9fs_remove(i, d, 0); +} + +/** + * v9fs_vfs_rmdir - VFS unlink hook to delete a directory + * @i: inode that is being unlinked + * @d: dentry that is being unlinked + * + */ + +int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) +{ + return v9fs_remove(i, d, 1); +} + +/** + * v9fs_vfs_rename - VFS hook to rename an inode + * @old_dir: old dir inode + * @old_dentry: old dentry + * @new_dir: new dir inode + * @new_dentry: new dentry + * + */ + +int +v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + struct inode *old_inode; + struct inode *new_inode; + struct v9fs_session_info *v9ses; + struct p9_fid *oldfid; + struct p9_fid *olddirfid; + struct p9_fid *newdirfid; + struct p9_wstat wstat; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + retval = 0; + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + v9ses = v9fs_inode2v9ses(old_inode); + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + olddirfid = v9fs_fid_clone(old_dentry->d_parent); + if (IS_ERR(olddirfid)) { + retval = PTR_ERR(olddirfid); + goto done; + } + + newdirfid = v9fs_fid_clone(new_dentry->d_parent); + if (IS_ERR(newdirfid)) { + retval = PTR_ERR(newdirfid); + goto clunk_olddir; + } + + down_write(&v9ses->rename_sem); + if (v9fs_proto_dotl(v9ses)) { + retval = p9_client_rename(oldfid, newdirfid, + (char *) new_dentry->d_name.name); + if (retval != -ENOSYS) + goto clunk_newdir; + } + if (old_dentry->d_parent != new_dentry->d_parent) { + /* + * 9P .u can only handle file rename in the same directory + */ + + P9_DPRINTK(P9_DEBUG_ERROR, + "old dir and new dir are different\n"); + retval = -EXDEV; + goto clunk_newdir; + } + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->uname; + wstat.name = (char *) new_dentry->d_name.name; + retval = p9_client_wstat(oldfid, &wstat); + +clunk_newdir: + if (!retval) { + if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + /* + * Work around vfs rename rehash bug with + * FS_RENAME_DOES_D_MOVE + */ + v9fs_invalidate_inode_attr(new_inode); + } + if (S_ISDIR(old_inode->i_mode)) { + if (!new_inode) + inc_nlink(new_dir); + drop_nlink(old_dir); + } + v9fs_invalidate_inode_attr(old_inode); + v9fs_invalidate_inode_attr(old_dir); + v9fs_invalidate_inode_attr(new_dir); + + /* successful rename */ + d_move(old_dentry, new_dentry); + } + up_write(&v9ses->rename_sem); + p9_client_clunk(newdirfid); + +clunk_olddir: + p9_client_clunk(olddirfid); + +done: + return retval; +} + +/** + * v9fs_vfs_getattr - retrieve file metadata + * @mnt: mount information + * @dentry: file to get attributes on + * @stat: metadata structure to populate + * + */ + +static int +v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + 
generic_fillattr(dentry->d_inode, stat); + return 0; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); + generic_fillattr(dentry->d_inode, stat); + + p9stat_free(st); + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat wstat; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if(IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_blank_wstat(&wstat); + if (iattr->ia_valid & ATTR_MODE) + wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode); + + if (iattr->ia_valid & ATTR_MTIME) + wstat.mtime = iattr->ia_mtime.tv_sec; + + if (iattr->ia_valid & ATTR_ATIME) + wstat.atime = iattr->ia_atime.tv_sec; + + if (iattr->ia_valid & ATTR_SIZE) + wstat.length = iattr->ia_size; + + if (v9fs_proto_dotu(v9ses)) { + if (iattr->ia_valid & ATTR_UID) + wstat.n_uid = iattr->ia_uid; + + if (iattr->ia_valid & ATTR_GID) + wstat.n_gid = iattr->ia_gid; + } + + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_wstat(fid, &wstat); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(dentry->d_inode)) + truncate_setsize(dentry->d_inode, iattr->ia_size); + + v9fs_invalidate_inode_attr(dentry->d_inode); + + setattr_copy(dentry->d_inode, iattr); + mark_inode_dirty(dentry->d_inode); + return 0; +} + +/** + * v9fs_stat2inode - populate an inode structure with mistat info + * @stat: Plan 9 metadata (mistat) structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb) +{ + mode_t mode; + char ext[32]; + char tag_name[14]; + unsigned int i_nlink; + struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_inode *v9inode = V9FS_I(inode); + + inode->i_nlink = 1; + + inode->i_atime.tv_sec = stat->atime; + inode->i_mtime.tv_sec = stat->mtime; + inode->i_ctime.tv_sec = stat->mtime; + + inode->i_uid = v9ses->dfltuid; + inode->i_gid = v9ses->dfltgid; + + if (v9fs_proto_dotu(v9ses)) { + inode->i_uid = stat->n_uid; + inode->i_gid = stat->n_gid; + } + if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { + if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + /* + * Hadlink support got added later to + * to the .u extension. So there can be + * server out there that doesn't support + * this even with .u extension. So check + * for non NULL stat->extension + */ + strncpy(ext, stat->extension, sizeof(ext)); + /* HARDLINKCOUNT %u */ + sscanf(ext, "%13s %u", tag_name, &i_nlink); + if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + inode->i_nlink = i_nlink; + } + } + mode = stat->mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + i_size_write(inode, stat->length); + + /* not real number of blocks, but 512 byte ones ... 
*/ + inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; +} + +/** + * v9fs_qid2ino - convert qid into inode number + * @qid: qid to hash + * + * BUG: potential for inode number collisions? + */ + +ino_t v9fs_qid2ino(struct p9_qid *qid) +{ + u64 path = qid->path + 2; + ino_t i = 0; + + if (sizeof(ino_t) == sizeof(path)) + memcpy(&i, &path, sizeof(ino_t)); + else + i = (ino_t) (path ^ (path >> 32)); + + return i; +} + +/** + * v9fs_readlink - read a symlink's location (internal version) + * @dentry: dentry for symlink + * @buffer: buffer to load symlink location into + * @buflen: length of buffer + * + */ + +static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + int retval; + + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat *st; + + P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + if (!v9fs_proto_dotu(v9ses)) + return -EBADF; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + if (!(st->mode & P9_DMSYMLINK)) { + retval = -EINVAL; + goto done; + } + + /* copy extension buffer into buffer */ + strncpy(buffer, st->extension, buflen); + + P9_DPRINTK(P9_DEBUG_VFS, + "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); + + retval = strnlen(buffer, buflen); +done: + p9stat_free(st); + kfree(st); + return retval; +} + +/** + * v9fs_vfs_follow_link - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int len = 0; + char *link = __getname(); + + P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + + if (!link) + link = ERR_PTR(-ENOMEM); + else { + len = v9fs_readlink(dentry, link, PATH_MAX); + + if (len < 0) { + __putname(link); + link = ERR_PTR(len); + } else + link[min(len, PATH_MAX-1)] = 0; + } + nd_set_link(nd, link); + + return NULL; +} + +/** + * v9fs_vfs_put_link - release a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * @p: unused + * + */ + +void +v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + + P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, + IS_ERR(s) ? 
"" : s); + if (!IS_ERR(s)) + __putname(s); +} + +/** + * v9fs_vfs_mkspecial - create a special file + * @dir: inode to create special file in + * @dentry: dentry to create + * @mode: mode to create special file + * @extension: 9p2000.u format extension string representing special file + * + */ + +static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, + int mode, const char *extension) +{ + u32 perm; + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(dir); + if (!v9fs_proto_dotu(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); + return -EPERM; + } + + perm = unixmode2p9mode(v9ses, mode); + fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, + P9_OREAD); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_invalidate_inode_attr(dir); + p9_client_clunk(fid); + return 0; +} + +/** + * v9fs_vfs_symlink - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See Also: 9P2000.u RFC for more information + * + */ + +static int +v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, + dentry->d_name.name, symname); + + return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); +} + +/** + * v9fs_vfs_link - create a hardlink + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + char *name; + struct p9_fid *oldfid; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + old_dentry->d_name.name); + + oldfid = v9fs_fid_clone(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = __getname(); + if (unlikely(!name)) { + retval = -ENOMEM; + goto clunk_fid; + } + + sprintf(name, "%d\n", oldfid->fid); + retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); + __putname(name); + if (!retval) { + v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_invalidate_inode_attr(dir); + } +clunk_fid: + p9_client_clunk(oldfid); + return retval; +} + +/** + * v9fs_vfs_mknod - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ + +static int +v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + int retval; + char *name; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + name = __getname(); + if (!name) + return -ENOMEM; + /* build extension */ + if (S_ISBLK(mode)) + sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); + else if (S_ISCHR(mode)) + sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); + else if (S_ISFIFO(mode)) + *name = 0; + else if (S_ISSOCK(mode)) + *name = 0; + else { + __putname(name); + return -EINVAL; + } + + retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + __putname(name); + + return retval; +} + +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) +{ + int umode; + dev_t rdev; + loff_t i_size; + struct p9_wstat *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + 
umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode(st, inode, inode->i_sb); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); +out: + p9stat_free(st); + kfree(st); + return 0; +} + +static const struct inode_operations v9fs_dir_inode_operations_dotu = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .symlink = v9fs_vfs_symlink, + .link = v9fs_vfs_link, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_dir_inode_operations = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_file_inode_operations = { + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c new file mode 100644 index 00000000..c873172a --- /dev/null +++ b/fs/9p/vfs_inode_dotl.c @@ -0,0 +1,1003 @@ +/* + * linux/fs/9p/vfs_inode_dotl.c + * + * This file contains vfs inode ops for the 9P2000.L protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev); + +/** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. 
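+ * @inode: directory inode; a directory is expected to carry exactly one
+ * dentry alias, and that alias is the dentry returned here.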
+ * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + /* Directory should have only one entry. */ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&inode->i_lock); + return dentry; +} + +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + return 0; + + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st, + int new) +{ + int retval; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; + + i_ino = v9fs_qid2ino(qid); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. 
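+ * (v9fs_test_inode_dotl above refuses to match an existing inode whose
+ * file type, qid or i_generation differ, so a qid that the server has
+ * recycled should come through here as a freshly allocated inode.)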
+ */ + inode->i_ino = i_ino; + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); + if (retval) + goto error; + + v9fs_stat2inode_dotl(st, inode); +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_get_cookie(inode); +#endif + retval = v9fs_get_acl(inode, fid); + if (retval) + goto error; + + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_stat_dotl *st; + struct inode *inode = NULL; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); + kfree(st); + return inode; +} + +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_TRUNC, P9_DOTL_TRUNC }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. + * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + +/** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, + struct nameidata *nd) +{ + int err = 0; + gid_t gid; + int flags; + mode_t mode; + char *name = NULL; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *fid = NULL; + struct v9fs_inode *v9inode; + struct p9_fid *dfid, *ofid, *inode_fid; + struct v9fs_session_info *v9ses; + struct posix_acl *pacl = NULL, *dacl = NULL; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else { + /* + * create call without LOOKUP_OPEN is due + * to mknod of regular files. So use mknod + * operation. 
+ */ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + } + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, omode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in creat %d\n", err); + goto error; + } + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + v9fs_invalidate_inode_attr(dir); + + /* instantiate inode and assign the unopened fid to the dentry */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, &dacl, &pacl); + + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
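+ * The fid is stashed on the v9fs inode rather than on the struct file,
+ * so dirty pages can still be flushed to the server after the last open
+ * file for the path has been closed.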
+ */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto err_clunk_old_fid; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + /* Since we are opening a file, assign the open fid to the file */ + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto err_clunk_old_fid; + } + filp->private_data = ofid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(inode, filp); +#endif + return 0; + +error: + if (fid) + p9_client_clunk(fid); +err_clunk_old_fid: + if (ofid) + p9_client_clunk(ofid); + v9fs_set_create_acl(NULL, &dacl, &pacl); + return err; +} + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, + struct dentry *dentry, int omode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + mode_t mode; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + omode |= S_IFDIR; + if (dir->i_mode & S_ISGID) + omode |= S_ISGID; + + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mkdir %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. 
We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode, 0); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, &dacl, &pacl); + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); +error: + if (fid) + p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); + return err; +} + +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(dentry->d_inode, stat); + return 0; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(dentry->d_inode)) + truncate_setsize(dentry->d_inode, iattr->ia_size); + + v9fs_invalidate_inode_attr(dentry->d_inode); + setattr_copy(dentry->d_inode, iattr); + mark_inode_dirty(dentry->d_inode); + if (iattr->ia_valid & ATTR_MODE) { + /* We also want to update ACL when we update mode bits */ + retval = v9fs_acl_chmod(dentry); + if (retval < 0) + return retval; + } + return 0; +} + +/** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + mode_t mode; + struct v9fs_inode *v9inode = V9FS_I(inode); + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + 
inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; +} + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int err; + gid_t gid; + char *name; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct v9fs_session_info *v9ses; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + v9fs_invalidate_inode_attr(dir); + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + fid = NULL; + } else { + /* Not in cached mode. 
No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + char *name; + struct dentry *dir_dentry; + struct p9_fid *dfid, *oldfid; + struct v9fs_session_info *v9ses; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + v9fs_invalidate_inode_attr(dir); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); + } + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev) +{ + int err; + gid_t gid; + char *name; + mode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mknod %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + v9fs_invalidate_inode_attr(dir); + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + 
P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. + * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode, rdev); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, &dacl, &pacl); +error: + if (fid) + p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); + return err; +} + +/** + * v9fs_vfs_follow_link_dotl - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +{ + int retval; + struct p9_fid *fid; + char *link = __getname(); + char *target; + + P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + + if (!link) { + link = ERR_PTR(-ENOMEM); + goto ndset; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + __putname(link); + link = ERR_CAST(fid); + goto ndset; + } + retval = p9_client_readlink(fid, &target); + if (!retval) { + strcpy(link, target); + kfree(target); + goto ndset; + } + __putname(link); + link = ERR_PTR(retval); +ndset: + nd_set_link(nd, link); + return NULL; +} + +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_stat_dotl *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode_dotl(st, inode); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); +out: + kfree(st); + return 0; +} + +const struct inode_operations v9fs_dir_inode_operations_dotl = { + .create = v9fs_vfs_create_dotl, + .lookup = v9fs_vfs_lookup, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir_dotl, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod_dotl, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link_dotl, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c new file mode 100644 index 00000000..c70251d4 --- /dev/null +++ b/fs/9p/vfs_super.c @@ 
-0,0 +1,368 @@ +/* + * linux/fs/9p/vfs_super.c + * + * This file contians superblock ops for 9P2000. It is intended that + * you mount this file system on directories. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "xattr.h" +#include "acl.h" + +static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; + +/** + * v9fs_set_super - set the superblock + * @s: super block + * @data: file system specific data + * + */ + +static int v9fs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return set_anon_super(s, data); +} + +/** + * v9fs_fill_super - populate superblock with info + * @sb: superblock + * @v9ses: session information + * @flags: flags propagated from v9fs_mount() + * + */ + +static void +v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, + int flags, void *data) +{ + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize_bits = fls(v9ses->maxdata - 1); + sb->s_blocksize = 1 << sb->s_blocksize_bits; + sb->s_magic = V9FS_MAGIC; + if (v9fs_proto_dotl(v9ses)) { + sb->s_op = &v9fs_super_ops_dotl; + sb->s_xattr = v9fs_xattr_handlers; + } else + sb->s_op = &v9fs_super_ops; + sb->s_bdi = &v9ses->bdi; + if (v9ses->cache) + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; + + sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + if (!v9ses->cache) + sb->s_flags |= MS_SYNCHRONOUS; + +#ifdef CONFIG_9P_FS_POSIX_ACL + if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) + sb->s_flags |= MS_POSIXACL; +#endif + + save_mount_options(sb, data); +} + +/** + * v9fs_mount - mount a superblock + * @fs_type: file system type + * @flags: mount flags + * @dev_name: device name that was mounted + * @data: mount options + * + */ + +static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct super_block *sb = NULL; + struct inode *inode = NULL; + struct dentry *root = NULL; + struct v9fs_session_info *v9ses = NULL; + int mode = S_IRWXUGO | S_ISVTX; + struct p9_fid *fid; + int retval = 0; + + P9_DPRINTK(P9_DEBUG_VFS, " \n"); + + v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); + if (!v9ses) + return ERR_PTR(-ENOMEM); + + fid = v9fs_session_init(v9ses, dev_name, data); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + /* + * we need to call session_close to tear down some + * of the data structure setup by session_init + */ + goto close_session; + } + + sb = sget(fs_type, NULL, v9fs_set_super, v9ses); + if (IS_ERR(sb)) { + retval = PTR_ERR(sb); + goto clunk_fid; + } + v9fs_fill_super(sb, v9ses, flags, 
data); + + if (v9ses->cache) + sb->s_d_op = &v9fs_cached_dentry_operations; + else + sb->s_d_op = &v9fs_dentry_operations; + + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); + if (IS_ERR(inode)) { + retval = PTR_ERR(inode); + goto release_sb; + } + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + retval = -ENOMEM; + goto release_sb; + } + sb->s_root = root; + if (v9fs_proto_dotl(v9ses)) { + struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto release_sb; + } + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode_dotl(st, root->d_inode); + kfree(st); + } else { + struct p9_wstat *st = NULL; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto release_sb; + } + + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + + p9stat_free(st); + kfree(st); + } + retval = v9fs_get_acl(inode, fid); + if (retval) + goto release_sb; + v9fs_fid_add(root, fid); + + P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + return dget(sb->s_root); + +clunk_fid: + p9_client_clunk(fid); +close_session: + v9fs_session_close(v9ses); + kfree(v9ses); + return ERR_PTR(retval); + +release_sb: + /* + * we will do the session_close and root dentry release + * in the below call. But we need to clunk fid, because we haven't + * attached the fid to dentry so it won't get clunked + * automatically. + */ + p9_client_clunk(fid); + deactivate_locked_super(sb); + return ERR_PTR(retval); +} + +/** + * v9fs_kill_super - Kill Superblock + * @s: superblock + * + */ + +static void v9fs_kill_super(struct super_block *s) +{ + struct v9fs_session_info *v9ses = s->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); + + kill_anon_super(s); + + v9fs_session_cancel(v9ses); + v9fs_session_close(v9ses); + kfree(v9ses); + s->s_fs_info = NULL; + P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); +} + +static void +v9fs_umount_begin(struct super_block *sb) +{ + struct v9fs_session_info *v9ses; + + v9ses = sb->s_fs_info; + v9fs_session_begin_cancel(v9ses); +} + +static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_rstatfs rs; + int res; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + res = PTR_ERR(fid); + goto done; + } + + v9ses = v9fs_dentry2v9ses(dentry); + if (v9fs_proto_dotl(v9ses)) { + res = p9_client_statfs(fid, &rs); + if (res == 0) { + buf->f_type = V9FS_MAGIC; + buf->f_bsize = rs.bsize; + buf->f_blocks = rs.blocks; + buf->f_bfree = rs.bfree; + buf->f_bavail = rs.bavail; + buf->f_files = rs.files; + buf->f_ffree = rs.ffree; + buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = rs.namelen; + } + if (res != -ENOSYS) + goto done; + } + res = simple_statfs(dentry, buf); +done: + return res; +} + +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * the inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + +static int v9fs_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + int ret; + struct p9_wstat wstat; + struct v9fs_inode *v9inode; + /* + * send an fsync request to server irrespective of + * wbc->sync_mode. 
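+ * Legacy 9P has no TFSYNC, so a completely blank wstat is written
+ * instead: by convention, a Twstat whose fields are all "don't touch"
+ * values asks the server to commit the file to stable storage.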
+ */ + P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + v9inode = V9FS_I(inode); + if (!v9inode->writeback_fid) + return 0; + v9fs_blank_wstat(&wstat); + + ret = p9_client_wstat(v9inode->writeback_fid, &wstat); + if (ret < 0) { + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; + } + return 0; +} + +static int v9fs_write_inode_dotl(struct inode *inode, + struct writeback_control *wbc) +{ + int ret; + struct v9fs_inode *v9inode; + /* + * send an fsync request to server irrespective of + * wbc->sync_mode. + */ + P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + v9inode = V9FS_I(inode); + if (!v9inode->writeback_fid) + return 0; + ret = p9_client_fsync(v9inode->writeback_fid, 0); + if (ret < 0) { + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; + } + return 0; +} + +static const struct super_operations v9fs_super_ops = { + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, + .statfs = simple_statfs, + .evict_inode = v9fs_evict_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, + .write_inode = v9fs_write_inode, +}; + +static const struct super_operations v9fs_super_ops_dotl = { + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, + .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, + .evict_inode = v9fs_evict_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, + .write_inode = v9fs_write_inode_dotl, +}; + +struct file_system_type v9fs_fs_type = { + .name = "9p", + .mount = v9fs_mount, + .kill_sb = v9fs_kill_super, + .owner = THIS_MODULE, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, +}; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c new file mode 100644 index 00000000..d2887738 --- /dev/null +++ b/fs/9p/xattr.c @@ -0,0 +1,172 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "fid.h" +#include "xattr.h" + +ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, + void *buffer, size_t buffer_size) +{ + ssize_t retval; + int msize, read_count; + u64 offset = 0, attr_size; + struct p9_fid *attr_fid; + + attr_fid = p9_client_xattrwalk(fid, name, &attr_size); + if (IS_ERR(attr_fid)) { + retval = PTR_ERR(attr_fid); + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_attrwalk failed %zd\n", retval); + attr_fid = NULL; + goto error; + } + if (!buffer_size) { + /* request to get the attr_size */ + retval = attr_size; + goto error; + } + if (attr_size > buffer_size) { + retval = -ERANGE; + goto error; + } + msize = attr_fid->clnt->msize; + while (attr_size) { + if (attr_size > (msize - P9_IOHDRSZ)) + read_count = msize - P9_IOHDRSZ; + else + read_count = attr_size; + read_count = p9_client_read(attr_fid, ((char *)buffer)+offset, + NULL, offset, read_count); + if (read_count < 0) { + /* error in xattr read */ + retval = read_count; + goto error; + } + offset += read_count; + attr_size -= read_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (attr_fid) + p9_client_clunk(attr_fid); + return retval; + +} + + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + struct p9_fid *fid; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", + __func__, name, buffer_size); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + return v9fs_fid_xattr_get(fid, name, buffer, buffer_size); +} + +/* + * v9fs_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. 
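+ * Purely illustrative example, as the user.* xattr handler would issue
+ * it for a setxattr(2) of "user.mime_type" with a 10-byte value and no
+ * XATTR_CREATE/XATTR_REPLACE flag:
+ *
+ *	v9fs_xattr_set(dentry, "user.mime_type", "text/plain", 10, 0);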
+ */ +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags) +{ + u64 offset = 0; + int retval, msize, write_count; + struct p9_fid *fid = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", + __func__, name, value_len, flags); + + fid = v9fs_fid_clone(dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + goto error; + } + /* + * On success fid points to xattr + */ + retval = p9_client_xattrcreate(fid, name, value_len, flags); + if (retval < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_xattrcreate failed %d\n", retval); + goto error; + } + msize = fid->clnt->msize; + while (value_len) { + if (value_len > (msize - P9_IOHDRSZ)) + write_count = msize - P9_IOHDRSZ; + else + write_count = value_len; + write_count = p9_client_write(fid, ((char *)value)+offset, + NULL, offset, write_count); + if (write_count < 0) { + /* error in xattr write */ + retval = write_count; + goto error; + } + offset += write_count; + value_len -= write_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (fid) + retval = p9_client_clunk(fid); + return retval; +} + +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); +} + +const struct xattr_handler *v9fs_xattr_handlers[] = { + &v9fs_xattr_user_handler, +#ifdef CONFIG_9P_FS_POSIX_ACL + &v9fs_xattr_acl_access_handler, + &v9fs_xattr_acl_default_handler, +#endif + NULL +}; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h new file mode 100644 index 00000000..eaa837c5 --- /dev/null +++ b/fs/9p/xattr.h @@ -0,0 +1,33 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_XATTR_H +#define FS_9P_XATTR_H + +#include +#include +#include + +extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern struct xattr_handler v9fs_xattr_user_handler; +extern const struct xattr_handler v9fs_xattr_acl_access_handler; +extern const struct xattr_handler v9fs_xattr_acl_default_handler; + +extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *, + void *, size_t); +extern ssize_t v9fs_xattr_get(struct dentry *, const char *, + void *, size_t); +extern int v9fs_xattr_set(struct dentry *, const char *, + const void *, size_t, int); +extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c new file mode 100644 index 00000000..d0b701b7 --- /dev/null +++ b/fs/9p/xattr_user.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_user_get, + .set = v9fs_xattr_user_set, +}; diff --git a/fs/Kconfig b/fs/Kconfig new file mode 100644 index 00000000..88701cc0 --- /dev/null +++ b/fs/Kconfig @@ -0,0 +1,277 @@ +# +# File system configuration +# + +menu "File systems" + +if BLOCK + +source "fs/ext2/Kconfig" +source "fs/ext3/Kconfig" +source "fs/ext4/Kconfig" + +config FS_XIP +# execute in place + bool + depends on EXT2_FS_XIP + default y + +source "fs/jbd/Kconfig" +source "fs/jbd2/Kconfig" + +config FS_MBCACHE +# Meta block cache for Extended Attributes (ext2/ext3/ext4) + tristate + default y if EXT2_FS=y && EXT2_FS_XATTR + default y if EXT3_FS=y && EXT3_FS_XATTR + default y if EXT4_FS=y && EXT4_FS_XATTR + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + +source "fs/reiserfs/Kconfig" +source "fs/jfs/Kconfig" + +source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" + +endif # BLOCK + +# Posix ACL utility routines +# +# Note: Posix ACLs can be implemented without these helpers. Never use +# this symbol for ifdefs in core code. +# +config FS_POSIX_ACL + def_bool n + +config EXPORTFS + tristate + +config FILE_LOCKING + bool "Enable POSIX file locking API" if EXPERT + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + +source "fs/notify/Kconfig" + +source "fs/quota/Kconfig" + +source "fs/autofs4/Kconfig" +source "fs/fuse/Kconfig" + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use userspace character device + based on CUSE, answer Y or M. 
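+
+# Illustrative only: with the FUSE stack above built as modules, the
+# resulting .config carries lines such as
+#	CONFIG_FUSE_FS=m
+#	CONFIG_CUSE=m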
+ +config GENERIC_ACL + bool + select FS_POSIX_ACL + +menu "Caches" + +source "fs/fscache/Kconfig" +source "fs/cachefiles/Kconfig" + +endmenu + +if BLOCK +menu "CD-ROM/DVD Filesystems" + +source "fs/isofs/Kconfig" +source "fs/udf/Kconfig" + +endmenu +endif # BLOCK + +if BLOCK +menu "DOS/FAT/NT Filesystems" + +source "fs/fat/Kconfig" +source "fs/ntfs/Kconfig" + +endmenu +endif # BLOCK + +menu "Pseudo filesystems" + +source "fs/proc/Kconfig" +source "fs/sysfs/Kconfig" + +config TMPFS + bool "Virtual memory file system support (former shm fs)" + depends on SHMEM + help + Tmpfs is a file system which keeps all files in virtual memory. + + Everything in tmpfs is temporary in the sense that no files will be + created on your hard drive. The files live in memory and swap + space. If you unmount a tmpfs instance, everything stored therein is + lost. + + See for details. + +config TMPFS_POSIX_ACL + bool "Tmpfs POSIX Access Control Lists" + depends on TMPFS + select TMPFS_XATTR + select GENERIC_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +config TMPFS_XATTR + bool "Tmpfs extended attributes" + depends on TMPFS + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + Currently this enables support for the trusted.* and + security.* namespaces. + + You need this for POSIX ACL support on tmpfs. + + If unsure, say N. + +config HUGETLBFS + bool "HugeTLB file system support" + depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ + SYS_SUPPORTS_HUGETLBFS || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + for details. + + If unsure, say N. + +config HUGETLB_PAGE + def_bool HUGETLBFS + +source "fs/configfs/Kconfig" + +endmenu + +menuconfig MISC_FILESYSTEMS + bool "Miscellaneous filesystems" + default y + ---help--- + Say Y here to get to see options for various miscellaneous + filesystems, such as filesystems that came from other + operating systems. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if MISC_FILESYSTEMS + +source "fs/adfs/Kconfig" +source "fs/affs/Kconfig" +source "fs/ecryptfs/Kconfig" +source "fs/hfs/Kconfig" +source "fs/hfsplus/Kconfig" +source "fs/befs/Kconfig" +source "fs/bfs/Kconfig" +source "fs/efs/Kconfig" + +# Patched by YAFFS +source "fs/yaffs2/Kconfig" + +source "fs/jffs2/Kconfig" +# UBIFS File system configuration +source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" +source "fs/cramfs/Kconfig" +source "fs/squashfs/Kconfig" +source "fs/freevxfs/Kconfig" +source "fs/minix/Kconfig" +source "fs/omfs/Kconfig" +source "fs/hpfs/Kconfig" +source "fs/qnx4/Kconfig" +source "fs/romfs/Kconfig" +source "fs/pstore/Kconfig" +source "fs/sysv/Kconfig" +source "fs/ufs/Kconfig" +source "fs/exofs/Kconfig" + +endif # MISC_FILESYSTEMS + +menuconfig NETWORK_FILESYSTEMS + bool "Network File Systems" + default y + depends on NET + ---help--- + Say Y here to get to see options for network filesystems and + filesystem-related networking code, such as NFS daemon and + RPCSEC security modules. + + This option alone does not add any kernel code. 
+ + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if NETWORK_FILESYSTEMS + +source "fs/nfs/Kconfig" +source "fs/nfsd/Kconfig" + +config LOCKD + tristate + depends on FILE_LOCKING + +config LOCKD_V4 + bool + depends on NFSD_V3 || NFS_V3 + depends on FILE_LOCKING + default y + +config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL + +config NFS_COMMON + bool + depends on NFSD || NFS_FS + default y + +source "net/sunrpc/Kconfig" +source "fs/ceph/Kconfig" +source "fs/cifs/Kconfig" +source "fs/ncpfs/Kconfig" +source "fs/coda/Kconfig" +source "fs/afs/Kconfig" +source "fs/9p/Kconfig" + +endif # NETWORK_FILESYSTEMS + +if BLOCK +menu "Partition Types" + +source "fs/partitions/Kconfig" + +endmenu +endif + +source "fs/nls/Kconfig" +source "fs/dlm/Kconfig" + +endmenu diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt new file mode 100644 index 00000000..79e2ca79 --- /dev/null +++ b/fs/Kconfig.binfmt @@ -0,0 +1,163 @@ +config BINFMT_ELF + bool "Kernel support for ELF binaries" + depends on MMU && (BROKEN || !FRV) + default y + ---help--- + ELF (Executable and Linkable Format) is a format for libraries and + executables used across different architectures and operating + systems. Saying Y here will enable your kernel to run ELF binaries + and enlarge it by about 13 KB. ELF support under Linux has now all + but replaced the traditional Linux a.out formats (QMAGIC and ZMAGIC) + because it is portable (this does *not* mean that you will be able + to run executables from different architectures or operating systems + however) and makes building run-time libraries very easy. Many new + executables are distributed solely in ELF format. You definitely + want to say Y here. + + Information about ELF is contained in the ELF HOWTO available from + . + + If you find that after upgrading from Linux kernel 1.2 and saying Y + here, you still can't run any ELF binaries (they just crash), then + you'll have to install the newest ELF runtime libraries, including + ld.so (check the file for location and + latest version). + +config COMPAT_BINFMT_ELF + bool + depends on COMPAT && BINFMT_ELF + +config BINFMT_ELF_FDPIC + bool "Kernel support for FDPIC ELF binaries" + default y + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU)) + help + ELF FDPIC binaries are based on ELF, but allow the individual load + segments of a binary to be located in memory independently of each + other. This makes this format ideal for use in environments where no + MMU is available as it still permits text segments to be shared, + even if data segments are not. + + It is also possible to run FDPIC ELF binaries on MMU linux also. + +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default y + depends on BINFMT_ELF && ELF_CORE + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. + + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. 
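+
+	  For example (an illustrative recipe mirroring the one in that
+	  document), a shell can opt a child into dumping anonymous memory
+	  and file-backed private mappings with:
+
+	    echo 0x7 > /proc/self/coredump_filter
+	    ./some_program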
+ + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say Y. + +config BINFMT_FLAT + bool "Kernel support for flat binaries" + depends on !MMU && (!FRV || BROKEN) + help + Support uClinux FLAT format binaries. + +config BINFMT_ZFLAT + bool "Enable ZFLAT support" + depends on BINFMT_FLAT + select ZLIB_INFLATE + help + Support FLAT format compressed binaries + +config BINFMT_SHARED_FLAT + bool "Enable shared FLAT support" + depends on BINFMT_FLAT + help + Support FLAT shared libraries + +config HAVE_AOUT + def_bool n + +config BINFMT_AOUT + tristate "Kernel support for a.out and ECOFF binaries" + depends on HAVE_AOUT + ---help--- + A.out (Assembler.OUTput) is a set of formats for libraries and + executables used in the earliest versions of UNIX. Linux used + the a.out formats QMAGIC and ZMAGIC until they were replaced + with the ELF format. + + The conversion to ELF started in 1995. This option is primarily + provided for historical interest and for the benefit of those + who need to run binaries from that era. + + Most people should answer N here. If you think you may have + occasional use for this format, enable module support above + and answer M here to compile this support as a module called + binfmt_aout. + + If any crucial components of your system (such as /sbin/init + or /lib/ld.so) are still in a.out format, you will have to + say Y here. + +config OSF4_COMPAT + bool "OSF/1 v4 readv/writev compatibility" + depends on ALPHA && BINFMT_AOUT + help + Say Y if you are using OSF/1 binaries (like Netscape and Acrobat) + with v4 shared libraries freely available from Compaq. If you're + going to use shared libraries from Tru64 version 5.0 or later, say N. + +config BINFMT_EM86 + tristate "Kernel support for Linux/Intel ELF binaries" + depends on ALPHA + ---help--- + Say Y here if you want to be able to execute Linux/Intel ELF + binaries just like native Alpha binaries on your Alpha machine. For + this to work, you need to have the emulator /usr/bin/em86 in place. + + You can get the same functionality by saying N here and saying Y to + "Kernel support for MISC binaries". + + You may answer M to compile the emulation support as a module and + later load the module when you want to use a Linux/Intel binary. The + module will be called binfmt_em86. If unsure, say Y. + +config BINFMT_SOM + tristate "Kernel support for SOM binaries" + depends on PARISC && HPUX + help + SOM is a binary executable format inherited from HP/UX. Say + Y here to be able to load and execute SOM binaries directly. + +config BINFMT_MISC + tristate "Kernel support for MISC binaries" + ---help--- + If you say Y here, it will be possible to plug wrapper-driven binary + formats into the kernel. You will like this especially when you use + programs that need an interpreter to run like Java, Python, .NET or + Emacs-Lisp. It's also useful if you often run DOS executables under + the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, available from + ). Once you have + registered such a binary class with the kernel, you can start one of + those programs simply by typing in its name at a shell prompt; Linux + will automatically feed it to the correct interpreter. + + You can do other nice things, too. Read the file + to learn how to use this + feature, for information about how + to include Java support. and for + information about how to include Mono-based .NET support. 
+ + To use binfmt_misc, you will need to mount it: + mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc + + You may say M here for module support and later load the module when + you have use for it; the module is called binfmt_misc. If you + don't know what to answer at this point, say Y. diff --git a/fs/Makefile b/fs/Makefile new file mode 100644 index 00000000..2999b4d4 --- /dev/null +++ b/fs/Makefile @@ -0,0 +1,129 @@ +# +# Makefile for the Linux filesystems. +# +# 14 Sep 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# + +obj-y := open.o read_write.o file_table.o super.o \ + char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ + seq_file.o xattr.o libfs.o fs-writeback.o \ + pnode.o drop_caches.o splice.o sync.o utimes.o \ + stack.o fs_struct.o statfs.o + +ifeq ($(CONFIG_BLOCK),y) +obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +else +obj-y += no-block.o +endif + +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o +obj-y += notify/ +obj-$(CONFIG_EPOLL) += eventpoll.o +obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_TIMERFD) += timerfd.o +obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FILE_LOCKING) += locks.o +obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o +obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o +obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o +obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o + +# binfmt_script is always there +obj-y += binfmt_script.o + +obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o +obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o +obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +obj-$(CONFIG_FS_MBCACHE) += mbcache.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_NFS_COMMON) += nfs_common/ +obj-$(CONFIG_GENERIC_ACL) += generic_acl.o + +obj-$(CONFIG_FHANDLE) += fhandle.o + +obj-y += quota/ + +obj-$(CONFIG_PROC_FS) += proc/ +obj-y += partitions/ +obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ +obj-y += devpts/ + +obj-$(CONFIG_PROFILING) += dcookies.o +obj-$(CONFIG_DLM) += dlm/ + +# Do not add any filesystems before this line +obj-$(CONFIG_FSCACHE) += fscache/ +obj-$(CONFIG_REISERFS_FS) += reiserfs/ +obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 +obj-$(CONFIG_EXT2_FS) += ext2/ +# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +# unless explicitly requested by rootfstype +obj-$(CONFIG_EXT4_FS) += ext4/ +obj-$(CONFIG_JBD) += jbd/ +obj-$(CONFIG_JBD2) += jbd2/ +obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_SQUASHFS) += squashfs/ +obj-y += ramfs/ +obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ +obj-$(CONFIG_CODA_FS) += coda/ +obj-$(CONFIG_MINIX_FS) += minix/ +obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_BFS_FS) += bfs/ +obj-$(CONFIG_ISO9660_FS) += isofs/ +obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ +obj-$(CONFIG_HFS_FS) += hfs/ +obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ +obj-$(CONFIG_VXFS_FS) += freevxfs/ +obj-$(CONFIG_NFS_FS) += nfs/ +obj-$(CONFIG_EXPORTFS) += exportfs/ +obj-$(CONFIG_NFSD) += nfsd/ +obj-$(CONFIG_LOCKD) += lockd/ +obj-$(CONFIG_NLS) += nls/ +obj-$(CONFIG_SYSV_FS) += sysv/ +obj-$(CONFIG_CIFS) += cifs/ +obj-$(CONFIG_NCP_FS) += ncpfs/ +obj-$(CONFIG_HPFS_FS) += hpfs/ +obj-$(CONFIG_NTFS_FS) += ntfs/ 
+obj-$(CONFIG_UFS_FS) += ufs/ +obj-$(CONFIG_EFS_FS) += efs/ +obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ +obj-$(CONFIG_UBIFS_FS) += ubifs/ +obj-$(CONFIG_AFFS_FS) += affs/ +obj-$(CONFIG_ROMFS_FS) += romfs/ +obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_AUTOFS4_FS) += autofs4/ +obj-$(CONFIG_ADFS_FS) += adfs/ +obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_UDF_FS) += udf/ +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_OMFS_FS) += omfs/ +obj-$(CONFIG_JFS_FS) += jfs/ +obj-$(CONFIG_XFS_FS) += xfs/ +obj-$(CONFIG_9P_FS) += 9p/ +obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ +obj-$(CONFIG_BEFS_FS) += befs/ +obj-$(CONFIG_HOSTFS) += hostfs/ +obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_CACHEFILES) += cachefiles/ +obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ +obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(CONFIG_CEPH_FS) += ceph/ +obj-$(CONFIG_PSTORE) += pstore/ + +# Patched by YAFFS +obj-$(CONFIG_YAFFS_FS) += yaffs2/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig new file mode 100644 index 00000000..e55182a7 --- /dev/null +++ b/fs/adfs/Kconfig @@ -0,0 +1,27 @@ +config ADFS_FS + tristate "ADFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say Y + here, Linux will be able to read from ADFS partitions on hard drives + and from ADFS-formatted floppy discs. If you also want to be able to + write to those devices, say Y to "ADFS write support" below. + + The ADFS partition should be the first partition (i.e., + /dev/[hs]d?1) on each of your drives. Please read the file + for further details. + + To compile this code as a module, choose M here: the module will be + called adfs. + + If unsure, say N. + +config ADFS_FS_RW + bool "ADFS write support (DANGEROUS)" + depends on ADFS_FS + help + If you say Y here, you will be able to write to ADFS partitions on + hard drives and ADFS-formatted floppy disks. This is experimental + codes, so if you're unsure, say N. diff --git a/fs/adfs/Makefile b/fs/adfs/Makefile new file mode 100644 index 00000000..9b2d71a9 --- /dev/null +++ b/fs/adfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux adfs filesystem routines. 
+# + +obj-$(CONFIG_ADFS_FS) += adfs.o + +adfs-objs := dir.o dir_f.o dir_fplus.o file.o inode.o map.o super.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h new file mode 100644 index 00000000..718ac1f4 --- /dev/null +++ b/fs/adfs/adfs.h @@ -0,0 +1,204 @@ +#include +#include + +/* Internal data structures for ADFS */ + +#define ADFS_FREE_FRAG 0 +#define ADFS_BAD_FRAG 1 +#define ADFS_ROOT_FRAG 2 + +#define ADFS_NDA_OWNER_READ (1 << 0) +#define ADFS_NDA_OWNER_WRITE (1 << 1) +#define ADFS_NDA_LOCKED (1 << 2) +#define ADFS_NDA_DIRECTORY (1 << 3) +#define ADFS_NDA_EXECUTE (1 << 4) +#define ADFS_NDA_PUBLIC_READ (1 << 5) +#define ADFS_NDA_PUBLIC_WRITE (1 << 6) + +#include "dir_f.h" + +struct buffer_head; + +/* + * adfs file system inode data in memory + */ +struct adfs_inode_info { + loff_t mmu_private; + unsigned long parent_id; /* object id of parent */ + __u32 loadaddr; /* RISC OS load address */ + __u32 execaddr; /* RISC OS exec address */ + unsigned int filetype; /* RISC OS file type */ + unsigned int attr; /* RISC OS permissions */ + unsigned int stamped:1; /* RISC OS file has date/time */ + struct inode vfs_inode; +}; + +/* + * Forward-declare this + */ +struct adfs_discmap; +struct adfs_dir_ops; + +/* + * ADFS file system superblock data in memory + */ +struct adfs_sb_info { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + + uid_t s_uid; /* owner uid */ + gid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ + int s_ftsuffix; /* ,xyz hex filetype suffix option */ + + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector */ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ + unsigned int s_namelen; /* maximum number of characters in name */ +}; + +static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct adfs_inode_info *ADFS_I(struct inode *inode) +{ + return container_of(inode, struct adfs_inode_info, vfs_inode); +} + +/* + * Directory handling + */ +struct adfs_dir { + struct super_block *sb; + + int nr_buffers; + struct buffer_head *bh[4]; + + /* big directories need allocated buffers */ + struct buffer_head **bh_fplus; + + unsigned int pos; + unsigned int parent_id; + + struct adfs_dirheader dirhead; + union adfs_dirtail dirtail; +}; + +/* + * This is the overall maximum name length + */ +#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */ +struct object_info { + __u32 parent_id; /* parent object id */ + __u32 file_id; /* object id */ + __u32 loadaddr; /* load address */ + __u32 execaddr; /* execution address */ + __u32 size; /* size */ + __u8 attr; /* RISC OS attributes */ + unsigned int name_len; /* name length */ + char name[ADFS_MAX_NAME_LEN];/* file name */ + + /* RISC OS file type (12-bit: derived from loadaddr) */ + __u16 filetype; +}; + +/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */ +static inline int append_filetype_suffix(char *buf, __u16 filetype) +{ + if (filetype == 0xffff) /* no explicit 12-bit file type was set */ + return 0; + + *buf++ = ','; + *buf++ = hex_asc_lo(filetype >> 8); + *buf++ = hex_asc_lo(filetype >> 4); + *buf++ = hex_asc_lo(filetype >> 
0); + return 4; +} + +struct adfs_dir_ops { + int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); + int (*setpos)(struct adfs_dir *dir, unsigned int fpos); + int (*getnext)(struct adfs_dir *dir, struct object_info *obj); + int (*update)(struct adfs_dir *dir, struct object_info *obj); + int (*create)(struct adfs_dir *dir, struct object_info *obj); + int (*remove)(struct adfs_dir *dir, struct object_info *obj); + int (*sync)(struct adfs_dir *dir); + void (*free)(struct adfs_dir *dir); +}; + +struct adfs_discmap { + struct buffer_head *dm_bh; + __u32 dm_startblk; + unsigned int dm_startbit; + unsigned int dm_endbit; +}; + +/* Inode stuff */ +struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); +int adfs_notify_change(struct dentry *dentry, struct iattr *attr); + +/* map.c */ +extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigned int offset); +extern unsigned int adfs_map_free(struct super_block *sb); + +/* Misc */ +void __adfs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) + +/* super.c */ + +/* + * Inodes and file operations + */ + +/* dir_*.c */ +extern const struct inode_operations adfs_dir_inode_operations; +extern const struct file_operations adfs_dir_operations; +extern const struct dentry_operations adfs_dentry_operations; +extern struct adfs_dir_ops adfs_f_dir_ops; +extern struct adfs_dir_ops adfs_fplus_dir_ops; + +extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, + int wait); + +/* file.c */ +extern const struct inode_operations adfs_file_inode_operations; +extern const struct file_operations adfs_file_operations; + +static inline __u32 signed_asl(__u32 val, signed int shift) +{ + if (shift >= 0) + val <<= shift; + else + val >>= -shift; + return val; +} + +/* + * Calculate the address of a block in an object given the block offset + * and the object identity. + * + * The root directory ID should always be looked up in the map [3.4] + */ +static inline int +__adfs_block_map(struct super_block *sb, unsigned int object_id, + unsigned int block) +{ + if (object_id & 255) { + unsigned int off; + + off = (object_id & 255) - 1; + block += off << ADFS_SB(sb)->s_log2sharesize; + } + + return adfs_map_lookup(sb, object_id >> 8, block); +} diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c new file mode 100644 index 00000000..3d83075a --- /dev/null +++ b/fs/adfs/dir.c @@ -0,0 +1,296 @@ +/* + * linux/fs/adfs/dir.c + * + * Copyright (C) 1999-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Common directory handling for ADFS + */ +#include "adfs.h" + +/* + * For future. This should probably be per-directory. 
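+ * Until then a single global rwlock serialises all ADFS directory
+ * access: readdir and lookup take it for reading, while directory
+ * updates take it for writing.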
+ */ +static DEFINE_RWLOCK(adfs_dir_lock); + +static int +adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct object_info obj; + struct adfs_dir dir; + int ret = 0; + + if (filp->f_pos >> 32) + goto out; + + ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + if (ret) + goto out; + + switch ((unsigned long)filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto free_out; + filp->f_pos += 1; + + case 1: + if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0) + goto free_out; + filp->f_pos += 1; + + default: + break; + } + + read_lock(&adfs_dir_lock); + + ret = ops->setpos(&dir, filp->f_pos - 2); + if (ret) + goto unlock_out; + while (ops->getnext(&dir, &obj) == 0) { + if (filldir(dirent, obj.name, obj.name_len, + filp->f_pos, obj.file_id, DT_UNKNOWN) < 0) + goto unlock_out; + filp->f_pos += 1; + } + +unlock_out: + read_unlock(&adfs_dir_lock); + +free_out: + ops->free(&dir); + +out: + return ret; +} + +int +adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) +{ + int ret = -EINVAL; +#ifdef CONFIG_ADFS_FS_RW + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct adfs_dir dir; + + printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n", + obj->file_id, obj->parent_id); + + if (!ops->update) { + ret = -EINVAL; + goto out; + } + + ret = ops->read(sb, obj->parent_id, 0, &dir); + if (ret) + goto out; + + write_lock(&adfs_dir_lock); + ret = ops->update(&dir, obj); + write_unlock(&adfs_dir_lock); + + if (wait) { + int err = ops->sync(&dir); + if (!ret) + ret = err; + } + + ops->free(&dir); +out: +#endif + return ret; +} + +static int +adfs_match(struct qstr *name, struct object_info *obj) +{ + int i; + + if (name->len != obj->name_len) + return 0; + + for (i = 0; i < name->len; i++) { + char c1, c2; + + c1 = name->name[i]; + c2 = obj->name[i]; + + if (c1 >= 'A' && c1 <= 'Z') + c1 += 'a' - 'A'; + if (c2 >= 'A' && c2 <= 'Z') + c2 += 'a' - 'A'; + + if (c1 != c2) + return 0; + } + return 1; +} + +static int +adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj) +{ + struct super_block *sb = inode->i_sb; + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct adfs_dir dir; + int ret; + + ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + if (ret) + goto out; + + if (ADFS_I(inode)->parent_id != dir.parent_id) { + adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n", + ADFS_I(inode)->parent_id, dir.parent_id); + ret = -EIO; + goto free_out; + } + + obj->parent_id = inode->i_ino; + + /* + * '.' is handled by reserved_lookup() in fs/namei.c + */ + if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') { + /* + * Currently unable to fill in the rest of 'obj', + * but this is better than nothing. We need to + * ascend one level to find it's parent. 
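+ * (Filling it in properly would mean reading the parent directory
+ * itself, which we do not do here.)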
+ */ + obj->name_len = 0; + obj->file_id = obj->parent_id; + goto free_out; + } + + read_lock(&adfs_dir_lock); + + ret = ops->setpos(&dir, 0); + if (ret) + goto unlock_out; + + ret = -ENOENT; + while (ops->getnext(&dir, obj) == 0) { + if (adfs_match(name, obj)) { + ret = 0; + break; + } + } + +unlock_out: + read_unlock(&adfs_dir_lock); + +free_out: + ops->free(&dir); +out: + return ret; +} + +const struct file_operations adfs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = adfs_readdir, + .fsync = generic_file_fsync, +}; + +static int +adfs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr) +{ + const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; + const unsigned char *name; + unsigned long hash; + int i; + + if (qstr->len < name_len) + return 0; + + /* + * Truncate the name in place, avoids + * having to define a compare function. + */ + qstr->len = i = name_len; + name = qstr->name; + hash = init_name_hash(); + while (i--) { + char c; + + c = *name++; + if (c >= 'A' && c <= 'Z') + c += 'a' - 'A'; + + hash = partial_name_hash(c, hash); + } + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare two names, taking note of the name length + * requirements of the underlying filesystem. + */ +static int +adfs_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + int i; + + if (len != name->len) + return 1; + + for (i = 0; i < name->len; i++) { + char a, b; + + a = str[i]; + b = name->name[i]; + + if (a >= 'A' && a <= 'Z') + a += 'a' - 'A'; + if (b >= 'A' && b <= 'Z') + b += 'a' - 'A'; + + if (a != b) + return 1; + } + return 0; +} + +const struct dentry_operations adfs_dentry_operations = { + .d_hash = adfs_hash, + .d_compare = adfs_compare, +}; + +static struct dentry * +adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = NULL; + struct object_info obj; + int error; + + error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); + if (error == 0) { + error = -EACCES; + /* + * This only returns NULL if get_empty_inode + * fails. + */ + inode = adfs_iget(dir->i_sb, &obj); + if (inode) + error = 0; + } + d_add(dentry, inode); + return ERR_PTR(error); +} + +/* + * directories can handle most operations... + */ +const struct inode_operations adfs_dir_inode_operations = { + .lookup = adfs_lookup, + .setattr = adfs_notify_change, +}; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c new file mode 100644 index 00000000..4bbe853e --- /dev/null +++ b/fs/adfs/dir_f.c @@ -0,0 +1,486 @@ +/* + * linux/fs/adfs/dir_f.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * E and F format directory handling + */ +#include +#include "adfs.h" +#include "dir_f.h" + +static void adfs_f_free(struct adfs_dir *dir); + +/* + * Read an (unaligned) value of length 1..4 bytes + */ +static inline unsigned int adfs_readval(unsigned char *p, int len) +{ + unsigned int val = 0; + + switch (len) { + case 4: val |= p[3] << 24; + case 3: val |= p[2] << 16; + case 2: val |= p[1] << 8; + default: val |= p[0]; + } + return val; +} + +static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) +{ + switch (len) { + case 4: p[3] = val >> 24; + case 3: p[2] = val >> 16; + case 2: p[1] = val >> 8; + default: p[0] = val; + } +} + +static inline int adfs_readname(char *buf, char *ptr, int maxlen) +{ + char *old_buf = buf; + + while ((unsigned char)*ptr >= ' ' && maxlen--) { + if (*ptr == '/') + *buf++ = '.'; + else + *buf++ = *ptr; + ptr++; + } + + return buf - old_buf; +} + +#define ror13(v) ((v >> 13) | (v << 19)) + +#define dir_u8(idx) \ + ({ int _buf = idx >> blocksize_bits; \ + int _off = idx - (_buf << blocksize_bits);\ + *(u8 *)(bh[_buf]->b_data + _off); \ + }) + +#define dir_u32(idx) \ + ({ int _buf = idx >> blocksize_bits; \ + int _off = idx - (_buf << blocksize_bits);\ + *(__le32 *)(bh[_buf]->b_data + _off); \ + }) + +#define bufoff(_bh,_idx) \ + ({ int _buf = _idx >> blocksize_bits; \ + int _off = _idx - (_buf << blocksize_bits);\ + (u8 *)(_bh[_buf]->b_data + _off); \ + }) + +/* + * There are some algorithms that are nice in + * assembler, but a bitch in C... This is one + * of them. + */ +static u8 +adfs_dir_checkbyte(const struct adfs_dir *dir) +{ + struct buffer_head * const *bh = dir->bh; + const int blocksize_bits = dir->sb->s_blocksize_bits; + union { __le32 *ptr32; u8 *ptr8; } ptr, end; + u32 dircheck = 0; + int last = 5 - 26; + int i = 0; + + /* + * Accumulate each word up to the last whole + * word of the last directory entry. This + * can spread across several buffer heads. + */ + do { + last += 26; + do { + dircheck = le32_to_cpu(dir_u32(i)) ^ ror13(dircheck); + + i += sizeof(u32); + } while (i < (last & ~3)); + } while (dir_u8(last) != 0); + + /* + * Accumulate the last few bytes. These + * bytes will be within the same bh. + */ + if (i != last) { + ptr.ptr8 = bufoff(bh, i); + end.ptr8 = ptr.ptr8 + last - i; + + do { + dircheck = *ptr.ptr8++ ^ ror13(dircheck); + } while (ptr.ptr8 < end.ptr8); + } + + /* + * The directory tail is in the final bh + * Note that contary to the RISC OS PRMs, + * the first few bytes are NOT included + * in the check. All bytes are in the + * same bh. 
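+ * (The tail occupies the last 41 bytes of the 2048-byte directory;
+ * the accumulation below therefore starts at byte 2008, just past
+ * the dirlastmask byte, and stops before the check byte itself.)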
+ */ + ptr.ptr8 = bufoff(bh, 2008); + end.ptr8 = ptr.ptr8 + 36; + + do { + __le32 v = *ptr.ptr32++; + dircheck = le32_to_cpu(v) ^ ror13(dircheck); + } while (ptr.ptr32 < end.ptr32); + + return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff; +} + +/* + * Read and check that a directory is valid + */ +static int +adfs_dir_read(struct super_block *sb, unsigned long object_id, + unsigned int size, struct adfs_dir *dir) +{ + const unsigned int blocksize_bits = sb->s_blocksize_bits; + int blk = 0; + + /* + * Directories which are not a multiple of 2048 bytes + * are considered bad v2 [3.6] + */ + if (size & 2047) + goto bad_dir; + + size >>= blocksize_bits; + + dir->nr_buffers = 0; + dir->sb = sb; + + for (blk = 0; blk < size; blk++) { + int phys; + + phys = __adfs_block_map(sb, object_id, blk); + if (!phys) { + adfs_error(sb, "dir object %lX has a hole at offset %d", + object_id, blk); + goto release_buffers; + } + + dir->bh[blk] = sb_bread(sb, phys); + if (!dir->bh[blk]) + goto release_buffers; + } + + memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); + memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + + if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || + memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) + goto bad_dir; + + if (memcmp(&dir->dirhead.startname, "Nick", 4) && + memcmp(&dir->dirhead.startname, "Hugo", 4)) + goto bad_dir; + + if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + goto bad_dir; + + dir->nr_buffers = blk; + + return 0; + +bad_dir: + adfs_error(sb, "corrupted directory fragment %lX", + object_id); +release_buffers: + for (blk -= 1; blk >= 0; blk -= 1) + brelse(dir->bh[blk]); + + dir->sb = NULL; + + return -EIO; +} + +/* + * convert a disk-based directory entry to a Linux ADFS directory entry + */ +static inline void +adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj, + struct adfs_direntry *de) +{ + obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); + obj->file_id = adfs_readval(de->dirinddiscadd, 3); + obj->loadaddr = adfs_readval(de->dirload, 4); + obj->execaddr = adfs_readval(de->direxec, 4); + obj->size = adfs_readval(de->dirlen, 4); + obj->attr = de->newdiratts; + obj->filetype = -1; + + /* + * object is a file and is filetyped and timestamped? + * RISC OS 12-bit filetype is stored in load_address[19:8] + */ + if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) && + (0xfff00000 == (0xfff00000 & obj->loadaddr))) { + obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8); + + /* optionally append the ,xyz hex filetype suffix */ + if (ADFS_SB(dir->sb)->s_ftsuffix) + obj->name_len += + append_filetype_suffix( + &obj->name[obj->name_len], + obj->filetype); + } +} + +/* + * convert a Linux ADFS directory entry to a disk-based directory entry + */ +static inline void +adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj) +{ + adfs_writeval(de->dirinddiscadd, 3, obj->file_id); + adfs_writeval(de->dirload, 4, obj->loadaddr); + adfs_writeval(de->direxec, 4, obj->execaddr); + adfs_writeval(de->dirlen, 4, obj->size); + de->newdiratts = obj->attr; +} + +/* + * get a directory entry. Note that the caller is responsible + * for holding the relevant locks. 
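+ * Each on-disk entry is 26 bytes (struct adfs_direntry) and may
+ * straddle a block boundary, hence the two-part copy below.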
+ */ +static int +__adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + struct adfs_direntry de; + int thissize, buffer, offset; + + buffer = pos >> sb->s_blocksize_bits; + + if (buffer > dir->nr_buffers) + return -EINVAL; + + offset = pos & (sb->s_blocksize - 1); + thissize = sb->s_blocksize - offset; + if (thissize > 26) + thissize = 26; + + memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); + if (thissize != 26) + memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, + 26 - thissize); + + if (!de.dirobname[0]) + return -ENOENT; + + adfs_dir2obj(dir, obj, &de); + + return 0; +} + +static int +__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + struct adfs_direntry de; + int thissize, buffer, offset; + + buffer = pos >> sb->s_blocksize_bits; + + if (buffer > dir->nr_buffers) + return -EINVAL; + + offset = pos & (sb->s_blocksize - 1); + thissize = sb->s_blocksize - offset; + if (thissize > 26) + thissize = 26; + + /* + * Get the entry in total + */ + memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); + if (thissize != 26) + memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, + 26 - thissize); + + /* + * update it + */ + adfs_obj2dir(&de, obj); + + /* + * Put the new entry back + */ + memcpy(dir->bh[buffer]->b_data + offset, &de, thissize); + if (thissize != 26) + memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize, + 26 - thissize); + + return 0; +} + +/* + * the caller is responsible for holding the necessary + * locks. + */ +static int +adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id) +{ + int pos, ret; + + ret = -ENOENT; + + for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) { + struct object_info obj; + + if (!__adfs_dir_get(dir, pos, &obj)) + break; + + if (obj.file_id == object_id) { + ret = pos; + break; + } + } + + return ret; +} + +static int +adfs_f_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +{ + int ret; + + if (sz != ADFS_NEWDIR_SIZE) + return -EIO; + + ret = adfs_dir_read(sb, id, sz, dir); + if (ret) + adfs_error(sb, "unable to read directory"); + else + dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3); + + return ret; +} + +static int +adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos) +{ + if (fpos >= ADFS_NUM_DIR_ENTRIES) + return -ENOENT; + + dir->pos = 5 + fpos * 26; + return 0; +} + +static int +adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj) +{ + unsigned int ret; + + ret = __adfs_dir_get(dir, dir->pos, obj); + if (ret == 0) + dir->pos += 26; + + return ret; +} + +static int +adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + int ret, i; + + ret = adfs_dir_find_entry(dir, obj->file_id); + if (ret < 0) { + adfs_error(dir->sb, "unable to locate entry to update"); + goto out; + } + + __adfs_dir_put(dir, ret, obj); + + /* + * Increment directory sequence number + */ + dir->bh[0]->b_data[0] += 1; + dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1; + + ret = adfs_dir_checkbyte(dir); + /* + * Update directory check byte + */ + dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret; + +#if 1 + { + const unsigned int blocksize_bits = sb->s_blocksize_bits; + + memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); + memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + + if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq 
|| + memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) + goto bad_dir; + + if (memcmp(&dir->dirhead.startname, "Nick", 4) && + memcmp(&dir->dirhead.startname, "Hugo", 4)) + goto bad_dir; + + if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + goto bad_dir; + } +#endif + for (i = dir->nr_buffers - 1; i >= 0; i--) + mark_buffer_dirty(dir->bh[i]); + + ret = 0; +out: + return ret; +#if 1 +bad_dir: + adfs_error(dir->sb, "whoops! I broke a directory!"); + return -EIO; +#endif +} + +static int +adfs_f_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + +static void +adfs_f_free(struct adfs_dir *dir) +{ + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + brelse(dir->bh[i]); + dir->bh[i] = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; +} + +struct adfs_dir_ops adfs_f_dir_ops = { + .read = adfs_f_read, + .setpos = adfs_f_setpos, + .getnext = adfs_f_getnext, + .update = adfs_f_update, + .sync = adfs_f_sync, + .free = adfs_f_free +}; diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h new file mode 100644 index 00000000..e4713404 --- /dev/null +++ b/fs/adfs/dir_f.h @@ -0,0 +1,65 @@ +/* + * linux/fs/adfs/dir_f.h + * + * Copyright (C) 1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Structures of directories on the F format disk + */ +#ifndef ADFS_DIR_F_H +#define ADFS_DIR_F_H + +/* + * Directory header + */ +struct adfs_dirheader { + unsigned char startmasseq; + unsigned char startname[4]; +}; + +#define ADFS_NEWDIR_SIZE 2048 +#define ADFS_NUM_DIR_ENTRIES 77 + +/* + * Directory entries + */ +struct adfs_direntry { +#define ADFS_F_NAME_LEN 10 + char dirobname[ADFS_F_NAME_LEN]; + __u8 dirload[4]; + __u8 direxec[4]; + __u8 dirlen[4]; + __u8 dirinddiscadd[3]; + __u8 newdiratts; +}; + +/* + * Directory tail + */ +union adfs_dirtail { + struct { + unsigned char dirlastmask; + char dirname[10]; + unsigned char dirparent[3]; + char dirtitle[19]; + unsigned char reserved[14]; + unsigned char endmasseq; + unsigned char endname[4]; + unsigned char dircheckbyte; + } old; + struct { + unsigned char dirlastmask; + unsigned char reserved[2]; + unsigned char dirparent[3]; + char dirtitle[19]; + char dirname[10]; + unsigned char endmasseq; + unsigned char endname[4]; + unsigned char dircheckbyte; + } new; +}; + +#endif diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c new file mode 100644 index 00000000..d9e3bee4 --- /dev/null +++ b/fs/adfs/dir_fplus.c @@ -0,0 +1,265 @@ +/* + * linux/fs/adfs/dir_fplus.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include "adfs.h" +#include "dir_fplus.h" + +static int +adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h; + struct adfs_bigdirtail *t; + unsigned long block; + unsigned int blk, size; + int i, ret = -EIO; + + dir->nr_buffers = 0; + + /* start off using fixed bh set - only alloc for big dirs */ + dir->bh_fplus = &dir->bh[0]; + + block = __adfs_block_map(sb, id, 0); + if (!block) { + adfs_error(sb, "dir object %X has a hole at offset 0", id); + goto out; + } + + dir->bh_fplus[0] = sb_bread(sb, block); + if (!dir->bh_fplus[0]) + goto out; + dir->nr_buffers += 1; + + h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data; + size = le32_to_cpu(h->bigdirsize); + if (size != sz) { + printk(KERN_WARNING "adfs: adfs_fplus_read:" + " directory header size %X\n" + " does not match directory size %X\n", + size, sz); + } + + if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || + h->bigdirversion[2] != 0 || size & 2047 || + h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) { + printk(KERN_WARNING "adfs: dir object %X has" + " malformed dir header\n", id); + goto out; + } + + size >>= sb->s_blocksize_bits; + if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) { + /* this directory is too big for fixed bh set, must allocate */ + struct buffer_head **bh_fplus = + kzalloc(size * sizeof(struct buffer_head *), + GFP_KERNEL); + if (!bh_fplus) { + adfs_error(sb, "not enough memory for" + " dir object %X (%d blocks)", id, size); + goto out; + } + dir->bh_fplus = bh_fplus; + /* copy over the pointer to the block that we've already read */ + dir->bh_fplus[0] = dir->bh[0]; + } + + for (blk = 1; blk < size; blk++) { + block = __adfs_block_map(sb, id, blk); + if (!block) { + adfs_error(sb, "dir object %X has a hole at offset %d", id, blk); + goto out; + } + + dir->bh_fplus[blk] = sb_bread(sb, block); + if (!dir->bh_fplus[blk]) { + adfs_error(sb, "dir object %X failed read for" + " offset %d, mapped block %X", + id, blk, block); + goto out; + } + + dir->nr_buffers += 1; + } + + t = (struct adfs_bigdirtail *) + (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8)); + + if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || + t->bigdirendmasseq != h->startmasseq || + t->reserved[0] != 0 || t->reserved[1] != 0) { + printk(KERN_WARNING "adfs: dir object %X has " + "malformed dir end\n", id); + goto out; + } + + dir->parent_id = le32_to_cpu(h->bigdirparent); + dir->sb = sb; + return 0; + +out: + if (dir->bh_fplus) { + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bh_fplus[i]); + + if (&dir->bh[0] != dir->bh_fplus) + kfree(dir->bh_fplus); + + dir->bh_fplus = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; + return ret; +} + +static int +adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) +{ + struct adfs_bigdirheader *h = + (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + int ret = -ENOENT; + + if (fpos <= le32_to_cpu(h->bigdirentries)) { + dir->pos = fpos; + ret = 0; + } + + return ret; +} + +static void +dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len) +{ + struct super_block *sb = dir->sb; + unsigned int buffer, partial, remainder; + + buffer = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + + partial = sb->s_blocksize - offset; + + if (partial >= len) + memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len); + else { + char *c = (char *)to; + + remainder = len - partial; + + memcpy(c, + dir->bh_fplus[buffer]->b_data + offset, + partial); 
+ + memcpy(c + partial, + dir->bh_fplus[buffer + 1]->b_data, + remainder); + } +} + +static int +adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) +{ + struct adfs_bigdirheader *h = + (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + struct adfs_bigdirentry bde; + unsigned int offset; + int i, ret = -ENOENT; + + if (dir->pos >= le32_to_cpu(h->bigdirentries)) + goto out; + + offset = offsetof(struct adfs_bigdirheader, bigdirname); + offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); + offset += dir->pos * sizeof(struct adfs_bigdirentry); + + dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry)); + + obj->loadaddr = le32_to_cpu(bde.bigdirload); + obj->execaddr = le32_to_cpu(bde.bigdirexec); + obj->size = le32_to_cpu(bde.bigdirlen); + obj->file_id = le32_to_cpu(bde.bigdirindaddr); + obj->attr = le32_to_cpu(bde.bigdirattr); + obj->name_len = le32_to_cpu(bde.bigdirobnamelen); + + offset = offsetof(struct adfs_bigdirheader, bigdirname); + offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); + offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry); + offset += le32_to_cpu(bde.bigdirobnameptr); + + dir_memcpy(dir, offset, obj->name, obj->name_len); + for (i = 0; i < obj->name_len; i++) + if (obj->name[i] == '/') + obj->name[i] = '.'; + + obj->filetype = -1; + + /* + * object is a file and is filetyped and timestamped? + * RISC OS 12-bit filetype is stored in load_address[19:8] + */ + if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) && + (0xfff00000 == (0xfff00000 & obj->loadaddr))) { + obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8); + + /* optionally append the ,xyz hex filetype suffix */ + if (ADFS_SB(dir->sb)->s_ftsuffix) + obj->name_len += + append_filetype_suffix( + &obj->name[obj->name_len], + obj->filetype); + } + + dir->pos += 1; + ret = 0; +out: + return ret; +} + +static int +adfs_fplus_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh_fplus[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + +static void +adfs_fplus_free(struct adfs_dir *dir) +{ + int i; + + if (dir->bh_fplus) { + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bh_fplus[i]); + + if (&dir->bh[0] != dir->bh_fplus) + kfree(dir->bh_fplus); + + dir->bh_fplus = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; +} + +struct adfs_dir_ops adfs_fplus_dir_ops = { + .read = adfs_fplus_read, + .setpos = adfs_fplus_setpos, + .getnext = adfs_fplus_getnext, + .sync = adfs_fplus_sync, + .free = adfs_fplus_free +}; diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h new file mode 100644 index 00000000..b55aa41a --- /dev/null +++ b/fs/adfs/dir_fplus.h @@ -0,0 +1,45 @@ +/* + * linux/fs/adfs/dir_fplus.h + * + * Copyright (C) 1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Structures of directories on the F+ format disk + */ + +#define ADFS_FPLUS_NAME_LEN 255 + +#define BIGDIRSTARTNAME ('S' | 'B' << 8 | 'P' << 16 | 'r' << 24) +#define BIGDIRENDNAME ('o' | 'v' << 8 | 'e' << 16 | 'n' << 24) + +struct adfs_bigdirheader { + __u8 startmasseq; + __u8 bigdirversion[3]; + __le32 bigdirstartname; + __le32 bigdirnamelen; + __le32 bigdirsize; + __le32 bigdirentries; + __le32 bigdirnamesize; + __le32 bigdirparent; + char bigdirname[1]; +}; + +struct adfs_bigdirentry { + __le32 bigdirload; + __le32 bigdirexec; + __le32 bigdirlen; + __le32 bigdirindaddr; + __le32 bigdirattr; + __le32 bigdirobnamelen; + __le32 bigdirobnameptr; +}; + +struct adfs_bigdirtail { + __le32 bigdirendname; + __u8 bigdirendmasseq; + __u8 reserved[2]; + __u8 bigdircheckbyte; +}; diff --git a/fs/adfs/file.c b/fs/adfs/file.c new file mode 100644 index 00000000..a36da538 --- /dev/null +++ b/fs/adfs/file.c @@ -0,0 +1,37 @@ +/* + * linux/fs/adfs/file.c + * + * Copyright (C) 1997-1999 Russell King + * from: + * + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * adfs regular file handling primitives + */ +#include "adfs.h" + +const struct file_operations adfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations adfs_file_inode_operations = { + .setattr = adfs_notify_change, +}; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c new file mode 100644 index 00000000..d5250c5a --- /dev/null +++ b/fs/adfs/inode.c @@ -0,0 +1,366 @@ +/* + * linux/fs/adfs/inode.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "adfs.h" + +/* + * Lookup/Create a block at offset 'block' into 'inode'. We currently do + * not support creation of new blocks, so we return -EIO for this case. 
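+ * A lookup beyond the end of the file leaves the buffer_head unmapped
+ * and returns success, so the caller sees a hole rather than an error.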
+ */ +static int +adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, + int create) +{ + if (!create) { + if (block >= inode->i_blocks) + goto abort_toobig; + + block = __adfs_block_map(inode->i_sb, inode->i_ino, block); + if (block) + map_bh(bh, inode->i_sb, block); + return 0; + } + /* don't support allocation of blocks yet */ + return -EIO; + +abort_toobig: + return 0; +} + +static int adfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, adfs_get_block, wbc); +} + +static int adfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, adfs_get_block); +} + +static int adfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + adfs_get_block, + &ADFS_I(mapping->host)->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, adfs_get_block); +} + +static const struct address_space_operations adfs_aops = { + .readpage = adfs_readpage, + .writepage = adfs_writepage, + .write_begin = adfs_write_begin, + .write_end = generic_write_end, + .bmap = _adfs_bmap +}; + +/* + * Convert ADFS attributes and filetype to Linux permission. + */ +static umode_t +adfs_atts2mode(struct super_block *sb, struct inode *inode) +{ + unsigned int attr = ADFS_I(inode)->attr; + umode_t mode, rmask; + struct adfs_sb_info *asb = ADFS_SB(sb); + + if (attr & ADFS_NDA_DIRECTORY) { + mode = S_IRUGO & asb->s_owner_mask; + return S_IFDIR | S_IXUGO | mode; + } + + switch (ADFS_I(inode)->filetype) { + case 0xfc0: /* LinkFS */ + return S_IFLNK|S_IRWXUGO; + + case 0xfe6: /* UnixExec */ + rmask = S_IRUGO | S_IXUGO; + break; + + default: + rmask = S_IRUGO; + } + + mode = S_IFREG; + + if (attr & ADFS_NDA_OWNER_READ) + mode |= rmask & asb->s_owner_mask; + + if (attr & ADFS_NDA_OWNER_WRITE) + mode |= S_IWUGO & asb->s_owner_mask; + + if (attr & ADFS_NDA_PUBLIC_READ) + mode |= rmask & asb->s_other_mask; + + if (attr & ADFS_NDA_PUBLIC_WRITE) + mode |= S_IWUGO & asb->s_other_mask; + return mode; +} + +/* + * Convert Linux permission to ADFS attribute. We try to do the reverse + * of atts2mode, but there is not a 1:1 translation. + */ +static int +adfs_mode2atts(struct super_block *sb, struct inode *inode) +{ + umode_t mode; + int attr; + struct adfs_sb_info *asb = ADFS_SB(sb); + + /* FIXME: should we be able to alter a link? */ + if (S_ISLNK(inode->i_mode)) + return ADFS_I(inode)->attr; + + if (S_ISDIR(inode->i_mode)) + attr = ADFS_NDA_DIRECTORY; + else + attr = 0; + + mode = inode->i_mode & asb->s_owner_mask; + if (mode & S_IRUGO) + attr |= ADFS_NDA_OWNER_READ; + if (mode & S_IWUGO) + attr |= ADFS_NDA_OWNER_WRITE; + + mode = inode->i_mode & asb->s_other_mask; + mode &= ~asb->s_owner_mask; + if (mode & S_IRUGO) + attr |= ADFS_NDA_PUBLIC_READ; + if (mode & S_IWUGO) + attr |= ADFS_NDA_PUBLIC_WRITE; + + return attr; +} + +/* + * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time + * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds + * of time to convert from RISC OS epoch to Unix epoch. 
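+ * The 40-bit value is split across the RISC OS load and exec
+ * addresses: the low byte of the load address holds the top 8 bits
+ * and the exec address holds the remaining 32 bits.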
+ */ +static void +adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) +{ + unsigned int high, low; + /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since + * 01 Jan 1900 00:00:00 (RISC OS epoch) + */ + static const s64 nsec_unix_epoch_diff_risc_os_epoch = + 2208988800000000000LL; + s64 nsec; + + if (ADFS_I(inode)->stamped == 0) + goto cur_time; + + high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */ + low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */ + + /* convert 40-bit centi-seconds to 32-bit seconds + * going via nanoseconds to retain precision + */ + nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */ + + /* Files dated pre 01 Jan 1970 00:00:00. */ + if (nsec < nsec_unix_epoch_diff_risc_os_epoch) + goto too_early; + + /* convert from RISC OS to Unix epoch */ + nsec -= nsec_unix_epoch_diff_risc_os_epoch; + + *tv = ns_to_timespec(nsec); + return; + + cur_time: + *tv = CURRENT_TIME; + return; + + too_early: + tv->tv_sec = tv->tv_nsec = 0; + return; +} + +/* + * Convert an Unix time to ADFS time. We only do this if the entry has a + * time/date stamp already. + */ +static void +adfs_unix2adfs_time(struct inode *inode, unsigned int secs) +{ + unsigned int high, low; + + if (ADFS_I(inode)->stamped) { + /* convert 32-bit seconds to 40-bit centi-seconds */ + low = (secs & 255) * 100; + high = (secs / 256) * 100 + (low >> 8) + 0x336e996a; + + ADFS_I(inode)->loadaddr = (high >> 24) | + (ADFS_I(inode)->loadaddr & ~0xff); + ADFS_I(inode)->execaddr = (low & 255) | (high << 8); + } +} + +/* + * Fill in the inode information from the object information. + * + * Note that this is an inode-less filesystem, so we can't use the inode + * number to reference the metadata on the media. Instead, we use the + * inode number to hold the object ID, which in turn will tell us where + * the data is held. We also save the parent object ID, and with these + * two, we can locate the metadata. + * + * This does mean that we rely on an objects parent remaining the same at + * all times - we cannot cope with a cross-directory rename (yet). + */ +struct inode * +adfs_iget(struct super_block *sb, struct object_info *obj) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_uid = ADFS_SB(sb)->s_uid; + inode->i_gid = ADFS_SB(sb)->s_gid; + inode->i_ino = obj->file_id; + inode->i_size = obj->size; + inode->i_nlink = 2; + inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + /* + * we need to save the parent directory ID so that + * write_inode can update the directory information + * for this file. This will need special handling + * for cross-directory renames. 
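+ * An object is treated as date/time stamped when the top twelve bits
+ * of its load address are all set; the 'stamped' flag derived below
+ * records this for the time conversion helpers.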
+ */ + ADFS_I(inode)->parent_id = obj->parent_id; + ADFS_I(inode)->loadaddr = obj->loadaddr; + ADFS_I(inode)->execaddr = obj->execaddr; + ADFS_I(inode)->attr = obj->attr; + ADFS_I(inode)->filetype = obj->filetype; + ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); + + inode->i_mode = adfs_atts2mode(sb, inode); + adfs_adfs2unix_time(&inode->i_mtime, inode); + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + + if (S_ISDIR(inode->i_mode)) { + inode->i_op = &adfs_dir_inode_operations; + inode->i_fop = &adfs_dir_operations; + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &adfs_file_inode_operations; + inode->i_fop = &adfs_file_operations; + inode->i_mapping->a_ops = &adfs_aops; + ADFS_I(inode)->mmu_private = inode->i_size; + } + + insert_inode_hash(inode); + +out: + return inode; +} + +/* + * Validate and convert a changed access mode/time to their ADFS equivalents. + * adfs_write_inode will actually write the information back to the directory + * later. + */ +int +adfs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + + /* + * we can't change the UID or GID of any file - + * we have a global UID/GID in the superblock + */ + if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid)) + error = -EPERM; + + if (error) + goto out; + + /* XXX: this is missing some actual on-disk truncation.. */ + if (ia_valid & ATTR_SIZE) + truncate_setsize(inode, attr->ia_size); + + if (ia_valid & ATTR_MTIME) { + inode->i_mtime = attr->ia_mtime; + adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec); + } + /* + * FIXME: should we make these == to i_mtime since we don't + * have the ability to represent them in our filesystem? + */ + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + ADFS_I(inode)->attr = adfs_mode2atts(sb, inode); + inode->i_mode = adfs_atts2mode(sb, inode); + } + + /* + * FIXME: should we be marking this inode dirty even if + * we don't have any metadata to write back? + */ + if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) + mark_inode_dirty(inode); +out: + return error; +} + +/* + * write an existing inode back to the directory, and therefore the disk. + * The adfs-specific inode data has already been updated by + * adfs_notify_change() + */ +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct super_block *sb = inode->i_sb; + struct object_info obj; + int ret; + + obj.file_id = inode->i_ino; + obj.name_len = 0; + obj.parent_id = ADFS_I(inode)->parent_id; + obj.loadaddr = ADFS_I(inode)->loadaddr; + obj.execaddr = ADFS_I(inode)->execaddr; + obj.attr = ADFS_I(inode)->attr; + obj.size = inode->i_size; + + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); + return ret; +} diff --git a/fs/adfs/map.c b/fs/adfs/map.c new file mode 100644 index 00000000..6935f052 --- /dev/null +++ b/fs/adfs/map.c @@ -0,0 +1,290 @@ +/* + * linux/fs/adfs/map.c + * + * Copyright (C) 1997-2002 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include "adfs.h" + +/* + * The ADFS map is basically a set of sectors. Each sector is called a + * zone which contains a bitstream made up of variable sized fragments. + * Each bit refers to a set of bytes in the filesystem, defined by + * log2bpmb. This may be larger or smaller than the sector size, but + * the overall size it describes will always be a round number of + * sectors. A fragment id is always idlen bits long. + * + * < idlen > < n > <1> + * +---------+-------//---------+---+ + * | frag id | 0000....000000 | 1 | + * +---------+-------//---------+---+ + * + * The physical disk space used by a fragment is taken from the start of + * the fragment id up to and including the '1' bit - ie, idlen + n + 1 + * bits. + * + * A fragment id can be repeated multiple times in the whole map for + * large or fragmented files. The first map zone a fragment starts in + * is given by fragment id / ids_per_zone - this allows objects to start + * from any zone on the disk. + * + * Free space is described by a linked list of fragments. Each free + * fragment describes free space in the same way as the other fragments, + * however, the frag id specifies an offset (in map bits) from the end + * of this fragment to the start of the next free fragment. + * + * Objects stored on the disk are allocated object ids (we use these as + * our inode numbers.) Object ids contain a fragment id and an optional + * offset. This allows a directory fragment to contain small files + * associated with that directory. + */ + +/* + * For the future... + */ +static DEFINE_RWLOCK(adfs_map_lock); + +/* + * This is fun. We need to load up to 19 bits from the map at an + * arbitrary bit alignment. (We're limited to 19 bits by F+ version 2). + */ +#define GET_FRAG_ID(_map,_start,_idmask) \ + ({ \ + unsigned char *_m = _map + (_start >> 3); \ + u32 _frag = get_unaligned_le32(_m); \ + _frag >>= (_start & 7); \ + _frag & _idmask; \ + }) + +/* + * return the map bit offset of the fragment frag_id in the zone dm. + * Note that the loop is optimised for best asm code - look at the + * output of: + * gcc -D__KERNEL__ -O2 -I../../include -o - -S map.c + */ +static int +lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen, + const unsigned int frag_id, unsigned int *offset) +{ + const unsigned int mapsize = dm->dm_endbit; + const u32 idmask = (1 << idlen) - 1; + unsigned char *map = dm->dm_bh->b_data + 4; + unsigned int start = dm->dm_startbit; + unsigned int mapptr; + u32 frag; + + do { + frag = GET_FRAG_ID(map, start, idmask); + mapptr = start + idlen; + + /* + * find end of fragment + */ + { + __le32 *_map = (__le32 *)map; + u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); + while (v == 0) { + mapptr = (mapptr & ~31) + 32; + if (mapptr >= mapsize) + goto error; + v = le32_to_cpu(_map[mapptr >> 5]); + } + + mapptr += 1 + ffz(~v); + } + + if (frag == frag_id) + goto found; +again: + start = mapptr; + } while (mapptr < mapsize); + return -1; + +error: + printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n", + frag, start, mapptr); + return -1; + +found: + { + int length = mapptr - start; + if (*offset >= length) { + *offset -= length; + goto again; + } + } + return start + *offset; +} + +/* + * Scan the free space map, for this zone, calculating the total + * number of map bits in each free space fragment. 
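+ * The caller sums the per-zone totals and converts map bits to
+ * blocks with signed_asl(), so the value returned here is still in
+ * map-bit units.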
+ * + * Note: idmask is limited to 15 bits [3.2] + */ +static unsigned int +scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) +{ + const unsigned int mapsize = dm->dm_endbit + 32; + const unsigned int idlen = asb->s_idlen; + const unsigned int frag_idlen = idlen <= 15 ? idlen : 15; + const u32 idmask = (1 << frag_idlen) - 1; + unsigned char *map = dm->dm_bh->b_data; + unsigned int start = 8, mapptr; + u32 frag; + unsigned long total = 0; + + /* + * get fragment id + */ + frag = GET_FRAG_ID(map, start, idmask); + + /* + * If the freelink is null, then no free fragments + * exist in this zone. + */ + if (frag == 0) + return 0; + + do { + start += frag; + + /* + * get fragment id + */ + frag = GET_FRAG_ID(map, start, idmask); + mapptr = start + idlen; + + /* + * find end of fragment + */ + { + __le32 *_map = (__le32 *)map; + u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); + while (v == 0) { + mapptr = (mapptr & ~31) + 32; + if (mapptr >= mapsize) + goto error; + v = le32_to_cpu(_map[mapptr >> 5]); + } + + mapptr += 1 + ffz(~v); + } + + total += mapptr - start; + } while (frag >= idlen + 1); + + if (frag != 0) + printk(KERN_ERR "adfs: undersized free fragment\n"); + + return total; +error: + printk(KERN_ERR "adfs: oversized free fragment\n"); + return 0; +} + +static int +scan_map(struct adfs_sb_info *asb, unsigned int zone, + const unsigned int frag_id, unsigned int mapoff) +{ + const unsigned int idlen = asb->s_idlen; + struct adfs_discmap *dm, *dm_end; + int result; + + dm = asb->s_map + zone; + zone = asb->s_map_size; + dm_end = asb->s_map + zone; + + do { + result = lookup_zone(dm, idlen, frag_id, &mapoff); + + if (result != -1) + goto found; + + dm ++; + if (dm == dm_end) + dm = asb->s_map; + } while (--zone > 0); + + return -1; +found: + result -= dm->dm_startbit; + result += dm->dm_startblk; + + return result; +} + +/* + * calculate the amount of free blocks in the map. + * + * n=1 + * total_free = E(free_in_zone_n) + * nzones + */ +unsigned int +adfs_map_free(struct super_block *sb) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discmap *dm; + unsigned int total = 0; + unsigned int zone; + + dm = asb->s_map; + zone = asb->s_map_size; + + do { + total += scan_free_map(asb, dm++); + } while (--zone > 0); + + return signed_asl(total, asb->s_map2blk); +} + +int +adfs_map_lookup(struct super_block *sb, unsigned int frag_id, + unsigned int offset) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + unsigned int zone, mapoff; + int result; + + /* + * map & root fragment is special - it starts in the center of the + * disk. 
The other fragments start at zone (frag / ids_per_zone) + */ + if (frag_id == ADFS_ROOT_FRAG) + zone = asb->s_map_size >> 1; + else + zone = frag_id / asb->s_ids_per_zone; + + if (zone >= asb->s_map_size) + goto bad_fragment; + + /* Convert sector offset to map offset */ + mapoff = signed_asl(offset, -asb->s_map2blk); + + read_lock(&adfs_map_lock); + result = scan_map(asb, zone, frag_id, mapoff); + read_unlock(&adfs_map_lock); + + if (result > 0) { + unsigned int secoff; + + /* Calculate sector offset into map block */ + secoff = offset - signed_asl(mapoff, asb->s_map2blk); + return secoff + signed_asl(result, asb->s_map2blk); + } + + adfs_error(sb, "fragment 0x%04x at offset %d not found in map", + frag_id, offset); + return 0; + +bad_fragment: + adfs_error(sb, "invalid fragment 0x%04x (zone = %d, max = %d)", + frag_id, zone, asb->s_map_size); + return 0; +} diff --git a/fs/adfs/super.c b/fs/adfs/super.c new file mode 100644 index 00000000..c8bf36a1 --- /dev/null +++ b/fs/adfs/super.c @@ -0,0 +1,543 @@ +/* + * linux/fs/adfs/super.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "adfs.h" +#include "dir_f.h" +#include "dir_fplus.h" + +#define ADFS_DEFAULT_OWNER_MASK S_IRWXU +#define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO) + +void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...) +{ + char error_buf[128]; + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n", + sb->s_id, function ? ": " : "", + function ? function : "", error_buf); +} + +static int adfs_checkdiscrecord(struct adfs_discrecord *dr) +{ + int i; + + /* sector size must be 256, 512 or 1024 bytes */ + if (dr->log2secsize != 8 && + dr->log2secsize != 9 && + dr->log2secsize != 10) + return 1; + + /* idlen must be at least log2secsize + 3 */ + if (dr->idlen < dr->log2secsize + 3) + return 1; + + /* we cannot have such a large disc that we + * are unable to represent sector offsets in + * 32 bits. This works out at 2.0 TB. 
+ */ + if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize) + return 1; + + /* idlen must be no greater than 19 v2 [1.0] */ + if (dr->idlen > 19) + return 1; + + /* reserved bytes should be zero */ + for (i = 0; i < sizeof(dr->unused52); i++) + if (dr->unused52[i] != 0) + return 1; + + return 0; +} + +static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) +{ + unsigned int v0, v1, v2, v3; + int i; + + v0 = v1 = v2 = v3 = 0; + for (i = sb->s_blocksize - 4; i; i -= 4) { + v0 += map[i] + (v3 >> 8); + v3 &= 0xff; + v1 += map[i + 1] + (v0 >> 8); + v0 &= 0xff; + v2 += map[i + 2] + (v1 >> 8); + v1 &= 0xff; + v3 += map[i + 3] + (v2 >> 8); + v2 &= 0xff; + } + v0 += v3 >> 8; + v1 += map[1] + (v0 >> 8); + v2 += map[2] + (v1 >> 8); + v3 += map[3] + (v2 >> 8); + + return v0 ^ v1 ^ v2 ^ v3; +} + +static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) +{ + unsigned char crosscheck = 0, zonecheck = 1; + int i; + + for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { + unsigned char *map; + + map = dm[i].dm_bh->b_data; + + if (adfs_calczonecheck(sb, map) != map[0]) { + adfs_error(sb, "zone %d fails zonecheck", i); + zonecheck = 0; + } + crosscheck ^= map[3]; + } + if (crosscheck != 0xff) + adfs_error(sb, "crosscheck != 0xff"); + return crosscheck == 0xff && zonecheck; +} + +static void adfs_put_super(struct super_block *sb) +{ + int i; + struct adfs_sb_info *asb = ADFS_SB(sb); + + for (i = 0; i < asb->s_map_size; i++) + brelse(asb->s_map[i].dm_bh); + kfree(asb->s_map); + kfree(asb); + sb->s_fs_info = NULL; +} + +static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb); + + if (asb->s_uid != 0) + seq_printf(seq, ",uid=%u", asb->s_uid); + if (asb->s_gid != 0) + seq_printf(seq, ",gid=%u", asb->s_gid); + if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) + seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); + if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) + seq_printf(seq, ",othmask=%o", asb->s_other_mask); + if (asb->s_ftsuffix != 0) + seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix); + + return 0; +} + +enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_ownmask, "ownmask=%o"}, + {Opt_othmask, "othmask=%o"}, + {Opt_ftsuffix, "ftsuffix=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options) +{ + char *p; + struct adfs_sb_info *asb = ADFS_SB(sb); + int option; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(args, &option)) + return -EINVAL; + asb->s_uid = option; + break; + case Opt_gid: + if (match_int(args, &option)) + return -EINVAL; + asb->s_gid = option; + break; + case Opt_ownmask: + if (match_octal(args, &option)) + return -EINVAL; + asb->s_owner_mask = option; + break; + case Opt_othmask: + if (match_octal(args, &option)) + return -EINVAL; + asb->s_other_mask = option; + break; + case Opt_ftsuffix: + if (match_int(args, &option)) + return -EINVAL; + asb->s_ftsuffix = option; + break; + default: + printk("ADFS-fs: unrecognised mount option \"%s\" " + "or missing value\n", p); + return -EINVAL; + } + } + return 0; +} + +static int adfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + return parse_options(sb, 
data); +} + +static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct adfs_sb_info *sbi = ADFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ADFS_SUPER_MAGIC; + buf->f_namelen = sbi->s_namelen; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_size; + buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; + buf->f_bavail = + buf->f_bfree = adfs_map_free(sb); + buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static struct kmem_cache *adfs_inode_cachep; + +static struct inode *adfs_alloc_inode(struct super_block *sb) +{ + struct adfs_inode_info *ei; + ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void adfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); +} + +static void adfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, adfs_i_callback); +} + +static void init_once(void *foo) +{ + struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", + sizeof(struct adfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (adfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(adfs_inode_cachep); +} + +static const struct super_operations adfs_sops = { + .alloc_inode = adfs_alloc_inode, + .destroy_inode = adfs_destroy_inode, + .write_inode = adfs_write_inode, + .put_super = adfs_put_super, + .statfs = adfs_statfs, + .remount_fs = adfs_remount, + .show_options = adfs_show_options, +}; + +static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +{ + struct adfs_discmap *dm; + unsigned int map_addr, zone_size, nzones; + int i, zone; + struct adfs_sb_info *asb = ADFS_SB(sb); + + nzones = asb->s_map_size; + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + map_addr = (nzones >> 1) * zone_size - + ((nzones > 1) ? 
ADFS_DR_SIZE_BITS : 0); + map_addr = signed_asl(map_addr, asb->s_map2blk); + + asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + + dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL); + if (dm == NULL) { + adfs_error(sb, "not enough memory"); + return NULL; + } + + for (zone = 0; zone < nzones; zone++, map_addr++) { + dm[zone].dm_startbit = 0; + dm[zone].dm_endbit = zone_size; + dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; + dm[zone].dm_bh = sb_bread(sb, map_addr); + + if (!dm[zone].dm_bh) { + adfs_error(sb, "unable to read map"); + goto error_free; + } + } + + /* adjust the limits for the first and last map zones */ + i = zone - 1; + dm[0].dm_startblk = 0; + dm[0].dm_startbit = ADFS_DR_SIZE_BITS; + dm[i].dm_endbit = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) + + (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) + + (ADFS_DR_SIZE_BITS - i * zone_size); + + if (adfs_checkmap(sb, dm)) + return dm; + + adfs_error(sb, "map corrupted"); + +error_free: + while (--zone >= 0) + brelse(dm[zone].dm_bh); + + kfree(dm); + return NULL; +} + +static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits) +{ + unsigned long discsize; + + discsize = le32_to_cpu(dr->disc_size_high) << (32 - block_bits); + discsize |= le32_to_cpu(dr->disc_size) >> block_bits; + + return discsize; +} + +static int adfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct adfs_discrecord *dr; + struct buffer_head *bh; + struct object_info root_obj; + unsigned char *b_data; + struct adfs_sb_info *asb; + struct inode *root; + + sb->s_flags |= MS_NODIRATIME; + + asb = kzalloc(sizeof(*asb), GFP_KERNEL); + if (!asb) + return -ENOMEM; + sb->s_fs_info = asb; + + /* set default options */ + asb->s_uid = 0; + asb->s_gid = 0; + asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; + asb->s_ftsuffix = 0; + + if (parse_options(sb, data)) + goto error; + + sb_set_blocksize(sb, BLOCK_SIZE); + if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { + adfs_error(sb, "unable to read superblock"); + goto error; + } + + b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); + + if (adfs_checkbblk(b_data)) { + if (!silent) + printk("VFS: Can't find an adfs filesystem on dev " + "%s.\n", sb->s_id); + goto error_free_bh; + } + + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + + /* + * Do some sanity checks on the ADFS disc record + */ + if (adfs_checkdiscrecord(dr)) { + if (!silent) + printk("VPS: Can't find an adfs filesystem on dev " + "%s.\n", sb->s_id); + goto error_free_bh; + } + + brelse(bh); + if (sb_set_blocksize(sb, 1 << dr->log2secsize)) { + bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); + if (!bh) { + adfs_error(sb, "couldn't read superblock on " + "2nd try."); + goto error; + } + b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); + if (adfs_checkbblk(b_data)) { + adfs_error(sb, "disc record mismatch, very weird!"); + goto error_free_bh; + } + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + } else { + if (!silent) + printk(KERN_ERR "VFS: Unsupported blocksize on dev " + "%s.\n", sb->s_id); + goto error; + } + + /* + * blocksize on this device should now be set to the ADFS log2secsize + */ + + sb->s_magic = ADFS_SUPER_MAGIC; + asb->s_idlen = dr->idlen; + asb->s_map_size = dr->nzones | (dr->nzones_high << 8); + asb->s_map2blk = dr->log2bpmb - dr->log2secsize; + asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits); + asb->s_version = dr->format_version; + asb->s_log2sharesize = dr->log2sharesize; 
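+	/*
+	 * Editor's note -- an illustrative aside, not part of the original
+	 * commit.  s_map2blk, set above as log2bpmb - log2secsize, is the
+	 * signed log2 ratio between the map granularity and the sector size.
+	 * As a hypothetical example, with log2secsize = 9 (512-byte sectors)
+	 * and log2bpmb = 12 (each map bit describing 4096 bytes), s_map2blk
+	 * is 3, so one map bit spans 2^3 = 8 sectors.  signed_asl(x, s_map2blk)
+	 * then converts map units to sectors (shift left), and
+	 * signed_asl(x, -s_map2blk) converts sectors back to map units (shift
+	 * right), as used by adfs_map_lookup() and adfs_map_free() earlier in
+	 * this patch; a negative s_map2blk simply reverses the shifts.
+	 */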
+ + asb->s_map = adfs_read_map(sb, dr); + if (!asb->s_map) + goto error_free_bh; + + brelse(bh); + + /* + * set up enough so that we can read an inode + */ + sb->s_op = &adfs_sops; + + dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4); + + root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); + root_obj.name_len = 0; + /* Set root object date as 01 Jan 1987 00:00:00 */ + root_obj.loadaddr = 0xfff0003f; + root_obj.execaddr = 0xec22c000; + root_obj.size = ADFS_NEWDIR_SIZE; + root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | + ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; + root_obj.filetype = -1; + + /* + * If this is a F+ disk with variable length directories, + * get the root_size from the disc record. + */ + if (asb->s_version) { + root_obj.size = le32_to_cpu(dr->root_size); + asb->s_dir = &adfs_fplus_dir_ops; + asb->s_namelen = ADFS_FPLUS_NAME_LEN; + } else { + asb->s_dir = &adfs_f_dir_ops; + asb->s_namelen = ADFS_F_NAME_LEN; + } + /* + * ,xyz hex filetype suffix may be added by driver + * to files that have valid RISC OS filetype + */ + if (asb->s_ftsuffix) + asb->s_namelen += 4; + + sb->s_d_op = &adfs_dentry_operations; + root = adfs_iget(sb, &root_obj); + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + int i; + iput(root); + for (i = 0; i < asb->s_map_size; i++) + brelse(asb->s_map[i].dm_bh); + kfree(asb->s_map); + adfs_error(sb, "get root inode failed\n"); + goto error; + } + return 0; + +error_free_bh: + brelse(bh); +error: + sb->s_fs_info = NULL; + kfree(asb); + return -EINVAL; +} + +static struct dentry *adfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); +} + +static struct file_system_type adfs_fs_type = { + .owner = THIS_MODULE, + .name = "adfs", + .mount = adfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_adfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&adfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_adfs_fs(void) +{ + unregister_filesystem(&adfs_fs_type); + destroy_inodecache(); +} + +module_init(init_adfs_fs) +module_exit(exit_adfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/affs/Changes b/fs/affs/Changes new file mode 100644 index 00000000..a29409c1 --- /dev/null +++ b/fs/affs/Changes @@ -0,0 +1,343 @@ +(Note: I consider version numbers as cheap. That means +that I do not like numbers like 0.1 and the like for +things that can be used since quite some time. But +then, 3.1 doesn't mean 'perfectly stable', too.) + +Known bugs: +----------- + +- Doesn't work on the alpha. The only 64/32-bit + problem that I'm aware of (pointer/int conversion + in readdir()) gives compiler warnings but is + apparently not causing the failure, as directory + reads basically work (but all files are of size 0). + Alas, I've got no alpha to debug. :-( + +- The partition checker (drivers/block/genhd.c) + doesn't work with devices which have 256 byte + blocks (some very old SCSI drives). + +- The feature to automatically make the fs clean + might leave a trashed file system with the + bitmap flag set valid. + +- When a file is truncated to a size that is not + a multiple of the blocksize, the rest of the + last allocated block is not cleared. Well, + this fs never claimed to be Posix conformant. 
+ +Please direct bug reports to: zippel@linux-m68k.org + +Version 3.20 +------------ +- kill kernel lock +- fix for a possible bitmap corruption + +Version 3.19 +------------ + +- sizeof changes from Kernel Janitor Project +- several bug fixes found with fsx + +Version 3.18 +------------ + +- change to global min macro + warning fixes +- add module tags + +Version 3.17 +------------ + +- locking fixes +- wrong sign in __affs_hash_dentry +- remove unnecessary check in affs_new_inode +- enable international mode for dircache fs + +Version 3.16 +------------ + +- use mark_buffer_dirty_inode instead of mark_buffer_dirty. +- introduce affs_lock_{link|dir|ext}. + +Version 3.15 +------------ + +- disable link to directories until we can properly support them. +- locking fixes for link creation/removal. + +Version 3.14 +------------ + +- correctly cut off long file names for compares +- correctly initialize s_last_bmap + +Version 3.13 +------------ + +Major cleanup for 2.4 [Roman Zippel] +- new extended block handling +- new bitmap allocation functions +- locking should be safe for the future +- cleanup of some interfaces + +Version 3.12 +------------ + +more 2.4 fixes: [Roman Zippel] +- s_lock changes +- increased getblock mess +- clear meta blocks + +Version 3.11 +------------ + +- Converted to use 2.3.x page cache [Dave Jones ] +- Corruption in truncate() bugfix [Ken Tyler ] + +Version 3.10 +------------ + +- Changed partition checker to allow devices + with physical blocks != 512 bytes. + +- The partition checker now also ignores the + word at 0xd0 that Windows likes to write to. + +Version 3.9 +----------- + +- Moved cleanup from release_file() to put_inode(). + This makes the first one obsolete. + +- truncate() zeroes the unused remainder of a + partially used last block when a file is truncated. + It also marks the inode dirty now (which is not + really necessary as notify_change() will do + it anyway). + +- Added a few comments, fixed some typos (and + introduced some new ones), made the debug messages + more consistent. Changed a bad example in the + doc file (affs.txt). + +- Sets the NOEXEC flag in read_super() for old file + systems, since you can't run programs on them. + +Version 3.8 +----------- +Bill Hawes kindly reviewed the affs and sent me the +patches he did. They're marked (BH). Thanks, Bill! + +- Cleanup of error handling in read_super(). + Didn't release all resources in case of an + error. (BH) + +- put_inode() releases the ext cache only if it's + no longer needed. (BH) + +- One set of dentry callbacks is enough. (BH) + +- Cleanup of error handling in namei.c. (BH) + +- Cleanup of error handling in file.c. (BH) + +- The original blocksize of the device is + restored when the fs is unmounted. (BH) + +- getblock() did not invalidate the key cache + when it allocated a new block. + +- Removed some unnecessary locks as Bill + suggested. + +- Simplified match_name(), changed all hashing + and case insensitive name comparisons to use + uppercase. This makes the tolower() routines + obsolete. + +- Added mount option 'mufs' to force muFS + uid/gid interpretation. + +- File mode changes were not updated on disk. + This was fixed before, but somehow got lost. + +Version 3.7 +----------- + +- Added dentry callbacks to allow the dcache to + operate case insensitive and length ignorant + like the affs itself. + +- getblock() didn't update the lastblock field in the + inode if the fs was not an OFS. 
This bug only shows + up if a file was enlarged via truncate() and there + was not enough space. + +- Remove some more superfluous code left over from + the old link days ... + +- Fixed some oversights which were in patch 2.1.78. + +- Fixed a few typos. + +Version 3.6 +----------- + +- dentry changes. (Thanks to Jes Sorensen for his help.) + +- Fixed bug in balloc(): Superblock was not set dirty after + the bitmap was changed, so the bitmap wasn't sync'd. + +- Fixed nasty bug in find_new_zone(): If the current + zone number was zero, the loop didn't terminate, + causing a solid lock-up. + +- Removed support for old-style directory reads. + +- Fixed bug in add_entry(): When doing a sorted insert, + the pointer to the next entry in the hash chain wasn't + correctly byte-swapped. Since most of the users of the + affs use it on a 68k, they didn't notice. But why did + I not find this during my tests? + +- Fixed some oversights (version wasn't updated on some + directory changes). + +- Handling of hard links rewritten. To the VFS + they appear now as normal Unix links. They are + now resolved only once in lookup(). The backside + is that unlink(), rename() and rmdir() have to + be smart about them, but the result is worth the + effort. This also led to some code cleanup. + +- Changed name type to unsigned char; the test for + invalid filenames didn't work correctly. + (Thanks to Michael Krause for pointing at this.) + +- Changed mapping of executable flag. + +- Changed all network byte-order macros to the + recommended ones. + +- Added a remount function, so attempts to remount + a dircache filesystem or one with errors read/write + can be trapped. Previously, ro remounts didn't + flush the super block, and rw remounts didn't + create allocation zones ... + +- Call shrink_dcache_parent() in rmdir(). + (Thanks to Bill Hawes.) + +- Permission checks in unlink(). + +- Allow mounting of volumes with superfluous + bitmap pointers read only, also allows them + to be remounted read/write. + +- Owner/Group defaults now to the fs user (i.e. + the one that mounted it) instead of root. This + obsoletes the mount options uid and gid. + +- Argument to volume option could overflow the + name buffer. It is now silently truncated to + 30 characters. (Damn it! This kind of bug + is too embarrassing.) + +- Split inode.c into 2 files, the superblock + routines desperately wanted their own file. + +- truncate() didn't allocate an extension block + cache. If a file was extended by means of + truncate(), this led to an Oops. + +- fsuser is now checked last. + +- rename() will not ignore changes in filename + casing any more (though mv(1) still won't allow + you to do "mv oldname OldName"). + +Version 3.5 +----------- + +- Extension block caches are now allocated on + demand instead of when a file is opened, as + files can be read and written without opening + them (e. g. the loopback device does this). + +- Removed an unused function. + +Version 3.4 +----------- + +- Hash chains are now sorted by block numbers. + (Thanks to Kars de Jong for finding this.) +- Removed all unnecessary external symbols. + +Version 3.3 +----------- + +- Tried to make all types 'correct' and consistent. +- Errors and warnings are now reported via a + function. They are all prefixed by a severity + and have the same appearance: + "AFFS: : " + (There's one exception to this, as in that function + is no pointer to the super block available.) +- The filesystem is remounted read-only after an + error. 
+- The names of newly created filesystem objects are + now checked for validity. +- Minor cleanups in comments. +- Added this Changes file. At last! + +Version 3.2 +----------- + +- Extension block cache: Reading/writing of huge files + (several MB) is much faster (of course the added + overhead slows down opening, but this is hardly + noticeable). +- The same get_block()-routine can now be used for + both OFS and FFS. +- The super block is now searched in the block that + was calculated and in the one following. This + should remedy the round-off error introduced by + the 1-k blocks that Linux uses. +- Minor changes to adhere to the new VFS interface. +- The number of used blocks is now also calculated + if the filesystem is mounted read-only. +- Prefixed some constants with AFFS_ to avoid name + clashes. +- Removed 'EXPERIMENTAL' status. + +Version 3.1 +----------- + +- Fixed a nasty bug which didn't allow read-only + mounts. +- Allow dir-cache filesystems to be mounted + read only. +- OFS support. +- Several other changes I just cannot remember + any more. + +Version 3.0 +----------- + +- Almost complete rewrite for the new VFS + interface in Linux 1.3. +- Write support. +- Support for hard and symbolic links. +- Lots of things I remember even less ... + +Version 2.0 +----------- + +- Fixed a few things to get it compiled. +- Automatic root block calculation. +- Partition checker for genhd.c + +======================================== + +Let's just call Ray Burr's original affs +'Version 1.0'. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig new file mode 100644 index 00000000..cfad9afb --- /dev/null +++ b/fs/affs/Kconfig @@ -0,0 +1,21 @@ +config AFFS_FS + tristate "Amiga FFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + The Fast File System (FFS) is the common file system used on hard + disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y + if you want to be able to read and write files from and to an Amiga + FFS partition on your hard drive. Amiga floppies however cannot be + read with this driver due to an incompatibility of the floppy + controller used in an Amiga and the standard floppy controller in + PCs and workstations. Read + and . + + With this driver you can also mount disk files used by Bernd + Schmidt's Un*X Amiga Emulator + (). + If you want to do this, you will also need to say Y or M to "Loop + device support", above. + + To compile this file system support as a module, choose M here: the + module will be called affs. If unsure, say N. diff --git a/fs/affs/Makefile b/fs/affs/Makefile new file mode 100644 index 00000000..3988b4a7 --- /dev/null +++ b/fs/affs/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux affs filesystem routines. +# + +#ccflags-y := -DDEBUG=1 + +obj-$(CONFIG_AFFS_FS) += affs.o + +affs-objs := super.o namei.o inode.o file.o dir.o amigaffs.o bitmap.o symlink.o diff --git a/fs/affs/affs.h b/fs/affs/affs.h new file mode 100644 index 00000000..0e95f73a --- /dev/null +++ b/fs/affs/affs.h @@ -0,0 +1,305 @@ +#include +#include +#include +#include +#include + +/* AmigaOS allows file names with up to 30 characters length. + * Names longer than that will be silently truncated. If you + * want to disallow this, comment out the following #define. + * Creating filesystem objects with longer names will then + * result in an error (ENAMETOOLONG). + */ +/*#define AFFS_NO_TRUNCATE */ + +/* Ugly macros make the code more pretty. 
*/ + +#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) +#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) +#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) + +#ifdef __LITTLE_ENDIAN +#define BO_EXBITS 0x18UL +#elif defined(__BIG_ENDIAN) +#define BO_EXBITS 0x00UL +#else +#error Endianness must be known for affs to work. +#endif + +#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) +#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) +#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) +#define AFFS_ROOT_TAIL(sb, bh) ((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail))) +#define AFFS_DATA_HEAD(bh) ((struct affs_data_head *)(bh)->b_data) +#define AFFS_DATA(bh) (((struct affs_data_head *)(bh)->b_data)->data) + +#define AFFS_CACHE_SIZE PAGE_SIZE + +#define AFFS_MAX_PREALLOC 32 +#define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) +#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) +#define AFFS_AC_MASK (AFFS_AC_SIZE-1) + +struct affs_ext_key { + u32 ext; /* idx of the extended block */ + u32 key; /* block number */ +}; + +/* + * affs fs inode data in memory + */ +struct affs_inode_info { + atomic_t i_opencnt; + struct semaphore i_link_lock; /* Protects internal inode access. */ + struct semaphore i_ext_lock; /* Protects internal inode access. */ +#define i_hash_lock i_ext_lock + u32 i_blkcnt; /* block count */ + u32 i_extcnt; /* extended block count */ + u32 *i_lc; /* linear cache of extended blocks */ + u32 i_lc_size; + u32 i_lc_shift; + u32 i_lc_mask; + struct affs_ext_key *i_ac; /* associative cache of extended blocks */ + u32 i_ext_last; /* last accessed extended block */ + struct buffer_head *i_ext_bh; /* bh of last extended block */ + loff_t mmu_private; + u32 i_protect; /* unused attribute bits */ + u32 i_lastalloc; /* last allocated block */ + int i_pa_cnt; /* number of preallocated blocks */ + struct inode vfs_inode; +}; + +/* short cut to get to the affs specific inode data */ +static inline struct affs_inode_info *AFFS_I(struct inode *inode) +{ + return list_entry(inode, struct affs_inode_info, vfs_inode); +} + +/* + * super-block data in memory + * + * Block numbers are adjusted for their actual size + * + */ + +struct affs_bm_info { + u32 bm_key; /* Disk block number */ + u32 bm_free; /* Free blocks in here */ +}; + +struct affs_sb_info { + int s_partition_size; /* Partition size in blocks. */ + int s_reserved; /* Number of reserved blocks. */ + //u32 s_blksize; /* Initial device blksize */ + u32 s_data_blksize; /* size of the data block w/o header */ + u32 s_root_block; /* FFS root block number. */ + int s_hashsize; /* Size of hash table. */ + unsigned long s_flags; /* See below. */ + uid_t s_uid; /* uid to override */ + gid_t s_gid; /* gid to override */ + umode_t s_mode; /* mode to override */ + struct buffer_head *s_root_bh; /* Cached root block. */ + struct mutex s_bmlock; /* Protects bitmap access. */ + struct affs_bm_info *s_bitmap; /* Bitmap infos. */ + u32 s_bmap_count; /* # of bitmap blocks. */ + u32 s_bmap_bits; /* # of bits in one bitmap blocks */ + u32 s_last_bmap; + struct buffer_head *s_bmap_bh; + char *s_prefix; /* Prefix for volumes and assigns. */ + char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ +}; + +#define SF_INTL 0x0001 /* International filesystem. 
*/ +#define SF_BM_VALID 0x0002 /* Bitmap is valid. */ +#define SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ +#define SF_QUIET 0x0008 /* chmod errors will be not reported */ +#define SF_SETUID 0x0010 /* Ignore Amiga uid */ +#define SF_SETGID 0x0020 /* Ignore Amiga gid */ +#define SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ +#define SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ +#define SF_OFS 0x0200 /* Old filesystem */ +#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ +#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ + +/* short cut to get to the affs specific sb data */ +static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* amigaffs.c */ + +extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); +extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); +extern int affs_remove_header(struct dentry *dentry); +extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); +extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); +extern void secs_to_datestamp(time_t secs, struct affs_date *ds); +extern mode_t prot_to_mode(u32 prot); +extern void mode_to_prot(struct inode *inode); +extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); +extern int affs_check_name(const unsigned char *name, int len); +extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); + +/* bitmap. c */ + +extern u32 affs_count_free_blocks(struct super_block *s); +extern void affs_free_block(struct super_block *sb, u32 block); +extern u32 affs_alloc_block(struct inode *inode, u32 goal); +extern int affs_init_bitmap(struct super_block *sb, int *flags); +extern void affs_free_bitmap(struct super_block *sb); + +/* namei.c */ + +extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); +extern int affs_unlink(struct inode *dir, struct dentry *dentry); +extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); +extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +extern int affs_rmdir(struct inode *dir, struct dentry *dentry); +extern int affs_link(struct dentry *olddentry, struct inode *dir, + struct dentry *dentry); +extern int affs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); + +/* inode.c */ + +extern unsigned long affs_parent_ino(struct inode *dir); +extern struct inode *affs_new_inode(struct inode *dir); +extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); +extern void affs_evict_inode(struct inode *inode); +extern struct inode *affs_iget(struct super_block *sb, + unsigned long ino); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); +extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); + +/* file.c */ + +void affs_free_prealloc(struct inode *inode); +extern void affs_truncate(struct inode *); +int affs_file_fsync(struct file *, int); + +/* dir.c */ + +extern void affs_dir_truncate(struct inode *); + +/* jump tables */ + +extern const struct inode_operations 
affs_file_inode_operations; +extern const struct inode_operations affs_dir_inode_operations; +extern const struct inode_operations affs_symlink_inode_operations; +extern const struct file_operations affs_file_operations; +extern const struct file_operations affs_file_operations_ofs; +extern const struct file_operations affs_dir_operations; +extern const struct address_space_operations affs_symlink_aops; +extern const struct address_space_operations affs_aops; +extern const struct address_space_operations affs_aops_ofs; + +extern const struct dentry_operations affs_dentry_operations; +extern const struct dentry_operations affs_intl_dentry_operations; + +static inline void +affs_set_blocksize(struct super_block *sb, int size) +{ + sb_set_blocksize(sb, size); +} +static inline struct buffer_head * +affs_bread(struct super_block *sb, int block) +{ + pr_debug("affs_bread: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) + return sb_bread(sb, block); + return NULL; +} +static inline struct buffer_head * +affs_getblk(struct super_block *sb, int block) +{ + pr_debug("affs_getblk: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) + return sb_getblk(sb, block); + return NULL; +} +static inline struct buffer_head * +affs_getzeroblk(struct super_block *sb, int block) +{ + struct buffer_head *bh; + pr_debug("affs_getzeroblk: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { + bh = sb_getblk(sb, block); + lock_buffer(bh); + memset(bh->b_data, 0 , sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + return bh; + } + return NULL; +} +static inline struct buffer_head * +affs_getemptyblk(struct super_block *sb, int block) +{ + struct buffer_head *bh; + pr_debug("affs_getemptyblk: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { + bh = sb_getblk(sb, block); + wait_on_buffer(bh); + set_buffer_uptodate(bh); + return bh; + } + return NULL; +} +static inline void +affs_brelse(struct buffer_head *bh) +{ + if (bh) + pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr); + brelse(bh); +} + +static inline void +affs_adjust_checksum(struct buffer_head *bh, u32 val) +{ + u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]); + ((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val); +} +static inline void +affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) +{ + u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]); + ((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val); +} + +static inline void +affs_lock_link(struct inode *inode) +{ + down(&AFFS_I(inode)->i_link_lock); +} +static inline void +affs_unlock_link(struct inode *inode) +{ + up(&AFFS_I(inode)->i_link_lock); +} +static inline void +affs_lock_dir(struct inode *inode) +{ + down(&AFFS_I(inode)->i_hash_lock); +} +static inline void +affs_unlock_dir(struct inode *inode) +{ + up(&AFFS_I(inode)->i_hash_lock); +} +static inline void +affs_lock_ext(struct inode *inode) +{ + down(&AFFS_I(inode)->i_ext_lock); +} +static inline void +affs_unlock_ext(struct inode *inode) +{ + up(&AFFS_I(inode)->i_ext_lock); +} diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c new file mode 100644 index 00000000..3a4557e8 --- /dev/null +++ b/fs/affs/amigaffs.c @@ -0,0 +1,515 @@ +/* + * linux/fs/affs/amigaffs.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Amiga FFS filesystem. 
+ * + * Please send bug reports to: hjw@zvw.de + */ + +#include "affs.h" + +extern struct timezone sys_tz; + +static char ErrorBuffer[256]; + +/* + * Functions for accessing Amiga-FFS structures. + */ + + +/* Insert a header block bh into the directory dir + * caller must hold AFFS_DIR->i_hash_lock! + */ + +int +affs_insert_hash(struct inode *dir, struct buffer_head *bh) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *dir_bh; + u32 ino, hash_ino; + int offset; + + ino = bh->b_blocknr; + offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); + + pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino); + + dir_bh = affs_bread(sb, dir->i_ino); + if (!dir_bh) + return -EIO; + + hash_ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[offset]); + while (hash_ino) { + affs_brelse(dir_bh); + dir_bh = affs_bread(sb, hash_ino); + if (!dir_bh) + return -EIO; + hash_ino = be32_to_cpu(AFFS_TAIL(sb, dir_bh)->hash_chain); + } + AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); + AFFS_TAIL(sb, bh)->hash_chain = 0; + affs_fix_checksum(sb, bh); + + if (dir->i_ino == dir_bh->b_blocknr) + AFFS_HEAD(dir_bh)->table[offset] = cpu_to_be32(ino); + else + AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino); + + affs_adjust_checksum(dir_bh, ino); + mark_buffer_dirty_inode(dir_bh, dir); + affs_brelse(dir_bh); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_version++; + mark_inode_dirty(dir); + + return 0; +} + +/* Remove a header block from its directory. + * caller must hold AFFS_DIR->i_hash_lock! + */ + +int +affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) +{ + struct super_block *sb; + struct buffer_head *bh; + u32 rem_ino, hash_ino; + __be32 ino; + int offset, retval; + + sb = dir->i_sb; + rem_ino = rem_bh->b_blocknr; + offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); + pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset); + + bh = affs_bread(sb, dir->i_ino); + if (!bh) + return -EIO; + + retval = -ENOENT; + hash_ino = be32_to_cpu(AFFS_HEAD(bh)->table[offset]); + while (hash_ino) { + if (hash_ino == rem_ino) { + ino = AFFS_TAIL(sb, rem_bh)->hash_chain; + if (dir->i_ino == bh->b_blocknr) + AFFS_HEAD(bh)->table[offset] = ino; + else + AFFS_TAIL(sb, bh)->hash_chain = ino; + affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino); + mark_buffer_dirty_inode(bh, dir); + AFFS_TAIL(sb, rem_bh)->parent = 0; + retval = 0; + break; + } + affs_brelse(bh); + bh = affs_bread(sb, hash_ino); + if (!bh) + return -EIO; + hash_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); + } + + affs_brelse(bh); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_version++; + mark_inode_dirty(dir); + + return retval; +} + +static void +affs_fix_dcache(struct dentry *dentry, u32 entry_ino) +{ + struct inode *inode = dentry->d_inode; + void *data = dentry->d_fsdata; + struct list_head *head, *next; + + spin_lock(&inode->i_lock); + head = &inode->i_dentry; + next = head->next; + while (next != head) { + dentry = list_entry(next, struct dentry, d_alias); + if (entry_ino == (u32)(long)dentry->d_fsdata) { + dentry->d_fsdata = data; + break; + } + next = next->next; + } + spin_unlock(&inode->i_lock); +} + + +/* Remove header from link chain */ + +static int +affs_remove_link(struct dentry *dentry) +{ + struct inode *dir, *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL, *link_bh = NULL; + u32 link_ino, ino; + int retval; + + 
pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino); + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + link_ino = (u32)(long)dentry->d_fsdata; + if (inode->i_ino == link_ino) { + /* we can't remove the head of the link, as its blocknr is still used as ino, + * so we remove the block of the first link instead. + */ + link_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain); + link_bh = affs_bread(sb, link_ino); + if (!link_bh) + goto done; + + dir = affs_iget(sb, be32_to_cpu(AFFS_TAIL(sb, link_bh)->parent)); + if (IS_ERR(dir)) { + retval = PTR_ERR(dir); + goto done; + } + + affs_lock_dir(dir); + affs_fix_dcache(dentry, link_ino); + retval = affs_remove_hash(dir, link_bh); + if (retval) { + affs_unlock_dir(dir); + goto done; + } + mark_buffer_dirty_inode(link_bh, inode); + + memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32); + retval = affs_insert_hash(dir, bh); + if (retval) { + affs_unlock_dir(dir); + goto done; + } + mark_buffer_dirty_inode(bh, inode); + + affs_unlock_dir(dir); + iput(dir); + } else { + link_bh = affs_bread(sb, link_ino); + if (!link_bh) + goto done; + } + + while ((ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain)) != 0) { + if (ino == link_ino) { + __be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain; + AFFS_TAIL(sb, bh)->link_chain = ino2; + affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino); + mark_buffer_dirty_inode(bh, inode); + retval = 0; + /* Fix the link count, if bh is a normal header block without links */ + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + case ST_LINKDIR: + case ST_LINKFILE: + break; + default: + if (!AFFS_TAIL(sb, bh)->link_chain) + inode->i_nlink = 1; + } + affs_free_block(sb, link_ino); + goto done; + } + affs_brelse(bh); + bh = affs_bread(sb, ino); + if (!bh) + goto done; + } + retval = -ENOENT; +done: + affs_brelse(link_bh); + affs_brelse(bh); + return retval; +} + + +static int +affs_empty_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + int retval, size; + + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + retval = -ENOTEMPTY; + for (size = AFFS_SB(sb)->s_hashsize - 1; size >= 0; size--) + if (AFFS_HEAD(bh)->table[size]) + goto not_empty; + retval = 0; +not_empty: + affs_brelse(bh); +done: + return retval; +} + + +/* Remove a filesystem object. If the object to be removed has + * links to it, one of the links must be changed to inherit + * the file or directory. As above, any inode will do. + * The buffer will not be freed. If the header is a link, the + * block will be marked as free. + * This function returns a negative error number in case of + * an error, else 0 if the inode is to be deleted or 1 if not. 
+ */ + +int +affs_remove_header(struct dentry *dentry) +{ + struct super_block *sb; + struct inode *inode, *dir; + struct buffer_head *bh = NULL; + int retval; + + dir = dentry->d_parent->d_inode; + sb = dir->i_sb; + + retval = -ENOENT; + inode = dentry->d_inode; + if (!inode) + goto done; + + pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino); + retval = -EIO; + bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); + if (!bh) + goto done; + + affs_lock_link(inode); + affs_lock_dir(dir); + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + case ST_USERDIR: + /* if we ever want to support links to dirs + * i_hash_lock of the inode must only be + * taken after some checks + */ + affs_lock_dir(inode); + retval = affs_empty_dir(inode); + affs_unlock_dir(inode); + if (retval) + goto done_unlock; + break; + default: + break; + } + + retval = affs_remove_hash(dir, bh); + if (retval) + goto done_unlock; + mark_buffer_dirty_inode(bh, inode); + + affs_unlock_dir(dir); + + if (inode->i_nlink > 1) + retval = affs_remove_link(dentry); + else + inode->i_nlink = 0; + affs_unlock_link(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +done: + affs_brelse(bh); + return retval; + +done_unlock: + affs_unlock_dir(dir); + affs_unlock_link(inode); + goto done; +} + +/* Checksum a block, do various consistency checks and optionally return + the blocks type number. DATA points to the block. If their pointers + are non-null, *PTYPE and *STYPE are set to the primary and secondary + block types respectively, *HASHSIZE is set to the size of the hashtable + (which lets us calculate the block size). + Returns non-zero if the block is not consistent. */ + +u32 +affs_checksum_block(struct super_block *sb, struct buffer_head *bh) +{ + __be32 *ptr = (__be32 *)bh->b_data; + u32 sum; + int bsize; + + sum = 0; + for (bsize = sb->s_blocksize / sizeof(__be32); bsize > 0; bsize--) + sum += be32_to_cpu(*ptr++); + return sum; +} + +/* + * Calculate the checksum of a disk block and store it + * at the indicated position. 
+ */ + +void +affs_fix_checksum(struct super_block *sb, struct buffer_head *bh) +{ + int cnt = sb->s_blocksize / sizeof(__be32); + __be32 *ptr = (__be32 *)bh->b_data; + u32 checksum; + __be32 *checksumptr; + + checksumptr = ptr + 5; + *checksumptr = 0; + for (checksum = 0; cnt > 0; ptr++, cnt--) + checksum += be32_to_cpu(*ptr); + *checksumptr = cpu_to_be32(-checksum); +} + +void +secs_to_datestamp(time_t secs, struct affs_date *ds) +{ + u32 days; + u32 minute; + + secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60); + if (secs < 0) + secs = 0; + days = secs / 86400; + secs -= days * 86400; + minute = secs / 60; + secs -= minute * 60; + + ds->days = cpu_to_be32(days); + ds->mins = cpu_to_be32(minute); + ds->ticks = cpu_to_be32(secs * 50); +} + +mode_t +prot_to_mode(u32 prot) +{ + int mode = 0; + + if (!(prot & FIBF_NOWRITE)) + mode |= S_IWUSR; + if (!(prot & FIBF_NOREAD)) + mode |= S_IRUSR; + if (!(prot & FIBF_NOEXECUTE)) + mode |= S_IXUSR; + if (prot & FIBF_GRP_WRITE) + mode |= S_IWGRP; + if (prot & FIBF_GRP_READ) + mode |= S_IRGRP; + if (prot & FIBF_GRP_EXECUTE) + mode |= S_IXGRP; + if (prot & FIBF_OTR_WRITE) + mode |= S_IWOTH; + if (prot & FIBF_OTR_READ) + mode |= S_IROTH; + if (prot & FIBF_OTR_EXECUTE) + mode |= S_IXOTH; + + return mode; +} + +void +mode_to_prot(struct inode *inode) +{ + u32 prot = AFFS_I(inode)->i_protect; + mode_t mode = inode->i_mode; + + if (!(mode & S_IXUSR)) + prot |= FIBF_NOEXECUTE; + if (!(mode & S_IRUSR)) + prot |= FIBF_NOREAD; + if (!(mode & S_IWUSR)) + prot |= FIBF_NOWRITE; + if (mode & S_IXGRP) + prot |= FIBF_GRP_EXECUTE; + if (mode & S_IRGRP) + prot |= FIBF_GRP_READ; + if (mode & S_IWGRP) + prot |= FIBF_GRP_WRITE; + if (mode & S_IXOTH) + prot |= FIBF_OTR_EXECUTE; + if (mode & S_IROTH) + prot |= FIBF_OTR_READ; + if (mode & S_IWOTH) + prot |= FIBF_OTR_WRITE; + + AFFS_I(inode)->i_protect = prot; +} + +void +affs_error(struct super_block *sb, const char *function, const char *fmt, ...) +{ + va_list args; + + va_start(args,fmt); + vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); + va_end(args); + + printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id, + function,ErrorBuffer); + if (!(sb->s_flags & MS_RDONLY)) + printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; +} + +void +affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) +{ + va_list args; + + va_start(args,fmt); + vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); + va_end(args); + + printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id, + function,ErrorBuffer); +} + +/* Check if the name is valid for a affs object. */ + +int +affs_check_name(const unsigned char *name, int len) +{ + int i; + + if (len > 30) +#ifdef AFFS_NO_TRUNCATE + return -ENAMETOOLONG; +#else + len = 30; +#endif + + for (i = 0; i < len; i++) { + if (name[i] < ' ' || name[i] == ':' + || (name[i] > 0x7e && name[i] < 0xa0)) + return -EINVAL; + } + + return 0; +} + +/* This function copies name to bstr, with at most 30 + * characters length. The bstr will be prepended by + * a length byte. + * NOTE: The name will must be already checked by + * affs_check_name()! 
+ */ + +int +affs_copy_name(unsigned char *bstr, struct dentry *dentry) +{ + int len = min(dentry->d_name.len, 30u); + + *bstr++ = len; + memcpy(bstr, dentry->d_name.name, len); + return len; +} diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c new file mode 100644 index 00000000..3e262711 --- /dev/null +++ b/fs/affs/bitmap.c @@ -0,0 +1,390 @@ +/* + * linux/fs/affs/bitmap.c + * + * (c) 1996 Hans-Joachim Widmaier + * + * bitmap.c contains the code that handles all bitmap related stuff - + * block allocation, deallocation, calculation of free space. + */ + +#include +#include "affs.h" + +/* This is, of course, shamelessly stolen from fs/minix */ + +static const int nibblemap[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 }; + +static u32 +affs_count_free_bits(u32 blocksize, const void *data) +{ + const u32 *map; + u32 free; + u32 tmp; + + map = data; + free = 0; + for (blocksize /= 4; blocksize > 0; blocksize--) { + tmp = *map++; + while (tmp) { + free += nibblemap[tmp & 0xf]; + tmp >>= 4; + } + } + + return free; +} + +u32 +affs_count_free_blocks(struct super_block *sb) +{ + struct affs_bm_info *bm; + u32 free; + int i; + + pr_debug("AFFS: count_free_blocks()\n"); + + if (sb->s_flags & MS_RDONLY) + return 0; + + mutex_lock(&AFFS_SB(sb)->s_bmlock); + + bm = AFFS_SB(sb)->s_bitmap; + free = 0; + for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--) + free += bm->bm_free; + + mutex_unlock(&AFFS_SB(sb)->s_bmlock); + + return free; +} + +void +affs_free_block(struct super_block *sb, u32 block) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct affs_bm_info *bm; + struct buffer_head *bh; + u32 blk, bmap, bit, mask, tmp; + __be32 *data; + + pr_debug("AFFS: free_block(%u)\n", block); + + if (block > sbi->s_partition_size) + goto err_range; + + blk = block - sbi->s_reserved; + bmap = blk / sbi->s_bmap_bits; + bit = blk % sbi->s_bmap_bits; + bm = &sbi->s_bitmap[bmap]; + + mutex_lock(&sbi->s_bmlock); + + bh = sbi->s_bmap_bh; + if (sbi->s_last_bmap != bmap) { + affs_brelse(bh); + bh = affs_bread(sb, bm->bm_key); + if (!bh) + goto err_bh_read; + sbi->s_bmap_bh = bh; + sbi->s_last_bmap = bmap; + } + + mask = 1 << (bit & 31); + data = (__be32 *)bh->b_data + bit / 32 + 1; + + /* mark block free */ + tmp = be32_to_cpu(*data); + if (tmp & mask) + goto err_free; + *data = cpu_to_be32(tmp | mask); + + /* fix checksum */ + tmp = be32_to_cpu(*(__be32 *)bh->b_data); + *(__be32 *)bh->b_data = cpu_to_be32(tmp - mask); + + mark_buffer_dirty(bh); + sb->s_dirt = 1; + bm->bm_free++; + + mutex_unlock(&sbi->s_bmlock); + return; + +err_free: + affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block); + mutex_unlock(&sbi->s_bmlock); + return; + +err_bh_read: + affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; + mutex_unlock(&sbi->s_bmlock); + return; + +err_range: + affs_error(sb, "affs_free_block","Block %u outside partition", block); + return; +} + +/* + * Allocate a block in the given allocation zone. + * Since we have to byte-swap the bitmap on little-endian + * machines, this is rather expensive. Therefore we will + * preallocate up to 16 blocks from the same word, if + * possible. We are not doing preallocations in the + * header zone, though. 
+ */ + +u32 +affs_alloc_block(struct inode *inode, u32 goal) +{ + struct super_block *sb; + struct affs_sb_info *sbi; + struct affs_bm_info *bm; + struct buffer_head *bh; + __be32 *data, *enddata; + u32 blk, bmap, bit, mask, mask2, tmp; + int i; + + sb = inode->i_sb; + sbi = AFFS_SB(sb); + + pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + + if (AFFS_I(inode)->i_pa_cnt) { + pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); + AFFS_I(inode)->i_pa_cnt--; + return ++AFFS_I(inode)->i_lastalloc; + } + + if (!goal || goal > sbi->s_partition_size) { + if (goal) + affs_warning(sb, "affs_balloc", "invalid goal %d", goal); + //if (!AFFS_I(inode)->i_last_block) + // affs_warning(sb, "affs_balloc", "no last alloc block"); + goal = sbi->s_reserved; + } + + blk = goal - sbi->s_reserved; + bmap = blk / sbi->s_bmap_bits; + bm = &sbi->s_bitmap[bmap]; + + mutex_lock(&sbi->s_bmlock); + + if (bm->bm_free) + goto find_bmap_bit; + +find_bmap: + /* search for the next bmap buffer with free bits */ + i = sbi->s_bmap_count; + do { + if (--i < 0) + goto err_full; + bmap++; + bm++; + if (bmap < sbi->s_bmap_count) + continue; + /* restart search at zero */ + bmap = 0; + bm = sbi->s_bitmap; + } while (!bm->bm_free); + blk = bmap * sbi->s_bmap_bits; + +find_bmap_bit: + + bh = sbi->s_bmap_bh; + if (sbi->s_last_bmap != bmap) { + affs_brelse(bh); + bh = affs_bread(sb, bm->bm_key); + if (!bh) + goto err_bh_read; + sbi->s_bmap_bh = bh; + sbi->s_last_bmap = bmap; + } + + /* find an unused block in this bitmap block */ + bit = blk % sbi->s_bmap_bits; + data = (__be32 *)bh->b_data + bit / 32 + 1; + enddata = (__be32 *)((u8 *)bh->b_data + sb->s_blocksize); + mask = ~0UL << (bit & 31); + blk &= ~31UL; + + tmp = be32_to_cpu(*data); + if (tmp & mask) + goto find_bit; + + /* scan the rest of the buffer */ + do { + blk += 32; + if (++data >= enddata) + /* didn't find something, can only happen + * if scan didn't start at 0, try next bmap + */ + goto find_bmap; + } while (!*data); + tmp = be32_to_cpu(*data); + mask = ~0; + +find_bit: + /* finally look for a free bit in the word */ + bit = ffs(tmp & mask) - 1; + blk += bit + sbi->s_reserved; + mask2 = mask = 1 << (bit & 31); + AFFS_I(inode)->i_lastalloc = blk; + + /* prealloc as much as possible within this word */ + while ((mask2 <<= 1)) { + if (!(tmp & mask2)) + break; + AFFS_I(inode)->i_pa_cnt++; + mask |= mask2; + } + bm->bm_free -= AFFS_I(inode)->i_pa_cnt + 1; + + *data = cpu_to_be32(tmp & ~mask); + + /* fix checksum */ + tmp = be32_to_cpu(*(__be32 *)bh->b_data); + *(__be32 *)bh->b_data = cpu_to_be32(tmp + mask); + + mark_buffer_dirty(bh); + sb->s_dirt = 1; + + mutex_unlock(&sbi->s_bmlock); + + pr_debug("%d\n", blk); + return blk; + +err_bh_read: + affs_error(sb,"affs_read_block","Cannot read bitmap block %u", bm->bm_key); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; +err_full: + mutex_unlock(&sbi->s_bmlock); + pr_debug("failed\n"); + return 0; +} + +int affs_init_bitmap(struct super_block *sb, int *flags) +{ + struct affs_bm_info *bm; + struct buffer_head *bmap_bh = NULL, *bh = NULL; + __be32 *bmap_blk; + u32 size, blk, end, offset, mask; + int i, res = 0; + struct affs_sb_info *sbi = AFFS_SB(sb); + + if (*flags & MS_RDONLY) + return 0; + + if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { + printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n", + sb->s_id); + *flags |= MS_RDONLY; + return 0; + } + + sbi->s_last_bmap = ~0; + sbi->s_bmap_bh = NULL; + sbi->s_bmap_bits = sb->s_blocksize * 8 - 32; + sbi->s_bmap_count = (sbi->s_partition_size - 
sbi->s_reserved + + sbi->s_bmap_bits - 1) / sbi->s_bmap_bits; + size = sbi->s_bmap_count * sizeof(*bm); + bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL); + if (!sbi->s_bitmap) { + printk(KERN_ERR "AFFS: Bitmap allocation failed\n"); + return -ENOMEM; + } + + bmap_blk = (__be32 *)sbi->s_root_bh->b_data; + blk = sb->s_blocksize / 4 - 49; + end = blk + 25; + + for (i = sbi->s_bmap_count; i > 0; bm++, i--) { + affs_brelse(bh); + + bm->bm_key = be32_to_cpu(bmap_blk[blk]); + bh = affs_bread(sb, bm->bm_key); + if (!bh) { + printk(KERN_ERR "AFFS: Cannot read bitmap\n"); + res = -EIO; + goto out; + } + if (affs_checksum_block(sb, bh)) { + printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n", + bm->bm_key, sb->s_id); + *flags |= MS_RDONLY; + goto out; + } + pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key); + bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + + /* Don't try read the extension if this is the last block, + * but we also need the right bm pointer below + */ + if (++blk < end || i == 1) + continue; + if (bmap_bh) + affs_brelse(bmap_bh); + bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk])); + if (!bmap_bh) { + printk(KERN_ERR "AFFS: Cannot read bitmap extension\n"); + res = -EIO; + goto out; + } + bmap_blk = (__be32 *)bmap_bh->b_data; + blk = 0; + end = sb->s_blocksize / 4 - 1; + } + + offset = (sbi->s_partition_size - sbi->s_reserved) % sbi->s_bmap_bits; + mask = ~(0xFFFFFFFFU << (offset & 31)); + pr_debug("last word: %d %d %d\n", offset, offset / 32 + 1, mask); + offset = offset / 32 + 1; + + if (mask) { + u32 old, new; + + /* Mark unused bits in the last word as allocated */ + old = be32_to_cpu(((__be32 *)bh->b_data)[offset]); + new = old & mask; + //if (old != new) { + ((__be32 *)bh->b_data)[offset] = cpu_to_be32(new); + /* fix checksum */ + //new -= old; + //old = be32_to_cpu(*(__be32 *)bh->b_data); + //*(__be32 *)bh->b_data = cpu_to_be32(old - new); + //mark_buffer_dirty(bh); + //} + /* correct offset for the bitmap count below */ + //offset++; + } + while (++offset < sb->s_blocksize / 4) + ((__be32 *)bh->b_data)[offset] = 0; + ((__be32 *)bh->b_data)[0] = 0; + ((__be32 *)bh->b_data)[0] = cpu_to_be32(-affs_checksum_block(sb, bh)); + mark_buffer_dirty(bh); + + /* recalculate bitmap count for last block */ + bm--; + bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + +out: + affs_brelse(bh); + affs_brelse(bmap_bh); + return res; +} + +void affs_free_bitmap(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + + if (!sbi->s_bitmap) + return; + + affs_brelse(sbi->s_bmap_bh); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; + kfree(sbi->s_bitmap); + sbi->s_bitmap = NULL; +} diff --git a/fs/affs/dir.c b/fs/affs/dir.c new file mode 100644 index 00000000..8ca8f3a5 --- /dev/null +++ b/fs/affs/dir.c @@ -0,0 +1,156 @@ +/* + * linux/fs/affs/dir.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * affs directory handling functions + * + */ + +#include "affs.h" + +static int affs_readdir(struct file *, void *, filldir_t); + +const struct file_operations affs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = affs_readdir, + .fsync = affs_file_fsync, +}; + +/* + * directories can handle most operations... 
+ */ +const struct inode_operations affs_dir_inode_operations = { + .create = affs_create, + .lookup = affs_lookup, + .link = affs_link, + .unlink = affs_unlink, + .symlink = affs_symlink, + .mkdir = affs_mkdir, + .rmdir = affs_rmdir, + .rename = affs_rename, + .setattr = affs_notify_change, +}; + +static int +affs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct buffer_head *dir_bh; + struct buffer_head *fh_bh; + unsigned char *name; + int namelen; + u32 i; + int hash_pos; + int chain_pos; + u32 f_pos; + u32 ino; + int stored; + int res; + + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos); + + stored = 0; + res = -EIO; + dir_bh = NULL; + fh_bh = NULL; + f_pos = filp->f_pos; + + if (f_pos == 0) { + filp->private_data = (void *)0; + if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0) + return 0; + filp->f_pos = f_pos = 1; + stored++; + } + if (f_pos == 1) { + if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0) + return stored; + filp->f_pos = f_pos = 2; + stored++; + } + + affs_lock_dir(inode); + chain_pos = (f_pos - 2) & 0xffff; + hash_pos = (f_pos - 2) >> 16; + if (chain_pos == 0xffff) { + affs_warning(sb, "readdir", "More than 65535 entries in chain"); + chain_pos = 0; + hash_pos++; + filp->f_pos = ((hash_pos << 16) | chain_pos) + 2; + } + dir_bh = affs_bread(sb, inode->i_ino); + if (!dir_bh) + goto readdir_out; + + /* If the directory hasn't changed since the last call to readdir(), + * we can jump directly to where we left off. + */ + ino = (u32)(long)filp->private_data; + if (ino && filp->f_version == inode->i_version) { + pr_debug("AFFS: readdir() left off=%d\n", ino); + goto inside; + } + + ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); + for (i = 0; ino && i < chain_pos; i++) { + fh_bh = affs_bread(sb, ino); + if (!fh_bh) { + affs_error(sb, "readdir","Cannot read block %d", i); + goto readdir_out; + } + ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); + affs_brelse(fh_bh); + fh_bh = NULL; + } + if (ino) + goto inside; + hash_pos++; + + for (; hash_pos < AFFS_SB(sb)->s_hashsize; hash_pos++) { + ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); + if (!ino) + continue; + f_pos = (hash_pos << 16) + 2; +inside: + do { + fh_bh = affs_bread(sb, ino); + if (!fh_bh) { + affs_error(sb, "readdir","Cannot read block %d", ino); + goto readdir_done; + } + + namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); + name = AFFS_TAIL(sb, fh_bh)->name + 1; + pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", + namelen, name, ino, hash_pos, f_pos); + if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0) + goto readdir_done; + stored++; + f_pos++; + ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); + affs_brelse(fh_bh); + fh_bh = NULL; + } while (ino); + } +readdir_done: + filp->f_pos = f_pos; + filp->f_version = inode->i_version; + filp->private_data = (void *)(long)ino; + res = stored; + +readdir_out: + affs_brelse(dir_bh); + affs_brelse(fh_bh); + affs_unlock_dir(inode); + pr_debug("AFFS: readdir()=%d\n", stored); + return res; +} diff --git a/fs/affs/file.c b/fs/affs/file.c new file mode 100644 index 00000000..acf321b7 --- /dev/null +++ b/fs/affs/file.c @@ -0,0 +1,936 @@ +/* + * linux/fs/affs/file.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. 
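A note on the f_pos encoding used by affs_readdir() above: positions 0 and 1 are reserved for "." and "..", the low 16 bits of everything after that count entries along one hash chain, and the high bits select the hash-table slot. An illustrative stand-alone version of the same packing:

#include <stdio.h>
#include <stdint.h>

/* pack/unpack a directory position the way affs_readdir() does */
static uint32_t affs_pos_pack(uint32_t hash_pos, uint32_t chain_pos)
{
        return ((hash_pos << 16) | chain_pos) + 2;
}

static void affs_pos_unpack(uint32_t f_pos, uint32_t *hash_pos, uint32_t *chain_pos)
{
        *chain_pos = (f_pos - 2) & 0xffff;
        *hash_pos  = (f_pos - 2) >> 16;
}

int main(void)
{
        uint32_t h, c;

        affs_pos_unpack(affs_pos_pack(5, 3), &h, &c);
        printf("hash slot %u, entry %u within that chain\n", h, c); /* 5, 3 */
        return 0;
}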
+ * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * affs regular file handling primitives + */ + +#include "affs.h" + +#if PAGE_SIZE < 4096 +#error PAGE_SIZE must be at least 4096 +#endif + +static int affs_grow_extcache(struct inode *inode, u32 lc_idx); +static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext); +static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext); +static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); +static int affs_file_open(struct inode *inode, struct file *filp); +static int affs_file_release(struct inode *inode, struct file *filp); + +const struct file_operations affs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = affs_file_open, + .release = affs_file_release, + .fsync = affs_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations affs_file_inode_operations = { + .truncate = affs_truncate, + .setattr = affs_notify_change, +}; + +static int +affs_file_open(struct inode *inode, struct file *filp) +{ + pr_debug("AFFS: open(%lu,%d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + atomic_inc(&AFFS_I(inode)->i_opencnt); + return 0; +} + +static int +affs_file_release(struct inode *inode, struct file *filp) +{ + pr_debug("AFFS: release(%lu, %d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + + if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { + mutex_lock(&inode->i_mutex); + if (inode->i_size != AFFS_I(inode)->mmu_private) + affs_truncate(inode); + affs_free_prealloc(inode); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static int +affs_grow_extcache(struct inode *inode, u32 lc_idx) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + u32 lc_max; + int i, j, key; + + if (!AFFS_I(inode)->i_lc) { + char *ptr = (char *)get_zeroed_page(GFP_NOFS); + if (!ptr) + return -ENOMEM; + AFFS_I(inode)->i_lc = (u32 *)ptr; + AFFS_I(inode)->i_ac = (struct affs_ext_key *)(ptr + AFFS_CACHE_SIZE / 2); + } + + lc_max = AFFS_LC_SIZE << AFFS_I(inode)->i_lc_shift; + + if (AFFS_I(inode)->i_extcnt > lc_max) { + u32 lc_shift, lc_mask, tmp, off; + + /* need to recalculate linear cache, start from old size */ + lc_shift = AFFS_I(inode)->i_lc_shift; + tmp = (AFFS_I(inode)->i_extcnt / AFFS_LC_SIZE) >> lc_shift; + for (; tmp; tmp >>= 1) + lc_shift++; + lc_mask = (1 << lc_shift) - 1; + + /* fix idx and old size to new shift */ + lc_idx >>= (lc_shift - AFFS_I(inode)->i_lc_shift); + AFFS_I(inode)->i_lc_size >>= (lc_shift - AFFS_I(inode)->i_lc_shift); + + /* first shrink old cache to make more space */ + off = 1 << (lc_shift - AFFS_I(inode)->i_lc_shift); + for (i = 1, j = off; j < AFFS_LC_SIZE; i++, j += off) + AFFS_I(inode)->i_ac[i] = AFFS_I(inode)->i_ac[j]; + + AFFS_I(inode)->i_lc_shift = lc_shift; + AFFS_I(inode)->i_lc_mask = lc_mask; + } + + /* fill cache to the needed index */ + i = AFFS_I(inode)->i_lc_size; + AFFS_I(inode)->i_lc_size = lc_idx + 1; + for (; i <= lc_idx; i++) { + if (!i) { + AFFS_I(inode)->i_lc[0] = inode->i_ino; + continue; + } + key = AFFS_I(inode)->i_lc[i - 1]; + j = AFFS_I(inode)->i_lc_mask + 1; + // unlock cache + for (; j > 0; j--) { + bh = affs_bread(sb, key); + if (!bh) + goto err; + key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + 
affs_brelse(bh); + } + // lock cache + AFFS_I(inode)->i_lc[i] = key; + } + + return 0; + +err: + // lock cache + return -EIO; +} + +static struct buffer_head * +affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh; + u32 blocknr, tmp; + + blocknr = affs_alloc_block(inode, bh->b_blocknr); + if (!blocknr) + return ERR_PTR(-ENOSPC); + + new_bh = affs_getzeroblk(sb, blocknr); + if (!new_bh) { + affs_free_block(sb, blocknr); + return ERR_PTR(-EIO); + } + + AFFS_HEAD(new_bh)->ptype = cpu_to_be32(T_LIST); + AFFS_HEAD(new_bh)->key = cpu_to_be32(blocknr); + AFFS_TAIL(sb, new_bh)->stype = cpu_to_be32(ST_FILE); + AFFS_TAIL(sb, new_bh)->parent = cpu_to_be32(inode->i_ino); + affs_fix_checksum(sb, new_bh); + + mark_buffer_dirty_inode(new_bh, inode); + + tmp = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + if (tmp) + affs_warning(sb, "alloc_ext", "previous extension set (%x)", tmp); + AFFS_TAIL(sb, bh)->extension = cpu_to_be32(blocknr); + affs_adjust_checksum(bh, blocknr - tmp); + mark_buffer_dirty_inode(bh, inode); + + AFFS_I(inode)->i_extcnt++; + mark_inode_dirty(inode); + + return new_bh; +} + +static inline struct buffer_head * +affs_get_extblock(struct inode *inode, u32 ext) +{ + /* inline the simplest case: same extended block as last time */ + struct buffer_head *bh = AFFS_I(inode)->i_ext_bh; + if (ext == AFFS_I(inode)->i_ext_last) + get_bh(bh); + else + /* we have to do more (not inlined) */ + bh = affs_get_extblock_slow(inode, ext); + + return bh; +} + +static struct buffer_head * +affs_get_extblock_slow(struct inode *inode, u32 ext) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + u32 ext_key; + u32 lc_idx, lc_off, ac_idx; + u32 tmp, idx; + + if (ext == AFFS_I(inode)->i_ext_last + 1) { + /* read the next extended block from the current one */ + bh = AFFS_I(inode)->i_ext_bh; + ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + if (ext < AFFS_I(inode)->i_extcnt) + goto read_ext; + if (ext > AFFS_I(inode)->i_extcnt) + BUG(); + bh = affs_alloc_extblock(inode, bh, ext); + if (IS_ERR(bh)) + return bh; + goto store_ext; + } + + if (ext == 0) { + /* we seek back to the file header block */ + ext_key = inode->i_ino; + goto read_ext; + } + + if (ext >= AFFS_I(inode)->i_extcnt) { + struct buffer_head *prev_bh; + + /* allocate a new extended block */ + if (ext > AFFS_I(inode)->i_extcnt) + BUG(); + + /* get previous extended block */ + prev_bh = affs_get_extblock(inode, ext - 1); + if (IS_ERR(prev_bh)) + return prev_bh; + bh = affs_alloc_extblock(inode, prev_bh, ext); + affs_brelse(prev_bh); + if (IS_ERR(bh)) + return bh; + goto store_ext; + } + +again: + /* check if there is an extended cache and whether it's large enough */ + lc_idx = ext >> AFFS_I(inode)->i_lc_shift; + lc_off = ext & AFFS_I(inode)->i_lc_mask; + + if (lc_idx >= AFFS_I(inode)->i_lc_size) { + int err; + + err = affs_grow_extcache(inode, lc_idx); + if (err) + return ERR_PTR(err); + goto again; + } + + /* every n'th key we find in the linear cache */ + if (!lc_off) { + ext_key = AFFS_I(inode)->i_lc[lc_idx]; + goto read_ext; + } + + /* maybe it's still in the associative cache */ + ac_idx = (ext - lc_idx - 1) & AFFS_AC_MASK; + if (AFFS_I(inode)->i_ac[ac_idx].ext == ext) { + ext_key = AFFS_I(inode)->i_ac[ac_idx].key; + goto read_ext; + } + + /* try to find one of the previous extended blocks */ + tmp = ext; + idx = ac_idx; + while (--tmp, --lc_off > 0) { + idx = (idx - 1) & AFFS_AC_MASK; + if (AFFS_I(inode)->i_ac[idx].ext == tmp) 
{ + ext_key = AFFS_I(inode)->i_ac[idx].key; + goto find_ext; + } + } + + /* fall back to the linear cache */ + ext_key = AFFS_I(inode)->i_lc[lc_idx]; +find_ext: + /* read all extended blocks until we find the one we need */ + //unlock cache + do { + bh = affs_bread(sb, ext_key); + if (!bh) + goto err_bread; + ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + affs_brelse(bh); + tmp++; + } while (tmp < ext); + //lock cache + + /* store it in the associative cache */ + // recalculate ac_idx? + AFFS_I(inode)->i_ac[ac_idx].ext = ext; + AFFS_I(inode)->i_ac[ac_idx].key = ext_key; + +read_ext: + /* finally read the right extended block */ + //unlock cache + bh = affs_bread(sb, ext_key); + if (!bh) + goto err_bread; + //lock cache + +store_ext: + /* release old cached extended block and store the new one */ + affs_brelse(AFFS_I(inode)->i_ext_bh); + AFFS_I(inode)->i_ext_last = ext; + AFFS_I(inode)->i_ext_bh = bh; + get_bh(bh); + + return bh; + +err_bread: + affs_brelse(bh); + return ERR_PTR(-EIO); +} + +static int +affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *ext_bh; + u32 ext; + + pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); + + BUG_ON(block > (sector_t)0x7fffffffUL); + + if (block >= AFFS_I(inode)->i_blkcnt) { + if (block > AFFS_I(inode)->i_blkcnt || !create) + goto err_big; + } else + create = 0; + + //lock cache + affs_lock_ext(inode); + + ext = (u32)block / AFFS_SB(sb)->s_hashsize; + block -= ext * AFFS_SB(sb)->s_hashsize; + ext_bh = affs_get_extblock(inode, ext); + if (IS_ERR(ext_bh)) + goto err_ext; + map_bh(bh_result, sb, (sector_t)be32_to_cpu(AFFS_BLOCK(sb, ext_bh, block))); + + if (create) { + u32 blocknr = affs_alloc_block(inode, ext_bh->b_blocknr); + if (!blocknr) + goto err_alloc; + set_buffer_new(bh_result); + AFFS_I(inode)->mmu_private += AFFS_SB(sb)->s_data_blksize; + AFFS_I(inode)->i_blkcnt++; + + /* store new block */ + if (bh_result->b_blocknr) + affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); + AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); + affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); + bh_result->b_blocknr = blocknr; + + if (!block) { + /* insert first block into header block */ + u32 tmp = be32_to_cpu(AFFS_HEAD(ext_bh)->first_data); + if (tmp) + affs_warning(sb, "get_block", "first block already set (%d)", tmp); + AFFS_HEAD(ext_bh)->first_data = cpu_to_be32(blocknr); + affs_adjust_checksum(ext_bh, blocknr - tmp); + } + } + + affs_brelse(ext_bh); + //unlock cache + affs_unlock_ext(inode); + return 0; + +err_big: + affs_error(inode->i_sb,"get_block","strange block request %d", block); + return -EIO; +err_ext: + // unlock cache + affs_unlock_ext(inode); + return PTR_ERR(ext_bh); +err_alloc: + brelse(ext_bh); + clear_buffer_mapped(bh_result); + bh_result->b_bdev = NULL; + // unlock cache + affs_unlock_ext(inode); + return -ENOSPC; +} + +static int affs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, affs_get_block, wbc); +} + +static int affs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, affs_get_block); +} + +static int affs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = 
cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + affs_get_block, + &AFFS_I(mapping->host)->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t _affs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,affs_get_block); +} + +const struct address_space_operations affs_aops = { + .readpage = affs_readpage, + .writepage = affs_writepage, + .write_begin = affs_write_begin, + .write_end = generic_write_end, + .bmap = _affs_bmap +}; + +static inline struct buffer_head * +affs_bread_ino(struct inode *inode, int block, int create) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, create); + if (!err) { + bh = affs_bread(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static inline struct buffer_head * +affs_getzeroblk_ino(struct inode *inode, int block) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, 1); + if (!err) { + bh = affs_getzeroblk(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static inline struct buffer_head * +affs_getemptyblk_ino(struct inode *inode, int block) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, 1); + if (!err) { + bh = affs_getemptyblk(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static int +affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + char *data; + u32 bidx, boff, bsize; + u32 tmp; + + pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to); + BUG_ON(from > to || to > PAGE_CACHE_SIZE); + kmap(page); + data = page_address(page); + bsize = AFFS_SB(sb)->s_data_blksize; + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + bidx = tmp / bsize; + boff = tmp % bsize; + + while (from < to) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, to - from); + BUG_ON(from + tmp > to || tmp > bsize); + memcpy(data + from, AFFS_DATA(bh) + boff, tmp); + affs_brelse(bh); + bidx++; + from += tmp; + boff = 0; + } + flush_dcache_page(page); + kunmap(page); + return 0; +} + +static int +affs_extent_file_ofs(struct inode *inode, u32 newsize) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *prev_bh; + u32 bidx, boff; + u32 size, bsize; + u32 tmp; + + pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize); + bsize = AFFS_SB(sb)->s_data_blksize; + bh = NULL; + size = AFFS_I(inode)->mmu_private; + bidx = size / bsize; + boff = size % bsize; + if (boff) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, newsize - size); + BUG_ON(boff + tmp > bsize || tmp > bsize); + memset(AFFS_DATA(bh) + boff, 0, tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + size += tmp; + bidx++; + } else if (bidx) { + bh = affs_bread_ino(inode, bidx - 1, 0); 
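The ext/block split performed by affs_get_block() above is plain modular arithmetic: every file header or extension block carries s_hashsize block pointers (blocksize/4 - 56, i.e. 72 on a 512-byte volume). A small stand-alone sketch with assumed example values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t blocksize  = 512;                 /* assumed example value */
        uint32_t hashsize   = blocksize / 4 - 56;  /* pointers per header/extension block: 72 */
        uint32_t file_block = 1000;                /* logical block within the file */

        /* the same split affs_get_block() performs before walking the extension chain */
        uint32_t ext  = file_block / hashsize;     /* extension index (0 = the file header) */
        uint32_t slot = file_block % hashsize;     /* entry in that block's pointer table */

        printf("file block %u -> extension %u, slot %u\n", file_block, ext, slot); /* 13, 64 */
        return 0;
}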
+ if (IS_ERR(bh)) + return PTR_ERR(bh); + } + + while (size < newsize) { + prev_bh = bh; + bh = affs_getzeroblk_ino(inode, bidx); + if (IS_ERR(bh)) + goto out; + tmp = min(bsize, newsize - size); + BUG_ON(tmp > bsize); + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + affs_fix_checksum(sb, bh); + bh->b_state &= ~(1UL << BH_New); + mark_buffer_dirty_inode(bh, inode); + if (prev_bh) { + u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + if (tmp) + affs_warning(sb, "extent_file_ofs", "next block already set for %d (%d)", bidx, tmp); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + mark_buffer_dirty_inode(prev_bh, inode); + affs_brelse(prev_bh); + } + size += bsize; + bidx++; + } + affs_brelse(bh); + inode->i_size = AFFS_I(inode)->mmu_private = newsize; + return 0; + +out: + inode->i_size = AFFS_I(inode)->mmu_private = newsize; + return PTR_ERR(bh); +} + +static int +affs_readpage_ofs(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + u32 to; + int err; + + pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index); + to = PAGE_CACHE_SIZE; + if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { + to = inode->i_size & ~PAGE_CACHE_MASK; + memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to); + } + + err = affs_do_readpage_ofs(file, page, 0, to); + if (!err) + SetPageUptodate(page); + unlock_page(page); + return err; +} + +static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index; + int err = 0; + + pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + if (pos > AFFS_I(inode)->mmu_private) { + /* XXX: this probably leaves a too-big i_size in case of + * failure. Should really be updating i_size at write_end time + */ + err = affs_extent_file_ofs(inode, pos); + if (err) + return err; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if (PageUptodate(page)) + return 0; + + /* XXX: inefficient but safe in the face of short writes */ + err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE); + if (err) { + unlock_page(page); + page_cache_release(page); + } + return err; +} + +static int affs_write_end_ofs(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *prev_bh; + char *data; + u32 bidx, boff, bsize; + unsigned from, to; + u32 tmp; + int written; + + from = pos & (PAGE_CACHE_SIZE - 1); + to = pos + len; + /* + * XXX: not sure if this can handle short copies (len < copied), but + * we don't have to, because the page should always be uptodate here, + * due to write_begin. 
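For reference, the per-block header that affs_extent_file_ofs() and affs_write_end_ofs() fill in through AFFS_DATA_HEAD() on OFS volumes takes six 32-bit words, which is why affs_fill_super() later subtracts 24 bytes from s_data_blksize. A rough user-space mirror of that layout (illustrative; field names follow the accessors used in the patch):

#include <stdio.h>
#include <stdint.h>

/* on-disk OFS data block header; all fields are big-endian on disk */
struct ofs_data_head {
        uint32_t ptype;      /* T_DATA */
        uint32_t key;        /* block number of the owning file header */
        uint32_t sequence;   /* index of this data block within the file */
        uint32_t size;       /* number of valid payload bytes in this block */
        uint32_t next;       /* next data block in the chain, 0 at the end */
        uint32_t checksum;
};

int main(void)
{
        uint32_t blocksize = 512;  /* assumed example value */
        uint32_t payload   = blocksize - (uint32_t)sizeof(struct ofs_data_head);

        /* matches s_data_blksize = s_blocksize - 24 on SF_OFS mounts */
        printf("payload per %u-byte OFS data block: %u bytes\n", blocksize, payload); /* 488 */
        return 0;
}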
+ */ + + pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + bsize = AFFS_SB(sb)->s_data_blksize; + data = page_address(page); + + bh = NULL; + written = 0; + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + bidx = tmp / bsize; + boff = tmp % bsize; + if (boff) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, to - from); + BUG_ON(boff + tmp > bsize || tmp > bsize); + memcpy(AFFS_DATA(bh) + boff, data + from, tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += tmp; + from += tmp; + bidx++; + } else if (bidx) { + bh = affs_bread_ino(inode, bidx - 1, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + } + while (from + bsize <= to) { + prev_bh = bh; + bh = affs_getemptyblk_ino(inode, bidx); + if (IS_ERR(bh)) + goto out; + memcpy(AFFS_DATA(bh), data + from, bsize); + if (buffer_new(bh)) { + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize); + AFFS_DATA_HEAD(bh)->next = 0; + bh->b_state &= ~(1UL << BH_New); + if (prev_bh) { + u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + if (tmp) + affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + mark_buffer_dirty_inode(prev_bh, inode); + } + } + affs_brelse(prev_bh); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += bsize; + from += bsize; + bidx++; + } + if (from < to) { + prev_bh = bh; + bh = affs_bread_ino(inode, bidx, 1); + if (IS_ERR(bh)) + goto out; + tmp = min(bsize, to - from); + BUG_ON(tmp > bsize); + memcpy(AFFS_DATA(bh), data + from, tmp); + if (buffer_new(bh)) { + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + AFFS_DATA_HEAD(bh)->next = 0; + bh->b_state &= ~(1UL << BH_New); + if (prev_bh) { + u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + if (tmp) + affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + mark_buffer_dirty_inode(prev_bh, inode); + } + } else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp) + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + affs_brelse(prev_bh); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += tmp; + from += tmp; + bidx++; + } + SetPageUptodate(page); + +done: + affs_brelse(bh); + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + if (tmp > inode->i_size) + inode->i_size = AFFS_I(inode)->mmu_private = tmp; + + unlock_page(page); + page_cache_release(page); + + return written; + +out: + bh = prev_bh; + if (!written) + written = PTR_ERR(bh); + goto done; +} + +const struct address_space_operations affs_aops_ofs = { + .readpage = affs_readpage_ofs, + //.writepage = affs_writepage_ofs, + .write_begin = affs_write_begin_ofs, + .write_end = affs_write_end_ofs +}; + +/* Free any preallocated blocks. 
*/ + +void +affs_free_prealloc(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino); + + while (AFFS_I(inode)->i_pa_cnt) { + AFFS_I(inode)->i_pa_cnt--; + affs_free_block(sb, ++AFFS_I(inode)->i_lastalloc); + } +} + +/* Truncate (or enlarge) a file to the requested size. */ + +void +affs_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u32 ext, ext_key; + u32 last_blk, blkcnt, blk; + u32 size; + struct buffer_head *ext_bh; + int i; + + pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n", + (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size); + + last_blk = 0; + ext = 0; + if (inode->i_size) { + last_blk = ((u32)inode->i_size - 1) / AFFS_SB(sb)->s_data_blksize; + ext = last_blk / AFFS_SB(sb)->s_hashsize; + } + + if (inode->i_size > AFFS_I(inode)->mmu_private) { + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + u32 size = inode->i_size; + int res; + + res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); + if (!res) + res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + else + inode->i_size = AFFS_I(inode)->mmu_private; + mark_inode_dirty(inode); + return; + } else if (inode->i_size == AFFS_I(inode)->mmu_private) + return; + + // lock cache + ext_bh = affs_get_extblock(inode, ext); + if (IS_ERR(ext_bh)) { + affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", + ext, PTR_ERR(ext_bh)); + return; + } + if (AFFS_I(inode)->i_lc) { + /* clear linear cache */ + i = (ext + 1) >> AFFS_I(inode)->i_lc_shift; + if (AFFS_I(inode)->i_lc_size > i) { + AFFS_I(inode)->i_lc_size = i; + for (; i < AFFS_LC_SIZE; i++) + AFFS_I(inode)->i_lc[i] = 0; + } + /* clear associative cache */ + for (i = 0; i < AFFS_AC_SIZE; i++) + if (AFFS_I(inode)->i_ac[i].ext >= ext) + AFFS_I(inode)->i_ac[i].ext = 0; + } + ext_key = be32_to_cpu(AFFS_TAIL(sb, ext_bh)->extension); + + blkcnt = AFFS_I(inode)->i_blkcnt; + i = 0; + blk = last_blk; + if (inode->i_size) { + i = last_blk % AFFS_SB(sb)->s_hashsize + 1; + blk++; + } else + AFFS_HEAD(ext_bh)->first_data = 0; + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i); + size = AFFS_SB(sb)->s_hashsize; + if (size > blkcnt - blk + i) + size = blkcnt - blk + i; + for (; i < size; i++, blk++) { + affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i))); + AFFS_BLOCK(sb, ext_bh, i) = 0; + } + AFFS_TAIL(sb, ext_bh)->extension = 0; + affs_fix_checksum(sb, ext_bh); + mark_buffer_dirty_inode(ext_bh, inode); + affs_brelse(ext_bh); + + if (inode->i_size) { + AFFS_I(inode)->i_blkcnt = last_blk + 1; + AFFS_I(inode)->i_extcnt = ext + 1; + if (AFFS_SB(sb)->s_flags & SF_OFS) { + struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); + u32 tmp; + if (IS_ERR(bh)) { + affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", + ext, PTR_ERR(bh)); + return; + } + tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); + AFFS_DATA_HEAD(bh)->next = 0; + affs_adjust_checksum(bh, -tmp); + affs_brelse(bh); + } + } else { + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_extcnt = 1; + } + AFFS_I(inode)->mmu_private = inode->i_size; + // unlock cache + + while (ext_key) { + ext_bh = affs_bread(sb, ext_key); + size = AFFS_SB(sb)->s_hashsize; + if (size > blkcnt - blk) + size = blkcnt - blk; + for (i = 0; i < size; i++, blk++) + affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i))); + affs_free_block(sb, ext_key); + ext_key = be32_to_cpu(AFFS_TAIL(sb, 
ext_bh)->extension); + affs_brelse(ext_bh); + } + affs_free_prealloc(inode); +} + +int affs_file_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int ret, err; + + ret = write_inode_now(inode, 0); + err = sync_blockdev(inode->i_sb->s_bdev); + if (!ret) + ret = err; + return ret; +} diff --git a/fs/affs/inode.c b/fs/affs/inode.c new file mode 100644 index 00000000..5d828903 --- /dev/null +++ b/fs/affs/inode.c @@ -0,0 +1,413 @@ +/* + * linux/fs/affs/inode.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ +#include +#include +#include "affs.h" + +extern const struct inode_operations affs_symlink_inode_operations; +extern struct timezone sys_tz; + +struct inode *affs_iget(struct super_block *sb, unsigned long ino) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh; + struct affs_head *head; + struct affs_tail *tail; + struct inode *inode; + u32 block; + u32 size; + u32 prot; + u16 id; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino); + + block = inode->i_ino; + bh = affs_bread(sb, block); + if (!bh) { + affs_warning(sb, "read_inode", "Cannot read block %d", block); + goto bad_inode; + } + if (affs_checksum_block(sb, bh) || be32_to_cpu(AFFS_HEAD(bh)->ptype) != T_SHORT) { + affs_warning(sb,"read_inode", + "Checksum or type (ptype=%d) error on inode %d", + AFFS_HEAD(bh)->ptype, block); + goto bad_inode; + } + + head = AFFS_HEAD(bh); + tail = AFFS_TAIL(sb, bh); + prot = be32_to_cpu(tail->protect); + + inode->i_size = 0; + inode->i_nlink = 1; + inode->i_mode = 0; + AFFS_I(inode)->i_extcnt = 1; + AFFS_I(inode)->i_ext_last = ~1; + AFFS_I(inode)->i_protect = prot; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_lc_size = 0; + AFFS_I(inode)->i_lc_shift = 0; + AFFS_I(inode)->i_lc_mask = 0; + AFFS_I(inode)->i_ac = NULL; + AFFS_I(inode)->i_ext_bh = NULL; + AFFS_I(inode)->mmu_private = 0; + AFFS_I(inode)->i_lastalloc = 0; + AFFS_I(inode)->i_pa_cnt = 0; + + if (sbi->s_flags & SF_SETMODE) + inode->i_mode = sbi->s_mode; + else + inode->i_mode = prot_to_mode(prot); + + id = be16_to_cpu(tail->uid); + if (id == 0 || sbi->s_flags & SF_SETUID) + inode->i_uid = sbi->s_uid; + else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) + inode->i_uid = 0; + else + inode->i_uid = id; + + id = be16_to_cpu(tail->gid); + if (id == 0 || sbi->s_flags & SF_SETGID) + inode->i_gid = sbi->s_gid; + else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) + inode->i_gid = 0; + else + inode->i_gid = id; + + switch (be32_to_cpu(tail->stype)) { + case ST_ROOT: + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + /* fall through */ + case ST_USERDIR: + if (be32_to_cpu(tail->stype) == ST_USERDIR || + sbi->s_flags & SF_SETMODE) { + if (inode->i_mode & S_IRUSR) + inode->i_mode |= S_IXUSR; + if (inode->i_mode & S_IRGRP) + inode->i_mode |= S_IXGRP; + if (inode->i_mode & S_IROTH) + inode->i_mode |= S_IXOTH; + inode->i_mode |= S_IFDIR; + } else + inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR; + /* Maybe it should be controlled by mount parameter? 
*/ + //inode->i_mode |= S_ISVTX; + inode->i_op = &affs_dir_inode_operations; + inode->i_fop = &affs_dir_operations; + break; + case ST_LINKDIR: +#if 0 + affs_warning(sb, "read_inode", "inode is LINKDIR"); + goto bad_inode; +#else + inode->i_mode |= S_IFDIR; + /* ... and leave ->i_op and ->i_fop pointing to empty */ + break; +#endif + case ST_LINKFILE: + affs_warning(sb, "read_inode", "inode is LINKFILE"); + goto bad_inode; + case ST_FILE: + size = be32_to_cpu(tail->size); + inode->i_mode |= S_IFREG; + AFFS_I(inode)->mmu_private = inode->i_size = size; + if (inode->i_size) { + AFFS_I(inode)->i_blkcnt = (size - 1) / + sbi->s_data_blksize + 1; + AFFS_I(inode)->i_extcnt = (AFFS_I(inode)->i_blkcnt - 1) / + sbi->s_hashsize + 1; + } + if (tail->link_chain) + inode->i_nlink = 2; + inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; + inode->i_op = &affs_file_inode_operations; + inode->i_fop = &affs_file_operations; + break; + case ST_SOFTLINK: + inode->i_mode |= S_IFLNK; + inode->i_op = &affs_symlink_inode_operations; + inode->i_data.a_ops = &affs_symlink_aops; + break; + } + + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec + = (be32_to_cpu(tail->change.days) * (24 * 60 * 60) + + be32_to_cpu(tail->change.mins) * 60 + + be32_to_cpu(tail->change.ticks) / 50 + + ((8 * 365 + 2) * 24 * 60 * 60)) + + sys_tz.tz_minuteswest * 60; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; + affs_brelse(bh); + unlock_new_inode(inode); + return inode; + +bad_inode: + affs_brelse(bh); + iget_failed(inode); + return ERR_PTR(-EIO); +} + +int +affs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + struct affs_tail *tail; + uid_t uid; + gid_t gid; + + pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino); + + if (!inode->i_nlink) + // possibly free block + return 0; + bh = affs_bread(sb, inode->i_ino); + if (!bh) { + affs_error(sb,"write_inode","Cannot read block %lu",inode->i_ino); + return -EIO; + } + tail = AFFS_TAIL(sb, bh); + if (tail->stype == cpu_to_be32(ST_ROOT)) { + secs_to_datestamp(inode->i_mtime.tv_sec,&AFFS_ROOT_TAIL(sb, bh)->root_change); + } else { + tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect); + tail->size = cpu_to_be32(inode->i_size); + secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change); + if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { + uid = inode->i_uid; + gid = inode->i_gid; + if (AFFS_SB(sb)->s_flags & SF_MUFS) { + if (inode->i_uid == 0 || inode->i_uid == 0xFFFF) + uid = inode->i_uid ^ ~0; + if (inode->i_gid == 0 || inode->i_gid == 0xFFFF) + gid = inode->i_gid ^ ~0; + } + if (!(AFFS_SB(sb)->s_flags & SF_SETUID)) + tail->uid = cpu_to_be16(uid); + if (!(AFFS_SB(sb)->s_flags & SF_SETGID)) + tail->gid = cpu_to_be16(gid); + } + } + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + affs_free_prealloc(inode); + return 0; +} + +int +affs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid); + + error = inode_change_ok(inode,attr); + if (error) + goto out; + + if (((attr->ia_valid & ATTR_UID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETUID)) || + ((attr->ia_valid & ATTR_GID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETGID)) || + ((attr->ia_valid & ATTR_MODE) && + (AFFS_SB(inode->i_sb)->s_flags & (SF_SETMODE | SF_IMMUTABLE)))) { + if (!(AFFS_SB(inode->i_sb)->s_flags 
& SF_QUIET)) + error = -EPERM; + goto out; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) + mode_to_prot(inode); +out: + return error; +} + +void +affs_evict_inode(struct inode *inode) +{ + unsigned long cache_page; + pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); + truncate_inode_pages(&inode->i_data, 0); + + if (!inode->i_nlink) { + inode->i_size = 0; + affs_truncate(inode); + } + + invalidate_inode_buffers(inode); + end_writeback(inode); + affs_free_prealloc(inode); + cache_page = (unsigned long)AFFS_I(inode)->i_lc; + if (cache_page) { + pr_debug("AFFS: freeing ext cache\n"); + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_ac = NULL; + free_page(cache_page); + } + affs_brelse(AFFS_I(inode)->i_ext_bh); + AFFS_I(inode)->i_ext_last = ~1; + AFFS_I(inode)->i_ext_bh = NULL; + + if (!inode->i_nlink) + affs_free_block(inode->i_sb, inode->i_ino); +} + +struct inode * +affs_new_inode(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + u32 block; + struct buffer_head *bh; + + if (!(inode = new_inode(sb))) + goto err_inode; + + if (!(block = affs_alloc_block(dir, dir->i_ino))) + goto err_block; + + bh = affs_getzeroblk(sb, block); + if (!bh) + goto err_bh; + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_ino = block; + inode->i_nlink = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_lc_size = 0; + AFFS_I(inode)->i_lc_shift = 0; + AFFS_I(inode)->i_lc_mask = 0; + AFFS_I(inode)->i_ac = NULL; + AFFS_I(inode)->i_ext_bh = NULL; + AFFS_I(inode)->mmu_private = 0; + AFFS_I(inode)->i_protect = 0; + AFFS_I(inode)->i_lastalloc = 0; + AFFS_I(inode)->i_pa_cnt = 0; + AFFS_I(inode)->i_extcnt = 1; + AFFS_I(inode)->i_ext_last = ~1; + + insert_inode_hash(inode); + + return inode; + +err_bh: + affs_free_block(sb, block); +err_block: + iput(inode); +err_inode: + return NULL; +} + +/* + * Add an entry to a directory. Create the header block + * and insert it into the hash table. 
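Looking back at the timestamp handling in affs_iget() above: AFFS stores times as days/minutes/ticks (1/50 s) since the Amiga epoch of 1978-01-01, and the (8 * 365 + 2) * 24 * 60 * 60 term is the 1970-to-1978 offset (two leap days). A stand-alone sketch of the same conversion, ignoring the sys_tz correction for simplicity:

#include <stdio.h>
#include <stdint.h>

/* Amiga datestamp -> Unix time, mirroring the sum in affs_iget();
 * the timezone term (sys_tz.tz_minuteswest * 60) is assumed to be 0 here */
static long long amiga_to_unix(uint32_t days, uint32_t mins, uint32_t ticks)
{
        const long long amiga_epoch = (8LL * 365 + 2) * 24 * 60 * 60;  /* 252460800 */

        return (long long)days * 24 * 60 * 60 + (long long)mins * 60 + ticks / 50 + amiga_epoch;
}

int main(void)
{
        /* assumed example datestamp: day 0, minute 0, tick 0 is 1978-01-01 00:00 */
        printf("%lld\n", amiga_to_unix(0, 0, 0));  /* 252460800 */
        return 0;
}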
+ */ + +int +affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *inode_bh = NULL; + struct buffer_head *bh = NULL; + u32 block = 0; + int retval; + + pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino, + (u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type); + + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + affs_lock_link(inode); + switch (type) { + case ST_LINKFILE: + case ST_LINKDIR: + retval = -ENOSPC; + block = affs_alloc_block(dir, dir->i_ino); + if (!block) + goto err; + retval = -EIO; + inode_bh = bh; + bh = affs_getzeroblk(sb, block); + if (!bh) + goto err; + break; + default: + break; + } + + AFFS_HEAD(bh)->ptype = cpu_to_be32(T_SHORT); + AFFS_HEAD(bh)->key = cpu_to_be32(bh->b_blocknr); + affs_copy_name(AFFS_TAIL(sb, bh)->name, dentry); + AFFS_TAIL(sb, bh)->stype = cpu_to_be32(type); + AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); + + if (inode_bh) { + __be32 chain; + chain = AFFS_TAIL(sb, inode_bh)->link_chain; + AFFS_TAIL(sb, bh)->original = cpu_to_be32(inode->i_ino); + AFFS_TAIL(sb, bh)->link_chain = chain; + AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); + affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); + mark_buffer_dirty_inode(inode_bh, inode); + inode->i_nlink = 2; + ihold(inode); + } + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + dentry->d_fsdata = (void *)(long)bh->b_blocknr; + + affs_lock_dir(dir); + retval = affs_insert_hash(dir, bh); + mark_buffer_dirty_inode(bh, inode); + affs_unlock_dir(dir); + affs_unlock_link(inode); + + d_instantiate(dentry, inode); +done: + affs_brelse(inode_bh); + affs_brelse(bh); + return retval; +err: + if (block) + affs_free_block(sb, block); + affs_unlock_link(inode); + goto done; +} diff --git a/fs/affs/namei.c b/fs/affs/namei.c new file mode 100644 index 00000000..e3e9efc1 --- /dev/null +++ b/fs/affs/namei.c @@ -0,0 +1,458 @@ +/* + * linux/fs/affs/namei.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include "affs.h" + +typedef int (*toupper_t)(int); + +static int affs_toupper(int ch); +static int affs_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int affs_intl_toupper(int ch); +static int affs_intl_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +const struct dentry_operations affs_dentry_operations = { + .d_hash = affs_hash_dentry, + .d_compare = affs_compare_dentry, +}; + +const struct dentry_operations affs_intl_dentry_operations = { + .d_hash = affs_intl_hash_dentry, + .d_compare = affs_intl_compare_dentry, +}; + + +/* Simple toupper() for DOS\1 */ + +static int +affs_toupper(int ch) +{ + return ch >= 'a' && ch <= 'z' ? 
ch -= ('a' - 'A') : ch; +} + +/* International toupper() for DOS\3 ("international") */ + +static int +affs_intl_toupper(int ch) +{ + return (ch >= 'a' && ch <= 'z') || (ch >= 0xE0 + && ch <= 0xFE && ch != 0xF7) ? + ch - ('a' - 'A') : ch; +} + +static inline toupper_t +affs_get_toupper(struct super_block *sb) +{ + return AFFS_SB(sb)->s_flags & SF_INTL ? affs_intl_toupper : affs_toupper; +} + +/* + * Note: the dentry argument is the parent dentry. + */ +static inline int +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +{ + const u8 *name = qstr->name; + unsigned long hash; + int i; + + i = affs_check_name(qstr->name, qstr->len); + if (i) + return i; + + hash = init_name_hash(); + i = min(qstr->len, 30u); + for (; i > 0; name++, i--) + hash = partial_name_hash(toupper(*name), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +static int +affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return __affs_hash_dentry(qstr, affs_toupper); +} +static int +affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return __affs_hash_dentry(qstr, affs_intl_toupper); +} + +static inline int __affs_compare_dentry(unsigned int len, + const char *str, const struct qstr *name, toupper_t toupper) +{ + const u8 *aname = str; + const u8 *bname = name->name; + + /* + * 'str' is the name of an already existing dentry, so the name + * must be valid. 'name' must be validated first. + */ + + if (affs_check_name(name->name, name->len)) + return 1; + + /* + * If the names are longer than the allowed 30 chars, + * the excess is ignored, so their length may differ. + */ + if (len >= 30) { + if (name->len < 30) + return 1; + len = 30; + } else if (len != name->len) + return 1; + + for (; len > 0; len--) + if (toupper(*aname++) != toupper(*bname++)) + return 1; + + return 0; +} + +static int +affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return __affs_compare_dentry(len, str, name, affs_toupper); +} +static int +affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return __affs_compare_dentry(len, str, name, affs_intl_toupper); +} + +/* + * NOTE! unlike strncmp, affs_match returns 1 for success, 0 for failure. 
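The comparison rule implemented by __affs_compare_dentry() above (case-insensitive under the simple DOS\1 toupper, with only the first 30 characters significant) can be written as a stand-alone helper. An illustrative sketch, not kernel code:

#include <stdio.h>
#include <string.h>

static int toupper_dos1(int ch)
{
        return ch >= 'a' && ch <= 'z' ? ch - ('a' - 'A') : ch;
}

/* returns 0 on match, 1 otherwise, like __affs_compare_dentry() */
static int affs_name_cmp(const char *a, const char *b)
{
        size_t la = strlen(a), lb = strlen(b);

        if (la >= 30) {
                if (lb < 30)
                        return 1;
                la = 30;              /* anything past 30 characters is ignored */
        } else if (la != lb) {
                return 1;
        }

        while (la--)
                if (toupper_dos1(*a++) != toupper_dos1(*b++))
                        return 1;
        return 0;
}

int main(void)
{
        printf("%d\n", affs_name_cmp("Readme", "README"));   /* 0: match */
        printf("%d\n", affs_name_cmp("Readme", "README.1")); /* 1: different length */
        return 0;
}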
+ */ + +static inline int +affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) +{ + const u8 *name = dentry->d_name.name; + int len = dentry->d_name.len; + + if (len >= 30) { + if (*name2 < 30) + return 0; + len = 30; + } else if (len != *name2) + return 0; + + for (name2++; len > 0; len--) + if (toupper(*name++) != toupper(*name2++)) + return 0; + return 1; +} + +int +affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) +{ + toupper_t toupper = affs_get_toupper(sb); + int hash; + + hash = len = min(len, 30u); + for (; len > 0; len--) + hash = (hash * 13 + toupper(*name++)) & 0x7ff; + + return hash % AFFS_SB(sb)->s_hashsize; +} + +static struct buffer_head * +affs_find_entry(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + toupper_t toupper = affs_get_toupper(sb); + u32 key; + + pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name); + + bh = affs_bread(sb, dir->i_ino); + if (!bh) + return ERR_PTR(-EIO); + + key = be32_to_cpu(AFFS_HEAD(bh)->table[affs_hash_name(sb, dentry->d_name.name, dentry->d_name.len)]); + + for (;;) { + affs_brelse(bh); + if (key == 0) + return NULL; + bh = affs_bread(sb, key); + if (!bh) + return ERR_PTR(-EIO); + if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper)) + return bh; + key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); + } +} + +struct dentry * +affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct inode *inode = NULL; + + pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name); + + affs_lock_dir(dir); + bh = affs_find_entry(dir, dentry); + affs_unlock_dir(dir); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (bh) { + u32 ino = bh->b_blocknr; + + /* store the real header ino in d_fsdata for faster lookups */ + dentry->d_fsdata = (void *)(long)ino; + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + //link to dirs disabled + //case ST_LINKDIR: + case ST_LINKFILE: + ino = be32_to_cpu(AFFS_TAIL(sb, bh)->original); + } + affs_brelse(bh); + inode = affs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +int +affs_unlink(struct inode *dir, struct dentry *dentry) +{ + pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, + (int)dentry->d_name.len, dentry->d_name.name); + + return affs_remove_header(dentry); +} + +int +affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + int error; + + pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len, + dentry->d_name.name,mode); + + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_mode = mode; + mode_to_prot(inode); + mark_inode_dirty(inode); + + inode->i_op = &affs_file_inode_operations; + inode->i_fop = &affs_file_operations; + inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? 
&affs_aops_ofs : &affs_aops; + error = affs_add_entry(dir, inode, dentry, ST_FILE); + if (error) { + inode->i_nlink = 0; + iput(inode); + return error; + } + return 0; +} + +int +affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int error; + + pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino, + (int)dentry->d_name.len,dentry->d_name.name,mode); + + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_mode = S_IFDIR | mode; + mode_to_prot(inode); + + inode->i_op = &affs_dir_inode_operations; + inode->i_fop = &affs_dir_operations; + + error = affs_add_entry(dir, inode, dentry, ST_USERDIR); + if (error) { + inode->i_nlink = 0; + mark_inode_dirty(inode); + iput(inode); + return error; + } + return 0; +} + +int +affs_rmdir(struct inode *dir, struct dentry *dentry) +{ + pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, + (int)dentry->d_name.len, dentry->d_name.name); + + return affs_remove_header(dentry); +} + +int +affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct inode *inode; + char *p; + int i, maxlen, error; + char c, lc; + + pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino, + (int)dentry->d_name.len,dentry->d_name.name,symname); + + maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_op = &affs_symlink_inode_operations; + inode->i_data.a_ops = &affs_symlink_aops; + inode->i_mode = S_IFLNK | 0777; + mode_to_prot(inode); + + error = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto err; + i = 0; + p = (char *)AFFS_HEAD(bh)->table; + lc = '/'; + if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); + while (*symname == '/') + symname++; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); + } + while (i < maxlen && (c = *symname++)) { + if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { + *p++ = '/'; + i++; + symname += 2; + lc = '/'; + } else if (c == '.' 
&& lc == '/' && *symname == '/') { + symname++; + lc = '/'; + } else { + *p++ = c; + lc = c; + i++; + } + if (lc == '/') + while (*symname == '/') + symname++; + } + *p = 0; + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + mark_inode_dirty(inode); + + error = affs_add_entry(dir, inode, dentry, ST_SOFTLINK); + if (error) + goto err; + + return 0; + +err: + inode->i_nlink = 0; + mark_inode_dirty(inode); + iput(inode); + return error; +} + +int +affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino, + (int)dentry->d_name.len,dentry->d_name.name); + + return affs_add_entry(dir, inode, dentry, ST_LINKFILE); +} + +int +affs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct buffer_head *bh = NULL; + int retval; + + pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", + (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, + (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); + + retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); + if (retval) + return retval; + + /* Unlink destination if it already exists */ + if (new_dentry->d_inode) { + retval = affs_remove_header(new_dentry); + if (retval) + return retval; + } + + bh = affs_bread(sb, old_dentry->d_inode->i_ino); + if (!bh) + return -EIO; + + /* Remove header from its parent directory. */ + affs_lock_dir(old_dir); + retval = affs_remove_hash(old_dir, bh); + affs_unlock_dir(old_dir); + if (retval) + goto done; + + /* And insert it into the new directory with the new name. */ + affs_copy_name(AFFS_TAIL(sb, bh)->name, new_dentry); + affs_fix_checksum(sb, bh); + affs_lock_dir(new_dir); + retval = affs_insert_hash(new_dir, bh); + affs_unlock_dir(new_dir); + /* TODO: move it back to old_dir, if error? */ + +done: + mark_buffer_dirty_inode(bh, retval ? old_dir : new_dir); + affs_brelse(bh); + return retval; +} diff --git a/fs/affs/super.c b/fs/affs/super.c new file mode 100644 index 00000000..b31507d0 --- /dev/null +++ b/fs/affs/super.c @@ -0,0 +1,626 @@ +/* + * linux/fs/affs/inode.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. 
+ * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include "affs.h" + +extern struct timezone sys_tz; + +static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int affs_remount (struct super_block *sb, int *flags, char *data); + +static void +affs_commit_super(struct super_block *sb, int wait, int clean) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh = sbi->s_root_bh; + struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); + + tail->bm_flag = cpu_to_be32(clean); + secs_to_datestamp(get_seconds(), &tail->disk_change); + affs_fix_checksum(sb, bh); + mark_buffer_dirty(bh); + if (wait) + sync_dirty_buffer(bh); +} + +static void +affs_put_super(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + pr_debug("AFFS: put_super()\n"); + + if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) + affs_commit_super(sb, 1, 1); + + kfree(sbi->s_prefix); + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static void +affs_write_super(struct super_block *sb) +{ + lock_super(sb); + if (!(sb->s_flags & MS_RDONLY)) + affs_commit_super(sb, 1, 2); + sb->s_dirt = 0; + unlock_super(sb); + + pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds()); +} + +static int +affs_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + affs_commit_super(sb, wait, 2); + sb->s_dirt = 0; + unlock_super(sb); + return 0; +} + +static struct kmem_cache * affs_inode_cachep; + +static struct inode *affs_alloc_inode(struct super_block *sb) +{ + struct affs_inode_info *i; + + i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + if (!i) + return NULL; + + i->vfs_inode.i_version = 1; + i->i_lc = NULL; + i->i_ext_bh = NULL; + i->i_pa_cnt = 0; + + return &i->vfs_inode; +} + +static void affs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); +} + +static void affs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, affs_i_callback); +} + +static void init_once(void *foo) +{ + struct affs_inode_info *ei = (struct affs_inode_info *) foo; + + sema_init(&ei->i_link_lock, 1); + sema_init(&ei->i_ext_lock, 1); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + affs_inode_cachep = kmem_cache_create("affs_inode_cache", + sizeof(struct affs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (affs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(affs_inode_cachep); +} + +static const struct super_operations affs_sops = { + .alloc_inode = affs_alloc_inode, + .destroy_inode = affs_destroy_inode, + .write_inode = affs_write_inode, + .evict_inode = affs_evict_inode, + .put_super = affs_put_super, + .write_super = affs_write_super, + .sync_fs = affs_sync_fs, + .statfs = affs_statfs, + .remount_fs = affs_remount, + .show_options = generic_show_options, +}; + +enum { + Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, + Opt_verbose, Opt_volume, Opt_ignore, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_bs, "bs=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_mufs, "mufs"}, + {Opt_prefix, "prefix=%s"}, + {Opt_protect, "protect"}, + {Opt_reserved, "reserved=%u"}, + {Opt_root, "root=%u"}, + {Opt_setgid, "setgid=%u"}, + {Opt_setuid, 
"setuid=%u"}, + {Opt_verbose, "verbose"}, + {Opt_volume, "volume=%s"}, + {Opt_ignore, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, + {Opt_err, NULL}, +}; + +static int +parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root, + int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + /* Fill in defaults */ + + *uid = current_uid(); + *gid = current_gid(); + *reserved = 2; + *root = -1; + *blocksize = -1; + volume[0] = ':'; + volume[1] = 0; + *mount_opts = 0; + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token, n, option; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bs: + if (match_int(&args[0], &n)) + return 0; + if (n != 512 && n != 1024 && n != 2048 + && n != 4096) { + printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + return 0; + } + *blocksize = n; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return 0; + *mode = option & 0777; + *mount_opts |= SF_SETMODE; + break; + case Opt_mufs: + *mount_opts |= SF_MUFS; + break; + case Opt_prefix: + *prefix = match_strdup(&args[0]); + if (!*prefix) + return 0; + *mount_opts |= SF_PREFIX; + break; + case Opt_protect: + *mount_opts |= SF_IMMUTABLE; + break; + case Opt_reserved: + if (match_int(&args[0], reserved)) + return 0; + break; + case Opt_root: + if (match_int(&args[0], root)) + return 0; + break; + case Opt_setgid: + if (match_int(&args[0], &option)) + return 0; + *gid = option; + *mount_opts |= SF_SETGID; + break; + case Opt_setuid: + if (match_int(&args[0], &option)) + return 0; + *uid = option; + *mount_opts |= SF_SETUID; + break; + case Opt_verbose: + *mount_opts |= SF_VERBOSE; + break; + case Opt_volume: { + char *vol = match_strdup(&args[0]); + if (!vol) + return 0; + strlcpy(volume, vol, 32); + kfree(vol); + break; + } + case Opt_ignore: + /* Silently ignore the quota options */ + break; + default: + printk("AFFS: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* This function definitely needs to be split up. Some fine day I'll + * hopefully have the guts to do so. Until then: sorry for the mess. + */ + +static int affs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct affs_sb_info *sbi; + struct buffer_head *root_bh = NULL; + struct buffer_head *boot_bh; + struct inode *root_inode = NULL; + s32 root_block; + int size, blocksize; + u32 chksum; + int num_bm; + int i, j; + s32 key; + uid_t uid; + gid_t gid; + int reserved; + unsigned long mount_flags; + int tmp_flags; /* fix remount prototype... */ + u8 sig[4]; + int ret = -EINVAL; + + save_mount_options(sb, data); + + pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options"); + + sb->s_magic = AFFS_SUPER_MAGIC; + sb->s_op = &affs_sops; + sb->s_flags |= MS_NODIRATIME; + + sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); + + if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, + &blocksize,&sbi->s_prefix, + sbi->s_volume, &mount_flags)) { + printk(KERN_ERR "AFFS: Error parsing options\n"); + kfree(sbi->s_prefix); + kfree(sbi); + return -EINVAL; + } + /* N.B. 
after this point s_prefix must be released */ + + sbi->s_flags = mount_flags; + sbi->s_mode = i; + sbi->s_uid = uid; + sbi->s_gid = gid; + sbi->s_reserved= reserved; + + /* Get the size of the device in 512-byte blocks. + * If we later see that the partition uses bigger + * blocks, we will have to change it. + */ + + size = sb->s_bdev->bd_inode->i_size >> 9; + pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size); + + affs_set_blocksize(sb, PAGE_SIZE); + /* Try to find root block. Its location depends on the block size. */ + + i = 512; + j = 4096; + if (blocksize > 0) { + i = j = blocksize; + size = size / (blocksize / 512); + } + for (blocksize = i, key = 0; blocksize <= j; blocksize <<= 1, size >>= 1) { + sbi->s_root_block = root_block; + if (root_block < 0) + sbi->s_root_block = (reserved + size - 1) / 2; + pr_debug("AFFS: setting blocksize to %d\n", blocksize); + affs_set_blocksize(sb, blocksize); + sbi->s_partition_size = size; + + /* The root block location that was calculated above is not + * correct if the partition size is an odd number of 512- + * byte blocks, which will be rounded down to a number of + * 1024-byte blocks, and if there were an even number of + * reserved blocks. Ideally, all partition checkers should + * report the real number of blocks of the real blocksize, + * but since this just cannot be done, we have to try to + * find the root block anyways. In the above case, it is one + * block behind the calculated one. So we check this one, too. + */ + for (num_bm = 0; num_bm < 2; num_bm++) { + pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, " + "size=%d, reserved=%d\n", + sb->s_id, + sbi->s_root_block + num_bm, + blocksize, size, reserved); + root_bh = affs_bread(sb, sbi->s_root_block + num_bm); + if (!root_bh) + continue; + if (!affs_checksum_block(sb, root_bh) && + be32_to_cpu(AFFS_ROOT_HEAD(root_bh)->ptype) == T_SHORT && + be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) { + sbi->s_hashsize = blocksize / 4 - 56; + sbi->s_root_block += num_bm; + key = 1; + goto got_root; + } + affs_brelse(root_bh); + root_bh = NULL; + } + } + if (!silent) + printk(KERN_ERR "AFFS: No valid root block on device %s\n", + sb->s_id); + goto out_error; + + /* N.B. after this point bh must be released */ +got_root: + root_block = sbi->s_root_block; + + /* Find out which kind of FS we have */ + boot_bh = sb_bread(sb, 0); + if (!boot_bh) { + printk(KERN_ERR "AFFS: Cannot read boot block\n"); + goto out_error; + } + memcpy(sig, boot_bh->b_data, 4); + brelse(boot_bh); + chksum = be32_to_cpu(*(__be32 *)sig); + + /* Dircache filesystems are compatible with non-dircache ones + * when reading. As long as they aren't supported, writing is + * not recommended. 
+ */ + if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS + || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) { + printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n", + sb->s_id); + sb->s_flags |= MS_RDONLY; + } + switch (chksum) { + case MUFS_FS: + case MUFS_INTLFFS: + case MUFS_DCFFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_INTLFFS: + case FS_DCFFS: + sbi->s_flags |= SF_INTL; + break; + case MUFS_FFS: + sbi->s_flags |= SF_MUFS; + break; + case FS_FFS: + break; + case MUFS_OFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_OFS: + sbi->s_flags |= SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + case MUFS_DCOFS: + case MUFS_INTLOFS: + sbi->s_flags |= SF_MUFS; + case FS_DCOFS: + case FS_INTLOFS: + sbi->s_flags |= SF_INTL | SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + default: + printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); + goto out_error; + } + + if (mount_flags & SF_VERBOSE) { + u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; + printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", + len > 31 ? 31 : len, + AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1, + sig, sig[3] + '0', blocksize); + } + + sb->s_flags |= MS_NODEV | MS_NOSUID; + + sbi->s_data_blksize = sb->s_blocksize; + if (sbi->s_flags & SF_OFS) + sbi->s_data_blksize -= 24; + + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; + /* N.B. after this point s_root_bh must be released */ + + tmp_flags = sb->s_flags; + if (affs_init_bitmap(sb, &tmp_flags)) + goto out_error; + sb->s_flags = tmp_flags; + + /* set up enough so that it can read an inode */ + + root_inode = affs_iget(sb, root_block); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_error_noinode; + } + + if (AFFS_SB(sb)->s_flags & SF_INTL) + sb->s_d_op = &affs_intl_dentry_operations; + else + sb->s_d_op = &affs_dentry_operations; + + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + printk(KERN_ERR "AFFS: Get root inode failed\n"); + goto out_error; + } + + pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); + return 0; + + /* + * Begin the cascaded cleanup ... 
+ */ +out_error: + if (root_inode) + iput(root_inode); +out_error_noinode: + kfree(sbi->s_bitmap); + affs_brelse(root_bh); + kfree(sbi->s_prefix); + kfree(sbi); + sb->s_fs_info = NULL; + return ret; +} + +static int +affs_remount(struct super_block *sb, int *flags, char *data) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + int blocksize; + uid_t uid; + gid_t gid; + int mode; + int reserved; + int root_block; + unsigned long mount_flags; + int res = 0; + char *new_opts = kstrdup(data, GFP_KERNEL); + char volume[32]; + char *prefix = NULL; + + pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + + *flags |= MS_NODIRATIME; + + memcpy(volume, sbi->s_volume, 32); + if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, + &blocksize, &prefix, volume, + &mount_flags)) { + kfree(prefix); + kfree(new_opts); + return -EINVAL; + } + + replace_mount_options(sb, new_opts); + + sbi->s_flags = mount_flags; + sbi->s_mode = mode; + sbi->s_uid = uid; + sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + affs_write_super(sb); + affs_free_bitmap(sb); + } else + res = affs_init_bitmap(sb, flags); + + return res; +} + +static int +affs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + int free; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, + AFFS_SB(sb)->s_reserved); + + free = affs_count_free_blocks(sb); + buf->f_type = AFFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; + buf->f_bfree = free; + buf->f_bavail = free; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 30; + return 0; +} + +static struct dentry *affs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); +} + +static struct file_system_type affs_fs_type = { + .owner = THIS_MODULE, + .name = "affs", + .mount = affs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_affs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&affs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_affs_fs(void) +{ + unregister_filesystem(&affs_fs_type); + destroy_inodecache(); +} + +MODULE_DESCRIPTION("Amiga filesystem support for Linux"); +MODULE_LICENSE("GPL"); + +module_init(init_affs_fs) +module_exit(exit_affs_fs) diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c new file mode 100644 index 00000000..ee00f08c --- /dev/null +++ b/fs/affs/symlink.c @@ -0,0 +1,81 @@ +/* + * linux/fs/affs/symlink.c + * + * 1995 Hans-Joachim Widmaier - Modified for affs. 
+ * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * affs symlink handling code + */ + +#include "affs.h" + +static int affs_symlink_readpage(struct file *file, struct page *page) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + char *link = kmap(page); + struct slink_front *lf; + int err; + int i, j; + char c; + char lc; + + pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); + + err = -EIO; + bh = affs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto fail; + i = 0; + j = 0; + lf = (struct slink_front *)bh->b_data; + lc = 0; + + if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; + while (i < 1023 && (c = pf[i])) + link[i++] = c; + spin_unlock(&sbi->symlink_lock); + while (i < 1023 && lf->symname[j] != ':') + link[i++] = lf->symname[j++]; + if (i < 1023) + link[i++] = '/'; + j++; + lc = '/'; + } + while (i < 1023 && (c = lf->symname[j])) { + if (c == '/' && lc == '/' && i < 1020) { /* parent dir */ + link[i++] = '.'; + link[i++] = '.'; + } + link[i++] = c; + lc = c; + j++; + } + link[i] = '\0'; + affs_brelse(bh); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations affs_symlink_aops = { + .readpage = affs_symlink_readpage, +}; + +const struct inode_operations affs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = affs_notify_change, +}; diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig new file mode 100644 index 00000000..8f975f25 --- /dev/null +++ b/fs/afs/Kconfig @@ -0,0 +1,30 @@ +config AFS_FS + tristate "Andrew File System support (AFS) (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select AF_RXRPC + select DNS_RESOLVER + help + If you say Y here, you will get an experimental Andrew File System + driver. It currently only supports unsecured read-only AFS access. + + See for more information. + + If unsure, say N. + +config AFS_DEBUG + bool "AFS dynamic debugging" + depends on AFS_FS + help + Say Y here to make runtime controllable debugging messages appear. + + See for more information. + + If unsure, say N. + +config AFS_FSCACHE + bool "Provide AFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y + help + Say Y here if you want AFS data to be cached locally on disk through + the generic filesystem cache manager diff --git a/fs/afs/Makefile b/fs/afs/Makefile new file mode 100644 index 00000000..4f64b95d --- /dev/null +++ b/fs/afs/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for Red Hat Linux AFS client. +# + +afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o + +kafs-objs := \ + $(afs-cache-y) \ + callback.o \ + cell.o \ + cmservice.o \ + dir.o \ + file.o \ + flock.o \ + fsclient.o \ + inode.o \ + main.o \ + misc.o \ + mntpt.o \ + proc.o \ + rxrpc.o \ + security.o \ + server.o \ + super.o \ + netdevices.o \ + vlclient.o \ + vlocation.o \ + vnode.o \ + volume.o \ + write.o + +obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/afs.h b/fs/afs/afs.h new file mode 100644 index 00000000..c548aa34 --- /dev/null +++ b/fs/afs/afs.h @@ -0,0 +1,177 @@ +/* AFS common types + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_H +#define AFS_H + +#include + +#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */ +#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */ +#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */ +#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */ +#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */ + +typedef unsigned afs_volid_t; +typedef unsigned afs_vnodeid_t; +typedef unsigned long long afs_dataversion_t; + +typedef enum { + AFSVL_RWVOL, /* read/write volume */ + AFSVL_ROVOL, /* read-only volume */ + AFSVL_BACKVOL, /* backup volume */ +} __attribute__((packed)) afs_voltype_t; + +typedef enum { + AFS_FTYPE_INVALID = 0, + AFS_FTYPE_FILE = 1, + AFS_FTYPE_DIR = 2, + AFS_FTYPE_SYMLINK = 3, +} afs_file_type_t; + +typedef enum { + AFS_LOCK_READ = 0, /* read lock request */ + AFS_LOCK_WRITE = 1, /* write lock request */ +} afs_lock_type_t; + +#define AFS_LOCKWAIT (5 * 60) /* time until a lock times out (seconds) */ + +/* + * AFS file identifier + */ +struct afs_fid { + afs_volid_t vid; /* volume ID */ + afs_vnodeid_t vnode; /* file index within volume */ + unsigned unique; /* unique ID number (file index version) */ +}; + +/* + * AFS callback notification + */ +typedef enum { + AFSCM_CB_UNTYPED = 0, /* no type set on CB break */ + AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */ + AFSCM_CB_SHARED = 2, /* CB shared by other CM's */ + AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */ +} afs_callback_type_t; + +struct afs_callback { + struct afs_fid fid; /* file identifier */ + unsigned version; /* callback version */ + unsigned expiry; /* time at which expires */ + afs_callback_type_t type; /* type of callback */ +}; + +#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ + +/* + * AFS volume information + */ +struct afs_volume_info { + afs_volid_t vid; /* volume ID */ + afs_voltype_t type; /* type of this volume */ + afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */ + + /* list of fileservers serving this volume */ + size_t nservers; /* number of entries used in servers[] */ + struct { + struct in_addr addr; /* fileserver address */ + } servers[8]; +}; + +/* + * AFS security ACE access mask + */ +typedef u32 afs_access_t; +#define AFS_ACE_READ 0x00000001U /* - permission to read a file/dir */ +#define AFS_ACE_WRITE 0x00000002U /* - permission to write/chmod a file */ +#define AFS_ACE_INSERT 0x00000004U /* - permission to create dirent in a dir */ +#define AFS_ACE_LOOKUP 0x00000008U /* - permission to lookup a file/dir in a dir */ +#define AFS_ACE_DELETE 0x00000010U /* - permission to delete a dirent from a dir */ +#define AFS_ACE_LOCK 0x00000020U /* - permission to lock a file */ +#define AFS_ACE_ADMINISTER 0x00000040U /* - permission to change ACL */ +#define AFS_ACE_USER_A 0x01000000U /* - 'A' user-defined permission */ +#define AFS_ACE_USER_B 0x02000000U /* - 'B' user-defined permission */ +#define AFS_ACE_USER_C 0x04000000U /* - 'C' user-defined permission */ +#define AFS_ACE_USER_D 0x08000000U /* - 'D' user-defined permission */ +#define AFS_ACE_USER_E 0x10000000U /* - 'E' user-defined permission */ +#define AFS_ACE_USER_F 
0x20000000U /* - 'F' user-defined permission */ +#define AFS_ACE_USER_G 0x40000000U /* - 'G' user-defined permission */ +#define AFS_ACE_USER_H 0x80000000U /* - 'H' user-defined permission */ + +/* + * AFS file status information + */ +struct afs_file_status { + unsigned if_version; /* interface version */ +#define AFS_FSTATUS_VERSION 1 + + afs_file_type_t type; /* file type */ + unsigned nlink; /* link count */ + u64 size; /* file size */ + afs_dataversion_t data_version; /* current data version */ + u32 author; /* author ID */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ + afs_access_t caller_access; /* access rights for authenticated caller */ + afs_access_t anon_access; /* access rights for unauthenticated caller */ + umode_t mode; /* UNIX mode */ + struct afs_fid parent; /* parent dir ID for non-dirs only */ + time_t mtime_client; /* last time client changed data */ + time_t mtime_server; /* last time server changed data */ + s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ +}; + +/* + * AFS file status change request + */ +struct afs_store_status { + u32 mask; /* which bits of the struct are set */ + u32 mtime_client; /* last time client changed data */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ + umode_t mode; /* UNIX mode */ +}; + +#define AFS_SET_MTIME 0x01 /* set the mtime */ +#define AFS_SET_OWNER 0x02 /* set the owner ID */ +#define AFS_SET_GROUP 0x04 /* set the group ID (unsupported?) */ +#define AFS_SET_MODE 0x08 /* set the UNIX mode */ +#define AFS_SET_SEG_SIZE 0x10 /* set the segment size (unsupported) */ + +/* + * AFS volume synchronisation information + */ +struct afs_volsync { + time_t creation; /* volume creation time */ +}; + +/* + * AFS volume status record + */ +struct afs_volume_status { + u32 vid; /* volume ID */ + u32 parent_id; /* parent volume ID */ + u8 online; /* true if volume currently online and available */ + u8 in_service; /* true if volume currently in service */ + u8 blessed; /* same as in_service */ + u8 needs_salvage; /* true if consistency checking required */ + u32 type; /* volume type (afs_voltype_t) */ + u32 min_quota; /* minimum space set aside (blocks) */ + u32 max_quota; /* maximum space this volume may occupy (blocks) */ + u32 blocks_in_use; /* space this volume currently occupies (blocks) */ + u32 part_blocks_avail; /* space available in volume's partition */ + u32 part_max_blocks; /* size of volume's partition */ +}; + +#define AFS_BLOCK_SIZE 1024 + +#endif /* AFS_H */ diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h new file mode 100644 index 00000000..255f5dd6 --- /dev/null +++ b/fs/afs/afs_cm.h @@ -0,0 +1,33 @@ +/* AFS Cache Manager definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef AFS_CM_H +#define AFS_CM_H + +#define AFS_CM_PORT 7001 /* AFS cache manager port */ +#define CM_SERVICE 1 /* AFS Cache Manager service ID */ + +enum AFS_CM_Operations { + CBCallBack = 204, /* break callback promises */ + CBInitCallBackState = 205, /* initialise callback state */ + CBProbe = 206, /* probe client */ + CBGetLock = 207, /* get contents of CM lock table */ + CBGetCE = 208, /* get cache file description */ + CBGetXStatsVersion = 209, /* get version of extended statistics */ + CBGetXStats = 210, /* get contents of extended statistics data */ + CBInitCallBackState3 = 213, /* initialise callback state, version 3 */ + CBProbeUuid = 214, /* check the client hasn't rebooted */ + CBTellMeAboutYourself = 65538, /* get client capabilities */ +}; + +#define AFS_CAP_ERROR_TRANSLATION 0x1 + +#endif /* AFS_CM_H */ diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h new file mode 100644 index 00000000..eb647323 --- /dev/null +++ b/fs/afs/afs_fs.h @@ -0,0 +1,56 @@ +/* AFS File Service definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_FS_H +#define AFS_FS_H + +#define AFS_FS_PORT 7000 /* AFS file server port */ +#define FS_SERVICE 1 /* AFS File Service ID */ + +enum AFS_FS_Operations { + FSFETCHDATA = 130, /* AFS Fetch file data */ + FSFETCHSTATUS = 132, /* AFS Fetch file status */ + FSSTOREDATA = 133, /* AFS Store file data */ + FSSTORESTATUS = 135, /* AFS Store file status */ + FSREMOVEFILE = 136, /* AFS Remove a file */ + FSCREATEFILE = 137, /* AFS Create a file */ + FSRENAME = 138, /* AFS Rename or move a file or directory */ + FSSYMLINK = 139, /* AFS Create a symbolic link */ + FSLINK = 140, /* AFS Create a hard link */ + FSMAKEDIR = 141, /* AFS Create a directory */ + FSREMOVEDIR = 142, /* AFS Remove a directory */ + FSGIVEUPCALLBACKS = 147, /* AFS Discard callback promises */ + FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ + FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ + FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSSETLOCK = 156, /* AFS Request a file lock */ + FSEXTENDLOCK = 157, /* AFS Extend a file lock */ + FSRELEASELOCK = 158, /* AFS Release a file lock */ + FSLOOKUP = 161, /* AFS lookup file in directory */ + FSFETCHDATA64 = 65537, /* AFS Fetch file data */ + FSSTOREDATA64 = 65538, /* AFS Store file data */ +}; + +enum AFS_FS_Errors { + VSALVAGE = 101, /* volume needs salvaging */ + VNOVNODE = 102, /* no such file/dir (vnode) */ + VNOVOL = 103, /* no such volume or volume unavailable */ + VVOLEXISTS = 104, /* volume name already exists */ + VNOSERVICE = 105, /* volume not currently in service */ + VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */ + VONLINE = 107, /* volume is already online */ + VDISKFULL = 108, /* disk partition is full */ + VOVERQUOTA = 109, /* volume's maximum quota exceeded */ + VBUSY = 110, /* volume is temporarily unavailable */ + VMOVED = 111, /* volume moved to new server - ask this FS where */ +}; + +#endif /* AFS_FS_H */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h new file mode 100644 index 00000000..8bbefe00 --- /dev/null +++ b/fs/afs/afs_vl.h @@ -0,0 +1,84 @@ +/* AFS Volume Location Service client interface + * + * 
Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_VL_H +#define AFS_VL_H + +#include "afs.h" + +#define AFS_VL_PORT 7003 /* volume location service port */ +#define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ + +enum AFSVL_Operations { + VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */ + VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */ + VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */ +}; + +enum AFSVL_Errors { + AFSVL_IDEXIST = 363520, /* Volume Id entry exists in vl database */ + AFSVL_IO = 363521, /* I/O related error */ + AFSVL_NAMEEXIST = 363522, /* Volume name entry exists in vl database */ + AFSVL_CREATEFAIL = 363523, /* Internal creation failure */ + AFSVL_NOENT = 363524, /* No such entry */ + AFSVL_EMPTY = 363525, /* Vl database is empty */ + AFSVL_ENTDELETED = 363526, /* Entry is deleted (soft delete) */ + AFSVL_BADNAME = 363527, /* Volume name is illegal */ + AFSVL_BADINDEX = 363528, /* Index is out of range */ + AFSVL_BADVOLTYPE = 363529, /* Bad volume type */ + AFSVL_BADSERVER = 363530, /* Illegal server number (out of range) */ + AFSVL_BADPARTITION = 363531, /* Bad partition number */ + AFSVL_REPSFULL = 363532, /* Run out of space for Replication sites */ + AFSVL_NOREPSERVER = 363533, /* No such Replication server site exists */ + AFSVL_DUPREPSERVER = 363534, /* Replication site already exists */ + AFSVL_RWNOTFOUND = 363535, /* Parent R/W entry not found */ + AFSVL_BADREFCOUNT = 363536, /* Illegal Reference Count number */ + AFSVL_SIZEEXCEEDED = 363537, /* Vl size for attributes exceeded */ + AFSVL_BADENTRY = 363538, /* Bad incoming vl entry */ + AFSVL_BADVOLIDBUMP = 363539, /* Illegal max volid increment */ + AFSVL_IDALREADYHASHED = 363540, /* RO/BACK id already hashed */ + AFSVL_ENTRYLOCKED = 363541, /* Vl entry is already locked */ + AFSVL_BADVOLOPER = 363542, /* Bad volume operation code */ + AFSVL_BADRELLOCKTYPE = 363543, /* Bad release lock type */ + AFSVL_RERELEASE = 363544, /* Status report: last release was aborted */ + AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server flag */ + AFSVL_PERM = 363546, /* No permission access */ + AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ +}; + +/* + * maps to "struct vldbentry" in vvl-spec.pdf + */ +struct afs_vldbentry { + char name[65]; /* name of volume (with NUL char) */ + afs_voltype_t type; /* volume type */ + unsigned num_servers; /* num servers that hold instances of this vol */ + unsigned clone_id; /* cloning ID */ + + unsigned flags; +#define AFS_VLF_RWEXISTS 0x1000 /* R/W volume exists */ +#define AFS_VLF_ROEXISTS 0x2000 /* R/O volume exists */ +#define AFS_VLF_BACKEXISTS 0x4000 /* backup volume exists */ + + afs_volid_t volume_ids[3]; /* volume IDs */ + + struct { + struct in_addr addr; /* server address */ + unsigned partition; /* partition ID on this server */ + unsigned flags; /* server specific flags */ +#define AFS_VLSF_NEWREPSITE 0x0001 /* unused */ +#define AFS_VLSF_ROVOL 0x0002 /* this server holds a R/O instance of the volume */ +#define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */ +#define AFS_VLSF_BACKVOL 0x0008 /* this 
server holds a backup instance of the volume */ + } servers[8]; +}; + +#endif /* AFS_VL_H */ diff --git a/fs/afs/cache.c b/fs/afs/cache.c new file mode 100644 index 00000000..577763c3 --- /dev/null +++ b/fs/afs/cache.c @@ -0,0 +1,402 @@ +/* AFS caching stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include "internal.h" + +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); + +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vlocation_cache_check_aux( + void *cookie_netfs_data, const void *buffer, uint16_t buflen); + +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); + +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size); +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); + +struct fscache_netfs afs_cache_netfs = { + .name = "afs", + .version = 0, +}; + +struct fscache_cookie_def afs_cell_cache_index_def = { + .name = "AFS.cell", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_cell_cache_get_key, + .get_aux = afs_cell_cache_get_aux, + .check_aux = afs_cell_cache_check_aux, +}; + +struct fscache_cookie_def afs_vlocation_cache_index_def = { + .name = "AFS.vldb", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_vlocation_cache_get_key, + .get_aux = afs_vlocation_cache_get_aux, + .check_aux = afs_vlocation_cache_check_aux, +}; + +struct fscache_cookie_def afs_volume_cache_index_def = { + .name = "AFS.volume", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_volume_cache_get_key, +}; + +struct fscache_cookie_def afs_vnode_cache_index_def = { + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = afs_vnode_cache_get_key, + .get_attr = afs_vnode_cache_get_attr, + .get_aux = afs_vnode_cache_get_aux, + .check_aux = afs_vnode_cache_check_aux, + .now_uncached = afs_vnode_cache_now_uncached, +}; + +/* + * set the key for the index entry + */ +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_cell *cell = cookie_netfs_data; + uint16_t klen; + + _enter("%p,%p,%u", cell, buffer, bufmax); + + klen = strlen(cell->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, cell->name, klen); + return klen; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, 
uint16_t bufmax) +{ + const struct afs_cell *cell = cookie_netfs_data; + uint16_t dlen; + + _enter("%p,%p,%u", cell, buffer, bufmax); + + dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); + dlen = min(dlen, bufmax); + dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); + + memcpy(buffer, cell->vl_addrs, dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} + +/*****************************************************************************/ +/* + * set the key for the index entry + */ +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t klen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); + if (klen > bufmax) + return 0; + + memcpy(buffer, vlocation->vldb.name, klen); + + _leave(" = %u", klen); + return klen; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen > bufmax) + return 0; + + memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static +enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct afs_cache_vlocation *cvldb; + struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen != buflen) + return FSCACHE_CHECKAUX_OBSOLETE; + + cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); + + /* if what's on disk is more valid than what's in memory, then use the + * VL record from the cache */ + if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { + memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); + vlocation->valid = 1; + _leave(" = SUCCESS [c->m]"); + return FSCACHE_CHECKAUX_OKAY; + } + + /* need to update the cache if the cached info differs */ + if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { + /* delete if the volume IDs for this name differ */ + if (memcmp(&vlocation->vldb.vid, &cvldb->vid, + sizeof(cvldb->vid)) != 0 + ) { + _leave(" = OBSOLETE"); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = UPDATE"); + return FSCACHE_CHECKAUX_NEEDS_UPDATE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} + +/*****************************************************************************/ +/* + * set the key for the volume index entry + */ +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_volume *volume = cookie_netfs_data; + uint16_t klen; + + _enter("{%u},%p,%u", volume->type, buffer, bufmax); + + klen 
= sizeof(volume->type); + if (klen > bufmax) + return 0; + + memcpy(buffer, &volume->type, sizeof(volume->type)); + + _leave(" = %u", klen); + return klen; + +} + +/*****************************************************************************/ +/* + * set the key for the index entry + */ +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t klen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + klen = sizeof(vnode->fid.vnode); + if (klen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); + + _leave(" = %u", klen); + return klen; +} + +/* + * provide updated file attributes + */ +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + + _enter("{%x,%x,%llx},", + vnode->fid.vnode, vnode->fid.unique, + vnode->status.data_version); + + *size = vnode->status.size; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%Lx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); + buffer += sizeof(vnode->fid.unique); + memcpy(buffer, &vnode->status.data_version, + sizeof(vnode->status.data_version)); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen != buflen) { + _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer, + &vnode->fid.unique, + sizeof(vnode->fid.unique) + ) != 0) { + unsigned unique; + + memcpy(&unique, buffer, sizeof(unique)); + + _leave(" = OBSOLETE [uniq %x != %x]", + unique, vnode->fid.unique); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer + sizeof(vnode->fid.unique), + &vnode->status.data_version, + sizeof(vnode->status.data_version) + ) != 0) { + afs_dataversion_t version; + + memcpy(&version, buffer + sizeof(vnode->fid.unique), + sizeof(version)); + + _leave(" = OBSOLETE [vers %llx != %llx]", + version, vnode->status.data_version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = SUCCESS"); + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * indication the cookie is no longer uncached + * - this function is called when the backing store currently caching a cookie + * is removed + * - the netfs should use this to clean up any markers indicating cached pages + * - this is mandatory for any object that may have data + */ +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) +{ + struct afs_vnode *vnode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t 
first; + int loop, nr_pages; + + _enter("{%x,%x,%Lx}", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + /* grab a bunch of pages to clean */ + nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } + + _leave(""); +} diff --git a/fs/afs/callback.c b/fs/afs/callback.c new file mode 100644 index 00000000..587ef512 --- /dev/null +++ b/fs/afs/callback.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Woodhouse + * David Howells + * + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#if 0 +unsigned afs_vnode_update_timeout = 10; +#endif /* 0 */ + +#define afs_breakring_space(server) \ + CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \ + ARRAY_SIZE((server)->cb_break)) + +//static void afs_callback_updater(struct work_struct *); + +static struct workqueue_struct *afs_callback_update_worker; + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +void afs_init_callback_state(struct afs_server *server) +{ + struct afs_vnode *vnode; + + _enter("{%p}", server); + + spin_lock(&server->cb_lock); + + /* kill all the promises on record from this server */ + while (!RB_EMPTY_ROOT(&server->cb_promises)) { + vnode = rb_entry(server->cb_promises.rb_node, + struct afs_vnode, cb_promise); + _debug("UNPROMISE { vid=%x:%u uq=%u}", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * handle the data invalidation side of a callback being broken + */ +void afs_broken_callback_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, cb_broken_work); + + _enter(""); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + return; + + /* we're only interested in dealing with a broken callback on *this* + * vnode and only if no-one else has dealt with it yet */ + if (!mutex_trylock(&vnode->validate_lock)) + return; /* someone else is dealing with it */ + + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + if (S_ISDIR(vnode->vfs_inode.i_mode)) + afs_clear_permits(vnode); + + if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0) + goto out; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto out; + + /* if the vnode's data version number changed then its contents + * are different */ + if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + afs_zap_data(vnode); + } + +out: + mutex_unlock(&vnode->validate_lock); + + /* avoid the potential race whereby the mutex_trylock() in this + * function happens again between the clear_bit() and the + * mutex_unlock() */ + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + _debug("requeue"); + queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + } + _leave(""); +} + +/* + * 
actually break a callback + */ +static void afs_break_callback(struct afs_server *server, + struct afs_vnode *vnode) +{ + _enter(""); + + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + + if (vnode->cb_promised) { + spin_lock(&vnode->lock); + + _debug("break callback"); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + + queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + if (list_empty(&vnode->granted_locks) && + !list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + spin_unlock(&vnode->lock); + } +} + +/* + * allow the fileserver to explicitly break one callback + * - happens when + * - the backing file is changed + * - a lock is released + */ +static void afs_break_one_callback(struct afs_server *server, + struct afs_fid *fid) +{ + struct afs_vnode *vnode; + struct rb_node *p; + + _debug("find"); + spin_lock(&server->fs_lock); + p = server->fs_vnodes.rb_node; + while (p) { + vnode = rb_entry(p, struct afs_vnode, server_rb); + if (fid->vid < vnode->fid.vid) + p = p->rb_left; + else if (fid->vid > vnode->fid.vid) + p = p->rb_right; + else if (fid->vnode < vnode->fid.vnode) + p = p->rb_left; + else if (fid->vnode > vnode->fid.vnode) + p = p->rb_right; + else if (fid->unique < vnode->fid.unique) + p = p->rb_left; + else if (fid->unique > vnode->fid.unique) + p = p->rb_right; + else + goto found; + } + + /* not found so we just ignore it (it may have moved to another + * server) */ +not_available: + _debug("not avail"); + spin_unlock(&server->fs_lock); + _leave(""); + return; + +found: + _debug("found"); + ASSERTCMP(server, ==, vnode->server); + + if (!igrab(AFS_VNODE_TO_I(vnode))) + goto not_available; + spin_unlock(&server->fs_lock); + + afs_break_callback(server, vnode); + iput(&vnode->vfs_inode); + _leave(""); +} + +/* + * allow the fileserver to break callback promises + */ +void afs_break_callbacks(struct afs_server *server, size_t count, + struct afs_callback callbacks[]) +{ + _enter("%p,%zu,", server, count); + + ASSERT(server != NULL); + ASSERTCMP(count, <=, AFSCBMAX); + + for (; count > 0; callbacks++, count--) { + _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", + callbacks->fid.vid, + callbacks->fid.vnode, + callbacks->fid.unique, + callbacks->version, + callbacks->expiry, + callbacks->type + ); + afs_break_one_callback(server, &callbacks->fid); + } + + _leave(""); + return; +} + +/* + * record the callback for breaking + * - the caller must hold server->cb_lock + */ +static void afs_do_give_up_callback(struct afs_server *server, + struct afs_vnode *vnode) +{ + struct afs_callback *cb; + + _enter("%p,%p", server, vnode); + + cb = &server->cb_break[server->cb_break_head]; + cb->fid = vnode->fid; + cb->version = vnode->cb_version; + cb->expiry = vnode->cb_expiry; + cb->type = vnode->cb_type; + smp_wmb(); + server->cb_break_head = + (server->cb_break_head + 1) & + (ARRAY_SIZE(server->cb_break) - 1); + + /* defer the breaking of callbacks to try and collect as many as + * possible to ship in one operation */ + switch (atomic_inc_return(&server->cb_break_n)) { + case 1 ... 
AFSCBMAX - 1: + queue_delayed_work(afs_callback_update_worker, + &server->cb_break_work, HZ * 2); + break; + case AFSCBMAX: + afs_flush_callback_breaks(server); + break; + default: + break; + } + + ASSERT(server->cb_promises.rb_node != NULL); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + _leave(""); +} + +/* + * discard the callback on a deleted item + */ +void afs_discard_callback_on_delete(struct afs_vnode *vnode) +{ + struct afs_server *server = vnode->server; + + _enter("%d", vnode->cb_promised); + + if (!vnode->cb_promised) { + _leave(" [not promised]"); + return; + } + + ASSERT(server != NULL); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + ASSERT(server->cb_promises.rb_node != NULL); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * give up the callback registered for a vnode on the file server when the + * inode is being cleared + */ +void afs_give_up_callback(struct afs_vnode *vnode) +{ + struct afs_server *server = vnode->server; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%d", vnode->cb_promised); + + _debug("GIVE UP INODE %p", &vnode->vfs_inode); + + if (!vnode->cb_promised) { + _leave(" [not promised]"); + return; + } + + ASSERT(server != NULL); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised && afs_breakring_space(server) == 0) { + add_wait_queue(&server->cb_break_waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!vnode->cb_promised || + afs_breakring_space(server) != 0) + break; + spin_unlock(&server->cb_lock); + schedule(); + spin_lock(&server->cb_lock); + } + remove_wait_queue(&server->cb_break_waitq, &myself); + __set_current_state(TASK_RUNNING); + } + + /* of course, it's always possible for the server to break this vnode's + * callback first... 
*/ + if (vnode->cb_promised) + afs_do_give_up_callback(server, vnode); + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * dispatch a deferred give up callbacks operation + */ +void afs_dispatch_give_up_callbacks(struct work_struct *work) +{ + struct afs_server *server = + container_of(work, struct afs_server, cb_break_work.work); + + _enter(""); + + /* tell the fileserver to discard the callback promises it has + * - in the event of ENOMEM or some other error, we just forget that we + * had callbacks entirely, and the server will call us later to break + * them + */ + afs_fs_give_up_callbacks(server, &afs_async_call); +} + +/* + * flush the outstanding callback breaks on a server + */ +void afs_flush_callback_breaks(struct afs_server *server) +{ + cancel_delayed_work(&server->cb_break_work); + queue_delayed_work(afs_callback_update_worker, + &server->cb_break_work, 0); +} + +#if 0 +/* + * update a bunch of callbacks + */ +static void afs_callback_updater(struct work_struct *work) +{ + struct afs_server *server; + struct afs_vnode *vnode, *xvnode; + time_t now; + long timeout; + int ret; + + server = container_of(work, struct afs_server, updater); + + _enter(""); + + now = get_seconds(); + + /* find the first vnode to update */ + spin_lock(&server->cb_lock); + for (;;) { + if (RB_EMPTY_ROOT(&server->cb_promises)) { + spin_unlock(&server->cb_lock); + _leave(" [nothing]"); + return; + } + + vnode = rb_entry(rb_first(&server->cb_promises), + struct afs_vnode, cb_promise); + if (atomic_read(&vnode->usage) > 0) + break; + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + + timeout = vnode->update_at - now; + if (timeout > 0) { + queue_delayed_work(afs_vnode_update_worker, + &afs_vnode_update, timeout * HZ); + spin_unlock(&server->cb_lock); + _leave(" [nothing]"); + return; + } + + list_del_init(&vnode->update); + atomic_inc(&vnode->usage); + spin_unlock(&server->cb_lock); + + /* we can now perform the update */ + _debug("update %s", vnode->vldb.name); + vnode->state = AFS_VL_UPDATING; + vnode->upd_rej_cnt = 0; + vnode->upd_busy_cnt = 0; + + ret = afs_vnode_update_record(vl, &vldb); + switch (ret) { + case 0: + afs_vnode_apply_update(vl, &vldb); + vnode->state = AFS_VL_UPDATING; + break; + case -ENOMEDIUM: + vnode->state = AFS_VL_VOLUME_DELETED; + break; + default: + vnode->state = AFS_VL_UNCERTAIN; + break; + } + + /* and then reschedule */ + _debug("reschedule"); + vnode->update_at = get_seconds() + afs_vnode_update_timeout; + + spin_lock(&server->cb_lock); + + if (!list_empty(&server->cb_promises)) { + /* next update in 10 minutes, but wait at least 1 second more + * than the newest record already queued so that we don't spam + * the VL server suddenly with lots of requests + */ + xvnode = list_entry(server->cb_promises.prev, + struct afs_vnode, update); + if (vnode->update_at <= xvnode->update_at) + vnode->update_at = xvnode->update_at + 1; + xvnode = list_entry(server->cb_promises.next, + struct afs_vnode, update); + timeout = xvnode->update_at - now; + if (timeout < 0) + timeout = 0; + } else { + timeout = afs_vnode_update_timeout; + } + + list_add_tail(&vnode->update, &server->cb_promises); + + _debug("timeout %ld", timeout); + queue_delayed_work(afs_vnode_update_worker, + &afs_vnode_update, timeout * HZ); + spin_unlock(&server->cb_lock); + afs_put_vnode(vl); +} +#endif + +/* + * initialise the callback update process + */ +int __init afs_callback_update_init(void) +{ + afs_callback_update_worker = + 
create_singlethread_workqueue("kafs_callbackd"); + return afs_callback_update_worker ? 0 : -ENOMEM; +} + +/* + * shut down the callback update process + */ +void afs_callback_update_kill(void) +{ + destroy_workqueue(afs_callback_update_worker); +} diff --git a/fs/afs/cell.c b/fs/afs/cell.c new file mode 100644 index 00000000..3c090b75 --- /dev/null +++ b/fs/afs/cell.c @@ -0,0 +1,460 @@ +/* AFS cell and server record management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +DECLARE_RWSEM(afs_proc_cells_sem); +LIST_HEAD(afs_proc_cells); + +static LIST_HEAD(afs_cells); +static DEFINE_RWLOCK(afs_cells_lock); +static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ +static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); +static struct afs_cell *afs_cell_root; + +/* + * allocate a cell record and fill in its name, VL server address list and + * allocate an anonymous key + */ +static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, + char *vllist) +{ + struct afs_cell *cell; + struct key *key; + char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; + char *dvllist = NULL, *_vllist = NULL; + char delimiter = ':'; + int ret; + + _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); + + BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ + + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + /* allocate and initialise a cell record */ + cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); + if (!cell) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + memcpy(cell->name, name, namelen); + cell->name[namelen] = 0; + + atomic_set(&cell->usage, 1); + INIT_LIST_HEAD(&cell->link); + rwlock_init(&cell->servers_lock); + INIT_LIST_HEAD(&cell->servers); + init_rwsem(&cell->vl_sem); + INIT_LIST_HEAD(&cell->vl_list); + spin_lock_init(&cell->vl_lock); + + /* if the ip address is invalid, try dns query */ + if (!vllist || strlen(vllist) < 7) { + ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); + if (ret < 0) { + if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) + /* translate these errors into something + * userspace might understand */ + ret = -EDESTADDRREQ; + _leave(" = %d", ret); + return ERR_PTR(ret); + } + _vllist = dvllist; + + /* change the delimiter for user-space reply */ + delimiter = ','; + + } else { + _vllist = vllist; + } + + /* fill in the VL server list from the rest of the string */ + do { + unsigned a, b, c, d; + + next = strchr(_vllist, delimiter); + if (next) + *next++ = 0; + + if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + goto bad_address; + + if (a > 255 || b > 255 || c > 255 || d > 255) + goto bad_address; + + cell->vl_addrs[cell->vl_naddrs++].s_addr = + htonl((a << 24) | (b << 16) | (c << 8) | d); + + } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); + + /* create a key to represent an anonymous user */ + memcpy(keyname, "afs@", 4); + dp = keyname + 4; + cp = cell->name; + do { + *dp++ = toupper(*cp); + } while (*cp++); + + key = rxrpc_get_null_key(keyname); + if (IS_ERR(key)) { + 
_debug("no key"); + ret = PTR_ERR(key); + goto error; + } + cell->anonymous_key = key; + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + + _leave(" = %p", cell); + return cell; + +bad_address: + printk(KERN_ERR "kAFS: bad VL server IP address\n"); + ret = -EINVAL; +error: + key_put(cell->anonymous_key); + kfree(dvllist); + kfree(cell); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * afs_cell_crate() - create a cell record + * @name: is the name of the cell. + * @namsesz: is the strlen of the cell name. + * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. + * @retref: is T to return the cell reference when the cell exists. + */ +struct afs_cell *afs_cell_create(const char *name, unsigned namesz, + char *vllist, bool retref) +{ + struct afs_cell *cell; + int ret; + + _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); + + down_write(&afs_cells_sem); + read_lock(&afs_cells_lock); + list_for_each_entry(cell, &afs_cells, link) { + if (strncasecmp(cell->name, name, namesz) == 0) + goto duplicate_name; + } + read_unlock(&afs_cells_lock); + + cell = afs_cell_alloc(name, namesz, vllist); + if (IS_ERR(cell)) { + _leave(" = %ld", PTR_ERR(cell)); + up_write(&afs_cells_sem); + return cell; + } + + /* add a proc directory for this cell */ + ret = afs_proc_cell_setup(cell); + if (ret < 0) + goto error; + +#ifdef CONFIG_AFS_FSCACHE + /* put it up for caching (this never returns an error) */ + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell); +#endif + + /* add to the cell lists */ + write_lock(&afs_cells_lock); + list_add_tail(&cell->link, &afs_cells); + write_unlock(&afs_cells_lock); + + down_write(&afs_proc_cells_sem); + list_add_tail(&cell->proc_link, &afs_proc_cells); + up_write(&afs_proc_cells_sem); + up_write(&afs_cells_sem); + + _leave(" = %p", cell); + return cell; + +error: + up_write(&afs_cells_sem); + key_put(cell->anonymous_key); + kfree(cell); + _leave(" = %d", ret); + return ERR_PTR(ret); + +duplicate_name: + if (retref && !IS_ERR(cell)) + afs_get_cell(cell); + + read_unlock(&afs_cells_lock); + up_write(&afs_cells_sem); + + if (retref) { + _leave(" = %p", cell); + return cell; + } + + _leave(" = -EEXIST"); + return ERR_PTR(-EEXIST); +} + +/* + * set the root cell information + * - can be called with a module parameter string + * - can be called from a write to /proc/fs/afs/rootcell + */ +int afs_cell_init(char *rootcell) +{ + struct afs_cell *old_root, *new_root; + char *cp; + + _enter(""); + + if (!rootcell) { + /* module is loaded with no parameters, or built statically. + * - in the future we might initialize cell DB here. 
+ */ + _leave(" = 0 [no root]"); + return 0; + } + + cp = strchr(rootcell, ':'); + if (!cp) + _debug("kAFS: no VL server IP addresses specified"); + else + *cp++ = 0; + + /* allocate a cell record for the root cell */ + new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); + if (IS_ERR(new_root)) { + _leave(" = %ld", PTR_ERR(new_root)); + return PTR_ERR(new_root); + } + + /* install the new cell */ + write_lock(&afs_cells_lock); + old_root = afs_cell_root; + afs_cell_root = new_root; + write_unlock(&afs_cells_lock); + afs_put_cell(old_root); + + _leave(" = 0"); + return 0; +} + +/* + * lookup a cell record + */ +struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, + bool dns_cell) +{ + struct afs_cell *cell; + + _enter("\"%*.*s\",", namesz, namesz, name ?: ""); + + down_read(&afs_cells_sem); + read_lock(&afs_cells_lock); + + if (name) { + /* if the cell was named, look for it in the cell record list */ + list_for_each_entry(cell, &afs_cells, link) { + if (strncmp(cell->name, name, namesz) == 0) { + afs_get_cell(cell); + goto found; + } + } + cell = ERR_PTR(-ENOENT); + if (dns_cell) + goto create_cell; + found: + ; + } else { + cell = afs_cell_root; + if (!cell) { + /* this should not happen unless user tries to mount + * when root cell is not set. Return an impossibly + * bizarre errno to alert the user. Things like + * ENOENT might be "more appropriate" but they happen + * for other reasons. + */ + cell = ERR_PTR(-EDESTADDRREQ); + } else { + afs_get_cell(cell); + } + + } + + read_unlock(&afs_cells_lock); + up_read(&afs_cells_sem); + _leave(" = %p", cell); + return cell; + +create_cell: + read_unlock(&afs_cells_lock); + up_read(&afs_cells_sem); + + cell = afs_cell_create(name, namesz, NULL, true); + + _leave(" = %p", cell); + return cell; +} + +#if 0 +/* + * try and get a cell record + */ +struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) +{ + write_lock(&afs_cells_lock); + + if (cell && !list_empty(&cell->link)) + afs_get_cell(cell); + else + cell = NULL; + + write_unlock(&afs_cells_lock); + return cell; +} +#endif /* 0 */ + +/* + * destroy a cell record + */ +void afs_put_cell(struct afs_cell *cell) +{ + if (!cell) + return; + + _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + + ASSERTCMP(atomic_read(&cell->usage), >, 0); + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + write_lock(&afs_cells_lock); + + if (likely(!atomic_dec_and_test(&cell->usage))) { + write_unlock(&afs_cells_lock); + _leave(""); + return; + } + + ASSERT(list_empty(&cell->servers)); + ASSERT(list_empty(&cell->vl_list)); + + write_unlock(&afs_cells_lock); + + wake_up(&afs_cells_freeable_wq); + + _leave(" [unused]"); +} + +/* + * destroy a cell record + * - must be called with the afs_cells_sem write-locked + * - cell->link should have been broken by the caller + */ +static void afs_cell_destroy(struct afs_cell *cell) +{ + _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + + ASSERTCMP(atomic_read(&cell->usage), >=, 0); + ASSERT(list_empty(&cell->link)); + + /* wait for everyone to stop using the cell */ + if (atomic_read(&cell->usage) > 0) { + DECLARE_WAITQUEUE(myself, current); + + _debug("wait for cell %s", cell->name); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&afs_cells_freeable_wq, &myself); + + while (atomic_read(&cell->usage) > 0) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&afs_cells_freeable_wq, &myself); + set_current_state(TASK_RUNNING); + 
} + + _debug("cell dead"); + ASSERTCMP(atomic_read(&cell->usage), ==, 0); + ASSERT(list_empty(&cell->servers)); + ASSERT(list_empty(&cell->vl_list)); + + afs_proc_cell_remove(cell); + + down_write(&afs_proc_cells_sem); + list_del_init(&cell->proc_link); + up_write(&afs_proc_cells_sem); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(cell->cache, 0); +#endif + key_put(cell->anonymous_key); + kfree(cell); + + _leave(" [destroyed]"); +} + +/* + * purge in-memory cell database on module unload or afs_init() failure + * - the timeout daemon is stopped before calling this + */ +void afs_cell_purge(void) +{ + struct afs_cell *cell; + + _enter(""); + + afs_put_cell(afs_cell_root); + + down_write(&afs_cells_sem); + + while (!list_empty(&afs_cells)) { + cell = NULL; + + /* remove the next cell from the front of the list */ + write_lock(&afs_cells_lock); + + if (!list_empty(&afs_cells)) { + cell = list_entry(afs_cells.next, + struct afs_cell, link); + list_del_init(&cell->link); + } + + write_unlock(&afs_cells_lock); + + if (cell) { + _debug("PURGING CELL %s (%d)", + cell->name, atomic_read(&cell->usage)); + + /* now the cell should be left with no references */ + afs_cell_destroy(cell); + } + } + + up_write(&afs_cells_sem); + _leave(""); +} diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c new file mode 100644 index 00000000..1c8c6cc6 --- /dev/null +++ b/fs/afs/cmservice.c @@ -0,0 +1,585 @@ +/* AFS Cache Manager Service + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" +#include "afs_cm.h" + +#if 0 +struct workqueue_struct *afs_cm_workqueue; +#endif /* 0 */ + +static int afs_deliver_cb_init_call_back_state(struct afs_call *, + struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *, + struct sk_buff *, bool); +static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, + struct sk_buff *, bool); +static void afs_cm_destructor(struct afs_call *); + +/* + * CB.CallBack operation type + */ +static const struct afs_call_type afs_SRXCBCallBack = { + .name = "CB.CallBack", + .deliver = afs_deliver_cb_callback, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.InitCallBackState operation type + */ +static const struct afs_call_type afs_SRXCBInitCallBackState = { + .name = "CB.InitCallBackState", + .deliver = afs_deliver_cb_init_call_back_state, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.InitCallBackState3 operation type + */ +static const struct afs_call_type afs_SRXCBInitCallBackState3 = { + .name = "CB.InitCallBackState3", + .deliver = afs_deliver_cb_init_call_back_state3, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.Probe operation type + */ +static const struct afs_call_type afs_SRXCBProbe = { + .name = "CB.Probe", + .deliver = afs_deliver_cb_probe, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.ProbeUuid operation type + */ +static const struct afs_call_type afs_SRXCBProbeUuid = { + .name = "CB.ProbeUuid", + .deliver = afs_deliver_cb_probe_uuid, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.TellMeAboutYourself operation type + */ +static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { + .name = "CB.TellMeAboutYourself", + .deliver = afs_deliver_cb_tell_me_about_yourself, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * route an incoming cache manager call + * - return T if supported, F if not + */ +bool afs_cm_incoming_call(struct afs_call *call) +{ + u32 operation_id = ntohl(call->operation_ID); + + _enter("{CB.OP %u}", operation_id); + + switch (operation_id) { + case CBCallBack: + call->type = &afs_SRXCBCallBack; + return true; + case CBInitCallBackState: + call->type = &afs_SRXCBInitCallBackState; + return true; + case CBInitCallBackState3: + call->type = &afs_SRXCBInitCallBackState3; + return true; + case CBProbe: + call->type = &afs_SRXCBProbe; + return true; + case CBTellMeAboutYourself: + call->type = &afs_SRXCBTellMeAboutYourself; + return true; + default: + return false; + } +} + +/* + * clean up a cache manager call + */ +static void afs_cm_destructor(struct afs_call *call) +{ + _enter(""); + + afs_put_server(call->server); + call->server = NULL; + kfree(call->buffer); + call->buffer = NULL; +} + +/* + * allow the fileserver to see if the cache manager is still alive + */ +static void SRXAFSCB_CallBack(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter(""); + + /* be sure to send the reply *before* attempting to spam the AFS server + * with FSFetchStatus 
requests on the vnodes with broken callbacks lest + * the AFS server get into a vicious cycle of trying to break further + * callbacks because it hadn't received completion of the CBCallBack op + * yet */ + afs_send_empty_reply(call); + + afs_break_callbacks(call->server, call->count, call->request); + _leave(""); +} + +/* + * deliver request data to a CB.CallBack call + */ +static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_callback *cb; + struct afs_server *server; + struct in_addr addr; + __be32 *bp; + u32 tmp; + int ret, loop; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* extract the FID array and its count in two steps */ + case 1: + _debug("extract FID count"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("FID count: %u", call->count); + if (call->count > AFSCBMAX) + return -EBADMSG; + + call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("extract FID array"); + ret = afs_extract_data(call, skb, last, call->buffer, + call->count * 3 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall FID array"); + call->request = kcalloc(call->count, + sizeof(struct afs_callback), + GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->fid.vid = ntohl(*bp++); + cb->fid.vnode = ntohl(*bp++); + cb->fid.unique = ntohl(*bp++); + cb->type = AFSCM_CB_UNTYPED; + } + + call->offset = 0; + call->unmarshall++; + + /* extract the callback array and its count in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count && tmp != 0) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + if (tmp == 0) + goto empty_cb_array; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, skb, last, call->request, + call->count * 3 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall CB array"); + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->version = ntohl(*bp++); + cb->expiry = ntohl(*bp++); + cb->type = ntohl(*bp++); + } + + empty_cb_array: + call->offset = 0; + call->unmarshall++; + + case 5: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_CallBack); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +static void SRXAFSCB_InitCallBackState(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + 
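/*
 * Illustrative sketch (userspace, not part of the kAFS code):
 * afs_deliver_cb_callback() above unmarshals its request in stages, recording
 * its position in call->unmarshall so it can resume when more socket data
 * arrives.  The stage that converts the wire-format FID array (three
 * big-endian 32-bit words per entry) into host-order values boils down to the
 * loop below; the struct and function names are invented for illustration.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct fid { uint32_t vid, vnode, unique; };

void unmarshal_fids(struct fid *out, const uint32_t *wire, unsigned count)
{
	for (unsigned i = 0; i < count; i++) {
		out[i].vid    = ntohl(*wire++);
		out[i].vnode  = ntohl(*wire++);
		out[i].unique = ntohl(*wire++);
	}
}

int main(void)
{
	/* one FID - vid=1, vnode=2, unique=3 - as it would appear on the wire */
	uint32_t wire[3] = { htonl(1), htonl(2), htonl(3) };
	struct fid f;

	unmarshal_fids(&f, wire, 1);
	printf("vid=%u vnode=%u unique=%u\n", f.vid, f.vnode, f.unique);
	return 0;
}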
_enter("{%p}", call->server); + + afs_init_callback_state(call->server); + afs_send_empty_reply(call); + _leave(""); +} + +/* + * deliver request data to a CB.InitCallBackState call + */ +static int afs_deliver_cb_init_call_back_state(struct afs_call *call, + struct sk_buff *skb, + bool last) +{ + struct afs_server *server; + struct in_addr addr; + + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * deliver request data to a CB.InitCallBackState3 call + */ +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, + struct sk_buff *skb, + bool last) +{ + struct afs_server *server; + struct in_addr addr; + + _enter(",{%u},%d", skb->len, last); + + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to see if the cache manager is still alive + */ +static void SRXAFSCB_Probe(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter(""); + afs_send_empty_reply(call); + _leave(""); +} + +/* + * deliver request data to a CB.Probe call + */ +static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_Probe); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to quickly find out if the fileserver has been rebooted + */ +static void SRXAFSCB_ProbeUuid(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + struct afs_uuid *r = call->request; + + struct { + __be32 match; + } reply; + + _enter(""); + + + if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + reply.match = htonl(0); + else + reply.match = htonl(1); + + afs_send_simple_reply(call, &reply, sizeof(reply)); + _leave(""); +} + +/* + * deliver request data to a CB.ProbeUuid call + */ +static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, skb, last, call->buffer, + 11 * sizeof(__be32)); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = 
kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to ask about the cache manager's capabilities + */ +static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) +{ + struct afs_interface *ifs; + struct afs_call *call = container_of(work, struct afs_call, work); + int loop, nifs; + + struct { + struct /* InterfaceAddr */ { + __be32 nifs; + __be32 uuid[11]; + __be32 ifaddr[32]; + __be32 netmask[32]; + __be32 mtu[32]; + } ia; + struct /* Capabilities */ { + __be32 capcount; + __be32 caps[1]; + } cap; + } reply; + + _enter(""); + + nifs = 0; + ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL); + if (ifs) { + nifs = afs_get_ipv4_interfaces(ifs, 32, false); + if (nifs < 0) { + kfree(ifs); + ifs = NULL; + nifs = 0; + } + } + + memset(&reply, 0, sizeof(reply)); + reply.ia.nifs = htonl(nifs); + + reply.ia.uuid[0] = htonl(afs_uuid.time_low); + reply.ia.uuid[1] = htonl(afs_uuid.time_mid); + reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version); + reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); + reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); + for (loop = 0; loop < 6; loop++) + reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]); + + if (ifs) { + for (loop = 0; loop < nifs; loop++) { + reply.ia.ifaddr[loop] = ifs[loop].address.s_addr; + reply.ia.netmask[loop] = ifs[loop].netmask.s_addr; + reply.ia.mtu[loop] = htonl(ifs[loop].mtu); + } + kfree(ifs); + } + + reply.cap.capcount = htonl(1); + reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION); + afs_send_simple_reply(call, &reply, sizeof(reply)); + + _leave(""); +} + +/* + * deliver request data to a CB.TellMeAboutYourself call + */ +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); + queue_work(afs_wq, &call->work); + return 0; +} diff --git a/fs/afs/dir.c b/fs/afs/dir.c new file mode 100644 index 00000000..1b0b1955 --- /dev/null +++ b/fs/afs/dir.c @@ -0,0 +1,1184 @@ +/* dir.c: AFS filesystem directory handling + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd); +static int afs_dir_open(struct inode *inode, struct file *file); +static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); +static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); +static int afs_d_delete(const struct dentry *dentry); +static void afs_d_release(struct dentry *dentry); +static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); +static int afs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd); +static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int afs_rmdir(struct inode *dir, struct dentry *dentry); +static int afs_unlink(struct inode *dir, struct dentry *dentry); +static int afs_link(struct dentry *from, struct inode *dir, + struct dentry *dentry); +static int afs_symlink(struct inode *dir, struct dentry *dentry, + const char *content); +static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); + +const struct file_operations afs_dir_file_operations = { + .open = afs_dir_open, + .release = afs_release, + .readdir = afs_readdir, + .lock = afs_lock, + .llseek = generic_file_llseek, +}; + +const struct inode_operations afs_dir_inode_operations = { + .create = afs_create, + .lookup = afs_lookup, + .link = afs_link, + .unlink = afs_unlink, + .symlink = afs_symlink, + .mkdir = afs_mkdir, + .rmdir = afs_rmdir, + .rename = afs_rename, + .permission = afs_permission, + .getattr = afs_getattr, + .setattr = afs_setattr, +}; + +const struct dentry_operations afs_fs_dentry_operations = { + .d_revalidate = afs_d_revalidate, + .d_delete = afs_d_delete, + .d_release = afs_d_release, + .d_automount = afs_d_automount, +}; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIRENT_PER_BLOCK 64 + +union afs_dirent { + struct { + uint8_t valid; + uint8_t unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + uint8_t name[16]; + uint8_t overflow[4]; /* if any char of the name (inc + * NUL) reaches here, consume + * the next dirent too */ + } u; + uint8_t extended_name[32]; +}; + +/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ +struct afs_dir_pagehdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + uint8_t nentries; + uint8_t bitmap[8]; + uint8_t pad[19]; +}; + +/* directory block layout */ +union afs_dir_block { + + struct afs_dir_pagehdr pagehdr; + + struct { + struct afs_dir_pagehdr pagehdr; + uint8_t alloc_ctrs[128]; + /* dir hash table */ + uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; + } hdr; + + union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; +}; + +/* layout on a linux VM page */ +struct afs_dir_page { + union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; +}; + +struct afs_lookup_cookie { + struct afs_fid fid; + const char *name; + size_t nlen; + int found; +}; + +/* + * check that a directory page is valid + */ +static inline void afs_dir_check_page(struct inode *dir, struct page *page) +{ + struct afs_dir_page *dbuf; + loff_t latter; + int tmp, qty; + +#if 0 + /* check the page count */ + qty = desc.size / sizeof(dbuf->blocks[0]); + if (qty == 0) + goto error; + + if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { + 
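/*
 * Illustrative sketch (userspace, not part of the kAFS code):
 * afs_dir_check_page() validates each 2048-byte directory block held in a
 * page by checking its big-endian magic number before the page is trusted
 * (the loop just below).  Stripped of the kernel page handling, the core of
 * that validation is roughly the following; the struct shape and names are
 * simplified for illustration.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DIR_BLOCK_SIZE	2048
#define DIR_MAGIC	1234		/* stored big-endian on disk */

struct dir_blockhdr {
	uint16_t npages;
	uint16_t magic;
	/* the real header also carries an entry count and allocation bitmap */
};

/* returns 0 if every complete block in the buffer carries the right magic */
int check_dir_buffer(const unsigned char *buf, size_t len)
{
	for (size_t off = 0; off + DIR_BLOCK_SIZE <= len; off += DIR_BLOCK_SIZE) {
		struct dir_blockhdr hdr;

		memcpy(&hdr, buf + off, sizeof(hdr));
		if (ntohs(hdr.magic) != DIR_MAGIC) {
			fprintf(stderr, "bad magic in block %zu\n",
				off / DIR_BLOCK_SIZE);
			return -1;
		}
	}
	return 0;
}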
printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", + __func__, dir->i_ino, qty, + ntohs(dbuf->blocks[0].pagehdr.npages)); + goto error; + } +#endif + + /* determine how many magic numbers there should be in this page */ + latter = dir->i_size - page_offset(page); + if (latter >= PAGE_SIZE) + qty = PAGE_SIZE; + else + qty = latter; + qty /= sizeof(union afs_dir_block); + + /* check them */ + dbuf = page_address(page); + for (tmp = 0; tmp < qty; tmp++) { + if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { + printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", + __func__, dir->i_ino, tmp, qty, + ntohs(dbuf->blocks[tmp].pagehdr.magic)); + goto error; + } + } + + SetPageChecked(page); + return; + +error: + SetPageChecked(page); + SetPageError(page); +} + +/* + * discard a page cached in the pagecache + */ +static inline void afs_dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* + * get a page into the pagecache + */ +static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, + struct key *key) +{ + struct page *page; + _enter("{%lu},%lu", dir->i_ino, index); + + page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + afs_dir_check_page(dir, page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + afs_dir_put_page(page); + _leave(" = -EIO"); + return ERR_PTR(-EIO); +} + +/* + * open an AFS directory file + */ +static int afs_dir_open(struct inode *inode, struct file *file) +{ + _enter("{%lu}", inode->i_ino); + + BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) + return -ENOENT; + + return afs_open(inode, file); +} + +/* + * deal with one block in an AFS directory + */ +static int afs_dir_iterate_block(unsigned *fpos, + union afs_dir_block *block, + unsigned blkoff, + void *cookie, + filldir_t filldir) +{ + union afs_dirent *dire; + unsigned offset, next, curr; + size_t nlen; + int tmp, ret; + + _enter("%u,%x,%p,,",*fpos,blkoff,block); + + curr = (*fpos - blkoff) / sizeof(union afs_dirent); + + /* walk through the block, an entry at a time */ + for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; + offset < AFS_DIRENT_PER_BLOCK; + offset = next + ) { + next = offset + 1; + + /* skip entries marked unused in the bitmap */ + if (!(block->pagehdr.bitmap[offset / 8] & + (1 << (offset % 8)))) { + _debug("ENT[%Zu.%u]: unused", + blkoff / sizeof(union afs_dir_block), offset); + if (offset >= curr) + *fpos = blkoff + + next * sizeof(union afs_dirent); + continue; + } + + /* got a valid entry */ + dire = &block->dirents[offset]; + nlen = strnlen(dire->u.name, + sizeof(*block) - + offset * sizeof(union afs_dirent)); + + _debug("ENT[%Zu.%u]: %s %Zu \"%s\"", + blkoff / sizeof(union afs_dir_block), offset, + (offset < curr ? 
"skip" : "fill"), + nlen, dire->u.name); + + /* work out where the next possible entry is */ + for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { + if (next >= AFS_DIRENT_PER_BLOCK) { + _debug("ENT[%Zu.%u]:" + " %u travelled beyond end dir block" + " (len %u/%Zu)", + blkoff / sizeof(union afs_dir_block), + offset, next, tmp, nlen); + return -EIO; + } + if (!(block->pagehdr.bitmap[next / 8] & + (1 << (next % 8)))) { + _debug("ENT[%Zu.%u]:" + " %u unmarked extension (len %u/%Zu)", + blkoff / sizeof(union afs_dir_block), + offset, next, tmp, nlen); + return -EIO; + } + + _debug("ENT[%Zu.%u]: ext %u/%Zu", + blkoff / sizeof(union afs_dir_block), + next, tmp, nlen); + next++; + } + + /* skip if starts before the current position */ + if (offset < curr) + continue; + + /* found the next entry */ + ret = filldir(cookie, + dire->u.name, + nlen, + blkoff + offset * sizeof(union afs_dirent), + ntohl(dire->u.vnode), + filldir == afs_lookup_filldir ? + ntohl(dire->u.unique) : DT_UNKNOWN); + if (ret < 0) { + _leave(" = 0 [full]"); + return 0; + } + + *fpos = blkoff + next * sizeof(union afs_dirent); + } + + _leave(" = 1 [more]"); + return 1; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, + filldir_t filldir, struct key *key) +{ + union afs_dir_block *dblock; + struct afs_dir_page *dbuf; + struct page *page; + unsigned blkoff, limit; + int ret; + + _enter("{%lu},%u,,", dir->i_ino, *fpos); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + _leave(" = -ESTALE"); + return -ESTALE; + } + + /* round the file position up to the next entry boundary */ + *fpos += sizeof(union afs_dirent) - 1; + *fpos &= ~(sizeof(union afs_dirent) - 1); + + /* walk through the blocks in sequence */ + ret = 0; + while (*fpos < dir->i_size) { + blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); + + /* fetch the appropriate page from the directory */ + page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + + limit = blkoff & ~(PAGE_SIZE - 1); + + dbuf = page_address(page); + + /* deal with the individual blocks stashed on this page */ + do { + dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / + sizeof(union afs_dir_block)]; + ret = afs_dir_iterate_block(fpos, dblock, blkoff, + cookie, filldir); + if (ret != 1) { + afs_dir_put_page(page); + goto out; + } + + blkoff += sizeof(union afs_dir_block); + + } while (*fpos < dir->i_size && blkoff < limit); + + afs_dir_put_page(page); + ret = 0; + } + +out: + _leave(" = %d", ret); + return ret; +} + +/* + * read an AFS directory + */ +static int afs_readdir(struct file *file, void *cookie, filldir_t filldir) +{ + unsigned fpos; + int ret; + + _enter("{%Ld,{%lu}}", + file->f_pos, file->f_path.dentry->d_inode->i_ino); + + ASSERT(file->private_data != NULL); + + fpos = file->f_pos; + ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, + cookie, filldir, file->private_data); + file->f_pos = fpos; + + _leave(" = %d", ret); + return ret; +} + +/* + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype + */ +static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype) +{ + struct afs_lookup_cookie *cookie = _cookie; + + _enter("{%s,%Zu},%s,%u,,%llu,%u", + cookie->name, cookie->nlen, name, nlen, + (unsigned long long) ino, dtype); + + /* insanity checks first */ 
+ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + + if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { + _leave(" = 0 [no]"); + return 0; + } + + cookie->fid.vnode = ino; + cookie->fid.unique = dtype; + cookie->found = 1; + + _leave(" = -1 [found]"); + return -1; +} + +/* + * do a lookup in a directory + * - just returns the FID the dentry name maps to if found + */ +static int afs_do_lookup(struct inode *dir, struct dentry *dentry, + struct afs_fid *fid, struct key *key) +{ + struct afs_lookup_cookie cookie; + struct afs_super_info *as; + unsigned fpos; + int ret; + + _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); + + as = dir->i_sb->s_fs_info; + + /* search the directory */ + cookie.name = dentry->d_name.name; + cookie.nlen = dentry->d_name.len; + cookie.fid.vid = as->volume->vid; + cookie.found = 0; + + fpos = 0; + ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir, + key); + if (ret < 0) { + _leave(" = %d [iter]", ret); + return ret; + } + + ret = -ENOENT; + if (!cookie.found) { + _leave(" = -ENOENT [not found]"); + return -ENOENT; + } + + *fid = cookie.fid; + _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique); + return 0; +} + +/* + * Try to auto mount the mountpoint with pseudo directory, if the autocell + * operation is setted. + */ +static struct inode *afs_try_auto_mntpt( + int ret, struct dentry *dentry, struct inode *dir, struct key *key, + struct afs_fid *fid) +{ + const char *devname = dentry->d_name.name; + struct afs_vnode *vnode = AFS_FS_I(dir); + struct inode *inode; + + _enter("%d, %p{%s}, {%x:%u}, %p", + ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key); + + if (ret != -ENOENT || + !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + goto out; + + inode = afs_iget_autocell(dir, devname, strlen(devname), key); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + *fid = AFS_FS_I(inode)->fid; + _leave("= %p", inode); + return inode; + +out: + _leave("= %d", ret); + return ERR_PTR(ret); +} + +/* + * look up an entry in a directory + */ +static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct afs_vnode *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + vnode = AFS_FS_I(dir); + + _enter("{%x:%u},%p{%s},", + vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name); + + ASSERTCMP(dentry->d_inode, ==, NULL); + + if (dentry->d_name.len >= AFSNAMEMAX) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" = -ESTALE"); + return ERR_PTR(-ESTALE); + } + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return ERR_CAST(key); + } + + ret = afs_validate(vnode, key); + if (ret < 0) { + key_put(key); + _leave(" = %d [val]", ret); + return ERR_PTR(ret); + } + + ret = afs_do_lookup(dir, dentry, &fid, key); + if (ret < 0) { + inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid); + if (!IS_ERR(inode)) { + key_put(key); + goto success; + } + + ret = PTR_ERR(inode); + key_put(key); + if (ret == -ENOENT) { + d_add(dentry, NULL); + _leave(" = NULL [negative]"); + return NULL; + } + _leave(" = %d [do]", ret); + return ERR_PTR(ret); + } + dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + + /* instantiate the dentry */ + inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL); + key_put(key); + if (IS_ERR(inode)) { + _leave(" 
= %ld", PTR_ERR(inode)); + return ERR_CAST(inode); + } + +success: + d_add(dentry, inode); + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", + fid.vnode, + fid.unique, + dentry->d_inode->i_ino, + dentry->d_inode->i_generation); + + return NULL; +} + +/* + * check that a dentry lookup hit has found a valid entry + * - NOTE! the hit can be a negative hit too, so we can't assume we have an + * inode + */ +static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct afs_vnode *vnode, *dir; + struct afs_fid uninitialized_var(fid); + struct dentry *parent; + struct key *key; + void *dir_version; + int ret; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + vnode = AFS_FS_I(dentry->d_inode); + + if (dentry->d_inode) + _enter("{v={%x:%u} n=%s fl=%lx},", + vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + vnode->flags); + else + _enter("{neg n=%s}", dentry->d_name.name); + + key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); + if (IS_ERR(key)) + key = NULL; + + /* lock down the parent dentry so we can peer at it */ + parent = dget_parent(dentry); + if (!parent->d_inode) + goto out_bad; + + dir = AFS_FS_I(parent->d_inode); + + /* validate the parent directory */ + if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) + afs_validate(dir, key); + + if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { + _debug("%s: parent dir deleted", dentry->d_name.name); + goto out_bad; + } + + dir_version = (void *) (unsigned long) dir->status.data_version; + if (dentry->d_fsdata == dir_version) + goto out_valid; /* the dir contents are unchanged */ + + _debug("dir modified"); + + /* search the directory for this vnode */ + ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + switch (ret) { + case 0: + /* the filename maps to something */ + if (!dentry->d_inode) + goto out_bad; + if (is_bad_inode(dentry->d_inode)) { + printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n", + parent->d_name.name, dentry->d_name.name); + goto out_bad; + } + + /* if the vnode ID has changed, then the dirent points to a + * different file */ + if (fid.vnode != vnode->fid.vnode) { + _debug("%s: dirent changed [%u != %u]", + dentry->d_name.name, fid.vnode, + vnode->fid.vnode); + goto not_found; + } + + /* if the vnode ID uniqifier has changed, then the file has + * been deleted and replaced, and the original vnode ID has + * been reused */ + if (fid.unique != vnode->fid.unique) { + _debug("%s: file deleted (uq %u -> %u I:%u)", + dentry->d_name.name, fid.unique, + vnode->fid.unique, + dentry->d_inode->i_generation); + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + spin_unlock(&vnode->lock); + goto not_found; + } + goto out_valid; + + case -ENOENT: + /* the filename is unknown */ + _debug("%s: dirent not found", dentry->d_name.name); + if (dentry->d_inode) + goto not_found; + goto out_valid; + + default: + _debug("failed to iterate dir %s: %d", + parent->d_name.name, ret); + goto out_bad; + } + +out_valid: + dentry->d_fsdata = dir_version; +out_skip: + dput(parent); + key_put(key); + _leave(" = 1 [valid]"); + return 1; + + /* the dirent, if it exists, now points to a different vnode */ +not_found: + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); + +out_bad: + if (dentry->d_inode) { + /* don't unhash if we have submounts */ + if (have_submounts(dentry)) + goto out_skip; + } + + _debug("dropping dentry %s/%s", + parent->d_name.name, dentry->d_name.name); + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(parent); + 
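/*
 * Illustrative sketch (userspace, not part of the kAFS code): the
 * revalidation logic here avoids re-searching the directory when nothing has
 * changed - each dentry remembers, in d_fsdata, the directory data version it
 * was looked up under, and the cached name is only re-checked once the
 * directory's version has moved on.  The idea in miniature (hypothetical
 * names):
 */
#include <stdint.h>
#include <stdio.h>

struct dir_state   { uint64_t data_version; };
struct cached_name { uint64_t seen_version; const char *name; };

/* returns 1 if the cached entry can be trusted without a fresh lookup */
int name_still_valid(const struct cached_name *c, const struct dir_state *dir)
{
	return c->seen_version == dir->data_version;
}

int main(void)
{
	struct dir_state dir = { .data_version = 7 };
	struct cached_name entry = { .seen_version = 7, .name = "example" };

	printf("valid before change: %d\n", name_still_valid(&entry, &dir));
	dir.data_version++;		/* directory modified on the server */
	printf("valid after change:  %d\n", name_still_valid(&entry, &dir));
	return 0;
}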
key_put(key); + + _leave(" = 0 [bad]"); + return 0; +} + +/* + * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't + * sleep) + * - called from dput() when d_count is going to 0. + * - return 1 to request dentry be unhashed, 0 otherwise + */ +static int afs_d_delete(const struct dentry *dentry) +{ + _enter("%s", dentry->d_name.name); + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto zap; + + if (dentry->d_inode && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags))) + goto zap; + + _leave(" = 0 [keep]"); + return 0; + +zap: + _leave(" = 1 [zap]"); + return 1; +} + +/* + * handle dentry release + */ +static void afs_d_release(struct dentry *dentry) +{ + _enter("%s", dentry->d_name.name); +} + +/* + * create a directory on an AFS filesystem + */ +static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct afs_file_status status; + struct afs_callback cb; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s},%o", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + mode |= S_IFDIR; + ret = afs_vnode_create(dvnode, key, dentry->d_name.name, + mode, &fid, &status, &cb, &server); + if (ret < 0) + goto mkdir_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +mkdir_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * remove a directory from an AFS filesystem + */ +static int afs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s}", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true); + if (ret < 0) + goto rmdir_error; + + if (dentry->d_inode) { + vnode = AFS_FS_I(dentry->d_inode); + clear_nlink(&vnode->vfs_inode); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_discard_callback_on_delete(vnode); + } + + key_put(key); + _leave(" = 0"); + return 0; + +rmdir_error: + key_put(key); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * remove a file from an AFS filesystem + */ +static int afs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s}", + dvnode->fid.vid, 
dvnode->fid.vnode, dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + if (dentry->d_inode) { + vnode = AFS_FS_I(dentry->d_inode); + + /* make sure we have a callback promise on the victim */ + ret = afs_validate(vnode, key); + if (ret < 0) + goto error; + } + + ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false); + if (ret < 0) + goto remove_error; + + if (dentry->d_inode) { + /* if the file wasn't deleted due to excess hard links, the + * fileserver will break the callback promise on the file - if + * it had one - before it returns to us, and if it was deleted, + * it won't + * + * however, if we didn't have a callback promise outstanding, + * or it was outstanding on a different server, then it won't + * break it either... + */ + vnode = AFS_FS_I(dentry->d_inode); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + _debug("AFS_VNODE_DELETED"); + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) + _debug("AFS_VNODE_CB_BROKEN"); + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_validate(vnode, key); + _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + } + + key_put(key); + _leave(" = 0"); + return 0; + +remove_error: + key_put(key); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * create a regular file on an AFS filesystem + */ +static int afs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct afs_file_status status; + struct afs_callback cb; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s},%o,", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + mode |= S_IFREG; + ret = afs_vnode_create(dvnode, key, dentry->d_name.name, + mode, &fid, &status, &cb, &server); + if (ret < 0) + goto create_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +create_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * create a hard link between files in an AFS filesystem + */ +static int afs_link(struct dentry *from, struct inode *dir, + struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + vnode = AFS_FS_I(from->d_inode); + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%x:%u},{%s}", + vnode->fid.vid, vnode->fid.vnode, + dvnode->fid.vid, dvnode->fid.vnode, + dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) 
{ + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name); + if (ret < 0) + goto link_error; + + ihold(&vnode->vfs_inode); + d_instantiate(dentry, &vnode->vfs_inode); + key_put(key); + _leave(" = 0"); + return 0; + +link_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * create a symlink in an AFS filesystem + */ +static int afs_symlink(struct inode *dir, struct dentry *dentry, + const char *content) +{ + struct afs_file_status status; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s},%s", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, + content); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + ret = -EINVAL; + if (strlen(content) >= AFSPATHMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content, + &fid, &status, &server); + if (ret < 0) + goto create_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, NULL); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +create_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * rename a file in an AFS filesystem and/or move it between directories + */ +static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct key *key; + int ret; + + vnode = AFS_FS_I(old_dentry->d_inode); + orig_dvnode = AFS_FS_I(old_dir); + new_dvnode = AFS_FS_I(new_dir); + + _enter("{%x:%u},{%x:%u},{%x:%u},{%s}", + orig_dvnode->fid.vid, orig_dvnode->fid.vnode, + vnode->fid.vid, vnode->fid.vnode, + new_dvnode->fid.vid, new_dvnode->fid.vnode, + new_dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (new_dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(orig_dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_rename(orig_dvnode, new_dvnode, key, + old_dentry->d_name.name, + new_dentry->d_name.name); + if (ret < 0) + goto rename_error; + key_put(key); + _leave(" = 0"); + return 0; + +rename_error: + key_put(key); +error: + d_drop(new_dentry); + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/file.c b/fs/afs/file.c new file mode 100644 index 00000000..14d89fa5 --- /dev/null +++ b/fs/afs/file.c @@ -0,0 +1,378 @@ +/* AFS filesystem file handling + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int afs_readpage(struct file *file, struct page *page); +static void afs_invalidatepage(struct page *page, unsigned long offset); +static int afs_releasepage(struct page *page, gfp_t gfp_flags); +static int afs_launder_page(struct page *page); + +static int afs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + +const struct file_operations afs_file_operations = { + .open = afs_open, + .release = afs_release, + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = afs_file_write, + .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, + .fsync = afs_fsync, + .lock = afs_lock, + .flock = afs_flock, +}; + +const struct inode_operations afs_file_inode_operations = { + .getattr = afs_getattr, + .setattr = afs_setattr, + .permission = afs_permission, +}; + +const struct address_space_operations afs_fs_aops = { + .readpage = afs_readpage, + .readpages = afs_readpages, + .set_page_dirty = afs_set_page_dirty, + .launder_page = afs_launder_page, + .releasepage = afs_releasepage, + .invalidatepage = afs_invalidatepage, + .write_begin = afs_write_begin, + .write_end = afs_write_end, + .writepage = afs_writepage, + .writepages = afs_writepages, +}; + +/* + * open an AFS file or directory and attach a key to it + */ +int afs_open(struct inode *inode, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key; + int ret; + + _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } + + ret = afs_validate(vnode, key); + if (ret < 0) { + _leave(" = %d [val]", ret); + return ret; + } + + file->private_data = key; + _leave(" = 0"); + return 0; +} + +/* + * release an AFS file or directory and discard its key + */ +int afs_release(struct inode *inode, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + + _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + + key_put(file->private_data); + _leave(" = 0"); + return 0; +} + +#ifdef CONFIG_AFS_FSCACHE +/* + * deal with notification that a page was read from the cache + */ +static void afs_file_readpage_read_complete(struct page *page, + void *data, + int error) +{ + _enter("%p,%p,%d", page, data, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) + SetPageUptodate(page); + unlock_page(page); +} +#endif + +/* + * read page from file, directory or symlink, given a key to use + */ +int afs_page_filler(void *data, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key = data; + size_t len; + off_t offset; + int ret; + + _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + + BUG_ON(!PageLocked(page)); + + ret = -ESTALE; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto error; + + /* is it cached? 
*/ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_page(vnode->cache, + page, + afs_file_readpage_read_complete, + NULL, + GFP_KERNEL); +#else + ret = -ENOBUFS; +#endif + switch (ret) { + /* read BIO submitted (page in cache) */ + case 0: + break; + + /* page not yet cached */ + case -ENODATA: + _debug("cache said ENODATA"); + goto go_on; + + /* page will not be cached */ + case -ENOBUFS: + _debug("cache said ENOBUFS"); + default: + go_on: + offset = page->index << PAGE_CACHE_SHIFT; + len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); + + /* read the contents of the file from the server into the + * page */ + ret = afs_vnode_fetch_data(vnode, key, offset, len, page); + if (ret < 0) { + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); +#endif + BUG_ON(PageFsCache(page)); + goto error; + } + + SetPageUptodate(page); + + /* send the page to the cache */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page) && + fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_uncache_page(vnode->cache, page); + BUG_ON(PageFsCache(page)); + } +#endif + unlock_page(page); + } + + _leave(" = 0"); + return 0; + +error: + SetPageError(page); + unlock_page(page); + _leave(" = %d", ret); + return ret; +} + +/* + * read page from file, directory or symlink, given a file to nominate the key + * to be used + */ +static int afs_readpage(struct file *file, struct page *page) +{ + struct key *key; + int ret; + + if (file) { + key = file->private_data; + ASSERT(key != NULL); + ret = afs_page_filler(key, page); + } else { + struct inode *inode = page->mapping->host; + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + } else { + ret = afs_page_filler(key, page); + key_put(key); + } + } + return ret; +} + +/* + * read a set of pages + */ +static int afs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct key *key = file->private_data; + struct afs_vnode *vnode; + int ret = 0; + + _enter("{%d},{%lu},,%d", + key_serial(key), mapping->host->i_ino, nr_pages); + + ASSERT(key != NULL); + + vnode = AFS_FS_I(mapping->host); + if (vnode->flags & AFS_VNODE_DELETED) { + _leave(" = -ESTALE"); + return -ESTALE; + } + + /* attempt to read as many of the pages as possible */ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_pages(vnode->cache, + mapping, + pages, + &nr_pages, + afs_file_readpage_read_complete, + NULL, + mapping_gfp_mask(mapping)); +#else + ret = -ENOBUFS; +#endif + + switch (ret) { + /* all pages are being read from the cache */ + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(nr_pages != 0); + _leave(" = 0 [reading all]"); + return 0; + + /* there were pages that couldn't be read from the cache */ + case -ENODATA: + case -ENOBUFS: + break; + + /* other error */ + default: + _leave(" = %d", ret); + return ret; + } + + /* load the missing pages from the network */ + ret = read_cache_pages(mapping, pages, afs_page_filler, key); + + _leave(" = %d [netting]", ret); + return ret; +} + +/* + * write back a dirty page + */ +static int afs_launder_page(struct page *page) +{ + _enter("{%lu}", page->index); + + return 0; +} + +/* + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void 
afs_invalidatepage(struct page *page, unsigned long offset) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + + _enter("{%lu},%lu", page->index, offset); + + BUG_ON(!PageLocked(page)); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0) { +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + } +#endif + + if (PagePrivate(page)) { + if (wb && !PageWriteback(page)) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + + if (!page_private(page)) + ClearPagePrivate(page); + } + } + + _leave(""); +} + +/* + * release a page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not + */ +static int afs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + + _enter("{{%x:%u}[%lu],%lx},%x", + vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, + gfp_flags); + + /* deny if page is being written to the cache and the caller hasn't + * elected to wait */ +#ifdef CONFIG_AFS_FSCACHE + if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { + _leave(" = F [cache busy]"); + return 0; + } +#endif + + if (PagePrivate(page)) { + if (wb) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + ClearPagePrivate(page); + } + + /* indicate that the page can be released */ + _leave(" = T"); + return 1; +} diff --git a/fs/afs/flock.c b/fs/afs/flock.c new file mode 100644 index 00000000..757d6645 --- /dev/null +++ b/fs/afs/flock.c @@ -0,0 +1,588 @@ +/* AFS file locking support + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "internal.h" + +#define AFS_LOCK_GRANTED 0 +#define AFS_LOCK_PENDING 1 + +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); +static void afs_fl_release_private(struct file_lock *fl); + +static struct workqueue_struct *afs_lock_manager; +static DEFINE_MUTEX(afs_lock_manager_mutex); + +static const struct file_lock_operations afs_lock_ops = { + .fl_copy_lock = afs_fl_copy_lock, + .fl_release_private = afs_fl_release_private, +}; + +/* + * initialise the lock manager thread if it isn't already running + */ +static int afs_init_lock_manager(void) +{ + int ret; + + ret = 0; + if (!afs_lock_manager) { + mutex_lock(&afs_lock_manager_mutex); + if (!afs_lock_manager) { + afs_lock_manager = + create_singlethread_workqueue("kafs_lockd"); + if (!afs_lock_manager) + ret = -ENOMEM; + } + mutex_unlock(&afs_lock_manager_mutex); + } + return ret; +} + +/* + * destroy the lock manager thread if it's running + */ +void __exit afs_kill_lock_manager(void) +{ + if (afs_lock_manager) + destroy_workqueue(afs_lock_manager); +} + +/* + * if the callback is broken on this vnode, then the lock may now be available + */ +void afs_lock_may_be_available(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); +} + +/* + * the lock will time out in 5 minutes unless we extend it, so schedule + * extension in a bit less than that time + */ +static void afs_schedule_lock_extension(struct afs_vnode *vnode) +{ + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + AFS_LOCKWAIT * HZ / 2); +} + +/* + * grant one or more locks (readlocks are allowed to jump the queue if the + * first lock in the queue is itself a readlock) + * - the caller must hold the vnode lock + */ +static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl) +{ + struct file_lock *p, *_p; + + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + if (fl->fl_type == F_RDLCK) { + list_for_each_entry_safe(p, _p, &vnode->pending_locks, + fl_u.afs.link) { + if (p->fl_type == F_RDLCK) { + p->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&p->fl_u.afs.link, + &vnode->granted_locks); + wake_up(&p->fl_wait); + } + } + } +} + +/* + * do work for a lock, including: + * - probing for a lock we're waiting on but didn't get immediately + * - extending a lock that's close to timing out + */ +void afs_lock_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, lock_work.work); + struct file_lock *fl; + afs_lock_type_t type; + struct key *key; + int ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + spin_lock(&vnode->lock); + + if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { + _debug("unlock"); + spin_unlock(&vnode->lock); + + /* attempt to release the server lock; if it fails, we just + * wait 5 minutes and it'll time out anyway */ + ret = afs_vnode_release_lock(vnode, vnode->unlock_key); + if (ret < 0) + printk(KERN_WARNING "AFS:" + " Failed to release lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + key_put(vnode->unlock_key); + vnode->unlock_key = NULL; + clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); + } + + /* if we've got a lock, then it must be time to extend that lock as AFS + * locks time out after 5 minutes */ + if (!list_empty(&vnode->granted_locks)) { + _debug("extend"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->granted_locks.next, + 
struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + spin_unlock(&vnode->lock); + + ret = afs_vnode_extend_lock(vnode, key); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + key_put(key); + switch (ret) { + case 0: + afs_schedule_lock_extension(vnode); + break; + default: + /* ummm... we failed to extend the lock - retry + * extension shortly */ + printk(KERN_WARNING "AFS:" + " Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + HZ * 10); + break; + } + _leave(" [extend]"); + return; + } + + /* if we don't have a granted lock, then we must've been called back by + * the server, and so if might be possible to get a lock we're + * currently waiting for */ + if (!list_empty(&vnode->pending_locks)) { + _debug("get"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + type = (fl->fl_type == F_RDLCK) ? + AFS_LOCK_READ : AFS_LOCK_WRITE; + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case -EWOULDBLOCK: + _debug("blocked"); + break; + case 0: + _debug("acquired"); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + ret = AFS_LOCK_GRANTED; + default: + spin_lock(&vnode->lock); + /* the pending lock may have been withdrawn due to a + * signal */ + if (list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link) == fl) { + fl->fl_u.afs.state = ret; + if (ret == AFS_LOCK_GRANTED) + afs_grant_locks(vnode, fl); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + } else { + _debug("withdrawn"); + clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); + clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + spin_unlock(&vnode->lock); + afs_vnode_release_lock(vnode, key); + if (!list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + } + break; + } + key_put(key); + _leave(" [pend]"); + return; + } + + /* looks like the lock request was withdrawn on a signal */ + spin_unlock(&vnode->lock); + _leave(" [no locks]"); +} + +/* + * pass responsibility for the unlocking of a vnode on the server to the + * manager thread, lest a pending signal in the calling thread interrupt + * AF_RXRPC + * - the caller must hold the vnode lock + */ +static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +{ + cancel_delayed_work(&vnode->lock_work); + if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && + !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) + BUG(); + if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) + BUG(); + vnode->unlock_key = key_get(key); + afs_lock_may_be_available(vnode); +} + +/* + * request a lock on a file on the server + */ +static int afs_do_setlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + afs_lock_type_t type; + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file locks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + ret = afs_init_lock_manager(); + if (ret < 0) + return ret; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + 
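/*
 * Illustrative sketch (userspace, not part of the kAFS code): the setlk path
 * here queues the lock on vnode->pending_locks and, once the server grants
 * it, afs_grant_locks() above moves it to granted_locks, promoting any other
 * pending read locks at the same time because the server-side lock is shared.
 * A toy model of that promotion rule, using an array in place of the kernel
 * list_heads (names invented):
 */
#include <stdio.h>

enum lock_type { LOCK_READ, LOCK_WRITE };

struct pending { enum lock_type type; int granted; };

/* grant the lock at the head of the queue; if it is a read lock, every other
 * pending read lock can be granted along with it */
void grant_locks(struct pending *queue, int n)
{
	if (n == 0)
		return;
	queue[0].granted = 1;
	if (queue[0].type != LOCK_READ)
		return;
	for (int i = 1; i < n; i++)
		if (queue[i].type == LOCK_READ)
			queue[i].granted = 1;
}

int main(void)
{
	struct pending q[] = {
		{ LOCK_READ, 0 }, { LOCK_WRITE, 0 }, { LOCK_READ, 0 },
	};

	grant_locks(q, 3);
	for (int i = 0; i < 3; i++)
		printf("lock %d: %s\n", i, q[i].granted ? "granted" : "pending");
	return 0;
}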
fl->fl_u.afs.state = AFS_LOCK_PENDING; + + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + + lock_flocks(); + + /* make sure we've got a callback on this file and that our view of the + * data version is up to date */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + + if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { + ret = -EAGAIN; + goto error; + } + + spin_lock(&vnode->lock); + + /* if we've already got a readlock on the server then we can instantly + * grant another readlock, irrespective of whether there are any + * pending writelocks */ + if (type == AFS_LOCK_READ && + vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + _debug("instant readlock"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + ASSERT(!list_empty(&vnode->granted_locks)); + goto sharing_existing_lock; + } + + /* if there's no-one else with a lock on this vnode, then we need to + * ask the server for a lock */ + if (list_empty(&vnode->pending_locks) && + list_empty(&vnode->granted_locks)) { + _debug("not locked"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); + set_bit(AFS_VNODE_LOCKING, &vnode->flags); + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case 0: + _debug("acquired"); + goto acquired_server_lock; + case -EWOULDBLOCK: + _debug("would block"); + spin_lock(&vnode->lock); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, + &fl->fl_u.afs.link); + goto wait; + default: + spin_lock(&vnode->lock); + list_del_init(&fl->fl_u.afs.link); + spin_unlock(&vnode->lock); + goto error; + } + } + + /* otherwise, we need to wait for a local lock to become available */ + _debug("wait local"); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); +wait: + if (!(fl->fl_flags & FL_SLEEP)) { + _debug("noblock"); + ret = -EAGAIN; + goto abort_attempt; + } + spin_unlock(&vnode->lock); + + /* now we need to sleep and wait for the lock manager thread to get the + * lock from the server */ + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state <= AFS_LOCK_GRANTED); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) + goto error; + spin_lock(&vnode->lock); + goto given_lock; + } + + /* we were interrupted, but someone may still be in the throes of + * giving us the lock */ + _debug("intr"); + ASSERTCMP(ret, ==, -ERESTARTSYS); + + spin_lock(&vnode->lock); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) { + spin_unlock(&vnode->lock); + goto error; + } + goto given_lock; + } + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + if (list_empty(&vnode->granted_locks) && + vnode->pending_locks.next == &fl->fl_u.afs.link) { + if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { + /* kick the next pending lock into having a go */ + list_del_init(&fl->fl_u.afs.link); + afs_lock_may_be_available(vnode); + } + } else { + list_del_init(&fl->fl_u.afs.link); + } + spin_unlock(&vnode->lock); + goto error; + +acquired_server_lock: + /* we've acquired a server lock, but it needs to be renewed after 5 + * mins */ + spin_lock(&vnode->lock); + 
afs_schedule_lock_extension(vnode); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); +sharing_existing_lock: + /* the lock has been granted as far as we're concerned... */ + fl->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); +given_lock: + /* ... but we do still need to get the VFS's blessing */ + ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); + ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED))) != 0); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) + goto vfs_rejected_lock; + spin_unlock(&vnode->lock); + + /* again, make sure we've got a callback on this file and, again, make + * sure that our view of the data version is up to date (we ignore + * errors incurred here and deal with the consequences elsewhere) */ + afs_vnode_fetch_status(vnode, NULL, key); + +error: + unlock_flocks(); + _leave(" = %d", ret); + return ret; + +vfs_rejected_lock: + /* the VFS rejected the lock we just obtained, so we have to discard + * what we just got */ + _debug("vfs refused %d", ret); + list_del_init(&fl->fl_u.afs.link); + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + goto abort_attempt; +} + +/* + * unlock on a file on the server + */ +static int afs_do_unlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file unlocks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + + spin_lock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) { + spin_unlock(&vnode->lock); + _leave(" = %d [vfs]", ret); + return ret; + } + + /* discard the server lock only if all granted locks are gone */ + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + spin_unlock(&vnode->lock); + _leave(" = 0"); + return 0; +} + +/* + * return information about a lock we currently hold, if indeed we hold one + */ +static int afs_do_getlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret, lock_count; + + _enter(""); + + fl->fl_type = F_UNLCK; + + mutex_lock(&vnode->vfs_inode.i_mutex); + + /* check local lock records first */ + ret = 0; + posix_test_lock(file, fl); + if (fl->fl_type == F_UNLCK) { + /* no local locks; consult the server */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + lock_count = vnode->status.lock_count; + if (lock_count) { + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + } + } + +error: + mutex_unlock(&vnode->vfs_inode.i_mutex); + _leave(" = %d [%hd]", ret, fl->fl_type); + return ret; +} + +/* + * manage POSIX locks on a file + */ +int afs_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + + _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags, + (long long) fl->fl_start, (long long) fl->fl_end); + + /* AFS doesn't support mandatory locks */ + if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type 
!= F_UNLCK) + return -ENOLCK; + + if (IS_GETLK(cmd)) + return afs_do_getlk(file, fl); + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * manage FLOCK locks on a file + */ +int afs_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + + _enter("{%x:%u},%d,{t=%x,fl=%x}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags); + + /* + * No BSD flocks over NFS allowed. + * Note: we could try to fake a POSIX lock request here by + * using ((u32) filp | 0x80000000) or some such as the pid. + * Not sure whether that would be unique, though, or whether + * that would break in other places. + */ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + /* we're simulating flock() locks using posix locks on the server */ + fl->fl_owner = (fl_owner_t) file; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * the POSIX lock management core VFS code copies the lock record and adds the + * copy into its own list, so we need to add that copy to the vnode's lock + * queue in the same place as the original (which will be deleted shortly + * after) + */ +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + _enter(""); + + list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); +} + +/* + * need to remove this lock from the vnode queue when it's removed from the + * VFS's list + */ +static void afs_fl_release_private(struct file_lock *fl) +{ + _enter(""); + + list_del_init(&fl->fl_u.afs.link); +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c new file mode 100644 index 00000000..346e3289 --- /dev/null +++ b/fs/afs/fsclient.c @@ -0,0 +1,1905 @@ +/* AFS File Server client stubs + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/circ_buf.h>
+#include "internal.h"
+#include "afs_fs.h"
+
+/*
+ * decode an AFSFid block
+ */
+static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
+{
+	const __be32 *bp = *_bp;
+
+	fid->vid = ntohl(*bp++);
+	fid->vnode = ntohl(*bp++);
+	fid->unique = ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSFetchStatus block
+ */
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
+				      struct afs_file_status *status,
+				      struct afs_vnode *vnode,
+				      afs_dataversion_t *store_version)
+{
+	afs_dataversion_t expected_version;
+	const __be32 *bp = *_bp;
+	umode_t mode;
+	u64 data_version, size;
+	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+
+#define EXTRACT(DST)			\
+	do {				\
+		u32 x = ntohl(*bp++);	\
+		changed |= DST - x;	\
+		DST = x;		\
+	} while (0)
+
+	status->if_version = ntohl(*bp++);
+	EXTRACT(status->type);
+	EXTRACT(status->nlink);
+	size = ntohl(*bp++);
+	data_version = ntohl(*bp++);
+	EXTRACT(status->author);
+	EXTRACT(status->owner);
+	EXTRACT(status->caller_access); /* call ticket dependent */
+	EXTRACT(status->anon_access);
+	EXTRACT(status->mode);
+	EXTRACT(status->parent.vnode);
+	EXTRACT(status->parent.unique);
+	bp++; /* seg size */
+	status->mtime_client = ntohl(*bp++);
+	status->mtime_server = ntohl(*bp++);
+	EXTRACT(status->group);
+	bp++; /* sync counter */
+	data_version |= (u64) ntohl(*bp++) << 32;
+	EXTRACT(status->lock_count);
+	size |= (u64) ntohl(*bp++) << 32;
+	bp++; /* spare 4 */
+	*_bp = bp;
+
+	if (size != status->size) {
+		status->size = size;
+		changed |= true;
+	}
+	status->mode &= S_IALLUGO;
+
+	_debug("vnode time %lx, %lx",
+	       status->mtime_client, status->mtime_server);
+
+	if (vnode) {
+		status->parent.vid = vnode->fid.vid;
+		if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode changed");
+			i_size_write(&vnode->vfs_inode, size);
+			vnode->vfs_inode.i_uid = status->owner;
+			vnode->vfs_inode.i_gid = status->group;
+			vnode->vfs_inode.i_generation = vnode->fid.unique;
+			vnode->vfs_inode.i_nlink = status->nlink;
+
+			mode = vnode->vfs_inode.i_mode;
+			mode &= ~S_IALLUGO;
+			mode |= status->mode;
+			barrier();
+			vnode->vfs_inode.i_mode = mode;
+		}
+
+		vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
+		vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_version = data_version;
+	}
+
+	expected_version = status->data_version;
+	if (store_version)
+		expected_version = *store_version;
+
+	if (expected_version != data_version) {
+		status->data_version = data_version;
+		if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode modified %llx on {%x:%u}",
+			       (unsigned long long) data_version,
+			       vnode->fid.vid, vnode->fid.vnode);
+			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+		}
+	} else if (store_version) {
+		status->data_version = data_version;
+	}
+}
+
+/*
+ * decode an AFSCallBack block
+ */
+static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+{
+	const __be32 *bp = *_bp;
+
+	vnode->cb_version = ntohl(*bp++);
+	vnode->cb_expiry = ntohl(*bp++);
+	vnode->cb_type = ntohl(*bp++);
+	vnode->cb_expires = vnode->cb_expiry + get_seconds();
+	*_bp = bp;
+}
+
+static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+				       struct afs_callback *cb)
+{
+	const __be32 *bp = *_bp;
+
+	cb->version = ntohl(*bp++);
+	cb->expiry = ntohl(*bp++);
+	cb->type = ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSVolSync block
+ */
+static void xdr_decode_AFSVolSync(const __be32 **_bp, + struct afs_volsync *volsync) +{ + const __be32 *bp = *_bp; + + volsync->creation = ntohl(*bp++); + bp++; /* spare2 */ + bp++; /* spare3 */ + bp++; /* spare4 */ + bp++; /* spare5 */ + bp++; /* spare6 */ + *_bp = bp; +} + +/* + * encode the requested attributes into an AFSStoreStatus block + */ +static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr) +{ + __be32 *bp = *_bp; + u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0; + + mask = 0; + if (attr->ia_valid & ATTR_MTIME) { + mask |= AFS_SET_MTIME; + mtime = attr->ia_mtime.tv_sec; + } + + if (attr->ia_valid & ATTR_UID) { + mask |= AFS_SET_OWNER; + owner = attr->ia_uid; + } + + if (attr->ia_valid & ATTR_GID) { + mask |= AFS_SET_GROUP; + group = attr->ia_gid; + } + + if (attr->ia_valid & ATTR_MODE) { + mask |= AFS_SET_MODE; + mode = attr->ia_mode & S_IALLUGO; + } + + *bp++ = htonl(mask); + *bp++ = htonl(mtime); + *bp++ = htonl(owner); + *bp++ = htonl(group); + *bp++ = htonl(mode); + *bp++ = 0; /* segment size */ + *_bp = bp; +} + +/* + * decode an AFSFetchVolumeStatus block + */ +static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, + struct afs_volume_status *vs) +{ + const __be32 *bp = *_bp; + + vs->vid = ntohl(*bp++); + vs->parent_id = ntohl(*bp++); + vs->online = ntohl(*bp++); + vs->in_service = ntohl(*bp++); + vs->blessed = ntohl(*bp++); + vs->needs_salvage = ntohl(*bp++); + vs->type = ntohl(*bp++); + vs->min_quota = ntohl(*bp++); + vs->max_quota = ntohl(*bp++); + vs->blocks_in_use = ntohl(*bp++); + vs->part_blocks_avail = ntohl(*bp++); + vs->part_max_blocks = ntohl(*bp++); + *_bp = bp; +} + +/* + * deliver reply data to an FS.FetchStatus + */ +static int afs_deliver_fs_fetch_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack(&bp, vnode); + if (call->reply2) + xdr_decode_AFSVolSync(&bp, call->reply2); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchStatus operation type + */ +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", + .deliver = afs_deliver_fs_fetch_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch the status information for a file + */ +int afs_fs_fetch_file_status(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + struct afs_volsync *volsync, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = volsync; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHSTATUS); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.FetchData + */ +static int afs_deliver_fs_fetch_data(struct afs_call *call, + 
struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + struct page *page; + void *buffer; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + if (call->operation_ID != FSFETCHDATA64) { + call->unmarshall++; + goto no_msw; + } + + /* extract the upper part of the returned data length of an + * FSFETCHDATA64 op (which should always be 0 using this + * client) */ + case 1: + _debug("extract data length (MSW)"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("DATA length MSW: %u", call->count); + if (call->count > 0) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + no_msw: + /* extract the returned data length */ + case 2: + _debug("extract data length"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("DATA length: %u", call->count); + if (call->count > PAGE_SIZE) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the returned data */ + case 3: + _debug("extract data"); + if (call->count > 0) { + page = call->reply3; + buffer = kmap_atomic(page, KM_USER0); + ret = afs_extract_data(call, skb, last, buffer, + call->count); + kunmap_atomic(buffer, KM_USER0); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + call->offset = 0; + call->unmarshall++; + + /* extract the metadata */ + case 4: + ret = afs_extract_data(call, skb, last, call->buffer, + (21 + 3 + 6) * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack(&bp, vnode); + if (call->reply2) + xdr_decode_AFSVolSync(&bp, call->reply2); + + call->offset = 0; + call->unmarshall++; + + case 5: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + if (call->count < PAGE_SIZE) { + _debug("clear"); + page = call->reply3; + buffer = kmap_atomic(page, KM_USER0); + memset(buffer + call->count, 0, PAGE_SIZE - call->count); + kunmap_atomic(buffer, KM_USER0); + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchData operation type + */ +static const struct afs_call_type afs_RXFSFetchData = { + .name = "FS.FetchData", + .deliver = afs_deliver_fs_fetch_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSFetchData64 = { + .name = "FS.FetchData64", + .deliver = afs_deliver_fs_fetch_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch data from a very large file + */ +static int afs_fs_fetch_data64(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + off_t offset, size_t length, + struct page *buffer, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + ASSERTCMP(length, <, ULONG_MAX); + + call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = NULL; /* volsync */ + call->reply3 = buffer; + call->service_id = FS_SERVICE; + 
call->port = htons(AFS_FS_PORT); + call->operation_ID = FSFETCHDATA64; + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHDATA64); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + bp[4] = htonl(upper_32_bits(offset)); + bp[5] = htonl((u32) offset); + bp[6] = 0; + bp[7] = htonl((u32) length); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * fetch data from a file + */ +int afs_fs_fetch_data(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + off_t offset, size_t length, + struct page *buffer, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + if (upper_32_bits(offset) || upper_32_bits(offset + length)) + return afs_fs_fetch_data64(server, key, vnode, offset, length, + buffer, wait_mode); + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = NULL; /* volsync */ + call->reply3 = buffer; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSFETCHDATA; + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHDATA); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + bp[4] = htonl(offset); + bp[5] = htonl(length); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.GiveUpCallBacks + */ +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; /* shouldn't be any reply data */ + return 0; +} + +/* + * FS.GiveUpCallBacks operation type + */ +static const struct afs_call_type afs_RXFSGiveUpCallBacks = { + .name = "FS.GiveUpCallBacks", + .deliver = afs_deliver_fs_give_up_callbacks, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * give up a set of callbacks + * - the callbacks are held in the server->cb_break ring + */ +int afs_fs_give_up_callbacks(struct afs_server *server, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t ncallbacks; + __be32 *bp, *tp; + int loop; + + ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail, + ARRAY_SIZE(server->cb_break)); + + _enter("{%zu},", ncallbacks); + + if (ncallbacks == 0) + return 0; + if (ncallbacks > AFSCBMAX) + ncallbacks = AFSCBMAX; + + _debug("break %zu callbacks", ncallbacks); + + call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, + 12 + ncallbacks * 6 * 4, 0); + if (!call) + return -ENOMEM; + + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + tp = bp + 2 + ncallbacks * 3; + *bp++ = htonl(FSGIVEUPCALLBACKS); + *bp++ = htonl(ncallbacks); + *tp++ = htonl(ncallbacks); + + atomic_sub(ncallbacks, &server->cb_break_n); + for (loop = ncallbacks; loop > 0; loop--) { + struct afs_callback *cb = + &server->cb_break[server->cb_break_tail]; + + *bp++ = htonl(cb->fid.vid); + *bp++ = htonl(cb->fid.vnode); + *bp++ = htonl(cb->fid.unique); + *tp++ = htonl(cb->version); + *tp++ = htonl(cb->expiry); + *tp++ = htonl(cb->type); + smp_mb(); + server->cb_break_tail = + (server->cb_break_tail + 1) & + (ARRAY_SIZE(server->cb_break) - 1); + } + + ASSERT(ncallbacks > 0); + wake_up_nr(&server->cb_break_waitq, ncallbacks); + + 
return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.CreateFile or an FS.MakeDir + */ +static int afs_deliver_fs_create_vnode(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack_raw(&bp, call->reply4); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.CreateFile and FS.MakeDir operation type + */ +static const struct afs_call_type afs_RXFSCreateXXXX = { + .name = "FS.CreateXXXX", + .deliver = afs_deliver_fs_create_vnode, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a file or make a directory + */ +int afs_fs_create(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + umode_t mode, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (6 * 4); + + call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz, + (3 + 21 + 21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = newfid; + call->reply3 = newstatus; + call->reply4 = newcb; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(S_ISDIR(mode) ? 
FSMAKEDIR : FSCREATEFILE); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(AFS_SET_MODE); + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ + *bp++ = 0; /* segment size */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.RemoveFile or FS.RemoveDir + */ +static int afs_deliver_fs_remove(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.RemoveDir/FS.RemoveFile operation type + */ +static const struct afs_call_type afs_RXFSRemoveXXXX = { + .name = "FS.RemoveXXXX", + .deliver = afs_deliver_fs_remove, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * remove a file or directory + */ +int afs_fs_remove(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + bool isdir, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz; + + call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(isdir ? 
FSREMOVEDIR : FSREMOVEFILE); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Link + */ +static int afs_deliver_fs_link(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Link operation type + */ +static const struct afs_call_type afs_RXFSLink = { + .name = "FS.Link", + .deliver = afs_deliver_fs_link, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * make a hard link + */ +int afs_fs_link(struct afs_server *server, + struct key *key, + struct afs_vnode *dvnode, + struct afs_vnode *vnode, + const char *name, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (3 * 4); + + call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = dvnode; + call->reply2 = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSLINK); + *bp++ = htonl(dvnode->fid.vid); + *bp++ = htonl(dvnode->fid.vnode); + *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Symlink + */ +static int afs_deliver_fs_symlink(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Symlink operation type + */ +static const struct afs_call_type afs_RXFSSymlink = { + .name = "FS.Symlink", + .deliver = afs_deliver_fs_symlink, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a symbolic link + */ +int afs_fs_symlink(struct afs_server *server, + struct key *key, + 
		   struct afs_vnode *vnode,
+		   const char *name,
+		   const char *contents,
+		   struct afs_fid *newfid,
+		   struct afs_file_status *newstatus,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+
+	c_namesz = strlen(contents);
+	c_padsz = (4 - (c_namesz & 3)) & 3;
+
+	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+				   (3 + 21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSYMLINK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(c_namesz);
+	memcpy(bp, contents, c_namesz);
+	bp = (void *) bp + c_namesz;
+	if (c_padsz > 0) {
+		memset(bp, 0, c_padsz);
+		bp = (void *) bp + c_padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(S_IRWXUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
+	if (new_dvnode != orig_dvnode)
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
+					  NULL);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+	.name		= "FS.Rename",
+	.deliver	= afs_deliver_fs_rename,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * rename a file or directory
+ */
+int afs_fs_rename(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *orig_dvnode,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	o_padsz = (4 - (o_namesz & 3)) & 3;
+
+	n_namesz = strlen(new_name);
+	n_padsz = (4 - (n_namesz & 3)) & 3;
+
+	reqsz = (4 * 4) +
+		4 + o_namesz + o_padsz +
+		(3 * 4) +
+		4 + n_namesz + n_padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = orig_dvnode;
+	call->reply2 = new_dvnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRENAME);
+	*bp++ = htonl(orig_dvnode->fid.vid);
+	*bp++ =
htonl(orig_dvnode->fid.vnode); + *bp++ = htonl(orig_dvnode->fid.unique); + *bp++ = htonl(o_namesz); + memcpy(bp, orig_name, o_namesz); + bp = (void *) bp + o_namesz; + if (o_padsz > 0) { + memset(bp, 0, o_padsz); + bp = (void *) bp + o_padsz; + } + + *bp++ = htonl(new_dvnode->fid.vid); + *bp++ = htonl(new_dvnode->fid.vnode); + *bp++ = htonl(new_dvnode->fid.unique); + *bp++ = htonl(n_namesz); + memcpy(bp, new_name, n_namesz); + bp = (void *) bp + n_namesz; + if (n_padsz > 0) { + memset(bp, 0, n_padsz); + bp = (void *) bp + n_padsz; + } + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.StoreData + */ +static int afs_deliver_fs_store_data(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) { + _leave(" = 0 [more]"); + return 0; + } + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, + &call->store_version); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + afs_pages_written_back(vnode, call); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.StoreData operation type + */ +static const struct afs_call_type afs_RXFSStoreData = { + .name = "FS.StoreData", + .deliver = afs_deliver_fs_store_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData64 = { + .name = "FS.StoreData64", + .deliver = afs_deliver_fs_store_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * store a set of pages to a very large file + */ +static int afs_fs_store_data64(struct afs_server *server, + struct afs_writeback *wb, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to, + loff_t size, loff_t pos, loff_t i_size, + const struct afs_wait_mode *wait_mode) +{ + struct afs_vnode *vnode = wb->vnode; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData64, + (4 + 6 + 3 * 2) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->wb = wb; + call->key = wb->key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->mapping = vnode->vfs_inode.i_mapping; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->store_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA64); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + *bp++ = 0; /* mask */ + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = 0; /* unix mode */ + *bp++ = 0; /* segment size */ + + *bp++ = htonl(pos >> 32); + *bp++ = htonl((u32) pos); + *bp++ = htonl(size >> 32); + *bp++ = htonl((u32) size); + *bp++ = htonl(i_size >> 32); + *bp++ = htonl((u32) i_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * store a set of pages + */ +int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, + pgoff_t first, pgoff_t last, + unsigned offset, 
unsigned to, + const struct afs_wait_mode *wait_mode) +{ + struct afs_vnode *vnode = wb->vnode; + struct afs_call *call; + loff_t size, pos, i_size; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + + size = to - offset; + if (first != last) + size += (loff_t)(last - first) << PAGE_SHIFT; + pos = (loff_t)first << PAGE_SHIFT; + pos += offset; + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + size > i_size) + i_size = size + pos; + + _debug("size %llx, at %llx, i_size %llx", + (unsigned long long) size, (unsigned long long) pos, + (unsigned long long) i_size); + + if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) + return afs_fs_store_data64(server, wb, first, last, offset, to, + size, pos, i_size, wait_mode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData, + (4 + 6 + 3) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->wb = wb; + call->key = wb->key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->mapping = vnode->vfs_inode.i_mapping; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->store_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + *bp++ = 0; /* mask */ + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = 0; /* unix mode */ + *bp++ = 0; /* segment size */ + + *bp++ = htonl(pos); + *bp++ = htonl(size); + *bp++ = htonl(i_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.StoreStatus + */ +static int afs_deliver_fs_store_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + afs_dataversion_t *store_version; + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) { + _leave(" = 0 [more]"); + return 0; + } + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + + /* unmarshall the reply once we've received all of it */ + store_version = NULL; + if (call->operation_ID == FSSTOREDATA) + store_version = &call->store_version; + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.StoreStatus operation type + */ +static const struct afs_call_type afs_RXFSStoreStatus = { + .name = "FS.StoreStatus", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData_as_Status = { + .name = "FS.StoreData", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData64_as_Status = { + .name = "FS.StoreData64", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * set the attributes on a very large file, using FS.StoreData rather than + * FS.StoreStatus so as to alter the file size also + */ +static int afs_fs_setattr_size64(struct afs_server *server, struct key 
*key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + ASSERT(attr->ia_valid & ATTR_SIZE); + + call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status, + (4 + 6 + 3 * 2) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->store_version = vnode->status.data_version + 1; + call->operation_ID = FSSTOREDATA; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA64); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + *bp++ = 0; /* position of start of write */ + *bp++ = 0; + *bp++ = 0; /* size of write */ + *bp++ = 0; + *bp++ = htonl(attr->ia_size >> 32); /* new file length */ + *bp++ = htonl((u32) attr->ia_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus + * so as to alter the file size also + */ +static int afs_fs_setattr_size(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + ASSERT(attr->ia_valid & ATTR_SIZE); + if (attr->ia_size >> 32) + return afs_fs_setattr_size64(server, key, vnode, attr, + wait_mode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status, + (4 + 6 + 3) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->store_version = vnode->status.data_version + 1; + call->operation_ID = FSSTOREDATA; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + *bp++ = 0; /* position of start of write */ + *bp++ = 0; /* size of write */ + *bp++ = htonl(attr->ia_size); /* new file length */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * set the attributes on a file, using FS.StoreData if there's a change in file + * size, and FS.StoreStatus otherwise + */ +int afs_fs_setattr(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + if (attr->ia_valid & ATTR_SIZE) + return afs_fs_setattr_size(server, key, vnode, attr, + wait_mode); + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSStoreStatus, + (4 + 6) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSSTORESTATUS; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTORESTATUS); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an 
FS.GetVolumeStatus + */ +static int afs_deliver_fs_get_volume_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + char *p; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* extract the returned status record */ + case 1: + _debug("extract status"); + ret = afs_extract_data(call, skb, last, call->buffer, + 12 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + bp = call->buffer; + xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); + call->offset = 0; + call->unmarshall++; + + /* extract the volume name length */ + case 2: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("volname length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the volume name */ + case 3: + _debug("extract volname"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("volname '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the volume name padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_volname_padding; + } + call->count = 4 - (call->count & 3); + + case 4: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_volname_padding: + + /* extract the offline message length */ + case 5: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("offline msg length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the offline message */ + case 6: + _debug("extract offline"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("offline '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the offline message padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_offline_padding; + } + call->count = 4 - (call->count & 3); + + case 7: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_offline_padding: + + /* extract the message of the day length */ + case 8: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("motd length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the message of the day */ + case 9: + _debug("extract motd"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch 
(ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("motd '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the message of the day padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_motd_padding; + } + call->count = 4 - (call->count & 3); + + case 10: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_motd_padding: + + case 11: + _debug("trailer %d", skb->len); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * destroy an FS.GetVolumeStatus call + */ +static void afs_get_volume_status_call_destructor(struct afs_call *call) +{ + kfree(call->reply3); + call->reply3 = NULL; + afs_flat_call_destructor(call); +} + +/* + * FS.GetVolumeStatus operation type + */ +static const struct afs_call_type afs_RXFSGetVolumeStatus = { + .name = "FS.GetVolumeStatus", + .deliver = afs_deliver_fs_get_volume_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_get_volume_status_call_destructor, +}; + +/* + * fetch the status of a volume + */ +int afs_fs_get_volume_status(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + struct afs_volume_status *vs, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + void *tmpbuf; + + _enter(""); + + tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + + call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4); + if (!call) { + kfree(tmpbuf); + return -ENOMEM; + } + + call->key = key; + call->reply = vnode; + call->reply2 = vs; + call->reply3 = tmpbuf; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSGETVOLUMESTATUS); + bp[1] = htonl(vnode->fid.vid); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock + */ +static int afs_deliver_fs_xxxx_lock(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.SetLock operation type + */ +static const struct afs_call_type afs_RXFSSetLock = { + .name = "FS.SetLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ExtendLock operation type + */ +static const struct afs_call_type afs_RXFSExtendLock = { + .name = "FS.ExtendLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ReleaseLock operation type + */ +static const struct afs_call_type afs_RXFSReleaseLock = { + .name = "FS.ReleaseLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * get a lock on a file + */ +int afs_fs_set_lock(struct afs_server *server, + struct key *key, + struct 
afs_vnode *vnode, + afs_lock_type_t type, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSETLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(type); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * extend a lock on a file + */ +int afs_fs_extend_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSEXTENDLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * release a lock on a file + */ +int afs_fs_release_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSRELEASELOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} diff --git a/fs/afs/inode.c b/fs/afs/inode.c new file mode 100644 index 00000000..0fdab6e0 --- /dev/null +++ b/fs/afs/inode.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2002 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Authors: David Woodhouse + * David Howells + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +struct afs_iget_data { + struct afs_fid fid; + struct afs_volume *volume; /* volume on which resides */ +}; + +/* + * map the AFS file status to the inode member variables + */ +static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +{ + struct inode *inode = AFS_VNODE_TO_I(vnode); + + _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", + vnode->status.type, + vnode->status.nlink, + (unsigned long long) vnode->status.size, + vnode->status.data_version, + vnode->status.mode); + + switch (vnode->status.type) { + case AFS_FTYPE_FILE: + inode->i_mode = S_IFREG | vnode->status.mode; + inode->i_op = &afs_file_inode_operations; + inode->i_fop = &afs_file_operations; + break; + case AFS_FTYPE_DIR: + inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_op = &afs_dir_inode_operations; + inode->i_fop = &afs_dir_file_operations; + break; + case AFS_FTYPE_SYMLINK: + inode->i_mode = S_IFLNK | vnode->status.mode; + inode->i_op = &page_symlink_inode_operations; + break; + default: + printk("kAFS: AFS vnode with undefined type\n"); + return -EBADMSG; + } + +#ifdef CONFIG_AFS_FSCACHE + if (vnode->status.size != inode->i_size) + fscache_attr_changed(vnode->cache); +#endif + + inode->i_nlink = vnode->status.nlink; + inode->i_uid = vnode->status.owner; + inode->i_gid = 0; + inode->i_size = vnode->status.size; + inode->i_ctime.tv_sec = vnode->status.mtime_server; + inode->i_ctime.tv_nsec = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_blocks = 0; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; + inode->i_mapping->a_ops = &afs_fs_aops; + + /* check to see whether a symbolic link is really a mountpoint */ + if (vnode->status.type == AFS_FTYPE_SYMLINK) { + afs_mntpt_check_symlink(vnode, key); + + if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) { + inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_op = &afs_mntpt_inode_operations; + inode->i_fop = &afs_mntpt_file_operations; + } + } + + return 0; +} + +/* + * iget5() comparator + */ +static int afs_iget5_test(struct inode *inode, void *opaque) +{ + struct afs_iget_data *data = opaque; + + return inode->i_ino == data->fid.vnode && + inode->i_generation == data->fid.unique; +} + +/* + * iget5() comparator for inode created by autocell operations + * + * These pseudo inodes don't match anything. 
+ */ +static int afs_iget5_autocell_test(struct inode *inode, void *opaque) +{ + return 0; +} + +/* + * iget5() inode initialiser + */ +static int afs_iget5_set(struct inode *inode, void *opaque) +{ + struct afs_iget_data *data = opaque; + struct afs_vnode *vnode = AFS_FS_I(inode); + + inode->i_ino = data->fid.vnode; + inode->i_generation = data->fid.unique; + vnode->fid = data->fid; + vnode->volume = data->volume; + + return 0; +} + +/* + * inode retrieval for autocell + */ +struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, + int namesz, struct key *key) +{ + struct afs_iget_data data; + struct afs_super_info *as; + struct afs_vnode *vnode; + struct super_block *sb; + struct inode *inode; + static atomic_t afs_autocell_ino; + + _enter("{%x:%u},%*.*s,", + AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode, + namesz, namesz, dev_name ?: ""); + + sb = dir->i_sb; + as = sb->s_fs_info; + data.volume = as->volume; + data.fid.vid = as->volume->vid; + data.fid.unique = 0; + data.fid.vnode = 0; + + inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino), + afs_iget5_autocell_test, afs_iget5_set, + &data); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }", + inode, inode->i_ino, data.fid.vid, data.fid.vnode, + data.fid.unique); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + BUG_ON(!(inode->i_state & I_NEW)); + + inode->i_size = 0; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &afs_autocell_inode_operations; + inode->i_nlink = 2; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_ctime.tv_sec = get_seconds(); + inode->i_ctime.tv_nsec = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_blocks = 0; + inode->i_version = 0; + inode->i_generation = 0; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p", inode); + return inode; +} + +/* + * inode retrieval + */ +struct inode *afs_iget(struct super_block *sb, struct key *key, + struct afs_fid *fid, struct afs_file_status *status, + struct afs_callback *cb) +{ + struct afs_iget_data data = { .fid = *fid }; + struct afs_super_info *as; + struct afs_vnode *vnode; + struct inode *inode; + int ret; + + _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique); + + as = sb->s_fs_info; + data.volume = as->volume; + + inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set, + &data); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { vl=%x vn=%x, u=%x }", + inode, fid->vid, fid->vnode, fid->unique); + + vnode = AFS_FS_I(inode); + + /* deal with an existing inode */ + if (!(inode->i_state & I_NEW)) { + _leave(" = %p", inode); + return inode; + } + + if (!status) { + /* it's a remotely extant inode */ + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto bad_inode; + } else { + /* it's an inode we just created */ + memcpy(&vnode->status, status, sizeof(vnode->status)); + + if (!cb) { + /* it's a symlink we just created (the fileserver + * didn't give us a callback) */ + vnode->cb_version = 0; + vnode->cb_expiry = 0; + vnode->cb_type = 0; + vnode->cb_expires = get_seconds(); + } else { + vnode->cb_version = cb->version; + vnode->cb_expiry = cb->expiry; + vnode->cb_type = cb->type; + vnode->cb_expires = vnode->cb_expiry + get_seconds(); + } + } + + 
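+ /* Note: cb_expiry as returned by the fileserver is a duration in
+ * seconds, so adding get_seconds() above converts it into an absolute
+ * expiry time; when no callback was returned, cb_expires is set to
+ * "now" so the promise counts as already expired and afs_validate()
+ * will refetch the status on next use.
+ */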
/* set up caching before mapping the status, as map-status reads the + * first page of symlinks to see if they're really mountpoints */ + inode->i_size = vnode->status.size; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + vnode); +#endif + + ret = afs_inode_map_status(vnode, key); + if (ret < 0) + goto bad_inode; + + /* success */ + clear_bit(AFS_VNODE_UNSET, &vnode->flags); + inode->i_flags |= S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type); + return inode; + + /* failure */ +bad_inode: +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif + iget_failed(inode); + _leave(" = %d [bad]", ret); + return ERR_PTR(ret); +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->vfs_inode.i_mode)) + invalidate_remote_inode(&vnode->vfs_inode); + else + invalidate_inode_pages2(vnode->vfs_inode.i_mapping); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + int ret; + + _enter("{v={%x:%u} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (vnode->cb_promised && + !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && + !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { + if (vnode->cb_expires < get_seconds() + 10) { + _debug("callback expired"); + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + } else { + goto valid; + } + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto valid; + + mutex_lock(&vnode->validate_lock); + + /* if the promise has expired, we need to check the server again to get + * a new promise - note that if the (parent) directory's metadata was + * changed then the security may be different and we may no longer have + * access */ + if (!vnode->cb_promised || + test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + _debug("not promised"); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error_unlock; + _debug("new promise [fl=%lx]", vnode->flags); + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _debug("file already deleted"); + ret = -ESTALE; + goto error_unlock; + } + + /* if the vnode's data version number changed then its contents are + * different */ + if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + afs_zap_data(vnode); + + clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); + mutex_unlock(&vnode->validate_lock); +valid: + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; +} + +/* + * read the attributes of an inode + */ +int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + + inode = dentry->d_inode; + + _enter("{ 
ino=%lu v=%u }", inode->i_ino, inode->i_generation); + + generic_fillattr(inode, stat); + return 0; +} + +/* + * discard an AFS inode + */ +int afs_drop_inode(struct inode *inode) +{ + _enter(""); + + if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) + return generic_delete_inode(inode); + else + return generic_drop_inode(inode); +} + +/* + * clear an AFS inode + */ +void afs_evict_inode(struct inode *inode) +{ + struct afs_permits *permits; + struct afs_vnode *vnode; + + vnode = AFS_FS_I(inode); + + _enter("{%x:%u.%d} v=%u x=%u t=%u }", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + vnode->cb_version, + vnode->cb_expiry, + vnode->cb_type); + + _debug("CLEAR INODE %p", inode); + + ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + afs_give_up_callback(vnode); + + if (vnode->server) { + spin_lock(&vnode->server->fs_lock); + rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes); + spin_unlock(&vnode->server->fs_lock); + afs_put_server(vnode->server); + vnode->server = NULL; + } + + ASSERT(list_empty(&vnode->writebacks)); + ASSERT(!vnode->cb_promised); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif + + mutex_lock(&vnode->permits_lock); + permits = vnode->permits; + rcu_assign_pointer(vnode->permits, NULL); + mutex_unlock(&vnode->permits_lock); + if (permits) + call_rcu(&permits->rcu, afs_zap_permits); + + _leave(""); +} + +/* + * set the attributes of an inode + */ +int afs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct key *key; + int ret; + + _enter("{%x:%u},{n=%s},%x", + vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + attr->ia_valid); + + if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME))) { + _leave(" = 0 [unsupported]"); + return 0; + } + + /* flush any dirty data outstanding on a regular file */ + if (S_ISREG(vnode->vfs_inode.i_mode)) { + filemap_write_and_wait(vnode->vfs_inode.i_mapping); + afs_writeback_all(vnode); + } + + if (attr->ia_valid & ATTR_FILE) { + key = attr->ia_file->private_data; + } else { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + } + + ret = afs_vnode_setattr(vnode, key, attr); + if (!(attr->ia_valid & ATTR_FILE)) + key_put(key); + +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h new file mode 100644 index 00000000..1f3624d3 --- /dev/null +++ b/fs/afs/internal.h @@ -0,0 +1,890 @@ +/* internal AFS stuff + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "afs.h" +#include "afs_vl.h" + +#define AFS_CELL_MAX_ADDRS 15 + +struct pagevec; +struct afs_call; + +typedef enum { + AFS_VL_NEW, /* new, uninitialised record */ + AFS_VL_CREATING, /* creating record */ + AFS_VL_VALID, /* record is pending */ + AFS_VL_NO_VOLUME, /* no such volume available */ + AFS_VL_UPDATING, /* update in progress */ + AFS_VL_VOLUME_DELETED, /* volume was deleted */ + AFS_VL_UNCERTAIN, /* uncertain state (update failed) */ +} __attribute__((packed)) afs_vlocation_state_t; + +struct afs_mount_params { + bool rwpath; /* T if the parent should be considered R/W */ + bool force; /* T to force cell type */ + bool autocell; /* T if set auto mount operation */ + afs_voltype_t type; /* type of volume requested */ + int volnamesz; /* size of volume name */ + const char *volname; /* name of volume to mount */ + struct afs_cell *cell; /* cell in which to find volume */ + struct afs_volume *volume; /* volume record */ + struct key *key; /* key to use for secure mounting */ +}; + +/* + * definition of how to wait for the completion of an operation + */ +struct afs_wait_mode { + /* RxRPC received message notification */ + void (*rx_wakeup)(struct afs_call *call); + + /* synchronous call waiter and call dispatched notification */ + int (*wait)(struct afs_call *call); + + /* asynchronous call completion */ + void (*async_complete)(void *reply, int error); +}; + +extern const struct afs_wait_mode afs_sync_call; +extern const struct afs_wait_mode afs_async_call; + +/* + * a record of an in-progress RxRPC call + */ +struct afs_call { + const struct afs_call_type *type; /* type of call */ + const struct afs_wait_mode *wait_mode; /* completion wait mode */ + wait_queue_head_t waitq; /* processes awaiting completion */ + struct work_struct async_work; /* asynchronous work processor */ + struct work_struct work; /* actual work processor */ + struct sk_buff_head rx_queue; /* received packets */ + struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct key *key; /* security for this call */ + struct afs_server *server; /* server affected by incoming CM call */ + void *request; /* request data (first part) */ + struct address_space *mapping; /* page set */ + struct afs_writeback *wb; /* writeback being performed */ + void *buffer; /* reply receive buffer */ + void *reply; /* reply buffer (first part) */ + void *reply2; /* reply buffer (second part) */ + void *reply3; /* reply buffer (third part) */ + void *reply4; /* reply buffer (fourth part) */ + pgoff_t first; /* first page in mapping to deal with */ + pgoff_t last; /* last page in mapping to deal with */ + enum { /* call state */ + AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ + AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ + AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ + AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ + AFS_CALL_REPLYING, /* replying to incoming call */ + AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ + AFS_CALL_COMPLETE, /* successfully completed */ + AFS_CALL_BUSY, /* server was busy */ + AFS_CALL_ABORTED, /* call was aborted */ + AFS_CALL_ERROR, /* call failed due to error */ + } state; + int error; /* error code */ + unsigned request_size; /* size of request data */ + unsigned reply_max; /* maximum size of reply */ + unsigned reply_size; /* current size of reply */ + unsigned first_offset; /* offset into 
mapping[first] */ + unsigned last_to; /* amount of mapping[last] */ + unsigned offset; /* offset into received data store */ + unsigned char unmarshall; /* unmarshalling phase */ + bool incoming; /* T if incoming call */ + bool send_pages; /* T if data from mapping should be sent */ + u16 service_id; /* RxRPC service ID to call */ + __be16 port; /* target UDP port */ + __be32 operation_ID; /* operation ID for an incoming call */ + u32 count; /* count for use in unmarshalling */ + __be32 tmp; /* place to extract temporary data */ + afs_dataversion_t store_version; /* updated version expected from store */ +}; + +struct afs_call_type { + const char *name; + + /* deliver request or reply data to an call + * - returning an error will cause the call to be aborted + */ + int (*deliver)(struct afs_call *call, struct sk_buff *skb, + bool last); + + /* map an abort code to an error number */ + int (*abort_to_error)(u32 abort_code); + + /* clean up a call */ + void (*destructor)(struct afs_call *call); +}; + +/* + * record of an outstanding writeback on a vnode + */ +struct afs_writeback { + struct list_head link; /* link in vnode->writebacks */ + struct work_struct writer; /* work item to perform the writeback */ + struct afs_vnode *vnode; /* vnode to which this write applies */ + struct key *key; /* owner of this write */ + wait_queue_head_t waitq; /* completion and ready wait queue */ + pgoff_t first; /* first page in batch */ + pgoff_t point; /* last page in current store op */ + pgoff_t last; /* last page in batch (inclusive) */ + unsigned offset_first; /* offset into first page of start of write */ + unsigned to_last; /* offset into last page of end of write */ + int num_conflicts; /* count of conflicting writes in list */ + int usage; + bool conflicts; /* T if has dependent conflicts */ + enum { + AFS_WBACK_SYNCING, /* synchronisation being performed */ + AFS_WBACK_PENDING, /* write pending */ + AFS_WBACK_CONFLICTING, /* conflicting writes posted */ + AFS_WBACK_WRITING, /* writing back */ + AFS_WBACK_COMPLETE /* the writeback record has been unlinked */ + } state __attribute__((packed)); +}; + +/* + * AFS superblock private data + * - there's one superblock per volume + */ +struct afs_super_info { + struct afs_volume *volume; /* volume record */ + char rwparent; /* T if parent is R/W AFS volume */ +}; + +static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) +{ + return sb->s_fs_info; +} + +extern struct file_system_type afs_fs_type; + +/* + * entry in the cached cell catalogue + */ +struct afs_cache_cell { + char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */ + struct in_addr vl_servers[15]; /* cached cell VL servers */ +}; + +/* + * AFS cell record + */ +struct afs_cell { + atomic_t usage; + struct list_head link; /* main cell list link */ + struct key *anonymous_key; /* anonymous user key for this cell */ + struct list_head proc_link; /* /proc cell list link */ + struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + + /* server record management */ + rwlock_t servers_lock; /* active server list lock */ + struct list_head servers; /* active server list */ + + /* volume location record management */ + struct rw_semaphore vl_sem; /* volume management serialisation semaphore */ + struct list_head vl_list; /* cell's active VL record list */ + spinlock_t vl_lock; /* vl_list lock */ + unsigned short vl_naddrs; /* number of VL servers in addr list */ + unsigned short 
vl_curr_svix; /* current server index */ + struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */ + + char name[0]; /* cell name - must go last */ +}; + +/* + * entry in the cached volume location catalogue + */ +struct afs_cache_vlocation { + /* volume name (lowercase, padded with NULs) */ + uint8_t name[AFS_MAXVOLNAME + 1]; + + uint8_t nservers; /* number of entries used in servers[] */ + uint8_t vidmask; /* voltype mask for vid[] */ + uint8_t srvtmask[8]; /* voltype masks for servers[] */ +#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ +#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ +#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ + + afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */ + struct in_addr servers[8]; /* fileserver addresses */ + time_t rtime; /* last retrieval time */ +}; + +/* + * volume -> vnode hash table entry + */ +struct afs_cache_vhash { + afs_voltype_t vtype; /* which volume variation */ + uint8_t hash_bucket; /* which hash bucket this represents */ +} __attribute__((packed)); + +/* + * AFS volume location record + */ +struct afs_vlocation { + atomic_t usage; + time_t time_of_death; /* time at which put reduced usage to 0 */ + struct list_head link; /* link in cell volume location list */ + struct list_head grave; /* link in master graveyard list */ + struct list_head update; /* link in master update list */ + struct afs_cell *cell; /* cell to which volume belongs */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + struct afs_cache_vlocation vldb; /* volume information DB record */ + struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ + wait_queue_head_t waitq; /* status change waitqueue */ + time_t update_at; /* time at which record should be updated */ + spinlock_t lock; /* access lock */ + afs_vlocation_state_t state; /* volume location state */ + unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ + unsigned short upd_busy_cnt; /* EBUSY count during update */ + bool valid; /* T if valid */ +}; + +/* + * AFS fileserver record + */ +struct afs_server { + atomic_t usage; + time_t time_of_death; /* time at which put reduced usage to 0 */ + struct in_addr addr; /* server address */ + struct afs_cell *cell; /* cell in which server resides */ + struct list_head link; /* link in cell's server list */ + struct list_head grave; /* link in master graveyard list */ + struct rb_node master_rb; /* link in master by-addr tree */ + struct rw_semaphore sem; /* access lock */ + + /* file service access */ + struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */ + unsigned long fs_act_jif; /* time at which last activity occurred */ + unsigned long fs_dead_jif; /* time at which no longer to be considered dead */ + spinlock_t fs_lock; /* access lock */ + int fs_state; /* 0 or reason FS currently marked dead (-errno) */ + + /* callback promise management */ + struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */ + struct delayed_work cb_updater; /* callback updater */ + struct delayed_work cb_break_work; /* collected break dispatcher */ + wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */ + spinlock_t cb_lock; /* access lock */ + struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */ + atomic_t cb_break_n; /* number of pending breaks */ + u8 
cb_break_head; /* head of callback breaking ring */ + u8 cb_break_tail; /* tail of callback breaking ring */ +}; + +/* + * AFS volume access record + */ +struct afs_volume { + atomic_t usage; + struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ + struct afs_vlocation *vlocation; /* volume location */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + afs_volid_t vid; /* volume ID */ + afs_voltype_t type; /* type of volume */ + char type_force; /* force volume type (suppress R/O -> R/W) */ + unsigned short nservers; /* number of server slots filled */ + unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ + struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ + struct rw_semaphore server_sem; /* lock for accessing current server */ + struct backing_dev_info bdi; +}; + +/* + * vnode catalogue entry + */ +struct afs_cache_vnode { + afs_vnodeid_t vnode_id; /* vnode ID */ + unsigned vnode_unique; /* vnode ID uniquifier */ + afs_dataversion_t data_version; /* data version */ +}; + +/* + * AFS inode private data + */ +struct afs_vnode { + struct inode vfs_inode; /* the VFS's inode record */ + + struct afs_volume *volume; /* volume on which vnode resides */ + struct afs_server *server; /* server currently supplying this file */ + struct afs_fid fid; /* the file identifier for this inode */ + struct afs_file_status status; /* AFS status info for this file */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + struct afs_permits *permits; /* cache of permits so far obtained */ + struct mutex permits_lock; /* lock for altering permits list */ + struct mutex validate_lock; /* lock for validating this vnode */ + wait_queue_head_t update_waitq; /* status fetch waitqueue */ + int update_cnt; /* number of outstanding ops that will update the + * status */ + spinlock_t writeback_lock; /* lock for writebacks */ + spinlock_t lock; /* waitqueue/flags lock */ + unsigned long flags; +#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ +#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ +#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */ +#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ +#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ +#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ +#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ +#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ +#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ +#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ +#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ + + long acl_order; /* ACL check count (callback break count) */ + + struct list_head writebacks; /* alterations in pagecache that need writing */ + struct list_head pending_locks; /* locks waiting to be granted */ + struct list_head granted_locks; /* locks granted on this file */ + struct delayed_work lock_work; /* work to be done in locking */ + struct key *unlock_key; /* key to be used in unlocking */ + + /* outstanding callback notification on this file */ + struct rb_node server_rb; /* link in server->fs_vnodes */ + struct rb_node cb_promise; /* link in server->cb_promises */ + struct work_struct cb_broken_work; /* work to 
be done on callback break */ + time_t cb_expires; /* time at which callback expires */ + time_t cb_expires_at; /* time used to order cb_promise */ + unsigned cb_version; /* callback version */ + unsigned cb_expiry; /* callback expiry time */ + afs_callback_type_t cb_type; /* type of callback */ + bool cb_promised; /* true if promise still holds */ +}; + +/* + * cached security record for one user's attempt to access a vnode + */ +struct afs_permit { + struct key *key; /* RxRPC ticket holding a security context */ + afs_access_t access_mask; /* access mask for this key */ +}; + +/* + * cache of security records from attempts to access a vnode + */ +struct afs_permits { + struct rcu_head rcu; /* disposal procedure */ + int count; /* number of records */ + struct afs_permit permits[0]; /* the permits so far examined */ +}; + +/* + * record of one of a system's set of network interfaces + */ +struct afs_interface { + struct in_addr address; /* IPv4 address bound to interface */ + struct in_addr netmask; /* netmask applied to address */ + unsigned mtu; /* MTU of interface */ +}; + +/* + * UUID definition [internet draft] + * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns + * increments since midnight 15th October 1582 + * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID + * time + * - the clock sequence is a 14-bit counter to avoid duplicate times + */ +struct afs_uuid { + u32 time_low; /* low part of timestamp */ + u16 time_mid; /* mid part of timestamp */ + u16 time_hi_and_version; /* high part of timestamp and version */ +#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL +#define AFS_UUID_TIMEHI_MASK 0x0fff +#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */ +#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */ +#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */ + u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ +#define AFS_UUID_CLOCKHI_MASK 0x3f +#define AFS_UUID_VARIANT_STD 0x80 + u8 clock_seq_low; /* clock seq low */ + u8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + +/*****************************************************************************/ +/* + * cache.c + */ +#ifdef CONFIG_AFS_FSCACHE +extern struct fscache_netfs afs_cache_netfs; +extern struct fscache_cookie_def afs_cell_cache_index_def; +extern struct fscache_cookie_def afs_vlocation_cache_index_def; +extern struct fscache_cookie_def afs_volume_cache_index_def; +extern struct fscache_cookie_def afs_vnode_cache_index_def; +#else +#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) +#endif + +/* + * callback.c + */ +extern void afs_init_callback_state(struct afs_server *); +extern void afs_broken_callback_work(struct work_struct *); +extern void afs_break_callbacks(struct afs_server *, size_t, + struct afs_callback[]); +extern void afs_discard_callback_on_delete(struct afs_vnode *); +extern void afs_give_up_callback(struct afs_vnode *); +extern void afs_dispatch_give_up_callbacks(struct work_struct *); +extern void afs_flush_callback_breaks(struct afs_server *); +extern int __init afs_callback_update_init(void); +extern void afs_callback_update_kill(void); + +/* + * cell.c + */ +extern struct rw_semaphore afs_proc_cells_sem; +extern struct list_head afs_proc_cells; + +#define 
afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) +extern int afs_cell_init(char *); +extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool); +extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool); +extern struct afs_cell *afs_grab_cell(struct afs_cell *); +extern void afs_put_cell(struct afs_cell *); +extern void afs_cell_purge(void); + +/* + * cmservice.c + */ +extern bool afs_cm_incoming_call(struct afs_call *); + +/* + * dir.c + */ +extern const struct inode_operations afs_dir_inode_operations; +extern const struct dentry_operations afs_fs_dentry_operations; +extern const struct file_operations afs_dir_file_operations; + +/* + * file.c + */ +extern const struct address_space_operations afs_fs_aops; +extern const struct inode_operations afs_file_inode_operations; +extern const struct file_operations afs_file_operations; + +extern int afs_open(struct inode *, struct file *); +extern int afs_release(struct inode *, struct file *); +extern int afs_page_filler(void *, struct page *); + +/* + * flock.c + */ +extern void __exit afs_kill_lock_manager(void); +extern void afs_lock_work(struct work_struct *); +extern void afs_lock_may_be_available(struct afs_vnode *); +extern int afs_lock(struct file *, int, struct file_lock *); +extern int afs_flock(struct file *, int, struct file_lock *); + +/* + * fsclient.c + */ +extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, + struct afs_vnode *, struct afs_volsync *, + const struct afs_wait_mode *); +extern int afs_fs_give_up_callbacks(struct afs_server *, + const struct afs_wait_mode *); +extern int afs_fs_fetch_data(struct afs_server *, struct key *, + struct afs_vnode *, off_t, size_t, struct page *, + const struct afs_wait_mode *); +extern int afs_fs_create(struct afs_server *, struct key *, + struct afs_vnode *, const char *, umode_t, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, + const struct afs_wait_mode *); +extern int afs_fs_remove(struct afs_server *, struct key *, + struct afs_vnode *, const char *, bool, + const struct afs_wait_mode *); +extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, + struct afs_vnode *, const char *, + const struct afs_wait_mode *); +extern int afs_fs_symlink(struct afs_server *, struct key *, + struct afs_vnode *, const char *, const char *, + struct afs_fid *, struct afs_file_status *, + const struct afs_wait_mode *); +extern int afs_fs_rename(struct afs_server *, struct key *, + struct afs_vnode *, const char *, + struct afs_vnode *, const char *, + const struct afs_wait_mode *); +extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, + pgoff_t, pgoff_t, unsigned, unsigned, + const struct afs_wait_mode *); +extern int afs_fs_setattr(struct afs_server *, struct key *, + struct afs_vnode *, struct iattr *, + const struct afs_wait_mode *); +extern int afs_fs_get_volume_status(struct afs_server *, struct key *, + struct afs_vnode *, + struct afs_volume_status *, + const struct afs_wait_mode *); +extern int afs_fs_set_lock(struct afs_server *, struct key *, + struct afs_vnode *, afs_lock_type_t, + const struct afs_wait_mode *); +extern int afs_fs_extend_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); +extern int afs_fs_release_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); + +/* + * inode.c + */ +extern struct inode *afs_iget_autocell(struct inode *, const char *, int, + struct key *); 
+extern struct inode *afs_iget(struct super_block *, struct key *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *); +extern void afs_zap_data(struct afs_vnode *); +extern int afs_validate(struct afs_vnode *, struct key *); +extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int afs_setattr(struct dentry *, struct iattr *); +extern void afs_evict_inode(struct inode *); +extern int afs_drop_inode(struct inode *); + +/* + * main.c + */ +extern struct workqueue_struct *afs_wq; +extern struct afs_uuid afs_uuid; + +/* + * misc.c + */ +extern int afs_abort_to_error(u32); + +/* + * mntpt.c + */ +extern const struct inode_operations afs_mntpt_inode_operations; +extern const struct inode_operations afs_autocell_inode_operations; +extern const struct file_operations afs_mntpt_file_operations; + +extern struct vfsmount *afs_d_automount(struct path *); +extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); +extern void afs_mntpt_kill_timer(void); + +/* + * proc.c + */ +extern int afs_proc_init(void); +extern void afs_proc_cleanup(void); +extern int afs_proc_cell_setup(struct afs_cell *); +extern void afs_proc_cell_remove(struct afs_cell *); + +/* + * rxrpc.c + */ +extern int afs_open_socket(void); +extern void afs_close_socket(void); +extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, + const struct afs_wait_mode *); +extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, + size_t, size_t); +extern void afs_flat_call_destructor(struct afs_call *); +extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); +extern void afs_send_empty_reply(struct afs_call *); +extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); +extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, + size_t); + +/* + * security.c + */ +extern void afs_clear_permits(struct afs_vnode *); +extern void afs_cache_permit(struct afs_vnode *, struct key *, long); +extern void afs_zap_permits(struct rcu_head *); +extern struct key *afs_request_key(struct afs_cell *); +extern int afs_permission(struct inode *, int, unsigned int); + +/* + * server.c + */ +extern spinlock_t afs_server_peer_lock; + +#define afs_get_server(S) \ +do { \ + _debug("GET SERVER %d", atomic_read(&(S)->usage)); \ + atomic_inc(&(S)->usage); \ +} while(0) + +extern struct afs_server *afs_lookup_server(struct afs_cell *, + const struct in_addr *); +extern struct afs_server *afs_find_server(const struct in_addr *); +extern void afs_put_server(struct afs_server *); +extern void __exit afs_purge_servers(void); + +/* + * super.c + */ +extern int afs_fs_init(void); +extern void afs_fs_exit(void); + +/* + * use-rtnetlink.c + */ +extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); +extern int afs_get_MAC_address(u8 *, size_t); + +/* + * vlclient.c + */ +extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, + const char *, struct afs_cache_vlocation *, + const struct afs_wait_mode *); +extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, + afs_volid_t, afs_voltype_t, + struct afs_cache_vlocation *, + const struct afs_wait_mode *); + +/* + * vlocation.c + */ +#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0) + +extern int __init afs_vlocation_update_init(void); +extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *, + struct key *, + const char *, size_t); +extern void afs_put_vlocation(struct afs_vlocation *); +extern void 
afs_vlocation_purge(void); + +/* + * vnode.c + */ +static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +{ + return container_of(inode, struct afs_vnode, vfs_inode); +} + +static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +{ + return &vnode->vfs_inode; +} + +extern void afs_vnode_finalise_status_update(struct afs_vnode *, + struct afs_server *); +extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, + struct key *); +extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, + off_t, size_t, struct page *); +extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, + umode_t, struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_server **); +extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *, + bool); +extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *, + const char *); +extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *, + const char *, struct afs_fid *, + struct afs_file_status *, struct afs_server **); +extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, + struct key *, const char *, const char *); +extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, + unsigned, unsigned); +extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); +extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, + struct afs_volume_status *); +extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, + afs_lock_type_t); +extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); +extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); + +/* + * volume.c + */ +#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) + +extern void afs_put_volume(struct afs_volume *); +extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *); +extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *); +extern int afs_volume_release_fileserver(struct afs_vnode *, + struct afs_server *, int); + +/* + * write.c + */ +extern int afs_set_page_dirty(struct page *); +extern void afs_put_writeback(struct afs_writeback *); +extern int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); +extern int afs_writepage(struct page *, struct writeback_control *); +extern int afs_writepages(struct address_space *, struct writeback_control *); +extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); +extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, + unsigned long, loff_t); +extern int afs_writeback_all(struct afs_vnode *); +extern int afs_fsync(struct file *, int); + + +/*****************************************************************************/ +/* + * debug tracing + */ +extern unsigned afs_debug; + +#define dbgprintk(FMT,...) \ + printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) 
kdebug(FMT,##__VA_ARGS__) + +#elif defined(CONFIG_AFS_DEBUG) +#define AFS_DEBUG_KENTER 0x01 +#define AFS_DEBUG_KLEAVE 0x02 +#define AFS_DEBUG_KDEBUG 0x04 + +#define _enter(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ + kenter(FMT,##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ + kleave(FMT,##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ + kdebug(FMT,##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) +#endif + +/* + * debug assertion checking + */ +#if 1 // defined(__KDEBUGALL) + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTRANGE(L, OP1, N, OP2, H) \ +do { \ + if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ + (unsigned long)(L), (unsigned long)(N), \ + (unsigned long)(H)); \ + printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ + (unsigned long)(L), (unsigned long)(N), \ + (unsigned long)(H)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#else + +#define ASSERT(X) \ +do { \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ +} while(0) + +#define ASSERTRANGE(L, OP1, N, OP2, H) \ +do { \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ +} while(0) + +#endif /* __KDEBUGALL */ diff --git a/fs/afs/main.c b/fs/afs/main.c new file mode 100644 index 00000000..42dd2e49 --- /dev/null +++ b/fs/afs/main.c @@ -0,0 +1,184 @@ +/* AFS client file system + * + * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("AFS Client File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned afs_debug; +module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "AFS debugging mask"); + +static char *rootcell; + +module_param(rootcell, charp, 0); +MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); + +struct afs_uuid afs_uuid; +struct workqueue_struct *afs_wq; + +/* + * get a client UUID + */ +static int __init afs_get_client_UUID(void) +{ + struct timespec ts; + u64 uuidtime; + u16 clockseq; + int ret; + + /* read the MAC address of one of the external interfaces and construct + * a UUID from it */ + ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node)); + if (ret < 0) + return ret; + + getnstimeofday(&ts); + uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10; + uuidtime += ts.tv_nsec / 100; + uuidtime += AFS_UUID_TO_UNIX_TIME; + afs_uuid.time_low = uuidtime; + afs_uuid.time_mid = uuidtime >> 32; + afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; + afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; + + get_random_bytes(&clockseq, 2); + afs_uuid.clock_seq_low = clockseq; + afs_uuid.clock_seq_hi_and_reserved = + (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; + afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; + + _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + afs_uuid.time_low, + afs_uuid.time_mid, + afs_uuid.time_hi_and_version, + afs_uuid.clock_seq_hi_and_reserved, + afs_uuid.clock_seq_low, + afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2], + afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]); + + return 0; +} + +/* + * initialise the AFS client FS module + */ +static int __init afs_init(void) +{ + int ret; + + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + + ret = afs_get_client_UUID(); + if (ret < 0) + return ret; + + /* create workqueue */ + ret = -ENOMEM; + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + return ret; + + /* register the /proc stuff */ + ret = afs_proc_init(); + if (ret < 0) + goto error_proc; + +#ifdef CONFIG_AFS_FSCACHE + /* we want to be able to cache */ + ret = fscache_register_netfs(&afs_cache_netfs); + if (ret < 0) + goto error_cache; +#endif + + /* initialise the cell DB */ + ret = afs_cell_init(rootcell); + if (ret < 0) + goto error_cell_init; + + /* initialise the VL update process */ + ret = afs_vlocation_update_init(); + if (ret < 0) + goto error_vl_update_init; + + /* initialise the callback update process */ + ret = afs_callback_update_init(); + if (ret < 0) + goto error_callback_update_init; + + /* create the RxRPC transport */ + ret = afs_open_socket(); + if (ret < 0) + goto error_open_socket; + + /* register the filesystems */ + ret = afs_fs_init(); + if (ret < 0) + goto error_fs; + + return ret; + +error_fs: + afs_close_socket(); +error_open_socket: + afs_callback_update_kill(); +error_callback_update_init: + afs_vlocation_purge(); +error_vl_update_init: + afs_cell_purge(); +error_cell_init: +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); +error_cache: +#endif + afs_proc_cleanup(); +error_proc: + destroy_workqueue(afs_wq); + rcu_barrier(); + printk(KERN_ERR "kAFS: failed to register: %d\n", ret); + return ret; +} + +/* XXX late_initcall is kludgy, but the only alternative seems to create + * a transport upon the first mount, which is worse. Or is it? 
+ */ +late_initcall(afs_init); /* must be called after net/ to create socket */ + +/* + * clean up on module removal + */ +static void __exit afs_exit(void) +{ + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); + + afs_fs_exit(); + afs_kill_lock_manager(); + afs_close_socket(); + afs_purge_servers(); + afs_callback_update_kill(); + afs_vlocation_purge(); + destroy_workqueue(afs_wq); + afs_cell_purge(); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); +#endif + afs_proc_cleanup(); + rcu_barrier(); +} + +module_exit(afs_exit); diff --git a/fs/afs/misc.c b/fs/afs/misc.c new file mode 100644 index 00000000..0dd4dafe --- /dev/null +++ b/fs/afs/misc.c @@ -0,0 +1,75 @@ +/* miscellaneous bits + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_fs.h" + +/* + * convert an AFS abort code to a Linux error number + */ +int afs_abort_to_error(u32 abort_code) +{ + switch (abort_code) { + case 13: return -EACCES; + case 27: return -EFBIG; + case 30: return -EROFS; + case VSALVAGE: return -EIO; + case VNOVNODE: return -ENOENT; + case VNOVOL: return -ENOMEDIUM; + case VVOLEXISTS: return -EEXIST; + case VNOSERVICE: return -EIO; + case VOFFLINE: return -ENOENT; + case VONLINE: return -EEXIST; + case VDISKFULL: return -ENOSPC; + case VOVERQUOTA: return -EDQUOT; + case VBUSY: return -EBUSY; + case VMOVED: return -ENXIO; + case 0x2f6df0a: return -EWOULDBLOCK; + case 0x2f6df0c: return -EACCES; + case 0x2f6df0f: return -EBUSY; + case 0x2f6df10: return -EEXIST; + case 0x2f6df11: return -EXDEV; + case 0x2f6df13: return -ENOTDIR; + case 0x2f6df14: return -EISDIR; + case 0x2f6df15: return -EINVAL; + case 0x2f6df1a: return -EFBIG; + case 0x2f6df1b: return -ENOSPC; + case 0x2f6df1d: return -EROFS; + case 0x2f6df1e: return -EMLINK; + case 0x2f6df20: return -EDOM; + case 0x2f6df21: return -ERANGE; + case 0x2f6df22: return -EDEADLK; + case 0x2f6df23: return -ENAMETOOLONG; + case 0x2f6df24: return -ENOLCK; + case 0x2f6df26: return -ENOTEMPTY; + case 0x2f6df78: return -EDQUOT; + + case RXKADINCONSISTENCY: return -EPROTO; + case RXKADPACKETSHORT: return -EPROTO; + case RXKADLEVELFAIL: return -EKEYREJECTED; + case RXKADTICKETLEN: return -EKEYREJECTED; + case RXKADOUTOFSEQUENCE: return -EPROTO; + case RXKADNOAUTH: return -EKEYREJECTED; + case RXKADBADKEY: return -EKEYREJECTED; + case RXKADBADTICKET: return -EKEYREJECTED; + case RXKADUNKNOWNKEY: return -EKEYREJECTED; + case RXKADEXPIRED: return -EKEYEXPIRED; + case RXKADSEALEDINCON: return -EKEYREJECTED; + case RXKADDATALEN: return -EKEYREJECTED; + case RXKADILLEGALLEVEL: return -EKEYREJECTED; + + default: return -EREMOTEIO; + } +} diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c new file mode 100644 index 00000000..aa591841 --- /dev/null +++ b/fs/afs/mntpt.c @@ -0,0 +1,284 @@ +/* mountpoint management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static struct dentry *afs_mntpt_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd); +static int afs_mntpt_open(struct inode *inode, struct file *file); +static void afs_mntpt_expiry_timed_out(struct work_struct *work); + +const struct file_operations afs_mntpt_file_operations = { + .open = afs_mntpt_open, + .llseek = noop_llseek, +}; + +const struct inode_operations afs_mntpt_inode_operations = { + .lookup = afs_mntpt_lookup, + .readlink = page_readlink, + .getattr = afs_getattr, +}; + +const struct inode_operations afs_autocell_inode_operations = { + .getattr = afs_getattr, +}; + +static LIST_HEAD(afs_vfsmounts); +static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); + +static unsigned long afs_mntpt_expiry_timeout = 10 * 60; + +/* + * check a symbolic link to see whether it actually encodes a mountpoint + * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately + */ +int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) +{ + struct page *page; + size_t size; + char *buf; + int ret; + + _enter("{%x:%u,%u}", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + + /* read the contents of the symlink into the pagecache */ + page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, + afs_page_filler, key); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + ret = -EIO; + if (PageError(page)) + goto out_free; + + buf = kmap(page); + + /* examine the symlink's contents */ + size = vnode->status.size; + _debug("symlink to %*.*s", (int) size, (int) size, buf); + + if (size > 2 && + (buf[0] == '%' || buf[0] == '#') && + buf[size - 1] == '.' + ) { + _debug("symlink is a mountpoint"); + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + vnode->vfs_inode.i_flags |= S_AUTOMOUNT; + spin_unlock(&vnode->lock); + } + + ret = 0; + + kunmap(page); +out_free: + page_cache_release(page); +out: + _leave(" = %d", ret); + return ret; +} + +/* + * no valid lookup procedure on this sort of dir + */ +static struct dentry *afs_mntpt_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + _enter("%p,%p{%p{%s},%s}", + dir, + dentry, + dentry->d_parent, + dentry->d_parent ? + dentry->d_parent->d_name.name : (const unsigned char *) "", + dentry->d_name.name); + + return ERR_PTR(-EREMOTE); +} + +/* + * no valid open procedure on this sort of dir + */ +static int afs_mntpt_open(struct inode *inode, struct file *file) +{ + _enter("%p,%p{%p{%s},%s}", + inode, file, + file->f_path.dentry->d_parent, + file->f_path.dentry->d_parent ? 
+ file->f_path.dentry->d_parent->d_name.name : + (const unsigned char *) "", + file->f_path.dentry->d_name.name); + + return -EREMOTE; +} + +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct afs_super_info *super; + struct vfsmount *mnt; + struct afs_vnode *vnode; + struct page *page; + char *devname, *options; + bool rwpath = false; + int ret; + + _enter("{%s}", mntpt->d_name.name); + + BUG_ON(!mntpt->d_inode); + + ret = -ENOMEM; + devname = (char *) get_zeroed_page(GFP_KERNEL); + if (!devname) + goto error_no_devname; + + options = (char *) get_zeroed_page(GFP_KERNEL); + if (!options) + goto error_no_options; + + vnode = AFS_FS_I(mntpt->d_inode); + if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { + /* if the directory is a pseudo directory, use the d_name */ + static const char afs_root_cell[] = ":root.cell."; + unsigned size = mntpt->d_name.len; + + ret = -ENOENT; + if (size < 2 || size > AFS_MAXCELLNAME) + goto error_no_page; + + if (mntpt->d_name.name[0] == '.') { + devname[0] = '#'; + memcpy(devname + 1, mntpt->d_name.name, size - 1); + memcpy(devname + size, afs_root_cell, + sizeof(afs_root_cell)); + rwpath = true; + } else { + devname[0] = '%'; + memcpy(devname + 1, mntpt->d_name.name, size); + memcpy(devname + size + 1, afs_root_cell, + sizeof(afs_root_cell)); + } + } else { + /* read the contents of the AFS special symlink */ + loff_t size = i_size_read(mntpt->d_inode); + char *buf; + + ret = -EINVAL; + if (size > PAGE_SIZE - 1) + goto error_no_page; + + page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error_no_page; + } + + ret = -EIO; + if (PageError(page)) + goto error; + + buf = kmap_atomic(page, KM_USER0); + memcpy(devname, buf, size); + kunmap_atomic(buf, KM_USER0); + page_cache_release(page); + page = NULL; + } + + /* work out what options we want */ + super = AFS_FS_S(mntpt->d_sb); + memcpy(options, "cell=", 5); + strcpy(options + 5, super->volume->cell->name); + if (super->volume->type == AFSVL_RWVOL || rwpath) + strcat(options, ",rwpath"); + + /* try and do the mount */ + _debug("--- attempting mount %s -o %s ---", devname, options); + mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + _debug("--- mount result %p ---", mnt); + + free_page((unsigned long) devname); + free_page((unsigned long) options); + _leave(" = %p", mnt); + return mnt; + +error: + page_cache_release(page); +error_no_page: + free_page((unsigned long) options); +error_no_options: + free_page((unsigned long) devname); +error_no_devname: + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * handle an automount point + */ +struct vfsmount *afs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); + + newmnt = afs_mntpt_do_automount(path->dentry); + if (IS_ERR(newmnt)) + return newmnt; + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; +} + +/* + * handle mountpoint expiry timer going off + */ +static void afs_mntpt_expiry_timed_out(struct work_struct *work) +{ + _enter(""); + + if (!list_empty(&afs_vfsmounts)) { + mark_mounts_for_expiry(&afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + } + + _leave(""); +} + 
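+/*
+ * Summary of the automount path above: afs_d_automount() has
+ * afs_mntpt_do_automount() build a device name and then mounts it with
+ * vfs_kern_mount().  For a pseudo directory the directory name itself is
+ * turned into a "#cell:root.cell." device name (a leading '.' selects the
+ * '#' form and forces rwpath) or a "%cell:root.cell." one; for a real
+ * mountpoint symlink the symlink body is used verbatim.  The option string
+ * is "cell=<cellname>", with ",rwpath" appended for R/W volumes.  New
+ * mounts are placed on afs_vfsmounts via mnt_set_expiry(), and the delayed
+ * work item above runs mark_mounts_for_expiry() every
+ * afs_mntpt_expiry_timeout seconds (10 minutes) so that unused automounts
+ * are reaped.
+ */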
+/* + * kill the AFS mountpoint timer if it's still running + */ +void afs_mntpt_kill_timer(void) +{ + _enter(""); + + ASSERT(list_empty(&afs_vfsmounts)); + cancel_delayed_work_sync(&afs_mntpt_expiry_timer); +} diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c new file mode 100644 index 00000000..7ad36506 --- /dev/null +++ b/fs/afs/netdevices.c @@ -0,0 +1,68 @@ +/* AFS network device helpers + * + * Copyright (c) 2007 Patrick McHardy + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * get a MAC address from a random ethernet interface that has a real one + * - the buffer will normally be 6 bytes in size + */ +int afs_get_MAC_address(u8 *mac, size_t maclen) +{ + struct net_device *dev; + int ret = -ENODEV; + + BUG_ON(maclen != ETH_ALEN); + + rtnl_lock(); + dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); + if (dev) { + memcpy(mac, dev->dev_addr, maclen); + ret = 0; + } + rtnl_unlock(); + return ret; +} + +/* + * get a list of this system's interface IPv4 addresses, netmasks and MTUs + * - maxbufs must be at least 1 + * - returns the number of interface records in the buffer + */ +int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs, + bool wantloopback) +{ + struct net_device *dev; + struct in_device *idev; + int n = 0; + + ASSERT(maxbufs > 0); + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (dev->type == ARPHRD_LOOPBACK && !wantloopback) + continue; + idev = __in_dev_get_rtnl(dev); + if (!idev) + continue; + for_primary_ifa(idev) { + bufs[n].address.s_addr = ifa->ifa_address; + bufs[n].netmask.s_addr = ifa->ifa_mask; + bufs[n].mtu = dev->mtu; + n++; + if (n >= maxbufs) + goto out; + } endfor_ifa(idev); + } +out: + rtnl_unlock(); + return n; +} diff --git a/fs/afs/proc.c b/fs/afs/proc.c new file mode 100644 index 00000000..096b23f8 --- /dev/null +++ b/fs/afs/proc.c @@ -0,0 +1,744 @@ +/* /proc interface for AFS + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct proc_dir_entry *proc_afs; + + +static int afs_proc_cells_open(struct inode *inode, struct file *file); +static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos); +static void afs_proc_cells_stop(struct seq_file *p, void *v); +static int afs_proc_cells_show(struct seq_file *m, void *v); +static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, + size_t size, loff_t *_pos); + +static const struct seq_operations afs_proc_cells_ops = { + .start = afs_proc_cells_start, + .next = afs_proc_cells_next, + .stop = afs_proc_cells_stop, + .show = afs_proc_cells_show, +}; + +static const struct file_operations afs_proc_cells_fops = { + .open = afs_proc_cells_open, + .read = seq_read, + .write = afs_proc_cells_write, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int afs_proc_rootcell_open(struct inode *inode, struct file *file); +static int afs_proc_rootcell_release(struct inode *inode, struct file *file); +static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, + size_t size, loff_t *_pos); +static ssize_t afs_proc_rootcell_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos); + +static const struct file_operations afs_proc_rootcell_fops = { + .open = afs_proc_rootcell_open, + .read = afs_proc_rootcell_read, + .write = afs_proc_rootcell_write, + .llseek = no_llseek, + .release = afs_proc_rootcell_release, + .owner = THIS_MODULE, +}; + +static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); +static int afs_proc_cell_volumes_release(struct inode *inode, + struct file *file); +static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_volumes_ops = { + .start = afs_proc_cell_volumes_start, + .next = afs_proc_cell_volumes_next, + .stop = afs_proc_cell_volumes_stop, + .show = afs_proc_cell_volumes_show, +}; + +static const struct file_operations afs_proc_cell_volumes_fops = { + .open = afs_proc_cell_volumes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_cell_volumes_release, + .owner = THIS_MODULE, +}; + +static int afs_proc_cell_vlservers_open(struct inode *inode, + struct file *file); +static int afs_proc_cell_vlservers_release(struct inode *inode, + struct file *file); +static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_vlservers_ops = { + .start = afs_proc_cell_vlservers_start, + .next = afs_proc_cell_vlservers_next, + .stop = afs_proc_cell_vlservers_stop, + .show = afs_proc_cell_vlservers_show, +}; + +static const struct file_operations afs_proc_cell_vlservers_fops = { + .open = afs_proc_cell_vlservers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_cell_vlservers_release, + .owner = THIS_MODULE, +}; + +static int 
afs_proc_cell_servers_open(struct inode *inode, struct file *file); +static int afs_proc_cell_servers_release(struct inode *inode, + struct file *file); +static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); +static int afs_proc_cell_servers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_servers_ops = { + .start = afs_proc_cell_servers_start, + .next = afs_proc_cell_servers_next, + .stop = afs_proc_cell_servers_stop, + .show = afs_proc_cell_servers_show, +}; + +static const struct file_operations afs_proc_cell_servers_fops = { + .open = afs_proc_cell_servers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_cell_servers_release, + .owner = THIS_MODULE, +}; + +/* + * initialise the /proc/fs/afs/ directory + */ +int afs_proc_init(void) +{ + struct proc_dir_entry *p; + + _enter(""); + + proc_afs = proc_mkdir("fs/afs", NULL); + if (!proc_afs) + goto error_dir; + + p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); + if (!p) + goto error_cells; + + p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); + if (!p) + goto error_rootcell; + + _leave(" = 0"); + return 0; + +error_rootcell: + remove_proc_entry("cells", proc_afs); +error_cells: + remove_proc_entry("fs/afs", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/afs/ directory + */ +void afs_proc_cleanup(void) +{ + remove_proc_entry("rootcell", proc_afs); + remove_proc_entry("cells", proc_afs); + remove_proc_entry("fs/afs", NULL); +} + +/* + * open "/proc/fs/afs/cells" which provides a summary of extant cells + */ +static int afs_proc_cells_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &afs_proc_cells_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + down_read(&afs_proc_cells_sem); + return seq_list_start_head(&afs_proc_cells, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &afs_proc_cells, pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cells_stop(struct seq_file *p, void *v) +{ + up_read(&afs_proc_cells_sem); +} + +/* + * display a header line followed by a load of cell lines + */ +static int afs_proc_cells_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + + if (v == &afs_proc_cells) { + /* display header on line 1 */ + seq_puts(m, "USE NAME\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3d %s\n", + atomic_read(&cell->usage), cell->name); + return 0; +} + +/* + * handle writes to /proc/fs/afs/cells + * - to add cells: echo "add [:][:]" + */ +static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, + size_t size, loff_t *_pos) +{ + char *kbuf, *name, *args; + int ret; + + /* start by dragging the command into memory */ + if (size <= 1 || size >= PAGE_SIZE) + return -EINVAL; + + kbuf = kmalloc(size + 1, GFP_KERNEL); + if 
(!kbuf) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(kbuf, buf, size) != 0) + goto done; + kbuf[size] = 0; + + /* trim to first NL */ + name = memchr(kbuf, '\n', size); + if (name) + *name = 0; + + /* split into command, name and argslist */ + name = strchr(kbuf, ' '); + if (!name) + goto inval; + do { + *name++ = 0; + } while(*name == ' '); + if (!*name) + goto inval; + + args = strchr(name, ' '); + if (!args) + goto inval; + do { + *args++ = 0; + } while(*args == ' '); + if (!*args) + goto inval; + + /* determine command to perform */ + _debug("cmd=%s name=%s args=%s", kbuf, name, args); + + if (strcmp(kbuf, "add") == 0) { + struct afs_cell *cell; + + cell = afs_cell_create(name, strlen(name), args, false); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto done; + } + + afs_put_cell(cell); + printk("kAFS: Added new cell '%s'\n", name); + } else { + goto inval; + } + + ret = size; + +done: + kfree(kbuf); + _leave(" = %d", ret); + return ret; + +inval: + ret = -EINVAL; + printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n"); + goto done; +} + +/* + * Stubs for /proc/fs/afs/rootcell + */ +static int afs_proc_rootcell_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int afs_proc_rootcell_release(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, + size_t size, loff_t *_pos) +{ + return 0; +} + +/* + * handle writes to /proc/fs/afs/rootcell + * - to initialize rootcell: echo "cell.name:192.168.231.14" + */ +static ssize_t afs_proc_rootcell_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos) +{ + char *kbuf, *s; + int ret; + + /* start by dragging the command into memory */ + if (size <= 1 || size >= PAGE_SIZE) + return -EINVAL; + + ret = -ENOMEM; + kbuf = kmalloc(size + 1, GFP_KERNEL); + if (!kbuf) + goto nomem; + + ret = -EFAULT; + if (copy_from_user(kbuf, buf, size) != 0) + goto infault; + kbuf[size] = 0; + + /* trim to first NL */ + s = memchr(kbuf, '\n', size); + if (s) + *s = 0; + + /* determine command to perform */ + _debug("rootcell=%s", kbuf); + + ret = afs_cell_init(kbuf); + if (ret >= 0) + ret = size; /* consume everything, always */ + +infault: + kfree(kbuf); +nomem: + _leave(" = %d", ret); + return ret; +} + +/* + * initialise /proc/fs/afs// + */ +int afs_proc_cell_setup(struct afs_cell *cell) +{ + struct proc_dir_entry *p; + + _enter("%p{%s}", cell, cell->name); + + cell->proc_dir = proc_mkdir(cell->name, proc_afs); + if (!cell->proc_dir) + goto error_dir; + + p = proc_create_data("servers", 0, cell->proc_dir, + &afs_proc_cell_servers_fops, cell); + if (!p) + goto error_servers; + + p = proc_create_data("vlservers", 0, cell->proc_dir, + &afs_proc_cell_vlservers_fops, cell); + if (!p) + goto error_vlservers; + + p = proc_create_data("volumes", 0, cell->proc_dir, + &afs_proc_cell_volumes_fops, cell); + if (!p) + goto error_volumes; + + _leave(" = 0"); + return 0; + +error_volumes: + remove_proc_entry("vlservers", cell->proc_dir); +error_vlservers: + remove_proc_entry("servers", cell->proc_dir); +error_servers: + remove_proc_entry(cell->name, proc_afs); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * remove /proc/fs/afs// + */ +void afs_proc_cell_remove(struct afs_cell *cell) +{ + _enter(""); + + remove_proc_entry("volumes", cell->proc_dir); + remove_proc_entry("vlservers", cell->proc_dir); + remove_proc_entry("servers", cell->proc_dir); + remove_proc_entry(cell->name, proc_afs); + + _leave(""); +} + +/* + 
* open "/proc/fs/afs//volumes" which provides a summary of extant cells + */ +static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE(inode)->data; + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_volumes_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = cell; + + return 0; +} + +/* + * close the file and release the ref to the cell + */ +static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) +{ + struct afs_cell *cell = m->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + down_read(&cell->vl_sem); + return seq_list_start_head(&cell->vl_list, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + return seq_list_next(v, &cell->vl_list, _pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) +{ + struct afs_cell *cell = p->private; + + up_read(&cell->vl_sem); +} + +static const char afs_vlocation_states[][4] = { + [AFS_VL_NEW] = "New", + [AFS_VL_CREATING] = "Crt", + [AFS_VL_VALID] = "Val", + [AFS_VL_NO_VOLUME] = "NoV", + [AFS_VL_UPDATING] = "Upd", + [AFS_VL_VOLUME_DELETED] = "Del", + [AFS_VL_UNCERTAIN] = "Unc", +}; + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = m->private; + struct afs_vlocation *vlocation = + list_entry(v, struct afs_vlocation, link); + + /* display header on line 1 */ + if (v == &cell->vl_list) { + seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3d %s %08x %08x %08x %s\n", + atomic_read(&vlocation->usage), + afs_vlocation_states[vlocation->state], + vlocation->vldb.vid[0], + vlocation->vldb.vid[1], + vlocation->vldb.vid[2], + vlocation->vldb.name); + + return 0; +} + +/* + * open "/proc/fs/afs//vlservers" which provides a list of volume + * location server + */ +static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE(inode)->data; + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_vlservers_ops); + if (ret<0) + return ret; + + m = file->private_data; + m->private = cell; + + return 0; +} + +/* + * close the file and release the ref to the cell + */ +static int afs_proc_cell_vlservers_release(struct inode *inode, + struct file *file) +{ + return seq_release(inode, file); +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) +{ + struct afs_cell *cell = m->private; + loff_t pos = *_pos; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + down_read(&cell->vl_sem); + + /* allow for the header line */ + if (!pos) + return (void *) 1; + pos--; + + if (pos >= cell->vl_naddrs) + return NULL; + + 
return &cell->vl_addrs[pos]; +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + loff_t pos; + + _enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos); + + pos = *_pos; + (*_pos)++; + if (pos >= cell->vl_naddrs) + return NULL; + + return &cell->vl_addrs[pos]; +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) +{ + struct afs_cell *cell = p->private; + + up_read(&cell->vl_sem); +} + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) +{ + struct in_addr *addr = v; + + /* display header on line 1 */ + if (v == (struct in_addr *) 1) { + seq_puts(m, "ADDRESS\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%pI4\n", &addr->s_addr); + return 0; +} + +/* + * open "/proc/fs/afs//servers" which provides a summary of active + * servers + */ +static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE(inode)->data; + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_servers_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = cell; + return 0; +} + +/* + * close the file and release the ref to the cell + */ +static int afs_proc_cell_servers_release(struct inode *inode, + struct file *file) +{ + return seq_release(inode, file); +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(m->private->servers_lock) +{ + struct afs_cell *cell = m->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + read_lock(&cell->servers_lock); + return seq_list_start_head(&cell->servers, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + return seq_list_next(v, &cell->servers, _pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) + __releases(p->private->servers_lock) +{ + struct afs_cell *cell = p->private; + + read_unlock(&cell->servers_lock); +} + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_servers_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = m->private; + struct afs_server *server = list_entry(v, struct afs_server, link); + char ipaddr[20]; + + /* display header on line 1 */ + if (v == &cell->servers) { + seq_puts(m, "USE ADDR STATE\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + sprintf(ipaddr, "%pI4", &server->addr); + seq_printf(m, "%3d %-15.15s %5d\n", + atomic_read(&server->usage), ipaddr, server->fs_state); + + return 0; +} diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c new file mode 100644 index 00000000..8ad8c2a0 --- /dev/null +++ b/fs/afs/rxrpc.c @@ -0,0 +1,859 @@ +/* Maintain an RxRPC server socket to do AFS communications through + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_cm.h" + +static struct socket *afs_socket; /* my RxRPC socket */ +static struct workqueue_struct *afs_async_calls; +static atomic_t afs_outstanding_calls; +static atomic_t afs_outstanding_skbs; + +static void afs_wake_up_call_waiter(struct afs_call *); +static int afs_wait_for_call_to_complete(struct afs_call *); +static void afs_wake_up_async_call(struct afs_call *); +static int afs_dont_wait_for_call_to_complete(struct afs_call *); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); +static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); + +/* synchronous call management */ +const struct afs_wait_mode afs_sync_call = { + .rx_wakeup = afs_wake_up_call_waiter, + .wait = afs_wait_for_call_to_complete, +}; + +/* asynchronous call management */ +const struct afs_wait_mode afs_async_call = { + .rx_wakeup = afs_wake_up_async_call, + .wait = afs_dont_wait_for_call_to_complete, +}; + +/* asynchronous incoming call management */ +static const struct afs_wait_mode afs_async_incoming_call = { + .rx_wakeup = afs_wake_up_async_call, +}; + +/* asynchronous incoming call initial processing */ +static const struct afs_call_type afs_RXCMxxxx = { + .name = "CB.xxxx", + .deliver = afs_deliver_cm_op_id, + .abort_to_error = afs_abort_to_error, +}; + +static void afs_collect_incoming_call(struct work_struct *); + +static struct sk_buff_head afs_incoming_calls; +static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); + +/* + * open an RxRPC socket and bind it to be a server for callback notifications + * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT + */ +int afs_open_socket(void) +{ + struct sockaddr_rxrpc srx; + struct socket *socket; + int ret; + + _enter(""); + + skb_queue_head_init(&afs_incoming_calls); + + afs_async_calls = create_singlethread_workqueue("kafsd"); + if (!afs_async_calls) { + _leave(" = -ENOMEM [wq]"); + return -ENOMEM; + } + + ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + if (ret < 0) { + destroy_workqueue(afs_async_calls); + _leave(" = %d [socket]", ret); + return ret; + } + + socket->sk->sk_allocation = GFP_NOFS; + + /* bind the callback manager's address to make this a server socket */ + srx.srx_family = AF_RXRPC; + srx.srx_service = CM_SERVICE; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(AFS_CM_PORT); + memset(&srx.transport.sin.sin_addr, 0, + sizeof(srx.transport.sin.sin_addr)); + + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret < 0) { + sock_release(socket); + destroy_workqueue(afs_async_calls); + _leave(" = %d [bind]", ret); + return ret; + } + + rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); + + afs_socket = socket; + _leave(" = 0"); + return 0; +} + +/* + * close the RxRPC socket AFS was using + */ +void afs_close_socket(void) +{ + _enter(""); + + sock_release(afs_socket); + + _debug("dework"); + destroy_workqueue(afs_async_calls); + + 
ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); + ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0); + _leave(""); +} + +/* + * note that the data in a socket buffer is now delivered and that the buffer + * should be freed + */ +static void afs_data_delivered(struct sk_buff *skb) +{ + if (!skb) { + _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); + dump_stack(); + } else { + _debug("DLVR %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + if (atomic_dec_return(&afs_outstanding_skbs) == -1) + BUG(); + rxrpc_kernel_data_delivered(skb); + } +} + +/* + * free a socket buffer + */ +static void afs_free_skb(struct sk_buff *skb) +{ + if (!skb) { + _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); + dump_stack(); + } else { + _debug("FREE %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + if (atomic_dec_return(&afs_outstanding_skbs) == -1) + BUG(); + rxrpc_kernel_free_skb(skb); + } +} + +/* + * free a call + */ +static void afs_free_call(struct afs_call *call) +{ + _debug("DONE %p{%s} [%d]", + call, call->type->name, atomic_read(&afs_outstanding_calls)); + if (atomic_dec_return(&afs_outstanding_calls) == -1) + BUG(); + + ASSERTCMP(call->rxcall, ==, NULL); + ASSERT(!work_pending(&call->async_work)); + ASSERT(skb_queue_empty(&call->rx_queue)); + ASSERT(call->type->name != NULL); + + kfree(call->request); + kfree(call); +} + +/* + * allocate a call with flat request and reply buffers + */ +struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, + size_t request_size, size_t reply_size) +{ + struct afs_call *call; + + call = kzalloc(sizeof(*call), GFP_NOFS); + if (!call) + goto nomem_call; + + _debug("CALL %p{%s} [%d]", + call, type->name, atomic_read(&afs_outstanding_calls)); + atomic_inc(&afs_outstanding_calls); + + call->type = type; + call->request_size = request_size; + call->reply_max = reply_size; + + if (request_size) { + call->request = kmalloc(request_size, GFP_NOFS); + if (!call->request) + goto nomem_free; + } + + if (reply_size) { + call->buffer = kmalloc(reply_size, GFP_NOFS); + if (!call->buffer) + goto nomem_free; + } + + init_waitqueue_head(&call->waitq); + skb_queue_head_init(&call->rx_queue); + return call; + +nomem_free: + afs_free_call(call); +nomem_call: + return NULL; +} + +/* + * clean up a call with flat buffer + */ +void afs_flat_call_destructor(struct afs_call *call) +{ + _enter(""); + + kfree(call->request); + call->request = NULL; + kfree(call->buffer); + call->buffer = NULL; +} + +/* + * attach the data from a bunch of pages on an inode to a call + */ +static int afs_send_pages(struct afs_call *call, struct msghdr *msg, + struct kvec *iov) +{ + struct page *pages[8]; + unsigned count, n, loop, offset, to; + pgoff_t first = call->first, last = call->last; + int ret; + + _enter(""); + + offset = call->first_offset; + call->first_offset = 0; + + do { + _debug("attach %lx-%lx", first, last); + + count = last - first + 1; + if (count > ARRAY_SIZE(pages)) + count = ARRAY_SIZE(pages); + n = find_get_pages_contig(call->mapping, first, count, pages); + ASSERTCMP(n, ==, count); + + loop = 0; + do { + msg->msg_flags = 0; + to = PAGE_SIZE; + if (first + loop >= last) + to = call->last_to; + else + msg->msg_flags = MSG_MORE; + iov->iov_base = kmap(pages[loop]) + offset; + iov->iov_len = to - offset; + offset = 0; + + _debug("- range %u-%u%s", + offset, to, msg->msg_flags ? 
" [more]" : ""); + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = 1; + + /* have to change the state *before* sending the last + * packet as RxRPC might give us the reply before it + * returns from sending the request */ + if (first + loop >= last) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(call->rxcall, msg, + to - offset); + kunmap(pages[loop]); + if (ret < 0) + break; + } while (++loop < count); + first += count; + + for (loop = 0; loop < count; loop++) + put_page(pages[loop]); + if (ret < 0) + break; + } while (first <= last); + + _leave(" = %d", ret); + return ret; +} + +/* + * initiate a call + */ +int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, + const struct afs_wait_mode *wait_mode) +{ + struct sockaddr_rxrpc srx; + struct rxrpc_call *rxcall; + struct msghdr msg; + struct kvec iov[1]; + int ret; + struct sk_buff *skb; + + _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); + + ASSERT(call->type != NULL); + ASSERT(call->type->name != NULL); + + _debug("____MAKE %p{%s,%x} [%d]____", + call, call->type->name, key_serial(call->key), + atomic_read(&afs_outstanding_calls)); + + call->wait_mode = wait_mode; + INIT_WORK(&call->async_work, afs_process_async_call); + + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.srx_service = call->service_id; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = call->port; + memcpy(&srx.transport.sin.sin_addr, addr, 4); + + /* create a call */ + rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, + (unsigned long) call, gfp); + call->key = NULL; + if (IS_ERR(rxcall)) { + ret = PTR_ERR(rxcall); + goto error_kill_call; + } + + call->rxcall = rxcall; + + /* send the request */ + iov[0].iov_base = call->request; + iov[0].iov_len = call->request_size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = (struct iovec *) iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = (call->send_pages ? 
MSG_MORE : 0); + + /* have to change the state *before* sending the last packet as RxRPC + * might give us the reply before it returns from sending the + * request */ + if (!call->send_pages) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + if (ret < 0) + goto error_do_abort; + + if (call->send_pages) { + ret = afs_send_pages(call, &msg, iov); + if (ret < 0) + goto error_do_abort; + } + + /* at this point, an async call may no longer exist as it may have + * already completed */ + return wait_mode->wait(call); + +error_do_abort: + rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + rxrpc_kernel_end_call(rxcall); + call->rxcall = NULL; +error_kill_call: + call->type->destructor(call); + afs_free_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * handles intercepted messages that were arriving in the socket's Rx queue + * - called with the socket receive queue lock held to ensure message ordering + * - called with softirqs disabled + */ +static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, + struct sk_buff *skb) +{ + struct afs_call *call = (struct afs_call *) user_call_ID; + + _enter("%p,,%u", call, skb->mark); + + _debug("ICPT %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + + ASSERTCMP(sk, ==, afs_socket->sk); + atomic_inc(&afs_outstanding_skbs); + + if (!call) { + /* its an incoming call for our callback service */ + skb_queue_tail(&afs_incoming_calls, skb); + queue_work(afs_wq, &afs_collect_incoming_call_work); + } else { + /* route the messages directly to the appropriate call */ + skb_queue_tail(&call->rx_queue, skb); + call->wait_mode->rx_wakeup(call); + } + + _leave(""); +} + +/* + * deliver messages to a call + */ +static void afs_deliver_to_call(struct afs_call *call) +{ + struct sk_buff *skb; + bool last; + u32 abort_code; + int ret; + + _enter(""); + + while ((call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK) && + (skb = skb_dequeue(&call->rx_queue))) { + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + _debug("Rcv DATA"); + last = rxrpc_kernel_is_data_last(skb); + ret = call->type->deliver(call, skb, last); + switch (ret) { + case 0: + if (last && + call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + break; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + goto do_abort; + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + do_abort: + rxrpc_kernel_abort_call(call->rxcall, + abort_code); + call->error = ret; + call->state = AFS_CALL_ERROR; + break; + } + afs_data_delivered(skb); + skb = NULL; + continue; + case RXRPC_SKB_MARK_FINAL_ACK: + _debug("Rcv ACK"); + call->state = AFS_CALL_COMPLETE; + break; + case RXRPC_SKB_MARK_BUSY: + _debug("Rcv BUSY"); + call->error = -EBUSY; + call->state = AFS_CALL_BUSY; + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = rxrpc_kernel_get_abort_code(skb); + call->error = call->type->abort_to_error(abort_code); + call->state = AFS_CALL_ABORTED; + _debug("Rcv ABORT %u -> %d", abort_code, call->error); + break; + case RXRPC_SKB_MARK_NET_ERROR: + call->error = -rxrpc_kernel_get_error_number(skb); + call->state = AFS_CALL_ERROR; + _debug("Rcv NET ERROR %d", call->error); + break; + case 
RXRPC_SKB_MARK_LOCAL_ERROR: + call->error = -rxrpc_kernel_get_error_number(skb); + call->state = AFS_CALL_ERROR; + _debug("Rcv LOCAL ERROR %d", call->error); + break; + default: + BUG(); + break; + } + + afs_free_skb(skb); + } + + /* make sure the queue is empty if the call is done with (we might have + * aborted the call early because of an unmarshalling error) */ + if (call->state >= AFS_CALL_COMPLETE) { + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + if (call->incoming) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + } + } + + _leave(""); +} + +/* + * wait synchronously for a call to complete + */ +static int afs_wait_for_call_to_complete(struct afs_call *call) +{ + struct sk_buff *skb; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter(""); + + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + /* deliver any messages that are in the queue */ + if (!skb_queue_empty(&call->rx_queue)) { + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } + + ret = call->error; + if (call->state >= AFS_CALL_COMPLETE) + break; + ret = -EINTR; + if (signal_pending(current)) + break; + schedule(); + } + + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); + + /* kill the call */ + if (call->state < AFS_CALL_COMPLETE) { + _debug("call incomplete"); + rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + } + + _debug("call complete"); + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * wake up a waiting call + */ +static void afs_wake_up_call_waiter(struct afs_call *call) +{ + wake_up(&call->waitq); +} + +/* + * wake up an asynchronous call + */ +static void afs_wake_up_async_call(struct afs_call *call) +{ + _enter(""); + queue_work(afs_async_calls, &call->async_work); +} + +/* + * put a call into asynchronous mode + * - mustn't touch the call descriptor as the call my have completed by the + * time we get here + */ +static int afs_dont_wait_for_call_to_complete(struct afs_call *call) +{ + _enter(""); + return -EINPROGRESS; +} + +/* + * delete an asynchronous call + */ +static void afs_delete_async_call(struct work_struct *work) +{ + struct afs_call *call = + container_of(work, struct afs_call, async_work); + + _enter(""); + + afs_free_call(call); + + _leave(""); +} + +/* + * perform processing on an asynchronous call + * - on a multiple-thread workqueue this work item may try to run on several + * CPUs at the same time + */ +static void afs_process_async_call(struct work_struct *work) +{ + struct afs_call *call = + container_of(work, struct afs_call, async_work); + + _enter(""); + + if (!skb_queue_empty(&call->rx_queue)) + afs_deliver_to_call(call); + + if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->wait_mode->async_complete) + call->wait_mode->async_complete(call->reply, + call->error); + call->reply = NULL; + + /* kill the call */ + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + if (call->type->destructor) + call->type->destructor(call); + + /* we can't just delete the call because the work item may be + * queued */ + PREPARE_WORK(&call->async_work, afs_delete_async_call); + queue_work(afs_async_calls, &call->async_work); + } + + _leave(""); +} + +/* + * empty a socket buffer into 
a flat reply buffer + */ +void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +{ + size_t len = skb->len; + + if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) + BUG(); + call->reply_size += len; +} + +/* + * accept the backlog of incoming calls + */ +static void afs_collect_incoming_call(struct work_struct *work) +{ + struct rxrpc_call *rxcall; + struct afs_call *call = NULL; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&afs_incoming_calls))) { + _debug("new call"); + + /* don't need the notification */ + afs_free_skb(skb); + + if (!call) { + call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + if (!call) { + rxrpc_kernel_reject_call(afs_socket); + return; + } + + INIT_WORK(&call->async_work, afs_process_async_call); + call->wait_mode = &afs_async_incoming_call; + call->type = &afs_RXCMxxxx; + init_waitqueue_head(&call->waitq); + skb_queue_head_init(&call->rx_queue); + call->state = AFS_CALL_AWAIT_OP_ID; + + _debug("CALL %p{%s} [%d]", + call, call->type->name, + atomic_read(&afs_outstanding_calls)); + atomic_inc(&afs_outstanding_calls); + } + + rxcall = rxrpc_kernel_accept_call(afs_socket, + (unsigned long) call); + if (!IS_ERR(rxcall)) { + call->rxcall = rxcall; + call = NULL; + } + } + + if (call) + afs_free_call(call); +} + +/* + * grab the operation ID from an incoming cache manager call + */ +static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + size_t len = skb->len; + void *oibuf = (void *) &call->operation_ID; + + _enter("{%u},{%zu},%d", call->offset, len, last); + + ASSERTCMP(call->offset, <, 4); + + /* the operation ID forms the first four bytes of the request data */ + len = min_t(size_t, len, 4 - call->offset); + if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) + BUG(); + if (!pskb_pull(skb, len)) + BUG(); + call->offset += len; + + if (call->offset < 4) { + if (last) { + _leave(" = -EBADMSG [op ID short]"); + return -EBADMSG; + } + _leave(" = 0 [incomplete]"); + return 0; + } + + call->state = AFS_CALL_AWAIT_REQUEST; + + /* ask the cache manager to route the call (it'll change the call type + * if successful) */ + if (!afs_cm_incoming_call(call)) + return -ENOTSUPP; + + /* pass responsibility for the remainer of this message off to the + * cache manager op */ + return call->type->deliver(call, skb, last); +} + +/* + * send an empty reply + */ +void afs_send_empty_reply(struct afs_call *call) +{ + struct msghdr msg; + struct iovec iov[1]; + + _enter(""); + + iov[0].iov_base = NULL; + iov[0].iov_len = 0; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + call->state = AFS_CALL_AWAIT_ACK; + switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + case 0: + _leave(" [replied]"); + return; + + case -ENOMEM: + _debug("oom"); + rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + default: + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + _leave(" [error]"); + return; + } +} + +/* + * send a simple reply + */ +void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) +{ + struct msghdr msg; + struct iovec iov[1]; + int n; + + _enter(""); + + iov[0].iov_base = (void *) buf; + iov[0].iov_len = len; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + call->state = 
AFS_CALL_AWAIT_ACK; + n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + if (n >= 0) { + _leave(" [replied]"); + return; + } + if (n == -ENOMEM) { + _debug("oom"); + rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + } + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + _leave(" [error]"); +} + +/* + * extract a piece of data from the received data socket buffers + */ +int afs_extract_data(struct afs_call *call, struct sk_buff *skb, + bool last, void *buf, size_t count) +{ + size_t len = skb->len; + + _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + + ASSERTCMP(call->offset, <, count); + + len = min_t(size_t, len, count - call->offset); + if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || + !pskb_pull(skb, len)) + BUG(); + call->offset += len; + + if (call->offset < count) { + if (last) { + _leave(" = -EBADMSG [%d < %zu]", call->offset, count); + return -EBADMSG; + } + _leave(" = -EAGAIN"); + return -EAGAIN; + } + return 0; +} diff --git a/fs/afs/security.c b/fs/afs/security.c new file mode 100644 index 00000000..f44b9d35 --- /dev/null +++ b/fs/afs/security.c @@ -0,0 +1,363 @@ +/* AFS security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * get a key + */ +struct key *afs_request_key(struct afs_cell *cell) +{ + struct key *key; + + _enter("{%x}", key_serial(cell->anonymous_key)); + + _debug("key %s", cell->anonymous_key->description); + key = request_key(&key_type_rxrpc, cell->anonymous_key->description, + NULL); + if (IS_ERR(key)) { + if (PTR_ERR(key) != -ENOKEY) { + _leave(" = %ld", PTR_ERR(key)); + return key; + } + + /* act as anonymous user */ + _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); + return key_get(cell->anonymous_key); + } else { + /* act as authorised user */ + _leave(" = {%x} [auth]", key_serial(key)); + return key; + } +} + +/* + * dispose of a permits list + */ +void afs_zap_permits(struct rcu_head *rcu) +{ + struct afs_permits *permits = + container_of(rcu, struct afs_permits, rcu); + int loop; + + _enter("{%d}", permits->count); + + for (loop = permits->count - 1; loop >= 0; loop--) + key_put(permits->permits[loop].key); + kfree(permits); +} + +/* + * dispose of a permits list in which all the key pointers have been copied + */ +static void afs_dispose_of_permits(struct rcu_head *rcu) +{ + struct afs_permits *permits = + container_of(rcu, struct afs_permits, rcu); + + _enter("{%d}", permits->count); + + kfree(permits); +} + +/* + * get the authorising vnode - this is the specified inode itself if it's a + * directory or it's the parent directory if the specified inode is a file or + * symlink + * - the caller must release the ref on the inode + */ +static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode, + struct key *key) +{ + struct afs_vnode *auth_vnode; + struct inode *auth_inode; + + _enter(""); + + if (S_ISDIR(vnode->vfs_inode.i_mode)) { + auth_inode = igrab(&vnode->vfs_inode); + ASSERT(auth_inode != NULL); + } else { + auth_inode = afs_iget(vnode->vfs_inode.i_sb, key, + &vnode->status.parent, NULL, NULL); + if 
(IS_ERR(auth_inode)) + return ERR_CAST(auth_inode); + } + + auth_vnode = AFS_FS_I(auth_inode); + _leave(" = {%x}", auth_vnode->fid.vnode); + return auth_vnode; +} + +/* + * clear the permit cache on a directory vnode + */ +void afs_clear_permits(struct afs_vnode *vnode) +{ + struct afs_permits *permits; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + mutex_lock(&vnode->permits_lock); + permits = vnode->permits; + rcu_assign_pointer(vnode->permits, NULL); + mutex_unlock(&vnode->permits_lock); + + if (permits) + call_rcu(&permits->rcu, afs_zap_permits); + _leave(""); +} + +/* + * add the result obtained for a vnode to its or its parent directory's cache + * for the key used to access it + */ +void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order) +{ + struct afs_permits *permits, *xpermits; + struct afs_permit *permit; + struct afs_vnode *auth_vnode; + int count, loop; + + _enter("{%x:%u},%x,%lx", + vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order); + + auth_vnode = afs_get_auth_inode(vnode, key); + if (IS_ERR(auth_vnode)) { + _leave(" [get error %ld]", PTR_ERR(auth_vnode)); + return; + } + + mutex_lock(&auth_vnode->permits_lock); + + /* guard against a rename being detected whilst we waited for the + * lock */ + if (memcmp(&auth_vnode->fid, &vnode->status.parent, + sizeof(struct afs_fid)) != 0) { + _debug("renamed"); + goto out_unlock; + } + + /* have to be careful as the directory's callback may be broken between + * us receiving the status we're trying to cache and us getting the + * lock to update the cache for the status */ + if (auth_vnode->acl_order - acl_order > 0) { + _debug("ACL changed?"); + goto out_unlock; + } + + /* always update the anonymous mask */ + _debug("anon access %x", vnode->status.anon_access); + auth_vnode->status.anon_access = vnode->status.anon_access; + if (key == vnode->volume->cell->anonymous_key) + goto out_unlock; + + xpermits = auth_vnode->permits; + count = 0; + if (xpermits) { + /* see if the permit is already in the list + * - if it is then we just amend the list + */ + count = xpermits->count; + permit = xpermits->permits; + for (loop = count; loop > 0; loop--) { + if (permit->key == key) { + permit->access_mask = + vnode->status.caller_access; + goto out_unlock; + } + permit++; + } + } + + permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1), + GFP_NOFS); + if (!permits) + goto out_unlock; + + if (xpermits) + memcpy(permits->permits, xpermits->permits, + count * sizeof(struct afs_permit)); + + _debug("key %x access %x", + key_serial(key), vnode->status.caller_access); + permits->permits[count].access_mask = vnode->status.caller_access; + permits->permits[count].key = key_get(key); + permits->count = count + 1; + + rcu_assign_pointer(auth_vnode->permits, permits); + if (xpermits) + call_rcu(&xpermits->rcu, afs_dispose_of_permits); + +out_unlock: + mutex_unlock(&auth_vnode->permits_lock); + iput(&auth_vnode->vfs_inode); + _leave(""); +} + +/* + * check with the fileserver to see if the directory or parent directory is + * permitted to be accessed with this authorisation, and if so, what access it + * is granted + */ +static int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) +{ + struct afs_permits *permits; + struct afs_permit *permit; + struct afs_vnode *auth_vnode; + bool valid; + int loop, ret; + + _enter("{%x:%u},%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key)); + + auth_vnode = afs_get_auth_inode(vnode, key); + if (IS_ERR(auth_vnode)) { + 
*_access = 0; + _leave(" = %ld", PTR_ERR(auth_vnode)); + return PTR_ERR(auth_vnode); + } + + ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode)); + + /* check the permits to see if we've got one yet */ + if (key == auth_vnode->volume->cell->anonymous_key) { + _debug("anon"); + *_access = auth_vnode->status.anon_access; + valid = true; + } else { + valid = false; + rcu_read_lock(); + permits = rcu_dereference(auth_vnode->permits); + if (permits) { + permit = permits->permits; + for (loop = permits->count; loop > 0; loop--) { + if (permit->key == key) { + _debug("found in cache"); + *_access = permit->access_mask; + valid = true; + break; + } + permit++; + } + } + rcu_read_unlock(); + } + + if (!valid) { + /* check the status on the file we're actually interested in + * (the post-processing will cache the result on auth_vnode) */ + _debug("no valid permit"); + + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_vnode_fetch_status(vnode, auth_vnode, key); + if (ret < 0) { + iput(&auth_vnode->vfs_inode); + *_access = 0; + _leave(" = %d", ret); + return ret; + } + *_access = vnode->status.caller_access; + } + + iput(&auth_vnode->vfs_inode); + _leave(" = 0 [access %x]", *_access); + return 0; +} + +/* + * check the permissions on an AFS file + * - AFS ACLs are attached to directories only, and a file is controlled by its + * parent directory's ACL + */ +int afs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + afs_access_t uninitialized_var(access); + struct key *key; + int ret; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + _enter("{{%x:%u},%lx},%x,", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } + + /* if the promise has expired, we need to check the server again */ + if (!vnode->cb_promised) { + _debug("not promised"); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* check the permits to see if we've got one yet */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + goto error; + + /* interpret the access mask */ + _debug("REQ %x ACC %x on %s", + mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); + + if (S_ISDIR(inode->i_mode)) { + if (mask & MAY_EXEC) { + if (!(access & AFS_ACE_LOOKUP)) + goto permission_denied; + } else if (mask & MAY_READ) { + if (!(access & AFS_ACE_READ)) + goto permission_denied; + } else if (mask & MAY_WRITE) { + if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ + AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */ + AFS_ACE_WRITE))) /* chmod */ + goto permission_denied; + } else { + BUG(); + } + } else { + if (!(access & AFS_ACE_LOOKUP)) + goto permission_denied; + if (mask & (MAY_EXEC | MAY_READ)) { + if (!(access & AFS_ACE_READ)) + goto permission_denied; + } else if (mask & MAY_WRITE) { + if (!(access & AFS_ACE_WRITE)) + goto permission_denied; + } + } + + key_put(key); + ret = generic_permission(inode, mask, flags, NULL); + _leave(" = %d", ret); + return ret; + +permission_denied: + ret = -EACCES; +error: + key_put(key); + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/server.c b/fs/afs/server.c new file mode 100644 index 00000000..d59b7516 --- /dev/null +++ b/fs/afs/server.c @@ -0,0 +1,328 @@ +/* AFS server record management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +static unsigned afs_server_timeout = 10; /* server timeout in seconds */ + +static void afs_reap_server(struct work_struct *); + +/* tree of all the servers, indexed by IP address */ +static struct rb_root afs_servers = RB_ROOT; +static DEFINE_RWLOCK(afs_servers_lock); + +/* LRU list of all the servers not currently in use */ +static LIST_HEAD(afs_server_graveyard); +static DEFINE_SPINLOCK(afs_server_graveyard_lock); +static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server); + +/* + * install a server record in the master tree + */ +static int afs_install_server(struct afs_server *server) +{ + struct afs_server *xserver; + struct rb_node **pp, *p; + int ret; + + _enter("%p", server); + + write_lock(&afs_servers_lock); + + ret = -EEXIST; + pp = &afs_servers.rb_node; + p = NULL; + while (*pp) { + p = *pp; + _debug("- consider %p", p); + xserver = rb_entry(p, struct afs_server, master_rb); + if (server->addr.s_addr < xserver->addr.s_addr) + pp = &(*pp)->rb_left; + else if (server->addr.s_addr > xserver->addr.s_addr) + pp = &(*pp)->rb_right; + else + goto error; + } + + rb_link_node(&server->master_rb, p, pp); + rb_insert_color(&server->master_rb, &afs_servers); + ret = 0; + +error: + write_unlock(&afs_servers_lock); + return ret; +} + +/* + * allocate a new server record + */ +static struct afs_server *afs_alloc_server(struct afs_cell *cell, + const struct in_addr *addr) +{ + struct afs_server *server; + + _enter(""); + + server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); + if (server) { + atomic_set(&server->usage, 1); + server->cell = cell; + + INIT_LIST_HEAD(&server->link); + INIT_LIST_HEAD(&server->grave); + init_rwsem(&server->sem); + spin_lock_init(&server->fs_lock); + server->fs_vnodes = RB_ROOT; + server->cb_promises = RB_ROOT; + spin_lock_init(&server->cb_lock); + init_waitqueue_head(&server->cb_break_waitq); + INIT_DELAYED_WORK(&server->cb_break_work, + afs_dispatch_give_up_callbacks); + + memcpy(&server->addr, addr, sizeof(struct in_addr)); + server->addr.s_addr = addr->s_addr; + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + } else { + _leave(" = NULL [nomem]"); + } + return server; +} + +/* + * get an FS-server record for a cell + */ +struct afs_server *afs_lookup_server(struct afs_cell *cell, + const struct in_addr *addr) +{ + struct afs_server *server, *candidate; + + _enter("%p,%pI4", cell, &addr->s_addr); + + /* quick scan of the list to see if we already have the server */ + read_lock(&cell->servers_lock); + + list_for_each_entry(server, &cell->servers, link) { + if (server->addr.s_addr == addr->s_addr) + goto found_server_quickly; + } + read_unlock(&cell->servers_lock); + + candidate = afs_alloc_server(cell, addr); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock(&cell->servers_lock); + + /* check the cell's server list again */ + list_for_each_entry(server, &cell->servers, link) { + if (server->addr.s_addr == addr->s_addr) + goto found_server; + } + + _debug("new"); + server = candidate; + if (afs_install_server(server) < 0) + goto server_in_two_cells; + + afs_get_cell(cell); + list_add_tail(&server->link, &cell->servers); + + 
write_unlock(&cell->servers_lock); + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + return server; + + /* found a matching server quickly */ +found_server_quickly: + _debug("found quickly"); + afs_get_server(server); + read_unlock(&cell->servers_lock); +no_longer_unused: + if (!list_empty(&server->grave)) { + spin_lock(&afs_server_graveyard_lock); + list_del_init(&server->grave); + spin_unlock(&afs_server_graveyard_lock); + } + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + return server; + + /* found a matching server on the second pass */ +found_server: + _debug("found"); + afs_get_server(server); + write_unlock(&cell->servers_lock); + kfree(candidate); + goto no_longer_unused; + + /* found a server that seems to be in two cells */ +server_in_two_cells: + write_unlock(&cell->servers_lock); + kfree(candidate); + printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n", + addr); + _leave(" = -EEXIST"); + return ERR_PTR(-EEXIST); +} + +/* + * look up a server by its IP address + */ +struct afs_server *afs_find_server(const struct in_addr *_addr) +{ + struct afs_server *server = NULL; + struct rb_node *p; + struct in_addr addr = *_addr; + + _enter("%pI4", &addr.s_addr); + + read_lock(&afs_servers_lock); + + p = afs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, master_rb); + + _debug("- consider %p", p); + + if (addr.s_addr < server->addr.s_addr) { + p = p->rb_left; + } else if (addr.s_addr > server->addr.s_addr) { + p = p->rb_right; + } else { + afs_get_server(server); + goto found; + } + } + + server = NULL; +found: + read_unlock(&afs_servers_lock); + ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr); + _leave(" = %p", server); + return server; +} + +/* + * destroy a server record + * - removes from the cell list + */ +void afs_put_server(struct afs_server *server) +{ + if (!server) + return; + + _enter("%p{%d}", server, atomic_read(&server->usage)); + + _debug("PUT SERVER %d", atomic_read(&server->usage)); + + ASSERTCMP(atomic_read(&server->usage), >, 0); + + if (likely(!atomic_dec_and_test(&server->usage))) { + _leave(""); + return; + } + + afs_flush_callback_breaks(server); + + spin_lock(&afs_server_graveyard_lock); + if (atomic_read(&server->usage) == 0) { + list_move_tail(&server->grave, &afs_server_graveyard); + server->time_of_death = get_seconds(); + queue_delayed_work(afs_wq, &afs_server_reaper, + afs_server_timeout * HZ); + } + spin_unlock(&afs_server_graveyard_lock); + _leave(" [dead]"); +} + +/* + * destroy a dead server + */ +static void afs_destroy_server(struct afs_server *server) +{ + _enter("%p", server); + + ASSERTIF(server->cb_break_head != server->cb_break_tail, + delayed_work_pending(&server->cb_break_work)); + + ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); + ASSERTCMP(server->cb_promises.rb_node, ==, NULL); + ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); + ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0); + + afs_put_cell(server->cell); + kfree(server); +} + +/* + * reap dead server records + */ +static void afs_reap_server(struct work_struct *work) +{ + LIST_HEAD(corpses); + struct afs_server *server; + unsigned long delay, expiry; + time_t now; + + now = get_seconds(); + spin_lock(&afs_server_graveyard_lock); + + while (!list_empty(&afs_server_graveyard)) { + server = list_entry(afs_server_graveyard.next, + struct afs_server, grave); + + /* the queue is ordered most dead first */ + expiry = server->time_of_death + afs_server_timeout; + if (expiry > now) { + delay = (expiry - 
now) * HZ; + if (!queue_delayed_work(afs_wq, &afs_server_reaper, + delay)) { + cancel_delayed_work(&afs_server_reaper); + queue_delayed_work(afs_wq, &afs_server_reaper, + delay); + } + break; + } + + write_lock(&server->cell->servers_lock); + write_lock(&afs_servers_lock); + if (atomic_read(&server->usage) > 0) { + list_del_init(&server->grave); + } else { + list_move_tail(&server->grave, &corpses); + list_del_init(&server->link); + rb_erase(&server->master_rb, &afs_servers); + } + write_unlock(&afs_servers_lock); + write_unlock(&server->cell->servers_lock); + } + + spin_unlock(&afs_server_graveyard_lock); + + /* now reap the corpses we've extracted */ + while (!list_empty(&corpses)) { + server = list_entry(corpses.next, struct afs_server, grave); + list_del(&server->grave); + afs_destroy_server(server); + } +} + +/* + * discard all the server records for rmmod + */ +void __exit afs_purge_servers(void) +{ + afs_server_timeout = 0; + cancel_delayed_work(&afs_server_reaper); + queue_delayed_work(afs_wq, &afs_server_reaper, 0); +} diff --git a/fs/afs/super.c b/fs/afs/super.c new file mode 100644 index 00000000..356dcf09 --- /dev/null +++ b/fs/afs/super.c @@ -0,0 +1,550 @@ +/* AFS superblock handling + * + * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Howells + * David Woodhouse + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ + +static void afs_i_init_once(void *foo); +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); +static struct inode *afs_alloc_inode(struct super_block *sb); +static void afs_destroy_inode(struct inode *inode); +static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); + +struct file_system_type afs_fs_type = { + .owner = THIS_MODULE, + .name = "afs", + .mount = afs_mount, + .kill_sb = afs_kill_super, + .fs_flags = 0, +}; + +static const struct super_operations afs_super_ops = { + .statfs = afs_statfs, + .alloc_inode = afs_alloc_inode, + .drop_inode = afs_drop_inode, + .destroy_inode = afs_destroy_inode, + .evict_inode = afs_evict_inode, + .show_options = generic_show_options, +}; + +static struct kmem_cache *afs_inode_cachep; +static atomic_t afs_count_active_inodes; + +enum { + afs_no_opt, + afs_opt_cell, + afs_opt_rwpath, + afs_opt_vol, + afs_opt_autocell, +}; + +static const match_table_t afs_options_list = { + { afs_opt_cell, "cell=%s" }, + { afs_opt_rwpath, "rwpath" }, + { afs_opt_vol, "vol=%s" }, + { afs_opt_autocell, "autocell" }, + { afs_no_opt, NULL }, +}; + +/* + * initialise the filesystem + */ +int __init afs_fs_init(void) +{ + int ret; + + _enter(""); + + /* create ourselves an inode cache */ + atomic_set(&afs_count_active_inodes, 0); + + ret = -ENOMEM; + afs_inode_cachep = kmem_cache_create("afs_inode_cache", + sizeof(struct afs_vnode), + 0, + SLAB_HWCACHE_ALIGN, + afs_i_init_once); + if (!afs_inode_cachep) { + printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); + return ret; + } + + /* now export our filesystem to lesser mortals */ + ret = 
register_filesystem(&afs_fs_type); + if (ret < 0) { + kmem_cache_destroy(afs_inode_cachep); + _leave(" = %d", ret); + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * clean up the filesystem + */ +void __exit afs_fs_exit(void) +{ + _enter(""); + + afs_mntpt_kill_timer(); + unregister_filesystem(&afs_fs_type); + + if (atomic_read(&afs_count_active_inodes) != 0) { + printk("kAFS: %d active inode objects still present\n", + atomic_read(&afs_count_active_inodes)); + BUG(); + } + + kmem_cache_destroy(afs_inode_cachep); + _leave(""); +} + +/* + * parse the mount options + * - this function has been shamelessly adapted from the ext3 fs which + * shamelessly adapted it from the msdos fs + */ +static int afs_parse_options(struct afs_mount_params *params, + char *options, const char **devname) +{ + struct afs_cell *cell; + substring_t args[MAX_OPT_ARGS]; + char *p; + int token; + + _enter("%s", options); + + options[PAGE_SIZE - 1] = 0; + + while ((p = strsep(&options, ","))) { + if (!*p) + continue; + + token = match_token(p, afs_options_list, args); + switch (token) { + case afs_opt_cell: + cell = afs_cell_lookup(args[0].from, + args[0].to - args[0].from, + false); + if (IS_ERR(cell)) + return PTR_ERR(cell); + afs_put_cell(params->cell); + params->cell = cell; + break; + + case afs_opt_rwpath: + params->rwpath = 1; + break; + + case afs_opt_vol: + *devname = args[0].from; + break; + + case afs_opt_autocell: + params->autocell = 1; + break; + + default: + printk(KERN_ERR "kAFS:" + " Unknown or invalid mount option: '%s'\n", p); + return -EINVAL; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * parse a device name to get cell name, volume name, volume type and R/W + * selector + * - this can be one of the following: + * "%[cell:]volume[.]" R/W volume + * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), + * or R/W (rwpath=1) volume + * "%[cell:]volume.readonly" R/O volume + * "#[cell:]volume.readonly" R/O volume + * "%[cell:]volume.backup" Backup volume + * "#[cell:]volume.backup" Backup volume + */ +static int afs_parse_device_name(struct afs_mount_params *params, + const char *name) +{ + struct afs_cell *cell; + const char *cellname, *suffix; + int cellnamesz; + + _enter(",%s", name); + + if (!name) { + printk(KERN_ERR "kAFS: no volume name specified\n"); + return -EINVAL; + } + + if ((name[0] != '%' && name[0] != '#') || !name[1]) { + printk(KERN_ERR "kAFS: unparsable volume name\n"); + return -EINVAL; + } + + /* determine the type of volume we're looking for */ + params->type = AFSVL_ROVOL; + params->force = false; + if (params->rwpath || name[0] == '%') { + params->type = AFSVL_RWVOL; + params->force = true; + } + name++; + + /* split the cell name out if there is one */ + params->volname = strchr(name, ':'); + if (params->volname) { + cellname = name; + cellnamesz = params->volname - name; + params->volname++; + } else { + params->volname = name; + cellname = NULL; + cellnamesz = 0; + } + + /* the volume type is further affected by a possible suffix */ + suffix = strrchr(params->volname, '.'); + if (suffix) { + if (strcmp(suffix, ".readonly") == 0) { + params->type = AFSVL_ROVOL; + params->force = true; + } else if (strcmp(suffix, ".backup") == 0) { + params->type = AFSVL_BACKVOL; + params->force = true; + } else if (suffix[1] == 0) { + } else { + suffix = NULL; + } + } + + params->volnamesz = suffix ? 
+		suffix - params->volname : strlen(params->volname);
+
+	_debug("cell %*.*s [%p]",
+	       cellnamesz, cellnamesz, cellname ?: "", params->cell);
+
+	/* lookup the cell record */
+	if (cellname || !params->cell) {
+		cell = afs_cell_lookup(cellname, cellnamesz, true);
+		if (IS_ERR(cell)) {
+			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+			       cellnamesz, cellnamesz, cellname ?: "");
+			return PTR_ERR(cell);
+		}
+		afs_put_cell(params->cell);
+		params->cell = cell;
+	}
+
+	_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
+	       params->cell->name, params->cell,
+	       params->volnamesz, params->volnamesz, params->volname,
+	       suffix ?: "-", params->type, params->force ? " FORCE" : "");
+
+	return 0;
+}
+
+/*
+ * check a superblock to see if it's the one we're looking for
+ */
+static int afs_test_super(struct super_block *sb, void *data)
+{
+	struct afs_super_info *as1 = data;
+	struct afs_super_info *as = sb->s_fs_info;
+
+	return as->volume == as1->volume;
+}
+
+static int afs_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+/*
+ * fill in the superblock
+ */
+static int afs_fill_super(struct super_block *sb,
+			  struct afs_mount_params *params)
+{
+	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_fid fid;
+	struct dentry *root = NULL;
+	struct inode *inode = NULL;
+	int ret;
+
+	_enter("");
+
+	/* fill in the superblock */
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = AFS_FS_MAGIC;
+	sb->s_op = &afs_super_ops;
+	sb->s_bdi = &as->volume->bdi;
+	strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
+
+	/* allocate the root inode and dentry */
+	fid.vid = as->volume->vid;
+	fid.vnode = 1;
+	fid.unique = 1;
+	inode = afs_iget(sb, params->key, &fid, NULL, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (params->autocell)
+		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
+
+	ret = -ENOMEM;
+	root = d_alloc_root(inode);
+	if (!root)
+		goto error;
+
+	sb->s_d_op = &afs_fs_dentry_operations;
+	sb->s_root = root;
+
+	_leave(" = 0");
+	return 0;
+
+error:
+	iput(inode);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * get an AFS superblock
+ */
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *options)
+{
+	struct afs_mount_params params;
+	struct super_block *sb;
+	struct afs_volume *vol;
+	struct key *key;
+	char *new_opts = kstrdup(options, GFP_KERNEL);
+	struct afs_super_info *as;
+	int ret;
+
+	_enter(",,%s,%p", dev_name, options);
+
+	memset(&params, 0, sizeof(params));
+
+	/* parse the options and device name */
+	if (options) {
+		ret = afs_parse_options(&params, options, &dev_name);
+		if (ret < 0)
+			goto error;
+	}
+
+	ret = afs_parse_device_name(&params, dev_name);
+	if (ret < 0)
+		goto error;
+
+	/* try and do the mount securely */
+	key = afs_request_key(params.cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		ret = PTR_ERR(key);
+		goto error;
+	}
+	params.key = key;
+
+	/* parse the device name */
+	vol = afs_volume_lookup(&params);
+	if (IS_ERR(vol)) {
+		ret = PTR_ERR(vol);
+		goto error;
+	}
+
+	/* allocate a superblock info record */
+	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+	if (!as) {
+		ret = -ENOMEM;
+		afs_put_volume(vol);
+		goto error;
+	}
+	as->volume = vol;
+
+	/* allocate a deviceless superblock */
+	sb = sget(fs_type, afs_test_super, afs_set_super, as);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		afs_put_volume(vol);
+		kfree(as);
+		goto error;
+	}
+
+	if (!sb->s_root) {
+		/* initial superblock/root creation */
+		_debug("create");
+		sb->s_flags = flags;
+		ret = afs_fill_super(sb, &params);
+		if (ret < 0) {
+			deactivate_locked_super(sb);
+			goto error;
+		}
+		save_mount_options(sb, new_opts);
+		sb->s_flags |= MS_ACTIVE;
+	} else {
+		_debug("reuse");
+		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+		afs_put_volume(vol);
+		kfree(as);
+	}
+
+	afs_put_cell(params.cell);
+	kfree(new_opts);
+	_leave(" = 0 [%p]", sb);
+	return dget(sb->s_root);
+
+error:
+	afs_put_cell(params.cell);
+	key_put(params.key);
+	kfree(new_opts);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+static void afs_kill_super(struct super_block *sb)
+{
+	struct afs_super_info *as = sb->s_fs_info;
+	kill_anon_super(sb);
+	afs_put_volume(as->volume);
+	kfree(as);
+}
+
+/*
+ * initialise an inode cache slab element prior to any use
+ */
+static void afs_i_init_once(void *_vnode)
+{
+	struct afs_vnode *vnode = _vnode;
+
+	memset(vnode, 0, sizeof(*vnode));
+	inode_init_once(&vnode->vfs_inode);
+	init_waitqueue_head(&vnode->update_waitq);
+	mutex_init(&vnode->permits_lock);
+	mutex_init(&vnode->validate_lock);
+	spin_lock_init(&vnode->writeback_lock);
+	spin_lock_init(&vnode->lock);
+	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->pending_locks);
+	INIT_LIST_HEAD(&vnode->granted_locks);
+	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
+	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
+}
+
+/*
+ * allocate an AFS inode struct from our slab cache
+ */
+static struct inode *afs_alloc_inode(struct super_block *sb)
+{
+	struct afs_vnode *vnode;
+
+	vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+	if (!vnode)
+		return NULL;
+
+	atomic_inc(&afs_count_active_inodes);
+
+	memset(&vnode->fid, 0, sizeof(vnode->fid));
+	memset(&vnode->status, 0, sizeof(vnode->status));
+
+	vnode->volume = NULL;
+	vnode->update_cnt = 0;
+	vnode->flags = 1 << AFS_VNODE_UNSET;
+	vnode->cb_promised = false;
+
+	_leave(" = %p", &vnode->vfs_inode);
+	return &vnode->vfs_inode;
+}
+
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
+/*
+ * destroy an AFS inode struct
+ */
+static void afs_destroy_inode(struct inode *inode)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
+	_enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
+
+	_debug("DESTROY INODE %p", inode);
+
+	ASSERTCMP(vnode->server, ==, NULL);
+
+	call_rcu(&inode->i_rcu, afs_i_callback);
+	atomic_dec(&afs_count_active_inodes);
+}
+
+/*
+ * return information about an AFS volume
+ */
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct afs_volume_status vs;
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	struct key *key;
+	int ret;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	ret = afs_vnode_get_volume_status(vnode, key, &vs);
+	key_put(key);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = AFS_BLOCK_SIZE;
+	buf->f_namelen = AFSNAMEMAX - 1;
+
+	if (vs.max_quota == 0)
+		buf->f_blocks = vs.part_max_blocks;
+	else
+		buf->f_blocks = vs.max_quota;
+	buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
+	return 0;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
new file mode 100644
index 00000000..340afd0c
--- /dev/null
+++ b/fs/afs/vlclient.c
@@ -0,0 +1,219 
@@ +/* AFS Volume Location Service client + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +/* + * map volume locator abort codes to error codes + */ +static int afs_vl_abort_to_error(u32 abort_code) +{ + _enter("%u", abort_code); + + switch (abort_code) { + case AFSVL_IDEXIST: return -EEXIST; + case AFSVL_IO: return -EREMOTEIO; + case AFSVL_NAMEEXIST: return -EEXIST; + case AFSVL_CREATEFAIL: return -EREMOTEIO; + case AFSVL_NOENT: return -ENOMEDIUM; + case AFSVL_EMPTY: return -ENOMEDIUM; + case AFSVL_ENTDELETED: return -ENOMEDIUM; + case AFSVL_BADNAME: return -EINVAL; + case AFSVL_BADINDEX: return -EINVAL; + case AFSVL_BADVOLTYPE: return -EINVAL; + case AFSVL_BADSERVER: return -EINVAL; + case AFSVL_BADPARTITION: return -EINVAL; + case AFSVL_REPSFULL: return -EFBIG; + case AFSVL_NOREPSERVER: return -ENOENT; + case AFSVL_DUPREPSERVER: return -EEXIST; + case AFSVL_RWNOTFOUND: return -ENOENT; + case AFSVL_BADREFCOUNT: return -EINVAL; + case AFSVL_SIZEEXCEEDED: return -EINVAL; + case AFSVL_BADENTRY: return -EINVAL; + case AFSVL_BADVOLIDBUMP: return -EINVAL; + case AFSVL_IDALREADYHASHED: return -EINVAL; + case AFSVL_ENTRYLOCKED: return -EBUSY; + case AFSVL_BADVOLOPER: return -EBADRQC; + case AFSVL_BADRELLOCKTYPE: return -EINVAL; + case AFSVL_RERELEASE: return -EREMOTEIO; + case AFSVL_BADSERVERFLAG: return -EINVAL; + case AFSVL_PERM: return -EACCES; + case AFSVL_NOMEM: return -EREMOTEIO; + default: + return afs_abort_to_error(abort_code); + } +} + +/* + * deliver reply data to a VL.GetEntryByXXX call + */ +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_cache_vlocation *entry; + __be32 *bp; + u32 tmp; + int loop; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + entry = call->reply; + bp = call->buffer; + + for (loop = 0; loop < 64; loop++) + entry->name[loop] = ntohl(*bp++); + entry->name[loop] = 0; + bp++; /* final NUL */ + + bp++; /* type */ + entry->nservers = ntohl(*bp++); + + for (loop = 0; loop < 8; loop++) + entry->servers[loop].s_addr = *bp++; + + bp += 8; /* partition IDs */ + + for (loop = 0; loop < 8; loop++) { + tmp = ntohl(*bp++); + entry->srvtmask[loop] = 0; + if (tmp & AFS_VLSF_RWVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_RW; + if (tmp & AFS_VLSF_ROVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_RO; + if (tmp & AFS_VLSF_BACKVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_BAK; + } + + entry->vid[0] = ntohl(*bp++); + entry->vid[1] = ntohl(*bp++); + entry->vid[2] = ntohl(*bp++); + + bp++; /* clone ID */ + + tmp = ntohl(*bp++); /* flags */ + entry->vidmask = 0; + if (tmp & AFS_VLF_RWEXISTS) + entry->vidmask |= AFS_VOL_VTM_RW; + if (tmp & AFS_VLF_ROEXISTS) + entry->vidmask |= AFS_VOL_VTM_RO; + if (tmp & AFS_VLF_BACKEXISTS) + entry->vidmask |= AFS_VOL_VTM_BAK; + if (!entry->vidmask) + return -EBADMSG; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * VL.GetEntryByName operation type + */ +static const struct afs_call_type afs_RXVLGetEntryByName = { + .name = "VL.GetEntryByName", + 
.deliver = afs_deliver_vl_get_entry_by_xxx, + .abort_to_error = afs_vl_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * VL.GetEntryById operation type + */ +static const struct afs_call_type afs_RXVLGetEntryById = { + .name = "VL.GetEntryById", + .deliver = afs_deliver_vl_get_entry_by_xxx, + .abort_to_error = afs_vl_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * dispatch a get volume entry by name operation + */ +int afs_vl_get_entry_by_name(struct in_addr *addr, + struct key *key, + const char *volname, + struct afs_cache_vlocation *entry, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t volnamesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + volnamesz = strlen(volname); + padsz = (4 - (volnamesz & 3)) & 3; + reqsz = 8 + volnamesz + padsz; + + call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = entry; + call->service_id = VL_SERVICE; + call->port = htons(AFS_VL_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYNAME); + *bp++ = htonl(volnamesz); + memcpy(bp, volname, volnamesz); + if (padsz > 0) + memset((void *) bp + volnamesz, 0, padsz); + + /* initiate the call */ + return afs_make_call(addr, call, GFP_KERNEL, wait_mode); +} + +/* + * dispatch a get volume entry by ID operation + */ +int afs_vl_get_entry_by_id(struct in_addr *addr, + struct key *key, + afs_volid_t volid, + afs_voltype_t voltype, + struct afs_cache_vlocation *entry, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = entry; + call->service_id = VL_SERVICE; + call->port = htons(AFS_VL_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYID); + *bp++ = htonl(volid); + *bp = htonl(voltype); + + /* initiate the call */ + return afs_make_call(addr, call, GFP_KERNEL, wait_mode); +} diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c new file mode 100644 index 00000000..431984d2 --- /dev/null +++ b/fs/afs/vlocation.c @@ -0,0 +1,726 @@ +/* AFS volume location management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +static unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */ +static unsigned afs_vlocation_update_timeout = 10 * 60; + +static void afs_vlocation_reaper(struct work_struct *); +static void afs_vlocation_updater(struct work_struct *); + +static LIST_HEAD(afs_vlocation_updates); +static LIST_HEAD(afs_vlocation_graveyard); +static DEFINE_SPINLOCK(afs_vlocation_updates_lock); +static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock); +static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper); +static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater); +static struct workqueue_struct *afs_vlocation_update_worker; + +/* + * iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + */ +static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, + struct key *key, + struct afs_cache_vlocation *vldb) +{ + struct afs_cell *cell = vl->cell; + struct in_addr addr; + int count, ret; + + _enter("%s,%s", cell->name, vl->vldb.name); + + down_write(&vl->cell->vl_sem); + ret = -ENOMEDIUM; + for (count = cell->vl_naddrs; count > 0; count--) { + addr = cell->vl_addrs[cell->vl_curr_svix]; + + _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); + + /* attempt to access the VL server */ + ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, + &afs_sync_call); + switch (ret) { + case 0: + goto out; + case -ENOMEM: + case -ENONET: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + if (ret == -ENOMEM || ret == -ENONET) + goto out; + goto rotate; + case -ENOMEDIUM: + case -EKEYREJECTED: + case -EKEYEXPIRED: + goto out; + default: + ret = -EIO; + goto rotate; + } + + /* rotate the server records upon lookup failure */ + rotate: + cell->vl_curr_svix++; + cell->vl_curr_svix %= cell->vl_naddrs; + } + +out: + up_write(&vl->cell->vl_sem); + _leave(" = %d", ret); + return ret; +} + +/* + * iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + */ +static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, + struct key *key, + afs_volid_t volid, + afs_voltype_t voltype, + struct afs_cache_vlocation *vldb) +{ + struct afs_cell *cell = vl->cell; + struct in_addr addr; + int count, ret; + + _enter("%s,%x,%d,", cell->name, volid, voltype); + + down_write(&vl->cell->vl_sem); + ret = -ENOMEDIUM; + for (count = cell->vl_naddrs; count > 0; count--) { + addr = cell->vl_addrs[cell->vl_curr_svix]; + + _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); + + /* attempt to access the VL server */ + ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, + &afs_sync_call); + switch (ret) { + case 0: + goto out; + case -ENOMEM: + case -ENONET: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + if (ret == -ENOMEM || ret == -ENONET) + goto out; + goto rotate; + case -EBUSY: + vl->upd_busy_cnt++; + if (vl->upd_busy_cnt <= 3) { + if (vl->upd_busy_cnt > 1) { + /* second+ BUSY - sleep a little bit */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + __set_current_state(TASK_RUNNING); + } + continue; + } + break; + case -ENOMEDIUM: + vl->upd_rej_cnt++; + goto rotate; + default: + ret = -EIO; + goto rotate; + } + + /* rotate the server records upon lookup failure */ + rotate: + cell->vl_curr_svix++; + cell->vl_curr_svix %= cell->vl_naddrs; + vl->upd_busy_cnt = 0; + } + +out: + if (ret < 0 && vl->upd_rej_cnt > 
0) { + printk(KERN_NOTICE "kAFS:" + " Active volume no longer valid '%s'\n", + vl->vldb.name); + vl->valid = 0; + ret = -ENOMEDIUM; + } + + up_write(&vl->cell->vl_sem); + _leave(" = %d", ret); + return ret; +} + +/* + * allocate a volume location record + */ +static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell, + const char *name, + size_t namesz) +{ + struct afs_vlocation *vl; + + vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); + if (vl) { + vl->cell = cell; + vl->state = AFS_VL_NEW; + atomic_set(&vl->usage, 1); + INIT_LIST_HEAD(&vl->link); + INIT_LIST_HEAD(&vl->grave); + INIT_LIST_HEAD(&vl->update); + init_waitqueue_head(&vl->waitq); + spin_lock_init(&vl->lock); + memcpy(vl->vldb.name, name, namesz); + } + + _leave(" = %p", vl); + return vl; +} + +/* + * update record if we found it in the cache + */ +static int afs_vlocation_update_record(struct afs_vlocation *vl, + struct key *key, + struct afs_cache_vlocation *vldb) +{ + afs_voltype_t voltype; + afs_volid_t vid; + int ret; + + /* try to look up a cached volume in the cell VL databases by ID */ + _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", + vl->vldb.name, + vl->vldb.vidmask, + ntohl(vl->vldb.servers[0].s_addr), + vl->vldb.srvtmask[0], + ntohl(vl->vldb.servers[1].s_addr), + vl->vldb.srvtmask[1], + ntohl(vl->vldb.servers[2].s_addr), + vl->vldb.srvtmask[2]); + + _debug("Vids: %08x %08x %08x", + vl->vldb.vid[0], + vl->vldb.vid[1], + vl->vldb.vid[2]); + + if (vl->vldb.vidmask & AFS_VOL_VTM_RW) { + vid = vl->vldb.vid[0]; + voltype = AFSVL_RWVOL; + } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) { + vid = vl->vldb.vid[1]; + voltype = AFSVL_ROVOL; + } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) { + vid = vl->vldb.vid[2]; + voltype = AFSVL_BACKVOL; + } else { + BUG(); + vid = 0; + voltype = 0; + } + + /* contact the server to make sure the volume is still available + * - TODO: need to handle disconnected operation here + */ + ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb); + switch (ret) { + /* net error */ + default: + printk(KERN_WARNING "kAFS:" + " failed to update volume '%s' (%x) up in '%s': %d\n", + vl->vldb.name, vid, vl->cell->name, ret); + _leave(" = %d", ret); + return ret; + + /* pulled from local cache into memory */ + case 0: + _leave(" = 0"); + return 0; + + /* uh oh... 
looks like the volume got deleted */ + case -ENOMEDIUM: + printk(KERN_ERR "kAFS:" + " volume '%s' (%x) does not exist '%s'\n", + vl->vldb.name, vid, vl->cell->name); + + /* TODO: make existing record unavailable */ + _leave(" = %d", ret); + return ret; + } +} + +/* + * apply the update to a VL record + */ +static void afs_vlocation_apply_update(struct afs_vlocation *vl, + struct afs_cache_vlocation *vldb) +{ + _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", + vldb->name, vldb->vidmask, + ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0], + ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1], + ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]); + + _debug("Vids: %08x %08x %08x", + vldb->vid[0], vldb->vid[1], vldb->vid[2]); + + if (strcmp(vldb->name, vl->vldb.name) != 0) + printk(KERN_NOTICE "kAFS:" + " name of volume '%s' changed to '%s' on server\n", + vl->vldb.name, vldb->name); + + vl->vldb = *vldb; + +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif +} + +/* + * fill in a volume location record, consulting the cache and the VL server + * both + */ +static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, + struct key *key) +{ + struct afs_cache_vlocation vldb; + int ret; + + _enter(""); + + ASSERTCMP(vl->valid, ==, 0); + + memset(&vldb, 0, sizeof(vldb)); + + /* see if we have an in-cache copy (will set vl->valid if there is) */ +#ifdef CONFIG_AFS_FSCACHE + vl->cache = fscache_acquire_cookie(vl->cell->cache, + &afs_vlocation_cache_index_def, vl); +#endif + + if (vl->valid) { + /* try to update a known volume in the cell VL databases by + * ID as the name may have changed */ + _debug("found in cache"); + ret = afs_vlocation_update_record(vl, key, &vldb); + } else { + /* try to look up an unknown volume in the cell VL databases by + * name */ + ret = afs_vlocation_access_vl_by_name(vl, key, &vldb); + if (ret < 0) { + printk("kAFS: failed to locate '%s' in cell '%s'\n", + vl->vldb.name, vl->cell->name); + return ret; + } + } + + afs_vlocation_apply_update(vl, &vldb); + _leave(" = 0"); + return 0; +} + +/* + * queue a vlocation record for updates + */ +static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) +{ + struct afs_vlocation *xvl; + + /* wait at least 10 minutes before updating... */ + vl->update_at = get_seconds() + afs_vlocation_update_timeout; + + spin_lock(&afs_vlocation_updates_lock); + + if (!list_empty(&afs_vlocation_updates)) { + /* ... 
but wait at least 1 second more than the newest record + * already queued so that we don't spam the VL server suddenly + * with lots of requests + */ + xvl = list_entry(afs_vlocation_updates.prev, + struct afs_vlocation, update); + if (vl->update_at <= xvl->update_at) + vl->update_at = xvl->update_at + 1; + } else { + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, + afs_vlocation_update_timeout * HZ); + } + + list_add_tail(&vl->update, &afs_vlocation_updates); + spin_unlock(&afs_vlocation_updates_lock); +} + +/* + * lookup volume location + * - iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + * - lookup in the local cache if not able to find on the VL server + * - insert/update in the local cache if did get a VL response + */ +struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell, + struct key *key, + const char *name, + size_t namesz) +{ + struct afs_vlocation *vl; + int ret; + + _enter("{%s},{%x},%*.*s,%zu", + cell->name, key_serial(key), + (int) namesz, (int) namesz, name, namesz); + + if (namesz >= sizeof(vl->vldb.name)) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + /* see if we have an in-memory copy first */ + down_write(&cell->vl_sem); + spin_lock(&cell->vl_lock); + list_for_each_entry(vl, &cell->vl_list, link) { + if (vl->vldb.name[namesz] != '\0') + continue; + if (memcmp(vl->vldb.name, name, namesz) == 0) + goto found_in_memory; + } + spin_unlock(&cell->vl_lock); + + /* not in the cell's in-memory lists - create a new record */ + vl = afs_vlocation_alloc(cell, name, namesz); + if (!vl) { + up_write(&cell->vl_sem); + return ERR_PTR(-ENOMEM); + } + + afs_get_cell(cell); + + list_add_tail(&vl->link, &cell->vl_list); + vl->state = AFS_VL_CREATING; + up_write(&cell->vl_sem); + +fill_in_record: + ret = afs_vlocation_fill_in_record(vl, key); + if (ret < 0) + goto error_abandon; + spin_lock(&vl->lock); + vl->state = AFS_VL_VALID; + spin_unlock(&vl->lock); + wake_up(&vl->waitq); + + /* update volume entry in local cache */ +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif + + /* schedule for regular updates */ + afs_vlocation_queue_for_updates(vl); + goto success; + +found_in_memory: + /* found in memory */ + _debug("found in memory"); + atomic_inc(&vl->usage); + spin_unlock(&cell->vl_lock); + if (!list_empty(&vl->grave)) { + spin_lock(&afs_vlocation_graveyard_lock); + list_del_init(&vl->grave); + spin_unlock(&afs_vlocation_graveyard_lock); + } + up_write(&cell->vl_sem); + + /* see if it was an abandoned record that we might try filling in */ + spin_lock(&vl->lock); + while (vl->state != AFS_VL_VALID) { + afs_vlocation_state_t state = vl->state; + + _debug("invalid [state %d]", state); + + if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) { + vl->state = AFS_VL_CREATING; + spin_unlock(&vl->lock); + goto fill_in_record; + } + + /* must now wait for creation or update by someone else to + * complete */ + _debug("wait"); + + spin_unlock(&vl->lock); + ret = wait_event_interruptible(vl->waitq, + vl->state == AFS_VL_NEW || + vl->state == AFS_VL_VALID || + vl->state == AFS_VL_NO_VOLUME); + if (ret < 0) + goto error; + spin_lock(&vl->lock); + } + spin_unlock(&vl->lock); + +success: + _leave(" = %p", vl); + return vl; + +error_abandon: + spin_lock(&vl->lock); + vl->state = AFS_VL_NEW; + spin_unlock(&vl->lock); + wake_up(&vl->waitq); +error: + ASSERT(vl != NULL); + afs_put_vlocation(vl); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * 
finish using a volume location record + */ +void afs_put_vlocation(struct afs_vlocation *vl) +{ + if (!vl) + return; + + _enter("%s", vl->vldb.name); + + ASSERTCMP(atomic_read(&vl->usage), >, 0); + + if (likely(!atomic_dec_and_test(&vl->usage))) { + _leave(""); + return; + } + + spin_lock(&afs_vlocation_graveyard_lock); + if (atomic_read(&vl->usage) == 0) { + _debug("buried"); + list_move_tail(&vl->grave, &afs_vlocation_graveyard); + vl->time_of_death = get_seconds(); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + afs_vlocation_timeout * HZ); + + /* suspend updates on this record */ + if (!list_empty(&vl->update)) { + spin_lock(&afs_vlocation_updates_lock); + list_del_init(&vl->update); + spin_unlock(&afs_vlocation_updates_lock); + } + } + spin_unlock(&afs_vlocation_graveyard_lock); + _leave(" [killed?]"); +} + +/* + * destroy a dead volume location record + */ +static void afs_vlocation_destroy(struct afs_vlocation *vl) +{ + _enter("%p", vl); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vl->cache, 0); +#endif + afs_put_cell(vl->cell); + kfree(vl); +} + +/* + * reap dead volume location records + */ +static void afs_vlocation_reaper(struct work_struct *work) +{ + LIST_HEAD(corpses); + struct afs_vlocation *vl; + unsigned long delay, expiry; + time_t now; + + _enter(""); + + now = get_seconds(); + spin_lock(&afs_vlocation_graveyard_lock); + + while (!list_empty(&afs_vlocation_graveyard)) { + vl = list_entry(afs_vlocation_graveyard.next, + struct afs_vlocation, grave); + + _debug("check %p", vl); + + /* the queue is ordered most dead first */ + expiry = vl->time_of_death + afs_vlocation_timeout; + if (expiry > now) { + delay = (expiry - now) * HZ; + _debug("delay %lu", delay); + if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay)) { + cancel_delayed_work(&afs_vlocation_reap); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay); + } + break; + } + + spin_lock(&vl->cell->vl_lock); + if (atomic_read(&vl->usage) > 0) { + _debug("no reap"); + list_del_init(&vl->grave); + } else { + _debug("reap"); + list_move_tail(&vl->grave, &corpses); + list_del_init(&vl->link); + } + spin_unlock(&vl->cell->vl_lock); + } + + spin_unlock(&afs_vlocation_graveyard_lock); + + /* now reap the corpses we've extracted */ + while (!list_empty(&corpses)) { + vl = list_entry(corpses.next, struct afs_vlocation, grave); + list_del(&vl->grave); + afs_vlocation_destroy(vl); + } + + _leave(""); +} + +/* + * initialise the VL update process + */ +int __init afs_vlocation_update_init(void) +{ + afs_vlocation_update_worker = + create_singlethread_workqueue("kafs_vlupdated"); + return afs_vlocation_update_worker ? 
0 : -ENOMEM; +} + +/* + * discard all the volume location records for rmmod + */ +void afs_vlocation_purge(void) +{ + afs_vlocation_timeout = 0; + + spin_lock(&afs_vlocation_updates_lock); + list_del_init(&afs_vlocation_updates); + spin_unlock(&afs_vlocation_updates_lock); + cancel_delayed_work(&afs_vlocation_update); + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, 0); + destroy_workqueue(afs_vlocation_update_worker); + + cancel_delayed_work(&afs_vlocation_reap); + queue_delayed_work(afs_wq, &afs_vlocation_reap, 0); +} + +/* + * update a volume location + */ +static void afs_vlocation_updater(struct work_struct *work) +{ + struct afs_cache_vlocation vldb; + struct afs_vlocation *vl, *xvl; + time_t now; + long timeout; + int ret; + + _enter(""); + + now = get_seconds(); + + /* find a record to update */ + spin_lock(&afs_vlocation_updates_lock); + for (;;) { + if (list_empty(&afs_vlocation_updates)) { + spin_unlock(&afs_vlocation_updates_lock); + _leave(" [nothing]"); + return; + } + + vl = list_entry(afs_vlocation_updates.next, + struct afs_vlocation, update); + if (atomic_read(&vl->usage) > 0) + break; + list_del_init(&vl->update); + } + + timeout = vl->update_at - now; + if (timeout > 0) { + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, timeout * HZ); + spin_unlock(&afs_vlocation_updates_lock); + _leave(" [nothing]"); + return; + } + + list_del_init(&vl->update); + atomic_inc(&vl->usage); + spin_unlock(&afs_vlocation_updates_lock); + + /* we can now perform the update */ + _debug("update %s", vl->vldb.name); + vl->state = AFS_VL_UPDATING; + vl->upd_rej_cnt = 0; + vl->upd_busy_cnt = 0; + + ret = afs_vlocation_update_record(vl, NULL, &vldb); + spin_lock(&vl->lock); + switch (ret) { + case 0: + afs_vlocation_apply_update(vl, &vldb); + vl->state = AFS_VL_VALID; + break; + case -ENOMEDIUM: + vl->state = AFS_VL_VOLUME_DELETED; + break; + default: + vl->state = AFS_VL_UNCERTAIN; + break; + } + spin_unlock(&vl->lock); + wake_up(&vl->waitq); + + /* and then reschedule */ + _debug("reschedule"); + vl->update_at = get_seconds() + afs_vlocation_update_timeout; + + spin_lock(&afs_vlocation_updates_lock); + + if (!list_empty(&afs_vlocation_updates)) { + /* next update in 10 minutes, but wait at least 1 second more + * than the newest record already queued so that we don't spam + * the VL server suddenly with lots of requests + */ + xvl = list_entry(afs_vlocation_updates.prev, + struct afs_vlocation, update); + if (vl->update_at <= xvl->update_at) + vl->update_at = xvl->update_at + 1; + xvl = list_entry(afs_vlocation_updates.next, + struct afs_vlocation, update); + timeout = xvl->update_at - now; + if (timeout < 0) + timeout = 0; + } else { + timeout = afs_vlocation_update_timeout; + } + + ASSERT(list_empty(&vl->update)); + + list_add_tail(&vl->update, &afs_vlocation_updates); + + _debug("timeout %ld", timeout); + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, timeout * HZ); + spin_unlock(&afs_vlocation_updates_lock); + afs_put_vlocation(vl); +} diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c new file mode 100644 index 00000000..25cf4c3f --- /dev/null +++ b/fs/afs/vnode.c @@ -0,0 +1,1025 @@ +/* AFS vnode management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#if 0 +static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent, + int depth, char lr) +{ + struct afs_vnode *vnode; + bool bad = false; + + if (!node) + return false; + + if (node->rb_left) + bad = dump_tree_aux(node->rb_left, node, depth + 2, '/'); + + vnode = rb_entry(node, struct afs_vnode, cb_promise); + _debug("%c %*.*s%c%p {%d}", + rb_is_red(node) ? 'R' : 'B', + depth, depth, "", lr, + vnode, vnode->cb_expires_at); + if (rb_parent(node) != parent) { + printk("BAD: %p != %p\n", rb_parent(node), parent); + bad = true; + } + + if (node->rb_right) + bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\'); + + return bad; +} + +static noinline void dump_tree(const char *name, struct afs_server *server) +{ + _enter("%s", name); + if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-')) + BUG(); +} +#endif + +/* + * insert a vnode into the backing server's vnode tree + */ +static void afs_install_vnode(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *old_server = vnode->server; + struct afs_vnode *xvnode; + struct rb_node *parent, **p; + + _enter("%p,%p", vnode, server); + + if (old_server) { + spin_lock(&old_server->fs_lock); + rb_erase(&vnode->server_rb, &old_server->fs_vnodes); + spin_unlock(&old_server->fs_lock); + } + + afs_get_server(server); + vnode->server = server; + afs_put_server(old_server); + + /* insert into the server's vnode tree in FID order */ + spin_lock(&server->fs_lock); + + parent = NULL; + p = &server->fs_vnodes.rb_node; + while (*p) { + parent = *p; + xvnode = rb_entry(parent, struct afs_vnode, server_rb); + if (vnode->fid.vid < xvnode->fid.vid) + p = &(*p)->rb_left; + else if (vnode->fid.vid > xvnode->fid.vid) + p = &(*p)->rb_right; + else if (vnode->fid.vnode < xvnode->fid.vnode) + p = &(*p)->rb_left; + else if (vnode->fid.vnode > xvnode->fid.vnode) + p = &(*p)->rb_right; + else if (vnode->fid.unique < xvnode->fid.unique) + p = &(*p)->rb_left; + else if (vnode->fid.unique > xvnode->fid.unique) + p = &(*p)->rb_right; + else + BUG(); /* can't happen unless afs_iget() malfunctions */ + } + + rb_link_node(&vnode->server_rb, parent, p); + rb_insert_color(&vnode->server_rb, &server->fs_vnodes); + + spin_unlock(&server->fs_lock); + _leave(""); +} + +/* + * insert a vnode into the promising server's update/expiration tree + * - caller must hold vnode->lock + */ +static void afs_vnode_note_promise(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *old_server; + struct afs_vnode *xvnode; + struct rb_node *parent, **p; + + _enter("%p,%p", vnode, server); + + ASSERT(server != NULL); + + old_server = vnode->server; + if (vnode->cb_promised) { + if (server == old_server && + vnode->cb_expires == vnode->cb_expires_at) { + _leave(" [no change]"); + return; + } + + spin_lock(&old_server->cb_lock); + if (vnode->cb_promised) { + _debug("delete"); + rb_erase(&vnode->cb_promise, &old_server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&old_server->cb_lock); + } + + if (vnode->server != server) + afs_install_vnode(vnode, server); + + vnode->cb_expires_at = vnode->cb_expires; + _debug("PROMISE on 
%p {%lu}", + vnode, (unsigned long) vnode->cb_expires_at); + + /* abuse an RB-tree to hold the expiration order (we may have multiple + * items with the same expiration time) */ + spin_lock(&server->cb_lock); + + parent = NULL; + p = &server->cb_promises.rb_node; + while (*p) { + parent = *p; + xvnode = rb_entry(parent, struct afs_vnode, cb_promise); + if (vnode->cb_expires_at < xvnode->cb_expires_at) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&vnode->cb_promise, parent, p); + rb_insert_color(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = true; + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * handle remote file deletion by discarding the callback promise + */ +static void afs_vnode_deleted_remotely(struct afs_vnode *vnode) +{ + struct afs_server *server; + + _enter("{%p}", vnode->server); + + set_bit(AFS_VNODE_DELETED, &vnode->flags); + + server = vnode->server; + if (server) { + if (vnode->cb_promised) { + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + rb_erase(&vnode->cb_promise, + &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + } + + spin_lock(&server->fs_lock); + rb_erase(&vnode->server_rb, &server->fs_vnodes); + spin_unlock(&server->fs_lock); + + vnode->server = NULL; + afs_put_server(server); + } else { + ASSERT(!vnode->cb_promised); + } + + _leave(""); +} + +/* + * finish off updating the recorded status of a file after a successful + * operation completion + * - starts callback expiry timer + * - adds to server's callback list + */ +void afs_vnode_finalise_status_update(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *oldserver = NULL; + + _enter("%p,%p", vnode, server); + + spin_lock(&vnode->lock); + clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + afs_vnode_note_promise(vnode, server); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + + wake_up_all(&vnode->update_waitq); + afs_put_server(oldserver); + _leave(""); +} + +/* + * finish off updating the recorded status of a file after an operation failed + */ +static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret) +{ + _enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + + if (ret == -ENOENT) { + /* the file was deleted on the server */ + _debug("got NOENT from server - marking file deleted"); + afs_vnode_deleted_remotely(vnode); + } + + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + + wake_up_all(&vnode->update_waitq); + _leave(""); +} + +/* + * fetch file status from the volume + * - don't issue a fetch if: + * - the changed bit is not set and there's a valid callback + * - there are any outstanding ops that will fetch the status + * - TODO implement local caching + */ +int afs_vnode_fetch_status(struct afs_vnode *vnode, + struct afs_vnode *auth_vnode, struct key *key) +{ + struct afs_server *server; + unsigned long acl_order; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%s,{%x:%u.%u}", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + vnode->cb_promised) { + _leave(" [unchanged]"); + return 0; + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" [deleted]"); + return -ENOENT; + } + + acl_order = 0; + if (auth_vnode) + acl_order = auth_vnode->acl_order; + + 
spin_lock(&vnode->lock); + + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + vnode->cb_promised) { + spin_unlock(&vnode->lock); + _leave(" [unchanged]"); + return 0; + } + + ASSERTCMP(vnode->update_cnt, >=, 0); + + if (vnode->update_cnt > 0) { + /* someone else started a fetch */ + _debug("wait on fetch %d", vnode->update_cnt); + + set_current_state(TASK_UNINTERRUPTIBLE); + ASSERT(myself.func != NULL); + add_wait_queue(&vnode->update_waitq, &myself); + + /* wait for the status to be updated */ + for (;;) { + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) + break; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + break; + + /* check to see if it got updated and invalidated all + * before we saw it */ + if (vnode->update_cnt == 0) { + remove_wait_queue(&vnode->update_waitq, + &myself); + set_current_state(TASK_RUNNING); + goto get_anyway; + } + + spin_unlock(&vnode->lock); + + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock(&vnode->lock); + } + + remove_wait_queue(&vnode->update_waitq, &myself); + spin_unlock(&vnode->lock); + set_current_state(TASK_RUNNING); + + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? + -ENOENT : 0; + } + +get_anyway: + /* okay... we're going to have to initiate the op */ + vnode->update_cnt++; + + spin_unlock(&vnode->lock); + + /* merge AFS status fetches and clear outstanding callback on this + * vnode */ + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %p{%08x}", + server, ntohl(server->addr.s_addr)); + + ret = afs_fs_fetch_file_status(server, key, vnode, NULL, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + _debug("adjust"); + if (auth_vnode) + afs_cache_permit(vnode, key, acl_order); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + _debug("failed [%d]", ret); + afs_vnode_status_update_failed(vnode, ret); + } + + ASSERTCMP(vnode->update_cnt, >=, 0); + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * fetch file data from the volume + * - TODO implement caching + */ +int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, + off_t offset, size_t length, struct page *page) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + /* merge in AFS status fetches and clear outstanding callback on this + * vnode */ + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_fetch_data(server, key, vnode, offset, length, + page, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + 
spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * make a file or a directory + */ +int afs_vnode_create(struct afs_vnode *vnode, struct key *key, + const char *name, umode_t mode, struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb, struct afs_server **_server) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're creating in */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_create(server, key, vnode, name, mode, newfid, + newstatus, newcb, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + *_server = server; + } else { + afs_vnode_status_update_failed(vnode, ret); + *_server = NULL; + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * remove a file or directory + */ +int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, + bool isdir) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're removing from */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_remove(server, key, vnode, name, isdir, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * create a hard link + */ +int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, + struct key *key, const char *name) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s", + dvnode->volume->vlocation->vldb.name, + dvnode->fid.vid, + dvnode->fid.vnode, + dvnode->fid.unique, + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're removing from */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + 
spin_lock(&dvnode->lock); + dvnode->update_cnt++; + spin_unlock(&dvnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(dvnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_link(server, key, dvnode, vnode, name, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(dvnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_vnode_finalise_status_update(dvnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + afs_vnode_status_update_failed(dvnode, ret); + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + spin_lock(&dvnode->lock); + dvnode->update_cnt--; + ASSERTCMP(dvnode->update_cnt, >=, 0); + spin_unlock(&dvnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * create a symbolic link + */ +int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, + const char *name, const char *content, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_server **_server) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s,%s,,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name, content); + + /* this op will fetch the status on the directory we're creating in */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_symlink(server, key, vnode, name, content, + newfid, newstatus, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + *_server = server; + } else { + afs_vnode_status_update_failed(vnode, ret); + *_server = NULL; + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * rename a file + */ +int afs_vnode_rename(struct afs_vnode *orig_dvnode, + struct afs_vnode *new_dvnode, + struct key *key, + const char *orig_name, + const char *new_name) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s", + orig_dvnode->volume->vlocation->vldb.name, + orig_dvnode->fid.vid, + orig_dvnode->fid.vnode, + orig_dvnode->fid.unique, + new_dvnode->volume->vlocation->vldb.name, + new_dvnode->fid.vid, + new_dvnode->fid.vnode, + new_dvnode->fid.unique, + key_serial(key), + orig_name, + new_name); + + /* this op will fetch the status on both the directories we're dealing + * with */ + spin_lock(&orig_dvnode->lock); + orig_dvnode->update_cnt++; + spin_unlock(&orig_dvnode->lock); + if (new_dvnode != orig_dvnode) { + spin_lock(&new_dvnode->lock); + new_dvnode->update_cnt++; + spin_unlock(&new_dvnode->lock); + } + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(orig_dvnode); + if (IS_ERR(server)) 
+ goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_rename(server, key, orig_dvnode, orig_name, + new_dvnode, new_name, &afs_sync_call); + + } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(orig_dvnode, server); + if (new_dvnode != orig_dvnode) + afs_vnode_finalise_status_update(new_dvnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_vnode_status_update_failed(new_dvnode, ret); + } + + _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt); + return ret; + +no_server: + spin_lock(&orig_dvnode->lock); + orig_dvnode->update_cnt--; + ASSERTCMP(orig_dvnode->update_cnt, >=, 0); + spin_unlock(&orig_dvnode->lock); + if (new_dvnode != orig_dvnode) { + spin_lock(&new_dvnode->lock); + new_dvnode->update_cnt--; + ASSERTCMP(new_dvnode->update_cnt, >=, 0); + spin_unlock(&new_dvnode->lock); + } + _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); + return PTR_ERR(server); +} + +/* + * write to a file + */ +int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) +{ + struct afs_server *server; + struct afs_vnode *vnode = wb->vnode; + int ret; + + _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(wb->key), + first, last, offset, to); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_store_data(server, wb, first, last, offset, to, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * set the attributes on a file + */ +int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, + struct iattr *attr) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * 
get the status of a volume + */ +int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, + struct afs_volume_status *vs) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * get a lock on a file + */ +int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%u", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * extend a lock on a file + */ +int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * release a lock on a file + */ +int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} diff --git a/fs/afs/volume.c b/fs/afs/volume.c new file mode 100644 index 00000000..401eeb21 --- /dev/null +++ b/fs/afs/volume.c @@ -0,0 +1,401 @@ +/* AFS volume management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; + +/* + * lookup a volume by name + * - this can be one of the following: + * "%[cell:]volume[.]" R/W volume + * "#[cell:]volume[.]" R/O or R/W volume (rwparent=0), + * or R/W (rwparent=1) volume + * "%[cell:]volume.readonly" R/O volume + * "#[cell:]volume.readonly" R/O volume + * "%[cell:]volume.backup" Backup volume + * "#[cell:]volume.backup" Backup volume + * + * The cell name is optional, and defaults to the current cell. + * + * See "The Rules of Mount Point Traversal" in Chapter 5 of the AFS SysAdmin + * Guide + * - Rule 1: Explicit type suffix forces access of that type or nothing + * (no suffix, then use Rule 2 & 3) + * - Rule 2: If parent volume is R/O, then mount R/O volume by preference, R/W + * if not available + * - Rule 3: If parent volume is R/W, then only mount R/W volume unless + * explicitly told otherwise + */ +struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) +{ + struct afs_vlocation *vlocation = NULL; + struct afs_volume *volume = NULL; + struct afs_server *server = NULL; + char srvtmask; + int ret, loop; + + _enter("{%*.*s,%d}", + params->volnamesz, params->volnamesz, params->volname, params->rwpath); + + /* lookup the volume location record */ + vlocation = afs_vlocation_lookup(params->cell, params->key, + params->volname, params->volnamesz); + if (IS_ERR(vlocation)) { + ret = PTR_ERR(vlocation); + vlocation = NULL; + goto error; + } + + /* make the final decision on the type we want */ + ret = -ENOMEDIUM; + if (params->force && !(vlocation->vldb.vidmask & (1 << params->type))) + goto error; + + srvtmask = 0; + for (loop = 0; loop < vlocation->vldb.nservers; loop++) + srvtmask |= vlocation->vldb.srvtmask[loop]; + + if (params->force) { + if (!(srvtmask & (1 << params->type))) + goto error; + } else if (srvtmask & AFS_VOL_VTM_RO) { + params->type = AFSVL_ROVOL; + } else if (srvtmask & AFS_VOL_VTM_RW) { + params->type = AFSVL_RWVOL; + } else { + goto error; + } + + down_write(¶ms->cell->vl_sem); + + /* is the volume already active? 
*/ + if (vlocation->vols[params->type]) { + /* yes - re-use it */ + volume = vlocation->vols[params->type]; + afs_get_volume(volume); + goto success; + } + + /* create a new volume record */ + _debug("creating new volume record"); + + ret = -ENOMEM; + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); + if (!volume) + goto error_up; + + atomic_set(&volume->usage, 1); + volume->type = params->type; + volume->type_force = params->force; + volume->cell = params->cell; + volume->vid = vlocation->vldb.vid[params->type]; + + ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + if (ret) + goto error_bdi; + + init_rwsem(&volume->server_sem); + + /* look up all the applicable server records */ + for (loop = 0; loop < 8; loop++) { + if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) { + server = afs_lookup_server( + volume->cell, &vlocation->vldb.servers[loop]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + goto error_discard; + } + + volume->servers[volume->nservers] = server; + volume->nservers++; + } + } + + /* attach the cache and volume location */ +#ifdef CONFIG_AFS_FSCACHE + volume->cache = fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume); +#endif + afs_get_vlocation(vlocation); + volume->vlocation = vlocation; + + vlocation->vols[volume->type] = volume; + +success: + _debug("kAFS selected %s volume %08x", + afs_voltypes[volume->type], volume->vid); + up_write(¶ms->cell->vl_sem); + afs_put_vlocation(vlocation); + _leave(" = %p", volume); + return volume; + + /* clean up */ +error_up: + up_write(¶ms->cell->vl_sem); +error: + afs_put_vlocation(vlocation); + _leave(" = %d", ret); + return ERR_PTR(ret); + +error_discard: + bdi_destroy(&volume->bdi); +error_bdi: + up_write(¶ms->cell->vl_sem); + + for (loop = volume->nservers - 1; loop >= 0; loop--) + afs_put_server(volume->servers[loop]); + + kfree(volume); + goto error; +} + +/* + * destroy a volume record + */ +void afs_put_volume(struct afs_volume *volume) +{ + struct afs_vlocation *vlocation; + int loop; + + if (!volume) + return; + + _enter("%p", volume); + + ASSERTCMP(atomic_read(&volume->usage), >, 0); + + vlocation = volume->vlocation; + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + down_write(&vlocation->cell->vl_sem); + + if (likely(!atomic_dec_and_test(&volume->usage))) { + up_write(&vlocation->cell->vl_sem); + _leave(""); + return; + } + + vlocation->vols[volume->type] = NULL; + + up_write(&vlocation->cell->vl_sem); + + /* finish cleaning up the volume */ +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(volume->cache, 0); +#endif + afs_put_vlocation(vlocation); + + for (loop = volume->nservers - 1; loop >= 0; loop--) + afs_put_server(volume->servers[loop]); + + bdi_destroy(&volume->bdi); + kfree(volume); + + _leave(" [destroyed]"); +} + +/* + * pick a server to use to try accessing this volume + * - returns with an elevated usage count on the server chosen + */ +struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server; + int ret, state, loop; + + _enter("%s", volume->vlocation->vldb.name); + + /* stick with the server we're already using if we can */ + if (vnode->server && vnode->server->fs_state == 0) { + afs_get_server(vnode->server); + _leave(" = %p [current]", vnode->server); + return vnode->server; + } + + down_read(&volume->server_sem); + + /* handle the no-server case */ + if (volume->nservers == 0) { + ret = 
volume->rjservers ? -ENOMEDIUM : -ESTALE; + up_read(&volume->server_sem); + _leave(" = %d [no servers]", ret); + return ERR_PTR(ret); + } + + /* basically, just search the list for the first live server and use + * that */ + ret = 0; + for (loop = 0; loop < volume->nservers; loop++) { + server = volume->servers[loop]; + state = server->fs_state; + + _debug("consider %d [%d]", loop, state); + + switch (state) { + /* found an apparently healthy server */ + case 0: + afs_get_server(server); + up_read(&volume->server_sem); + _leave(" = %p (picked %08x)", + server, ntohl(server->addr.s_addr)); + return server; + + case -ENETUNREACH: + if (ret == 0) + ret = state; + break; + + case -EHOSTUNREACH: + if (ret == 0 || + ret == -ENETUNREACH) + ret = state; + break; + + case -ECONNREFUSED: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH) + ret = state; + break; + + default: + case -EREMOTEIO: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) + ret = state; + break; + } + } + + /* no available servers + * - TODO: handle the no active servers case better + */ + up_read(&volume->server_sem); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * release a server after use + * - releases the ref on the server struct that was acquired by picking + * - records result of using a particular server to access a volume + * - return 0 to try again, 1 if okay or to issue error + * - the caller must release the server struct if result was 0 + */ +int afs_volume_release_fileserver(struct afs_vnode *vnode, + struct afs_server *server, + int result) +{ + struct afs_volume *volume = vnode->volume; + unsigned loop; + + _enter("%s,%08x,%d", + volume->vlocation->vldb.name, ntohl(server->addr.s_addr), + result); + + switch (result) { + /* success */ + case 0: + server->fs_act_jif = jiffies; + server->fs_state = 0; + _leave(""); + return 1; + + /* the fileserver denied all knowledge of the volume */ + case -ENOMEDIUM: + server->fs_act_jif = jiffies; + down_write(&volume->server_sem); + + /* firstly, find where the server is in the active list (if it + * is) */ + for (loop = 0; loop < volume->nservers; loop++) + if (volume->servers[loop] == server) + goto present; + + /* no longer there - may have been discarded by another op */ + goto try_next_server_upw; + + present: + volume->nservers--; + memmove(&volume->servers[loop], + &volume->servers[loop + 1], + sizeof(volume->servers[loop]) * + (volume->nservers - loop)); + volume->servers[volume->nservers] = NULL; + afs_put_server(server); + volume->rjservers++; + + if (volume->nservers > 0) + /* another server might acknowledge its existence */ + goto try_next_server_upw; + + /* handle the case where all the fileservers have rejected the + * volume + * - TODO: try asking the fileservers for volume information + * - TODO: contact the VL server again to see if the volume is + * no longer registered + */ + up_write(&volume->server_sem); + afs_put_server(server); + _leave(" [completely rejected]"); + return 1; + + /* problem reaching the server */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIME: + case -ETIMEDOUT: + case -EREMOTEIO: + /* mark the server as dead + * TODO: vary dead timeout depending on error + */ + spin_lock(&server->fs_lock); + if (!server->fs_state) { + server->fs_dead_jif = jiffies + HZ * 10; + server->fs_state = result; + printk("kAFS: SERVER DEAD state=%d\n", result); + } + spin_unlock(&server->fs_lock); + goto try_next_server; + + /* miscellaneous error */ + 
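	/*
	 * Note: the "default" label below deliberately falls through into
	 * the -ENOMEM/-ENONET cases: unlisted errors update fs_act_jif
	 * first and then share the same "accept the result" clean-up.
	 */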
default: + server->fs_act_jif = jiffies; + case -ENOMEM: + case -ENONET: + /* tell the caller to accept the result */ + afs_put_server(server); + _leave(" [local failure]"); + return 1; + } + + /* tell the caller to loop around and try the next server */ +try_next_server_upw: + up_write(&volume->server_sem); +try_next_server: + afs_put_server(server); + _leave(" [try next server]"); + return 0; +} diff --git a/fs/afs/write.c b/fs/afs/write.c new file mode 100644 index 00000000..b806285f --- /dev/null +++ b/fs/afs/write.c @@ -0,0 +1,754 @@ +/* handling of writes to regular files and writing back to the server + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int afs_write_back_from_locked_page(struct afs_writeback *wb, + struct page *page); + +/* + * mark a page as having been made dirty and thus needing writeback + */ +int afs_set_page_dirty(struct page *page) +{ + _enter(""); + return __set_page_dirty_nobuffers(page); +} + +/* + * unlink a writeback record because its usage has reached zero + * - must be called with the wb->vnode->writeback_lock held + */ +static void afs_unlink_writeback(struct afs_writeback *wb) +{ + struct afs_writeback *front; + struct afs_vnode *vnode = wb->vnode; + + list_del_init(&wb->link); + if (!list_empty(&vnode->writebacks)) { + /* if an fsync rises to the front of the queue then wake it + * up */ + front = list_entry(vnode->writebacks.next, + struct afs_writeback, link); + if (front->state == AFS_WBACK_SYNCING) { + _debug("wake up sync"); + front->state = AFS_WBACK_COMPLETE; + wake_up(&front->waitq); + } + } +} + +/* + * free a writeback record + */ +static void afs_free_writeback(struct afs_writeback *wb) +{ + _enter(""); + key_put(wb->key); + kfree(wb); +} + +/* + * dispose of a reference to a writeback record + */ +void afs_put_writeback(struct afs_writeback *wb) +{ + struct afs_vnode *vnode = wb->vnode; + + _enter("{%d}", wb->usage); + + spin_lock(&vnode->writeback_lock); + if (--wb->usage == 0) + afs_unlink_writeback(wb); + else + wb = NULL; + spin_unlock(&vnode->writeback_lock); + if (wb) + afs_free_writeback(wb); +} + +/* + * partly or wholly fill a page that's under preparation for writing + */ +static int afs_fill_page(struct afs_vnode *vnode, struct key *key, + loff_t pos, struct page *page) +{ + loff_t i_size; + int ret; + int len; + + _enter(",,%llu", (unsigned long long)pos); + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; + else + len = PAGE_CACHE_SIZE; + + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); + if (ret < 0) { + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + } + + _leave(" = %d", ret); + return ret; +} + +/* + * prepare to perform part of a write to a page + */ +int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct afs_writeback *candidate, *wb; + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct page *page; + struct 
key *key = file->private_data; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int ret; + + _enter("{%x:%u},{%lx},%u,%u", + vnode->fid.vid, vnode->fid.vnode, index, from, to); + + candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); + if (!candidate) + return -ENOMEM; + candidate->vnode = vnode; + candidate->first = candidate->last = index; + candidate->offset_first = from; + candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); + candidate->usage = 1; + candidate->state = AFS_WBACK_PENDING; + init_waitqueue_head(&candidate->waitq); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + kfree(candidate); + return -ENOMEM; + } + *pagep = page; + /* page won't leak in error case: it eventually gets cleaned off LRU */ + + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); + if (ret < 0) { + kfree(candidate); + _leave(" = %d [prep]", ret); + return ret; + } + SetPageUptodate(page); + } + +try_again: + spin_lock(&vnode->writeback_lock); + + /* see if this page is already pending a writeback under a suitable key + * - if so we can just join onto that one */ + wb = (struct afs_writeback *) page_private(page); + if (wb) { + if (wb->key == key && wb->state == AFS_WBACK_PENDING) + goto subsume_in_current_wb; + goto flush_conflicting_wb; + } + + if (index > 0) { + /* see if we can find an already pending writeback that we can + * append this page to */ + list_for_each_entry(wb, &vnode->writebacks, link) { + if (wb->last == index - 1 && wb->key == key && + wb->state == AFS_WBACK_PENDING) + goto append_to_previous_wb; + } + } + + list_add_tail(&candidate->link, &vnode->writebacks); + candidate->key = key_get(key); + spin_unlock(&vnode->writeback_lock); + SetPagePrivate(page); + set_page_private(page, (unsigned long) candidate); + _leave(" = 0 [new]"); + return 0; + +subsume_in_current_wb: + _debug("subsume"); + ASSERTRANGE(wb->first, <=, index, <=, wb->last); + if (index == wb->first && from < wb->offset_first) + wb->offset_first = from; + if (index == wb->last && to > wb->to_last) + wb->to_last = to; + spin_unlock(&vnode->writeback_lock); + kfree(candidate); + _leave(" = 0 [sub]"); + return 0; + +append_to_previous_wb: + _debug("append into %lx-%lx", wb->first, wb->last); + wb->usage++; + wb->last++; + wb->to_last = to; + spin_unlock(&vnode->writeback_lock); + SetPagePrivate(page); + set_page_private(page, (unsigned long) wb); + kfree(candidate); + _leave(" = 0 [app]"); + return 0; + + /* the page is currently bound to another context, so if it's dirty we + * need to flush it before we can use the new context */ +flush_conflicting_wb: + _debug("flush conflict"); + if (wb->state == AFS_WBACK_PENDING) + wb->state = AFS_WBACK_CONFLICTING; + spin_unlock(&vnode->writeback_lock); + if (PageDirty(page)) { + ret = afs_write_back_from_locked_page(wb, page); + if (ret < 0) { + afs_put_writeback(candidate); + _leave(" = %d", ret); + return ret; + } + } + + /* the page holds a ref on the writeback record */ + afs_put_writeback(wb); + set_page_private(page, 0); + ClearPagePrivate(page); + goto try_again; +} + +/* + * finalise part of a write to a page + */ +int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + loff_t i_size, maybe_i_size; + + _enter("{%x:%u},{%lx}", + vnode->fid.vid, 
vnode->fid.vnode, page->index); + + maybe_i_size = pos + copied; + + i_size = i_size_read(&vnode->vfs_inode); + if (maybe_i_size > i_size) { + spin_lock(&vnode->writeback_lock); + i_size = i_size_read(&vnode->vfs_inode); + if (maybe_i_size > i_size) + i_size_write(&vnode->vfs_inode, maybe_i_size); + spin_unlock(&vnode->writeback_lock); + } + + set_page_dirty(page); + if (PageDirty(page)) + _debug("dirtied"); + unlock_page(page); + page_cache_release(page); + + return copied; +} + +/* + * kill all the pages in the given range + */ +static void afs_kill_pages(struct afs_vnode *vnode, bool error, + pgoff_t first, pgoff_t last) +{ + struct pagevec pv; + unsigned count, loop; + + _enter("{%x:%u},%lx-%lx", + vnode->fid.vid, vnode->fid.vnode, first, last); + + pagevec_init(&pv, 0); + + do { + _debug("kill %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, + first, count, pv.pages); + ASSERTCMP(pv.nr, ==, count); + + for (loop = 0; loop < count; loop++) { + ClearPageUptodate(pv.pages[loop]); + if (error) + SetPageError(pv.pages[loop]); + end_page_writeback(pv.pages[loop]); + } + + __pagevec_release(&pv); + } while (first < last); + + _leave(""); +} + +/* + * synchronously write back the locked page and any subsequent non-locked dirty + * pages also covered by the same writeback record + */ +static int afs_write_back_from_locked_page(struct afs_writeback *wb, + struct page *primary_page) +{ + struct page *pages[8], *page; + unsigned long count; + unsigned n, offset, to; + pgoff_t start, first, last; + int loop, ret; + + _enter(",%lx", primary_page->index); + + count = 1; + if (!clear_page_dirty_for_io(primary_page)) + BUG(); + if (test_set_page_writeback(primary_page)) + BUG(); + + /* find all consecutive lockable dirty pages, stopping when we find a + * page that is not immediately lockable, is not dirty or is missing, + * or we reach the end of the range */ + start = primary_page->index; + if (start >= wb->last) + goto no_more; + start++; + do { + _debug("more %lx [%lx]", start, count); + n = wb->last - start + 1; + if (n > ARRAY_SIZE(pages)) + n = ARRAY_SIZE(pages); + n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, + start, n, pages); + _debug("fgpc %u", n); + if (n == 0) + goto no_more; + if (pages[0]->index != start) { + do { + put_page(pages[--n]); + } while (n > 0); + goto no_more; + } + + for (loop = 0; loop < n; loop++) { + page = pages[loop]; + if (page->index > wb->last) + break; + if (!trylock_page(page)) + break; + if (!PageDirty(page) || + page_private(page) != (unsigned long) wb) { + unlock_page(page); + break; + } + if (!clear_page_dirty_for_io(page)) + BUG(); + if (test_set_page_writeback(page)) + BUG(); + unlock_page(page); + put_page(page); + } + count += loop; + if (loop < n) { + for (; loop < n; loop++) + put_page(pages[loop]); + goto no_more; + } + + start += loop; + } while (start <= wb->last && count < 65536); + +no_more: + /* we now have a contiguous set of dirty pages, each with writeback set + * and the dirty mark cleared; the first page is locked and must remain + * so, all the rest are unlocked */ + first = primary_page->index; + last = first + count - 1; + + offset = (first == wb->first) ? wb->offset_first : 0; + to = (last == wb->last) ? wb->to_last : PAGE_SIZE; + + _debug("write back %lx[%u..] 
to %lx[..%u]", first, offset, last, to); + + ret = afs_vnode_store_data(wb, first, last, offset, to); + if (ret < 0) { + switch (ret) { + case -EDQUOT: + case -ENOSPC: + set_bit(AS_ENOSPC, + &wb->vnode->vfs_inode.i_mapping->flags); + break; + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + afs_kill_pages(wb->vnode, true, first, last); + set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); + break; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_kill_pages(wb->vnode, false, first, last); + break; + default: + break; + } + } else { + ret = count; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * write a page back to the server + * - the caller locked the page for us + */ +int afs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct afs_writeback *wb; + int ret; + + _enter("{%lx},", page->index); + + wb = (struct afs_writeback *) page_private(page); + ASSERT(wb != NULL); + + ret = afs_write_back_from_locked_page(wb, page); + unlock_page(page); + if (ret < 0) { + _leave(" = %d", ret); + return 0; + } + + wbc->nr_to_write -= ret; + + _leave(" = 0"); + return 0; +} + +/* + * write a region of pages back to the server + */ +static int afs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t index, pgoff_t end, pgoff_t *_next) +{ + struct afs_writeback *wb; + struct page *page; + int ret, n; + + _enter(",,%lx,%lx,", index, end); + + do { + n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!n) + break; + + _debug("wback %lx", page->index); + + if (page->index > end) { + *_next = index; + page_cache_release(page); + _leave(" = 0 [%lx]", *_next); + return 0; + } + + /* at this point we hold neither mapping->tree_lock nor lock on + * the page itself: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled back from + * swapper_space to tmpfs file mapping + */ + lock_page(page); + + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || !PageDirty(page)) { + unlock_page(page); + continue; + } + + wb = (struct afs_writeback *) page_private(page); + ASSERT(wb != NULL); + + spin_lock(&wb->vnode->writeback_lock); + wb->state = AFS_WBACK_WRITING; + spin_unlock(&wb->vnode->writeback_lock); + + ret = afs_write_back_from_locked_page(wb, page); + unlock_page(page); + page_cache_release(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + wbc->nr_to_write -= ret; + + cond_resched(); + } while (index < end && wbc->nr_to_write > 0); + + *_next = index; + _leave(" = 0 [%lx]", *_next); + return 0; +} + +/* + * write some of the pending data back to the server + */ +int afs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t start, end, next; + int ret; + + _enter(""); + + if (wbc->range_cyclic) { + start = mapping->writeback_index; + end = -1; + ret = afs_writepages_region(mapping, wbc, start, end, &next); + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) + ret = afs_writepages_region(mapping, wbc, 0, start, + &next); + mapping->writeback_index = next; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); + ret = afs_writepages_region(mapping, wbc, 0, end, &next); + if (wbc->nr_to_write > 
0) + mapping->writeback_index = next; + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + ret = afs_writepages_region(mapping, wbc, start, end, &next); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * completion of write to server + */ +void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) +{ + struct afs_writeback *wb = call->wb; + struct pagevec pv; + unsigned count, loop; + pgoff_t first = call->first, last = call->last; + bool free_wb; + + _enter("{%x:%u},{%lx-%lx}", + vnode->fid.vid, vnode->fid.vnode, first, last); + + ASSERT(wb != NULL); + + pagevec_init(&pv, 0); + + do { + _debug("done %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(call->mapping, first, count, + pv.pages); + ASSERTCMP(pv.nr, ==, count); + + spin_lock(&vnode->writeback_lock); + for (loop = 0; loop < count; loop++) { + struct page *page = pv.pages[loop]; + end_page_writeback(page); + if (page_private(page) == (unsigned long) wb) { + set_page_private(page, 0); + ClearPagePrivate(page); + wb->usage--; + } + } + free_wb = false; + if (wb->usage == 0) { + afs_unlink_writeback(wb); + free_wb = true; + } + spin_unlock(&vnode->writeback_lock); + first += count; + if (free_wb) { + afs_free_writeback(wb); + wb = NULL; + } + + __pagevec_release(&pv); + } while (first <= last); + + _leave(""); +} + +/* + * write to an AFS file + */ +ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry *dentry = iocb->ki_filp->f_path.dentry; + struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + _enter("{%x.%u},{%zu},%lu,", + vnode->fid.vid, vnode->fid.vnode, count, nr_segs); + + if (IS_SWAPFILE(&vnode->vfs_inode)) { + printk(KERN_INFO + "AFS: Attempt to write to active swap file!\n"); + return -EBUSY; + } + + if (!count) + return 0; + + result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (IS_ERR_VALUE(result)) { + _leave(" = %zd", result); + return result; + } + + _leave(" = %zd", result); + return result; +} + +/* + * flush the vnode to the fileserver + */ +int afs_writeback_all(struct afs_vnode *vnode) +{ + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_cyclic = 1, + }; + int ret; + + _enter(""); + + ret = mapping->a_ops->writepages(mapping, &wbc); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + _leave(" = %d", ret); + return ret; +} + +/* + * flush any dirty pages for this process, and check for write errors. + * - the return status from this call provides a reliable indication of + * whether any write errors occurred for this process. 
+ */ +int afs_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct afs_writeback *wb, *xwb; + struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + int ret; + + _enter("{%x:%u},{n=%s},%d", + vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + datasync); + + /* use a writeback record as a marker in the queue - when this reaches + * the front of the queue, all the outstanding writes are either + * completed or rejected */ + wb = kzalloc(sizeof(*wb), GFP_KERNEL); + if (!wb) + return -ENOMEM; + wb->vnode = vnode; + wb->first = 0; + wb->last = -1; + wb->offset_first = 0; + wb->to_last = PAGE_SIZE; + wb->usage = 1; + wb->state = AFS_WBACK_SYNCING; + init_waitqueue_head(&wb->waitq); + + spin_lock(&vnode->writeback_lock); + list_for_each_entry(xwb, &vnode->writebacks, link) { + if (xwb->state == AFS_WBACK_PENDING) + xwb->state = AFS_WBACK_CONFLICTING; + } + list_add_tail(&wb->link, &vnode->writebacks); + spin_unlock(&vnode->writeback_lock); + + /* push all the outstanding writebacks to the server */ + ret = afs_writeback_all(vnode); + if (ret < 0) { + afs_put_writeback(wb); + _leave(" = %d [wb]", ret); + return ret; + } + + /* wait for the preceding writes to actually complete */ + ret = wait_event_interruptible(wb->waitq, + wb->state == AFS_WBACK_COMPLETE || + vnode->writebacks.next == &wb->link); + afs_put_writeback(wb); + _leave(" = %d", ret); + return ret; +} + +/* + * notification that a previously read-only page is about to become writable + * - if it returns an error, the caller will deliver a bus error signal + */ +int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + + _enter("{{%x:%u}},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); + + /* wait for the page to be written to the cache before we allow it to + * be modified */ +#ifdef CONFIG_AFS_FSCACHE + fscache_wait_on_page_write(vnode->cache, page); +#endif + + _leave(" = 0"); + return 0; +} diff --git a/fs/aio.c b/fs/aio.c new file mode 100644 index 00000000..278ed7dc --- /dev/null +++ b/fs/aio.c @@ -0,0 +1,1795 @@ +/* + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * See ../COPYING for licensing terms. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG 0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +static DEFINE_SPINLOCK(aio_nr_lock); +unsigned long aio_nr; /* current system wide number of aio requests */ +unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +/*----end sysctl variables---*/ + +static struct kmem_cache *kiocb_cachep; +static struct kmem_cache *kioctx_cachep; + +static struct workqueue_struct *aio_wq; + +/* Used for rare fput completion. 
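 * When an iocb holds the last reference to its struct file, __aio_put_req()
 * cannot drop it directly (it runs under ctx_lock, possibly from completion
 * context), so the request is parked on fput_head and fput_work runs
 * aio_fput_routine() to do the fput() and the final really_put_req() from
 * process context.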
*/ +static void aio_fput_routine(struct work_struct *); +static DECLARE_WORK(fput_work, aio_fput_routine); + +static DEFINE_SPINLOCK(fput_lock); +static LIST_HEAD(fput_head); + +static void aio_kick_handler(struct work_struct *); +static void aio_queue_work(struct kioctx *); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +static int __init aio_setup(void) +{ + kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + + aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ + BUG_ON(!aio_wq); + + pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} +__initcall(aio_setup); + +static void aio_free_ring(struct kioctx *ctx) +{ + struct aio_ring_info *info = &ctx->ring_info; + long i; + + for (i=0; i<info->nr_pages; i++) + put_page(info->ring_pages[i]); + + if (info->mmap_size) { + down_write(&ctx->mm->mmap_sem); + do_munmap(ctx->mm, info->mmap_base, info->mmap_size); + up_write(&ctx->mm->mmap_sem); + } + + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + info->ring_pages = NULL; + info->nr = 0; +} + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + struct aio_ring_info *info = &ctx->ring_info; + unsigned nr_events = ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_events += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages < 0) + return -EINVAL; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + } + + info->mmap_size = nr_pages * PAGE_SIZE; + dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + down_write(&ctx->mm->mmap_sem); + info->mmap_base = do_mmap(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, + 0); + if (IS_ERR((void *)info->mmap_base)) { + up_write(&ctx->mm->mmap_sem); + info->mmap_size = 0; + aio_free_ring(ctx); + return -EAGAIN; + } + + dprintk("mmap address: 0x%08lx\n", info->mmap_base); + info->nr_pages = get_user_pages(current, ctx->mm, + info->mmap_base, nr_pages, + 1, 0, info->ring_pages, NULL); + up_write(&ctx->mm->mmap_sem); + + if (unlikely(info->nr_pages != nr_pages)) { + aio_free_ring(ctx); + return -EAGAIN; + } + + ctx->user_id = info->mmap_base; + + info->nr = nr_events; /* trusted copy */ + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring->nr = nr_events; /* user copy */ + ring->id = ctx->user_id; + ring->head = ring->tail = 0; + ring->magic = AIO_RING_MAGIC; + ring->compat_features = AIO_RING_COMPAT_FEATURES; + ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; + ring->header_length = sizeof(struct aio_ring); + kunmap_atomic(ring, KM_USER0); + + return 0; +} + + +/* aio_ring_event: returns a pointer to the event at the given index from + * kmap_atomic(, km). 
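 * As a worked example (assuming 4 KiB pages; struct aio_ring and
 * struct io_event are 32 bytes each): AIO_EVENTS_FIRST_PAGE = 127,
 * AIO_EVENTS_PER_PAGE = 128 and AIO_EVENTS_OFFSET = 1, so event nr 126
 * occupies the last slot of ring page 0 and event nr 127 maps to slot 0
 * of ring page 1.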
Release the pointer with put_aio_ring_event(); + */ +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) +#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) + +#define aio_ring_event(info, nr, km) ({ \ + unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ + struct io_event *__event; \ + __event = kmap_atomic( \ + (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + __event += pos % AIO_EVENTS_PER_PAGE; \ + __event; \ +}) + +#define put_aio_ring_event(event, km) do { \ + struct io_event *__event = (event); \ + (void)__event; \ + kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ +} while(0) + +static void ctx_rcu_free(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + unsigned nr_events = ctx->max_reqs; + + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +static void __put_ioctx(struct kioctx *ctx) +{ + BUG_ON(ctx->reqs_active); + + cancel_delayed_work(&ctx->wq); + cancel_work_sync(&ctx->wq.work); + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + call_rcu(&ctx->rcu_head, ctx_rcu_free); +} + +static inline int try_get_ioctx(struct kioctx *kioctx) +{ + return atomic_inc_not_zero(&kioctx->users); +} + +static inline void put_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + if (unlikely(atomic_dec_and_test(&kioctx->users))) + __put_ioctx(kioctx); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_events) +{ + struct mm_struct *mm; + struct kioctx *ctx; + int did_sync = 0; + + /* Prevent overflows */ + if ((nr_events > (0x10000000U / sizeof(struct io_event))) || + (nr_events > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_events too high\n"); + return ERR_PTR(-EINVAL); + } + + if ((unsigned long)nr_events > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + ctx->max_reqs = nr_events; + mm = ctx->mm = current->mm; + atomic_inc(&mm->mm_count); + + atomic_set(&ctx->users, 2); + spin_lock_init(&ctx->ctx_lock); + spin_lock_init(&ctx->ring_info.ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->active_reqs); + INIT_LIST_HEAD(&ctx->run_list); + INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); + + if (aio_setup_ring(ctx) < 0) + goto out_freectx; + + /* limit the number of system wide aios */ + do { + spin_lock_bh(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) + ctx->max_reqs = 0; + else + aio_nr += ctx->max_reqs; + spin_unlock_bh(&aio_nr_lock); + if (ctx->max_reqs || did_sync) + break; + + /* wait for rcu callbacks to have completed before giving up */ + synchronize_rcu(); + did_sync = 1; + ctx->max_reqs = nr_events; + } while (1); + + if (ctx->max_reqs == 0) + goto out_cleanup; + + /* now link into global list. 
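 * (the mm->ioctx_list hlist: lookup_ioctx() walks it under RCU, hence
 * hlist_add_head_rcu() here and the call_rcu() free in __put_ioctx())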
*/ + spin_lock(&mm->ioctx_lock); + hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); + spin_unlock(&mm->ioctx_lock); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + return ctx; + +out_cleanup: + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freectx: + mmdrop(mm); + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void aio_cancel_all(struct kioctx *ctx) +{ + int (*cancel)(struct kiocb *, struct io_event *); + struct io_event res; + spin_lock_irq(&ctx->ctx_lock); + ctx->dead = 1; + while (!list_empty(&ctx->active_reqs)) { + struct list_head *pos = ctx->active_reqs.next; + struct kiocb *iocb = list_kiocb(pos); + list_del_init(&iocb->ki_list); + cancel = iocb->ki_cancel; + kiocbSetCancelled(iocb); + if (cancel) { + iocb->ki_users++; + spin_unlock_irq(&ctx->ctx_lock); + cancel(iocb, &res); + spin_lock_irq(&ctx->ctx_lock); + } + } + spin_unlock_irq(&ctx->ctx_lock); +} + +static void wait_for_all_aios(struct kioctx *ctx) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + spin_lock_irq(&ctx->ctx_lock); + if (!ctx->reqs_active) + goto out; + + add_wait_queue(&ctx->wait, &wait); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (ctx->reqs_active) { + spin_unlock_irq(&ctx->ctx_lock); + io_schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + spin_lock_irq(&ctx->ctx_lock); + } + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + +out: + spin_unlock_irq(&ctx->ctx_lock); +} + +/* wait_on_sync_kiocb: + * Waits on the given sync kiocb to complete. + */ +ssize_t wait_on_sync_kiocb(struct kiocb *iocb) +{ + while (iocb->ki_users) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!iocb->ki_users) + break; + io_schedule(); + } + __set_current_state(TASK_RUNNING); + return iocb->ki_user_data; +} +EXPORT_SYMBOL(wait_on_sync_kiocb); + +/* exit_aio: called when the last user of mm goes away. At this point, + * there is no way for any new requests to be submited or any of the + * io_* syscalls to be called on the context. However, there may be + * outstanding requests which hold references to the context; as they + * go away, they will call put_ioctx and release any pinned memory + * associated with the request (held via struct page * references). + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx *ctx; + + while (!hlist_empty(&mm->ioctx_list)) { + ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); + hlist_del_rcu(&ctx->list); + + aio_cancel_all(ctx); + + wait_for_all_aios(ctx); + /* + * Ensure we don't leave the ctx on the aio_wq + */ + cancel_work_sync(&ctx->wq.work); + + if (1 != atomic_read(&ctx->users)) + printk(KERN_DEBUG + "exit_aio:ioctx still alive: %d %d %d\n", + atomic_read(&ctx->users), ctx->dead, + ctx->reqs_active); + put_ioctx(ctx); + } +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns NULL if no requests are free. + * + * Returns with kiocb->users set to 2. The io submit code path holds + * an extra reference while submitting the i/o. 
+ * This prevents races between the aio code path referencing the + * req (after submitting it) and aio_complete() freeing the req. + */ +static struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + struct aio_ring *ring; + int okay = 0; + + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (unlikely(!req)) + return NULL; + + req->ki_flags = 0; + req->ki_users = 2; + req->ki_key = 0; + req->ki_ctx = ctx; + req->ki_cancel = NULL; + req->ki_retry = NULL; + req->ki_dtor = NULL; + req->private = NULL; + req->ki_iovec = NULL; + INIT_LIST_HEAD(&req->ki_run_list); + req->ki_eventfd = NULL; + + /* Check if the completion queue has enough free space to + * accept an event from this io. + */ + spin_lock_irq(&ctx->ctx_lock); + ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + list_add(&req->ki_list, &ctx->active_reqs); + ctx->reqs_active++; + okay = 1; + } + kunmap_atomic(ring, KM_USER0); + spin_unlock_irq(&ctx->ctx_lock); + + if (!okay) { + kmem_cache_free(kiocb_cachep, req); + req = NULL; + } + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. + */ + req = __aio_get_req(ctx); + if (unlikely(NULL == req)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + assert_spin_locked(&ctx->ctx_lock); + + if (req->ki_eventfd != NULL) + eventfd_ctx_put(req->ki_eventfd); + if (req->ki_dtor) + req->ki_dtor(req); + if (req->ki_iovec != &req->ki_inline_vec) + kfree(req->ki_iovec); + kmem_cache_free(kiocb_cachep, req); + ctx->reqs_active--; + + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up_all(&ctx->wait); +} + +static void aio_fput_routine(struct work_struct *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ki_ctx; + + list_del(&req->ki_list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput(s) */ + if (req->ki_filp != NULL) + fput(req->ki_filp); + + /* Link the iocb into the context's free list */ + rcu_read_lock(); + spin_lock_irq(&ctx->ctx_lock); + really_put_req(ctx, req); + /* + * at that point ctx might've been killed, but actual + * freeing is RCU'd + */ + spin_unlock_irq(&ctx->ctx_lock); + rcu_read_unlock(); + + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. + */ +static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", + req, atomic_long_read(&req->ki_filp->f_count)); + + assert_spin_locked(&ctx->ctx_lock); + + req->ki_users--; + BUG_ON(req->ki_users < 0); + if (likely(req->ki_users)) + return 0; + list_del(&req->ki_list); /* remove from active_reqs */ + req->ki_cancel = NULL; + req->ki_retry = NULL; + + /* + * Try to optimize the aio and eventfd file* puts, by avoiding to + * schedule work in case it is not final fput() time. In normal cases, + * we would not be holding the last reference to the file*, so + * this function will be executed w/out any aio kthread wakeup. 
+ */ + if (unlikely(!fput_atomic(req->ki_filp))) { + spin_lock(&fput_lock); + list_add(&req->ki_list, &fput_head); + spin_unlock(&fput_lock); + schedule_work(&fput_work); + } else { + req->ki_filp = NULL; + really_put_req(ctx, req); + } + return 1; +} + +/* aio_put_req + * Returns true if this put was the last user of the kiocb, + * false if the request is still in use. + */ +int aio_put_req(struct kiocb *req) +{ + struct kioctx *ctx = req->ki_ctx; + int ret; + spin_lock_irq(&ctx->ctx_lock); + ret = __aio_put_req(ctx, req); + spin_unlock_irq(&ctx->ctx_lock); + return ret; +} +EXPORT_SYMBOL(aio_put_req); + +static struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct mm_struct *mm = current->mm; + struct kioctx *ctx, *ret = NULL; + struct hlist_node *n; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + /* + * RCU protects us against accessing freed memory but + * we have to be careful not to get a reference when the + * reference count already dropped to 0 (ctx->dead test + * is unreliable because of races). + */ + if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + ret = ctx; + break; + } + } + + rcu_read_unlock(); + return ret; +} + +/* + * Queue up a kiocb to be retried. Assumes that the kiocb + * has already been marked as kicked, and places it on + * the retry run list for the corresponding ioctx, if it + * isn't already queued. Returns 1 if it actually queued + * the kiocb (to tell the caller to activate the work + * queue to process it), or 0, if it found that it was + * already queued. + */ +static inline int __queue_kicked_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + + assert_spin_locked(&ctx->ctx_lock); + + if (list_empty(&iocb->ki_run_list)) { + list_add_tail(&iocb->ki_run_list, + &ctx->run_list); + return 1; + } + return 0; +} + +/* aio_run_iocb + * This is the core aio execution routine. It is + * invoked both for initial i/o submission and + * subsequent retries via the aio_kick_handler. + * Expects to be invoked with iocb->ki_ctx->lock + * already held. The lock is released and reacquired + * as needed during processing. + * + * Calls the iocb retry method (already setup for the + * iocb on initial submission) for operation specific + * handling, but takes care of most of common retry + * execution details for a given iocb. The retry method + * needs to be non-blocking as far as possible, to avoid + * holding up other iocbs waiting to be serviced by the + * retry kernel thread. + * + * The trickier parts in this code have to do with + * ensuring that only one retry instance is in progress + * for a given iocb at any time. Providing that guarantee + * simplifies the coding of individual aio operations as + * it avoids various potential races. + */ +static ssize_t aio_run_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + ssize_t (*retry)(struct kiocb *); + ssize_t ret; + + if (!(retry = iocb->ki_retry)) { + printk("aio_run_iocb: iocb->ki_retry = NULL\n"); + return 0; + } + + /* + * We don't want the next retry iteration for this + * operation to start until this one has returned and + * updated the iocb state. However, wait_queue functions + * can trigger a kick_iocb from interrupt context in the + * meantime, indicating that data is available for the next + * iteration. We want to remember that and enable the + * next retry iteration _after_ we are through with + * this one. 
+ * + * So, in order to be able to register a "kick", but + * prevent it from being queued now, we clear the kick + * flag, but make the kick code *think* that the iocb is + * still on the run list until we are actually done. + * When we are done with this iteration, we check if + * the iocb was kicked in the meantime and if so, queue + * it up afresh. + */ + + kiocbClearKicked(iocb); + + /* + * This is so that aio_complete knows it doesn't need to + * pull the iocb off the run list (We can't just call + * INIT_LIST_HEAD because we don't want a kick_iocb to + * queue this on the run list yet) + */ + iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + /* Quit retrying if the i/o has been cancelled */ + if (kiocbIsCancelled(iocb)) { + ret = -EINTR; + aio_complete(iocb, ret, 0); + /* must not access the iocb after this */ + goto out; + } + + /* + * Now we are all set to call the retry method in async + * context. + */ + ret = retry(iocb); + + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(iocb, ret, 0); + } +out: + spin_lock_irq(&ctx->ctx_lock); + + if (-EIOCBRETRY == ret) { + /* + * OK, now that we are done with this iteration + * and know that there is more left to go, + * this is where we let go so that a subsequent + * "kick" can start the next iteration + */ + + /* will make __queue_kicked_iocb succeed from here on */ + INIT_LIST_HEAD(&iocb->ki_run_list); + /* we must queue the next iteration ourselves, if it + * has already been kicked */ + if (kiocbIsKicked(iocb)) { + __queue_kicked_iocb(iocb); + + /* + * __queue_kicked_iocb will always return 1 here, because + * iocb->ki_run_list is empty at this point so it should + * be safe to unconditionally queue the context into the + * work queue. + */ + aio_queue_work(ctx); + } + } + return ret; +} + +/* + * __aio_run_iocbs: + * Process all pending retries queued on the ioctx + * run list. + * Assumes it is operating within the aio issuer's mm + * context. + */ +static int __aio_run_iocbs(struct kioctx *ctx) +{ + struct kiocb *iocb; + struct list_head run_list; + + assert_spin_locked(&ctx->ctx_lock); + + list_replace_init(&ctx->run_list, &run_list); + while (!list_empty(&run_list)) { + iocb = list_entry(run_list.next, struct kiocb, + ki_run_list); + list_del(&iocb->ki_run_list); + /* + * Hold an extra reference while retrying i/o. + */ + iocb->ki_users++; /* grab extra reference */ + aio_run_iocb(iocb); + __aio_put_req(ctx, iocb); + } + if (!list_empty(&ctx->run_list)) + return 1; + return 0; +} + +static void aio_queue_work(struct kioctx * ctx) +{ + unsigned long timeout; + /* + * if someone is waiting, get the work started right + * away, otherwise, use a longer delay + */ + smp_mb(); + if (waitqueue_active(&ctx->wait)) + timeout = 1; + else + timeout = HZ/10; + queue_delayed_work(aio_wq, &ctx->wq, timeout); +} + +/* + * aio_run_all_iocbs: + * Process all pending retries queued on the ioctx + * run list, and keep running them until the list + * stays empty. + * Assumes it is operating within the aio issuer's mm context. 
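 * (read_events() calls this once, before it starts waiting, to flush
 * any retries already queued on ctx->run_list.)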
+ */ +static inline void aio_run_all_iocbs(struct kioctx *ctx) +{ + spin_lock_irq(&ctx->ctx_lock); + while (__aio_run_iocbs(ctx)) + ; + spin_unlock_irq(&ctx->ctx_lock); +} + +/* + * aio_kick_handler: + * Work queue handler triggered to process pending + * retries on an ioctx. Takes on the aio issuer's + * mm context before running the iocbs, so that + * copy_xxx_user operates on the issuer's address + * space. + * Run on aiod's context. + */ +static void aio_kick_handler(struct work_struct *work) +{ + struct kioctx *ctx = container_of(work, struct kioctx, wq.work); + mm_segment_t oldfs = get_fs(); + struct mm_struct *mm; + int requeue; + + set_fs(USER_DS); + use_mm(ctx->mm); + spin_lock_irq(&ctx->ctx_lock); + requeue =__aio_run_iocbs(ctx); + mm = ctx->mm; + spin_unlock_irq(&ctx->ctx_lock); + unuse_mm(mm); + set_fs(oldfs); + /* + * we're in a worker thread already, don't use queue_delayed_work, + */ + if (requeue) + queue_delayed_work(aio_wq, &ctx->wq, 0); +} + + +/* + * Called by kick_iocb to queue the kiocb for retry + * and if required activate the aio work queue to process + * it + */ +static void try_queue_kicked_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + int run = 0; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + /* set this inside the lock so that we can't race with aio_run_iocb() + * testing it and putting the iocb on the run list under the lock */ + if (!kiocbTryKick(iocb)) + run = __queue_kicked_iocb(iocb); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + if (run) + aio_queue_work(ctx); +} + +/* + * kick_iocb: + * Called typically from a wait queue callback context + * to trigger a retry of the iocb. + * The retry is usually executed by aio workqueue + * threads (See aio_kick_handler). + */ +void kick_iocb(struct kiocb *iocb) +{ + /* sync iocbs are easy: they can only ever be executing from a + * single context. */ + if (is_sync_kiocb(iocb)) { + kiocbSetKicked(iocb); + wake_up_process(iocb->ki_obj.tsk); + return; + } + + try_queue_kicked_iocb(iocb); +} +EXPORT_SYMBOL(kick_iocb); + +/* aio_complete + * Called when the io request on the given iocb is complete. + * Returns true if this is the last user of the request. The + * only other user of the request can be the cancellation code. + */ +int aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ki_ctx; + struct aio_ring_info *info; + struct aio_ring *ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + int ret; + + /* + * Special case handling for sync iocbs: + * - events go directly into the iocb for fast handling + * - the sync task with the iocb in its stack holds the single iocb + * ref, no other paths have a way to get another ref + * - the sync task helpfully left a reference to itself in the iocb + */ + if (is_sync_kiocb(iocb)) { + BUG_ON(iocb->ki_users != 1); + iocb->ki_user_data = res; + iocb->ki_users = 0; + wake_up_process(iocb->ki_obj.tsk); + return 1; + } + + info = &ctx->ring_info; + + /* add a completion event to the ring buffer. + * must be done holding ctx->ctx_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) + list_del_init(&iocb->ki_run_list); + + /* + * cancelled requests don't get events, userland was given one + * when the event got cancelled. 
+ */ + if (kiocbIsCancelled(iocb)) + goto put_rq; + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + event = aio_ring_event(info, tail, KM_IRQ0); + if (++tail >= info->nr) + tail = 0; + + event->obj = (u64)(unsigned long)iocb->ki_obj.user; + event->data = iocb->ki_user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + smp_wmb(); /* make event visible before updating tail */ + + info->tail = tail; + ring->tail = tail; + + put_aio_ring_event(event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); + + /* + * Check if the user asked us to deliver the result through an + * eventfd. The eventfd_signal() function is safe to be called + * from IRQ context. + */ + if (iocb->ki_eventfd != NULL) + eventfd_signal(iocb->ki_eventfd, 1); + +put_rq: + /* everything turned out well, dispose of the aiocb. */ + ret = __aio_put_req(ctx, iocb); + + /* + * We have to order our ring_info tail store above and test + * of the wait list below outside the wait lock. This is + * like in wake_up_bit() where clearing a bit has to be + * ordered with the unlocked test. + */ + smp_mb(); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + return ret; +} +EXPORT_SYMBOL(aio_complete); + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched (0 or 1 ;-) + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). + */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring *ring; + unsigned long head; + int ret = 0; + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->nr); + + if (ring->head == ring->tail) + goto out; + + spin_lock(&info->ring_lock); + + head = ring->head % info->nr; + if (head != ring->tail) { + struct io_event *evp = aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + smp_mb(); /* finish reading the event before updatng the head */ + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } + spin_unlock(&info->ring_lock); + +out: + kunmap_atomic(ring, KM_USER0); + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct aio_timeout { + struct timer_list timer; + int timed_out; + struct task_struct *p; +}; + +static void timeout_func(unsigned long data) +{ + struct aio_timeout *to = (struct aio_timeout *)data; + + to->timed_out = 1; + wake_up_process(to->p); +} + +static inline void init_timeout(struct aio_timeout *to) +{ + setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); + to->timed_out = 0; + to->p = current; +} + +static inline void set_timeout(long start_jiffies, struct aio_timeout *to, + const struct timespec *ts) +{ + to->timer.expires = start_jiffies + timespec_to_jiffies(ts); + if (time_after(to->timer.expires, jiffies)) + add_timer(&to->timer); + else + to->timed_out = 1; +} + +static inline void clear_timeout(struct aio_timeout *to) +{ + del_singleshot_timer_sync(&to->timer); +} + +static int 
read_events(struct kioctx *ctx, + long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) +{ + long start_jiffies = jiffies; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct aio_timeout to; + int retry = 0; + + /* needed to zero any padding within an entry (there shouldn't be + * any, but C is fun! + */ + memset(&ent, 0, sizeof(ent)); +retry: + ret = 0; + while (likely(i < nr)) { + ret = aio_read_evt(ctx, &ent); + if (unlikely(ret <= 0)) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* Could we split the check in two? */ + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + ret = 0; + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (min_nr <= i) + return i; + if (ret) + return ret; + + /* End fast path */ + + /* racey check, but it gets redone */ + if (!retry && unlikely(!list_empty(&ctx->run_list))) { + retry = 1; + aio_run_all_iocbs(ctx); + goto retry; + } + + init_timeout(&to); + if (timeout) { + struct timespec ts; + ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + goto out; + + set_timeout(start_jiffies, &to, &ts); + } + + while (likely(i < nr)) { + add_wait_queue_exclusive(&ctx->wait, &wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + ret = aio_read_evt(ctx, &ent); + if (ret) + break; + if (min_nr <= i) + break; + if (unlikely(ctx->dead)) { + ret = -EINVAL; + break; + } + if (to.timed_out) /* Only check after read evt */ + break; + /* Try to only show up in io wait if there are ops + * in flight */ + if (ctx->reqs_active) + io_schedule(); + else + schedule(); + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + /*ret = aio_read_evt(ctx, &ent);*/ + } while (1) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + + if (unlikely(ret <= 0)) + break; + + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + destroy_timer_on_stack(&to.timer); + return i ? i : ret; +} + +/* Take an ioctx and remove it from the list of ioctx's. Protects + * against races with itself via ->dead. + */ +static void io_destroy(struct kioctx *ioctx) +{ + struct mm_struct *mm = current->mm; + int was_dead; + + /* delete the entry from the list is someone else hasn't already */ + spin_lock(&mm->ioctx_lock); + was_dead = ioctx->dead; + ioctx->dead = 1; + hlist_del_rcu(&ioctx->list); + spin_unlock(&mm->ioctx_lock); + + dprintk("aio_release(%p)\n", ioctx); + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ + + aio_cancel_all(ioctx); + wait_for_all_aios(ioctx); + + /* + * Wake up any waiters. The setting of ctx->dead must be seen + * by other CPUs at this point. Right now, we rely on the + * locking done by the above calls to ensure this consistency. + */ + wake_up_all(&ioctx->wait); + put_ioctx(ioctx); /* once for the lookup */ +} + +/* sys_io_setup: + * Create an aio_context capable of receiving at least nr_events. + * ctxp must not point to an aio_context that already exists, and + * must be initialized to 0 prior to the call. On successful + * creation of the aio_context, *ctxp is filled in with the resulting + * handle. 
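The aio_timeout helpers above are an instance of the timer-on-stack idiom. The sketch below is an illustration under the same kernel APIs, not code from this patch (bounded_wait, bounded_wait_fire and cond_is_true are invented): arm a stack timer that wakes the sleeping task, loop until the condition holds or the timer fires, then tear the timer down with del_singleshot_timer_sync() and destroy_timer_on_stack(), in the same order read_events() uses.

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/timer.h>

struct bounded_wait {
        struct timer_list timer;
        int timed_out;
        struct task_struct *p;
};

static void bounded_wait_fire(unsigned long data)
{
        struct bounded_wait *bw = (struct bounded_wait *)data;

        bw->timed_out = 1;
        wake_up_process(bw->p);
}

static int wait_with_bound(int (*cond_is_true)(void *), void *arg,
                           const struct timespec *ts)
{
        struct bounded_wait bw = { .timed_out = 0, .p = current };

        setup_timer_on_stack(&bw.timer, bounded_wait_fire, (unsigned long)&bw);
        bw.timer.expires = jiffies + timespec_to_jiffies(ts);
        add_timer(&bw.timer);

        while (!cond_is_true(arg) && !bw.timed_out) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!cond_is_true(arg) && !bw.timed_out)
                        schedule();
                __set_current_state(TASK_RUNNING);
        }

        /* Same teardown as read_events(): sync-delete, then destroy. */
        del_singleshot_timer_sync(&bw.timer);
        destroy_timer_on_stack(&bw.timer);

        return bw.timed_out ? -ETIME : 0;
}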
May fail with -EINVAL if *ctxp is not initialized, + * if the specified nr_events exceeds internal limits. May fail + * with -EAGAIN if the specified nr_events exceeds the user's limit + * of available events. May fail with -ENOMEM if insufficient kernel + * resources are available. May fail with -EFAULT if an invalid + * pointer is passed for ctxp. Will fail with -ENOSYS if not + * implemented. + */ +SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) { + put_ioctx(ioctx); + return 0; + } + io_destroy(ioctx); + } + +out: + return ret; +} + +/* sys_io_destroy: + * Destroy the aio_context specified. May cancel any outstanding + * AIOs and block on completion. Will fail with -ENOSYS if not + * implemented. May fail with -EINVAL if the context pointed to + * is invalid. + */ +SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) +{ + struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; + + BUG_ON(ret <= 0); + + while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { + ssize_t this = min((ssize_t)iov->iov_len, ret); + iov->iov_base += this; + iov->iov_len -= this; + iocb->ki_left -= this; + ret -= this; + if (iov->iov_len == 0) { + iocb->ki_cur_seg++; + iov++; + } + } + + /* the caller should not have done more io than what fit in + * the remaining iovecs */ + BUG_ON(ret > 0 && iocb->ki_left == 0); +} + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t (*rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + ssize_t ret = 0; + unsigned short opcode; + + if ((iocb->ki_opcode == IOCB_CMD_PREADV) || + (iocb->ki_opcode == IOCB_CMD_PREAD)) { + rw_op = file->f_op->aio_read; + opcode = IOCB_CMD_PREADV; + } else { + rw_op = file->f_op->aio_write; + opcode = IOCB_CMD_PWRITEV; + } + + /* This matches the pread()/pwrite() logic */ + if (iocb->ki_pos < 0) + return -EINVAL; + + do { + ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + iocb->ki_pos); + if (ret > 0) + aio_advance_iovec(iocb, ret); + + /* retry all partial writes. retry partial reads as long as its a + * regular file. */ + } while (ret > 0 && iocb->ki_left > 0 && + (opcode == IOCB_CMD_PWRITEV || + (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if ((ret == 0) || (iocb->ki_left == 0)) + ret = iocb->ki_nbytes - iocb->ki_left; + + /* If we managed to write some out we return that, rather than + * the eventual error. 
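From user space the syscalls above are normally reached through syscall(2), since glibc does not wrap them. Below is a minimal, self-contained sketch, not part of the patch, that exercises io_setup(), io_submit(), io_getevents() and io_destroy(); "testfile" is an arbitrary example path.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int main(void)
{
        aio_context_t ctx = 0;          /* must be zero before io_setup() */
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        char buf[4096];
        int fd = open("testfile", O_RDONLY);

        if (fd < 0 || syscall(SYS_io_setup, 128, &ctx) < 0)
                return 1;

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_buf = (unsigned long)buf;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_offset = 0;

        if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
                return 1;

        /* Wait for exactly one completion; a NULL timeout blocks indefinitely. */
        if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
                printf("read %lld bytes\n", (long long)ev.res);

        syscall(SYS_io_destroy, ctx);
        close(fd);
        return 0;
}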
*/ + if (opcode == IOCB_CMD_PWRITEV + && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + && iocb->ki_nbytes - iocb->ki_left) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + +static ssize_t aio_fdsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 1); + return ret; +} + +static ssize_t aio_fsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 0); + return ret; +} + +static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +{ + ssize_t ret; + +#ifdef CONFIG_COMPAT + if (compat) + ret = compat_rw_copy_check_uvector(type, + (struct compat_iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); + else +#endif + ret = rw_copy_check_uvector(type, + (struct iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); + if (ret < 0) + goto out; + + ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); + if (ret < 0) + goto out; + + kiocb->ki_nr_segs = kiocb->ki_nbytes; + kiocb->ki_cur_seg = 0; + /* ki_nbytes/left now reflect bytes instead of segs */ + kiocb->ki_nbytes = ret; + kiocb->ki_left = ret; + + ret = 0; +out: + return ret; +} + +static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +{ + int bytes; + + bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); + if (bytes < 0) + return bytes; + + kiocb->ki_iovec = &kiocb->ki_inline_vec; + kiocb->ki_iovec->iov_base = kiocb->ki_buf; + kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_nr_segs = 1; + kiocb->ki_cur_seg = 0; + return 0; +} + +/* + * aio_setup_iocb: + * Performs the initial checks and aio retry method + * setup for the kiocb at the time of io submission. 
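For the vectored opcodes that reach aio_setup_vectored_rw(), the user-side layout differs from the single-buffer case: aio_buf points at an iovec array and aio_nbytes initially carries the segment count, which the code above then converts into a byte count. A hedged user-space sketch, not part of the patch (ctx and fd are assumed to come from io_setup() and open() as in the earlier example):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <linux/aio_abi.h>

static int submit_preadv(aio_context_t ctx, int fd,
                         struct iovec *iov, int nr_segs, long long off)
{
        struct iocb cb, *cbs[1] = { &cb };

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREADV;
        cb.aio_buf = (unsigned long)iov;        /* iovec array, not a data buffer */
        cb.aio_nbytes = nr_segs;                /* segment count for PREADV/PWRITEV */
        cb.aio_offset = off;

        return syscall(SYS_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
}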
+ */ +static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +{ + struct file *file = kiocb->ki_filp; + ssize_t ret = 0; + + switch (kiocb->ki_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, + kiocb->ki_left))) + break; + ret = aio_setup_single_vector(READ, file, kiocb); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_read) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, + kiocb->ki_left))) + break; + ret = aio_setup_single_vector(WRITE, file, kiocb); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_write) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PREADV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = aio_setup_vectored_rw(READ, kiocb, compat); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_read) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PWRITEV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_write) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_FDSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + kiocb->ki_retry = aio_fdsync; + break; + case IOCB_CMD_FSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + kiocb->ki_retry = aio_fsync; + break; + default: + dprintk("EINVAL: io_submit: no operation provided\n"); + ret = -EINVAL; + } + + if (!kiocb->ki_retry) + return ret; + + return 0; +} + +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + struct iocb *iocb, bool compat) +{ + struct kiocb *req; + struct file *file; + ssize_t ret; + + /* enforce forwards compatibility on users */ + if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + pr_debug("EINVAL: io_submit: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: io_submit: overflow check\n"); + return -EINVAL; + } + + file = fget(iocb->aio_fildes); + if (unlikely(!file)) + return -EBADF; + + req = aio_get_req(ctx); /* returns with 2 references to req */ + if (unlikely(!req)) { + fput(file); + return -EAGAIN; + } + req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { + /* + * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an + * instance of the file* now. The file descriptor must be + * an eventfd() fd, and will be signaled for each completed + * event using the eventfd_signal() function. 
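The IOCB_FLAG_RESFD path handled below is driven from user space by attaching an eventfd to the iocb; each completion then adds one to the eventfd counter, so the descriptor can be watched with read(2), poll(2) or epoll before reaping events with io_getevents(). A hedged sketch of that usage, not part of the patch:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static int submit_with_eventfd(aio_context_t ctx, int fd, void *buf,
                               size_t len, int efd)
{
        struct iocb cb, *cbs[1] = { &cb };

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_buf = (unsigned long)buf;
        cb.aio_nbytes = len;
        cb.aio_flags = IOCB_FLAG_RESFD;         /* ask for eventfd signalling */
        cb.aio_resfd = efd;                     /* must be an eventfd descriptor */

        return syscall(SYS_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
}

/* Consumer side: efd = eventfd(0, 0); poll or block, then read the counter. */
static uint64_t completions_ready(int efd)
{
        uint64_t n = 0;

        read(efd, &n, sizeof(n));       /* blocks until at least one completion */
        return n;
}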
+	 */
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+		if (IS_ERR(req->ki_eventfd)) {
+			ret = PTR_ERR(req->ki_eventfd);
+			req->ki_eventfd = NULL;
+			goto out_put_req;
+		}
+	}
+
+	ret = put_user(req->ki_key, &user_iocb->aio_key);
+	if (unlikely(ret)) {
+		dprintk("EFAULT: aio_key\n");
+		goto out_put_req;
+	}
+
+	req->ki_obj.user = user_iocb;
+	req->ki_user_data = iocb->aio_data;
+	req->ki_pos = iocb->aio_offset;
+
+	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
+	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
+	req->ki_opcode = iocb->aio_lio_opcode;
+
+	ret = aio_setup_iocb(req, compat);
+
+	if (ret)
+		goto out_put_req;
+
+	spin_lock_irq(&ctx->ctx_lock);
+	/*
+	 * We could have raced with io_destroy() and are currently holding a
+	 * reference to ctx which should be destroyed. We cannot submit IO
+	 * since ctx gets freed as soon as io_submit() puts its reference. The
+	 * check here is reliable: io_destroy() sets ctx->dead before waiting
+	 * for outstanding IO and the barrier between these two is realized by
+	 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
+	 * increment ctx->reqs_active before checking for ctx->dead and the
+	 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
+	 * don't see ctx->dead set here, io_destroy() waits for our IO to
+	 * finish.
+	 */
+	if (ctx->dead) {
+		spin_unlock_irq(&ctx->ctx_lock);
+		ret = -EINVAL;
+		goto out_put_req;
+	}
+	aio_run_iocb(req);
+	if (!list_empty(&ctx->run_list)) {
+		/* drain the run list */
+		while (__aio_run_iocbs(ctx))
+			;
+	}
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	aio_put_req(req);	/* drop extra ref to req */
+	return 0;
+
+out_put_req:
+	aio_put_req(req);	/* drop extra ref to req */
+	aio_put_req(req);	/* drop i/o ref to req */
+	return ret;
+}
+
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
+{
+	struct kioctx *ctx;
+	long ret = 0;
+	int i;
+	struct blk_plug plug;
+
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+		nr = LONG_MAX/sizeof(*iocbpp);
+
+	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
+		return -EFAULT;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx)) {
+		pr_debug("EINVAL: io_submit: invalid context id\n");
+		return -EINVAL;
+	}
+
+	blk_start_plug(&plug);
+
+	/*
+	 * AKPM: should this return a partial result if some of the IOs were
+	 * successfully submitted?
+	 */
+	for (i=0; i<nr; i++) {
+		struct iocb __user *user_iocb;
+		struct iocb tmp;
+
+		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+		if (ret)
+			break;
+	}
+	blk_finish_plug(&plug);
+
+	put_ioctx(ctx);
+	return i ? i : ret;
+}
+
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is out of range, if the iocb
+ *	at *iocbpp[0] is not properly initialized, or if the operation
+ *	specified is invalid for the file descriptor in the iocb.  May fail
+ *	with -EFAULT if any of the data structures point to invalid data.
+ *	May fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
+/* lookup_kiocb
+ *	Finds a given iocb for cancellation.
+ */
+static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
+				  u32 key)
+{
+	struct list_head *pos;
+
+	assert_spin_locked(&ctx->ctx_lock);
+
+	/* TODO: use a hash or array, this sucks. */
+	list_for_each(pos, &ctx->active_reqs) {
+		struct kiocb *kiocb = list_kiocb(pos);
+		if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+			return kiocb;
+	}
+	return NULL;
+}
+
+/* sys_io_cancel:
+ *	Attempts to cancel an iocb previously passed to io_submit.  If
+ *	the operation is successfully cancelled, the resulting event is
+ *	copied into the memory pointed to by result without being placed
+ *	into the completion queue and 0 is returned.  May fail with
+ *	-EFAULT if any of the data structures pointed to are invalid.
+ *	May fail with -EINVAL if aio_context specified by ctx_id is
+ *	invalid.  May fail with -EAGAIN if the iocb specified was not
+ *	cancelled.  Will fail with -ENOSYS if not implemented.
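A user-space view of the cancellation interface documented above, as a hedged sketch that is not part of the patch: the iocb passed to io_cancel() must be the very same struct iocb handed to io_submit() (the kernel matches both its address and the aio_key it wrote back), and because most requests never install a ki_cancel method the call usually fails, so callers should treat success as the exception.

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static int try_cancel(aio_context_t ctx, struct iocb *cb)
{
        struct io_event ev;

        if (syscall(SYS_io_cancel, ctx, cb, &ev) == 0) {
                /* Cancelled: the terminating event is in ev, not in the ring. */
                printf("cancelled, res=%lld\n", (long long)ev.res);
                return 0;
        }
        /* Typically EAGAIN or EINVAL: the request is running or not cancellable. */
        perror("io_cancel");
        return -1;
}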
+ */ +SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, + struct io_event __user *, result) +{ + int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct kioctx *ctx; + struct kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->ctx_lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->ki_cancel) { + cancel = kiocb->ki_cancel; + kiocb->ki_users ++; + kiocbSetCancelled(kiocb); + } else + cancel = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + if (NULL != cancel) { + struct io_event tmp; + pr_debug("calling cancel\n"); + memset(&tmp, 0, sizeof(tmp)); + tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; + tmp.data = kiocb->ki_user_data; + ret = cancel(kiocb, &tmp); + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &tmp, sizeof(tmp))) + ret = -EFAULT; + } + } else + ret = -EINVAL; + + put_ioctx(ctx); + + return ret; +} + +/* io_getevents: + * Attempts to read at least min_nr events and up to nr events from + * the completion queue for the aio_context specified by ctx_id. If + * it succeeds, the number of read events is returned. May fail with + * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is + * out of range, if timeout is out of range. May fail with -EFAULT + * if any of the memory specified is invalid. May return 0 or + * < min_nr if the timeout specified by timeout has elapsed + * before sufficient events are available, where timeout == NULL + * specifies an infinite timeout. Note that the timeout pointed to by + * timeout is relative and will be updated if not NULL and the + * operation blocks. Will fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct timespec __user *, timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(ioctx)) { + if (likely(min_nr <= nr && min_nr >= 0)) + ret = read_events(ioctx, min_nr, nr, events, timeout); + put_ioctx(ioctx); + } + + asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); + return ret; +} diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c new file mode 100644 index 00000000..c5567cb7 --- /dev/null +++ b/fs/anon_inodes.c @@ -0,0 +1,244 @@ +/* + * fs/anon_inodes.c + * + * Copyright (C) 2007 Davide Libenzi + * + * Thanks to Arnd Bergmann for code review and suggestions. + * More changes for Thomas Gleixner suggestions. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct vfsmount *anon_inode_mnt __read_mostly; +static struct inode *anon_inode_inode; +static const struct file_operations anon_inode_fops; + +/* + * anon_inodefs_dname() is called from d_path(). 
+ */ +static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + dentry->d_name.name); +} + +static const struct dentry_operations anon_inodefs_dentry_operations = { + .d_dname = anon_inodefs_dname, +}; + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +} + +static struct file_system_type anon_inode_fs_type = { + .name = "anon_inodefs", + .mount = anon_inodefs_mount, + .kill_sb = kill_anon_super, +}; + +/* + * nop .set_page_dirty method so that people can use .page_mkwrite on + * anon inodes. + */ +static int anon_set_page_dirty(struct page *page) +{ + return 0; +}; + +static const struct address_space_operations anon_aops = { + .set_page_dirty = anon_set_page_dirty, +}; + +/** + * anon_inode_getfile - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. + * All the files created with anon_inode_getfile() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns the newly created file* or an error pointer. + */ +struct file *anon_inode_getfile(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + int error; + + if (IS_ERR(anon_inode_inode)) + return ERR_PTR(-ENODEV); + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + error = -ENOMEM; + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + /* + * We know the anon_inode inode count is always greater than zero, + * so ihold() is safe. + */ + ihold(anon_inode_inode); + + d_instantiate(path.dentry, anon_inode_inode); + + error = -ENFILE; + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (!file) + goto err_dput; + file->f_mapping = anon_inode_inode->i_mapping; + + file->f_pos = 0; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->f_version = 0; + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(anon_inode_getfile); + +/** + * anon_inode_getfd - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. 
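The usual consumer of this API is a driver that wants to hand user space a descriptor backed by the shared anonymous inode, the way eventfd, timerfd and perf do. A hedged sketch follows; my_state, my_file_fops and my_create_handle are invented names, and only anon_inode_getfd() and its flag handling come from this file.

#include <linux/anon_inodes.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/slab.h>

struct my_state {
        int dummy;      /* whatever per-object bookkeeping the driver needs */
};

static const struct file_operations my_file_fops;       /* read/poll/release... */

static long my_create_handle(unsigned int flags)
{
        struct my_state *st;
        int fd;

        st = kzalloc(sizeof(*st), GFP_KERNEL);
        if (!st)
                return -ENOMEM;

        /*
         * One call allocates an fd plus a file bound to the shared anonymous
         * inode, and stores st in file->private_data for my_file_fops.
         */
        fd = anon_inode_getfd("[my_handle]", &my_file_fops, st,
                              flags & (O_CLOEXEC | O_NONBLOCK));
        if (fd < 0)
                kfree(st);
        return fd;
}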
+ * All the files created with anon_inode_getfd() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns new descriptor or an error code. + */ +int anon_inode_getfd(const char *name, const struct file_operations *fops, + void *priv, int flags) +{ + int error, fd; + struct file *file; + + error = get_unused_fd_flags(flags); + if (error < 0) + return error; + fd = error; + + file = anon_inode_getfile(name, fops, priv, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + return error; +} +EXPORT_SYMBOL_GPL(anon_inode_getfd); + +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. + */ +static struct inode *anon_inode_mkinode(void) +{ + struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode->i_fop = &anon_inode_fops; + + inode->i_mapping->a_ops = &anon_aops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static int __init anon_inode_init(void) +{ + int error; + + error = register_filesystem(&anon_inode_fs_type); + if (error) + goto err_exit; + anon_inode_mnt = kern_mount(&anon_inode_fs_type); + if (IS_ERR(anon_inode_mnt)) { + error = PTR_ERR(anon_inode_mnt); + goto err_unregister_filesystem; + } + anon_inode_inode = anon_inode_mkinode(); + if (IS_ERR(anon_inode_inode)) { + error = PTR_ERR(anon_inode_inode); + goto err_mntput; + } + + return 0; + +err_mntput: + mntput(anon_inode_mnt); +err_unregister_filesystem: + unregister_filesystem(&anon_inode_fs_type); +err_exit: + panic(KERN_ERR "anon_inode_init() failed (%d)\n", error); +} + +fs_initcall(anon_inode_init); + diff --git a/fs/attr.c b/fs/attr.c new file mode 100644 index 00000000..caf2aa52 --- /dev/null +++ b/fs/attr.c @@ -0,0 +1,252 @@ +/* + * linux/fs/attr.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * changes by Thomas Schoebel-Theuer + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * inode_change_ok - check if attribute changes to an inode are allowed + * @inode: inode to check + * @attr: attributes to change + * + * Check if we are allowed to change the attributes contained in @attr + * in the given inode. This includes the normal unix access permission + * checks, as well as checks for rlimits and others. + * + * Should be called as the first thing in ->setattr implementations, + * possibly after taking additional locks. + */ +int inode_change_ok(const struct inode *inode, struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + /* + * First check size constraints. These can't be overriden using + * ATTR_FORCE. + */ + if (ia_valid & ATTR_SIZE) { + int error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + } + + /* If force is set do it anyway. */ + if (ia_valid & ATTR_FORCE) + return 0; + + /* Make sure a caller can chown. 
*/ + if ((ia_valid & ATTR_UID) && + (current_fsuid() != inode->i_uid || + attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + + /* Make sure caller can chgrp. */ + if ((ia_valid & ATTR_GID) && + (current_fsuid() != inode->i_uid || + (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && + !capable(CAP_CHOWN)) + return -EPERM; + + /* Make sure a caller can chmod. */ + if (ia_valid & ATTR_MODE) { + if (!inode_owner_or_capable(inode)) + return -EPERM; + /* Also check the setgid bit! */ + if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : + inode->i_gid) && !capable(CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + + /* Check for setting the inode time. */ + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { + if (!inode_owner_or_capable(inode)) + return -EPERM; + } + + return 0; +} +EXPORT_SYMBOL(inode_change_ok); + +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok must be called with i_mutex held. + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = rlimit(RLIMIT_FSIZE); + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + +/** + * setattr_copy - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * setattr_copy must be called with i_mutex held. + * + * setattr_copy updates the inode's metadata with that specified + * in attr. Noticeably missing is inode size update, which is more complex + * as it requires pagecache updates. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. 
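Taken together, inode_change_ok(), inode_newsize_ok() and setattr_copy() are the building blocks of a minimal ->setattr for a filesystem whose inode storage is the in-core inode itself. A hedged sketch of that pattern (simple_fs_setattr is an invented name; compare simple_setattr() in fs/libfs.c, which notify_change() falls back to when a filesystem provides no ->setattr):

#include <linux/fs.h>
#include <linux/mm.h>

static int simple_fs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Permission and size checks first, as the comment above requires. */
        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        if (attr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, attr->ia_size);

        setattr_copy(inode, attr);      /* uid/gid/times/mode, but not size */
        mark_inode_dirty(inode);
        return 0;
}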
+ */ +void setattr_copy(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} +EXPORT_SYMBOL(setattr_copy); + +int notify_change(struct dentry * dentry, struct iattr * attr) +{ + struct inode *inode = dentry->d_inode; + mode_t mode = inode->i_mode; + int error; + struct timespec now; + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + if ((ia_valid & ATTR_MODE)) { + mode_t amode = attr->ia_mode; + /* Flag setting protected by i_mutex */ + if (is_sxid(amode)) + inode->i_flags &= ~S_NOSEC; + } + + now = current_fs_time(inode->i_sb); + + attr->ia_ctime = now; + if (!(ia_valid & ATTR_ATIME_SET)) + attr->ia_atime = now; + if (!(ia_valid & ATTR_MTIME_SET)) + attr->ia_mtime = now; + if (ia_valid & ATTR_KILL_PRIV) { + attr->ia_valid &= ~ATTR_KILL_PRIV; + ia_valid &= ~ATTR_KILL_PRIV; + error = security_inode_need_killpriv(dentry); + if (error > 0) + error = security_inode_killpriv(dentry); + if (error) + return error; + } + + /* + * We now pass ATTR_KILL_S*ID to the lower level setattr function so + * that the function has the ability to reinterpret a mode change + * that's due to these bits. This adds an implicit restriction that + * no function will ever call notify_change with both ATTR_MODE and + * ATTR_KILL_S*ID set. + */ + if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && + (ia_valid & ATTR_MODE)) + BUG(); + + if (ia_valid & ATTR_KILL_SUID) { + if (mode & S_ISUID) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = (inode->i_mode & ~S_ISUID); + } + } + if (ia_valid & ATTR_KILL_SGID) { + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (!(ia_valid & ATTR_MODE)) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = inode->i_mode; + } + attr->ia_mode &= ~S_ISGID; + } + } + if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) + return 0; + + error = security_inode_setattr(dentry, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE) + down_write(&dentry->d_inode->i_alloc_sem); + + if (inode->i_op->setattr) + error = inode->i_op->setattr(dentry, attr); + else + error = simple_setattr(dentry, attr); + + if (ia_valid & ATTR_SIZE) + up_write(&dentry->d_inode->i_alloc_sem); + + if (!error) + fsnotify_change(dentry, ia_valid); + + return error; +} + +EXPORT_SYMBOL(notify_change); diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig new file mode 100644 index 00000000..1204d638 --- /dev/null +++ b/fs/autofs4/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS4_FS + tristate "Kernel automounter version 4 support (also supports v3)" + help + The automounter is a tool to automatically mount remote file systems + on demand. 
This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + ; you also + want to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs4. You will need to add "alias autofs autofs4" to your + modules configuration file. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile new file mode 100644 index 00000000..a811c1f7 --- /dev/null +++ b/fs/autofs4/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux autofs-filesystem routines. +# + +obj-$(CONFIG_AUTOFS4_FS) += autofs4.o + +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h new file mode 100644 index 00000000..756d3286 --- /dev/null +++ b/fs/autofs4/autofs_i.h @@ -0,0 +1,350 @@ +/* -*- c -*- ------------------------------------------------------------- * + * + * linux/fs/autofs/autofs_i.h + * + * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* Internal header file for autofs */ + +#include +#include +#include +#include +#include + +/* This is the range of ioctl() numbers we claim as ours */ +#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY +#define AUTOFS_IOC_COUNT 32 + +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* #define DEBUG */ + +#ifdef DEBUG +#define DPRINTK(fmt, args...) \ +do { \ + printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) +#else +#define DPRINTK(fmt, args...) do {} while (0) +#endif + +#define AUTOFS_WARN(fmt, args...) \ +do { \ + printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +#define AUTOFS_ERROR(fmt, args...) \ +do { \ + printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +/* Unified info structure. This is pointed to by both the dentry and + inode structures. Each file in the filesystem has an instance of this + structure. It holds a reference to the dentry, so dentries are never + flushed while the file exists. All name lookups are dealt with at the + dentry level, although the filesystem can interfere in the validation + process. Readdir is implemented by traversing the dentry lists. 
*/ +struct autofs_info { + struct dentry *dentry; + struct inode *inode; + + int flags; + + struct completion expire_complete; + + struct list_head active; + int active_count; + + struct list_head expiring; + + struct autofs_sb_info *sbi; + unsigned long last_used; + atomic_t count; + + uid_t uid; + gid_t gid; +}; + +#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ + +struct autofs_wait_queue { + wait_queue_head_t queue; + struct autofs_wait_queue *next; + autofs_wqt_t wait_queue_token; + /* We use the following to see what we are waiting for */ + struct qstr name; + u32 dev; + u64 ino; + uid_t uid; + gid_t gid; + pid_t pid; + pid_t tgid; + /* This is for status reporting upon return */ + int status; + unsigned int wait_ctr; +}; + +#define AUTOFS_SBI_MAGIC 0x6d4a556d + +struct autofs_sb_info { + u32 magic; + int pipefd; + struct file *pipe; + pid_t oz_pgrp; + int catatonic; + int version; + int sub_version; + int min_proto; + int max_proto; + unsigned long exp_timeout; + unsigned int type; + int reghost_enabled; + int needs_reghost; + struct super_block *sb; + struct mutex wq_mutex; + spinlock_t fs_lock; + struct autofs_wait_queue *queues; /* Wait queue pointer */ + spinlock_t lookup_lock; + struct list_head active_list; + struct list_head expiring_list; +}; + +static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +{ + return (struct autofs_sb_info *)(sb->s_fs_info); +} + +static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +{ + return (struct autofs_info *)(dentry->d_fsdata); +} + +/* autofs4_oz_mode(): do we see the man behind the curtain? (The + processes which do manipulations for us in user space sees the raw + filesystem without "magic".) */ + +static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { + return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; +} + +/* Does a dentry have some pending activity? 
*/ +static inline int autofs4_ispending(struct dentry *dentry) +{ + struct autofs_info *inf = autofs4_dentry_ino(dentry); + + if (inf->flags & AUTOFS_INF_PENDING) + return 1; + + if (inf->flags & AUTOFS_INF_EXPIRING) + return 1; + + return 0; +} + +struct inode *autofs4_get_inode(struct super_block *, mode_t); +void autofs4_free_ino(struct autofs_info *); + +/* Expiration */ +int is_autofs4_dentry(struct dentry *); +int autofs4_expire_wait(struct dentry *dentry); +int autofs4_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs4_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); + +/* Operations structures */ + +extern const struct inode_operations autofs4_symlink_inode_operations; +extern const struct inode_operations autofs4_dir_inode_operations; +extern const struct file_operations autofs4_dir_operations; +extern const struct file_operations autofs4_root_operations; +extern const struct dentry_operations autofs4_dentry_operations; + +/* VFS automount flags management functions */ + +static inline void __managed_dentry_set_automount(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_set_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_automount(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_clear_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_transit(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_set_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_transit(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_clear_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +/* Initializing function */ + +int 
autofs4_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); +void autofs4_clean_ino(struct autofs_info *); + +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!pipe->f_op || !pipe->f_op->write) + return -EINVAL; + if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + +/* Queue management functions */ + +int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); +int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); +void autofs4_catatonic_mode(struct autofs_sb_info *); + +static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +{ + return new_encode_dev(sbi->sb->s_dev); +} + +static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +{ + return sbi->sb->s_root->d_inode->i_ino; +} + +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +static inline void __autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + } + return; +} + +static inline void autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static inline void autofs4_del_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c new file mode 100644 index 00000000..de542716 --- /dev/null +++ b/fs/autofs4/dev-ioctl.c @@ -0,0 +1,762 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or other user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at at service shutdown. 
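From user space this control interface is reached by opening the misc device (typically /dev/autofs) and issuing AUTOFS_DEV_IOCTL_* requests with a struct autofs_dev_ioctl parameter block. Below is a hedged sketch of the smallest possible client, a version query; it assumes <linux/auto_dev-ioctl.h> exports the structure and the AUTOFS_DEV_IOCTL_VERSION request number, and is not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
        struct autofs_dev_ioctl param;
        int fd = open("/dev/autofs", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size = sizeof(param);     /* no path appended */
        param.ioctlfd = -1;             /* not needed for the version query */

        if (ioctl(fd, AUTOFS_DEV_IOCTL_VERSION, &param) == 0)
                printf("autofs ioctl interface %u.%u\n",
                       param.ver_major, param.ver_minor);

        close(fd);
        return 0;
}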
+ */ + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, size_t size) +{ + if (memchr(str, 0, size)) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out. + */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || + (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + AUTOFS_WARN("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. + */ +static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp; + + if (copy_from_user(&tmp, in, sizeof(tmp))) + return ERR_PTR(-EFAULT); + + if (tmp.size < sizeof(tmp)) + return ERR_PTR(-EINVAL); + + return memdup_user(in, tmp.size); +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); + return; +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it is terminated and contains at least one "/". + */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err; + + err = check_dev_ioctl_version(cmd, param); + if (err) { + AUTOFS_WARN("invalid device control module version " + "supplied for cmd(0x%08x)", cmd); + goto out; + } + + if (param->size > sizeof(*param)) { + err = invalid_str(param->path, param->size - sizeof(*param)); + if (err) { + AUTOFS_WARN( + "path string terminator missing for cmd(0x%08x)", + cmd); + goto out; + } + + err = check_name(param->path); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. 
+ */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = f->f_path.dentry->d_inode; + sbi = autofs4_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protover.version = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protosubver.sub_version = sbi->sub_version; + return 0; +} + +static int find_autofs_mount(const char *pathname, + struct path *res, + int test(struct path *path, void *data), + void *data) +{ + struct path path; + int err = kern_path(pathname, 0, &path); + if (err) + return err; + err = -ENOENT; + while (path.dentry == path.mnt->mnt_root) { + if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (test(&path, data)) { + path_get(&path); + if (!err) /* already found some */ + path_put(res); + *res = path; + err = 0; + } + } + if (!follow_up(&path)) + break; + } + path_put(&path); + return err; +} + +static int test_by_dev(struct path *path, void *p) +{ + return path->mnt->mnt_sb->s_dev == *(dev_t *)p; +} + +static int test_by_type(struct path *path, void *p) +{ + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + return ino && ino->sbi->type & *(unsigned *)p; +} + +static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + FD_SET(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and device number (aka. new_encode_dev(sb->s_dev)). + */ +static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) +{ + int err, fd; + + fd = get_unused_fd(); + if (likely(fd >= 0)) { + struct file *filp; + struct path path; + + err = find_autofs_mount(name, &path, test_by_dev, &devid); + if (err) + goto out; + + /* + * Find autofs super block that has the device number + * corresponding to the autofs fs we want to open. + */ + + filp = dentry_open(path.dentry, path.mnt, O_RDONLY, + current_cred()); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + autofs_dev_ioctl_fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->openmount.devid) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = new_decode_dev(param->openmount.devid); + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). 
*/ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return sys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->ready.token; + return autofs4_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status ? param->fail.status : -ENOENT; + return autofs4_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). + */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + + if (param->setpipefd.pipefd == -1) + return -EINVAL; + + pipefd = param->setpipefd.pipefd; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe = fget(pipefd); + if (!pipe) { + err = -EBADF; + goto out; + } + if (autofs_prepare_pipe(pipe) < 0) { + err = -EPIPE; + fput(pipe); + goto out; + } + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. + */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs4_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. 
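The READY and FAIL commands above are how the automount daemon answers a wait-queue token it received in an autofs packet. A hedged user-space sketch, not part of the patch: devfd is the open /dev/autofs device, ioctlfd a descriptor on the autofs mount (for example from the OPENMOUNT command), and the AUTOFS_DEV_IOCTL_READY/FAIL request macros are assumed to come from <linux/auto_dev-ioctl.h>.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

static int send_wait_result(int devfd, int ioctlfd,
                            unsigned long token, int status)
{
        struct autofs_dev_ioctl param;

        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size = sizeof(param);
        param.ioctlfd = ioctlfd;

        if (status == 0) {
                param.ready.token = token;      /* mount succeeded */
                return ioctl(devfd, AUTOFS_DEV_IOCTL_READY, &param);
        }
        param.fail.token = token;               /* mount failed */
        param.fail.status = status;             /* e.g. -ENOENT */
        return ioctl(devfd, AUTOFS_DEV_IOCTL_FAIL, &param);
}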
+ */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct path path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + devid = sbi->sb->s_dev; + + param->requester.uid = param->requester.gid = -1; + + err = find_autofs_mount(param->path, &path, test_by_dev, &devid); + if (err) + goto out; + + ino = autofs4_dentry_ino(path.dentry); + if (ino) { + err = 0; + autofs4_expire_wait(path.dentry); + spin_lock(&sbi->fs_lock); + param->requester.uid = ino->uid; + param->requester.gid = ino->gid; + spin_unlock(&sbi->fs_lock); + } + path_put(&path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. + */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct vfsmount *mnt; + int how; + + how = param->expire.how; + mnt = fp->f_path.mnt; + + return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->askumount.may_umount = 0; + if (may_umount(fp->f_path.mnt)) + param->askumount.may_umount = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * lookup the nameidata of the path and check if it is the + * root of a mount. If a type is given we are looking for + * a particular autofs mount and if we don't find a match + * we return fail. If the located nameidata path is the + * root of a mount we return 1 along with the super magic + * of the mount or 0 otherwise. + * + * In both cases the the device number (as returned by + * new_encode_dev()) is also returned. 
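The expire command above is meant to be called in a loop, as its comment says, until -EAGAIN reports that nothing further can be expired. A hedged user-space sketch of that loop, not part of the patch, under the same /dev/autofs and <linux/auto_dev-ioctl.h> assumptions as the earlier sketches; `how` selects the AUTOFS_EXP_* expiry mode.

#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

static int expire_all(int devfd, int ioctlfd, unsigned int how)
{
        struct autofs_dev_ioctl param;
        int expired = 0;

        for (;;) {
                memset(&param, 0, sizeof(param));
                param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
                param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
                param.size = sizeof(param);
                param.ioctlfd = ioctlfd;
                param.expire.how = how;

                if (ioctl(devfd, AUTOFS_DEV_IOCTL_EXPIRE, &param) < 0)
                        return errno == EAGAIN ? expired : -1;
                expired++;      /* one more dentry/mount was expired */
        }
}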
+ */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct path path; + const char *name; + unsigned int type; + unsigned int devid, magic; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + name = param->path; + type = param->ismountpoint.in.type; + + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; + + if (!fp || param->ioctlfd == -1) { + if (autofs_type_any(type)) + err = kern_path(name, LOOKUP_FOLLOW, &path); + else + err = find_autofs_mount(name, &path, test_by_type, &type); + if (err) + goto out; + devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + err = 0; + if (path.mnt->mnt_root == path.dentry) { + err = 1; + magic = path.mnt->mnt_sb->s_magic; + } + } else { + dev_t dev = sbi->sb->s_dev; + + err = find_autofs_mount(name, &path, test_by_dev, &dev); + if (err) + goto out; + + devid = new_encode_dev(dev); + + err = have_submounts(path.dentry); + + if (follow_down_one(&path)) + magic = path.mnt->mnt_sb->s_magic; + } + + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + path_put(&path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), + autofs_dev_ioctl_protover}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), + autofs_dev_ioctl_protosubver}, + {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), + autofs_dev_ioctl_openmount}, + {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), + autofs_dev_ioctl_closemount}, + {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), + autofs_dev_ioctl_ready}, + {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), + autofs_dev_ioctl_fail}, + {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), + autofs_dev_ioctl_setpipefd}, + {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), + autofs_dev_ioctl_catatonic}, + {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), + autofs_dev_ioctl_timeout}, + {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), + autofs_dev_ioctl_requester}, + {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), + autofs_dev_ioctl_expire}, + {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), + autofs_dev_ioctl_askumount}, + {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), + autofs_dev_ioctl_ismountpoint} + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Copy the parameters into kernel space. 
*/ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + /* The validate routine above always sets the version */ + if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) + goto done; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + AUTOFS_WARN("unknown command 0x%08x", command); + return -ENOTTY; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release. + */ + if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + if (!fp->f_op) { + err = -ENOTTY; + fput(fp); + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. + */ + if (!autofs4_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); +done: + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +{ + int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +{ + return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = AUTOFS_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops +}; + +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + +/* Register/deregister misc character device */ +int autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + AUTOFS_ERROR("misc_register failed for control device"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); + return; +} + diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c new file mode 100644 index 00000000..450f529a --- /dev/null +++ b/fs/autofs4/expire.c @@ -0,0 +1,588 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/expire.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ * + * ------------------------------------------------------------------------- */ + +#include "autofs_i.h" + +static unsigned long now; + +/* Check if a dentry can be expired */ +static inline int autofs4_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) +{ + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* dentry in the process of being deleted */ + if (ino == NULL) + return 0; + + if (!do_now) { + /* Too young to die */ + if (!timeout || time_after(ino->last_used + timeout, now)) + return 0; + + /* update last_used here :- + - obviously makes sense if it is in use now + - less obviously, prevents rapid-fire expire + attempts if expire fails the first time */ + ino->last_used = now; + } + return 1; +} + +/* Check a mount point for busyness */ +static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +{ + struct dentry *top = dentry; + struct path path = {.mnt = mnt, .dentry = dentry}; + int status = 1; + + DPRINTK("dentry %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + + path_get(&path); + + if (!follow_down_one(&path)) + goto done; + + if (is_autofs4_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + + /* This is an autofs submount, we can't expire it */ + if (autofs_type_indirect(sbi->type)) + goto done; + + /* + * Otherwise it's an offset mount and we need to check + * if we can umount its mount, if there is one. + */ + if (!d_mountpoint(path.dentry)) { + status = 0; + goto done; + } + } + + /* Update the expiry counter if fs is busy */ + if (!may_umount_tree(path.mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + ino->last_used = jiffies; + goto done; + } + + status = 0; +done: + DPRINTK("returning = %d", status); + path_put(&path); + return status; +} + +/* + * Calculate and dget next entry in the subdirs list under root. + */ +static struct dentry *get_next_positive_subdir(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *q; + + spin_lock(&sbi->lookup_lock); + + if (prev == NULL) { + spin_lock(&root->d_lock); + prev = dget_dlock(root); + next = prev->d_subdirs.next; + p = prev; + goto start; + } + + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_u.d_child.next; +start: + if (next == &root->d_subdirs) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + q = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(q)) { + spin_unlock(&p->d_lock); + p = q; + goto again; + } + dget_dlock(q); + spin_unlock(&q->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return q; +} + +/* + * Calculate and dget next entry in top down tree traversal. 
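+ *
+ * Note on locking: when the walk has to move back up to a parent, the
+ * parent's d_lock is taken with spin_trylock() while the child's lock
+ * is still held (the reverse of the usual parent-then-child order);
+ * on contention everything is dropped, cpu_relax() is called and the
+ * walk restarts from prev rather than risking a deadlock.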
+ */ +static struct dentry *get_next_positive_dentry(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *ret; + + if (prev == NULL) + return dget(root); + + spin_lock(&sbi->lookup_lock); +relock: + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_subdirs.next; + if (next == &p->d_subdirs) { + while (1) { + struct dentry *parent; + + if (p == root) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + parent = p->d_parent; + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&p->d_lock); + cpu_relax(); + goto relock; + } + spin_unlock(&p->d_lock); + next = p->d_u.d_child.next; + p = parent; + if (next != &parent->d_subdirs) + break; + } + } + ret = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(ret)) { + spin_unlock(&p->d_lock); + p = ret; + goto again; + } + dget_dlock(ret); + spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return ret; +} + +/* + * Check a direct mount point for busyness. + * Direct mounts have similar expiry semantics to tree mounts. + * The tree is not busy iff no mountpoints are busy and there are no + * autofs submounts. + */ +static int autofs4_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + DPRINTK("top %p %.*s", + top, (int) top->d_name.len, top->d_name.name); + + /* If it's busy update the expiry counters */ + if (!may_umount_tree(mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + if (ino) + ino->last_used = jiffies; + return 1; + } + + /* Timeout of a direct mount is determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +/* Check a directory tree of mount points for busyness + * The tree is not busy iff no mountpoints are busy + */ +static int autofs4_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct dentry *p; + + DPRINTK("top %p %.*s", + top, (int)top->d_name.len, top->d_name.name); + + /* Negative dentry - give up */ + if (!simple_positive(top)) + return 1; + + p = NULL; + while ((p = get_next_positive_dentry(p, top))) { + DPRINTK("dentry %p %.*s", + p, (int) p->d_name.len, p->d_name.name); + + /* + * Is someone visiting anywhere in the subtree ? + * If there's no mount we need to check the usage + * count for the autofs dentry. + * If the fs is busy update the expiry counter. 
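+	 *
+	 * The d_count checks below compare against the references we can
+	 * account for ourselves: ino->count, the dget taken by
+	 * get_next_positive_dentry(), and one extra for top, which the
+	 * caller already holds; anything above that means some other
+	 * process is currently using the dentry.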
+ */ + if (d_mountpoint(p)) { + if (autofs4_mount_busy(mnt, p)) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } else { + struct autofs_info *ino = autofs4_dentry_ino(p); + unsigned int ino_count = atomic_read(&ino->count); + + /* + * Clean stale dentries below that have not been + * invalidated after a mount fail during lookup + */ + d_invalidate(p); + + /* allow for dget above and top is already dgot */ + if (p == top) + ino_count += 2; + else + ino_count++; + + if (p->d_count > ino_count) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } + } + + /* Timeout of a tree mount is ultimately determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) +{ + struct dentry *p; + + DPRINTK("parent %p %.*s", + parent, (int)parent->d_name.len, parent->d_name.name); + + p = NULL; + while ((p = get_next_positive_dentry(p, parent))) { + DPRINTK("dentry %p %.*s", + p, (int) p->d_name.len, p->d_name.name); + + if (d_mountpoint(p)) { + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, p)) + continue; + + /* Can we expire this guy */ + if (autofs4_can_expire(p, timeout, do_now)) + return p; + } + } + return NULL; +} + +/* Check if we can expire a direct mount (possibly a tree) */ +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = dget(sb->s_root); + int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto out; + if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + struct autofs_info *ino = autofs4_dentry_ino(root); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + return root; + } +out: + spin_unlock(&sbi->fs_lock); + dput(root); + + return NULL; +} + +/* + * Find an eligible tree to time-out + * A tree is eligible if :- + * - it is unused by any user process + * - it has been unused for exp_timeout time + */ +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = sb->s_root; + struct dentry *dentry; + struct dentry *expired = NULL; + int do_now = how & AUTOFS_EXP_IMMEDIATE; + int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino; + unsigned int ino_count; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + dentry = NULL; + while ((dentry = get_next_positive_subdir(dentry, root))) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto next; + + /* + * Case 1: (i) indirect mount or top level pseudo direct mount + * (autofs-4.1). + * (ii) indirect mount with offset mount, check the "/" + * offset (autofs-5.0+). + */ + if (d_mountpoint(dentry)) { + DPRINTK("checking mountpoint %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + + /* Path walk currently on this dentry? 
*/ + ino_count = atomic_read(&ino->count) + 2; + if (dentry->d_count > ino_count) + goto next; + + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, dentry)) + goto next; + + /* Can we expire this guy */ + if (autofs4_can_expire(dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + goto next; + } + + if (simple_empty(dentry)) + goto next; + + /* Case 2: tree mount, expire iff entire tree is not busy */ + if (!exp_leaves) { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (dentry->d_count > ino_count) + goto next; + + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + /* + * Case 3: pseudo direct mount, expire individual leaves + * (autofs-4.1). + */ + } else { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (dentry->d_count > ino_count) + goto next; + + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + if (expired) { + dput(dentry); + goto found; + } + } +next: + spin_unlock(&sbi->fs_lock); + } + return NULL; + +found: + DPRINTK("returning %p %.*s", + expired, (int)expired->d_name.len, expired->d_name.name); + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + spin_lock(&sbi->lookup_lock); + spin_lock(&expired->d_parent->d_lock); + spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + spin_unlock(&expired->d_lock); + spin_unlock(&expired->d_parent->d_lock); + spin_unlock(&sbi->lookup_lock); + return expired; +} + +int autofs4_expire_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + /* Block on any pending expire */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + DPRINTK("waiting for expire %p name=%.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + DPRINTK("expire done status=%d", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + +/* Perform an expiry operation */ +int autofs4_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) +{ + struct autofs_packet_expire pkt; + struct autofs_info *ino; + struct dentry *dentry; + int ret = 0; + + memset(&pkt,0,sizeof pkt); + + pkt.hdr.proto_version = sbi->version; + pkt.hdr.type = autofs_ptype_expire; + + if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) + return -EAGAIN; + + pkt.len = dentry->d_name.len; + memcpy(pkt.name, dentry->d_name.name, pkt.len); + pkt.name[pkt.len] = '\0'; + dput(dentry); + + if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) + ret = -EFAULT; + + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + + return ret; +} + +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) +{ + struct dentry *dentry; + int ret = -EAGAIN; + + if (autofs_type_trigger(sbi->type)) + dentry = autofs4_expire_direct(sb, mnt, sbi, when); + else + dentry = autofs4_expire_indirect(sb, 
mnt, sbi, when); + + if (dentry) { + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* This is synchronous because it makes the daemon a + little easier */ + ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_lock(&dentry->d_lock); + if (!ret) { + if ((IS_ROOT(dentry) || + (autofs_type_indirect(sbi->type) && + IS_ROOT(dentry->d_parent))) && + !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_automount(dentry); + } + spin_unlock(&dentry->d_lock); + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + dput(dentry); + } + + return ret; +} + +/* Call repeatedly until it returns -EAGAIN, meaning there's nothing + more to be done */ +int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int __user *arg) +{ + int do_now = 0; + + if (arg && get_user(do_now, arg)) + return -EFAULT; + + return autofs4_do_expire_multi(sb, mnt, sbi, do_now); +} + diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c new file mode 100644 index 00000000..c038727b --- /dev/null +++ b/fs/autofs4/init.c @@ -0,0 +1,51 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/init.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include "autofs_i.h" + +static struct dentry *autofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, autofs4_fill_super); +} + +static struct file_system_type autofs_fs_type = { + .owner = THIS_MODULE, + .name = "autofs", + .mount = autofs_mount, + .kill_sb = autofs4_kill_sb, +}; + +static int __init init_autofs4_fs(void) +{ + int err; + + err = register_filesystem(&autofs_fs_type); + if (err) + return err; + + autofs_dev_ioctl_init(); + + return err; +} + +static void __exit exit_autofs4_fs(void) +{ + autofs_dev_ioctl_exit(); + unregister_filesystem(&autofs_fs_type); +} + +module_init(init_autofs4_fs) +module_exit(exit_autofs4_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c new file mode 100644 index 00000000..7c26678e --- /dev/null +++ b/fs/autofs4/inode.c @@ -0,0 +1,353 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/inode.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "autofs_i.h" +#include + +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +{ + struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + ino->last_used = jiffies; + ino->sbi = sbi; + } + return ino; +} + +void autofs4_clean_ino(struct autofs_info *ino) +{ + ino->uid = 0; + ino->gid = 0; + ino->last_used = jiffies; +} + +void autofs4_free_ino(struct autofs_info *ino) +{ + kfree(ino); +} + +void autofs4_kill_sb(struct super_block *sb) +{ + struct autofs_sb_info *sbi = autofs4_sbi(sb); + + /* + * In the event of a failure in get_sb_nodev the superblock + * info is not present so nothing else has been setup, so + * just call kill_anon_super when we are called from + * deactivate_super. + */ + if (!sbi) + goto out_kill_sb; + + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); + + sb->s_fs_info = NULL; + kfree(sbi); + +out_kill_sb: + DPRINTK("shutting down"); + kill_litter_super(sb); +} + +static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb); + struct inode *root_inode = mnt->mnt_sb->s_root->d_inode; + + if (!sbi) + return 0; + + seq_printf(m, ",fd=%d", sbi->pipefd); + if (root_inode->i_uid != 0) + seq_printf(m, ",uid=%u", root_inode->i_uid); + if (root_inode->i_gid != 0) + seq_printf(m, ",gid=%u", root_inode->i_gid); + seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); + seq_printf(m, ",minproto=%d", sbi->min_proto); + seq_printf(m, ",maxproto=%d", sbi->max_proto); + + if (autofs_type_offset(sbi->type)) + seq_printf(m, ",offset"); + else if (autofs_type_direct(sbi->type)) + seq_printf(m, ",direct"); + else + seq_printf(m, ",indirect"); + + return 0; +} + +static void autofs4_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + +static const struct super_operations autofs4_sops = { + .statfs = simple_statfs, + .show_options = autofs4_show_options, + .evict_inode = autofs4_evict_inode, +}; + +enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, + Opt_indirect, Opt_direct, Opt_offset}; + +static const match_table_t tokens = { + {Opt_fd, "fd=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pgrp, "pgrp=%u"}, + {Opt_minproto, "minproto=%u"}, + {Opt_maxproto, "maxproto=%u"}, + {Opt_indirect, "indirect"}, + {Opt_direct, "direct"}, + {Opt_offset, "offset"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, + pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + *uid = current_uid(); + *gid = current_gid(); + *pgrp = task_pgrp_nr(current); + + *minproto = AUTOFS_MIN_PROTO_VERSION; + *maxproto = AUTOFS_MAX_PROTO_VERSION; + + *pipefd = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_fd: + if (match_int(args, pipefd)) + return 1; + break; + case Opt_uid: + if (match_int(args, &option)) + return 1; + *uid = option; + break; + case Opt_gid: + if (match_int(args, &option)) + return 1; + *gid = option; + break; + case Opt_pgrp: + if (match_int(args, &option)) + 
return 1; + *pgrp = option; + break; + case Opt_minproto: + if (match_int(args, &option)) + return 1; + *minproto = option; + break; + case Opt_maxproto: + if (match_int(args, &option)) + return 1; + *maxproto = option; + break; + case Opt_indirect: + set_autofs_type_indirect(type); + break; + case Opt_direct: + set_autofs_type_direct(type); + break; + case Opt_offset: + set_autofs_type_offset(type); + break; + default: + return 1; + } + } + return (*pipefd < 0); +} + +int autofs4_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode * root_inode; + struct dentry * root; + struct file * pipe; + int pipefd; + struct autofs_sb_info *sbi; + struct autofs_info *ino; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto fail_unlock; + DPRINTK("starting up, sbi = %p",sbi); + + s->s_fs_info = sbi; + sbi->magic = AUTOFS_SBI_MAGIC; + sbi->pipefd = -1; + sbi->pipe = NULL; + sbi->catatonic = 1; + sbi->exp_timeout = 0; + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->sb = s; + sbi->version = 0; + sbi->sub_version = 0; + set_autofs_type_indirect(&sbi->type); + sbi->min_proto = 0; + sbi->max_proto = 0; + mutex_init(&sbi->wq_mutex); + spin_lock_init(&sbi->fs_lock); + sbi->queues = NULL; + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); + INIT_LIST_HEAD(&sbi->expiring_list); + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = AUTOFS_SUPER_MAGIC; + s->s_op = &autofs4_sops; + s->s_d_op = &autofs4_dentry_operations; + s->s_time_gran = 1; + + /* + * Get the root inode and dentry, but defer checking for errors. + */ + ino = autofs4_new_ino(sbi); + if (!ino) + goto fail_free; + root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + if (!root_inode) + goto fail_ino; + + root = d_alloc_root(root_inode); + if (!root) + goto fail_iput; + pipe = NULL; + + root->d_fsdata = ino; + + /* Can this call block? */ + if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, + &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { + printk("autofs: called with bogus options\n"); + goto fail_dput; + } + + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + + root_inode->i_fop = &autofs4_root_operations; + root_inode->i_op = &autofs4_dir_inode_operations; + + /* Couldn't this be tested earlier? */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + printk("autofs: kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + pipe = fget(pipefd); + + if (!pipe) { + printk("autofs: could not open pipe file descriptor\n"); + goto fail_dput; + } + if (autofs_prepare_pipe(pipe) < 0) + goto fail_fput; + sbi->pipe = pipe; + sbi->pipefd = pipefd; + sbi->catatonic = 0; + + /* + * Success! Install the root dentry now to indicate completion. + */ + s->s_root = root; + return 0; + + /* + * Failure ... clean up. 
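+	 * Each fail_* label below releases only the resources that were
+	 * successfully set up before the failure that jumps to it.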
+ */ +fail_fput: + printk("autofs: pipe file descriptor does not contain proper ops\n"); + fput(pipe); + /* fall through */ +fail_dput: + dput(root); + goto fail_free; +fail_iput: + printk("autofs: get root dentry failed\n"); + iput(root_inode); +fail_ino: + kfree(ino); +fail_free: + kfree(sbi); + s->s_fs_info = NULL; +fail_unlock: + return -EINVAL; +} + +struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) +{ + struct inode *inode = new_inode(sb); + + if (inode == NULL) + return NULL; + + inode->i_mode = mode; + if (sb->s_root) { + inode->i_uid = sb->s_root->d_inode->i_uid; + inode->i_gid = sb->s_root->d_inode->i_gid; + } + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = get_next_ino(); + + if (S_ISDIR(mode)) { + inode->i_nlink = 2; + inode->i_op = &autofs4_dir_inode_operations; + inode->i_fop = &autofs4_dir_operations; + } else if (S_ISLNK(mode)) { + inode->i_op = &autofs4_symlink_inode_operations; + } + + return inode; +} diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c new file mode 100644 index 00000000..f55ae23b --- /dev/null +++ b/fs/autofs4/root.c @@ -0,0 +1,895 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/root.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); +static int autofs4_dir_unlink(struct inode *,struct dentry *); +static int autofs4_dir_rmdir(struct inode *,struct dentry *); +static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); +static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); +#endif +static int autofs4_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); +static struct vfsmount *autofs4_d_automount(struct path *); +static int autofs4_d_manage(struct dentry *, bool); +static void autofs4_dentry_release(struct dentry *); + +const struct file_operations autofs4_root_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, + .unlocked_ioctl = autofs4_root_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = autofs4_root_compat_ioctl, +#endif +}; + +const struct file_operations autofs4_dir_operations = { + .open = autofs4_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +const struct inode_operations autofs4_dir_inode_operations = { + .lookup = autofs4_lookup, + .unlink = autofs4_dir_unlink, + .symlink = autofs4_dir_symlink, + .mkdir = autofs4_dir_mkdir, + .rmdir = autofs4_dir_rmdir, +}; + +const struct dentry_operations autofs4_dentry_operations = { + .d_automount = autofs4_d_automount, + .d_manage = autofs4_d_manage, + .d_release = autofs4_dentry_release, +}; + +static void 
autofs4_add_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!ino->active_count) { + if (list_empty(&ino->active)) + list_add(&ino->active, &sbi->active_list); + } + ino->active_count++; + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static void autofs4_del_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + ino->active_count--; + if (!ino->active_count) { + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + } + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static int autofs4_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("file=%p dentry=%p %.*s", + file, dentry, dentry->d_name.len, dentry->d_name.name); + + if (autofs4_oz_mode(sbi)) + goto out; + + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + return -ENOENT; + } + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + +out: + return dcache_dir_open(inode, file); +} + +static void autofs4_dentry_release(struct dentry *de) +{ + struct autofs_info *ino = autofs4_dentry_ino(de); + struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + + DPRINTK("releasing %p", de); + + if (!ino) + return; + + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + + autofs4_free_ino(ino); +} + +static struct dentry *autofs4_lookup_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&sbi->lookup_lock); + head = &sbi->active_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *active; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, active); + active = ino->dentry; + + spin_lock(&active->d_lock); + + /* Already gone? 
*/ + if (active->d_count == 0) + goto next; + + qstr = &active->d_name; + + if (active->d_name.hash != hash) + goto next; + if (active->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(active)) { + dget_dlock(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + return active; + } +next: + spin_unlock(&active->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&sbi->lookup_lock); + head = &sbi->expiring_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *expiring; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, expiring); + expiring = ino->dentry; + + spin_lock(&expiring->d_lock); + + /* Bad luck, we've already been dentry_iput */ + if (!expiring->d_inode) + goto next; + + qstr = &expiring->d_name; + + if (expiring->d_name.hash != hash) + goto next; + if (expiring->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(expiring)) { + dget_dlock(expiring); + spin_unlock(&expiring->d_lock); + spin_unlock(&sbi->lookup_lock); + return expiring; + } +next: + spin_unlock(&expiring->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static int autofs4_mount_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status = 0; + + if (ino->flags & AUTOFS_INF_PENDING) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("mount wait done status=%d", status); + } + ino->last_used = jiffies; + return status; +} + +static int do_expire_wait(struct dentry *dentry) +{ + struct dentry *expiring; + + expiring = autofs4_lookup_expiring(dentry); + if (!expiring) + return autofs4_expire_wait(dentry); + else { + /* + * If we are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs4_expire_wait(expiring); + autofs4_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs4_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. 
+ */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs4_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never triggers a mount. */ + if (autofs4_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. + */ + status = do_expire_wait(dentry); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having d_mountpoint() true, so there's no need to call back + * to the daemon. + */ + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + goto done; + if (!d_mountpoint(dentry)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 have_submounts() is sufficient to handle + * this because the leaves of the directory tree under the + * mount never trigger mounts themselves (they have an autofs + * trigger mount mounted on them). But v4 pseudo direct mounts + * do need the leaves to to trigger mounts. In this case we + * have no choice but to use the list_empty() check and + * require user space behave. + */ + if (sbi->version > 4) { + if (have_submounts(dentry)) + goto done; + } else { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + goto done; + } + spin_unlock(&dentry->d_lock); + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + } +done: + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path + * updated so clear DCACHE_NEED_AUTOMOUNT so we don't + * call ->d_automount() on rootless multi-mounts since + * it can lead to an incorrect ELOOP error return. + * + * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and + * symlinks as in all other cases the dentry will be covered by + * an actual mount so ->d_automount() won't be called during + * the follow. 
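+	 *
+	 * (DCACHE_NEED_AUTOMOUNT is what makes the VFS call ->d_automount()
+	 * from follow_managed() during a path walk, so clearing it here
+	 * simply stops further automount callbacks for this dentry.)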
+ */ + spin_lock(&dentry->d_lock); + if ((!d_mountpoint(dentry) && + !list_empty(&dentry->d_subdirs)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->fs_lock); + + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs4_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); + + return NULL; +} + +int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never waits. */ + if (autofs4_oz_mode(sbi)) { + if (rcu_walk) + return 0; + if (!d_mountpoint(dentry)) + return -EISDIR; + return 0; + } + + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + + /* Wait for pending expires */ + do_expire_wait(dentry); + + /* + * This dentry may be under construction so wait on mount + * completion. + */ + return autofs4_mount_wait(dentry); +} + +/* Lookups in the root directory */ +static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct autofs_sb_info *sbi; + struct autofs_info *ino; + struct dentry *active; + + DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); + + /* File name too long to exist */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + sbi = autofs4_sbi(dir->i_sb); + + DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); + + active = autofs4_lookup_active(dentry); + if (active) { + return active; + } else { + /* + * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return fail immediately. The daemon however does need + * to create directories within the file system. + */ + if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); + + /* Mark entries in the root as mount triggers */ + if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + __managed_dentry_set_managed(dentry); + + ino = autofs4_new_ino(sbi); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + autofs4_add_active(dentry); + + d_instantiate(dentry, NULL); + } + return NULL; +} + +static int autofs4_dir_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + size_t size = strlen(symname); + char *cp; + + DPRINTK("%s <- %.*s", symname, + dentry->d_name.len, dentry->d_name.name); + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + BUG_ON(!ino); + + autofs4_clean_ino(ino); + + autofs4_del_active(dentry); + + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) + return -ENOMEM; + + strcpy(cp, symname); + + inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + if (!inode) { + kfree(cp); + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } + inode->i_private = cp; + inode->i_size = size; + d_add(dentry, inode); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_inc(&p_ino->count); + + dir->i_mtime = CURRENT_TIME; + + return 0; +} + +/* + * NOTE! 
+ * + * Normal filesystems would do a "d_delete()" to tell the VFS dcache + * that the file no longer exists. However, doing that means that the + * VFS layer can turn the dentry into a negative dentry. We don't want + * this, because the unlink is probably the result of an expire. + * We simply d_drop it and add it to a expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. + * + * If a process is blocked on the dentry waiting for the expire to finish, + * it will invalidate the dentry and try to mount with a new one. + * + * Also see autofs4_dir_rmdir().. + */ +static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + + /* This allows root to remove symlinks */ + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + + dentry->d_inode->i_size = 0; + clear_nlink(dentry->d_inode); + + dir->i_mtime = CURRENT_TIME; + + spin_lock(&sbi->lookup_lock); + __autofs4_add_expiring(dentry); + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + + return 0; +} + +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is necessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks. 
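+ *
+ * For example (illustrative layout only): a v4 multi-mount entry for a
+ * key "pkg" in an indirect mount on /mnt, with offsets /usr and
+ * /usr/bin, ends up with the trigger flags set on the leaf
+ * /mnt/pkg/usr/bin, while the intermediate /mnt/pkg/usr has them
+ * cleared again once it gains a child, so path walks only call back
+ * into autofs at the leaves.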
+ */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentrys in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); + return; +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentrys in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_u.d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); + return; +} + +static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + + DPRINTK("dentry %p, removing %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + return -ENOTEMPTY; + } + __autofs4_add_expiring(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + dentry->d_inode->i_size = 0; + clear_nlink(dentry->d_inode); + + if (dir->i_nlink) + drop_nlink(dir); + + return 0; +} + +static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + DPRINTK("dentry %p, creating %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + BUG_ON(!ino); + + autofs4_clean_ino(ino); + + autofs4_del_active(dentry); + + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + if (!inode) + return -ENOMEM; + d_add(dentry, inode); + + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_inc(&p_ino->count); + inc_nlink(dir); + dir->i_mtime = CURRENT_TIME; + + return 0; +} + +/* Get/set timeout ioctl() operation */ +#ifdef CONFIG_COMPAT +static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, + compat_ulong_t __user *p) +{ + int rv; + unsigned long ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > UINT_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} +#endif + +static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, + unsigned long __user *p) +{ + int rv; + unsigned long 
ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > ULONG_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} + +/* Return protocol version */ +static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) +{ + return put_user(sbi->version, p); +} + +/* Return protocol sub version */ +static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) +{ + return put_user(sbi->sub_version, p); +} + +/* +* Tells the daemon whether it can umount the autofs mount. +*/ +static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +{ + int status = 0; + + if (may_umount(mnt)) + status = 1; + + DPRINTK("returning %d", status); + + status = put_user(status, p); + + return status; +} + +/* Identify autofs4_dentries - this is so we can tell if there's + an extra dentry refcount or not. We only hold a refcount on the + dentry if its non-negative (ie, d_inode != NULL) +*/ +int is_autofs4_dentry(struct dentry *dentry) +{ + return dentry && dentry->d_inode && + dentry->d_op == &autofs4_dentry_operations && + dentry->d_fsdata != NULL; +} + +/* + * ioctl()'s on the root directory is the chief method for the daemon to + * generate kernel reactions + */ +static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + void __user *p = (void __user *)arg; + + DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", + cmd,arg,sbi,task_pgrp_nr(current)); + + if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) + return -ENOTTY; + + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch(cmd) { + case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ + return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); + case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ + return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); + case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ + autofs4_catatonic_mode(sbi); + return 0; + case AUTOFS_IOC_PROTOVER: /* Get protocol version */ + return autofs4_get_protover(sbi, p); + case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ + return autofs4_get_protosubver(sbi, p); + case AUTOFS_IOC_SETTIMEOUT: + return autofs4_get_set_timeout(sbi, p); +#ifdef CONFIG_COMPAT + case AUTOFS_IOC_SETTIMEOUT32: + return autofs4_compat_get_set_timeout(sbi, p); +#endif + + case AUTOFS_IOC_ASKUMOUNT: + return autofs4_ask_umount(filp->f_path.mnt, p); + + /* return a single thing to expire */ + case AUTOFS_IOC_EXPIRE: + return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); + /* same as above, but can send multiple expires through pipe */ + case AUTOFS_IOC_EXPIRE_MULTI: + return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); + + default: + return -ENOSYS; + } +} + +static long autofs4_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); +} + +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret; + + if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + else + ret 
= autofs4_root_ioctl_unlocked(inode, filp, cmd, + (unsigned long)compat_ptr(arg)); + + return ret; +} +#endif diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c new file mode 100644 index 00000000..f27c094a --- /dev/null +++ b/fs/autofs4/symlink.c @@ -0,0 +1,24 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/symlink.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include "autofs_i.h" + +static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, dentry->d_inode->i_private); + return NULL; +} + +const struct inode_operations autofs4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = autofs4_follow_link +}; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c new file mode 100644 index 00000000..813ea10f --- /dev/null +++ b/fs/autofs4/waitq.c @@ -0,0 +1,552 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/waitq.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "autofs_i.h" + +/* We make this a static variable rather than a part of the superblock; it + is better if we don't reassign numbers easily even across filesystems */ +static autofs_wqt_t autofs4_next_wait_queue = 1; + +/* These are the signals we allow interrupting a pending mount */ +#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) + +void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +{ + struct autofs_wait_queue *wq, *nwq; + + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + + DPRINTK("entering catatonic mode"); + + sbi->catatonic = 1; + wq = sbi->queues; + sbi->queues = NULL; /* Erase all wait queues */ + while (wq) { + nwq = wq->next; + wq->status = -ENOENT; /* Magic is gone - report failure */ + if (wq->name.name) { + kfree(wq->name.name); + wq->name.name = NULL; + } + wq->wait_ctr--; + wake_up_interruptible(&wq->queue); + wq = nwq; + } + fput(sbi->pipe); /* Close the pipe */ + sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); +} + +static int autofs4_write(struct file *file, const void *addr, int bytes) +{ + unsigned long sigpipe, flags; + mm_segment_t fs; + const char *data = (const char *)addr; + ssize_t wr = 0; + + /** WARNING: this is not safe for writing more than PIPE_BUF bytes! 
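+	 ** (pipe writes of at most PIPE_BUF bytes are atomic, and all the
+	 ** packets written by autofs4_notify_daemon() below are much
+	 ** smaller than PIPE_BUF, so they reach the daemon without being
+	 ** interleaved with other packets)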
**/
+
+	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
+
+	/* Save pointer to user space and point back to kernel space */
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	while (bytes &&
+	       (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
+		data += wr;
+		bytes -= wr;
+	}
+
+	set_fs(fs);
+
+	/* Keep the currently executing process from receiving a
+	   SIGPIPE unless it was already supposed to get one */
+	if (wr == -EPIPE && !sigpipe) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		sigdelset(&current->pending.signal, SIGPIPE);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+
+	return (bytes > 0);
+}
+
+static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
+				 struct autofs_wait_queue *wq,
+				 int type)
+{
+	union {
+		struct autofs_packet_hdr hdr;
+		union autofs_packet_union v4_pkt;
+		union autofs_v5_packet_union v5_pkt;
+	} pkt;
+	struct file *pipe = NULL;
+	size_t pktsz;
+
+	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
+		wq->wait_queue_token, wq->name.len, wq->name.name, type);
+
+	memset(&pkt,0,sizeof pkt); /* For security reasons */
+
+	pkt.hdr.proto_version = sbi->version;
+	pkt.hdr.type = type;
+	switch (type) {
+	/* Kernel protocol v4 missing and expire packets */
+	case autofs_ptype_missing:
+	{
+		struct autofs_packet_missing *mp = &pkt.v4_pkt.missing;
+
+		pktsz = sizeof(*mp);
+
+		mp->wait_queue_token = wq->wait_queue_token;
+		mp->len = wq->name.len;
+		memcpy(mp->name, wq->name.name, wq->name.len);
+		mp->name[wq->name.len] = '\0';
+		break;
+	}
+	case autofs_ptype_expire_multi:
+	{
+		struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+
+		pktsz = sizeof(*ep);
+
+		ep->wait_queue_token = wq->wait_queue_token;
+		ep->len = wq->name.len;
+		memcpy(ep->name, wq->name.name, wq->name.len);
+		ep->name[wq->name.len] = '\0';
+		break;
+	}
+	/*
+	 * Kernel protocol v5 packet for handling indirect and direct
+	 * mount missing and expire requests
+	 */
+	case autofs_ptype_missing_indirect:
+	case autofs_ptype_expire_indirect:
+	case autofs_ptype_missing_direct:
+	case autofs_ptype_expire_direct:
+	{
+		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+
+		pktsz = sizeof(*packet);
+
+		packet->wait_queue_token = wq->wait_queue_token;
+		packet->len = wq->name.len;
+		memcpy(packet->name, wq->name.name, wq->name.len);
+		packet->name[wq->name.len] = '\0';
+		packet->dev = wq->dev;
+		packet->ino = wq->ino;
+		packet->uid = wq->uid;
+		packet->gid = wq->gid;
+		packet->pid = wq->pid;
+		packet->tgid = wq->tgid;
+		break;
+	}
+	default:
+		printk("autofs4_notify_daemon: bad type %d!\n", type);
+		return;
+	}
+
+	/* Check if we have become catatonic */
+	mutex_lock(&sbi->wq_mutex);
+	if (!sbi->catatonic) {
+		pipe = sbi->pipe;
+		get_file(pipe);
+	}
+	mutex_unlock(&sbi->wq_mutex);
+
+	if (pipe) {
+		if (autofs4_write(pipe, &pkt, pktsz))
+			autofs4_catatonic_mode(sbi);
+		fput(pipe);
+	}
+}
+
+static int autofs4_getpath(struct autofs_sb_info *sbi,
+			   struct dentry *dentry, char **name)
+{
+	struct dentry *root = sbi->sb->s_root;
+	struct dentry *tmp;
+	char *buf;
+	char *p;
+	int len;
+	unsigned seq;
+
+rename_retry:
+	buf = *name;
+	len = 0;
+
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	spin_lock(&sbi->fs_lock);
+	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
+		len += tmp->d_name.len + 1;
+
+	if (!len || --len > NAME_MAX) {
+		spin_unlock(&sbi->fs_lock);
+		rcu_read_unlock();
+		if (read_seqretry(&rename_lock, seq))
+			goto rename_retry;
+		return 0;
+	}
+
+	*(buf + len) = '\0';
+	p = buf + len - dentry->d_name.len;
+	strncpy(p, dentry->d_name.name, dentry->d_name.len);
dentry->d_name.name, dentry->d_name.len); + + for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { + *(--p) = '/'; + p -= tmp->d_name.len; + strncpy(p, tmp->d_name.name, tmp->d_name.len); + } + spin_unlock(&sbi->fs_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; + + return len; +} + +static struct autofs_wait_queue * +autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) +{ + struct autofs_wait_queue *wq; + + for (wq = sbi->queues; wq; wq = wq->next) { + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) + break; + } + return wq; +} + +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to idicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + struct qstr *qstr, + struct dentry*dentry, enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct autofs_info *ino; + + /* Wait in progress, continue; */ + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + + *wait = NULL; + + /* If we don't yet have any info this is a new request */ + ino = autofs4_dentry_ino(dentry); + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { + /* + * Either we've betean the pending expire to post it's + * wait or it finished while we waited on the mutex. + * So we need to wait till either, the wait appears + * or the expire finishes. + */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) + return -EINTR; + + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + } + + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depend on the + * return status of the wait. + */ + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + int valid = 1; + + /* + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at it's base) we can + * continue on and create a new request. + */ + if (!IS_ROOT(dentry)) { + if (dentry->d_inode && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } + if (have_submounts(dentry)) + valid = 0; + + if (new) + dput(new); + return valid; + } + + return 1; +} + +int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, + enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct qstr qstr; + char *name; + int status, ret, type; + + /* In catatonic mode, we don't wait for nobody */ + if (sbi->catatonic) + return -ENOENT; + + if (!dentry->d_inode) { + /* + * A wait for a negative dentry is invalid for certain + * cases. 
A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentrys + * in the root of the autofs file system may be negative. + */ + if (autofs_type_trigger(sbi->type)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + qstr.len = sprintf(name, "%p", dentry); + else { + qstr.len = autofs4_getpath(sbi, dentry, &name); + if (!qstr.len) { + kfree(name); + return -ENOENT; + } + } + qstr.name = name; + qstr.hash = full_name_hash(name, qstr.len); + + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); + return -EINTR; + } + + ret = validate_request(&wq, sbi, &qstr, dentry, notify); + if (ret <= 0) { + if (ret == 0) + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + return ret; + } + + if (!wq) { + /* Create a new wait queue */ + wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); + if (!wq) { + kfree(qstr.name); + mutex_unlock(&sbi->wq_mutex); + return -ENOMEM; + } + + wq->wait_queue_token = autofs4_next_wait_queue; + if (++autofs4_next_wait_queue == 0) + autofs4_next_wait_queue = 1; + wq->next = sbi->queues; + sbi->queues = wq; + init_waitqueue_head(&wq->queue); + memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->dev = autofs4_get_dev(sbi); + wq->ino = autofs4_get_ino(sbi); + wq->uid = current_uid(); + wq->gid = current_gid(); + wq->pid = current->pid; + wq->tgid = current->tgid; + wq->status = -EINTR; /* Status return if interrupted */ + wq->wait_ctr = 2; + mutex_unlock(&sbi->wq_mutex); + + if (sbi->version < 5) { + if (notify == NFY_MOUNT) + type = autofs_ptype_missing; + else + type = autofs_ptype_expire_multi; + } else { + if (notify == NFY_MOUNT) + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_missing_direct : + autofs_ptype_missing_indirect; + else + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_expire_direct : + autofs_ptype_expire_indirect; + } + + DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + + /* autofs4_notify_daemon() may block */ + autofs4_notify_daemon(sbi, wq, type); + } else { + wq->wait_ctr++; + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + } + + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ + if (wq->name.name) { + /* Block all but "shutdown" signals while waiting */ + sigset_t oldset; + unsigned long irqflags; + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + oldset = current->blocked; + siginitsetinv(¤t->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); + + wait_event_interruptible(wq->queue, wq->name.name == NULL); + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + current->blocked = oldset; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); + } else { + DPRINTK("skipped sleeping"); + } + + status = wq->status; + + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. 
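 *
 * (For reference, the user-space half of this handshake is roughly the
 * following sketch; it is not part of this patch. pipefd is the pipe the
 * daemon handed over at mount time, ioctlfd is a descriptor opened on the
 * automount point, and do_mount() stands in for however the daemon
 * satisfies the request; an expire packet would be answered by unmounting
 * instead. The packet layout comes from <linux/auto_fs4.h>.
 *
 *	struct autofs_v5_packet pkt;
 *
 *	if (read(pipefd, &pkt, sizeof(pkt)) > 0) {
 *		int ok = (do_mount(pkt.name) == 0);
 *
 *		ioctl(ioctlfd, ok ? AUTOFS_IOC_READY : AUTOFS_IOC_FAIL,
 *		      pkt.wait_queue_token);
 *	}
 *
 * The ioctl ends up in autofs4_wait_release() below, which records the
 * status for this token and wakes the waiter.)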
This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon resatart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps. + */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs4_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs4_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + + /* Are we the last process to need status? */ + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return status; +} + + +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +{ + struct autofs_wait_queue *wq, **wql; + + mutex_lock(&sbi->wq_mutex); + for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { + if (wq->wait_queue_token == wait_queue_token) + break; + } + + if (!wq) { + mutex_unlock(&sbi->wq_mutex); + return -EINVAL; + } + + *wql = wq->next; /* Unlink from chain */ + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ + wq->status = status; + wake_up_interruptible(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return 0; +} + diff --git a/fs/bad_inode.c b/fs/bad_inode.c new file mode 100644 index 00000000..bfcb18fe --- /dev/null +++ b/fs/bad_inode.c @@ -0,0 +1,360 @@ +/* + * linux/fs/bad_inode.c + * + * Copyright (C) 1997, Stephen Tweedie + * + * Provide stub functions for unreadable inodes + * + * Fabian Frederick : August 2003 - All file operations assigned to EIO + */ + +#include +#include +#include +#include +#include +#include + + +static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) +{ + return -EIO; +} + +static ssize_t bad_file_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return -EIO; +} + +static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + return -EIO; +} + +static unsigned int bad_file_poll(struct file *filp, poll_table *wait) +{ + return POLLERR; +} + +static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + return -EIO; +} + +static long bad_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EIO; +} + +static int bad_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + return -EIO; +} + +static int bad_file_open(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_flush(struct file *file, fl_owner_t id) +{ + return -EIO; +} + +static int bad_file_release(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_fsync(struct file *file, int datasync) +{ + return -EIO; +} + +static int bad_file_aio_fsync(struct kiocb 
*iocb, int datasync) +{ + return -EIO; +} + +static int bad_file_fasync(int fd, struct file *filp, int on) +{ + return -EIO; +} + +static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_sendpage(struct file *file, struct page *page, + int off, size_t len, loff_t *pos, int more) +{ + return -EIO; +} + +static unsigned long bad_file_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return -EIO; +} + +static int bad_file_check_flags(int flags) +{ + return -EIO; +} + +static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags) +{ + return -EIO; +} + +static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + return -EIO; +} + +static const struct file_operations bad_file_ops = +{ + .llseek = bad_file_llseek, + .read = bad_file_read, + .write = bad_file_write, + .aio_read = bad_file_aio_read, + .aio_write = bad_file_aio_write, + .readdir = bad_file_readdir, + .poll = bad_file_poll, + .unlocked_ioctl = bad_file_unlocked_ioctl, + .compat_ioctl = bad_file_compat_ioctl, + .mmap = bad_file_mmap, + .open = bad_file_open, + .flush = bad_file_flush, + .release = bad_file_release, + .fsync = bad_file_fsync, + .aio_fsync = bad_file_aio_fsync, + .fasync = bad_file_fasync, + .lock = bad_file_lock, + .sendpage = bad_file_sendpage, + .get_unmapped_area = bad_file_get_unmapped_area, + .check_flags = bad_file_check_flags, + .flock = bad_file_flock, + .splice_write = bad_file_splice_write, + .splice_read = bad_file_splice_read, +}; + +static int bad_inode_create (struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + return -EIO; +} + +static struct dentry *bad_inode_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return ERR_PTR(-EIO); +} + +static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, + const char *symname) +{ + return -EIO; +} + +static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, + int mode) +{ + return -EIO; +} + +static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + return -EIO; +} + +static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + return -EIO; +} + +static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + return -EIO; +} + +static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) +{ + return -EIO; +} + +static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + return -EIO; +} + +static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +{ + return -EIO; +} + +static int bad_inode_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EIO; +} + +static ssize_t bad_inode_getxattr(struct dentry *dentry, const 
char *name, + void *buffer, size_t size) +{ + return -EIO; +} + +static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EIO; +} + +static int bad_inode_removexattr(struct dentry *dentry, const char *name) +{ + return -EIO; +} + +static const struct inode_operations bad_inode_ops = +{ + .create = bad_inode_create, + .lookup = bad_inode_lookup, + .link = bad_inode_link, + .unlink = bad_inode_unlink, + .symlink = bad_inode_symlink, + .mkdir = bad_inode_mkdir, + .rmdir = bad_inode_rmdir, + .mknod = bad_inode_mknod, + .rename = bad_inode_rename, + .readlink = bad_inode_readlink, + /* follow_link must be no-op, otherwise unmounting this inode + won't work */ + /* put_link returns void */ + /* truncate returns void */ + .permission = bad_inode_permission, + .getattr = bad_inode_getattr, + .setattr = bad_inode_setattr, + .setxattr = bad_inode_setxattr, + .getxattr = bad_inode_getxattr, + .listxattr = bad_inode_listxattr, + .removexattr = bad_inode_removexattr, + /* truncate_range returns void */ +}; + + +/* + * When a filesystem is unable to read an inode due to an I/O error in + * its read_inode() function, it can call make_bad_inode() to return a + * set of stubs which will return EIO errors as required. + * + * We only need to do limited initialisation: all other fields are + * preinitialised to zero automatically. + */ + +/** + * make_bad_inode - mark an inode bad due to an I/O error + * @inode: Inode to mark bad + * + * When an inode cannot be read due to a media or remote network + * failure this function makes the inode "bad" and causes I/O operations + * on it to fail from this point on. + */ + +void make_bad_inode(struct inode *inode) +{ + remove_inode_hash(inode); + + inode->i_mode = S_IFREG; + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + inode->i_op = &bad_inode_ops; + inode->i_fop = &bad_file_ops; +} +EXPORT_SYMBOL(make_bad_inode); + +/* + * This tests whether an inode has been flagged as bad. The test uses + * &bad_inode_ops to cover the case of invalidated inodes as well as + * those created by make_bad_inode() above. + */ + +/** + * is_bad_inode - is an inode errored + * @inode: inode to test + * + * Returns true if the inode in question has been marked as bad. + */ + +int is_bad_inode(struct inode *inode) +{ + return (inode->i_op == &bad_inode_ops); +} + +EXPORT_SYMBOL(is_bad_inode); + +/** + * iget_failed - Mark an under-construction inode as dead and release it + * @inode: The inode to discard + * + * Mark an under-construction inode as dead and release it. + */ +void iget_failed(struct inode *inode) +{ + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); +} +EXPORT_SYMBOL(iget_failed); diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog new file mode 100644 index 00000000..75a461cf --- /dev/null +++ b/fs/befs/ChangeLog @@ -0,0 +1,417 @@ +Version 0.92 (2002-03-29) +========== +* Minor cleanup. Ran Lindent on the sources. + +Version 0.92 (2002-03-27) +========== +* Fixed module makefile problem. It was not compiling all the correct + source files! +* Removed duplicated function definition +* Fixed potential null pointer dereference when reporting an error + +Version 0.91 (2002-03-26) +========== +* Oy! Fixed stupid bug that would cause an unresolved symbol error. + Thanks to Laszlo Boszormenyi for pointing this out to me. + +Version 0.9 (2002-03-14) +========== +* Added Sergey S. Kostyliov's patch to eliminate memcpy() overhead + from b+tree operations. 
Changes the befs_read_datastream() interface. + +* Segregated the functions that interface directly with the linux vfs + interface into their own file called linuxvfs.c. [WD] + +Version 0.64 (2002-02-07) +========== +* Did the string comparison really right this time (btree.c) [WD] + +* Fixed up some places where I assumed that a long int could hold + a pointer value. (btree.c) [WD] + +* Andrew Farnham pointed out that the module + wouldn't work on older (<2.4.10) kernels due to an unresolved symbol. + This is bad, since 2.4.9 is still the current RedHat kernel. I added + a workaround for this problem (compatibility.h) [WD] + +* Sergey S. Kostyliov made befs_find_key() use a binary search to find + keys within btree nodes, rather than the linear search we were using + before. (btree.c) [Sergey S. Kostyliov ] + +* Made a debian package of the source for use with kernel-package. [WD] + + +Version 0.63 (2002-01-31) +========== +* Fixed bug in befs_find_brun_indirect() that would result in the wrong + block being read. It was introduced when adding byteswapping in + 0.61. (datastream.c) [WD] + +* Fixed a longstanding bug in befs_find_key() that would result in it + finding the first key that is a substring of the string it is searching + for. For example, this would cause files in the same directory with + names like file1 and file2 to mysteriously be duplicates of each other + (because they have the same inode number). Many thanks to Pavel Roskin + for reporting this serious bug!!! + (btree.c) [WD] + +* Added support for long symlinks, after Axel Dorfler explained up how + they work. I had forgotten all about them. (inode.c, symlink.c) [WD] + +* Documentation improvements in source. [WD] + +* Makefile fix for independent module when CONFIG_MODVERSION is set in + kernel config [Pavel Roskin ] + +* Compile warning fix for namei.c. [Sergey S. Kostyliov ] + + +Version 0.62 +========== +* Fixed makefile for module install [WD] + + +Version 0.61 (2002-01-20) +========== +* Made functions in endian.h to do the correct byteswapping, no matter + the arch. [WD] + +* Abbandoned silly checks for a NULL superblock pointer in debug.c. [WD] + +* Misc code cleanups. Also cleanup of this changelog file. [WD] + +* Added byteswapping to all metadata reads from disk. + Uses the functions from endian.h [WD] + +* Remove the typedef of struct super_block to vfs_sb, as it offended + certain peoples' aesthetic sense. [WD] + +* Ditto with the befs_read_block() interface. [WD] + + +Version 0.6 (2001-12-15) +========== +* Cleanup of NLS functions (util.c) [WD] + +* Make directory lookup/read use the NLS if an iocharset is provided. [WD] + +* Fixed stupid bug where specifying the uid or gid mount options as '0' + would result in the filesystem using the on-disk uid and gid. [WD] + +* Added mount option to control debug printing. + The option is, simply enough, 'debug'. + (super.c, debug.c) [WD] + +* Removed notion of btree handle from btree.c. It was unnecessary, as the + linux VFS doesn't allow us to keep any state between calls. Updated + dir.c, namei.c befs_fs.h to account for it. [WD] + +* Improved handleing of overflow nodes when listing directories. + Now works for overflow nodes hanging off of nodes other than the root + node. This is the cleaner solution to Brent Miszalaski's problem. [WD] + +* Added new debug/warning/error print functions in debug.c. + More flexible. Will soon be controllable at mount time + (see TODO). [WD] + +* Rewrote datastream position lookups. 
+ (datastream.c) [WD] + +* Moved the TODO list to its own file. + + +Version 0.50 (2001-11-13) +========== +* Added workaround for mis-understanding of the nature of the b+trees used + in directories. A cleaner solution will come after I've thought about it + for a while. Thanks to Brent Miszalaski for finding and reporting this bug. + (btree.c) [WD] + +* Minor cleanups + +* Added test for "impossible" condition of empty internal nodes in + seekleaf() in btree.c [WD] + +* Implemented the abstracted read_block() in io.c [WD] + +* Cleaned up the inode validation in inode.c [WD] + +* Anton Altaparmakov figured out (by asking Linus :) ) what was causing the + hanging disk io problem. It turns out you need to have the sync_pages + callback defined in your address_space_ops, even if it just uses the + default linux-supplied implementation. Fixed. Works now. + (file.c) [WD] + +* Anton Altaparmakov and Christoph Hellwig alerted me to the fact that + filesystem code should be using GFP_NOFS instead of GFP_KERNEL as the + priority parameter to kmalloc(). Fixed. + (datastream.c, btree.c super.c inode.c) [WD] + +* Anton also told me that the blocksize is not allowed to be larger than + the page size in linux, which is 4k i386. Oops. Added a test for + (blocksize > PAGE_SIZE), and refuse to mount in that case. What this + practically means is that 8k blocksize volumes won't work without a major + restructuring of the driver (or an alpha or other 64bit hardware). [WD] + +* Cleaned up the befs_count_blocks() function. Much smarter now. + And somewhat smaller too. [WD] + +* Made inode allocations use a slab cache + (super.c inode.c) [WD] + +* Moved the freeing of the private inode section from put_inode() to + clear_inode(). This fixes a potential free twice type bug. Put_inode() + can be called multiple times for each inode struct. [WD] + +* Converted all non vfs-callback functions to use befs_sb_info as the + superblock type, rather than struct super_block. This is for + portablity. [WD] + +* Fixed a couple of compile warnings due to use of malloc.h, when slab.h + is the new way. (inode.c, super.c) [WD] + +* Fixed erronous includes of linux/befs_fs_i.h and linux/befs_fs_sb.h + in inode.c [WD] + +Version 0.45 (2001-10-29) +========== +* Added functions to get the private superblock and inode structures from + their enclosing public structures. Switched all references to the + private portions to use them. (many files) [WD] + +* Made read_super and read_inode allocate the private portions of those + structures into the generic pointer fields of the public structures + with kmalloc(). put_super and put_inode free them. This allows us not + to have to touch the definitions of the public structures in + include/linux/fs.h. Also, befs_inode_info is huge (because of the + symlink string). (super.c, inode.c, befs_fs.h) [WD] + +* Fixed a thinko that was corrupting file reads after the first block_run + is done being read. (datastream.c) [WD] + +* Removed fsync() hooks, since a read-only filesystem doesn't need them. + [Christoph Hellwig]. + +* Fixed befs_readlink() (symlink.c) [Christoph Hellwig]. + +* Removed all the Read-Write stuff. I'll redo it when it is time to add + write support (various files) [WD]. + +* Removed prototypes for functions who's definitions have been removed + (befs_fs.h) [WD]. + + +Version 0.4 (2001-10-28) +========== +* Made it an option to use the old non-pagecache befs_file_read() for + testing purposes. 
(fs/Config.in) + +* Fixed unused variable warnings when compiling without debugging. + +* Fixed a bug where the inode and super_block didn't get their blockbits + fields set (inode.c and super.c). + +* Release patch version 11. AKA befs-driver version 0.4. + +* Thats right. New versioning scheme. + I've done some serious testing on it now (on my box anyhow), and it + seems stable and not outragously slow. Existing features are more-or-less + correct (see TODO list). But it isn't 1.0 yet. I think 0.4 gives me some + headroom before the big 1.0. + + +2001-10-26 +========== +* Fixed date format in this file. Was I smoking crack? + +* Removed old datastream code from file.c, since it is nolonger used. + +* Generic_read_file() is now used to read regular file data. + It doesn't chew up the buffer cache (it does page io instead), and seems + to be about as fast (even though it has to look up each file block + indivdualy). And it knows about doing readahead, which is a major plus. + So it does i/o in much larger chunks. It is the correct linux way. It + uses befs_get_block() by way of befs_readpage() to find the disk offsets + of blocks, which in turn calls befs_fpos2brun() in datastream.c to do + the hard work of finding the disk block number. + +* Changed method of checking for a dirty filesystem in befs_read_super + (super.c). Now we check to see if log_start and log_end differ. If so, + the journal needs to be replayed, and the filesystem cannot be mounted. + +* Fixed an extra instance of MOD_DEC_USE_COUNT in super.c + +* Fixed a problem with reading the superblock on devices with large sector + sizes (such as cdroms) on linux 2.4.10 and up. + +2001-10-24 +========== +* Fix nasty bug in converting block numbers to struct befs_inode_addr. + Subtle, because the old version was only sometimes wrong. + Probably responsible for lots of problems. (inode.c) + +* Fix bug with reading an empty directory. (btree.c and dir.c) + +* This one looks good. Release patch version 10 + +2001-10-23 +========== +* Added btree searching function. + +* Use befs_btree_find in befs_lookup (namei.c) + +* Additional comments in btree.c + +2001-10-22 +========== +* Added B+tree reading functions (in btree.c). + Made befs_readdir() use them them instead of the cruft in index.c. + +2001-09-11 +========== +* Converted befs_read_file() to use the new datastream code. + +* Finally updated the README file. + +* Added many comments. + +* Posted version 6 + +* Removed byte-order conversion code. + I have no intention of supporting it, and it was very ugly. + Flow control with #ifdef (ugh). Maybe I'll redo it once + native byteorder works 100%. + +2001-09-10 +========== +* Finished implementing read_datastream() + +* made befs_read_brun() more general + Supports an offset to start at and a max bytes to read + Added a wrapper function to give the old call + +2001-09-30 +========== +* Discovered that the datastream handleing code in file.c is quite deficient + in several respects. For one thing, it doesn't deal with indirect blocks + +* Rewrote datastream handleing. + +* Created io.c, for io related functions. + Previously, the befs_bread() funtions lived in file.c + Created the befs_read_brun() function. + + +2001-09-07 +========== +* Made a function to actually count the number of fs blocks used by a file. + And helper functions. + (fs/befs/inode.c) + +2001-09-05 +========== +* Fixed a misunderstanding of the inode fields. + This fixed the problmem with wrong file sizes from du and others. 
+ The i_blocks field of the inode struct is not the number of blocks for the + inode, it is the number of blocks for the file. Also, i_blksize is not + necessarily the size of the inode, although in practice it works out. + Changed to blocksize of filesystem. + (fs/befs/inode.c) + +* Permanently removed code that had been provisionally ifdefed out of befs_fs.h + +* Since we don't support access time, make that field zero, instead of + copying m_time. + (fs/befs/inode.c) + +* Added sanity check for inode reading + Make sure inode we got was the one we asked for. + (fs/befs/inode.c) + +* Code cleanup + Local pointers to commonly used structures in inode.c. + Got rid of abominations befs_iaddr2inode() and befs_inode2ino(). + Replaced with single function iaddr2blockno(). + (fs/befs/super.c) (fs/befs/inode.c) + +2001-09-01 +========== +* Fixed the problem with statfs where it would always claim the disk was + half full, due to improper understanding of the statfs fields. + (fs/befs/super.c) + +* Posted verion 4 of the patch + +2001-09-01 +========== +* Changed the macros in befs_fs.h to inline functions. + More readable. Typesafe. Better + (include/linux/befs_fs.h) + +* Moved type definitions from befs_fs.h to a new file, befs_fs_types.h + Because befs_fs_i.h and befs_fs_sb.h were including befs_fs.h for the + typedefs, and they are inlcuded in , which has definitions + that I want the inline functions in befs_fs.h to be able to see. Nasty + circularity. + (include/linux/befs_fs.h) + +2001-08-30 +========== +* Cleaned up some wording. + +* Added additional consitency checks on mount + Check block_size agrees with block_shift + Check flags == BEFS_CLEAN + (fs/befs/super.c) + +* Tell the kernel to only mount befs read-only. + By setting the MS_RDONLY flag in befs_read_super(). + Not that it was possible to write before. But now the kernel won't even try. + (fs/befs/super.c) + +* Got rid of kernel warning on mount. + The kernel doesn't like it if you call set_blocksize() on a device when + you have some of its blocks open. Moved the second set_blocksize() to the + very end of befs_read_super(), after we are done with the disk superblock. + (fs/befs/super.c) + +* Fixed wrong number of args bug in befs_dump_inode + (fs/befs/debug.c) + +* Solved lots of type mismatches in kprint()s + (everwhere) + +2001-08-27 +========== +* Cleaned up the fs/Config.in entries a bit, now slightly more descriptive. + +* BeFS depends on NLS, so I made activating BeFS enable the NLS questions + (fs/nls/Config.in) + +* Added Configure.help entries for CONFIG_BEFS_FS and CONFIG_DEBUG_BEFS + (Documentation/Configure.help) + +2001-08-?? +========== +* Removed superblock locking calls in befs_read_super(). In 2.4, the VFS + hands us a super_block struct that is already locked. + +2001-08-13 +========== +* Will Dyson is now attempting to maintain this module + Makoto Kato is original author.Daniel Berlin + also did some work on it (fixing it up for the later 2.3.x kernels, IIRC). 
+ +* Fixed compile errors on 2.4.1 kernel (WD) + Resolve rejected patches + Accommodate changed NLS interface (util.h) + Needed to include in most files + Makefile changes + fs/Config.in changes + +* Tried to niceify the code using the ext2 fs as a guide + Declare befs_fs_type using the DECLARE_FSTYPE_DEV() macro + +* Made it a configure option to turn on debugging (fs/Config.in) + +* Compiles on 2.4.7 diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig new file mode 100644 index 00000000..7835d30f --- /dev/null +++ b/fs/befs/Kconfig @@ -0,0 +1,26 @@ +config BEFS_FS + tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select NLS + help + The BeOS File System (BeFS) is the native file system of Be, Inc's + BeOS. Notable features include support for arbitrary attributes + on files and directories, and database-like indices on selected + attributes. (Also note that this driver doesn't make those features + available at this time). It is a 64 bit filesystem, so it supports + extremely large volumes and files. + + If you use this filesystem, you should also say Y to at least one + of the NLS (native language support) options below. + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be + called befs. + +config BEFS_DEBUG + bool "Debug BeFS" + depends on BEFS_FS + help + If you say Y here, you can use the 'debug' mount option to enable + debugging output from the driver. diff --git a/fs/befs/Makefile b/fs/befs/Makefile new file mode 100644 index 00000000..2f370bd7 --- /dev/null +++ b/fs/befs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux BeOS filesystem routines. +# + +obj-$(CONFIG_BEFS_FS) += befs.o + +befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o diff --git a/fs/befs/TODO b/fs/befs/TODO new file mode 100644 index 00000000..3250921a --- /dev/null +++ b/fs/befs/TODO @@ -0,0 +1,14 @@ +TODO +========== + +* Convert comments to the Kernel-Doc format. + +* Befs_fs.h has gotten big and messy. No reason not to break it up into + smaller peices. + +* See if Alexander Viro's option parser made it into the kernel tree. + Use that if we can. (include/linux/parser.h) + +* See if we really need separate types for on-disk and in-memory + representations of the superblock and inode. 
+ diff --git a/fs/befs/befs.h b/fs/befs/befs.h new file mode 100644 index 00000000..d9a40abd --- /dev/null +++ b/fs/befs/befs.h @@ -0,0 +1,156 @@ +/* + * befs.h + * + * Copyright (C) 2001-2002 Will Dyson + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + */ + +#ifndef _LINUX_BEFS_H +#define _LINUX_BEFS_H + +#include "befs_fs_types.h" + +/* used in debug.c */ +#define BEFS_VERSION "0.9.3" + + +typedef u64 befs_blocknr_t; +/* + * BeFS in memory structures + */ + +typedef struct befs_mount_options { + gid_t gid; + uid_t uid; + int use_gid; + int use_uid; + int debug; + char *iocharset; +} befs_mount_options; + +typedef struct befs_sb_info { + u32 magic1; + u32 block_size; + u32 block_shift; + int byte_order; + befs_off_t num_blocks; + befs_off_t used_blocks; + u32 inode_size; + u32 magic2; + + /* Allocation group information */ + u32 blocks_per_ag; + u32 ag_shift; + u32 num_ags; + + /* jornal log entry */ + befs_block_run log_blocks; + befs_off_t log_start; + befs_off_t log_end; + + befs_inode_addr root_dir; + befs_inode_addr indices; + u32 magic3; + + befs_mount_options mount_opts; + struct nls_table *nls; + +} befs_sb_info; + +typedef struct befs_inode_info { + u32 i_flags; + u32 i_type; + + befs_inode_addr i_inode_num; + befs_inode_addr i_parent; + befs_inode_addr i_attribute; + + union { + befs_data_stream ds; + char symlink[BEFS_SYMLINK_LEN]; + } i_data; + + struct inode vfs_inode; + +} befs_inode_info; + +enum befs_err { + BEFS_OK, + BEFS_ERR, + BEFS_BAD_INODE, + BEFS_BT_END, + BEFS_BT_EMPTY, + BEFS_BT_MATCH, + BEFS_BT_PARMATCH, + BEFS_BT_NOT_FOUND +}; + + +/****************************/ +/* debug.c */ +void befs_error(const struct super_block *sb, const char *fmt, ...); +void befs_warning(const struct super_block *sb, const char *fmt, ...); +void befs_debug(const struct super_block *sb, const char *fmt, ...); + +void befs_dump_super_block(const struct super_block *sb, befs_super_block *); +void befs_dump_inode(const struct super_block *sb, befs_inode *); +void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *); +void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *); +/****************************/ + + +/* Gets a pointer to the private portion of the super_block + * structure from the public part + */ +static inline befs_sb_info * +BEFS_SB(const struct super_block *super) +{ + return (befs_sb_info *) super->s_fs_info; +} + +static inline befs_inode_info * +BEFS_I(const struct inode *inode) +{ + return list_entry(inode, struct befs_inode_info, vfs_inode); +} + +static inline befs_blocknr_t +iaddr2blockno(struct super_block *sb, befs_inode_addr * iaddr) +{ + return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) + + iaddr->start); +} + +static inline befs_inode_addr +blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno) +{ + befs_inode_addr iaddr; + iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift; + iaddr.start = + blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift); + iaddr.len = 1; + + return iaddr; +} + +static inline unsigned int +befs_iaddrs_per_block(struct super_block *sb) +{ + return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); +} + +static inline int +befs_iaddr_is_empty(befs_inode_addr * iaddr) +{ + return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); +} + +static inline size_t +befs_brun_size(struct super_block *sb, befs_block_run run) +{ + return BEFS_SB(sb)->block_size * run.len; +} + +#include "endian.h" + +#endif /* _LINUX_BEFS_H */ diff --git 
a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h new file mode 100644 index 00000000..eb557d9d --- /dev/null +++ b/fs/befs/befs_fs_types.h @@ -0,0 +1,255 @@ +/* + * fs/befs/befs_fs_types.h + * + * Copyright (C) 2001 Will Dyson (will@cs.earlham.edu) + * + * + * + * from linux/include/linux/befs_fs.h + * + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + * + */ + +#ifndef _LINUX_BEFS_FS_TYPES +#define _LINUX_BEFS_FS_TYPES + +#ifdef __KERNEL__ +#include +#endif /*__KERNEL__*/ + +#define PACKED __attribute__ ((__packed__)) + +/* + * Max name lengths of BFS + */ + +#define BEFS_NAME_LEN 255 + +#define BEFS_SYMLINK_LEN 144 +#define BEFS_NUM_DIRECT_BLOCKS 12 +#define B_OS_NAME_LENGTH 32 + +/* The datastream blocks mapped by the double-indirect + * block are always 4 fs blocks long. + * This eliminates the need for linear searches among + * the potentially huge number of indirect blocks + * + * Err. Should that be 4 fs blocks or 4k??? + * It matters on large blocksize volumes + */ +#define BEFS_DBLINDIR_BRUN_LEN 4 + +/* + * Flags of superblock + */ + +enum super_flags { + BEFS_BYTESEX_BE, + BEFS_BYTESEX_LE, + BEFS_CLEAN = 0x434c454e, + BEFS_DIRTY = 0x44495254, + BEFS_SUPER_MAGIC1 = 0x42465331, /* BFS1 */ + BEFS_SUPER_MAGIC2 = 0xdd121031, + BEFS_SUPER_MAGIC3 = 0x15b6830e, +}; + +#define BEFS_BYTEORDER_NATIVE 0x42494745 +#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) + +#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 +#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) + +/* + * Flags of inode + */ + +#define BEFS_INODE_MAGIC1 0x3bbe0ad9 + +enum inode_flags { + BEFS_INODE_IN_USE = 0x00000001, + BEFS_ATTR_INODE = 0x00000004, + BEFS_INODE_LOGGED = 0x00000008, + BEFS_INODE_DELETED = 0x00000010, + BEFS_LONG_SYMLINK = 0x00000040, + BEFS_PERMANENT_FLAG = 0x0000ffff, + BEFS_INODE_NO_CREATE = 0x00010000, + BEFS_INODE_WAS_WRITTEN = 0x00020000, + BEFS_NO_TRANSACTION = 0x00040000, +}; +/* + * On-Disk datastructures of BeFS + */ + +typedef u64 __bitwise fs64; +typedef u32 __bitwise fs32; +typedef u16 __bitwise fs16; + +typedef u64 befs_off_t; +typedef fs64 befs_time_t; + +/* Block runs */ +typedef struct { + fs32 allocation_group; + fs16 start; + fs16 len; +} PACKED befs_disk_block_run; + +typedef struct { + u32 allocation_group; + u16 start; + u16 len; +} PACKED befs_block_run; + +typedef befs_disk_block_run befs_disk_inode_addr; +typedef befs_block_run befs_inode_addr; + +/* + * The Superblock Structure + */ +typedef struct { + char name[B_OS_NAME_LENGTH]; + fs32 magic1; + fs32 fs_byte_order; + + fs32 block_size; + fs32 block_shift; + + fs64 num_blocks; + fs64 used_blocks; + + fs32 inode_size; + + fs32 magic2; + fs32 blocks_per_ag; + fs32 ag_shift; + fs32 num_ags; + + fs32 flags; + + befs_disk_block_run log_blocks; + fs64 log_start; + fs64 log_end; + + fs32 magic3; + befs_disk_inode_addr root_dir; + befs_disk_inode_addr indices; + +} PACKED befs_super_block; + +/* + * Note: the indirect and dbl_indir block_runs may + * be longer than one block! 
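 *
 * (A block_run names a contiguous extent of file-system blocks: for
 * example, with ag_shift == 11 the run { allocation_group = 3, start = 10,
 * len = 4 } covers the four consecutive blocks starting at
 * (3 << 11) + 10 = 6154, exactly as iaddr2blockno() in befs.h computes.)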
+ */ +typedef struct { + befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS]; + fs64 max_direct_range; + befs_disk_block_run indirect; + fs64 max_indirect_range; + befs_disk_block_run double_indirect; + fs64 max_double_indirect_range; + fs64 size; +} PACKED befs_disk_data_stream; + +typedef struct { + befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS]; + befs_off_t max_direct_range; + befs_block_run indirect; + befs_off_t max_indirect_range; + befs_block_run double_indirect; + befs_off_t max_double_indirect_range; + befs_off_t size; +} PACKED befs_data_stream; + +/* Attribute */ +typedef struct { + fs32 type; + fs16 name_size; + fs16 data_size; + char name[1]; +} PACKED befs_small_data; + +/* Inode structure */ +typedef struct { + fs32 magic1; + befs_disk_inode_addr inode_num; + fs32 uid; + fs32 gid; + fs32 mode; + fs32 flags; + befs_time_t create_time; + befs_time_t last_modified_time; + befs_disk_inode_addr parent; + befs_disk_inode_addr attributes; + fs32 type; + + fs32 inode_size; + fs32 etc; /* not use */ + + union { + befs_disk_data_stream datastream; + char symlink[BEFS_SYMLINK_LEN]; + } data; + + fs32 pad[4]; /* not use */ + befs_small_data small_data[1]; +} PACKED befs_inode; + +/* + * B+tree superblock + */ + +#define BEFS_BTREE_MAGIC 0x69f6c2e8 + +enum btree_types { + BTREE_STRING_TYPE = 0, + BTREE_INT32_TYPE = 1, + BTREE_UINT32_TYPE = 2, + BTREE_INT64_TYPE = 3, + BTREE_UINT64_TYPE = 4, + BTREE_FLOAT_TYPE = 5, + BTREE_DOUBLE_TYPE = 6 +}; + +typedef struct { + fs32 magic; + fs32 node_size; + fs32 max_depth; + fs32 data_type; + fs64 root_node_ptr; + fs64 free_node_ptr; + fs64 max_size; +} PACKED befs_disk_btree_super; + +typedef struct { + u32 magic; + u32 node_size; + u32 max_depth; + u32 data_type; + befs_off_t root_node_ptr; + befs_off_t free_node_ptr; + befs_off_t max_size; +} PACKED befs_btree_super; + +/* + * Header structure of each btree node + */ +typedef struct { + fs64 left; + fs64 right; + fs64 overflow; + fs16 all_key_count; + fs16 all_key_length; +} PACKED befs_btree_nodehead; + +typedef struct { + befs_off_t left; + befs_off_t right; + befs_off_t overflow; + u16 all_key_count; + u16 all_key_length; +} PACKED befs_host_btree_nodehead; + +#endif /* _LINUX_BEFS_FS_TYPES */ diff --git a/fs/befs/btree.c b/fs/befs/btree.c new file mode 100644 index 00000000..a66c9b11 --- /dev/null +++ b/fs/befs/btree.c @@ -0,0 +1,787 @@ +/* + * linux/fs/befs/btree.c + * + * Copyright (C) 2001-2002 Will Dyson + * + * Licensed under the GNU GPL. See the file COPYING for details. + * + * 2002-02-05: Sergey S. Kostyliov added binary search within + * btree nodes. + * + * Many thanks to: + * + * Dominic Giampaolo, author of "Practical File System + * Design with the Be File System", for such a helpful book. + * + * Marcus J. Ranum, author of the b+tree package in + * comp.sources.misc volume 10. This code is not copied from that + * work, but it is partially based on it. + * + * Makoto Kato, author of the original BeFS for linux filesystem + * driver. + */ + +#include +#include +#include +#include +#include + +#include "befs.h" +#include "btree.h" +#include "datastream.h" + +/* + * The btree functions in this file are built on top of the + * datastream.c interface, which is in turn built on top of the + * io.c interface. + */ + +/* Befs B+tree structure: + * + * The first thing in the tree is the tree superblock. It tells you + * all kinds of useful things about the tree, like where the rootnode + * is located, and the size of the nodes (always 1024 with current version + * of BeOS). 
+ * + * The rest of the tree consists of a series of nodes. Nodes contain a header + * (struct befs_btree_nodehead), the packed key data, an array of shorts + * containing the ending offsets for each of the keys, and an array of + * befs_off_t values. In interior nodes, the keys are the ending keys for + * the childnode they point to, and the values are offsets into the + * datastream containing the tree. + */ + +/* Note: + * + * The book states 2 confusing things about befs b+trees. First, + * it states that the overflow field of node headers is used by internal nodes + * to point to another node that "effectively continues this one". Here is what + * I believe that means. Each key in internal nodes points to another node that + * contains key values less than itself. Inspection reveals that the last key + * in the internal node is not the last key in the index. Keys that are + * greater than the last key in the internal node go into the overflow node. + * I imagine there is a performance reason for this. + * + * Second, it states that the header of a btree node is sufficient to + * distinguish internal nodes from leaf nodes. Without saying exactly how. + * After figuring out the first, it becomes obvious that internal nodes have + * overflow nodes and leafnodes do not. + */ + +/* + * Currently, this code is only good for directory B+trees. + * In order to be used for other BFS indexes, it needs to be extended to handle + * duplicate keys and non-string keytypes (int32, int64, float, double). + */ + +/* + * In memory structure of each btree node + */ +typedef struct { + befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */ + struct buffer_head *bh; + befs_btree_nodehead *od_node; /* on disk node */ +} befs_btree_node; + +/* local constants */ +static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; + +/* local functions */ +static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * bt_super, + befs_btree_node * this_node, + befs_off_t * node_off); + +static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * sup); + +static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, + befs_btree_node * node, befs_off_t node_off); + +static int befs_leafnode(befs_btree_node * node); + +static fs16 *befs_bt_keylen_index(befs_btree_node * node); + +static fs64 *befs_bt_valarray(befs_btree_node * node); + +static char *befs_bt_keydata(befs_btree_node * node); + +static int befs_find_key(struct super_block *sb, befs_btree_node * node, + const char *findkey, befs_off_t * value); + +static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node, + int index, u16 * keylen); + +static int befs_compare_strings(const void *key1, int keylen1, + const void *key2, int keylen2); + +/** + * befs_bt_read_super - read in btree superblock convert to cpu byteorder + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @sup: Buffer in which to place the btree superblock + * + * Calls befs_read_datastream to read in the btree superblock and + * makes sure it is in cpu byteorder, byteswapping if necessary. + * + * On success, returns BEFS_OK and *@sup contains the btree superblock, + * in cpu byte order. + * + * On failure, BEFS_ERR is returned. 
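 *
 * Typical use, as in befs_btree_find() below:
 *
 *	befs_btree_super bt_super;
 *
 *	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK)
 *		return BEFS_ERR;
 *	node_off = bt_super.root_node_ptr;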
+ */ +static int +befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * sup) +{ + struct buffer_head *bh = NULL; + befs_disk_btree_super *od_sup = NULL; + + befs_debug(sb, "---> befs_btree_read_super()"); + + bh = befs_read_datastream(sb, ds, 0, NULL); + + if (!bh) { + befs_error(sb, "Couldn't read index header."); + goto error; + } + od_sup = (befs_disk_btree_super *) bh->b_data; + befs_dump_index_entry(sb, od_sup); + + sup->magic = fs32_to_cpu(sb, od_sup->magic); + sup->node_size = fs32_to_cpu(sb, od_sup->node_size); + sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth); + sup->data_type = fs32_to_cpu(sb, od_sup->data_type); + sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr); + sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr); + sup->max_size = fs64_to_cpu(sb, od_sup->max_size); + + brelse(bh); + if (sup->magic != BEFS_BTREE_MAGIC) { + befs_error(sb, "Index header has bad magic."); + goto error; + } + + befs_debug(sb, "<--- befs_btree_read_super()"); + return BEFS_OK; + + error: + befs_debug(sb, "<--- befs_btree_read_super() ERROR"); + return BEFS_ERR; +} + +/** + * befs_bt_read_node - read in btree node and convert to cpu byteorder + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @node: Buffer in which to place the btree node + * @node_off: Starting offset (in bytes) of the node in @ds + * + * Calls befs_read_datastream to read in the indicated btree node and + * makes sure its header fields are in cpu byteorder, byteswapping if + * necessary. + * Note: node->bh must be NULL when this function called first + * time. Don't forget brelse(node->bh) after last call. + * + * On success, returns BEFS_OK and *@node contains the btree node that + * starts at @node_off, with the node->head fields in cpu byte order. + * + * On failure, BEFS_ERR is returned. + */ + +static int +befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, + befs_btree_node * node, befs_off_t node_off) +{ + uint off = 0; + + befs_debug(sb, "---> befs_bt_read_node()"); + + if (node->bh) + brelse(node->bh); + + node->bh = befs_read_datastream(sb, ds, node_off, &off); + if (!node->bh) { + befs_error(sb, "befs_bt_read_node() failed to read " + "node at %Lu", node_off); + befs_debug(sb, "<--- befs_bt_read_node() ERROR"); + + return BEFS_ERR; + } + node->od_node = + (befs_btree_nodehead *) ((void *) node->bh->b_data + off); + + befs_dump_index_node(sb, node->od_node); + + node->head.left = fs64_to_cpu(sb, node->od_node->left); + node->head.right = fs64_to_cpu(sb, node->od_node->right); + node->head.overflow = fs64_to_cpu(sb, node->od_node->overflow); + node->head.all_key_count = + fs16_to_cpu(sb, node->od_node->all_key_count); + node->head.all_key_length = + fs16_to_cpu(sb, node->od_node->all_key_length); + + befs_debug(sb, "<--- befs_btree_read_node()"); + return BEFS_OK; +} + +/** + * befs_btree_find - Find a key in a befs B+tree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @key: Key string to lookup in btree + * @value: Value stored with @key + * + * On success, returns BEFS_OK and sets *@value to the value stored + * with @key (usually the disk block number of an inode). + * + * On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND. + * + * Algorithm: + * Read the superblock and rootnode of the b+tree. + * Drill down through the interior nodes using befs_find_key(). + * Once at the correct leaf node, use befs_find_key() again to get the + * actuall value stored with the key. 
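 *
 * For example, a name lookup in a directory amounts to:
 *
 *	befs_off_t inode_blk;
 *
 *	if (befs_btree_find(sb, &BEFS_I(dir)->i_data.ds, "hello.txt",
 *			    &inode_blk) == BEFS_OK)
 *		... inode_blk is the fs block holding the entry's inode ...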
+ */ +int +befs_btree_find(struct super_block *sb, befs_data_stream * ds, + const char *key, befs_off_t * value) +{ + befs_btree_node *this_node = NULL; + befs_btree_super bt_super; + befs_off_t node_off; + int res; + + befs_debug(sb, "---> befs_btree_find() Key: %s", key); + + if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { + befs_error(sb, + "befs_btree_find() failed to read index superblock"); + goto error; + } + + this_node = kmalloc(sizeof (befs_btree_node), + GFP_NOFS); + if (!this_node) { + befs_error(sb, "befs_btree_find() failed to allocate %u " + "bytes of memory", sizeof (befs_btree_node)); + goto error; + } + + this_node->bh = NULL; + + /* read in root node */ + node_off = bt_super.root_node_ptr; + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_find() failed to read " + "node at %Lu", node_off); + goto error_alloc; + } + + while (!befs_leafnode(this_node)) { + res = befs_find_key(sb, this_node, key, &node_off); + if (res == BEFS_BT_NOT_FOUND) + node_off = this_node->head.overflow; + /* if no match, go to overflow node */ + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_find() failed to read " + "node at %Lu", node_off); + goto error_alloc; + } + } + + /* at the correct leaf node now */ + + res = befs_find_key(sb, this_node, key, value); + + brelse(this_node->bh); + kfree(this_node); + + if (res != BEFS_BT_MATCH) { + befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); + *value = 0; + return BEFS_BT_NOT_FOUND; + } + befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", + key, *value); + return BEFS_OK; + + error_alloc: + kfree(this_node); + error: + *value = 0; + befs_debug(sb, "<--- befs_btree_find() ERROR"); + return BEFS_ERR; +} + +/** + * befs_find_key - Search for a key within a node + * @sb: Filesystem superblock + * @node: Node to find the key within + * @key: Keystring to search for + * @value: If key is found, the value stored with the key is put here + * + * finds exact match if one exists, and returns BEFS_BT_MATCH + * If no exact match, finds first key in node that is greater + * (alphabetically) than the search key and returns BEFS_BT_PARMATCH + * (for partial match, I guess). Can you think of something better to + * call it? + * + * If no key was a match or greater than the search key, return + * BEFS_BT_NOT_FOUND. + * + * Use binary search instead of a linear. 
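 *
 * For example, if a node holds the sorted keys { "bar", "foo", "quux" }
 * and the search key is "baz", there is no exact match; the first key
 * greater than "baz" is "foo", so *value is set to the value stored with
 * "foo" and BEFS_BT_PARMATCH is returned. In an interior node that value
 * is the offset of the child node to descend into.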
+ */ +static int +befs_find_key(struct super_block *sb, befs_btree_node * node, + const char *findkey, befs_off_t * value) +{ + int first, last, mid; + int eq; + u16 keylen; + int findkey_len; + char *thiskey; + fs64 *valarray; + + befs_debug(sb, "---> befs_find_key() %s", findkey); + + *value = 0; + + findkey_len = strlen(findkey); + + /* if node can not contain key, just skeep this node */ + last = node->head.all_key_count - 1; + thiskey = befs_bt_get_key(sb, node, last, &keylen); + + eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); + if (eq < 0) { + befs_debug(sb, "<--- befs_find_key() %s not found", findkey); + return BEFS_BT_NOT_FOUND; + } + + valarray = befs_bt_valarray(node); + + /* simple binary search */ + first = 0; + mid = 0; + while (last >= first) { + mid = (last + first) / 2; + befs_debug(sb, "first: %d, last: %d, mid: %d", first, last, + mid); + thiskey = befs_bt_get_key(sb, node, mid, &keylen); + eq = befs_compare_strings(thiskey, keylen, findkey, + findkey_len); + + if (eq == 0) { + befs_debug(sb, "<--- befs_find_key() found %s at %d", + thiskey, mid); + + *value = fs64_to_cpu(sb, valarray[mid]); + return BEFS_BT_MATCH; + } + if (eq > 0) + last = mid - 1; + else + first = mid + 1; + } + if (eq < 0) + *value = fs64_to_cpu(sb, valarray[mid + 1]); + else + *value = fs64_to_cpu(sb, valarray[mid]); + befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); + return BEFS_BT_PARMATCH; +} + +/** + * befs_btree_read - Traverse leafnodes of a btree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @key_no: Key number (alphabetical order) of key to read + * @bufsize: Size of the buffer to return key in + * @keybuf: Pointer to a buffer to put the key in + * @keysize: Length of the returned key + * @value: Value stored with the returned key + * + * Heres how it works: Key_no is the index of the key/value pair to + * return in keybuf/value. + * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is + * the number of charecters in the key (just a convenience). + * + * Algorithm: + * Get the first leafnode of the tree. See if the requested key is in that + * node. If not, follow the node->right link to the next leafnode. Repeat + * until the (key_no)th key is found or the tree is out of keys. 
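 *
 * A directory listing, for instance, can simply walk the keys in order
 * until the call stops returning BEFS_OK (BEFS_BT_END once the keys run
 * out, BEFS_BT_EMPTY for an empty tree):
 *
 *	char keybuf[BEFS_NAME_LEN + 1];
 *	size_t keysize;
 *	befs_off_t value;
 *	loff_t i;
 *
 *	for (i = 0; ; i++) {
 *		if (befs_btree_read(sb, ds, i, sizeof(keybuf), keybuf,
 *				    &keysize, &value) != BEFS_OK)
 *			break;
 *		... emit the entry keybuf / value ...
 *	}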
+ */ +int +befs_btree_read(struct super_block *sb, befs_data_stream * ds, + loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, + befs_off_t * value) +{ + befs_btree_node *this_node; + befs_btree_super bt_super; + befs_off_t node_off = 0; + int cur_key; + fs64 *valarray; + char *keystart; + u16 keylen; + int res; + + uint key_sum = 0; + + befs_debug(sb, "---> befs_btree_read()"); + + if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { + befs_error(sb, + "befs_btree_read() failed to read index superblock"); + goto error; + } + + if ((this_node = (befs_btree_node *) + kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { + befs_error(sb, "befs_btree_read() failed to allocate %u " + "bytes of memory", sizeof (befs_btree_node)); + goto error; + } + + node_off = bt_super.root_node_ptr; + this_node->bh = NULL; + + /* seeks down to first leafnode, reads it into this_node */ + res = befs_btree_seekleaf(sb, ds, &bt_super, this_node, &node_off); + if (res == BEFS_BT_EMPTY) { + brelse(this_node->bh); + kfree(this_node); + *value = 0; + *keysize = 0; + befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); + return BEFS_BT_EMPTY; + } else if (res == BEFS_ERR) { + goto error_alloc; + } + + /* find the leaf node containing the key_no key */ + + while (key_sum + this_node->head.all_key_count <= key_no) { + + /* no more nodes to look in: key_no is too large */ + if (this_node->head.right == befs_bt_inval) { + *keysize = 0; + *value = 0; + befs_debug(sb, + "<--- befs_btree_read() END of keys at %Lu", + key_sum + this_node->head.all_key_count); + brelse(this_node->bh); + kfree(this_node); + return BEFS_BT_END; + } + + key_sum += this_node->head.all_key_count; + node_off = this_node->head.right; + + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_read() failed to read " + "node at %Lu", node_off); + goto error_alloc; + } + } + + /* how many keys into this_node is key_no */ + cur_key = key_no - key_sum; + + /* get pointers to datastructures within the node body */ + valarray = befs_bt_valarray(this_node); + + keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); + + befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); + + if (bufsize < keylen + 1) { + befs_error(sb, "befs_btree_read() keybuf too small (%u) " + "for key of size %d", bufsize, keylen); + brelse(this_node->bh); + goto error_alloc; + }; + + strncpy(keybuf, keystart, keylen); + *value = fs64_to_cpu(sb, valarray[cur_key]); + *keysize = keylen; + keybuf[keylen] = '\0'; + + befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, + cur_key, keylen, keybuf, *value); + + brelse(this_node->bh); + kfree(this_node); + + befs_debug(sb, "<--- befs_btree_read()"); + + return BEFS_OK; + + error_alloc: + kfree(this_node); + + error: + *keysize = 0; + *value = 0; + befs_debug(sb, "<--- befs_btree_read() ERROR"); + return BEFS_ERR; +} + +/** + * befs_btree_seekleaf - Find the first leafnode in the btree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @bt_super: Pointer to the superblock of the btree + * @this_node: Buffer to return the leafnode in + * @node_off: Pointer to offset of current node within datastream. Modified + * by the function. + * + * + * Helper function for btree traverse. Moves the current position to the + * start of the first leaf node. + * + * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY. 
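 *
 * (A freshly created index, for example, has a root node that is itself a
 * leaf with all_key_count == 0, so the function returns BEFS_BT_EMPTY
 * without descending any further.)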
+ */ +static int +befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * bt_super, befs_btree_node * this_node, + befs_off_t * node_off) +{ + + befs_debug(sb, "---> befs_btree_seekleaf()"); + + if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_seekleaf() failed to read " + "node at %Lu", *node_off); + goto error; + } + befs_debug(sb, "Seekleaf to root node %Lu", *node_off); + + if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { + befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); + return BEFS_BT_EMPTY; + } + + while (!befs_leafnode(this_node)) { + + if (this_node->head.all_key_count == 0) { + befs_debug(sb, "befs_btree_seekleaf() encountered " + "an empty interior node: %Lu. Using Overflow " + "node: %Lu", *node_off, + this_node->head.overflow); + *node_off = this_node->head.overflow; + } else { + fs64 *valarray = befs_bt_valarray(this_node); + *node_off = fs64_to_cpu(sb, valarray[0]); + } + if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_seekleaf() failed to read " + "node at %Lu", *node_off); + goto error; + } + + befs_debug(sb, "Seekleaf to child node %Lu", *node_off); + } + befs_debug(sb, "Node %Lu is a leaf node", *node_off); + + return BEFS_OK; + + error: + befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); + return BEFS_ERR; +} + +/** + * befs_leafnode - Determine if the btree node is a leaf node or an + * interior node + * @node: Pointer to node structure to test + * + * Return 1 if leaf, 0 if interior + */ +static int +befs_leafnode(befs_btree_node * node) +{ + /* all interior nodes (and only interior nodes) have an overflow node */ + if (node->head.overflow == befs_bt_inval) + return 1; + else + return 0; +} + +/** + * befs_bt_keylen_index - Finds start of keylen index in a node + * @node: Pointer to the node structure to find the keylen index within + * + * Returns a pointer to the start of the key length index array + * of the B+tree node *@node + * + * "The length of all the keys in the node is added to the size of the + * header and then rounded up to a multiple of four to get the beginning + * of the key length index" (p.88, practical filesystem design). + * + * Except that rounding up to 8 works, and rounding up to 4 doesn't. 
+ */ +static fs16 * +befs_bt_keylen_index(befs_btree_node * node) +{ + const int keylen_align = 8; + unsigned long int off = + (sizeof (befs_btree_nodehead) + node->head.all_key_length); + ulong tmp = off % keylen_align; + + if (tmp) + off += keylen_align - tmp; + + return (fs16 *) ((void *) node->od_node + off); +} + +/** + * befs_bt_valarray - Finds the start of value array in a node + * @node: Pointer to the node structure to find the value array within + * + * Returns a pointer to the start of the value array + * of the node pointed to by the node header + */ +static fs64 * +befs_bt_valarray(befs_btree_node * node) +{ + void *keylen_index_start = (void *) befs_bt_keylen_index(node); + size_t keylen_index_size = node->head.all_key_count * sizeof (fs16); + + return (fs64 *) (keylen_index_start + keylen_index_size); +} + +/** + * befs_bt_keydata - Finds start of keydata array in a node + * @node: Pointer to the node structure to find the keydata array within + * + * Returns a pointer to the start of the keydata array + * of the node pointed to by the node header + */ +static char * +befs_bt_keydata(befs_btree_node * node) +{ + return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead)); +} + +/** + * befs_bt_get_key - returns a pointer to the start of a key + * @sb: filesystem superblock + * @node: node in which to look for the key + * @index: the index of the key to get + * @keylen: modified to be the length of the key at @index + * + * Returns a valid pointer into @node on success. + * Returns NULL on failure (bad input) and sets *@keylen = 0 + */ +static char * +befs_bt_get_key(struct super_block *sb, befs_btree_node * node, + int index, u16 * keylen) +{ + int prev_key_end; + char *keystart; + fs16 *keylen_index; + + if (index < 0 || index > node->head.all_key_count) { + *keylen = 0; + return NULL; + } + + keystart = befs_bt_keydata(node); + keylen_index = befs_bt_keylen_index(node); + + if (index == 0) + prev_key_end = 0; + else + prev_key_end = fs16_to_cpu(sb, keylen_index[index - 1]); + + *keylen = fs16_to_cpu(sb, keylen_index[index]) - prev_key_end; + + return keystart + prev_key_end; +} + +/** + * befs_compare_strings - compare two strings + * @key1: pointer to the first key to be compared + * @keylen1: length in bytes of key1 + * @key2: pointer to the second key to be compared + * @kelen2: length in bytes of key2 + * + * Returns 0 if @key1 and @key2 are equal. + * Returns >0 if @key1 is greater. + * Returns <0 if @key2 is greater.. 
+ */ +static int +befs_compare_strings(const void *key1, int keylen1, + const void *key2, int keylen2) +{ + int len = min_t(int, keylen1, keylen2); + int result = strncmp(key1, key2, len); + if (result == 0) + result = keylen1 - keylen2; + return result; +} + +/* These will be used for non-string keyed btrees */ +#if 0 +static int +btree_compare_int32(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + return *(int32_t *) key1 - *(int32_t *) key2; +} + +static int +btree_compare_uint32(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + if (*(u_int32_t *) key1 == *(u_int32_t *) key2) + return 0; + else if (*(u_int32_t *) key1 > *(u_int32_t *) key2) + return 1; + + return -1; +} +static int +btree_compare_int64(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + if (*(int64_t *) key1 == *(int64_t *) key2) + return 0; + else if (*(int64_t *) key1 > *(int64_t *) key2) + return 1; + + return -1; +} + +static int +btree_compare_uint64(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + if (*(u_int64_t *) key1 == *(u_int64_t *) key2) + return 0; + else if (*(u_int64_t *) key1 > *(u_int64_t *) key2) + return 1; + + return -1; +} + +static int +btree_compare_float(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + float result = *(float *) key1 - *(float *) key2; + if (result == 0.0f) + return 0; + + return (result < 0.0f) ? -1 : 1; +} + +static int +btree_compare_double(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + double result = *(double *) key1 - *(double *) key2; + if (result == 0.0) + return 0; + + return (result < 0.0) ? -1 : 1; +} +#endif //0 diff --git a/fs/befs/btree.h b/fs/befs/btree.h new file mode 100644 index 00000000..92e781e5 --- /dev/null +++ b/fs/befs/btree.h @@ -0,0 +1,13 @@ +/* + * btree.h + * + */ + + +int befs_btree_find(struct super_block *sb, befs_data_stream * ds, + const char *key, befs_off_t * value); + +int befs_btree_read(struct super_block *sb, befs_data_stream * ds, + loff_t key_no, size_t bufsize, char *keybuf, + size_t * keysize, befs_off_t * value); + diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c new file mode 100644 index 00000000..59096b5e --- /dev/null +++ b/fs/befs/datastream.c @@ -0,0 +1,526 @@ +/* + * linux/fs/befs/datastream.c + * + * Copyright (C) 2001 Will Dyson + * + * Based on portions of file.c by Makoto Kato + * + * Many thanks to Dominic Giampaolo, author of "Practical File System + * Design with the Be File System", for such a helpful book. + * + */ + +#include +#include +#include + +#include "befs.h" +#include "datastream.h" +#include "io.h" + +const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; + +static int befs_find_brun_direct(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, befs_block_run * run); + +static int befs_find_brun_indirect(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, + befs_block_run * run); + +static int befs_find_brun_dblindirect(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, + befs_block_run * run); + +/** + * befs_read_datastream - get buffer_head containing data, starting from pos. + * @sb: Filesystem superblock + * @ds: datastrem to find data with + * @pos: start of data + * @off: offset of data in buffer_head->b_data + * + * Returns pointer to buffer_head containing data starting with offset @off, + * if you don't need to know offset just set @off = NULL. 
+ */ +struct buffer_head * +befs_read_datastream(struct super_block *sb, befs_data_stream * ds, + befs_off_t pos, uint * off) +{ + struct buffer_head *bh = NULL; + befs_block_run run; + befs_blocknr_t block; /* block corresponding to pos */ + + befs_debug(sb, "---> befs_read_datastream() %Lu", pos); + block = pos >> BEFS_SB(sb)->block_shift; + if (off) + *off = pos - (block << BEFS_SB(sb)->block_shift); + + if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { + befs_error(sb, "BeFS: Error finding disk addr of block %lu", + block); + befs_debug(sb, "<--- befs_read_datastream() ERROR"); + return NULL; + } + bh = befs_bread_iaddr(sb, run); + if (!bh) { + befs_error(sb, "BeFS: Error reading block %lu from datastream", + block); + return NULL; + } + + befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", + pos); + + return bh; +} + +/* + * Takes a file position and gives back a brun whose starting block + * is block number fblock of the file. + * + * Returns BEFS_OK or BEFS_ERR. + * + * Calls specialized functions for each of the three possible + * datastream regions. + * + * 2001-11-15 Will Dyson + */ +int +befs_fblock2brun(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t fblock, befs_block_run * run) +{ + int err; + befs_off_t pos = fblock << BEFS_SB(sb)->block_shift; + + if (pos < data->max_direct_range) { + err = befs_find_brun_direct(sb, data, fblock, run); + + } else if (pos < data->max_indirect_range) { + err = befs_find_brun_indirect(sb, data, fblock, run); + + } else if (pos < data->max_double_indirect_range) { + err = befs_find_brun_dblindirect(sb, data, fblock, run); + + } else { + befs_error(sb, + "befs_fblock2brun() was asked to find block %lu, " + "which is not mapped by the datastream\n", fblock); + err = BEFS_ERR; + } + return err; +} + +/** + * befs_read_lsymlink - read long symlink from datastream. + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @buff: Buffer in which to place long symlink data + * @len: Length of the long symlink in bytes + * + * Returns the number of bytes read + */ +size_t +befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, + befs_off_t len) +{ + befs_off_t bytes_read = 0; /* bytes read so far */ + u16 plen; + struct buffer_head *bh = NULL; + befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); + + while (bytes_read < len) { + bh = befs_read_datastream(sb, ds, bytes_read, NULL); + if (!bh) { + befs_error(sb, "BeFS: Error reading datastream block " + "starting from %Lu", bytes_read); + befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); + return bytes_read; + + } + plen = ((bytes_read + BEFS_SB(sb)->block_size) < len) ? 
+ BEFS_SB(sb)->block_size : len - bytes_read; + memcpy(buff + bytes_read, bh->b_data, plen); + brelse(bh); + bytes_read += plen; + } + + befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); + return bytes_read; +} + +/** + * befs_count_blocks - blocks used by a file + * @sb: Filesystem superblock + * @ds: Datastream of the file + * + * Counts the number of fs blocks that the file represented by + * inode occupies on the filesystem, counting both regular file + * data and filesystem metadata (and eventually attribute data + * when we support attributes) +*/ + +befs_blocknr_t +befs_count_blocks(struct super_block * sb, befs_data_stream * ds) +{ + befs_blocknr_t blocks; + befs_blocknr_t datablocks; /* File data blocks */ + befs_blocknr_t metablocks; /* FS metadata blocks */ + befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_debug(sb, "---> befs_count_blocks()"); + + datablocks = ds->size >> befs_sb->block_shift; + if (ds->size & (befs_sb->block_size - 1)) + datablocks += 1; + + metablocks = 1; /* Start with 1 block for inode */ + + /* Size of indirect block */ + if (ds->size > ds->max_direct_range) + metablocks += ds->indirect.len; + + /* + Double indir block, plus all the indirect blocks it mapps + In the double-indirect range, all block runs of data are + BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know + how many data block runs are in the double-indirect region, + and from that we know how many indirect blocks it takes to + map them. We assume that the indirect blocks are also + BEFS_DBLINDIR_BRUN_LEN blocks long. + */ + if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) { + uint dbl_bytes; + uint dbl_bruns; + uint indirblocks; + + dbl_bytes = + ds->max_double_indirect_range - ds->max_indirect_range; + dbl_bruns = + dbl_bytes / (befs_sb->block_size * BEFS_DBLINDIR_BRUN_LEN); + indirblocks = dbl_bruns / befs_iaddrs_per_block(sb); + + metablocks += ds->double_indirect.len; + metablocks += indirblocks; + } + + blocks = datablocks + metablocks; + befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); + + return blocks; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the direct region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + Linear search. Checks each element of array[] to see if it + contains the blockno-th filesystem block. This is necessary + because the block runs map variable amounts of data. Simply + keeps a count of the number of blocks searched so far (sum), + incrementing this by the length of each block run as we come + across it. Adds sum to *count before returning (this is so + you can search multiple arrays that are logicaly one array, + as in the indirect region code). + + When/if blockno is found, if blockno is inside of a block + run as stored on disk, we offset the start and length members + of the block run, so that blockno is the start and len is + still valid (the run ends in the same place). 
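+
+   A hypothetical example with made-up numbers: if direct[0] maps 8 blocks
+   and direct[1] maps 12, then file block 10 falls inside direct[1]
+   (8 <= 10 < 20) with offset = 2.  The run passed back starts two blocks
+   into direct[1] and its length is reported as 12 - 2 = 10, so it still
+   ends exactly where the on-disk run ends.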
+ + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t blockno, befs_block_run * run) +{ + int i; + befs_block_run *array = data->direct; + befs_blocknr_t sum; + befs_blocknr_t max_block = + data->max_direct_range >> BEFS_SB(sb)->block_shift; + + befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); + + if (blockno > max_block) { + befs_error(sb, "befs_find_brun_direct() passed block outside of" + "direct region"); + return BEFS_ERR; + } + + for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS; + sum += array[i].len, i++) { + if (blockno >= sum && blockno < sum + (array[i].len)) { + int offset = blockno - sum; + run->allocation_group = array[i].allocation_group; + run->start = array[i].start + offset; + run->len = array[i].len - offset; + + befs_debug(sb, "---> befs_find_brun_direct(), " + "found %lu at direct[%d]", blockno, i); + return BEFS_OK; + } + } + + befs_debug(sb, "---> befs_find_brun_direct() ERROR"); + return BEFS_ERR; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the indirect region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + For each block in the indirect run of the datastream, read + it in and search through it for search_blk. + + XXX: + Really should check to make sure blockno is inside indirect + region. + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_indirect(struct super_block *sb, + befs_data_stream * data, befs_blocknr_t blockno, + befs_block_run * run) +{ + int i, j; + befs_blocknr_t sum = 0; + befs_blocknr_t indir_start_blk; + befs_blocknr_t search_blk; + struct buffer_head *indirblock; + befs_disk_block_run *array; + + befs_block_run indirect = data->indirect; + befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); + int arraylen = befs_iaddrs_per_block(sb); + + befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); + + indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; + search_blk = blockno - indir_start_blk; + + /* Examine blocks of the indirect run one at a time */ + for (i = 0; i < indirect.len; i++) { + indirblock = befs_bread(sb, indirblockno + i); + if (indirblock == NULL) { + befs_debug(sb, + "---> befs_find_brun_indirect() failed to " + "read disk block %lu from the indirect brun", + indirblockno + i); + return BEFS_ERR; + } + + array = (befs_disk_block_run *) indirblock->b_data; + + for (j = 0; j < arraylen; ++j) { + int len = fs16_to_cpu(sb, array[j].len); + + if (search_blk >= sum && search_blk < sum + len) { + int offset = search_blk - sum; + run->allocation_group = + fs32_to_cpu(sb, array[j].allocation_group); + run->start = + fs16_to_cpu(sb, array[j].start) + offset; + run->len = + fs16_to_cpu(sb, array[j].len) - offset; + + brelse(indirblock); + befs_debug(sb, + "<--- befs_find_brun_indirect() found " + "file block %lu at indirect[%d]", + blockno, j + (i * arraylen)); + return BEFS_OK; + } + sum += len; + } + + brelse(indirblock); + } + + /* Only fallthrough is an error */ + befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " + "file block %lu", blockno); + + befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); + return BEFS_ERR; +} + +/* + Finds the block run that starts at file block number blockno + 
in the file represented by the datastream data, if that + blockno is in the double-indirect region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + The block runs in the double-indirect region are different. + They are always allocated 4 fs blocks at a time, so each + block run maps a constant amount of file data. This means + that we can directly calculate how many block runs into the + double-indirect region we need to go to get to the one that + maps a particular filesystem block. + + We do this in two stages. First we calculate which of the + inode addresses in the double-indirect block will point us + to the indirect block that contains the mapping for the data, + then we calculate which of the inode addresses in that + indirect block maps the data block we are after. + + Oh, and once we've done that, we actually read in the blocks + that contain the inode addresses we calculated above. Even + though the double-indirect run may be several blocks long, + we can calculate which of those blocks will contain the index + we are after and only read that one. We then follow it to + the indirect block and perform a similar process to find + the actual block run that maps the data block we are interested + in. + + Then we offset the run as in befs_find_brun_array() and we are + done. + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_dblindirect(struct super_block *sb, + befs_data_stream * data, befs_blocknr_t blockno, + befs_block_run * run) +{ + int dblindir_indx; + int indir_indx; + int offset; + int dbl_which_block; + int which_block; + int dbl_block_indx; + int block_indx; + off_t dblindir_leftover; + befs_blocknr_t blockno_at_run_start; + struct buffer_head *dbl_indir_block; + struct buffer_head *indir_block; + befs_block_run indir_run; + befs_disk_inode_addr *iaddr_array = NULL; + befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_blocknr_t indir_start_blk = + data->max_indirect_range >> befs_sb->block_shift; + + off_t dbl_indir_off = blockno - indir_start_blk; + + /* number of data blocks mapped by each of the iaddrs in + * the indirect block pointed to by the double indirect block + */ + size_t iblklen = BEFS_DBLINDIR_BRUN_LEN; + + /* number of data blocks mapped by each of the iaddrs in + * the double indirect block + */ + size_t diblklen = iblklen * befs_iaddrs_per_block(sb) + * BEFS_DBLINDIR_BRUN_LEN; + + befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); + + /* First, discover which of the double_indir->indir blocks + * contains pos. Then figure out how much of pos that + * accounted for. Then discover which of the iaddrs in + * the indirect block contains pos. 
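+ *
+ * To make the sizes concrete (assuming, for illustration, a 1 KiB
+ * block size and an 8-byte on-disk inode address, so 128 iaddrs fit
+ * in one block): each iaddr inside an indirect block then maps
+ * iblklen = 4 file blocks, and each iaddr inside the double-indirect
+ * block maps diblklen = 4 * 128 * 4 = 2048 file blocks.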
+ */ + + dblindir_indx = dbl_indir_off / diblklen; + dblindir_leftover = dbl_indir_off % diblklen; + indir_indx = dblindir_leftover / diblklen; + + /* Read double indirect block */ + dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); + if (dbl_which_block > data->double_indirect.len) { + befs_error(sb, "The double-indirect index calculated by " + "befs_read_brun_dblindirect(), %d, is outside the range " + "of the double-indirect block", dblindir_indx); + return BEFS_ERR; + } + + dbl_indir_block = + befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + dbl_which_block); + if (dbl_indir_block == NULL) { + befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " + "double-indirect block at blockno %lu", + iaddr2blockno(sb, + &data->double_indirect) + + dbl_which_block); + brelse(dbl_indir_block); + return BEFS_ERR; + } + + dbl_block_indx = + dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb)); + iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data; + indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]); + brelse(dbl_indir_block); + iaddr_array = NULL; + + /* Read indirect block */ + which_block = indir_indx / befs_iaddrs_per_block(sb); + if (which_block > indir_run.len) { + befs_error(sb, "The indirect index calculated by " + "befs_read_brun_dblindirect(), %d, is outside the range " + "of the indirect block", indir_indx); + return BEFS_ERR; + } + + indir_block = + befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + if (indir_block == NULL) { + befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " + "indirect block at blockno %lu", + iaddr2blockno(sb, &indir_run) + which_block); + brelse(indir_block); + return BEFS_ERR; + } + + block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb)); + iaddr_array = (befs_disk_inode_addr *) indir_block->b_data; + *run = fsrun_to_cpu(sb, iaddr_array[block_indx]); + brelse(indir_block); + iaddr_array = NULL; + + blockno_at_run_start = indir_start_blk; + blockno_at_run_start += diblklen * dblindir_indx; + blockno_at_run_start += iblklen * indir_indx; + offset = blockno - blockno_at_run_start; + + run->start += offset; + run->len -= offset; + + befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," + " double_indirect_leftover = %lu", + blockno, dblindir_indx, indir_indx, dblindir_leftover); + + return BEFS_OK; +} diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h new file mode 100644 index 00000000..45e8a3c9 --- /dev/null +++ b/fs/befs/datastream.h @@ -0,0 +1,19 @@ +/* + * datastream.h + * + */ + +struct buffer_head *befs_read_datastream(struct super_block *sb, + befs_data_stream * ds, befs_off_t pos, + uint * off); + +int befs_fblock2brun(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t fblock, befs_block_run * run); + +size_t befs_read_lsymlink(struct super_block *sb, befs_data_stream * data, + void *buff, befs_off_t len); + +befs_blocknr_t befs_count_blocks(struct super_block *sb, befs_data_stream * ds); + +extern const befs_inode_addr BAD_IADDR; + diff --git a/fs/befs/debug.c b/fs/befs/debug.c new file mode 100644 index 00000000..622e7377 --- /dev/null +++ b/fs/befs/debug.c @@ -0,0 +1,282 @@ +/* + * linux/fs/befs/debug.c + * + * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com) + * + * With help from the ntfs-tng driver by Anton Altparmakov + * + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + * + * debug functions + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#endif /* __KERNEL__ 
*/ + +#include "befs.h" + +#define ERRBUFSIZE 1024 + +void +befs_error(const struct super_block *sb, const char *fmt, ...) +{ + va_list args; + char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); + if (err_buf == NULL) { + printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); + return; + } + + va_start(args, fmt); + vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + va_end(args); + + printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf); + kfree(err_buf); +} + +void +befs_warning(const struct super_block *sb, const char *fmt, ...) +{ + va_list args; + char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); + if (err_buf == NULL) { + printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); + return; + } + + va_start(args, fmt); + vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + va_end(args); + + printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf); + + kfree(err_buf); +} + +void +befs_debug(const struct super_block *sb, const char *fmt, ...) +{ +#ifdef CONFIG_BEFS_DEBUG + + va_list args; + char *err_buf = NULL; + + if (BEFS_SB(sb)->mount_opts.debug) { + err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); + if (err_buf == NULL) { + printk(KERN_ERR "could not allocate %d bytes\n", + ERRBUFSIZE); + return; + } + + va_start(args, fmt); + vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + va_end(args); + + printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf); + + kfree(err_buf); + } + +#endif //CONFIG_BEFS_DEBUG +} + +void +befs_dump_inode(const struct super_block *sb, befs_inode * inode) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run tmp_run; + + befs_debug(sb, "befs_inode information"); + + befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, inode->magic1)); + + tmp_run = fsrun_to_cpu(sb, inode->inode_num); + befs_debug(sb, " inode_num %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " uid %u", fs32_to_cpu(sb, inode->uid)); + befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); + befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); + befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); + befs_debug(sb, " create_time %Lu", + fs64_to_cpu(sb, inode->create_time)); + befs_debug(sb, " last_modified_time %Lu", + fs64_to_cpu(sb, inode->last_modified_time)); + + tmp_run = fsrun_to_cpu(sb, inode->parent); + befs_debug(sb, " parent [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + tmp_run = fsrun_to_cpu(sb, inode->attributes); + befs_debug(sb, " attributes [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " type %08x", fs32_to_cpu(sb, inode->type)); + befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, inode->inode_size)); + + if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) { + befs_debug(sb, " Symbolic link [%s]", inode->data.symlink); + } else { + int i; + + for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; i++) { + tmp_run = + fsrun_to_cpu(sb, inode->data.datastream.direct[i]); + befs_debug(sb, " direct %d [%u, %hu, %hu]", i, + tmp_run.allocation_group, tmp_run.start, + tmp_run.len); + } + befs_debug(sb, " max_direct_range %Lu", + fs64_to_cpu(sb, + inode->data.datastream. + max_direct_range)); + + tmp_run = fsrun_to_cpu(sb, inode->data.datastream.indirect); + befs_debug(sb, " indirect [%u, %hu, %hu]", + tmp_run.allocation_group, + tmp_run.start, tmp_run.len); + + befs_debug(sb, " max_indirect_range %Lu", + fs64_to_cpu(sb, + inode->data.datastream. 
+ max_indirect_range)); + + tmp_run = + fsrun_to_cpu(sb, inode->data.datastream.double_indirect); + befs_debug(sb, " double indirect [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, + tmp_run.len); + + befs_debug(sb, " max_double_indirect_range %Lu", + fs64_to_cpu(sb, + inode->data.datastream. + max_double_indirect_range)); + + befs_debug(sb, " size %Lu", + fs64_to_cpu(sb, inode->data.datastream.size)); + } + +#endif //CONFIG_BEFS_DEBUG +} + +/* + * Display super block structure for debug. + */ + +void +befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run tmp_run; + + befs_debug(sb, "befs_super_block information"); + + befs_debug(sb, " name %s", sup->name); + befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, sup->magic1)); + befs_debug(sb, " fs_byte_order %08x", + fs32_to_cpu(sb, sup->fs_byte_order)); + + befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); + befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); + + befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); + befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); + + befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); + befs_debug(sb, " blocks_per_ag %u", + fs32_to_cpu(sb, sup->blocks_per_ag)); + befs_debug(sb, " ag_shift %u", fs32_to_cpu(sb, sup->ag_shift)); + befs_debug(sb, " num_ags %u", fs32_to_cpu(sb, sup->num_ags)); + + befs_debug(sb, " flags %08x", fs32_to_cpu(sb, sup->flags)); + + tmp_run = fsrun_to_cpu(sb, sup->log_blocks); + befs_debug(sb, " log_blocks %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); + befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); + + befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); + + tmp_run = fsrun_to_cpu(sb, sup->root_dir); + befs_debug(sb, " root_dir %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + tmp_run = fsrun_to_cpu(sb, sup->indices); + befs_debug(sb, " indices %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + +#endif //CONFIG_BEFS_DEBUG +} + +#if 0 +/* unused */ +void +befs_dump_small_data(const struct super_block *sb, befs_small_data * sd) +{ +} + +/* unused */ +void +befs_dump_run(const struct super_block *sb, befs_disk_block_run run) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run n = fsrun_to_cpu(sb, run); + + befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len); + +#endif //CONFIG_BEFS_DEBUG +} +#endif /* 0 */ + +void +befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_debug(sb, "Btree super structure"); + befs_debug(sb, " magic %08x", fs32_to_cpu(sb, super->magic)); + befs_debug(sb, " node_size %u", fs32_to_cpu(sb, super->node_size)); + befs_debug(sb, " max_depth %08x", fs32_to_cpu(sb, super->max_depth)); + + befs_debug(sb, " data_type %08x", fs32_to_cpu(sb, super->data_type)); + befs_debug(sb, " root_node_pointer %016LX", + fs64_to_cpu(sb, super->root_node_ptr)); + befs_debug(sb, " free_node_pointer %016LX", + fs64_to_cpu(sb, super->free_node_ptr)); + befs_debug(sb, " maximum size %016LX", + fs64_to_cpu(sb, super->max_size)); + +#endif //CONFIG_BEFS_DEBUG +} + +void +befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_debug(sb, "Btree node structure"); + befs_debug(sb, " left %016LX", fs64_to_cpu(sb, node->left)); 
+ befs_debug(sb, " right %016LX", fs64_to_cpu(sb, node->right)); + befs_debug(sb, " overflow %016LX", fs64_to_cpu(sb, node->overflow)); + befs_debug(sb, " all_key_count %hu", + fs16_to_cpu(sb, node->all_key_count)); + befs_debug(sb, " all_key_length %hu", + fs16_to_cpu(sb, node->all_key_length)); + +#endif //CONFIG_BEFS_DEBUG +} diff --git a/fs/befs/endian.h b/fs/befs/endian.h new file mode 100644 index 00000000..27223878 --- /dev/null +++ b/fs/befs/endian.h @@ -0,0 +1,125 @@ +/* + * linux/fs/befs/endian.h + * + * Copyright (C) 2001 Will Dyson + * + * Partially based on similar funtions in the sysv driver. + */ + +#ifndef LINUX_BEFS_ENDIAN +#define LINUX_BEFS_ENDIAN + +#include + +static inline u64 +fs64_to_cpu(const struct super_block *sb, fs64 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline fs64 +cpu_to_fs64(const struct super_block *sb, u64 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs64)cpu_to_le64(n); + else + return (__force fs64)cpu_to_be64(n); +} + +static inline u32 +fs32_to_cpu(const struct super_block *sb, fs32 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline fs32 +cpu_to_fs32(const struct super_block *sb, u32 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs32)cpu_to_le32(n); + else + return (__force fs32)cpu_to_be32(n); +} + +static inline u16 +fs16_to_cpu(const struct super_block *sb, fs16 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline fs16 +cpu_to_fs16(const struct super_block *sb, u16 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs16)cpu_to_le16(n); + else + return (__force fs16)cpu_to_be16(n); +} + +/* Composite types below here */ + +static inline befs_block_run +fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n) +{ + befs_block_run run; + + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { + run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group); + run.start = le16_to_cpu((__force __le16)n.start); + run.len = le16_to_cpu((__force __le16)n.len); + } else { + run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group); + run.start = be16_to_cpu((__force __be16)n.start); + run.len = be16_to_cpu((__force __be16)n.len); + } + return run; +} + +static inline befs_disk_block_run +cpu_to_fsrun(const struct super_block *sb, befs_block_run n) +{ + befs_disk_block_run run; + + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { + run.allocation_group = cpu_to_le32(n.allocation_group); + run.start = cpu_to_le16(n.start); + run.len = cpu_to_le16(n.len); + } else { + run.allocation_group = cpu_to_be32(n.allocation_group); + run.start = cpu_to_be16(n.start); + run.len = cpu_to_be16(n.len); + } + return run; +} + +static inline befs_data_stream +fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) +{ + befs_data_stream data; + int i; + + for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) + data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); + + data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range); + data.indirect = fsrun_to_cpu(sb, n->indirect); + data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range); + data.double_indirect = fsrun_to_cpu(sb, n->double_indirect); + 
data.max_double_indirect_range = fs64_to_cpu(sb, + n-> + max_double_indirect_range); + data.size = fs64_to_cpu(sb, n->size); + + return data; +} + +#endif //LINUX_BEFS_ENDIAN diff --git a/fs/befs/inode.c b/fs/befs/inode.c new file mode 100644 index 00000000..94c17f9a --- /dev/null +++ b/fs/befs/inode.c @@ -0,0 +1,52 @@ +/* + * inode.c + * + * Copyright (C) 2001 Will Dyson + */ + +#include + +#include "befs.h" +#include "inode.h" + +/* + Validates the correctness of the befs inode + Returns BEFS_OK if the inode should be used, otherwise + returns BEFS_BAD_INODE +*/ +int +befs_check_inode(struct super_block *sb, befs_inode * raw_inode, + befs_blocknr_t inode) +{ + u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1); + befs_inode_addr ino_num = fsrun_to_cpu(sb, raw_inode->inode_num); + u32 flags = fs32_to_cpu(sb, raw_inode->flags); + + /* check magic header. */ + if (magic1 != BEFS_INODE_MAGIC1) { + befs_error(sb, + "Inode has a bad magic header - inode = %lu", inode); + return BEFS_BAD_INODE; + } + + /* + * Sanity check2: inodes store their own block address. Check it. + */ + if (inode != iaddr2blockno(sb, &ino_num)) { + befs_error(sb, "inode blocknr field disagrees with vfs " + "VFS: %lu, Inode %lu", + inode, iaddr2blockno(sb, &ino_num)); + return BEFS_BAD_INODE; + } + + /* + * check flag + */ + + if (!(flags & BEFS_INODE_IN_USE)) { + befs_error(sb, "inode is not used - inode = %lu", inode); + return BEFS_BAD_INODE; + } + + return BEFS_OK; +} diff --git a/fs/befs/inode.h b/fs/befs/inode.h new file mode 100644 index 00000000..9dc7fd9b --- /dev/null +++ b/fs/befs/inode.h @@ -0,0 +1,8 @@ +/* + * inode.h + * + */ + +int befs_check_inode(struct super_block *sb, befs_inode * raw_inode, + befs_blocknr_t inode); + diff --git a/fs/befs/io.c b/fs/befs/io.c new file mode 100644 index 00000000..ddef98aa --- /dev/null +++ b/fs/befs/io.c @@ -0,0 +1,83 @@ +/* + * linux/fs/befs/io.c + * + * Copyright (C) 2001 Will Dyson + +#include "befs.h" +#include "io.h" + +/* + * Converts befs notion of disk addr to a disk offset and uses + * linux kernel function sb_bread() to get the buffer containing + * the offset. 
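+ *
+ * The conversion itself is roughly (allocation_group << ag_shift) + start:
+ * an inode address names an allocation group plus a starting block within
+ * that group, and iaddr2blockno() performs this arithmetic for us.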
-Will Dyson + * + */ + +struct buffer_head * +befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) +{ + struct buffer_head *bh = NULL; + befs_blocknr_t block = 0; + befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_debug(sb, "---> Enter befs_read_iaddr() " + "[%u, %hu, %hu]", + iaddr.allocation_group, iaddr.start, iaddr.len); + + if (iaddr.allocation_group > befs_sb->num_ags) { + befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", + iaddr.allocation_group, befs_sb->num_ags); + goto error; + } + + block = iaddr2blockno(sb, &iaddr); + + befs_debug(sb, "befs_read_iaddr: offset = %lu", block); + + bh = sb_bread(sb, block); + + if (bh == NULL) { + befs_error(sb, "Failed to read block %lu", block); + goto error; + } + + befs_debug(sb, "<--- befs_read_iaddr()"); + return bh; + + error: + befs_debug(sb, "<--- befs_read_iaddr() ERROR"); + return NULL; +} + +struct buffer_head * +befs_bread(struct super_block *sb, befs_blocknr_t block) +{ + struct buffer_head *bh = NULL; + + befs_debug(sb, "---> Enter befs_read() %Lu", block); + + bh = sb_bread(sb, block); + + if (bh == NULL) { + befs_error(sb, "Failed to read block %lu", block); + goto error; + } + + befs_debug(sb, "<--- befs_read()"); + + return bh; + + error: + befs_debug(sb, "<--- befs_read() ERROR"); + return NULL; +} diff --git a/fs/befs/io.h b/fs/befs/io.h new file mode 100644 index 00000000..9b78266b --- /dev/null +++ b/fs/befs/io.h @@ -0,0 +1,9 @@ +/* + * io.h + */ + +struct buffer_head *befs_bread_iaddr(struct super_block *sb, + befs_inode_addr iaddr); + +struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); + diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c new file mode 100644 index 00000000..720d885e --- /dev/null +++ b/fs/befs/linuxvfs.c @@ -0,0 +1,979 @@ +/* + * linux/fs/befs/linuxvfs.c + * + * Copyright (C) 2001 Will Dyson +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "befs.h" +#include "btree.h" +#include "inode.h" +#include "datastream.h" +#include "super.h" +#include "io.h" + +MODULE_DESCRIPTION("BeOS File System (BeFS) driver"); +MODULE_AUTHOR("Will Dyson"); +MODULE_LICENSE("GPL"); + +/* The units the vfs expects inode->i_blocks to be in */ +#define VFS_BLOCK_SIZE 512 + +static int befs_readdir(struct file *, void *, filldir_t); +static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); +static int befs_readpage(struct file *file, struct page *page); +static sector_t befs_bmap(struct address_space *mapping, sector_t block); +static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct inode *befs_iget(struct super_block *, unsigned long); +static struct inode *befs_alloc_inode(struct super_block *sb); +static void befs_destroy_inode(struct inode *inode); +static int befs_init_inodecache(void); +static void befs_destroy_inodecache(void); +static void *befs_follow_link(struct dentry *, struct nameidata *); +static void befs_put_link(struct dentry *, struct nameidata *, void *); +static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, + char **out, int *out_len); +static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, + char **out, int *out_len); +static void befs_put_super(struct super_block *); +static int befs_remount(struct super_block *, int *, char *); +static int befs_statfs(struct dentry *, struct kstatfs *); +static int parse_options(char *, befs_mount_options *); + +static const struct super_operations befs_sops = { + 
.alloc_inode = befs_alloc_inode, /* allocate a new inode */ + .destroy_inode = befs_destroy_inode, /* deallocate an inode */ + .put_super = befs_put_super, /* uninit super */ + .statfs = befs_statfs, /* statfs */ + .remount_fs = befs_remount, + .show_options = generic_show_options, +}; + +/* slab cache for befs_inode_info objects */ +static struct kmem_cache *befs_inode_cachep; + +static const struct file_operations befs_dir_operations = { + .read = generic_read_dir, + .readdir = befs_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations befs_dir_inode_operations = { + .lookup = befs_lookup, +}; + +static const struct address_space_operations befs_aops = { + .readpage = befs_readpage, + .bmap = befs_bmap, +}; + +static const struct inode_operations befs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = befs_follow_link, + .put_link = befs_put_link, +}; + +/* + * Called by generic_file_read() to read a page of data + * + * In turn, simply calls a generic block read function and + * passes it the address of befs_get_block, for mapping file + * positions to disk blocks. + */ +static int +befs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, befs_get_block); +} + +static sector_t +befs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, befs_get_block); +} + +/* + * Generic function to map a file position (block) to a + * disk offset (passed back in bh_result). + * + * Used by many higher level functions. + * + * Calls befs_fblock2brun() in datastream.c to do the real work. + * + * -WD 10-26-01 + */ + +static int +befs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + befs_block_run run = BAD_IADDR; + int res = 0; + ulong disk_off; + + befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", + inode->i_ino, block); + + if (block < 0) { + befs_error(sb, "befs_get_block() was asked for a block " + "number less than zero: block %ld in inode %lu", + block, inode->i_ino); + return -EIO; + } + + if (create) { + befs_error(sb, "befs_get_block() was asked to write to " + "block %ld in inode %lu", block, inode->i_ino); + return -EPERM; + } + + res = befs_fblock2brun(sb, ds, block, &run); + if (res != BEFS_OK) { + befs_error(sb, + "<--- befs_get_block() for inode %lu, block " + "%ld ERROR", inode->i_ino, block); + return -EFBIG; + } + + disk_off = (ulong) iaddr2blockno(sb, &run); + + map_bh(bh_result, inode->i_sb, disk_off); + + befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " + "disk address %lu", inode->i_ino, block, disk_off); + + return 0; +} + +static struct dentry * +befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = NULL; + struct super_block *sb = dir->i_sb; + befs_data_stream *ds = &BEFS_I(dir)->i_data.ds; + befs_off_t offset; + int ret; + int utfnamelen; + char *utfname; + const char *name = dentry->d_name.name; + + befs_debug(sb, "---> befs_lookup() " + "name %s inode %ld", dentry->d_name.name, dir->i_ino); + + /* Convert to UTF-8 */ + if (BEFS_SB(sb)->nls) { + ret = + befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); + if (ret < 0) { + befs_debug(sb, "<--- befs_lookup() ERROR"); + return ERR_PTR(ret); + } + ret = befs_btree_find(sb, ds, utfname, &offset); + kfree(utfname); + + } else { + ret = befs_btree_find(sb, ds, 
dentry->d_name.name, &offset); + } + + if (ret == BEFS_BT_NOT_FOUND) { + befs_debug(sb, "<--- befs_lookup() %s not found", + dentry->d_name.name); + return ERR_PTR(-ENOENT); + + } else if (ret != BEFS_OK || offset == 0) { + befs_warning(sb, "<--- befs_lookup() Error"); + return ERR_PTR(-ENODATA); + } + + inode = befs_iget(dir->i_sb, (ino_t) offset); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + d_add(dentry, inode); + + befs_debug(sb, "<--- befs_lookup()"); + + return NULL; +} + +static int +befs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + befs_off_t value; + int result; + size_t keysize; + unsigned char d_type; + char keybuf[BEFS_NAME_LEN + 1]; + char *nlsname; + int nlsnamelen; + const char *dirname = filp->f_path.dentry->d_name.name; + + befs_debug(sb, "---> befs_readdir() " + "name %s, inode %ld, filp->f_pos %Ld", + dirname, inode->i_ino, filp->f_pos); + + result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1, + keybuf, &keysize, &value); + + if (result == BEFS_ERR) { + befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_error(sb, "IO error reading %s (inode %lu)", + dirname, inode->i_ino); + return -EIO; + + } else if (result == BEFS_BT_END) { + befs_debug(sb, "<--- befs_readdir() END"); + return 0; + + } else if (result == BEFS_BT_EMPTY) { + befs_debug(sb, "<--- befs_readdir() Empty directory"); + return 0; + } + + d_type = DT_UNKNOWN; + + /* Convert to NLS */ + if (BEFS_SB(sb)->nls) { + result = + befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); + if (result < 0) { + befs_debug(sb, "<--- befs_readdir() ERROR"); + return result; + } + result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos, + (ino_t) value, d_type); + kfree(nlsname); + + } else { + result = filldir(dirent, keybuf, keysize, filp->f_pos, + (ino_t) value, d_type); + } + + filp->f_pos++; + + befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); + + return 0; +} + +static struct inode * +befs_alloc_inode(struct super_block *sb) +{ + struct befs_inode_info *bi; + bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep, + GFP_KERNEL); + if (!bi) + return NULL; + return &bi->vfs_inode; +} + +static void befs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); +} + +static void befs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, befs_i_callback); +} + +static void init_once(void *foo) +{ + struct befs_inode_info *bi = (struct befs_inode_info *) foo; + + inode_init_once(&bi->vfs_inode); +} + +static struct inode *befs_iget(struct super_block *sb, unsigned long ino) +{ + struct buffer_head *bh = NULL; + befs_inode *raw_inode = NULL; + + befs_sb_info *befs_sb = BEFS_SB(sb); + befs_inode_info *befs_ino = NULL; + struct inode *inode; + long ret = -EIO; + + befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); + + inode = iget_locked(sb, ino); + if (IS_ERR(inode)) + return inode; + if (!(inode->i_state & I_NEW)) + return inode; + + befs_ino = BEFS_I(inode); + + /* convert from vfs's inode number to befs's inode number */ + befs_ino->i_inode_num = blockno2iaddr(sb, inode->i_ino); + + befs_debug(sb, " real inode number [%u, %hu, %hu]", + befs_ino->i_inode_num.allocation_group, + befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); + + bh = befs_bread(sb, 
inode->i_ino); + if (!bh) { + befs_error(sb, "unable to read inode block - " + "inode = %lu", inode->i_ino); + goto unacquire_none; + } + + raw_inode = (befs_inode *) bh->b_data; + + befs_dump_inode(sb, raw_inode); + + if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) { + befs_error(sb, "Bad inode: %lu", inode->i_ino); + goto unacquire_bh; + } + + inode->i_mode = (umode_t) fs32_to_cpu(sb, raw_inode->mode); + + /* + * set uid and gid. But since current BeOS is single user OS, so + * you can change by "uid" or "gid" options. + */ + + inode->i_uid = befs_sb->mount_opts.use_uid ? + befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid); + inode->i_gid = befs_sb->mount_opts.use_gid ? + befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); + + inode->i_nlink = 1; + + /* + * BEFS's time is 64 bits, but current VFS is 32 bits... + * BEFS don't have access time. Nor inode change time. VFS + * doesn't have creation time. + * Also, the lower 16 bits of the last_modified_time and + * create_time are just a counter to help ensure uniqueness + * for indexing purposes. (PFD, page 54) + */ + + inode->i_mtime.tv_sec = + fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; + inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ + inode->i_ctime = inode->i_mtime; + inode->i_atime = inode->i_mtime; + + befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); + befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); + befs_ino->i_attribute = fsrun_to_cpu(sb, raw_inode->attributes); + befs_ino->i_flags = fs32_to_cpu(sb, raw_inode->flags); + + if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){ + inode->i_size = 0; + inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; + strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, + BEFS_SYMLINK_LEN - 1); + befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0'; + } else { + int num_blks; + + befs_ino->i_data.ds = + fsds_to_cpu(sb, &raw_inode->data.datastream); + + num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); + inode->i_blocks = + num_blks * (befs_sb->block_size / VFS_BLOCK_SIZE); + inode->i_size = befs_ino->i_data.ds.size; + } + + inode->i_mapping->a_ops = &befs_aops; + + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &befs_dir_inode_operations; + inode->i_fop = &befs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &befs_symlink_inode_operations; + } else { + befs_error(sb, "Inode %lu is not a regular file, " + "directory or symlink. THAT IS WRONG! BeFS has no " + "on disk special files", inode->i_ino); + goto unacquire_bh; + } + + brelse(bh); + befs_debug(sb, "<--- befs_read_inode()"); + unlock_new_inode(inode); + return inode; + + unacquire_bh: + brelse(bh); + + unacquire_none: + iget_failed(inode); + befs_debug(sb, "<--- befs_read_inode() - Bad inode"); + return ERR_PTR(ret); +} + +/* Initialize the inode cache. Called at fs setup. + * + * Taken from NFS implementation by Al Viro. + */ +static int +befs_init_inodecache(void) +{ + befs_inode_cachep = kmem_cache_create("befs_inode_cache", + sizeof (struct befs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (befs_inode_cachep == NULL) { + printk(KERN_ERR "befs_init_inodecache: " + "Couldn't initialize inode slabcache\n"); + return -ENOMEM; + } + + return 0; +} + +/* Called at fs teardown. + * + * Taken from NFS implementation by Al Viro. 
+ */ +static void +befs_destroy_inodecache(void) +{ + kmem_cache_destroy(befs_inode_cachep); +} + +/* + * The inode of symbolic link is different to data stream. + * The data stream become link name. Unless the LONG_SYMLINK + * flag is set. + */ +static void * +befs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + char *link; + + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { + struct super_block *sb = dentry->d_sb; + befs_data_stream *data = &befs_ino->i_data.ds; + befs_off_t len = data->size; + + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); + link = ERR_PTR(-EIO); + } else { + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } + } + } else { + link = befs_ino->i_data.symlink; + } + + nd_set_link(nd, link); + return NULL; +} + +static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); + } +} + +/* + * UTF-8 to NLS charset convert routine + * + * + * Changed 8/10/01 by Will Dyson. Now use uni2char() / char2uni() rather than + * the nls tables directly + */ + +static int +befs_utf2nls(struct super_block *sb, const char *in, + int in_len, char **out, int *out_len) +{ + struct nls_table *nls = BEFS_SB(sb)->nls; + int i, o; + unicode_t uni; + int unilen, utflen; + char *result; + /* The utf8->nls conversion won't make the final nls string bigger + * than the utf one, but if the string is pure ascii they'll have the + * same width and an extra char is needed to save the additional \0 + */ + int maxlen = in_len + 1; + + befs_debug(sb, "---> utf2nls()"); + + if (!nls) { + befs_error(sb, "befs_utf2nls called with no NLS table loaded"); + return -EINVAL; + } + + *out = result = kmalloc(maxlen, GFP_NOFS); + if (!*out) { + befs_error(sb, "befs_utf2nls() cannot allocate memory"); + *out_len = 0; + return -ENOMEM; + } + + for (i = o = 0; i < in_len; i += utflen, o += unilen) { + + /* convert from UTF-8 to Unicode */ + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) + goto conv_err; + + /* convert from Unicode to nls */ + if (uni > MAX_WCHAR_T) + goto conv_err; + unilen = nls->uni2char(uni, &result[o], in_len - o); + if (unilen < 0) + goto conv_err; + } + result[o] = '\0'; + *out_len = o; + + befs_debug(sb, "<--- utf2nls()"); + + return o; + + conv_err: + befs_error(sb, "Name using character set %s contains a character that " + "cannot be converted to unicode.", nls->charset); + befs_debug(sb, "<--- utf2nls()"); + kfree(result); + return -EILSEQ; +} + +/** + * befs_nls2utf - Convert NLS string to utf8 encodeing + * @sb: Superblock + * @src: Input string buffer in NLS format + * @srclen: Length of input string in bytes + * @dest: The output string in UTF-8 format + * @destlen: Length of the output buffer + * + * Converts input string @src, which is in the format of the loaded NLS map, + * into a utf8 string. + * + * The destination string @dest is allocated by this function and the caller is + * responsible for freeing it with kfree() + * + * On return, *@destlen is the length of @dest in bytes. 
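+ *
+ * A minimal sketch of a caller, following befs_lookup() above (variable
+ * names taken from that caller):
+ *
+ *	char *utfname;
+ *	int utfnamelen;
+ *
+ *	if (befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen) >= 0) {
+ *		... the buffer is kmalloc()ed for us, so use utfname ...
+ *		kfree(utfname);
+ *	}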
+ * + * On success, the return value is the number of utf8 characters written to + * the output buffer @dest. + * + * On Failure, a negative number coresponding to the error code is returned. + */ + +static int +befs_nls2utf(struct super_block *sb, const char *in, + int in_len, char **out, int *out_len) +{ + struct nls_table *nls = BEFS_SB(sb)->nls; + int i, o; + wchar_t uni; + int unilen, utflen; + char *result; + /* There're nls characters that will translate to 3-chars-wide UTF-8 + * characters, a additional byte is needed to save the final \0 + * in special cases */ + int maxlen = (3 * in_len) + 1; + + befs_debug(sb, "---> nls2utf()\n"); + + if (!nls) { + befs_error(sb, "befs_nls2utf called with no NLS table loaded."); + return -EINVAL; + } + + *out = result = kmalloc(maxlen, GFP_NOFS); + if (!*out) { + befs_error(sb, "befs_nls2utf() cannot allocate memory"); + *out_len = 0; + return -ENOMEM; + } + + for (i = o = 0; i < in_len; i += unilen, o += utflen) { + + /* convert from nls to unicode */ + unilen = nls->char2uni(&in[i], in_len - i, &uni); + if (unilen < 0) + goto conv_err; + + /* convert from unicode to UTF-8 */ + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) + goto conv_err; + } + + result[o] = '\0'; + *out_len = o; + + befs_debug(sb, "<--- nls2utf()"); + + return i; + + conv_err: + befs_error(sb, "Name using charecter set %s contains a charecter that " + "cannot be converted to unicode.", nls->charset); + befs_debug(sb, "<--- nls2utf()"); + kfree(result); + return -EILSEQ; +} + +/** + * Use the + * + */ +enum { + Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, +}; + +static const match_table_t befs_tokens = { + {Opt_uid, "uid=%d"}, + {Opt_gid, "gid=%d"}, + {Opt_charset, "iocharset=%s"}, + {Opt_debug, "debug"}, + {Opt_err, NULL} +}; + +static int +parse_options(char *options, befs_mount_options * opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + /* Initialize options */ + opts->uid = 0; + opts->gid = 0; + opts->use_uid = 0; + opts->use_gid = 0; + opts->iocharset = NULL; + opts->debug = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, befs_tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) { + printk(KERN_ERR "BeFS: Invalid uid %d, " + "using default\n", option); + break; + } + opts->uid = option; + opts->use_uid = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) { + printk(KERN_ERR "BeFS: Invalid gid %d, " + "using default\n", option); + break; + } + opts->gid = option; + opts->use_gid = 1; + break; + case Opt_charset: + kfree(opts->iocharset); + opts->iocharset = match_strdup(&args[0]); + if (!opts->iocharset) { + printk(KERN_ERR "BeFS: allocation failure for " + "iocharset string\n"); + return 0; + } + break; + case Opt_debug: + opts->debug = 1; + break; + default: + printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* This function has the responsibiltiy of getting the + * filesystem ready for unmounting. 
+ * Basically, we free everything that we allocated in + * befs_read_inode + */ +static void +befs_put_super(struct super_block *sb) +{ + kfree(BEFS_SB(sb)->mount_opts.iocharset); + BEFS_SB(sb)->mount_opts.iocharset = NULL; + unload_nls(BEFS_SB(sb)->nls); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +/* Allocate private field of the superblock, fill it. + * + * Finish filling the public superblock fields + * Make the root directory + * Load a set of NLS translations if needed. + */ +static int +befs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh; + befs_sb_info *befs_sb; + befs_super_block *disk_sb; + struct inode *root; + long ret = -EINVAL; + const unsigned long sb_block = 0; + const off_t x86_sb_off = 512; + + save_mount_options(sb, data); + + sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + printk(KERN_ERR + "BeFS(%s): Unable to allocate memory for private " + "portion of superblock. Bailing.\n", sb->s_id); + goto unacquire_none; + } + befs_sb = BEFS_SB(sb); + memset(befs_sb, 0, sizeof(befs_sb_info)); + + if (!parse_options((char *) data, &befs_sb->mount_opts)) { + befs_error(sb, "cannot parse mount options"); + goto unacquire_priv_sbp; + } + + befs_debug(sb, "---> befs_fill_super()"); + +#ifndef CONFIG_BEFS_RW + if (!(sb->s_flags & MS_RDONLY)) { + befs_warning(sb, + "No write support. Marking filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } +#endif /* CONFIG_BEFS_RW */ + + /* + * Set dummy blocksize to read super block. + * Will be set to real fs blocksize later. + * + * Linux 2.4.10 and later refuse to read blocks smaller than + * the hardsect size for the device. But we also need to read at + * least 1k to get the second 512 bytes of the volume. + * -WD 10-26-01 + */ + sb_min_blocksize(sb, 1024); + + if (!(bh = sb_bread(sb, sb_block))) { + befs_error(sb, "unable to read superblock"); + goto unacquire_priv_sbp; + } + + /* account for offset of super block on x86 */ + disk_sb = (befs_super_block *) bh->b_data; + if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) || + (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) { + befs_debug(sb, "Using PPC superblock location"); + } else { + befs_debug(sb, "Using x86 superblock location"); + disk_sb = + (befs_super_block *) ((void *) bh->b_data + x86_sb_off); + } + + if (befs_load_sb(sb, disk_sb) != BEFS_OK) + goto unacquire_bh; + + befs_dump_super_block(sb, disk_sb); + + brelse(bh); + + if (befs_check_sb(sb) != BEFS_OK) + goto unacquire_priv_sbp; + + if( befs_sb->num_blocks > ~((sector_t)0) ) { + befs_error(sb, "blocks count: %Lu " + "is larger than the host can use", + befs_sb->num_blocks); + goto unacquire_priv_sbp; + } + + /* + * set up enough so that it can read an inode + * Fill in kernel superblock fields from private sb + */ + sb->s_magic = BEFS_SUPER_MAGIC; + /* Set real blocksize of fs */ + sb_set_blocksize(sb, (ulong) befs_sb->block_size); + sb->s_op = &befs_sops; + root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto unacquire_priv_sbp; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + befs_error(sb, "get root inode failed"); + goto unacquire_priv_sbp; + } + + /* load nls library */ + if (befs_sb->mount_opts.iocharset) { + befs_debug(sb, "Loading nls: %s", + befs_sb->mount_opts.iocharset); + befs_sb->nls = load_nls(befs_sb->mount_opts.iocharset); + if (!befs_sb->nls) { + befs_warning(sb, "Cannot load nls %s" + " loading default nls", + 
befs_sb->mount_opts.iocharset); + befs_sb->nls = load_nls_default(); + } + /* load default nls if none is specified in mount options */ + } else { + befs_debug(sb, "Loading default nls"); + befs_sb->nls = load_nls_default(); + } + + return 0; +/*****************/ + unacquire_bh: + brelse(bh); + + unacquire_priv_sbp: + kfree(befs_sb->mount_opts.iocharset); + kfree(sb->s_fs_info); + + unacquire_none: + sb->s_fs_info = NULL; + return ret; +} + +static int +befs_remount(struct super_block *sb, int *flags, char *data) +{ + if (!(*flags & MS_RDONLY)) + return -EINVAL; + return 0; +} + +static int +befs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + befs_debug(sb, "---> befs_statfs()"); + + buf->f_type = BEFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = BEFS_SB(sb)->num_blocks; + buf->f_bfree = BEFS_SB(sb)->num_blocks - BEFS_SB(sb)->used_blocks; + buf->f_bavail = buf->f_bfree; + buf->f_files = 0; /* UNKNOWN */ + buf->f_ffree = 0; /* UNKNOWN */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = BEFS_NAME_LEN; + + befs_debug(sb, "<--- befs_statfs()"); + + return 0; +} + +static struct dentry * +befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); +} + +static struct file_system_type befs_fs_type = { + .owner = THIS_MODULE, + .name = "befs", + .mount = befs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init +init_befs_fs(void) +{ + int err; + + printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); + + err = befs_init_inodecache(); + if (err) + goto unacquire_none; + + err = register_filesystem(&befs_fs_type); + if (err) + goto unacquire_inodecache; + + return 0; + +unacquire_inodecache: + befs_destroy_inodecache(); + +unacquire_none: + return err; +} + +static void __exit +exit_befs_fs(void) +{ + befs_destroy_inodecache(); + + unregister_filesystem(&befs_fs_type); +} + +/* +Macros that typecheck the init and exit functions, +ensures that they are called at init and cleanup, +and eliminates warnings about unused functions. +*/ +module_init(init_befs_fs) +module_exit(exit_befs_fs) diff --git a/fs/befs/super.c b/fs/befs/super.c new file mode 100644 index 00000000..ca40f828 --- /dev/null +++ b/fs/befs/super.c @@ -0,0 +1,112 @@ +/* + * super.c + * + * Copyright (C) 2001-2002 Will Dyson + * + * Licensed under the GNU GPL. See the file COPYING for details. 
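+ *
+ * befs_load_sb() copies the on-disk superblock into the in-core
+ * befs_sb_info, byteswapping each field as needed; befs_check_sb()
+ * then sanity-checks the result before the mount proceeds.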
+ * + */ + +#include +#include /* for PAGE_SIZE */ + +#include "befs.h" +#include "super.h" + +/** + * load_befs_sb -- Read from disk and properly byteswap all the fields + * of the befs superblock + * + * + * + * + */ +int +befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) +{ + befs_sb_info *befs_sb = BEFS_SB(sb); + + /* Check the byte order of the filesystem */ + if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) + befs_sb->byte_order = BEFS_BYTESEX_LE; + else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) + befs_sb->byte_order = BEFS_BYTESEX_BE; + + befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); + befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); + befs_sb->magic3 = fs32_to_cpu(sb, disk_sb->magic3); + befs_sb->block_size = fs32_to_cpu(sb, disk_sb->block_size); + befs_sb->block_shift = fs32_to_cpu(sb, disk_sb->block_shift); + befs_sb->num_blocks = fs64_to_cpu(sb, disk_sb->num_blocks); + befs_sb->used_blocks = fs64_to_cpu(sb, disk_sb->used_blocks); + befs_sb->inode_size = fs32_to_cpu(sb, disk_sb->inode_size); + + befs_sb->blocks_per_ag = fs32_to_cpu(sb, disk_sb->blocks_per_ag); + befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift); + befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags); + + befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks); + befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start); + befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end); + + befs_sb->root_dir = fsrun_to_cpu(sb, disk_sb->root_dir); + befs_sb->indices = fsrun_to_cpu(sb, disk_sb->indices); + befs_sb->nls = NULL; + + return BEFS_OK; +} + +int +befs_check_sb(struct super_block *sb) +{ + befs_sb_info *befs_sb = BEFS_SB(sb); + + /* Check magic headers of super block */ + if ((befs_sb->magic1 != BEFS_SUPER_MAGIC1) + || (befs_sb->magic2 != BEFS_SUPER_MAGIC2) + || (befs_sb->magic3 != BEFS_SUPER_MAGIC3)) { + befs_error(sb, "invalid magic header"); + return BEFS_ERR; + } + + /* + * Check blocksize of BEFS. + * + * Blocksize of BEFS is 1024, 2048, 4096 or 8192. + */ + + if ((befs_sb->block_size != 1024) + && (befs_sb->block_size != 2048) + && (befs_sb->block_size != 4096) + && (befs_sb->block_size != 8192)) { + befs_error(sb, "invalid blocksize: %u", befs_sb->block_size); + return BEFS_ERR; + } + + if (befs_sb->block_size > PAGE_SIZE) { + befs_error(sb, "blocksize(%u) cannot be larger" + "than system pagesize(%lu)", befs_sb->block_size, + PAGE_SIZE); + return BEFS_ERR; + } + + /* + * block_shift and block_size encode the same information + * in different ways as a consistency check. + */ + + if ((1 << befs_sb->block_shift) != befs_sb->block_size) { + befs_error(sb, "block_shift disagrees with block_size. " + "Corruption likely."); + return BEFS_ERR; + } + + if (befs_sb->log_start != befs_sb->log_end) { + befs_error(sb, "Filesystem not clean! There are blocks in the " + "journal. 
You must boot into BeOS and mount this volume " + "to make it clean."); + return BEFS_ERR; + } + + return BEFS_OK; +} diff --git a/fs/befs/super.h b/fs/befs/super.h new file mode 100644 index 00000000..dc455637 --- /dev/null +++ b/fs/befs/super.h @@ -0,0 +1,8 @@ +/* + * super.h + */ + +int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb); + +int befs_check_sb(struct super_block *sb); + diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig new file mode 100644 index 00000000..c2336c62 --- /dev/null +++ b/fs/bfs/Kconfig @@ -0,0 +1,19 @@ +config BFS_FS + tristate "BFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + Boot File System (BFS) is a file system used under SCO UnixWare to + allow the bootloader access to the kernel image and other important + files during the boot process. It is usually mounted under /stand + and corresponds to the slice marked as "STAND" in the UnixWare + partition. You should say Y if you want to read or write the files + on your /stand slice from within Linux. You then also need to say Y + to "UnixWare slices support", below. More information about the BFS + file system is contained in the file + . + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be called + bfs. Note that the file system of your root partition (the one + containing the directory /) cannot be compiled as a module. diff --git a/fs/bfs/Makefile b/fs/bfs/Makefile new file mode 100644 index 00000000..c787b36d --- /dev/null +++ b/fs/bfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for BFS filesystem. +# + +obj-$(CONFIG_BFS_FS) += bfs.o + +bfs-objs := inode.o file.o dir.o diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h new file mode 100644 index 00000000..f7f87e23 --- /dev/null +++ b/fs/bfs/bfs.h @@ -0,0 +1,59 @@ +/* + * fs/bfs/bfs.h + * Copyright (C) 1999 Tigran Aivazian + */ +#ifndef _FS_BFS_BFS_H +#define _FS_BFS_BFS_H + +#include + +/* + * BFS file system in-core superblock info + */ +struct bfs_sb_info { + unsigned long si_blocks; + unsigned long si_freeb; + unsigned long si_freei; + unsigned long si_lf_eblk; + unsigned long si_lasti; + unsigned long *si_imap; + struct mutex bfs_lock; +}; + +/* + * BFS file system in-core inode info + */ +struct bfs_inode_info { + unsigned long i_dsk_ino; /* inode number from the disk, can be 0 */ + unsigned long i_sblock; + unsigned long i_eblock; + struct inode vfs_inode; +}; + +static inline struct bfs_sb_info *BFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct bfs_inode_info *BFS_I(struct inode *inode) +{ + return container_of(inode, struct bfs_inode_info, vfs_inode); +} + + +#define printf(format, args...) \ + printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args) + +/* inode.c */ +extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); + +/* file.c */ +extern const struct inode_operations bfs_file_inops; +extern const struct file_operations bfs_file_operations; +extern const struct address_space_operations bfs_aops; + +/* dir.c */ +extern const struct inode_operations bfs_dir_inops; +extern const struct file_operations bfs_dir_operations; + +#endif /* _FS_BFS_BFS_H */ diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c new file mode 100644 index 00000000..b14cebfd --- /dev/null +++ b/fs/bfs/dir.c @@ -0,0 +1,374 @@ +/* + * fs/bfs/dir.c + * BFS directory operations. 
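+ * A directory is a flat array of fixed-size BFS_DIRENT_SIZE records, each
+ * holding a little-endian inode number and a NUL-padded name; a record with
+ * ino == 0 is a free slot.
+ *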
+ * Copyright (C) 1999,2000 Tigran Aivazian + * Made endianness-clean by Andrew Stribblehill 2005 + */ + +#include +#include +#include +#include +#include +#include "bfs.h" + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) +#endif + +static int bfs_add_entry(struct inode *dir, const unsigned char *name, + int namelen, int ino); +static struct buffer_head *bfs_find_entry(struct inode *dir, + const unsigned char *name, int namelen, + struct bfs_dirent **res_dir); + +static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) +{ + struct inode *dir = f->f_path.dentry->d_inode; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); + unsigned int offset; + int block; + + mutex_lock(&info->bfs_lock); + + if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { + printf("Bad f_pos=%08lx for %s:%08lx\n", + (unsigned long)f->f_pos, + dir->i_sb->s_id, dir->i_ino); + mutex_unlock(&info->bfs_lock); + return -EBADF; + } + + while (f->f_pos < dir->i_size) { + offset = f->f_pos & (BFS_BSIZE - 1); + block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS); + bh = sb_bread(dir->i_sb, block); + if (!bh) { + f->f_pos += BFS_BSIZE - offset; + continue; + } + do { + de = (struct bfs_dirent *)(bh->b_data + offset); + if (de->ino) { + int size = strnlen(de->name, BFS_NAMELEN); + if (filldir(dirent, de->name, size, f->f_pos, + le16_to_cpu(de->ino), + DT_UNKNOWN) < 0) { + brelse(bh); + mutex_unlock(&info->bfs_lock); + return 0; + } + } + offset += BFS_DIRENT_SIZE; + f->f_pos += BFS_DIRENT_SIZE; + } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size)); + brelse(bh); + } + + mutex_unlock(&info->bfs_lock); + return 0; +} + +const struct file_operations bfs_dir_operations = { + .read = generic_read_dir, + .readdir = bfs_readdir, + .fsync = generic_file_fsync, + .llseek = generic_file_llseek, +}; + +extern void dump_imap(const char *, struct super_block *); + +static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err; + struct inode *inode; + struct super_block *s = dir->i_sb; + struct bfs_sb_info *info = BFS_SB(s); + unsigned long ino; + + inode = new_inode(s); + if (!inode) + return -ENOSPC; + mutex_lock(&info->bfs_lock); + ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); + if (ino > info->si_lasti) { + mutex_unlock(&info->bfs_lock); + iput(inode); + return -ENOSPC; + } + set_bit(ino, info->si_imap); + info->si_freei--; + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + inode->i_op = &bfs_file_inops; + inode->i_fop = &bfs_file_operations; + inode->i_mapping->a_ops = &bfs_aops; + inode->i_ino = ino; + BFS_I(inode)->i_dsk_ino = ino; + BFS_I(inode)->i_sblock = 0; + BFS_I(inode)->i_eblock = 0; + insert_inode_hash(inode); + mark_inode_dirty(inode); + dump_imap("create", s); + + err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, + inode->i_ino); + if (err) { + inode_dec_link_count(inode); + mutex_unlock(&info->bfs_lock); + iput(inode); + return err; + } + mutex_unlock(&info->bfs_lock); + d_instantiate(dentry, inode); + return 0; +} + +static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); + + if (dentry->d_name.len > BFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + mutex_lock(&info->bfs_lock); + bh 
= bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + if (bh) { + unsigned long ino = (unsigned long)le16_to_cpu(de->ino); + brelse(bh); + inode = bfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + mutex_unlock(&info->bfs_lock); + return ERR_CAST(inode); + } + } + mutex_unlock(&info->bfs_lock); + d_add(dentry, inode); + return NULL; +} + +static int bfs_link(struct dentry *old, struct inode *dir, + struct dentry *new) +{ + struct inode *inode = old->d_inode; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + int err; + + mutex_lock(&info->bfs_lock); + err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, + inode->i_ino); + if (err) { + mutex_unlock(&info->bfs_lock); + return err; + } + inc_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + ihold(inode); + d_instantiate(new, inode); + mutex_unlock(&info->bfs_lock); + return 0; +} + +static int bfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error = -ENOENT; + struct inode *inode = dentry->d_inode; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + + mutex_lock(&info->bfs_lock); + bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) + goto out_brelse; + + if (!inode->i_nlink) { + printf("unlinking non-existent file %s:%lu (nlink=%d)\n", + inode->i_sb->s_id, inode->i_ino, + inode->i_nlink); + inode->i_nlink = 1; + } + de->ino = 0; + mark_buffer_dirty_inode(bh, dir); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + error = 0; + +out_brelse: + brelse(bh); + mutex_unlock(&info->bfs_lock); + return error; +} + +static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh; + struct bfs_dirent *old_de, *new_de; + struct bfs_sb_info *info; + int error = -ENOENT; + + old_bh = new_bh = NULL; + old_inode = old_dentry->d_inode; + if (S_ISDIR(old_inode->i_mode)) + return -EINVAL; + + info = BFS_SB(old_inode->i_sb); + + mutex_lock(&info->bfs_lock); + old_bh = bfs_find_entry(old_dir, + old_dentry->d_name.name, + old_dentry->d_name.len, &old_de); + + if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) + goto end_rename; + + error = -EPERM; + new_inode = new_dentry->d_inode; + new_bh = bfs_find_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len, &new_de); + + if (new_bh && !new_inode) { + brelse(new_bh); + new_bh = NULL; + } + if (!new_bh) { + error = bfs_add_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_inode->i_ino); + if (error) + goto end_rename; + } + old_de->ino = 0; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(old_dir); + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME_SEC; + inode_dec_link_count(new_inode); + } + mark_buffer_dirty_inode(old_bh, old_dir); + error = 0; + +end_rename: + mutex_unlock(&info->bfs_lock); + brelse(old_bh); + brelse(new_bh); + return error; +} + +const struct inode_operations bfs_dir_inops = { + .create = bfs_create, + .lookup = bfs_lookup, + .link = bfs_link, + .unlink = bfs_unlink, + .rename = bfs_rename, +}; + +static int bfs_add_entry(struct inode *dir, const unsigned char *name, + int namelen, int ino) +{ + struct buffer_head *bh; + struct bfs_dirent *de; + int block, sblock, eblock, off, pos; + int i; + + 
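+ /*
+  * Scan the directory blocks linearly for a free slot (de->ino == 0).
+  * If the free slot lies past the current i_size, the directory grows
+  * by one BFS_DIRENT_SIZE entry; the name is stored NUL-padded out to
+  * BFS_NAMELEN bytes.
+  */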
dprintf("name=%s, namelen=%d\n", name, namelen); + + if (!namelen) + return -ENOENT; + if (namelen > BFS_NAMELEN) + return -ENAMETOOLONG; + + sblock = BFS_I(dir)->i_sblock; + eblock = BFS_I(dir)->i_eblock; + for (block = sblock; block <= eblock; block++) { + bh = sb_bread(dir->i_sb, block); + if (!bh) + return -ENOSPC; + for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { + de = (struct bfs_dirent *)(bh->b_data + off); + if (!de->ino) { + pos = (block - sblock) * BFS_BSIZE + off; + if (pos >= dir->i_size) { + dir->i_size += BFS_DIRENT_SIZE; + dir->i_ctime = CURRENT_TIME_SEC; + } + dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + de->ino = cpu_to_le16((u16)ino); + for (i = 0; i < BFS_NAMELEN; i++) + de->name[i] = + (i < namelen) ? name[i] : 0; + mark_buffer_dirty_inode(bh, dir); + brelse(bh); + return 0; + } + } + brelse(bh); + } + return -ENOSPC; +} + +static inline int bfs_namecmp(int len, const unsigned char *name, + const char *buffer) +{ + if ((len < BFS_NAMELEN) && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +static struct buffer_head *bfs_find_entry(struct inode *dir, + const unsigned char *name, int namelen, + struct bfs_dirent **res_dir) +{ + unsigned long block = 0, offset = 0; + struct buffer_head *bh = NULL; + struct bfs_dirent *de; + + *res_dir = NULL; + if (namelen > BFS_NAMELEN) + return NULL; + + while (block * BFS_BSIZE + offset < dir->i_size) { + if (!bh) { + bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block); + if (!bh) { + block++; + continue; + } + } + de = (struct bfs_dirent *)(bh->b_data + offset); + offset += BFS_DIRENT_SIZE; + if (le16_to_cpu(de->ino) && + bfs_namecmp(namelen, name, de->name)) { + *res_dir = de; + return bh; + } + if (offset < bh->b_size) + continue; + brelse(bh); + bh = NULL; + offset = 0; + block++; + } + brelse(bh); + return NULL; +} diff --git a/fs/bfs/file.c b/fs/bfs/file.c new file mode 100644 index 00000000..f20e8a71 --- /dev/null +++ b/fs/bfs/file.c @@ -0,0 +1,194 @@ +/* + * fs/bfs/file.c + * BFS file operations. + * Copyright (C) 1999,2000 Tigran Aivazian + * + * Make the file block allocation algorithm understand the size + * of the underlying block device. + * Copyright (C) 2007 Dmitri Vorobiev + * + */ + +#include +#include +#include "bfs.h" + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) 
+#endif + +const struct file_operations bfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, +}; + +static int bfs_move_block(unsigned long from, unsigned long to, + struct super_block *sb) +{ + struct buffer_head *bh, *new; + + bh = sb_bread(sb, from); + if (!bh) + return -EIO; + new = sb_getblk(sb, to); + memcpy(new->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(new); + bforget(bh); + brelse(new); + return 0; +} + +static int bfs_move_blocks(struct super_block *sb, unsigned long start, + unsigned long end, unsigned long where) +{ + unsigned long i; + + dprintf("%08lx-%08lx->%08lx\n", start, end, where); + for (i = start; i <= end; i++) + if(bfs_move_block(i, where + i, sb)) { + dprintf("failed to move block %08lx -> %08lx\n", i, + where + i); + return -EIO; + } + return 0; +} + +static int bfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + unsigned long phys; + int err; + struct super_block *sb = inode->i_sb; + struct bfs_sb_info *info = BFS_SB(sb); + struct bfs_inode_info *bi = BFS_I(inode); + + phys = bi->i_sblock + block; + if (!create) { + if (phys <= bi->i_eblock) { + dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + } + return 0; + } + + /* + * If the file is not empty and the requested block is within the + * range of blocks allocated for this file, we can grant it. + */ + if (bi->i_sblock && (phys <= bi->i_eblock)) { + dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + return 0; + } + + /* The file will be extended, so let's see if there is enough space. */ + if (phys >= info->si_blocks) + return -ENOSPC; + + /* The rest has to be protected against itself. */ + mutex_lock(&info->bfs_lock); + + /* + * If the last data block for this file is the last allocated + * block, we can extend the file trivially, without moving it + * anywhere. + */ + if (bi->i_eblock == info->si_lf_eblk) { + dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + info->si_freeb -= phys - bi->i_eblock; + info->si_lf_eblk = bi->i_eblock = phys; + mark_inode_dirty(inode); + err = 0; + goto out; + } + + /* Ok, we have to move this entire file to the next free block. */ + phys = info->si_lf_eblk + 1; + if (phys + block >= info->si_blocks) { + err = -ENOSPC; + goto out; + } + + if (bi->i_sblock) { + err = bfs_move_blocks(inode->i_sb, bi->i_sblock, + bi->i_eblock, phys); + if (err) { + dprintf("failed to move ino=%08lx -> fs corruption\n", + inode->i_ino); + goto out; + } + } else + err = 0; + + dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", + create, (unsigned long)block, phys); + bi->i_sblock = phys; + phys += block; + info->si_lf_eblk = bi->i_eblock = phys; + + /* + * This assumes nothing can write the inode back while we are here + * and thus update inode->i_blocks! 
(XXX) + */ + info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; + mark_inode_dirty(inode); + map_bh(bh_result, sb, phys); +out: + mutex_unlock(&info->bfs_lock); + return err; +} + +static int bfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, bfs_get_block, wbc); +} + +static int bfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, bfs_get_block); +} + +static int bfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + bfs_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t bfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, bfs_get_block); +} + +const struct address_space_operations bfs_aops = { + .readpage = bfs_readpage, + .writepage = bfs_writepage, + .write_begin = bfs_write_begin, + .write_end = generic_write_end, + .bmap = bfs_bmap, +}; + +const struct inode_operations bfs_file_inops; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c new file mode 100644 index 00000000..a8e37f81 --- /dev/null +++ b/fs/bfs/inode.c @@ -0,0 +1,496 @@ +/* + * fs/bfs/inode.c + * BFS superblock and inode operations. + * Copyright (C) 1999-2006 Tigran Aivazian + * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. + * + * Made endianness-clean by Andrew Stribblehill , 2005. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bfs.h" + +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); +MODULE_LICENSE("GPL"); + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) 
+#endif + +void dump_imap(const char *prefix, struct super_block *s); + +struct inode *bfs_iget(struct super_block *sb, unsigned long ino) +{ + struct bfs_inode *di; + struct inode *inode; + struct buffer_head *bh; + int block, off; + + inode = iget_locked(sb, ino); + if (IS_ERR(inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { + printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino); + goto error; + } + + block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; + bh = sb_bread(inode->i_sb, block); + if (!bh) { + printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, + ino); + goto error; + } + + off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; + di = (struct bfs_inode *)bh->b_data + off; + + inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { + inode->i_mode |= S_IFDIR; + inode->i_op = &bfs_dir_inops; + inode->i_fop = &bfs_dir_operations; + } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) { + inode->i_mode |= S_IFREG; + inode->i_op = &bfs_file_inops; + inode->i_fop = &bfs_file_operations; + inode->i_mapping->a_ops = &bfs_aops; + } + + BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); + BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); + BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); + inode->i_uid = le32_to_cpu(di->i_uid); + inode->i_gid = le32_to_cpu(di->i_gid); + inode->i_nlink = le32_to_cpu(di->i_nlink); + inode->i_size = BFS_FILESIZE(di); + inode->i_blocks = BFS_FILEBLOCKS(di); + inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); + inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); + inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + + brelse(bh); + unlock_new_inode(inode); + return inode; + +error: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p) +{ + if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) { + printf("Bad inode number %s:%08x\n", sb->s_id, ino); + return ERR_PTR(-EIO); + } + + ino -= BFS_ROOT_INO; + + *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK); + if (!*p) { + printf("Unable to read inode %s:%08x\n", sb->s_id, ino); + return ERR_PTR(-EIO); + } + + return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK; +} + +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + unsigned int ino = (u16)inode->i_ino; + unsigned long i_sblock; + struct bfs_inode *di; + struct buffer_head *bh; + int err = 0; + + dprintf("ino=%08x\n", ino); + + di = find_inode(inode->i_sb, ino, &bh); + if (IS_ERR(di)) + return PTR_ERR(di); + + mutex_lock(&info->bfs_lock); + + if (ino == BFS_ROOT_INO) + di->i_vtype = cpu_to_le32(BFS_VDIR); + else + di->i_vtype = cpu_to_le32(BFS_VREG); + + di->i_ino = cpu_to_le16(ino); + di->i_mode = cpu_to_le32(inode->i_mode); + di->i_uid = cpu_to_le32(inode->i_uid); + di->i_gid = cpu_to_le32(inode->i_gid); + di->i_nlink = cpu_to_le32(inode->i_nlink); + di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + i_sblock = BFS_I(inode)->i_sblock; + di->i_sblock = cpu_to_le32(i_sblock); + di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); + di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); + + 
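+ /*
+  * Push the updated on-disk inode out through the buffer cache; for
+  * WB_SYNC_ALL writeback the buffer is flushed synchronously and any
+  * I/O error is reported back to the caller.
+  */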
mark_buffer_dirty(bh); + if (wbc->sync_mode == WB_SYNC_ALL) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + brelse(bh); + mutex_unlock(&info->bfs_lock); + return err; +} + +static void bfs_evict_inode(struct inode *inode) +{ + unsigned long ino = inode->i_ino; + struct bfs_inode *di; + struct buffer_head *bh; + struct super_block *s = inode->i_sb; + struct bfs_sb_info *info = BFS_SB(s); + struct bfs_inode_info *bi = BFS_I(inode); + + dprintf("ino=%08lx\n", ino); + + truncate_inode_pages(&inode->i_data, 0); + invalidate_inode_buffers(inode); + end_writeback(inode); + + if (inode->i_nlink) + return; + + di = find_inode(s, inode->i_ino, &bh); + if (IS_ERR(di)) + return; + + mutex_lock(&info->bfs_lock); + /* clear on-disk inode */ + memset(di, 0, sizeof(struct bfs_inode)); + mark_buffer_dirty(bh); + brelse(bh); + + if (bi->i_dsk_ino) { + if (bi->i_sblock) + info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; + info->si_freei++; + clear_bit(ino, info->si_imap); + dump_imap("delete_inode", s); + } + + /* + * If this was the last file, make the previous block + * "last block of the last file" even if there is no + * real file there, saves us 1 gap. + */ + if (info->si_lf_eblk == bi->i_eblock) + info->si_lf_eblk = bi->i_sblock - 1; + mutex_unlock(&info->bfs_lock); +} + +static void bfs_put_super(struct super_block *s) +{ + struct bfs_sb_info *info = BFS_SB(s); + + if (!info) + return; + + mutex_destroy(&info->bfs_lock); + kfree(info->si_imap); + kfree(info); + s->s_fs_info = NULL; +} + +static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct bfs_sb_info *info = BFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + buf->f_type = BFS_MAGIC; + buf->f_bsize = s->s_blocksize; + buf->f_blocks = info->si_blocks; + buf->f_bfree = buf->f_bavail = info->si_freeb; + buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; + buf->f_ffree = info->si_freei; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = BFS_NAMELEN; + return 0; +} + +static struct kmem_cache *bfs_inode_cachep; + +static struct inode *bfs_alloc_inode(struct super_block *sb) +{ + struct bfs_inode_info *bi; + bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + return &bi->vfs_inode; +} + +static void bfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); +} + +static void bfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bfs_i_callback); +} + +static void init_once(void *foo) +{ + struct bfs_inode_info *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + +static int init_inodecache(void) +{ + bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", + sizeof(struct bfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (bfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(bfs_inode_cachep); +} + +static const struct super_operations bfs_sops = { + .alloc_inode = bfs_alloc_inode, + .destroy_inode = bfs_destroy_inode, + .write_inode = bfs_write_inode, + .evict_inode = bfs_evict_inode, + .put_super = bfs_put_super, + .statfs = bfs_statfs, +}; + +void dump_imap(const char *prefix, struct super_block *s) +{ +#ifdef DEBUG + int i; + char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!tmpbuf) + return; + for (i = 
BFS_SB(s)->si_lasti; i >= 0; i--) { + if (i > PAGE_SIZE - 100) break; + if (test_bit(i, BFS_SB(s)->si_imap)) + strcat(tmpbuf, "1"); + else + strcat(tmpbuf, "0"); + } + printf("BFS-fs: %s: lasti=%08lx <%s>\n", + prefix, BFS_SB(s)->si_lasti, tmpbuf); + free_page((unsigned long)tmpbuf); +#endif +} + +static int bfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh, *sbh; + struct bfs_super_block *bfs_sb; + struct inode *inode; + unsigned i, imap_len; + struct bfs_sb_info *info; + int ret = -EINVAL; + unsigned long i_sblock, i_eblock, i_eoff, s_size; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + mutex_init(&info->bfs_lock); + s->s_fs_info = info; + + sb_set_blocksize(s, BFS_BSIZE); + + sbh = sb_bread(s, 0); + if (!sbh) + goto out; + bfs_sb = (struct bfs_super_block *)sbh->b_data; + if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { + if (!silent) + printf("No BFS filesystem on %s (magic=%08x)\n", + s->s_id, le32_to_cpu(bfs_sb->s_magic)); + goto out1; + } + if (BFS_UNCLEAN(bfs_sb, s) && !silent) + printf("%s is unclean, continuing\n", s->s_id); + + s->s_magic = BFS_MAGIC; + + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + printf("Superblock is corrupted\n"); + goto out1; + } + + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / + sizeof(struct bfs_inode) + + BFS_ROOT_INO - 1; + imap_len = (info->si_lasti / 8) + 1; + info->si_imap = kzalloc(imap_len, GFP_KERNEL); + if (!info->si_imap) + goto out1; + for (i = 0; i < BFS_ROOT_INO; i++) + set_bit(i, info->si_imap); + + s->s_op = &bfs_sops; + inode = bfs_iget(s, BFS_ROOT_INO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out2; + } + s->s_root = d_alloc_root(inode); + if (!s->s_root) { + iput(inode); + ret = -ENOMEM; + goto out2; + } + + info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; + info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 + - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; + info->si_freei = 0; + info->si_lf_eblk = 0; + + /* can we read the last block? 
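+ (if not, the device is smaller than the superblock claims and the
+ mount is refused with -EIO)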
*/ + bh = sb_bread(s, info->si_blocks - 1); + if (!bh) { + printf("Last block not available: %lu\n", info->si_blocks - 1); + ret = -EIO; + goto out3; + } + brelse(bh); + + bh = NULL; + for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { + struct bfs_inode *di; + int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; + int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; + unsigned long eblock; + + if (!off) { + brelse(bh); + bh = sb_bread(s, block); + } + + if (!bh) + continue; + + di = (struct bfs_inode *)bh->b_data + off; + + /* test if filesystem is not corrupted */ + + i_eoff = le32_to_cpu(di->i_eoffset); + i_sblock = le32_to_cpu(di->i_sblock); + i_eblock = le32_to_cpu(di->i_eblock); + s_size = le32_to_cpu(bfs_sb->s_end); + + if (i_sblock > info->si_blocks || + i_eblock > info->si_blocks || + i_sblock > i_eblock || + i_eoff > s_size || + i_sblock * BFS_BSIZE > i_eoff) { + + printf("Inode 0x%08x corrupted\n", i); + + brelse(bh); + ret = -EIO; + goto out3; + } + + if (!di->i_ino) { + info->si_freei++; + continue; + } + set_bit(i, info->si_imap); + info->si_freeb -= BFS_FILEBLOCKS(di); + + eblock = le32_to_cpu(di->i_eblock); + if (eblock > info->si_lf_eblk) + info->si_lf_eblk = eblock; + } + brelse(bh); + brelse(sbh); + dump_imap("read_super", s); + return 0; + +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(sbh); +out: + mutex_destroy(&info->bfs_lock); + kfree(info); + s->s_fs_info = NULL; + return ret; +} + +static struct dentry *bfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); +} + +static struct file_system_type bfs_fs_type = { + .owner = THIS_MODULE, + .name = "bfs", + .mount = bfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_bfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&bfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_bfs_fs(void) +{ + unregister_filesystem(&bfs_fs_type); + destroy_inodecache(); +} + +module_init(init_bfs_fs) +module_exit(exit_bfs_fs) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c new file mode 100644 index 00000000..a6395bdb --- /dev/null +++ b/fs/binfmt_aout.c @@ -0,0 +1,467 @@ +/* + * linux/fs/binfmt_aout.c + * + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_aout_library(struct file*); +static int aout_core_dump(struct coredump_params *cprm); + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, + .core_dump = aout_core_dump, + .min_coredump = PAGE_SIZE +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end > start) { + unsigned long addr; + down_write(¤t->mm->mmap_sem); + addr = do_brk(start, end - start); + up_write(¤t->mm->mmap_sem); + if (BAD_ADDR(addr)) + return addr; + } + return 0; +} + +/* + * Routine writes a core dump image in the current 
directory. + * Currently only a stub-function. + * + * Note that setuid/setgid files won't make a core-dump if the uid/gid + * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" + * field, which also makes sure the core-dumps won't be recursive if the + * dumping of the process results in another error.. + */ + +static int aout_core_dump(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + mm_segment_t fs; + int has_dumped = 0; + void __user *dump_start; + int dump_size; + struct user dump; +#ifdef __alpha__ +# define START_DATA(u) ((void __user *)u.start_data) +#else +# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \ + u.start_code)) +#endif +# define START_STACK(u) ((void __user *)u.start_stack) + + fs = get_fs(); + set_fs(KERNEL_DS); + has_dumped = 1; + current->flags |= PF_DUMPCORE; + strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); + dump.u_ar0 = offsetof(struct user, regs); + dump.signal = cprm->signr; + aout_dump_thread(cprm->regs, &dump); + +/* If the size of the dump file exceeds the rlimit, then see what would happen + if we wrote the stack, but not the data area. */ + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) + dump.u_dsize = 0; + +/* Make sure we have enough room to write the stack and data areas. */ + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) + dump.u_ssize = 0; + +/* make sure we actually have a data and stack area to dump */ + set_fs(USER_DS); + if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + dump.u_dsize = 0; + if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + dump.u_ssize = 0; + + set_fs(KERNEL_DS); +/* struct user */ + if (!dump_write(file, &dump, sizeof(dump))) + goto end_coredump; +/* Now dump all of the user data. Include malloced stuff as well */ + if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump))) + goto end_coredump; +/* now we start writing out the user space info */ + set_fs(USER_DS); +/* Dump the data area */ + if (dump.u_dsize != 0) { + dump_start = START_DATA(dump); + dump_size = dump.u_dsize << PAGE_SHIFT; + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; + } +/* Now prepare to dump the stack area */ + if (dump.u_ssize != 0) { + dump_start = START_STACK(dump); + dump_size = dump.u_ssize << PAGE_SHIFT; + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; + } +end_coredump: + set_fs(fs); + return has_dumped; +} + +/* + * create_aout_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm) +{ + char __user * __user *argv; + char __user * __user *envp; + unsigned long __user *sp; + int argc = bprm->argc; + int envc = bprm->envc; + + sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); +#ifdef __alpha__ +/* whee.. test-programs are so much fun. 
*/ + put_user(0, --sp); + put_user(0, --sp); + if (bprm->loader) { + put_user(0, --sp); + put_user(1003, --sp); + put_user(bprm->loader, --sp); + put_user(1002, --sp); + } + put_user(bprm->exec, --sp); + put_user(1001, --sp); +#endif + sp -= envc+1; + envp = (char __user * __user *) sp; + sp -= argc+1; + argv = (char __user * __user *) sp; +#ifndef __alpha__ + put_user((unsigned long) envp,--sp); + put_user((unsigned long) argv,--sp); +#endif + put_user(argc,--sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + char c; + put_user(p,argv++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + char c; + put_user(p,envp++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,envp); + current->mm->env_end = (unsigned long) p; + return sp; +} + +/* + * These are the functions used to load a.out style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + struct exec ex; + unsigned long error; + unsigned long fd_offset; + unsigned long rlim; + int retval; + + ex = *((struct exec *) bprm->buf); /* exec-header */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && + N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || + N_TRSIZE(ex) || N_DRSIZE(ex) || + i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + return -ENOEXEC; + } + + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!bprm->file->f_op || !bprm->file->f_op->mmap) + return -ENOEXEC; + + fd_offset = N_TXTOFF(ex); + + /* Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. 
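+ * In practice this means a_data + a_bss together must fit inside
+ * RLIMIT_DATA (treated as unlimited when set to RLIM_INFINITY).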
+ */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (ex.a_data + ex.a_bss > rlim) + return -ENOMEM; + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + return retval; + + /* OK, This is the point of no return */ +#ifdef __alpha__ + SET_AOUT_PERSONALITY(bprm, ex); +#else + set_personality(PER_LINUX); +#endif + setup_new_exec(bprm); + + current->mm->end_code = ex.a_text + + (current->mm->start_code = N_TXTADDR(ex)); + current->mm->end_data = ex.a_data + + (current->mm->start_data = N_DATADDR(ex)); + current->mm->brk = ex.a_bss + + (current->mm->start_brk = N_BSSADDR(ex)); + current->mm->free_area_cache = current->mm->mmap_base; + current->mm->cached_hole_size = 0; + + install_exec_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + + if (N_MAGIC(ex) == OMAGIC) { + unsigned long text_addr, map_size; + loff_t pos; + + text_addr = N_TXTADDR(ex); + +#ifdef __alpha__ + pos = fd_offset; + map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; +#else + pos = 32; + map_size = ex.a_text+ex.a_data; +#endif + down_write(¤t->mm->mmap_sem); + error = do_brk(text_addr & PAGE_MASK, map_size); + up_write(¤t->mm->mmap_sem); + if (error != (text_addr & PAGE_MASK)) { + send_sig(SIGKILL, current, 0); + return error; + } + + error = bprm->file->f_op->read(bprm->file, + (char __user *)text_addr, + ex.a_text+ex.a_data, &pos); + if ((signed long)error < 0) { + send_sig(SIGKILL, current, 0); + return error; + } + + flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); + } else { + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) + { + printk(KERN_NOTICE "executable not page aligned\n"); + } + + if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) + { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert program: %s\n", + bprm->file->f_path.dentry->d_name.name); + } + + if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { + loff_t pos = fd_offset; + down_write(¤t->mm->mmap_sem); + do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + up_write(¤t->mm->mmap_sem); + bprm->file->f_op->read(bprm->file, + (char __user *)N_TXTADDR(ex), + ex.a_text+ex.a_data, &pos); + flush_icache_range((unsigned long) N_TXTADDR(ex), + (unsigned long) N_TXTADDR(ex) + + ex.a_text+ex.a_data); + goto beyond_if; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset); + up_write(¤t->mm->mmap_sem); + + if (error != N_TXTADDR(ex)) { + send_sig(SIGKILL, current, 0); + return error; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset + ex.a_text); + up_write(¤t->mm->mmap_sem); + if (error != N_DATADDR(ex)) { + send_sig(SIGKILL, current, 0); + return error; + } + } +beyond_if: + set_binfmt(&aout_format); + + retval = set_brk(current->mm->start_brk, current->mm->brk); + if (retval < 0) { + send_sig(SIGKILL, current, 0); + return retval; + } + + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? 
*/ + send_sig(SIGKILL, current, 0); + return retval; + } + + current->mm->start_stack = + (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); +#ifdef __alpha__ + regs->gp = ex.a_gpvalue; +#endif + start_thread(regs, ex.a_entry, current->mm->start_stack); + return 0; +} + +static int load_aout_library(struct file *file) +{ + struct inode * inode; + unsigned long bss, start_addr, len; + unsigned long error; + int retval; + struct exec ex; + + inode = file->f_path.dentry->d_inode; + + retval = -ENOEXEC; + error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + if (error != sizeof(ex)) + goto out; + + /* We come in here for the regular a.out style of shared libraries */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || + N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || + i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + goto out; + } + + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!file->f_op || !file->f_op->mmap) + goto out; + + if (N_FLAGS(ex)) + goto out; + + /* For QMAGIC, the starting address is 0x20 into the page. We mask + this off to get the starting address for the page */ + + start_addr = ex.a_entry & 0xfffff000; + + if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { + loff_t pos = N_TXTOFF(ex); + + if (printk_ratelimit()) + { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert library: %s\n", + file->f_path.dentry->d_name.name); + } + down_write(¤t->mm->mmap_sem); + do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + up_write(¤t->mm->mmap_sem); + + file->f_op->read(file, (char __user *)start_addr, + ex.a_text + ex.a_data, &pos); + flush_icache_range((unsigned long) start_addr, + (unsigned long) start_addr + ex.a_text + ex.a_data); + + retval = 0; + goto out; + } + /* Now use mmap to map the library into memory. */ + down_write(¤t->mm->mmap_sem); + error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + N_TXTOFF(ex)); + up_write(¤t->mm->mmap_sem); + retval = error; + if (error != start_addr) + goto out; + + len = PAGE_ALIGN(ex.a_text + ex.a_data); + bss = ex.a_text + ex.a_data + ex.a_bss; + if (bss > len) { + down_write(¤t->mm->mmap_sem); + error = do_brk(start_addr + len, bss - len); + up_write(¤t->mm->mmap_sem); + retval = error; + if (error != start_addr + len) + goto out; + } + retval = 0; +out: + return retval; +} + +static int __init init_aout_binfmt(void) +{ + return register_binfmt(&aout_format); +} + +static void __exit exit_aout_binfmt(void) +{ + unregister_binfmt(&aout_format); +} + +core_initcall(init_aout_binfmt); +module_exit(exit_aout_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c new file mode 100644 index 00000000..618493e4 --- /dev/null +++ b/fs/binfmt_elf.c @@ -0,0 +1,2092 @@ +/* + * linux/fs/binfmt_elf.c + * + * These are the functions used to load ELF format executables as used + * on SVr4 machines. Information on the format may be found in the book + * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support + * Tools". + * + * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); +static int load_elf_library(struct file *); +static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, + int, int, unsigned long); + +/* + * If we don't support core dumping, then supply a NULL so we + * don't even try. + */ +#ifdef CONFIG_ELF_CORE +static int elf_core_dump(struct coredump_params *cprm); +#else +#define elf_core_dump NULL +#endif + +#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE +#else +#define ELF_MIN_ALIGN PAGE_SIZE +#endif + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) +#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) + +static struct linux_binfmt elf_format = { + .module = THIS_MODULE, + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, + .min_coredump = ELF_EXEC_PAGESIZE, +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = ELF_PAGEALIGN(start); + end = ELF_PAGEALIGN(end); + if (end > start) { + unsigned long addr; + down_write(¤t->mm->mmap_sem); + addr = do_brk(start, end - start); + up_write(¤t->mm->mmap_sem); + if (BAD_ADDR(addr)) + return addr; + } + current->mm->start_brk = current->mm->brk = end; + return 0; +} + +/* We need to explicitly zero any fractional pages + after the data section (i.e. bss). This would + contain the junk from the file that should not + be in memory + */ +static int padzero(unsigned long elf_bss) +{ + unsigned long nbyte; + + nbyte = ELF_PAGEOFFSET(elf_bss); + if (nbyte) { + nbyte = ELF_MIN_ALIGN - nbyte; + if (clear_user((void __user *) elf_bss, nbyte)) + return -EFAULT; + } + return 0; +} + +/* Let's use some macros to make this stack manipulation a little clearer */ +#ifdef CONFIG_STACK_GROWSUP +#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items)) +#define STACK_ROUND(sp, items) \ + ((15 + (unsigned long) ((sp) + (items))) &~ 15UL) +#define STACK_ALLOC(sp, len) ({ \ + elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \ + old_sp; }) +#else +#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items)) +#define STACK_ROUND(sp, items) \ + (((unsigned long) (sp - items)) &~ 15UL) +#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) +#endif + +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. 
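+ * Architectures that do not define it get a NULL ELF_BASE_PLATFORM here,
+ * and create_elf_tables() simply omits the AT_BASE_PLATFORM entry.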
+ */ +#define ELF_BASE_PLATFORM NULL +#endif + +static int +create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr) +{ + unsigned long p = bprm->p; + int argc = bprm->argc; + int envc = bprm->envc; + elf_addr_t __user *argv; + elf_addr_t __user *envp; + elf_addr_t __user *sp; + elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; + elf_addr_t __user *u_rand_bytes; + const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; + unsigned char k_rand_bytes[16]; + int items; + elf_addr_t *elf_info; + int ei_index = 0; + const struct cred *cred = current_cred(); + struct vm_area_struct *vma; + + /* + * In some cases (e.g. Hyper-Threading), we want to avoid L1 + * evictions by the processes running on the same package. One + * thing we can do is to shuffle the initial stack for them. + */ + + p = arch_align_stack(p); + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ + u_platform = NULL; + if (k_platform) { + size_t len = strlen(k_platform) + 1; + + u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_platform, k_platform, len)) + return -EFAULT; + } + + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + + /* + * Generate 16 random bytes for userspace PRNG seeding. + */ + get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + u_rand_bytes = (elf_addr_t __user *) + STACK_ALLOC(p, sizeof(k_rand_bytes)); + if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) + return -EFAULT; + + /* Create the ELF interpreter info */ + elf_info = (elf_addr_t *)current->mm->saved_auxv; + /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ +#define NEW_AUX_ENT(id, val) \ + do { \ + elf_info[ei_index++] = id; \ + elf_info[ei_index++] = val; \ + } while (0) + +#ifdef ARCH_DLINFO + /* + * ARCH_DLINFO must come first so PPC can do its special alignment of + * AUXV. 
+ * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in + * ARCH_DLINFO changes + */ + ARCH_DLINFO; +#endif + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); + NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); + NEW_AUX_ENT(AT_BASE, interp_load_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_UID, cred->uid); + NEW_AUX_ENT(AT_EUID, cred->euid); + NEW_AUX_ENT(AT_GID, cred->gid); + NEW_AUX_ENT(AT_EGID, cred->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); + if (k_platform) { + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t)(unsigned long)u_platform); + } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } +#undef NEW_AUX_ENT + /* AT_NULL is zero; clear the rest too */ + memset(&elf_info[ei_index], 0, + sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + + /* And advance past the AT_NULL entry. */ + ei_index += 2; + + sp = STACK_ADD(p, ei_index); + + items = (argc + 1) + (envc + 1) + 1; + bprm->p = STACK_ROUND(sp, items); + + /* Point sp at the lowest address on the stack */ +#ifdef CONFIG_STACK_GROWSUP + sp = (elf_addr_t __user *)bprm->p - items - ei_index; + bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */ +#else + sp = (elf_addr_t __user *)bprm->p; +#endif + + + /* + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. + */ + vma = find_extend_vma(current->mm, bprm->p); + if (!vma) + return -EFAULT; + + /* Now, let's put argc (and argv, envp if appropriate) on the stack */ + if (__put_user(argc, sp++)) + return -EFAULT; + argv = sp; + envp = argv + argc + 1; + + /* Populate argv and envp */ + p = current->mm->arg_end = current->mm->arg_start; + while (argc-- > 0) { + size_t len; + if (__put_user((elf_addr_t)p, argv++)) + return -EFAULT; + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + if (__put_user(0, argv)) + return -EFAULT; + current->mm->arg_end = current->mm->env_start = p; + while (envc-- > 0) { + size_t len; + if (__put_user((elf_addr_t)p, envp++)) + return -EFAULT; + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + if (__put_user(0, envp)) + return -EFAULT; + current->mm->env_end = p; + + /* Put the elf_info on the stack in the right place. 
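+ The auxiliary vector lands just above envp's terminating NULL and
+ holds the entries assembled in current->mm->saved_auxv above.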
+	 */
+	sp = (elf_addr_t __user *)envp + 1;
+	if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
+		return -EFAULT;
+	return 0;
+}
+
+static unsigned long elf_map(struct file *filep, unsigned long addr,
+		struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
+{
+	unsigned long map_addr;
+	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(size);
+
+	/* mmap() will return -EINVAL if given a zero size, but a
+	 * segment with zero filesize is perfectly valid */
+	if (!size)
+		return addr;
+
+	down_write(&current->mm->mmap_sem);
+	/*
+	 * total_size is the size of the ELF (interpreter) image.
+	 * The _first_ mmap needs to know the full size, otherwise
+	 * randomization might put this image into an overlapping
+	 * position with the ELF binary image. (since size < total_size)
+	 * So we first map the 'big' image - and unmap the remainder at
+	 * the end. (which unmap is needed for ELF images with holes.)
+	 */
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			do_munmap(current->mm, map_addr+size, total_size-size);
+	} else
+		map_addr = do_mmap(filep, addr, size, prot, type, off);
+
+	up_write(&current->mm->mmap_sem);
+	return(map_addr);
+}
+
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+	int i, first_idx = -1, last_idx = -1;
+
+	for (i = 0; i < nr; i++) {
+		if (cmds[i].p_type == PT_LOAD) {
+			last_idx = i;
+			if (first_idx == -1)
+				first_idx = i;
+		}
+	}
+	if (first_idx == -1)
+		return 0;
+
+	return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+				ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+
+/* This is much more generalized than the library routine read function,
+   so we keep this separate.  Technically the library read function
+   is only provided so that we can read a.out libraries that have
+   an ELF header */
+
+static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+		struct file *interpreter, unsigned long *interp_map_addr,
+		unsigned long no_base)
+{
+	struct elf_phdr *elf_phdata;
+	struct elf_phdr *eppnt;
+	unsigned long load_addr = 0;
+	int load_addr_set = 0;
+	unsigned long last_bss = 0, elf_bss = 0;
+	unsigned long error = ~0UL;
+	unsigned long total_size;
+	int retval, i, size;
+
+	/* First of all, some simple consistency checks */
+	if (interp_elf_ex->e_type != ET_EXEC &&
+	    interp_elf_ex->e_type != ET_DYN)
+		goto out;
+	if (!elf_check_arch(interp_elf_ex))
+		goto out;
+	if (!interpreter->f_op || !interpreter->f_op->mmap)
+		goto out;
+
+	/*
+	 * If the size of this structure has changed, then punt, since
+	 * we will be doing the wrong thing.
+	 */
+	if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr))
+		goto out;
+	if (interp_elf_ex->e_phnum < 1 ||
+		interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
+		goto out;
+
+	/* Now read in all of the header information */
+	size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
+	if (size > ELF_MIN_ALIGN)
+		goto out;
+	elf_phdata = kmalloc(size, GFP_KERNEL);
+	if (!elf_phdata)
+		goto out;
+
+	retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
+			     (char *)elf_phdata, size);
+	error = -EIO;
+	if (retval != size) {
+		if (retval < 0)
+			error = retval;
+		goto out_close;
+	}
+
+	total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+	if (!total_size) {
+		error = -EINVAL;
+		goto out_close;
+	}
+
+	eppnt = elf_phdata;
+	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+		if (eppnt->p_type == PT_LOAD) {
+			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+			int elf_prot = 0;
+			unsigned long vaddr = 0;
+			unsigned long k, map_addr;
+
+			if (eppnt->p_flags & PF_R)
+				elf_prot = PROT_READ;
+			if (eppnt->p_flags & PF_W)
+				elf_prot |= PROT_WRITE;
+			if (eppnt->p_flags & PF_X)
+				elf_prot |= PROT_EXEC;
+			vaddr = eppnt->p_vaddr;
+			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+				elf_type |= MAP_FIXED;
+			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+				load_addr = -vaddr;
+
+			map_addr = elf_map(interpreter, load_addr + vaddr,
+					eppnt, elf_prot, elf_type, total_size);
+			total_size = 0;
+			if (!*interp_map_addr)
+				*interp_map_addr = map_addr;
+			error = map_addr;
+			if (BAD_ADDR(map_addr))
+				goto out_close;
+
+			if (!load_addr_set &&
+			    interp_elf_ex->e_type == ET_DYN) {
+				load_addr = map_addr - ELF_PAGESTART(vaddr);
+				load_addr_set = 1;
+			}
+
+			/*
+			 * Check to see if the section's size will overflow the
+			 * allowed task size. Note that p_filesz must always be
+			 * <= p_memsize so it's only necessary to check p_memsz.
+			 */
+			k = load_addr + eppnt->p_vaddr;
+			if (BAD_ADDR(k) ||
+			    eppnt->p_filesz > eppnt->p_memsz ||
+			    eppnt->p_memsz > TASK_SIZE ||
+			    TASK_SIZE - eppnt->p_memsz < k) {
+				error = -ENOMEM;
+				goto out_close;
+			}
+
+			/*
+			 * Find the end of the file mapping for this phdr, and
+			 * keep track of the largest address we see for this.
+			 */
+			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+			if (k > elf_bss)
+				elf_bss = k;
+
+			/*
+			 * Do the same thing for the memory mapping - between
+			 * elf_bss and last_bss is the bss section.
+			 */
+			k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+			if (k > last_bss)
+				last_bss = k;
+		}
+	}
+
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
+
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+
+		/* Map the last of the bss segment */
+		down_write(&current->mm->mmap_sem);
+		error = do_brk(elf_bss, last_bss - elf_bss);
+		up_write(&current->mm->mmap_sem);
+		if (BAD_ADDR(error))
+			goto out_close;
+	}
+
+	error = load_addr;
+
+out_close:
+	kfree(elf_phdata);
+out:
+	return error;
+}
+
+/*
+ * These are the functions used to load ELF style executables and shared
+ * libraries.  There is no binary dependent code anywhere else.
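+ *
+ * A worked example of the elf_map() arithmetic above, assuming 4 KiB
+ * pages and a hypothetical PT_LOAD segment with p_vaddr 0x08049f40,
+ * p_offset 0xf40 and p_filesz 0x100: ELF_PAGEOFFSET(p_vaddr) is 0xf40,
+ * so the mapping is placed at ELF_PAGESTART(0x08049f40) == 0x08049000,
+ * covers ELF_PAGEALIGN(0x100 + 0xf40) == 0x2000 bytes and starts at
+ * file offset 0xf40 - 0xf40 == 0; the segment bytes therefore land at
+ * 0x08049f40, exactly as the program header requested.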
+ */ + +#define INTERPRETER_NONE 0 +#define INTERPRETER_ELF 2 + +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +static unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned int random_variable = 0; + + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { + random_variable = get_random_int() & STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + +static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) +{ + struct file *interpreter = NULL; /* to shut gcc up */ + unsigned long load_addr = 0, load_bias = 0; + int load_addr_set = 0; + char * elf_interpreter = NULL; + unsigned long error; + struct elf_phdr *elf_ppnt, *elf_phdata; + unsigned long elf_bss, elf_brk; + int retval, i; + unsigned int size; + unsigned long elf_entry; + unsigned long interp_load_addr = 0; + unsigned long start_code, end_code, start_data, end_data; + unsigned long reloc_func_desc __maybe_unused = 0; + int executable_stack = EXSTACK_DEFAULT; + unsigned long def_flags = 0; + struct { + struct elfhdr elf_ex; + struct elfhdr interp_elf_ex; + } *loc; + + loc = kmalloc(sizeof(*loc), GFP_KERNEL); + if (!loc) { + retval = -ENOMEM; + goto out_ret; + } + + /* Get the exec-header */ + loc->elf_ex = *((struct elfhdr *)bprm->buf); + + retval = -ENOEXEC; + /* First of all, some simple consistency checks */ + if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + goto out; + if (!elf_check_arch(&loc->elf_ex)) + goto out; + if (!bprm->file->f_op || !bprm->file->f_op->mmap) + goto out; + + /* Now read in all of the header information */ + if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr)) + goto out; + if (loc->elf_ex.e_phnum < 1 || + loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr)) + goto out; + size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr); + retval = -ENOMEM; + elf_phdata = kmalloc(size, GFP_KERNEL); + if (!elf_phdata) + goto out; + + retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, + (char *)elf_phdata, size); + if (retval != size) { + if (retval >= 0) + retval = -EIO; + goto out_free_ph; + } + + elf_ppnt = elf_phdata; + elf_bss = 0; + elf_brk = 0; + + start_code = ~0UL; + end_code = 0; + start_data = 0; + end_data = 0; + + for (i = 0; i < loc->elf_ex.e_phnum; i++) { + if (elf_ppnt->p_type == PT_INTERP) { + /* This is the program interpreter used for + * shared libraries - for now assume that this + * is an a.out format binary + */ + retval = -ENOEXEC; + if (elf_ppnt->p_filesz > PATH_MAX || + elf_ppnt->p_filesz < 2) + goto out_free_ph; + + retval = -ENOMEM; + elf_interpreter = kmalloc(elf_ppnt->p_filesz, + GFP_KERNEL); + if (!elf_interpreter) + goto out_free_ph; + + retval = kernel_read(bprm->file, elf_ppnt->p_offset, + elf_interpreter, + elf_ppnt->p_filesz); + if (retval != elf_ppnt->p_filesz) { + if (retval >= 0) + retval = -EIO; + goto out_free_interp; + } + /* make sure path is NULL terminated */ + retval = -ENOEXEC; + if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') + goto out_free_interp; + + interpreter = open_exec(elf_interpreter); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) + goto out_free_interp; + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. 
+ */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); + if (retval != BINPRM_BUF_SIZE) { + if (retval >= 0) + retval = -EIO; + goto out_free_dentry; + } + + /* Get the exec headers */ + loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); + break; + } + elf_ppnt++; + } + + elf_ppnt = elf_phdata; + for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + if (elf_ppnt->p_type == PT_GNU_STACK) { + if (elf_ppnt->p_flags & PF_X) + executable_stack = EXSTACK_ENABLE_X; + else + executable_stack = EXSTACK_DISABLE_X; + break; + } + + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; + /* Not an ELF interpreter */ + if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out_free_dentry; + /* Verify the interpreter has a valid arch */ + if (!elf_check_arch(&loc->interp_elf_ex)) + goto out_free_dentry; + } + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto out_free_dentry; + + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->mm->def_flags = def_flags; + + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. */ + SET_PERSONALITY(loc->elf_ex); + if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + current->flags |= PF_RANDOMIZE; + + setup_new_exec(bprm); + + /* Do this so that we can load the interpreter, if need be. We will + change some of these later */ + current->mm->free_area_cache = current->mm->mmap_base; + current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), + executable_stack); + if (retval < 0) { + send_sig(SIGKILL, current, 0); + goto out_free_dentry; + } + + current->mm->start_stack = bprm->p; + + /* Now we do a little grungy work by mmapping the ELF image into + the correct location in memory. */ + for(i = 0, elf_ppnt = elf_phdata; + i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + int elf_prot = 0, elf_flags; + unsigned long k, vaddr; + + if (elf_ppnt->p_type != PT_LOAD) + continue; + + if (unlikely (elf_brk > elf_bss)) { + unsigned long nbyte; + + /* There was a PT_LOAD segment with p_memsz > p_filesz + before this one. Map anonymous pages, if needed, + and clear the area. */ + retval = set_brk(elf_bss + load_bias, + elf_brk + load_bias); + if (retval) { + send_sig(SIGKILL, current, 0); + goto out_free_dentry; + } + nbyte = ELF_PAGEOFFSET(elf_bss); + if (nbyte) { + nbyte = ELF_MIN_ALIGN - nbyte; + if (nbyte > elf_brk - elf_bss) + nbyte = elf_brk - elf_bss; + if (clear_user((void __user *)elf_bss + + load_bias, nbyte)) { + /* + * This bss-zeroing can fail if the ELF + * file specifies odd protections. So + * we don't check the return value + */ + } + } + } + + if (elf_ppnt->p_flags & PF_R) + elf_prot |= PROT_READ; + if (elf_ppnt->p_flags & PF_W) + elf_prot |= PROT_WRITE; + if (elf_ppnt->p_flags & PF_X) + elf_prot |= PROT_EXEC; + + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + + vaddr = elf_ppnt->p_vaddr; + if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + elf_flags |= MAP_FIXED; + } else if (loc->elf_ex.e_type == ET_DYN) { + /* Try and get dynamic programs out of the way of the + * default mmap base, as well as whatever program they + * might try to exec. 
This is because the brk will + * follow the loader, and is not movable. */ +#if defined(CONFIG_X86) || defined(CONFIG_ARM) + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); +#else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); +#endif + } + + error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + elf_prot, elf_flags, 0); + if (BAD_ADDR(error)) { + send_sig(SIGKILL, current, 0); + retval = IS_ERR((void *)error) ? + PTR_ERR((void*)error) : -EINVAL; + goto out_free_dentry; + } + + if (!load_addr_set) { + load_addr_set = 1; + load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (loc->elf_ex.e_type == ET_DYN) { + load_bias += error - + ELF_PAGESTART(load_bias + vaddr); + load_addr += load_bias; + reloc_func_desc = load_bias; + } + } + k = elf_ppnt->p_vaddr; + if (k < start_code) + start_code = k; + if (start_data < k) + start_data = k; + + /* + * Check to see if the section's size will overflow the + * allowed task size. Note that p_filesz must always be + * <= p_memsz so it is only necessary to check p_memsz. + */ + if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || + elf_ppnt->p_memsz > TASK_SIZE || + TASK_SIZE - elf_ppnt->p_memsz < k) { + /* set_brk can never work. Avoid overflows. */ + send_sig(SIGKILL, current, 0); + retval = -EINVAL; + goto out_free_dentry; + } + + k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; + + if (k > elf_bss) + elf_bss = k; + if ((elf_ppnt->p_flags & PF_X) && end_code < k) + end_code = k; + if (end_data < k) + end_data = k; + k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; + if (k > elf_brk) + elf_brk = k; + } + + loc->elf_ex.e_entry += load_bias; + elf_bss += load_bias; + elf_brk += load_bias; + start_code += load_bias; + end_code += load_bias; + start_data += load_bias; + end_data += load_bias; + + /* Calling set_brk effectively mmaps the pages that we need + * for the bss and break sections. We must do this before + * mapping in the interpreter, to make sure it doesn't wind + * up getting placed where the bss needs to go. + */ + retval = set_brk(elf_bss, elf_brk); + if (retval) { + send_sig(SIGKILL, current, 0); + goto out_free_dentry; + } + if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { + send_sig(SIGSEGV, current, 0); + retval = -EFAULT; /* Nobody gets to see this, but.. */ + goto out_free_dentry; + } + + if (elf_interpreter) { + unsigned long uninitialized_var(interp_map_addr); + + elf_entry = load_elf_interp(&loc->interp_elf_ex, + interpreter, + &interp_map_addr, + load_bias); + if (!IS_ERR((void *)elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment + */ + interp_load_addr = elf_entry; + elf_entry += loc->interp_elf_ex.e_entry; + } + if (BAD_ADDR(elf_entry)) { + force_sig(SIGSEGV, current); + retval = IS_ERR((void *)elf_entry) ? 
+				(int)elf_entry : -EINVAL;
+			goto out_free_dentry;
+		}
+		reloc_func_desc = interp_load_addr;
+
+		allow_write_access(interpreter);
+		fput(interpreter);
+		kfree(elf_interpreter);
+	} else {
+		elf_entry = loc->elf_ex.e_entry;
+		if (BAD_ADDR(elf_entry)) {
+			force_sig(SIGSEGV, current);
+			retval = -EINVAL;
+			goto out_free_dentry;
+		}
+	}
+
+	kfree(elf_phdata);
+
+	set_binfmt(&elf_format);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto out;
+	}
+#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
+
+	install_exec_creds(bprm);
+	current->flags &= ~PF_FORKNOEXEC;
+	retval = create_elf_tables(bprm, &loc->elf_ex,
+			  load_addr, interp_load_addr);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto out;
+	}
+	/* N.B. passed_fileno might not be initialized? */
+	current->mm->end_code = end_code;
+	current->mm->start_code = start_code;
+	current->mm->start_data = start_data;
+	current->mm->end_data = end_data;
+	current->mm->start_stack = bprm->p;
+
+#ifdef arch_randomize_brk
+	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+		current->mm->brk = current->mm->start_brk =
+			arch_randomize_brk(current->mm);
+#ifdef CONFIG_COMPAT_BRK
+		current->brk_randomized = 1;
+#endif
+	}
+#endif
+
+	if (current->personality & MMAP_PAGE_ZERO) {
+		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
+		   and some applications "depend" upon this behavior.
+		   Since we do not have the power to recompile these, we
+		   emulate the SVr4 behavior. Sigh. */
+		down_write(&current->mm->mmap_sem);
+		error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
+				MAP_FIXED | MAP_PRIVATE, 0);
+		up_write(&current->mm->mmap_sem);
+	}
+
+#ifdef ELF_PLAT_INIT
+	/*
+	 * The ABI may specify that certain registers be set up in special
+	 * ways (on i386 %edx is the address of a DT_FINI function, for
+	 * example.  In addition, it may also specify (eg, PowerPC64 ELF)
+	 * that the e_entry field is the address of the function descriptor
+	 * for the startup routine, rather than the address of the startup
+	 * routine itself.  This macro performs whatever initialization to
+	 * the regs structure is required as well as any relocations to the
+	 * function descriptor entries when executing dynamically linked apps.
+	 */
+	ELF_PLAT_INIT(regs, reloc_func_desc);
+#endif
+
+	start_thread(regs, elf_entry, bprm->p);
+	retval = 0;
+out:
+	kfree(loc);
+out_ret:
+	return retval;
+
+	/* error cleanup */
+out_free_dentry:
+	allow_write_access(interpreter);
+	if (interpreter)
+		fput(interpreter);
+out_free_interp:
+	kfree(elf_interpreter);
+out_free_ph:
+	kfree(elf_phdata);
+	goto out;
+}
+
+/* This is really simpleminded and specialized - we are loading an
+   a.out library that is given an ELF header.
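+   It backs the legacy uselib(2) interface; as the checks below show,
+   only a library image with exactly one PT_LOAD segment is accepted.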
+ */
+static int load_elf_library(struct file *file)
+{
+	struct elf_phdr *elf_phdata;
+	struct elf_phdr *eppnt;
+	unsigned long elf_bss, bss, len;
+	int retval, error, i, j;
+	struct elfhdr elf_ex;
+
+	error = -ENOEXEC;
+	retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex));
+	if (retval != sizeof(elf_ex))
+		goto out;
+
+	if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* First of all, some simple consistency checks */
+	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
+	    !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+		goto out;
+
+	/* Now read in all of the header information */
+
+	j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
+	/* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
+
+	error = -ENOMEM;
+	elf_phdata = kmalloc(j, GFP_KERNEL);
+	if (!elf_phdata)
+		goto out;
+
+	eppnt = elf_phdata;
+	error = -ENOEXEC;
+	retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j);
+	if (retval != j)
+		goto out_free_ph;
+
+	for (j = 0, i = 0; i < elf_ex.e_phnum; i++)
+		if ((eppnt + i)->p_type == PT_LOAD)
+			j++;
+	if (j != 1)
+		goto out_free_ph;
+
+	while (eppnt->p_type != PT_LOAD)
+		eppnt++;
+
+	/* Now use mmap to map the library into memory. */
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap(file,
+			ELF_PAGESTART(eppnt->p_vaddr),
+			(eppnt->p_filesz +
+			 ELF_PAGEOFFSET(eppnt->p_vaddr)),
+			PROT_READ | PROT_WRITE | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
+			(eppnt->p_offset -
+			 ELF_PAGEOFFSET(eppnt->p_vaddr)));
+	up_write(&current->mm->mmap_sem);
+	if (error != ELF_PAGESTART(eppnt->p_vaddr))
+		goto out_free_ph;
+
+	elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
+	if (padzero(elf_bss)) {
+		error = -EFAULT;
+		goto out_free_ph;
+	}
+
+	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
+			    ELF_MIN_ALIGN - 1);
+	bss = eppnt->p_memsz + eppnt->p_vaddr;
+	if (bss > len) {
+		down_write(&current->mm->mmap_sem);
+		do_brk(len, bss - len);
+		up_write(&current->mm->mmap_sem);
+	}
+	error = 0;
+
+out_free_ph:
+	kfree(elf_phdata);
+out:
+	return error;
+}
+
+#ifdef CONFIG_ELF_CORE
+/*
+ * ELF core dumper
+ *
+ * Modelled on fs/exec.c:aout_core_dump()
+ * Jeremy Fitzhardinge
+ */
+
+/*
+ * Decide what to dump of a segment, part, all or none.
+ */
+static unsigned long vma_dump_size(struct vm_area_struct *vma,
+				   unsigned long mm_flags)
+{
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
+	/* The vma can be set up to tell us the answer directly.  */
+	if (vma->vm_flags & VM_ALWAYSDUMP)
+		goto whole;
+
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
+	/* Do not dump I/O mapped devices or special mappings */
+	if (vma->vm_flags & (VM_IO | VM_RESERVED))
+		return 0;
+
+	/* By default, dump shared memory if mapped from an anonymous file. */
+	if (vma->vm_flags & VM_SHARED) {
+		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
+			FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
+			goto whole;
+		return 0;
+	}
+
+	/* Dump segments that have been written to.  */
+	if (vma->anon_vma && FILTER(ANON_PRIVATE))
+		goto whole;
+	if (vma->vm_file == NULL)
+		return 0;
+
+	if (FILTER(MAPPED_PRIVATE))
+		goto whole;
+
+	/*
+	 * If this looks like the beginning of a DSO or executable mapping,
+	 * check for an ELF header.  If we find one, dump the first page to
+	 * aid in determining what was mapped here.
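+	 * That first page typically carries the e_ident bytes and the
+	 * program header table, which is usually enough for post-mortem
+	 * tools to identify the file backing the mapping even though the
+	 * rest of its text is not written to the core file.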
+ */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + u32 __user *header = (u32 __user *) vma->vm_start; + u32 word; + mm_segment_t fs = get_fs(); + /* + * Doing it this way gets the constant folded by GCC. + */ + union { + u32 cmp; + char elfmag[SELFMAG]; + } magic; + BUILD_BUG_ON(SELFMAG != sizeof word); + magic.elfmag[EI_MAG0] = ELFMAG0; + magic.elfmag[EI_MAG1] = ELFMAG1; + magic.elfmag[EI_MAG2] = ELFMAG2; + magic.elfmag[EI_MAG3] = ELFMAG3; + /* + * Switch to the user "segment" for get_user(), + * then put back what elf_core_dump() had in place. + */ + set_fs(USER_DS); + if (unlikely(get_user(word, header))) + word = 0; + set_fs(fs); + if (word == magic.cmp) + return PAGE_SIZE; + } + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +#define DUMP_WRITE(addr, nr, foffset) \ + do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) + +static int alignfile(struct file *file, loff_t *foffset) +{ + static const char buf[4] = { 0, }; + DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); + return 1; +} + +static int writenote(struct memelfnote *men, struct file *file, + loff_t *foffset) +{ + struct elf_note en; + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en), foffset); + DUMP_WRITE(men->name, en.n_namesz, foffset); + if (!alignfile(file, foffset)) + return 0; + DUMP_WRITE(men->data, men->datasz, foffset); + if (!alignfile(file, foffset)) + return 0; + + return 1; +} +#undef DUMP_WRITE + +static void fill_elf_header(struct elfhdr *elf, int segs, + u16 machine, u32 flags, u8 osabi) +{ + memset(elf, 0, sizeof(*elf)); + + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + + elf->e_type = ET_CORE; + elf->e_machine = machine; + elf->e_version = EV_CURRENT; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_flags = flags; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + + return; +} + +static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up separately. 
+ */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); + if (thread_group_leader(p)) { + struct task_cputime cputime; + + /* + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. + */ + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + const struct cred *cred; + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ-1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *)mm->arg_start, len)) + return -EFAULT; + for(i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) +{ + elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv; + int i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); +} + +#ifdef CORE_DUMP_USE_REGSET +#include + +struct elf_thread_core_info { + struct elf_thread_core_info *next; + struct task_struct *task; + struct elf_prstatus prstatus; + struct memelfnote notes[0]; +}; + +struct elf_note_info { + struct elf_thread_core_info *thread; + struct memelfnote psinfo; + struct memelfnote auxv; + size_t size; + int thread_notes; +}; + +/* + * When a regset has a writeback hook, we call it on each thread before + * dumping user memory. On register window machines, this makes sure the + * user memory backing the register data is up to date before we read it. 
+ */ +static void do_thread_regset_writeback(struct task_struct *task, + const struct user_regset *regset) +{ + if (regset->writeback) + regset->writeback(task, regset, 1); +} + +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, size_t *total) +{ + unsigned int i; + + /* + * NT_PRSTATUS is the one special case, because the regset data + * goes into the pr_reg field inside the note contents, rather + * than being the whole note contents. We fill the reset in here. + * We assume that regset 0 is NT_PRSTATUS. + */ + fill_prstatus(&t->prstatus, t->task, signr); + (void) view->regsets[0].get(t->task, &view->regsets[0], + 0, sizeof(t->prstatus.pr_reg), + &t->prstatus.pr_reg, NULL); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + sizeof(t->prstatus), &t->prstatus); + *total += notesize(&t->notes[0]); + + do_thread_regset_writeback(t->task, &view->regsets[0]); + + /* + * Each other regset might generate a note too. For each regset + * that has no core_note_type or is inactive, we leave t->notes[i] + * all zero and we'll know to skip writing it later. + */ + for (i = 1; i < view->n; ++i) { + const struct user_regset *regset = &view->regsets[i]; + do_thread_regset_writeback(t->task, regset); + if (regset->core_note_type && regset->get && + (!regset->active || regset->active(t->task, regset))) { + int ret; + size_t size = regset->n * regset->size; + void *data = kmalloc(size, GFP_KERNEL); + if (unlikely(!data)) + return 0; + ret = regset->get(t->task, regset, + 0, size, data, NULL); + if (unlikely(ret)) + kfree(data); + else { + if (regset->core_note_type != NT_PRFPREG) + fill_note(&t->notes[i], "LINUX", + regset->core_note_type, + size, data); + else { + t->prstatus.pr_fpvalid = 1; + fill_note(&t->notes[i], "CORE", + NT_PRFPREG, size, data); + } + *total += notesize(&t->notes[i]); + } + } + } + + return 1; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + long signr, struct pt_regs *regs) +{ + struct task_struct *dump_task = current; + const struct user_regset_view *view = task_user_regset_view(dump_task); + struct elf_thread_core_info *t; + struct elf_prpsinfo *psinfo; + struct core_thread *ct; + unsigned int i; + + info->size = 0; + info->thread = NULL; + + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (psinfo == NULL) + return 0; + + fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + /* + * Figure out how many notes we're going to need for each thread. + */ + info->thread_notes = 0; + for (i = 0; i < view->n; ++i) + if (view->regsets[i].core_note_type != 0) + ++info->thread_notes; + + /* + * Sanity check. We rely on regset 0 being in NT_PRSTATUS, + * since it is our one special case. + */ + if (unlikely(info->thread_notes == 0) || + unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) { + WARN_ON(1); + return 0; + } + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, + view->e_machine, view->e_flags, view->ei_osabi); + + /* + * Allocate a structure for each thread. + */ + for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!t)) + return 0; + + t->task = ct->task; + if (ct->task == dump_task || !info->thread) { + t->next = info->thread; + info->thread = t; + } else { + /* + * Make sure to keep the original task at + * the head of the list. 
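+			 * Keeping the dumping task first means its
+			 * NT_PRSTATUS note is emitted before any other
+			 * thread's, so write_note_info() can slot the
+			 * process-wide NT_PRPSINFO and NT_AUXV notes
+			 * directly after it.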
+ */ + t->next = info->thread->next; + info->thread->next = t; + } + } + + /* + * Now fill in each thread's information. + */ + for (t = info->thread; t != NULL; t = t->next) + if (!fill_thread_core_info(t, view, signr, &info->size)) + return 0; + + /* + * Fill in the two process-wide notes. + */ + fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); + info->size += notesize(&info->psinfo); + + fill_auxv_note(&info->auxv, current->mm); + info->size += notesize(&info->auxv); + + return 1; +} + +static size_t get_note_info_size(struct elf_note_info *info) +{ + return info->size; +} + +/* + * Write all the notes for each thread. When writing the first thread, the + * process-wide notes are interleaved after the first thread-specific note. + */ +static int write_note_info(struct elf_note_info *info, + struct file *file, loff_t *foffset) +{ + bool first = 1; + struct elf_thread_core_info *t = info->thread; + + do { + int i; + + if (!writenote(&t->notes[0], file, foffset)) + return 0; + + if (first && !writenote(&info->psinfo, file, foffset)) + return 0; + if (first && !writenote(&info->auxv, file, foffset)) + return 0; + + for (i = 1; i < info->thread_notes; ++i) + if (t->notes[i].data && + !writenote(&t->notes[i], file, foffset)) + return 0; + + first = 0; + t = t->next; + } while (t); + + return 1; +} + +static void free_note_info(struct elf_note_info *info) +{ + struct elf_thread_core_info *threads = info->thread; + while (threads) { + unsigned int i; + struct elf_thread_core_info *t = threads; + threads = t->next; + WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); + for (i = 1; i < info->thread_notes; ++i) + kfree(t->notes[i].data); + kfree(t); + } + kfree(info->psinfo.data); +} + +#else + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every threads pr_status and then create + * a single section for them in the final core file. 
+ */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + int sz = 0; + struct task_struct *p = t->thread; + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + t->num_notes++; + sz += notesize(&t->notes[0]); + + if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, + &t->fpu))) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &(t->fpu)); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(t->xfpu), &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +struct elf_note_info { + struct memelfnote *notes; + struct elf_prstatus *prstatus; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ + struct list_head thread_list; + elf_fpregset_t *fpu; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu; +#endif + int thread_status_size; + int numnote; +}; + +static int elf_note_info_init(struct elf_note_info *info) +{ + memset(info, 0, sizeof(*info)); + INIT_LIST_HEAD(&info->thread_list); + + /* Allocate space for six ELF notes */ + info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); + if (!info->notes) + return 0; + info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); + if (!info->psinfo) + goto notes_free; + info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); + if (!info->prstatus) + goto psinfo_free; + info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); + if (!info->fpu) + goto prstatus_free; +#ifdef ELF_CORE_COPY_XFPREGS + info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); + if (!info->xfpu) + goto fpu_free; +#endif + return 1; +#ifdef ELF_CORE_COPY_XFPREGS + fpu_free: + kfree(info->fpu); +#endif + prstatus_free: + kfree(info->prstatus); + psinfo_free: + kfree(info->psinfo); + notes_free: + kfree(info->notes); + return 0; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + long signr, struct pt_regs *regs) +{ + struct list_head *t; + + if (!elf_note_info_init(info)) + return 0; + + if (signr) { + struct core_thread *ct; + struct elf_thread_status *ets; + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; + + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } + + list_for_each(t, &info->thread_list) { + int sz; + + ets = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, ets); + info->thread_status_size += sz; + } + } + /* now collect the dump for the current */ + memset(info->prstatus, 0, sizeof(*info->prstatus)); + fill_prstatus(info->prstatus, current, signr); + elf_core_copy_regs(&info->prstatus->pr_reg, regs); + + /* Set up header */ + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI); + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(info->notes + 0, "CORE", NT_PRSTATUS, + sizeof(*info->prstatus), info->prstatus); + fill_psinfo(info->psinfo, current->group_leader, current->mm); + fill_note(info->notes + 1, "CORE", NT_PRPSINFO, + sizeof(*info->psinfo), info->psinfo); + + info->numnote = 2; + + fill_auxv_note(&info->notes[info->numnote++], current->mm); + + /* Try to dump the FPU. 
*/ + info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, + info->fpu); + if (info->prstatus->pr_fpvalid) + fill_note(info->notes + info->numnote++, + "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, info->xfpu)) + fill_note(info->notes + info->numnote++, + "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(*info->xfpu), info->xfpu); +#endif + + return 1; +} + +static size_t get_note_info_size(struct elf_note_info *info) +{ + int sz = 0; + int i; + + for (i = 0; i < info->numnote; i++) + sz += notesize(info->notes + i); + + sz += info->thread_status_size; + + return sz; +} + +static int write_note_info(struct elf_note_info *info, + struct file *file, loff_t *foffset) +{ + int i; + struct list_head *t; + + for (i = 0; i < info->numnote; i++) + if (!writenote(info->notes + i, file, foffset)) + return 0; + + /* write out the thread status notes section */ + list_for_each(t, &info->thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], file, foffset)) + return 0; + } + + return 1; +} + +static void free_note_info(struct elf_note_info *info) +{ + while (!list_empty(&info->thread_list)) { + struct list_head *tmp = info->thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + + kfree(info->prstatus); + kfree(info->psinfo); + kfree(info->notes); + kfree(info->fpu); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(info->xfpu); +#endif +} + +#endif + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, + unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) + size += vma_dump_size(vma, mm_flags); + return size; +} + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. 
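+ *
+ * The resulting file layout is: the ELF header, the program header
+ * table, the PT_NOTE payload, padding up to ELF_EXEC_PAGESIZE, the
+ * PT_LOAD segment data in vma order and, when more than PN_XNUM
+ * segments exist, one extra section header carrying the real segment
+ * count.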
+ */ +static int elf_core_dump(struct coredump_params *cprm) +{ + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + struct vm_area_struct *vma, *gate_vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff, foffset; + struct elf_note_info info; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto out; + /* + * The number of segs are recored into ELF header as 16bit value. + * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. + */ + segs = current->mm->map_count; + segs += elf_core_extra_phdrs(); + + gate_vma = get_gate_vma(current->mm); + if (gate_vma != NULL) + segs++; + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + + /* + * Collect all the non-memory information about the process for the + * notes. This also sets up the file header. + */ + if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) + goto cleanup; + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + fs = get_fs(); + set_fs(KERNEL_DS); + + offset += sizeof(*elf); /* Elf header */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + foffset = offset; + + /* Write notes phdr entry */ + { + size_t sz = get_note_info_size(&info); + + sz += elf_coredump_extra_notes_size(); + + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; + } + + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + + /* Write program headers for segments dump */ + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + struct elf_phdr phdr; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); + phdr.p_memsz = vma->vm_end - vma->vm_start; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; + } + + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; + + /* write out the notes section */ + if (!write_note_info(&info, cprm->file, &foffset)) + goto end_coredump; + + if (elf_coredump_extra_notes_write(cprm->file, &foffset)) + goto end_coredump; + + /* Align to page */ + if (!dump_seek(cprm->file, dataoff - foffset)) + goto end_coredump; + + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + unsigned long addr; + unsigned long end; + + end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); + + for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { + struct page *page; + int stop; + + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); + kunmap(page); + page_cache_release(page); + } else + stop = !dump_seek(cprm->file, PAGE_SIZE); + if (stop) + goto end_coredump; + } + } + + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } + +end_coredump: + set_fs(fs); + +cleanup: + free_note_info(&info); + kfree(shdr4extnum); + kfree(phdr4note); + kfree(elf); +out: + return has_dumped; +} + +#endif /* CONFIG_ELF_CORE */ + +static int __init init_elf_binfmt(void) +{ + return register_binfmt(&elf_format); +} + +static void __exit exit_elf_binfmt(void) +{ + /* Remove the COFF and ELF loaders. */ + unregister_binfmt(&elf_format); +} + +core_initcall(init_elf_binfmt); +module_exit(exit_elf_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c new file mode 100644 index 00000000..2bc5dc64 --- /dev/null +++ b/fs/binfmt_elf_fdpic.c @@ -0,0 +1,1875 @@ +/* binfmt_elf_fdpic.c: FDPIC ELF binary format + * + * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * Derived from binfmt_elf.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +typedef char *elf_caddr_t; + +#if 0 +#define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdebug(fmt, ...) do {} while(0) +#endif + +#if 0 +#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdcore(fmt, ...) 
do {} while(0) +#endif + +MODULE_LICENSE("GPL"); + +static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); +static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, + struct mm_struct *, const char *); + +static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, + struct elf_fdpic_params *, + struct elf_fdpic_params *); + +#ifndef CONFIG_MMU +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, + unsigned long *); +static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, + struct file *, + struct mm_struct *); +#endif + +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, + struct file *, struct mm_struct *); + +#ifdef CONFIG_ELF_CORE +static int elf_fdpic_core_dump(struct coredump_params *cprm); +#endif + +static struct linux_binfmt elf_fdpic_format = { + .module = THIS_MODULE, + .load_binary = load_elf_fdpic_binary, +#ifdef CONFIG_ELF_CORE + .core_dump = elf_fdpic_core_dump, +#endif + .min_coredump = ELF_EXEC_PAGESIZE, +}; + +static int __init init_elf_fdpic_binfmt(void) +{ + return register_binfmt(&elf_fdpic_format); +} + +static void __exit exit_elf_fdpic_binfmt(void) +{ + unregister_binfmt(&elf_fdpic_format); +} + +core_initcall(init_elf_fdpic_binfmt); +module_exit(exit_elf_fdpic_binfmt); + +static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) +{ + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0) + return 0; + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) + return 0; + if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) + return 0; + if (!file->f_op || !file->f_op->mmap) + return 0; + return 1; +} + +/*****************************************************************************/ +/* + * read the program headers table into memory + */ +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, + struct file *file) +{ + struct elf32_phdr *phdr; + unsigned long size; + int retval, loop; + + if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) + return -ENOMEM; + if (params->hdr.e_phnum > 65536U / sizeof(struct elf_phdr)) + return -ENOMEM; + + size = params->hdr.e_phnum * sizeof(struct elf_phdr); + params->phdrs = kmalloc(size, GFP_KERNEL); + if (!params->phdrs) + return -ENOMEM; + + retval = kernel_read(file, params->hdr.e_phoff, + (char *) params->phdrs, size); + if (unlikely(retval != size)) + return retval < 0 ? 
retval : -ENOEXEC; + + /* determine stack size for this binary */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_GNU_STACK) + continue; + + if (phdr->p_flags & PF_X) + params->flags |= ELF_FDPIC_FLAG_EXEC_STACK; + else + params->flags |= ELF_FDPIC_FLAG_NOEXEC_STACK; + + params->stack_size = phdr->p_memsz; + break; + } + + return 0; +} + +/*****************************************************************************/ +/* + * load an fdpic binary into various bits of memory + */ +static int load_elf_fdpic_binary(struct linux_binprm *bprm, + struct pt_regs *regs) +{ + struct elf_fdpic_params exec_params, interp_params; + struct elf_phdr *phdr; + unsigned long stack_size, entryaddr; +#ifdef ELF_FDPIC_PLAT_INIT + unsigned long dynaddr; +#endif +#ifndef CONFIG_MMU + unsigned long stack_prot; +#endif + struct file *interpreter = NULL; /* to shut gcc up */ + char *interpreter_name = NULL; + int executable_stack; + int retval, i; + + kdebug("____ LOAD %d ____", current->pid); + + memset(&exec_params, 0, sizeof(exec_params)); + memset(&interp_params, 0, sizeof(interp_params)); + + exec_params.hdr = *(struct elfhdr *) bprm->buf; + exec_params.flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE; + + /* check that this is a binary we know how to deal with */ + retval = -ENOEXEC; + if (!is_elf_fdpic(&exec_params.hdr, bprm->file)) + goto error; + + /* read the program header table */ + retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file); + if (retval < 0) + goto error; + + /* scan for a program header that specifies an interpreter */ + phdr = exec_params.phdrs; + + for (i = 0; i < exec_params.hdr.e_phnum; i++, phdr++) { + switch (phdr->p_type) { + case PT_INTERP: + retval = -ENOMEM; + if (phdr->p_filesz > PATH_MAX) + goto error; + retval = -ENOENT; + if (phdr->p_filesz < 2) + goto error; + + /* read the name of the interpreter into memory */ + interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); + if (!interpreter_name) + goto error; + + retval = kernel_read(bprm->file, + phdr->p_offset, + interpreter_name, + phdr->p_filesz); + if (unlikely(retval != phdr->p_filesz)) { + if (retval >= 0) + retval = -ENOEXEC; + goto error; + } + + retval = -ENOENT; + if (interpreter_name[phdr->p_filesz - 1] != '\0') + goto error; + + kdebug("Using ELF interpreter %s", interpreter_name); + + /* replace the program with the interpreter */ + interpreter = open_exec(interpreter_name); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) { + interpreter = NULL; + goto error; + } + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. 
+ */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); + if (unlikely(retval != BINPRM_BUF_SIZE)) { + if (retval >= 0) + retval = -ENOEXEC; + goto error; + } + + interp_params.hdr = *((struct elfhdr *) bprm->buf); + break; + + case PT_LOAD: +#ifdef CONFIG_MMU + if (exec_params.load_addr == 0) + exec_params.load_addr = phdr->p_vaddr; +#endif + break; + } + + } + + if (elf_check_const_displacement(&exec_params.hdr)) + exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; + + /* perform insanity checks on the interpreter */ + if (interpreter_name) { + retval = -ELIBBAD; + if (!is_elf_fdpic(&interp_params.hdr, interpreter)) + goto error; + + interp_params.flags = ELF_FDPIC_FLAG_PRESENT; + + /* read the interpreter's program header table */ + retval = elf_fdpic_fetch_phdrs(&interp_params, interpreter); + if (retval < 0) + goto error; + } + + stack_size = exec_params.stack_size; + if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + + if (stack_size == 0) { + stack_size = interp_params.stack_size; + if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + } + + retval = -ENOEXEC; + if (stack_size == 0) + goto error; + + if (elf_check_const_displacement(&interp_params.hdr)) + interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; + + /* flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto error; + + /* there's now no turning back... the old userspace image is dead, + * defunct, deceased, etc. 
after this point we have to exit via
+	 * error_kill */
+	set_personality(PER_LINUX_FDPIC);
+	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
+		current->personality |= READ_IMPLIES_EXEC;
+
+	setup_new_exec(bprm);
+
+	set_binfmt(&elf_fdpic_format);
+
+	current->mm->start_code = 0;
+	current->mm->end_code = 0;
+	current->mm->start_stack = 0;
+	current->mm->start_data = 0;
+	current->mm->end_data = 0;
+	current->mm->context.exec_fdpic_loadmap = 0;
+	current->mm->context.interp_fdpic_loadmap = 0;
+
+	current->flags &= ~PF_FORKNOEXEC;
+
+#ifdef CONFIG_MMU
+	elf_fdpic_arch_lay_out_mm(&exec_params,
+				  &interp_params,
+				  &current->mm->start_stack,
+				  &current->mm->start_brk);
+
+	retval = setup_arg_pages(bprm, current->mm->start_stack,
+				 executable_stack);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto error_kill;
+	}
+#endif
+
+	/* load the executable and interpreter into memory */
+	retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
+				    "executable");
+	if (retval < 0)
+		goto error_kill;
+
+	if (interpreter_name) {
+		retval = elf_fdpic_map_file(&interp_params, interpreter,
+					    current->mm, "interpreter");
+		if (retval < 0) {
+			printk(KERN_ERR "Unable to load interpreter\n");
+			goto error_kill;
+		}
+
+		allow_write_access(interpreter);
+		fput(interpreter);
+		interpreter = NULL;
+	}
+
+#ifdef CONFIG_MMU
+	if (!current->mm->start_brk)
+		current->mm->start_brk = current->mm->end_data;
+
+	current->mm->brk = current->mm->start_brk =
+		PAGE_ALIGN(current->mm->start_brk);
+
+#else
+	/* create a stack and brk area big enough for everyone
+	 * - the brk heap starts at the bottom and works up
+	 * - the stack starts at the top and works down
+	 */
+	stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
+	if (stack_size < PAGE_SIZE * 2)
+		stack_size = PAGE_SIZE * 2;
+
+	stack_prot = PROT_READ | PROT_WRITE;
+	if (executable_stack == EXSTACK_ENABLE_X ||
+	    (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
+		stack_prot |= PROT_EXEC;
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
+					 MAP_PRIVATE | MAP_ANONYMOUS |
+					 MAP_UNINITIALIZED | MAP_GROWSDOWN,
+					 0);
+
+	if (IS_ERR_VALUE(current->mm->start_brk)) {
+		up_write(&current->mm->mmap_sem);
+		retval = current->mm->start_brk;
+		current->mm->start_brk = 0;
+		goto error_kill;
+	}
+
+	up_write(&current->mm->mmap_sem);
+
+	current->mm->brk = current->mm->start_brk;
+	current->mm->context.end_brk = current->mm->start_brk;
+	current->mm->context.end_brk +=
+		(stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
+	current->mm->start_stack = current->mm->start_brk + stack_size;
+#endif
+
+	install_exec_creds(bprm);
+	current->flags &= ~PF_FORKNOEXEC;
+	if (create_elf_fdpic_tables(bprm, current->mm,
+				    &exec_params, &interp_params) < 0)
+		goto error_kill;
+
+	kdebug("- start_code %lx", current->mm->start_code);
+	kdebug("- end_code %lx", current->mm->end_code);
+	kdebug("- start_data %lx", current->mm->start_data);
+	kdebug("- end_data %lx", current->mm->end_data);
+	kdebug("- start_brk %lx", current->mm->start_brk);
+	kdebug("- brk %lx", current->mm->brk);
+	kdebug("- start_stack %lx", current->mm->start_stack);
+
+#ifdef ELF_FDPIC_PLAT_INIT
+	/*
+	 * The ABI may specify that certain registers be set up in special
+	 * ways (on i386 %edx is the address of a DT_FINI function, for
+	 * example.  This macro performs whatever initialization to
+	 * the regs structure is required.
+ */ + dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; + ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, + dynaddr); +#endif + + /* everything is now ready... get the userspace context ready to roll */ + entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; + start_thread(regs, entryaddr, current->mm->start_stack); + + retval = 0; + +error: + if (interpreter) { + allow_write_access(interpreter); + fput(interpreter); + } + kfree(interpreter_name); + kfree(exec_params.phdrs); + kfree(exec_params.loadmap); + kfree(interp_params.phdrs); + kfree(interp_params.loadmap); + return retval; + + /* unrecoverable error - kill the process */ +error_kill: + send_sig(SIGSEGV, current, 0); + goto error; + +} + +/*****************************************************************************/ + +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + +/* + * present useful information to the program by shovelling it onto the new + * process's stack + */ +static int create_elf_fdpic_tables(struct linux_binprm *bprm, + struct mm_struct *mm, + struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params) +{ + const struct cred *cred = current_cred(); + unsigned long sp, csp, nitems; + elf_caddr_t __user *argv, *envp; + size_t platform_len = 0, len; + char *k_platform, *k_base_platform; + char __user *u_platform, *u_base_platform, *p; + long hwcap; + int loop; + int nr; /* reset for each csp adjustment */ + +#ifdef CONFIG_MMU + /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them, so we give the architecture + * an opportunity to do so here. + */ + sp = arch_align_stack(bprm->p); +#else + sp = mm->start_stack; + + /* stack the program arguments and environment */ + if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) + return -EFAULT; +#endif + + hwcap = ELF_HWCAP; + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ + k_platform = ELF_PLATFORM; + u_platform = NULL; + + if (k_platform) { + platform_len = strlen(k_platform) + 1; + sp -= platform_len; + u_platform = (char __user *) sp; + if (__copy_to_user(u_platform, k_platform, platform_len) != 0) + return -EFAULT; + } + + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. 
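The base-platform string is pushed exactly like the plain platform string just above: drop the stack pointer by the string length (NUL included), copy the string there, and remember the resulting user address so it can be advertised later through AT_PLATFORM / AT_BASE_PLATFORM. A standalone sketch of that push, where the flat stack_image buffer is an illustrative stand-in for the user stack that __copy_to_user() really targets:

    #include <string.h>

    static unsigned long push_platform_string(char *stack_image, unsigned long sp,
                                              const char *str, unsigned long *where)
    {
        size_t len = strlen(str) + 1;           /* include the NUL */

        sp -= len;
        memcpy(stack_image + sp, str, len);     /* models __copy_to_user() */
        *where = sp;                            /* address later placed in the auxv */
        return sp;
    }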
+ */ + k_base_platform = ELF_BASE_PLATFORM; + u_base_platform = NULL; + + if (k_base_platform) { + platform_len = strlen(k_base_platform) + 1; + sp -= platform_len; + u_base_platform = (char __user *) sp; + if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) + return -EFAULT; + } + + sp &= ~7UL; + + /* stack the load map(s) */ + len = sizeof(struct elf32_fdpic_loadmap); + len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; + sp = (sp - len) & ~7UL; + exec_params->map_addr = sp; + + if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0) + return -EFAULT; + + current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; + + if (interp_params->loadmap) { + len = sizeof(struct elf32_fdpic_loadmap); + len += sizeof(struct elf32_fdpic_loadseg) * + interp_params->loadmap->nsegs; + sp = (sp - len) & ~7UL; + interp_params->map_addr = sp; + + if (copy_to_user((void __user *) sp, interp_params->loadmap, + len) != 0) + return -EFAULT; + + current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; + } + + /* force 16 byte _final_ alignment here for generality */ +#define DLINFO_ITEMS 15 + + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + + (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) + nitems++; + + csp = sp; + sp -= nitems * 2 * sizeof(unsigned long); + sp -= (bprm->envc + 1) * sizeof(char *); /* envv[] */ + sp -= (bprm->argc + 1) * sizeof(char *); /* argv[] */ + sp -= 1 * sizeof(unsigned long); /* argc */ + + csp -= sp & 15UL; + sp -= sp & 15UL; + + /* put the ELF interpreter info on the stack */ +#define NEW_AUX_ENT(id, val) \ + do { \ + struct { unsigned long _id, _val; } __user *ent; \ + \ + ent = (void __user *) csp; \ + __put_user((id), &ent[nr]._id); \ + __put_user((val), &ent[nr]._val); \ + nr++; \ + } while (0) + + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_NULL, 0); + if (k_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t) (unsigned long) u_platform); + } + + if (k_base_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t) (unsigned long) u_base_platform); + } + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } + + nr = 0; + csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); + +#ifdef ARCH_DLINFO + nr = 0; + csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); + + /* ARCH_DLINFO must come last so platform specific code can enforce + * special alignment requirements on the AUXV if necessary (eg. PPC). 
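The frame accounting a few lines above can be read as a small formula: auxv (id, value) pairs are counted first, then the envp[] and argv[] pointer arrays plus argc, and the whole frame is finally forced down to a 16-byte boundary. A standalone sketch of the same arithmetic, using the DLINFO_ITEMS value defined above (the have_* parameters are illustrative toggles for the optional entries):

    static unsigned long layout_initial_stack(unsigned long sp, int argc, int envc,
                                              int have_platform, int have_base_platform,
                                              int have_execfd, int arch_entries)
    {
        unsigned long nitems = 1 /* AT_NULL */ + 15 /* DLINFO_ITEMS */ +
            have_platform + have_base_platform + have_execfd + arch_entries;

        sp -= nitems * 2 * sizeof(unsigned long);       /* auxv pairs */
        sp -= (envc + 1) * sizeof(char *);              /* envp[] + NULL */
        sp -= (argc + 1) * sizeof(char *);              /* argv[] + NULL */
        sp -= sizeof(unsigned long);                    /* argc itself */
        sp -= sp & 15UL;                                /* force 16-byte alignment */
        return sp;
    }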
+ */ + ARCH_DLINFO; +#endif +#undef NEW_AUX_ENT + + /* allocate room for argv[] and envv[] */ + csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); + envp = (elf_caddr_t __user *) csp; + csp -= (bprm->argc + 1) * sizeof(elf_caddr_t); + argv = (elf_caddr_t __user *) csp; + + /* stack argc */ + csp -= sizeof(unsigned long); + __put_user(bprm->argc, (unsigned long __user *) csp); + + BUG_ON(csp != sp); + + /* fill in the argv[] array */ +#ifdef CONFIG_MMU + current->mm->arg_start = bprm->p; +#else + current->mm->arg_start = current->mm->start_stack - + (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); +#endif + + p = (char __user *) current->mm->arg_start; + for (loop = bprm->argc; loop > 0; loop--) { + __put_user((elf_caddr_t) p, argv++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(NULL, argv); + current->mm->arg_end = (unsigned long) p; + + /* fill in the envv[] array */ + current->mm->env_start = (unsigned long) p; + for (loop = bprm->envc; loop > 0; loop--) { + __put_user((elf_caddr_t)(unsigned long) p, envp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(NULL, envp); + current->mm->env_end = (unsigned long) p; + + mm->start_stack = (unsigned long) sp; + return 0; +} + +/*****************************************************************************/ +/* + * transfer the program arguments and environment from the holding pages onto + * the stack + */ +#ifndef CONFIG_MMU +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *_sp) +{ + unsigned long index, stop, sp; + char *src; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *_sp; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + src = kmap(bprm->page[index]); + sp -= PAGE_SIZE; + if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret < 0) + goto out; + } + + *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; + +out: + return ret; +} +#endif + +/*****************************************************************************/ +/* + * load the appropriate binary image (executable or interpreter) into memory + * - we assume no MMU is available + * - if no other PIC bits are set in params->hdr->e_flags + * - we assume that the LOADable segments in the binary are independently relocatable + * - we assume R/O executable segments are shareable + * - else + * - we assume the loadable parts of the image to require fixed displacement + * - the image is not shareable + */ +static int elf_fdpic_map_file(struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm, + const char *what) +{ + struct elf32_fdpic_loadmap *loadmap; +#ifdef CONFIG_MMU + struct elf32_fdpic_loadseg *mseg; +#endif + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, stop; + unsigned nloads, tmp; + size_t size; + int loop, ret; + + /* allocate a load map table */ + nloads = 0; + for (loop = 0; loop < params->hdr.e_phnum; loop++) + if (params->phdrs[loop].p_type == PT_LOAD) + nloads++; + + if (nloads == 0) + return -ELIBBAD; + + size = sizeof(*loadmap) + nloads * sizeof(*seg); + loadmap = kzalloc(size, GFP_KERNEL); + if (!loadmap) + return -ENOMEM; + + params->loadmap = loadmap; + + loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; + loadmap->nsegs = nloads; + + load_addr = params->load_addr; + seg = loadmap->segs; + + /* map the requested LOADs into the memory space */ + 
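Every later address question in this function ("where did the entry point land?", "where are the PHDRs?", "where is DYNAMIC?") is answered by walking the load map built here: find the segment whose p_vaddr range covers the file's virtual address and add that segment's displacement. A standalone sketch of the walk done just below, with a struct mirroring the elf32_fdpic_loadseg fields used in this file (the _example names are only for illustration):

    struct fdpic_seg_example { unsigned long addr, p_vaddr, p_memsz; };

    static unsigned long fdpic_translate(const struct fdpic_seg_example *seg,
                                         int nsegs, unsigned long vaddr)
    {
        int i;

        for (i = 0; i < nsegs; i++, seg++)
            if (vaddr >= seg->p_vaddr &&
                vaddr < seg->p_vaddr + seg->p_memsz)
                return vaddr - seg->p_vaddr + seg->addr;
        return 0;       /* not covered by any PT_LOAD */
    }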
switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { + case ELF_FDPIC_FLAG_CONSTDISP: + case ELF_FDPIC_FLAG_CONTIGUOUS: +#ifndef CONFIG_MMU + ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm); + if (ret < 0) + return ret; + break; +#endif + default: + ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm); + if (ret < 0) + return ret; + break; + } + + /* map the entry point */ + if (params->hdr.e_entry) { + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (params->hdr.e_entry >= seg->p_vaddr && + params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { + params->entry_addr = + (params->hdr.e_entry - seg->p_vaddr) + + seg->addr; + break; + } + } + } + + /* determine where the program header table has wound up if mapped */ + stop = params->hdr.e_phoff; + stop += params->hdr.e_phnum * sizeof (struct elf_phdr); + phdr = params->phdrs; + + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_LOAD) + continue; + + if (phdr->p_offset > params->hdr.e_phoff || + phdr->p_offset + phdr->p_filesz < stop) + continue; + + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (phdr->p_vaddr >= seg->p_vaddr && + phdr->p_vaddr + phdr->p_filesz <= + seg->p_vaddr + seg->p_memsz) { + params->ph_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr + + params->hdr.e_phoff - phdr->p_offset; + break; + } + } + break; + } + + /* determine where the dynamic section has wound up if there is one */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_DYNAMIC) + continue; + + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (phdr->p_vaddr >= seg->p_vaddr && + phdr->p_vaddr + phdr->p_memsz <= + seg->p_vaddr + seg->p_memsz) { + params->dynamic_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr; + + /* check the dynamic section contains at least + * one item, and that the last item is a NULL + * entry */ + if (phdr->p_memsz == 0 || + phdr->p_memsz % sizeof(Elf32_Dyn) != 0) + goto dynamic_error; + + tmp = phdr->p_memsz / sizeof(Elf32_Dyn); + if (((Elf32_Dyn *) + params->dynamic_addr)[tmp - 1].d_tag != 0) + goto dynamic_error; + break; + } + } + break; + } + + /* now elide adjacent segments in the load map on MMU linux + * - on uClinux the holes between may actually be filled with system + * stuff or stuff from other processes + */ +#ifdef CONFIG_MMU + nloads = loadmap->nsegs; + mseg = loadmap->segs; + seg = mseg + 1; + for (loop = 1; loop < nloads; loop++) { + /* see if we have a candidate for merging */ + if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { + load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); + if (load_addr == (seg->addr & PAGE_MASK)) { + mseg->p_memsz += + load_addr - + (mseg->addr + mseg->p_memsz); + mseg->p_memsz += seg->addr & ~PAGE_MASK; + mseg->p_memsz += seg->p_memsz; + loadmap->nsegs--; + continue; + } + } + + mseg++; + if (mseg != seg) + *mseg = *seg; + } +#endif + + kdebug("Mapped Object [%s]:", what); + kdebug("- elfhdr : %lx", params->elfhdr_addr); + kdebug("- entry : %lx", params->entry_addr); + kdebug("- PHDR[] : %lx", params->ph_addr); + kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); + seg = loadmap->segs; + for (loop = 0; loop < loadmap->nsegs; loop++, seg++) + kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", + loop, + seg->addr, seg->addr + seg->p_memsz - 1, + seg->p_vaddr, seg->p_memsz); + + return 0; + +dynamic_error: + printk("ELF FDPIC %s with invalid DYNAMIC section 
(inode=%lu)\n", + what, file->f_path.dentry->d_inode->i_ino); + return -ELIBBAD; +} + +/*****************************************************************************/ +/* + * map a file with constant displacement under uClinux + */ +#ifndef CONFIG_MMU +static int elf_fdpic_map_file_constdisp_on_uclinux( + struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) +{ + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + loff_t fpos; + int loop, ret; + + load_addr = params->load_addr; + seg = params->loadmap->segs; + + /* determine the bounds of the contiguous overall allocation we must + * make */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (params->phdrs[loop].p_type != PT_LOAD) + continue; + + if (base > phdr->p_vaddr) + base = phdr->p_vaddr; + if (top < phdr->p_vaddr + phdr->p_memsz) + top = phdr->p_vaddr + phdr->p_memsz; + } + + /* allocate one big anon block for everything */ + mflags = MAP_PRIVATE; + if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) + mflags |= MAP_EXECUTABLE; + + down_write(&mm->mmap_sem); + maddr = do_mmap(NULL, load_addr, top - base, + PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + up_write(&mm->mmap_sem); + if (IS_ERR_VALUE(maddr)) + return (int) maddr; + + if (load_addr != 0) + load_addr += PAGE_ALIGN(top - base); + + /* and then load the file segments into it */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (params->phdrs[loop].p_type != PT_LOAD) + continue; + + fpos = phdr->p_offset; + + seg->addr = maddr + (phdr->p_vaddr - base); + seg->p_vaddr = phdr->p_vaddr; + seg->p_memsz = phdr->p_memsz; + + ret = file->f_op->read(file, (void *) seg->addr, + phdr->p_filesz, &fpos); + if (ret < 0) + return ret; + + /* map the ELF header address if in this segment */ + if (phdr->p_offset == 0) + params->elfhdr_addr = seg->addr; + + /* clear any space allocated but not loaded */ + if (phdr->p_filesz < phdr->p_memsz) { + if (clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz)) + return -EFAULT; + } + + if (mm) { + if (phdr->p_flags & PF_X) { + if (!mm->start_code) { + mm->start_code = seg->addr; + mm->end_code = seg->addr + + phdr->p_memsz; + } + } else if (!mm->start_data) { + mm->start_data = seg->addr; + mm->end_data = seg->addr + phdr->p_memsz; + } + } + + seg++; + } + + return 0; +} +#endif + +/*****************************************************************************/ +/* + * map a binary by direct mmap() of the individual PT_LOAD segments + */ +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) +{ + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, delta_vaddr; + int loop, dvset; + + load_addr = params->load_addr; + delta_vaddr = 0; + dvset = 0; + + seg = params->loadmap->segs; + + /* deal with each load segment separately */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + unsigned long maddr, disp, excess, excess1; + int prot = 0, flags; + + if (phdr->p_type != PT_LOAD) + continue; + + kdebug("[LOAD] va=%lx of=%lx fs=%lx ms=%lx", + (unsigned long) phdr->p_vaddr, + (unsigned long) phdr->p_offset, + (unsigned long) phdr->p_filesz, + (unsigned long) phdr->p_memsz); + + /* determine the mapping parameters */ + if (phdr->p_flags & PF_R) prot |= PROT_READ; + if (phdr->p_flags & PF_W) prot |= 
PROT_WRITE; + if (phdr->p_flags & PF_X) prot |= PROT_EXEC; + + flags = MAP_PRIVATE | MAP_DENYWRITE; + if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) + flags |= MAP_EXECUTABLE; + + maddr = 0; + + switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { + case ELF_FDPIC_FLAG_INDEPENDENT: + /* PT_LOADs are independently locatable */ + break; + + case ELF_FDPIC_FLAG_HONOURVADDR: + /* the specified virtual address must be honoured */ + maddr = phdr->p_vaddr; + flags |= MAP_FIXED; + break; + + case ELF_FDPIC_FLAG_CONSTDISP: + /* constant displacement + * - can be mapped anywhere, but must be mapped as a + * unit + */ + if (!dvset) { + maddr = load_addr; + delta_vaddr = phdr->p_vaddr; + dvset = 1; + } else { + maddr = load_addr + phdr->p_vaddr - delta_vaddr; + flags |= MAP_FIXED; + } + break; + + case ELF_FDPIC_FLAG_CONTIGUOUS: + /* contiguity handled later */ + break; + + default: + BUG(); + } + + maddr &= PAGE_MASK; + + /* create the mapping */ + disp = phdr->p_vaddr & ~PAGE_MASK; + down_write(&mm->mmap_sem); + maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp); + up_write(&mm->mmap_sem); + + kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", + loop, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp, maddr); + + if (IS_ERR_VALUE(maddr)) + return (int) maddr; + + if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == + ELF_FDPIC_FLAG_CONTIGUOUS) + load_addr += PAGE_ALIGN(phdr->p_memsz + disp); + + seg->addr = maddr + disp; + seg->p_vaddr = phdr->p_vaddr; + seg->p_memsz = phdr->p_memsz; + + /* map the ELF header address if in this segment */ + if (phdr->p_offset == 0) + params->elfhdr_addr = seg->addr; + + /* clear the bit between beginning of mapping and beginning of + * PT_LOAD */ + if (prot & PROT_WRITE && disp > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); + if (clear_user((void __user *) maddr, disp)) + return -EFAULT; + maddr += disp; + } + + /* clear any space allocated but not loaded + * - on uClinux we can just clear the lot + * - on MMU linux we'll get a SIGBUS beyond the last page + * extant in the file + */ + excess = phdr->p_memsz - phdr->p_filesz; + excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); + +#ifdef CONFIG_MMU + if (excess > excess1) { + unsigned long xaddr = maddr + phdr->p_filesz + excess1; + unsigned long xmaddr; + + flags |= MAP_FIXED | MAP_ANONYMOUS; + down_write(&mm->mmap_sem); + xmaddr = do_mmap(NULL, xaddr, excess - excess1, + prot, flags, 0); + up_write(&mm->mmap_sem); + + kdebug("mmap[%d] " + " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", + loop, xaddr, excess - excess1, prot, flags, + xmaddr); + + if (xmaddr != xaddr) + return -ENOMEM; + } + + if (prot & PROT_WRITE && excess1 > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", + loop, maddr + phdr->p_filesz, excess1); + if (clear_user((void __user *) maddr + phdr->p_filesz, + excess1)) + return -EFAULT; + } + +#else + if (excess > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", + loop, maddr + phdr->p_filesz, excess); + if (clear_user((void *) maddr + phdr->p_filesz, excess)) + return -EFAULT; + } +#endif + + if (mm) { + if (phdr->p_flags & PF_X) { + if (!mm->start_code) { + mm->start_code = maddr; + mm->end_code = maddr + phdr->p_memsz; + } + } else if (!mm->start_data) { + mm->start_data = maddr; + mm->end_data = maddr + phdr->p_memsz; + } + } + + seg++; + } + + return 0; +} + +/*****************************************************************************/ +/* + * ELF-FDPIC core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge 
+ * + * Modelled on fs/binfmt_elf.c core dumper + */ +#ifdef CONFIG_ELF_CORE + +/* + * Decide whether a segment is worth dumping; default is yes to be + * sure (missing info is worse than too much; etc). + * Personally I'd include everything, and use the coredump limit... + * + * I think we should skip something. But I am not sure how. H.J. + */ +static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) +{ + int dump_ok; + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* If we may not read the contents, don't allow us to dump + * them either. "dump_write()" can't handle it anyway. + */ + if (!(vma->vm_flags & VM_READ)) { + kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { + dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (share)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } + + dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (share)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } + +#ifdef CONFIG_MMU + /* By default, if it hasn't been written to, don't write it out */ + if (!vma->anon_vma) { + dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } +#endif + + dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, + dump_ok ? 
"yes" : "no"); + return dump_ok; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +/* #define DEBUG */ + +#define DUMP_WRITE(addr, nr, foffset) \ + do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) + +static int alignfile(struct file *file, loff_t *foffset) +{ + static const char buf[4] = { 0, }; + DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); + return 1; +} + +static int writenote(struct memelfnote *men, struct file *file, + loff_t *foffset) +{ + struct elf_note en; + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en), foffset); + DUMP_WRITE(men->name, en.n_namesz, foffset); + if (!alignfile(file, foffset)) + return 0; + DUMP_WRITE(men->data, men->datasz, foffset); + if (!alignfile(file, foffset)) + return 0; + + return 1; +} +#undef DUMP_WRITE + +static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_FDPIC_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up separately. + */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); + if (thread_group_leader(p)) { + struct task_cputime cputime; + + /* + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. 
+ */ + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + const struct cred *cred; + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ - 1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *) mm->arg_start, len)) + return -EFAULT; + for (i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every thread's pr_status and then create + * a single section for them in the final core file. 
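Referring back to notesize() and writenote() above: each note in the PT_NOTE segment is a fixed (n_namesz, n_descsz, n_type) triple followed by the NUL-terminated name and then the payload, each padded out to four bytes. A standalone sketch of the size calculation, assuming the 4-byte-per-field Elf32 note header used here:

    #include <string.h>

    static unsigned int note_record_size(const char *name, unsigned int datasz)
    {
        unsigned int sz = 3 * 4;                /* n_namesz, n_descsz, n_type */

        sz += (strlen(name) + 1 + 3) & ~3u;     /* name + NUL, padded */
        sz += (datasz + 3) & ~3u;               /* payload, padded */
        return sz;
    }

For a "CORE" note with a 148-byte payload (a plausible elf_prstatus size on a 32-bit target, used here only as an example) this gives 12 + 8 + 148 = 168 bytes.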
+ */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + struct task_struct *p = t->thread; + int sz = 0; + + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &t->prstatus); + t->num_notes++; + sz += notesize(&t->notes[0]); + + t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu); + if (t->prstatus.pr_fpvalid) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &t->fpu); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(t->xfpu), &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +/* + * dump the segments for an MMU process + */ +#ifdef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, size_t *size, + unsigned long *limit, unsigned long mm_flags) +{ + struct vm_area_struct *vma; + int err = 0; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + if (!maydump(vma, mm_flags)) + continue; + + for (addr = vma->vm_start; addr < vma->vm_end; + addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + *size += PAGE_SIZE; + if (*size > *limit) + err = -EFBIG; + else if (!dump_write(file, kaddr, PAGE_SIZE)) + err = -EIO; + kunmap(page); + page_cache_release(page); + } else if (!dump_seek(file, PAGE_SIZE)) + err = -EFBIG; + if (err) + goto out; + } + } +out: + return err; +} +#endif + +/* + * dump the segments for a NOMMU process + */ +#ifndef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, size_t *size, + unsigned long *limit, unsigned long mm_flags) +{ + struct vm_area_struct *vma; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + if (!maydump(vma, mm_flags)) + continue; + + if ((*size += PAGE_SIZE) > *limit) + return -EFBIG; + + if (!dump_write(file, (void *) vma->vm_start, + vma->vm_end - vma->vm_start)) + return -EIO; + } + + return 0; +} +#endif + +static size_t elf_core_vma_data_size(unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if (maydump(vma, mm_flags)) + size += vma->vm_end - vma->vm_start; + return size; +} + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. 
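The first pass of the dumper below only does offset arithmetic: ELF header, then one program header per segment, then the notes, then the page-aligned memory contents. A rough standalone sketch of that layout pass, assuming Elf32 sizes (52-byte ELF header, 32-byte program headers) and leaving out the arch-specific extra phdrs/data and the PN_XNUM section header:

    static unsigned long long plan_core_layout(int segs, unsigned int notes_sz,
                                               unsigned long long vma_bytes,
                                               unsigned long pagesize)
    {
        unsigned long long offset = 0;

        offset += 52;                                   /* ELF header */
        offset += (unsigned long long)segs * 32;        /* program header table */
        offset += notes_sz;                             /* PT_NOTE payload */
        offset = (offset + pagesize - 1) & ~(unsigned long long)(pagesize - 1);
        offset += vma_bytes;                            /* dumped segment data */
        return offset;                                  /* approximate core file size */
    }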
+ */ +static int elf_fdpic_core_dump(struct coredump_params *cprm) +{ +#define NUM_NOTES 6 + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + int i; + struct vm_area_struct *vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff, foffset; + int numnote; + struct memelfnote *notes = NULL; + struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ + LIST_HEAD(thread_list); + struct list_head *t; + elf_fpregset_t *fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu = NULL; +#endif + int thread_status_size = 0; + elf_addr_t *auxv; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto cleanup; + prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); + if (!prstatus) + goto cleanup; + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (!psinfo) + goto cleanup; + notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + if (!notes) + goto cleanup; + fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); + if (!fpu) + goto cleanup; +#ifdef ELF_CORE_COPY_XFPREGS + xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); + if (!xfpu) + goto cleanup; +#endif + + if (cprm->signr) { + struct core_thread *ct; + struct elf_thread_status *tmp; + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; + + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } + + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; + + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(cprm->signr, tmp); + thread_status_size += sz; + } + } + + /* now collect the dump for the current */ + fill_prstatus(prstatus, current, cprm->signr); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); + + segs = current->mm->map_count; + segs += elf_core_extra_phdrs(); + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + + /* Set up header */ + fill_elf_fdpic_header(elf, e_phnum); + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); + fill_psinfo(psinfo, current->group_leader, current->mm); + fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + numnote = 2; + + auxv = (elf_addr_t *) current->mm->saved_auxv; + + i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(¬es[numnote++], "CORE", NT_AUXV, + i * sizeof(elf_addr_t), auxv); + + /* Try to dump the FPU. 
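The do/while just above sizes the NT_AUXV note: saved_auxv is a flat array of (id, value) pairs and the scan counts elements up to and including the terminating AT_NULL pair. The same scan as a standalone helper (AT_NULL is 0):

    static int auxv_element_count(const unsigned long *auxv)
    {
        int i = 0;

        do
            i += 2;                     /* one (id, value) pair */
        while (auxv[i - 2] != 0);       /* stop after the AT_NULL pair */

        return i;                       /* unsigned longs placed in the note */
    }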
*/ + if ((prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) + fill_note(notes + numnote++, + "CORE", NT_PRFPREG, sizeof(*fpu), fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, xfpu)) + fill_note(notes + numnote++, + "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + offset += sizeof(*elf); /* Elf header */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + foffset = offset; + + /* Write notes phdr entry */ + { + int sz = 0; + + for (i = 0; i < numnote; i++) + sz += notesize(notes + i); + + sz += thread_status_size; + + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; + } + + /* Page-align dumped data */ + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + offset += elf_core_vma_data_size(cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + + /* write program headers for segments dump */ + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + struct elf_phdr phdr; + size_t sz; + + sz = vma->vm_end - vma->vm_start; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_memsz = sz; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; + } + + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; + + /* write out the notes section */ + for (i = 0; i < numnote; i++) + if (!writenote(notes + i, cprm->file, &foffset)) + goto end_coredump; + + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], cprm->file, &foffset)) + goto end_coredump; + } + + if (!dump_seek(cprm->file, dataoff - foffset)) + goto end_coredump; + + if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, + cprm->mm_flags) < 0) + goto end_coredump; + + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } + + if (cprm->file->f_pos != offset) { + /* Sanity check */ + printk(KERN_WARNING + "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", + cprm->file->f_pos, offset); + } + +end_coredump: + set_fs(fs); + +cleanup: + while (!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + kfree(phdr4note); + kfree(elf); + kfree(prstatus); + kfree(psinfo); + kfree(notes); + kfree(fpu); + kfree(shdr4extnum); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(xfpu); +#endif + return has_dumped; +#undef NUM_NOTES +} + +#endif /* CONFIG_ELF_CORE */ diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c new file mode 100644 index 00000000..b8e8b0ac --- /dev/null +++ b/fs/binfmt_em86.c @@ -0,0 +1,113 @@ +/* + * linux/fs/binfmt_em86.c + * + * Based on linux/fs/binfmt_script.c + * Copyright (C) 1996 Martin von Löwis + * original #!-checking implemented by tytso. + * + * em86 changes Copyright (C) 1997 Jim Paradis + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define EM86_INTERP "/usr/bin/em86" +#define EM86_I_NAME "em86" + +static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) +{ + char *interp, *i_name, *i_arg; + struct file * file; + int retval; + struct elfhdr elf_ex; + + /* Make sure this is a Linux/Intel ELF executable... */ + elf_ex = *((struct elfhdr *)bprm->buf); + + if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + return -ENOEXEC; + + /* First of all, some simple consistency checks */ + if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) || + (!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) || + (!bprm->file->f_op || !bprm->file->f_op->mmap)) { + return -ENOEXEC; + } + + bprm->recursion_depth++; /* Well, the bang-shell is implicit... */ + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + /* Unlike in the script case, we don't have to do any hairy + * parsing to find our interpreter... it's hardcoded! 
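What the argv splice below amounts to, for a hypothetical invocation of an x86 binary ./prog with one argument (strings are copied in reverse, so the last one copied ends up as argv[0]):

    before:  argv = { "./prog", "-x" }
    after :  argv = { "em86", "/full/path/to/prog", "-x" }

so that search_binary_handler() ends up executing /usr/bin/em86, which receives the emulated binary's path as its first argument. The "./prog" and "-x" values are of course only an example.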
+ */ + interp = EM86_INTERP; + i_name = EM86_I_NAME; + i_arg = NULL; /* We reserve the right to add an arg later */ + + /* + * Splice in (1) the interpreter's name for argv[0] + * (2) (optional) argument to interpreter + * (3) filename of emulated file (replace argv[0]) + * + * This is done in reverse order, because of how the + * user environment and arguments are stored. + */ + remove_arg_zero(bprm); + retval = copy_strings_kernel(1, &bprm->filename, bprm); + if (retval < 0) return retval; + bprm->argc++; + if (i_arg) { + retval = copy_strings_kernel(1, &i_arg, bprm); + if (retval < 0) return retval; + bprm->argc++; + } + retval = copy_strings_kernel(1, &i_name, bprm); + if (retval < 0) return retval; + bprm->argc++; + + /* + * OK, now restart the process with the interpreter's inode. + * Note that we use open_exec() as the name is now in kernel + * space, and we don't need to copy it. + */ + file = open_exec(interp); + if (IS_ERR(file)) + return PTR_ERR(file); + + bprm->file = file; + + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + + return search_binary_handler(bprm, regs); +} + +static struct linux_binfmt em86_format = { + .module = THIS_MODULE, + .load_binary = load_em86, +}; + +static int __init init_em86_binfmt(void) +{ + return register_binfmt(&em86_format); +} + +static void __exit exit_em86_binfmt(void) +{ + unregister_binfmt(&em86_format); +} + +core_initcall(init_em86_binfmt); +module_exit(exit_em86_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c new file mode 100644 index 00000000..1bffbe0e --- /dev/null +++ b/fs/binfmt_flat.c @@ -0,0 +1,960 @@ +/****************************************************************************/ +/* + * linux/fs/binfmt_flat.c + * + * Copyright (C) 2000-2003 David McCullough + * Copyright (C) 2002 Greg Ungerer + * Copyright (C) 2002 SnapGear, by Paul Dale + * Copyright (C) 2000, 2001 Lineo, by David McCullough + * based heavily on: + * + * linux/fs/binfmt_aout.c: + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + * linux/fs/binfmt_flat.c for 2.0 kernel + * Copyright (C) 1998 Kenneth Albanowski + * JAN/99 -- coded full program relocation (gerg@snapgear.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/****************************************************************************/ + +#if 0 +#define DEBUG 1 +#endif + +#ifdef DEBUG +#define DBG_FLT(a...) printk(a) +#else +#define DBG_FLT(a...) +#endif + +/* + * User data (data section and bss) needs to be aligned. + * We pick 0x20 here because it is the max value elf2flt has always + * used in producing FLAT files, and because it seems to be large + * enough to make all the gcc alignment related tests happy. + */ +#define FLAT_DATA_ALIGN (0x20) + +/* + * User data (stack) also needs to be aligned. + * Here we can be a bit looser than the data sections since this + * needs to only meet arch ABI requirements. 
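For orientation before load_flat_file() further down: the loader reads everything it needs from the bFLT header at the start of the file, and all fields are stored big-endian (hence the ntohl() calls there). A standalone mock of the fields actually consumed by the loader; the layout shown is illustrative, the authoritative definition lives in the flat.h headers:

    #include <stdint.h>
    #include <arpa/inet.h>

    struct flat_hdr_example {
        char     magic[4];      /* "bFLT" */
        uint32_t rev;           /* FLAT_VERSION or OLD_FLAT_VERSION */
        uint32_t entry;         /* offset of the first instruction */
        uint32_t data_start;    /* where data begins, i.e. the text length */
        uint32_t data_end;
        uint32_t bss_end;
        uint32_t stack_size;
        uint32_t reloc_start;   /* offset of the relocation records */
        uint32_t reloc_count;
        uint32_t flags;         /* FLAT_FLAG_RAM, _GZIP, _GOTPIC, ... */
        uint32_t build_date;
    };

    static void flat_segment_sizes(const struct flat_hdr_example *h,
                                   unsigned long *text, unsigned long *data,
                                   unsigned long *bss)
    {
        *text = ntohl(h->data_start);                   /* text runs from 0 */
        *data = ntohl(h->data_end) - ntohl(h->data_start);
        *bss  = ntohl(h->bss_end) - ntohl(h->data_end);
    }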
+ */ +#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN) + +#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ +#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ + +struct lib_info { + struct { + unsigned long start_code; /* Start of text segment */ + unsigned long start_data; /* Start of data segment */ + unsigned long start_brk; /* End of data segment */ + unsigned long text_len; /* Length of text segment */ + unsigned long entry; /* Start address for this module */ + unsigned long build_date; /* When this one was compiled */ + short loaded; /* Has this library been loaded? */ + } lib_list[MAX_SHARED_LIBS]; +}; + +#ifdef CONFIG_BINFMT_SHARED_FLAT +static int load_flat_shared_library(int id, struct lib_info *p); +#endif + +static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); +static int flat_core_dump(struct coredump_params *cprm); + +static struct linux_binfmt flat_format = { + .module = THIS_MODULE, + .load_binary = load_flat_binary, + .core_dump = flat_core_dump, + .min_coredump = PAGE_SIZE +}; + +/****************************************************************************/ +/* + * Routine writes a core dump image in the current directory. + * Currently only a stub-function. + */ + +static int flat_core_dump(struct coredump_params *cprm) +{ + printk("Process %s:%d received signr %d and should have core dumped\n", + current->comm, current->pid, (int) cprm->signr); + return(1); +} + +/****************************************************************************/ +/* + * create_flat_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ + +static unsigned long create_flat_tables( + unsigned long pp, + struct linux_binprm * bprm) +{ + unsigned long *argv,*envp; + unsigned long * sp; + char * p = (char*)pp; + int argc = bprm->argc; + int envc = bprm->envc; + char uninitialized_var(dummy); + + sp = (unsigned long *)p; + sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); + argv = sp + 1 + (flat_argvp_envp_on_stack() ? 
2 : 0); + envp = argv + (argc + 1); + + if (flat_argvp_envp_on_stack()) { + put_user((unsigned long) envp, sp + 2); + put_user((unsigned long) argv, sp + 1); + } + + put_user(argc, sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + put_user((unsigned long) p, argv++); + do { + get_user(dummy, p); p++; + } while (dummy); + } + put_user((unsigned long) NULL, argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + put_user((unsigned long)p, envp); envp++; + do { + get_user(dummy, p); p++; + } while (dummy); + } + put_user((unsigned long) NULL, envp); + current->mm->env_end = (unsigned long) p; + return (unsigned long)sp; +} + +/****************************************************************************/ + +#ifdef CONFIG_BINFMT_ZFLAT + +#include + +#define LBUFSIZE 4000 + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ +#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ +#define RESERVED 0xC0 /* bit 6,7: reserved */ + +static int decompress_exec( + struct linux_binprm *bprm, + unsigned long offset, + char *dst, + long len, + int fd) +{ + unsigned char *buf; + z_stream strm; + loff_t fpos; + int ret, retval; + + DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); + + memset(&strm, 0, sizeof(strm)); + strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (strm.workspace == NULL) { + DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); + return -ENOMEM; + } + buf = kmalloc(LBUFSIZE, GFP_KERNEL); + if (buf == NULL) { + DBG_FLT("binfmt_flat: no memory for read buffer\n"); + retval = -ENOMEM; + goto out_free; + } + + /* Read in first chunk of data and parse gzip header. 
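The header walk performed below follows the gzip member format: a fixed 10-byte header, then an optional EXTRA field (2-byte little-endian length plus payload), then optional NUL-terminated ORIG_NAME and COMMENT strings, after which the raw deflate stream starts (the loader rejects ENCRYPTED, CONTINUATION and RESERVED flags outright). A standalone sketch of locating the compressed payload, with bounds handling simplified relative to the loader; the flag masks match the #defines above:

    static int gzip_payload_offset(const unsigned char *buf, int len)
    {
        int off = 10;                           /* fixed gzip header */
        unsigned char flg;

        if (len < 10 || buf[0] != 0x1f ||
            (buf[1] != 0x8b && buf[1] != 0x9e) || buf[2] != 8)
            return -1;                          /* bad magic or method */
        flg = buf[3];

        if (flg & 0x04) {                       /* EXTRA_FIELD */
            if (off + 2 > len)
                return -1;
            off += 2 + buf[10] + (buf[11] << 8);
        }
        if (flg & 0x08)                         /* ORIG_NAME */
            while (off < len && buf[off++] != 0)
                ;
        if (flg & 0x10)                         /* COMMENT */
            while (off < len && buf[off++] != 0)
                ;
        return off <= len ? off : -1;           /* start of deflate data */
    }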
*/ + fpos = offset; + ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + + strm.next_in = buf; + strm.avail_in = ret; + strm.total_in = 0; + + retval = -ENOEXEC; + + /* Check minimum size -- gzip header */ + if (ret < 10) { + DBG_FLT("binfmt_flat: file too small?\n"); + goto out_free_buf; + } + + /* Check gzip magic number */ + if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { + DBG_FLT("binfmt_flat: unknown compression magic?\n"); + goto out_free_buf; + } + + /* Check gzip method */ + if (buf[2] != 8) { + DBG_FLT("binfmt_flat: unknown compression method?\n"); + goto out_free_buf; + } + /* Check gzip flags */ + if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || + (buf[3] & RESERVED)) { + DBG_FLT("binfmt_flat: unknown flags?\n"); + goto out_free_buf; + } + + ret = 10; + if (buf[3] & EXTRA_FIELD) { + ret += 2 + buf[10] + (buf[11] << 8); + if (unlikely(LBUFSIZE <= ret)) { + DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); + goto out_free_buf; + } + } + if (buf[3] & ORIG_NAME) { + while (ret < LBUFSIZE && buf[ret++] != 0) + ; + if (unlikely(LBUFSIZE == ret)) { + DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); + goto out_free_buf; + } + } + if (buf[3] & COMMENT) { + while (ret < LBUFSIZE && buf[ret++] != 0) + ; + if (unlikely(LBUFSIZE == ret)) { + DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); + goto out_free_buf; + } + } + + strm.next_in += ret; + strm.avail_in -= ret; + + strm.next_out = dst; + strm.avail_out = len; + strm.total_out = 0; + + if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { + DBG_FLT("binfmt_flat: zlib init failed?\n"); + goto out_free_buf; + } + + while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { + ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + if (ret <= 0) + break; + len -= ret; + + strm.next_in = buf; + strm.avail_in = ret; + strm.total_in = 0; + } + + if (ret < 0) { + DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", + ret, strm.msg); + goto out_zlib; + } + + retval = 0; +out_zlib: + zlib_inflateEnd(&strm); +out_free_buf: + kfree(buf); +out_free: + kfree(strm.workspace); + return retval; +} + +#endif /* CONFIG_BINFMT_ZFLAT */ + +/****************************************************************************/ + +static unsigned long +calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) +{ + unsigned long addr; + int id; + unsigned long start_brk; + unsigned long start_data; + unsigned long text_len; + unsigned long start_code; + +#ifdef CONFIG_BINFMT_SHARED_FLAT + if (r == 0) + id = curid; /* Relocs of 0 are always self referring */ + else { + id = (r >> 24) & 0xff; /* Find ID for this reloc */ + r &= 0x00ffffff; /* Trim ID off here */ + } + if (id >= MAX_SHARED_LIBS) { + printk("BINFMT_FLAT: reference 0x%x to shared library %d", + (unsigned) r, id); + goto failed; + } + if (curid != id) { + if (internalp) { + printk("BINFMT_FLAT: reloc address 0x%x not in same module " + "(%d != %d)", (unsigned) r, curid, id); + goto failed; + } else if ( ! p->lib_list[id].loaded && + IS_ERR_VALUE(load_flat_shared_library(id, p))) { + printk("BINFMT_FLAT: failed to load library %d", id); + goto failed; + } + /* Check versioning information (i.e. 
time stamps) */ + if (p->lib_list[id].build_date && p->lib_list[curid].build_date && + p->lib_list[curid].build_date < p->lib_list[id].build_date) { + printk("BINFMT_FLAT: library %d is younger than %d", id, curid); + goto failed; + } + } +#else + id = 0; +#endif + + start_brk = p->lib_list[id].start_brk; + start_data = p->lib_list[id].start_data; + start_code = p->lib_list[id].start_code; + text_len = p->lib_list[id].text_len; + + if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { + printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", + (int) r,(int)(start_brk-start_data+text_len),(int)text_len); + goto failed; + } + + if (r < text_len) /* In text segment */ + addr = r + start_code; + else /* In data segment */ + addr = r - text_len + start_data; + + /* Range checked already above so doing the range tests is redundant...*/ + return(addr); + +failed: + printk(", killing %s!\n", current->comm); + send_sig(SIGSEGV, current, 0); + + return RELOC_FAILED; +} + +/****************************************************************************/ + +void old_reloc(unsigned long rl) +{ +#ifdef DEBUG + char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; +#endif + flat_v2_reloc_t r; + unsigned long *ptr; + + r.value = rl; +#if defined(CONFIG_COLDFIRE) + ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); +#else + ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); +#endif + +#ifdef DEBUG + printk("Relocation of variable at DATASEG+%x " + "(address %p, currently %x) into segment %s\n", + r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); +#endif + + switch (r.reloc.type) { + case OLD_FLAT_RELOC_TYPE_TEXT: + *ptr += current->mm->start_code; + break; + case OLD_FLAT_RELOC_TYPE_DATA: + *ptr += current->mm->start_data; + break; + case OLD_FLAT_RELOC_TYPE_BSS: + *ptr += current->mm->end_data; + break; + default: + printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); + break; + } + +#ifdef DEBUG + printk("Relocation became %x\n", (int)*ptr); +#endif +} + +/****************************************************************************/ + +static int load_flat_file(struct linux_binprm * bprm, + struct lib_info *libinfo, int id, unsigned long *extra_stack) +{ + struct flat_hdr * hdr; + unsigned long textpos = 0, datapos = 0, result; + unsigned long realdatastart = 0; + unsigned long text_len, data_len, bss_len, stack_len, flags; + unsigned long len, memp = 0; + unsigned long memp_size, extra, rlim; + unsigned long *reloc = 0, *rp; + struct inode *inode; + int i, rev, relocs = 0; + loff_t fpos; + unsigned long start_code, end_code; + int ret; + + hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ + inode = bprm->file->f_path.dentry->d_inode; + + text_len = ntohl(hdr->data_start); + data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); + bss_len = ntohl(hdr->bss_end) - ntohl(hdr->data_end); + stack_len = ntohl(hdr->stack_size); + if (extra_stack) { + stack_len += *extra_stack; + *extra_stack = stack_len; + } + relocs = ntohl(hdr->reloc_count); + flags = ntohl(hdr->flags); + rev = ntohl(hdr->rev); + + if (strncmp(hdr->magic, "bFLT", 4)) { + /* + * Previously, here was a printk to tell people + * "BINFMT_FLAT: bad header magic". + * But for the kernel which also use ELF FD-PIC format, this + * error message is confusing. 
+ * because a lot of people do not manage to produce good + */ + ret = -ENOEXEC; + goto err; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + + if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { + printk("BINFMT_FLAT: bad flat file version 0x%x (supported " + "0x%lx and 0x%lx)\n", + rev, FLAT_VERSION, OLD_FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* Don't allow old format executables to use shared libraries */ + if (rev == OLD_FLAT_VERSION && id != 0) { + printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", + (int) FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* + * fix up the flags for the older format, there were all kinds + * of endian hacks, this only works for the simple cases + */ + if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags)) + flags = FLAT_FLAG_RAM; + +#ifndef CONFIG_BINFMT_ZFLAT + if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { + printk("Support for ZFLAT executables is not enabled.\n"); + ret = -ENOEXEC; + goto err; + } +#endif + + /* + * Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. + */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (data_len + bss_len > rlim) { + ret = -ENOMEM; + goto err; + } + + /* Flush all traces of the currently running executable */ + if (id == 0) { + result = flush_old_exec(bprm); + if (result) { + ret = result; + goto err; + } + + /* OK, This is the point of no return */ + set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); + } + + /* + * calculate the extra space we need to map in + */ + extra = max_t(unsigned long, bss_len + stack_len, + relocs * sizeof(unsigned long)); + + /* + * there are a couple of cases here, the separate code/data + * case, and then the fully copied to RAM case which lumps + * it all together. 
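Concretely, the choice described here only changes how much anonymous memory has to be allocated: in the separate (possibly ROM/XIP) case the text stays file-backed and only data plus the bss/stack-or-relocation slack ("extra", computed just above) plus the MAX_SHARED_LIBS pointer slots kept in front of the data segment need RAM; in the fully-copied case the text length is added on top. A sketch of that size calculation, mirroring the two len computations below (parameter names are illustrative):

    static unsigned long flat_alloc_len(int copy_text_to_ram, unsigned long text_len,
                                        unsigned long data_len, unsigned long extra,
                                        unsigned long max_shared_libs)
    {
        unsigned long len = data_len + extra +
                            max_shared_libs * sizeof(unsigned long);

        if (copy_text_to_ram)
            len += text_len;
        return len;     /* the caller page-aligns this, as PAGE_ALIGN() does below */
    }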
+ */ + if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { + /* + * this should give us a ROM ptr, but if it doesn't we don't + * really care + */ + DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); + + down_write(¤t->mm->mmap_sem); + textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + MAP_PRIVATE|MAP_EXECUTABLE, 0); + up_write(¤t->mm->mmap_sem); + if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos) + textpos = (unsigned long) -ENOMEM; + printk("Unable to mmap process text, errno %d\n", (int)-textpos); + ret = textpos; + goto err; + } + + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); + down_write(¤t->mm->mmap_sem); + realdatastart = do_mmap(0, 0, len, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + up_write(¤t->mm->mmap_sem); + + if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + if (!realdatastart) + realdatastart = (unsigned long) -ENOMEM; + printk("Unable to allocate RAM for process data, errno %d\n", + (int)-realdatastart); + do_munmap(current->mm, textpos, text_len); + ret = realdatastart; + goto err; + } + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", + (int)(data_len + bss_len + stack_len), (int)datapos); + + fpos = ntohl(hdr->data_start); +#ifdef CONFIG_BINFMT_ZFLAT + if (flags & FLAT_FLAG_GZDATA) { + result = decompress_exec(bprm, fpos, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), 0); + } else +#endif + { + result = bprm->file->f_op->read(bprm->file, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), &fpos); + } + if (IS_ERR_VALUE(result)) { + printk("Unable to read data+bss, errno %d\n", (int)-result); + do_munmap(current->mm, textpos, text_len); + do_munmap(current->mm, realdatastart, len); + ret = result; + goto err; + } + + reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); + memp = realdatastart; + memp_size = len; + } else { + + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); + down_write(¤t->mm->mmap_sem); + textpos = do_mmap(0, 0, len, + PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); + up_write(¤t->mm->mmap_sem); + + if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos) + textpos = (unsigned long) -ENOMEM; + printk("Unable to allocate RAM for process text/data, errno %d\n", + (int)-textpos); + ret = textpos; + goto err; + } + + realdatastart = textpos + ntohl(hdr->data_start); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + reloc = (unsigned long *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); + memp = textpos; + memp_size = len; +#ifdef CONFIG_BINFMT_ZFLAT + /* + * load it all in and treat it like a RAM load from now on + */ + if (flags & FLAT_FLAG_GZIP) { + result = decompress_exec(bprm, sizeof (struct flat_hdr), + (((char *) textpos) + sizeof (struct flat_hdr)), + (text_len + data_len + (relocs * sizeof(unsigned long)) + - sizeof (struct flat_hdr)), + 0); + memmove((void *) datapos, (void *) realdatastart, + data_len + (relocs * sizeof(unsigned long))); + } else if (flags & FLAT_FLAG_GZDATA) { + fpos = 0; + result = bprm->file->f_op->read(bprm->file, + (char *) textpos, text_len, &fpos); + if (!IS_ERR_VALUE(result)) + result = decompress_exec(bprm, text_len, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), 0); + } + else +#endif + { + fpos = 0; + result = 
bprm->file->f_op->read(bprm->file, + (char *) textpos, text_len, &fpos); + if (!IS_ERR_VALUE(result)) { + fpos = ntohl(hdr->data_start); + result = bprm->file->f_op->read(bprm->file, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), &fpos); + } + } + if (IS_ERR_VALUE(result)) { + printk("Unable to read code+data+bss, errno %d\n",(int)-result); + do_munmap(current->mm, textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(unsigned long)); + ret = result; + goto err; + } + } + + if (flags & FLAT_FLAG_KTRACE) + printk("Mapping is %x, Entry point is %x, data_start is %x\n", + (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + + /* The main program needs a little extra setup in the task structure */ + start_code = textpos + sizeof (struct flat_hdr); + end_code = textpos + text_len; + if (id == 0) { + current->mm->start_code = start_code; + current->mm->end_code = end_code; + current->mm->start_data = datapos; + current->mm->end_data = datapos + data_len; + /* + * set up the brk stuff, uses any slack left in data/bss/stack + * allocation. We put the brk after the bss (between the bss + * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. + */ + current->mm->start_brk = datapos + data_len + bss_len; + current->mm->brk = (current->mm->start_brk + 3) & ~3; + current->mm->context.end_brk = memp + memp_size - stack_len; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", + id ? "Lib" : "Load", bprm->filename, + (int) start_code, (int) end_code, + (int) datapos, + (int) (datapos + data_len), + (int) (datapos + data_len), + (int) (((datapos + data_len + bss_len) + 3) & ~3)); + + text_len -= sizeof(struct flat_hdr); /* the real code len */ + + /* Store the current module values into the global library structure */ + libinfo->lib_list[id].start_code = start_code; + libinfo->lib_list[id].start_data = datapos; + libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; + libinfo->lib_list[id].text_len = text_len; + libinfo->lib_list[id].loaded = 1; + libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; + libinfo->lib_list[id].build_date = ntohl(hdr->build_date); + + /* + * We just load the allocations into some temporary memory to + * help simplify all this mumbo jumbo + * + * We've got two different sections of relocation entries. + * The first is the GOT which resides at the beginning of the data segment + * and is terminated with a -1. This one can be relocated in place. + * The second is the extra relocation entries tacked after the image's + * data segment. These require a little more processing as the entry is + * really an offset into the image which contains an offset into the + * image. + */ + if (flags & FLAT_FLAG_GOTPIC) { + for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { + unsigned long addr; + if (*rp) { + addr = calc_reloc(*rp, libinfo, id, 0); + if (addr == RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + *rp = addr; + } + } + } + + /* + * Now run through the relocation entries. + * We've got to be careful here as C++ produces relocatable zero + * entries in the constructor and destructor tables which are then + * tested for being not zero (which will always occur unless we're + * based from address zero). This causes an endless loop as __start + * is at zero. The solution used is to not relocate zero addresses. 
+ * This has the negative side effect of not allowing a global data + * reference to be statically initialised to _stext (I've moved + * __start to address 4 so that is okay). + */ + if (rev > OLD_FLAT_VERSION) { + unsigned long persistent = 0; + for (i=0; i < relocs; i++) { + unsigned long addr, relval; + + /* Get the address of the pointer to be + relocated (of course, the address has to be + relocated first). */ + relval = ntohl(reloc[i]); + if (flat_set_persistent (relval, &persistent)) + continue; + addr = flat_get_relocate_addr(relval); + rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); + if (rp == (unsigned long *)RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + + /* Get the pointer's value. */ + addr = flat_get_addr_from_rp(rp, relval, flags, + &persistent); + if (addr != 0) { + /* + * Do the relocation. PIC relocs in the data section are + * already in target order + */ + if ((flags & FLAT_FLAG_GOTPIC) == 0) + addr = ntohl(addr); + addr = calc_reloc(addr, libinfo, id, 0); + if (addr == RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + + /* Write back the relocated pointer. */ + flat_put_addr_at_rp(rp, addr, relval); + } + } + } else { + for (i=0; i < relocs; i++) + old_reloc(ntohl(reloc[i])); + } + + flush_icache_range(start_code, end_code); + + /* zero the BSS, BRK and stack areas */ + memset((void*)(datapos + data_len), 0, bss_len + + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ + stack_len); + + return 0; +err: + return ret; +} + + +/****************************************************************************/ +#ifdef CONFIG_BINFMT_SHARED_FLAT + +/* + * Load a shared library into memory. The library gets its own data + * segment (including bss) but not argv/argc/environ. + */ + +static int load_flat_shared_library(int id, struct lib_info *libs) +{ + struct linux_binprm bprm; + int res; + char buf[16]; + + memset(&bprm, 0, sizeof(bprm)); + + /* Create the file name */ + sprintf(buf, "/lib/lib%d.so", id); + + /* Open the file up */ + bprm.filename = buf; + bprm.file = open_exec(bprm.filename); + res = PTR_ERR(bprm.file); + if (IS_ERR(bprm.file)) + return res; + + bprm.cred = prepare_exec_creds(); + res = -ENOMEM; + if (!bprm.cred) + goto out; + + /* We don't really care about recalculating credentials at this point + * as we're past the point of no return and are dealing with shared + * libraries. + */ + bprm.cred_prepared = 1; + + res = prepare_binprm(&bprm); + + if (!IS_ERR_VALUE(res)) + res = load_flat_file(&bprm, libs, id, NULL); + + abort_creds(bprm.cred); + +out: + allow_write_access(bprm.file); + fput(bprm.file); + + return(res); +} + +#endif /* CONFIG_BINFMT_SHARED_FLAT */ +/****************************************************************************/ + +/* + * These are the functions used to load flat style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + struct lib_info libinfo; + unsigned long p = bprm->p; + unsigned long stack_len; + unsigned long start_addr; + unsigned long *sp; + int res; + int i, j; + + memset(&libinfo, 0, sizeof(libinfo)); + /* + * We have to add the size of our arguments to our stack size + * otherwise it's too easy for users to create stack overflows + * by passing in a huge argument list. And yes, we have to be + * pedantic and include space for the argv/envp array as it may have + * a lot of entries. 
+ */ +#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) + stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ + stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ + stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ + stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ + + res = load_flat_file(bprm, &libinfo, 0, &stack_len); + if (IS_ERR_VALUE(res)) + return res; + + /* Update data segment pointers for all libraries */ + for (i=0; i<MAX_SHARED_LIBS; i++) + if (libinfo.lib_list[i].loaded) + for (j=0; j<MAX_SHARED_LIBS; j++) + (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = + (libinfo.lib_list[j].loaded)? + libinfo.lib_list[j].start_data:UNLOADED_LIB; + + install_exec_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + + set_binfmt(&flat_format); + + p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; + DBG_FLT("p=%x\n", (int)p); + + /* copy the arg pages onto the stack, this could be more efficient :-) */ + for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) + * (char *) --p = + ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; + + sp = (unsigned long *) create_flat_tables(p, bprm); + + /* Fake some return addresses to ensure the call chain will + * initialise library in order for us. We are required to call + * lib 1 first, then 2, ... and finally the main program (id 0). + */ + start_addr = libinfo.lib_list[0].entry; + +#ifdef CONFIG_BINFMT_SHARED_FLAT + for (i = MAX_SHARED_LIBS-1; i>0; i--) { + if (libinfo.lib_list[i].loaded) { + /* Push previous first to call address */ + --sp; put_user(start_addr, sp); + start_addr = libinfo.lib_list[i].entry; + } + } +#endif + + /* Stash our initial stack pointer into the mm structure */ + current->mm->start_stack = (unsigned long )sp; + +#ifdef FLAT_PLAT_INIT + FLAT_PLAT_INIT(regs); +#endif + DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", + (int)regs, (int)start_addr, (int)current->mm->start_stack); + + start_thread(regs, start_addr, current->mm->start_stack); + + return 0; +} + +/****************************************************************************/ + +static int __init init_flat_binfmt(void) +{ + return register_binfmt(&flat_format); +} + +/****************************************************************************/ + +core_initcall(init_flat_binfmt); + +/****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c new file mode 100644 index 00000000..1befe2ec --- /dev/null +++ b/fs/binfmt_misc.c @@ -0,0 +1,746 @@ +/* + * binfmt_misc.c + * + * Copyright (C) 1997 Richard Günther + * + * binfmt_misc detects binaries via a magic or filename extension and invokes + * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and + * binfmt_mz. + * + * 1997-04-25 first version + * [...] + * 1997-05-19 cleanup + * 1997-06-26 hpa: pass the real filename rather than argv[0] + * 1997-06-30 minor cleanup + * 1997-08-09 removed extension stripping, locking cleanup + * 2001-02-28 AV: rewritten into something that resembles C. Original didn't. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum { + VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ +}; + +static LIST_HEAD(entries); +static int enabled = 1; + +enum {Enabled, Magic}; +#define MISC_FMT_PRESERVE_ARGV0 (1<<31) +#define MISC_FMT_OPEN_BINARY (1<<30) +#define MISC_FMT_CREDENTIALS (1<<29) + +typedef struct { + struct list_head list; + unsigned long flags; /* type, status, etc.
*/ + int offset; /* offset of magic */ + int size; /* size of magic/mask */ + char *magic; /* magic or filename extension */ + char *mask; /* mask, NULL for exact match */ + char *interpreter; /* filename of interpreter */ + char *name; + struct dentry *dentry; +} Node; + +static DEFINE_RWLOCK(entries_lock); +static struct file_system_type bm_fs_type; +static struct vfsmount *bm_mnt; +static int entry_count; + +/* + * Check if we support the binfmt + * if we do, return the node, else NULL + * locking is done in load_misc_binary + */ +static Node *check_file(struct linux_binprm *bprm) +{ + char *p = strrchr(bprm->interp, '.'); + struct list_head *l; + + list_for_each(l, &entries) { + Node *e = list_entry(l, Node, list); + char *s; + int j; + + if (!test_bit(Enabled, &e->flags)) + continue; + + if (!test_bit(Magic, &e->flags)) { + if (p && !strcmp(e->magic, p + 1)) + return e; + continue; + } + + s = bprm->buf + e->offset; + if (e->mask) { + for (j = 0; j < e->size; j++) + if ((*s++ ^ e->magic[j]) & e->mask[j]) + break; + } else { + for (j = 0; j < e->size; j++) + if ((*s++ ^ e->magic[j])) + break; + } + if (j == e->size) + return e; + } + return NULL; +} + +/* + * the loader itself + */ +static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) +{ + Node *fmt; + struct file * interp_file = NULL; + char iname[BINPRM_BUF_SIZE]; + const char *iname_addr = iname; + int retval; + int fd_binary = -1; + + retval = -ENOEXEC; + if (!enabled) + goto _ret; + + retval = -ENOEXEC; + if (bprm->recursion_depth > BINPRM_MAX_RECURSION) + goto _ret; + + /* to keep locking time low, we copy the interpreter string */ + read_lock(&entries_lock); + fmt = check_file(bprm); + if (fmt) + strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + read_unlock(&entries_lock); + if (!fmt) + goto _ret; + + if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { + retval = remove_arg_zero(bprm); + if (retval) + goto _ret; + } + + if (fmt->flags & MISC_FMT_OPEN_BINARY) { + + /* if the binary should be opened on behalf of the + * interpreter than keep it open and assign descriptor + * to it */ + fd_binary = get_unused_fd(); + if (fd_binary < 0) { + retval = fd_binary; + goto _ret; + } + fd_install(fd_binary, bprm->file); + + /* if the binary is not readable than enforce mm->dumpable=0 + regardless of the interpreter's permissions */ + if (file_permission(bprm->file, MAY_READ)) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + allow_write_access(bprm->file); + bprm->file = NULL; + + /* mark the bprm that fd should be passed to interp */ + bprm->interp_flags |= BINPRM_FLAGS_EXECFD; + bprm->interp_data = fd_binary; + + } else { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + } + /* make argv[1] be the path to the binary */ + retval = copy_strings_kernel (1, &bprm->interp, bprm); + if (retval < 0) + goto _error; + bprm->argc++; + + /* add the interp as argv[0] */ + retval = copy_strings_kernel (1, &iname_addr, bprm); + if (retval < 0) + goto _error; + bprm->argc ++; + + bprm->interp = iname; /* for binfmt_script */ + + interp_file = open_exec (iname); + retval = PTR_ERR (interp_file); + if (IS_ERR (interp_file)) + goto _error; + + bprm->file = interp_file; + if (fmt->flags & MISC_FMT_CREDENTIALS) { + /* + * No need to call prepare_binprm(), it's already been + * done. bprm->buf is stale, update from interp_file. 
+ */ + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + } else + retval = prepare_binprm (bprm); + + if (retval < 0) + goto _error; + + bprm->recursion_depth++; + + retval = search_binary_handler (bprm, regs); + if (retval < 0) + goto _error; + +_ret: + return retval; +_error: + if (fd_binary > 0) + sys_close(fd_binary); + bprm->interp_flags = 0; + bprm->interp_data = 0; + goto _ret; +} + +/* Command parsers */ + +/* + * parses and copies one argument enclosed in del from *sp to *dp, + * recognising the \x special. + * returns pointer to the copied argument or NULL in case of an + * error (and sets err) or null argument length. + */ +static char *scanarg(char *s, char del) +{ + char c; + + while ((c = *s++) != del) { + if (c == '\\' && *s == 'x') { + s++; + if (!isxdigit(*s++)) + return NULL; + if (!isxdigit(*s++)) + return NULL; + } + } + return s; +} + +static int unquote(char *from) +{ + char c = 0, *s = from, *p = from; + + while ((c = *s++) != '\0') { + if (c == '\\' && *s == 'x') { + s++; + c = toupper(*s++); + *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4; + c = toupper(*s++); + *p++ |= c - (isdigit(c) ? '0' : 'A' - 10); + continue; + } + *p++ = c; + } + return p - from; +} + +static char * check_special_flags (char * sfs, Node * e) +{ + char * p = sfs; + int cont = 1; + + /* special flags */ + while (cont) { + switch (*p) { + case 'P': + p++; + e->flags |= MISC_FMT_PRESERVE_ARGV0; + break; + case 'O': + p++; + e->flags |= MISC_FMT_OPEN_BINARY; + break; + case 'C': + p++; + /* this flags also implies the + open-binary flag */ + e->flags |= (MISC_FMT_CREDENTIALS | + MISC_FMT_OPEN_BINARY); + break; + default: + cont = 0; + } + } + + return p; +} +/* + * This registers a new binary format, it recognises the syntax + * ':name:type:offset:magic:mask:interpreter:flags' + * where the ':' is the IFS, that can be chosen with the first char + */ +static Node *create_entry(const char __user *buffer, size_t count) +{ + Node *e; + int memsize, err; + char *buf, *p; + char del; + + /* some sanity checks */ + err = -EINVAL; + if ((count < 11) || (count > 256)) + goto out; + + err = -ENOMEM; + memsize = sizeof(Node) + count + 8; + e = kmalloc(memsize, GFP_USER); + if (!e) + goto out; + + p = buf = (char *)e + sizeof(Node); + + memset(e, 0, sizeof(Node)); + if (copy_from_user(buf, buffer, count)) + goto Efault; + + del = *p++; /* delimiter */ + + memset(buf+count, del, 8); + + e->name = p; + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + if (!e->name[0] || + !strcmp(e->name, ".") || + !strcmp(e->name, "..") || + strchr(e->name, '/')) + goto Einval; + switch (*p++) { + case 'E': e->flags = 1<<Enabled; + break; + case 'M': e->flags = (1<<Enabled) | (1<<Magic); + break; + default: goto Einval; + } + if (*p++ != del) + goto Einval; + + if (test_bit(Magic, &e->flags)) { + char *s = strchr(p, del); + if (!s) + goto Einval; + *s++ = '\0'; + e->offset = simple_strtoul(p, &p, 10); + if (*p++) + goto Einval; + e->magic = p; + p = scanarg(p, del); + if (!p) + goto Einval; + p[-1] = '\0'; + if (!e->magic[0]) + goto Einval; + e->mask = p; + p = scanarg(p, del); + if (!p) + goto Einval; + p[-1] = '\0'; + if (!e->mask[0]) + e->mask = NULL; + e->size = unquote(e->magic); + if (e->mask && unquote(e->mask) != e->size) + goto Einval; + if (e->size + e->offset > BINPRM_BUF_SIZE) + goto Einval; + } else { + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + e->magic = p; + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + if (!e->magic[0] || strchr(e->magic, '/')) + goto Einval; + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + } + e->interpreter = p; + p =
strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + if (!e->interpreter[0]) + goto Einval; + + + p = check_special_flags (p, e); + + if (*p == '\n') + p++; + if (p != buf + count) + goto Einval; + return e; + +out: + return ERR_PTR(err); + +Efault: + kfree(e); + return ERR_PTR(-EFAULT); +Einval: + kfree(e); + return ERR_PTR(-EINVAL); +} + +/* + * Set status of entry/binfmt_misc: + * '1' enables, '0' disables and '-1' clears entry/binfmt_misc + */ +static int parse_command(const char __user *buffer, size_t count) +{ + char s[4]; + + if (!count) + return 0; + if (count > 3) + return -EINVAL; + if (copy_from_user(s, buffer, count)) + return -EFAULT; + if (s[count-1] == '\n') + count--; + if (count == 1 && s[0] == '0') + return 1; + if (count == 1 && s[0] == '1') + return 2; + if (count == 2 && s[0] == '-' && s[1] == '1') + return 3; + return -EINVAL; +} + +/* generic stuff */ + +static void entry_status(Node *e, char *page) +{ + char *dp; + char *status = "disabled"; + const char * flags = "flags: "; + + if (test_bit(Enabled, &e->flags)) + status = "enabled"; + + if (!VERBOSE_STATUS) { + sprintf(page, "%s\n", status); + return; + } + + sprintf(page, "%s\ninterpreter %s\n", status, e->interpreter); + dp = page + strlen(page); + + /* print the special flags */ + sprintf (dp, "%s", flags); + dp += strlen (flags); + if (e->flags & MISC_FMT_PRESERVE_ARGV0) { + *dp ++ = 'P'; + } + if (e->flags & MISC_FMT_OPEN_BINARY) { + *dp ++ = 'O'; + } + if (e->flags & MISC_FMT_CREDENTIALS) { + *dp ++ = 'C'; + } + *dp ++ = '\n'; + + + if (!test_bit(Magic, &e->flags)) { + sprintf(dp, "extension .%s\n", e->magic); + } else { + int i; + + sprintf(dp, "offset %i\nmagic ", e->offset); + dp = page + strlen(page); + for (i = 0; i < e->size; i++) { + sprintf(dp, "%02x", 0xff & (int) (e->magic[i])); + dp += 2; + } + if (e->mask) { + sprintf(dp, "\nmask "); + dp += 6; + for (i = 0; i < e->size; i++) { + sprintf(dp, "%02x", 0xff & (int) (e->mask[i])); + dp += 2; + } + } + *dp++ = '\n'; + *dp = '\0'; + } +} + +static struct inode *bm_get_inode(struct super_block *sb, int mode) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + } + return inode; +} + +static void bm_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + +static void kill_node(Node *e) +{ + struct dentry *dentry; + + write_lock(&entries_lock); + dentry = e->dentry; + if (dentry) { + list_del_init(&e->list); + e->dentry = NULL; + } + write_unlock(&entries_lock); + + if (dentry) { + dentry->d_inode->i_nlink--; + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); + } +} + +/* / */ + +static ssize_t +bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) +{ + Node *e = file->f_path.dentry->d_inode->i_private; + ssize_t res; + char *page; + + if (!(page = (char*) __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + entry_status(e, page); + + res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); + + free_page((unsigned long) page); + return res; +} + +static ssize_t bm_entry_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct dentry *root; + Node *e = file->f_path.dentry->d_inode->i_private; + int res = parse_command(buffer, count); + + switch (res) { + case 1: clear_bit(Enabled, &e->flags); + break; + case 2: set_bit(Enabled, &e->flags); + break; + case 3: root = 
dget(file->f_path.mnt->mnt_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + + kill_node(e); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + break; + default: return res; + } + return count; +} + +static const struct file_operations bm_entry_operations = { + .read = bm_entry_read, + .write = bm_entry_write, + .llseek = default_llseek, +}; + +/* /register */ + +static ssize_t bm_register_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + Node *e; + struct inode *inode; + struct dentry *root, *dentry; + struct super_block *sb = file->f_path.mnt->mnt_sb; + int err = 0; + + e = create_entry(buffer, count); + + if (IS_ERR(e)) + return PTR_ERR(e); + + root = dget(sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + dentry = lookup_one_len(e->name, root, strlen(e->name)); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out; + + err = -EEXIST; + if (dentry->d_inode) + goto out2; + + inode = bm_get_inode(sb, S_IFREG | 0644); + + err = -ENOMEM; + if (!inode) + goto out2; + + err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); + if (err) { + iput(inode); + inode = NULL; + goto out2; + } + + e->dentry = dget(dentry); + inode->i_private = e; + inode->i_fop = &bm_entry_operations; + + d_instantiate(dentry, inode); + write_lock(&entries_lock); + list_add(&e->list, &entries); + write_unlock(&entries_lock); + + err = 0; +out2: + dput(dentry); +out: + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + + if (err) { + kfree(e); + return -EINVAL; + } + return count; +} + +static const struct file_operations bm_register_operations = { + .write = bm_register_write, + .llseek = noop_llseek, +}; + +/* /status */ + +static ssize_t +bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + char *s = enabled ? 
"enabled\n" : "disabled\n"; + + return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); +} + +static ssize_t bm_status_write(struct file * file, const char __user * buffer, + size_t count, loff_t *ppos) +{ + int res = parse_command(buffer, count); + struct dentry *root; + + switch (res) { + case 1: enabled = 0; break; + case 2: enabled = 1; break; + case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + + while (!list_empty(&entries)) + kill_node(list_entry(entries.next, Node, list)); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + default: return res; + } + return count; +} + +static const struct file_operations bm_status_operations = { + .read = bm_status_read, + .write = bm_status_write, + .llseek = default_llseek, +}; + +/* Superblock handling */ + +static const struct super_operations s_ops = { + .statfs = simple_statfs, + .evict_inode = bm_evict_inode, +}; + +static int bm_fill_super(struct super_block * sb, void * data, int silent) +{ + static struct tree_descr bm_files[] = { + [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, + [3] = {"register", &bm_register_operations, S_IWUSR}, + /* last one */ {""} + }; + int err = simple_fill_super(sb, 0x42494e4d, bm_files); + if (!err) + sb->s_op = &s_ops; + return err; +} + +static struct dentry *bm_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, bm_fill_super); +} + +static struct linux_binfmt misc_format = { + .module = THIS_MODULE, + .load_binary = load_misc_binary, +}; + +static struct file_system_type bm_fs_type = { + .owner = THIS_MODULE, + .name = "binfmt_misc", + .mount = bm_mount, + .kill_sb = kill_litter_super, +}; + +static int __init init_misc_binfmt(void) +{ + int err = register_filesystem(&bm_fs_type); + if (!err) { + err = insert_binfmt(&misc_format); + if (err) + unregister_filesystem(&bm_fs_type); + } + return err; +} + +static void __exit exit_misc_binfmt(void) +{ + unregister_binfmt(&misc_format); + unregister_filesystem(&bm_fs_type); +} + +core_initcall(init_misc_binfmt); +module_exit(exit_misc_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c new file mode 100644 index 00000000..396a9884 --- /dev/null +++ b/fs/binfmt_script.c @@ -0,0 +1,118 @@ +/* + * linux/fs/binfmt_script.c + * + * Copyright (C) 1996 Martin von Löwis + * original #!-checking implemented by tytso. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) +{ + const char *i_arg, *i_name; + char *cp; + struct file *file; + char interp[BINPRM_BUF_SIZE]; + int retval; + + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || + (bprm->recursion_depth > BINPRM_MAX_RECURSION)) + return -ENOEXEC; + /* + * This section does the #! interpretation. + * Sorta complicated, but hopefully it will work. 
-TYT + */ + + bprm->recursion_depth++; + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; + if ((cp = strchr(bprm->buf, '\n')) == NULL) + cp = bprm->buf+BINPRM_BUF_SIZE-1; + *cp = '\0'; + while (cp > bprm->buf) { + cp--; + if ((*cp == ' ') || (*cp == '\t')) + *cp = '\0'; + else + break; + } + for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); + if (*cp == '\0') + return -ENOEXEC; /* No interpreter name found */ + i_name = cp; + i_arg = NULL; + for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) + /* nothing */ ; + while ((*cp == ' ') || (*cp == '\t')) + *cp++ = '\0'; + if (*cp) + i_arg = cp; + strcpy (interp, i_name); + /* + * OK, we've parsed out the interpreter name and + * (optional) argument. + * Splice in (1) the interpreter's name for argv[0] + * (2) (optional) argument to interpreter + * (3) filename of shell script (replace argv[0]) + * + * This is done in reverse order, because of how the + * user environment and arguments are stored. + */ + retval = remove_arg_zero(bprm); + if (retval) + return retval; + retval = copy_strings_kernel(1, &bprm->interp, bprm); + if (retval < 0) return retval; + bprm->argc++; + if (i_arg) { + retval = copy_strings_kernel(1, &i_arg, bprm); + if (retval < 0) return retval; + bprm->argc++; + } + retval = copy_strings_kernel(1, &i_name, bprm); + if (retval) return retval; + bprm->argc++; + bprm->interp = interp; + + /* + * OK, now restart the process with the interpreter's dentry. + */ + file = open_exec(interp); + if (IS_ERR(file)) + return PTR_ERR(file); + + bprm->file = file; + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + return search_binary_handler(bprm,regs); +} + +static struct linux_binfmt script_format = { + .module = THIS_MODULE, + .load_binary = load_script, +}; + +static int __init init_script_binfmt(void) +{ + return register_binfmt(&script_format); +} + +static void __exit exit_script_binfmt(void) +{ + unregister_binfmt(&script_format); +} + +core_initcall(init_script_binfmt); +module_exit(exit_script_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c new file mode 100644 index 00000000..cc8560f6 --- /dev/null +++ b/fs/binfmt_som.c @@ -0,0 +1,304 @@ +/* + * linux/fs/binfmt_som.c + * + * These are the functions used to load SOM format executables as used + * by HP-UX. + * + * Copyright 1999 Matthew Wilcox + * based on binfmt_elf which is + * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#include + +static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); +static int load_som_library(struct file *); + +/* + * If we don't support core dumping, then supply a NULL so we + * don't even try. 
+ */ +#if 0 +static int som_core_dump(struct coredump_params *cprm); +#else +#define som_core_dump NULL +#endif + +#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1)) +#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1)) +#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1)) + +static struct linux_binfmt som_format = { + .module = THIS_MODULE, + .load_binary = load_som_binary, + .load_shlib = load_som_library, + .core_dump = som_core_dump, + .min_coredump = SOM_PAGESIZE +}; + +/* + * create_som_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static void create_som_tables(struct linux_binprm *bprm) +{ + char **argv, **envp; + int argc = bprm->argc; + int envc = bprm->envc; + unsigned long p; + unsigned long *sp; + + /* Word-align the stack pointer */ + sp = (unsigned long *)((bprm->p + 3) & ~3); + + envp = (char **) sp; + sp += envc + 1; + argv = (char **) sp; + sp += argc + 1; + + __put_user((unsigned long) envp,++sp); + __put_user((unsigned long) argv,++sp); + + __put_user(argc, ++sp); + + bprm->p = (unsigned long) sp; + + p = current->mm->arg_start; + while (argc-- > 0) { + __put_user((char *)p,argv++); + p += strlen_user((char *)p); + } + __put_user(NULL, argv); + current->mm->arg_end = current->mm->env_start = p; + while (envc-- > 0) { + __put_user((char *)p,envp++); + p += strlen_user((char *)p); + } + __put_user(NULL, envp); + current->mm->env_end = p; +} + +static int check_som_header(struct som_hdr *som_ex) +{ + int *buf = (int *)som_ex; + int i, ck; + + if (som_ex->system_id != SOM_SID_PARISC_1_0 && + som_ex->system_id != SOM_SID_PARISC_1_1 && + som_ex->system_id != SOM_SID_PARISC_2_0) + return -ENOEXEC; + + if (som_ex->a_magic != SOM_EXEC_NONSHARE && + som_ex->a_magic != SOM_EXEC_SHARE && + som_ex->a_magic != SOM_EXEC_DEMAND) + return -ENOEXEC; + + if (som_ex->version_id != SOM_ID_OLD && + som_ex->version_id != SOM_ID_NEW) + return -ENOEXEC; + + ck = 0; + for (i=0; i<32; i++) + ck ^= buf[i]; + if (ck != 0) + return -ENOEXEC; + + return 0; +} + +static int map_som_binary(struct file *file, + const struct som_exec_auxhdr *hpuxhdr) +{ + unsigned long code_start, code_size, data_start, data_size; + unsigned long bss_start, som_brk; + int retval; + int prot = PROT_READ | PROT_EXEC; + int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; + + mm_segment_t old_fs = get_fs(); + set_fs(get_ds()); + + code_start = SOM_PAGESTART(hpuxhdr->exec_tmem); + code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); + current->mm->start_code = code_start; + current->mm->end_code = code_start + code_size; + down_write(&current->mm->mmap_sem); + retval = do_mmap(file, code_start, code_size, prot, + flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); + up_write(&current->mm->mmap_sem); + if (retval < 0 && retval > -1024) + goto out; + + data_start = SOM_PAGESTART(hpuxhdr->exec_dmem); + data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); + current->mm->start_data = data_start; + current->mm->end_data = bss_start = data_start + data_size; + down_write(&current->mm->mmap_sem); + retval = do_mmap(file, data_start, data_size, + prot | PROT_WRITE, flags, + SOM_PAGESTART(hpuxhdr->exec_dfile)); + up_write(&current->mm->mmap_sem); + if (retval < 0 && retval > -1024) + goto out; + + som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); + current->mm->start_brk = current->mm->brk = som_brk; + down_write(&current->mm->mmap_sem); + retval = do_mmap(NULL, bss_start,
som_brk - bss_start, + prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); + up_write(&current->mm->mmap_sem); + if (retval > 0 || retval < -1024) + retval = 0; +out: + set_fs(old_fs); + return retval; +} + + +/* + * These are the functions used to load SOM executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int +load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + int retval; + unsigned int size; + unsigned long som_entry; + struct som_hdr *som_ex; + struct som_exec_auxhdr *hpuxhdr; + + /* Get the exec-header */ + som_ex = (struct som_hdr *) bprm->buf; + + retval = check_som_header(som_ex); + if (retval != 0) + goto out; + + /* Now read in the auxiliary header information */ + + retval = -ENOMEM; + size = som_ex->aux_header_size; + if (size > SOM_PAGESIZE) + goto out; + hpuxhdr = kmalloc(size, GFP_KERNEL); + if (!hpuxhdr) + goto out; + + retval = kernel_read(bprm->file, som_ex->aux_header_location, + (char *) hpuxhdr, size); + if (retval != size) { + if (retval >= 0) + retval = -EIO; + goto out_free; + } + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto out_free; + + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->personality = PER_HPUX; + setup_new_exec(bprm); + + /* Set the task size for HP-UX processes such that + * the gateway page is outside the address space. + * This can be fixed later, but for now, this is much + * easier. + */ + + current->thread.task_size = 0xc0000000; + + /* Set map base to allow enough room for hp-ux heap growth */ + + current->thread.map_base = 0x80000000; + + retval = map_som_binary(bprm->file, hpuxhdr); + if (retval < 0) + goto out_free; + + som_entry = hpuxhdr->exec_entry; + kfree(hpuxhdr); + + set_binfmt(&som_format); + install_exec_creds(bprm); + setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + + create_som_tables(bprm); + + current->mm->start_stack = bprm->p; + +#if 0 + printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); + printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code); + printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code); + printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data); + printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack); + printk("(brk) %08lx\n" , (unsigned long) current->mm->brk); +#endif + + map_hpux_gateway_page(current,current->mm); + + start_thread_som(regs, som_entry, bprm->p); + return 0; + + /* error cleanup */ +out_free: + kfree(hpuxhdr); +out: + return retval; +} + +static int load_som_library(struct file *f) +{ +/* No lib support in SOM yet. gizza chance.. */ + return -ENOEXEC; +} + /* Install the SOM loader. + * N.B. We *rely* on the table being the right size with the + * right number of free slots... + */ + +static int __init init_som_binfmt(void) +{ + return register_binfmt(&som_format); +} + +static void __exit exit_som_binfmt(void) +{ + /* Remove the SOM loader. */ + unregister_binfmt(&som_format); +} + +core_initcall(init_som_binfmt); +module_exit(exit_som_binfmt); + +MODULE_LICENSE("GPL"); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 00000000..9c5e6b2c --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,807 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K.
Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include +#include + +struct integrity_slab { + struct kmem_cache *slab; + unsigned short nr_vecs; + char name[8]; +}; + +#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } +struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { + IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), +}; +#undef IS + +static struct workqueue_struct *kintegrityd_wq; + +static inline unsigned int vecs_to_idx(unsigned int nr) +{ + switch (nr) { + case 1: + return 0; + case 2 ... 4: + return 1; + case 5 ... 16: + return 2; + case 17 ... 64: + return 3; + case 65 ... 128: + return 4; + case 129 ... BIO_MAX_PAGES: + return 5; + default: + BUG(); + } +} + +static inline int use_bip_pool(unsigned int idx) +{ + if (idx == BIOVEC_MAX_IDX) + return 1; + + return 0; +} + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + unsigned int idx = vecs_to_idx(nr_vecs); + + BUG_ON(bio == NULL); + bip = NULL; + + /* Lower order allocations come straight from slab */ + if (!use_bip_pool(idx)) + bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); + + /* Use mempool if lower order alloc failed or max vecs were requested */ + if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + } + + memset(bip, 0, sizeof(*bip)); + + bip->bip_slab = idx; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. 
+ */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) + && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + if (use_bip_pool(bip->bip_slab)) + mempool_free(bip, bs->bio_integrity_pool); + else + kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return 1; + + return 0; +} + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. 
+ */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. 
+ */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -ENOMEM; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * 
Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + return ret; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error; + + error = bio_integrity_verify(bio); + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. 
+ */ + if (error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); + + return; + } + + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. 
+ */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec[0] = bp->iv1; + bp->bip2.bip_vec[0] = bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @gfp_mask: Memory allocation mask + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); + + if (bs->bio_integrity_pool) + return 0; + + bs->bio_integrity_pool = + mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init(void) +{ + unsigned int i; + + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { + unsigned int size; + + size = sizeof(struct bio_integrity_payload) + + bip_slab[i].nr_vecs * sizeof(struct bio_vec); + + bip_slab[i].slab = + kmem_cache_create(bip_slab[i].name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} diff --git a/fs/bio.c b/fs/bio.c new file mode 100644 index 00000000..9bfade8a --- /dev/null +++ b/fs/bio.c @@ -0,0 +1,1692 @@ +/* + * Copyright (C) 2001 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for struct sg_iovec */ + +#include + +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 + +static mempool_t *bio_split_pool __read_mostly; + +/* + * if you change this list, also change bvec_alloc or things will + * break badly! cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { + BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), +}; +#undef BV + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools. + */ +struct bio_set *fs_bio_set; + +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + bio_slab_max <<= 1; + bio_slabs = krealloc(bio_slabs, + bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!bio_slabs) + goto out_unlock; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry); + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + +void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +{ + BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); 
+ + if (idx == BIOVEC_MAX_IDX) + mempool_free(bv, bs->bvec_pool); + else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, + struct bio_set *bs) +{ + struct bio_vec *bvl; + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BIOVEC_MAX_IDX) { +fallback: + bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + + /* + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. + */ + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + /* + * Try a slab allocation. If this fails and __GFP_WAIT + * is set, retry with the 1-entry mempool + */ + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + *idx = BIOVEC_MAX_IDX; + goto fallback; + } + } + + return bvl; +} + +void bio_free(struct bio *bio, struct bio_set *bs) +{ + void *p; + + if (bio_has_allocated_vec(bio)) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + + if (bio_integrity(bio)) + bio_integrity_free(bio, bs); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; + if (bs->front_pad) + p -= bs->front_pad; + + mempool_free(p, bs->bio_pool); +} +EXPORT_SYMBOL(bio_free); + +void bio_init(struct bio *bio) +{ + memset(bio, 0, sizeof(*bio)); + bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; + atomic_set(&bio->bi_cnt, 1); +} +EXPORT_SYMBOL(bio_init); + +/** + * bio_alloc_bioset - allocate a bio for I/O + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * @bs: the bio_set to allocate from. + * + * Description: + * bio_alloc_bioset will try its own mempool to satisfy the allocation. + * If %__GFP_WAIT is set then we will block on the internal pool waiting + * for a &struct bio to become free. + * + * Note that the caller must set ->bi_destructor on successful return + * of a bio, to do the appropriate freeing of the bio once the reference + * count drops to zero. 
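As the note above says, callers of bio_alloc_bioset() must install a destructor themselves. A minimal sketch of a driver-private pool that does so follows; my_bio_set and both function names are hypothetical.

static struct bio_set *my_bio_set;	/* e.g. from bioset_create(BIO_POOL_SIZE, 0) */

static void my_bio_destructor(struct bio *bio)
{
	bio_free(bio, my_bio_set);
}

static struct bio *my_bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, my_bio_set);

	if (bio)
		bio->bi_destructor = my_bio_destructor;
	return bio;
}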
+ **/ +struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +{ + unsigned long idx = BIO_POOL_NONE; + struct bio_vec *bvl = NULL; + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; + + bio_init(bio); + + if (unlikely(!nr_iovecs)) + goto out_set; + + if (nr_iovecs <= BIO_INLINE_VECS) { + bvl = bio->bi_inline_vecs; + nr_iovecs = BIO_INLINE_VECS; + } else { + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + if (unlikely(!bvl)) + goto err_free; + + nr_iovecs = bvec_nr_vecs(idx); + } +out_set: + bio->bi_flags |= idx << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + return bio; + +err_free: + mempool_free(p, bs->bio_pool); + return NULL; +} +EXPORT_SYMBOL(bio_alloc_bioset); + +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. To make this work, callers + * must never allocate more than 1 bio at a time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocated bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + + if (bio) + bio->bi_destructor = bio_fs_destructor; + + return bio; +} +EXPORT_SYMBOL(bio_alloc); + +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio, fs_bio_set); + kfree(bio); +} + +/** + * bio_kmalloc - allocate a bio for I/O using kmalloc() + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Description: + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains + * %__GFP_WAIT, the allocation is guaranteed to succeed. + * + **/ +struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; + + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + +void zero_fill_bio(struct bio *bio) +{ + unsigned long flags; + struct bio_vec *bv; + int i; + + bio_for_each_segment(bv, bio, i) { + char *data = bvec_kmap_irq(bv, &flags); + memset(data, 0, bv->bv_len); + flush_dcache_page(bv->bv_page); + bvec_kunmap_irq(data, &flags); + } +} +EXPORT_SYMBOL(zero_fill_bio); + +/** + * bio_put - release a reference to a bio + * @bio: bio to release reference to + * + * Description: + * Put a reference to a &struct bio, either one you have gotten with + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. 
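Taken together with bio_alloc(), the reference counting for a simple synchronous read looks roughly like the sketch below; sync_read_endio() and sync_read_page() are invented names, and the single bio_put() balances the reference taken at allocation time.

static void sync_read_endio(struct bio *bio, int error)
{
	complete(bio->bi_private);
}

static int sync_read_page(struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int err;

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
		bio_put(bio);
		return -EIO;
	}
	bio->bi_private = &done;
	bio->bi_end_io = sync_read_endio;

	submit_bio(READ, bio);
	wait_for_completion(&done);

	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
	bio_put(bio);		/* last put: frees the bio */
	return err;
}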
+ **/ +void bio_put(struct bio *bio) +{ + BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->bi_cnt)) { + bio->bi_next = NULL; + bio->bi_destructor(bio); + } +} +EXPORT_SYMBOL(bio_put); + +inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +{ + if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + blk_recount_segments(q, bio); + + return bio->bi_phys_segments; +} +EXPORT_SYMBOL(bio_phys_segments); + +/** + * __bio_clone - clone a bio + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a &bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + */ +void __bio_clone(struct bio *bio, struct bio *bio_src) +{ + memcpy(bio->bi_io_vec, bio_src->bi_io_vec, + bio_src->bi_max_vecs * sizeof(struct bio_vec)); + + /* + * most users will be overriding ->bi_bdev with a new target, + * so we don't set nor calculate new physical/hw segment counts here + */ + bio->bi_sector = bio_src->bi_sector; + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_flags |= 1 << BIO_CLONED; + bio->bi_rw = bio_src->bi_rw; + bio->bi_vcnt = bio_src->bi_vcnt; + bio->bi_size = bio_src->bi_size; + bio->bi_idx = bio_src->bi_idx; +} +EXPORT_SYMBOL(__bio_clone); + +/** + * bio_clone - clone a bio + * @bio: bio to clone + * @gfp_mask: allocation priority + * + * Like __bio_clone, only also allocates the returned bio + */ +struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); + + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL(bio_clone); + +/** + * bio_get_nr_vecs - return approx number of vecs + * @bdev: I/O target + * + * Return the approximate number of pages we can send to this target. + * There's no guarantee that you will be able to fit this number of pages + * into a bio, it does not account for dynamic restrictions that vary + * on offset. + */ +int bio_get_nr_vecs(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + int nr_pages; + + nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > queue_max_segments(q)) + nr_pages = queue_max_segments(q); + + return nr_pages; +} +EXPORT_SYMBOL(bio_get_nr_vecs); + +static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset, + unsigned short max_sectors) +{ + int retried_segments = 0; + struct bio_vec *bvec; + + /* + * cloned bio must not modify vec list + */ + if (unlikely(bio_flagged(bio, BIO_CLONED))) + return 0; + + if (((bio->bi_size + len) >> 9) > max_sectors) + return 0; + + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == prev->bv_page && + offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; + prev->bv_len += len; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. 
*/ + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size - prev_bv_len, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { + prev->bv_len -= len; + return 0; + } + } + + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + /* + * we might lose a segment or two here, but rather that than + * make this too complex. + */ + + while (bio->bi_phys_segments >= queue_max_segments(q)) { + + if (retried_segments) + return 0; + + retried_segments = 1; + blk_recount_segments(q, bio); + } + + /* + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + + /* + * if queue has other restrictions (eg varying max sector size + * depending on offset), it can specify a merge_bvec_fn in the + * queue to get further control + */ + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + /* + * merge_bvec_fn() returns number of bytes it can accept + * at this offset + */ + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + return 0; + } + } + + /* If we may be able to merge these biovecs, force a recount */ + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + + bio->bi_vcnt++; + bio->bi_phys_segments++; + done: + bio->bi_size += len; + return len; +} + +/** + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. + */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); +} +EXPORT_SYMBOL(bio_add_pc_page); + +/** + * bio_add_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. 
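Since bio_add_page() simply refuses pages once the bio or the device limits are reached, callers loop until it returns less than they asked for, submit what they have, and start a new bio. A sketch of that pattern, sized with bio_get_nr_vecs() from above (pack_pages is a hypothetical helper):

static struct bio *pack_pages(struct block_device *bdev, sector_t sector,
			      struct page **pages, int nr_pages)
{
	int nr_vecs = min(nr_pages, bio_get_nr_vecs(bdev));
	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);
	int i;

	if (!bio)
		return NULL;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	for (i = 0; i < nr_pages; i++) {
		/* a short return means the bio is full; stop and submit */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;
	}
	return bio;	/* caller submits it and packs the rest into a new bio */
}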
+ */ +int bio_add_page(struct bio *bio, struct page *page, unsigned int len, + unsigned int offset) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); +} +EXPORT_SYMBOL(bio_add_page); + +struct bio_map_data { + struct bio_vec *iovecs; + struct sg_iovec *sgvecs; + int nr_sgvecs; + int is_our_pages; +}; + +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, + struct sg_iovec *iov, int iov_count, + int is_our_pages) +{ + memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); + memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); + bmd->nr_sgvecs = iov_count; + bmd->is_our_pages = is_our_pages; + bio->bi_private = bmd; +} + +static void bio_free_map_data(struct bio_map_data *bmd) +{ + kfree(bmd->iovecs); + kfree(bmd->sgvecs); + kfree(bmd); +} + +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); + if (!bmd) + return NULL; + + bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); + if (!bmd->iovecs) { + kfree(bmd); + return NULL; + } + + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); + if (bmd->sgvecs) + return bmd; + + kfree(bmd->iovecs); + kfree(bmd); + return NULL; +} + +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, + struct sg_iovec *iov, int iov_count, + int to_user, int from_user, int do_free_page) +{ + int ret = 0, i; + struct bio_vec *bvec; + int iov_idx = 0; + unsigned int iov_off = 0; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *bv_addr = page_address(bvec->bv_page); + unsigned int bv_len = iovecs[i].bv_len; + + while (bv_len && iov_idx < iov_count) { + unsigned int bytes; + char __user *iov_addr; + + bytes = min_t(unsigned int, + iov[iov_idx].iov_len - iov_off, bv_len); + iov_addr = iov[iov_idx].iov_base + iov_off; + + if (!ret) { + if (to_user) + ret = copy_to_user(iov_addr, bv_addr, + bytes); + + if (from_user) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + + if (ret) + ret = -EFAULT; + } + + bv_len -= bytes; + bv_addr += bytes; + iov_addr += bytes; + iov_off += bytes; + + if (iov[iov_idx].iov_len == iov_off) { + iov_idx++; + iov_off = 0; + } + } + + if (do_free_page) + __free_page(bvec->bv_page); + } + + return ret; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user() and write back data + * to user space in case of a read. + */ +int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); + bio_free_map_data(bmd); + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(bio_uncopy_user); + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iov: the iovec. + * @iov_count: number of elements in the iovec + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. 
+ */ +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int i, ret; + int nr_pages = 0; + unsigned int len = 0; + unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr; + unsigned long end; + unsigned long start; + + uaddr = (unsigned long)iov[i].iov_base; + end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + len += iov[i].iov_len; + } + + if (offset) + nr_pages++; + + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + break; + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + /* + * success + */ + if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); + if (ret) + goto cleanup; + } + + bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); + return bio; +cleanup: + if (!map_data) + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); + + bio_put(bio); +out_bmd: + bio_free_map_data(bmd); + return ERR_PTR(ret); +} + +/** + * bio_copy_user - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. 
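From the caller's side the bounce-buffer pairing looks roughly like this; bounce_write_user is an invented name and the submission step in the middle is elided.

static int bounce_write_user(struct request_queue *q, unsigned long uaddr,
			     unsigned int len)
{
	struct bio *bio;

	/* write_to_vm == 0: data flows from user space towards the device */
	bio = bio_copy_user(q, NULL, uaddr, len, 0, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... attach the bio to a request, submit it and wait ... */

	/* frees the bounce pages and drops the bio reference */
	return bio_uncopy_user(bio);
}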
+ */ +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, + unsigned long uaddr, unsigned int len, + int write_to_vm, gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_copy_user); + +static struct bio *__bio_map_user_iov(struct request_queue *q, + struct block_device *bdev, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + int i, j; + int nr_pages = 0; + struct page **pages; + struct bio *bio; + int cur_page = 0; + int ret, offset; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + unsigned long len = iov[i].iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + /* + * buffer must be aligned to at least hardsector size for now + */ + if (uaddr & queue_dma_alignment(q)) + return ERR_PTR(-EINVAL); + } + + if (!nr_pages) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); + if (!pages) + goto out; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + unsigned long len = iov[i].iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + const int local_nr_pages = end - start; + const int page_limit = cur_page + local_nr_pages; + + ret = get_user_pages_fast(uaddr, local_nr_pages, + write_to_vm, &pages[cur_page]); + if (ret < local_nr_pages) { + ret = -EFAULT; + goto out_unmap; + } + + offset = uaddr & ~PAGE_MASK; + for (j = cur_page; j < page_limit; j++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + /* + * sorry... + */ + if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < + bytes) + break; + + len -= bytes; + offset = 0; + } + + cur_page = j; + /* + * release the pages we didn't map into the bio, if any + */ + while (j < page_limit) + page_cache_release(pages[j++]); + } + + kfree(pages); + + /* + * set data direction, and check if mapped pages need bouncing + */ + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; + + bio->bi_bdev = bdev; + bio->bi_flags |= (1 << BIO_USER_MAPPED); + return bio; + + out_unmap: + for (i = 0; i < nr_pages; i++) { + if(!pages[i]) + break; + page_cache_release(pages[i]); + } + out: + kfree(pages); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_map_user - map user address into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. 
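The zero-copy variant pins the user pages instead of bouncing them, so the unmap call is mandatory once the I/O is done. A hedged sketch of the pairing (read_into_user_buffer is a made-up caller; submission is elided):

static int read_into_user_buffer(struct request_queue *q,
				 struct block_device *bdev,
				 unsigned long uaddr, unsigned int len)
{
	struct bio *bio;

	/* write_to_vm == 1: the device writes into the user pages */
	bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);	/* e.g. -EINVAL if uaddr is misaligned */

	/* ... submit the bio and wait for completion ... */

	bio_unmap_user(bio);	/* dirties and unpins the pages, frees the bio */
	return 0;
}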
+ */ +struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, + unsigned long uaddr, unsigned int len, int write_to_vm, + gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_map_user); + +/** + * bio_map_user_iov - map user sg_iovec table into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @iov: the iovec. + * @iov_count: number of elements in the iovec + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, + gfp_mask); + if (IS_ERR(bio)) + return bio; + + /* + * subtle -- if __bio_map_user() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + + return bio; +} + +static void __bio_unmap_user(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + /* + * make sure we dirty pages we wrote to + */ + __bio_for_each_segment(bvec, bio, i, 0) { + if (bio_data_dir(bio) == READ) + set_page_dirty_lock(bvec->bv_page); + + page_cache_release(bvec->bv_page); + } + + bio_put(bio); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user(). Must be called with + * a process context. + * + * bio_unmap_user() may sleep. + */ +void bio_unmap_user(struct bio *bio) +{ + __bio_unmap_user(bio); + bio_put(bio); +} +EXPORT_SYMBOL(bio_unmap_user); + +static void bio_map_kern_endio(struct bio *bio, int err) +{ + bio_put(bio); +} + +static struct bio *__bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + offset) < bytes) + break; + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_kern(q, data, len, gfp_mask); + if (IS_ERR(bio)) + return bio; + + if (bio->bi_size == len) + return bio; + + /* + * Don't support partial mappings. 
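bio_map_kern() plays the same role for kernel buffers; the buffer must be kmalloc'ed (it is resolved with virt_to_page()) and must stay allocated until the I/O completes. A sketch of a fire-and-forget write, with write_kernel_buffer as an invented name:

static int write_kernel_buffer(struct request_queue *q,
			       struct block_device *bdev, sector_t sector,
			       void *buf, unsigned int len)
{
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return PTR_ERR(bio);	/* -EINVAL if only part of buf fit */

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	/* bio_map_kern_endio() drops the reference when the write finishes */
	submit_bio(WRITE, bio);
	return 0;
}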
+ */ + bio_put(bio); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL(bio_map_kern); + +static void bio_copy_kern_endio(struct bio *bio, int err) +{ + struct bio_vec *bvec; + const int read = bio_data_dir(bio) == READ; + struct bio_map_data *bmd = bio->bi_private; + int i; + char *p = bmd->sgvecs[0].iov_base; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *addr = page_address(bvec->bv_page); + int len = bmd->iovecs[i].bv_len; + + if (read) + memcpy(p, addr, len); + + __free_page(bvec->bv_page); + p += len; + } + + bio_free_map_data(bmd); + bio_put(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask, int reading) +{ + struct bio *bio; + struct bio_vec *bvec; + int i; + + bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); + if (IS_ERR(bio)) + return bio; + + if (!reading) { + void *p = data; + + bio_for_each_segment(bvec, bio, i) { + char *addr = page_address(bvec->bv_page); + + memcpy(addr, p, bvec->bv_len); + p += bvec->bv_len; + } + } + + bio->bi_end_io = bio_copy_kern_endio; + + return bio; +} +EXPORT_SYMBOL(bio_copy_kern); + +/* + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions + * for performing direct-IO in BIOs. + * + * The problem is that we cannot run set_page_dirty() from interrupt context + * because the required locks are not interrupt-safe. So what we can do is to + * mark the pages dirty _before_ performing IO. And in interrupt context, + * check that the pages are still dirty. If so, fine. If not, redirty them + * in process context. + * + * We special-case compound pages here: normally this means reads into hugetlb + * pages. The logic in here doesn't really work right for compound pages + * because the VM does not uniformly chase down the head page in all cases. + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't + * handle them at all. So we skip compound pages here at an early stage. + * + * Note that this code is very hard to test under normal circumstances because + * direct-io pins the pages with get_user_pages(). This makes + * is_page_cache_freeable return false, and the VM will not clean the pages. + * But other code (eg, pdflush) could clean the pages if they are mapped + * pagecache. + * + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the + * deferred bio dirtying paths. + */ + +/* + * bio_set_pages_dirty() will mark all the bio's pages as dirty. + */ +void bio_set_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int i; + + for (i = 0; i < bio->bi_vcnt; i++) { + struct page *page = bvec[i].bv_page; + + if (page && !PageCompound(page)) + set_page_dirty_lock(page); + } +} + +static void bio_release_pages(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int i; + + for (i = 0; i < bio->bi_vcnt; i++) { + struct page *page = bvec[i].bv_page; + + if (page) + put_page(page); + } +} + +/* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. + * If they are, then fine. 
If, however, some pages are clean then they must + * have been written out during the direct-IO read. So we take another ref on + * the BIO and the offending pages and re-dirty the pages in process context. + * + * It is expected that bio_check_pages_dirty() will wholly own the BIO from + * here on. It will run one page_cache_release() against each page and will + * run one bio_put() against the BIO. + */ + +static void bio_dirty_fn(struct work_struct *work); + +static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); +static DEFINE_SPINLOCK(bio_dirty_lock); +static struct bio *bio_dirty_list; + +/* + * This runs in process context + */ +static void bio_dirty_fn(struct work_struct *work) +{ + unsigned long flags; + struct bio *bio; + + spin_lock_irqsave(&bio_dirty_lock, flags); + bio = bio_dirty_list; + bio_dirty_list = NULL; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + + while (bio) { + struct bio *next = bio->bi_private; + + bio_set_pages_dirty(bio); + bio_release_pages(bio); + bio_put(bio); + bio = next; + } +} + +void bio_check_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int nr_clean_pages = 0; + int i; + + for (i = 0; i < bio->bi_vcnt; i++) { + struct page *page = bvec[i].bv_page; + + if (PageDirty(page) || PageCompound(page)) { + page_cache_release(page); + bvec[i].bv_page = NULL; + } else { + nr_clean_pages++; + } + } + + if (nr_clean_pages) { + unsigned long flags; + + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } else { + bio_put(bio); + } +} + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void bio_flush_dcache_pages(struct bio *bi) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment(bvec, bi, i) + flush_dcache_page(bvec->bv_page); +} +EXPORT_SYMBOL(bio_flush_dcache_pages); +#endif + +/** + * bio_endio - end I/O on a bio + * @bio: bio + * @error: error, if any + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the + * preferred way to end I/O on a bio, it takes care of clearing + * BIO_UPTODATE on error. @error is 0 on success, and and one of the + * established -Exxxx (-EIO, for instance) error values in case + * something went wrong. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io + * function. 
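A completion handler that follows this convention typically records the error, wakes the submitter and drops its reference; the context structure and names below are hypothetical.

struct my_io_done {			/* per-I/O context supplied via bi_private */
	struct completion event;
	int error;
};

static void my_end_io(struct bio *bio, int error)
{
	struct my_io_done *d = bio->bi_private;

	/* 0 on success, otherwise a negative errno as described above */
	d->error = error;
	complete(&d->event);
	bio_put(bio);		/* this handler owns the last reference */
}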
+ **/ +void bio_endio(struct bio *bio, int error) +{ + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} +EXPORT_SYMBOL(bio_endio); + +void bio_pair_release(struct bio_pair *bp) +{ + if (atomic_dec_and_test(&bp->cnt)) { + struct bio *master = bp->bio1.bi_private; + + bio_endio(master, bp->error); + mempool_free(bp, bp->bio2.bi_private); + } +} +EXPORT_SYMBOL(bio_pair_release); + +static void bio_pair_end_1(struct bio *bi, int err) +{ + struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); + + if (err) + bp->error = err; + + bio_pair_release(bp); +} + +static void bio_pair_end_2(struct bio *bi, int err) +{ + struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); + + if (err) + bp->error = err; + + bio_pair_release(bp); +} + +/* + * split a bio - only worry about a bio with a single page in its iovec + */ +struct bio_pair *bio_split(struct bio *bi, int first_sectors) +{ + struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); + + if (!bp) + return bp; + + trace_block_split(bdev_get_queue(bi->bi_bdev), bi, + bi->bi_sector + first_sectors); + + BUG_ON(bi->bi_vcnt != 1); + BUG_ON(bi->bi_idx != 0); + atomic_set(&bp->cnt, 3); + bp->error = 0; + bp->bio1 = *bi; + bp->bio2 = *bi; + bp->bio2.bi_sector += first_sectors; + bp->bio2.bi_size -= first_sectors << 9; + bp->bio1.bi_size = first_sectors << 9; + + bp->bv1 = bi->bi_io_vec[0]; + bp->bv2 = bi->bi_io_vec[0]; + bp->bv2.bv_offset += first_sectors << 9; + bp->bv2.bv_len -= first_sectors << 9; + bp->bv1.bv_len = first_sectors << 9; + + bp->bio1.bi_io_vec = &bp->bv1; + bp->bio2.bi_io_vec = &bp->bv2; + + bp->bio1.bi_max_vecs = 1; + bp->bio2.bi_max_vecs = 1; + + bp->bio1.bi_end_io = bio_pair_end_1; + bp->bio2.bi_end_io = bio_pair_end_2; + + bp->bio1.bi_private = bi; + bp->bio2.bi_private = bio_split_pool; + + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + + return bp; +} +EXPORT_SYMBOL(bio_split); + +/** + * bio_sector_offset - Find hardware sector offset in bio + * @bio: bio to inspect + * @index: bio_vec index + * @offset: offset in bv_page + * + * Return the number of hardware sectors between beginning of bio + * and an end point indicated by a bio_vec index and an offset + * within that vector's page. + */ +sector_t bio_sector_offset(struct bio *bio, unsigned short index, + unsigned int offset) +{ + unsigned int sector_sz; + struct bio_vec *bv; + sector_t sectors; + int i; + + sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); + sectors = 0; + + if (index >= bio->bi_idx) + index = bio->bi_vcnt - 1; + + __bio_for_each_segment(bv, bio, i, 0) { + if (i == index) { + if (offset > bv->bv_offset) + sectors += (offset - bv->bv_offset) / sector_sz; + break; + } + + sectors += bv->bv_len / sector_sz; + } + + return sectors; +} +EXPORT_SYMBOL(bio_sector_offset); + +/* + * create memory pools for biovec's in a bio_set. + * use the global biovec slabs created for general use. 
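A sketch of how a striping driver uses bio_split() from above: it only works for bios with a single bio_vec (the BUG_ONs enforce this), first_sectors is in 512-byte units, and split_and_submit is an invented name. The final bio_pair_release() balances the count of 3 set up in bio_split().

static void split_and_submit(struct bio *bio, int first_sectors)
{
	struct bio_pair *bp = bio_split(bio, first_sectors);

	if (!bp) {
		bio_endio(bio, -ENOMEM);
		return;
	}

	/* the original bio is completed by bio_pair_release() once both
	 * halves have finished and this reference has been dropped */
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}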
+ */ +static int biovec_create_pools(struct bio_set *bs, int pool_entries) +{ + struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; + + bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); + if (!bs->bvec_pool) + return -ENOMEM; + + return 0; +} + +static void biovec_free_pools(struct bio_set *bs) +{ + mempool_destroy(bs->bvec_pool); +} + +void bioset_free(struct bio_set *bs) +{ + if (bs->bio_pool) + mempool_destroy(bs->bio_pool); + + bioset_integrity_free(bs); + biovec_free_pools(bs); + bio_put_slab(bs); + + kfree(bs); +} +EXPORT_SYMBOL(bioset_free); + +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +{ + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + struct bio_set *bs; + + bs = kzalloc(sizeof(*bs), GFP_KERNEL); + if (!bs) + return NULL; + + bs->front_pad = front_pad; + + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + if (!bs->bio_slab) { + kfree(bs); + return NULL; + } + + bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); + if (!bs->bio_pool) + goto bad; + + if (!biovec_create_pools(bs, pool_size)) + return bs; + +bad: + bioset_free(bs); + return NULL; +} +EXPORT_SYMBOL(bioset_create); + +static void __init biovec_init_slabs(void) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + int size; + struct biovec_slab *bvs = bvec_slabs + i; + + if (bvs->nr_vecs <= BIO_INLINE_VECS) { + bvs->slab = NULL; + continue; + } + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} + +static int __init init_bio(void) +{ + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); + if (!bio_slabs) + panic("bio: can't allocate bios\n"); + + bio_integrity_init(); + biovec_init_slabs(); + + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!fs_bio_set) + panic("bio: can't allocate bios\n"); + + if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) + panic("bio: can't create integrity pool\n"); + + bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, + sizeof(struct bio_pair)); + if (!bio_split_pool) + panic("bio: can't create split pool\n"); + + return 0; +} +subsys_initcall(init_bio); diff --git a/fs/block_dev.c b/fs/block_dev.c new file mode 100644 index 00000000..a580028e --- /dev/null +++ b/fs/block_dev.c @@ -0,0 +1,1671 @@ +/* + * linux/fs/block_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli SuSE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; + +static const struct address_space_operations def_blk_aops; + +static inline struct bdev_inode 
*BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +inline struct block_device *I_BDEV(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} + +EXPORT_SYMBOL(I_BDEV); + +/* + * move the inode from it's current bdi to the a new bdi. if the inode is dirty + * we need to move it onto the dirty list of @dst so that the inode is always + * on the right list. + */ +static void bdev_inode_switch_bdi(struct inode *inode, + struct backing_dev_info *dst) +{ + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + inode->i_data.backing_dev_info = dst; + if (inode->i_state & I_DIRTY) + list_move(&inode->i_wb_list, &dst->wb.b_dirty); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); +} + +sector_t blkdev_max_block(struct block_device *bdev) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int size = block_size(bdev); + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + +/* Kill _all_ buffers and pagecache , dirty or not.. */ +static void kill_bdev(struct block_device *bdev) +{ + if (bdev->bd_inode->i_mapping->nrpages == 0) + return; + invalidate_bh_lrus(); + truncate_inode_pages(bdev->bd_inode->i_mapping, 0); +} + +int set_blocksize(struct block_device *bdev, int size) +{ + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + return -EINVAL; + + /* Size cannot be smaller than the size supported by the device */ + if (size < bdev_logical_block_size(bdev)) + return -EINVAL; + + /* Don't change the size if it is same as current */ + if (bdev->bd_block_size != size) { + sync_blockdev(bdev); + bdev->bd_block_size = size; + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + } + return 0; +} + +EXPORT_SYMBOL(set_blocksize); + +int sb_set_blocksize(struct super_block *sb, int size) +{ + if (set_blocksize(sb->s_bdev, size)) + return 0; + /* If we get here, we know size is power of two + * and it's value is between 512 and PAGE_SIZE */ + sb->s_blocksize = size; + sb->s_blocksize_bits = blksize_bits(size); + return sb->s_blocksize; +} + +EXPORT_SYMBOL(sb_set_blocksize); + +int sb_min_blocksize(struct super_block *sb, int size) +{ + int minsize = bdev_logical_block_size(sb->s_bdev); + if (size < minsize) + size = minsize; + return sb_set_blocksize(sb, size); +} + +EXPORT_SYMBOL(sb_min_blocksize); + +static int +blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + if (iblock >= blkdev_max_block(I_BDEV(inode))) { + if (create) + return -EIO; + + /* + * for reads, we're just trying to fill a partial page. + * return a hole, they will have to call get_block again + * before they can fill it, and they will get -EIO at that + * time + */ + return 0; + } + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +static int +blkdev_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + sector_t end_block = blkdev_max_block(I_BDEV(inode)); + unsigned long max_blocks = bh->b_size >> inode->i_blkbits; + + if ((iblock + max_blocks) > end_block) { + max_blocks = end_block - iblock; + if ((long)max_blocks <= 0) { + if (create) + return -EIO; /* write fully beyond EOF */ + /* + * It is a read which is fully beyond EOF. 
We return + * a !buffer_mapped buffer + */ + max_blocks = 0; + } + } + + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + bh->b_size = max_blocks << inode->i_blkbits; + if (max_blocks) + set_buffer_mapped(bh); + return 0; +} + +static ssize_t +blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, + nr_segs, blkdev_get_blocks, NULL, NULL, 0); +} + +int __sync_blockdev(struct block_device *bdev, int wait) +{ + if (!bdev) + return 0; + if (!wait) + return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + return __sync_blockdev(bdev, 1); +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = sync_filesystem(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} +EXPORT_SYMBOL(fsync_bdev); + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. + */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (++bdev->bd_fsfreeze_count > 1) { + /* + * We don't even need to grab a reference - the first call + * to freeze_bdev grab an active reference and only the last + * thaw_bdev drops it. + */ + sb = get_super(bdev); + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + + sb = get_active_super(bdev); + if (!sb) + goto out; + error = freeze_super(sb); + if (error) { + deactivate_super(sb); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + deactivate_super(sb); + out: + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 
+ */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = -EINVAL; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) + goto out; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out; + + if (!sb) + goto out; + + error = thaw_super(sb); + if (error) { + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } +out: + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + return block_write_begin(mapping, pos, len, flags, pagep, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + page_cache_release(page); + + return ret; +} + +/* + * private llseek: + * for a block special file file->f_path.dentry->d_inode->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t block_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *bd_inode = file->f_mapping->host; + loff_t size; + loff_t retval; + + mutex_lock(&bd_inode->i_mutex); + size = i_size_read(bd_inode); + + switch (origin) { + case 2: + offset += size; + break; + case 1: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= size) { + if (offset != file->f_pos) { + file->f_pos = offset; + } + retval = offset; + } + mutex_unlock(&bd_inode->i_mutex); + return retval; +} + +int blkdev_fsync(struct file *filp, int datasync) +{ + struct inode *bd_inode = filp->f_mapping->host; + struct block_device *bdev = I_BDEV(bd_inode); + int error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. 
+ */ + mutex_unlock(&bd_inode->i_mutex); + + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); + if (error == -EOPNOTSUPP) + error = 0; + + mutex_lock(&bd_inode->i_mutex); + + return error; +} +EXPORT_SYMBOL(blkdev_fsync); + +/* + * pseudo-fs + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static struct kmem_cache * bdev_cachep __read_mostly; + +static struct inode *bdev_alloc_inode(struct super_block *sb) +{ + struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void bdev_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct bdev_inode *bdi = BDEV_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(bdev_cachep, bdi); +} + +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + +static void init_once(void *foo) +{ + struct bdev_inode *ei = (struct bdev_inode *) foo; + struct block_device *bdev = &ei->bdev; + + memset(bdev, 0, sizeof(*bdev)); + mutex_init(&bdev->bd_mutex); + INIT_LIST_HEAD(&bdev->bd_inodes); + INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif + inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. */ + mutex_init(&bdev->bd_fsfreeze_mutex); +} + +static inline void __bd_forget(struct inode *inode) +{ + list_del_init(&inode->i_devices); + inode->i_bdev = NULL; + inode->i_mapping = &inode->i_data; +} + +static void bdev_evict_inode(struct inode *inode) +{ + struct block_device *bdev = &BDEV_I(inode)->bdev; + struct list_head *p; + truncate_inode_pages(&inode->i_data, 0); + invalidate_inode_buffers(inode); /* is it needed here? */ + end_writeback(inode); + spin_lock(&bdev_lock); + while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { + __bd_forget(list_entry(p, struct inode, i_devices)); + } + list_del_init(&bdev->bd_list); + spin_unlock(&bdev_lock); +} + +static const struct super_operations bdev_sops = { + .statfs = simple_statfs, + .alloc_inode = bdev_alloc_inode, + .destroy_inode = bdev_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = bdev_evict_inode, +}; + +static struct dentry *bd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); +} + +static struct file_system_type bd_type = { + .name = "bdev", + .mount = bd_mount, + .kill_sb = kill_anon_super, +}; + +struct super_block *blockdev_superblock __read_mostly; + +void __init bdev_cache_init(void) +{ + int err; + struct vfsmount *bd_mnt; + + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); + err = register_filesystem(&bd_type); + if (err) + panic("Cannot register bdev pseudo-fs"); + bd_mnt = kern_mount(&bd_type); + if (IS_ERR(bd_mnt)) + panic("Cannot create bdev pseudo-fs"); + /* + * This vfsmount structure is only used to obtain the + * blockdev_superblock, so tell kmemleak not to report it. + */ + kmemleak_not_leak(bd_mnt); + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ +} + +/* + * Most likely _very_ bad one - but then it's hardly critical for small + * /dev and can be fixed when somebody will need really large one. + * Keep in mind that it will be fed through icache hash function too. 
+ */ +static inline unsigned long hash(dev_t dev) +{ + return MAJOR(dev)+MINOR(dev); +} + +static int bdev_test(struct inode *inode, void *data) +{ + return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; +} + +static int bdev_set(struct inode *inode, void *data) +{ + BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; + return 0; +} + +static LIST_HEAD(all_bdevs); + +struct block_device *bdget(dev_t dev) +{ + struct block_device *bdev; + struct inode *inode; + + inode = iget5_locked(blockdev_superblock, hash(dev), + bdev_test, bdev_set, &dev); + + if (!inode) + return NULL; + + bdev = &BDEV_I(inode)->bdev; + + if (inode->i_state & I_NEW) { + bdev->bd_contains = NULL; + bdev->bd_inode = inode; + bdev->bd_block_size = (1 << inode->i_blkbits); + bdev->bd_part_count = 0; + bdev->bd_invalidated = 0; + inode->i_mode = S_IFBLK; + inode->i_rdev = dev; + inode->i_bdev = bdev; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + inode->i_data.backing_dev_info = &default_backing_dev_info; + spin_lock(&bdev_lock); + list_add(&bdev->bd_list, &all_bdevs); + spin_unlock(&bdev_lock); + unlock_new_inode(inode); + } + return bdev; +} + +EXPORT_SYMBOL(bdget); + +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + ihold(bdev->bd_inode); + return bdev; +} + +long nr_blockdev_pages(void) +{ + struct block_device *bdev; + long ret = 0; + spin_lock(&bdev_lock); + list_for_each_entry(bdev, &all_bdevs, bd_list) { + ret += bdev->bd_inode->i_mapping->nrpages; + } + spin_unlock(&bdev_lock); + return ret; +} + +void bdput(struct block_device *bdev) +{ + iput(bdev->bd_inode); +} + +EXPORT_SYMBOL(bdput); + +static struct block_device *bd_acquire(struct inode *inode) +{ + struct block_device *bdev; + + spin_lock(&bdev_lock); + bdev = inode->i_bdev; + if (bdev) { + ihold(bdev->bd_inode); + spin_unlock(&bdev_lock); + return bdev; + } + spin_unlock(&bdev_lock); + + bdev = bdget(inode->i_rdev); + if (bdev) { + spin_lock(&bdev_lock); + if (!inode->i_bdev) { + /* + * We take an additional reference to bd_inode, + * and it's released in clear_inode() of inode. + * So, we can access it via ->i_mapping always + * without igrab(). + */ + ihold(bdev->bd_inode); + inode->i_bdev = bdev; + inode->i_mapping = bdev->bd_inode->i_mapping; + list_add(&inode->i_devices, &bdev->bd_inodes); + } + spin_unlock(&bdev_lock); + } + return bdev; +} + +/* Call when you free inode */ + +void bd_forget(struct inode *inode) +{ + struct block_device *bdev = NULL; + + spin_lock(&bdev_lock); + if (inode->i_bdev) { + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); + } + spin_unlock(&bdev_lock); + + if (bdev) + iput(bdev->bd_inode); +} + +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whether @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. 
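The claiming helpers below are internal; exclusive opens normally reach them through blkdev_get()/blkdev_get_by_path() with FMODE_EXCL, which are assumed here (they live elsewhere in this file) rather than shown. A hedged sketch:

static struct block_device *claim_by_path(const char *path, void *holder)
{
	/* FMODE_EXCL routes the open through the claiming logic below; a
	 * second claim with a different holder fails with -EBUSY. Release
	 * with blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL). */
	return blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
}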
+ */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + if (bdev->bd_holder == holder) + return true; /* already a holder */ + else if (bdev->bd_holder != NULL) + return false; /* held by someone else */ + else if (bdev->bd_contains == bdev) + return true; /* is a whole device which isn't held */ + + else if (whole->bd_holder == bd_may_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ + else + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev. This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} + +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_finish_claiming() or bd_abort_claiming() (which do not + * fail). + * + * This function is used to gain exclusive access to the block device + * without actually causing other exclusive open attempts to fail. It + * should be used when the open sequence itself requires exclusive + * access but may subsequently fail. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. 
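+ *
+ * Put differently (illustrative): claiming sda3 makes @whole the
+ * whole-disk bdev of sda obtained via bdget_disk(disk, 0), while
+ * claiming the whole disk sda itself just takes an extra reference
+ * and keeps @whole == @bdev.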
+ */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + + module_put(disk->fops->owner); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +#ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This functions creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&bdev->bd_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_part->holder_dir); + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); +out_unlock: + mutex_unlock(&bdev->bd_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. 
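+ *
+ * A minimal pairing sketch for a stacking driver; sda_bdev and dm_disk
+ * are placeholder names for the claimed slave bdev and the holding
+ * gendisk:
+ *
+ *	err = bd_link_disk_holder(sda_bdev, dm_disk);
+ *	if (err)
+ *		return err;
+ *	...
+ *	bd_unlink_disk_holder(sda_bdev, dm_disk);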
+ */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&bdev->bd_mutex); + + holder = bd_find_holder_disk(bdev, disk); + + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + + mutex_unlock(&bdev->bd_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#endif + +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes + * + * Invalidates all buffer-cache entries on a disk. It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev, bool kill_dirty) +{ + if (__invalidate_device(bdev, kill_dirty)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_partitionable(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + +/** + * check_disk_size_change - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev, false); + } +} +EXPORT_SYMBOL(check_disk_size_change); + +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev; + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + +/* + * This routine checks whether a removable media has been changed, + * and invalidates all buffer-cache-entries in that case. This + * is a relatively slow routine, so we have to try to minimize using + * it. Thus it is called only upon a 'mount' or 'open'. This + * is the best way of combining speed and utility, I think. 
+ * People changing diskettes in the middle of an operation deserve + * to lose :-) + */ +int check_disk_change(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + const struct block_device_operations *bdops = disk->fops; + unsigned int events; + + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return 0; + + flush_disk(bdev, true); + if (bdops->revalidate_disk) + bdops->revalidate_disk(bdev->bd_disk); + return 1; +} + +EXPORT_SYMBOL(check_disk_change); + +void bd_set_size(struct block_device *bdev, loff_t size) +{ + unsigned bsize = bdev_logical_block_size(bdev); + + bdev->bd_inode->i_size = size; + while (bsize < PAGE_CACHE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} +EXPORT_SYMBOL(bd_set_size); + +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); + +/* + * bd_mutex locking: + * + * mutex_lock(part->bd_mutex) + * mutex_lock_nested(whole->bd_mutex, 1) + */ + +static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) +{ + struct gendisk *disk; + struct module *owner; + int ret; + int partno; + int perm = 0; + + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + if (!for_part) { + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } + } + + restart: + + ret = -ENXIO; + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + goto out; + owner = disk->fops->owner; + + disk_block_events(disk); + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (!bdev->bd_openers) { + bdev->bd_disk = disk; + bdev->bd_contains = bdev; + if (!partno) { + struct backing_dev_info *bdi; + + ret = -ENXIO; + bdev->bd_part = disk_get_part(disk, partno); + if (!bdev->bd_part) + goto out_clear; + + ret = 0; + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. + * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); + goto restart; + } + } + + if (!ret && !bdev->bd_openers) { + bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) + bdi = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, bdi); + } + + /* + * If the device is invalidated, rescan partition + * if open succeeded or failed with -ENOMEDIUM. + * The latter is necessary to prevent ghost + * partitions on a removed medium. 
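+	 * (Illustrative case: a removable-media drive whose medium was
+	 *  pulled typically fails ->open() with -ENOMEDIUM, so the
+	 *  invalidate_partitions() branch below drops the stale
+	 *  partition entries instead of leaving ghosts behind.)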
+ */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } + if (ret) + goto out_clear; + } else { + struct block_device *whole; + whole = bdget_disk(disk, 0); + ret = -ENOMEM; + if (!whole) + goto out_clear; + BUG_ON(for_part); + ret = __blkdev_get(whole, mode, 1); + if (ret) + goto out_clear; + bdev->bd_contains = whole; + bdev_inode_switch_bdi(bdev->bd_inode, + whole->bd_inode->i_data.backing_dev_info); + bdev->bd_part = disk_get_part(disk, partno); + if (!(disk->flags & GENHD_FL_UP) || + !bdev->bd_part || !bdev->bd_part->nr_sects) { + ret = -ENXIO; + goto out_clear; + } + bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + } + } else { + if (bdev->bd_contains == bdev) { + ret = 0; + if (bdev->bd_disk->fops->open) + ret = bdev->bd_disk->fops->open(bdev, mode); + /* the same as first opener case, read comment there */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } + if (ret) + goto out_unlock_bdev; + } + /* only one opener holds refs to the module and disk */ + put_disk(disk); + module_put(owner); + } + bdev->bd_openers++; + if (for_part) + bdev->bd_part_count++; + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + return 0; + + out_clear: + disk_put_part(bdev->bd_part); + bdev->bd_disk = NULL; + bdev->bd_part = NULL; + bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, mode, 1); + bdev->bd_contains = NULL; + out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); + out: + bdput(bdev); + + return ret; +} + +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +{ + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + if (whole) { + struct gendisk *disk = whole->bd_disk; + + /* finish claiming */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + if (!res) { + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders + * will be incremented twice, and bd_holder + * will be set to bd_may_claim before being + * set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims if requested. Any + * write holder makes the write_holder state stick until + * all are released. 
This is good enough and tracking + * individual writeable reference is too fragile given the + * way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + disk_block_events(disk); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(whole); + } + + return res; +} +EXPORT_SYMBOL(blkdev_get); + +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + +static int blkdev_open(struct inode * inode, struct file * filp) +{ + struct block_device *bdev; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. 
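+ *
+ * Illustrative flow (the device name is made up): a userspace
+ *
+ *	open("/dev/vda", O_RDWR | O_EXCL);
+ *
+ * arrives here; O_EXCL is translated into FMODE_EXCL below, and
+ * blkdev_get() then claims the device with the struct file itself as
+ * the exclusive holder.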
+ */ + filp->f_flags |= O_LARGEFILE; + + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + + bdev = bd_acquire(inode); + if (bdev == NULL) + return -ENOMEM; + + filp->f_mapping = bdev->bd_inode->i_mapping; + + return blkdev_get(bdev, filp->f_mode, filp); +} + +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +{ + int ret = 0; + struct gendisk *disk = bdev->bd_disk; + struct block_device *victim = NULL; + + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (for_part) + bdev->bd_part_count--; + + if (!--bdev->bd_openers) { + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(disk, mode); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); + } + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + if (victim) + __blkdev_put(victim, mode, 1); + return ret; +} + +int blkdev_put(struct block_device *bdev, fmode_t mode) +{ + if (mode & FMODE_EXCL) { + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. bd_mutex is to + * synchronize disk_holder unlinking. + */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + + /* bd_contains might point to self, check in a separate step */ + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free) { + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + disk_check_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } + } + + mutex_unlock(&bdev->bd_mutex); + } + + return __blkdev_put(bdev, mode, 0); +} +EXPORT_SYMBOL(blkdev_put); + +static int blkdev_close(struct inode * inode, struct file * filp) +{ + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + + return blkdev_put(bdev, filp->f_mode); +} + +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct block_device *bdev = I_BDEV(file->f_mapping->host); + fmode_t mode = file->f_mode; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + return blkdev_ioctl(bdev, mode, cmd, arg); +} + +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. 
+ */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + +/* + * Try to release a page associated with block device when the system + * is under memory pressure. + */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + +static const struct address_space_operations def_blk_aops = { + .readpage = blkdev_readpage, + .writepage = blkdev_writepage, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = generic_writepages, + .releasepage = blkdev_releasepage, + .direct_IO = blkdev_direct_IO, +}; + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_close, + .llseek = block_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = blkdev_aio_write, + .mmap = generic_file_mmap, + .fsync = blkdev_fsync, + .unlocked_ioctl = block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) +{ + int res; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + res = blkdev_ioctl(bdev, 0, cmd, arg); + set_fs(old_fs); + return res; +} + +EXPORT_SYMBOL(ioctl_by_bdev); + +/** + * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device + * + * Get a reference to the blockdevice at @pathname in the current + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +struct block_device *lookup_bdev(const char *pathname) +{ + struct block_device *bdev; + struct inode *inode; + struct path path; + int error; + + if (!pathname || !*pathname) + return ERR_PTR(-EINVAL); + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return ERR_PTR(error); + + inode = path.dentry->d_inode; + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto fail; + error = -EACCES; + if (path.mnt->mnt_flags & MNT_NODEV) + goto fail; + error = -ENOMEM; + bdev = bd_acquire(inode); + if (!bdev) + goto fail; +out: + path_put(&path); + return bdev; +fail: + bdev = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL(lookup_bdev); + +int __invalidate_device(struct block_device *bdev, bool kill_dirty) +{ + struct super_block *sb = get_super(bdev); + int res = 0; + + if (sb) { + /* + * no need to lock the super, get_super holds the + * read mutex so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). 
+ */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb, kill_dirty); + drop_super(sb); + } + invalidate_bdev(bdev); + return res; +} +EXPORT_SYMBOL(__invalidate_device); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig new file mode 100644 index 00000000..ecb9fd3b --- /dev/null +++ b/fs/btrfs/Kconfig @@ -0,0 +1,33 @@ +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 00000000..9b72dcf1 --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,10 @@ + +obj-$(CONFIG_BTRFS_FS) := btrfs.o + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 00000000..f66fc995 --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + kfree(value); + return acl; + } + set_cached_acl(inode, type, acl); + } + kfree(value); + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ + acl = NULL; + set_cached_acl(inode, type, acl); + } else { + acl = ERR_PTR(-EIO); + } + + return acl; +} + +static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + + acl = btrfs_get_acl(dentry->d_inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + inode->i_mode = mode; + } + ret = 0; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); +out: + kfree(value); + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int ret; + struct posix_acl *acl = NULL; + + if (!inode_owner_or_capable(dentry->d_inode)) + return -EPERM; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); +out: + posix_acl_release(acl); + + return ret; +} + +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + int error = -EAGAIN; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + error = -ECHILD; + + } else { + struct posix_acl *acl; + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current_umask(); + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(trans, inode, acl, + ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(trans, inode, clone, + ACL_TYPE_ACCESS); + } + } + posix_acl_release(clone); + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +const struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, +}; + +const struct xattr_handler 
btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, +}; + +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} + +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 00000000..7ec14097 --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,718 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + struct list_head prio_pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + /* reference counter for this struct */ + atomic_t refs; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. 
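+ *
+ * Concretely (a sketch of the indirection implemented below): rather
+ * than calling btrfs_start_workers(queue, 1) directly from a worker
+ * thread, start_new_worker() queues a worker_start item on
+ * queue->atomic_worker_start, and that single helper thread performs
+ * the blocking kthread_run() on the caller's behalf.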
+ */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + + /* the list may be empty if the worker is just starting */ + if (!list_empty(&worker->worker_list)) { + list_move(&worker->worker_list, + &worker->workers->idle_list); + } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + + if (!list_empty(&worker->worker_list)) { + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +{ + struct btrfs_workers *workers = worker->workers; + unsigned long flags; + + rmb(); + if (!workers->atomic_start_pending) + return; + + spin_lock_irqsave(&workers->lock, flags); + if (!workers->atomic_start_pending) + goto out; + + workers->atomic_start_pending = 0; + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) + goto out; + + workers->num_workers_starting += 1; + spin_unlock_irqrestore(&workers->lock, flags); + start_new_worker(workers); + return; + +out: + spin_unlock_irqrestore(&workers->lock, flags); +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); 
+ } + + spin_unlock(&workers->order_lock); + return 0; +} + +static void put_worker(struct btrfs_worker_thread *worker) +{ + if (atomic_dec_and_test(&worker->refs)) + kfree(worker); +} + +static int try_worker_shutdown(struct btrfs_worker_thread *worker) +{ + int freeit = 0; + + spin_lock_irq(&worker->lock); + spin_lock(&worker->workers->lock); + if (worker->workers->num_workers > 1 && + worker->idle && + !worker->working && + !list_empty(&worker->worker_list) && + list_empty(&worker->prio_pending) && + list_empty(&worker->pending) && + atomic_read(&worker->num_pending) == 0) { + freeit = 1; + list_del_init(&worker->worker_list); + worker->workers->num_workers--; + } + spin_unlock(&worker->workers->lock); + spin_unlock_irq(&worker->lock); + + if (freeit) + put_worker(worker); + return freeit; +} + +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, + struct list_head *prio_head, + struct list_head *head) +{ + struct btrfs_work *work = NULL; + struct list_head *cur = NULL; + + if(!list_empty(prio_head)) + cur = prio_head->next; + + smp_mb(); + if (!list_empty(&worker->prio_pending)) + goto refill; + + if (!list_empty(head)) + cur = head->next; + + if (cur) + goto out; + +refill: + spin_lock_irq(&worker->lock); + list_splice_tail_init(&worker->prio_pending, prio_head); + list_splice_tail_init(&worker->pending, head); + + if (!list_empty(prio_head)) + cur = prio_head->next; + else if (!list_empty(head)) + cur = head->next; + spin_unlock_irq(&worker->lock); + + if (!cur) + goto out_fail; + +out: + work = list_entry(cur, struct btrfs_work, list); + +out_fail: + return work; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head head; + struct list_head prio_head; + struct btrfs_work *work; + + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + + do { +again: + while (1) { + + + work = get_next_work(worker, &prio_head, &head); + if (!work) + break; + + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. + */ + run_ordered_completions(worker->workers, work); + + check_pending_worker_creates(worker); + + } + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); + refrigerator(); + } else { + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) { + cpu_relax(); + /* + * we've dropped the lock, did someone else + * jump_in? 
+ */ + smp_mb(); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) + continue; + + /* + * this short schedule allows more work to + * come in without the queue functions + * needing to go through wake_up_process() + * + * worker->working is still 1, so nobody + * is going to try and wake us up + */ + schedule_timeout(1); + smp_mb(); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) + continue; + + if (kthread_should_stop()) + break; + + /* still no more work?, sleep for real */ + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) { + spin_unlock_irq(&worker->lock); + set_current_state(TASK_RUNNING); + goto again; + } + + /* + * this makes sure we get a wakeup when someone + * adds something new to the queue + */ + worker->working = 0; + spin_unlock_irq(&worker->lock); + + if (!kthread_should_stop()) { + schedule_timeout(HZ * 120); + if (!worker->working && + try_worker_shutdown(worker)) { + return 0; + } + } + } + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + int can_stop; + + spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + + atomic_inc(&worker->refs); + workers->num_workers -= 1; + if (!list_empty(&worker->worker_list)) { + list_del_init(&worker->worker_list); + put_worker(worker); + can_stop = 1; + } else + can_stop = 0; + spin_unlock_irq(&workers->lock); + if (can_stop) + kthread_stop(worker->task); + spin_lock_irq(&workers->lock); + put_worker(worker); + } + spin_unlock_irq(&workers->lock); + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) +{ + workers->num_workers = 0; + workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); + spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; + workers->atomic_start_pending = 0; + workers->atomic_worker_start = async_helper; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. 
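+ *
+ * A minimal setup sketch using the public wrappers (the "demo" name,
+ * the max of 4 threads and the some_work item are placeholders; the
+ * work item must have ->func set and ->flags zeroed before queueing):
+ *
+ *	struct btrfs_workers workers;
+ *
+ *	btrfs_init_workers(&workers, "demo", 4, NULL);
+ *	btrfs_start_workers(&workers, 1);
+ *	btrfs_queue_worker(&workers, &some_work);
+ *	...
+ *	btrfs_stop_workers(&workers);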
+ */ +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + kfree(worker); + goto fail; + } + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. 
+ */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + struct list_head *fallback; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + + if (!worker) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { + goto fallback; + } else if (workers->atomic_worker_start) { + workers->atomic_start_pending = 1; + goto fallback; + } else { + workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + __btrfs_start_workers(workers, 1); + goto again; + } + } + goto found; + +fallback: + fallback = NULL; + /* + * we have failed to find any workers, just + * return the first one we can find. + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); +found: + /* + * this makes sure the worker doesn't exit before it is placed + * onto a busy/idle list + */ + atomic_inc(&worker->num_pending); + spin_unlock_irqrestore(&workers->lock, flags); + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + int wake = 0; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); + atomic_inc(&worker->num_pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { + wake = 1; + worker->working = 1; + } + + if (wake) + wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); +out: + + return 0; +} + +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + /* + * you're not allowed to do ordered queues from an + * interrupt handler + */ + spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } + spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + 
list_add_tail(&work->list, &worker->pending); + check_busy_worker(worker); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + if (wake) + wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 00000000..5077746c --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + int num_workers_starting; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* more workers required, but in an interrupt handler */ + int atomic_start_pending; + + /* + * are we allowed to sleep while starting workers or are we required + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. + */ + struct btrfs_workers *atomic_worker_start; + + /* list with all the work threads. 
The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + struct list_head prio_order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* lock for the ordered lists */ + spinlock_t order_lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); +int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 00000000..52d7eca8 --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" +#include "delayed-inode.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. 
+ */ + struct list_head delalloc_inodes; + + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + atomic_t outstanding_extents; + atomic_t reserved_extents; + + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. 
+ */ + unsigned ordered_data_close:1; + unsigned orphan_meta_reserved:1; + unsigned dummy_inode:1; + unsigned in_defrag:1; + + /* + * always compress this one file + */ + unsigned force_compress:4; + + struct btrfs_delayed_node *delayed_node; + + struct inode vfs_inode; +}; + +extern unsigned char btrfs_filetype_table[]; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline u64 btrfs_ino(struct inode *inode) +{ + u64 ino = BTRFS_I(inode)->location.objectid; + + if (ino <= BTRFS_FIRST_FREE_OBJECTID) + ino = inode->i_ino; + return ino; +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + i_size_write(inode, size); + BTRFS_I(inode)->disk_i_size = size; +} + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 00000000..7c4503ef --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 00000000..bfe42b03 --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,1029 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %llu " + "extent %llu csum %u " + "wanted %u mirror %d\n", + (unsigned long long)btrfs_ino(inode), + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). 
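As a worked example of the compressed_bio_size() arithmetic above (assuming the usual 4 KiB sectors and 4-byte crc32c checksums): a 128 KiB compressed extent needs (131072 + 4095) / 4096 = 32 checksum slots, so the allocation is sizeof(struct compressed_bio) + 32 * 4 bytes. Those 128 extra bytes become the variable-length array that cb->sums points into and that check_compressed_csum() walks, one u32 per compressed page.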
+ * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. 
+ */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int pg_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if(!bio) { + kfree(cb); + return -ENOMEM; + } + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + page = compressed_pages[pg_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. 
Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long pg_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + while (last_offset < compressed_end) { + pg_index = last_offset >> PAGE_CACHE_SHIFT; + + if (pg_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, pg_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = __page_cache_alloc(mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) + break; + + if (add_to_page_cache_lru(page, mapping, pg_index, + GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. 
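The inc-before-submit rule that btrfs_submit_compressed_write() follows above is a general completion-counting idiom: keep the shared control block's count above zero for every bio that is outstanding (and for work not yet submitted), so that only the very last completion tears the block down. A simplified stand-alone sketch of the idiom, not a line-for-line copy of the btrfs code, with made-up names (struct ctl, complete_one):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ctl {				/* stand-in for struct compressed_bio */
	atomic_int pending;
	int errors;
};

static void complete_one(struct ctl *cb, int err)
{
	if (err)
		cb->errors = 1;
	/* only the final completion may free the control block */
	if (atomic_fetch_sub(&cb->pending, 1) == 1) {
		printf("all pieces done, errors=%d\n", cb->errors);
		free(cb);
	}
}

int main(void)
{
	struct ctl *cb = calloc(1, sizeof(*cb));
	int i;

	if (!cb)
		return 1;
	atomic_store(&cb->pending, 1);		/* submitter's own reference */
	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&cb->pending, 1);	/* count the piece before it can complete */
		complete_one(cb, 0);			/* completion may run at any time after submit */
	}
	complete_one(cb, 0);	/* drop the submitter's reference; the last drop frees */
	return 0;
}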
+ */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long pg_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret = -ENOMEM; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if 
(!cb->compressed_pages[pg_index]) + goto fail2; + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + page = cb->compressed_pages[pg_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); + } + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; + +fail2: + for (pg_index = 0; pg_index < nr_pages; pg_index++) + free_page((unsigned long)cb->compressed_pages[pg_index]); + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; +} + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. 
+ */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add_tail(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -1; + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*pg_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. 
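A concrete walk-through of the offset arithmetic in btrfs_decompress_buf2page() below, assuming 4 KiB pages: say the decompressor has just produced uncompressed bytes [buf_start = 8192, total_out = 12288) in its one-page working buffer, and the destination page sits 8192 bytes into the extent, so start_byte = 8192. The early total_out <= start_byte check does not fire, buf_start is not below start_byte so buf_offset stays 0, working_bytes = 4096, and the loop copies one full page of data; *pg_offset then reaches PAGE_CACHE_SIZE and the code advances to the next page in the biovec. If the working buffer had only reached total_out = 8192, the function would instead return 1 straight away, telling the caller that more decompressed output is needed before this page can be filled.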
+ */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*pg_index)++; + if (*pg_index >= vcnt) + return 0; + + page_out = bvec[*pg_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 00000000..a12059f4 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset); + +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 00000000..2e667868 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,4402 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + return path; +} + +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) +{ + int i; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + if (!p) + return; + btrfs_release_path(p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. 
+ */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = rcu_dereference(root->node); + extent_buffer_get(eb); + rcu_read_unlock(); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + int ret = 0; + int level; + struct btrfs_disk_key disk_key; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + cow = btrfs_alloc_free_block(trans, root, buf->len, 0, + new_root_objectid, &disk_key, level, + buf->start, 0); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. 
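To make the sharing test below concrete: in a reference-counted (ref_cows) subvolume whose root item records last_snapshot = 60, a tree block with header generation 50 already existed when that snapshot was taken, so generation <= last_snapshot holds and the block may still be referenced by the snapshot; a block written in generation 70 without the RELOC header flag postdates every snapshot, so btrfs_block_can_be_shared() returns 0 and update_ref_for_cow() can assume a single reference instead of looking the count up in the extent tree.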
+ */ + if (root->ref_cows && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (root->ref_cows && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow, + int *last_ref) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + buf->len, &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, 0); + BUG_ON(ret); + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); + } + clean_tree_block(trans, root, buf); + *last_ref = 1; + } + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. 
+ */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *cow; + int level; + int last_ref = 0; + int unlock_orig = 0; + u64 parent_start; + + if (*cow_ret == buf) + unlock_orig = 1; + + btrfs_assert_tree_locked(buf); + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; + + cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, + root->root_key.objectid, &disk_key, + level, search_start, empty_size); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + /* cow is set to blocking by btrfs_init_new_buffer */ + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + update_ref_for_cow(trans, root, buf, cow, &last_ref); + + if (root->ref_cows) + btrfs_reloc_cow_block(trans, root, buf, cow); + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; + + extent_buffer_get(cow); + rcu_assign_pointer(root->node, cow); + + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 0; + return 1; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. 
+ * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + if (!should_cow_block(trans, root, buf)) { + *cow_ret = buf; + return 0; + } + + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +/* + * same as comp_keys only with two btrfs_key's + */ +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + btrfs_set_lock_blocking(parent); + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + 
&parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + if (!cur) + return -EIO; + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize)); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. 
+ * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. 
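The binary-search helpers earlier in this file (generic_bin_search()/bin_search()) return 0 with *slot at an exact match, or 1 with *slot at the position where the key would have to be inserted, possibly one past the last item. A stand-alone sketch of that contract over a plain int array, leaving out the extent-buffer page mapping the real code has to do:

#include <stdio.h>

/* Return 0 and set *slot to the match, or return 1 and set *slot to the
 * index where key would be inserted to keep the array sorted. */
static int bin_search_slot(const int *items, int nritems, int key, int *slot)
{
	int low = 0, high = nritems;

	while (low < high) {
		int mid = (low + high) / 2;

		if (items[mid] < key)
			low = mid + 1;
		else if (items[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;		/* equals nritems if key is larger than every item */
	return 1;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 };
	int slot;

	printf("%d %d\n", bin_search_slot(keys, 4, 30, &slot), slot);	/* 0 2 */
	printf("%d %d\n", bin_search_slot(keys, 4, 25, &slot), slot);	/* 1 2 */
	printf("%d %d\n", bin_search_slot(keys, 4, 99, &slot), slot);	/* 1 4 */
	return 0;
}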
+ */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + BUG_ON(!child); + btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } + + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return 0; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + btrfs_header_nritems(mid); + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + btrfs_header_nritems(mid); + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer(right); + right = NULL; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. 
+ * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer(mid); + mid = NULL; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. 
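As a rough guide to when these two balancing paths fire: balance_level above only does work once a node has fallen to a quarter of its pointer capacity or less, while the insertion path (push_nodes_for_insert below, and the split check later in setup_nodes_for_search) only acts when a node is within a few slots of full. A toy sketch of those thresholds; DEMO_PTRS_PER_BLOCK is a made-up stand-in for BTRFS_NODEPTRS_PER_BLOCK(root):

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PTRS_PER_BLOCK 121         /* illustrative capacity only */

/* deletion path: rebalance only once a node is down to 1/4 full or less */
static bool demo_needs_merge(unsigned int nritems)
{
        return nritems <= DEMO_PTRS_PER_BLOCK / 4;
}

/* insertion path: split or push only when the node is nearly full */
static bool demo_needs_split(unsigned int nritems)
{
        return nritems >= DEMO_PTRS_PER_BLOCK - 3;
}

int main(void)
{
        printf("30 items:  merge=%d split=%d\n",
               demo_needs_merge(30), demo_needs_split(30));     /* merge=1 split=0 */
        printf("119 items: merge=%d split=%d\n",
               demo_needs_merge(119), demo_needs_split(119));   /* merge=0 split=1 */
        return 0;
}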
+ */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. 
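The readahead walk sketched by this comment only touches pointers whose blocks sit within 64KiB of the block about to be read, and it gives up after queueing roughly 64KiB or scanning 32 neighbouring slots. A standalone sketch of that filter; the constants mirror the loop in reada_for_search below, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

#define DEMO_READA_WINDOW   65536u      /* only prefetch blocks within 64KiB of the target */
#define DEMO_READA_MAX_READ 65536u      /* stop once this much has been queued */
#define DEMO_READA_MAX_SCAN 32u         /* or once this many slots were scanned */

static int demo_should_reada(uint64_t target, uint64_t candidate,
                             uint64_t nread, unsigned int nscan)
{
        uint64_t dist = candidate > target ? candidate - target
                                           : target - candidate;

        if (nread > DEMO_READA_MAX_READ || nscan > DEMO_READA_MAX_SCAN)
                return 0;               /* already queued/scanned enough */
        return dist <= DEMO_READA_WINDOW;
}

int main(void)
{
        printf("%d\n", demo_should_reada(1 << 20, (1 << 20) + 4096, 0, 0));    /* 1: close by */
        printf("%d\n", demo_should_reada(1 << 20, (1 << 20) * 2, 0, 0));       /* 0: too far */
        return 0;
}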
+ */ +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 target; + u64 nread = 0; + u64 gen; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + bool map = true; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + target = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + if (node->map_token || path->skip_locking) + map = false; + + while (1) { + if (map && !node->map_token) { + unsigned long offset = btrfs_node_key_ptr_offset(nr); + map_private_extent_buffer(node, offset, + sizeof(struct btrfs_key_ptr), + &node->map_token, + &node->kaddr, + &node->map_start, + &node->map_len, KM_USER1); + } + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { + gen = btrfs_node_ptr_generation(node, nr); + if (map && node->map_token) { + unmap_extent_buffer(node, node->map_token, + KM_USER1); + node->map_token = NULL; + } + readahead_tree_block(root, search, blocksize, gen); + nread += blocksize; + } + nscan++; + if ((nread > 65536 || nscan > 32)) + break; + } + if (map && node->map_token) { + unmap_extent_buffer(node, node->map_token, KM_USER1); + node->map_token = NULL; + } +} + +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level + 1]; + if (!parent) + return 0; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level + 1]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot + 1 < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); + } + if (block1 || block2) { + ret = -EAGAIN; + + /* release the whole path */ + btrfs_release_path(path); + + /* read the blocks */ + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block2) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; +} + + +/* + * when we walk down the tree, it 
is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + +/* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + int ret; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp) { + if (btrfs_buffer_uptodate(tmp, 0)) { + if (btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without + * sleeping, return + * right away + */ + *eb_ret = tmp; + return 0; + } + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. 
+ * We must do this without dropping locks so + * we can trust our generation number + */ + free_extent_buffer(tmp); + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; + } + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. + */ + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + btrfs_release_path(p); + + ret = -EAGAIN; + tmp = read_tree_block(root, blocknr, blocksize, 0); + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!btrfs_buffer_uptodate(tmp, 0)) + ret = -EIO; + free_extent_buffer(tmp); + } + return ret; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. 
if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + +again: + if (p->search_commit_root) { + b = root->commit_root; + extent_buffer_get(b); + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + } + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(p); + + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { + ret = err; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + btrfs_clear_path_blocking(p, NULL); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. 
+ */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + unlock_up(p, level, lowest_unlock); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + btrfs_clear_path_blocking(p, NULL); + err = btrfs_try_spin_lock(b); + + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b); + } + } + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + btrfs_set_path_blocking(p); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(err > 0); + if (err) { + ret = err; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + return ret; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. 
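btrfs_set_item_key_safe above relies on a simple invariant: a key may be rewritten in place only if the new value still sorts strictly between its neighbours, so the leaf never needs re-sorting. The same guard over a plain array, as a sketch with a simplified key type (the demo_* names are not from this patch):

#include <stdint.h>
#include <stdio.h>

struct demo_key {
        uint64_t objectid;
        uint8_t type;
        uint64_t offset;
};

static int demo_comp_keys(const struct demo_key *a, const struct demo_key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}

/*
 * Overwrite keys[slot] with *new_key only if the array stays sorted,
 * mirroring the neighbour checks above.  Returns 0 on success, -1 otherwise.
 */
static int demo_set_key_safe(struct demo_key *keys, unsigned int nritems,
                             unsigned int slot, const struct demo_key *new_key)
{
        if (slot > 0 && demo_comp_keys(&keys[slot - 1], new_key) >= 0)
                return -1;
        if (slot + 1 < nritems && demo_comp_keys(&keys[slot + 1], new_key) <= 0)
                return -1;
        keys[slot] = *new_key;
        return 0;
}

int main(void)
{
        struct demo_key keys[3] = { {1, 0, 0}, {5, 0, 0}, {9, 0, 0} };
        struct demo_key ok = {6, 0, 0}, bad = {10, 0, 0};

        printf("%d\n", demo_set_key_safe(keys, 3, 1, &ok));   /* 0: still sorted */
        printf("%d\n", demo_set_key_safe(keys, 3, 1, &bad));  /* -1: would pass keys[2] */
        return 0;
}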
+ */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. 
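push_node_left and balance_node_right above move whole key pointers between two siblings: copy a run into the destination, memmove the source to close (or open) the gap, then fix up both item counts. The same mechanics on ordinary arrays, as a simplified sketch (the capacity and element type are invented, and the real code additionally refuses to drain a source node below a minimum unless it is deliberately emptying it):

#include <stdio.h>
#include <string.h>

#define DEMO_CAP 8

struct demo_ptr {
        unsigned long long blockptr;
        unsigned long long gen;
};

/*
 * Move up to the first 'want' entries of src onto the end of dst and close
 * the hole in src.  Returns how many entries actually moved.
 */
static unsigned int demo_push_left(struct demo_ptr *dst, unsigned int *dst_nr,
                                   struct demo_ptr *src, unsigned int *src_nr,
                                   unsigned int want)
{
        unsigned int room = DEMO_CAP - *dst_nr;
        unsigned int push = want < room ? want : room;

        if (push > *src_nr)
                push = *src_nr;
        if (!push)
                return 0;

        memcpy(dst + *dst_nr, src, push * sizeof(*src));
        memmove(src, src + push, (*src_nr - push) * sizeof(*src));
        *dst_nr += push;
        *src_nr -= push;
        return push;
}

int main(void)
{
        struct demo_ptr left[DEMO_CAP] = { {1, 1}, {2, 1} };
        struct demo_ptr mid[DEMO_CAP]  = { {5, 1}, {6, 1}, {7, 1} };
        unsigned int left_nr = 2, mid_nr = 3;
        unsigned int moved = demo_push_left(left, &left_nr, mid, &mid_nr, 2);

        printf("moved=%u left_nr=%u mid_nr=%u left[2]=%llu\n",
               moved, left_nr, mid_nr, left[2].blockptr);
        /* moved=2 left_nr=4 mid_nr=1 left[2]=5 */
        return 0;
}

The keys stay sorted without any comparison because everything in the left sibling is already smaller than everything in the source node.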
+ */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, &lower_key, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + old = root->node; + rcu_assign_pointer(root->node, c); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + BUG_ON(slot > nritems); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. 
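split_node, defined below, cuts a full node at mid = (c_nritems + 1) / 2, copies the upper half into a fresh sibling, and then points the path at whichever half now contains the original slot. The slot bookkeeping in isolation, as a sketch with illustrative names:

#include <stdio.h>

struct demo_split {
        unsigned int mid;       /* first index that moved to the new right node */
        unsigned int slot;      /* slot within whichever node the path now uses */
        int use_right;          /* 1 if the path should follow the new node */
};

static struct demo_split demo_split_node(unsigned int nritems, unsigned int slot)
{
        struct demo_split s;

        s.mid = (nritems + 1) / 2;
        if (slot >= s.mid) {            /* our position moved to the new node */
                s.use_right = 1;
                s.slot = slot - s.mid;
        } else {                        /* our position stays in the original */
                s.use_right = 0;
                s.slot = slot;
        }
        return s;
}

int main(void)
{
        struct demo_split s = demo_split_node(121, 80);

        printf("mid=%u right=%d slot=%u\n", s.mid, s.use_right, s.slot);
        /* mid=61 right=1 slot=19 */
        return 0;
}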
+ * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, + &disk_key, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(split, root->root_key.objectid); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + wret = insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. 
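A leaf keeps its item headers growing forward from the start of the data area and the item payloads growing backward from the end, so the space a run of items uses is their headers plus the span from the last payload's offset to the end of the first payload, and free space is whatever is left of the block. leaf_space_used above computes exactly that; here is the same accounting over a plain array, with a made-up block size and header size standing in for BTRFS_LEAF_DATA_SIZE() and sizeof(struct btrfs_item):

#include <stdio.h>

#define DEMO_LEAF_DATA_SIZE 4096u       /* illustrative block data area */
#define DEMO_ITEM_HDR_SIZE  25u         /* illustrative per-item header */

struct demo_item {
        unsigned int offset;            /* start of this item's data in the block */
        unsigned int size;              /* length of that data */
};

/* bytes used by item headers plus item data for items [start, start + nr) */
static unsigned int demo_space_used(const struct demo_item *items,
                                    unsigned int nritems,
                                    unsigned int start, unsigned int nr)
{
        unsigned int end, data_len;

        if (!nr)
                return 0;
        end = (start + nr < nritems ? start + nr : nritems) - 1;
        /* data spans from the last item's offset to the end of the first */
        data_len = (items[start].offset + items[start].size) - items[end].offset;
        return data_len + nr * DEMO_ITEM_HDR_SIZE;
}

static int demo_free_space(const struct demo_item *items, unsigned int nritems)
{
        return (int)DEMO_LEAF_DATA_SIZE -
               (int)demo_space_used(items, nritems, 0, nritems);
}

int main(void)
{
        /* two items whose data is packed against the end of the block */
        struct demo_item items[2] = {
                { 4096 - 100, 100 },            /* item 0 owns the last 100 bytes */
                { 4096 - 100 - 300, 300 },      /* item 1 owns the 300 bytes before it */
        };

        printf("free=%d\n", demo_free_space(items, 2)); /* 4096 - 400 - 2*25 = 3646 */
        return 0;
}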
IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems, + u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *upper = path->nodes[1]; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + + if (empty) + nr = 0; + else + nr = max_t(u32, 1, min_slot); + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if 
(!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root, left); + + btrfs_mark_buffer_dirty(right); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the + * items + */ +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, u32 right_nritems, + u32 max_slot) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + int i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + if (empty) + nr = min(right_nritems, max_slot); + else + nr = min(right_nritems - 1, max_slot); + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if (push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + 
btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root, right); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. 
+ */ +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + struct btrfs_disk_key disk_key; + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + return ret; +} + +/* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. 
+ * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int ret = 0; + int wret; + int split; + int num_doubles = 0; + int tried_avoid_double = 0; + + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size_nr(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + return -EOVERFLOW; + + /* first try to make some room by pushing left and right */ + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + split = 1; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2 ; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + root->root_key.objectid, + &disk_key, 0, l->start, 0); + if (IS_ERR(right)) + return PTR_ERR(right); + + root_add_used(root, root->leafsize); + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + 
ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + } + btrfs_mark_buffer_dirty(right); + return ret; + } + + ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + BUG_ON(ret); + + if (split == 2) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + + return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; +} + +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } + btrfs_release_path(path); + + path->keep_locks = 1; + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + path->search_for_split = 0; + if (ret < 0) + goto err; + + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there or got smaller, return now */ + if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + /* the leaf has changed, it now has room. 
return now */ + if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; + } + + btrfs_set_path_blocking(path); + ret = split_leaf(trans, root, &key, path, ins_len, 1); + if (ret) + goto err; + + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + + btrfs_set_path_blocking(path); + + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + + slot = path->slots[0] + 1; + nritems = btrfs_header_nritems(leaf); + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); + return ret; +} + +/* + * This function duplicate a item, giving 'new_key' to the new item. 
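split_item above divides one item's payload in two without moving the rest of the leaf: the new entry takes the tail (item_size - split_offset bytes) at the original data offset, and the old entry keeps the first split_offset bytes, relocated so the two ranges stay contiguous. The offset arithmetic on its own, as a sketch with illustrative names:

#include <stdio.h>

struct demo_item {
        unsigned int offset;    /* start of the item's data in the leaf */
        unsigned int size;      /* length of that data */
};

/*
 * Given an item occupying [offset, offset + size) and a split point measured
 * from the start of the item, compute the two resulting entries the way
 * split_item assigns them.
 */
static void demo_split_item(struct demo_item orig, unsigned int split_offset,
                            struct demo_item *first, struct demo_item *second)
{
        /* new item: the tail of the data, left where the data already is */
        second->offset = orig.offset;
        second->size = orig.size - split_offset;

        /* old item: the head of the data, shifted up next to the tail */
        first->offset = orig.offset + orig.size - split_offset;
        first->size = split_offset;
}

int main(void)
{
        struct demo_item orig = { 1000, 64 }, first, second;

        demo_split_item(orig, 40, &first, &second);
        printf("first:  offset=%u size=%u\n", first.offset, first.size);    /* 1024, 40 */
        printf("second: offset=%u size=%u\n", second.offset, second.size);  /* 1000, 24 */
        return 0;
}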
+ * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + ret = setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int slot; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return 0; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int slot; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return 0; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. + */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot + */ +int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) +{ + struct btrfs_item *item; + int i; + u32 nritems; + unsigned int data_end; + struct btrfs_disk_key disk_key; + int ret; + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + + btrfs_set_header_nritems(leaf, nritems + nr); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot = path->slots[0]; + BUG_ON(slot < 0); + + ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + total_data, total_size, nr); + +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. 
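/*
 * A hedged sketch of the usual caller-side pattern for the insert helpers
 * above; it is not code from this patch.  A fixed-size item is reserved
 * with btrfs_insert_empty_item() (the single-item wrapper declared in
 * ctree.h) and then filled in through the returned path.  The function
 * name example_insert and the choice of an inode item are made up, and
 * error handling is abbreviated.
 */
static int example_insert(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_inode_item *item;
        struct extent_buffer *leaf;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        /* search, make room if needed and reserve sizeof(*item) bytes */
        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*item));
        if (ret == 0) {
                leaf = path->nodes[0];
                item = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_inode_item);
                /* ... fill in *item with the btrfs_set_ helpers ... */
                btrfs_mark_buffer_dirty(leaf);
        }
        btrfs_free_path(path);
        return ret;
}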
+ */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) +{ + int ret; + + WARN_ON(btrfs_header_generation(leaf) != trans->transid); + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + + root_sub_used(root, leaf->len); + + btrfs_free_tree_block(trans, root, leaf, 0, 1); + return 0; +} +/* + * delete the item at the leaf level in path. 
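/*
 * A minimal illustrative sketch, not taken from this patch: del_ptr()
 * above removes one key pointer by sliding the tail of the ptrs[] array
 * down a slot and decrementing nritems.  The same move on a plain array;
 * toy_del_ptr is a made-up name.
 */
#include <assert.h>
#include <string.h>

static unsigned toy_del_ptr(unsigned long *ptrs, unsigned nritems,
                            unsigned slot)
{
        if (slot != nritems - 1)
                memmove(&ptrs[slot], &ptrs[slot + 1],
                        (nritems - slot - 1) * sizeof(ptrs[0]));
        return nritems - 1;     /* the new pointer count */
}

int main(void)
{
        unsigned long ptrs[] = { 10, 20, 30, 40 };
        unsigned n = toy_del_ptr(ptrs, 4, 1);

        assert(n == 3 && ptrs[0] == 10 && ptrs[1] == 30 && ptrs[2] == 40);
        return 0;
}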
If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + btrfs_set_path_blocking(path); + clean_tree_block(trans, root, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + btrfs_set_path_blocking(path); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, leaf); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. 
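/*
 * A hedged sketch of how the deletion helper above is normally driven; it
 * is not code from this patch.  btrfs_del_item() is the nr == 1 wrapper
 * around btrfs_del_items() declared in ctree.h.  The name example_delete
 * is made up and error handling is abbreviated.
 */
static int example_delete(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct btrfs_key *key)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* ins_len == -1 asks search_slot to prepare the path for deletion */
        ret = btrfs_search_slot(trans, root, key, path, -1, 1);
        if (ret == 0)                   /* exact match found */
                ret = btrfs_del_item(trans, root, path);
        else if (ret > 0)               /* key not present */
                ret = -ENOENT;

        btrfs_free_path(path);
        return ret;
}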
+ */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. 
+ */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + btrfs_set_path_blocking(path); + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + btrfs_set_path_blocking(path); + cur = read_node_slot(root, cur, slot); + BUG_ON(!cur); + + btrfs_tree_lock(cur); + + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + btrfs_clear_path_blocking(path, NULL); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. 
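/*
 * A hedged sketch of the walking pattern btrfs_search_forward() above is
 * built for (the btree defrag and tree logging code do variants of this):
 * visit keys in order, skipping anything older than min_trans.  The name
 * example_walk and the simple key stepping are made up; it is not code
 * from this patch.
 */
static int example_walk(struct btrfs_root *root, u64 min_trans)
{
        struct btrfs_key min_key = { 0 };
        struct btrfs_key max_key = { (u64)-1, (u8)-1, (u64)-1 };
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->keep_locks = 1;           /* required by btrfs_search_forward */

        while (1) {
                ret = btrfs_search_forward(root, &min_key, &max_key, path,
                                           0, min_trans);
                if (ret)        /* < 0 is an error, 1 means nothing newer */
                        break;

                /* ... min_key now names the next interesting item ... */

                btrfs_release_path(path);

                /* step one key past what we just handled */
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < (u8)-1) {
                        min_key.offset = 0;
                        min_key.type++;
                } else if (min_key.objectid < (u64)-1) {
                        min_key.offset = 0;
                        min_key.type = 0;
                        min_key.objectid++;
                } else {
                        break;
                }
        }
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}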
+ */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int level, + int cache_only, u64 min_trans) +{ + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) + return 1; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; + } + + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level; + struct extent_buffer *c; + struct extent_buffer *next; + struct btrfs_key key; + u32 nritems; + int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; + btrfs_release_path(path); + + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. 
+ */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + if (ret == 0) + path->slots[0]++; + ret = 0; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) { + ret = 1; + goto done; + } + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + + if (!level) + break; + + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + btrfs_assert_tree_locked(path->nodes[level]); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); + } + } + ret = 0; +done: + unlock_up(path, 0, 1); + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == type) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 00000000..66179bcb --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2683 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
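/*
 * A hedged sketch of the canonical read-only scan built from
 * btrfs_search_slot() plus btrfs_next_leaf() above; it is not code from
 * this patch.  It walks every item of one objectid; the name example_scan
 * is made up and the per-item work is elided.
 */
static int example_scan(struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key, found;
        struct extent_buffer *leaf;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = 0;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        while (1) {
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret)        /* 1 == no more leaves, < 0 == error */
                                break;
                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &found, path->slots[0]);
                if (found.objectid != objectid)
                        break;          /* past the object we care about */

                /* ... examine the item in slot path->slots[0] ... */

                path->slots[0]++;
        }
        ret = ret < 0 ? ret : 0;
out:
        btrfs_free_path(path);
        return ret;
}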
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" +#include "ioctl.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +struct btrfs_pending_snapshot; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_MAX_LEVEL 8 + +#define BTRFS_COMPAT_EXTENT_TREE_V0 + +/* + * files bigger than this get some pre-flushing when they are added + * to the ordered operations list. That way we limit the total + * work done by the commit + */ +#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + +/* + * The inode number assigned to the special inode for sotring + * free ino cache + */ +#define BTRFS_FREE_INO_OBJECTID -12ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. 
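/*
 * A minimal illustrative sketch, not taken from this patch: the ordering
 * the key comment above describes.  Keys sort by objectid, then type,
 * then offset; toy_comp_keys is a made-up name that mirrors the in-kernel
 * cpu-key comparison.
 */
static int toy_comp_keys(const struct btrfs_key *a, const struct btrfs_key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}
/*
 * So every item belonging to one inode (one objectid) is adjacent in the
 * tree, and within that range the type groups the streams: the inode item
 * sorts before, say, its extent data items.
 */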
The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allow for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +#define 
BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + +/* + * every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) + + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. 
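/*
 * A worked example, not taken from this patch: what BTRFS_LEAF_DATA_SIZE
 * and BTRFS_NODEPTRS_PER_BLOCK above come out to for a 4KiB block,
 * assuming the packed sizes of the structures in this header
 * (btrfs_header is 101 bytes, btrfs_key_ptr 33, btrfs_item 25).
 */
#include <stdio.h>

int main(void)
{
        unsigned blocksize = 4096;
        unsigned header = 101, key_ptr = 33;
        unsigned leaf_data = blocksize - header;     /* 3995 usable bytes */
        unsigned node_fanout = leaf_data / key_ptr;  /* 121 key pointers  */

        /*
         * Item headers fill that area from the front and their payload
         * fills it from the back, so a payload of S bytes really consumes
         * S + 25 bytes; leaf free space is whatever is left between the
         * last item header and the lowest payload offset.
         */
        printf("leaf data %u bytes, node fanout %u\n", leaf_data, node_fanout);
        return 0;
}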
+ */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + __le64 cache_generation; + + /* future expansion */ + __le64 reserved[31]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. 
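/*
 * A hedged sketch of how the path structure defined below is consumed
 * after a search; it is not code from this patch.  nodes[0]/slots[0]
 * always name the leaf and the slot within it, higher levels record the
 * route taken down from the root.  The name example_lookup is made up.
 */
static int example_lookup(struct btrfs_root *root, struct btrfs_key *key,
                          struct btrfs_path *path,
                          struct btrfs_inode_item **item_ret)
{
        int ret;

        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret < 0)
                return ret;     /* IO or other error */
        if (ret > 0)
                return -ENOENT; /* slot is where the key would be inserted */

        *item_ret = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                   struct btrfs_inode_item);
        return 0;               /* the caller releases the path when done */
}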
+ */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; + unsigned int search_commit_root:1; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ + +struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { + __le32 refs; +} __attribute__ ((__packed__)); + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +/* + * this flag is only used internally by scrub and may be changed at any time + * it is only declared here to avoid collisions + */ +#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 count; +} __attribute__ ((__packed__)); + + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. 
The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, +}; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). 
This + * always reflects the size uncompressed and without encoding. + */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + spinlock_t lock; + struct rw_semaphore groups_sem; + atomic_t caching_threads; +}; + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + u64 freed[2]; + struct btrfs_space_info *space_info; + struct list_head list; + spinlock_t lock; + atomic_t usage; + unsigned int priority:8; + unsigned int durable:1; + unsigned int refill_used:1; + unsigned int full:1; +}; + +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. 
+ */ + struct list_head block_group_list; +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN = 0, + BTRFS_DC_ERROR = 1, + BTRFS_DC_CLEAR = 2, + BTRFS_DC_SETUP = 3, + BTRFS_DC_NEED_WRITE = 4, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_block_group_cache *block_group; + u64 progress; + atomic_t count; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 pinned; + u64 reserved; + u64 reserved_pinned; + u64 bytes_super; + u64 flags; + u64 sectorsize; + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; + + int disk_cache_state; + + /* cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; +}; + +struct reloc_control; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_delayed_root; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree freed_extents[2]; + struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + /* + * block reservation for extent, checksum, root tree and + * delayed dir index item + */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + /* list of block reservations that cross multiple transactions */ + struct list_head durable_block_rsv_list; + + struct mutex durable_block_rsv_mutex; + + u64 generation; + u64 last_trans_committed; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt:20; + unsigned long compress_type:4; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct 
block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + struct mutex volume_mutex; + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + struct rw_semaphore extent_commit_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + struct srcu_struct subvol_srcu; + + spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + + /* + * special rename and truncate targets that must be on disk before + * we're allowed to commit. This is basically the ext3 style + * data=ordered list. + */ + struct list_head ordered_operations; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers endio_freespace_worker; + struct btrfs_workers submit_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. 
It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct btrfs_workers delayed_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + int enospc_unlink; + int trans_no_join; + + u64 total_pinned; + + /* protected by the delalloc lock, used to keep from writing + * metadata until there is a nice batch + */ + u64 dirty_metadata_bytes; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ + struct list_head space_info; + + struct reloc_control *reloc_ctl; + + spinlock_t delalloc_lock; + u64 delalloc_bytes; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* auto defrag inodes go here */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + unsigned data_chunk_allocations; + unsigned metadata_ratio; + + void *bdev_holder; + + /* private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + struct rw_semaphore scrub_super_lock; + int scrub_workers_refcnt; + struct btrfs_workers scrub_workers; + + /* filesystem state */ + u64 fs_state; + + struct btrfs_delayed_root *delayed_root; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. 
+ */ +struct btrfs_root { + struct extent_buffer *node; + + struct extent_buffer *commit_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + + /* free ino cache stuff */ + struct mutex fs_commit_mutex; + struct btrfs_free_space_ctl *free_ino_ctl; + enum btrfs_caching_type cached; + spinlock_t cache_lock; + wait_queue_head_t cache_wait; + struct btrfs_free_space_ctl *free_ino_pinned; + u64 cache_progress; + struct inode *cache_inode; + + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; + unsigned long last_log_commit; + unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + + u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; + int ref_cows; + int track_dirty; + int in_radix; + + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + char *name; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + struct list_head root_list; + + spinlock_t orphan_lock; + struct list_head orphan_list; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_item_inserted; + int orphan_cleanup_state; + + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + + /* + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock + */ + struct radix_tree_root delayed_nodes_tree; + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; +}; + + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. 
There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. They are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +/* + * Flags for mount options. 
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) +#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) +#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) +#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) +#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) + +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct 
btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct 
btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); 
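As a quick illustration of how the generated accessors above fit together, here is a minimal sketch only: the function name is hypothetical, the slot is assumed to hold a BTRFS_INODE_ITEM_KEY item, and btrfs_item_ptr() is defined further down in this header.

static void example_read_inode_item(struct extent_buffer *leaf, int slot)
{
	struct btrfs_inode_item *ii;
	struct btrfs_timespec *atime;
	u64 size, atime_sec;
	u32 mode;

	/* map the item payload inside the leaf */
	ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);

	/* each accessor reads a little-endian field out of the extent buffer */
	size = btrfs_inode_size(leaf, ii);
	mode = btrfs_inode_mode(leaf, ii);

	/* timestamps are nested structs, reached through the offset helpers */
	atime = btrfs_inode_atime(ii);
	atime_sec = btrfs_timespec_sec(leaf, atime);

	printk(KERN_INFO "inode item: size=%llu mode=%o atime=%llu\n",
	       (unsigned long long)size, mode, (unsigned long long)atime_sec);
}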
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + 
sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct 
btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << 
BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); 
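The *_STACK_* variants operate on an ordinary in-memory struct and do the little-endian conversion directly, rather than reading through an extent buffer; this is how the superblock copies embedded in btrfs_fs_info are accessed. A minimal sketch follows, with a hypothetical helper name and using the super_copy field declared earlier in this header.

static void example_dump_super(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *sb = &fs_info->super_copy;

	/* stack accessors convert endianness on the in-memory copy */
	u64 total = btrfs_super_total_bytes(sb);
	u32 sectorsize = btrfs_super_sectorsize(sb);
	u64 devices = btrfs_super_num_devices(sb);

	printk(KERN_INFO "btrfs super: total_bytes=%llu sectorsize=%u devices=%llu\n",
	       (unsigned long long)total, sectorsize,
	       (unsigned long long)devices);
}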
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. 
*/ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +/* extent-tree.c */ +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, + int num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 3 * num_items; +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num, int reserved); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + int level); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct 
btrfs_root *root); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, 
u64 *actual_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +/* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int 
btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_orphan_roots(struct btrfs_root *tree_root); +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); + +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const 
void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, 
struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, u64 new_dirid); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_evict_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); +void btrfs_dirty_inode(struct inode *inode, int flags); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *was_new); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); +int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +extern const struct dentry_operations btrfs_dentry_operations; + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_pages); +/* file.c */ +int 
btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_sync_file(struct file *file, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +extern const struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); +void btrfs_drop_pages(struct page **pages, size_t num_pages); +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) + +/* acl.c */ +#ifdef CONFIG_BTRFS_FS_POSIX_ACL +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); +#else +#define btrfs_check_acl NULL +#endif +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); + +/* scrub.c */ +int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, + struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_pause(struct btrfs_root *root); +int btrfs_scrub_pause_super(struct btrfs_root *root); +int btrfs_scrub_continue(struct btrfs_root *root); +int btrfs_scrub_continue_super(struct btrfs_root *root); +int btrfs_scrub_cancel(struct btrfs_root *root); +int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c new file mode 100644 index 00000000..98c68e65 --- /dev/null +++ b/fs/btrfs/delayed-inode.c @@ -0,0 +1,1773 @@ +/* + * Copyright 
(C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "delayed-inode.h" +#include "disk-io.h" +#include "transaction.h" + +#define BTRFS_DELAYED_WRITEBACK 400 +#define BTRFS_DELAYED_BACKGROUND 100 + +static struct kmem_cache *delayed_node_cache; + +int __init btrfs_delayed_inode_init(void) +{ + delayed_node_cache = kmem_cache_create("delayed_node", + sizeof(struct btrfs_delayed_node), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!delayed_node_cache) + return -ENOMEM; + return 0; +} + +void btrfs_delayed_inode_exit(void) +{ + if (delayed_node_cache) + kmem_cache_destroy(delayed_node_cache); +} + +static inline void btrfs_init_delayed_node( + struct btrfs_delayed_node *delayed_node, + struct btrfs_root *root, u64 inode_id) +{ + delayed_node->root = root; + delayed_node->inode_id = inode_id; + atomic_set(&delayed_node->refs, 0); + delayed_node->count = 0; + delayed_node->in_list = 0; + delayed_node->inode_dirty = 0; + delayed_node->ins_root = RB_ROOT; + delayed_node->del_root = RB_ROOT; + mutex_init(&delayed_node->mutex); + delayed_node->index_cnt = 0; + INIT_LIST_HEAD(&delayed_node->n_list); + INIT_LIST_HEAD(&delayed_node->p_list); + delayed_node->bytes_reserved = 0; +} + +static inline int btrfs_is_continuous_delayed_item( + struct btrfs_delayed_item *item1, + struct btrfs_delayed_item *item2) +{ + if (item1->key.type == BTRFS_DIR_INDEX_KEY && + item1->key.objectid == item2->key.objectid && + item1->key.type == item2->key.type && + item1->key.offset + 1 == item2->key.offset) + return 1; + return 0; +} + +static inline struct btrfs_delayed_root *btrfs_get_delayed_root( + struct btrfs_root *root) +{ + return root->fs_info->delayed_root; +} + +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) +{ + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + struct btrfs_delayed_node *node; + + node = ACCESS_ONCE(btrfs_inode->delayed_node); + if (node) { + atomic_inc(&node->refs); + return node; + } + + spin_lock(&root->inode_lock); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { + if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); + spin_unlock(&root->inode_lock); + return node; + } + btrfs_inode->delayed_node = node; + atomic_inc(&node->refs); /* can be accessed */ + atomic_inc(&node->refs); /* cached in the inode */ + spin_unlock(&root->inode_lock); + return node; + } + spin_unlock(&root->inode_lock); + + return NULL; +} + +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + 
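/*
 * Lookup-or-create with a retry: the radix tree insert further down can race
 * with another task instantiating the same delayed node.  On -EEXIST the
 * freshly allocated node is freed and we jump back to "again" to pick up the
 * node the other task installed.  On success two references are held: one
 * for the inode's cached delayed_node pointer and one handed to the caller.
 */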
+again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); + + atomic_inc(&node->refs); /* cached in the btrfs inode */ + atomic_inc(&node->refs); /* can be accessed */ + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + kmem_cache_free(delayed_node_cache, node); + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + goto again; + } + btrfs_inode->delayed_node = node; + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + + return node; +} + +/* + * Call it when holding delayed_node->mutex + * + * If mod = 1, add this node into the prepared list. + */ +static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node, + int mod) +{ + spin_lock(&root->lock); + if (node->in_list) { + if (!list_empty(&node->p_list)) + list_move_tail(&node->p_list, &root->prepare_list); + else if (mod) + list_add_tail(&node->p_list, &root->prepare_list); + } else { + list_add_tail(&node->n_list, &root->node_list); + list_add_tail(&node->p_list, &root->prepare_list); + atomic_inc(&node->refs); /* inserted into list */ + root->nodes++; + node->in_list = 1; + } + spin_unlock(&root->lock); +} + +/* Call it when holding delayed_node->mutex */ +static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node) +{ + spin_lock(&root->lock); + if (node->in_list) { + root->nodes--; + atomic_dec(&node->refs); /* not in the list */ + list_del_init(&node->n_list); + if (!list_empty(&node->p_list)) + list_del_init(&node->p_list); + node->in_list = 0; + } + spin_unlock(&root->lock); +} + +struct btrfs_delayed_node *btrfs_first_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->node_list)) + goto out; + + p = delayed_root->node_list.next; + node = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +struct btrfs_delayed_node *btrfs_next_delayed_node( + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_root *delayed_root; + struct list_head *p; + struct btrfs_delayed_node *next = NULL; + + delayed_root = node->root->fs_info->delayed_root; + spin_lock(&delayed_root->lock); + if (!node->in_list) { /* not in the list */ + if (list_empty(&delayed_root->node_list)) + goto out; + p = delayed_root->node_list.next; + } else if (list_is_last(&node->n_list, &delayed_root->node_list)) + goto out; + else + p = node->n_list.next; + + next = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&next->refs); +out: + spin_unlock(&delayed_root->lock); + + return next; +} + +static void __btrfs_release_delayed_node( + struct btrfs_delayed_node *delayed_node, + int mod) +{ + struct btrfs_delayed_root *delayed_root; + + if (!delayed_node) + return; + + delayed_root = delayed_node->root->fs_info->delayed_root; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + btrfs_queue_delayed_node(delayed_root, delayed_node, mod); + else + btrfs_dequeue_delayed_node(delayed_root, delayed_node); + mutex_unlock(&delayed_node->mutex); + + if 
(atomic_dec_and_test(&delayed_node->refs)) { + struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); + if (atomic_read(&delayed_node->refs) == 0) { + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); + kmem_cache_free(delayed_node_cache, delayed_node); + } + spin_unlock(&root->inode_lock); + } +} + +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 0); +} + +struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->prepare_list)) + goto out; + + p = delayed_root->prepare_list.next; + list_del_init(p); + node = list_entry(p, struct btrfs_delayed_node, p_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static inline void btrfs_release_prepared_delayed_node( + struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 1); +} + +struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +{ + struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + if (item) { + item->data_len = data_len; + item->ins_or_del = 0; + item->bytes_reserved = 0; + item->delayed_node = NULL; + atomic_set(&item->refs, 1); + } + return item; +} + +/* + * __btrfs_lookup_delayed_item - look up the delayed item by key + * @delayed_node: pointer to the delayed node + * @key: the key to look up + * @prev: used to store the prev item if the right item isn't found + * @next: used to store the next item if the right item isn't found + * + * Note: if we don't find the right item, we will return the prev item and + * the next item. 
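 * For example, with items keyed {10, 30} in the tree, a lookup of 20 returns
 * NULL and sets *prev to the key-10 item and *next to the key-30 item.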
+ */ +static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( + struct rb_root *root, + struct btrfs_key *key, + struct btrfs_delayed_item **prev, + struct btrfs_delayed_item **next) +{ + struct rb_node *node, *prev_node = NULL; + struct btrfs_delayed_item *delayed_item = NULL; + int ret = 0; + + node = root->rb_node; + + while (node) { + delayed_item = rb_entry(node, struct btrfs_delayed_item, + rb_node); + prev_node = node; + ret = btrfs_comp_cpu_keys(&delayed_item->key, key); + if (ret < 0) + node = node->rb_right; + else if (ret > 0) + node = node->rb_left; + else + return delayed_item; + } + + if (prev) { + if (!prev_node) + *prev = NULL; + else if (ret < 0) + *prev = delayed_item; + else if ((node = rb_prev(prev_node)) != NULL) { + *prev = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *prev = NULL; + } + + if (next) { + if (!prev_node) + *next = NULL; + else if (ret > 0) + *next = delayed_item; + else if ((node = rb_next(prev_node)) != NULL) { + *next = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *next = NULL; + } + return NULL; +} + +struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + NULL, NULL); + return item; +} + +struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, + NULL, NULL); + return item; +} + +struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item, *next; + + item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + NULL, &next); + if (!item) + item = next; + + return item; +} + +struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item, *next; + + item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, + NULL, &next); + if (!item) + item = next; + + return item; +} + +static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, + struct btrfs_delayed_item *ins, + int action) +{ + struct rb_node **p, *node; + struct rb_node *parent_node = NULL; + struct rb_root *root; + struct btrfs_delayed_item *item; + int cmp; + + if (action == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_node->ins_root; + else if (action == BTRFS_DELAYED_DELETION_ITEM) + root = &delayed_node->del_root; + else + BUG(); + p = &root->rb_node; + node = &ins->rb_node; + + while (*p) { + parent_node = *p; + item = rb_entry(parent_node, struct btrfs_delayed_item, + rb_node); + + cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); + if (cmp < 0) + p = &(*p)->rb_right; + else if (cmp > 0) + p = &(*p)->rb_left; + else + return -EEXIST; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + ins->delayed_node = delayed_node; + ins->ins_or_del = action; + + if (ins->key.type == BTRFS_DIR_INDEX_KEY && + action == BTRFS_DELAYED_INSERTION_ITEM && + ins->key.offset >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->key.offset + 1; + + delayed_node->count++; + atomic_inc(&delayed_node->root->fs_info->delayed_root->items); + return 0; +} + +static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node 
*node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_INSERTION_ITEM); +} + +static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_DELETION_ITEM); +} + +static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) +{ + struct rb_root *root; + struct btrfs_delayed_root *delayed_root; + + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + + BUG_ON(!delayed_root); + BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && + delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); + + if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_item->delayed_node->ins_root; + else + root = &delayed_item->delayed_node->del_root; + + rb_erase(&delayed_item->rb_node, root); + delayed_item->delayed_node->count--; + atomic_dec(&delayed_root->items); + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + +static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) +{ + if (item) { + __btrfs_remove_delayed_item(item); + if (atomic_dec_and_test(&item->refs)) + kfree(item); + } +} + +struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->ins_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->del_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +struct btrfs_delayed_item *__btrfs_next_delayed_item( + struct btrfs_delayed_item *item) +{ + struct rb_node *p; + struct btrfs_delayed_item *next = NULL; + + p = rb_next(&item->rb_node); + if (p) + next = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return next; +} + +static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, + u64 root_id) +{ + struct btrfs_key root_key; + + if (root->objectid == root_id) + return root; + + root_key.objectid = root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + return btrfs_read_fs_root_no_name(root->fs_info, &root_key); +} + +static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->global_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + item->bytes_reserved = num_bytes; + + return ret; +} + +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *rsv; + + if (!item->bytes_reserved) + return; + + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, + item->bytes_reserved); +} + +static int btrfs_delayed_inode_reserve_metadata( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct 
btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->global_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + node->bytes_reserved = num_bytes; + + return ret; +} + +static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *rsv; + + if (!node->bytes_reserved) + return; + + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, + node->bytes_reserved); + node->bytes_reserved = 0; +} + +/* + * This helper will insert some continuous items into the same leaf according + * to the free space of the leaf. + */ +static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + int free_space; + int total_data_size = 0, total_size = 0; + struct extent_buffer *leaf; + char *data_ptr; + struct btrfs_key *keys; + u32 *data_size; + struct list_head head; + int slot; + int nitems; + int i; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + free_space = btrfs_leaf_free_space(root, leaf); + INIT_LIST_HEAD(&head); + + next = item; + nitems = 0; + + /* + * count the number of the continuous items that we can insert in batch + */ + while (total_size + next->data_len + sizeof(struct btrfs_item) <= + free_space) { + total_data_size += next->data_len; + total_size += next->data_len + sizeof(struct btrfs_item); + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + } + + if (!nitems) { + ret = 0; + goto out; + } + + /* + * we need allocate some memory space, but it might cause the task + * to sleep, so we set all locked nodes in the path to blocking locks + * first. + */ + btrfs_set_path_blocking(path); + + keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS); + if (!keys) { + ret = -ENOMEM; + goto out; + } + + data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS); + if (!data_size) { + ret = -ENOMEM; + goto error; + } + + /* get keys of all the delayed items */ + i = 0; + list_for_each_entry(next, &head, tree_list) { + keys[i] = next->key; + data_size[i] = next->data_len; + i++; + } + + /* reset all the locked nodes in the patch to spinning locks. */ + btrfs_clear_path_blocking(path, NULL); + + /* insert the keys of the items */ + ret = setup_items_for_insert(trans, root, path, keys, data_size, + total_data_size, total_size, nitems); + if (ret) + goto error; + + /* insert the dir index items */ + slot = path->slots[0]; + list_for_each_entry_safe(curr, next, &head, tree_list) { + data_ptr = btrfs_item_ptr(leaf, slot, char); + write_extent_buffer(leaf, &curr->data, + (unsigned long)data_ptr, + curr->data_len); + slot++; + + btrfs_delayed_item_release_metadata(root, curr); + + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +error: + kfree(data_size); + kfree(keys); +out: + return ret; +} + +/* + * This helper can just do simple insertion that needn't extend item for new + * data, such as directory name index insertion, inode insertion. 
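 * -EEXIST from the empty-item insertion is not treated as a failure below:
 * the search leaves the path at the existing item and its payload is simply
 * rewritten in place.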
+ */ +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *delayed_item) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + char *ptr; + int ret; + + ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, + delayed_item->data_len); + if (ret < 0 && ret != -EEXIST) + return ret; + + leaf = path->nodes[0]; + + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + + write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, + delayed_item->data_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_delayed_item_release_metadata(root, delayed_item); + return 0; +} + +/* + * we insert an item first, then if there are some continuous items, we try + * to insert those items into the same leaf. + */ +static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) + goto insert_end; + + ret = btrfs_insert_delayed_item(trans, root, path, curr); + if (ret < 0) { + btrfs_release_path(path); + goto insert_end; + } + + prev = curr; + curr = __btrfs_next_delayed_item(prev); + if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { + /* insert the continuous items into the same leaf */ + path->slots[0]++; + btrfs_batch_insert_items(trans, root, path, curr); + } + btrfs_release_delayed_item(prev); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +insert_end: + mutex_unlock(&node->mutex); + return ret; +} + +static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + struct extent_buffer *leaf; + struct btrfs_key key; + struct list_head head; + int nitems, i, last_item; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + + i = path->slots[0]; + last_item = btrfs_header_nritems(leaf) - 1; + if (i > last_item) + return -ENOENT; /* FIXME: Is errno suitable? 
*/ + + next = item; + INIT_LIST_HEAD(&head); + btrfs_item_key_to_cpu(leaf, &key, i); + nitems = 0; + /* + * count the number of the dir index items that we can delete in batch + */ + while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + + i++; + if (i > last_item) + break; + btrfs_item_key_to_cpu(leaf, &key, i); + } + + if (!nitems) + return 0; + + ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); + if (ret) + goto out; + + list_for_each_entry_safe(curr, next, &head, tree_list) { + btrfs_delayed_item_release_metadata(root, curr); + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +out: + return ret; +} + +static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_deletion_item(node); + if (!curr) + goto delete_fail; + + ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + if (ret < 0) + goto delete_fail; + else if (ret > 0) { + /* + * can't find the item which the node points to, so this node + * is invalid, just drop it. + */ + prev = curr; + curr = __btrfs_next_delayed_item(prev); + btrfs_release_delayed_item(prev); + ret = 0; + btrfs_release_path(path); + if (curr) + goto do_again; + else + goto delete_fail; + } + + btrfs_batch_delete_items(trans, root, path, curr); + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +delete_fail: + btrfs_release_path(path); + mutex_unlock(&node->mutex); + return ret; +} + +static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + if (delayed_node && delayed_node->inode_dirty) { + BUG_ON(!delayed_node->root); + delayed_node->inode_dirty = 0; + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + atomic_dec(&delayed_root->items); + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); + } +} + +static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + struct btrfs_key key; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int ret; + + mutex_lock(&node->mutex); + if (!node->inode_dirty) { + mutex_unlock(&node->mutex); + return 0; + } + + key.objectid = node->inode_id; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + ret = btrfs_lookup_inode(trans, root, path, &key, 1); + if (ret > 0) { + btrfs_release_path(path); + mutex_unlock(&node->mutex); + return -ENOENT; + } else if (ret < 0) { + mutex_unlock(&node->mutex); + return ret; + } + + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + btrfs_delayed_inode_release_metadata(root, node); + btrfs_release_delayed_inode(node); + mutex_unlock(&node->mutex); + + return 0; +} + +/* Called when committing the transaction. 
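 * Every delayed node on the fs-wide list is processed in turn: pending
 * insertions are flushed first, then pending deletions, then the deferred
 * inode item update, with trans->block_rsv temporarily switched to the
 * global block reserve for the duration.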
*/ +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + root = curr_node->root; + ret = btrfs_insert_delayed_items(trans, path, root, + curr_node); + if (!ret) + ret = btrfs_delete_delayed_items(trans, path, root, + curr_node); + if (!ret) + ret = btrfs_update_delayed_inode(trans, root, path, + curr_node); + if (ret) { + btrfs_release_delayed_node(curr_node); + break; + } + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } + + btrfs_free_path(path); + trans->block_rsv = block_rsv; + return ret; +} + +static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_delayed_node *node) +{ + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (!ret) + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (!ret) + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + btrfs_free_path(path); + + trans->block_rsv = block_rsv; + return ret; +} + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +void btrfs_remove_delayed_node(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node); + if (!delayed_node) + return; + + BTRFS_I(inode)->delayed_node = NULL; + btrfs_release_delayed_node(delayed_node); +} + +struct btrfs_async_delayed_node { + struct btrfs_root *root; + struct btrfs_delayed_node *delayed_node; + struct btrfs_work work; +}; + +static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +{ + struct btrfs_async_delayed_node *async_node; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + unsigned long nr = 0; + int need_requeue = 0; + int ret; + + async_node = container_of(work, struct btrfs_async_delayed_node, work); + + path = btrfs_alloc_path(); + if (!path) + goto out; + path->leave_spinning = 1; + + delayed_node = async_node->delayed_node; + root = delayed_node->root; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto free_path; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + + ret = 
btrfs_insert_delayed_items(trans, path, root, delayed_node); + if (!ret) + ret = btrfs_delete_delayed_items(trans, path, root, + delayed_node); + + if (!ret) + btrfs_update_delayed_inode(trans, root, path, delayed_node); + + /* + * Maybe new delayed items have been inserted, so we need requeue + * the work. Besides that, we must dequeue the empty delayed nodes + * to avoid the race between delayed items balance and the worker. + * The race like this: + * Task1 Worker thread + * count == 0, needn't requeue + * also needn't insert the + * delayed node into prepare + * list again. + * add lots of delayed items + * queue the delayed node + * already in the list, + * and not in the prepare + * list, it means the delayed + * node is being dealt with + * by the worker. + * do delayed items balance + * the delayed node is being + * dealt with by the worker + * now, just wait. + * the worker goto idle. + * Task1 will sleep until the transaction is commited. + */ + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + need_requeue = 1; + else + btrfs_dequeue_delayed_node(root->fs_info->delayed_root, + delayed_node); + mutex_unlock(&delayed_node->mutex); + + nr = trans->blocks_used; + + trans->block_rsv = block_rsv; + btrfs_end_transaction_dmeta(trans, root); + __btrfs_btree_balance_dirty(root, nr); +free_path: + btrfs_free_path(path); +out: + if (need_requeue) + btrfs_requeue_work(&async_node->work); + else { + btrfs_release_prepared_delayed_node(delayed_node); + kfree(async_node); + } +} + +static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, + struct btrfs_root *root, int all) +{ + struct btrfs_async_delayed_node *async_node; + struct btrfs_delayed_node *curr; + int count = 0; + +again: + curr = btrfs_first_prepared_delayed_node(delayed_root); + if (!curr) + return 0; + + async_node = kmalloc(sizeof(*async_node), GFP_NOFS); + if (!async_node) { + btrfs_release_prepared_delayed_node(curr); + return -ENOMEM; + } + + async_node->root = root; + async_node->delayed_node = curr; + + async_node->work.func = btrfs_async_run_delayed_node_done; + async_node->work.flags = 0; + + btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); + count++; + + if (all || count < 4) + goto again; + + return 0; +} + +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + +void btrfs_balance_delayed_items(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + + delayed_root = btrfs_get_delayed_root(root); + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return; + + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int ret; + ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + if (ret) + return; + + wait_event_interruptible_timeout( + delayed_root->wait, + (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND), + HZ); + return; + } + + btrfs_wq_run_delayed_node(delayed_root, root, 0); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *delayed_item; + struct btrfs_dir_item *dir_item; + int ret; + + delayed_node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + 
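/*
 * A delayed item carries its payload inline in item->data[] (the zero-length
 * array at the end of struct btrfs_delayed_item), so the allocation below
 * reserves room for the struct btrfs_dir_item plus the name bytes, which are
 * copied directly behind the dir item:
 *
 *	[ struct btrfs_delayed_item | struct btrfs_dir_item | name ]
 */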
delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + if (!delayed_item) { + ret = -ENOMEM; + goto release_node; + } + + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + delayed_item->key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); + delayed_item->key.offset = index; + + dir_item = (struct btrfs_dir_item *)delayed_item->data; + dir_item->location = *disk_key; + dir_item->transid = cpu_to_le64(trans->transid); + dir_item->data_len = 0; + dir_item->name_len = cpu_to_le16(name_len); + dir_item->type = type; + memcpy((char *)(dir_item + 1), name, name_len); + + mutex_lock(&delayed_node->mutex); + ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + if (unlikely(ret)) { + printk(KERN_ERR "err add delayed dir index item(name: %s) into " + "the insertion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)\n", + name, + (unsigned long long)delayed_node->root->objectid, + (unsigned long long)delayed_node->inode_id, + ret); + BUG(); + } + mutex_unlock(&delayed_node->mutex); + +release_node: + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, + struct btrfs_delayed_node *node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_lookup_delayed_insertion_item(node, key); + if (!item) { + mutex_unlock(&node->mutex); + return 1; + } + + btrfs_delayed_item_release_metadata(root, item); + btrfs_release_delayed_item(item); + mutex_unlock(&node->mutex); + return 0; +} + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_key item_key; + int ret; + + node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(node)) + return PTR_ERR(node); + + item_key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); + item_key.offset = index; + + ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); + if (!ret) + goto end; + + item = btrfs_alloc_delayed_item(0); + if (!item) { + ret = -ENOMEM; + goto end; + } + + item->key = item_key; + + ret = btrfs_delayed_item_reserve_metadata(trans, root, item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible. + */ + BUG_ON(ret); + + mutex_lock(&node->mutex); + ret = __btrfs_add_delayed_deletion_item(node, item); + if (unlikely(ret)) { + printk(KERN_ERR "err add delayed dir index item(index: %llu) " + "into the deletion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)\n", + (unsigned long long)index, + (unsigned long long)node->root->objectid, + (unsigned long long)node->inode_id, + ret); + BUG(); + } + mutex_unlock(&node->mutex); +end: + btrfs_release_delayed_node(node); + return ret; +} + +int btrfs_inode_delayed_dir_index_count(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + + if (!delayed_node) + return -ENOENT; + + /* + * Since we have held i_mutex of this directory, it is impossible that + * a new directory index is added into the delayed node and index_cnt + * is updated now. So we needn't lock the delayed node. 
+ */ + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); + return -EINVAL; + } + + BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; + btrfs_release_delayed_node(delayed_node); + return 0; +} + +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *item; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + mutex_lock(&delayed_node->mutex); + item = __btrfs_first_delayed_insertion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, ins_list); + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, del_list); + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&delayed_node->mutex); + /* + * This delayed node is still cached in the btrfs inode, so refs + * must be > 1 now, and we needn't check it is going to be freed + * or not. + * + * Besides that, this function is used to read dir, we do not + * insert/delete delayed items in this period. So we also needn't + * requeue or dequeue this delayed node. + */ + atomic_dec(&delayed_node->refs); +} + +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_item *curr, *next; + + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } +} + +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index) +{ + struct btrfs_delayed_item *curr, *next; + int ret; + + if (list_empty(del_list)) + return 0; + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + if (curr->key.offset > index) + break; + + list_del(&curr->readdir_list); + ret = (curr->key.offset == index); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (ret) + return 1; + else + continue; + } + return 0; +} + +/* + * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree + * + */ +int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, + filldir_t filldir, + struct list_head *ins_list) +{ + struct btrfs_dir_item *di; + struct btrfs_delayed_item *curr, *next; + struct btrfs_key location; + char *name; + int name_len; + int over = 0; + unsigned char d_type; + + if (list_empty(ins_list)) + return 0; + + /* + * Changing the data of the delayed item is impossible. So + * we needn't lock them. And we have held i_mutex of the + * directory, nobody can delete any directory indexes now. 
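 * Each cached insertion whose index is at or past filp->f_pos is handed to
 * filldir() below exactly as if it were already committed to the on-disk
 * directory, and f_pos is advanced to that index as we go.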
+ */ + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + + if (curr->key.offset < filp->f_pos) { + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + continue; + } + + filp->f_pos = curr->key.offset; + + di = (struct btrfs_dir_item *)curr->data; + name = (char *)(di + 1); + name_len = le16_to_cpu(di->name_len); + + d_type = btrfs_filetype_table[di->type]; + btrfs_disk_key_to_cpu(&location, &di->location); + + over = filldir(dirent, name, name_len, curr->key.offset, + location.objectid, d_type); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (over) + return 1; + } + return 0; +} + +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, + nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +static void fill_stack_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_inode_item *inode_item, + struct inode *inode) +{ + btrfs_set_stack_inode_uid(inode_item, inode->i_uid); + btrfs_set_stack_inode_gid(inode_item, inode->i_gid); + btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); + btrfs_set_stack_inode_generation(inode_item, + BTRFS_I(inode)->generation); + btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); + btrfs_set_stack_inode_transid(inode_item, trans->transid); + btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); + btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + btrfs_set_stack_inode_block_group(inode_item, 0); + + btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + inode->i_atime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + inode->i_atime.tv_nsec); + + btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + inode->i_mtime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + inode->i_mtime.tv_nsec); + + btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + inode->i_ctime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + inode->i_ctime.tv_nsec); +} + +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + 
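/*
 * A delayed node whose inode item is not dirty holds nothing newer than the
 * on-disk copy, so -ENOENT below lets the caller fall back to reading the
 * btrfs_inode_item from the btree.  Otherwise the cached item is copied into
 * the VFS inode fields under the node's mutex.
 */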
mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + inode->i_uid = btrfs_stack_inode_uid(inode_item); + inode->i_gid = btrfs_stack_inode_gid(inode_item); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + int ret = 0; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + mutex_lock(&delayed_node->mutex); + if (delayed_node->inode_dirty) { + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + goto release_node; + } + + ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + /* + * we must reserve enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + delayed_node->inode_dirty = 1; + delayed_node->count++; + atomic_inc(&root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_root *root = delayed_node->root; + struct btrfs_delayed_item *curr_item, *prev_item; + + mutex_lock(&delayed_node->mutex); + curr_item = __btrfs_first_delayed_insertion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + if (delayed_node->inode_dirty) { + btrfs_delayed_inode_release_metadata(root, delayed_node); + btrfs_release_delayed_inode(delayed_node); + } + mutex_unlock(&delayed_node->mutex); +} + +void btrfs_kill_delayed_inode_items(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = 
btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + __btrfs_kill_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node); +} + +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) +{ + u64 inode_id = 0; + struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; + + while (1) { + spin_lock(&root->inode_lock); + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { + spin_unlock(&root->inode_lock); + break; + } + + inode_id = delayed_nodes[n - 1]->inode_id + 1; + + for (i = 0; i < n; i++) + atomic_inc(&delayed_nodes[i]->refs); + spin_unlock(&root->inode_lock); + + for (i = 0; i < n; i++) { + __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i]); + } + } +} diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h new file mode 100644 index 00000000..8d27af4b --- /dev/null +++ b/fs/btrfs/delayed-inode.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DELAYED_TREE_OPERATION_H +#define __DELAYED_TREE_OPERATION_H + +#include +#include +#include +#include +#include +#include + +#include "ctree.h" + +/* types of the delayed item */ +#define BTRFS_DELAYED_INSERTION_ITEM 1 +#define BTRFS_DELAYED_DELETION_ITEM 2 + +struct btrfs_delayed_root { + spinlock_t lock; + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. If the delayed node is inserted into the work queue, we + * drop it from this list. + */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + int nodes; /* for delayed nodes */ + wait_queue_head_t wait; +}; + +struct btrfs_delayed_node { + u64 inode_id; + u64 bytes_reserved; + struct btrfs_root *root; + /* Used to add the node into the delayed root's node list. */ + struct list_head n_list; + /* + * Used to add the node into the prepare list, the nodes in this list + * is waiting to be dealt with by the async worker. 
+ */ + struct list_head p_list; + struct rb_root ins_root; + struct rb_root del_root; + struct mutex mutex; + struct btrfs_inode_item inode_item; + atomic_t refs; + u64 index_cnt; + bool in_list; + bool inode_dirty; + int count; +}; + +struct btrfs_delayed_item { + struct rb_node rb_node; + struct btrfs_key key; + struct list_head tree_list; /* used for batch insert/delete items */ + struct list_head readdir_list; /* used for readdir items */ + u64 bytes_reserved; + struct btrfs_delayed_node *delayed_node; + atomic_t refs; + int ins_or_del; + u32 data_len; + char data[0]; +}; + +static inline void btrfs_init_delayed_root( + struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index); + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index); + +int btrfs_inode_delayed_dir_index_count(struct inode *inode); + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +void btrfs_balance_delayed_items(struct btrfs_root *root); + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode); +/* Used for evicting the inode. */ +void btrfs_remove_delayed_node(struct inode *inode); +void btrfs_kill_delayed_inode_items(struct inode *inode); + + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); + +/* Used for drop dead root */ +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); + +/* Used for readdir() */ +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list); +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list); +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index); +int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, + filldir_t filldir, + struct list_head *ins_list); + +/* for init */ +int __init btrfs_delayed_inode_init(void); +void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 00000000..125cf76f --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + */ + +/* + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type + */ +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) +{ + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1) +{ + if (ref1->bytenr < ref2->bytenr) + return -1; + if (ref1->bytenr > ref2->bytenr) + return 1; + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) + return -1; + if (ref1->is_head) + return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1)); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; + int cmp; + + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, ins); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an head entry based on bytenr. 
This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot + */ +static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, + u64 bytenr, + struct btrfs_delayed_ref_node **last) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + if (last) + *last = entry; + + if (bytenr < entry->bytenr) + cmp = -1; + else if (bytenr > entry->bytenr) + cmp = 1; + else if (!btrfs_delayed_ref_is_head(entry)) + cmp = 1; + else + cmp = 0; + + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + find_ref_head(&delayed_refs->root, start, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ + break; + } + } + node = rb_next(node); + } + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. 
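 * For example, an add followed by a drop of the same backref in one
 * transaction cancels out: ref_mod reaches zero and the node is erased
 * without the extent allocation tree ever being touched.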
+ */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + if (update->action != existing->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) { + rb_erase(&existing->rb_node, + &delayed_refs->root); + existing->in_tree = 0; + btrfs_put_delayed_ref(existing); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); + + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + kfree(ref->extent_op); + } + } + /* + * update the reference mod on the head to reflect this new operation + */ + existing->ref_mod += update->ref_mod; +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. + */ +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, + int action, int is_data) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *head_ref = NULL; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. 
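For orientation, a standalone toy (all names invented; only the +1/-1/0 weights come from add_delayed_ref_head() just below) showing how the head ref accumulates the net count of every queued operation:

#include <stdio.h>

int main(void)
{
	/* add, add-extent, drop, update-head */
	int count_mods[] = { +1, +1, -1, 0 };
	int head_ref_mod = 0;
	unsigned int i;

	for (i = 0; i < sizeof(count_mods) / sizeof(count_mods[0]); i++)
		head_ref_mod += count_mods[i];

	printf("net modification carried by the head ref: %+d\n", head_ref_mod);
	return 0;
}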
+ */ + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) + must_insert_reserved = 1; + else + must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; + ref->in_tree = 1; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + + trace_btrfs_delayed_ref_head(ref, head_ref, action); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_head_ref(existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_tree_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_TREE_BLOCK_REF_KEY; + } + full_ref->level = level; + + trace_btrfs_delayed_tree_ref(ref, full_ref, action); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed data ref into the rbtree. 
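Aside (not from the patch): add_delayed_tree_ref() above and add_delayed_data_ref() below select the ref type with the same test. A toy sketch of that decision, with invented constants standing in for the key types from ctree.h:

#include <stdio.h>

/* toy stand-ins for BTRFS_TREE_BLOCK_REF_KEY / BTRFS_SHARED_BLOCK_REF_KEY */
enum { TREE_BLOCK_REF, SHARED_BLOCK_REF };

/* a non-zero parent means the block is shared between snapshots and the
 * backref is keyed by the referencing parent block; otherwise it is keyed
 * by the owning subvolume root.
 */
static int pick_type(unsigned long long parent)
{
	return parent ? SHARED_BLOCK_REF : TREE_BLOCK_REF;
}

int main(void)
{
	printf("parent=0      -> %s\n", pick_type(0) ? "shared ref" : "keyed by root");
	printf("parent=0x4000 -> %s\n", pick_type(0x4000) ? "shared ref" : "keyed by root");
	return 0;
}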
+ */ +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_DATA_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + } + full_ref->objectid = owner; + full_ref->offset = offset; + + trace_btrfs_delayed_data_ref(ref, full_ref, action); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * add a delayed tree ref. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); + BUG_ON(ret); + + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 
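For orientation, a hypothetical caller sketch (note_cow() and its parameters are invented; only btrfs_add_delayed_tree_ref() and the action constants come from this patch): when a tree block is COWed, the caller can queue a full-extent add for the new block and a drop for the old one, and neither call touches the extent allocation tree directly:

/* assumes ctree.h and delayed-ref.h from this patch are included */
static int note_cow(struct btrfs_trans_handle *trans, u64 new_bytenr,
		    u64 old_bytenr, u64 blocksize, u64 root_objectid, int level)
{
	int ret;

	/* new block: reserved allocation, extent item inserted later */
	ret = btrfs_add_delayed_tree_ref(trans, new_bytenr, blocksize, 0,
					 root_objectid, level,
					 BTRFS_ADD_DELAYED_EXTENT, NULL);
	if (ret)
		return ret;

	/* old block: queue the matching drop */
	return btrfs_add_delayed_tree_ref(trans, old_bytenr, blocksize, 0,
					  root_objectid, level,
					  BTRFS_DROP_DELAYED_REF, NULL);
}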
+ */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); + BUG_ON(ret); + + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); + BUG_ON(ret); + + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) + return btrfs_delayed_node_to_head(ref); + return NULL; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 00000000..e287e3b0 --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the size of the extent */ + u64 num_bytes; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + unsigned int action:8; + unsigned int type:8; + /* is this node still in the rbtree? */ + unsigned int is_head:1; + unsigned int in_tree:1; +}; + +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + struct list_head cluster; + + struct btrfs_delayed_extent_op *extent_op; + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. 
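Illustration (toy types, not from the patch) of the must_insert_reserved life cycle described in the comment above: the flag rides on the head ref so that whoever eventually runs it knows whether an extent item still has to be inserted, or whether only the reservation needs releasing because the add was cancelled before it ever hit disk:

#include <stdio.h>

struct toy_head { int ref_mod; unsigned int must_insert_reserved:1; };

int main(void)
{
	/* BTRFS_ADD_DELAYED_EXTENT: +1 and the reserved flag raised */
	struct toy_head h = { .ref_mod = 1, .must_insert_reserved = 1 };

	h.ref_mod--;	/* a later free cancels the add */
	if (h.ref_mod == 0 && h.must_insert_reserved)
		printf("nothing to insert; just release the reservation\n");
	return 0;
}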
+ */ + unsigned int must_insert_reserved:1; + unsigned int is_data:1; +}; + +struct btrfs_delayed_tree_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + int level; +}; + +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + u64 objectid; + u64 offset; +}; + +struct btrfs_delayed_ref_root { + struct rb_root root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + unsigned long num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; + + u64 run_delayed_start; +}; + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + kfree(ref); + } +} + +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->is_head; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} + +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 00000000..685f2593 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,453 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) +{ + int ret = 0; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. 
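Aside (standalone toy, not part of the patch): the data_size / name_ptr / data_ptr arithmetic in btrfs_insert_xattr_item() mirrors the on-leaf layout of a dir/xattr item, a fixed header immediately followed by the name bytes and then the value bytes:

#include <stdio.h>
#include <string.h>

/* stand-in for the fixed struct btrfs_dir_item header */
struct toy_dir_item_hdr { unsigned short data_len, name_len; };

int main(void)
{
	const char *name = "user.comment";
	const char *value = "hello";
	size_t name_len = strlen(name), data_len = strlen(value);
	size_t data_size = sizeof(struct toy_dir_item_hdr) + name_len + data_len;
	unsigned char item[64];
	unsigned char *name_ptr = item + sizeof(struct toy_dir_item_hdr);
	unsigned char *data_ptr = name_ptr + name_len;

	memcpy(name_ptr, name, name_len);	/* name follows the header */
	memcpy(data_ptr, value, data_len);	/* value follows the name */
	printf("item size on the leaf: %zu bytes\n", data_size);
	return 0;
}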
+ */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, + struct inode *dir, struct btrfs_key *location, + u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + btrfs_cpu_key_to_disk(&disk_key, location); + + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out_free; + } + + leaf = path->nodes[0]; + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out_free; + } + btrfs_release_path(path); + + ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir, + &disk_key, type, index); +out_free: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len) +{ + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u32 nritems; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return di; + + path->slots[0]++; + } + return NULL; +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. 
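Illustration (invented sizes, not from the patch): names that hash to the same key share a single leaf item, which is why btrfs_match_dir_item_name() above walks the entries with cur += this_len and why the delete helper below shifts the tail of the item instead of removing the whole item:

#include <stdio.h>

struct toy_entry { unsigned int name_len, data_len; };

int main(void)
{
	/* two colliding names packed back to back in one item */
	struct toy_entry entries[] = { { 3, 0 }, { 8, 0 } };
	unsigned int hdr = 30;	/* stand-in for sizeof(struct btrfs_dir_item) */
	unsigned int total = 0, cur = 0, i;

	for (i = 0; i < 2; i++)
		total += hdr + entries[i].name_len + entries[i].data_len;

	for (i = 0; cur < total && i < 2; i++) {
		printf("entry %u starts at offset %u within the item\n", i, cur);
		cur += hdr + entries[i].name_len + entries[i].data_len;
	}
	return 0;
}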
+ */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return ret; +} + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { + printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 00000000..1ac8db5d --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,3096 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "tree-log.h" +#include "free-space-cache.h" +#include "inode-map.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); +static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root *root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; + struct btrfs_work work; +}; + +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. 
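Aside (walk_down_example() is hypothetical; btrfs_tree_lock()/btrfs_tree_unlock() are the helpers from locking.h elsewhere in this patch): the per-level classes matter because walking down the tree takes a child's lock while the parent's lock is still held; with a single shared class, lockdep would flag that nesting as possible recursive locking:

/* assumes ctree.h and locking.h from this patch are included */
static void walk_down_example(struct extent_buffer *parent,
			      struct extent_buffer *child)
{
	btrfs_tree_lock(parent);
	btrfs_tree_lock(child);		/* nested, but a different lockdep class */
	btrfs_tree_unlock(parent);
	/* ... work on child ... */
	btrfs_tree_unlock(child);
}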
+ */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + goto out; + } + read_unlock(&em_tree->lock); + + em = alloc_extent_map(); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + put_unaligned_le32(~crc, result); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. 
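Sketch (not part of the patch; assumes the kernel's crc32c(), put_unaligned_le32() and BTRFS_CSUM_SIZE, and a contiguously mapped buffer rather than the page-by-page mapping csum_tree_block() does): the recipe is a crc32c seeded with ~0 over everything past the checksum area, inverted and stored little-endian at offset 0:

static void toy_csum_block(char *block, size_t blocksize)
{
	u32 crc = ~(u32)0;

	crc = crc32c(crc, block + BTRFS_CSUM_SIZE, blocksize - BTRFS_CSUM_SIZE);
	put_unaligned_le32(~crc, block);	/* same ending as btrfs_csum_final() */
}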
+ */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + struct extent_state *cached_state = NULL; + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, + 0, &cached_state, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb, cached_state) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk_ratelimited("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb, &cached_state); +out: + unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state, GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. 
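Toy trace (standalone, not from the patch) of the retry policy used just below for metadata stored with two copies: mirror_num 0 lets the mapping layer pick a copy, and on failure the specific mirrors 1..num_copies are tried before giving up:

#include <stdio.h>

int main(void)
{
	int num_copies = 2, mirror_num = 0;

	for (;;) {
		printf("read attempt with mirror_num=%d\n", mirror_num);
		/* pretend every attempt fails its checksum/transid check */
		if (num_copies == 1)
			break;
		mirror_num++;
		if (mirror_num > num_copies)
			break;
	}
	return 0;
}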
+ */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to read the other copies, they won't be + * any less wrong. + */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); + goto out; + } + if (!page->private) { + WARN_ON(1); + goto out; + } + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) { + WARN_ON(1); + goto out; + } + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +#define CORRUPT(reason, eb, root, slot) \ + printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d\n", reason, \ + (unsigned long long)btrfs_header_bytenr(eb), \ + (unsigned long long)root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each items keys are in the correct order and their + * offsets make sense. 
We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current's slot + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just incase all the items are consistent to eachother, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) { + ret = -EIO; + goto out; + } + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + printk_ratelimited(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + btrfs_set_buffer_lockdep_class(eb, found_level); + + ret = csum_tree_block(root, eb, 1); + if (ret) { + ret = -EIO; + goto err; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. 
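Worked example (toy numbers, not from the patch) of the invariants check_leaf() enforces above: item data is packed from the end of the leaf towards the front, so item 0 must end exactly at the leaf data size and each item must begin where the next one ends:

#include <stdio.h>

struct toy_item { unsigned int offset, size; };

int main(void)
{
	const unsigned int leaf_data_size = 4096;
	struct toy_item items[2] = { { 3996, 100 }, { 3946, 50 } };

	printf("item 0 ends at %u (expect %u)\n",
	       items[0].offset + items[0].size, leaf_data_size);
	printf("item 0 starts where item 1 ends: %s\n",
	       items[0].offset == items[1].offset + items[1].size ? "yes" : "no");
	return 0;
}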
+ */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & REQ_WRITE) { + if (end_io_wq->metadata == 1) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else if (end_io_wq->metadata == 2) + btrfs_queue_worker(&fs_info->endio_freespace_worker, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +/* + * For the metadata arg you want + * + * 0 - if data + * 1 - if normal metadta + * 2 - if writing to the free space cache area + */ +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = 
submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + async->bio_offset = bio_offset; + + atomic_inc(&fs_info->nr_async_submits); + + if (rw & REQ_SYNC) + btrfs_set_work_high_prio(&async->work); + + btrfs_queue_worker(&fs_info->workers, &async->work); + + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & REQ_WRITE)) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +#ifdef CONFIG_MIGRATION +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. 
+ */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + return migrate_page(mapping, newpage, page); +} +#endif + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); + } + + redirty_page_for_writepage(wbc, page); + eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); + } + free_extent_buffer(eb); + + unlock_page(page); + return 0; +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_root *root = BTRFS_I(mapping->host)->root; + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + /* this is a bit racy, but that's ok */ + num_dirty = root->fs_info->dirty_metadata_bytes; + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +static const struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + 
return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + btrfs_assert_tree_locked(buf); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= buf->len) + root->fs_info->dirty_metadata_bytes -= buf->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + root->in_radix = 0; + root->orphan_item_inserted = 0; + root->orphan_cleanup_state = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_objectid = 0; + root->name = NULL; + root->inode_tree = RB_ROOT; + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->root_list); + spin_lock_init(&root->orphan_lock); + spin_lock_init(&root->inode_lock); + spin_lock_init(&root->accounting_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; + root->last_log_commit = 0; + 
extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + if (ret > 0) + return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + free_extent_buffer(root->node); + return -EIO; + } + root->commit_root = btrfs_root_node(root); + return 0; +} + +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). 
+ */ + root->ref_cows = 0; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + kfree(root); + return ERR_CAST(leaf); + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + root->node = leaf; + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + return root; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + + log_root = alloc_log_tree(trans, root->fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_node(&log_root->root_item, log_root->node); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; + root->last_log_commit = 0; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + if (!path) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret == 0) { + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + } + btrfs_free_path(path); + if (ret) { + kfree(root); + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); + } + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +out: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + btrfs_check_and_init_root_item(&root->root_item); + } + + return root; +} + +struct btrfs_root 
*btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +again: + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + btrfs_init_free_ino_ctl(root); + mutex_init(&root->fs_commit_mutex); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; + + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret < 0) + goto fail; + if (ret == 0) + root->orphan_item_inserted = 1; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto fail; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + root->in_radix = 1; + + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + if (ret) { + if (ret == -EEXIST) { + free_fs_root(root); + goto again; + } + goto fail; + } + + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid); + WARN_ON(ret); + return root; +fail: + free_fs_root(root); + return ERR_PTR(ret); +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct btrfs_device *device; + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + return ret; +} + +/* + * If this fails, caller must call bdi_destroy() to get rid of the + * bdi again. 
+ */ +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + int err; + + bdi->capabilities = BDI_CAP_MAP_COPY; + err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); + if (err) + return err; + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + btrfs_run_defrag_inodes(root->fs_info); + } + + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + u64 transid; + unsigned long now; + unsigned long delay; + int ret; + + do { + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + cur = root->fs_info->running_transaction; + if (!cur) { + spin_unlock(&root->fs_info->trans_lock); + goto sleep; + } + + now = get_seconds(); + if (!cur->blocked && + (now < cur->start_time || now - cur->start_time < 30)) { + spin_unlock(&root->fs_info->trans_lock); + delay = HZ * 5; + goto sleep; + } + transid = cur->transid; + 
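+ /* remember which transaction we sampled before dropping trans_lock; the commit below only happens if the handle we join still belongs to it */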
spin_unlock(&root->fs_info->trans_lock); + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + if (transid == trans->transid) { + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } else { + btrfs_end_transaction(trans, root); + } +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + !btrfs_transaction_blocked(root->fs_info)) + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !tree_root->fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + fs_info = tree_root->fs_info; + + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) { + err = ret; + goto fail; + } + + ret = setup_bdi(fs_info, &fs_info->bdi); + if (ret) { + err = ret; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bdi; + } + + fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); + INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); + btrfs_init_block_rsv(&fs_info->trans_block_rsv); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv); + btrfs_init_block_rsv(&fs_info->empty_block_rsv); + INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); + mutex_init(&fs_info->durable_block_rsv_mutex); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + 
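+ /* these atomic counters throttle async bio submission and delalloc flushing */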
atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->defrag_running, 0); + fs_info->sb = sb; + fs_info->max_inline = 8192 * 1024; + fs_info->metadata_ratio = 0; + fs_info->defrag_inodes = RB_ROOT; + fs_info->trans_no_join = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_NOFS); + if (!fs_info->delayed_root) { + err = -ENOMEM; + goto fail_iput; + } + btrfs_init_delayed_root(fs_info->delayed_root); + + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + init_rwsem(&fs_info->scrub_super_lock); + fs_info->scrub_workers_refcnt = 0; + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; + + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + insert_inode_hash(fs_info->btree_inode); + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT; + + extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[1], + fs_info->btree_inode->i_mapping); + fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + + + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->cleanup_work_sem); + init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, BTRFS_ROOT_TREE_OBJECTID); + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) { + err = -EINVAL; + goto fail_alloc; + } + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + 
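+ /* super_copy now holds the newest super block found on the latest device; super_for_commit is the scratch copy stamped and written back at commit time */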
memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_alloc; + + /* check FS state, whether FS is broken. */ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_alloc; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + (unsigned long long)features); + err = -EINVAL; + goto fail_alloc; + } + + features = btrfs_super_incompat_flags(disk_super); + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + (unsigned long long)features); + err = -EINVAL; + goto fail_alloc; + } + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size), + &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", + 1, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + + fs_info->endio_write_workers.idle_thresh = 2; + fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + 
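+ /* each worker pool is started with a single thread; more are spawned on demand up to the sizes passed to btrfs_init_workers() */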
btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); + btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->endio_freespace_worker, 1); + btrfs_start_workers(&fs_info->delayed_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + if (ret) + goto fail_extent_root; + dev_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + 
BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_dev_root; + + csum_root->track_dirty = 1; + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "Failed to initial space info: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_read_block_groups(extent_root); + if (ret) { + printk(KERN_ERR "Failed to read block groups: %d\n", ret); + goto fail_block_groups; + } + + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (IS_ERR(fs_info->cleaner_kthread)) + goto fail_block_groups; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (IS_ERR(fs_info->transaction_kthread)) + goto fail_cleaner; + + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!log_tree_root) { + err = -ENOMEM; + goto fail_trans_kthread; + } + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + BUG_ON(ret); + } + } + + ret = btrfs_find_orphan_roots(tree_root); + BUG_ON(ret); + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + BUG_ON(ret); + + ret = btrfs_recover_relocation(tree_root); + if (ret < 0) { + printk(KERN_WARNING + "btrfs: failed to recover relocation\n"); + err = -EINVAL; + goto fail_trans_kthread; + } + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + if (IS_ERR(fs_info->fs_root)) { + err = PTR_ERR(fs_info->fs_root); + goto fail_trans_kthread; + } + + if (!(sb->s_flags & MS_RDONLY)) { + down_read(&fs_info->cleanup_work_sem); + err = btrfs_orphan_cleanup(fs_info->fs_root); + if (!err) + err = btrfs_orphan_cleanup(fs_info->tree_root); + up_read(&fs_info->cleanup_work_sem); + if (err) { + close_ctree(tree_root); + return ERR_PTR(err); + } + } + + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_block_groups: + btrfs_free_block_groups(fs_info); + 
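+ /* error unwind: free the cached tree roots, stop the worker pools, then release the btree inode, devices, bdi and srcu state */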
free_extent_buffer(csum_root->node); + free_extent_buffer(csum_root->commit_root); +fail_dev_root: + free_extent_buffer(dev_root->node); + free_extent_buffer(dev_root->commit_root); +fail_extent_root: + free_extent_buffer(extent_root->node); + free_extent_buffer(extent_root->commit_root); +fail_tree_root: + free_extent_buffer(tree_root->node); + free_extent_buffer(tree_root->commit_root); +fail_chunk_root: + free_extent_buffer(chunk_root->node); + free_extent_buffer(chunk_root->commit_root); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); +fail_alloc: + kfree(fs_info->delayed_root); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); +fail_bdi: + bdi_destroy(&fs_info->bdi); +fail_srcu: + cleanup_srcu_struct(&fs_info->subvol_srcu); +fail: + kfree(extent_root); + kfree(tree_root); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + printk_ratelimited(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + /* note, we don't set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. 
+ */ +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + /* + * one reference for us, and we leave it for the + * caller + */ + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + /* one reference for submit_bh */ + get_bh(bh); + + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers) + ret = submit_bh(WRITE_FLUSH_FUA, bh); + else + ret = submit_bh(WRITE_SYNC, bh); + + if (ret) + errors++; + } + return errors < i ? 0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *head; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) 
+ continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (btrfs_root_refs(&root->root_item) == 0) + synchronize_srcu(&fs_info->subvol_srcu); + + __btrfs_remove_free_space_cache(root->free_ino_pinned); + __btrfs_remove_free_space_cache(root->free_ino_ctl); + free_fs_root(root); + return 0; +} + +static void free_fs_root(struct btrfs_root *root) +{ + iput(root->cache_inode); + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + kfree(root->name); + kfree(root); +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + kfree(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + int err; + + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + return err; + } + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) 
+{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + btrfs_scrub_cancel(root); + + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + + /* clear out the rbtree of defraggable inodes */ + btrfs_run_defrag_inodes(root->fs_info); + + btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. when btrfs flips readonly somewhere else before + * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, + * and btrfs will skip to write sb directly to keep + * ERROR state on disk. + * + * 2. when btrfs flips readonly just in btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, + * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, + * btrfs will cleanup all FS resources first and write sb then. + */ + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + ret = btrfs_error_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + fs_info->closing = 2; + smp_mb(); + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + (unsigned long long)fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + free_extent_buffer(fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->commit_root); + free_extent_buffer(fs_info->tree_root->node); + free_extent_buffer(fs_info->tree_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + kfree(fs_info->delayed_root); + + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info); + + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, + NULL); + if (!ret) + 
return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + int was_dirty; + + btrfs_assert_tree_locked(buf); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += buf->len; + spin_unlock(&root->fs_info->delalloc_lock); + } +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (current->flags & PF_MEMALLOC) + return; + + btrfs_balance_delayed_items(root); + + num_dirty = root->fs_info->dirty_metadata_bytes; + + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (current->flags & PF_MEMALLOC) + return; + + num_dirty = root->fs_info->dirty_metadata_bytes; + + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= eb->len) + root->fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + if (read_only) + return; + + if (fs_info->fs_state & 
BTRFS_SUPER_FLAG_ERROR) + printk(KERN_WARNING "warning: mount fs with errors, " + "running btrfsck is recommended\n"); +} + +int btrfs_error_commit_super(struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); + + ret = write_ctree_super(NULL, root, 0); + + return ret; +} + +static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct list_head splice; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + ordered = list_entry(splice.next, struct btrfs_ordered_extent, + root_extent_list); + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* the inode may be getting freed (in sys_unlink path). 
*/ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + spin_unlock(&delayed_refs->lock); + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->delalloc_lock); + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if 
(PageWriteback(page)) + end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_error_discard_extent(root, start, + end + 1 - start, + NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + list_splice_init(&root->fs_info->trans_list, &list); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + atomic_set(&t->use_count, 0); + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 00000000..a0b610a6 --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btree_lock_page_hook(struct page *page); + + 
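+ /* lockdep class annotation for extent buffer locks; the helper below compiles to a no-op when CONFIG_DEBUG_LOCK_ALLOC is not set */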
+#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 00000000..1b8dc337 --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,317 @@ +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; + return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = btrfs_ino(inode); + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + int index; + int err = 0; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + + if (btrfs_root_refs(&root->root_item) == 0) { + err = -ENOENT; + goto fail; + } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +fail: + srcu_read_unlock(&fs_info->subvol_srcu, index); + return ERR_PTR(err); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; 
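+ /*
+ * The BTRFS_FID_SIZE_* values checked below are counted in 32-bit
+ * words, the unit exportfs uses for file handle lengths.  With the
+ * packed struct btrfs_fid from export.h that works out to:
+ *
+ *	NON_CONNECTABLE:  objectid + root_objectid + gen  = 20 bytes = 5
+ *	CONNECTABLE:      + parent_objectid + parent_gen  = 32 bytes = 8
+ *	CONNECTABLE_ROOT: + parent_root_objectid          = 40 bytes = 10
+ */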
+ + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; + } + + path->slots[0]--; + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; + } + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } + btrfs_free_path(path); + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(root->fs_info->sb, key.objectid, + found_key.offset, 0, 0); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +fail: + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + u64 ino; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = 
BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = ino; + key.offset = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 00000000..074348a9 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 00000000..7e20a65d --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,7347 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "hash.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "free-space-cache.h" + +/* control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. 
This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + */ +enum { + CHUNK_ALLOC_NO_FORCE = 0, + CHUNK_ALLOC_FORCE = 1, + CHUNK_ALLOC_LIMITED = 2, +}; + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); + +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + WARN_ON(cache->reserved_pinned > 0); + kfree(cache->free_space_ctl); + kfree(cache); + } +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + 
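+ /*
+ * Any group returned here has had its refcount raised with
+ * btrfs_get_block_group(); a typical caller (sketch) looks like:
+ *
+ *	cache = btrfs_lookup_block_group(info, bytenr);
+ *	if (cache) {
+ *		... use cache ...
+ *		btrfs_put_block_group(cache);
+ *	}
+ */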
spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + btrfs_get_block_group(ret); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) +{ + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} + +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); +} + +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + + while (nr--) { + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, logical[nr], + stripe_len); + BUG_ON(ret); + } + + kfree(logical); + } + return 0; +} + +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_STARTED) { + spin_unlock(&cache->lock); + return NULL; + } + + /* We're loading it the fast way, so we don't have a caching_ctl. */ + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. 
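+ *
+ * Worked example (illustrative numbers): caching the range [0, 100) while
+ * the extents [20, 29] and [60, 69] are still pinned adds [0, 20), [30, 60)
+ * and [70, 100) as free space, 80 bytes in total.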
+ */ +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + + return total_added; +} + +static int caching_kthread(void *data) +{ + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; +again: + mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (btrfs_fs_closing(fs_info) > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) + break; + + if (need_resched() || + btrfs_next_leaf(extent_root, path)) { + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + goto again; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + } + path->slots[0]++; + } + ret = 0; + + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); + + 
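+ /*
+ * Caching is finished (or the search was abandoned): drop the
+ * excluded-extent markers set up by exclude_super_stripes() and wake
+ * anyone sleeping on caching_ctl->wait before dropping our references.
+ */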
free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache, + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int load_cache_only) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + + smp_mb(); + if (cache->cached != BTRFS_CACHE_NO) + return 0; + + /* + * We can't do the read from on-disk cache during a commit since we need + * to have the normal tree locking. Also if we are currently trying to + * allocate blocks for the tree root we can't do the fast caching since + * we likely hold important locks. + */ + if (trans && (!trans->transaction->in_commit) && + (root && root != root->fs_info->tree_root)) { + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return 0; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + ret = load_free_space_cache(fs_info, cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + } else { + cache->cached = BTRFS_CACHE_NO; + } + spin_unlock(&cache->lock); + if (ret == 1) { + free_excluded_extents(fs_info->extent_root, cache); + return 0; + } + } + + if (load_cache_only) + return 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + /* one for caching kthread, one for caching block group list */ + atomic_set(&caching_ctl->count, 2); + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); + return 0; + } + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + down_write(&fs_info->extent_commit_sem); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains the given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; + + rcu_read_lock(); + 
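+ /*
+ * Only the type bits of the requested flags take part in the match, so
+ * e.g. a lookup for BTRFS_BLOCK_GROUP_DATA also matches a mixed
+ * DATA|METADATA space_info -- the first entry with any requested type
+ * bit set wins.
+ */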
list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } +found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. 
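+ *
+ * For example, if the extent item on disk records N references and the
+ * delayed ref head has a net ref_mod of +2 queued up, *refs comes back
+ * as N + 2.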
+ */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and try + * again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. 
Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure for the implicit back refs has fields for: + * + * - Objectid of the subvolume root + * - objectid of the file holding the reference + * - original offset in the file + * - how many bookend extents + * + * The key offset for the implicit back refs is hash of the first + * three fields. + * + * The extent ref structure for the full back refs has field for: + * + * - number of pointers in the tree leaf + * + * The key offset for the implicit back refs is the first byte of + * the tree leaf + * + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: + * + * (root_key.objectid, inode objectid, offset in file, 1) + * + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: + * + * (btrfs_header_owner(leaf), inode objectid, offset in file) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. + * + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. 
These information are stored in + * tree block info structure. + */ + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) +{ + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); + + ret = btrfs_extend_item(trans, root, path, new_size); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root 
*root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; + struct extent_buffer *leaf; + u32 nritems; + int ret; + int recow; + int err = -ENOENT; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; +} + +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = 
btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } + } + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: + btrfs_release_path(path); + return ret; +} + +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; + struct extent_buffer *leaf; + u32 num_refs = 0; + int ret = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } + + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; + + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif + btrfs_mark_buffer_dirty(leaf); + } + return ret; +} + +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { 
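+ /*
+ * Full backref: the block is no longer referenced by its owner tree,
+ * so the ref item is keyed by the parent block's bytenr,
+ * (bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent).  The else branch below
+ * builds the implicit form keyed by the owner root,
+ * (bytenr, BTRFS_TREE_BLOCK_REF_KEY, root_objectid).
+ */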
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } +#endif + return ret; +} + +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(path); + return ret; +} + +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. 
+ */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. 
+ * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +int setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + int ret; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + ret = btrfs_extend_item(trans, root, path, size); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) + return ret; + + btrfs_release_path(path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); + } + return ret; +} + +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +int update_inline_extent_backref(struct btrfs_trans_handle 
*trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + int ret; + u64 refs; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); + } + + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + ret = btrfs_truncate_item(trans, root, path, item_size, 1); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + ret = update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + ret = setup_inline_extent_backref(trans, root, path, iref, + parent, root_objectid, + owner, offset, refs_to_add, + extent_op); + } + return ret; +} + +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + ret = update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); + } else if (is_data) { + ret = 
remove_extent_data_ref(trans, root, path, refs_to_drop); + } else { + ret = btrfs_del_item(trans, root, path); + } + return ret; +} + +static int btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); +} + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + int ret; + u64 discarded_bytes = 0; + struct btrfs_multi_bio *multi = NULL; + + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + bytenr, &num_bytes, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length); + if (!ret) + discarded_bytes += stripe->length; + else if (ret != -EOPNOTSUPP) + break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; + } + kfree(multi); + } + + if (actual_bytes) + *actual_bytes = discarded_bytes; + + + return ret; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL); + } + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *item; + u64 refs; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if (ret == 0) + goto out; + + if (ret != -EAGAIN) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + path->reada = 1; + path->leave_spinning = 1; + + /* now insert the actual backref */ + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); + BUG_ON(ret); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + 
struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) { + BUG_ON(extent_op->update_key); + flags |= extent_op->flags_to_set; + } + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); + } + return ret; +} + +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_tree_ref(node); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + else + ref_root 
= ref->root; + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags || + !extent_op->update_key); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } + return ret; +} + +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret; + if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref_head *head; + /* + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting + */ + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + } + mutex_unlock(&head->mutex); + return 0; + } + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; +} + +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + int action = BTRFS_ADD_DELAYED_REF; +again: + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. 
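+ * For example, if a head still has both an ADD_DELAYED_REF and a + * DROP_DELAYED_REF queued, the loop below hands back the ADD first; + * the DROP is only picked up on the second pass once no ADDs remain.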
+ */ + node = rb_prev(&head->node.rb_node); + while (1) { + if (!node) + break; + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + if (ref->action == action) + return ref; + node = rb_prev(node); + } + if (action == BTRFS_ADD_DELAYED_REF) { + action = BTRFS_DROP_DELAYED_REF; + goto again; + } + return NULL; +} + +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; + int ret; + int count = 0; + int must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + while (1) { + if (!locked_ref) { + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) + break; + + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; + } + } + + /* + * record the must insert reserved flag before we + * drop the spin lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + if (!ref) { + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; + + if (extent_op && must_insert_reserved) { + kfree(extent_op); + extent_op = NULL; + } + + if (extent_op) { + spin_unlock(&delayed_refs->lock); + + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + BUG_ON(ret); + kfree(extent_op); + + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + + list_del_init(&locked_ref->cluster); + locked_ref = NULL; + } + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root, ref, extent_op, + must_insert_reserved); + BUG_ON(ret); + + btrfs_put_delayed_ref(ref); + kfree(extent_op); + count++; + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + return count; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. 
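+ * As a rough illustration, count == 0 becomes about twice the number + * of entries queued when the run starts, while (unsigned long)-1 + * keeps looping (and briefly sleeping) until the ref tree is empty.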
+ */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; + int ret; + int run_all = count == (unsigned long)-1; + int run_most = 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; + + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; + + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); + + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; + } + + if (run_all) { + node = rb_first(&delayed_refs->root); + if (!node) + goto out; + count = (unsigned long)-1; + + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + atomic_inc(&ref->refs); + + spin_unlock(&delayed_refs->lock); + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + cond_resched(); + goto again; + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + schedule_timeout(1); + goto again; + } +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 
1 : 0; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + kfree(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; + + ret = -ENOENT; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and let + * caller try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out_unlock; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + if (ref->bytenr != bytenr) + goto out_unlock; + + ret = 1; + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) + goto out_unlock; + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + node = rb_prev(node); + if (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (ref->bytenr == bytenr) + goto out_unlock; + } + + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || data_ref->offset != offset) + goto out_unlock; + + ret = 0; +out_unlock: + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 item_size; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; + + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + 
btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) + goto out; + + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; + } + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; +out: + btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); + return ret; +} + +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + int full_backref, int inc) +{ + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 ref_root; + u32 nritems; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows && level == 0) + return 0; + + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; + + if (full_backref) + parent = buf->start; + else + parent = 0; + + for (i = 0; i < nritems; i++) { + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = btrfs_level_size(root, level - 1); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0); + if (ret) + goto fail; + } + } + return 0; +fail: + BUG(); + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} + +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); +fail: + if (ret) + return ret; + return 0; + +} + +static struct 
btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_root *root = block_group->fs_info->tree_root; + struct inode *inode = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + int num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * 1024 * 1024)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + +again: + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(root, trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, + inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED) { + /* We're not cached, don't bother trying to write stuff out */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + if (!num_pages) + num_pages = 1; + + /* + * Just to make absolutely sure we have enough space, we're going to + * preallocate 12 pages worth of space for each block group. In + * practice we ought to use at most 8, but we need extra space so we can + * add our header and have a terminator between the extents and the + * bitmaps. 
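+ * As a rough worked example, a block group of up to 1GB starts with + * num_pages == 1, which the multiplication below turns into 16 pages, + * i.e. 64KB of preallocated cache space assuming 4KB pages.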
+ */ + num_pages *= 16; + num_pages *= PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; + btrfs_free_reserved_data_space(inode, num_pages); +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + int err = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + err = cache_save_setup(cache, trans, path); + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + + while (1) { + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) { + btrfs_put_block_group(cache); + goto again; + } + + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + if (cache->disk_cache_state == BTRFS_DC_SETUP) + cache->disk_cache_state = BTRFS_DC_NEED_WRITE; + cache->dirty = 0; + last = cache->key.objectid + cache->key.offset; + + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); + } + + while (1) { + /* + * I don't think this is needed since we're just marking our + * preallocated extent as written, but just in case it can't + * hurt. + */ + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + /* + * Really this shouldn't happen, but it could if we + * couldn't write the entire preallocated extent and + * splitting the extent resulted in a new block. + */ + if (cache->dirty) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + btrfs_write_out_cache(root, trans, cache, path); + + /* + * If we didn't have an error then the cache state is still + * NEED_WRITE, so we can set it to WRITTEN. 
+ */ + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + cache->disk_cache_state = BTRFS_DC_WRITTEN; + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return 0; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int i; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); + found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; + found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->bytes_may_use = 0; + found->full = 0; + found->force_alloc = CHUNK_ALLOC_NO_FORCE; + found->chunk_alloc = 0; + *space_info = found; + list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. 
+ */ + u64 num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; + return btrfs_reduce_alloc_profile(root, flags); +} + +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +{ + u64 flags; + + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + + return get_alloc_profile(root, flags); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_DATA); +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 used; + int ret = 0, committed = 0, alloc_chunk = 1; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { + alloc_chunk = 0; + committed = 1; + } + + data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. 
+ */ + if (!data_sinfo->full && alloc_chunk) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); +alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } + goto again; + } + + /* + * If we have less pinned bytes than we want to allocate then + * don't bother committing the transaction, it won't help us. + */ + if (data_sinfo->bytes_pinned < bytes) + committed = 1; + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +commit_trans: + if (!committed && + !atomic_read(&root->fs_info->open_ioctl_trans)) { + committed = 1; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return 0; +} + +/* + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); +} + +static int should_alloc_chunk(struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 alloc_bytes, + int force) +{ + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = max_t(u64, 64 * 1024 * 1024, + div_factor_fine(thresh, 1)); + + if (num_bytes - num_allocated < thresh) + return 1; + } + + /* + * we have two similar checks here, one based on percentage + * and once based on a hard number of 256MB. The idea + * is that if we have a good amount of free + * room, don't allocate a chunk. 
A good mount is + * less than 80% utilized of the chunks we have allocated, + * or more than 256MB free + */ + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + + /* 256MB or 5% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; + int wait_for_alloc = 0; + int ret = 0; + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + +again: + spin_lock(&space_info->lock); + if (space_info->force_alloc) + force = space_info->force_alloc; + if (space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + + if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + wait_for_alloc = 1; + } else { + space_info->chunk_alloc = 1; + } + + spin_unlock(&space_info->lock); + + mutex_lock(&fs_info->chunk_mutex); + + /* + * The chunk_mutex is held throughout the entirety of a chunk + * allocation, so once we've acquired the chunk_mutex we know that the + * other guy is done and we need to recheck and see if we should + * allocate. + */ + if (wait_for_alloc) { + mutex_unlock(&fs_info->chunk_mutex); + wait_for_alloc = 0; + goto again; + } + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. 
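+ * For example, with metadata_ratio set to 4, every 4th data chunk + * allocation below also flips the metadata space_info to + * CHUNK_ALLOC_FORCE via force_metadata_allocation().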
+ */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; + else + ret = 1; + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +/* + * shrink metadata reservation for delalloc + */ +static int shrink_delalloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 to_reclaim, int sync) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; + u64 reserved; + u64 max_reclaim; + u64 reclaimed = 0; + long time_left; + int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + int loops = 0; + unsigned long progress; + + block_rsv = &root->fs_info->delalloc_block_rsv; + space_info = block_rsv->space_info; + + smp_mb(); + reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; + + if (reserved == 0) + return 0; + + max_reclaim = min(reserved, to_reclaim); + + while (loops < 1024) { + /* have the flusher threads jump in and do some IO */ + smp_mb(); + nr_pages = min_t(unsigned long, nr_pages, + root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed += reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); + + loops++; + + if (reserved == 0 || reclaimed >= max_reclaim) + break; + + if (trans && trans->transaction->blocked) + return -EAGAIN; + + time_left = schedule_timeout_interruptible(1); + + /* We were interrupted, exit */ + if (time_left) + break; + + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } + + } + return reclaimed >= to_reclaim; +} + +/* + * Retries tells us how many times we've called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can't make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don't try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don't intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. 
+ */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, int flush) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; + bool committed = false; + +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; + + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused = space_info->total_bytes - unused; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = unused - space_info->total_bytes + + (orig_bytes * (retries + 1)); + } + + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + */ + if (ret && !reserved) { + space_info->bytes_reserved += orig_bytes; + reserved = true; + } + + spin_unlock(&space_info->lock); + + if (!ret) + return 0; + + if (!flush) + goto out; + + /* + * We do synchronous shrinking since we don't actually unreserve + * metadata until after the IO is completed. + */ + ret = shrink_delalloc(trans, root, num_bytes, 1); + if (ret > 0) + return 0; + else if (ret < 0) + goto out; + + /* + * So if we were overcommitted it's possible that somebody else flushed + * out enough space and we simply didn't have enough space to reclaim, + * so go back around and try again. + */ + if (retries < 2) { + retries++; + goto again; + } + + spin_lock(&space_info->lock); + /* + * Not enough space to be reclaimed, don't bother committing the + * transaction. 
+ */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; + + ret = -EAGAIN; + if (trans || committed) + goto out; + + ret = -ENOSPC; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) { + trans = NULL; + committed = true; + goto again; + } + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); + } + + return ret; +} + +static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + if (root->ref_cows) + block_rsv = trans->block_rsv; + else + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; +} + +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + block_rsv_add_bytes(dst, num_bytes, 1); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + atomic_set(&rsv->usage, 1); + rsv->priority = 6; + INIT_LIST_HEAD(&rsv->list); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_block_rsv(block_rsv); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_root *root, + 
struct btrfs_block_rsv *rsv) +{ + if (rsv && atomic_dec_and_test(&rsv->usage)) { + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (!rsv->durable) + kfree(rsv); + } +} + +/* + * make the block_rsv struct be able to capture freed space. + * the captured space will re-add to the the block_rsv struct + * after transaction commit + */ +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv) +{ + block_rsv->durable = 1; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); + mutex_unlock(&fs_info->durable_block_rsv_mutex); +} + +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor) +{ + u64 num_bytes = 0; + int commit_trans = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + if (min_factor > 0) + num_bytes = div_factor(block_rsv->size, min_factor); + if (min_reserved > num_bytes) + num_bytes = min_reserved; + + if (block_rsv->reserved >= num_bytes) { + ret = 0; + } else { + num_bytes -= block_rsv->reserved; + if (block_rsv->durable && + block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) + commit_trans = 1; + } + spin_unlock(&block_rsv->lock); + if (!ret) + return 0; + + if (block_rsv->refill_used) { + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; + } + } + + if (commit_trans) { + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + return 0; + } + + return -ENOSPC; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv->full || global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); +} + +/* + * helper to calculate size of global block reservation. 
+ * the desired value is sum of space used by extent tree, + * checksum tree and root tree + */ +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); + + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); + + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); +} + +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + num_bytes = calc_global_metadata_size(fs_info); + + spin_lock(&block_rsv->lock); + spin_lock(&sinfo->lock); + + block_rsv->size = num_bytes; + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_reserved += num_bytes; + } + + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } + + spin_unlock(&sinfo->lock); + spin_unlock(&block_rsv->lock); +} + +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + fs_info->chunk_block_rsv.priority = 10; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->global_block_rsv.priority = 10; + fs_info->global_block_rsv.refill_used = 1; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.priority = 10; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); + + btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); + + update_global_block_rsv(fs_info); +} + +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 
+} + +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; + u64 num_bytes; + int ret; + + /* + * Truncate should be freeing data, but give us 2 items just in case it + * needs to use some space. We may want to be smarter about this in the + * future. + */ + num_bytes = btrfs_calc_trans_metadata_size(root, 2); + + /* We already have enough bytes, just return */ + if (rsv->reserved >= num_bytes) + return 0; + + num_bytes -= rsv->reserved; + + /* + * You should have reserved enough space before hand to do this, so this + * should not fail. + */ + ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); + BUG_ON(ret); + + return 0; +} + +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items) +{ + u64 num_bytes; + int ret; + + if (num_items == 0 || root->fs_info->chunk_root == root) + return 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, + num_bytes); + if (!ret) { + trans->bytes_reserved += num_bytes; + trans->block_rsv = &root->fs_info->trans_block_rsv; + } + return ret; +} + +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->bytes_reserved) + return; + + BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. + */ + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; + /* + * two for root back/forward refs, two for directory entries + * and one for root of the snapshot. 
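+ * That is 2 + 2 + 1 = 5 items, hence the metadata size for 5 items + * below, migrated from the source reservation into the pending + * snapshot's block_rsv.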
+ */ + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + dst_rsv->space_info = src_rsv->space_info; + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +{ + return num_bytes >>= 3; +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int reserved_extents; + int ret; + + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + num_bytes = ALIGN(num_bytes, root->sectorsize); + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + + if (nr_extents > reserved_extents) { + nr_extents -= reserved_extents; + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; + } + + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + if (ret) + return ret; + + atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve, 0); + + return 0; +} + +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + int reserved_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); + + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + do { + int old, new; + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents >= reserved_extents) { + nr_extents = 0; + break; + } + old = reserved_extents; + nr_extents = reserved_extents - nr_extents; + new = reserved_extents - nr_extents; + old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, + reserved_extents, new); + if (likely(old == reserved_extents)) + break; + reserved_extents = old; + } while (1); + + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += btrfs_calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc) +{ + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + old_val = btrfs_super_bytes_used(&info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= 
num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); + spin_unlock(&info->delalloc_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + cache_block_group(cache, trans, NULL, 1); + + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_super_cache_generation(&info->super_copy) != 0 && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); + } + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + btrfs_put_block_group(cache); + + return bytenr; +} + +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); + + btrfs_put_block_group(cache); + return 0; +} + +/* + * update size of reserved extents. this function may return -EAGAIN + * if 'reserve' is true or 'sinfo' is false. 
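For the DUP, RAID1 and RAID10 profiles every logical byte occupies two bytes on disk, which is why update_block_group() above applies a factor of 2 when it adjusts disk_used. A small sketch of that accounting, using illustrative flag values rather than the real btrfs constants:

#include <stdio.h>
#include <stdint.h>

#define BG_DUP     (1ULL << 0)	/* hypothetical bit assignments for the demo */
#define BG_RAID1   (1ULL << 1)
#define BG_RAID10  (1ULL << 2)

static int mirror_factor(uint64_t flags)
{
	/* two copies on disk for DUP/RAID1/RAID10, one copy otherwise */
	return (flags & (BG_DUP | BG_RAID1 | BG_RAID10)) ? 2 : 1;
}

int main(void)
{
	uint64_t alloc = 1 << 20;	/* allocate 1 MiB of logical space */
	uint64_t flags = BG_RAID1;

	printf("bytes_used grows by %llu, disk_used grows by %llu\n",
	       (unsigned long long)alloc,
	       (unsigned long long)(alloc * mirror_factor(flags)));
	return 0;
}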
+ */ +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) +{ + int ret = 0; + if (sinfo) { + struct btrfs_space_info *space_info = cache->space_info; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + if (reserve) + cache->reserved += num_bytes; + else + cache->reserved -= num_bytes; + } + spin_unlock(&cache->lock); + } + return ret; +} + +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; + + down_write(&fs_info->extent_commit_sem); + + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; + } + } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->extent_commit_sem); + + update_global_block_rsv(fs_info); + return 0; +} + +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 len; + + while (start <= end) { + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); + } + + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + btrfs_add_free_space(cache, start, len); + } + + start += len; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + if (cache->ro) { + cache->space_info->bytes_readonly += len; + } else if (cache->reserved_pinned > 0) { + len = min(len, cache->reserved_pinned); + cache->reserved_pinned -= len; + cache->space_info->bytes_reserved += len; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } + + if (cache) + btrfs_put_block_group(cache); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *next_rsv; + u64 start; + u64 end; + int idx; + int ret; + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + if 
(btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + unpin_extent_range(root, start, end); + cond_resched(); + } + + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_for_each_entry_safe(block_rsv, next_rsv, + &fs_info->durable_block_rsv_list, list) { + + idx = trans->transid & 0x1; + if (block_rsv->freed[idx] > 0) { + block_rsv_add_bytes(block_rsv, + block_rsv->freed[idx], 0); + block_rsv->freed[idx] = 0; + } + if (atomic_read(&block_rsv->usage) == 0) { + btrfs_block_rsv_release(root, block_rsv, (u64)-1); + + if (block_rsv->freed[0] == 0 && + block_rsv->freed[1] == 0) { + list_del_init(&block_rsv->list); + kfree(block_rsv); + } + } else { + btrfs_block_rsv_release(root, block_rsv, 0); + } + } + mutex_unlock(&fs_info->durable_block_rsv_mutex); + + return 0; +} + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + int ret; + int is_data; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + u32 item_size; + u64 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); + if (ret == 0) { + extent_slot = path->slots[0]; + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + extent_slot); + if (key.objectid != bytenr) + break; + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + extent_slot--; + } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif + if (!found_extent) { + BUG_ON(iref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, + is_data); + BUG_ON(ret); + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "parent %llu root %llu owner %llu offset %llu\n", + (unsigned long long)bytenr, + (unsigned long long)parent, + (unsigned long long)root_objectid, + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != 
path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + BUG_ON(ret < 0); + + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs < refs_to_drop); + refs -= refs_to_drop; + + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref + */ + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data); + BUG_ON(ret); + } + } else { + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } + } + + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(path); + + if (is_data) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0); + BUG_ON(ret); + } + btrfs_free_path(path); + return ret; +} + +/* + * when we free an block, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. + */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + /* there are still entries for this ref, we can't drop it */ + if (ref->bytenr == bytenr) + goto out; + + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + kfree(head->extent_op); + head->extent_op = NULL; + } + + /* + * waiting for the lock here would deadlock. 
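At its core, __btrfs_free_extent() is a reference-count drop: if references remain after subtracting refs_to_drop, only a backref is removed and the extent item is kept; once the count reaches zero the item itself is deleted and the block group accounting is updated. A compact sketch of that decision, using an invented in-memory record:

#include <stdio.h>
#include <stdint.h>

struct extent_rec {
	uint64_t bytenr;
	uint64_t num_bytes;
	uint64_t refs;
	int      present;	/* still has an item in the (pretend) extent tree */
};

/* Returns 1 if the extent item was deleted outright, 0 if refs remain. */
static int drop_refs(struct extent_rec *rec, uint64_t refs_to_drop)
{
	rec->refs -= refs_to_drop;
	if (rec->refs > 0)
		return 0;	/* drop one backref, keep the item */
	rec->present = 0;	/* delete the item and free the space */
	return 1;
}

int main(void)
{
	struct extent_rec rec = { .bytenr = 4096, .num_bytes = 8192,
				  .refs = 2, .present = 1 };

	printf("deleted=%d refs=%llu\n", drop_refs(&rec, 1),
	       (unsigned long long)rec.refs);
	printf("deleted=%d refs=%llu\n", drop_refs(&rec, 1),
	       (unsigned long long)rec.refs);
	return 0;
}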
If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; + + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->node.rb_node, &delayed_refs->root); + + delayed_refs->num_entries--; + + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + + list_del_init(&head->cluster); + spin_unlock(&delayed_refs->lock); + + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return ret; +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_group_cache *cache = NULL; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + + if (!last_ref) + return; + + block_rsv = get_block_rsv(trans, root); + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (block_rsv->space_info != cache->space_info) + goto out; + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto pin; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto pin; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); + if (ret == -EAGAIN) { + /* block group became read-only */ + btrfs_update_reserved_bytes(cache, buf->len, 0, 1); + goto out; + } + + ret = 1; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + block_rsv->reserved += buf->len; + ret = 0; + } + spin_unlock(&block_rsv->lock); + + if (ret) { + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; + spin_unlock(&cache->space_info->lock); + } + goto out; + } +pin: + if (block_rsv->durable && !cache->ro) { + ret = 0; + spin_lock(&cache->lock); + if (!cache->ro) { + cache->reserved_pinned += buf->len; + ret = 1; + } + spin_unlock(&cache->lock); + + if (ret) { + spin_lock(&block_rsv->lock); + block_rsv->freed[trans->transid & 0x1] += buf->len; + spin_unlock(&block_rsv->lock); + } + } +out: + /* + * Deleting the buffer, clear the corrupt flag since it doesn't matter + * anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + btrfs_put_block_group(cache); +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. 
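btrfs_free_tree_block() above separates blocks created in the current transaction and never written (which can go straight back to free space) from older or already-written blocks (which must stay pinned until the commit so the tree that is still on disk remains readable). A sketch of that choice, assuming two simplified predicates:

#include <stdio.h>
#include <stdint.h>

enum disposal { RETURN_TO_FREE_SPACE, PIN_UNTIL_COMMIT };

struct buf_info {
	uint64_t generation;	/* transaction that allocated the block */
	int      written;	/* already written to disk in this transaction */
};

static enum disposal dispose(const struct buf_info *buf, uint64_t cur_transid)
{
	/* Blocks from older transactions, or already-written blocks, stay
	 * pinned so the committed tree can be read until the next commit. */
	if (buf->generation != cur_transid || buf->written)
		return PIN_UNTIL_COMMIT;
	return RETURN_TO_FREE_SPACE;
}

int main(void)
{
	struct buf_info fresh = { .generation = 100, .written = 0 };
	struct buf_info old   = { .generation = 99,  .written = 1 };

	printf("fresh block: %d, old block: %d\n",
	       dispose(&fresh, 100), dispose(&old, 100));
	return 0;
}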
+ */ + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ + btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + return ret; +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space_ctl->free_space >= num_bytes)); + + put_caching_control(caching_ctl); + return 0; +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + + put_caching_control(caching_ctl); + return 0; +} + +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + int index; + if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + index = 0; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + index = 1; + else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + index = 2; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + index = 3; + else + index = 4; + return index; +} + +enum btrfs_loop_type { + LOOP_FIND_IDEAL = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. 
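stripe_align() uses the usual round-up-to-a-power-of-two idiom: add stripesize - 1, then clear the low bits. A worked example, assuming a hypothetical 64 KiB stripe size:

#include <stdio.h>
#include <stdint.h>

static uint64_t align_up(uint64_t val, uint64_t stripesize)
{
	uint64_t mask = stripesize - 1;	/* stripesize must be a power of two */

	return (val + mask) & ~mask;
}

int main(void)
{
	/* 64 KiB stripes: 100000 rounds up to 131072, 131072 stays put. */
	printf("%llu %llu\n",
	       (unsigned long long)align_up(100000, 65536),
	       (unsigned long long)align_up(131072, 65536));
	return 0;
}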
+ */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + struct btrfs_free_cluster *last_ptr = NULL; + struct btrfs_block_group_cache *block_group = NULL; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; + struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; + int index = 0; + bool found_uncached_bg = false; + bool failed_cluster_refill = false; + bool failed_alloc = false; + bool use_cluster = true; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + space_info = __find_space_info(root->fs_info, data); + if (!space_info) { + printk(KERN_ERR "No space info for %llu\n", data); + return -ENOSPC; + } + + /* + * If the space info is for both data and metadata it means we have a + * small filesystem and we can't use the clustering stuff. + */ + if (btrfs_mixed_space_info(space_info)) + use_cluster = false; + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { + last_ptr = &root->fs_info->meta_alloc_cluster; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } + + if (last_ptr) { + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); + } + + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (!last_ptr) + empty_cluster = 0; + + if (search_start == hint_byte) { +ideal_cache: + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. + */ + if (block_group && block_group_bits(block_group, data) && + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else { + index = get_block_group_index(block_group); + goto have_block_group; + } + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: + down_read(&space_info->groups_sem); + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { + u64 offset; + int cached; + + btrfs_get_block_group(block_group); + search_start = block_group->key.objectid; + + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. 
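The first check inside the loop that follows skips block groups that cannot satisfy a request for redundant storage: if the wanted profile includes any of the mirrored flags and the group provides none of them, the group is passed over. A minimal sketch of that mask test; the bit values are illustrative, not the real btrfs definitions:

#include <stdio.h>
#include <stdint.h>

#define PROFILE_DUP     (1ULL << 0)	/* illustrative bit assignments */
#define PROFILE_RAID1   (1ULL << 1)
#define PROFILE_RAID10  (1ULL << 2)

static int group_is_usable(uint64_t request_flags, uint64_t group_flags)
{
	uint64_t extra = PROFILE_DUP | PROFILE_RAID1 | PROFILE_RAID10;

	/* requested redundancy the group cannot provide: skip it */
	if ((request_flags & extra) && !(group_flags & extra))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d %d\n",
	       group_is_usable(PROFILE_RAID1, 0),	/* 0: skip this group */
	       group_is_usable(0, PROFILE_RAID1));	/* 1: usable */
	return 0;
}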
+ */ + if (!block_group_bits(block_group, data)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((data & extra) && !(block_group->flags & extra)) + goto loop; + } + +have_block_group: + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + ret = cache_block_group(block_group, trans, + orig_root, 1); + if (block_group->cached == BTRFS_CACHE_FINISHED) + goto have_block_group; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + + /* + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. + */ + if (loop > LOOP_CACHING_NOWAIT || + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { + ret = cache_block_group(block_group, trans, + orig_root, 0); + BUG_ON(ret); + } + found_uncached_bg = true; + + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) + goto loop; + } + + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + + if (unlikely(block_group->ro)) + goto loop; + + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. 
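The ideal-cache heuristic above works out how empty a block group is: used bytes times 100, divided by the group size, subtracted from 100. A quick arithmetic check with made-up sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t free_percent(uint64_t used, uint64_t size)
{
	return 100 - (used * 100 / size);	/* mirrors the div64_u64() math */
}

int main(void)
{
	/* A hypothetical 1 GiB block group with 256 MiB used is 75% free. */
	printf("%llu%% free\n",
	       (unsigned long long)free_percent(256ULL << 20, 1ULL << 30));
	return 0;
}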
Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + btrfs_get_block_group(block_group); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ + goto have_block_group; + } + spin_unlock(&last_ptr->lock); +refill_cluster: + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + last_ptr_loop = 0; + + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, root, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { + /* + * now pull our allocation out of this + * cluster + */ + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + + failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; + } + + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. 
+ */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + failed_alloc = true; + goto have_block_group; + } else if (!offset) { + goto loop; + } +checks: + search_start = stripe_align(root, offset); + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, + (data & BTRFS_BLOCK_GROUP_DATA)); + if (ret == -EAGAIN) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + /* we are all good, lets return */ + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + btrfs_put_block_group(block_group); + break; +loop: + failed_cluster_refill = false; + failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { + index = 0; + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { + found_uncached_bg = false; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) + goto search; + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. 
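The comment above lists the allocator's escalation order, from searching only cached block groups to finally dropping the empty-size requirement. A toy sketch of that staged fallback, reusing the same stage names and a stand-in try_alloc() that only succeeds once chunk allocation is permitted:

#include <stdio.h>

enum loop_type {
	LOOP_FIND_IDEAL,
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

/* Pretend allocation that only works once a new chunk may be allocated. */
static int try_alloc(enum loop_type stage)
{
	return stage >= LOOP_ALLOC_CHUNK;
}

int main(void)
{
	enum loop_type stage;

	for (stage = LOOP_FIND_IDEAL; stage <= LOOP_NO_EMPTY_SIZE; stage++) {
		if (try_alloc(stage)) {
			printf("allocated at stage %d\n", stage);
			return 0;
		}
		printf("stage %d failed, escalating\n", stage);
	}
	printf("ENOSPC\n");
	return 1;
}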
+ */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + loop++; + + if (loop == LOOP_ALLOC_CHUNK) { + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); + allowed_chunk_alloc = 0; + if (ret == 1) + done_chunk_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == + CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; + } + + /* + * We didn't allocate a chunk, go ahead and drop the + * empty size and loop again. + */ + if (!done_chunk_alloc) + loop = LOOP_NO_EMPTY_SIZE; + } + + if (loop == LOOP_NO_EMPTY_SIZE) { + empty_size = 0; + empty_cluster = 0; + } + + goto search; + } else if (!ins->objectid) { + ret = -ENOSPC; + } else if (ins->objectid) { + ret = 0; + } + + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved - + info->bytes_readonly), + (info->full) ? "" : "not "); + printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_reserved, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_readonly); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + + data = btrfs_get_alloc_profile(root, data); +again: + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, + CHUNK_ALLOC_NO_FORCE); + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, + ins, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, CHUNK_ALLOC_FORCE); + goto again; + } + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + 
(unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes, 1); + } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); + + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, 0, 1); + btrfs_put_block_group(cache); + + trace_btrfs_reserved_extent_free(root, start, len); + + return ret; +} + +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + int type; + u32 size; + + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, 
fs_info->extent_root, path, + ins, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) +{ + int ret; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + u64 start = ins->objectid; + u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group, trans, NULL, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + BUG_ON(ret); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + + start = caching_ctl->progress; + num_bytes = ins->objectid + ins->offset - + caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } + + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + + ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + BUG_ON(ret); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 
bytenr, u32 blocksize, + int level) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + + btrfs_set_lock_blocking(buf); + btrfs_set_buffer_uptodate(buf); + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + /* this returns a buffer locked for blocking */ + return buf; +} + +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + int ret; + + block_rsv = get_block_rsv(trans, root); + + if (block_rsv->size == 0) { + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize, 0); + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + return ERR_PTR(ret); + } else if (ret) { + return ERR_PTR(ret); + } + return block_rsv; + } + + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + if (ret) { + WARN_ON(1); + ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, + 0); + if (!ret) { + spin_lock(&block_rsv->lock); + block_rsv->size += blocksize; + spin_unlock(&block_rsv->lock); + return block_rsv; + } else if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + } + + return ERR_PTR(-ENOSPC); +} + +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns the tree buffer or NULL. 
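use_block_rsv() above prefers the transaction's own reserve and falls back to the global reserve when that fails. A condensed sketch of the fallback order, with plain counters in place of the real block_rsv structures; the retry through reserve_metadata_bytes() is left out:

#include <stdio.h>
#include <stdint.h>

struct rsv { uint64_t reserved; };

static int rsv_use(struct rsv *rsv, uint64_t bytes)
{
	if (rsv->reserved < bytes)
		return -1;	/* stands in for -ENOSPC */
	rsv->reserved -= bytes;
	return 0;
}

/* Try the primary reserve first, then fall back to the global one. */
static struct rsv *pick_rsv(struct rsv *primary, struct rsv *global,
			    uint64_t blocksize)
{
	if (rsv_use(primary, blocksize) == 0)
		return primary;
	if (rsv_use(global, blocksize) == 0)
		return global;
	return NULL;
}

int main(void)
{
	struct rsv primary = { .reserved = 0 };
	struct rsv global  = { .reserved = 16384 };
	struct rsv *used = pick_rsv(&primary, &global, 4096);

	printf("used %s, global now %llu\n",
	       used == &global ? "global" : "primary",
	       (unsigned long long)global.reserved);
	return 0;
}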
+ */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) +{ + struct btrfs_key ins; + struct btrfs_block_rsv *block_rsv; + struct extent_buffer *buf; + u64 flags = 0; + int ret; + + + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + empty_size, hint, (u64)-1, &ins, 0); + if (ret) { + unuse_block_rsv(block_rsv, blocksize); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); + BUG_ON(IS_ERR(buf)); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } + return buf; +} + +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + u64 bytenr; + u64 generation; + u64 refs; + u64 flags; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; + + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); + } + + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); + + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; + + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); + + if (slot == path->slots[wc->level]) + goto reada; + + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; + + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; + + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + 
continue; + } +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) + break; + nread++; + } + wc->reada_slot = slot; +} + +/* + * hepler to process tree block while walking down the tree. + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int lookup_info) +{ + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, 0); + BUG_ON(ret); + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; +} + +/* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. 
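reada_walk_down(), earlier in this hunk, adapts its readahead window: it shrinks the count by a third when the previous prefetch window has not been consumed yet and grows it by half otherwise, clamped between 2 and the number of pointers per node. A small sketch of that adjustment, with a made-up upper bound in place of BTRFS_NODEPTRS_PER_BLOCK():

#include <stdio.h>

#define MIN_READA	2
#define MAX_READA	493	/* hypothetical pointers-per-node upper bound */

static int adjust_reada(int count, int prev_window_unconsumed)
{
	if (prev_window_unconsumed) {
		count = count * 2 / 3;		/* shrink the window */
		return count < MIN_READA ? MIN_READA : count;
	}
	count = count * 3 / 2;			/* grow the window */
	return count > MAX_READA ? MAX_READA : count;
}

int main(void)
{
	printf("%d %d\n", adjust_reada(30, 1), adjust_reada(30, 0));
	return 0;
}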
+ */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + if (!next) + return -EIO; + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* + * hepler to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. 
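Taken together, walk_down_proc() and do_walk_down() implement the DROP_REFERENCE rule: a child whose reference count is still above one only loses a reference and is not descended into, while a child holding its last reference is walked and freed. A toy in-memory version of that rule, with an invented node type and a fan-out of two instead of a full btree node:

#include <stdio.h>
#include <stddef.h>

struct node {
	int refs;
	struct node *child[2];	/* toy fan-out of 2 */
};

/* Drop one reference; only walk into the subtree if we held the last one. */
static void drop_subtree(struct node *n)
{
	size_t i;

	if (!n)
		return;
	if (--n->refs > 0)
		return;		/* shared with another tree: stop here */
	for (i = 0; i < 2; i++)
		drop_subtree(n->child[i]);
	printf("freeing node %p\n", (void *)n);
}

int main(void)
{
	struct node shared = { .refs = 2 };
	struct node leaf   = { .refs = 1 };
	struct node root   = { .refs = 1, .child = { &shared, &leaf } };

	drop_subtree(&root);	/* frees root and leaf, leaves shared alone */
	return 0;
}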
+ */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. + */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + return 1; + } + } + } + + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + } + /* make block locked assertion in clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + } + clean_tree_block(trans, root, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(eb)); + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])); + } + + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return 0; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + int lookup_info = 1; + int ret; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } else if (ret < 0) + return ret; + level = wc->level; + } + return 0; +} + +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int max_level) +{ + int level = wc->level; + int ret; + + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; + return 0; + } else { + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; + + if (path->locks[level]) { + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; + } + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + 
level++; + } + } + return 1; +} + +/* + * drop a subvolume tree. + * + * this function traverses the tree freeing any blocks that only + * referenced by the tree. + * + * when a shared tree block is found. this function decreases its + * reference count by one. if update_ref is true, this function + * also make sure backrefs for the shared block and all lower level + * blocks are properly updated. + */ +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) +{ + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + + trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + + if (block_rsv) + trans->block_rsv = block_rsv; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); + path->slots[level] = 0; + path->locks[level] = 1; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking(path->nodes[level]); + + ret = btrfs_lookup_extent_info(trans, root, + path->nodes[level]->start, + path->nodes[level]->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + + if (level == root_item->drop_level) + break; + + btrfs_tree_unlock(path->nodes[level]); + WARN_ON(wc->refs[level] != 1); + level--; + } + } + + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; + break; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); + break; + } + + if (wc->stage == DROP_REFERENCE) { + level = wc->level; + btrfs_node_key(path->nodes[level], + &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + } + + BUG_ON(wc->level == 0); + if (btrfs_should_end_transaction(trans, tree_root)) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + BUG_ON(ret); + + btrfs_end_transaction_throttle(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + if (block_rsv) + trans->block_rsv = block_rsv; + } + } + btrfs_release_path(path); + BUG_ON(err); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = 
btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } +out: + btrfs_end_transaction_throttle(trans, tree_root); + kfree(wc); + btrfs_free_path(path); + return err; +} + +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + */ +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + struct walk_control *wc; + int level; + int parent_level; + int ret = 0; + int wret; + + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } + + btrfs_assert_tree_locked(parent); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + btrfs_assert_tree_locked(node); + level = btrfs_header_level(node); + path->nodes[level] = node; + path->slots[level] = 0; + path->locks[level] = 1; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { + ret = wret; + break; + } + + wret = walk_up_tree(trans, root, path, wc, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + kfree(wc); + btrfs_free_path(path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. 
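+ * (e.g. a two device raid1 with one device missing should still be
+ * treated as a multi-device filesystem below).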
+ */ + num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int set_block_group_ro(struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; + + if (cache->ro) + return 0; + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + sinfo->bytes_reserved += cache->reserved_pinned; + cache->reserved_pinned = 0; + cache->ro = 1; + ret = 0; + } + + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; +} + +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) + +{ + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; + + BUG_ON(cache->ro); + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + + ret = set_block_group_ro(cache); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache); +out: + btrfs_end_transaction(trans, root); + return ret; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); +} + +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. 
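+ * (raid1, raid10 and dup groups keep two copies of every byte, so their
+ * unused space is counted twice to reflect raw disk usage).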
+ */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return 0; +} + +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; + + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) + goto out; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + + full = space_info->full; + + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. 
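+ * (the loop below only checks whether some device has a free extent
+ * large enough to hold the used bytes of this block group).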
+ */ + ret = -1; + if (full) + goto out; + + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset; + + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, NULL); + if (!ret) + break; + ret = -1; + } + } + mutex_unlock(&root->fs_info->chunk_mutex); +out: + btrfs_put_block_group(block_group); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } +out: + return ret; +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = next_block_group(info->tree_root, + block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->extent_commit_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->extent_commit_sem); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. 
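+ * Drop those excluded ranges here before the block group is freed.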
+ */ + if (block_group->cached == BTRFS_CACHE_NO) + free_excluded_extents(info->extent_root, block_group); + + btrfs_remove_free_space_cache(block_group); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + release_global_block_rsv(info); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } + list_del(&space_info->list); + kfree(space_info); + } + return 0; +} + +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen != 0 && + btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(root, CLEAR_CACHE)) + need_clear = 1; + if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + goto error; + } + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + ret = -ENOMEM; + goto error; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + cache->fs_info = info; + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + + if (need_clear) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(path); + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + btrfs_init_free_space_ctl(cache); + + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. 
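+ * (exclude_super_stripes() also fills in cache->bytes_super, which is
+ * added to the space_info's readonly bytes below).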
+ */ + exclude_super_stripes(root, cache); + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_readonly += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + __link_block_group(space_info, cache); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_ro(cache); + } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, &space_info->block_groups[3], list) + set_block_group_ro(cache); + list_for_each_entry(cache, &space_info->block_groups[4], list) + set_block_group_ro(cache); + } + + init_global_block_rsv(info); + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_log_full_commit = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return -ENOMEM; + } + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + + btrfs_init_free_space_ctl(cache); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + + free_excluded_extents(root, cache); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + + spin_lock(&cache->space_info->lock); 
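+ /* account the new group's super stripe bytes as readonly space */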
+ cache->space_info->bytes_readonly += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + __link_block_group(cache->space_info, cache); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_key key; + struct inode *inode; + int ret; + int factor; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + free_excluded_extents(root, block_group); + + memcpy(&key, &block_group->key, sizeof(key)); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = lookup_free_space_inode(root, block_group, path); + if (!IS_ERR(inode)) { + btrfs_orphan_add(trans, inode); + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; + spin_unlock(&block_group->space_info->lock); + + 
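+ /* restore the block group key, it is used below to delete the
+ * block group item from the extent tree */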
memcpy(&key, &block_group->key, sizeof(key)); + + btrfs_clear_space_info_full(root->fs_info); + + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return 1; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } +out: + return ret; +} + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 group_trimmed; + u64 start; + u64 end; + u64 trimmed = 0; + int ret = 0; + + cache = btrfs_lookup_block_group(fs_info, range->start); + + while (cache) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->key.objectid); + end = min(range->start + range->len, + cache->key.objectid + cache->key.offset); + + if (end - start >= range->minlen) { + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, NULL, root, 0); + if (!ret) + wait_block_group_cache_done(cache); + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + + cache = next_block_group(fs_info->tree_root, cache); + } + + range->len = trimmed; + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 00000000..7055d11c --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3890 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#if LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* 
tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping) +{ + tree->state = RB_ROOT; + INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#if LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#if LEAK_DEBUG + unsigned long flags; +#endif + WARN_ON(state->tree); +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + 
struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + state = NULL; + } + } + + return 0; +} + +static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, int *bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + return tree->ops->set_bit_hook(tree->mapping->host, + state, bits); + } + + return 0; +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, int *bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. 
This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int *bits) +{ + struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; + int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + state->start = start; + state->end = end; + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + + if (bits_to_set & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + state->state |= bits_to_set; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + + split_cb(tree, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). 
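+ * the return value is the subset of the requested bits that were
+ * actually set on the state.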
+ * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + int *bits, int wake) +{ + int bits_to_clear = *bits & ~EXTENT_CTLBITS; + int ret = state->state & bits_to_clear; + + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + if (state->tree) { + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +static struct extent_state * +alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; + int set = 0; + int clear = 0; + + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && cached->tree && cached->start == start) { + if (clear) + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. 
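+ *
+ * Each split consumes 'prealloc', so it is reallocated on the next pass
+ * through the loop when another split is needed.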
+ */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set |= clear_state_bit(tree, state, &bits, wake); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + if (wake) + wake_up(&state->wq); + + set |= clear_state_bit(tree, prealloc, &bits, wake); + + prealloc = NULL; + goto out; + } + + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + set |= clear_state_bit(tree, state, &bits, wake); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. 
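+ * Each matching state is waited on in turn until none of the given bits
+ * remain set anywhere in the range.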
+ * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int *bits) +{ + int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + state->state |= bits_to_set; + + return 0; +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } +} + +static void uncache_state(struct extent_state **cached_ptr) +{ + if (cached_ptr && (*cached_ptr)) { + struct extent_state *state = *cached_ptr; + *cached_ptr = NULL; + free_extent_state(state); + } +} + +/* + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. + * + * [start, end] is inclusive This takes the tree lock. + */ + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + + bits |= EXTENT_FIRST_DELALLOC; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + BUG_ON(!prealloc); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* + * this search will find all the extents that end after + * our range starts. 
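+ * tree_search() returns the first state whose end is >= start, or NULL
+ * when no such state exists and a new one must be inserted.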
+ */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + + err = set_state_bits(tree, state, &bits); + if (err) + goto out; + + next_node = rb_next(node); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + err = set_state_bits(tree, state, &bits); + if (err) + goto out; + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. 
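+ * The extra reference taken below keeps 'prealloc' alive across
+ * insert_state() so it can still be passed to cache_state() afterwards.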
+ */ + atomic_inc(&prealloc->refs); + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); + if (err) { + free_extent_state(prealloc); + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); + free_extent_state(prealloc); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + err = set_state_bits(tree, prealloc, &bits); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + NULL, mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 0, NULL, cached_state, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + NULL, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, + NULL, cached_state, mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
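+ * On -EEXIST with a sleeping mask, wait for EXTENT_LOCKED to clear at the
+ * conflicting offset and retry from there.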
+ */ +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached_state, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, NULL, mask); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; +} + +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + atomic_inc(&state->refs); + } + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + 
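+ /* drop the reference taken by find_get_pages_contig(), the page
+ * itself stays locked */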
page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + free_extent_state(cached_state); + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1, cached_state); + if (!ret) { + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + free_extent_state(cached_state); + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + unsigned long op) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; + if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + + if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 
+ EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + + if (op & EXTENT_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); + if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); + if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); + if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits, int contig) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. 
Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. 
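+ *
+ * If a writepage_io_failed_hook is present it may claim a failed
+ * range, in which case the error handling below is skipped for it.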
+ */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + struct extent_state *cached = NULL; + struct extent_state *state; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (++bvec <= bvec_end) + prefetchw(&bvec->bv_page->flags); + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); + if (state && state->start == start) { + /* + * take a reference on the state, unlock will drop + * the ref + */ + cache_state(state, &cached); + } + spin_unlock(&tree->lock); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + state); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + uncache_state(&cached); + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, &cached, + GFP_ATOMIC); + } + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + 
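+ /*
+ * this bio covered only part of the page: per-extent state
+ * decides uptodate, and check_page_locked unlocks the page
+ * once no locked extents remain for it
+ */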
if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec <= bvec_end); + + bio_put(bio); +} + +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags, start); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void set_page_extent_head(struct page *page, unsigned long len) +{ + WARN_ON(!PagePrivate(page)); + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. 
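Ordered extents covering the page are waited on first, and holes are zeroed without issuing reads.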
Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + struct btrfs_ordered_extent *ordered; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + if (!PageUptodate(page)) { + if (cleancache_get_page(page) == 0) { + BUG_ON(blocksize != PAGE_SIZE); + goto out; + } + } + + end = page_end; + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + struct extent_state *cached = NULL; + + iosize = PAGE_CACHE_SIZE - pg_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + break; + } + em = get_extent(inode, page, pg_offset, cur, + end - cur + 1, 0); + if (IS_ERR_OR_NULL(em)) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + struct extent_state *cached = NULL; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + cur = cur 
+ iosize; + pg_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, pg_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + pg_offset += iosize; + } +out: + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + ret = submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. 
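Delalloc ranges are locked and passed to the fill_delalloc hook before the dirty extents are mapped and submitted.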
Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + sector_t sector; + struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + ret = 0; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. 
+ */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; + } + } + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); + ret = 0; + goto done_unlocked; + } + } + + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; + if (last_byte <= start) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + goto done; + } + + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR_OR_NULL(em)) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + unlock_page(page); + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 
+ * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } +retry: + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret) + done = 1; + + /* + * the filesystem may choose to bump up nr_to_write. 
+ * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static void flush_epd_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .sync_mode = wbc->sync_mode, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + flush_epd_write_bio(&epd); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + unsigned long bio_flags = 0; + + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache_lru(page, mapping, 
+ page->index, GFP_NOFS)) { + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + struct extent_state *cached_state = NULL; + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, &cached_state, GFP_NOFS); + return 0; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + ret = clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (IS_ERR_OR_NULL(em)) { + write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +/* + * helper function for fiemap, which doesn't want to see any holes. 
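Hole extents returned by get_extent are freed and skipped over.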
+ * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (IS_ERR_OR_NULL(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret = 0; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u32 found_type; + u64 last; + u64 last_for_get_extent = 0; + u64 disko = 0; + u64 isize = i_size_read(inode); + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_file_extent_item *item; + int end = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; + unsigned long emflags; + + if (len == 0) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, btrfs_ino(inode), -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, but there might be delalloc bits */ + if (found_key.objectid != btrfs_ino(inode) || + found_type != BTRFS_EXTENT_DATA_KEY) { + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; + } + btrfs_free_path(path); + + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + &cached_state, GFP_NOFS); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + while (!end) { + u64 offset_in_extent; + + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; + + /* + * get_extent may return an extent that starts before our + * requested range. 
We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; + emflags = em->flags; + disko = 0; + flags = 0; + + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + + if (em->block_start == EXTENT_MAP_LAST_BYTE) { + end = 1; + flags |= FIEMAP_EXTENT_LAST; + } else if (em->block_start == EXTENT_MAP_INLINE) { + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + } else if (em->block_start == EXTENT_MAP_DELALLOC) { + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } else { + disko = em->block_start + offset_in_extent; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + free_extent_map(em); + em = NULL; + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } +out_free: + free_extent_map(em); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + &cached_state, GFP_NOFS); + return ret; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. + */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#if LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; + eb->start = start; + eb->len = len; + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +/* + * Helper for releasing extent buffer page. 
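+ * Drops the page cache reference of every page from start_idx to
+ * the end of the buffer.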
+ */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + struct page *page; + + if (!eb->first_page) + return; + + index = num_extent_pages(eb->start, eb->len); + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page) + page_cache_release(page); + } while (index != start_idx); +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + int ret; + + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_page_accessed(eb->first_page); + return eb; + } + rcu_read_unlock(); + + eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + if (i != 0) + unlock_page(p); + } + if (uptodate) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + + spin_lock(&tree->buffer_lock); + ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + if (ret == -EEXIST) { + exists = radix_tree_lookup(&tree->buffer, + start >> PAGE_CACHE_SHIFT); + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + goto free_eb; + } + /* add one reference for the tree */ + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. 
+ * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); + return eb; + +free_eb: + if (eb->first_page && !page0) + unlock_page(eb->first_page); + + if (!atomic_dec_and_test(&eb->refs)) + return exists; + btrfs_release_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_page_accessed(eb->first_page); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + struct page *page; + + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageDirty(page)) + continue; + + lock_page(page); + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + int was_dirty = 0; + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + return was_dirty; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state **cached_state) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + 
return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state *cached_state) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 
1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + return -EINVAL; + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + 
kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) +{ + unsigned long distance = (src > dst) ? 
src - dst : dst - src; + return distance < len; +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) { + src_kaddr = kmap_atomic(src_page, KM_USER1); + } else { + src_kaddr = dst_kaddr; + BUG_ON(areas_overlap(src_off, dst_off, len)); + } + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (!areas_overlap(src_offset, dst_offset, len)) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, 
rcu_head); + + btrfs_release_extent_buffer(eb); +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer *eb; + int ret = 1; + + spin_lock(&tree->buffer_lock); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; + } + + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } + + /* + * set @eb->refs to 0 if it is already 1, and then release the @eb. + * Or go back. + */ + if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { + ret = 0; + goto out; + } + + radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); +out: + spin_unlock(&tree->buffer_lock); + + /* at this point we can safely release the extent buffer */ + if (atomic_read(&eb->refs) == 0) + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 00000000..a11a92ee --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,300 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) + +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ +#define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 + +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 +#define EXTENT_BUFFER_DIRTY 2 +#define EXTENT_BUFFER_CORRUPT 3 + +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
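+ *
+ * (Roughly, as the helpers in extent_io.c appear to use these values:
+ * set_page_extent_mapped() tags an ordinary data page with
+ * EXTENT_PAGE_PRIVATE, while set_page_extent_head() tags the first page
+ * of an extent buffer with EXTENT_PAGE_PRIVATE_FIRST_PAGE.)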
+ */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct radix_tree_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + u64 split_start; + u64 split_end; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char *map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + unsigned long bflags; + struct list_head leak_list; + struct rcu_head rcu_head; + atomic_t refs; + + /* the spinlock is used to protect most operations */ + spinlock_t lock; + + /* + * when we keep the lock held while blocking, waiters go onto + * the wq + */ + wait_queue_head_t lock_wq; +}; + +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + +struct extent_map_tree; + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t pg_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + 
gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits, int contig); + +void free_extent_state(struct extent_state *state); +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled, struct extent_state *cached_state); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, struct extent_state **cached, + gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len); +void free_extent_buffer(struct extent_buffer *eb); +int 
read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state **cached_state); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state *cached_state); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + unsigned long op); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000..2d041034 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,418 @@ +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_map.h" + + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. 
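+ *
+ * A minimal caller sketch (illustrative; the embedding variable belongs to
+ * the caller and is not defined in this file):
+ *
+ *	struct extent_map_tree em_tree;
+ *
+ *	extent_map_tree_init(&em_tree);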
+ */ +void extent_map_tree_init(struct extent_map_tree *tree) +{ + tree->map = RB_ROOT; + rwlock_init(&tree->lock); +} + +/** + * alloc_extent_map - allocate new extent map structure + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(void) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); + if (!em) + return NULL; + em->in_tree = 0; + em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + atomic_set(&em->refs, 1); + return em; +} + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + 
(next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was successful. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. 
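+ *
+ * A minimal caller sketch (illustrative only; "em_tree", "start" and "len"
+ * stand for the caller's own tree and byte range):
+ *
+ *	read_lock(&em_tree->lock);
+ *	em = lookup_extent_mapping(em_tree, start, len);
+ *	read_unlock(&em_tree->lock);
+ *	if (em) {
+ *		... use em->block_start up to extent_map_end(em) ...
+ *		free_extent_map(em);
+ *	}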
+ */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_CAST(rb_node); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_CAST(rb_node); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. 
No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 00000000..33a7890b --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,66 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + unsigned int in_tree:1; + unsigned int compress_type:4; +}; + +struct extent_map_tree { + struct rb_root map; + rwlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(void); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 00000000..90d4ee52 --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,869 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset = 0; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; + while (bio_index < bio->bi_vcnt) { + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %llu start %llu\n", + (unsigned long long) + btrfs_ino(inode), + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long 
offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + if (search_commit) { + path->skip_locking = 1; + path->reada = 2; + path->search_commit_root = 1; + } + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + 
sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. 
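+ *
+ * Roughly three cases are handled for each csum item that overlaps
+ * [bytenr, bytenr + len):
+ *   - the item lies entirely inside the range: it is deleted outright;
+ *   - the range sits in the middle of the item: the covered csum bytes are
+ *     zeroed and the item is split with btrfs_split_item(), then the loop
+ *     comes around and truncates the now well-formed pieces;
+ *   - the item overlaps only one end of the range: truncate_one_csum()
+ *     shrinks it from the appropriate side.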
+ */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + break; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. 
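+ *
+ * As a worked example (illustrative numbers only): with 4K blocks and
+ * 4-byte crc32c checksums, this zeroes shift_len = (len >> 12) * 4 bytes
+ * of csum data, starting offset = ((bytenr - key.offset) >> 12) * 4 bytes
+ * into the item, and then splits the item at that offset.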
+ */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(path); + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. 
Grow it + */ + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + goto csum; + } + +insert: + btrfs_release_path(path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + path->leave_spinning = 0; + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + if (total_bytes < sums->len) { + btrfs_release_path(path); + cond_resched(); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 
00000000..fa4ef18b --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1683 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + +/* + * when auto defrag is enabled we + * queue up these defrag structs to remember which + * inodes need defragging passes + */ +struct inode_defrag { + struct rb_node rb_node; + /* objectid */ + u64 ino; + /* + * transid where the defrag was added, we search for + * extents newer than this + */ + u64 transid; + + /* root objectid */ + u64 root; + + /* last offset we were able to defrag */ + u64 last_offset; + + /* if we've wrapped around back to zero once already */ + int cycled; +}; + +/* pop a record for an inode into the defrag tree. The lock + * must be held already + * + * If you're inserting a record for an older transid than an + * existing record, the transid already in the tree is lowered + * + * If an existing record is found the defrag item you + * pass in is freed + */ +static int __btrfs_add_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + p = &root->fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + if (defrag->ino < entry->ino) + p = &parent->rb_left; + else if (defrag->ino > entry->ino) + p = &parent->rb_right; + else { + /* if we're reinserting an entry for + * an old defrag run, make sure to + * lower the transid of our existing record + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + if (defrag->last_offset > entry->last_offset) + entry->last_offset = defrag->last_offset; + goto exists; + } + } + BTRFS_I(inode)->in_defrag = 1; + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); + return 0; + +exists: + kfree(defrag); + return 0; + +} + +/* + * insert a defrag record for this inode if auto defrag is + * enabled + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *defrag; + int ret = 0; + u64 transid; + + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + + if (BTRFS_I(inode)->in_defrag) + return 0; + + if (trans) + transid = trans->transid; + else + transid = BTRFS_I(inode)->root->last_trans; + + defrag = kzalloc(sizeof(*defrag), GFP_NOFS); + if (!defrag) + 
return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + + spin_lock(&root->fs_info->defrag_inodes_lock); + if (!BTRFS_I(inode)->in_defrag) + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + return ret; +} + +/* + * must be called with the defrag_inodes lock held + */ +struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, + struct rb_node **next) +{ + struct inode_defrag *entry = NULL; + struct rb_node *p; + struct rb_node *parent = NULL; + + p = info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + if (ino < entry->ino) + p = parent->rb_left; + else if (ino > entry->ino) + p = parent->rb_right; + else + return entry; + } + + if (next) { + while (parent && ino > entry->ino) { + parent = rb_next(parent); + entry = rb_entry(parent, struct inode_defrag, rb_node); + } + *next = parent; + } + return NULL; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct btrfs_root *inode_root; + struct inode *inode; + struct rb_node *n; + struct btrfs_key key; + struct btrfs_ioctl_defrag_range_args range; + u64 first_ino = 0; + int num_defrag; + int defrag_batch = 1024; + + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + + atomic_inc(&fs_info->defrag_running); + spin_lock(&fs_info->defrag_inodes_lock); + while(1) { + n = NULL; + + /* find an inode to defrag */ + defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); + if (!defrag) { + if (n) + defrag = rb_entry(n, struct inode_defrag, rb_node); + else if (first_ino) { + first_ino = 0; + continue; + } else { + break; + } + } + + /* remove it from the rbtree */ + first_ino = defrag->ino + 1; + rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); + + if (btrfs_fs_closing(fs_info)) + goto next_free; + + spin_unlock(&fs_info->defrag_inodes_lock); + + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) + goto next; + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) + goto next; + + /* do a chunk of defrag */ + BTRFS_I(inode)->in_defrag = 0; + range.start = defrag->last_offset; + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + defrag_batch); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == defrag_batch) { + defrag->last_offset = range.start; + __btrfs_add_inode_defrag(inode, defrag); + /* + * we don't want to kfree defrag, we added it back to + * the rbtree + */ + defrag = NULL; + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. 
+ */ + defrag->last_offset = 0; + defrag->cycled = 1; + __btrfs_add_inode_defrag(inode, defrag); + defrag = NULL; + } + + iput(inode); +next: + spin_lock(&fs_info->defrag_inodes_lock); +next_free: + kfree(defrag); + } + spin_unlock(&fs_info->defrag_inodes_lock); + + atomic_dec(&fs_info->defrag_running); + + /* + * during unmount, we use the transaction_wait queue to + * wait for the defragger to stop + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + size_t write_bytes, + struct page **prepared_pages, + struct iov_iter *i) +{ + size_t copied = 0; + size_t total_copied = 0; + int pg = 0; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + while (write_bytes > 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[pg]; + /* + * Copy data from userspace to the current page + * + * Disable pagefault to avoid recursive lock since + * the pages are already locked + */ + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + pagefault_enable(); + + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + + iov_iter_advance(i, copied); + write_bytes -= copied; + total_copied += copied; + + /* Return to btrfs_file_aio_write to fault page */ + if (unlikely(copied == 0)) + break; + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } + } + return total_copied; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. 
+ */ +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) +{ + int err = 0; + int i; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + cached); + if (err) + return err; + + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) + i_size_write(inode, end_pos); + return 0; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(); + if (!split2) + split2 = alloc_extent_map(); + BUG_ON(!split || !split2); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (testend && em->start + em->len >= start + len) { + free_extent_map(em); + write_unlock(&em_tree->lock); + break; + } + start = em->start + em->len; + if (testend) + len = start + len - (em->start + em->len); + free_extent_map(em); + write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + split->compress_type = em->compress_type; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + split->compress_type = em->compress_type; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + 
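/*
 * A rough worked example of the split arithmetic above, with made-up
 * numbers and an uncompressed extent: if the cached mapping has
 * em->start = 0, em->len = 8192, em->block_start = 1M and the range
 * being dropped is start = 2048, end = 4095 (len = 2048), the first
 * split keeps [0, 2048) with block_start = 1M, and this second split
 * keeps [4096, 8192) with diff = start + len - em->start = 4096 and
 * block_start = 1M + 4096.  Compressed extents instead keep the
 * original block_start/block_len, since the compressed on-disk bytes
 * can't be sliced at an arbitrary logical offset.
 */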
free_extent_map(split); + split = NULL; + } + write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + */ +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key new_key; + u64 ino = btrfs_ino(inode); + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; + int recow; + int ret; + + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + recow = 0; + ret = btrfs_lookup_file_extent(trans, root, path, ino, + search_start, -1); + if (ret < 0) + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + ret = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + WARN_ON(1); + extent_end = search_start; + } + + if (extent_end <= search_start) { + path->slots[0]++; + goto next_slot; + } + + search_start = max(key.offset, start); + if (recow) { + btrfs_release_path(path); + continue; + } + + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + continue; + } + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + 
start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (disk_bytenr > 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset); + BUG_ON(ret); + *hint_byte = disk_bytenr; + } + key.offset = start; + } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, end - key.offset); + *hint_byte = disk_bytenr; + } + break; + } + + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, extent_end - start); + *hint_byte = disk_bytenr; + } + if (end == extent_end) + break; + + path->slots[0]++; + goto next_slot; + } + + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + inode_sub_bytes(inode, + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (disk_bytenr > 0) { + ret = btrfs_free_extent(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + key.objectid, key.offset - + extent_offset); + BUG_ON(ret); + inode_sub_bytes(inode, + extent_end - key.offset); + *hint_byte = disk_bytenr; + } + + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + BUG_ON(ret); + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(path); + continue; + } + + BUG_ON(1); + } + + if (del_nr > 0) { + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } + + btrfs_free_path(path); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || + btrfs_file_extent_compression(leaf, fi) || + 
btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. + */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key new_key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 orig_offset; + u64 other_start; + u64 other_end; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; + int ret; + u64 ino = btrfs_ino(inode); + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + recow = 0; + split = start; + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = split; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); + + if (start == key.offset && end < extent_end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + if (start > key.offset && end == extent_end) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; + 
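/*
 * A small worked example of this loop, with made-up offsets: for a
 * preallocated extent at key.offset = 0, extent_end = 16384 and a
 * written range start = 4096, end = 8192, the first pass duplicates
 * the item at split = start, the second pass (key.offset is now 4096,
 * so split becomes end) duplicates again at 8192, and we end up with
 * three items: [0, 4096) prealloc, [4096, 8192) about to be flipped
 * to regular below, and [8192, 16384) prealloc.  If the written range
 * touches an edge of the extent the loop runs only once, and the
 * extent_mergeable() checks before and after the loop try to fold the
 * written piece into a neighbouring regular extent instead.
 */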
+ new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + goto again; + } + BUG_ON(ret < 0); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + ino, orig_offset); + BUG_ON(ret); + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; + } + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + ino, orig_offset); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + ino, orig_offset); + BUG_ON(ret); + } + if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } +out: + btrfs_free_path(path); + return 0; +} + +/* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. 
+ */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + struct extent_state *cached_state = NULL; + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + int faili = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + if (err) + return err; + } + +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; + } + wait_on_page_writeback(pages[i]); + } + err = 0; + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent_bits(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, + &cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, + GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, &cached_state, + GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +static noinline ssize_t __btrfs_buffered_write(struct file *file, + struct iov_iter *i, + loff_t pos) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + unsigned long first_index; + unsigned long last_index; + size_t num_written = 0; + int nrptrs; + int ret = 0; + + nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT; + + while (iov_iter_count(i) > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(iov_iter_count(i), + nrptrs * (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t dirty_pages; + size_t copied; + + WARN_ON(num_pages > nrptrs); + + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if 
(unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + ret = -EFAULT; + break; + } + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); + if (ret) + break; + + /* + * This is going to setup the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) { + btrfs_delalloc_release_space(inode, + num_pages << PAGE_CACHE_SHIFT); + break; + } + + copied = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, i); + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + /* + * If we had a short copy we need to release the excess delaloc + * bytes we reserved. We need to increment outstanding_extents + * because btrfs_delalloc_release_space will decrement it, but + * we still have an outstanding extent for the chunk we actually + * managed to copy. + */ + if (num_pages > dirty_pages) { + if (copied > 0) + atomic_inc( + &BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT); + } + + if (copied > 0) { + ret = btrfs_dirty_pages(root, inode, pages, + dirty_pages, pos, copied, + NULL); + if (ret) { + btrfs_delalloc_release_space(inode, + dirty_pages << PAGE_CACHE_SHIFT); + btrfs_drop_pages(pages, num_pages); + break; + } + } + + btrfs_drop_pages(pages, num_pages); + + cond_resched(); + + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_pages); + if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + + pos += copied; + num_written += copied; + } + + kfree(pages); + + return num_written ? num_written : ret; +} + +static ssize_t __btrfs_direct_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + loff_t *ppos, size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct iov_iter i; + ssize_t written; + ssize_t written_buffered; + loff_t endbyte; + int err; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + count, ocount); + + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + + if (written < 0 || written == count) + return written; + + pos += written; + count -= written; + iov_iter_init(&i, iov, nr_segs, count, written); + written_buffered = __btrfs_buffered_write(file, &i, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + *ppos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); +out: + return written ? 
written : err; +} + +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + loff_t *ppos = &iocb->ki_pos; + ssize_t num_written = 0; + ssize_t err = 0; + size_t count, ocount; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + mutex_lock(&inode->i_mutex); + + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + count = ocount; + + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + if (count == 0) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + err = file_remove_suid(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + mutex_unlock(&inode->i_mutex); + err = -EROFS; + goto out; + } + + file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(iocb, iov, nr_segs, + pos, ppos, count, ocount); + } else { + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, num_written); + + num_written = __btrfs_buffered_write(file, &i, pos); + if (num_written > 0) + *ppos = pos + num_written; + } + + mutex_unlock(&inode->i_mutex); + + /* + * we want to make sure fsync finds this change + * but we haven't joined a transaction running right now. + * + * Later on, someone is sure to update the inode and get the + * real transid recorded. + * + * We set last_trans now to the fs_info generation + 1, + * this will either be one more than the running transaction + * or the generation used for the next transaction if there isn't + * one running right now. + */ + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + if (num_written > 0 || num_written == -EIOCBQUEUED) { + err = generic_write_sync(file, pos, num_written); + if (err < 0 && num_written > 0) + num_written = err; + } +out: + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (BTRFS_I(inode)->ordered_data_close) { + BTRFS_I(inode)->ordered_data_close = 0; + btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(inode->i_mapping); + } + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. 
This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + trace_btrfs_sync_file(file, datasync); + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ + smp_mb(); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + goto out; + } + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&dentry->d_inode->i_mutex); + + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); +out: + return ret > 0 ? -EIO : ret; +} + +static const struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + return 0; +} + +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. 
+ */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR_OR_NULL(em)); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +const struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .aio_write = btrfs_file_aio_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 00000000..bf0d6156 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,2693 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" +#include "disk-io.h" +#include "extent_io.h" +#include "inode-map.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); + +static struct inode *__lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_path *path, + u64 offset) +{ + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(path); + + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + + inode->i_mapping->flags &= ~__GFP_FS; + + return inode; +} + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path) +{ + struct inode *inode = NULL; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, + block_group->key.objectid); + if (IS_ERR(inode)) + return inode; + + spin_lock(&block_group->lock); + if (!btrfs_fs_closing(root->fs_info)) { + block_group->inode = igrab(inode); + block_group->iref = 1; + } + spin_unlock(&block_group->lock); + + return inode; +} + +int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 ino, u64 offset) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int ret; + + ret = btrfs_insert_empty_inode(trans, root, path, ino); + if (ret) + return ret; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + 
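/*
 * A brief sketch of what ends up on disk for a space cache, as far as
 * this file uses it: the inode item built here describes a plain 0600
 * regular file flagged NOCOMPRESS | PREALLOC | NODATASUM, so the cache
 * data is never compressed or covered by the csum tree -- it carries
 * its own per-page checksums in its first page instead.  The second
 * item, inserted just below with objectid BTRFS_FREE_SPACE_OBJECTID,
 * type 0 and offset equal to the block group offset, is the
 * btrfs_free_space_header recording the entry count, bitmap count and
 * generation that __load_free_space_cache later verifies.
 */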
btrfs_set_inode_block_group(leaf, inode_item, offset); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; +} + +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + int ret; + u64 ino; + + ret = btrfs_find_free_objectid(root, &ino); + if (ret < 0) + return ret; + + return __create_free_space_inode(root, trans, path, ino, + block_group->key.objectid); +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; + + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, + 0, 5); + if (ret) + return ret; + + oldsize = i_size_read(inode); + btrfs_i_size_write(inode, 0); + truncate_pagecache(inode, oldsize, 0); + + /* + * We don't need an orphan item because truncating the free space cache + * will never be split across transactions. + */ + ret = btrfs_truncate_inode_items(trans, root, inode, + 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + WARN_ON(1); + return ret; + } + + ret = btrfs_update_inode(trans, root, inode); + return ret; +} + +static int readahead_cache(struct inode *inode) +{ + struct file_ra_state *ra; + unsigned long last_index; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + file_ra_state_init(ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); + + kfree(ra); + + return 0; +} + +int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct page *page; + u32 *checksums = NULL, *crc; + char *disk_crcs = NULL; + struct btrfs_key key; + struct list_head bitmaps; + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u32 cur_crc = ~(u32)0; + pgoff_t index = 0; + unsigned long first_page_offset; + int num_checksums; + int ret = 0; + + INIT_LIST_HEAD(&bitmaps); + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) + goto out; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + else if (ret > 0) { + btrfs_release_path(path); + ret = 0; + goto out; + } + + ret = -1; + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_release_path(path); + + if (BTRFS_I(inode)->generation != generation) { + printk(KERN_ERR "btrfs: free space inode generation (%llu) did" + " not match free 
space cache generation (%llu)\n", + (unsigned long long)BTRFS_I(inode)->generation, + (unsigned long long)generation); + goto out; + } + + if (!num_entries) + goto out; + + /* Setup everything for doing checksumming */ + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; + checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + if (!checksums) + goto out; + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + disk_crcs = kzalloc(first_page_offset, GFP_NOFS); + if (!disk_crcs) + goto out; + + ret = readahead_cache(inode); + if (ret) + goto out; + + while (1) { + struct btrfs_free_space_entry *entry; + struct btrfs_free_space *e; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + int need_loop = 0; + + if (!num_entries && !num_bitmaps) + break; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + goto free_cache; + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + goto free_cache; + } + } + addr = kmap(page); + + if (index == 0) { + u64 *gen; + + memcpy(disk_crcs, addr, first_page_offset); + gen = addr + (sizeof(u32) * num_checksums); + if (*gen != BTRFS_I(inode)->generation) { + printk(KERN_ERR "btrfs: space cache generation" + " (%llu) does not match inode (%llu)\n", + (unsigned long long)*gen, + (unsigned long long) + BTRFS_I(inode)->generation); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc = (u32 *)disk_crcs; + } + entry = addr + start_offset; + + /* First lets check our crc before we do anything fun */ + cur_crc = ~(u32)0; + cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, + PAGE_CACHE_SIZE - start_offset); + btrfs_csum_final(cur_crc, (char *)&cur_crc); + if (cur_crc != *crc) { + printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", + index); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc++; + + while (1) { + if (!num_entries) + break; + + need_loop = 1; + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) { + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + e->offset = le64_to_cpu(entry->offset); + e->bytes = le64_to_cpu(entry->bytes); + if (!e->bytes) { + kunmap(page); + kmem_cache_free(btrfs_free_space_cachep, e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + if (entry->type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + } else { + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kunmap(page); + kmem_cache_free( + btrfs_free_space_cachep, e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + list_add_tail(&e->list, &bitmaps); + } + + 
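/*
 * Layout of the records being parsed here, as this loop interprets
 * them (fields of struct btrfs_free_space_entry):
 *
 *	__le64 offset;	start of the free range
 *	__le64 bytes;	byte count for the entry
 *	u8 type;	BTRFS_FREE_SPACE_EXTENT or BTRFS_FREE_SPACE_BITMAP
 *
 * Extent entries are self-contained; a bitmap entry only queues a
 * placeholder on the local 'bitmaps' list, and its PAGE_CACHE_SIZE
 * payload is copied from a later page that holds nothing but that
 * bitmap (the !need_loop case below).
 */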
num_entries--; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + break; + entry++; + } + + /* + * We read an entry out of this page, we need to move on to the + * next page. + */ + if (need_loop) { + kunmap(page); + goto next; + } + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + e = list_entry(bitmaps.next, struct btrfs_free_space, list); + list_del_init(&e->list); + memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); + kunmap(page); + num_bitmaps--; +next: + unlock_page(page); + page_cache_release(page); + index++; + } + + ret = 1; +out: + kfree(checksums); + kfree(disk_crcs); + return ret; +free_cache: + __btrfs_remove_free_space_cache(ctl); + goto out; +} + +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode; + struct btrfs_path *path; + int ret; + bool matched; + u64 used = btrfs_block_group_used(&block_group->item); + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. + */ + if (btrfs_fs_closing(fs_info)) + return 0; + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + path, block_group->key.objectid); + btrfs_free_path(path); + if (ret <= 0) + goto out; + + spin_lock(&ctl->tree_lock); + matched = (ctl->free_space == (block_group->key.offset - used - + block_group->bytes_super)); + spin_unlock(&ctl->tree_lock); + + if (!matched) { + __btrfs_remove_free_space_cache(ctl); + printk(KERN_ERR "block group %llu has an wrong amount of free " + "space\n", block_group->key.objectid); + ret = -1; + } +out: + if (ret < 0) { + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + ret = 0; + + printk(KERN_ERR "btrfs: failed to load free space cache " + "for block group %llu\n", block_group->key.objectid); + } + + iput(inode); + return ret; +} + +int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct rb_node *node; + struct list_head *pos, *n; + struct page **pages; + struct page *page; + struct extent_state *cached_state = NULL; + struct btrfs_free_cluster *cluster = NULL; + struct extent_io_tree *unpin = NULL; + struct list_head bitmap_list; + struct btrfs_key key; + u64 start, end, len; + u64 bytes = 0; + u32 *crc, *checksums; + unsigned long first_page_offset; + int index = 0, num_pages = 0; + int entries = 0; + int bitmaps = 0; + int ret = -1; + bool next_page = false; + bool 
out_of_space = false; + + INIT_LIST_HEAD(&bitmap_list); + + node = rb_first(&ctl->free_space_offset); + if (!node) + return 0; + + if (!i_size_read(inode)) + return -1; + + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); + + filemap_write_and_wait(inode->i_mapping); + btrfs_wait_ordered_range(inode, inode->i_size & + ~(root->sectorsize - 1), (u64)-1); + + /* make sure we don't overflow that first page */ + if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { + /* this is really the same as running out of space, where we also return 0 */ + printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); + ret = 0; + goto out_update; + } + + /* We need a checksum per page. */ + crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); + if (!crc) + return -1; + + pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + if (!pages) { + kfree(crc); + return -1; + } + + /* Get the cluster for this block_group if it exists */ + if (block_group && !list_empty(&block_group->cluster_list)) + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + /* + * Lock all pages first so we can lock the extent safely. + * + * NOTE: Because we hold the ref the entire time we're going to write to + * the page find_get_page should never fail, so we don't do a check + * after find_get_page at this point. Just putting this here so people + * know and don't freak out. + */ + while (index < num_pages) { + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + int i; + + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + goto out_free; + } + pages[index] = page; + index++; + } + + index = 0; + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state, GFP_NOFS); + + /* + * When searching for pinned extents, we need to start at our start + * offset. 
+ */ + if (block_group) + start = block_group->key.objectid; + + /* Write out the extent entries */ + do { + struct btrfs_free_space_entry *entry; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + + next_page = false; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + if (index >= num_pages) { + out_of_space = true; + break; + } + + page = pages[index]; + + addr = kmap(page); + entry = addr + start_offset; + + memset(addr, 0, PAGE_CACHE_SIZE); + while (node && !next_page) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; + + entry->offset = cpu_to_le64(e->offset); + entry->bytes = cpu_to_le64(e->bytes); + if (e->bitmap) { + entry->type = BTRFS_FREE_SPACE_BITMAP; + list_add_tail(&e->list, &bitmap_list); + bitmaps++; + } else { + entry->type = BTRFS_FREE_SPACE_EXTENT; + } + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + next_page = true; + entry++; + } + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && !next_page && + (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; + break; + } + + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; + + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); + + entries++; + entry->offset = cpu_to_le64(start); + entry->bytes = cpu_to_le64(len); + entry->type = BTRFS_FREE_SPACE_EXTENT; + + start = end + 1; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + next_page = true; + entry++; + } + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr + start_offset, *crc, + PAGE_CACHE_SIZE - start_offset); + kunmap(page); + + btrfs_csum_final(*crc, (char *)crc); + crc++; + + bytes += PAGE_CACHE_SIZE; + + index++; + } while (node || next_page); + + /* Write out the bitmaps */ + list_for_each_safe(pos, n, &bitmap_list) { + void *addr; + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + + if (index >= num_pages) { + out_of_space = true; + break; + } + page = pages[index]; + + addr = kmap(page); + memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); + kunmap(page); + btrfs_csum_final(*crc, (char *)crc); + crc++; + bytes += PAGE_CACHE_SIZE; + + list_del_init(&entry->list); + index++; + } + + if (out_of_space) { + btrfs_drop_pages(pages, num_pages); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, + GFP_NOFS); + ret = 0; + goto out_free; + } + + /* Zero out the rest of the pages just to make sure */ + while (index < num_pages) { + void *addr; + + page = pages[index]; + addr = kmap(page); + memset(addr, 0, PAGE_CACHE_SIZE); + kunmap(page); + bytes += PAGE_CACHE_SIZE; + index++; + } + + /* Write the checksums and trans id to the first page */ + { + void *addr; + u64 *gen; + + page = pages[0]; + + addr = kmap(page); + memcpy(addr, checksums, sizeof(u32) * num_pages); + gen = addr + (sizeof(u32) * num_pages); + *gen = trans->transid; + 
kunmap(page); + } + + ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, + bytes, &cached_state); + btrfs_drop_pages(pages, num_pages); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + if (ret) { + ret = 0; + goto out_free; + } + + BTRFS_I(inode)->generation = trans->transid; + + filemap_write_and_wait(inode->i_mapping); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) { + ret = -1; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + goto out_free; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + BUG_ON(!path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + ret = -1; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, + GFP_NOFS); + btrfs_release_path(path); + goto out_free; + } + } + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + ret = 1; + +out_free: + kfree(checksums); + kfree(pages); + +out_update: + if (ret != 1) { + invalidate_inode_pages2_range(inode->i_mapping, 0, index); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + return ret; +} + +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct inode *inode; + int ret = 0; + + root = root->fs_info->tree_root; + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + path, block_group->key.objectid); + if (ret < 0) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + ret = 0; + + printk(KERN_ERR "btrfs: failed to write free space cache " + "for block group %llu\n", block_group->key.objectid); + } + + iput(inode); + return ret; +} + +static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, + u64 offset) +{ + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div_u64(offset, unit)); +} + +static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) +{ + return (unsigned long)(div_u64(bytes, unit)); +} + +static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; + + bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; + bitmap_start = offset - ctl->start; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += ctl->start; + + return bitmap_start; +} + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int 
bitmap) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) { + p = &(*p)->rb_left; + } else if (offset > info->offset) { + p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_right; + } else { + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_left; + } + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. + * + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. + */ +static struct btrfs_free_space * +tree_search_offset(struct btrfs_free_space_ctl *ctl, + u64 offset, int bitmap_only, int fuzzy) +{ + struct rb_node *n = ctl->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } + + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + + if (offset < entry->offset) + n = n->rb_left; + else if (offset > entry->offset) + n = n->rb_right; + else + break; + } + + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; + + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. 
+ */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; + + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { + /* + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry + */ + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } + } + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); + } else { + if (fuzzy) + return entry; + else + return NULL; + } + } + + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + ctl->unit > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; +} + +static inline void +__unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &ctl->free_space_offset); + ctl->free_extents--; +} + +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + __unlink_free_space(ctl, info); + ctl->free_space -= info->bytes; +} + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + int ret = 0; + + BUG_ON(!info->bitmap && !info->bytes); + ret = tree_insert_offset(&ctl->free_space_offset, info->offset, + &info->offset_index, (info->bitmap != NULL)); + if (ret) + return ret; + + ctl->free_space += info->bytes; + ctl->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_block_group_cache *block_group = ctl->private; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->key.offset; + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + + BUG_ON(ctl->total_bitmaps > max_bitmaps); + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); + + /* + * we want to account for 1 more bitmap than what we have so we can make + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as + * we add more bitmaps. 
+ */ + bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; + + if (bitmap_bytes >= max_bytes) { + ctl->extents_thresh = 0; + return; + } + + /* + * we want the extent entry threshold to always be at most 1/2 the maxw + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + + ctl->extents_thresh = + div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); +} + +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, count; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + BUG_ON(start + count > BITS_PER_BITMAP); + + bitmap_clear(info->bitmap, start, count); + + info->bytes -= bytes; + ctl->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, count; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + BUG_ON(start + count > BITS_PER_BITMAP); + + bitmap_set(info->bitmap, start, count); + + info->bytes += bytes; + ctl->free_space += bytes; +} + +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, ctl->unit, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, ctl->unit); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * ctl->unit) + bitmap_info->offset; + *bytes = (u64)(found_bits) * ctl->unit; + return 0; + } + + return -1; +} + +static struct btrfs_free_space * +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!ctl->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(ctl, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset) +{ + info->offset = offset_to_bitmap(ctl, offset); + info->bytes = 0; + link_free_space(ctl, info); + ctl->total_bitmaps++; + + ctl->op->recalc_thresholds(ctl); +} + +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(ctl, bitmap_info); + kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); +} + +static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, + 
u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; + + /* + * XXX - this can go away after a few releases. + * + * since the only user of btrfs_remove_free_space is the tree logging + * stuff, and the only way to test that is under crash conditions, we + * want to have this debug stuff here just in case something's not + * working. Search the bitmap for the space we are trying to use to + * make sure it's actually there. If it's not there then we need to stop + * because something has gone wrong. + */ + search_start = *offset; + search_bytes = *bytes; + search_bytes = min(search_bytes, end - search_start + 1); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + BUG_ON(ret < 0 || search_start != *offset); + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(ctl, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + return 0; +} + +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_block_group_cache *block_group = ctl->private; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (ctl->free_extents < ctl->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them, we want + * to reserve them to larger extents, however if we have plenty + * of cache left then go ahead and add them, no sense in adding + * the overhead of a bitmap if we don't have to. 
+ */ + if (info->bytes <= block_group->sectorsize * 4) { + if (ctl->free_extents * 2 <= ctl->extents_thresh) + return false; + } else { + return false; + } + } + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return false; + + return true; +} + +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; + int added = 0; + u64 bytes, offset, bytes_added; + int ret; + + bytes = info->bytes; + offset = info->offset; + + if (!ctl->op->use_bitmap(ctl, info)) + return 0; + + if (ctl->op == &free_space_op) + block_group = ctl->private; +again: + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. + */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } + +no_cluster_bitmap: + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(ctl, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&ctl->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!info) { + spin_lock(&ctl->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&ctl->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, info); + } + + return ret; +} + +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, bool update_stat) +{ + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(ctl, offset + 
bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(ctl, offset - 1, 0, 0); + + if (right_info && !right_info->bitmap) { + if (update_stat) + unlink_free_space(ctl, right_info); + else + __unlink_free_space(ctl, right_info); + info->bytes += right_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, right_info); + merged = true; + } + + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { + if (update_stat) + unlink_free_space(ctl, left_info); + else + __unlink_free_space(ctl, left_info); + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, left_info); + merged = true; + } + + return merged; +} + +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&ctl->tree_lock); + + if (try_merge_free_space(ctl, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(ctl, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: + ret = link_free_space(ctl, info); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); +out: + spin_unlock(&ctl->tree_lock); + + if (ret) { + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + BUG_ON(ret == -EEXIST); + } + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; + int ret = 0; + + spin_lock(&ctl->tree_lock); + +again: + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + + BITS_PER_BITMAP * ctl->unit - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out_lock; + } + + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(ctl, info); + if (info->bitmap) { + kfree(info->bitmap); + ctl->total_bitmaps--; + } + kmem_cache_free(btrfs_free_space_cachep, info); + goto out_lock; + } + + if (!info->bitmap && info->offset == offset) { + unlink_free_space(ctl, info); + info->offset += bytes; + info->bytes -= bytes; + link_free_space(ctl, info); + goto out_lock; + } + + if (!info->bitmap && info->offset <= offset 
&& + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(ctl, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(ctl, info); + WARN_ON(ret); + if (ret) + goto out_lock; + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kmem_cache_free(btrfs_free_space_cachep, info); + } + spin_unlock(&ctl->tree_lock); + + /* step two, insert a new info struct to cover + * anything before the hole + */ + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); + WARN_ON(ret); + goto out; + } + + ret = remove_from_bitmap(ctl, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&ctl->tree_lock); +out: + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); + } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = block_group->sectorsize; + ctl->start = block_group->key.objectid; + ctl->private = block_group; + ctl->op = &free_space_op; + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + ctl->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); +} + +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. 
In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->block_group = NULL; + cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + + node = rb_first(&cluster->root); + while (node) { + bool bitmap; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + + bitmap = (entry->bitmap != NULL); + if (!bitmap) + try_merge_free_space(ctl, entry, false); + tree_insert_offset(&ctl->free_space_offset, + entry->offset, &entry->offset_index, bitmap); + } + cluster->root = RB_ROOT; + +out: + spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); + return 0; +} + +void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + if (need_resched()) { + spin_unlock(&ctl->tree_lock); + cond_resched(); + spin_lock(&ctl->tree_lock); + } + } +} + +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + spin_lock(&ctl->tree_lock); + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_cluster *cluster; + struct list_head *head; + + spin_lock(&ctl->tree_lock); + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&ctl->tree_lock); + cond_resched(); + spin_lock(&ctl->tree_lock); + } + } + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); + +} + +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; + u64 ret = 0; + + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search); + if (!entry) + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(ctl, entry, offset, bytes); + if (!entry->bytes) + free_bitmap(ctl, entry); + } else { + unlink_free_space(ctl, entry); + entry->offset += bytes; + entry->bytes -= bytes; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } + +out: + spin_unlock(&ctl->tree_lock); + + return ret; +} + +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. 
+ * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl; + int ret; + + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); + + ctl = block_group->free_space_ctl; + + /* now return any extents the cluster had on it */ + spin_lock(&ctl->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&ctl->tree_lock); + + /* finally drop our ref */ + btrfs_put_block_group(block_group); + return ret; +} + +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *entry, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(ctl, entry, &search_start, &search_bytes); + if (err) + return 0; + + ret = search_start; + bitmap_clear_bits(ctl, entry, ret, bytes); + + return ret; +} + +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + while(1) { + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + min_start); + if (ret == 0) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + } else { + + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + } + + if (entry->bytes == 0) + rb_erase(&entry->offset_index, &cluster->root); + break; + } +out: + spin_unlock(&cluster->lock); + + if (!ret) + return 0; + + spin_lock(&ctl->tree_lock); + + ctl->free_space -= bytes; + if (entry->bytes == 0) { + ctl->free_extents--; + if (entry->bitmap) { + kfree(entry->bitmap); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); + } + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + + return ret; +} + +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct 
btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + int ret; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(bytes, block_group->sectorsize); + total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -ENOSPC; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + BUG_ON(ret); + + return 0; +} + +/* + * This searches the block group for just extents to fill the cluster with. + */ +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *first = NULL; + struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *prev = NULL; + struct btrfs_free_space *last; + struct rb_node *node; + u64 window_start; + u64 window_free; + u64 max_extent; + u64 max_gap = 128 * 1024; + + entry = tree_search_offset(ctl, offset, 0, 1); + if (!entry) + return -ENOSPC; + + /* + * We don't want bitmaps, so just move along until we find a normal + * extent entry. + */ + while (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + window_start = entry->offset; + window_free = entry->bytes; + max_extent = entry->bytes; + first = entry; + last = entry; + prev = entry; + + while (window_free <= min_bytes) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + continue; + } + + /* + * we haven't filled the empty size and the window is + * very large. 
reset and try again + */ + if (entry->offset - (prev->offset + prev->bytes) > max_gap || + entry->offset - window_start > (min_bytes * 2)) { + first = entry; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + } else { + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + prev = entry; + } + + cluster->window_start = first->offset; + + node = &first->offset_index; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + */ + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (entry->bitmap) + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + BUG_ON(ret); + } while (node && entry != last); + + cluster->max_size = max_extent; + + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. + */ +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = -ENOSPC; + + if (ctl->total_bitmaps == 0) + return -ENOSPC; + + /* + * First check our cached list of bitmaps and see if there is an entry + * here that will work. + */ + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * If we do have entries on our list and we are here then we didn't find + * anything, so go ahead and get the next entry after the last entry in + * this list and start the search from there. + */ + if (!list_empty(bitmaps)) { + entry = list_entry(bitmaps->prev, struct btrfs_free_space, + list); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + goto search; + } + + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); + if (!entry) + return -ENOSPC; + +search: + node = &entry->offset_index; + do { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (!entry->bitmap) + continue; + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + } while (ret && node); + + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. 
+ * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct list_head bitmaps; + struct btrfs_free_space *entry, *tmp; + u64 min_bytes; + int ret; + + /* for metadata, allow allocates with more holes */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&ctl->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. + */ + if (ctl->free_space < min_bytes) { + spin_unlock(&ctl->tree_lock); + return -ENOSPC; + } + + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } + + INIT_LIST_HEAD(&bitmaps); + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes, min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); + + if (!ret) { + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } +out: + spin_unlock(&cluster->lock); + spin_unlock(&ctl->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root = RB_ROOT; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 bytes = 0; + u64 actually_trimmed; + int ret = 0; + + *trimmed = 0; + + while (start < end) { + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, start, 0, 1); + if (!entry) + entry = tree_search_offset(ctl, + offset_to_bitmap(ctl, start), + 1, 1); + + if (!entry || entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + if (entry->bitmap) { + ret = search_bitmap(ctl, entry, &start, &bytes); + if (!ret) { + if (start >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + bytes = min(bytes, end - start); + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } else { + start = entry->offset + BITS_PER_BITMAP * + block_group->sectorsize; + spin_unlock(&ctl->tree_lock); + ret = 0; + continue; + } + } else { + start = 
entry->offset; + bytes = min(entry->bytes, end - start); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + + if (bytes >= minlen) { + int update_ret; + update_ret = btrfs_update_reserved_bytes(block_group, + bytes, 1, 1); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, + bytes, + &actually_trimmed); + + btrfs_add_free_space(block_group, start, bytes); + if (!update_ret) + btrfs_update_reserved_bytes(block_group, + bytes, 0, 1); + + if (ret) + break; + *trimmed += actually_trimmed; + } + start += bytes; + bytes = 0; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + +/* + * Find the left-most item in the cache tree, and then return the + * smallest inode number in the item. + * + * Note: the returned inode number may not be the smallest one in + * the tree, if the left-most item is a bitmap. + */ +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +{ + struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; + struct btrfs_free_space *entry = NULL; + u64 ino = 0; + + spin_lock(&ctl->tree_lock); + + if (RB_EMPTY_ROOT(&ctl->free_space_offset)) + goto out; + + entry = rb_entry(rb_first(&ctl->free_space_offset), + struct btrfs_free_space, offset_index); + + if (!entry->bitmap) { + ino = entry->offset; + + unlink_free_space(ctl, entry); + entry->offset++; + entry->bytes--; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } else { + u64 offset = 0; + u64 count = 1; + int ret; + + ret = search_bitmap(ctl, entry, &offset, &count); + BUG_ON(ret); + + ino = offset; + bitmap_clear_bits(ctl, entry, offset, 1); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } +out: + spin_unlock(&ctl->tree_lock); + + return ino; +} + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct inode *inode = NULL; + + spin_lock(&root->cache_lock); + if (root->cache_inode) + inode = igrab(root->cache_inode); + spin_unlock(&root->cache_lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, 0); + if (IS_ERR(inode)) + return inode; + + spin_lock(&root->cache_lock); + if (!btrfs_fs_closing(root->fs_info)) + root->cache_inode = igrab(inode); + spin_unlock(&root->cache_lock); + + return inode; +} + +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + return __create_free_space_inode(root, trans, path, + BTRFS_FREE_INO_OBJECTID, 0); +} + +int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + int ret = 0; + u64 root_gen = btrfs_root_generation(&root->root_item); + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. 
+ */ + if (btrfs_fs_closing(fs_info)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode)) + goto out; + + if (root_gen != BTRFS_I(inode)->generation) + goto out_put; + + ret = __load_free_space_cache(root, inode, ctl, path, 0); + + if (ret < 0) + printk(KERN_ERR "btrfs: failed to load free ino cache for " + "root %llu\n", root->root_key.objectid); +out_put: + iput(inode); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct inode *inode; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + if (ret < 0) + printk(KERN_ERR "btrfs: failed to write free ino cache " + "for root %llu\n", root->root_key.objectid); + + iput(inode); + return ret; +} diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 00000000..8f2613f7 --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + +struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; + u64 free_space; + int extents_thresh; + int free_extents; + int total_bitmaps; + int unit; + u64 start; + struct btrfs_free_space_op *op; + void *private; +}; + +struct btrfs_free_space_op { + void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); + bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +}; + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path); +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode); +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path); +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path); +int load_free_ino_cache(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path); + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 bytenr, u64 size); +static inline int +btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->free_space_ctl, + bytenr, size); +} +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen); +#endif diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 00000000..db2ff977 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. 
All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __HASH__ +#define __HASH__ + +#include +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 00000000..baa74f3d --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod) +{ + struct btrfs_key key; + struct btrfs_inode_ref *ref; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + int ret; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!find_name_in_backref(path, name, name_len, &ref)) + return NULL; + return ref; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + if (ret == -EOVERFLOW) + ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root 
*root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 00000000..b4087e0f --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,545 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include "ctree.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "transaction.h" + +static int caching_kthread(void *data) +{ + struct btrfs_root *root = data; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u64 last = (u64)-1; + int slot; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Since the commit root is read-only, we can safely skip locking. */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 2; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; +again: + /* need to make sure the commit_root doesn't disappear */ + mutex_lock(&root->fs_commit_mutex); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + if (btrfs_fs_closing(fs_info)) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + if (btrfs_header_nritems(leaf) == 0) { + WARN_ON(1); + break; + } + + /* + * Save the key so we can advances forward + * in the next search. 
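[Editorial sketch, not part of the commit.] The caching thread here cannot keep the commit-root path held across a reschedule or a transaction commit, so it remembers the last key it visited, drops the path, and later re-searches from that key (the lines that follow). A minimal user-space sketch of the resume-by-key pattern, with a sorted array standing in for the tree; all names and numbers are illustrative, not btrfs APIs:

#include <stdio.h>

/* Toy "commit root": a sorted set of keys.  Re-searching by key rather
 * than remembering a slot index keeps the scan correct even if the
 * structure changed while we were not holding it. */
static const unsigned long keys[] = { 3, 7, 8, 15, 42, 99 };
static const int nkeys = sizeof(keys) / sizeof(keys[0]);

/* what a re-search does: first slot whose key is >= want */
static int search_slot(unsigned long want)
{
    int i;

    for (i = 0; i < nkeys; i++)
        if (keys[i] >= want)
            return i;
    return nkeys;
}

int main(void)
{
    unsigned long resume = 0;   /* saved key, like root->cache_progress */
    const int batch = 2;        /* pretend we must drop the path every 2 items */

    for (;;) {
        int slot = search_slot(resume);
        int done;

        for (done = 0; slot < nkeys && done < batch; slot++, done++) {
            printf("visit %lu\n", keys[slot]);
            resume = keys[slot] + 1;   /* where to pick up next time */
        }
        if (slot >= nkeys)
            break;
        /* here the kernel code unlocks, sleeps, then searches again */
    }
    return 0;
}

Resuming by key rather than by slot index is what makes it safe to release the path and take the mutex again later.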
+ */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(path); + root->cache_progress = last; + mutex_unlock(&root->fs_commit_mutex); + schedule_timeout(1); + goto again; + } else + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_INODE_ITEM_KEY) + goto next; + + if (key.objectid >= root->highest_objectid) + break; + + if (last != (u64)-1 && last + 1 != key.objectid) { + __btrfs_add_free_space(ctl, last + 1, + key.objectid - last - 1); + wake_up(&root->cache_wait); + } + + last = key.objectid; +next: + path->slots[0]++; + } + + if (last < root->highest_objectid - 1) { + __btrfs_add_free_space(ctl, last + 1, + root->highest_objectid - last - 1); + } + + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + + root->cache_progress = (u64)-1; + btrfs_unpin_free_ino(root); +out: + wake_up(&root->cache_wait); + mutex_unlock(&root->fs_commit_mutex); + + btrfs_free_path(path); + + return ret; +} + +static void start_caching(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct task_struct *tsk; + int ret; + u64 objectid; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_NO) { + spin_unlock(&root->cache_lock); + return; + } + + root->cached = BTRFS_CACHE_STARTED; + spin_unlock(&root->cache_lock); + + ret = load_free_ino_cache(root->fs_info, root); + if (ret == 1) { + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + return; + } + + /* + * It can be quite time-consuming to fill the cache by searching + * through the extent tree, and this can keep ino allocation path + * waiting. Therefore at start we quickly find out the highest + * inode number and we know we can use inode numbers which fall in + * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. + */ + ret = btrfs_find_free_objectid(root, &objectid); + if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { + __btrfs_add_free_space(ctl, objectid, + BTRFS_LAST_FREE_OBJECTID - objectid + 1); + } + + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + root->root_key.objectid); + BUG_ON(IS_ERR(tsk)); +} + +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) +{ + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + +again: + *objectid = btrfs_find_ino_for_alloc(root); + + if (*objectid != 0) + return 0; + + start_caching(root); + + wait_event(root->cache_wait, + root->cached == BTRFS_CACHE_FINISHED || + root->free_ino_ctl->free_space > 0); + + if (root->cached == BTRFS_CACHE_FINISHED && + root->free_ino_ctl->free_space == 0) + return -ENOSPC; + else + goto again; +} + +void btrfs_return_ino(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + +again: + if (root->cached == BTRFS_CACHE_FINISHED) { + __btrfs_add_free_space(ctl, objectid, 1); + } else { + /* + * If we are in the process of caching free ino chunks, + * to avoid adding the same inode number to the free_ino + * tree twice due to cross transaction, we'll leave it + * in the pinned tree until a transaction is committed + * or the caching work is done. 
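[Editorial sketch, not part of the commit.] caching_kthread turns the inode items it finds in the commit root into free ranges: it remembers the previous objectid and hands every gap, plus the tail up to the highest allocated inode number, to __btrfs_add_free_space. The same bookkeeping as a stand-alone program (the inode numbers below are made up):

#include <stdio.h>

/* In-use inode numbers, in the order a commit-root scan would return them. */
static const unsigned long long in_use[] = { 256, 257, 260, 300 };
static const unsigned long long highest = 305;   /* plays root->highest_objectid */

int main(void)
{
    unsigned long long last = (unsigned long long)-1;
    int i;

    for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); i++) {
        unsigned long long cur = in_use[i];

        if (last != (unsigned long long)-1 && last + 1 != cur)
            printf("free range [%llu, %llu]\n", last + 1, cur - 1);
        last = cur;
    }
    /* tail: everything between the last item and the highest objectid */
    if (last < highest - 1)
        printf("free range [%llu, %llu]\n", last + 1, highest - 1);
    return 0;
}

With the numbers above this prints [258, 259], [261, 299] and [301, 304], which is exactly what the kernel loop feeds into the free-ino cache.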
+ */ + + mutex_lock(&root->fs_commit_mutex); + spin_lock(&root->cache_lock); + if (root->cached == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->cache_lock); + mutex_unlock(&root->fs_commit_mutex); + goto again; + } + spin_unlock(&root->cache_lock); + + start_caching(root); + + if (objectid <= root->cache_progress || + objectid > root->highest_objectid) + __btrfs_add_free_space(ctl, objectid, 1); + else + __btrfs_add_free_space(pinned, objectid, 1); + + mutex_unlock(&root->fs_commit_mutex); + } +} + +/* + * When a transaction is committed, we'll move those inode numbers which + * are smaller than root->cache_progress from pinned tree to free_ino tree, + * and others will just be dropped, because the commit root we were + * searching has changed. + * + * Must be called with root->fs_commit_mutex held + */ +void btrfs_unpin_free_ino(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + struct btrfs_free_space *info; + struct rb_node *n; + u64 count; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + while (1) { + n = rb_first(rbroot); + if (!n) + break; + + info = rb_entry(n, struct btrfs_free_space, offset_index); + BUG_ON(info->bitmap); + + if (info->offset > root->cache_progress) + goto free; + else if (info->offset + info->bytes > root->cache_progress) + count = root->cache_progress - info->offset + 1; + else + count = info->bytes; + + __btrfs_add_free_space(ctl, info->offset, count); +free: + rb_erase(&info->offset_index, rbroot); + kfree(info); + } +} + +#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) + +/* + * The goal is to keep the memory used by the free_ino tree won't + * exceed the memory if we use bitmaps only. + */ +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int max_ino; + int max_bitmaps; + + n = rb_last(&ctl->free_space_offset); + if (!n) { + ctl->extents_thresh = INIT_THRESHOLD; + return; + } + info = rb_entry(n, struct btrfs_free_space, offset_index); + + /* + * Find the maximum inode number in the filesystem. Note we + * ignore the fact that this can be a bitmap, because we are + * not doing precise calculation. + */ + max_ino = info->bytes - 1; + + max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; + if (max_bitmaps <= ctl->total_bitmaps) { + ctl->extents_thresh = 0; + return; + } + + ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * + PAGE_CACHE_SIZE / sizeof(*info); +} + +/* + * We don't fall back to bitmap, if we are below the extents threshold + * or this chunk of inode numbers is a big one. + */ +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + if (ctl->free_extents < ctl->extents_thresh || + info->bytes > INODES_PER_BITMAP / 10) + return false; + + return true; +} + +static struct btrfs_free_space_op free_ino_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) +{ +} + +static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + /* + * We always use extents for two reasons: + * + * - The pinned tree is only used during the process of caching + * work. + * - Make code simpler. See btrfs_unpin_free_ino(). 
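[Editorial sketch, not part of the commit.] recalculate_thresholds and use_bitmap above control when the free-ino cache stops tracking each free range as a separate rb-tree entry and switches to bitmaps, one page covering 32768 inode numbers for 4K pages. The predicate restated on its own; the page size and the example values are assumptions for illustration, in the kernel the counts come from struct btrfs_free_space_ctl:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ             4096UL
#define INODES_PER_BITMAP   (PAGE_SZ * 8)   /* 32768 inode numbers per bitmap page */

/* Mirror of use_bitmap(): stay with extents while we are under the
 * threshold, or when the free chunk is large (a big chunk is cheaper
 * as a single [offset, bytes] entry than as bitmap bits). */
static bool use_bitmap(int free_extents, int extents_thresh,
                       unsigned long long chunk_len)
{
    if (free_extents < extents_thresh || chunk_len > INODES_PER_BITMAP / 10)
        return false;
    return true;
}

int main(void)
{
    printf("%d\n", use_bitmap(10, 256, 5));       /* 0: few extents, keep extents      */
    printf("%d\n", use_bitmap(300, 256, 50000));  /* 0: huge chunk, keep extents       */
    printf("%d\n", use_bitmap(300, 256, 100));    /* 1: fragmented, switch to a bitmap */
    return 0;
}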
+ */ + return false; +} + +static struct btrfs_free_space_op pinned_free_ino_op = { + .recalc_thresholds = pinned_recalc_thresholds, + .use_bitmap = pinned_use_bitmap, +}; + +void btrfs_init_free_ino_ctl(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = 1; + ctl->start = 0; + ctl->private = NULL; + ctl->op = &free_ino_op; + + /* + * Initially we allow to use 16K of ram to cache chunks of + * inode numbers before we resort to bitmaps. This is somewhat + * arbitrary, but it will be adjusted in runtime. + */ + ctl->extents_thresh = INIT_THRESHOLD; + + spin_lock_init(&pinned->tree_lock); + pinned->unit = 1; + pinned->start = 0; + pinned->private = NULL; + pinned->extents_thresh = 0; + pinned->op = &pinned_free_ino_op; +} + +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + u64 alloc_hint = 0; + int ret; + int prealloc; + bool retry = false; + + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) + return 0; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retry); + retry = true; + + ret = create_free_ino_inode(root, trans, path); + if (ret) + goto out; + goto again; + } + + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + if (ret) + goto out_put; + } + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_FINISHED) { + ret = -1; + spin_unlock(&root->cache_lock); + goto out_put; + } + spin_unlock(&root->cache_lock); + + spin_lock(&ctl->tree_lock); + prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; + prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + spin_unlock(&ctl->tree_lock); + + /* Just to make sure we have enough space */ + prealloc += 8 * PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, prealloc); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, + prealloc, prealloc, &alloc_hint); + if (ret) + goto out_put; + btrfs_free_reserved_data_space(inode, prealloc); + +out_put: + iput(inode); +out: + if (ret == 0) + ret = btrfs_write_out_ino_cache(root, trans, path); + + btrfs_free_path(path); + return ret; +} + +static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 
0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) + goto out; + } + + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + ret = -ENOSPC; + goto out; + } + + *objectid = ++root->highest_objectid; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h new file mode 100644 index 00000000..ddb347bf --- /dev/null +++ b/fs/btrfs/inode-map.h @@ -0,0 +1,13 @@ +#ifndef __BTRFS_INODE_MAP +#define __BTRFS_INODE_MAP + +void btrfs_init_free_ino_ctl(struct btrfs_root *root); +void btrfs_unpin_free_ino(struct btrfs_root *root); +void btrfs_return_ino(struct btrfs_root *root, u64 objectid); +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans); + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 00000000..d42e6bfd --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,7459 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "compression.h" +#include "locking.h" +#include "free-space-cache.h" +#include "inode-map.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_dir_ro_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct address_space_operations btrfs_symlink_aops; +static const struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = btrfs_init_acl(trans, inode, dir); + if (!err) + err = btrfs_xattr_security_init(trans, inode, dir, qstr); + return err; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. 
The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + int compress_type, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + + if (compressed_size && compressed_pages) + cur_size = compressed_size; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (compress_type != BTRFS_COMPRESS_NONE) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap_atomic(cpage, KM_USER0); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr, KM_USER0); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + compress_type); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. 
+ */ +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, int compress_type, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, inode, start, aligned_end, + &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compress_type, compressed_pages); + BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + int compress_type; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages, + int compress_type) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. 
+ */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + int compress_type = root->fs_info->compress_type; + + /* if this is a small write inside eof, kick off a defragbot */ + if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024) + btrfs_add_inode_defrag(NULL, inode); + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. 
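[Editorial aside, not part of the commit.] This function, and cow_file_range further down, rounds the length of the inclusive [start, end] range up to the block size with the expression (end - start + blocksize) & ~(blocksize - 1). For a power-of-two block size that is an ordinary round-up; a quick stand-alone check:

#include <stdio.h>

static unsigned long long roundup_len(unsigned long long start,
                                      unsigned long long end,   /* inclusive */
                                      unsigned long long blocksize)
{
    /* same expression the btrfs code uses; blocksize must be a power of two */
    return (end - start + blocksize) & ~(blocksize - 1);
}

int main(void)
{
    printf("%llu\n", roundup_len(0, 4999, 4096));    /* 8192: 5000 bytes need two blocks */
    printf("%llu\n", roundup_len(0, 4095, 4096));    /* 4096: exactly one block          */
    printf("%llu\n", roundup_len(8192, 8192, 4096)); /* 4096: one byte still costs a block */
    return 0;
}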
+ */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && + (btrfs_test_opt(root, COMPRESS) || + (BTRFS_I(inode)->force_compress) || + (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + BUG_ON(!pages); + + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, + compress_type, pages); + } + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + + btrfs_end_transaction(trans, root); + goto free_pages_out; + } + btrfs_end_transaction(trans, root); + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + if (!btrfs_test_opt(root, FORCE_COMPRESS) && + !(BTRFS_I(inode)->force_compress)) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. 
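[Editorial sketch, not part of the commit.] The will_compress check just above only keeps the compressed copy if it is still smaller after both sizes are rounded to the granularity they would actually occupy: the compressed bytes up to the block size, the plain bytes up to whole pages. The same comparison with concrete numbers, assuming 4K blocks and pages:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ 4096ULL

static bool compression_wins(unsigned long long total_in,          /* bytes read      */
                             unsigned long long total_compressed,  /* bytes produced  */
                             unsigned long long blocksize)
{
    total_compressed = (total_compressed + blocksize - 1) & ~(blocksize - 1);
    total_in = (total_in + PAGE_SZ - 1) & ~(PAGE_SZ - 1);
    return total_compressed < total_in;   /* kernel bails out when >= */
}

int main(void)
{
    /* 16K that compressed to 13K: 13K rounds to 16K, no win, write it plain */
    printf("%d\n", compression_wins(16384, 13312, 4096));  /* 0 */
    /* 16K down to 6K: 8K on disk beats 16K, keep the compressed copy */
    printf("%d\n", compression_wins(16384, 6144, 4096));   /* 1 */
    return 0;
}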
+ */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { +cleanup_and_bail_uncompressed: + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret = 0; + + if (list_empty(&async_cow->extents)) + return 0; + + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + +retry: + /* did the compression code fall back to uncompressed IO? */ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. 
+ */ + if (!page_started && !ret) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + btrfs_end_transaction(trans, root); + + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + goto retry; + } + + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + em = alloc_extent_map(); + BUG_ON(!em); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); + BUG_ON(ret); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + return 0; +} + +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. 
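[Editorial sketch, not part of the commit.] submit_compressed_extents just above, and cow_file_range and run_delalloc_nocow further down, all insert new mappings with the same retry idiom: call add_extent_mapping, and on -EEXIST drop the stale cached range and try again. The shape of that loop with a one-slot stand-in for the extent map tree; everything here is illustrative, not the btrfs structures:

#include <errno.h>
#include <stdio.h>

struct range { unsigned long long start, len; int valid; };

static struct range cache;                 /* stands in for the extent_map tree */

static int add_mapping(const struct range *em)
{
    if (cache.valid &&
        em->start < cache.start + cache.len &&
        cache.start < em->start + em->len)
        return -EEXIST;                    /* overlaps an already-cached mapping */
    cache = *em;
    return 0;
}

static void drop_cached_range(unsigned long long start, unsigned long long len)
{
    if (cache.valid &&
        start < cache.start + cache.len &&
        cache.start < start + len)
        cache.valid = 0;                   /* evict the stale overlapping entry */
}

int main(void)
{
    struct range stale = { 0, 8192, 1 };   /* left over from an earlier write  */
    struct range fresh = { 4096, 4096, 1 };

    cache = stale;
    while (add_mapping(&fresh) == -EEXIST)
        drop_cached_range(fresh.start, fresh.len);

    printf("cached [%llu, +%llu)\n", cache.start, cache.len);
    return 0;
}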
+ */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + +static inline bool is_free_space_inode(struct btrfs_root *root, + struct inode *inode) +{ + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. + */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + BUG_ON(is_free_space_inode(root, inode)); + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + /* if this is a small write inside eof, kick off defrag */ + if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024) + btrfs_add_inode_defrag(trans, inode); + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + unsigned long op; + + cur_alloc_size = disk_num_bytes; + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(); + BUG_ON(!em); + em->start = start; + em->orig_start = em->start; + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + 
write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage + */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + BUG_ON(!async_cow); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + 
async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list, 0); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 extent_offset; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + bool nolock; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nolock = is_free_space_inode(root, inode); + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + extent_type = 0; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == 
BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, ino, + found_key.offset - + extent_offset, disk_bytenr)) + goto out_check; + disk_bytenr += extent_offset; + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + BUG_ON(!em); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + BUG_ON(ret); + } + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + if (nolock) { + ret = btrfs_end_transaction_nolock(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + } + btrfs_free_path(path); + return 0; +} + +/* + * 
extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + return ret; +} + +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + /* not delalloc, ignore it */ + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + return 0; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. 
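[Editorial sketch, not part of the commit.] run_delalloc_range above picks one of three write-back strategies per inode from its flags and the mount options. The same decision as a small stand-alone helper, with the flag spellings shortened and the values purely illustrative:

#include <stdbool.h>
#include <stdio.h>

enum path { NOCOW_FORCED, NOCOW_PREALLOC, PLAIN_COW, ASYNC_COMPRESSED_COW };

static enum path pick_delalloc_path(bool nodatacow, bool prealloc,
                                    bool mount_compress, bool force_compress,
                                    bool inode_compress)
{
    if (nodatacow)
        return NOCOW_FORCED;         /* run_delalloc_nocow(..., force = 1, ...) */
    if (prealloc)
        return NOCOW_PREALLOC;       /* run_delalloc_nocow(..., force = 0, ...) */
    if (!mount_compress && !force_compress && !inode_compress)
        return PLAIN_COW;            /* cow_file_range()                        */
    return ASYNC_COMPRESSED_COW;     /* cow_file_range_async()                  */
}

int main(void)
{
    /* mount -o compress, ordinary inode: take the async compression path */
    printf("%d\n", pick_delalloc_path(false, false, true, false, false)); /* 3 */
    /* NODATACOW wins over everything else */
    printf("%d\n", pick_delalloc_path(true,  false, true, false, false)); /* 0 */
    return 0;
}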
+ */ +static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) +{ + + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !is_free_space_inode(root, inode); + + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; + if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !is_free_space_inode(root, inode); + + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && do_list) + btrfs_free_reserved_data_space(inode, len); + + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return ret; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. 
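[Editorial sketch, not part of the commit.] btrfs_merge_bio_hook above asks btrfs_map_block how many contiguous bytes the bio's logical address can cover and refuses to grow the bio past that. A user-space sketch of the comparison, under the simplifying assumption (for illustration only) that the mapped length is just the distance to the next stripe boundary:

#include <stdbool.h>
#include <stdio.h>

/* Would adding "size" more bytes make this bio span a stripe boundary?
 * stripe_len must be a power of two for the rounding below. */
static bool must_break_bio(unsigned long long logical,    /* bio start, in bytes     */
                           unsigned long long bio_size,   /* bytes already in the bio */
                           unsigned long long size,       /* bytes we want to add     */
                           unsigned long long stripe_len)
{
    /* distance from the bio start to the end of its stripe */
    unsigned long long map_length =
        (logical | (stripe_len - 1)) + 1 - logical;

    return map_length < bio_size + size;   /* same comparison the hook makes */
}

int main(void)
{
    /* 64K stripes: a bio at 60K already holding 4K may not grow by another 4K */
    printf("%d\n", must_break_bio(61440, 4096, 4096, 65536));  /* 1 */
    printf("%d\n", must_break_bio(0,     4096, 4096, 65536));  /* 0 */
    return 0;
}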
+ * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (is_free_space_inode(root, inode)) + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); + else + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & REQ_WRITE)) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + return ret; + } + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, bio_offset, + __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. 
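+ * (Each entry on the ordered extent's sum list is simply written into
+ * the csum root with btrfs_csum_file_blocks(); nothing is recomputed
+ * at completion time.)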
+ */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct btrfs_ordered_sum *sum; + + list_for_each_entry(sum, list, list) { + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + cached_state, GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + &cached_state, GFP_NOFS); + + /* already ordered? We're done */ + if (PagePrivate2(page)) + goto out; + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, + page_end, &cached_state, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + BUG(); + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); + ClearPageChecked(page); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + &cached_state, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); + kfree(fixup); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. 
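+ *
+ * Rough sequence, restating btrfs_writepage_start_hook() below and the
+ * fixup worker above:
+ *
+ *   btrfs_writepage_start_hook()
+ *       page already covered by an ordered extent -> write it normally
+ *       otherwise SetPageChecked(), take a page reference, queue the
+ *       fixup worker and return -EAGAIN so writepage skips the page
+ *
+ *   btrfs_writepage_fixup_worker()
+ *       lock the page and the extent range, wait out any ordered
+ *       extent, then btrfs_set_extent_delalloc() so the page is
+ *       properly accounted before it is written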
+ */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ + ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + &hint, 0); + BUG_ON(ret); + + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + btrfs_ino(inode), file_pos, &ins); + BUG_ON(ret); + btrfs_free_path(path); + + return 0; +} + +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. 
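+ *
+ * In outline (restating btrfs_finish_ordered_io below):
+ *
+ *   1. btrfs_dec_test_ordered_pending() - bail out unless this range
+ *      completes the whole ordered extent
+ *   2. NOCOW ordered extents only need the i_size/inode item update
+ *   3. otherwise join a transaction and either mark the preallocated
+ *      extent as written or insert_reserved_file_extent() for the
+ *      newly allocated one
+ *   4. add_pending_csums() records the checksums gathered at submit time
+ *   5. update the inode item and drop the ordered extent references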
+ */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + int compress_type = 0; + int ret; + bool nolock; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1); + if (!ret) + return 0; + BUG_ON(!ordered_extent); + + nolock = is_free_space_inode(root, inode); + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + goto out; + } + + lock_extent_bits(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + 0, &cached_state, GFP_NOFS); + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compress_type = ordered_extent->compress_type; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compress_type); + ret = btrfs_mark_extent_written(trans, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + BUG_ON(root == root->fs_info->tree_root); + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compress_type, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); + BUG_ON(ret); + } + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); + + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + ret = 0; +out: + if (nolock) { + if (trans) + btrfs_end_transaction_nolock(trans, root); + } else { + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); + } + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); + + ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. 
If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (IS_ERR_OR_NULL(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & REQ_WRITE) + rw = WRITE; + else + rw = READ; + + ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags, 0); + return ret; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + 
failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + goto good; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " + "private %llu\n", + (unsigned long long)btrfs_ino(page->mapping->host), + (unsigned long long)start, csum, + (unsigned long long)private); + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&root->fs_info->cleanup_work_sem); + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + 
up_read(&root->fs_info->cleanup_work_sem); +} + +/* + * calculate extra metadata reservation when snapshotting a subvolume + * contains orphan files. + */ +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + + root = pending->root; + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + block_rsv = root->orphan_block_rsv; + + /* orphan block reservation for the snapshot */ + num_bytes = block_rsv->size; + + /* + * after the snapshot is created, COWing tree blocks may use more + * space than it frees. So we should make sure there is enough + * reserved space. + */ + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes += block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + } + + *bytes_to_reserve += num_bytes; +} + +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *snap = pending->snap; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + int ret; + + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + /* refill source subvolume's orphan block reservation */ + block_rsv = root->orphan_block_rsv; + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes = block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + root->orphan_block_rsv, + num_bytes); + BUG_ON(ret); + } + + /* setup orphan block reservation for the snapshot */ + block_rsv = btrfs_alloc_block_rsv(snap); + BUG_ON(!block_rsv); + + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + snap->orphan_block_rsv = block_rsv; + + num_bytes = root->orphan_block_rsv->size; + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + block_rsv, num_bytes); + BUG_ON(ret); + +#if 0 + /* insert orphan item for the snapshot */ + WARN_ON(!root->orphan_item_inserted); + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + snap->root_key.objectid); + BUG_ON(ret); + snap->orphan_item_inserted = 1; +#endif +} + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* + * This is called in transaction commmit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. + */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + if (!list_empty(&root->orphan_list) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + if (root->orphan_item_inserted && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + root->orphan_item_inserted = 0; + } + + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + } +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. 
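+ *
+ * In short (descriptive summary of btrfs_orphan_add/btrfs_orphan_del
+ * below):
+ *
+ *   add: put the inode on root->orphan_list, reserve metadata for the
+ *        orphan item and insert an orphan item recording the inode
+ *        number, so an interrupted unlink/truncate can be cleaned up
+ *        later by btrfs_orphan_cleanup()
+ *   del: once the truncate/delete is done, drop the list entry, delete
+ *        the orphan item and release the reservation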
+ */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; + + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root); + BUG_ON(!block_rsv); + } + + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. + */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + } + + if (!BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 1; + reserve = 1; + } + spin_unlock(&root->orphan_lock); + + if (block_rsv) + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); + } + + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); + BUG_ON(ret); + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + return 0; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; + int ret = 0; + + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + list_del_init(&BTRFS_I(inode)->i_orphan); + delete_item = 1; + } + + if (BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 0; + release_rsv = 1; + } + spin_unlock(&root->orphan_lock); + + if (trans && delete_item) { + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + BUG_ON(ret); + } + + if (release_rsv) + btrfs_orphan_release_metadata(inode); + + return 0; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. 
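+ *
+ * Rough shape of the scan (restating the function below):
+ *
+ *   search the tree for (BTRFS_ORPHAN_OBJECTID, ORPHAN_ITEM_KEY, *);
+ *   for each item the key offset holds the inode number, so iget the
+ *   inode, put it back on root->orphan_list, and then
+ *       - bad inode:    delete the stale orphan item
+ *       - i_nlink > 0:  finish the interrupted truncate
+ *       - i_nlink == 0: iput() and let eviction delete the file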
+ */ +int btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didn't + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); + iput(inode); + continue; + } + nr_truncate++; + ret = btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + if (ret) + goto out; + } + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || root->orphan_item_inserted) { + trans = btrfs_join_transaction(root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + +out: + if (ret) + printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_free_path(path); + return ret; +} + +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. 
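+ *
+ * Items belonging to one inode sit together in key order inside a leaf
+ * (inode item, inode backrefs, xattrs, then extents), so a short
+ * forward scan that stops at the first key past the xattr range is
+ * enough to decide this.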
+ * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* + * read an inode from the btree into the in-memory inode + */ +static void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + int maybe_acls; + u32 rdev; + int ret; + bool filled = false; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->leave_spinning = 1; + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) + goto make_bad; + + leaf = path->nodes[0]; + + if (filled) + goto cache_acl; + + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); +cache_acl: + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = 
acls_after_inode_item(leaf, path->slots[0], + btrfs_ino(inode)); + if (!maybe_acls) + cache_no_acl(inode); + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + btrfs_free_path(path); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + + btrfs_update_iflags(inode); + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, 0); + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. 
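+ * Such inodes are therefore updated in place below, by looking the
+ * inode item up with btrfs_lookup_inode() and rewriting it through
+ * fill_inode_item() on the leaf.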
+ * + * The data relocation inode should also be directly updated + * without delay + */ + if (!is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, + 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + path->leave_spinning = 1; + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, + dir_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %llu parent %llu\n", name_len, name, + (unsigned long long)ino, (unsigned long long)dir_ino); + goto err; + } + + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + if (ret) + goto err; + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + if (ret == -ENOENT) + ret = 0; +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); +out: + return ret; +} + +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + int ret; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} + + +/* helper to check if there is any shared block in the path */ +static int check_path_shared(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *eb; + int level; + u64 refs = 1; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + int ret; + + if (!path->nodes[level]) + break; + eb = path->nodes[level]; + if (!btrfs_block_can_be_shared(root, eb)) + continue; 
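+ /*
+ * a reference count above one means this tree block is shared
+ * with another root (a snapshot, for example), which is exactly
+ * what the callers of check_path_shared() need to know about
+ */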
+ ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + &refs, NULL); + if (refs > 1) + return 1; + } + return 0; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space. + * so in enospc case, we should make sure they will free space before + * allowing them to use the global metadata reservation. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode = dentry->d_inode; + u64 index; + int check_link = 1; + int err = -ENOSPC; + int ret; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); + + trans = btrfs_start_transaction(root, 10); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; + + if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return ERR_PTR(-ENOSPC); + + /* check if there is someone else holds reference */ + if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) + return ERR_PTR(-ENOSPC); + + if (atomic_read(&inode->i_count) > 2) + return ERR_PTR(-ENOSPC); + + if (xchg(&root->fs_info->enospc_unlink, 1)) + return ERR_PTR(-ENOSPC); + + path = btrfs_alloc_path(); + if (!path) { + root->fs_info->enospc_unlink = 0; + return ERR_PTR(-ENOMEM); + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + root->fs_info->enospc_unlink = 0; + return trans; + } + + path->skip_locking = 1; + path->search_commit_root = 1; + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(dir)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(path); + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(path); + + if (ret == 0 && S_ISREG(inode->i_mode)) { + ret = btrfs_lookup_file_extent(trans, root, path, + ino, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (check_path_shared(root, path)) + goto out; + btrfs_release_path(path); + } + + if (!check_link) { + err = 0; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + if (di) { + if (check_path_shared(root, path)) + goto out; + } else { + err = 0; + goto out; + } + btrfs_release_path(path); + + ref = btrfs_lookup_inode_ref(trans, root, path, + dentry->d_name.name, dentry->d_name.len, + ino, dir_ino, 0); + if (IS_ERR(ref)) { + err = PTR_ERR(ref); + goto out; + } + BUG_ON(!ref); + if (check_path_shared(root, path)) + goto out; + index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(path); + + /* + * This is a commit root search, if we can lookup inode item and other + * relative items in the commit root, it means the transaction of + * dir/file creation has been committed, and the dir index item that we + * delay to insert has also been inserted into the commit root. So + * we needn't worry about the delayed insertion of the dir index item + * here. 
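+ *
+ * To recap the ENOSPC path: every item this unlink will touch - the
+ * directory inode, the inode itself, its file extents, the dir item,
+ * the inode ref and, below, the dir index - is looked up in the commit
+ * root, and check_path_shared() must see only unshared blocks before
+ * the global block reservation is handed to the transaction.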
+ */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + BUG_ON(ret == -ENOENT); + if (check_path_shared(root, path)) + goto out; + + err = 0; +out: + btrfs_free_path(path); + if (err) { + btrfs_end_transaction(trans, root); + root->fs_info->enospc_unlink = 0; + return ERR_PTR(err); + } + + trans->block_rsv = &root->fs_info->global_block_rsv; + return trans; +} + +static void __unlink_end_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (trans->block_rsv == &root->fs_info->global_block_rsv) { + BUG_ON(!root->fs_info->enospc_unlink); + root->fs_info->enospc_unlink = 0; + } + btrfs_end_transaction_throttle(trans, root); +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); + + if (inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + } + + nr = trans->blocks_used; + __unlink_end_trans(trans, root); + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + name, name_len, -1); + BUG_ON(IS_ERR_OR_NULL(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir_ino, &index, name, name_len); + if (ret < 0) { + BUG_ON(ret != -ENOENT); + di = btrfs_search_dir_index_item(root, path, dir_ino, + name, name_len); + BUG_ON(IS_ERR_OR_NULL(di)); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + index = key.offset; + } + btrfs_release_path(path); + + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + BUG_ON(ret); + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, root, dir, + 
BTRFS_I(inode)->location.objectid, + dentry->d_name.name, + dentry->d_name.len); + goto out; + } + + err = btrfs_orphan_add(trans, inode); + if (err) + goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +out: + nr = trans->blocks_used; + __unlink_end_trans(trans, root); + btrfs_btree_balance_dirty(root, nr); + + return err; +} + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. + */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u64 mask = root->sectorsize - 1; + u32 found_type = (u8)-1; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + int ret; + int err = 0; + u64 ino = btrfs_ino(inode); + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + + if (root->ref_cows || root == root->fs_info->tree_root) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + + /* + * This function is also used to drop the items in the log tree before + * we relog the inode, so if root != BTRFS_I(inode)->root, it means + * it is used to drop the loged items. So we shouldn't kill the delayed + * items. 
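+ *
+ * Shape of the main loop below, for reference:
+ *
+ *   start a search at key (ino, (u8)-1, (u64)-1) and walk backwards;
+ *   items entirely beyond new_size are queued for deletion, contiguous
+ *   slots being batched through pending_del_slot/pending_del_nr and
+ *   removed with a single btrfs_del_items() call, while the extent
+ *   item that straddles new_size is shrunk in place instead.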
+ */ + if (min_type == 0 && root == BTRFS_I(inode)->root) + btrfs_kill_delayed_inode_items(inode); + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->reada = -1; + + key.objectid = ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + found_extent = 0; + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the 
pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent && (root->ref_cows || + root == root->fs_info->tree_root)) { + btrfs_set_path_blocking(path); + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, 0, + btrfs_header_owner(leaf), + ino, extent_offset); + BUG_ON(ret); + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot) { + if (root->ref_cows && + BTRFS_I(inode)->location.objectid != + BTRFS_FREE_INO_OBJECTID) { + err = -EAGAIN; + goto out; + } + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + } + btrfs_release_path(path); + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + BUG_ON(ret); + } + btrfs_free_path(path); + return err; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. + */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) { + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + goto out; + } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + 0, 0, &cached_state, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); + if (ret) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + goto out_unlock; + } + + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, + GFP_NOFS); + +out_unlock: + if (ret) + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + unlock_page(page); + 
page_cache_release(page); +out: + return ret; +} + +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + u64 mask = root->sectorsize - 1; + u64 hole_start = (oldsize + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err = 0; + + if (size <= hole_start) + return 0; + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + &cached_state, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent_cached(io_tree, hole_start, block_end - 1, + &cached_state, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR_OR_NULL(em)); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + + err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, + &hint_byte, 1); + if (err) + break; + + err = btrfs_insert_file_extent(trans, root, + btrfs_ino(inode), cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + if (err) + break; + + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + + btrfs_end_transaction(trans, root); + } + free_extent_map(em); + em = NULL; + cur_offset = last_byte; + if (cur_offset >= block_end) + break; + } + + free_extent_map(em); + unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, + GFP_NOFS); + return err; +} + +static int btrfs_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize = i_size_read(inode); + int ret; + + if (newsize == oldsize) + return 0; + + if (newsize > oldsize) { + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + truncate_pagecache(inode, oldsize, newsize); + ret = btrfs_cont_expand(inode, oldsize, newsize); + if (ret) { + btrfs_setsize(inode, oldsize); + return ret; + } + + mark_inode_dirty(inode); + } else { + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. 
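+ * That is what the ordered_data_close flag set just below requests;
+ * the shrink itself is then done by truncate_setsize() followed by
+ * btrfs_truncate().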
+ */ + if (newsize == 0) + BTRFS_I(inode)->ordered_data_close = 1; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); + ret = btrfs_truncate(inode); + } + + return ret; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int err; + + if (btrfs_root_readonly(root)) + return -EROFS; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + err = btrfs_setsize(inode, attr->ia_size); + if (err) + return err; + } + + if (attr->ia_valid) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) + err = btrfs_acl_chmod(inode); + } + + return err; +} + +void btrfs_evict_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + trace_btrfs_inode_evict(inode); + + truncate_inode_pages(&inode->i_data, 0); + if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || + is_free_space_inode(root, inode))) + goto no_delete; + + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + if (root->fs_info->log_root_recovering) { + BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + goto no_delete; + } + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0); + goto no_delete; + } + + btrfs_i_size_write(inode, 0); + + while (1) { + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = root->orphan_block_rsv; + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + if (ret != -EAGAIN) + break; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + + } + + if (ret == 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + if (!(root == root->fs_info->tree_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) + btrfs_return_ino(root, btrfs_ino(inode)); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + end_writeback(inode); + return; +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. 
+ */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (IS_ERR_OR_NULL(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. + */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) +{ + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + err = -ENOENT; + ret = btrfs_find_root_ref(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + goto out; + + ret = memcmp_extent_buffer(leaf, dentry->d_name.name, + (unsigned long)(ref + 1), + dentry->d_name.len); + if (ret) + goto out; + + btrfs_release_path(path); + + new_root = btrfs_read_fs_root_no_name(root->fs_info, location); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + if (btrfs_root_refs(&new_root->root_item) == 0) { + err = -ENOENT; + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + return err; +} + +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; + u64 ino = btrfs_ino(inode); +again: + p = &root->inode_tree.rb_node; + parent = NULL; + + if (inode_unhashed(inode)) + return; + + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (ino < btrfs_ino(&entry->vfs_inode)) + p = &parent->rb_left; + else if (ino > btrfs_ino(&entry->vfs_inode)) + p = &parent->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING))); + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; + } + } + rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); + rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + empty = 
RB_EMPTY_ROOT(&root->inode_tree); + } + spin_unlock(&root->inode_lock); + + /* + * Free space cache has inodes in the tree root, but the tree root has a + * root_refs of 0, so this could end up dropping the tree root as a + * snapshot, so we need the extra !root->fs_info->tree_root check to + * make sure we don't drop it. + */ + if (empty && btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) { + synchronize_srcu(&root->fs_info->subvol_srcu); + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + +int btrfs_invalidate_inodes(struct btrfs_root *root) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(&entry->vfs_inode)) + node = node->rb_left; + else if (objectid > btrfs_ino(&entry->vfs_inode)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(&entry->vfs_inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = btrfs_ino(&entry->vfs_inode) + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will have it removed from + * the inode cache when its usage count + * hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return 0; +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == btrfs_ino(inode) && + args->root == BTRFS_I(inode)->root; +} + +static struct inode *btrfs_iget_locked(struct super_block *s, + u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. 
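The invalidation walk in btrfs_invalidate_inodes() above descends the per-root red-black tree to the first inode whose number is not below the resume point. The same ordering search expressed over a plain sorted array, as a stand-alone sketch rather than the rb-tree code itself:

/* Index of the first element >= objectid in a sorted array of inode numbers,
 * or n when every element is smaller (the walk then stops). */
static int first_at_or_after(const unsigned long long *inos, int n,
                             unsigned long long objectid)
{
        int lo = 0, hi = n;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (inos[mid] < objectid)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return lo;
}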
+ * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } + + return inode; +} + +static struct inode *new_simple_dir(struct super_block *s, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->dummy_inode = 1; + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int index; + int ret; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + if (location.objectid == 0) + return NULL; + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root, NULL); + return inode; + } + + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); + + index = srcu_read_lock(&root->fs_info->subvol_srcu); + ret = fixup_tree_root_location(root, dir, dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir->i_sb, &location, sub_root); + } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); + } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + + if (!IS_ERR(inode) && root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + ret = btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + if (ret) + inode = ERR_PTR(ret); + } + + return inode; +} + +static int btrfs_dentry_delete(const struct dentry *dentry) +{ + struct btrfs_root *root; + + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; + + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } + return 0; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct list_head ins_list; + struct 
list_head del_list; + int ret; + struct extent_buffer *leaf; + int slot; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + int is_curr = 0; /* filp->f_pos points to the current index? */ + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." */ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + filp->f_pos, btrfs_ino(inode), DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + filp->f_pos, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + + if (key_type == BTRFS_DIR_INDEX_KEY) { + INIT_LIST_HEAD(&ins_list); + INIT_LIST_HEAD(&del_list); + btrfs_get_delayed_items(inode, &ins_list, &del_list); + } + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = btrfs_ino(inode); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; + } + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + goto next; + if (key_type == BTRFS_DIR_INDEX_KEY && + btrfs_should_delete_dir_index(&del_list, + found_key.offset)) + goto next; + + filp->f_pos = found_key.offset; + is_curr = 1; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + if (verify_dir_item(root, leaf, di)) + break; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } +next: + path->slots[0]++; + } + + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (is_curr) + filp->f_pos++; + ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, + &ins_list); + if (ret) + goto nopos; + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + /* + * 32-bit glibc will use getdents64, but then strtol - + * so the last number we can serve is this. 
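The readdir code here follows a small f_pos protocol: position 0 serves ".", position 1 serves "..", real directory index items begin at offset 2, and 0x7fffffff marks end of directory for 32-bit getdents users. A toy emitter over a pre-sorted offset list (hypothetical fill_fn type, not the kernel filldir API) could read:

typedef int (*fill_fn)(const char *name, unsigned long long pos);

/* Emit ".", "..", then every index item at or past *f_pos; offsets[] is
 * assumed sorted ascending.  A non-zero return from 'fill' means the
 * caller's buffer is full and we stop where we are, so the next call
 * resumes at the same position. */
static void emit_dir(fill_fn fill, unsigned long long *f_pos,
                     const char * const *names,
                     const unsigned long long *offsets, int n)
{
        int i;

        if (*f_pos == 0) {
                if (fill(".", 0))
                        return;
                *f_pos = 1;
        }
        if (*f_pos == 1) {
                if (fill("..", 1))
                        return;
                *f_pos = 2;
        }
        for (i = 0; i < n; i++) {
                if (offsets[i] < *f_pos)
                        continue;
                *f_pos = offsets[i];
                if (fill(names[i], offsets[i]))
                        return;
        }
        *f_pos = 0x7fffffff;    /* past the last index a 32-bit getdents user can ask for */
}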
+ */ + filp->f_pos = 0x7fffffff; + else + filp->f_pos++; +nopos: + ret = 0; +err: + if (key_type == BTRFS_DIR_INDEX_KEY) + btrfs_put_delayed_items(&ins_list, &del_list); + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + bool nolock = false; + + if (BTRFS_I(inode)->dummy_inode) + return 0; + + if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) + nolock = true; + + if (wbc->sync_mode == WB_SYNC_ALL) { + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (nolock) + ret = btrfs_end_transaction_nolock(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +void btrfs_dirty_inode(struct inode *inode, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + if (BTRFS_I(inode)->dummy_inode) + return; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + printk_ratelimited(KERN_ERR "btrfs: fail to " + "dirty inode %llu error %ld\n", + (unsigned long long)btrfs_ino(inode), + PTR_ERR(trans)); + return; + } + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + printk_ratelimited(KERN_ERR "btrfs: fail to " + "dirty inode %llu error %d\n", + (unsigned long long)btrfs_ino(inode), + ret); + } + } + btrfs_end_transaction(trans, root); + if (BTRFS_I(inode)->delayed_node) + btrfs_balance_delayed_items(root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, int mode, + u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) { + btrfs_free_path(path); + return ERR_PTR(-ENOMEM); + } + + /* + * we have to initialize this early, so we can reclaim the inode + * number if we fail afterwards in this function. + */ + inode->i_ino = objectid; + + if (dir) { + trace_btrfs_inode_request(dir); + + ret = btrfs_set_inode_index(dir, index); + if (ret) { + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); + } + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; + btrfs_set_inode_space_info(root, inode); + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + inode_init_owner(inode, dir, mode); + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; 
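btrfs_set_inode_index_count() above boils down to "one past the highest existing DIR_INDEX offset, or 2 for a directory that only holds '.' and '..'". As a sketch over a plain list of offsets rather than the btree search itself:

/* Next free directory index for a new entry. */
static unsigned long long next_dir_index(const unsigned long long *offsets, int n)
{
        unsigned long long highest = 0;
        int i;

        if (n == 0)
                return 2;               /* "." and ".." occupy positions 0 and 1 */

        for (i = 0; i < n; i++)
                if (offsets[i] > highest)
                        highest = offsets[i];

        return highest + 1;
}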
+ location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + btrfs_inherit_iflags(inode, dir); + + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(root, NODATACOW) || + (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + } + + insert_inode_hash(inode); + inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); + + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + u64 ino = btrfs_ino(inode); + u64 parent_ino = btrfs_ino(parent_inode); + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + } else { + key.objectid = ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + } + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, index, name, name_len); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, + parent_ino, index); + } + + if (ret == 0) { + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + BUG_ON(ret); + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) +{ + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + err = 
btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int drop_inode = 0; + int err; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EXDEV; + + if (inode->i_nlink == ~0U) + return -EMLINK; + + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + /* + * 2 items for inode and inode ref + * 2 items for dir items + * 1 item for parent inode + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } + + btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + ihold(inode); + + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); + + if (err) { + drop_inode = 1; + } else { + struct dentry *parent = dget_parent(dentry); + err = btrfs_update_inode(trans, root, inode); + BUG_ON(err); + btrfs_log_new_name(trans, inode, NULL, parent); + dput(parent); + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; 
+ + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_fail; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFDIR | mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + int compress_type; + + WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. 
+ */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = btrfs_ino(inode); + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compress_type; + +again: + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? 
*/ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compress_type = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compress_type) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + WARN_ON(1); + if (!trans) { + kunmap(page); + free_extent_map(em); + em 
= NULL; + + btrfs_release_path(path); + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) + return ERR_CAST(trans); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, NULL, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + write_unlock(&em_tree->lock); +out: + + trace_btrfs_get_extent(root, em); + + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 
hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + struct extent_map *em, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + bool insert = false; + + /* + * Ok if the extent map we looked up is a hole and is for the exact + * range we want, there is no reason to allocate a new one, however if + * it is not right then we need to free this one and drop the cache for + * our range. + */ + if (em->block_start != EXTENT_MAP_HOLE || em->start != start || + em->len != len) { + free_extent_map(em); + em = NULL; + insert = true; + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return ERR_CAST(trans); + + if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) + btrfs_add_inode_defrag(trans, inode); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + if (!em) { + em = alloc_extent_map(); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + + /* + * We need to do this because if we're using the original em we searched + * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
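The fiemap helper above has to decide how much of a hole to report before delalloc bytes take over. The length computation on its own, as a sketch with plain integers rather than extent maps:

/* Length of the hole to report when delalloc data begins at range_start. */
static unsigned long long hole_len_before_delalloc(unsigned long long hole_start,
                                                   unsigned long long hole_len,
                                                   unsigned long long range_start)
{
        unsigned long long until_delalloc;

        if (range_start <= hole_start)
                return 0;                       /* delalloc starts first, no hole to report */

        until_delalloc = range_start - hole_start;
        return until_delalloc < hole_len ? until_delalloc : hole_len;
}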
+ */ + em->flags = 0; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (insert) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 len) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + backref_offset = btrfs_file_extent_offset(leaf, fi); + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end < offset + len) { + /* extent doesn't include our full range, must cow */ + goto out; + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + key.offset - backref_offset, disk_bytenr)) + goto out; + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + num_bytes = min(offset + len, extent_end) - offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + struct btrfs_trans_handle *trans; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. 
INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = em->len - (start - em->start); + goto map; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + /* + * we're not going to log anything, but we do need + * to make sure the current transaction stays open + * while we look for nocow cross refs + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto must_cow; + + if (can_nocow_odirect(trans, inode, start, len) == 1) { + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + btrfs_end_transaction(trans, root); + if (ret) { + free_extent_map(em); + return ret; + } + goto unlock; + } + btrfs_end_transaction(trans, root); + } +must_cow: + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + em = btrfs_new_extent_direct(inode, em, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->len - (start - em->start)); +unlock: + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + struct bio *orig_bio; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct inode 
*inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR "btrfs csum failed ino %llu off" + " %llu csum %u private %u\n", + (unsigned long long)btrfs_ino(inode), + (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; + int ret; + + if (err) + goto out_done; +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes); + if (!ret) + goto out_test; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + btrfs_update_inode(trans, root, inode); + ret = 0; +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + ordered_offset = ordered->file_offset + ordered->len; + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + +out_test: + /* + * our bio might span multiple ordered 
extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + goto again; + } +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + dio_end_io(bio, err); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; +} + +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " + "sector %#Lx len %u err no %d\n", + (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_sector, bio->bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) + bio_io_error(dip->orig_bio); + else { + set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + u32 *csums, int async_submit) +{ + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + bio_get(bio); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + + if (skip_sum) + goto map; + + if (write && async_submit) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (write) { + /* + * If we aren't doing async submit, calculate the csum of the + * bio now. 
+ */ + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + if (ret) + goto err; + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums_dio(root, inode, bio, + file_offset, csums); + if (ret) + goto err; + } + +map: + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + u32 *csums = dip->csums; + int ret = 0; + int async_submit = 0; + int write = rw & REQ_WRITE; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(orig_bio); + return -EIO; + } + + if (map_length >= orig_bio->bi_size) { + bio = orig_bio; + goto submit; + } + + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + csums, async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + /* Write's use the ordered csums */ + if (!write && !skip_sum) + csums = csums + nr_pages; + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages ++; + bvec++; + } + } + +submit: + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + csums, async_submit); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. 
+ */ + smp_mb__before_atomic_dec(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + int skip_sum; + int write = rw & REQ_WRITE; + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + /* Write's use the ordered csum stuff, so we don't need dip->csums */ + if (!write && !skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + kfree(dip); + ret = -ENOMEM; + goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = (u64)bio->bi_sector << 9; + bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = bio; + atomic_set(&dip->pending_bios, 0); + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) + return; +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + int seg; + int i; + size_t size; + unsigned long addr; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + + if (offset & blocksize_mask) + goto out; + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + continue; + + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. 
+ */ + for (i = seg + 1; i < nr_segs; i++) { + if (iov[seg].iov_base == iov[i].iov_base) + goto out; + } + } + retval = 0; +out: + return retval; +} +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; + size_t count = iov_length(iov, nr_segs); + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, + offset, nr_segs)) { + return 0; + } + + lockstart = offset; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. 
+ */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } +out: + free_extent_state(cached_state); + return ret; +} + +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. 
+ * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ + wait_on_page_writeback(page); + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + lock_extent_bits(tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + &cached_state, GFP_NOFS); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } + btrfs_put_ordered_extent(ordered); + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + goto out; + } + + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. 
Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + 0, 0, &cached_state, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); + if (ret) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + SetPageUptodate(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); + +out_unlock: + if (!ret) + return VM_FAULT_LOCKED; + unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out: + return ret; +} + +static int btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; + int ret; + int err = 0; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return ret; + + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + /* + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have + * 3 things going on here: + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be separate. The fact is we can use a lot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be separate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item.
We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no way of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the operation, so that has to be done separately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with: + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) + return -ENOMEM; + btrfs_add_durable_block_rsv(root->fs_info, rsv); + + trans = btrfs_start_transaction(root, 4); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + + /* + * Reserve space for the truncate process. Truncate should be adding + * space, but if there are snapshots it may end up using space. + */ + ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + BUG_ON(ret); + + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + /* + * Ok so we've already migrated our bytes over for the truncate, so here + * just reserve the one slot we need for updating the inode. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + trans->block_rsv = rsv; + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero. + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution; there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash.
+ */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + + while (1) { + if (!trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + + ret = btrfs_truncate_reserve_metadata(trans, root, + rsv); + BUG_ON(ret); + + trans->block_rsv = rsv; + } + + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -EAGAIN) { + err = ret; + break; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + } + + if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_orphan_del(trans, inode); + if (ret) + err = ret; + } else if (ret && inode->i_nlink > 0) { + /* + * Failed to do the truncate, remove us from the in memory + * orphan list. + */ + ret = btrfs_orphan_del(NULL, inode); + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; + + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + +out: + btrfs_free_block_rsv(root, rsv); + + if (ret && !err) + err = ret; + + return err; +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, u64 new_dirid) +{ + struct inode *inode; + int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + err = btrfs_update_inode(trans, new_root, inode); + BUG_ON(err); + + iput(inode); + return 0; +} + +/* helper function for file defrag and space balancing. 
This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + struct inode *inode; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->root = NULL; + ei->space_info = NULL; + ei->generation = 0; + ei->sequence = 0; + ei->last_trans = 0; + ei->last_sub_trans = 0; + ei->logged_trans = 0; + ei->delalloc_bytes = 0; + ei->reserved_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->index_cnt = (u64)-1; + ei->last_unlink_trans = 0; + + atomic_set(&ei->outstanding_extents, 0); + atomic_set(&ei->reserved_extents, 0); + + ei->ordered_data_close = 0; + ei->orphan_meta_reserved = 0; + ei->dummy_inode = 0; + ei->in_defrag = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; + + ei->delayed_node = NULL; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree); + extent_io_tree_init(&ei->io_tree, &inode->i_data); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + mutex_init(&ei->log_mutex); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->ordered_operations); + RB_CLEAR_NODE(&ei->rb_node); + + return inode; +} + +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); + + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + + /* + * Make sure we're properly removed from the ordered operation + * lists. 
+ */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } + + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", + (unsigned long long)btrfs_ino(inode)); + list_del_init(&BTRFS_I(inode)->i_orphan); + } + spin_unlock(&root->orphan_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: + btrfs_remove_delayed_node(inode); + call_rcu(&inode->i_rcu, btrfs_i_callback); +} + +int btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_refs(&root->root_item) == 0 && + !is_free_space_inode(root, inode)) + return 1; + else + return generic_drop_inode(inode); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + if (!btrfs_inode_cachep) + goto fail; + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_transaction_cachep) + goto fail; + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + goto fail; + + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +/* + * If a file is moved, it will inherit the cow and compression flags of the new + * directory. 
+ */ +static void fixup_inode_flags(struct inode *dir, struct inode *inode) +{ + struct btrfs_inode *b_dir = BTRFS_I(dir); + struct btrfs_inode *b_inode = BTRFS_I(inode); + + if (b_dir->flags & BTRFS_INODE_NODATACOW) + b_inode->flags |= BTRFS_INODE_NODATACOW; + else + b_inode->flags &= ~BTRFS_INODE_NODATACOW; + + if (b_dir->flags & BTRFS_INODE_COMPRESS) + b_inode->flags |= BTRFS_INODE_COMPRESS; + else + b_inode->flags &= ~BTRFS_INODE_COMPRESS; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + u64 root_objectid; + int ret; + u64 old_ino = btrfs_ino(old_inode); + + if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + + /* close the racy window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 20); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + root->fs_info->last_trans_log_full_commit = trans->transid; + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), index); + if (ret) + goto out_fail; + /* + * this is an ugly little race, but the rename is required + * to make sure that if we crash, the inode is either at the + * old name or the new one. pinning the log transaction lets + * us make sure we don't allow a log commit to come in after + * we unlink the name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + } + /* + * make sure the inode gets flushed if it is replacing + * something. 
+ */ + if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) + btrfs_add_ordered_operation(trans, root, old_inode); + + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + if (unlikely(btrfs_ino(new_inode) == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + root_objectid = BTRFS_I(new_inode)->location.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + } + BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + BUG_ON(ret); + } + } + + fixup_inode_flags(new_dir, old_inode); + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, index); + BUG_ON(ret); + + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + struct dentry *parent = dget_parent(new_dentry); + btrfs_log_new_name(trans, old_inode, old_dir, parent); + dput(parent); + btrfs_end_log_trans(root); + } +out_fail: + btrfs_end_transaction_throttle(trans, root); +out_notrans: + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. 
+ */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFLNK|S_IRWXUGO, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = btrfs_ino(inode); + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + btrfs_free_path(path); + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + 
BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 cur_offset = start; + u64 i_size; + int ret = 0; + bool own_trans = true; + + if (trans) + own_trans = false; + while (num_bytes > 0) { + if (own_trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + } + + ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, + 0, *alloc_hint, (u64)-1, &ins, 1); + if (ret) { + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } + + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); + + num_bytes -= ins.offset; + cur_offset += ins.offset; + *alloc_hint = ins.objectid + ins.offset; + + inode->i_ctime = CURRENT_TIME; + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + if (own_trans) + btrfs_end_transaction(trans, root); + } + return ret; +} + +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} + +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, trans); +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, flags, btrfs_check_acl); +} + +static 
const struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static const struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; + +static const struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, +}; + +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. 
+ */ +static const struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static const struct inode_operations btrfs_file_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fiemap = btrfs_fiemap, +}; +static const struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static const struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = btrfs_getattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 00000000..a3c4751e --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,2914 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" +#include "inode-map.h" + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. 
+ */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + if (flags & BTRFS_INODE_NODATACOW) + iflags |= FS_NOCOW_FL; + + if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) + iflags |= FS_COMPR_FL; + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} + +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | + FS_NOCOW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (btrfs_root_readonly(root)) + return -EROFS; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + ret = check_flags(flags); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags 
|= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + if (flags & FS_NOCOW_FL) + ip->flags |= BTRFS_INODE_NODATACOW; + else + ip->flags &= ~BTRFS_INODE_NODATACOW; + + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. + */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & FS_COMPR_FL) { + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + } else { + ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + } + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + + ret = 0; + out_unlock: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} + +static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +{ + struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device *device; + struct request_queue *q; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, + dev_list) { + if (!device->bdev) + continue; + q = bdev_get_queue(device->bdev); + if (blk_queue_discard(q)) { + num_devices++; + minlen = min((u64)q->limits.discard_granularity, + minlen); + } + } + rcu_read_unlock(); + if (!num_devices) + return -EOPNOTSUPP; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(root, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen, + u64 *async_transid) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root; + struct dentry *parent = dget_parent(dentry); + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); + if (ret) { + dput(parent); + return ret; + } + + dir = parent->d_inode; + + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) { + dput(parent); + return PTR_ERR(trans); + } + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + memset_extent_buffer(leaf, 0, 0, 
sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + root_item.flags = 0; + root_item.byte_limit = 0; + inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, leaf->len); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + key.offset = (u64)-1; + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(new_root)); + + btrfs_record_root_in_trans(trans, new_root); + + ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + btrfs_ino(dir), index, name, namelen); + + BUG_ON(ret); + + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); +fail: + dput(parent); + if (async_transid) { + *async_transid = trans->transid; + err = btrfs_commit_transaction_async(trans, root, 1); + } else { + err = btrfs_commit_transaction(trans, root); + } + if (err && !ret) + ret = err; + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen, u64 *async_transid, + bool readonly) +{ + struct inode *inode; + struct dentry *parent; + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret; + + if (!root->ref_cows) + return -EINVAL; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + + btrfs_init_block_rsv(&pending_snapshot->block_rsv); + pending_snapshot->dentry = dentry; + pending_snapshot->root = root; + pending_snapshot->readonly = readonly; + + trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); + BUG_ON(ret); + + spin_lock(&root->fs_info->trans_lock); + 
list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); + if (async_transid) { + *async_transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, + root->fs_info->extent_root, 1); + } else { + ret = btrfs_commit_transaction(trans, + root->fs_info->extent_root); + } + BUG_ON(ret); + + ret = pending_snapshot->error; + if (ret) + goto fail; + + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + parent = dget_parent(dentry); + inode = btrfs_lookup_dentry(parent->d_inode, dentry); + dput(parent); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + BUG_ON(!inode); + d_instantiate(dentry, inode); + ret = 0; +fail: + kfree(pending_snapshot); + return ret; +} + +/* copy of check_sticky in fs/namei.c() +* It's inline, so penalty for filesystems that don't use sticky bit is +* minimal. +*/ +static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) +{ + uid_t fsuid = current_fsuid(); + + if (!(dir->i_mode & S_ISVTX)) + return 0; + if (inode->i_uid == fsuid) + return 0; + if (dir->i_uid == fsuid) + return 0; + return !capable(CAP_FOWNER); +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ + +static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) +{ + int error; + + if (!victim->d_inode) + return -ENOENT; + + BUG_ON(victim->d_parent->d_inode != dir); + audit_inode_child(victim, dir); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (btrfs_check_sticky(dir, victim->d_inode)|| + IS_APPEND(victim->d_inode)|| + IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (S_ISDIR(victim->d_inode->i_mode)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +static noinline int btrfs_mksubvol(struct path *parent, + char *name, int namelen, + struct btrfs_root *snap_src, + u64 *async_transid, bool readonly) +{ + struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + + if (snap_src) { + error = create_snapshot(snap_src, dentry, + name, namelen, async_transid, readonly); + } else { + error = create_subvol(BTRFS_I(dir)->root, dentry, + name, namelen, async_transid); + } + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&dir->i_mutex); + return error; +} + +/* + * When we're defragging a range, we don't want to kick it off again + * if it is really just waiting for delalloc to send it down. + * If we find a nice big extent or delalloc range for the bytes in the + * file you want to defrag, we return 0 to let you know to skip this + * part of the file + */ +static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 end; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (em) { + end = extent_map_end(em); + free_extent_map(em); + if (end - offset > thresh) + return 0; + } + /* if we already have a nice delalloc here, just stop */ + thresh /= 2; + end = count_range_bits(io_tree, &offset, offset + thresh, + thresh, EXTENT_DELALLOC, 1); + if (end >= thresh) + return 0; + return 1; +} + +/* + * helper function to walk through a file and find extents + * newer than a specific transid, and smaller than thresh. 
+ * + * This is used by the defragging code to find new and small + * extents + */ +static int find_new_extents(struct btrfs_root *root, + struct inode *inode, u64 newer_than, + u64 *off, int thresh) +{ + struct btrfs_path *path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + int type; + int ret; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + min_key.objectid = ino; + min_key.type = BTRFS_EXTENT_DATA_KEY; + min_key.offset = *off; + + max_key.objectid = ino; + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + path->keep_locks = 1; + + while(1) { + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, newer_than); + if (ret != 0) + goto none; + if (min_key.objectid != ino) + goto none; + if (min_key.type != BTRFS_EXTENT_DATA_KEY) + goto none; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_num_bytes(leaf, extent) < thresh && + check_defrag_in_cache(inode, min_key.offset, thresh)) { + *off = min_key.offset; + btrfs_free_path(path); + return 0; + } + + if (min_key.offset == (u64)-1) + goto none; + + min_key.offset++; + btrfs_release_path(path); + } +none: + btrfs_free_path(path); + return -ENOENT; +} + +static int should_defrag_range(struct inode *inode, u64 start, u64 len, + int thresh, u64 *last_len, u64 *skip, + u64 *defrag_end) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 1; + + /* + * make sure that once we start defragging and extent, we keep on + * defragging it + */ + if (start < *defrag_end) + return 1; + + *skip = 0; + + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + if (!em) { + /* get the big lock and read metadata off disk */ + lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + + if (IS_ERR(em)) + return 0; + } + + /* this will cover holes, and inline extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) + ret = 0; + + /* + * we hit a real extent, if it is big don't bother defragging it again + */ + if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + ret = 0; + + /* + * last_len ends up being a counter of how many bytes we've defragged. + * every time we choose not to defrag an extent, we reset *last_len + * so that the next tiny extent will force a defrag. + * + * The end result of this is that tiny extents before a single big + * extent will force at least part of that big extent to be defragged. + */ + if (ret) { + *last_len += len; + *defrag_end = extent_map_end(em); + } else { + *last_len = 0; + *skip = extent_map_end(em); + *defrag_end = 0; + } + + free_extent_map(em); + return ret; +} + +/* + * it doesn't do much good to defrag one or two pages + * at a time. This pulls in a nice chunk of pages + * to COW and defrag. + * + * It also makes sure the delalloc code has enough + * dirty data to avoid making new small extents as part + * of the defrag + * + * It's a good idea to start RA on this range + * before calling this. 
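Before the clustering helper below, it is worth making the last_len bookkeeping in should_defrag_range() concrete: small extents accumulate into *last_len, and once such a run precedes a large extent, that large extent is defragged too so the small ones in front of it can be merged into it. A minimal stand-alone sketch of just that decision rule (the toy_* names and extent sizes are invented for illustration; this is not part of the patch):

#include <stdio.h>

/* toy model of the decision rule used by should_defrag_range() above */
struct toy_extent { unsigned long long len; };

static int toy_should_defrag(unsigned long long thresh,
                             unsigned long long *last_len,
                             unsigned long long len)
{
        int defrag = 1;

        /* a big extent is normally left alone, unless a run of small
         * extents (tracked in *last_len) immediately precedes it */
        if ((*last_len == 0 || *last_len >= thresh) && len >= thresh)
                defrag = 0;

        if (defrag)
                *last_len += len;       /* keep counting the small run */
        else
                *last_len = 0;          /* big extent skipped, reset the run */
        return defrag;
}

int main(void)
{
        struct toy_extent file[] = { {4096}, {8192}, {4096}, {1048576}, {2097152} };
        unsigned long long thresh = 256 * 1024, last_len = 0;
        unsigned int i;

        for (i = 0; i < sizeof(file) / sizeof(file[0]); i++)
                printf("extent %u (%llu bytes): %s\n", i, file[i].len,
                       toy_should_defrag(thresh, &last_len, file[i].len) ?
                       "defrag" : "skip");
        return 0;
}

With a 256KiB threshold, the 1MiB extent that follows the run of 4-8KiB extents is reported as "defrag", while the 2MiB extent after it is skipped.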
+ */ +static int cluster_pages_for_defrag(struct inode *inode, + struct page **pages, + unsigned long start_index, + int num_pages) +{ + unsigned long file_end; + u64 isize = i_size_read(inode); + u64 page_start; + u64 page_end; + int ret; + int i; + int i_done; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + + if (isize == 0) + return 0; + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); + if (ret) + return ret; +again: + ret = 0; + i_done = 0; + + /* step one, lock all the pages */ + for (i = 0; i < num_pages; i++) { + struct page *page; + page = grab_cache_page(inode->i_mapping, + start_index + i); + if (!page) + break; + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + isize = i_size_read(inode); + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || page->index > file_end || + page->mapping != inode->i_mapping) { + /* whoops, we blew past eof, skip this page */ + unlock_page(page); + page_cache_release(page); + break; + } + pages[i] = page; + i_done++; + } + if (!i_done || ret) + goto out; + + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + goto out; + + /* + * so now we have a nice long stream of locked + * and up to date pages, lets wait on them + */ + for (i = 0; i < i_done; i++) + wait_on_page_writeback(pages[i]); + + page_start = page_offset(pages[0]); + page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); + if (ordered && + ordered->file_offset + ordered->len > page_start && + ordered->file_offset < page_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, + &cached_state, GFP_NOFS); + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, page_start, + page_end - page_start); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, + page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, + GFP_NOFS); + + if (i_done != num_pages) { + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - i_done) << PAGE_CACHE_SHIFT); + } + + + btrfs_set_extent_delalloc(inode, page_start, page_end - 1, + &cached_state); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state, + GFP_NOFS); + + for (i = 0; i < i_done; i++) { + clear_page_dirty_for_io(pages[i]); + ClearPageChecked(pages[i]); + set_page_extent_mapped(pages[i]); + set_page_dirty(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + return i_done; +out: + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); + return ret; + +} + +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_super_block *disk_super; + struct file_ra_state *ra = NULL; + unsigned long last_index; + u64 features; + 
u64 last_len = 0; + u64 skip = 0; + u64 defrag_end = 0; + u64 newer_off = range->start; + int newer_left = 0; + unsigned long i; + int ret; + int defrag_count = 0; + int compress_type = BTRFS_COMPRESS_ZLIB; + int extent_thresh = range->extent_thresh; + int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + u64 new_align = ~((u64)128 * 1024 - 1); + struct page **pages = NULL; + + if (extent_thresh == 0) + extent_thresh = 256 * 1024; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (inode->i_size == 0) + return 0; + + /* + * if we were not given a file, allocate a readahead + * context + */ + if (!file) { + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + file_ra_state_init(ra, inode->i_mapping); + } else { + ra = &file->f_ra; + } + + pages = kmalloc(sizeof(struct page *) * newer_cluster, + GFP_NOFS); + if (!pages) { + ret = -ENOMEM; + goto out_ra; + } + + /* find the last page to defrag */ + if (range->start + range->len > range->start) { + last_index = min_t(u64, inode->i_size - 1, + range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + } else { + last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + } + + if (newer_than) { + ret = find_new_extents(root, inode, newer_than, + &newer_off, 64 * 1024); + if (!ret) { + range->start = newer_off; + /* + * we always align our defrag to help keep + * the extents in the file evenly spaced + */ + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + newer_left = newer_cluster; + } else + goto out_ra; + } else { + i = range->start >> PAGE_CACHE_SHIFT; + } + if (!max_to_defrag) + max_to_defrag = last_index - 1; + + while (i <= last_index && defrag_count < max_to_defrag) { + /* + * make sure we stop running if someone unmounts + * the FS + */ + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + break; + + if (!newer_than && + !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, + extent_thresh, + &last_len, &skip, + &defrag_end)) { + unsigned long next; + /* + * the should_defrag function tells us how much to skip + * bump our counter by the suggested amount + */ + next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + i = max(i + 1, next); + continue; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress = compress_type; + + btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + + ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + if (ret < 0) + goto out_ra; + + defrag_count += ret; + balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + i += ret; + + if (newer_than) { + if (newer_off == (u64)-1) + break; + + newer_off = max(newer_off + 1, + (u64)i << PAGE_CACHE_SHIFT); + + ret = find_new_extents(root, inode, + newer_than, &newer_off, + 64 * 1024); + if (!ret) { + range->start = newer_off; + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + newer_left = newer_cluster; + } else { + break; + } + } else { + i++; + } + } + + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + filemap_flush(inode->i_mapping); + + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + 
wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + mutex_unlock(&inode->i_mutex); + } + + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + + if (!file) + kfree(ra); + return defrag_count; + +out_ra: + if (!file) + kfree(ra); + kfree(pages); + return ret; +} + +static noinline int btrfs_ioctl_resize(struct btrfs_root *root, + void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = memparse(sizestr, NULL); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_transid(struct file *file, + char *name, + unsigned long fd, + int subvol, + u64 *transid, + bool readonly) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct file *src_file; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + namelen = strlen(name); + if (strchr(name, '/')) { + ret = -EINVAL; + goto out; + } + + if 
(subvol) { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + NULL, transid, readonly); + } else { + struct inode *src_inode; + src_file = fget(fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid, readonly); + fput(src_file); + } +out: + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); + + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = 
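A hedged user-space sketch of driving the snapshot path above through BTRFS_IOC_SNAP_CREATE_V2: the ioctl is issued on the destination directory and args.fd names the subvolume to snapshot. The struct and flag definitions come from the fs/btrfs/ioctl.h added later in this patch; the local header name "btrfs_ioctl.h" and the /mnt/btrfs paths are assumptions.

/* sketch: create a read-only snapshot of the subvolume open on src,
 * named "snap", inside the directory open on dir */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

static int make_ro_snapshot(int dest_dir_fd, int src_fd, const char *name)
{
        struct btrfs_ioctl_vol_args_v2 args;

        memset(&args, 0, sizeof(args));
        args.fd = src_fd;                      /* subvolume to snapshot */
        args.flags = BTRFS_SUBVOL_RDONLY;      /* handled by snap_create_v2() */
        strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

        /* the ioctl is issued on the destination directory */
        return ioctl(dest_dir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
}

int main(void)
{
        int dir = open("/mnt/btrfs", O_RDONLY);          /* assumption */
        int src = open("/mnt/btrfs/subvol", O_RDONLY);   /* assumption */

        if (dir < 0 || src < 0 || make_ro_snapshot(dir, src, "snap") < 0)
                perror("snapshot");
        return 0;
}

A plain empty subvolume would instead use BTRFS_IOC_SUBVOL_CREATE with struct btrfs_ioctl_vol_args, which takes the create_subvol() branch above.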
btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); + return ret; +} + +/* + * helper to check if the subvolume references other subvolumes + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +static noinline int key_in_sk(struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk) +{ + struct btrfs_key test; + int ret; + + test.objectid = sk->min_objectid; + test.type = sk->min_type; + test.offset = sk->min_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret < 0) + return 0; + + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret > 0) + return 0; + return 1; +} + +static noinline int copy_to_sk(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk, + char *buf, + unsigned long *sk_offset, + int *num_found) +{ + u64 found_transid; + struct extent_buffer *leaf; + struct btrfs_ioctl_search_header sh; + unsigned long item_off; + unsigned long item_len; + int nritems; + int i; + int slot; + int ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + + if (btrfs_header_generation(leaf) > sk->max_transid) { + i = nritems; + goto advance_key; + } + found_transid = btrfs_header_generation(leaf); + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); + item_len = btrfs_item_size_nr(leaf, i); + + if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + item_len = 0; + + if (sizeof(sh) + item_len + *sk_offset > + BTRFS_SEARCH_ARGS_BUFSIZE) { + ret = 1; + goto overflow; + } + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + sh.objectid = key->objectid; + sh.offset = key->offset; + sh.type = key->type; + sh.len = item_len; + sh.transid = found_transid; + + /* copy search result header */ + memcpy(buf + *sk_offset, &sh, sizeof(sh)); + *sk_offset += sizeof(sh); + + if (item_len) { + char *p = buf + *sk_offset; + /* copy the item */ + read_extent_buffer(leaf, p, + item_off, item_len); + *sk_offset += item_len; + } + (*num_found)++; + + if (*num_found >= sk->nr_items) + break; + } +advance_key: + ret = 0; + if (key->offset < (u64)-1 && key->offset < sk->max_offset) + key->offset++; + else if (key->type < (u8)-1 && key->type < sk->max_type) { + key->offset = 0; + key->type++; + } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { + key->offset = 0; + key->type = 0; + key->objectid++; + } else + ret = 1; +overflow: + return ret; +} + +static noinline int search_ioctl(struct inode 
*inode, + struct btrfs_ioctl_search_args *args) +{ + struct btrfs_root *root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path; + struct btrfs_ioctl_search_key *sk = &args->key; + struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; + int ret; + int num_found = 0; + unsigned long sk_offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (sk->tree_id == 0) { + /* search the root of the inode that was passed */ + root = BTRFS_I(inode)->root; + } else { + key.objectid = sk->tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "could not find root %llu\n", + sk->tree_id); + btrfs_free_path(path); + return -ENOENT; + } + } + + key.objectid = sk->min_objectid; + key.type = sk->min_type; + key.offset = sk->min_offset; + + max_key.objectid = sk->max_objectid; + max_key.type = sk->max_type; + max_key.offset = sk->max_offset; + + path->keep_locks = 1; + + while(1) { + ret = btrfs_search_forward(root, &key, &max_key, path, 0, + sk->min_transid); + if (ret != 0) { + if (ret > 0) + ret = 0; + goto err; + } + ret = copy_to_sk(root, path, &key, sk, args->buf, + &sk_offset, &num_found); + btrfs_release_path(path); + if (ret || num_found >= sk->nr_items) + break; + + } + ret = 0; +err: + sk->nr_items = num_found; + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_tree_search(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = fdentry(file)->d_inode; + ret = search_ioctl(inode, args); + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + kfree(args); + return ret; +} + +/* + * Search INODE_REFs to identify path name of 'dirid' directory + * in a 'tree_id' tree. and sets path name to 'name'. 
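The tree-search ioctl above packs btrfs_ioctl_search_header records, each immediately followed by its raw item bytes, into the 4k args->buf, and reports the count back through key.nr_items. A hedged user-space sketch of issuing one search and walking the returned buffer (struct definitions from the ioctl.h added later in this patch; the local header name and mount point are assumptions):

/* sketch: dump the first buffer-full of items from the subvolume tree
 * that the open file descriptor belongs to (tree_id 0, see search_ioctl) */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_search_args args;
        struct btrfs_ioctl_search_header sh;
        unsigned long off = 0;
        __u32 i;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0)
                return 1;

        memset(&args, 0, sizeof(args));
        /* min/max describe one lexicographic key range (see key_in_sk()),
         * so this asks for every key; filter on sh.type afterwards if needed */
        args.key.tree_id = 0;                    /* tree of the fd's subvolume */
        args.key.max_objectid = (__u64)-1;
        args.key.max_type = (__u32)-1;
        args.key.max_offset = (__u64)-1;
        args.key.max_transid = (__u64)-1;
        args.key.nr_items = (__u32)-1;           /* as many as fit in buf */

        if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
                return 1;

        /* on return, nr_items holds how many headers copy_to_sk() packed in;
         * each header is immediately followed by sh.len bytes of raw item */
        for (i = 0; i < args.key.nr_items; i++) {
                memcpy(&sh, args.buf + off, sizeof(sh));
                printf("objectid %llu type %u len %u\n",
                       (unsigned long long)sh.objectid, sh.type, sh.len);
                off += sizeof(sh) + sh.len;
        }
        return 0;
}

A complete walk of a large tree would repeat the call, restarting the min key just past the last key returned in the previous batch.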
+ */ +static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, + u64 tree_id, u64 dirid, char *name) +{ + struct btrfs_root *root; + struct btrfs_key key; + char *ptr; + int ret = -1; + int slot; + int len; + int total_len = 0; + struct btrfs_inode_ref *iref; + struct extent_buffer *l; + struct btrfs_path *path; + + if (dirid == BTRFS_FIRST_FREE_OBJECTID) { + name[0]='\0'; + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + + key.objectid = tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "could not find root %llu\n", tree_id); + ret = -ENOENT; + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while(1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + if (ret > 0 && slot > 0) + slot--; + btrfs_item_key_to_cpu(l, &key, slot); + + if (ret > 0 && (key.objectid != dirid || + key.type != BTRFS_INODE_REF_KEY)) { + ret = -ENOENT; + goto out; + } + + iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(l, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < name) + goto out; + + *(ptr + len) = '/'; + read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); + + if (key.offset == BTRFS_FIRST_FREE_OBJECTID) + break; + + btrfs_release_path(path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + + } + if (ptr < name) + goto out; + memcpy(name, ptr, total_len); + name[total_len]='\0'; + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_ino_lookup(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = fdentry(file)->d_inode; + + if (args->treeid == 0) + args->treeid = BTRFS_I(inode)->root->root_key.objectid; + + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + args->treeid, args->objectid, + args->name); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg) +{ + struct dentry *parent = fdentry(file); + struct dentry *dentry; + struct inode *dir = parent->d_inode; + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + int namelen; + int ret; + int err = 0; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/') || + strncmp(vol_args->name, "..", namelen) == 0) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(vol_args->name, parent, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_dput; + } + + inode = dentry->d_inode; + dest = BTRFS_I(inode)->root; + if 
(!capable(CAP_SYS_ADMIN)){ + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2). + */ + err = -EPERM; + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ + err = -EINVAL; + if (root == dest) + goto out_dput; + + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; + + /* check if subvolume may be deleted by a non-root user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + } + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + mutex_lock(&inode->i_mutex); + err = d_invalidate(dentry); + if (err) + goto out_unlock; + + down_write(&root->fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_up_write; + } + trans->block_rsv = &root->fs_info->global_block_rsv; + + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + BUG_ON(ret); + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + if (!xchg(&dest->orphan_item_inserted, 1)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + inode->i_flags |= S_DEAD; +out_up_write: + up_write(&root->fs_info->subvol_sem); +out_unlock: + mutex_unlock(&inode->i_mutex); + if (!err) { + shrink_dcache_sb(root->fs_info->sb); + btrfs_invalidate_inodes(dest); + d_delete(dentry); + } +out_dput: + dput(dentry); +out_unlock_dir: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(file->f_path.mnt); +out: + kfree(vol_args); + return err; +} + +static int btrfs_ioctl_defrag(struct file *file, void __user *argp) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args *range; + int ret; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_defrag_root(root, 0); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + ret = -ENOMEM; + goto out; + } + + if (argp) { + if (copy_from_user(range, argp, + sizeof(*range))) { + ret = -EFAULT; + kfree(range); + goto out; + } + /* compression requires us to start the IO */ + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range->flags |= BTRFS_DEFRAG_RANGE_START_IO; + range->extent_thresh = (u32)-1; + } + 
} else { + /* the rest are all set to zero by kzalloc */ + range->len = (u64)-1; + } + ret = btrfs_defrag_file(fdentry(file)->d_inode, file, + range, 0, 0); + if (ret > 0) + ret = 0; + kfree(range); + break; + default: + ret = -EINVAL; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_fs_info_args *fi_args; + struct btrfs_device *device; + struct btrfs_device *next; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; + } + mutex_unlock(&fs_devices->device_list_mutex); + + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; + + kfree(fi_args); + return ret; +} + +static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_info_args *di_args; + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + char *s_uuid = NULL; + char empty_uuid[BTRFS_UUID_SIZE] = {0}; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + di_args = memdup_user(arg, sizeof(*di_args)); + if (IS_ERR(di_args)) + return PTR_ERR(di_args); + + if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0) + s_uuid = di_args->uuid; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + ret = -ENODEV; + goto out; + } + + di_args->devid = dev->devid; + di_args->bytes_used = dev->bytes_used; + di_args->total_bytes = dev->total_bytes; + memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + strncpy(di_args->path, dev->name, sizeof(di_args->path)); + +out: + if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) + ret = -EFAULT; + + kfree(di_args); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer 
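btrfs_ioctl_fs_info() above only reports the device count and the highest devid in use, so user space that wants per-device details probes each devid up to max_id with BTRFS_IOC_DEV_INFO and skips the gaps. A hedged sketch of that pattern (args structs from the ioctl.h added later in this patch; the local header name and mount point are assumptions):

/* sketch: enumerate devices of a mounted btrfs via FS_INFO + DEV_INFO */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_fs_info_args fi;
        struct btrfs_ioctl_dev_info_args di;
        __u64 devid;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0 || ioctl(fd, BTRFS_IOC_FS_INFO, &fi) < 0)
                return 1;

        printf("%llu device(s), highest devid %llu\n",
               (unsigned long long)fi.num_devices,
               (unsigned long long)fi.max_id);

        for (devid = 1; devid <= fi.max_id; devid++) {
                memset(&di, 0, sizeof(di));
                di.devid = devid;           /* uuid left zeroed: match by devid */
                if (ioctl(fd, BTRFS_IOC_DEV_INFO, &di) < 0) {
                        if (errno == ENODEV)
                                continue;   /* devid not in use, keep probing */
                        return 1;
                }
                printf("devid %llu: %s, %llu / %llu bytes used\n",
                       (unsigned long long)di.devid, (char *)di.path,
                       (unsigned long long)di.bytes_used,
                       (unsigned long long)di.total_bytes);
        }
        return 0;
}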
*leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + return -EINVAL; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + /* the src must be open for reading */ + if (!(src_file->f_mode & FMODE_READ)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off + len > src->i_size || off + len < off) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(src, off+len); + if (!ordered && + !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, + EXTENT_DELALLOC, 0, NULL)) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, len); + } + + /* clone data */ + key.objectid = btrfs_ino(src); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. 
+ */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != btrfs_ino(src)) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 endoff; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(path); + + if (key.offset + datal <= off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(inode); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, 0, + root->root_key.objectid, + btrfs_ino(inode), + new_key.offset - datao); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + btrfs_end_transaction(trans, root); + goto out; + } + size -= skip + trim; + datal -= skip + trim; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + BUG_ON(ret); + + if (skip) { + u32 
start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > destoff+olen) + endoff = destoff+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } +next: + btrfs_release_path(path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(path); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
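The clone path above requires the destination to be open for writing, both files to live on the same btrfs, and off/len/destoff to be block aligned (it only rounds up by itself when the source range ends at EOF; a length of 0 means "from src_offset through EOF"). A hedged user-space sketch of BTRFS_IOC_CLONE_RANGE under those rules (clone_range_args comes from the ioctl.h added later in this patch; the local header name and file paths are assumptions):

/* sketch: reflink all of src into dst starting at offset 0; extents are
 * shared, no data is copied */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_clone_range_args args;
        int src = open("/mnt/btrfs/src", O_RDONLY);                /* assumption */
        int dst = open("/mnt/btrfs/dst", O_WRONLY | O_CREAT, 0644); /* assumption */

        if (src < 0 || dst < 0)
                return 1;

        memset(&args, 0, sizeof(args));
        args.src_fd = src;
        args.src_offset = 0;     /* must be block aligned */
        args.src_length = 0;     /* 0: clone from src_offset through EOF */
        args.dest_offset = 0;    /* must be block aligned */

        /* the ioctl is issued on the destination file */
        if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0) {
                perror("clone range");
                return 1;
        }
        return 0;
}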
+ */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EINPROGRESS; + if (file->private_data) + goto out; + + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + atomic_inc(&root->fs_info->open_ioctl_trans); + + ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root); + if (IS_ERR(trans)) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + atomic_dec(&root->fs_info->open_ioctl_trans); + mnt_drop_write(file->f_path.mnt); +out: + return ret; +} + +static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_super_block *disk_super; + u64 features; + u64 objectid = 0; + u64 dir_id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) + return -EFAULT; + + if (!objectid) + objectid = root->root_key.objectid; + + location.objectid = objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) + return PTR_ERR(new_root); + + if (btrfs_root_refs(&new_root->root_item) == 0) + return -ENOENT; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, + dir_id, "default", 7, 1); + if (IS_ERR_OR_NULL(di)) { + btrfs_free_path(path); + btrfs_end_transaction(trans, root); + printk(KERN_ERR "Umm, you don't have the default dir item, " + "this isn't going to work\n"); + return -ENOENT; + } + + btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); + btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { + features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; + btrfs_set_super_incompat_flags(disk_super, features); + } + btrfs_end_transaction(trans, root); + + return 0; +} + +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } +} + +long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_info space; + struct btrfs_ioctl_space_info *dest; + struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info __user *user_dest; + 
struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int alloc_size; + int ret = 0; + u64 slot_count = 0; + int i, c; + + if (copy_from_user(&space_args, + (struct btrfs_ioctl_space_args __user *)arg, + sizeof(space_args))) + return -EFAULT; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } + + /* space_slots == 0 means they are asking for a count */ + if (space_args.space_slots == 0) { + space_args.total_spaces = slot_count; + goto out; + } + + slot_count = min_t(u64, space_args.space_slots, slot_count); + + alloc_size = sizeof(*dest) * slot_count; + + /* we generally have at most 6 or so space infos, one for each raid + * level. So, a whole page should be more than enough for everyone + */ + if (alloc_size > PAGE_CACHE_SIZE) + return -ENOMEM; + + space_args.total_spaces = 0; + dest = kmalloc(alloc_size, GFP_NOFS); + if (!dest) + return -ENOMEM; + dest_orig = dest; + + /* now we have a buffer to copy into */ + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + if (!slot_count) + break; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + get_block_group_info(&info->block_groups[c], + &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + slot_count--; + } + if (!slot_count) + break; + } + up_read(&info->groups_sem); + } + + user_dest = (struct btrfs_ioctl_space_info *) + (arg + sizeof(struct btrfs_ioctl_space_args)); + + if (copy_to_user(user_dest, dest_orig, alloc_size)) + ret = -EFAULT; + + kfree(dest_orig); +out: + if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) + ret = -EFAULT; + + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
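The space-info ioctl above is a two-step protocol: a first call with space_slots == 0 only reports, via total_spaces, how many entries exist, and a second call with that many spaces[] slots appended to the args gets them filled in. A hedged user-space sketch (struct definitions from the ioctl.h added later in this patch; the local header name and mount point are assumptions):

/* sketch: print allocated/used bytes per block group type */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_space_args probe, *args;
        __u64 i, slots;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0)
                return 1;

        /* step 1: space_slots == 0 just asks "how many entries are there?" */
        memset(&probe, 0, sizeof(probe));
        if (ioctl(fd, BTRFS_IOC_SPACE_INFO, &probe) < 0)
                return 1;
        slots = probe.total_spaces;

        /* step 2: pass a buffer with that many spaces[] entries after the header */
        args = calloc(1, sizeof(*args) + slots * sizeof(args->spaces[0]));
        if (!args)
                return 1;
        args->space_slots = slots;
        if (ioctl(fd, BTRFS_IOC_SPACE_INFO, args) < 0)
                return 1;

        for (i = 0; i < args->total_spaces; i++)
                printf("flags 0x%llx: %llu of %llu bytes used\n",
                       (unsigned long long)args->spaces[i].flags,
                       (unsigned long long)args->spaces[i].used_bytes,
                       (unsigned long long)args->spaces[i].total_bytes);
        free(args);
        return 0;
}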
+ */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = file->private_data; + if (!trans) + return -EINVAL; + file->private_data = NULL; + + btrfs_end_transaction(trans, root); + + atomic_dec(&root->fs_info->open_ioctl_trans); + + mnt_drop_write(file->f_path.mnt); + return 0; +} + +static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + struct btrfs_trans_handle *trans; + u64 transid; + int ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; + return 0; +} + +static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + u64 transid; + + if (argp) { + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + } else { + transid = 0; /* current trans */ + } + return btrfs_wait_for_commit(root, transid); +} + +static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +{ + int ret; + struct btrfs_ioctl_scrub_args *sa; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_scrub_cancel(root); +} + +static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(file, argp); + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SNAP_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); + case BTRFS_IOC_DEFAULT_SUBVOL: + return btrfs_ioctl_default_subvol(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file, NULL); + case BTRFS_IOC_DEFRAG_RANGE: + 
return btrfs_ioctl_defrag(file, argp); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_FS_INFO: + return btrfs_ioctl_fs_info(root, argp); + case BTRFS_IOC_DEV_INFO: + return btrfs_ioctl_dev_info(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_TREE_SEARCH: + return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_INO_LOOKUP: + return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_SPACE_INFO: + return btrfs_ioctl_space_info(root, argp); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(file, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(file, argp); + case BTRFS_IOC_SCRUB: + return btrfs_ioctl_scrub(root, argp); + case BTRFS_IOC_SCRUB_CANCEL: + return btrfs_ioctl_scrub_cancel(root, argp); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(root, argp); + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 00000000..ad1ea789 --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 + +/* this should be 4k */ +#define BTRFS_PATH_NAME_MAX 4087 +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) +#define BTRFS_FSID_SIZE 16 +#define BTRFS_UUID_SIZE 16 + +#define BTRFS_SUBVOL_NAME_MAX 4039 +struct btrfs_ioctl_vol_args_v2 { + __s64 fd; + __u64 transid; + __u64 flags; + __u64 unused[4]; + char name[BTRFS_SUBVOL_NAME_MAX + 1]; +}; + +/* + * structure to report errors and progress to userspace, either as a + * result of a finished scrub, a canceled scrub or a progress inquiry + */ +struct btrfs_scrub_progress { + __u64 data_extents_scrubbed; /* # of data extents scrubbed */ + __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ + __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ + __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ + __u64 read_errors; /* # of read errors encountered (EIO) */ + __u64 csum_errors; /* # of failed csum checks */ + __u64 verify_errors; /* # of occurences, where the metadata + * of a tree block did not match the + * expected values, like generation or + * logical */ + __u64 no_csum; /* # of 4k data block for which no csum + * is present, probably the result of + * data written with nodatasum */ + __u64 csum_discards; /* # of csum for which no data was found + * in the extent tree. */ + __u64 super_errors; /* # of bad super blocks encountered */ + __u64 malloc_errors; /* # of internal kmalloc errors. These + * will likely cause an incomplete + * scrub */ + __u64 uncorrectable_errors; /* # of errors where either no intact + * copy was found or the writeback + * failed */ + __u64 corrected_errors; /* # of errors corrected */ + __u64 last_physical; /* last physical address scrubbed. In + * case a scrub was aborted, this can + * be used to restart the scrub */ + __u64 unverified_errors; /* # of occurences where a read for a + * full (64k) bio failed, but the re- + * check succeeded for each 4k piece. + * Intermittent error. */ +}; + +#define BTRFS_SCRUB_READONLY 1 +struct btrfs_ioctl_scrub_args { + __u64 devid; /* in */ + __u64 start; /* in */ + __u64 end; /* in */ + __u64 flags; /* in */ + struct btrfs_scrub_progress progress; /* out */ + /* pad to 1k */ + __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; +}; + +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 +struct btrfs_ioctl_dev_info_args { + __u64 devid; /* in/out */ + __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ + __u64 bytes_used; /* out */ + __u64 total_bytes; /* out */ + __u64 unused[379]; /* pad to 4k */ + __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ +}; + +struct btrfs_ioctl_fs_info_args { + __u64 max_id; /* out */ + __u64 num_devices; /* out */ + __u8 fsid[BTRFS_FSID_SIZE]; /* out */ + __u64 reserved[124]; /* pad to 1k */ +}; + +#define BTRFS_INO_LOOKUP_PATH_MAX 4080 +struct btrfs_ioctl_ino_lookup_args { + __u64 treeid; + __u64 objectid; + char name[BTRFS_INO_LOOKUP_PATH_MAX]; +}; + +struct btrfs_ioctl_search_key { + /* which root are we searching. 
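A hedged user-space sketch of how the scrub structures above might be used: BTRFS_IOC_SCRUB (defined further down) appears to return only once btrfs_scrub_dev() has finished, after which the progress counters describe the whole run, while BTRFS_IOC_SCRUB_PROGRESS can poll the same counters from another task. The call needs CAP_SYS_ADMIN; the devid, local header name and mount point below are assumptions.

/* sketch: scrub devid 1 end to end and report the result counters */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of this header (assumption) */

int main(void)
{
        struct btrfs_ioctl_scrub_args sa;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0)
                return 1;

        memset(&sa, 0, sizeof(sa));
        sa.devid = 1;              /* first device: assumption */
        sa.start = 0;
        sa.end = (__u64)-1;        /* whole device */
        sa.flags = 0;              /* or BTRFS_SCRUB_READONLY to only report */

        if (ioctl(fd, BTRFS_IOC_SCRUB, &sa) < 0) {
                perror("scrub");
                return 1;
        }
        printf("scrubbed %llu data bytes, %llu csum errors, %llu corrected\n",
               (unsigned long long)sa.progress.data_bytes_scrubbed,
               (unsigned long long)sa.progress.csum_errors,
               (unsigned long long)sa.progress.corrected_errors);
        return 0;
}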
0 is the tree of tree roots */ + __u64 tree_id; + + /* keys returned will be >= min and <= max */ + __u64 min_objectid; + __u64 max_objectid; + + /* keys returned will be >= min and <= max */ + __u64 min_offset; + __u64 max_offset; + + /* max and min transids to search for */ + __u64 min_transid; + __u64 max_transid; + + /* keys returned will be >= min and <= max */ + __u32 min_type; + __u32 max_type; + + /* + * how many items did userland ask for, and how many are we + * returning + */ + __u32 nr_items; + + /* align to 64 bits */ + __u32 unused; + + /* some extra for later */ + __u64 unused1; + __u64 unused2; + __u64 unused3; + __u64 unused4; +}; + +struct btrfs_ioctl_search_header { + __u64 transid; + __u64 objectid; + __u64 offset; + __u32 type; + __u32 len; +}; + +#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) +/* + * the buf is an array of search headers where + * each header is followed by the actual item + * the type field is expanded to 32 bits for alignment + */ +struct btrfs_ioctl_search_args { + struct btrfs_ioctl_search_key key; + char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; +}; + +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +/* flags for the defrag range ioctl */ +#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +#define BTRFS_DEFRAG_RANGE_START_IO 2 + +struct btrfs_ioctl_space_info { + __u64 flags; + __u64 total_bytes; + __u64 used_bytes; +}; + +struct btrfs_ioctl_space_args { + __u64 space_slots; + __u64 total_spaces; + struct btrfs_ioctl_space_info spaces[0]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ + struct btrfs_ioctl_defrag_range_args) +#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args) +#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ + struct btrfs_ioctl_ino_lookup_args) +#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) +#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ + struct btrfs_ioctl_space_args) +#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) +#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) +#define 
BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) +#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ + struct btrfs_ioctl_scrub_args) +#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) +#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ + struct btrfs_ioctl_scrub_args) +#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ + struct btrfs_ioctl_dev_info_args) +#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ + struct btrfs_ioctl_fs_info_args) +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 00000000..66fa43dc --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock(&eb->lock); +} + +/* + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait. After + * this you can safely schedule with the lock held. + */ +void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + spin_unlock(&eb->lock); + } + /* exit with the spin lock released and the bit set */ +} + +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + spin_nested(eb); + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + smp_mb__after_clear_bit(); + } + /* exit with the spin lock held */ +} + +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) +{ + int i; + + for (i = 0; i < 512; i++) { + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + if (need_resched()) + break; + cpu_relax(); + } + return 0; +} + +/* + * This is somewhat different from trylock. It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. 
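+ *
+ * A typical caller pattern (an illustrative sketch, not taken from this
+ * patch) is to try the cheap spinning path first and fall back to the
+ * full blocking lock:
+ *
+ *	if (!btrfs_try_spin_lock(eb))
+ *		btrfs_tree_lock(eb);
+ *	... read or modify the extent buffer ...
+ *	btrfs_tree_unlock(eb);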
+ * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) +{ + int i; + + if (btrfs_spin_on_block(eb)) { + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + /* spin for a bit on the BLOCKING flag */ + for (i = 0; i < 2; i++) { + cpu_relax(); + if (!btrfs_spin_on_block(eb)) + break; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + return 0; +} + +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup. The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want. We want a single proc + * to be notified that the lock is ready for taking. If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + autoremove_wake_function(wait, mode, sync, key); + return 1; +} + +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) +{ + DEFINE_WAIT(wait); + wait.func = btrfs_wake_function; + + if (!btrfs_spin_on_block(eb)) + goto sleep; + + while(1) { + spin_nested(eb); + + /* nobody is blocking, exit with the spinlock held */ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 0; + + /* + * we have the spinlock, but the real owner is blocking. + * wait for them + */ + spin_unlock(&eb->lock); + + /* + * spin for a bit, and if the blocking flag goes away, + * loop around + */ + cpu_relax(); + if (btrfs_spin_on_block(eb)) + continue; +sleep: + prepare_to_wait_exclusive(&eb->lock_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + schedule(); + + finish_wait(&eb->lock_wq, &wait); + } + return 0; +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + /* + * if we were a blocking owner, we don't have the spinlock held + * just clear the bit and look for waiters + */ + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + smp_mb__after_clear_bit(); + else + spin_unlock(&eb->lock); + + if (waitqueue_active(&eb->lock_wq)) + wake_up(&eb->lock_wq); + return 0; +} + +void btrfs_assert_tree_locked(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 00000000..5c33a560 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_try_spin_lock(struct extent_buffer *eb); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 00000000..a178f5eb --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where compressed data goes */ + void *cbuf; /* where decompressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + 
int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. 
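+ * The on-disk layout built here is, roughly:
+ *
+ *	page 0:  [total_out : 4][chunk len : 4][compressed chunk ...]
+ *	page N:  [chunk continued ...][chunk len : 4][compressed chunk ...]
+ *
+ * Every chunk is preceded by a 4 byte little-endian length, and a length
+ * header is never split across a page boundary; a tail shorter than
+ * LZO_LEN is zero padded instead.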
+ */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + bool may_late_unmap, need_unmap; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + may_late_unmap = need_unmap = false; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + may_late_unmap = true; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + if (page_in_index + 1 >= total_pages_in) { 
+ ret = -1; + goto done; + } + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 00000000..a1c94042 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,983 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev = NULL; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. 
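+ * The public helpers below (btrfs_add_ordered_extent,
+ * btrfs_add_ordered_extent_dio and btrfs_add_ordered_extent_compress) are
+ * thin wrappers around this function that only differ in the dio and
+ * compress_type arguments they pass.
+ *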
+ * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio, int compress_type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->bytes_left = len; + entry->inode = inode; + entry->compress_type = compress_type; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + trace_btrfs_ordered_extent_add(inode, entry); + + spin_lock(&tree->lock); + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + spin_unlock(&tree->lock); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + BUG_ON(node); + return 0; +} + +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + list_add_tail(&sum->list, &entry->list); + spin_unlock(&tree->lock); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. 
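+ *
+ * Illustrative use (a hypothetical caller, not part of this patch):
+ *
+ *	u64 off = write_start;
+ *	if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ *						  &off, bytes_written)) {
+ *		handle_finished_extent(ordered);   (hypothetical helper)
+ *		btrfs_put_ordered_extent(ordered);
+ *	}
+ *
+ * off then points one byte past the accounted range, so the call can be
+ * repeated to walk forward across several ordered extents.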
+ */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", + (unsigned long long)dec_start, + (unsigned long long)dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)to_dec); + } + entry->bytes_left -= to_dec; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + trace_btrfs_ordered_extent_put(entry->inode, entry); + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * and you must wake_up entry->wait. You must hold the tree lock + * while you call this function. 
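+ *
+ * btrfs_remove_ordered_extent() below is the convenience wrapper that
+ * takes tree->lock around this call and then wakes entry->wait.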
+ */ +static int __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + + trace_btrfs_ordered_extent_remove(inode, entry); + + /* + * we have no more ordered extents for this inode and + * no dirty pages. We can safely remove it from the + * list of ordered extents + */ + if (RB_EMPTY_ROOT(&tree->tree) && + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + list_del_init(&BTRFS_I(inode)->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but any waiters are woken. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + ret = __btrfs_remove_ordered_extent(inode, entry); + spin_unlock(&tree->lock); + wake_up(&entry->wait); + + return ret; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * this is used during transaction commit to write all the inodes + * added to the ordered operation list. These files must be fully on + * disk before the transaction commits. + * + * we have two modes here, one is to just start the IO via filemap_flush + * and the other is to wait for all the io. 
When we wait, we have an + * extra check to make sure the ordered operation list really is empty + * before we return + */ +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +{ + struct btrfs_inode *btrfs_inode; + struct inode *inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); +again: + list_splice_init(&root->fs_info->ordered_operations, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + inode = &btrfs_inode->vfs_inode; + + list_del_init(&btrfs_inode->ordered_operations); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(inode); + + if (!wait && inode) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + if (wait) + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); + btrfs_add_delayed_iput(inode); + } + + cond_resched(); + spin_lock(&root->fs_info->ordered_extent_lock); + } + if (wait && !list_empty(&root->fs_info->ordered_operations)) + goto again; + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + trace_btrfs_ordered_extent_start(inode, entry); + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + struct btrfs_ordered_extent *ordered; + int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. 
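+ *
+ * So the overall sequence is: write the range out twice, wait for the
+ * resulting writeback, then walk the ordered tree backwards from the end
+ * of the range and wait on every ordered extent that overlaps it.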
+ */ + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + filemap_fdatawait_range(inode->i_mapping, start, orig_end); + + end = orig_end; + found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + spin_unlock(&tree->lock); + return entry; +} + +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock(&tree->lock); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + spin_unlock(&tree->lock); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. 
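+ *
+ * Worked example (illustrative): with i_size at 12k and completed ordered
+ * extents covering [0,4k) and [8k,12k) but not [4k,8k), disk_i_size can
+ * only be raised to 4k; the still-pending extent at [4k,8k) keeps the
+ * rest of the file from being exposed until its IO finishes as well.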
+ */ +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + u64 i_size = i_size_read(inode); + struct rb_node *node; + struct rb_node *prev = NULL; + struct btrfs_ordered_extent *test; + int ret = 1; + + if (ordered) + offset = entry_end(ordered); + else + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); + + spin_lock(&tree->lock); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size == i_size || offset <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, offset - 1, + EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + while (node) { + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + node = rb_prev(node); + } + new_i_size = min_t(u64, offset, i_size); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + if (ordered) { + node = rb_next(&ordered->rb_node); + } else { + if (prev) + node = rb_next(prev); + else + node = rb_first(&tree->tree); + } + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > offset) + i_size_test = test->file_offset; + } else { + i_size_test = i_size; + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. 
+ */ + if (i_size_test > offset && + !test_range_bit(io_tree, offset, i_size_test - 1, + EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size); + } + BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; +out: + /* + * we need to remove the ordered extent with the tree lock held + * so that other people calling this function don't find our fully + * processed ordered entry and skip updating the i_size + */ + if (ordered) + __btrfs_remove_ordered_extent(inode, ordered); + spin_unlock(&tree->lock); + if (ordered) + wake_up(&ordered->wait); + return ret; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + spin_lock(&tree->lock); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + spin_unlock(&tree->lock); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/* + * add a given inode to the list of inodes that must be fully on + * disk before a transaction commit finishes. + * + * This basically gives us the ext3 style data=ordered mode, and it is mostly + * used to make sure renamed files are fully on disk. + * + * It is a noop if the inode is already fully on disk. + * + * If trans is not null, we'll do a friendly check for a transaction that + * is already flushing things and force the IO down ourselves. + */ +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + u64 last_mod; + + last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); + + /* + * if this file hasn't been changed since the last transaction + * commit, we can safely return without doing anything + */ + if (last_mod < root->fs_info->last_trans_committed) + return 0; + + /* + * the transaction is already committing. Just start the IO and + * don't bother with all of this list nonsense + */ + if (trans && root->fs_info->running_transaction->blocked) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + return 0; + } + + spin_lock(&root->fs_info->ordered_extent_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 00000000..ff1f69aa --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + spinlock_t lock; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. 
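+ *
+ * Rough lifecycle (illustrative summary): an ordered extent is created by
+ * btrfs_add_ordered_extent*() with an optional type bit such as NOCOW,
+ * PREALLOC, COMPRESSED or DIRECT, its bytes_left is decremented as IO
+ * completes, IO_DONE is set once bytes_left reaches zero, and COMPLETE is
+ * set when the entry is removed from the tree and waiters are woken.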
+ */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* number of bytes that still need writing */ + u64 bytes_left; + + /* flags (described above) */ + unsigned long flags; + + /* compression algorithm */ + int compress_type; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + spin_lock_init(&t->lock); + t->tree = RB_ROOT; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_run_ordered_operations(struct 
btrfs_root *root, int wait); +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 00000000..f8be2509 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 00000000..fb2605d9 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + (unsigned long long)btrfs_extent_data_ref_root(eb, ref), + (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), + (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + int type; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + (unsigned long long)btrfs_extent_refs(eb, ei), + (unsigned long long)btrfs_extent_generation(eb, ei), + (unsigned long long)flags); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + "level %d\n", + (unsigned long long)btrfs_disk_key_objectid(&key), + key.type, + (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", (unsigned long long)offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", (unsigned long long)offset); + break; 
+ case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + (unsigned long long)offset, + btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root_v0(eb, ref0), + (unsigned long long)btrfs_ref_generation_v0(eb, ref0), + (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 type; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; + struct btrfs_key key; + struct btrfs_key found_key; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + print_extent_item(l, i); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); + break; + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + 
printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + level != 1) + BUG(); + if (btrfs_header_level(next) != + level - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 00000000..da75efe5 --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 00000000..82d569cb --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 00000000..24f7001f --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 00000000..5e0a3dc7 --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,4413 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" +#include "free-space-cache.h" +#include "inode-map.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + + u64 new_bytenr; + /* objectid of tree block owner, can be not uptodate */ + u64 owner; + /* link to pending, changed or detached list */ + struct list_head list; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* is the block in non-reference counted tree */ + unsigned int cowonly:1; + /* 1 if no child node in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; + /* + * 1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. 
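+ * An illustrative sketch, added for exposition only and not authoritative:
+ * a node typically becomes detached when build_backref_tree() decides an
+ * upper level block is not needed for relocation, roughly
+ *
+ *	list_add(&upper->list, &cache->detached);
+ *	upper->detached = 1;
+ *
+ * and update_backref_cache() drops every detached node after a
+ * transaction commit, since the extent tree may have changed by then.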
+ */ + unsigned int detached:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; +}; + +#define LOWER 0 +#define UPPER 1 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ + struct list_head pending[BTRFS_MAX_LEVEL]; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + + u64 search_start; + u64 extents_found; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; + unsigned int found_file_extent:1; + unsigned int commit_transaction:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root = RB_ROOT; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root = RB_ROOT; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + 
BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } +} + +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) +{ + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + BUG_ON(node->detached); + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->list); + list_del(&node->lower); + if 
(!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest && !node->detached); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } + /* + * add the node to leaf node list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, &cache->leaves); + upper->lowest = 1; + } + } + + drop_backref_node(cache, node); +} + +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!root->ref_cows) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. 
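+ * A small usage sketch, added for exposition only: callers in
+ * build_backref_tree() handle an ignored root by parking the node on the
+ * 'useless' list instead of wiring it to the root, i.e.
+ *
+ *	if (should_ignore_root(root))
+ *		list_add(&cur->list, &useless);
+ *	else
+ *		cur->root = root;
+ *
+ * so the node is later detached or freed rather than merged.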
+ */ + return 1; +} +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &key); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (root->ref_cows && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. 
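+ * An illustrative outline, added for exposition only and not a
+ * substitute for the code below: the function is roughly a breadth-first
+ * walk over a work list of unchecked upper level blocks,
+ *
+ *	cur = alloc_backref_node(cache);         -- node for 'bytenr'
+ *	do {
+ *		look up the extent item backrefs of cur->bytenr;
+ *		for each tree block / shared block ref:
+ *			create an edge, queue any unchecked upper node;
+ *		cur = next node taken from the pending 'list';
+ *	} while (cur);
+ *	connect the nodes/edges and insert them into cache->rb_root;
+ *
+ * already-cached (checked) upper nodes stop the walk early.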
+ */ +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct backref_cache *cache = &rc->backref_cache; + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; + int ret; + int err = 0; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + path1->reada = 1; + path2->reada = 2; + + node = alloc_backref_node(cache); + if (!node) { + err = -ENOMEM; + goto out; + } + + node->bytenr = bytenr; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(!ret || !path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + BUG_ON(!list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + BUG_ON(!list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + if (key.objectid == key.offset) { + root = find_tree_root(rc, eb, ref0); + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } + if (is_cowonly_root(btrfs_ref_root_v0(eb, + ref0))) + 
cur->cowonly = 1; + } +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + BUG_ON(!root); + cur->root = root; + break; + } + + edge = alloc_backref_edge(cache); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = alloc_backref_node(cache); + if (!upper) { + free_backref_edge(cache, edge); + err = -ENOMEM; + goto out; + } + upper->bytenr = key.offset; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &cur->upper); + edge->node[LOWER] = cur; + edge->node[UPPER] = upper; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (!root->ref_cows) + cur->cowonly = 1; + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + BUG_ON(btrfs_root_bytenr(&root->root_item) != + cur->bytenr); + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + BUG_ON(btrfs_root_bytenr(&root->root_item) != + lower->bytenr); + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; + break; + } + + edge = alloc_backref_edge(cache); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = alloc_backref_node(cache); + if (!upper) { + free_backref_edge(cache, edge); + err = -ENOMEM; + goto out; + } + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + if (!root->ref_cows) + upper->cowonly = 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs. only block + * at 'cur->level + 1' is added to the + * tail of pending list. this guarantees + * we check backrefs from lower level + * blocks to upper level blocks. 
+ */ + if (!upper->checked && + level == cur->level + 1) { + list_add_tail(&edge->list[UPPER], + &list); + } else + INIT_LIST_HEAD(&edge->list[UPPER]); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + BUG_ON(!node->checked); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + BUG_ON(rb_node); + list_add_tail(&node->lower, &cache->leaves); + } + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + BUG_ON(!upper->checked); + BUG_ON(cowonly != upper->cowonly); + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. 
+ */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + BUG_ON(!list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, upper); + list_del_init(&lower->upper); + } + upper = node; + INIT_LIST_HEAD(&list); + while (upper) { + if (RB_EMPTY_NODE(&upper->rb_node)) { + list_splice_tail(&upper->upper, &list); + free_backref_node(cache, upper); + } + + if (list_empty(&list)) + break; + + edge = list_entry(list.next, struct backref_edge, + list[LOWER]); + list_del(&edge->list[LOWER]); + upper = edge->node[UPPER]; + free_backref_edge(cache, edge); + } + return ERR_PTR(err); + } + BUG_ON(node && node->detached); + return node; +} + +/* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->checked = 1; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + BUG_ON(rb_node); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = 
list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + BUG_ON(!node); + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to update/delete the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, int del) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + BUG_ON((struct btrfs_root *)node->data != root); + + if (!del) { + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + } else { + list_del_init(&root->root_list); + kfree(node); + } + return 0; +} + +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = objectid; + + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. 
+ */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } + + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + int clear_rsv = 0; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->block_rsv) { + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = NULL; + + __add_reloc_root(reloc_root); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int del = 0; + int ret; + + if (!root->reloc_root) + goto out; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + del = 1; + } + + __update_reloc_root(reloc_root, del); + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + +out: + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(&entry->vfs_inode)) + node = node->rb_left; + else if (objectid > btrfs_ino(&entry->vfs_inode)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(&entry->vfs_inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + 
spin_unlock(&root->inode_lock); + return inode; + } + + objectid = btrfs_ino(&entry->vfs_inode) + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode), + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = 1; + goto out; + } + + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr = 0; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (first) { + inode = find_next_inode(root, key.objectid); + first = 0; + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); + inode = find_next_inode(root, key.objectid); + } + if (inode && btrfs_ino(inode) == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + GFP_NOFS); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, 
key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, GFP_NOFS); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret > 0) { + WARN_ON(1); + continue; + } + BUG_ON(ret < 0); + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + if (inode) + btrfs_add_delayed_iput(inode); + return 0; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int cow = 0; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); +again: + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = btrfs_level_size(dest, level - 1); + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (new_bytenr > 0 && new_bytenr == old_bytenr) { + WARN_ON(1); + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + 
memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, blocksize, + old_ptr_gen); + BUG_ON(!eb); + btrfs_tree_lock(eb); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 blocksize; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + blocksize = 
btrfs_level_size(root, i - 1); + eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + u64 ino; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + ino = btrfs_ino(inode); + + if (ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. 
+ */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long nr; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + u32 min_reserved; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = rc->block_rsv; + + ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, + min_reserved, 0); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + + replaced = 0; + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else { + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. 
+ */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) +{ + struct btrfs_root *root = rc->extent_root; + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + + mutex_lock(&root->fs_info->reloc_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->reloc_mutex); + +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, + num_bytes); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + goto again; + } + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); + + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); + btrfs_update_reloc_root(trans, root); + + list_add(&reloc_root->root_list, &reloc_roots); + } + + list_splice(&reloc_roots, &rc->reloc_roots); + + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; +} + +static noinline_for_stack +int merge_reloc_roots(struct reloc_control *rc) +{ + struct btrfs_root *root; + struct btrfs_root *reloc_root; + LIST_HEAD(reloc_roots); + int found = 0; + int ret; +again: + root = rc->extent_root; + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + ret = merge_reloc_root(rc, root); + BUG_ON(ret); + } else { + list_del_init(&reloc_root->root_list); + } + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); + } + + if (found) { + found = 
0; + goto again; + } + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + return 0; +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[], int *nr) +{ + struct backref_node *next; + struct btrfs_root *root; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + BUG_ON(!root->ref_cows); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); + break; + } + + WARN_ON(1); + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + if (!root) + return NULL; + + *nr = index; + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; + } + return root; +} + +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. 
+ */ +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-references counted tree */ + if (!root->ref_cows) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; +} + +static noinline_for_stack +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += btrfs_level_size(rc->extent_root, + next->level); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; +} + +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) +{ + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; + + trans->block_rsv = rc->block_rsv; + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + if (ret) { + if (ret == -EAGAIN) + rc->commit_transaction = 1; + return ret; + } + + return 0; +} + +static void release_metadata_space(struct reloc_control *rc, + struct backref_node *node) +{ + u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. 
+ */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int nr; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, rc, upper, edges, &nr); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } + drop_node_buffer(upper); + } + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } + + upper->locked = 1; + path->locks[upper->level] = 0; + + slot = path->slots[upper->level]; + btrfs_release_path(path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (lowest) { + BUG_ON(bytenr != node->bytenr); + } else { + if (node->eb->start == bytenr) + goto next; + } + + blocksize = btrfs_level_size(root, node->level); + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, blocksize, generation); + if (!eb) { + err = -EIO; + goto next; + } + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + if (ret < 0) { + err = ret; + goto next; + } + BUG_ON(node->eb != eb); + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; + } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, rc, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_path *path, int err) +{ + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node; + int level; + int ret; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while 
(!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); + + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } + } + list_splice_init(&list, &cache->pending[level]); + } + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = btrfs_level_size(rc->extent_root, node->level); + mark_block_processed(rc, node->bytenr, blocksize); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + __mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) +{ + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + BUG_ON(!eb); + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +static int reada_tree_block(struct reloc_control *rc, + struct tree_block *block) +{ + BUG_ON(block->key_ready); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int release = 0; + int ret = 0; + + if (!node) + return 0; + + BUG_ON(node->processed); + root = select_one_root(trans, node); + if (root == ERR_PTR(-ENOENT)) { + update_processed_blocks(rc, node); + goto out; + } + + if (!root || root->ref_cows) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + release = 1; + } + + if (root) { + if (root->ref_cows) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(path); + if (ret > 0) 
+ ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, node, key, path, 1); + } +out: + if (ret || node->level == 0 || node->cowonly) { + if (release) + release_metadata_space(rc, node); + remove_backref_node(&rc->backref_cache, node); + } + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + reada_tree_block(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + get_tree_block_key(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; + goto out; + } + rb_node = rb_next(rb_node); + } +out: + free_block_list(blocks); + err = finish_pending_nodes(trans, rc, path, err); + + btrfs_free_path(path); + return err; +} + +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; + + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); + + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start); + if (ret) + goto out; + + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + if (ret) + break; + nr++; + } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static noinline_for_stack +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, 
start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 page_start; + u64 page_end; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; + unsigned long last_index; + struct page *page; + struct file_ra_state *ra; + int nr = 0; + int ret = 0; + + if (!cluster->nr) + return 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; + + file_ra_state_init(ra, inode->i_mapping); + + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); + if (ret) + goto out; + + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + page = find_lock_page(inode->i_mapping, index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); + ret = -ENOMEM; + goto out; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); + ret = -EIO; + goto out; + } + } + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + + set_page_extent_mapped(page); + + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + set_page_dirty(page); + + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + + index++; + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); + } + WARN_ON(nr != cluster->nr); +out: + kfree(ra); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) +{ + int ret; + + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int 
slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + int generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + bi = (struct btrfs_tree_block_info *)(ei + 1); + generation = btrfs_extent_generation(eb, ei); + level = btrfs_tree_block_level(eb, bi); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + if (ret < 0) + return ret; + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = extent_key->offset; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (tree_block_processed(bytenr, blocksize, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, eb->len, NULL, &flags); + BUG_ON(ret); + + if (flags & 
BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + return ret; +} + +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 ino) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret = 0; + + if (inode) + goto truncate; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { + if (inode && !IS_ERR(inode)) + iput(inode); + return -ENOENT; + } + +truncate: + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_free_path(path); + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + + btrfs_free_path(path); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out: + iput(inode); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + /* + * This is an extent belonging to the free space cache, lets just delete + * it and redo the search. 
+ */ + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { + ret = delete_block_group_cache(rc->extent_root->fs_info, + NULL, ref_objectid); + if (ret != -ENOENT) + return ret; + ret = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.offset = ref_offset; + key.type = BTRFS_EXTENT_DATA_KEY; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + WARN_ON(1); + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) { + WARN_ON(1); + break; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, leaf->len, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + BUG_ON(rb_node); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * hepler to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize = btrfs_level_size(rc->extent_root, 0); + int ret; + int err = 0; + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset 
= btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } + btrfs_release_path(path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * hepler to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(path); + rc->search_start = end + 1; + } else { + rc->search_start = key.objectid + key.offset; + memcpy(extent_key, &key, sizeof(key)); + return 0; + } + } + btrfs_release_path(path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->reloc_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->reloc_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & 
BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + int ret; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + if (!rc->block_rsv) + return -ENOMEM; + + /* + * reserve some space for creating reloc trees. + * btrfs_init_reloc_root will use them when there + * is no reservation in transaction handle. + */ + ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + rc->extent_root->nodesize * 256); + if (ret) + return ret; + + rc->block_rsv->refill_used = 1; + btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + BUG_ON(IS_ERR(trans)); + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + unsigned long nr; + u64 flags; + u32 item_size; + int ret; + int err = 0; + int progress = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } + + while (1) { + progress++; + trans = btrfs_start_transaction(rc->extent_root, 0); + BUG_ON(IS_ERR(trans)); +restart: + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } + + ret = find_next_extent(trans, rc, path, &key); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(path); + ret = 0; + } + if (ret < 0) { + err = ret; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + ret = 
btrfs_block_rsv_check(trans, rc->extent_root, + rc->block_rsv, 0, 5); + if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + WARN_ON(1); + break; + } + rc->commit_transaction = 1; + } + + if (rc->commit_transaction) { + rc->commit_transaction = 0; + ret = btrfs_commit_transaction(trans, rc->extent_root); + BUG_ON(ret); + } else { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + trans = NULL; + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, + &key, &rc->cluster); + if (ret < 0) { + err = ret; + break; + } + } + } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } + + btrfs_release_path(path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + if (trans) { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); + if (ret < 0) + err = ret; + } + + rc->create_reloc_tree = 0; + set_reloc_control(rc); + + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + + err = prepare_to_merge(rc, err); + + merge_reloc_roots(rc); + + rc->merge_reloc_tree = 0; + unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + + /* get rid of pinned extents */ + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. 
+ * the inode is in data relocation tree and its link count is 0 + */ +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + unsigned long nr; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); + + err = btrfs_find_free_objectid(root, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +static struct reloc_control *alloc_reloc_control(void) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL); + return rc; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; + int ret; + int rw = 0; + int err = 0; + + rc = alloc_reloc_control(); + if (!rc) + return -ENOMEM; + + rc->extent_root = extent_root; + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, + path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + + btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + + while (1) { + mutex_lock(&fs_info->cleaner_mutex); + + btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); + + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + goto out; + } + + if (rc->extents_found == 0) + break; + + printk(KERN_INFO "btrfs: found %llu extents\n", + (unsigned long long)rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + 
invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } + } + + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); + iput(rc->data_inode); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = btrfs_end_transaction(trans, root->fs_info->tree_root); + BUG_ON(ret); + return 0; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root_no_radix(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + mark_garbage_root(reloc_root); + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = alloc_reloc_control(); + if (!rc) { + err = -ENOMEM; + goto out; + } + + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = 
read_fs_root(root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(fs_root)); + + __add_reloc_root(reloc_root); + fs_root->reloc_root = reloc_root; + } + + btrfs_commit_transaction(trans, rc->extent_root); + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: + kfree(rc); +out: + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + else + err = btrfs_orphan_cleanup(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + size_t offset; + int ret; + u64 disk_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list, 0); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return ret; +} + +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, cow); + BUG_ON(ret); + } +} + +/* + * called before 
creating snapshot. it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + BUG_ON(ret); + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + + __add_reloc_root(reloc_root); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) { + ret = clone_backref_node(trans, rc, root, reloc_root); + BUG_ON(ret); + } +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 00000000..ebe45443 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,450 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. 
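+ *
+ * Illustrative usage sketch (not copied from an existing call site; the
+ * caller context and variable names are assumptions made for explanation):
+ *
+ *	struct btrfs_root_item item;
+ *	struct btrfs_key key;
+ *	int ret;
+ *
+ *	ret = btrfs_find_last_root(tree_root, objectid, &item, &key);
+ *	if (ret == 0)
+ *		key.offset now holds the highest offset stored for objectid
+ *
+ * The search below starts from offset (u64)-1 and steps back one slot, so
+ * the first matching item found walking backwards is the highest-offset one.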
+ */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = 1; + goto out; + } + l = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } + if (item) + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + if (key) + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); + return 0; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. 
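+ *
+ * Concretely, the loop below walks every ROOT_ITEM with the given objectid;
+ * any item whose on-disk refs count is zero is re-read via
+ * btrfs_read_fs_root_no_radix() and queued with btrfs_add_dead_root() so
+ * the cleaner can finish dropping it.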
+ */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_root *dead_root; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + ret = btrfs_add_dead_root(dead_root); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root_no_name(tree_root->fs_info, + &root_key); + if (!IS_ERR(root)) + continue; + + ret = PTR_ERR(root); + if (ret != -ENOENT) { + err = ret; + break; + } + + ret = btrfs_find_dead_roots(tree_root, root_key.objectid); + if (ret) { + err = ret; + break; + } + } + + btrfs_free_path(path); + return err; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_del_root_ref(struct btrfs_trans_handle 
*trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len) + +{ + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret < 0); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + if (ret) { + err = ret; + goto out; + } + } else + err = -ENOENT; + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + +out: + btrfs_free_path(path); + return err; +} + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + + btrfs_free_path(path); + return 0; +} + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. 
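+ *
+ * The borrowed bit is BTRFS_INODE_ROOT_ITEM_INIT: once it is set, later
+ * code may trust root_item->flags and root_item->byte_limit; until then
+ * they are forced to zero here.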
+ */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = le64_to_cpu(root_item->inode.flags); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + root_item->inode.flags = cpu_to_le64(inode_flags); + root_item->flags = 0; + root_item->byte_limit = 0; + } +} diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c new file mode 100644 index 00000000..a8d03d5e --- /dev/null +++ b/fs/btrfs/scrub.c @@ -0,0 +1,1395 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "ordered-data.h" + +/* + * This is only the first step towards a full-features scrub. It reads all + * extent and super block and verifies the checksums. In case a bad checksum + * is found or the extent cannot be read, good data will be written back if + * any can be found. + * + * Future enhancements: + * - To enhance the performance, better read-ahead strategies for the + * extent-tree can be employed. + * - In case an unrepairable extent is encountered, track which files are + * affected and report them + * - In case of a read error on files with nodatasum, map the file and read + * the extent to trigger a writeback of the good copy + * - track and record media errors, throw out bad devices + * - add a mode to also read unallocated space + * - make the prefetch cancellable + */ + +struct scrub_bio; +struct scrub_page; +struct scrub_dev; +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_checksum(struct btrfs_work *work); +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer); +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer); +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); +static int scrub_fixup_check(struct scrub_bio *sbio, int ix); +static void scrub_fixup_end_io(struct bio *bio, int err); +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page); +static void scrub_fixup(struct scrub_bio *sbio, int ix); + +#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ +#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ + +struct scrub_page { + u64 flags; /* extent flags */ + u64 generation; + u64 mirror_num; + int have_csum; + u8 csum[BTRFS_CSUM_SIZE]; +}; + +struct scrub_bio { + int index; + struct scrub_dev *sdev; + struct bio *bio; + int err; + u64 logical; + u64 physical; + struct scrub_page spag[SCRUB_PAGES_PER_BIO]; + u64 count; + int next_free; + struct btrfs_work work; +}; + +struct scrub_dev { + struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; + struct btrfs_device *dev; + int first_free; + int curr; + atomic_t in_flight; + spinlock_t list_lock; + wait_queue_head_t list_wait; + u16 csum_size; + 
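+ /*
+  * data checksums read ahead for the range being scrubbed (filled by
+  * btrfs_lookup_csums_range() in scrub_stripe()); scrub_find_csum()
+  * consumes and frees the entries as pages are queued
+  */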
struct list_head csum_list; + atomic_t cancel_req; + int readonly; + /* + * statistics + */ + struct btrfs_scrub_progress stat; + spinlock_t stat_lock; +}; + +static void scrub_free_csums(struct scrub_dev *sdev) +{ + while (!list_empty(&sdev->csum_list)) { + struct btrfs_ordered_sum *sum; + sum = list_first_entry(&sdev->csum_list, + struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } +} + +static void scrub_free_bio(struct bio *bio) +{ + int i; + struct page *last_page = NULL; + + if (!bio) + return; + + for (i = 0; i < bio->bi_vcnt; ++i) { + if (bio->bi_io_vec[i].bv_page == last_page) + continue; + last_page = bio->bi_io_vec[i].bv_page; + __free_page(last_page); + } + bio_put(bio); +} + +static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +{ + int i; + + if (!sdev) + return; + + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + struct scrub_bio *sbio = sdev->bios[i]; + + if (!sbio) + break; + + scrub_free_bio(sbio->bio); + kfree(sbio); + } + + scrub_free_csums(sdev); + kfree(sdev); +} + +static noinline_for_stack +struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +{ + struct scrub_dev *sdev; + int i; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + + sdev = kzalloc(sizeof(*sdev), GFP_NOFS); + if (!sdev) + goto nomem; + sdev->dev = dev; + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + struct scrub_bio *sbio; + + sbio = kzalloc(sizeof(*sbio), GFP_NOFS); + if (!sbio) + goto nomem; + sdev->bios[i] = sbio; + + sbio->index = i; + sbio->sdev = sdev; + sbio->count = 0; + sbio->work.func = scrub_checksum; + + if (i != SCRUB_BIOS_PER_DEV-1) + sdev->bios[i]->next_free = i + 1; + else + sdev->bios[i]->next_free = -1; + } + sdev->first_free = 0; + sdev->curr = -1; + atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->cancel_req, 0); + sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + INIT_LIST_HEAD(&sdev->csum_list); + + spin_lock_init(&sdev->list_lock); + spin_lock_init(&sdev->stat_lock); + init_waitqueue_head(&sdev->list_wait); + return sdev; + +nomem: + scrub_free_dev(sdev); + return ERR_PTR(-ENOMEM); +} + +/* + * scrub_recheck_error gets called when either verification of the page + * failed or the bio failed to read, e.g. with EIO. 
In the latter case, + * recheck_error gets called for every page in the bio, even though only + * one may be bad + */ +static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +{ + if (sbio->err) { + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, + (sbio->physical + ix * PAGE_SIZE) >> 9, + sbio->bio->bi_io_vec[ix].bv_page) == 0) { + if (scrub_fixup_check(sbio, ix) == 0) + return; + } + } + + scrub_fixup(sbio, ix); +} + +static int scrub_fixup_check(struct scrub_bio *sbio, int ix) +{ + int ret = 1; + struct page *page; + void *buffer; + u64 flags = sbio->spag[ix].flags; + + page = sbio->bio->bi_io_vec[ix].bv_page; + buffer = kmap_atomic(page, KM_USER0); + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = scrub_checksum_data(sbio->sdev, + sbio->spag + ix, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = scrub_checksum_tree_block(sbio->sdev, + sbio->spag + ix, + sbio->logical + ix * PAGE_SIZE, + buffer); + } else { + WARN_ON(1); + } + kunmap_atomic(buffer, KM_USER0); + + return ret; +} + +static void scrub_fixup_end_io(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static void scrub_fixup(struct scrub_bio *sbio, int ix) +{ + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_multi_bio *multi = NULL; + u64 logical = sbio->logical + ix * PAGE_SIZE; + u64 length; + int i; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && + (sbio->spag[ix].have_csum == 0)) { + /* + * nodatasum, don't try to fix anything + * FIXME: we can do better, open the inode and trigger a + * writeback + */ + goto uncorrectable; + } + + length = PAGE_SIZE; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, + &multi, 0); + if (ret || !multi || length < PAGE_SIZE) { + printk(KERN_ERR + "scrub_fixup: btrfs_map_block failed us for %llu\n", + (unsigned long long)logical); + WARN_ON(1); + return; + } + + if (multi->num_stripes == 1) + /* there aren't any replicas */ + goto uncorrectable; + + /* + * first find a good copy + */ + for (i = 0; i < multi->num_stripes; ++i) { + if (i == sbio->spag[ix].mirror_num) + continue; + + if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, + multi->stripes[i].physical >> 9, + sbio->bio->bi_io_vec[ix].bv_page)) { + /* I/O-error, this is not a good copy */ + continue; + } + + if (scrub_fixup_check(sbio, ix) == 0) + break; + } + if (i == multi->num_stripes) + goto uncorrectable; + + if (!sdev->readonly) { + /* + * bi_io_vec[ix].bv_page now contains good data, write it back + */ + if (scrub_fixup_io(WRITE, sdev->dev->bdev, + (sbio->physical + ix * PAGE_SIZE) >> 9, + sbio->bio->bi_io_vec[ix].bv_page)) { + /* I/O-error, writeback failed, give up */ + goto uncorrectable; + } + } + + kfree(multi); + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + + if (printk_ratelimit()) + printk(KERN_ERR "btrfs: fixed up at %llu\n", + (unsigned long long)logical); + return; + +uncorrectable: + kfree(multi); + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + + if (printk_ratelimit()) + printk(KERN_ERR "btrfs: unable to fixup at %llu\n", + (unsigned long long)logical); +} + +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page) +{ + struct bio *bio = NULL; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = 
bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, PAGE_SIZE, 0); + bio->bi_end_io = scrub_fixup_end_io; + bio->bi_private = &complete; + submit_bio(rw, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; +} + +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); +} + +static void scrub_checksum(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + struct page *page; + void *buffer; + int i; + u64 flags; + u64 logical; + int ret; + + if (sbio->err) { + for (i = 0; i < sbio->count; ++i) + scrub_recheck_error(sbio, i); + + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; + + for (i = 0; i < sbio->count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + goto out; + } + for (i = 0; i < sbio->count; ++i) { + page = sbio->bio->bi_io_vec[i].bv_page; + buffer = kmap_atomic(page, KM_USER0); + flags = sbio->spag[i].flags; + logical = sbio->logical + i * PAGE_SIZE; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = scrub_checksum_tree_block(sdev, sbio->spag + i, + logical, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { + BUG_ON(i); + (void)scrub_checksum_super(sbio, buffer); + } else { + WARN_ON(1); + } + kunmap_atomic(buffer, KM_USER0); + if (ret) + scrub_recheck_error(sbio, i); + } + +out: + scrub_free_bio(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); +} + +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer) +{ + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + struct btrfs_root *root = sdev->dev->dev_root; + + if (!spag->have_csum) + return 0; + + crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, spag->csum, sdev->csum_size)) + fail = 1; + + spin_lock(&sdev->stat_lock); + ++sdev->stat.data_extents_scrubbed; + sdev->stat.data_bytes_scrubbed += PAGE_SIZE; + if (fail) + ++sdev->stat.csum_errors; + spin_unlock(&sdev->stat_lock); + + return fail; +} + +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer) +{ + struct btrfs_header *h; + struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + + /* + * we don't use the getter functions here, as we + * a) don't have an extent buffer and + * b) the page is already kmapped + */ + h = (struct btrfs_header *)buffer; + + if (logical != le64_to_cpu(h->bytenr)) + ++fail; + + if (spag->generation != 
le64_to_cpu(h->generation)) + ++fail; + + if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + ++fail; + + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + ++fail; + + crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, + PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, sdev->csum_size)) + ++crc_fail; + + spin_lock(&sdev->stat_lock); + ++sdev->stat.tree_extents_scrubbed; + sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; + if (crc_fail) + ++sdev->stat.csum_errors; + if (fail) + ++sdev->stat.verify_errors; + spin_unlock(&sdev->stat_lock); + + return fail || crc_fail; +} + +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) +{ + struct btrfs_super_block *s; + u64 logical; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + + s = (struct btrfs_super_block *)buffer; + logical = sbio->logical; + + if (logical != le64_to_cpu(s->bytenr)) + ++fail; + + if (sbio->spag[0].generation != le64_to_cpu(s->generation)) + ++fail; + + if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + ++fail; + + crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, + PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + ++fail; + + if (fail) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sdev->stat_lock); + ++sdev->stat.super_errors; + spin_unlock(&sdev->stat_lock); + } + + return fail; +} + +static int scrub_submit(struct scrub_dev *sdev) +{ + struct scrub_bio *sbio; + struct bio *bio; + int i; + + if (sdev->curr == -1) + return 0; + + sbio = sdev->bios[sdev->curr]; + + bio = bio_alloc(GFP_NOFS, sbio->count); + if (!bio) + goto nomem; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + + for (i = 0; i < sbio->count; ++i) { + struct page *page; + int ret; + + page = alloc_page(GFP_NOFS); + if (!page) + goto nomem; + + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + goto nomem; + } + } + + sbio->err = 0; + sdev->curr = -1; + atomic_inc(&sdev->in_flight); + + submit_bio(READ, bio); + + return 0; + +nomem: + scrub_free_bio(bio); + + return -ENOMEM; +} + +static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, u64 mirror_num, + u8 *csum, int force) +{ + struct scrub_bio *sbio; + +again: + /* + * grab a fresh bio or wait for one to become available + */ + while (sdev->curr == -1) { + spin_lock(&sdev->list_lock); + sdev->curr = sdev->first_free; + if (sdev->curr != -1) { + sdev->first_free = sdev->bios[sdev->curr]->next_free; + sdev->bios[sdev->curr]->next_free = -1; + sdev->bios[sdev->curr]->count = 0; + spin_unlock(&sdev->list_lock); + } else { + spin_unlock(&sdev->list_lock); + wait_event(sdev->list_wait, sdev->first_free != -1); + } + } + sbio = sdev->bios[sdev->curr]; + if (sbio->count == 0) { + sbio->physical = physical; + sbio->logical = logical; + } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || + sbio->logical + sbio->count * PAGE_SIZE != logical) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + sbio->spag[sbio->count].flags = flags; + sbio->spag[sbio->count].generation = gen; + 
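+ /*
+  * per-page bookkeeping consumed later by scrub_checksum(); the checksum
+  * itself is only copied in below when the caller supplied one
+  */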
sbio->spag[sbio->count].have_csum = 0; + sbio->spag[sbio->count].mirror_num = mirror_num; + if (csum) { + sbio->spag[sbio->count].have_csum = 1; + memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + } + ++sbio->count; + if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + } + + return 0; +} + +static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, + u8 *csum) +{ + struct btrfs_ordered_sum *sum = NULL; + int ret = 0; + unsigned long i; + unsigned long num_sectors; + u32 sectorsize = sdev->dev->dev_root->sectorsize; + + while (!list_empty(&sdev->csum_list)) { + sum = list_first_entry(&sdev->csum_list, + struct btrfs_ordered_sum, list); + if (sum->bytenr > logical) + return 0; + if (sum->bytenr + sum->len > logical) + break; + + ++sdev->stat.csum_discards; + list_del(&sum->list); + kfree(sum); + sum = NULL; + } + if (!sum) + return 0; + + num_sectors = sum->len / sectorsize; + for (i = 0; i < num_sectors; ++i) { + if (sum->sums[i].bytenr == logical) { + memcpy(csum, &sum->sums[i].sum, sdev->csum_size); + ret = 1; + break; + } + } + if (ret && i == num_sectors - 1) { + list_del(&sum->list); + kfree(sum); + } + return ret; +} + +/* scrub extent tries to collect up to 64 kB for each bio */ +static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, u64 mirror_num) +{ + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + + while (len) { + u64 l = min_t(u64, len, PAGE_SIZE); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sdev, logical, l, csum); + if (have_csum == 0) + ++sdev->stat.no_csum; + } + ret = scrub_page(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? 
csum : NULL, 0); + if (ret) + return ret; + len -= l; + logical += l; + physical += l; + } + return 0; +} + +static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, + struct map_lookup *map, int num, u64 base, u64 length) +{ + struct btrfs_path *path; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + struct blk_plug plug; + u64 flags; + int ret; + int slot; + int i; + u64 nstripes; + int start_stripe; + struct extent_buffer *l; + struct btrfs_key key; + u64 physical; + u64 logical; + u64 generation; + u64 mirror_num; + + u64 increment = map->stripe_len; + u64 offset; + + nstripes = length; + offset = 0; + do_div(nstripes, map->stripe_len); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + offset = map->stripe_len * num; + increment = map->stripe_len * map->num_stripes; + mirror_num = 0; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + offset = map->stripe_len * (num / map->sub_stripes); + increment = map->stripe_len * factor; + mirror_num = num % map->sub_stripes; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes; + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes; + } else { + increment = map->stripe_len; + mirror_num = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + /* + * find all extents for each stripe and just read them to get + * them into the page cache + * FIXME: we can do better. build a more intelligent prefetching + */ + logical = base + offset; + physical = map->stripes[num].physical; + ret = 0; + for (i = 0; i < nstripes; ++i) { + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_noplug; + + /* + * we might miss half an extent here, but that doesn't matter, + * as it's only the prefetch + */ + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out_noplug; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid >= logical + map->stripe_len) + break; + + path->slots[0]++; + } + btrfs_release_path(path); + logical += increment; + physical += map->stripe_len; + cond_resched(); + } + + /* + * collect all data csums for the stripe to avoid seeking during + * the scrub. This might currently (crc32) end up to be about 1MB + */ + start_stripe = 0; + blk_start_plug(&plug); +again: + logical = base + offset + start_stripe * increment; + for (i = start_stripe; i < nstripes; ++i) { + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + + logical += increment; + cond_resched(); + } + /* + * now find all extents for each stripe and scrub them + */ + logical = base + offset + start_stripe * increment; + physical = map->stripes[num].physical + start_stripe * map->stripe_len; + ret = 0; + for (i = start_stripe; i < nstripes; ++i) { + /* + * canceled? 
+ */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sdev->cancel_req)) { + ret = -ECANCELED; + goto out; + } + /* + * check to see if we have to pause + */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* push queued extents */ + scrub_submit(sdev); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + scrub_free_csums(sdev); + start_stripe = i; + goto again; + } + + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, 0, + BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid + key.offset <= logical) + goto next; + + if (key.objectid >= logical + map->stripe_len) + break; + + if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) + goto next; + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logical && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + printk(KERN_ERR + "btrfs scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu\n", + (unsigned long long)key.objectid, + (unsigned long long)logical); + goto next; + } + + /* + * trim extent to this stripe + */ + if (key.objectid < logical) { + key.offset -= logical - key.objectid; + key.objectid = logical; + } + if (key.objectid + key.offset > + logical + map->stripe_len) { + key.offset = logical + map->stripe_len - + key.objectid; + } + + ret = scrub_extent(sdev, key.objectid, key.offset, + key.objectid - logical + physical, + flags, generation, mirror_num); + if (ret) + goto out; + +next: + path->slots[0]++; + } + btrfs_release_path(path); + logical += increment; + physical += map->stripe_len; + spin_lock(&sdev->stat_lock); + sdev->stat.last_physical = physical; + spin_unlock(&sdev->stat_lock); + } + /* push queued extents */ + scrub_submit(sdev); + +out: + blk_finish_plug(&plug); +out_noplug: + btrfs_free_path(path); + return ret < 0 ? 
ret : 0; +} + +static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, + u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) +{ + struct btrfs_mapping_tree *map_tree = + &sdev->dev->dev_root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + int i; + int ret = -EINVAL; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + + if (!em) + return -EINVAL; + + map = (struct map_lookup *)em->bdev; + if (em->start != chunk_offset) + goto out; + + if (em->len < length) + goto out; + + for (i = 0; i < map->num_stripes; ++i) { + if (map->stripes[i].dev == sdev->dev) { + ret = scrub_stripe(sdev, map, i, chunk_offset, length); + if (ret) + goto out; + } + } +out: + free_extent_map(em); + + return ret; +} + +static noinline_for_stack +int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +{ + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_block_group_cache *cache; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = sdev->dev->devid; + key.offset = 0ull; + key.type = BTRFS_DEV_EXTENT_KEY; + + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.objectid != sdev->dev->devid) + break; + + if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset >= end) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (found_key.offset + length <= start) { + key.offset = found_key.offset + length; + btrfs_release_path(path); + continue; + } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + + /* + * get a reference on the corresponding block group to prevent + * the chunk from going away while we scrub it + */ + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) { + ret = -ENOENT; + break; + } + ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, + chunk_offset, length); + btrfs_put_block_group(cache); + if (ret) + break; + + key.offset = found_key.offset + length; + btrfs_release_path(path); + } + + btrfs_free_path(path); + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? 
ret : 0; +} + +static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +{ + int i; + u64 bytenr; + u64 gen; + int ret; + struct btrfs_device *device = sdev->dev; + struct btrfs_root *root = device->dev_root; + + gen = root->fs_info->last_trans_committed; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + if (ret) + return ret; + } + wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + + return 0; +} + +/* + * get a reference count on fs_info->scrub_workers. start worker if necessary + */ +static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + if (fs_info->scrub_workers_refcnt == 0) { + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, &fs_info->generic_worker); + fs_info->scrub_workers.idle_thresh = 4; + btrfs_start_workers(&fs_info->scrub_workers, 1); + } + ++fs_info->scrub_workers_refcnt; + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + if (--fs_info->scrub_workers_refcnt == 0) + btrfs_stop_workers(&fs_info->scrub_workers); + WARN_ON(fs_info->scrub_workers_refcnt < 0); + mutex_unlock(&fs_info->scrub_lock); +} + + +int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, + struct btrfs_scrub_progress *progress, int readonly) +{ + struct scrub_dev *sdev; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + struct btrfs_device *dev; + + if (btrfs_fs_closing(root->fs_info)) + return -EINVAL; + + /* + * check some assumptions + */ + if (root->sectorsize != PAGE_SIZE || + root->sectorsize != root->leafsize || + root->sectorsize != root->nodesize) { + printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); + return -EINVAL; + } + + ret = scrub_workers_get(root); + if (ret) + return ret; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root, devid, NULL, NULL); + if (!dev || dev->missing) { + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return -ENODEV; + } + mutex_lock(&fs_info->scrub_lock); + + if (!dev->in_fs_metadata) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return -ENODEV; + } + + if (dev->scrub_device) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return -EINPROGRESS; + } + sdev = scrub_setup_dev(dev); + if (IS_ERR(sdev)) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return PTR_ERR(sdev); + } + sdev->readonly = readonly; + dev->scrub_device = sdev; + + atomic_inc(&fs_info->scrubs_running); + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + down_read(&fs_info->scrub_super_lock); + ret = scrub_supers(sdev); + up_read(&fs_info->scrub_super_lock); + + if (!ret) + ret = scrub_enumerate_chunks(sdev, start, end); + + wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + + atomic_dec(&fs_info->scrubs_running); + 
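+ /*
+  * dropping scrubs_running lets btrfs_scrub_pause() and btrfs_scrub_cancel()
+  * waiters make progress, hence the wake_up that follows
+  */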
wake_up(&fs_info->scrub_pause_wait); + + if (progress) + memcpy(progress, &sdev->stat, sizeof(*progress)); + + mutex_lock(&fs_info->scrub_lock); + dev->scrub_device = NULL; + mutex_unlock(&fs_info->scrub_lock); + + scrub_free_dev(sdev); + scrub_workers_put(root); + + return ret; +} + +int btrfs_scrub_pause(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrub_pause_req); + while (atomic_read(&fs_info->scrubs_paused) != + atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_paused) == + atomic_read(&fs_info->scrubs_running)); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_continue(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + atomic_dec(&fs_info->scrub_pause_req); + wake_up(&fs_info->scrub_pause_wait); + return 0; +} + +int btrfs_scrub_pause_super(struct btrfs_root *root) +{ + down_write(&root->fs_info->scrub_super_lock); + return 0; +} + +int btrfs_scrub_continue_super(struct btrfs_root *root) +{ + up_write(&root->fs_info->scrub_super_lock); + return 0; +} + +int btrfs_scrub_cancel(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + if (!atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + + atomic_inc(&fs_info->scrub_cancel_req); + while (atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_running) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrub_cancel_req); + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct scrub_dev *sdev; + + mutex_lock(&fs_info->scrub_lock); + sdev = dev->scrub_device; + if (!sdev) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + atomic_inc(&sdev->cancel_req); + while (dev->scrub_device) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + dev->scrub_device == NULL); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} +int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device *dev; + int ret; + + /* + * we have to hold the device_list_mutex here so the device + * does not go away in cancel_dev. FIXME: find a better solution + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root, devid, NULL, NULL); + if (!dev) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -ENODEV; + } + ret = btrfs_scrub_cancel_dev(root, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + return ret; +} + +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress) +{ + struct btrfs_device *dev; + struct scrub_dev *sdev = NULL; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root, devid, NULL, NULL); + if (dev) + sdev = dev->scrub_device; + if (sdev) + memcpy(progress, &sdev->stat, sizeof(*progress)); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + return dev ? (sdev ? 
0 : -ENOTCONN) : -ENODEV; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 00000000..c0f7ecaf --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. 
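+ *
+ * As an illustration (the real invocations live in ctree.h; this one is
+ * shown here only as an example), an invocation along the lines of
+ *
+ *	BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64)
+ *
+ * expands into
+ *
+ *	u64 btrfs_root_ref_dirid(struct extent_buffer *eb,
+ *				 struct btrfs_root_ref *s);
+ *	void btrfs_set_root_ref_dirid(struct extent_buffer *eb,
+ *				      struct btrfs_root_ref *s, u64 val);
+ *
+ * i.e. a getter and a setter that read and write the little-endian 'dirid'
+ * member in place inside the extent buffer's pages.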
+ */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 00000000..15634d46 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,1305 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "delayed-inode.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +#define CREATE_TRACE_POINTS +#include + +static const struct super_operations btrfs_super_ops; + +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move write_super stuff at umount in order to avoid deadlock + * for umount hold all lock. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; + + (void)ret; /* FIXME: need to fix VFS to return error? 
*/ +} + +enum { + Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, + Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, + Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_subvolid, "subvolid=%d"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, + {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, + {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd, "nossd"}, + {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, + {Opt_space_cache, "space_cache"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_subvolrootid, "subvolrootid=%d"}, + {Opt_defrag, "autodefrag"}, + {Opt_inode_cache, "inode_cache"}, + {Opt_err, NULL}, +}; + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num, *orig; + int intarg; + int ret = 0; + char *compress_type; + bool compress_force = false; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + orig = options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_subvolid: + case Opt_subvolrootid: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. 
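+ *
+ * For example (the option values are illustrative), a mount with
+ * "-o subvol=snap,compress=lzo" has subvol= consumed by the early
+ * parser, while compress=lzo is picked up by the Opt_compress_type
+ * case below.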
+ */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatasum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress_force: + case Opt_compress_force_type: + compress_force = true; + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + } else { + ret = -EINVAL; + goto out; + } + + btrfs_set_opt(info->mount_opt, COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_ssd_spread: + printk(KERN_INFO "btrfs: use spread ssd " + "allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nossd: + printk(KERN_INFO "btrfs: not using ssd allocation " + "scheme\n"); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = memparse(num, NULL); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + (unsigned long long)info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = memparse(num, NULL); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + (unsigned long long)info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; + case Opt_space_cache: + printk(KERN_INFO "btrfs: enabling disk space caching\n"); + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + break; + case Opt_inode_cache: + printk(KERN_INFO "btrfs: enabling inode map caching\n"); + btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + break; + case Opt_clear_cache: + printk(KERN_INFO "btrfs: force clearing of disk cache\n"); + btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: 
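+			/* lets unprivileged owners delete subvolumes they own via the subvolume-destroy ioctl */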
+ btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + printk(KERN_INFO "btrfs: enabling auto defrag"); + btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + break; + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); + ret = -EINVAL; + goto out; + default: + break; + } + } +out: + kfree(orig); + return ret; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, u64 *subvol_objectid, + u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; + int error = 0; + int intarg; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_subvolid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_objectid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_objectid = intarg; + } + break; + case Opt_subvolrootid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_rootid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_rootid = intarg; + } + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(orig); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static struct dentry *get_default_root(struct super_block *sb, + u64 subvol_objectid) +{ + struct btrfs_root *root = sb->s_fs_info; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + struct inode *inode; + struct dentry *dentry; + u64 dir_id; + int new = 0; + + /* + * We have a specific subvol we want to mount, just setup location and + * go look up the root. + */ + if (subvol_objectid) { + location.objectid = subvol_objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + goto find_root; + } + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + path->leave_spinning = 1; + + /* + * Find the "default" dir item which points to the root item that we + * will mount by default if we haven't been given a specific subvolume + * to mount. + */ + dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (IS_ERR(di)) { + btrfs_free_path(path); + return ERR_CAST(di); + } + if (!di) { + /* + * Ok the default dir item isn't there. 
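+		 * (that item lives in the root tree and is what
+		 * "btrfs subvolume set-default" rewrites).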
This is weird since + * it's always been there, but don't freak out, just try and + * mount to root most subvolume. + */ + btrfs_free_path(path); + dir_id = BTRFS_FIRST_FREE_OBJECTID; + new_root = root->fs_info->fs_root; + goto setup_root; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + btrfs_free_path(path); + +find_root: + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) + return ERR_CAST(new_root); + + if (btrfs_root_refs(&new_root->root_item) == 0) + return ERR_PTR(-ENOENT); + + dir_id = btrfs_root_dirid(&new_root->root_item); +setup_root: + location.objectid = dir_id; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + inode = btrfs_iget(sb, &location, new_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + /* + * If we're just mounting the root most subvol put the inode and return + * a reference to the dentry. We will have already gotten a reference + * to the inode in btrfs_fill_super so we're good to go. + */ + if (!new && sb->s_root->d_inode == inode) { + iput(inode); + return dget(sb->s_root); + } + + if (new) { + const struct qstr name = { .name = "/", .len = 1 }; + + /* + * New inode, we need to make the dentry a sibling of s_root so + * everything gets cleaned up properly on unmount. + */ + dentry = d_alloc(sb->s_root, &name); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + d_splice_alias(inode, dentry); + } else { + /* + * We found the inode in cache, just find a dentry for it and + * put the reference to the inode we just got. + */ + dentry = d_find_alias(inode); + iput(inode); + } + + return dentry; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_root *tree_root; + struct btrfs_key key; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail_close; + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + cleancache_init_fs(sb); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + trace_btrfs_sync_fs(wait); + + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + return ret; +} + +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + 
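/*
+	 * Everything printed here should stay in sync with the tokens[]
+	 * table above, so a line copied from /proc/mounts can be passed
+	 * straight back via -o; only options that differ from the
+	 * defaults are emitted.
+	 */
+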
struct btrfs_fs_info *info = root->fs_info; + char *compress_type; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) { + if (info->compress_type == BTRFS_COMPRESS_ZLIB) + compress_type = "zlib"; + else + compress_type = "lzo"; + if (btrfs_test_opt(root, FORCE_COMPRESS)) + seq_printf(seq, ",compress-force=%s", compress_type); + else + seq_printf(seq, ",compress=%s", compress_type); + } + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + if (btrfs_test_opt(root, SPACE_CACHE)) + seq_puts(seq, ",space_cache"); + if (btrfs_test_opt(root, CLEAR_CACHE)) + seq_puts(seq, ",clear_cache"); + if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); + return 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_root *test_root = data; + struct btrfs_root *root = btrfs_sb(s); + + /* + * If this super block is going away, return false as it + * can't match as an existing super block. + */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + + return set_anon_super(s, data); +} + + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. 
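+ *
+ * Roughly: parse the early (device/subvol) options, scan and open the
+ * member devices, find or allocate a super block with sget(), fill a
+ * new one via btrfs_fill_super(), and finally resolve the dentry of
+ * the requested subvolume (e.g. -o subvol=home) to hand back as the
+ * mount root.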
+ */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_root *tree_root = NULL; + struct btrfs_fs_info *fs_info = NULL; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + u64 subvol_rootid = 0; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &subvol_objectid, + &subvol_rootid, &fs_devices); + if (error) + return ERR_PTR(error); + + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info || !tree_root) { + error = -ENOMEM; + goto error_close_devices; + } + fs_info->tree_root = tree_root; + fs_info->fs_devices = fs_devices; + tree_root->fs_info = fs_info; + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags | MS_NOSEC; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 
1 : 0); + if (error) { + deactivate_locked_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + /* if they gave us a subvolume name bind mount into that */ + if (strcmp(subvol_name, ".")) { + struct dentry *new_root; + + root = get_default_root(s, subvol_rootid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } + + mutex_lock(&root->d_inode->i_mutex); + new_root = lookup_one_len(subvol_name, root, + strlen(subvol_name)); + mutex_unlock(&root->d_inode->i_mutex); + + if (IS_ERR(new_root)) { + dput(root); + deactivate_locked_super(s); + error = PTR_ERR(new_root); + goto error_free_subvol_name; + } + if (!new_root->d_inode) { + dput(root); + dput(new_root); + deactivate_locked_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + dput(root); + root = new_root; + } else { + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } + } + + kfree(subvol_name); + return root; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); +error_free_subvol_name: + kfree(subvol_name); + return ERR_PTR(error); +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + /* recover relocation */ + ret = btrfs_recover_relocation(root); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +/* Used to sort the devices by max_avail(descending sort) */ +static int btrfs_cmp_device_free_bytes(const void *dev_info1, + const void *dev_info2) +{ + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +/* + * The helper to calc the free space on the devices that can be used to store + * file data. 
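+ *
+ * The estimate is RAID-aware.  For example, with the RAID1 data
+ * profile (min_stripes == 2) every chunk consumes the same amount on
+ * two devices, so drives with 40GB and 10GB of stripe-aligned free
+ * space can only promise about 2 * 10GB of raw allocation, not 50GB.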
+ */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. 
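+		 *
+		 * For example, with the default 1MB reservation and 256KB of
+		 * that range already covered by dev extents, only the
+		 * remaining 768KB is taken off avail_space below.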
+ */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct list_head *head = &root->fs_info->space_info; + struct btrfs_space_info *found; + u64 total_used = 0; + u64 total_free_data = 0; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + + total_used += found->disk_used; + } + rcu_read_unlock(); + + buf->f_namelen = BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - (total_used >> bits); + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); + + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } + + kfree(vol); + return ret; +} + +static int btrfs_freeze(struct super_block *sb) +{ + struct btrfs_root *root = 
btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); + return 0; +} + +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + return 0; +} + +static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, + .evict_inode = btrfs_evict_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, + .show_options = btrfs_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice btrfs_misc = { + .minor = BTRFS_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_compress(); + if (err) + goto free_sysfs; + + err = btrfs_init_cachep(); + if (err) + goto free_compress; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_delayed_inode_init(); + if (err) + goto free_extent_map; + + err = btrfs_interface_init(); + if (err) + goto free_delayed_inode; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_delayed_inode: + btrfs_delayed_inode_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + btrfs_delayed_inode_exit(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_exit_compress(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 00000000..daac9ae6 --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 00000000..51dcec86 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1463 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "tree-log.h" +#include "inode-map.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(atomic_read(&transaction->use_count) == 0); + if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +static noinline void switch_commit_root(struct btrfs_root *root) +{ + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root, int nofail) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->trans_no_join) { + if (!nofail) { + spin_unlock(&root->fs_info->trans_lock); + return -EBUSY; + } + } + + cur_trans = root->fs_info->running_transaction; + if (cur_trans) { + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + spin_unlock(&root->fs_info->trans_lock); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->running_transaction) { + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + cur_trans = root->fs_info->running_transaction; + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + atomic_set(&cur_trans->num_writers, 1); + cur_trans->num_joined = 0; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + 
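/*
+	 * We won the allocation race above; initialise the fresh
+	 * transaction (in_commit and blocked both start at 0, see the
+	 * state sequence comment above btrfs_commit_transaction()).
+	 */
+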
cur_trans->in_commit = 0; + cur_trans->blocked = 0; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root = RB_ROOT; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->commit_lock); + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping); + root->fs_info->generation++; + cur_trans->transid = root->fs_info->generation; + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->trans_lock); + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +static int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); + WARN_ON(root->commit_root != root->node); + + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. 
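+		 * (That lockless path pairs the smp_rmb() in
+		 * btrfs_record_root_in_trans with the smp_wmb() calls here.)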
smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ + btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; + } + return 0; +} + + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!cur_trans->blocked) + break; + schedule(); + } + finish_wait(&root->fs_info->transaction_wait, &wait); + put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); + } +} + +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, + TRANS_JOIN_NOLOCK, +}; + +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; + + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) + return 1; + + return 0; +} + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + u64 num_items, int type) +{ + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + int retries = 0; + int ret; + + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + h = current->journal_info; + h->use_count++; + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) + return ERR_PTR(-ENOMEM); + + if (may_wait_transaction(root, type)) + wait_current_trans(root); + + do { + ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + if (ret == -EBUSY) + wait_current_trans(root); + } while (ret == -EBUSY); + + if (ret < 0) { + kmem_cache_free(btrfs_trans_handle_cachep, h); + return ERR_PTR(ret); + } + + cur_trans = root->fs_info->running_transaction; + + h->transid = cur_trans->transid; + h->transaction = cur_trans; + h->blocks_used = 0; + h->bytes_reserved = 0; + h->delayed_ref_updates = 0; + h->use_count = 1; + h->block_rsv = NULL; + h->orig_rsv = NULL; + + smp_mb(); + if (cur_trans->blocked && may_wait_transaction(root, type)) { + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_items > 0) { + ret = btrfs_trans_reserve_metadata(h, root, num_items); + if (ret == -EAGAIN && !retries) { + retries++; + btrfs_commit_transaction(h, root); + goto again; + } else if (ret == -EAGAIN) { + /* + * We have already retried and got EAGAIN, so really we + * don't have space, so set ret to -ENOSPC. 
+ */ + ret = -ENOSPC; + } + + if (ret < 0) { + btrfs_end_transaction(h, root); + return ERR_PTR(ret); + } + } + +got_it: + btrfs_record_root_in_trans(h, root); + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_items) +{ + return start_transaction(root, num_items, TRANS_START); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN); +} + +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_NOLOCK); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_USERSPACE); +} + +/* wait for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + schedule(); + } + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret; + + ret = 0; + if (transid) { + if (transid <= root->fs_info->last_trans_committed) + goto out; + + /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry(t, &root->fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + atomic_inc(&cur_trans->use_count); + break; + } + if (t->transid > transid) + break; + } + spin_unlock(&root->fs_info->trans_lock); + ret = -EINVAL; + if (!cur_trans) + goto out; /* bad transid */ + } else { + /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry_reverse(t, &root->fs_info->trans_list, + list) { + if (t->in_commit) { + if (t->commit_done) + break; + cur_trans = t; + atomic_inc(&cur_trans->use_count); + break; + } + } + spin_unlock(&root->fs_info->trans_lock); + if (!cur_trans) + goto out; /* nothing committing|committed */ + } + + wait_for_commit(root, cur_trans); + + put_transaction(cur_trans); + ret = 0; +out: + return ret; +} + +void btrfs_throttle(struct btrfs_root *root) +{ + if (!atomic_read(&root->fs_info->open_ioctl_trans)) + wait_current_trans(root); +} + +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + ret = btrfs_block_rsv_check(trans, root, + &root->fs_info->global_block_rsv, 0, 5); + return ret ? 
1 : 0; +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + + smp_mb(); + if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) + btrfs_run_delayed_refs(trans, root, updates); + + return should_end_transaction(trans, root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle, int lock) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *info = root->fs_info; + int count = 0; + + if (--trans->use_count) { + trans->block_rsv = trans->orig_rsv; + return 0; + } + + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; + } + + btrfs_trans_release_metadata(trans, root); + + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root)) { + trans->transaction->blocked = 1; + smp_wmb(); + } + + if (lock && cur_trans->blocked && !cur_trans->in_commit) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + + WARN_ON(cur_trans != info->running_transaction); + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); + + smp_mb(); + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + btrfs_run_delayed_iputs(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = __btrfs_end_transaction(trans, root, 0, 1); + if (ret) + return ret; + return 0; +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = __btrfs_end_transaction(trans, root, 1, 1); + if (ret) + return ret; + return 0; +} + +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = __btrfs_end_transaction(trans, root, 0, 0); + if (ret) + return ret; + return 0; +} + +int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. 
This is used to make sure all of + * those extents are sent to disk but does not wait on them + */ +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); + return ret || ret2; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages, + EXTENT_DIRTY); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. 
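+ * (The loop below keeps rewriting the pointer until neither the root
+ * bytenr nor the used byte count changes across an iteration, i.e.
+ * writing the pointer no longer perturbed the tree being pointed at.)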
+ * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + u64 old_root_used; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + old_root_used = btrfs_root_used(&root->root_item); + btrfs_write_dirty_block_groups(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) + break; + + btrfs_set_root_node(&root->root_item, root->node); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + + old_root_used = btrfs_root_used(&root->root_item); + ret = btrfs_write_dirty_block_groups(trans, root); + BUG_ON(ret); + } + + if (root != root->fs_info->extent_root) + switch_commit_root(root); + + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + int ret; + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + + down_write(&fs_info->extent_commit_sem); + switch_commit_root(fs_info->extent_root); + up_write(&fs_info->extent_commit_sem); + + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. 
This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root) +{ + spin_lock(&root->fs_info->trans_lock); + list_add(&root->root_list, &root->fs_info->dead_roots); + spin_unlock(&root->fs_info->trans_lock); + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *gang[8]; + struct btrfs_fs_info *fs_info = root->fs_info; + int i; + int ret; + int err = 0; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); + + btrfs_save_ino_cache(root, trans); + + if (root->commit_root != root->node) { + mutex_lock(&root->fs_commit_mutex); + switch_commit_root(root); + btrfs_unpin_free_ino(root); + mutex_unlock(&root->fs_commit_mutex); + + btrfs_set_root_node(&root->root_item, + root->node); + } + + err = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); + if (err) + break; + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_trans_handle *trans; + int ret; + unsigned long nr; + + if (xchg(&root->defrag_running, 1)) + return 0; + + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_defrag_leaves(trans, root, cacheonly); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. 
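+ * (after the delayed items have run and while the reloc mutex is
+ * held, see btrfs_commit_transaction below).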
This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct btrfs_root *parent_root; + struct inode *parent_inode; + struct dentry *parent; + struct dentry *dentry; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 to_reserve = 0; + u64 index = 0; + u64 objectid; + u64 root_flags; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + pending->error = -ENOMEM; + goto fail; + } + + ret = btrfs_find_free_objectid(tree_root, &objectid); + if (ret) { + pending->error = ret; + goto fail; + } + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + to_reserve); + if (ret) { + pending->error = ret; + goto fail; + } + } + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; + + trans->block_rsv = &pending->block_rsv; + + dentry = pending->dentry; + parent = dget_parent(dentry); + parent_inode = parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + record_root_in_trans(trans, parent_root); + + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(parent_inode, &index); + BUG_ON(ret); + ret = btrfs_insert_dir_item(trans, parent_root, + dentry->d_name.name, dentry->d_name.len, + parent_inode, &key, + BTRFS_FT_DIR, index); + BUG_ON(ret); + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + record_root_in_trans(trans, root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); + + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_set_lock_blocking(old); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + BUG_ON(ret); + + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, + parent_root->root_key.objectid, + btrfs_ino(parent_inode), index, + dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); + dput(parent); + + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(pending->snap)); + + btrfs_reloc_post_snapshot(trans, pending); + btrfs_orphan_post_snapshot(trans, pending); +fail: + kfree(new_root_item); + btrfs_block_rsv_release(root, 
&pending->block_rsv, (u64)-1); + return 0; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + list_for_each_entry(pending, head, list) { + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = &root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; + if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + super->cache_generation = root_item->generation; +} + +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->trans_lock); + if (info->running_transaction) + ret = info->running_transaction->in_commit; + spin_unlock(&info->trans_lock); + return ret; +} + +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->trans_lock); + if (info->running_transaction) + ret = info->running_transaction->blocked; + spin_unlock(&info->trans_lock); + return ret; +} + +/* + * wait for the current transaction commit to start and block subsequent + * transaction joins + */ +static void wait_current_trans_commit_start(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->in_commit) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->in_commit) { + finish_wait(&root->fs_info->transaction_blocked_wait, + &wait); + break; + } + schedule(); + finish_wait(&root->fs_info->transaction_blocked_wait, &wait); + } +} + +/* + * wait for the current transaction to start and then become unblocked. + * caller holds ref. + */ +static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->commit_done || (trans->in_commit && !trans->blocked)) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->commit_done || + (trans->in_commit && !trans->blocked)) { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + schedule(); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } +} + +/* + * commit transactions asynchronously. once btrfs_commit_transaction_async + * returns, any subsequent transaction will not be allowed to join. 
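+ *
+ * The caller's handle is ended here and the real commit is pushed to
+ * a workqueue via do_async_commit(); depending on wait_for_unblock we
+ * return once that commit has started, or once it has started and
+ * unblocked (the async snapshot ioctl is the main user).
+ *
+ * Sketch of the expected calling pattern (illustrative, not a real
+ * in-tree caller):
+ *
+ *	trans = btrfs_start_transaction(root, 0);
+ *	...do the work that must land in this transaction...
+ *	ret = btrfs_commit_transaction_async(trans, root, 1);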
+ */ +struct btrfs_async_commit { + struct btrfs_trans_handle *newtrans; + struct btrfs_root *root; + struct delayed_work work; +}; + +static void do_async_commit(struct work_struct *work) +{ + struct btrfs_async_commit *ac = + container_of(work, struct btrfs_async_commit, work.work); + + btrfs_commit_transaction(ac->newtrans, ac->root); + kfree(ac); +} + +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock) +{ + struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + + ac = kmalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + return -ENOMEM; + + INIT_DELAYED_WORK(&ac->work, do_async_commit); + ac->root = root; + ac->newtrans = btrfs_join_transaction(root); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } + + /* take transaction reference */ + cur_trans = trans->transaction; + atomic_inc(&cur_trans->use_count); + + btrfs_end_transaction(trans, root); + schedule_delayed_work(&ac->work, 0); + + /* wait for transaction to start and unblock */ + if (wait_for_unblock) + wait_current_trans_commit_start_and_unblock(root, cur_trans); + else + wait_current_trans_commit_start(root, cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + + put_transaction(cur_trans); + return 0; +} + +/* + * btrfs_transaction state sequence: + * in_commit = 0, blocked = 0 (initial) + * in_commit = 1, blocked = 1 + * blocked = 0 + * commit_done = 1 + */ +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; + unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + + btrfs_run_ordered_operations(root, 0); + + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + btrfs_trans_release_metadata(trans, root); + + cur_trans = trans->transaction; + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. 
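+ * Writers that poll btrfs_should_end_transaction() will see the flag and + * wrap up their handles instead of queueing more work; we also run the + * delayed refs gathered so far once more right below.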
+ */ + cur_trans->delayed_refs.flushing = 1; + + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + spin_lock(&cur_trans->commit_lock); + if (cur_trans->in_commit) { + spin_unlock(&cur_trans->commit_lock); + atomic_inc(&cur_trans->use_count); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + put_transaction(cur_trans); + + return 0; + } + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + spin_unlock(&cur_trans->commit_lock); + wake_up(&root->fs_info->transaction_blocked_wait); + + spin_lock(&root->fs_info->trans_lock); + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + atomic_inc(&prev_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + wait_for_commit(root, prev_trans); + + put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + } else { + spin_unlock(&root->fs_info->trans_lock); + } + + if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + should_grow = 1; + + do { + int snap_pending = 0; + + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + + if (flush_on_commit || snap_pending) { + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); + BUG_ON(ret); + } + + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + /* + * renames don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and know for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (atomic_read(&cur_trans->num_writers) > 1) + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + else if (should_grow) + schedule_timeout(1); + + finish_wait(&cur_trans->writer_wait, &wait); + } while (atomic_read(&cur_trans->num_writers) > 1 || + (should_grow && cur_trans->num_joined != joined)); + + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + + WARN_ON(cur_trans != trans->transaction); + + btrfs_scrub_pause(root); + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree.
So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, root); + BUG_ON(ret); + + /* commit_fs_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + + btrfs_prepare_extent_commit(trans, root); + + cur_trans = root->fs_info->running_transaction; + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + switch_commit_root(root->fs_info->tree_root); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + switch_commit_root(root->fs_info->chunk_root); + + update_super_roots(root); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + trans->transaction->blocked = 0; + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + + wake_up(&root->fs_info->transaction_wait); + + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root); + + cur_trans->commit_done = 1; + + root->fs_info->last_trans_committed = cur_trans->transid; + + wake_up(&cur_trans->commit_wait); + + spin_lock(&root->fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + trace_btrfs_transaction_commit(root); + + btrfs_scrub_continue(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); + list_splice_init(&fs_info->dead_roots, &list); + spin_unlock(&fs_info->trans_lock); + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); + list_del(&root->root_list); + + btrfs_kill_all_delayed_nodes(root); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + btrfs_drop_snapshot(root, NULL, 0); + else + btrfs_drop_snapshot(root, NULL, 1); + } + return 0; +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 00000000..02564e62 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" +#include "delayed-ref.h" + +struct btrfs_transaction { + u64 transid; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ + atomic_t num_writers; + atomic_t use_count; + + unsigned long num_joined; + + spinlock_t commit_lock; + int in_commit; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; + struct btrfs_delayed_ref_root delayed_refs; +}; + +struct btrfs_trans_handle { + u64 transid; + u64 bytes_reserved; + unsigned long use_count; + unsigned long blocks_reserved; + unsigned long blocks_used; + unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + struct btrfs_root *snap; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + /* extra metadata reservation for relocation */ + int error; + bool readonly; + struct list_head list; +}; + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_items); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 00000000..3b580ee8 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] =
btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000..7fa128d3 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,3339 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant.
With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. 
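+ * (these three passes correspond to the LOG_WALK_PIN_ONLY, + * LOG_WALK_REPLAY_INODES and LOG_WALK_REPLAY_ALL stages defined above)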
+ */ + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + int err = 0; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + if (ret) + err = ret; + } + if (err == 0 && !root->log_root) { + ret = btrfs_add_log_tree(trans, root); + if (ret) + err = ret; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return err; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->log_writers); + } + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +int btrfs_end_log_trans(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. 
Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) + btrfs_pin_extent(log->fs_info->extent_root, + eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(path); + return 0; + } + + } +insert: + btrfs_release_path(path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + } + } else if (ret) { + return ret; + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an 
existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct btrfs_key key; + struct inode *inode; + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. 
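+ * + * Any checksums stored in the log tree for the extent are copied into the + * fs checksum tree as part of the replay as well.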
+ */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(path); + goto out; + } + } + btrfs_release_path(path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, inode, start, extent_end, + &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? 
If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + 0, root->root_key.objectid, + key->objectid, offset); + BUG_ON(ret); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); + BUG_ON(ret); + } + btrfs_release_path(path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums, 0); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(path); + + inode = read_one_inode(root, location.objectid); + if (!inode) { + kfree(name); + return -EIO; + } + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + 
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *dir; + struct inode *inode; + unsigned long ref_ptr; + unsigned long ref_end; + char *name; + int namelen; + int ret; + int search_done = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + if (!inode) { + iput(dir); + return -EIO; + } + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. 
Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ + + if (search_done) + goto insert; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + + /* + * NOTE: we have searched root tree and checked the + * coresponding ref, it does not need to check again. + */ + search_done = 1; + } + btrfs_release_path(path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + +insert: + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(path); + iput(dir); + iput(inode); + return 0; +} + +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + int ret; + ret = btrfs_find_orphan_item(root, offset); + if (ret > 0) + ret = btrfs_insert_orphan_item(trans, root, offset); + return ret; +} + + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. 
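+ * + * If the count drops to zero the inode is also given an orphan item (and, + * for directories, the remaining entries are removed) so cleanup can finish.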
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + u64 ino = btrfs_ino(inode); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(path); + } + btrfs_release_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + ino, 1); + BUG_ON(ret); + } + ret = insert_orphan_item(trans, root, ino); + BUG_ON(ret); + } + btrfs_free_path(path); + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + btrfs_release_path(path); + inode = read_one_inode(root, key.offset); + if (!inode) + return -EIO; + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; + } + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. 
The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + if (!inode) + return -EIO; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. 
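+ * + * A conflicting entry is only dropped when the inode the logged name points + * to actually exists; otherwise the old entry is left alone.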
+ */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + if (!dir) + return -EIO; + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (IS_ERR_OR_NULL(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + BUG_ON(ret && ret != -ENOENT); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. 
During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +/* + * this looks for a given directory item in the log. 
If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (IS_ERR_OR_NULL(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + kfree(name); + return -EIO; + } + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(path); + btrfs_release_path(log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. 
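+ * + * Both the DIR_ITEM and DIR_INDEX key spaces are walked. When del_all is + * set every entry is removed; that mode is used while fixing up directories + * whose link count dropped to zero.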
+ */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, + &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(path); + goto again; + } +out: + btrfs_release_path(path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). 
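+ *
+ * Concretely, in replay_one_buffer() below: the LOG_WALK_REPLAY_INODES
+ * pass only acts on BTRFS_INODE_ITEM_KEY items (replaying directory
+ * deletes for directories, inserting orphan items for regular files and
+ * adding fixup-dir entries as it goes), while the LOG_WALK_REPLAY_ALL
+ * pass copies xattrs, inode refs, file extent items and directory
+ * items/indexes.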
+ */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid, 0); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. + */ + if (S_ISREG(mode)) { + ret = insert_orphan_item(wc->trans, root, + key.objectid); + BUG_ON(ret); + } + + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; + + if (*level == 1) { + wc->process_func(root, next, wc, ptr_gen); + + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + 
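+ /*
+ * *level is greater than 1 here: read the child node and step down into
+ * it.  The loop keeps descending until it reaches a level 1 node, whose
+ * leaf children are handed to wc->process_func in the branch above.
+ */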
btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? 
if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) +{ + DEFINE_WAIT(wait); + int index = transid % 2; + + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ + do { + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])) + schedule(); + + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])); + return 0; +} + +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) + schedule(); + mutex_lock(&root->log_mutex); + finish_wait(&root->log_writer_wait, &wait); + } + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. 
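+ *
+ * A caller is therefore expected to handle the return value roughly like
+ * this (sketch only, error handling trimmed):
+ *
+ *	ret = btrfs_sync_log(trans, root);
+ *	if (ret)
+ *		btrfs_commit_transaction(trans, root);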
+ */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int index1; + int index2; + int mark; + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + unsigned long log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(trans, root, root->log_transid); + mutex_unlock(&root->log_mutex); + return 0; + } + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(trans, root, root->log_transid - 1); + + while (1) { + unsigned long batch = root->log_batch; + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; + } + + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + + log_transid = root->log_transid; + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; + root->log_transid++; + log->log_transid = root->log_transid; + root->log_start_pid = 0; + smp_mb(); + /* + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. 
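+ *
+ * Note the even/odd scheme above: an even log_transid marks its blocks
+ * EXTENT_DIRTY and an odd one uses EXTENT_NEW, so each of the two
+ * in-flight log commits only waits on the pages it marked itself.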
+ */ + mutex_unlock(&root->log_mutex); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_writers); + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + if (ret) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); + ret = 0; + goto out; + } + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log_root_tree->node)); + + log_root_tree->log_batch = 0; + log_root_tree->log_transid++; + smp_mb(); + + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. 
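+ *
+ * Only the super blocks are written below; the log tree blocks themselves
+ * were already written and waited on via the marked extent ranges above.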
+ */ + btrfs_scrub_pause_super(root); + write_ctree_super(trans, root->fs_info->tree_root, 1); + btrfs_scrub_continue_super(root); + ret = 0; + + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + +out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); +out: + atomic_set(&root->log_commit[index1], 0); + smp_mb(); + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); + return ret; +} + +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + u64 start; + u64 end; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + if (ret) + break; + + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + + free_extent_buffer(log->node); + kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } + return 0; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. 
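+ *
+ * The unlink and rename paths are expected to call the two helpers below
+ * once a name has been removed from the subvolume, along the lines of
+ * this sketch:
+ *
+ *	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ *	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ *				   btrfs_ino(dir));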
+ */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int err = 0; + int bytes_del = 0; + u64 dir_ino = btrfs_ino(dir); + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out_unlock; + } + + di = btrfs_lookup_dir_item(trans, log, path, dir_ino, + name, name_len, -1); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(path); + } +fail: + btrfs_free_path(path); +out_unlock: + mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } + btrfs_end_log_trans(root); + + return err; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } + btrfs_end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. 
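+ *
+ * For example, a log key (dirid, BTRFS_DIR_LOG_INDEX_KEY, first_offset)
+ * whose btrfs_dir_log_item has dir_log_end == last_offset means the log
+ * is authoritative for every dir index offset in
+ * [first_offset, last_offset]; replay may delete subvolume entries in
+ * that range that the log no longer contains, but must leave entries
+ * outside it untouched.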
+ */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + if (ret) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int err = 0; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + u64 ino = btrfs_ino(inode); + + log = root->log_root; + max_key.objectid = ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + min_key.objectid = ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + ret = btrfs_previous_item(root, path, ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. 
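+ *
+ * For example, with min_offset == 3 and the directory's highest key of
+ * this type at offset 9, first_offset becomes max(3, 9) + 1 == 10 and
+ * the range item inserted at the end covers 10 through (u64)-1, i.e. the
+ * log is authoritative for everything past the last pre-existing entry.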
+ */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + if (ret) { + err = ret; + goto done; + } + } + } + btrfs_release_path(path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != ino || min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + if (ret) { + err = ret; + goto done; + } + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + if (ret) + err = ret; + else + last_offset = tmp.offset; + goto done; + } + } +done: + btrfs_release_path(path); + btrfs_release_path(dst_path); + + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + ino, first_offset, last_offset); + if (ret) + err = ret; + } + return err; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + if (ret) + return ret; + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. 
max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + BUG_ON(ret == 0); + if (ret < 0) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + if (ret) + break; + btrfs_release_path(path); + } + btrfs_release_path(path); + return ret; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + if (ret) { + kfree(ins_data); + return ret; + } + + for (i = 0; i < nr; i++, dst_path->slots[0]++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(src, extent) < trans->transid) + continue; + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent); + if (btrfs_file_extent_compression(src, + 
extent)) { + cs = 0; + cl = dl; + } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums, 0); + BUG_ON(ret); + } + } + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + ret = 0; + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); + } + return ret; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + int err = 0; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + u64 ino = btrfs_ino(inode); + + log = root->log_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } + + min_key.objectid = ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } + + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. 
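+ *
+ * For directories, everything previously logged for this inode up to the
+ * chosen max key type is dropped and re-copied; for regular files the
+ * inode's items are truncated out of the log first and then re-copied
+ * from the subvolume by the loop below.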
+ */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + if (ret) { + err = ret; + goto out_unlock; + } + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + if (ret) { + err = ret; + goto out_unlock; + } + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + if (ret) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + } + btrfs_release_path(path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + if (ret) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path); + if (ret) { + err = ret; + goto out_unlock; + } + } + BTRFS_I(inode)->logged_trans = trans->transid; +out_unlock: + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + return err; +} + +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) +{ + int ret = 0; + struct btrfs_root *root; + struct dentry *old_parent = NULL; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. 
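+ *
+ * For example, a regular file created and fsynced in an earlier,
+ * already-committed transaction and then renamed or unlinked in the
+ * current one has last_unlink_trans newer than last_committed, so the
+ * early exit below is skipped and the parent directories are walked;
+ * any directory in that chain whose last_unlink_trans is newer than the
+ * last commit forces a full transaction commit.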
+ */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + inode = parent->d_inode; + + } + dput(old_parent); +out: + return ret; +} + +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) +{ + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; + struct super_block *sb; + struct dentry *old_parent = NULL; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + if (root != BTRFS_I(inode)->root || + btrfs_root_refs(&root->root_item) == 0) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + + ret = start_log_trans(trans, root); + if (ret) + goto end_trans; + + ret = btrfs_log_inode(trans, root, inode, inode_only); + if (ret) + goto end_trans; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. 
+ */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } + + inode_only = LOG_INODE_EXISTS; + while (1) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + inode = parent->d_inode; + if (root != BTRFS_I(inode)->root) + break; + + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + if (ret) + goto end_trans; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + ret = 0; +end_trans: + dput(old_parent); + if (ret < 0) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 1; + } + btrfs_end_log_trans(root); +end_no_trans: + return ret; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + dput(parent); + + return ret; +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + fs_info->log_root_recovering = 1; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); + + wc.trans = trans; + wc.pin = 1; + + ret = walk_log_tree(trans, log_root_tree, &wc); + BUG_ON(ret); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(IS_ERR(log)); + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); + + wc.replay_dest->log_root = log; + btrfs_record_root_in_trans(trans, wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 
0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000..2270ac58 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +int btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 00000000..9bf3946d --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 00000000..43baaf0c --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3718 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, head, dev_list) { + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, list) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run; + unsigned long batch_run = 0; + unsigned long limit; + unsigned long last_waited = 0; + int force_reg = 0; + struct blk_plug plug; + + /* + * this function runs all the bios we've collected for + * a particular device. We don't want to wander off to + * another device without first sending all of these down. 
+ * So, setup a plug here and finish it off before we return + */ + blk_start_plug(&plug); + + bdi = blk_get_backing_dev_info(device->bdev); + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + +loop_lock: + num_run = 0; + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + if (!force_reg && device->pending_sync_bios.head) { + pending_bios = &device->pending_sync_bios; + force_reg = 1; + } else { + pending_bios = &device->pending_bios; + force_reg = 0; + } + + pending = pending_bios->head; + tail = pending_bios->tail; + WARN_ON(pending && !tail); + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { + again = 0; + device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; + } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + + spin_unlock(&device->io_lock); + + while (pending) { + + rmb(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + + submit_bio(cur->bi_rw, cur); + num_run++; + batch_run++; + if (need_resched()) + cond_resched(); + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. 
So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + if (need_resched()) + cond_resched(); + continue; + } + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + device->running_pending = 1; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + + cond_resched(); + if (again) + goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios.head || device->pending_sync_bios.head) + goto loop_lock; + spin_unlock(&device->io_lock); + +done: + blk_finish_plug(&plug); + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + char *name; + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + mutex_init(&fs_devices->device_list_mutex); + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + + mutex_lock(&fs_devices->device_list_mutex); + list_add_rcu(&device->dev_list, &fs_devices->devices); + mutex_unlock(&fs_devices->device_list_mutex); + + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } else if (!device->name || strcmp(device->name, path)) { + name = kstrdup(path, GFP_NOFS); + if (!name) + return -ENOMEM; + kfree(device->name); + device->name = name; + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + mutex_init(&fs_devices->device_list_mutex); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + /* We have held the volume lock, it is safe to 
get the devices. */ + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) { + kfree(device); + goto error; + } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device, *next; + + mutex_lock(&uuid_mutex); +again: + /* This is the initialized path, it is safe to release the devices. */ + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + blkdev_put(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static void __free_device(struct work_struct *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, rcu_work); + + if (device->bdev) + blkdev_put(device->bdev, device->mode); + + kfree(device->name); + kfree(device); +} + +static void free_device(struct rcu_head *head) +{ + struct btrfs_device *device; + + device = container_of(head, struct btrfs_device, rcu); + + INIT_WORK(&device->rcu_work, __free_device); + schedule_work(&device->rcu_work); +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + struct btrfs_device *new_device; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->can_discard) + fs_devices->num_can_discard--; + + new_device = kmalloc(sizeof(*new_device), GFP_NOFS); + BUG_ON(!new_device); + memcpy(new_device, device, sizeof(*new_device)); + new_device->name = kstrdup(device->name, GFP_NOFS); + BUG_ON(device->name && !new_device->name); + new_device->bdev = NULL; + new_device->writeable = 0; + new_device->in_fs_metadata = 0; + new_device->can_discard = 0; + list_replace_rcu(&device->dev_list, &new_device->dev_list); + + call_rcu(&device->rcu, free_device); + } + mutex_unlock(&fs_devices->device_list_mutex); + + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while 
(seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + flags |= FMODE_EXCL; + + list_for_each_entry(device, head, dev_list) { + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = blkdev_get_by_path(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EINVAL; + goto error_close; + } + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + brelse(bh); + continue; + +error_brelse: + brelse(bh); +error_close: + blkdev_put(bdev, flags); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EINVAL; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = 
btrfs_stack_device_id(&disk_super->dev_item); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else + printk(KERN_INFO "device fsid %pU ", disk_super->fsid); + printk(KERN_CONT "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + blkdev_put(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. 
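 *
 * Illustrative sketch (editorial addition, not part of the original patch):
 * the loop below is a plain "max hole" scan; the same idea over a
 * hypothetical sorted array of extents instead of a btree walk looks like:
 *
 *	u64 hole = search_start, best_start = search_start, best_len = 0;
 *	for (i = 0; i < nr_extents; i++) {	// ext[] sorted by start
 *		if (ext[i].start > hole && ext[i].start - hole > best_len) {
 *			best_start = hole;
 *			best_len = ext[i].start - hole;
 *			if (best_len >= num_bytes)
 *				break;		// big enough, stop early
 *		}
 *		if (ext[i].start + ext[i].len > hole)
 *			hole = ext[i].start + ext[i].len;
 *	}
 *	// then the tail hole [hole, search_end) is compared the same way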
+ */ +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; + u64 search_end = device->total_bytes; + int ret; + int slot; + struct extent_buffer *l; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + if (key.offset > search_start) { + hole_size = key.offset - search_start; + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; + } + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; +next: + path->slots[0]++; + cond_resched(); + } + + hole_size = search_end- search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* See above. 
*/ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: + btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + if (ret) + goto out; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + 
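	/*
	 * Editorial note (not part of the original patch): this helper and
	 * find_next_devid() below share the same lookup idiom -- search for
	 * (objectid, type, (u64)-1), which always lands just past the last
	 * matching item, then step back with btrfs_previous_item() to read
	 * the highest existing key and derive the next free offset/devid:
	 *
	 *	key.offset = (u64)-1;
	 *	btrfs_search_slot(NULL, root, &key, path, 0, 0);  // ret > 0
	 *	if (btrfs_previous_item(root, path, objectid, type))
	 *		next = lowest;	// empty tree: 0 for chunks, 1 for devids
	 *	else
	 *		next = found_key.offset + step;	// + chunk length, or + 1
	 */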
btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root 
*root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + struct btrfs_fs_devices *cur_devices; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + bool clear_super = false; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->num_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->num_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EINVAL; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + lock_chunks(root); + list_del_init(&device->dev_alloc_list); + unlock_chunks(root); + root->fs_info->fs_devices->rw_devices--; + clear_super = true; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_undo; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_undo; + + device->in_fs_metadata = 0; + btrfs_scrub_cancel_dev(root, device); + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. 
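 *
 * Editorial sketch (not part of the original patch) of the removal pattern
 * used just below: the writer takes device_list_mutex, unlinks the device
 * with list_del_rcu() and hands it to call_rcu(), so readers walking
 * fs_devices->devices under rcu_read_lock() keep seeing a valid struct
 * until the grace period ends.  free_device() (defined earlier in this
 * file) then defers the final blkdev_put()/kfree() to a workqueue, since
 * both may sleep and RCU callbacks must not:
 *
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	list_del_rcu(&device->dev_list);
 *	call_rcu(&device->rcu, free_device);
 *	mutex_unlock(&fs_devices->device_list_mutex);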
+ */ + + cur_devices = device->fs_devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_del_rcu(&device->dev_list); + + device->fs_devices->num_devices--; + + if (device->missing) + root->fs_info->fs_devices->missing_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) + device->fs_devices->open_devices--; + + call_rcu(&device->rcu, free_device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (cur_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == cur_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = cur_devices->seed; + cur_devices->seed = NULL; + lock_chunks(root); + __btrfs_close_devices(cur_devices); + unlock_chunks(root); + free_fs_devices(cur_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (clear_super) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +error_undo: + if (device->writeable) { + lock_chunks(root); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + unlock_chunks(root); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; +} + +/* + * does all the dirty work required for changing file system's UUID. 
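 *
 * Editorial summary (not part of the original patch): "sprouting" turns the
 * currently mounted seed filesystem into the seed of a new writable one.
 * The helper below moves every existing device onto a freshly allocated
 * btrfs_fs_devices, chains that copy via fs_devices->seed, generates a new
 * random fsid for the mounted filesystem, and clears
 * BTRFS_SUPER_FLAG_SEEDING in the in-memory super block so it is written
 * back out as a normal, writable filesystem.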
+ */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, + synchronize_rcu); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. 
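 *
 * (Editorial note, not part of the original patch: the helper below walks
 * every DEV_ITEM in the chunk tree and, for devices that still belong to a
 * seeding fs_devices, writes the in-memory device->generation into the
 * item, so the expected generation of each seed device ends up on disk.)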
+ */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct request_queue *q; + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + /* + * we have the volume lock, so we don't need the extra + * device list mutex while reading the list here. 
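 *
 * Editorial outline of the steps that follow (not part of the original
 * patch): reject a block device that is already on the list, allocate and
 * name the new btrfs_device, pick the next free devid, start a
 * transaction, and publish the device on the RCU-protected device list
 * while bumping the superblock byte/device totals.  On a seeding
 * filesystem the sprout is prepared first (btrfs_prepare_sprout) and
 * finished afterwards (btrfs_finish_sprout) before the transaction
 * commits.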
+ */ + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device->name); + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + kfree(device->name); + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + + lock_chunks(root); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = FMODE_EXCL; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + + /* + * we don't want write_supers to jump in here with our device + * half setup + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + blkdev_put(bdev, FMODE_EXCL); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; 
+ struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + device->disk_total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + + btrfs_free_path(path); + return ret; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + 
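			/*
			 * Editorial worked example (sizes made up, not part
			 * of the original patch): with three packed
			 * (key, chunk) entries of 48, 80 and 64 bytes and the
			 * 80-byte middle entry matching, cur == 48 and
			 * len == 80, so the memmove above copies the trailing
			 * 64 bytes down to offset 48 and array_size drops
			 * from 192 to 112.  ptr/cur are deliberately not
			 * advanced in this branch, so the next iteration
			 * parses the entry that was just shifted into place.
			 */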
btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + bool retried = false; + int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + if (ret == -ENOSPC) + failed++; + else if (ret) + BUG(); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; + if (failed && !retried) { + failed = 0; + retried 
= true; + goto again; + } else if (failed && retried) { + WARN_ON(1); + ret = -ENOSPC; + } +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each_entry(device, devices, dev_list) { + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + if (ret == -ENOSPC) + break; + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 0); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + /* chunk zero is special */ + if (found_key.offset == 0) + break; + + btrfs_release_path(path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + if (ret && ret != -ENOSPC) + goto error; + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. 
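 *
 * (Editorial note, not part of the original patch: the function first cuts
 * device->total_bytes down to new_size so the allocator stops handing out
 * space there, then walks DEV_EXTENT items from the highest offset down,
 * relocating the owning chunk of every extent that ends past new_size; it
 * retries the whole walk once and restores the old size, returning
 * -ENOSPC, if relocation still fails.)
 *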
+ * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + int failed = 0; + bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); + +again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + btrfs_release_path(path); + break; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) { + btrfs_release_path(path); + break; + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) { + btrfs_release_path(path); + break; + } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret && ret != -ENOSPC) + goto done; + if (ret == -ENOSPC) + failed++; + key.offset -= 1; + } + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + lock_chunks(root); + + device->total_bytes = old_size; + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + unlock_chunks(root); + goto done; + } + + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. 
*/ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +/* + * sort the devices in descending order by max_avail, total_avail + */ +static int btrfs_cmp_device_info(const void *a, const void *b) +{ + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) + return 1; + if (di_a->total_avail > di_b->total_avail) + return -1; + if (di_a->total_avail < di_b->total_avail) + return 1; + return 0; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes_out, u64 *stripe_size_out, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info = NULL; + u64 total_avail; + int num_stripes; /* total number of stripes to allocate */ + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ + int ret; + u64 max_stripe_size; + u64 max_chunk_size; + u64 stripe_size; + u64 num_bytes; + int ndevs; + int i; + int j; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + sub_stripes = 1; + dev_stripes = 1; + devs_increment = 1; + ncopies = 1; + devs_max = 0; /* 0 == as many as possible */ + devs_min = 1; + + /* + * define the properties of each RAID type. 
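 *
 * For reference, the values the branches below end up with (editorial
 * summary, not part of the original patch; devs_max == 0 means "as many
 * devices as possible"):
 *
 *	profile  dev_stripes  devs_min  devs_max  devs_increment  ncopies  sub_stripes
 *	single        1           1         1           1            1          1
 *	DUP           2           1         1           1            2          1
 *	RAID0         1           2         0           1            1          1
 *	RAID1         1           2         2           2            2          1
 *	RAID10        1           4         0           2            2          2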
+ * FIXME: move this to a global table and use it in all RAID + * calculation code + */ + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + dev_stripes = 2; + ncopies = 2; + devs_max = 1; + } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + devs_min = 2; + } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + devs_increment = 2; + ncopies = 2; + devs_max = 2; + devs_min = 2; + } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + sub_stripes = 2; + devs_increment = 2; + ncopies = 2; + devs_min = 4; + } else { + devs_max = 1; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_stripe_size = 1024 * 1024 * 1024; + max_chunk_size = 10 * max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_stripe_size = 256 * 1024 * 1024; + max_chunk_size = max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + max_stripe_size = 8 * 1024 * 1024; + max_chunk_size = 2 * max_stripe_size; + } else { + printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", + type); + BUG_ON(1); + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + cur = fs_devices->alloc_list.next; + + /* + * in the first pass through the devices list, we gather information + * about the available holes on each device. + */ + ndevs = 0; + while (cur != &fs_devices->alloc_list) { + struct btrfs_device *device; + u64 max_avail; + u64 dev_offset; + + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + + cur = cur->next; + + if (!device->writeable) { + printk(KERN_ERR + "btrfs: read-only device in alloc_list\n"); + WARN_ON(1); + continue; + } + + if (!device->in_fs_metadata) + continue; + + if (device->total_bytes > device->bytes_used) + total_avail = device->total_bytes - device->bytes_used; + else + total_avail = 0; + /* avail is off by max(alloc_start, 1MB), but that is the same + * for all devices, so it doesn't hurt the sorting later on + */ + + ret = find_free_dev_extent(trans, device, + max_stripe_size * dev_stripes, + &dev_offset, &max_avail); + if (ret && ret != -ENOSPC) + goto error; + + if (ret == 0) + max_avail = max_stripe_size * dev_stripes; + + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + continue; + + devices_info[ndevs].dev_offset = dev_offset; + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + + /* round down to number of usable stripes */ + ndevs -= ndevs % devs_increment; + + if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + ret = -ENOSPC; + goto error; + } + + if (devs_max && ndevs > devs_max) + ndevs = devs_max; + /* + * the primary goal is to maximize the number of stripes, so use as many + * devices as possible, even if the stripes are not maximum sized. 
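 *
 * Editorial worked example (numbers made up, not part of the original
 * patch): a RAID1 data chunk with ndevs == 2, dev_stripes == 1 and
 * ncopies == 2, where both devices report max_avail == 1GiB (the data
 * stripe cap), gives stripe_size = 1GiB and num_stripes = 2.  The
 * max_chunk_size clamp does not fire (2GiB used vs. 20GiB allowed),
 * rounding to BTRFS_STRIPE_LEN leaves 1GiB unchanged, and the result is
 * num_bytes = 1GiB * (2 / 2) = 1GiB of logical chunk space backed by a
 * 1GiB device extent on each of the two devices.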
+ */ + stripe_size = devices_info[ndevs-1].max_avail; + num_stripes = ndevs * dev_stripes; + + if (stripe_size * num_stripes > max_chunk_size * ncopies) { + stripe_size = max_chunk_size * ncopies; + do_div(stripe_size, num_stripes); + } + + do_div(stripe_size, dev_stripes); + do_div(stripe_size, BTRFS_STRIPE_LEN); + stripe_size *= BTRFS_STRIPE_LEN; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; + + for (i = 0; i < ndevs; ++i) { + for (j = 0; j < dev_stripes; ++j) { + int s = i * dev_stripes + j; + map->stripes[s].dev = devices_info[i].dev; + map->stripes[s].physical = devices_info[i].dev_offset + + j * stripe_size; + } + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->type = type; + map->sub_stripes = sub_stripes; + + *map_ret = map; + num_bytes = stripe_size * (num_stripes / ncopies); + + *stripe_size_out = stripe_size; + *num_bytes_out = num_bytes; + + trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto error; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, num_bytes); + BUG_ON(ret); + + for (i = 0; i < map->num_stripes; ++i) { + struct btrfs_device *device; + u64 dev_offset; + + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, stripe_size); + BUG_ON(ret); + } + + kfree(devices_info); + return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + 
btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. 
+ */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + if (btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return 0; + } + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. 
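btrfs_num_copies() above boils down to a small decision on the block group type flags: mirrored layouts report one copy per stripe (or per sub-stripe group for RAID10), everything else reports a single copy. The same logic restated on its own (the flag values below are placeholders, not the on-disk bits):

#include <stdint.h>

#define SKETCH_BG_DUP    (1ULL << 0)    /* placeholder flag bits */
#define SKETCH_BG_RAID1  (1ULL << 1)
#define SKETCH_BG_RAID10 (1ULL << 2)

static int sketch_num_copies(uint64_t type, int num_stripes, int sub_stripes)
{
        if (type & (SKETCH_BG_DUP | SKETCH_BG_RAID1))
                return num_stripes;     /* every stripe holds a full copy */
        if (type & SKETCH_BG_RAID10)
                return sub_stripes;     /* copies within each stripe group */
        return 1;                       /* single and RAID0: one copy */
}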
Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_end_offset; + u64 stripe_nr; + u64 stripe_nr_orig; + u64 stripe_nr_end; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + read_unlock(&em_tree->lock); + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (rw & REQ_DISCARD) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes_required = map->num_stripes; + } + } + if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (rw & REQ_DISCARD) + *length = min_t(u64, em->len - offset, *length); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret) + goto out; + + num_stripes = 1; + stripe_index = 0; + stripe_nr_orig = stripe_nr; + stripe_nr_end = (offset + *length + map->stripe_len - 1) & + (~(map->stripe_len - 1)); + do_div(stripe_nr_end, map->stripe_len); + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->num_stripes, + stripe_nr_end - stripe_nr_orig); + stripe_index = do_div(stripe_nr, map->num_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = 
mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (rw & REQ_WRITE) + num_stripes = map->sub_stripes; + else if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->sub_stripes * + (stripe_nr_end - stripe_nr_orig), + map->num_stripes); + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + if (rw & REQ_DISCARD) { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + u64 stripes; + u32 last_stripe = 0; + int j; + + div_u64_rem(stripe_nr_end - 1, + map->num_stripes, + &last_stripe); + + for (j = 0; j < map->num_stripes; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + map->num_stripes, &test); + if (test == stripe_index) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, map->num_stripes); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i == 0) { + multi->stripes[i].length -= + stripe_offset; + stripe_offset = 0; + } + if (stripe_index == last_stripe) + multi->stripes[i].length -= + stripe_end_offset; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u64 stripes; + int j; + int factor = map->num_stripes / + map->sub_stripes; + u32 last_stripe = 0; + + div_u64_rem(stripe_nr_end - 1, + factor, &last_stripe); + last_stripe *= map->sub_stripes; + + for (j = 0; j < factor; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + factor, &test); + + if (test == + stripe_index / map->sub_stripes) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, factor); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i < map->sub_stripes) { + multi->stripes[i].length -= + stripe_offset; + if (i == map->sub_stripes - 1) + stripe_offset = 0; + } + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + map->sub_stripes - 1)) { + multi->stripes[i].length -= + stripe_end_offset; + } + } else + multi->stripes[i].length = *length; + + stripe_index++; + if (stripe_index == map->num_stripes) { + /* This could only happen for RAID0/10 */ + stripe_index = 0; + stripe_nr++; + } + } + } else { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + multi->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; + } + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int 
btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. 
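The heart of __btrfs_map_block() above is integer stripe arithmetic: the chunk-relative offset is divided into a stripe number, the remainder across the device count picks the stripe index (the device), and what is left over is the offset inside that stripe; do_div() performs the division in place and hands back the remainder. A small userspace example of the RAID0 case with plain division:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_STRIPE_LEN (64 * 1024)

int main(void)
{
        uint64_t offset = 300 * 1024;   /* logical offset inside the chunk */
        int num_stripes = 3;            /* devices in the RAID0 chunk */

        uint64_t stripe_nr = offset / SKETCH_STRIPE_LEN;            /* 4 */
        uint64_t stripe_offset = offset - stripe_nr * SKETCH_STRIPE_LEN;
        int stripe_index = stripe_nr % num_stripes;         /* device 1 */
        uint64_t dev_stripe = stripe_nr / num_stripes;  /* its 2nd stripe */

        printf("device %d, stripe %llu, offset %llu within the stripe\n",
               stripe_index, (unsigned long long)dev_stripe,
               (unsigned long long)stripe_offset);
        return 0;
}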
+ */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + struct btrfs_pending_bios *pending_bios; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & REQ_WRITE)) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + if (bio->bi_rw & REQ_SYNC) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; + + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices 
*fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + device->missing = 1; + fs_devices->num_devices++; + fs_devices->missing_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + read_unlock(&map_tree->map_tree.lock); + + /* already mapped? */ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + 
ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } else if (!device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. 
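btrfs_find_device() and open_seed_devices() above both walk the same singly linked chain of fs_devices structures, one entry per seed filesystem, comparing the 16-byte fsid at each hop. The traversal pattern reduced to a standalone sketch (struct fields simplified):

#include <stdint.h>
#include <string.h>

struct sketch_fs_devices {
        uint8_t fsid[16];
        struct sketch_fs_devices *seed; /* next (older) seed filesystem */
};

/* return the entry whose fsid matches, or NULL if the chain has none */
static struct sketch_fs_devices *
sketch_find_fsid(struct sketch_fs_devices *cur, const uint8_t *fsid)
{
        while (cur) {
                if (!memcmp(cur->fsid, fsid, 16))
                        return cur;
                cur = cur->seed;
        }
        return NULL;
}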
+ * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + root->fs_info->fs_devices->missing_devices++; + device->missing = 1; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. 
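btrfs_read_sys_array() above parses a packed byte array out of the superblock: each record is a fixed-size disk key immediately followed by a variable-size chunk item, so the cursor advances by the key size and then by a per-record item size. A sketch of that kind of cursor walk over a generic packed buffer (the record layout here is simplified, not the btrfs format):

#include <stddef.h>
#include <stdint.h>

/* records of the form: [fixed header][payload of hdr->len bytes] */
struct sketch_rec_hdr {
        uint32_t type;
        uint32_t len;           /* payload length following the header */
};

static void sketch_walk_packed(const uint8_t *buf, size_t size,
                               void (*cb)(const struct sketch_rec_hdr *hdr,
                                          const uint8_t *payload))
{
        size_t cur = 0;

        while (cur + sizeof(struct sketch_rec_hdr) <= size) {
                const struct sketch_rec_hdr *hdr =
                        (const struct sketch_rec_hdr *)(buf + cur);

                cur += sizeof(*hdr);
                if (cur + hdr->len > size)
                        break;          /* truncated record, stop */
                cb(hdr, buf + cur);
                cur += hdr->len;        /* step over the payload */
        }
}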
This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 00000000..6d866db4 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include +#include +#include "async-thread.h" + +#define BTRFS_STRIPE_LEN (64 * 1024) + +struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + + int running_pending; + u64 generation; + + int writeable; + int in_fs_metadata; + int missing; + int can_discard; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to blkdev_get */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* size of the disk */ + u64 disk_total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* per-device scrub information */ + struct scrub_dev *scrub_device; + + struct btrfs_work work; + struct rcu_head rcu; + struct work_struct rcu_work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent copy of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 missing_devices; + u64 total_rw_bytes; + u64 num_can_discard; + struct block_device *latest_bdev; + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code + */ + struct mutex device_list_mutex; + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; + u64 length; /* only used for discard mappings */ +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; + u64 total_avail; +}; + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, 
+ struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 00000000..5366fe45 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, + strlen(name), 0); + if (!di) { + ret = -ENODATA; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) + goto out; + } else { + btrfs_release_path(path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction_throttle(trans, 
root); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = btrfs_ino(inode); + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + continue; + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + goto next; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; +next: + path->slots[0]++; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. 
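btrfs_listxattr() above implements the usual listxattr contract: called with a zero-size buffer it only reports how many bytes the NUL-terminated names would need, otherwise it copies each name and returns -ERANGE when the caller's buffer is too small. The caller's side of that two-pass convention, as a userspace sketch using the standard listxattr(2) call (error handling trimmed):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : ".";

        /* first pass: ask how large the name list is */
        ssize_t len = listxattr(path, NULL, 0);
        if (len <= 0)
                return 0;               /* no xattrs, or not supported */

        /* second pass: fetch the concatenated "name\0name\0..." list */
        char *buf = malloc(len);
        len = listxattr(path, buf, len);
        for (ssize_t off = 0; off < len; off += strlen(buf + off) + 1)
                printf("%s\n", buf + off);

        free(buf);
        return 0;
}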
+ */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + + return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + XATTR_REPLACE); +} + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *suffix; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, + GFP_NOFS); + if (!name) { + err = -ENOMEM; + } else { + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + err = __btrfs_setxattr(trans, inode, name, value, len, 0); + kfree(name); + } + + kfree(suffix); + kfree(value); + return err; +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 00000000..b3cc8039 --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include + +extern const struct xattr_handler btrfs_xattr_acl_access_handler; +extern const struct xattr_handler btrfs_xattr_acl_default_handler; +extern const struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000..faccd47c --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. 
+ * Created by David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( + MAX_WBITS, MAX_MEM_LEVEL)); + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. 
Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + return ret; +} + +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + int wbits = MAX_WBITS; + char *data_in; + size_t total_out = 0; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long pg_offset; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
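zlib_compress_pages() above drives the deflate state machine one page at a time, refilling avail_in from the next input page and swapping in a fresh output page whenever avail_out reaches zero, and it bails out early if the output starts outgrowing the input. Stripped of the page handling, the calling pattern is the usual zlib sequence, shown here as a one-shot userspace helper (ordinary zlib, not the kernel's zlib_* wrappers):

#include <string.h>
#include <zlib.h>

/* deflate src into dst at level 3; returns the compressed size or -1 */
static long sketch_deflate(const unsigned char *src, size_t src_len,
                           unsigned char *dst, size_t dst_len)
{
        z_stream s;
        long out = -1;

        memset(&s, 0, sizeof(s));
        if (deflateInit(&s, 3) != Z_OK)
                return -1;

        s.next_in = (Bytef *)src;
        s.avail_in = src_len;
        s.next_out = dst;
        s.avail_out = dst_len;

        /* all input is present up front, so one Z_FINISH pass suffices */
        if (deflate(&s, Z_FINISH) == Z_STREAM_END &&
            s.total_out < src_len)      /* refuse growth, as above */
                out = (long)s.total_out;

        deflateEnd(&s);
        return out;
}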
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + return -1; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) + break; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; + } + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + int wbits = MAX_WBITS; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
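The two-byte test above, repeated in zlib_decompress() below, validates a standard zlib header: the low nibble of the first byte must be the deflate method (8), the 0x20 bit of the second byte flags a preset dictionary, and the pair read as a big-endian value must be divisible by 31. When it passes, inflate is initialized with negative windowBits so zlib consumes the raw deflate data and skips the adler32 check. The test in isolation:

#include <stddef.h>
#include <stdint.h>

#define SKETCH_Z_DEFLATED   8           /* compression method: deflate */
#define SKETCH_PRESET_DICT  0x20        /* FDICT bit in the FLG byte */

/*
 * Pick the windowBits argument for inflateInit2(): negative (raw deflate,
 * no checksum) when the zlib header checks out, else 15 (MAX_WBITS) so
 * zlib parses the header itself.
 */
static int sketch_pick_wbits(const uint8_t *data, size_t len)
{
        if (len > 2 && !(data[1] & SKETCH_PRESET_DICT) &&
            (data[0] & 0x0f) == SKETCH_Z_DEFLATED &&
            ((data[0] << 8) + data[1]) % 31 == 0)
                return -((data[0] >> 4) + 8);   /* e.g. 0x78 0x9c -> -15 */
        return 15;
}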
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + return -1; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); + return ret; +} + +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; diff --git a/fs/buffer.c b/fs/buffer.c new file mode 100644 index 00000000..330cbce1 --- /dev/null +++ b/fs/buffer.c @@ -0,0 +1,3326 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992, 2002 Linus Torvalds + */ + +/* + * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 + * + * Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + * + * Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM + * + * Added 32k buffer block sizes - these are required older ARM systems. - RMK + * + * async buffer flushing, 1999 Andrea Arcangeli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) + +inline void +init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_end_io = handler; + bh->b_private = private; +} +EXPORT_SYMBOL(init_buffer); + +static int sleep_on_buffer(void *word) +{ + io_schedule(); + return 0; +} + +void __lock_buffer(struct buffer_head *bh) +{ + wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_buffer); + +void unlock_buffer(struct buffer_head *bh) +{ + clear_bit_unlock(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + wake_up_bit(&bh->b_state, BH_Lock); +} +EXPORT_SYMBOL(unlock_buffer); + +/* + * Block until a buffer comes unlocked. 
This doesn't stop it + * from becoming locked again - you have to lock it yourself + * if you want to preserve its state. + */ +void __wait_on_buffer(struct buffer_head * bh) +{ + wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__wait_on_buffer); + +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +} + + +static int quiet_error(struct buffer_head *bh) +{ + if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) + return 0; + return 1; +} + + +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +/* + * End-of-IO handler helper function which does not touch the bh after + * unlocking it. + * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but + * a race there is benign: unlock_buffer() only use the bh's address for + * hashing after unlocking the buffer, so it doesn't actually touch the bh + * itself. + */ +static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_read_sync(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); + put_bh(bh); +} +EXPORT_SYMBOL(end_buffer_read_sync); + +void end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!quiet_error(bh)) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} +EXPORT_SYMBOL(end_buffer_write_sync); + +/* + * Various filesystems appear to want __find_get_block to be non-blocking. + * But it's the page lock which protects the buffers. To get around this, + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. + * + * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * may be quite high. This code could TryLock the page, and if that + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->tree_lock). + */ +static struct buffer_head * +__find_get_block_slow(struct block_device *bdev, sector_t block) +{ + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct buffer_head *ret = NULL; + pgoff_t index; + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int all_mapped = 1; + + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + page = find_get_page(bd_mapping, index); + if (!page) + goto out; + + spin_lock(&bd_mapping->private_lock); + if (!page_has_buffers(page)) + goto out_unlock; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh)) + all_mapped = 0; + else if (bh->b_blocknr == block) { + ret = bh; + get_bh(bh); + goto out_unlock; + } + bh = bh->b_this_page; + } while (bh != head); + + /* we might be here because some of the buffers on this page are + * not mapped. 
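__find_get_block_slow() above depends on the page's buffer_heads forming a circular, singly linked ring through b_this_page, so a do/while loop that stops when it comes back around to the head visits every buffer exactly once. The traversal shape as a standalone sketch with a pared-down node type:

#include <stddef.h>
#include <stdint.h>

struct sketch_bh {
        uint64_t blocknr;
        struct sketch_bh *this_page;    /* next buffer on the page, circular */
};

/* find the buffer for a given block on one page's ring, or NULL */
static struct sketch_bh *sketch_find_on_page(struct sketch_bh *head,
                                             uint64_t block)
{
        struct sketch_bh *bh = head;

        do {
                if (bh->blocknr == block)
                        return bh;
                bh = bh->this_page;
        } while (bh != head);           /* full circle: not on this page */

        return NULL;
}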
This is due to various races between + * file io on the block device and getblk. It gets dealt with + * elsewhere, don't buffer_error if we had some unmapped buffers + */ + if (all_mapped) { + printk("__find_get_block_slow() failed. " + "block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%zu\n", + bh->b_state, bh->b_size); + printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + } +out_unlock: + spin_unlock(&bd_mapping->private_lock); + page_cache_release(page); +out: + return ret; +} + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash + dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to + be preserved. These buffers are simply skipped. + + We also skip buffers which are still in use. For example this can + happen if a userspace program is reading the block device. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completion) and + then an invalidate_buffers call that doesn't trash dirty buffers. + + For handling cache coherency with the blkdev pagecache the 'update' case + is been introduced. It is needed to re-read from disk any pinned + buffer. NOTE: re-reading from disk is destructive so we can do it only + when we assume nobody is changing the buffercache under our I/O and when + we think the disk contains more recent information than the buffercache. + The update == 1 pass marks the buffers we need to update, the update == 2 + pass does the actual I/O. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); + +/* + * Kick the writeback threads then try to free up some ZONE_NORMAL memory. + */ +static void free_more_memory(void) +{ + struct zone *zone; + int nid; + + wakeup_flusher_threads(1024); + yield(); + + for_each_online_node(nid) { + (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL, + &zone); + if (zone) + try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, + GFP_NOFS, NULL); + } +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. 
+ */ +static void end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + int page_uptodate = 1; + + BUG_ON(!buffer_async_read(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + clear_buffer_uptodate(bh); + if (!quiet_error(bh)) + buffer_io_error(bh); + SetPageError(page); + } + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + */ + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + + /* + * If none of the buffers had errors and they are all + * uptodate then we can set the page uptodate. + */ + if (page_uptodate && !PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return; + +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} + +/* + * Completion handler for block_write_full_page() - pages which are unlocked + * during I/O, and which have PageWriteback cleared upon I/O completion. + */ +void end_buffer_async_write(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + + BUG_ON(!buffer_async_write(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!quiet_error(bh)) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_bit(AS_EIO, &page->mapping->flags); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(page); + } + + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + + clear_buffer_async_write(bh); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async_write(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + end_page_writeback(page); + return; + +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} +EXPORT_SYMBOL(end_buffer_async_write); + +/* + * If a page's buffers are under async readin (end_buffer_async_read + * completion) then there is a possibility that another thread of + * control could lock one of the buffers after it has completed + * but while some of the other buffers have not completed. This + * locked buffer would confuse end_buffer_async_read() into not unlocking + * the page. So the absence of BH_Async_Read tells end_buffer_async_read() + * that this buffer is not under async I/O. + * + * The page comes unlocked when it has no locked buffer_async buffers + * left. + * + * PageLocked prevents anyone starting new async I/O reads any of + * the buffers. + * + * PageWriteback is used to prevent simultaneous writeout of the same + * page. 
+ * + * PageLocked prevents anyone from starting writeback of a page which is + * under read I/O (PageWriteback is only ever set against a locked page). + */ +static void mark_buffer_async_read(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_read; + set_buffer_async_read(bh); +} + +static void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) +{ + bh->b_end_io = handler; + set_buffer_async_write(bh); +} + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} +EXPORT_SYMBOL(mark_buffer_async_write); + + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. + * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. 
+ */ + +/* + * The buffer's backing address_space's private_lock must be held + */ +static void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); + WARN_ON(!bh->b_assoc_map); + if (buffer_write_io_error(bh)) + set_bit(AS_EIO, &bh->b_assoc_map->flags); + bh->b_assoc_map = NULL; +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_data.private_list); +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. + */ +static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head *p; + int err = 0; + + spin_lock(lock); +repeat: + list_for_each_prev(p, list) { + bh = BH_ENTRY(p); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + goto repeat; + } + } + spin_unlock(lock); + return err; +} + +static void do_thaw_one(struct super_block *sb, void *unused) +{ + char b[BDEVNAME_SIZE]; + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); +} + +static void do_thaw_all(struct work_struct *work) +{ + iterate_supers(do_thaw_one, NULL); + kfree(work); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } +} + +/** + * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers + * @mapping: the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). + * @mapping is a file or directory which needs those buffers to be written for + * a successful fsync(). + */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->assoc_mapping; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. 
+ */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->assoc_mapping) { + mapping->assoc_mapping = buffer_mapping; + } else { + BUG_ON(mapping->assoc_mapping != buffer_mapping); + } + if (!bh->b_assoc_map) { + spin_lock(&buffer_mapping->private_lock); + list_move_tail(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + +/* + * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * dirty. + * + * If warn is true, then emit a warning if the page is not uptodate and has + * not been truncated. + */ +static void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&mapping->tree_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); +} + +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. + * + * If the page has buffers, the uptodate buffers are set dirty, to preserve + * dirty-state coherency between the page and the buffers. It the page does + * not have buffers then when they are later attached they will all be set + * dirty. + * + * The buffers are dirtied before the page is dirtied. There's a small race + * window in which a writepage caller may see the page cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the page dirty + * before the buffers, a concurrent writepage caller could clear the page dirty + * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean + * page on the dirty page list. + * + * We use private_lock to lock against try_to_free_buffers while using the + * page's buffer list. Also use this to protect against clean buffers being + * added to the page after it was set dirty. + * + * FIXME: may need to call ->reservepage here as well. That's rather up to the + * address_space though. + */ +int __set_page_dirty_buffers(struct page *page) +{ + int newly_dirty; + struct address_space *mapping = page_mapping(page); + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + set_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; +} +EXPORT_SYMBOL(__set_page_dirty_buffers); + +/* + * Write out and wait upon a list of buffers. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. 
After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head tmp; + struct address_space *mapping; + int err = 0, err2; + struct blk_plug plug; + + INIT_LIST_HEAD(&tmp); + blk_start_plug(&plug); + + spin_lock(lock); + while (!list_empty(list)) { + bh = BH_ENTRY(list->next); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + bh->b_assoc_map = mapping; + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(lock); + /* + * Ensure any pending I/O completes so that + * write_dirty_buffer() actually writes the + * current contents - it is a noop if I/O is + * still in flight on potentially older + * contents. + */ + write_dirty_buffer(bh, WRITE_SYNC); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). + */ + brelse(bh); + spin_lock(lock); + } + } + } + + spin_unlock(lock); + blk_finish_plug(&plug); + spin_lock(lock); + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + get_bh(bh); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh)) { + list_add(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + } + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + } + + spin_unlock(lock); + err2 = osync_buffers_list(lock, list); + if (err) + return err; + else + return err2; +} + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(invalidate_inode_buffers); + +/* + * Remove any clean buffers from the inode's buffer list. This is called + * when we're trying to free the inode itself. Those buffers can pin it. 
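The association helpers above (mark_buffer_dirty_inode(), sync_mapping_buffers(), invalidate_inode_buffers()) are normally used in pairs by a block-based filesystem: a metadata buffer that must reach disk before fsync() returns is queued on the file's private_list when it is dirtied, and the fsync path then flushes that list. A minimal sketch of that pairing, assuming a filesystem-supplied inode and an already-read metadata buffer (the example_ names are illustrative and not part of this file):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	/* Dirty a metadata block and associate it with the regular file's
	 * mapping so a later fsync() knows to write it out and wait on it. */
	static void example_dirty_metadata(struct inode *inode,
					   struct buffer_head *bh)
	{
		mark_buffer_dirty_inode(bh, inode);
	}

	/* fsync helper: start I/O on, and wait for, everything queued above. */
	static int example_fsync_metadata(struct inode *inode)
	{
		return sync_mapping_buffers(inode->i_mapping);
	}
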
+ * + * Returns true if all buffers were removed. + */ +int remove_inode_buffers(struct inode *inode) +{ + int ret = 1; + + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) { + struct buffer_head *bh = BH_ENTRY(list->next); + if (buffer_dirty(bh)) { + ret = 0; + break; + } + __remove_assoc_queue(bh); + } + spin_unlock(&buffer_mapping->private_lock); + } + return ret; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. + */ +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + int retry) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = alloc_buffer_head(GFP_NOFS); + if (!bh) + goto no_grow; + + bh->b_bdev = NULL; + bh->b_this_page = head; + bh->b_blocknr = -1; + head = bh; + + bh->b_state = 0; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + /* Link the buffer to its page */ + set_bh_page(bh, page, offset); + + init_buffer(bh, NULL, NULL); + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + do { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } while (head); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!retry) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + free_more_memory(); + goto try_again; +} +EXPORT_SYMBOL_GPL(alloc_page_buffers); + +static inline void +link_dev_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); +} + +/* + * Initialise the state of a blockdev page's buffers. + */ +static void +init_page_buffers(struct page *page, struct block_device *bdev, + sector_t block, int size) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + int uptodate = PageUptodate(page); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); + + do { + if (!buffer_mapped(bh)) { + init_buffer(bh, NULL, NULL); + bh->b_bdev = bdev; + bh->b_blocknr = block; + if (uptodate) + set_buffer_uptodate(bh); + if (block < end_block) + set_buffer_mapped(bh); + } + block++; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Create the page-cache page that contains the requested block. + * + * This is user purely for blockdev mappings. 
+ */ +static struct page * +grow_dev_page(struct block_device *bdev, sector_t block, + pgoff_t index, int size) +{ + struct inode *inode = bdev->bd_inode; + struct page *page; + struct buffer_head *bh; + + page = find_or_create_page(inode->i_mapping, index, + (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + if (!page) + return NULL; + + BUG_ON(!PageLocked(page)); + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh->b_size == size) { + init_page_buffers(page, bdev, block, size); + return page; + } + if (!try_to_free_buffers(page)) + goto failed; + } + + /* + * Allocate some buffers for this page + */ + bh = alloc_page_buffers(page, size, 0); + if (!bh) + goto failed; + + /* + * Link the page to the buffers and initialise them. Take the + * lock to be atomic wrt __find_get_block(), which does not + * run under the page lock. + */ + spin_lock(&inode->i_mapping->private_lock); + link_dev_buffers(page, bh); + init_page_buffers(page, bdev, block, size); + spin_unlock(&inode->i_mapping->private_lock); + return page; + +failed: + BUG(); + unlock_page(page); + page_cache_release(page); + return NULL; +} + +/* + * Create buffers for the specified block device block's page. If + * that page was dirty, the buffers are set dirty also. + */ +static int +grow_buffers(struct block_device *bdev, sector_t block, int size) +{ + struct page *page; + pgoff_t index; + int sizebits; + + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); + + index = block >> sizebits; + + /* + * Check for a block which wants to lie outside our maximum possible + * pagecache index. (this comparison is done using sector_t types). + */ + if (unlikely(index != block >> sizebits)) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "%s: requested out-of-range block %llu for " + "device %s\n", + __func__, (unsigned long long)block, + bdevname(bdev, b)); + return -EIO; + } + block = index << sizebits; + /* Create a page with the proper size buffers.. */ + page = grow_dev_page(bdev, block, index, size); + if (!page) + return 0; + unlock_page(page); + page_cache_release(page); + return 1; +} + +static struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, int size) +{ + /* Size must be multiple of hard sectorsize */ + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || + (size < 512 || size > PAGE_SIZE))) { + printk(KERN_ERR "getblk(): invalid block size %d requested\n", + size); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); + + dump_stack(); + return NULL; + } + + for (;;) { + struct buffer_head * bh; + int ret; + + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; + + ret = grow_buffers(bdev, block, size); + if (ret < 0) + return NULL; + if (ret == 0) + free_more_memory(); + } +} + +/* + * The relationship between dirty buffers and dirty pages: + * + * Whenever a page has any dirty buffers, the page's dirty bit is set, and + * the page is tagged dirty in its radix tree. + * + * At all times, the dirtiness of the buffers represents the dirtiness of + * subsections of the page. If the page has buffers, the page dirty bit is + * merely a hint about the true dirty state. + * + * When a page is set dirty in its entirety, all its buffers are marked dirty + * (if the page has buffers). + * + * When a buffer is marked dirty, its page is dirtied, but the page's other + * buffers are not. + * + * Also. When blockdev buffers are explicitly read with bread(), they + * individually become uptodate. 
But their backing page remains not + * uptodate - even if all of its buffers are uptodate. A subsequent + * block_read_full_page() against that page will discover all the uptodate + * buffers, will set the page uptodate and will perform no I/O. + */ + +/** + * mark_buffer_dirty - mark a buffer_head as needing writeout + * @bh: the buffer_head to mark dirty + * + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. + * + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mapping->tree_lock and mapping->host->i_lock. + */ +void mark_buffer_dirty(struct buffer_head *bh) +{ + WARN_ON_ONCE(!buffer_uptodate(bh)); + + /* + * Very *carefully* optimize the it-is-already-dirty case. + * + * Don't let the final "is it dirty" escape to before we + * perhaps modified the buffer. + */ + if (buffer_dirty(bh)) { + smp_mb(); + if (buffer_dirty(bh)) + return; + } + + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } + } +} +EXPORT_SYMBOL(mark_buffer_dirty); + +/* + * Decrement a buffer_head's reference count. If all buffers against a page + * have zero reference count, are clean and unlocked, and if the page is clean + * and unlocked then try_to_free_buffers() may strip the buffers from the page + * in preparation for freeing it (sometimes, rarely, buffers are removed from + * a page but it ends up not being freed, and buffers may later be reattached). + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} +EXPORT_SYMBOL(__brelse); + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. + */ +void __bforget(struct buffer_head *bh) +{ + clear_buffer_dirty(bh); + if (bh->b_assoc_map) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + bh->b_assoc_map = NULL; + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); +} +EXPORT_SYMBOL(__bforget); + +static struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } else { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + } + brelse(bh); + return NULL; +} + +/* + * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). + * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their + * refcount elevated by one when they're in an LRU. A buffer can only appear + * once in a particular CPU's LRU. A single buffer can be present in multiple + * CPU's LRUs at the same time. + * + * This is a transparent caching front-end to sb_bread(), sb_getblk() and + * sb_find_get_block(). + * + * The LRUs themselves only need locking against invalidate_bh_lrus. We use + * a local interrupt disable for that. 
+ */ + +#define BH_LRU_SIZE 8 + +struct bh_lru { + struct buffer_head *bhs[BH_LRU_SIZE]; +}; + +static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; + +#ifdef CONFIG_SMP +#define bh_lru_lock() local_irq_disable() +#define bh_lru_unlock() local_irq_enable() +#else +#define bh_lru_lock() preempt_disable() +#define bh_lru_unlock() preempt_enable() +#endif + +static inline void check_irqs_on(void) +{ +#ifdef irqs_disabled + BUG_ON(irqs_disabled()); +#endif +} + +/* + * The LRU management algorithm is dopey-but-simple. Sorry. + */ +static void bh_lru_install(struct buffer_head *bh) +{ + struct buffer_head *evictee = NULL; + + check_irqs_on(); + bh_lru_lock(); + if (__this_cpu_read(bh_lrus.bhs[0]) != bh) { + struct buffer_head *bhs[BH_LRU_SIZE]; + int in; + int out = 0; + + get_bh(bh); + bhs[out++] = bh; + for (in = 0; in < BH_LRU_SIZE; in++) { + struct buffer_head *bh2 = + __this_cpu_read(bh_lrus.bhs[in]); + + if (bh2 == bh) { + __brelse(bh2); + } else { + if (out >= BH_LRU_SIZE) { + BUG_ON(evictee != NULL); + evictee = bh2; + } else { + bhs[out++] = bh2; + } + } + } + while (out < BH_LRU_SIZE) + bhs[out++] = NULL; + memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + } + bh_lru_unlock(); + + if (evictee) + __brelse(evictee); +} + +/* + * Look up the bh in this cpu's LRU. If it's there, move it to the head. + */ +static struct buffer_head * +lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *ret = NULL; + unsigned int i; + + check_irqs_on(); + bh_lru_lock(); + for (i = 0; i < BH_LRU_SIZE; i++) { + struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); + + if (bh && bh->b_bdev == bdev && + bh->b_blocknr == block && bh->b_size == size) { + if (i) { + while (i) { + __this_cpu_write(bh_lrus.bhs[i], + __this_cpu_read(bh_lrus.bhs[i - 1])); + i--; + } + __this_cpu_write(bh_lrus.bhs[0], bh); + } + get_bh(bh); + ret = bh; + break; + } + } + bh_lru_unlock(); + return ret; +} + +/* + * Perform a pagecache lookup for the matching buffer. If it's there, refresh + * it in the LRU and mark it as accessed. If it is not present then return + * NULL + */ +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { + bh = __find_get_block_slow(bdev, block); + if (bh) + bh_lru_install(bh); + } + if (bh) + touch_buffer(bh); + return bh; +} +EXPORT_SYMBOL(__find_get_block); + +/* + * __getblk will locate (and, if necessary, create) the buffer_head + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * + * __getblk() cannot fail - it just keeps trying. If you pass it an + * illegal block number, __getblk() will happily return a buffer_head + * which represents the non-existent block. Very weird. + * + * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() + * attempt is failing. FIXME, perhaps? + */ +struct buffer_head * +__getblk(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_sleep(); + if (bh == NULL) + bh = __getblk_slow(bdev, block, size); + return bh; +} +EXPORT_SYMBOL(__getblk); + +/* + * Do async read-ahead on a buffer.. 
+ */ +void __breadahead(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + if (likely(bh)) { + ll_rw_block(READA, 1, &bh); + brelse(bh); + } +} +EXPORT_SYMBOL(__breadahead); + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * It returns NULL if the block was unreadable. + */ +struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (likely(bh) && !buffer_uptodate(bh)) + bh = __bread_slow(bh); + return bh; +} +EXPORT_SYMBOL(__bread); + +/* + * invalidate_bh_lrus() is called rarely - but not only at unmount. + * This doesn't race because it runs in each cpu either in irq + * or with preempt disabled. + */ +static void invalidate_bh_lru(void *arg) +{ + struct bh_lru *b = &get_cpu_var(bh_lrus); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + put_cpu_var(bh_lrus); +} + +void invalidate_bh_lrus(void) +{ + on_each_cpu(invalidate_bh_lru, NULL, 1); +} +EXPORT_SYMBOL_GPL(invalidate_bh_lrus); + +void set_bh_page(struct buffer_head *bh, + struct page *page, unsigned long offset) +{ + bh->b_page = page; + BUG_ON(offset >= PAGE_SIZE); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Called when truncating a buffer on a page completely. + */ +static void discard_buffer(struct buffer_head * bh) +{ + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + unlock_buffer(bh); +} + +/** + * block_invalidatepage - invalidate part of all of a buffer-backed page + * + * @page: the page which is affected + * @offset: the index of the truncation point + * + * block_invalidatepage() is called when all or part of the page has become + * invalidatedby a truncate operation. + * + * block_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void block_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (offset == 0) + try_to_release_page(page, 0); +out: + return; +} +EXPORT_SYMBOL(block_invalidatepage); + +/* + * We attach and possibly dirty the buffers atomically wrt + * __set_page_dirty_buffers() via private_lock. 
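Taken together, __getblk(), __bread(), mark_buffer_dirty() and brelse() form the usual metadata access pattern built on the per-cpu LRU described above. A minimal sketch of that pattern, assuming a valid block device and an in-range block number (the example_ name is illustrative only):

	#include <linux/errno.h>
	#include <linux/string.h>
	#include <linux/buffer_head.h>

	static int example_update_block(struct block_device *bdev,
					sector_t blocknr, unsigned size)
	{
		/* Look the block up via the per-cpu LRU and the page cache,
		 * reading it from disk if it is not already uptodate. */
		struct buffer_head *bh = __bread(bdev, blocknr, size);

		if (!bh)
			return -EIO;

		lock_buffer(bh);			/* serialize against in-flight I/O */
		memset(bh->b_data, 0, bh->b_size);	/* modify the block contents */
		mark_buffer_dirty(bh);			/* queue it for writeback */
		unlock_buffer(bh);

		brelse(bh);				/* drop the reference taken by __bread() */
		return 0;
	}
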
try_to_free_buffers + * is already excluded via the page lock. + */ +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + struct buffer_head *bh, *head, *tail; + + head = alloc_page_buffers(page, blocksize, 1); + bh = head; + do { + bh->b_state |= b_state; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + + spin_lock(&page->mapping->private_lock); + if (PageUptodate(page) || PageDirty(page)) { + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (PageUptodate(page)) + set_buffer_uptodate(bh); + bh = bh->b_this_page; + } while (bh != head); + } + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. + */ +void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +{ + struct buffer_head *old_bh; + + might_sleep(); + + old_bh = __find_get_block_slow(bdev, block); + if (old_bh) { + clear_buffer_dirty(old_bh); + wait_on_buffer(old_bh); + clear_buffer_req(old_bh); + __brelse(old_bh); + } +} +EXPORT_SYMBOL(unmap_underlying_metadata); + +/* + * NOTE! All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * While block_write_full_page is writing back the dirty buffers under + * the page lock, whoever dirtied the buffers may decide to clean them + * again at any time. We handle that by only looking at the buffer + * state inside lock_buffer(). + * + * If block_write_full_page() is called for regular writeback + * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a + * locked buffer. This only can happen if someone has written the buffer + * directly, with submit_bh(). At the address_space level PageWriteback + * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * causes the writes to be flagged as synchronous writes. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) +{ + int err; + sector_t block; + sector_t last_block; + struct buffer_head *bh, *head; + const unsigned blocksize = 1 << inode->i_blkbits; + int nr_underway = 0; + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC : WRITE); + + BUG_ON(!PageLocked(page)); + + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + head = page_buffers(page); + bh = head; + + /* + * Get all the dirty buffers mapped to disk addresses and + * handle any aliases from the underlying blockdev's mapping. + */ + do { + if (block > last_block) { + /* + * mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. + */ + /* + * The buffer was zeroed by block_write_full_page() + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; + clear_buffer_delay(bh); + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from writeback threads + * and kswapd activity, but those code paths have their own + * higher-level throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write_endio(bh, handler); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + err = 0; +done: + if (nr_underway == 0) { + /* + * The page was marked dirty, but the buffers were + * clean. Someone wrote them back by hand with + * ll_rw_block/submit_bh. A rare case. + */ + end_page_writeback(page); + + /* + * The page and buffer_heads can be released at any time from + * here on. + */ + } + return err; + +recover: + /* + * ENOSPC, or some other error. We may already have added some + * blocks to the file, so we need to write these out to avoid + * exposing stale data. + * The page is currently locked and not marked for writeback + */ + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { + lock_buffer(bh); + mark_buffer_async_write_endio(bh, handler); + } else { + /* + * The buffer may have been set dirty during + * attachment to a dirty page. 
+ */ + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + mapping_set_error(page->mapping, err); + set_page_writeback(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + goto done; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(page_zero_new_buffers); + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + bbits = inode->i_blkbits; + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, + to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. 
+ */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + return err; +} +EXPORT_SYMBOL(__block_write_begin); + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + clear_buffer_new(bh); + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * The filesystem needs to handle block truncation upon failure. + */ +int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, + unsigned flags, struct page **pagep, get_block_t *get_block) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, get_block); + if (unlikely(status)) { + unlock_page(page); + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; +} +EXPORT_SYMBOL(block_write_begin); + +int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + return copied; +} +EXPORT_SYMBOL(block_write_end); + +int generic_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. 
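block_read_full_page(), block_write_begin() and generic_write_end() are meant to be plugged directly into a filesystem's address_space_operations around its get_block callback. A minimal sketch under the assumption of a trivial identity block mapping, purely for illustration (a real filesystem resolves iblock through its own allocation code; the example_ names are not part of this file):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	static int example_get_block(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
	{
		/* Pretend logical block == physical block. */
		map_bh(bh_result, inode->i_sb, iblock);
		return 0;
	}

	static int example_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, example_get_block);
	}

	static int example_write_begin(struct file *file,
				       struct address_space *mapping,
				       loff_t pos, unsigned len, unsigned flags,
				       struct page **pagep, void **fsdata)
	{
		return block_write_begin(mapping, pos, len, flags, pagep,
					 example_get_block);
	}

	static const struct address_space_operations example_aops = {
		.readpage	= example_readpage,
		.write_begin	= example_write_begin,
		.write_end	= generic_write_end,
	};
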
+ * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} +EXPORT_SYMBOL(generic_write_end); + +/* + * block_is_partially_uptodate checks whether buffers within a page are + * uptodate or not. + * + * Returns true if all buffers which correspond to a file portion + * we want to read are uptodate. + */ +int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, + unsigned long from) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + unsigned to; + struct buffer_head *bh, *head; + int ret = 1; + + if (!page_has_buffers(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = from + to; + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + return 0; + + head = page_buffers(page); + bh = head; + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end > from && block_start < to) { + if (!buffer_uptodate(bh)) { + ret = 0; + break; + } + if (block_end >= to) + break; + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} +EXPORT_SYMBOL(block_is_partially_uptodate); + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + sector_t iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize; + int nr, i; + int fully_mapped = 1; + + BUG_ON(!PageLocked(page)); + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + int err = 0; + + fully_mapped = 0; + if (iblock < lblock) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + SetPageError(page); + } + if (!buffer_mapped(bh)) { + zero_user(page, i * blocksize, blocksize); + if (!err) + set_buffer_uptodate(bh); + continue; + } + /* + * get_block() might have updated the buffer + * synchronously + */ + if (buffer_uptodate(bh)) + continue; + } + arr[nr++] = bh; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + SetPageMappedToDisk(page); + + if (!nr) { + /* + * All buffers are uptodate - we can set the page uptodate + * as well. But not if get_block() returned an error. 
+ */ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + lock_buffer(bh); + mark_buffer_async_read(bh); + } + + /* + * Stage 3: start the IO. Check for uptodateness + * inside the buffer lock in case another process reading + * the underlying blockdev brought it uptodate (the sct fix). + */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_async_read(bh, 1); + else + submit_bh(READ, bh); + } + return 0; +} +EXPORT_SYMBOL(block_read_full_page); + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses filesystem pagecache writes to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + int err; + + err = inode_newsize_ok(inode, size); + if (err) + goto out; + + err = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, + &page, &fsdata); + if (err) + goto out; + + err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); + BUG_ON(err > 0); + +out: + return err; +} +EXPORT_SYMBOL(generic_cont_expand_simple); + +static int cont_expand_zero(struct file *file, struct address_space *mapping, + loff_t pos, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + struct page *page; + void *fsdata; + pgoff_t index, curidx; + loff_t curpos; + unsigned zerofrom, offset, len; + int err = 0; + + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = PAGE_CACHE_SIZE - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + + balance_dirty_pages_ratelimited(mapping); + } + + /* page covers the boundary, find the boundary offset */ + if (index == curidx) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + /* if we will expand the thing last block will be filled */ + if (offset <= zerofrom) { + goto out; + } + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = offset - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + } +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. 
+ */ +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + unsigned zerofrom; + int err; + + err = cont_expand_zero(file, mapping, pos, bytes); + if (err) + return err; + + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (pos+len > *bytes && zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + return block_write_begin(mapping, pos, len, flags, pagep, get_block); +} +EXPORT_SYMBOL(cont_write_begin); + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + return 0; +} +EXPORT_SYMBOL(block_commit_write); + +/* + * block_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + * + * Direct callers of this function should call vfs_check_frozen() so that page + * fault does not busyloop until the fs is thawed. + */ +int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + unsigned long end; + loff_t size; + int ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) + end = size & ~PAGE_CACHE_MASK; + else + end = PAGE_CACHE_SIZE; + + ret = __block_write_begin(page, 0, end, get_block); + if (!ret) + ret = block_commit_write(page, 0, end); + + if (unlikely(ret < 0)) + goto out_unlock; + /* + * Freezing in progress? We check after the page is marked dirty and + * with page lock held so if the test here fails, we are sure freezing + * code will wait during syncing until the page fault is done - at that + * point page will be dirty and unlocked so freezing code will write it + * and writeprotect it again. + */ + set_page_dirty(page); + if (inode->i_sb->s_frozen != SB_UNFROZEN) { + ret = -EAGAIN; + goto out_unlock; + } + wait_on_page_writeback(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(__block_page_mkwrite); + +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int ret; + struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; + + /* + * This check is racy but catches the common case. The check in + * __block_page_mkwrite() is reliable. 
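The EOF clamping in __block_page_mkwrite() above is worth seeing in isolation: once the page lock rules out a truncate race, the only question is how many bytes of the faulting page lie inside i_size. The following userspace sketch mirrors that decision; it is illustrative only, with an assumed page size and example offsets that are not taken from the patch.

/* Illustrative sketch of the EOF clamp in __block_page_mkwrite();
 * not part of this patch. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12			/* assumed 4KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Bytes of page 'index' to prepare for writing, or 0 if the page now
 * lies beyond i_size (the truncate-race case returning -EFAULT). */
static unsigned long mkwrite_end(uint64_t index, uint64_t i_size)
{
	uint64_t page_offset = index << PAGE_SHIFT;

	if (page_offset > i_size)		/* wholly beyond EOF */
		return 0;
	if (((index + 1) << PAGE_SHIFT) > i_size)
		return i_size & ~PAGE_MASK;	/* page straddles EOF */
	return PAGE_SIZE;			/* page wholly inside EOF */
}

int main(void)
{
	printf("%lu\n", mkwrite_end(0, 10000));	/* 4096: fully inside */
	printf("%lu\n", mkwrite_end(2, 10000));	/* 1808: straddles EOF */
	printf("%lu\n", mkwrite_end(5, 10000));	/* 0: beyond EOF */
	return 0;
}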
+ */ + vfs_check_frozen(sb, SB_FREEZE_WRITE); + ret = __block_page_mkwrite(vma, vmf, get_block); + return block_page_mkwrite_return(ret); +} +EXPORT_SYMBOL(block_page_mkwrite); + +/* + * nobh_write_begin()'s prereads are special: the buffer_heads are freed + * immediately, while under the page lock. So it needs a special end_io + * handler which does not touch the bh after unlocking it. + */ +static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); +} + +/* + * Attach the singly-linked list of buffers created by nobh_write_begin, to + * the page (converting it to circular linked list and taking care of page + * dirty races). + */ +static void attach_nobh_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh; + + BUG_ON(!PageLocked(page)); + + spin_lock(&page->mapping->private_lock); + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (!bh->b_this_page) + bh->b_this_page = head; + bh = bh->b_this_page; + } while (bh != head); + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + * The filesystem needs to handle block truncation upon failure. + */ +int nobh_write_begin(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head *head, *bh; + struct page *page; + pgoff_t index; + unsigned from, to; + unsigned block_in_page; + unsigned block_start, block_end; + sector_t block_in_file; + int nr_reads = 0; + int ret = 0; + int is_mapped_to_disk = 1; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + *fsdata = NULL; + + if (page_has_buffers(page)) { + ret = __block_write_begin(page, pos, len, get_block); + if (unlikely(ret)) + goto out_release; + return ret; + } + + if (PageMappedToDisk(page)) + return 0; + + /* + * Allocate buffers so that we can keep track of state, and potentially + * attach them to the page if an error occurs. In the common case of + * no error, they will just be freed again without ever being attached + * to the page (which is all OK, because we're under the page lock). + * + * Be careful: the buffer linked list is a NULL terminated one, rather + * than the circular one we're used to. + */ + head = alloc_page_buffers(page, blocksize, 0); + if (!head) { + ret = -ENOMEM; + goto out_release; + } + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. 
+ */ + for (block_start = 0, block_in_page = 0, bh = head; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize, bh = bh->b_this_page) { + int create; + + block_end = block_start + blocksize; + bh->b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + bh, create); + if (ret) + goto failed; + if (!buffer_mapped(bh)) + is_mapped_to_disk = 0; + if (buffer_new(bh)) + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + if (buffer_new(bh) || !buffer_mapped(bh)) { + zero_user_segments(page, block_start, from, + to, block_end); + continue; + } + if (buffer_uptodate(bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + lock_buffer(bh); + bh->b_end_io = end_buffer_read_nobh; + submit_bh(READ, bh); + nr_reads++; + } + } + + if (nr_reads) { + /* + * The page is locked, so these buffers are protected from + * any VM or truncate activity. Hence we don't need to care + * for the buffer_head refcounts. + */ + for (bh = head; bh; bh = bh->b_this_page) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + + *fsdata = head; /* to be released by nobh_write_end */ + + return 0; + +failed: + BUG_ON(!ret); + /* + * Error recovery is a bit difficult. We need to zero out blocks that + * were newly allocated, and dirty them to ensure they get written out. + * Buffers need to be attached to the page at this point, otherwise + * the handling of potential IO errors during writeout would be hard + * (could try doing synchronous writeout, but what if that fails too?) + */ + attach_nobh_buffers(page, head); + page_zero_new_buffers(page, from, to); + +out_release: + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + + return ret; +} +EXPORT_SYMBOL(nobh_write_begin); + +int nobh_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *head = fsdata; + struct buffer_head *bh; + BUG_ON(fsdata != NULL && page_has_buffers(page)); + + if (unlikely(copied < len) && head) + attach_nobh_buffers(page, head); + if (page_has_buffers(page)) + return generic_write_end(file, mapping, pos, len, + copied, page, fsdata); + + SetPageUptodate(page); + set_page_dirty(page); + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + unlock_page(page); + page_cache_release(page); + + while (head) { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } + + return copied; +} +EXPORT_SYMBOL(nobh_write_end); + +/* + * nobh_writepage() - based on block_full_write_page() except + * that it tries to operate without attaching bufferheads to + * the page. + */ +int nobh_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int ret; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto out; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. 
For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ +#if 0 + /* Not really sure about this - do we need this ? */ + if (page->mapping->a_ops->invalidatepage) + page->mapping->a_ops->invalidatepage(page, offset); +#endif + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); +out: + ret = mpage_writepage(page, get_block, wbc); + if (ret == -EAGAIN) + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); + return ret; +} +EXPORT_SYMBOL(nobh_writepage); + +int nobh_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head map_bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (page_has_buffers(page)) { +has_buffers: + unlock_page(page); + page_cache_release(page); + return block_truncate_page(mapping, from, get_block); + } + + /* Find the buffer that contains "offset" */ + pos = blocksize; + while (offset >= pos) { + iblock++; + pos += blocksize; + } + + map_bh.b_size = blocksize; + map_bh.b_state = 0; + err = get_block(inode, iblock, &map_bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(&map_bh)) + goto unlock; + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (!PageUptodate(page)) { + err = mapping->a_ops->readpage(NULL, page); + if (err) { + page_cache_release(page); + goto out; + } + lock_page(page); + if (!PageUptodate(page)) { + err = -EIO; + goto unlock; + } + if (page_has_buffers(page)) + goto has_buffers; + } + zero_user(page, offset, length); + set_page_dirty(page); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(nobh_truncate_page); + +int block_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? 
Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + zero_user(page, offset, length); + mark_buffer_dirty(bh); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(block_truncate_page); + +/* + * The generic ->writepage function for buffer-backed address_spaces + * this form passes in the end_io handler used to finish the IO. + */ +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block, wbc, + handler); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ + do_invalidatepage(page, 0); + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __block_write_full_page(inode, page, get_block, wbc, handler); +} +EXPORT_SYMBOL(block_write_full_page_endio); + +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + return block_write_full_page_endio(page, get_block, wbc, + end_buffer_async_write); +} +EXPORT_SYMBOL(block_write_full_page); + +sector_t generic_block_bmap(struct address_space *mapping, sector_t block, + get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + tmp.b_size = 1 << inode->i_blkbits; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} +EXPORT_SYMBOL(generic_block_bmap); + +static void end_bio_bh_io_sync(struct bio *bio, int err) +{ + struct buffer_head *bh = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + } + + if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + set_bit(BH_Quiet, &bh->b_state); + + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bio_put(bio); +} + +int submit_bh(int rw, struct buffer_head * bh) +{ + struct bio *bio; + int ret = 0; + + BUG_ON(!buffer_locked(bh)); + BUG_ON(!buffer_mapped(bh)); + BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); + + /* + * Only clear out a write error when rewriting + */ + if (test_set_buffer_req(bh) && (rw & WRITE)) + clear_buffer_write_io_error(bh); + + /* + * from here on down, it's all bio -- do the initial mapping, + * submit_bio -> generic_make_request may further map this bio around + */ + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_io_vec[0].bv_page = bh->b_page; + bio->bi_io_vec[0].bv_len = bh->b_size; + bio->bi_io_vec[0].bv_offset = bh_offset(bh); + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = bh->b_size; + + bio->bi_end_io = end_bio_bh_io_sync; + bio->bi_private = bh; + + bio_get(bio); + submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(submit_bh); + +/** + * ll_rw_block: low-level access to block devices (DEPRECATED) + * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @nr: number of &struct buffer_heads in the array + * @bhs: array of pointers to &struct buffer_head + * + * ll_rw_block() takes an array of pointers to &struct buffer_heads, and + * requests an I/O operation on them, either a %READ or a %WRITE. The third + * %READA option is described in the documentation for generic_make_request() + * which ll_rw_block() calls. + * + * This function drops any buffer that it cannot get a lock on (with the + * BH_Lock state bit), any buffer that appears to be clean when doing a write + * request, and any buffer that appears to be up-to-date when doing read + * request. Further it marks as clean buffers that are processed for + * writing (the buffer cache won't assume that they are actually clean + * until the buffer gets unlocked). + * + * ll_rw_block sets b_end_io to simple completion handler that marks + * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * any waiters. + * + * All of the buffers must be for the same device, and must also be a + * multiple of the current approved size for the device. 
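submit_bh() above maps a buffer_head onto a single-segment bio, and the only non-obvious line is the sector computation: bi_sector = b_blocknr * (b_size >> 9), i.e. the filesystem block number scaled into 512-byte sectors. A tiny userspace sketch of that conversion follows; the block sizes and numbers are assumed examples.

/* Illustrative sketch of submit_bh()'s block-to-sector scaling;
 * not part of this patch. */
#include <stdio.h>
#include <stdint.h>

static uint64_t blocknr_to_sector(uint64_t blocknr, unsigned int blocksize)
{
	/* bio sectors are always 512 bytes, whatever the fs block size */
	return blocknr * (blocksize >> 9);
}

int main(void)
{
	printf("block 100 @ 1KiB -> sector %llu\n",
	       (unsigned long long)blocknr_to_sector(100, 1024));	/* 200 */
	printf("block 100 @ 4KiB -> sector %llu\n",
	       (unsigned long long)blocknr_to_sector(100, 4096));	/* 800 */
	return 0;
}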
+ */ +void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (!trylock_buffer(bh)) + continue; + if (rw == WRITE) { + if (test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE, bh); + continue; + } + } else { + if (!buffer_uptodate(bh)) { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(rw, bh); + continue; + } + } + unlock_buffer(bh); + } +} +EXPORT_SYMBOL(ll_rw_block); + +void write_dirty_buffer(struct buffer_head *bh, int rw) +{ + lock_buffer(bh); + if (!test_clear_buffer_dirty(bh)) { + unlock_buffer(bh); + return; + } + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(rw, bh); +} +EXPORT_SYMBOL(write_dirty_buffer); + +/* + * For a data-integrity writeout, we need to wait upon any in-progress I/O + * and then start new I/O and then wait upon it. The caller must have a ref on + * the buffer_head. + */ +int __sync_dirty_buffer(struct buffer_head *bh, int rw) +{ + int ret = 0; + + WARN_ON(atomic_read(&bh->b_count) < 1); + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(rw, bh); + wait_on_buffer(bh); + if (!ret && !buffer_uptodate(bh)) + ret = -EIO; + } else { + unlock_buffer(bh); + } + return ret; +} +EXPORT_SYMBOL(__sync_dirty_buffer); + +int sync_dirty_buffer(struct buffer_head *bh) +{ + return __sync_dirty_buffer(bh, WRITE_SYNC); +} +EXPORT_SYMBOL(sync_dirty_buffer); + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and releases them if so. + * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the page or by holding its mapping's private_lock. + * + * If the page is dirty but all the buffers are clean then we need to + * be sure to mark the page clean as well. This is because the page + * may be against a block device, and a later reattachment of buffers + * to a dirty page will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem pages: if all the buffers are + * clean then we set the page clean and proceed. To do that, we require + * total exclusion from __set_page_dirty_buffers(). That is obtained with + * private_lock. + * + * try_to_free_buffers() is non-blocking. + */ +static inline int buffer_busy(struct buffer_head *bh) +{ + return atomic_read(&bh->b_count) | + (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); +} + +static int +drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh; + + bh = head; + do { + if (buffer_write_io_error(bh) && page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + if (buffer_busy(bh)) + goto failed; + bh = bh->b_this_page; + } while (bh != head); + + do { + struct buffer_head *next = bh->b_this_page; + + if (bh->b_assoc_map) + __remove_assoc_queue(bh); + bh = next; + } while (bh != head); + *buffers_to_free = head; + __clear_page_buffers(page); + return 1; +failed: + return 0; +} + +int try_to_free_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + struct buffer_head *buffers_to_free = NULL; + int ret = 0; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping == NULL) { /* can this still happen? 
*/ + ret = drop_buffers(page, &buffers_to_free); + goto out; + } + + spin_lock(&mapping->private_lock); + ret = drop_buffers(page, &buffers_to_free); + + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise the VM will never notice + * that the filesystem did any IO at all. + * + * Also, during truncate, discard_buffer will have marked all + * the page's buffers clean. We discover that here and clean + * the page also. + * + * private_lock must be held over this entire operation in order + * to synchronise against __set_page_dirty_buffers and prevent the + * dirty bit from being lost. + */ + if (ret) + cancel_dirty_page(page, PAGE_CACHE_SIZE); + spin_unlock(&mapping->private_lock); +out: + if (buffers_to_free) { + struct buffer_head *bh = buffers_to_free; + + do { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } while (bh != buffers_to_free); + } + return ret; +} +EXPORT_SYMBOL(try_to_free_buffers); + +/* + * There are no bdflush tunables left. But distributions are + * still running obsolete flush daemons, so we terminate them here. + * + * Use of bdflush() is deprecated and will be removed in a future kernel. + * The `flush-X' kernel threads fully replace bdflush daemons and this call. + */ +SYSCALL_DEFINE2(bdflush, int, func, long, data) +{ + static int msg_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the obsolete bdflush" + " system call\n", current->comm); + printk(KERN_INFO "Fix your initscripts?\n"); + } + + if (func == 1) + do_exit(0); + return 0; +} + +/* + * Buffer-head allocation + */ +static struct kmem_cache *bh_cachep; + +/* + * Once the number of bh's in the machine exceeds this level, we start + * stripping them in writeback. 
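The whole of try_to_free_buffers()/drop_buffers() above hinges on buffer_busy(): a buffer_head can be freed only if it has no elevated reference count and is neither dirty nor locked. Here is a minimal userspace sketch of that test; the BH_Dirty/BH_Lock bit positions are assumed values standing in for the kernel enum, which is not part of this patch.

/* Illustrative sketch of the buffer_busy() test used by drop_buffers();
 * not part of this patch.  Bit positions are assumed. */
#include <stdio.h>

#define BH_DIRTY	1	/* assumed position of BH_Dirty */
#define BH_LOCK		2	/* assumed position of BH_Lock */

struct fake_bh {
	int b_count;			/* stands in for the atomic b_count */
	unsigned long b_state;
};

static int buffer_busy(const struct fake_bh *bh)
{
	/* busy if referenced, dirty or locked; any of these blocks freeing */
	return bh->b_count ||
	       (bh->b_state & ((1UL << BH_DIRTY) | (1UL << BH_LOCK)));
}

int main(void)
{
	struct fake_bh clean = { 0, 0 };
	struct fake_bh dirty = { 0, 1UL << BH_DIRTY };
	struct fake_bh held  = { 1, 0 };

	printf("clean: %s\n", buffer_busy(&clean) ? "busy" : "freeable");
	printf("dirty: %s\n", buffer_busy(&dirty) ? "busy" : "freeable");
	printf("held : %s\n", buffer_busy(&held)  ? "busy" : "freeable");
	return 0;
}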
+ */ +static int max_buffer_heads; + +int buffer_heads_over_limit; + +struct bh_accounting { + int nr; /* Number of live bh's */ + int ratelimit; /* Limit cacheline bouncing */ +}; + +static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; + +static void recalc_bh_state(void) +{ + int i; + int tot = 0; + + if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) + return; + __this_cpu_write(bh_accounting.ratelimit, 0); + for_each_online_cpu(i) + tot += per_cpu(bh_accounting, i).nr; + buffer_heads_over_limit = (tot > max_buffer_heads); +} + +struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) +{ + struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); + if (ret) { + INIT_LIST_HEAD(&ret->b_assoc_buffers); + preempt_disable(); + __this_cpu_inc(bh_accounting.nr); + recalc_bh_state(); + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(alloc_buffer_head); + +void free_buffer_head(struct buffer_head *bh) +{ + BUG_ON(!list_empty(&bh->b_assoc_buffers)); + kmem_cache_free(bh_cachep, bh); + preempt_disable(); + __this_cpu_dec(bh_accounting.nr); + recalc_bh_state(); + preempt_enable(); +} +EXPORT_SYMBOL(free_buffer_head); + +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); + per_cpu(bh_accounting, cpu).nr = 0; +} + +static int buffer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + buffer_exit_cpu((unsigned long)hcpu); + return NOTIFY_OK; +} + +/** + * bh_uptodate_or_lock - Test whether the buffer is uptodate + * @bh: struct buffer_head + * + * Return true if the buffer is up-to-date and false, + * with the buffer locked, if not. + */ +int bh_uptodate_or_lock(struct buffer_head *bh) +{ + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + if (!buffer_uptodate(bh)) + return 0; + unlock_buffer(bh); + } + return 1; +} +EXPORT_SYMBOL(bh_uptodate_or_lock); + +/** + * bh_submit_read - Submit a locked buffer for reading + * @bh: struct buffer_head + * + * Returns zero on success and -EIO on error. + */ +int bh_submit_read(struct buffer_head *bh) +{ + BUG_ON(!buffer_locked(bh)); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} +EXPORT_SYMBOL(bh_submit_read); + +void __init buffer_init(void) +{ + int nrpages; + + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + NULL); + + /* + * Limit the bh occupancy to 10% of ZONE_NORMAL + */ + nrpages = (nr_free_buffer_pages() * 10) / 100; + max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); + hotcpu_notifier(buffer_cpu_notify, 0); +} diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig new file mode 100644 index 00000000..80e9c616 --- /dev/null +++ b/fs/cachefiles/Kconfig @@ -0,0 +1,39 @@ + +config CACHEFILES + tristate "Filesystem caching on files" + depends on FSCACHE && BLOCK + help + This permits use of a mounted filesystem as a cache for other + filesystems - primarily networking filesystems - thus allowing fast + local disk to enhance the speed of slower devices. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. 
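buffer_init() at the end of fs/buffer.c above caps buffer_head usage at roughly 10% of ZONE_NORMAL: it takes 10% of the free buffer pages and multiplies by how many buffer_heads fit in one page. The userspace sketch below reproduces that arithmetic with assumed numbers; on a real system the inputs come from nr_free_buffer_pages() and sizeof(struct buffer_head) at boot.

/* Illustrative sketch of the max_buffer_heads calculation in buffer_init();
 * not part of this patch.  All inputs are assumed example values. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long free_pages = 1UL << 18;	/* assume ~1GiB of buffer pages */
	unsigned long bh_size = 104;		/* assumed sizeof(struct buffer_head) */

	unsigned long nrpages = (free_pages * 10) / 100;	/* 10% cap */
	unsigned long max_buffer_heads = nrpages * (page_size / bh_size);

	printf("bh limit: %lu buffer_heads (%lu per page over %lu pages)\n",
	       max_buffer_heads, page_size / bh_size, nrpages);
	return 0;
}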
+ +config CACHEFILES_DEBUG + bool "Debug CacheFiles" + depends on CACHEFILES + help + This permits debugging to be dynamically enabled in the filesystem + caching on files module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/cachefiles/parameter/debug or + by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_HISTOGRAM + bool "Gather latency information on CacheFiles" + depends on CACHEFILES && PROC_FS + help + + This option causes latency information to be gathered on CacheFiles + operation and exported through file: + + /proc/fs/cachefiles/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile new file mode 100644 index 00000000..32cbab0f --- /dev/null +++ b/fs/cachefiles/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for caching in a mounted filesystem +# + +cachefiles-y := \ + bind.o \ + daemon.o \ + interface.o \ + key.o \ + main.o \ + namei.o \ + rdwr.o \ + security.o \ + xattr.o + +cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o + +obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c new file mode 100644 index 00000000..a2603e7c --- /dev/null +++ b/fs/cachefiles/bind.c @@ -0,0 +1,283 @@ +/* Bind and unbind a cache from the filesystem backing it + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); + +/* + * bind a directory as a cache + */ +int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) +{ + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + /* start by checking things over */ + ASSERT(cache->fstop_percent >= 0 && + cache->fstop_percent < cache->fcull_percent && + cache->fcull_percent < cache->frun_percent && + cache->frun_percent < 100); + + ASSERT(cache->bstop_percent >= 0 && + cache->bstop_percent < cache->bcull_percent && + cache->bcull_percent < cache->brun_percent && + cache->brun_percent < 100); + + if (*args) { + kerror("'bind' command doesn't take an argument"); + return -EINVAL; + } + + if (!cache->rootdirname) { + kerror("No cache directory specified"); + return -EINVAL; + } + + /* don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("Cache already bound"); + return -EBUSY; + } + + /* make sure we have copies of the tag and dirname strings */ + if (!cache->tag) { + /* the tag string is released by the fops->release() + * function, so we don't release it on error here */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + + /* add the cache */ + return cachefiles_daemon_add_cache(cache); +} + +/* + * add a cache + */ +static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) +{ + struct cachefiles_object *fsdef; + struct path path; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + return ret; + + cachefiles_begin_secure(cache, &saved_cred); + + /* allocate the root index object */ + ret = -ENOMEM; + + fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!fsdef) + goto error_root_object; + + ASSERTCMP(fsdef->backer, ==, NULL); + + atomic_set(&fsdef->usage, 1); + fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; + + _debug("- fsdef %p", fsdef); + + /* look up the directory at the root of the cache */ + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); + if (ret < 0) + goto error_open_root; + + cache->mnt = path.mnt; + root = path.dentry; + + /* check parameters */ + ret = -EOPNOTSUPP; + if (!root->d_inode || + !root->d_inode->i_op || + !root->d_inode->i_op->lookup || + !root->d_inode->i_op->mkdir || + !root->d_inode->i_op->setxattr || + !root->d_inode->i_op->getxattr || + !root->d_sb || + !root->d_sb->s_op || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs) + goto error_unsupported; + + ret = -EROFS; + if (root->d_sb->s_flags & MS_RDONLY) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(&path, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = 
stats.f_bsize; + cache->bshift = 0; + if (stats.f_bsize < PAGE_SIZE) + cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + stats.f_blocks >>= cache->bshift; + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache"); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + fsdef->dentry = cachedir; + fsdef->fscache.cookie = NULL; + + ret = cachefiles_check_object_type(fsdef); + if (ret < 0) + goto error_unsupported; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard"); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + + /* publish the cache */ + fscache_init_cache(&cache->cache, + &cachefiles_cache_ops, + "%s", + fsdef->dentry->d_sb->s_id); + + fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + + ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + printk(KERN_INFO "CacheFiles:" + " File cache on %s registered\n", + cache->cache.identifier); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + cachefiles_end_secure(cache, saved_cred); + return 0; + +error_add_cache: + dput(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + mntput(cache->mnt); + cache->mnt = NULL; + dput(fsdef->dentry); + fsdef->dentry = NULL; + dput(root); +error_open_root: + kmem_cache_free(cachefiles_object_jar, fsdef); +error_root_object: + cachefiles_end_secure(cache, saved_cred); + kerror("Failed to register: %d", ret); + return ret; +} + +/* + * unbind a cache on fd release + */ +void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) { + printk(KERN_INFO "CacheFiles:" + " File cache on %s unregistering\n", + cache->cache.identifier); + + fscache_withdraw_cache(&cache->cache); + } + + dput(cache->graveyard); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c new file mode 100644 index 00000000..0a1467b1 --- /dev/null +++ b/fs/cachefiles/daemon.c @@ -0,0 +1,752 @@ +/* Daemon interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int cachefiles_daemon_open(struct inode *, struct file *); +static int cachefiles_daemon_release(struct inode *, struct file *); +static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, + loff_t *); +static ssize_t cachefiles_daemon_write(struct file *, const char __user *, + size_t, loff_t *); +static unsigned int cachefiles_daemon_poll(struct file *, + struct poll_table_struct *); +static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_brun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_cull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_debug(struct cachefiles_cache *, char *); +static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); +static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); +static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); +static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); + +static unsigned long cachefiles_open; + +const struct file_operations cachefiles_daemon_fops = { + .owner = THIS_MODULE, + .open = cachefiles_daemon_open, + .release = cachefiles_daemon_release, + .read = cachefiles_daemon_read, + .write = cachefiles_daemon_write, + .poll = cachefiles_daemon_poll, + .llseek = noop_llseek, +}; + +struct cachefiles_daemon_cmd { + char name[8]; + int (*handler)(struct cachefiles_cache *cache, char *args); +}; + +static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { + { "bind", cachefiles_daemon_bind }, + { "brun", cachefiles_daemon_brun }, + { "bcull", cachefiles_daemon_bcull }, + { "bstop", cachefiles_daemon_bstop }, + { "cull", cachefiles_daemon_cull }, + { "debug", cachefiles_daemon_debug }, + { "dir", cachefiles_daemon_dir }, + { "frun", cachefiles_daemon_frun }, + { "fcull", cachefiles_daemon_fcull }, + { "fstop", cachefiles_daemon_fstop }, + { "inuse", cachefiles_daemon_inuse }, + { "secctx", cachefiles_daemon_secctx }, + { "tag", cachefiles_daemon_tag }, + { "", NULL } +}; + + +/* + * do various checks + */ +static int cachefiles_daemon_open(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache; + + _enter(""); + + /* only the superuser may do this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* the cachefiles device may only be open once at a time */ + if (xchg(&cachefiles_open, 1) == 1) + return -EBUSY; + + /* allocate a cache record */ + cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL); + if (!cache) { + cachefiles_open = 0; + return -ENOMEM; + } + + mutex_init(&cache->daemon_mutex); + cache->active_nodes = RB_ROOT; + rwlock_init(&cache->active_lock); + init_waitqueue_head(&cache->daemon_pollwq); + + /* set default caching limits + * - limit at 1% free space and/or 
free files + * - cull below 5% free space and/or free files + * - cease culling above 7% free space and/or free files + */ + cache->frun_percent = 7; + cache->fcull_percent = 5; + cache->fstop_percent = 1; + cache->brun_percent = 7; + cache->bcull_percent = 5; + cache->bstop_percent = 1; + + file->private_data = cache; + cache->cachefilesd = file; + return 0; +} + +/* + * release a cache + */ +static int cachefiles_daemon_release(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache = file->private_data; + + _enter(""); + + ASSERT(cache); + + set_bit(CACHEFILES_DEAD, &cache->flags); + + cachefiles_daemon_unbind(cache); + + ASSERT(!cache->active_nodes.rb_node); + + /* clean up the control file interface */ + cache->cachefilesd = NULL; + file->private_data = NULL; + cachefiles_open = 0; + + kfree(cache); + + _leave(""); + return 0; +} + +/* + * read the cache state + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + char buffer[256]; + int n; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + + /* summarise */ + clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + + n = snprintf(buffer, sizeof(buffer), + "cull=%c" + " frun=%llx" + " fcull=%llx" + " fstop=%llx" + " brun=%llx" + " bcull=%llx" + " bstop=%llx", + test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop, + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop + ); + + if (n > buflen) + return -EMSGSIZE; + + if (copy_to_user(_buffer, buffer, n) != 0) + return -EFAULT; + + return n; +} + +/* + * command the cache + */ +static ssize_t cachefiles_daemon_write(struct file *file, + const char __user *_data, + size_t datalen, + loff_t *pos) +{ + const struct cachefiles_daemon_cmd *cmd; + struct cachefiles_cache *cache = file->private_data; + ssize_t ret; + char *data, *args, *cp; + + //_enter(",,%zu,", datalen); + + ASSERT(cache); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + if (datalen < 0 || datalen > PAGE_SIZE - 1) + return -EOPNOTSUPP; + + /* drag the command string into the kernel so we can parse it */ + data = kmalloc(datalen + 1, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, _data, datalen) != 0) + goto error; + + data[datalen] = '\0'; + + ret = -EINVAL; + if (memchr(data, '\0', datalen)) + goto error; + + /* strip any newline */ + cp = memchr(data, '\n', datalen); + if (cp) { + if (cp == data) + goto error; + + *cp = '\0'; + } + + /* parse the command */ + ret = -EOPNOTSUPP; + + for (args = data; *args; args++) + if (isspace(*args)) + break; + if (*args) { + if (args == data) + goto error; + *args = '\0'; + args = skip_spaces(++args); + } + + /* run the appropriate command handler */ + for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++) + if (strcmp(cmd->name, data) == 0) + goto found_command; + +error: + kfree(data); + //_leave(" = %zd", ret); + return ret; + +found_command: + mutex_lock(&cache->daemon_mutex); + + ret = -EIO; + if (!test_bit(CACHEFILES_DEAD, &cache->flags)) + ret = cmd->handler(cache, args); + + mutex_unlock(&cache->daemon_mutex); + + if (ret == 0) + ret = datalen; + goto error; +} + +/* + * poll for culling 
state + * - use POLLOUT to indicate culling state + */ +static unsigned int cachefiles_daemon_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct cachefiles_cache *cache = file->private_data; + unsigned int mask; + + poll_wait(file, &cache->daemon_pollwq, poll); + mask = 0; + + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= POLLIN; + + if (test_bit(CACHEFILES_CULLING, &cache->flags)) + mask |= POLLOUT; + + return mask; +} + +/* + * give a range error for cache space constraints + * - can be tail-called + */ +static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, + char *args) +{ + kerror("Free space limits must be in range" + " 0%%<=stop%" + */ +static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) +{ + unsigned long frun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + frun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (frun <= cache->fcull_percent || frun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->frun_percent = frun; + return 0; +} + +/* + * set the percentage of files at which to start culling + * - command: "fcull %" + */ +static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long fcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fcull_percent = fcull; + return 0; +} + +/* + * set the percentage of files at which to stop allocating + * - command: "fstop %" + */ +static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long fstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fstop < 0 || fstop >= cache->fcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fstop_percent = fstop; + return 0; +} + +/* + * set the percentage of blocks at which to stop culling + * - command: "brun %" + */ +static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) +{ + unsigned long brun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + brun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (brun <= cache->bcull_percent || brun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->brun_percent = brun; + return 0; +} + +/* + * set the percentage of blocks at which to start culling + * - command: "bcull %" + */ +static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long bcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bcull_percent = bcull; + return 0; +} + +/* + * set the percentage of blocks at which to stop allocating + * - command: "bstop %" + */ +static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long bstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bstop = simple_strtoul(args, &args, 10); + if (args[0] != 
'%' || args[1] != '\0') + return -EINVAL; + + if (bstop < 0 || bstop >= cache->bcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bstop_percent = bstop; + return 0; +} + +/* + * set the cache directory + * - command: "dir " + */ +static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) +{ + char *dir; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty directory specified"); + return -EINVAL; + } + + if (cache->rootdirname) { + kerror("Second cache directory specified"); + return -EEXIST; + } + + dir = kstrdup(args, GFP_KERNEL); + if (!dir) + return -ENOMEM; + + cache->rootdirname = dir; + return 0; +} + +/* + * set the cache security context + * - command: "secctx " + */ +static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) +{ + char *secctx; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty security context specified"); + return -EINVAL; + } + + if (cache->secctx) { + kerror("Second security context specified"); + return -EINVAL; + } + + secctx = kstrdup(args, GFP_KERNEL); + if (!secctx) + return -ENOMEM; + + cache->secctx = secctx; + return 0; +} + +/* + * set the cache tag + * - command: "tag " + */ +static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) +{ + char *tag; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty tag specified"); + return -EINVAL; + } + + if (cache->tag) + return -EEXIST; + + tag = kstrdup(args, GFP_KERNEL); + if (!tag) + return -ENOMEM; + + cache->tag = tag; + return 0; +} + +/* + * request a node in the cache be culled from the current working directory + * - command: "cull " + */ +static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) +{ + struct path path; + const struct cred *saved_cred; + int ret; + + _enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("cull applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + kerror("cull applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + get_fs_pwd(current->fs, &path); + + if (!S_ISDIR(path.dentry->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_cull(cache, path.dentry, args); + cachefiles_end_secure(cache, saved_cred); + + path_put(&path); + _leave(" = %d", ret); + return ret; + +notdir: + path_put(&path); + kerror("cull command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("cull command requires dirfd and filename"); + return -EINVAL; +} + +/* + * set debugging mode + * - command: "debug " + */ +static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) +{ + unsigned long mask; + + _enter(",%s", args); + + mask = simple_strtoul(args, &args, 0); + if (args[0] != '\0') + goto inval; + + cachefiles_debug = mask; + _leave(" = 0"); + return 0; + +inval: + kerror("debug command requires mask"); + return -EINVAL; +} + +/* + * find out whether an object in the current working directory is in use or not + * - command: "inuse " + */ +static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) +{ + struct path path; + const struct cred *saved_cred; + int ret; + + //_enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("inuse applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + 
kerror("inuse applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + get_fs_pwd(current->fs, &path); + + if (!S_ISDIR(path.dentry->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_in_use(cache, path.dentry, args); + cachefiles_end_secure(cache, saved_cred); + + path_put(&path); + //_leave(" = %d", ret); + return ret; + +notdir: + path_put(&path); + kerror("inuse command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("inuse command requires dirfd and filename"); + return -EINVAL; +} + +/* + * see if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr) +{ + struct kstatfs stats; + struct path path = { + .mnt = cache->mnt, + .dentry = cache->mnt->mnt_root, + }; + int ret; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(&path, &stats); + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + stats.f_bavail >>= cache->bshift; + + //_debug("avail %llu,%llu", + // (unsigned long long) stats.f_ffree, + // (unsigned long long) stats.f_bavail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (stats.f_bavail > bnr) + stats.f_bavail -= bnr; + else + stats.f_bavail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + stats.f_bavail < cache->bstop) + goto begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + stats.f_bavail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + stats.f_bavail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c new file mode 100644 index 00000000..1064805e --- /dev/null +++ b/fs/cachefiles/interface.c @@ -0,0 +1,470 @@ +/* FS-Cache interface to CacheFiles + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include "internal.h" + +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) + +struct cachefiles_lookup_data { + struct cachefiles_xattr *auxdata; /* auxiliary data */ + char *key; /* key path */ +}; + +static int cachefiles_attr_changed(struct fscache_object *_object); + +/* + * allocate an object record for a cookie lookup and prepare the lookup data + */ +static struct fscache_object *cachefiles_alloc_object( + struct fscache_cache *_cache, + struct fscache_cookie *cookie) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_xattr *auxdata; + unsigned keylen, auxlen; + void *buffer; + char *key; + + cache = container_of(_cache, struct cachefiles_cache, cache); + + _enter("{%s},%p,", cache->cache.identifier, cookie); + + lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); + if (!lookup_data) + goto nomem_lookup_data; + + /* create a new object record and a temporary leaf image */ + object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!object) + goto nomem_object; + + ASSERTCMP(object->backer, ==, NULL); + + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + atomic_set(&object->usage, 1); + + fscache_object_init(&object->fscache, cookie, &cache->cache); + + object->type = cookie->def->type; + + /* get hold of the raw key + * - stick the length on the front and leave space on the back for the + * encoder + */ + buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); + if (!buffer) + goto nomem_buffer; + + keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); + ASSERTCMP(keylen, <, 512); + + *(uint16_t *)buffer = keylen; + ((char *)buffer)[keylen + 2] = 0; + ((char *)buffer)[keylen + 3] = 0; + ((char *)buffer)[keylen + 4] = 0; + + /* turn the raw key into something that can work with as a filename */ + key = cachefiles_cook_key(buffer, keylen + 2, object->type); + if (!key) + goto nomem_key; + + /* get hold of the auxiliary data and prepend the object type */ + auxdata = buffer; + auxlen = 0; + if (cookie->def->get_aux) { + auxlen = cookie->def->get_aux(cookie->netfs_data, + auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + } + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + lookup_data->auxdata = auxdata; + lookup_data->key = key; + object->lookup_data = lookup_data; + + _leave(" = %p [%p]", &object->fscache, lookup_data); + return &object->fscache; + +nomem_key: + kfree(buffer); +nomem_buffer: + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(&cache->cache); +nomem_object: + kfree(lookup_data); +nomem_lookup_data: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * attempt to look up the nominated node in this cache + * - return -ETIMEDOUT to be scheduled again + */ +static int cachefiles_lookup_object(struct fscache_object *_object) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *parent, *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", _object->debug_id); + + cache = container_of(_object->cache, struct cachefiles_cache, cache); + parent = container_of(_object->parent, + struct cachefiles_object, fscache); + object = container_of(_object, struct cachefiles_object, fscache); + lookup_data = object->lookup_data; + + ASSERTCMP(lookup_data, !=, NULL); + + /* look up the key, creating any missing bits */ + 
cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_walk_to_object(parent, object, + lookup_data->key, + lookup_data->auxdata); + cachefiles_end_secure(cache, saved_cred); + + /* polish off by setting the attributes of non-index files */ + if (ret == 0 && + object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + cachefiles_attr_changed(&object->fscache); + + if (ret < 0 && ret != -ETIMEDOUT) { + if (ret != -ENOBUFS) + printk(KERN_WARNING + "CacheFiles: Lookup failed error %d\n", ret); + fscache_object_lookup_error(&object->fscache); + } + + _leave(" [%d]", ret); + return ret; +} + +/* + * indication of lookup completion + */ +static void cachefiles_lookup_complete(struct fscache_object *_object) +{ + struct cachefiles_object *object; + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } +} + +/* + * increment the usage count on an inode object (may fail if unmounting) + */ +static +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +{ + struct cachefiles_object *object = + container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + atomic_inc(&object->usage); + return &object->fscache; +} + +/* + * update the auxiliary data for an object object on disk + */ +static void cachefiles_update_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_xattr *auxdata; + struct cachefiles_cache *cache; + struct fscache_cookie *cookie; + const struct cred *saved_cred; + unsigned auxlen; + + _enter("{OBJ%x}", _object->debug_id); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, struct cachefiles_cache, + cache); + cookie = object->fscache.cookie; + + if (!cookie->def->get_aux) { + _leave(" [no aux]"); + return; + } + + auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); + if (!auxdata) { + _leave(" [nomem]"); + return; + } + + auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_update_object_xattr(object, auxdata); + cachefiles_end_secure(cache, saved_cred); + kfree(auxdata); + _leave(""); +} + +/* + * discard the resources pinned by an object and effect retirement if + * requested + */ +static void cachefiles_drop_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + /* delete retired objects */ + if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && + _object != cache->cache.fsdef + ) { + _debug("- retire object OBJ%x", object->fscache.debug_id); + cachefiles_begin_secure(cache, 
&saved_cred); + cachefiles_delete_object(cache, object); + cachefiles_end_secure(cache, saved_cred); + } + + /* close the filesystem stuff attached to the object */ + if (object->backer != object->dentry) + dput(object->backer); + object->backer = NULL; + + /* note that the object is now inactive */ + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + write_lock(&cache->active_lock); + if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, + &object->flags)) + BUG(); + rb_erase(&object->active_node, &cache->active_nodes); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + } + + dput(object->dentry); + object->dentry = NULL; + + _leave(""); +} + +/* + * dispose of a reference to an object + */ +static void cachefiles_put_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct fscache_cache *cache; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + ASSERTIFCMP(object->fscache.parent, + object->fscache.parent->n_children, >, 0); + + if (atomic_dec_and_test(&object->usage)) { + _debug("- kill object OBJ%x", object->fscache.debug_id); + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + ASSERTCMP(object->fscache.parent, ==, NULL); + ASSERTCMP(object->backer, ==, NULL); + ASSERTCMP(object->dentry, ==, NULL); + ASSERTCMP(object->fscache.n_ops, ==, 0); + ASSERTCMP(object->fscache.n_children, ==, 0); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } + + cache = object->fscache.cache; + fscache_object_destroy(&object->fscache); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(cache); + } + + _leave(""); +} + +/* + * sync a cache + */ +static void cachefiles_sync_cache(struct fscache_cache *_cache) +{ + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("%p", _cache); + + cache = container_of(_cache, struct cachefiles_cache, cache); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); + ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock" + " returned error %d", + ret); +} + +/* + * notification the attributes on an object have changed + * - called with reads/writes excluded by FS-Cache + */ +static int cachefiles_attr_changed(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct iattr newattrs; + uint64_t ni_size; + loff_t oi_size; + int ret; + + _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + + _enter("{OBJ%x},[%llu]", + _object->debug_id, (unsigned long long) ni_size); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (ni_size == object->i_size) + return 0; + + if (!object->backer) + return -ENOBUFS; + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + 
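+	/* record the netfs's idea of the object size as the store limit
+	 * before adjusting the backing file's i_size below */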
fscache_set_store_limit(&object->fscache, ni_size); + + oi_size = i_size_read(object->backer->d_inode); + if (oi_size == ni_size) + return 0; + + cachefiles_begin_secure(cache, &saved_cred); + mutex_lock(&object->backer->d_inode->i_mutex); + + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = notify_change(object->backer, &newattrs); + if (ret < 0) + goto truncate_failed; + } + + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; + ret = notify_change(object->backer, &newattrs); + +truncate_failed: + mutex_unlock(&object->backer->d_inode->i_mutex); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) { + fscache_set_store_limit(&object->fscache, 0); + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * dissociate a cache from all the pages it was backing + */ +static void cachefiles_dissociate_pages(struct fscache_cache *cache) +{ + _enter(""); +} + +const struct fscache_cache_ops cachefiles_cache_ops = { + .name = "cachefiles", + .alloc_object = cachefiles_alloc_object, + .lookup_object = cachefiles_lookup_object, + .lookup_complete = cachefiles_lookup_complete, + .grab_object = cachefiles_grab_object, + .update_object = cachefiles_update_object, + .drop_object = cachefiles_drop_object, + .put_object = cachefiles_put_object, + .sync_cache = cachefiles_sync_cache, + .attr_changed = cachefiles_attr_changed, + .read_or_alloc_page = cachefiles_read_or_alloc_page, + .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .allocate_page = cachefiles_allocate_page, + .allocate_pages = cachefiles_allocate_pages, + .write_page = cachefiles_write_page, + .uncache_page = cachefiles_uncache_page, + .dissociate_pages = cachefiles_dissociate_pages, +}; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h new file mode 100644 index 00000000..bd6bc1bd --- /dev/null +++ b/fs/cachefiles/internal.h @@ -0,0 +1,354 @@ +/* General netfs cache on cache files internal defs + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +struct cachefiles_cache; +struct cachefiles_object; + +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + +/* + * node records + */ +struct cachefiles_object { + struct fscache_object fscache; /* fscache handle */ + struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ + struct dentry *dentry; /* the file/dir representing this object */ + struct dentry *backer; /* backing file */ + loff_t i_size; /* object size */ + unsigned long flags; +#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ +#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ + atomic_t usage; /* object usage count */ + uint8_t type; /* object type */ + uint8_t new; /* T if object new */ + spinlock_t work_lock; + struct rb_node active_node; /* link in active tree (dentry is key) */ +}; + +extern struct kmem_cache *cachefiles_object_jar; + +/* + * Cache files cache definition + */ +struct cachefiles_cache { + struct fscache_cache cache; /* FS-Cache record */ + struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *graveyard; /* directory into which dead objects go */ + struct file *cachefilesd; /* manager daemon handle */ + const struct cred *cache_cred; /* security override for accessing cache */ + struct mutex daemon_mutex; /* command serialisation mutex */ + wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ + struct rb_root active_nodes; /* active nodes (can't be culled) */ + rwlock_t active_lock; /* lock for active_nodes */ + atomic_t gravecounter; /* graveyard uniquifier */ + unsigned frun_percent; /* when to stop culling (% files) */ + unsigned fcull_percent; /* when to start culling (% files) */ + unsigned fstop_percent; /* when to stop allocating (% files) */ + unsigned brun_percent; /* when to stop culling (% blocks) */ + unsigned bcull_percent; /* when to start culling (% blocks) */ + unsigned bstop_percent; /* when to stop allocating (% blocks) */ + unsigned bsize; /* cache's block size */ + unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + uint64_t frun; /* when to stop culling */ + uint64_t fcull; /* when to start culling */ + uint64_t fstop; /* when to stop allocating */ + sector_t brun; /* when to stop culling */ + sector_t bcull; /* when to start culling */ + sector_t bstop; /* when to stop allocating */ + unsigned long flags; +#define CACHEFILES_READY 0 /* T if cache prepared */ +#define CACHEFILES_DEAD 1 /* T if cache dead */ +#define CACHEFILES_CULLING 2 /* T if cull engaged */ +#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ + char *rootdirname; /* name of cache root directory */ + char *secctx; /* LSM security context */ + char *tag; /* cache binding tag */ +}; + +/* + * backing file read tracking + */ +struct cachefiles_one_read { + wait_queue_t monitor; /* link into monitored waitqueue */ + struct page *back_page; /* backing file page we're waiting for */ + struct page *netfs_page; /* netfs page we're going to fill */ + struct fscache_retrieval *op; /* retrieval op covering this */ + struct list_head op_link; /* link in op's todo list */ +}; + +/* + * backing file write tracking + */ +struct cachefiles_one_write { + struct page *netfs_page; /* netfs page to copy */ + struct cachefiles_object *object; + struct list_head obj_link; /* link in object's lists */ + fscache_rw_complete_t end_io_func; + void *context; +}; + +/* + * auxiliary data xattr buffer 
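+ * - len covers the type byte plus the netfs auxiliary data held in
+ *   data[] (filled in by cachefiles_alloc_object() and
+ *   cachefiles_update_object())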
+ */ +struct cachefiles_xattr { + uint16_t len; + uint8_t type; + uint8_t data[]; +}; + +/* + * note change of state for daemon + */ +static inline void cachefiles_state_changed(struct cachefiles_cache *cache) +{ + set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + wake_up_all(&cache->daemon_pollwq); +} + +/* + * bind.c + */ +extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); +extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); + +/* + * daemon.c + */ +extern const struct file_operations cachefiles_daemon_fops; + +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr); + +/* + * interface.c + */ +extern const struct fscache_cache_ops cachefiles_cache_ops; + +/* + * key.c + */ +extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); + +/* + * namei.c + */ +extern int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object); +extern int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata); +extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *name); + +extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename); + +extern int cachefiles_check_in_use(struct cachefiles_cache *cache, + struct dentry *dir, char *filename); + +/* + * proc.c + */ +#ifdef CONFIG_CACHEFILES_HISTOGRAM +extern atomic_t cachefiles_lookup_histogram[HZ]; +extern atomic_t cachefiles_mkdir_histogram[HZ]; +extern atomic_t cachefiles_create_histogram[HZ]; + +extern int __init cachefiles_proc_init(void); +extern void cachefiles_proc_cleanup(void); +static inline +void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +#else +#define cachefiles_proc_init() (0) +#define cachefiles_proc_cleanup() do {} while (0) +#define cachefiles_hist(hist, start_jif) do {} while (0) +#endif + +/* + * rdwr.c + */ +extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, + struct page *, gfp_t); +extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, + gfp_t); +extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, + gfp_t); +extern int cachefiles_allocate_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, gfp_t); +extern int cachefiles_write_page(struct fscache_storage *, struct page *); +extern void cachefiles_uncache_page(struct fscache_object *, struct page *); + +/* + * security.c + */ +extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); +extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred); + +static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, + const struct cred **_saved_cred) +{ + *_saved_cred = override_creds(cache->cache_cred); +} + +static inline void cachefiles_end_secure(struct cachefiles_cache *cache, + const struct cred *saved_cred) +{ + revert_creds(saved_cred); +} + +/* + * xattr.c + */ +extern int cachefiles_check_object_type(struct cachefiles_object *object); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_update_object_xattr(struct cachefiles_object *object, 
+ struct cachefiles_xattr *auxdata); +extern int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry); + + +/* + * error handling + */ +#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) + +#define cachefiles_io_error(___cache, FMT, ...) \ +do { \ + kerror("I/O Error: " FMT, ##__VA_ARGS__); \ + fscache_io_error(&(___cache)->cache); \ + set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ +} while (0) + +#define cachefiles_io_error_obj(object, FMT, ...) \ +do { \ + struct cachefiles_cache *___cache; \ + \ + ___cache = container_of((object)->fscache.cache, \ + struct cachefiles_cache, cache); \ + cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ +} while (0) + + +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_CACHEFILES_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) +#endif + +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c new file mode 100644 index 00000000..81b8b2b3 --- /dev/null +++ b/fs/cachefiles/key.c @@ -0,0 +1,159 @@ +/* Key to pathname encoder + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include "internal.h" + +static const char cachefiles_charmap[64] = + "0123456789" /* 0 - 9 */ + "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ + "_-" /* 62 - 63 */ + ; + +static const char cachefiles_filecharmap[256] = { + /* we skip space and tab and control chars */ + [33 ... 46] = 1, /* '!' -> '.' */ + /* we skip '/' as it's significant to pathwalk */ + [48 ... 127] = 1, /* '0' -> '~' */ +}; + +/* + * turn the raw key into something cooked + * - the raw key should include the length in the two bytes at the front + * - the key may be up to 514 bytes in length (including the length word) + * - "base64" encode the strange keys, mapping 3 bytes of raw to four of + * cooked + * - need to cut the cooked key into 252 char lengths (189 raw bytes) + */ +char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +{ + unsigned char csum, ch; + unsigned int acc; + char *key; + int loop, len, max, seg, mark, print; + + _enter(",%d", keylen); + + BUG_ON(keylen < 2 || keylen > 514); + + csum = raw[0] + raw[1]; + print = 1; + for (loop = 2; loop < keylen; loop++) { + ch = raw[loop]; + csum += ch; + print &= cachefiles_filecharmap[ch]; + } + + if (print) { + /* if the path is usable ASCII, then we render it directly */ + max = keylen - 2; + max += 2; /* two base64'd length chars on the front */ + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 251) / 252) = 3 + */ + max += 1; /* NUL on end */ + } else { + /* calculate the maximum length of the cooked key */ + keylen = (keylen + 2) / 3; + + max = keylen * 4; + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 188) / 189) = 3 + */ + max += 1; /* NUL on end */ + } + + max += 1; /* 2nd NUL on end */ + + _debug("max: %d", max); + + key = kmalloc(max, GFP_KERNEL); + if (!key) + return NULL; + + len = 0; + + /* build the cooked key */ + sprintf(key, "@%02x%c+", (unsigned) csum, 0); + len = 5; + mark = len - 1; + + if (print) { + acc = *(uint16_t *) raw; + raw += 2; + + key[len + 1] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len] = cachefiles_charmap[acc & 63]; + len += 2; + + seg = 250; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + key[len++] = *raw++; + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; + default: type = 'S'; break; + } + } else { + seg = 252; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + acc = *raw++; + acc |= *raw++ << 8; + acc |= *raw++ << 16; + + _debug("acc: %06x", acc); + + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; + default: type = 'T'; 
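+		/* 'J', 'E' and 'T' correspond to 'I', 'D' and 'S' above,
+		 * but mark a base64-encoded key */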
break; + } + } + + key[mark] = type; + key[len++] = 0; + key[len] = 0; + + _leave(" = %p %d", key, len); + return key; +} diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c new file mode 100644 index 00000000..4bfa8cf4 --- /dev/null +++ b/fs/cachefiles/main.c @@ -0,0 +1,106 @@ +/* Network filesystem caching backend to use cache files on a premounted + * filesystem + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned cachefiles_debug; +module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); + +MODULE_DESCRIPTION("Mounted-filesystem based cache"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +struct kmem_cache *cachefiles_object_jar; + +static struct miscdevice cachefiles_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cachefiles", + .fops = &cachefiles_daemon_fops, +}; + +static void cachefiles_object_init_once(void *_object) +{ + struct cachefiles_object *object = _object; + + memset(object, 0, sizeof(*object)); + spin_lock_init(&object->work_lock); +} + +/* + * initialise the fs caching module + */ +static int __init cachefiles_init(void) +{ + int ret; + + ret = misc_register(&cachefiles_dev); + if (ret < 0) + goto error_dev; + + /* create an object jar */ + ret = -ENOMEM; + cachefiles_object_jar = + kmem_cache_create("cachefiles_object_jar", + sizeof(struct cachefiles_object), + 0, + SLAB_HWCACHE_ALIGN, + cachefiles_object_init_once); + if (!cachefiles_object_jar) { + printk(KERN_NOTICE + "CacheFiles: Failed to allocate an object jar\n"); + goto error_object_jar; + } + + ret = cachefiles_proc_init(); + if (ret < 0) + goto error_proc; + + printk(KERN_INFO "CacheFiles: Loaded\n"); + return 0; + +error_proc: + kmem_cache_destroy(cachefiles_object_jar); +error_object_jar: + misc_deregister(&cachefiles_dev); +error_dev: + kerror("failed to register: %d", ret); + return ret; +} + +fs_initcall(cachefiles_init); + +/* + * clean up on module removal + */ +static void __exit cachefiles_exit(void) +{ + printk(KERN_INFO "CacheFiles: Unloading\n"); + + cachefiles_proc_cleanup(); + kmem_cache_destroy(cachefiles_object_jar); + misc_deregister(&cachefiles_dev); +} + +module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c new file mode 100644 index 00000000..a0358c21 --- /dev/null +++ b/fs/cachefiles/namei.c @@ -0,0 +1,988 @@ +/* CacheFiles path walking and related routines + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define CACHEFILES_KEYBUF_SIZE 512 + +/* + * dump debugging info about an object + */ +static noinline +void __cachefiles_printk_object(struct cachefiles_object *object, + const char *prefix, + u8 *keybuf) +{ + struct fscache_cookie *cookie; + unsigned keylen, loop; + + printk(KERN_ERR "%sobject: OBJ%x\n", + prefix, object->fscache.debug_id); + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", + prefix, fscache_object_states[object->fscache.state], + object->fscache.flags, work_busy(&object->fscache.work), + object->fscache.events, + object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); + printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", + prefix, object->fscache.n_ops, object->fscache.n_in_progress, + object->fscache.n_exclusive); + printk(KERN_ERR "%sparent=%p\n", + prefix, object->fscache.parent); + + spin_lock(&object->fscache.lock); + cookie = object->fscache.cookie; + if (cookie) { + printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", + prefix, + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + if (keybuf) + keylen = cookie->def->get_key(cookie->netfs_data, keybuf, + CACHEFILES_KEYBUF_SIZE); + else + keylen = 0; + } else { + printk(KERN_ERR "%scookie=NULL\n", prefix); + keylen = 0; + } + spin_unlock(&object->fscache.lock); + + if (keylen) { + printk(KERN_ERR "%skey=[%u] '", prefix, keylen); + for (loop = 0; loop < keylen; loop++) + printk("%02x", keybuf[loop]); + printk("'\n"); + } +} + +/* + * dump debugging info about a pair of objects + */ +static noinline void cachefiles_printk_object(struct cachefiles_object *object, + struct cachefiles_object *xobject) +{ + u8 *keybuf; + + keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); + if (object) + __cachefiles_printk_object(object, "", keybuf); + if (xobject) + __cachefiles_printk_object(xobject, "x", keybuf); + kfree(keybuf); +} + +/* + * mark the owner of a dentry, if there is one, to indicate that that dentry + * has been preemptively deleted + * - the caller must hold the i_mutex on the dentry's parent as required to + * call vfs_unlink(), vfs_rmdir() or vfs_rename() + */ +static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + struct cachefiles_object *object; + struct rb_node *p; + + _enter(",'%*.*s'", + dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); + + write_lock(&cache->active_lock); + + p = cache->active_nodes.rb_node; + while (p) { + object = rb_entry(p, struct cachefiles_object, active_node); + if (object->dentry > dentry) + p = p->rb_left; + else if (object->dentry < dentry) + p = p->rb_right; + else + goto found_dentry; + } + + write_unlock(&cache->active_lock); + _leave(" [no owner]"); + return; + + /* found the dentry for */ +found_dentry: + kdebug("preemptive burial: OBJ%x [%s] %p", + object->fscache.debug_id, + fscache_object_states[object->fscache.state], + dentry); + + if (object->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Can't preemptively bury live object\n"); + cachefiles_printk_object(object, NULL); + } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error:" + " Object already preemptively buried\n"); + } + + write_unlock(&cache->active_lock); + _leave(" [owner marked]"); +} + +/* + * record the 
fact that an object is now active + */ +static int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_object *xobject; + struct rb_node **_p, *_parent = NULL; + struct dentry *dentry; + + _enter(",%p", object); + +try_again: + write_lock(&cache->active_lock); + + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error: Object already active\n"); + cachefiles_printk_object(object, NULL); + BUG(); + } + + dentry = object->dentry; + _p = &cache->active_nodes.rb_node; + while (*_p) { + _parent = *_p; + xobject = rb_entry(_parent, + struct cachefiles_object, active_node); + + ASSERT(xobject != object); + + if (xobject->dentry > dentry) + _p = &(*_p)->rb_left; + else if (xobject->dentry < dentry) + _p = &(*_p)->rb_right; + else + goto wait_for_old_object; + } + + rb_link_node(&object->active_node, _parent, _p); + rb_insert_color(&object->active_node, &cache->active_nodes); + + write_unlock(&cache->active_lock); + _leave(" = 0"); + return 0; + + /* an old object from a previous incarnation is hogging the slot - we + * need to wait for it to be destroyed */ +wait_for_old_object: + if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Unexpected object collision\n"); + cachefiles_printk_object(object, xobject); + BUG(); + } + atomic_inc(&xobject->usage); + write_unlock(&cache->active_lock); + + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + wait_queue_head_t *wq; + + signed long timeout = 60 * HZ; + wait_queue_t wait; + bool requeue; + + /* if the object we're waiting for is queued for processing, + * then just put ourselves on the queue behind it */ + if (work_pending(&xobject->fscache.work)) { + _debug("queue OBJ%x behind OBJ%x immediately", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + /* otherwise we sleep until either the object we're waiting for + * is done, or the fscache_object is congested */ + wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); + init_wait(&wait); + requeue = false; + do { + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) + break; + + requeue = fscache_object_sleep_till_congested(&timeout); + } while (timeout > 0 && !requeue); + finish_wait(wq, &wait); + + if (requeue && + test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + _debug("queue OBJ%x behind OBJ%x after wait", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + if (timeout <= 0) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error: Overlong" + " wait for old active object to go away\n"); + cachefiles_printk_object(object, xobject); + goto requeue; + } + } + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); + + cache->cache.ops->put_object(&xobject->fscache); + goto try_again; + +requeue: + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + cache->cache.ops->put_object(&xobject->fscache); + _leave(" = -ETIMEDOUT"); + return -ETIMEDOUT; +} + +/* + * delete an object representation from the cache + * - file backed objects are unlinked + * - directory backed objects are stuffed into the graveyard for userspace to + * delete + * - unlocks the directory mutex + */ +static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct dentry *dir, + struct dentry *rep, + bool preemptive) +{ + struct dentry *grave, *trap; + struct path path, 
path_to_graveyard; + char nbuffer[8 + 8 + 1]; + int ret; + + _enter(",'%*.*s','%*.*s'", + dir->d_name.len, dir->d_name.len, dir->d_name.name, + rep->d_name.len, rep->d_name.len, rep->d_name.name); + + _debug("remove %p from %p", rep, dir); + + /* non-directories can just be unlinked */ + if (!S_ISDIR(rep->d_inode->i_mode)) { + _debug("unlink stale object"); + + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_unlink(&path, rep); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + } else { + ret = vfs_unlink(dir->d_inode, rep); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } + + mutex_unlock(&dir->d_inode->i_mutex); + + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + + _leave(" = %d", ret); + return ret; + } + + /* directories have to be moved to the graveyard */ + _debug("move stale object to graveyard"); + mutex_unlock(&dir->d_inode->i_mutex); + +try_again: + /* first step is to make up a grave dentry in the graveyard */ + sprintf(nbuffer, "%08x%08x", + (uint32_t) get_seconds(), + (uint32_t) atomic_inc_return(&cache->gravecounter)); + + /* do the multiway lock magic */ + trap = lock_rename(cache->graveyard, dir); + + /* do some checks before getting the grave dentry */ + if (rep->d_parent != dir) { + /* the entry was probably culled when we dropped the parent dir + * lock */ + unlock_rename(cache->graveyard, dir); + _leave(" = 0 [culled?]"); + return 0; + } + + if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Graveyard no longer a directory"); + return -EIO; + } + + if (trap == rep) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + if (d_mountpoint(rep)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Mountpoint in cache"); + return -EIO; + } + + grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + if (IS_ERR(grave)) { + unlock_rename(cache->graveyard, dir); + + if (PTR_ERR(grave) == -ENOMEM) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + cachefiles_io_error(cache, "Lookup error %ld", + PTR_ERR(grave)); + return -EIO; + } + + if (grave->d_inode) { + unlock_rename(cache->graveyard, dir); + dput(grave); + grave = NULL; + cond_resched(); + goto try_again; + } + + if (d_mountpoint(grave)) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "Mountpoint in graveyard"); + return -EIO; + } + + /* target should not be an ancestor of source */ + if (trap == grave) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + /* attempt the rename */ + path.mnt = cache->mnt; + path.dentry = dir; + path_to_graveyard.mnt = cache->mnt; + path_to_graveyard.dentry = cache->graveyard; + ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + if (ret < 0) { + cachefiles_io_error(cache, "Rename security error %d", ret); + } else { + ret = vfs_rename(dir->d_inode, rep, + cache->graveyard->d_inode, grave); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, + "Rename failed with error %d", ret); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } + + unlock_rename(cache->graveyard, dir); + dput(grave); + _leave(" = 0"); + return 0; +} + +/* + * delete an object representation from the cache + */ +int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object) 
+{ + struct dentry *dir; + int ret; + + _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); + + ASSERT(object->dentry); + ASSERT(object->dentry->d_inode); + ASSERT(object->dentry->d_parent); + + dir = dget_parent(object->dentry); + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + + if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + /* object allocation for the same key preemptively deleted this + * object's file so that it could create its own file */ + _debug("object preemptively buried"); + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } else { + /* we need to check that our parent is _still_ our parent - it + * may have been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, + object->dentry, false); + } else { + /* it got moved, presumably by cachefilesd culling it, + * so it's no longer in the key path and we can ignore + * it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } + } + + dput(dir); + _leave(" = %d", ret); + return ret; +} + +/* + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go + */ +int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_cache *cache; + struct dentry *dir, *next = NULL; + struct path path; + unsigned long start; + const char *name; + int ret, nlen; + + _enter("OBJ%x{%p},OBJ%x,%s,", + parent->fscache.debug_id, parent->dentry, + object->fscache.debug_id, key); + + cache = container_of(parent->fscache.cache, + struct cachefiles_cache, cache); + path.mnt = cache->mnt; + + ASSERT(parent->dentry); + ASSERT(parent->dentry->d_inode); + + if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { + // TODO: convert file to dir + _leave("looking up in none directory"); + return -ENOBUFS; + } + + dir = dget(parent->dentry); + +advance: + /* attempt to transit the first directory component */ + name = key; + nlen = strlen(key); + + /* key ends in a double NUL */ + key = key + nlen + 1; + if (!*key) + key = NULL; + +lookup_again: + /* search the current directory for the element name */ + _debug("lookup '%s'", name); + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + + start = jiffies; + next = lookup_one_len(name, dir, nlen); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(next)) + goto lookup_error; + + _debug("next -> %p %s", next, next->d_inode ? 
"positive" : "negative"); + + if (!key) + object->new = !next->d_inode; + + /* if this element of the path doesn't exist, then the lookup phase + * failed, and we can release any readers in the certain knowledge that + * there's nothing for them to actually read */ + if (!next->d_inode) + fscache_object_lookup_negative(&object->fscache); + + /* we need to create the object if it's negative */ + if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { + /* index objects and intervening tree levels must be subdirs */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + path.dentry = dir; + ret = security_path_mkdir(&path, next, 0); + if (ret < 0) + goto create_error; + start = jiffies; + ret = vfs_mkdir(dir->d_inode, next, 0); + cachefiles_hist(cachefiles_mkdir_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode)) { + kerror("inode %lu is not a directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + + } else { + /* non-index objects start out life as files */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + path.dentry = dir; + ret = security_path_mknod(&path, next, S_IFREG, 0); + if (ret < 0) + goto create_error; + start = jiffies; + ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); + cachefiles_hist(cachefiles_create_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("create -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode) && + !S_ISREG(next->d_inode->i_mode) + ) { + kerror("inode %lu is not a file or directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + } + + /* process the next component */ + if (key) { + _debug("advance"); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = next; + next = NULL; + goto advance; + } + + /* we've found the object we were looking for */ + object->dentry = next; + + /* if we've found that the terminal object exists, then we need to + * check its attributes and delete it if it's out of date */ + if (!object->new) { + _debug("validate '%*.*s'", + next->d_name.len, next->d_name.len, next->d_name.name); + + ret = cachefiles_check_object_xattr(object, auxdata); + if (ret == -ESTALE) { + /* delete the object (the deleter drops the directory + * mutex) */ + object->dentry = NULL; + + ret = cachefiles_bury_object(cache, dir, next, true); + dput(next); + next = NULL; + + if (ret < 0) + goto delete_error; + + _debug("redo lookup"); + goto lookup_again; + } + } + + /* note that we're now using this object */ + ret = cachefiles_mark_object_active(cache, object); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = NULL; + + if (ret == -ETIMEDOUT) + goto mark_active_timed_out; + + _debug("=== OBTAINED_OBJECT ==="); + + if (object->new) { + /* attach data to a newly constructed terminal object */ + ret = cachefiles_set_object_xattr(object, auxdata); + if (ret < 0) + goto check_error; + } else { + /* always update the atime on an object we've just looked up + * (this is used to keep track of culling, and atimes are only + * updated by read, write and readdir but not lookup or + * open) */ + touch_atime(cache->mnt, next); + } + + /* open a file interface onto a data file */ + if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { + if 
(S_ISREG(object->dentry->d_inode->i_mode)) { + const struct address_space_operations *aops; + + ret = -EPERM; + aops = object->dentry->d_inode->i_mapping->a_ops; + if (!aops->bmap) + goto check_error; + + object->backer = object->dentry; + } else { + BUG(); // TODO: open file in data-class subdir + } + } + + object->new = 0; + fscache_obtained_object(&object->fscache); + + _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + return 0; + +create_error: + _debug("create error %d", ret); + if (ret == -EIO) + cachefiles_io_error(cache, "Create/mkdir failed"); + goto error; + +mark_active_timed_out: + _debug("mark active timed out"); + goto release_dentry; + +check_error: + _debug("check error %d", ret); + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); +release_dentry: + dput(object->dentry); + object->dentry = NULL; + goto error_out; + +delete_error: + _debug("delete error %d", ret); + goto error_out2; + +lookup_error: + _debug("lookup error %ld", PTR_ERR(next)); + ret = PTR_ERR(next); + if (ret == -EIO) + cachefiles_io_error(cache, "Lookup failed"); + next = NULL; +error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(next); +error_out2: + dput(dir); +error_out: + _leave(" = error %d", -ret); + return ret; +} + +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname) +{ + struct dentry *subdir; + unsigned long start; + struct path path; + int ret; + + _enter(",,%s", dirname); + + /* search the current directory for the element name */ + mutex_lock(&dir->d_inode->i_mutex); + + start = jiffies; + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(subdir)) { + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; + } + + _debug("subdir -> %p %s", + subdir, subdir->d_inode ? 
"positive" : "negative"); + + /* we need to create the subdir if it doesn't exist yet */ + if (!subdir->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto mkdir_error; + + _debug("attempt mkdir"); + + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; + ret = vfs_mkdir(dir->d_inode, subdir, 0700); + if (ret < 0) + goto mkdir_error; + + ASSERT(subdir->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + subdir, + subdir->d_inode, + subdir->d_inode->i_ino); + } + + mutex_unlock(&dir->d_inode->i_mutex); + + /* we need to make sure the subdir is a directory */ + ASSERT(subdir->d_inode); + + if (!S_ISDIR(subdir->d_inode->i_mode)) { + kerror("%s is not a directory", dirname); + ret = -EIO; + goto check_error; + } + + ret = -EPERM; + if (!subdir->d_inode->i_op || + !subdir->d_inode->i_op->setxattr || + !subdir->d_inode->i_op->getxattr || + !subdir->d_inode->i_op->lookup || + !subdir->d_inode->i_op->mkdir || + !subdir->d_inode->i_op->create || + !subdir->d_inode->i_op->rename || + !subdir->d_inode->i_op->rmdir || + !subdir->d_inode->i_op->unlink) + goto check_error; + + _leave(" = [%lu]", subdir->d_inode->i_ino); + return subdir; + +check_error: + dput(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); + +mkdir_error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(subdir); + kerror("mkdir %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(subdir); + kerror("Lookup %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +nomem_d_alloc: + mutex_unlock(&dir->d_inode->i_mutex); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * find out if an object is in use or not + * - if finds object and it's not in use: + * - returns a pointer to the object and a reference on it + * - returns with the directory locked + */ +static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) +{ + struct cachefiles_object *object; + struct rb_node *_n; + struct dentry *victim; + unsigned long start; + int ret; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + /* look up the victim */ + mutex_lock_nested(&dir->d_inode->i_mutex, 1); + + start = jiffies; + victim = lookup_one_len(filename, dir, strlen(filename)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(victim)) + goto lookup_error; + + //_debug("victim -> %p %s", + // victim, victim->d_inode ? 
"positive" : "negative"); + + /* if the object is no longer there then we probably retired the object + * at the netfs's request whilst the cull was in progress + */ + if (!victim->d_inode) { + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + _leave(" = -ENOENT [absent]"); + return ERR_PTR(-ENOENT); + } + + /* check to see if we're using this object */ + read_lock(&cache->active_lock); + + _n = cache->active_nodes.rb_node; + + while (_n) { + object = rb_entry(_n, struct cachefiles_object, active_node); + + if (object->dentry > victim) + _n = _n->rb_left; + else if (object->dentry < victim) + _n = _n->rb_right; + else + goto object_in_use; + } + + read_unlock(&cache->active_lock); + + //_leave(" = %p", victim); + return victim; + +object_in_use: + read_unlock(&cache->active_lock); + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = -EBUSY [in use]"); + return ERR_PTR(-EBUSY); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return ERR_PTR(-ESTALE); + } + + if (ret == -EIO) { + cachefiles_io_error(cache, "Lookup failed"); + } else if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * cull an object if it's not in use + * - called only by cache manager daemon + */ +int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + int ret; + + _enter(",%*.*s/,%s", + dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + _debug("victim -> %p %s", + victim, victim->d_inode ? "positive" : "negative"); + + /* okay... the victim is not being used so we can cull it + * - start by marking it as stale + */ + _debug("victim is cullable"); + + ret = cachefiles_remove_object_xattr(cache, victim); + if (ret < 0) + goto error_unlock; + + /* actually remove the victim (drops the dir mutex) */ + _debug("bury"); + + ret = cachefiles_bury_object(cache, dir, victim, false); + if (ret < 0) + goto error; + + dput(victim); + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +error: + dput(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return -ESTALE; + } + + if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * find out if an object is in use or not + * - called only by cache manager daemon + * - returns -EBUSY or 0 to indicate whether an object is in use or not + */ +int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = 0"); + return 0; +} diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c new file mode 100644 index 00000000..eccd3394 --- /dev/null +++ b/fs/cachefiles/proc.c @@ -0,0 +1,134 @@ +/* CacheFiles statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +atomic_t cachefiles_lookup_histogram[HZ]; +atomic_t cachefiles_mkdir_histogram[HZ]; +atomic_t cachefiles_create_histogram[HZ]; + +/* + * display the latency histogram + */ +static int cachefiles_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned x, y, z, t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + x = atomic_read(&cachefiles_lookup_histogram[index]); + y = atomic_read(&cachefiles_mkdir_histogram[index]); + z = atomic_read(&cachefiles_create_histogram[index]); + if (x == 0 && y == 0 && z == 0) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void cachefiles_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations cachefiles_histogram_ops = { + .start = cachefiles_histogram_start, + .stop = cachefiles_histogram_stop, + .next = cachefiles_histogram_next, + .show = cachefiles_histogram_show, +}; + +/* + * open "/proc/fs/cachefiles/XXX" which provide statistics summaries + */ +static int cachefiles_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cachefiles_histogram_ops); +} + +static const struct file_operations cachefiles_histogram_fops = { + .owner = THIS_MODULE, + .open = cachefiles_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * initialise the /proc/fs/cachefiles/ directory + */ +int __init cachefiles_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/cachefiles", NULL)) + goto error_dir; + + if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_fops)) + goto error_histogram; + + _leave(" = 0"); + return 0; + +error_histogram: + remove_proc_entry("fs/cachefiles", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/cachefiles/ directory + */ +void cachefiles_proc_cleanup(void) +{ + remove_proc_entry("fs/cachefiles/histogram", NULL); + remove_proc_entry("fs/cachefiles", NULL); +} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c new file mode 100644 index 00000000..0e3c0924 --- /dev/null +++ b/fs/cachefiles/rdwr.c @@ -0,0 +1,984 @@ +/* Storage object read/write + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +/* + * detect wake up events generated by the unlocking of pages in which we're + * interested + * - we use this to detect read completion of backing pages + * - the caller holds the waitqueue lock + */ +static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, + int sync, void *_key) +{ + struct cachefiles_one_read *monitor = + container_of(wait, struct cachefiles_one_read, monitor); + struct cachefiles_object *object; + struct wait_bit_key *key = _key; + struct page *page = wait->private; + + ASSERT(key); + + _enter("{%lu},%u,%d,{%p,%u}", + monitor->netfs_page->index, mode, sync, + key->flags, key->bit_nr); + + if (key->flags != &page->flags || + key->bit_nr != PG_locked) + return 0; + + _debug("--- monitor %p %lx ---", page, page->flags); + + if (!PageUptodate(page) && !PageError(page)) { + /* unlocked, not uptodate and not erronous? */ + _debug("page probably truncated"); + } + + /* remove from the waitqueue */ + list_del(&wait->task_list); + + /* move onto the action list and queue for FS-Cache thread pool */ + ASSERT(monitor->op); + + object = container_of(monitor->op->op.object, + struct cachefiles_object, fscache); + + spin_lock(&object->work_lock); + list_add_tail(&monitor->op_link, &monitor->op->to_do); + spin_unlock(&object->work_lock); + + fscache_enqueue_retrieval(monitor->op); + return 0; +} + +/* + * handle a probably truncated page + * - check to see if the page is still relevant and reissue the read if + * possible + * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we + * must wait again and 0 if successful + */ +static int cachefiles_read_reissue(struct cachefiles_object *object, + struct cachefiles_one_read *monitor) +{ + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct page *backpage = monitor->back_page, *backpage2; + int ret; + + kenter("{ino=%lx},{%lx,%lx}", + object->backer->d_inode->i_ino, + backpage->index, backpage->flags); + + /* skip if the page was truncated away completely */ + if (backpage->mapping != bmapping) { + kleave(" = -ENODATA [mapping]"); + return -ENODATA; + } + + backpage2 = find_get_page(bmapping, backpage->index); + if (!backpage2) { + kleave(" = -ENODATA [gone]"); + return -ENODATA; + } + + if (backpage != backpage2) { + put_page(backpage2); + kleave(" = -ENODATA [different]"); + return -ENODATA; + } + + /* the page is still there and we already have a ref on it, so we don't + * need a second */ + put_page(backpage2); + + INIT_LIST_HEAD(&monitor->op_link); + add_page_wait_queue(backpage, &monitor->monitor); + + if (trylock_page(backpage)) { + ret = -EIO; + if (PageError(backpage)) + goto unlock_discard; + ret = 0; + if (PageUptodate(backpage)) + goto unlock_discard; + + kdebug("reissue read"); + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto unlock_discard; + } + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + /* it'll reappear on the todo list */ 
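+	/* -EINPROGRESS leaves the monitor queued against the backing page;
+	 * cachefiles_read_waiter() will move it back onto the op's to_do
+	 * list and re-enqueue the retrieval when the page is unlocked */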
+ kleave(" = -EINPROGRESS"); + return -EINPROGRESS; + +unlock_discard: + unlock_page(backpage); + spin_lock_irq(&object->work_lock); + list_del(&monitor->op_link); + spin_unlock_irq(&object->work_lock); + kleave(" = %d", ret); + return ret; +} + +/* + * copy data from backing pages to netfs pages to complete a read operation + * - driven by FS-Cache's thread pool + */ +static void cachefiles_read_copier(struct fscache_operation *_op) +{ + struct cachefiles_one_read *monitor; + struct cachefiles_object *object; + struct fscache_retrieval *op; + struct pagevec pagevec; + int error, max; + + op = container_of(_op, struct fscache_retrieval, op); + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("{ino=%lu}", object->backer->d_inode->i_ino); + + pagevec_init(&pagevec, 0); + + max = 8; + spin_lock_irq(&object->work_lock); + + while (!list_empty(&op->to_do)) { + monitor = list_entry(op->to_do.next, + struct cachefiles_one_read, op_link); + list_del(&monitor->op_link); + + spin_unlock_irq(&object->work_lock); + + _debug("- copy {%lu}", monitor->back_page->index); + + recheck: + if (PageUptodate(monitor->back_page)) { + copy_highpage(monitor->netfs_page, monitor->back_page); + + pagevec_add(&pagevec, monitor->netfs_page); + fscache_mark_pages_cached(monitor->op, &pagevec); + error = 0; + } else if (!PageError(monitor->back_page)) { + /* the page has probably been truncated */ + error = cachefiles_read_reissue(object, monitor); + if (error == -EINPROGRESS) + goto next; + goto recheck; + } else { + cachefiles_io_error_obj( + object, + "Readpage failed on backing file %lx", + (unsigned long) monitor->back_page->flags); + error = -EIO; + } + + page_cache_release(monitor->back_page); + + fscache_end_io(op, monitor->netfs_page, error); + page_cache_release(monitor->netfs_page); + fscache_put_retrieval(op); + kfree(monitor); + + next: + /* let the thread pool have some air occasionally */ + max--; + if (max < 0 || need_resched()) { + if (!list_empty(&op->to_do)) + fscache_enqueue_retrieval(op); + _leave(" [maxed out]"); + return; + } + + spin_lock_irq(&object->work_lock); + } + + spin_unlock_irq(&object->work_lock); + _leave(""); +} + +/* + * read the corresponding page to the given set from the backing file + * - an uncertain page is simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file_one(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct page *netpage, + struct pagevec *pagevec) +{ + struct cachefiles_one_read *monitor; + struct address_space *bmapping; + struct page *newpage, *backpage; + int ret; + + _enter(""); + + pagevec_reinit(pagevec); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->netfs_page = netpage; + monitor->op = fscache_get_retrieval(op); + + init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); + + /* attempt to get hold of the backing page */ + bmapping = object->backer->d_inode->i_mapping; + newpage = NULL; + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem_monitor; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem_page; + } + + /* we've installed a new backing 
page, so now we need to add it + * to the LRU list and start it reading */ +installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + pagevec_add(pagevec, backpage); + __pagevec_lru_add_file(pagevec); + +read_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* set the monitor to transfer the data across */ +monitor_backing_page: + _debug("- monitor add"); + + /* install the monitor */ + page_cache_get(monitor->netfs_page); + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + goto success; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ +backing_page_already_present: + _debug("- present"); + + if (newpage) { + page_cache_release(newpage); + newpage = NULL; + } + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + if (!trylock_page(backpage)) + goto monitor_backing_page; + _debug("read %p {%lx}", backpage, backpage->flags); + goto read_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ +backing_page_already_uptodate: + _debug("- uptodate"); + + pagevec_add(pagevec, netpage); + fscache_mark_pages_cached(op, pagevec); + + copy_highpage(netpage, backpage); + fscache_end_io(op, netpage, 0); + +success: + _debug("success"); + ret = 0; + +out: + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(monitor->op); + kfree(monitor); + } + _leave(" = %d", ret); + return ret; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; + +nomem_page: + page_cache_release(newpage); +nomem_monitor: + fscache_put_retrieval(monitor->op); + kfree(monitor); +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - if the page is backed by a block in the cache: + * - a read will be started which will call the callback on completion + * - 0 will be returned + * - else if the page is unbacked: + * - the metadata will be retained + * - -ENODATA will be returned + */ +int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct inode *inode; + sector_t block0, block; + unsigned shift; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{%p},{%lx},,,", object, page->index); + + if (!object->backer) + return 
-ENOBUFS; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_ASYNC; + op->op.processor = cachefiles_read_copier; + + pagevec_init(&pagevec, 0); + + /* we assume the absence or presence of the first block is a good + * enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually good + * enough for this as it doesn't indicate errors, but it's all we've + * got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* submit the apparently valid page to the backing fs to be + * read from disk */ + ret = cachefiles_read_backing_file_one(object, op, page, + &pagevec); + } else if (cachefiles_has_space(cache, 0, 1) == 0) { + /* there's space in the cache we can use */ + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * read the corresponding pages to the given set from the backing file + * - any uncertain pages are simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct list_head *list, + struct pagevec *mark_pvec) +{ + struct cachefiles_one_read *monitor = NULL; + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct pagevec lru_pvec; + struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; + int ret = 0; + + _enter(""); + + pagevec_init(&lru_pvec, 0); + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + if (!monitor) { + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->op = fscache_get_retrieval(op); + init_waitqueue_func_entry(&monitor->monitor, + cachefiles_read_waiter); + } + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem; + } + + /* we've installed a new backing page, so now we need to add it + * to the LRU list and start it reading */ + installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + if (!pagevec_add(&lru_pvec, backpage)) + __pagevec_lru_add_file(&lru_pvec); + + reread_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* add the netfs page to the pagecache and LRU, and set the + * monitor to transfer the data across */ + monitor_backing_page: + _debug("- monitor add"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } 
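+			/* any other failure to insert the netfs page aborts the
+			 * whole batch as -ENOMEM */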
+ goto nomem; + } + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + /* install a monitor */ + page_cache_get(netpage); + monitor->netfs_page = netpage; + + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was + * installed, so the monitor may miss the event - so we have to + * ensure that we do get one in such a case */ + if (trylock_page(backpage)) { + _debug("2unlock %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + page_cache_release(backpage); + backpage = NULL; + + page_cache_release(netpage); + netpage = NULL; + continue; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ + backing_page_already_present: + _debug("- present %p", backpage); + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + _debug("- not ready %p{%lx}", backpage, backpage->flags); + + if (!trylock_page(backpage)) + goto monitor_backing_page; + + if (PageError(backpage)) { + _debug("error %lx", backpage->flags); + unlock_page(backpage); + goto io_error; + } + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate_unlock; + + /* we've locked a page that's neither up to date nor erroneous, + * so we need to attempt to read it again */ + goto reread_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ + backing_page_already_uptodate_unlock: + _debug("uptodate %lx", backpage->flags); + unlock_page(backpage); + backing_page_already_uptodate: + _debug("- uptodate"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } + goto nomem; + } + + copy_highpage(netpage, backpage); + + page_cache_release(backpage); + backpage = NULL; + + if (!pagevec_add(mark_pvec, netpage)) + fscache_mark_pages_cached(op, mark_pvec); + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + fscache_end_io(op, netpage, 0); + page_cache_release(netpage); + netpage = NULL; + continue; + } + + netpage = NULL; + + _debug("out"); + +out: + /* tidy up */ + pagevec_lru_add_file(&lru_pvec); + + if (newpage) + page_cache_release(newpage); + if (netpage) + page_cache_release(netpage); + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(op); + kfree(monitor); + } + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + page_cache_release(netpage); + } + + _leave(" = %d", ret); + return ret; + +nomem: + _debug("nomem"); + ret = -ENOMEM; + goto out; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; +} + +/* + * read a list of pages from the cache or allocate blocks in which to store + * them + */ +int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct list_head backpages; + struct pagevec pagevec; + struct inode *inode; + struct page *page, *_n; + 
unsigned shift, nrbackpages; + int ret, ret2, space; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{OBJ%x,%d},,%d,,", + object->fscache.debug_id, atomic_read(&op->op.usage), + *nr_pages); + + if (!object->backer) + return -ENOBUFS; + + space = 1; + if (cachefiles_has_space(cache, 0, *nr_pages) < 0) + space = 0; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + pagevec_init(&pagevec, 0); + + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_ASYNC; + op->op.processor = cachefiles_read_copier; + + INIT_LIST_HEAD(&backpages); + nrbackpages = 0; + + ret = space ? -ENODATA : -ENOBUFS; + list_for_each_entry_safe(page, _n, pages, lru) { + sector_t block0, block; + + /* we assume the absence or presence of the first block is a + * good enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually + * good enough for this as it doesn't indicate errors, but + * it's all we've got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, + block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* we have data - add it to the list to give to the + * backing fs */ + list_move(&page->lru, &backpages); + (*nr_pages)--; + nrbackpages++; + } else if (space && pagevec_add(&pagevec, page) == 0) { + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + if (list_empty(pages)) + ret = 0; + + /* submit the apparently valid pages to the backing fs to be read from + * disk */ + if (nrbackpages > 0) { + ret2 = cachefiles_read_backing_file(object, op, &backpages, + &pagevec); + if (ret2 == -ENOMEM || ret2 == -EINTR) + ret = ret2; + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + _leave(" = %d [nr=%u%s]", + ret, *nr_pages, list_empty(pages) ? 
" empty" : ""); + return ret; +} + +/* + * allocate a block in the cache in which to store a page + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - otherwise: + * - the metadata will be retained + * - 0 will be returned + */ +int cachefiles_allocate_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lx},", object, page->index); + + ret = cachefiles_has_space(cache, 0, 1); + if (ret == 0) { + pagevec_init(&pagevec, 0); + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * allocate blocks in the cache in which to store a set of pages + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if some buffers couldn't be made available + * - returns -ENOBUFS if some pages are beyond EOF + * - otherwise: + * - -ENODATA will be returned + * - metadata will be retained for any page marked + */ +int cachefiles_allocate_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct page *page; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,,,%d,", object, *nr_pages); + + ret = cachefiles_has_space(cache, 0, *nr_pages); + if (ret == 0) { + pagevec_init(&pagevec, 0); + + list_for_each_entry(page, pages, lru) { + if (pagevec_add(&pagevec, page) == 0) + fscache_mark_pages_cached(op, &pagevec); + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * request a page be stored in the cache + * - cache withdrawal is prevented by the caller + * - this request may be ignored if there's no cache block available, in which + * case -ENOBUFS will be returned + * - if the op is in progress, 0 will be returned + */ +int cachefiles_write_page(struct fscache_storage *op, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + mm_segment_t old_fs; + struct file *file; + loff_t pos, eof; + size_t len; + void *data; + int ret; + + ASSERT(op != NULL); + ASSERT(page != NULL); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("%p,%p{%lx},,,", object, page, page->index); + + if (!object->backer) { + _leave(" = -ENOBUFS"); + return -ENOBUFS; + } + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + /* write the page to the backing filesystem and let it store it in its + * own time */ + dget(object->backer); + mntget(cache->mnt); + file = dentry_open(object->backer, cache->mnt, O_RDWR, + cache->cache_cred); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + } else { + ret = -EIO; + 
if (file->f_op->write) { + pos = (loff_t) page->index << PAGE_SHIFT; + + /* we mustn't write more data than we have, so we have + * to beware of a partial page at EOF */ + eof = object->fscache.store_limit_l; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + ASSERTCMP(pos, <, eof); + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); + } + } + + data = kmap(page); + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = file->f_op->write( + file, (const void __user *) data, len, &pos); + set_fs(old_fs); + kunmap(page); + if (ret != len) + ret = -EIO; + } + fput(file); + } + + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error_obj( + object, "Write page to backing file failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * detach a backing block from a page + * - cache withdrawal is prevented by the caller + */ +void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lu}", object, page->index); + + spin_unlock(&object->fscache.cookie->lock); +} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c new file mode 100644 index 00000000..039b5011 --- /dev/null +++ b/fs/cachefiles/security.c @@ -0,0 +1,120 @@ +/* CacheFiles security management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include "internal.h" + +/* + * determine the security context within which we access the cache from within + * the kernel + */ +int cachefiles_get_security_ID(struct cachefiles_cache *cache) +{ + struct cred *new; + int ret; + + _enter("{%s}", cache->secctx); + + new = prepare_kernel_cred(current); + if (!new) { + ret = -ENOMEM; + goto error; + } + + if (cache->secctx) { + ret = set_security_override_from_ctx(new, cache->secctx); + if (ret < 0) { + put_cred(new); + printk(KERN_ERR "CacheFiles:" + " Security denies permission to nominate" + " security context: error %d\n", + ret); + goto error; + } + } + + cache->cache_cred = new; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * see if mkdir and create can be performed in the root directory + */ +static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, + struct dentry *root) +{ + int ret; + + ret = security_inode_mkdir(root->d_inode, root, 0); + if (ret < 0) { + printk(KERN_ERR "CacheFiles:" + " Security denies permission to make dirs: error %d", + ret); + return ret; + } + + ret = security_inode_create(root->d_inode, root, 0); + if (ret < 0) + printk(KERN_ERR "CacheFiles:" + " Security denies permission to create files: error %d", + ret); + + return ret; +} + +/* + * check the security details of the on-disk cache + * - must be called with security override in force + * - must return with a security override in force - even in the case of an + * error + */ +int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred) +{ + struct cred *new; + int ret; + + _enter(""); + + /* duplicate the cache creds for COW (the override is currently in + * force, so we can use prepare_creds() to do this) */ + new = prepare_creds(); + if (!new) + return -ENOMEM; + + cachefiles_end_secure(cache, *_saved_cred); + + /* use the cache root dir's security context as the basis with + * which create files */ + ret = set_create_files_as(new, root->d_inode); + if (ret < 0) { + abort_creds(new); + cachefiles_begin_secure(cache, _saved_cred); + _leave(" = %d [cfa]", ret); + return ret; + } + + put_cred(cache->cache_cred); + cache->cache_cred = new; + + cachefiles_begin_secure(cache, _saved_cred); + ret = cachefiles_check_cache_dir(cache, root); + + if (ret == -EOPNOTSUPP) + ret = 0; + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c new file mode 100644 index 00000000..e18b183b --- /dev/null +++ b/fs/cachefiles/xattr.c @@ -0,0 +1,292 @@ +/* CacheFiles extended attribute management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const char cachefiles_xattr_cache[] = + XATTR_USER_PREFIX "CacheFiles.cache"; + +/* + * check the type label on an object + * - done using xattrs + */ +int cachefiles_check_object_type(struct cachefiles_object *object) +{ + struct dentry *dentry = object->dentry; + char type[3], xtype[3]; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + if (!object->fscache.cookie) + strcpy(type, "C3"); + else + snprintf(type, 3, "%02x", object->fscache.cookie->def->type); + + _enter("%p{%s}", object, type); + + /* attempt to install a type label directly */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, + XATTR_CREATE); + if (ret == 0) { + _debug("SET"); /* we succeeded */ + goto error; + } + + if (ret != -EEXIST) { + kerror("Can't set xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + if (ret < 0) { + if (ret == -ERANGE) + goto bad_type_length; + + kerror("Can't read xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* check the type is what we're expecting */ + if (ret != 2) + goto bad_type_length; + + if (xtype[0] != type[0] || xtype[1] != type[1]) + goto bad_type; + + ret = 0; + +error: + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu type xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +bad_type: + xtype[2] = 0; + kerror("Cache object %*.*s [%lu] type %s not %s", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + xtype, type); + ret = -EIO; + goto error; +} + +/* + * set the state xattr on a cache file + */ +int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_CREATE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * update the state xattr on a cache file + */ +int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to update xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * check the state xattr on a cache file + * - return -ESTALE if the object should be deleted + */ +int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ 
+ struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + int ret; + + _enter("%p,#%d", object, auxdata->len); + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (ret < 0) { + if (ret == -ENODATA) + goto stale; /* no attribute - power went off + * mid-cull? */ + + if (ret == -ERANGE) + goto bad_type_length; + + cachefiles_io_error_obj(object, + "Can't read xattr on %lu (err %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + + /* check the on-disk object */ + if (ret < 1) + goto bad_type_length; + + if (auxbuf->type != auxdata->type) + goto stale; + + auxbuf->len = ret; + + /* consult the netfs */ + if (object->fscache.cookie->def->check_aux) { + enum fscache_checkaux result; + unsigned int dlen; + + dlen = auxbuf->len - 1; + + _debug("checkaux %s #%u", + object->fscache.cookie->def->name, dlen); + + result = fscache_check_aux(&object->fscache, + &auxbuf->data, dlen); + + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + goto okay; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + goto stale; + + default: + BUG(); + } + + /* update the current label */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0) { + cachefiles_io_error_obj(object, + "Can't update xattr on %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + } + +okay: + ret = 0; + +error: + kfree(auxbuf); + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +stale: + ret = -ESTALE; + goto error; +} + +/* + * remove the object's xattr to mark it stale + */ +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + int ret; + + ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig new file mode 100644 index 00000000..9eb134ea --- /dev/null +++ b/fs/ceph/Kconfig @@ -0,0 +1,18 @@ +config CEPH_FS + tristate "Ceph distributed file system (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include support for mounting the + experimental Ceph distributed file system. Ceph is an extremely + scalable file system designed to provide high performance, + reliable access to petabytes of storage. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile new file mode 100644 index 00000000..bd352125 --- /dev/null +++ b/fs/ceph/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for CEPH filesystem. 
+# + +obj-$(CONFIG_CEPH_FS) += ceph.o + +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ + export.o caps.o snap.o xattr.o \ + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o + diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c new file mode 100644 index 00000000..5a3953db --- /dev/null +++ b/fs/ceph/addr.c @@ -0,0 +1,1181 @@ +#include + +#include +#include +#include +#include +#include /* generic_writepages */ +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +/* + * Ceph address space ops. + * + * There are a few funny things going on here. + * + * The page->private field is used to reference a struct + * ceph_snap_context for _every_ dirty page. This indicates which + * snapshot the page was logically dirtied in, and thus which snap + * context needs to be associated with the osd write during writeback. + * + * Similarly, struct ceph_inode_info maintains a set of counters to + * count dirty pages on the inode. In the absence of snapshots, + * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. + * + * When a snapshot is taken (that is, when the client receives + * notification that a snapshot was taken), each inode with caps and + * with dirty pages (dirty pages implies there is a cap) gets a new + * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending + * order, new snaps go to the tail). The i_wrbuffer_ref_head count is + * moved to capsnap->dirty. (Unless a sync write is currently in + * progress. In that case, the capsnap is said to be "pending", new + * writes cannot start, and the capsnap isn't "finalized" until the + * write completes (or fails) and a final size/mtime for the inode for + * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. + * + * On writeback, we must submit writes to the osd IN SNAP ORDER. So, + * we look for the first capsnap in i_cap_snaps and write out pages in + * that snap context _only_. Then we move on to the next capsnap, + * eventually reaching the "live" or "head" context (i.e., pages that + * are not yet snapped) and are writing the most recently dirtied + * pages. + * + * Invalidate and so forth must take care to ensure the dirty page + * accounting is preserved. + */ + +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + + + +/* + * Dirty a page. Optimistically adjust accounting, on the assumption + * that we won't race with invalidate. If we do, readjust. + */ +static int ceph_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct ceph_inode_info *ci; + int undo = 0; + struct ceph_snap_context *snapc; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + if (TestSetPageDirty(page)) { + dout("%p set_page_dirty %p idx %lu -- already dirty\n", + mapping->host, page, page->index); + return 0; + } + + inode = mapping->host; + ci = ceph_inode(inode); + + /* + * Note that we're grabbing a snapc ref here without holding + * any locks! 
+ */ + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + + /* dirty the head */ + spin_lock(&inode->i_lock); + if (ci->i_head_snapc == NULL) + ci->i_head_snapc = ceph_get_snap_context(snapc); + ++ci->i_wrbuffer_ref_head; + if (ci->i_wrbuffer_ref == 0) + ihold(inode); + ++ci->i_wrbuffer_ref; + dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + mapping->host, page, page->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); + spin_unlock(&inode->i_lock); + + /* now adjust page */ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, page->mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + page->private = (unsigned long)snapc; + SetPagePrivate(page); + } else { + dout("ANON set_page_dirty %p (raced truncate?)\n", page); + undo = 1; + } + + spin_unlock_irq(&mapping->tree_lock); + + if (undo) + /* whoops, we failed to dirty the page */ + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + BUG_ON(!PageDirty(page)); + return 1; +} + +/* + * If we are truncating the full page (i.e. offset == 0), adjust the + * dirty page counters appropriately. Only called if there is private + * data on the page. + */ +static void ceph_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc = (void *)page->private; + + BUG_ON(!PageLocked(page)); + BUG_ON(!page->private); + BUG_ON(!PagePrivate(page)); + BUG_ON(!page->mapping); + + inode = page->mapping->host; + + /* + * We can get non-dirty pages here due to races between + * set_page_dirty and truncate_complete_page; just spit out a + * warning, in case we end up with accounting problems later. + */ + if (!PageDirty(page)) + pr_err("%p invalidatepage %p page not dirty\n", inode, page); + + if (offset == 0) + ClearPageChecked(page); + + ci = ceph_inode(inode); + if (offset == 0) { + dout("%p invalidatepage %p idx %lu full dirty page %lu\n", + inode, page, page->index, offset); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); + } else { + dout("%p invalidatepage %p idx %lu partial dirty page\n", + inode, page, page->index); + } +} + +/* just a sanity check */ +static int ceph_releasepage(struct page *page, gfp_t g) +{ + struct inode *inode = page->mapping ? page->mapping->host : NULL; + dout("%p releasepage %p idx %lu\n", inode, page, page->index); + WARN_ON(PageDirty(page)); + WARN_ON(page->private); + WARN_ON(PagePrivate(page)); + return 0; +} + +/* + * read a single page, without unlocking it. 
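+ * (callers decide when to unlock: ceph_readpage unlocks straight away,
+ * while ceph_update_writeable_page keeps the page locked)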
+ */ +static int readpage_nounlock(struct file *filp, struct page *page) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int err = 0; + u64 len = PAGE_CACHE_SIZE; + + dout("readpage inode %p file %p page %p index %lu\n", + inode, filp, page, page->index); + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + page->index << PAGE_CACHE_SHIFT, &len, + ci->i_truncate_seq, ci->i_truncate_size, + &page, 1, 0); + if (err == -ENOENT) + err = 0; + if (err < 0) { + SetPageError(page); + goto out; + } else if (err < PAGE_CACHE_SIZE) { + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + +out: + return err < 0 ? err : 0; +} + +static int ceph_readpage(struct file *filp, struct page *page) +{ + int r = readpage_nounlock(filp, page); + unlock_page(page); + return r; +} + +/* + * Build a vector of contiguous pages from the provided page list. + */ +static struct page **page_vector_from_list(struct list_head *page_list, + unsigned *nr_pages) +{ + struct page **pages; + struct page *page; + int next_index, contig_pages = 0; + + /* build page vector */ + pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + BUG_ON(list_empty(page_list)); + next_index = list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index == next_index) { + dout("readpages page %d %p\n", contig_pages, page); + pages[contig_pages] = page; + contig_pages++; + next_index++; + } else { + break; + } + } + *nr_pages = contig_pages; + return pages; +} + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int rc = 0; + struct page **pages; + loff_t offset; + u64 len; + + dout("readpages %p file %p nr_pages %d\n", + inode, file, nr_pages); + + pages = page_vector_from_list(page_list, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* guess read extent */ + offset = pages[0]->index << PAGE_CACHE_SHIFT; + len = nr_pages << PAGE_CACHE_SHIFT; + rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + offset, &len, + ci->i_truncate_seq, ci->i_truncate_size, + pages, nr_pages, 0); + if (rc == -ENOENT) + rc = 0; + if (rc < 0) + goto out; + + for (; !list_empty(page_list) && len > 0; + rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { + struct page *page = + list_entry(page_list->prev, struct page, lru); + + list_del(&page->lru); + + if (rc < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = rc < 0 ? 
0 : rc; + zero_user_segment(page, s, PAGE_CACHE_SIZE); + } + + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_NOFS)) { + page_cache_release(page); + dout("readpages %p add_to_page_cache failed %p\n", + inode, page); + continue; + } + dout("readpages %p adding %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } + rc = 0; + +out: + kfree(pages); + return rc; +} + +/* + * Get ref for the oldest snapc for an inode with dirty data... that is, the + * only snap context we are allowed to write back. + */ +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = NULL; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, + capsnap->context, capsnap->dirty_pages); + if (capsnap->dirty_pages) { + snapc = ceph_get_snap_context(capsnap->context); + if (snap_size) + *snap_size = capsnap->size; + break; + } + } + if (!snapc && ci->i_wrbuffer_ref_head) { + snapc = ceph_get_snap_context(ci->i_head_snapc); + dout(" head snapc %p has %d dirty pages\n", + snapc, ci->i_wrbuffer_ref_head); + } + spin_unlock(&inode->i_lock); + return snapc; +} + +/* + * Write a single page, but leave the page locked. + * + * If we get a write error, set the page error bit, but still adjust the + * dirty page accounting (i.e., page is no longer dirty). + */ +static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_fs_client *fsc; + struct ceph_osd_client *osdc; + loff_t page_off = page->index << PAGE_CACHE_SHIFT; + int len = PAGE_CACHE_SIZE; + loff_t i_size; + int err = 0; + struct ceph_snap_context *snapc, *oldest; + u64 snap_size = 0; + long writeback_stat; + + dout("writepage %p idx %lu\n", page, page->index); + + if (!page->mapping || !page->mapping->host) { + dout("writepage %p - no mapping\n", page); + return -EFAULT; + } + inode = page->mapping->host; + ci = ceph_inode(inode); + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; + + /* verify this is a writeable snap context */ + snapc = (void *)page->private; + if (snapc == NULL) { + dout("writepage %p page %p not dirty?\n", inode, page); + goto out; + } + oldest = get_oldest_context(inode, &snap_size); + if (snapc->seq > oldest->seq) { + dout("writepage %p page %p snapc %p not writeable - noop\n", + inode, page, (void *)page->private); + /* we should only noop if called by kswapd */ + WARN_ON((current->flags & PF_MEMALLOC) == 0); + ceph_put_snap_context(oldest); + goto out; + } + ceph_put_snap_context(oldest); + + /* is this a partial page at end of file? 
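if so, clamp the write length so only the bytes below i_size (or the snap size) go to the OSD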
*/ + if (snap_size) + i_size = snap_size; + else + i_size = i_size_read(inode); + if (i_size < page_off + len) + len = i_size - page_off; + + dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", + inode, page, page->index, page_off, len, snapc); + + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + + set_page_writeback(page); + err = ceph_osdc_writepages(osdc, ceph_vino(inode), + &ci->i_layout, snapc, + page_off, len, + ci->i_truncate_seq, ci->i_truncate_size, + &inode->i_mtime, + &page, 1, 0, 0, true); + if (err < 0) { + dout("writepage setting page/mapping error %d %p\n", err, page); + SetPageError(page); + mapping_set_error(&inode->i_data, err); + if (wbc) + wbc->pages_skipped++; + } else { + dout("writepage cleaned page %p\n", page); + err = 0; /* vfs expects us to return 0 */ + } + page->private = 0; + ClearPagePrivate(page); + end_page_writeback(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); /* page's reference */ +out: + return err; +} + +static int ceph_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + struct inode *inode = page->mapping->host; + BUG_ON(!inode); + ihold(inode); + err = writepage_nounlock(page, wbc); + unlock_page(page); + iput(inode); + return err; +} + + +/* + * lame release_pages helper. release_pages() isn't exported to + * modules. + */ +static void ceph_release_pages(struct page **pages, int num) +{ + struct pagevec pvec; + int i; + + pagevec_init(&pvec, 0); + for (i = 0; i < num; i++) { + if (pagevec_add(&pvec, pages[i]) == 0) + pagevec_release(&pvec); + } + pagevec_release(&pvec); +} + + +/* + * async writeback completion handler. + * + * If we get an error, set the mapping error bit, but not the individual + * page error bits. + */ +static void writepages_finish(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + struct ceph_osd_op *op; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned wrote; + struct page *page; + int i; + struct ceph_snap_context *snapc = req->r_snapc; + struct address_space *mapping = inode->i_mapping; + __s32 rc = -EIO; + u64 bytes = 0; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + long writeback_stat; + unsigned issued = ceph_caps_issued(ci); + + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + op = (void *)(replyhead + 1); + rc = le32_to_cpu(replyhead->result); + bytes = le64_to_cpu(op->extent.length); + + if (rc >= 0) { + /* + * Assume we wrote the pages we originally sent. The + * osd might reply with fewer pages if our writeback + * raced with a truncation and was adjusted at the osd, + * so don't believe the reply. 
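+	 * (the cleanup loop below therefore iterates req->r_num_pages rather
+	 * than trusting the reply's count)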
+ */ + wrote = req->r_num_pages; + } else { + wrote = 0; + mapping_set_error(mapping, rc); + } + dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", + inode, rc, bytes, wrote); + + /* clean all pages */ + for (i = 0; i < req->r_num_pages; i++) { + page = req->r_pages[i]; + BUG_ON(!page); + WARN_ON(!PageUptodate(page)); + + writeback_stat = + atomic_long_dec_return(&fsc->writeback_count); + if (writeback_stat < + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + + ceph_put_snap_context((void *)page->private); + page->private = 0; + ClearPagePrivate(page); + dout("unlocking %d %p\n", i, page); + end_page_writeback(page); + + /* + * We lost the cache cap, need to truncate the page before + * it is unlocked, otherwise we'd truncate it later in the + * page truncation thread, possibly losing some data that + * raced its way in + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + generic_error_remove_page(inode->i_mapping, page); + + unlock_page(page); + } + dout("%p wrote+cleaned %d pages\n", inode, wrote); + ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + + ceph_release_pages(req->r_pages, req->r_num_pages); + if (req->r_pages_from_pool) + mempool_free(req->r_pages, + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); + else + kfree(req->r_pages); + ceph_osdc_put_request(req); +} + +/* + * allocate a page vec, either directly, or if necessary, via a the + * mempool. we avoid the mempool if we can because req->r_num_pages + * may be less than the maximum write size. + */ +static void alloc_page_vec(struct ceph_fs_client *fsc, + struct ceph_osd_request *req) +{ + req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, + GFP_NOFS); + if (!req->r_pages) { + req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); + req->r_pages_from_pool = 1; + WARN_ON(!req->r_pages); + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc; + pgoff_t index, start, end; + int range_whole = 0; + int should_loop = 1; + pgoff_t max_pages = 0, max_pages_ever = 0; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; + struct pagevec pvec; + int done = 0; + int rc = 0; + unsigned wsize = 1 << inode->i_blkbits; + struct ceph_osd_request *req = NULL; + int do_sync; + u64 snap_size = 0; + + /* + * Include a 'sync' in the OSD request if this is a data + * integrity write (e.g., O_SYNC write or fsync()), or if our + * cap is being revoked. + */ + do_sync = wbc->sync_mode == WB_SYNC_ALL; + if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + do_sync = 1; + dout("writepages_start %p dosync=%d (mode=%s)\n", + inode, do_sync, + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + fsc = ceph_inode_to_client(inode); + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + pr_warning("writepage_start %p on forced umount\n", inode); + return -EIO; /* we're in a forced umount, don't write! */ + } + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; + if (wsize < PAGE_CACHE_SIZE) + wsize = PAGE_CACHE_SIZE; + max_pages_ever = wsize >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + + /* where to start/end? 
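cyclic writeback resumes from mapping->writeback_index, otherwise the wbc range bounds the scan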
*/ + if (wbc->range_cyclic) { + start = mapping->writeback_index; /* Start from prev offset */ + end = -1; + dout(" cyclic, start at %lu\n", start); + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + should_loop = 0; + dout(" not cyclic, %lu to %lu\n", start, end); + } + index = start; + +retry: + /* find oldest snap context with dirty data */ + ceph_put_snap_context(snapc); + snapc = get_oldest_context(inode, &snap_size); + if (!snapc) { + /* hmm, why does writepages get called when there + is no dirty data? */ + dout(" no snap context with dirty data?\n"); + goto out; + } + dout(" oldest snapc is %p seq %lld (%d snaps)\n", + snapc, snapc->seq, snapc->num_snaps); + if (last_snapc && snapc != last_snapc) { + /* if we switched to a newer snapc, restart our scan at the + * start of the original file range. */ + dout(" snapc differs from last pass, restarting at %lu\n", + index); + index = start; + } + last_snapc = snapc; + + while (!done && index <= end) { + unsigned i; + int first; + pgoff_t next; + int pvec_pages, locked_pages; + struct page *page; + int want; + u64 offset, len; + struct ceph_osd_request_head *reqhead; + struct ceph_osd_op *op; + long writeback_stat; + + next = 0; + locked_pages = 0; + max_pages = max_pages_ever; + +get_more_pages: + first = -1; + want = min(end - index, + min((pgoff_t)PAGEVEC_SIZE, + max_pages - (pgoff_t)locked_pages) - 1) + + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + want); + dout("pagevec_lookup_tag got %d\n", pvec_pages); + if (!pvec_pages && !locked_pages) + break; + for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { + page = pvec.pages[i]; + dout("? %p idx %lu\n", page, page->index); + if (locked_pages == 0) + lock_page(page); /* first page */ + else if (!trylock_page(page)) + break; + + /* only dirty pages, or our accounting breaks */ + if (unlikely(!PageDirty(page)) || + unlikely(page->mapping != mapping)) { + dout("!dirty or !mapping %p\n", page); + unlock_page(page); + break; + } + if (!wbc->range_cyclic && page->index > end) { + dout("end of range %p\n", page); + done = 1; + unlock_page(page); + break; + } + if (next && (page->index != next)) { + dout("not consecutive %p\n", page); + unlock_page(page); + break; + } + if (wbc->sync_mode != WB_SYNC_NONE) { + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); + } + if ((snap_size && page_offset(page) > snap_size) || + (!snap_size && + page_offset(page) > i_size_read(inode))) { + dout("%p page eof %llu\n", page, snap_size ? 
+ snap_size : i_size_read(inode)); + done = 1; + unlock_page(page); + break; + } + if (PageWriteback(page)) { + dout("%p under writeback\n", page); + unlock_page(page); + break; + } + + /* only if matching snap context */ + pgsnapc = (void *)page->private; + if (pgsnapc->seq > snapc->seq) { + dout("page snapc %p %lld > oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); + unlock_page(page); + if (!locked_pages) + continue; /* keep looking for snap */ + break; + } + + if (!clear_page_dirty_for_io(page)) { + dout("%p !clear_page_dirty_for_io\n", page); + unlock_page(page); + break; + } + + /* ok */ + if (locked_pages == 0) { + /* prepare async write request */ + offset = (unsigned long long)page->index + << PAGE_CACHE_SHIFT; + len = wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, + ceph_vino(inode), + offset, &len, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, do_sync, + ci->i_truncate_seq, + ci->i_truncate_size, + &inode->i_mtime, true, 1, 0); + + if (!req) { + rc = -ENOMEM; + unlock_page(page); + break; + } + + max_pages = req->r_num_pages; + + alloc_page_vec(fsc, req); + req->r_callback = writepages_finish; + req->r_inode = inode; + } + + /* note position of first page in pvec */ + if (first < 0) + first = i; + dout("%p will write page %p idx %lu\n", + inode, page, page->index); + + writeback_stat = + atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH( + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + } + + set_page_writeback(page); + req->r_pages[locked_pages] = page; + locked_pages++; + next = page->index + 1; + } + + /* did we get anything? */ + if (!locked_pages) + goto release_pvec_pages; + if (i) { + int j; + BUG_ON(!locked_pages || first < 0); + + if (pvec_pages && i == pvec_pages && + locked_pages < max_pages) { + dout("reached end pvec, trying for more\n"); + pagevec_reinit(&pvec); + goto get_more_pages; + } + + /* shift unused pages over in the pvec... we + * will need to release them below. */ + for (j = i; j < pvec_pages; j++) { + dout(" pvec leftover page %p\n", + pvec.pages[j]); + pvec.pages[j-i+first] = pvec.pages[j]; + } + pvec.nr -= i-first; + } + + /* submit the write */ + offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; + len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + (u64)locked_pages << PAGE_CACHE_SHIFT); + dout("writepages got %d pages at %llu~%llu\n", + locked_pages, offset, len); + + /* revise final length, page count */ + req->r_num_pages = locked_pages; + reqhead = req->r_request->front.iov_base; + op = (void *)(reqhead + 1); + op->extent.length = cpu_to_le64(len); + op->payload_len = cpu_to_le32(len); + req->r_request->hdr.data_len = cpu_to_le32(len); + + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); + req = NULL; + + /* continue? */ + index = next; + wbc->nr_to_write -= locked_pages; + if (wbc->nr_to_write <= 0) + done = 1; + +release_pvec_pages: + dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, + pvec.nr ? 
pvec.pages[0] : NULL); + pagevec_release(&pvec); + + if (locked_pages && !done) + goto retry; + } + + if (should_loop && !done) { + /* more to do; loop back to beginning of file */ + dout("writepages looping back to beginning of file\n"); + should_loop = 0; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + +out: + if (req) + ceph_osdc_put_request(req); + ceph_put_snap_context(snapc); + dout("writepages done, rc = %d\n", rc); + return rc; +} + + + +/* + * See if a given @snapc is either writeable, or already written. + */ +static int context_is_writeable_or_written(struct inode *inode, + struct ceph_snap_context *snapc) +{ + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + * + * called with page locked. + * return success with page locked, + * or any failure (incl -EAGAIN) with page unlocked. + */ +static int ceph_update_writeable_page(struct file *file, + loff_t pos, unsigned len, + struct page *page) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + loff_t page_off = pos & PAGE_CACHE_MASK; + int pos_in_page = pos & ~PAGE_CACHE_MASK; + int end_in_page = pos_in_page + len; + loff_t i_size; + int r; + struct ceph_snap_context *snapc, *oldest; + +retry_locked: + /* writepages currently holds page lock, but if we change that later, */ + wait_on_page_writeback(page); + + /* check snap context */ + BUG_ON(!ci->i_snap_realm); + down_read(&mdsc->snap_rwsem); + BUG_ON(!ci->i_snap_realm->cached_context); + snapc = (void *)page->private; + if (snapc && snapc != ci->i_head_snapc) { + /* + * this page is already dirty in another (older) snap + * context! is it writeable now? + */ + oldest = get_oldest_context(inode, NULL); + up_read(&mdsc->snap_rwsem); + + if (snapc->seq > oldest->seq) { + ceph_put_snap_context(oldest); + dout(" page %p snapc %p not current or oldest\n", + page, snapc); + /* + * queue for writeback, and wait for snapc to + * be writeable or written + */ + snapc = ceph_get_snap_context(snapc); + unlock_page(page); + ceph_queue_writeback(inode); + r = wait_event_interruptible(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + if (r == -ERESTARTSYS) + return r; + return -EAGAIN; + } + ceph_put_snap_context(oldest); + + /* yay, writeable, do it now (without dropping page lock) */ + dout(" page %p snapc %p not current, but oldest\n", + page, snapc); + if (!clear_page_dirty_for_io(page)) + goto retry_locked; + r = writepage_nounlock(page, NULL); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + } + + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + return 0; + } + + /* full page? */ + if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) + return 0; + + /* past end of file? 
*/ + i_size = inode->i_size; /* caller holds i_mutex */ + + if (i_size + len > inode->i_sb->s_maxbytes) { + /* file is too big */ + r = -EINVAL; + goto fail; + } + + if (page_off >= i_size || + (pos_in_page == 0 && (pos+len) >= i_size && + end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { + dout(" zeroing %p 0 - %d and %d - %d\n", + page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); + zero_user_segments(page, + 0, pos_in_page, + end_in_page, PAGE_CACHE_SIZE); + return 0; + } + + /* we need to read it. */ + up_read(&mdsc->snap_rwsem); + r = readpage_nounlock(file, page); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + +fail: + up_read(&mdsc->snap_rwsem); +fail_nosnap: + unlock_page(page); + return r; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + */ +static int ceph_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = file->f_dentry->d_inode; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int r; + + do { + /* get a page */ + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + return -ENOMEM; + *pagep = page; + + dout("write_begin file %p inode %p page %p %d~%d\n", file, + inode, page, (int)pos, (int)len); + + r = ceph_update_writeable_page(file, pos, len, page); + } while (r == -EAGAIN); + + return r; +} + +/* + * we don't do anything in here that simple_write_end doesn't do + * except adjust dirty page accounting and drop read lock on + * mdsc->snap_rwsem. + */ +static int ceph_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int check_cap = 0; + + dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, + inode, page, (int)pos, (int)copied, (int)len); + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) + zero_user_segment(page, from+copied, len); + + /* did file size increase? */ + /* (no need for i_size_read(); we caller holds i_mutex */ + if (pos+copied > inode->i_size) + check_cap = ceph_inode_set_size(inode, pos+copied); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + set_page_dirty(page); + + unlock_page(page); + up_read(&mdsc->snap_rwsem); + page_cache_release(page); + + if (check_cap) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + + return copied; +} + +/* + * we set .direct_IO to indicate direct io is supported, but since we + * intercept O_DIRECT reads and writes early, this function should + * never get called. + */ +static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + WARN_ON(1); + return -EINVAL; +} + +const struct address_space_operations ceph_aops = { + .readpage = ceph_readpage, + .readpages = ceph_readpages, + .writepage = ceph_writepage, + .writepages = ceph_writepages_start, + .write_begin = ceph_write_begin, + .write_end = ceph_write_end, + .set_page_dirty = ceph_set_page_dirty, + .invalidatepage = ceph_invalidatepage, + .releasepage = ceph_releasepage, + .direct_IO = ceph_direct_io, +}; + + +/* + * vm ops + */ + +/* + * Reuse write_begin here for simplicity. 
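+ *
+ * The fault handler below locks the faulting page, checks that it is
+ * still within i_size and still attached to this mapping, and then
+ * calls ceph_update_writeable_page() to do the usual snap context
+ * checks.  On success the page is dirtied and left locked
+ * (VM_FAULT_LOCKED); -ENOMEM becomes VM_FAULT_OOM and any other error
+ * becomes VM_FAULT_SIGBUS.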
+ */ +static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page = vmf->page; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + loff_t off = page->index << PAGE_CACHE_SHIFT; + loff_t size, len; + int ret; + + size = i_size_read(inode); + if (off + PAGE_CACHE_SIZE <= size) + len = PAGE_CACHE_SIZE; + else + len = size & ~PAGE_CACHE_MASK; + + dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, + off, len, page, page->index); + + lock_page(page); + + ret = VM_FAULT_NOPAGE; + if ((off > size) || + (page->mapping != inode->i_mapping)) + goto out; + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret == 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + up_read(&mdsc->snap_rwsem); + ret = VM_FAULT_LOCKED; + } else { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; + } +out: + dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); + if (ret != VM_FAULT_LOCKED) + unlock_page(page); + return ret; +} + +static struct vm_operations_struct ceph_vmops = { + .fault = filemap_fault, + .page_mkwrite = ceph_page_mkwrite, +}; + +int ceph_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ceph_vmops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c new file mode 100644 index 00000000..f605753c --- /dev/null +++ b/fs/ceph/caps.c @@ -0,0 +1,3087 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include +#include + +/* + * Capability management + * + * The Ceph metadata servers control client access to inode metadata + * and file data by issuing capabilities, granting clients permission + * to read and/or write both inode field and file data to OSDs + * (storage nodes). Each capability consists of a set of bits + * indicating which operations are allowed. + * + * If the client holds a *_SHARED cap, the client has a coherent value + * that can be safely read from the cached inode. + * + * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the + * client is allowed to change inode attributes (e.g., file size, + * mtime), note its dirty state in the ceph_cap, and asynchronously + * flush that metadata change to the MDS. + * + * In the event of a conflicting operation (perhaps by another + * client), the MDS will revoke the conflicting client capabilities. + * + * In order for a client to cache an inode, it must hold a capability + * with at least one MDS server. When inodes are released, release + * notifications are batched and periodically sent en masse to the MDS + * cluster to release server state. + */ + + +/* + * Generate readable cap strings for debugging output. 
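+ *
+ * Strings come from a small rotating pool of static buffers.  Each one
+ * encodes 'p' for PIN, then one block per cap group ('A'uth, 'L'ink,
+ * 'X'attr, 'F'ile), each followed by its granular bits: s=shared,
+ * x=excl, c=cache, r=rd, w=wr, b=buffer, l=lazyio.  For example,
+ * FILE_SHARED|FILE_RD renders as "Fsr", and an empty cap word as "-".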
+ */ +#define MAX_CAP_STR 20 +static char cap_str[MAX_CAP_STR][40]; +static DEFINE_SPINLOCK(cap_str_lock); +static int last_cap_str; + +static char *gcap_string(char *s, int c) +{ + if (c & CEPH_CAP_GSHARED) + *s++ = 's'; + if (c & CEPH_CAP_GEXCL) + *s++ = 'x'; + if (c & CEPH_CAP_GCACHE) + *s++ = 'c'; + if (c & CEPH_CAP_GRD) + *s++ = 'r'; + if (c & CEPH_CAP_GWR) + *s++ = 'w'; + if (c & CEPH_CAP_GBUFFER) + *s++ = 'b'; + if (c & CEPH_CAP_GLAZYIO) + *s++ = 'l'; + return s; +} + +const char *ceph_cap_string(int caps) +{ + int i; + char *s; + int c; + + spin_lock(&cap_str_lock); + i = last_cap_str++; + if (last_cap_str == MAX_CAP_STR) + last_cap_str = 0; + spin_unlock(&cap_str_lock); + + s = cap_str[i]; + + if (caps & CEPH_CAP_PIN) + *s++ = 'p'; + + c = (caps >> CEPH_CAP_SAUTH) & 3; + if (c) { + *s++ = 'A'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SLINK) & 3; + if (c) { + *s++ = 'L'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SXATTR) & 3; + if (c) { + *s++ = 'X'; + s = gcap_string(s, c); + } + + c = caps >> CEPH_CAP_SFILE; + if (c) { + *s++ = 'F'; + s = gcap_string(s, c); + } + + if (s == cap_str[i]) + *s++ = '-'; + *s = 0; + return cap_str[i]; +} + +void ceph_caps_init(struct ceph_mds_client *mdsc) +{ + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); +} + +void ceph_caps_finalize(struct ceph_mds_client *mdsc) +{ + struct ceph_cap *cap; + + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +{ + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count += delta; + BUG_ON(mdsc->caps_min_count < 0); + spin_unlock(&mdsc->caps_list_lock); +} + +int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) +{ + int i; + struct ceph_cap *cap; + int have; + int alloc = 0; + LIST_HEAD(newcaps); + int ret = 0; + + dout("reserve caps ctx=%p need=%d\n", ctx, need); + + /* first reserve any caps that are already allocated */ + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) + have = need; + else + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + for (i = have; i < need; i++) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!cap) { + ret = -ENOMEM; + goto out_alloc_count; + } + list_add(&cap->caps_item, &newcaps); + alloc++; + } + BUG_ON(have + alloc != need); + + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + ctx->count = need; + dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + return 0; + +out_alloc_count: + /* we didn't manage to reserve as much as we needed */ + 
pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have); + return ret; +} + +int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + if (ctx->count) { + spin_lock(&mdsc->caps_list_lock); + BUG_ON(mdsc->caps_reserve_count < ctx->count); + mdsc->caps_reserve_count -= ctx->count; + mdsc->caps_avail_count += ctx->count; + ctx->count = 0; + dout("unreserve caps %d = %d used + %d resv + %d avail\n", + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + } + return 0; +} + +static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + struct ceph_cap *cap = NULL; + + /* temporary, until we do something about cap import/export */ + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + mdsc->caps_use_count++; + mdsc->caps_total_count++; + } + return cap; + } + + spin_lock(&mdsc->caps_list_lock); + dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(!ctx->count); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); + + ctx->count--; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; + + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + return cap; +} + +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) +{ + spin_lock(&mdsc->caps_list_lock); + dout("put_cap %p %d = %d used + %d resv + %d avail\n", + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; + /* + * Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. + */ + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; + kmem_cache_free(ceph_cap_cachep, cap); + } else { + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); + } + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_reservation_status(struct ceph_fs_client *fsc, + int *total, int *avail, int *used, int *reserved, + int *min) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + if (total) + *total = mdsc->caps_total_count; + if (avail) + *avail = mdsc->caps_avail_count; + if (used) + *used = mdsc->caps_use_count; + if (reserved) + *reserved = mdsc->caps_reserve_count; + if (min) + *min = mdsc->caps_min_count; +} + +/* + * Find ceph_cap for given mds, if any. + * + * Called with i_lock held. 
+ */ +static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + struct rb_node *n = ci->i_caps.rb_node; + + while (n) { + cap = rb_entry(n, struct ceph_cap, ci_node); + if (mds < cap->mds) + n = n->rb_left; + else if (mds > cap->mds) + n = n->rb_right; + else + return cap; + } + return NULL; +} + +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->vfs_inode.i_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->vfs_inode.i_lock); + return cap; +} + +/* + * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. + */ +static int __ceph_get_cap_mds(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + int mds = -1; + struct rb_node *p; + + /* prefer mds with WR|BUFFER|EXCL caps */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + mds = cap->mds; + if (cap->issued & (CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_BUFFER | + CEPH_CAP_FILE_EXCL)) + break; + } + return mds; +} + +int ceph_get_cap_mds(struct inode *inode) +{ + int mds; + spin_lock(&inode->i_lock); + mds = __ceph_get_cap_mds(ceph_inode(inode)); + spin_unlock(&inode->i_lock); + return mds; +} + +/* + * Called under i_lock. + */ +static void __insert_cap_node(struct ceph_inode_info *ci, + struct ceph_cap *new) +{ + struct rb_node **p = &ci->i_caps.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap *cap = NULL; + + while (*p) { + parent = *p; + cap = rb_entry(parent, struct ceph_cap, ci_node); + if (new->mds < cap->mds) + p = &(*p)->rb_left; + else if (new->mds > cap->mds) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->ci_node, parent, p); + rb_insert_color(&new->ci_node, &ci->i_caps); +} + +/* + * (re)set cap hold timeouts, which control the delayed release + * of unused caps back to the MDS. Should be called on cap use. + */ +static void __cap_set_timeouts(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + struct ceph_mount_options *ma = mdsc->fsc->mount_options; + + ci->i_hold_caps_min = round_jiffies(jiffies + + ma->caps_wanted_delay_min * HZ); + ci->i_hold_caps_max = round_jiffies(jiffies + + ma->caps_wanted_delay_max * HZ); + dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, + ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); +} + +/* + * (Re)queue cap at the end of the delayed cap release list. + * + * If I_FLUSH is set, leave the inode at the front of the list. + * + * Caller holds i_lock + * -> we take mdsc->cap_delay_lock + */ +static void __cap_delay_requeue(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + __cap_set_timeouts(mdsc, ci); + dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + ci->i_ceph_flags, ci->i_hold_caps_max); + if (!mdsc->stopping) { + spin_lock(&mdsc->cap_delay_lock); + if (!list_empty(&ci->i_cap_delay_list)) { + if (ci->i_ceph_flags & CEPH_I_FLUSH) + goto no_change; + list_del_init(&ci->i_cap_delay_list); + } + list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); +no_change: + spin_unlock(&mdsc->cap_delay_lock); + } +} + +/* + * Queue an inode for immediate writeback. Mark inode with I_FLUSH, + * indicating we should send a cap message to flush dirty metadata + * asap, and move to the front of the delayed cap list. 
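+ *
+ * This is the non-blocking path used by ceph_write_inode() when the
+ * writeback control does not request WB_SYNC_ALL: instead of flushing
+ * synchronously, the dirty inode is moved to the head of the delay list
+ * so the delayed work picks it up promptly.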
+ */ +static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Cancel delayed work on cap. + * + * Caller must hold i_lock. + */ +static void __cap_delay_cancel(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + if (list_empty(&ci->i_cap_delay_list)) + return; + spin_lock(&mdsc->cap_delay_lock); + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Common issue checks for add_cap, handle_cap_grant. + */ +static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, + unsigned issued) +{ + unsigned had = __ceph_caps_issued(ci, NULL); + + /* + * Each time we receive FILE_CACHE anew, we increment + * i_rdcache_gen. + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + ci->i_rdcache_gen++; + + /* + * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * don't know what happened to this directory while we didn't + * have the cap. + */ + if ((issued & CEPH_CAP_FILE_SHARED) && + (had & CEPH_CAP_FILE_SHARED) == 0) { + ci->i_shared_gen++; + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + } + } +} + +/* + * Add a capability under the given MDS session. + * + * Caller should hold session snap_rwsem (read) and s_mutex. + * + * @fmode is the open file mode, if we are opening a file, otherwise + * it is < 0. (This is so we can atomically add the cap and add an + * open file reference to it.) + */ +int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *new_cap = NULL; + struct ceph_cap *cap; + int mds = session->s_mds; + int actual_wanted; + + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, + session->s_mds, cap_id, ceph_cap_string(issued), seq); + + /* + * If we are opening the file, include file mode wanted bits + * in wanted. + */ + if (fmode >= 0) + wanted |= ceph_caps_for_mode(fmode); + +retry: + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (new_cap) { + cap = new_cap; + new_cap = NULL; + } else { + spin_unlock(&inode->i_lock); + new_cap = get_cap(mdsc, caps_reservation); + if (new_cap == NULL) + return -ENOMEM; + goto retry; + } + + cap->issued = 0; + cap->implemented = 0; + cap->mds = mds; + cap->mds_wanted = 0; + + cap->ci = ci; + __insert_cap_node(ci, cap); + + /* clear out old exporting info? (i.e. 
on cap import) */ + if (ci->i_cap_exporting_mds == mds) { + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + } + + /* add to session cap list */ + cap->session = session; + spin_lock(&session->s_cap_lock); + list_add_tail(&cap->session_caps, &session->s_caps); + session->s_nr_caps++; + spin_unlock(&session->s_cap_lock); + } else if (new_cap) + ceph_put_cap(mdsc, new_cap); + + if (!ci->i_snap_realm) { + /* + * add this inode to the appropriate snap realm + */ + struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, + realmino); + if (realm) { + ceph_get_snap_realm(mdsc, realm); + spin_lock(&realm->inodes_with_caps_lock); + ci->i_snap_realm = realm; + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + spin_unlock(&realm->inodes_with_caps_lock); + } else { + pr_err("ceph_add_cap: couldn't find snap realm %llx\n", + realmino); + WARN_ON(!realm); + } + } + + __check_cap_issue(ci, cap, issued); + + /* + * If we are issued caps we don't want, or the mds' wanted + * value appears to be off, queue a check so we'll release + * later and/or update the mds wanted value. + */ + actual_wanted = __ceph_caps_wanted(ci); + if ((wanted & ~actual_wanted) || + (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { + dout(" issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); + __cap_delay_requeue(mdsc, ci); + } + + if (flags & CEPH_CAP_FLAG_AUTH) + ci->i_auth_cap = cap; + else if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); + cap->cap_id = cap_id; + cap->issued = issued; + cap->implemented |= issued; + cap->mds_wanted |= wanted; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->cap_gen = session->s_cap_gen; + + if (fmode >= 0) + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + wake_up_all(&ci->i_cap_wq); + return 0; +} + +/* + * Return true if cap has not timed out and belongs to the current + * generation of the MDS session (i.e. has not gone 'stale' due to + * us losing touch with the mds). + */ +static int __cap_is_valid(struct ceph_cap *cap) +{ + unsigned long ttl; + u32 gen; + + spin_lock(&cap->session->s_cap_lock); + gen = cap->session->s_cap_gen; + ttl = cap->session->s_cap_ttl; + spin_unlock(&cap->session->s_cap_lock); + + if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { + dout("__cap_is_valid %p cap %p issued %s " + "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + return 0; + } + + return 1; +} + +/* + * Return set of valid cap bits issued to us. Note that caps time + * out, and may be invalidated in bulk if the client session times out + * and session->s_cap_gen is bumped. 
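+ *
+ * The result is the union of i_snap_caps, any caps mid-export
+ * (i_cap_exporting_issued) and the issued bits of every cap that is
+ * still valid per __cap_is_valid().  If @implemented is non-NULL it is
+ * filled with the union of the corresponding implemented bits.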
+ */ +int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) +{ + int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + struct ceph_cap *cap; + struct rb_node *p; + + if (implemented) + *implemented = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + dout("__ceph_caps_issued %p cap %p issued %s\n", + &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + have |= cap->issued; + if (implemented) + *implemented |= cap->implemented; + } + return have; +} + +/* + * Get cap bits issued by caps other than @ocap + */ +int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap == ocap) + continue; + if (!__cap_is_valid(cap)) + continue; + have |= cap->issued; + } + return have; +} + +/* + * Move a cap to the end of the LRU (oldest caps at list head, newest + * at list tail). + */ +static void __touch_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *s = cap->session; + + spin_lock(&s->s_cap_lock); + if (s->s_cap_iterator == NULL) { + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + s->s_mds); + list_move_tail(&cap->session_caps, &s->s_caps); + } else { + dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", + &cap->ci->vfs_inode, cap, s->s_mds); + } + spin_unlock(&s->s_cap_lock); +} + +/* + * Check if we hold the given mask. If so, move the cap(s) to the + * front of their respective LRUs. (This is the preferred way for + * callers to check for caps they want.) + */ +int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) +{ + struct ceph_cap *cap; + struct rb_node *p; + int have = ci->i_snap_caps; + + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p snap issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(have), + ceph_cap_string(mask)); + return 1; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if ((cap->issued & mask) == mask) { + dout("__ceph_caps_issued_mask %p cap %p issued %s" + " (mask %s)\n", &ci->vfs_inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) + __touch_cap(cap); + return 1; + } + + /* does a combination of caps satisfy mask? */ + have |= cap->issued; + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p combo issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) { + struct rb_node *q; + + /* touch this + preceding caps */ + __touch_cap(cap); + for (q = rb_first(&ci->i_caps); q != p; + q = rb_next(q)) { + cap = rb_entry(q, struct ceph_cap, + ci_node); + if (!__cap_is_valid(cap)) + continue; + __touch_cap(cap); + } + } + return 1; + } + } + + return 0; +} + +/* + * Return true if mask caps are currently being revoked by an MDS. 
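+ *
+ * A bit is "revoking" when some still-valid cap has it implemented but
+ * no longer issued, i.e. the MDS has asked for it back and we have not
+ * yet completed the revocation.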
+ */ +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct rb_node *p; + int ret = 0; + + spin_lock(&inode->i_lock); + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (__cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) { + ret = 1; + break; + } + } + spin_unlock(&inode->i_lock); + dout("ceph_caps_revoking %p %s = %d\n", inode, + ceph_cap_string(mask), ret); + return ret; +} + +int __ceph_caps_used(struct ceph_inode_info *ci) +{ + int used = 0; + if (ci->i_pin_ref) + used |= CEPH_CAP_PIN; + if (ci->i_rd_ref) + used |= CEPH_CAP_FILE_RD; + if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + used |= CEPH_CAP_FILE_CACHE; + if (ci->i_wr_ref) + used |= CEPH_CAP_FILE_WR; + if (ci->i_wb_ref || ci->i_wrbuffer_ref) + used |= CEPH_CAP_FILE_BUFFER; + return used; +} + +/* + * wanted, by virtue of open file modes + */ +int __ceph_caps_file_wanted(struct ceph_inode_info *ci) +{ + int want = 0; + int mode; + for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) + if (ci->i_nr_by_mode[mode]) + want |= ceph_caps_for_mode(mode); + return want; +} + +/* + * Return caps we have registered with the MDS(s) as 'wanted'. + */ +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + struct rb_node *p; + int mds_wanted = 0; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + mds_wanted |= cap->mds_wanted; + } + return mds_wanted; +} + +/* + * called under i_lock + */ +static int __ceph_is_any_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; +} + +/* + * Remove a cap. Take steps to deal with a racing iterate_session_caps. + * + * caller should hold i_lock. + * caller will not hold session s_mutex if called from destroy_inode. + */ +void __ceph_remove_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *session = cap->session; + struct ceph_inode_info *ci = cap->ci; + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + int removed = 0; + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + + /* remove from session list */ + spin_lock(&session->s_cap_lock); + if (session->s_cap_iterator == cap) { + /* not yet, we are iterating over this very cap */ + dout("__ceph_remove_cap delaying %p removal from session %p\n", + cap, cap->session); + } else { + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + removed = 1; + } + /* protect backpointer with s_cap_lock: see iterate_session_caps */ + cap->ci = NULL; + spin_unlock(&session->s_cap_lock); + + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + if (removed) + ceph_put_cap(mdsc, cap); + + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + if (!__ceph_is_any_real_caps(ci)) + __cap_delay_cancel(mdsc, ci); +} + +/* + * Build and send a cap message to the given MDS. + * + * Caller should be holding s_mutex. 
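+ *
+ * This allocates a CEPH_MSG_CLIENT_CAPS message, fills the
+ * ceph_mds_caps payload (cap id, op, seq numbers, caps/wanted/dirty
+ * masks, size, timestamps and ownership), optionally attaches the
+ * xattr blob as the message "middle", and queues it on the session's
+ * connection.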
+ */ +static int send_cap_msg(struct ceph_mds_session *session, + u64 ino, u64 cid, int op, + int caps, int wanted, int dirty, + u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, + u64 size, u64 max_size, + struct timespec *mtime, struct timespec *atime, + u64 time_warp_seq, + uid_t uid, gid_t gid, mode_t mode, + u64 xattr_version, + struct ceph_buffer *xattrs_buf, + u64 follows) +{ + struct ceph_mds_caps *fc; + struct ceph_msg *msg; + + dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" + " seq %u/%u mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), + cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), + ceph_cap_string(dirty), + seq, issue_seq, mseq, follows, size, max_size, + xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + if (!msg) + return -ENOMEM; + + msg->hdr.tid = cpu_to_le64(flush_tid); + + fc = msg->front.iov_base; + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(cid); + fc->op = cpu_to_le32(op); + fc->seq = cpu_to_le32(seq); + fc->issue_seq = cpu_to_le32(issue_seq); + fc->migrate_seq = cpu_to_le32(mseq); + fc->caps = cpu_to_le32(caps); + fc->wanted = cpu_to_le32(wanted); + fc->dirty = cpu_to_le32(dirty); + fc->ino = cpu_to_le64(ino); + fc->snap_follows = cpu_to_le64(follows); + + fc->size = cpu_to_le64(size); + fc->max_size = cpu_to_le64(max_size); + if (mtime) + ceph_encode_timespec(&fc->mtime, mtime); + if (atime) + ceph_encode_timespec(&fc->atime, atime); + fc->time_warp_seq = cpu_to_le32(time_warp_seq); + + fc->uid = cpu_to_le32(uid); + fc->gid = cpu_to_le32(gid); + fc->mode = cpu_to_le32(mode); + + fc->xattr_version = cpu_to_le64(xattr_version); + if (xattrs_buf) { + msg->middle = ceph_buffer_get(xattrs_buf); + fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); + } + + ceph_con_send(&session->s_con, msg); + return 0; +} + +static void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + spin_lock(&session->s_cap_lock); + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %llx release to mds%d msg %p (%d left)\n", + ino, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ino); + item->cap_id = cpu_to_le64(cap_id); + item->migrate_seq = cpu_to_le32(migrate_seq); + item->seq = cpu_to_le32(issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * Queue cap releases when an inode is dropped from our cache. Since + * inode is about to be destroyed, there is no need for i_lock. 
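+ *
+ * Walk the inode's cap rb-tree, append one release item per cap to the
+ * owning session's pending release message via __queue_cap_release(),
+ * then drop the cap itself with __ceph_remove_cap().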
+ */ +void ceph_queue_caps_release(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + struct ceph_mds_session *session = cap->session; + + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); + p = rb_next(p); + __ceph_remove_cap(cap); + } +} + +/* + * Send a cap msg on the given inode. Update our caps state, then + * drop i_lock and send the message. + * + * Make note of max_size reported/requested from mds, revoked caps + * that have now been implemented. + * + * Make half-hearted attempt ot to invalidate page cache if we are + * dropping RDCACHE. Note that this will leave behind locked pages + * that we'll then need to deal with elsewhere. + * + * Return non-zero if delayed release, or we experienced an error + * such that the caller should requeue + retry later. + * + * called with i_lock, then drops it. + * caller should hold snap_rwsem (read), s_mutex. + */ +static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + int op, int used, int want, int retain, int flushing, + unsigned *pflush_tid) + __releases(cap->ci->vfs_inode->i_lock) +{ + struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->vfs_inode; + u64 cap_id = cap->cap_id; + int held, revoking, dropping, keep; + u64 seq, issue_seq, mseq, time_warp_seq, follows; + u64 size, max_size; + struct timespec mtime, atime; + int wake = 0; + mode_t mode; + uid_t uid; + gid_t gid; + struct ceph_mds_session *session; + u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; + int delayed = 0; + u64 flush_tid = 0; + int i; + int ret; + + held = cap->issued | cap->implemented; + revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + dropping = cap->issued & ~retain; + + dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", + inode, cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); + BUG_ON((retain & CEPH_CAP_PIN) == 0); + + session = cap->session; + + /* don't release wanted unless we've waited a bit. */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_min)) { + dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + want |= cap->mds_wanted; + retain |= cap->issued; + delayed = 1; + } + ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + + cap->issued &= retain; /* drop bits we don't want */ + if (cap->implemented & ~cap->issued) { + /* + * Wake up any waiters on wanted -> needed transition. + * This is due to the weird transition from buffered + * to sync IO... we need to flush dirty pages _before_ + * allowing sync writes to avoid reordering. + */ + wake = 1; + } + cap->implemented &= cap->issued | used; + cap->mds_wanted = want; + + if (flushing) { + /* + * assign a tid for flush operations so we can avoid + * flush1 -> dirty1 -> flush2 -> flushack1 -> mark + * clean type races. track latest tid for every bit + * so we can handle flush AxFw, flush Fw, and have the + * first ack clean Ax. 
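+ *
+ * Concretely: flushing Ax|Fw stamps the new tid into both per-bit
+ * slots; a later flush of just Fw overwrites only the Fw slot with a
+ * newer tid, so the ack for the first flush can be matched against Ax
+ * without prematurely treating Fw as clean.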
+ */ + flush_tid = ++ci->i_cap_flush_last_tid; + if (pflush_tid) + *pflush_tid = flush_tid; + dout(" cap_flush_tid %d\n", (int)flush_tid); + for (i = 0; i < CEPH_CAP_BITS; i++) + if (flushing & (1 << i)) + ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; + } + + keep = cap->implemented; + seq = cap->seq; + issue_seq = cap->issue_seq; + mseq = cap->mseq; + size = inode->i_size; + ci->i_reported_size = size; + max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = max_size; + mtime = inode->i_mtime; + atime = inode->i_atime; + time_warp_seq = ci->i_time_warp_seq; + uid = inode->i_uid; + gid = inode->i_gid; + mode = inode->i_mode; + + if (flushing & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; + } + + spin_unlock(&inode->i_lock); + + ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, + op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + size, max_size, &mtime, &atime, time_warp_seq, + uid, gid, mode, xattr_version, xattr_blob, + follows); + if (ret < 0) { + dout("error sending cap msg, must requeue %p\n", inode); + delayed = 1; + } + + if (wake) + wake_up_all(&ci->i_cap_wq); + + return delayed; +} + +/* + * When a snapshot is taken, clients accumulate dirty metadata on + * inodes with capabilities in ceph_cap_snaps to describe the file + * state at the time the snapshot was taken. This must be flushed + * asynchronously back to the MDS once sync writes complete and dirty + * data is written out. + * + * Unless @again is true, skip cap_snaps that were already sent to + * the MDS (i.e., during this session). + * + * Called under i_lock. Takes s_mutex as needed. + */ +void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession, + int again) + __releases(ci->vfs_inode->i_lock) + __acquires(ci->vfs_inode->i_lock) +{ + struct inode *inode = &ci->vfs_inode; + int mds; + struct ceph_cap_snap *capsnap; + u32 mseq; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = NULL; /* if session != NULL, we hold + session->s_mutex */ + u64 next_follows = 0; /* keep track of how far we've gotten through the + i_cap_snaps list, and skip these entries next time + around to avoid an infinite loop */ + + if (psession) + session = *psession; + + dout("__flush_snaps %p\n", inode); +retry: + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + /* avoid an infiniute loop after retry */ + if (capsnap->follows < next_follows) + continue; + /* + * we need to wait for sync writes to complete and for dirty + * pages to be written out. + */ + if (capsnap->dirty_pages || capsnap->writing) + break; + + /* + * if cap writeback already occurred, we should have dropped + * the capsnap in ceph_put_wrbuffer_cap_refs. 
+ */ + BUG_ON(capsnap->dirty == 0); + + /* pick mds, take s_mutex */ + if (ci->i_auth_cap == NULL) { + dout("no auth cap (migrating?), doing nothing\n"); + goto out; + } + + /* only flush each capsnap once */ + if (!again && !list_empty(&capsnap->flushing_item)) { + dout("already flushed %p, skipping\n", capsnap); + continue; + } + + mds = ci->i_auth_cap->session->s_mds; + mseq = ci->i_auth_cap->mseq; + + if (session && session->s_mds != mds) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&inode->i_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout("inverting session/ino locks on %p\n", + session); + mutex_lock(&session->s_mutex); + } + /* + * if session == NULL, we raced against a cap + * deletion or migration. retry, and we'll + * get a better @mds value next time. + */ + spin_lock(&inode->i_lock); + goto retry; + } + + capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + atomic_inc(&capsnap->nref); + if (!list_empty(&capsnap->flushing_item)) + list_del_init(&capsnap->flushing_item); + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); + spin_unlock(&inode->i_lock); + + dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", + inode, capsnap, capsnap->follows, capsnap->flush_tid); + send_cap_msg(session, ceph_vino(inode).ino, 0, + CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, + capsnap->size, 0, + &capsnap->mtime, &capsnap->atime, + capsnap->time_warp_seq, + capsnap->uid, capsnap->gid, capsnap->mode, + capsnap->xattr_version, capsnap->xattr_blob, + capsnap->follows); + + next_follows = capsnap->follows + 1; + ceph_put_cap_snap(capsnap); + + spin_lock(&inode->i_lock); + goto retry; + } + + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); + +out: + if (psession) + *psession = session; + else if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } +} + +static void ceph_flush_snaps(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + __ceph_flush_snaps(ci, NULL, 0); + spin_unlock(&inode->i_lock); +} + +/* + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. 
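+ *
+ * On the first dirty bit we pin the head snap context, add the inode to
+ * mdsc->cap_dirty and, if nothing is already flushing, take an inode
+ * reference; the returned I_DIRTY_SYNC / I_DIRTY_DATASYNC flags tell
+ * the caller how to mark the VFS inode dirty.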
+ */ +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +{ + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct inode *inode = &ci->vfs_inode; + int was = ci->i_dirty_caps; + int dirty = 0; + + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + ceph_cap_string(mask), ceph_cap_string(was), + ceph_cap_string(was | mask)); + ci->i_dirty_caps |= mask; + if (was == 0) { + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, + ci->i_head_snapc); + BUG_ON(!list_empty(&ci->i_dirty_item)); + spin_lock(&mdsc->cap_dirty_lock); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + spin_unlock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + ihold(inode); + dirty |= I_DIRTY_SYNC; + } + } + BUG_ON(list_empty(&ci->i_dirty_item)); + if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && + (mask & CEPH_CAP_FILE_BUFFER)) + dirty |= I_DIRTY_DATASYNC; + __cap_delay_requeue(mdsc, ci); + return dirty; +} + +/* + * Add dirty inode to the flushing list. Assigned a seq number so we + * can wait for caps to flush without starving. + * + * Called under i_lock. + */ +static int __mark_caps_flushing(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int flushing; + + BUG_ON(ci->i_dirty_caps == 0); + BUG_ON(list_empty(&ci->i_dirty_item)); + + flushing = ci->i_dirty_caps; + dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); + ci->i_flushing_caps |= flushing; + ci->i_dirty_caps = 0; + dout(" inode %p now !dirty\n", inode); + + spin_lock(&mdsc->cap_dirty_lock); + list_del_init(&ci->i_dirty_item); + + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); + mdsc->num_cap_flushing++; + dout(" inode %p now flushing seq %lld\n", inode, + ci->i_cap_flush_seq); + } else { + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + dout(" inode %p now flushing (more) seq %lld\n", inode, + ci->i_cap_flush_seq); + } + spin_unlock(&mdsc->cap_dirty_lock); + + return flushing; +} + +/* + * try to invalidate mapping pages without blocking. + */ +static int try_nonblocking_invalidate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u32 invalidating_gen = ci->i_rdcache_gen; + + spin_unlock(&inode->i_lock); + invalidate_mapping_pages(&inode->i_data, 0, -1); + spin_lock(&inode->i_lock); + + if (inode->i_data.nrpages == 0 && + invalidating_gen == ci->i_rdcache_gen) { + /* success. */ + dout("try_nonblocking_invalidate %p success\n", inode); + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; + return 0; + } + dout("try_nonblocking_invalidate %p failed\n", inode); + return -1; +} + +/* + * Swiss army knife function to examine currently used and wanted + * versus held caps. Release, flush, ack revoked caps to mds as + * appropriate. + * + * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay + * cap release further. + * CHECK_CAPS_AUTHONLY - we should only check the auth cap + * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without + * further delay. 
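+ *
+ * A typical caller is the write path: ceph_write_end() above invokes
+ * ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL) when a size change may
+ * need to be reported to the auth MDS.  @session may be NULL; the
+ * appropriate session mutex is then taken (and the scan retried)
+ * internally.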
+ */ +void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int file_wanted, used; + int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ + int issued, implemented, want, retain, revoking, flushing = 0; + int mds = -1; /* keep track of how far we've gone through i_caps list + to avoid an infinite loop on retry */ + struct rb_node *p; + int tried_invalidate = 0; + int delayed = 0, sent = 0, force_requeue = 0, num; + int queue_invalidate = 0; + int is_delayed = flags & CHECK_CAPS_NODELAY; + + /* if we are unmounting, flush any unused caps immediately. */ + if (mdsc->stopping) + is_delayed = 1; + + spin_lock(&inode->i_lock); + + if (ci->i_ceph_flags & CEPH_I_FLUSH) + flags |= CHECK_CAPS_FLUSH; + + /* flush snaps first time around only */ + if (!list_empty(&ci->i_cap_snaps)) + __ceph_flush_snaps(ci, &session, 0); + goto retry_locked; +retry: + spin_lock(&inode->i_lock); +retry_locked: + file_wanted = __ceph_caps_file_wanted(ci); + used = __ceph_caps_used(ci); + want = file_wanted | used; + issued = __ceph_caps_issued(ci, &implemented); + revoking = implemented & ~issued; + + retain = want | CEPH_CAP_PIN; + if (!mdsc->stopping && inode->i_nlink > 0) { + if (want) { + retain |= CEPH_CAP_ANY; /* be greedy */ + } else { + retain |= CEPH_CAP_ANY_SHARED; + /* + * keep RD only if we didn't have the file open RW, + * because then the mds would revoke it anyway to + * journal max_size=0. + */ + if (ci->i_max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } + } + + dout("check_caps %p file_want %s used %s dirty %s flushing %s" + " issued %s revoking %s retain %s %s%s%s\n", inode, + ceph_cap_string(file_wanted), + ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(issued), ceph_cap_string(revoking), + ceph_cap_string(retain), + (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", + (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + + /* + * If we no longer need to hold onto old our caps, and we may + * have cached pages, but don't want them, then try to invalidate. + * If we fail, it's because pages are locked.... try again later. + */ + if ((!is_delayed || mdsc->stopping) && + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ + !tried_invalidate) { + dout("check_caps trying to invalidate on %p\n", inode); + if (try_nonblocking_invalidate(inode) < 0) { + if (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) { + dout("check_caps queuing invalidate\n"); + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } else { + dout("check_caps failed to invalidate pages\n"); + /* we failed to invalidate pages. check these + caps again later. 
*/ + force_requeue = 1; + __cap_set_timeouts(mdsc, ci); + } + } + tried_invalidate = 1; + goto retry_locked; + } + + num = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + num++; + + /* avoid looping forever */ + if (mds >= cap->mds || + ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) + continue; + + /* NOTE: no side-effects allowed, until we take s_mutex */ + + revoking = cap->implemented & ~cap->issued; + dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); + + if (cap == ci->i_auth_cap && + (cap->issued & CEPH_CAP_FILE_WR)) { + /* request larger max_size from MDS? */ + if (ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) { + dout("requesting new max_size\n"); + goto ack; + } + + /* approaching file_max? */ + if ((inode->i_size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) { + dout("i_size approaching max_size\n"); + goto ack; + } + } + /* flush anything dirty? */ + if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && + ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + + /* completed revocation? going down and there are no caps? */ + if (revoking && (revoking & used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* want more caps from mds? */ + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + + /* things we might delay */ + if ((cap->issued & ~retain) == 0 && + cap->mds_wanted == want) + continue; /* nope, all good */ + + if (is_delayed) + goto ack; + + /* delay? */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) { + dout(" delaying issued %s -> %s, wanted %s -> %s\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + delayed++; + continue; + } + +ack: + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout(" skipping %p I_NOFLUSH set\n", inode); + continue; + } + + if (session && session != cap->session) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + session = NULL; + } + if (!session) { + session = cap->session; + if (mutex_trylock(&session->s_mutex) == 0) { + dout("inverting session/ino locks on %p\n", + session); + spin_unlock(&inode->i_lock); + if (took_snap_rwsem) { + up_read(&mdsc->snap_rwsem); + took_snap_rwsem = 0; + } + mutex_lock(&session->s_mutex); + goto retry; + } + } + /* take snap_rwsem after session mutex */ + if (!took_snap_rwsem) { + if (down_read_trylock(&mdsc->snap_rwsem) == 0) { + dout("inverting snap/in locks on %p\n", + inode); + spin_unlock(&inode->i_lock); + down_read(&mdsc->snap_rwsem); + took_snap_rwsem = 1; + goto retry; + } + took_snap_rwsem = 1; + } + + if (cap == ci->i_auth_cap && ci->i_dirty_caps) + flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; + + mds = cap->mds; /* remember mds, so we don't repeat */ + sent++; + + /* __send_cap drops i_lock */ + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, + retain, flushing, NULL); + goto retry; /* retake i_lock and restart our cap scan. */ + } + + /* + * Reschedule delayed caps release if we delayed anything, + * otherwise cancel. 
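+ *
+ * That is: a delayed-work call that still delayed something forces a
+ * requeue; a non-delayed call that delayed nothing cancels the timer;
+ * any other non-delayed call (or a forced requeue) puts the inode back
+ * on the delay list with fresh timeouts, while a delayed-work call that
+ * delayed nothing leaves things as they are.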
+ */ + if (delayed && is_delayed) + force_requeue = 1; /* __send_cap delayed release; requeue */ + if (!delayed && !is_delayed) + __cap_delay_cancel(mdsc, ci); + else if (!is_delayed || force_requeue) + __cap_delay_requeue(mdsc, ci); + + spin_unlock(&inode->i_lock); + + if (queue_invalidate) + ceph_queue_invalidate(inode); + + if (session) + mutex_unlock(&session->s_mutex); + if (took_snap_rwsem) + up_read(&mdsc->snap_rwsem); +} + +/* + * Try to flush dirty caps back to the auth mds. + */ +static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, + unsigned *flush_tid) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int unlock_session = session ? 0 : 1; + int flushing = 0; + +retry: + spin_lock(&inode->i_lock); + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); + goto out; + } + if (ci->i_dirty_caps && ci->i_auth_cap) { + struct ceph_cap *cap = ci->i_auth_cap; + int used = __ceph_caps_used(ci); + int want = __ceph_caps_wanted(ci); + int delayed; + + if (!session) { + spin_unlock(&inode->i_lock); + session = cap->session; + mutex_lock(&session->s_mutex); + goto retry; + } + BUG_ON(session != cap->session); + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) + goto out; + + flushing = __mark_caps_flushing(inode, session); + + /* __send_cap drops i_lock */ + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, + cap->issued | cap->implemented, flushing, + flush_tid); + if (!delayed) + goto out_unlocked; + + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + } +out: + spin_unlock(&inode->i_lock); +out_unlocked: + if (session && unlock_session) + mutex_unlock(&session->s_mutex); + return flushing; +} + +/* + * Return true if we've flushed caps through the given flush_tid. + */ +static int caps_are_flushed(struct inode *inode, unsigned tid) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int i, ret = 1; + + spin_lock(&inode->i_lock); + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((ci->i_flushing_caps & (1 << i)) && + ci->i_cap_flush_tid[i] <= tid) { + /* still flushing this bit */ + ret = 0; + break; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* + * Wait on any unsafe replies for the given inode. First wait on the + * newest request, and make that the upper bound. Then, if there are + * more requests, keep waiting on the oldest as long as it is still older + * than the original request. 
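+ *
+ * In other words: the first wait is on the newest request, whose tid
+ * becomes the bound; after that we keep waiting on whichever request is
+ * oldest, re-checking the (possibly shrinking) list each time, until
+ * the oldest remaining tid is no longer below that bound.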
+ */ +static void sync_write_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_writes; + struct ceph_osd_request *req; + u64 last_tid; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + /* set upper bound as _last_ entry in chain */ + req = list_entry(head->prev, struct ceph_osd_request, + r_unsafe_item); + last_tid = req->r_tid; + + do { + ceph_osdc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("sync_write_wait on tid %llu (until %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + spin_lock(&ci->i_unsafe_lock); + ceph_osdc_put_request(req); + + /* + * from here on look at first entry in chain, since we + * only want to wait for anything older than last_tid + */ + if (list_empty(head)) + break; + req = list_entry(head->next, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} + +int ceph_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int ret; + int dirty; + + dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); + sync_write_wait(inode); + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret < 0) + return ret; + + dirty = try_flush_caps(inode, NULL, &flush_tid); + dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + + /* + * only wait on non-file metadata writeback (the mds + * can recover size and mtime, so we don't need to + * wait for that) + */ + if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + dout("fsync waiting for flush_tid %u\n", flush_tid); + ret = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } + + dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + return ret; +} + +/* + * Flush any dirty caps back to the mds. If we aren't asked to wait, + * queue inode for flush but don't do so immediately, because we can + * get by with fewer MDS messages if we wait for data writeback to + * complete first. + */ +int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int err = 0; + int dirty; + int wait = wbc->sync_mode == WB_SYNC_ALL; + + dout("write_inode %p wait=%d\n", inode, wait); + if (wait) { + dirty = try_flush_caps(inode, NULL, &flush_tid); + if (dirty) + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } else { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&inode->i_lock); + if (__ceph_caps_dirty(ci)) + __cap_delay_requeue_front(mdsc, ci); + spin_unlock(&inode->i_lock); + } + return err; +} + +/* + * After a recovering MDS goes active, we need to resend any caps + * we were flushing. + * + * Caller holds session->s_mutex. 
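+ *
+ * Walks session->s_cap_snaps_flushing and re-runs __ceph_flush_snaps()
+ * with again=1 for every inode whose auth cap still belongs to this
+ * session, so capsnaps that were already sent get re-sent to the
+ * recovered MDS.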
+ */ +static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_cap_snap *capsnap; + + dout("kick_flushing_capsnaps mds%d\n", session->s_mds); + list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, + flushing_item) { + struct ceph_inode_info *ci = capsnap->ci; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, + cap, capsnap); + __ceph_flush_snaps(ci, &session, 1); + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + } + spin_unlock(&inode->i_lock); + } +} + +void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + + kick_flushing_capsnaps(mdsc, session); + + dout("kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p %s\n", inode, + cap, ceph_cap_string(ci->i_flushing_caps)); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&inode->i_lock); + } + } +} + +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + spin_unlock(&inode->i_lock); + } +} + + +/* + * Take references to capabilities we hold, so that we don't release + * them to the MDS prematurely. + * + * Protected by i_lock. + */ +static void __take_cap_refs(struct ceph_inode_info *ci, int got) +{ + if (got & CEPH_CAP_PIN) + ci->i_pin_ref++; + if (got & CEPH_CAP_FILE_RD) + ci->i_rd_ref++; + if (got & CEPH_CAP_FILE_CACHE) + ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_WR) + ci->i_wr_ref++; + if (got & CEPH_CAP_FILE_BUFFER) { + if (ci->i_wb_ref == 0) + ihold(&ci->vfs_inode); + ci->i_wb_ref++; + dout("__take_cap_refs %p wb %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + } +} + +/* + * Try to grab cap references. Specify those refs we @want, and the + * minimal set we @need. Also include the larger offset we are writing + * to (when applicable), and check against max_size here as well. + * Note that caller is responsible for ensuring max_size increases are + * requested from the MDS. 
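+ *
+ * Returns non-zero when there is nothing left to wait for: either the
+ * refs were taken (*got filled in), the file is not open for the needed
+ * access (*err = -EBADF), or a larger max_size must be requested first
+ * (*check_max set).  ceph_get_caps() below wraps this in
+ * wait_event_interruptible() and retries as needed.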
+ */ +static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t endoff, int *check_max, int *err) +{ + struct inode *inode = &ci->vfs_inode; + int ret = 0; + int have, implemented; + int file_wanted; + + dout("get_cap_refs %p need %s want %s\n", inode, + ceph_cap_string(need), ceph_cap_string(want)); + spin_lock(&inode->i_lock); + + /* make sure file is actually open */ + file_wanted = __ceph_caps_file_wanted(ci); + if ((file_wanted & need) == 0) { + dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", + ceph_cap_string(need), ceph_cap_string(file_wanted)); + *err = -EBADF; + ret = 1; + goto out; + } + + if (need & CEPH_CAP_FILE_WR) { + if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { + dout("get_cap_refs %p endoff %llu > maxsize %llu\n", + inode, endoff, ci->i_max_size); + if (endoff > ci->i_wanted_max_size) { + *check_max = 1; + ret = 1; + } + goto out; + } + /* + * If a sync write is in progress, we must wait, so that we + * can get a final snapshot value for size+mtime. + */ + if (__ceph_have_pending_cap_snap(ci)) { + dout("get_cap_refs %p cap_snap_pending\n", inode); + goto out; + } + } + have = __ceph_caps_issued(ci, &implemented); + + /* + * disallow writes while a truncate is pending + */ + if (ci->i_truncate_pending) + have &= ~CEPH_CAP_FILE_WR; + + if ((have & need) == need) { + /* + * Look at (implemented & ~have & not) so that we keep waiting + * on transition from wanted -> needed caps. This is needed + * for WRBUFFER|WR -> WR to avoid a new WR sync write from + * going before a prior buffered writeback happens. + */ + int not = want & ~(have & need); + int revoking = implemented & ~have; + dout("get_cap_refs %p have %s but not %s (revoking %s)\n", + inode, ceph_cap_string(have), ceph_cap_string(not), + ceph_cap_string(revoking)); + if ((revoking & not) == 0) { + *got = need | (have & want); + __take_cap_refs(ci, *got); + ret = 1; + } + } else { + dout("get_cap_refs %p have %s needed %s\n", inode, + ceph_cap_string(have), ceph_cap_string(need)); + } +out: + spin_unlock(&inode->i_lock); + dout("get_cap_refs %p ret %d got %s\n", inode, + ret, ceph_cap_string(*got)); + return ret; +} + +/* + * Check the offset we are writing up to against our current + * max_size. If necessary, tell the MDS we want to write to + * a larger offset. + */ +static void check_max_size(struct inode *inode, loff_t endoff) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int check = 0; + + /* do we need to explicitly request a larger max_size? */ + spin_lock(&inode->i_lock); + if ((endoff >= ci->i_max_size || + endoff > (inode->i_size << 1)) && + endoff > ci->i_wanted_max_size) { + dout("write %p at large endoff %llu, req max_size\n", + inode, endoff); + ci->i_wanted_max_size = endoff; + check = 1; + } + spin_unlock(&inode->i_lock); + if (check) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); +} + +/* + * Wait for caps, and take cap references. If we can't get a WR cap + * due to a small max_size, make sure we check_max_size (and possibly + * ask the mds) so we don't get hung up indefinitely. + */ +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, + loff_t endoff) +{ + int check_max, ret, err; + +retry: + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); + check_max = 0; + err = 0; + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, + got, endoff, + &check_max, &err)); + if (err) + ret = err; + if (check_max) + goto retry; + return ret; +} + +/* + * Take cap refs. 
Caller must already know we hold at least one ref + * on the caps in question or we don't know this is safe. + */ +void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) +{ + spin_lock(&ci->vfs_inode.i_lock); + __take_cap_refs(ci, caps); + spin_unlock(&ci->vfs_inode.i_lock); +} + +/* + * Release cap refs. + * + * If we released the last ref on any given cap, call ceph_check_caps + * to release (or schedule a release). + * + * If we are releasing a WR cap (from a sync write), finalize any affected + * cap_snap, and wake up any waiters. + */ +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0, put = 0, flushsnaps = 0, wake = 0; + struct ceph_cap_snap *capsnap; + + spin_lock(&inode->i_lock); + if (had & CEPH_CAP_PIN) + --ci->i_pin_ref; + if (had & CEPH_CAP_FILE_RD) + if (--ci->i_rd_ref == 0) + last++; + if (had & CEPH_CAP_FILE_CACHE) + if (--ci->i_rdcache_ref == 0) + last++; + if (had & CEPH_CAP_FILE_BUFFER) { + if (--ci->i_wb_ref == 0) { + last++; + put++; + } + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); + } + if (had & CEPH_CAP_FILE_WR) + if (--ci->i_wr_ref == 0) { + last++; + if (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + if (capsnap->writing) { + capsnap->writing = 0; + flushsnaps = + __ceph_finish_cap_snap(ci, + capsnap); + wake = 1; + } + } + } + spin_unlock(&inode->i_lock); + + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), + last ? " last" : "", put ? " put" : ""); + + if (last && !flushsnaps) + ceph_check_caps(ci, 0, NULL); + else if (flushsnaps) + ceph_flush_snaps(ci); + if (wake) + wake_up_all(&ci->i_cap_wq); + if (put) + iput(inode); +} + +/* + * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap + * context. Adjust per-snap dirty page accounting as appropriate. + * Once all dirty data for a cap_snap is flushed, flush snapped file + * metadata back to the MDS. If we dropped the last ref, call + * ceph_check_caps. + */ +void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + int complete_capsnap = 0; + int drop_capsnap = 0; + int found = 0; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&inode->i_lock); + ci->i_wrbuffer_ref -= nr; + last = !ci->i_wrbuffer_ref; + + if (ci->i_head_snapc == snapc) { + ci->i_wrbuffer_ref_head -= nr; + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", + inode, + ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + last ? " LAST" : ""); + } else { + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + found = 1; + break; + } + } + BUG_ON(!found); + capsnap->dirty_pages -= nr; + if (capsnap->dirty_pages == 0) { + complete_capsnap = 1; + if (capsnap->dirty == 0) + /* cap writeback completed before we created + * the cap_snap; no FLUSHSNAP is needed */ + drop_capsnap = 1; + } + dout("put_wrbuffer_cap_refs on %p cap_snap %p " + " snap %lld %d/%d -> %d/%d %s%s%s\n", + inode, capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? 
" (wrbuffer last)" : "", + complete_capsnap ? " (complete capsnap)" : "", + drop_capsnap ? " (drop capsnap)" : ""); + if (drop_capsnap) { + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + } + } + + spin_unlock(&inode->i_lock); + + if (last) { + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + iput(inode); + } else if (complete_capsnap) { + ceph_flush_snaps(ci); + wake_up_all(&ci->i_cap_wq); + } + if (drop_capsnap) + iput(inode); +} + +/* + * Handle a cap GRANT message from the MDS. (Note that a GRANT may + * actually be a revocation if it specifies a smaller cap set.) + * + * caller holds s_mutex and i_lock, we drop both. + * + * return value: + * 0 - ok + * 1 - check_caps on auth cap only (writeback) + * 2 - check_caps (ack revoke) + */ +static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, + struct ceph_mds_session *session, + struct ceph_cap *cap, + struct ceph_buffer *xattr_buf) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(grant->seq); + int newcaps = le32_to_cpu(grant->caps); + int issued, implemented, used, wanted, dirty; + u64 size = le64_to_cpu(grant->size); + u64 max_size = le64_to_cpu(grant->max_size); + struct timespec mtime, atime, ctime; + int check_caps = 0; + int wake = 0; + int writeback = 0; + int revoked_rdcache = 0; + int queue_invalidate = 0; + + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); + dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, + inode->i_size); + + /* + * If CACHE is being revoked, and we have no dirty buffers, + * try to invalidate (once). (If there are dirty buffers, we + * will invalidate _after_ writeback.) + */ + if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + !ci->i_wrbuffer_ref) { + if (try_nonblocking_invalidate(inode) == 0) { + revoked_rdcache = 1; + } else { + /* there were locked pages.. invalidate later + in a separate thread. */ + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } + } + } + + /* side effects now are allowed */ + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + + cap->cap_gen = session->s_cap_gen; + + __check_cap_issue(ci, cap, newcaps); + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(grant->mode); + inode->i_uid = le32_to_cpu(grant->uid); + inode->i_gid = le32_to_cpu(grant->gid); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + inode->i_uid, inode->i_gid); + } + + if ((issued & CEPH_CAP_LINK_EXCL) == 0) + inode->i_nlink = le32_to_cpu(grant->nlink); + + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { + int len = le32_to_cpu(grant->xattr_len); + u64 version = le64_to_cpu(grant->xattr_version); + + if (version > ci->i_xattrs.version) { + dout(" got new xattrs v%llu on %p len %d\n", + version, inode, len); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); + ci->i_xattrs.version = version; + } + } + + /* size/ctime/mtime/atime? 
*/ + ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), size); + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, + &atime); + + /* max size increase? */ + if (max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; + } + + /* check cap bits */ + wanted = __ceph_caps_wanted(ci); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + dout(" my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), + ceph_cap_string(used), + ceph_cap_string(dirty)); + if (wanted != le32_to_cpu(grant->wanted)) { + dout("mds wanted %s -> %s\n", + ceph_cap_string(le32_to_cpu(grant->wanted)), + ceph_cap_string(wanted)); + grant->wanted = cpu_to_le32(wanted); + } + + cap->seq = seq; + + /* file layout may have changed */ + ci->i_layout = grant->layout; + + /* revocation, grant, or no-op? */ + if (cap->issued & ~newcaps) { + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (revoking & used & CEPH_CAP_FILE_BUFFER) + writeback = 1; /* initiate writeback; will delay ack */ + else if (revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + queue_invalidate) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + cap->issued = newcaps; + cap->implemented |= newcaps; + } else if (cap->issued == newcaps) { + dout("caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + } else { + dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); + cap->issued = newcaps; + cap->implemented |= newcaps; /* add bits only, to + * avoid stepping on a + * pending revocation */ + wake = 1; + } + BUG_ON(cap->issued & ~cap->implemented); + + spin_unlock(&inode->i_lock); + if (writeback) + /* + * queue inode for writeback: we can't actually call + * filemap_write_and_wait, etc. from message handler + * context. + */ + ceph_queue_writeback(inode); + if (queue_invalidate) + ceph_queue_invalidate(inode); + if (wake) + wake_up_all(&ci->i_cap_wq); + + if (check_caps == 1) + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + session); + else if (check_caps == 2) + ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); + else + mutex_unlock(&session->s_mutex); +} + +/* + * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the + * MDS has been safely committed. 
+ */ +static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session, + struct ceph_cap *cap) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + unsigned seq = le32_to_cpu(m->seq); + int dirty = le32_to_cpu(m->dirty); + int cleaned = 0; + int drop = 0; + int i; + + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((dirty & (1 << i)) && + flush_tid == ci->i_cap_flush_tid[i]) + cleaned |= 1 << i; + + dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," + " flushing %s -> %s\n", + inode, session->s_mds, seq, ceph_cap_string(dirty), + ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + + if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + goto out; + + ci->i_flushing_caps &= ~cleaned; + + spin_lock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + mdsc->num_cap_flushing--; + wake_up_all(&mdsc->cap_flushing_wq); + dout(" inode %p now !flushing\n", inode); + + if (ci->i_dirty_caps == 0) { + dout(" inode %p now clean\n", inode); + BUG_ON(!list_empty(&ci->i_dirty_item)); + drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } else { + BUG_ON(list_empty(&ci->i_dirty_item)); + } + } + spin_unlock(&mdsc->cap_dirty_lock); + wake_up_all(&ci->i_cap_wq); + +out: + spin_unlock(&inode->i_lock); + if (drop) + iput(inode); +} + +/* + * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can + * throw away our cap_snap. + * + * Caller hold s_mutex. + */ +static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 follows = le64_to_cpu(m->snap_follows); + struct ceph_cap_snap *capsnap; + int drop = 0; + + dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", + inode, ci, session->s_mds, follows); + + spin_lock(&inode->i_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->follows == follows) { + if (capsnap->flush_tid != flush_tid) { + dout(" cap_snap %p follows %lld tid %lld !=" + " %lld\n", capsnap, follows, + flush_tid, capsnap->flush_tid); + break; + } + WARN_ON(capsnap->dirty_pages || capsnap->writing); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + drop = 1; + break; + } else { + dout(" skipping cap_snap %p follows %lld\n", + capsnap, capsnap->follows); + } + } + spin_unlock(&inode->i_lock); + if (drop) + iput(inode); +} + +/* + * Handle TRUNC from MDS, indicating file truncation. + * + * caller hold s_mutex. 
+ */ +static void handle_cap_trunc(struct inode *inode, + struct ceph_mds_caps *trunc, + struct ceph_mds_session *session) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(trunc->seq); + u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); + u64 truncate_size = le64_to_cpu(trunc->truncate_size); + u64 size = le64_to_cpu(trunc->size); + int implemented = 0; + int dirty = __ceph_caps_dirty(ci); + int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); + int queue_trunc = 0; + + issued |= implemented | dirty; + + dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", + inode, mds, seq, truncate_size, truncate_seq); + queue_trunc = ceph_fill_file_size(inode, issued, + truncate_seq, truncate_size, size); + spin_unlock(&inode->i_lock); + + if (queue_trunc) + ceph_queue_vmtruncate(inode); +} + +/* + * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a + * different one. If we are the most recent migration we've seen (as + * indicated by mseq), make note of the migrating cap bits for the + * duration (until we see the corresponding IMPORT). + * + * caller holds s_mutex + */ +static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, + struct ceph_mds_session *session, + int *open_target_sessions) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + unsigned mseq = le32_to_cpu(ex->migrate_seq); + struct ceph_cap *cap = NULL, *t; + struct rb_node *p; + int remember = 1; + + dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", + inode, ci, mds, mseq); + + spin_lock(&inode->i_lock); + + /* make sure we haven't seen a higher mseq */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + t = rb_entry(p, struct ceph_cap, ci_node); + if (ceph_seq_cmp(t->mseq, mseq) > 0) { + dout(" higher mseq on cap from mds%d\n", + t->session->s_mds); + remember = 0; + } + if (t->session->s_mds == mds) + cap = t; + } + + if (cap) { + if (remember) { + /* make note */ + ci->i_cap_exporting_mds = mds; + ci->i_cap_exporting_mseq = mseq; + ci->i_cap_exporting_issued = cap->issued; + + /* + * make sure we have open sessions with all possible + * export targets, so that we get the matching IMPORT + */ + *open_target_sessions = 1; + + /* + * we can't flush dirty caps that we've seen the + * EXPORT but no IMPORT for + */ + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", + inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } + __ceph_remove_cap(cap); + } + /* else, we already released it */ + + spin_unlock(&inode->i_lock); +} + +/* + * Handle cap IMPORT. If there are temp bits from an older EXPORT, + * clean them up. + * + * caller holds s_mutex. 
+ */ +static void handle_cap_import(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_session *session, + void *snaptrace, int snaptrace_len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + unsigned issued = le32_to_cpu(im->caps); + unsigned wanted = le32_to_cpu(im->wanted); + unsigned seq = le32_to_cpu(im->seq); + unsigned mseq = le32_to_cpu(im->migrate_seq); + u64 realmino = le64_to_cpu(im->realm); + u64 cap_id = le64_to_cpu(im->cap_id); + + if (ci->i_cap_exporting_mds >= 0 && + ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { + dout("handle_cap_import inode %p ci %p mds%d mseq %d" + " - cleared exporting from mds%d\n", + inode, ci, mds, mseq, + ci->i_cap_exporting_mds); + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p back to cap_dirty\n", inode); + list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + } + spin_unlock(&mdsc->cap_dirty_lock); + } else { + dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", + inode, ci, mds, mseq); + } + + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, + false); + downgrade_write(&mdsc->snap_rwsem); + ceph_add_cap(inode, session, cap_id, -1, + issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, + NULL /* no caps context */); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); +} + +/* + * Handle a caps message from the MDS. + * + * Identify the appropriate session, inode, and call the right handler + * based on the cap op. 
+ */ +void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap; + struct ceph_mds_caps *h; + int mds = session->s_mds; + int op; + u32 seq, mseq; + struct ceph_vino vino; + u64 cap_id; + u64 size, max_size; + u64 tid; + void *snaptrace; + size_t snaptrace_len; + void *flock; + u32 flock_len; + int open_target_sessions = 0; + + dout("handle_caps from mds%d\n", mds); + + /* decode */ + tid = le64_to_cpu(msg->hdr.tid); + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = msg->front.iov_base; + op = le32_to_cpu(h->op); + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + cap_id = le64_to_cpu(h->cap_id); + seq = le32_to_cpu(h->seq); + mseq = le32_to_cpu(h->migrate_seq); + size = le64_to_cpu(h->size); + max_size = le64_to_cpu(h->max_size); + + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + + if (le16_to_cpu(msg->hdr.version) >= 2) { + void *p, *end; + + p = snaptrace + snaptrace_len; + end = msg->front.iov_base + msg->front.iov_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + flock = p; + } else { + flock = NULL; + flock_len = 0; + } + + mutex_lock(&session->s_mutex); + session->s_seq++; + dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, + (unsigned)seq); + + /* lookup ino */ + inode = ceph_find_inode(sb, vino); + dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, + vino.snap, inode); + if (!inode) { + dout(" i don't have ino %llx\n", vino.ino); + + if (op == CEPH_CAP_OP_IMPORT) + __queue_cap_release(session, vino.ino, cap_id, + mseq, seq); + goto flush_cap_releases; + } + + /* these will work even if we don't have a cap yet */ + switch (op) { + case CEPH_CAP_OP_FLUSHSNAP_ACK: + handle_cap_flushsnap_ack(inode, tid, h, session); + goto done; + + case CEPH_CAP_OP_EXPORT: + handle_cap_export(inode, h, session, &open_target_sessions); + goto done; + + case CEPH_CAP_OP_IMPORT: + handle_cap_import(mdsc, inode, h, session, + snaptrace, snaptrace_len); + ceph_check_caps(ceph_inode(inode), 0, session); + goto done_unlocked; + } + + /* the rest require a cap */ + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ceph_inode(inode), mds); + if (!cap) { + dout(" no cap on %p ino %llx.%llx from mds%d\n", + inode, ceph_ino(inode), ceph_snap(inode), mds); + spin_unlock(&inode->i_lock); + goto flush_cap_releases; + } + + /* note that each of these drops i_lock for us */ + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + handle_cap_grant(inode, h, session, cap, msg->middle); + goto done_unlocked; + + case CEPH_CAP_OP_FLUSH_ACK: + handle_cap_flush_ack(inode, tid, h, session, cap); + break; + + case CEPH_CAP_OP_TRUNC: + handle_cap_trunc(inode, h, session); + break; + + default: + spin_unlock(&inode->i_lock); + pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); + } + + goto done; + +flush_cap_releases: + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). 
+ */ + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + if (inode) + iput(inode); + if (open_target_sessions) + ceph_mdsc_open_export_target_sessions(mdsc, session); + return; + +bad: + pr_err("ceph_handle_caps: corrupt message\n"); + ceph_msg_dump(msg); + return; +} + +/* + * Delayed work handler to process end of delayed cap release LRU list. + */ +void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + int flags = CHECK_CAPS_NODELAY; + + dout("check_delayed_caps\n"); + while (1) { + spin_lock(&mdsc->cap_delay_lock); + if (list_empty(&mdsc->cap_delay_list)) + break; + ci = list_first_entry(&mdsc->cap_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) + break; + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); + dout("check_delayed_caps on %p\n", &ci->vfs_inode); + ceph_check_caps(ci, flags, NULL); + } + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Flush all dirty caps to the mds + */ +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + + dout("flush_dirty_caps\n"); + spin_lock(&mdsc->cap_dirty_lock); + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = &ci->vfs_inode; + ihold(inode); + dout("flush_dirty_caps %p\n", inode); + spin_unlock(&mdsc->cap_dirty_lock); + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); + spin_lock(&mdsc->cap_dirty_lock); + } + spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); +} + +/* + * Drop open file reference. If we were the last open file, + * we may need to release capabilities to the MDS (or schedule + * their delayed release). + */ +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + + spin_lock(&inode->i_lock); + dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, + ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); + BUG_ON(ci->i_nr_by_mode[fmode] == 0); + if (--ci->i_nr_by_mode[fmode] == 0) + last++; + spin_unlock(&inode->i_lock); + + if (last && ci->i_vino.snap == CEPH_NOSNAP) + ceph_check_caps(ci, 0, NULL); +} + +/* + * Helpers for embedding cap and dentry lease releases into mds + * requests. + * + * @force is used by dentry_release (below) to force inclusion of a + * record for the directory inode, even when there aren't any caps to + * drop. 
+ */ +int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + struct ceph_mds_request_release *rel = *p; + int used, dirty; + int ret = 0; + + spin_lock(&inode->i_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), + ceph_cap_string(unless)); + + /* only drop unused, clean caps */ + drop &= ~(used | dirty); + + cap = __get_cap_for_mds(ci, mds); + if (cap && __cap_is_valid(cap)) { + if (force || + ((cap->issued & drop) && + (cap->issued & unless) == 0)) { + if ((cap->issued & drop) && + (cap->issued & unless) == 0) { + dout("encode_inode_release %p cap %p %s -> " + "%s\n", inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop)); + cap->issued &= ~drop; + cap->implemented &= ~drop; + if (ci->i_ceph_flags & CEPH_I_NODELAY) { + int wanted = __ceph_caps_wanted(ci); + dout(" wanted %s -> %s (act %s)\n", + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(cap->mds_wanted & + ~wanted), + ceph_cap_string(wanted)); + cap->mds_wanted &= wanted; + } + } else { + dout("encode_inode_release %p cap %p %s" + " (force)\n", inode, cap, + ceph_cap_string(cap->issued)); + } + + rel->ino = cpu_to_le64(ceph_ino(inode)); + rel->cap_id = cpu_to_le64(cap->cap_id); + rel->seq = cpu_to_le32(cap->seq); + rel->issue_seq = cpu_to_le32(cap->issue_seq), + rel->mseq = cpu_to_le32(cap->mseq); + rel->caps = cpu_to_le32(cap->issued); + rel->wanted = cpu_to_le32(cap->mds_wanted); + rel->dname_len = 0; + rel->dname_seq = 0; + *p += sizeof(*rel); + ret = 1; + } else { + dout("encode_inode_release %p cap %p %s\n", + inode, cap, ceph_cap_string(cap->issued)); + } + } + spin_unlock(&inode->i_lock); + return ret; +} + +int ceph_encode_dentry_release(void **p, struct dentry *dentry, + int mds, int drop, int unless) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct ceph_mds_request_release *rel = *p; + struct ceph_dentry_info *di = ceph_dentry(dentry); + int force = 0; + int ret; + + /* + * force an record for the directory caps if we have a dentry lease. + * this is racy (can't take i_lock and d_lock together), but it + * doesn't have to be perfect; the mds will revoke anything we don't + * release. 
+ */ + spin_lock(&dentry->d_lock); + if (di->lease_session && di->lease_session->s_mds == mds) + force = 1; + spin_unlock(&dentry->d_lock); + + ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + + spin_lock(&dentry->d_lock); + if (ret && di->lease_session && di->lease_session->s_mds == mds) { + dout("encode_dentry_release %p mds%d seq %d\n", + dentry, mds, (int)di->lease_seq); + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + spin_unlock(&dentry->d_lock); + return ret; +} diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c new file mode 100644 index 00000000..bdce8b1f --- /dev/null +++ b/fs/ceph/ceph_frag.c @@ -0,0 +1,22 @@ +/* + * Ceph 'frag' type + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +int ceph_frag_compare(__u32 a, __u32 b) +{ + unsigned va = ceph_frag_value(a); + unsigned vb = ceph_frag_value(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + va = ceph_frag_bits(a); + vb = ceph_frag_bits(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c new file mode 100644 index 00000000..0dba6915 --- /dev/null +++ b/fs/ceph/debugfs.c @@ -0,0 +1,273 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +#include "super.h" + +#ifdef CONFIG_DEBUG_FS + +#include "mds_client.h" + +static int mdsmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_fs_client *fsc = s->private; + + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); + seq_printf(s, "session_timeout %d\n", + fsc->mdsc->mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { + struct ceph_entity_addr *addr = + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; + + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), + ceph_mds_state_name(state)); + } + return 0; +} + +/* + * mdsc debugfs + */ +static int mdsc_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct rb_node *rp; + int pathlen; + u64 pathbase; + char *path; + + mutex_lock(&mdsc->mutex); + for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mds_request, r_node); + + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) + seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); + + seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); + + if (req->r_got_unsafe) + seq_printf(s, "\t(unsafe)"); + else + seq_printf(s, "\t"); + + if (req->r_inode) { + seq_printf(s, " #%llx", ceph_ino(req->r_inode)); + } else if (req->r_dentry) { + path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_dentry->d_parent->d_inode), + req->r_dentry->d_name.len, + req->r_dentry->d_name.name, + path ? 
path : ""); + spin_unlock(&req->r_dentry->d_lock); + kfree(path); + } else if (req->r_path1) { + seq_printf(s, " #%llx/%s", req->r_ino1.ino, + req->r_path1); + } + + if (req->r_old_dentry) { + path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_old_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_old_dentry->d_parent->d_inode), + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + path ? path : ""); + spin_unlock(&req->r_old_dentry->d_lock); + kfree(path); + } else if (req->r_path2) { + if (req->r_ino2.ino) + seq_printf(s, " #%llx/%s", req->r_ino2.ino, + req->r_path2); + else + seq_printf(s, " %s", req->r_path2); + } + + seq_printf(s, "\n"); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +static int caps_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + int total, avail, used, reserved, min; + + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); + seq_printf(s, "total\t\t%d\n" + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n" + "min\t%d\n", + total, avail, used, reserved, min); + return 0; +} + +static int dentry_lru_show(struct seq_file *s, void *ptr) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_dentry_info *di; + + spin_lock(&mdsc->dentry_lru_lock); + list_for_each_entry(di, &mdsc->dentry_lru, lru) { + struct dentry *dentry = di->dentry; + seq_printf(s, "%p %p\t%.*s\n", + di, dentry, dentry->d_name.len, dentry->d_name.name); + } + spin_unlock(&mdsc->dentry_lru_lock); + + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) + + +/* + * debugfs + */ +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + fsc->mount_options->congestion_kb = (int)val; + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + *val = (u64)fsc->mount_options->congestion_kb; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); +} + +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + char name[100]; + int err = -ENOMEM; + + dout("ceph_fs_debugfs_init\n"); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + if (!fsc->debugfs_congestion_kb) + goto out; + + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) + goto out; + + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsmap_show_fops); + if (!fsc->debugfs_mdsmap) + goto out; + + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) + goto out; + + fsc->debugfs_caps = debugfs_create_file("caps", + 0400, + 
fsc->client->debugfs_dir, + fsc, + &caps_show_fops); + if (!fsc->debugfs_caps) + goto out; + + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) + goto out; + + return 0; + +out: + ceph_fs_debugfs_cleanup(fsc); + return err; +} + + +#else /* CONFIG_DEBUG_FS */ + +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + return 0; +} + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c new file mode 100644 index 00000000..ef8f08c3 --- /dev/null +++ b/fs/ceph/dir.c @@ -0,0 +1,1275 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/spinlock.h> +#include <linux/fs_struct.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/sched.h> + +#include "super.h" +#include "mds_client.h" + +/* + * Directory operations: readdir, lookup, create, link, unlink, + * rename, etc. + */ + +/* + * Ceph MDS operations are specified in terms of a base ino and + * relative path. Thus, the client can specify an operation on a + * specific inode (e.g., a getattr due to fstat(2)), or as a path + * relative to, say, the root directory. + * + * Normally, we limit ourselves to strict inode ops (no path component) + * or dentry operations (a single path component relative to an ino). The + * exception to this is open_root_dentry(), which will open the mount + * point by name. + */ + +const struct inode_operations ceph_dir_iops; +const struct file_operations ceph_dir_fops; +const struct dentry_operations ceph_dentry_ops; + +/* + * Initialize ceph dentry state. + */ +int ceph_init_dentry(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + if (dentry->d_fsdata) + return 0; + + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + if (!di) + return -ENOMEM; /* oh well */ + + spin_lock(&dentry->d_lock); + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); + goto out_unlock; + } + di->dentry = dentry; + di->lease_session = NULL; + dentry->d_fsdata = di; + dentry->d_time = jiffies; + ceph_dentry_lru_add(dentry); +out_unlock: + spin_unlock(&dentry->d_lock); + return 0; +} + + + +/* + * for readdir, we encode the directory frag and offset within that + * frag into f_pos. + */ +static unsigned fpos_frag(loff_t p) +{ + return p >> 32; +} +static unsigned fpos_off(loff_t p) +{ + return p & 0xffffffff; +} + +/* + * When possible, we try to satisfy a readdir by peeking at the + * dcache. We make this work by carefully ordering dentries on + * d_u.d_child when we initially get results back from the MDS, and + * falling back to a "normal" sync readdir if any dentries in the dir + * are dropped. + * + * I_COMPLETE indicates we have all dentries in the dir. It is + * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by + * the MDS if/when the directory is modified). 
+ */ +static int __dcache_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + struct ceph_file_info *fi = filp->private_data; + struct dentry *parent = filp->f_dentry; + struct inode *dir = parent->d_inode; + struct list_head *p; + struct dentry *dentry, *last; + struct ceph_dentry_info *di; + int err = 0; + + /* claim ref on last dentry we returned */ + last = fi->dentry; + fi->dentry = NULL; + + dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, + last); + + spin_lock(&parent->d_lock); + + /* start at beginning? */ + if (filp->f_pos == 2 || last == NULL || + filp->f_pos < ceph_dentry(last)->offset) { + if (list_empty(&parent->d_subdirs)) + goto out_unlock; + p = parent->d_subdirs.prev; + dout(" initial p %p/%p\n", p->prev, p->next); + } else { + p = last->d_u.d_child.prev; + } + +more: + dentry = list_entry(p, struct dentry, d_u.d_child); + di = ceph_dentry(dentry); + while (1) { + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? "!hashed" : "hashed", + parent->d_subdirs.prev, parent->d_subdirs.next); + if (p == &parent->d_subdirs) { + fi->at_end = 1; + goto out_unlock; + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!d_unhashed(dentry) && dentry->d_inode && + ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && + ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && + filp->f_pos <= di->offset) + break; + dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, + dentry->d_name.len, dentry->d_name.name, di->offset, + filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + !dentry->d_inode ? " null" : ""); + spin_unlock(&dentry->d_lock); + p = p->prev; + dentry = list_entry(p, struct dentry, d_u.d_child); + di = ceph_dentry(dentry); + } + + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, + dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + filp->f_pos = di->offset; + err = filldir(dirent, dentry->d_name.name, + dentry->d_name.len, di->offset, + ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), + dentry->d_inode->i_mode >> 12); + + if (last) { + if (err < 0) { + /* remember our position */ + fi->dentry = last; + fi->next_offset = di->offset; + } else { + dput(last); + } + } + last = dentry; + + if (err < 0) + goto out; + + filp->f_pos++; + + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { + dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + err = -EAGAIN; + goto out; + } + + spin_lock(&parent->d_lock); + p = p->prev; /* advance to next dentry */ + goto more; + +out_unlock: + spin_unlock(&parent->d_lock); +out: + if (last) + dput(last); + return err; +} + +/* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. 
+ */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len) +{ + kfree(fi->last_name); + fi->last_name = kmalloc(len+1, GFP_NOFS); + if (!fi->last_name) + return -ENOMEM; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned frag = fpos_frag(filp->f_pos); + int off = fpos_off(filp->f_pos); + int err; + u32 ftype; + struct ceph_mds_reply_info_parsed *rinfo; + const int max_entries = fsc->mount_options->max_readdir; + const int max_bytes = fsc->mount_options->max_readdir_bytes; + + dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); + if (fi->at_end) + return 0; + + /* always start with . and .. */ + if (filp->f_pos == 0) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = ci->i_release_count; + + dout("readdir off 0 -> '.'\n"); + if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), + ceph_translate_ino(inode->i_sb, inode->i_ino), + inode->i_mode >> 12) < 0) + return 0; + filp->f_pos = 1; + off = 1; + } + if (filp->f_pos == 1) { + ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino; + dout("readdir off 1 -> '..'\n"); + if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), + ceph_translate_ino(inode->i_sb, ino), + inode->i_mode >> 12) < 0) + return 0; + filp->f_pos = 2; + off = 2; + } + + /* can we use the dcache? */ + spin_lock(&inode->i_lock); + if ((filp->f_pos == 2 || fi->dentry) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && + (ci->i_ceph_flags & CEPH_I_COMPLETE) && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + spin_unlock(&inode->i_lock); + err = __dcache_readdir(filp, dirent, filldir); + if (err != -EAGAIN) + return err; + } else { + spin_unlock(&inode->i_lock); + } + if (fi->dentry) { + err = note_last_dentry(fi, fi->dentry->d_name.name, + fi->dentry->d_name.len); + if (err) + return err; + dput(fi->dentry); + fi->dentry = NULL; + } + + /* proceed with a normal readdir */ + +more: + /* do we have the correct frag content buffered? */ + if (fi->frag != frag || fi->last_readdir == NULL) { + struct ceph_mds_request *req; + int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
+ CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; + + /* discard old result, if any */ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + + /* requery frag tree, as the frag topology may have changed */ + frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); + + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", + ceph_vinop(inode), frag, fi->last_name); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(filp->f_dentry); + /* hints to request -> mds selection code */ + req->r_direct_mode = USE_AUTH_MDS; + req->r_direct_hash = ceph_frag_value(frag); + req->r_direct_is_hash = true; + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + req->r_readdir_offset = fi->next_offset; + req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); + req->r_num_caps = max_entries + 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } + dout("readdir got and parsed readdir result=%d" + " on frag %x, end=%d, complete=%d\n", err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete); + + if (!req->r_did_prepopulate) { + dout("readdir !did_prepopulate"); + fi->dir_release_count--; /* preclude I_COMPLETE */ + } + + /* note next offset and last dentry name */ + fi->offset = fi->next_offset; + fi->last_readdir = req; + + if (req->r_reply_info.dir_end) { + kfree(fi->last_name); + fi->last_name = NULL; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + } else { + rinfo = &req->r_reply_info; + err = note_last_dentry(fi, + rinfo->dir_dname[rinfo->dir_nr-1], + rinfo->dir_dname_len[rinfo->dir_nr-1]); + if (err) + return err; + fi->next_offset += rinfo->dir_nr; + } + } + + rinfo = &fi->last_readdir->r_reply_info; + dout("readdir frag %x num %d off %d chunkoff %d\n", frag, + rinfo->dir_nr, off, fi->offset); + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { + u64 pos = ceph_make_fpos(frag, off); + struct ceph_mds_reply_inode *in = + rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", + off, off - fi->offset, rinfo->dir_nr, pos, + rinfo->dir_dname_len[off - fi->offset], + rinfo->dir_dname[off - fi->offset], in); + BUG_ON(!in); + ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); + if (filldir(dirent, + rinfo->dir_dname[off - fi->offset], + rinfo->dir_dname_len[off - fi->offset], + pos, + ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { + dout("filldir stopping us...\n"); + return 0; + } + off++; + filp->f_pos = pos + 1; + } + + if (fi->last_name) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + goto more; + } + + /* more frags? */ + if (!ceph_frag_is_rightmost(frag)) { + frag = ceph_frag_next(frag); + off = 0; + filp->f_pos = ceph_make_fpos(frag, off); + dout("readdir next frag is %x\n", frag); + goto more; + } + fi->at_end = 1; + + /* + * if dir_release_count still matches the dir, no dentries + * were released during the whole readdir, and we should have + * the complete dir contents in our cache. 
+ */ + spin_lock(&inode->i_lock); + if (ci->i_release_count == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = filp->f_pos; + } + spin_unlock(&inode->i_lock); + + dout("readdir %p filp %p done.\n", inode, filp); + return 0; +} + +static void reset_readdir(struct ceph_file_info *fi) +{ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + kfree(fi->last_name); + fi->last_name = NULL; + fi->next_offset = 2; /* compensate for . and .. */ + if (fi->dentry) { + dput(fi->dentry); + fi->dentry = NULL; + } + fi->at_end = 0; +} + +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_mapping->host; + loff_t old_offset = offset; + loff_t retval; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size + 2; /* FIXME */ + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + fi->at_end = 0; + } + retval = offset; + + /* + * discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk. + */ + if (offset == 0 || + fpos_frag(offset) != fpos_frag(old_offset) || + fpos_off(offset) < fi->offset) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } + + /* bump dir_release_count if we did a forward seek */ + if (offset > old_offset) + fi->dir_release_count--; + } + mutex_unlock(&inode->i_mutex); + return retval; +} + +/* + * Process result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *parent = dentry->d_parent->d_inode; + + /* .snap dir? */ + if (err == -ENOENT && + ceph_snap(parent) == CEPH_NOSNAP && + strcmp(dentry->d_name.name, + fsc->mount_options->snapdir_name) == 0) { + struct inode *inode = ceph_get_snapdir(parent); + dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", + dentry, dentry->d_name.len, dentry->d_name.name, inode); + BUG_ON(!d_unhashed(dentry)); + d_add(dentry, inode); + err = 0; + } + + if (err == -ENOENT) { + /* no trace? */ + err = 0; + if (!req->r_reply_info.head->is_dentry) { + dout("ENOENT and no trace, dentry %p inode %p\n", + dentry, dentry->d_inode); + if (dentry->d_inode) { + d_drop(dentry); + err = -ENOENT; + } else { + d_add(dentry, NULL); + } + } + } + if (err) + dentry = ERR_PTR(err); + else if (dentry != req->r_dentry) + dentry = dget(req->r_dentry); /* we got spliced */ + else + dentry = NULL; + return dentry; +} + +static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +{ + return ceph_ino(inode) == CEPH_INO_ROOT && + strncmp(dentry->d_name.name, ".ceph", 5) == 0; +} + +/* + * Look up a single dir entry. If there is a lookup intent, inform + * the MDS so that it gets our 'caps wanted' value in a single op. 
+ */ +static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int op; + int err; + + dout("lookup %p dentry %p '%.*s'\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + err = ceph_init_dentry(dentry); + if (err < 0) + return ERR_PTR(err); + + /* open (but not create!) intent? */ + if (nd && + (nd->flags & LOOKUP_OPEN) && + (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ + !(nd->intent.open.flags & O_CREAT)) { + int mode = nd->intent.open.create_mode & ~current->fs->umask; + return ceph_lookup_open(dir, dentry, nd, mode, 1); + } + + /* can we conclude ENOENT locally? */ + if (dentry->d_inode == NULL) { + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + + spin_lock(&dir->i_lock); + dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + if (strncmp(dentry->d_name.name, + fsc->mount_options->snapdir_name, + dentry->d_name.len) && + !is_root_ceph_dentry(dir, dentry) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) && + (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + spin_unlock(&dir->i_lock); + dout(" dir %p complete, -ENOENT\n", dir); + d_add(dentry, NULL); + di->lease_shared_gen = ci->i_shared_gen; + return NULL; + } + spin_unlock(&dir->i_lock); + } + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + /* we only need inode linkage */ + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_locked_dir = dir; + err = ceph_mdsc_do_request(mdsc, NULL, req); + dentry = ceph_finish_lookup(req, dentry, err); + ceph_mdsc_put_request(req); /* will dput(dentry) */ + dout("lookup result=%p\n", dentry); + return dentry; +} + +/* + * If we do a create but get no trace back from the MDS, follow up with + * a lookup (the VFS expects us to link up the provided dentry). + */ +int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) +{ + struct dentry *result = ceph_lookup(dir, dentry, NULL); + + if (result && !IS_ERR(result)) { + /* + * We created the item, then did a lookup, and found + * it was already linked to another inode we already + * had in our cache (and thus got spliced). Link our + * dentry to that inode, but don't hash it, just in + * case the VFS wants to dereference it. 
+ */ + BUG_ON(!result->d_inode); + d_instantiate(dentry, result->d_inode); + return 0; + } + return PTR_ERR(result); +} + +static int ceph_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", + dir, dentry, mode, rdev); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mknod.mode = cpu_to_le32(mode); + req->r_args.mknod.rdev = cpu_to_le32(rdev); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); + if (err) + d_drop(dentry); + return err; +} + +static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + dout("create in dir %p dentry %p name '%.*s'\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name); + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + if (nd) { + BUG_ON((nd->flags & LOOKUP_OPEN) == 0); + dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); + /* hrm, what should i do here if we get aliased? */ + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + return 0; + } + + /* fall back to mknod */ + return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); +} + +static int ceph_symlink(struct inode *dir, struct dentry *dentry, + const char *dest) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_path2 = kstrdup(dest, GFP_NOFS); + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); + if (err) + d_drop(dentry); + return err; +} + +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* mkdir .snap/foo is a MKSNAP */ + op = CEPH_MDS_OP_MKSNAP; + dout("mksnap dir %p snap '%.*s' dn %p\n", dir, + dentry->d_name.len, dentry->d_name.name, dentry); + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); + op = CEPH_MDS_OP_MKDIR; + } else { + goto out; + } + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mkdir.mode = cpu_to_le32(mode); + req->r_dentry_drop = 
CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (err < 0) + d_drop(dentry); + return err; +} + +static int ceph_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("link in dir %p old_dentry %p dentry %p\n", dir, + old_dentry, dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (err) { + d_drop(dentry); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +static int drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&inode->i_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + ci->i_ceph_flags |= CEPH_I_NODELAY; + } + spin_unlock(&inode->i_lock); + return drop; +} + +/* + * rmdir and unlink differ only by the metadata op code + */ +static int ceph_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = dentry->d_inode; + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* rmdir .snap/foo is RMSNAP */ + dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, + dentry->d_name.name, dentry); + op = CEPH_MDS_OP_RMSNAP; + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("unlink/rmdir dir %p dn %p inode %p\n", + dir, dentry, inode); + op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; + } else + goto out; + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_inode_drop = drop_caps_for_unlink(inode); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + ceph_mdsc_put_request(req); +out: + return err; +} + +static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(old_dir) != ceph_snap(new_dir)) + return -EXDEV; + if (ceph_snap(old_dir) != CEPH_NOSNAP || + ceph_snap(new_dir) != CEPH_NOSNAP) + return -EROFS; + dout("rename dir %p dentry %p to dir %p dentry %p\n", + old_dir, old_dentry, new_dir, new_dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_dentry = dget(new_dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_locked_dir = new_dir; + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_RDCACHE on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + if (new_dentry->d_inode) + req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + err = ceph_mdsc_do_request(mdsc, old_dir, req); + if (!err && !req->r_reply_info.head->is_dentry) { + /* + * Normally d_move() is done by fill_trace (called by + * do_request, above). If there is no trace, we need + * to do it here. + */ + + /* d_move screws up d_subdirs order */ + ceph_i_clear(new_dir, CEPH_I_COMPLETE); + + d_move(old_dentry, new_dentry); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(new_dentry); + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} + +/* + * Check if dentry lease is valid. If not, delete the lease. Try to + * renew if the lease is more than half up.
+ */ +static int dentry_lease_is_valid(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *s; + int valid = 0; + u32 gen; + unsigned long ttl; + struct ceph_mds_session *session = NULL; + struct inode *dir = NULL; + u32 seq = 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (di && di->lease_session) { + s = di->lease_session; + spin_lock(&s->s_cap_lock); + gen = s->s_cap_gen; + ttl = s->s_cap_ttl; + spin_unlock(&s->s_cap_lock); + + if (di->lease_gen == gen && + time_before(jiffies, dentry->d_time) && + time_before(jiffies, ttl)) { + valid = 1; + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* we should renew */ + dir = dentry->d_parent->d_inode; + session = ceph_get_mds_session(s); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } + } + } + spin_unlock(&dentry->d_lock); + + if (session) { + ceph_mdsc_lease_send_msg(session, dir, dentry, + CEPH_MDS_LEASE_RENEW, seq); + ceph_put_mds_session(session); + } + dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + return valid; +} + +/* + * Check if directory-wide content lease/cap is valid. + */ +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int valid = 0; + + spin_lock(&dir->i_lock); + if (ci->i_shared_gen == di->lease_shared_gen) + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + spin_unlock(&dir->i_lock); + dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", + dir, (unsigned)ci->i_shared_gen, dentry, + (unsigned)di->lease_shared_gen, valid); + return valid; +} + +/* + * Check if cached dentry can be trusted. + */ +static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *dir; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; + + dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode, + ceph_dentry(dentry)->offset); + + /* always trust cached snapped dentries, snapdir dentry */ + if (ceph_snap(dir) != CEPH_NOSNAP) { + dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + goto out_touch; + } + if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) + goto out_touch; + + if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) + goto out_touch; + + dout("d_revalidate %p invalid\n", dentry); + d_drop(dentry); + return 0; +out_touch: + ceph_dentry_lru_touch(dentry); + return 1; +} + +/* + * Release our ceph_dentry_info. + */ +static void ceph_d_release(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + dout("d_release %p\n", dentry); + if (di) { + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; + } +} + +static int ceph_snapdir_d_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + /* + * Eventually, we'll want to revalidate snapped metadata + * too... probably... + */ + return 1; +} + + + +/* + * read() on a dir. This weird interface hack only works if mounted + * with '-o dirstat'. 
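+ * + * For example, on a client mounted with '-o dirstat', reading an open + * directory (e.g. 'cat' on a directory under the mount point) returns + * the entries/files/subdirs/rentries/rfiles/rsubdirs/rbytes/rctime + * summary that is formatted below.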
+ */ +static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct ceph_file_info *cf = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + int left; + const int bufsize = 1024; + + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + return -EISDIR; + + if (!cf->dir_info) { + cf->dir_info = kmalloc(bufsize, GFP_NOFS); + if (!cf->dir_info) + return -ENOMEM; + cf->dir_info_len = + snprintf(cf->dir_info, bufsize, + "entries: %20lld\n" + " files: %20lld\n" + " subdirs: %20lld\n" + "rentries: %20lld\n" + " rfiles: %20lld\n" + " rsubdirs: %20lld\n" + "rbytes: %20lld\n" + "rctime: %10ld.%09ld\n", + ci->i_files + ci->i_subdirs, + ci->i_files, + ci->i_subdirs, + ci->i_rfiles + ci->i_rsubdirs, + ci->i_rfiles, + ci->i_rsubdirs, + ci->i_rbytes, + (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); + } + + if (*ppos >= cf->dir_info_len) + return 0; + size = min_t(unsigned, size, cf->dir_info_len-*ppos); + left = copy_to_user(buf, cf->dir_info + *ppos, size); + if (left == size) + return -EFAULT; + *ppos += (size - left); + return size - left; +} + +/* + * an fsync() on a dir will wait for any uncommitted directory + * operations to commit. + */ +static int ceph_dir_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + dout("dir_fsync %p\n", inode); + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_entry(head->prev, + struct ceph_mds_request, r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + if (req->r_timeout) { + ret = wait_for_completion_timeout( + &req->r_safe_completion, req->r_timeout); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -EIO; /* timed out */ + } else { + wait_for_completion(&req->r_safe_completion); + } + spin_lock(&ci->i_unsafe_lock); + ceph_mdsc_put_request(req); + + if (ret || list_empty(head)) + break; + req = list_entry(head->next, + struct ceph_mds_request, r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + return ret; +} + +/* + * We maintain a private dentry LRU. + * + * FIXME: this needs to be changed to a per-mds lru to be useful. 
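+ * + * (ceph_dentry_lru_touch() is called from ceph_d_revalidate() above when + * a cached dentry is trusted, and ceph_dentry_lru_del() from + * ceph_d_release(); recently used entries therefore migrate to the tail + * of the list.)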
+ */ +void ceph_dentry_lru_add(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_add %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +void ceph_dentry_lru_touch(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, + dn->d_name.len, dn->d_name.name, di->offset); + if (di) { + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +void ceph_dentry_lru_del(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_del %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. + */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + +const struct file_operations ceph_dir_fops = { + .read = ceph_read_dir, + .readdir = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, + .unlocked_ioctl = ceph_ioctl, + .fsync = ceph_dir_fsync, +}; + +const struct inode_operations ceph_dir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .setattr = ceph_setattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, + .mknod = ceph_mknod, + .symlink = ceph_symlink, + .mkdir = ceph_mkdir, + .link = ceph_link, + .unlink = ceph_unlink, + .rmdir = ceph_unlink, + .rename = ceph_rename, + .create = ceph_create, +}; + +const struct dentry_operations ceph_dentry_ops = { + .d_revalidate = ceph_d_revalidate, + .d_release = ceph_d_release, +}; + +const struct dentry_operations ceph_snapdir_dentry_ops = { + .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_d_release, +}; + +const struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_d_release, +}; diff --git a/fs/ceph/export.c b/fs/ceph/export.c new file mode 100644 index 00000000..f67b6875 --- /dev/null +++ b/fs/ceph/export.c @@ -0,0 +1,249 @@ +#include + +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * NFS export support + * + * NFS re-export of a ceph mount is, at present, only semireliable. + * The basic issue is that the Ceph architecture doesn't lend itself + * well to generating filehandles that will remain valid forever. + * + * So, we do our best. If you're lucky, your inode will be in the + * client's cache. If it's not, and you have a connectable fh, then + * the MDS server may be able to find it for you.
Otherwise, you get + * ESTALE. + * + * There are ways to make this more reliable, but in the non-connectable fh + * case, we won't ever work perfectly, and in the connectable case, + * some changes are needed on the MDS side to work better. + */ + +/* + * Basic fh + */ +struct ceph_nfs_fh { + u64 ino; +} __attribute__ ((packed)); + +/* + * Larger 'connectable' fh that includes parent ino and name hash. + * Use this whenever possible, as it works more reliably. + */ +struct ceph_nfs_confh { + u64 ino, parent_ino; + u32 parent_name_hash; +} __attribute__ ((packed)); + +static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, + int connectable) +{ + int type; + struct ceph_nfs_fh *fh = (void *)rawfh; + struct ceph_nfs_confh *cfh = (void *)rawfh; + struct dentry *parent = dentry->d_parent; + struct inode *inode = dentry->d_inode; + int connected_handle_length = sizeof(*cfh)/4; + int handle_length = sizeof(*fh)/4; + + /* don't re-export snaps */ + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EINVAL; + + if (*max_len >= connected_handle_length) { + dout("encode_fh %p connectable\n", dentry); + cfh->ino = ceph_ino(dentry->d_inode); + cfh->parent_ino = ceph_ino(parent->d_inode); + cfh->parent_name_hash = ceph_dentry_hash(parent); + *max_len = connected_handle_length; + type = 2; + } else if (*max_len >= handle_length) { + if (connectable) { + *max_len = connected_handle_length; + return 255; + } + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = handle_length; + type = 1; + } else { + *max_len = handle_length; + return 255; + } + return type; +} + +/* + * convert regular fh to dentry + * + * FIXME: we should try harder by querying the mds for the ino. + */ +static struct dentry *__fh_to_dentry(struct super_block *sb, + struct ceph_nfs_fh *fh) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + dout("__fh_to_dentry %llx\n", fh->ino); + vino.ino = fh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ESTALE); + } + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", + fh->ino, inode); + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + return dentry; +} + +/* + * convert connectable fh to dentry + */ +static struct dentry *__cfh_to_dentry(struct super_block *sb, + struct ceph_nfs_confh *cfh) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + dout("__cfh_to_dentry %llx (%llx/%x)\n", + cfh->ino, cfh->parent_ino, cfh->parent_name_hash); + + vino.ino = cfh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_ino2.ino =
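(A connectable fh carries the parent ino and the parent's name hash -- see struct ceph_nfs_confh and __cfh_to_dentry() below -- which gives the MDS something to search with when the inode is not in the client's cache.)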
cfh->parent_ino; + req->r_ino2.snap = CEPH_NOSNAP; + req->r_path2 = kmalloc(16, GFP_NOFS); + snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(err ? err : -ESTALE); + } + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", + cfh->ino, inode); + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + return dentry; +} + +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type == 1) + return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); + else + return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); +} + +/* + * get parent, if possible. + * + * FIXME: we could do better by querying the mds to discover the + * parent. + */ +static struct dentry *ceph_fh_to_parent(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_confh *cfh = (void *)fid->raw; + struct ceph_vino vino; + struct inode *inode; + struct dentry *dentry; + int err; + + if (fh_type == 1) + return ERR_PTR(-ESTALE); + + pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, + cfh->parent_name_hash); + + vino.ino = cfh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) + return ERR_PTR(-ESTALE); + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", + cfh->ino, inode); + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); + return dentry; +} + +const struct export_operations ceph_export_ops = { + .encode_fh = ceph_encode_fh, + .fh_to_dentry = ceph_fh_to_dentry, + .fh_to_parent = ceph_fh_to_parent, +}; diff --git a/fs/ceph/file.c b/fs/ceph/file.c new file mode 100644 index 00000000..4698a5c5 --- /dev/null +++ b/fs/ceph/file.c @@ -0,0 +1,828 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * Ceph file operations + * + * Implement basic open/close functionality, and implement + * read/write. + * + * We implement three modes of file I/O: + * - buffered uses the generic_file_aio_{read,write} helpers + * + * - synchronous is used when there is multi-client read/write + * sharing, avoids the page cache, and synchronously waits for an + * ack from the OSD. + * + * - direct io takes the variant of the sync path that references + * user pages directly. + * + * fsync() flushes and waits on dirty pages, but just queues metadata + * for writeback: since the MDS can recover size and mtime there is no + * need to wait for MDS acknowledgement. + */ + + +/* + * Prepare an open request. Preallocate ceph_cap to avoid an + * inopportune ENOMEM later. + */ +static struct ceph_mds_request * +prepare_open_request(struct super_block *sb, int flags, int create_mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int want_auth = USE_ANY_MDS; + int op = (flags & O_CREAT) ? 
CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; + + if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) + want_auth = USE_AUTH_MDS; + + req = ceph_mdsc_create_request(mdsc, op, want_auth); + if (IS_ERR(req)) + goto out; + req->r_fmode = ceph_flags_to_mode(flags); + req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.mode = cpu_to_le32(create_mode); + req->r_args.open.preferred = cpu_to_le32(-1); +out: + return req; +} + +/* + * initialize private struct file data. + * if we fail, clean up by dropping fmode reference on the ceph_inode + */ +static int ceph_init_file(struct inode *inode, struct file *file, int fmode) +{ + struct ceph_file_info *cf; + int ret = 0; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + dout("init_file %p %p 0%o (regular)\n", inode, file, + inode->i_mode); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + if (cf == NULL) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + cf->fmode = fmode; + cf->next_offset = 2; + file->private_data = cf; + BUG_ON(inode->i_fop->release != ceph_release); + break; + + case S_IFLNK: + dout("init_file %p %p 0%o (symlink)\n", inode, file, + inode->i_mode); + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + break; + + default: + dout("init_file %p %p 0%o (special)\n", inode, file, + inode->i_mode); + /* + * we need to drop the open ref now, since we don't + * have .release set to ceph_release. + */ + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + BUG_ON(inode->i_fop->release == ceph_release); + + /* call the proper open fop */ + ret = inode->i_fop->open(inode, file); + } + return ret; +} + +/* + * If the filp already has private_data, that means the file was + * already opened by intent during lookup, and we do nothing. + * + * If we already have the requisite capabilities, we can satisfy + * the open request locally (no need to request new caps from the + * MDS). We do, however, need to inform the MDS (asynchronously) + * if our wanted caps set expands. + */ +int ceph_open(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct ceph_file_info *cf = file->private_data; + struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + int err; + int flags, fmode, wanted; + + if (cf) { + dout("open file %p is already opened\n", file); + return 0; + } + + /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ + flags = file->f_flags & ~(O_CREAT|O_EXCL); + if (S_ISDIR(inode->i_mode)) + flags = O_DIRECTORY; /* mds likes to know */ + + dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); + fmode = ceph_flags_to_mode(flags); + wanted = ceph_caps_for_mode(fmode); + + /* snapped files are read-only */ + if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) + return -EROFS; + + /* trivially open snapdir */ + if (ceph_snap(inode) == CEPH_SNAPDIR) { + spin_lock(&inode->i_lock); + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + return ceph_init_file(inode, file, fmode); + } + + /* + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set + * asynchronously. 
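+ * + * For example, a second open for read on an inode where we already hold + * FILE_CACHE/FILE_RD caps takes this path with no MDS request at all; + * ceph_check_caps() below is invoked only when the caps we want are + * neither fully issued nor already being asked of the MDS.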
+ */ + spin_lock(&inode->i_lock); + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { + int mds_wanted = __ceph_caps_mds_wanted(ci); + int issued = __ceph_caps_issued(ci, NULL); + + dout("open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + + /* adjust wanted? */ + if ((issued & wanted) != wanted && + (mds_wanted & wanted) != wanted && + ceph_snap(inode) != CEPH_SNAPDIR) + ceph_check_caps(ci, 0, NULL); + + return ceph_init_file(inode, file, fmode); + } else if (ceph_snap(inode) != CEPH_NOSNAP && + (ci->i_snap_caps & wanted) == wanted) { + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + return ceph_init_file(inode, file, fmode); + } + spin_unlock(&inode->i_lock); + + dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + if (!err) + err = ceph_init_file(inode, file, req->r_fmode); + ceph_mdsc_put_request(req); + dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); +out: + return err; +} + + +/* + * Do a lookup + open with a single request. + * + * If this succeeds, but some subsequent check in the vfs + * may_open() fails, the struct *file gets cleaned up (i.e. + * ceph_release gets called). So fear not! + */ +/* + * flags + * path_lookup_open -> LOOKUP_OPEN + * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE + */ +struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct nameidata *nd, int mode, + int locked_dir) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct file *file = nd->intent.open.file; + struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct ceph_mds_request *req; + int err; + int flags = nd->intent.open.flags - 1; /* silly vfs! 
*/ + + dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", + dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); + + /* do the open */ + req = prepare_open_request(dir->i_sb, flags, mode); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + if (flags & O_CREAT) { + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + } + req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + dentry = ceph_finish_lookup(req, dentry, err); + if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + if (!err) + err = ceph_init_file(req->r_dentry->d_inode, file, + req->r_fmode); + ceph_mdsc_put_request(req); + dout("ceph_lookup_open result=%p\n", dentry); + return dentry; +} + +int ceph_release(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *cf = file->private_data; + + dout("release inode %p file %p\n", inode, file); + ceph_put_fmode(ci, cf->fmode); + if (cf->last_readdir) + ceph_mdsc_put_request(cf->last_readdir); + kfree(cf->last_name); + kfree(cf->dir_info); + dput(cf->dentry); + kmem_cache_free(ceph_file_cachep, cf); + + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return 0; +} + +/* + * Read a range of bytes striped over one or more objects. Iterate over + * objects we stripe over. (That's not atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. + */ +static int striped_read(struct inode *inode, + u64 off, u64 len, + struct page **pages, int num_pages, + int *checkeof, bool o_direct, + unsigned long buf_align) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 pos, this_len; + int io_align, page_align; + int left, pages_left; + int read; + struct page **page_pos; + int ret; + bool hit_stripe, was_short; + + /* + * we may need to do multiple reads. not atomic, unfortunately. + */ + pos = off; + left = len; + page_pos = pages; + pages_left = num_pages; + read = 0; + io_align = off & ~PAGE_MASK; + +more: + if (o_direct) + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; + this_len = left; + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, pos, &this_len, + ci->i_truncate_seq, + ci->i_truncate_size, + page_pos, pages_left, page_align); + if (ret == -ENOENT) + ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; + dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + + if (ret > 0) { + int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; + + if (read < pos - off) { + dout(" zero gap %llu to %llu\n", off + read, pos); + ceph_zero_page_vector_range(page_align + read, + pos - off - read, pages); + } + pos += ret; + read = pos - off; + left -= ret; + page_pos += didpages; + pages_left -= didpages; + + /* hit stripe? */ + if (left && hit_stripe) + goto more; + } + + if (was_short) { + /* did we bounce off eof? 
*/ + if (pos + left > inode->i_size) + *checkeof = 1; + + /* zero trailing bytes (inside i_size) */ + if (left > 0 && pos < inode->i_size) { + if (pos + left > inode->i_size) + left = inode->i_size - pos; + + dout("zero tail %d\n", left); + ceph_zero_page_vector_range(page_align + read, left, + pages); + read += left; + } + } + + if (ret >= 0) + ret = read; + dout("striped_read returns %d\n", ret); + return ret; +} + +/* + * Completely synchronous read and write methods. Direct from __user + * buffer to osd, or directly to user pages (if O_DIRECT). + * + * If the read spans object boundary, just do multiple reads. + */ +static ssize_t ceph_sync_read(struct file *file, char __user *data, + unsigned len, loff_t *poff, int *checkeof) +{ + struct inode *inode = file->f_dentry->d_inode; + struct page **pages; + u64 off = *poff; + int num_pages, ret; + + dout("sync_read on file %p %llu~%u %s\n", file, off, len, + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, num_pages, true); + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + } + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + ret = filemap_write_and_wait(inode->i_mapping); + if (ret < 0) + goto done; + + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT, + (unsigned long)data & ~PAGE_MASK); + + if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) + ret = ceph_copy_page_vector_to_user(pages, data, off, ret); + if (ret >= 0) + *poff = off + ret; + +done: + if (file->f_flags & O_DIRECT) + ceph_put_page_vector(pages, num_pages, true); + else + ceph_release_page_vector(pages, num_pages); + dout("sync_read result %d\n", ret); + return ret; +} + +/* + * Write commit callback, called if we requested both an ACK and + * ONDISK commit reply from the OSD. + */ +static void sync_write_commit(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + + dout("sync_write_commit %p tid %llu\n", req, req->r_tid); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); +} + +/* + * Synchronous write, straight from __user pointer or user pages (if + * O_DIRECT). + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct file *file, const char __user *data, + size_t left, loff_t *offset) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page **pages; + int num_pages; + long long unsigned pos; + u64 len; + int written = 0; + int flags; + int do_sync = 0; + int check_caps = 0; + int page_align, io_align; + unsigned long buf_align; + int ret; + struct timespec mtime = CURRENT_TIME; + + if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u %s\n", file, *offset, + (unsigned)left, (file->f_flags & O_DIRECT) ? 
"O_DIRECT" : ""); + + if (file->f_flags & O_APPEND) + pos = i_size_read(inode); + else + pos = *offset; + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + left) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) + flags |= CEPH_OSD_FLAG_ACK; + else + do_sync = 1; + + /* + * we may need to do multiple writes here if we span an object + * boundary. this isn't atomic, unfortunately. :( + */ +more: + io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; + len = left; + if (file->f_flags & O_DIRECT) { + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + num_pages = calc_pages_for((unsigned long)data, len); + } else { + page_align = pos & ~PAGE_MASK; + num_pages = calc_pages_for(pos, len); + } + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), pos, &len, + CEPH_OSD_OP_WRITE, flags, + ci->i_snap_realm->cached_context, + do_sync, + ci->i_truncate_seq, ci->i_truncate_size, + &mtime, false, 2, page_align); + if (!req) + return -ENOMEM; + + if (file->f_flags & O_DIRECT) { + pages = ceph_get_direct_page_vector(data, num_pages, false); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE-1)); + } else { + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + goto out; + } + + if ((file->f_flags & O_SYNC) == 0) { + /* get a second commit callback */ + req->r_safe_callback = sync_write_commit; + req->r_own_pages = 1; + } + } + req->r_pages = pages; + req->r_num_pages = num_pages; + req->r_inode = inode; + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + if (req->r_safe_callback) { + /* + * Add to inode unsafe list only after we + * start_request so that a tid has been assigned. + */ + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + } + + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret < 0 && req->r_safe_callback) { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } + } + + if (file->f_flags & O_DIRECT) + ceph_put_page_vector(pages, num_pages, false); + else if (file->f_flags & O_SYNC) + ceph_release_page_vector(pages, num_pages); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + left -= len; + data += written; + if (left) + goto more; + + ret = written; + *offset = pos; + if (pos > i_size_read(inode)) + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, + NULL); + } + return ret; +} + +/* + * Wrap generic_file_aio_read with checks for cap bits on the inode. 
+ * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. + * + * Hmm, the sync read case isn't actually async... should it be? + */ +static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + struct ceph_file_info *fi = filp->private_data; + loff_t *ppos = &iocb->ki_pos; + size_t len = iov->iov_len; + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + void __user *base = iov->iov_base; + ssize_t ret; + int want, got = 0; + int checkeof = 0, read = 0; + + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), pos, (unsigned)len, inode); +again: + __ceph_do_pending_vmtruncate(inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret < 0) + goto out; + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + /* hmm, this isn't really async... */ + ret = ceph_sync_read(filp, base, len, ppos, &checkeof); + else + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + +out: + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + ceph_put_cap_refs(ci, got); + + if (checkeof && ret >= 0) { + int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + + /* hit EOF or hole? */ + if (statret == 0 && *ppos < inode->i_size) { + dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + read += ret; + base += ret; + len -= ret; + checkeof = 0; + goto again; + } + } + if (ret >= 0) + ret += read; + + return ret; +} + +/* + * Take cap references to avoid releasing caps to MDS mid-write. + * + * If we are synchronous, and write with an old snap context, the OSD + * may return EOLDSNAPC. In that case, retry the write.. _after_ + * dropping our cap refs and allowing the pending snap to logically + * complete _before_ this write occurs. + * + * If we are near ENOSPC, write synchronously. + */ +static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + loff_t endoff = pos + iov->iov_len; + int want, got = 0; + int ret, err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + return -ENOSPC; + __ceph_do_pending_vmtruncate(inode); + dout("aio_write %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + if (ret < 0) + goto out; + + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, + &iocb->ki_pos); + } else { + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if ((ret >= 0 || ret == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) + || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); + if (err < 0) + ret = err; + } + } + if (ret >= 0) { + int dirty; + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + +out: + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + + if (ret == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); + goto retry_snap; + } + + return ret; +} + +/* + * llseek. be sure to verify file size on SEEK_END. + */ +static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + switch (origin) { + case SEEK_END: + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + if (ret < 0) { + offset = ret; + goto out; + } + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > inode->i_sb->s_maxbytes) { + offset = -EINVAL; + goto out; + } + + /* Special lock needed here? 
*/ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +const struct file_operations ceph_file_fops = { + .open = ceph_open, + .release = ceph_release, + .llseek = ceph_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = ceph_aio_read, + .aio_write = ceph_aio_write, + .mmap = ceph_mmap, + .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = ceph_ioctl, +}; + diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c new file mode 100644 index 00000000..d8858e96 --- /dev/null +++ b/fs/ceph/inode.c @@ -0,0 +1,1842 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +/* + * Ceph inode operations + * + * Implement basic inode helpers (get, alloc) and inode ops (getattr, + * setattr, etc.), xattr helpers, and helpers for assimilating + * metadata returned by the MDS into our cache. + * + * Also define helpers for doing asynchronous writeback, invalidation, + * and truncation for the benefit of those who can't afford to block + * (typically because they are in the message handler path). + */ + +static const struct inode_operations ceph_symlink_iops; + +static void ceph_invalidate_work(struct work_struct *work); +static void ceph_writeback_work(struct work_struct *work); +static void ceph_vmtruncate_work(struct work_struct *work); + +/* + * find or create an inode, given the ceph ino number + */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +{ + struct inode *inode; + ino_t t = ceph_vino_to_ino(vino); + + inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + dout("get_inode created new inode %p %llx.%llx ino %llx\n", + inode, ceph_vinop(inode), (u64)inode->i_ino); + unlock_new_inode(inode); + } + + dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, + vino.snap, inode); + return inode; +} + +/* + * get/constuct snapdir inode for a given directory + */ +struct inode *ceph_get_snapdir(struct inode *parent) +{ + struct ceph_vino vino = { + .ino = ceph_ino(parent), + .snap = CEPH_SNAPDIR, + }; + struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct ceph_inode_info *ci = ceph_inode(inode); + + BUG_ON(!S_ISDIR(parent->i_mode)); + if (IS_ERR(inode)) + return inode; + inode->i_mode = parent->i_mode; + inode->i_uid = parent->i_uid; + inode->i_gid = parent->i_gid; + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + ci->i_rbytes = 0; + return inode; +} + +const struct inode_operations ceph_file_iops = { + .permission = ceph_permission, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, +}; + + +/* + * We use a 'frag tree' to keep track of the MDS's directory fragments + * for a given inode (usually there is just a single fragment). 
We + * need to know when a child frag is delegated to a new MDS, or when + * it is flagged as replicated, so we can direct our requests + * accordingly. + */ + +/* + * find/create a frag in the tree + */ +static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, + u32 f) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_frag *frag; + int c; + + p = &ci->i_fragtree.rb_node; + while (*p) { + parent = *p; + frag = rb_entry(parent, struct ceph_inode_frag, node); + c = ceph_frag_compare(f, frag->frag); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return frag; + } + + frag = kmalloc(sizeof(*frag), GFP_NOFS); + if (!frag) { + pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " + "frag %x\n", &ci->vfs_inode, + ceph_vinop(&ci->vfs_inode), f); + return ERR_PTR(-ENOMEM); + } + frag->frag = f; + frag->split_by = 0; + frag->mds = -1; + frag->ndist = 0; + + rb_link_node(&frag->node, parent, p); + rb_insert_color(&frag->node, &ci->i_fragtree); + + dout("get_or_create_frag added %llx.%llx frag %x\n", + ceph_vinop(&ci->vfs_inode), f); + return frag; +} + +/* + * find a specific frag @f + */ +struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) +{ + struct rb_node *n = ci->i_fragtree.rb_node; + + while (n) { + struct ceph_inode_frag *frag = + rb_entry(n, struct ceph_inode_frag, node); + int c = ceph_frag_compare(f, frag->frag); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return frag; + } + return NULL; +} + +/* + * Choose frag containing the given value @v. If @pfrag is + * specified, copy the frag delegation info to the caller if + * it is present. + */ +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found) +{ + u32 t = ceph_frag_make(0, 0); + struct ceph_inode_frag *frag; + unsigned nway, i; + u32 n; + + if (found) + *found = 0; + + mutex_lock(&ci->i_fragtree_mutex); + while (1) { + WARN_ON(!ceph_frag_contains_value(t, v)); + frag = __ceph_find_frag(ci, t); + if (!frag) + break; /* t is a leaf */ + if (frag->split_by == 0) { + if (pfrag) + memcpy(pfrag, frag, sizeof(*pfrag)); + if (found) + *found = 1; + break; + } + + /* choose child */ + nway = 1 << frag->split_by; + dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); + for (i = 0; i < nway; i++) { + n = ceph_frag_make_child(t, frag->split_by, i); + if (ceph_frag_contains_value(n, v)) { + t = n; + break; + } + } + BUG_ON(i == nway); + } + dout("choose_frag(%x) = %x\n", v, t); + + mutex_unlock(&ci->i_fragtree_mutex); + return t; +} + +/* + * Process dirfrag (delegation) info from the mds. Include leaf + * fragment in tree ONLY if ndist > 0. Otherwise, only + * branches/splits are included in i_fragtree) + */ +static int ceph_fill_dirfrag(struct inode *inode, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + u32 id = le32_to_cpu(dirinfo->frag); + int mds = le32_to_cpu(dirinfo->auth); + int ndist = le32_to_cpu(dirinfo->ndist); + int i; + int err = 0; + + mutex_lock(&ci->i_fragtree_mutex); + if (ndist == 0) { + /* no delegation info needed. 
*/ + frag = __ceph_find_frag(ci, id); + if (!frag) + goto out; + if (frag->split_by == 0) { + /* tree leaf, remove */ + dout("fill_dirfrag removed %llx.%llx frag %x" + " (no ref)\n", ceph_vinop(inode), id); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } else { + /* tree branch, keep and clear */ + dout("fill_dirfrag cleared %llx.%llx frag %x" + " referral\n", ceph_vinop(inode), id); + frag->mds = -1; + frag->ndist = 0; + } + goto out; + } + + + /* find/add this frag to store mds delegation info */ + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) { + /* this is not the end of the world; we can continue + with bad/inaccurate delegation info */ + pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", + ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + err = -ENOMEM; + goto out; + } + + frag->mds = mds; + frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); + for (i = 0; i < frag->ndist; i++) + frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); + dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", + ceph_vinop(inode), frag->frag, frag->ndist); + +out: + mutex_unlock(&ci->i_fragtree_mutex); + return err; +} + + +/* + * initialize a newly allocated inode. + */ +struct inode *ceph_alloc_inode(struct super_block *sb) +{ + struct ceph_inode_info *ci; + int i; + + ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + if (!ci) + return NULL; + + dout("alloc_inode %p\n", &ci->vfs_inode); + + ci->i_version = 0; + ci->i_time_warp_seq = 0; + ci->i_ceph_flags = 0; + ci->i_release_count = 0; + ci->i_symlink = NULL; + + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + + ci->i_fragtree = RB_ROOT; + mutex_init(&ci->i_fragtree_mutex); + + ci->i_xattrs.blob = NULL; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.index = RB_ROOT; + ci->i_xattrs.count = 0; + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.version = 0; + ci->i_xattrs.index_version = 0; + + ci->i_caps = RB_ROOT; + ci->i_auth_cap = NULL; + ci->i_dirty_caps = 0; + ci->i_flushing_caps = 0; + INIT_LIST_HEAD(&ci->i_dirty_item); + INIT_LIST_HEAD(&ci->i_flushing_item); + ci->i_cap_flush_seq = 0; + ci->i_cap_flush_last_tid = 0; + memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + init_waitqueue_head(&ci->i_cap_wq); + ci->i_hold_caps_min = 0; + ci->i_hold_caps_max = 0; + INIT_LIST_HEAD(&ci->i_cap_delay_list); + ci->i_cap_exporting_mds = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_issued = 0; + INIT_LIST_HEAD(&ci->i_cap_snaps); + ci->i_head_snapc = NULL; + ci->i_snap_caps = 0; + + for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + ci->i_nr_by_mode[i] = 0; + + ci->i_truncate_seq = 0; + ci->i_truncate_size = 0; + ci->i_truncate_pending = 0; + + ci->i_max_size = 0; + ci->i_reported_size = 0; + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + ci->i_pin_ref = 0; + ci->i_rd_ref = 0; + ci->i_rdcache_ref = 0; + ci->i_wr_ref = 0; + ci->i_wb_ref = 0; + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + ci->i_shared_gen = 0; + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + + INIT_LIST_HEAD(&ci->i_unsafe_writes); + INIT_LIST_HEAD(&ci->i_unsafe_dirops); + spin_lock_init(&ci->i_unsafe_lock); + + ci->i_snap_realm = NULL; + INIT_LIST_HEAD(&ci->i_snap_realm_item); + INIT_LIST_HEAD(&ci->i_snap_flush_item); + + INIT_WORK(&ci->i_wb_work, ceph_writeback_work); + INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work); + + INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + + return &ci->vfs_inode; +} + +static void ceph_i_callback(struct 
rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ceph_inode_cachep, ci); +} + +void ceph_destroy_inode(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *n; + + dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + ceph_queue_caps_release(inode); + + /* + * we may still have a snap_realm reference if there are stray + * caps in i_cap_exporting_issued or i_snap_caps. + */ + if (ci->i_snap_realm) { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_snap_realm *realm = ci->i_snap_realm; + + dout(" dropping residual ref to snap realm %p\n", realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + + kfree(ci->i_symlink); + while ((n = rb_first(&ci->i_fragtree)) != NULL) { + frag = rb_entry(n, struct ceph_inode_frag, node); + rb_erase(n, &ci->i_fragtree); + kfree(frag); + } + + __ceph_destroy_xattrs(ci); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + + call_rcu(&inode->i_rcu, ceph_i_callback); +} + + +/* + * Helpers to fill in size, ctime, mtime, and atime. We have to be + * careful because either the client or MDS may have more up to date + * info, depending on which capabilities are held, and whether + * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime + * and size are monotonically increasing, except when utimes() or + * truncate() increments the corresponding _seq values.) 
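+ * + * For example, if the MDS reports truncate_seq 5 while we hold 4, its + * size wins even if it is smaller than ours (and a vmtruncate may be + * queued); if the seqs are equal, i_size is only ever moved forward.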
+ */ +int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int queue_trunc = 0; + + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || + (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { + dout("size %lld -> %llu\n", inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1<<9) - 1) >> 9; + ci->i_reported_size = size; + if (truncate_seq != ci->i_truncate_seq) { + dout("truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); + ci->i_truncate_seq = truncate_seq; + /* + * If we hold relevant caps, or in the case where we're + * not the only client referencing this file and we + * don't hold those caps, then we need to check whether + * the file is either opened or mmaped + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| + CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| + CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_LAZYIO)) || + mapping_mapped(inode->i_mapping) || + __ceph_caps_file_wanted(ci)) { + ci->i_truncate_pending++; + queue_trunc = 1; + } + } + } + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && + ci->i_truncate_size != truncate_size) { + dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, + truncate_size); + ci->i_truncate_size = truncate_size; + } + return queue_trunc; +} + +void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int warn = 0; + + if (issued & (CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_WR| + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { + if (timespec_compare(ctime, &inode->i_ctime) > 0) { + dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = *ctime; + } + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + /* the MDS did a utimes() */ + dout("mtime %ld.%09ld -> %ld.%09ld " + "tw %d -> %d\n", + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else if (time_warp_seq == ci->i_time_warp_seq) { + /* nobody did utimes(); take the max */ + if (timespec_compare(mtime, &inode->i_mtime) > 0) { + dout("mtime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_mtime.tv_sec, + inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = *mtime; + } + if (timespec_compare(atime, &inode->i_atime) > 0) { + dout("atime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_atime.tv_sec, + inode->i_atime.tv_nsec, + atime->tv_sec, atime->tv_nsec); + inode->i_atime = *atime; + } + } else if (issued & CEPH_CAP_FILE_EXCL) { + /* we did a utimes(); ignore mds values */ + } else { + warn = 1; + } + } else { + /* we have no write|excl caps; whatever the MDS says is true */ + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else { + warn = 1; + } + } + if (warn) /* time_warp_seq shouldn't go backwards */ + dout("%p mds time_warp_seq %llu < %u\n", + inode, time_warp_seq, ci->i_time_warp_seq); +} + +/* + * Populate an inode based on info from mds. May be called on new or + * existing inodes. 
+ */ +static int fill_inode(struct inode *inode, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, + unsigned long ttl_from, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_reply_inode *info = iinfo->in; + struct ceph_inode_info *ci = ceph_inode(inode); + int i; + int issued, implemented; + struct timespec mtime, atime, ctime; + u32 nsplits; + struct ceph_buffer *xattr_blob = NULL; + int err = 0; + int queue_trunc = 0; + + dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", + inode, ceph_vinop(inode), le64_to_cpu(info->version), + ci->i_version); + + /* + * prealloc xattr data, if it looks like we'll need it. only + * if len > 4 (meaning there are actually xattrs; the first 4 + * bytes are the xattr count). + */ + if (iinfo->xattr_len > 4) { + xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); + if (!xattr_blob) + pr_err("fill_inode ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); + } + + spin_lock(&inode->i_lock); + + /* + * provided version will be odd if inode value is projected, + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update + */ + if (le64_to_cpu(info->version) > 0 && + (ci->i_version & ~1) >= le64_to_cpu(info->version)) + goto no_change; + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + + /* update inode */ + ci->i_version = le64_to_cpu(info->version); + inode->i_version++; + inode->i_rdev = le32_to_cpu(info->rdev); + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(info->mode); + inode->i_uid = le32_to_cpu(info->uid); + inode->i_gid = le32_to_cpu(info->gid); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + inode->i_uid, inode->i_gid); + } + + if ((issued & CEPH_CAP_LINK_EXCL) == 0) + inode->i_nlink = le32_to_cpu(info->nlink); + + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + + ci->i_layout = info->layout; + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + + /* xattrs */ + /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. 
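The projected-versus-stable version check near the top of fill_inode() above can be read as a standalone predicate; a small sketch of it, using the same 2/2-skip, 3/2-skip, 3/3-update table as the comment (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Even versions are stable, odd versions are projected.  Clearing the
 * low bit lets equally-new projected info from the MDS still win over
 * a projected local version. */
static int should_skip_update(uint64_t ours, uint64_t theirs)
{
        return theirs > 0 && (ours & ~1ULL) >= theirs;
}

int main(void)
{
        printf("%d\n", should_skip_update(2, 2)); /* 1: skip   */
        printf("%d\n", should_skip_update(3, 2)); /* 1: skip   */
        printf("%d\n", should_skip_update(3, 3)); /* 0: update */
        return 0;
}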
*/ + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && + le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = xattr_blob; + if (xattr_blob) + memcpy(ci->i_xattrs.blob->vec.iov_base, + iinfo->xattr_data, iinfo->xattr_len); + ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + xattr_blob = NULL; + } + + inode->i_mapping->a_ops = &ceph_aops; + inode->i_mapping->backing_dev_info = + &ceph_sb_to_client(inode->i_sb)->backing_dev_info; + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &ceph_file_iops; + break; + case S_IFREG: + inode->i_op = &ceph_file_iops; + inode->i_fop = &ceph_file_fops; + break; + case S_IFLNK: + inode->i_op = &ceph_symlink_iops; + if (!ci->i_symlink) { + int symlen = iinfo->symlink_len; + char *sym; + + BUG_ON(symlen != inode->i_size); + spin_unlock(&inode->i_lock); + + err = -ENOMEM; + sym = kmalloc(symlen+1, GFP_NOFS); + if (!sym) + goto out; + memcpy(sym, iinfo->symlink, symlen); + sym[symlen] = 0; + + spin_lock(&inode->i_lock); + if (!ci->i_symlink) + ci->i_symlink = sym; + else + kfree(sym); /* lost a race */ + } + break; + case S_IFDIR: + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + + ci->i_dir_layout = iinfo->dir_layout; + + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ceph_decode_timespec(&ci->i_rctime, &info->rctime); + + /* set dir completion flag? */ + if (ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + dout(" marking %p complete (empty)\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = 2; + } + break; + default: + pr_err("fill_inode %llx.%llx BAD mode 0%o\n", + ceph_vinop(inode), inode->i_mode); + } + +no_change: + spin_unlock(&inode->i_lock); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + /* FIXME: move me up, if/when version reflects fragtree changes */ + nsplits = le32_to_cpu(info->fragtree.nsplits); + mutex_lock(&ci->i_fragtree_mutex); + for (i = 0; i < nsplits; i++) { + u32 id = le32_to_cpu(info->fragtree.splits[i].frag); + struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); + + if (IS_ERR(frag)) + continue; + frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + mutex_unlock(&ci->i_fragtree_mutex); + + /* were we issued a capability? 
*/ + if (info->cap.caps) { + if (ceph_snap(inode) == CEPH_NOSNAP) { + ceph_add_cap(inode, session, + le64_to_cpu(info->cap.cap_id), + cap_fmode, + le32_to_cpu(info->cap.caps), + le32_to_cpu(info->cap.wanted), + le32_to_cpu(info->cap.seq), + le32_to_cpu(info->cap.mseq), + le64_to_cpu(info->cap.realm), + info->cap.flags, + caps_reservation); + } else { + spin_lock(&inode->i_lock); + dout(" %p got snap_caps %s\n", inode, + ceph_cap_string(le32_to_cpu(info->cap.caps))); + ci->i_snap_caps |= le32_to_cpu(info->cap.caps); + if (cap_fmode >= 0) + __ceph_get_fmode(ci, cap_fmode); + spin_unlock(&inode->i_lock); + } + } else if (cap_fmode >= 0) { + pr_warning("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_get_fmode(ci, cap_fmode); + } + + /* update delegation info? */ + if (dirinfo) + ceph_fill_dirfrag(inode, dirinfo); + + err = 0; + +out: + if (xattr_blob) + ceph_buffer_put(xattr_blob); + return err; +} + +/* + * caller should hold session s_mutex. + */ +static void update_dentry_lease(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + long unsigned duration = le32_to_cpu(lease->duration_ms); + long unsigned ttl = from_time + (duration * HZ) / 1000; + long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; + struct inode *dir; + + /* only track leases on regular dentries */ + if (dentry->d_op != &ceph_dentry_ops) + return; + + spin_lock(&dentry->d_lock); + dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", + dentry, le16_to_cpu(lease->mask), duration, ttl); + + /* make lease_rdcache_gen match directory */ + dir = dentry->d_parent->d_inode; + di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; + + if (lease->mask == 0) + goto out_unlock; + + if (di->lease_gen == session->s_cap_gen && + time_before(ttl, dentry->d_time)) + goto out_unlock; /* we already have a newer lease. */ + + if (di->lease_session && di->lease_session != session) + goto out_unlock; + + ceph_dentry_lru_touch(dentry); + + if (!di->lease_session) + di->lease_session = ceph_get_mds_session(session); + di->lease_gen = session->s_cap_gen; + di->lease_seq = le32_to_cpu(lease->seq); + di->lease_renew_after = half_ttl; + di->lease_renew_from = 0; + dentry->d_time = ttl; +out_unlock: + spin_unlock(&dentry->d_lock); + return; +} + +/* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. + */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + spin_unlock(&inode->i_lock); + return; + } + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dir->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&dn->d_u.d_child, &dir->d_subdirs); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dir->d_lock); +} + +/* + * splice a dentry to an inode. + * caller must hold directory i_mutex for this to be safe. + * + * we will only rehash the resulting dentry if @prehash is + * true; @prehash will be set to false (for the benefit of + * the caller) if we fail. 
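update_dentry_lease() above turns the MDS-supplied duration_ms into jiffies-based ttl and renew-after times; the arithmetic on its own, as a sketch (HZ = 250 is just an assumed tick rate for the demo):

#include <stdio.h>

#define HZ 250                          /* assumed tick rate for the demo */

static unsigned long lease_ttl(unsigned long from_time, unsigned long duration_ms)
{
        return from_time + (duration_ms * HZ) / 1000;
}

static unsigned long lease_renew_after(unsigned long from_time,
                                       unsigned long duration_ms)
{
        /* start renewing once half of the lease has elapsed */
        return from_time + (duration_ms * HZ / 2) / 1000;
}

int main(void)
{
        /* a 30 s lease granted at jiffies = 1000 */
        printf("ttl=%lu renew_after=%lu\n",
               lease_ttl(1000, 30000), lease_renew_after(1000, 30000));
        return 0;
}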
+ */ +static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, + bool *prehash, bool set_offset) +{ + struct dentry *realdn; + + BUG_ON(dn->d_inode); + + /* dn must be unhashed */ + if (!d_unhashed(dn)) + d_drop(dn); + realdn = d_materialise_unique(dn, in); + if (IS_ERR(realdn)) { + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); + if (prehash) + *prehash = false; /* don't rehash on error */ + dn = realdn; /* note realdn contains the error */ + goto out; + } else if (realdn) { + dout("dn %p (%d) spliced with %p (%d) " + "inode %p ino %llx.%llx\n", + dn, dn->d_count, + realdn, realdn->d_count, + realdn->d_inode, ceph_vinop(realdn->d_inode)); + dput(dn); + dn = realdn; + } else { + BUG_ON(!ceph_dentry(dn)); + dout("dn %p attached to %p ino %llx.%llx\n", + dn, dn->d_inode, ceph_vinop(dn->d_inode)); + } + if ((!prehash || *prehash) && d_unhashed(dn)) + d_rehash(dn); + if (set_offset) + ceph_set_dentry_offset(dn); +out: + return dn; +} + +/* + * Incorporate results into the local cache. This is either just + * one inode, or a directory, dentry, and possibly linked-to inode (e.g., + * after a lookup). + * + * A reply may contain + * a directory inode along with a dentry. + * and/or a target inode + * + * Called with snap_rwsem (read). + */ +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct inode *in = NULL; + struct ceph_mds_reply_inode *ininfo; + struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int i = 0; + int err = 0; + + dout("fill_trace %p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); + +#if 0 + /* + * Debugging hook: + * + * If we resend completed ops to a recovering mds, we get no + * trace. Since that is very rare, pretend this is the case + * to ensure the 'no trace' handlers in the callers behave. + * + * Fill in inodes unconditionally to avoid breaking cap + * invariants. 
+ */ + if (rinfo->head->op & CEPH_MDS_OP_WRITE) { + pr_info("fill_trace faking empty trace on %lld %s\n", + req->r_tid, ceph_mds_op_name(rinfo->head->op)); + if (rinfo->head->is_dentry) { + rinfo->head->is_dentry = 0; + err = fill_inode(req->r_locked_dir, + &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1); + } + if (rinfo->head->is_target) { + rinfo->head->is_target = 0; + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + req->r_fmode); + iput(in); + } + } +#endif + + if (!rinfo->head->is_target && !rinfo->head->is_dentry) { + dout("fill_trace reply is empty!\n"); + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); + return 0; + } + + if (rinfo->head->is_dentry) { + struct inode *dir = req->r_locked_dir; + + err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1, + &req->r_caps_reservation); + if (err < 0) + return err; + } + + /* + * ignore null lease/binding on snapdir ENOENT, or else we + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && !req->r_aborted && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + fsc->mount_options->snapdir_name, + req->r_dentry->d_name.len))) { + /* + * lookup link rename : null -> possibly existing inode + * mknod symlink mkdir : null -> new inode + * unlink : linked -> null + */ + struct inode *dir = req->r_locked_dir; + struct dentry *dn = req->r_dentry; + bool have_dir_cap, have_lease; + + BUG_ON(!dn); + BUG_ON(!dir); + BUG_ON(dn->d_parent->d_inode != dir); + BUG_ON(ceph_ino(dir) != + le64_to_cpu(rinfo->diri.in->ino)); + BUG_ON(ceph_snap(dir) != + le64_to_cpu(rinfo->diri.in->snapid)); + + /* do we have a lease on the whole dir? */ + have_dir_cap = + (le32_to_cpu(rinfo->diri.in->cap.caps) & + CEPH_CAP_FILE_SHARED); + + /* do we have a dn lease? */ + have_lease = have_dir_cap || + (le16_to_cpu(rinfo->dlease->mask) & + CEPH_LOCK_DN); + + if (!have_lease) + dout("fill_trace no dentry lease or dir cap\n"); + + /* rename? */ + if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + dout(" src %p '%.*s' dst %p '%.*s'\n", + req->r_old_dentry, + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + dn, dn->d_name.len, dn->d_name.name); + dout("fill_trace doing d_move %p -> %p\n", + req->r_old_dentry, dn); + + d_move(req->r_old_dentry, dn); + dout(" src %p '%.*s' dst %p '%.*s'\n", + req->r_old_dentry, + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + dn, dn->d_name.len, dn->d_name.name); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(dn); + + /* + * d_move() puts the renamed dentry at the end of + * d_subdirs. We need to assign it an appropriate + * directory offset so we can behave when holding + * I_COMPLETE. + */ + ceph_set_dentry_offset(req->r_old_dentry); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); + + dn = req->r_old_dentry; /* use old_dentry */ + in = dn->d_inode; + } + + /* null dentry? 
*/ + if (!rinfo->head->is_target) { + dout("fill_trace null dentry\n"); + if (dn->d_inode) { + dout("d_delete %p\n", dn); + d_delete(dn); + } else { + dout("d_instantiate %p NULL\n", dn); + d_instantiate(dn, NULL); + if (have_lease && d_unhashed(dn)) + d_rehash(dn); + update_dentry_lease(dn, rinfo->dlease, + session, + req->r_request_started); + } + goto done; + } + + /* attach proper inode */ + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = dn->d_inode; + if (!in) { + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + pr_err("fill_trace bad get_inode " + "%llx.%llx\n", vino.ino, vino.snap); + err = PTR_ERR(in); + d_delete(dn); + goto done; + } + dn = splice_dentry(dn, in, &have_lease, true); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + ihold(in); + } else if (ceph_ino(in) == vino.ino && + ceph_snap(in) == vino.snap) { + ihold(in); + } else { + dout(" %p links to %p %llx.%llx, not %llx.%llx\n", + dn, in, ceph_ino(in), ceph_snap(in), + vino.ino, vino.snap); + have_lease = false; + in = NULL; + } + + if (have_lease) + update_dentry_lease(dn, rinfo->dlease, session, + req->r_request_started); + dout(" final dn %p\n", dn); + i++; + } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) { + struct dentry *dn = req->r_dentry; + + /* fill out a snapdir LOOKUPSNAP dentry */ + BUG_ON(!dn); + BUG_ON(!req->r_locked_dir); + BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + pr_err("fill_inode get_inode badness %llx.%llx\n", + vino.ino, vino.snap); + err = PTR_ERR(in); + d_delete(dn); + goto done; + } + dout(" linking snapped dir %p to dn %p\n", in, dn); + dn = splice_dentry(dn, in, NULL, true); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + ihold(in); + rinfo->head->is_dentry = 1; /* fool notrace handlers */ + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + if (in == NULL || ceph_ino(in) != vino.ino || + ceph_snap(in) != vino.snap) { + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + } + req->r_target_inode = in; + + err = fill_inode(in, + &rinfo->targeti, NULL, + session, req->r_request_started, + (le32_to_cpu(rinfo->head->result) == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + +done: + dout("fill_trace done err=%d\n", err); + return err; +} + +/* + * Prepopulate our cache with readdir results, leases, etc. 
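The readdir prepopulate loop further below stores each dentry's directory offset as ceph_make_fpos(frag, index). That helper is defined elsewhere (super.h, not in this hunk); assuming it simply packs the fragment id into the high 32 bits, the encoding looks like this sketch:

#include <stdint.h>
#include <stdio.h>

/* assumed packing: fragment id in the high 32 bits, per-fragment index
 * in the low 32 bits, so entries within one fragment stay ordered */
static uint64_t make_fpos(uint32_t frag, uint32_t off)
{
        return ((uint64_t)frag << 32) | off;
}

int main(void)
{
        /* index 4 within fragment 0x01000000 (values illustrative) */
        printf("%#llx\n", (unsigned long long)make_fpos(0x01000000, 4));
        return 0;
}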
+ */ +int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct dentry *parent = req->r_dentry; + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct qstr dname; + struct dentry *dn; + struct inode *in; + int err = 0, i; + struct inode *snapdir = NULL; + struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; + u64 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_dentry_info *di; + + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { + snapdir = ceph_get_snapdir(parent->d_inode); + parent = d_find_alias(snapdir); + dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); + } else { + dout("readdir_prepopulate %d items under dn %p\n", + rinfo->dir_nr, parent); + if (rinfo->dir_dir) + ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + } + + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + + dname.name = rinfo->dir_dname[i]; + dname.len = rinfo->dir_dname_len[i]; + dname.hash = full_name_hash(dname.name, dname.len); + + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dout("d_alloc badness\n"); + err = -ENOMEM; + goto out; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + goto out; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } else { + /* reorder parent's d_subdirs */ + spin_lock(&parent->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&dn->d_u.d_child, &parent->d_subdirs); + spin_unlock(&dn->d_lock); + spin_unlock(&parent->d_lock); + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + + /* inode */ + if (dn->d_inode) { + in = dn->d_inode; + } else { + in = ceph_get_inode(parent->d_sb, vino); + if (IS_ERR(in)) { + dout("new_inode badness\n"); + d_delete(dn); + dput(dn); + err = PTR_ERR(in); + goto out; + } + dn = splice_dentry(dn, in, NULL, false); + if (IS_ERR(dn)) + dn = NULL; + } + + if (fill_inode(in, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation) < 0) { + pr_err("fill_inode badness on %p\n", in); + goto next_item; + } + if (dn) + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); +next_item: + if (dn) + dput(dn); + } + req->r_did_prepopulate = true; + +out: + if (snapdir) { + iput(snapdir); + dput(parent); + } + dout("readdir_prepopulate done\n"); + return err; +} + +int ceph_inode_set_size(struct inode *inode, loff_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret = 0; + + spin_lock(&inode->i_lock); + dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1 << 9) - 1) >> 9; + + /* tell the MDS if we are approaching max_size */ + if ((size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) + ret = 1; + + spin_unlock(&inode->i_lock); + return ret; +} + +/* + * Write back inode data in a worker thread. (This can't be done + * in the message handler context.) 
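ceph_inode_set_size() above asks its caller to report back to the MDS only when the file size first crosses half of i_max_size; the condition in isolation (numbers illustrative):

#include <stdint.h>
#include <stdio.h>

/* report when the new size has reached half of max_size but the last
 * size reported to the MDS had not */
static int should_report(uint64_t size, uint64_t reported, uint64_t max_size)
{
        return (size << 1) >= max_size && (reported << 1) < max_size;
}

int main(void)
{
        printf("%d\n", should_report(600, 100, 1000)); /* 1: just crossed */
        printf("%d\n", should_report(700, 600, 1000)); /* 0: already reported */
        return 0;
}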
+ */ +void ceph_queue_writeback(struct inode *inode) +{ + if (queue_work(ceph_inode_to_client(inode)->wb_wq, + &ceph_inode(inode)->i_wb_work)) { + dout("ceph_queue_writeback %p\n", inode); + ihold(inode); + } else { + dout("ceph_queue_writeback %p failed\n", inode); + } +} + +static void ceph_writeback_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_wb_work); + struct inode *inode = &ci->vfs_inode; + + dout("writeback %p\n", inode); + filemap_fdatawrite(&inode->i_data); + iput(inode); +} + +/* + * queue an async invalidation + */ +void ceph_queue_invalidate(struct inode *inode) +{ + if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, + &ceph_inode(inode)->i_pg_inv_work)) { + dout("ceph_queue_invalidate %p\n", inode); + ihold(inode); + } else { + dout("ceph_queue_invalidate %p failed\n", inode); + } +} + +/* + * invalidate any pages that are not dirty or under writeback. this + * includes pages that are clean and mapped. + */ +static void ceph_invalidate_nondirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t index; + int skip_page = + (PageDirty(page) || PageWriteback(page)); + + if (!skip_page) + skip_page = !trylock_page(page); + + /* + * We really shouldn't be looking at the ->index of an + * unlocked page. But we're not allowed to lock these + * pages. So we rely upon nobody altering the ->index + * of this (pinned-by-us) page. + */ + index = page->index; + if (index > next) + next = index; + next++; + + if (skip_page) + continue; + + generic_error_remove_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Invalidate inode pages in a worker thread. (This can't be done + * in the message handler context.) + */ +static void ceph_invalidate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_pg_inv_work); + struct inode *inode = &ci->vfs_inode; + u32 orig_gen; + int check = 0; + + spin_lock(&inode->i_lock); + dout("invalidate_pages %p gen %d revoking %d\n", inode, + ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + /* nevermind! */ + spin_unlock(&inode->i_lock); + goto out; + } + orig_gen = ci->i_rdcache_gen; + spin_unlock(&inode->i_lock); + + ceph_invalidate_nondirty_pages(inode->i_mapping); + + spin_lock(&inode->i_lock); + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { + dout("invalidate_pages %p gen %d successful\n", inode, + ci->i_rdcache_gen); + ci->i_rdcache_revoking--; + check = 1; + } else { + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); + } + spin_unlock(&inode->i_lock); + + if (check) + ceph_check_caps(ci, 0, NULL); +out: + iput(inode); +} + + +/* + * called by trunc_wq; take i_mutex ourselves + * + * We also truncate in a separate thread as well. 
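ceph_invalidate_work() above relies on a generation check to detect races with new reads or further cap revokes while the page cache is invalidated without the inode lock held. A toy model of that pattern (struct toy_ci and its field names are illustrative, and the real code also bails out early when i_rdcache_revoking != i_rdcache_gen):

#include <stdio.h>

struct toy_ci { unsigned gen, revoking; };

static int invalidate_pass(struct toy_ci *ci)
{
        unsigned orig = ci->gen;        /* snapshot under the "lock" */

        /* ... drop the lock and invalidate the page cache here ... */

        if (orig == ci->gen && orig == ci->revoking) {
                ci->revoking--;         /* revoke satisfied */
                return 1;
        }
        return 0;                       /* raced; leave the counters alone */
}

int main(void)
{
        struct toy_ci ci = { .gen = 2, .revoking = 2 };

        printf("%d\n", invalidate_pass(&ci)); /* 1: no race */
        ci.gen = 3;                           /* a read bumped the generation */
        printf("%d\n", invalidate_pass(&ci)); /* 0: must not decrement */
        return 0;
}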
+ */ +static void ceph_vmtruncate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_vmtruncate_work); + struct inode *inode = &ci->vfs_inode; + + dout("vmtruncate_work %p\n", inode); + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + mutex_unlock(&inode->i_mutex); + iput(inode); +} + +/* + * Queue an async vmtruncate. If we fail to queue work, we will handle + * the truncation the next time we call __ceph_do_pending_vmtruncate. + */ +void ceph_queue_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, + &ci->i_vmtruncate_work)) { + dout("ceph_queue_vmtruncate %p\n", inode); + ihold(inode); + } else { + dout("ceph_queue_vmtruncate %p failed, pending=%d\n", + inode, ci->i_truncate_pending); + } +} + +/* + * called with i_mutex held. + * + * Make sure any pending truncation is applied before doing anything + * that may depend on it. + */ +void __ceph_do_pending_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 to; + int wrbuffer_refs, wake = 0; + +retry: + spin_lock(&inode->i_lock); + if (ci->i_truncate_pending == 0) { + dout("__do_pending_vmtruncate %p none pending\n", inode); + spin_unlock(&inode->i_lock); + return; + } + + /* + * make sure any dirty snapped pages are flushed before we + * possibly truncate them.. so write AND block! + */ + if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + dout("__do_pending_vmtruncate %p flushing snaps first\n", + inode); + spin_unlock(&inode->i_lock); + filemap_write_and_wait_range(&inode->i_data, 0, + inode->i_sb->s_maxbytes); + goto retry; + } + + to = ci->i_truncate_size; + wrbuffer_refs = ci->i_wrbuffer_ref; + dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + ci->i_truncate_pending, to); + spin_unlock(&inode->i_lock); + + truncate_inode_pages(inode->i_mapping, to); + + spin_lock(&inode->i_lock); + ci->i_truncate_pending--; + if (ci->i_truncate_pending == 0) + wake = 1; + spin_unlock(&inode->i_lock); + + if (wrbuffer_refs == 0) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + if (wake) + wake_up_all(&ci->i_cap_wq); +} + + +/* + * symlinks + */ +static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); + nd_set_link(nd, ci->i_symlink); + return NULL; +} + +static const struct inode_operations ceph_symlink_iops = { + .readlink = generic_readlink, + .follow_link = ceph_sym_follow_link, +}; + +/* + * setattr + */ +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *parent_inode = dentry->d_parent->d_inode; + const unsigned int ia_valid = attr->ia_valid; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + int issued; + int release = 0, dirtied = 0; + int mask = 0; + int err = 0; + int inode_dirty_flags = 0; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + __ceph_do_pending_vmtruncate(inode); + + err = inode_change_ok(inode, attr); + if (err != 0) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + spin_lock(&inode->i_lock); + issued = __ceph_caps_issued(ci, NULL); + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (ia_valid & ATTR_UID) { + dout("setattr %p 
uid %d -> %d\n", inode, + inode->i_uid, attr->ia_uid); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_uid = attr->ia_uid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_uid != inode->i_uid) { + req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + mask |= CEPH_SETATTR_UID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_GID) { + dout("setattr %p gid %d -> %d\n", inode, + inode->i_gid, attr->ia_gid); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_gid = attr->ia_gid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_gid != inode->i_gid) { + req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + mask |= CEPH_SETATTR_GID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_MODE) { + dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, + attr->ia_mode); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_mode = attr->ia_mode; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_mode != inode->i_mode) { + req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); + mask |= CEPH_SETATTR_MODE; + release |= CEPH_CAP_AUTH_SHARED; + } + } + + if (ia_valid & ATTR_ATIME) { + dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode, + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_atime, + &attr->ia_atime) < 0) { + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_atime, &attr->ia_atime)) { + ceph_encode_timespec(&req->r_args.setattr.atime, + &attr->ia_atime); + mask |= CEPH_SETATTR_ATIME; + release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_MTIME) { + dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_mtime, + &attr->ia_mtime) < 0) { + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { + ceph_encode_timespec(&req->r_args.setattr.mtime, + &attr->ia_mtime); + mask |= CEPH_SETATTR_MTIME; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if (attr->ia_size > inode->i_sb->s_maxbytes) { + err = -EINVAL; + goto out; + } + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + inode->i_size = attr->ia_size; + inode->i_blocks = + (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_ctime = attr->ia_ctime; + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; 
+ } + } + + /* these do nothing */ + if (ia_valid & ATTR_CTIME) { + bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| + ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; + dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + only ? "ctime only" : "ignored"); + inode->i_ctime = attr->ia_ctime; + if (only) { + /* + * if kernel wants to dirty ctime but nothing else, + * we need to choose a cap to dirty under, or do + * a almost-no-op setattr + */ + if (issued & CEPH_CAP_AUTH_EXCL) + dirtied |= CEPH_CAP_AUTH_EXCL; + else if (issued & CEPH_CAP_FILE_EXCL) + dirtied |= CEPH_CAP_FILE_EXCL; + else if (issued & CEPH_CAP_XATTR_EXCL) + dirtied |= CEPH_CAP_XATTR_EXCL; + else + mask |= CEPH_SETATTR_CTIME; + } + } + if (ia_valid & ATTR_FILE) + dout("setattr %p ATTR_FILE ... hrm!\n", inode); + + if (dirtied) { + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode->i_ctime = CURRENT_TIME; + } + + release &= issued; + spin_unlock(&inode->i_lock); + + if (inode_dirty_flags) + __mark_inode_dirty(inode, inode_dirty_flags); + + if (mask) { + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = release; + req->r_args.setattr.mask = cpu_to_le32(mask); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + } + dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, + ceph_cap_string(dirtied), mask); + + ceph_mdsc_put_request(req); + __ceph_do_pending_vmtruncate(inode); + return err; +out: + spin_unlock(&inode->i_lock); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Verify that we have a lease on the given mask. If not, + * do a getattr against an mds. + */ +int ceph_do_getattr(struct inode *inode, int mask) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(inode) == CEPH_SNAPDIR) { + dout("do_getattr inode %p SNAPDIR\n", inode); + return 0; + } + + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); + if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) + return 0; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_args.getattr.mask = cpu_to_le32(mask); + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + dout("do_getattr result=%d\n", err); + return err; +} + + +/* + * Check inode permissions. We verify we have a valid value for + * the AUTH cap, then call the generic handler. + */ +int ceph_permission(struct inode *inode, int mask, unsigned int flags) +{ + int err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + + if (!err) + err = generic_permission(inode, mask, flags, NULL); + return err; +} + +/* + * Get all attributes. Hopefully somedata we'll have a statlite() + * and can limit the fields we require to be accurate. 
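ceph_setattr() above applies one pattern per attribute: with the relevant EXCL capability the change is made locally and the cap is marked dirty; otherwise the field is packed into the MDS request. A simplified toy of that dispatch (the bit values, struct toy_req, and the two-way split are illustrative; the real code also skips the request when a SHARED cap shows the value is unchanged):

#include <stdio.h>

#define CAP_AUTH_EXCL  0x1              /* illustrative bit values only */
#define SETATTR_UID    0x1

struct toy_req { unsigned mask; unsigned uid; };

static void set_uid(unsigned issued, unsigned *local_uid, unsigned new_uid,
                    unsigned *dirtied, struct toy_req *req)
{
        if (issued & CAP_AUTH_EXCL) {
                *local_uid = new_uid;           /* we own the metadata */
                *dirtied |= CAP_AUTH_EXCL;      /* flushed to the MDS later */
        } else {
                req->uid = new_uid;             /* ask the MDS to do it */
                req->mask |= SETATTR_UID;
        }
}

int main(void)
{
        unsigned uid = 0, dirtied = 0;
        struct toy_req req = { 0, 0 };

        set_uid(CAP_AUTH_EXCL, &uid, 1000, &dirtied, &req);
        printf("uid=%u dirtied=%#x mask=%#x\n", uid, dirtied, req.mask);
        return 0;
}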
+ */ +int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + int err; + + err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + if (ceph_snap(inode) != CEPH_NOSNAP) + stat->dev = ceph_snap(inode); + else + stat->dev = 0; + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; + stat->blocks = 0; + stat->blksize = 65536; + } + } + return err; +} diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c new file mode 100644 index 00000000..ef0b5f48 --- /dev/null +++ b/fs/ceph/ioctl.c @@ -0,0 +1,255 @@ +#include + +#include "super.h" +#include "mds_client.h" +#include + +#include "ioctl.h" + + +/* + * ioctls + */ + +/* + * get and set the file layout + */ +static long ceph_ioctl_get_layout(struct file *file, void __user *arg) +{ + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout l; + int err; + + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + l.stripe_unit = ceph_file_layout_su(ci->i_layout); + l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + l.object_size = ceph_file_layout_object_size(ci->i_layout); + l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + if (copy_to_user(arg, &l, sizeof(l))) + return -EFAULT; + } + + return err; +} + +static long ceph_ioctl_set_layout(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. 
+ */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Return object name, size/offset information, and location (OSD + * number, network address) for a given file offset. + */ +static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) +{ + struct ceph_ioctl_dataloc dl; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + u64 len = 1, olen; + u64 tmp; + struct ceph_object_layout ol; + struct ceph_pg pgid; + + /* copy and validate */ + if (copy_from_user(&dl, arg, sizeof(dl))) + return -EFAULT; + + down_read(&osdc->map_sem); + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + &dl.object_no, &dl.object_offset, &olen); + dl.file_offset -= dl.object_offset; + dl.object_size = ceph_file_layout_object_size(ci->i_layout); + dl.block_size = ceph_file_layout_su(ci->i_layout); + + /* block_offset = object_offset % block_size */ + tmp = dl.object_offset; + dl.block_offset = do_div(tmp, dl.block_size); + + snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", + ceph_ino(inode), dl.object_no); + ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, + osdc->osdmap); + + pgid = ol.ol_pgid; + dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + if (dl.osd >= 0) { + struct ceph_entity_addr *a = + ceph_osd_addr(osdc->osdmap, dl.osd); + if (a) + memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr)); + } else { + memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); + } + up_read(&osdc->map_sem); + + /* send result back to user */ + if (copy_to_user(arg, &dl, sizeof(dl))) + return -EFAULT; + + return 0; +} + +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&inode->i_lock); + ci->i_nr_by_mode[fi->fmode]--; + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[fi->fmode]++; + 
spin_unlock(&inode->i_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + +long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return ceph_ioctl_get_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT: + return ceph_ioctl_set_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + + case CEPH_IOC_GET_DATALOC: + return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); + } + + return -ENOTTY; +} diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h new file mode 100644 index 00000000..52e8fd74 --- /dev/null +++ b/fs/ceph/ioctl.h @@ -0,0 +1,44 @@ +#ifndef FS_CEPH_IOCTL_H +#define FS_CEPH_IOCTL_H + +#include +#include + +#define CEPH_IOCTL_MAGIC 0x97 + +/* just use u64 to align sanely on all archs */ +struct ceph_ioctl_layout { + __u64 stripe_unit, stripe_count, object_size; + __u64 data_pool; + __s64 preferred_osd; +}; + +#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) + +/* + * Extract identity, address of the OSD and object storing a given + * file offset. + */ +struct ceph_ioctl_dataloc { + __u64 file_offset; /* in+out: file offset */ + __u64 object_offset; /* out: offset in object */ + __u64 object_no; /* out: object # */ + __u64 object_size; /* out: object size */ + char object_name[64]; /* out: object name */ + __u64 block_offset; /* out: offset in block */ + __u64 block_size; /* out: block length */ + __s64 osd; /* out: osd # */ + struct sockaddr_storage osd_addr; /* out: osd address */ +}; + +#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ + struct ceph_ioctl_dataloc) + +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +#endif diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c new file mode 100644 index 00000000..80576d05 --- /dev/null +++ b/fs/ceph/locks.c @@ -0,0 +1,286 @@ +#include + +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +/** + * Implement fcntl and flock locking functions. 
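For completeness, a hypothetical userspace sketch that validates a layout the same way the ioctl handlers above do and then issues CEPH_IOC_SET_LAYOUT. The struct is redeclared locally for the demo with plain stdint types, and the mount-point path, PAGE_SIZE of 4096, and the layout numbers are all assumptions.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define PAGE_SIZE 4096                  /* assumed page size */
#define CEPH_IOCTL_MAGIC 0x97

/* mirrors struct ceph_ioctl_layout from fs/ceph/ioctl.h above */
struct ceph_ioctl_layout {
        uint64_t stripe_unit, stripe_count, object_size;
        uint64_t data_pool;
        int64_t preferred_osd;
};

#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, struct ceph_ioctl_layout)

/* same sanity checks the kernel-side handlers perform */
static int layout_valid(const struct ceph_ioctl_layout *l)
{
        if (l->object_size % PAGE_SIZE || l->stripe_unit % PAGE_SIZE)
                return 0;
        if (!l->stripe_unit)
                return 0;
        if (l->object_size && l->object_size % l->stripe_unit)
                return 0;
        return 1;
}

int main(void)
{
        struct ceph_ioctl_layout l = {
                .stripe_unit   = 65536,
                .stripe_count  = 4,
                .object_size   = 4 * 1024 * 1024,
                .data_pool     = 0,
                .preferred_osd = -1,
        };
        int fd = open("/mnt/ceph/somefile", O_RDWR);    /* path illustrative */

        if (fd < 0 || !layout_valid(&l))
                return 1;
        if (ioctl(fd, CEPH_IOC_SET_LAYOUT, &l) < 0)
                perror("CEPH_IOC_SET_LAYOUT");
        close(fd);
        return 0;
}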
+ */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, + int cmd, u8 wait, struct file_lock *fl) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + int err; + u64 length = 0; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + /* This should be adjusted, but I'm not sure if + namespaces actually get id numbers*/ + req->r_args.filelock_change.pid_namespace = + cpu_to_le64((u64)(unsigned long)fl->fl_nspid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + err = ceph_mdsc_do_request(mdsc, inode, req); + + if ( operation == CEPH_MDS_OP_GETFILELOCK){ + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); + return err; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. + */ +int ceph_lock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 0; + u16 op = CEPH_MDS_OP_SETFILELOCK; + + fl->fl_nspid = get_pid(task_tgid(current)); + dout("ceph_lock, fl_pid:%d", fl->fl_pid); + + /* set wait bit as appropriate, then make command as Ceph expects it*/ + if (F_SETLKW == cmd) + wait = 1; + if (F_GETLK == cmd) + op = CEPH_MDS_OP_GETFILELOCK; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + if (!err) { + if ( op != CEPH_MDS_OP_GETFILELOCK ){ + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if + * the kernel detects local + * deadlock. 
*/ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", + err); + } + } + + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + } + return err; +} + +int ceph_flock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 1; + + fl->fl_nspid = get_pid(task_tgid(current)); + dout("ceph_flock, fl_pid:%d", fl->fl_pid); + + /* set wait bit, then clear it out of cmd*/ + if (cmd & LOCK_NB) + wait = 0; + cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); + /* set command sequence that Ceph wants to see: + shared lock, exclusive lock, or unlock */ + if (LOCK_SH == cmd) + lock_cmd = CEPH_LOCK_SHARED; + else if (LOCK_EX == cmd) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, + file, lock_cmd, wait, fl); + if (!err) { + err = flock_lock_file_wait(file, fl); + if (err) { + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on flock_lock_file_wait, undid lock", err); + } + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + } + return err; +} + +/** + * Must be called with BKL already held. Fills in the passed + * counter variables, so you can prepare pagelist metadata before calling + * ceph_encode_locks. + */ +void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) +{ + struct file_lock *lock; + + *fcntl_count = 0; + *flock_count = 0; + + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) + ++(*fcntl_count); + else if (lock->fl_flags & FL_FLOCK) + ++(*flock_count); + } + dout("counted %d flock locks and %d fcntl locks", + *flock_count, *fcntl_count); +} + +/** + * Encode the flock and fcntl locks for the given inode into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, + * we return the value 1. 
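ceph_lock_message() above converts between the VFS (start, end) representation and the MDS (start, length) one in both directions; the conversion on its own, as a sketch (fl_end == LLONG_MAX and a wire length of 0 both mean "to end of file"):

#include <limits.h>
#include <stdio.h>

static unsigned long long to_length(long long start, long long end)
{
        return end == LLONG_MAX ? 0 : (unsigned long long)(end - start + 1);
}

static long long to_end(long long start, unsigned long long length)
{
        return length ? start + (long long)length - 1 : 0;
}

int main(void)
{
        printf("%llu\n", to_length(100, 199));      /* 100 bytes */
        printf("%llu\n", to_length(0, LLONG_MAX));  /* 0 = whole file */
        printf("%lld\n", to_end(100, 100));         /* back to 199 */
        return 0;
}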
+ */ +int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + struct file_lock *lock; + struct ceph_filelock cephlock; + int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; + + dout("encoding %d flock and %d fcntl locks", num_flock_locks, + num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); + if (err) + goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &cephlock); + if (err) + goto fail; + err = ceph_pagelist_append(pagelist, &cephlock, + sizeof(struct ceph_filelock)); + } + if (err) + goto fail; + } + + err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); + if (err) + goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_FLOCK) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &cephlock); + if (err) + goto fail; + err = ceph_pagelist_append(pagelist, &cephlock, + sizeof(struct ceph_filelock)); + } + if (err) + goto fail; + } +fail: + return err; +} + +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64(lock->fl_pid); + cephlock->pid_namespace = + cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c new file mode 100644 index 00000000..0c1d9175 --- /dev/null +++ b/fs/ceph/mds_client.c @@ -0,0 +1,3454 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include +#include +#include +#include +#include + +/* + * A cluster of MDS (metadata server) daemons is responsible for + * managing the file system namespace (the directory hierarchy and + * inodes) and for coordinating shared access to storage. Metadata is + * partitioning hierarchically across a number of servers, and that + * partition varies over time as the cluster adjusts the distribution + * in order to balance load. + * + * The MDS client is primarily responsible to managing synchronous + * metadata requests for operations like open, unlink, and so forth. + * If there is a MDS failure, we find out about it when we (possibly + * request and) receive a new MDS map, and can resubmit affected + * requests. + * + * For the most part, though, we take advantage of a lossless + * communications channel to the MDS, and do not need to worry about + * timing out or resubmitting requests. + * + * We maintain a stateful "session" with each MDS we interact with. + * Within each session, we sent periodic heartbeat messages to ensure + * any capabilities or leases we have been issues remain valid. 
If + * the session times out and goes stale, our leases and capabilities + * are no longer valid. + */ + +struct ceph_reconnect_state { + struct ceph_pagelist *pagelist; + bool flock; +}; + +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head); + +static const struct ceph_connection_operations mds_con_ops; + + +/* + * mds reply parsing + */ + +/* + * parse individual inode info + */ +static int parse_reply_info_in(void **p, void *end, + struct ceph_mds_reply_info_in *info, + int features) +{ + int err = -EIO; + + info->in = *p; + *p += sizeof(struct ceph_mds_reply_inode) + + sizeof(*info->in->fragtree.splits) * + le32_to_cpu(info->in->fragtree.nsplits); + + ceph_decode_32_safe(p, end, info->symlink_len, bad); + ceph_decode_need(p, end, info->symlink_len, bad); + info->symlink = *p; + *p += info->symlink_len; + + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + + ceph_decode_32_safe(p, end, info->xattr_len, bad); + ceph_decode_need(p, end, info->xattr_len, bad); + info->xattr_data = *p; + *p += info->xattr_len; + return 0; +bad: + return err; +} + +/* + * parse a normal reply, which may contain a (dir+)dentry and/or a + * target inode. + */ +static int parse_reply_info_trace(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + int err; + + if (info->head->is_dentry) { + err = parse_reply_info_in(p, end, &info->diri, features); + if (err < 0) + goto out_bad; + + if (unlikely(*p + sizeof(*info->dirfrag) > end)) + goto bad; + info->dirfrag = *p; + *p += sizeof(*info->dirfrag) + + sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); + if (unlikely(*p > end)) + goto bad; + + ceph_decode_32_safe(p, end, info->dname_len, bad); + ceph_decode_need(p, end, info->dname_len, bad); + info->dname = *p; + *p += info->dname_len; + info->dlease = *p; + *p += sizeof(*info->dlease); + } + + if (info->head->is_target) { + err = parse_reply_info_in(p, end, &info->targeti, features); + if (err < 0) + goto out_bad; + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing mds trace %d\n", err); + return err; +} + +/* + * parse readdir results + */ +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + u32 num, i = 0; + int err; + + info->dir_dir = *p; + if (*p + sizeof(*info->dir_dir) > end) + goto bad; + *p += sizeof(*info->dir_dir) + + sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); + if (*p > end) + goto bad; + + ceph_decode_need(p, end, sizeof(num) + 2, bad); + num = ceph_decode_32(p); + info->dir_end = ceph_decode_8(p); + info->dir_complete = ceph_decode_8(p); + if (num == 0) + goto done; + + /* alloc large array */ + info->dir_nr = num; + info->dir_in = kcalloc(num, sizeof(*info->dir_in) + + sizeof(*info->dir_dname) + + sizeof(*info->dir_dname_len) + + sizeof(*info->dir_dlease), + GFP_NOFS); + if (info->dir_in == NULL) { + err = -ENOMEM; + goto out_bad; + } + info->dir_dname = (void *)(info->dir_in + num); + info->dir_dname_len = (void *)(info->dir_dname + num); + info->dir_dlease = (void *)(info->dir_dname_len + num); + + while (num) { + /* dentry */ + ceph_decode_need(p, end, sizeof(u32)*2, bad); + info->dir_dname_len[i] = ceph_decode_32(p); + ceph_decode_need(p, end, info->dir_dname_len[i], bad); + info->dir_dname[i] = *p; + *p += info->dir_dname_len[i]; + dout("parsed dir dname 
'%.*s'\n", info->dir_dname_len[i], + info->dir_dname[i]); + info->dir_dlease[i] = *p; + *p += sizeof(struct ceph_mds_reply_lease); + + /* inode */ + err = parse_reply_info_in(p, end, &info->dir_in[i], features); + if (err < 0) + goto out_bad; + i++; + num--; + } + +done: + if (*p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing dir contents %d\n", err); + return err; +} + +/* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info, features); + else + return parse_reply_info_dir(p, end, info, features); +} + +/* + * parse entire mds reply + */ +static int parse_reply_info(struct ceph_msg *msg, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + void *p, *end; + u32 len; + int err; + + info->head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); + end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); + + /* trace */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_trace(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* extra */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_extra(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* snap blob */ + ceph_decode_32_safe(&p, end, len, bad); + info->snapblob_len = len; + info->snapblob = p; + p += len; + + if (p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("mds parse_reply err %d\n", err); + return err; +} + +static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) +{ + kfree(info->dir_in); +} + + +/* + * sessions + */ +static const char *session_state_name(int s) +{ + switch (s) { + case CEPH_MDS_SESSION_NEW: return "new"; + case CEPH_MDS_SESSION_OPENING: return "opening"; + case CEPH_MDS_SESSION_OPEN: return "open"; + case CEPH_MDS_SESSION_HUNG: return "hung"; + case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; + case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + default: return "???"; + } +} + +static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +{ + if (atomic_inc_not_zero(&s->s_ref)) { + dout("mdsc get_session %p %d -> %d\n", s, + atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + return s; + } else { + dout("mdsc get_session %p 0 -- FAIL", s); + return NULL; + } +} + +void ceph_put_mds_session(struct ceph_mds_session *s) +{ + dout("mdsc put_session %p %d -> %d\n", s, + atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); + if (atomic_dec_and_test(&s->s_ref)) { + if (s->s_authorizer) + s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_authorizer); + kfree(s); + } +} + +/* + * called under mdsc->mutex + */ +struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *session; + + if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + 
return NULL; + session = mdsc->sessions[mds]; + dout("lookup_mds_session %p %d\n", session, + atomic_read(&session->s_ref)); + get_session(session); + return session; +} + +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions) + return false; + return mdsc->sessions[mds]; +} + +static int __verify_registered_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + if (s->s_mds >= mdsc->max_sessions || + mdsc->sessions[s->s_mds] != s) + return -ENOENT; + return 0; +} + +/* + * create+register a new session for given mds. + * called under mdsc->mutex. + */ +static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *s; + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) + return ERR_PTR(-ENOMEM); + s->s_mdsc = mdsc; + s->s_mds = mds; + s->s_state = CEPH_MDS_SESSION_NEW; + s->s_ttl = 0; + s->s_seq = 0; + mutex_init(&s->s_mutex); + + ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); + s->s_con.private = s; + s->s_con.ops = &mds_con_ops; + s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; + s->s_con.peer_name.num = cpu_to_le64(mds); + + spin_lock_init(&s->s_cap_lock); + s->s_cap_gen = 0; + s->s_cap_ttl = 0; + s->s_renew_requested = 0; + s->s_renew_seq = 0; + INIT_LIST_HEAD(&s->s_caps); + s->s_nr_caps = 0; + s->s_trim_caps = 0; + atomic_set(&s->s_ref, 1); + INIT_LIST_HEAD(&s->s_waiting); + INIT_LIST_HEAD(&s->s_unsafe); + s->s_num_cap_releases = 0; + s->s_cap_iterator = NULL; + INIT_LIST_HEAD(&s->s_cap_releases); + INIT_LIST_HEAD(&s->s_cap_releases_done); + INIT_LIST_HEAD(&s->s_cap_flushing); + INIT_LIST_HEAD(&s->s_cap_snaps_flushing); + + dout("register_session mds%d\n", mds); + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds+1); + struct ceph_mds_session **sa; + + dout("register_session realloc to %d\n", newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (sa == NULL) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + mdsc->sessions[mds] = s; + atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + + ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + return s; + +fail_realloc: + kfree(s); + return ERR_PTR(-ENOMEM); +} + +/* + * called under mdsc->mutex + */ +static void __unregister_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + dout("__unregister_session mds%d %p\n", s->s_mds, s); + BUG_ON(mdsc->sessions[s->s_mds] != s); + mdsc->sessions[s->s_mds] = NULL; + ceph_con_close(&s->s_con); + ceph_put_mds_session(s); +} + +/* + * drop session refs in request. 
+ * + * should be last request ref, or hold mdsc->mutex + */ +static void put_request_session(struct ceph_mds_request *req) +{ + if (req->r_session) { + ceph_put_mds_session(req->r_session); + req->r_session = NULL; + } +} + +void ceph_mdsc_release_request(struct kref *kref) +{ + struct ceph_mds_request *req = container_of(kref, + struct ceph_mds_request, + r_kref); + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) { + ceph_msg_put(req->r_reply); + destroy_reply_info(&req->r_reply_info); + } + if (req->r_inode) { + ceph_put_cap_refs(ceph_inode(req->r_inode), + CEPH_CAP_PIN); + iput(req->r_inode); + } + if (req->r_locked_dir) + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), + CEPH_CAP_PIN); + if (req->r_target_inode) + iput(req->r_target_inode); + if (req->r_dentry) + dput(req->r_dentry); + if (req->r_old_dentry) { + ceph_put_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + dput(req->r_old_dentry); + } + kfree(req->r_path1); + kfree(req->r_path2); + put_request_session(req); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); + kfree(req); +} + +/* + * lookup session, bump ref if found. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, + u64 tid) +{ + struct ceph_mds_request *req; + struct rb_node *n = mdsc->request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mds_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else { + ceph_mdsc_get_request(req); + return req; + } + } + return NULL; +} + +static void __insert_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *new) +{ + struct rb_node **p = &mdsc->request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mds_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mds_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &mdsc->request_tree); +} + +/* + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). + * + * Called under mdsc->mutex. 
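+ * + * If @dir is supplied, a reference is taken on it and the request is + * added to the directory's i_unsafe_dirops list (under i_unsafe_lock), + * so that a later fsync on the directory can wait for the operation + * to commit.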
+ */ +static void __register_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + struct inode *dir) +{ + req->r_tid = ++mdsc->last_tid; + if (req->r_num_caps) + ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); + dout("__register_request %p tid %lld\n", req, req->r_tid); + ceph_mdsc_get_request(req); + __insert_request(mdsc, req); + + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + + ihold(dir); + spin_lock(&ci->i_unsafe_lock); + req->r_unsafe_dir = dir; + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } +} + +static void __unregister_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &mdsc->request_tree); + RB_CLEAR_NODE(&req->r_node); + + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); + + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_dir_item); + spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; + } + + ceph_mdsc_put_request(req); +} + +/* + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. + * + * Called under mdsc->mutex. + */ +struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + +static int __choose_mds(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; + int mds = -1; + u32 hash = req->r_direct_hash; + bool is_hash = req->r_direct_is_hash; + + /* + * is there a specific mds we should try? ignore hint if we have + * no session and the mds is not up (active or recovering). + */ + if (req->r_resend_mds >= 0 && + (__have_session(mdsc, req->r_resend_mds) || + ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { + dout("choose_mds using resend_mds mds%d\n", + req->r_resend_mds); + return req->r_resend_mds; + } + + if (mode == USE_RANDOM_MDS) + goto random; + + inode = NULL; + if (req->r_inode) { + inode = req->r_inode; + } else if (req->r_dentry) { + struct inode *dir = req->r_dentry->d_parent->d_inode; + + if (dir->i_sb != mdsc->fsc->sb) { + /* not this fs! 
*/ + inode = req->r_dentry->d_inode; + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = + get_nonsnap_parent(req->r_dentry->d_parent); + inode = dn->d_inode; + dout("__choose_mds using nonsnap parent %p\n", inode); + } else if (req->r_dentry->d_inode) { + /* dentry target */ + inode = req->r_dentry->d_inode; + } else { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(req->r_dentry); + is_hash = true; + } + } + + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, + (int)hash, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), + frag.frag, mds, + (int)r, frag.ndist); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. */ + mode = USE_AUTH_MDS; + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; + } + } + } + + spin_lock(&inode->i_lock); + cap = NULL; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&inode->i_lock); + goto random; + } + mds = cap->session->s_mds; + dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&inode->i_lock); + return mds; + +random: + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); + dout("choose_mds chose random mds%d\n", mds); + return mds; +} + + +/* + * session messages + */ +static struct ceph_msg *create_session_msg(u32 op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + if (!msg) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return NULL; + } + h = msg->front.iov_base; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + return msg; +} + +/* + * send session open request. + * + * called under mdsc->mutex + */ +static int __open_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int mstate; + int mds = session->s_mds; + + /* wait for mds to go active? 
*/ + mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); + dout("open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); + session->s_state = CEPH_MDS_SESSION_OPENING; + session->s_renew_requested = jiffies; + + /* send connect message */ + msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static void __open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + int target; + + if (mds >= mdsc->mdsmap->m_max_mds) + return; + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + target = mi->export_targets[i]; + ts = __ceph_lookup_mds_session(mdsc, target); + if (!ts) { + ts = register_session(mdsc, target); + if (IS_ERR(ts)) + return; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + else + dout(" mds%d target mds%d %p is %s\n", session->s_mds, + i, ts, session_state_name(ts->s_state)); + ceph_put_mds_session(ts); + } +} + +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + +/* + * session caps + */ + +/* + * Free preallocated cap messages assigned to this session + */ +static void cleanup_cap_releases(struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * Helper to safely iterate over all caps associated with a session, with + * special care taken to handle a racing __ceph_remove_cap(). + * + * Caller must hold session s_mutex. 
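+ * + * The session's s_cap_iterator marks the cap currently being visited; + * a racing __ceph_remove_cap() leaves such a cap on the list with + * cap->ci cleared, and we finish the removal here once the callback + * returns. Inode and cap references from the previous iteration are + * dropped only after s_cap_lock has been released.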
+ */ +static int iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) +{ + struct list_head *p; + struct ceph_cap *cap; + struct inode *inode, *last_inode = NULL; + struct ceph_cap *old_cap = NULL; + int ret; + + dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + spin_lock(&session->s_cap_lock); + p = session->s_caps.next; + while (p != &session->s_caps) { + cap = list_entry(p, struct ceph_cap, session_caps); + inode = igrab(&cap->ci->vfs_inode); + if (!inode) { + p = p->next; + continue; + } + session->s_cap_iterator = cap; + spin_unlock(&session->s_cap_lock); + + if (last_inode) { + iput(last_inode); + last_inode = NULL; + } + if (old_cap) { + ceph_put_cap(session->s_mdsc, old_cap); + old_cap = NULL; + } + + ret = cb(inode, cap, arg); + last_inode = inode; + + spin_lock(&session->s_cap_lock); + p = p->next; + if (cap->ci == NULL) { + dout("iterate_session_caps finishing cap %p removal\n", + cap); + BUG_ON(cap->session != session); + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + old_cap = cap; /* put_cap it w/o locks held */ + } + if (ret < 0) + goto out; + } + ret = 0; +out: + session->s_cap_iterator = NULL; + spin_unlock(&session->s_cap_lock); + + if (last_inode) + iput(last_inode); + if (old_cap) + ceph_put_cap(session->s_mdsc, old_cap); + + return ret; +} + +static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + if (!__ceph_is_any_real_caps(ci)) { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + if (drop && ci->i_wrbuffer_ref) { + pr_info(" dropping dirty data for %p %lld\n", + inode, ceph_ino(inode)); + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + drop++; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&inode->i_lock); + while (drop--) + iput(inode); + return 0; +} + +/* + * caller must hold session s_mutex + */ +static void remove_session_caps(struct ceph_mds_session *session) +{ + dout("remove_session_caps on %p\n", session); + iterate_session_caps(session, remove_session_caps_cb, NULL); + BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); + cleanup_cap_releases(session); +} + +/* + * wake up any threads waiting on this session's caps. if the cap is + * old (didn't get renewed on the client reconnect), remove it now. + * + * caller must hold s_mutex. 
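+ * + * When @arg (reconnect) is set, each inode's wanted/requested max_size + * is also cleared, allowing max size requests to be re-issued to the + * recovered MDS.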
+ */ +static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + wake_up_all(&ci->i_cap_wq); + if (arg) { + spin_lock(&inode->i_lock); + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); + } + return 0; +} + +static void wake_up_session_caps(struct ceph_mds_session *session, + int reconnect) +{ + dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)reconnect); +} + +/* + * Send periodic message to MDS renewing all currently held caps. The + * ack will reset the expiration for all caps from this session. + * + * caller holds s_mutex + */ +static int send_renew_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int state; + + if (time_after_eq(jiffies, session->s_cap_ttl) && + time_after_eq(session->s_cap_ttl, session->s_renew_requested)) + pr_info("mds%d caps stale\n", session->s_mds); + session->s_renew_requested = jiffies; + + /* do not try to renew caps until a recovering mds has reconnected + * with its clients. */ + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout("send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); + return 0; + } + + dout("send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Note new cap ttl, and any transition from stale -> not stale (fresh?). + * + * Called under session->s_mutex + */ +static void renewed_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, int is_renew) +{ + int was_stale; + int wake = 0; + + spin_lock(&session->s_cap_lock); + was_stale = is_renew && (session->s_cap_ttl == 0 || + time_after_eq(jiffies, session->s_cap_ttl)); + + session->s_cap_ttl = session->s_renew_requested + + mdsc->mdsmap->m_session_timeout*HZ; + + if (was_stale) { + if (time_before(jiffies, session->s_cap_ttl)) { + pr_info("mds%d caps renewed\n", session->s_mds); + wake = 1; + } else { + pr_info("mds%d caps still stale\n", session->s_mds); + } + } + dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", + session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + spin_unlock(&session->s_cap_lock); + + if (wake) + wake_up_session_caps(session, 0); +} + +/* + * send a session close request + */ +static int request_close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("request_close_session mds%d state %s seq %lld\n", + session->s_mds, session_state_name(session->s_state), + session->s_seq); + msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Called with s_mutex held. + */ +static int __close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (session->s_state >= CEPH_MDS_SESSION_CLOSING) + return 0; + session->s_state = CEPH_MDS_SESSION_CLOSING; + return request_close_session(mdsc, session); +} + +/* + * Trim old(er) caps. 
+ * + * Because we can't cache an inode without one or more caps, we do + * this indirectly: if a cap is unused, we prune its aliases, at which + * point the inode will hopefully get dropped to. + * + * Yes, this is a bit sloppy. Our only real goal here is to respond to + * memory pressure from the MDS, though, so it needn't be perfect. + */ +static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +{ + struct ceph_mds_session *session = arg; + struct ceph_inode_info *ci = ceph_inode(inode); + int used, oissued, mine; + + if (session->s_trim_caps <= 0) + return -1; + + spin_lock(&inode->i_lock); + mine = cap->issued | cap->implemented; + used = __ceph_caps_used(ci); + oissued = __ceph_caps_issued_other(ci, cap); + + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), + ceph_cap_string(used)); + if (ci->i_dirty_caps) + goto out; /* dirty caps */ + if ((used & ~oissued) & mine) + goto out; /* we need these caps */ + + session->s_trim_caps--; + if (oissued) { + /* we aren't the only cap.. just remove us */ + __ceph_remove_cap(cap); + } else { + /* try to drop referring dentries */ + spin_unlock(&inode->i_lock); + d_prune_aliases(inode); + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, atomic_read(&inode->i_count)); + return 0; + } + +out: + spin_unlock(&inode->i_lock); + return 0; +} + +/* + * Trim session cap count down to some max number. + */ +static int trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) +{ + int trim_caps = session->s_nr_caps - max_caps; + + dout("trim_caps mds%d start: %d / %d, trim %d\n", + session->s_mds, session->s_nr_caps, max_caps, trim_caps); + if (trim_caps > 0) { + session->s_trim_caps = trim_caps; + iterate_session_caps(session, trim_caps_cb, session); + dout("trim_caps mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - session->s_trim_caps); + session->s_trim_caps = 0; + } + return 0; +} + +/* + * Allocate cap_release messages. If there is a partially full message + * in the queue, try to allocate enough to cover it's remainder, so that + * we can send it immediately. + * + * Called under s_mutex. 
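+ * + * Each message holds up to CEPH_CAPS_PER_RELEASE entries; we keep + * allocating until s_num_cap_releases covers s_nr_caps plus the + * configured cap_release_safety margin.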
+ */ +int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg, *partial = NULL; + struct ceph_mds_cap_release *head; + int err = -ENOMEM; + int extra = mdsc->fsc->mount_options->cap_release_safety; + int num; + + dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, + extra); + + spin_lock(&session->s_cap_lock); + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + if (num) { + dout(" partial %p with (%d/%d)\n", msg, num, + (int)CEPH_CAPS_PER_RELEASE); + extra += CEPH_CAPS_PER_RELEASE - num; + partial = msg; + } + } + while (session->s_num_cap_releases < session->s_nr_caps + extra) { + spin_unlock(&session->s_cap_lock); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, + GFP_NOFS); + if (!msg) + goto out_unlocked; + dout("add_cap_releases %p msg %p now %d\n", session, msg, + (int)msg->front.iov_len); + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + spin_lock(&session->s_cap_lock); + list_add(&msg->list_head, &session->s_cap_releases); + session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; + } + + if (partial) { + head = partial->front.iov_base; + num = le32_to_cpu(head->num); + dout(" queueing partial %p with %d/%d\n", partial, num, + (int)CEPH_CAPS_PER_RELEASE); + list_move_tail(&partial->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + } + err = 0; + spin_unlock(&session->s_cap_lock); +out_unlocked: + return err; +} + +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_seq + */ +static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +{ + int mds, ret = 1; + + dout("check_cap_flush want %lld\n", want_flush_seq); + mutex_lock(&mdsc->mutex); + for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = mdsc->sessions[mds]; + + if (!session) + continue; + get_session(session); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (!list_empty(&session->s_cap_flushing)) { + struct ceph_inode_info *ci = + list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + if (ci->i_cap_flush_seq <= want_flush_seq) { + dout("check_cap_flush still flushing %p " + "seq %lld <= %lld to mds%d\n", inode, + ci->i_cap_flush_seq, want_flush_seq, + session->s_mds); + ret = 0; + } + spin_unlock(&inode->i_lock); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + if (!ret) + return ret; + mutex_lock(&mdsc->mutex); + } + + mutex_unlock(&mdsc->mutex); + dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + return ret; +} + +/* + * called under s_mutex + */ +void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("send_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + spin_unlock(&session->s_cap_lock); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, 
msg); + spin_lock(&session->s_cap_lock); + } + spin_unlock(&session->s_cap_lock); +} + +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); + head->num = cpu_to_le32(0); + session->s_num_cap_releases += num; + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } + + spin_unlock(&session->s_cap_lock); +} + +/* + * requests + */ + +/* + * Create an mds request. + */ +struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) +{ + struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + + if (!req) + return ERR_PTR(-ENOMEM); + + mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; + req->r_started = jiffies; + req->r_resend_mds = -1; + INIT_LIST_HEAD(&req->r_unsafe_dir_item); + req->r_fmode = -1; + kref_init(&req->r_kref); + INIT_LIST_HEAD(&req->r_wait); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + + req->r_op = op; + req->r_direct_mode = mode; + return req; +} + +/* + * return oldest (lowest) request, tid in request tree, 0 if none. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + +static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req = __get_oldest_req(mdsc); + + if (req) + return req->r_tid; + return 0; +} + +/* + * Build a dentry's path. Allocate on heap; caller must kfree. Based + * on build_path_from_dentry in fs/cifs/dir.c. + * + * If @stop_on_nosnap, generate path relative to the first non-snapped + * inode. + * + * Encode hidden .snap dirs as a double /, i.e. 
+ * foo/.snap/bar -> foo//bar + */ +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap) +{ + struct dentry *temp; + char *path; + int len, pos; + unsigned seq; + + if (dentry == NULL) + return ERR_PTR(-EINVAL); + +retry: + len = 0; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + for (temp = dentry; !IS_ROOT(temp);) { + struct inode *inode = temp->d_inode; + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) + len++; /* slash only */ + else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) + break; + else + len += 1 + temp->d_name.len; + temp = temp->d_parent; + if (temp == NULL) { + rcu_read_unlock(); + pr_err("build_path corrupt dentry %p\n", dentry); + return ERR_PTR(-EINVAL); + } + } + rcu_read_unlock(); + if (len) + len--; /* no leading '/' */ + + path = kmalloc(len+1, GFP_NOFS); + if (path == NULL) + return ERR_PTR(-ENOMEM); + pos = len; + path[pos] = 0; /* trailing null */ + rcu_read_lock(); + for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { + struct inode *inode; + + spin_lock(&temp->d_lock); + inode = temp->d_inode; + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { + dout("build_path path+%d: %p SNAPDIR\n", + pos, temp); + } else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) { + break; + } else { + pos -= temp->d_name.len; + if (pos < 0) { + spin_unlock(&temp->d_lock); + break; + } + strncpy(path + pos, temp->d_name.name, + temp->d_name.len); + } + spin_unlock(&temp->d_lock); + if (pos) + path[--pos] = '/'; + temp = temp->d_parent; + if (temp == NULL) { + rcu_read_unlock(); + pr_err("build_path corrupt dentry\n"); + kfree(path); + return ERR_PTR(-EINVAL); + } + } + rcu_read_unlock(); + if (pos != 0 || read_seqretry(&rename_lock, seq)) { + pr_err("build_path did not end path lookup where " + "expected, namelen is %d, pos is %d\n", len, pos); + /* presumably this is only possible if racing with a + rename of one of the parent directories (we can not + lock the dentries above us to prevent this, but + retrying should be harmless) */ + kfree(path); + goto retry; + } + + *base = ceph_ino(temp->d_inode); + *plen = len; + dout("build_path on %p %d built %llx '%.*s'\n", + dentry, dentry->d_count, *base, len, path); + return path; +} + +static int build_dentry_path(struct dentry *dentry, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + char *path; + + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { + *pino = ceph_ino(dentry->d_parent->d_inode); + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + return 0; + } + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +static int build_inode_path(struct inode *inode, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + struct dentry *dentry; + char *path; + + if (ceph_snap(inode) == CEPH_NOSNAP) { + *pino = ceph_ino(inode); + *ppathlen = 0; + return 0; + } + dentry = d_find_alias(inode); + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + dput(dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +/* + * request arguments may be specified via an inode *, a dentry *, or + * an explicit ino+path. 
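+ * + * Precedence is inode, then dentry, then explicit path: an inode is + * normally encoded as its ino with an empty path, a dentry as its + * parent's ino plus the dentry name when the parent is not snapped; + * otherwise a full path is built with ceph_mdsc_build_path().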
+ */ +static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, + const char *rpath, u64 rino, + const char **ppath, int *pathlen, + u64 *ino, int *freepath) +{ + int r = 0; + + if (rinode) { + r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); + } else if (rdentry) { + r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, + *ppath); + } else if (rpath) { + *ino = rino; + *ppath = rpath; + *pathlen = strlen(rpath); + dout(" path %.*s\n", *pathlen, rpath); + } + + return r; +} + +/* + * called under mdsc->mutex + */ +static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds) +{ + struct ceph_msg *msg; + struct ceph_mds_request_head *head; + const char *path1 = NULL; + const char *path2 = NULL; + u64 ino1 = 0, ino2 = 0; + int pathlen1 = 0, pathlen2 = 0; + int freepath1 = 0, freepath2 = 0; + int len; + u16 releases; + void *p, *end; + int ret; + + ret = set_request_path_attr(req->r_inode, req->r_dentry, + req->r_path1, req->r_ino1.ino, + &path1, &pathlen1, &ino1, &freepath1); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out; + } + + ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_path2, req->r_ino2.ino, + &path2, &pathlen2, &ino2, &freepath2); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out_free1; + } + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * + (!!req->r_inode_drop + !!req->r_dentry_drop + + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) + len += req->r_dentry->d_name.len; + if (req->r_old_dentry_drop) + len += req->r_old_dentry->d_name.len; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + if (!msg) { + msg = ERR_PTR(-ENOMEM); + goto out_free2; + } + + msg->hdr.tid = cpu_to_le64(req->r_tid); + + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + end = msg->front.iov_base + msg->front.iov_len; + + head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + head->op = cpu_to_le32(req->r_op); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); + head->args = req->r_args; + + ceph_encode_filepath(&p, end, ino1, path1); + ceph_encode_filepath(&p, end, ino2, path2); + + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + + /* cap releases */ + releases = 0; + if (req->r_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_inode ? 
req->r_inode : req->r_dentry->d_inode, + mds, req->r_inode_drop, req->r_inode_unless, 0); + if (req->r_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_dentry, + mds, req->r_dentry_drop, req->r_dentry_unless); + if (req->r_old_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (req->r_old_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_old_dentry->d_inode, + mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + head->num_releases = cpu_to_le16(releases); + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + msg->pages = req->r_pages; + msg->nr_pages = req->r_num_pages; + msg->hdr.data_len = cpu_to_le32(req->r_data_len); + msg->hdr.data_off = cpu_to_le16(0); + +out_free2: + if (freepath2) + kfree((char *)path2); +out_free1: + if (freepath1) + kfree((char *)path1); +out: + return msg; +} + +/* + * called under mdsc->mutex if error, under no mutex if + * success. + */ +static void complete_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + if (req->r_callback) + req->r_callback(mdsc, req); + else + complete_all(&req->r_completion); +} + +/* + * called under mdsc->mutex + */ +static int __prepare_send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds) +{ + struct ceph_mds_request_head *rhead; + struct ceph_msg *msg; + int flags = 0; + + req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } + dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + + if (req->r_request) { + ceph_msg_put(req->r_request); + req->r_request = NULL; + } + msg = create_request_message(mdsc, req, mds); + if (IS_ERR(msg)) { + req->r_err = PTR_ERR(msg); + complete_request(mdsc, req); + return PTR_ERR(msg); + } + req->r_request = msg; + + rhead = msg->front.iov_base; + rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + if (req->r_got_unsafe) + flags |= CEPH_MDS_FLAG_REPLAY; + if (req->r_locked_dir) + flags |= CEPH_MDS_FLAG_WANT_DENTRY; + rhead->flags = cpu_to_le32(flags); + rhead->num_fwd = req->r_num_fwd; + rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; + + dout(" r_locked_dir = %p\n", req->r_locked_dir); + return 0; +} + +/* + * send request, or put it on the appropriate wait list. 
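+ * + * If no suitable mds is up, the request is queued on waiting_for_map; + * if the chosen session is not yet open, it is queued on the session's + * s_waiting list (opening the session if needed) and resubmitted later + * by __wake_requests().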
+ */ +static int __do_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_session *session = NULL; + int mds = -1; + int err = -EAGAIN; + + if (req->r_err || req->r_got_result) + goto out; + + if (req->r_timeout && + time_after_eq(jiffies, req->r_started + req->r_timeout)) { + dout("do_request timed out\n"); + err = -EIO; + goto finish; + } + + put_request_session(req); + + mds = __choose_mds(mdsc, req); + if (mds < 0 || + ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + dout("do_request no mds or not active, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto out; + } + + /* get, open session */ + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + session = register_session(mdsc, mds); + if (IS_ERR(session)) { + err = PTR_ERR(session); + goto finish; + } + } + req->r_session = get_session(session); + + dout("do_request mds%d session %p state %s\n", mds, session, + session_state_name(session->s_state)); + if (session->s_state != CEPH_MDS_SESSION_OPEN && + session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + list_add(&req->r_wait, &session->s_waiting); + goto out_session; + } + + /* send request */ + req->r_resend_mds = -1; /* forget any previous mds hint */ + + if (req->r_request_started == 0) /* note request start time */ + req->r_request_started = jiffies; + + err = __prepare_send_request(mdsc, req, mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + +out_session: + ceph_put_mds_session(session); +out: + return err; + +finish: + req->r_err = err; + complete_request(mdsc, req); + goto out; +} + +/* + * called under mdsc->mutex + */ +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head) +{ + struct ceph_mds_request *req, *nreq; + + list_for_each_entry_safe(req, nreq, head, r_wait) { + list_del_init(&req->r_wait); + __do_request(mdsc, req); + } +} + +/* + * Wake up threads with requests pending for @mds, so that they can + * resubmit their requests to a possibly different mds. + */ +static void kick_requests(struct ceph_mds_client *mdsc, int mds) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("kick_requests mds%d\n", mds); + for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mds_request, r_node); + if (req->r_got_unsafe) + continue; + if (req->r_session && + req->r_session->s_mds == mds) { + dout(" kicking tid %llu\n", req->r_tid); + __do_request(mdsc, req); + } + } +} + +void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("submit_request on %p\n", req); + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, NULL); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. 
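+ * + * CAP_PIN references are taken on r_inode, r_locked_dir and the parent + * of r_old_dentry while the request is outstanding. The wait is + * killable; if it is interrupted before a reply arrives, the request is + * marked aborted under r_fill_mutex so we do not race with + * ceph_fill_trace or ceph_readdir_prepopulate in our caller's context.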
+ */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + if (req->r_inode) + ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + if (req->r_locked_dir) + ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_old_dentry) + ceph_get_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + + /* issue */ + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, dir); + __do_request(mdsc, req); + + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; + } + + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_killable_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else { + err = wait_for_completion_killable(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); + + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); + + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); + + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); + } else { + err = req->r_err; + } + +out: + mutex_unlock(&mdsc->mutex); + dout("do_request %p done, result %d\n", req, err); + return err; +} + +/* + * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + struct ceph_inode_info *ci = ceph_inode(inode); + + dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&inode->i_lock); + + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* + * Handle mds reply. + * + * We take the session mutex and parse and process the reply immediately. + * This preserves the logical ordering of replies, capabilities, etc., sent + * by the MDS as they are applied to our local cache. + */ +static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request *req; + struct ceph_mds_reply_head *head = msg->front.iov_base; + struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + u64 tid; + int err, result; + int mds = session->s_mds; + + if (msg->front.iov_len < sizeof(*head)) { + pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + ceph_msg_dump(msg); + return; + } + + /* get request, session */ + tid = le64_to_cpu(msg->hdr.tid); + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("handle_reply on unknown tid %llu\n", tid); + mutex_unlock(&mdsc->mutex); + return; + } + dout("handle_reply %p\n", req); + + /* correct session? 
*/ + if (req->r_session != session) { + pr_err("mdsc_handle_reply got %llu on session mds%d" + " not mds%d\n", tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); + mutex_unlock(&mdsc->mutex); + goto out; + } + + /* dup? */ + if ((req->r_got_unsafe && !head->safe) || + (req->r_got_safe && head->safe)) { + pr_warning("got a dup %s reply on %llu from mds%d\n", + head->safe ? "safe" : "unsafe", tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + if (req->r_got_safe && !head->safe) { + pr_warning("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + + result = le32_to_cpu(head->result); + + /* + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE + */ + if (result == -ESTALE) { + dout("got ESTALE on request %llu", req->r_tid); + if (!req->r_inode) { + /* do nothing; not an authority problem */ + } else if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now"); + req->r_direct_mode = USE_AUTH_MDS; + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } else { + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + struct ceph_cap *cap = NULL; + + if (req->r_session) + cap = ceph_get_cap_for_mds(ci, + req->r_session->s_mds); + + dout("already using auth"); + if ((!cap || cap != ci->i_auth_cap) || + (cap->mseq != req->r_sent_on_mseq)) { + dout("but cap changed, so resending"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } + } + dout("have to return ESTALE on request %llu", req->r_tid); + } + + + if (head->safe) { + req->r_got_safe = true; + __unregister_request(mdsc, req); + complete_all(&req->r_safe_completion); + + if (req->r_got_unsafe) { + /* + * We already handled the unsafe response, now do the + * cleanup. No need to examine the response; the MDS + * doesn't include any result info in the safe + * response. And even if it did, there is nothing + * useful we could do with a revised return value. + */ + dout("got safe reply %llu, mds%d\n", tid, mds); + list_del_init(&req->r_unsafe_item); + + /* last unsafe request during umount? 
*/ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete_all(&mdsc->safe_umount_waiters); + mutex_unlock(&mdsc->mutex); + goto out; + } + } else { + req->r_got_unsafe = true; + list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); + } + + dout("handle_reply tid %lld result %d\n", tid, result); + rinfo = &req->r_reply_info; + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (err < 0) { + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); + ceph_msg_dump(msg); + goto out_err; + } + + /* snap trace */ + if (rinfo->snapblob_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, rinfo->snapblob, + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + + /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + if (err == 0) { + if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + rinfo->dir_nr) + ceph_readdir_prepopulate(req, req->r_session); + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); + } + mutex_unlock(&req->r_fill_mutex); + + up_read(&mdsc->snap_rwsem); +out_err: + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } + } else { + dout("reply arrived after request %lld was aborted\n", tid); + } + mutex_unlock(&mdsc->mutex); + + ceph_add_cap_releases(mdsc, req->r_session); + mutex_unlock(&session->s_mutex); + + /* kick calling process */ + complete_request(mdsc, req); +out: + ceph_mdsc_put_request(req); + return; +} + + + +/* + * handle mds notification that our request has been forwarded. + */ +static void handle_forward(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 next_mds; + u32 fwd_seq; + int err = -EINVAL; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + next_mds = ceph_decode_32(&p); + fwd_seq = ceph_decode_32(&p); + + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); + goto out; /* dup reply? */ + } + + if (req->r_aborted) { + dout("forward tid %llu aborted, unregistering\n", tid); + __unregister_request(mdsc, req); + } else if (fwd_seq <= req->r_num_fwd) { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } else { + /* resend. 
forward race not possible; mds would drop */ + dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + BUG_ON(req->r_err); + BUG_ON(req->r_got_result); + req->r_num_fwd = fwd_seq; + req->r_resend_mds = next_mds; + put_request_session(req); + __do_request(mdsc, req); + } + ceph_mdsc_put_request(req); +out: + mutex_unlock(&mdsc->mutex); + return; + +bad: + pr_err("mdsc_handle_forward decode error err=%d\n", err); +} + +/* + * handle a mds session control message + */ +static void handle_session(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + u32 op; + u64 seq; + int mds = session->s_mds; + struct ceph_mds_session_head *h = msg->front.iov_base; + int wake = 0; + + /* decode */ + if (msg->front.iov_len != sizeof(*h)) + goto bad; + op = le32_to_cpu(h->op); + seq = le64_to_cpu(h->seq); + + mutex_lock(&mdsc->mutex); + if (op == CEPH_SESSION_CLOSE) + __unregister_session(mdsc, session); + /* FIXME: this ttl calculation is generous */ + session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + + dout("handle_session mds%d %s %p state %s seq %llu\n", + mds, ceph_session_op_name(op), session, + session_state_name(session->s_state), seq); + + if (session->s_state == CEPH_MDS_SESSION_HUNG) { + session->s_state = CEPH_MDS_SESSION_OPEN; + pr_info("mds%d came back\n", session->s_mds); + } + + switch (op) { + case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_OPEN; + renewed_caps(mdsc, session, 0); + wake = 1; + if (mdsc->stopping) + __close_session(mdsc, session); + break; + + case CEPH_SESSION_RENEWCAPS: + if (session->s_renew_seq == seq) + renewed_caps(mdsc, session, 1); + break; + + case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); + remove_session_caps(session); + wake = 1; /* for good measure */ + wake_up_all(&mdsc->session_close_wq); + kick_requests(mdsc, mds); + break; + + case CEPH_SESSION_STALE: + pr_info("mds%d caps went stale, renewing\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + session->s_cap_gen++; + session->s_cap_ttl = 0; + spin_unlock(&session->s_cap_lock); + send_renew_caps(mdsc, session); + break; + + case CEPH_SESSION_RECALL_STATE: + trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + break; + + default: + pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + WARN_ON(1); + } + + mutex_unlock(&session->s_mutex); + if (wake) { + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + } + return; + +bad: + pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; +} + + +/* + * called under session->mutex. + */ +static void replay_unsafe_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req, *nreq; + int err; + + dout("replay_unsafe_requests mds%d\n", session->s_mds); + + mutex_lock(&mdsc->mutex); + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + mutex_unlock(&mdsc->mutex); +} + +/* + * Encode information about a cap for a reconnect with the MDS. 
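+ * + * Each cap record consists of the inode number, the path obtained via + * d_find_alias() (if any), and either a v2 ceph_mds_cap_reconnect with + * appended fcntl/flock lock state (when the peer supports + * CEPH_FEATURE_FLOCK) or the older v1 record.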
+ */ +static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + size_t reclen; + struct ceph_inode_info *ci; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; + char *path; + int pathlen, err; + u64 pathbase; + struct dentry *dentry; + + ci = cap->ci; + + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + if (err) + return err; + + dentry = d_find_alias(inode); + if (dentry) { + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_dput; + } + } else { + path = NULL; + pathlen = 0; + } + err = ceph_pagelist_encode_string(pagelist, path, pathlen); + if (err) + goto out_free; + + spin_lock(&inode->i_lock); + cap->seq = 0; /* reset cap seq */ + cap->issue_seq = 0; /* and issue_seq */ + + if (recon_state->flock) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = 0; + reclen = sizeof(rec.v2); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + reclen = sizeof(rec.v1); + } + spin_unlock(&inode->i_lock); + + if (recon_state->flock) { + int num_fcntl_locks, num_flock_locks; + struct ceph_pagelist_cursor trunc_point; + + ceph_pagelist_set_cursor(pagelist, &trunc_point); + do { + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, + &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + unlock_flocks(); + + /* pre-alloc pagelist */ + ceph_pagelist_truncate(pagelist, &trunc_point); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_pagelist_reserve(pagelist, + rec.v2.flock_len); + + /* encode locks */ + if (!err) { + lock_flocks(); + err = ceph_encode_locks(inode, + pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + } + } while (err == -ENOSPC); + } else { + err = ceph_pagelist_append(pagelist, &rec, reclen); + } + +out_free: + kfree(path); +out_dput: + dput(dentry); + return err; +} + + +/* + * If an MDS fails and recovers, clients need to reconnect in order to + * reestablish shared state. This includes all caps issued through + * this session _and_ the snap_realm hierarchy. Because it's not + * clear which snap realms the mds cares about, we send everything we + * know about.. that ensures we'll then get any new info the + * recovering MDS might have. + * + * This is a relatively heavyweight operation, but it's rare. + * + * called with mdsc->mutex held. 
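+ * + * The reconnect message body is a pagelist: a cap count followed by one + * record per cap (see encode_caps_cb), then one + * ceph_mds_snaprealm_reconnect record for each snap realm we know + * about.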
+ */ +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *reply; + struct rb_node *p; + int mds = session->s_mds; + int err = -ENOMEM; + struct ceph_pagelist *pagelist; + struct ceph_reconnect_state recon_state; + + pr_info("mds%d reconnect start\n", mds); + + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + goto fail_nopagelist; + ceph_pagelist_init(pagelist); + + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + if (!reply) + goto fail_nomsg; + + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; + + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); + + down_read(&mdsc->snap_rwsem); + + dout("session %p state %s\n", session, + session_state_name(session->s_state)); + + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + + /* traverse this session's caps */ + err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + if (err) + goto fail; + + recon_state.pagelist = pagelist; + recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + err = iterate_session_caps(session, encode_caps_cb, &recon_state); + if (err < 0) + goto fail; + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + } + + reply->pagelist = pagelist; + if (recon_state.flock) + reply->hdr.version = cpu_to_le16(2); + reply->hdr.data_len = cpu_to_le32(pagelist->length); + reply->nr_pages = calc_pages_for(0, pagelist->length); + ceph_con_send(&session->s_con, reply); + + mutex_unlock(&session->s_mutex); + + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + up_read(&mdsc->snap_rwsem); + return; + +fail: + ceph_msg_put(reply); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); +fail_nomsg: + ceph_pagelist_release(pagelist); + kfree(pagelist); +fail_nopagelist: + pr_err("error %d preparing reconnect for mds%d\n", err, mds); + return; +} + + +/* + * compare old and new mdsmaps, kicking requests + * and closing out old connections as necessary + * + * called under mdsc->mutex. + */ +static void check_new_map(struct ceph_mds_client *mdsc, + struct ceph_mdsmap *newmap, + struct ceph_mdsmap *oldmap) +{ + int i; + int oldstate, newstate; + struct ceph_mds_session *s; + + dout("check_new_map new %u old %u\n", + newmap->m_epoch, oldmap->m_epoch); + + for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i] == NULL) + continue; + s = mdsc->sessions[i]; + oldstate = ceph_mdsmap_get_state(oldmap, i); + newstate = ceph_mdsmap_get_state(newmap, i); + + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? 
" (laggy)" : "", + ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", + session_state_name(s->s_state)); + + if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + if (s->s_state == CEPH_MDS_SESSION_OPENING) { + /* the session never opened, just close it + * out now */ + __wake_requests(mdsc, &s->s_waiting); + __unregister_session(mdsc, s); + } else { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; + } + + /* kick any requests waiting on the recovering mds */ + kick_requests(mdsc, i); + } else if (oldstate == newstate) { + continue; /* nothing new with this mds */ + } + + /* + * send reconnect? + */ + if (s->s_state == CEPH_MDS_SESSION_RESTARTING && + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } + + /* + * kick request on any mds that has gone active. + */ + if (oldstate < CEPH_MDS_STATE_ACTIVE && + newstate >= CEPH_MDS_STATE_ACTIVE) { + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); + ceph_kick_flushing_caps(mdsc, s); + wake_up_session_caps(s, 1); + } + } + + for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } +} + + + +/* + * leases + */ + +/* + * caller must hold session s_mutex, dentry->d_lock + */ +void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + ceph_put_mds_session(di->lease_session); + di->lease_session = NULL; +} + +static void handle_lease(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct dentry *parent, *dentry; + struct ceph_dentry_info *di; + int mds = session->s_mds; + struct ceph_mds_lease *h = msg->front.iov_base; + u32 seq; + struct ceph_vino vino; + int mask; + struct qstr dname; + int release = 0; + + dout("handle_lease from mds%d\n", mds); + + /* decode */ + if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) + goto bad; + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + mask = le16_to_cpu(h->mask); + seq = le32_to_cpu(h->seq); + dname.name = (void *)h + sizeof(*h) + sizeof(u32); + dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); + if (dname.len != get_unaligned_le32(h+1)) + goto bad; + + mutex_lock(&session->s_mutex); + session->s_seq++; + + /* lookup inode */ + inode = ceph_find_inode(sb, vino); + dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), mask, vino.ino, inode, + dname.len, dname.name); + if (inode == NULL) { + dout("handle_lease no inode %llx\n", vino.ino); + goto release; + } + + /* dentry */ + parent = d_find_alias(inode); + if (!parent) { + dout("no parent dentry on inode %p\n", inode); + WARN_ON(1); + goto release; /* hrm... 
*/ + } + dname.hash = full_name_hash(dname.name, dname.len); + dentry = d_lookup(parent, &dname); + dput(parent); + if (!dentry) + goto release; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + switch (h->action) { + case CEPH_MDS_LEASE_REVOKE: + if (di && di->lease_session == session) { + if (ceph_seq_cmp(di->lease_seq, seq) > 0) + h->seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + release = 1; + break; + + case CEPH_MDS_LEASE_RENEW: + if (di && di->lease_session == session && + di->lease_gen == session->s_cap_gen && + di->lease_renew_from && + di->lease_renew_after == 0) { + unsigned long duration = + le32_to_cpu(h->duration_ms) * HZ / 1000; + + di->lease_seq = seq; + dentry->d_time = di->lease_renew_from + duration; + di->lease_renew_after = di->lease_renew_from + + (duration >> 1); + di->lease_renew_from = 0; + } + break; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + + if (!release) + goto out; + +release: + /* let's just reuse the same message */ + h->action = CEPH_MDS_LEASE_REVOKE_ACK; + ceph_msg_get(msg); + ceph_con_send(&session->s_con, msg); + +out: + iput(inode); + mutex_unlock(&session->s_mutex); + return; + +bad: + pr_err("corrupt lease message\n"); + ceph_msg_dump(msg); +} + +void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_lease *lease; + int len = sizeof(*lease) + sizeof(u32); + int dnamelen = 0; + + dout("lease_send_msg inode %p dentry %p %s to mds%d\n", + inode, dentry, ceph_lease_op_name(action), session->s_mds); + dnamelen = dentry->d_name.len; + len += dnamelen; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + if (!msg) + return; + lease = msg->front.iov_base; + lease->action = action; + lease->mask = cpu_to_le16(1); + lease->ino = cpu_to_le64(ceph_vino(inode).ino); + lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); + lease->seq = cpu_to_le32(seq); + put_unaligned_le32(dnamelen, lease + 1); + memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); + + /* + * if this is a preemptive lease RELEASE, no need to + * flush request stream, since the actual request will + * soon follow. + */ + msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); + + ceph_con_send(&session->s_con, msg); +} + +/* + * Preemptively release a lease we expect to invalidate anyway. + * Pass @inode always, @dentry is optional. + */ +void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, + struct dentry *dentry, int mask) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *session; + u32 seq; + + BUG_ON(inode == NULL); + BUG_ON(dentry == NULL); + BUG_ON(mask == 0); + + /* is dentry lease valid? 
*/ + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (!di || !di->lease_session || + di->lease_session->s_mds < 0 || + di->lease_gen != di->lease_session->s_cap_gen || + !time_before(jiffies, dentry->d_time)) { + dout("lease_release inode %p dentry %p -- " + "no lease on %d\n", + inode, dentry, mask); + spin_unlock(&dentry->d_lock); + return; + } + + /* we do have a lease on this dentry; note mds and seq */ + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + + dout("lease_release inode %p dentry %p mask %d to mds%d\n", + inode, dentry, mask, session->s_mds); + ceph_mdsc_lease_send_msg(session, inode, dentry, + CEPH_MDS_LEASE_RELEASE, seq); + ceph_put_mds_session(session); +} + +/* + * drop all leases (and dentry refs) in preparation for umount + */ +static void drop_leases(struct ceph_mds_client *mdsc) +{ + int i; + + dout("drop_leases\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + + + +/* + * delayed work -- periodically trim expired leases, renew caps with mds + */ +static void schedule_delayed(struct ceph_mds_client *mdsc) +{ + int delay = 5; + unsigned hz = round_jiffies_relative(HZ * delay); + schedule_delayed_work(&mdsc->delayed_work, hz); +} + +static void delayed_work(struct work_struct *work) +{ + int i; + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, delayed_work.work); + int renew_interval; + int renew_caps; + + dout("mdsc delayed_work\n"); + ceph_check_delayed_caps(mdsc); + + mutex_lock(&mdsc->mutex); + renew_interval = mdsc->mdsmap->m_session_timeout >> 2; + renew_caps = time_after_eq(jiffies, HZ*renew_interval + + mdsc->last_renew_caps); + if (renew_caps) + mdsc->last_renew_caps = jiffies; + + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (s == NULL) + continue; + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout("resending session close request for mds%d\n", + s->s_mds); + request_close_session(mdsc, s); + ceph_put_mds_session(s); + continue; + } + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + if (s->s_state == CEPH_MDS_SESSION_OPEN) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + } + if (s->s_state < CEPH_MDS_SESSION_OPEN) { + /* this mds is failed or recovering, just wait */ + ceph_put_mds_session(s); + continue; + } + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + if (renew_caps) + send_renew_caps(mdsc, s); + else + ceph_con_keepalive(&s->s_con); + ceph_add_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(mdsc, s); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + schedule_delayed(mdsc); +} + +int ceph_mdsc_init(struct ceph_fs_client *fsc) + +{ + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; + mutex_init(&mdsc->mutex); + mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) + return -ENOMEM; + + 
init_completion(&mdsc->safe_umount_waiters); + init_waitqueue_head(&mdsc->session_close_wq); + INIT_LIST_HEAD(&mdsc->waiting_for_map); + mdsc->sessions = NULL; + mdsc->max_sessions = 0; + mdsc->stopping = 0; + init_rwsem(&mdsc->snap_rwsem); + mdsc->snap_realms = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snap_empty); + spin_lock_init(&mdsc->snap_empty_lock); + mdsc->last_tid = 0; + mdsc->request_tree = RB_ROOT; + INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); + mdsc->last_renew_caps = jiffies; + INIT_LIST_HEAD(&mdsc->cap_delay_list); + spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->snap_flush_list); + spin_lock_init(&mdsc->snap_flush_lock); + mdsc->cap_flush_seq = 0; + INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); + mdsc->num_cap_flushing = 0; + spin_lock_init(&mdsc->cap_dirty_lock); + init_waitqueue_head(&mdsc->cap_flushing_wq); + spin_lock_init(&mdsc->dentry_lru_lock); + INIT_LIST_HEAD(&mdsc->dentry_lru); + + ceph_caps_init(mdsc); + ceph_adjust_min_caps(mdsc, fsc->min_caps); + + return 0; +} + +/* + * Wait for safe replies on open mds requests. If we time out, drop + * all requests from the tree to avoid dangling dentry refs. + */ +static void wait_requests(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req; + struct ceph_fs_client *fsc = mdsc->fsc; + + mutex_lock(&mdsc->mutex); + if (__get_oldest_req(mdsc)) { + mutex_unlock(&mdsc->mutex); + + dout("wait_requests waiting for requests\n"); + wait_for_completion_timeout(&mdsc->safe_umount_waiters, + fsc->client->options->mount_timeout * HZ); + + /* tear down remaining requests */ + mutex_lock(&mdsc->mutex); + while ((req = __get_oldest_req(mdsc))) { + dout("wait_requests timed out on tid %llu\n", + req->r_tid); + __unregister_request(mdsc, req); + } + } + mutex_unlock(&mdsc->mutex); + dout("wait_requests done\n"); +} + +/* + * called before mount is ro, and before dentries are torn down. + * (hmm, does this still race with new lookups?) + */ +void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) +{ + dout("pre_umount\n"); + mdsc->stopping = 1; + + drop_leases(mdsc); + ceph_flush_dirty_caps(mdsc); + wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); +} + +/* + * wait for all write mds requests to flush. + */ +static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_mds_request *req = NULL, *nextreq; + struct rb_node *n; + + mutex_lock(&mdsc->mutex); + dout("wait_unsafe_requests want %lld\n", want_tid); +restart: + req = __get_oldest_req(mdsc); + while (req && req->r_tid <= want_tid) { + /* find next request */ + n = rb_next(&req->r_node); + if (n) + nextreq = rb_entry(n, struct ceph_mds_request, r_node); + else + nextreq = NULL; + if ((req->r_op & CEPH_MDS_OP_WRITE)) { + /* write op */ + ceph_mdsc_get_request(req); + if (nextreq) + ceph_mdsc_get_request(nextreq); + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests wait on %llu (want %llu)\n", + req->r_tid, want_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); + ceph_mdsc_put_request(req); + if (!nextreq) + break; /* next dne before, so we're done! 
*/ + if (RB_EMPTY_NODE(&nextreq->r_node)) { + /* next request was removed from tree */ + ceph_mdsc_put_request(nextreq); + goto restart; + } + ceph_mdsc_put_request(nextreq); /* won't go away */ + } + req = nextreq; + } + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests done\n"); +} + +void ceph_mdsc_sync(struct ceph_mds_client *mdsc) +{ + u64 want_tid, want_flush; + + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + + dout("sync\n"); + mutex_lock(&mdsc->mutex); + want_tid = mdsc->last_tid; + want_flush = mdsc->cap_flush_seq; + mutex_unlock(&mdsc->mutex); + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + + ceph_flush_dirty_caps(mdsc); + + wait_unsafe_requests(mdsc, want_tid); + wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); +} + +/* + * true if all sessions are closed, or we force unmount + */ +bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + int i, n = 0; + + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) + if (mdsc->sessions[i]) + n++; + mutex_unlock(&mdsc->mutex); + return n == 0; +} + +/* + * called after sb is ro. + */ +void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *session; + int i; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; + + dout("close_sessions\n"); + + /* close sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); + + /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i]) { + session = get_session(mdsc->sessions[i]); + __unregister_session(mdsc, session); + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + remove_session_caps(session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + } + WARN_ON(!list_empty(&mdsc->cap_delay_list)); + mutex_unlock(&mdsc->mutex); + + ceph_cleanup_empty_realms(mdsc); + + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + + dout("stopped\n"); +} + +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +{ + dout("stop\n"); + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + if (mdsc->mdsmap) + ceph_mdsmap_destroy(mdsc->mdsmap); + kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); +} + +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + dout("mdsc_destroy %p\n", mdsc); + ceph_mdsc_stop(mdsc); + + /* flush out any connection work with references to us */ + ceph_msgr_flush(); + + fsc->mdsc = NULL; + kfree(mdsc); + dout("mdsc_destroy %p done\n", mdsc); +} + + +/* + * handle mds map update. 
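The map handler that follows only installs a map whose epoch is strictly newer than the one currently held, so stale or duplicate MDS maps are simply dropped. A minimal userspace sketch of that epoch gating (illustrative only; accept_map() and current_epoch are invented names, and "no map yet" is modeled as epoch 0):

#include <stdio.h>
#include <stdint.h>

static uint32_t current_epoch;          /* 0 means we hold no map yet */

static int accept_map(uint32_t epoch)
{
        if (current_epoch && epoch <= current_epoch)
                return 0;               /* older or duplicate epoch: ignore it */
        current_epoch = epoch;          /* swap the newer map into place */
        return 1;
}

int main(void)
{
        int a = accept_map(4);          /* first map: accepted */
        int b = accept_map(4);          /* same epoch again: ignored */
        int c = accept_map(5);          /* newer epoch: accepted */

        printf("%d %d %d\n", a, b, c);  /* prints: 1 0 1 */
        return 0;
}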
+ */ +void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + u32 epoch; + u32 maplen; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mdsmap *newmap, *oldmap; + struct ceph_fsid fsid; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) + return; + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + + /* do we need it? */ + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); + mutex_lock(&mdsc->mutex); + if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { + dout("handle_map epoch %u <= our %u\n", + epoch, mdsc->mdsmap->m_epoch); + mutex_unlock(&mdsc->mutex); + return; + } + + newmap = ceph_mdsmap_decode(&p, end); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad_unlock; + } + + /* swap into place */ + if (mdsc->mdsmap) { + oldmap = mdsc->mdsmap; + mdsc->mdsmap = newmap; + check_new_map(mdsc, newmap, oldmap); + ceph_mdsmap_destroy(oldmap); + } else { + mdsc->mdsmap = newmap; /* first mds map */ + } + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + + __wake_requests(mdsc, &mdsc->waiting_for_map); + + mutex_unlock(&mdsc->mutex); + schedule_delayed(mdsc); + return; + +bad_unlock: + mutex_unlock(&mdsc->mutex); +bad: + pr_err("error decoding mdsmap %d\n", err); + return; +} + +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + if (get_session(s)) { + dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + return con; + } + dout("mdsc con_get %p FAIL\n", s); + return NULL; +} + +static void con_put(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); + ceph_put_mds_session(s); +} + +/* + * if the client is unresponsive for long enough, the mds will kill + * the session entirely. 
+ */ +static void peer_reset(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + + pr_warning("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); +} + +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + int type = le16_to_cpu(msg->hdr.type); + + mutex_lock(&mdsc->mutex); + if (__verify_registered_session(mdsc, s) < 0) { + mutex_unlock(&mdsc->mutex); + goto out; + } + mutex_unlock(&mdsc->mutex); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(mdsc, msg); + break; + case CEPH_MSG_CLIENT_SESSION: + handle_session(s, msg); + break; + case CEPH_MSG_CLIENT_REPLY: + handle_reply(s, msg); + break; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + handle_forward(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_CAPS: + ceph_handle_caps(s, msg); + break; + case CEPH_MSG_CLIENT_SNAP: + ceph_handle_snap(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_lease(mdsc, s, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + int ret = 0; + + if (force_new && s->s_authorizer) { + ac->ops->destroy_authorizer(ac, s->s_authorizer); + s->s_authorizer = NULL; + } + if (s->s_authorizer == NULL) { + if (ac->ops->create_authorizer) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_MDS, + &s->s_authorizer, + &s->s_authorizer_buf, + &s->s_authorizer_buf_len, + &s->s_authorizer_reply_buf, + &s->s_authorizer_reply_buf_len); + if (ret) + return ret; + } + } + + *proto = ac->protocol; + *buf = s->s_authorizer_buf; + *len = s->s_authorizer_buf_len; + *reply_buf = s->s_authorizer_reply_buf; + *reply_len = s->s_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); +} + +static const struct ceph_connection_operations mds_con_ops = { + .get = con_get, + .put = con_put, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .peer_reset = peer_reset, +}; + +/* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h new file mode 100644 index 00000000..7d8a0d66 --- /dev/null +++ b/fs/ceph/mds_client.h @@ -0,0 +1,379 @@ +#ifndef _FS_CEPH_MDS_CLIENT_H +#define _FS_CEPH_MDS_CLIENT_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Some lock 
dependencies: + * + * session->s_mutex + * mdsc->mutex + * + * mdsc->snap_rwsem + * + * inode->i_lock + * mdsc->snap_flush_lock + * mdsc->cap_delay_lock + * + */ + +struct ceph_fs_client; +struct ceph_cap; + +/* + * parsed info about a single inode. pointers are into the encoded + * on-wire structures within the mds reply message payload. + */ +struct ceph_mds_reply_info_in { + struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; + u32 symlink_len; + char *symlink; + u32 xattr_len; + char *xattr_data; +}; + +/* + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). + */ +struct ceph_mds_reply_info_parsed { + struct ceph_mds_reply_head *head; + + /* trace */ + struct ceph_mds_reply_info_in diri, targeti; + struct ceph_mds_reply_dirfrag *dirfrag; + char *dname; + u32 dname_len; + struct ceph_mds_reply_lease *dlease; + + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + }; + + /* encoded blob describing snapshot contexts for certain + operations (e.g., open) */ + void *snapblob; + int snapblob_len; +}; + + +/* + * cap releases are batched and sent to the MDS en masse. + */ +#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ + sizeof(struct ceph_mds_cap_release)) / \ + sizeof(struct ceph_mds_cap_item)) + + +/* + * state associated with each MDS<->client session + */ +enum { + CEPH_MDS_SESSION_NEW = 1, + CEPH_MDS_SESSION_OPENING = 2, + CEPH_MDS_SESSION_OPEN = 3, + CEPH_MDS_SESSION_HUNG = 4, + CEPH_MDS_SESSION_CLOSING = 5, + CEPH_MDS_SESSION_RESTARTING = 6, + CEPH_MDS_SESSION_RECONNECTING = 7, +}; + +struct ceph_mds_session { + struct ceph_mds_client *s_mdsc; + int s_mds; + int s_state; + unsigned long s_ttl; /* time until mds kills us */ + u64 s_seq; /* incoming msg seq # */ + struct mutex s_mutex; /* serialize session messages */ + + struct ceph_connection s_con; + + struct ceph_authorizer *s_authorizer; + void *s_authorizer_buf, *s_authorizer_reply_buf; + size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; + u32 s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire */ + struct list_head s_caps; /* all caps issued by this session */ + int s_nr_caps, s_trim_caps; + int s_num_cap_releases; + struct list_head s_cap_releases; /* waiting cap_release messages */ + struct list_head s_cap_releases_done; /* ready to send */ + struct ceph_cap *s_cap_iterator; + + /* protected by mutex */ + struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + struct list_head s_cap_snaps_flushing; + unsigned long s_renew_requested; /* last time we sent a renew req */ + u64 s_renew_seq; + + atomic_t s_ref; + struct list_head s_waiting; /* waiting requests */ + struct list_head s_unsafe; /* unsafe requests */ +}; + +/* + * modes of choosing which MDS to send a request to + */ +enum { + USE_ANY_MDS, + USE_RANDOM_MDS, + USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ +}; + +struct ceph_mds_request; +struct ceph_mds_client; + +/* + * request completion callback + */ 
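The lock ordering listed at the top of this header is why the iteration helpers in mds_client.c (drop_leases(), delayed_work(), the session shutdown paths) drop mdsc->mutex before taking a session's s_mutex and re-take it afterwards. A small pthread sketch of that drop-and-retake pattern (illustrative only; the mutexes and visit_session() are stand-ins, not the kernel locks):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mdsc_mutex = PTHREAD_MUTEX_INITIALIZER;  /* "mdsc->mutex" */
static pthread_mutex_t s_mutex    = PTHREAD_MUTEX_INITIALIZER;  /* "session->s_mutex" */

/* s_mutex ranks above mdsc->mutex, so code walking the session table under
 * mdsc->mutex must drop it before taking a session's s_mutex, then retake it. */
static void visit_session(int mds)
{
        pthread_mutex_lock(&mdsc_mutex);
        /* ... look up session[mds] and take a reference ... */
        pthread_mutex_unlock(&mdsc_mutex);

        pthread_mutex_lock(&s_mutex);
        printf("working on session mds%d\n", mds);
        pthread_mutex_unlock(&s_mutex);

        pthread_mutex_lock(&mdsc_mutex);
        /* ... continue iterating the session table ... */
        pthread_mutex_unlock(&mdsc_mutex);
}

int main(void)
{
        visit_session(0);
        return 0;
}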
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +/* + * an in-flight mds request + */ +struct ceph_mds_request { + u64 r_tid; /* transaction id */ + struct rb_node r_node; + struct ceph_mds_client *r_mdsc; + + int r_op; /* mds op code */ + + /* operation on what? */ + struct inode *r_inode; /* arg1 */ + struct dentry *r_dentry; /* arg1 */ + struct dentry *r_old_dentry; /* arg2: rename from or link from */ + char *r_path1, *r_path2; + struct ceph_vino r_ino1, r_ino2; + + struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_target_inode; /* resulting inode */ + + struct mutex r_fill_mutex; + + union ceph_mds_request_args r_args; + int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; + + /* for choosing which mds to send this request to */ + int r_direct_mode; + u32 r_direct_hash; /* choose dir frag based on this dentry hash */ + bool r_direct_is_hash; /* true if r_direct_hash is valid */ + + /* data payload is used for xattr ops */ + struct page **r_pages; + int r_num_pages; + int r_data_len; + + /* what caps shall we drop? */ + int r_inode_drop, r_inode_unless; + int r_dentry_drop, r_dentry_unless; + int r_old_dentry_drop, r_old_dentry_unless; + struct inode *r_old_inode; + int r_old_inode_drop, r_old_inode_unless; + + struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; + struct ceph_msg *r_reply; + struct ceph_mds_reply_info_parsed r_reply_info; + int r_err; + bool r_aborted; + + unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_request_started; /* start time for mds request only, + used to measure lease durations */ + + /* link unsafe requests to parent directory, for fsync */ + struct inode *r_unsafe_dir; + struct list_head r_unsafe_dir_item; + + struct ceph_mds_session *r_session; + + int r_attempts; /* resend attempts */ + int r_num_fwd; /* number of forward attempts */ + int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + + struct kref r_kref; + struct list_head r_wait; + struct completion r_completion; + struct completion r_safe_completion; + ceph_mds_request_callback_t r_callback; + struct list_head r_unsafe_item; /* per-session unsafe list item */ + bool r_got_unsafe, r_got_safe, r_got_result; + + bool r_did_prepopulate; + u32 r_readdir_offset; + + struct ceph_cap_reservation r_caps_reservation; + int r_num_caps; +}; + +/* + * mds client state + */ +struct ceph_mds_client { + struct ceph_fs_client *fsc; + struct mutex mutex; /* all nested structures */ + + struct ceph_mdsmap *mdsmap; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; + struct list_head waiting_for_map; + + struct ceph_mds_session **sessions; /* NULL for mds if no session */ + int max_sessions; /* len of s_mds_sessions */ + int stopping; /* true if shutting down */ + + /* + * snap_rwsem will cover cap linkage into snaprealms, and + * realm snap contexts. (later, we can do per-realm snap + * contexts locks..) the empty list contains realms with no + * references (implying they contain no inodes with caps) that + * should be destroyed. 
+ */ + struct rw_semaphore snap_rwsem; + struct rb_root snap_realms; + struct list_head snap_empty; + spinlock_t snap_empty_lock; /* protect snap_empty */ + + u64 last_tid; /* most recent mds request */ + struct rb_root request_tree; /* pending mds requests */ + struct delayed_work delayed_work; /* delayed work */ + unsigned long last_renew_caps; /* last time we renewed our caps */ + struct list_head cap_delay_list; /* caps with delayed release */ + spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head snap_flush_list; /* cap_snaps ready to flush */ + spinlock_t snap_flush_lock; + + u64 cap_flush_seq; + struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ + int num_cap_flushing; /* # caps we are flushing */ + spinlock_t cap_dirty_lock; /* protects above items */ + wait_queue_head_t cap_flushing_wq; + + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ + spinlock_t dentry_lru_lock; + struct list_head dentry_lru; + int num_dentry; +}; + +extern const char *ceph_mds_op_name(int op); + +extern struct ceph_mds_session * +__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); + +static inline struct ceph_mds_session * +ceph_get_mds_session(struct ceph_mds_session *s) +{ + atomic_inc(&s->s_ref); + return s; +} + +extern void ceph_put_mds_session(struct ceph_mds_session *s); + +extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, + struct ceph_msg *msg, int mds); + +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); +extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); + +extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); + +extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, + struct inode *inode, + struct dentry *dn, int mask); + +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); + +extern struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); +extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); +extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_mdsc_release_request(struct kref *kref); +static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) +{ + kref_put(&req->r_kref, ceph_mdsc_release_request); +} + +extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +extern void ceph_mdsc_pre_umount(struct 
ceph_mds_client *mdsc); + +extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap); + +extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); +extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq); + +extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); + +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +#endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c new file mode 100644 index 00000000..73b7d44e --- /dev/null +++ b/fs/ceph/mdsmap.c @@ -0,0 +1,179 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "super.h" + + +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int n = 0; + int i; + char r; + + /* count */ + for (i = 0; i < m->m_max_mds; i++) + if (m->m_info[i].state > 0) + n++; + if (n == 0) + return -1; + + /* pick */ + get_random_bytes(&r, 1); + n = r % n; + i = 0; + for (i = 0; n > 0; i++, n--) + while (m->m_info[i].state <= 0) + i++; + + return i; +} + +/* + * Decode an MDS map + * + * Ignore any fields we don't care about (there are quite a few of + * them). + */ +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +{ + struct ceph_mdsmap *m; + const void *start = *p; + int i, j, n; + int err = -EINVAL; + u16 version; + + m = kzalloc(sizeof(*m), GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_16_safe(p, end, version, bad); + + ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); + m->m_epoch = ceph_decode_32(p); + m->m_client_epoch = ceph_decode_32(p); + m->m_last_failure = ceph_decode_32(p); + m->m_root = ceph_decode_32(p); + m->m_session_timeout = ceph_decode_32(p); + m->m_session_autoclose = ceph_decode_32(p); + m->m_max_file_size = ceph_decode_64(p); + m->m_max_mds = ceph_decode_32(p); + + m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + if (m->m_info == NULL) + goto badmem; + + /* pick out active nodes from mds_info (state > 0) */ + n = ceph_decode_32(p); + for (i = 0; i < n; i++) { + u64 global_id; + u32 namelen; + s32 mds, inc, state; + u64 state_seq; + u8 infoversion; + struct ceph_entity_addr addr; + u32 num_export_targets; + void *pexport_targets = NULL; + struct ceph_timespec laggy_since; + + ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + global_id = ceph_decode_64(p); + infoversion = ceph_decode_8(p); + *p += sizeof(u64); + namelen = ceph_decode_32(p); /* skip mds name */ + *p += namelen; + + ceph_decode_need(p, end, + 4*sizeof(u32) + sizeof(u64) + + sizeof(addr) + sizeof(struct ceph_timespec), + bad); + mds = ceph_decode_32(p); + inc = ceph_decode_32(p); + state = ceph_decode_32(p); + state_seq = ceph_decode_64(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + *p += sizeof(u32); + ceph_decode_32_safe(p, end, namelen, bad); + *p += namelen; + if (infoversion >= 2) { + ceph_decode_32_safe(p, end, num_export_targets, bad); + pexport_targets = *p; + *p += num_export_targets * sizeof(u32); + } else { + num_export_targets = 0; + } + + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), + ceph_mds_state_name(state)); + if (mds >= 0 && mds < m->m_max_mds && state > 0) 
{ + m->m_info[mds].global_id = global_id; + m->m_info[mds].state = state; + m->m_info[mds].addr = addr; + m->m_info[mds].laggy = + (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + m->m_info[mds].num_export_targets = num_export_targets; + if (num_export_targets) { + m->m_info[mds].export_targets = + kcalloc(num_export_targets, sizeof(u32), + GFP_NOFS); + for (j = 0; j < num_export_targets; j++) + m->m_info[mds].export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + m->m_info[mds].export_targets = NULL; + } + } + } + + /* pg_pools */ + ceph_decode_32_safe(p, end, n, bad); + m->m_num_data_pg_pools = n; + m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + if (!m->m_data_pg_pools) + goto badmem; + ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + for (i = 0; i < n; i++) + m->m_data_pg_pools[i] = ceph_decode_32(p); + m->m_cas_pg_pool = ceph_decode_32(p); + + /* ok, we don't care about the rest. */ + dout("mdsmap_decode success epoch %u\n", m->m_epoch); + return m; + +badmem: + err = -ENOMEM; +bad: + pr_err("corrupt mdsmap\n"); + print_hex_dump(KERN_DEBUG, "mdsmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + ceph_mdsmap_destroy(m); + return ERR_PTR(-EINVAL); +} + +void ceph_mdsmap_destroy(struct ceph_mdsmap *m) +{ + int i; + + for (i = 0; i < m->m_max_mds; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + kfree(m->m_data_pg_pools); + kfree(m); +} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c new file mode 100644 index 00000000..54b14de2 --- /dev/null +++ b/fs/ceph/snap.c @@ -0,0 +1,916 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/sort.h> +#include <linux/slab.h> + +#include "super.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> + +/* + * Snapshots in ceph are driven in large part by cooperation from the + * client. In contrast to local file systems or file servers that + * implement snapshots at a single point in the system, ceph's + * distributed access to storage requires clients to help decide + * whether a write logically occurs before or after a recently created + * snapshot. + * + * This provides a perfect instantaneous client-wide snapshot. Between + * clients, however, snapshots may appear to be applied at slightly + * different points in time, depending on delays in delivering the + * snapshot notification. + * + * Snapshots are _not_ file system-wide. Instead, each snapshot + * applies to the subdirectory nested beneath some directory. This + * effectively divides the hierarchy into multiple "realms," where all + * of the files contained by each realm share the same set of + * snapshots. An individual realm's snap set contains snapshots + * explicitly created on that realm, as well as any snaps in its + * parent's snap set _after_ the point at which the parent became its + * parent (due to, say, a rename). Similarly, snaps from prior parents, + * taken during the intervals when they were the parent, are included. + * + * The client is spared most of this detail, fortunately... it need only + * maintain a hierarchy of realms reflecting the current parent/child + * realm relationship and, for each realm, an explicit list of snaps + * inherited from prior parents. + * + * A snap_realm struct is maintained for every realm containing an inode + * with an open cap in the system. (The needed snap realm information is + * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' + * version number is used to ensure that as realm parameters change (new + * snapshot, new parent, etc.) the client's realm hierarchy is updated.
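The rest of this comment describes how the realm hierarchy drives snap-context generation; build_snap_context(), further down in this file, effectively merges the parent's snaps taken since parent_since with the realm's own and prior-parent snaps and sorts them newest-first. A self-contained sketch of that merge (illustrative only; the snap IDs and array sizes are made up, and cmp_u64_rev mirrors the file's cmpu64_rev comparator):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* descending order, like cmpu64_rev() later in this file */
static int cmp_u64_rev(const void *a, const void *b)
{
        uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
        return (x < y) - (x > y);
}

int main(void)
{
        uint64_t parent_snaps[3] = { 3, 7, 12 };  /* parent realm's snap context */
        uint64_t parent_since   = 5;              /* when this realm joined that parent */
        uint64_t own_snaps[2]   = { 9, 15 };      /* snaps created on this realm */
        uint64_t snaps[8];
        int num = 0, i;

        for (i = 0; i < 3; i++)                   /* only parent snaps taken since then */
                if (parent_snaps[i] >= parent_since)
                        snaps[num++] = parent_snaps[i];
        memcpy(snaps + num, own_snaps, sizeof(own_snaps));
        num += 2;

        qsort(snaps, num, sizeof(snaps[0]), cmp_u64_rev);
        for (i = 0; i < num; i++)
                printf("%llu ", (unsigned long long)snaps[i]);
        printf("\n");                             /* prints: 15 12 9 7 */
        return 0;
}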
+ * + * The realm hierarchy drives the generation of a 'snap context' for each + * realm, which simply lists the resulting set of snaps for the realm. This + * is attached to any writes sent to OSDs. + */ +/* + * Unfortunately error handling is a bit mixed here. If we get a snap + * update, but don't have enough memory to update our realm hierarchy, + * it's not clear what we can do about it (besides complaining to the + * console). + */ + + +/* + * increase ref count for the realm + * + * caller must hold snap_rwsem for write. + */ +void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("get_realm %p %d -> %d\n", realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)+1); + /* + * since we _only_ increment realm refs or empty the empty + * list with snap_rwsem held, adjusting the empty list here is + * safe. we do need to protect against concurrent empty list + * additions, however. + */ + if (atomic_read(&realm->nref) == 0) { + spin_lock(&mdsc->snap_empty_lock); + list_del_init(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + } + + atomic_inc(&realm->nref); +} + +static void __insert_snap_realm(struct rb_root *root, + struct ceph_snap_realm *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_snap_realm *r = NULL; + + while (*p) { + parent = *p; + r = rb_entry(parent, struct ceph_snap_realm, node); + if (new->ino < r->ino) + p = &(*p)->rb_left; + else if (new->ino > r->ino) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); +} + +/* + * create and get the realm rooted at @ino and bump its ref count. + * + * caller must hold snap_rwsem for write. + */ +static struct ceph_snap_realm *ceph_create_snap_realm( + struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *realm; + + realm = kzalloc(sizeof(*realm), GFP_NOFS); + if (!realm) + return ERR_PTR(-ENOMEM); + + atomic_set(&realm->nref, 0); /* tree does not take a ref */ + realm->ino = ino; + INIT_LIST_HEAD(&realm->children); + INIT_LIST_HEAD(&realm->child_item); + INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->inodes_with_caps); + spin_lock_init(&realm->inodes_with_caps_lock); + __insert_snap_realm(&mdsc->snap_realms, realm); + dout("create_snap_realm %llx %p\n", realm->ino, realm); + return realm; +} + +/* + * lookup the realm rooted at @ino. + * + * caller must hold snap_rwsem for write. 
+ */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct rb_node *n = mdsc->snap_realms.rb_node; + struct ceph_snap_realm *r; + + while (n) { + r = rb_entry(n, struct ceph_snap_realm, node); + if (ino < r->ino) + n = n->rb_left; + else if (ino > r->ino) + n = n->rb_right; + else { + dout("lookup_snap_realm %llx %p\n", r->ino, r); + return r; + } + } + return NULL; +} + +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); + +/* + * called with snap_rwsem (write) + */ +static void __destroy_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + + rb_erase(&realm->node, &mdsc->snap_realms); + + if (realm->parent) { + list_del_init(&realm->child_item); + __put_snap_realm(mdsc, realm->parent); + } + + kfree(realm->prior_parent_snaps); + kfree(realm->snaps); + ceph_put_snap_context(realm->cached_context); + kfree(realm); +} + +/* + * caller holds snap_rwsem (write) + */ +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (atomic_dec_and_test(&realm->nref)) + __destroy_snap_realm(mdsc, realm); +} + +/* + * caller needn't hold any locks + */ +void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (!atomic_dec_and_test(&realm->nref)) + return; + + if (down_write_trylock(&mdsc->snap_rwsem)) { + __destroy_snap_realm(mdsc, realm); + up_write(&mdsc->snap_rwsem); + } else { + spin_lock(&mdsc->snap_empty_lock); + list_add(&realm->empty_item, &mdsc->snap_empty); + spin_unlock(&mdsc->snap_empty_lock); + } +} + +/* + * Clean up any realms whose ref counts have dropped to zero. Note + * that this does not include realms who were created but not yet + * used. + * + * Called under snap_rwsem (write) + */ +static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + struct ceph_snap_realm *realm; + + spin_lock(&mdsc->snap_empty_lock); + while (!list_empty(&mdsc->snap_empty)) { + realm = list_first_entry(&mdsc->snap_empty, + struct ceph_snap_realm, empty_item); + list_del(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + __destroy_snap_realm(mdsc, realm); + spin_lock(&mdsc->snap_empty_lock); + } + spin_unlock(&mdsc->snap_empty_lock); +} + +void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + down_write(&mdsc->snap_rwsem); + __cleanup_empty_realms(mdsc); + up_write(&mdsc->snap_rwsem); +} + +/* + * adjust the parent realm of a given @realm. adjust child list, and parent + * pointers, and ref counts appropriately. + * + * return true if parent was changed, 0 if unchanged, <0 on error. + * + * caller must hold snap_rwsem for write. 
+ */ +static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + u64 parentino) +{ + struct ceph_snap_realm *parent; + + if (realm->parent_ino == parentino) + return 0; + + parent = ceph_lookup_snap_realm(mdsc, parentino); + if (!parent) { + parent = ceph_create_snap_realm(mdsc, parentino); + if (IS_ERR(parent)) + return PTR_ERR(parent); + } + dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", + realm->ino, realm, realm->parent_ino, realm->parent, + parentino, parent); + if (realm->parent) { + list_del_init(&realm->child_item); + ceph_put_snap_realm(mdsc, realm->parent); + } + realm->parent_ino = parentino; + realm->parent = parent; + ceph_get_snap_realm(mdsc, parent); + list_add(&realm->child_item, &parent->children); + return 1; +} + + +static int cmpu64_rev(const void *a, const void *b) +{ + if (*(u64 *)a < *(u64 *)b) + return 1; + if (*(u64 *)a > *(u64 *)b) + return -1; + return 0; +} + +/* + * build the snap context for a given realm. + */ +static int build_snap_context(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *parent = realm->parent; + struct ceph_snap_context *snapc; + int err = 0; + int i; + int num = realm->num_prior_parent_snaps + realm->num_snaps; + + /* + * build parent context, if it hasn't been built. + * conservatively estimate that all parent snaps might be + * included by us. + */ + if (parent) { + if (!parent->cached_context) { + err = build_snap_context(parent); + if (err) + goto fail; + } + num += parent->cached_context->num_snaps; + } + + /* do i actually need to update? not if my context seq + matches realm seq, and my parents' does to. (this works + because we rebuild_snap_realms() works _downward_ in + hierarchy after each update.) */ + if (realm->cached_context && + realm->cached_context->seq == realm->seq && + (!parent || + realm->cached_context->seq >= parent->cached_context->seq)) { + dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + " (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + realm->cached_context->num_snaps); + return 0; + } + + /* alloc new snap context */ + err = -ENOMEM; + if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + goto fail; + snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + if (!snapc) + goto fail; + atomic_set(&snapc->nref, 1); + + /* build (reverse sorted) snap vector */ + num = 0; + snapc->seq = realm->seq; + if (parent) { + /* include any of parent's snaps occurring _after_ my + parent became my parent */ + for (i = 0; i < parent->cached_context->num_snaps; i++) + if (parent->cached_context->snaps[i] >= + realm->parent_since) + snapc->snaps[num++] = + parent->cached_context->snaps[i]; + if (parent->cached_context->seq > snapc->seq) + snapc->seq = parent->cached_context->seq; + } + memcpy(snapc->snaps + num, realm->snaps, + sizeof(u64)*realm->num_snaps); + num += realm->num_snaps; + memcpy(snapc->snaps + num, realm->prior_parent_snaps, + sizeof(u64)*realm->num_prior_parent_snaps); + num += realm->num_prior_parent_snaps; + + sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); + snapc->num_snaps = num; + dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", + realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + + if (realm->cached_context) + ceph_put_snap_context(realm->cached_context); + realm->cached_context = snapc; + return 0; + +fail: + /* + * if we fail, clear old (incorrect) cached_context... 
hopefully + * we'll have better luck building it later + */ + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + realm->cached_context = NULL; + } + pr_err("build_snap_context %llx %p fail %d\n", realm->ino, + realm, err); + return err; +} + +/* + * rebuild snap context for the given realm and all of its children. + */ +static void rebuild_snap_realms(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *child; + + dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); + build_snap_context(realm); + + list_for_each_entry(child, &realm->children, child_item) + rebuild_snap_realms(child); +} + + +/* + * helper to allocate and decode an array of snapids. free prior + * instance, if any. + */ +static int dup_array(u64 **dst, __le64 *src, int num) +{ + int i; + + kfree(*dst); + if (num) { + *dst = kcalloc(num, sizeof(u64), GFP_NOFS); + if (!*dst) + return -ENOMEM; + for (i = 0; i < num; i++) + (*dst)[i] = get_unaligned_le64(src + i); + } else { + *dst = NULL; + } + return 0; +} + + +/* + * When a snapshot is applied, the size/mtime inode metadata is queued + * in a ceph_cap_snap (one for each snapshot) until writeback + * completes and the metadata can be flushed back to the MDS. + * + * However, if a (sync) write is currently in-progress when we apply + * the snapshot, we have to wait until the write succeeds or fails + * (and a final size/mtime is known). In this case the + * cap_snap->writing = 1, and is said to be "pending." When the write + * finishes, we __ceph_finish_cap_snap(). + * + * Caller must hold snap_rwsem for read (i.e., the realm topology won't + * change). + */ +void ceph_queue_cap_snap(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap_snap *capsnap; + int used, dirty; + + capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); + return; + } + + spin_lock(&inode->i_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + if (__ceph_have_pending_cap_snap(ci)) { + /* there is no point in queuing multiple "pending" cap_snaps, + as no new writes are allowed to start when pending, so any + writes in progress now were started before the previous + cap_snap. lucky us. */ + dout("queue_cap_snap %p already pending\n", inode); + kfree(capsnap); + } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || + (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { + struct ceph_snap_context *snapc = ci->i_head_snapc; + + dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, + capsnap, snapc); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. 
*/ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = snapc; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, snapc, snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + } else { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + kfree(capsnap); + } + + spin_unlock(&inode->i_lock); +} + +/* + * Finalize the size, mtime for a cap_snap.. that is, settle on final values + * to be used for the snapshot, to be flushed back to the mds. + * + * If capsnap can now be flushed, add to snap_flush list, and return 1. + * + * Caller must hold i_lock. + */ +int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + BUG_ON(capsnap->writing); + capsnap->size = inode->i_size; + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; + capsnap->time_warp_seq = ci->i_time_warp_seq; + if (capsnap->dirty_pages) { + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", inode, capsnap, + capsnap->context, capsnap->context->seq, + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); + return 0; + } + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", + inode, capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); + + spin_lock(&mdsc->snap_flush_lock); + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + spin_unlock(&mdsc->snap_flush_lock); + return 1; /* caller may want to ceph_flush_snaps */ +} + +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + + list_for_each_entry(child, &realm->children, child_item) { + dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", + realm, realm->ino, child, child->ino); + list_del_init(&child->dirty_item); + list_add(&child->dirty_item, &realm->dirty_item); + } + + list_del_init(&realm->dirty_item); + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} + +/* + * Parse and apply a snapblob "snap trace" from the MDS. This specifies + * the snap realm parameters from a given realm and all of its ancestors, + * up to the root. + * + * Caller must hold snap_rwsem for write. 
+ */ +int ceph_update_snap_trace(struct ceph_mds_client *mdsc, + void *p, void *e, bool deletion) +{ + struct ceph_mds_snap_realm *ri; /* encoded */ + __le64 *snaps; /* encoded */ + __le64 *prior_parent_snaps; /* encoded */ + struct ceph_snap_realm *realm; + int invalidate = 0; + int err = -ENOMEM; + LIST_HEAD(dirty_realms); + + dout("update_snap_trace deletion=%d\n", deletion); +more: + ceph_decode_need(&p, e, sizeof(*ri), bad); + ri = p; + p += sizeof(*ri); + ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + + le32_to_cpu(ri->num_prior_parent_snaps)), bad); + snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_snaps); + prior_parent_snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); + + realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (IS_ERR(realm)) { + err = PTR_ERR(realm); + goto fail; + } + } + + /* ensure the parent is correct */ + err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); + if (err < 0) + goto fail; + invalidate += err; + + if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + /* update realm parameters, snap lists */ + realm->seq = le64_to_cpu(ri->seq); + realm->created = le64_to_cpu(ri->created); + realm->parent_since = le64_to_cpu(ri->parent_since); + + realm->num_snaps = le32_to_cpu(ri->num_snaps); + err = dup_array(&realm->snaps, snaps, realm->num_snaps); + if (err < 0) + goto fail; + + realm->num_prior_parent_snaps = + le32_to_cpu(ri->num_prior_parent_snaps); + err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, + realm->num_prior_parent_snaps); + if (err < 0) + goto fail; + + /* queue realm for cap_snap creation */ + list_add(&realm->dirty_item, &dirty_realms); + + invalidate = 1; + } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); + invalidate = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); + } + + dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, + realm, invalidate, p, e); + + if (p < e) + goto more; + + /* invalidate when we reach the _end_ (root) of the trace */ + if (invalidate) + rebuild_snap_realms(realm); + + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); + queue_realm_cap_snaps(realm); + } + + __cleanup_empty_realms(mdsc); + return 0; + +bad: + err = -EINVAL; +fail: + pr_err("update_snap_trace error %d\n", err); + return err; +} + + +/* + * Send any cap_snaps that are queued for flush. Try to carry + * s_mutex across multiple snap flushes to avoid locking overhead. + * + * Caller holds no locks. 
+ */ +static void flush_snaps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + struct ceph_mds_session *session = NULL; + + dout("flush_snaps\n"); + spin_lock(&mdsc->snap_flush_lock); + while (!list_empty(&mdsc->snap_flush_list)) { + ci = list_first_entry(&mdsc->snap_flush_list, + struct ceph_inode_info, i_snap_flush_item); + inode = &ci->vfs_inode; + ihold(inode); + spin_unlock(&mdsc->snap_flush_lock); + spin_lock(&inode->i_lock); + __ceph_flush_snaps(ci, &session, 0); + spin_unlock(&inode->i_lock); + iput(inode); + spin_lock(&mdsc->snap_flush_lock); + } + spin_unlock(&mdsc->snap_flush_lock); + + if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + dout("flush_snaps done\n"); +} + + +/* + * Handle a snap notification from the MDS. + * + * This can take two basic forms: the simplest is just a snap creation + * or deletion notification on an existing realm. This should update the + * realm and its children. + * + * The more difficult case is realm creation, due to snap creation at a + * new point in the file hierarchy, or due to a rename that moves a file or + * directory into another realm. + */ +void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + int mds = session->s_mds; + u64 split; + int op; + int trace_len; + struct ceph_snap_realm *realm = NULL; + void *p = msg->front.iov_base; + void *e = p + msg->front.iov_len; + struct ceph_mds_snap_head *h; + int num_split_inos, num_split_realms; + __le64 *split_inos = NULL, *split_realms = NULL; + int i; + int locked_rwsem = 0; + + /* decode */ + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = p; + op = le32_to_cpu(h->op); + split = le64_to_cpu(h->split); /* non-zero if we are splitting an + * existing realm */ + num_split_inos = le32_to_cpu(h->num_split_inos); + num_split_realms = le32_to_cpu(h->num_split_realms); + trace_len = le32_to_cpu(h->trace_len); + p += sizeof(*h); + + dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); + + mutex_lock(&session->s_mutex); + session->s_seq++; + mutex_unlock(&session->s_mutex); + + down_write(&mdsc->snap_rwsem); + locked_rwsem = 1; + + if (op == CEPH_SNAP_OP_SPLIT) { + struct ceph_mds_snap_realm *ri; + + /* + * A "split" breaks part of an existing realm off into + * a new realm. The MDS provides a list of inodes + * (with caps) and child realms that belong to the new + * child. + */ + split_inos = p; + p += sizeof(u64) * num_split_inos; + split_realms = p; + p += sizeof(u64) * num_split_realms; + ceph_decode_need(&p, e, sizeof(*ri), bad); + /* we will peek at realm info here, but will _not_ + * advance p, as the realm update will occur below in + * ceph_update_snap_trace. 
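+ *
+ * For reference, a rough sketch of the CEPH_SNAP_OP_SPLIT message body
+ * as it is walked here (illustrative, following the pointer arithmetic
+ * in this function rather than a formal spec):
+ *
+ *   struct ceph_mds_snap_head h;            op, split ino, counts
+ *   __le64 split_inos[num_split_inos];      inodes moving to the new realm
+ *   __le64 split_realms[num_split_realms];  child realms moving over
+ *   snap trace                              handed to ceph_update_snap_trace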
*/ + ri = p; + + realm = ceph_lookup_snap_realm(mdsc, split); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, split); + if (IS_ERR(realm)) + goto out; + } + ceph_get_snap_realm(mdsc, realm); + + dout("splitting snap_realm %llx %p\n", realm->ino, realm); + for (i = 0; i < num_split_inos; i++) { + struct ceph_vino vino = { + .ino = le64_to_cpu(split_inos[i]), + .snap = CEPH_NOSNAP, + }; + struct inode *inode = ceph_find_inode(sb, vino); + struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; + + if (!inode) + continue; + ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + if (!ci->i_snap_realm) + goto skip_inode; + /* + * If this inode belongs to a realm that was + * created after our new realm, we experienced + * a race (due to another split notifications + * arriving from a different MDS). So skip + * this inode. + */ + if (ci->i_snap_realm->created > + le64_to_cpu(ri->created)) { + dout(" leaving %p in newer realm %llx %p\n", + inode, ci->i_snap_realm->ino, + ci->i_snap_realm); + goto skip_inode; + } + dout(" will move %p to split realm %llx %p\n", + inode, realm->ino, realm); + /* + * Move the inode to the new realm + */ + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + oldrealm = ci->i_snap_realm; + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + spin_unlock(&inode->i_lock); + + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + + iput(inode); + continue; + +skip_inode: + spin_unlock(&inode->i_lock); + iput(inode); + } + + /* we may have taken some of the old realm's children. */ + for (i = 0; i < num_split_realms; i++) { + struct ceph_snap_realm *child = + ceph_lookup_snap_realm(mdsc, + le64_to_cpu(split_realms[i])); + if (!child) + continue; + adjust_snap_realm_parent(mdsc, child, realm->ino); + } + } + + /* + * update using the provided snap trace. if we are deleting a + * snap, we can avoid queueing cap_snaps. 
+ */ + ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY); + + if (op == CEPH_SNAP_OP_SPLIT) + /* we took a reference when we created the realm, above */ + ceph_put_snap_realm(mdsc, realm); + + __cleanup_empty_realms(mdsc); + + up_write(&mdsc->snap_rwsem); + + flush_snaps(mdsc); + return; + +bad: + pr_err("corrupt snap message from mds%d\n", mds); + ceph_msg_dump(msg); +out: + if (locked_rwsem) + up_write(&mdsc->snap_rwsem); + return; +} + + + diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c new file mode 100644 index 00000000..cd5097d7 --- /dev/null +++ b/fs/ceph/strings.c @@ -0,0 +1,117 @@ +/* + * Ceph fs string constants + */ +#include +#include + + +const char *ceph_mds_state_name(int s) +{ + switch (s) { + /* down and out */ + case CEPH_MDS_STATE_DNE: return "down:dne"; + case CEPH_MDS_STATE_STOPPED: return "down:stopped"; + /* up and out */ + case CEPH_MDS_STATE_BOOT: return "up:boot"; + case CEPH_MDS_STATE_STANDBY: return "up:standby"; + case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_CREATING: return "up:creating"; + case CEPH_MDS_STATE_STARTING: return "up:starting"; + /* up and in */ + case CEPH_MDS_STATE_REPLAY: return "up:replay"; + case CEPH_MDS_STATE_RESOLVE: return "up:resolve"; + case CEPH_MDS_STATE_RECONNECT: return "up:reconnect"; + case CEPH_MDS_STATE_REJOIN: return "up:rejoin"; + case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay"; + case CEPH_MDS_STATE_ACTIVE: return "up:active"; + case CEPH_MDS_STATE_STOPPING: return "up:stopping"; + } + return "???"; +} + +const char *ceph_session_op_name(int op) +{ + switch (op) { + case CEPH_SESSION_REQUEST_OPEN: return "request_open"; + case CEPH_SESSION_OPEN: return "open"; + case CEPH_SESSION_REQUEST_CLOSE: return "request_close"; + case CEPH_SESSION_CLOSE: return "close"; + case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps"; + case CEPH_SESSION_RENEWCAPS: return "renewcaps"; + case CEPH_SESSION_STALE: return "stale"; + case CEPH_SESSION_RECALL_STATE: return "recall_state"; + } + return "???"; +} + +const char *ceph_mds_op_name(int op) +{ + switch (op) { + case CEPH_MDS_OP_LOOKUP: return "lookup"; + case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; + case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_SETXATTR: return "setxattr"; + case CEPH_MDS_OP_SETATTR: return "setattr"; + case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_READDIR: return "readdir"; + case CEPH_MDS_OP_MKNOD: return "mknod"; + case CEPH_MDS_OP_LINK: return "link"; + case CEPH_MDS_OP_UNLINK: return "unlink"; + case CEPH_MDS_OP_RENAME: return "rename"; + case CEPH_MDS_OP_MKDIR: return "mkdir"; + case CEPH_MDS_OP_RMDIR: return "rmdir"; + case CEPH_MDS_OP_SYMLINK: return "symlink"; + case CEPH_MDS_OP_CREATE: return "create"; + case CEPH_MDS_OP_OPEN: return "open"; + case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap"; + case CEPH_MDS_OP_LSSNAP: return "lssnap"; + case CEPH_MDS_OP_MKSNAP: return "mksnap"; + case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; + } + return "???"; +} + +const char *ceph_cap_op_name(int op) +{ + switch (op) { + case CEPH_CAP_OP_GRANT: return "grant"; + case CEPH_CAP_OP_REVOKE: return "revoke"; + case CEPH_CAP_OP_TRUNC: return "trunc"; + case CEPH_CAP_OP_EXPORT: return "export"; + case CEPH_CAP_OP_IMPORT: return "import"; + case CEPH_CAP_OP_UPDATE: return "update"; + case CEPH_CAP_OP_DROP: 
return "drop"; + case CEPH_CAP_OP_FLUSH: return "flush"; + case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack"; + case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; + case CEPH_CAP_OP_RELEASE: return "release"; + case CEPH_CAP_OP_RENEW: return "renew"; + } + return "???"; +} + +const char *ceph_lease_op_name(int o) +{ + switch (o) { + case CEPH_MDS_LEASE_REVOKE: return "revoke"; + case CEPH_MDS_LEASE_RELEASE: return "release"; + case CEPH_MDS_LEASE_RENEW: return "renew"; + case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack"; + } + return "???"; +} + +const char *ceph_snap_op_name(int o) +{ + switch (o) { + case CEPH_SNAP_OP_UPDATE: return "update"; + case CEPH_SNAP_OP_CREATE: return "create"; + case CEPH_SNAP_OP_DESTROY: return "destroy"; + case CEPH_SNAP_OP_SPLIT: return "split"; + } + return "???"; +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c new file mode 100644 index 00000000..f2f77fd3 --- /dev/null +++ b/fs/ceph/super.c @@ -0,0 +1,921 @@ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include +#include +#include +#include + +/* + * Ceph superblock operations + * + * Handle the basics of mounting, unmounting. + */ + +/* + * super ops + */ +static void ceph_put_super(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + + dout("put_super\n"); + ceph_mdsc_close_sessions(fsc->mdsc); + + /* + * ensure we release the bdi before put_anon_super releases + * the device name. + */ + if (s->s_bdi == &fsc->backing_dev_info) { + bdi_unregister(&fsc->backing_dev_info); + s->s_bdi = NULL; + } + + return; +} + +static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = fsc->client->monc.monmap; + struct ceph_statfs st; + u64 fsid; + int err; + + dout("statfs\n"); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); + if (err < 0) + return err; + + /* fill in kstatfs */ + buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ + + /* + * express utilization in terms of large blocks to avoid + * overflow on 32-bit machines. 
+ */ + buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> + (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + + buf->f_files = le64_to_cpu(st.num_objects); + buf->f_ffree = -1; + buf->f_namelen = NAME_MAX; + buf->f_frsize = PAGE_CACHE_SIZE; + + /* leave fsid little-endian, regardless of host endianness */ + fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + buf->f_fsid.val[0] = fsid & 0xffffffff; + buf->f_fsid.val[1] = fsid >> 32; + + return 0; +} + + +static int ceph_sync_fs(struct super_block *sb, int wait) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(fsc->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + + dout("sync_fs (blocking)\n"); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); + dout("sync_fs (blocking) done\n"); + return 0; +} + +/* + * mount options + */ +enum { + Opt_wsize, + Opt_rsize, + Opt_caps_wanted_delay_min, + Opt_caps_wanted_delay_max, + Opt_cap_release_safety, + Opt_readdir_max_entries, + Opt_readdir_max_bytes, + Opt_congestion_kb, + Opt_last_int, + /* int args above */ + Opt_snapdirname, + Opt_last_string, + /* string args above */ + Opt_dirstat, + Opt_nodirstat, + Opt_rbytes, + Opt_norbytes, + Opt_noasyncreaddir, + Opt_ino32, +}; + +static match_table_t fsopt_tokens = { + {Opt_wsize, "wsize=%d"}, + {Opt_rsize, "rsize=%d"}, + {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, + {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, + {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, + /* int args above */ + {Opt_snapdirname, "snapdirname=%s"}, + /* string args above */ + {Opt_dirstat, "dirstat"}, + {Opt_nodirstat, "nodirstat"}, + {Opt_rbytes, "rbytes"}, + {Opt_norbytes, "norbytes"}, + {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_ino32, "ino32"}, + {-1, NULL} +}; + +static int parse_fsopt_token(char *c, void *private) +{ + struct ceph_mount_options *fsopt = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; 
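+ /* caps dirty ceph data in flight; the default comes from
+ * default_congestion_kb() in super.h and scales with memory */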
+ break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_ino32: + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + break; + default: + BUG_ON(token); + } + return 0; +} + +static void destroy_mount_options(struct ceph_mount_options *args) +{ + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; + + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err = -ENOMEM; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } + dev_name_end = *path; + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + + /* path on server */ + *path += 2; + dout("server path '%s'\n", *path); + + err = ceph_parse_options(popt, options, dev_name, dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (err) + goto out; + + /* success */ + *pfsopt = fsopt; + return 0; + +out: + destroy_mount_options(fsopt); + return err; +} + +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @mnt: mount descriptor + */ +static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + struct ceph_options *opt = fsc->client->options; + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsid=%pU", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, ",noshare"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, ",nocrc"); + + if 
(opt->name) + seq_printf(m, ",name=%s", opt->name); + if (opt->key) + seq_puts(m, ",secret="); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); + if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; +} + +/* + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. + */ +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; + int err = -ENOMEM; + + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) + return ERR_PTR(-ENOMEM); + + fsc->client = ceph_create_client(opt, fsc); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + fsc->client->monc.want_mdsmap = 1; + + fsc->mount_options = fsopt; + + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; + + atomic_long_set(&fsc->writeback_count, 0); + + err = bdi_init(&fsc->backing_dev_info); + if (err < 0) + goto fail_client; + + err = -ENOMEM; + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. 
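+ *
+ * (The third argument to alloc_workqueue() below is max_active;
+ * passing 1 caps in-flight work items from each of these queues at
+ * one per CPU, which is how the concurrency limit is applied.)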
+ */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); + if (fsc->wb_wq == NULL) + goto fail_bdi; + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); + if (fsc->pg_inv_wq == NULL) + goto fail_wb_wq; + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); + if (fsc->trunc_wq == NULL) + goto fail_pg_inv_wq; + + /* set up mempools */ + err = -ENOMEM; + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, + fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + if (!fsc->wb_pagevec_pool) + goto fail_trunc_wq; + + /* caps */ + fsc->min_caps = fsopt->max_readdir; + + return fsc; + +fail_trunc_wq: + destroy_workqueue(fsc->trunc_wq); +fail_pg_inv_wq: + destroy_workqueue(fsc->pg_inv_wq); +fail_wb_wq: + destroy_workqueue(fsc->wb_wq); +fail_bdi: + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); +fail: + kfree(fsc); + return ERR_PTR(err); +} + +void destroy_fs_client(struct ceph_fs_client *fsc) +{ + dout("destroy_fs_client %p\n", fsc); + + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); + + bdi_destroy(&fsc->backing_dev_info); + + mempool_destroy(fsc->wb_pagevec_pool); + + destroy_mount_options(fsc->mount_options); + + ceph_fs_debugfs_cleanup(fsc); + + ceph_destroy_client(fsc->client); + + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); +} + +/* + * caches + */ +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) +{ + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + + return 0; + +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return -ENOMEM; +} + +static void destroy_caches(void) +{ + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); +} + + +/* + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). + */ +static void ceph_umount_begin(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + return; +} + +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + +/* + * Bootstrap mount by opening the root directory. 
Note the mount + * @started time from caller, and time out if this takes too long. + */ +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, + const char *path, + unsigned long started) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req = NULL; + int err; + struct dentry *root; + + /* open dir */ + dout("open_root_inode opening '%s'\n", path); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_path1 = kstrdup(path, GFP_NOFS); + req->r_ino1.ino = CEPH_INO_ROOT; + req->r_ino1.snap = CEPH_NOSNAP; + req->r_started = started; + req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == 0) { + dout("open_root_inode success\n"); + if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && + fsc->sb->s_root == NULL) + root = d_alloc_root(req->r_target_inode); + else + root = d_obtain_alias(req->r_target_inode); + req->r_target_inode = NULL; + dout("open_root_inode success, root dentry is %p\n", root); + } else { + root = ERR_PTR(err); + } + ceph_mdsc_put_request(req); + return root; +} + + + + +/* + * mount: join the ceph cluster, and open root directory. + */ +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + const char *path) +{ + int err; + unsigned long started = jiffies; /* note the start time */ + struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ + + dout("mount start\n"); + mutex_lock(&fsc->client->mount_mutex); + + err = __ceph_open_session(fsc->client, started); + if (err < 0) + goto out; + + dout("mount opening root\n"); + root = open_root_dentry(fsc, "", started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + if (fsc->sb->s_root) { + dput(root); + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } + + if (path[0] == 0) { + dget(root); + } else { + dout("mount opening base mountpoint\n"); + root = open_root_dentry(fsc, path, started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + } + + fsc->mount_state = CEPH_MOUNT_MOUNTED; + dout("mount success\n"); + mutex_unlock(&fsc->client->mount_mutex); + return root; + +out: + mutex_unlock(&fsc->client->mount_mutex); + return ERR_PTR(err); + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; +} + +static int ceph_set_super(struct super_block *s, void *data) +{ + struct ceph_fs_client *fsc = data; + int ret; + + dout("set_super %p data %p\n", s, data); + + s->s_flags = fsc->mount_options->sb_flags; + s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + + s->s_fs_info = fsc; + fsc->sb = s; + + s->s_op = &ceph_super_ops; + s->s_export_op = &ceph_export_ops; + + s->s_time_gran = 1000; /* 1000 ns == 1 us */ + + ret = set_anon_super(s, NULL); /* what is that second arg for? 
*/ + if (ret != 0) + goto fail; + + return ret; + +fail: + s->s_fs_info = NULL; + fsc->sb = NULL; + return ret; +} + +/* + * share superblock if same fs AND options + */ +static int ceph_compare_super(struct super_block *sb, void *data) +{ + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); + + dout("ceph_compare_super %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; + } + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fsopt->sb_flags != other->mount_options->sb_flags) { + dout("flags differ\n"); + return 0; + } + return 1; +} + +/* + * construct our own bdi so we can control readahead, etc. + */ +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) +{ + int err; + + /* set ra_pages based on rsize mount option? */ + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", + atomic_long_inc_return(&bdi_seq)); + if (!err) + sb->s_bdi = &fsc->backing_dev_info; + return err; +} + +static struct dentry *ceph_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct super_block *sb; + struct ceph_fs_client *fsc; + struct dentry *res; + int err; + int (*compare_super)(struct super_block *, void *) = ceph_compare_super; + const char *path = NULL; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; + + dout("ceph_mount\n"); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) { + res = ERR_PTR(err); + goto out_final; + } + + /* create client (which we may/may not use) */ + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + res = ERR_CAST(fsc); + kfree(fsopt); + kfree(opt); + goto out_final; + } + + err = ceph_mdsc_init(fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out; + } + + if (ceph_test_opt(fsc->client, NOSHARE)) + compare_super = NULL; + sb = sget(fs_type, compare_super, ceph_set_super, fsc); + if (IS_ERR(sb)) { + res = ERR_CAST(sb); + goto out; + } + + if (ceph_sb_to_client(sb) != fsc) { + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); + } else { + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out_splat; + } + } + + res = ceph_real_mount(fsc, path); + if (IS_ERR(res)) + goto out_splat; + dout("root %p inode %p ino %llx.%llx\n", res, + res->d_inode, ceph_vinop(res->d_inode)); + return res; + +out_splat: + ceph_mdsc_close_sessions(fsc->mdsc); + deactivate_locked_super(sb); + goto out_final; + +out: + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); +out_final: + dout("ceph_mount fail %ld\n", PTR_ERR(res)); + return res; +} + +static void ceph_kill_sb(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + dout("kill_sb %p\n", s); + ceph_mdsc_pre_umount(fsc->mdsc); + kill_anon_super(s); /* will call put_super after sb is r/o */ + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); +} + +static struct file_system_type ceph_fs_type = { + 
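+ /*
+ * Illustrative usage once this type is registered (the device-name
+ * format "ip1[:port1][,ip2[:port2]...]:/subdir/in/fs" comes from
+ * parse_mount_options() above; the address below is made up):
+ *
+ *   mount -t ceph 192.168.0.1:/ /mnt/ceph -o rsize=524288,noasyncreaddir
+ */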
.owner = THIS_MODULE, + .name = "ceph", + .mount = ceph_mount, + .kill_sb = ceph_kill_sb, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +static int __init init_ceph(void) +{ + int ret = init_caches(); + if (ret) + goto out; + + ret = register_filesystem(&ceph_fs_type); + if (ret) + goto out_icache; + + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + + return 0; + +out_icache: + destroy_caches(); +out: + return ret; +} + +static void __exit exit_ceph(void) +{ + dout("exit_ceph\n"); + unregister_filesystem(&ceph_fs_type); + destroy_caches(); +} + +module_init(init_ceph); +module_exit(exit_ceph); + +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +MODULE_AUTHOR("Patience Warnick "); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/fs/ceph/super.h b/fs/ceph/super.h new file mode 100644 index 00000000..f5cabefa --- /dev/null +++ b/fs/ceph/super.h @@ -0,0 +1,833 @@ +#ifndef _FS_CEPH_SUPER_H +#define _FS_CEPH_SUPER_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* f_type in struct statfs */ +#define CEPH_SUPER_MAGIC 0x00c36400 + +/* large granularity for statfs utilization stats to facilitate + * large volume sizes on 32-bit machines. */ +#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) + +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ + +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) + +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) + +#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" + +struct ceph_mount_options { + int flags; + int sb_flags; + + int wsize; + int rsize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ + + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ + + char *snapdir_name; /* default ".snap" */ +}; + +struct ceph_fs_client { + struct super_block *sb; + + struct ceph_mount_options *mount_options; + struct ceph_client *client; + + unsigned long mount_state; + int min_caps; /* min caps i added */ + + struct ceph_mds_client *mdsc; + + /* writeback */ + mempool_t *wb_pagevec_pool; + struct workqueue_struct *wb_wq; + struct workqueue_struct *pg_inv_wq; + struct workqueue_struct *trunc_wq; + atomic_long_t writeback_count; + + struct backing_dev_info backing_dev_info; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; + struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; +#endif +}; + + +/* + * File i/o capability. This tracks shared state with the metadata + * server that allows us to cache or writeback attributes or to read + * and write data. 
For any given inode, we should have one or more + * capabilities, one issued by each metadata server, and our + * cumulative access is the OR of all issued capabilities. + * + * Each cap is referenced by the inode's i_caps rbtree and by per-mds + * session capability lists. + */ +struct ceph_cap { + struct ceph_inode_info *ci; + struct rb_node ci_node; /* per-ci cap tree */ + struct ceph_mds_session *session; + struct list_head session_caps; /* per-session caplist */ + int mds; + u64 cap_id; /* unique cap id (mds provided) */ + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of issued (for revocation) */ + int mds_wanted; + u32 seq, issue_seq, mseq; + u32 cap_gen; /* active/stale cycle */ + unsigned long last_used; + struct list_head caps_item; +}; + +#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ +#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ + +/* + * Snapped cap state that is pending flush to mds. When a snapshot occurs, + * we first complete any in-process sync writes and writeback any dirty + * data before flushing the snapped state (tracked here) back to the MDS. + */ +struct ceph_cap_snap { + atomic_t nref; + struct ceph_inode_info *ci; + struct list_head ci_item, flushing_item; + + u64 follows, flush_tid; + int issued, dirty; + struct ceph_snap_context *context; + + mode_t mode; + uid_t uid; + gid_t gid; + + struct ceph_buffer *xattr_blob; + u64 xattr_version; + + u64 size; + struct timespec mtime, atime, ctime; + u64 time_warp_seq; + int writing; /* a sync write is still in progress */ + int dirty_pages; /* dirty pages awaiting writeback */ +}; + +static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); + kfree(capsnap); + } +} + +/* + * The frag tree describes how a directory is fragmented, potentially across + * multiple metadata servers. It is also used to indicate points where + * metadata authority is delegated, and whether/where metadata is replicated. + * + * A _leaf_ frag will be present in the i_fragtree IFF there is + * delegation info. That is, if mds >= 0 || ndist > 0. + */ +#define CEPH_MAX_DIRFRAG_REP 4 + +struct ceph_inode_frag { + struct rb_node node; + + /* fragtree state */ + u32 frag; + int split_by; /* i.e. 2^(split_by) children */ + + /* delegation and replication info */ + int mds; /* -1 if same authority as parent */ + int ndist; /* >0 if replicated */ + int dist[CEPH_MAX_DIRFRAG_REP]; +}; + +/* + * We cache inode xattrs as an encoded blob until they are first used, + * at which point we parse them into an rbtree. + */ +struct ceph_inode_xattr { + struct rb_node node; + + const char *name; + int name_len; + const char *val; + int val_len; + int dirty; + + int should_free_name; + int should_free_val; +}; + +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + +struct ceph_inode_xattrs_info { + /* + * (still encoded) xattr blob. we avoid the overhead of parsing + * this until someone actually calls getxattr, etc. + * + * blob->vec.iov_len == 4 implies there are no xattrs; blob == + * NULL means we don't know. 
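+ *
+ * (The 4 bytes are presumably just the encoded 32-bit entry count,
+ * i.e. a count of zero with nothing following it.)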
+ */ + struct ceph_buffer *blob, *prealloc_blob; + + struct rb_root index; + bool dirty; + int count; + int names_size; + int vals_size; + u64 version, index_version; +}; + +/* + * Ceph inode. + */ +struct ceph_inode_info { + struct ceph_vino i_vino; /* ceph ino + snap */ + + u64 i_version; + u32 i_time_warp_seq; + + unsigned i_ceph_flags; + unsigned long i_release_count; + + struct ceph_dir_layout i_dir_layout; + struct ceph_file_layout i_layout; + char *i_symlink; + + /* for dirs */ + struct timespec i_rctime; + u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_files, i_subdirs; + u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + + struct rb_root i_fragtree; + struct mutex i_fragtree_mutex; + + struct ceph_inode_xattrs_info i_xattrs; + + /* capabilities. protected _both_ by i_lock and cap->session's + * s_mutex. */ + struct rb_root i_caps; /* cap list */ + struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ + unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ + struct list_head i_dirty_item, i_flushing_item; + u64 i_cap_flush_seq; + /* we need to track cap writeback on a per-cap-bit basis, to allow + * overlapping, pipelined cap flushes to the mds. we can probably + * reduce the tid to 8 bits if we're concerned about inode size. */ + u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ + unsigned long i_hold_caps_min; /* jiffies */ + unsigned long i_hold_caps_max; /* jiffies */ + struct list_head i_cap_delay_list; /* for delayed cap release to mds */ + int i_cap_exporting_mds; /* to handle cap migration between */ + unsigned i_cap_exporting_mseq; /* mds's. */ + unsigned i_cap_exporting_issued; + struct ceph_cap_reservation i_cap_migration_resv; + struct list_head i_cap_snaps; /* snapped state pending flush to mds */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ + unsigned i_snap_caps; /* cap bits for snapped files */ + + int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + + u32 i_truncate_seq; /* last truncate to smaller size */ + u64 i_truncate_size; /* and the size we last truncated down to */ + int i_truncate_pending; /* still need to call vmtruncate */ + + u64 i_max_size; /* max file size authorized by mds */ + u64 i_reported_size; /* (max_)size reported to or requested of mds */ + u64 i_wanted_max_size; /* offset we'd like to write too */ + u64 i_requested_max_size; /* max_size we've requested */ + + /* held references to caps */ + int i_pin_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; + int i_wrbuffer_ref, i_wrbuffer_ref_head; + u32 i_shared_gen; /* increment each time we get FILE_SHARED */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. 
*/ + u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ + + struct list_head i_unsafe_writes; /* uncommitted sync writes */ + struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + spinlock_t i_unsafe_lock; + + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + int i_snap_realm_counter; /* snap realm (if caps) */ + struct list_head i_snap_realm_item; + struct list_head i_snap_flush_item; + + struct work_struct i_wb_work; /* writeback work */ + struct work_struct i_pg_inv_work; /* page invalidation work */ + + struct work_struct i_vmtruncate_work; + + struct inode vfs_inode; /* at end */ +}; + +static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +{ + return container_of(inode, struct ceph_inode_info, vfs_inode); +} + +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +{ + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +{ + return (struct ceph_fs_client *)sb->s_fs_info; +} + +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * i_ino (kernel inode) st_ino (userspace) + * i386 32 32 + * x86_64+ino32 64 32 + * x86_64 64 64 + */ +static inline u32 ceph_ino_to_ino32(ino_t ino) +{ + ino ^= ino >> (sizeof(ino) * 8 - 32); + if (!ino) + ino = 1; + return ino; +} + +/* + * kernel i_ino value + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ + ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ +#if BITS_PER_LONG == 32 + ino = ceph_ino_to_ino32(ino); +#endif + return ino; +} + +/* + * user-visible ino (stat, filldir) + */ +#if BITS_PER_LONG == 32 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + return ino; +} +#else +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) + ino = ceph_ino_to_ino32(ino); + return ino; +} +#endif + + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. 
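+ *
+ * The CEPH_I_* values below are bit masks for i_ceph_flags; they are
+ * set, cleared and tested under i_lock via the ceph_i_set(),
+ * ceph_i_clear() and ceph_i_test() helpers that follow.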
+ */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + +static inline void ceph_i_clear(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~mask; + spin_unlock(&inode->i_lock); +} + +static inline void ceph_i_set(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags |= mask; + spin_unlock(&inode->i_lock); +} + +static inline bool ceph_i_test(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool r; + + spin_lock(&inode->i_lock); + r = (ci->i_ceph_flags & mask) == mask; + spin_unlock(&inode->i_lock); + return r; +} + + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +{ + return ((loff_t)frag << 32) | (loff_t)off; +} + +/* + * caps helpers + */ +static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); +extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, + struct ceph_cap *cap); + +static inline int ceph_caps_issued(struct ceph_inode_info *ci) +{ + int issued; + spin_lock(&ci->vfs_inode.i_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->vfs_inode.i_lock); + return issued; +} + +static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, + int touch) +{ + int r; + spin_lock(&ci->vfs_inode.i_lock); + r = __ceph_caps_issued_mask(ci, mask, touch); + spin_unlock(&ci->vfs_inode.i_lock); + return r; +} + +static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) +{ + return ci->i_dirty_caps | ci->i_flushing_caps; +} +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); + +extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_used(struct ceph_inode_info *ci); + +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ + return w; +} + +/* what the mds thinks we want */ +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); + +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct 
ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_reservation_status(struct ceph_fs_client *client, + int *total, int *avail, int *used, + int *reserved, int *min); + + + +/* + * we keep buffered readdir results attached to file->private_data + */ +struct ceph_file_info { + int fmode; /* initialized on open */ + + /* readdir: position within the dir */ + u32 frag; + struct ceph_mds_request *last_readdir; + int at_end; + + /* readdir: position within a frag */ + unsigned offset; /* offset of last chunk, adjusted for . and .. */ + u64 next_offset; /* offset of next chunk (last_name's + 1) */ + char *last_name; /* last entry in previous chunk */ + struct dentry *dentry; /* next dentry (for dcache readdir) */ + unsigned long dir_release_count; + + /* used for -o dirstat read() on directory thing */ + char *dir_info; + int dir_info_len; +}; + + + +/* + * A "snap realm" describes a subset of the file hierarchy sharing + * the same set of snapshots that apply to it. The realms themselves + * are organized into a hierarchy, such that children inherit (some of) + * the snapshots of their parents. + * + * All inodes within the realm that have capabilities are linked into a + * per-realm list. + */ +struct ceph_snap_realm { + u64 ino; + atomic_t nref; + struct rb_node node; + + u64 created, seq; + u64 parent_ino; + u64 parent_since; /* snapid when our current parent became so */ + + u64 *prior_parent_snaps; /* snaps inherited from any parents we */ + int num_prior_parent_snaps; /* had prior to parent_since */ + u64 *snaps; /* snaps specific to this realm */ + int num_snaps; + + struct ceph_snap_realm *parent; + struct list_head children; /* list of child realms */ + struct list_head child_item; + + struct list_head empty_item; /* if i have ref==0 */ + + struct list_head dirty_item; /* if realm needs new context */ + + /* the current set of snaps for this realm */ + struct ceph_snap_context *cached_context; + + struct list_head inodes_with_caps; + spinlock_t inodes_with_caps_lock; +}; + +static inline int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + + + +/* snap.c */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino); +extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern int ceph_update_snap_trace(struct ceph_mds_client *m, + void *p, void *e, bool deletion); +extern void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); +extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap); +extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); + +/* + * a cap_snap is "pending" if it is still awaiting an in-progress + * sync write (that may/may not still update size, mtime, etc.). 
+ */ +static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) +{ + return !list_empty(&ci->i_cap_snaps) && + list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, + ci_item)->writing; +} + +/* inode.c */ +extern const struct inode_operations ceph_file_iops; + +extern struct inode *ceph_alloc_inode(struct super_block *sb); +extern void ceph_destroy_inode(struct inode *inode); + +extern struct inode *ceph_get_inode(struct super_block *sb, + struct ceph_vino vino); +extern struct inode *ceph_get_snapdir(struct inode *parent); +extern int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size); +extern void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime); +extern int ceph_fill_trace(struct super_block *sb, + struct ceph_mds_request *req, + struct ceph_mds_session *session); +extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session); + +extern int ceph_inode_holds_cap(struct inode *inode, int mask); + +extern int ceph_inode_set_size(struct inode *inode, loff_t size); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void ceph_queue_vmtruncate(struct inode *inode); + +extern void ceph_queue_invalidate(struct inode *inode); +extern void ceph_queue_writeback(struct inode *inode); + +extern int ceph_do_getattr(struct inode *inode, int mask); +extern int ceph_permission(struct inode *inode, int mask, unsigned int flags); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +extern int ceph_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); +extern int ceph_removexattr(struct dentry *, const char *); +extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); + +/* caps.c */ +extern const char *ceph_cap_string(int c); +extern void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg); +extern int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation); +extern void __ceph_remove_cap(struct ceph_cap *cap); +static inline void ceph_remove_cap(struct ceph_cap *cap) +{ + struct inode *inode = &cap->ci->vfs_inode; + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + spin_unlock(&inode->i_lock); +} +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); + +extern void ceph_queue_caps_release(struct inode *inode); +extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); +extern int ceph_fsync(struct file *file, int datasync); +extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); +extern int ceph_get_cap_mds(struct inode *inode); +extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); +extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + 
struct ceph_snap_context *snapc); +extern void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession, + int again); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session); +extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); + +extern int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force); +extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + int mds, int drop, int unless); + +extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t endoff); + +/* for counting open files by mode */ +static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) +{ + ci->i_nr_by_mode[mode]++; +} +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); + +/* addr.c */ +extern const struct address_space_operations ceph_aops; +extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); + +/* file.c */ +extern const struct file_operations ceph_file_fops; +extern const struct address_space_operations ceph_aops; +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +extern int ceph_open(struct inode *inode, struct file *file); +extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct nameidata *nd, int mode, + int locked_dir); +extern int ceph_release(struct inode *inode, struct file *filp); + +/* dir.c */ +extern const struct file_operations ceph_dir_fops; +extern const struct inode_operations ceph_dir_iops; +extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, + ceph_snapdir_dentry_ops; + +extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err); + +extern void ceph_dentry_lru_add(struct dentry *dn); +extern void ceph_dentry_lru_touch(struct dentry *dn); +extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct dentry *dn); + +/* + * our d_ops vary depending on whether the inode is live, + * snapshotted (read-only), or a virtual ".snap" directory. 
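+ * (Presumably the three cases map, in order, to ceph_dentry_ops,
+ * ceph_snap_dentry_ops and ceph_snapdir_dentry_ops declared above;
+ * ceph_init_dentry() chooses between them.)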
+ */ +int ceph_init_dentry(struct dentry *dentry); + + +/* ioctl.c */ +extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* export.c */ +extern const struct export_operations ceph_export_ops; + +/* locks.c */ +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, + int p_locks, int f_locks); +extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); + +static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) +{ + if (dentry && dentry->d_parent) + return dentry->d_parent->d_inode; + + return NULL; +} + +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c new file mode 100644 index 00000000..f42d730f --- /dev/null +++ b/fs/ceph/xattr.c @@ -0,0 +1,854 @@ +#include + +#include "super.h" +#include "mds_client.h" + +#include + +#include +#include + +static bool ceph_is_valid_xattr(const char *name) +{ + return !strncmp(name, "ceph.", 5) || + !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +/* + * These define virtual xattrs exposing the recursive directory + * statistics and layout metadata. + */ +struct ceph_vxattr_cb { + bool readonly; + char *name; + size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); +}; + +/* directories */ + +static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); +} + +static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files); +} + +static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_subdirs); +} + +static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles); +} + +static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rbytes); +} + +static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); +} + +static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { + { true, "ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "ceph.dir.files", ceph_vxattrcb_files}, + { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, 
"ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, NULL, NULL } +}; + +/* files */ + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + + ret = snprintf(val, size, + "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (ceph_file_layout_pg_preferred(ci->i_layout)) + ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + (unsigned long long)ceph_file_layout_pg_preferred( + ci->i_layout)); + return ret; +} + +static struct ceph_vxattr_cb ceph_file_vxattrs[] = { + { true, "ceph.layout", ceph_vxattrcb_layout}, + { NULL, NULL } +}; + +static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + return ceph_dir_vxattrs; + else if (S_ISREG(inode->i_mode)) + return ceph_file_vxattrs; + return NULL; +} + +static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, + const char *name) +{ + do { + if (strcmp(vxattr->name, name) == 0) + return vxattr; + vxattr++; + } while (vxattr->name); + return NULL; +} + +static int __set_xattr(struct ceph_inode_info *ci, + const char *name, int name_len, + const char *val, int val_len, + int dirty, + int should_free_name, int should_free_val, + struct ceph_inode_xattr **newxattr) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int c; + int new = 0; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + if (name_len == xattr->name_len) + break; + else if (name_len < xattr->name_len) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + xattr = NULL; + } + + if (!xattr) { + new = 1; + xattr = *newxattr; + xattr->name = name; + xattr->name_len = name_len; + xattr->should_free_name = should_free_name; + + ci->i_xattrs.count++; + dout("__set_xattr count=%d\n", ci->i_xattrs.count); + } else { + kfree(*newxattr); + *newxattr = NULL; + if (xattr->should_free_val) + kfree((void *)xattr->val); + + if (should_free_name) { + kfree((void *)name); + name = xattr->name; + } + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + } + ci->i_xattrs.names_size += name_len; + ci->i_xattrs.vals_size += val_len; + if (val) + xattr->val = val; + else + xattr->val = ""; + + xattr->val_len = val_len; + xattr->dirty = dirty; + xattr->should_free_val = (val && should_free_val); + + if (new) { + rb_link_node(&xattr->node, parent, p); + rb_insert_color(&xattr->node, &ci->i_xattrs.index); + dout("__set_xattr_val p=%p\n", p); + } + + dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", + ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); + + return 0; +} + +static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); + int c; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; + if (c < 0) + p = &(*p)->rb_left; + else if 
(c > 0) + p = &(*p)->rb_right; + else { + dout("__get_xattr %s: found %.*s\n", name, + xattr->val_len, xattr->val); + return xattr; + } + } + + dout("__get_xattr %s: not found\n", name); + + return NULL; +} + +static void __free_xattr(struct ceph_inode_xattr *xattr) +{ + BUG_ON(!xattr); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + kfree(xattr); +} + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr) +{ + if (!xattr) + return -EOPNOTSUPP; + + rb_erase(&xattr->node, &ci->i_xattrs.index); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + ci->i_xattrs.count--; + kfree(xattr); + + return 0; +} + +static int __remove_xattr_by_name(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct ceph_inode_xattr *xattr; + int err; + + p = &ci->i_xattrs.index.rb_node; + xattr = __get_xattr(ci, name); + err = __remove_xattr(ci, xattr); + return err; +} + +static char *__copy_xattr_names(struct ceph_inode_info *ci, + char *dest) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + memcpy(dest, xattr->name, xattr->name_len); + dest[xattr->name_len] = '\0'; + + dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); + + dest += xattr->name_len + 1; + p = rb_next(p); + } + + return dest; +} + +void __ceph_destroy_xattrs(struct ceph_inode_info *ci) +{ + struct rb_node *p, *tmp; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + + dout("__ceph_destroy_xattrs p=%p\n", p); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + tmp = p; + p = rb_next(tmp); + dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, + xattr->name_len, xattr->name); + rb_erase(tmp, &ci->i_xattrs.index); + + __free_xattr(xattr); + } + + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.index_version = 0; + ci->i_xattrs.count = 0; + ci->i_xattrs.index = RB_ROOT; +} + +static int __build_xattrs(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) +{ + u32 namelen; + u32 numattr = 0; + void *p, *end; + u32 len; + const char *name, *val; + struct ceph_inode_info *ci = ceph_inode(inode); + int xattr_version; + struct ceph_inode_xattr **xattrs = NULL; + int err = 0; + int i; + + dout("__build_xattrs() len=%d\n", + ci->i_xattrs.blob ? 
(int)ci->i_xattrs.blob->vec.iov_len : 0); + + if (ci->i_xattrs.index_version >= ci->i_xattrs.version) + return 0; /* already built */ + + __ceph_destroy_xattrs(ci); + +start: + /* updated internal xattr rb tree */ + if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { + p = ci->i_xattrs.blob->vec.iov_base; + end = p + ci->i_xattrs.blob->vec.iov_len; + ceph_decode_32_safe(&p, end, numattr, bad); + xattr_version = ci->i_xattrs.version; + spin_unlock(&inode->i_lock); + + xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), + GFP_NOFS); + err = -ENOMEM; + if (!xattrs) + goto bad_lock; + memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); + for (i = 0; i < numattr; i++) { + xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), + GFP_NOFS); + if (!xattrs[i]) + goto bad_lock; + } + + spin_lock(&inode->i_lock); + if (ci->i_xattrs.version != xattr_version) { + /* lost a race, retry */ + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + goto start; + } + err = -EIO; + while (numattr--) { + ceph_decode_32_safe(&p, end, len, bad); + namelen = len; + name = p; + p += len; + ceph_decode_32_safe(&p, end, len, bad); + val = p; + p += len; + + err = __set_xattr(ci, name, namelen, val, len, + 0, 0, 0, &xattrs[numattr]); + + if (err < 0) + goto bad; + } + kfree(xattrs); + } + ci->i_xattrs.index_version = ci->i_xattrs.version; + ci->i_xattrs.dirty = false; + + return err; +bad_lock: + spin_lock(&inode->i_lock); +bad: + if (xattrs) { + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + } + ci->i_xattrs.names_size = 0; + return err; +} + +static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, + int val_size) +{ + /* + * 4 bytes for the length, and additional 4 bytes per each xattr name, + * 4 bytes per each value + */ + int size = 4 + ci->i_xattrs.count*(4 + 4) + + ci->i_xattrs.names_size + + ci->i_xattrs.vals_size; + dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", + ci->i_xattrs.count, ci->i_xattrs.names_size, + ci->i_xattrs.vals_size); + + if (name_size) + size += 4 + 4 + name_size + val_size; + + return size; +} + +/* + * If there are dirty xattrs, reencode xattrs into the prealloc_blob + * and swap into place. 
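+ *
+ * The encoded layout matches what __build_xattrs() decodes: a 32-bit xattr
+ * count, then for each xattr a 32-bit name length, the name bytes, a 32-bit
+ * value length and the value bytes (hence the sizing in
+ * __get_required_blob_size()).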
+ */ +void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + void *dest; + + dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + if (ci->i_xattrs.dirty) { + int need = __get_required_blob_size(ci, 0, 0); + + BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); + + p = rb_first(&ci->i_xattrs.index); + dest = ci->i_xattrs.prealloc_blob->vec.iov_base; + + ceph_encode_32(&dest, ci->i_xattrs.count); + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + + ceph_encode_32(&dest, xattr->name_len); + memcpy(dest, xattr->name, xattr->name_len); + dest += xattr->name_len; + ceph_encode_32(&dest, xattr->val_len); + memcpy(dest, xattr->val, xattr->val_len); + dest += xattr->val_len; + + p = rb_next(p); + } + + /* adjust buffer len; it may be larger than we need */ + ci->i_xattrs.prealloc_blob->vec.iov_len = + dest - ci->i_xattrs.prealloc_blob->vec.iov_base; + + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; + } +} + +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int err; + struct ceph_inode_xattr *xattr; + struct ceph_vxattr_cb *vxattr = NULL; + + if (!ceph_is_valid_xattr(name)) + return -ENODATA; + + /* let's see if a virtual xattr was requested */ + if (vxattrs) + vxattr = ceph_match_vxattr(vxattrs, name); + + spin_lock(&inode->i_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { + goto get_xattr; + } else { + spin_unlock(&inode->i_lock); + /* get xattrs from mds (if we don't already have them) */ + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + if (err) + return err; + } + + spin_lock(&inode->i_lock); + + if (vxattr && vxattr->readonly) { + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + +get_xattr: + err = -ENODATA; /* == ENOATTR */ + xattr = __get_xattr(ci, name); + if (!xattr) { + if (vxattr) + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + + err = -ERANGE; + if (size && size < xattr->val_len) + goto out; + + err = xattr->val_len; + if (size == 0) + goto out; + + memcpy(value, xattr->val, xattr->val_len); + +out: + spin_unlock(&inode->i_lock); + return err; +} + +ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + u32 vir_namelen = 0; + u32 namelen; + int err; + u32 len; + int i; + + spin_lock(&inode->i_lock); + dout("listxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { + goto list_xattr; + } else { + spin_unlock(&inode->i_lock); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + if (err) + return err; + } + + spin_lock(&inode->i_lock); + + err = __build_xattrs(inode); + if (err < 0) + goto out; + +list_xattr: + vir_namelen = 0; + /* 
include virtual dir xattrs */ + if (vxattrs) + for (i = 0; vxattrs[i].name; i++) + vir_namelen += strlen(vxattrs[i].name) + 1; + /* adding 1 byte per each variable due to the null termination */ + namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + err = -ERANGE; + if (size && namelen > size) + goto out; + + err = namelen; + if (size == 0) + goto out; + + names = __copy_xattr_names(ci, names); + + /* virtual xattr names, too */ + if (vxattrs) + for (i = 0; vxattrs[i].name; i++) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + } + +out: + spin_unlock(&inode->i_lock); + return err; +} + +static int ceph_sync_setxattr(struct dentry *dentry, const char *name, + const char *value, size_t size, int flags) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *parent_inode = dentry->d_parent->d_inode; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = fsc->mdsc; + int err; + int i, nr_pages; + struct page **pages = NULL; + void *kaddr; + + /* copy value into some pages */ + nr_pages = calc_pages_for(0, size); + if (nr_pages) { + pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); + if (!pages) + return -ENOMEM; + err = -ENOMEM; + for (i = 0; i < nr_pages; i++) { + pages[i] = __page_cache_alloc(GFP_NOFS); + if (!pages[i]) { + nr_pages = i; + goto out; + } + kaddr = kmap(pages[i]); + memcpy(kaddr, value + i*PAGE_CACHE_SIZE, + min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); + } + } + + dout("setxattr value=%.*s\n", (int)size, value); + + /* do request */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_path2 = kstrdup(name, GFP_NOFS); + + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_data_len = size; + + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + +out: + if (pages) { + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); + } + return err; +} + +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int err; + int name_len = strlen(name); + int val_len = size; + char *newname = NULL; + char *newval = NULL; + struct ceph_inode_xattr *xattr = NULL; + int issued; + int required_blob_size; + int dirty; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (vxattrs) { + struct ceph_vxattr_cb *vxattr = + ceph_match_vxattr(vxattrs, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + } + + /* preallocate memory for xattr name, value, index node */ + err = -ENOMEM; + newname = kmemdup(name, name_len + 1, GFP_NOFS); + if (!newname) + goto out; + + if (val_len) { + newval = kmalloc(val_len + 1, GFP_NOFS); + if (!newval) + goto out; + memcpy(newval, value, val_len); + newval[val_len] = '\0'; + } + + xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); + if (!xattr) + goto out; + + 
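+	/*
+	 * Fast path: while holding the XATTR_EXCL cap we can update the
+	 * cached xattrs locally and mark them dirty; otherwise fall through
+	 * to do_sync and send a synchronous SETXATTR to the MDS.  If the
+	 * preallocated blob is too small, drop i_lock, allocate a larger
+	 * one and retry.
+	 */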
spin_lock(&inode->i_lock); +retry: + issued = __ceph_caps_issued(ci, NULL); + if (!(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob = NULL; + + spin_unlock(&inode->i_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&inode->i_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); + err = __set_xattr(ci, newname, name_len, newval, + val_len, 1, 1, 1, &xattr); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + return err; + +do_sync: + spin_unlock(&inode->i_lock); + err = ceph_sync_setxattr(dentry, name, value, size, flags); +out: + kfree(newname); + kfree(newval); + kfree(xattr); + return err; +} + +static int ceph_send_removexattr(struct dentry *dentry, const char *name) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = dentry->d_inode; + struct inode *parent_inode = dentry->d_parent->d_inode; + struct ceph_mds_request *req; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + req->r_num_caps = 1; + req->r_path2 = kstrdup(name, GFP_NOFS); + + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + return err; +} + +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; + int err; + int dirty; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (vxattrs) { + struct ceph_vxattr_cb *vxattr = + ceph_match_vxattr(vxattrs, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + } + + spin_lock(&inode->i_lock); + __build_xattrs(inode); + issued = __ceph_caps_issued(ci, NULL); + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (!(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + + err = __remove_xattr_by_name(ceph_inode(inode), name); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + + spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + return err; +do_sync: + spin_unlock(&inode->i_lock); + err = ceph_send_removexattr(dentry, name); + return err; +} + diff --git a/fs/char_dev.c b/fs/char_dev.c new file mode 100644 index 00000000..dca9e5e0 --- /dev/null +++ b/fs/char_dev.c @@ -0,0 +1,575 @@ +/* + * linux/fs/char_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * capabilities for /dev/mem, /dev/kmem and similar 
directly mappable character + * devices + * - permits shared-mmap for read, write and/or exec + * - does not permit private mmap in NOMMU mode (can't do COW) + * - no readahead or I/O queue unplugging required + */ +struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", + .capabilities = ( +#ifdef CONFIG_MMU + /* permit private copies of the data to be taken */ + BDI_CAP_MAP_COPY | +#endif + /* permit direct mmap, for read, write or exec */ + BDI_CAP_MAP_DIRECT | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | + /* no writeback happens */ + BDI_CAP_NO_ACCT_AND_WRITEBACK), +}; + +static struct kobj_map *cdev_map; + +static DEFINE_MUTEX(chrdevs_lock); + +static struct char_device_struct { + struct char_device_struct *next; + unsigned int major; + unsigned int baseminor; + int minorct; + char name[64]; + struct cdev *cdev; /* will die */ +} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; + +/* index in the above */ +static inline int major_to_index(unsigned major) +{ + return major % CHRDEV_MAJOR_HASH_SIZE; +} + +#ifdef CONFIG_PROC_FS + +void chrdev_show(struct seq_file *f, off_t offset) +{ + struct char_device_struct *cd; + + if (offset < CHRDEV_MAJOR_HASH_SIZE) { + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[offset]; cd; cd = cd->next) + seq_printf(f, "%3d %s\n", cd->major, cd->name); + mutex_unlock(&chrdevs_lock); + } +} + +#endif /* CONFIG_PROC_FS */ + +/* + * Register a single major with a specified minor range. + * + * If major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If major > 0 this function will attempt to reserve the passed range of + * minors and will return zero on success. + * + * Returns a -ve errno on failure. + */ +static struct char_device_struct * +__register_chrdev_region(unsigned int major, unsigned int baseminor, + int minorct, const char *name) +{ + struct char_device_struct *cd, **cp; + int ret = 0; + int i; + + cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); + if (cd == NULL) + return ERR_PTR(-ENOMEM); + + mutex_lock(&chrdevs_lock); + + /* temporary */ + if (major == 0) { + for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { + if (chrdevs[i] == NULL) + break; + } + + if (i == 0) { + ret = -EBUSY; + goto out; + } + major = i; + ret = major; + } + + cd->major = major; + cd->baseminor = baseminor; + cd->minorct = minorct; + strlcpy(cd->name, name, sizeof(cd->name)); + + i = major_to_index(major); + + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major > major || + ((*cp)->major == major && + (((*cp)->baseminor >= baseminor) || + ((*cp)->baseminor + (*cp)->minorct > baseminor)))) + break; + + /* Check for overlapping minor ranges. */ + if (*cp && (*cp)->major == major) { + int old_min = (*cp)->baseminor; + int old_max = (*cp)->baseminor + (*cp)->minorct - 1; + int new_min = baseminor; + int new_max = baseminor + minorct - 1; + + /* New driver overlaps from the left. */ + if (new_max >= old_min && new_max <= old_max) { + ret = -EBUSY; + goto out; + } + + /* New driver overlaps from the right. 
*/ + if (new_min <= old_max && new_min >= old_min) { + ret = -EBUSY; + goto out; + } + } + + cd->next = *cp; + *cp = cd; + mutex_unlock(&chrdevs_lock); + return cd; +out: + mutex_unlock(&chrdevs_lock); + kfree(cd); + return ERR_PTR(ret); +} + +static struct char_device_struct * +__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) +{ + struct char_device_struct *cd = NULL, **cp; + int i = major_to_index(major); + + mutex_lock(&chrdevs_lock); + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major == major && + (*cp)->baseminor == baseminor && + (*cp)->minorct == minorct) + break; + if (*cp) { + cd = *cp; + *cp = cd->next; + } + mutex_unlock(&chrdevs_lock); + return cd; +} + +/** + * register_chrdev_region() - register a range of device numbers + * @from: the first in the desired range of device numbers; must include + * the major number. + * @count: the number of consecutive device numbers required + * @name: the name of the device or driver. + * + * Return value is zero on success, a negative error code on failure. + */ +int register_chrdev_region(dev_t from, unsigned count, const char *name) +{ + struct char_device_struct *cd; + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + cd = __register_chrdev_region(MAJOR(n), MINOR(n), + next - n, name); + if (IS_ERR(cd)) + goto fail; + } + return 0; +fail: + to = n; + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } + return PTR_ERR(cd); +} + +/** + * alloc_chrdev_region() - register a range of char device numbers + * @dev: output parameter for first assigned number + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: the name of the associated device or driver + * + * Allocates a range of char device numbers. The major number will be + * chosen dynamically, and returned (along with the first minor number) + * in @dev. Returns zero or a negative error code. + */ +int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, + const char *name) +{ + struct char_device_struct *cd; + cd = __register_chrdev_region(0, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + *dev = MKDEV(cd->major, cd->baseminor); + return 0; +} + +/** + * __register_chrdev() - create and register a cdev occupying a range of minors + * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. 
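+ *
+ * A minimal usage sketch of the dynamic-major case (the name and fops
+ * below are purely illustrative):
+ *
+ *	int major = __register_chrdev(0, 0, 1, "mydev", &mydev_fops);
+ *	if (major < 0)
+ *		return major;
+ *	...
+ *	__unregister_chrdev(major, 0, 1, "mydev");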
+ */ +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) +{ + struct char_device_struct *cd; + struct cdev *cdev; + int err = -ENOMEM; + + cd = __register_chrdev_region(major, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + + cdev = cdev_alloc(); + if (!cdev) + goto out2; + + cdev->owner = fops->owner; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, "%s", name); + + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); + if (err) + goto out; + + cd->cdev = cdev; + + return major ? 0 : cd->major; +out: + kobject_put(&cdev->kobj); +out2: + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); + return err; +} + +/** + * unregister_chrdev_region() - return a range of device numbers + * @from: the first in the range of numbers to unregister + * @count: the number of device numbers to unregister + * + * This function will unregister a range of @count device numbers, + * starting with @from. The caller should normally be the one who + * allocated those numbers in the first place... + */ +void unregister_chrdev_region(dev_t from, unsigned count) +{ + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } +} + +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) +{ + struct char_device_struct *cd; + + cd = __unregister_chrdev_region(major, baseminor, count); + if (cd && cd->cdev) + cdev_del(cd->cdev); + kfree(cd); +} + +static DEFINE_SPINLOCK(cdev_lock); + +static struct kobject *cdev_get(struct cdev *p) +{ + struct module *owner = p->owner; + struct kobject *kobj; + + if (owner && !try_module_get(owner)) + return NULL; + kobj = kobject_get(&p->kobj); + if (!kobj) + module_put(owner); + return kobj; +} + +void cdev_put(struct cdev *p) +{ + if (p) { + struct module *owner = p->owner; + kobject_put(&p->kobj); + module_put(owner); + } +} + +/* + * Called every time a character special file is opened + */ +static int chrdev_open(struct inode *inode, struct file *filp) +{ + struct cdev *p; + struct cdev *new = NULL; + int ret = 0; + + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { + struct kobject *kobj; + int idx; + spin_unlock(&cdev_lock); + kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); + if (!kobj) + return -ENXIO; + new = container_of(kobj, struct cdev, kobj); + spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. 
*/ + p = inode->i_cdev; + if (!p) { + inode->i_cdev = p = new; + list_add(&inode->i_devices, &p->list); + new = NULL; + } else if (!cdev_get(p)) + ret = -ENXIO; + } else if (!cdev_get(p)) + ret = -ENXIO; + spin_unlock(&cdev_lock); + cdev_put(new); + if (ret) + return ret; + + ret = -ENXIO; + filp->f_op = fops_get(p->ops); + if (!filp->f_op) + goto out_cdev_put; + + if (filp->f_op->open) { + ret = filp->f_op->open(inode,filp); + if (ret) + goto out_cdev_put; + } + + return 0; + + out_cdev_put: + cdev_put(p); + return ret; +} + +void cd_forget(struct inode *inode) +{ + spin_lock(&cdev_lock); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + spin_unlock(&cdev_lock); +} + +static void cdev_purge(struct cdev *cdev) +{ + spin_lock(&cdev_lock); + while (!list_empty(&cdev->list)) { + struct inode *inode; + inode = container_of(cdev->list.next, struct inode, i_devices); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + } + spin_unlock(&cdev_lock); +} + +/* + * Dummy default file-operations: the only thing this does + * is contain the open that then fills in the correct operations + * depending on the special file... + */ +const struct file_operations def_chr_fops = { + .open = chrdev_open, + .llseek = noop_llseek, +}; + +static struct kobject *exact_match(dev_t dev, int *part, void *data) +{ + struct cdev *p = data; + return &p->kobj; +} + +static int exact_lock(dev_t dev, void *data) +{ + struct cdev *p = data; + return cdev_get(p) ? 0 : -1; +} + +/** + * cdev_add() - add a char device to the system + * @p: the cdev structure for the device + * @dev: the first device number for which this device is responsible + * @count: the number of consecutive minor numbers corresponding to this + * device + * + * cdev_add() adds the device represented by @p to the system, making it + * live immediately. A negative error code is returned on failure. + */ +int cdev_add(struct cdev *p, dev_t dev, unsigned count) +{ + p->dev = dev; + p->count = count; + return kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); +} + +static void cdev_unmap(dev_t dev, unsigned count) +{ + kobj_unmap(cdev_map, dev, count); +} + +/** + * cdev_del() - remove a cdev from the system + * @p: the cdev structure to be removed + * + * cdev_del() removes @p from the system, possibly freeing the structure + * itself. + */ +void cdev_del(struct cdev *p) +{ + cdev_unmap(p->dev, p->count); + kobject_put(&p->kobj); +} + + +static void cdev_default_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + cdev_purge(p); +} + +static void cdev_dynamic_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + cdev_purge(p); + kfree(p); +} + +static struct kobj_type ktype_cdev_default = { + .release = cdev_default_release, +}; + +static struct kobj_type ktype_cdev_dynamic = { + .release = cdev_dynamic_release, +}; + +/** + * cdev_alloc() - allocate a cdev structure + * + * Allocates and returns a cdev structure, or NULL on failure. + */ +struct cdev *cdev_alloc(void) +{ + struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); + if (p) { + INIT_LIST_HEAD(&p->list); + kobject_init(&p->kobj, &ktype_cdev_dynamic); + } + return p; +} + +/** + * cdev_init() - initialize a cdev structure + * @cdev: the structure to initialize + * @fops: the file_operations for this device + * + * Initializes @cdev, remembering @fops, making it ready to add to the + * system with cdev_add(). 
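+ *
+ * A typical (illustrative) sequence for a cdev embedded in a driver-private
+ * structure, assuming my_cdev, my_fops and a dev_t devt obtained from
+ * alloc_chrdev_region():
+ *
+ *	cdev_init(&my_cdev, &my_fops);
+ *	my_cdev.owner = THIS_MODULE;
+ *	err = cdev_add(&my_cdev, devt, 1);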
+ */ +void cdev_init(struct cdev *cdev, const struct file_operations *fops) +{ + memset(cdev, 0, sizeof *cdev); + INIT_LIST_HEAD(&cdev->list); + kobject_init(&cdev->kobj, &ktype_cdev_default); + cdev->ops = fops; +} + +static struct kobject *base_probe(dev_t dev, int *part, void *data) +{ + if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("char-major-%d", MAJOR(dev)); + return NULL; +} + +void __init chrdev_init(void) +{ + cdev_map = kobj_map_init(base_probe, &chrdevs_lock); + bdi_init(&directly_mappable_cdev_bdi); +} + + +/* Let modules do char dev stuff */ +EXPORT_SYMBOL(register_chrdev_region); +EXPORT_SYMBOL(unregister_chrdev_region); +EXPORT_SYMBOL(alloc_chrdev_region); +EXPORT_SYMBOL(cdev_init); +EXPORT_SYMBOL(cdev_alloc); +EXPORT_SYMBOL(cdev_del); +EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); +EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS new file mode 100644 index 00000000..ea940b1d --- /dev/null +++ b/fs/cifs/AUTHORS @@ -0,0 +1,55 @@ +Original Author +=============== +Steve French (sfrench@samba.org) + +The author wishes to express his appreciation and thanks to: +Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS +improvements. Thanks to IBM for allowing me time and test resources to pursue +this project, to Jim McDonough from IBM (and the Samba Team) for his help, to +the IBM Linux JFS team for explaining many esoteric Linux filesystem features. +Jeremy Allison of the Samba team has done invaluable work in adding the server +side of the original CIFS Unix extensions and reviewing and implementing +portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank +Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client) +for proving years ago that very good smb/cifs clients could be done on Unix-like +operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John +Newbigin and others for their work on the Linux smbfs module. Thanks to +the other members of the Storage Network Industry Association CIFS Technical +Workgroup for their work specifying this highly complex protocol and finally +thanks to the Samba team for their technical advice and encouragement. + +Patch Contributors +------------------ +Zwane Mwaikambo +Andi Kleen +Amrut Joshi +Shobhit Dayal +Sergey Vlasov +Richard Hughes +Yury Umanets +Mark Hamzy (for some of the early cifs IPv6 work) +Domen Puncer +Jesper Juhl (in particular for lots of whitespace/formatting cleanup) +Vince Negri and Dave Stahl (for finding an important caching bug) +Adrian Bunk (kcalloc cleanups) +Miklos Szeredi +Kazeon team for various fixes especially for 2.4 version. +Asser Ferno (Change Notify support) +Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup +Gunter Kukkukk (testing and suggestions for support of old servers) +Igor Mammedov (DFS support) +Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) + +Test case and Bug Report contributors +------------------------------------- +Thanks to those in the community who have submitted detailed bug reports +and debug of problems they have found: Jochen Dolze, David Blaine, +Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori, +Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen, +Olaf Kirch, Kieron Briggs, Nick Millington and others. 
Also special +mention to the Stanford Checker (SWAT) which pointed out many minor +bugs in error paths. Valuable suggestions also have come from Al Viro +and Dave Miller. + +And thanks to the IBM LTC and Power test teams and SuSE testers for +finding multiple bugs during excellent stress test runs. diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES new file mode 100644 index 00000000..bc0025cd --- /dev/null +++ b/fs/cifs/CHANGES @@ -0,0 +1,1065 @@ +Version 1.62 +------------ +Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened +to more strictly handle corrupt frames. + +Version 1.61 +------------ +Fix append problem to Samba servers (files opened with O_APPEND could +have duplicated data). Fix oops in cifs_lookup. Workaround problem +mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. +Disable use of server inode numbers when server only +partially supports them (e.g. for one server querying inode numbers on +FindFirst fails but QPathInfo queries works). Fix oops with dfs in +cifs_put_smb_ses. Fix mmap to work on directio mounts (needed +for OpenOffice when on forcedirectio mount e.g.) + +Version 1.60 +------------- +Fix memory leak in reconnect. Fix oops in DFS mount error path. +Set s_maxbytes to smaller (the max that vfs can handle) so that +sendfile will now work over cifs mounts again. Add noforcegid +and noforceuid mount parameters. Fix small mem leak when using +ntlmv2. Fix 2nd mount to same server but with different port to +be allowed (rather than reusing the 1st port) - only when the +user explicitly overrides the port on the 2nd mount. + +Version 1.59 +------------ +Client uses server inode numbers (which are persistent) rather than +client generated ones by default (mount option "serverino" turned +on by default if server supports it). Add forceuid and forcegid +mount options (so that when negotiating unix extensions specifying +which uid mounted does not immediately force the server's reported +uids to be overridden). Add support for scope mount parm. Improve +hard link detection to use same inode for both. Do not set +read-only dos attribute on directories (for chmod) since Windows +explorer special cases this attribute bit for directories for +a different purpose. + +Version 1.58 +------------ +Guard against buffer overruns in various UCS-2 to UTF-8 string conversions +when the UTF-8 string is composed of unusually long (more than 4 byte) converted +characters. Add support for mounting root of a share which redirects immediately +to DFS target. Convert string conversion functions from Unicode to more +accurately mark string length before allocating memory (which may help the +rare cases where a UTF-8 string is much larger than the UCS2 string that +we converted from). Fix endianness of the vcnum field used during +session setup to distinguish multiple mounts to same server from different +userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental +flag to be set to 2, and mount must enable krb5 to turn on extended security). +Performance of file create to Samba improved (posix create on lookup +removes 1 of 2 network requests sent on file create) + +Version 1.57 +------------ +Improve support for multiple security contexts to the same server. We +used to use the same "vcnumber" for all connections which could cause +the server to treat subsequent connections, especially those that +are authenticated as guest, as reconnections, invalidating the earlier +user's smb session. 
This fix allows cifs to mount multiple times to the +same server with different userids without risking invalidating earlier +established security contexts. fsync now sends SMB Flush operation +to better ensure that we wait for server to write all of the data to +server disk (not just write it over the network). Add new mount +parameter to allow user to disable sending the (slow) SMB flush on +fsync if desired (fsync still flushes all cached write data to the server). +Posix file open support added (turned off after one attempt if server +fails to support it properly, as with Samba server versions prior to 3.3.2) +Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too +little memory for the "nativeFileSystem" field returned by the server +during mount). Endian convert inode numbers if necessary (makes it easier +to compare inode numbers on network files from big endian systems). + +Version 1.56 +------------ +Add "forcemandatorylock" mount option to allow user to use mandatory +rather than posix (advisory) byte range locks, even though server would +support posix byte range locks. Fix query of root inode when prefixpath +specified and user does not have access to query information about the +top of the share. Fix problem in 2.6.28 resolving DFS paths to +Samba servers (worked to Windows). Fix rmdir so that pending search +(readdir) requests do not get invalid results which include the now +removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable +when using DFS. Add better file create support to servers which support +the CIFS POSIX protocol extensions (this adds support for new flags +on create, and improves semantics for write of locked ranges). + +Version 1.55 +------------ +Various fixes to make delete of open files behavior more predictable +(when delete of an open file fails we mark the file as "delete-on-close" +in a way that more servers accept, but only if we can first rename the +file to a temporary name). Add experimental support for more safely +handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp +sends, and also let tcp autotune the socket send and receive buffers. +This reduces the number of EAGAIN errors returned by TCP/IP in +high stress workloads (and the number of retries on socket writes +when sending large SMBWriteX requests). Fix case in which a portion of +data can in some cases not get written to the file on the server before the +file is closed. Fix DFS parsing to properly handle path consumed field, +and to handle certain codepage conversions better. Fix mount and +umount race that can cause oops in mount or umount or reconnect. + +Version 1.54 +------------ +Fix premature write failure on congested networks (we would give up +on EAGAIN from the socket too quickly on large writes). +Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. +Fix endian problems in acl (mode from/to cifs acl) on bigendian +architectures. Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba). Fix memory leak +on dns_upcall (resolving DFS referralls). Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though). 
Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcediretio) mount. +Fix bug in rewinding readdir directory searches. Add nodfs mount option. + +Version 1.53 +------------ +DFS support added (Microsoft Distributed File System client support needed +for referrals which enable a hierarchical name space among servers). +Disable temporary caching of mode bits to servers which do not support +storing of mode (e.g. Windows servers, when client mounts without cifsacl +mount option) and add new "dynperm" mount option to enable temporary caching +of mode (enable old behavior). Fix hang on mount caused when server crashes +tcp session during negotiate protocol. + +Version 1.52 +------------ +Fix oops on second mount to server when null auth is used. +Enable experimental Kerberos support. Return writebehind errors on flush +and sync so that events like out of disk space get reported properly on +cached files. Fix setxattr failure to certain Samba versions. Fix mount +of second share to disconnected server session (autoreconnect on this). +Add ability to modify cifs acls for handling chmod (when mounted with +cifsacl flag). Fix prefixpath path separator so we can handle mounts +with prefixpaths longer than one directory (one path component) when +mounted to Windows servers. Fix slow file open when cifsacl +enabled. Fix memory leak in FindNext when the SMB call returns -EBADF. + + +Version 1.51 +------------ +Fix memory leak in statfs when mounted to very old servers (e.g. +Windows 9x). Add new feature "POSIX open" which allows servers +which support the current POSIX Extensions to provide better semantics +(e.g. delete for open files opened with posix open). Take into +account umask on posix mkdir not just older style mkdir. Add +ability to mount to IPC$ share (which allows CIFS named pipes to be +opened, read and written as if they were files). When 1st tree +connect fails (e.g. due to signing negotiation failure) fix +leak that causes cifsd not to stop and rmmod to fail to cleanup +cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on +bigendian architectures. Fix possible memory corruption when +EAGAIN returned on kern_recvmsg. Return better error if server +requires packet signing but client has disabled it. When mounted +with cifsacl mount option - mode bits are approximated based +on the contents of the ACL of the file or directory. When cifs +mount helper is missing convert make sure that UNC name +has backslash (not forward slash) between ip address of server +and the share name. + +Version 1.50 +------------ +Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is +done with "serverino" mount option). Add support for POSIX Unlink +(helps with certain sharing violation cases when server such as +Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix" +mount option to allow disabling the CIFS Unix Extensions for just +that mount. Fix hang on spinlock in find_writable_file (race when +reopening file after session crash). Byte range unlock request to +windows server could unlock more bytes (on server copy of file) +than intended if start of unlock request is well before start of +a previous byte range lock that we issued. + +Version 1.49 +------------ +IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6 +address after the "ip=" mount option, at least until mount.cifs is fixed to +handle DNS host to ipv6 name translation). 
Accept override of uid or gid +on mount even when Unix Extensions are negotiated (it used to be ignored +when Unix Extensions were ignored). This allows users to override the +default uid and gid for files when they are certain that the uids or +gids on the server do not match those of the client. Make "sec=none" +mount override username (so that null user connection is attempted) +to match what documentation said. Support for very large reads, over 127K, +available to some newer servers (such as Samba 3.0.26 and later but +note that it also requires setting CIFSMaxBufSize at module install +time to a larger value which may hurt performance in some cases). +Make sign option force signing (or fail if server does not support it). + +Version 1.48 +------------ +Fix mtime bouncing around from local idea of last write times to remote time. +Fix hang (in i_size_read) when simultaneous size update of same remote file +on smp system corrupts sequence number. Do not reread unnecessarily partial page +(which we are about to overwrite anyway) when writing out file opened rw. +When DOS attribute of file on non-Unix server's file changes on the server side +from read-only back to read-write, reflect this change in default file mode +(we had been leaving a file's mode read-only until the inode were reloaded). +Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute +when archive dos attribute not set and we are changing mode back to writeable +on server which does not support the Unix Extensions). Remove read only dos +attribute on chmod when adding any write permission (ie on any of +user/group/other (not all of user/group/other ie 0222) when +mounted to windows. Add support for POSIX MkDir (slight performance +enhancement and eliminates the network race between the mkdir and set +path info of the mode). + + +Version 1.47 +------------ +Fix oops in list_del during mount caused by unaligned string. +Fix file corruption which could occur on some large file +copies caused by writepages page i/o completion bug. +Seek to SEEK_END forces check for update of file size for non-cached +files. Allow file size to be updated on remote extend of locally open, +non-cached file. Fix reconnect to newer Samba servers (or other servers +which support the CIFS Unix/POSIX extensions) so that we again tell the +server the Unix/POSIX cifs capabilities which we support (SetFSInfo). +Add experimental support for new POSIX Open/Mkdir (which returns +stat information on the open, and allows setting the mode). + +Version 1.46 +------------ +Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps. +Allow null user to be specified on mount ("username="). Do not return +EINVAL on readdir when filldir fails due to overwritten blocksize +(fixes FC problem). Return error in rename 2nd attempt retry (ie report +if rename by handle also fails, after rename by path fails, we were +not reporting whether the retry worked or not). Fix NTLMv2 to +work to Windows servers (mount with option "sec=ntlmv2"). + +Version 1.45 +------------ +Do not time out lockw calls when using posix extensions. Do not +time out requests if server still responding reasonably fast +on requests on other threads. Improve POSIX locking emulation, +(lock cancel now works, and unlock of merged range works even +to Windows servers now). Fix oops on mount to lanman servers +(win9x, os/2 etc.) when null password. Do not send listxattr +(SMB to query all EAs) if nouser_xattr specified. 
Fix SE Linux +problem (instantiate inodes/dentries in right order for readdir). + +Version 1.44 +------------ +Rewritten sessionsetup support, including support for legacy SMB +session setup needed for OS/2 and older servers such as Windows 95 and 98. +Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst +so we can do search (ls etc.) to OS/2. Do not send NTCreateX +or recent levels of FindFirst unless server says it supports NT SMBs +(instead use legacy equivalents from LANMAN dialect). Fix to allow +NTLMv2 authentication support (now can use stronger password hashing +on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004). +Allow override of global cifs security flags on mount via "sec=" option(s). + +Version 1.43 +------------ +POSIX locking to servers which support CIFS POSIX Extensions +(disabled by default controlled by proc/fs/cifs/Experimental). +Handle conversion of long share names (especially Asian languages) +to Unicode during mount. Fix memory leak in sess struct on reconnect. +Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on +cifs open which helps rare case when setpathinfo fails or server does +not support it. + +Version 1.42 +------------ +Fix slow oplock break when mounted to different servers at the same time and +the tids match and we try to find matching fid on wrong server. Fix read +looping when signing required by server (2.6.16 kernel only). Fix readdir +vs. rename race which could cause each to hang. Return . and .. even +if server does not. Allow searches to skip first three entries and +begin at any location. Fix oops in find_writeable_file. + +Version 1.41 +------------ +Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can +configure stronger authentication. Fix sfu symlinks so they can +be followed (not just recognized). Fix wraparound of bcc on +read responses when buffer size over 64K and also fix wrap of +max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in +cifs_user_read and cifs_readpages (when EAGAIN on send of smb +on socket is returned over and over). Add POSIX (advisory) byte range +locking support (requires server with newest CIFS UNIX Extensions +to the protocol implemented). Slow down negprot slightly in port 139 +RFC1001 case to give session_init time on buggy servers. + +Version 1.40 +------------ +Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance +of readpages by eliminating one extra memcpy. Allow update of file size +from remote server even if file is open for write as long as mount is +directio. Recognize share mode security and send NTLM encrypted password +on tree connect if share mode negotiated. + +Version 1.39 +------------ +Defer close of a file handle slightly if pending writes depend on that handle +(this reduces the EBADF bad file handle errors that can be logged under heavy +stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 +Fix SFU style symlinks and mknod needed for servers which do not support the +CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative +dentries so files that the client sees as deleted but that later get created +on the server will be recognized. Add client side permission check on setattr. +Timeout stuck requests better (where server has never responded or sent corrupt +responses) + +Version 1.38 +------------ +Fix tcp socket retransmission timeouts (e.g. 
on ENOSPACE from the socket) +to be smaller at first (but increasing) so large write performance performance +over GigE is better. Do not hang thread on illegal byte range lock response +from Windows (Windows can send an RFC1001 size which does not match smb size) by +allowing an SMBs TCP length to be up to a few bytes longer than it should be. +wsize and rsize can now be larger than negotiated buffer size if server +supports large readx/writex, even when directio mount flag not specified. +Write size will in many cases now be 16K instead of 4K which greatly helps +file copy performance on lightly loaded networks. Fix oops in dnotify +when experimental config flag enabled. Make cifsFYI more granular. + +Version 1.37 +------------ +Fix readdir caching when unlink removes file in current search buffer, +and this is followed by a rewind search to just before the deleted entry. +Do not attempt to set ctime unless atime and/or mtime change requested +(most servers throw it away anyway). Fix length check of received smbs +to be more accurate. Fix big endian problem with mapchars mount option, +and with a field returned by statfs. + +Version 1.36 +------------ +Add support for mounting to older pre-CIFS servers such as Windows9x and ME. +For these older servers, add option for passing netbios name of server in +on mount (servernetbiosname). Add suspend support for power management, to +avoid cifsd thread preventing software suspend from working. +Add mount option for disabling the default behavior of sending byte range lock +requests to the server (necessary for certain applications which break with +mandatory lock behavior such as Evolution), and also mount option for +requesting case insensitive matching for path based requests (requesting +case sensitive is the default). + +Version 1.35 +------------ +Add writepage performance improvements. Fix path name conversions +for long filenames on mounts which were done with "mapchars" mount option +specified. Ensure multiplex ids do not collide. Fix case in which +rmmod can oops if done soon after last unmount. Fix truncated +search (readdir) output when resume filename was a long filename. +Fix filename conversion when mapchars mount option was specified and +filename was a long filename. + +Version 1.34 +------------ +Fix error mapping of the TOO_MANY_LINKS (hardlinks) case. +Do not oops if root user kills cifs oplock kernel thread or +kills the cifsd thread (NB: killing the cifs kernel threads is not +recommended, unmount and rmmod cifs will kill them when they are +no longer needed). Fix readdir to ASCII servers (ie older servers +which do not support Unicode) and also require asterisk. +Fix out of memory case in which data could be written one page +off in the page cache. + +Version 1.33 +------------ +Fix caching problem, in which readdir of directory containing a file +which was cached could cause the file's time stamp to be updated +without invalidating the readahead data (so we could get stale +file data on the client for that file even as the server copy changed). +Cleanup response processing so cifsd can not loop when abnormally +terminated. + + +Version 1.32 +------------ +Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one +transact response for an SMB request and search entry split across two frames. +Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server) +as new protocol extensions. 
Do not send Get/Set calls for POSIX ACLs +unless server explicitly claims to support them in CIFS Unix extensions +POSIX ACL capability bit. Fix packet signing when multiuser mounting with +different users from the same client to the same server. Fix oops in +cifs_close. Add mount option for remapping reserved characters in +filenames (also allow recognizing files with created by SFU which have any +of these seven reserved characters, except backslash, to be recognized). +Fix invalid transact2 message (we were sometimes trying to interpret +oplock breaks as SMB responses). Add ioctl for checking that the +current uid matches the uid of the mounter (needed by umount.cifs). +Reduce the number of large buffer allocations in cifs response processing +(significantly reduces memory pressure under heavy stress with multiple +processes accessing the same server at the same time). + +Version 1.31 +------------ +Fix updates of DOS attributes and time fields so that files on NT4 servers +do not get marked delete on close. Display sizes of cifs buffer pools in +cifs stats. Fix oops in unmount when cifsd thread being killed by +shutdown. Add generic readv/writev and aio support. Report inode numbers +consistently in readdir and lookup (when serverino mount option is +specified use the inode number that the server reports - for both lookup +and readdir, otherwise by default the locally generated inode number is used +for inodes created in either path since servers are not always able to +provide unique inode numbers when exporting multiple volumes from under one +sharename). + +Version 1.30 +------------ +Allow new nouser_xattr mount parm to disable xattr support for user namespace. +Do not flag user_xattr mount parm in dmesg. Retry failures setting file time +(mostly affects NT4 servers) by retry with handle based network operation. +Add new POSIX Query FS Info for returning statfs info more accurately. +Handle passwords with multiple commas in them. + +Version 1.29 +------------ +Fix default mode in sysfs of cifs module parms. Remove old readdir routine. +Fix capabilities flags for large readx so as to allow reads larger than 64K. + +Version 1.28 +------------ +Add module init parm for large SMB buffer size (to allow it to be changed +from its default of 16K) which is especially useful for large file copy +when mounting with the directio mount option. Fix oops after +returning from mount when experimental ExtendedSecurity enabled and +SpnegoNegotiated returning invalid error. Fix case to retry better when +peek returns from 1 to 3 bytes on socket which should have more data. +Fixed path based calls (such as cifs lookup) to handle path names +longer than 530 (now can handle PATH_MAX). Fix pass through authentication +from Samba server to DC (Samba required dummy LM password). + +Version 1.27 +------------ +Turn off DNOTIFY (directory change notification support) by default +(unless built with the experimental flag) to fix hang with KDE +file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event +waiting on an SMB response) in SendReceive when session dies but +reconnects quickly from another task. Add module init parms for +minimum number of large and small network buffers in the buffer pools, +and for the maximum number of simultaneous requests. + +Version 1.26 +------------ +Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later +and other POSIX CIFS compliant servers. Fix error mapping for getfacl +to EOPNOTSUPP when server does not support posix acls on the wire. 
Fix +improperly zeroed buffer in CIFS Unix extensions set times call. + +Version 1.25 +------------ +Fix internationalization problem in cifs readdir with filenames that map to +longer UTF-8 strings than the string on the wire was in Unicode. Add workaround +for readdir to netapp servers. Fix search rewind (seek into readdir to return +non-consecutive entries). Do not do readdir when server negotiates +buffer size too small to fit filename. Add support for reading POSIX ACLs from +the server (add also acl and noacl mount options). + +Version 1.24 +------------ +Optionally allow using server side inode numbers, rather than client generated +ones by specifying mount option "serverino" - this is required for some apps +to work which double check hardlinked files and have persistent inode numbers. + +Version 1.23 +------------ +Multiple bigendian fixes. On little endian systems (for reconnect after +network failure) fix tcp session reconnect code so we do not try first +to reconnect on reverse of port 445. Treat reparse points (NTFS junctions) +as directories rather than symlinks because we can do follow link on them. + +Version 1.22 +------------ +Add config option to enable XATTR (extended attribute) support, mapping +xattr names in the "user." namespace to SMB/CIFS EAs. Lots of +minor fixes pointed out by the Stanford SWAT checker (mostly missing +or out of order NULL pointer checks in little used error paths). + +Version 1.21 +------------ +Add new mount parm to control whether mode check (generic_permission) is done +on the client. If Unix extensions are enabled and the uids on the client +and server do not match, client permission checks are meaningless on +server uids that do not exist on the client (this does not affect the +normal ACL check which occurs on the server). Fix default uid +on mknod to match create and mkdir. Add optional mount parm to allow +override of the default uid behavior (in which the server sets the uid +and gid of newly created files). Normally for network filesystem mounts +users want the server to set the uid/gid on newly created files (rather than +using the uid of the client processes as you would in a local filesystem). + +Version 1.20 +------------ +Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps +info into /proc/fs/cifs/DebugData. Fix rare oops in readdir +(in build_wildcard_path_from_dentry). Fix mknod to pass type field +(block/char/fifo) properly. Remove spurious mount warning log entry when +credentials passed as mount argument. Set major/minor device number in +inode for block and char devices when unix extensions enabled. + +Version 1.19 +------------ +Fix /proc/fs/cifs/Stats and DebugData display to handle larger +amounts of return data. Properly limit requests to MAX_REQ (50 +is the usual maximum active multiplex SMB/CIFS requests per server). +Do not kill cifsd (and thus hurt the other SMB session) when more than one +session to the same server (but with different userids) exists and one +of the two user's smb sessions is being removed while leaving the other. +Do not loop reconnecting in cifsd demultiplex thread when admin +kills the thread without going through unmount. + +Version 1.18 +------------ +Do not rename hardlinked files (since that should be a noop). Flush +cached write behind data when reopening a file after session abend, +except when already in write. Grab per socket sem during reconnect +to avoid oops in sendmsg if overlapping with reconnect.
Do not +reset cached inode file size on readdir for files open for write on +client. + + +Version 1.17 +------------ +Update number of blocks in file so du command is happier (in Linux a fake +blocksize of 512 is required for calculating number of blocks in inode). +Fix prepare write of partial pages to read in data from server if possible. +Fix race on tcpStatus field between unmount and reconnection code, causing +cifsd process sometimes to hang around forever. Improve out of memory +checks in cifs_filldir. + +Version 1.16 +------------ +Fix incorrect file size in file handle based setattr on big endian hardware. +Fix oops in build_path_from_dentry when out of memory. Add checks for invalid +and closing file structs in writepage/partialpagewrite. Add statistics +for each mounted share (new menuconfig option). Fix endianness problem in +volume information displayed in /proc/fs/cifs/DebugData (only affects +big endian architectures). Prevent renames while constructing +path names for open, mkdir and rmdir. + +Version 1.15 +------------ +Change to mempools for alloc smb request buffers and multiplex structs +to better handle low memory problems (and potential deadlocks). + +Version 1.14 +------------ +Fix incomplete listings of large directories on Samba servers when Unix +extensions enabled. Fix oops when smb_buffer can not be allocated. Fix +rename deadlock when writing out dirty pages at same time. + +Version 1.13 +------------ +Fix open of files in which O_CREAT can cause the mode to change in +some cases. Fix case in which retry of write overlaps file close. +Fix PPC64 build error. Reduce excessive stack usage in smb password +hashing. Fix overwrite of Linux user's view of file mode to Windows servers. + +Version 1.12 +------------ +Fixes for large file copy, signal handling, socket retry, buffer +allocation and low memory situations. + +Version 1.11 +------------ +Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize) +also now allowing support for specifying client netbiosname. NT4 support added. + +Version 1.10 +------------ +Fix reconnection (and certain failed mounts) to properly wake up the +blocked users thread so it does not seem hung (in some cases was blocked +until the cifs receive timeout expired). Fix spurious error logging +to kernel log when application with open network files killed. + +Version 1.09 +------------ +Fix /proc/fs module unload warning message (that could be logged +to the kernel log). Fix intermittent failure in connectathon +test7 (hardlink count not immediately refreshed in case in which +inode metadata can be incorrectly kept cached when time near zero). + +Version 1.08 +------------ +Allow file_mode and dir_mode (specified at mount time) to be enforced +locally (the server already enforced its own ACLs too) for servers +that do not report the correct mode (do not support the +CIFS Unix Extensions). + +Version 1.07 +------------ +Fix some small memory leaks in some unmount error paths. Fix major leak +of cache pages in readpages causing multiple read oriented stress +testcases (including fsx, and even large file copy) to fail over time. + +Version 1.06 +------------ +Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server. +This allows files that differ only in case and improves performance of file +creation and file open to such servers.
Fix semaphore conflict which causes +slow delete of open file to Samba (which unfortunately can cause an oplock +break to self while vfs_unlink held i_sem) which can hang for 20 seconds. + +Version 1.05 +------------ +fixes to cifs_readpages for fsx test case + +Version 1.04 +------------ +Fix caching data integrity bug when extending file size especially when no +oplock on file. Fix spurious logging of valid already parsed mount options +that are parsed outside of the cifs vfs such as nosuid. + + +Version 1.03 +------------ +Connect to server when port number override not specified, and tcp port +unitialized. Reset search to restart at correct file when kernel routine +filldir returns error during large directory searches (readdir). + +Version 1.02 +------------ +Fix caching problem when files opened by multiple clients in which +page cache could contain stale data, and write through did +not occur often enough while file was still open when read ahead +(read oplock) not allowed. Treat "sep=" when first mount option +as an override of comma as the default separator between mount +options. + +Version 1.01 +------------ +Allow passwords longer than 16 bytes. Allow null password string. + +Version 1.00 +------------ +Gracefully clean up failed mounts when attempting to mount to servers such as +Windows 98 that terminate tcp sessions during protocol negotiation. Handle +embedded commas in mount parsing of passwords. + +Version 0.99 +------------ +Invalidate local inode cached pages on oplock break and when last file +instance is closed so that the client does not continue using stale local +copy rather than later modified server copy of file. Do not reconnect +when server drops the tcp session prematurely before negotiate +protocol response. Fix oops in reopen_file when dentry freed. Allow +the support for CIFS Unix Extensions to be disabled via proc interface. + +Version 0.98 +------------ +Fix hang in commit_write during reconnection of open files under heavy load. +Fix unload_nls oops in a mount failure path. Serialize writes to same socket +which also fixes any possible races when cifs signatures are enabled in SMBs +being sent out of signature sequence number order. + +Version 0.97 +------------ +Fix byte range locking bug (endian problem) causing bad offset and +length. + +Version 0.96 +------------ +Fix oops (in send_sig) caused by CIFS unmount code trying to +wake up the demultiplex thread after it had exited. Do not log +error on harmless oplock release of closed handle. + +Version 0.95 +------------ +Fix unsafe global variable usage and password hash failure on gcc 3.3.1 +Fix problem reconnecting secondary mounts to same server after session +failure. Fix invalid dentry - race in mkdir when directory gets created +by another client between the lookup and mkdir. + +Version 0.94 +------------ +Fix to list processing in reopen_files. Fix reconnection when server hung +but tcpip session still alive. Set proper timeout on socket read. + +Version 0.93 +------------ +Add missing mount options including iocharset. SMP fixes in write and open. +Fix errors in reconnecting after TCP session failure. Fix module unloading +of default nls codepage + +Version 0.92 +------------ +Active smb transactions should never go negative (fix double FreeXid). Fix +list processing in file routines. Check return code on kmalloc in open. +Fix spinlock usage for SMP. + +Version 0.91 +------------ +Fix oops in reopen_files when invalid dentry. drop dentry on server rename +and on revalidate errors. 
Fix cases where pid is now tgid. Fix return code +on create hard link when server does not support them. + +Version 0.90 +------------ +Fix scheduling while atomic error in getting inode info on newly created file. +Fix truncate of existing files opened with O_CREAT but not O_TRUNC set. + +Version 0.89 +------------ +Fix oops on write to dead tcp session. Remove error log write for case when file open +O_CREAT but not O_EXCL + +Version 0.88 +------------ +Fix non-POSIX behavior on rename of open file and delete of open file by taking +advantage of trans2 SetFileInfo rename facility if available on target server. +Retry on ENOSPC and EAGAIN socket errors. + +Version 0.87 +------------ +Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix +allocation size miscalculation. After oplock token lost do not read through +cache. + +Version 0.86 +------------ +Fix oops on empty file readahead. Fix for file size handling for locally cached files. + +Version 0.85 +------------ +Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files +during auto reconnection to server after server recovered from failure. + +Version 0.84 +------------ +Finish support for Linux 2.5 open/create changes, which removes the +redundant NTCreate/QPathInfo/close that was sent during file create. +Enable oplock by default. Enable packet signing by default (needed to +access many recent Windows servers) + +Version 0.83 +------------ +Fix oops when mounting to long server names caused by inverted parms to kmalloc. +Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled +we will choose a cifs user session (smb uid) that better matches the local +uid if a) the mount uid does not match the current uid and b) we have another +session to the same server (ip address) for a different mount which +matches the current local uid. + +Version 0.82 +------------ +Add support for mknod of block or character devices. Fix oplock +code (distributed caching) to properly send response to oplock +break from server. + +Version 0.81 +------------ +Finish up CIFS packet digital signing for the default +NTLM security case. This should help Windows 2003 +network interoperability since it is common for +packet signing to be required now. Fix statfs (stat -f) +which recently started returning errors due to +invalid value (-1 instead of 0) being set in the +struct kstatfs f_ffiles field. + +Version 0.80 +----------- +Fix oops on stopping oplock thread when removing cifs when +built as module. + +Version 0.79 +------------ +Fix mount options for ro (readonly), uid, gid and file and directory mode. + +Version 0.78 +------------ +Fix errors displayed on failed mounts to be more understandable. +Fixed various incorrect or misleading smb to posix error code mappings. + +Version 0.77 +------------ +Fix display of NTFS DFS junctions to display as symlinks. +They are the network equivalent. Fix oops in +cifs_partialpagewrite caused by missing spinlock protection +of openfile linked list. Allow writebehind caching errors to +be returned to the application at file close. + +Version 0.76 +------------ +Clean up options displayed in /proc/mounts by show_options to +be more consistent with other filesystems. + +Version 0.75 +------------ +Fix delete of readonly file to Windows servers. Reflect +presence or absence of read only dos attribute in mode +bits for servers that do not support CIFS Unix extensions. 
+Fix shortened results on readdir of large directories to +servers supporting CIFS Unix extensions (caused by +incorrect resume key). + +Version 0.74 +------------ +Fix truncate bug (set file size) that could cause hangs e.g. running fsx + +Version 0.73 +------------ +unload nls if mount fails. + +Version 0.72 +------------ +Add resume key support to search (readdir) code to workaround +Windows bug. Add /proc/fs/cifs/LookupCacheEnable which +allows disabling caching of attribute information for +lookups. + +Version 0.71 +------------ +Add more oplock handling (distributed caching code). Remove +dead code. Remove excessive stack space utilization from +symlink routines. + +Version 0.70 +------------ +Fix oops in get dfs referral (triggered when null path sent in to +mount). Add support for overriding rsize at mount time. + +Version 0.69 +------------ +Fix buffer overrun in readdir which caused intermittent kernel oopses. +Fix writepage code to release kmap on write data. Allow "-ip=" new +mount option to be passed in on parameter distinct from the first part +(server name portion of) the UNC name. Allow override of the +tcp port of the target server via new mount option "-port=" + +Version 0.68 +------------ +Fix search handle leak on rewind. Fix setuid and gid so that they are +reflected in the local inode immediately. Cleanup of whitespace +to make 2.4 and 2.5 versions more consistent. + + +Version 0.67 +------------ +Fix signal sending so that captive thread (cifsd) exits on umount +(which was causing the warning in kmem_cache_free of the request buffers +at rmmod time). This had broken as a sideeffect of the recent global +kernel change to daemonize. Fix memory leak in readdir code which +showed up in "ls -R" (and applications that did search rewinding). + +Version 0.66 +------------ +Reconnect tids and fids after session reconnection (still do not +reconnect byte range locks though). Fix problem caching +lookup information for directory inodes, improving performance, +especially in deep directory trees. Fix various build warnings. + +Version 0.65 +------------ +Finish fixes to commit write for caching/readahead consistency. fsx +now works to Samba servers. Fix oops caused when readahead +was interrupted by a signal. + +Version 0.64 +------------ +Fix data corruption (in partial page after truncate) that caused fsx to +fail to Windows servers. Cleaned up some extraneous error logging in +common error paths. Add generic sendfile support. + +Version 0.63 +------------ +Fix memory leak in AllocMidQEntry. +Finish reconnection logic, so connection with server can be dropped +(or server rebooted) and the cifs client will reconnect. + +Version 0.62 +------------ +Fix temporary socket leak when bad userid or password specified +(or other SMBSessSetup failure). Increase maximum buffer size to slightly +over 16K to allow negotiation of up to Samba and Windows server default read +sizes. Add support for readpages + +Version 0.61 +------------ +Fix oops when username not passed in on mount. Extensive fixes and improvements +to error logging (strip redundant newlines, change debug macros to ensure newline +passed in and to be more consistent). Fix writepage wrong file handle problem, +a readonly file handle could be incorrectly used to attempt to write out +file updates through the page cache to multiply open files. This could cause +the iozone benchmark to fail on the fwrite test. 
Fix bug mounting two different +shares to the same Windows server when using different usernames +(doing this to Samba servers worked but Windows was rejecting it) - now it is +possible to use different userids when connecting to the same server from a +Linux client. Fix oops when treeDisconnect called during unmount on +previously freed socket. + +Version 0.60 +------------ +Fix oops in readpages caused by not setting address space operations in inode in +rare code path. + +Version 0.59 +------------ +Includes support for deleting of open files and renaming over existing files (per POSIX +requirement). Add readlink support for Windows junction points (directory symlinks). + +Version 0.58 +------------ +Changed read and write to go through pagecache. Added additional address space operations. +Memory mapped operations now working. + +Version 0.57 +------------ +Added writepage code for additional memory mapping support. Fixed leak in xids causing +the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on +every stat call. Additional formatting cleanup. + +Version 0.56 +------------ +Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup. + +Version 0.55 +------------ +Fixes from Zwane Mwaikambo for adding missing return code checking in a few places. +Also included a modified version of his fix to protect global list manipulation of +the smb session and tree connection and mid related global variables. + +Version 0.54 +------------ +Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre +changes to superblock layout. Remove wasteful allocation of smb buffers (now the send +buffer is reused for responses). Add more oplock handling. Additional minor cleanup. + +Version 0.53 +------------ +More stylistic updates to better match kernel style. Add additional statistics +for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2 +and CIFS Packet Signing enablement. + +Version 0.52 +------------ +Replace call to sleep_on with safer wait_on_event. +Make stylistic changes to better match kernel style recommendations. +Remove most typedef usage (except for the PDUs themselves). + +Version 0.51 +------------ +Update mount so the -unc mount option is no longer required (the ip address can be specified +in a UNC style device name. Implementation of readpage/writepage started. + +Version 0.50 +------------ +Fix intermittent problem with incorrect smb header checking on badly +fragmented tcp responses + +Version 0.49 +------------ +Fixes to setting of allocation size and file size. + +Version 0.48 +------------ +Various 2.5.38 fixes. Now works on 2.5.38 + +Version 0.47 +------------ +Prepare for 2.5 kernel merge. Remove ifdefs. + +Version 0.46 +------------ +Socket buffer management fixes. Fix dual free. + +Version 0.45 +------------ +Various big endian fixes for hardlinks and symlinks and also for dfs. + +Version 0.44 +------------ +Various big endian fixes for servers with Unix extensions such as Samba + +Version 0.43 +------------ +Various FindNext fixes for incorrect filenames on large directory searches on big endian +clients. basic posix file i/o tests now work on big endian machines, not just le + +Version 0.42 +------------ +SessionSetup and NegotiateProtocol now work from Big Endian machines. +Various Big Endian fixes found during testing on the Linux on 390. 
Various fixes for compatibility with older +versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7). + +Version 0.41 +------------ +Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked +files now return the correct number of links on fstat as they are repeatedly linked and unlinked. + +Version 0.40 +------------ +Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate +session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP. +Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs +(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos). + +Version 0.38 +------------ +Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable +it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU). + +Version 0.37 +------------ +Rewrote much of connection and mount/unmount logic to handle bugs with +multiple uses to same share, multiple users to same server etc. + +Version 0.36 +------------ +Fixed major problem with dentry corruption (missing call to dput) + +Version 0.35 +------------ +Rewrite of readdir code to fix bug. Various fixes for bigendian machines. +Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs +although corresponding function not fully implemented in the vfs yet + +Version 0.34 +------------ +Fixed dentry caching bug, misc. cleanup + +Version 0.33 +------------ +Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build +on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels. +Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added. + +Version 0.32 +------------ +Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented +and tested against Samba 2.2.5 + + +Version 0.31 +------------ +1) Fixed lockrange to be correct (it was one byte too short) + +2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly +show range as locked when there is a conflict with an existing lock. + +3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories +in most cases. Eventually will offer optional ability to query server for the correct perms. + +3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded +but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb +session) + +4) Fixed error logging of valid mount options + +5) Removed logging of password field. + +6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c +and cleaned them up and made them more consistent with other cifs functions. 
+ +7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways +(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed, +nor is the symlink support using the Unix extensions + +8) Started adding the readlink and follow_link code + +Version 0.3 +----------- +Initial drop + diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig new file mode 100644 index 00000000..f66cc162 --- /dev/null +++ b/fs/cifs/Kconfig @@ -0,0 +1,161 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + select CRYPTO + select CRYPTO_MD4 + select CRYPTO_MD5 + select CRYPTO_HMAC + select CRYPTO_ARC4 + select CRYPTO_ECB + select CRYPTO_DES + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, NT 4 + and Windows XP) as well by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. + +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. 
At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. + + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page + for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Distributed File System (DFS) support is used to access shares + transparently in an enterprise name space, even if the share + moves to a different server. This feature also enables + an upcall mechanism for CIFS which contacts userspace helper + utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. + +config CIFS_FSCACHE + bool "Provide CIFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + +config CIFS_ACL + bool "Provide CIFS ACL support (EXPERIMENTAL)" + depends on EXPERIMENTAL && CIFS_XATTR && KEYS + help + Allows fetching the CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller.
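+
+# For reference only (this comment is not part of any option definition above):
+# a .config fragment that builds the cifs module with several of the optional
+# features described in this file might look like the following. Which of
+# these are appropriate depends on your servers and on other kernel options
+# (for example, CIFS_UPCALL and CIFS_DFS_UPCALL also require KEYS).
+#
+#   CONFIG_CIFS=m
+#   CONFIG_CIFS_STATS=y
+#   CONFIG_CIFS_XATTR=y
+#   CONFIG_CIFS_POSIX=y
+#   CONFIG_CIFS_UPCALL=y
+#   CONFIG_CIFS_DFS_UPCALL=y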
+ +config CIFS_NFSD_EXPORT + bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" + depends on CIFS && EXPERIMENTAL && BROKEN + help + Allows NFS server to export a CIFS mounted share (nfsd over cifs) diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile new file mode 100644 index 00000000..005d524c --- /dev/null +++ b/fs/cifs/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for Linux CIFS VFS client +# +obj-$(CONFIG_CIFS) += cifs.o + +cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ + link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ + cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ + readdir.o ioctl.o sess.o export.o + +cifs-$(CONFIG_CIFS_ACL) += cifsacl.o + +cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o + +cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o + +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o diff --git a/fs/cifs/README b/fs/cifs/README new file mode 100644 index 00000000..c5c2c5e5 --- /dev/null +++ b/fs/cifs/README @@ -0,0 +1,748 @@ +The CIFS VFS support for Linux supports many advanced network filesystem +features such as hierarchical dfs like namespace, hardlinks, locking and more. +It was designed to comply with the SNIA CIFS Technical Reference (which +supersedes the 1992 X/Open SMB Standard) as well as to perform best practice +practical interoperability with Windows 2000, Windows XP, Samba and equivalent +servers. This code was developed in participation with the Protocol Freedom +Information Foundation. + +Please see + http://protocolfreedom.org/ and + http://samba.org/samba/PFIF/ +for more details. + + +For questions or bug reports please contact: + sfrench@samba.org (sfrench@us.ibm.com) + +Build instructions: +================== +For Linux 2.4: +1) Get the kernel source (e.g.from http://www.kernel.org) +and download the cifs vfs source (see the project page +at http://us1.samba.org/samba/Linux_CIFS_client.html) +and change directory into the top of the kernel directory +then patch the kernel (e.g. "patch -p1 < cifs_24.patch") +to add the cifs vfs to your kernel configure options if +it has not already been added (e.g. current SuSE and UL +users do not need to apply the cifs_24.patch since the cifs vfs is +already in the kernel configure menu) and then +mkdir linux/fs/cifs and then copy the current cifs vfs files from +the cifs download to your kernel build directory e.g. + + cp /fs/cifs/* to /fs/cifs + +2) make menuconfig (or make xconfig) +3) select cifs from within the network filesystem choices +4) save and exit +5) make dep +6) make modules (or "make" if CIFS VFS not to be built as a module) + +For Linux 2.6: +1) Download the kernel (e.g. from http://www.kernel.org) +and change directory into the top of the kernel directory tree +(e.g. /usr/src/linux-2.5.73) +2) make menuconfig (or make xconfig) +3) select cifs from within the network filesystem choices +4) save and exit +5) make + + +Installation instructions: +========================= +If you have built the CIFS vfs as module (successfully) simply +type "make modules_install" (or if you prefer, manually copy the file to +the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o). + +If you have built the CIFS vfs into the kernel itself, follow the instructions +for your distribution on how to install a new kernel (usually you +would simply type "make install"). 
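+
+As a quick check after installation (a sketch only; module and path names can
+vary by distribution and kernel version), load the module and confirm that the
+cifs filesystem type has been registered:
+
+   modprobe cifs
+   grep cifs /proc/filesystems
+
+Once the module is loaded, the /proc/fs/cifs directory referenced elsewhere in
+this document should also be present.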
+ +If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on +the CIFS VFS web site) copy it to the same directory in which mount.smbfs and +similar files reside (usually /sbin). Although the helper software is not +required, mount.cifs is recommended. Eventually the Samba 3.0 utility program +"net" may also be helpful since it may someday provide easier mount syntax for +users who are used to Windows e.g. + net use +Note that running the Winbind pam/nss module (logon service) on all of your +Linux clients is useful in mapping Uids and Gids consistently across the +domain to the proper network user. The mount.cifs mount helper can be +trivially built from Samba 3.0 or later source e.g. by executing: + + gcc samba/source/client/mount.cifs.c -o mount.cifs + +If cifs is built as a module, then the size and number of network buffers +and maximum number of simultaneous requests to one server can be configured. +Changing these from their defaults is not recommended. By executing modinfo + modinfo kernel/fs/cifs/cifs.ko +on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made +at module initialization time (by running insmod cifs.ko) can be seen. + +Allowing User Mounts +==================== +To permit users to mount and unmount over directories they own is possible +with the cifs vfs. A way to enable such mounting is to mark the mount.cifs +utility as suid (e.g. "chmod +s /sbin/mount.cifs"). Enabling users to +umount shares they mount requires +1) mount.cifs version 1.4 or later +2) an entry for the share in /etc/fstab indicating that a user may +unmount it e.g. +//server/usersharename /mnt/username cifs user 0 0 + +Note that when the mount.cifs utility is run suid (allowing user mounts), +in order to reduce risks, the "nosuid" mount flag is passed in on mount to +disallow execution of an suid program mounted on the remote target. +When mount is executed as root, nosuid is not passed in by default, +and execution of suid programs on the remote target would be enabled +by default. This can be changed, as with nfs and other filesystems, +by simply specifying "nosuid" among the mount options. For user mounts +though to be able to pass the suid flag to mount requires rebuilding +mount.cifs with the following flag: + + gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs + +There is a corresponding manual page for cifs mounting in the Samba 3.0 and +later source tree in docs/manpages/mount.cifs.8 + +Allowing User Unmounts +====================== +To permit users to unmount directories that they have user mounted (see above), +the utility umount.cifs may be used. It may be invoked directly, or if +umount.cifs is placed in /sbin, umount can invoke the cifs umount helper +(at least for most versions of the umount utility) for umount of cifs +mounts, unless umount is invoked with -i (which will avoid invoking a umount +helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked +as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions +allow adding entries to the /etc/permissions file to achieve the +equivalent suid effect). For this utility to succeed the target path +must be a cifs mount, and the uid of the current user must match the uid +of the user who mounted the resource.
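+
+Putting the two sections above together, one possible setup for a single
+user mount (the share, path and option values below are examples only) is,
+as root:
+
+   chmod +s /sbin/mount.cifs
+   chmod +s /sbin/umount.cifs
+
+and an /etc/fstab entry such as
+
+   //server/usersharename /mnt/username cifs user 0 0
+
+after which an ordinary user can run "mount /mnt/username", and the same
+user can later run "umount /mnt/username", without root privileges.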
+ +Also note that the customary way of allowing user mounts and unmounts is +(instead of using mount.cifs and umount.cifs as suid) to add a line +to the file /etc/fstab for each //server/share you wish to mount, but +this can become unwieldy when potential mount targets include many +or unpredictable UNC names. + +Samba Considerations +==================== +To get the maximum benefit from the CIFS VFS, we recommend using a server that +supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or +Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers. +Note that uid, gid and file permissions will display default values if you do +not have a server that supports the Unix extensions for CIFS (such as Samba +2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add +the line: + + unix extensions = yes + +to your smb.conf file on the server. Note that the following smb.conf settings +are also useful (on the Samba server) when the majority of clients are Unix or +Linux: + + case sensitive = yes + delete readonly = yes + ea support = yes + +Note that server ea support is required for supporting xattrs from the Linux +cifs client, and that EA support is present in later versions of Samba (e.g. +3.0.6 and later). EA support also works in all versions of Windows, at least to +shares on NTFS filesystems. Extended Attribute (xattr) support is an optional +feature of most Linux filesystems which may require enabling via +make menuconfig. Client support for extended attributes (user xattr) can be +disabled on a per-mount basis by specifying "nouser_xattr" on mount. + +The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers +version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and +then POSIX support in the CIFS configuration options when building the cifs +module. POSIX ACL support can be disabled on a per mount basis by specifying +"noacl" on mount. + +Some administrators may want to change Samba's smb.conf "map archive" and +"create mask" parameters from the default. Unless the create mask is changed, +newly created files can end up with an unnecessarily restrictive default mode, +which may not be what you want, although if the CIFS Unix extensions are +enabled on the server and client, subsequent setattr calls (e.g. chmod) can +fix the mode. Note that creating special devices (mknod) remotely +may require specifying a mkdev function to Samba if you are not using +Samba 3.0.6 or later. For more information on these see the manual pages +("man smb.conf") on the Samba server system. Note that the cifs vfs, +unlike the smbfs vfs, does not read the smb.conf on the client system +(the few optional settings are passed in on mount via -o parameters instead). +Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete +open files (required for strict POSIX compliance). Windows Servers already +supported this feature. Samba server does not allow symlinks that refer to files +outside of the share, so in Samba versions prior to 3.0.6, most symlinks to +files with absolute paths (ie beginning with slash) such as: + ln -s /mnt/foo bar +would be forbidden. Samba 3.0.6 server or later includes the ability to create +such symlinks safely by converting unsafe symlinks (ie symlinks to server +files that are outside of the share) to a samba specific format on the server +that is ignored by local server applications and non-cifs clients and that will +not be traversed by the Samba server.
This is opaque to the Linux client +application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or +later, but only for remote clients using the CIFS Unix extensions, and will +be invisible to Windows clients and typically will not affect local +applications running on the same server as Samba. + +Use instructions: +================ +Once the CIFS VFS support is built into the kernel or installed as a module +(cifs.o), you can use mount syntax like the following to access Samba or Windows +servers: + + mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword + +Before -o the option -v may be specified to make the mount.cifs +mount helper display the mount steps more verbosely. +After -o the following commonly used cifs vfs specific options +are supported: + + user= + pass= + domain= + +Other cifs mount options are described below. Use of TCP names (in addition to +ip addresses) is available if the mount helper (mount.cifs) is installed. If +you do not trust the server to which you are mounted, or if you do not have +cifs signing enabled (and the physical network is insecure), consider use +of the standard mount options "noexec" and "nosuid" to reduce the risk of +running an altered binary on your local system (downloaded from a hostile server +or altered by a hostile router). + +Although mounting using a format corresponding to the CIFS URL specification is +not possible in mount.cifs yet, it is possible to use an alternate format +for the server and sharename (which is somewhat similar to NFS style mount +syntax) instead of the more widely used UNC format (i.e. \\server\share): + mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd + +When using the mount helper mount.cifs, passwords may be specified via alternate +mechanisms, instead of specifying them after -o using the normal "pass=" syntax +on the command line: +1) By including it in a credential file. Specify credentials=filename as one +of the mount options. Credential files contain two lines: + username=someuser + password=your_password +2) By specifying the password in the PASSWD environment variable (similarly +the user name can be taken from the USER environment variable). +3) By specifying the password in a file by name via PASSWD_FILE +4) By specifying the password in a file by file descriptor via PASSWD_FD + +If no password is provided, mount.cifs will prompt for password entry. + +Restrictions +============ +Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC +1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a +problem as most servers support this. + +Valid filenames differ between Windows and Linux. Windows typically restricts +filenames which contain certain reserved characters (e.g. the character : +which is used to delimit the beginning of a stream name by Windows), while +Linux allows a slightly wider set of valid characters in filenames. Windows +servers can remap such characters when an explicit mapping is specified in +the Server's registry. Samba starting with version 3.10 will allow such +filenames (ie those which contain valid Linux characters, which normally +would be forbidden for Windows/CIFS semantics) as long as the server is +configured for Unix Extensions (and the client has not disabled +/proc/fs/cifs/LinuxExtensionsEnabled). + + +CIFS VFS Mount Options +====================== +A partial list of the supported mount options follows: + user The user name to use when trying to establish + the CIFS session.
+ password The user password. If the mount helper is + installed, the user will be prompted for password + if not supplied. + ip The ip address of the target server + unc The target server Universal Network Name (export) to + mount. + domain Set the SMB/CIFS workgroup name prepended to the + username during CIFS session establishment + forceuid Set the default uid for inodes to the uid + passed in on mount. For mounts to servers + which do support the CIFS Unix extensions, such as a + properly configured Samba server, the server provides + the uid, gid and mode so this parameter should not be + specified unless the server and clients uid and gid + numbering differ. If the server and client are in the + same domain (e.g. running winbind or nss_ldap) and + the server supports the Unix Extensions then the uid + and gid can be retrieved from the server (and uid + and gid would not have to be specifed on the mount. + For servers which do not support the CIFS Unix + extensions, the default uid (and gid) returned on lookup + of existing files will be the uid (gid) of the person + who executed the mount (root, except when mount.cifs + is configured setuid for user mounts) unless the "uid=" + (gid) mount option is specified. Also note that permission + checks (authorization checks) on accesses to a file occur + at the server, but there are cases in which an administrator + may want to restrict at the client as well. For those + servers which do not report a uid/gid owner + (such as Windows), permissions can also be checked at the + client, and a crude form of client side permission checking + can be enabled by specifying file_mode and dir_mode on + the client. (default) + forcegid (similar to above but for the groupid instead of uid) (default) + noforceuid Fill in file owner information (uid) by requesting it from + the server if possible. With this option, the value given in + the uid= option (on mount) will only be used if the server + can not support returning uids on inodes. + noforcegid (similar to above but for the group owner, gid, instead of uid) + uid Set the default uid for inodes, and indicate to the + cifs kernel driver which local user mounted. If the server + supports the unix extensions the default uid is + not used to fill in the owner fields of inodes (files) + unless the "forceuid" parameter is specified. + gid Set the default gid for inodes (similar to above). + file_mode If CIFS Unix extensions are not supported by the server + this overrides the default mode for file inodes. + fsc Enable local disk caching using FS-Cache (off by default). This + option could be useful to improve performance on a slow link, + heavily loaded server and/or network where reading from the + disk is faster than reading from the server (over the network). + This could also impact scalability positively as the + number of calls to the server are reduced. However, local + caching is not suitable for all workloads for e.g. read-once + type workloads. So, you need to consider carefully your + workload/scenario before using this option. Currently, local + disk caching is functional for CIFS files opened as read-only. + dir_mode If CIFS Unix extensions are not supported by the server + this overrides the default mode for directory inodes. + port attempt to contact the server on this tcp port, before + trying the usual ports (port 445, then 139). + iocharset Codepage used to convert local path names to and from + Unicode. Unicode is used by default for network path + names if the server supports it. 
If iocharset is + not specified then the nls_default specified + during the local client kernel build will be used. + If server does not support Unicode, this parameter is + unused. + rsize default read size (usually 16K). The client currently + can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize + defaults to 16K and may be changed (from 8K to the maximum + kmalloc size allowed by your kernel) at module install time + for cifs.ko. Setting CIFSMaxBufSize to a very large value + will cause cifs to use more memory and may reduce performance + in some cases. To use rsize greater than 127K (the original + cifs protocol maximum) also requires that the server support + a new Unix Capability flag (for very large read) which some + newer servers (e.g. Samba 3.0.26 or later) do. rsize can be + set from a minimum of 2048 to a maximum of 130048 (127K or + CIFSMaxBufSize, whichever is smaller) + wsize default write size (default 57344) + maximum wsize currently allowed by CIFS is 57344 (fourteen + 4096 byte pages) + actimeo=n attribute cache timeout in seconds (default 1 second). + After this timeout, the cifs client requests fresh attribute + information from the server. This option allows to tune the + attribute cache timeout to suit the workload needs. Shorter + timeouts mean better the cache coherency, but increased number + of calls to the server. Longer timeouts mean reduced number + of calls to the server at the expense of less stricter cache + coherency checks (i.e. incorrect attribute cache for a short + period of time). + rw mount the network share read-write (note that the + server may still consider the share read-only) + ro mount network share read-only + version used to distinguish different versions of the + mount helper utility (not typically needed) + sep if first mount option (after the -o), overrides + the comma as the separator between the mount + parms. e.g. + -o user=myname,password=mypassword,domain=mydom + could be passed instead with period as the separator by + -o sep=.user=myname.password=mypassword.domain=mydom + this might be useful when comma is contained within username + or password or domain. This option is less important + when the cifs mount helper cifs.mount (version 1.1 or later) + is used. + nosuid Do not allow remote executables with the suid bit + program to be executed. This is only meaningful for mounts + to servers such as Samba which support the CIFS Unix Extensions. + If you do not trust the servers in your network (your mount + targets) it is recommended that you specify this option for + greater security. + exec Permit execution of binaries on the mount. + noexec Do not permit execution of binaries on the mount. + dev Recognize block devices on the remote mount. + nodev Do not recognize devices on the remote mount. + suid Allow remote files on this mountpoint with suid enabled to + be executed (default for mounts when executed as root, + nosuid is default for user mounts). + credentials Although ignored by the cifs kernel component, it is used by + the mount helper, mount.cifs. When mount.cifs is installed it + opens and reads the credential file specified in order + to obtain the userid and password arguments which are passed to + the cifs vfs. + guest Although ignored by the kernel component, the mount.cifs + mount helper will not prompt the user for a password + if guest is specified on the mount options. If no + password is specified a null password will be used. 
+ perm Client does permission checks (vfs_permission check of uid + and gid of the file against the mode and desired operation). + Note that this is in addition to the normal ACL check on the + target machine done by the server software. + Client permission checking is enabled by default. + noperm Client does not do permission checks. This can expose + files on this mount to access by other users on the local + client system. It is typically only needed when the server + supports the CIFS Unix Extensions but the UIDs/GIDs on the + client and server system do not match closely enough to allow + access by the user doing the mount, but it may be useful with + non CIFS Unix Extension mounts for cases in which the default + mode is specified on the mount but is not to be enforced on the + client (e.g. perhaps when MultiUserMount is enabled). + Note that this does not affect the normal ACL check on the + target machine done by the server software (of the server + ACL against the user name provided at mount time). + serverino Use server's inode numbers instead of generating automatically + incrementing inode numbers on the client. Although this will + make it easier to spot hardlinked files (as they will have + the same inode numbers) and inode numbers may be persistent, + note that the server does not guarantee that the inode numbers + are unique if multiple server side mounts are exported under a + single share (since inode numbers on the servers might not + be unique if multiple filesystems are mounted under the same + shared higher level directory). Note that some older servers + (e.g. pre-Windows 2000) do not support returning UniqueIDs + or the CIFS Unix Extensions equivalent and for those + this mount option will have no effect. Exporting cifs mounts + under nfsd requires this mount option on the cifs mount. + This is now the default if server supports the + required network operation. + noserverino Client generates inode numbers (rather than using the actual one + from the server). These inode numbers will vary after + unmount or reboot which can confuse some applications, + but not all server filesystems support unique inode + numbers. + setuids If the CIFS Unix extensions are negotiated with the server + the client will attempt to set the effective uid and gid of + the local process on newly created files, directories, and + devices (create, mkdir, mknod). If the CIFS Unix Extensions + are not negotiated, for newly created files and directories + instead of using the default uid and gid specified on + the mount, cache the new file's uid and gid locally which means + that the uid for the file can change when the inode is + reloaded (or the user remounts the share). + nosetuids The client will not attempt to set the uid and gid + on newly created files, directories, and devices (create, + mkdir, mknod) which will result in the server setting the + uid and gid to the default (usually the server uid of the + user who mounted the share). Letting the server (rather than + the client) set the uid and gid is the default. If the CIFS + Unix Extensions are not negotiated then the uid and gid for + new files will appear to be the uid (gid) of the mounter or the + uid (gid) parameter specified on the mount. + netbiosname When mounting to servers via port 139, specifies the RFC1001 + source name to use to represent the client netbios machine + name when doing the RFC1001 netbios session initialize. + direct Do not do inode data caching on files opened on this mount.
+		This precludes mmapping files on this mount. In some cases
+		with fast networks and little or no caching benefits on the
+		client (e.g. when the application is doing large sequential
+		reads bigger than page size without rereading the same data)
+		this can provide better performance than the default
+		behavior which caches reads (readahead) and writes
+		(writebehind) through the local Linux client pagecache
+		if oplock (caching token) is granted and held. Note that
+		direct allows write operations larger than page size
+		to be sent to the server.
+  strictcache	Switch on strict cache mode. In this mode the client reads
+		from the cache as long as it holds a Level II oplock;
+		otherwise it reads from the server. All written data is
+		stored in the cache, but if the client does not hold an
+		Exclusive oplock it also writes the data through to the
+		server.
+  rwpidforward	Forward the pid of the process that opened a file on any
+		read or write operation on that file. This prevents
+		applications such as WINE from failing on reads and writes
+		when mandatory byte range (brlock) locking is in use.
+  acl		Allow setfacl and getfacl to manage posix ACLs if the server
+		supports them. (default)
+  noacl	Do not allow setfacl and getfacl calls on this mount
+  user_xattr	Allow getting and setting user xattrs (those attributes whose
+		name begins with "user." or "os2.") as OS/2 EAs (extended
+		attributes) to the server. This allows support of the
+		setfattr and getfattr utilities. (default)
+  nouser_xattr	Do not allow getfattr/setfattr to get/set/list xattrs
+  mapchars	Translate six of the seven reserved characters (not backslash)
+			*?<>|:
+		to the remap range (above 0xF000), which also
+		allows the CIFS client to recognize files created with
+		such characters by Windows's POSIX emulation. This can
+		also be useful when mounting to most versions of Samba
+		(which also forbids creating and opening files
+		whose names contain any of these seven characters).
+		This has no effect if the server does not support
+		Unicode on the wire.
+  nomapchars	Do not translate any of these seven characters (default).
+  nocase	Request case insensitive path name matching (case
+		sensitive is the default if the server supports it).
+		(mount option "ignorecase" is identical to "nocase")
+  posixpaths	If CIFS Unix extensions are supported, attempt to
+		negotiate posix path name support which allows certain
+		characters forbidden in typical CIFS filenames, without
+		requiring remapping. (default)
+  noposixpaths	If CIFS Unix extensions are supported, do not request
+		posix path name support (this may cause servers to
+		reject creating files with certain reserved characters).
+  nounix	Disable the CIFS Unix Extensions for this mount (tree
+		connection). This is rarely needed, but it may be useful
+		in order to turn off multiple settings all at once (ie
+		posix acls, posix locks, posix paths, symlink support
+		and retrieving uids/gids/mode from the server) or to
+		work around a bug in a server which implements the Unix
+		Extensions.
+  nobrl	Do not send byte range lock requests to the server.
+		This is necessary for certain applications that break
+		with cifs style mandatory byte range locks (and most
+		cifs servers do not yet support requesting advisory
+		byte range locks).
+  forcemandatorylock Even if the server supports posix (advisory) byte range
+		locking, send only mandatory lock requests.
For some
+		(presumably rare) applications, originally coded for
+		DOS/Windows, which require Windows style mandatory byte range
+		locking, this option may be useful, forcing the cifs client
+		to send only mandatory locks even if the cifs server would
+		support posix advisory locks. "forcemand" is accepted as a
+		shorter form of this mount option.
+  nostrictsync	If this mount option is set, when an application does an
+		fsync call then the cifs client does not send an SMB Flush
+		to the server (to force the server to write all dirty data
+		for this file immediately to disk), although cifs still sends
+		all dirty (cached) file data to the server and waits for the
+		server to respond to the write. Since SMB Flush can be
+		very slow, and some servers may be reliable enough (to risk
+		delaying slightly flushing the data to disk on the server),
+		turning on this option may be useful to improve performance for
+		applications that fsync too much, at a small risk of server
+		crash. If this mount option is not set, by default cifs will
+		send an SMB flush request (and wait for a response) on every
+		fsync call.
+  nodfs	Disable DFS (global name space support) even if the
+		server claims to support it. This can help work around
+		a problem with parsing of DFS paths with Samba server
+		versions 3.0.24 and 3.0.25.
+  remount	remount the share (often used to change from ro to rw mounts
+		or vice versa)
+  cifsacl	Report mode bits (e.g. on stat) based on the Windows ACL for
+		the file. (EXPERIMENTAL)
+  servern	Specify the server's netbios name (RFC1001 name) to use
+		when attempting to set up a session to the server.
+		This is needed for mounting to some older servers (such
+		as OS/2 or Windows 98 and Windows ME) since they do not
+		support a default server name. A server name can be up
+		to 15 characters long and is usually uppercased.
+  sfu		When the CIFS Unix Extensions are not negotiated, attempt to
+		create device files and fifos in a format compatible with
+		Services for Unix (SFU). In addition retrieve bits 10-12
+		of the mode via the SETFILEBITS extended attribute (as
+		SFU does). In the future the bottom 9 bits of the
+		mode also will be emulated using queries of the security
+		descriptor (ACL).
+  mfsymlinks	Enable support for Minshall+French symlinks
+		(see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+		This option is ignored when specified together with the
+		'sfu' option. Minshall+French symlinks are used even if
+		the server supports the CIFS Unix Extensions.
+  sign		Must use packet signing (helps avoid unwanted data modification
+		by intermediate systems in the route). Note that signing
+		does not work with lanman or plaintext authentication.
+  seal		Must seal (encrypt) all data on this mounted share before
+		sending on the network. Requires support for Unix Extensions.
+		Note that this differs from the sign mount option in that it
+		causes encryption of data sent over this mounted share but other
+		shares mounted to the same server are unaffected.
+  locallease	This option is rarely needed. Fcntl F_SETLEASE is
+		used by some applications such as Samba and the NFSv4 server
+		to check whether a file is cacheable. CIFS has no way
+		to explicitly request a lease, but can check whether a file
+		is cacheable (oplocked).
Unfortunately, even if a file
+		is not oplocked, it could still be cacheable (ie the cifs
+		client could grant fcntl leases if no other local processes
+		are using the file) for cases such as when the server does not
+		support oplocks and the user is sure that the only updates to
+		the file will be from this client. Specifying this mount option
+		will allow the cifs client to check for leases (only) locally
+		for files which are not oplocked instead of denying leases
+		in that case. (EXPERIMENTAL)
+  sec		Security mode. Allowed values are:
+			none	attempt to connect as a null user (no name)
+			krb5	Use Kerberos version 5 authentication
+			krb5i	Use Kerberos authentication and packet signing
+			ntlm	Use NTLM password hashing (default)
+			ntlmi	Use NTLM password hashing with signing (if
+				/proc/fs/cifs/PacketSigningEnabled is on, or if
+				the server requires signing; this can also be
+				the default)
+			ntlmv2	Use NTLMv2 password hashing
+			ntlmv2i	Use NTLMv2 password hashing with packet signing
+			lanman	(if configured in kernel config) use older
+				lanman hash
+hard		Retry file operations if the server is not responding
+soft		Limit retries to unresponsive servers (usually only
+		one retry) before returning an error. (default)
+
+The mount.cifs mount helper also accepts a few mount options before -o
+including:
+
+	-S	take password from stdin (equivalent to setting the environment
+		variable "PASSWD_FD=0")
+	-V	print mount.cifs version
+	-?	display simple usage information
+
+With most 2.6 kernel versions of modutils, the version of the cifs kernel
+module can be displayed via modinfo.
+
+Misc /proc/fs/cifs Flags and Debug Info
+=======================================
+Informational pseudo-files:
+DebugData		Displays information about active CIFS sessions and
+			shares, features enabled as well as the cifs.ko
+			version.
+Stats			Lists summary resource usage information as well as per
+			share statistics, if CONFIG_CIFS_STATS is enabled
+			in the kernel configuration.
+
+Configuration pseudo-files:
+MultiuserMount		If set to one, more than one CIFS session to
+			the same server ip address can be established
+			if more than one uid accesses the same mount
+			point and if the uids' user/password mapping
+			information is available. (default is 0)
+PacketSigningEnabled	If set to one, cifs packet signing is enabled
+			and will be used if the server requires
+			it. If set to two, cifs packet signing is
+			required even if the server considers packet
+			signing optional. (default 1)
+SecurityFlags		Flags which control security negotiation and
+			also packet signing. Authentication (may/must)
+			flags (e.g. for NTLM and/or NTLMv2) may be combined with
+			the signing flags. Specifying two different password
+			hashing mechanisms (as "must use") on the other hand
+			does not make much sense. Default flags are
+				0x07007
+			(NTLM, NTLMv2 and packet signing allowed). The maximum
+			allowable flags if you want to allow mounts to servers
+			using weaker password hashes is 0x37037 (lanman,
+			plaintext, ntlm, ntlmv2, signing allowed). Some
+			SecurityFlags require the corresponding menuconfig
+			options to be enabled (lanman and plaintext require
+			CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
+			plaintext authentication currently requires also
+			enabling lanman authentication in the security flags
+			because the cifs module only supports sending
+			plaintext passwords using the older lanman dialect
+			form of the session setup SMB. (e.g. for authentication
+			using plain text passwords, set the SecurityFlags
+			to 0x30030):
+
+	may use packet signing				0x00001
+	must use packet signing			0x01001
+	may use NTLM (most common password hash)	0x00002
+	must use NTLM					0x02002
+	may use NTLMv2					0x00004
+	must use NTLMv2				0x04004
+	may use Kerberos security			0x00008
+	must use Kerberos				0x08008
+	may use lanman (weak) password hash		0x00010
+	must use lanman password hash			0x10010
+	may use plaintext passwords			0x00020
+	must use plaintext passwords			0x20020
+	(reserved for future packet encryption)	0x00040
+
+cifsFYI			If set to a non-zero value, additional debug information
+			will be logged to the system error log. This field
+			contains three flags controlling different classes of
+			debugging entries. The maximum value it can be set
+			to is 7 which enables all debugging points (default 0).
+			Some debugging statements are not compiled into the
+			cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
+			kernel configuration. cifsFYI may be set to one or
+			more of the following flags (7 sets them all):
+
+			log cifs informational messages			0x01
+			log return codes from cifs entry points		0x02
+			log slow responses (ie which take longer than 1 second)
+			  CONFIG_CIFS_STATS2 must be enabled in .config	0x04
+
+
+traceSMB		If set to one, debug information is logged to the
+			system error log with the start of smb requests
+			and responses (default 0)
+LookupCacheEnable	If set to one, inode information is kept cached
+			for one second improving performance of lookups
+			(default 1)
+OplockEnabled		If set to one, safe distributed caching enabled.
+			(default 1)
+LinuxExtensionsEnabled	If set to one then the client will attempt to
+			use the CIFS "UNIX" extensions which are optional
+			protocol enhancements that allow CIFS servers
+			to return accurate UID/GID information as well
+			as support symbolic links. If you use servers
+			such as Samba that support the CIFS Unix
+			extensions but do not want to use symbolic link
+			support and want to map the uid and gid fields
+			to values supplied at mount (rather than the
+			actual values), then set this to zero. (default 1)
+
+These experimental features and tracing can be enabled by changing flags in
+/proc/fs/cifs (after the cifs module has been installed or built into the
+kernel, e.g. insmod cifs). To enable a feature, set it to 1, e.g. to enable
+tracing to the kernel message log, type:
+
+	echo 7 > /proc/fs/cifs/cifsFYI
+
+cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
+logging of various informational messages. 2 enables logging of non-zero
+SMB return codes while 4 enables logging of requests that take longer
+than one second to complete (except for byte range lock requests).
+Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
+source code (typically by setting it in the beginning of cifsglob.h),
+and setting it to seven enables all three. Finally, tracing
+the start of smb requests and responses can be enabled via:
+
+	echo 1 > /proc/fs/cifs/traceSMB
+
+Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
+if the kernel was configured with cifs statistics enabled. The statistics
+represent the number of successful (ie zero return code from the server)
+SMB responses to some of the more common commands (open, delete, mkdir etc.).
+Also recorded is the total bytes read and bytes written to the server for
+that share. Note that due to client caching effects this can be less than the
+number of bytes read and written by the application running on the client.
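As an illustration (the reset behavior described here follows the
cifs_stats_proc_write handler added later in this patch, which zeroes the
per-share counters when '1', 'y' or '0' is written), the statistics can be
displayed or cleared from the shell:

	cat /proc/fs/cifs/Stats
	echo 1 > /proc/fs/cifs/Stats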
+The statistics for the number of total SMBs and oplock breaks are different in +that they represent all for that share, not just those for which the server +returned success. + +Also note that "cat /proc/fs/cifs/DebugData" will display information about +the active sessions and the shares that are mounted. + +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file. The cifs.upcall helper program is from the Samba +project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +DFS support allows transparent redirection to shares in an MS-DFS name space. +In addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to ip address, and the user space helper must also +be configured in the file /etc/request-key.conf. Samba, Windows servers and +many NAS appliances support DFS as a way of constructing a global name +space to ease network configuration and improve reliability. + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + diff --git a/fs/cifs/TODO b/fs/cifs/TODO new file mode 100644 index 00000000..355abcdc --- /dev/null +++ b/fs/cifs/TODO @@ -0,0 +1,129 @@ +Version 1.53 May 20, 2008 + +A Partial List of Missing Features +================================== + +Contributions are welcome. There are plenty of opportunities +for visible, important contributions to this module. Here +is a partial list of the known problems and missing features: + +a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown +so that these operations can be supported to Windows servers + +b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS +SecurityDescriptors + +c) Better pam/winbind integration (e.g. to handle uid mapping +better) + +d) Cleanup now unneeded SessSetup code in +fs/cifs/connect.c and add back in NTLMSSP code if any servers +need it + +e) fix NTLMv2 signing when two mounts with different users to same +server. + +f) Directory entry caching relies on a 1 second timer, rather than +using FindNotify or equivalent. - (started) + +g) quota support (needs minor kernel change since quota calls +to make it to network filesystems or deviceless filesystems) + +h) investigate sync behavior (including syncpage) and check +for proper behavior of intr/nointr + +i) improve support for very old servers (OS/2 and Win9x for example) +Including support for changing the time remotely (utimes command). + +j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the +extra copy in/out of the socket buffers in some cases. + +k) Better optimize open (and pathbased setfilesize) to reduce the +oplock breaks coming from windows srv. Piggyback identical file +opens on top of each other by incrementing reference count rather +than resending (helps reduce server resource utilization and avoid +spurious oplock breaks). 
+
+l) Improve performance of readpages by sending more than one read
+at a time when 8 pages or more are requested. In conjunction,
+add support for async_cifs_readpages.
+
+m) Add support for storing symlink info to Windows servers
+in the Extended Attribute format their SFU clients would recognize.
+
+n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+will autorefresh (partially complete by Asser). Needs minor kernel
+vfs change to support removing D_NOTIFY on a file.
+
+o) Add GUI tool to configure /proc/fs/cifs settings and for display of
+the CIFS statistics (started)
+
+p) implement support for security and trusted categories of xattrs
+(requires minor protocol extension) to enable better support for SELINUX
+
+q) Implement O_DIRECT flag on open (already supported on mount)
+
+r) Create UID mapping facility so server UIDs can be mapped on a per
+mount or a per server basis to client UIDs or nobody if no mapping
+exists. This is helpful when Unix extensions are negotiated to
+allow better permission checking when UIDs differ on the server
+and client. Add new protocol request to the CIFS protocol
+standard for asking the server for the corresponding name of a
+particular uid.
+
+s) Add support for CIFS Unix and also the newer POSIX extensions to the
+server side for Samba 4.
+
+t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
+need to add the ability to set the time on the server (utimes command)
+
+u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+v) mount check for unmatched uids
+
+w) Add support for new vfs entry point for fallocate
+
+x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
+processes can proceed better in parallel (on the server)
+
+y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
+restriction of wsize max being 127K)
+
+KNOWN BUGS (updated April 24, 2007)
+====================================
+See http://bugzilla.samba.org - search on product "CifsVFS" for
+current bug list.
+
+1) existing symbolic links (Windows reparse points) are recognized but
+can not be created remotely. They are implemented for Samba and those that
+support the CIFS Unix extensions, although earlier versions of Samba
+overly restrict the pathnames.
+2) follow_link and readdir code does not follow dfs junctions
+but recognizes them
+3) create of new files to FAT partitions on Windows servers can
+succeed but still return access denied (appears to be Windows
+server not cifs client problem) and has not been reproduced recently.
+NTFS partitions do not have this problem.
+4) Unix/POSIX capabilities are reset after reconnection, and affect
+a few fields in the tree connection but we do not know which
+superblocks to apply these changes to. We should probably walk
+the list of superblocks to set these. Also need to check the
+flags on the second mount to the same share, and see if we
+can do the same trick that NFS does to remount duplicate shares.
+
+Misc testing to do
+==================
+1) check out max path names and max path name components against various server
+types. Try nested symlinks (8 deep). Return max path name in stat -f information
+
+2) Modify file portion of ltp so it can run against a mounted network
+share and run it against cifs vfs in automated fashion.
+ +3) Additional performance testing and optimization using iozone and similar - +there are some easy changes that can be done to parallelize sequential writes, +and when signing is disabled to request larger read sizes (larger than +negotiated size) and send larger write sizes to modern servers. + +4) More exhaustively test against less common servers. More testing +against Windows 9x, Windows ME servers. + diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c new file mode 100644 index 00000000..cfd1ce34 --- /dev/null +++ b/fs/cifs/asn1.c @@ -0,0 +1,666 @@ +/* + * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in + * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsproto.h" + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents or N/A */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_ENUM 10 /* Enumerated */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ +#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. 
+ */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +#define SPNEGO_OID_LEN 7 +#define NTLMSSP_OID_LEN 10 +#define KRB5_OID_LEN 7 +#define KRB5U2U_OID_LEN 8 +#define MSKRB5_OID_LEN 7 +static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; +static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; +static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; +static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; +static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; + +/* + * ASN.1 context. + */ +struct asn1_ctx { + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr { + unsigned char *data; + unsigned int len; +}; + +static void +asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char +asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */ +static unsigned char +asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) +{ + unsigned char ch; + + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + + ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ + if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ + *val = *(++(ctx->pointer)); /* value has enum value */ + else + return 0; + + ctx->pointer++; + return 1; +} +#endif + +static unsigned char +asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char +asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, unsigned int *con, unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char +asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = (unsigned char) (ch & 0x7F); + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + + return 1; +} + +static unsigned char +asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, unsigned int *con, unsigned int *tag) +{ + unsigned int def = 0; + unsigned int len = 0; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + + if (def) + 
*eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char +asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == NULL) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx, + unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) + len = 0; + else + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) + len = 0; + else + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char +asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} */ + +static unsigned char +asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static int +asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, unsigned long **oid, unsigned int *len) +{ + unsigned long subid; + unsigned int size; + unsigned long *optr; + + size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) + return 0; + + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) + return 0; + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr[0] = 0; + optr[1] = subid; + } else if (subid < 80) { + optr[0] = 1; + 
optr[1] = subid - 40; + } else { + optr[0] = 2; + optr[1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +static int +compare_oid(unsigned long *oid1, unsigned int oid1len, + unsigned long *oid2, unsigned int oid2len) +{ + unsigned int i; + + if (oid1len != oid2len) + return 0; + else { + for (i = 0; i < oid1len; i++) { + if (oid1[i] != oid2[i]) + return 0; + } + return 1; + } +} + + /* BB check for endian conversion issues here */ + +int +decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server) +{ + struct asn1_ctx ctx; + unsigned char *end; + unsigned char *sequence_end; + unsigned long *oid = NULL; + unsigned int cls, con, tag, oidlen, rc; + + /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ + + asn1_open(&ctx, security_blob, length); + + /* GSSAPI header */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding negTokenInit header"); + return 0; + } else if ((cls != ASN1_APL) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag); + return 0; + } + + /* Check for SPNEGO OID -- remember to free obj->oid */ + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (rc) { + if ((tag == ASN1_OJI) && (con == ASN1_PRI) && + (cls == ASN1_UNI)) { + rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); + if (rc) { + rc = compare_oid(oid, oidlen, SPNEGO_OID, + SPNEGO_OID_LEN); + kfree(oid); + } + } else + rc = 0; + } + + /* SPNEGO OID not present or garbled -- bail out */ + if (!rc) { + cFYI(1, "Error decoding negTokenInit header"); + return 0; + } + + /* SPNEGO */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding negTokenInit"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", + cls, con, tag, end, *end); + return 0; + } + + /* negTokenInit */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding negTokenInit"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", + cls, con, tag, end, *end); + return 0; + } + + /* sequence */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding 2nd part of negTokenInit"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", + cls, con, tag, end, *end); + return 0; + } + + /* sequence of */ + if (asn1_header_decode + (&ctx, &sequence_end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding 2nd part of negTokenInit"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", + cls, con, tag, end, *end); + return 0; + } + + /* list of security mechanisms */ + while (!asn1_eoc_decode(&ctx, sequence_end)) { + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (!rc) { + cFYI(1, "Error decoding negTokenInit hdr exit2"); + return 0; + } + if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { + if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { + + cFYI(1, "OID len = %d oid = 0x%lx 0x%lx " + "0x%lx 
0x%lx", oidlen, *oid, + *(oid + 1), *(oid + 2), *(oid + 3)); + + if (compare_oid(oid, oidlen, MSKRB5_OID, + MSKRB5_OID_LEN)) + server->sec_mskerberos = true; + else if (compare_oid(oid, oidlen, KRB5U2U_OID, + KRB5U2U_OID_LEN)) + server->sec_kerberosu2u = true; + else if (compare_oid(oid, oidlen, KRB5_OID, + KRB5_OID_LEN)) + server->sec_kerberos = true; + else if (compare_oid(oid, oidlen, NTLMSSP_OID, + NTLMSSP_OID_LEN)) + server->sec_ntlmssp = true; + + kfree(oid); + } + } else { + cFYI(1, "Should be an oid what is going on?"); + } + } + + /* mechlistMIC */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + /* Check if we have reached the end of the blob, but with + no mechListMic (e.g. NTLMSSP instead of KRB5) */ + if (ctx.error == ASN1_ERR_DEC_EMPTY) + goto decode_negtoken_exit; + cFYI(1, "Error decoding last part negTokenInit exit3"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { + /* tag = 3 indicating mechListMIC */ + cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + return 0; + } + + /* sequence */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding last part negTokenInit exit5"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + } + + /* sequence of */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding last part negTokenInit exit 7"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { + cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + return 0; + } + + /* general string */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding last part negTokenInit exit9"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) + || (tag != ASN1_GENSTR)) { + cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + return 0; + } + cFYI(1, "Need to call asn1_octets_decode() function for %s", + ctx.pointer); /* is this UTF-8 or ASCII? */ +decode_negtoken_exit: + return 1; +} diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c new file mode 100644 index 00000000..545509c3 --- /dev/null +++ b/fs/cifs/cache.c @@ -0,0 +1,333 @@ +/* + * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifs_debug.h" + +/* + * CIFS filesystem definition for FS-Cache + */ +struct fscache_netfs cifs_fscache_netfs = { + .name = "cifs", + .version = 0, +}; + +/* + * Register CIFS for caching with FS-Cache + */ +int cifs_fscache_register(void) +{ + return fscache_register_netfs(&cifs_fscache_netfs); +} + +/* + * Unregister CIFS for caching + */ +void cifs_fscache_unregister(void) +{ + fscache_unregister_netfs(&cifs_fscache_netfs); +} + +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + uint16_t family; /* address family */ + __be16 port; /* IP port */ + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } addr[0]; +}; + +/* + * Server object keyed by {IPaddress,port,family} tuple + */ +static uint16_t cifs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct TCP_Server_Info *server = cookie_netfs_data; + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; + struct cifs_server_key *key = buffer; + uint16_t key_len = sizeof(struct cifs_server_key); + + memset(key, 0, key_len); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + switch (sa->sa_family) { + case AF_INET: + key->family = sa->sa_family; + key->port = addr->sin_port; + key->addr[0].ipv4_addr = addr->sin_addr; + key_len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->family = sa->sa_family; + key->port = addr6->sin6_port; + key->addr[0].ipv6_addr = addr6->sin6_addr; + key_len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + cERROR(1, "Unknown network family '%d'", sa->sa_family); + key_len = 0; + break; + } + + return key_len; +} + +/* + * Server object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_server_index_def = { + .name = "CIFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_server_get_key, +}; + +/* + * Auxiliary data attached to CIFS superblock within the cache + */ +struct cifs_fscache_super_auxdata { + u64 resource_id; /* unique server resource id */ +}; + +static char *extract_sharename(const char *treename) +{ + const char *src; + char *delim, *dst; + int len; + + /* skip double chars at the beginning */ + src = treename + 2; + + /* share name is always preceded by '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + delim++; + len = strlen(delim); + + /* caller has to free the memory */ + dst = kstrndup(delim, len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + return dst; +} + +/* + * Superblock object currently keyed by share name + */ +static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + const struct cifs_tcon *tcon = cookie_netfs_data; + char *sharename; + uint16_t len; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cFYI(1, "%s: couldn't extract sharename\n", __func__); + sharename = NULL; + return 0; + } + + len = strlen(sharename); + if (len > maxbuf) + return 0; + + memcpy(buffer, sharename, len); + + kfree(sharename); + + return len; +} + +static uint16_t +cifs_fscache_super_get_aux(const void 
*cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifs_tcon *tcon = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifs_tcon *tcon = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Superblock object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_super_index_def = { + .name = "CIFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_super_get_key, + .get_aux = cifs_fscache_super_get_aux, + .check_aux = cifs_fscache_super_check_aux, +}; + +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + uint16_t keylen; + + /* use the UniqueId as the key */ + keylen = sizeof(cifsi->uniqueid); + if (keylen > maxbuf) + keylen = 0; + else + memcpy(buffer, &cifsi->uniqueid, keylen); + + return keylen; +} + +static void +cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + *size = cifsi->vfs_inode.i_size; +} + +static uint16_t +cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_inode_auxdata auxdata; + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_inode_auxdata auxdata; + struct cifsInodeInfo *cifsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct cifsInodeInfo *cifsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); + + for (;;) { + nr_pages = pagevec_lookup(&pvec, + cifsi->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + 
ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def cifs_fscache_inode_object_def = { + .name = "CIFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = cifs_fscache_inode_get_key, + .get_attr = cifs_fscache_inode_get_attr, + .get_aux = cifs_fscache_inode_get_aux, + .check_aux = cifs_fscache_inode_check_aux, + .now_uncached = cifs_fscache_inode_now_uncached, +}; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c new file mode 100644 index 00000000..2fe3cf13 --- /dev/null +++ b/fs/cifs/cifs_debug.c @@ -0,0 +1,782 @@ +/* + * fs/cifs_debug.c + * + * Copyright (C) International Business Machines Corp., 2000,2005 + * + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +void +cifs_dump_mem(char *label, void *data, int length) +{ + int i, j; + int *intptr = data; + char *charptr = data; + char buf[10], line[80]; + + printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n", + label, length, data); + for (i = 0; i < length; i += 16) { + line[0] = 0; + for (j = 0; (j < 4) && (i + j * 4 < length); j++) { + sprintf(buf, " %08x", intptr[i / 4 + j]); + strcat(line, buf); + } + buf[0] = ' '; + buf[2] = 0; + for (j = 0; (j < 16) && (i + j < length); j++) { + buf[1] = isprint(charptr[i + j]) ? 
charptr[i + j] : '.'; + strcat(line, buf); + } + printk(KERN_DEBUG "%s\n", line); + } +} + +#ifdef CONFIG_CIFS_DEBUG2 +void cifs_dump_detail(struct smb_hdr *smb) +{ + cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", + smb->Command, smb->Status.CifsError, + smb->Flags, smb->Flags2, smb->Mid, smb->Pid); + cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb)); +} + + +void cifs_dump_mids(struct TCP_Server_Info *server) +{ + struct list_head *tmp; + struct mid_q_entry *mid_entry; + + if (server == NULL) + return; + + cERROR(1, "Dump pending requests:"); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); +#ifdef CONFIG_CIFS_STATS2 + cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", + mid_entry->largeBuf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies); +#endif /* STATS2 */ + cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp, + mid_entry->multiEnd); + if (mid_entry->resp_buf) { + cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_mem("existing buf: ", + mid_entry->resp_buf, 62); + } + } + spin_unlock(&GlobalMid_Lock); +} +#endif /* CONFIG_CIFS_DEBUG2 */ + +#ifdef CONFIG_PROC_FS +static int cifs_debug_data_proc_show(struct seq_file *m, void *v) +{ + struct list_head *tmp1, *tmp2, *tmp3; + struct mid_q_entry *mid_entry; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + int i, j; + __u32 dev_type; + + seq_puts(m, + "Display Internal CIFS Data Structures for Debugging\n" + "---------------------------------------------------\n"); + seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Features:"); +#ifdef CONFIG_CIFS_DFS_UPCALL + seq_printf(m, " dfs"); +#endif +#ifdef CONFIG_CIFS_FSCACHE + seq_printf(m, " fscache"); +#endif +#ifdef CONFIG_CIFS_WEAK_PW_HASH + seq_printf(m, " lanman"); +#endif +#ifdef CONFIG_CIFS_POSIX + seq_printf(m, " posix"); +#endif +#ifdef CONFIG_CIFS_UPCALL + seq_printf(m, " spnego"); +#endif +#ifdef CONFIG_CIFS_XATTR + seq_printf(m, " xattr"); +#endif +#ifdef CONFIG_CIFS_ACL + seq_printf(m, " acl"); +#endif + seq_putc(m, '\n'); + seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); + seq_printf(m, "Servers:"); + + i = 0; + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + i++; + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + if ((ses->serverDomain == NULL) || + (ses->serverOS == NULL) || + (ses->serverNOS == NULL)) { + seq_printf(m, "\n%d) entry for %s not fully " + "displayed\n\t", i, ses->serverName); + } else { + seq_printf(m, + "\n%d) Name: %s Domain: %s Uses: %d OS:" + " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" + " session status: %d\t", + i, ses->serverName, ses->serverDomain, + ses->ses_count, ses->serverOS, ses->serverNOS, + ses->capabilities, ses->status); + } + seq_printf(m, "TCP status: %d\n\tLocal Users To " + "Server: %d SecMode: 0x%x Req On Wire: %d", + server->tcpStatus, server->srv_count, + server->sec_mode, + atomic_read(&server->inFlight)); + +#ifdef CONFIG_CIFS_STATS2 + seq_printf(m, " In Send: %d In MaxReq Wait: %d", + atomic_read(&server->inSend), + atomic_read(&server->num_waiters)); +#endif + + seq_puts(m, "\n\tShares:"); + j = 0; + 
list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, struct cifs_tcon, + tcon_list); + ++j; + dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + seq_printf(m, "\n\t%d) %s Mounts: %d ", j, + tcon->treeName, tcon->tc_count); + if (tcon->nativeFileSystem) { + seq_printf(m, "Type: %s ", + tcon->nativeFileSystem); + } + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" + "\nPathComponentMax: %d Status: 0x%d", + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), + le32_to_cpu(tcon->fsAttrInfo.Attributes), + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), + tcon->tidStatus); + if (dev_type == FILE_DEVICE_DISK) + seq_puts(m, " type: DISK "); + else if (dev_type == FILE_DEVICE_CD_ROM) + seq_puts(m, " type: CDROM "); + else + seq_printf(m, " type: %d ", dev_type); + + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_putc(m, '\n'); + } + + seq_puts(m, "\n\tMIDs:\n"); + + spin_lock(&GlobalMid_Lock); + list_for_each(tmp3, &server->pending_mid_q) { + mid_entry = list_entry(tmp3, struct mid_q_entry, + qhead); + seq_printf(m, "\tState: %d com: %d pid:" + " %d cbdata: %p mid %d\n", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); + } + spin_unlock(&GlobalMid_Lock); + } + } + spin_unlock(&cifs_tcp_ses_lock); + seq_putc(m, '\n'); + + /* BB add code to dump additional info such as TCP session info now */ + return 0; +} + +static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_debug_data_proc_show, NULL); +} + +static const struct file_operations cifs_debug_data_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_debug_data_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_CIFS_STATS +static ssize_t cifs_stats_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + struct list_head *tmp1, *tmp2, *tmp3; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + rc = get_user(c, buffer); + if (rc) + return rc; + + if (c == '1' || c == 'y' || c == 'Y' || c == '0') { +#ifdef CONFIG_CIFS_STATS2 + atomic_set(&totBufAllocCount, 0); + atomic_set(&totSmBufAllocCount, 0); +#endif /* CONFIG_CIFS_STATS2 */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, + struct cifs_tcon, + tcon_list); + atomic_set(&tcon->num_smbs_sent, 0); + atomic_set(&tcon->num_writes, 0); + atomic_set(&tcon->num_reads, 0); + atomic_set(&tcon->num_oplock_brks, 0); + atomic_set(&tcon->num_opens, 0); + atomic_set(&tcon->num_posixopens, 0); + atomic_set(&tcon->num_posixmkdirs, 0); + atomic_set(&tcon->num_closes, 0); + atomic_set(&tcon->num_deletes, 0); + atomic_set(&tcon->num_mkdirs, 0); + atomic_set(&tcon->num_rmdirs, 0); + atomic_set(&tcon->num_renames, 0); + atomic_set(&tcon->num_t2renames, 0); + atomic_set(&tcon->num_ffirst, 0); + atomic_set(&tcon->num_fnext, 0); + atomic_set(&tcon->num_fclose, 0); + atomic_set(&tcon->num_hardlinks, 0); + atomic_set(&tcon->num_symlinks, 0); + atomic_set(&tcon->num_locks, 0); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + + return count; +} + +static int cifs_stats_proc_show(struct seq_file *m, void *v) +{ + int i; + struct list_head *tmp1, *tmp2, 
*tmp3; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + seq_printf(m, + "Resources in use\nCIFS Session: %d\n", + sesInfoAllocCount.counter); + seq_printf(m, "Share (unique mount targets): %d\n", + tconInfoAllocCount.counter); + seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", + bufAllocCount.counter, + cifs_min_rcv + tcpSesAllocCount.counter); + seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", + smBufAllocCount.counter, cifs_min_small); +#ifdef CONFIG_CIFS_STATS2 + seq_printf(m, "Total Large %d Small %d Allocations\n", + atomic_read(&totBufAllocCount), + atomic_read(&totSmBufAllocCount)); +#endif /* CONFIG_CIFS_STATS2 */ + + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); + seq_printf(m, + "\n%d session %d share reconnects\n", + tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); + + seq_printf(m, + "Total vfs operations: %d maximum at one time: %d\n", + GlobalCurrentXid, GlobalMaxActiveXid); + + i = 0; + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, + struct cifs_tcon, + tcon_list); + i++; + seq_printf(m, "\n%d) %s", i, tcon->treeName); + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_printf(m, "\nSMBs: %d Oplock Breaks: %d", + atomic_read(&tcon->num_smbs_sent), + atomic_read(&tcon->num_oplock_brks)); + seq_printf(m, "\nReads: %d Bytes: %lld", + atomic_read(&tcon->num_reads), + (long long)(tcon->bytes_read)); + seq_printf(m, "\nWrites: %d Bytes: %lld", + atomic_read(&tcon->num_writes), + (long long)(tcon->bytes_written)); + seq_printf(m, "\nFlushes: %d", + atomic_read(&tcon->num_flushes)); + seq_printf(m, "\nLocks: %d HardLinks: %d " + "Symlinks: %d", + atomic_read(&tcon->num_locks), + atomic_read(&tcon->num_hardlinks), + atomic_read(&tcon->num_symlinks)); + seq_printf(m, "\nOpens: %d Closes: %d " + "Deletes: %d", + atomic_read(&tcon->num_opens), + atomic_read(&tcon->num_closes), + atomic_read(&tcon->num_deletes)); + seq_printf(m, "\nPosix Opens: %d " + "Posix Mkdirs: %d", + atomic_read(&tcon->num_posixopens), + atomic_read(&tcon->num_posixmkdirs)); + seq_printf(m, "\nMkdirs: %d Rmdirs: %d", + atomic_read(&tcon->num_mkdirs), + atomic_read(&tcon->num_rmdirs)); + seq_printf(m, "\nRenames: %d T2 Renames %d", + atomic_read(&tcon->num_renames), + atomic_read(&tcon->num_t2renames)); + seq_printf(m, "\nFindFirst: %d FNext %d " + "FClose %d", + atomic_read(&tcon->num_ffirst), + atomic_read(&tcon->num_fnext), + atomic_read(&tcon->num_fclose)); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + + seq_putc(m, '\n'); + return 0; +} + +static int cifs_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_stats_proc_show, NULL); +} + +static const struct file_operations cifs_stats_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_stats_proc_write, +}; +#endif /* STATS */ + +static struct proc_dir_entry *proc_fs_cifs; +static const struct file_operations cifsFYI_proc_fops; +static const struct file_operations cifs_oplock_proc_fops; +static const struct file_operations cifs_lookup_cache_proc_fops; +static const struct file_operations traceSMB_proc_fops; +static const struct 
file_operations cifs_multiuser_mount_proc_fops; +static const struct file_operations cifs_security_flags_proc_fops; +static const struct file_operations cifs_linux_ext_proc_fops; + +void +cifs_proc_init(void) +{ + proc_fs_cifs = proc_mkdir("fs/cifs", NULL); + if (proc_fs_cifs == NULL) + return; + + proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); + +#ifdef CONFIG_CIFS_STATS + proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); +#endif /* STATS */ + proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); + proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + &cifs_linux_ext_proc_fops); + proc_create("MultiuserMount", 0, proc_fs_cifs, + &cifs_multiuser_mount_proc_fops); + proc_create("SecurityFlags", 0, proc_fs_cifs, + &cifs_security_flags_proc_fops); + proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + &cifs_lookup_cache_proc_fops); +} + +void +cifs_proc_clean(void) +{ + if (proc_fs_cifs == NULL) + return; + + remove_proc_entry("DebugData", proc_fs_cifs); + remove_proc_entry("cifsFYI", proc_fs_cifs); + remove_proc_entry("traceSMB", proc_fs_cifs); +#ifdef CONFIG_CIFS_STATS + remove_proc_entry("Stats", proc_fs_cifs); +#endif + remove_proc_entry("MultiuserMount", proc_fs_cifs); + remove_proc_entry("OplockEnabled", proc_fs_cifs); + remove_proc_entry("SecurityFlags", proc_fs_cifs); + remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); + remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + remove_proc_entry("fs/cifs", NULL); +} + +static int cifsFYI_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", cifsFYI); + return 0; +} + +static int cifsFYI_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifsFYI_proc_show, NULL); +} + +static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + cifsFYI = 0; + else if (c == '1' || c == 'y' || c == 'Y') + cifsFYI = 1; + else if ((c > '1') && (c <= '9')) + cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ + + return count; +} + +static const struct file_operations cifsFYI_proc_fops = { + .owner = THIS_MODULE, + .open = cifsFYI_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifsFYI_proc_write, +}; + +static int cifs_oplock_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", oplockEnabled); + return 0; +} + +static int cifs_oplock_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_oplock_proc_show, NULL); +} + +static ssize_t cifs_oplock_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + oplockEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + oplockEnabled = 1; + + return count; +} + +static const struct file_operations cifs_oplock_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_oplock_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_oplock_proc_write, +}; + +static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", linuxExtEnabled); + return 0; +} + +static int cifs_linux_ext_proc_open(struct inode *inode, 
struct file *file) +{ + return single_open(file, cifs_linux_ext_proc_show, NULL); +} + +static ssize_t cifs_linux_ext_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + linuxExtEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + linuxExtEnabled = 1; + + return count; +} + +static const struct file_operations cifs_linux_ext_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_linux_ext_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_linux_ext_proc_write, +}; + +static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", lookupCacheEnabled); + return 0; +} + +static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_lookup_cache_proc_show, NULL); +} + +static ssize_t cifs_lookup_cache_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + lookupCacheEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + lookupCacheEnabled = 1; + + return count; +} + +static const struct file_operations cifs_lookup_cache_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_lookup_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_lookup_cache_proc_write, +}; + +static int traceSMB_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", traceSMB); + return 0; +} + +static int traceSMB_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, traceSMB_proc_show, NULL); +} + +static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + traceSMB = 0; + else if (c == '1' || c == 'y' || c == 'Y') + traceSMB = 1; + + return count; +} + +static const struct file_operations traceSMB_proc_fops = { + .owner = THIS_MODULE, + .open = traceSMB_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = traceSMB_proc_write, +}; + +static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", multiuser_mount); + return 0; +} + +static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh) +{ + return single_open(fh, cifs_multiuser_mount_proc_show, NULL); +} + +static ssize_t cifs_multiuser_mount_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + multiuser_mount = 0; + else if (c == '1' || c == 'y' || c == 'Y') + multiuser_mount = 1; + + return count; +} + +static const struct file_operations cifs_multiuser_mount_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_multiuser_mount_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_multiuser_mount_proc_write, +}; + +static int cifs_security_flags_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x\n", global_secflags); + return 0; +} + +static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_security_flags_proc_show, 
NULL); +} + +static ssize_t cifs_security_flags_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + unsigned int flags; + char flags_string[12]; + char c; + + if ((count < 1) || (count > 11)) + return -EINVAL; + + memset(flags_string, 0, 12); + + if (copy_from_user(flags_string, buffer, count)) + return -EFAULT; + + if (count < 3) { + /* single char or single char followed by null */ + c = flags_string[0]; + if (c == '0' || c == 'n' || c == 'N') { + global_secflags = CIFSSEC_DEF; /* default */ + return count; + } else if (c == '1' || c == 'y' || c == 'Y') { + global_secflags = CIFSSEC_MAX; + return count; + } else if (!isdigit(c)) { + cERROR(1, "invalid flag %c", c); + return -EINVAL; + } + } + /* else we have a number */ + + flags = simple_strtoul(flags_string, NULL, 0); + + cFYI(1, "sec flags 0x%x", flags); + + if (flags <= 0) { + cERROR(1, "invalid security flags %s", flags_string); + return -EINVAL; + } + + if (flags & ~CIFSSEC_MASK) { + cERROR(1, "attempt to set unsupported security flags 0x%x", + flags & ~CIFSSEC_MASK); + return -EINVAL; + } + /* flags look ok - update the global security flags for cifs module */ + global_secflags = flags; + if (global_secflags & CIFSSEC_MUST_SIGN) { + /* requiring signing implies signing is allowed */ + global_secflags |= CIFSSEC_MAY_SIGN; + cFYI(1, "packet signing now required"); + } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) { + cFYI(1, "packet signing disabled"); + } + /* BB should we turn on MAY flags for other MUST options? */ + return count; +} + +static const struct file_operations cifs_security_flags_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_security_flags_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_security_flags_proc_write, +}; +#else +inline void cifs_proc_init(void) +{ +} + +inline void cifs_proc_clean(void) +{ +} +#endif /* PROC_FS */ diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h new file mode 100644 index 00000000..8942b28c --- /dev/null +++ b/fs/cifs/cifs_debug.h @@ -0,0 +1,96 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000,2002 + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#define CIFS_DEBUG /* BB temporary */ + +#ifndef _H_CIFS_DEBUG +#define _H_CIFS_DEBUG + +void cifs_dump_mem(char *label, void *data, int length); +#ifdef CONFIG_CIFS_DEBUG2 +#define DBG2 2 +void cifs_dump_detail(struct smb_hdr *); +void cifs_dump_mids(struct TCP_Server_Info *); +#else +#define DBG2 0 +#endif +extern int traceSMB; /* flag which enables the function below */ +void dump_smb(struct smb_hdr *, int); +#define CIFS_INFO 0x01 +#define CIFS_RC 0x02 +#define CIFS_TIMER 0x04 + +/* + * debug ON + * -------- + */ +#ifdef CIFS_DEBUG + +/* information message: e.g., configuration, major event */ +extern int cifsFYI; +#define cifsfyi(fmt, arg...) \ +do { \ + if (cifsFYI & CIFS_INFO) \ + printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ +} while (0) + +#define cFYI(set, fmt, arg...) \ +do { \ + if (set) \ + cifsfyi(fmt, ##arg); \ +} while (0) + +#define cifswarn(fmt, arg...) \ + printk(KERN_WARNING fmt "\n", ##arg) + +/* debug event message: */ +extern int cifsERROR; + +#define cEVENT(fmt, arg...) \ +do { \ + if (cifsERROR) \ + printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ +} while (0) + +/* error event message: e.g., i/o error */ +#define cifserror(fmt, arg...) \ +do { \ + if (cifsERROR) \ + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \ +} while (0) + +#define cERROR(set, fmt, arg...) \ +do { \ + if (set) \ + cifserror(fmt, ##arg); \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* _CIFS_DEBUG */ +#define cERROR(set, fmt, arg...) +#define cEVENT(fmt, arg...) +#define cFYI(set, fmt, arg...) +#define cifserror(fmt, arg...) +#endif /* _CIFS_DEBUG */ + +#endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c new file mode 100644 index 00000000..8d8f28c9 --- /dev/null +++ b/fs/cifs/cifs_dfs_ref.c @@ -0,0 +1,370 @@ +/* + * Contains the CIFS DFS referral mounting routines used for handling + * traversal via DFS junction point + * + * Copyright (c) 2007 Igor Mammedov + * Copyright (C) International Business Machines Corp., 2008 + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifsfs.h" +#include "dns_resolve.h" +#include "cifs_debug.h" + +static LIST_HEAD(cifs_dfs_automount_list); + +static void cifs_dfs_expire_automounts(struct work_struct *work); +static DECLARE_DELAYED_WORK(cifs_dfs_automount_task, + cifs_dfs_expire_automounts); +static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ; + +static void cifs_dfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &cifs_dfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); +} + +void cifs_dfs_release_automount_timer(void) +{ + BUG_ON(!list_empty(&cifs_dfs_automount_list)); + cancel_delayed_work_sync(&cifs_dfs_automount_task); +} + +/** + * cifs_get_share_name - extracts share name from UNC + * @node_name: pointer to UNC string + * + * Extracts sharename form full UNC. + * i.e. strips from UNC trailing path that is not part of share + * name and fixup missing '\' in the beginning of DFS node refferal + * if necessary. + * Returns pointer to share name on success or ERR_PTR on error. + * Caller is responsible for freeing returned string. + */ +static char *cifs_get_share_name(const char *node_name) +{ + int len; + char *UNC; + char *pSep; + + len = strlen(node_name); + UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, + GFP_KERNEL); + if (!UNC) + return ERR_PTR(-ENOMEM); + + /* get share name and server name */ + if (node_name[1] != '\\') { + UNC[0] = '\\'; + strncpy(UNC+1, node_name, len); + len++; + UNC[len] = 0; + } else { + strncpy(UNC, node_name, len); + UNC[len] = 0; + } + + /* find server name end */ + pSep = memchr(UNC+2, '\\', len-2); + if (!pSep) { + cERROR(1, "%s: no server name end in node name: %s", + __func__, node_name); + kfree(UNC); + return ERR_PTR(-EINVAL); + } + + /* find sharename end */ + pSep++; + pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); + if (pSep) { + /* trim path up to sharename end + * now we have share name in UNC */ + *pSep = 0; + } + + return UNC; +} + + +/** + * cifs_compose_mount_options - creates mount options for refferral + * @sb_mountdata: parent/root DFS mount options (template) + * @fullpath: full path in UNC format + * @ref: server's referral + * @devname: pointer for saving device name + * + * creates mount options for submount based on template options sb_mountdata + * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. + * + * Returns: pointer to new mount options or ERR_PTR. + * Caller is responcible for freeing retunrned value if it is not error. + */ +char *cifs_compose_mount_options(const char *sb_mountdata, + const char *fullpath, + const struct dfs_info3_param *ref, + char **devname) +{ + int rc; + char *mountdata = NULL; + int md_len; + char *tkn_e; + char *srvIP = NULL; + char sep = ','; + int off, noff; + + if (sb_mountdata == NULL) + return ERR_PTR(-EINVAL); + + *devname = cifs_get_share_name(ref->node_name); + if (IS_ERR(*devname)) { + rc = PTR_ERR(*devname); + *devname = NULL; + goto compose_mount_options_err; + } + + rc = dns_resolve_server_name_to_ip(*devname, &srvIP); + if (rc < 0) { + cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", + __func__, *devname, rc); + goto compose_mount_options_err; + } + /* md_len = strlen(...) 
+ 12 for 'sep+prefixpath=' + * assuming that we have 'unc=' and 'ip=' in + * the original sb_mountdata + */ + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; + mountdata = kzalloc(md_len+1, GFP_KERNEL); + if (mountdata == NULL) { + rc = -ENOMEM; + goto compose_mount_options_err; + } + + /* copy all options except of unc,ip,prefixpath */ + off = 0; + if (strncmp(sb_mountdata, "sep=", 4) == 0) { + sep = sb_mountdata[4]; + strncpy(mountdata, sb_mountdata, 5); + off += 5; + } + + do { + tkn_e = strchr(sb_mountdata + off, sep); + if (tkn_e == NULL) + noff = strlen(sb_mountdata + off); + else + noff = tkn_e - (sb_mountdata + off) + 1; + + if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) { + off += noff; + continue; + } + if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) { + off += noff; + continue; + } + if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) { + off += noff; + continue; + } + strncat(mountdata, sb_mountdata + off, noff); + off += noff; + } while (tkn_e); + strcat(mountdata, sb_mountdata + off); + mountdata[md_len] = '\0'; + + /* copy new IP and ref share name */ + if (mountdata[strlen(mountdata) - 1] != sep) + strncat(mountdata, &sep, 1); + strcat(mountdata, "ip="); + strcat(mountdata, srvIP); + strncat(mountdata, &sep, 1); + strcat(mountdata, "unc="); + strcat(mountdata, *devname); + + /* find & copy prefixpath */ + tkn_e = strchr(ref->node_name + 2, '\\'); + if (tkn_e == NULL) { + /* invalid unc, missing share name*/ + rc = -EINVAL; + goto compose_mount_options_err; + } + + tkn_e = strchr(tkn_e + 1, '\\'); + if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { + strncat(mountdata, &sep, 1); + strcat(mountdata, "prefixpath="); + if (tkn_e) + strcat(mountdata, tkn_e + 1); + strcat(mountdata, fullpath + ref->path_consumed); + } + + /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/ + /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/ + +compose_mount_options_out: + kfree(srvIP); + return mountdata; + +compose_mount_options_err: + kfree(mountdata); + mountdata = ERR_PTR(rc); + goto compose_mount_options_out; +} + +/** + * cifs_dfs_do_refmount - mounts specified path using provided refferal + * @cifs_sb: parent/root superblock + * @fullpath: full path in UNC format + * @ref: server's referral + */ +static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, + const char *fullpath, const struct dfs_info3_param *ref) +{ + struct vfsmount *mnt; + char *mountdata; + char *devname = NULL; + + /* strip first '\' from fullpath */ + mountdata = cifs_compose_mount_options(cifs_sb->mountdata, + fullpath + 1, ref, &devname); + + if (IS_ERR(mountdata)) + return (struct vfsmount *)mountdata; + + mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + kfree(mountdata); + kfree(devname); + return mnt; + +} + +static void dump_referral(const struct dfs_info3_param *ref) +{ + cFYI(1, "DFS: ref path: %s", ref->path_name); + cFYI(1, "DFS: node path: %s", ref->node_name); + cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type); + cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, + ref->path_consumed); +} + +/* + * Create a vfsmount that we can automount + */ +static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) +{ + struct dfs_info3_param *referrals = NULL; + unsigned int num_referrals = 0; + struct cifs_sb_info *cifs_sb; + struct cifs_ses *ses; + char *full_path; + int xid, i; + int rc; + struct vfsmount *mnt; + struct tcon_link *tlink; + + cFYI(1, "in %s", __func__); + 
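+	/*
+	 * Outline of the automount steps below: build the UNC path for this
+	 * dentry, ask the server for DFS referrals on that path, then try
+	 * each referral in turn via cifs_dfs_do_refmount() until one yields
+	 * a usable vfsmount.  As a purely hypothetical example, a parent
+	 * mount with options "user=guest,unc=\\srvA\dfsroot" that is
+	 * referred to \srvB\share would be submounted with options of
+	 * roughly the form
+	 * "user=guest,ip=<srvB address>,unc=\\srvB\share[,prefixpath=...]",
+	 * as composed by cifs_compose_mount_options() above.
+	 */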
BUG_ON(IS_ROOT(mntpt)); + + /* + * The MSDFS spec states that paths in DFS referral requests and + * responses must be prefixed by a single '\' character instead of + * the double backslashes usually used in the UNC. This function + * gives us the latter, so we must adjust the result. + */ + mnt = ERR_PTR(-ENOMEM); + full_path = build_path_from_dentry(mntpt); + if (full_path == NULL) + goto cdda_exit; + + cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + mnt = ERR_CAST(tlink); + goto free_full_path; + } + ses = tlink_tcon(tlink)->ses; + + xid = GetXid(); + rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, + &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + FreeXid(xid); + + cifs_put_tlink(tlink); + + mnt = ERR_PTR(-ENOENT); + for (i = 0; i < num_referrals; i++) { + int len; + dump_referral(referrals + i); + /* connect to a node */ + len = strlen(referrals[i].node_name); + if (len < 2) { + cERROR(1, "%s: Net Address path too short: %s", + __func__, referrals[i].node_name); + mnt = ERR_PTR(-EINVAL); + break; + } + mnt = cifs_dfs_do_refmount(cifs_sb, + full_path, referrals + i); + cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, + referrals[i].node_name, mnt); + if (!IS_ERR(mnt)) + goto success; + } + + /* no valid submounts were found; return error from get_dfs_path() by + * preference */ + if (rc != 0) + mnt = ERR_PTR(rc); + +success: + free_dfs_info_array(referrals, num_referrals); +free_full_path: + kfree(full_path); +cdda_exit: + cFYI(1, "leaving %s" , __func__); + return mnt; +} + +/* + * Attempt to automount the referral + */ +struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + cFYI(1, "in %s", __func__); + + newmnt = cifs_dfs_do_automount(path->dentry); + if (IS_ERR(newmnt)) { + cFYI(1, "leaving %s [automount failed]" , __func__); + return newmnt; + } + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; +} + +const struct inode_operations cifs_dfs_referral_inode_operations = { +}; diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h new file mode 100644 index 00000000..7260e11e --- /dev/null +++ b/fs/cifs/cifs_fs_sb.h @@ -0,0 +1,65 @@ +/* + * fs/cifs/cifs_fs_sb.h + * + * Copyright (c) International Business Machines Corp., 2002,2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#include + +#ifndef _CIFS_FS_SB_H +#define _CIFS_FS_SB_H + +#include + +#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ +#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. 
*/ +#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ +#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ +#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ +#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ +#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible*/ +#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */ +#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */ +#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ +#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */ +#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */ +#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ +#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ +#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ +#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ +#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ +#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ +#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ +#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ + +struct cifs_sb_info { + struct rb_root tlink_tree; + spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; + struct nls_table *local_nls; + unsigned int rsize; + unsigned int wsize; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ + atomic_t active; + uid_t mnt_uid; + gid_t mnt_gid; + mode_t mnt_file_mode; + mode_t mnt_dir_mode; + unsigned int mnt_cifs_flags; + char *mountdata; /* options received at mount time or via DFS refs */ + struct backing_dev_info bdi; + struct delayed_work prune_tlinks; +}; +#endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c new file mode 100644 index 00000000..2272fd5f --- /dev/null +++ b/fs/cifs/cifs_spnego.c @@ -0,0 +1,175 @@ +/* + * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS + * + * Copyright (c) 2007 Red Hat, Inc. + * Author(s): Jeff Layton (jlayton@redhat.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "cifsglob.h" +#include "cifs_spnego.h" +#include "cifs_debug.h" + +/* create a new cifs key */ +static int +cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) +{ + char *payload; + int ret; + + ret = -ENOMEM; + payload = kmalloc(datalen, GFP_KERNEL); + if (!payload) + goto error; + + /* attach the data */ + memcpy(payload, data, datalen); + key->payload.data = payload; + ret = 0; + +error: + return ret; +} + +static void +cifs_spnego_key_destroy(struct key *key) +{ + kfree(key->payload.data); +} + + +/* + * keytype for CIFS spnego keys + */ +struct key_type cifs_spnego_key_type = { + .name = "cifs.spnego", + .instantiate = cifs_spnego_key_instantiate, + .match = user_match, + .destroy = cifs_spnego_key_destroy, + .describe = user_describe, +}; + +/* length of longest version string e.g. strlen("ver=0xFF") */ +#define MAX_VER_STR_LEN 8 + +/* length of longest security mechanism name, eg in future could have + * strlen(";sec=ntlmsspi") */ +#define MAX_MECH_STR_LEN 13 + +/* strlen of "host=" */ +#define HOST_KEY_LEN 5 + +/* strlen of ";ip4=" or ";ip6=" */ +#define IP_KEY_LEN 5 + +/* strlen of ";uid=0x" */ +#define UID_KEY_LEN 7 + +/* strlen of ";creduid=0x" */ +#define CREDUID_KEY_LEN 11 + +/* strlen of ";user=" */ +#define USER_KEY_LEN 6 + +/* strlen of ";pid=0x" */ +#define PID_KEY_LEN 7 + +/* get a key struct with a SPNEGO security blob, suitable for session setup */ +struct key * +cifs_get_spnego_key(struct cifs_ses *sesInfo) +{ + struct TCP_Server_Info *server = sesInfo->server; + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + char *description, *dp; + size_t desc_len; + struct key *spnego_key; + const char *hostname = server->hostname; + + /* length of fields (with semicolons): ver=0xyz ip4=ipaddress + host=hostname sec=mechanism uid=0xFF user=username */ + desc_len = MAX_VER_STR_LEN + + HOST_KEY_LEN + strlen(hostname) + + IP_KEY_LEN + INET6_ADDRSTRLEN + + MAX_MECH_STR_LEN + + UID_KEY_LEN + (sizeof(uid_t) * 2) + + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + + USER_KEY_LEN + strlen(sesInfo->user_name) + + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; + + spnego_key = ERR_PTR(-ENOMEM); + description = kzalloc(desc_len, GFP_KERNEL); + if (description == NULL) + goto out; + + dp = description; + /* start with version and hostname portion of UNC string */ + spnego_key = ERR_PTR(-EINVAL); + sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, + hostname); + dp = description + strlen(description); + + /* add the server address */ + if (server->dstaddr.ss_family == AF_INET) + sprintf(dp, "ip4=%pI4", &sa->sin_addr); + else if (server->dstaddr.ss_family == AF_INET6) + sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); + else + goto out; + + dp = description + strlen(description); + + /* for now, only sec=krb5 and sec=mskrb5 are valid */ + if (server->sec_kerberos) + sprintf(dp, ";sec=krb5"); + else if (server->sec_mskerberos) + sprintf(dp, ";sec=mskrb5"); + else + goto out; + + dp = description + strlen(description); + sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); + + dp = description + strlen(description); + sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + + dp = description + 
strlen(description); + sprintf(dp, ";user=%s", sesInfo->user_name); + + dp = description + strlen(description); + sprintf(dp, ";pid=0x%x", current->pid); + + cFYI(1, "key description = %s", description); + spnego_key = request_key(&cifs_spnego_key_type, description, ""); + +#ifdef CONFIG_CIFS_DEBUG2 + if (cifsFYI && !IS_ERR(spnego_key)) { + struct cifs_spnego_msg *msg = spnego_key->payload.data; + cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U, + msg->secblob_len + msg->sesskey_len)); + } +#endif /* CONFIG_CIFS_DEBUG2 */ + +out: + kfree(description); + return spnego_key; +} diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h new file mode 100644 index 00000000..31bef9ee --- /dev/null +++ b/fs/cifs/cifs_spnego.h @@ -0,0 +1,47 @@ +/* + * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS + * + * Copyright (c) 2007 Red Hat, Inc. + * Author(s): Jeff Layton (jlayton@redhat.com) + * Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFS_SPNEGO_H +#define _CIFS_SPNEGO_H + +#define CIFS_SPNEGO_UPCALL_VERSION 2 + +/* + * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. + * The flags field is for future use. The request-key callout should set + * sesskey_len and secblob_len, and then concatenate the SessKey+SecBlob + * and stuff it in the data field. + */ +struct cifs_spnego_msg { + uint32_t version; + uint32_t flags; + uint32_t sesskey_len; + uint32_t secblob_len; + uint8_t data[1]; +}; + +#ifdef __KERNEL__ +extern struct key_type cifs_spnego_key_type; +extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo); +#endif /* KERNEL */ + +#endif /* _CIFS_SPNEGO_H */ diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c new file mode 100644 index 00000000..1b2e180b --- /dev/null +++ b/fs/cifs/cifs_unicode.c @@ -0,0 +1,332 @@ +/* + * fs/cifs/cifs_unicode.c + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include "cifs_unicode.h" +#include "cifs_uniupr.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" + +/* + * cifs_ucs2_bytes - how long will a string be after conversion? + * @ucs - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage + * + * Walk a ucs2le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + */ +int +cifs_ucs2_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; + + for (i = 0; i < maxwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + outlen++; + } + + return outlen; +} + +/* + * cifs_mapchar - convert a host-endian char to proper char in codepage + * @target - where converted character should be copied + * @src_char - 2 byte host-endian source character + * @cp - codepage to which character should be converted + * @mapchar - should character be mapped according to mapchars mount option? + * + * This function handles the conversion of a single character. It is the + * responsibility of the caller to ensure that the target buffer is large + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). + */ +static int +cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, + bool mapchar) +{ + int len = 1; + + if (!mapchar) + goto cp_convert; + + /* + * BB: Cannot handle remapping UNI_SLASH until all the calls to + * build_path_from_dentry are modified, as they use slash as + * separator. + */ + switch (src_char) { + case UNI_COLON: + *target = ':'; + break; + case UNI_ASTERISK: + *target = '*'; + break; + case UNI_QUESTION: + *target = '?'; + break; + case UNI_PIPE: + *target = '|'; + break; + case UNI_GRTRTHAN: + *target = '>'; + break; + case UNI_LESSTHAN: + *target = '<'; + break; + default: + goto cp_convert; + } + +out: + return len; + +cp_convert: + len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); + if (len <= 0) { + *target = '?'; + len = 1; + } + goto out; +} + +/* + * cifs_from_ucs2 - convert utf16le string to local charset + * @to - destination buffer + * @from - source buffer + * @tolen - destination buffer size (in bytes) + * @fromlen - source buffer size (in bytes) + * @codepage - codepage to which characters should be converted + * @mapchar - should characters be remapped according to the mapchars option? + * + * Convert a little-endian ucs2le string (as sent by the server) to a string + * in the provided codepage. The tolen and fromlen parameters are to ensure + * that the code doesn't walk off of the end of the buffer (which is always + * a danger if the alignment of the source buffer is off). The destination + * string is always properly null terminated and fits in the destination + * buffer. Returns the length of the destination string in bytes (including + * null terminator). 
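+ *
+ * For example, assuming an ASCII-compatible destination codepage and a
+ * sufficiently large destination buffer, the 4-byte UTF-16LE input
+ * {0x61, 0x00, 0x62, 0x00} ("ab") converts to the 3-byte output "ab\0"
+ * and the return value is 3.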
+ * + * Note that some windows versions actually send multiword UTF-16 characters + * instead of straight UCS-2. The linux nls routines however aren't able to + * deal with those characters properly. In the event that we get some of + * those characters, they won't be translated properly. + */ +int +cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar) +{ + int i, charlen, safelen; + int outlen = 0; + int nullsize = nls_nullsize(codepage); + int fromwords = fromlen / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; + + /* + * because the chars can be of varying widths, we need to take care + * not to overflow the destination buffer when we get close to the + * end of it. Until we get to this offset, we don't need to check + * for overflow however. + */ + safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); + + for (i = 0; i < fromwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + /* + * check to see if converting this character might make the + * conversion bleed into the null terminator + */ + if (outlen >= safelen) { + charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); + if ((outlen + charlen) > (tolen - nullsize)) + break; + } + + /* put converted char into 'to' buffer */ + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); + outlen += charlen; + } + + /* properly null-terminate string */ + for (i = 0; i < nullsize; i++) + to[outlen++] = 0; + + return outlen; +} + +/* + * NAME: cifs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +int +cifs_strtoUCS(__le16 *to, const char *from, int len, + const struct nls_table *codepage) +{ + int charlen; + int i; + wchar_t wchar_to; /* needed to quiet sparse */ + + for (i = 0; len && *from; i++, from += charlen, len -= charlen) { + charlen = codepage->char2uni(from, len, &wchar_to); + if (charlen < 1) { + cERROR(1, "strtoUCS: char2uni of 0x%x returned %d", + *from, charlen); + /* A question mark */ + wchar_to = 0x003f; + charlen = 1; + } + put_unaligned_le16(wchar_to, &to[i]); + } + + put_unaligned_le16(0, &to[i]); + return i; +} + +/* + * cifs_strndup_from_ucs - copy a string from wire format to the local codepage + * @src - source string + * @maxlen - don't walk past this many bytes in the source string + * @is_unicode - is this a unicode string? + * @codepage - destination codepage + * + * Take a string given by the server, convert it to the local codepage and + * put it in a new buffer. Returns a pointer to the new string or NULL on + * error. + */ +char * +cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, + const struct nls_table *codepage) +{ + int len; + char *dst; + + if (is_unicode) { + len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage); + len += nls_nullsize(codepage); + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage, + false); + } else { + len = strnlen(src, maxlen); + len++; + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + strlcpy(dst, src, len); + } + + return dst; +} + +/* + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping up the six characters that are + * only legal in POSIX-like OS (if they are present in the string). 
Path + * names are little endian 16 bit Unicode on the wire + */ +int +cifsConvertToUCS(__le16 *target, const char *source, int srclen, + const struct nls_table *cp, int mapChars) +{ + int i, j, charlen; + char src_char; + __le16 dst_char; + wchar_t tmp; + + if (!mapChars) + return cifs_strtoUCS(target, source, PATH_MAX, cp); + + for (i = 0, j = 0; i < srclen; j++) { + src_char = source[i]; + charlen = 1; + switch (src_char) { + case 0: + put_unaligned(0, &target[j]); + goto ctoUCS_out; + case ':': + dst_char = cpu_to_le16(UNI_COLON); + break; + case '*': + dst_char = cpu_to_le16(UNI_ASTERISK); + break; + case '?': + dst_char = cpu_to_le16(UNI_QUESTION); + break; + case '<': + dst_char = cpu_to_le16(UNI_LESSTHAN); + break; + case '>': + dst_char = cpu_to_le16(UNI_GRTRTHAN); + break; + case '|': + dst_char = cpu_to_le16(UNI_PIPE); + break; + /* + * FIXME: We can not handle remapping backslash (UNI_SLASH) + * until all the calls to build_path_from_dentry are modified, + * as they use backslash as separator. + */ + default: + charlen = cp->char2uni(source + i, srclen - i, &tmp); + dst_char = cpu_to_le16(tmp); + + /* + * if no match, use question mark, which at least in + * some cases serves as wild card + */ + if (charlen < 1) { + dst_char = cpu_to_le16(0x003f); + charlen = 1; + } + } + /* + * character may take more than one byte in the source string, + * but will take exactly two bytes in the target string + */ + i += charlen; + put_unaligned(dst_char, &target[j]); + } + +ctoUCS_out: + return i; +} + diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h new file mode 100644 index 00000000..6d02fd56 --- /dev/null +++ b/fs/cifs/cifs_unicode.h @@ -0,0 +1,383 @@ +/* + * cifs_unicode: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Notes: + * These APIs are based on the C library functions. The semantics + * should match the C functions but with expanded size operands. + * + * The upper/lower functions are based on a table created by mkupr. + * This is a compressed table of upper and lower case conversion. + * + */ +#ifndef _CIFS_UNICODE_H +#define _CIFS_UNICODE_H + +#include +#include +#include + +#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ + +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERISK (__u16) ('*' + 0xF000) +#define UNI_QUESTION (__u16) ('?' 
+ 0xF000) +#define UNI_COLON (__u16) (':' + 0xF000) +#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) +#define UNI_LESSTHAN (__u16) ('<' + 0xF000) +#define UNI_PIPE (__u16) ('|' + 0xF000) +#define UNI_SLASH (__u16) ('\\' + 0xF000) + +/* Just define what we want from uniupr.h. We don't want to define the tables + * in each source file. + */ +#ifndef UNICASERANGE_DEFINED +struct UniCaseRange { + wchar_t start; + wchar_t end; + signed char *table; +}; +#endif /* UNICASERANGE_DEFINED */ + +#ifndef UNIUPR_NOUPPER +extern signed char CifsUniUpperTable[512]; +extern const struct UniCaseRange CifsUniUpperRange[]; +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +extern signed char CifsUniLowerTable[512]; +extern const struct UniCaseRange CifsUniLowerRange[]; +#endif /* UNIUPR_NOLOWER */ + +#ifdef __KERNEL__ +int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar); +int cifs_ucs2_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage); +int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); +char *cifs_strndup_from_ucs(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage); +extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars); + +#endif + +/* + * UniStrcat: Concatenate the second string to the first + * + * Returns: + * Address of the first string + */ +static inline wchar_t * +UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + + while (*ucs1++) ; /* To end of first string */ + ucs1--; /* Return to the null */ + while ((*ucs1++ = *ucs2++)) ; /* copy string 2 over */ + return anchor; +} + +/* + * UniStrchr: Find a character in a string + * + * Returns: + * Address of first occurrence of character in string + * or NULL if the character is not in the string + */ +static inline wchar_t * +UniStrchr(const wchar_t *ucs, wchar_t uc) +{ + while ((*ucs != uc) && *ucs) + ucs++; + + if (*ucs == uc) + return (wchar_t *) ucs; + return NULL; +} + +/* + * UniStrcmp: Compare two strings + * + * Returns: + * < 0: First string is less than second + * = 0: Strings are equal + * > 0: First string is greater than second + */ +static inline int +UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) +{ + while ((*ucs1 == *ucs2) && *ucs1) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) *ucs2; +} + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t * +UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)) ; + return anchor; +} + +/* + * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) + */ +static inline size_t +UniStrlen(const wchar_t *ucs1) +{ + int i = 0; + + while (*ucs1++) + i++; + return i; +} + +/* + * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a + * string (length limited) + */ +static inline size_t +UniStrnlen(const wchar_t *ucs1, int maxlen) +{ + int i = 0; + + while (*ucs1++) { + i++; + if (i >= maxlen) + break; + } + return i; +} + +/* + * UniStrncat: Concatenate length limited string + */ +static inline wchar_t * +UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; /* save pointer to string 1 */ + + while (*ucs1++) ; + ucs1--; /* point to null terminator of s1 */ + while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ + ucs1++; + ucs2++; + } + 
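+	/*
+	 * As with strncat(), at most n characters of ucs2 are appended and
+	 * the result is always terminated below; e.g. UniStrncat(dst, src, 2)
+	 * appends at most two wide characters of src before the terminator
+	 * is written.
+	 */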
*ucs1 = 0; /* Null terminate the result */ + return (anchor); +} + +/* + * UniStrncmp: Compare length limited string + */ +static inline int +UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == *ucs2) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) *ucs2; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int +UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t * +UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t * +UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrstr: Find a string in a string + * + * Returns: + * Address of first match found + * NULL if no matching string is found + */ +static inline wchar_t * +UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) +{ + const wchar_t *anchor1 = ucs1; + const wchar_t *anchor2 = ucs2; + + while (*ucs1) { + if (*ucs1 == *ucs2) { + /* Partial match found */ + ucs1++; + ucs2++; + } else { + if (!*ucs2) /* Match found */ + return (wchar_t *) anchor1; + ucs1 = ++anchor1; /* No match */ + ucs2 = anchor2; + } + } + + if (!*ucs2) /* Both end together */ + return (wchar_t *) anchor1; /* Match found */ + return NULL; /* No match */ +} + +#ifndef UNIUPR_NOUPPER +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t +UniToupper(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniUpperTable)) { + /* Latin characters */ + return uc + CifsUniUpperTable[uc]; /* Use base tables */ + } else { + rp = CifsUniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t * +UniStrupr(register wchar_t *upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +/* + * UniTolower: Convert a unicode character to lower case + */ +static inline wchar_t +UniTolower(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniLowerTable)) { + /* Latin characters */ + return uc + CifsUniLowerTable[uc]; /* Use base tables */ + } else { + rp = CifsUniLowerRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - 
rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + +/* + * UniStrlwr: Lower case a unicode string + */ +static inline wchar_t * +UniStrlwr(register wchar_t *upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniTolower(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif + +#endif /* _CIFS_UNICODE_H */ diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h new file mode 100644 index 00000000..0ac7c5a8 --- /dev/null +++ b/fs/cifs/cifs_uniupr.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) International Business Machines Corp., 2000,2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * uniupr.h - Unicode compressed case ranges + * +*/ + +#ifndef UNIUPR_NOUPPER +/* + * Latin upper case + */ +signed char CifsUniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 060-06f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 0e0-0ef */ + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 
0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 3b0-3bf */ + -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, + -63, -63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 430-43f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 440-44f */ + 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* ff40-ff4f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, +}; + +/* + * Upper Case Range + */ +const struct UniCaseRange CifsUniUpperRange[] = { + {0x03a0, 0x03ce, UniCaseRangeU03a0}, + {0x0430, 0x045f, UniCaseRangeU0430}, + {0x0490, 0x04cc, UniCaseRangeU0490}, + {0x1e00, 0x1ffc, UniCaseRangeU1e00}, + {0xff40, 0xff5a, UniCaseRangeUff40}, + {0} +}; +#endif + +#ifndef UNIUPR_NOLOWER +/* + * Latin lower case + */ +signed char CifsUniLowerTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 040-04f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, /* 050-05f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0c0-0cf */ + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0, /* 0d0-0df */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ + 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0, /* 170-17f */ + 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0, /* 180-18f */ + 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ + 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ + 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ +}; + +/* Lower case range - Greek */ +static signed char UniCaseRangeL0380[44] = { + 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 390-39f */ + 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* Lower case range - Cyrillic */ +static signed char UniCaseRangeL0400[48] = { + 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80, /* 400-40f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 410-41f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 420-42f */ +}; + +/* Lower case range - Extended cyrillic */ +static signed char UniCaseRangeL0490[60] = { + 1, 0, 1, 0, 1, 0, 1, 0, 
1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ + 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, +}; + +/* Lower case range - Extended latin and greek */ +static signed char UniCaseRangeL1e00[504] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ + 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0, /* 1fc0-1fcf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Lower case range - Wide latin */ +static signed char UniCaseRangeLff20[27] = { + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* ff20-ff2f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* + * Lower Case Range + */ +const struct UniCaseRange CifsUniLowerRange[] = { + {0x0380, 0x03ab, UniCaseRangeL0380}, + {0x0400, 0x042f, UniCaseRangeL0400}, + {0x0490, 0x04cb, UniCaseRangeL0490}, + {0x1e00, 0x1ff7, UniCaseRangeL1e00}, + {0xff20, 0xff3a, UniCaseRangeLff20}, + {0} +}; +#endif diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c new file mode 100644 index 00000000..21de1d6d --- /dev/null +++ b/fs/cifs/cifsacl.c @@ -0,0 +1,1142 @@ +/* + * fs/cifs/cifsacl.c + * + * Copyright (C) International Business Machines Corp., 2007,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Contains the routines for mapping CIFS/NTFS ACLs + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; 
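The range arrays above are consulted as sparse delta tables: each UniCaseRange entry names a span of code points and points at a signed-char table whose value is added to the code point to switch case, with 0 meaning "leave unchanged" (the dense 512-entry CifsUniLowerTable covers U+0000-U+01FF the same way, indexed directly). A minimal userspace sketch of that lookup, with struct range as a simplified stand-in for the kernel's struct UniCaseRange, could look like this:

#include <stddef.h>

struct range {
	unsigned short low;          /* first code point covered     */
	unsigned short high;         /* last code point covered      */
	const signed char *delta;    /* signed offset per code point */
};

/* Return the case-changed code point; a 0 delta (or no matching range)
 * leaves the input untouched. */
static unsigned short apply_case_range(unsigned short uc,
				       const struct range *tbl)
{
	for (; tbl->delta != NULL; tbl++)
		if (uc >= tbl->low && uc <= tbl->high)
			return (unsigned short)(uc + tbl->delta[uc - tbl->low]);
	return uc;
}

For example, Cyrillic U+0430 falls in the {0x0430, 0x045f, UniCaseRangeU0430} row of CifsUniUpperRange and maps to U+0410 via a -32 delta.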
either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +/* security id for everyone/world system group */ +static const struct cifs_sid sid_everyone = { + 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; +/* security id for Authenticated Users system group */ +static const struct cifs_sid sid_authusers = { + 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; +/* group users */ +static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; + +const struct cred *root_cred; + +static void +shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, + int *nr_del) +{ + struct rb_node *node; + struct rb_node *tmp; + struct cifs_sid_id *psidid; + + node = rb_first(root); + while (node) { + tmp = node; + node = rb_next(tmp); + psidid = rb_entry(tmp, struct cifs_sid_id, rbnode); + if (nr_to_scan == 0 || *nr_del == nr_to_scan) + ++(*nr_rem); + else { + if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE) + && psidid->refcount == 0) { + rb_erase(tmp, root); + ++(*nr_del); + } else + ++(*nr_rem); + } + } +} + +/* + * Run idmap cache shrinker. + */ +static int +cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + int nr_del = 0; + int nr_rem = 0; + struct rb_root *root; + + root = &uidtree; + spin_lock(&siduidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&siduidlock); + + root = &gidtree; + spin_lock(&sidgidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&sidgidlock); + + return nr_rem; +} + +static struct shrinker cifs_shrinker = { + .shrink = cifs_idmap_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +static int +cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) +{ + char *payload; + + payload = kmalloc(datalen, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + memcpy(payload, data, datalen); + key->payload.data = payload; + return 0; +} + +static inline void +cifs_idmap_key_destroy(struct key *key) +{ + kfree(key->payload.data); +} + +struct key_type cifs_idmap_key_type = { + .name = "cifs.idmap", + .instantiate = cifs_idmap_key_instantiate, + .destroy = cifs_idmap_key_destroy, + .describe = user_describe, + .match = user_match, +}; + +static void +sid_to_str(struct cifs_sid *sidptr, char *sidstr) +{ + int i; + unsigned long saval; + char *strptr; + + strptr = sidstr; + + sprintf(strptr, "%s", "S"); + strptr = sidstr + strlen(sidstr); + + sprintf(strptr, "-%d", sidptr->revision); + strptr = sidstr + strlen(sidstr); + + for (i = 0; i < 6; ++i) { + if (sidptr->authority[i]) { + sprintf(strptr, "-%d", sidptr->authority[i]); + strptr = sidstr + strlen(sidstr); + } + } + + for (i = 0; i < sidptr->num_subauth; ++i) { + saval = le32_to_cpu(sidptr->sub_auth[i]); + sprintf(strptr, "-%ld", saval); + strptr = sidstr + strlen(sidstr); + } +} + +static void +id_rb_insert(struct 
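sid_to_str() above renders a binary SID as text: "S-<revision>" followed by each non-zero authority byte and then the sub-authorities in decimal, with SIDLEN (150) bounding the worst case. A bounds-checked userspace sketch of the same formatting, with struct sid as a simplified stand-in for struct cifs_sid (sub-authorities already in host byte order):

#include <stdio.h>
#include <stdint.h>

struct sid {
	uint8_t  revision;
	uint8_t  num_subauth;
	uint8_t  authority[6];
	uint32_t sub_auth[5];
};

static void sid_to_string(const struct sid *s, char *buf, size_t len)
{
	size_t off;
	int i;

	off = (size_t)snprintf(buf, len, "S-%u", s->revision);
	for (i = 0; i < 6; i++)              /* only non-zero authority bytes */
		if (s->authority[i] && off < len)
			off += (size_t)snprintf(buf + off, len - off, "-%u",
						(unsigned int)s->authority[i]);
	for (i = 0; i < s->num_subauth && off < len; i++)
		off += (size_t)snprintf(buf + off, len - off, "-%u",
					(unsigned int)s->sub_auth[i]);
}

For the sid_authusers constant defined above (revision 1, authority ending in 5, one sub-authority of 11) this yields "S-1-5-11"; prefixed with "os:" or "gs:" by id_rb_insert(), that string becomes the key description handed to request_key() for the cifs.idmap upcall in sid_to_id() below.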
rb_root *root, struct cifs_sid *sidptr, + struct cifs_sid_id **psidid, char *typestr) +{ + int rc; + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + rc = compare_sids(sidptr, &((lsidid)->sid)); + if (rc > 0) { + linkto = &(node->rb_left); + node = node->rb_left; + } else if (rc < 0) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sid_to_str(&(*psidid)->sid, strptr); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) +{ + int rc; + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + rc = compare_sids(sidptr, &((lsidid)->sid)); + if (rc > 0) { + node = node->rb_left; + } else if (rc < 0) { + node = node->rb_right; + } else /* node found */ + return lsidid; + } + + return NULL; +} + +static int +sidid_pending_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +static int +sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype) +{ + int rc; + unsigned long cid; + struct key *idkey; + const struct cred *saved_cred; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */ + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */ + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -ENOENT; + + spin_lock(cidlock); + psidid = id_rb_search(cidtree, psid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = id_rb_search(cidtree, psid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + id_rb_insert(cidtree, psid, &psidid, + sidtype == SIDOWNER ? "os:" : "gs:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . 
+ */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + cid = psidid->id; + psidid->time = jiffies; /* update ts for accessing */ + goto sid_to_id_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) + goto sid_to_id_out; + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(idkey)) + cFYI(1, "%s: Can't map SID to an id", __func__); + else { + cid = *(unsigned long *)idkey->payload.value; + psidid->id = cid; + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(idkey); + kfree(psidid->sidstr); + } + revert_creds(saved_cred); + psidid->time = jiffies; /* update ts for accessing */ + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; /* decremented without spinlock */ + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + cid = psidid->id; + } + +sid_to_id_out: + --psidid->refcount; /* decremented without spinlock */ + if (sidtype == SIDOWNER) + fattr->cf_uid = cid; + else + fattr->cf_gid = cid; + + return 0; +} + +int +init_cifs_idmap(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + cFYI(1, "Registering the %s key type\n", cifs_idmap_key_type.name); + + /* create an override credential set with a special thread keyring in + * which requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&cifs_idmap_key_type); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + root_cred = cred; + + spin_lock_init(&siduidlock); + uidtree = RB_ROOT; + spin_lock_init(&sidgidlock); + gidtree = RB_ROOT; + + register_shrinker(&cifs_shrinker); + + cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void +exit_cifs_idmap(void) +{ + key_revoke(root_cred->thread_keyring); + unregister_key_type(&cifs_idmap_key_type); + put_cred(root_cred); + unregister_shrinker(&cifs_shrinker); + cFYI(1, "Unregistered %s key type\n", cifs_idmap_key_type.name); +} + +void +cifs_destroy_idmaptrees(void) +{ + struct rb_root *root; + struct rb_node *node; + + root = &uidtree; + spin_lock(&siduidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&siduidlock); + + root = &gidtree; + spin_lock(&sidgidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&sidgidlock); +} + +/* if the two SIDs (roughly equivalent to a UUID for a user or group) are + the same returns 1, if they do not match returns 0 */ +int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) +{ + int i; + int 
num_subauth, num_sat, num_saw; + + if ((!ctsid) || (!cwsid)) + return 1; + + /* compare the revision */ + if (ctsid->revision != cwsid->revision) { + if (ctsid->revision > cwsid->revision) + return 1; + else + return -1; + } + + /* compare all of the six auth values */ + for (i = 0; i < 6; ++i) { + if (ctsid->authority[i] != cwsid->authority[i]) { + if (ctsid->authority[i] > cwsid->authority[i]) + return 1; + else + return -1; + } + } + + /* compare all of the subauth values if any */ + num_sat = ctsid->num_subauth; + num_saw = cwsid->num_subauth; + num_subauth = num_sat < num_saw ? num_sat : num_saw; + if (num_subauth) { + for (i = 0; i < num_subauth; ++i) { + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (le32_to_cpu(ctsid->sub_auth[i]) > + le32_to_cpu(cwsid->sub_auth[i])) + return 1; + else + return -1; + } + } + } + + return 0; /* sids compare/match */ +} + + +/* copy ntsd, owner sid, and group sid from a security descriptor to another */ +static void copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, __u32 sidsoffset) +{ + int i; + + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + + /* copy security descriptor control portion */ + pnntsd->revision = pntsd->revision; + pnntsd->type = pntsd->type; + pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd)); + pnntsd->sacloffset = 0; + pnntsd->osidoffset = cpu_to_le32(sidsoffset); + pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); + + /* copy owner sid */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); + + nowner_sid_ptr->revision = owner_sid_ptr->revision; + nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth; + for (i = 0; i < 6; i++) + nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i]; + for (i = 0; i < 5; i++) + nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i]; + + /* copy group sid */ + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + + sizeof(struct cifs_sid)); + + ngroup_sid_ptr->revision = group_sid_ptr->revision; + ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth; + for (i = 0; i < 6; i++) + ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; + for (i = 0; i < 5; i++) + ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; + + return; +} + + +/* + change posix mode to reflect permissions + pmode is the existing mode (we only want to overwrite part of this + bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007 +*/ +static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, + umode_t *pbits_to_set) +{ + __u32 flags = le32_to_cpu(ace_flags); + /* the order of ACEs is important. 
The canonical order is to begin with + DENY entries followed by ALLOW, otherwise an allow entry could be + encountered first, making the subsequent deny entry like "dead code" + which would be superflous since Windows stops when a match is made + for the operation you are trying to perform for your user */ + + /* For deny ACEs we change the mask so that subsequent allow access + control entries do not turn on the bits we are denying */ + if (type == ACCESS_DENIED) { + if (flags & GENERIC_ALL) + *pbits_to_set &= ~S_IRWXUGO; + + if ((flags & GENERIC_WRITE) || + ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) + *pbits_to_set &= ~S_IWUGO; + if ((flags & GENERIC_READ) || + ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) + *pbits_to_set &= ~S_IRUGO; + if ((flags & GENERIC_EXECUTE) || + ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) + *pbits_to_set &= ~S_IXUGO; + return; + } else if (type != ACCESS_ALLOWED) { + cERROR(1, "unknown access control type %d", type); + return; + } + /* else ACCESS_ALLOWED type */ + + if (flags & GENERIC_ALL) { + *pmode |= (S_IRWXUGO & (*pbits_to_set)); + cFYI(DBG2, "all perms"); + return; + } + if ((flags & GENERIC_WRITE) || + ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) + *pmode |= (S_IWUGO & (*pbits_to_set)); + if ((flags & GENERIC_READ) || + ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) + *pmode |= (S_IRUGO & (*pbits_to_set)); + if ((flags & GENERIC_EXECUTE) || + ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) + *pmode |= (S_IXUGO & (*pbits_to_set)); + + cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode); + return; +} + +/* + Generate access flags to reflect permissions mode is the existing mode. + This function is called for every ACE in the DACL whose SID matches + with either owner or group or everyone. 
+*/ + +static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, + __u32 *pace_flags) +{ + /* reset access mask */ + *pace_flags = 0x0; + + /* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */ + mode &= bits_to_use; + + /* check for R/W/X UGO since we do not know whose flags + is this but we have cleared all the bits sans RWX for + either user or group or other as per bits_to_use */ + if (mode & S_IRUGO) + *pace_flags |= SET_FILE_READ_RIGHTS; + if (mode & S_IWUGO) + *pace_flags |= SET_FILE_WRITE_RIGHTS; + if (mode & S_IXUGO) + *pace_flags |= SET_FILE_EXEC_RIGHTS; + + cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags); + return; +} + +static __u16 fill_ace_for_sid(struct cifs_ace *pntace, + const struct cifs_sid *psid, __u64 nmode, umode_t bits) +{ + int i; + __u16 size = 0; + __u32 access_req = 0; + + pntace->type = ACCESS_ALLOWED; + pntace->flags = 0x0; + mode_to_access_flags(nmode, bits, &access_req); + if (!access_req) + access_req = SET_MINIMUM_RIGHTS; + pntace->access_req = cpu_to_le32(access_req); + + pntace->sid.revision = psid->revision; + pntace->sid.num_subauth = psid->num_subauth; + for (i = 0; i < 6; i++) + pntace->sid.authority[i] = psid->authority[i]; + for (i = 0; i < psid->num_subauth; i++) + pntace->sid.sub_auth[i] = psid->sub_auth[i]; + + size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); + pntace->size = cpu_to_le16(size); + + return size; +} + + +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_ace(struct cifs_ace *pace, char *end_of_acl) +{ + int num_subauth; + + /* validate that we do not go past end of acl */ + + if (le16_to_cpu(pace->size) < 16) { + cERROR(1, "ACE too small %d", le16_to_cpu(pace->size)); + return; + } + + if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { + cERROR(1, "ACL too small to parse ACE"); + return; + } + + num_subauth = pace->sid.num_subauth; + if (num_subauth) { + int i; + cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d", + pace->sid.revision, pace->sid.num_subauth, pace->type, + pace->flags, le16_to_cpu(pace->size)); + for (i = 0; i < num_subauth; ++i) { + cFYI(1, "ACE sub_auth[%d]: 0x%x", i, + le32_to_cpu(pace->sid.sub_auth[i])); + } + + /* BB add length check to make sure that we do not have huge + num auths and therefore go off the end */ + } + + return; +} +#endif + + +static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + struct cifs_fattr *fattr) +{ + int i; + int num_aces = 0; + int acl_size; + char *acl_base; + struct cifs_ace **ppace; + + /* BB need to add parm so we can store the SID BB */ + + if (!pdacl) { + /* no DACL in the security descriptor, set + all the permissions for user/group/other */ + fattr->cf_mode |= S_IRWXUGO; + return; + } + + /* validate that we do not go past end of acl */ + if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + cERROR(1, "ACL too small to parse DACL"); + return; + } + + cFYI(DBG2, "DACL revision %d size %d num aces %d", + le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), + le32_to_cpu(pdacl->num_aces)); + + /* reset rwx permissions for user/group/other. + Also, if num_aces is 0 i.e. 
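The comment before access_flags_to_mode() explains the point of the two masks: DENY entries come first in a canonical DACL, and rather than clearing mode bits directly they shrink the bits_to_set mask so that a later ALLOW entry cannot re-grant what was denied. A self-contained userspace illustration of that evaluation order (the flag values are simplified placeholders, not the GENERIC_*/FILE_*_RIGHTS masks from cifspdu.h):

#include <stdio.h>

#define ACE_ALLOW 0
#define ACE_DENY  1
#define R 04
#define W 02
#define X 01

struct ace { int type; int rwx; };

int main(void)
{
	/* canonical order: DENY entries first, then ALLOW */
	struct ace dacl[] = {
		{ ACE_DENY,  W },         /* deny write            */
		{ ACE_ALLOW, R | W | X }, /* allow everything else */
	};
	int mode = 0, bits_to_set = R | W | X;
	size_t i;

	for (i = 0; i < sizeof(dacl) / sizeof(dacl[0]); i++) {
		if (dacl[i].type == ACE_DENY)
			bits_to_set &= ~dacl[i].rwx;  /* later allows cannot re-grant */
		else
			mode |= dacl[i].rwx & bits_to_set;
	}
	printf("mode = 0%o\n", mode);   /* prints 05: read and execute, no write */
	return 0;
}

parse_dacl() below applies the same logic once per matching SID (owner, group, everyone/authenticated users), each with its own bits_to_set mask.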
DACL has no ACEs, + user/group/other have no permissions */ + fattr->cf_mode &= ~(S_IRWXUGO); + + acl_base = (char *)pdacl; + acl_size = sizeof(struct cifs_acl); + + num_aces = le32_to_cpu(pdacl->num_aces); + if (num_aces > 0) { + umode_t user_mask = S_IRWXU; + umode_t group_mask = S_IRWXG; + umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; + + ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), + GFP_KERNEL); + if (!ppace) { + cERROR(1, "DACL memory allocation error"); + return; + } + + for (i = 0; i < num_aces; ++i) { + ppace[i] = (struct cifs_ace *) (acl_base + acl_size); +#ifdef CONFIG_CIFS_DEBUG2 + dump_ace(ppace[i], end_of_acl); +#endif + if (compare_sids(&(ppace[i]->sid), pownersid) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &user_mask); + if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &group_mask); + if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + + +/* memcpy((void *)(&(cifscred->aces[i])), + (void *)ppace[i], + sizeof(struct cifs_ace)); */ + + acl_base = (char *)ppace[i]; + acl_size = le16_to_cpu(ppace[i]->size); + } + + kfree(ppace); + } + + return; +} + + +static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid, __u64 nmode) +{ + u16 size = 0; + struct cifs_acl *pnndacl; + + pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + + size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), + pownersid, nmode, S_IRWXU); + size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), + pgrpsid, nmode, S_IRWXG); + size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), + &sid_everyone, nmode, S_IRWXO); + + pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); + pndacl->num_aces = cpu_to_le32(3); + + return 0; +} + + +static int parse_sid(struct cifs_sid *psid, char *end_of_acl) +{ + /* BB need to add parm so we can store the SID BB */ + + /* validate that we do not go past end of ACL - sid must be at least 8 + bytes long (assuming no sub-auths - e.g. 
the null SID */ + if (end_of_acl < (char *)psid + 8) { + cERROR(1, "ACL too small to parse SID %p", psid); + return -EINVAL; + } + + if (psid->num_subauth) { +#ifdef CONFIG_CIFS_DEBUG2 + int i; + cFYI(1, "SID revision %d num_auth %d", + psid->revision, psid->num_subauth); + + for (i = 0; i < psid->num_subauth; i++) { + cFYI(1, "SID sub_auth[%d]: 0x%x ", i, + le32_to_cpu(psid->sub_auth[i])); + } + + /* BB add length check to make sure that we do not have huge + num auths and therefore go off the end */ + cFYI(1, "RID 0x%x", + le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); +#endif + } + + return 0; +} + + +/* Convert CIFS ACL to POSIX form */ +static int parse_sec_desc(struct cifs_sb_info *cifs_sb, + struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr) +{ + int rc = 0; + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ + char *end_of_acl = ((char *)pntsd) + acl_len; + __u32 dacloffset; + + if (pntsd == NULL) + return -EIO; + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x " + "sacloffset 0x%x dacloffset 0x%x", + pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), + le32_to_cpu(pntsd->gsidoffset), + le32_to_cpu(pntsd->sacloffset), dacloffset); +/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ + rc = parse_sid(owner_sid_ptr, end_of_acl); + if (rc) { + cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); + if (rc) { + cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc); + return rc; + } + + rc = parse_sid(group_sid_ptr, end_of_acl); + if (rc) { + cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); + if (rc) { + cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc); + return rc; + } + + if (dacloffset) + parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, + group_sid_ptr, fattr); + else + cFYI(1, "no ACL"); /* BB grant all or default perms? 
*/ + +/* cifscred->uid = owner_sid_ptr->rid; + cifscred->gid = group_sid_ptr->rid; + memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, + sizeof(struct cifs_sid)); + memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, + sizeof(struct cifs_sid)); */ + + return rc; +} + + +/* Convert permission bits from mode to equivalent CIFS ACL */ +static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, + struct inode *inode, __u64 nmode) +{ + int rc = 0; + __u32 dacloffset; + __u32 ndacloffset; + __u32 sidsoffset; + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ + struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + + if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) + return -EIO; + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); + + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + + /* copy security descriptor control portion and owner and group sid */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + + return rc; +} + +static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + __u16 fid, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int xid, rc; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + xid = GetXid(); + rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); + FreeXid(xid); + + cifs_put_tlink(tlink); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int oplock = 0; + int xid, rc; + __u16 fid; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + tcon = tlink_tcon(tlink); + xid = GetXid(); + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc) { + rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid); + } + + cifs_put_tlink(tlink); + FreeXid(xid); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +/* Retrieve an ACL from the server */ +struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return get_cifs_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); + cifsFileInfo_put(open_file); + return pntsd; +} + +static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, + struct cifs_ntsd *pnntsd, u32 acllen) +{ + int xid, rc; + struct tcon_link *tlink = 
cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + xid = GetXid(); + rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen); + FreeXid(xid); + cifs_put_tlink(tlink); + + cFYI(DBG2, "SetCIFSACL rc = %d", rc); + return rc; +} + +static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, + struct cifs_ntsd *pnntsd, u32 acllen) +{ + int oplock = 0; + int xid, rc; + __u16 fid; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + tcon = tlink_tcon(tlink); + xid = GetXid(); + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cERROR(1, "Unable to open file to set ACL"); + goto out; + } + + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + cFYI(DBG2, "SetCIFSACL rc = %d", rc); + + CIFSSMBClose(xid, tcon, fid); +out: + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +/* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *open_file; + int rc; + + cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); + + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); + + rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); + cifsFileInfo_put(open_file); + return rc; +} + +/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ +int +cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + struct inode *inode, const char *path, const __u16 *pfid) +{ + struct cifs_ntsd *pntsd = NULL; + u32 acllen = 0; + int rc = 0; + + cFYI(DBG2, "converting ACL to mode for %s", path); + + if (pfid) + pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); + else + pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); + + /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { + rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); + kfree(pntsd); + if (rc) + cERROR(1, "parse sec desc failed rc = %d", rc); + } + + return rc; +} + +/* Convert mode bits to an ACL so we can update the ACL on the server */ +int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +{ + int rc = 0; + __u32 secdesclen = 0; + struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ + struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + + cFYI(DBG2, "set ACL from mode for %s", path); + + /* Get the security descriptor */ + pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); + + /* Add three ACEs for owner, group, everyone getting rid of + other ACEs as chmod disables ACEs and set the security descriptor */ + + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { + /* allocate memory for the smb header, + set security descriptor request security descriptor + parameters, and secuirty descriptor itself */ + + secdesclen = secdesclen < DEFSECDESCLEN ? 
+ DEFSECDESCLEN : secdesclen; + pnntsd = kmalloc(secdesclen, GFP_KERNEL); + if (!pnntsd) { + cERROR(1, "Unable to allocate security descriptor"); + kfree(pntsd); + return -ENOMEM; + } + + rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + + cFYI(DBG2, "build_sec_desc rc: %d", rc); + + if (!rc) { + /* Set the security descriptor */ + rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + cFYI(DBG2, "set_cifs_acl rc: %d", rc); + } + + kfree(pnntsd); + kfree(pntsd); + } + + return rc; +} diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h new file mode 100644 index 00000000..5c902c7c --- /dev/null +++ b/fs/cifs/cifsacl.h @@ -0,0 +1,103 @@ +/* + * fs/cifs/cifsacl.h + * + * Copyright (c) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSACL_H +#define _CIFSACL_H + + +#define NUM_AUTHS 6 /* number of authority fields */ +#define NUM_SUBAUTHS 5 /* number of sub authority fields */ +#define NUM_WK_SIDS 7 /* number of well known sids */ +#define SIDNAMELENGTH 20 /* long enough for the ones we care about */ +#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ + +#define READ_BIT 0x4 +#define WRITE_BIT 0x2 +#define EXEC_BIT 0x1 + +#define UBITSHIFT 6 +#define GBITSHIFT 3 + +#define ACCESS_ALLOWED 0 +#define ACCESS_DENIED 1 + +#define SIDOWNER 1 +#define SIDGROUP 2 +#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */ + +#define SID_ID_MAPPED 0 +#define SID_ID_PENDING 1 +#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ +#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ + +struct cifs_ntsd { + __le16 revision; /* revision level */ + __le16 type; + __le32 osidoffset; + __le32 gsidoffset; + __le32 sacloffset; + __le32 dacloffset; +} __attribute__((packed)); + +struct cifs_sid { + __u8 revision; /* revision level */ + __u8 num_subauth; + __u8 authority[6]; + __le32 sub_auth[5]; /* sub_auth[num_subauth] */ +} __attribute__((packed)); + +struct cifs_acl { + __le16 revision; /* revision level */ + __le16 size; + __le32 num_aces; +} __attribute__((packed)); + +struct cifs_ace { + __u8 type; + __u8 flags; + __le16 size; + __le32 access_req; + struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ +} __attribute__((packed)); + +struct cifs_wksid { + struct cifs_sid cifssid; + char sidname[SIDNAMELENGTH]; +} __attribute__((packed)); + +struct cifs_sid_id { + unsigned int refcount; /* increment with spinlock, decrement without */ + unsigned long id; + unsigned long time; + unsigned long state; + char *sidstr; + struct rb_node rbnode; + struct cifs_sid sid; +}; + +#ifdef __KERNEL__ +extern struct key_type cifs_idmap_key_type; +extern const struct cred *root_cred; +#endif /* KERNEL */ + +extern int 
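A quick check on DEFSECDESCLEN (192), the default allocation used by mode_to_cifs_acl() above: with the packed structures just defined, the self-relative descriptor built by build_sec_desc() is the cifs_ntsd header, a DACL holding three ACEs, then the owner and group SIDs, and 192 is exactly that layout with every SID carrying the full five sub-authorities. Illustrative arithmetic, not part of the patch:

#include <stdio.h>

int main(void)
{
	int ntsd = 2 + 2 + 4 + 4 + 4 + 4;   /* struct cifs_ntsd: 20 bytes */
	int sid  = 1 + 1 + 6 + 4 * 5;       /* struct cifs_sid:  28 bytes */
	int acl  = 2 + 2 + 4;               /* struct cifs_acl:   8 bytes */
	int ace  = 1 + 1 + 2 + 4 + sid;     /* struct cifs_ace:  36 bytes */

	/* ntsd header + DACL header + 3 ACEs + owner SID + group SID */
	printf("%d\n", ntsd + acl + 3 * ace + 2 * sid);   /* prints 192 */
	return 0;
}

In practice fill_ace_for_sid() emits only 16 + 4 * num_subauth bytes per ACE, so a descriptor granting to sid_everyone (one sub-authority) comes out smaller; DEFSECDESCLEN is the safe upper bound.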
compare_sids(const struct cifs_sid *, const struct cifs_sid *); + +#endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c new file mode 100644 index 00000000..5a0ee7f2 --- /dev/null +++ b/fs/cifs/cifsencrypt.c @@ -0,0 +1,762 @@ +/* + * fs/cifs/cifsencrypt.c + * + * Copyright (C) International Business Machines Corp., 2005,2006 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" +#include "cifsproto.h" +#include "ntlmssp.h" +#include +#include + +/* + * Calculate and return the CIFS signature based on the mac key and SMB PDU. + * The 16 byte signature must be allocated by the caller. Note we only use the + * 1st eight bytes and that the smb header signature field on input contains + * the sequence number before this function is called. Also, this function + * should be called with the server->srv_mutex held. + */ +static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, + struct TCP_Server_Info *server, char *signature) +{ + int rc; + + if (cifs_pdu == NULL || signature == NULL || server == NULL) + return -EINVAL; + + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Oould not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + + crypto_shash_update(&server->secmech.sdescmd5->shash, + cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + + return 0; +} + +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + int rc = 0; + char smb_signature[20]; + + if ((cifs_pdu == NULL) || (server == NULL)) + return -EINVAL; + + if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + return rc; + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(server->sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + *pexpected_response_sequence_number = server->sequence_number++; + server->sequence_number++; + + rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); + if (rc) + memset(cifs_pdu->Signature.SecuritySignature, 0, 8); + else + memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); + + return rc; +} + +static int cifs_calc_signature2(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) +{ + int i; + int rc; + + if (iov == NULL || signature == NULL || server == NULL) + return 
-EINVAL; + + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Oould not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cERROR(1, "null iovec entry"); + return -EIO; + } + /* The first entry includes a length field (which does not get + signed that occupies the first 4 bytes before the header */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base, iov[i].iov_len); + } + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + + return rc; +} + +/* must be called with server->srv_mutex held */ +int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + int rc = 0; + char smb_signature[20]; + struct smb_hdr *cifs_pdu = iov[0].iov_base; + + if ((cifs_pdu == NULL) || (server == NULL)) + return -EINVAL; + + if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + return rc; + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(server->sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + *pexpected_response_sequence_number = server->sequence_number++; + server->sequence_number++; + + rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + if (rc) + memset(cifs_pdu->Signature.SecuritySignature, 0, 8); + else + memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); + + return rc; +} + +int cifs_verify_signature(struct smb_hdr *cifs_pdu, + struct TCP_Server_Info *server, + __u32 expected_sequence_number) +{ + unsigned int rc; + char server_response_sig[8]; + char what_we_think_sig_should_be[20]; + + if (cifs_pdu == NULL || server == NULL) + return -EINVAL; + + if (!server->session_estab) + return 0; + + if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { + struct smb_com_lock_req *pSMB = + (struct smb_com_lock_req *)cifs_pdu; + if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) + return 0; + } + + /* BB what if signatures are supposed to be on for session but + server does not send one? 
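Taken together, the helpers above implement one scheme: the sender stores its send sequence number in the header's Signature field, computes MD5 over the session key followed by the SMB PDU (everything from the Protocol field onward), and transmits only the first 8 digest bytes; cifs_verify_signature() repeats the computation with the expected sequence number and compares 8 bytes. A compact userspace sketch, where md5() is an assumed one-shot helper standing in for the crypto_shash calls rather than a real API:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* assumed helper: one-shot MD5 over a flat buffer */
void md5(const void *data, size_t len, uint8_t digest[16]);

static void smb_sign(const uint8_t *session_key, size_t key_len,
		     uint8_t *pdu, size_t pdu_len,
		     uint8_t *sig_field,   /* the 8-byte Signature slot inside pdu */
		     uint32_t send_seq)
{
	uint8_t buf[key_len + pdu_len];    /* VLA, fine for a sketch */
	uint8_t digest[16];

	/* sequence number goes where the signature will later sit
	 * (cpu_to_le32 in the real code), reserved half zeroed */
	memcpy(sig_field, &send_seq, 4);
	memset(sig_field + 4, 0, 4);

	/* digest = MD5(session_key || PDU) */
	memcpy(buf, session_key, key_len);
	memcpy(buf + key_len, pdu, pdu_len);
	md5(buf, key_len + pdu_len, digest);

	memcpy(sig_field, digest, 8);      /* only 8 bytes travel on the wire */
}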
BB */ + + /* Do not need to verify session setups with signature "BSRSPYL " */ + if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) + cFYI(1, "dummy signature received for smb command 0x%x", + cifs_pdu->Command); + + /* save off the origiginal signature so we can modify the smb and check + its signature against what the server sent */ + memcpy(server_response_sig, cifs_pdu->Signature.SecuritySignature, 8); + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(expected_sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + mutex_lock(&server->srv_mutex); + rc = cifs_calculate_signature(cifs_pdu, server, + what_we_think_sig_should_be); + mutex_unlock(&server->srv_mutex); + + if (rc) + return rc; + +/* cifs_dump_mem("what we think it should be: ", + what_we_think_sig_should_be, 16); */ + + if (memcmp(server_response_sig, what_we_think_sig_should_be, 8)) + return -EACCES; + else + return 0; + +} + +/* first calculate 24 bytes ntlm response and then 16 byte session key */ +int setup_ntlm_response(struct cifs_ses *ses) +{ + int rc = 0; + unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; + char temp_key[CIFS_SESS_KEY_SIZE]; + + if (!ses) + return -EINVAL; + + ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len); + return -ENOMEM; + } + ses->auth_key.len = temp_len; + + rc = SMBNTencrypt(ses->password, ses->server->cryptkey, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) { + cFYI(1, "%s Can't generate NTLM response, error: %d", + __func__, rc); + return rc; + } + + rc = E_md4hash(ses->password, temp_key); + if (rc) { + cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + return rc; + } + + rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + if (rc) + cFYI(1, "%s Can't generate NTLM session key, error: %d", + __func__, rc); + + return rc; +} + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, + char *lnm_session_key) +{ + int i; + int rc; + char password_with_pad[CIFS_ENCPWD_SIZE]; + + memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); + if (password) + strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); + + if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { + memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); + memcpy(lnm_session_key, password_with_pad, + CIFS_ENCPWD_SIZE); + return 0; + } + + /* calculate old style session key */ + /* calling toupper is less broken than repeatedly + calling nls_toupper would be since that will never + work for UTF8, but neither handles multibyte code pages + but the only alternative would be converting to UCS-16 (Unicode) + (using a routine something like UniStrupr) then + uppercasing and then converting back from Unicode - which + would only worth doing it if we knew it were utf8. Basically + utf8 and other multibyte codepages each need their own strupper + function since a byte at a time will ont work. */ + + for (i = 0; i < CIFS_ENCPWD_SIZE; i++) + password_with_pad[i] = toupper(password_with_pad[i]); + + rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key); + + return rc; +} +#endif /* CIFS_WEAK_PW_HASH */ + +/* Build a proper attribute value/target info pairs blob. + * Fill in netbios and dns domain name and workstation name + * and client time (total five av pairs and + one end of fields indicator. + * Allocate domain name which gets freed when session struct is deallocated. 
+ */ +static int +build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + unsigned int dlen; + unsigned int wlen; + unsigned int size = 6 * sizeof(struct ntlmssp2_name); + __le64 curtime; + char *defdmname = "WORKGROUP"; + unsigned char *blobptr; + struct ntlmssp2_name *attrptr; + + if (!ses->domainName) { + ses->domainName = kstrdup(defdmname, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + } + + dlen = strlen(ses->domainName); + wlen = strlen(ses->server->hostname); + + /* The length of this blob is a size which is + * six times the size of a structure which holds name/size + + * two times the unicode length of a domain name + + * two times the unicode length of a server name + + * size of a timestamp (which is 8 bytes). + */ + ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); + if (!ses->auth_key.response) { + ses->auth_key.len = 0; + cERROR(1, "Challenge target info allocation failure"); + return -ENOMEM; + } + + blobptr = ses->auth_key.response; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); + attrptr->length = cpu_to_le16(2 * dlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); + + blobptr += 2 * dlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME); + attrptr->length = cpu_to_le16(2 * wlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); + + blobptr += 2 * wlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME); + attrptr->length = cpu_to_le16(2 * dlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); + + blobptr += 2 * dlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME); + attrptr->length = cpu_to_le16(2 * wlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); + + blobptr += 2 * wlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP); + attrptr->length = cpu_to_le16(sizeof(__le64)); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + memcpy(blobptr, &curtime, sizeof(__le64)); + + return 0; +} + +/* Server has provided av pairs/target info in the type 2 challenge + * packet and we have plucked it and stored within smb session. + * We parse that blob here to find netbios domain name to be used + * as part of ntlmv2 authentication (in Target String), if not already + * specified on the command line. + * If this function returns without any error but without fetching + * domain name, authentication may fail against some server but + * may not fail against other (those who are not very particular + * about target string i.e. for some, just user name might suffice. 
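build_avpair_blob() above serializes NTLMSSP AV_PAIR entries: a 2-byte little-endian attribute type, a 2-byte little-endian value length, then the value itself (UTF-16LE text for the name attributes, a 64-bit NT timestamp for NTLMSSP_AV_TIMESTAMP), with a type of 0 terminating the list. That is why the allocation is sized as six headers (five attributes plus the end marker) plus two UTF-16 copies of the domain name, two of the hostname, and 8 bytes of timestamp. A minimal userspace sketch of emitting one such pair, not part of the patch:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Append one AV_PAIR at buf and return the number of bytes written;
 * the caller supplies the value already encoded (e.g. UTF-16LE text). */
static size_t put_avpair(uint8_t *buf, uint16_t type,
			 const void *val, uint16_t len)
{
	buf[0] = (uint8_t)(type & 0xff);   /* attribute type, little endian */
	buf[1] = (uint8_t)(type >> 8);
	buf[2] = (uint8_t)(len & 0xff);    /* value length in bytes         */
	buf[3] = (uint8_t)(len >> 8);
	memcpy(buf + 4, val, len);
	return 4u + len;
}

find_domain_name() below walks the same layout when parsing the server's challenge, stopping at NTLMSSP_AV_EOL and pulling out the NetBIOS domain name when one was not supplied at mount time.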
+ */ +static int +find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + unsigned int attrsize; + unsigned int type; + unsigned int onesize = sizeof(struct ntlmssp2_name); + unsigned char *blobptr; + unsigned char *blobend; + struct ntlmssp2_name *attrptr; + + if (!ses->auth_key.len || !ses->auth_key.response) + return 0; + + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; + + while (blobptr + onesize < blobend) { + attrptr = (struct ntlmssp2_name *) blobptr; + type = le16_to_cpu(attrptr->type); + if (type == NTLMSSP_AV_EOL) + break; + blobptr += 2; /* advance attr type */ + attrsize = le16_to_cpu(attrptr->length); + blobptr += 2; /* advance attr size */ + if (blobptr + attrsize > blobend) + break; + if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { + if (!attrsize) + break; + if (!ses->domainName) { + ses->domainName = + kmalloc(attrsize + 1, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + cifs_from_ucs2(ses->domainName, + (__le16 *)blobptr, attrsize, attrsize, + nls_cp, false); + break; + } + } + blobptr += attrsize; /* advance attr value */ + } + + return 0; +} + +static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, + const struct nls_table *nls_cp) +{ + int rc = 0; + int len; + char nt_hash[CIFS_NTHASH_SIZE]; + wchar_t *user; + wchar_t *domain; + wchar_t *server; + + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + + /* calculate md4 hash of password */ + E_md4hash(ses->password, nt_hash); + + crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + CIFS_NTHASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n"); + return rc; + } + + /* convert ses->user_name to unicode and uppercase */ + len = strlen(ses->user_name); + user = kmalloc(2 + (len * 2), GFP_KERNEL); + if (user == NULL) { + cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); + rc = -ENOMEM; + goto calc_exit_2; + } + len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); + UniStrupr(user); + + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)user, 2 * len); + + /* convert ses->domainName to unicode and uppercase */ + if (ses->domainName) { + len = strlen(ses->domainName); + + domain = kmalloc(2 + (len * 2), GFP_KERNEL); + if (domain == NULL) { + cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); + rc = -ENOMEM; + goto calc_exit_1; + } + len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, + nls_cp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)domain, 2 * len); + kfree(domain); + } else if (ses->serverName) { + len = strlen(ses->serverName); + + server = kmalloc(2 + (len * 2), GFP_KERNEL); + if (server == NULL) { + cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); + rc = -ENOMEM; + goto calc_exit_1; + } + len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, + nls_cp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)server, 2 * len); + kfree(server); + } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2_hash); + +calc_exit_1: + kfree(user); +calc_exit_2: + return rc; +} + +static int +CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) +{ + int rc; + unsigned int offset = CIFS_SESS_KEY_SIZE + 8; + + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + + 
crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + return rc; + } + + if (ses->server->secType == RawNTLMSSP) + memcpy(ses->auth_key.response + offset, + ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + else + memcpy(ses->auth_key.response + offset, + ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + offset, ses->auth_key.len - offset); + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); + + return rc; +} + + +int +setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + int rc; + int baselen; + unsigned int tilen; + struct ntlmv2_resp *buf; + char ntlmv2_hash[16]; + unsigned char *tiblob = NULL; /* target info blob */ + + if (ses->server->secType == RawNTLMSSP) { + if (!ses->domainName) { + rc = find_domain_name(ses, nls_cp); + if (rc) { + cERROR(1, "error %d finding domain name", rc); + goto setup_ntlmv2_rsp_ret; + } + } + } else { + rc = build_avpair_blob(ses, nls_cp); + if (rc) { + cERROR(1, "error %d building av pair blob", rc); + goto setup_ntlmv2_rsp_ret; + } + } + + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + tilen = ses->auth_key.len; + tiblob = ses->auth_key.response; + + ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + rc = ENOMEM; + ses->auth_key.len = 0; + cERROR(1, "%s: Can't allocate auth blob", __func__); + goto setup_ntlmv2_rsp_ret; + } + ses->auth_key.len += baselen; + + buf = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + buf->blob_signature = cpu_to_le32(0x00000101); + buf->reserved = 0; + buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); + buf->reserved2 = 0; + + memcpy(ses->auth_key.response + baselen, tiblob, tilen); + + /* calculate ntlmv2_hash */ + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); + if (rc) { + cERROR(1, "could not get v2 hash rc %d", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* calculate first part of the client response (CR1) */ + rc = CalcNTLMv2_response(ses, ntlmv2_hash); + if (rc) { + cERROR(1, "Could not calculate CR1 rc: %d", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* now calculate the session key for NTLMv2 */ + crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init hmacmd5\n", __func__); + goto setup_ntlmv2_rsp_ret; + } + + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response); + +setup_ntlmv2_rsp_ret: + kfree(tiblob); + + return rc; +} + +int +calc_seckey(struct cifs_ses *ses) +{ + int rc; + struct crypto_blkcipher *tfm_arc4; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + + get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); + + tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_arc4)) { + rc = PTR_ERR(tfm_arc4); + cERROR(1, "could not allocate crypto API arc4\n"); + return rc; + 
} + + desc.tfm = tfm_arc4; + + crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + CIFS_SESS_KEY_SIZE); + + sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); + sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + if (rc) { + cERROR(1, "could not encrypt session key rc: %d\n", rc); + crypto_free_blkcipher(tfm_arc4); + return rc; + } + + /* make secondary_key/nonce as session key */ + memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); + /* and make len as that of session key only */ + ses->auth_key.len = CIFS_SESS_KEY_SIZE; + + crypto_free_blkcipher(tfm_arc4); + + return 0; +} + +void +cifs_crypto_shash_release(struct TCP_Server_Info *server) +{ + if (server->secmech.md5) + crypto_free_shash(server->secmech.md5); + + if (server->secmech.hmacmd5) + crypto_free_shash(server->secmech.hmacmd5); + + kfree(server->secmech.sdeschmacmd5); + + kfree(server->secmech.sdescmd5); +} + +int +cifs_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); + if (IS_ERR(server->secmech.hmacmd5)) { + cERROR(1, "could not allocate crypto hmacmd5\n"); + return PTR_ERR(server->secmech.hmacmd5); + } + + server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(server->secmech.md5)) { + cERROR(1, "could not allocate crypto md5\n"); + rc = PTR_ERR(server->secmech.md5); + goto crypto_allocate_md5_fail; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacmd5); + server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n"); + rc = -ENOMEM; + goto crypto_allocate_hmacmd5_sdesc_fail; + } + server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; + server->secmech.sdeschmacmd5->shash.flags = 0x0; + + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.md5); + server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdescmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n"); + rc = -ENOMEM; + goto crypto_allocate_md5_sdesc_fail; + } + server->secmech.sdescmd5->shash.tfm = server->secmech.md5; + server->secmech.sdescmd5->shash.flags = 0x0; + + return 0; + +crypto_allocate_md5_sdesc_fail: + kfree(server->secmech.sdeschmacmd5); + +crypto_allocate_hmacmd5_sdesc_fail: + crypto_free_shash(server->secmech.md5); + +crypto_allocate_md5_fail: + crypto_free_shash(server->secmech.hmacmd5); + + return rc; +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c new file mode 100644 index 00000000..53e7d721 --- /dev/null +++ b/fs/cifs/cifsfs.c @@ -0,0 +1,1215 @@ +/* + * fs/cifs/cifsfs.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Common Internet FileSystem (CIFS) client + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Note that BB means BUGBUG (ie something to fix eventually) */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#define DECLARE_GLOBALS_HERE +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include +#include +#include "cifs_spnego.h" +#include "fscache.h" +#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ + +int cifsFYI = 0; +int cifsERROR = 1; +int traceSMB = 0; +unsigned int oplockEnabled = 1; +unsigned int linuxExtEnabled = 1; +unsigned int lookupCacheEnabled = 1; +unsigned int multiuser_mount = 0; +unsigned int global_secflags = CIFSSEC_DEF; +/* unsigned int ntlmv2_support = 0; */ +unsigned int sign_CIFS_PDUs = 1; +static const struct super_operations cifs_super_ops; +unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; +module_param(CIFSMaxBufSize, int, 0); +MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " + "Default: 16384 Range: 8192 to 130048"); +unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; +module_param(cifs_min_rcv, int, 0); +MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " + "1 to 64"); +unsigned int cifs_min_small = 30; +module_param(cifs_min_small, int, 0); +MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " + "Range: 2 to 256"); +unsigned int cifs_max_pending = CIFS_MAX_REQ; +module_param(cifs_max_pending, int, 0); +MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " + "Default: 50 Range: 2 to 256"); +unsigned short echo_retries = 5; +module_param(echo_retries, ushort, 0644); +MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " + "reconnecting server. Default: 5. 
0 means " + "never reconnect."); +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; +extern mempool_t *cifs_mid_poolp; + +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + +static int +cifs_read_super(struct super_block *sb) +{ + struct inode *inode; + struct cifs_sb_info *cifs_sb; + int rc = 0; + + cifs_sb = CIFS_SB(sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + sb->s_flags |= MS_POSIXACL; + + if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; + + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ + sb->s_time_gran = 100; + + sb->s_magic = CIFS_MAGIC_NUMBER; + sb->s_op = &cifs_super_ops; + sb->s_bdi = &cifs_sb->bdi; + sb->s_blocksize = CIFS_MAX_MSGSIZE; + sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ + inode = cifs_root_iget(sb); + + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + inode = NULL; + goto out_no_root; + } + + sb->s_root = d_alloc_root(inode); + + if (!sb->s_root) { + rc = -ENOMEM; + goto out_no_root; + } + + /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */ + if (cifs_sb_master_tcon(cifs_sb)->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + +#ifdef CIFS_NFSD_EXPORT + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cFYI(1, "export ops supported"); + sb->s_export_op = &cifs_export_ops; + } +#endif /* CIFS_NFSD_EXPORT */ + + return 0; + +out_no_root: + cERROR(1, "cifs_read_super: get root inode failed"); + if (inode) + iput(inode); + + return rc; +} + +static void cifs_kill_sb(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + kill_anon_super(sb); + cifs_umount(cifs_sb); +} + +static int +cifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + int rc = -EOPNOTSUPP; + int xid; + + xid = GetXid(); + + buf->f_type = CIFS_MAGIC_NUMBER; + + /* + * PATH_MAX may be too long - it would presumably be total path, + * but note that some servers (including Samba 3) have a shorter + * maximum path. + * + * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO. + */ + buf->f_namelen = PATH_MAX; + buf->f_files = 0; /* undefined */ + buf->f_ffree = 0; /* unlimited */ + + /* + * We could add a second check for a QFS Unix capability bit + */ + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability))) + rc = CIFSSMBQFSPosixInfo(xid, tcon, buf); + + /* + * Only need to call the old QFSInfo if failed on newer one, + * e.g. by OS/2. 
+ **/ + if (rc && (tcon->ses->capabilities & CAP_NT_SMBS)) + rc = CIFSSMBQFSInfo(xid, tcon, buf); + + /* + * Some old Windows servers also do not support level 103, retry with + * older level one if old server failed the previous call or we + * bypassed it because we detected that this was an older LANMAN sess + */ + if (rc) + rc = SMBOldQFSInfo(xid, tcon, buf); + + FreeXid(xid); + return 0; +} + +static int cifs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { + if ((mask & MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + else + return 0; + } else /* file mode might have been restricted at mount time + on the client (above and beyond ACL on servers) for + servers which do not support setting and viewing mode bits, + so allowing client to check permissions is useful */ + return generic_permission(inode, mask, flags, NULL); +} + +static struct kmem_cache *cifs_inode_cachep; +static struct kmem_cache *cifs_req_cachep; +static struct kmem_cache *cifs_mid_cachep; +static struct kmem_cache *cifs_sm_req_cachep; +mempool_t *cifs_sm_req_poolp; +mempool_t *cifs_req_poolp; +mempool_t *cifs_mid_poolp; + +static struct inode * +cifs_alloc_inode(struct super_block *sb) +{ + struct cifsInodeInfo *cifs_inode; + cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); + if (!cifs_inode) + return NULL; + cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->time = 0; + /* Until the file is open and we have gotten oplock + info back from the server, can not assume caching of + file data or metadata */ + cifs_set_oplock_level(cifs_inode, 0); + cifs_inode->delete_pending = false; + cifs_inode->invalid_mapping = false; + cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->server_eof = 0; + cifs_inode->uniqueid = 0; + cifs_inode->createtime = 0; + + /* Can not set i_flags here - they get immediately overwritten + to zero by the VFS */ +/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;*/ + INIT_LIST_HEAD(&cifs_inode->openFileList); + return &cifs_inode->vfs_inode; +} + +static void cifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); +} + +static void +cifs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, cifs_i_callback); +} + +static void +cifs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + cifs_fscache_release_inode_cookie(inode); +} + +static void +cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) +{ + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + + seq_printf(s, ",addr="); + + switch (server->dstaddr.ss_family) { + case AF_INET: + seq_printf(s, "%pI4", &sa->sin_addr.s_addr); + break; + case AF_INET6: + seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); + if (sa6->sin6_scope_id) + seq_printf(s, "%%%u", sa6->sin6_scope_id); + break; + default: + seq_printf(s, "(unknown)"); + } +} + +static void +cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +{ + seq_printf(s, ",sec="); + + switch (server->secType) { + case LANMAN: + seq_printf(s, "lanman"); + break; + case NTLMv2: + seq_printf(s, "ntlmv2"); + break; + case NTLM: + seq_printf(s, "ntlm"); + break; + case Kerberos: + 
seq_printf(s, "krb5"); + break; + case RawNTLMSSP: + seq_printf(s, "ntlmssp"); + break; + default: + /* shouldn't ever happen */ + seq_printf(s, "unknown"); + break; + } + + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + seq_printf(s, "i"); +} + +/* + * cifs_show_options() is for displaying mount options in /proc/mounts. + * Not all settable options are displayed but most of the important + * ones are. + */ +static int +cifs_show_options(struct seq_file *s, struct vfsmount *m) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct sockaddr *srcaddr; + srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; + + cifs_show_security(s, tcon->ses->server); + + seq_printf(s, ",unc=%s", tcon->treeName); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) + seq_printf(s, ",multiuser"); + else if (tcon->ses->user_name) + seq_printf(s, ",username=%s", tcon->ses->user_name); + + if (tcon->ses->domainName) + seq_printf(s, ",domain=%s", tcon->ses->domainName); + + if (srcaddr->sa_family != AF_UNSPEC) { + struct sockaddr_in *saddr4; + struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)srcaddr; + saddr6 = (struct sockaddr_in6 *)srcaddr; + if (srcaddr->sa_family == AF_INET6) + seq_printf(s, ",srcaddr=%pI6c", + &saddr6->sin6_addr); + else if (srcaddr->sa_family == AF_INET) + seq_printf(s, ",srcaddr=%pI4", + &saddr4->sin_addr.s_addr); + else + seq_printf(s, ",srcaddr=BAD-AF:%i", + (int)(srcaddr->sa_family)); + } + + seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + seq_printf(s, ",forceuid"); + else + seq_printf(s, ",noforceuid"); + + seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + seq_printf(s, ",forcegid"); + else + seq_printf(s, ",noforcegid"); + + cifs_show_address(s, tcon->ses->server); + + if (!tcon->unix_ext) + seq_printf(s, ",file_mode=0%o,dir_mode=0%o", + cifs_sb->mnt_file_mode, + cifs_sb->mnt_dir_mode); + if (tcon->seal) + seq_printf(s, ",seal"); + if (tcon->nocase) + seq_printf(s, ",nocase"); + if (tcon->retry) + seq_printf(s, ",hard"); + if (tcon->unix_ext) + seq_printf(s, ",unix"); + else + seq_printf(s, ",nounix"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + seq_printf(s, ",posixpaths"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) + seq_printf(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + seq_printf(s, ",serverino"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + seq_printf(s, ",rwpidforward"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) + seq_printf(s, ",forcemand"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) + seq_printf(s, ",directio"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + seq_printf(s, ",nouser_xattr"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + seq_printf(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + seq_printf(s, ",sfu"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + seq_printf(s, ",nobrl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + seq_printf(s, ",cifsacl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + seq_printf(s, ",dynperm"); + if (m->mnt_sb->s_flags & MS_POSIXACL) + seq_printf(s, ",acl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + seq_printf(s, ",mfsymlinks"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + seq_printf(s, ",fsc"); + + seq_printf(s, ",rsize=%d", cifs_sb->rsize); + 
seq_printf(s, ",wsize=%d", cifs_sb->wsize); + /* convert actimeo and display it in seconds */ + seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); + + return 0; +} + +static void cifs_umount_begin(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + + if (cifs_sb == NULL) + return; + + tcon = cifs_sb_master_tcon(cifs_sb); + + spin_lock(&cifs_tcp_ses_lock); + if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { + /* we have other mounts to same share or we have + already tried to force umount this and woken up + all waiting network requests, nothing to do */ + spin_unlock(&cifs_tcp_ses_lock); + return; + } else if (tcon->tc_count == 1) + tcon->tidStatus = CifsExiting; + spin_unlock(&cifs_tcp_ses_lock); + + /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ + /* cancel_notify_requests(tcon); */ + if (tcon->ses && tcon->ses->server) { + cFYI(1, "wake up tasks now - umount begin not complete"); + wake_up_all(&tcon->ses->server->request_q); + wake_up_all(&tcon->ses->server->response_q); + msleep(1); /* yield */ + /* we have to kick the requests once more */ + wake_up_all(&tcon->ses->server->response_q); + msleep(1); + } + + return; +} + +#ifdef CONFIG_CIFS_STATS2 +static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt) +{ + /* BB FIXME */ + return 0; +} +#endif + +static int cifs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + return 0; +} + +static int cifs_drop_inode(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + /* no serverino => unconditional eviction */ + return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || + generic_drop_inode(inode); +} + +static const struct super_operations cifs_super_ops = { + .statfs = cifs_statfs, + .alloc_inode = cifs_alloc_inode, + .destroy_inode = cifs_destroy_inode, + .drop_inode = cifs_drop_inode, + .evict_inode = cifs_evict_inode, +/* .delete_inode = cifs_delete_inode, */ /* Do not need above + function unless later we add lazy close of inodes or unless the + kernel forgets to call us with the same number of releases (closes) + as opens */ + .show_options = cifs_show_options, + .umount_begin = cifs_umount_begin, + .remount_fs = cifs_remount, +#ifdef CONFIG_CIFS_STATS2 + .show_stats = cifs_show_stats, +#endif +}; + +/* + * Get root dentry from superblock according to prefix path mount option. + * Return dentry with refcount + 1 on success and NULL otherwise. 
+ */ +static struct dentry * +cifs_get_root(struct smb_vol *vol, struct super_block *sb) +{ + struct dentry *dentry; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + char *full_path = NULL; + char *s, *p; + char sep; + int xid; + + full_path = cifs_build_path_to_root(vol, cifs_sb, + cifs_sb_master_tcon(cifs_sb)); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + cFYI(1, "Get root dentry for %s", full_path); + + xid = GetXid(); + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(sb->s_root); + p = s = full_path; + + do { + struct inode *dir = dentry->d_inode; + struct dentry *child; + + if (!dir) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + mutex_lock(&dir->i_mutex); + child = lookup_one_len(p, dentry, s - p); + mutex_unlock(&dir->i_mutex); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + _FreeXid(xid); + kfree(full_path); + return dentry; +} + +static int cifs_set_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = data; + sb->s_fs_info = mnt_data->cifs_sb; + return set_anon_super(sb, NULL); +} + +static struct dentry * +cifs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int rc; + struct super_block *sb; + struct cifs_sb_info *cifs_sb; + struct smb_vol *volume_info; + struct cifs_mnt_data mnt_data; + struct dentry *root; + + cFYI(1, "Devname: %s flags: %d ", dev_name, flags); + + volume_info = cifs_get_volume_info((char *)data, dev_name); + if (IS_ERR(volume_info)) + return ERR_CAST(volume_info); + + cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); + if (cifs_sb == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_nls; + } + + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; + } + + cifs_setup_cifs_sb(volume_info, cifs_sb); + + rc = cifs_mount(cifs_sb, volume_info); + if (rc) { + if (!(flags & MS_SILENT)) + cERROR(1, "cifs_mount failed w/return code = %d", rc); + root = ERR_PTR(rc); + goto out_mountdata; + } + + mnt_data.vol = volume_info; + mnt_data.cifs_sb = cifs_sb; + mnt_data.flags = flags; + + sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); + if (IS_ERR(sb)) { + root = ERR_CAST(sb); + cifs_umount(cifs_sb); + goto out; + } + + if (sb->s_root) { + cFYI(1, "Use existing superblock"); + cifs_umount(cifs_sb); + } else { + sb->s_flags = flags; + /* BB should we make this contingent on mount parm? 
*/ + sb->s_flags |= MS_NODIRATIME | MS_NOATIME; + + rc = cifs_read_super(sb); + if (rc) { + root = ERR_PTR(rc); + goto out_super; + } + + sb->s_flags |= MS_ACTIVE; + } + + root = cifs_get_root(volume_info, sb); + if (IS_ERR(root)) + goto out_super; + + cFYI(1, "dentry root is: %p", root); + goto out; + +out_super: + deactivate_locked_super(sb); +out: + cifs_cleanup_volume_info(volume_info); + return root; + +out_mountdata: + kfree(cifs_sb->mountdata); +out_cifs_sb: + kfree(cifs_sb); +out_nls: + unload_nls(volume_info->local_nls); + goto out; +} + +static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + ssize_t written; + int rc; + + written = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (CIFS_I(inode)->clientCanCacheAll) + return written; + + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); + + return written; +} + +static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) +{ + /* origin == SEEK_END => we must revalidate the cached file length */ + if (origin == SEEK_END) { + int rc; + struct inode *inode = file->f_path.dentry->d_inode; + + /* + * We need to be sure that all dirty pages are written and the + * server has the newest file length. + */ + if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + /* + * Some applications poll for the file length in this strange + * way so we must seek to end on non-oplocked files by + * setting the revalidate time to zero. + */ + CIFS_I(inode)->time = 0; + + rc = cifs_revalidate_file_attr(file); + if (rc < 0) + return (loff_t)rc; + } + return generic_file_llseek_unlocked(file, offset, origin); +} + +static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) +{ + /* note that this is called by vfs setlease with lock_flocks held + to protect *lease from going away */ + struct inode *inode = file->f_path.dentry->d_inode; + struct cifsFileInfo *cfile = file->private_data; + + if (!(S_ISREG(inode->i_mode))) + return -EINVAL; + + /* check if file is oplocked */ + if (((arg == F_RDLCK) && + (CIFS_I(inode)->clientCanCacheRead)) || + ((arg == F_WRLCK) && + (CIFS_I(inode)->clientCanCacheAll))) + return generic_setlease(file, arg, lease); + else if (tlink_tcon(cfile->tlink)->local_lease && + !CIFS_I(inode)->clientCanCacheRead) + /* If the server claims to support oplock on this + file, then we still need to check oplock even + if the local_lease mount option is set, but there + are servers which do not support oplock for which + this mount option may be useful if the user + knows that the file won't be changed on the server + by anyone else */ + return generic_setlease(file, arg, lease); + else + return -EAGAIN; +} + +struct file_system_type cifs_fs_type = { + .owner = THIS_MODULE, + .name = "cifs", + .mount = cifs_do_mount, + .kill_sb = cifs_kill_sb, + /* .fs_flags */ +}; +const struct inode_operations cifs_dir_inode_ops = { + .create = cifs_create, + .lookup = cifs_lookup, + .getattr = cifs_getattr, + .unlink = cifs_unlink, + .link = cifs_hardlink, + .mkdir = cifs_mkdir, + .rmdir = cifs_rmdir, + .rename = cifs_rename, + .permission = cifs_permission, +/* revalidate:cifs_revalidate, */ + .setattr = cifs_setattr, + .symlink = cifs_symlink, + .mknod = 
cifs_mknod, +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct inode_operations cifs_file_inode_ops = { +/* revalidate:cifs_revalidate, */ + .setattr = cifs_setattr, + .getattr = cifs_getattr, /* do we need this anymore? */ + .rename = cifs_rename, + .permission = cifs_permission, +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct inode_operations cifs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = cifs_follow_link, + .put_link = cifs_put_link, + .permission = cifs_permission, + /* BB add the following two eventually */ + /* revalidate: cifs_revalidate, + setattr: cifs_notify_change, *//* BB do we need notify change */ +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct file_operations cifs_file_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_strict_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_strict_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_direct_ops = { + /* BB reevaluate whether they can be done with directio, no cache */ + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_user_readv, + .aio_write = cifs_user_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .llseek = cifs_llseek, + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_nobrl_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_strict_nobrl_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_strict_writev, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef 
CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_direct_nobrl_ops = { + /* BB reevaluate whether they can be done with directio, no cache */ + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_user_readv, + .aio_write = cifs_user_writev, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .llseek = cifs_llseek, + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_dir_ops = { + .readdir = cifs_readdir, + .release = cifs_closedir, + .read = generic_read_dir, + .unlocked_ioctl = cifs_ioctl, + .llseek = generic_file_llseek, +}; + +static void +cifs_init_once(void *inode) +{ + struct cifsInodeInfo *cifsi = inode; + + inode_init_once(&cifsi->vfs_inode); + INIT_LIST_HEAD(&cifsi->lockList); +} + +static int +cifs_init_inodecache(void) +{ + cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", + sizeof(struct cifsInodeInfo), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + cifs_init_once); + if (cifs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void +cifs_destroy_inodecache(void) +{ + kmem_cache_destroy(cifs_inode_cachep); +} + +static int +cifs_init_request_bufs(void) +{ + if (CIFSMaxBufSize < 8192) { + /* Buffer size can not be smaller than 2 * PATH_MAX since maximum + Unicode path name has to fit in any SMB/CIFS path based frames */ + CIFSMaxBufSize = 8192; + } else if (CIFSMaxBufSize > 1024*127) { + CIFSMaxBufSize = 1024 * 127; + } else { + CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ + } +/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */ + cifs_req_cachep = kmem_cache_create("cifs_request", + CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (cifs_req_cachep == NULL) + return -ENOMEM; + + if (cifs_min_rcv < 1) + cifs_min_rcv = 1; + else if (cifs_min_rcv > 64) { + cifs_min_rcv = 64; + cERROR(1, "cifs_min_rcv set to maximum (64)"); + } + + cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, + cifs_req_cachep); + + if (cifs_req_poolp == NULL) { + kmem_cache_destroy(cifs_req_cachep); + return -ENOMEM; + } + /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and + almost all handle based requests (but not write response, nor is it + sufficient for path based requests). 
A smaller size would have + been more efficient (compacting multiple slab items on one 4k page) + for the case in which debug was on, but this larger size allows + more SMBs to use small buffer alloc and is still much more + efficient to alloc 1 per page off the slab compared to 17K (5page) + alloc of large cifs buffers even when page debugging is on */ + cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", + MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, + NULL); + if (cifs_sm_req_cachep == NULL) { + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + return -ENOMEM; + } + + if (cifs_min_small < 2) + cifs_min_small = 2; + else if (cifs_min_small > 256) { + cifs_min_small = 256; + cFYI(1, "cifs_min_small set to maximum (256)"); + } + + cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, + cifs_sm_req_cachep); + + if (cifs_sm_req_poolp == NULL) { + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + kmem_cache_destroy(cifs_sm_req_cachep); + return -ENOMEM; + } + + return 0; +} + +static void +cifs_destroy_request_bufs(void) +{ + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + mempool_destroy(cifs_sm_req_poolp); + kmem_cache_destroy(cifs_sm_req_cachep); +} + +static int +cifs_init_mids(void) +{ + cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", + sizeof(struct mid_q_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (cifs_mid_cachep == NULL) + return -ENOMEM; + + /* 3 is a reasonable minimum number of simultaneous operations */ + cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); + if (cifs_mid_poolp == NULL) { + kmem_cache_destroy(cifs_mid_cachep); + return -ENOMEM; + } + + return 0; +} + +static void +cifs_destroy_mids(void) +{ + mempool_destroy(cifs_mid_poolp); + kmem_cache_destroy(cifs_mid_cachep); +} + +static int __init +init_cifs(void) +{ + int rc = 0; + cifs_proc_init(); + INIT_LIST_HEAD(&cifs_tcp_ses_list); +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ + INIT_LIST_HEAD(&GlobalDnotifyReqList); + INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ +/* + * Initialize Global counters + */ + atomic_set(&sesInfoAllocCount, 0); + atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesAllocCount, 0); + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + + atomic_set(&bufAllocCount, 0); + atomic_set(&smBufAllocCount, 0); +#ifdef CONFIG_CIFS_STATS2 + atomic_set(&totBufAllocCount, 0); + atomic_set(&totSmBufAllocCount, 0); +#endif /* CONFIG_CIFS_STATS2 */ + + atomic_set(&midCount, 0); + GlobalCurrentXid = 0; + GlobalTotalActiveXid = 0; + GlobalMaxActiveXid = 0; + spin_lock_init(&cifs_tcp_ses_lock); + spin_lock_init(&cifs_file_list_lock); + spin_lock_init(&GlobalMid_Lock); + + if (cifs_max_pending < 2) { + cifs_max_pending = 2; + cFYI(1, "cifs_max_pending set to min of 2"); + } else if (cifs_max_pending > 256) { + cifs_max_pending = 256; + cFYI(1, "cifs_max_pending set to max of 256"); + } + + rc = cifs_fscache_register(); + if (rc) + goto out_clean_proc; + + rc = cifs_init_inodecache(); + if (rc) + goto out_unreg_fscache; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + +#ifdef CONFIG_CIFS_UPCALL + rc = register_key_type(&cifs_spnego_key_type); + if (rc) + goto out_destroy_request_bufs; +#endif /* CONFIG_CIFS_UPCALL */ + +#ifdef CONFIG_CIFS_ACL + rc = init_cifs_idmap(); + if (rc) + goto 
out_register_key_type; +#endif /* CONFIG_CIFS_ACL */ + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_init_cifs_idmap; + + return 0; + +out_init_cifs_idmap: +#ifdef CONFIG_CIFS_ACL + exit_cifs_idmap(); +out_register_key_type: +#endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); +out_destroy_request_bufs: +#endif + cifs_destroy_request_bufs(); +out_destroy_mids: + cifs_destroy_mids(); +out_destroy_inodecache: + cifs_destroy_inodecache(); +out_unreg_fscache: + cifs_fscache_unregister(); +out_clean_proc: + cifs_proc_clean(); + return rc; +} + +static void __exit +exit_cifs(void) +{ + cFYI(DBG2, "exit_cifs"); + cifs_proc_clean(); + cifs_fscache_unregister(); +#ifdef CONFIG_CIFS_DFS_UPCALL + cifs_dfs_release_automount_timer(); +#endif +#ifdef CONFIG_CIFS_ACL + cifs_destroy_idmaptrees(); + exit_cifs_idmap(); +#endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); +#endif + unregister_filesystem(&cifs_fs_type); + cifs_destroy_inodecache(); + cifs_destroy_mids(); + cifs_destroy_request_bufs(); +} + +MODULE_AUTHOR("Steve French "); +MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ +MODULE_DESCRIPTION + ("VFS to access servers complying with the SNIA CIFS Specification " + "e.g. Samba and Windows"); +MODULE_VERSION(CIFS_VERSION); +module_init(init_cifs) +module_exit(exit_cifs) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h new file mode 100644 index 00000000..036ca83e --- /dev/null +++ b/fs/cifs/cifsfs.h @@ -0,0 +1,133 @@ +/* + * fs/cifs/cifsfs.h + * + * Copyright (c) International Business Machines Corp., 2002, 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSFS_H +#define _CIFSFS_H + +#define ROOT_I 2 + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
+ */ +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + ino_t ino = (ino_t) fileid; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; + return ino; +} + +extern struct file_system_type cifs_fs_type; +extern const struct address_space_operations cifs_addr_ops; +extern const struct address_space_operations cifs_addr_ops_smallbuf; + +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + +/* Functions related to inodes */ +extern const struct inode_operations cifs_dir_inode_ops; +extern struct inode *cifs_root_iget(struct super_block *); +extern int cifs_create(struct inode *, struct dentry *, int, + struct nameidata *); +extern struct dentry *cifs_lookup(struct inode *, struct dentry *, + struct nameidata *); +extern int cifs_unlink(struct inode *dir, struct dentry *dentry); +extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); +extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); +extern int cifs_mkdir(struct inode *, struct dentry *, int); +extern int cifs_rmdir(struct inode *, struct dentry *); +extern int cifs_rename(struct inode *, struct dentry *, struct inode *, + struct dentry *); +extern int cifs_revalidate_file_attr(struct file *filp); +extern int cifs_revalidate_dentry_attr(struct dentry *); +extern int cifs_revalidate_file(struct file *filp); +extern int cifs_revalidate_dentry(struct dentry *); +extern int cifs_invalidate_mapping(struct inode *inode); +extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int cifs_setattr(struct dentry *, struct iattr *); + +extern const struct inode_operations cifs_file_inode_ops; +extern const struct inode_operations cifs_symlink_inode_ops; +extern const struct inode_operations cifs_dfs_referral_inode_operations; + + +/* Functions related to files and directories */ +extern const struct file_operations cifs_file_ops; +extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ +extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ +extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_direct_nobrl_ops; +extern const struct file_operations cifs_file_strict_nobrl_ops; +extern int cifs_open(struct inode *inode, struct file *file); +extern int cifs_close(struct inode *inode, struct file *file); +extern int cifs_closedir(struct inode *inode, struct file *file); +extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern int cifs_lock(struct file *, int, struct file_lock *); +extern int cifs_fsync(struct file *, int); +extern int cifs_strict_fsync(struct file *, int); +extern int cifs_flush(struct file *, fl_owner_t id); +extern int cifs_file_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern const struct file_operations cifs_dir_ops; +extern int cifs_dir_open(struct inode *inode, struct file *file); +extern int cifs_readdir(struct file 
*file, void *direntry, filldir_t filldir); + +/* Functions related to dir entries */ +extern const struct dentry_operations cifs_dentry_ops; +extern const struct dentry_operations cifs_ci_dentry_ops; + +#ifdef CONFIG_CIFS_DFS_UPCALL +extern struct vfsmount *cifs_dfs_d_automount(struct path *path); +#else +#define cifs_dfs_d_automount NULL +#endif + +/* Functions related to symlinks */ +extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern void cifs_put_link(struct dentry *direntry, + struct nameidata *nd, void *); +extern int cifs_readlink(struct dentry *direntry, char __user *buffer, + int buflen); +extern int cifs_symlink(struct inode *inode, struct dentry *direntry, + const char *symname); +extern int cifs_removexattr(struct dentry *, const char *); +extern int cifs_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); +extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +#ifdef CIFS_NFSD_EXPORT +extern const struct export_operations cifs_export_ops; +#endif /* CIFS_NFSD_EXPORT */ + +#define CIFS_VERSION "1.74" +#endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h new file mode 100644 index 00000000..7cb9dd22 --- /dev/null +++ b/fs/cifs/cifsglob.h @@ -0,0 +1,951 @@ +/* + * fs/cifs/cifsglob.h + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#ifndef _CIFS_GLOB_H +#define _CIFS_GLOB_H + +#include +#include +#include +#include +#include "cifs_fs_sb.h" +#include "cifsacl.h" +#include +#include + +/* + * The sizes of various internal tables and strings + */ +#define MAX_UID_INFO 16 +#define MAX_SES_INFO 2 +#define MAX_TCON_INFO 4 + +#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) +#define MAX_SERVER_SIZE 15 +#define MAX_SHARE_SIZE 80 +#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ +#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ + +#define CIFS_MIN_RCV_POOL 4 + +#define MAX_REOPEN_ATT 5 /* these many maximum attempts to reopen a file */ +/* + * default attribute cache timeout (jiffies) + */ +#define CIFS_DEF_ACTIMEO (1 * HZ) + +/* + * max attribute cache timeout (jiffies) - 2^30 + */ +#define CIFS_MAX_ACTIMEO (1 << 30) + +/* + * MAX_REQ is the maximum number of requests that WE will send + * on one socket concurrently. It also matches the most common + * value of max multiplex returned by servers. We may + * eventually want to use the negotiated value (in case + * future servers can handle more) when we are more confident that + * we will not have problems overloading the socket with pending + * write data. 
+ */ +#define CIFS_MAX_REQ 50 + +#define RFC1001_NAME_LEN 15 +#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) + +/* currently length of NIP6_FMT */ +#define SERVER_NAME_LENGTH 40 +#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) + +/* used to define string lengths for reversing unicode strings */ +/* (256+1)*2 = 514 */ +/* (max path length + 1 for null) * 2 for unicode */ +#define MAX_NAME 514 + +#include "cifspdu.h" + +#ifndef XATTR_DOS_ATTRIB +#define XATTR_DOS_ATTRIB "user.DOSATTRIB" +#endif + +/* + * CIFS vfs client Status information (based on what we know.) + */ + +/* associated with each tcp and smb session */ +enum statusEnum { + CifsNew = 0, + CifsGood, + CifsExiting, + CifsNeedReconnect, + CifsNeedNegotiate +}; + +enum securityEnum { + LANMAN = 0, /* Legacy LANMAN auth */ + NTLM, /* Legacy NTLM012 auth with NTLM hash */ + NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ + RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ +/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ + Kerberos, /* Kerberos via SPNEGO */ +}; + +enum protocolEnum { + TCP = 0, + SCTP + /* Netbios frames protocol not supported at this time */ +}; + +struct session_key { + unsigned int len; + char *response; +}; + +/* crypto security descriptor definition */ +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +/* crypto hashing related structure/fields, not specific to a sec mech */ +struct cifs_secmech { + struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ + struct crypto_shash *md5; /* md5 hash function */ + struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ + struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ +}; + +/* per smb session structure/fields */ +struct ntlmssp_auth { + __u32 client_flags; /* sent by client in type 1 ntlmssp exchange */ + __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */ +}; + +struct cifs_cred { + int uid; + int gid; + int mode; + int cecount; + struct cifs_sid osid; + struct cifs_sid gsid; + struct cifs_ntace *ntaces; + struct cifs_ace *aces; +}; + +/* + ***************************************************************** + * Except the CIFS PDUs themselves all the + * globally interesting structs should go here + ***************************************************************** + */ + +struct smb_vol { + char *username; + char *password; + char *domainname; + char *UNC; + char *UNCip; + char *iocharset; /* local code page for mapping to and from Unicode */ + char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ + char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ + uid_t cred_uid; + uid_t linux_uid; + gid_t linux_gid; + mode_t file_mode; + mode_t dir_mode; + unsigned secFlg; + bool retry:1; + bool intr:1; + bool setuids:1; + bool override_uid:1; + bool override_gid:1; + bool dynperm:1; + bool noperm:1; + bool no_psx_acl:1; /* set if posix acl support should be disabled */ + bool cifs_acl:1; + bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ + bool server_ino:1; /* use inode numbers from server ie UniqueId */ + bool direct_io:1; + bool strict_io:1; /* strict cache behavior */ + bool remap:1; /* set to remap seven reserved chars in filenames */ + bool posix_paths:1; /* unset to not ask for posix pathnames. 
*/ + bool no_linux_ext:1; + bool sfu_emul:1; + bool nullauth:1; /* attempt to authenticate with null user */ + bool nocase:1; /* request case insensitive filenames */ + bool nobrl:1; /* disable sending byte range locks to srv */ + bool mand_lock:1; /* send mandatory not posix byte range lock reqs */ + bool seal:1; /* request transport encryption on share */ + bool nodfs:1; /* Do not request DFS, even if available */ + bool local_lease:1; /* check leases only on local system, not remote */ + bool noblocksnd:1; + bool noautotune:1; + bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool fsc:1; /* enable fscache */ + bool mfsymlinks:1; /* use Minshall+French Symlinks */ + bool multiuser:1; + bool rwpidforward:1; /* pid forward for read/write operations */ + unsigned int rsize; + unsigned int wsize; + bool sockopt_tcp_nodelay:1; + unsigned short int port; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ + char *prepath; + struct sockaddr_storage srcaddr; /* allow binding to a local IP */ + struct nls_table *local_nls; +}; + +#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ + CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \ + CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \ + CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \ + CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \ + CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ + CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ + CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + +#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ + MS_NODEV | MS_SYNCHRONOUS) + +struct cifs_mnt_data { + struct cifs_sb_info *cifs_sb; + struct smb_vol *vol; + int flags; +}; + +struct TCP_Server_Info { + struct list_head tcp_ses_list; + struct list_head smb_ses_list; + int srv_count; /* reference counter */ + /* 15 character server name + 0x20 16th byte indicating type = srv */ + char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + enum statusEnum tcpStatus; /* what we think the status is */ + char *hostname; /* hostname portion of UNC string */ + struct socket *ssocket; + struct sockaddr_storage dstaddr; + struct sockaddr_storage srcaddr; /* locally bind to this IP */ +#ifdef CONFIG_NET_NS + struct net *net; +#endif + wait_queue_head_t response_q; + wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ + struct list_head pending_mid_q; + bool noblocksnd; /* use blocking sendmsg */ + bool noautotune; /* do not autotune send buf sizes */ + bool tcp_nodelay; + atomic_t inFlight; /* number of requests on the wire to server */ + struct mutex srv_mutex; + struct task_struct *tsk; + char server_GUID[16]; + char sec_mode; + bool session_estab; /* mark when very first sess is established */ + u16 dialect; /* dialect index that server chose */ + enum securityEnum secType; + unsigned int maxReq; /* Clients should submit no more */ + /* than maxReq distinct unanswered SMBs to the server when using */ + /* multiplexed reads or writes */ + unsigned int maxBuf; /* maxBuf specifies the maximum */ + /* message size the server can send or receive for non-raw SMBs */ + /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ + /* when socket is setup (and during reconnect) before NegProt sent */ + unsigned int max_rw; /* maxRw specifies the maximum */ + /* message size the server can send or receive for */ + /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. 
*/ + unsigned int max_vcs; /* maximum number of smb sessions, at least + those that can be specified uniquely with + vcnumbers */ + int capabilities; /* allow selective disabling of caps by smb sess */ + int timeAdj; /* Adjust for difference in server time zone in sec */ + __u16 CurrentMid; /* multiplex id - rotating counter */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ + /* 16th byte of RFC1001 workstation name is always null */ + char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + __u32 sequence_number; /* for signing, protected by srv_mutex */ + struct session_key session_key; + unsigned long lstrp; /* when we got last response from this server */ + struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ + /* extended security flavors that server supports */ + bool sec_ntlmssp; /* supports NTLMSSP */ + bool sec_kerberosu2u; /* supports U2U Kerberos */ + bool sec_kerberos; /* supports plain Kerberos */ + bool sec_mskerberos; /* supports legacy MS Kerberos */ + struct delayed_work echo; /* echo ping workqueue job */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif +#ifdef CONFIG_CIFS_STATS2 + atomic_t inSend; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ +#endif +}; + +/* + * Macros to allow the TCP_Server_Info->net field and related code to drop out + * when CONFIG_NET_NS isn't set. + */ + +#ifdef CONFIG_NET_NS + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return srv->net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ + srv->net = net; +} + +#else + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return &init_net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ +} + +#endif + +/* + * Session structure. One of these for each uid session with a particular host + */ +struct cifs_ses { + struct list_head smb_ses_list; + struct list_head tcon_list; + struct mutex session_mutex; + struct TCP_Server_Info *server; /* pointer to server info */ + int ses_count; /* reference counter */ + enum statusEnum status; + unsigned overrideSecFlg; /* if non-zero override global sec flags */ + __u16 ipc_tid; /* special tid for connection to IPC share */ + __u16 flags; + __u16 vcnum; + char *serverOS; /* name of operating system underlying server */ + char *serverNOS; /* name of network operating system of server */ + char *serverDomain; /* security realm of server */ + int Suid; /* remote smb uid */ + uid_t linux_uid; /* overriding owner of files on the mount */ + uid_t cred_uid; /* owner of credentials */ + int capabilities; + char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for + TCP names - will ipv6 and sctp addresses fit? */ + char *user_name; /* must not be null except during init of sess + and after mount option parsing we fill it */ + char *domainName; + char *password; + struct session_key auth_key; + struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ + bool need_reconnect:1; /* connection reset, uid now invalid */ +}; +/* no more than one of the following three session flags may be set */ +#define CIFS_SES_NT4 1 +#define CIFS_SES_OS2 2 +#define CIFS_SES_W9X 4 +/* following flag is set for old servers such as OS2 (and Win95?) 
+ which do not negotiate NTLM or POSIX dialects, but instead + negotiate one of the older LANMAN dialects */ +#define CIFS_SES_LANMAN 8 +/* + * there is one of these for each connection to a resource on a particular + * session + */ +struct cifs_tcon { + struct list_head tcon_list; + int tc_count; + struct list_head openFileList; + struct cifs_ses *ses; /* pointer to session associated with */ + char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char *nativeFileSystem; + char *password; /* for share-level security */ + __u16 tid; /* The 2 byte tree id */ + __u16 Flags; /* optional support bits */ + enum statusEnum tidStatus; +#ifdef CONFIG_CIFS_STATS + atomic_t num_smbs_sent; + atomic_t num_writes; + atomic_t num_reads; + atomic_t num_flushes; + atomic_t num_oplock_brks; + atomic_t num_opens; + atomic_t num_closes; + atomic_t num_deletes; + atomic_t num_mkdirs; + atomic_t num_posixopens; + atomic_t num_posixmkdirs; + atomic_t num_rmdirs; + atomic_t num_renames; + atomic_t num_t2renames; + atomic_t num_ffirst; + atomic_t num_fnext; + atomic_t num_fclose; + atomic_t num_hardlinks; + atomic_t num_symlinks; + atomic_t num_locks; + atomic_t num_acl_get; + atomic_t num_acl_set; +#ifdef CONFIG_CIFS_STATS2 + unsigned long long time_writes; + unsigned long long time_reads; + unsigned long long time_opens; + unsigned long long time_deletes; + unsigned long long time_closes; + unsigned long long time_mkdirs; + unsigned long long time_rmdirs; + unsigned long long time_renames; + unsigned long long time_t2renames; + unsigned long long time_ffirst; + unsigned long long time_fnext; + unsigned long long time_fclose; +#endif /* CONFIG_CIFS_STATS2 */ + __u64 bytes_read; + __u64 bytes_written; + spinlock_t stat_lock; +#endif /* CONFIG_CIFS_STATS */ + FILE_SYSTEM_DEVICE_INFO fsDevInfo; + FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ + FILE_SYSTEM_UNIX_INFO fsUnixInfo; + bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ + bool retry:1; + bool nocase:1; + bool seal:1; /* transport encryption for this mounted share */ + bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol + for this mount even if server would support */ + bool local_lease:1; /* check leases (only) on local system not remote */ + bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ + bool need_reconnect:1; /* connection reset, tid now invalid */ +#ifdef CONFIG_CIFS_FSCACHE + u64 resource_id; /* server resource id */ + struct fscache_cookie *fscache; /* cookie for share */ +#endif + /* BB add field for back pointer to sb struct(s)? */ +}; + +/* + * This is a refcounted and timestamped container for a tcon pointer. The + * container holds a tcon reference. It is considered safe to free one of + * these when the tl_count goes to 0. The tl_time is the time of the last + * "get" on the container. 
+ */ +struct tcon_link { + struct rb_node tl_rbnode; + uid_t tl_uid; + unsigned long tl_flags; +#define TCON_LINK_MASTER 0 +#define TCON_LINK_PENDING 1 +#define TCON_LINK_IN_TREE 2 + unsigned long tl_time; + atomic_t tl_count; + struct cifs_tcon *tl_tcon; +}; + +extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb); + +static inline struct cifs_tcon * +tlink_tcon(struct tcon_link *tlink) +{ + return tlink->tl_tcon; +} + +extern void cifs_put_tlink(struct tcon_link *tlink); + +static inline struct tcon_link * +cifs_get_tlink(struct tcon_link *tlink) +{ + if (tlink && !IS_ERR(tlink)) + atomic_inc(&tlink->tl_count); + return tlink; +} + +/* This function is always expected to succeed */ +extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); + +/* + * This info hangs off the cifsFileInfo structure, pointed to by llist. + * This is used to track byte stream locks on the file + */ +struct cifsLockInfo { + struct list_head llist; /* pointer to next cifsLockInfo */ + __u64 offset; + __u64 length; + __u8 type; +}; + +/* + * One of these for each open instance of a file + */ +struct cifs_search_info { + loff_t index_of_last_entry; + __u16 entries_in_buffer; + __u16 info_level; + __u32 resume_key; + char *ntwrk_buf_start; + char *srch_entries_start; + char *last_entry; + char *presume_name; + unsigned int resume_name_len; + bool endOfSearch:1; + bool emptyDir:1; + bool unicode:1; + bool smallBuf:1; /* so we know which buf_release function to call */ +}; + +struct cifsFileInfo { + struct list_head tlist; /* pointer to next fid owned by tcon */ + struct list_head flist; /* next fid (file instance) for this inode */ + unsigned int uid; /* allows finding which FileInfo structure */ + __u32 pid; /* process id who opened file */ + __u16 netfid; /* file id from remote */ + /* BB add lock scope info here if needed */ ; + /* lock scope id (0 if none) */ + struct dentry *dentry; + unsigned int f_flags; + struct tcon_link *tlink; + struct mutex lock_mutex; + struct list_head llist; /* list of byte range locks we have. */ + bool invalidHandle:1; /* file closed via session abend */ + bool oplock_break_cancelled:1; + int count; /* refcount protected by cifs_file_list_lock */ + struct mutex fh_mutex; /* prevents reopen race after dead ses*/ + struct cifs_search_info srch_inf; + struct work_struct oplock_break; /* work for oplock breaks */ +}; + +struct cifs_io_parms { + __u16 netfid; + __u32 pid; + __u64 offset; + unsigned int length; + struct cifs_tcon *tcon; +}; + +/* + * Take a reference on the file private data. Must be called with + * cifs_file_list_lock held. + */ +static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + ++cifs_file->count; +} + +void cifsFileInfo_put(struct cifsFileInfo *cifs_file); + +/* + * One of these for each file inode + */ + +struct cifsInodeInfo { + struct list_head lockList; + /* BB add in lists for dirty pages i.e. write caching info for oplock */ + struct list_head openFileList; + __u32 cifsAttrs; /* e.g. 
DOS archive bit, sparse, compressed, system */ + bool clientCanCacheRead; /* read oplock */ + bool clientCanCacheAll; /* read and writebehind oplock */ + bool delete_pending; /* DELETE_ON_CLOSE is set */ + bool invalid_mapping; /* pagecache is invalid */ + unsigned long time; /* jiffies of last update of inode */ + u64 server_eof; /* current file size on server */ + u64 uniqueid; /* server inode number */ + u64 createtime; /* creation time on server */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; +#endif + struct inode vfs_inode; +}; + +static inline struct cifsInodeInfo * +CIFS_I(struct inode *inode) +{ + return container_of(inode, struct cifsInodeInfo, vfs_inode); +} + +static inline struct cifs_sb_info * +CIFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + return '/'; + else + return '\\'; +} + +static inline void +convert_delimiter(char *path, char delim) +{ + int i; + char old_delim; + + if (path == NULL) + return; + + if (delim == '/') + old_delim = '\\'; + else + old_delim = '/'; + + for (i = 0; path[i] != '\0'; i++) { + if (path[i] == old_delim) + path[i] = delim; + } +} + +#ifdef CONFIG_CIFS_STATS +#define cifs_stats_inc atomic_inc + +static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon, + unsigned int bytes) +{ + if (bytes) { + spin_lock(&tcon->stat_lock); + tcon->bytes_written += bytes; + spin_unlock(&tcon->stat_lock); + } +} + +static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, + unsigned int bytes) +{ + spin_lock(&tcon->stat_lock); + tcon->bytes_read += bytes; + spin_unlock(&tcon->stat_lock); +} +#else + +#define cifs_stats_inc(field) do {} while (0) +#define cifs_stats_bytes_written(tcon, bytes) do {} while (0) +#define cifs_stats_bytes_read(tcon, bytes) do {} while (0) + +#endif + +struct mid_q_entry; + +/* + * This is the prototype for the mid callback function. When creating one, + * take special care to avoid deadlocks. 
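A small usage sketch of CIFS_DIR_SEP() and convert_delimiter() may help: the POSIX-paths mount flag picks the separator, and convert_delimiter() rewrites a path copy in place. example_fix_path is a made-up name, and the sketch assumes the usual kstrdup()/GFP_KERNEL slab helpers are available.

static char *example_fix_path(struct cifs_sb_info *cifs_sb, const char *path)
{
	char *copy = kstrdup(path, GFP_KERNEL);

	if (copy)
		convert_delimiter(copy, CIFS_DIR_SEP(cifs_sb));
	return copy;	/* '/'-separated on POSIX-path mounts, '\\' otherwise */
}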
Things to bear in mind: + * + * - it will be called by cifsd, with no locks held + * - the mid will be removed from any lists + */ +typedef void (mid_callback_t)(struct mid_q_entry *mid); + +/* one of these for every pending CIFS request to the server */ +struct mid_q_entry { + struct list_head qhead; /* mids waiting on reply from this server */ + __u16 mid; /* multiplex id */ + __u16 pid; /* process id */ + __u32 sequence_number; /* for CIFS signing */ + unsigned long when_alloc; /* when mid was created */ +#ifdef CONFIG_CIFS_STATS2 + unsigned long when_sent; /* time when smb send finished */ + unsigned long when_received; /* when demux complete (taken off wire) */ +#endif + mid_callback_t *callback; /* call completion callback */ + void *callback_data; /* general purpose pointer for callback */ + struct smb_hdr *resp_buf; /* response buffer */ + int midState; /* wish this were enum but can not pass to wait_event */ + __u8 command; /* smb command code */ + bool largeBuf:1; /* if valid response, is pointer to large buf */ + bool multiRsp:1; /* multiple trans2 responses for one request */ + bool multiEnd:1; /* both received */ +}; + +struct oplock_q_entry { + struct list_head qhead; + struct inode *pinode; + struct cifs_tcon *tcon; + __u16 netfid; +}; + +/* for pending dnotify requests */ +struct dir_notify_req { + struct list_head lhead; + __le16 Pid; + __le16 PidHigh; + __u16 Mid; + __u16 Tid; + __u16 Uid; + __u16 netfid; + __u32 filter; /* CompletionFilter (for multishot) */ + int multishot; + struct file *pfile; +}; + +struct dfs_info3_param { + int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/ + int path_consumed; + int server_type; + int ref_flag; + char *path_name; + char *node_name; +}; + +/* + * common struct for holding inode info when searching for or updating an + * inode with new info + */ + +#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_DELETE_PENDING 0x2 +#define CIFS_FATTR_NEED_REVAL 0x4 +#define CIFS_FATTR_INO_COLLISION 0x8 + +struct cifs_fattr { + u32 cf_flags; + u32 cf_cifsattrs; + u64 cf_uniqueid; + u64 cf_eof; + u64 cf_bytes; + u64 cf_createtime; + uid_t cf_uid; + gid_t cf_gid; + umode_t cf_mode; + dev_t cf_rdev; + unsigned int cf_nlink; + unsigned int cf_dtype; + struct timespec cf_atime; + struct timespec cf_mtime; + struct timespec cf_ctime; +}; + +static inline void free_dfs_info_param(struct dfs_info3_param *param) +{ + if (param) { + kfree(param->path_name); + kfree(param->node_name); + kfree(param); + } +} + +static inline void free_dfs_info_array(struct dfs_info3_param *param, + int number_of_items) +{ + int i; + if ((number_of_items == 0) || (param == NULL)) + return; + for (i = 0; i < number_of_items; i++) { + kfree(param[i].path_name); + kfree(param[i].node_name); + } + kfree(param); +} + +#define MID_FREE 0 +#define MID_REQUEST_ALLOCATED 1 +#define MID_REQUEST_SUBMITTED 2 +#define MID_RESPONSE_RECEIVED 4 +#define MID_RETRY_NEEDED 8 /* session closed while this request out */ +#define MID_RESPONSE_MALFORMED 0x10 +#define MID_SHUTDOWN 0x20 + +/* Types of response buffer returned from SendReceive2 */ +#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ +#define CIFS_SMALL_BUFFER 1 +#define CIFS_LARGE_BUFFER 2 +#define CIFS_IOVEC 4 /* array of response buffers */ + +/* Type of Request to SendReceive2 */ +#define CIFS_BLOCKING_OP 1 /* operation can block */ +#define CIFS_ASYNC_OP 2 /* do not wait for response */ +#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */ +#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if 
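Since mid_callback_t is only described by the comment above, a minimal conforming callback is sketched here. example_mid_done and the use of a struct completion as callback_data are illustrative assumptions; real submitters wire up their own state.

static void example_mid_done(struct mid_q_entry *mid)
{
	struct completion *done = mid->callback_data;	/* set by the submitter */

	/* runs in cifsd context with no locks held and the mid already
	   unlinked from the pending list, so waking a waiter is safe */
	complete(done);
}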
non-zero */ +#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ +#define CIFS_NO_RESP 0x040 /* no response buffer required */ + +/* Security Flags: indicate type of session setup needed */ +#define CIFSSEC_MAY_SIGN 0x00001 +#define CIFSSEC_MAY_NTLM 0x00002 +#define CIFSSEC_MAY_NTLMV2 0x00004 +#define CIFSSEC_MAY_KRB5 0x00008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MAY_LANMAN 0x00010 +#define CIFSSEC_MAY_PLNTXT 0x00020 +#else +#define CIFSSEC_MAY_LANMAN 0 +#define CIFSSEC_MAY_PLNTXT 0 +#endif /* weak passwords */ +#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ + +#define CIFSSEC_MUST_SIGN 0x01001 +/* note that only one of the following can be set so the +result of setting MUST flags more than once will be to +require use of the stronger protocol */ +#define CIFSSEC_MUST_NTLM 0x02002 +#define CIFSSEC_MUST_NTLMV2 0x04004 +#define CIFSSEC_MUST_KRB5 0x08008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MUST_LANMAN 0x10010 +#define CIFSSEC_MUST_PLNTXT 0x20020 +#ifdef CONFIG_CIFS_UPCALL +#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */ +#else +#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */ +#endif /* UPCALL */ +#else /* do not allow weak pw hash */ +#ifdef CONFIG_CIFS_UPCALL +#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ +#else +#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ +#endif /* UPCALL */ +#endif /* WEAK_PW_HASH */ +#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ + +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) +#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) +#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) +/* + ***************************************************************** + * All constants go here + ***************************************************************** + */ + +#define UID_HASH (16) + +/* + * Note that ONE module should define _DECLARE_GLOBALS_HERE to cause the + * following to be declared. + */ + +/**************************************************************************** + * Locking notes. All updates to global variables and lists should be + * protected by spinlocks or semaphores. + * + * Spinlocks + * --------- + * GlobalMid_Lock protects: + * list operations on pending_mid_q and oplockQ + * updates to XID counters, multiplex id and SMB sequence numbers + * cifs_file_list_lock protects: + * list operations on tcp and SMB session lists and tCon lists + * f_owner.lock protects certain per file struct operations + * mapping->page_lock protects certain per page operations + * + * Semaphores + * ---------- + * sesSem operations on smb session + * tconSem operations on tree connection + * fh_sem file handle reconnection operations + * + ****************************************************************************/ + +#ifdef DECLARE_GLOBALS_HERE +#define GLOBAL_EXTERN +#else +#define GLOBAL_EXTERN extern +#endif + +/* + * the list of TCP_Server_Info structures, ie each of the sockets + * connecting our client to a distinct server (ip address), is + * chained together by cifs_tcp_ses_list. 
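Note that each MUST flag above embeds its matching MAY bit (CIFSSEC_MUST_NTLMV2 is 0x04004 and so contains CIFSSEC_MAY_NTLMV2, 0x00004), which is why stacking several MUST flags degenerates to requiring the strongest protocol. A hedged one-line illustration (example_ntlmv2_allowed is hypothetical):

static bool example_ntlmv2_allowed(unsigned int secflags)
{
	/* true both for CIFSSEC_MAY_NTLMV2 and for CIFSSEC_MUST_NTLMV2 */
	return (secflags & CIFSSEC_MAY_NTLMV2) != 0;
}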
The list of all our SMB + * sessions (and from that the tree connections) can be found + * by iterating over cifs_tcp_ses_list + */ +GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; + +/* + * This lock protects the cifs_tcp_ses_list, the list of smb sessions per + * tcp session, and the list of tcon's per smb session. It also protects + * the reference counters for the server, smb session, and tcon. Finally, + * changes to the tcon->tidStatus should be done while holding this lock. + */ +GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; + +/* + * This lock protects the cifs_file->llist and cifs_file->flist + * list operations, and updates to some flags (cifs_file->invalidHandle) + * It will be moved to either use the tcon->stat_lock or equivalent later. + * If cifs_tcp_ses_lock and the lock below are both needed to be held, then + * the cifs_tcp_ses_lock must be grabbed first and released last. + */ +GLOBAL_EXTERN spinlock_t cifs_file_list_lock; + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +/* Outstanding dir notify requests */ +GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; +/* DirNotify response queue */ +GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ + +/* + * Global transaction id (XID) information + */ +GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ + /* on midQ entries */ +/* + * Global counters, updated atomically + */ +GLOBAL_EXTERN atomic_t sesInfoAllocCount; +GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesAllocCount; +GLOBAL_EXTERN atomic_t tcpSesReconnectCount; +GLOBAL_EXTERN atomic_t tconInfoReconnectCount; + +/* Various Debug counters */ +GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ +#ifdef CONFIG_CIFS_STATS2 +GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ +GLOBAL_EXTERN atomic_t totSmBufAllocCount; +#endif +GLOBAL_EXTERN atomic_t smBufAllocCount; +GLOBAL_EXTERN atomic_t midCount; + +/* Misc globals */ +GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions + to be established on existing mount if we + have the uid/password or Kerberos credential + or equivalent for current user */ +GLOBAL_EXTERN unsigned int oplockEnabled; +GLOBAL_EXTERN unsigned int lookupCacheEnabled; +GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent + with more secure ntlmssp2 challenge/resp */ +GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ +GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ +GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ +GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ + +/* reconnect after this many failed echo attempts */ +GLOBAL_EXTERN unsigned short echo_retries; + +GLOBAL_EXTERN struct rb_root uidtree; +GLOBAL_EXTERN struct rb_root gidtree; +GLOBAL_EXTERN spinlock_t siduidlock; +GLOBAL_EXTERN spinlock_t sidgidlock; + +void cifs_oplock_break(struct work_struct *work); +void cifs_oplock_break_get(struct cifsFileInfo 
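The comment above fixes an ordering between the two global spinlocks; as a sketch of the documented order (outer cifs_tcp_ses_lock taken first, inner cifs_file_list_lock second, released in reverse), with the list walk itself elided:

static void example_walk_open_files(void)
{
	spin_lock(&cifs_tcp_ses_lock);		/* take first ... */
	spin_lock(&cifs_file_list_lock);
	/* ... walk server -> session -> tcon -> openFileList here ... */
	spin_unlock(&cifs_file_list_lock);
	spin_unlock(&cifs_tcp_ses_lock);	/* ... release last */
}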
*cfile); +void cifs_oplock_break_put(struct cifsFileInfo *cfile); + +extern const struct slow_work_ops cifs_oplock_break_ops; + +#endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h new file mode 100644 index 00000000..de3aa285 --- /dev/null +++ b/fs/cifs/cifspdu.h @@ -0,0 +1,2641 @@ +/* + * fs/cifs/cifspdu.h + * + * Copyright (c) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSPDU_H +#define _CIFSPDU_H + +#include +#include +#include "smbfsctl.h" + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define LANMAN_PROT 0 +#define LANMAN2_PROT 1 +#define CIFS_PROT 2 +#else +#define CIFS_PROT 0 +#endif +#define POSIX_PROT (CIFS_PROT+1) +#define BAD_PROT 0xFFFF + +/* SMB command codes: + * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses + * (ie which include no useful data other than the SMB error code itself). + * This can allow us to avoid response buffer allocations and copy in some cases + */ +#define SMB_COM_CREATE_DIRECTORY 0x00 /* trivial response */ +#define SMB_COM_DELETE_DIRECTORY 0x01 /* trivial response */ +#define SMB_COM_CLOSE 0x04 /* triv req/rsp, timestamp ignored */ +#define SMB_COM_FLUSH 0x05 /* triv req/rsp */ +#define SMB_COM_DELETE 0x06 /* trivial response */ +#define SMB_COM_RENAME 0x07 /* trivial response */ +#define SMB_COM_QUERY_INFORMATION 0x08 /* aka getattr */ +#define SMB_COM_SETATTR 0x09 /* trivial response */ +#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ +#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ +#define SMB_COM_ECHO 0x2B /* echo request */ +#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ +#define SMB_COM_READ_ANDX 0x2E +#define SMB_COM_WRITE_ANDX 0x2F +#define SMB_COM_TRANSACTION2 0x32 +#define SMB_COM_TRANSACTION2_SECONDARY 0x33 +#define SMB_COM_FIND_CLOSE2 0x34 /* trivial response */ +#define SMB_COM_TREE_DISCONNECT 0x71 /* trivial response */ +#define SMB_COM_NEGOTIATE 0x72 +#define SMB_COM_SESSION_SETUP_ANDX 0x73 +#define SMB_COM_LOGOFF_ANDX 0x74 /* trivial response */ +#define SMB_COM_TREE_CONNECT_ANDX 0x75 +#define SMB_COM_NT_TRANSACT 0xA0 +#define SMB_COM_NT_TRANSACT_SECONDARY 0xA1 +#define SMB_COM_NT_CREATE_ANDX 0xA2 +#define SMB_COM_NT_CANCEL 0xA4 /* no response */ +#define SMB_COM_NT_RENAME 0xA5 /* trivial response */ + +/* Transact2 subcommand codes */ +#define TRANS2_OPEN 0x00 +#define TRANS2_FIND_FIRST 0x01 +#define TRANS2_FIND_NEXT 0x02 +#define TRANS2_QUERY_FS_INFORMATION 0x03 +#define TRANS2_SET_FS_INFORMATION 0x04 +#define TRANS2_QUERY_PATH_INFORMATION 0x05 +#define TRANS2_SET_PATH_INFORMATION 0x06 +#define TRANS2_QUERY_FILE_INFORMATION 0x07 +#define TRANS2_SET_FILE_INFORMATION 0x08 +#define TRANS2_GET_DFS_REFERRAL 0x10 +#define 
TRANS2_REPORT_DFS_INCOSISTENCY 0x11 + +/* SMB Transact (Named Pipe) subcommand codes */ +#define TRANS_SET_NMPIPE_STATE 0x0001 +#define TRANS_RAW_READ_NMPIPE 0x0011 +#define TRANS_QUERY_NMPIPE_STATE 0x0021 +#define TRANS_QUERY_NMPIPE_INFO 0x0022 +#define TRANS_PEEK_NMPIPE 0x0023 +#define TRANS_TRANSACT_NMPIPE 0x0026 +#define TRANS_RAW_WRITE_NMPIPE 0x0031 +#define TRANS_READ_NMPIPE 0x0036 +#define TRANS_WRITE_NMPIPE 0x0037 +#define TRANS_WAIT_NMPIPE 0x0053 +#define TRANS_CALL_NMPIPE 0x0054 + +/* NT Transact subcommand codes */ +#define NT_TRANSACT_CREATE 0x01 +#define NT_TRANSACT_IOCTL 0x02 +#define NT_TRANSACT_SET_SECURITY_DESC 0x03 +#define NT_TRANSACT_NOTIFY_CHANGE 0x04 +#define NT_TRANSACT_RENAME 0x05 +#define NT_TRANSACT_QUERY_SECURITY_DESC 0x06 +#define NT_TRANSACT_GET_USER_QUOTA 0x07 +#define NT_TRANSACT_SET_USER_QUOTA 0x08 + +#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ +/* future chained NTCreateXReadX bigger, but for time being NTCreateX biggest */ +/* among the requests (NTCreateX response is bigger with wct of 34) */ +#define MAX_CIFS_HDR_SIZE 0x58 /* 4 len + 32 hdr + (2*24 wct) + 2 bct + 2 pad */ +#define CIFS_SMALL_PATH 120 /* allows for (448-88)/3 */ + +/* internal cifs vfs structures */ +/***************************************************************** + * All constants go here + ***************************************************************** + */ + +/* + * Starting value for maximum SMB size negotiation + */ +#define CIFS_MAX_MSGSIZE (4*4096) + +/* + * Size of encrypted user password in bytes + */ +#define CIFS_ENCPWD_SIZE (16) + +/* + * Size of the crypto key returned on the negotiate SMB in bytes + */ +#define CIFS_CRYPTO_KEY_SIZE (8) + +/* + * Size of the ntlm client response + */ +#define CIFS_AUTH_RESP_SIZE (24) + +/* + * Size of the session key (crypto key encrypted with the password + */ +#define CIFS_SESS_KEY_SIZE (16) + +#define CIFS_CLIENT_CHALLENGE_SIZE (8) +#define CIFS_SERVER_CHALLENGE_SIZE (8) +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) +#define CIFS_NTHASH_SIZE (16) + +/* + * Maximum user name length + */ +#define CIFS_UNLEN (20) + +/* + * Flags on SMB open + */ +#define SMBOPEN_WRITE_THROUGH 0x4000 +#define SMBOPEN_DENY_ALL 0x0010 +#define SMBOPEN_DENY_WRITE 0x0020 +#define SMBOPEN_DENY_READ 0x0030 +#define SMBOPEN_DENY_NONE 0x0040 +#define SMBOPEN_READ 0x0000 +#define SMBOPEN_WRITE 0x0001 +#define SMBOPEN_READWRITE 0x0002 +#define SMBOPEN_EXECUTE 0x0003 + +#define SMBOPEN_OCREATE 0x0010 +#define SMBOPEN_OTRUNC 0x0002 +#define SMBOPEN_OAPPEND 0x0001 + +/* + * SMB flag definitions + */ +#define SMBFLG_EXTD_LOCK 0x01 /* server supports lock-read write-unlock smb */ +#define SMBFLG_RCV_POSTED 0x02 /* obsolete */ +#define SMBFLG_RSVD 0x04 +#define SMBFLG_CASELESS 0x08 /* all pathnames treated as caseless (off + implies case sensitive file handling request) */ +#define SMBFLG_CANONICAL_PATH_FORMAT 0x10 /* obsolete */ +#define SMBFLG_OLD_OPLOCK 0x20 /* obsolete */ +#define SMBFLG_OLD_OPLOCK_NOTIFY 0x40 /* obsolete */ +#define SMBFLG_RESPONSE 0x80 /* this PDU is a response from server */ + +/* + * SMB flag2 definitions + */ +#define SMBFLG2_KNOWS_LONG_NAMES cpu_to_le16(1) /* can send long (non-8.3) + path names in response */ +#define SMBFLG2_KNOWS_EAS cpu_to_le16(2) +#define SMBFLG2_SECURITY_SIGNATURE cpu_to_le16(4) +#define SMBFLG2_COMPRESSED (8) +#define SMBFLG2_SECURITY_SIGNATURE_REQUIRED (0x10) +#define SMBFLG2_IS_LONG_NAME cpu_to_le16(0x40) +#define SMBFLG2_REPARSE_PATH (0x400) +#define SMBFLG2_EXT_SEC 
cpu_to_le16(0x800) +#define SMBFLG2_DFS cpu_to_le16(0x1000) +#define SMBFLG2_PAGING_IO cpu_to_le16(0x2000) +#define SMBFLG2_ERR_STATUS cpu_to_le16(0x4000) +#define SMBFLG2_UNICODE cpu_to_le16(0x8000) + +/* + * These are the file access permission bits defined in CIFS for the + * NTCreateAndX as well as the level 0x107 + * TRANS2_QUERY_PATH_INFORMATION API. The level 0x107, SMB_QUERY_FILE_ALL_INFO + * responds with the AccessFlags. + * The AccessFlags specifies the access permissions a caller has to the + * file and can have any suitable combination of the following values: + */ + +#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */ +#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */ +#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */ +#define FILE_READ_EA 0x00000008 /* Extended attributes associated */ + /* with the file can be read */ +#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */ + /* with the file can be written */ +#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */ + /* the file using system paging I/O */ +#define FILE_DELETE_CHILD 0x00000040 +#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */ + /* file can be read */ +#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */ + /* file can be written */ +#define DELETE 0x00010000 /* The file can be deleted */ +#define READ_CONTROL 0x00020000 /* The access control list and */ + /* ownership associated with the */ + /* file can be read */ +#define WRITE_DAC 0x00040000 /* The access control list and */ + /* ownership associated with the */ + /* file can be written. */ +#define WRITE_OWNER 0x00080000 /* Ownership information associated */ + /* with the file can be written */ +#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */ + /* synchronize with the completion */ + /* of an input/output request */ +#define GENERIC_ALL 0x10000000 +#define GENERIC_EXECUTE 0x20000000 +#define GENERIC_WRITE 0x40000000 +#define GENERIC_READ 0x80000000 + /* In summary - Relevant file */ + /* access flags from CIFS are */ + /* file_read_data, file_write_data */ + /* file_execute, file_read_attributes*/ + /* write_dac, and delete. 
*/ + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_WRITE_EA \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_READ_EA | FILE_WRITE_EA \ + | FILE_DELETE_CHILD | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) + +#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \ + | READ_CONTROL | SYNCHRONIZE) + + +/* + * Invalid readdir handle + */ +#define CIFS_NO_HANDLE 0xFFFF + +#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL +#define NO_CHANGE_32 0xFFFFFFFFUL + +/* IPC$ in ASCII */ +#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24" + +/* IPC$ in Unicode */ +#define CIFS_IPC_UNICODE_RESOURCE "\x00\x49\x00\x50\x00\x43\x00\x24\x00\x00" + +/* Unicode Null terminate 2 bytes of 0 */ +#define UNICODE_NULL "\x00\x00" +#define ASCII_NULL 0x00 + +/* + * Server type values (returned on EnumServer API + */ +#define CIFS_SV_TYPE_DC 0x00000008 +#define CIFS_SV_TYPE_BACKDC 0x00000010 + +/* + * Alias type flags (From EnumAlias API call + */ +#define CIFS_ALIAS_TYPE_FILE 0x0001 +#define CIFS_SHARE_TYPE_FILE 0x0000 + +/* + * File Attribute flags + */ +#define ATTR_READONLY 0x0001 +#define ATTR_HIDDEN 0x0002 +#define ATTR_SYSTEM 0x0004 +#define ATTR_VOLUME 0x0008 +#define ATTR_DIRECTORY 0x0010 +#define ATTR_ARCHIVE 0x0020 +#define ATTR_DEVICE 0x0040 +#define ATTR_NORMAL 0x0080 +#define ATTR_TEMPORARY 0x0100 +#define ATTR_SPARSE 0x0200 +#define ATTR_REPARSE 0x0400 +#define ATTR_COMPRESSED 0x0800 +#define ATTR_OFFLINE 0x1000 /* ie file not immediately available - + on offline storage */ +#define ATTR_NOT_CONTENT_INDEXED 0x2000 +#define ATTR_ENCRYPTED 0x4000 +#define ATTR_POSIX_SEMANTICS 0x01000000 +#define ATTR_BACKUP_SEMANTICS 0x02000000 +#define ATTR_DELETE_ON_CLOSE 0x04000000 +#define ATTR_SEQUENTIAL_SCAN 0x08000000 +#define ATTR_RANDOM_ACCESS 0x10000000 +#define ATTR_NO_BUFFERING 0x20000000 +#define ATTR_WRITE_THROUGH 0x80000000 + +/* ShareAccess flags */ +#define FILE_NO_SHARE 0x00000000 +#define FILE_SHARE_READ 0x00000001 +#define FILE_SHARE_WRITE 0x00000002 +#define FILE_SHARE_DELETE 0x00000004 +#define FILE_SHARE_ALL 0x00000007 + +/* CreateDisposition flags, similar to CreateAction as well */ +#define FILE_SUPERSEDE 0x00000000 +#define FILE_OPEN 0x00000001 +#define FILE_CREATE 0x00000002 +#define FILE_OPEN_IF 0x00000003 +#define FILE_OVERWRITE 0x00000004 +#define FILE_OVERWRITE_IF 0x00000005 + +/* CreateOptions */ +#define CREATE_NOT_FILE 0x00000001 /* if set must not be file */ +#define CREATE_WRITE_THROUGH 0x00000002 +#define CREATE_SEQUENTIAL 0x00000004 +#define CREATE_NO_BUFFER 0x00000008 /* should not buffer on srv */ +#define CREATE_SYNC_ALERT 0x00000010 /* MBZ */ +#define CREATE_ASYNC_ALERT 0x00000020 /* MBZ */ +#define CREATE_NOT_DIR 0x00000040 /* if set must not be directory */ +#define CREATE_TREE_CONNECTION 0x00000080 /* should be zero */ +#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ +#define 
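Because FILE_READ_RIGHTS/FILE_WRITE_RIGHTS are composites of the single access bits above, a caller testing the AccessFlags returned by SMB_QUERY_FILE_ALL_INFO would typically mask and compare, roughly as in this hedged sketch (example_may_write is not a real helper):

static bool example_may_write(__le32 access_flags)
{
	__u32 granted = le32_to_cpu(access_flags);

	return (granted & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS;
}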
CREATE_NO_EA_KNOWLEDGE 0x00000200 +#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete + "open for recovery" flag should + be zero in any case */ +#define CREATE_OPEN_FOR_RECOVERY 0x00000400 +#define CREATE_RANDOM_ACCESS 0x00000800 +#define CREATE_DELETE_ON_CLOSE 0x00001000 +#define CREATE_OPEN_BY_ID 0x00002000 +#define CREATE_OPEN_BACKUP_INTENT 0x00004000 +#define CREATE_NO_COMPRESSION 0x00008000 +#define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ +#define OPEN_REPARSE_POINT 0x00200000 +#define OPEN_NO_RECALL 0x00400000 +#define OPEN_FREE_SPACE_QUERY 0x00800000 /* should be zero */ +#define CREATE_OPTIONS_MASK 0x007FFFFF +#define CREATE_OPTION_READONLY 0x10000000 +#define CREATE_OPTION_SPECIAL 0x20000000 /* system. NB not sent over wire */ + +/* ImpersonationLevel flags */ +#define SECURITY_ANONYMOUS 0 +#define SECURITY_IDENTIFICATION 1 +#define SECURITY_IMPERSONATION 2 +#define SECURITY_DELEGATION 3 + +/* SecurityFlags */ +#define SECURITY_CONTEXT_TRACKING 0x01 +#define SECURITY_EFFECTIVE_ONLY 0x02 + +/* + * Default PID value, used in all SMBs where the PID is not important + */ +#define CIFS_DFT_PID 0x1234 + +/* + * We use the same routine for Copy and Move SMBs. This flag is used to + * distinguish + */ +#define CIFS_COPY_OP 1 +#define CIFS_RENAME_OP 2 + +#define GETU16(var) (*((__u16 *)var)) /* BB check for endian issues */ +#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */ + +struct smb_hdr { + __be32 smb_buf_length; /* BB length is only two (rarely three) bytes, + with one or two byte "type" preceding it that will be + zero - we could mask the type byte off */ + __u8 Protocol[4]; + __u8 Command; + union { + struct { + __u8 ErrorClass; + __u8 Reserved; + __le16 Error; + } __attribute__((packed)) DosError; + __le32 CifsError; + } __attribute__((packed)) Status; + __u8 Flags; + __le16 Flags2; /* note: le */ + __le16 PidHigh; + union { + struct { + __le32 SequenceNumber; /* le */ + __u32 Reserved; /* zero */ + } __attribute__((packed)) Sequence; + __u8 SecuritySignature[8]; /* le */ + } __attribute__((packed)) Signature; + __u8 pad[2]; + __u16 Tid; + __le16 Pid; + __u16 Uid; + __u16 Mid; + __u8 WordCount; +} __attribute__((packed)); + +/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */ +static inline void * +BCC(struct smb_hdr *smb) +{ + return (void *)smb + sizeof(*smb) + 2 * smb->WordCount; +} + +/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ +#define pByteArea(smb_var) (BCC(smb_var) + 2) + +/* get the unconverted ByteCount for a SMB packet and return it */ +static inline __u16 +get_bcc(struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + return get_unaligned_le16(bc_ptr); +} + +/* set the ByteCount for a SMB packet in little-endian */ +static inline void +put_bcc(__u16 count, struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + put_unaligned_le16(count, bc_ptr); +} + +/* + * Computer Name Length (since Netbios name was length 16 with last byte 0x20) + * No longer as important, now that TCP names are more commonly used to + * resolve hosts. + */ +#define CNLEN 15 + +/* + * Share Name Length (SNLEN) + * Note: This length was limited by the SMB used to get + * the Share info. NetShareEnum only returned 13 + * chars, including the null termination. + * This was removed because it no longer is limiting. 
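The BCC()/pByteArea()/get_bcc()/put_bcc() helpers above all locate the byte-count word that sits after the variable-length word area of an SMB. A hedged sketch of stamping the count and finding the data area (example_finish_smb is hypothetical):

static char *example_finish_smb(struct smb_hdr *hdr, __u16 byte_count)
{
	put_bcc(byte_count, hdr);		/* stored little-endian, unaligned-safe */
	WARN_ON(get_bcc(hdr) != byte_count);
	return pByteArea(hdr);			/* data bytes begin right after bcc */
}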
+ */ + +/* + * Comment Length + */ +#define MAXCOMMENTLEN 40 + +/* + * The OS/2 maximum path name + */ +#define MAX_PATHCONF 256 + +/* + * SMB frame definitions (following must be packed structs) + * See the SNIA CIFS Specification for details. + * + * The Naming convention is the lower case version of the + * smb command code name for the struct and this is typedef to the + * uppercase version of the same name with the prefix SMB_ removed + * for brevity. Although typedefs are not commonly used for + * structure definitions in the Linux kernel, their use in the + * CIFS standards document, which this code is based on, may + * make this one of the cases where typedefs for structures make + * sense to improve readability for readers of the standards doc. + * Typedefs can always be removed later if they are too distracting + * and they are only used for the CIFSs PDUs themselves, not + * internal cifs vfs structures + * + */ + +typedef struct negotiate_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + unsigned char DialectsArray[1]; +} __attribute__((packed)) NEGOTIATE_REQ; + +/* Dialect index is 13 for LANMAN */ + +#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ + +typedef struct lanman_neg_rsp { + struct smb_hdr hdr; /* wct = 13 */ + __le16 DialectIndex; + __le16 SecurityMode; + __le16 MaxBufSize; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le16 RawMode; + __le32 SessionKey; + struct { + __le16 Time; + __le16 Date; + } __attribute__((packed)) SrvTime; + __le16 ServerTimeZone; + __le16 EncryptionKeyLength; + __le16 Reserved; + __u16 ByteCount; + unsigned char EncryptionKey[1]; +} __attribute__((packed)) LANMAN_NEG_RSP; + +#define READ_RAW_ENABLE 1 +#define WRITE_RAW_ENABLE 2 +#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) + +typedef struct negotiate_rsp { + struct smb_hdr hdr; /* wct = 17 */ + __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ + __u8 SecurityMode; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le32 MaxBufferSize; + __le32 MaxRawSize; + __le32 SessionKey; + __le32 Capabilities; /* see below */ + __le32 SystemTimeLow; + __le32 SystemTimeHigh; + __le16 ServerTimeZone; + __u8 EncryptionKeyLength; + __u16 ByteCount; + union { + unsigned char EncryptionKey[1]; /* cap extended security off */ + /* followed by Domain name - if extended security is off */ + /* followed by 16 bytes of server GUID */ + /* then security blob if cap_extended_security negotiated */ + struct { + unsigned char GUID[16]; + unsigned char SecurityBlob[1]; + } __attribute__((packed)) extended_response; + } __attribute__((packed)) u; +} __attribute__((packed)) NEGOTIATE_RSP; + +/* SecurityMode bits */ +#define SECMODE_USER 0x01 /* off indicates share level security */ +#define SECMODE_PW_ENCRYPT 0x02 +#define SECMODE_SIGN_ENABLED 0x04 /* SMB security signatures enabled */ +#define SECMODE_SIGN_REQUIRED 0x08 /* SMB security signatures required */ + +/* Negotiate response Capabilities */ +#define CAP_RAW_MODE 0x00000001 +#define CAP_MPX_MODE 0x00000002 +#define CAP_UNICODE 0x00000004 +#define CAP_LARGE_FILES 0x00000008 +#define CAP_NT_SMBS 0x00000010 /* implies CAP_NT_FIND */ +#define CAP_RPC_REMOTE_APIS 0x00000020 +#define CAP_STATUS32 0x00000040 +#define CAP_LEVEL_II_OPLOCKS 0x00000080 +#define CAP_LOCK_AND_READ 0x00000100 +#define CAP_NT_FIND 0x00000200 +#define CAP_DFS 0x00001000 +#define CAP_INFOLEVEL_PASSTHRU 0x00002000 +#define CAP_LARGE_READ_X 0x00004000 +#define CAP_LARGE_WRITE_X 0x00008000 +#define CAP_LWIO 0x00010000 /* support 
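SecurityMode comes back as a single byte in the wct=17 negotiate response, so the SECMODE_* bits above are tested without any endian conversion; a brief hedged sketch (example_signing_required is illustrative only):

static bool example_signing_required(const NEGOTIATE_RSP *rsp)
{
	return (rsp->SecurityMode & SECMODE_SIGN_REQUIRED) != 0;
}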
fctl_srv_req_resume_key */ +#define CAP_UNIX 0x00800000 +#define CAP_COMPRESSED_DATA 0x02000000 +#define CAP_DYNAMIC_REAUTH 0x20000000 +#define CAP_PERSISTENT_HANDLES 0x40000000 +#define CAP_EXTENDED_SECURITY 0x80000000 + +typedef union smb_com_session_setup_andx { + struct { /* request format */ + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 SecurityBlobLength; + __u32 Reserved; + __le32 Capabilities; /* see below */ + __le16 ByteCount; + unsigned char SecurityBlob[1]; /* followed by */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) req; /* NTLM request format (with + extended security */ + + struct { /* request format */ + struct smb_hdr hdr; /* wct = 13 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 CaseInsensitivePasswordLength; /* ASCII password len */ + __le16 CaseSensitivePasswordLength; /* Unicode password length*/ + __u32 Reserved; /* see below */ + __le32 Capabilities; + __le16 ByteCount; + unsigned char CaseInsensitivePassword[1]; /* followed by: */ + /* unsigned char * CaseSensitivePassword; */ + /* STRING AccountName */ + /* STRING PrimaryDomain */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) req_no_secext; /* NTLM request format (without + extended security */ + + struct { /* default (NTLM) response format */ + struct smb_hdr hdr; /* wct = 4 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Action; /* see below */ + __le16 SecurityBlobLength; + __u16 ByteCount; + unsigned char SecurityBlob[1]; /* followed by */ +/* unsigned char * NativeOS; */ +/* unsigned char * NativeLanMan; */ +/* unsigned char * PrimaryDomain; */ + } __attribute__((packed)) resp; /* NTLM response + (with or without extended sec) */ + + struct { /* request format */ + struct smb_hdr hdr; /* wct = 10 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 PasswordLength; + __u32 Reserved; /* encrypt key len and offset */ + __le16 ByteCount; + unsigned char AccountPassword[1]; /* followed by */ + /* STRING AccountName */ + /* STRING PrimaryDomain */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) old_req; /* pre-NTLM (LANMAN2.1) req format */ + + struct { /* default (NTLM) response format */ + struct smb_hdr hdr; /* wct = 3 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Action; /* see below */ + __u16 ByteCount; + unsigned char NativeOS[1]; /* followed by */ +/* unsigned char * NativeLanMan; */ +/* unsigned char * PrimaryDomain; */ + } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ +} __attribute__((packed)) SESSION_SETUP_ANDX; + +/* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */ + +#define NTLMSSP_SERVER_TYPE 1 +#define NTLMSSP_DOMAIN_TYPE 2 +#define NTLMSSP_FQ_DOMAIN_TYPE 3 +#define NTLMSSP_DNS_DOMAIN_TYPE 4 +#define NTLMSSP_DNS_PARENT_TYPE 5 + +struct ntlmssp2_name { + __le16 type; + __le16 length; +/* char name[length]; */ +} __attribute__((packed)); + +struct ntlmv2_resp { + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + __le32 blob_signature; + __u32 reserved; + __le64 time; + __u64 client_chal; /* random */ + __u32 reserved2; + /* array of name entries could 
follow ending in minimum 4 byte struct */ +} __attribute__((packed)); + + +#define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux" + +/* Capabilities bits (for NTLM SessSetup request) */ +#define CAP_UNICODE 0x00000004 +#define CAP_LARGE_FILES 0x00000008 +#define CAP_NT_SMBS 0x00000010 +#define CAP_STATUS32 0x00000040 +#define CAP_LEVEL_II_OPLOCKS 0x00000080 +#define CAP_NT_FIND 0x00000200 /* reserved should be zero + (because NT_SMBs implies the same thing?) */ +#define CAP_BULK_TRANSFER 0x20000000 +#define CAP_EXTENDED_SECURITY 0x80000000 + +/* Action bits */ +#define GUEST_LOGIN 1 + +typedef struct smb_com_tconx_req { + struct smb_hdr hdr; /* wct = 4 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Flags; /* see below */ + __le16 PasswordLength; + __le16 ByteCount; + unsigned char Password[1]; /* followed by */ +/* STRING Path *//* \\server\share name */ + /* STRING Service */ +} __attribute__((packed)) TCONX_REQ; + +typedef struct smb_com_tconx_rsp { + struct smb_hdr hdr; /* wct = 3 , not extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP; + +typedef struct smb_com_tconx_rsp_ext { + struct smb_hdr hdr; /* wct = 7, extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __le32 MaximalShareAccessRights; + __le32 GuestMaximalShareAccessRights; + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP_EXT; + + +/* tree connect Flags */ +#define DISCONNECT_TID 0x0001 +#define TCON_EXTENDED_SIGNATURES 0x0004 +#define TCON_EXTENDED_SECINFO 0x0008 + +/* OptionalSupport bits */ +#define SMB_SUPPORT_SEARCH_BITS 0x0001 /* "must have" directory search bits + (exclusive searches supported) */ +#define SMB_SHARE_IS_IN_DFS 0x0002 +#define SMB_CSC_MASK 0x000C +/* CSC flags defined as follows */ +#define SMB_CSC_CACHE_MANUAL_REINT 0x0000 +#define SMB_CSC_CACHE_AUTO_REINT 0x0004 +#define SMB_CSC_CACHE_VDO 0x0008 +#define SMB_CSC_NO_CACHING 0x000C +#define SMB_UNIQUE_FILE_NAME 0x0010 +#define SMB_EXTENDED_SIGNATURES 0x0020 + +/* services + * + * A: ie disk + * LPT1: ie printer + * IPC ie named pipe + * COMM + * ????? 
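OptionalSupport in the tree connect response carries the offline-caching policy in the SMB_CSC_MASK bits; a hedged decode sketch follows (example_csc_policy is a made-up helper):

static __u16 example_csc_policy(const TCONX_RSP *rsp)
{
	/* one of SMB_CSC_CACHE_MANUAL_REINT, _CACHE_AUTO_REINT,
	   _CACHE_VDO or SMB_CSC_NO_CACHING */
	return le16_to_cpu(rsp->OptionalSupport) & SMB_CSC_MASK;
}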
ie any type + * + */ + +typedef struct smb_com_echo_req { + struct smb_hdr hdr; + __le16 EchoCount; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_REQ; + +typedef struct smb_com_echo_rsp { + struct smb_hdr hdr; + __le16 SequenceNumber; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_RSP; + +typedef struct smb_com_logoff_andx_req { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __u16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOGOFF_ANDX_REQ; + +typedef struct smb_com_logoff_andx_rsp { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __u16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOGOFF_ANDX_RSP; + +typedef union smb_com_tree_disconnect { /* as an altetnative can use flag on + tree_connect PDU to effect disconnect */ + /* tdis is probably simplest SMB PDU */ + struct { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bcc = 0 */ + } __attribute__((packed)) req; + struct { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bcc = 0 */ + } __attribute__((packed)) resp; +} __attribute__((packed)) TREE_DISCONNECT; + +typedef struct smb_com_close_req { + struct smb_hdr hdr; /* wct = 3 */ + __u16 FileID; + __u32 LastWriteTime; /* should be zero or -1 */ + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) CLOSE_REQ; + +typedef struct smb_com_close_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) CLOSE_RSP; + +typedef struct smb_com_flush_req { + struct smb_hdr hdr; /* wct = 1 */ + __u16 FileID; + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) FLUSH_REQ; + +typedef struct smb_com_findclose_req { + struct smb_hdr hdr; /* wct = 1 */ + __u16 FileID; + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) FINDCLOSE_REQ; + +/* OpenFlags */ +#define REQ_MORE_INFO 0x00000001 /* legacy (OPEN_AND_X) only */ +#define REQ_OPLOCK 0x00000002 +#define REQ_BATCHOPLOCK 0x00000004 +#define REQ_OPENDIRONLY 0x00000008 +#define REQ_EXTENDED_INFO 0x00000010 + +/* File type */ +#define DISK_TYPE 0x0000 +#define BYTE_PIPE_TYPE 0x0001 +#define MESSAGE_PIPE_TYPE 0x0002 +#define PRINTER_TYPE 0x0003 +#define COMM_DEV_TYPE 0x0004 +#define UNKNOWN_TYPE 0xFFFF + +/* Device Type or File Status Flags */ +#define NO_EAS 0x0001 +#define NO_SUBSTREAMS 0x0002 +#define NO_REPARSETAG 0x0004 +/* following flags can apply if pipe */ +#define ICOUNT_MASK 0x00FF +#define PIPE_READ_MODE 0x0100 +#define NAMED_PIPE_TYPE 0x0400 +#define PIPE_END_POINT 0x4000 +#define BLOCKING_NAMED_PIPE 0x8000 + +typedef struct smb_com_open_req { /* also handles create */ + struct smb_hdr hdr; /* wct = 24 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 Reserved; /* Must Be Zero */ + __le16 NameLength; + __le32 OpenFlags; + __u32 RootDirectoryFid; + __le32 DesiredAccess; + __le64 AllocationSize; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le32 ImpersonationLevel; + __u8 SecurityFlags; + __le16 ByteCount; + char fileName[1]; +} __attribute__((packed)) OPEN_REQ; + +/* open response: oplock levels */ +#define OPLOCK_NONE 0 +#define OPLOCK_EXCLUSIVE 1 +#define OPLOCK_BATCH 2 +#define OPLOCK_READ 3 /* level 2 oplock */ + +/* open response for CreateAction shifted left */ +#define CIFS_CREATE_ACTION 0x20000 /* file created */ + +typedef struct smb_com_open_rsp { + struct smb_hdr hdr; /* wct = 34 BB */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; 
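The OplockLevel returned in the open response ultimately drives the clientCanCacheRead/clientCanCacheAll booleans kept in cifsInodeInfo earlier in this patch. The mapping below is only a hedged sketch of the usual oplock semantics, not the patch's own code (example_set_oplock is hypothetical): batch and exclusive oplocks permit read and write-behind caching, a level II oplock permits read caching only.

static void example_set_oplock(struct cifsInodeInfo *cinode, __u8 oplock)
{
	cinode->clientCanCacheAll = (oplock == OPLOCK_BATCH ||
				     oplock == OPLOCK_EXCLUSIVE);
	cinode->clientCanCacheRead = cinode->clientCanCacheAll ||
				     (oplock == OPLOCK_READ);
}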
+ __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP; + +typedef struct smb_com_open_rsp_ext { + struct smb_hdr hdr; /* wct = 42 but meaningless due to MS bug? */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; + __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u8 VolumeGUID[16]; + __u64 FileId; /* note no endian conversion - is opaque UniqueID */ + __le32 MaximalAccessRights; + __le32 GuestMaximalAccessRights; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP_EXT; + + +/* format of legacy open request */ +typedef struct smb_com_openx_req { + struct smb_hdr hdr; /* wct = 15 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OpenFlags; + __le16 Mode; + __le16 Sattr; /* search attributes */ + __le16 FileAttributes; /* dos attrs */ + __le32 CreateTime; /* os2 format */ + __le16 OpenFunction; + __le32 EndOfFile; + __le32 Timeout; + __le32 Reserved; + __le16 ByteCount; /* file name follows */ + char fileName[1]; +} __attribute__((packed)) OPENX_REQ; + +typedef struct smb_com_openx_rsp { + struct smb_hdr hdr; /* wct = 15 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le16 FileAttributes; + __le32 LastWriteTime; /* os2 format */ + __le32 EndOfFile; + __le16 Access; + __le16 FileType; + __le16 IPCState; + __le16 Action; + __u32 FileId; + __u16 Reserved; + __u16 ByteCount; +} __attribute__((packed)) OPENX_RSP; + +/* For encoding of POSIX Open Request - see trans2 function 0x209 data struct */ + +/* Legacy write request for older servers */ +typedef struct smb_com_writex_req { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __u32 Reserved; /* Timeout */ + __le16 WriteMode; /* 1 = write through */ + __le16 Remaining; + __le16 Reserved2; + __le16 DataLengthLow; + __le16 DataOffset; + __le16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[0]; +} __attribute__((packed)) WRITEX_REQ; + +typedef struct smb_com_write_req { + struct smb_hdr hdr; /* wct = 14 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __u32 Reserved; + __le16 WriteMode; + __le16 Remaining; + __le16 DataLengthHigh; + __le16 DataLengthLow; + __le16 DataOffset; + __le32 OffsetHigh; + __le16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[0]; +} __attribute__((packed)) WRITE_REQ; + +typedef struct smb_com_write_rsp { + struct smb_hdr hdr; /* wct = 6 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Count; + __le16 Remaining; + __le16 CountHigh; + __u16 Reserved; + __u16 ByteCount; +} __attribute__((packed)) WRITE_RSP; + +/* legacy read request for older servers */ +typedef struct smb_com_readx_req { + struct smb_hdr hdr; /* wct = 10 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __le16 MaxCount; + 
__le16 MinCount; /* obsolete */ + __le32 Reserved; + __le16 Remaining; + __le16 ByteCount; +} __attribute__((packed)) READX_REQ; + +typedef struct smb_com_read_req { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __le16 MaxCount; + __le16 MinCount; /* obsolete */ + __le32 MaxCountHigh; + __le16 Remaining; + __le32 OffsetHigh; + __le16 ByteCount; +} __attribute__((packed)) READ_REQ; + +typedef struct smb_com_read_rsp { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Remaining; + __le16 DataCompactionMode; + __le16 Reserved; + __le16 DataLength; + __le16 DataOffset; + __le16 DataLengthHigh; + __u64 Reserved2; + __u16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[1]; +} __attribute__((packed)) READ_RSP; + +typedef struct locking_andx_range { + __le16 Pid; + __le16 Pad; + __le32 OffsetHigh; + __le32 OffsetLow; + __le32 LengthHigh; + __le32 LengthLow; +} __attribute__((packed)) LOCKING_ANDX_RANGE; + +#define LOCKING_ANDX_SHARED_LOCK 0x01 +#define LOCKING_ANDX_OPLOCK_RELEASE 0x02 +#define LOCKING_ANDX_CHANGE_LOCKTYPE 0x04 +#define LOCKING_ANDX_CANCEL_LOCK 0x08 +#define LOCKING_ANDX_LARGE_FILES 0x10 /* always on for us */ + +typedef struct smb_com_lock_req { + struct smb_hdr hdr; /* wct = 8 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __u8 LockType; + __u8 OplockLevel; + __le32 Timeout; + __le16 NumberOfUnlocks; + __le16 NumberOfLocks; + __le16 ByteCount; + LOCKING_ANDX_RANGE Locks[1]; +} __attribute__((packed)) LOCK_REQ; + +/* lock type */ +#define CIFS_RDLCK 0 +#define CIFS_WRLCK 1 +#define CIFS_UNLCK 2 +typedef struct cifs_posix_lock { + __le16 lock_type; /* 0 = Read, 1 = Write, 2 = Unlock */ + __le16 lock_flags; /* 1 = Wait (only valid for setlock) */ + __le32 pid; + __le64 start; + __le64 length; + /* BB what about additional owner info to identify network client */ +} __attribute__((packed)) CIFS_POSIX_LOCK; + +typedef struct smb_com_lock_rsp { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOCK_RSP; + +typedef struct smb_com_rename_req { + struct smb_hdr hdr; /* wct = 1 */ + __le16 SearchAttributes; /* target file attributes */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName */ +} __attribute__((packed)) RENAME_REQ; + + /* copy request flags */ +#define COPY_MUST_BE_FILE 0x0001 +#define COPY_MUST_BE_DIR 0x0002 +#define COPY_TARGET_MODE_ASCII 0x0004 /* if not set, binary */ +#define COPY_SOURCE_MODE_ASCII 0x0008 /* if not set, binary */ +#define COPY_VERIFY_WRITES 0x0010 +#define COPY_TREE 0x0020 + +typedef struct smb_com_copy_req { + struct smb_hdr hdr; /* wct = 3 */ + __u16 Tid2; + __le16 OpenFunction; + __le16 Flags; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName string */ +} __attribute__((packed)) COPY_REQ; + +typedef struct smb_com_copy_rsp { + struct smb_hdr hdr; /* wct = 1 */ + __le16 CopyCount; /* number of files copied */ + __u16 ByteCount; /* may be zero */ + __u8 BufferFormat; /* 0x04 - only present if errored file follows */ + unsigned char ErrorFileName[1]; /* only present if error in copy */ +} 
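Byte-range locks travel as 64-bit offsets split across the high/low halves of LOCKING_ANDX_RANGE (the LOCKING_ANDX_LARGE_FILES form). A hedged encoding sketch, with example_fill_range as a hypothetical helper:

static void example_fill_range(LOCKING_ANDX_RANGE *r, __u16 pid,
			       __u64 offset, __u64 length)
{
	r->Pid        = cpu_to_le16(pid);
	r->Pad        = 0;
	r->OffsetHigh = cpu_to_le32((__u32)(offset >> 32));
	r->OffsetLow  = cpu_to_le32((__u32)offset);
	r->LengthHigh = cpu_to_le32((__u32)(length >> 32));
	r->LengthLow  = cpu_to_le32((__u32)length);
}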
__attribute__((packed)) COPY_RSP; + +#define CREATE_HARD_LINK 0x103 +#define MOVEFILE_COPY_ALLOWED 0x0002 +#define MOVEFILE_REPLACE_EXISTING 0x0001 + +typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */ + struct smb_hdr hdr; /* wct = 4 */ + __le16 SearchAttributes; /* target file attributes */ + __le16 Flags; /* spec says Information Level */ + __le32 ClusterCount; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName */ +} __attribute__((packed)) NT_RENAME_REQ; + +typedef struct smb_com_rename_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) RENAME_RSP; + +typedef struct smb_com_delete_file_req { + struct smb_hdr hdr; /* wct = 1 */ + __le16 SearchAttributes; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char fileName[1]; +} __attribute__((packed)) DELETE_FILE_REQ; + +typedef struct smb_com_delete_file_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) DELETE_FILE_RSP; + +typedef struct smb_com_delete_directory_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char DirName[1]; +} __attribute__((packed)) DELETE_DIRECTORY_REQ; + +typedef struct smb_com_delete_directory_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) DELETE_DIRECTORY_RSP; + +typedef struct smb_com_create_directory_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char DirName[1]; +} __attribute__((packed)) CREATE_DIRECTORY_REQ; + +typedef struct smb_com_create_directory_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) CREATE_DIRECTORY_RSP; + +typedef struct smb_com_query_information_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; /* 1 + namelen + 1 */ + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char FileName[1]; +} __attribute__((packed)) QUERY_INFORMATION_REQ; + +typedef struct smb_com_query_information_rsp { + struct smb_hdr hdr; /* wct = 10 */ + __le16 attr; + __le32 last_write_time; + __le32 size; + __u16 reserved[5]; + __le16 ByteCount; /* bcc = 0 */ +} __attribute__((packed)) QUERY_INFORMATION_RSP; + +typedef struct smb_com_setattr_req { + struct smb_hdr hdr; /* wct = 8 */ + __le16 attr; + __le16 time_low; + __le16 time_high; + __le16 reserved[5]; /* must be zero */ + __u16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char fileName[1]; +} __attribute__((packed)) SETATTR_REQ; + +typedef struct smb_com_setattr_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) SETATTR_RSP; + +/* empty wct response to setattr */ + +/*******************************************************/ +/* NT Transact structure definitions follow */ +/* Currently only ioctl, acl (get security descriptor) */ +/* and notify are implemented */ +/*******************************************************/ +typedef struct smb_com_ntransact_req { + struct smb_hdr hdr; /* wct >= 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec 
incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + /* SetupCount words follow then */ + __le16 ByteCount; + __u8 Pad[3]; + __u8 Parms[0]; +} __attribute__((packed)) NTRANSACT_REQ; + +typedef struct smb_com_ntransact_rsp { + struct smb_hdr hdr; /* wct = 18 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 0 */ + __u16 ByteCount; + /* __u8 Pad[3]; */ + /* parms and data follow */ +} __attribute__((packed)) NTRANSACT_RSP; + +typedef struct smb_com_transaction_ioctl_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + __le32 FunctionCode; + __u16 Fid; + __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */ + __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ + __le16 ByteCount; + __u8 Pad[3]; + __u8 Data[1]; +} __attribute__((packed)) TRANSACT_IOCTL_REQ; + +typedef struct smb_com_transaction_ioctl_rsp { + struct smb_hdr hdr; /* wct = 19 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 1 */ + __le16 ReturnedDataLen; + __u16 ByteCount; +} __attribute__((packed)) TRANSACT_IOCTL_RSP; + +#define CIFS_ACL_OWNER 1 +#define CIFS_ACL_GROUP 2 +#define CIFS_ACL_DACL 4 +#define CIFS_ACL_SACL 8 + +typedef struct smb_com_transaction_qsec_req { + struct smb_hdr hdr; /* wct = 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* no setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 6 = QUERY_SECURITY_DESC */ + __le16 ByteCount; /* bcc = 3 + 8 */ + __u8 Pad[3]; + __u16 Fid; + __u16 Reserved2; + __le32 AclFlags; +} __attribute__((packed)) QUERY_SEC_DESC_REQ; + + +typedef struct smb_com_transaction_ssec_req { + struct smb_hdr hdr; /* wct = 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* no setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 3 = SET_SECURITY_DESC */ + __le16 ByteCount; /* bcc = 3 + 8 */ + __u8 Pad[3]; + __u16 Fid; + __u16 Reserved2; + __le32 AclFlags; +} __attribute__((packed)) SET_SEC_DESC_REQ; + +typedef struct smb_com_transaction_change_notify_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; 
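The CIFS_ACL_* bits above select which parts of the security descriptor a QUERY_SEC_DESC_REQ asks for; a typical owner+group+DACL request might be filled as in this hedged sketch (example_fill_qsec is hypothetical):

static void example_fill_qsec(QUERY_SEC_DESC_REQ *req, __u16 fid)
{
	req->Fid      = fid;			/* plain __u16 in this struct */
	req->AclFlags = cpu_to_le32(CIFS_ACL_OWNER |
				    CIFS_ACL_GROUP |
				    CIFS_ACL_DACL);
}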
+ __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand;/* 4 = Change Notify */ + __le32 CompletionFilter; /* operation to monitor */ + __u16 Fid; + __u8 WatchTree; /* 1 = Monitor subdirectories */ + __u8 Reserved2; + __le16 ByteCount; +/* __u8 Pad[3];*/ +/* __u8 Data[1];*/ +} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; + +/* BB eventually change to use generic ntransact rsp struct + and validation routine */ +typedef struct smb_com_transaction_change_notify_rsp { + struct smb_hdr hdr; /* wct = 18 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 0 */ + __u16 ByteCount; + /* __u8 Pad[3]; */ +} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_RSP; +/* Completion Filter flags for Notify */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_NAME 0x00000003 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 + +/* response contains array of the following structures */ +struct file_notify_information { + __le32 NextEntryOffset; + __le32 Action; + __le32 FileNameLength; + __u8 FileName[0]; +} __attribute__((packed)); + +struct reparse_data { + __u32 ReparseTag; + __u16 ReparseDataLength; + __u16 Reserved; + __u16 AltNameOffset; + __u16 AltNameLen; + __u16 TargetNameOffset; + __u16 TargetNameLen; + char LinkNamesBuf[1]; +} __attribute__((packed)); + +struct cifs_quota_data { + __u32 rsrvd1; /* 0 */ + __u32 sid_size; + __u64 rsrvd2; /* 0 */ + __u64 space_used; + __u64 soft_limit; + __u64 hard_limit; + char sid[1]; /* variable size? */ +} __attribute__((packed)); + +/* quota sub commands */ +#define QUOTA_LIST_CONTINUE 0 +#define QUOTA_LIST_START 0x100 +#define QUOTA_FOR_SID 0x101 + +struct trans2_req { + /* struct smb_hdr hdr precedes. Set wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* 1st setup word - SetupCount words follow */ + __le16 ByteCount; +} __attribute__((packed)); + +struct smb_t2_req { + struct smb_hdr hdr; + struct trans2_req t2_req; +} __attribute__((packed)); + +struct trans2_resp { + /* struct smb_hdr hdr precedes. 
Note wct = 10 + setup count */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __u16 Reserved; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 ParameterDisplacement; + __le16 DataCount; + __le16 DataOffset; + __le16 DataDisplacement; + __u8 SetupCount; + __u8 Reserved1; + /* SetupWords[SetupCount]; + __u16 ByteCount; + __u16 Reserved2;*/ + /* data area follows */ +} __attribute__((packed)); + +struct smb_t2_rsp { + struct smb_hdr hdr; + struct trans2_resp t2_rsp; +} __attribute__((packed)); + +/* PathInfo/FileInfo infolevels */ +#define SMB_INFO_STANDARD 1 +#define SMB_SET_FILE_EA 2 +#define SMB_QUERY_FILE_EA_SIZE 2 +#define SMB_INFO_QUERY_EAS_FROM_LIST 3 +#define SMB_INFO_QUERY_ALL_EAS 4 +#define SMB_INFO_IS_NAME_VALID 6 +#define SMB_QUERY_FILE_BASIC_INFO 0x101 +#define SMB_QUERY_FILE_STANDARD_INFO 0x102 +#define SMB_QUERY_FILE_EA_INFO 0x103 +#define SMB_QUERY_FILE_NAME_INFO 0x104 +#define SMB_QUERY_FILE_ALLOCATION_INFO 0x105 +#define SMB_QUERY_FILE_END_OF_FILEINFO 0x106 +#define SMB_QUERY_FILE_ALL_INFO 0x107 +#define SMB_QUERY_ALT_NAME_INFO 0x108 +#define SMB_QUERY_FILE_STREAM_INFO 0x109 +#define SMB_QUERY_FILE_COMPRESSION_INFO 0x10B +#define SMB_QUERY_FILE_UNIX_BASIC 0x200 +#define SMB_QUERY_FILE_UNIX_LINK 0x201 +#define SMB_QUERY_POSIX_ACL 0x204 +#define SMB_QUERY_XATTR 0x205 /* e.g. system EA name space */ +#define SMB_QUERY_ATTR_FLAGS 0x206 /* append,immutable etc. */ +#define SMB_QUERY_POSIX_PERMISSION 0x207 +#define SMB_QUERY_POSIX_LOCK 0x208 +/* #define SMB_POSIX_OPEN 0x209 */ +/* #define SMB_POSIX_UNLINK 0x20a */ +#define SMB_QUERY_FILE__UNIX_INFO2 0x20b +#define SMB_QUERY_FILE_INTERNAL_INFO 0x3ee +#define SMB_QUERY_FILE_ACCESS_INFO 0x3f0 +#define SMB_QUERY_FILE_NAME_INFO2 0x3f1 /* 0x30 bytes */ +#define SMB_QUERY_FILE_POSITION_INFO 0x3f6 +#define SMB_QUERY_FILE_MODE_INFO 0x3f8 +#define SMB_QUERY_FILE_ALGN_INFO 0x3f9 + + +#define SMB_SET_FILE_BASIC_INFO 0x101 +#define SMB_SET_FILE_DISPOSITION_INFO 0x102 +#define SMB_SET_FILE_ALLOCATION_INFO 0x103 +#define SMB_SET_FILE_END_OF_FILE_INFO 0x104 +#define SMB_SET_FILE_UNIX_BASIC 0x200 +#define SMB_SET_FILE_UNIX_LINK 0x201 +#define SMB_SET_FILE_UNIX_HLINK 0x203 +#define SMB_SET_POSIX_ACL 0x204 +#define SMB_SET_XATTR 0x205 +#define SMB_SET_ATTR_FLAGS 0x206 /* append, immutable etc. 
*/ +#define SMB_SET_POSIX_LOCK 0x208 +#define SMB_POSIX_OPEN 0x209 +#define SMB_POSIX_UNLINK 0x20a +#define SMB_SET_FILE_UNIX_INFO2 0x20b +#define SMB_SET_FILE_BASIC_INFO2 0x3ec +#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */ +#define SMB_FILE_ALL_INFO2 0x3fa +#define SMB_SET_FILE_ALLOCATION_INFO2 0x3fb +#define SMB_SET_FILE_END_OF_FILE_INFO2 0x3fc +#define SMB_FILE_MOVE_CLUSTER_INFO 0x407 +#define SMB_FILE_QUOTA_INFO 0x408 +#define SMB_FILE_REPARSEPOINT_INFO 0x409 +#define SMB_FILE_MAXIMUM_INFO 0x40d + +/* Find File infolevels */ +#define SMB_FIND_FILE_INFO_STANDARD 0x001 +#define SMB_FIND_FILE_QUERY_EA_SIZE 0x002 +#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003 +#define SMB_FIND_FILE_DIRECTORY_INFO 0x101 +#define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102 +#define SMB_FIND_FILE_NAMES_INFO 0x103 +#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO 0x104 +#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105 +#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106 +#define SMB_FIND_FILE_UNIX 0x202 + +typedef struct smb_com_transaction2_qpi_req { + struct smb_hdr hdr; /* wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __le16 InformationLevel; + __u32 Reserved4; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_QPI_REQ; + +typedef struct smb_com_transaction2_qpi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ +} __attribute__((packed)) TRANSACTION2_QPI_RSP; + +typedef struct smb_com_transaction2_spi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __u16 Pad1; + __le16 InformationLevel; + __u32 Reserved4; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_SPI_REQ; + +typedef struct smb_com_transaction2_spi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ +} __attribute__((packed)) TRANSACTION2_SPI_RSP; + +struct set_file_rename { + __le32 overwrite; /* 1 = overwrite dest */ + __u32 root_fid; /* zero */ + __le32 target_name_len; + char target_name[0]; /* Must be unicode */ +} __attribute__((packed)); + +struct smb_com_transaction2_sfi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __u16 Pad1; + __u16 Fid; + __le16 InformationLevel; + __u16 Reserved4; +} __attribute__((packed)); + +struct 
smb_com_transaction2_sfi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word reserved - + present for infolevels > 100 */ +} __attribute__((packed)); + +struct smb_t2_qfi_req { + struct smb_hdr hdr; + struct trans2_req t2; + __u8 Pad; + __u16 Fid; + __le16 InformationLevel; +} __attribute__((packed)); + +struct smb_t2_qfi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word reserved - + present for infolevels > 100 */ +} __attribute__((packed)); + +/* + * Flags on T2 FINDFIRST and FINDNEXT + */ +#define CIFS_SEARCH_CLOSE_ALWAYS 0x0001 +#define CIFS_SEARCH_CLOSE_AT_END 0x0002 +#define CIFS_SEARCH_RETURN_RESUME 0x0004 +#define CIFS_SEARCH_CONTINUE_FROM_LAST 0x0008 +#define CIFS_SEARCH_BACKUP_SEARCH 0x0010 + +/* + * Size of the resume key on FINDFIRST and FINDNEXT calls + */ +#define CIFS_SMB_RESUME_KEY_SIZE 4 + +typedef struct smb_com_transaction2_ffirst_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_FIND_FIRST */ + __le16 ByteCount; + __u8 Pad; + __le16 SearchAttributes; + __le16 SearchCount; + __le16 SearchFlags; + __le16 InformationLevel; + __le32 SearchStorageType; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_FFIRST_REQ; + +typedef struct smb_com_transaction2_ffirst_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_FFIRST_RSP; + +typedef struct smb_com_transaction2_ffirst_rsp_parms { + __u16 SearchHandle; + __le16 SearchCount; + __le16 EndofSearch; + __le16 EAErrorOffset; + __le16 LastNameOffset; +} __attribute__((packed)) T2_FFIRST_RSP_PARMS; + +typedef struct smb_com_transaction2_fnext_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_FIND_NEXT */ + __le16 ByteCount; + __u8 Pad; + __u16 SearchHandle; + __le16 SearchCount; + __le16 InformationLevel; + __u32 ResumeKey; + __le16 SearchFlags; + char ResumeFileName[1]; +} __attribute__((packed)) TRANSACTION2_FNEXT_REQ; + +typedef struct smb_com_transaction2_fnext_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_FNEXT_RSP; + +typedef struct smb_com_transaction2_fnext_rsp_parms { + __le16 SearchCount; + __le16 EndofSearch; + __le16 EAErrorOffset; + __le16 LastNameOffset; +} __attribute__((packed)) T2_FNEXT_RSP_PARMS; + +/* QFSInfo Levels */ +#define SMB_INFO_ALLOCATION 1 +#define SMB_INFO_VOLUME 2 +#define SMB_QUERY_FS_VOLUME_INFO 0x102 +#define SMB_QUERY_FS_SIZE_INFO 0x103 +#define SMB_QUERY_FS_DEVICE_INFO 0x104 +#define SMB_QUERY_FS_ATTRIBUTE_INFO 0x105 +#define SMB_QUERY_CIFS_UNIX_INFO 0x200 +#define SMB_QUERY_POSIX_FS_INFO 0x201 +#define SMB_QUERY_POSIX_WHO_AM_I 0x202 +#define 
SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 +#define SMB_QUERY_FS_PROXY 0x204 /* WAFS enabled. Returns structure + FILE_SYSTEM__UNIX_INFO to tell + whether new NTIOCTL available + (0xACE) for WAN friendly SMB + operations to be carried */ +#define SMB_QUERY_LABEL_INFO 0x3ea +#define SMB_QUERY_FS_QUOTA_INFO 0x3ee +#define SMB_QUERY_FS_FULL_SIZE_INFO 0x3ef +#define SMB_QUERY_OBJECTID_INFO 0x3f0 + +typedef struct smb_com_transaction2_qfsi_req { + struct smb_hdr hdr; /* wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __le16 InformationLevel; +} __attribute__((packed)) TRANSACTION2_QFSI_REQ; + +typedef struct smb_com_transaction_qfsi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; /* may be three bytes? *//* followed by data area */ +} __attribute__((packed)) TRANSACTION2_QFSI_RSP; + +typedef struct whoami_rsp_data { /* Query level 0x202 */ + __u32 flags; /* 0 = Authenticated user 1 = GUEST */ + __u32 mask; /* which flags bits server understands ie 0x0001 */ + __u64 unix_user_id; + __u64 unix_user_gid; + __u32 number_of_supplementary_gids; /* may be zero */ + __u32 number_of_sids; /* may be zero */ + __u32 length_of_sid_array; /* in bytes - may be zero */ + __u32 pad; /* reserved - MBZ */ + /* __u64 gid_array[0]; */ /* may be empty */ + /* __u8 * psid_list */ /* may be empty */ +} __attribute__((packed)) WHOAMI_RSP_DATA; + +/* SETFSInfo Levels */ +#define SMB_SET_CIFS_UNIX_INFO 0x200 +typedef struct smb_com_transaction2_setfsi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 FileNum; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + __le16 ClientUnixMajor; /* Data start. */ + __le16 ClientUnixMinor; + __le64 ClientUnixCap; /* Data end */ +} __attribute__((packed)) TRANSACTION2_SETFSI_REQ; + +typedef struct smb_com_transaction2_setfsi_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_SETFSI_RSP; + + +typedef struct smb_com_transaction2_get_dfs_refer_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad[3]; /* Win2K has sent 0x0F01 (max response length + perhaps?) 
followed by one byte pad - doesn't + seem to matter though */ + __le16 MaxReferralLevel; + char RequestFileName[1]; +} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; + +#define DFS_VERSION cpu_to_le16(0x0003) + +/* DFS server target type */ +#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */ +#define DFS_TYPE_ROOT 0x0001 + +/* Referral Entry Flags */ +#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */ +#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */ + +typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */ + __le16 VersionNumber; /* must be 3 or 4 */ + __le16 Size; + __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */ + __le16 ReferralEntryFlags; + __le32 TimeToLive; + __le16 DfsPathOffset; + __le16 DfsAlternatePathOffset; + __le16 NetworkAddressOffset; /* offset of the link target */ + __u8 ServiceSiteGuid[16]; /* MBZ, ignored */ +} __attribute__((packed)) REFERRAL3; + +typedef struct smb_com_transaction_get_dfs_refer_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; + __le16 PathConsumed; + __le16 NumberOfReferrals; + __le32 DFSFlags; + REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ + /* followed by the strings pointed to by the referral structures */ +} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP; + +/* DFS Flags */ +#define DFSREF_REFERRAL_SERVER 0x00000001 /* all targets are DFS roots */ +#define DFSREF_STORAGE_SERVER 0x00000002 /* no further ref requests needed */ +#define DFSREF_TARGET_FAILBACK 0x00000004 /* only for DFS referral version 4 */ + +/* + ************************************************************************ + * All structs for everything above the SMB PDUs themselves + * (such as the T2 level specific data) go here + ************************************************************************ + */ + +/* + * Information on a server + */ + +struct serverInfo { + char name[16]; + unsigned char versionMajor; + unsigned char versionMinor; + unsigned long type; + unsigned int commentOffset; +} __attribute__((packed)); + +/* + * The following structure is the format of the data returned on a NetShareEnum + * with level "90" (x5A) + */ + +struct shareInfo { + char shareName[13]; + char pad; + unsigned short type; + unsigned int commentOffset; +} __attribute__((packed)); + +struct aliasInfo { + char aliasName[9]; + char pad; + unsigned int commentOffset; + unsigned char type[2]; +} __attribute__((packed)); + +struct aliasInfo92 { + int aliasNameOffset; + int serverNameOffset; + int shareNameOffset; +} __attribute__((packed)); + +typedef struct { + __le64 TotalAllocationUnits; + __le64 FreeAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __attribute__((packed)) FILE_SYSTEM_INFO; /* size info, level 0x103 */ + +typedef struct { + __le32 fsid; + __le32 SectorsPerAllocationUnit; + __le32 TotalAllocationUnits; + __le32 FreeAllocationUnits; + __le16 BytesPerSector; +} __attribute__((packed)) FILE_SYSTEM_ALLOC_INFO; + +typedef struct { + __le16 MajorVersionNumber; + __le16 MinorVersionNumber; + __le64 Capability; +} __attribute__((packed)) FILE_SYSTEM_UNIX_INFO; /* Unix extension level 0x200*/ + +/* Version numbers for CIFS UNIX major and minor. 
*/ +#define CIFS_UNIX_MAJOR_VERSION 1 +#define CIFS_UNIX_MINOR_VERSION 0 + +/* Linux/Unix extensions capability flags */ +#define CIFS_UNIX_FCNTL_CAP 0x00000001 /* support for fcntl locks */ +#define CIFS_UNIX_POSIX_ACL_CAP 0x00000002 /* support getfacl/setfacl */ +#define CIFS_UNIX_XATTR_CAP 0x00000004 /* support new namespace */ +#define CIFS_UNIX_EXTATTR_CAP 0x00000008 /* support chattr/chflag */ +#define CIFS_UNIX_POSIX_PATHNAMES_CAP 0x00000010 /* Allow POSIX path chars */ +#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based + calls including posix open + and posix unlink */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up + to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and + QFS PROXY call */ +#ifdef CONFIG_CIFS_POSIX +/* Can not set pathnames cap yet until we send new posix create SMB since + otherwise server can treat such handles opened with older ntcreatex + (by a new client which knows how to send posix path ops) + as non-posix handles (can affect write behavior with byte range locks. + We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* #define CIFS_UNIX_CAP_MASK 0x000000fb */ +#define CIFS_UNIX_CAP_MASK 0x000000db +#else +#define CIFS_UNIX_CAP_MASK 0x00000013 +#endif /* CONFIG_CIFS_POSIX */ + + +#define CIFS_POSIX_EXTENSIONS 0x00000010 /* support for new QFSInfo */ + +typedef struct { + /* For undefined recommended transfer size return -1 in that field */ + __le32 OptimalTransferSize; /* bsize on some os, iosize on other os */ + __le32 BlockSize; + /* The next three fields are in terms of the block size. + (above). If block size is unknown, 4096 would be a + reasonable block size for a server to report. + Note that returning the blocks/blocksavail removes need + to make a second call (to QFSInfo level 0x103 to get this info. 
+ UserBlockAvail is typically less than or equal to BlocksAvail, + if no distinction is made return the same value in each */ + __le64 TotalBlocks; + __le64 BlocksAvail; /* bfree */ + __le64 UserBlocksAvail; /* bavail */ + /* For undefined Node fields or FSID return -1 */ + __le64 TotalFileNodes; + __le64 FreeFileNodes; + __le64 FileSysIdentifier; /* fsid */ + /* NB Namelen comes from FILE_SYSTEM_ATTRIBUTE_INFO call */ + /* NB flags can come from FILE_SYSTEM_DEVICE_INFO call */ +} __attribute__((packed)) FILE_SYSTEM_POSIX_INFO; + +/* DeviceType Flags */ +#define FILE_DEVICE_CD_ROM 0x00000002 +#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003 +#define FILE_DEVICE_DFS 0x00000006 +#define FILE_DEVICE_DISK 0x00000007 +#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008 +#define FILE_DEVICE_FILE_SYSTEM 0x00000009 +#define FILE_DEVICE_NAMED_PIPE 0x00000011 +#define FILE_DEVICE_NETWORK 0x00000012 +#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014 +#define FILE_DEVICE_NULL 0x00000015 +#define FILE_DEVICE_PARALLEL_PORT 0x00000016 +#define FILE_DEVICE_PRINTER 0x00000018 +#define FILE_DEVICE_SERIAL_PORT 0x0000001b +#define FILE_DEVICE_STREAMS 0x0000001e +#define FILE_DEVICE_TAPE 0x0000001f +#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020 +#define FILE_DEVICE_VIRTUAL_DISK 0x00000024 +#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 + +typedef struct { + __le32 DeviceType; + __le32 DeviceCharacteristics; +} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */ + +typedef struct { + __le32 Attributes; + __le32 MaxPathNameComponentLength; + __le32 FileSystemNameLen; + char FileSystemName[52]; /* do not have to save this - get subset? */ +} __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO; + +/******************************************************************************/ +/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */ +/******************************************************************************/ +typedef struct { /* data block encoding of response to level 263 QPathInfo */ + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; + __u64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __u64 IndexNumber1; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ + +/* defines for enumerating possible values of the Unix type field below */ +#define UNIX_FILE 0 +#define UNIX_DIR 1 +#define UNIX_SYMLINK 2 +#define UNIX_CHARDEV 3 +#define UNIX_BLOCKDEV 4 +#define UNIX_FIFO 5 +#define UNIX_SOCKET 6 +typedef struct { + __le64 EndOfFile; + __le64 NumOfBytes; + __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */ + __le64 LastAccessTime; + __le64 LastModificationTime; + __le64 Uid; + __le64 Gid; + __le32 Type; + __le64 DevMajor; + __le64 DevMinor; + __le64 UniqueId; + __le64 Permissions; + __le64 Nlinks; +} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ + +typedef struct { + char LinkDest[1]; +} __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ + +/* The following three structures are needed only for + setting time to NT4 and some older servers via + the primitive DOS time format */ +typedef 
struct { + __u16 Day:5; + __u16 Month:4; + __u16 Year:7; +} __attribute__((packed)) SMB_DATE; + +typedef struct { + __u16 TwoSeconds:5; + __u16 Minutes:6; + __u16 Hours:5; +} __attribute__((packed)) SMB_TIME; + +typedef struct { + __le16 CreationDate; /* SMB Date see above */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __le32 EASize; +} __attribute__((packed)) FILE_INFO_STANDARD; /* level 1 SetPath/FileInfo */ + +typedef struct { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad; +} __attribute__((packed)) FILE_BASIC_INFO; /* size info, level 0x101 */ + +struct file_allocation_info { + __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ +} __attribute__((packed)); /* size used on disk, for level 0x103 for set, + 0x105 for query */ + +struct file_end_of_file_info { + __le64 FileSize; /* offset to end of file */ +} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ + +struct file_alt_name_info { + __u8 alt_name[1]; +} __attribute__((packed)); /* level 0x0108 */ + +struct file_stream_info { + __le32 number_of_streams; /* BB check sizes and verify location */ + /* followed by info on streams themselves + u64 size; + u64 allocation_size + stream info */ +}; /* level 0x109 */ + +struct file_compression_info { + __le64 compressed_size; + __le16 format; + __u8 unit_shift; + __u8 ch_shift; + __u8 cl_shift; + __u8 pad[3]; +} __attribute__((packed)); /* level 0x10b */ + +/* POSIX ACL set/query path info structures */ +#define CIFS_ACL_VERSION 1 +struct cifs_posix_ace { /* access control entry (ACE) */ + __u8 cifs_e_tag; + __u8 cifs_e_perm; + __le64 cifs_uid; /* or gid */ +} __attribute__((packed)); + +struct cifs_posix_acl { /* access conrol list (ACL) */ + __le16 version; + __le16 access_entry_count; /* access ACL - count of entries */ + __le16 default_entry_count; /* default ACL - count of entries */ + struct cifs_posix_ace ace_array[0]; + /* followed by + struct cifs_posix_ace default_ace_arraay[] */ +} __attribute__((packed)); /* level 0x204 */ + +/* types of access control entries already defined in posix_acl.h */ +/* #define CIFS_POSIX_ACL_USER_OBJ 0x01 +#define CIFS_POSIX_ACL_USER 0x02 +#define CIFS_POSIX_ACL_GROUP_OBJ 0x04 +#define CIFS_POSIX_ACL_GROUP 0x08 +#define CIFS_POSIX_ACL_MASK 0x10 +#define CIFS_POSIX_ACL_OTHER 0x20 */ + +/* types of perms */ +/* #define CIFS_POSIX_ACL_EXECUTE 0x01 +#define CIFS_POSIX_ACL_WRITE 0x02 +#define CIFS_POSIX_ACL_READ 0x04 */ + +/* end of POSIX ACL definitions */ + +/* POSIX Open Flags */ +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 + +typedef struct { + __le32 OpenFlags; /* same as NT CreateX */ + __le32 PosixOpenFlags; + __le64 Permissions; + __le16 Level; /* reply level requested (see QPathInfo levels) */ +} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */ + +typedef struct { + __le16 OplockFlags; + __u16 Fid; + __le32 CreateAction; + __le16 ReturnedLevel; + __le16 Pad; + /* struct following varies based on requested level */ +} __attribute__((packed)) 
OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */ + +#define SMB_POSIX_UNLINK_FILE_TARGET 0 +#define SMB_POSIX_UNLINK_DIRECTORY_TARGET 1 + +struct unlink_psx_rq { /* level 0x20a SetPathInfo */ + __le16 type; +} __attribute__((packed)); + +struct file_internal_info { + __le64 UniqueId; /* inode number */ +} __attribute__((packed)); /* level 0x3ee */ + +struct file_mode_info { + __le32 Mode; +} __attribute__((packed)); /* level 0x3f8 */ + +struct file_attrib_tag { + __le32 Attribute; + __le32 ReparseTag; +} __attribute__((packed)); /* level 0x40b */ + + +/********************************************************/ +/* FindFirst/FindNext transact2 data buffer formats */ +/********************************************************/ + +typedef struct { + __le32 NextEntryOffset; + __u32 ResumeKey; /* as with FileIndex - no need to convert */ + FILE_UNIX_BASIC_INFO basic; + char FileName[1]; +} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + char FileName[1]; +} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* EA size */ + __le32 Reserved; + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ + char FileName[1]; +} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + __u8 ShortNameLength; + __u8 Reserved; + __u8 ShortName[12]; + char FileName[1]; +} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ + +typedef struct { + __u32 ResumeKey; + __le16 CreationDate; /* SMB Date */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __u8 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ + + +struct win_dev { + unsigned char type[8]; /* IntxCHR or IntxBLK */ + __le64 major; + __le64 minor; +} __attribute__((packed)); + +struct gea { + unsigned char name_len; + char name[1]; +} __attribute__((packed)); + +struct gealist { + unsigned long list_len; + struct gea list[1]; +} __attribute__((packed)); + +struct fea { + unsigned char EA_flags; + __u8 name_len; + __le16 value_len; + char name[1]; + /* optionally followed by value */ +} 
__attribute__((packed)); +/* flags for _FEA.fEA */ +#define FEA_NEEDEA 0x80 /* need EA bit */ + +struct fealist { + __le32 list_len; + struct fea list[1]; +} __attribute__((packed)); + +/* used to hold an arbitrary blob of data */ +struct data_blob { + __u8 *data; + size_t length; + void (*free) (struct data_blob *data_blob); +} __attribute__((packed)); + + +#ifdef CONFIG_CIFS_POSIX +/* + For better POSIX semantics from Linux client, (even better + than the existing CIFS Unix Extensions) we need updated PDUs for: + + 1) PosixCreateX - to set and return the mode, inode#, device info and + perhaps add a CreateDevice - to create Pipes and other special .inodes + Also note POSIX open flags + 2) Close - to return the last write time to do cache across close + more safely + 3) FindFirst return unique inode number - what about resume key, two + forms short (matches readdir) and full (enough info to cache inodes) + 4) Mkdir - set mode + + And under consideration: + 5) FindClose2 (return nanosecond timestamp ??) + 6) Use nanosecond timestamps throughout all time fields if + corresponding attribute flag is set + 7) sendfile - handle based copy + + what about fixing 64 bit alignment + + There are also various legacy SMB/CIFS requests used as is + + From existing Lanman and NTLM dialects: + -------------------------------------- + NEGOTIATE + SESSION_SETUP_ANDX (BB which?) + TREE_CONNECT_ANDX (BB which wct?) + TREE_DISCONNECT (BB add volume timestamp on response) + LOGOFF_ANDX + DELETE (note delete open file behavior) + DELETE_DIRECTORY + READ_AND_X + WRITE_AND_X + LOCKING_AND_X (note posix lock semantics) + RENAME (note rename across dirs and open file rename posix behaviors) + NT_RENAME (for hardlinks) Is this good enough for all features? + FIND_CLOSE2 + TRANSACTION2 (18 cases) + SMB_SET_FILE_END_OF_FILE_INFO2 SMB_SET_PATH_END_OF_FILE_INFO2 + (BB verify that never need to set allocation size) + SMB_SET_FILE_BASIC_INFO2 (setting times - BB can it be done via + Unix ext?) + + COPY (note support for copy across directories) - FUTURE, OPTIONAL + setting/getting OS/2 EAs - FUTURE (BB can this handle + setting Linux xattrs perfectly) - OPTIONAL + dnotify - FUTURE, OPTIONAL + quota - FUTURE, OPTIONAL + + Note that various requests implemented for NT interop such as + NT_TRANSACT (IOCTL) QueryReparseInfo + are unneeded to servers compliant with the CIFS POSIX extensions + + From CIFS Unix Extensions: + ------------------------- + T2 SET_PATH_INFO (SMB_SET_FILE_UNIX_LINK) for symlinks + T2 SET_PATH_INFO (SMB_SET_FILE_BASIC_INFO2) + T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_LINK) + T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_BASIC) BB check for missing + inode fields + Actually a need QUERY_FILE_UNIX_INFO + since has inode num + BB what about a) blksize/blkbits/blocks + b) i_version + c) i_rdev + d) notify mask? + e) generation + f) size_seqcount + T2 FIND_FIRST/FIND_NEXT FIND_FILE_UNIX + TRANS2_GET_DFS_REFERRAL - OPTIONAL but recommended + T2_QFS_INFO QueryDevice/AttributeInfo - OPTIONAL + */ + +/* xsymlink is a symlink format (used by MacOS) that can be used + to save symlink info in a regular file when + mounted to operating systems that do not + support the cifs Unix extensions or EAs (for xattr + based symlinks). 
For such a file to be recognized + as containing symlink data: + + 1) file size must be 1067, + 2) signature must begin file data, + 3) length field must be set to ASCII representation + of a number which is less than or equal to 1024, + 4) md5 must match that of the path data */ + +struct xsymlink { + /* 1067 bytes */ + char signature[4]; /* XSym */ /* not null terminated */ + char cr0; /* \n */ +/* ASCII representation of length (4 bytes decimal) terminated by \n not null */ + char length[4]; + char cr1; /* \n */ +/* md5 of valid subset of path ie path[0] through path[length-1] */ + __u8 md5[32]; + char cr2; /* \n */ +/* if room left, then end with \n then 0x20s by convention but not required */ + char path[1024]; +} __attribute__((packed)); + +typedef struct file_xattr_info { + /* BB do we need another field for flags? BB */ + __u32 xattr_name_len; + __u32 xattr_value_len; + char xattr_name[0]; + /* followed by xattr_value[xattr_value_len], no pad */ +} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info + level 0x205 */ + + +/* flags for chattr command */ +#define EXT_SECURE_DELETE 0x00000001 /* EXT3_SECRM_FL */ +#define EXT_ENABLE_UNDELETE 0x00000002 /* EXT3_UNRM_FL */ +/* Reserved for compress file 0x4 */ +#define EXT_SYNCHRONOUS 0x00000008 /* EXT3_SYNC_FL */ +#define EXT_IMMUTABLE_FL 0x00000010 /* EXT3_IMMUTABLE_FL */ +#define EXT_OPEN_APPEND_ONLY 0x00000020 /* EXT3_APPEND_FL */ +#define EXT_DO_NOT_BACKUP 0x00000040 /* EXT3_NODUMP_FL */ +#define EXT_NO_UPDATE_ATIME 0x00000080 /* EXT3_NOATIME_FL */ +/* 0x100 through 0x800 reserved for compression flags and are GET-ONLY */ +#define EXT_HASH_TREE_INDEXED_DIR 0x00001000 /* GET-ONLY EXT3_INDEX_FL */ +/* 0x2000 reserved for IMAGIC_FL */ +#define EXT_JOURNAL_THIS_FILE 0x00004000 /* GET-ONLY EXT3_JOURNAL_DATA_FL */ +/* 0x8000 reserved for EXT3_NOTAIL_FL */ +#define EXT_SYNCHRONOUS_DIR 0x00010000 /* EXT3_DIRSYNC_FL */ +#define EXT_TOPDIR 0x00020000 /* EXT3_TOPDIR_FL */ + +#define EXT_SET_MASK 0x000300FF +#define EXT_GET_MASK 0x0003DFFF + +typedef struct file_chattr_info { + __le64 mask; /* list of all possible attribute bits */ + __le64 mode; /* list of actual attribute bits on this inode */ +} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes + (chattr, chflags) level 0x206 */ +#endif /* POSIX */ +#endif /* _CIFSPDU_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h new file mode 100644 index 00000000..8df28e92 --- /dev/null +++ b/fs/cifs/cifsproto.h @@ -0,0 +1,460 @@ +/* + * fs/cifs/cifsproto.h + * + * Copyright (c) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFSPROTO_H +#define _CIFSPROTO_H +#include + +struct statfs; +struct smb_vol; + +/* + ***************************************************************** + * All Prototypes + ***************************************************************** + */ + +extern struct smb_hdr *cifs_buf_get(void); +extern void cifs_buf_release(void *); +extern struct smb_hdr *cifs_small_buf_get(void); +extern void cifs_small_buf_release(void *); +extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, + unsigned int /* length */); +extern unsigned int _GetXid(void); +extern void _FreeXid(unsigned int); +#define GetXid() \ +({ \ + int __xid = (int)_GetXid(); \ + cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \ + __func__, __xid, current_fsuid()); \ + __xid; \ +}) + +#define FreeXid(curr_xid) \ +do { \ + _FreeXid(curr_xid); \ + cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \ + __func__, curr_xid, (int)rc); \ +} while (0) +extern int init_cifs_idmap(void); +extern void exit_cifs_idmap(void); +extern void cifs_destroy_idmaptrees(void); +extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct smb_vol *vol, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon); +extern char *build_wildcard_path_from_dentry(struct dentry *direntry); +extern char *cifs_compose_mount_options(const char *sb_mountdata, + const char *fullpath, const struct dfs_info3_param *ref, + char **devname); +/* extern void renew_parental_timestamps(struct dentry *direntry);*/ +extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, + struct TCP_Server_Info *server); +extern void DeleteMidQEntry(struct mid_q_entry *midEntry); +extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int nvec, mid_callback_t *callback, + void *cbdata, bool ignore_pend); +extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, + struct smb_hdr * /* input */ , + struct smb_hdr * /* out */ , + int * /* bytes returned */ , const int long_op); +extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, int flags); +extern int cifs_check_receive(struct mid_q_entry *mid, + struct TCP_Server_Info *server, bool log_error); +extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, + struct kvec *, int /* nvec to send */, + int * /* type of buf returned */ , const int flags); +extern int SendReceiveBlockingLock(const unsigned int xid, + struct cifs_tcon *ptcon, + struct smb_hdr *in_buf , + struct smb_hdr *out_buf, + int *bytes_returned); +extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); +extern bool is_valid_oplock_break(struct smb_hdr *smb, + struct TCP_Server_Info *); +extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); +extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written); +extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); +extern unsigned int smbCalcSize(struct smb_hdr *ptr); +extern int decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server); +extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 
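+
+/*
+ * Illustrative caller sketch (not a declaration): how the xid bookkeeping
+ * macros above are typically paired around a path-based request, using
+ * build_path_from_dentry() and the CIFSSMBQPathInfo() prototype declared
+ * further down.  my_tcon, my_dentry, my_nls and my_remap are placeholders
+ * for state the real callers already hold.  Note that FreeXid() expects an
+ * rc variable in the calling scope for its trace message.
+ *
+ *	int rc = 0;
+ *	int xid = GetXid();
+ *	char *full_path = build_path_from_dentry(my_dentry);
+ *	FILE_ALL_INFO *info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ *
+ *	if (full_path && info)
+ *		rc = CIFSSMBQPathInfo(xid, my_tcon, full_path, info,
+ *				      0, my_nls, my_remap);
+ *	kfree(info);
+ *	kfree(full_path);
+ *	FreeXid(xid);
+ */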
+extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); +extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port); +extern int map_smb_to_linux_error(struct smb_hdr *smb, bool logErr); +extern void header_assemble(struct smb_hdr *, char /* command */ , + const struct cifs_tcon *, int /* length of + fixed section (word count) in two byte units */); +extern int small_smb_init_no_tc(const int smb_cmd, const int wct, + struct cifs_ses *ses, + void **request_buf); +extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp); +extern __u16 GetNextMid(struct TCP_Server_Info *server); +extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); +extern u64 cifs_UnixTimeToNT(struct timespec); +extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, + int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); + +extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, + struct file *file, struct tcon_link *tlink, + __u32 oplock); +extern int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, + int mode, unsigned int f_flags, + __u32 *poplock, __u16 *pnetfid, int xid); +void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); +extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, + FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb); +extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern struct inode *cifs_iget(struct super_block *sb, + struct cifs_fattr *fattr); + +extern int cifs_get_file_info(struct file *filp); +extern int cifs_get_inode_info(struct inode **pinode, + const unsigned char *search_path, + FILE_ALL_INFO *pfile_info, + struct super_block *sb, int xid, const __u16 *pfid); +extern int cifs_get_file_info_unix(struct file *filp); +extern int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *search_path, + struct super_block *sb, int xid); +extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, struct inode *inode, + const char *path, const __u16 *pfid); +extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, + const char *, u32 *); +extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, + const char *); + +extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + struct cifs_sb_info *cifs_sb); +extern int cifs_match_super(struct super_block *, void *); +extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); +extern struct smb_vol *cifs_get_volume_info(char *mount_data, + const char *devname); +extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); +extern void cifs_umount(struct cifs_sb_info *); +extern void cifs_dfs_release_automount_timer(void); +void cifs_proc_init(void); +void cifs_proc_clean(void); + +extern int cifs_negotiate_protocol(unsigned int xid, + struct cifs_ses *ses); +extern int cifs_setup_session(unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info); +extern int CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses); + +extern int CIFSTCon(unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *); + +extern int CIFSFindFirst(const int xid, struct cifs_tcon *tcon, + const char *searchName, const struct nls_table *nls_codepage, + 
__u16 *searchHandle, struct cifs_search_info *psrch_inf, + int map, const char dirsep); + +extern int CIFSFindNext(const int xid, struct cifs_tcon *tcon, + __u16 searchHandle, struct cifs_search_info *psrch_inf); + +extern int CIFSFindClose(const int, struct cifs_tcon *tcon, + const __u16 search_handle); + +extern int CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_ALL_INFO *pFindData); +extern int CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *findData, + int legacy /* whether to use old info level */, + const struct nls_table *nls_codepage, int remap); +extern int SMBQueryInformation(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *findData, + const struct nls_table *nls_codepage, int remap); + +extern int CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_UNIX_BASIC_INFO *pFindData); +extern int CIFSSMBUnixQPathInfo(const int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_UNIX_BASIC_INFO *pFindData, + const struct nls_table *nls_codepage, int remap); + +extern int CIFSGetDFSRefer(const int xid, struct cifs_ses *ses, + const unsigned char *searchName, + struct dfs_info3_param **target_nodes, + unsigned int *number_of_nodes_in_array, + const struct nls_table *nls_codepage, int remap); + +extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo, + const char *old_path, + const struct nls_table *nls_codepage, + unsigned int *pnum_referrals, + struct dfs_info3_param **preferrals, + int remap); +extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol); +extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); +extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); +extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon, + __u64 cap); + +extern int CIFSSMBQFSAttributeInfo(const int xid, + struct cifs_tcon *tcon); +extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon); +extern int CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon); +extern int CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); + +extern int CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon, + const FILE_BASIC_INFO *data, __u16 fid, + __u32 pid_of_opener); +extern int CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener); +#if 0 +extern int CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon, + char *fileName, __u16 dos_attributes, + const struct nls_table *nls_codepage); +#endif /* possibly unneeded function */ +extern int CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon, + const char *fileName, __u64 size, + bool setAllocationSizeFlag, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, + __u64 size, __u16 fileHandle, __u32 opener_pid, + bool AllocSizeFlag); + +struct cifs_unix_set_info_args { + __u64 ctime; + __u64 atime; + __u64 mtime; + __u64 mode; + __u64 uid; + __u64 gid; + dev_t device; +}; + +extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon, + const struct 
cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener); + +extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *pTcon, + char *fileName, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, + int remap_special_chars); + +extern int CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon, + const char *newName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon, + const char *name, const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon, + const char *name, __u16 type, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon, + const char *name, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBRename(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon, + int netfid, const char *target_name, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSCreateHardLink(const int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSUnixCreateHardLink(const int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSUnixCreateSymLink(const int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage); +extern int CIFSSMBUnixQuerySymLink(const int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, char **syminfo, + const struct nls_table *nls_codepage); +#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL +extern int CIFSSMBQueryReparseLinkInfo(const int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, + char *symlinkinfo, const int buflen, __u16 fid, + const struct nls_table *nls_codepage); +#endif /* temporarily unused until cifs_symlink fixed */ +extern int CIFSSMBOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int disposition, + const int access_flags, const int omode, + __u16 *netfid, int *pOplock, FILE_ALL_INFO *, + const struct nls_table *nls_codepage, int remap); +extern int SMBLegacyOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int disposition, + const int access_flags, const int omode, + __u16 *netfid, int *pOplock, FILE_ALL_INFO *, + const struct nls_table *nls_codepage, int remap); +extern int CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon, + u32 posix_flags, __u64 mode, __u16 *netfid, + FILE_UNIX_BASIC_INFO *pRetData, + __u32 *pOplock, const char *name, + const struct nls_table *nls_codepage, int remap); +extern int CIFSSMBClose(const int xid, struct cifs_tcon *tcon, + const int smb_file_id); + +extern int CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, + const int smb_file_id); + +extern int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, + int *return_buf_type); +extern int CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, const char *buf, + const char __user *ubuf, const int long_op); +extern int CIFSSMBWrite2(const int xid, struct cifs_io_parms 
*io_parms, + unsigned int *nbytes, struct kvec *iov, const int nvec, + const int long_op); +extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, __u64 *inode_number, + const struct nls_table *nls_codepage, + int remap_special_chars); + +extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, + const __u16 netfid, const __u64 len, + const __u64 offset, const __u32 numUnlock, + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level); +extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const int get_flag, + const __u64 len, struct file_lock *, + const __u16 lock_type, const bool waitFlag); +extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); +extern int CIFSSMBEcho(struct TCP_Server_Info *server); +extern int CIFSSMBLogoff(const int xid, struct cifs_ses *ses); + +extern struct cifs_ses *sesInfoAlloc(void); +extern void sesInfoFree(struct cifs_ses *); +extern struct cifs_tcon *tconInfoAlloc(void); +extern void tconInfoFree(struct cifs_tcon *); + +extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); +extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, + __u32 *); +extern int cifs_verify_signature(struct smb_hdr *, + struct TCP_Server_Info *server, + __u32 expected_sequence_number); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); +extern int setup_ntlm_response(struct cifs_ses *); +extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); +extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); +extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern int calc_seckey(struct cifs_ses *); + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +extern int calc_lanman_hash(const char *password, const char *cryptkey, + bool encrypt, char *lnm_session_key); +#endif /* CIFS_WEAK_PW_HASH */ +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +extern int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *file, int multishot, + const struct nls_table *nls_codepage); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ +extern int CIFSSMBCopy(int xid, + struct cifs_tcon *source_tcon, + const char *fromName, + const __u16 target_tid, + const char *toName, const int flags, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + const unsigned char *ea_name, char *EAData, + size_t bufsize, const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, + const char *fileName, const char *ea_name, + const void *ea_value, const __u16 ea_value_len, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, + __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); +extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, + struct cifs_ntsd *, __u32); +extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + char *acl_inf, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const 
char *local_acl, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSGetExtAttr(const int xid, struct cifs_tcon *tcon, + const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); +extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); +extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, + const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid); +extern int mdfour(unsigned char *, unsigned char *, int); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, + unsigned char *p24); + +/* asynchronous write support */ +struct cifs_writedata { + struct kref refcount; + enum writeback_sync_modes sync_mode; + struct work_struct work; + struct cifsFileInfo *cfile; + __u64 offset; + unsigned int bytes; + int result; + unsigned int nr_pages; + struct page *pages[1]; +}; + +int cifs_async_writev(struct cifs_writedata *wdata); +struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages); +void cifs_writedata_release(struct kref *refcount); + +#endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c new file mode 100644 index 00000000..07132c4e --- /dev/null +++ b/fs/cifs/cifssmb.c @@ -0,0 +1,6088 @@ +/* + * fs/cifs/cifssmb.c + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Contains the routines for constructing the SMB PDUs themselves + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ + /* These are mostly routines that operate on a pathname, or on a tree id */ + /* (mounted volume), but there are eight handle based routines which must be */ + /* treated slightly differently for reconnection purposes since we never */ + /* want to reuse a stale file handle and only the caller knows the file info */ + +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" + +#ifdef CONFIG_CIFS_POSIX +static struct { + int index; + char *name; +} protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, + {LANMAN2_PROT, "\2LANMAN2.1"}, +#endif /* weak password hashing for legacy clients */ + {CIFS_PROT, "\2NT LM 0.12"}, + {POSIX_PROT, "\2POSIX 2"}, + {BAD_PROT, "\2"} +}; +#else +static struct { + int index; + char *name; +} protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, + {LANMAN2_PROT, "\2LANMAN2.1"}, +#endif /* weak password hashing for legacy clients */ + {CIFS_PROT, "\2NT LM 0.12"}, + {BAD_PROT, "\2"} +}; +#endif + +/* define the number of elements in the cifs dialect array */ +#ifdef CONFIG_CIFS_POSIX +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 4 +#else +#define CIFS_NUM_PROT 2 +#endif /* CIFS_WEAK_PW_HASH */ +#else /* not posix */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 3 +#else +#define CIFS_NUM_PROT 1 +#endif /* CONFIG_CIFS_WEAK_PW_HASH */ +#endif /* CIFS_POSIX */ + +/* Mark as invalid, all open files on tree connections since they + were closed when session to server was lost */ +static void mark_open_files_invalid(struct cifs_tcon *pTcon) +{ + struct cifsFileInfo *open_file = NULL; + struct list_head *tmp; + struct list_head *tmp1; + +/* list all files open on tree connection and mark them invalid */ + spin_lock(&cifs_file_list_lock); + list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + open_file->invalidHandle = true; + open_file->oplock_break_cancelled = true; + } + spin_unlock(&cifs_file_list_lock); + /* BB Add call to invalidate_inodes(sb) for all superblocks mounted + to this tcon */ +} + +/* reconnect the socket, tcon, and smb session if needed */ +static int +cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) +{ + int rc = 0; + struct cifs_ses *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; + + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cFYI(1, "can not send cmd %d while umounting", + smb_command); + return -ENODEV; + } + } + + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which 
is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + + /* are we still trying to reconnect? */ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry) { + cFYI(1, "gave up waiting on reconnect in smb_init"); + return -EHOSTDOWN; + } + } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses); + if (rc == 0 && ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; + } + + mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + mutex_unlock(&ses->session_mutex); + cFYI(1, "reconnect tcon rc = %d", rc); + + if (rc) + goto out; + + /* + * FIXME: check if wsize needs updated due to negotiated smb buffer + * size shrinking + */ + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + *request_buf = cifs_small_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? 
*/ + return -ENOMEM; + } + + header_assemble((struct smb_hdr *) *request_buf, smb_command, + tcon, wct); + + if (tcon != NULL) + cifs_stats_inc(&tcon->num_smbs_sent); + + return 0; +} + +int +small_smb_init_no_tc(const int smb_command, const int wct, + struct cifs_ses *ses, void **request_buf) +{ + int rc; + struct smb_hdr *buffer; + + rc = small_smb_init(smb_command, wct, NULL, request_buf); + if (rc) + return rc; + + buffer = (struct smb_hdr *)*request_buf; + buffer->Mid = GetNextMid(ses->server); + if (ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* uid, tid can stay at zero as set in header assemble */ + + /* BB add support for turning on the signing when + this function is used after 1st of session setup requests */ + + return rc; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +__smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + *request_buf = cifs_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? */ + return -ENOMEM; + } + /* Although the original thought was we needed the response buf for */ + /* potential retries of smb operations it turns out we can determine */ + /* from the mid flags when the request buffer can be resent without */ + /* having to use a second distinct buffer for the response */ + if (response_buf) + *response_buf = *request_buf; + + header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon, + wct); + + if (tcon != NULL) + cifs_stats_inc(&tcon->num_smbs_sent); + + return 0; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int +smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + if (tcon->ses->need_reconnect || tcon->need_reconnect) + return -EHOSTDOWN; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int validate_t2(struct smb_t2_rsp *pSMB) +{ + unsigned int total_size; + + /* check for plausible wct */ + if (pSMB->hdr.WordCount < 10) + goto vt2_err; + + /* check for parm and data offset going beyond end of smb */ + if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 || + get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) + goto vt2_err; + + total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); + if (total_size >= 512) + goto vt2_err; + + /* check that bcc is at least as big as parms + data, and that it is + * less than negotiated smb buffer + */ + total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); + if (total_size > get_bcc(&pSMB->hdr) || + total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) + goto vt2_err; + + return 0; +vt2_err: + cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, + sizeof(struct smb_t2_rsp) + 16); + return -EINVAL; +} + +static inline void inc_rfc1001_len(void *pSMB, int count) +{ + struct smb_hdr *hdr = (struct smb_hdr *)pSMB; + + be32_add_cpu(&hdr->smb_buf_length, count); +} + +int +CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) +{ + NEGOTIATE_REQ *pSMB; + NEGOTIATE_RSP *pSMBr; + int rc = 0; + int 
bytes_returned; + int i; + struct TCP_Server_Info *server; + u16 count; + unsigned int secFlags; + + if (ses->server) + server = ses->server; + else { + rc = -EIO; + return rc; + } + rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ , + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + /* if any of auth flags (ie not sign or seal) are overriden use them */ + if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ + else /* if override flags set only sign/seal OR them with global auth */ + secFlags = global_secflags | ses->overrideSecFlg; + + cFYI(1, "secFlags 0x%x", secFlags); + + pSMB->hdr.Mid = GetNextMid(server); + pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); + + if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { + cFYI(1, "Kerberos only mechanism, enable extended security"); + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { + cFYI(1, "NTLMSSP only mechanism, enable extended security"); + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + } + + count = 0; + for (i = 0; i < CIFS_NUM_PROT; i++) { + strncpy(pSMB->DialectsArray+count, protocols[i].name, 16); + count += strlen(protocols[i].name) + 1; + /* null at end of source and target buffers anyway */ + } + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc != 0) + goto neg_err_exit; + + server->dialect = le16_to_cpu(pSMBr->DialectIndex); + cFYI(1, "Dialect: %d", server->dialect); + /* Check wct = 1 error case */ + if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) { + /* core returns wct = 1, but we do not ask for core - otherwise + small wct just comes when dialect index is -1 indicating we + could not negotiate a common dialect */ + rc = -EOPNOTSUPP; + goto neg_err_exit; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + } else if ((pSMBr->hdr.WordCount == 13) + && ((server->dialect == LANMAN_PROT) + || (server->dialect == LANMAN2_PROT))) { + __s16 tmp; + struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; + + if ((secFlags & CIFSSEC_MAY_LANMAN) || + (secFlags & CIFSSEC_MAY_PLNTXT)) + server->secType = LANMAN; + else { + cERROR(1, "mount failed weak security disabled" + " in /proc/fs/cifs/SecurityFlags"); + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); + server->maxReq = le16_to_cpu(rsp->MaxMpxCount); + server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), + (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); + /* even though we do not use raw we might as well set this + accurately, in case we ever find a need for it */ + if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { + server->max_rw = 0xFF00; + server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; + } else { + server->max_rw = 0;/* do not need to use raw anyway */ + server->capabilities = CAP_MPX_MODE; + } + tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); + if (tmp == -1) { + /* OS/2 often does not set timezone therefore + * we must use server time to calc time zone. + * Could deviate slightly from the right zone. 
+ * Smallest defined timezone difference is 15 minutes + * (i.e. Nepal). Rounding up/down is done to match + * this requirement. + */ + int val, seconds, remain, result; + struct timespec ts, utc; + utc = CURRENT_TIME; + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); + cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d", + (int)ts.tv_sec, (int)utc.tv_sec, + (int)(utc.tv_sec - ts.tv_sec)); + val = (int)(utc.tv_sec - ts.tv_sec); + seconds = abs(val); + result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; + remain = seconds % MIN_TZ_ADJ; + if (remain >= (MIN_TZ_ADJ / 2)) + result += MIN_TZ_ADJ; + if (val < 0) + result = -result; + server->timeAdj = result; + } else { + server->timeAdj = (int)tmp; + server->timeAdj *= 60; /* also in seconds */ + } + cFYI(1, "server->timeAdj: %d seconds", server->timeAdj); + + + /* BB get server time for time conversions and add + code to use it and timezone since this is not UTC */ + + if (rsp->EncryptionKeyLength == + cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { + memcpy(ses->server->cryptkey, rsp->EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* need cryptkey unless plain text */ + goto neg_err_exit; + } + + cFYI(1, "LANMAN negotiated"); + /* we will not end up setting signing flags - as no signing + was in LANMAN and server did not return the flags on */ + goto signing_check; +#else /* weak security disabled */ + } else if (pSMBr->hdr.WordCount == 13) { + cERROR(1, "mount failed, cifs module not built " + "with CIFS_WEAK_PW_HASH support"); + rc = -EOPNOTSUPP; +#endif /* WEAK_PW_HASH */ + goto neg_err_exit; + } else if (pSMBr->hdr.WordCount != 17) { + /* unknown wct */ + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + /* else wct == 17 NTLM */ + server->sec_mode = pSMBr->SecurityMode; + if ((server->sec_mode & SECMODE_USER) == 0) + cFYI(1, "share mode security"); + + if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0) +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) +#endif /* CIFS_WEAK_PW_HASH */ + cERROR(1, "Server requests plain text password" + " but client support disabled"); + + if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) + server->secType = NTLMv2; + else if (secFlags & CIFSSEC_MAY_NTLM) + server->secType = NTLM; + else if (secFlags & CIFSSEC_MAY_NTLMV2) + server->secType = NTLMv2; + else if (secFlags & CIFSSEC_MAY_KRB5) + server->secType = Kerberos; + else if (secFlags & CIFSSEC_MAY_NTLMSSP) + server->secType = RawNTLMSSP; + else if (secFlags & CIFSSEC_MAY_LANMAN) + server->secType = LANMAN; + else { + rc = -EOPNOTSUPP; + cERROR(1, "Invalid security type"); + goto neg_err_exit; + } + /* else ... any others ...? 
*/ + + /* one byte, so no need to convert this or EncryptionKeyLen from + little endian */ + server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); + /* probably no need to store and check maxvcs */ + server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), + (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); + cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); + server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); + server->timeAdj *= 60; + if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) && + (pSMBr->EncryptionKeyLength == 0)) { + /* decode security blob */ + count = get_bcc(&pSMBr->hdr); + if (count < 16) { + rc = -EIO; + goto neg_err_exit; + } + spin_lock(&cifs_tcp_ses_lock); + if (server->srv_count > 1) { + spin_unlock(&cifs_tcp_ses_lock); + if (memcmp(server->server_GUID, + pSMBr->u.extended_response. + GUID, 16) != 0) { + cFYI(1, "server UID changed"); + memcpy(server->server_GUID, + pSMBr->u.extended_response.GUID, + 16); + } + } else { + spin_unlock(&cifs_tcp_ses_lock); + memcpy(server->server_GUID, + pSMBr->u.extended_response.GUID, 16); + } + + if (count == 16) { + server->secType = RawNTLMSSP; + } else { + rc = decode_negTokenInit(pSMBr->u.extended_response. + SecurityBlob, count - 16, + server); + if (rc == 1) + rc = 0; + else + rc = -EINVAL; + if (server->secType == Kerberos) { + if (!server->sec_kerberos && + !server->sec_mskerberos) + rc = -EOPNOTSUPP; + } else if (server->secType == RawNTLMSSP) { + if (!server->sec_ntlmssp) + rc = -EOPNOTSUPP; + } else + rc = -EOPNOTSUPP; + } + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* no crypt key only if plain text pwd */ + goto neg_err_exit; + } else + server->capabilities &= ~CAP_EXTENDED_SECURITY; + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +signing_check: +#endif + if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { + /* MUST_SIGN already includes the MAY_SIGN FLAG + so if this is zero it means that signing is disabled */ + cFYI(1, "Signing disabled"); + if (server->sec_mode & SECMODE_SIGN_REQUIRED) { + cERROR(1, "Server requires " + "packet signing to be enabled in " + "/proc/fs/cifs/SecurityFlags."); + rc = -EOPNOTSUPP; + } + server->sec_mode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { + /* signing required */ + cFYI(1, "Must sign - secFlags 0x%x", secFlags); + if ((server->sec_mode & + (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { + cERROR(1, "signing required but server lacks support"); + rc = -EOPNOTSUPP; + } else + server->sec_mode |= SECMODE_SIGN_REQUIRED; + } else { + /* signing optional ie CIFSSEC_MAY_SIGN */ + if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0) + server->sec_mode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } + +neg_err_exit: + cifs_buf_release(pSMB); + + cFYI(1, "negprot rc %d", rc); + return rc; +} + +int +CIFSSMBTDis(const int xid, struct cifs_tcon *tcon) +{ + struct smb_hdr *smb_buffer; + int rc = 0; + + cFYI(1, "In tree disconnect"); + + /* BB: do we need to check this? These should never be NULL. */ + if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) + return -EIO; + + /* + * No need to return error on this operation if tid invalidated and + * closed on server already e.g. 
due to tcp session crashing. Also, + * the tcon is no longer on the list, so no need to take lock before + * checking this. + */ + if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) + return 0; + + rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon, + (void **)&smb_buffer); + if (rc) + return rc; + + rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); + if (rc) + cFYI(1, "Tree disconnect failed %d", rc); + + /* No need to return error on this operation if tid invalidated and + closed on server already e.g. due to tcp session crashing */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +/* + * This is a no-op for now. We're not really interested in the reply, but + * rather in the fact that the server sent one and that server->lstrp + * gets updated. + * + * FIXME: maybe we should consider checking that the reply matches request? + */ +static void +cifs_echo_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->callback_data; + + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +int +CIFSSMBEcho(struct TCP_Server_Info *server) +{ + ECHO_REQ *smb; + int rc = 0; + struct kvec iov; + + cFYI(1, "In echo request"); + + rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); + if (rc) + return rc; + + /* set up echo request */ + smb->hdr.Tid = 0xffff; + smb->hdr.WordCount = 1; + put_unaligned_le16(1, &smb->EchoCount); + put_bcc(1, &smb->hdr); + smb->Data[0] = 'a'; + inc_rfc1001_len(smb, 3); + iov.iov_base = smb; + iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + if (rc) + cFYI(1, "Echo request failed: %d", rc); + + cifs_small_buf_release(smb); + + return rc; +} + +int +CIFSSMBLogoff(const int xid, struct cifs_ses *ses) +{ + LOGOFF_ANDX_REQ *pSMB; + int rc = 0; + + cFYI(1, "In SMBLogoff for session disconnect"); + + /* + * BB: do we need to check validity of ses and server? They should + * always be valid since we have an active reference. 
If not, that + * should probably be a BUG() + */ + if (!ses || !ses->server) + return -EIO; + + mutex_lock(&ses->session_mutex); + if (ses->need_reconnect) + goto session_already_dead; /* no need to send SMBlogoff if uid + already closed due to reconnect */ + rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); + if (rc) { + mutex_unlock(&ses->session_mutex); + return rc; + } + + pSMB->hdr.Mid = GetNextMid(ses->server); + + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + pSMB->hdr.Uid = ses->Suid; + + pSMB->AndXCommand = 0xFF; + rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); +session_already_dead: + mutex_unlock(&ses->session_mutex); + + /* if session dead then we do not need to do ulogoff, + since server closed smb session, no sense reporting + error */ + if (rc == -EAGAIN) + rc = 0; + return rc; +} + +int +CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName, + __u16 type, const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + struct unlink_psx_rq *pRqD; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In POSIX delete"); +PsxDelete: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB add path length overrun check */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = 0; /* BB double check this with jra */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + /* Setup pointer to Request Data (inode type) */ + pRqD = (struct unlink_psx_rq *)(((char *)&pSMB->hdr.Protocol) + offset); + pRqD->type = cpu_to_le16(type); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + sizeof(struct unlink_psx_rq); + + pSMB->DataCount = cpu_to_le16(sizeof(struct unlink_psx_rq)); + pSMB->TotalDataCount = cpu_to_le16(sizeof(struct unlink_psx_rq)); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "Posix delete returned %d", rc); + cifs_buf_release(pSMB); + + cifs_stats_inc(&tcon->num_deletes); + + if (rc == -EAGAIN) + goto PsxDelete; + + return rc; +} + +int +CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName, + const struct nls_table *nls_codepage, int remap) +{ + DELETE_FILE_REQ *pSMB = NULL; + DELETE_FILE_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; 
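+
+	/*
+	 * Path based request: if the send returns -EAGAIN (typically after
+	 * the session was reconnected) the SMB is simply rebuilt and resent
+	 * from the DelFileRetry label below, unlike the handle based calls
+	 * where only the caller can safely retry.
+	 */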
+ +DelFileRetry: + rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_deletes); + if (rc) + cFYI(1, "Error in RMFile = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto DelFileRetry; + + return rc; +} + +int +CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon, const char *dirName, + const struct nls_table *nls_codepage, int remap) +{ + DELETE_DIRECTORY_REQ *pSMB = NULL; + DELETE_DIRECTORY_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + + cFYI(1, "In CIFSSMBRmDir"); +RmDirRetry: + rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(dirName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->DirName, dirName, name_len); + } + + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_rmdirs); + if (rc) + cFYI(1, "Error in RMDir = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto RmDirRetry; + return rc; +} + +int +CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon, + const char *name, const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + CREATE_DIRECTORY_REQ *pSMB = NULL; + CREATE_DIRECTORY_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + + cFYI(1, "In CIFSSMBMkDir"); +MkDirRetry: + rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->DirName, name, name_len); + } + + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_mkdirs); + if (rc) + cFYI(1, "Error in Mkdir = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto MkDirRetry; + return rc; +} + +int +CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon, __u32 posix_flags, + __u64 mode, __u16 *netfid, FILE_UNIX_BASIC_INFO *pRetData, + __u32 *pOplock, const char *name, + const struct 
nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count, count; + OPEN_PSX_REQ *pdata; + OPEN_PSX_RSP *psx_rsp; + + cFYI(1, "In POSIX Create"); +PsxCreat: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, name, name_len); + } + + params = 6 + name_len; + count = sizeof(OPEN_PSX_REQ); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* large enough */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset); + pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pdata->Permissions = cpu_to_le64(mode); + pdata->PosixOpenFlags = cpu_to_le32(posix_flags); + pdata->OpenFlags = cpu_to_le32(*pOplock); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Posix create returned %d", rc); + goto psx_create_err; + } + + cFYI(1, "copying inode info"); + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) { + rc = -EIO; /* bad smb */ + goto psx_create_err; + } + + /* copy return information to pRetData */ + psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset)); + + *pOplock = le16_to_cpu(psx_rsp->OplockFlags); + if (netfid) + *netfid = psx_rsp->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? 
*/ + if (cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; + /* check to make sure response data is there */ + if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { + pRetData->Type = cpu_to_le32(-1); /* unknown */ + cFYI(DBG2, "unknown type"); + } else { + if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP) + + sizeof(FILE_UNIX_BASIC_INFO)) { + cERROR(1, "Open response data too small"); + pRetData->Type = cpu_to_le32(-1); + goto psx_create_err; + } + memcpy((char *) pRetData, + (char *)psx_rsp + sizeof(OPEN_PSX_RSP), + sizeof(FILE_UNIX_BASIC_INFO)); + } + +psx_create_err: + cifs_buf_release(pSMB); + + if (posix_flags & SMB_O_DIRECTORY) + cifs_stats_inc(&tcon->num_posixmkdirs); + else + cifs_stats_inc(&tcon->num_posixopens); + + if (rc == -EAGAIN) + goto PsxCreat; + + return rc; +} + +static __u16 convert_disposition(int disposition) +{ + __u16 ofun = 0; + + switch (disposition) { + case FILE_SUPERSEDE: + ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; + break; + case FILE_OPEN: + ofun = SMBOPEN_OAPPEND; + break; + case FILE_CREATE: + ofun = SMBOPEN_OCREATE; + break; + case FILE_OPEN_IF: + ofun = SMBOPEN_OCREATE | SMBOPEN_OAPPEND; + break; + case FILE_OVERWRITE: + ofun = SMBOPEN_OTRUNC; + break; + case FILE_OVERWRITE_IF: + ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; + break; + default: + cFYI(1, "unknown disposition %d", disposition); + ofun = SMBOPEN_OAPPEND; /* regular open */ + } + return ofun; +} + +static int +access_flags_to_smbopen_mode(const int access_flags) +{ + int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); + + if (masked_flags == GENERIC_READ) + return SMBOPEN_READ; + else if (masked_flags == GENERIC_WRITE) + return SMBOPEN_WRITE; + + /* just go for read/write */ + return SMBOPEN_READWRITE; +} + +int +SMBLegacyOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int openDisposition, + const int access_flags, const int create_options, __u16 *netfid, + int *pOplock, FILE_ALL_INFO *pfile_info, + const struct nls_table *nls_codepage, int remap) +{ + int rc = -EACCES; + OPENX_REQ *pSMB = NULL; + OPENX_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + __u16 count; + +OldOpenRetry: + rc = smb_init(SMB_COM_OPEN_ANDX, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->AndXCommand = 0xFF; /* none */ + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + count = 1; /* account for one byte pad to word boundary */ + name_len = + cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + count = 0; /* no pad */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + if (*pOplock & REQ_OPLOCK) + pSMB->OpenFlags = cpu_to_le16(REQ_OPLOCK); + else if (*pOplock & REQ_BATCHOPLOCK) + pSMB->OpenFlags = cpu_to_le16(REQ_BATCHOPLOCK); + + pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO); + pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags)); + pSMB->Mode |= cpu_to_le16(0x40); /* deny none */ + /* set file as system file if special file such + as fifo and server expecting SFU style and + no Unix extensions */ + + if (create_options & CREATE_OPTION_SPECIAL) + pSMB->FileAttributes = cpu_to_le16(ATTR_SYSTEM); + else /* BB FIXME BB */ + pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/); + + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= 
cpu_to_le16(ATTR_READONLY); + + /* BB FIXME BB */ +/* pSMB->CreateOptions = cpu_to_le32(create_options & + CREATE_OPTIONS_MASK); */ + /* BB FIXME END BB */ + + pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY); + pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition)); + count += name_len; + inc_rfc1001_len(pSMB, count); + + pSMB->ByteCount = cpu_to_le16(count); + /* long_op set to 1 to allow for oplock break timeouts */ + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_opens); + if (rc) { + cFYI(1, "Error in Open = %d", rc); + } else { + /* BB verify if wct == 15 */ + +/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field*/ + + *netfid = pSMBr->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + /* BB FIXME BB */ +/* if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; */ + /* BB FIXME END */ + + if (pfile_info) { + pfile_info->CreationTime = 0; /* BB convert CreateTime*/ + pfile_info->LastAccessTime = 0; /* BB fixme */ + pfile_info->LastWriteTime = 0; /* BB fixme */ + pfile_info->ChangeTime = 0; /* BB fixme */ + pfile_info->Attributes = + cpu_to_le32(le16_to_cpu(pSMBr->FileAttributes)); + /* the file_info buf is endian converted by caller */ + pfile_info->AllocationSize = + cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile)); + pfile_info->EndOfFile = pfile_info->AllocationSize; + pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto OldOpenRetry; + return rc; +} + +int +CIFSSMBOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int openDisposition, + const int access_flags, const int create_options, __u16 *netfid, + int *pOplock, FILE_ALL_INFO *pfile_info, + const struct nls_table *nls_codepage, int remap) +{ + int rc = -EACCES; + OPEN_REQ *pSMB = NULL; + OPEN_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + __u16 count; + +openRetry: + rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->AndXCommand = 0xFF; /* none */ + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + count = 1; /* account for one byte pad to word boundary */ + name_len = + cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->NameLength = cpu_to_le16(name_len); + } else { /* BB improve check for buffer overruns BB */ + count = 0; /* no pad */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + pSMB->NameLength = cpu_to_le16(name_len); + strncpy(pSMB->fileName, fileName, name_len); + } + if (*pOplock & REQ_OPLOCK) + pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK); + else if (*pOplock & REQ_BATCHOPLOCK) + pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); + pSMB->DesiredAccess = cpu_to_le32(access_flags); + pSMB->AllocationSize = 0; + /* set file as system file if special file such + as fifo and server expecting SFU style and + no Unix extensions */ + if (create_options & CREATE_OPTION_SPECIAL) + pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM); + else + pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL); + + /* XP does not handle ATTR_POSIX_SEMANTICS */ + /* but it helps speed up case sensitive checks for other + servers such as Samba */ + if (tcon->ses->capabilities & 
CAP_UNIX) + pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); + + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY); + + pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); + pSMB->CreateDisposition = cpu_to_le32(openDisposition); + pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); + /* BB Expirement with various impersonation levels and verify */ + pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); + pSMB->SecurityFlags = + SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY; + + count += name_len; + inc_rfc1001_len(pSMB, count); + + pSMB->ByteCount = cpu_to_le16(count); + /* long_op set to 1 to allow for oplock break timeouts */ + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_opens); + if (rc) { + cFYI(1, "Error in Open = %d", rc); + } else { + *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ + *netfid = pSMBr->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; + if (pfile_info) { + memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime, + 36 /* CreationTime to Attributes */); + /* the file_info buf is endian converted by caller */ + pfile_info->AllocationSize = pSMBr->AllocationSize; + pfile_info->EndOfFile = pSMBr->EndOfFile; + pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto openRetry; + return rc; +} + +int +CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, + char **buf, int *pbuf_type) +{ + int rc = -EACCES; + READ_REQ *pSMB = NULL; + READ_RSP *pSMBr = NULL; + char *pReadData = NULL; + int wct; + int resp_buf_type = 0; + struct kvec iov[1]; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + cFYI(1, "Reading %d bytes on fid %d", count, netfid); + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + *nbytes = 0; + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 12) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + + pSMB->Remaining = 0; + pSMB->MaxCount = cpu_to_le16(count & 0xFFFF); + pSMB->MaxCountHigh = cpu_to_le32(count >> 16); + if (wct == 12) + pSMB->ByteCount = 0; /* no need to do le conversion since 0 */ + else { + /* old style read */ + struct smb_com_readx_req *pSMBW = + (struct smb_com_readx_req *)pSMB; + pSMBW->ByteCount = 0; + } + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, + &resp_buf_type, CIFS_LOG_ERROR); + cifs_stats_inc(&tcon->num_reads); + pSMBr = (READ_RSP *)iov[0].iov_base; + if (rc) { + 
cERROR(1, "Send error in read = %d", rc); + } else { + int data_length = le16_to_cpu(pSMBr->DataLengthHigh); + data_length = data_length << 16; + data_length += le16_to_cpu(pSMBr->DataLength); + *nbytes = data_length; + + /*check that DataLength would not go beyond end of SMB */ + if ((data_length > CIFSMaxBufSize) + || (data_length > count)) { + cFYI(1, "bad length %d for count %d", + data_length, count); + rc = -EIO; + *nbytes = 0; + } else { + pReadData = (char *) (&pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->DataOffset); +/* if (rc = copy_to_user(buf, pReadData, data_length)) { + cERROR(1, "Faulting on read rc = %d",rc); + rc = -EFAULT; + }*/ /* can not use copy_to_user when using page cache*/ + if (*buf) + memcpy(*buf, pReadData, data_length); + } + } + +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + if (*buf) { + if (resp_buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + } else if (resp_buf_type != CIFS_NO_BUFFER) { + /* return buffer to caller to free */ + *buf = iov[0].iov_base; + if (resp_buf_type == CIFS_SMALL_BUFFER) + *pbuf_type = CIFS_SMALL_BUFFER; + else if (resp_buf_type == CIFS_LARGE_BUFFER) + *pbuf_type = CIFS_LARGE_BUFFER; + } /* else no valid buffer on return - leave as null */ + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; +} + + +int +CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, const char *buf, + const char __user *ubuf, const int long_op) +{ + int rc = -EACCES; + WRITE_REQ *pSMB = NULL; + WRITE_RSP *pSMBr = NULL; + int bytes_returned, wct; + __u32 bytes_sent; + __u16 byte_count; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + *nbytes = 0; + + /* cFYI(1, "write at %lld %d bytes", offset, count);*/ + if (tcon->ses == NULL) + return -ECONNABORTED; + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 14; + else { + wct = 12; + if ((offset >> 32) > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + + rc = smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 14) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + + pSMB->Reserved = 0xFFFFFFFF; + pSMB->WriteMode = 0; + pSMB->Remaining = 0; + + /* Can increase buffer size if buffer is big enough in some cases ie we + can send more if LARGE_WRITE_X capability returned by the server and if + our buffer is big enough or if we convert to iovecs on socket writes + and eliminate the copy to the CIFS buffer */ + if (tcon->ses->capabilities & CAP_LARGE_WRITE_X) { + bytes_sent = min_t(const unsigned int, CIFSMaxBufSize, count); + } else { + bytes_sent = (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) + & ~0xFF; + } + + if (bytes_sent > count) + bytes_sent = count; + pSMB->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + if (buf) + memcpy(pSMB->Data, buf, bytes_sent); + else if (ubuf) { + if 
(copy_from_user(pSMB->Data, ubuf, bytes_sent)) { + cifs_buf_release(pSMB); + return -EFAULT; + } + } else if (count != 0) { + /* No buffer */ + cifs_buf_release(pSMB); + return -EINVAL; + } /* else setting file size with write of zero bytes */ + if (wct == 14) + byte_count = bytes_sent + 1; /* pad */ + else /* wct == 12 */ + byte_count = bytes_sent + 5; /* bigger pad, smaller smb hdr */ + + pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF); + pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16); + inc_rfc1001_len(pSMB, byte_count); + + if (wct == 14) + pSMB->ByteCount = cpu_to_le16(byte_count); + else { /* old style write has byte count 4 bytes earlier + so 4 bytes pad */ + struct smb_com_writex_req *pSMBW = + (struct smb_com_writex_req *)pSMB; + pSMBW->ByteCount = cpu_to_le16(byte_count); + } + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, long_op); + cifs_stats_inc(&tcon->num_writes); + if (rc) { + cFYI(1, "Send error in write = %d", rc); + } else { + *nbytes = le16_to_cpu(pSMBr->CountHigh); + *nbytes = (*nbytes) << 16; + *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. Some + * OS/2 servers are known to set incorrect CountHigh values. + */ + if (*nbytes > count) + *nbytes &= 0xFFFF; + } + + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +void +cifs_writedata_release(struct kref *refcount) +{ + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + if (wdata->cfile) + cifsFileInfo_put(wdata->cfile); + + kfree(wdata); +} + +/* + * Write failed with a retryable error. Resend the write request. It's also + * possible that the page was redirtied so re-clean the page. 
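+ * Pages are locked and cleaned for writeback before the request is rebuilt,
+ * and are unlocked again (with SetPageError on failure) once it has been
+ * resubmitted.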
+ */ +static void +cifs_writev_requeue(struct cifs_writedata *wdata) +{ + int i, rc; + struct inode *inode = wdata->cfile->dentry->d_inode; + + for (i = 0; i < wdata->nr_pages; i++) { + lock_page(wdata->pages[i]); + clear_page_dirty_for_io(wdata->pages[i]); + } + + do { + rc = cifs_async_writev(wdata); + } while (rc == -EAGAIN); + + for (i = 0; i < wdata->nr_pages; i++) { + if (rc != 0) + SetPageError(wdata->pages[i]); + unlock_page(wdata->pages[i]); + } + + mapping_set_error(inode->i_mapping, rc); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +static void +cifs_writev_complete(struct work_struct *work) +{ + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = wdata->cfile->dentry->d_inode; + int i = 0; + + if (wdata->result == 0) { + cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes); + cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink), + wdata->bytes); + } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) + return cifs_writev_requeue(wdata); + + for (i = 0; i < wdata->nr_pages; i++) { + struct page *page = wdata->pages[i]; + if (wdata->result == -EAGAIN) + __set_page_dirty_nobuffers(page); + else if (wdata->result < 0) + SetPageError(page); + end_page_writeback(page); + page_cache_release(page); + } + if (wdata->result != -EAGAIN) + mapping_set_error(inode->i_mapping, wdata->result); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +struct cifs_writedata * +cifs_writedata_alloc(unsigned int nr_pages) +{ + struct cifs_writedata *wdata; + + /* this would overflow */ + if (nr_pages == 0) { + cERROR(1, "%s: called with nr_pages == 0!", __func__); + return NULL; + } + + /* writedata + number of page pointers */ + wdata = kzalloc(sizeof(*wdata) + + sizeof(struct page *) * (nr_pages - 1), GFP_NOFS); + if (wdata != NULL) { + INIT_WORK(&wdata->work, cifs_writev_complete); + kref_init(&wdata->refcount); + } + return wdata; +} + +/* + * Check the midState and signature on received buffer (if any), and queue the + * workqueue completion task. + */ +static void +cifs_writev_callback(struct mid_q_entry *mid) +{ + struct cifs_writedata *wdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + unsigned int written; + WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + wdata->result = cifs_check_receive(mid, tcon->ses->server, 0); + if (wdata->result != 0) + break; + + written = le16_to_cpu(smb->CountHigh); + written <<= 16; + written += le16_to_cpu(smb->Count); + /* + * Mask off high 16 bits when bytes written as returned + * by the server is greater than bytes requested by the + * client. OS/2 servers are known to set incorrect + * CountHigh values. 
+ */ + if (written > wdata->bytes) + written &= 0xFFFF; + + if (written < wdata->bytes) + wdata->result = -ENOSPC; + else + wdata->bytes = written; + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + wdata->result = -EAGAIN; + break; + default: + wdata->result = -EIO; + break; + } + + queue_work(system_nrt_wq, &wdata->work); + DeleteMidQEntry(mid); + atomic_dec(&tcon->ses->server->inFlight); + wake_up(&tcon->ses->server->request_q); +} + +/* cifs_async_writev - send an async write, and set up mid to handle result */ +int +cifs_async_writev(struct cifs_writedata *wdata) +{ + int i, rc = -EACCES; + WRITE_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct inode *inode = wdata->cfile->dentry->d_inode; + struct kvec *iov = NULL; + + if (tcon->ses->capabilities & CAP_LARGE_FILES) { + wct = 14; + } else { + wct = 12; + if (wdata->offset >> 32 > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb); + if (rc) + goto async_writev_out; + + /* 1 iov per page + 1 for header */ + iov = kzalloc((wdata->nr_pages + 1) * sizeof(*iov), GFP_NOFS); + if (iov == NULL) { + rc = -ENOMEM; + goto async_writev_out; + } + + smb->hdr.Pid = cpu_to_le16((__u16)wdata->cfile->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->cfile->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = wdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF); + if (wct == 14) + smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32); + smb->Reserved = 0xFFFFFFFF; + smb->WriteMode = 0; + smb->Remaining = 0; + + smb->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + + /* 4 for RFC1001 length + 1 for BCC */ + iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; + iov[0].iov_base = smb; + + /* marshal up the pages into iov array */ + wdata->bytes = 0; + for (i = 0; i < wdata->nr_pages; i++) { + iov[i + 1].iov_len = min(inode->i_size - + page_offset(wdata->pages[i]), + (loff_t)PAGE_CACHE_SIZE); + iov[i + 1].iov_base = kmap(wdata->pages[i]); + wdata->bytes += iov[i + 1].iov_len; + } + + cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + + smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF); + smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16); + + if (wct == 14) { + inc_rfc1001_len(&smb->hdr, wdata->bytes + 1); + put_bcc(wdata->bytes + 1, &smb->hdr); + } else { + /* wct == 12 */ + struct smb_com_writex_req *smbw = + (struct smb_com_writex_req *)smb; + inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5); + put_bcc(wdata->bytes + 5, &smbw->hdr); + iov[0].iov_len += 4; /* pad bigger by four bytes */ + } + + kref_get(&wdata->refcount); + rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, + cifs_writev_callback, wdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_writes); + else + kref_put(&wdata->refcount, cifs_writedata_release); + + /* send is done, unmap pages */ + for (i = 0; i < wdata->nr_pages; i++) + kunmap(wdata->pages[i]); + +async_writev_out: + cifs_small_buf_release(smb); + kfree(iov); + return rc; +} + +int +CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec, + const int long_op) +{ + int rc = -EACCES; + WRITE_REQ *pSMB = NULL; + int wct; + int smb_hdr_len; + int resp_buf_type = 0; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = 
io_parms->tcon; + unsigned int count = io_parms->length; + + *nbytes = 0; + + cFYI(1, "write2 at %lld %d bytes", (long long)offset, count); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) { + wct = 14; + } else { + wct = 12; + if ((offset >> 32) > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 14) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + pSMB->Reserved = 0xFFFFFFFF; + pSMB->WriteMode = 0; + pSMB->Remaining = 0; + + pSMB->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + + pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF); + pSMB->DataLengthHigh = cpu_to_le16(count >> 16); + /* header + 1 byte pad */ + smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1; + if (wct == 14) + inc_rfc1001_len(pSMB, count + 1); + else /* wct == 12 */ + inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */ + if (wct == 14) + pSMB->ByteCount = cpu_to_le16(count + 1); + else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ { + struct smb_com_writex_req *pSMBW = + (struct smb_com_writex_req *)pSMB; + pSMBW->ByteCount = cpu_to_le16(count + 5); + } + iov[0].iov_base = pSMB; + if (wct == 14) + iov[0].iov_len = smb_hdr_len + 4; + else /* wct == 12 pad bigger by four bytes */ + iov[0].iov_len = smb_hdr_len + 8; + + + rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, + long_op); + cifs_stats_inc(&tcon->num_writes); + if (rc) { + cFYI(1, "Send error Write2 = %d", rc); + } else if (resp_buf_type == 0) { + /* presumably this can not happen, but best to be safe */ + rc = -EIO; + } else { + WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; + *nbytes = le16_to_cpu(pSMBr->CountHigh); + *nbytes = (*nbytes) << 16; + *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. OS/2 + * servers are known to set incorrect CountHigh values. 
+ */ + if (*nbytes > count) + *nbytes &= 0xFFFF; + } + +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + if (resp_buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + + +int +CIFSSMBLock(const int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const __u64 len, + const __u64 offset, const __u32 numUnlock, + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; +/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */ + int bytes_returned; + int timeout = 0; + __u16 count; + + cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock); + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + + if (rc) + return rc; + + if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { + timeout = CIFS_ASYNC_OP; /* no response expected */ + pSMB->Timeout = 0; + } else if (waitFlag) { + timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ + pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */ + } else { + pSMB->Timeout = 0; + } + + pSMB->NumberOfLocks = cpu_to_le16(numLock); + pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); + pSMB->LockType = lockType; + pSMB->OplockLevel = oplock_level; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = smb_file_id; /* netfid stays le */ + + if ((numLock != 0) || (numUnlock != 0)) { + pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + /* BB where to store pid high? */ + pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); + pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); + pSMB->Locks[0].OffsetLow = cpu_to_le32((u32)offset); + pSMB->Locks[0].OffsetHigh = cpu_to_le32((u32)(offset>>32)); + count = sizeof(LOCKING_ANDX_RANGE); + } else { + /* oplock break */ + count = 0; + } + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMB, &bytes_returned); + cifs_small_buf_release(pSMB); + } else { + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, + timeout); + /* SMB buffer freed by function above */ + } + cifs_stats_inc(&tcon->num_locks); + if (rc) + cFYI(1, "Send error in Lock = %d", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; +} + +int +CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const int get_flag, const __u64 len, + struct file_lock *pLockData, const __u16 lock_type, + const bool waitFlag) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct cifs_posix_lock *parm_data; + int rc = 0; + int timeout = 0; + int bytes_returned = 0; + int resp_buf_type = 0; + __u16 params, param_offset, offset, byte_count, count; + struct kvec iov[1]; + + cFYI(1, "Posix Lock"); + + if (pLockData == NULL) + return -EINVAL; + + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct 
smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + count = sizeof(struct cifs_posix_lock); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + if (get_flag) + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + else + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = (struct cifs_posix_lock *) + (((char *) &pSMB->hdr.Protocol) + offset); + + parm_data->lock_type = cpu_to_le16(lock_type); + if (waitFlag) { + timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ + parm_data->lock_flags = cpu_to_le16(1); + pSMB->Timeout = cpu_to_le32(-1); + } else + pSMB->Timeout = 0; + + parm_data->pid = cpu_to_le32(current->tgid); + parm_data->start = cpu_to_le64(pLockData->fl_start); + parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ + + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = smb_file_id; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned); + } else { + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, + &resp_buf_type, timeout); + pSMB = NULL; /* request buf already freed by SendReceive2. 
Do + not try to free it twice below on exit */ + pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base; + } + + if (rc) { + cFYI(1, "Send error in Posix Lock = %d", rc); + } else if (get_flag) { + /* lock structure can be returned on get */ + __u16 data_offset; + __u16 data_count; + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) { + rc = -EIO; /* bad smb */ + goto plk_err_exit; + } + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + data_count = le16_to_cpu(pSMBr->t2.DataCount); + if (data_count < sizeof(struct cifs_posix_lock)) { + rc = -EIO; + goto plk_err_exit; + } + parm_data = (struct cifs_posix_lock *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + pLockData->fl_type = F_UNLCK; + else { + if (parm_data->lock_type == + __constant_cpu_to_le16(CIFS_RDLCK)) + pLockData->fl_type = F_RDLCK; + else if (parm_data->lock_type == + __constant_cpu_to_le16(CIFS_WRLCK)) + pLockData->fl_type = F_WRLCK; + + pLockData->fl_start = le64_to_cpu(parm_data->start); + pLockData->fl_end = pLockData->fl_start + + le64_to_cpu(parm_data->length) - 1; + pLockData->fl_pid = le32_to_cpu(parm_data->pid); + } + } + +plk_err_exit: + if (pSMB) + cifs_small_buf_release(pSMB); + + if (resp_buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + + +int +CIFSSMBClose(const int xid, struct cifs_tcon *tcon, int smb_file_id) +{ + int rc = 0; + CLOSE_REQ *pSMB = NULL; + cFYI(1, "In CIFSSMBClose"); + +/* do not retry on dead session on close */ + rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); + if (rc == -EAGAIN) + return 0; + if (rc) + return rc; + + pSMB->FileID = (__u16) smb_file_id; + pSMB->LastWriteTime = 0xFFFFFFFF; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + cifs_stats_inc(&tcon->num_closes); + if (rc) { + if (rc != -EINTR) { + /* EINTR is expected when user ctl-c to kill app */ + cERROR(1, "Send error in Close = %d", rc); + } + } + + /* Since session is dead, file will be closed on server already */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +int +CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, int smb_file_id) +{ + int rc = 0; + FLUSH_REQ *pSMB = NULL; + cFYI(1, "In CIFSSMBFlush"); + + rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->FileID = (__u16) smb_file_id; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + cifs_stats_inc(&tcon->num_flushes); + if (rc) + cERROR(1, "Send error in Flush = %d", rc); + + return rc; +} + +int +CIFSSMBRename(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + RENAME_REQ *pSMB = NULL; + RENAME_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cFYI(1, "In CIFSSMBRename"); +renameRetry: + rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->BufferFormat = 0x04; + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) 
pSMB->OldFileName, fromName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->OldFileName[name_len] = 0x04; /* pad */ + /* protocol requires ASCII signature byte on Unicode string */ + pSMB->OldFileName[name_len + 1] = 0x00; + name_len2 = + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* 1st signature byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_renames); + if (rc) + cFYI(1, "Send error in rename = %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto renameRetry; + + return rc; +} + +int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon, + int netfid, const char *target_name, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct set_file_rename *rename_info; + char *data_offset; + char dummy_string[30]; + int rc = 0; + int bytes_returned = 0; + int len_of_str; + __u16 params, param_offset, offset, count, byte_count; + + cFYI(1, "Rename to File by handle"); + rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + rename_info = (struct set_file_rename *) data_offset; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + /* construct random name ".cifs_tmp" */ + rename_info->overwrite = cpu_to_le32(1); + rename_info->root_fid = 0; + /* unicode only call */ + if (target_name == NULL) { + sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid); + len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, + dummy_string, 24, nls_codepage, remap); + } else { + len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, + target_name, PATH_MAX, nls_codepage, + remap); + } + rename_info->target_name_len = cpu_to_le32(2 * len_of_str); + count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + byte_count += count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->Fid = netfid; + pSMB->InformationLevel 
= + cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&pTcon->num_t2renames); + if (rc) + cFYI(1, "Send error in Rename (by file handle) = %d", rc); + + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBCopy(const int xid, struct cifs_tcon *tcon, const char *fromName, + const __u16 target_tid, const char *toName, const int flags, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + COPY_REQ *pSMB = NULL; + COPY_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cFYI(1, "In CIFSSMBCopy"); +copyRetry: + rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->BufferFormat = 0x04; + pSMB->Tid2 = target_tid; + + pSMB->Flags = cpu_to_le16(flags & COPY_TREE); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName, + fromName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->OldFileName[name_len] = 0x04; /* pad */ + /* protocol requires ASCII signature byte on Unicode string */ + pSMB->OldFileName[name_len + 1] = 0x00; + name_len2 = + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* 1st signature byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in copy = %d with %d files copied", + rc, le16_to_cpu(pSMBr->CopyCount)); + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto copyRetry; + + return rc; +} + +int +CIFSUnixCreateSymLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + char *data_offset; + int name_len; + int name_len_target; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In Symlink Unix style"); +createSymLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX + /* find define for this maxpathcomponent */ + , nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + 
strncpy(pSMB->FileName, fromName, name_len); + } + params = 6 + name_len; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len_target = + cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX + /* find define for this maxpathcomponent */ + , nls_codepage); + name_len_target++; /* trailing null */ + name_len_target *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len_target = strnlen(toName, PATH_MAX); + name_len_target++; /* trailing null */ + strncpy(data_offset, toName, name_len_target); + } + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max on data count below from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + name_len_target; + pSMB->DataCount = cpu_to_le16(name_len_target); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_symlinks); + if (rc) + cFYI(1, "Send error in SetPathInfo create symlink = %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto createSymLinkRetry; + + return rc; +} + +int +CIFSUnixCreateHardLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + char *data_offset; + int name_len; + int name_len_target; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In Create Hard link Unix style"); +createHardLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(toName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, toName, name_len); + } + params = 6 + name_len; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len_target = + cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX, + nls_codepage, remap); + name_len_target++; /* trailing null */ + name_len_target *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len_target = strnlen(fromName, PATH_MAX); + name_len_target++; /* trailing null */ + 
strncpy(data_offset, fromName, name_len_target); + } + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max on data count below from sess*/ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + name_len_target; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->DataCount = cpu_to_le16(name_len_target); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_hardlinks); + if (rc) + cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto createHardLinkRetry; + + return rc; +} + +int +CIFSCreateHardLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + NT_RENAME_REQ *pSMB = NULL; + RENAME_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cFYI(1, "In CIFSCreateHardLink"); +winCreateHardLinkRetry: + + rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + pSMB->Flags = cpu_to_le16(CREATE_HARD_LINK); + pSMB->ClusterCount = 0; + + pSMB->BufferFormat = 0x04; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + + /* protocol specifies ASCII buffer format (0x04) for unicode */ + pSMB->OldFileName[name_len] = 0x04; + pSMB->OldFileName[name_len + 1] = 0x00; /* pad */ + name_len2 = + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* string type byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_hardlinks); + if (rc) + cFYI(1, "Send error in hard link (NT rename) = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto winCreateHardLinkRetry; + + return rc; +} + +int +CIFSSMBUnixQuerySymLink(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ +/* SMB_QUERY_FILE_UNIX_LINK */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + 
TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + char *data_start; + + cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName); + +querySymLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QuerySymLinkInfo = %d", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; + else { + bool is_unicode; + u16 count = le16_to_cpu(pSMBr->t2.DataCount); + + data_start = ((char *) &pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->t2.DataOffset); + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + /* BB FIXME investigate remapping reserved chars here */ + *symlinkinfo = cifs_strndup_from_ucs(data_start, count, + is_unicode, nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto querySymLinkRetry; + return rc; +} + +#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL +/* + * Recent Windows versions now create symlinks more frequently + * and they use the "reparse point" mechanism below. We can of course + * do symlinks nicely to Samba and other servers which support the + * CIFS Unix Extensions and we can also do SFU symlinks and "client only" + * "MF" symlinks optionally, but for recent Windows we really need to + * reenable the code below and fix the cifs_symlink callers to handle this. + * In the interim this code has been moved to its own config option so + * it is not compiled in by default until callers fixed up and more tested. 
+ */ +int +CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + char *symlinkinfo, const int buflen, __u16 fid, + const struct nls_table *nls_codepage) +{ + int rc = 0; + int bytes_returned; + struct smb_com_transaction_ioctl_req *pSMB; + struct smb_com_transaction_ioctl_rsp *pSMBr; + + cFYI(1, "In Windows reparse style QueryLink for path %s", searchName); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); + pSMB->IsFsctl = 1; /* FSCTL */ + pSMB->IsRootFlag = 0; + pSMB->Fid = fid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc); + } else { /* decode response */ + __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); + __u32 data_count = le32_to_cpu(pSMBr->DataCount); + if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { + /* BB also check enough total bytes returned */ + rc = -EIO; /* bad smb */ + goto qreparse_out; + } + if (data_count && (data_count < 2048)) { + char *end_of_smb = 2 /* sizeof byte count */ + + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; + + struct reparse_data *reparse_buf = + (struct reparse_data *) + ((char *)&pSMBr->hdr.Protocol + + data_offset); + if ((char *)reparse_buf >= end_of_smb) { + rc = -EIO; + goto qreparse_out; + } + if ((reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset + + reparse_buf->TargetNameLen) > end_of_smb) { + cFYI(1, "reparse buf beyond SMB"); + rc = -EIO; + goto qreparse_out; + } + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { + cifs_from_ucs2(symlinkinfo, (__le16 *) + (reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset), + buflen, + reparse_buf->TargetNameLen, + nls_codepage, 0); + } else { /* ASCII names */ + strncpy(symlinkinfo, + reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset, + min_t(const int, buflen, + reparse_buf->TargetNameLen)); + } + } else { + rc = -EIO; + cFYI(1, "Invalid return data count on " + "get reparse info ioctl"); + } + symlinkinfo[buflen] = 0; /* just in case so the caller + does not go off the end of the buffer */ + cFYI(1, "readlink result - %s", symlinkinfo); + } + +qreparse_out: + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} +#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */ + +#ifdef CONFIG_CIFS_POSIX + +/*Convert an Access Control Entry from wire format to local POSIX xattr format*/ +static void cifs_convert_ace(posix_acl_xattr_entry *ace, + struct cifs_posix_ace *cifs_ace) +{ + /* u8 cifs fields do not need le conversion */ + ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); + ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); + ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); + /* 
cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
+
+	return;
+}
+
+/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */
+static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
+		const int acl_type, const int size_of_data_area)
+{
+	int size = 0;
+	int i;
+	__u16 count;
+	struct cifs_posix_ace *pACE;
+	struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
+	posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt;
+
+	if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
+		return -EOPNOTSUPP;
+
+	if (acl_type & ACL_TYPE_ACCESS) {
+		count = le16_to_cpu(cifs_acl->access_entry_count);
+		pACE = &cifs_acl->ace_array[0];
+		size = sizeof(struct cifs_posix_acl);
+		size += sizeof(struct cifs_posix_ace) * count;
+		/* check if we would go beyond end of SMB */
+		if (size_of_data_area < size) {
+			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
+				size_of_data_area, size);
+			return -EINVAL;
+		}
+	} else if (acl_type & ACL_TYPE_DEFAULT) {
+		count = le16_to_cpu(cifs_acl->access_entry_count);
+		size = sizeof(struct cifs_posix_acl);
+		size += sizeof(struct cifs_posix_ace) * count;
+/* skip past access ACEs to get to default ACEs */
+		pACE = &cifs_acl->ace_array[count];
+		count = le16_to_cpu(cifs_acl->default_entry_count);
+		size += sizeof(struct cifs_posix_ace) * count;
+		/* check if we would go beyond end of SMB */
+		if (size_of_data_area < size)
+			return -EINVAL;
+	} else {
+		/* illegal type */
+		return -EINVAL;
+	}
+
+	size = posix_acl_xattr_size(count);
+	if ((buflen == 0) || (local_acl == NULL)) {
+		/* used to query ACL EA size */
+	} else if (size > buflen) {
+		return -ERANGE;
+	} else /* buffer big enough */ {
+		local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+		for (i = 0; i < count ; i++) {
+			cifs_convert_ace(&local_acl->a_entries[i], pACE);
+			pACE++;
+		}
+	}
+	return size;
+}
+
+static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
+			const posix_acl_xattr_entry *local_ace)
+{
+	__u16 rc = 0; /* 0 = ACL converted ok */
+
+	cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
+	cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
+	/* BB is there a better way to handle the large uid? */
+	if (local_ace->e_id == cpu_to_le32(-1)) {
+	/* Probably no need to le convert -1 on any arch but can not hurt */
+		cifs_ace->cifs_uid = cpu_to_le64(-1);
+	} else
+		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
+	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
+	return rc;
+}
+
+/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
+static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
+			const int buflen, const int acl_type)
+{
+	__u16 rc = 0;
+	struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
+	posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL;
+	int count;
+	int i;
+
+	if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL))
+		return 0;
+
+	count = posix_acl_xattr_count((size_t)buflen);
+	cFYI(1, "setting acl with %d entries from buf of length %d and "
+		"version of %d",
+		count, buflen, le32_to_cpu(local_acl->a_version));
+	if (le32_to_cpu(local_acl->a_version) != 2) {
+		cFYI(1, "unknown POSIX ACL version %d",
+			le32_to_cpu(local_acl->a_version));
+		return 0;
+	}
+	cifs_acl->version = cpu_to_le16(1);
+	if (acl_type == ACL_TYPE_ACCESS)
+		cifs_acl->access_entry_count = cpu_to_le16(count);
+	else if (acl_type == ACL_TYPE_DEFAULT)
+		cifs_acl->default_entry_count = cpu_to_le16(count);
+	else {
+		cFYI(1, "unknown ACL type %d", acl_type);
+		return 0;
+	}
+	for (i = 0; i < count; i++) {
+		rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i],
+					&local_acl->a_entries[i]);
+		if (rc != 0) {
+			/* ACE not converted */
+			break;
+		}
+	}
+	if (rc == 0) {
+		rc = (__u16)(count * sizeof(struct cifs_posix_ace));
+		rc += sizeof(struct cifs_posix_acl);
+		/* BB add check to make sure ACL does not overflow SMB */
+	}
+	return rc;
+}
+
+int
+CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
+		const unsigned char *searchName,
+		char *acl_inf, const int buflen, const int acl_type,
+		const struct nls_table *nls_codepage, int remap)
+{
+/* SMB_QUERY_POSIX_ACL */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	__u16 params, byte_count;
+
+	cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
+
+queryAclRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
+					PATH_MAX, nls_codepage, remap);
+		name_len++; /* trailing null */
+		name_len *= 2;
+		pSMB->FileName[name_len] = 0;
+		pSMB->FileName[name_len+1] = 0;
+	} else { /* BB improve the check for buffer overruns BB */
+		name_len = strnlen(searchName, PATH_MAX);
+		name_len++; /* trailing null */
+		strncpy(pSMB->FileName, searchName, name_len);
+	}
+
+	params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(
+		offsetof(struct smb_com_transaction2_qpi_req,
+			InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount =
pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_acl_get); + if (rc) { + cFYI(1, "Send error in Query POSIX ACL = %d", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + rc = cifs_copy_posix_acl(acl_inf, + (char *)&pSMBr->hdr.Protocol+data_offset, + buflen, acl_type, count); + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto queryAclRetry; + return rc; +} + +int +CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const char *local_acl, const int buflen, + const int acl_type, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + char *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cFYI(1, "In SetPosixACL (Unix) for path %s", fileName); +setAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB size from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + + /* convert to on the wire format for POSIX ACL */ + data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); + + if (data_count == 0) { + rc = -EOPNOTSUPP; + goto setACLerrorExit; + } + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "Set POSIX ACL returned %d", rc); + +setACLerrorExit: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setAclRetry; + return rc; +} + +/* BB fix tabs in this function FIXME BB */ +int +CIFSGetExtAttr(const int xid, struct 
cifs_tcon *tcon, + const int netfid, __u64 *pExtAttrBits, __u64 *pMask) +{ + int rc = 0; + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int bytes_returned; + __u16 params, byte_count; + + cFYI(1, "In GetExtAttr"); + if (tcon == NULL) + return -ENODEV; + +GetExtAttrRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(4000); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "error %d in GetExtAttr", rc); + } else { + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + /* If rc should we check for EOPNOSUPP and + disable the srvino flag? or in caller? */ + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + struct file_chattr_info *pfinfo; + /* BB Do we need a cast or hash here ? */ + if (count != 16) { + cFYI(1, "Illegal size ret in GetExtAttr"); + rc = -EIO; + goto GetExtAttrOut; + } + pfinfo = (struct file_chattr_info *) + (data_offset + (char *) &pSMBr->hdr.Protocol); + *pExtAttrBits = le64_to_cpu(pfinfo->mode); + *pMask = le64_to_cpu(pfinfo->mask); + } + } +GetExtAttrOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto GetExtAttrRetry; + return rc; +} + +#endif /* CONFIG_POSIX */ + +#ifdef CONFIG_CIFS_ACL +/* + * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that + * all NT TRANSACTS that we init here have total parm and data under about 400 + * bytes (to fit in small cifs buffer size), which is the case so far, it + * easily fits. 
NB: Setup words themselves and ByteCount MaxSetupCount (size of + * returned setup area) and MaxParameterCount (returned parms size) must be set + * by caller + */ +static int +smb_init_nttransact(const __u16 sub_command, const int setup_count, + const int parm_len, struct cifs_tcon *tcon, + void **ret_buf) +{ + int rc; + __u32 temp_offset; + struct smb_com_ntransact_req *pSMB; + + rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, + (void **)&pSMB); + if (rc) + return rc; + *ret_buf = (void *)pSMB; + pSMB->Reserved = 0; + pSMB->TotalParameterCount = cpu_to_le32(parm_len); + pSMB->TotalDataCount = 0; + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->DataCount = pSMB->TotalDataCount; + temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + + (setup_count * 2) - 4 /* for rfc1001 length itself */; + pSMB->ParameterOffset = cpu_to_le32(temp_offset); + pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); + pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ + pSMB->SubCommand = cpu_to_le16(sub_command); + return 0; +} + +static int +validate_ntransact(char *buf, char **ppparm, char **ppdata, + __u32 *pparmlen, __u32 *pdatalen) +{ + char *end_of_smb; + __u32 data_count, data_offset, parm_count, parm_offset; + struct smb_com_ntransact_rsp *pSMBr; + u16 bcc; + + *pdatalen = 0; + *pparmlen = 0; + + if (buf == NULL) + return -EINVAL; + + pSMBr = (struct smb_com_ntransact_rsp *)buf; + + bcc = get_bcc(&pSMBr->hdr); + end_of_smb = 2 /* sizeof byte count */ + bcc + + (char *)&pSMBr->ByteCount; + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + parm_offset = le32_to_cpu(pSMBr->ParameterOffset); + parm_count = le32_to_cpu(pSMBr->ParameterCount); + + *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; + *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; + + /* should we also check that parm and data areas do not overlap? */ + if (*ppparm > end_of_smb) { + cFYI(1, "parms start after end of smb"); + return -EINVAL; + } else if (parm_count + *ppparm > end_of_smb) { + cFYI(1, "parm end after end of smb"); + return -EINVAL; + } else if (*ppdata > end_of_smb) { + cFYI(1, "data starts after end of smb"); + return -EINVAL; + } else if (data_count + *ppdata > end_of_smb) { + cFYI(1, "data %p + count %d (%p) past smb end %p start %p", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); + return -EINVAL; + } else if (parm_count + data_count > bcc) { + cFYI(1, "parm count and data count larger than SMB"); + return -EINVAL; + } + *pdatalen = data_count; + *pparmlen = parm_count; + return 0; +} + +/* Get Security Descriptor (by handle) from remote server for a file or dir */ +int +CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, + struct cifs_ntsd **acl_inf, __u32 *pbuflen) +{ + int rc = 0; + int buf_type = 0; + QUERY_SEC_DESC_REQ *pSMB; + struct kvec iov[1]; + + cFYI(1, "GetCifsACL"); + + *pbuflen = 0; + *acl_inf = NULL; + + rc = smb_init_nttransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0, + 8 /* parm len */, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->MaxParameterCount = cpu_to_le32(4); + /* BB TEST with big acls that might need to be e.g. 
larger than 16K */ + pSMB->MaxSetupCount = 0; + pSMB->Fid = fid; /* file handle always le */ + pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | + CIFS_ACL_DACL); + pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ + inc_rfc1001_len(pSMB, 11); + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, + 0); + cifs_stats_inc(&tcon->num_acl_get); + if (rc) { + cFYI(1, "Send error in QuerySecDesc = %d", rc); + } else { /* decode response */ + __le32 *parm; + __u32 parm_len; + __u32 acl_len; + struct smb_com_ntransact_rsp *pSMBr; + char *pdata; + +/* validate_nttransact */ + rc = validate_ntransact(iov[0].iov_base, (char **)&parm, + &pdata, &parm_len, pbuflen); + if (rc) + goto qsec_out; + pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; + + cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf); + + if (le32_to_cpu(pSMBr->ParameterCount) != 4) { + rc = -EIO; /* bad smb */ + *pbuflen = 0; + goto qsec_out; + } + +/* BB check that data area is minimum length and as big as acl_len */ + + acl_len = le32_to_cpu(*parm); + if (acl_len != *pbuflen) { + cERROR(1, "acl length %d does not match %d", + acl_len, *pbuflen); + if (*pbuflen > acl_len) + *pbuflen = acl_len; + } + + /* check if buffer is big enough for the acl + header followed by the smallest SID */ + if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || + (*pbuflen >= 64 * 1024)) { + cERROR(1, "bad acl length %d", *pbuflen); + rc = -EINVAL; + *pbuflen = 0; + } else { + *acl_inf = kmalloc(*pbuflen, GFP_KERNEL); + if (*acl_inf == NULL) { + *pbuflen = 0; + rc = -ENOMEM; + } + memcpy(*acl_inf, pdata, *pbuflen); + } + } +qsec_out: + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + return rc; +} + +int +CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, + struct cifs_ntsd *pntsd, __u32 acllen) +{ + __u16 byte_count, param_count, data_count, param_offset, data_offset; + int rc = 0; + int bytes_returned = 0; + SET_SEC_DESC_REQ *pSMB = NULL; + NTRANSACT_RSP *pSMBr = NULL; + +setCifsAclRetry: + rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return (rc); + + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + + param_count = 8; + param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4; + data_count = acllen; + data_offset = param_offset + param_count; + byte_count = 3 /* pad */ + param_count; + + pSMB->DataCount = cpu_to_le32(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->MaxParameterCount = cpu_to_le32(4); + pSMB->MaxDataCount = cpu_to_le32(16384); + pSMB->ParameterCount = cpu_to_le32(param_count); + pSMB->ParameterOffset = cpu_to_le32(param_offset); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->DataOffset = cpu_to_le32(data_offset); + pSMB->SetupCount = 0; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC); + pSMB->ByteCount = cpu_to_le16(byte_count+data_count); + + pSMB->Fid = fid; /* file handle always le */ + pSMB->Reserved2 = 0; + pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + + if (pntsd && acllen) { + memcpy((char *) &pSMBr->hdr.Protocol + data_offset, + (char *) pntsd, + acllen); + inc_rfc1001_len(pSMB, byte_count + data_count); + } else + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, 
tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + + cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc); + if (rc) + cFYI(1, "Set CIFS ACL returned %d", rc); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto setCifsAclRetry; + + return (rc); +} + +#endif /* CONFIG_CIFS_ACL */ + +/* Legacy Query Path Information call for lookup to old servers such + as Win9x/WinME */ +int SMBQueryInformation(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *pFinfo, + const struct nls_table *nls_codepage, int remap) +{ + QUERY_INFORMATION_REQ *pSMB; + QUERY_INFORMATION_RSP *pSMBr; + int rc = 0; + int bytes_returned; + int name_len; + + cFYI(1, "In SMBQPath path %s", searchName); +QInfRetry: + rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + pSMB->BufferFormat = 0x04; + name_len++; /* account for buffer type byte */ + inc_rfc1001_len(pSMB, (__u16)name_len); + pSMB->ByteCount = cpu_to_le16(name_len); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QueryInfo = %d", rc); + } else if (pFinfo) { + struct timespec ts; + __u32 time = le32_to_cpu(pSMBr->last_write_time); + + /* decode response */ + /* BB FIXME - add time zone adjustment BB */ + memset(pFinfo, 0, sizeof(FILE_ALL_INFO)); + ts.tv_nsec = 0; + ts.tv_sec = time; + /* decode time fields */ + pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts)); + pFinfo->LastWriteTime = pFinfo->ChangeTime; + pFinfo->LastAccessTime = 0; + pFinfo->AllocationSize = + cpu_to_le64(le32_to_cpu(pSMBr->size)); + pFinfo->EndOfFile = pFinfo->AllocationSize; + pFinfo->Attributes = + cpu_to_le32(le16_to_cpu(pSMBr->attr)); + } else + rc = -EIO; /* bad buffer passed in */ + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QInfRetry; + + return rc; +} + +int +CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_ALL_INFO *pFindData) +{ + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int rc = 0; + int bytes_returned; + __u16 params, byte_count; + +QFileInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); + pSMB->Pad = 0; + pSMB->Fid = 
netfid; + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc) /* BB add auto retry on EOPNOTSUPP? */ + rc = -EIO; + else if (get_bcc(&pSMBr->hdr) < 40) + rc = -EIO; /* bad smb */ + else if (pFindData) { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, sizeof(FILE_ALL_INFO)); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QFileInfoRetry; + + return rc; +} + +int +CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *pFindData, + int legacy /* old style infolevel */, + const struct nls_table *nls_codepage, int remap) +{ +/* level 263 SMB_QUERY_FILE_ALL_INFO */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + +/* cFYI(1, "In QPathInfo path %s", searchName); */ +QPathInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (legacy) + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD); + else + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc) /* BB add auto retry on EOPNOTSUPP? */ + rc = -EIO; + else if (!legacy && get_bcc(&pSMBr->hdr) < 40) + rc = -EIO; /* bad smb */ + else if (legacy && get_bcc(&pSMBr->hdr) < 24) + rc = -EIO; /* 24 or 26 expected but we do not read + last field */ + else if (pFindData) { + int size; + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + + /* On legacy responses we do not read the last field, + EAsize, fortunately since it varies by subdialect and + also note it differs on Set vs. 
Get, ie two bytes or 4 + bytes depending but we don't care here */ + if (legacy) + size = sizeof(FILE_INFO_STANDARD); + else + size = sizeof(FILE_ALL_INFO); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, size); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QPathInfoRetry; + + return rc; +} + +int +CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_UNIX_BASIC_INFO *pFindData) +{ + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int rc = 0; + int bytes_returned; + __u16 params, byte_count; + +UnixQFileInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { + cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n" + "Unix Extensions can be disabled on mount " + "by specifying the nosfu mount option."); + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, + sizeof(FILE_UNIX_BASIC_INFO)); + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQFileInfoRetry; + + return rc; +} + +int +CIFSSMBUnixQPathInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_UNIX_BASIC_INFO *pFindData, + const struct nls_table *nls_codepage, int remap) +{ +/* SMB_QUERY_FILE_UNIX_BASIC */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned = 0; + int name_len; + __u16 params, byte_count; + + cFYI(1, "In QPathInfo (Unix) the path %s", searchName); +UnixQPathInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = 
cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { + cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n" + "Unix Extensions can be disabled on mount " + "by specifying the nosfu mount option."); + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, + sizeof(FILE_UNIX_BASIC_INFO)); + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQPathInfoRetry; + + return rc; +} + +/* xid, tcon, searchName and codepage are input parms, rest are returned */ +int +CIFSFindFirst(const int xid, struct cifs_tcon *tcon, + const char *searchName, + const struct nls_table *nls_codepage, + __u16 *pnetfid, + struct cifs_search_info *psrch_inf, int remap, const char dirsep) +{ +/* level 257 SMB_ */ + TRANSACTION2_FFIRST_REQ *pSMB = NULL; + TRANSACTION2_FFIRST_RSP *pSMBr = NULL; + T2_FFIRST_RSP_PARMS *parms; + int rc = 0; + int bytes_returned = 0; + int name_len; + __u16 params, byte_count; + + cFYI(1, "In FindFirst for %s", searchName); + +findFirstRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + /* We can not add the asterik earlier in case + it got remapped to 0xF03A as if it were part of the + directory name instead of a wildcard */ + name_len *= 2; + pSMB->FileName[name_len] = dirsep; + pSMB->FileName[name_len+1] = 0; + pSMB->FileName[name_len+2] = '*'; + pSMB->FileName[name_len+3] = 0; + name_len += 4; /* now the trailing null */ + pSMB->FileName[name_len] = 0; /* null terminate just in case */ + pSMB->FileName[name_len+1] = 0; + name_len += 2; + } else { /* BB add check for overrun of SMB buf BB */ + name_len = strnlen(searchName, PATH_MAX); +/* BB fix here and in unicode clause above ie + if (name_len > buffersize-header) + free buffer exit; BB */ + strncpy(pSMB->FileName, searchName, name_len); + pSMB->FileName[name_len] = dirsep; + pSMB->FileName[name_len+1] = '*'; + pSMB->FileName[name_len+2] = 0; + name_len += 3; + } + + params = 12 + name_len /* includes null */ ; + pSMB->TotalDataCount = 0; /* no EAs */ + pSMB->MaxParameterCount = cpu_to_le16(10); + pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; 
+ pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_ffirst_req, SearchAttributes) + - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; /* one byte, no need to make endian neutral */ + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_FIRST); + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO)); + pSMB->SearchFlags = cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_RETURN_RESUME); + pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); + + /* BB what should we set StorageType to? Does it matter? BB */ + pSMB->SearchStorageType = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_ffirst); + + if (rc) {/* BB add logic to retry regular search if Unix search + rejected unexpectedly by server */ + /* BB Add code to handle unsupported level rc */ + cFYI(1, "Error in FindFirst = %d", rc); + + cifs_buf_release(pSMB); + + /* BB eventually could optimize out free and realloc of buf */ + /* for this case */ + if (rc == -EAGAIN) + goto findFirstRetry; + } else { /* decode response */ + /* BB remember to free buffer if error BB */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc == 0) { + unsigned int lnoff; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + psrch_inf->unicode = true; + else + psrch_inf->unicode = false; + + psrch_inf->ntwrk_buf_start = (char *)pSMBr; + psrch_inf->smallBuf = 0; + psrch_inf->srch_entries_start = + (char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + parms = (T2_FFIRST_RSP_PARMS *)((char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset)); + + if (parms->EndofSearch) + psrch_inf->endOfSearch = true; + else + psrch_inf->endOfSearch = false; + + psrch_inf->entries_in_buffer = + le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry = 2 /* skip . and .. 
*/ + + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, "ignoring corrupt resume name"); + psrch_inf->last_entry = NULL; + return rc; + } + + psrch_inf->last_entry = psrch_inf->srch_entries_start + + lnoff; + + *pnetfid = parms->SearchHandle; + } else { + cifs_buf_release(pSMB); + } + } + + return rc; +} + +int CIFSFindNext(const int xid, struct cifs_tcon *tcon, + __u16 searchHandle, struct cifs_search_info *psrch_inf) +{ + TRANSACTION2_FNEXT_REQ *pSMB = NULL; + TRANSACTION2_FNEXT_RSP *pSMBr = NULL; + T2_FNEXT_RSP_PARMS *parms; + char *response_data; + int rc = 0; + int bytes_returned; + unsigned int name_len; + __u16 params, byte_count; + + cFYI(1, "In FindNext"); + + if (psrch_inf->endOfSearch) + return -ENOENT; + + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 14; /* includes 2 bytes of null string, converted to LE below*/ + byte_count = 0; + pSMB->TotalDataCount = 0; /* no EAs */ + pSMB->MaxParameterCount = cpu_to_le16(8); + pSMB->MaxDataCount = + cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & + 0xFFFFFF00); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_fnext_req,SearchHandle) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_NEXT); + pSMB->SearchHandle = searchHandle; /* always kept as le */ + pSMB->SearchCount = + cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO)); + pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); + pSMB->ResumeKey = psrch_inf->resume_key; + pSMB->SearchFlags = + cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME); + + name_len = psrch_inf->resume_name_len; + params += name_len; + if (name_len < PATH_MAX) { + memcpy(pSMB->ResumeFileName, psrch_inf->presume_name, name_len); + byte_count += name_len; + /* 14 byte parm len above enough for 2 byte null terminator */ + pSMB->ResumeFileName[name_len] = 0; + pSMB->ResumeFileName[name_len+1] = 0; + } else { + rc = -EINVAL; + goto FNext2_err_exit; + } + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_fnext); + if (rc) { + if (rc == -EBADF) { + psrch_inf->endOfSearch = true; + cifs_buf_release(pSMB); + rc = 0; /* search probably was closed at end of search*/ + } else + cFYI(1, "FindNext returned = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc == 0) { + unsigned int lnoff; + + /* BB fixme add lock for file (srch_info) struct here */ + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + psrch_inf->unicode = true; + else + psrch_inf->unicode = false; + response_data = (char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset); + parms = (T2_FNEXT_RSP_PARMS *)response_data; + response_data = (char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + if (psrch_inf->smallBuf) + cifs_small_buf_release( + psrch_inf->ntwrk_buf_start); + else + cifs_buf_release(psrch_inf->ntwrk_buf_start); + 
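+			/* previous response buffer released above; record the buffer now backing the entries and refresh the running search state from the FIND_NEXT parameter block */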
psrch_inf->srch_entries_start = response_data; + psrch_inf->ntwrk_buf_start = (char *)pSMB; + psrch_inf->smallBuf = 0; + if (parms->EndofSearch) + psrch_inf->endOfSearch = true; + else + psrch_inf->endOfSearch = false; + psrch_inf->entries_in_buffer = + le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry += + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, "ignoring corrupt resume name"); + psrch_inf->last_entry = NULL; + return rc; + } else + psrch_inf->last_entry = + psrch_inf->srch_entries_start + lnoff; + +/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d", + psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ + + /* BB fixme add unlock here */ + } + + } + + /* BB On error, should we leave previous search buf (and count and + last entry fields) intact or free the previous one? */ + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ +FNext2_err_exit: + if (rc != 0) + cifs_buf_release(pSMB); + return rc; +} + +int +CIFSFindClose(const int xid, struct cifs_tcon *tcon, + const __u16 searchHandle) +{ + int rc = 0; + FINDCLOSE_REQ *pSMB = NULL; + + cFYI(1, "In CIFSSMBFindClose"); + rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); + + /* no sense returning error if session restarted + as file handle has been closed */ + if (rc == -EAGAIN) + return 0; + if (rc) + return rc; + + pSMB->FileID = searchHandle; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cERROR(1, "Send error in FindClose = %d", rc); + + cifs_stats_inc(&tcon->num_fclose); + + /* Since session is dead, search handle closed on server already */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +int +CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + __u64 *inode_number, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int name_len, bytes_returned; + __u16 params, byte_count; + + cFYI(1, "In GetSrvInodeNum for %s", searchName); + if (tcon == NULL) + return -ENODEV; + +GetInodeNumberRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max data count below from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = 
pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "error %d in QueryInternalInfo", rc); + } else { + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + /* If rc should we check for EOPNOSUPP and + disable the srvino flag? or in caller? */ + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + struct file_internal_info *pfinfo; + /* BB Do we need a cast or hash here ? */ + if (count < 8) { + cFYI(1, "Illegal size ret in QryIntrnlInf"); + rc = -EIO; + goto GetInodeNumOut; + } + pfinfo = (struct file_internal_info *) + (data_offset + (char *) &pSMBr->hdr.Protocol); + *inode_number = le64_to_cpu(pfinfo->UniqueId); + } + } +GetInodeNumOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto GetInodeNumberRetry; + return rc; +} + +/* parses DFS refferal V3 structure + * caller is responsible for freeing target_nodes + * returns: + * on success - 0 + * on failure - errno + */ +static int +parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, + unsigned int *num_of_nodes, + struct dfs_info3_param **target_nodes, + const struct nls_table *nls_codepage, int remap, + const char *searchName) +{ + int i, rc = 0; + char *data_end; + bool is_unicode; + struct dfs_referral_level_3 *ref; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); + + if (*num_of_nodes < 1) { + cERROR(1, "num_referrals: must be at least > 0," + "but we get num_referrals = %d\n", *num_of_nodes); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); + if (ref->VersionNumber != cpu_to_le16(3)) { + cERROR(1, "Referrals of V%d version are not supported," + "should be V3", le16_to_cpu(ref->VersionNumber)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + /* get the upper boundary of the resp buffer */ + data_end = (char *)(&(pSMBr->PathConsumed)) + + le16_to_cpu(pSMBr->t2.DataCount); + + cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n", + *num_of_nodes, + le32_to_cpu(pSMBr->DFSFlags)); + + *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * + *num_of_nodes, GFP_KERNEL); + if (*target_nodes == NULL) { + cERROR(1, "Failed to allocate buffer for target_nodes\n"); + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* collect necessary data from referrals */ + for (i = 0; i < *num_of_nodes; i++) { + char *temp; + int max_len; + struct dfs_info3_param *node = (*target_nodes)+i; + + node->flags = le32_to_cpu(pSMBr->DFSFlags); + if (is_unicode) { + __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, + GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + cifsConvertToUCS((__le16 *) tmp, searchName, + PATH_MAX, nls_codepage, remap); + node->path_consumed = cifs_ucs2_bytes(tmp, + le16_to_cpu(pSMBr->PathConsumed), + nls_codepage); + kfree(tmp); + } else + node->path_consumed = le16_to_cpu(pSMBr->PathConsumed); + + node->server_type = le16_to_cpu(ref->ServerType); + node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags); + + /* copy DfsPath */ + temp = (char 
*)ref + le16_to_cpu(ref->DfsPathOffset); + max_len = data_end - temp; + node->path_name = cifs_strndup_from_ucs(temp, max_len, + is_unicode, nls_codepage); + if (!node->path_name) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* copy link target UNC */ + temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); + max_len = data_end - temp; + node->node_name = cifs_strndup_from_ucs(temp, max_len, + is_unicode, nls_codepage); + if (!node->node_name) + rc = -ENOMEM; + } + +parse_DFS_referrals_exit: + if (rc) { + free_dfs_info_array(*target_nodes, *num_of_nodes); + *target_nodes = NULL; + *num_of_nodes = 0; + } + return rc; +} + +int +CIFSGetDFSRefer(const int xid, struct cifs_ses *ses, + const unsigned char *searchName, + struct dfs_info3_param **target_nodes, + unsigned int *num_of_nodes, + const struct nls_table *nls_codepage, int remap) +{ +/* TRANS2_GET_DFS_REFERRAL */ + TRANSACTION2_GET_DFS_REFER_REQ *pSMB = NULL; + TRANSACTION2_GET_DFS_REFER_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + *num_of_nodes = 0; + *target_nodes = NULL; + + cFYI(1, "In GetDFSRefer the path %s", searchName); + if (ses == NULL) + return -ENODEV; +getDFSRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + /* server pointer checked in called function, + but should never be null here anyway */ + pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Tid = ses->ipc_tid; + pSMB->hdr.Uid = ses->Suid; + if (ses->capabilities & CAP_STATUS32) + pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; + if (ses->capabilities & CAP_DFS) + pSMB->hdr.Flags2 |= SMBFLG2_DFS; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; + name_len = + cifsConvertToUCS((__le16 *) pSMB->RequestFileName, + searchName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->RequestFileName, searchName, name_len); + } + + if (ses->server) { + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + } + + pSMB->hdr.Uid = ses->Suid; + + params = 2 /* level */ + name_len /*includes null */ ; + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = 0; + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_get_dfs_refer_req, MaxReferralLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_GET_DFS_REFERRAL); + byte_count = params + 3 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->MaxReferralLevel = cpu_to_le16(3); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in GetDFSRefer = %d", rc); + goto GetDFSRefExit; + } + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + /* BB Also check if enough total bytes returned? 
*/ + if (rc || get_bcc(&pSMBr->hdr) < 17) { + rc = -EIO; /* bad smb */ + goto GetDFSRefExit; + } + + cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d", + get_bcc(&pSMBr->hdr), + le16_to_cpu(pSMBr->t2.DataOffset)); + + /* parse returned result into more usable form */ + rc = parse_DFS_referrals(pSMBr, num_of_nodes, + target_nodes, nls_codepage, remap, + searchName); + +GetDFSRefExit: + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto getDFSRetry; + + return rc; +} + +/* Query File System Info such as free space to old servers such as Win 9x */ +int +SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData) +{ +/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_ALLOC_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "OldQFSInfo"); +oldQFSInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 18) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + cFYI(1, "qfsinf resp BCC: %d Offset %d", + get_bcc(&pSMBr->hdr), data_offset); + + response_data = (FILE_SYSTEM_ALLOC_INFO *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + FSData->f_bsize = + le16_to_cpu(response_data->BytesPerSector) * + le32_to_cpu(response_data-> + SectorsPerAllocationUnit); + FSData->f_blocks = + le32_to_cpu(response_data->TotalAllocationUnits); + FSData->f_bfree = FSData->f_bavail = + le32_to_cpu(response_data->FreeAllocationUnits); + cFYI(1, "Blocks: %lld Free: %lld Block size %ld", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto oldQFSInfoRetry; + + return rc; +} + +int +CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData) +{ +/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSInfo"); +QFSInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); + 
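+	/* rest of the TRANS2 QUERY_FS_INFORMATION request: the only parameter is the two byte information level (FS size info); the totals come back in the data area as a FILE_SYSTEM_INFO structure */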
pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 24) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + + response_data = + (FILE_SYSTEM_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + FSData->f_bsize = + le32_to_cpu(response_data->BytesPerSector) * + le32_to_cpu(response_data-> + SectorsPerAllocationUnit); + FSData->f_blocks = + le64_to_cpu(response_data->TotalAllocationUnits); + FSData->f_bfree = FSData->f_bavail = + le64_to_cpu(response_data->FreeAllocationUnits); + cFYI(1, "Blocks: %lld Free: %lld Block size %ld", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSInfoRetry; + + return rc; +} + +int +CIFSSMBQFSAttributeInfo(const int xid, struct cifs_tcon *tcon) +{ +/* level 0x105 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_ATTRIBUTE_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSAttributeInfo"); +QFSAttributeRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cERROR(1, "Send error in QFSAttributeInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + /* BB also check if enough bytes returned */ + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_ATTRIBUTE_INFO + *) (((char *) 
&pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsAttrInfo, response_data, + sizeof(FILE_SYSTEM_ATTRIBUTE_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSAttributeRetry; + + return rc; +} + +int +CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon) +{ +/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_DEVICE_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSDeviceInfo"); +QFSDeviceRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSDeviceInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < + sizeof(FILE_SYSTEM_DEVICE_INFO)) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_DEVICE_INFO *) + (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsDevInfo, response_data, + sizeof(FILE_SYSTEM_DEVICE_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSDeviceRetry; + + return rc; +} + +int +CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon) +{ +/* level 0x200 SMB_QUERY_CIFS_UNIX_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_UNIX_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSUnixInfo"); +QFSUnixRetry: + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof(struct + smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount 
= cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cERROR(1, "Send error in QFSUnixInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_UNIX_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsUnixInfo, response_data, + sizeof(FILE_SYSTEM_UNIX_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSUnixRetry; + + + return rc; +} + +int +CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon, __u64 cap) +{ +/* level 0x200 SMB_SET_CIFS_UNIX_INFO */ + TRANSACTION2_SETFSI_REQ *pSMB = NULL; + TRANSACTION2_SETFSI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In SETFSUnixInfo"); +SETFSUnixRetry: + /* BB switch to small buf init to save memory */ + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + params = 4; /* 2 bytes zero followed by info level. */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_setfsi_req, FileNum) + - 4; + offset = param_offset + params; + + pSMB->MaxParameterCount = cpu_to_le16(4); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FS_INFORMATION); + byte_count = 1 /* pad */ + params + 12; + + pSMB->DataCount = cpu_to_le16(12); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + + /* Params. */ + pSMB->FileNum = 0; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_CIFS_UNIX_INFO); + + /* Data. 
*/ + pSMB->ClientUnixMajor = cpu_to_le16(CIFS_UNIX_MAJOR_VERSION); + pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION); + pSMB->ClientUnixCap = cpu_to_le64(cap); + + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cERROR(1, "Send error in SETFSUnixInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc) + rc = -EIO; /* bad smb */ + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SETFSUnixRetry; + + return rc; +} + + + +int +CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData) +{ +/* level 0x201 SMB_QUERY_CIFS_POSIX_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_POSIX_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSPosixInfo"); +QFSPosixRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof(struct + smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSUnixInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_POSIX_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + FSData->f_bsize = + le32_to_cpu(response_data->BlockSize); + FSData->f_blocks = + le64_to_cpu(response_data->TotalBlocks); + FSData->f_bfree = + le64_to_cpu(response_data->BlocksAvail); + if (response_data->UserBlocksAvail == cpu_to_le64(-1)) { + FSData->f_bavail = FSData->f_bfree; + } else { + FSData->f_bavail = + le64_to_cpu(response_data->UserBlocksAvail); + } + if (response_data->TotalFileNodes != cpu_to_le64(-1)) + FSData->f_files = + le64_to_cpu(response_data->TotalFileNodes); + if (response_data->FreeFileNodes != cpu_to_le64(-1)) + FSData->f_ffree = + le64_to_cpu(response_data->FreeFileNodes); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSPosixRetry; + + return rc; +} + + +/* We can not use write of zero bytes trick to + set file size due to need for large file support. 
Also note that + this SetPathInfo is preferred to SetFileInfo based method in next + routine which is only needed to work around a sharing violation bug + in Samba which this routine can run into */ + +int +CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon, const char *fileName, + __u64 size, bool SetAllocation, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + struct file_end_of_file_info *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cFYI(1, "In SetEOF"); +SetEOFRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + params = 6 + name_len; + data_count = sizeof(struct file_end_of_file_info); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(4100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + if (SetAllocation) { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO); + } else /* Set File Size */ { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); + } + + parm_data = + (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + parm_data->FileSize = cpu_to_le64(size); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (file size) returned %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEOFRetry; + + return rc; +} + +int +CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, __u64 size, + __u16 fid, __u32 pid_of_opener, bool SetAllocation) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct file_end_of_file_info *parm_data; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "SetFileSize (via SetFileInfo) %lld", + (long long)size); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = 
cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + count = sizeof(struct file_end_of_file_info); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = + (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->DataOffset = cpu_to_le16(offset); + parm_data->FileSize = cpu_to_le64(size); + pSMB->Fid = fid; + if (SetAllocation) { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO); + } else /* Set File Size */ { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); + } + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) { + cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc); + } + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +/* Some legacy servers such as NT4 require that the file times be set on + an open handle, rather than by pathname - this is awkward due to + potential access conflicts on the open, but it is unavoidable for these + old servers since the only other choice is to go from 100 nanosecond DCE + time and resort to the original setpathinfo level which takes the ancient + DOS time format with 2 second granularity */ +int +CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon, + const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "Set Times (via SetFileInfo)"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = sizeof(FILE_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = 
cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2); + else + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "Set File Disposition (via SetFileInfo)"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = 1; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + *data_offset = delete_file ? 
1 : 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, "Send error in SetFileDisposition = %d", rc); + + return rc; +} + +int +CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + char *data_offset; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "In SetTimes"); + +SetTimesRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + count = sizeof(FILE_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2); + else + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (times) returned %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetTimesRetry; + + return rc; +} + +/* Can not be used to set time stamps yet (due to old DOS time format) */ +/* Can be used to set attributes */ +#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug + handling it anyway and NT4 was what we thought it would be needed for + Do not delete it until we prove whether needed for Win9x though */ +int +CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon, char *fileName, + __u16 dos_attrs, const struct nls_table *nls_codepage) +{ + SETATTR_REQ *pSMB = NULL; + SETATTR_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + + cFYI(1, "In SetAttrLegacy"); + +SetAttrLgcyRetry: + rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + ConvertToUCS((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + } else 
{ /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + pSMB->attr = cpu_to_le16(dos_attrs); + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "Error in LegacySetAttr = %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetAttrLgcyRetry; + + return rc; +} +#endif /* temporarily unneeded SetAttr legacy function */ + +static void +cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, + const struct cifs_unix_set_info_args *args) +{ + u64 mode = args->mode; + + /* + * Samba server ignores set of file size to zero due to bugs in some + * older clients, but we should be precise - we use SetFileSize to + * set file size and do not want to truncate file size to zero + * accidentally as happened on one Samba server beta by putting + * zero instead of -1 here + */ + data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); + data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); + data_offset->LastStatusChange = cpu_to_le64(args->ctime); + data_offset->LastAccessTime = cpu_to_le64(args->atime); + data_offset->LastModificationTime = cpu_to_le64(args->mtime); + data_offset->Uid = cpu_to_le64(args->uid); + data_offset->Gid = cpu_to_le64(args->gid); + /* better to leave device as zero when it is */ + data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); + data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); + data_offset->Permissions = cpu_to_le64(mode); + + if (S_ISREG(mode)) + data_offset->Type = cpu_to_le32(UNIX_FILE); + else if (S_ISDIR(mode)) + data_offset->Type = cpu_to_le32(UNIX_DIR); + else if (S_ISLNK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SYMLINK); + else if (S_ISCHR(mode)) + data_offset->Type = cpu_to_le32(UNIX_CHARDEV); + else if (S_ISBLK(mode)) + data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV); + else if (S_ISFIFO(mode)) + data_offset->Type = cpu_to_le32(UNIX_FIFO); + else if (S_ISSOCK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SOCKET); +} + +int +CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon, + const struct cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + FILE_UNIX_BASIC_INFO *data_offset; + int rc = 0; + u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "Set Unix Info (via SetFileInfo)"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (FILE_UNIX_BASIC_INFO *) + ((char *)(&pSMB->hdr.Protocol) + offset); + count = sizeof(FILE_UNIX_BASIC_INFO); + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = 
pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + cifs_fill_unix_set_info(data_offset, args); + + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *tcon, char *fileName, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + FILE_UNIX_BASIC_INFO *data_offset; + __u16 params, param_offset, offset, count, byte_count; + + cFYI(1, "In SetUID/GID/Mode"); +setPermsRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + count = sizeof(FILE_UNIX_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + data_offset = + (FILE_UNIX_BASIC_INFO *) ((char *) &pSMB->hdr.Protocol + + offset); + memset(data_offset, 0, count); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->DataCount = cpu_to_le16(count); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + + cifs_fill_unix_set_info(data_offset, args); + + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (perms) returned %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setPermsRetry; + return rc; +} + +#ifdef CONFIG_CIFS_XATTR +/* + * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common + * function used by listxattr and getxattr type calls. When ea_name is set, + * it looks for that attribute name and stuffs that value into the EAData + * buffer. When ea_name is NULL, it stuffs a list of attribute names into the + * buffer. 
In both cases, the return value is either the length of the + * resulting data or a negative error code. If EAData is a NULL pointer then + * the data isn't copied to it, but the length is returned. + */ +ssize_t +CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, const unsigned char *ea_name, + char *EAData, size_t buf_size, + const struct nls_table *nls_codepage, int remap) +{ + /* BB assumes one setup word */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int list_len; + struct fealist *ea_response_data; + struct fea *temp_fea; + char *temp_ptr; + char *end_of_smb; + __u16 params, byte_count, data_offset; + + cFYI(1, "In Query All EAs path %s", searchName); +QAllEAsRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + list_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + list_len++; /* trailing null */ + list_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + list_len = strnlen(searchName, PATH_MAX); + list_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, list_len); + } + + params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QueryAllEAs = %d", rc); + goto QAllEAsOut; + } + + + /* BB also check enough total bytes returned */ + /* BB we need to improve the validity checking + of these trans2 responses */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc || get_bcc(&pSMBr->hdr) < 4) { + rc = -EIO; /* bad smb */ + goto QAllEAsOut; + } + + /* check that length of list is not more than bcc */ + /* check that each entry does not go beyond length + of list */ + /* check that each element of each entry does not + go beyond end of list */ + /* validate_trans2_offsets() */ + /* BB check if start of smb + data_offset > &bcc+ bcc */ + + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + ea_response_data = (struct fealist *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + + list_len = le32_to_cpu(ea_response_data->list_len); + cFYI(1, "ea length %d", list_len); + if (list_len <= 8) { + cFYI(1, "empty EA list returned from server"); + goto QAllEAsOut; + } + + /* make sure list_len doesn't go past end of SMB */ + end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr); + if ((char *)ea_response_data + list_len > end_of_smb) { + cFYI(1, "EA list appears to go 
beyond SMB"); + rc = -EIO; + goto QAllEAsOut; + } + + /* account for ea list len */ + list_len -= 4; + temp_fea = ea_response_data->list; + temp_ptr = (char *)temp_fea; + while (list_len > 0) { + unsigned int name_len; + __u16 value_len; + + list_len -= 4; + temp_ptr += 4; + /* make sure we can read name_len and value_len */ + if (list_len < 0) { + cFYI(1, "EA entry goes beyond length of list"); + rc = -EIO; + goto QAllEAsOut; + } + + name_len = temp_fea->name_len; + value_len = le16_to_cpu(temp_fea->value_len); + list_len -= name_len + 1 + value_len; + if (list_len < 0) { + cFYI(1, "EA entry goes beyond length of list"); + rc = -EIO; + goto QAllEAsOut; + } + + if (ea_name) { + if (strncmp(ea_name, temp_ptr, name_len) == 0) { + temp_ptr += name_len + 1; + rc = value_len; + if (buf_size == 0) + goto QAllEAsOut; + if ((size_t)value_len > buf_size) { + rc = -ERANGE; + goto QAllEAsOut; + } + memcpy(EAData, temp_ptr, value_len); + goto QAllEAsOut; + } + } else { + /* account for prefix user. and trailing null */ + rc += (5 + 1 + name_len); + if (rc < (int) buf_size) { + memcpy(EAData, "user.", 5); + EAData += 5; + memcpy(EAData, temp_ptr, name_len); + EAData += name_len; + /* null terminate name */ + *EAData = 0; + ++EAData; + } else if (buf_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + } + temp_ptr += name_len + 1 + value_len; + temp_fea = (struct fea *)temp_ptr; + } + + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + +QAllEAsOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QAllEAsRetry; + + return (ssize_t)rc; +} + +int +CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, const char *fileName, + const char *ea_name, const void *ea_value, + const __u16 ea_value_len, const struct nls_table *nls_codepage, + int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + struct fealist *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, byte_count, offset, count; + + cFYI(1, "In SetEA"); +SetEARetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + + /* done calculating parms using name_len of file name, + now use name_len to calculate length of ea name + we are going to create in the inode xattrs */ + if (ea_name == NULL) + name_len = 0; + else + name_len = strnlen(ea_name, 255); + + count = sizeof(*parm_data) + ea_value_len + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_EA); + + parm_data = + (struct fealist *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = 
cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + parm_data->list_len = cpu_to_le32(count); + parm_data->list[0].EA_flags = 0; + /* we checked above that name len is less than 255 */ + parm_data->list[0].name_len = (__u8)name_len; + /* EA names are always ASCII */ + if (ea_name) + strncpy(parm_data->list[0].name, ea_name, name_len); + parm_data->list[0].name[name_len] = 0; + parm_data->list[0].value_len = cpu_to_le16(ea_value_len); + /* caller ensures that ea_value_len is less than 64K but + we need to ensure that it fits within the smb */ + + /*BB add length check to see if it would fit in + negotiated SMB buffer size BB */ + /* if (ea_value_len > buffer_size - 512 (enough for header)) */ + if (ea_value_len) + memcpy(parm_data->list[0].name+name_len+1, + ea_value, ea_value_len); + + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (EA) returned %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEARetry; + + return rc; +} +#endif + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */ +/* + * Years ago the kernel added a "dnotify" function for Samba server, + * to allow network clients (such as Windows) to display updated + * lists of files in directory listings automatically when + * files are added by one user when another user has the + * same directory open on their desktop. The Linux cifs kernel + * client hooked into the kernel side of this interface for + * the same reason, but ironically when the VFS moved from + * "dnotify" to "inotify" it became harder to plug in Linux + * network file system clients (the most obvious use case + * for notify interfaces is when multiple users can update + * the contents of the same directory - exactly what network + * file systems can do) although the server (Samba) could + * still use it. For the short term we leave the worker + * function ifdeffed out (below) until inotify is fixed + * in the VFS to make it easier to plug in network file + * system clients. If inotify turns out to be permanently + * incompatible for network fs clients, we could instead simply + * expose this config flag by adding a future cifs (and smb2) notify ioctl. 
+ */ +int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *pfile, int multishot, + const struct nls_table *nls_codepage) +{ + int rc = 0; + struct smb_com_transaction_change_notify_req *pSMB = NULL; + struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL; + struct dir_notify_req *dnotify_req; + int bytes_returned; + + cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + pSMB->MaxDataCount = 0; /* same in little endian or be */ +/* BB VERIFY verify which is correct for above BB */ + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; /* single byte does not need le conversion */ + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (notify_subdirs) + pSMB->WatchTree = 1; /* one byte - no le conversion needed */ + pSMB->Reserved2 = 0; + pSMB->CompletionFilter = cpu_to_le32(filter); + pSMB->Fid = netfid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, + CIFS_ASYNC_OP); + if (rc) { + cFYI(1, "Error in Notify = %d", rc); + } else { + /* Add file to outstanding requests */ + /* BB change to kmem cache alloc */ + dnotify_req = kmalloc( + sizeof(struct dir_notify_req), + GFP_KERNEL); + if (dnotify_req) { + dnotify_req->Pid = pSMB->hdr.Pid; + dnotify_req->PidHigh = pSMB->hdr.PidHigh; + dnotify_req->Mid = pSMB->hdr.Mid; + dnotify_req->Tid = pSMB->hdr.Tid; + dnotify_req->Uid = pSMB->hdr.Uid; + dnotify_req->netfid = netfid; + dnotify_req->pfile = pfile; + dnotify_req->filter = filter; + dnotify_req->multishot = multishot; + spin_lock(&GlobalMid_Lock); + list_add_tail(&dnotify_req->lhead, + &GlobalDnotifyReqList); + spin_unlock(&GlobalMid_Lock); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + return rc; +} +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c new file mode 100644 index 00000000..b7758094 --- /dev/null +++ b/fs/cifs/connect.c @@ -0,0 +1,3719 @@ +/* + * fs/cifs/connect.c + * + * Copyright (C) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "ntlmssp.h" +#include "nterr.h" +#include "rfc1002pdu.h" +#include "fscache.h" + +#define CIFS_PORT 445 +#define RFC1001_PORT 139 + +/* SMB echo "timeout" -- FIXME: tunable? */ +#define SMB_ECHO_INTERVAL (60 * HZ) + +extern mempool_t *cifs_req_poolp; + +/* FIXME: should these be tunable? */ +#define TLINK_ERROR_EXPIRE (1 * HZ) +#define TLINK_IDLE_EXPIRE (600 * HZ) + +static int ip_connect(struct TCP_Server_Info *server); +static int generic_ip_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); +static void cifs_prune_tlinks(struct work_struct *work); +static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname); + +/* + * cifs tcp session reconnection + * + * mark tcp session as reconnecting so temporarily locked + * mark all smb sessions as reconnecting for tcp session + * reconnect tcp session + * wake up waiters on reconnection? - (not needed currently) + */ +static int +cifs_reconnect(struct TCP_Server_Info *server) +{ + int rc = 0; + struct list_head *tmp, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct mid_q_entry *mid_entry; + struct list_head retry_list; + + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus == CifsExiting) { + /* the demux thread will exit normally + next time through the loop */ + spin_unlock(&GlobalMid_Lock); + return rc; + } else + server->tcpStatus = CifsNeedReconnect; + spin_unlock(&GlobalMid_Lock); + server->maxBuf = 0; + + cFYI(1, "Reconnecting tcp session"); + + /* before reconnecting the tcp session, mark the smb session (uid) + and the tid bad so they are not used until reconnected */ + cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + ses->need_reconnect = true; + ses->ipc_tid = 0; + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + tcon->need_reconnect = true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + + /* do not want to be sending data on a socket we are freeing */ + cFYI(1, "%s: tearing down socket", __func__); + mutex_lock(&server->srv_mutex); + if (server->ssocket) { + cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, + server->ssocket->flags); + kernel_sock_shutdown(server->ssocket, SHUT_WR); + cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx", + server->ssocket->state, + server->ssocket->flags); + sock_release(server->ssocket); + server->ssocket = NULL; + } + server->sequence_number = 0; + server->session_estab = false; + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + server->lstrp = jiffies; + mutex_unlock(&server->srv_mutex); + + /* mark submitted MIDs for retry and issue callback */ + INIT_LIST_HEAD(&retry_list); + cFYI(1, "%s: moving mids to private list", __func__); + 
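+	/* under GlobalMid_Lock, mark each in-flight mid for retry and move it to a private list; the callbacks are invoked below, only after the lock has been dropped */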
spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if (mid_entry->midState == MID_REQUEST_SUBMITTED) + mid_entry->midState = MID_RETRY_NEEDED; + list_move(&mid_entry->qhead, &retry_list); + } + spin_unlock(&GlobalMid_Lock); + + cFYI(1, "%s: issuing mid callbacks", __func__); + list_for_each_safe(tmp, tmp2, &retry_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + + do { + try_to_freeze(); + + /* we should try only the port we connected to before */ + rc = generic_ip_connect(server); + if (rc) { + cFYI(1, "reconnect error %d", rc); + msleep(3000); + } else { + atomic_inc(&tcpSesReconnectCount); + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedNegotiate; + spin_unlock(&GlobalMid_Lock); + } + } while (server->tcpStatus == CifsNeedReconnect); + + return rc; +} + +/* + return codes: + 0 not a transact2, or all data present + >0 transact2 with that much data missing + -EINVAL = invalid transact2 + + */ +static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +{ + struct smb_t2_rsp *pSMBt; + int remaining; + __u16 total_data_size, data_in_this_rsp; + + if (pSMB->Command != SMB_COM_TRANSACTION2) + return 0; + + /* check for plausible wct, bcc and t2 data and parm sizes */ + /* check for parm and data offset going beyond end of smb */ + if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ + cFYI(1, "invalid transact2 word count"); + return -EINVAL; + } + + pSMBt = (struct smb_t2_rsp *)pSMB; + + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + + if (total_data_size == data_in_this_rsp) + return 0; + else if (total_data_size < data_in_this_rsp) { + cFYI(1, "total data %d smaller than data in frame %d", + total_data_size, data_in_this_rsp); + return -EINVAL; + } + + remaining = total_data_size - data_in_this_rsp; + + cFYI(1, "missing %d bytes from transact2, check next response", + remaining); + if (total_data_size > maxBufSize) { + cERROR(1, "TotalDataSize %d is over maximum buffer %d", + total_data_size, maxBufSize); + return -EINVAL; + } + return remaining; +} + +static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) +{ + struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; + struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; + char *data_area_of_target; + char *data_area_of_buf2; + int remaining; + unsigned int byte_count, total_in_buf; + __u16 total_data_size, total_in_buf2; + + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + + if (total_data_size != + get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount)) + cFYI(1, "total data size of primary and secondary t2 differ"); + + total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + + remaining = total_data_size - total_in_buf; + + if (remaining < 0) + return -EPROTO; + + if (remaining == 0) /* nothing to do, ignore */ + return 0; + + total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount); + if (remaining < total_in_buf2) { + cFYI(1, "transact2 2nd response contains too much data"); + } + + /* find end of first SMB data area */ + data_area_of_target = (char *)&pSMBt->hdr.Protocol + + get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); + /* validate target area */ + + data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol + + 
get_unaligned_le16(&pSMB2->t2_rsp.DataOffset); + + data_area_of_target += total_in_buf; + + /* copy second buffer into end of first buffer */ + total_in_buf += total_in_buf2; + /* is the result too big for the field? */ + if (total_in_buf > USHRT_MAX) + return -EPROTO; + put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); + + /* fix up the BCC */ + byte_count = get_bcc(pTargetSMB); + byte_count += total_in_buf2; + /* is the result too big for the field? */ + if (byte_count > USHRT_MAX) + return -EPROTO; + put_bcc(byte_count, pTargetSMB); + + byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); + byte_count += total_in_buf2; + /* don't allow buffer to overflow */ + if (byte_count > CIFSMaxBufSize) + return -ENOBUFS; + pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); + + memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); + + if (remaining == total_in_buf2) { + cFYI(1, "found the last secondary response"); + return 0; /* we are done */ + } else /* more responses to go */ + return 1; +} + +static void +cifs_echo_request(struct work_struct *work) +{ + int rc; + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, echo.work); + + /* + * We cannot send an echo until the NEGOTIATE_PROTOCOL request is + * done, which is indicated by maxBuf != 0. Also, no need to ping if + * we got a response recently + */ + if (server->maxBuf == 0 || + time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + goto requeue_echo; + + rc = CIFSSMBEcho(server); + if (rc) + cFYI(1, "Unable to send echo request to server: %s", + server->hostname); + +requeue_echo: + queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); +} + +static int +cifs_demultiplex_thread(struct TCP_Server_Info *server) +{ + int length; + unsigned int pdu_length, total_read; + struct smb_hdr *smb_buffer = NULL; + struct smb_hdr *bigbuf = NULL; + struct smb_hdr *smallbuf = NULL; + struct msghdr smb_msg; + struct kvec iov; + struct socket *csocket = server->ssocket; + struct list_head *tmp, *tmp2; + struct task_struct *task_to_wake = NULL; + struct mid_q_entry *mid_entry; + char temp; + bool isLargeBuf = false; + bool isMultiRsp; + int reconnect; + + current->flags |= PF_MEMALLOC; + cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); + + length = atomic_inc_return(&tcpSesAllocCount); + if (length > 1) + mempool_resize(cifs_req_poolp, length + cifs_min_rcv, + GFP_KERNEL); + + set_freezable(); + while (server->tcpStatus != CifsExiting) { + if (try_to_freeze()) + continue; + if (bigbuf == NULL) { + bigbuf = cifs_buf_get(); + if (!bigbuf) { + cERROR(1, "No memory for large SMB response"); + msleep(3000); + /* retry will check if exiting */ + continue; + } + } else if (isLargeBuf) { + /* we are reusing a dirty large buf, clear its start */ + memset(bigbuf, 0, sizeof(struct smb_hdr)); + } + + if (smallbuf == NULL) { + smallbuf = cifs_small_buf_get(); + if (!smallbuf) { + cERROR(1, "No memory for SMB response"); + msleep(1000); + /* retry will check if exiting */ + continue; + } + /* beginning of smb buffer is cleared in our buf_get */ + } else /* if existing small buf clear beginning */ + memset(smallbuf, 0, sizeof(struct smb_hdr)); + + isLargeBuf = false; + isMultiRsp = false; + smb_buffer = smallbuf; + iov.iov_base = smb_buffer; + iov.iov_len = 4; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + pdu_length = 4; /* enough to get RFC1001 header */ + +incomplete_rcv: + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + 
(echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } + + length = + kernel_recvmsg(csocket, &smb_msg, + &iov, 1, pdu_length, 0 /* BB other flags? */); + + if (server->tcpStatus == CifsExiting) { + break; + } else if (server->tcpStatus == CifsNeedReconnect) { + cFYI(1, "Reconnect after server stopped responding"); + cifs_reconnect(server); + cFYI(1, "call to reconnect done"); + csocket = server->ssocket; + continue; + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { + msleep(1); /* minimum sleep to prevent looping + allowing socket to clear and app threads to set + tcpStatus CifsNeedReconnect if server hung */ + if (pdu_length < 4) { + iov.iov_base = (4 - pdu_length) + + (char *)smb_buffer; + iov.iov_len = pdu_length; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + goto incomplete_rcv; + } else + continue; + } else if (length <= 0) { + cFYI(1, "Reconnect after unexpected peek error %d", + length); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } else if (length < pdu_length) { + cFYI(1, "requested %d bytes but only got %d bytes", + pdu_length, length); + pdu_length -= length; + msleep(1); + goto incomplete_rcv; + } + + /* The right amount was read from socket - 4 bytes */ + /* so we can now interpret the length field */ + + /* the first byte big endian of the length field, + is actually not part of the length but the type + with the most common, zero, as regular data */ + temp = *((char *) smb_buffer); + + /* Note that FC 1001 length is big endian on the wire, + but we convert it here so it is always manipulated + as host byte order */ + pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + cFYI(1, "rfc1002 length 0x%x", pdu_length+4); + + if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { + continue; + } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { + cFYI(1, "Good RFC 1002 session rsp"); + continue; + } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + /* we get this from Windows 98 instead of + an error on SMB negprot response */ + cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", + pdu_length); + /* give server a second to clean up */ + msleep(1000); + /* always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame) + */ + cifs_set_port((struct sockaddr *) + &server->dstaddr, CIFS_PORT); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } else if (temp != (char) 0) { + cERROR(1, "Unknown RFC 1002 frame"); + cifs_dump_mem(" Received Data: ", (char *)smb_buffer, + length); + cifs_reconnect(server); + csocket = server->ssocket; + continue; + } + + /* else we have an SMB response */ + if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || + (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { + cERROR(1, "Invalid size SMB length %d pdu_length %d", + length, pdu_length+4); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } + + /* else length ok */ + reconnect = 0; + + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + isLargeBuf = true; + memcpy(bigbuf, smallbuf, 4); + smb_buffer = bigbuf; + } + length = 0; + 
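+		/* the 4-byte RFC1002 header has already been read; now receive the remaining pdu_length bytes of the SMB into the buffer just past that header */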
iov.iov_base = 4 + (char *)smb_buffer; + iov.iov_len = pdu_length; + for (total_read = 0; total_read < pdu_length; + total_read += length) { + length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, + pdu_length - total_read, 0); + if (server->tcpStatus == CifsExiting) { + /* then will exit */ + reconnect = 2; + break; + } else if (server->tcpStatus == CifsNeedReconnect) { + cifs_reconnect(server); + csocket = server->ssocket; + /* Reconnect wakes up rspns q */ + /* Now we will reread sock */ + reconnect = 1; + break; + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { + msleep(1); /* minimum sleep to prevent looping, + allowing socket to clear and app + threads to set tcpStatus + CifsNeedReconnect if server hung*/ + length = 0; + continue; + } else if (length <= 0) { + cERROR(1, "Received no data, expecting %d", + pdu_length - total_read); + cifs_reconnect(server); + csocket = server->ssocket; + reconnect = 1; + break; + } + } + if (reconnect == 2) + break; + else if (reconnect == 1) + continue; + + total_read += 4; /* account for rfc1002 hdr */ + + dump_smb(smb_buffer, total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", smb_buffer, + min_t(unsigned int, total_read, 48)); + + mid_entry = NULL; + server->lstrp = jiffies; + + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + + if (mid_entry->mid != smb_buffer->Mid || + mid_entry->midState != MID_REQUEST_SUBMITTED || + mid_entry->command != smb_buffer->Command) { + mid_entry = NULL; + continue; + } + + if (length == 0 && + check2ndT2(smb_buffer, server->maxBuf) > 0) { + /* We have a multipart transact2 resp */ + isMultiRsp = true; + if (mid_entry->resp_buf) { + /* merge response - fix up 1st*/ + length = coalesce_t2(smb_buffer, + mid_entry->resp_buf); + if (length > 0) { + length = 0; + mid_entry->multiRsp = true; + break; + } else { + /* all parts received or + * packet is malformed + */ + mid_entry->multiEnd = true; + goto multi_t2_fnd; + } + } else { + if (!isLargeBuf) { + /* + * FIXME: switch to already + * allocated largebuf? + */ + cERROR(1, "1st trans2 resp " + "needs bigbuf"); + } else { + /* Have first buffer */ + mid_entry->resp_buf = + smb_buffer; + mid_entry->largeBuf = true; + bigbuf = NULL; + } + } + break; + } + mid_entry->resp_buf = smb_buffer; + mid_entry->largeBuf = isLargeBuf; +multi_t2_fnd: + if (length == 0) + mid_entry->midState = MID_RESPONSE_RECEIVED; + else + mid_entry->midState = MID_RESPONSE_MALFORMED; +#ifdef CONFIG_CIFS_STATS2 + mid_entry->when_received = jiffies; +#endif + list_del_init(&mid_entry->qhead); + break; + } + spin_unlock(&GlobalMid_Lock); + + if (mid_entry != NULL) { + mid_entry->callback(mid_entry); + /* Was previous buf put in mpx struct for multi-rsp? 
*/ + if (!isMultiRsp) { + /* smb buffer will be freed by user thread */ + if (isLargeBuf) + bigbuf = NULL; + else + smallbuf = NULL; + } + } else if (length != 0) { + /* response sanity checks failed */ + continue; + } else if (!is_valid_oplock_break(smb_buffer, server) && + !isMultiRsp) { + cERROR(1, "No task to wake, unknown frame received! " + "NumMids %d", atomic_read(&midCount)); + cifs_dump_mem("Received Data is: ", (char *)smb_buffer, + sizeof(struct smb_hdr)); +#ifdef CONFIG_CIFS_DEBUG2 + cifs_dump_detail(smb_buffer); + cifs_dump_mids(server); +#endif /* CIFS_DEBUG2 */ + + } + } /* end while !EXITING */ + + /* take it off the list, if it's not already */ + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + wake_up_all(&server->response_q); + + /* check if we have blocked requests that need to free */ + /* Note that cifs_max_pending is normally 50, but + can be set at module install time to as little as two */ + spin_lock(&GlobalMid_Lock); + if (atomic_read(&server->inFlight) >= cifs_max_pending) + atomic_set(&server->inFlight, cifs_max_pending - 1); + /* We do not want to set the max_pending too low or we + could end up with the counter going negative */ + spin_unlock(&GlobalMid_Lock); + /* Although there should not be any requests blocked on + this queue it can not hurt to be paranoid and try to wake up requests + that may haven been blocked when more than 50 at time were on the wire + to the same server - they now will see the session is in exit state + and get out of SendReceive. */ + wake_up_all(&server->request_q); + /* give those requests time to exit */ + msleep(125); + + if (server->ssocket) { + sock_release(csocket); + server->ssocket = NULL; + } + /* buffer usually freed in free_mid - need to free it here on exit */ + cifs_buf_release(bigbuf); + if (smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(smallbuf); + + if (!list_empty(&server->pending_mid_q)) { + struct list_head dispose_list; + + INIT_LIST_HEAD(&dispose_list); + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Clearing mid 0x%x", mid_entry->mid); + mid_entry->midState = MID_SHUTDOWN; + list_move(&mid_entry->qhead, &dispose_list); + } + spin_unlock(&GlobalMid_Lock); + + /* now walk dispose list and issue callbacks */ + list_for_each_safe(tmp, tmp2, &dispose_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Callback mid 0x%x", mid_entry->mid); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + /* 1/8th of sec is more than enough time for them to exit */ + msleep(125); + } + + if (!list_empty(&server->pending_mid_q)) { + /* mpx threads have not exited yet give them + at least the smb send timeout time for long ops */ + /* due to delays on oplock break requests, we need + to wait at least 45 seconds before giving up + on a request getting a response and going ahead + and killing cifsd */ + cFYI(1, "Wait for exit from demultiplex thread"); + msleep(46000); + /* if threads still have not exited they are probably never + coming home not much else we can do but free the memory */ + } + + kfree(server->hostname); + task_to_wake = xchg(&server->tsk, NULL); + kfree(server); + + length = atomic_dec_return(&tcpSesAllocCount); + if (length > 0) + mempool_resize(cifs_req_poolp, length 
+ cifs_min_rcv, + GFP_KERNEL); + + /* if server->tsk was NULL then wait for a signal before exiting */ + if (!task_to_wake) { + set_current_state(TASK_INTERRUPTIBLE); + while (!signal_pending(current)) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + } + + module_put_and_exit(0); +} + +/* extract the host portion of the UNC string */ +static char * +extract_hostname(const char *unc) +{ + const char *src; + char *dst, *delim; + unsigned int len; + + /* skip double chars at beginning of string */ + /* BB: check validity of these bytes? */ + src = unc + 2; + + /* delimiter between hostname and sharename is always '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + + len = delim - src; + dst = kmalloc((len + 1), GFP_KERNEL); + if (dst == NULL) + return ERR_PTR(-ENOMEM); + + memcpy(dst, src, len); + dst[len] = '\0'; + + return dst; +} + +static int +cifs_parse_mount_options(const char *mountdata, const char *devname, + struct smb_vol *vol) +{ + char *value, *data, *end; + char *mountdata_copy = NULL, *options; + unsigned int temp_len, i, j; + char separator[2]; + short int override_uid = -1; + short int override_gid = -1; + bool uid_specified = false; + bool gid_specified = false; + char *nodename = utsname()->nodename; + + separator[0] = ','; + separator[1] = 0; + + /* + * does not have to be perfect mapping since field is + * informational, only used for servers that do not support + * port 445 and it can be overridden at mount time + */ + memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); + for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) + vol->source_rfc1001_name[i] = toupper(nodename[i]); + + vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0; + /* null target name indicates to use *SMBSERVR default called name + if we end up sending RFC1001 session initialize */ + vol->target_rfc1001_name[0] = 0; + vol->cred_uid = current_uid(); + vol->linux_uid = current_uid(); + vol->linux_gid = current_gid(); + + /* default to only allowing write access to owner of the mount */ + vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; + + /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ + /* default is always to request posix paths. 
*/ + vol->posix_paths = 1; + /* default to using server inode numbers where available */ + vol->server_ino = 1; + + vol->actimeo = CIFS_DEF_ACTIMEO; + + if (!mountdata) + goto cifs_parse_mount_err; + + mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL); + if (!mountdata_copy) + goto cifs_parse_mount_err; + + options = mountdata_copy; + end = options + strlen(options); + if (strncmp(options, "sep=", 4) == 0) { + if (options[4] != 0) { + separator[0] = options[4]; + options += 5; + } else { + cFYI(1, "Null separator not allowed"); + } + } + + while ((data = strsep(&options, separator)) != NULL) { + if (!*data) + continue; + if ((value = strchr(data, '=')) != NULL) + *value++ = '\0'; + + /* Have to parse this before we parse for "user" */ + if (strnicmp(data, "user_xattr", 10) == 0) { + vol->no_xattr = 0; + } else if (strnicmp(data, "nouser_xattr", 12) == 0) { + vol->no_xattr = 1; + } else if (strnicmp(data, "user", 4) == 0) { + if (!value) { + printk(KERN_WARNING + "CIFS: invalid or missing username\n"); + goto cifs_parse_mount_err; + } else if (!*value) { + /* null user, ie anonymous, authentication */ + vol->nullauth = 1; + } + if (strnlen(value, MAX_USERNAME_SIZE) < + MAX_USERNAME_SIZE) { + vol->username = kstrdup(value, GFP_KERNEL); + if (!vol->username) { + printk(KERN_WARNING "CIFS: no memory " + "for username\n"); + goto cifs_parse_mount_err; + } + } else { + printk(KERN_WARNING "CIFS: username too long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "pass", 4) == 0) { + if (!value) { + vol->password = NULL; + continue; + } else if (value[0] == 0) { + /* check if string begins with double comma + since that would mean the password really + does start with a comma, and would not + indicate an empty string */ + if (value[1] != separator[0]) { + vol->password = NULL; + continue; + } + } + temp_len = strlen(value); + /* removed password length check, NTLM passwords + can be arbitrarily long */ + + /* if comma in password, the string will be + prematurely null terminated. Commas in password are + specified across the cifs mount interface by a double + comma ie ,, and a comma used as in other cases ie ',' + as a parameter delimiter/separator is single and due + to the strsep above is temporarily zeroed. */ + + /* NB: password legally can have multiple commas and + the only illegal character in a password is null */ + + if ((value[temp_len] == 0) && + (value + temp_len < end) && + (value[temp_len+1] == separator[0])) { + /* reinsert comma */ + value[temp_len] = separator[0]; + temp_len += 2; /* move after second comma */ + while (value[temp_len] != 0) { + if (value[temp_len] == separator[0]) { + if (value[temp_len+1] == + separator[0]) { + /* skip second comma */ + temp_len++; + } else { + /* single comma indicating start + of next parm */ + break; + } + } + temp_len++; + } + if (value[temp_len] == 0) { + options = NULL; + } else { + value[temp_len] = 0; + /* point option to start of next parm */ + options = value + temp_len + 1; + } + /* go from value to value + temp_len condensing + double commas to singles. 
Note that this ends up + allocating a few bytes too many, which is ok */ + vol->password = kzalloc(temp_len, GFP_KERNEL); + if (vol->password == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for password\n"); + goto cifs_parse_mount_err; + } + for (i = 0, j = 0; i < temp_len; i++, j++) { + vol->password[j] = value[i]; + if (value[i] == separator[0] + && value[i+1] == separator[0]) { + /* skip second comma */ + i++; + } + } + vol->password[j] = 0; + } else { + vol->password = kzalloc(temp_len+1, GFP_KERNEL); + if (vol->password == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for password\n"); + goto cifs_parse_mount_err; + } + strcpy(vol->password, value); + } + } else if (!strnicmp(data, "ip", 2) || + !strnicmp(data, "addr", 4)) { + if (!value || !*value) { + vol->UNCip = NULL; + } else if (strnlen(value, INET6_ADDRSTRLEN) < + INET6_ADDRSTRLEN) { + vol->UNCip = kstrdup(value, GFP_KERNEL); + if (!vol->UNCip) { + printk(KERN_WARNING "CIFS: no memory " + "for UNC IP\n"); + goto cifs_parse_mount_err; + } + } else { + printk(KERN_WARNING "CIFS: ip address " + "too long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "sec", 3) == 0) { + if (!value || !*value) { + cERROR(1, "no security value specified"); + continue; + } else if (strnicmp(value, "krb5i", 5) == 0) { + vol->secFlg |= CIFSSEC_MAY_KRB5 | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "krb5p", 5) == 0) { + /* vol->secFlg |= CIFSSEC_MUST_SEAL | + CIFSSEC_MAY_KRB5; */ + cERROR(1, "Krb5 cifs privacy not supported"); + goto cifs_parse_mount_err; + } else if (strnicmp(value, "krb5", 4) == 0) { + vol->secFlg |= CIFSSEC_MAY_KRB5; + } else if (strnicmp(value, "ntlmsspi", 8) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMSSP | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlmssp", 7) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMSSP; + } else if (strnicmp(value, "ntlmv2i", 7) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMV2 | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlmv2", 6) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMV2; + } else if (strnicmp(value, "ntlmi", 5) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLM | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlm", 4) == 0) { + /* ntlm is default so can be turned off too */ + vol->secFlg |= CIFSSEC_MAY_NTLM; + } else if (strnicmp(value, "nontlm", 6) == 0) { + /* BB is there a better way to do this? 
*/ + vol->secFlg |= CIFSSEC_MAY_NTLMV2; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + } else if (strnicmp(value, "lanman", 6) == 0) { + vol->secFlg |= CIFSSEC_MAY_LANMAN; +#endif + } else if (strnicmp(value, "none", 4) == 0) { + vol->nullauth = 1; + } else { + cERROR(1, "bad security option: %s", value); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "vers", 3) == 0) { + if (!value || !*value) { + cERROR(1, "no protocol version specified" + " after vers= mount option"); + } else if ((strnicmp(value, "cifs", 4) == 0) || + (strnicmp(value, "1", 1) == 0)) { + /* this is the default */ + continue; + } + } else if ((strnicmp(data, "unc", 3) == 0) + || (strnicmp(data, "target", 6) == 0) + || (strnicmp(data, "path", 4) == 0)) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: invalid path to " + "network resource\n"); + goto cifs_parse_mount_err; + } + if ((temp_len = strnlen(value, 300)) < 300) { + vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->UNC == NULL) + goto cifs_parse_mount_err; + strcpy(vol->UNC, value); + if (strncmp(vol->UNC, "//", 2) == 0) { + vol->UNC[0] = '\\'; + vol->UNC[1] = '\\'; + } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { + printk(KERN_WARNING + "CIFS: UNC Path does not begin " + "with // or \\\\ \n"); + goto cifs_parse_mount_err; + } + } else { + printk(KERN_WARNING "CIFS: UNC name too long\n"); + goto cifs_parse_mount_err; + } + } else if ((strnicmp(data, "domain", 3) == 0) + || (strnicmp(data, "workgroup", 5) == 0)) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: invalid domain name\n"); + goto cifs_parse_mount_err; + } + /* BB are there cases in which a comma can be valid in + a domain name and need special handling? */ + if (strnlen(value, 256) < 256) { + vol->domainname = kstrdup(value, GFP_KERNEL); + if (!vol->domainname) { + printk(KERN_WARNING "CIFS: no memory " + "for domainname\n"); + goto cifs_parse_mount_err; + } + cFYI(1, "Domain name set"); + } else { + printk(KERN_WARNING "CIFS: domain name too " + "long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "srcaddr", 7) == 0) { + vol->srcaddr.ss_family = AF_UNSPEC; + + if (!value || !*value) { + printk(KERN_WARNING "CIFS: srcaddr value" + " not specified.\n"); + goto cifs_parse_mount_err; + } + i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, + value, strlen(value)); + if (i == 0) { + printk(KERN_WARNING "CIFS: Could not parse" + " srcaddr: %s\n", + value); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "prefixpath", 10) == 0) { + if (!value || !*value) { + printk(KERN_WARNING + "CIFS: invalid path prefix\n"); + goto cifs_parse_mount_err; + } + if ((temp_len = strnlen(value, 1024)) < 1024) { + if (value[0] != '/') + temp_len++; /* missing leading slash */ + vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->prepath == NULL) + goto cifs_parse_mount_err; + if (value[0] != '/') { + vol->prepath[0] = '/'; + strcpy(vol->prepath+1, value); + } else + strcpy(vol->prepath, value); + cFYI(1, "prefix path %s", vol->prepath); + } else { + printk(KERN_WARNING "CIFS: prefix too long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "iocharset", 9) == 0) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: invalid iocharset " + "specified\n"); + goto cifs_parse_mount_err; + } + if (strnlen(value, 65) < 65) { + if (strnicmp(value, "default", 7)) { + vol->iocharset = kstrdup(value, + GFP_KERNEL); + + if (!vol->iocharset) { + printk(KERN_WARNING "CIFS: no " + "memory for" + "charset\n"); + goto cifs_parse_mount_err; + } + } + /* if 
iocharset not set then load_nls_default + is used by caller */ + cFYI(1, "iocharset set to %s", value); + } else { + printk(KERN_WARNING "CIFS: iocharset name " + "too long.\n"); + goto cifs_parse_mount_err; + } + } else if (!strnicmp(data, "uid", 3) && value && *value) { + vol->linux_uid = simple_strtoul(value, &value, 0); + uid_specified = true; + } else if (!strnicmp(data, "cruid", 5) && value && *value) { + vol->cred_uid = simple_strtoul(value, &value, 0); + } else if (!strnicmp(data, "forceuid", 8)) { + override_uid = 1; + } else if (!strnicmp(data, "noforceuid", 10)) { + override_uid = 0; + } else if (!strnicmp(data, "gid", 3) && value && *value) { + vol->linux_gid = simple_strtoul(value, &value, 0); + gid_specified = true; + } else if (!strnicmp(data, "forcegid", 8)) { + override_gid = 1; + } else if (!strnicmp(data, "noforcegid", 10)) { + override_gid = 0; + } else if (strnicmp(data, "file_mode", 4) == 0) { + if (value && *value) { + vol->file_mode = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "dir_mode", 4) == 0) { + if (value && *value) { + vol->dir_mode = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "dirmode", 4) == 0) { + if (value && *value) { + vol->dir_mode = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "port", 4) == 0) { + if (value && *value) { + vol->port = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "rsize", 5) == 0) { + if (value && *value) { + vol->rsize = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "wsize", 5) == 0) { + if (value && *value) { + vol->wsize = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "sockopt", 5) == 0) { + if (!value || !*value) { + cERROR(1, "no socket option specified"); + continue; + } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) { + vol->sockopt_tcp_nodelay = 1; + } + } else if (strnicmp(data, "netbiosname", 4) == 0) { + if (!value || !*value || (*value == ' ')) { + cFYI(1, "invalid (empty) netbiosname"); + } else { + memset(vol->source_rfc1001_name, 0x20, + RFC1001_NAME_LEN); + /* + * FIXME: are there cases in which a comma can + * be valid in workstation netbios name (and + * need special handling)? + */ + for (i = 0; i < RFC1001_NAME_LEN; i++) { + /* don't ucase netbiosname for user */ + if (value[i] == 0) + break; + vol->source_rfc1001_name[i] = value[i]; + } + /* The string has 16th byte zero still from + set at top of the function */ + if (i == RFC1001_NAME_LEN && value[i] != 0) + printk(KERN_WARNING "CIFS: netbiosname" + " longer than 15 truncated.\n"); + } + } else if (strnicmp(data, "servern", 7) == 0) { + /* servernetbiosname specified override *SMBSERVER */ + if (!value || !*value || (*value == ' ')) { + cFYI(1, "empty server netbiosname specified"); + } else { + /* last byte, type, is 0x20 for servr type */ + memset(vol->target_rfc1001_name, 0x20, + RFC1001_NAME_LEN_WITH_NULL); + + for (i = 0; i < 15; i++) { + /* BB are there cases in which a comma can be + valid in this workstation netbios name + (and need special handling)? 
*/ + + /* user or mount helper must uppercase + the netbiosname */ + if (value[i] == 0) + break; + else + vol->target_rfc1001_name[i] = + value[i]; + } + /* The string has 16th byte zero still from + set at top of the function */ + if (i == RFC1001_NAME_LEN && value[i] != 0) + printk(KERN_WARNING "CIFS: server net" + "biosname longer than 15 truncated.\n"); + } + } else if (strnicmp(data, "actimeo", 7) == 0) { + if (value && *value) { + vol->actimeo = HZ * simple_strtoul(value, + &value, 0); + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cERROR(1, "CIFS: attribute cache" + "timeout too large"); + goto cifs_parse_mount_err; + } + } + } else if (strnicmp(data, "credentials", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "version", 3) == 0) { + /* ignore */ + } else if (strnicmp(data, "guest", 5) == 0) { + /* ignore */ + } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { + /* ignore */ + } else if (strnicmp(data, "ro", 2) == 0) { + /* ignore */ + } else if (strnicmp(data, "noblocksend", 11) == 0) { + vol->noblocksnd = 1; + } else if (strnicmp(data, "noautotune", 10) == 0) { + vol->noautotune = 1; + } else if ((strnicmp(data, "suid", 4) == 0) || + (strnicmp(data, "nosuid", 6) == 0) || + (strnicmp(data, "exec", 4) == 0) || + (strnicmp(data, "noexec", 6) == 0) || + (strnicmp(data, "nodev", 5) == 0) || + (strnicmp(data, "noauto", 6) == 0) || + (strnicmp(data, "dev", 3) == 0)) { + /* The mount tool or mount.cifs helper (if present) + uses these opts to set flags, and the flags are read + by the kernel vfs layer before we get here (ie + before read super) so there is no point trying to + parse these options again and set anything and it + is ok to just ignore them */ + continue; + } else if (strnicmp(data, "hard", 4) == 0) { + vol->retry = 1; + } else if (strnicmp(data, "soft", 4) == 0) { + vol->retry = 0; + } else if (strnicmp(data, "perm", 4) == 0) { + vol->noperm = 0; + } else if (strnicmp(data, "noperm", 6) == 0) { + vol->noperm = 1; + } else if (strnicmp(data, "mapchars", 8) == 0) { + vol->remap = 1; + } else if (strnicmp(data, "nomapchars", 10) == 0) { + vol->remap = 0; + } else if (strnicmp(data, "sfu", 3) == 0) { + vol->sfu_emul = 1; + } else if (strnicmp(data, "nosfu", 5) == 0) { + vol->sfu_emul = 0; + } else if (strnicmp(data, "nodfs", 5) == 0) { + vol->nodfs = 1; + } else if (strnicmp(data, "posixpaths", 10) == 0) { + vol->posix_paths = 1; + } else if (strnicmp(data, "noposixpaths", 12) == 0) { + vol->posix_paths = 0; + } else if (strnicmp(data, "nounix", 6) == 0) { + vol->no_linux_ext = 1; + } else if (strnicmp(data, "nolinux", 7) == 0) { + vol->no_linux_ext = 1; + } else if ((strnicmp(data, "nocase", 6) == 0) || + (strnicmp(data, "ignorecase", 10) == 0)) { + vol->nocase = 1; + } else if (strnicmp(data, "mand", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "nomand", 6) == 0) { + /* ignore */ + } else if (strnicmp(data, "_netdev", 7) == 0) { + /* ignore */ + } else if (strnicmp(data, "brl", 3) == 0) { + vol->nobrl = 0; + } else if ((strnicmp(data, "nobrl", 5) == 0) || + (strnicmp(data, "nolock", 6) == 0)) { + vol->nobrl = 1; + /* turn off mandatory locking in mode + if remote locking is turned off since the + local vfs will do advisory */ + if (vol->file_mode == + (S_IALLUGO & ~(S_ISUID | S_IXGRP))) + vol->file_mode = S_IALLUGO; + } else if (strnicmp(data, "forcemandatorylock", 9) == 0) { + /* will take the shorter form "forcemand" as well */ + /* This mount option will force use of mandatory + (DOS/Windows style) byte range locks, instead of + using posix 
advisory byte range locks, even if the + Unix extensions are available and posix locks would + be supported otherwise. If Unix extensions are not + negotiated this has no effect since mandatory locks + would be used (mandatory locks is all that those + those servers support) */ + vol->mand_lock = 1; + } else if (strnicmp(data, "setuids", 7) == 0) { + vol->setuids = 1; + } else if (strnicmp(data, "nosetuids", 9) == 0) { + vol->setuids = 0; + } else if (strnicmp(data, "dynperm", 7) == 0) { + vol->dynperm = true; + } else if (strnicmp(data, "nodynperm", 9) == 0) { + vol->dynperm = false; + } else if (strnicmp(data, "nohard", 6) == 0) { + vol->retry = 0; + } else if (strnicmp(data, "nosoft", 6) == 0) { + vol->retry = 1; + } else if (strnicmp(data, "nointr", 6) == 0) { + vol->intr = 0; + } else if (strnicmp(data, "intr", 4) == 0) { + vol->intr = 1; + } else if (strnicmp(data, "nostrictsync", 12) == 0) { + vol->nostrictsync = 1; + } else if (strnicmp(data, "strictsync", 10) == 0) { + vol->nostrictsync = 0; + } else if (strnicmp(data, "serverino", 7) == 0) { + vol->server_ino = 1; + } else if (strnicmp(data, "noserverino", 9) == 0) { + vol->server_ino = 0; + } else if (strnicmp(data, "rwpidforward", 12) == 0) { + vol->rwpidforward = 1; + } else if (strnicmp(data, "cifsacl", 7) == 0) { + vol->cifs_acl = 1; + } else if (strnicmp(data, "nocifsacl", 9) == 0) { + vol->cifs_acl = 0; + } else if (strnicmp(data, "acl", 3) == 0) { + vol->no_psx_acl = 0; + } else if (strnicmp(data, "noacl", 5) == 0) { + vol->no_psx_acl = 1; + } else if (strnicmp(data, "locallease", 6) == 0) { + vol->local_lease = 1; + } else if (strnicmp(data, "sign", 4) == 0) { + vol->secFlg |= CIFSSEC_MUST_SIGN; + } else if (strnicmp(data, "seal", 4) == 0) { + /* we do not do the following in secFlags because seal + is a per tree connection (mount) not a per socket + or per-smb connection option in the protocol */ + /* vol->secFlg |= CIFSSEC_MUST_SEAL; */ + vol->seal = 1; + } else if (strnicmp(data, "direct", 6) == 0) { + vol->direct_io = 1; + } else if (strnicmp(data, "forcedirectio", 13) == 0) { + vol->direct_io = 1; + } else if (strnicmp(data, "strictcache", 11) == 0) { + vol->strict_io = 1; + } else if (strnicmp(data, "noac", 4) == 0) { + printk(KERN_WARNING "CIFS: Mount option noac not " + "supported. 
Instead set " + "/proc/fs/cifs/LookupCacheEnabled to 0\n"); + } else if (strnicmp(data, "fsc", 3) == 0) { +#ifndef CONFIG_CIFS_FSCACHE + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " + "kernel config option set"); + goto cifs_parse_mount_err; +#endif + vol->fsc = true; + } else if (strnicmp(data, "mfsymlinks", 10) == 0) { + vol->mfsymlinks = true; + } else if (strnicmp(data, "multiuser", 8) == 0) { + vol->multiuser = true; + } else + printk(KERN_WARNING "CIFS: Unknown mount option %s\n", + data); + } + if (vol->UNC == NULL) { + if (devname == NULL) { + printk(KERN_WARNING "CIFS: Missing UNC name for mount " + "target\n"); + goto cifs_parse_mount_err; + } + if ((temp_len = strnlen(devname, 300)) < 300) { + vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->UNC == NULL) + goto cifs_parse_mount_err; + strcpy(vol->UNC, devname); + if (strncmp(vol->UNC, "//", 2) == 0) { + vol->UNC[0] = '\\'; + vol->UNC[1] = '\\'; + } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { + printk(KERN_WARNING "CIFS: UNC Path does not " + "begin with // or \\\\ \n"); + goto cifs_parse_mount_err; + } + value = strpbrk(vol->UNC+2, "/\\"); + if (value) + *value = '\\'; + } else { + printk(KERN_WARNING "CIFS: UNC name too long\n"); + goto cifs_parse_mount_err; + } + } + + if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) { + cERROR(1, "Multiuser mounts currently require krb5 " + "authentication!"); + goto cifs_parse_mount_err; + } + + if (vol->UNCip == NULL) + vol->UNCip = &vol->UNC[2]; + + if (uid_specified) + vol->override_uid = override_uid; + else if (override_uid == 1) + printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " + "specified with no uid= option.\n"); + + if (gid_specified) + vol->override_gid = override_gid; + else if (override_gid == 1) + printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " + "specified with no gid= option.\n"); + + kfree(mountdata_copy); + return 0; + +cifs_parse_mount_err: + kfree(mountdata_copy); + return 1; +} + +/** Returns true if srcaddr isn't specified and rhs isn't + * specified, or if srcaddr is specified and + * matches the IP address of the rhs argument. + */ +static bool +srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) +{ + switch (srcaddr->sa_family) { + case AF_UNSPEC: + return (rhs->sa_family == AF_UNSPEC); + case AF_INET: { + struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; + struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr); + } + case AF_INET6: { + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); + } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } +} + +/* + * If no port is specified in addr structure, we try to match with 445 port + * and if it fails - with 139 ports. It should be called only if address + * families of server and addr are equal. 
+ */ +static bool +match_port(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + __be16 port, *sport; + + switch (addr->sa_family) { + case AF_INET: + sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port; + port = ((struct sockaddr_in *) addr)->sin_port; + break; + case AF_INET6: + sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port; + port = ((struct sockaddr_in6 *) addr)->sin6_port; + break; + default: + WARN_ON(1); + return false; + } + + if (!port) { + port = htons(CIFS_PORT); + if (port == *sport) + return true; + + port = htons(RFC1001_PORT); + } + + return port == *sport; +} + +static bool +match_address(struct TCP_Server_Info *server, struct sockaddr *addr, + struct sockaddr *srcaddr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in *srv_addr4 = + (struct sockaddr_in *)&server->dstaddr; + + if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr) + return false; + break; + } + case AF_INET6: { + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + struct sockaddr_in6 *srv_addr6 = + (struct sockaddr_in6 *)&server->dstaddr; + + if (!ipv6_addr_equal(&addr6->sin6_addr, + &srv_addr6->sin6_addr)) + return false; + if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id) + return false; + break; + } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } + + if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) + return false; + + return true; +} + +static bool +match_security(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + unsigned int secFlags; + + if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = vol->secFlg; + else + secFlags = global_secflags | vol->secFlg; + + switch (server->secType) { + case LANMAN: + if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) + return false; + break; + case NTLMv2: + if (!(secFlags & CIFSSEC_MAY_NTLMV2)) + return false; + break; + case NTLM: + if (!(secFlags & CIFSSEC_MAY_NTLM)) + return false; + break; + case Kerberos: + if (!(secFlags & CIFSSEC_MAY_KRB5)) + return false; + break; + case RawNTLMSSP: + if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) + return false; + break; + default: + /* shouldn't happen */ + return false; + } + + /* now check if signing mode is acceptable */ + if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && + (server->sec_mode & SECMODE_SIGN_REQUIRED)) + return false; + else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && + (server->sec_mode & + (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) + return false; + + return true; +} + +static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, + struct smb_vol *vol) +{ + if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) + return 0; + + if (!match_address(server, addr, + (struct sockaddr *)&vol->srcaddr)) + return 0; + + if (!match_port(server, addr)) + return 0; + + if (!match_security(server, vol)) + return 0; + + return 1; +} + +static struct TCP_Server_Info * +cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) +{ + struct TCP_Server_Info *server; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + if (!match_server(server, addr, vol)) + continue; + + ++server->srv_count; + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "Existing tcp session with server found"); + return server; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_tcp_session(struct TCP_Server_Info *server) +{ + 
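+	/* drop one reference to the TCP server; the last put removes it from cifs_tcp_ses_list, cancels the echo work and signals the demultiplex thread to exit */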
struct task_struct *task; + + spin_lock(&cifs_tcp_ses_lock); + if (--server->srv_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + put_net(cifs_net_ns(server)); + + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + cancel_delayed_work_sync(&server->echo); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + + cifs_crypto_shash_release(server); + cifs_fscache_release_client_cookie(server); + + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + + task = xchg(&server->tsk, NULL); + if (task) + force_sig(SIGKILL, task); +} + +static struct TCP_Server_Info * +cifs_get_tcp_session(struct smb_vol *volume_info) +{ + struct TCP_Server_Info *tcp_ses = NULL; + struct sockaddr_storage addr; + struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; + struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; + int rc; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + + cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); + + if (volume_info->UNCip && volume_info->UNC) { + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + strlen(volume_info->UNCip), + volume_info->port); + if (!rc) { + /* we failed translating address */ + rc = -EINVAL; + goto out_err; + } + } else if (volume_info->UNCip) { + /* BB using ip addr as tcp_ses name to connect to the + DFS root below */ + cERROR(1, "Connecting to DFS root not implemented yet"); + rc = -EINVAL; + goto out_err; + } else /* which tcp_sess DFS root would we conect to */ { + cERROR(1, "CIFS mount error: No UNC path (e.g. -o " + "unc=//192.168.1.100/public) specified"); + rc = -EINVAL; + goto out_err; + } + + /* see if we already have a matching tcp_ses */ + tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); + if (tcp_ses) + return tcp_ses; + + tcp_ses = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL); + if (!tcp_ses) { + rc = -ENOMEM; + goto out_err; + } + + rc = cifs_crypto_shash_allocate(tcp_ses); + if (rc) { + cERROR(1, "could not setup hash structures rc %d", rc); + goto out_err; + } + + cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); + tcp_ses->hostname = extract_hostname(volume_info->UNC); + if (IS_ERR(tcp_ses->hostname)) { + rc = PTR_ERR(tcp_ses->hostname); + goto out_err_crypto_release; + } + + tcp_ses->noblocksnd = volume_info->noblocksnd; + tcp_ses->noautotune = volume_info->noautotune; + tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; + atomic_set(&tcp_ses->inFlight, 0); + init_waitqueue_head(&tcp_ses->response_q); + init_waitqueue_head(&tcp_ses->request_q); + INIT_LIST_HEAD(&tcp_ses->pending_mid_q); + mutex_init(&tcp_ses->srv_mutex); + memcpy(tcp_ses->workstation_RFC1001_name, + volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + memcpy(tcp_ses->server_RFC1001_name, + volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + tcp_ses->session_estab = false; + tcp_ses->sequence_number = 0; + tcp_ses->lstrp = jiffies; + INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); + INIT_LIST_HEAD(&tcp_ses->smb_ses_list); + INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); + + /* + * at this point we are the only ones with the pointer + * to the struct since the kernel thread not created yet + * no need to spinlock this init of tcpStatus or srv_count + */ + tcp_ses->tcpStatus = CifsNew; + memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, + sizeof(tcp_ses->srcaddr)); + ++tcp_ses->srv_count; + + if 
(addr.ss_family == AF_INET6) { + cFYI(1, "attempting ipv6 connect"); + /* BB should we allow ipv6 on port 139? */ + /* other OS never observed in Wild doing 139 with v6 */ + memcpy(&tcp_ses->dstaddr, sin_server6, + sizeof(struct sockaddr_in6)); + } else + memcpy(&tcp_ses->dstaddr, sin_server, + sizeof(struct sockaddr_in)); + + rc = ip_connect(tcp_ses); + if (rc < 0) { + cERROR(1, "Error connecting to socket. Aborting operation"); + goto out_err_crypto_release; + } + + /* + * since we're in a cifs function already, we know that + * this will succeed. No need for try_module_get(). + */ + __module_get(THIS_MODULE); + tcp_ses->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, + tcp_ses, "cifsd"); + if (IS_ERR(tcp_ses->tsk)) { + rc = PTR_ERR(tcp_ses->tsk); + cERROR(1, "error %d create cifsd thread", rc); + module_put(THIS_MODULE); + goto out_err_crypto_release; + } + tcp_ses->tcpStatus = CifsNeedNegotiate; + + /* thread spawned, put it on the list */ + spin_lock(&cifs_tcp_ses_lock); + list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + cifs_fscache_get_client_cookie(tcp_ses); + + /* queue echo request delayed work */ + queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + + return tcp_ses; + +out_err_crypto_release: + cifs_crypto_shash_release(tcp_ses); + + put_net(cifs_net_ns(tcp_ses)); + +out_err: + if (tcp_ses) { + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); + if (tcp_ses->ssocket) + sock_release(tcp_ses->ssocket); + kfree(tcp_ses); + } + return ERR_PTR(rc); +} + +static int match_session(struct cifs_ses *ses, struct smb_vol *vol) +{ + switch (ses->server->secType) { + case Kerberos: + if (vol->cred_uid != ses->cred_uid) + return 0; + break; + default: + /* anything else takes username/password */ + if (ses->user_name == NULL) + return 0; + if (strncmp(ses->user_name, vol->username, + MAX_USERNAME_SIZE)) + return 0; + if (strlen(vol->username) != 0 && + ses->password != NULL && + strncmp(ses->password, + vol->password ? 
vol->password : "", + MAX_PASSWORD_SIZE)) + return 0; + } + return 1; +} + +static struct cifs_ses * +cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (!match_session(ses, vol)) + continue; + ++ses->ses_count; + spin_unlock(&cifs_tcp_ses_lock); + return ses; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_smb_ses(struct cifs_ses *ses) +{ + int xid; + struct TCP_Server_Info *server = ses->server; + + cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count); + spin_lock(&cifs_tcp_ses_lock); + if (--ses->ses_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + list_del_init(&ses->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + if (ses->status == CifsGood) { + xid = GetXid(); + CIFSSMBLogoff(xid, ses); + _FreeXid(xid); + } + sesInfoFree(ses); + cifs_put_tcp_session(server); +} + +static bool warned_on_ntlm; /* globals init to false automatically */ + +static struct cifs_ses * +cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) +{ + int rc = -ENOMEM, xid; + struct cifs_ses *ses; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + + xid = GetXid(); + + ses = cifs_find_smb_ses(server, volume_info); + if (ses) { + cFYI(1, "Existing smb sess found (status=%d)", ses->status); + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses); + if (rc) { + mutex_unlock(&ses->session_mutex); + /* problem -- put our ses reference */ + cifs_put_smb_ses(ses); + FreeXid(xid); + return ERR_PTR(rc); + } + if (ses->need_reconnect) { + cFYI(1, "Session needs reconnect"); + rc = cifs_setup_session(xid, ses, + volume_info->local_nls); + if (rc) { + mutex_unlock(&ses->session_mutex); + /* problem -- put our reference */ + cifs_put_smb_ses(ses); + FreeXid(xid); + return ERR_PTR(rc); + } + } + mutex_unlock(&ses->session_mutex); + + /* existing SMB ses has a server reference already */ + cifs_put_tcp_session(server); + FreeXid(xid); + return ses; + } + + cFYI(1, "Existing smb sess not found"); + ses = sesInfoAlloc(); + if (ses == NULL) + goto get_ses_fail; + + /* new SMB session uses our server ref */ + ses->server = server; + if (server->dstaddr.ss_family == AF_INET6) + sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); + else + sprintf(ses->serverName, "%pI4", &addr->sin_addr); + + if (volume_info->username) { + ses->user_name = kstrdup(volume_info->username, GFP_KERNEL); + if (!ses->user_name) + goto get_ses_fail; + } + + /* volume_info->password freed at unmount */ + if (volume_info->password) { + ses->password = kstrdup(volume_info->password, GFP_KERNEL); + if (!ses->password) + goto get_ses_fail; + } + if (volume_info->domainname) { + ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL); + if (!ses->domainName) + goto get_ses_fail; + } + ses->cred_uid = volume_info->cred_uid; + ses->linux_uid = volume_info->linux_uid; + + /* ntlmv2 is much stronger than ntlm security, and has been broadly + supported for many years, time to update default security mechanism */ + if ((volume_info->secFlg == 0) && warned_on_ntlm == false) { + warned_on_ntlm = true; + cERROR(1, "default security mechanism requested. 
The default " + "security mechanism will be upgraded from ntlm to " + "ntlmv2 in kernel release 3.1"); + } + ses->overrideSecFlg = volume_info->secFlg; + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses); + if (!rc) + rc = cifs_setup_session(xid, ses, volume_info->local_nls); + mutex_unlock(&ses->session_mutex); + if (rc) + goto get_ses_fail; + + /* success, put it on the list */ + spin_lock(&cifs_tcp_ses_lock); + list_add(&ses->smb_ses_list, &server->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + FreeXid(xid); + return ses; + +get_ses_fail: + sesInfoFree(ses); + FreeXid(xid); + return ERR_PTR(rc); +} + +static int match_tcon(struct cifs_tcon *tcon, const char *unc) +{ + if (tcon->tidStatus == CifsExiting) + return 0; + if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE)) + return 0; + return 1; +} + +static struct cifs_tcon * +cifs_find_tcon(struct cifs_ses *ses, const char *unc) +{ + struct list_head *tmp; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &ses->tcon_list) { + tcon = list_entry(tmp, struct cifs_tcon, tcon_list); + if (!match_tcon(tcon, unc)) + continue; + ++tcon->tc_count; + spin_unlock(&cifs_tcp_ses_lock); + return tcon; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_tcon(struct cifs_tcon *tcon) +{ + int xid; + struct cifs_ses *ses = tcon->ses; + + cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count); + spin_lock(&cifs_tcp_ses_lock); + if (--tcon->tc_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + list_del_init(&tcon->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + + xid = GetXid(); + CIFSSMBTDis(xid, tcon); + _FreeXid(xid); + + cifs_fscache_release_super_cookie(tcon); + tconInfoFree(tcon); + cifs_put_smb_ses(ses); +} + +static struct cifs_tcon * +cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) +{ + int rc, xid; + struct cifs_tcon *tcon; + + tcon = cifs_find_tcon(ses, volume_info->UNC); + if (tcon) { + cFYI(1, "Found match on UNC path"); + /* existing tcon already has a reference */ + cifs_put_smb_ses(ses); + if (tcon->seal != volume_info->seal) + cERROR(1, "transport encryption setting " + "conflicts with existing tid"); + return tcon; + } + + tcon = tconInfoAlloc(); + if (tcon == NULL) { + rc = -ENOMEM; + goto out_fail; + } + + tcon->ses = ses; + if (volume_info->password) { + tcon->password = kstrdup(volume_info->password, GFP_KERNEL); + if (!tcon->password) { + rc = -ENOMEM; + goto out_fail; + } + } + + if (strchr(volume_info->UNC + 3, '\\') == NULL + && strchr(volume_info->UNC + 3, '/') == NULL) { + cERROR(1, "Missing share name"); + rc = -ENODEV; + goto out_fail; + } + + /* BB Do we need to wrap session_mutex around + * this TCon call and Unix SetFS as + * we do on SessSetup and reconnect? 
*/ + xid = GetXid(); + rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls); + FreeXid(xid); + cFYI(1, "CIFS Tcon rc = %d", rc); + if (rc) + goto out_fail; + + if (volume_info->nodfs) { + tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; + cFYI(1, "DFS disabled (%d)", tcon->Flags); + } + tcon->seal = volume_info->seal; + /* we can have only one retry value for a connection + to a share so for resources mounted more than once + to the same server share the last value passed in + for the retry flag is used */ + tcon->retry = volume_info->retry; + tcon->nocase = volume_info->nocase; + tcon->local_lease = volume_info->local_lease; + + spin_lock(&cifs_tcp_ses_lock); + list_add(&tcon->tcon_list, &ses->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + + cifs_fscache_get_super_cookie(tcon); + + return tcon; + +out_fail: + tconInfoFree(tcon); + return ERR_PTR(rc); +} + +void +cifs_put_tlink(struct tcon_link *tlink) +{ + if (!tlink || IS_ERR(tlink)) + return; + + if (!atomic_dec_and_test(&tlink->tl_count) || + test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) { + tlink->tl_time = jiffies; + return; + } + + if (!IS_ERR(tlink_tcon(tlink))) + cifs_put_tcon(tlink_tcon(tlink)); + kfree(tlink); + return; +} + +static inline struct tcon_link * +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} + +static int +compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK)) + return 0; + + if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) != + (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) + return 0; + + if (old->rsize != new->rsize) + return 0; + + /* + * We want to share sb only if we don't specify wsize or specified wsize + * is greater or equal than existing one. 
+ */ + if (new->wsize && new->wsize < old->wsize) + return 0; + + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) + return 0; + + if (old->mnt_file_mode != new->mnt_file_mode || + old->mnt_dir_mode != new->mnt_dir_mode) + return 0; + + if (strcmp(old->local_nls->charset, new->local_nls->charset)) + return 0; + + if (old->actimeo != new->actimeo) + return 0; + + return 1; +} + +int +cifs_match_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data; + struct smb_vol *volume_info; + struct cifs_sb_info *cifs_sb; + struct TCP_Server_Info *tcp_srv; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct sockaddr_storage addr; + int rc = 0; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + + spin_lock(&cifs_tcp_ses_lock); + cifs_sb = CIFS_SB(sb); + tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); + if (IS_ERR(tlink)) { + spin_unlock(&cifs_tcp_ses_lock); + return rc; + } + tcon = tlink_tcon(tlink); + ses = tcon->ses; + tcp_srv = ses->server; + + volume_info = mnt_data->vol; + + if (!volume_info->UNCip || !volume_info->UNC) + goto out; + + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + strlen(volume_info->UNCip), + volume_info->port); + if (!rc) + goto out; + + if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) || + !match_session(ses, volume_info) || + !match_tcon(tcon, volume_info->UNC)) { + rc = 0; + goto out; + } + + rc = compare_mount_options(sb, mnt_data); +out: + spin_unlock(&cifs_tcp_ses_lock); + cifs_put_tlink(tlink); + return rc; +} + +int +get_dfs_path(int xid, struct cifs_ses *pSesInfo, const char *old_path, + const struct nls_table *nls_codepage, unsigned int *pnum_referrals, + struct dfs_info3_param **preferrals, int remap) +{ + char *temp_unc; + int rc = 0; + + *pnum_referrals = 0; + *preferrals = NULL; + + if (pSesInfo->ipc_tid == 0) { + temp_unc = kmalloc(2 /* for slashes */ + + strnlen(pSesInfo->serverName, + SERVER_NAME_LEN_WITH_NULL * 2) + + 1 + 4 /* slash IPC$ */ + 2, + GFP_KERNEL); + if (temp_unc == NULL) + return -ENOMEM; + temp_unc[0] = '\\'; + temp_unc[1] = '\\'; + strcpy(temp_unc + 2, pSesInfo->serverName); + strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$"); + rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage); + cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid); + kfree(temp_unc); + } + if (rc == 0) + rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals, + pnum_referrals, nls_codepage, remap); + /* BB map targetUNCs to dfs_info3 structures, here or + in CIFSGetDFSRefer BB */ + + return rc; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key cifs_key[2]; +static struct lock_class_key cifs_slock_key[2]; + +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", + &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", + &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); +} +#else +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ +} +#endif + +/* See RFC1001 section 14 on representation of Netbios names */ +static void rfc1002mangle(char 
*target, char *source, unsigned int length) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < (length); i++) { + /* mask a nibble at a time and encode */ + target[j] = 'A' + (0x0F & (source[i] >> 4)); + target[j+1] = 'A' + (0x0F & source[i]); + j += 2; + } + +} + +static int +bind_socket(struct TCP_Server_Info *server) +{ + int rc = 0; + if (server->srcaddr.ss_family != AF_UNSPEC) { + /* Bind to the specified local IP address */ + struct socket *socket = server->ssocket; + rc = socket->ops->bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); + if (rc < 0) { + struct sockaddr_in *saddr4; + struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)&server->srcaddr; + saddr6 = (struct sockaddr_in6 *)&server->srcaddr; + if (saddr6->sin6_family == AF_INET6) + cERROR(1, "cifs: " + "Failed to bind to: %pI6c, error: %d\n", + &saddr6->sin6_addr, rc); + else + cERROR(1, "cifs: " + "Failed to bind to: %pI4, error: %d\n", + &saddr4->sin_addr.s_addr, rc); + } + } + return rc; +} + +static int +ip_rfc1001_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + /* + * some servers require RFC1001 sessinit before sending + * negprot - BB check reconnection in case where second + * sessinit is sent but no second negprot + */ + struct rfc1002_session_packet *ses_init_buf; + struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), + GFP_KERNEL); + if (ses_init_buf) { + ses_init_buf->trailer.session_req.called_len = 32; + + if (server->server_RFC1001_name && + server->server_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.calling_len = 32; + + /* + * calling name ends in null (byte 16) from old smb + * convention. + */ + if (server->workstation_RFC1001_name && + server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.scope1 = 0; + ses_init_buf->trailer.session_req.scope2 = 0; + smb_buf = (struct smb_hdr *)ses_init_buf; + + /* sizeof RFC1002_SESSION_REQUEST with no scope */ + smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + rc = smb_send(server, smb_buf, 0x44); + kfree(ses_init_buf); + /* + * RFC1001 layer in at least one server + * requires very short break before negprot + * presumably because not expecting negprot + * to follow so fast. 
This is a simple + * solution that works without + * complicating the code and causes no + * significant slowing down on mount + * for everyone else + */ + usleep_range(1000, 2000); + } + /* + * else the negprot may still work without this + * even though malloc failed + */ + + return rc; +} + +static int +generic_ip_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + __be16 sport; + int slen, sfamily; + struct socket *socket = server->ssocket; + struct sockaddr *saddr; + + saddr = (struct sockaddr *) &server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) { + sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + slen = sizeof(struct sockaddr_in6); + sfamily = AF_INET6; + } else { + sport = ((struct sockaddr_in *) saddr)->sin_port; + slen = sizeof(struct sockaddr_in); + sfamily = AF_INET; + } + + if (socket == NULL) { + rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, + IPPROTO_TCP, &socket, 1); + if (rc < 0) { + cERROR(1, "Error %d creating socket", rc); + server->ssocket = NULL; + return rc; + } + + /* BB other socket options to set KEEPALIVE, NODELAY? */ + cFYI(1, "Socket created"); + server->ssocket = socket; + socket->sk->sk_allocation = GFP_NOFS; + if (sfamily == AF_INET6) + cifs_reclassify_socket6(socket); + else + cifs_reclassify_socket4(socket); + } + + rc = bind_socket(server); + if (rc < 0) + return rc; + + /* + * Eventually check for other socket options to change from + * the default. sock_setsockopt not used because it expects + * user space buffer + */ + socket->sk->sk_rcvtimeo = 7 * HZ; + socket->sk->sk_sndtimeo = 5 * HZ; + + /* make the bufsizes depend on wsize/rsize and max requests */ + if (server->noautotune) { + if (socket->sk->sk_sndbuf < (200 * 1024)) + socket->sk->sk_sndbuf = 200 * 1024; + if (socket->sk->sk_rcvbuf < (140 * 1024)) + socket->sk->sk_rcvbuf = 140 * 1024; + } + + if (server->tcp_nodelay) { + int val = 1; + rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rc) + cFYI(1, "set TCP_NODELAY socket option error %d", rc); + } + + cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx", + socket->sk->sk_sndbuf, + socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); + + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); + sock_release(socket); + server->ssocket = NULL; + return rc; + } + + if (sport == htons(RFC1001_PORT)) + rc = ip_rfc1001_connect(server); + + return rc; +} + +static int +ip_connect(struct TCP_Server_Info *server) +{ + __be16 *sport; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) + sport = &addr6->sin6_port; + else + sport = &addr->sin_port; + + if (*sport == 0) { + int rc; + + /* try with 445 port at first */ + *sport = htons(CIFS_PORT); + + rc = generic_ip_connect(server); + if (rc >= 0) + return rc; + + /* if it failed, try with 139 port */ + *sport = htons(RFC1001_PORT); + } + + return generic_ip_connect(server); +} + +void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info) +{ + /* if we are reconnecting then should we check to see if + * any requested capabilities changed locally e.g. 
via + * remount but we can not do much about it here + * if they have (even if we could detect it by the following) + * Perhaps we could add a backpointer to array of sb from tcon + * or if we change to make all sb to same share the same + * sb as NFS - then we only have one backpointer to sb. + * What if we wanted to mount the server share twice once with + * and once without posixacls or posix paths? */ + __u64 saved_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (vol_info && vol_info->no_linux_ext) { + tcon->fsUnixInfo.Capability = 0; + tcon->unix_ext = 0; /* Unix Extensions disabled */ + cFYI(1, "Linux protocol extensions disabled"); + return; + } else if (vol_info) + tcon->unix_ext = 1; /* Unix Extensions supported */ + + if (tcon->unix_ext == 0) { + cFYI(1, "Unix extensions disabled so not set on reconnect"); + return; + } + + if (!CIFSSMBQFSUnixInfo(xid, tcon)) { + __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + cFYI(1, "unix caps which server supports %lld", cap); + /* check for reconnect case in which we do not + want to change the mount behavior if we can avoid it */ + if (vol_info == NULL) { + /* turn off POSIX ACL and PATHNAMES if not set + originally at mount time */ + if ((saved_cap & CIFS_UNIX_POSIX_ACL_CAP) == 0) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { + if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cERROR(1, "POSIXPATH support change"); + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { + cERROR(1, "possible reconnect error"); + cERROR(1, "server disabled POSIX path support"); + } + } + + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) + cERROR(1, "per-share encryption not supported yet"); + + cap &= CIFS_UNIX_CAP_MASK; + if (vol_info && vol_info->no_psx_acl) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { + cFYI(1, "negotiated posix acl support"); + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIXACL; + } + + if (vol_info && vol_info->posix_paths == 0) + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { + cFYI(1, "negotiate posix pathnames"); + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIX_PATHS; + } + + if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { + if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { + cifs_sb->rsize = 127 * 1024; + cFYI(DBG2, "larger reads not supported by srv"); + } + } + + + cFYI(1, "Negotiate caps 0x%x", (int)cap); +#ifdef CONFIG_CIFS_DEBUG2 + if (cap & CIFS_UNIX_FCNTL_CAP) + cFYI(1, "FCNTL cap"); + if (cap & CIFS_UNIX_EXTATTR_CAP) + cFYI(1, "EXTATTR cap"); + if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cFYI(1, "POSIX path cap"); + if (cap & CIFS_UNIX_XATTR_CAP) + cFYI(1, "XATTR cap"); + if (cap & CIFS_UNIX_POSIX_ACL_CAP) + cFYI(1, "POSIX ACL cap"); + if (cap & CIFS_UNIX_LARGE_READ_CAP) + cFYI(1, "very large read cap"); + if (cap & CIFS_UNIX_LARGE_WRITE_CAP) + cFYI(1, "very large write cap"); + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP) + cFYI(1, "transport encryption cap"); + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) + cFYI(1, "mandatory transport encryption cap"); +#endif /* CIFS_DEBUG2 */ + if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { + if (vol_info == NULL) { + cFYI(1, "resetting capabilities failed"); + } else + cERROR(1, "Negotiating Unix capabilities " + "with the server failed. 
Consider " + "mounting with the Unix Extensions\n" + "disabled, if problems are found, " + "by specifying the nounix mount " + "option."); + + } + } +} + +void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + struct cifs_sb_info *cifs_sb) +{ + INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + + spin_lock_init(&cifs_sb->tlink_tree_lock); + cifs_sb->tlink_tree = RB_ROOT; + + if (pvolume_info->rsize > CIFSMaxBufSize) { + cERROR(1, "rsize %d too large, using MaxBufSize", + pvolume_info->rsize); + cifs_sb->rsize = CIFSMaxBufSize; + } else if ((pvolume_info->rsize) && + (pvolume_info->rsize <= CIFSMaxBufSize)) + cifs_sb->rsize = pvolume_info->rsize; + else /* default */ + cifs_sb->rsize = CIFSMaxBufSize; + + if (cifs_sb->rsize < 2048) { + cifs_sb->rsize = 2048; + /* Windows ME may prefer this */ + cFYI(1, "readsize set to minimum: 2048"); + } + + /* + * Temporarily set wsize for matching superblock. If we end up using + * new sb then cifs_negotiate_wsize will later negotiate it downward + * if needed. + */ + cifs_sb->wsize = pvolume_info->wsize; + + cifs_sb->mnt_uid = pvolume_info->linux_uid; + cifs_sb->mnt_gid = pvolume_info->linux_gid; + cifs_sb->mnt_file_mode = pvolume_info->file_mode; + cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; + cFYI(1, "file mode: 0x%x dir mode: 0x%x", + cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + + cifs_sb->actimeo = pvolume_info->actimeo; + cifs_sb->local_nls = pvolume_info->local_nls; + + if (pvolume_info->noperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; + if (pvolume_info->setuids) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->server_ino) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; + if (pvolume_info->remap) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; + if (pvolume_info->no_xattr) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; + if (pvolume_info->sfu_emul) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; + if (pvolume_info->nobrl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; + if (pvolume_info->nostrictsync) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; + if (pvolume_info->mand_lock) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; + if (pvolume_info->rwpidforward) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; + if (pvolume_info->cifs_acl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if (pvolume_info->override_uid) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; + if (pvolume_info->override_gid) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; + if (pvolume_info->dynperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + if (pvolume_info->fsc) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; + if (pvolume_info->multiuser) + cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | + CIFS_MOUNT_NO_PERM); + if (pvolume_info->strict_io) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; + if (pvolume_info->direct_io) { + cFYI(1, "mounting share using direct i/o"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; + } + if (pvolume_info->mfsymlinks) { + if (pvolume_info->sfu_emul) { + cERROR(1, "mount option mfsymlinks ignored if sfu " + "mount option is used"); + } else { + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; + } + } + + if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) + cERROR(1, "mount option dynperm ignored if cifsacl " + "mount option supported"); +} + +/* + * When the server supports very large writes via POSIX extensions, we can + * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including + * the RFC1001 length. 
+ * + * Note that this might make for "interesting" allocation problems during + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + */ +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) + +/* + * When the server doesn't allow large posix writes, only allow a wsize of + * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up + * to the maximum size described by RFC1002. + */ +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) + +/* + * The default wsize is 1M. find_get_pages seems to return a maximum of 256 + * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill + * a single wsize request with a single call. + */ +#define CIFS_DEFAULT_WSIZE (1024 * 1024) + +static unsigned int +cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : + CIFS_DEFAULT_WSIZE; + + /* can server support 24-bit write sizes? (via UNIX extensions) */ + if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); + + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && + (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); + + /* hard limit of CIFS_MAX_WSIZE */ + wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); + + return wsize; +} + +static int +is_path_accessible(int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + FILE_ALL_INFO *pfile_info; + + pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (pfile_info == NULL) + return -ENOMEM; + + rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info, + 0 /* not legacy */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == -EOPNOTSUPP || rc == -EINVAL) + rc = SMBQueryInformation(xid, tcon, full_path, pfile_info, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + kfree(pfile_info); + return rc; +} + +static void +cleanup_volume_info_contents(struct smb_vol *volume_info) +{ + kfree(volume_info->username); + kzfree(volume_info->password); + kfree(volume_info->UNC); + if (volume_info->UNCip != volume_info->UNC + 2) + kfree(volume_info->UNCip); + kfree(volume_info->domainname); + kfree(volume_info->iocharset); + kfree(volume_info->prepath); +} + +void +cifs_cleanup_volume_info(struct smb_vol *volume_info) +{ + if (!volume_info) + return; + cleanup_volume_info_contents(volume_info); + kfree(volume_info); +} + + +#ifdef CONFIG_CIFS_DFS_UPCALL +/* build_path_to_root returns full path to root when + * we do not have an exiting connection (tcon) */ +static char * +build_unc_path_to_root(const struct smb_vol *vol, + const struct cifs_sb_info *cifs_sb) +{ + char *full_path, *pos; + unsigned int pplen = vol->prepath ? 
strlen(vol->prepath) : 0; + unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); + + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + strncpy(full_path, vol->UNC, unc_len); + pos = full_path + unc_len; + + if (pplen) { + strncpy(pos, vol->prepath, pplen); + pos += pplen; + } + + *pos = '\0'; /* add trailing null */ + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + cFYI(1, "%s: full_path=%s", __func__, full_path); + return full_path; +} + +/* + * Perform a dfs referral query for a share and (optionally) prefix + * + * If a referral is found, cifs_sb->mountdata will be (re-)allocated + * to a string containing updated options for the submount. Otherwise it + * will be left untouched. + * + * Returns the rc from get_dfs_path to the caller, which can be used to + * determine whether there were referrals. + */ +static int +expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, + struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb, + int check_prefix) +{ + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + char *full_path = NULL, *ref_path = NULL, *mdata = NULL; + + full_path = build_unc_path_to_root(volume_info, cifs_sb); + if (IS_ERR(full_path)) + return PTR_ERR(full_path); + + /* For DFS paths, skip the first '\' of the UNC */ + ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; + + rc = get_dfs_path(xid, pSesInfo , ref_path, cifs_sb->local_nls, + &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (!rc && num_referrals > 0) { + char *fake_devname = NULL; + + mdata = cifs_compose_mount_options(cifs_sb->mountdata, + full_path + 1, referrals, + &fake_devname); + + free_dfs_info_array(referrals, num_referrals); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else { + cleanup_volume_info_contents(volume_info); + memset(volume_info, '\0', sizeof(*volume_info)); + rc = cifs_setup_volume_info(volume_info, mdata, + fake_devname); + } + kfree(fake_devname); + kfree(cifs_sb->mountdata); + cifs_sb->mountdata = mdata; + } + kfree(full_path); + return rc; +} +#endif + +static int +cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname) +{ + int rc = 0; + + if (cifs_parse_mount_options(mount_data, devname, volume_info)) + return -EINVAL; + + if (volume_info->nullauth) { + cFYI(1, "null user"); + volume_info->username = kzalloc(1, GFP_KERNEL); + if (volume_info->username == NULL) + return -ENOMEM; + } else if (volume_info->username) { + /* BB fixme parse for domain name here */ + cFYI(1, "Username: %s", volume_info->username); + } else { + cifserror("No username specified"); + /* In userspace mount helper we can get user name from alternate + locations such as env variables and files on disk */ + return -EINVAL; + } + + /* this is needed for ASCII cp to Unicode converts */ + if (volume_info->iocharset == NULL) { + /* load_nls_default cannot return null */ + volume_info->local_nls = load_nls_default(); + } else { + volume_info->local_nls = load_nls(volume_info->iocharset); + if (volume_info->local_nls == NULL) { + cERROR(1, "CIFS mount error: iocharset %s not found", + volume_info->iocharset); + return -ELIBACC; + } + } + + return rc; +} + +struct smb_vol * +cifs_get_volume_info(char *mount_data, const char *devname) +{ + int rc; + struct smb_vol *volume_info; + + volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + if (!volume_info) + return ERR_PTR(-ENOMEM); + + rc = 
cifs_setup_volume_info(volume_info, mount_data, devname); + if (rc) { + cifs_cleanup_volume_info(volume_info); + volume_info = ERR_PTR(rc); + } + + return volume_info; +} + +int +cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) +{ + int rc; + int xid; + struct cifs_ses *pSesInfo; + struct cifs_tcon *tcon; + struct TCP_Server_Info *srvTcp; + char *full_path; + struct tcon_link *tlink; +#ifdef CONFIG_CIFS_DFS_UPCALL + int referral_walks_count = 0; +#endif + + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) + return rc; + + cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; + +#ifdef CONFIG_CIFS_DFS_UPCALL +try_mount_again: + /* cleanup activities if we're chasing a referral */ + if (referral_walks_count) { + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + + FreeXid(xid); + } +#endif + rc = 0; + tcon = NULL; + pSesInfo = NULL; + srvTcp = NULL; + full_path = NULL; + tlink = NULL; + + xid = GetXid(); + + /* get a reference to a tcp session */ + srvTcp = cifs_get_tcp_session(volume_info); + if (IS_ERR(srvTcp)) { + rc = PTR_ERR(srvTcp); + bdi_destroy(&cifs_sb->bdi); + goto out; + } + + /* get a reference to a SMB session */ + pSesInfo = cifs_get_smb_ses(srvTcp, volume_info); + if (IS_ERR(pSesInfo)) { + rc = PTR_ERR(pSesInfo); + pSesInfo = NULL; + goto mount_fail_check; + } + + /* search for existing tcon to this server share */ + tcon = cifs_get_tcon(pSesInfo, volume_info); + if (IS_ERR(tcon)) { + rc = PTR_ERR(tcon); + tcon = NULL; + goto remote_path_check; + } + + /* tell server which Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) { + /* reset of caps checks mount to see if unix extensions + disabled for just this mount */ + reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); + if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && + (le64_to_cpu(tcon->fsUnixInfo.Capability) & + CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { + rc = -EACCES; + goto mount_fail_check; + } + } else + tcon->unix_ext = 0; /* server does not support them */ + + /* do not care if following two calls succeed - informational */ + if (!tcon->ipc) { + CIFSSMBQFSDeviceInfo(xid, tcon); + CIFSSMBQFSAttributeInfo(xid, tcon); + } + + if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { + cifs_sb->rsize = 1024 * 127; + cFYI(DBG2, "no very large read support, rsize now 127K"); + } + if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) + cifs_sb->rsize = min(cifs_sb->rsize, + (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + + cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + +remote_path_check: +#ifdef CONFIG_CIFS_DFS_UPCALL + /* + * Perform an unconditional check for whether there are DFS + * referrals for this path without prefix, to provide support + * for DFS referrals from w2k8 servers which don't seem to respond + * with PATH_NOT_COVERED to requests that include the prefix. + * Chase the referral if found, otherwise continue normally. 
+ */ + if (referral_walks_count == 0) { + int refrc = expand_dfs_referral(xid, pSesInfo, volume_info, + cifs_sb, false); + if (!refrc) { + referral_walks_count++; + goto try_mount_again; + } + } +#endif + + /* check if a whole path is not remote */ + if (!rc && tcon) { + /* build_path_to_root works only when we have a valid tcon */ + full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); + if (full_path == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + rc = is_path_accessible(xid, tcon, cifs_sb, full_path); + if (rc != 0 && rc != -EREMOTE) { + kfree(full_path); + goto mount_fail_check; + } + kfree(full_path); + } + + /* get referral if needed */ + if (rc == -EREMOTE) { +#ifdef CONFIG_CIFS_DFS_UPCALL + if (referral_walks_count > MAX_NESTED_LINKS) { + /* + * BB: when we implement proper loop detection, + * we will remove this check. But now we need it + * to prevent an indefinite loop if 'DFS tree' is + * misconfigured (i.e. has loops). + */ + rc = -ELOOP; + goto mount_fail_check; + } + + rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb, + true); + + if (!rc) { + referral_walks_count++; + goto try_mount_again; + } + goto mount_fail_check; +#else /* No DFS support, return error on mount */ + rc = -EOPNOTSUPP; +#endif + } + + if (rc) + goto mount_fail_check; + + /* now, hang the tcon off of the superblock */ + tlink = kzalloc(sizeof *tlink, GFP_KERNEL); + if (tlink == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + + tlink->tl_uid = pSesInfo->linux_uid; + tlink->tl_tcon = tcon; + tlink->tl_time = jiffies; + set_bit(TCON_LINK_MASTER, &tlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + + cifs_sb->master_tlink = tlink; + spin_lock(&cifs_sb->tlink_tree_lock); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); + +mount_fail_check: + /* on error free sesinfo and tcon struct if needed */ + if (rc) { + /* If find_unc succeeded then rc == 0 so we can not end */ + /* up accidentally freeing someone elses tcon struct */ + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + else + cifs_put_tcp_session(srvTcp); + bdi_destroy(&cifs_sb->bdi); + goto out; + } + + /* volume_info->password is freed above when existing session found + (in which case it is not needed anymore) but when new sesion is created + the password ptr is put in the new session structure (in which case the + password will be freed at unmount time) */ +out: + /* zero out password before freeing */ + FreeXid(xid); + return rc; +} + +/* + * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon + * pointer may be NULL. 
+ */ +int +CIFSTCon(unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *nls_codepage) +{ + struct smb_hdr *smb_buffer; + struct smb_hdr *smb_buffer_response; + TCONX_REQ *pSMB; + TCONX_RSP *pSMBr; + unsigned char *bcc_ptr; + int rc = 0; + int length; + __u16 bytes_left, count; + + if (ses == NULL) + return -EIO; + + smb_buffer = cifs_buf_get(); + if (smb_buffer == NULL) + return -ENOMEM; + + smb_buffer_response = smb_buffer; + + header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, + NULL /*no tid */ , 4 /*wct */ ); + + smb_buffer->Mid = GetNextMid(ses->server); + smb_buffer->Uid = ses->Suid; + pSMB = (TCONX_REQ *) smb_buffer; + pSMBr = (TCONX_RSP *) smb_buffer_response; + + pSMB->AndXCommand = 0xFF; + pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); + bcc_ptr = &pSMB->Password[0]; + if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ + } else { + pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + /* BB FIXME add code to fail this if NTLMv2 or Kerberos + specified as required (when that support is added to + the vfs in the future) as only NTLM or the much + weaker LANMAN (which we do not send by default) is accepted + by Samba (not sure whether other servers allow + NTLMv2 password here) */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if ((global_secflags & CIFSSEC_MAY_LANMAN) && + (ses->server->secType == LANMAN)) + calc_lanman_hash(tcon->password, ses->server->cryptkey, + ses->server->sec_mode & + SECMODE_PW_ENCRYPT ? true : false, + bcc_ptr); + else +#endif /* CIFS_WEAK_PW_HASH */ + rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, + bcc_ptr); + + bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (ses->capabilities & CAP_UNICODE) { + /* must align unicode strings */ + *bcc_ptr = 0; /* null byte password */ + bcc_ptr++; + } + } + + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_STATUS32) { + smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; + } + if (ses->capabilities & CAP_DFS) { + smb_buffer->Flags2 |= SMBFLG2_DFS; + } + if (ses->capabilities & CAP_UNICODE) { + smb_buffer->Flags2 |= SMBFLG2_UNICODE; + length = + cifs_strtoUCS((__le16 *) bcc_ptr, tree, + 6 /* max utf8 char length in bytes */ * + (/* server len*/ + 256 /* share len */), nls_codepage); + bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */ + bcc_ptr += 2; /* skip trailing null */ + } else { /* ASCII */ + strcpy(bcc_ptr, tree); + bcc_ptr += strlen(tree) + 1; + } + strcpy(bcc_ptr, "?????"); + bcc_ptr += strlen("?????"); + bcc_ptr += 1; + count = bcc_ptr - &pSMB->Password[0]; + pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu( + pSMB->hdr.smb_buf_length) + count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, + 0); + + /* above now done in SendReceive */ + if ((rc == 0) && (tcon != NULL)) { + bool is_unicode; + + tcon->tidStatus = CifsGood; + tcon->need_reconnect = false; + tcon->tid = smb_buffer_response->Tid; + bcc_ptr = pByteArea(smb_buffer_response); + bytes_left = get_bcc(smb_buffer_response); + length = strnlen(bcc_ptr, bytes_left - 2); + if (smb_buffer->Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + + /* skip service field (NB: this field is always ASCII) */ + if (length 
== 3) { + if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && + (bcc_ptr[2] == 'C')) { + cFYI(1, "IPC connection"); + tcon->ipc = 1; + } + } else if (length == 2) { + if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { + /* the most common case */ + cFYI(1, "disk share connection"); + } + } + bcc_ptr += length + 1; + bytes_left -= (length + 1); + strncpy(tcon->treeName, tree, MAX_TREE_SIZE); + + /* mostly informational -- no need to fail on error here */ + kfree(tcon->nativeFileSystem); + tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, + bytes_left, is_unicode, + nls_codepage); + + cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem); + + if ((smb_buffer_response->WordCount == 3) || + (smb_buffer_response->WordCount == 7)) + /* field is in same location */ + tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); + else + tcon->Flags = 0; + cFYI(1, "Tcon flags: 0x%x ", tcon->Flags); + } else if ((rc == 0) && tcon == NULL) { + /* all we need to save for IPC$ connection */ + ses->ipc_tid = smb_buffer_response->Tid; + } + + cifs_buf_release(smb_buffer); + return rc; +} + +void +cifs_umount(struct cifs_sb_info *cifs_sb) +{ + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; + + cancel_delayed_work_sync(&cifs_sb->prune_tlinks); + + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + bdi_destroy(&cifs_sb->bdi); + kfree(cifs_sb->mountdata); + unload_nls(cifs_sb->local_nls); + kfree(cifs_sb); +} + +int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + + /* only send once per connect */ + if (server->maxBuf != 0) + return 0; + + rc = CIFSSMBNegotiate(xid, ses); + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ + rc = CIFSSMBNegotiate(xid, ses); + if (rc == -EAGAIN) + rc = -EHOSTDOWN; + } + if (rc == 0) { + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus == CifsNeedNegotiate) + server->tcpStatus = CifsGood; + else + rc = -EHOSTDOWN; + spin_unlock(&GlobalMid_Lock); + + } + + return rc; +} + + +int cifs_setup_session(unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + + ses->flags = 0; + ses->capabilities = server->capabilities; + if (linuxExtEnabled == 0) + ses->capabilities &= (~CAP_UNIX); + + cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", + server->sec_mode, server->capabilities, server->timeAdj); + + rc = CIFS_SessSetup(xid, ses, nls_info); + if (rc) { + cERROR(1, "Send error in SessSetup = %d", rc); + } else { + mutex_lock(&ses->server->srv_mutex); + if (!server->session_estab) { + server->session_key.response = ses->auth_key.response; + server->session_key.len = ses->auth_key.len; + server->sequence_number = 0x2; + server->session_estab = true; + ses->auth_key.response = NULL; + } + mutex_unlock(&server->srv_mutex); + + cFYI(1, "CIFS Session Established successfully"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + kfree(ses->ntlmssp); + 
ses->ntlmssp = NULL; + + return rc; +} + +static struct cifs_tcon * +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) +{ + struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); + struct cifs_ses *ses; + struct cifs_tcon *tcon = NULL; + struct smb_vol *vol_info; + char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */ + /* We used to have this as MAX_USERNAME which is */ + /* way too big now (256 instead of 32) */ + + vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); + if (vol_info == NULL) { + tcon = ERR_PTR(-ENOMEM); + goto out; + } + + snprintf(username, sizeof(username), "krb50x%x", fsuid); + vol_info->username = username; + vol_info->local_nls = cifs_sb->local_nls; + vol_info->linux_uid = fsuid; + vol_info->cred_uid = fsuid; + vol_info->UNC = master_tcon->treeName; + vol_info->retry = master_tcon->retry; + vol_info->nocase = master_tcon->nocase; + vol_info->local_lease = master_tcon->local_lease; + vol_info->no_linux_ext = !master_tcon->unix_ext; + + /* FIXME: allow for other secFlg settings */ + vol_info->secFlg = CIFSSEC_MUST_KRB5; + + /* get a reference for the same TCP session */ + spin_lock(&cifs_tcp_ses_lock); + ++master_tcon->ses->server->srv_count; + spin_unlock(&cifs_tcp_ses_lock); + + ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); + if (IS_ERR(ses)) { + tcon = (struct cifs_tcon *)ses; + cifs_put_tcp_session(master_tcon->ses->server); + goto out; + } + + tcon = cifs_get_tcon(ses, vol_info); + if (IS_ERR(tcon)) { + cifs_put_smb_ses(ses); + goto out; + } + + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, vol_info); +out: + kfree(vol_info); + + return tcon; +} + +struct cifs_tcon * +cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) +{ + return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); +} + +static int +cifs_sb_tcon_pending_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + +/* + * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the + * current task. + * + * If the superblock doesn't refer to a multiuser mount, then just return + * the master tcon for the mount. + * + * First, search the rbtree for an existing tcon for this fsuid. If one + * exists, then check to see if it's pending construction. If it is then wait + * for construction to complete. Once it's no longer pending, check to see if + * it failed and either return an error or retry construction, depending on + * the timeout. + * + * If one doesn't exist then insert a new tcon_link struct into the tree and + * try to construct a new one. 
+ */ +struct tcon_link * +cifs_sb_tlink(struct cifs_sb_info *cifs_sb) +{ + int ret; + uid_t fsuid = current_fsuid(); + struct tcon_link *tlink, *newtlink; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); + + spin_lock(&cifs_sb->tlink_tree_lock); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); + if (tlink) + cifs_get_tlink(tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + if (tlink == NULL) { + newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); + if (newtlink == NULL) + return ERR_PTR(-ENOMEM); + newtlink->tl_uid = fsuid; + newtlink->tl_tcon = ERR_PTR(-EACCES); + set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); + cifs_get_tlink(newtlink); + + spin_lock(&cifs_sb->tlink_tree_lock); + /* was one inserted after previous search? */ + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); + if (tlink) { + cifs_get_tlink(tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + kfree(newtlink); + goto wait_for_construction; + } + tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + } else { +wait_for_construction: + ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, + cifs_sb_tcon_pending_wait, + TASK_INTERRUPTIBLE); + if (ret) { + cifs_put_tlink(tlink); + return ERR_PTR(ret); + } + + /* if it's good, return it */ + if (!IS_ERR(tlink->tl_tcon)) + return tlink; + + /* return error if we tried this already recently */ + if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) { + cifs_put_tlink(tlink); + return ERR_PTR(-EACCES); + } + + if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags)) + goto wait_for_construction; + } + + tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid); + clear_bit(TCON_LINK_PENDING, &tlink->tl_flags); + wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING); + + if (IS_ERR(tlink->tl_tcon)) { + cifs_put_tlink(tlink); + return ERR_PTR(-EACCES); + } + + return tlink; +} + +/* + * periodic workqueue job that scans tcon_tree for a superblock and closes + * out tcons. + */ +static void +cifs_prune_tlinks(struct work_struct *work) +{ + struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, + prune_tlinks.work); + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; + + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. 
+ */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; + + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); +} diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c new file mode 100644 index 00000000..ed5c07b0 --- /dev/null +++ b/fs/cifs/dir.c @@ -0,0 +1,742 @@ +/* + * fs/cifs/dir.c + * + * vfs operations that deal with dentries + * + * Copyright (C) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +static void +renew_parental_timestamps(struct dentry *direntry) +{ + /* BB check if there is a way to get the kernel to do this or if we + really need this */ + do { + direntry->d_time = jiffies; + direntry = direntry->d_parent; + } while (!IS_ROOT(direntry)); +} + +/* Note: caller must free return buffer */ +char * +build_path_from_dentry(struct dentry *direntry) +{ + struct dentry *temp; + int namelen; + int dfsplen; + char *full_path; + char dirsep; + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + unsigned seq; + + if (direntry == NULL) + return NULL; /* not much we can do if dentry is freed and + we need to reopen the file after it was closed implicitly + when the server crashed */ + + dirsep = CIFS_DIR_SEP(cifs_sb); + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; +cifs_bp_rename_retry: + namelen = dfsplen; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + for (temp = direntry; !IS_ROOT(temp);) { + namelen += (1 + temp->d_name.len); + temp = temp->d_parent; + if (temp == NULL) { + cERROR(1, "corrupt dentry"); + rcu_read_unlock(); + return NULL; + } + } + rcu_read_unlock(); + + full_path = kmalloc(namelen+1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + full_path[namelen] = 0; /* trailing null */ + rcu_read_lock(); + for (temp = direntry; !IS_ROOT(temp);) { + spin_lock(&temp->d_lock); + namelen -= 1 + temp->d_name.len; + if (namelen < 0) { + spin_unlock(&temp->d_lock); + break; + } 
else { + full_path[namelen] = dirsep; + strncpy(full_path + namelen + 1, temp->d_name.name, + temp->d_name.len); + cFYI(0, "name: %s", full_path + namelen); + } + spin_unlock(&temp->d_lock); + temp = temp->d_parent; + if (temp == NULL) { + cERROR(1, "corrupt dentry"); + rcu_read_unlock(); + kfree(full_path); + return NULL; + } + } + rcu_read_unlock(); + if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { + cFYI(1, "did not end path lookup where expected. namelen=%d " + "dfsplen=%d", namelen, dfsplen); + /* presumably this is only possible if racing with a rename + of one of the parent directories (we can not lock the dentries + above us to prevent this, but retrying should be harmless) */ + kfree(full_path); + goto cifs_bp_rename_retry; + } + /* DIR_SEP already set for byte 0 / vs \ but not for + subsequent slashes in prepath which currently must + be entered the right way - not sure if there is an alternative + since the '\' is a valid posix character so we can not switch + those safely to '/' if any are found in the middle of the prepath */ + /* BB test paths to Windows with '/' in the midst of prepath */ + + if (dfsplen) { + strncpy(full_path, tcon->treeName, dfsplen); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + int i; + for (i = 0; i < dfsplen; i++) { + if (full_path[i] == '\\') + full_path[i] = '/'; + } + } + } + return full_path; +} + +/* Inode operations in similar order to how they appear in Linux file fs.h */ + +int +cifs_create(struct inode *inode, struct dentry *direntry, int mode, + struct nameidata *nd) +{ + int rc = -ENOENT; + int xid; + int create_options = CREATE_NOT_DIR; + __u32 oplock = 0; + int oflags; + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. + */ + int desiredAccess = GENERIC_READ | GENERIC_WRITE; + __u16 fileHandle; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + char *full_path = NULL; + FILE_ALL_INFO *buf = NULL; + struct inode *newinode = NULL; + int disposition = FILE_OVERWRITE_IF; + + xid = GetXid(); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + FreeXid(xid); + return PTR_ERR(tlink); + } + tcon = tlink_tcon(tlink); + + if (oplockEnabled) + oplock = REQ_OPLOCK; + + if (nd && (nd->flags & LOOKUP_OPEN)) + oflags = nd->intent.open.file->f_flags; + else + oflags = O_RDONLY | O_CREAT; + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto cifs_create_out; + } + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_open(full_path, &newinode, + inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); + /* EIO could indicate that (posix open) operation is not + supported, despite what server claimed in capability + negotiation. 
EREMOTE indicates DFS junction, which is not + handled in posix open */ + + if (rc == 0) { + if (newinode == NULL) /* query inode info */ + goto cifs_create_get_file_info; + else /* success, no need to query */ + goto cifs_create_set_dentry; + } else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP) && (rc != -EINVAL)) + goto cifs_create_out; + /* else fallthrough to retry, using older open call, this is + case where server does not support this SMB level, and + falsely claims capability (also get here for DFS case + which should be rare for path not covered on files) */ + } + + if (nd && (nd->flags & LOOKUP_OPEN)) { + /* if the file is going to stay open, then we + need to set the desired access properly */ + desiredAccess = 0; + if (OPEN_FMODE(oflags) & FMODE_READ) + desiredAccess |= GENERIC_READ; /* is this too little? */ + if (OPEN_FMODE(oflags) & FMODE_WRITE) + desiredAccess |= GENERIC_WRITE; + + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; + else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; + else if ((oflags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else + cFYI(1, "Create flag not set in create function"); + } + + /* BB add processing to set equivalent of mode - e.g. via CreateX with + ACLs */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto cifs_create_out; + } + + /* + * if we're not using unix extensions, see if we need to set + * ATTR_READONLY on the create call + */ + if (!tcon->unix_ext && (mode & S_IWUGO) == 0) + create_options |= CREATE_OPTION_READONLY; + + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, + desiredAccess, create_options, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = -EIO; /* no NT SMB support fall into legacy open below */ + + if (rc == -EIO) { + /* old server, retry the open legacy style */ + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, + desiredAccess, create_options, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } + if (rc) { + cFYI(1, "cifs_create returned 0x%x", rc); + goto cifs_create_out; + } + + /* If Open reported that we actually created a file + then we now have to set the mode if possible */ + if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64) inode->i_gid; + else + args.gid = (__u64) current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle, + current->tgid); + } else { + /* BB implement mode setting via Windows security + descriptors e.g. 
*/ + /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ + + /* Could set r/o dos attribute if mode & 0222 == 0 */ + } + +cifs_create_get_file_info: + /* server might mask mode so we have to query for it */ + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else { + rc = cifs_get_inode_info(&newinode, full_path, buf, + inode->i_sb, xid, &fileHandle); + if (newinode) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if ((oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); + } + } + } + +cifs_create_set_dentry: + if (rc == 0) + d_instantiate(direntry, newinode); + else + cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); + + if (newinode && nd && (nd->flags & LOOKUP_OPEN)) { + struct cifsFileInfo *pfile_info; + struct file *filp; + + filp = lookup_instantiate_filp(nd, direntry, generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, tcon, fileHandle); + goto cifs_create_out; + } + + pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock); + if (pfile_info == NULL) { + fput(filp); + CIFSSMBClose(xid, tcon, fileHandle); + rc = -ENOMEM; + } + } else { + CIFSSMBClose(xid, tcon, fileHandle); + } + +cifs_create_out: + kfree(buf); + kfree(full_path); + cifs_put_tlink(tlink); + FreeXid(xid); + return rc; +} + +int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, + dev_t device_number) +{ + int rc = -EPERM; + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_io_parms io_parms; + char *full_path = NULL; + struct inode *newinode = NULL; + int oplock = 0; + u16 fileHandle; + FILE_ALL_INFO *buf = NULL; + unsigned int bytes_written; + struct win_dev *pdev; + + if (!old_valid_dev(device_number)) + return -EINVAL; + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto mknod_out; + } + + if (pTcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = device_number, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + args.gid = (__u64) current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(direntry, newinode); + goto mknod_out; + } + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto mknod_out; + + + cFYI(1, "sfu compat create special file"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + kfree(full_path); + rc = -ENOMEM; + FreeXid(xid); + return rc; + } + + /* FIXME: would WRITE_OWNER | WRITE_DAC be better? 
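+ (GENERIC_WRITE is enough for what follows: the only data written + into the new file is the struct win_dev record, an 8 byte type tag + such as "IntxCHR" or "IntxBLK" followed by le64 major/minor numbers)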
*/ + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, + GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; + + /* BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely */ + + pdev = (struct win_dev *)buf; + io_parms.netfid = fileHandle; + io_parms.pid = current->tgid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, + &bytes_written, (char *)pdev, + NULL, 0); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, + &bytes_written, (char *)pdev, + NULL, 0); + } /* else if (S_ISFIFO) */ + CIFSSMBClose(xid, pTcon, fileHandle); + d_drop(direntry); + + /* FIXME: add code here to set EAs */ + +mknod_out: + kfree(full_path); + kfree(buf); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +struct dentry * +cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, + struct nameidata *nd) +{ + int xid; + int rc = 0; /* to get around spurious gcc warning, set to zero here */ + __u32 oplock = 0; + __u16 fileHandle = 0; + bool posix_open = false; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifsFileInfo *cfile; + struct inode *newInode = NULL; + char *full_path = NULL; + struct file *filp; + + xid = GetXid(); + + cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", + parent_dir_inode, direntry->d_name.name, direntry); + + /* check whether path exists */ + + cifs_sb = CIFS_SB(parent_dir_inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + FreeXid(xid); + return (struct dentry *)tlink; + } + pTcon = tlink_tcon(tlink); + + /* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + int i; + for (i = 0; i < direntry->d_name.len; i++) + if (direntry->d_name.name[i] == '\\') { + cFYI(1, "Invalid file name"); + rc = -EINVAL; + goto lookup_out; + } + } + + /* + * O_EXCL: optimize away the lookup, but don't hash the dentry. Let + * the VFS handle the create. + */ + if (nd && (nd->flags & LOOKUP_EXCL)) { + d_instantiate(direntry, NULL); + rc = 0; + goto lookup_out; + } + + /* can not grab the rename sem here since it would + deadlock in the cases (beginning of sys_rename itself) + in which we already have the sb rename sem */ + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto lookup_out; + } + + if (direntry->d_inode != NULL) { + cFYI(1, "non-NULL inode in lookup"); + } else { + cFYI(1, "NULL inode in lookup"); + } + cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode); + + /* Posix open is only called (at lookup time) for file create now. + * For opens (rather than creates), because we do not know if it + * is a file or directory yet, and current Samba no longer allows + * us to do posix open on dirs, we could end up wasting an open call + * on what turns out to be a dir. For file opens, we wait to call posix + * open till cifs_open. 
It could be added here (lookup) in the future + * but the performance tradeoff of the extra network request when EISDIR + * or EACCES is returned would have to be weighed against the 50% + * reduction in network traffic in the other paths. + */ + if (pTcon->unix_ext) { + if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && + (nd->intent.open.file->f_flags & O_CREAT)) { + rc = cifs_posix_open(full_path, &newInode, + parent_dir_inode->i_sb, + nd->intent.open.create_mode, + nd->intent.open.file->f_flags, &oplock, + &fileHandle, xid); + /* + * The check below works around a bug in POSIX + * open in samba versions 3.3.1 and earlier where + * open could incorrectly fail with invalid parameter. + * If either that or op not supported returned, follow + * the normal lookup. + */ + switch (rc) { + case 0: + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + if (newInode && !S_ISREG(newInode->i_mode)) { + CIFSSMBClose(xid, pTcon, fileHandle); + break; + } + case -ENOENT: + posix_open = true; + case -EOPNOTSUPP: + break; + default: + pTcon->broken_posix_open = true; + } + } + if (!posix_open) + rc = cifs_get_inode_info_unix(&newInode, full_path, + parent_dir_inode->i_sb, xid); + } else + rc = cifs_get_inode_info(&newInode, full_path, NULL, + parent_dir_inode->i_sb, xid, NULL); + + if ((rc == 0) && (newInode != NULL)) { + d_add(direntry, newInode); + if (posix_open) { + filp = lookup_instantiate_filp(nd, direntry, + generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + goto lookup_out; + } + + cfile = cifs_new_fileinfo(fileHandle, filp, tlink, + oplock); + if (cfile == NULL) { + fput(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + rc = -ENOMEM; + goto lookup_out; + } + } + /* since paths are not looked up by component - the parent + directories are presumed to be good here */ + renew_parental_timestamps(direntry); + + } else if (rc == -ENOENT) { + rc = 0; + direntry->d_time = jiffies; + d_add(direntry, NULL); + /* if it was once a directory (but how can we tell?) we could do + shrink_dcache_parent(direntry); */ + } else if (rc != -EACCES) { + cERROR(1, "Unexpected lookup error %d", rc); + /* We special case check for Access Denied - since that + is a common return code */ + } + +lookup_out: + kfree(full_path); + cifs_put_tlink(tlink); + FreeXid(xid); + return ERR_PTR(rc); +} + +static int +cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) +{ + if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + if (direntry->d_inode) { + if (cifs_revalidate_dentry(direntry)) + return 0; + else + return 1; + } + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. 
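+ * e.g. with the case insensitive dentry ops a cached negative dentry + * named "FOO" could otherwise satisfy a create of "foo", and the new + * file would end up created with the wrong case on the server.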
+ */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + + if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + return 0; + + return 1; +} + +/* static int cifs_d_delete(struct dentry *direntry) +{ + int rc = 0; + + cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name); + + return rc; +} */ + +const struct dentry_operations cifs_dentry_ops = { + .d_revalidate = cifs_d_revalidate, + .d_automount = cifs_dfs_d_automount, +/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ +}; + +static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *q) +{ + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; + unsigned long hash; + int i; + + hash = init_name_hash(); + for (i = 0; i < q->len; i++) + hash = partial_name_hash(nls_tolower(codepage, q->name[i]), + hash); + q->hash = end_name_hash(hash); + + return 0; +} + +static int cifs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; + + if ((name->len == len) && + (nls_strnicmp(codepage, name->name, str, len) == 0)) + return 0; + return 1; +} + +const struct dentry_operations cifs_ci_dentry_ops = { + .d_revalidate = cifs_d_revalidate, + .d_hash = cifs_ci_hash, + .d_compare = cifs_ci_compare, + .d_automount = cifs_dfs_d_automount, +}; diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c new file mode 100644 index 00000000..548f0623 --- /dev/null +++ b/fs/cifs/dns_resolve.c @@ -0,0 +1,98 @@ +/* + * fs/cifs/dns_resolve.c + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * Contains the CIFS DFS upcall routines used for hostname to + * IP address translation. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "dns_resolve.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. 
+ */ +int +dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) +{ + struct sockaddr_storage ss; + const char *hostname, *sep; + char *name; + int len, rc; + + if (!ip_addr || !unc) + return -EINVAL; + + len = strlen(unc); + if (len < 3) { + cFYI(1, "%s: unc is too short: %s", __func__, unc); + return -EINVAL; + } + + /* Discount leading slashes for cifs */ + len -= 2; + hostname = unc + 2; + + /* Search for server name delimiter */ + sep = memchr(hostname, '\\', len); + if (sep) + len = sep - hostname; + else + cFYI(1, "%s: probably server name is whole unc: %s", + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cERROR(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); + else + cFYI(1, "%s: resolved: %*.*s to %s", + __func__, len, len, hostname, *ip_addr); + return rc; + +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + memcpy(name, hostname, len); + name[len] = 0; + cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + *ip_addr = name; + return 0; +} diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h new file mode 100644 index 00000000..d3f5d27f --- /dev/null +++ b/fs/cifs/dns_resolve.h @@ -0,0 +1,30 @@ +/* + * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS + * Handles host name to IP address resolution + * + * Copyright (c) International Business Machines Corp., 2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _DNS_RESOLVE_H +#define _DNS_RESOLVE_H + +#ifdef __KERNEL__ +extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); +#endif /* KERNEL */ + +#endif /* _DNS_RESOLVE_H */ diff --git a/fs/cifs/export.c b/fs/cifs/export.c new file mode 100644 index 00000000..55d87ac5 --- /dev/null +++ b/fs/cifs/export.c @@ -0,0 +1,67 @@ +/* + * fs/cifs/export.c + * + * Copyright (C) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Common Internet FileSystem (CIFS) client + * + * Operations related to support for exporting files via NFSD + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* + * See Documentation/filesystems/nfs/Exporting + * and examples in fs/exportfs + * + * Since cifs is a network file system, an "fsid" must be included for + * any nfs exports file entries which refer to cifs paths. In addition + * the cifs mount must be mounted with the "serverino" option (ie use stable + * server inode numbers instead of locally generated temporary ones). + * Although cifs inodes do not use generation numbers (have generation number + * of zero) - the inode number alone should be good enough for simple cases + * in which users want to export cifs shares with NFS. The decode and encode + * could be improved by using a new routine which expects 64 bit inode numbers + * instead of the default 32 bit routines in fs/exportfs + * + */ + +#include +#include +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +#ifdef CIFS_NFSD_EXPORT +static struct dentry *cifs_get_parent(struct dentry *dentry) +{ + /* BB need to add code here eventually to enable export via NFSD */ + cFYI(1, "get parent for %p", dentry); + return ERR_PTR(-EACCES); +} + +const struct export_operations cifs_export_ops = { + .get_parent = cifs_get_parent, +/* Following five export operations are unneeded so far and can default: + .get_dentry = + .get_name = + .find_exported_dentry = + .decode_fh = + .encode_fs = */ +}; + +#endif /* CIFS_NFSD_EXPORT */ + diff --git a/fs/cifs/file.c b/fs/cifs/file.c new file mode 100644 index 00000000..9040cb06 --- /dev/null +++ b/fs/cifs/file.c @@ -0,0 +1,2471 @@ +/* + * fs/cifs/file.c + * + * vfs operations that deal with files + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "fscache.h" + +static inline int cifs_convert_flags(unsigned int flags) +{ + if ((flags & O_ACCMODE) == O_RDONLY) + return GENERIC_READ; + else if ((flags & O_ACCMODE) == O_WRONLY) + return GENERIC_WRITE; + else if ((flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request + can cause unnecessary access denied on create */ + /* return GENERIC_ALL; */ + return (GENERIC_READ | GENERIC_WRITE); + } + + return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES | + FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA | + FILE_READ_DATA); +} + +static u32 cifs_posix_convert_flags(unsigned int flags) +{ + u32 posix_flags = 0; + + if ((flags & O_ACCMODE) == O_RDONLY) + posix_flags = SMB_O_RDONLY; + else if ((flags & O_ACCMODE) == O_WRONLY) + posix_flags = SMB_O_WRONLY; + else if ((flags & O_ACCMODE) == O_RDWR) + posix_flags = SMB_O_RDWR; + + if (flags & O_CREAT) + posix_flags |= SMB_O_CREAT; + if (flags & O_EXCL) + posix_flags |= SMB_O_EXCL; + if (flags & O_TRUNC) + posix_flags |= SMB_O_TRUNC; + /* be safe and imply O_SYNC for O_DSYNC */ + if (flags & O_DSYNC) + posix_flags |= SMB_O_SYNC; + if (flags & O_DIRECTORY) + posix_flags |= SMB_O_DIRECTORY; + if (flags & O_NOFOLLOW) + posix_flags |= SMB_O_NOFOLLOW; + if (flags & O_DIRECT) + posix_flags |= SMB_O_DIRECT; + + return posix_flags; +} + +static inline int cifs_get_disposition(unsigned int flags) +{ + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return FILE_CREATE; + else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + return FILE_OVERWRITE_IF; + else if ((flags & O_CREAT) == O_CREAT) + return FILE_OPEN_IF; + else if ((flags & O_TRUNC) == O_TRUNC) + return FILE_OVERWRITE; + else + return FILE_OPEN; +} + +int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, int mode, unsigned int f_flags, + __u32 *poplock, __u16 *pnetfid, int xid) +{ + int rc; + FILE_UNIX_BASIC_INFO *presp_data; + __u32 posix_flags = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_fattr fattr; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + + cFYI(1, "posix open %s", full_path); + + presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (presp_data == NULL) + return -ENOMEM; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto posix_open_ret; + } + + tcon = tlink_tcon(tlink); + mode &= ~current_umask(); + + posix_flags = cifs_posix_convert_flags(f_flags); + rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data, + poplock, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + + if (rc) + goto posix_open_ret; + + if (presp_data->Type == cpu_to_le32(-1)) + goto posix_open_ret; /* open ok, caller does qpathinfo */ + + if (!pinode) + goto posix_open_ret; /* caller does not need info */ + + cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb); + + /* get new inode and set it up */ + if (*pinode == NULL) { + cifs_fill_uniqueid(sb, 
&fattr); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) { + rc = -ENOMEM; + goto posix_open_ret; + } + } else { + cifs_fattr_to_inode(*pinode, &fattr); + } + +posix_open_ret: + kfree(presp_data); + return rc; +} + +static int +cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *poplock, + __u16 *pnetfid, int xid) +{ + int rc; + int desiredAccess; + int disposition; + FILE_ALL_INFO *buf; + + desiredAccess = cifs_convert_flags(f_flags); + +/********************************************************************* + * open flag mapping table: + * + * POSIX Flag CIFS Disposition + * ---------- ---------------- + * O_CREAT FILE_OPEN_IF + * O_CREAT | O_EXCL FILE_CREATE + * O_CREAT | O_TRUNC FILE_OVERWRITE_IF + * O_TRUNC FILE_OVERWRITE + * none of the above FILE_OPEN + * + * Note that there is not a direct match between disposition + * FILE_SUPERSEDE (ie create whether or not file exists although + * O_CREAT | O_TRUNC is similar but truncates the existing + * file rather than creating a new file as FILE_SUPERSEDE does + * (which uses the attributes / metadata passed in on open call) + *? + *? O_SYNC is a reasonable match to CIFS writethrough flag + *? and the read write flags match reasonably. O_LARGEFILE + *? is irrelevant because largefile support is always used + *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY, + * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation + *********************************************************************/ + + disposition = cifs_get_disposition(f_flags); + + /* BB pass O_SYNC flag through on file attributes .. BB */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc) + goto out; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); + else + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, pnetfid); + +out: + kfree(buf); + return rc; +} + +struct cifsFileInfo * +cifs_new_fileinfo(__u16 fileHandle, struct file *file, + struct tcon_link *tlink, __u32 oplock) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *pCifsInode = CIFS_I(inode); + struct cifsFileInfo *pCifsFile; + + pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (pCifsFile == NULL) + return pCifsFile; + + pCifsFile->count = 1; + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->uid = current_fsuid(); + pCifsFile->dentry = dget(dentry); + pCifsFile->f_flags = file->f_flags; + pCifsFile->invalidHandle = false; + pCifsFile->tlink = cifs_get_tlink(tlink); + mutex_init(&pCifsFile->fh_mutex); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); + + spin_lock(&cifs_file_list_lock); + list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList)); + /* if readable file instance put first in list*/ + if (file->f_mode & FMODE_READ) + list_add(&pCifsFile->flist, &pCifsInode->openFileList); + else + 
list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); + spin_unlock(&cifs_file_list_lock); + + cifs_set_oplock_level(pCifsInode, oplock); + + file->private_data = pCifsFile; + return pCifsFile; +} + +/* + * Release a reference on the file private data. This may involve closing + * the filehandle out on the server. Must be called without holding + * cifs_file_list_lock. + */ +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + struct inode *inode = cifs_file->dentry->d_inode; + struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsLockInfo *li, *tmp; + + spin_lock(&cifs_file_list_lock); + if (--cifs_file->count > 0) { + spin_unlock(&cifs_file_list_lock); + return; + } + + /* remove it from the lists */ + list_del(&cifs_file->flist); + list_del(&cifs_file->tlist); + + if (list_empty(&cifsi->openFileList)) { + cFYI(1, "closing last open instance for inode %p", + cifs_file->dentry->d_inode); + + /* in strict cache mode we need invalidate mapping on the last + close because it may cause a error when we open this file + again and get at least level II oplock */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + CIFS_I(inode)->invalid_mapping = true; + + cifs_set_oplock_level(cifsi, 0); + } + spin_unlock(&cifs_file_list_lock); + + if (!tcon->need_reconnect && !cifs_file->invalidHandle) { + int xid, rc; + + xid = GetXid(); + rc = CIFSSMBClose(xid, tcon, cifs_file->netfid); + FreeXid(xid); + } + + /* Delete any outstanding lock records. We'll lose them when the file + * is closed anyway. + */ + mutex_lock(&cifs_file->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + list_del(&li->llist); + kfree(li); + } + mutex_unlock(&cifs_file->lock_mutex); + + cifs_put_tlink(cifs_file->tlink); + dput(cifs_file->dentry); + kfree(cifs_file); +} + +int cifs_open(struct inode *inode, struct file *file) +{ + int rc = -EACCES; + int xid; + __u32 oplock; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct cifsFileInfo *pCifsFile = NULL; + char *full_path = NULL; + bool posix_open_ok = false; + __u16 netfid; + + xid = GetXid(); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + FreeXid(xid); + return PTR_ERR(tlink); + } + tcon = tlink_tcon(tlink); + + full_path = build_path_from_dentry(file->f_path.dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + cFYI(1, "inode = 0x%p file flags are 0x%x for %s", + inode, file->f_flags, full_path); + + if (oplockEnabled) + oplock = REQ_OPLOCK; + else + oplock = 0; + + if (!tcon->broken_posix_open && tcon->unix_ext && + (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + /* can not refresh inode info since size could be stale */ + rc = cifs_posix_open(full_path, &inode, inode->i_sb, + cifs_sb->mnt_file_mode /* ignored */, + file->f_flags, &oplock, &netfid, xid); + if (rc == 0) { + cFYI(1, "posix open succeeded"); + posix_open_ok = true; + } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + if (tcon->ses->serverNOS) + cERROR(1, "server %s of type %s returned" + " unexpected error on SMB posix open" + ", disabling posix open support." 
+ " Check if server update available.", + tcon->ses->serverName, + tcon->ses->serverNOS); + tcon->broken_posix_open = true; + } else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP)) /* path not found or net err */ + goto out; + /* else fallthrough to retry open the old way on network i/o + or DFS errors */ + } + + if (!posix_open_ok) { + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, + file->f_flags, &oplock, &netfid, xid); + if (rc) + goto out; + } + + pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); + if (pCifsFile == NULL) { + CIFSSMBClose(xid, tcon, netfid); + rc = -ENOMEM; + goto out; + } + + cifs_fscache_set_inode_cookie(inode, file); + + if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { + /* time to set mode which we can not set earlier due to + problems creating new read-only files */ + struct cifs_unix_set_info_args args = { + .mode = inode->i_mode, + .uid = NO_CHANGE_64, + .gid = NO_CHANGE_64, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid, + pCifsFile->pid); + } + +out: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +/* Try to reacquire byte range locks that were released when session */ +/* to server was lost */ +static int cifs_relock_file(struct cifsFileInfo *cifsFile) +{ + int rc = 0; + +/* BB list all locks open on this file and relock */ + + return rc; +} + +static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) +{ + int rc = -EACCES; + int xid; + __u32 oplock; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *pCifsInode; + struct inode *inode; + char *full_path = NULL; + int desiredAccess; + int disposition = FILE_OPEN; + __u16 netfid; + + xid = GetXid(); + mutex_lock(&pCifsFile->fh_mutex); + if (!pCifsFile->invalidHandle) { + mutex_unlock(&pCifsFile->fh_mutex); + rc = 0; + FreeXid(xid); + return rc; + } + + inode = pCifsFile->dentry->d_inode; + cifs_sb = CIFS_SB(inode->i_sb); + tcon = tlink_tcon(pCifsFile->tlink); + +/* can not grab rename sem here because various ops, including + those that already have the rename sem can end up causing writepage + to get called and if the server was down that means we end up here, + and we can never tell if the caller already has the rename_sem */ + full_path = build_path_from_dentry(pCifsFile->dentry); + if (full_path == NULL) { + rc = -ENOMEM; + mutex_unlock(&pCifsFile->fh_mutex); + FreeXid(xid); + return rc; + } + + cFYI(1, "inode = 0x%p file flags 0x%x for %s", + inode, pCifsFile->f_flags, full_path); + + if (oplockEnabled) + oplock = REQ_OPLOCK; + else + oplock = 0; + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + + /* + * O_CREAT, O_EXCL and O_TRUNC already had their effect on the + * original open. Must mask them off for a reopen. 
+ */ + unsigned int oflags = pCifsFile->f_flags & + ~(O_CREAT | O_EXCL | O_TRUNC); + + rc = cifs_posix_open(full_path, NULL, inode->i_sb, + cifs_sb->mnt_file_mode /* ignored */, + oflags, &oplock, &netfid, xid); + if (rc == 0) { + cFYI(1, "posix reopen succeeded"); + goto reopen_success; + } + /* fallthrough to retry open the old way on errors, especially + in the reconnect path it is important to retry hard */ + } + + desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + + /* Can not refresh inode by passing in file_info buf to be returned + by SMBOpen and then calling get_inode_info with returned buf + since file might have write behind data that needs to be flushed + and server version of file size can be stale. If we knew for sure + that inode was not dirty locally we could do this */ + + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + mutex_unlock(&pCifsFile->fh_mutex); + cFYI(1, "cifs_open returned 0x%x", rc); + cFYI(1, "oplock: %d", oplock); + goto reopen_error_exit; + } + +reopen_success: + pCifsFile->netfid = netfid; + pCifsFile->invalidHandle = false; + mutex_unlock(&pCifsFile->fh_mutex); + pCifsInode = CIFS_I(inode); + + if (can_flush) { + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, + full_path, inode->i_sb, xid); + else + rc = cifs_get_inode_info(&inode, + full_path, NULL, inode->i_sb, + xid, NULL); + } /* else we are writing out data to server already + and could deadlock if we tried to flush data, and + since we do not know if we have data that would + invalidate the current end of file on the server + we can not go to the server to get the new inod + info */ + + cifs_set_oplock_level(pCifsInode, oplock); + + cifs_relock_file(pCifsFile); + +reopen_error_exit: + kfree(full_path); + FreeXid(xid); + return rc; +} + +int cifs_close(struct inode *inode, struct file *file) +{ + if (file->private_data != NULL) { + cifsFileInfo_put(file->private_data); + file->private_data = NULL; + } + + /* return code from the ->release op is always ignored */ + return 0; +} + +int cifs_closedir(struct inode *inode, struct file *file) +{ + int rc = 0; + int xid; + struct cifsFileInfo *pCFileStruct = file->private_data; + char *ptmp; + + cFYI(1, "Closedir inode = 0x%p", inode); + + xid = GetXid(); + + if (pCFileStruct) { + struct cifs_tcon *pTcon = tlink_tcon(pCFileStruct->tlink); + + cFYI(1, "Freeing private data in close dir"); + spin_lock(&cifs_file_list_lock); + if (!pCFileStruct->srch_inf.endOfSearch && + !pCFileStruct->invalidHandle) { + pCFileStruct->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); + cFYI(1, "Closing uncompleted readdir with rc %d", + rc); + /* not much we can do if it fails anyway, ignore rc */ + rc = 0; + } else + spin_unlock(&cifs_file_list_lock); + ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; + if (ptmp) { + cFYI(1, "closedir free smb buf in srch struct"); + pCFileStruct->srch_inf.ntwrk_buf_start = NULL; + if (pCFileStruct->srch_inf.smallBuf) + cifs_small_buf_release(ptmp); + else + cifs_buf_release(ptmp); + } + cifs_put_tlink(pCFileStruct->tlink); + kfree(file->private_data); + file->private_data = NULL; + } + /* BB can we lock the filestruct while this is going on? 
*/ + FreeXid(xid); + return rc; +} + +static int store_file_lock(struct cifsFileInfo *fid, __u64 len, + __u64 offset, __u8 lockType) +{ + struct cifsLockInfo *li = + kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); + if (li == NULL) + return -ENOMEM; + li->offset = offset; + li->length = len; + li->type = lockType; + mutex_lock(&fid->lock_mutex); + list_add(&li->llist, &fid->llist); + mutex_unlock(&fid->lock_mutex); + return 0; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +{ + int rc, xid; + __u32 numLock = 0; + __u32 numUnlock = 0; + __u64 length; + bool wait_flag = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + __u16 netfid; + __u8 lockType = LOCKING_ANDX_LARGE_FILES; + bool posix_locking = 0; + + length = 1 + pfLock->fl_end - pfLock->fl_start; + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: " + "0x%x flocktype: 0x%x start: %lld end: %lld", + cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, + pfLock->fl_end); + + if (pfLock->fl_flags & FL_POSIX) + cFYI(1, "Posix"); + if (pfLock->fl_flags & FL_FLOCK) + cFYI(1, "Flock"); + if (pfLock->fl_flags & FL_SLEEP) { + cFYI(1, "Blocking lock"); + wait_flag = true; + } + if (pfLock->fl_flags & FL_ACCESS) + cFYI(1, "Process suspended by mandatory locking - " + "not implemented yet"); + if (pfLock->fl_flags & FL_LEASE) + cFYI(1, "Lease on file - not implemented yet"); + if (pfLock->fl_flags & + (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) + cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + + if (pfLock->fl_type == F_WRLCK) { + cFYI(1, "F_WRLCK "); + numLock = 1; + } else if (pfLock->fl_type == F_UNLCK) { + cFYI(1, "F_UNLCK"); + numUnlock = 1; + /* Check if unlock includes more than + one lock range */ + } else if (pfLock->fl_type == F_RDLCK) { + cFYI(1, "F_RDLCK"); + lockType |= LOCKING_ANDX_SHARED_LOCK; + numLock = 1; + } else if (pfLock->fl_type == F_EXLCK) { + cFYI(1, "F_EXLCK"); + numLock = 1; + } else if (pfLock->fl_type == F_SHLCK) { + cFYI(1, "F_SHLCK"); + lockType |= LOCKING_ANDX_SHARED_LOCK; + numLock = 1; + } else + cFYI(1, "Unknown type of lock"); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); + netfid = ((struct cifsFileInfo *)file->private_data)->netfid; + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_locking = 1; + /* BB add code here to normalize offset and length to + account for negative length which we can not accept over the + wire */ + if (IS_GETLK(cmd)) { + if (posix_locking) { + int posix_lock_type; + if (lockType & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, + length, pfLock, posix_lock_type, + wait_flag); + FreeXid(xid); + return rc; + } + + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, + 0, 1, lockType, 0 /* wait flag */, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 1 /* numUnlock */ , + 0 /* numLock */ , lockType, + 0 /* wait flag */, 0); + pfLock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + rc = 0; + + } else { + /* if rc == ERR_SHARING_VIOLATION ? 
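+ (the probe lock above failing means another process holds a + conflicting lock; for a shared request just report F_WRLCK, for an + exclusive request probe again below with a shared lock to tell + F_RDLCK holders apart from F_WRLCK holders)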
*/ + rc = 0; + + if (lockType & LOCKING_ANDX_SHARED_LOCK) { + pfLock->fl_type = F_WRLCK; + } else { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 0, 1, + lockType | LOCKING_ANDX_SHARED_LOCK, + 0 /* wait flag */, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, + length, pfLock->fl_start, 1, 0, + lockType | + LOCKING_ANDX_SHARED_LOCK, + 0 /* wait flag */, 0); + pfLock->fl_type = F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking " + "previously locked range %d " + "during test of lock", rc); + rc = 0; + } else { + pfLock->fl_type = F_WRLCK; + rc = 0; + } + } + } + + FreeXid(xid); + return rc; + } + + if (!numLock && !numUnlock) { + /* if no lock or unlock then nothing + to do since we do not know what it is */ + FreeXid(xid); + return -EOPNOTSUPP; + } + + if (posix_locking) { + int posix_lock_type; + if (lockType & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + + if (numUnlock == 1) + posix_lock_type = CIFS_UNLCK; + + rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, + length, pfLock, posix_lock_type, + wait_flag); + } else { + struct cifsFileInfo *fid = file->private_data; + + if (numLock) { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 0, numLock, lockType, + wait_flag, 0); + + if (rc == 0) { + /* For Windows locks we must store them. */ + rc = store_file_lock(fid, length, + pfLock->fl_start, lockType); + } + } else if (numUnlock) { + /* For each stored lock that this unlock overlaps + completely, unlock it. */ + int stored_rc = 0; + struct cifsLockInfo *li, *tmp; + + rc = 0; + mutex_lock(&fid->lock_mutex); + list_for_each_entry_safe(li, tmp, &fid->llist, llist) { + if (pfLock->fl_start <= li->offset && + (pfLock->fl_start + length) >= + (li->offset + li->length)) { + stored_rc = CIFSSMBLock(xid, tcon, + netfid, li->length, + li->offset, 1, 0, + li->type, false, 0); + if (stored_rc) + rc = stored_rc; + else { + list_del(&li->llist); + kfree(li); + } + } + } + mutex_unlock(&fid->lock_mutex); + } + } + + if (pfLock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, pfLock); + FreeXid(xid); + return rc; +} + +/* update the file size (if needed) after a write */ +void +cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written) +{ + loff_t end_of_write = offset + bytes_written; + + if (end_of_write > cifsi->server_eof) + cifsi->server_eof = end_of_write; +} + +static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, + const char *write_data, size_t write_size, + loff_t *poffset) +{ + int rc = 0; + unsigned int bytes_written = 0; + unsigned int total_written; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + int xid; + struct dentry *dentry = open_file->dentry; + struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); + struct cifs_io_parms io_parms; + + cifs_sb = CIFS_SB(dentry->d_sb); + + cFYI(1, "write %zd bytes to offset %lld of %s", write_size, + *poffset, dentry->d_name.name); + + pTcon = tlink_tcon(open_file->tlink); + + xid = GetXid(); + + for (total_written = 0; write_size > total_written; + total_written += bytes_written) { + rc = -EAGAIN; + while (rc == -EAGAIN) { + struct kvec iov[2]; + unsigned int len; + + if (open_file->invalidHandle) { + /* we could deadlock if we called + filemap_fdatawait from here so tell + reopen_file not to flush data to + server now */ + rc = cifs_reopen_file(open_file, false); + if (rc != 0) + break; + } + + len = min((size_t)cifs_sb->wsize, + write_size - total_written); + /* iov[0] is reserved 
for smb header */ + iov[1].iov_base = (char *)write_data + total_written; + iov[1].iov_len = len; + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = len; + rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov, + 1, 0); + } + if (rc || (bytes_written == 0)) { + if (total_written) + break; + else { + FreeXid(xid); + return rc; + } + } else { + cifs_update_eof(cifsi, *poffset, bytes_written); + *poffset += bytes_written; + } + } + + cifs_stats_bytes_written(pTcon, total_written); + + if (total_written > 0) { + spin_lock(&dentry->d_inode->i_lock); + if (*poffset > dentry->d_inode->i_size) + i_size_write(dentry->d_inode, *poffset); + spin_unlock(&dentry->d_inode->i_lock); + } + mark_inode_dirty_sync(dentry->d_inode); + FreeXid(xid); + return total_written; +} + +struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only) +{ + struct cifsFileInfo *open_file = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + + /* only filter by fsuid on multiuser mounts */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + fsuid_only = false; + + spin_lock(&cifs_file_list_lock); + /* we could simply get the first_list_entry since write-only entries + are always at the end of the list but since the first entry might + have a close pending, we go through the whole list */ + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (fsuid_only && open_file->uid != current_fsuid()) + continue; + if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { + if (!open_file->invalidHandle) { + /* found a good file */ + /* lock it so it will not be closed on us */ + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + return open_file; + } /* else might as well continue, and look for + another, or simply have the caller reopen it + again rather than trying to fix this handle */ + } else /* write only file */ + break; /* write only files are last so must be done */ + } + spin_unlock(&cifs_file_list_lock); + return NULL; +} + +struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only) +{ + struct cifsFileInfo *open_file, *inv_file = NULL; + struct cifs_sb_info *cifs_sb; + bool any_available = false; + int rc; + unsigned int refind = 0; + + /* Having a null inode here (because mapping->host was set to zero by + the VFS or MM) should not happen but we had reports of on oops (due to + it being zero) during stress testcases so we need to check for it */ + + if (cifs_inode == NULL) { + cERROR(1, "Null inode passed to cifs_writeable_file"); + dump_stack(); + return NULL; + } + + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + + /* only filter by fsuid on multiuser mounts */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + fsuid_only = false; + + spin_lock(&cifs_file_list_lock); +refind_writable: + if (refind > MAX_REOPEN_ATT) { + spin_unlock(&cifs_file_list_lock); + return NULL; + } + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (!any_available && open_file->pid != current->tgid) + continue; + if (fsuid_only && open_file->uid != current_fsuid()) + continue; + if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { + if (!open_file->invalidHandle) { + /* found a good writable file */ + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + return open_file; + } else { + if (!inv_file) + inv_file = open_file; + } + } + } + /* couldn't find useable FH with same pid, try any 
available */ + if (!any_available) { + any_available = true; + goto refind_writable; + } + + if (inv_file) { + any_available = false; + cifsFileInfo_get(inv_file); + } + + spin_unlock(&cifs_file_list_lock); + + if (inv_file) { + rc = cifs_reopen_file(inv_file, false); + if (!rc) + return inv_file; + else { + spin_lock(&cifs_file_list_lock); + list_move_tail(&inv_file->flist, + &cifs_inode->openFileList); + spin_unlock(&cifs_file_list_lock); + cifsFileInfo_put(inv_file); + spin_lock(&cifs_file_list_lock); + ++refind; + goto refind_writable; + } + } + + return NULL; +} + +static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) +{ + struct address_space *mapping = page->mapping; + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + char *write_data; + int rc = -EFAULT; + int bytes_written = 0; + struct inode *inode; + struct cifsFileInfo *open_file; + + if (!mapping || !mapping->host) + return -EFAULT; + + inode = page->mapping->host; + + offset += (loff_t)from; + write_data = kmap(page); + write_data += from; + + if ((to > PAGE_CACHE_SIZE) || (from > to)) { + kunmap(page); + return -EIO; + } + + /* racing with truncate? */ + if (offset > mapping->host->i_size) { + kunmap(page); + return 0; /* don't care */ + } + + /* check to make sure that we are not extending the file */ + if (mapping->host->i_size - offset < (loff_t)to) + to = (unsigned)(mapping->host->i_size - offset); + + open_file = find_writable_file(CIFS_I(mapping->host), false); + if (open_file) { + bytes_written = cifs_write(open_file, open_file->pid, + write_data, to - from, &offset); + cifsFileInfo_put(open_file); + /* Does mm or vfs already set times? */ + inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); + if ((bytes_written > 0) && (offset)) + rc = 0; + else if (bytes_written < 0) + rc = bytes_written; + } else { + cFYI(1, "No writeable filehandles for inode"); + rc = -EIO; + } + + kunmap(page); + return rc; +} + +static int cifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + bool done = false, scanned = false, range_whole = false; + pgoff_t end, index; + struct cifs_writedata *wdata; + struct page *page; + int rc = 0; + + /* + * If wsize is smaller than the page cache size, default to writing + * one page at a time via cifs_writepage + */ + if (cifs_sb->wsize < PAGE_CACHE_SIZE) + return generic_writepages(mapping, wbc); + + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + scanned = true; + } +retry: + while (!done && index <= end) { + unsigned int i, nr_pages, found_pages; + pgoff_t next = 0, tofind; + struct page **pages; + + tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, + end - index) + 1; + + wdata = cifs_writedata_alloc((unsigned int)tofind); + if (!wdata) { + rc = -ENOMEM; + break; + } + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. 
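+ * (with 4KB pages that cap would be 1MB; looping also copes with a + * call that simply returns fewer pages than were asked for)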
+ */ + found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, &index, + PAGECACHE_TAG_DIRTY, + tofind, pages); + found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && index <= end); + + if (found_pages == 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + break; + } + + nr_pages = 0; + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + done = true; + unlock_page(page); + break; + } + + if (next && (page->index != next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + + if (page_offset(page) >= mapping->host->i_size) { + done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + /* nothing to write? 
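+ (every page gathered above was skipped - already under writeback, no + longer dirty, truncated or not contiguous - so release wdata and move + on to the next batch)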
*/ + if (nr_pages == 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + continue; + } + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + + do { + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), + false); + if (!wdata->cfile) { + cERROR(1, "No writable handles for inode"); + rc = -EBADF; + break; + } + rc = cifs_async_writev(wdata); + } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + /* send failure -- clean up the mess */ + if (rc != 0) { + for (i = 0; i < nr_pages; ++i) { + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, + wdata->pages[i]); + else + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + } + kref_put(&wdata->refcount, cifs_writedata_release); + + wbc->nr_to_write -= nr_pages; + if (wbc->nr_to_write <= 0) + done = true; + + index = next; + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = true; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + + return rc; +} + +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + int rc; + int xid; + + xid = GetXid(); +/* BB add check for wbc flags */ + page_cache_get(page); + if (!PageUptodate(page)) + cFYI(1, "ppw - page not up to date"); + + /* + * Set the "writeback" flag, and clear "dirty" in the radix tree. + * + * A writepage() implementation always needs to do either this, + * or re-dirty the page with "redirty_page_for_writepage()" in + * the case of a failure. + * + * Just unlocking the page will cause the radix tree tag-bits + * to fail to update with the state of the page correctly. 
+ */ + set_page_writeback(page); +retry_write: + rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE); + if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) + goto retry_write; + else if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, page); + else if (rc != 0) + SetPageError(page); + else + SetPageUptodate(page); + end_page_writeback(page); + page_cache_release(page); + FreeXid(xid); + return rc; +} + +static int cifs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc = cifs_writepage_locked(page, wbc); + unlock_page(page); + return rc; +} + +static int cifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int rc; + struct inode *inode = mapping->host; + struct cifsFileInfo *cfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + __u32 pid; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = cfile->pid; + else + pid = current->tgid; + + cFYI(1, "write_end for page %p from pos %lld with %d bytes", + page, pos, copied); + + if (PageChecked(page)) { + if (copied == len) + SetPageUptodate(page); + ClearPageChecked(page); + } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (!PageUptodate(page)) { + char *page_data; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int xid; + + xid = GetXid(); + /* this is probably better than directly calling + partialpage_write since in this function the file handle is + known which we might as well leverage */ + /* BB check if anything else missing out of ppw + such as updating last write time */ + page_data = kmap(page); + rc = cifs_write(cfile, pid, page_data + offset, copied, &pos); + /* if (rc < 0) should we set writebehind rc? */ + kunmap(page); + + FreeXid(xid); + } else { + rc = copied; + pos += copied; + set_page_dirty(page); + } + + if (rc > 0) { + spin_lock(&inode->i_lock); + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + unlock_page(page); + page_cache_release(page); + + return rc; +} + +int cifs_strict_fsync(struct file *file, int datasync) +{ + int xid; + int rc = 0; + struct cifs_tcon *tcon; + struct cifsFileInfo *smbfile = file->private_data; + struct inode *inode = file->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + xid = GetXid(); + + cFYI(1, "Sync file - name: %s datasync: 0x%x", + file->f_path.dentry->d_name.name, datasync); + + if (!CIFS_I(inode)->clientCanCacheRead) { + rc = cifs_invalidate_mapping(inode); + if (rc) { + cFYI(1, "rc: %d during invalidate phase", rc); + rc = 0; /* don't care about it in fsync */ + } + } + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + + FreeXid(xid); + return rc; +} + +int cifs_fsync(struct file *file, int datasync) +{ + int xid; + int rc = 0; + struct cifs_tcon *tcon; + struct cifsFileInfo *smbfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + xid = GetXid(); + + cFYI(1, "Sync file - name: %s datasync: 0x%x", + file->f_path.dentry->d_name.name, datasync); + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + + FreeXid(xid); + return rc; +} + +/* + * As file closes, flush all cached write data for this inode checking + * for write behind errors. 
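+ * For files opened for write, filemap_write_and_wait() below returns any + * write-behind error so that it can be reported at close time.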
+ */ +int cifs_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int rc = 0; + + if (file->f_mode & FMODE_WRITE) + rc = filemap_write_and_wait(inode->i_mapping); + + cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); + + return rc; +} + +static int +cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) +{ + int rc = 0; + unsigned long i; + + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_page(__GFP_HIGHMEM); + if (!pages[i]) { + /* + * save number of pages we have already allocated and + * return with ENOMEM error + */ + num_pages = i; + rc = -ENOMEM; + goto error; + } + } + + return rc; + +error: + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + return rc; +} + +static inline +size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) +{ + size_t num_pages; + size_t clen; + + clen = min_t(const size_t, len, wsize); + num_pages = clen / PAGE_CACHE_SIZE; + if (clen % PAGE_CACHE_SIZE) + num_pages++; + + if (cur_len) + *cur_len = clen; + + return num_pages; +} + +static ssize_t +cifs_iovec_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) +{ + unsigned int written; + unsigned long num_pages, npages, i; + size_t copied, len, cur_len; + ssize_t total_written = 0; + struct kvec *to_send; + struct page **pages; + struct iov_iter it; + struct inode *inode; + struct cifsFileInfo *open_file; + struct cifs_tcon *pTcon; + struct cifs_sb_info *cifs_sb; + struct cifs_io_parms io_parms; + int xid, rc; + __u32 pid; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; + + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + num_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + + pages = kmalloc(sizeof(struct page *)*num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL); + if (!to_send) { + kfree(pages); + return -ENOMEM; + } + + rc = cifs_write_allocate_pages(pages, num_pages); + if (rc) { + kfree(pages); + kfree(to_send); + return rc; + } + + xid = GetXid(); + open_file = file->private_data; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + pTcon = tlink_tcon(open_file->tlink); + inode = file->f_path.dentry->d_inode; + + iov_iter_init(&it, iov, nr_segs, len, 0); + npages = num_pages; + + do { + size_t save_len = cur_len; + for (i = 0; i < npages; i++) { + copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE); + copied = iov_iter_copy_from_user(pages[i], &it, 0, + copied); + cur_len -= copied; + iov_iter_advance(&it, copied); + to_send[i+1].iov_base = kmap(pages[i]); + to_send[i+1].iov_len = copied; + } + + cur_len = save_len - cur_len; + + do { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, false); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = cur_len; + rc = CIFSSMBWrite2(xid, &io_parms, &written, to_send, + npages, 0); + } while (rc == -EAGAIN); + + for (i = 0; i < npages; i++) + kunmap(pages[i]); + + if (written) { + len -= written; + total_written += written; + cifs_update_eof(CIFS_I(inode), *poffset, written); + *poffset += written; + } else if (rc < 0) { + if (!total_written) + total_written = rc; + break; + } + + /* get length and number of kvecs of the next write */ + npages = 
get_numpages(cifs_sb->wsize, len, &cur_len); + } while (len > 0); + + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); + } + + cifs_stats_bytes_written(pTcon, total_written); + mark_inode_dirty_sync(inode); + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(to_send); + kfree(pages); + FreeXid(xid); + return total_written; +} + +ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t written; + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * the write request. + */ + + written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos); + if (written > 0) { + CIFS_I(inode)->invalid_mapping = true; + iocb->ki_pos = pos; + } + + return written; +} + +ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheAll) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to write the data to the server exactly + * from the pos to pos+len-1 rather than flush all affected pages + * because it may cause an error with mandatory locks on these pages but + * not on the region from pos to pos+len-1. + */ + + return cifs_user_writev(iocb, iov, nr_segs, pos); +} + +static ssize_t +cifs_iovec_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) +{ + int rc; + int xid; + ssize_t total_read; + unsigned int bytes_read = 0; + size_t len, cur_len; + int iov_offset = 0; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + struct cifsFileInfo *open_file; + struct smb_com_read_rsp *pSMBr; + struct cifs_io_parms io_parms; + char *read_data; + __u32 pid; + + if (!nr_segs) + return 0; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; + + xid = GetXid(); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + open_file = file->private_data; + pTcon = tlink_tcon(open_file->tlink); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cFYI(1, "attempting read on write only file instance"); + + for (total_read = 0; total_read < len; total_read += bytes_read) { + cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = -EAGAIN; + read_data = NULL; + + while (rc == -EAGAIN) { + int buf_type = CIFS_NO_BUFFER; + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = cur_len; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, + &read_data, &buf_type); + pSMBr = (struct smb_com_read_rsp *)read_data; + if (read_data) { + char *data_offset = read_data + 4 + + le16_to_cpu(pSMBr->DataOffset); + if (memcpy_toiovecend(iov, data_offset, + iov_offset, bytes_read)) + rc = -EFAULT; + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(read_data); + read_data = NULL; + iov_offset += bytes_read; + } + } + + if (rc || (bytes_read == 0)) { + if (total_read) { + 
break; + } else { + FreeXid(xid); + return rc; + } + } else { + cifs_stats_bytes_read(pTcon, bytes_read); + *poffset += bytes_read; + } + } + + FreeXid(xid); + return total_read; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t read; + + read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); + if (read > 0) + iocb->ki_pos = pos; + + return read; +} + +ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheRead) + return generic_file_aio_read(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to read from the server all the time + * if we don't have level II oplock because the server can delay mtime + * change - so we can't make a decision about inode invalidating. + * And we can also fail with page reading if there are mandatory locks + * on pages affected by this read but not on the region from pos to + * pos+len-1. + */ + + return cifs_user_readv(iocb, iov, nr_segs, pos); +} + +static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, + loff_t *poffset) +{ + int rc = -EACCES; + unsigned int bytes_read = 0; + unsigned int total_read; + unsigned int current_read_size; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + int xid; + char *current_offset; + struct cifsFileInfo *open_file; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __u32 pid; + + xid = GetXid(); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + if (file->private_data == NULL) { + rc = -EBADF; + FreeXid(xid); + return rc; + } + open_file = file->private_data; + pTcon = tlink_tcon(open_file->tlink); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cFYI(1, "attempting read on write only file instance"); + + for (total_read = 0, current_offset = read_data; + read_size > total_read; + total_read += bytes_read, current_offset += bytes_read) { + current_read_size = min_t(const int, read_size - total_read, + cifs_sb->rsize); + /* For Windows ME and 9x we do not want to request more + than it negotiated since it will refuse the read then */ + if ((pTcon->ses) && + !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { + current_read_size = min_t(const int, current_read_size, + pTcon->ses->server->maxBuf - 128); + } + rc = -EAGAIN; + while (rc == -EAGAIN) { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = current_read_size; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, + &current_offset, &buf_type); + } + if (rc || (bytes_read == 0)) { + if (total_read) { + break; + } else { + FreeXid(xid); + return rc; + } + } else { + cifs_stats_bytes_read(pTcon, total_read); + *poffset += bytes_read; + } + } + FreeXid(xid); + return total_read; +} + +/* + * If the page is mmap'ed into a process' page tables, then we need to make + * sure that it doesn't change while being written back. 
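+ * cifs_page_mkwrite() below takes the page lock and returns VM_FAULT_LOCKED, + * i.e. the page is handed back to the fault handler still locked.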
+ */ +static int +cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + + lock_page(page); + return VM_FAULT_LOCKED; +} + +static struct vm_operations_struct cifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = cifs_page_mkwrite, +}; + +int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + struct inode *inode = file->f_path.dentry->d_inode; + + xid = GetXid(); + + if (!CIFS_I(inode)->clientCanCacheRead) { + rc = cifs_invalidate_mapping(inode); + if (rc) + return rc; + } + + rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; + FreeXid(xid); + return rc; +} + +int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + + xid = GetXid(); + rc = cifs_revalidate_file(file); + if (rc) { + cFYI(1, "Validation prior to mmap failed, error=%d", rc); + FreeXid(xid); + return rc; + } + rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; + FreeXid(xid); + return rc; +} + + +static void cifs_copy_cache_pages(struct address_space *mapping, + struct list_head *pages, int bytes_read, char *data) +{ + struct page *page; + char *target; + + while (bytes_read > 0) { + if (list_empty(pages)) + break; + + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_KERNEL)) { + page_cache_release(page); + cFYI(1, "Add page cache failed"); + data += PAGE_CACHE_SIZE; + bytes_read -= PAGE_CACHE_SIZE; + continue; + } + page_cache_release(page); + + target = kmap_atomic(page, KM_USER0); + + if (PAGE_CACHE_SIZE > bytes_read) { + memcpy(target, data, bytes_read); + /* zero the tail end of this partial page */ + memset(target + bytes_read, 0, + PAGE_CACHE_SIZE - bytes_read); + bytes_read = 0; + } else { + memcpy(target, data, PAGE_CACHE_SIZE); + bytes_read -= PAGE_CACHE_SIZE; + } + kunmap_atomic(target, KM_USER0); + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + data += PAGE_CACHE_SIZE; + + /* add page to FS-Cache */ + cifs_readpage_to_fscache(mapping->host, page); + } + return; +} + +static int cifs_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned num_pages) +{ + int rc = -EACCES; + int xid; + loff_t offset; + struct page *page; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + unsigned int bytes_read = 0; + unsigned int read_size, i; + char *smb_read_data = NULL; + struct smb_com_read_rsp *pSMBr; + struct cifsFileInfo *open_file; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __u32 pid; + + xid = GetXid(); + if (file->private_data == NULL) { + rc = -EBADF; + FreeXid(xid); + return rc; + } + open_file = file->private_data; + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + pTcon = tlink_tcon(open_file->tlink); + + /* + * Reads as many pages as possible from fscache. 
Returns -ENOBUFS + * immediately if the cookie is negative + */ + rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, + &num_pages); + if (rc == 0) + goto read_complete; + + cFYI(DBG2, "rpages: num pages %d", num_pages); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + for (i = 0; i < num_pages; ) { + unsigned contig_pages; + struct page *tmp_page; + unsigned long expected_index; + + if (list_empty(page_list)) + break; + + page = list_entry(page_list->prev, struct page, lru); + offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + + /* count adjacent pages that we will read into */ + contig_pages = 0; + expected_index = + list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(tmp_page, page_list, lru) { + if (tmp_page->index == expected_index) { + contig_pages++; + expected_index++; + } else + break; + } + if (contig_pages + i > num_pages) + contig_pages = num_pages - i; + + /* for reads over a certain size could initiate async + read ahead */ + + read_size = contig_pages * PAGE_CACHE_SIZE; + /* Read size needs to be in multiples of one page */ + read_size = min_t(const unsigned int, read_size, + cifs_sb->rsize & PAGE_CACHE_MASK); + cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", + read_size, contig_pages); + rc = -EAGAIN; + while (rc == -EAGAIN) { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = offset; + io_parms.length = read_size; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, + &smb_read_data, &buf_type); + /* BB more RC checks ? */ + if (rc == -EAGAIN) { + if (smb_read_data) { + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(smb_read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(smb_read_data); + smb_read_data = NULL; + } + } + } + if ((rc < 0) || (smb_read_data == NULL)) { + cFYI(1, "Read error in readpages: %d", rc); + break; + } else if (bytes_read > 0) { + task_io_account_read(bytes_read); + pSMBr = (struct smb_com_read_rsp *)smb_read_data; + cifs_copy_cache_pages(mapping, page_list, bytes_read, + smb_read_data + 4 /* RFC1001 hdr */ + + le16_to_cpu(pSMBr->DataOffset)); + + i += bytes_read >> PAGE_CACHE_SHIFT; + cifs_stats_bytes_read(pTcon, bytes_read); + if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { + i++; /* account for partial page */ + + /* server copy of file can have smaller size + than client */ + /* BB do we need to verify this common case ? + this case is ok - if we are at server EOF + we will hit it on next read */ + + /* break; */ + } + } else { + cFYI(1, "No bytes read (%d) at offset %lld . " + "Cleaning remaining pages from readahead list", + bytes_read, offset); + /* BB turn off caching and do new lookup on + file size at server? 
*/ + break; + } + if (smb_read_data) { + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(smb_read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(smb_read_data); + smb_read_data = NULL; + } + bytes_read = 0; + } + +/* need to free smb_read_data buf before exit */ + if (smb_read_data) { + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(smb_read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(smb_read_data); + smb_read_data = NULL; + } + +read_complete: + FreeXid(xid); + return rc; +} + +static int cifs_readpage_worker(struct file *file, struct page *page, + loff_t *poffset) +{ + char *read_data; + int rc; + + /* Is the page cached? */ + rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); + if (rc == 0) + goto read_complete; + + page_cache_get(page); + read_data = kmap(page); + /* for reads over a certain size could initiate async read ahead */ + + rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset); + + if (rc < 0) + goto io_error; + else + cFYI(1, "Bytes read %d", rc); + + file->f_path.dentry->d_inode->i_atime = + current_fs_time(file->f_path.dentry->d_inode->i_sb); + + if (PAGE_CACHE_SIZE > rc) + memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); + + flush_dcache_page(page); + SetPageUptodate(page); + + /* send this page to the cache */ + cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); + + rc = 0; + +io_error: + kunmap(page); + page_cache_release(page); + +read_complete: + return rc; +} + +static int cifs_readpage(struct file *file, struct page *page) +{ + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + int rc = -EACCES; + int xid; + + xid = GetXid(); + + if (file->private_data == NULL) { + rc = -EBADF; + FreeXid(xid); + return rc; + } + + cFYI(1, "readpage %p at offset %d 0x%x\n", + page, (int)offset, (int)offset); + + rc = cifs_readpage_worker(file, page, &offset); + + unlock_page(page); + + FreeXid(xid); + return rc; +} + +static int is_inode_writable(struct cifsInodeInfo *cifs_inode) +{ + struct cifsFileInfo *open_file; + + spin_lock(&cifs_file_list_lock); + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { + spin_unlock(&cifs_file_list_lock); + return 1; + } + } + spin_unlock(&cifs_file_list_lock); + return 0; +} + +/* We do not want to update the file size from server for inodes + open for write - to avoid races with writepage extending + the file - in the future we could consider allowing + refreshing the inode only on increases in the file size + but this is tricky to do without racing with writebehind + page caching in the current Linux kernel design */ +bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) +{ + if (!cifsInode) + return true; + + if (is_inode_writable(cifsInode)) { + /* This inode is open for write at least once */ + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + /* since no page cache to corrupt on directio + we can change size safely */ + return true; + } + + if (i_size_read(&cifsInode->vfs_inode) < end_of_file) + return true; + + return false; + } else + return true; +} + +static int cifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + loff_t page_start = pos & PAGE_MASK; + loff_t 
i_size; + struct page *page; + int rc = 0; + + cFYI(1, "write_begin from %lld len %d", (long long)pos, len); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + rc = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) + goto out; + + /* + * If we write a full page it will be up to date, no need to read from + * the server. If the write is short, we'll end up doing a sync write + * instead. + */ + if (len == PAGE_CACHE_SIZE) + goto out; + + /* + * optimize away the read when we have an oplock, and we're not + * expecting to use any of the data we'd be reading in. That + * is, when the page lies beyond the EOF, or straddles the EOF + * and the write will cover all of the existing data. + */ + if (CIFS_I(mapping->host)->clientCanCacheRead) { + i_size = i_size_read(mapping->host); + if (page_start >= i_size || + (offset == 0 && (pos + len) >= i_size)) { + zero_user_segments(page, 0, offset, + offset + len, + PAGE_CACHE_SIZE); + /* + * PageChecked means that the parts of the page + * to which we're not writing are considered up + * to date. Once the data is copied to the + * page, it can be set uptodate. + */ + SetPageChecked(page); + goto out; + } + } + + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + /* + * might as well read a page, it is fast enough. If we get + * an error, we don't need to return it. cifs_write_end will + * do a sync write instead since PG_uptodate isn't set. + */ + cifs_readpage_worker(file, page, &page_start); + } else { + /* we could try using another file handle if there is one - + but how would we lock it to prevent close of that handle + racing with this read? In any case + this will be written out by write_end so is fine */ + } +out: + *pagep = page; + return rc; +} + +static int cifs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return cifs_fscache_release_page(page, gfp); +} + +static void cifs_invalidate_page(struct page *page, unsigned long offset) +{ + struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); + + if (offset == 0) + cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); +} + +static int cifs_launder_page(struct page *page) +{ + int rc = 0; + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + + cFYI(1, "Launder page: %p", page); + + if (clear_page_dirty_for_io(page)) + rc = cifs_writepage_locked(page, &wbc); + + cifs_fscache_invalidate_page(page, page->mapping->host); + return rc; +} + +void cifs_oplock_break(struct work_struct *work) +{ + struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, + oplock_break); + struct inode *inode = cfile->dentry->d_inode; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + if (inode && S_ISREG(inode->i_mode)) { + if (cinode->clientCanCacheRead) + break_lease(inode, O_RDONLY); + else + break_lease(inode, O_WRONLY); + rc = filemap_fdatawrite(inode->i_mapping); + if (cinode->clientCanCacheRead == 0) { + rc = filemap_fdatawait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + invalidate_remote_inode(inode); + } + cFYI(1, "Oplock flush inode %p rc %d", inode, rc); + } + + /* + * releasing stale oplock after recent reconnect of smb session using + * a now incorrect file handle is not a data integrity issue but do + * not bother sending an oplock release if session to server still is + * disconnected since 
oplock already released by the server + */ + if (!cfile->oplock_break_cancelled) { + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, + 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + cinode->clientCanCacheRead ? 1 : 0); + cFYI(1, "Oplock release rc = %d", rc); + } + + /* + * We might have kicked in before is_valid_oplock_break() + * finished grabbing reference for us. Make sure it's done by + * waiting for cifs_file_list_lock. + */ + spin_lock(&cifs_file_list_lock); + spin_unlock(&cifs_file_list_lock); + + cifs_oplock_break_put(cfile); +} + +/* must be called while holding cifs_file_list_lock */ +void cifs_oplock_break_get(struct cifsFileInfo *cfile) +{ + cifs_sb_active(cfile->dentry->d_sb); + cifsFileInfo_get(cfile); +} + +void cifs_oplock_break_put(struct cifsFileInfo *cfile) +{ + struct super_block *sb = cfile->dentry->d_sb; + + cifsFileInfo_put(cfile); + cifs_sb_deactive(sb); +} + +const struct address_space_operations cifs_addr_ops = { + .readpage = cifs_readpage, + .readpages = cifs_readpages, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, + .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, + .launder_page = cifs_launder_page, +}; + +/* + * cifs_readpages requires the server to support a buffer large enough to + * contain the header plus one complete page of data. Otherwise, we need + * to leave cifs_readpages out of the address space operations. + */ +const struct address_space_operations cifs_addr_ops_smallbuf = { + .readpage = cifs_readpage, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, + .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, + .launder_page = cifs_launder_page, +}; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c new file mode 100644 index 00000000..42e5363b --- /dev/null +++ b/fs/cifs/fscache.c @@ -0,0 +1,235 @@ +/* + * fs/cifs/fscache.c - CIFS filesystem cache interface + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +{ + server->fscache = + fscache_acquire_cookie(cifs_fscache_netfs.primary_index, + &cifs_fscache_server_index_def, server); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); +} + +void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +{ + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); + fscache_relinquish_cookie(server->fscache, 0); + server->fscache = NULL; +} + +void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) +{ + struct TCP_Server_Info *server = tcon->ses->server; + + tcon->fscache = + fscache_acquire_cookie(server->fscache, + &cifs_fscache_super_index_def, tcon); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, + tcon->fscache); +} + +void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) +{ + cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); + fscache_relinquish_cookie(tcon->fscache, 0); + tcon->fscache = NULL; +} + +static void cifs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + if (cifsi->fscache) + return; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { + cifsi->fscache = fscache_acquire_cookie(tcon->fscache, + &cifs_fscache_inode_object_def, cifsi); + cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, + tcon->fscache, cifsi->fscache); + } +} + +void cifs_fscache_release_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 0); + cifsi->fscache = NULL; + } +} + +static void cifs_fscache_disable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_uncache_all_inode_pages(cifsi->fscache, inode); + fscache_relinquish_cookie(cifsi->fscache, 1); + cifsi->fscache = NULL; + } +} + +void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + cifs_fscache_disable_inode_cookie(inode); + else + cifs_fscache_enable_inode_cookie(inode); +} + +void cifs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct fscache_cookie *old = cifsi->fscache; + + if (cifsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(cifsi->fscache, 1); + + cifsi->fscache = fscache_acquire_cookie( + cifs_sb_master_tcon(cifs_sb)->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", + __func__, cifsi->fscache, old); + } +} + +int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct inode *inode = page->mapping->host; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, + cifsi->fscache); + if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) + return 0; + } + + return 1; 
+} + +static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + cFYI(1, "%s: (0x%p/%d)", __func__, page, error); + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, + CIFS_I(inode)->fscache, page, inode); + ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, + cifs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + + case 0: /* page found in fscache, read submitted */ + cFYI(1, "%s: submitted", __func__); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + cFYI(1, "%s: %d", __func__, ret); + return 1; + + default: + cERROR(1, "unknown error ret = %d", ret); + } + return ret; +} + +/* + * Retrieve a set of pages from FS-Cache + */ +int __cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + + cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, + CIFS_I(inode)->fscache, *nr_pages, inode); + ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, + pages, nr_pages, + cifs_readpage_from_fscache_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + cFYI(1, "%s: submitted", __func__); + return ret; + + case -ENOBUFS: /* some pages are not cached and can't be */ + case -ENODATA: /* some pages are not cached */ + cFYI(1, "%s: no page", __func__); + return 1; + + default: + cFYI(1, "unknown error ret = %d", ret); + } + + return ret; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, + CIFS_I(inode)->fscache, page, inode); + ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + if (ret != 0) + fscache_uncache_page(CIFS_I(inode)->fscache, page); +} + +void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); + fscache_wait_on_page_write(cookie, page); + fscache_uncache_page(cookie, page); +} + diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h new file mode 100644 index 00000000..63539323 --- /dev/null +++ b/fs/cifs/fscache.h @@ -0,0 +1,136 @@ +/* + * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFS_FSCACHE_H +#define _CIFS_FSCACHE_H + +#include + +#include "cifsglob.h" + +#ifdef CONFIG_CIFS_FSCACHE + +extern struct fscache_netfs cifs_fscache_netfs; +extern const struct fscache_cookie_def cifs_fscache_server_index_def; +extern const struct fscache_cookie_def cifs_fscache_super_index_def; +extern const struct fscache_cookie_def cifs_fscache_inode_object_def; + +extern int cifs_fscache_register(void); +extern void cifs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); +extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); + +extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void cifs_fscache_reset_inode_cookie(struct inode *); + +extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); +extern int __cifs_readpage_from_fscache(struct inode *, struct page *); +extern int __cifs_readpages_from_fscache(struct inode *, + struct address_space *, + struct list_head *, + unsigned *); + +extern void __cifs_readpage_to_fscache(struct inode *, struct page *); + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __cifs_fscache_invalidate_page(page, inode); +} + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpage_from_fscache(inode, page); + + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpages_from_fscache(inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_readpage_to_fscache(inode, page); +} + +#else /* CONFIG_CIFS_FSCACHE */ +static inline int cifs_fscache_register(void) { return 0; } +static inline void cifs_fscache_unregister(void) {} + +static inline void +cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} +static inline void +cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} +static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {} +static inline void +cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} + +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* May release page */ +} + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode 
*inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) {} + +#endif /* CONFIG_CIFS_FSCACHE */ + +#endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c new file mode 100644 index 00000000..745e5cdc --- /dev/null +++ b/fs/cifs/inode.c @@ -0,0 +1,2272 @@ +/* + * fs/cifs/inode.c + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "fscache.h" + + +static void cifs_set_ops(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &cifs_file_inode_ops; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_direct_nobrl_ops; + else + inode->i_fop = &cifs_file_direct_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_strict_nobrl_ops; + else + inode->i_fop = &cifs_file_strict_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_nobrl_ops; + else { /* not direct, send byte range locks */ + inode->i_fop = &cifs_file_ops; + } + + /* check if server can support readpages */ + if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) + inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + inode->i_data.a_ops = &cifs_addr_ops; + break; + case S_IFDIR: +#ifdef CONFIG_CIFS_DFS_UPCALL + if (IS_AUTOMOUNT(inode)) { + inode->i_op = &cifs_dfs_referral_inode_operations; + } else { +#else /* NO DFS support, treat as a directory */ + { +#endif + inode->i_op = &cifs_dir_inode_ops; + inode->i_fop = &cifs_dir_ops; + } + break; + case S_IFLNK: + inode->i_op = &cifs_symlink_inode_ops; + break; + default: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} + +/* check inode attributes against fattr. 
If they don't match, tag the + * inode for cache invalidation + */ +static void +cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + + cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid); + + if (inode->i_state & I_NEW) { + cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid); + return; + } + + /* don't bother with revalidation if we have an oplock */ + if (cifs_i->clientCanCacheRead) { + cFYI(1, "%s: inode %llu is oplocked", __func__, + cifs_i->uniqueid); + return; + } + + /* revalidate if mtime or size have changed */ + if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && + cifs_i->server_eof == fattr->cf_eof) { + cFYI(1, "%s: inode %llu is unchanged", __func__, + cifs_i->uniqueid); + return; + } + + cFYI(1, "%s: invalidating inode %llu mapping", __func__, + cifs_i->uniqueid); + cifs_i->invalid_mapping = true; +} + +/* populate an inode with info from a cifs_fattr struct */ +void +cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + unsigned long oldtime = cifs_i->time; + + cifs_revalidate_cache(inode, fattr); + + inode->i_atime = fattr->cf_atime; + inode->i_mtime = fattr->cf_mtime; + inode->i_ctime = fattr->cf_ctime; + inode->i_rdev = fattr->cf_rdev; + inode->i_nlink = fattr->cf_nlink; + inode->i_uid = fattr->cf_uid; + inode->i_gid = fattr->cf_gid; + + /* if dynperm is set, don't clobber existing mode */ + if (inode->i_state & I_NEW || + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) + inode->i_mode = fattr->cf_mode; + + cifs_i->cifsAttrs = fattr->cf_cifsattrs; + + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + cifs_i->time = 0; + else + cifs_i->time = jiffies; + + cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode, + oldtime, cifs_i->time); + + cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; + + cifs_i->server_eof = fattr->cf_eof; + /* + * Can't safely change the file size here if the client is writing to + * it due to potential races. + */ + spin_lock(&inode->i_lock); + if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) { + i_size_write(inode, fattr->cf_eof); + + /* + * i_blocks is not related to (i_size / i_blksize), + * but instead 512 byte (2**9) size is required for + * calculating num blocks. + */ + inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; + } + spin_unlock(&inode->i_lock); + + if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + inode->i_flags |= S_AUTOMOUNT; + cifs_set_ops(inode); +} + +void +cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + return; + + fattr->cf_uniqueid = iunique(sb, ROOT_I); +} + +/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. 
*/ +void +cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_uniqueid = le64_to_cpu(info->UniqueId); + fattr->cf_bytes = le64_to_cpu(info->NumOfBytes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange); + fattr->cf_mode = le64_to_cpu(info->Permissions); + + /* + * Since we set the inode type below we need to mask off + * to avoid strange results if bits set above. + */ + fattr->cf_mode &= ~S_IFMT; + switch (le32_to_cpu(info->Type)) { + case UNIX_FILE: + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + break; + case UNIX_SYMLINK: + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + case UNIX_DIR: + fattr->cf_mode |= S_IFDIR; + fattr->cf_dtype = DT_DIR; + break; + case UNIX_CHARDEV: + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); + break; + case UNIX_BLOCKDEV: + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); + break; + case UNIX_FIFO: + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + break; + case UNIX_SOCKET: + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; + break; + default: + /* safest to call it a file if we do not know */ + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + cFYI(1, "unknown type %d", le32_to_cpu(info->Type)); + break; + } + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + fattr->cf_uid = cifs_sb->mnt_uid; + else + fattr->cf_uid = le64_to_cpu(info->Uid); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + fattr->cf_gid = cifs_sb->mnt_gid; + else + fattr->cf_gid = le64_to_cpu(info->Gid); + + fattr->cf_nlink = le64_to_cpu(info->Nlinks); +} + +/* + * Fill a cifs_fattr struct with fake inode info. + * + * Needed to setup cifs_fattr data for the directory which is the + * junction to the new submount (ie to setup the fake directory + * which represents a DFS referral). 
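+ * The CIFS_FATTR_DFS_REFERRAL flag set below causes cifs_fattr_to_inode() + * to mark the resulting inode S_AUTOMOUNT.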
+ */ +static void +cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, "creating fake fattr for DFS referral"); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + fattr->cf_atime = CURRENT_TIME; + fattr->cf_ctime = CURRENT_TIME; + fattr->cf_mtime = CURRENT_TIME; + fattr->cf_nlink = 2; + fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; +} + +int cifs_get_file_info_unix(struct file *filp) +{ + int rc; + int xid; + FILE_UNIX_BASIC_INFO find_data; + struct cifs_fattr fattr; + struct inode *inode = filp->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + xid = GetXid(); + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + } + + cifs_fattr_to_inode(inode, &fattr); + FreeXid(xid); + return rc; +} + +int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *full_path, + struct super_block *sb, int xid) +{ + int rc; + FILE_UNIX_BASIC_INFO find_data; + struct cifs_fattr fattr; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, "Getting info on %s", full_path); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + /* could have done a find first instead but this returns more info */ + rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else { + return rc; + } + + /* check for Minshall+French symlinks */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + } + + if (*pinode == NULL) { + /* get new inode */ + cifs_fill_uniqueid(sb, &fattr); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) + rc = -ENOMEM; + } else { + /* we already have inode, update it */ + cifs_fattr_to_inode(*pinode, &fattr); + } + + return rc; +} + +static int +cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) +{ + int rc; + int oplock = 0; + __u16 netfid; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct cifs_io_parms io_parms; + char buf[24]; + unsigned int bytes_read; + char *pbuf; + + pbuf = buf; + + fattr->cf_mode &= ~S_IFMT; + + if (fattr->cf_eof == 0) { + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + return 0; + } else if (fattr->cf_eof < 8) { + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + return -EINVAL; /* EOPNOTSUPP? 
*/ + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + int buf_type = CIFS_NO_BUFFER; + /* Read header */ + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = 24; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, + &buf_type); + if ((rc == 0) && (bytes_read >= 8)) { + if (memcmp("IntxBLK", pbuf, 8) == 0) { + cFYI(1, "Block device"); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + cFYI(1, "Char device"); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cFYI(1, "Symlink"); + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + } else { + fattr->cf_mode |= S_IFREG; /* file? */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; + } + } else { + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; /* or some unknown SFU type */ + } + CIFSSMBClose(xid, tcon, netfid); + } + cifs_put_tlink(tlink); + return rc; +} + +#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */ + +/* + * Fetch mode bits as provided by SFU. + * + * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ? 
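+ * Note: only the SFBITS_MASK bits are replaced below, so the S_IFMT type + * bits set by cifs_sfu_type() survive.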
+ */ +static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) +{ +#ifdef CONFIG_CIFS_XATTR + ssize_t rc; + char ea_value[4]; + __u32 mode; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", + ea_value, 4 /* size of buf */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + if (rc < 0) + return (int)rc; + else if (rc > 3) { + mode = le32_to_cpu(*((__le32 *)ea_value)); + fattr->cf_mode &= ~SFBITS_MASK; + cFYI(1, "special bits 0%o org mode 0%o", mode, + fattr->cf_mode); + fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; + cFYI(1, "special mode bits 0%o", mode); + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ +static void +cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, + struct cifs_sb_info *cifs_sb, bool adjust_tz) +{ + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->Attributes); + if (info->DeletePending) + fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING; + + if (info->LastAccessTime) + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + else + fattr->cf_atime = CURRENT_TIME; + + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + if (adjust_tz) { + fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; + fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; + } + + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + + /* clear write bits if ATTR_READONLY is set */ + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~(S_IWUGO); + } + + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; +} + +int cifs_get_file_info(struct file *filp) +{ + int rc; + int xid; + FILE_ALL_INFO find_data; + struct cifs_fattr fattr; + struct inode *inode = filp->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + xid = GetXid(); + rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: + /* + * FIXME: legacy server -- fall back to path-based call? + * for now, just skip revalidating and mark inode for + * immediate reval. + */ + rc = 0; + CIFS_I(inode)->time = 0; + default: + goto cgfi_exit; + } + + /* + * don't bother with SFU junk here -- just mark inode as needing + * revalidation. 
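+ * Setting CIFS_FATTR_NEED_REVAL below makes cifs_fattr_to_inode() zero + * cifs_i->time, which forces revalidation on the next lookup.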
+ */ + fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; + cifs_fattr_to_inode(inode, &fattr); +cgfi_exit: + FreeXid(xid); + return rc; +} + +int cifs_get_inode_info(struct inode **pinode, + const unsigned char *full_path, FILE_ALL_INFO *pfindData, + struct super_block *sb, int xid, const __u16 *pfid) +{ + int rc = 0, tmprc; + struct cifs_tcon *pTcon; + struct tcon_link *tlink; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + char *buf = NULL; + bool adjustTZ = false; + struct cifs_fattr fattr; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + cFYI(1, "Getting info on %s", full_path); + + if ((pfindData == NULL) && (*pinode != NULL)) { + if (CIFS_I(*pinode)->clientCanCacheRead) { + cFYI(1, "No need to revalidate cached inode sizes"); + goto cgii_exit; + } + } + + /* if file info not passed in then get it from server */ + if (pfindData == NULL) { + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + pfindData = (FILE_ALL_INFO *)buf; + + /* could do find first instead but this returns more info */ + rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData, + 0 /* not legacy */, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + /* BB optimize code so we do not make the above call + when server claims no NT SMB support and the above call + failed at least once - set flag in tcon or mount */ + if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { + rc = SMBQueryInformation(xid, pTcon, full_path, + pfindData, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + adjustTZ = true; + } + } + + if (!rc) { + cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData, + cifs_sb, adjustTZ); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else { + goto cgii_exit; + } + + /* + * If an inode wasn't passed in, then get the inode number + * + * Is an i_ino of zero legal? Can we use that to check if the server + * supports returning inode numbers? Are there other sanity checks we + * can use to ensure that the server is really filling in that field? + * + * We can not use the IndexNumber field by default from Windows or + * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA + * CIFS spec claims that this value is unique within the scope of a + * share, and the windows docs hint that it's actually unique + * per-machine. + * + * There may be higher info levels that work but are there Windows + * server or network appliances for which IndexNumber field is not + * guaranteed unique? 
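+ * If CIFSGetSrvInodeNumber() fails or returns zero below, the code falls + * back to iunique() and calls cifs_autodisable_serverino() to stop + * requesting server inode numbers on this mount.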
+ */ + if (*pinode == NULL) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + int rc1 = 0; + + rc1 = CIFSGetSrvInodeNumber(xid, pTcon, + full_path, &fattr.cf_uniqueid, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc1 || !fattr.cf_uniqueid) { + cFYI(1, "GetSrvInodeNum rc %d", rc1); + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + } else { + fattr.cf_uniqueid = iunique(sb, ROOT_I); + } + } else { + fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid; + } + + /* query for SFU type info if supported and needed */ + if (fattr.cf_cifsattrs & ATTR_SYSTEM && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, "cifs_sfu_type failed: %d", tmprc); + } + +#ifdef CONFIG_CIFS_ACL + /* fill in 0777 bits from ACL */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, + pfid); + if (rc) { + cFYI(1, "%s: Getting ACL failed with error: %d", + __func__, rc); + goto cgii_exit; + } + } +#endif /* CONFIG_CIFS_ACL */ + + /* fill in remaining high mode bits e.g. SUID, VTX */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); + + /* check for Minshall+French symlinks */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + } + + if (!*pinode) { + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) + rc = -ENOMEM; + } else { + cifs_fattr_to_inode(*pinode, &fattr); + } + +cgii_exit: + kfree(buf); + cifs_put_tlink(tlink); + return rc; +} + +static const struct inode_operations cifs_ipc_inode_ops = { + .lookup = cifs_lookup, +}; + +char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon) +{ + int pplen = vol->prepath ? 
strlen(vol->prepath) : 0; + int dfsplen; + char *full_path = NULL; + + /* if no prefix path, simply set path to the root of share to "" */ + if (pplen == 0) { + full_path = kmalloc(1, GFP_KERNEL); + if (full_path) + full_path[0] = 0; + return full_path; + } + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; + + full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + + if (dfsplen) + strncpy(full_path, tcon->treeName, dfsplen); + strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + full_path[dfsplen + pplen] = 0; /* add trailing null */ + return full_path; +} + +static int +cifs_find_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + /* don't match inode with different uniqueid */ + if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) + return 0; + + /* use createtime like an i_generation field */ + if (CIFS_I(inode)->createtime != fattr->cf_createtime) + return 0; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + return 0; + + /* if it's not a directory or has no dentries, then flag it */ + if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) + fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; + + return 1; +} + +static int +cifs_init_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; + CIFS_I(inode)->createtime = fattr->cf_createtime; + return 0; +} + +/* + * walk dentry list for an inode and report whether it has aliases that + * are hashed. We use this to determine if a directory inode can actually + * be used. + */ +static bool +inode_has_hashed_dentries(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + if (!d_unhashed(dentry) || IS_ROOT(dentry)) { + spin_unlock(&inode->i_lock); + return true; + } + } + spin_unlock(&inode->i_lock); + return false; +} + +/* Given fattrs, get a corresponding inode */ +struct inode * +cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) +{ + unsigned long hash; + struct inode *inode; + +retry_iget5_locked: + cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid); + + /* hash down to 32-bits on 32-bit arch */ + hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); + + inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); + if (inode) { + /* was there a potentially problematic inode collision? 
*/ + if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { + fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; + + if (inode_has_hashed_dentries(inode)) { + cifs_autodisable_serverino(CIFS_SB(sb)); + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + goto retry_iget5_locked; + } + } + + cifs_fattr_to_inode(inode, fattr); + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + if (inode->i_state & I_NEW) { + inode->i_ino = hash; + if (S_ISREG(inode->i_mode)) + inode->i_data.backing_dev_info = sb->s_bdi; +#ifdef CONFIG_CIFS_FSCACHE + /* initialize per-inode cache cookie pointer */ + CIFS_I(inode)->fscache = NULL; +#endif + unlock_new_inode(inode); + } + } + + return inode; +} + +/* gets root inode */ +struct inode *cifs_root_iget(struct super_block *sb) +{ + int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct inode *inode = NULL; + long rc; + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + xid = GetXid(); + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, "", sb, xid); + else + rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } + +#ifdef CONFIG_CIFS_FSCACHE + /* populate tcon->resource_id */ + tcon->resource_id = CIFS_I(inode)->uniqueid; +#endif + + if (rc && tcon->ipc) { + cFYI(1, "ipc connection - fake read inode"); + inode->i_mode |= S_IFDIR; + inode->i_nlink = 2; + inode->i_op = &cifs_ipc_inode_ops; + inode->i_fop = &simple_dir_operations; + inode->i_uid = cifs_sb->mnt_uid; + inode->i_gid = cifs_sb->mnt_gid; + } else if (rc) { + iget_failed(inode); + inode = ERR_PTR(rc); + } + +out: + /* can not call macro FreeXid here since in a void func + * TODO: This is no longer true + */ + _FreeXid(xid); + return inode; +} + +static int +cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid, + char *full_path, __u32 dosattr) +{ + int rc; + int oplock = 0; + __u16 netfid; + __u32 netpid; + bool set_time = false; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *pTcon; + FILE_BASIC_INFO info_buf; + + if (attrs == NULL) + return -EINVAL; + + if (attrs->ia_valid & ATTR_ATIME) { + set_time = true; + info_buf.LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + } else + info_buf.LastAccessTime = 0; + + if (attrs->ia_valid & ATTR_MTIME) { + set_time = true; + info_buf.LastWriteTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + } else + info_buf.LastWriteTime = 0; + + /* + * Samba throws this field away, but windows may actually use it. + * Do not set ctime unless other time stamps are changed explicitly + * (i.e. by utimes()) since we would then have a mix of client and + * server times. 
+ */ + if (set_time && (attrs->ia_valid & ATTR_CTIME)) { + cFYI(1, "CIFS - CTIME changed"); + info_buf.ChangeTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + } else + info_buf.ChangeTime = 0; + + info_buf.CreationTime = 0; /* don't change */ + info_buf.Attributes = cpu_to_le32(dosattr); + + /* + * If the file is already open for write, just use that fileid + */ + open_file = find_writable_file(cifsInode, true); + if (open_file) { + netfid = open_file->netfid; + netpid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + goto set_via_filehandle; + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + pTcon = tlink_tcon(tlink); + + /* + * NT4 apparently returns success on this call, but it doesn't + * really work. + */ + if (!(pTcon->ses->flags & CIFS_SES_NT4)) { + rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, + &info_buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + cifsInode->cifsAttrs = dosattr; + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) + goto out; + } + + cFYI(1, "calling SetFileInfo since SetPathInfo for " + "times not supported by this server"); + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, + SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + CREATE_NOT_DIR, &netfid, &oplock, + NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc != 0) { + if (rc == -EIO) + rc = -EINVAL; + goto out; + } + + netpid = current->tgid; + +set_via_filehandle: + rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); + if (!rc) + cifsInode->cifsAttrs = dosattr; + + if (open_file == NULL) + CIFSSMBClose(xid, pTcon, netfid); + else + cifsFileInfo_put(open_file); +out: + if (tlink != NULL) + cifs_put_tlink(tlink); + return rc; +} + +/* + * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit + * and rename it to a random name that hopefully won't conflict with + * anything else. 
+ */ +static int +cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid) +{ + int oplock = 0; + int rc; + __u16 netfid; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + __u32 dosattr, origattr; + FILE_BASIC_INFO *info_buf = NULL; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, + &netfid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) + goto out; + + origattr = cifsInode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + /* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */ + if (dosattr != origattr) { + info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); + if (info_buf == NULL) { + rc = -ENOMEM; + goto out_close; + } + info_buf->Attributes = cpu_to_le32(dosattr); + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + current->tgid); + /* although we would like to mark the file hidden + if that fails we will still try to rename it */ + if (rc == 0) + cifsInode->cifsAttrs = dosattr; + else + dosattr = origattr; /* since not able to change them */ + } + + /* rename the file */ + rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) { + rc = -ETXTBSY; + goto undo_setattr; + } + + /* try to set DELETE_ON_CLOSE */ + if (!cifsInode->delete_pending) { + rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, + current->tgid); + /* + * some samba versions return -ENOENT when we try to set the + * file disposition here. Likely a samba bug, but work around + * it for now. This means that some cifsXXX files may hang + * around after they shouldn't. + * + * BB: remove this hack after more servers have the fix + */ + if (rc == -ENOENT) + rc = 0; + else if (rc != 0) { + rc = -ETXTBSY; + goto undo_rename; + } + cifsInode->delete_pending = true; + } + +out_close: + CIFSSMBClose(xid, tcon, netfid); +out: + kfree(info_buf); + cifs_put_tlink(tlink); + return rc; + + /* + * reset everything back to the original state. Don't bother + * dealing with errors here since we can't do anything about + * them anyway. + */ +undo_rename: + CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +undo_setattr: + if (dosattr != origattr) { + info_buf->Attributes = cpu_to_le32(origattr); + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + current->tgid)) + cifsInode->cifsAttrs = origattr; + } + + goto out_close; +} + + +/* + * If dentry->d_inode is null (usually meaning the cached dentry + * is a negative dentry) then we would attempt a standard SMB delete, but + * if that fails we can not attempt the fall back mechanisms on EACCESS + * but will return the EACCESS to the caller. Note that the VFS does not call + * unlink on negative dentries currently.
+ */ +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + int xid; + char *full_path = NULL; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *cifs_inode; + struct super_block *sb = dir->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct iattr *attrs = NULL; + __u32 dosattr = 0, origattr = 0; + + cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = GetXid(); + + /* Unlink can be called from rename so we can not take the + * sb->s_vfs_rename_mutex here */ + full_path = build_path_from_dentry(dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto unlink_out; + } + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = CIFSPOSIXDelFile(xid, tcon, full_path, + SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "posix del rc %d", rc); + if ((rc == 0) || (rc == -ENOENT)) + goto psx_del_no_retry; + } + +retry_std_delete: + rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + +psx_del_no_retry: + if (!rc) { + if (inode) + drop_nlink(inode); + } else if (rc == -ENOENT) { + d_drop(dentry); + } else if (rc == -ETXTBSY) { + rc = cifs_rename_pending_delete(full_path, dentry, xid); + if (rc == 0) + drop_nlink(inode); + } else if ((rc == -EACCES) && (dosattr == 0) && inode) { + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (attrs == NULL) { + rc = -ENOMEM; + goto out_reval; + } + + /* try to reset dos attributes */ + cifs_inode = CIFS_I(inode); + origattr = cifs_inode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + if (rc != 0) + goto out_reval; + + goto retry_std_delete; + } + + /* undo the setattr if we errored out and it's needed */ + if (rc != 0 && dosattr != 0) + cifs_set_file_info(inode, attrs, xid, full_path, origattr); + +out_reval: + if (inode) { + cifs_inode = CIFS_I(inode); + cifs_inode->time = 0; /* will force revalidate to get info + when needed */ + inode->i_ctime = current_fs_time(sb); + } + dir->i_ctime = dir->i_mtime = current_fs_time(sb); + cifs_inode = CIFS_I(dir); + CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ +unlink_out: + kfree(full_path); + kfree(attrs); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) +{ + int rc = 0, tmprc; + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct inode *newinode = NULL; + struct cifs_fattr fattr; + + cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto mkdir_out; + } + + if ((pTcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(pTcon->fsUnixInfo.Capability))) { + u32 oplock = 0; + FILE_UNIX_BASIC_INFO *pInfo = + 
kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (pInfo == NULL) { + rc = -ENOMEM; + goto mkdir_out; + } + + mode &= ~current_umask(); + rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, + mode, NULL /* netfid */, pInfo, &oplock, + full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == -EOPNOTSUPP) { + kfree(pInfo); + goto mkdir_retry_old; + } else if (rc) { + cFYI(1, "posix mkdir returned 0x%x", rc); + d_drop(direntry); + } else { + if (pInfo->Type == cpu_to_le32(-1)) { + /* no return info, go query for it */ + kfree(pInfo); + goto mkdir_get_info; + } +/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need + to set uid/gid */ + inc_nlink(inode); + + cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); + cifs_fill_uniqueid(inode->i_sb, &fattr); + newinode = cifs_iget(inode->i_sb, &fattr); + if (!newinode) { + kfree(pInfo); + goto mkdir_get_info; + } + + d_instantiate(direntry, newinode); + +#ifdef CONFIG_CIFS_DEBUG2 + cFYI(1, "instantiated dentry %p %s to inode %p", + direntry, direntry->d_name.name, newinode); + + if (newinode->i_nlink != 2) + cFYI(1, "unexpected number of links %d", + newinode->i_nlink); +#endif + } + kfree(pInfo); + goto mkdir_out; + } +mkdir_retry_old: + /* BB add setting the equivalent of mode via CreateX w/ACLs */ + rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cFYI(1, "cifs_mkdir returned 0x%x", rc); + d_drop(direntry); + } else { +mkdir_get_info: + inc_nlink(inode); + if (pTcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else + rc = cifs_get_inode_info(&newinode, full_path, NULL, + inode->i_sb, xid, NULL); + + d_instantiate(direntry, newinode); + /* setting nlink not necessary except in cases where we + * failed to get it from the server or was set bogus */ + if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) + direntry->d_inode->i_nlink = 2; + + mode &= ~current_umask(); + /* must turn on setgid bit if parent dir has it */ + if (inode->i_mode & S_ISGID) + mode |= S_ISGID; + + if (pTcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64)current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64)inode->i_gid; + else + args.gid = (__u64)current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + } else { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + (mode & S_IWUGO) == 0) { + FILE_BASIC_INFO pInfo; + struct cifsInodeInfo *cifsInode; + u32 dosattrs; + + memset(&pInfo, 0, sizeof(pInfo)); + cifsInode = CIFS_I(newinode); + dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; + pInfo.Attributes = cpu_to_le32(dosattrs); + tmprc = CIFSSMBSetPathInfo(xid, pTcon, + full_path, &pInfo, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc == 0) + cifsInode->cifsAttrs = dosattrs; + } + if (direntry->d_inode) { + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_DYNPERM) + direntry->d_inode->i_mode = + (mode | S_IFDIR); + + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_SET_UID) { + direntry->d_inode->i_uid = + current_fsuid(); + if (inode->i_mode & S_ISGID) + 
direntry->d_inode->i_gid = + inode->i_gid; + else + direntry->d_inode->i_gid = + current_fsgid(); + } + } + } + } +mkdir_out: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +int cifs_rmdir(struct inode *inode, struct dentry *direntry) +{ + int rc = 0; + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct cifsInodeInfo *cifsInode; + + cFYI(1, "cifs_rmdir, inode = 0x%p", inode); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto rmdir_exit; + } + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto rmdir_exit; + } + pTcon = tlink_tcon(tlink); + + rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + + if (!rc) { + drop_nlink(inode); + spin_lock(&direntry->d_inode->i_lock); + i_size_write(direntry->d_inode, 0); + clear_nlink(direntry->d_inode); + spin_unlock(&direntry->d_inode->i_lock); + } + + cifsInode = CIFS_I(direntry->d_inode); + cifsInode->time = 0; /* force revalidate to go get info when + needed */ + + cifsInode = CIFS_I(inode); + cifsInode->time = 0; /* force revalidate to get parent dir info + since cached search results now invalid */ + + direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + +rmdir_exit: + kfree(full_path); + FreeXid(xid); + return rc; +} + +static int +cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, + struct dentry *to_dentry, const char *toPath) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + __u16 srcfid; + int oplock, rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + /* try path-based rename first */ + rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + /* + * don't bother with rename by filehandle unless file is busy and + * source Note that cross directory moves do not work with + * rename by filehandle to various Windows servers. 
+ */ + if (rc == 0 || rc != -ETXTBSY) + goto do_rename_exit; + + /* open-file renames don't work across directories */ + if (to_dentry->d_parent != from_dentry->d_parent) + goto do_rename_exit; + + /* open the file to be renamed -- we need DELETE perms */ + rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, + CREATE_NOT_DIR, &srcfid, &oplock, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == 0) { + rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid, + (const char *) to_dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + CIFSSMBClose(xid, pTcon, srcfid); + } +do_rename_exit: + cifs_put_tlink(tlink); + return rc; +} + +int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry) +{ + char *fromName = NULL; + char *toName = NULL; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + FILE_UNIX_BASIC_INFO *info_buf_source = NULL; + FILE_UNIX_BASIC_INFO *info_buf_target; + int xid, rc, tmprc; + + cifs_sb = CIFS_SB(source_dir->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = GetXid(); + + /* + * we already have the rename sem so we do not need to + * grab it again here to protect the path integrity + */ + fromName = build_path_from_dentry(source_dentry); + if (fromName == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + toName = build_path_from_dentry(target_dentry); + if (toName == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + rc = cifs_do_rename(xid, source_dentry, fromName, + target_dentry, toName); + + if (rc == -EEXIST && tcon->unix_ext) { + /* + * Are src and dst hardlinks of same inode? We can + * only tell with unix extensions enabled + */ + info_buf_source = + kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + GFP_KERNEL); + if (info_buf_source == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + info_buf_target = info_buf_source + 1; + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, + info_buf_source, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc != 0) + goto unlink_target; + + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName, + info_buf_target, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (tmprc == 0 && (info_buf_source->UniqueId == + info_buf_target->UniqueId)) { + /* same file, POSIX says that this is a noop */ + rc = 0; + goto cifs_rename_exit; + } + } /* else ... 
BB we could add the same check for Windows by + checking the UniqueId via FILE_INTERNAL_INFO */ + +unlink_target: + /* Try unlinking the target dentry if it's not negative */ + if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { + tmprc = cifs_unlink(target_dir, target_dentry); + if (tmprc) + goto cifs_rename_exit; + + rc = cifs_do_rename(xid, source_dentry, fromName, + target_dentry, toName); + } + +cifs_rename_exit: + kfree(info_buf_source); + kfree(fromName); + kfree(toName); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +static bool +cifs_inode_needs_reval(struct inode *inode) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_i->clientCanCacheRead) + return false; + + if (!lookupCacheEnabled) + return true; + + if (cifs_i->time == 0) + return true; + + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->actimeo)) + return true; + + /* hardlinked files w/ noserverino get "special" treatment */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + S_ISREG(inode->i_mode) && inode->i_nlink != 1) + return true; + + return false; +} + +/* + * Zap the cache. Called when invalid_mapping flag is set. + */ +int +cifs_invalidate_mapping(struct inode *inode) +{ + int rc = 0; + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + + cifs_i->invalid_mapping = false; + + if (inode->i_mapping && inode->i_mapping->nrpages != 0) { + rc = invalidate_inode_pages2(inode->i_mapping); + if (rc) { + cERROR(1, "%s: could not invalidate inode %p", __func__, + inode); + cifs_i->invalid_mapping = true; + } + } + + cifs_fscache_reset_inode_cookie(inode); + return rc; +} + +int cifs_revalidate_file_attr(struct file *filp) +{ + int rc = 0; + struct inode *inode = filp->f_path.dentry->d_inode; + struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + + if (!cifs_inode_needs_reval(inode)) + return rc; + + if (tlink_tcon(cfile->tlink)->unix_ext) + rc = cifs_get_file_info_unix(filp); + else + rc = cifs_get_file_info(filp); + + return rc; +} + +int cifs_revalidate_dentry_attr(struct dentry *dentry) +{ + int xid; + int rc = 0; + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_sb; + char *full_path = NULL; + + if (inode == NULL) + return -ENOENT; + + if (!cifs_inode_needs_reval(inode)) + return rc; + + xid = GetXid(); + + /* can not safely grab the rename sem here if rename calls revalidate + since that would deadlock */ + full_path = build_path_from_dentry(dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time " + "%ld jiffies %ld", full_path, inode, inode->i_count.counter, + dentry, dentry->d_time, jiffies); + + if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); + else + rc = cifs_get_inode_info(&inode, full_path, NULL, sb, + xid, NULL); + +out: + kfree(full_path); + FreeXid(xid); + return rc; +} + +int cifs_revalidate_file(struct file *filp) +{ + int rc; + struct inode *inode = filp->f_path.dentry->d_inode; + + rc = cifs_revalidate_file_attr(filp); + if (rc) + return rc; + + if (CIFS_I(inode)->invalid_mapping) + rc = cifs_invalidate_mapping(inode); + return rc; +} + +/* revalidate a dentry's inode attributes */ +int cifs_revalidate_dentry(struct dentry *dentry) +{ + int rc; + struct inode *inode = dentry->d_inode; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if 
(CIFS_I(inode)->invalid_mapping) + rc = cifs_invalidate_mapping(inode); + return rc; +} + +int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct inode *inode = dentry->d_inode; + int rc; + + /* + * We need to be sure that all dirty pages are written and the server + * has actual ctime, mtime and file length. + */ + if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + generic_fillattr(inode, stat); + stat->blksize = CIFS_MAX_MSGSIZE; + stat->ino = CIFS_I(inode)->uniqueid; + + /* + * If on a multiuser mount without unix extensions, and the admin hasn't + * overridden them, set the ownership to the fsuid/fsgid of the current + * process. + */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && + !tcon->unix_ext) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) + stat->uid = current_fsuid(); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) + stat->gid = current_fsgid(); + } + return rc; +} + +static int cifs_truncate_page(struct address_space *mapping, loff_t from) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE - 1); + struct page *page; + int rc = 0; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + unlock_page(page); + page_cache_release(page); + return rc; +} + +static void cifs_setsize(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +} + +static int +cifs_set_file_size(struct inode *inode, struct iattr *attrs, + int xid, char *full_path) +{ + int rc; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *pTcon = NULL; + struct cifs_io_parms io_parms; + + /* + * To avoid spurious oplock breaks from server, in the case of + * inodes that we already have open, avoid doing path based + * setting of file size if we can do it by handle. 
+ * This keeps our caching token (oplock) and avoids timeouts + * when the local oplock break takes longer to flush + * writebehind data than the SMB timeout for the SetPathInfo + * request would allow + */ + open_file = find_writable_file(cifsInode, true); + if (open_file) { + __u16 nfid = open_file->netfid; + __u32 npid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, + npid, false); + cifsFileInfo_put(open_file); + cFYI(1, "SetFSize for attrs rc = %d", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + unsigned int bytes_written; + + io_parms.netfid = nfid; + io_parms.pid = npid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, + NULL, NULL, 1); + cFYI(1, "Wrt seteof rc %d", rc); + } + } else + rc = -EINVAL; + + if (rc != 0) { + if (pTcon == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + } + + /* Set file size by pathname rather than by handle + either because no valid, writeable file handle for + it was found or because there was an error setting + it by handle */ + rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, + false, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + __u16 netfid; + int oplock = 0; + + rc = SMBLegacyOpen(xid, pTcon, full_path, + FILE_OPEN, GENERIC_WRITE, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + unsigned int bytes_written; + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, + &bytes_written, + NULL, NULL, 1); + cFYI(1, "wrt seteof rc %d", rc); + CIFSSMBClose(xid, pTcon, netfid); + } + } + if (tlink) + cifs_put_tlink(tlink); + } + + if (rc == 0) { + cifsInode->server_eof = attrs->ia_size; + cifs_setsize(inode, attrs->ia_size); + cifs_truncate_page(inode->i_mapping, inode->i_size); + } + + return rc; +} + +static int +cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) +{ + int rc; + int xid; + char *full_path = NULL; + struct inode *inode = direntry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_unix_set_info_args *args = NULL; + struct cifsFileInfo *open_file; + + cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x", + direntry->d_name.name, attrs->ia_valid); + + xid = GetXid(); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + attrs->ia_valid |= ATTR_FORCE; + + rc = inode_change_ok(inode, attrs); + if (rc < 0) + goto out; + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? 
Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + rc = 0; + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto out; + } + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) { + rc = -ENOMEM; + goto out; + } + + /* set up the struct */ + if (attrs->ia_valid & ATTR_MODE) + args->mode = attrs->ia_mode; + else + args->mode = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_UID) + args->uid = attrs->ia_uid; + else + args->uid = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_GID) + args->gid = attrs->ia_gid; + else + args->gid = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_ATIME) + args->atime = cifs_UnixTimeToNT(attrs->ia_atime); + else + args->atime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_MTIME) + args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); + else + args->mtime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_CTIME) + args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime); + else + args->ctime = NO_CHANGE_64; + + args->device = 0; + open_file = find_writable_file(cifsInode, true); + if (open_file) { + u16 nfid = open_file->netfid; + u32 npid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); + cifsFileInfo_put(open_file); + } else { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto out; + } + pTcon = tlink_tcon(tlink); + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + } + + if (rc) + goto out; + + if ((attrs->ia_valid & ATTR_SIZE) && + attrs->ia_size != i_size_read(inode)) + truncate_setsize(inode, attrs->ia_size); + + setattr_copy(inode, attrs); + mark_inode_dirty(inode); + + /* force revalidate when any of these times are set since some + of the fs types (eg ext3, fat) do not have fine enough + time granularity to match protocol, and we do not have a + a way (yet) to query the server fs's time granularity (and + whether it rounds times down). + */ + if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + cifsInode->time = 0; +out: + kfree(args); + kfree(full_path); + FreeXid(xid); + return rc; +} + +static int +cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) +{ + int xid; + struct inode *inode = direntry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + char *full_path = NULL; + int rc = -EACCES; + __u32 dosattr = 0; + __u64 mode = NO_CHANGE_64; + + xid = GetXid(); + + cFYI(1, "setattr on file %s attrs->iavalid 0x%x", + direntry->d_name.name, attrs->ia_valid); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + attrs->ia_valid |= ATTR_FORCE; + + rc = inode_change_ok(inode, attrs); + if (rc < 0) { + FreeXid(xid); + return rc; + } + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + FreeXid(xid); + return rc; + } + + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. 
If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + rc = 0; + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto cifs_setattr_exit; + } + + /* + * Without unix extensions we can't send ownership changes to the + * server, so silently ignore them. This is consistent with how + * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With + * CIFSACL support + proper Windows to Unix idmapping, we may be + * able to support this in the future. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) + attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + if (attrs->ia_valid & ATTR_MODE) { + cFYI(1, "Mode changed to 0%o", attrs->ia_mode); + mode = attrs->ia_mode; + } + + if (attrs->ia_valid & ATTR_MODE) { + rc = 0; +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = mode_to_cifs_acl(inode, full_path, mode); + if (rc) { + cFYI(1, "%s: Setting ACL failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } else +#endif /* CONFIG_CIFS_ACL */ + if (((mode & S_IWUGO) == 0) && + (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { + + dosattr = cifsInode->cifsAttrs | ATTR_READONLY; + + /* fix up mode if we're not using dynperm */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + attrs->ia_mode = inode->i_mode & ~S_IWUGO; + } else if ((mode & S_IWUGO) && + (cifsInode->cifsAttrs & ATTR_READONLY)) { + + dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + /* Attributes of 0 are ignored */ + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + + /* reset local inode permissions to normal */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + attrs->ia_mode &= ~(S_IALLUGO); + if (S_ISDIR(inode->i_mode)) + attrs->ia_mode |= + cifs_sb->mnt_dir_mode; + else + attrs->ia_mode |= + cifs_sb->mnt_file_mode; + } + } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + /* ignore mode change - ATTR_READONLY hasn't changed */ + attrs->ia_valid &= ~ATTR_MODE; + } + } + + if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) || + ((attrs->ia_valid & ATTR_MODE) && dosattr)) { + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + /* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */ + + /* Even if error on time set, no sense failing the call if + the server would set the time to a reasonable value anyway, + and this check ensures that we are not being called from + sys_utimes in which case we ought to fail the call back to + the user when the server rejects the call */ + if ((rc) && (attrs->ia_valid & + (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) + rc = 0; + } + + /* do not need local check to inode_check_ok since the server does + that */ + if (rc) + goto cifs_setattr_exit; + + if ((attrs->ia_valid & ATTR_SIZE) && + attrs->ia_size != i_size_read(inode)) + truncate_setsize(inode, attrs->ia_size); + + setattr_copy(inode, attrs); + mark_inode_dirty(inode); + +cifs_setattr_exit: + kfree(full_path); + FreeXid(xid); + return rc; +} + +int +cifs_setattr(struct dentry *direntry, struct iattr *attrs) +{ + struct inode *inode = direntry->d_inode; + 
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); + + if (pTcon->unix_ext) + return cifs_setattr_unix(direntry, attrs); + + return cifs_setattr_nounix(direntry, attrs); + + /* BB: add cifs_setattr_legacy for really old servers */ +} + +#if 0 +void cifs_delete_inode(struct inode *inode) +{ + cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode); + /* may have to add back in if and when safe distributed caching of + directories added e.g. via FindNotify */ +} +#endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c new file mode 100644 index 00000000..4221b5e4 --- /dev/null +++ b/fs/cifs/ioctl.c @@ -0,0 +1,102 @@ +/* + * fs/cifs/ioctl.c + * + * vfs operations that deal with io control + * + * Copyright (C) International Business Machines Corp., 2005,2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +#define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2) + +long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) +{ + struct inode *inode = filep->f_dentry->d_inode; + int rc = -ENOTTY; /* strange error - but the precedent */ + int xid; + struct cifs_sb_info *cifs_sb; +#ifdef CONFIG_CIFS_POSIX + struct cifsFileInfo *pSMBFile = filep->private_data; + struct cifs_tcon *tcon; + __u64 ExtAttrBits = 0; + __u64 ExtAttrMask = 0; + __u64 caps; +#endif /* CONFIG_CIFS_POSIX */ + + xid = GetXid(); + + cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg); + + cifs_sb = CIFS_SB(inode->i_sb); + + switch (command) { + case CIFS_IOC_CHECKUMOUNT: + cFYI(1, "User unmount attempted"); + if (cifs_sb->mnt_uid == current_uid()) + rc = 0; + else { + rc = -EACCES; + cFYI(1, "uids do not match"); + } + break; +#ifdef CONFIG_CIFS_POSIX + case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + if (CIFS_UNIX_EXTATTR_CAP & caps) { + rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, + &ExtAttrBits, &ExtAttrMask); + if (rc == 0) + rc = put_user(ExtAttrBits & + FS_FL_USER_VISIBLE, + (int __user *)arg); + } + break; + + case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + if (CIFS_UNIX_EXTATTR_CAP & caps) { + if (get_user(ExtAttrBits, (int __user *)arg)) { + rc = -EFAULT; + break; + } + /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, + extAttrBits, &ExtAttrMask);*/ + } + cFYI(1, "set flags not implemented yet"); + break; +#endif /* CONFIG_CIFS_POSIX */ + default: + cFYI(1, "unsupported ioctl"); + break; + } + + FreeXid(xid); + return rc; +} diff --git a/fs/cifs/link.c b/fs/cifs/link.c new file mode 100644 
index 00000000..556b1a0b --- /dev/null +++ b/fs/cifs/link.c @@ -0,0 +1,593 @@ +/* + * fs/cifs/link.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/fs.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) +#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) +#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) +#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024) +#define CIFS_MF_SYMLINK_FILE_SIZE \ + (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN) + +#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n" +#define CIFS_MF_SYMLINK_MD5_FORMAT \ + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n" +#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \ + md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \ + md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \ + md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\ + md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] + +static int +symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) +{ + int rc; + unsigned int size; + struct crypto_shash *md5; + struct sdesc *sdescmd5; + + md5 = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(md5)) { + rc = PTR_ERR(md5); + cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); + sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!sdescmd5) { + rc = -ENOMEM; + cERROR(1, "%s: Memory allocation failure\n", __func__); + goto symlink_hash_err; + } + sdescmd5->shash.tfm = md5; + sdescmd5->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init md5 shash\n", __func__); + goto symlink_hash_err; + } + crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + +symlink_hash_err: + crypto_free_shash(md5); + kfree(sdescmd5); + + return rc; +} + +static int +CIFSParseMFSymlink(const u8 *buf, + unsigned int buf_len, + unsigned int *_link_len, + char **_link_str) +{ + int rc; + unsigned int link_len; + const char *md5_str1; + const char *link_str; + u8 md5_hash[16]; + char md5_str2[34]; + + if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) + return -EINVAL; + + md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET]; + link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET]; + + rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len); + if (rc != 1) + return -EINVAL; + + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc); + return
rc; + } + + snprintf(md5_str2, sizeof(md5_str2), + CIFS_MF_SYMLINK_MD5_FORMAT, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + + if (strncmp(md5_str1, md5_str2, 17) != 0) + return -EINVAL; + + if (_link_str) { + *_link_str = kstrndup(link_str, link_len, GFP_KERNEL); + if (!*_link_str) + return -ENOMEM; + } + + *_link_len = link_len; + return 0; +} + +static int +CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) +{ + int rc; + unsigned int link_len; + unsigned int ofs; + u8 md5_hash[16]; + + if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) + return -EINVAL; + + link_len = strlen(link_str); + + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -ENAMETOOLONG; + + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc); + return rc; + } + + snprintf(buf, buf_len, + CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, + link_len, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + + ofs = CIFS_MF_SYMLINK_LINK_OFFSET; + memcpy(buf + ofs, link_str, link_len); + + ofs += link_len; + if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) { + buf[ofs] = '\n'; + ofs++; + } + + while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) { + buf[ofs] = ' '; + ofs++; + } + + return 0; +} + +static int +CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + u8 *buf; + unsigned int bytes_written = 0; + struct cifs_io_parms io_parms; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); + if (rc != 0) { + kfree(buf); + return rc; + } + + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + nls_codepage, remap); + if (rc != 0) { + kfree(buf); + return rc; + } + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); + CIFSSMBClose(xid, tcon, netfid); + kfree(buf); + if (rc != 0) + return rc; + + if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) + return -EIO; + + return 0; +} + +static int +CIFSQueryMFSymLink(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage, int remap) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + u8 *buf; + char *pbuf; + unsigned int bytes_read = 0; + int buf_type = CIFS_NO_BUFFER; + unsigned int link_len = 0; + struct cifs_io_parms io_parms; + FILE_ALL_INFO file_info; + + rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, &netfid, &oplock, &file_info, + nls_codepage, remap); + if (rc != 0) + return rc; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + CIFSSMBClose(xid, tcon, netfid); + /* it's not a symlink */ + return -EINVAL; + } + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + pbuf = buf; + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + CIFSSMBClose(xid, tcon, netfid); + if (rc != 0) { + kfree(buf); + return rc; + } + + rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo); + kfree(buf); + if (rc != 0) + return rc; + + 
return 0; +} + +bool +CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) +{ + if (!(fattr->cf_mode & S_IFREG)) + /* it's not a symlink */ + return false; + + if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + /* it's not a symlink */ + return false; + + return true; +} + +int +CIFSCheckMFSymlink(struct cifs_fattr *fattr, + const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_io_parms io_parms; + u8 *buf; + char *pbuf; + unsigned int bytes_read = 0; + int buf_type = CIFS_NO_BUFFER; + unsigned int link_len = 0; + FILE_ALL_INFO file_info; + + if (!CIFSCouldBeMFSymlink(fattr)) + /* it's not a symlink */ + return 0; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, &netfid, &oplock, &file_info, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) + goto out; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + CIFSSMBClose(xid, pTcon, netfid); + /* it's not a symlink */ + goto out; + } + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto out; + } + pbuf = buf; + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + CIFSSMBClose(xid, pTcon, netfid); + if (rc != 0) { + kfree(buf); + goto out; + } + + rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); + kfree(buf); + if (rc == -EINVAL) { + /* it's not a symlink */ + rc = 0; + goto out; + } + + if (rc != 0) + goto out; + + /* it is a symlink */ + fattr->cf_eof = link_len; + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; + fattr->cf_dtype = DT_LNK; +out: + cifs_put_tlink(tlink); + return rc; +} + +int +cifs_hardlink(struct dentry *old_file, struct inode *inode, + struct dentry *direntry) +{ + int rc = -EACCES; + int xid; + char *fromName = NULL; + char *toName = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifsInodeInfo *cifsInode; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + fromName = build_path_from_dentry(old_file); + toName = build_path_from_dentry(direntry); + if ((fromName == NULL) || (toName == NULL)) { + rc = -ENOMEM; + goto cifs_hl_exit; + } + + if (pTcon->unix_ext) + rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + else { + rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if ((rc == -EIO) || (rc == -EINVAL)) + rc = -EOPNOTSUPP; + } + + d_drop(direntry); /* force new lookup from server of target */ + + /* if source file is cached (oplocked) revalidate will not go to server + until the file is closed or oplock broken so update nlinks locally */ + if (old_file->d_inode) { + cifsInode = CIFS_I(old_file->d_inode); + if (rc == 0) { + old_file->d_inode->i_nlink++; +/* BB should we make this contingent on superblock flag NOATIME? 
*/ +/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ + /* parent dir timestamps will update from srv + within a second, would it really be worth it + to set the parent dir cifs inode time to zero + to force revalidate (faster) for it too? */ + } + /* if not oplocked will force revalidate to get info + on source file from srv */ + cifsInode->time = 0; + + /* Will update parent dir timestamps from srv within a second. + Would it really be worth it to set the parent dir (cifs + inode) time field to zero to force revalidate on parent + directory faster ie + CIFS_I(inode)->time = 0; */ + } + +cifs_hl_exit: + kfree(fromName); + kfree(toName); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +void * +cifs_follow_link(struct dentry *direntry, struct nameidata *nd) +{ + struct inode *inode = direntry->d_inode; + int rc = -ENOMEM; + int xid; + char *full_path = NULL; + char *target_path = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon; + + xid = GetXid(); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); + + /* + * For now, we just handle symlinks with unix extensions enabled. + * Eventually we should handle NTFS reparse points, and MacOS + * symlink support. For instance... + * + * rc = CIFSSMBQueryReparseLinkInfo(...) + * + * For now, just return -EACCES when the server doesn't support posix + * extensions. Note that we still allow querying symlinks when posix + * extensions are manually disabled. We could disable these as well + * but there doesn't seem to be any harm in allowing the client to + * read them. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + && !(tcon->ses->capabilities & CAP_UNIX)) { + rc = -EACCES; + goto out; + } + + full_path = build_path_from_dentry(direntry); + if (!full_path) + goto out; + + cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); + + rc = -EACCES; + /* + * First try Minshall+French Symlinks, if configured + * and fallback to UNIX Extensions Symlinks. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX)) + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, + cifs_sb->local_nls); + + kfree(full_path); +out: + if (rc != 0) { + kfree(target_path); + target_path = ERR_PTR(rc); + } + + FreeXid(xid); + if (tlink) + cifs_put_tlink(tlink); + nd_set_link(nd, target_path); + return NULL; +} + +int +cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) +{ + int rc = -EOPNOTSUPP; + int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct inode *newinode = NULL; + + xid = GetXid(); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto symlink_exit; + } + pTcon = tlink_tcon(tlink); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto symlink_exit; + } + + cFYI(1, "Full path: %s", full_path); + cFYI(1, "symname is %s", symname); + + /* BB what if DFS and this volume is on different share? 
BB */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + else if (pTcon->unix_ext) + rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, + cifs_sb->local_nls); + /* else + rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, + cifs_sb_target->local_nls); */ + + if (rc == 0) { + if (pTcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else + rc = cifs_get_inode_info(&newinode, full_path, NULL, + inode->i_sb, xid, NULL); + + if (rc != 0) { + cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", + rc); + } else { + d_instantiate(direntry, newinode); + } + } +symlink_exit: + kfree(full_path); + cifs_put_tlink(tlink); + FreeXid(xid); + return rc; +} + +void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) +{ + char *p = nd_get_link(nd); + if (!IS_ERR(p)) + kfree(p); +} diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c new file mode 100644 index 00000000..03a1f491 --- /dev/null +++ b/fs/cifs/misc.c @@ -0,0 +1,685 @@ +/* + * fs/cifs/misc.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "smberr.h" +#include "nterr.h" +#include "cifs_unicode.h" + +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; + +/* The xid serves as a useful identifier for each incoming vfs request, + in a similar way to the mid which is useful to track each sent smb, + and CurrentXid can also provide a running counter (although it + will eventually wrap past zero) of the total vfs operations handled + since the cifs fs was mounted */ + +unsigned int +_GetXid(void) +{ + unsigned int xid; + + spin_lock(&GlobalMid_Lock); + GlobalTotalActiveXid++; + + /* keep high water mark for number of simultaneous ops in filesystem */ + if (GlobalTotalActiveXid > GlobalMaxActiveXid) + GlobalMaxActiveXid = GlobalTotalActiveXid; + if (GlobalTotalActiveXid > 65000) + cFYI(1, "warning: more than 65000 requests active"); + xid = GlobalCurrentXid++; + spin_unlock(&GlobalMid_Lock); + return xid; +} + +void +_FreeXid(unsigned int xid) +{ + spin_lock(&GlobalMid_Lock); + /* if (GlobalTotalActiveXid == 0) + BUG(); */ + GlobalTotalActiveXid--; + spin_unlock(&GlobalMid_Lock); +} + +struct cifs_ses * +sesInfoAlloc(void) +{ + struct cifs_ses *ret_buf; + + ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL); + if (ret_buf) { + atomic_inc(&sesInfoAllocCount); + ret_buf->status = CifsNew; + ++ret_buf->ses_count; + INIT_LIST_HEAD(&ret_buf->smb_ses_list); + INIT_LIST_HEAD(&ret_buf->tcon_list); + mutex_init(&ret_buf->session_mutex); + } + return ret_buf; +} + +void +sesInfoFree(struct cifs_ses *buf_to_free) +{ + if (buf_to_free == NULL) { + cFYI(1, "Null buffer passed to sesInfoFree"); + return; + } + + atomic_dec(&sesInfoAllocCount); + kfree(buf_to_free->serverOS); + kfree(buf_to_free->serverDomain); + kfree(buf_to_free->serverNOS); + if (buf_to_free->password) { + memset(buf_to_free->password, 0, strlen(buf_to_free->password)); + kfree(buf_to_free->password); + } + kfree(buf_to_free->user_name); + kfree(buf_to_free->domainName); + kfree(buf_to_free); +} + +struct cifs_tcon * +tconInfoAlloc(void) +{ + struct cifs_tcon *ret_buf; + ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + if (ret_buf) { + atomic_inc(&tconInfoAllocCount); + ret_buf->tidStatus = CifsNew; + ++ret_buf->tc_count; + INIT_LIST_HEAD(&ret_buf->openFileList); + INIT_LIST_HEAD(&ret_buf->tcon_list); +#ifdef CONFIG_CIFS_STATS + spin_lock_init(&ret_buf->stat_lock); +#endif + } + return ret_buf; +} + +void +tconInfoFree(struct cifs_tcon *buf_to_free) +{ + if (buf_to_free == NULL) { + cFYI(1, "Null buffer passed to tconInfoFree"); + return; + } + atomic_dec(&tconInfoAllocCount); + kfree(buf_to_free->nativeFileSystem); + if (buf_to_free->password) { + memset(buf_to_free->password, 0, strlen(buf_to_free->password)); + kfree(buf_to_free->password); + } + kfree(buf_to_free); +} + +struct smb_hdr * +cifs_buf_get(void) +{ + struct smb_hdr *ret_buf = NULL; + +/* We could use negotiated size instead of max_msgsize - + but it may be more efficient to always alloc same size + albeit slightly larger than necessary and maxbuffersize + defaults to this and can not be bigger */ + ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS); + + /* clear the first few header bytes */ + /* for most paths, more is 
cleared in header_assemble */ + if (ret_buf) { + memset(ret_buf, 0, sizeof(struct smb_hdr) + 3); + atomic_inc(&bufAllocCount); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&totBufAllocCount); +#endif /* CONFIG_CIFS_STATS2 */ + } + + return ret_buf; +} + +void +cifs_buf_release(void *buf_to_free) +{ + if (buf_to_free == NULL) { + /* cFYI(1, "Null buffer passed to cifs_buf_release");*/ + return; + } + mempool_free(buf_to_free, cifs_req_poolp); + + atomic_dec(&bufAllocCount); + return; +} + +struct smb_hdr * +cifs_small_buf_get(void) +{ + struct smb_hdr *ret_buf = NULL; + +/* We could use negotiated size instead of max_msgsize - + but it may be more efficient to always alloc same size + albeit slightly larger than necessary and maxbuffersize + defaults to this and can not be bigger */ + ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); + if (ret_buf) { + /* No need to clear memory here, cleared in header assemble */ + /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ + atomic_inc(&smBufAllocCount); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&totSmBufAllocCount); +#endif /* CONFIG_CIFS_STATS2 */ + + } + return ret_buf; +} + +void +cifs_small_buf_release(void *buf_to_free) +{ + + if (buf_to_free == NULL) { + cFYI(1, "Null buffer passed to cifs_small_buf_release"); + return; + } + mempool_free(buf_to_free, cifs_sm_req_poolp); + + atomic_dec(&smBufAllocCount); + return; +} + +/* + Find a free multiplex id (SMB mid). Otherwise there could be + mid collisions which might cause problems, demultiplexing the + wrong response to this request. Multiplex ids could collide if + one of a series requests takes much longer than the others, or + if a very large number of long lived requests (byte range + locks or FindNotify requests) are pending. No more than + 64K-1 requests can be outstanding at one time. If no + mids are available, return zero. A future optimization + could make the combination of mids and uid the key we use + to demultiplex on (rather than mid alone). + In addition to the above check, the cifs demultiplex + code already used the command code as a secondary + check of the frame and if signing is negotiated the + response would be discarded if the mid were the same + but the signature was wrong. Since the mid is not put in the + pending queue until later (when it is about to be dispatched) + we do have to limit the number of outstanding requests + to somewhat less than 64K-1 although it is hard to imagine + so many threads being in the vfs at one time. +*/ +__u16 GetNextMid(struct TCP_Server_Info *server) +{ + __u16 mid = 0; + __u16 last_mid; + bool collision; + + spin_lock(&GlobalMid_Lock); + last_mid = server->CurrentMid; /* we do not want to loop forever */ + server->CurrentMid++; + /* This nested loop looks more expensive than it is. 
+ In practice the list of pending requests is short, + fewer than 50, and the mids are likely to be unique + on the first pass through the loop unless some request + takes longer than the 64 thousand requests before it + (and it would also have to have been a request that + did not time out) */ + while (server->CurrentMid != last_mid) { + struct mid_q_entry *mid_entry; + unsigned int num_mids; + + collision = false; + if (server->CurrentMid == 0) + server->CurrentMid++; + + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == server->CurrentMid && + mid_entry->midState == MID_REQUEST_SUBMITTED) { + /* This mid is in use, try a different one */ + collision = true; + break; + } + } + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. + */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { + mid = server->CurrentMid; + break; + } + server->CurrentMid++; + } + spin_unlock(&GlobalMid_Lock); + return mid; +} + +/* NB: MID can not be set if treeCon not passed in, in that + case it is responsbility of caller to set the mid */ +void +header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , + const struct cifs_tcon *treeCon, int word_count + /* length of fixed section (word count) in two byte units */) +{ + struct list_head *temp_item; + struct cifs_ses *ses; + char *temp = (char *) buffer; + + memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */ + + buffer->smb_buf_length = cpu_to_be32( + (2 * word_count) + sizeof(struct smb_hdr) - + 4 /* RFC 1001 length field does not count */ + + 2 /* for bcc field itself */) ; + + buffer->Protocol[0] = 0xFF; + buffer->Protocol[1] = 'S'; + buffer->Protocol[2] = 'M'; + buffer->Protocol[3] = 'B'; + buffer->Command = smb_command; + buffer->Flags = 0x00; /* case sensitive */ + buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; + buffer->Pid = cpu_to_le16((__u16)current->tgid); + buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); + if (treeCon) { + buffer->Tid = treeCon->tid; + if (treeCon->ses) { + if (treeCon->ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (treeCon->ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* Uid is not converted */ + buffer->Uid = treeCon->ses->Suid; + buffer->Mid = GetNextMid(treeCon->ses->server); + if (multiuser_mount != 0) { + /* For the multiuser case, there are few obvious technically */ + /* possible mechanisms to match the local linux user (uid) */ + /* to a valid remote smb user (smb_uid): */ + /* 1) Query Winbind (or other local pam/nss daemon */ + /* for userid/password/logon_domain or credential */ + /* 2) Query Winbind for uid to sid to username mapping */ + /* and see if we have a matching password for existing*/ + /* session for that user perhas getting password by */ + /* adding a new pam_cifs module that stores passwords */ + /* so that the cifs vfs can get at that for all logged*/ + /* on users */ + /* 3) (Which is the mechanism we have chosen) */ + /* Search through sessions to the same server for a */ + /* a match on the uid that was passed in on mount */ + /* with the current processes uid 
(or euid?) and use */ + /* that smb uid. If no existing smb session for */ + /* that uid found, use the default smb session ie */ + /* the smb session for the volume mounted which is */ + /* the same as would be used if the multiuser mount */ + /* flag were disabled. */ + + /* BB Add support for establishing new tCon and SMB Session */ + /* with userid/password pairs found on the smb session */ + /* for other target tcp/ip addresses BB */ + if (current_fsuid() != treeCon->ses->linux_uid) { + cFYI(1, "Multiuser mode and UID " + "did not match tcon uid"); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { + ses = list_entry(temp_item, struct cifs_ses, smb_ses_list); + if (ses->linux_uid == current_fsuid()) { + if (ses->server == treeCon->ses->server) { + cFYI(1, "found matching uid substitute right smb_uid"); + buffer->Uid = ses->Suid; + break; + } else { + /* BB eventually call cifs_setup_session here */ + cFYI(1, "local UID found but no smb sess with this server exists"); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + } + } + if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) + buffer->Flags2 |= SMBFLG2_DFS; + if (treeCon->nocase) + buffer->Flags |= SMBFLG_CASELESS; + if ((treeCon->ses) && (treeCon->ses->server)) + if (treeCon->ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + } + +/* endian conversion of flags is now done just before sending */ + buffer->WordCount = (char) word_count; + return; +} + +static int +check_smb_hdr(struct smb_hdr *smb, __u16 mid) +{ + /* does it have the right SMB "signature" ? */ + if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { + cERROR(1, "Bad protocol string signature header 0x%x", + *(unsigned int *)smb->Protocol); + return 1; + } + + /* Make sure that message ids match */ + if (mid != smb->Mid) { + cERROR(1, "Mids do not match. received=%u expected=%u", + smb->Mid, mid); + return 1; + } + + /* if it's a response then accept */ + if (smb->Flags & SMBFLG_RESPONSE) + return 0; + + /* only one valid case where server sends us request */ + if (smb->Command == SMB_COM_LOCKING_ANDX) + return 0; + + cERROR(1, "Server sent request, not response. mid=%u", smb->Mid); + return 1; +} + +int +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +{ + __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 clc_len; /* calculated length */ + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + + if (length < 2 + sizeof(struct smb_hdr)) { + if ((length >= sizeof(struct smb_hdr) - 1) + && (smb->Status.CifsError != 0)) { + smb->WordCount = 0; + /* some error cases do not return wct and bcc */ + return 0; + } else if ((length == sizeof(struct smb_hdr) + 1) && + (smb->WordCount == 0)) { + char *tmp = (char *)smb; + /* Need to work around a bug in two servers here */ + /* First, check if the part of bcc they sent was zero */ + if (tmp[sizeof(struct smb_hdr)] == 0) { + /* some servers return only half of bcc + * on simple responses (wct, bcc both zero) + * in particular have seen this on + * ulogoffX and FindClose. 
This leaves + * one byte of bcc potentially unitialized + */ + /* zero rest of bcc */ + tmp[sizeof(struct smb_hdr)+1] = 0; + return 0; + } + cERROR(1, "rcvd invalid byte count (bcc)"); + } else { + cERROR(1, "Length less than smb header size"); + } + return 1; + } + if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "smb length greater than MaxBufSize, mid=%d", + smb->Mid); + return 1; + } + + if (check_smb_hdr(smb, mid)) + return 1; + clc_len = smbCalcSize(smb); + + if (4 + len != length) { + cERROR(1, "Length read does not match RFC1001 length %d", + len); + return 1; + } + + if (4 + len != clc_len) { + /* check if bcc wrapped around for large read responses */ + if ((len > 64 * 1024) && (len > clc_len)) { + /* check if lengths match mod 64K */ + if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + return 0; /* bcc wrapped */ + } + cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", + clc_len, 4 + len, smb->Mid); + + if (4 + len < clc_len) { + cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", + len, smb->Mid); + return 1; + } else if (len > clc_len + 512) { + /* + * Some servers (Windows XP in particular) send more + * data than the lengths in the SMB packet would + * indicate on certain calls (byte range locks and + * trans2 find first calls in particular). While the + * client can handle such a frame by ignoring the + * trailing data, we choose limit the amount of extra + * data to 512 bytes. + */ + cERROR(1, "RFC1001 size %u more than 512 bytes larger " + "than SMB for mid=%u", len, smb->Mid); + return 1; + } + } + return 0; +} + +bool +is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) +{ + struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; + struct list_head *tmp, *tmp1, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsInodeInfo *pCifsInode; + struct cifsFileInfo *netfile; + + cFYI(1, "Checking for oplock break or dnotify response"); + if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && + (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { + struct smb_com_transaction_change_notify_rsp *pSMBr = + (struct smb_com_transaction_change_notify_rsp *)buf; + struct file_notify_information *pnotify; + __u32 data_offset = 0; + if (get_bcc(buf) > sizeof(struct file_notify_information)) { + data_offset = le32_to_cpu(pSMBr->DataOffset); + + pnotify = (struct file_notify_information *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + cFYI(1, "dnotify on %s Action: 0x%x", + pnotify->FileName, pnotify->Action); + /* cifs_dump_mem("Rcvd notify Data: ",buf, + sizeof(struct smb_hdr)+60); */ + return true; + } + if (pSMBr->hdr.Status.CifsError) { + cFYI(1, "notify err 0x%d", + pSMBr->hdr.Status.CifsError); + return true; + } + return false; + } + if (pSMB->hdr.Command != SMB_COM_LOCKING_ANDX) + return false; + if (pSMB->hdr.Flags & SMBFLG_RESPONSE) { + /* no sense logging error on invalid handle on oplock + break - harmless race between close request and oplock + break response is expected from time to time writing out + large dirty files cached on the client */ + if ((NT_STATUS_INVALID_HANDLE) == + le32_to_cpu(pSMB->hdr.Status.CifsError)) { + cFYI(1, "invalid handle on oplock break"); + return true; + } else if (ERRbadfid == + le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { + return true; + } else { + return false; /* on valid oplock brk we get "request" */ + } + } + if (pSMB->hdr.WordCount != 8) + return false; + + cFYI(1, "oplock type 0x%d level 0x%d", + pSMB->LockType, pSMB->OplockLevel); + if (!(pSMB->LockType & 
LOCKING_ANDX_OPLOCK_RELEASE)) + return false; + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srv->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + if (tcon->tid != buf->Tid) + continue; + + cifs_stats_inc(&tcon->num_oplock_brks); + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &tcon->openFileList) { + netfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + if (pSMB->Fid != netfile->netfid) + continue; + + cFYI(1, "file id match, oplock break"); + pCifsInode = CIFS_I(netfile->dentry->d_inode); + + cifs_set_oplock_level(pCifsInode, + pSMB->OplockLevel ? OPLOCK_READ : 0); + /* + * cifs_oplock_break_put() can't be called + * from here. Get reference after queueing + * succeeded. cifs_oplock_break() will + * synchronize using cifs_file_list_lock. + */ + if (queue_work(system_nrt_wq, + &netfile->oplock_break)) + cifs_oplock_break_get(netfile); + netfile->oplock_break_cancelled = false; + + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "No matching file for oplock break"); + return true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "Can not process oplock break for non-existent connection"); + return true; +} + +void +dump_smb(struct smb_hdr *smb_buf, int smb_buf_length) +{ + int i, j; + char debug_line[17]; + unsigned char *buffer; + + if (traceSMB == 0) + return; + + buffer = (unsigned char *) smb_buf; + for (i = 0, j = 0; i < smb_buf_length; i++, j++) { + if (i % 8 == 0) { + /* have reached the beginning of line */ + printk(KERN_DEBUG "| "); + j = 0; + } + printk("%0#4x ", buffer[i]); + debug_line[2 * j] = ' '; + if (isprint(buffer[i])) + debug_line[1 + (2 * j)] = buffer[i]; + else + debug_line[1 + (2 * j)] = '_'; + + if (i % 8 == 7) { + /* reached end of line, time to print ascii */ + debug_line[16] = 0; + printk(" | %s\n", debug_line); + } + } + for (; j < 8; j++) { + printk(" "); + debug_line[2 * j] = ' '; + debug_line[1 + (2 * j)] = ' '; + } + printk(" | %s\n", debug_line); + return; +} + +void +cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + cERROR(1, "Autodisabling the use of server inode numbers on " + "%s. This server doesn't seem to support them " + "properly. Hardlinks will not be recognized on this " + "mount. 
Consider mounting with the \"noserverino\" " + "option to silence this message.", + cifs_sb_master_tcon(cifs_sb)->treeName); + } +} + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c new file mode 100644 index 00000000..73e47e84 --- /dev/null +++ b/fs/cifs/netmisc.c @@ -0,0 +1,1008 @@ +/* + * fs/cifs/netmisc.c + * + * Copyright (c) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Error mapping routines from Samba libsmb/errormap.c + * Copyright (C) Andrew Tridgell 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "smberr.h" +#include "cifs_debug.h" +#include "nterr.h" + +struct smb_to_posix_error { + __u16 smb_err; + int posix_code; +}; + +static const struct smb_to_posix_error mapping_table_ERRDOS[] = { + {ERRbadfunc, -EINVAL}, + {ERRbadfile, -ENOENT}, + {ERRbadpath, -ENOTDIR}, + {ERRnofids, -EMFILE}, + {ERRnoaccess, -EACCES}, + {ERRbadfid, -EBADF}, + {ERRbadmcb, -EIO}, + {ERRnomem, -ENOMEM}, + {ERRbadmem, -EFAULT}, + {ERRbadenv, -EFAULT}, + {ERRbadformat, -EINVAL}, + {ERRbadaccess, -EACCES}, + {ERRbaddata, -EIO}, + {ERRbaddrive, -ENXIO}, + {ERRremcd, -EACCES}, + {ERRdiffdevice, -EXDEV}, + {ERRnofiles, -ENOENT}, + {ERRwriteprot, -EROFS}, + {ERRbadshare, -ETXTBSY}, + {ERRlock, -EACCES}, + {ERRunsup, -EINVAL}, + {ERRnosuchshare, -ENXIO}, + {ERRfilexists, -EEXIST}, + {ERRinvparm, -EINVAL}, + {ERRdiskfull, -ENOSPC}, + {ERRinvname, -ENOENT}, + {ERRinvlevel, -EOPNOTSUPP}, + {ERRdirnotempty, -ENOTEMPTY}, + {ERRnotlocked, -ENOLCK}, + {ERRcancelviolation, -ENOLCK}, + {ERRalreadyexists, -EEXIST}, + {ERRmoredata, -EOVERFLOW}, + {ERReasnotsupported, -EOPNOTSUPP}, + {ErrQuota, -EDQUOT}, + {ErrNotALink, -ENOLINK}, + {ERRnetlogonNotStarted, -ENOPROTOOPT}, + {ERRsymlink, -EOPNOTSUPP}, + {ErrTooManyLinks, -EMLINK}, + {0, 0} +}; + +static const struct smb_to_posix_error mapping_table_ERRSRV[] = { + {ERRerror, -EIO}, + {ERRbadpw, -EACCES}, /* was EPERM */ + {ERRbadtype, -EREMOTE}, + {ERRaccess, -EACCES}, + {ERRinvtid, -ENXIO}, + {ERRinvnetname, -ENXIO}, + {ERRinvdevice, -ENXIO}, + {ERRqfull, -ENOSPC}, + {ERRqtoobig, -ENOSPC}, + {ERRqeof, -EIO}, + 
{ERRinvpfid, -EBADF}, + {ERRsmbcmd, -EBADRQC}, + {ERRsrverror, -EIO}, + {ERRbadBID, -EIO}, + {ERRfilespecs, -EINVAL}, + {ERRbadLink, -EIO}, + {ERRbadpermits, -EINVAL}, + {ERRbadPID, -ESRCH}, + {ERRsetattrmode, -EINVAL}, + {ERRpaused, -EHOSTDOWN}, + {ERRmsgoff, -EHOSTDOWN}, + {ERRnoroom, -ENOSPC}, + {ERRrmuns, -EUSERS}, + {ERRtimeout, -ETIME}, + {ERRnoresource, -ENOBUFS}, + {ERRtoomanyuids, -EUSERS}, + {ERRbaduid, -EACCES}, + {ERRusempx, -EIO}, + {ERRusestd, -EIO}, + {ERR_NOTIFY_ENUM_DIR, -ENOBUFS}, + {ERRnoSuchUser, -EACCES}, +/* {ERRaccountexpired, -EACCES}, + {ERRbadclient, -EACCES}, + {ERRbadLogonTime, -EACCES}, + {ERRpasswordExpired, -EACCES},*/ + {ERRaccountexpired, -EKEYEXPIRED}, + {ERRbadclient, -EACCES}, + {ERRbadLogonTime, -EACCES}, + {ERRpasswordExpired, -EKEYEXPIRED}, + + {ERRnosupport, -EINVAL}, + {0, 0} +}; + +static const struct smb_to_posix_error mapping_table_ERRHRD[] = { + {0, 0} +}; + +/* + * Convert a string containing text IPv4 or IPv6 address to binary form. + * + * Returns 0 on failure. + */ +static int +cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) +{ + int ret = 0; + + /* calculate length by finding first slash or NULL */ + if (address_family == AF_INET) + ret = in4_pton(cp, len, dst, '\\', NULL); + else if (address_family == AF_INET6) + ret = in6_pton(cp, len, dst , '\\', NULL); + + cFYI(DBG2, "address conversion returned %d for %*.*s", + ret, len, len, cp); + if (ret > 0) + ret = 1; + return ret; +} + +/* + * Try to convert a string to an IPv4 address and then attempt to convert + * it to an IPv6 address if that fails. Set the family field if either + * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to + * treat the part following it as a numeric sin6_scope_id. + * + * Returns 0 on failure. + */ +int +cifs_convert_address(struct sockaddr *dst, const char *src, int len) +{ + int rc, alen, slen; + const char *pct; + char scope_id[13]; + struct sockaddr_in *s4 = (struct sockaddr_in *) dst; + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; + + /* IPv4 address */ + if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { + s4->sin_family = AF_INET; + return 1; + } + + /* attempt to exclude the scope ID from the address part */ + pct = memchr(src, '%', len); + alen = pct ? pct - src : len; + + rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); + if (!rc) + return rc; + + s6->sin6_family = AF_INET6; + if (pct) { + /* grab the scope ID */ + slen = len - (alen + 1); + if (slen <= 0 || slen > 12) + return 0; + memcpy(scope_id, pct + 1, slen); + scope_id[slen] = '\0'; + + rc = strict_strtoul(scope_id, 0, + (unsigned long *)&s6->sin6_scope_id); + rc = (rc == 0) ? 
1 : 0; + } + + return rc; +} + +int +cifs_set_port(struct sockaddr *addr, const unsigned short int port) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *)addr)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); + break; + default: + return 0; + } + return 1; +} + +int +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src, len)) + return 0; + return cifs_set_port(dst, port); +} + +/***************************************************************************** +convert a NT status code to a dos class/code + *****************************************************************************/ +/* NT status -> dos error map */ +static const struct { + __u8 dos_class; + __u16 dos_code; + __u32 ntstatus; +} ntstatus_to_dos_map[] = { + { + ERRDOS, ERRgeneral, NT_STATUS_UNSUCCESSFUL}, { + ERRDOS, ERRbadfunc, NT_STATUS_NOT_IMPLEMENTED}, { + ERRDOS, ERRinvlevel, NT_STATUS_INVALID_INFO_CLASS}, { + ERRDOS, 24, NT_STATUS_INFO_LENGTH_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_ACCESS_VIOLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_IN_PAGE_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA}, { + ERRDOS, ERRbadfid, NT_STATUS_INVALID_HANDLE}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_INITIAL_STACK}, { + ERRDOS, 193, NT_STATUS_BAD_INITIAL_PC}, { + ERRDOS, 87, NT_STATUS_INVALID_CID}, { + ERRHRD, ERRgeneral, NT_STATUS_TIMER_NOT_CANCELED}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER}, { + ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_DEVICE}, { + ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_FILE}, { + ERRDOS, ERRbadfunc, NT_STATUS_INVALID_DEVICE_REQUEST}, { + ERRDOS, 38, NT_STATUS_END_OF_FILE}, { + ERRDOS, 34, NT_STATUS_WRONG_VOLUME}, { + ERRDOS, 21, NT_STATUS_NO_MEDIA_IN_DEVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_MEDIA}, { + ERRDOS, 27, NT_STATUS_NONEXISTENT_SECTOR}, +/* { This NT error code was 'sqashed' + from NT_STATUS_MORE_PROCESSING_REQUIRED to NT_STATUS_OK + during the session setup } */ + { + ERRDOS, ERRnomem, NT_STATUS_NO_MEMORY}, { + ERRDOS, 487, NT_STATUS_CONFLICTING_ADDRESSES}, { + ERRDOS, 487, NT_STATUS_NOT_MAPPED_VIEW}, { + ERRDOS, 87, NT_STATUS_UNABLE_TO_FREE_VM}, { + ERRDOS, 87, NT_STATUS_UNABLE_TO_DELETE_SECTION}, { + ERRDOS, 2142, NT_STATUS_INVALID_SYSTEM_SERVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_INSTRUCTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_INVALID_LOCK_SEQUENCE}, { + ERRDOS, ERRnoaccess, NT_STATUS_INVALID_VIEW_SIZE}, { + ERRDOS, 193, NT_STATUS_INVALID_FILE_FOR_SECTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_ALREADY_COMMITTED}, +/* { This NT error code was 'sqashed' + from NT_STATUS_ACCESS_DENIED to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_ACCESS_DENIED}, { + ERRDOS, 111, NT_STATUS_BUFFER_TOO_SMALL}, { + ERRDOS, ERRbadfid, NT_STATUS_OBJECT_TYPE_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_NONCONTINUABLE_EXCEPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DISPOSITION}, { + ERRHRD, ERRgeneral, NT_STATUS_UNWIND}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_STACK}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_UNWIND_TARGET}, { + ERRDOS, 158, NT_STATUS_NOT_LOCKED}, { + ERRHRD, ERRgeneral, NT_STATUS_PARITY_ERROR}, { + ERRDOS, 487, NT_STATUS_UNABLE_TO_DECOMMIT_VM}, { + ERRDOS, 487, NT_STATUS_NOT_COMMITTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PORT_ATTRIBUTES}, { + ERRHRD, ERRgeneral, NT_STATUS_PORT_MESSAGE_TOO_LONG}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, 
{ + ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, { + /* mapping changed since shell does lookup on * expects FileNotFound */ + ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, { + ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, { + ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, { + ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, { + ERRDOS, ERRbadfid, NT_STATUS_PORT_DISCONNECTED}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_ALREADY_ATTACHED}, { + ERRDOS, 161, NT_STATUS_OBJECT_PATH_INVALID}, { + ERRDOS, ERRbadpath, NT_STATUS_OBJECT_PATH_NOT_FOUND}, { + ERRDOS, 161, NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_OVERRUN}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_LATE_ERROR}, { + ERRDOS, 23, NT_STATUS_DATA_ERROR}, { + ERRDOS, 23, NT_STATUS_CRC_ERROR}, { + ERRDOS, ERRnomem, NT_STATUS_SECTION_TOO_BIG}, { + ERRDOS, ERRnoaccess, NT_STATUS_PORT_CONNECTION_REFUSED}, { + ERRDOS, ERRbadfid, NT_STATUS_INVALID_PORT_HANDLE}, { + ERRDOS, ERRbadshare, NT_STATUS_SHARING_VIOLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_QUOTA_EXCEEDED}, { + ERRDOS, 87, NT_STATUS_INVALID_PAGE_PROTECTION}, { + ERRDOS, 288, NT_STATUS_MUTANT_NOT_OWNED}, { + ERRDOS, 298, NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, { + ERRDOS, 87, NT_STATUS_PORT_ALREADY_SET}, { + ERRDOS, 87, NT_STATUS_SECTION_NOT_IMAGE}, { + ERRDOS, 156, NT_STATUS_SUSPEND_COUNT_EXCEEDED}, { + ERRDOS, ERRnoaccess, NT_STATUS_THREAD_IS_TERMINATING}, { + ERRDOS, 87, NT_STATUS_BAD_WORKING_SET_LIMIT}, { + ERRDOS, 87, NT_STATUS_INCOMPATIBLE_FILE_MAP}, { + ERRDOS, 87, NT_STATUS_SECTION_PROTECTION}, { + ERRDOS, ERReasnotsupported, NT_STATUS_EAS_NOT_SUPPORTED}, { + ERRDOS, 255, NT_STATUS_EA_TOO_LARGE}, { + ERRHRD, ERRgeneral, NT_STATUS_NONEXISTENT_EA_ENTRY}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_EAS_ON_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_EA_CORRUPT_ERROR}, { + ERRDOS, ERRlock, NT_STATUS_FILE_LOCK_CONFLICT}, { + ERRDOS, ERRlock, NT_STATUS_LOCK_NOT_GRANTED}, { + ERRDOS, ERRbadfile, NT_STATUS_DELETE_PENDING}, { + ERRDOS, ERRunsup, NT_STATUS_CTL_FILE_NOT_SUPPORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_UNKNOWN_REVISION}, { + ERRHRD, ERRgeneral, NT_STATUS_REVISION_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_OWNER}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PRIMARY_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_IMPERSONATION_TOKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_DISABLE_MANDATORY}, { + ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { + ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { + ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, +/* { This NT error code was 'sqashed' + from NT_STATUS_NO_SUCH_USER to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_NO_SUCH_USER}, { /* could map to 2238 */ + ERRHRD, ERRgeneral, NT_STATUS_GROUP_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_LAST_ADMIN}, +/* { This NT error code was 'sqashed' + from NT_STATUS_WRONG_PASSWORD to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRSRV, ERRbadpw, NT_STATUS_WRONG_PASSWORD}, { + ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_PASSWORD}, { + ERRHRD, ERRgeneral, NT_STATUS_PASSWORD_RESTRICTION}, { + ERRDOS, ERRnoaccess, 
NT_STATUS_LOGON_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { + ERRSRV, ERRbadLogonTime, NT_STATUS_INVALID_LOGON_HOURS}, { + ERRSRV, ERRbadclient, NT_STATUS_INVALID_WORKSTATION}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { + ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_DISABLED}, { + ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { + ERRHRD, ERRgeneral, NT_STATUS_LUIDS_EXHAUSTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SUB_AUTHORITY}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACL}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SID}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SECURITY_DESCR}, { + ERRDOS, 127, NT_STATUS_PROCEDURE_NOT_FOUND}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_TOKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_INHERITANCE_ACL}, { + ERRDOS, 158, NT_STATUS_RANGE_NOT_LOCKED}, { + ERRDOS, 112, NT_STATUS_DISK_FULL}, { + ERRHRD, ERRgeneral, NT_STATUS_SERVER_DISABLED}, { + ERRHRD, ERRgeneral, NT_STATUS_SERVER_NOT_DISABLED}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, { + ERRDOS, 259, NT_STATUS_GUIDS_EXHAUSTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ID_AUTHORITY}, { + ERRDOS, 259, NT_STATUS_AGENTS_EXHAUSTED}, { + ERRDOS, 154, NT_STATUS_INVALID_VOLUME_LABEL}, { + ERRDOS, 14, NT_STATUS_SECTION_NOT_EXTENDED}, { + ERRDOS, 487, NT_STATUS_NOT_MAPPED_DATA}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_DATA_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_NAME_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DENORMAL_OPERAND}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INEXACT_RESULT}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INVALID_OPERATION}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_STACK_CHECK}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_UNDERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, { + ERRDOS, 534, NT_STATUS_INTEGER_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_PRIVILEGED_INSTRUCTION}, { + ERRDOS, ERRnomem, NT_STATUS_TOO_MANY_PAGING_FILES}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_INVALID}, { + ERRHRD, ERRgeneral, NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, +/* { This NT error code was 'sqashed' + from NT_STATUS_INSUFFICIENT_RESOURCES to + NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ + { + ERRDOS, ERRnomem, NT_STATUS_INSUFFICIENT_RESOURCES}, { + ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { + ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { + ERRDOS, 21, NT_STATUS_DEVICE_POWER_FAILURE}, { + ERRDOS, 487, NT_STATUS_FREE_VM_NOT_AT_BASE}, { + ERRDOS, 487, NT_STATUS_MEMORY_NOT_ALLOCATED}, { + ERRHRD, ERRgeneral, NT_STATUS_WORKING_SET_QUOTA}, { + ERRDOS, 19, NT_STATUS_MEDIA_WRITE_PROTECTED}, { + ERRDOS, 21, NT_STATUS_DEVICE_NOT_READY}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_GROUP_ATTRIBUTES}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_IMPERSONATION_LEVEL}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_OPEN_ANONYMOUS}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_VALIDATION_CLASS}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_TOKEN_TYPE}, { + ERRDOS, 87, NT_STATUS_BAD_MASTER_BOOT_RECORD}, { + ERRHRD, ERRgeneral, NT_STATUS_INSTRUCTION_MISALIGNMENT}, { + ERRDOS, ERRpipebusy, NT_STATUS_INSTANCE_NOT_AVAILABLE}, { + ERRDOS, ERRpipebusy, 
NT_STATUS_PIPE_NOT_AVAILABLE}, { + ERRDOS, ERRbadpipe, NT_STATUS_INVALID_PIPE_STATE}, { + ERRDOS, ERRpipebusy, NT_STATUS_PIPE_BUSY}, { + ERRDOS, ERRbadfunc, NT_STATUS_ILLEGAL_FUNCTION}, { + ERRDOS, ERRnotconnected, NT_STATUS_PIPE_DISCONNECTED}, { + ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_CLOSING}, { + ERRHRD, ERRgeneral, NT_STATUS_PIPE_CONNECTED}, { + ERRHRD, ERRgeneral, NT_STATUS_PIPE_LISTENING}, { + ERRDOS, ERRbadpipe, NT_STATUS_INVALID_READ_MODE}, { + ERRDOS, 121, NT_STATUS_IO_TIMEOUT}, { + ERRDOS, 38, NT_STATUS_FILE_FORCED_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STOPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_COULD_NOT_INTERPRET}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_IS_A_DIRECTORY}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_SUPPORTED}, { + ERRDOS, 51, NT_STATUS_REMOTE_NOT_LISTENING}, { + ERRDOS, 52, NT_STATUS_DUPLICATE_NAME}, { + ERRDOS, 53, NT_STATUS_BAD_NETWORK_PATH}, { + ERRDOS, 54, NT_STATUS_NETWORK_BUSY}, { + ERRDOS, 55, NT_STATUS_DEVICE_DOES_NOT_EXIST}, { + ERRDOS, 56, NT_STATUS_TOO_MANY_COMMANDS}, { + ERRDOS, 57, NT_STATUS_ADAPTER_HARDWARE_ERROR}, { + ERRDOS, 58, NT_STATUS_INVALID_NETWORK_RESPONSE}, { + ERRDOS, 59, NT_STATUS_UNEXPECTED_NETWORK_ERROR}, { + ERRDOS, 60, NT_STATUS_BAD_REMOTE_ADAPTER}, { + ERRDOS, 61, NT_STATUS_PRINT_QUEUE_FULL}, { + ERRDOS, 62, NT_STATUS_NO_SPOOL_SPACE}, { + ERRDOS, 63, NT_STATUS_PRINT_CANCELLED}, { + ERRDOS, 64, NT_STATUS_NETWORK_NAME_DELETED}, { + ERRDOS, 65, NT_STATUS_NETWORK_ACCESS_DENIED}, { + ERRDOS, 66, NT_STATUS_BAD_DEVICE_TYPE}, { + ERRDOS, ERRnosuchshare, NT_STATUS_BAD_NETWORK_NAME}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_NAMES}, { + ERRDOS, 69, NT_STATUS_TOO_MANY_SESSIONS}, { + ERRDOS, 70, NT_STATUS_SHARING_PAUSED}, { + ERRDOS, 71, NT_STATUS_REQUEST_NOT_ACCEPTED}, { + ERRDOS, 72, NT_STATUS_REDIRECTOR_PAUSED}, { + ERRDOS, 88, NT_STATUS_NET_WRITE_FAULT}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_AT_LIMIT}, { + ERRDOS, ERRdiffdevice, NT_STATUS_NOT_SAME_DEVICE}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_RENAMED}, { + ERRDOS, 240, NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SECURITY_ON_OBJECT}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_WAIT}, { + ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_EMPTY}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_TERMINATE_SELF}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SERVER_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_ROLE}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_DOMAIN}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, { + ERRDOS, 300, NT_STATUS_OPLOCK_NOT_GRANTED}, { + ERRDOS, 301, NT_STATUS_INVALID_OPLOCK_PROTOCOL}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_CORRUPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_GENERIC_NOT_MAPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_DESCRIPTOR_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_USER_BUFFER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_IO_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_LOGON_PROCESS}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_EXISTS}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_1}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_2}, { + ERRDOS, 87, 
NT_STATUS_INVALID_PARAMETER_3}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_4}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_5}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_6}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_7}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_8}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_9}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_10}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_11}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_12}, { + ERRDOS, ERRbadpath, NT_STATUS_REDIRECTOR_NOT_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_REDIRECTOR_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PACKAGE}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_FUNCTION_TABLE}, { + ERRDOS, 203, 0xc0000100}, { + ERRDOS, 145, NT_STATUS_DIRECTORY_NOT_EMPTY}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_CORRUPT_ERROR}, { + ERRDOS, 267, NT_STATUS_NOT_A_DIRECTORY}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_LOGON_SESSION_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_COLLISION}, { + ERRDOS, 206, NT_STATUS_NAME_TOO_LONG}, { + ERRDOS, 2401, NT_STATUS_FILES_OPEN}, { + ERRDOS, 2404, NT_STATUS_CONNECTION_IN_USE}, { + ERRHRD, ERRgeneral, NT_STATUS_MESSAGE_NOT_FOUND}, { + ERRDOS, ERRnoaccess, NT_STATUS_PROCESS_IS_TERMINATING}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LOGON_TYPE}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_GUID_TRANSLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_CANNOT_IMPERSONATE}, { + ERRHRD, ERRgeneral, NT_STATUS_IMAGE_ALREADY_LOADED}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_PRESENT}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_NOT_EXIST}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_ALREADY_OWNED}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_LID_OWNER}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_COMMAND}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_LID}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_SELECTOR}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_LDT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_SIZE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_OFFSET}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_DESCRIPTOR}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NE_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_RXACT_INVALID_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_RXACT_COMMIT_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_MAPPED_FILE_SIZE_ZERO}, { + ERRDOS, ERRnofids, NT_STATUS_TOO_MANY_OPENED_FILES}, { + ERRHRD, ERRgeneral, NT_STATUS_CANCELLED}, { + ERRDOS, ERRnoaccess, NT_STATUS_CANNOT_DELETE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_COMPUTER_NAME}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_ACCOUNT}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_USER}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBERS_PRIMARY_GROUP}, { + ERRDOS, ERRbadfid, NT_STATUS_FILE_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_THREADS}, { + ERRHRD, ERRgeneral, NT_STATUS_THREAD_NOT_IN_PROCESS}, { + ERRHRD, ERRgeneral, NT_STATUS_TOKEN_ALREADY_IN_USE}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_COMMITMENT_LIMIT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_LE_FORMAT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NOT_MZ}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_PROTECT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_WIN_16}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SERVER_CONFLICT}, { + ERRHRD, ERRgeneral, NT_STATUS_TIME_DIFFERENCE_AT_DC}, { + ERRHRD, ERRgeneral, 
NT_STATUS_SYNCHRONIZATION_REQUIRED}, { + ERRDOS, 126, NT_STATUS_DLL_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_OPEN_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_IO_PRIVILEGE_FAILED}, { + ERRDOS, 182, NT_STATUS_ORDINAL_NOT_FOUND}, { + ERRDOS, 127, NT_STATUS_ENTRYPOINT_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_CONTROL_C_EXIT}, { + ERRDOS, 64, NT_STATUS_LOCAL_DISCONNECT}, { + ERRDOS, 64, NT_STATUS_REMOTE_DISCONNECT}, { + ERRDOS, 51, NT_STATUS_REMOTE_RESOURCES}, { + ERRDOS, 59, NT_STATUS_LINK_FAILED}, { + ERRDOS, 59, NT_STATUS_LINK_TIMEOUT}, { + ERRDOS, 59, NT_STATUS_INVALID_CONNECTION}, { + ERRDOS, 59, NT_STATUS_INVALID_ADDRESS}, { + ERRHRD, ERRgeneral, NT_STATUS_DLL_INIT_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_MISSING_SYSTEMFILE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNHANDLED_EXCEPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_APP_INIT_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_CREATE_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_PAGEFILE}, { + ERRDOS, 124, NT_STATUS_INVALID_LEVEL}, { + ERRDOS, 86, NT_STATUS_WRONG_PASSWORD_CORE}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, { + ERRDOS, 109, NT_STATUS_PIPE_BROKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_CORRUPT}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_IO_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_EVENT_PAIR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_VOLUME}, { + ERRHRD, ERRgeneral, NT_STATUS_SERIAL_NO_DEVICE_INITED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_ALIAS_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_NOT_GRANTED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SECRETS}, { + ERRHRD, ERRgeneral, NT_STATUS_SECRET_TOO_LONG}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FULLSCREEN_MODE}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_CONTEXT_IDS}, { + ERRDOS, ERRnoaccess, NT_STATUS_LOGON_TYPE_NOT_GRANTED}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_REGISTRY_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FT_MISSING_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNMAPPABLE_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNDEFINED_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_VOLUME}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_WRONG_CYLINDER}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_UNKNOWN_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_BAD_REGISTERS}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_RECALIBRATE_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_OPERATION_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_RESET_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_SHARED_IRQ_BUSY}, { + ERRHRD, ERRgeneral, NT_STATUS_FT_ORPHANING}, { + ERRHRD, ERRgeneral, 0xc000016e}, { + ERRHRD, ERRgeneral, 0xc000016f}, { + ERRHRD, ERRgeneral, 0xc0000170}, { + ERRHRD, ERRgeneral, 0xc0000171}, { + ERRHRD, ERRgeneral, NT_STATUS_PARTITION_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_BLOCK_LENGTH}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_PARTITIONED}, { + ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_LOCK_MEDIA}, { + ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, { + ERRHRD, ERRgeneral, NT_STATUS_EOM_OVERFLOW}, { + ERRHRD, ERRgeneral, 
NT_STATUS_NO_MEDIA}, { + ERRHRD, ERRgeneral, 0xc0000179}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_KEY_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_LOG_SPACE}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SIDS}, { + ERRHRD, ERRgeneral, NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_KEY_HAS_CHILDREN}, { + ERRHRD, ERRgeneral, NT_STATUS_CHILD_MUST_BE_VOLATILE}, { + ERRDOS, 87, NT_STATUS_DEVICE_CONFIGURATION_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DRIVER_INTERNAL_ERROR}, { + ERRDOS, 22, NT_STATUS_INVALID_DEVICE_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_IO_DEVICE_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_PROTOCOL_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_BACKUP_CONTROLLER}, { + ERRHRD, ERRgeneral, NT_STATUS_LOG_FILE_FULL}, { + ERRDOS, 19, NT_STATUS_TOO_LATE}, { + ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_LSA_SECRET}, +/* { This NT error code was 'sqashed' + from NT_STATUS_NO_TRUST_SAM_ACCOUNT to + NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_SAM_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_DOMAIN_FAILURE}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CORRUPT}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_CANT_START}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUST_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_MUTANT_LIMIT_EXCEEDED}, { + ERRDOS, ERRnetlogonNotStarted, NT_STATUS_NETLOGON_NOT_STARTED}, { + ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_EXPIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_POSSIBLE_DEADLOCK}, { + ERRHRD, ERRgeneral, NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, { + ERRHRD, ERRgeneral, NT_STATUS_REMOTE_SESSION_LIMIT}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CHANGED}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, +/* { This NT error code was 'sqashed' + from NT_STATUS_DOMAIN_TRUST_INCONSISTENT to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, { + ERRHRD, ERRgeneral, NT_STATUS_FS_DRIVER_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { + ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { + ERRDOS, ERRnomem, NT_STATUS_INSUFF_SERVER_RESOURCES}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_ADDRESSES}, { + ERRDOS, 52, NT_STATUS_ADDRESS_ALREADY_EXISTS}, { + ERRDOS, 64, NT_STATUS_ADDRESS_CLOSED}, { + ERRDOS, 64, NT_STATUS_CONNECTION_DISCONNECTED}, { + ERRDOS, 64, NT_STATUS_CONNECTION_RESET}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_NODES}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_ABORTED}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_TIMED_OUT}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_NO_RELEASE}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_NO_MATCH}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_RESPONDED}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_ID}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_TYPE}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_SERVER_SESSION}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_CLIENT_SESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, { + 
ERRHRD, ERRgeneral, NT_STATUS_DEBUG_ATTACH_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_SYSTEM_PROCESS_TERMINATED}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_NOT_ACCEPTED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_BROWSER_SERVERS_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_VDM_HARD_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DRIVER_CANCEL_TIMEOUT}, { + ERRHRD, ERRgeneral, NT_STATUS_REPLY_MESSAGE_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_MAPPED_ALIGNMENT}, { + ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { + ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { + ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW_READ}, { + ERRHRD, ERRgeneral, NT_STATUS_FAIL_CHECK}, { + ERRHRD, ERRgeneral, NT_STATUS_DUPLICATE_OBJECTID}, { + ERRHRD, ERRgeneral, NT_STATUS_OBJECTID_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_CONVERT_TO_LARGE}, { + ERRHRD, ERRgeneral, NT_STATUS_RETRY}, { + ERRHRD, ERRgeneral, NT_STATUS_FOUND_OUT_OF_SCOPE}, { + ERRHRD, ERRgeneral, NT_STATUS_ALLOCATE_BUCKET}, { + ERRHRD, ERRgeneral, NT_STATUS_PROPSET_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_MARSHALL_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_VARIANT}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, { + ERRDOS, ERRnoaccess, NT_STATUS_ACCOUNT_LOCKED_OUT}, { + ERRDOS, ERRbadfid, NT_STATUS_HANDLE_NOT_CLOSABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_REFUSED}, { + ERRHRD, ERRgeneral, NT_STATUS_GRACEFUL_DISCONNECT}, { + ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, { + ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_NOT_ASSOCIATED}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_INVALID}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ACTIVE}, { + ERRHRD, ERRgeneral, NT_STATUS_NETWORK_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_HOST_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_PROTOCOL_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_PORT_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_REQUEST_ABORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ABORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_COMPRESSION_BUFFER}, { + ERRHRD, ERRgeneral, NT_STATUS_USER_MAPPED_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_AUDIT_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_TIMER_RESOLUTION_NOT_SET}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_COUNT_LIMIT}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGIN_TIME_RESTRICTION}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGIN_WKSTA_RESTRICTION}, { + ERRDOS, 193, NT_STATUS_IMAGE_MP_UP_MISMATCH}, { + ERRHRD, ERRgeneral, 0xc000024a}, { + ERRHRD, ERRgeneral, 0xc000024b}, { + ERRHRD, ERRgeneral, 0xc000024c}, { + ERRHRD, ERRgeneral, 0xc000024d}, { + ERRHRD, ERRgeneral, 0xc000024e}, { + ERRHRD, ERRgeneral, 0xc000024f}, { + ERRHRD, ERRgeneral, NT_STATUS_INSUFFICIENT_LOGON_INFO}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_DLL_ENTRYPOINT}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_SERVICE_ENTRYPOINT}, { + ERRHRD, ERRgeneral, NT_STATUS_LPC_REPLY_LOST}, { + ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT1}, { + ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT2}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_QUOTA_LIMIT}, { + ERRSRV, 3, NT_STATUS_PATH_NOT_COVERED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_CALLBACK_ACTIVE}, { + ERRHRD, ERRgeneral, NT_STATUS_LICENSE_QUOTA_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_SHORT}, { + ERRHRD, 
ERRgeneral, NT_STATUS_PWD_TOO_RECENT}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_HISTORY_CONFLICT}, { + ERRHRD, ERRgeneral, 0xc000025d}, { + ERRHRD, ERRgeneral, NT_STATUS_PLUGPLAY_NO_DEVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNSUPPORTED_COMPRESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_HW_PROFILE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, { + ERRDOS, 182, NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, { + ERRDOS, 127, NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, { + ERRDOS, 288, NT_STATUS_RESOURCE_NOT_OWNED}, { + ERRDOS, ErrTooManyLinks, NT_STATUS_TOO_MANY_LINKS}, { + ERRHRD, ERRgeneral, NT_STATUS_QUOTA_LIST_INCONSISTENT}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_IS_OFFLINE}, { + ERRDOS, 21, 0xc000026e}, { + ERRDOS, 161, 0xc0000281}, { + ERRDOS, ERRnoaccess, 0xc000028a}, { + ERRDOS, ERRnoaccess, 0xc000028b}, { + ERRHRD, ERRgeneral, 0xc000028c}, { + ERRDOS, ERRnoaccess, 0xc000028d}, { + ERRDOS, ERRnoaccess, 0xc000028e}, { + ERRDOS, ERRnoaccess, 0xc000028f}, { + ERRDOS, ERRnoaccess, 0xc0000290}, { + ERRDOS, ERRbadfunc, 0xc000029c}, { + ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { + ERRDOS, ERRinvlevel, 0x007c0001}, }; + +/***************************************************************************** + Print an error message from the status code + *****************************************************************************/ +static void +cifs_print_status(__u32 status_code) +{ + int idx = 0; + + while (nt_errs[idx].nt_errstr != NULL) { + if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == + (status_code & 0xFFFFFF)) { + printk(KERN_NOTICE "Status code returned 0x%08x %s\n", + status_code, nt_errs[idx].nt_errstr); + } + idx++; + } + return; +} + + +static void +ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode) +{ + int i; + if (ntstatus == 0) { + *eclass = 0; + *ecode = 0; + return; + } + for (i = 0; ntstatus_to_dos_map[i].ntstatus; i++) { + if (ntstatus == ntstatus_to_dos_map[i].ntstatus) { + *eclass = ntstatus_to_dos_map[i].dos_class; + *ecode = ntstatus_to_dos_map[i].dos_code; + return; + } + } + *eclass = ERRHRD; + *ecode = ERRgeneral; +} + +int +map_smb_to_linux_error(struct smb_hdr *smb, bool logErr) +{ + unsigned int i; + int rc = -EIO; /* if transport error smb error may not be set */ + __u8 smberrclass; + __u16 smberrcode; + + /* BB if NT Status codes - map NT BB */ + + /* old style smb error codes */ + if (smb->Status.CifsError == 0) + return 0; + + if (smb->Flags2 & SMBFLG2_ERR_STATUS) { + /* translate the newer STATUS codes to old style SMB errors + * and then to POSIX errors */ + __u32 err = le32_to_cpu(smb->Status.CifsError); + if (logErr && (err != (NT_STATUS_MORE_PROCESSING_REQUIRED))) + cifs_print_status(err); + else if (cifsFYI & CIFS_RC) + cifs_print_status(err); + ntstatus_to_dos(err, &smberrclass, &smberrcode); + } else { + smberrclass = smb->Status.DosError.ErrorClass; + smberrcode = le16_to_cpu(smb->Status.DosError.Error); + } + + /* old style errors */ + + /* DOS class smb error codes - map DOS */ + if (smberrclass == ERRDOS) { + /* 1 byte field no need to byte reverse */ + for (i = 0; + i < + sizeof(mapping_table_ERRDOS) / + sizeof(struct smb_to_posix_error); i++) { + if (mapping_table_ERRDOS[i].smb_err == 0) + break; + else if (mapping_table_ERRDOS[i].smb_err == + smberrcode) { + rc = mapping_table_ERRDOS[i].posix_code; + break; + } + /* else try next error mapping one to see if match */ + } + } else if (smberrclass == ERRSRV) { + /* server class of error codes */ + for (i = 0; + i < + sizeof(mapping_table_ERRSRV) / + sizeof(struct 
smb_to_posix_error); i++) { + if (mapping_table_ERRSRV[i].smb_err == 0) + break; + else if (mapping_table_ERRSRV[i].smb_err == + smberrcode) { + rc = mapping_table_ERRSRV[i].posix_code; + break; + } + /* else try next error mapping to see if match */ + } + } + /* else ERRHRD class errors or junk - return EIO */ + + cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + le32_to_cpu(smb->Status.CifsError), rc); + + /* generic corrective action e.g. reconnect SMB session on + * ERRbaduid could be added */ + + return rc; +} + +/* + * calculate the size of the SMB message based on the fixed header + * portion, the number of word parameters and the data portion of the message + */ +unsigned int +smbCalcSize(struct smb_hdr *ptr) +{ + return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + + 2 /* size of the bcc field */ + get_bcc(ptr)); +} + +/* The following are taken from fs/ntfs/util.c */ + +#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) + +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ +struct timespec +cifs_NTtimeToUnix(__le64 ntutc) +{ + struct timespec ts; + /* BB what about the timezone? BB */ + + /* Subtract the NTFS time offset, then convert to 1s intervals. */ + u64 t; + + t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +/* Convert the Unix UTC into NT UTC. */ +u64 +cifs_UnixTimeToNT(struct timespec t) +{ + /* Convert to 100ns intervals and then add the NTFS time offset. */ + return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; +} + +static int total_days_of_prev_months[] = +{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; + +struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) +{ + struct timespec ts; + int sec, min, days, month, year; + u16 date = le16_to_cpu(le_date); + u16 time = le16_to_cpu(le_time); + SMB_TIME *st = (SMB_TIME *)&time; + SMB_DATE *sd = (SMB_DATE *)&date; + + cFYI(1, "date %d time %d", date, time); + + sec = 2 * st->TwoSeconds; + min = st->Minutes; + if ((sec > 59) || (min > 59)) + cERROR(1, "illegal time min %d sec %d", min, sec); + sec += (min * 60); + sec += 60 * 60 * st->Hours; + if (st->Hours > 24) + cERROR(1, "illegal hours %d", st->Hours); + days = sd->Day; + month = sd->Month; + if ((days > 31) || (month > 12)) { + cERROR(1, "illegal date, month %d day: %d", month, days); + if (month > 12) + month = 12; + } + month -= 1; + days += total_days_of_prev_months[month]; + days += 3652; /* account for difference in days between 1980 and 1970 */ + year = sd->Year; + days += year * 365; + days += (year/4); /* leap year */ + /* generalized leap year calculation is more complex, ie no leap year + for years/100 except for years/400, but since the maximum number for DOS + year is 2**7, the last year is 1980+127, which means we need only + consider 2 special case years, ie the years 2000 and 2100, and only + adjust for the lack of leap year for the year 2100, as 2000 was a + leap year (divisible by 400) */ + if (year >= 120) /* the year 2100 */ + days = days - 1; /* do not count leap year for the year 2100 */ + + /* adjust for leap year where we are still before leap day */ + if (year != 120) + days -= ((year & 0x03) == 0) && (month < 2 ? 
1 : 0); + sec += 24 * 60 * 60 * days; + + ts.tv_sec = sec + offset; + + /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */ + + ts.tv_nsec = 0; + return ts; +} diff --git a/fs/cifs/nterr.c b/fs/cifs/nterr.c new file mode 100644 index 00000000..819fd994 --- /dev/null +++ b/fs/cifs/nterr.c @@ -0,0 +1,687 @@ +/* + * Unix SMB/Netbios implementation. + * Version 1.9. + * RPC Pipe client / server routines + * Copyright (C) Luke Kenneth Casson Leighton 1997-2001. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* NT error codes - see nterr.h */ +#include +#include +#include "nterr.h" + +const struct nt_err_code_struct nt_errs[] = { + {"NT_STATUS_OK", NT_STATUS_OK}, + {"NT_STATUS_UNSUCCESSFUL", NT_STATUS_UNSUCCESSFUL}, + {"NT_STATUS_NOT_IMPLEMENTED", NT_STATUS_NOT_IMPLEMENTED}, + {"NT_STATUS_INVALID_INFO_CLASS", NT_STATUS_INVALID_INFO_CLASS}, + {"NT_STATUS_INFO_LENGTH_MISMATCH", NT_STATUS_INFO_LENGTH_MISMATCH}, + {"NT_STATUS_ACCESS_VIOLATION", NT_STATUS_ACCESS_VIOLATION}, + {"STATUS_BUFFER_OVERFLOW", STATUS_BUFFER_OVERFLOW}, + {"NT_STATUS_IN_PAGE_ERROR", NT_STATUS_IN_PAGE_ERROR}, + {"NT_STATUS_PAGEFILE_QUOTA", NT_STATUS_PAGEFILE_QUOTA}, + {"NT_STATUS_INVALID_HANDLE", NT_STATUS_INVALID_HANDLE}, + {"NT_STATUS_BAD_INITIAL_STACK", NT_STATUS_BAD_INITIAL_STACK}, + {"NT_STATUS_BAD_INITIAL_PC", NT_STATUS_BAD_INITIAL_PC}, + {"NT_STATUS_INVALID_CID", NT_STATUS_INVALID_CID}, + {"NT_STATUS_TIMER_NOT_CANCELED", NT_STATUS_TIMER_NOT_CANCELED}, + {"NT_STATUS_INVALID_PARAMETER", NT_STATUS_INVALID_PARAMETER}, + {"NT_STATUS_NO_SUCH_DEVICE", NT_STATUS_NO_SUCH_DEVICE}, + {"NT_STATUS_NO_SUCH_FILE", NT_STATUS_NO_SUCH_FILE}, + {"NT_STATUS_INVALID_DEVICE_REQUEST", + NT_STATUS_INVALID_DEVICE_REQUEST}, + {"NT_STATUS_END_OF_FILE", NT_STATUS_END_OF_FILE}, + {"NT_STATUS_WRONG_VOLUME", NT_STATUS_WRONG_VOLUME}, + {"NT_STATUS_NO_MEDIA_IN_DEVICE", NT_STATUS_NO_MEDIA_IN_DEVICE}, + {"NT_STATUS_UNRECOGNIZED_MEDIA", NT_STATUS_UNRECOGNIZED_MEDIA}, + {"NT_STATUS_NONEXISTENT_SECTOR", NT_STATUS_NONEXISTENT_SECTOR}, + {"NT_STATUS_MORE_PROCESSING_REQUIRED", + NT_STATUS_MORE_PROCESSING_REQUIRED}, + {"NT_STATUS_NO_MEMORY", NT_STATUS_NO_MEMORY}, + {"NT_STATUS_CONFLICTING_ADDRESSES", + NT_STATUS_CONFLICTING_ADDRESSES}, + {"NT_STATUS_NOT_MAPPED_VIEW", NT_STATUS_NOT_MAPPED_VIEW}, + {"NT_STATUS_UNABLE_TO_FREE_VM", NT_STATUS_UNABLE_TO_FREE_VM}, + {"NT_STATUS_UNABLE_TO_DELETE_SECTION", + NT_STATUS_UNABLE_TO_DELETE_SECTION}, + {"NT_STATUS_INVALID_SYSTEM_SERVICE", + NT_STATUS_INVALID_SYSTEM_SERVICE}, + {"NT_STATUS_ILLEGAL_INSTRUCTION", NT_STATUS_ILLEGAL_INSTRUCTION}, + {"NT_STATUS_INVALID_LOCK_SEQUENCE", + NT_STATUS_INVALID_LOCK_SEQUENCE}, + {"NT_STATUS_INVALID_VIEW_SIZE", NT_STATUS_INVALID_VIEW_SIZE}, + {"NT_STATUS_INVALID_FILE_FOR_SECTION", + NT_STATUS_INVALID_FILE_FOR_SECTION}, + {"NT_STATUS_ALREADY_COMMITTED", NT_STATUS_ALREADY_COMMITTED}, + {"NT_STATUS_ACCESS_DENIED", 
NT_STATUS_ACCESS_DENIED}, + {"NT_STATUS_BUFFER_TOO_SMALL", NT_STATUS_BUFFER_TOO_SMALL}, + {"NT_STATUS_OBJECT_TYPE_MISMATCH", NT_STATUS_OBJECT_TYPE_MISMATCH}, + {"NT_STATUS_NONCONTINUABLE_EXCEPTION", + NT_STATUS_NONCONTINUABLE_EXCEPTION}, + {"NT_STATUS_INVALID_DISPOSITION", NT_STATUS_INVALID_DISPOSITION}, + {"NT_STATUS_UNWIND", NT_STATUS_UNWIND}, + {"NT_STATUS_BAD_STACK", NT_STATUS_BAD_STACK}, + {"NT_STATUS_INVALID_UNWIND_TARGET", + NT_STATUS_INVALID_UNWIND_TARGET}, + {"NT_STATUS_NOT_LOCKED", NT_STATUS_NOT_LOCKED}, + {"NT_STATUS_PARITY_ERROR", NT_STATUS_PARITY_ERROR}, + {"NT_STATUS_UNABLE_TO_DECOMMIT_VM", + NT_STATUS_UNABLE_TO_DECOMMIT_VM}, + {"NT_STATUS_NOT_COMMITTED", NT_STATUS_NOT_COMMITTED}, + {"NT_STATUS_INVALID_PORT_ATTRIBUTES", + NT_STATUS_INVALID_PORT_ATTRIBUTES}, + {"NT_STATUS_PORT_MESSAGE_TOO_LONG", + NT_STATUS_PORT_MESSAGE_TOO_LONG}, + {"NT_STATUS_INVALID_PARAMETER_MIX", + NT_STATUS_INVALID_PARAMETER_MIX}, + {"NT_STATUS_INVALID_QUOTA_LOWER", NT_STATUS_INVALID_QUOTA_LOWER}, + {"NT_STATUS_DISK_CORRUPT_ERROR", NT_STATUS_DISK_CORRUPT_ERROR}, + {"NT_STATUS_OBJECT_NAME_INVALID", NT_STATUS_OBJECT_NAME_INVALID}, + {"NT_STATUS_OBJECT_NAME_NOT_FOUND", + NT_STATUS_OBJECT_NAME_NOT_FOUND}, + {"NT_STATUS_OBJECT_NAME_COLLISION", + NT_STATUS_OBJECT_NAME_COLLISION}, + {"NT_STATUS_HANDLE_NOT_WAITABLE", NT_STATUS_HANDLE_NOT_WAITABLE}, + {"NT_STATUS_PORT_DISCONNECTED", NT_STATUS_PORT_DISCONNECTED}, + {"NT_STATUS_DEVICE_ALREADY_ATTACHED", + NT_STATUS_DEVICE_ALREADY_ATTACHED}, + {"NT_STATUS_OBJECT_PATH_INVALID", NT_STATUS_OBJECT_PATH_INVALID}, + {"NT_STATUS_OBJECT_PATH_NOT_FOUND", + NT_STATUS_OBJECT_PATH_NOT_FOUND}, + {"NT_STATUS_OBJECT_PATH_SYNTAX_BAD", + NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, + {"NT_STATUS_DATA_OVERRUN", NT_STATUS_DATA_OVERRUN}, + {"NT_STATUS_DATA_LATE_ERROR", NT_STATUS_DATA_LATE_ERROR}, + {"NT_STATUS_DATA_ERROR", NT_STATUS_DATA_ERROR}, + {"NT_STATUS_CRC_ERROR", NT_STATUS_CRC_ERROR}, + {"NT_STATUS_SECTION_TOO_BIG", NT_STATUS_SECTION_TOO_BIG}, + {"NT_STATUS_PORT_CONNECTION_REFUSED", + NT_STATUS_PORT_CONNECTION_REFUSED}, + {"NT_STATUS_INVALID_PORT_HANDLE", NT_STATUS_INVALID_PORT_HANDLE}, + {"NT_STATUS_SHARING_VIOLATION", NT_STATUS_SHARING_VIOLATION}, + {"NT_STATUS_QUOTA_EXCEEDED", NT_STATUS_QUOTA_EXCEEDED}, + {"NT_STATUS_INVALID_PAGE_PROTECTION", + NT_STATUS_INVALID_PAGE_PROTECTION}, + {"NT_STATUS_MUTANT_NOT_OWNED", NT_STATUS_MUTANT_NOT_OWNED}, + {"NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED", + NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, + {"NT_STATUS_PORT_ALREADY_SET", NT_STATUS_PORT_ALREADY_SET}, + {"NT_STATUS_SECTION_NOT_IMAGE", NT_STATUS_SECTION_NOT_IMAGE}, + {"NT_STATUS_SUSPEND_COUNT_EXCEEDED", + NT_STATUS_SUSPEND_COUNT_EXCEEDED}, + {"NT_STATUS_THREAD_IS_TERMINATING", + NT_STATUS_THREAD_IS_TERMINATING}, + {"NT_STATUS_BAD_WORKING_SET_LIMIT", + NT_STATUS_BAD_WORKING_SET_LIMIT}, + {"NT_STATUS_INCOMPATIBLE_FILE_MAP", + NT_STATUS_INCOMPATIBLE_FILE_MAP}, + {"NT_STATUS_SECTION_PROTECTION", NT_STATUS_SECTION_PROTECTION}, + {"NT_STATUS_EAS_NOT_SUPPORTED", NT_STATUS_EAS_NOT_SUPPORTED}, + {"NT_STATUS_EA_TOO_LARGE", NT_STATUS_EA_TOO_LARGE}, + {"NT_STATUS_NONEXISTENT_EA_ENTRY", NT_STATUS_NONEXISTENT_EA_ENTRY}, + {"NT_STATUS_NO_EAS_ON_FILE", NT_STATUS_NO_EAS_ON_FILE}, + {"NT_STATUS_EA_CORRUPT_ERROR", NT_STATUS_EA_CORRUPT_ERROR}, + {"NT_STATUS_FILE_LOCK_CONFLICT", NT_STATUS_FILE_LOCK_CONFLICT}, + {"NT_STATUS_LOCK_NOT_GRANTED", NT_STATUS_LOCK_NOT_GRANTED}, + {"NT_STATUS_DELETE_PENDING", NT_STATUS_DELETE_PENDING}, + {"NT_STATUS_CTL_FILE_NOT_SUPPORTED", + NT_STATUS_CTL_FILE_NOT_SUPPORTED}, + 
{"NT_STATUS_UNKNOWN_REVISION", NT_STATUS_UNKNOWN_REVISION}, + {"NT_STATUS_REVISION_MISMATCH", NT_STATUS_REVISION_MISMATCH}, + {"NT_STATUS_INVALID_OWNER", NT_STATUS_INVALID_OWNER}, + {"NT_STATUS_INVALID_PRIMARY_GROUP", + NT_STATUS_INVALID_PRIMARY_GROUP}, + {"NT_STATUS_NO_IMPERSONATION_TOKEN", + NT_STATUS_NO_IMPERSONATION_TOKEN}, + {"NT_STATUS_CANT_DISABLE_MANDATORY", + NT_STATUS_CANT_DISABLE_MANDATORY}, + {"NT_STATUS_NO_LOGON_SERVERS", NT_STATUS_NO_LOGON_SERVERS}, + {"NT_STATUS_NO_SUCH_LOGON_SESSION", + NT_STATUS_NO_SUCH_LOGON_SESSION}, + {"NT_STATUS_NO_SUCH_PRIVILEGE", NT_STATUS_NO_SUCH_PRIVILEGE}, + {"NT_STATUS_PRIVILEGE_NOT_HELD", NT_STATUS_PRIVILEGE_NOT_HELD}, + {"NT_STATUS_INVALID_ACCOUNT_NAME", NT_STATUS_INVALID_ACCOUNT_NAME}, + {"NT_STATUS_USER_EXISTS", NT_STATUS_USER_EXISTS}, + {"NT_STATUS_NO_SUCH_USER", NT_STATUS_NO_SUCH_USER}, + {"NT_STATUS_GROUP_EXISTS", NT_STATUS_GROUP_EXISTS}, + {"NT_STATUS_NO_SUCH_GROUP", NT_STATUS_NO_SUCH_GROUP}, + {"NT_STATUS_MEMBER_IN_GROUP", NT_STATUS_MEMBER_IN_GROUP}, + {"NT_STATUS_MEMBER_NOT_IN_GROUP", NT_STATUS_MEMBER_NOT_IN_GROUP}, + {"NT_STATUS_LAST_ADMIN", NT_STATUS_LAST_ADMIN}, + {"NT_STATUS_WRONG_PASSWORD", NT_STATUS_WRONG_PASSWORD}, + {"NT_STATUS_ILL_FORMED_PASSWORD", NT_STATUS_ILL_FORMED_PASSWORD}, + {"NT_STATUS_PASSWORD_RESTRICTION", NT_STATUS_PASSWORD_RESTRICTION}, + {"NT_STATUS_LOGON_FAILURE", NT_STATUS_LOGON_FAILURE}, + {"NT_STATUS_ACCOUNT_RESTRICTION", NT_STATUS_ACCOUNT_RESTRICTION}, + {"NT_STATUS_INVALID_LOGON_HOURS", NT_STATUS_INVALID_LOGON_HOURS}, + {"NT_STATUS_INVALID_WORKSTATION", NT_STATUS_INVALID_WORKSTATION}, + {"NT_STATUS_PASSWORD_EXPIRED", NT_STATUS_PASSWORD_EXPIRED}, + {"NT_STATUS_ACCOUNT_DISABLED", NT_STATUS_ACCOUNT_DISABLED}, + {"NT_STATUS_NONE_MAPPED", NT_STATUS_NONE_MAPPED}, + {"NT_STATUS_TOO_MANY_LUIDS_REQUESTED", + NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, + {"NT_STATUS_LUIDS_EXHAUSTED", NT_STATUS_LUIDS_EXHAUSTED}, + {"NT_STATUS_INVALID_SUB_AUTHORITY", + NT_STATUS_INVALID_SUB_AUTHORITY}, + {"NT_STATUS_INVALID_ACL", NT_STATUS_INVALID_ACL}, + {"NT_STATUS_INVALID_SID", NT_STATUS_INVALID_SID}, + {"NT_STATUS_INVALID_SECURITY_DESCR", + NT_STATUS_INVALID_SECURITY_DESCR}, + {"NT_STATUS_PROCEDURE_NOT_FOUND", NT_STATUS_PROCEDURE_NOT_FOUND}, + {"NT_STATUS_INVALID_IMAGE_FORMAT", NT_STATUS_INVALID_IMAGE_FORMAT}, + {"NT_STATUS_NO_TOKEN", NT_STATUS_NO_TOKEN}, + {"NT_STATUS_BAD_INHERITANCE_ACL", NT_STATUS_BAD_INHERITANCE_ACL}, + {"NT_STATUS_RANGE_NOT_LOCKED", NT_STATUS_RANGE_NOT_LOCKED}, + {"NT_STATUS_DISK_FULL", NT_STATUS_DISK_FULL}, + {"NT_STATUS_SERVER_DISABLED", NT_STATUS_SERVER_DISABLED}, + {"NT_STATUS_SERVER_NOT_DISABLED", NT_STATUS_SERVER_NOT_DISABLED}, + {"NT_STATUS_TOO_MANY_GUIDS_REQUESTED", + NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, + {"NT_STATUS_GUIDS_EXHAUSTED", NT_STATUS_GUIDS_EXHAUSTED}, + {"NT_STATUS_INVALID_ID_AUTHORITY", NT_STATUS_INVALID_ID_AUTHORITY}, + {"NT_STATUS_AGENTS_EXHAUSTED", NT_STATUS_AGENTS_EXHAUSTED}, + {"NT_STATUS_INVALID_VOLUME_LABEL", NT_STATUS_INVALID_VOLUME_LABEL}, + {"NT_STATUS_SECTION_NOT_EXTENDED", NT_STATUS_SECTION_NOT_EXTENDED}, + {"NT_STATUS_NOT_MAPPED_DATA", NT_STATUS_NOT_MAPPED_DATA}, + {"NT_STATUS_RESOURCE_DATA_NOT_FOUND", + NT_STATUS_RESOURCE_DATA_NOT_FOUND}, + {"NT_STATUS_RESOURCE_TYPE_NOT_FOUND", + NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, + {"NT_STATUS_RESOURCE_NAME_NOT_FOUND", + NT_STATUS_RESOURCE_NAME_NOT_FOUND}, + {"NT_STATUS_ARRAY_BOUNDS_EXCEEDED", + NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, + {"NT_STATUS_FLOAT_DENORMAL_OPERAND", + NT_STATUS_FLOAT_DENORMAL_OPERAND}, + {"NT_STATUS_FLOAT_DIVIDE_BY_ZERO", 
NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, + {"NT_STATUS_FLOAT_INEXACT_RESULT", NT_STATUS_FLOAT_INEXACT_RESULT}, + {"NT_STATUS_FLOAT_INVALID_OPERATION", + NT_STATUS_FLOAT_INVALID_OPERATION}, + {"NT_STATUS_FLOAT_OVERFLOW", NT_STATUS_FLOAT_OVERFLOW}, + {"NT_STATUS_FLOAT_STACK_CHECK", NT_STATUS_FLOAT_STACK_CHECK}, + {"NT_STATUS_FLOAT_UNDERFLOW", NT_STATUS_FLOAT_UNDERFLOW}, + {"NT_STATUS_INTEGER_DIVIDE_BY_ZERO", + NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, + {"NT_STATUS_INTEGER_OVERFLOW", NT_STATUS_INTEGER_OVERFLOW}, + {"NT_STATUS_PRIVILEGED_INSTRUCTION", + NT_STATUS_PRIVILEGED_INSTRUCTION}, + {"NT_STATUS_TOO_MANY_PAGING_FILES", + NT_STATUS_TOO_MANY_PAGING_FILES}, + {"NT_STATUS_FILE_INVALID", NT_STATUS_FILE_INVALID}, + {"NT_STATUS_ALLOTTED_SPACE_EXCEEDED", + NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, + {"NT_STATUS_INSUFFICIENT_RESOURCES", + NT_STATUS_INSUFFICIENT_RESOURCES}, + {"NT_STATUS_DFS_EXIT_PATH_FOUND", NT_STATUS_DFS_EXIT_PATH_FOUND}, + {"NT_STATUS_DEVICE_DATA_ERROR", NT_STATUS_DEVICE_DATA_ERROR}, + {"NT_STATUS_DEVICE_NOT_CONNECTED", NT_STATUS_DEVICE_NOT_CONNECTED}, + {"NT_STATUS_DEVICE_POWER_FAILURE", NT_STATUS_DEVICE_POWER_FAILURE}, + {"NT_STATUS_FREE_VM_NOT_AT_BASE", NT_STATUS_FREE_VM_NOT_AT_BASE}, + {"NT_STATUS_MEMORY_NOT_ALLOCATED", NT_STATUS_MEMORY_NOT_ALLOCATED}, + {"NT_STATUS_WORKING_SET_QUOTA", NT_STATUS_WORKING_SET_QUOTA}, + {"NT_STATUS_MEDIA_WRITE_PROTECTED", + NT_STATUS_MEDIA_WRITE_PROTECTED}, + {"NT_STATUS_DEVICE_NOT_READY", NT_STATUS_DEVICE_NOT_READY}, + {"NT_STATUS_INVALID_GROUP_ATTRIBUTES", + NT_STATUS_INVALID_GROUP_ATTRIBUTES}, + {"NT_STATUS_BAD_IMPERSONATION_LEVEL", + NT_STATUS_BAD_IMPERSONATION_LEVEL}, + {"NT_STATUS_CANT_OPEN_ANONYMOUS", NT_STATUS_CANT_OPEN_ANONYMOUS}, + {"NT_STATUS_BAD_VALIDATION_CLASS", NT_STATUS_BAD_VALIDATION_CLASS}, + {"NT_STATUS_BAD_TOKEN_TYPE", NT_STATUS_BAD_TOKEN_TYPE}, + {"NT_STATUS_BAD_MASTER_BOOT_RECORD", + NT_STATUS_BAD_MASTER_BOOT_RECORD}, + {"NT_STATUS_INSTRUCTION_MISALIGNMENT", + NT_STATUS_INSTRUCTION_MISALIGNMENT}, + {"NT_STATUS_INSTANCE_NOT_AVAILABLE", + NT_STATUS_INSTANCE_NOT_AVAILABLE}, + {"NT_STATUS_PIPE_NOT_AVAILABLE", NT_STATUS_PIPE_NOT_AVAILABLE}, + {"NT_STATUS_INVALID_PIPE_STATE", NT_STATUS_INVALID_PIPE_STATE}, + {"NT_STATUS_PIPE_BUSY", NT_STATUS_PIPE_BUSY}, + {"NT_STATUS_ILLEGAL_FUNCTION", NT_STATUS_ILLEGAL_FUNCTION}, + {"NT_STATUS_PIPE_DISCONNECTED", NT_STATUS_PIPE_DISCONNECTED}, + {"NT_STATUS_PIPE_CLOSING", NT_STATUS_PIPE_CLOSING}, + {"NT_STATUS_PIPE_CONNECTED", NT_STATUS_PIPE_CONNECTED}, + {"NT_STATUS_PIPE_LISTENING", NT_STATUS_PIPE_LISTENING}, + {"NT_STATUS_INVALID_READ_MODE", NT_STATUS_INVALID_READ_MODE}, + {"NT_STATUS_IO_TIMEOUT", NT_STATUS_IO_TIMEOUT}, + {"NT_STATUS_FILE_FORCED_CLOSED", NT_STATUS_FILE_FORCED_CLOSED}, + {"NT_STATUS_PROFILING_NOT_STARTED", + NT_STATUS_PROFILING_NOT_STARTED}, + {"NT_STATUS_PROFILING_NOT_STOPPED", + NT_STATUS_PROFILING_NOT_STOPPED}, + {"NT_STATUS_COULD_NOT_INTERPRET", NT_STATUS_COULD_NOT_INTERPRET}, + {"NT_STATUS_FILE_IS_A_DIRECTORY", NT_STATUS_FILE_IS_A_DIRECTORY}, + {"NT_STATUS_NOT_SUPPORTED", NT_STATUS_NOT_SUPPORTED}, + {"NT_STATUS_REMOTE_NOT_LISTENING", NT_STATUS_REMOTE_NOT_LISTENING}, + {"NT_STATUS_DUPLICATE_NAME", NT_STATUS_DUPLICATE_NAME}, + {"NT_STATUS_BAD_NETWORK_PATH", NT_STATUS_BAD_NETWORK_PATH}, + {"NT_STATUS_NETWORK_BUSY", NT_STATUS_NETWORK_BUSY}, + {"NT_STATUS_DEVICE_DOES_NOT_EXIST", + NT_STATUS_DEVICE_DOES_NOT_EXIST}, + {"NT_STATUS_TOO_MANY_COMMANDS", NT_STATUS_TOO_MANY_COMMANDS}, + {"NT_STATUS_ADAPTER_HARDWARE_ERROR", + NT_STATUS_ADAPTER_HARDWARE_ERROR}, + {"NT_STATUS_INVALID_NETWORK_RESPONSE", + 
NT_STATUS_INVALID_NETWORK_RESPONSE}, + {"NT_STATUS_UNEXPECTED_NETWORK_ERROR", + NT_STATUS_UNEXPECTED_NETWORK_ERROR}, + {"NT_STATUS_BAD_REMOTE_ADAPTER", NT_STATUS_BAD_REMOTE_ADAPTER}, + {"NT_STATUS_PRINT_QUEUE_FULL", NT_STATUS_PRINT_QUEUE_FULL}, + {"NT_STATUS_NO_SPOOL_SPACE", NT_STATUS_NO_SPOOL_SPACE}, + {"NT_STATUS_PRINT_CANCELLED", NT_STATUS_PRINT_CANCELLED}, + {"NT_STATUS_NETWORK_NAME_DELETED", NT_STATUS_NETWORK_NAME_DELETED}, + {"NT_STATUS_NETWORK_ACCESS_DENIED", + NT_STATUS_NETWORK_ACCESS_DENIED}, + {"NT_STATUS_BAD_DEVICE_TYPE", NT_STATUS_BAD_DEVICE_TYPE}, + {"NT_STATUS_BAD_NETWORK_NAME", NT_STATUS_BAD_NETWORK_NAME}, + {"NT_STATUS_TOO_MANY_NAMES", NT_STATUS_TOO_MANY_NAMES}, + {"NT_STATUS_TOO_MANY_SESSIONS", NT_STATUS_TOO_MANY_SESSIONS}, + {"NT_STATUS_SHARING_PAUSED", NT_STATUS_SHARING_PAUSED}, + {"NT_STATUS_REQUEST_NOT_ACCEPTED", NT_STATUS_REQUEST_NOT_ACCEPTED}, + {"NT_STATUS_REDIRECTOR_PAUSED", NT_STATUS_REDIRECTOR_PAUSED}, + {"NT_STATUS_NET_WRITE_FAULT", NT_STATUS_NET_WRITE_FAULT}, + {"NT_STATUS_PROFILING_AT_LIMIT", NT_STATUS_PROFILING_AT_LIMIT}, + {"NT_STATUS_NOT_SAME_DEVICE", NT_STATUS_NOT_SAME_DEVICE}, + {"NT_STATUS_FILE_RENAMED", NT_STATUS_FILE_RENAMED}, + {"NT_STATUS_VIRTUAL_CIRCUIT_CLOSED", + NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, + {"NT_STATUS_NO_SECURITY_ON_OBJECT", + NT_STATUS_NO_SECURITY_ON_OBJECT}, + {"NT_STATUS_CANT_WAIT", NT_STATUS_CANT_WAIT}, + {"NT_STATUS_PIPE_EMPTY", NT_STATUS_PIPE_EMPTY}, + {"NT_STATUS_CANT_ACCESS_DOMAIN_INFO", + NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, + {"NT_STATUS_CANT_TERMINATE_SELF", NT_STATUS_CANT_TERMINATE_SELF}, + {"NT_STATUS_INVALID_SERVER_STATE", NT_STATUS_INVALID_SERVER_STATE}, + {"NT_STATUS_INVALID_DOMAIN_STATE", NT_STATUS_INVALID_DOMAIN_STATE}, + {"NT_STATUS_INVALID_DOMAIN_ROLE", NT_STATUS_INVALID_DOMAIN_ROLE}, + {"NT_STATUS_NO_SUCH_DOMAIN", NT_STATUS_NO_SUCH_DOMAIN}, + {"NT_STATUS_DOMAIN_EXISTS", NT_STATUS_DOMAIN_EXISTS}, + {"NT_STATUS_DOMAIN_LIMIT_EXCEEDED", + NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, + {"NT_STATUS_OPLOCK_NOT_GRANTED", NT_STATUS_OPLOCK_NOT_GRANTED}, + {"NT_STATUS_INVALID_OPLOCK_PROTOCOL", + NT_STATUS_INVALID_OPLOCK_PROTOCOL}, + {"NT_STATUS_INTERNAL_DB_CORRUPTION", + NT_STATUS_INTERNAL_DB_CORRUPTION}, + {"NT_STATUS_INTERNAL_ERROR", NT_STATUS_INTERNAL_ERROR}, + {"NT_STATUS_GENERIC_NOT_MAPPED", NT_STATUS_GENERIC_NOT_MAPPED}, + {"NT_STATUS_BAD_DESCRIPTOR_FORMAT", + NT_STATUS_BAD_DESCRIPTOR_FORMAT}, + {"NT_STATUS_INVALID_USER_BUFFER", NT_STATUS_INVALID_USER_BUFFER}, + {"NT_STATUS_UNEXPECTED_IO_ERROR", NT_STATUS_UNEXPECTED_IO_ERROR}, + {"NT_STATUS_UNEXPECTED_MM_CREATE_ERR", + NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, + {"NT_STATUS_UNEXPECTED_MM_MAP_ERROR", + NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, + {"NT_STATUS_UNEXPECTED_MM_EXTEND_ERR", + NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, + {"NT_STATUS_NOT_LOGON_PROCESS", NT_STATUS_NOT_LOGON_PROCESS}, + {"NT_STATUS_LOGON_SESSION_EXISTS", NT_STATUS_LOGON_SESSION_EXISTS}, + {"NT_STATUS_INVALID_PARAMETER_1", NT_STATUS_INVALID_PARAMETER_1}, + {"NT_STATUS_INVALID_PARAMETER_2", NT_STATUS_INVALID_PARAMETER_2}, + {"NT_STATUS_INVALID_PARAMETER_3", NT_STATUS_INVALID_PARAMETER_3}, + {"NT_STATUS_INVALID_PARAMETER_4", NT_STATUS_INVALID_PARAMETER_4}, + {"NT_STATUS_INVALID_PARAMETER_5", NT_STATUS_INVALID_PARAMETER_5}, + {"NT_STATUS_INVALID_PARAMETER_6", NT_STATUS_INVALID_PARAMETER_6}, + {"NT_STATUS_INVALID_PARAMETER_7", NT_STATUS_INVALID_PARAMETER_7}, + {"NT_STATUS_INVALID_PARAMETER_8", NT_STATUS_INVALID_PARAMETER_8}, + {"NT_STATUS_INVALID_PARAMETER_9", NT_STATUS_INVALID_PARAMETER_9}, + {"NT_STATUS_INVALID_PARAMETER_10", 
NT_STATUS_INVALID_PARAMETER_10}, + {"NT_STATUS_INVALID_PARAMETER_11", NT_STATUS_INVALID_PARAMETER_11}, + {"NT_STATUS_INVALID_PARAMETER_12", NT_STATUS_INVALID_PARAMETER_12}, + {"NT_STATUS_REDIRECTOR_NOT_STARTED", + NT_STATUS_REDIRECTOR_NOT_STARTED}, + {"NT_STATUS_REDIRECTOR_STARTED", NT_STATUS_REDIRECTOR_STARTED}, + {"NT_STATUS_STACK_OVERFLOW", NT_STATUS_STACK_OVERFLOW}, + {"NT_STATUS_NO_SUCH_PACKAGE", NT_STATUS_NO_SUCH_PACKAGE}, + {"NT_STATUS_BAD_FUNCTION_TABLE", NT_STATUS_BAD_FUNCTION_TABLE}, + {"NT_STATUS_DIRECTORY_NOT_EMPTY", NT_STATUS_DIRECTORY_NOT_EMPTY}, + {"NT_STATUS_FILE_CORRUPT_ERROR", NT_STATUS_FILE_CORRUPT_ERROR}, + {"NT_STATUS_NOT_A_DIRECTORY", NT_STATUS_NOT_A_DIRECTORY}, + {"NT_STATUS_BAD_LOGON_SESSION_STATE", + NT_STATUS_BAD_LOGON_SESSION_STATE}, + {"NT_STATUS_LOGON_SESSION_COLLISION", + NT_STATUS_LOGON_SESSION_COLLISION}, + {"NT_STATUS_NAME_TOO_LONG", NT_STATUS_NAME_TOO_LONG}, + {"NT_STATUS_FILES_OPEN", NT_STATUS_FILES_OPEN}, + {"NT_STATUS_CONNECTION_IN_USE", NT_STATUS_CONNECTION_IN_USE}, + {"NT_STATUS_MESSAGE_NOT_FOUND", NT_STATUS_MESSAGE_NOT_FOUND}, + {"NT_STATUS_PROCESS_IS_TERMINATING", + NT_STATUS_PROCESS_IS_TERMINATING}, + {"NT_STATUS_INVALID_LOGON_TYPE", NT_STATUS_INVALID_LOGON_TYPE}, + {"NT_STATUS_NO_GUID_TRANSLATION", NT_STATUS_NO_GUID_TRANSLATION}, + {"NT_STATUS_CANNOT_IMPERSONATE", NT_STATUS_CANNOT_IMPERSONATE}, + {"NT_STATUS_IMAGE_ALREADY_LOADED", NT_STATUS_IMAGE_ALREADY_LOADED}, + {"NT_STATUS_ABIOS_NOT_PRESENT", NT_STATUS_ABIOS_NOT_PRESENT}, + {"NT_STATUS_ABIOS_LID_NOT_EXIST", NT_STATUS_ABIOS_LID_NOT_EXIST}, + {"NT_STATUS_ABIOS_LID_ALREADY_OWNED", + NT_STATUS_ABIOS_LID_ALREADY_OWNED}, + {"NT_STATUS_ABIOS_NOT_LID_OWNER", NT_STATUS_ABIOS_NOT_LID_OWNER}, + {"NT_STATUS_ABIOS_INVALID_COMMAND", + NT_STATUS_ABIOS_INVALID_COMMAND}, + {"NT_STATUS_ABIOS_INVALID_LID", NT_STATUS_ABIOS_INVALID_LID}, + {"NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE", + NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, + {"NT_STATUS_ABIOS_INVALID_SELECTOR", + NT_STATUS_ABIOS_INVALID_SELECTOR}, + {"NT_STATUS_NO_LDT", NT_STATUS_NO_LDT}, + {"NT_STATUS_INVALID_LDT_SIZE", NT_STATUS_INVALID_LDT_SIZE}, + {"NT_STATUS_INVALID_LDT_OFFSET", NT_STATUS_INVALID_LDT_OFFSET}, + {"NT_STATUS_INVALID_LDT_DESCRIPTOR", + NT_STATUS_INVALID_LDT_DESCRIPTOR}, + {"NT_STATUS_INVALID_IMAGE_NE_FORMAT", + NT_STATUS_INVALID_IMAGE_NE_FORMAT}, + {"NT_STATUS_RXACT_INVALID_STATE", NT_STATUS_RXACT_INVALID_STATE}, + {"NT_STATUS_RXACT_COMMIT_FAILURE", NT_STATUS_RXACT_COMMIT_FAILURE}, + {"NT_STATUS_MAPPED_FILE_SIZE_ZERO", + NT_STATUS_MAPPED_FILE_SIZE_ZERO}, + {"NT_STATUS_TOO_MANY_OPENED_FILES", + NT_STATUS_TOO_MANY_OPENED_FILES}, + {"NT_STATUS_CANCELLED", NT_STATUS_CANCELLED}, + {"NT_STATUS_CANNOT_DELETE", NT_STATUS_CANNOT_DELETE}, + {"NT_STATUS_INVALID_COMPUTER_NAME", + NT_STATUS_INVALID_COMPUTER_NAME}, + {"NT_STATUS_FILE_DELETED", NT_STATUS_FILE_DELETED}, + {"NT_STATUS_SPECIAL_ACCOUNT", NT_STATUS_SPECIAL_ACCOUNT}, + {"NT_STATUS_SPECIAL_GROUP", NT_STATUS_SPECIAL_GROUP}, + {"NT_STATUS_SPECIAL_USER", NT_STATUS_SPECIAL_USER}, + {"NT_STATUS_MEMBERS_PRIMARY_GROUP", + NT_STATUS_MEMBERS_PRIMARY_GROUP}, + {"NT_STATUS_FILE_CLOSED", NT_STATUS_FILE_CLOSED}, + {"NT_STATUS_TOO_MANY_THREADS", NT_STATUS_TOO_MANY_THREADS}, + {"NT_STATUS_THREAD_NOT_IN_PROCESS", + NT_STATUS_THREAD_NOT_IN_PROCESS}, + {"NT_STATUS_TOKEN_ALREADY_IN_USE", NT_STATUS_TOKEN_ALREADY_IN_USE}, + {"NT_STATUS_PAGEFILE_QUOTA_EXCEEDED", + NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, + {"NT_STATUS_COMMITMENT_LIMIT", NT_STATUS_COMMITMENT_LIMIT}, + {"NT_STATUS_INVALID_IMAGE_LE_FORMAT", + 
NT_STATUS_INVALID_IMAGE_LE_FORMAT}, + {"NT_STATUS_INVALID_IMAGE_NOT_MZ", NT_STATUS_INVALID_IMAGE_NOT_MZ}, + {"NT_STATUS_INVALID_IMAGE_PROTECT", + NT_STATUS_INVALID_IMAGE_PROTECT}, + {"NT_STATUS_INVALID_IMAGE_WIN_16", NT_STATUS_INVALID_IMAGE_WIN_16}, + {"NT_STATUS_LOGON_SERVER_CONFLICT", + NT_STATUS_LOGON_SERVER_CONFLICT}, + {"NT_STATUS_TIME_DIFFERENCE_AT_DC", + NT_STATUS_TIME_DIFFERENCE_AT_DC}, + {"NT_STATUS_SYNCHRONIZATION_REQUIRED", + NT_STATUS_SYNCHRONIZATION_REQUIRED}, + {"NT_STATUS_DLL_NOT_FOUND", NT_STATUS_DLL_NOT_FOUND}, + {"NT_STATUS_OPEN_FAILED", NT_STATUS_OPEN_FAILED}, + {"NT_STATUS_IO_PRIVILEGE_FAILED", NT_STATUS_IO_PRIVILEGE_FAILED}, + {"NT_STATUS_ORDINAL_NOT_FOUND", NT_STATUS_ORDINAL_NOT_FOUND}, + {"NT_STATUS_ENTRYPOINT_NOT_FOUND", NT_STATUS_ENTRYPOINT_NOT_FOUND}, + {"NT_STATUS_CONTROL_C_EXIT", NT_STATUS_CONTROL_C_EXIT}, + {"NT_STATUS_LOCAL_DISCONNECT", NT_STATUS_LOCAL_DISCONNECT}, + {"NT_STATUS_REMOTE_DISCONNECT", NT_STATUS_REMOTE_DISCONNECT}, + {"NT_STATUS_REMOTE_RESOURCES", NT_STATUS_REMOTE_RESOURCES}, + {"NT_STATUS_LINK_FAILED", NT_STATUS_LINK_FAILED}, + {"NT_STATUS_LINK_TIMEOUT", NT_STATUS_LINK_TIMEOUT}, + {"NT_STATUS_INVALID_CONNECTION", NT_STATUS_INVALID_CONNECTION}, + {"NT_STATUS_INVALID_ADDRESS", NT_STATUS_INVALID_ADDRESS}, + {"NT_STATUS_DLL_INIT_FAILED", NT_STATUS_DLL_INIT_FAILED}, + {"NT_STATUS_MISSING_SYSTEMFILE", NT_STATUS_MISSING_SYSTEMFILE}, + {"NT_STATUS_UNHANDLED_EXCEPTION", NT_STATUS_UNHANDLED_EXCEPTION}, + {"NT_STATUS_APP_INIT_FAILURE", NT_STATUS_APP_INIT_FAILURE}, + {"NT_STATUS_PAGEFILE_CREATE_FAILED", + NT_STATUS_PAGEFILE_CREATE_FAILED}, + {"NT_STATUS_NO_PAGEFILE", NT_STATUS_NO_PAGEFILE}, + {"NT_STATUS_INVALID_LEVEL", NT_STATUS_INVALID_LEVEL}, + {"NT_STATUS_WRONG_PASSWORD_CORE", NT_STATUS_WRONG_PASSWORD_CORE}, + {"NT_STATUS_ILLEGAL_FLOAT_CONTEXT", + NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, + {"NT_STATUS_PIPE_BROKEN", NT_STATUS_PIPE_BROKEN}, + {"NT_STATUS_REGISTRY_CORRUPT", NT_STATUS_REGISTRY_CORRUPT}, + {"NT_STATUS_REGISTRY_IO_FAILED", NT_STATUS_REGISTRY_IO_FAILED}, + {"NT_STATUS_NO_EVENT_PAIR", NT_STATUS_NO_EVENT_PAIR}, + {"NT_STATUS_UNRECOGNIZED_VOLUME", NT_STATUS_UNRECOGNIZED_VOLUME}, + {"NT_STATUS_SERIAL_NO_DEVICE_INITED", + NT_STATUS_SERIAL_NO_DEVICE_INITED}, + {"NT_STATUS_NO_SUCH_ALIAS", NT_STATUS_NO_SUCH_ALIAS}, + {"NT_STATUS_MEMBER_NOT_IN_ALIAS", NT_STATUS_MEMBER_NOT_IN_ALIAS}, + {"NT_STATUS_MEMBER_IN_ALIAS", NT_STATUS_MEMBER_IN_ALIAS}, + {"NT_STATUS_ALIAS_EXISTS", NT_STATUS_ALIAS_EXISTS}, + {"NT_STATUS_LOGON_NOT_GRANTED", NT_STATUS_LOGON_NOT_GRANTED}, + {"NT_STATUS_TOO_MANY_SECRETS", NT_STATUS_TOO_MANY_SECRETS}, + {"NT_STATUS_SECRET_TOO_LONG", NT_STATUS_SECRET_TOO_LONG}, + {"NT_STATUS_INTERNAL_DB_ERROR", NT_STATUS_INTERNAL_DB_ERROR}, + {"NT_STATUS_FULLSCREEN_MODE", NT_STATUS_FULLSCREEN_MODE}, + {"NT_STATUS_TOO_MANY_CONTEXT_IDS", NT_STATUS_TOO_MANY_CONTEXT_IDS}, + {"NT_STATUS_LOGON_TYPE_NOT_GRANTED", + NT_STATUS_LOGON_TYPE_NOT_GRANTED}, + {"NT_STATUS_NOT_REGISTRY_FILE", NT_STATUS_NOT_REGISTRY_FILE}, + {"NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED", + NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, + {"NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR", + NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, + {"NT_STATUS_FT_MISSING_MEMBER", NT_STATUS_FT_MISSING_MEMBER}, + {"NT_STATUS_ILL_FORMED_SERVICE_ENTRY", + NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, + {"NT_STATUS_ILLEGAL_CHARACTER", NT_STATUS_ILLEGAL_CHARACTER}, + {"NT_STATUS_UNMAPPABLE_CHARACTER", NT_STATUS_UNMAPPABLE_CHARACTER}, + {"NT_STATUS_UNDEFINED_CHARACTER", NT_STATUS_UNDEFINED_CHARACTER}, + {"NT_STATUS_FLOPPY_VOLUME", 
NT_STATUS_FLOPPY_VOLUME}, + {"NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND", + NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, + {"NT_STATUS_FLOPPY_WRONG_CYLINDER", + NT_STATUS_FLOPPY_WRONG_CYLINDER}, + {"NT_STATUS_FLOPPY_UNKNOWN_ERROR", NT_STATUS_FLOPPY_UNKNOWN_ERROR}, + {"NT_STATUS_FLOPPY_BAD_REGISTERS", NT_STATUS_FLOPPY_BAD_REGISTERS}, + {"NT_STATUS_DISK_RECALIBRATE_FAILED", + NT_STATUS_DISK_RECALIBRATE_FAILED}, + {"NT_STATUS_DISK_OPERATION_FAILED", + NT_STATUS_DISK_OPERATION_FAILED}, + {"NT_STATUS_DISK_RESET_FAILED", NT_STATUS_DISK_RESET_FAILED}, + {"NT_STATUS_SHARED_IRQ_BUSY", NT_STATUS_SHARED_IRQ_BUSY}, + {"NT_STATUS_FT_ORPHANING", NT_STATUS_FT_ORPHANING}, + {"NT_STATUS_PARTITION_FAILURE", NT_STATUS_PARTITION_FAILURE}, + {"NT_STATUS_INVALID_BLOCK_LENGTH", NT_STATUS_INVALID_BLOCK_LENGTH}, + {"NT_STATUS_DEVICE_NOT_PARTITIONED", + NT_STATUS_DEVICE_NOT_PARTITIONED}, + {"NT_STATUS_UNABLE_TO_LOCK_MEDIA", NT_STATUS_UNABLE_TO_LOCK_MEDIA}, + {"NT_STATUS_UNABLE_TO_UNLOAD_MEDIA", + NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, + {"NT_STATUS_EOM_OVERFLOW", NT_STATUS_EOM_OVERFLOW}, + {"NT_STATUS_NO_MEDIA", NT_STATUS_NO_MEDIA}, + {"NT_STATUS_NO_SUCH_MEMBER", NT_STATUS_NO_SUCH_MEMBER}, + {"NT_STATUS_INVALID_MEMBER", NT_STATUS_INVALID_MEMBER}, + {"NT_STATUS_KEY_DELETED", NT_STATUS_KEY_DELETED}, + {"NT_STATUS_NO_LOG_SPACE", NT_STATUS_NO_LOG_SPACE}, + {"NT_STATUS_TOO_MANY_SIDS", NT_STATUS_TOO_MANY_SIDS}, + {"NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED", + NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, + {"NT_STATUS_KEY_HAS_CHILDREN", NT_STATUS_KEY_HAS_CHILDREN}, + {"NT_STATUS_CHILD_MUST_BE_VOLATILE", + NT_STATUS_CHILD_MUST_BE_VOLATILE}, + {"NT_STATUS_DEVICE_CONFIGURATION_ERROR", + NT_STATUS_DEVICE_CONFIGURATION_ERROR}, + {"NT_STATUS_DRIVER_INTERNAL_ERROR", + NT_STATUS_DRIVER_INTERNAL_ERROR}, + {"NT_STATUS_INVALID_DEVICE_STATE", NT_STATUS_INVALID_DEVICE_STATE}, + {"NT_STATUS_IO_DEVICE_ERROR", NT_STATUS_IO_DEVICE_ERROR}, + {"NT_STATUS_DEVICE_PROTOCOL_ERROR", + NT_STATUS_DEVICE_PROTOCOL_ERROR}, + {"NT_STATUS_BACKUP_CONTROLLER", NT_STATUS_BACKUP_CONTROLLER}, + {"NT_STATUS_LOG_FILE_FULL", NT_STATUS_LOG_FILE_FULL}, + {"NT_STATUS_TOO_LATE", NT_STATUS_TOO_LATE}, + {"NT_STATUS_NO_TRUST_LSA_SECRET", NT_STATUS_NO_TRUST_LSA_SECRET}, + {"NT_STATUS_NO_TRUST_SAM_ACCOUNT", NT_STATUS_NO_TRUST_SAM_ACCOUNT}, + {"NT_STATUS_TRUSTED_DOMAIN_FAILURE", + NT_STATUS_TRUSTED_DOMAIN_FAILURE}, + {"NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE", + NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, + {"NT_STATUS_EVENTLOG_FILE_CORRUPT", + NT_STATUS_EVENTLOG_FILE_CORRUPT}, + {"NT_STATUS_EVENTLOG_CANT_START", NT_STATUS_EVENTLOG_CANT_START}, + {"NT_STATUS_TRUST_FAILURE", NT_STATUS_TRUST_FAILURE}, + {"NT_STATUS_MUTANT_LIMIT_EXCEEDED", + NT_STATUS_MUTANT_LIMIT_EXCEEDED}, + {"NT_STATUS_NETLOGON_NOT_STARTED", NT_STATUS_NETLOGON_NOT_STARTED}, + {"NT_STATUS_ACCOUNT_EXPIRED", NT_STATUS_ACCOUNT_EXPIRED}, + {"NT_STATUS_POSSIBLE_DEADLOCK", NT_STATUS_POSSIBLE_DEADLOCK}, + {"NT_STATUS_NETWORK_CREDENTIAL_CONFLICT", + NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, + {"NT_STATUS_REMOTE_SESSION_LIMIT", NT_STATUS_REMOTE_SESSION_LIMIT}, + {"NT_STATUS_EVENTLOG_FILE_CHANGED", + NT_STATUS_EVENTLOG_FILE_CHANGED}, + {"NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, + {"NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, + {"NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, + {"NT_STATUS_DOMAIN_TRUST_INCONSISTENT", + NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, + {"NT_STATUS_FS_DRIVER_REQUIRED", 
NT_STATUS_FS_DRIVER_REQUIRED}, + {"NT_STATUS_NO_USER_SESSION_KEY", NT_STATUS_NO_USER_SESSION_KEY}, + {"NT_STATUS_USER_SESSION_DELETED", NT_STATUS_USER_SESSION_DELETED}, + {"NT_STATUS_RESOURCE_LANG_NOT_FOUND", + NT_STATUS_RESOURCE_LANG_NOT_FOUND}, + {"NT_STATUS_INSUFF_SERVER_RESOURCES", + NT_STATUS_INSUFF_SERVER_RESOURCES}, + {"NT_STATUS_INVALID_BUFFER_SIZE", NT_STATUS_INVALID_BUFFER_SIZE}, + {"NT_STATUS_INVALID_ADDRESS_COMPONENT", + NT_STATUS_INVALID_ADDRESS_COMPONENT}, + {"NT_STATUS_INVALID_ADDRESS_WILDCARD", + NT_STATUS_INVALID_ADDRESS_WILDCARD}, + {"NT_STATUS_TOO_MANY_ADDRESSES", NT_STATUS_TOO_MANY_ADDRESSES}, + {"NT_STATUS_ADDRESS_ALREADY_EXISTS", + NT_STATUS_ADDRESS_ALREADY_EXISTS}, + {"NT_STATUS_ADDRESS_CLOSED", NT_STATUS_ADDRESS_CLOSED}, + {"NT_STATUS_CONNECTION_DISCONNECTED", + NT_STATUS_CONNECTION_DISCONNECTED}, + {"NT_STATUS_CONNECTION_RESET", NT_STATUS_CONNECTION_RESET}, + {"NT_STATUS_TOO_MANY_NODES", NT_STATUS_TOO_MANY_NODES}, + {"NT_STATUS_TRANSACTION_ABORTED", NT_STATUS_TRANSACTION_ABORTED}, + {"NT_STATUS_TRANSACTION_TIMED_OUT", + NT_STATUS_TRANSACTION_TIMED_OUT}, + {"NT_STATUS_TRANSACTION_NO_RELEASE", + NT_STATUS_TRANSACTION_NO_RELEASE}, + {"NT_STATUS_TRANSACTION_NO_MATCH", NT_STATUS_TRANSACTION_NO_MATCH}, + {"NT_STATUS_TRANSACTION_RESPONDED", + NT_STATUS_TRANSACTION_RESPONDED}, + {"NT_STATUS_TRANSACTION_INVALID_ID", + NT_STATUS_TRANSACTION_INVALID_ID}, + {"NT_STATUS_TRANSACTION_INVALID_TYPE", + NT_STATUS_TRANSACTION_INVALID_TYPE}, + {"NT_STATUS_NOT_SERVER_SESSION", NT_STATUS_NOT_SERVER_SESSION}, + {"NT_STATUS_NOT_CLIENT_SESSION", NT_STATUS_NOT_CLIENT_SESSION}, + {"NT_STATUS_CANNOT_LOAD_REGISTRY_FILE", + NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, + {"NT_STATUS_DEBUG_ATTACH_FAILED", NT_STATUS_DEBUG_ATTACH_FAILED}, + {"NT_STATUS_SYSTEM_PROCESS_TERMINATED", + NT_STATUS_SYSTEM_PROCESS_TERMINATED}, + {"NT_STATUS_DATA_NOT_ACCEPTED", NT_STATUS_DATA_NOT_ACCEPTED}, + {"NT_STATUS_NO_BROWSER_SERVERS_FOUND", + NT_STATUS_NO_BROWSER_SERVERS_FOUND}, + {"NT_STATUS_VDM_HARD_ERROR", NT_STATUS_VDM_HARD_ERROR}, + {"NT_STATUS_DRIVER_CANCEL_TIMEOUT", + NT_STATUS_DRIVER_CANCEL_TIMEOUT}, + {"NT_STATUS_REPLY_MESSAGE_MISMATCH", + NT_STATUS_REPLY_MESSAGE_MISMATCH}, + {"NT_STATUS_MAPPED_ALIGNMENT", NT_STATUS_MAPPED_ALIGNMENT}, + {"NT_STATUS_IMAGE_CHECKSUM_MISMATCH", + NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, + {"NT_STATUS_LOST_WRITEBEHIND_DATA", + NT_STATUS_LOST_WRITEBEHIND_DATA}, + {"NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID", + NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, + {"NT_STATUS_PASSWORD_MUST_CHANGE", NT_STATUS_PASSWORD_MUST_CHANGE}, + {"NT_STATUS_NOT_FOUND", NT_STATUS_NOT_FOUND}, + {"NT_STATUS_NOT_TINY_STREAM", NT_STATUS_NOT_TINY_STREAM}, + {"NT_STATUS_RECOVERY_FAILURE", NT_STATUS_RECOVERY_FAILURE}, + {"NT_STATUS_STACK_OVERFLOW_READ", NT_STATUS_STACK_OVERFLOW_READ}, + {"NT_STATUS_FAIL_CHECK", NT_STATUS_FAIL_CHECK}, + {"NT_STATUS_DUPLICATE_OBJECTID", NT_STATUS_DUPLICATE_OBJECTID}, + {"NT_STATUS_OBJECTID_EXISTS", NT_STATUS_OBJECTID_EXISTS}, + {"NT_STATUS_CONVERT_TO_LARGE", NT_STATUS_CONVERT_TO_LARGE}, + {"NT_STATUS_RETRY", NT_STATUS_RETRY}, + {"NT_STATUS_FOUND_OUT_OF_SCOPE", NT_STATUS_FOUND_OUT_OF_SCOPE}, + {"NT_STATUS_ALLOCATE_BUCKET", NT_STATUS_ALLOCATE_BUCKET}, + {"NT_STATUS_PROPSET_NOT_FOUND", NT_STATUS_PROPSET_NOT_FOUND}, + {"NT_STATUS_MARSHALL_OVERFLOW", NT_STATUS_MARSHALL_OVERFLOW}, + {"NT_STATUS_INVALID_VARIANT", NT_STATUS_INVALID_VARIANT}, + {"NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND", + NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, + {"NT_STATUS_ACCOUNT_LOCKED_OUT", NT_STATUS_ACCOUNT_LOCKED_OUT}, + 
{"NT_STATUS_HANDLE_NOT_CLOSABLE", NT_STATUS_HANDLE_NOT_CLOSABLE}, + {"NT_STATUS_CONNECTION_REFUSED", NT_STATUS_CONNECTION_REFUSED}, + {"NT_STATUS_GRACEFUL_DISCONNECT", NT_STATUS_GRACEFUL_DISCONNECT}, + {"NT_STATUS_ADDRESS_ALREADY_ASSOCIATED", + NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, + {"NT_STATUS_ADDRESS_NOT_ASSOCIATED", + NT_STATUS_ADDRESS_NOT_ASSOCIATED}, + {"NT_STATUS_CONNECTION_INVALID", NT_STATUS_CONNECTION_INVALID}, + {"NT_STATUS_CONNECTION_ACTIVE", NT_STATUS_CONNECTION_ACTIVE}, + {"NT_STATUS_NETWORK_UNREACHABLE", NT_STATUS_NETWORK_UNREACHABLE}, + {"NT_STATUS_HOST_UNREACHABLE", NT_STATUS_HOST_UNREACHABLE}, + {"NT_STATUS_PROTOCOL_UNREACHABLE", NT_STATUS_PROTOCOL_UNREACHABLE}, + {"NT_STATUS_PORT_UNREACHABLE", NT_STATUS_PORT_UNREACHABLE}, + {"NT_STATUS_REQUEST_ABORTED", NT_STATUS_REQUEST_ABORTED}, + {"NT_STATUS_CONNECTION_ABORTED", NT_STATUS_CONNECTION_ABORTED}, + {"NT_STATUS_BAD_COMPRESSION_BUFFER", + NT_STATUS_BAD_COMPRESSION_BUFFER}, + {"NT_STATUS_USER_MAPPED_FILE", NT_STATUS_USER_MAPPED_FILE}, + {"NT_STATUS_AUDIT_FAILED", NT_STATUS_AUDIT_FAILED}, + {"NT_STATUS_TIMER_RESOLUTION_NOT_SET", + NT_STATUS_TIMER_RESOLUTION_NOT_SET}, + {"NT_STATUS_CONNECTION_COUNT_LIMIT", + NT_STATUS_CONNECTION_COUNT_LIMIT}, + {"NT_STATUS_LOGIN_TIME_RESTRICTION", + NT_STATUS_LOGIN_TIME_RESTRICTION}, + {"NT_STATUS_LOGIN_WKSTA_RESTRICTION", + NT_STATUS_LOGIN_WKSTA_RESTRICTION}, + {"NT_STATUS_IMAGE_MP_UP_MISMATCH", NT_STATUS_IMAGE_MP_UP_MISMATCH}, + {"NT_STATUS_INSUFFICIENT_LOGON_INFO", + NT_STATUS_INSUFFICIENT_LOGON_INFO}, + {"NT_STATUS_BAD_DLL_ENTRYPOINT", NT_STATUS_BAD_DLL_ENTRYPOINT}, + {"NT_STATUS_BAD_SERVICE_ENTRYPOINT", + NT_STATUS_BAD_SERVICE_ENTRYPOINT}, + {"NT_STATUS_LPC_REPLY_LOST", NT_STATUS_LPC_REPLY_LOST}, + {"NT_STATUS_IP_ADDRESS_CONFLICT1", NT_STATUS_IP_ADDRESS_CONFLICT1}, + {"NT_STATUS_IP_ADDRESS_CONFLICT2", NT_STATUS_IP_ADDRESS_CONFLICT2}, + {"NT_STATUS_REGISTRY_QUOTA_LIMIT", NT_STATUS_REGISTRY_QUOTA_LIMIT}, + {"NT_STATUS_PATH_NOT_COVERED", NT_STATUS_PATH_NOT_COVERED}, + {"NT_STATUS_NO_CALLBACK_ACTIVE", NT_STATUS_NO_CALLBACK_ACTIVE}, + {"NT_STATUS_LICENSE_QUOTA_EXCEEDED", + NT_STATUS_LICENSE_QUOTA_EXCEEDED}, + {"NT_STATUS_PWD_TOO_SHORT", NT_STATUS_PWD_TOO_SHORT}, + {"NT_STATUS_PWD_TOO_RECENT", NT_STATUS_PWD_TOO_RECENT}, + {"NT_STATUS_PWD_HISTORY_CONFLICT", NT_STATUS_PWD_HISTORY_CONFLICT}, + {"NT_STATUS_PLUGPLAY_NO_DEVICE", NT_STATUS_PLUGPLAY_NO_DEVICE}, + {"NT_STATUS_UNSUPPORTED_COMPRESSION", + NT_STATUS_UNSUPPORTED_COMPRESSION}, + {"NT_STATUS_INVALID_HW_PROFILE", NT_STATUS_INVALID_HW_PROFILE}, + {"NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH", + NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, + {"NT_STATUS_DRIVER_ORDINAL_NOT_FOUND", + NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, + {"NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND", + NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, + {"NT_STATUS_RESOURCE_NOT_OWNED", NT_STATUS_RESOURCE_NOT_OWNED}, + {"NT_STATUS_TOO_MANY_LINKS", NT_STATUS_TOO_MANY_LINKS}, + {"NT_STATUS_QUOTA_LIST_INCONSISTENT", + NT_STATUS_QUOTA_LIST_INCONSISTENT}, + {"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE}, + {"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES}, + {"STATUS_MORE_ENTRIES", STATUS_MORE_ENTRIES}, + {"STATUS_SOME_UNMAPPED", STATUS_SOME_UNMAPPED}, + {NULL, 0} +}; diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h new file mode 100644 index 00000000..25726736 --- /dev/null +++ b/fs/cifs/nterr.h @@ -0,0 +1,561 @@ +/* + Unix SMB/Netbios implementation. + Version 1.9. 
+ NT error code constants + Copyright (C) Andrew Tridgell 1992-2000 + Copyright (C) John H Terpstra 1996-2000 + Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + Copyright (C) Paul Ashton 1998-2000 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + + + +#ifndef _NTERR_H +#define _NTERR_H + +struct nt_err_code_struct { + char *nt_errstr; + __u32 nt_errcode; +}; + +extern const struct nt_err_code_struct nt_errs[]; + +/* Win32 Status codes. */ +#define STATUS_MORE_ENTRIES 0x0105 +#define ERROR_INVALID_PARAMETER 0x0057 +#define ERROR_INSUFFICIENT_BUFFER 0x007a +#define STATUS_1804 0x070c +#define STATUS_NOTIFY_ENUM_DIR 0x010c + +/* Win32 Error codes extracted using a loop in smbclient then printing a + netmon sniff to a file. */ + +#define NT_STATUS_OK 0x0000 +#define STATUS_SOME_UNMAPPED 0x0107 +#define STATUS_BUFFER_OVERFLOW 0x80000005 +#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a +#define NT_STATUS_MEDIA_CHANGED 0x8000001c +#define NT_STATUS_END_OF_MEDIA 0x8000001e +#define NT_STATUS_MEDIA_CHECK 0x80000020 +#define NT_STATUS_NO_DATA_DETECTED 0x8000001c +#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d +#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288 +#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288 +#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001 +#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002 +#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003 +#define NT_STATUS_INFO_LENGTH_MISMATCH 0xC0000000 | 0x0004 +#define NT_STATUS_ACCESS_VIOLATION 0xC0000000 | 0x0005 +#define NT_STATUS_IN_PAGE_ERROR 0xC0000000 | 0x0006 +#define NT_STATUS_PAGEFILE_QUOTA 0xC0000000 | 0x0007 +#define NT_STATUS_INVALID_HANDLE 0xC0000000 | 0x0008 +#define NT_STATUS_BAD_INITIAL_STACK 0xC0000000 | 0x0009 +#define NT_STATUS_BAD_INITIAL_PC 0xC0000000 | 0x000a +#define NT_STATUS_INVALID_CID 0xC0000000 | 0x000b +#define NT_STATUS_TIMER_NOT_CANCELED 0xC0000000 | 0x000c +#define NT_STATUS_INVALID_PARAMETER 0xC0000000 | 0x000d +#define NT_STATUS_NO_SUCH_DEVICE 0xC0000000 | 0x000e +#define NT_STATUS_NO_SUCH_FILE 0xC0000000 | 0x000f +#define NT_STATUS_INVALID_DEVICE_REQUEST 0xC0000000 | 0x0010 +#define NT_STATUS_END_OF_FILE 0xC0000000 | 0x0011 +#define NT_STATUS_WRONG_VOLUME 0xC0000000 | 0x0012 +#define NT_STATUS_NO_MEDIA_IN_DEVICE 0xC0000000 | 0x0013 +#define NT_STATUS_UNRECOGNIZED_MEDIA 0xC0000000 | 0x0014 +#define NT_STATUS_NONEXISTENT_SECTOR 0xC0000000 | 0x0015 +#define NT_STATUS_MORE_PROCESSING_REQUIRED 0xC0000000 | 0x0016 +#define NT_STATUS_NO_MEMORY 0xC0000000 | 0x0017 +#define NT_STATUS_CONFLICTING_ADDRESSES 0xC0000000 | 0x0018 +#define NT_STATUS_NOT_MAPPED_VIEW 0xC0000000 | 0x0019 +#define NT_STATUS_UNABLE_TO_FREE_VM 0x80000000 | 0x001a +#define NT_STATUS_UNABLE_TO_DELETE_SECTION 0xC0000000 | 0x001b +#define NT_STATUS_INVALID_SYSTEM_SERVICE 0xC0000000 | 0x001c +#define NT_STATUS_ILLEGAL_INSTRUCTION 0xC0000000 | 0x001d +#define NT_STATUS_INVALID_LOCK_SEQUENCE 0xC0000000 | 
0x001e +#define NT_STATUS_INVALID_VIEW_SIZE 0xC0000000 | 0x001f +#define NT_STATUS_INVALID_FILE_FOR_SECTION 0xC0000000 | 0x0020 +#define NT_STATUS_ALREADY_COMMITTED 0xC0000000 | 0x0021 +#define NT_STATUS_ACCESS_DENIED 0xC0000000 | 0x0022 +#define NT_STATUS_BUFFER_TOO_SMALL 0xC0000000 | 0x0023 +#define NT_STATUS_OBJECT_TYPE_MISMATCH 0xC0000000 | 0x0024 +#define NT_STATUS_NONCONTINUABLE_EXCEPTION 0xC0000000 | 0x0025 +#define NT_STATUS_INVALID_DISPOSITION 0xC0000000 | 0x0026 +#define NT_STATUS_UNWIND 0xC0000000 | 0x0027 +#define NT_STATUS_BAD_STACK 0xC0000000 | 0x0028 +#define NT_STATUS_INVALID_UNWIND_TARGET 0xC0000000 | 0x0029 +#define NT_STATUS_NOT_LOCKED 0xC0000000 | 0x002a +#define NT_STATUS_PARITY_ERROR 0xC0000000 | 0x002b +#define NT_STATUS_UNABLE_TO_DECOMMIT_VM 0xC0000000 | 0x002c +#define NT_STATUS_NOT_COMMITTED 0xC0000000 | 0x002d +#define NT_STATUS_INVALID_PORT_ATTRIBUTES 0xC0000000 | 0x002e +#define NT_STATUS_PORT_MESSAGE_TOO_LONG 0xC0000000 | 0x002f +#define NT_STATUS_INVALID_PARAMETER_MIX 0xC0000000 | 0x0030 +#define NT_STATUS_INVALID_QUOTA_LOWER 0xC0000000 | 0x0031 +#define NT_STATUS_DISK_CORRUPT_ERROR 0xC0000000 | 0x0032 +#define NT_STATUS_OBJECT_NAME_INVALID 0xC0000000 | 0x0033 +#define NT_STATUS_OBJECT_NAME_NOT_FOUND 0xC0000000 | 0x0034 +#define NT_STATUS_OBJECT_NAME_COLLISION 0xC0000000 | 0x0035 +#define NT_STATUS_HANDLE_NOT_WAITABLE 0xC0000000 | 0x0036 +#define NT_STATUS_PORT_DISCONNECTED 0xC0000000 | 0x0037 +#define NT_STATUS_DEVICE_ALREADY_ATTACHED 0xC0000000 | 0x0038 +#define NT_STATUS_OBJECT_PATH_INVALID 0xC0000000 | 0x0039 +#define NT_STATUS_OBJECT_PATH_NOT_FOUND 0xC0000000 | 0x003a +#define NT_STATUS_OBJECT_PATH_SYNTAX_BAD 0xC0000000 | 0x003b +#define NT_STATUS_DATA_OVERRUN 0xC0000000 | 0x003c +#define NT_STATUS_DATA_LATE_ERROR 0xC0000000 | 0x003d +#define NT_STATUS_DATA_ERROR 0xC0000000 | 0x003e +#define NT_STATUS_CRC_ERROR 0xC0000000 | 0x003f +#define NT_STATUS_SECTION_TOO_BIG 0xC0000000 | 0x0040 +#define NT_STATUS_PORT_CONNECTION_REFUSED 0xC0000000 | 0x0041 +#define NT_STATUS_INVALID_PORT_HANDLE 0xC0000000 | 0x0042 +#define NT_STATUS_SHARING_VIOLATION 0xC0000000 | 0x0043 +#define NT_STATUS_QUOTA_EXCEEDED 0xC0000000 | 0x0044 +#define NT_STATUS_INVALID_PAGE_PROTECTION 0xC0000000 | 0x0045 +#define NT_STATUS_MUTANT_NOT_OWNED 0xC0000000 | 0x0046 +#define NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED 0xC0000000 | 0x0047 +#define NT_STATUS_PORT_ALREADY_SET 0xC0000000 | 0x0048 +#define NT_STATUS_SECTION_NOT_IMAGE 0xC0000000 | 0x0049 +#define NT_STATUS_SUSPEND_COUNT_EXCEEDED 0xC0000000 | 0x004a +#define NT_STATUS_THREAD_IS_TERMINATING 0xC0000000 | 0x004b +#define NT_STATUS_BAD_WORKING_SET_LIMIT 0xC0000000 | 0x004c +#define NT_STATUS_INCOMPATIBLE_FILE_MAP 0xC0000000 | 0x004d +#define NT_STATUS_SECTION_PROTECTION 0xC0000000 | 0x004e +#define NT_STATUS_EAS_NOT_SUPPORTED 0xC0000000 | 0x004f +#define NT_STATUS_EA_TOO_LARGE 0xC0000000 | 0x0050 +#define NT_STATUS_NONEXISTENT_EA_ENTRY 0xC0000000 | 0x0051 +#define NT_STATUS_NO_EAS_ON_FILE 0xC0000000 | 0x0052 +#define NT_STATUS_EA_CORRUPT_ERROR 0xC0000000 | 0x0053 +#define NT_STATUS_FILE_LOCK_CONFLICT 0xC0000000 | 0x0054 +#define NT_STATUS_LOCK_NOT_GRANTED 0xC0000000 | 0x0055 +#define NT_STATUS_DELETE_PENDING 0xC0000000 | 0x0056 +#define NT_STATUS_CTL_FILE_NOT_SUPPORTED 0xC0000000 | 0x0057 +#define NT_STATUS_UNKNOWN_REVISION 0xC0000000 | 0x0058 +#define NT_STATUS_REVISION_MISMATCH 0xC0000000 | 0x0059 +#define NT_STATUS_INVALID_OWNER 0xC0000000 | 0x005a +#define NT_STATUS_INVALID_PRIMARY_GROUP 0xC0000000 | 0x005b +#define 
NT_STATUS_NO_IMPERSONATION_TOKEN 0xC0000000 | 0x005c +#define NT_STATUS_CANT_DISABLE_MANDATORY 0xC0000000 | 0x005d +#define NT_STATUS_NO_LOGON_SERVERS 0xC0000000 | 0x005e +#define NT_STATUS_NO_SUCH_LOGON_SESSION 0xC0000000 | 0x005f +#define NT_STATUS_NO_SUCH_PRIVILEGE 0xC0000000 | 0x0060 +#define NT_STATUS_PRIVILEGE_NOT_HELD 0xC0000000 | 0x0061 +#define NT_STATUS_INVALID_ACCOUNT_NAME 0xC0000000 | 0x0062 +#define NT_STATUS_USER_EXISTS 0xC0000000 | 0x0063 +#define NT_STATUS_NO_SUCH_USER 0xC0000000 | 0x0064 +#define NT_STATUS_GROUP_EXISTS 0xC0000000 | 0x0065 +#define NT_STATUS_NO_SUCH_GROUP 0xC0000000 | 0x0066 +#define NT_STATUS_MEMBER_IN_GROUP 0xC0000000 | 0x0067 +#define NT_STATUS_MEMBER_NOT_IN_GROUP 0xC0000000 | 0x0068 +#define NT_STATUS_LAST_ADMIN 0xC0000000 | 0x0069 +#define NT_STATUS_WRONG_PASSWORD 0xC0000000 | 0x006a +#define NT_STATUS_ILL_FORMED_PASSWORD 0xC0000000 | 0x006b +#define NT_STATUS_PASSWORD_RESTRICTION 0xC0000000 | 0x006c +#define NT_STATUS_LOGON_FAILURE 0xC0000000 | 0x006d +#define NT_STATUS_ACCOUNT_RESTRICTION 0xC0000000 | 0x006e +#define NT_STATUS_INVALID_LOGON_HOURS 0xC0000000 | 0x006f +#define NT_STATUS_INVALID_WORKSTATION 0xC0000000 | 0x0070 +#define NT_STATUS_PASSWORD_EXPIRED 0xC0000000 | 0x0071 +#define NT_STATUS_ACCOUNT_DISABLED 0xC0000000 | 0x0072 +#define NT_STATUS_NONE_MAPPED 0xC0000000 | 0x0073 +#define NT_STATUS_TOO_MANY_LUIDS_REQUESTED 0xC0000000 | 0x0074 +#define NT_STATUS_LUIDS_EXHAUSTED 0xC0000000 | 0x0075 +#define NT_STATUS_INVALID_SUB_AUTHORITY 0xC0000000 | 0x0076 +#define NT_STATUS_INVALID_ACL 0xC0000000 | 0x0077 +#define NT_STATUS_INVALID_SID 0xC0000000 | 0x0078 +#define NT_STATUS_INVALID_SECURITY_DESCR 0xC0000000 | 0x0079 +#define NT_STATUS_PROCEDURE_NOT_FOUND 0xC0000000 | 0x007a +#define NT_STATUS_INVALID_IMAGE_FORMAT 0xC0000000 | 0x007b +#define NT_STATUS_NO_TOKEN 0xC0000000 | 0x007c +#define NT_STATUS_BAD_INHERITANCE_ACL 0xC0000000 | 0x007d +#define NT_STATUS_RANGE_NOT_LOCKED 0xC0000000 | 0x007e +#define NT_STATUS_DISK_FULL 0xC0000000 | 0x007f +#define NT_STATUS_SERVER_DISABLED 0xC0000000 | 0x0080 +#define NT_STATUS_SERVER_NOT_DISABLED 0xC0000000 | 0x0081 +#define NT_STATUS_TOO_MANY_GUIDS_REQUESTED 0xC0000000 | 0x0082 +#define NT_STATUS_GUIDS_EXHAUSTED 0xC0000000 | 0x0083 +#define NT_STATUS_INVALID_ID_AUTHORITY 0xC0000000 | 0x0084 +#define NT_STATUS_AGENTS_EXHAUSTED 0xC0000000 | 0x0085 +#define NT_STATUS_INVALID_VOLUME_LABEL 0xC0000000 | 0x0086 +#define NT_STATUS_SECTION_NOT_EXTENDED 0xC0000000 | 0x0087 +#define NT_STATUS_NOT_MAPPED_DATA 0xC0000000 | 0x0088 +#define NT_STATUS_RESOURCE_DATA_NOT_FOUND 0xC0000000 | 0x0089 +#define NT_STATUS_RESOURCE_TYPE_NOT_FOUND 0xC0000000 | 0x008a +#define NT_STATUS_RESOURCE_NAME_NOT_FOUND 0xC0000000 | 0x008b +#define NT_STATUS_ARRAY_BOUNDS_EXCEEDED 0xC0000000 | 0x008c +#define NT_STATUS_FLOAT_DENORMAL_OPERAND 0xC0000000 | 0x008d +#define NT_STATUS_FLOAT_DIVIDE_BY_ZERO 0xC0000000 | 0x008e +#define NT_STATUS_FLOAT_INEXACT_RESULT 0xC0000000 | 0x008f +#define NT_STATUS_FLOAT_INVALID_OPERATION 0xC0000000 | 0x0090 +#define NT_STATUS_FLOAT_OVERFLOW 0xC0000000 | 0x0091 +#define NT_STATUS_FLOAT_STACK_CHECK 0xC0000000 | 0x0092 +#define NT_STATUS_FLOAT_UNDERFLOW 0xC0000000 | 0x0093 +#define NT_STATUS_INTEGER_DIVIDE_BY_ZERO 0xC0000000 | 0x0094 +#define NT_STATUS_INTEGER_OVERFLOW 0xC0000000 | 0x0095 +#define NT_STATUS_PRIVILEGED_INSTRUCTION 0xC0000000 | 0x0096 +#define NT_STATUS_TOO_MANY_PAGING_FILES 0xC0000000 | 0x0097 +#define NT_STATUS_FILE_INVALID 0xC0000000 | 0x0098 +#define NT_STATUS_ALLOTTED_SPACE_EXCEEDED 0xC0000000 | 
0x0099 +#define NT_STATUS_INSUFFICIENT_RESOURCES 0xC0000000 | 0x009a +#define NT_STATUS_DFS_EXIT_PATH_FOUND 0xC0000000 | 0x009b +#define NT_STATUS_DEVICE_DATA_ERROR 0xC0000000 | 0x009c +#define NT_STATUS_DEVICE_NOT_CONNECTED 0xC0000000 | 0x009d +#define NT_STATUS_DEVICE_POWER_FAILURE 0xC0000000 | 0x009e +#define NT_STATUS_FREE_VM_NOT_AT_BASE 0xC0000000 | 0x009f +#define NT_STATUS_MEMORY_NOT_ALLOCATED 0xC0000000 | 0x00a0 +#define NT_STATUS_WORKING_SET_QUOTA 0xC0000000 | 0x00a1 +#define NT_STATUS_MEDIA_WRITE_PROTECTED 0xC0000000 | 0x00a2 +#define NT_STATUS_DEVICE_NOT_READY 0xC0000000 | 0x00a3 +#define NT_STATUS_INVALID_GROUP_ATTRIBUTES 0xC0000000 | 0x00a4 +#define NT_STATUS_BAD_IMPERSONATION_LEVEL 0xC0000000 | 0x00a5 +#define NT_STATUS_CANT_OPEN_ANONYMOUS 0xC0000000 | 0x00a6 +#define NT_STATUS_BAD_VALIDATION_CLASS 0xC0000000 | 0x00a7 +#define NT_STATUS_BAD_TOKEN_TYPE 0xC0000000 | 0x00a8 +#define NT_STATUS_BAD_MASTER_BOOT_RECORD 0xC0000000 | 0x00a9 +#define NT_STATUS_INSTRUCTION_MISALIGNMENT 0xC0000000 | 0x00aa +#define NT_STATUS_INSTANCE_NOT_AVAILABLE 0xC0000000 | 0x00ab +#define NT_STATUS_PIPE_NOT_AVAILABLE 0xC0000000 | 0x00ac +#define NT_STATUS_INVALID_PIPE_STATE 0xC0000000 | 0x00ad +#define NT_STATUS_PIPE_BUSY 0xC0000000 | 0x00ae +#define NT_STATUS_ILLEGAL_FUNCTION 0xC0000000 | 0x00af +#define NT_STATUS_PIPE_DISCONNECTED 0xC0000000 | 0x00b0 +#define NT_STATUS_PIPE_CLOSING 0xC0000000 | 0x00b1 +#define NT_STATUS_PIPE_CONNECTED 0xC0000000 | 0x00b2 +#define NT_STATUS_PIPE_LISTENING 0xC0000000 | 0x00b3 +#define NT_STATUS_INVALID_READ_MODE 0xC0000000 | 0x00b4 +#define NT_STATUS_IO_TIMEOUT 0xC0000000 | 0x00b5 +#define NT_STATUS_FILE_FORCED_CLOSED 0xC0000000 | 0x00b6 +#define NT_STATUS_PROFILING_NOT_STARTED 0xC0000000 | 0x00b7 +#define NT_STATUS_PROFILING_NOT_STOPPED 0xC0000000 | 0x00b8 +#define NT_STATUS_COULD_NOT_INTERPRET 0xC0000000 | 0x00b9 +#define NT_STATUS_FILE_IS_A_DIRECTORY 0xC0000000 | 0x00ba +#define NT_STATUS_NOT_SUPPORTED 0xC0000000 | 0x00bb +#define NT_STATUS_REMOTE_NOT_LISTENING 0xC0000000 | 0x00bc +#define NT_STATUS_DUPLICATE_NAME 0xC0000000 | 0x00bd +#define NT_STATUS_BAD_NETWORK_PATH 0xC0000000 | 0x00be +#define NT_STATUS_NETWORK_BUSY 0xC0000000 | 0x00bf +#define NT_STATUS_DEVICE_DOES_NOT_EXIST 0xC0000000 | 0x00c0 +#define NT_STATUS_TOO_MANY_COMMANDS 0xC0000000 | 0x00c1 +#define NT_STATUS_ADAPTER_HARDWARE_ERROR 0xC0000000 | 0x00c2 +#define NT_STATUS_INVALID_NETWORK_RESPONSE 0xC0000000 | 0x00c3 +#define NT_STATUS_UNEXPECTED_NETWORK_ERROR 0xC0000000 | 0x00c4 +#define NT_STATUS_BAD_REMOTE_ADAPTER 0xC0000000 | 0x00c5 +#define NT_STATUS_PRINT_QUEUE_FULL 0xC0000000 | 0x00c6 +#define NT_STATUS_NO_SPOOL_SPACE 0xC0000000 | 0x00c7 +#define NT_STATUS_PRINT_CANCELLED 0xC0000000 | 0x00c8 +#define NT_STATUS_NETWORK_NAME_DELETED 0xC0000000 | 0x00c9 +#define NT_STATUS_NETWORK_ACCESS_DENIED 0xC0000000 | 0x00ca +#define NT_STATUS_BAD_DEVICE_TYPE 0xC0000000 | 0x00cb +#define NT_STATUS_BAD_NETWORK_NAME 0xC0000000 | 0x00cc +#define NT_STATUS_TOO_MANY_NAMES 0xC0000000 | 0x00cd +#define NT_STATUS_TOO_MANY_SESSIONS 0xC0000000 | 0x00ce +#define NT_STATUS_SHARING_PAUSED 0xC0000000 | 0x00cf +#define NT_STATUS_REQUEST_NOT_ACCEPTED 0xC0000000 | 0x00d0 +#define NT_STATUS_REDIRECTOR_PAUSED 0xC0000000 | 0x00d1 +#define NT_STATUS_NET_WRITE_FAULT 0xC0000000 | 0x00d2 +#define NT_STATUS_PROFILING_AT_LIMIT 0xC0000000 | 0x00d3 +#define NT_STATUS_NOT_SAME_DEVICE 0xC0000000 | 0x00d4 +#define NT_STATUS_FILE_RENAMED 0xC0000000 | 0x00d5 +#define NT_STATUS_VIRTUAL_CIRCUIT_CLOSED 0xC0000000 | 0x00d6 +#define 
NT_STATUS_NO_SECURITY_ON_OBJECT 0xC0000000 | 0x00d7 +#define NT_STATUS_CANT_WAIT 0xC0000000 | 0x00d8 +#define NT_STATUS_PIPE_EMPTY 0xC0000000 | 0x00d9 +#define NT_STATUS_CANT_ACCESS_DOMAIN_INFO 0xC0000000 | 0x00da +#define NT_STATUS_CANT_TERMINATE_SELF 0xC0000000 | 0x00db +#define NT_STATUS_INVALID_SERVER_STATE 0xC0000000 | 0x00dc +#define NT_STATUS_INVALID_DOMAIN_STATE 0xC0000000 | 0x00dd +#define NT_STATUS_INVALID_DOMAIN_ROLE 0xC0000000 | 0x00de +#define NT_STATUS_NO_SUCH_DOMAIN 0xC0000000 | 0x00df +#define NT_STATUS_DOMAIN_EXISTS 0xC0000000 | 0x00e0 +#define NT_STATUS_DOMAIN_LIMIT_EXCEEDED 0xC0000000 | 0x00e1 +#define NT_STATUS_OPLOCK_NOT_GRANTED 0xC0000000 | 0x00e2 +#define NT_STATUS_INVALID_OPLOCK_PROTOCOL 0xC0000000 | 0x00e3 +#define NT_STATUS_INTERNAL_DB_CORRUPTION 0xC0000000 | 0x00e4 +#define NT_STATUS_INTERNAL_ERROR 0xC0000000 | 0x00e5 +#define NT_STATUS_GENERIC_NOT_MAPPED 0xC0000000 | 0x00e6 +#define NT_STATUS_BAD_DESCRIPTOR_FORMAT 0xC0000000 | 0x00e7 +#define NT_STATUS_INVALID_USER_BUFFER 0xC0000000 | 0x00e8 +#define NT_STATUS_UNEXPECTED_IO_ERROR 0xC0000000 | 0x00e9 +#define NT_STATUS_UNEXPECTED_MM_CREATE_ERR 0xC0000000 | 0x00ea +#define NT_STATUS_UNEXPECTED_MM_MAP_ERROR 0xC0000000 | 0x00eb +#define NT_STATUS_UNEXPECTED_MM_EXTEND_ERR 0xC0000000 | 0x00ec +#define NT_STATUS_NOT_LOGON_PROCESS 0xC0000000 | 0x00ed +#define NT_STATUS_LOGON_SESSION_EXISTS 0xC0000000 | 0x00ee +#define NT_STATUS_INVALID_PARAMETER_1 0xC0000000 | 0x00ef +#define NT_STATUS_INVALID_PARAMETER_2 0xC0000000 | 0x00f0 +#define NT_STATUS_INVALID_PARAMETER_3 0xC0000000 | 0x00f1 +#define NT_STATUS_INVALID_PARAMETER_4 0xC0000000 | 0x00f2 +#define NT_STATUS_INVALID_PARAMETER_5 0xC0000000 | 0x00f3 +#define NT_STATUS_INVALID_PARAMETER_6 0xC0000000 | 0x00f4 +#define NT_STATUS_INVALID_PARAMETER_7 0xC0000000 | 0x00f5 +#define NT_STATUS_INVALID_PARAMETER_8 0xC0000000 | 0x00f6 +#define NT_STATUS_INVALID_PARAMETER_9 0xC0000000 | 0x00f7 +#define NT_STATUS_INVALID_PARAMETER_10 0xC0000000 | 0x00f8 +#define NT_STATUS_INVALID_PARAMETER_11 0xC0000000 | 0x00f9 +#define NT_STATUS_INVALID_PARAMETER_12 0xC0000000 | 0x00fa +#define NT_STATUS_REDIRECTOR_NOT_STARTED 0xC0000000 | 0x00fb +#define NT_STATUS_REDIRECTOR_STARTED 0xC0000000 | 0x00fc +#define NT_STATUS_STACK_OVERFLOW 0xC0000000 | 0x00fd +#define NT_STATUS_NO_SUCH_PACKAGE 0xC0000000 | 0x00fe +#define NT_STATUS_BAD_FUNCTION_TABLE 0xC0000000 | 0x00ff +#define NT_STATUS_DIRECTORY_NOT_EMPTY 0xC0000000 | 0x0101 +#define NT_STATUS_FILE_CORRUPT_ERROR 0xC0000000 | 0x0102 +#define NT_STATUS_NOT_A_DIRECTORY 0xC0000000 | 0x0103 +#define NT_STATUS_BAD_LOGON_SESSION_STATE 0xC0000000 | 0x0104 +#define NT_STATUS_LOGON_SESSION_COLLISION 0xC0000000 | 0x0105 +#define NT_STATUS_NAME_TOO_LONG 0xC0000000 | 0x0106 +#define NT_STATUS_FILES_OPEN 0xC0000000 | 0x0107 +#define NT_STATUS_CONNECTION_IN_USE 0xC0000000 | 0x0108 +#define NT_STATUS_MESSAGE_NOT_FOUND 0xC0000000 | 0x0109 +#define NT_STATUS_PROCESS_IS_TERMINATING 0xC0000000 | 0x010a +#define NT_STATUS_INVALID_LOGON_TYPE 0xC0000000 | 0x010b +#define NT_STATUS_NO_GUID_TRANSLATION 0xC0000000 | 0x010c +#define NT_STATUS_CANNOT_IMPERSONATE 0xC0000000 | 0x010d +#define NT_STATUS_IMAGE_ALREADY_LOADED 0xC0000000 | 0x010e +#define NT_STATUS_ABIOS_NOT_PRESENT 0xC0000000 | 0x010f +#define NT_STATUS_ABIOS_LID_NOT_EXIST 0xC0000000 | 0x0110 +#define NT_STATUS_ABIOS_LID_ALREADY_OWNED 0xC0000000 | 0x0111 +#define NT_STATUS_ABIOS_NOT_LID_OWNER 0xC0000000 | 0x0112 +#define NT_STATUS_ABIOS_INVALID_COMMAND 0xC0000000 | 0x0113 +#define NT_STATUS_ABIOS_INVALID_LID 
0xC0000000 | 0x0114 +#define NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE 0xC0000000 | 0x0115 +#define NT_STATUS_ABIOS_INVALID_SELECTOR 0xC0000000 | 0x0116 +#define NT_STATUS_NO_LDT 0xC0000000 | 0x0117 +#define NT_STATUS_INVALID_LDT_SIZE 0xC0000000 | 0x0118 +#define NT_STATUS_INVALID_LDT_OFFSET 0xC0000000 | 0x0119 +#define NT_STATUS_INVALID_LDT_DESCRIPTOR 0xC0000000 | 0x011a +#define NT_STATUS_INVALID_IMAGE_NE_FORMAT 0xC0000000 | 0x011b +#define NT_STATUS_RXACT_INVALID_STATE 0xC0000000 | 0x011c +#define NT_STATUS_RXACT_COMMIT_FAILURE 0xC0000000 | 0x011d +#define NT_STATUS_MAPPED_FILE_SIZE_ZERO 0xC0000000 | 0x011e +#define NT_STATUS_TOO_MANY_OPENED_FILES 0xC0000000 | 0x011f +#define NT_STATUS_CANCELLED 0xC0000000 | 0x0120 +#define NT_STATUS_CANNOT_DELETE 0xC0000000 | 0x0121 +#define NT_STATUS_INVALID_COMPUTER_NAME 0xC0000000 | 0x0122 +#define NT_STATUS_FILE_DELETED 0xC0000000 | 0x0123 +#define NT_STATUS_SPECIAL_ACCOUNT 0xC0000000 | 0x0124 +#define NT_STATUS_SPECIAL_GROUP 0xC0000000 | 0x0125 +#define NT_STATUS_SPECIAL_USER 0xC0000000 | 0x0126 +#define NT_STATUS_MEMBERS_PRIMARY_GROUP 0xC0000000 | 0x0127 +#define NT_STATUS_FILE_CLOSED 0xC0000000 | 0x0128 +#define NT_STATUS_TOO_MANY_THREADS 0xC0000000 | 0x0129 +#define NT_STATUS_THREAD_NOT_IN_PROCESS 0xC0000000 | 0x012a +#define NT_STATUS_TOKEN_ALREADY_IN_USE 0xC0000000 | 0x012b +#define NT_STATUS_PAGEFILE_QUOTA_EXCEEDED 0xC0000000 | 0x012c +#define NT_STATUS_COMMITMENT_LIMIT 0xC0000000 | 0x012d +#define NT_STATUS_INVALID_IMAGE_LE_FORMAT 0xC0000000 | 0x012e +#define NT_STATUS_INVALID_IMAGE_NOT_MZ 0xC0000000 | 0x012f +#define NT_STATUS_INVALID_IMAGE_PROTECT 0xC0000000 | 0x0130 +#define NT_STATUS_INVALID_IMAGE_WIN_16 0xC0000000 | 0x0131 +#define NT_STATUS_LOGON_SERVER_CONFLICT 0xC0000000 | 0x0132 +#define NT_STATUS_TIME_DIFFERENCE_AT_DC 0xC0000000 | 0x0133 +#define NT_STATUS_SYNCHRONIZATION_REQUIRED 0xC0000000 | 0x0134 +#define NT_STATUS_DLL_NOT_FOUND 0xC0000000 | 0x0135 +#define NT_STATUS_OPEN_FAILED 0xC0000000 | 0x0136 +#define NT_STATUS_IO_PRIVILEGE_FAILED 0xC0000000 | 0x0137 +#define NT_STATUS_ORDINAL_NOT_FOUND 0xC0000000 | 0x0138 +#define NT_STATUS_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0139 +#define NT_STATUS_CONTROL_C_EXIT 0xC0000000 | 0x013a +#define NT_STATUS_LOCAL_DISCONNECT 0xC0000000 | 0x013b +#define NT_STATUS_REMOTE_DISCONNECT 0xC0000000 | 0x013c +#define NT_STATUS_REMOTE_RESOURCES 0xC0000000 | 0x013d +#define NT_STATUS_LINK_FAILED 0xC0000000 | 0x013e +#define NT_STATUS_LINK_TIMEOUT 0xC0000000 | 0x013f +#define NT_STATUS_INVALID_CONNECTION 0xC0000000 | 0x0140 +#define NT_STATUS_INVALID_ADDRESS 0xC0000000 | 0x0141 +#define NT_STATUS_DLL_INIT_FAILED 0xC0000000 | 0x0142 +#define NT_STATUS_MISSING_SYSTEMFILE 0xC0000000 | 0x0143 +#define NT_STATUS_UNHANDLED_EXCEPTION 0xC0000000 | 0x0144 +#define NT_STATUS_APP_INIT_FAILURE 0xC0000000 | 0x0145 +#define NT_STATUS_PAGEFILE_CREATE_FAILED 0xC0000000 | 0x0146 +#define NT_STATUS_NO_PAGEFILE 0xC0000000 | 0x0147 +#define NT_STATUS_INVALID_LEVEL 0xC0000000 | 0x0148 +#define NT_STATUS_WRONG_PASSWORD_CORE 0xC0000000 | 0x0149 +#define NT_STATUS_ILLEGAL_FLOAT_CONTEXT 0xC0000000 | 0x014a +#define NT_STATUS_PIPE_BROKEN 0xC0000000 | 0x014b +#define NT_STATUS_REGISTRY_CORRUPT 0xC0000000 | 0x014c +#define NT_STATUS_REGISTRY_IO_FAILED 0xC0000000 | 0x014d +#define NT_STATUS_NO_EVENT_PAIR 0xC0000000 | 0x014e +#define NT_STATUS_UNRECOGNIZED_VOLUME 0xC0000000 | 0x014f +#define NT_STATUS_SERIAL_NO_DEVICE_INITED 0xC0000000 | 0x0150 +#define NT_STATUS_NO_SUCH_ALIAS 0xC0000000 | 0x0151 +#define NT_STATUS_MEMBER_NOT_IN_ALIAS 
0xC0000000 | 0x0152 +#define NT_STATUS_MEMBER_IN_ALIAS 0xC0000000 | 0x0153 +#define NT_STATUS_ALIAS_EXISTS 0xC0000000 | 0x0154 +#define NT_STATUS_LOGON_NOT_GRANTED 0xC0000000 | 0x0155 +#define NT_STATUS_TOO_MANY_SECRETS 0xC0000000 | 0x0156 +#define NT_STATUS_SECRET_TOO_LONG 0xC0000000 | 0x0157 +#define NT_STATUS_INTERNAL_DB_ERROR 0xC0000000 | 0x0158 +#define NT_STATUS_FULLSCREEN_MODE 0xC0000000 | 0x0159 +#define NT_STATUS_TOO_MANY_CONTEXT_IDS 0xC0000000 | 0x015a +#define NT_STATUS_LOGON_TYPE_NOT_GRANTED 0xC0000000 | 0x015b +#define NT_STATUS_NOT_REGISTRY_FILE 0xC0000000 | 0x015c +#define NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x015d +#define NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR 0xC0000000 | 0x015e +#define NT_STATUS_FT_MISSING_MEMBER 0xC0000000 | 0x015f +#define NT_STATUS_ILL_FORMED_SERVICE_ENTRY 0xC0000000 | 0x0160 +#define NT_STATUS_ILLEGAL_CHARACTER 0xC0000000 | 0x0161 +#define NT_STATUS_UNMAPPABLE_CHARACTER 0xC0000000 | 0x0162 +#define NT_STATUS_UNDEFINED_CHARACTER 0xC0000000 | 0x0163 +#define NT_STATUS_FLOPPY_VOLUME 0xC0000000 | 0x0164 +#define NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND 0xC0000000 | 0x0165 +#define NT_STATUS_FLOPPY_WRONG_CYLINDER 0xC0000000 | 0x0166 +#define NT_STATUS_FLOPPY_UNKNOWN_ERROR 0xC0000000 | 0x0167 +#define NT_STATUS_FLOPPY_BAD_REGISTERS 0xC0000000 | 0x0168 +#define NT_STATUS_DISK_RECALIBRATE_FAILED 0xC0000000 | 0x0169 +#define NT_STATUS_DISK_OPERATION_FAILED 0xC0000000 | 0x016a +#define NT_STATUS_DISK_RESET_FAILED 0xC0000000 | 0x016b +#define NT_STATUS_SHARED_IRQ_BUSY 0xC0000000 | 0x016c +#define NT_STATUS_FT_ORPHANING 0xC0000000 | 0x016d +#define NT_STATUS_PARTITION_FAILURE 0xC0000000 | 0x0172 +#define NT_STATUS_INVALID_BLOCK_LENGTH 0xC0000000 | 0x0173 +#define NT_STATUS_DEVICE_NOT_PARTITIONED 0xC0000000 | 0x0174 +#define NT_STATUS_UNABLE_TO_LOCK_MEDIA 0xC0000000 | 0x0175 +#define NT_STATUS_UNABLE_TO_UNLOAD_MEDIA 0xC0000000 | 0x0176 +#define NT_STATUS_EOM_OVERFLOW 0xC0000000 | 0x0177 +#define NT_STATUS_NO_MEDIA 0xC0000000 | 0x0178 +#define NT_STATUS_NO_SUCH_MEMBER 0xC0000000 | 0x017a +#define NT_STATUS_INVALID_MEMBER 0xC0000000 | 0x017b +#define NT_STATUS_KEY_DELETED 0xC0000000 | 0x017c +#define NT_STATUS_NO_LOG_SPACE 0xC0000000 | 0x017d +#define NT_STATUS_TOO_MANY_SIDS 0xC0000000 | 0x017e +#define NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x017f +#define NT_STATUS_KEY_HAS_CHILDREN 0xC0000000 | 0x0180 +#define NT_STATUS_CHILD_MUST_BE_VOLATILE 0xC0000000 | 0x0181 +#define NT_STATUS_DEVICE_CONFIGURATION_ERROR 0xC0000000 | 0x0182 +#define NT_STATUS_DRIVER_INTERNAL_ERROR 0xC0000000 | 0x0183 +#define NT_STATUS_INVALID_DEVICE_STATE 0xC0000000 | 0x0184 +#define NT_STATUS_IO_DEVICE_ERROR 0xC0000000 | 0x0185 +#define NT_STATUS_DEVICE_PROTOCOL_ERROR 0xC0000000 | 0x0186 +#define NT_STATUS_BACKUP_CONTROLLER 0xC0000000 | 0x0187 +#define NT_STATUS_LOG_FILE_FULL 0xC0000000 | 0x0188 +#define NT_STATUS_TOO_LATE 0xC0000000 | 0x0189 +#define NT_STATUS_NO_TRUST_LSA_SECRET 0xC0000000 | 0x018a +#define NT_STATUS_NO_TRUST_SAM_ACCOUNT 0xC0000000 | 0x018b +#define NT_STATUS_TRUSTED_DOMAIN_FAILURE 0xC0000000 | 0x018c +#define NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE 0xC0000000 | 0x018d +#define NT_STATUS_EVENTLOG_FILE_CORRUPT 0xC0000000 | 0x018e +#define NT_STATUS_EVENTLOG_CANT_START 0xC0000000 | 0x018f +#define NT_STATUS_TRUST_FAILURE 0xC0000000 | 0x0190 +#define NT_STATUS_MUTANT_LIMIT_EXCEEDED 0xC0000000 | 0x0191 +#define NT_STATUS_NETLOGON_NOT_STARTED 0xC0000000 | 0x0192 +#define NT_STATUS_ACCOUNT_EXPIRED 0xC0000000 | 0x0193 +#define NT_STATUS_POSSIBLE_DEADLOCK 
0xC0000000 | 0x0194 +#define NT_STATUS_NETWORK_CREDENTIAL_CONFLICT 0xC0000000 | 0x0195 +#define NT_STATUS_REMOTE_SESSION_LIMIT 0xC0000000 | 0x0196 +#define NT_STATUS_EVENTLOG_FILE_CHANGED 0xC0000000 | 0x0197 +#define NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT 0xC0000000 | 0x0198 +#define NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT 0xC0000000 | 0x0199 +#define NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT 0xC0000000 | 0x019a +#define NT_STATUS_DOMAIN_TRUST_INCONSISTENT 0xC0000000 | 0x019b +#define NT_STATUS_FS_DRIVER_REQUIRED 0xC0000000 | 0x019c +#define NT_STATUS_NO_USER_SESSION_KEY 0xC0000000 | 0x0202 +#define NT_STATUS_USER_SESSION_DELETED 0xC0000000 | 0x0203 +#define NT_STATUS_RESOURCE_LANG_NOT_FOUND 0xC0000000 | 0x0204 +#define NT_STATUS_INSUFF_SERVER_RESOURCES 0xC0000000 | 0x0205 +#define NT_STATUS_INVALID_BUFFER_SIZE 0xC0000000 | 0x0206 +#define NT_STATUS_INVALID_ADDRESS_COMPONENT 0xC0000000 | 0x0207 +#define NT_STATUS_INVALID_ADDRESS_WILDCARD 0xC0000000 | 0x0208 +#define NT_STATUS_TOO_MANY_ADDRESSES 0xC0000000 | 0x0209 +#define NT_STATUS_ADDRESS_ALREADY_EXISTS 0xC0000000 | 0x020a +#define NT_STATUS_ADDRESS_CLOSED 0xC0000000 | 0x020b +#define NT_STATUS_CONNECTION_DISCONNECTED 0xC0000000 | 0x020c +#define NT_STATUS_CONNECTION_RESET 0xC0000000 | 0x020d +#define NT_STATUS_TOO_MANY_NODES 0xC0000000 | 0x020e +#define NT_STATUS_TRANSACTION_ABORTED 0xC0000000 | 0x020f +#define NT_STATUS_TRANSACTION_TIMED_OUT 0xC0000000 | 0x0210 +#define NT_STATUS_TRANSACTION_NO_RELEASE 0xC0000000 | 0x0211 +#define NT_STATUS_TRANSACTION_NO_MATCH 0xC0000000 | 0x0212 +#define NT_STATUS_TRANSACTION_RESPONDED 0xC0000000 | 0x0213 +#define NT_STATUS_TRANSACTION_INVALID_ID 0xC0000000 | 0x0214 +#define NT_STATUS_TRANSACTION_INVALID_TYPE 0xC0000000 | 0x0215 +#define NT_STATUS_NOT_SERVER_SESSION 0xC0000000 | 0x0216 +#define NT_STATUS_NOT_CLIENT_SESSION 0xC0000000 | 0x0217 +#define NT_STATUS_CANNOT_LOAD_REGISTRY_FILE 0xC0000000 | 0x0218 +#define NT_STATUS_DEBUG_ATTACH_FAILED 0xC0000000 | 0x0219 +#define NT_STATUS_SYSTEM_PROCESS_TERMINATED 0xC0000000 | 0x021a +#define NT_STATUS_DATA_NOT_ACCEPTED 0xC0000000 | 0x021b +#define NT_STATUS_NO_BROWSER_SERVERS_FOUND 0xC0000000 | 0x021c +#define NT_STATUS_VDM_HARD_ERROR 0xC0000000 | 0x021d +#define NT_STATUS_DRIVER_CANCEL_TIMEOUT 0xC0000000 | 0x021e +#define NT_STATUS_REPLY_MESSAGE_MISMATCH 0xC0000000 | 0x021f +#define NT_STATUS_MAPPED_ALIGNMENT 0xC0000000 | 0x0220 +#define NT_STATUS_IMAGE_CHECKSUM_MISMATCH 0xC0000000 | 0x0221 +#define NT_STATUS_LOST_WRITEBEHIND_DATA 0xC0000000 | 0x0222 +#define NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID 0xC0000000 | 0x0223 +#define NT_STATUS_PASSWORD_MUST_CHANGE 0xC0000000 | 0x0224 +#define NT_STATUS_NOT_FOUND 0xC0000000 | 0x0225 +#define NT_STATUS_NOT_TINY_STREAM 0xC0000000 | 0x0226 +#define NT_STATUS_RECOVERY_FAILURE 0xC0000000 | 0x0227 +#define NT_STATUS_STACK_OVERFLOW_READ 0xC0000000 | 0x0228 +#define NT_STATUS_FAIL_CHECK 0xC0000000 | 0x0229 +#define NT_STATUS_DUPLICATE_OBJECTID 0xC0000000 | 0x022a +#define NT_STATUS_OBJECTID_EXISTS 0xC0000000 | 0x022b +#define NT_STATUS_CONVERT_TO_LARGE 0xC0000000 | 0x022c +#define NT_STATUS_RETRY 0xC0000000 | 0x022d +#define NT_STATUS_FOUND_OUT_OF_SCOPE 0xC0000000 | 0x022e +#define NT_STATUS_ALLOCATE_BUCKET 0xC0000000 | 0x022f +#define NT_STATUS_PROPSET_NOT_FOUND 0xC0000000 | 0x0230 +#define NT_STATUS_MARSHALL_OVERFLOW 0xC0000000 | 0x0231 +#define NT_STATUS_INVALID_VARIANT 0xC0000000 | 0x0232 +#define NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND 0xC0000000 | 0x0233 +#define NT_STATUS_ACCOUNT_LOCKED_OUT 0xC0000000 | 
0x0234 +#define NT_STATUS_HANDLE_NOT_CLOSABLE 0xC0000000 | 0x0235 +#define NT_STATUS_CONNECTION_REFUSED 0xC0000000 | 0x0236 +#define NT_STATUS_GRACEFUL_DISCONNECT 0xC0000000 | 0x0237 +#define NT_STATUS_ADDRESS_ALREADY_ASSOCIATED 0xC0000000 | 0x0238 +#define NT_STATUS_ADDRESS_NOT_ASSOCIATED 0xC0000000 | 0x0239 +#define NT_STATUS_CONNECTION_INVALID 0xC0000000 | 0x023a +#define NT_STATUS_CONNECTION_ACTIVE 0xC0000000 | 0x023b +#define NT_STATUS_NETWORK_UNREACHABLE 0xC0000000 | 0x023c +#define NT_STATUS_HOST_UNREACHABLE 0xC0000000 | 0x023d +#define NT_STATUS_PROTOCOL_UNREACHABLE 0xC0000000 | 0x023e +#define NT_STATUS_PORT_UNREACHABLE 0xC0000000 | 0x023f +#define NT_STATUS_REQUEST_ABORTED 0xC0000000 | 0x0240 +#define NT_STATUS_CONNECTION_ABORTED 0xC0000000 | 0x0241 +#define NT_STATUS_BAD_COMPRESSION_BUFFER 0xC0000000 | 0x0242 +#define NT_STATUS_USER_MAPPED_FILE 0xC0000000 | 0x0243 +#define NT_STATUS_AUDIT_FAILED 0xC0000000 | 0x0244 +#define NT_STATUS_TIMER_RESOLUTION_NOT_SET 0xC0000000 | 0x0245 +#define NT_STATUS_CONNECTION_COUNT_LIMIT 0xC0000000 | 0x0246 +#define NT_STATUS_LOGIN_TIME_RESTRICTION 0xC0000000 | 0x0247 +#define NT_STATUS_LOGIN_WKSTA_RESTRICTION 0xC0000000 | 0x0248 +#define NT_STATUS_IMAGE_MP_UP_MISMATCH 0xC0000000 | 0x0249 +#define NT_STATUS_INSUFFICIENT_LOGON_INFO 0xC0000000 | 0x0250 +#define NT_STATUS_BAD_DLL_ENTRYPOINT 0xC0000000 | 0x0251 +#define NT_STATUS_BAD_SERVICE_ENTRYPOINT 0xC0000000 | 0x0252 +#define NT_STATUS_LPC_REPLY_LOST 0xC0000000 | 0x0253 +#define NT_STATUS_IP_ADDRESS_CONFLICT1 0xC0000000 | 0x0254 +#define NT_STATUS_IP_ADDRESS_CONFLICT2 0xC0000000 | 0x0255 +#define NT_STATUS_REGISTRY_QUOTA_LIMIT 0xC0000000 | 0x0256 +#define NT_STATUS_PATH_NOT_COVERED 0xC0000000 | 0x0257 +#define NT_STATUS_NO_CALLBACK_ACTIVE 0xC0000000 | 0x0258 +#define NT_STATUS_LICENSE_QUOTA_EXCEEDED 0xC0000000 | 0x0259 +#define NT_STATUS_PWD_TOO_SHORT 0xC0000000 | 0x025a +#define NT_STATUS_PWD_TOO_RECENT 0xC0000000 | 0x025b +#define NT_STATUS_PWD_HISTORY_CONFLICT 0xC0000000 | 0x025c +#define NT_STATUS_PLUGPLAY_NO_DEVICE 0xC0000000 | 0x025e +#define NT_STATUS_UNSUPPORTED_COMPRESSION 0xC0000000 | 0x025f +#define NT_STATUS_INVALID_HW_PROFILE 0xC0000000 | 0x0260 +#define NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH 0xC0000000 | 0x0261 +#define NT_STATUS_DRIVER_ORDINAL_NOT_FOUND 0xC0000000 | 0x0262 +#define NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0263 +#define NT_STATUS_RESOURCE_NOT_OWNED 0xC0000000 | 0x0264 +#define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265 +#define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266 +#define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267 +#define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */ + +#endif /* _NTERR_H */ diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h new file mode 100644 index 00000000..5d52e4a3 --- /dev/null +++ b/fs/cifs/ntlmssp.h @@ -0,0 +1,128 @@ +/* + * fs/cifs/ntlmssp.h + * + * Copyright (c) International Business Machines Corp., 2002,2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define NTLMSSP_SIGNATURE "NTLMSSP" +/* Message Types */ +#define NtLmNegotiate cpu_to_le32(1) +#define NtLmChallenge cpu_to_le32(2) +#define NtLmAuthenticate cpu_to_le32(3) +#define UnknownMessage cpu_to_le32(8) + +/* Negotiate Flags */ +#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */ +#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ +#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */ +/* define reserved9 0x08 */ +#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */ +#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ +#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 +#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */ +/* defined reserved 8 0x0100 */ +#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ +#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */ +#define NTLMSSP_ANONYMOUS 0x0800 +#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */ +#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 +#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */ +#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */ +#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 +#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 +#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 +#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/ +/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */ +#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000 +#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */ +#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 +#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 +/* #define reserved4 0x1000000 */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +/* #define reserved3 0x4000000 */ +/* #define reserved2 0x8000000 */ +/* #define reserved1 0x10000000 */ +#define NTLMSSP_NEGOTIATE_128 0x20000000 +#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 +#define NTLMSSP_NEGOTIATE_56 0x80000000 + +/* Define AV Pair Field IDs */ +enum av_field_type { + NTLMSSP_AV_EOL = 0, + NTLMSSP_AV_NB_COMPUTER_NAME, + NTLMSSP_AV_NB_DOMAIN_NAME, + NTLMSSP_AV_DNS_COMPUTER_NAME, + NTLMSSP_AV_DNS_DOMAIN_NAME, + NTLMSSP_AV_DNS_TREE_NAME, + NTLMSSP_AV_FLAGS, + NTLMSSP_AV_TIMESTAMP, + NTLMSSP_AV_RESTRICTION, + NTLMSSP_AV_TARGET_NAME, + NTLMSSP_AV_CHANNEL_BINDINGS +}; + +/* Although typedefs are not commonly used for structure definitions */ +/* in the Linux kernel, in this particular case they are useful */ +/* to more closely match the standards document for NTLMSSP from */ +/* OpenGroup and to make the code more closely match the standard in */ +/* appearance */ + +typedef struct _SECURITY_BUFFER { + __le16 Length; + __le16 MaximumLength; + __le32 BufferOffset; /* offset to buffer */ +} __attribute__((packed)) SECURITY_BUFFER; + +typedef struct _NEGOTIATE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ + SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ + char DomainString[0]; + /* followed by WorkstationString */ +} __attribute__((packed)) NEGOTIATE_MESSAGE, 
*PNEGOTIATE_MESSAGE; + +typedef struct _CHALLENGE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmChallenge = 2 */ + SECURITY_BUFFER TargetName; + __le32 NegotiateFlags; + __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; + __u8 Reserved[8]; + SECURITY_BUFFER TargetInfoArray; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ +} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE; + +typedef struct _AUTHENTICATE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmsAuthenticate = 3 */ + SECURITY_BUFFER LmChallengeResponse; + SECURITY_BUFFER NtChallengeResponse; + SECURITY_BUFFER DomainName; + SECURITY_BUFFER UserName; + SECURITY_BUFFER WorkstationName; + SECURITY_BUFFER SessionKey; + __le32 NegotiateFlags; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ + char UserString[0]; +} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c new file mode 100644 index 00000000..6751e745 --- /dev/null +++ b/fs/cifs/readdir.c @@ -0,0 +1,887 @@ +/* + * fs/cifs/readdir.c + * + * Directory search handling + * + * Copyright (C) International Business Machines Corp., 2004, 2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifsfs.h" + +/* + * To be safe - for UCS to UTF-8 with strings loaded with the rare long + * characters alloc more to account for such multibyte target UTF-8 + * characters. + */ +#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2) + +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_cifs_file_struct(struct file *file, char *label) +{ + struct cifsFileInfo *cf; + + if (file) { + cf = file->private_data; + if (cf == NULL) { + cFYI(1, "empty cifs private file data"); + return; + } + if (cf->invalidHandle) + cFYI(1, "invalid handle"); + if (cf->srch_inf.endOfSearch) + cFYI(1, "end of search"); + if (cf->srch_inf.emptyDir) + cFYI(1, "empty dir"); + } +} +#else +static inline void dump_cifs_file_struct(struct file *file, char *label) +{ +} +#endif /* DEBUG2 */ + +/* + * Find the dentry that matches "name". If there isn't one, create one. If it's + * a negative dentry or the uniqueid changed, then drop it and recreate it. 
+ */ +static struct dentry * +cifs_readdir_lookup(struct dentry *parent, struct qstr *name, + struct cifs_fattr *fattr) +{ + struct dentry *dentry, *alias; + struct inode *inode; + struct super_block *sb = parent->d_inode->i_sb; + + cFYI(1, "For %s", name->name); + + if (parent->d_op && parent->d_op->d_hash) + parent->d_op->d_hash(parent, parent->d_inode, name); + else + name->hash = full_name_hash(name->name, name->len); + + dentry = d_lookup(parent, name); + if (dentry) { + /* FIXME: check for inode number changes? */ + if (dentry->d_inode != NULL) + return dentry; + d_drop(dentry); + dput(dentry); + } + + dentry = d_alloc(parent, name); + if (dentry == NULL) + return NULL; + + inode = cifs_iget(sb, fattr); + if (!inode) { + dput(dentry); + return NULL; + } + + alias = d_materialise_unique(dentry, inode); + if (alias != NULL) { + dput(dentry); + if (IS_ERR(alias)) + return NULL; + dentry = alias; + } + + return dentry; +} + +static void +cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) +{ + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + } + + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~S_IWUGO; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && + fattr->cf_cifsattrs & ATTR_SYSTEM) { + if (fattr->cf_eof == 0) { + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + } else { + /* + * trying to get the type and mode via SFU can be slow, + * so just call those regular files for now, and mark + * for reval + */ + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + } + } +} + +static void +cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + cifs_fill_common_info(fattr, cifs_sb); +} + +static void +cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj; + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, + info->LastAccessTime, offset); + fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + + fattr->cf_cifsattrs = le16_to_cpu(info->Attributes); + fattr->cf_bytes = le32_to_cpu(info->AllocationSize); + fattr->cf_eof = le32_to_cpu(info->DataSize); + + cifs_fill_common_info(fattr, cifs_sb); +} + +/* BB eventually need to add the following helper function to + resolve NT_STATUS_STOPPED_ON_SYMLINK return code when + we try to do FindFirst on (NTFS) directory symlinks */ +/* +int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, + int xid) +{ + __u16 fid; + int len; + int oplock = 0; + int rc; + struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb); + char *tmpbuffer; + + rc = 
CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, + OPEN_REPARSE_POINT, &fid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc) { + tmpbuffer = kmalloc(maxpath); + rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, + tmpbuffer, + maxpath -1, + fid, + cifs_sb->local_nls); + if (CIFSSMBClose(xid, ptcon, fid)) { + cFYI(1, "Error closing temporary reparsepoint open"); + } + } +} + */ + +static int initiate_cifs_search(const int xid, struct file *file) +{ + int rc = 0; + char *full_path = NULL; + struct cifsFileInfo *cifsFile; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *pTcon; + + if (file->private_data == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cifsFile == NULL) { + rc = -ENOMEM; + goto error_exit; + } + file->private_data = cifsFile; + cifsFile->tlink = cifs_get_tlink(tlink); + pTcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + pTcon = tlink_tcon(cifsFile->tlink); + } + + cifsFile->invalidHandle = true; + cifsFile->srch_inf.endOfSearch = false; + + full_path = build_path_from_dentry(file->f_path.dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto error_exit; + } + + cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); + +ffirst_retry: + /* test for Unix extensions */ + /* but now check for them on the share/mount not on the SMB session */ +/* if (pTcon->ses->capabilities & CAP_UNIX) { */ + if (pTcon->unix_ext) + cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; + else if ((pTcon->ses->capabilities & + (CAP_NT_SMBS | CAP_NT_FIND)) == 0) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + } else /* not srvinos - BB fixme add check for backlevel? 
*/ { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + } + + rc = CIFSFindFirst(xid, pTcon, full_path, cifs_sb->local_nls, + &cifsFile->netfid, &cifsFile->srch_inf, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); + if (rc == 0) + cifsFile->invalidHandle = false; + /* BB add following call to handle readdir on new NTFS symlink errors + else if STATUS_STOPPED_ON_SYMLINK + call get_symlink_reparse_path and retry with new path */ + else if ((rc == -EOPNOTSUPP) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + goto ffirst_retry; + } +error_exit: + kfree(full_path); + cifs_put_tlink(tlink); + return rc; +} + +/* return length of unicode string in bytes */ +static int cifs_unicode_bytelen(char *str) +{ + int len; + __le16 *ustr = (__le16 *)str; + + for (len = 0; len <= PATH_MAX; len++) { + if (ustr[len] == 0) + return len << 1; + } + cFYI(1, "Unicode string longer than PATH_MAX found"); + return len << 1; +} + +static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) +{ + char *new_entry; + FILE_DIRECTORY_INFO *pDirInfo = (FILE_DIRECTORY_INFO *)old_entry; + + if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pfData; + pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + pfData->FileNameLength; + } else + new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); + cFYI(1, "new entry %p old entry %p", new_entry, old_entry); + /* validate that new_entry is not past end of SMB */ + if (new_entry >= end_of_smb) { + cERROR(1, "search entry %p began after end of SMB %p old entry %p", + new_entry, end_of_smb, old_entry); + return NULL; + } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) + || ((level != SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { + cERROR(1, "search entry %p extends after end of SMB %p", + new_entry, end_of_smb); + return NULL; + } else + return new_entry; + +} + +#define UNICODE_DOT cpu_to_le16(0x2e) + +/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ +static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) +{ + int rc = 0; + char *filename = NULL; + int len = 0; + + if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + filename = &pFindData->FileName[0]; + if (cfile->srch_inf.unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? 
*/ + len = strnlen(filename, 5); + } + } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == + SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == + SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == + SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = pFindData->FileNameLength; + } else { + cFYI(1, "Unknown findfirst level %d", + cfile->srch_inf.info_level); + } + + if (filename) { + if (cfile->srch_inf.unicode) { + __le16 *ufilename = (__le16 *)filename; + if (len == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (len == 4) { + /* check for .. */ + if ((ufilename[0] == UNICODE_DOT) + && (ufilename[1] == UNICODE_DOT)) + rc = 2; + } + } else /* ASCII */ { + if (len == 1) { + if (filename[0] == '.') + rc = 1; + } else if (len == 2) { + if ((filename[0] == '.') && (filename[1] == '.')) + rc = 2; + } + } + } + + return rc; +} + +/* Check if directory that we are searching has changed so we can decide + whether we can use the cached search results from the previous search */ +static int is_dir_changed(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + + if (cifsInfo->time == 0) + return 1; /* directory was changed, perhaps due to unlink */ + else + return 0; + +} + +static int cifs_save_resume_key(const char *current_entry, + struct cifsFileInfo *cifsFile) +{ + int rc = 0; + unsigned int len = 0; + __u16 level; + char *filename; + + if ((cifsFile == NULL) || (current_entry == NULL)) + return -EINVAL; + + level = cifsFile->srch_inf.info_level; + + if (level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + + filename = &pFindData->FileName[0]; + if (cifsFile->srch_inf.unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? 
*/ + len = strnlen(filename, PATH_MAX); + } + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else { + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + cifsFile->srch_inf.resume_name_len = len; + cifsFile->srch_inf.presume_name = filename; + return rc; +} + +/* find the corresponding entry in the search */ +/* Note that the SMB server returns search entries for . and .. which + complicates logic here if we choose to parse for them and we do not + assume that they are located in the findfirst return buffer.*/ +/* We start counting in the buffer with entry 2 and increment for every + entry (do not increment for . or .. entry) */ +static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, + struct file *file, char **ppCurrentEntry, int *num_to_ret) +{ + int rc = 0; + int pos_in_buf = 0; + loff_t first_entry_in_buffer; + loff_t index_to_find = file->f_pos; + struct cifsFileInfo *cifsFile = file->private_data; + /* check if index in the buffer */ + + if ((cifsFile == NULL) || (ppCurrentEntry == NULL) || + (num_to_ret == NULL)) + return -ENOENT; + + *ppCurrentEntry = NULL; + first_entry_in_buffer = + cifsFile->srch_inf.index_of_last_entry - + cifsFile->srch_inf.entries_in_buffer; + + /* if first entry in buf is zero then is first buffer + in search response data which means it is likely . and .. + will be in this buffer, although some servers do not return + . and .. 
for the root of a drive and for those we need + to start two entries earlier */ + + dump_cifs_file_struct(file, "In fce "); + if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) && + is_dir_changed(file)) || + (index_to_find < first_entry_in_buffer)) { + /* close and restart search */ + cFYI(1, "search backing up - close and restart search"); + spin_lock(&cifs_file_list_lock); + if (!cifsFile->srch_inf.endOfSearch && + !cifsFile->invalidHandle) { + cifsFile->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + CIFSFindClose(xid, pTcon, cifsFile->netfid); + } else + spin_unlock(&cifs_file_list_lock); + if (cifsFile->srch_inf.ntwrk_buf_start) { + cFYI(1, "freeing SMB ff cache buf on search rewind"); + if (cifsFile->srch_inf.smallBuf) + cifs_small_buf_release(cifsFile->srch_inf. + ntwrk_buf_start); + else + cifs_buf_release(cifsFile->srch_inf. + ntwrk_buf_start); + cifsFile->srch_inf.ntwrk_buf_start = NULL; + } + rc = initiate_cifs_search(xid, file); + if (rc) { + cFYI(1, "error %d reinitiating a search on rewind", + rc); + return rc; + } + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + } + + while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && + (rc == 0) && !cifsFile->srch_inf.endOfSearch) { + cFYI(1, "calling findnext2"); + rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, + &cifsFile->srch_inf); + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + if (rc) + return -ENOENT; + } + if (index_to_find < cifsFile->srch_inf.index_of_last_entry) { + /* we found the buffer that contains the entry */ + /* scan and find it */ + int i; + char *current_entry; + char *end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + + smbCalcSize((struct smb_hdr *) + cifsFile->srch_inf.ntwrk_buf_start); + + current_entry = cifsFile->srch_inf.srch_entries_start; + first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry + - cifsFile->srch_inf.entries_in_buffer; + pos_in_buf = index_to_find - first_entry_in_buffer; + cFYI(1, "found entry - pos_in_buf %d", pos_in_buf); + + for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { + /* go entry by entry figuring out which is first */ + current_entry = nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + if ((current_entry == NULL) && (i < pos_in_buf)) { + /* BB fixme - check if we should flag this error */ + cERROR(1, "reached end of buf searching for pos in buf" + " %d index to find %lld rc %d", + pos_in_buf, index_to_find, rc); + } + rc = 0; + *ppCurrentEntry = current_entry; + } else { + cFYI(1, "index not in buffer - could not findnext into it"); + return 0; + } + + if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { + cFYI(1, "can not return entries pos_in_buf beyond last"); + *num_to_ret = 0; + } else + *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; + + return rc; +} + +/* inode num, inode type and filename returned */ +static int cifs_get_name_from_search_buf(struct qstr *pqst, + char *current_entry, __u16 level, unsigned int unicode, + struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) +{ + int rc = 0; + unsigned int len = 0; + char *filename; + struct nls_table *nlt = cifs_sb->local_nls; + + *pinum = 0; + + if (level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + + filename = &pFindData->FileName[0]; + if (unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? 
*/ + len = strnlen(filename, PATH_MAX); + } + + *pinum = le64_to_cpu(pFindData->basic.UniqueId); + } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + *pinum = le64_to_cpu(pFindData->UniqueId); + } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; + } else { + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + + if (len > max_len) { + cERROR(1, "bad search response length %d past smb end", len); + return -EINVAL; + } + + if (unicode) { + pqst->len = cifs_from_ucs2((char *) pqst->name, + (__le16 *) filename, + UNICODE_NAME_MAX, + min(len, max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + pqst->len -= nls_nullsize(nlt); + } else { + pqst->name = filename; + pqst->len = len; + } + return rc; +} + +static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, + void *direntry, char *scratch_buf, unsigned int max_len) +{ + int rc = 0; + struct qstr qstring; + struct cifsFileInfo *pCifsF; + u64 inum; + ino_t ino; + struct super_block *sb; + struct cifs_sb_info *cifs_sb; + struct dentry *tmp_dentry; + struct cifs_fattr fattr; + + /* get filename and len into qstring */ + /* get dentry */ + /* decide whether to create and populate ionde */ + if ((direntry == NULL) || (file == NULL)) + return -EINVAL; + + pCifsF = file->private_data; + + if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) + return -ENOENT; + + rc = cifs_entry_is_dot(pfindEntry, pCifsF); + /* skip . and .. 
since we added them first */ + if (rc != 0) + return 0; + + sb = file->f_path.dentry->d_sb; + cifs_sb = CIFS_SB(sb); + + qstring.name = scratch_buf; + rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, + pCifsF->srch_inf.info_level, + pCifsF->srch_inf.unicode, cifs_sb, + max_len, &inum /* returned */); + + if (rc) + return rc; + + if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) + cifs_unix_basic_to_fattr(&fattr, + &((FILE_UNIX_INFO *) pfindEntry)->basic, + cifs_sb); + else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) + cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) + pfindEntry, cifs_sb); + else + cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) + pfindEntry, cifs_sb); + + if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = inum; + } else { + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && + CIFSCouldBeMFSymlink(&fattr)) + /* + * trying to get the type and mode can be slow, + * so just call those regular files for now, and mark + * for reval + */ + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; + + ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); + tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); + + rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, + ino, fattr.cf_dtype); + + dput(tmp_dentry); + return rc; +} + + +int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) +{ + int rc = 0; + int xid, i; + struct cifs_tcon *pTcon; + struct cifsFileInfo *cifsFile = NULL; + char *current_entry; + int num_to_fill = 0; + char *tmp_buf = NULL; + char *end_of_smb; + unsigned int max_len; + + xid = GetXid(); + + /* + * Ensure FindFirst doesn't fail before doing filldir() for '.' and + * '..'. Otherwise we won't be able to notify VFS in case of failure. + */ + if (file->private_data == NULL) { + rc = initiate_cifs_search(xid, file); + cFYI(1, "initiate cifs search rc %d", rc); + if (rc) + goto rddir2_exit; + } + + switch ((int) file->f_pos) { + case 0: + if (filldir(direntry, ".", 1, file->f_pos, + file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { + cERROR(1, "Filldir for current dir failed"); + rc = -ENOMEM; + break; + } + file->f_pos++; + case 1: + if (filldir(direntry, "..", 2, file->f_pos, + file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { + cERROR(1, "Filldir for parent dir failed"); + rc = -ENOMEM; + break; + } + file->f_pos++; + default: + /* 1) If search is active, + is in current search buffer? 
+ if it before then restart search + if after then keep searching till find it */ + + if (file->private_data == NULL) { + rc = -EINVAL; + FreeXid(xid); + return rc; + } + cifsFile = file->private_data; + if (cifsFile->srch_inf.endOfSearch) { + if (cifsFile->srch_inf.emptyDir) { + cFYI(1, "End of search, empty dir"); + rc = 0; + break; + } + } /* else { + cifsFile->invalidHandle = true; + CIFSFindClose(xid, pTcon, cifsFile->netfid); + } */ + + pTcon = tlink_tcon(cifsFile->tlink); + rc = find_cifs_entry(xid, pTcon, file, + ¤t_entry, &num_to_fill); + if (rc) { + cFYI(1, "fce error %d", rc); + goto rddir2_exit; + } else if (current_entry != NULL) { + cFYI(1, "entry %lld found", file->f_pos); + } else { + cFYI(1, "could not find entry"); + goto rddir2_exit; + } + cFYI(1, "loop through %d times filling dir for net buf %p", + num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); + max_len = smbCalcSize((struct smb_hdr *) + cifsFile->srch_inf.ntwrk_buf_start); + end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; + + tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + break; + } + + for (i = 0; (i < num_to_fill) && (rc == 0); i++) { + if (current_entry == NULL) { + /* evaluate whether this case is an error */ + cERROR(1, "past SMB end, num to fill %d i %d", + num_to_fill, i); + break; + } + /* if buggy server returns . and .. late do + we want to check for that here? */ + rc = cifs_filldir(current_entry, file, + filldir, direntry, tmp_buf, max_len); + if (rc == -EOVERFLOW) { + rc = 0; + break; + } + + file->f_pos++; + if (file->f_pos == + cifsFile->srch_inf.index_of_last_entry) { + cFYI(1, "last entry in buf at pos %lld %s", + file->f_pos, tmp_buf); + cifs_save_resume_key(current_entry, cifsFile); + break; + } else + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + kfree(tmp_buf); + break; + } /* end switch */ + +rddir2_exit: + FreeXid(xid); + return rc; +} diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h new file mode 100644 index 00000000..8b69fcce --- /dev/null +++ b/fs/cifs/rfc1002pdu.h @@ -0,0 +1,74 @@ +/* + * fs/cifs/rfc1002pdu.h + * + * Protocol Data Unit definitions for RFC 1001/1002 support + * + * Copyright (c) International Business Machines Corp., 2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */ + + /* RFC 1002 session packet types */ +#define RFC1002_SESSION_MESSAGE 0x00 +#define RFC1002_SESSION_REQUEST 0x81 +#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 +#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 +#define RFC1002_RETARGET_SESSION_RESPONSE 0x84 +#define RFC1002_SESSION_KEEP_ALIVE 0x85 + + /* RFC 1002 flags (only one defined */ +#define RFC1002_LENGTH_EXTEND 0x80 /* high order bit of length (ie +64K) */ + +struct rfc1002_session_packet { + __u8 type; + __u8 flags; + __u16 length; + union { + struct { + __u8 called_len; + __u8 called_name[32]; + __u8 scope1; /* null */ + __u8 calling_len; + __u8 calling_name[32]; + __u8 scope2; /* null */ + } __attribute__((packed)) session_req; + struct { + __u32 retarget_ip_addr; + __u16 port; + } __attribute__((packed)) retarget_resp; + __u8 neg_ses_resp_error_code; + /* POSITIVE_SESSION_RESPONSE packet does not include trailer. + SESSION_KEEP_ALIVE packet also does not include a trailer. + Trailer for the SESSION_MESSAGE packet is SMB/CIFS header */ + } __attribute__((packed)) trailer; +} __attribute__((packed)); + +/* Negative Session Response error codes */ +#define RFC1002_NOT_LISTENING_CALLED 0x80 /* not listening on called name */ +#define RFC1002_NOT_LISTENING_CALLING 0x81 /* not listening on calling name */ +#define RFC1002_NOT_PRESENT 0x82 /* called name not present */ +#define RFC1002_INSUFFICIENT_RESOURCE 0x83 +#define RFC1002_UNSPECIFIED_ERROR 0x8F + +/* RFC 1002 Datagram service packets are not defined here as they +are not needed for the network filesystem client unless we plan on +implementing broadcast resolution of the server ip address (from +server netbios name). Currently server names are resolved only via DNS +(tcp name) or ip address or an /etc/hosts equivalent mapping to ip address.*/ + +#define DEFAULT_CIFS_CALLED_NAME "*SMBSERVER " diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c new file mode 100644 index 00000000..0cfae191 --- /dev/null +++ b/fs/cifs/sess.c @@ -0,0 +1,952 @@ +/* + * fs/cifs/sess.c + * + * SMB/CIFS session setup handling routines + * + * Copyright (c) International Business Machines Corp., 2006, 2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "ntlmssp.h" +#include "nterr.h" +#include +#include +#include "cifs_spnego.h" + +/* + * Checks if this is the first smb session to be reconnected after + * the socket has been reestablished (so we know whether to use vc 0). + * Called while holding the cifs_tcp_ses_lock, so do not block + */ +static bool is_first_ses_reconnect(struct cifs_ses *ses) +{ + struct list_head *tmp; + struct cifs_ses *tmp_ses; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifs_ses, + smb_ses_list); + if (tmp_ses->need_reconnect == false) + return false; + } + /* could not find a session that was already connected, + this must be the first one we are reconnecting */ + return true; +} + +/* + * vc number 0 is treated specially by some servers, and should be the + * first one we request. After that we can use vcnumbers up to maxvcs, + * one for each smb session (some Windows versions set maxvcs incorrectly + * so maxvc=1 can be ignored). If we have too many vcs, we can reuse + * any vc but zero (some servers reset the connection on vcnum zero) + * + */ +static __le16 get_next_vcnum(struct cifs_ses *ses) +{ + __u16 vcnum = 0; + struct list_head *tmp; + struct cifs_ses *tmp_ses; + __u16 max_vcs = ses->server->max_vcs; + __u16 i; + int free_vc_found = 0; + + /* Quoting the MS-SMB specification: "Windows-based SMB servers set this + field to one but do not enforce this limit, which allows an SMB client + to establish more virtual circuits than allowed by this value ... but + other server implementations can enforce this limit." */ + if (max_vcs < 2) + max_vcs = 0xFFFF; + + spin_lock(&cifs_tcp_ses_lock); + if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) + goto get_vc_num_exit; /* vcnum will be zero */ + for (i = ses->server->srv_count - 1; i < max_vcs; i++) { + if (i == 0) /* this is the only connection, use vc 0 */ + break; + + free_vc_found = 1; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifs_ses, + smb_ses_list); + if (tmp_ses->vcnum == i) { + free_vc_found = 0; + break; /* found duplicate, try next vcnum */ + } + } + if (free_vc_found) + break; /* we found a vcnumber that will work - use it */ + } + + if (i == 0) + vcnum = 0; /* for most common case, ie if one smb session, use + vc zero. Also for case when no free vcnum, zero + is safest to send (some clients only send zero) */ + else if (free_vc_found == 0) + vcnum = 1; /* we can not reuse vc=0 safely, since some servers + reset all uids on that, but 1 is ok. 
*/ + else + vcnum = i; + ses->vcnum = vcnum; +get_vc_num_exit: + spin_unlock(&cifs_tcp_ses_lock); + + return cpu_to_le16(vcnum); +} + +static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) +{ + __u32 capabilities = 0; + + /* init fields common to all four types of SessSetup */ + /* Note that offsets for first seven fields in req struct are same */ + /* in CIFS Specs so does not matter which of 3 forms of struct */ + /* that we use in next few lines */ + /* Note that header is initialized to zero in header_assemble */ + pSMB->req.AndXCommand = 0xFF; + pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.VcNumber = get_next_vcnum(ses); + + /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ + + /* BB verify whether signing required on neg or just on auth frame + (and NTLM case) */ + + capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | + CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; + + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE; + capabilities |= CAP_UNICODE; + } + if (ses->capabilities & CAP_STATUS32) { + pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS; + capabilities |= CAP_STATUS32; + } + if (ses->capabilities & CAP_DFS) { + pSMB->req.hdr.Flags2 |= SMBFLG2_DFS; + capabilities |= CAP_DFS; + } + if (ses->capabilities & CAP_UNIX) + capabilities |= CAP_UNIX; + + return capabilities; +} + +static void +unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* Copy OS version */ + bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, + nls_cp); + bcc_ptr += 2 * bytes_ret; + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + *pbcc_area = bcc_ptr; +} + +static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* copy domain */ + if (ses->domainName == NULL) { + /* Sending null domain better than using a bogus domain name (as + we did briefly in 2.6.18) since server will use its default */ + *bcc_ptr = 0; + *(bcc_ptr+1) = 0; + bytes_ret = 0; + } else + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, + 256, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null terminator */ + + *pbcc_area = bcc_ptr; +} + + +static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* BB FIXME add check that strings total less + than 335 or will need to send them as arrays */ + + /* unicode strings, must be word aligned before the call */ +/* if ((long) bcc_ptr % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } */ + /* copy user */ + if (ses->user_name == NULL) { + /* null user mount */ + *bcc_ptr = 0; + *(bcc_ptr+1) = 0; + } else { + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name, + MAX_USERNAME_SIZE, nls_cp); + } + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null termination */ + + unicode_domain_string(&bcc_ptr, ses, nls_cp); 
+ unicode_oslm_strings(&bcc_ptr, nls_cp); + + *pbcc_area = bcc_ptr; +} + +static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + /* copy user */ + /* BB what about null user mounts - check that we do this BB */ + /* copy user */ + if (ses->user_name != NULL) { + strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); + bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + } + /* else null user mount */ + *bcc_ptr = 0; + bcc_ptr++; /* account for null termination */ + + /* copy domain */ + if (ses->domainName != NULL) { + strncpy(bcc_ptr, ses->domainName, 256); + bcc_ptr += strnlen(ses->domainName, 256); + } /* else we will send a null domain name + so the server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + /* BB check for overflow here */ + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + +static void +decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int len; + char *data = *pbcc_area; + + cFYI(1, "bleft %d", bleft); + + kfree(ses->serverOS); + ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, "serverOS=%s", ses->serverOS); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; + + kfree(ses->serverNOS); + ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, "serverNOS=%s", ses->serverNOS); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; + + kfree(ses->serverDomain); + ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, "serverDomain=%s", ses->serverDomain); + + return; +} + +static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, + struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + int len; + char *bcc_ptr = *pbcc_area; + + cFYI(1, "decode sessetup ascii. bleft %d", bleft); + + len = strnlen(bcc_ptr, bleft); + if (len >= bleft) + return rc; + + kfree(ses->serverOS); + + ses->serverOS = kzalloc(len + 1, GFP_KERNEL); + if (ses->serverOS) + strncpy(ses->serverOS, bcc_ptr, len); + if (strncmp(ses->serverOS, "OS/2", 4) == 0) { + cFYI(1, "OS/2 server"); + ses->flags |= CIFS_SES_OS2; + } + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if (len >= bleft) + return rc; + + kfree(ses->serverNOS); + + ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); + if (ses->serverNOS) + strncpy(ses->serverNOS, bcc_ptr, len); + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if (len > bleft) + return rc; + + /* No domain field in LANMAN case. 
Domain is + returned by old servers in the SMB negprot response */ + /* BB For newer servers which do not support Unicode, + but thus do return domain here we could add parsing + for it later, but it is not very important */ + cFYI(1, "ascii: bytes left %d", bleft); + + return rc; +} + +static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, + struct cifs_ses *ses) +{ + unsigned int tioffset; /* challenge message target info area */ + unsigned int tilen; /* challenge message target info area length */ + + CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; + + if (blob_len < sizeof(CHALLENGE_MESSAGE)) { + cERROR(1, "challenge blob len %d too small", blob_len); + return -EINVAL; + } + + if (memcmp(pblob->Signature, "NTLMSSP", 8)) { + cERROR(1, "blob signature incorrect %s", pblob->Signature); + return -EINVAL; + } + if (pblob->MessageType != NtLmChallenge) { + cERROR(1, "Incorrect message type %d", pblob->MessageType); + return -EINVAL; + } + + memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); + /* BB we could decode pblob->NegotiateFlags; some may be useful */ + /* In particular we can examine sign flags */ + /* BB spec says that if AvId field of MsvAvTimestamp is populated then + we must set the MIC field of the AUTHENTICATE_MESSAGE */ + ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); + tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); + tilen = le16_to_cpu(pblob->TargetInfoArray.Length); + if (tilen) { + ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "Challenge target info allocation failure"); + return -ENOMEM; + } + memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen); + ses->auth_key.len = tilen; + } + + return 0; +} + +/* BB Move to ntlmssp.c eventually */ + +/* We do not malloc the blob, it is passed in pbuffer, because + it is fixed size, and small, making this approach cleaner */ +static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, + struct cifs_ses *ses) +{ + NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; + __u32 flags; + + memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE)); + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } + + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + sec_blob->WorkstationName.BufferOffset = 0; + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + + /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */ + sec_blob->DomainName.BufferOffset = 0; + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; +} + +/* We do not malloc the blob, it is passed in pbuffer, because its + maximum possible size is fixed and small, making this approach cleaner. 
+ This function returns the length of the data in the blob */ +static int build_ntlmssp_auth_blob(unsigned char *pbuffer, + u16 *buflen, + struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc; + AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + __u32 flags; + unsigned char *tmp; + + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmAuthenticate; + + flags = NTLMSSP_NEGOTIATE_56 | + NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } + + tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + sec_blob->LmChallengeResponse.BufferOffset = + cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); + sec_blob->LmChallengeResponse.Length = 0; + sec_blob->LmChallengeResponse.MaximumLength = 0; + + sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cERROR(1, "Error %d during NTLMSSP authentication", rc); + goto setup_ntlmv2_ret; + } + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->domainName == NULL) { + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, + MAX_USERNAME_SIZE, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = cpu_to_le16(len); + sec_blob->DomainName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + if (ses->user_name == NULL) { + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = 0; + sec_blob->UserName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUCS((__le16 *)tmp, ses->user_name, + MAX_USERNAME_SIZE, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = cpu_to_le16(len); + sec_blob->UserName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + tmp += 2; + + if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) || + (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) + && !calc_seckey(ses)) { + memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.MaximumLength = + cpu_to_le16(CIFS_CPHTXT_SIZE); + tmp += CIFS_CPHTXT_SIZE; + } else { + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + } + +setup_ntlmv2_ret: + *buflen = tmp - pbuffer; + return rc; +} + 
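+/*
+ * Rough sketch of the RawNTLMSSP session setup flow that CIFS_SessSetup()
+ * below drives using the blob builders above (server side summarized here on
+ * the assumption that it follows the usual NTLMSSP exchange):
+ *
+ *   1. send SESSION_SETUP_ANDX carrying the NEGOTIATE_MESSAGE built by
+ *      build_ntlmssp_negotiate_blob()
+ *   2. the server answers NT_STATUS_MORE_PROCESSING_REQUIRED with a
+ *      CHALLENGE_MESSAGE; decode_ntlmssp_challenge() stashes the cryptkey,
+ *      negotiate flags and target info in ses->ntlmssp and ses->auth_key
+ *   3. send a second SESSION_SETUP_ANDX carrying the AUTHENTICATE_MESSAGE
+ *      from build_ntlmssp_auth_blob(), which embeds the NTLMv2 response
+ *      computed by setup_ntlmv2_rsp(); this request echoes the Uid the
+ *      server handed back on the challenge response
+ *   4. a success response completes the session setup
+ */
+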
+int +CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + int wct; + struct smb_hdr *smb_buf; + char *bcc_ptr; + char *str_area; + SESSION_SETUP_ANDX *pSMB; + __u32 capabilities; + __u16 count; + int resp_buf_type; + struct kvec iov[3]; + enum securityEnum type; + __u16 action, bytes_remaining; + struct key *spnego_key = NULL; + __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ + u16 blob_len; + char *ntlmsspblob = NULL; + + if (ses == NULL) + return -EINVAL; + + type = ses->server->secType; + cFYI(1, "sess setup type %d", type); + if (type == RawNTLMSSP) { + /* if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) + return -ENOMEM; + } + +ssetup_ntlmssp_authenticate: + if (phase == NtLmChallenge) + phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + + if (type == LANMAN) { +#ifndef CONFIG_CIFS_WEAK_PW_HASH + /* LANMAN and plaintext are less secure and off by default. + So we make this explicitly be turned on in kconfig (in the + build) and turned on at runtime (changed from the default) + in proc/fs/cifs or via mount parm. Unfortunately this is + needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ + return -EOPNOTSUPP; +#endif + wct = 10; /* lanman 2 style sessionsetup */ + } else if ((type == NTLM) || (type == NTLMv2)) { + /* For NTLMv2 failures eventually may need to retry NTLM */ + wct = 13; /* old style NTLM sessionsetup */ + } else /* same size: negotiate or auth, NTLMSSP or extended security */ + wct = 12; + + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); + if (rc) + return rc; + + pSMB = (SESSION_SETUP_ANDX *)smb_buf; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + + /* we will send the SMB in three pieces: + a fixed length beginning part, an optional + SPNEGO blob (which can be zero length), and a + last part which will include the strings + and rest of bcc area. This allows us to avoid + a large buffer 17K allocation */ + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + + /* setting this here allows the code at the end of the function + to free the request buffer if there's an error */ + resp_buf_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + str_area = kmalloc(2000, GFP_KERNEL); + if (str_area == NULL) { + rc = -ENOMEM; + goto ssetup_exit; + } + bcc_ptr = str_area; + + ses->flags &= ~CIFS_SES_LANMAN; + + iov[1].iov_base = NULL; + iov[1].iov_len = 0; + + if (type == LANMAN) { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + + /* no capabilities flags in old lanman negotiation */ + + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); + + ses->flags |= CIFS_SES_LANMAN; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* can not sign if LANMAN negotiated so no need + to calculate signing key? 
but what if server + changed to do higher than lanman dialect and + we reconnected would we ever calc signing_key? */ + + cFYI(1, "Negotiating LANMAN setting up strings"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); +#endif + } else if (type == NTLM) { + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses); + if (rc) { + cERROR(1, "Error %d during NTLM authentication", rc); + goto ssetup_exit; + } + + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else if (type == NTLMv2) { + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cERROR(1, "Error %d during NTLMv2 authentication", rc); + goto ssetup_exit; + } + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. 
+ */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else if (type == Kerberos) { +#ifdef CONFIG_CIFS_UPCALL + struct cifs_spnego_msg *msg; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto ssetup_exit; + } + + msg = spnego_key->payload.data; + /* check version field to make sure that cifs.upcall is + sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cERROR(1, "incorrect version of cifs.upcall (expected" + " %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto ssetup_exit; + } + + ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto ssetup_exit; + } + memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + iov[1].iov_base = msg->data + msg->sesskey_len; + iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_domain_string(&bcc_ptr, ses, nls_cp); + } else + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); +#else /* ! CONFIG_CIFS_UPCALL */ + cERROR(1, "Kerberos negotiated but upcall support disabled!"); + rc = -ENOSYS; + goto ssetup_exit; +#endif /* CONFIG_CIFS_UPCALL */ + } else if (type == RawNTLMSSP) { + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cERROR(1, "NTLMSSP requires Unicode support"); + rc = -ENOSYS; + goto ssetup_exit; + } + + cFYI(1, "ntlmssp session setup phase %d", phase); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + switch(phase) { + case NtLmNegotiate: + build_ntlmssp_negotiate_blob( + pSMB->req.SecurityBlob, ses); + iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = + cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + break; + case NtLmAuthenticate: + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. 
+ */ + ntlmsspblob = kzalloc( + 5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + cERROR(1, "Can't allocate NTLMSSP blob"); + rc = -ENOMEM; + goto ssetup_exit; + } + + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, nls_cp); + if (rc) + goto ssetup_exit; + iov[1].iov_len = blob_len; + iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + break; + default: + cERROR(1, "invalid phase %d", phase); + rc = -ENOSYS; + goto ssetup_exit; + } + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + } else { + cERROR(1, "secType %d not supported!", type); + rc = -ENOSYS; + goto ssetup_exit; + } + + iov[2].iov_base = str_area; + iov[2].iov_len = (long) bcc_ptr - (long) str_area; + + count = iov[1].iov_len + iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + + put_bcc(count, smb_buf); + + rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, + CIFS_LOG_ERROR); + /* SMB request buf freed in SendReceive2 */ + + pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; + smb_buf = (struct smb_hdr *)iov[0].iov_base; + + if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { + if (phase != NtLmNegotiate) { + cERROR(1, "Unexpected more processing error"); + goto ssetup_exit; + } + /* NTLMSSP Negotiate sent now processing challenge (response) */ + phase = NtLmChallenge; /* process ntlmssp challenge */ + rc = 0; /* MORE_PROC rc is not an error here, but expected */ + } + if (rc) + goto ssetup_exit; + + if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + rc = -EIO; + cERROR(1, "bad word count %d", smb_buf->WordCount); + goto ssetup_exit; + } + action = le16_to_cpu(pSMB->resp.Action); + if (action & GUEST_LOGIN) + cFYI(1, "Guest login"); /* BB mark SesInfo struct? 
*/ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cFYI(1, "UID = %d ", ses->Suid); + /* response can have either 3 or 4 word count - Samba sends 3 */ + /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + if (smb_buf->WordCount == 4) { + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cERROR(1, "bad security blob length %d", blob_len); + rc = -EINVAL; + goto ssetup_exit; + } + if (phase == NtLmChallenge) { + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); + /* now goto beginning for ntlmssp authenticate phase */ + if (rc) + goto ssetup_exit; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + } + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + } else { + rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, + ses, nls_cp); + } + +ssetup_exit: + if (spnego_key) { + key_revoke(spnego_key); + key_put(spnego_key); + } + kfree(str_area); + kfree(ntlmsspblob); + ntlmsspblob = NULL; + if (resp_buf_type == CIFS_SMALL_BUFFER) { + cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); + cifs_small_buf_release(iov[0].iov_base); + } else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ + if ((phase == NtLmChallenge) && (rc == 0)) + goto ssetup_ntlmssp_authenticate; + + return rc; +} diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c new file mode 100644 index 00000000..1c5b770c --- /dev/null +++ b/fs/cifs/smbencrypt.c @@ -0,0 +1,401 @@ +/* + Unix SMB/Netbios implementation. + Version 1.9. + SMB parameters and setup + Copyright (C) Andrew Tridgell 1992-2000 + Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + Modified by Jeremy Allison 1995. + Copyright (C) Andrew Bartlett 2002-2003 + Modified by Steve French (sfrench@us.ibm.com) 2002-2003 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include +#include +#include +#include +#include +#include +#include "cifs_unicode.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsproto.h" + +#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +/* following came from the other byteorder.h to avoid include conflicts */ +#define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) +#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) +#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) + +static void +str_to_key(unsigned char *str, unsigned char *key) +{ + int i; + + key[0] = str[0] >> 1; + key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); + key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); + key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); + key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); + key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); + key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); + key[7] = str[6] & 0x7F; + for (i = 0; i < 8; i++) + key[i] = (key[i] << 1); +} + +static int +smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) +{ + int rc; + unsigned char key2[8]; + struct crypto_blkcipher *tfm_des; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + + str_to_key(key, key2); + + tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_des)) { + rc = PTR_ERR(tfm_des); + cERROR(1, "could not allocate des crypto API\n"); + goto smbhash_err; + } + + desc.tfm = tfm_des; + + crypto_blkcipher_setkey(tfm_des, key2, 8); + + sg_init_one(&sgin, in, 8); + sg_init_one(&sgout, out, 8); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); + if (rc) + cERROR(1, "could not encrypt crypt key rc: %d\n", rc); + + crypto_free_blkcipher(tfm_des); +smbhash_err: + return rc; +} + +static int +E_P16(unsigned char *p14, unsigned char *p16) +{ + int rc; + unsigned char sp8[8] = + { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 }; + + rc = smbhash(p16, sp8, p14); + if (rc) + return rc; + rc = smbhash(p16 + 8, sp8, p14 + 7); + return rc; +} + +static int +E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) +{ + int rc; + + rc = smbhash(p24, c8, p21); + if (rc) + return rc; + rc = smbhash(p24 + 8, c8, p21 + 7); + if (rc) + return rc; + rc = smbhash(p24 + 16, c8, p21 + 14); + return rc; +} + +/* produce a md4 message digest from data of length n bytes */ +int +mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) +{ + int rc; + unsigned int size; + struct crypto_shash *md4; + struct sdesc *sdescmd4; + + md4 = crypto_alloc_shash("md4", 0, 0); + if (IS_ERR(md4)) { + rc = PTR_ERR(md4); + cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); + sdescmd4 = kmalloc(size, GFP_KERNEL); + if (!sdescmd4) { + rc = -ENOMEM; + cERROR(1, "%s: Memory allocation failure\n", __func__); + goto mdfour_err; + } + sdescmd4->shash.tfm = md4; + sdescmd4->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd4->shash); + if (rc) { + cERROR(1, "%s: Could not init md4 shash\n", __func__); + goto mdfour_err; + } + crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + +mdfour_err: + crypto_free_shash(md4); + kfree(sdescmd4); + + return rc; +} + +/* + This implements the X/Open SMB password encryption + It takes a password, a 8 byte "crypt key" and puts 24 bytes of + encrypted password into p24 */ +/* Note that password must be 
uppercased and null terminated */ +int +SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) +{ + int rc; + unsigned char p14[14], p16[16], p21[21]; + + memset(p14, '\0', 14); + memset(p16, '\0', 16); + memset(p21, '\0', 21); + + memcpy(p14, passwd, 14); + rc = E_P16(p14, p16); + if (rc) + return rc; + + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); + + return rc; +} + +/* Routines for Windows NT MD4 Hash functions. */ +static int +_my_wcslen(__u16 *str) +{ + int len = 0; + while (*str++ != 0) + len++; + return len; +} + +/* + * Convert a string into an NT UNICODE string. + * Note that regardless of processor type + * this must be in intel (little-endian) + * format. + */ + +static int +_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) +{ /* BB not a very good conversion routine - change/fix */ + int i; + __u16 val; + + for (i = 0; i < len; i++) { + val = *src; + SSVAL(dst, 0, val); + dst++; + src++; + if (val == 0) + break; + } + return i; +} + +/* + * Creates the MD4 Hash of the users password in NT UNICODE. + */ + +int +E_md4hash(const unsigned char *passwd, unsigned char *p16) +{ + int rc; + int len; + __u16 wpwd[129]; + + /* Password cannot be longer than 128 characters */ + if (passwd) { + len = strlen((char *) passwd); + if (len > 128) + len = 128; + + /* Password must be converted to NT unicode */ + _my_mbstowcs(wpwd, passwd, len); + } else + len = 0; + + wpwd[len] = 0; /* Ensure string is null terminated */ + /* Calculate length in bytes */ + len = _my_wcslen(wpwd) * sizeof(__u16); + + rc = mdfour(p16, (unsigned char *) wpwd, len); + memset(wpwd, 0, 129 * 2); + + return rc; +} + +#if 0 /* currently unused */ +/* Does both the NT and LM owfs of a user's password */ +static void +nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) +{ + char passwd[514]; + + memset(passwd, '\0', 514); + if (strlen(pwd) < 513) + strcpy(passwd, pwd); + else + memcpy(passwd, pwd, 512); + /* Calculate the MD4 hash (NT compatible) of the password */ + memset(nt_p16, '\0', 16); + E_md4hash(passwd, nt_p16); + + /* Mangle the passwords into Lanman format */ + passwd[14] = '\0'; +/* strupper(passwd); */ + + /* Calculate the SMB (lanman) hash functions of the password */ + + memset(p16, '\0', 16); + E_P16((unsigned char *) passwd, (unsigned char *) p16); + + /* clear out local copy of user's password (just being paranoid). 
*/ + memset(passwd, '\0', sizeof(passwd)); +} +#endif + +/* Does the NTLMv2 owfs of a user's password */ +#if 0 /* function not needed yet - but will be soon */ +static void +ntv2_owf_gen(const unsigned char owf[16], const char *user_n, + const char *domain_n, unsigned char kr_buf[16], + const struct nls_table *nls_codepage) +{ + wchar_t *user_u; + wchar_t *dom_u; + int user_l, domain_l; + struct HMACMD5Context ctx; + + /* might as well do one alloc to hold both (user_u and dom_u) */ + user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); + if (user_u == NULL) + return; + dom_u = user_u + 1024; + + /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, + STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); + push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, + STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ + + /* BB user and domain may need to be uppercased */ + user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); + domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); + + user_l++; /* trailing null */ + domain_l++; + + hmac_md5_init_limK_to_64(owf, 16, &ctx); + hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); + hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); + hmac_md5_final(kr_buf, &ctx); + + kfree(user_u); +} +#endif + +/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ +#if 0 /* currently unused */ +static void +NTLMSSPOWFencrypt(unsigned char passwd[8], + unsigned char *ntlmchalresp, unsigned char p24[24]) +{ + unsigned char p21[21]; + + memset(p21, '\0', 21); + memcpy(p21, passwd, 8); + memset(p21 + 8, 0xbd, 8); + + E_P24(p21, ntlmchalresp, p24); +} +#endif + +/* Does the NT MD4 hash then des encryption. */ +int +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +{ + int rc; + unsigned char p16[16], p21[21]; + + memset(p16, '\0', 16); + memset(p21, '\0', 21); + + rc = E_md4hash(passwd, p16); + if (rc) { + cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + return rc; + } + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); + return rc; +} + + +/* Does the md5 encryption from the NT hash for NTLMv2. 
*/ +/* These routines will be needed later */ +#if 0 +static void +SMBOWFencrypt_ntv2(const unsigned char kr[16], + const struct data_blob *srv_chal, + const struct data_blob *cli_chal, unsigned char resp_buf[16]) +{ + struct HMACMD5Context ctx; + + hmac_md5_init_limK_to_64(kr, 16, &ctx); + hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); + hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); + hmac_md5_final(resp_buf, &ctx); +} + +static void +SMBsesskeygen_ntv2(const unsigned char kr[16], + const unsigned char *nt_resp, __u8 sess_key[16]) +{ + struct HMACMD5Context ctx; + + hmac_md5_init_limK_to_64(kr, 16, &ctx); + hmac_md5_update(nt_resp, 16, &ctx); + hmac_md5_final((unsigned char *) sess_key, &ctx); +} + +static void +SMBsesskeygen_ntv1(const unsigned char kr[16], + const unsigned char *nt_resp, __u8 sess_key[16]) +{ + mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); +} +#endif diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h new file mode 100644 index 00000000..7f16cb82 --- /dev/null +++ b/fs/cifs/smberr.h @@ -0,0 +1,184 @@ +/* + * fs/cifs/smberr.h + * + * Copyright (c) International Business Machines Corp., 2002,2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * See Error Codes section of the SNIA CIFS Specification + * for more information + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SUCCESS 0x00 /* The request was successful. */ +#define ERRDOS 0x01 /* Error is from the core DOS operating system set */ +#define ERRSRV 0x02 /* Error is generated by the file server daemon */ +#define ERRHRD 0x03 /* Error is a hardware error. */ +#define ERRCMD 0xFF /* Command was not in the "SMB" format. */ + +/* The following error codes may be generated with the SUCCESS error class.*/ + +/*#define SUCCESS 0 The request was successful. */ + +/* The following error codes may be generated with the ERRDOS error class.*/ + +#define ERRbadfunc 1 /* Invalid function. The server did not + recognize or could not perform a + system call generated by the server, + e.g., set the DIRECTORY attribute on + a data file, invalid seek mode. */ +#define ERRbadfile 2 /* File not found. The last component + of a file's pathname could not be + found. */ +#define ERRbadpath 3 /* Directory invalid. A directory + component in a pathname could not be + found. */ +#define ERRnofids 4 /* Too many open files. The server has + no file handles available. */ +#define ERRnoaccess 5 /* Access denied, the client's context + does not permit the requested + function. This includes the + following conditions: invalid rename + command, write to Fid open for read + only, read on Fid open for write + only, attempt to delete a non-empty + directory */ +#define ERRbadfid 6 /* Invalid file handle. The file handle + specified was not recognized by the + server. 
*/ +#define ERRbadmcb 7 /* Memory control blocks destroyed. */ +#define ERRnomem 8 /* Insufficient server memory to + perform the requested function. */ +#define ERRbadmem 9 /* Invalid memory block address. */ +#define ERRbadenv 10 /* Invalid environment. */ +#define ERRbadformat 11 /* Invalid format. */ +#define ERRbadaccess 12 /* Invalid open mode. */ +#define ERRbaddata 13 /* Invalid data (generated only by + IOCTL calls within the server). */ +#define ERRbaddrive 15 /* Invalid drive specified. */ +#define ERRremcd 16 /* A Delete Directory request attempted + to remove the server's current + directory. */ +#define ERRdiffdevice 17 /* Not same device (e.g., a cross + volume rename was attempted */ +#define ERRnofiles 18 /* A File Search command can find no + more files matching the specified + criteria. */ +#define ERRwriteprot 19 /* media is write protected */ +#define ERRgeneral 31 +#define ERRbadshare 32 /* The sharing mode specified for an + Open conflicts with existing FIDs on + the file. */ +#define ERRlock 33 /* A Lock request conflicted with an + existing lock or specified an + invalid mode, or an Unlock requested + attempted to remove a lock held by + another process. */ +#define ERRunsup 50 +#define ERRnosuchshare 67 +#define ERRfilexists 80 /* The file named in the request + already exists. */ +#define ERRinvparm 87 +#define ERRdiskfull 112 +#define ERRinvname 123 +#define ERRinvlevel 124 +#define ERRdirnotempty 145 +#define ERRnotlocked 158 +#define ERRcancelviolation 173 +#define ERRalreadyexists 183 +#define ERRbadpipe 230 +#define ERRpipebusy 231 +#define ERRpipeclosing 232 +#define ERRnotconnected 233 +#define ERRmoredata 234 +#define ERReasnotsupported 282 +#define ErrQuota 0x200 /* The operation would cause a quota + limit to be exceeded. */ +#define ErrNotALink 0x201 /* A link operation was performed on a + pathname that was not a link. */ + +/* Below errors are used internally (do not come over the wire) for passthrough + from STATUS codes to POSIX only */ +#define ERRsymlink 0xFFFD +#define ErrTooManyLinks 0xFFFE + +/* Following error codes may be generated with the ERRSRV error class.*/ + +#define ERRerror 1 /* Non-specific error code. It is + returned under the following + conditions: resource other than disk + space exhausted (e.g. TIDs), first + SMB command was not negotiate, + multiple negotiates attempted, and + internal server error. */ +#define ERRbadpw 2 /* Bad password - name/password pair in + a TreeConnect or Session Setup are + invalid. */ +#define ERRbadtype 3 /* used for indicating DFS referral + needed */ +#define ERRaccess 4 /* The client does not have the + necessary access rights within the + specified context for requested + function. */ +#define ERRinvtid 5 /* The Tid specified in a command was + invalid. */ +#define ERRinvnetname 6 /* Invalid network name in tree + connect. */ +#define ERRinvdevice 7 /* Invalid device - printer request + made to non-printer connection or + non-printer request made to printer + connection. */ +#define ERRqfull 49 /* Print queue full (files) -- returned + by open print file. */ +#define ERRqtoobig 50 /* Print queue full -- no space. */ +#define ERRqeof 51 /* EOF on print queue dump */ +#define ERRinvpfid 52 /* Invalid print file FID. */ +#define ERRsmbcmd 64 /* The server did not recognize the + command received. */ +#define ERRsrverror 65 /* The server encountered an internal + error, e.g., system file + unavailable. 
*/ +#define ERRbadBID 66 /* (obsolete) */ +#define ERRfilespecs 67 /* The Fid and pathname parameters + contained an invalid combination of + values. */ +#define ERRbadLink 68 /* (obsolete) */ +#define ERRbadpermits 69 /* The access permissions specified for + a file or directory are not a valid + combination. */ +#define ERRbadPID 70 +#define ERRsetattrmode 71 /* attribute (mode) is invalid */ +#define ERRpaused 81 /* Server is paused */ +#define ERRmsgoff 82 /* reserved - messaging off */ +#define ERRnoroom 83 /* reserved - no room for message */ +#define ERRrmuns 87 /* reserved - too many remote names */ +#define ERRtimeout 88 /* operation timed out */ +#define ERRnoresource 89 /* No resources available for request + */ +#define ERRtoomanyuids 90 /* Too many UIDs active on this session + */ +#define ERRbaduid 91 /* The UID is not known as a valid user + */ +#define ERRusempx 250 /* temporarily unable to use raw */ +#define ERRusestd 251 /* temporarily unable to use either raw + or mpx */ +#define ERR_NOTIFY_ENUM_DIR 1024 +#define ERRnoSuchUser 2238 /* user account does not exist */ +#define ERRaccountexpired 2239 +#define ERRbadclient 2240 /* can not logon from this client */ +#define ERRbadLogonTime 2241 /* logon hours do not allow this */ +#define ERRpasswordExpired 2242 +#define ERRnetlogonNotStarted 2455 +#define ERRnosupport 0xFFFF diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h new file mode 100644 index 00000000..7056b891 --- /dev/null +++ b/fs/cifs/smbfsctl.h @@ -0,0 +1,84 @@ +/* + * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions + * + * Copyright (c) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* IOCTL information */ +/* + * List of ioctl/fsctl function codes that are or could be useful in the + * future to remote clients like cifs or SMB2 client. There is probably + * a slightly larger set of fsctls that NTFS local filesystem could handle, + * including the seven below that we do not have struct definitions for. + * Even with protocol definitions for most of these now available, we still + * need to do some experimentation to identify which are practical to do + * remotely. 
Some of the following, such as the encryption/compression ones + * could be invoked from tools via a specialized hook into the VFS rather + * than via the standard vfs entry points + */ +#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 +#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 +#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 +#define FSCTL_LOCK_VOLUME 0x00090018 +#define FSCTL_UNLOCK_VOLUME 0x0009001C +#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */ +#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */ +#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */ +#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */ +/* Verify the next FSCTL number, we had it as 0x00090090 before */ +#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */ +#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */ +#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */ +#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */ +#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */ +#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C +#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */ +#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */ +#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */ +#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */ +#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */ +#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */ +#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */ +#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ +#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ +#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ +#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ +#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ +#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ +#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ +#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */ +#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */ +#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */ +#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */ +#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */ +#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ +#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ +#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ +#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ +#define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ +#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ +/* strange that the number for this op is not sequential with previous op */ +#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ +#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ +#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ + +#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 +#define IO_REPARSE_TAG_HSM 0xC0000004 +#define IO_REPARSE_TAG_SIS 0x80000007 diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c new file mode 100644 index 00000000..c1b9c4b1 --- /dev/null +++ b/fs/cifs/transport.c @@ -0,0 +1,931 @@ +/* + * fs/cifs/transport.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) 
2006. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +extern mempool_t *cifs_mid_poolp; + +static void +wake_up_task(struct mid_q_entry *mid) +{ + wake_up_process(mid->callback_data); +} + +struct mid_q_entry * +AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) +{ + struct mid_q_entry *temp; + + if (server == NULL) { + cERROR(1, "Null TCP session in AllocMidQEntry"); + return NULL; + } + + temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); + if (temp == NULL) + return temp; + else { + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = smb_buffer->Mid; /* always LE */ + temp->pid = current->pid; + temp->command = smb_buffer->Command; + cFYI(1, "For smb_command %d", temp->command); + /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ + /* when mid allocated can be before when sent */ + temp->when_alloc = jiffies; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. 
+ */ + temp->callback = wake_up_task; + temp->callback_data = current; + } + + atomic_inc(&midCount); + temp->midState = MID_REQUEST_ALLOCATED; + return temp; +} + +void +DeleteMidQEntry(struct mid_q_entry *midEntry) +{ +#ifdef CONFIG_CIFS_STATS2 + unsigned long now; +#endif + midEntry->midState = MID_FREE; + atomic_dec(&midCount); + if (midEntry->largeBuf) + cifs_buf_release(midEntry->resp_buf); + else + cifs_small_buf_release(midEntry->resp_buf); +#ifdef CONFIG_CIFS_STATS2 + now = jiffies; + /* commands taking longer than one second are indications that + something is wrong, unless it is quite a slow link or server */ + if ((now - midEntry->when_alloc) > HZ) { + if ((cifsFYI & CIFS_TIMER) && + (midEntry->command != SMB_COM_LOCKING_ANDX)) { + printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %d", + midEntry->command, midEntry->mid); + printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n", + now - midEntry->when_alloc, + now - midEntry->when_sent, + now - midEntry->when_received); + } + } +#endif + mempool_free(midEntry, cifs_mid_poolp); +} + +static void +delete_mid(struct mid_q_entry *mid) +{ + spin_lock(&GlobalMid_Lock); + list_del(&mid->qhead); + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); +} + +static int +smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) +{ + int rc = 0; + int i = 0; + struct msghdr smb_msg; + struct smb_hdr *smb_buffer = iov[0].iov_base; + unsigned int len = iov[0].iov_len; + unsigned int total_len; + int first_vec = 0; + unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length); + struct socket *ssocket = server->ssocket; + + if (ssocket == NULL) + return -ENOTSOCK; /* BB eventually add reconnect code here */ + + smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; + smb_msg.msg_namelen = sizeof(struct sockaddr); + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + if (server->noblocksnd) + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + else + smb_msg.msg_flags = MSG_NOSIGNAL; + + total_len = 0; + for (i = 0; i < n_vec; i++) + total_len += iov[i].iov_len; + + cFYI(1, "Sending smb: total_len %d", total_len); + dump_smb(smb_buffer, len); + + i = 0; + while (total_len) { + rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], + n_vec - first_vec, total_len); + if ((rc == -ENOSPC) || (rc == -EAGAIN)) { + i++; + /* if blocking send we try 3 times, since each can block + for 5 seconds. For nonblocking we have to try more + but wait increasing amounts of time allowing time for + socket to clear. The overall time we wait in either + case to send on the socket is about 15 seconds. + Similarly we wait for 15 seconds for + a response from the server in SendReceive[2] + for the server to send a response back for + most types of requests (except SMB Write + past end of file which can be slow, and + blocking lock operations). NFS waits slightly longer + than CIFS, but this can make it take longer for + nonresponsive servers to be detected and 15 seconds + is more than enough time for modern networks to + send a packet. In most cases if we fail to send + after the retries we will kill the socket and + reconnect which may clear the network problem. 
+ */ + if ((i >= 14) || (!server->noblocksnd && (i > 2))) { + cERROR(1, "sends on sock %p stuck for 15 seconds", + ssocket); + rc = -EAGAIN; + break; + } + msleep(1 << i); + continue; + } + if (rc < 0) + break; + + if (rc == total_len) { + total_len = 0; + break; + } else if (rc > total_len) { + cERROR(1, "sent %d requested %d", rc, total_len); + break; + } + if (rc == 0) { + /* should never happen, letting socket clear before + retrying is our only obvious option here */ + cERROR(1, "tcp sent no data"); + msleep(500); + continue; + } + total_len -= rc; + /* the line below resets i */ + for (i = first_vec; i < n_vec; i++) { + if (iov[i].iov_len) { + if (rc > iov[i].iov_len) { + rc -= iov[i].iov_len; + iov[i].iov_len = 0; + } else { + iov[i].iov_base += rc; + iov[i].iov_len -= rc; + first_vec = i; + break; + } + } + } + i = 0; /* in case we get ENOSPC on the next send */ + } + + if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + cFYI(1, "partial send (%d remaining), terminating session", + total_len); + /* If we have only sent part of an SMB then the next SMB + could be taken as the remainder of this one. We need + to kill the socket so the server throws away the partial + SMB */ + server->tcpStatus = CifsNeedReconnect; + } + + if (rc < 0 && rc != -EINTR) + cERROR(1, "Error %d sending data on socket to server", rc); + else + rc = 0; + + /* Don't want to modify the buffer as a + side effect of this call. */ + smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length); + + return rc; +} + +int +smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, + unsigned int smb_buf_length) +{ + struct kvec iov; + + iov.iov_base = smb_buffer; + iov.iov_len = smb_buf_length + 4; + + return smb_sendv(server, &iov, 1); +} + +static int wait_for_free_request(struct TCP_Server_Info *server, + const int long_op) +{ + if (long_op == CIFS_ASYNC_OP) { + /* oplock breaks must not be held up */ + atomic_inc(&server->inFlight); + return 0; + } + + spin_lock(&GlobalMid_Lock); + while (1) { + if (atomic_read(&server->inFlight) >= cifs_max_pending) { + spin_unlock(&GlobalMid_Lock); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&server->num_waiters); +#endif + wait_event(server->request_q, + atomic_read(&server->inFlight) + < cifs_max_pending); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&server->num_waiters); +#endif + spin_lock(&GlobalMid_Lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&GlobalMid_Lock); + return -ENOENT; + } + + /* can not count locking commands against total + as they are allowed to block on server */ + + /* update # of requests on the wire to server */ + if (long_op != CIFS_BLOCKING_OP) + atomic_inc(&server->inFlight); + spin_unlock(&GlobalMid_Lock); + break; + } + } + return 0; +} + +static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, + struct mid_q_entry **ppmidQ) +{ + if (ses->server->tcpStatus == CifsExiting) { + return -ENOENT; + } + + if (ses->server->tcpStatus == CifsNeedReconnect) { + cFYI(1, "tcp session dead - return to caller to retry"); + return -EAGAIN; + } + + if (ses->status != CifsGood) { + /* check if SMB session is bad because we are setting it up */ + if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && + (in_buf->Command != SMB_COM_NEGOTIATE)) + return -EAGAIN; + /* else ok - we are setting up session */ + } + *ppmidQ = AllocMidQEntry(in_buf, ses->server); + if (*ppmidQ == NULL) + return -ENOMEM; + spin_lock(&GlobalMid_Lock); + list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); 
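+ /* the new mid now sits on pending_mid_q; when the matching response
+  * arrives, mid->callback (wake_up_task by default) is invoked and any
+  * waiter blocked in wait_for_response() below is released */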
+ return 0; +} + +static int +wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) +{ + int error; + + error = wait_event_killable(server->response_q, + midQ->midState != MID_REQUEST_SUBMITTED); + if (error < 0) + return -ERESTARTSYS; + + return 0; +} + + +/* + * Send a SMB request and set the callback function in the mid to handle + * the result. Caller is responsible for dealing with timeouts. + */ +int +cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int nvec, mid_callback_t *callback, void *cbdata, + bool ignore_pend) +{ + int rc; + struct mid_q_entry *mid; + struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; + + rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0); + if (rc) + return rc; + + /* enable signing if server requires it */ + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + mutex_lock(&server->srv_mutex); + mid = AllocMidQEntry(hdr, server); + if (mid == NULL) { + mutex_unlock(&server->srv_mutex); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); + return -ENOMEM; + } + + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + goto out_err; + } + + mid->callback = callback; + mid->callback_data = cbdata; + mid->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&server->inSend); +#endif + rc = smb_sendv(server, iov, nvec); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&server->inSend); + mid->when_sent = jiffies; +#endif + mutex_unlock(&server->srv_mutex); + if (rc) + goto out_err; + + return rc; +out_err: + delete_mid(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); + return rc; +} + +/* + * + * Send an SMB Request. No response info (other than return code) + * needs to be parsed. 
+ * + * flags indicate the type of request buffer and how long to wait + * and whether to log NT STATUS code (error) before mapping it to POSIX error + * + */ +int +SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, int flags) +{ + int rc; + struct kvec iov[1]; + int resp_buf_type; + + iov[0].iov_base = (char *)in_buf; + iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4; + flags |= CIFS_NO_RESP; + rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); + cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); + + return rc; +} + +static int +cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) +{ + int rc = 0; + + cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command, + mid->mid, mid->midState); + + spin_lock(&GlobalMid_Lock); + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + spin_unlock(&GlobalMid_Lock); + return rc; + case MID_RETRY_NEEDED: + rc = -EAGAIN; + break; + case MID_RESPONSE_MALFORMED: + rc = -EIO; + break; + case MID_SHUTDOWN: + rc = -EHOSTDOWN; + break; + default: + list_del_init(&mid->qhead); + cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, + mid->mid, mid->midState); + rc = -EIO; + } + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); + return rc; +} + +/* + * An NT cancel request header looks just like the original request except: + * + * The Command is SMB_COM_NT_CANCEL + * The WordCount is zeroed out + * The ByteCount is zeroed out + * + * This function mangles an existing request buffer into a + * SMB_COM_NT_CANCEL request and then sends it. + */ +static int +send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf, + struct mid_q_entry *mid) +{ + int rc = 0; + + /* -4 for RFC1001 length and +2 for BCC field */ + in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2); + in_buf->Command = SMB_COM_NT_CANCEL; + in_buf->WordCount = 0; + put_bcc(0, in_buf); + + mutex_lock(&server->srv_mutex); + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + return rc; + } + rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + mutex_unlock(&server->srv_mutex); + + cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", + in_buf->Mid, rc); + + return rc; +} + +int +cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, + bool log_error) +{ + dump_smb(mid->resp_buf, + min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + + /* convert the length into a more usable form */ + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + /* FIXME: add code to kill session */ + if (cifs_verify_signature(mid->resp_buf, server, + mid->sequence_number + 1) != 0) + cERROR(1, "Unexpected SMB signature"); + } + + /* BB special case reconnect tid and uid here? 
*/ + return map_smb_to_linux_error(mid->resp_buf, log_error); +} + +int +SendReceive2(const unsigned int xid, struct cifs_ses *ses, + struct kvec *iov, int n_vec, int *pRespBufType /* ret */, + const int flags) +{ + int rc = 0; + int long_op; + struct mid_q_entry *midQ; + struct smb_hdr *in_buf = iov[0].iov_base; + + long_op = flags & CIFS_TIMEOUT_MASK; + + *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */ + + if ((ses == NULL) || (ses->server == NULL)) { + cifs_small_buf_release(in_buf); + cERROR(1, "Null session"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) { + cifs_small_buf_release(in_buf); + return -ENOENT; + } + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + rc = wait_for_free_request(ses->server, long_op); + if (rc) { + cifs_small_buf_release(in_buf); + return rc; + } + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + cifs_small_buf_release(in_buf); + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + cifs_small_buf_release(in_buf); + goto out; + } + + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_sendv(ses->server, iov, n_vec); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif + + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) { + cifs_small_buf_release(in_buf); + goto out; + } + + if (long_op == CIFS_ASYNC_OP) { + cifs_small_buf_release(in_buf); + goto out; + } + + rc = wait_for_response(ses->server, midQ); + if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + cifs_small_buf_release(in_buf); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + cifs_small_buf_release(in_buf); + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) { + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + + if (!midQ->resp_buf || midQ->midState != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cFYI(1, "Bad MID state?"); + goto out; + } + + iov[0].iov_base = (char *)midQ->resp_buf; + iov[0].iov_len = be32_to_cpu(midQ->resp_buf->smb_buf_length) + 4; + if (midQ->largeBuf) + *pRespBufType = CIFS_LARGE_BUFFER; + else + *pRespBufType = CIFS_SMALL_BUFFER; + + rc = cifs_check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR); + + /* mark it so buf will not be freed by delete_mid */ + if ((flags & CIFS_NO_RESP) == 0) + midQ->resp_buf = NULL; +out: + delete_mid(midQ); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + + return rc; +} + +int +SendReceive(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned, const int long_op) +{ + int rc = 0; + struct mid_q_entry *midQ; + + if (ses == NULL) { + cERROR(1, "Null smb session"); + 
return -EIO; + } + if (ses->server == NULL) { + cERROR(1, "Null tcp session"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "Illegal length, greater than maximum frame, %d", + be32_to_cpu(in_buf->smb_buf_length)); + return -EIO; + } + + rc = wait_for_free_request(ses->server, long_op); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + goto out; + } + + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) + goto out; + + if (long_op == CIFS_ASYNC_OP) + goto out; + + rc = wait_for_response(ses->server, midQ); + if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) { + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + + if (!midQ->resp_buf || !out_buf || + midQ->midState != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cERROR(1, "Bad MID state?"); + goto out; + } + + *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length); + memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); + rc = cifs_check_receive(midQ, ses->server, 0); +out: + delete_mid(midQ); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + + return rc; +} + +/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows + blocking lock to return. */ + +static int +send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon, + struct smb_hdr *in_buf, + struct smb_hdr *out_buf) +{ + int bytes_returned; + struct cifs_ses *ses = tcon->ses; + LOCK_REQ *pSMB = (LOCK_REQ *)in_buf; + + /* We just modify the current in_buf to change + the type of lock from LOCKING_ANDX_SHARED_LOCK + or LOCKING_ANDX_EXCLUSIVE_LOCK to + LOCKING_ANDX_CANCEL_LOCK. 
*/ + + pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; + pSMB->Timeout = 0; + pSMB->hdr.Mid = GetNextMid(ses->server); + + return SendReceive(xid, ses, in_buf, out_buf, + &bytes_returned, 0); +} + +int +SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned) +{ + int rc = 0; + int rstart = 0; + struct mid_q_entry *midQ; + struct cifs_ses *ses; + + if (tcon == NULL || tcon->ses == NULL) { + cERROR(1, "Null smb session"); + return -EIO; + } + ses = tcon->ses; + + if (ses->server == NULL) { + cERROR(1, "Null tcp session"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "Illegal length, greater than maximum frame, %d", + be32_to_cpu(in_buf->smb_buf_length)); + return -EIO; + } + + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + return rc; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + delete_mid(midQ); + mutex_unlock(&ses->server->srv_mutex); + return rc; + } + + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) { + delete_mid(midQ); + return rc; + } + + /* Wait for a reply - allow signals to interrupt. */ + rc = wait_event_interruptible(ses->server->response_q, + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || + ((ses->server->tcpStatus != CifsGood) && + (ses->server->tcpStatus != CifsNew))); + + /* Were we interrupted by a signal ? */ + if ((rc == -ERESTARTSYS) && + (midQ->midState == MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + if (in_buf->Command == SMB_COM_TRANSACTION2) { + /* POSIX lock. We send a NT_CANCEL SMB to cause the + blocking lock to return. */ + rc = send_nt_cancel(ses->server, in_buf, midQ); + if (rc) { + delete_mid(midQ); + return rc; + } + } else { + /* Windows lock. We send a LOCKINGX_CANCEL_LOCK + to cause the blocking lock to return. */ + + rc = send_lock_cancel(xid, tcon, in_buf, out_buf); + + /* If we get -ENOLCK back the lock may have + already been removed. Don't exit in this case. */ + if (rc && rc != -ENOLCK) { + delete_mid(midQ); + return rc; + } + } + + rc = wait_for_response(ses->server, midQ); + if (rc) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + /* We got the response - restart system call. 
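 * Remember that we had to cancel so that, if the cancelled lock
 * request completes with -EACCES, we can return -ERESTARTSYS below
 * and let the VFS restart the blocking lock.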
*/ + rstart = 1; + } + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) + return rc; + + /* rcvd frame is ok */ + if (out_buf == NULL || midQ->midState != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cERROR(1, "Bad MID state?"); + goto out; + } + + *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length); + memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); + rc = cifs_check_receive(midQ, ses->server, 0); +out: + delete_mid(midQ); + if (rstart && rc == -EACCES) + return -ERESTARTSYS; + return rc; +} diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c new file mode 100644 index 00000000..2a22fb29 --- /dev/null +++ b/fs/cifs/xattr.c @@ -0,0 +1,420 @@ +/* + * fs/cifs/xattr.c + * + * Copyright (c) International Business Machines Corp., 2003, 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +#define MAX_EA_VALUE_SIZE 65535 +#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" +#define CIFS_XATTR_USER_PREFIX "user." +#define CIFS_XATTR_SYSTEM_PREFIX "system." +#define CIFS_XATTR_OS2_PREFIX "os2." +#define CIFS_XATTR_SECURITY_PREFIX "security." +#define CIFS_XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_TRUSTED_PREFIX_LEN 8 +#define XATTR_SECURITY_PREFIX_LEN 9 +/* BB need to add server (Samba e.g) support for security and trusted prefix */ + + + +int cifs_removexattr(struct dentry *direntry, const char *ea_name) +{ + int rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path = NULL; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto remove_ea_exit; + } + if (ea_name == NULL) { + cFYI(1, "Null xattr names not supported"); + } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) + && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { + cFYI(1, + "illegal xattr request %s (only user namespace supported)", + ea_name); + /* BB what if no namespace prefix? */ + /* Should we just pass them to server, except for + system and perhaps security prefixes? */ + } else { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto remove_ea_exit; + + ea_name += 5; /* skip past user. 
prefix */ + rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, + (__u16)0, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } +remove_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +int cifs_setxattr(struct dentry *direntry, const char *ea_name, + const void *ea_value, size_t value_size, int flags) +{ + int rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + struct cifs_ntsd *pacl; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto set_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (value_size > MAX_EA_VALUE_SIZE) { + cFYI(1, "size of EA value too large"); + rc = -EOPNOTSUPP; + goto set_ea_exit; + } + + if (ea_name == NULL) { + cFYI(1, "Null xattr names not supported"); + } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto set_ea_exit; + if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) + cFYI(1, "attempt to set cifs inode metadata"); + + ea_name += 5; /* skip past user. prefix */ + rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, + (__u16)value_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto set_ea_exit; + + ea_name += 4; /* skip past os2. 
prefix */ + rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, + (__u16)value_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { + pacl = kmalloc(value_size, GFP_KERNEL); + if (!pacl) { + cFYI(1, "%s: Can't allocate memory for ACL", + __func__); + rc = -ENOMEM; + } else { +#ifdef CONFIG_CIFS_ACL + memcpy(pacl, ea_value, value_size); + rc = set_cifs_acl(pacl, value_size, + direntry->d_inode, full_path); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(direntry->d_inode)->time = 0; + kfree(pacl); +#else + cFYI(1, "Set CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ + } + } else { + int temp; + temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + if (temp == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + ea_value, (const int)value_size, + ACL_TYPE_ACCESS, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "set POSIX ACL rc %d", rc); +#else + cFYI(1, "set POSIX ACL not supported"); +#endif + } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + ea_value, (const int)value_size, + ACL_TYPE_DEFAULT, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "set POSIX default ACL rc %d", rc); +#else + cFYI(1, "set default POSIX ACL not supported"); +#endif + } else { + cFYI(1, "illegal xattr request %s (only user namespace" + " supported)", ea_name); + /* BB what if no namespace prefix? */ + /* Should we just pass them to server, except for + system and perhaps security prefixes? */ + } + } + +set_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, + void *ea_value, size_t buf_size) +{ + ssize_t rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto get_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + if (ea_name == NULL) { + cFYI(1, "Null xattr names not supported"); + } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto get_ea_exit; + + if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { + cFYI(1, "attempt to query cifs inode metadata"); + /* revalidate/getattr then populate from inode */ + } /* BB add else when above is implemented */ + ea_name += 5; /* skip past user. 
prefix */ + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, + buf_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto get_ea_exit; + + ea_name += 4; /* skip past os2. prefix */ + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, + buf_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, + ea_value, buf_size, ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +#else + cFYI(1, "Query POSIX ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, + ea_value, buf_size, ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +#else + cFYI(1, "Query POSIX default ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + pacl = get_cifs_acl(cifs_sb, direntry->d_inode, + full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cERROR(1, "%s: error %zd getting sec desc", + __func__, rc); + } else { + if (ea_value) { + if (acllen > buf_size) + acllen = -ERANGE; + else + memcpy(ea_value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#else + cFYI(1, "Query CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ + } else if (strncmp(ea_name, + CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + cFYI(1, "Trusted xattr namespace not supported yet"); + } else if (strncmp(ea_name, + CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + cFYI(1, "Security xattr namespace not supported yet"); + } else + cFYI(1, + "illegal xattr request %s (only user namespace supported)", + ea_name); + + /* We could add an additional check for streams ie + if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + + if (rc == -EINVAL) + rc = -EOPNOTSUPP; + +get_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) +{ + ssize_t rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + return -EOPNOTSUPP; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto list_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server 
for EAs or streams to + returns as xattrs */ + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, + buf_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + +list_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig new file mode 100644 index 00000000..c0e5a7fa --- /dev/null +++ b/fs/coda/Kconfig @@ -0,0 +1,21 @@ +config CODA_FS + tristate "Coda file system support (advanced network fs)" + depends on INET + help + Coda is an advanced network file system, similar to NFS in that it + enables you to mount file systems of a remote server and access them + with regular Unix commands as if they were sitting on your hard + disk. Coda has several advantages over NFS: support for + disconnected operation (e.g. for laptops), read/write server + replication, security model for authentication and encryption, + persistent client caches and write back caching. + + If you say Y here, your Linux box will be able to act as a Coda + *client*. You will need user level code as well, both for the + client and server. Servers are currently user level, i.e. they need + no kernel support. Please read + and check out the Coda + home page . + + To compile the coda client support as a module, choose M here: the + module will be called coda. diff --git a/fs/coda/Makefile b/fs/coda/Makefile new file mode 100644 index 00000000..1bab69a0 --- /dev/null +++ b/fs/coda/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Linux Coda filesystem routines. +# + +obj-$(CONFIG_CODA_FS) += coda.o + +coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \ + coda_linux.o symlink.o pioctl.o sysctl.o + +# If you want debugging output, please uncomment the following line. + +# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1 diff --git a/fs/coda/cache.c b/fs/coda/cache.c new file mode 100644 index 00000000..69015787 --- /dev/null +++ b/fs/coda/cache.c @@ -0,0 +1,122 @@ +/* + * Cache operations for Coda. + * For Linux 2.1: (C) 1997 Carnegie Mellon University + * For Linux 2.3: (C) 2000 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project http://www.coda.cs.cmu.edu/ . 
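 *
 * This file implements a small per-inode cache of permission checks:
 * an entry is only valid for the fsuid that created it and only while
 * the global permission_epoch below is unchanged, so every cached
 * entry can be invalidated at once by bumping the epoch.  The typical
 * caller pattern (see coda_permission() in dir.c) is roughly:
 *
 *	if (coda_cache_check(inode, mask))
 *		return 0;
 *	error = venus_access(inode->i_sb, coda_i2f(inode), mask);
 *	if (!error)
 *		coda_cache_enter(inode, mask);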
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +static atomic_t permission_epoch = ATOMIC_INIT(0); + +/* replace or extend an acl cache hit */ +void coda_cache_enter(struct inode *inode, int mask) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_cached_epoch = atomic_read(&permission_epoch); + if (cii->c_uid != current_fsuid()) { + cii->c_uid = current_fsuid(); + cii->c_cached_perm = mask; + } else + cii->c_cached_perm |= mask; + spin_unlock(&cii->c_lock); +} + +/* remove cached acl from an inode */ +void coda_cache_clear_inode(struct inode *inode) +{ + struct coda_inode_info *cii = ITOC(inode); + spin_lock(&cii->c_lock); + cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; + spin_unlock(&cii->c_lock); +} + +/* remove all acl caches */ +void coda_cache_clear_all(struct super_block *sb) +{ + atomic_inc(&permission_epoch); +} + + +/* check if the mask has been matched against the acl already */ +int coda_cache_check(struct inode *inode, int mask) +{ + struct coda_inode_info *cii = ITOC(inode); + int hit; + + spin_lock(&cii->c_lock); + hit = (mask & cii->c_cached_perm) == mask && + cii->c_uid == current_fsuid() && + cii->c_cached_epoch == atomic_read(&permission_epoch); + spin_unlock(&cii->c_lock); + + return hit; +} + + +/* Purging dentries and children */ +/* The following routines drop dentries which are not + in use and flag dentries which are in use to be + zapped later. + + The flags are detected by: + - coda_dentry_revalidate (for lookups) if the flag is C_PURGE + - coda_dentry_delete: to remove dentry from the cache when d_count + falls to zero + - an inode method coda_revalidate (for attributes) if the + flag is C_VATTR +*/ + +/* this won't do any harm: just flag all children */ +static void coda_flag_children(struct dentry *parent, int flag) +{ + struct list_head *child; + struct dentry *de; + + spin_lock(&parent->d_lock); + list_for_each(child, &parent->d_subdirs) + { + de = list_entry(child, struct dentry, d_u.d_child); + /* don't know what to do with negative dentries */ + if ( ! 
de->d_inode ) + continue; + coda_flag_inode(de->d_inode, flag); + } + spin_unlock(&parent->d_lock); + return; +} + +void coda_flag_inode_children(struct inode *inode, int flag) +{ + struct dentry *alias_de; + + if ( !inode || !S_ISDIR(inode->i_mode)) + return; + + alias_de = d_find_alias(inode); + if (!alias_de) + return; + coda_flag_children(alias_de, flag); + shrink_dcache_parent(alias_de); + dput(alias_de); +} + diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c new file mode 100644 index 00000000..6475877b --- /dev/null +++ b/fs/coda/cnode.c @@ -0,0 +1,174 @@ +/* cnode related routines for the coda kernel code + (C) 1996 Peter Braam + */ + +#include +#include +#include + +#include +#include +#include "coda_linux.h" + +static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) +{ + return memcmp(fid1, fid2, sizeof(*fid1)) == 0; +} + +static const struct inode_operations coda_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = coda_setattr, +}; + +/* cnode.c */ +static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) +{ + coda_vattr_to_iattr(inode, attr); + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &coda_file_inode_operations; + inode->i_fop = &coda_file_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &coda_dir_inode_operations; + inode->i_fop = &coda_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &coda_symlink_inode_operations; + inode->i_data.a_ops = &coda_symlink_aops; + inode->i_mapping = &inode->i_data; + } else + init_special_inode(inode, inode->i_mode, huge_decode_dev(attr->va_rdev)); +} + +static int coda_test_inode(struct inode *inode, void *data) +{ + struct CodaFid *fid = (struct CodaFid *)data; + struct coda_inode_info *cii = ITOC(inode); + return coda_fideq(&cii->c_fid, fid); +} + +static int coda_set_inode(struct inode *inode, void *data) +{ + struct CodaFid *fid = (struct CodaFid *)data; + struct coda_inode_info *cii = ITOC(inode); + cii->c_fid = *fid; + return 0; +} + +struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, + struct coda_vattr * attr) +{ + struct inode *inode; + struct coda_inode_info *cii; + unsigned long hash = coda_f2i(fid); + + inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + cii = ITOC(inode); + /* we still need to set i_ino for things like stat(2) */ + inode->i_ino = hash; + /* inode is locked and unique, no need to grab cii->c_lock */ + cii->c_mapcount = 0; + unlock_new_inode(inode); + } + + /* always replace the attributes, type might have changed */ + coda_fill_inode(inode, attr); + return inode; +} + +/* this is effectively coda_iget: + - get attributes (might be cached) + - get the inode for the fid using vfs iget + - link the two up if this is needed + - fill in the attributes +*/ +int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb) +{ + struct coda_vattr attr; + int error; + + /* We get inode numbers from Venus -- see venus source */ + error = venus_getattr(sb, fid, &attr); + if ( error ) { + *inode = NULL; + return error; + } + + *inode = coda_iget(sb, fid, &attr); + if ( IS_ERR(*inode) ) { + printk("coda_cnode_make: coda_iget failed\n"); + return PTR_ERR(*inode); + } + return 0; +} + + +/* Although we treat Coda file identifiers as immutable, there is one + * special case for files created during a disconnection where they may 
+ * not be globally unique. When an identifier collision is detected we + * first try to flush the cached inode from the kernel and finally + * resort to renaming/rehashing in-place. Userspace remembers both old + * and new values of the identifier to handle any in-flight upcalls. + * The real solution is to use globally unique UUIDs as identifiers, but + * retrofitting the existing userspace code for this is non-trivial. */ +void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, + struct CodaFid *newfid) +{ + struct coda_inode_info *cii = ITOC(inode); + unsigned long hash = coda_f2i(newfid); + + BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); + + /* replace fid and rehash inode */ + /* XXX we probably need to hold some lock here! */ + remove_inode_hash(inode); + cii->c_fid = *newfid; + inode->i_ino = hash; + __insert_inode_hash(inode, hash); +} + +/* convert a fid to an inode. */ +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) +{ + struct inode *inode; + unsigned long hash = coda_f2i(fid); + + if ( !sb ) { + printk("coda_fid_to_inode: no sb!\n"); + return NULL; + } + + inode = ilookup5(sb, hash, coda_test_inode, fid); + if ( !inode ) + return NULL; + + /* we should never see newly created inodes because we intentionally + * fail in the initialization callback */ + BUG_ON(inode->i_state & I_NEW); + + return inode; +} + +/* the CONTROL inode is made without asking attributes from Venus */ +int coda_cnode_makectl(struct inode **inode, struct super_block *sb) +{ + int error = -ENOMEM; + + *inode = new_inode(sb); + if (*inode) { + (*inode)->i_ino = CTL_INO; + (*inode)->i_op = &coda_ioctl_inode_operations; + (*inode)->i_fop = &coda_ioctl_operations; + (*inode)->i_mode = 0444; + error = 0; + } + + return error; +} + diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h new file mode 100644 index 00000000..c910b5eb --- /dev/null +++ b/fs/coda/coda_cache.h @@ -0,0 +1,22 @@ +/* Coda filesystem -- Linux Minicache + * + * Copyright (C) 1989 - 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. Contact Peter Braam + * + */ + +#ifndef _CFSNC_HEADER_ +#define _CFSNC_HEADER_ + +/* credential cache */ +void coda_cache_enter(struct inode *inode, int mask); +void coda_cache_clear_inode(struct inode *); +void coda_cache_clear_all(struct super_block *sb); +int coda_cache_check(struct inode *inode, int mask); + +/* for downcalls and attributes and lookups */ +void coda_flag_inode_children(struct inode *inode, int flag); + +#endif /* _CFSNC_HEADER_ */ diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h new file mode 100644 index 00000000..e35071b1 --- /dev/null +++ b/fs/coda/coda_fs_i.h @@ -0,0 +1,58 @@ +/* + * coda_fs_i.h + * + * Copyright (C) 1998 Carnegie Mellon University + * + */ + +#ifndef _LINUX_CODA_FS_I +#define _LINUX_CODA_FS_I + +#include +#include +#include +#include + +/* + * coda fs inode data + * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and + * c_cached_perm. + * vfs_inode is set only when the inode is created and never changes. + * c_fid is set when the inode is created and should be considered immutable. 
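 * The one exception is coda_replace_fid(), which rewrites c_fid in
 * place when a fid created during disconnected operation turns out to
 * collide with another one.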
+ */ +struct coda_inode_info { + struct CodaFid c_fid; /* Coda identifier */ + u_short c_flags; /* flags (see below) */ + unsigned int c_mapcount; /* nr of times this inode is mapped */ + unsigned int c_cached_epoch; /* epoch for cached permissions */ + vuid_t c_uid; /* fsuid for cached permissions */ + unsigned int c_cached_perm; /* cached access permissions */ + spinlock_t c_lock; + struct inode vfs_inode; +}; + +/* + * coda fs file private data + */ +#define CODA_MAGIC 0xC0DAC0DA +struct coda_file_info { + int cfi_magic; /* magic number */ + struct file *cfi_container; /* container file for this cnode */ + unsigned int cfi_mapcount; /* nr of times this file is mapped */ +}; + +#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) + +/* flags */ +#define C_VATTR 0x1 /* Validity of vattr in inode */ +#define C_FLUSH 0x2 /* used after a flush */ +#define C_DYING 0x4 /* from venus (which died) */ +#define C_PURGE 0x8 + +int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); +int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); + +#endif diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h new file mode 100644 index 00000000..6b443ff4 --- /dev/null +++ b/fs/coda/coda_int.h @@ -0,0 +1,20 @@ +#ifndef _CODA_INT_ +#define _CODA_INT_ + +struct dentry; +struct file; + +extern struct file_system_type coda_fs_type; +extern unsigned long coda_timeout; +extern int coda_hard; +extern int coda_fake_statfs; + +void coda_destroy_inodecache(void); +int coda_init_inodecache(void); +int coda_fsync(struct file *coda_file, int datasync); +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#endif /* _CODA_INT_ */ + + diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c new file mode 100644 index 00000000..2bdbcc11 --- /dev/null +++ b/fs/coda/coda_linux.c @@ -0,0 +1,192 @@ +/* + * Inode operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). 
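 *
 * This file collects small helpers used throughout the module: fid
 * formatting, open(2) flag conversion and attribute translation
 * between the Linux and Coda representations.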
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" + +/* initialize the debugging variables */ +int coda_fake_statfs; + +/* print a fid */ +char * coda_f2s(struct CodaFid *f) +{ + static char s[60]; + + sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); + + return s; +} + +/* recognize special .CONTROL name */ +int coda_iscontrol(const char *name, size_t length) +{ + return ((CODA_CONTROLLEN == length) && + (strncmp(name, CODA_CONTROL, CODA_CONTROLLEN) == 0)); +} + +/* recognize /coda inode */ +int coda_isroot(struct inode *i) +{ + return ( i->i_sb->s_root->d_inode == i ); +} + +unsigned short coda_flags_to_cflags(unsigned short flags) +{ + unsigned short coda_flags = 0; + + if ((flags & O_ACCMODE) == O_RDONLY) + coda_flags |= C_O_READ; + + if ((flags & O_ACCMODE) == O_RDWR) + coda_flags |= C_O_READ | C_O_WRITE; + + if ((flags & O_ACCMODE) == O_WRONLY) + coda_flags |= C_O_WRITE; + + if (flags & O_TRUNC) + coda_flags |= C_O_TRUNC; + + if (flags & O_CREAT) + coda_flags |= C_O_CREAT; + + if (flags & O_EXCL) + coda_flags |= C_O_EXCL; + + return coda_flags; +} + + +/* utility functions below */ +void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) +{ + int inode_type; + /* inode's i_flags, i_ino are set by iget + XXX: is this all we need ?? + */ + switch (attr->va_type) { + case C_VNON: + inode_type = 0; + break; + case C_VREG: + inode_type = S_IFREG; + break; + case C_VDIR: + inode_type = S_IFDIR; + break; + case C_VLNK: + inode_type = S_IFLNK; + break; + default: + inode_type = 0; + } + inode->i_mode |= inode_type; + + if (attr->va_mode != (u_short) -1) + inode->i_mode = attr->va_mode | inode_type; + if (attr->va_uid != -1) + inode->i_uid = (uid_t) attr->va_uid; + if (attr->va_gid != -1) + inode->i_gid = (gid_t) attr->va_gid; + if (attr->va_nlink != -1) + inode->i_nlink = attr->va_nlink; + if (attr->va_size != -1) + inode->i_size = attr->va_size; + if (attr->va_size != -1) + inode->i_blocks = (attr->va_size + 511) >> 9; + if (attr->va_atime.tv_sec != -1) + inode->i_atime = attr->va_atime; + if (attr->va_mtime.tv_sec != -1) + inode->i_mtime = attr->va_mtime; + if (attr->va_ctime.tv_sec != -1) + inode->i_ctime = attr->va_ctime; +} + + +/* + * BSD sets attributes that need not be modified to -1. + * Linux uses the valid field to indicate what should be + * looked at. The BSD type field needs to be deduced from linux + * mode. + * So we have to do some translations here. 
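 *
 * For example, when ia_valid contains only ATTR_MODE, just va_mode is
 * filled in below; every other field keeps its -1 "unchanged" marker
 * and Venus leaves the corresponding attribute alone.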
+ */ + +void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) +{ + unsigned int valid; + + /* clean out */ + vattr->va_mode = -1; + vattr->va_uid = (vuid_t) -1; + vattr->va_gid = (vgid_t) -1; + vattr->va_size = (off_t) -1; + vattr->va_atime.tv_sec = (time_t) -1; + vattr->va_atime.tv_nsec = (time_t) -1; + vattr->va_mtime.tv_sec = (time_t) -1; + vattr->va_mtime.tv_nsec = (time_t) -1; + vattr->va_ctime.tv_sec = (time_t) -1; + vattr->va_ctime.tv_nsec = (time_t) -1; + vattr->va_type = C_VNON; + vattr->va_fileid = -1; + vattr->va_gen = -1; + vattr->va_bytes = -1; + vattr->va_nlink = -1; + vattr->va_blocksize = -1; + vattr->va_rdev = -1; + vattr->va_flags = 0; + + /* determine the type */ +#if 0 + mode = iattr->ia_mode; + if ( S_ISDIR(mode) ) { + vattr->va_type = C_VDIR; + } else if ( S_ISREG(mode) ) { + vattr->va_type = C_VREG; + } else if ( S_ISLNK(mode) ) { + vattr->va_type = C_VLNK; + } else { + /* don't do others */ + vattr->va_type = C_VNON; + } +#endif + + /* set those vattrs that need change */ + valid = iattr->ia_valid; + if ( valid & ATTR_MODE ) { + vattr->va_mode = iattr->ia_mode; + } + if ( valid & ATTR_UID ) { + vattr->va_uid = (vuid_t) iattr->ia_uid; + } + if ( valid & ATTR_GID ) { + vattr->va_gid = (vgid_t) iattr->ia_gid; + } + if ( valid & ATTR_SIZE ) { + vattr->va_size = iattr->ia_size; + } + if ( valid & ATTR_ATIME ) { + vattr->va_atime = iattr->ia_atime; + } + if ( valid & ATTR_MTIME ) { + vattr->va_mtime = iattr->ia_mtime; + } + if ( valid & ATTR_CTIME ) { + vattr->va_ctime = iattr->ia_ctime; + } +} + diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h new file mode 100644 index 00000000..9b0c5323 --- /dev/null +++ b/fs/coda/coda_linux.h @@ -0,0 +1,101 @@ +/* + * Coda File System, Linux Kernel module + * + * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University + * Linux modifications (C) 1996, Peter J. Braam + * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. 
+ */ + +#ifndef _LINUX_CODA_FS +#define _LINUX_CODA_FS + +#include +#include +#include +#include +#include +#include +#include +#include +#include "coda_fs_i.h" + +/* operations */ +extern const struct inode_operations coda_dir_inode_operations; +extern const struct inode_operations coda_file_inode_operations; +extern const struct inode_operations coda_ioctl_inode_operations; + +extern const struct dentry_operations coda_dentry_operations; + +extern const struct address_space_operations coda_file_aops; +extern const struct address_space_operations coda_symlink_aops; + +extern const struct file_operations coda_dir_operations; +extern const struct file_operations coda_file_operations; +extern const struct file_operations coda_ioctl_operations; + +/* operations shared over more than one file */ +int coda_open(struct inode *i, struct file *f); +int coda_release(struct inode *i, struct file *f); +int coda_permission(struct inode *inode, int mask, unsigned int flags); +int coda_revalidate_inode(struct dentry *); +int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_setattr(struct dentry *, struct iattr *); + +/* this file: heloers */ +char *coda_f2s(struct CodaFid *f); +int coda_isroot(struct inode *i); +int coda_iscontrol(const char *name, size_t length); + +void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); +void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); +unsigned short coda_flags_to_cflags(unsigned short); + +/* sysctl.h */ +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#define CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + else \ + ptr = (cast)vmalloc((unsigned long) size); \ + if (!ptr) \ + printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + else memset( ptr, 0, size ); \ +} while (0) + + +#define CODA_FREE(ptr,size) \ + do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) + +/* inode to cnode access functions */ + +static inline struct coda_inode_info *ITOC(struct inode *inode) +{ + return list_entry(inode, struct coda_inode_info, vfs_inode); +} + +static __inline__ struct CodaFid *coda_i2f(struct inode *inode) +{ + return &(ITOC(inode)->c_fid); +} + +static __inline__ char *coda_i2s(struct inode *inode) +{ + return coda_f2s(&(ITOC(inode)->c_fid)); +} + +/* this will not zap the inode away */ +static __inline__ void coda_flag_inode(struct inode *inode, int flag) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_flags |= flag; + spin_unlock(&cii->c_lock); +} + +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c new file mode 100644 index 00000000..2b8dae4d --- /dev/null +++ b/fs/coda/dir.c @@ -0,0 +1,650 @@ + +/* + * Directory operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +/* dir inode-ops */ +static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd); +static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); +static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, + struct dentry *entry); +static int coda_unlink(struct inode *dir_inode, struct dentry *entry); +static int coda_symlink(struct inode *dir_inode, struct dentry *entry, + const char *symname); +static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode); +static int coda_rmdir(struct inode *dir_inode, struct dentry *entry); +static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, + struct inode *new_inode, struct dentry *new_dentry); + +/* dir file-ops */ +static int coda_readdir(struct file *file, void *buf, filldir_t filldir); + +/* dentry ops */ +static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); +static int coda_dentry_delete(const struct dentry *); + +/* support routines */ +static int coda_venus_readdir(struct file *coda_file, void *buf, + filldir_t filldir); + +/* same as fs/bad_inode.c */ +static int coda_return_EIO(void) +{ + return -EIO; +} +#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) + +const struct dentry_operations coda_dentry_operations = +{ + .d_revalidate = coda_dentry_revalidate, + .d_delete = coda_dentry_delete, +}; + +const struct inode_operations coda_dir_inode_operations = +{ + .create = coda_create, + .lookup = coda_lookup, + .link = coda_link, + .unlink = coda_unlink, + .symlink = coda_symlink, + .mkdir = coda_mkdir, + .rmdir = coda_rmdir, + .mknod = CODA_EIO_ERROR, + .rename = coda_rename, + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +const struct file_operations coda_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = coda_readdir, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, +}; + + +/* inode operations for directories */ +/* access routines: lookup, readlink, permission */ +static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) +{ + struct inode *inode = NULL; + struct CodaFid resfid = { { 0, } }; + int type = 0; + int error = 0; + const char *name = entry->d_name.name; + size_t length = entry->d_name.len; + + if (length > CODA_MAXNAMLEN) { + printk(KERN_ERR "name too long: lookup, %s (%*s)\n", + coda_i2s(dir), (int)length, name); + return ERR_PTR(-ENAMETOOLONG); + } + + /* control object, create inode on the fly */ + if (coda_isroot(dir) && coda_iscontrol(name, length)) { + error = coda_cnode_makectl(&inode, dir->i_sb); + type = CODA_NOCACHE; + goto exit; + } + + error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, + &type, &resfid); + if (!error) + error = coda_cnode_make(&inode, &resfid, dir->i_sb); + + if (error && error != -ENOENT) + return ERR_PTR(error); + +exit: + if (inode && (type & CODA_NOCACHE)) + coda_flag_inode(inode, C_VATTR | C_PURGE); + + return d_splice_alias(inode, entry); +} + + +int coda_permission(struct inode *inode, int mask, unsigned int flags) +{ + int error; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + + if (!mask) + return 0; + + if ((mask & 
MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + + if (coda_cache_check(inode, mask)) + return 0; + + error = venus_access(inode->i_sb, coda_i2f(inode), mask); + + if (!error) + coda_cache_enter(inode, mask); + + return error; +} + + +static inline void coda_dir_update_mtime(struct inode *dir) +{ +#ifdef REQUERY_VENUS_FOR_MTIME + /* invalidate the directory cnode's attributes so we refetch the + * attributes from venus next time the inode is referenced */ + coda_flag_inode(dir, C_VATTR); +#else + /* optimistically we can also act as if our nose bleeds. The + * granularity of the mtime is coarse anyways so we might actually be + * right most of the time. Note: we only do this for directories. */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; +#endif +} + +/* we have to wrap inc_nlink/drop_nlink because sometimes userspace uses a + * trick to fool GNU find's optimizations. If we can't be sure of the link + * (because of volume mount points) we set i_nlink to 1 which forces find + * to consider every child as a possible directory. We should also never + * see an increment or decrement for deleted directories where i_nlink == 0 */ +static inline void coda_dir_inc_nlink(struct inode *dir) +{ + if (dir->i_nlink >= 2) + inc_nlink(dir); +} + +static inline void coda_dir_drop_nlink(struct inode *dir) +{ + if (dir->i_nlink > 2) + drop_nlink(dir); +} + +/* creation routines: create, mknod, mkdir, link, symlink */ +static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) +{ + int error; + const char *name=de->d_name.name; + int length=de->d_name.len; + struct inode *inode; + struct CodaFid newfid; + struct coda_vattr attrs; + + if (coda_isroot(dir) && coda_iscontrol(name, length)) + return -EPERM; + + error = venus_create(dir->i_sb, coda_i2f(dir), name, length, + 0, mode, &newfid, &attrs); + if (error) + goto err_out; + + inode = coda_iget(dir->i_sb, &newfid, &attrs); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; + } + + /* invalidate the directory cnode's attributes */ + coda_dir_update_mtime(dir); + d_instantiate(de, inode); + return 0; +err_out: + d_drop(de); + return error; +} + +static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) +{ + struct inode *inode; + struct coda_vattr attrs; + const char *name = de->d_name.name; + int len = de->d_name.len; + int error; + struct CodaFid newfid; + + if (coda_isroot(dir) && coda_iscontrol(name, len)) + return -EPERM; + + attrs.va_mode = mode; + error = venus_mkdir(dir->i_sb, coda_i2f(dir), + name, len, &newfid, &attrs); + if (error) + goto err_out; + + inode = coda_iget(dir->i_sb, &newfid, &attrs); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; + } + + /* invalidate the directory cnode's attributes */ + coda_dir_inc_nlink(dir); + coda_dir_update_mtime(dir); + d_instantiate(de, inode); + return 0; +err_out: + d_drop(de); + return error; +} + +/* try to make de an entry in dir_inodde linked to source_de */ +static int coda_link(struct dentry *source_de, struct inode *dir_inode, + struct dentry *de) +{ + struct inode *inode = source_de->d_inode; + const char * name = de->d_name.name; + int len = de->d_name.len; + int error; + + if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) + return -EPERM; + + error = venus_link(dir_inode->i_sb, coda_i2f(inode), + coda_i2f(dir_inode), (const char *)name, len); + if (error) { + d_drop(de); + return error; + } + + coda_dir_update_mtime(dir_inode); + ihold(inode); + d_instantiate(de, inode); + inc_nlink(inode); + return 0; 
+} + + +static int coda_symlink(struct inode *dir_inode, struct dentry *de, + const char *symname) +{ + const char *name = de->d_name.name; + int len = de->d_name.len; + int symlen; + int error; + + if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) + return -EPERM; + + symlen = strlen(symname); + if (symlen > CODA_MAXPATHLEN) + return -ENAMETOOLONG; + + /* + * This entry is now negative. Since we do not create + * an inode for the entry we have to drop it. + */ + d_drop(de); + error = venus_symlink(dir_inode->i_sb, coda_i2f(dir_inode), name, len, + symname, symlen); + + /* mtime is no good anymore */ + if (!error) + coda_dir_update_mtime(dir_inode); + + return error; +} + +/* destruction routines: unlink, rmdir */ +static int coda_unlink(struct inode *dir, struct dentry *de) +{ + int error; + const char *name = de->d_name.name; + int len = de->d_name.len; + + error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); + if (error) + return error; + + coda_dir_update_mtime(dir); + drop_nlink(de->d_inode); + return 0; +} + +static int coda_rmdir(struct inode *dir, struct dentry *de) +{ + const char *name = de->d_name.name; + int len = de->d_name.len; + int error; + + error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); + if (!error) { + /* VFS may delete the child */ + if (de->d_inode) + de->d_inode->i_nlink = 0; + + /* fix the link count of the parent */ + coda_dir_drop_nlink(dir); + coda_dir_update_mtime(dir); + } + return error; +} + +/* rename */ +static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + const char *old_name = old_dentry->d_name.name; + const char *new_name = new_dentry->d_name.name; + int old_length = old_dentry->d_name.len; + int new_length = new_dentry->d_name.len; + int error; + + error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), + coda_i2f(new_dir), old_length, new_length, + (const char *) old_name, (const char *)new_name); + if (!error) { + if (new_dentry->d_inode) { + if (S_ISDIR(new_dentry->d_inode->i_mode)) { + coda_dir_drop_nlink(old_dir); + coda_dir_inc_nlink(new_dir); + } + coda_dir_update_mtime(old_dir); + coda_dir_update_mtime(new_dir); + coda_flag_inode(new_dentry->d_inode, C_VATTR); + } else { + coda_flag_inode(old_dir, C_VATTR); + coda_flag_inode(new_dir, C_VATTR); + } + } + return error; +} + + +/* file operations for directories */ +static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) +{ + struct coda_file_info *cfi; + struct file *host_file; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op) + return -ENOTDIR; + + if (host_file->f_op->readdir) + { + /* potemkin case: we were handed a directory inode. + * We can't use vfs_readdir because we have to keep the file + * position in sync between the coda_file and the host_file. + * and as such we need grab the inode mutex. 
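 * When the container file has no readdir operation we take the else
 * branch below and parse Venus's own dirent records straight out of
 * it instead.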
*/ + struct inode *host_inode = host_file->f_path.dentry->d_inode; + + mutex_lock(&host_inode->i_mutex); + host_file->f_pos = coda_file->f_pos; + + ret = -ENOENT; + if (!IS_DEADDIR(host_inode)) { + ret = host_file->f_op->readdir(host_file, buf, filldir); + file_accessed(host_file); + } + + coda_file->f_pos = host_file->f_pos; + mutex_unlock(&host_inode->i_mutex); + } + else /* Venus: we must read Venus dirents from a file */ + ret = coda_venus_readdir(coda_file, buf, filldir); + + return ret; +} + +static inline unsigned int CDT2DT(unsigned char cdt) +{ + unsigned int dt; + + switch(cdt) { + case CDT_UNKNOWN: dt = DT_UNKNOWN; break; + case CDT_FIFO: dt = DT_FIFO; break; + case CDT_CHR: dt = DT_CHR; break; + case CDT_DIR: dt = DT_DIR; break; + case CDT_BLK: dt = DT_BLK; break; + case CDT_REG: dt = DT_REG; break; + case CDT_LNK: dt = DT_LNK; break; + case CDT_SOCK: dt = DT_SOCK; break; + case CDT_WHT: dt = DT_WHT; break; + default: dt = DT_UNKNOWN; break; + } + return dt; +} + +/* support routines */ +static int coda_venus_readdir(struct file *coda_file, void *buf, + filldir_t filldir) +{ + int result = 0; /* # of entries returned */ + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct file *host_file; + struct dentry *de; + struct venus_dirent *vdir; + unsigned long vdir_size = + (unsigned long)(&((struct venus_dirent *)0)->d_name); + unsigned int type; + struct qstr name; + ino_t ino; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + de = coda_file->f_path.dentry; + cii = ITOC(de->d_inode); + + vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); + if (!vdir) return -ENOMEM; + + if (coda_file->f_pos == 0) { + ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); + if (ret < 0) + goto out; + result++; + coda_file->f_pos++; + } + if (coda_file->f_pos == 1) { + ret = filldir(buf, "..", 2, 1, de->d_parent->d_inode->i_ino, DT_DIR); + if (ret < 0) + goto out; + result++; + coda_file->f_pos++; + } + while (1) { + /* read entries from the directory file */ + ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, + sizeof(*vdir)); + if (ret < 0) { + printk(KERN_ERR "coda readdir: read dir %s failed %d\n", + coda_f2s(&cii->c_fid), ret); + break; + } + if (ret == 0) break; /* end of directory file reached */ + + /* catch truncated reads */ + if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) { + printk(KERN_ERR "coda readdir: short read on %s\n", + coda_f2s(&cii->c_fid)); + ret = -EBADF; + break; + } + /* validate whether the directory file actually makes sense */ + if (vdir->d_reclen < vdir_size + vdir->d_namlen) { + printk(KERN_ERR "coda readdir: invalid dir %s\n", + coda_f2s(&cii->c_fid)); + ret = -EBADF; + break; + } + + name.len = vdir->d_namlen; + name.name = vdir->d_name; + + /* Make sure we skip '.' and '..', we already got those */ + if (name.name[0] == '.' && (name.len == 1 || + (vdir->d_name[1] == '.' && name.len == 2))) + vdir->d_fileno = name.len = 0; + + /* skip null entries */ + if (vdir->d_fileno && name.len) { + /* try to look up this entry in the dcache, that way + * userspace doesn't have to worry about breaking + * getcwd by having mismatched inode numbers for + * internal volume mountpoints. 
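 * If the name is not in the dcache we simply fall back to the file
 * number Venus stored in the dirent.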
*/ + ino = find_inode_number(de, &name); + if (!ino) ino = vdir->d_fileno; + + type = CDT2DT(vdir->d_type); + ret = filldir(buf, name.name, name.len, + coda_file->f_pos, ino, type); + /* failure means no space for filling in this round */ + if (ret < 0) break; + result++; + } + /* we'll always have progress because d_reclen is unsigned and + * we've already established it is non-zero. */ + coda_file->f_pos += vdir->d_reclen; + } +out: + kfree(vdir); + return result ? result : ret; +} + +/* called when a cache lookup succeeds */ +static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) +{ + struct inode *inode; + struct coda_inode_info *cii; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = de->d_inode; + if (!inode || coda_isroot(inode)) + goto out; + if (is_bad_inode(inode)) + goto bad; + + cii = ITOC(de->d_inode); + if (!(cii->c_flags & (C_PURGE | C_FLUSH))) + goto out; + + shrink_dcache_parent(de); + + /* propagate for a flush */ + if (cii->c_flags & C_FLUSH) + coda_flag_inode_children(inode, C_FLUSH); + + if (de->d_count > 1) + /* pretend it's valid, but don't change the flags */ + goto out; + + /* clear the flags. */ + spin_lock(&cii->c_lock); + cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); +bad: + return 0; +out: + return 1; +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + */ +static int coda_dentry_delete(const struct dentry * dentry) +{ + int flags; + + if (!dentry->d_inode) + return 0; + + flags = (ITOC(dentry->d_inode)->c_flags) & C_PURGE; + if (is_bad_inode(dentry->d_inode) || flags) { + return 1; + } + return 0; +} + + + +/* + * This is called when we want to check if the inode has + * changed on the server. Coda makes this easy since the + * cache manager Venus issues a downcall to the kernel when this + * happens + */ +int coda_revalidate_inode(struct dentry *dentry) +{ + struct coda_vattr attr; + int error; + int old_mode; + ino_t old_ino; + struct inode *inode = dentry->d_inode; + struct coda_inode_info *cii = ITOC(inode); + + if (!cii->c_flags) + return 0; + + if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { + error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); + if (error) + return -EIO; + + /* this inode may be lost if: + - it's ino changed + - type changes must be permitted for repair and + missing mount points. + */ + old_mode = inode->i_mode; + old_ino = inode->i_ino; + coda_vattr_to_iattr(inode, &attr); + + if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { + printk("Coda: inode %ld, fid %s changed type!\n", + inode->i_ino, coda_f2s(&(cii->c_fid))); + } + + /* the following can happen when a local fid is replaced + with a global one, here we lose and declare the inode bad */ + if (inode->i_ino != old_ino) + return -EIO; + + coda_flag_inode_children(inode, C_FLUSH); + + spin_lock(&cii->c_lock); + cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); + } + return 0; +} diff --git a/fs/coda/file.c b/fs/coda/file.c new file mode 100644 index 00000000..0433057b --- /dev/null +++ b/fs/coda/file.c @@ -0,0 +1,234 @@ +/* + * File operations for Coda. + * Original version: (C) 1996 Peter Braam + * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project. Contact Peter Braam . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" +#include "coda_int.h" + +static ssize_t +coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *ppos) +{ + struct coda_file_info *cfi; + struct file *host_file; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op || !host_file->f_op->read) + return -EINVAL; + + return host_file->f_op->read(host_file, buf, count, ppos); +} + +static ssize_t +coda_file_splice_read(struct file *coda_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + struct coda_file_info *cfi; + struct file *host_file; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(host_file, ppos, pipe, count, flags); +} + +static ssize_t +coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode; + struct coda_file_info *cfi; + struct file *host_file; + ssize_t ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op || !host_file->f_op->write) + return -EINVAL; + + host_inode = host_file->f_path.dentry->d_inode; + mutex_lock(&coda_inode->i_mutex); + + ret = host_file->f_op->write(host_file, buf, count, ppos); + + coda_inode->i_size = host_inode->i_size; + coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; + coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&coda_inode->i_mutex); + + return ret; +} + +static int +coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) +{ + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct file *host_file; + struct inode *coda_inode, *host_inode; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op || !host_file->f_op->mmap) + return -ENODEV; + + coda_inode = coda_file->f_path.dentry->d_inode; + host_inode = host_file->f_path.dentry->d_inode; + + cii = ITOC(coda_inode); + spin_lock(&cii->c_lock); + coda_file->f_mapping = host_file->f_mapping; + if (coda_inode->i_mapping == &coda_inode->i_data) + coda_inode->i_mapping = host_inode->i_mapping; + + /* only allow additional mmaps as long as userspace isn't changing + * the container file on us! 
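 * If this inode is already borrowing a mapping from a different
 * container file, refuse with -EBUSY rather than mixing page caches.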
*/ + else if (coda_inode->i_mapping != host_inode->i_mapping) { + spin_unlock(&cii->c_lock); + return -EBUSY; + } + + /* keep track of how often the coda_inode/host_file has been mmapped */ + cii->c_mapcount++; + cfi->cfi_mapcount++; + spin_unlock(&cii->c_lock); + + return host_file->f_op->mmap(host_file, vma); +} + +int coda_open(struct inode *coda_inode, struct file *coda_file) +{ + struct file *host_file = NULL; + int error; + unsigned short flags = coda_file->f_flags & (~O_EXCL); + unsigned short coda_flags = coda_flags_to_cflags(flags); + struct coda_file_info *cfi; + + cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL); + if (!cfi) + return -ENOMEM; + + error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, + &host_file); + if (!host_file) + error = -EIO; + + if (error) { + kfree(cfi); + return error; + } + + host_file->f_flags |= coda_file->f_flags & (O_APPEND | O_SYNC); + + cfi->cfi_magic = CODA_MAGIC; + cfi->cfi_mapcount = 0; + cfi->cfi_container = host_file; + + BUG_ON(coda_file->private_data != NULL); + coda_file->private_data = cfi; + return 0; +} + +int coda_release(struct inode *coda_inode, struct file *coda_file) +{ + unsigned short flags = (coda_file->f_flags) & (~O_EXCL); + unsigned short coda_flags = coda_flags_to_cflags(flags); + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct inode *host_inode; + int err; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + + err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), + coda_flags, coda_file->f_cred->fsuid); + + host_inode = cfi->cfi_container->f_path.dentry->d_inode; + cii = ITOC(coda_inode); + + /* did we mmap this file? */ + spin_lock(&cii->c_lock); + if (coda_inode->i_mapping == &host_inode->i_data) { + cii->c_mapcount -= cfi->cfi_mapcount; + if (!cii->c_mapcount) + coda_inode->i_mapping = &coda_inode->i_data; + } + spin_unlock(&cii->c_lock); + + fput(cfi->cfi_container); + kfree(coda_file->private_data); + coda_file->private_data = NULL; + + /* VFS fput ignores the return value from file_operations->release, so + * there is no use returning an error here */ + return 0; +} + +int coda_fsync(struct file *coda_file, int datasync) +{ + struct file *host_file; + struct inode *coda_inode = coda_file->f_path.dentry->d_inode; + struct coda_file_info *cfi; + int err; + + if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || + S_ISLNK(coda_inode->i_mode))) + return -EINVAL; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + err = vfs_fsync(host_file, datasync); + if (!err && !datasync) + err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); + + return err; +} + +const struct file_operations coda_file_operations = { + .llseek = generic_file_llseek, + .read = coda_file_read, + .write = coda_file_write, + .mmap = coda_file_mmap, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, + .splice_read = coda_file_splice_read, +}; + diff --git a/fs/coda/inode.c b/fs/coda/inode.c new file mode 100644 index 00000000..871b2771 --- /dev/null +++ b/fs/coda/inode.c @@ -0,0 +1,329 @@ +/* + * Super block/filesystem wide operations + * + * Copyright (C) 1996 Peter J. Braam and + * Michael Callahan + * + * Rewritten for Linux 2.1. 
Peter Braam + * Copyright (C) Carnegie Mellon University + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +/* VFS super_block ops */ +static void coda_evict_inode(struct inode *); +static void coda_put_super(struct super_block *); +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); + +static struct kmem_cache * coda_inode_cachep; + +static struct inode *coda_alloc_inode(struct super_block *sb) +{ + struct coda_inode_info *ei; + ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + memset(&ei->c_fid, 0, sizeof(struct CodaFid)); + ei->c_flags = 0; + ei->c_uid = 0; + ei->c_cached_perm = 0; + spin_lock_init(&ei->c_lock); + return &ei->vfs_inode; +} + +static void coda_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(coda_inode_cachep, ITOC(inode)); +} + +static void coda_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, coda_i_callback); +} + +static void init_once(void *foo) +{ + struct coda_inode_info *ei = (struct coda_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +int coda_init_inodecache(void) +{ + coda_inode_cachep = kmem_cache_create("coda_inode_cache", + sizeof(struct coda_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (coda_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +void coda_destroy_inodecache(void) +{ + kmem_cache_destroy(coda_inode_cachep); +} + +static int coda_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NOATIME; + return 0; +} + +/* exported operations */ +static const struct super_operations coda_super_operations = +{ + .alloc_inode = coda_alloc_inode, + .destroy_inode = coda_destroy_inode, + .evict_inode = coda_evict_inode, + .put_super = coda_put_super, + .statfs = coda_statfs, + .remount_fs = coda_remount, +}; + +static int get_device_index(struct coda_mount_data *data) +{ + struct file *file; + struct inode *inode; + int idx; + + if(data == NULL) { + printk("coda_read_super: Bad mount data\n"); + return -1; + } + + if(data->version != CODA_MOUNT_VERSION) { + printk("coda_read_super: Bad mount version\n"); + return -1; + } + + file = fget(data->fd); + inode = NULL; + if(file) + inode = file->f_path.dentry->d_inode; + + if(!inode || !S_ISCHR(inode->i_mode) || + imajor(inode) != CODA_PSDEV_MAJOR) { + if(file) + fput(file); + + printk("coda_read_super: Bad file\n"); + return -1; + } + + idx = iminor(inode); + fput(file); + + if(idx < 0 || idx >= MAX_CODADEVS) { + printk("coda_read_super: Bad minor number\n"); + return -1; + } + + return idx; +} + +static int coda_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root = NULL; + struct venus_comm *vc; + struct CodaFid fid; + int error; + int idx; + + idx = get_device_index((struct coda_mount_data *) data); + + /* Ignore errors in data, for backward compatibility */ + if(idx == -1) + idx = 0; + + printk(KERN_INFO "coda_read_super: device index: %i\n", idx); + + vc = &coda_comms[idx]; + mutex_lock(&vc->vc_mutex); + + if (!vc->vc_inuse) { + printk("coda_read_super: No pseudo device\n"); + error = -EINVAL; + goto unlock_out; + } + + if (vc->vc_sb) { + printk("coda_read_super: Device already mounted\n"); + error = -EBUSY; + goto unlock_out; + 
} + + error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + if (error) + goto unlock_out; + + vc->vc_sb = sb; + mutex_unlock(&vc->vc_mutex); + + sb->s_fs_info = vc; + sb->s_flags |= MS_NOATIME; + sb->s_blocksize = 4096; /* XXXXX what do we put here?? */ + sb->s_blocksize_bits = 12; + sb->s_magic = CODA_SUPER_MAGIC; + sb->s_op = &coda_super_operations; + sb->s_d_op = &coda_dentry_operations; + sb->s_bdi = &vc->bdi; + + /* get root fid from Venus: this needs the root inode */ + error = venus_rootfid(sb, &fid); + if ( error ) { + printk("coda_read_super: coda_get_rootfid failed with %d\n", + error); + goto error; + } + printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); + + /* make root inode */ + error = coda_cnode_make(&root, &fid, sb); + if ( error || !root ) { + printk("Failure of coda_cnode_make for root: error %d\n", error); + goto error; + } + + printk("coda_read_super: rootinode is %ld dev %s\n", + root->i_ino, root->i_sb->s_id); + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + error = -EINVAL; + goto error; + } + return 0; + +error: + if (root) + iput(root); + + mutex_lock(&vc->vc_mutex); + bdi_destroy(&vc->bdi); + vc->vc_sb = NULL; + sb->s_fs_info = NULL; +unlock_out: + mutex_unlock(&vc->vc_mutex); + return error; +} + +static void coda_put_super(struct super_block *sb) +{ + struct venus_comm *vcp = coda_vcp(sb); + mutex_lock(&vcp->vc_mutex); + bdi_destroy(&vcp->bdi); + vcp->vc_sb = NULL; + sb->s_fs_info = NULL; + mutex_unlock(&vcp->vc_mutex); + + printk("Coda: Bye bye.\n"); +} + +static void coda_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + coda_cache_clear_inode(inode); +} + +int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + int err = coda_revalidate_inode(dentry); + if (!err) + generic_fillattr(dentry->d_inode, stat); + return err; +} + +int coda_setattr(struct dentry *de, struct iattr *iattr) +{ + struct inode *inode = de->d_inode; + struct coda_vattr vattr; + int error; + + memset(&vattr, 0, sizeof(vattr)); + + inode->i_ctime = CURRENT_TIME_SEC; + coda_iattr_to_vattr(iattr, &vattr); + vattr.va_type = C_VNON; /* cannot set type */ + + /* Venus is responsible for truncating the container-file!!! 
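+ * No truncation is performed by the kernel here; the requested
+ * attributes are simply forwarded to Venus and, on success, copied
+ * back into the cached inode attributes below.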
*/ + error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); + + if (!error) { + coda_vattr_to_iattr(inode, &vattr); + coda_cache_clear_inode(inode); + } + return error; +} + +const struct inode_operations coda_file_inode_operations = { + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int error; + + error = venus_statfs(dentry, buf); + + if (error) { + /* fake something like AFS does */ + buf->f_blocks = 9000000; + buf->f_bfree = 9000000; + buf->f_bavail = 9000000; + buf->f_files = 9000000; + buf->f_ffree = 9000000; + } + + /* and fill in the rest */ + buf->f_type = CODA_SUPER_MAGIC; + buf->f_bsize = 4096; + buf->f_namelen = CODA_MAXNAMLEN; + + return 0; +} + +/* init_coda: used by filesystems.c to register coda */ + +static struct dentry *coda_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, coda_fill_super); +} + +struct file_system_type coda_fs_type = { + .owner = THIS_MODULE, + .name = "coda", + .mount = coda_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; + diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c new file mode 100644 index 00000000..cb140ef2 --- /dev/null +++ b/fs/coda/pioctl.c @@ -0,0 +1,90 @@ +/* + * Pioctl operations for Coda. + * Original version: (C) 1996 Peter Braam + * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +/* pioctl ops */ +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data); + +/* exported from this file */ +const struct inode_operations coda_ioctl_inode_operations = { + .permission = coda_ioctl_permission, + .setattr = coda_setattr, +}; + +const struct file_operations coda_ioctl_operations = { + .owner = THIS_MODULE, + .unlocked_ioctl = coda_pioctl, + .llseek = noop_llseek, +}; + +/* the coda pioctl inode ops */ +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) +{ + return (mask & MAY_EXEC) ? -EACCES : 0; +} + +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data) +{ + struct path path; + int error; + struct PioctlData data; + struct inode *inode = filp->f_dentry->d_inode; + struct inode *target_inode = NULL; + struct coda_inode_info *cnp; + + /* get the Pioctl data arguments from user space */ + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) + return -EINVAL; + + /* + * Look up the pathname. 
Note that the pathname is in + * user memory, and namei takes care of this + */ + if (data.follow) + error = user_path(data.path, &path); + else + error = user_lpath(data.path, &path); + + if (error) + return error; + + target_inode = path.dentry->d_inode; + + /* return if it is not a Coda inode */ + if (target_inode->i_sb != inode->i_sb) { + error = -EINVAL; + goto out; + } + + /* now proceed to make the upcall */ + cnp = ITOC(target_inode); + + error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); +out: + path_put(&path); + return error; +} diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c new file mode 100644 index 00000000..8f616e0e --- /dev/null +++ b/fs/coda/psdev.c @@ -0,0 +1,430 @@ +/* + * An implementation of a loadable kernel mode driver providing + * multiple kernel/user space bidirectional communications links. + * + * Author: Alan Cox + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Adapted to become the Linux 2.0 Coda pseudo device + * Peter Braam + * Michael Callahan + * + * Changes for Linux 2.1 + * Copyright (c) 1997 Carnegie-Mellon University + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +#include "coda_int.h" + +/* statistics */ +int coda_hard; /* allows signals during upcalls */ +unsigned long coda_timeout = 30; /* .. secs, then signals will dequeue */ + + +struct venus_comm coda_comms[MAX_CODADEVS]; +static struct class *coda_psdev_class; + +/* + * Device operations + */ + +static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + unsigned int mask = POLLOUT | POLLWRNORM; + + poll_wait(file, &vcp->vc_waitq, wait); + mutex_lock(&vcp->vc_mutex); + if (!list_empty(&vcp->vc_pending)) + mask |= POLLIN | POLLRDNORM; + mutex_unlock(&vcp->vc_mutex); + + return mask; +} + +static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg) +{ + unsigned int data; + + switch(cmd) { + case CIOC_KERNEL_VERSION: + data = CODA_KERNEL_VERSION; + return put_user(data, (int __user *) arg); + default: + return -ENOTTY; + } + + return 0; +} + +/* + * Receive a message written by Venus to the psdev + */ + +static ssize_t coda_psdev_write(struct file *file, const char __user *buf, + size_t nbytes, loff_t *off) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req = NULL; + struct upc_req *tmp; + struct list_head *lh; + struct coda_in_hdr hdr; + ssize_t retval = 0, count = 0; + int error; + + /* Peek at the opcode, uniquefier */ + if (copy_from_user(&hdr, buf, 2 * sizeof(u_long))) + return -EFAULT; + + if (DOWNCALL(hdr.opcode)) { + union outputArgs *dcbuf; + int size = sizeof(*dcbuf); + + if ( nbytes < sizeof(struct coda_out_hdr) ) { + printk("coda_downcall opc %d uniq %d, not enough!\n", + hdr.opcode, hdr.unique); + count = nbytes; + goto out; + } + if ( nbytes > size ) { + printk("Coda: downcall opc %d, uniq %d, too much!", + hdr.opcode, hdr.unique); + nbytes = size; + } + CODA_ALLOC(dcbuf, union outputArgs *, nbytes); + if (copy_from_user(dcbuf, buf, nbytes)) { + 
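+ /* partial copy of the downcall from Venus: free the buffer
+ * and fail the write with -EFAULT */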
CODA_FREE(dcbuf, nbytes); + retval = -EFAULT; + goto out; + } + + /* what downcall errors does Venus handle ? */ + error = coda_downcall(vcp, hdr.opcode, dcbuf); + + CODA_FREE(dcbuf, nbytes); + if (error) { + printk("psdev_write: coda_downcall error: %d\n", error); + retval = error; + goto out; + } + count = nbytes; + goto out; + } + + /* Look for the message on the processing queue. */ + mutex_lock(&vcp->vc_mutex); + list_for_each(lh, &vcp->vc_processing) { + tmp = list_entry(lh, struct upc_req , uc_chain); + if (tmp->uc_unique == hdr.unique) { + req = tmp; + list_del(&req->uc_chain); + break; + } + } + mutex_unlock(&vcp->vc_mutex); + + if (!req) { + printk("psdev_write: msg (%d, %d) not found\n", + hdr.opcode, hdr.unique); + retval = -ESRCH; + goto out; + } + + /* move data into response buffer. */ + if (req->uc_outSize < nbytes) { + printk("psdev_write: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", + req->uc_outSize, (long)nbytes, hdr.opcode, hdr.unique); + nbytes = req->uc_outSize; /* don't have more space! */ + } + if (copy_from_user(req->uc_data, buf, nbytes)) { + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + retval = -EFAULT; + goto out; + } + + /* adjust outsize. is this useful ?? */ + req->uc_outSize = nbytes; + req->uc_flags |= CODA_REQ_WRITE; + count = nbytes; + + /* Convert filedescriptor into a file handle */ + if (req->uc_opcode == CODA_OPEN_BY_FD) { + struct coda_open_by_fd_out *outp = + (struct coda_open_by_fd_out *)req->uc_data; + if (!outp->oh.result) + outp->fh = fget(outp->fd); + } + + wake_up(&req->uc_sleep); +out: + return(count ? count : retval); +} + +/* + * Read a message from the kernel to Venus + */ + +static ssize_t coda_psdev_read(struct file * file, char __user * buf, + size_t nbytes, loff_t *off) +{ + DECLARE_WAITQUEUE(wait, current); + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req; + ssize_t retval = 0, count = 0; + + if (nbytes == 0) + return 0; + + mutex_lock(&vcp->vc_mutex); + + add_wait_queue(&vcp->vc_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + while (list_empty(&vcp->vc_pending)) { + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + break; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + mutex_unlock(&vcp->vc_mutex); + schedule(); + mutex_lock(&vcp->vc_mutex); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&vcp->vc_waitq, &wait); + + if (retval) + goto out; + + req = list_entry(vcp->vc_pending.next, struct upc_req,uc_chain); + list_del(&req->uc_chain); + + /* Move the input args into userspace */ + count = req->uc_inSize; + if (nbytes < req->uc_inSize) { + printk ("psdev_read: Venus read %ld bytes of %d in message\n", + (long)nbytes, req->uc_inSize); + count = nbytes; + } + + if (copy_to_user(buf, req->uc_data, count)) + retval = -EFAULT; + + /* If request was not a signal, enqueue and don't free */ + if (!(req->uc_flags & CODA_REQ_ASYNC)) { + req->uc_flags |= CODA_REQ_READ; + list_add_tail(&(req->uc_chain), &vcp->vc_processing); + goto out; + } + + CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kfree(req); +out: + mutex_unlock(&vcp->vc_mutex); + return (count ? 
count : retval); +} + +static int coda_psdev_open(struct inode * inode, struct file * file) +{ + struct venus_comm *vcp; + int idx, err; + + idx = iminor(inode); + if (idx < 0 || idx >= MAX_CODADEVS) + return -ENODEV; + + err = -EBUSY; + vcp = &coda_comms[idx]; + mutex_lock(&vcp->vc_mutex); + + if (!vcp->vc_inuse) { + vcp->vc_inuse++; + + INIT_LIST_HEAD(&vcp->vc_pending); + INIT_LIST_HEAD(&vcp->vc_processing); + init_waitqueue_head(&vcp->vc_waitq); + vcp->vc_sb = NULL; + vcp->vc_seq = 0; + + file->private_data = vcp; + err = 0; + } + + mutex_unlock(&vcp->vc_mutex); + return err; +} + + +static int coda_psdev_release(struct inode * inode, struct file * file) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req, *tmp; + + if (!vcp || !vcp->vc_inuse ) { + printk("psdev_release: Not open.\n"); + return -1; + } + + mutex_lock(&vcp->vc_mutex); + + /* Wakeup clients so they can return. */ + list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { + list_del(&req->uc_chain); + + /* Async requests need to be freed here */ + if (req->uc_flags & CODA_REQ_ASYNC) { + CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kfree(req); + continue; + } + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + } + + list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) { + list_del(&req->uc_chain); + + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + } + + file->private_data = NULL; + vcp->vc_inuse--; + mutex_unlock(&vcp->vc_mutex); + return 0; +} + + +static const struct file_operations coda_psdev_fops = { + .owner = THIS_MODULE, + .read = coda_psdev_read, + .write = coda_psdev_write, + .poll = coda_psdev_poll, + .unlocked_ioctl = coda_psdev_ioctl, + .open = coda_psdev_open, + .release = coda_psdev_release, + .llseek = noop_llseek, +}; + +static int init_coda_psdev(void) +{ + int i, err = 0; + if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) { + printk(KERN_ERR "coda_psdev: unable to get major %d\n", + CODA_PSDEV_MAJOR); + return -EIO; + } + coda_psdev_class = class_create(THIS_MODULE, "coda"); + if (IS_ERR(coda_psdev_class)) { + err = PTR_ERR(coda_psdev_class); + goto out_chrdev; + } + for (i = 0; i < MAX_CODADEVS; i++) { + mutex_init(&(&coda_comms[i])->vc_mutex); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); + } + coda_sysctl_init(); + goto out; + +out_chrdev: + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); +out: + return err; +} + +MODULE_AUTHOR("Jan Harkes, Peter J. 
Braam"); +MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); +MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); +MODULE_LICENSE("GPL"); +MODULE_VERSION("6.6"); + +static int __init init_coda(void) +{ + int status; + int i; + + status = coda_init_inodecache(); + if (status) + goto out2; + status = init_coda_psdev(); + if ( status ) { + printk("Problem (%d) in init_coda_psdev\n", status); + goto out1; + } + + status = register_filesystem(&coda_fs_type); + if (status) { + printk("coda: failed to register filesystem!\n"); + goto out; + } + return 0; +out: + for (i = 0; i < MAX_CODADEVS; i++) + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); + coda_sysctl_clean(); +out1: + coda_destroy_inodecache(); +out2: + return status; +} + +static void __exit exit_coda(void) +{ + int err, i; + + err = unregister_filesystem(&coda_fs_type); + if ( err != 0 ) { + printk("coda: failed to unregister filesystem\n"); + } + for (i = 0; i < MAX_CODADEVS; i++) + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); + coda_sysctl_clean(); + coda_destroy_inodecache(); +} + +module_init(init_coda); +module_exit(exit_coda); + diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c new file mode 100644 index 00000000..ab94ef63 --- /dev/null +++ b/fs/coda/symlink.c @@ -0,0 +1,50 @@ +/* + * Symlink inode operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +static int coda_symlink_filler(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error; + struct coda_inode_info *cii; + unsigned int len = PAGE_SIZE; + char *p = kmap(page); + + cii = ITOC(inode); + + error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); + if (error) + goto fail; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return error; +} + +const struct address_space_operations coda_symlink_aops = { + .readpage = coda_symlink_filler, +}; diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c new file mode 100644 index 00000000..af56ad56 --- /dev/null +++ b/fs/coda/sysctl.c @@ -0,0 +1,73 @@ +/* + * Sysctl operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). 
+ */ + +#include + +#include "coda_int.h" + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fs_table_header; + +static ctl_table coda_table[] = { + { + .procname = "timeout", + .data = &coda_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "hard", + .data = &coda_hard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "fake_statfs", + .data = &coda_fake_statfs, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec + }, + {} +}; + +static ctl_table fs_table[] = { + { + .procname = "coda", + .mode = 0555, + .child = coda_table + }, + {} +}; + +void coda_sysctl_init(void) +{ + if ( !fs_table_header ) + fs_table_header = register_sysctl_table(fs_table); +} + +void coda_sysctl_clean(void) +{ + if ( fs_table_header ) { + unregister_sysctl_table(fs_table_header); + fs_table_header = NULL; + } +} + +#else +void coda_sysctl_init(void) +{ +} + +void coda_sysctl_clean(void) +{ +} +#endif diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c new file mode 100644 index 00000000..9727e0c5 --- /dev/null +++ b/fs/coda/upcall.c @@ -0,0 +1,883 @@ +/* + * Mostly platform independent upcall operations to Venus: + * -- upcalls + * -- upcall routines + * + * Linux 2.0 version + * Copyright (C) 1996 Peter J. Braam , + * Michael Callahan + * + * Redone for Linux 2.1 + * Copyright (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this code to contribute + * improvements to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +static int coda_upcall(struct venus_comm *vc, int inSize, int *outSize, + union inputArgs *buffer); + +static void *alloc_upcall(int opcode, int size) +{ + union inputArgs *inp; + + CODA_ALLOC(inp, union inputArgs *, size); + if (!inp) + return ERR_PTR(-ENOMEM); + + inp->ih.opcode = opcode; + inp->ih.pid = current->pid; + inp->ih.pgid = task_pgrp_nr(current); + inp->ih.uid = current_fsuid(); + + return (void*)inp; +} + +#define UPARG(op)\ +do {\ + inp = (union inputArgs *)alloc_upcall(op, insize); \ + if (IS_ERR(inp)) { return PTR_ERR(inp); }\ + outp = (union outputArgs *)(inp); \ + outsize = insize; \ +} while (0) + +#define INSIZE(tag) sizeof(struct coda_ ## tag ## _in) +#define OUTSIZE(tag) sizeof(struct coda_ ## tag ## _out) +#define SIZE(tag) max_t(unsigned int, INSIZE(tag), OUTSIZE(tag)) + + +/* the upcalls */ +int venus_rootfid(struct super_block *sb, struct CodaFid *fidp) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(root); + UPARG(CODA_ROOT); + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *fidp = outp->coda_root.VFid; + + CODA_FREE(inp, insize); + return error; +} + +int venus_getattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *attr) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(getattr); + UPARG(CODA_GETATTR); + inp->coda_getattr.VFid = *fid; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *attr = outp->coda_getattr.attr; + + CODA_FREE(inp, insize); + return error; +} + +int venus_setattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *vattr) +{ + union 
inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(setattr); + UPARG(CODA_SETATTR); + + inp->coda_setattr.VFid = *fid; + inp->coda_setattr.attr = *vattr; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_lookup(struct super_block *sb, struct CodaFid *fid, + const char *name, int length, int * type, + struct CodaFid *resfid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(lookup); + insize = max_t(unsigned int, offset + length +1, OUTSIZE(lookup)); + UPARG(CODA_LOOKUP); + + inp->coda_lookup.VFid = *fid; + inp->coda_lookup.name = offset; + inp->coda_lookup.flags = CLU_CASE_SENSITIVE; + /* send Venus a null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *resfid = outp->coda_lookup.VFid; + *type = outp->coda_lookup.vtype; + } + + CODA_FREE(inp, insize); + return error; +} + +int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, + vuid_t uid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(release); + UPARG(CODA_CLOSE); + + inp->ih.uid = uid; + inp->coda_close.VFid = *fid; + inp->coda_close.flags = flags; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_open(struct super_block *sb, struct CodaFid *fid, + int flags, struct file **fh) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(open_by_fd); + UPARG(CODA_OPEN_BY_FD); + + inp->coda_open_by_fd.VFid = *fid; + inp->coda_open_by_fd.flags = flags; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *fh = outp->coda_open_by_fd.fh; + + CODA_FREE(inp, insize); + return error; +} + +int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, + struct CodaFid *newfid, struct coda_vattr *attrs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(mkdir); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(mkdir)); + UPARG(CODA_MKDIR); + + inp->coda_mkdir.VFid = *dirfid; + inp->coda_mkdir.attr = *attrs; + inp->coda_mkdir.name = offset; + /* Venus must get null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *attrs = outp->coda_mkdir.attr; + *newfid = outp->coda_mkdir.VFid; + } + + CODA_FREE(inp, insize); + return error; +} + + +int venus_rename(struct super_block *sb, struct CodaFid *old_fid, + struct CodaFid *new_fid, size_t old_length, + size_t new_length, const char *old_name, + const char *new_name) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset, s; + + offset = INSIZE(rename); + insize = max_t(unsigned int, offset + new_length + old_length + 8, + OUTSIZE(rename)); + UPARG(CODA_RENAME); + + inp->coda_rename.sourceFid = *old_fid; + inp->coda_rename.destFid = *new_fid; + inp->coda_rename.srcname = offset; + + /* Venus must receive an null terminated string */ + s = ( old_length & ~0x3) +4; /* round up to word boundary */ + memcpy((char *)(inp) + offset, old_name, old_length); + *((char *)inp + offset + old_length) = '\0'; 
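+ /* s rounds old_length plus the terminating NUL up to the next 4-byte
+ * boundary (e.g. old_length == 5 gives s == 8); offset advances by s
+ * below so the destination name starts word aligned. */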
+ + /* another null terminated string for Venus */ + offset += s; + inp->coda_rename.destname = offset; + s = ( new_length & ~0x3) +4; /* round up to word boundary */ + memcpy((char *)(inp) + offset, new_name, new_length); + *((char *)inp + offset + new_length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_create(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, int excl, int mode, + struct CodaFid *newfid, struct coda_vattr *attrs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(create); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(create)); + UPARG(CODA_CREATE); + + inp->coda_create.VFid = *dirfid; + inp->coda_create.attr.va_mode = mode; + inp->coda_create.excl = excl; + inp->coda_create.mode = mode; + inp->coda_create.name = offset; + + /* Venus must get null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *attrs = outp->coda_create.attr; + *newfid = outp->coda_create.VFid; + } + + CODA_FREE(inp, insize); + return error; +} + +int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(rmdir); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(rmdir)); + UPARG(CODA_RMDIR); + + inp->coda_rmdir.VFid = *dirfid; + inp->coda_rmdir.name = offset; + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_remove(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length) +{ + union inputArgs *inp; + union outputArgs *outp; + int error=0, insize, outsize, offset; + + offset = INSIZE(remove); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(remove)); + UPARG(CODA_REMOVE); + + inp->coda_remove.VFid = *dirfid; + inp->coda_remove.name = offset; + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_readlink(struct super_block *sb, struct CodaFid *fid, + char *buffer, int *length) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int retlen; + char *result; + + insize = max_t(unsigned int, + INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); + UPARG(CODA_READLINK); + + inp->coda_readlink.VFid = *fid; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + retlen = outp->coda_readlink.count; + if ( retlen > *length ) + retlen = *length; + *length = retlen; + result = (char *)outp + (long)outp->coda_readlink.data; + memcpy(buffer, result, retlen); + *(buffer + retlen) = '\0'; + } + + CODA_FREE(inp, insize); + return error; +} + + + +int venus_link(struct super_block *sb, struct CodaFid *fid, + struct CodaFid *dirfid, const char *name, int len ) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(link); + insize = max_t(unsigned int, offset + len + 1, OUTSIZE(link)); + UPARG(CODA_LINK); + + inp->coda_link.sourceFid = *fid; + inp->coda_link.destFid = 
*dirfid; + inp->coda_link.tname = offset; + + /* make sure strings are null terminated */ + memcpy((char *)(inp) + offset, name, len); + *((char *)inp + offset + len) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_symlink(struct super_block *sb, struct CodaFid *fid, + const char *name, int len, + const char *symname, int symlen) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset, s; + + offset = INSIZE(symlink); + insize = max_t(unsigned int, offset + len + symlen + 8, OUTSIZE(symlink)); + UPARG(CODA_SYMLINK); + + /* inp->coda_symlink.attr = *tva; XXXXXX */ + inp->coda_symlink.VFid = *fid; + + /* Round up to word boundary and null terminate */ + inp->coda_symlink.srcname = offset; + s = ( symlen & ~0x3 ) + 4; + memcpy((char *)(inp) + offset, symname, symlen); + *((char *)inp + offset + symlen) = '\0'; + + /* Round up to word boundary and null terminate */ + offset += s; + inp->coda_symlink.tname = offset; + s = (len & ~0x3) + 4; + memcpy((char *)(inp) + offset, name, len); + *((char *)inp + offset + len) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_fsync(struct super_block *sb, struct CodaFid *fid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize=SIZE(fsync); + UPARG(CODA_FSYNC); + + inp->coda_fsync.VFid = *fid; + error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), + &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_access(struct super_block *sb, struct CodaFid *fid, int mask) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(access); + UPARG(CODA_ACCESS); + + inp->coda_access.VFid = *fid; + inp->coda_access.flags = mask; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + + +int venus_pioctl(struct super_block *sb, struct CodaFid *fid, + unsigned int cmd, struct PioctlData *data) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int iocsize; + + insize = VC_MAXMSGSIZE; + UPARG(CODA_IOCTL); + + /* build packet for Venus */ + if (data->vi.in_size > VC_MAXDATASIZE) { + error = -EINVAL; + goto exit; + } + + if (data->vi.out_size > VC_MAXDATASIZE) { + error = -EINVAL; + goto exit; + } + + inp->coda_ioctl.VFid = *fid; + + /* the cmd field was mutated by increasing its size field to + * reflect the path and follow args. We need to subtract that + * out before sending the command to Venus. */ + inp->coda_ioctl.cmd = (cmd & ~(PIOCPARM_MASK << 16)); + iocsize = ((cmd >> 16) & PIOCPARM_MASK) - sizeof(char *) - sizeof(int); + inp->coda_ioctl.cmd |= (iocsize & PIOCPARM_MASK) << 16; + + /* in->coda_ioctl.rwflag = flag; */ + inp->coda_ioctl.len = data->vi.in_size; + inp->coda_ioctl.data = (char *)(INSIZE(ioctl)); + + /* get the data out of user space */ + if ( copy_from_user((char*)inp + (long)inp->coda_ioctl.data, + data->vi.in, data->vi.in_size) ) { + error = -EINVAL; + goto exit; + } + + error = coda_upcall(coda_vcp(sb), SIZE(ioctl) + data->vi.in_size, + &outsize, inp); + + if (error) { + printk("coda_pioctl: Venus returns: %d for %s\n", + error, coda_f2s(fid)); + goto exit; + } + + if (outsize < (long)outp->coda_ioctl.data + outp->coda_ioctl.len) { + error = -EINVAL; + goto exit; + } + + /* Copy out the OUT buffer. 
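+ * Refuse the request if Venus returned more data than the caller's
+ * out buffer can hold, otherwise copy the reply back to user space.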
*/ + if (outp->coda_ioctl.len > data->vi.out_size) { + error = -EINVAL; + goto exit; + } + + /* Copy out the OUT buffer. */ + if (copy_to_user(data->vi.out, + (char *)outp + (long)outp->coda_ioctl.data, + outp->coda_ioctl.len)) { + error = -EFAULT; + goto exit; + } + + exit: + CODA_FREE(inp, insize); + return error; +} + +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs)); + UPARG(CODA_STATFS); + + error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp); + if (!error) { + sfs->f_blocks = outp->coda_statfs.stat.f_blocks; + sfs->f_bfree = outp->coda_statfs.stat.f_bfree; + sfs->f_bavail = outp->coda_statfs.stat.f_bavail; + sfs->f_files = outp->coda_statfs.stat.f_files; + sfs->f_ffree = outp->coda_statfs.stat.f_ffree; + } + + CODA_FREE(inp, insize); + return error; +} + +/* + * coda_upcall and coda_downcall routines. + */ +static void coda_block_signals(sigset_t *old) +{ + spin_lock_irq(¤t->sighand->siglock); + *old = current->blocked; + + sigfillset(¤t->blocked); + sigdelset(¤t->blocked, SIGKILL); + sigdelset(¤t->blocked, SIGSTOP); + sigdelset(¤t->blocked, SIGINT); + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + +static void coda_unblock_signals(sigset_t *old) +{ + spin_lock_irq(¤t->sighand->siglock); + current->blocked = *old; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + +/* Don't allow signals to interrupt the following upcalls before venus + * has seen them, + * - CODA_CLOSE or CODA_RELEASE upcall (to avoid reference count problems) + * - CODA_STORE (to avoid data loss) + */ +#define CODA_INTERRUPTIBLE(r) (!coda_hard && \ + (((r)->uc_opcode != CODA_CLOSE && \ + (r)->uc_opcode != CODA_STORE && \ + (r)->uc_opcode != CODA_RELEASE) || \ + (r)->uc_flags & CODA_REQ_READ)) + +static inline void coda_waitfor_upcall(struct venus_comm *vcp, + struct upc_req *req) +{ + DECLARE_WAITQUEUE(wait, current); + unsigned long timeout = jiffies + coda_timeout * HZ; + sigset_t old; + int blocked; + + coda_block_signals(&old); + blocked = 1; + + add_wait_queue(&req->uc_sleep, &wait); + for (;;) { + if (CODA_INTERRUPTIBLE(req)) + set_current_state(TASK_INTERRUPTIBLE); + else + set_current_state(TASK_UNINTERRUPTIBLE); + + /* got a reply */ + if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT)) + break; + + if (blocked && time_after(jiffies, timeout) && + CODA_INTERRUPTIBLE(req)) + { + coda_unblock_signals(&old); + blocked = 0; + } + + if (signal_pending(current)) { + list_del(&req->uc_chain); + break; + } + + mutex_unlock(&vcp->vc_mutex); + if (blocked) + schedule_timeout(HZ); + else + schedule(); + mutex_lock(&vcp->vc_mutex); + } + if (blocked) + coda_unblock_signals(&old); + + remove_wait_queue(&req->uc_sleep, &wait); + set_current_state(TASK_RUNNING); +} + + +/* + * coda_upcall will return an error in the case of + * failed communication with Venus _or_ will peek at Venus + * reply and return Venus' error. + * + * As venus has 2 types of errors, normal errors (positive) and internal + * errors (negative), normal errors are negated, while internal errors + * are all mapped to -EINTR, while showing a nice warning message. 
(jh) + */ +static int coda_upcall(struct venus_comm *vcp, + int inSize, int *outSize, + union inputArgs *buffer) +{ + union outputArgs *out; + union inputArgs *sig_inputArgs; + struct upc_req *req = NULL, *sig_req; + int error; + + mutex_lock(&vcp->vc_mutex); + + if (!vcp->vc_inuse) { + printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); + error = -ENXIO; + goto exit; + } + + /* Format the request message. */ + req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); + if (!req) { + error = -ENOMEM; + goto exit; + } + + req->uc_data = (void *)buffer; + req->uc_flags = 0; + req->uc_inSize = inSize; + req->uc_outSize = *outSize ? *outSize : inSize; + req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode; + req->uc_unique = ++vcp->vc_seq; + init_waitqueue_head(&req->uc_sleep); + + /* Fill in the common input args. */ + ((union inputArgs *)buffer)->ih.unique = req->uc_unique; + + /* Append msg to pending queue and poke Venus. */ + list_add_tail(&req->uc_chain, &vcp->vc_pending); + + wake_up_interruptible(&vcp->vc_waitq); + /* We can be interrupted while we wait for Venus to process + * our request. If the interrupt occurs before Venus has read + * the request, we dequeue and return. If it occurs after the + * read but before the reply, we dequeue, send a signal + * message, and return. If it occurs after the reply we ignore + * it. In no case do we want to restart the syscall. If it + * was interrupted by a venus shutdown (psdev_close), return + * ENODEV. */ + + /* Go to sleep. Wake up on signals only after the timeout. */ + coda_waitfor_upcall(vcp, req); + + /* Op went through, interrupt or not... */ + if (req->uc_flags & CODA_REQ_WRITE) { + out = (union outputArgs *)req->uc_data; + /* here we map positive Venus errors to kernel errors */ + error = -out->oh.result; + *outSize = req->uc_outSize; + goto exit; + } + + error = -EINTR; + if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { + printk(KERN_WARNING "coda: Unexpected interruption.\n"); + goto exit; + } + + /* Interrupted before venus read it. */ + if (!(req->uc_flags & CODA_REQ_READ)) + goto exit; + + /* Venus saw the upcall, make sure we can send interrupt signal */ + if (!vcp->vc_inuse) { + printk(KERN_INFO "coda: Venus dead, not sending signal.\n"); + goto exit; + } + + error = -ENOMEM; + sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); + if (!sig_req) goto exit; + + CODA_ALLOC((sig_req->uc_data), char *, sizeof(struct coda_in_hdr)); + if (!sig_req->uc_data) { + kfree(sig_req); + goto exit; + } + + error = -EINTR; + sig_inputArgs = (union inputArgs *)sig_req->uc_data; + sig_inputArgs->ih.opcode = CODA_SIGNAL; + sig_inputArgs->ih.unique = req->uc_unique; + + sig_req->uc_flags = CODA_REQ_ASYNC; + sig_req->uc_opcode = sig_inputArgs->ih.opcode; + sig_req->uc_unique = sig_inputArgs->ih.unique; + sig_req->uc_inSize = sizeof(struct coda_in_hdr); + sig_req->uc_outSize = sizeof(struct coda_in_hdr); + + /* insert at head of queue! */ + list_add(&(sig_req->uc_chain), &vcp->vc_pending); + wake_up_interruptible(&vcp->vc_waitq); + +exit: + kfree(req); + mutex_unlock(&vcp->vc_mutex); + return error; +} + +/* + The statements below are part of the Coda opportunistic + programming -- taken from the Mach/BSD kernel code for Coda. + You don't get correct semantics by stating what needs to be + done without guaranteeing the invariants needed for it to happen. + When will be have time to find out what exactly is going on? (pjb) +*/ + + +/* + * There are 7 cases where cache invalidations occur. 
The semantics + * of each is listed here: + * + * CODA_FLUSH -- flush all entries from the name cache and the cnode cache. + * CODA_PURGEUSER -- flush all entries from the name cache for a specific user + * This call is a result of token expiration. + * + * The next arise as the result of callbacks on a file or directory. + * CODA_ZAPFILE -- flush the cached attributes for a file. + + * CODA_ZAPDIR -- flush the attributes for the dir and + * force a new lookup for all the children + of this dir. + + * + * The next is a result of Venus detecting an inconsistent file. + * CODA_PURGEFID -- flush the attribute for the file + * purge it and its children from the dcache + * + * The last allows Venus to replace local fids with global ones + * during reintegration. + * + * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ + +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) +{ + struct inode *inode = NULL; + struct CodaFid *fid = NULL, *newfid; + struct super_block *sb; + + /* Handle invalidation requests. */ + mutex_lock(&vcp->vc_mutex); + sb = vcp->vc_sb; + if (!sb || !sb->s_root) + goto unlock_out; + + switch (opcode) { + case CODA_FLUSH: + coda_cache_clear_all(sb); + shrink_dcache_sb(sb); + if (sb->s_root->d_inode) + coda_flag_inode(sb->s_root->d_inode, C_FLUSH); + break; + + case CODA_PURGEUSER: + coda_cache_clear_all(sb); + break; + + case CODA_ZAPDIR: + fid = &out->coda_zapdir.CodaFid; + break; + + case CODA_ZAPFILE: + fid = &out->coda_zapfile.CodaFid; + break; + + case CODA_PURGEFID: + fid = &out->coda_purgefid.CodaFid; + break; + + case CODA_REPLACE: + fid = &out->coda_replace.OldFid; + break; + } + if (fid) + inode = coda_fid_to_inode(fid, sb); + +unlock_out: + mutex_unlock(&vcp->vc_mutex); + + if (!inode) + return 0; + + switch (opcode) { + case CODA_ZAPDIR: + coda_flag_inode_children(inode, C_PURGE); + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_ZAPFILE: + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_PURGEFID: + coda_flag_inode_children(inode, C_PURGE); + + /* catch the dentries later if some are still busy */ + coda_flag_inode(inode, C_PURGE); + d_prune_aliases(inode); + break; + + case CODA_REPLACE: + newfid = &out->coda_replace.NewFid; + coda_replace_fid(inode, fid, newfid); + break; + } + iput(inode); + return 0; +} + diff --git a/fs/compat.c b/fs/compat.c new file mode 100644 index 00000000..0ea00832 --- /dev/null +++ b/fs/compat.c @@ -0,0 +1,2061 @@ +/* + * linux/fs/compat.c + * + * Kernel compatibililty routines for e.g. 32 bit syscall support + * on 64 bit kernels. + * + * Copyright (C) 2002 Stephen Rothwell, IBM Corporation + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +int compat_log = 1; + +int compat_printk(const char *fmt, ...) 
+{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + +#include "read_write.h" + +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +asmlinkage long compat_sys_utime(const char __user *filename, + struct compat_utimbuf __user *t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); +} + +asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags) +{ + struct timespec tv[2]; + + if (t) { + if (get_compat_timespec(&tv[0], &t[0]) || + get_compat_timespec(&tv[1], &t[1])) + return -EFAULT; + + if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) + return 0; + } + return do_utimes(dfd, filename, t ? tv : NULL, flags); +} + +asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t[0].tv_sec) || + get_user(tv[0].tv_nsec, &t[0].tv_usec) || + get_user(tv[1].tv_sec, &t[1].tv_sec) || + get_user(tv[1].tv_nsec, &t[1].tv_usec)) + return -EFAULT; + if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || + tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) + return -EINVAL; + tv[0].tv_nsec *= 1000; + tv[1].tv_nsec *= 1000; + } + return do_utimes(dfd, filename, t ? tv : NULL, 0); +} + +asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t) +{ + return compat_sys_futimesat(AT_FDCWD, filename, t); +} + +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + compat_ino_t ino = stat->ino; + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + int err; + + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + + if ((u64) stat->size > MAX_NON_LFS || + !old_valid_dev(stat->dev) || + !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + return -EOVERFLOW; + + if (clear_user(ubuf, sizeof(*ubuf))) + return -EFAULT; + + err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); + err |= __put_user(ino, &ubuf->st_ino); + err |= __put_user(stat->mode, &ubuf->st_mode); + err |= __put_user(stat->nlink, &ubuf->st_nlink); + err |= __put_user(uid, &ubuf->st_uid); + err |= __put_user(gid, &ubuf->st_gid); + err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); + err |= __put_user(stat->size, &ubuf->st_size); + err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); + err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); + err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); + err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); + err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); + err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); + err |= __put_user(stat->blksize, &ubuf->st_blksize); + err |= __put_user(stat->blocks, &ubuf->st_blocks); + return err; +} + +asmlinkage long compat_sys_newstat(const char __user * filename, + struct compat_stat __user *statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +asmlinkage long compat_sys_newlstat(const char __user * filename, + struct compat_stat __user *statbuf) +{ + struct kstat 
stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +#ifndef __ARCH_WANT_STAT64 +asmlinkage long compat_sys_newfstatat(unsigned int dfd, + const char __user *filename, + struct compat_stat __user *statbuf, int flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} +#endif + +asmlinkage long compat_sys_newfstat(unsigned int fd, + struct compat_stat __user * statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} + +static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) +{ + + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(0, &ubuf->f_spare[0]) || + __put_user(0, &ubuf->f_spare[1]) || + __put_user(0, &ubuf->f_spare[2]) || + __put_user(0, &ubuf->f_spare[3]) || + __put_user(0, &ubuf->f_spare[4])) + return -EFAULT; + return 0; +} + +/* + * The following statfs calls are copies of code from fs/statfs.c and + * should be checked against those from time to time + */ +asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) +{ + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) +{ + struct kstatfs tmp; + int error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, 
&ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is that apporach is more maintainable + * than the various conversion hacks. + */ +asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) +{ + struct super_block *sb; + struct compat_ustat tmp; + struct kstatfs sbuf; + int err; + + sb = user_get_super(new_decode_dev(dev)); + if (!sb) + return -EINVAL; + err = statfs_by_dentry(sb->s_root, &sbuf); + drop_super(sb); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} + +static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 +static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, + unsigned long 
arg) +{ + mm_segment_t old_fs; + struct flock f; + long ret; + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + ret = get_compat_flock(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, cmd, (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK && ret == 0) { + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. + * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; + if (ret == 0) + ret = put_compat_flock(&f, compat_ptr(arg)); + } + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + ret = get_compat_flock64(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK : + ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW), + (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK64 && ret == 0) { + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; + if (ret == 0) + ret = put_compat_flock64(&f, compat_ptr(arg)); + } + break; + + default: + ret = sys_fcntl(fd, cmd, arg); + break; + } + return ret; +} + +asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64)) + return -EINVAL; + return compat_sys_fcntl64(fd, cmd, arg); +} + +asmlinkage long +compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p) +{ + long ret; + aio_context_t ctx64; + + mm_segment_t oldfs = get_fs(); + if (unlikely(get_user(ctx64, ctx32p))) + return -EFAULT; + + set_fs(KERNEL_DS); + /* The __user pointer cast is valid because of the set_fs() */ + ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); + set_fs(oldfs); + /* truncating is ok because it's a user address */ + if (!ret) + ret = put_user((u32) ctx64, ctx32p); + return ret; +} + +asmlinkage long +compat_sys_io_getevents(aio_context_t ctx_id, + unsigned long min_nr, + unsigned long nr, + struct io_event __user *events, + struct compat_timespec __user *timeout) +{ + long ret; + struct timespec t; + struct timespec __user *ut = NULL; + + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, events, + nr * sizeof(struct io_event)))) + goto out; + if (timeout) { + if (get_compat_timespec(&t, timeout)) + goto out; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t)) ) + goto out; + } + ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut); +out: + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? 
VERIFY_WRITE : VERIFY_READ) + +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV || nr_segs < 0) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) + goto out; + } + *ret_pointer = iov; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. + * + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. */ + goto out; + if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} + +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +asmlinkage long +compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} + +struct compat_ncp_mount_data { + compat_int_t version; + compat_uint_t ncp_fd; + __compat_uid_t mounted_uid; + compat_pid_t wdog_pid; + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + compat_uint_t time_out; + compat_uint_t retry_count; + compat_uint_t flags; + __compat_uid_t uid; + __compat_gid_t gid; + compat_mode_t file_mode; + compat_mode_t dir_mode; +}; + +struct compat_ncp_mount_data_v4 { + compat_int_t version; + compat_ulong_t flags; + compat_ulong_t mounted_uid; + compat_long_t wdog_pid; + compat_uint_t ncp_fd; + compat_uint_t time_out; + compat_uint_t retry_count; + compat_ulong_t uid; + compat_ulong_t gid; + compat_ulong_t file_mode; + compat_ulong_t dir_mode; +}; + +static void *do_ncp_super_data_conv(void *raw_data) +{ + int version = *(unsigned int *)raw_data; + + if (version == 3) { + struct compat_ncp_mount_data *c_n = raw_data; + struct ncp_mount_data *n = raw_data; + + n->dir_mode = c_n->dir_mode; + n->file_mode = c_n->file_mode; + n->gid = c_n->gid; + n->uid = c_n->uid; + memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int))); + 
n->wdog_pid = c_n->wdog_pid; + n->mounted_uid = c_n->mounted_uid; + } else if (version == 4) { + struct compat_ncp_mount_data_v4 *c_n = raw_data; + struct ncp_mount_data_v4 *n = raw_data; + + n->dir_mode = c_n->dir_mode; + n->file_mode = c_n->file_mode; + n->gid = c_n->gid; + n->uid = c_n->uid; + n->retry_count = c_n->retry_count; + n->time_out = c_n->time_out; + n->ncp_fd = c_n->ncp_fd; + n->wdog_pid = c_n->wdog_pid; + n->mounted_uid = c_n->mounted_uid; + n->flags = c_n->flags; + } else if (version != 5) { + return NULL; + } + + return raw_data; +} + + +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static int do_nfs4_super_data_conv(void *raw_data) +{ + int version = *(compat_uint_t *) raw_data; + + if (version == 1) { + struct compat_nfs4_mount_data_v1 *raw = raw_data; + struct nfs4_mount_data *real = raw_data; + + /* copy the fields backwards */ + real->auth_flavours = compat_ptr(raw->auth_flavours); + real->auth_flavourlen = raw->auth_flavourlen; + real->proto = raw->proto; + real->host_addr = compat_ptr(raw->host_addr); + real->host_addrlen = raw->host_addrlen; + compat_nfs_string(&real->hostname, &raw->hostname); + compat_nfs_string(&real->mnt_path, &raw->mnt_path); + compat_nfs_string(&real->client_addr, &raw->client_addr); + real->acdirmax = raw->acdirmax; + real->acdirmin = raw->acdirmin; + real->acregmax = raw->acregmax; + real->acregmin = raw->acregmin; + real->retrans = raw->retrans; + real->timeo = raw->timeo; + real->wsize = raw->wsize; + real->rsize = raw->rsize; + real->flags = raw->flags; + real->version = raw->version; + } + + return 0; +} + +#define NCPFS_NAME "ncpfs" +#define NFS4_NAME "nfs4" + +asmlinkage long compat_sys_mount(const char __user * dev_name, + const char __user * dir_name, + const char __user * type, unsigned long flags, + const void __user * data) +{ + char *kernel_type; + unsigned long data_page; + char *kernel_dev; + char *dir_page; + int retval; + + retval = copy_mount_string(type, &kernel_type); + if (retval < 0) + goto out; + + dir_page = getname(dir_name); + retval = PTR_ERR(dir_page); + if (IS_ERR(dir_page)) + goto out1; + + retval = copy_mount_string(dev_name, &kernel_dev); + if (retval < 0) + goto out2; + + retval = copy_mount_options(data, &data_page); + if (retval < 0) + goto out3; + + retval = -EINVAL; + + if (kernel_type && data_page) { + if (!strcmp(kernel_type, NCPFS_NAME)) { + do_ncp_super_data_conv((void *)data_page); + } else if (!strcmp(kernel_type, NFS4_NAME)) { + if (do_nfs4_super_data_conv((void *) data_page)) + goto out4; + } + } + + retval = do_mount(kernel_dev, dir_page, kernel_type, + flags, (void*)data_page); + + out4: + free_page(data_page); + out3: + kfree(kernel_dev); + out2: + putname(dir_page); + out1: + kfree(kernel_type); + out: + return retval; +} + +struct compat_old_linux_dirent { + compat_ulong_t 
d_ino; + compat_ulong_t d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct compat_readdir_callback { + struct compat_old_linux_dirent __user *dirent; + int result; +}; + +static int compat_fillonedir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_readdir_callback *buf = __buf; + struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +asmlinkage long compat_sys_old_readdir(unsigned int fd, + struct compat_old_linux_dirent __user *dirent, unsigned int count) +{ + int error; + struct file *file; + struct compat_readdir_callback buf; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.result = 0; + buf.dirent = dirent; + + error = vfs_readdir(file, compat_fillonedir, &buf); + if (buf.result) + error = buf.result; + + fput(file); +out: + return error; +} + +struct compat_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct compat_getdents_callback { + struct compat_linux_dirent __user *current_dir; + struct compat_linux_dirent __user *previous; + int count; + int error; +}; + +static int compat_filldir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_linux_dirent __user * dirent; + struct compat_getdents_callback *buf = __buf; + compat_ulong_t d_ino; + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + + namlen + 2, sizeof(compat_long_t)); + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +asmlinkage long compat_sys_getdents(unsigned int fd, + struct compat_linux_dirent __user *dirent, unsigned int count) +{ + struct file * file; + struct compat_linux_dirent __user * lastdirent; + struct compat_getdents_callback buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, compat_filldir, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(file->f_pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} + +#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 + +struct compat_getdents_callback64 { + struct linux_dirent64 __user *current_dir; + struct linux_dirent64 __user *previous; + int count; + int error; +}; + +static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct linux_dirent64 __user *dirent; + struct compat_getdents_callback64 *buf = __buf; + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); + u64 off; + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + + if (dirent) { + if (__put_user_unaligned(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user_unaligned(ino, &dirent->d_ino)) + goto efault; + off = 0; + if (__put_user_unaligned(off, &dirent->d_off)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (__put_user(d_type, &dirent->d_type)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +asmlinkage long compat_sys_getdents64(unsigned int fd, + struct linux_dirent64 __user * dirent, unsigned int count) +{ + struct file * file; + struct linux_dirent64 __user * lastdirent; + struct compat_getdents_callback64 buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, compat_filldir64, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + typeof(lastdirent->d_off) d_off = file->f_pos; + if (__put_user_unaligned(d_off, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} +#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ + +static ssize_t compat_do_readv_writev(int type, struct file *file, + const struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + ret = -EINVAL; + if (!file->f_op) + goto out; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); + if (tot_len == 0) { + ret = 0; + goto out; + } + + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +asmlinkage ssize_t +compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) +{ + struct file *file; + int fput_needed; + ssize_t ret; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_readv(file, vec, vlen, 
&file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + return ret; +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +asmlinkage ssize_t +compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) +{ + struct file *file; + int fput_needed; + ssize_t ret; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage long +compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, + unsigned int nr_segs, unsigned int flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} + +/* + * Exactly like fs/open.c:sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open(const char __user *filename, int flags, int mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like fs/open.c:sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. 
+ */ +asmlinkage long +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) +{ + return do_sys_open(dfd, filename, flags, mode); +} + +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. 
+ */ +int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timeval __user *tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +#ifdef HAVE_SET_RESTORE_SIGMASK +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, 
sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, void __user *sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, + unsigned int nfds, struct compat_timespec __user *tsp, + const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ + +#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) +/* Stuff for NFS server syscalls...
*/ +struct compat_nfsctl_svc { + u16 svc32_port; + s32 svc32_nthreads; +}; + +struct compat_nfsctl_client { + s8 cl32_ident[NFSCLNT_IDMAX+1]; + s32 cl32_naddr; + struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; + s32 cl32_fhkeytype; + s32 cl32_fhkeylen; + u8 cl32_fhkey[NFSCLNT_KEYMAX]; +}; + +struct compat_nfsctl_export { + char ex32_client[NFSCLNT_IDMAX+1]; + char ex32_path[NFS_MAXPATHLEN+1]; + compat_dev_t ex32_dev; + compat_ino_t ex32_ino; + compat_int_t ex32_flags; + __compat_uid_t ex32_anon_uid; + __compat_gid_t ex32_anon_gid; +}; + +struct compat_nfsctl_fdparm { + struct sockaddr gd32_addr; + s8 gd32_path[NFS_MAXPATHLEN+1]; + compat_int_t gd32_version; +}; + +struct compat_nfsctl_fsparm { + struct sockaddr gd32_addr; + s8 gd32_path[NFS_MAXPATHLEN+1]; + compat_int_t gd32_maxlen; +}; + +struct compat_nfsctl_arg { + compat_int_t ca32_version; /* safeguard */ + union { + struct compat_nfsctl_svc u32_svc; + struct compat_nfsctl_client u32_client; + struct compat_nfsctl_export u32_export; + struct compat_nfsctl_fdparm u32_getfd; + struct compat_nfsctl_fsparm u32_getfs; + } u; +#define ca32_svc u.u32_svc +#define ca32_client u.u32_client +#define ca32_export u.u32_export +#define ca32_getfd u.u32_getfd +#define ca32_getfs u.u32_getfs +}; + +union compat_nfsctl_res { + __u8 cr32_getfh[NFS_FHSIZE]; + struct knfsd_fh cr32_getfs; +}; + +static int compat_nfs_svc_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) || + get_user(karg->ca_version, &arg->ca32_version) || + __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) || + __get_user(karg->ca_svc.svc_nthreads, + &arg->ca32_svc.svc32_nthreads)) + return -EFAULT; + return 0; +} + +static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_client, + sizeof(arg->ca32_client)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_client.cl_ident[0], + &arg->ca32_client.cl32_ident[0], + NFSCLNT_IDMAX) || + __get_user(karg->ca_client.cl_naddr, + &arg->ca32_client.cl32_naddr) || + __copy_from_user(&karg->ca_client.cl_addrlist[0], + &arg->ca32_client.cl32_addrlist[0], + (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) || + __get_user(karg->ca_client.cl_fhkeytype, + &arg->ca32_client.cl32_fhkeytype) || + __get_user(karg->ca_client.cl_fhkeylen, + &arg->ca32_client.cl32_fhkeylen) || + __copy_from_user(&karg->ca_client.cl_fhkey[0], + &arg->ca32_client.cl32_fhkey[0], + NFSCLNT_KEYMAX)) + return -EFAULT; + + return 0; +} + +static int compat_nfs_exp_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_export, + sizeof(arg->ca32_export)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_export.ex_client[0], + &arg->ca32_export.ex32_client[0], + NFSCLNT_IDMAX) || + __copy_from_user(&karg->ca_export.ex_path[0], + &arg->ca32_export.ex32_path[0], + NFS_MAXPATHLEN) || + __get_user(karg->ca_export.ex_dev, + &arg->ca32_export.ex32_dev) || + __get_user(karg->ca_export.ex_ino, + &arg->ca32_export.ex32_ino) || + __get_user(karg->ca_export.ex_flags, + &arg->ca32_export.ex32_flags) || + __get_user(karg->ca_export.ex_anon_uid, + &arg->ca32_export.ex32_anon_uid) || + __get_user(karg->ca_export.ex_anon_gid, + &arg->ca32_export.ex32_anon_gid)) + return -EFAULT; + SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); + SET_GID(karg->ca_export.ex_anon_gid, 
karg->ca_export.ex_anon_gid); + + return 0; +} + +static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_getfd, + sizeof(arg->ca32_getfd)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfd.gd_addr, + &arg->ca32_getfd.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfd.gd_path, + &arg->ca32_getfd.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfd.gd_version, + &arg->ca32_getfd.gd32_version)) + return -EFAULT; + + return 0; +} + +static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfs.gd_addr, + &arg->ca32_getfs.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfs.gd_path, + &arg->ca32_getfs.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfs.gd_maxlen, + &arg->ca32_getfs.gd32_maxlen)) + return -EFAULT; + + return 0; +} + +/* This really doesn't need translations, we are only passing + * back a union which contains opaque nfs file handle data. + */ +static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, + union compat_nfsctl_res __user *res) +{ + int err; + + err = copy_to_user(res, kres, sizeof(*res)); + + return (err) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_nfsservctl(int cmd, + struct compat_nfsctl_arg __user *arg, + union compat_nfsctl_res __user *res) +{ + struct nfsctl_arg *karg; + union nfsctl_res *kres; + mm_segment_t oldfs; + int err; + + karg = kmalloc(sizeof(*karg), GFP_USER); + kres = kmalloc(sizeof(*kres), GFP_USER); + if(!karg || !kres) { + err = -ENOMEM; + goto done; + } + + switch(cmd) { + case NFSCTL_SVC: + err = compat_nfs_svc_trans(karg, arg); + break; + + case NFSCTL_ADDCLIENT: + err = compat_nfs_clnt_trans(karg, arg); + break; + + case NFSCTL_DELCLIENT: + err = compat_nfs_clnt_trans(karg, arg); + break; + + case NFSCTL_EXPORT: + case NFSCTL_UNEXPORT: + err = compat_nfs_exp_trans(karg, arg); + break; + + case NFSCTL_GETFD: + err = compat_nfs_getfd_trans(karg, arg); + break; + + case NFSCTL_GETFS: + err = compat_nfs_getfs_trans(karg, arg); + break; + + default: + err = -EINVAL; + break; + } + + if (err) + goto done; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + /* The __user pointer casts are valid because of the set_fs() */ + err = sys_nfsservctl(cmd, (void __user *) karg, (void __user *) kres); + set_fs(oldfs); + + if (err) + goto done; + + if((cmd == NFSCTL_GETFD) || + (cmd == NFSCTL_GETFS)) + err = compat_nfs_getfh_res_trans(kres, res); + +done: + kfree(karg); + kfree(kres); + return err; +} +#else /* !NFSD */ +long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) +{ + return sys_ni_syscall(); +} +#endif + +#ifdef CONFIG_EPOLL + +#ifdef HAVE_SET_RESTORE_SIGMASK +asmlinkage long compat_sys_epoll_pwait(int epfd, + struct compat_epoll_event __user *events, + int maxevents, int timeout, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. 
+ */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + } + + return err; +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ + +#endif /* CONFIG_EPOLL */ + +#ifdef CONFIG_SIGNALFD + +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) +{ + compat_sigset_t ss32; + sigset_t tmp; + sigset_t __user *ksigmask; + + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&tmp, &ss32); + ksigmask = compat_alloc_user_space(sizeof(sigset_t)); + if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + return -EFAULT; + + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +asmlinkage long compat_sys_signalfd(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} +#endif /* CONFIG_SIGNALFD */ + +#ifdef CONFIG_TIMERFD + +asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, + const struct compat_itimerspec __user *utmr, + struct compat_itimerspec __user *otmr) +{ + int error; + struct itimerspec t; + struct itimerspec __user *ut; + + if (get_compat_itimerspec(&t, utmr)) + return -EFAULT; + ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); + if (copy_to_user(&ut[0], &t, sizeof(t))) + return -EFAULT; + error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); + if (!error && otmr) + error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; + + return error; +} + +asmlinkage long compat_sys_timerfd_gettime(int ufd, + struct compat_itimerspec __user *otmr) +{ + int error; + struct itimerspec t; + struct itimerspec __user *ut; + + ut = compat_alloc_user_space(sizeof(struct itimerspec)); + error = sys_timerfd_gettime(ufd, ut); + if (!error) + error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; + + return error; +} + +#endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c new file mode 100644 index 00000000..112e45a1 --- /dev/null +++ b/fs/compat_binfmt_elf.c @@ -0,0 +1,133 @@ +/* + * 32-bit compatibility support for ELF format executables and core dumps. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * Red Hat Author: Roland McGrath. + * + * This file is used in a 64-bit kernel that wants to support 32-bit ELF. + * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros + * used below, with definitions appropriate for 32-bit ABI compatibility. + * + * We use macros to rename the ABI types and machine-dependent + * functions used in binfmt_elf.c to compat versions. + */ + +#include +#include + +/* + * Rename the basic ELF layout types to refer to the 32-bit class of files. + */ +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 + +#undef elfhdr +#undef elf_phdr +#undef elf_shdr +#undef elf_note +#undef elf_addr_t +#define elfhdr elf32_hdr +#define elf_phdr elf32_phdr +#define elf_shdr elf32_shdr +#define elf_note elf32_note +#define elf_addr_t Elf32_Addr + +/* + * The machine-dependent core note format types are defined in elfcore-compat.h, + * which requires asm/elf.h to define compat_elf_gregset_t et al. + */ +#define elf_prstatus compat_elf_prstatus +#define elf_prpsinfo compat_elf_prpsinfo + +/* + * Compat version of cputime_to_compat_timeval, perhaps this + * should be an inline in . + */ +static void cputime_to_compat_timeval(const cputime_t cputime, + struct compat_timeval *value) +{ + struct timeval tv; + cputime_to_timeval(cputime, &tv); + value->tv_sec = tv.tv_sec; + value->tv_usec = tv.tv_usec; +} + +#undef cputime_to_timeval +#define cputime_to_timeval cputime_to_compat_timeval + + +/* + * To use this file, asm/elf.h must define compat_elf_check_arch. + * The other following macros can be defined if the compat versions + * differ from the native ones, or omitted when they match. + */ + +#undef ELF_ARCH +#undef elf_check_arch +#define elf_check_arch compat_elf_check_arch + +#ifdef COMPAT_ELF_PLATFORM +#undef ELF_PLATFORM +#define ELF_PLATFORM COMPAT_ELF_PLATFORM +#endif + +#ifdef COMPAT_ELF_HWCAP +#undef ELF_HWCAP +#define ELF_HWCAP COMPAT_ELF_HWCAP +#endif + +#ifdef COMPAT_ARCH_DLINFO +#undef ARCH_DLINFO +#define ARCH_DLINFO COMPAT_ARCH_DLINFO +#endif + +#ifdef COMPAT_ELF_ET_DYN_BASE +#undef ELF_ET_DYN_BASE +#define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE +#endif + +#ifdef COMPAT_ELF_EXEC_PAGESIZE +#undef ELF_EXEC_PAGESIZE +#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE +#endif + +#ifdef COMPAT_ELF_PLAT_INIT +#undef ELF_PLAT_INIT +#define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT +#endif + +#ifdef COMPAT_SET_PERSONALITY +#undef SET_PERSONALITY +#define SET_PERSONALITY COMPAT_SET_PERSONALITY +#endif + +#ifdef compat_start_thread +#undef start_thread +#define start_thread compat_start_thread +#endif + +#ifdef compat_arch_setup_additional_pages +#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +#undef arch_setup_additional_pages +#define arch_setup_additional_pages compat_arch_setup_additional_pages +#endif + +/* + * Rename a few of the symbols that binfmt_elf.c will define. + * These are all local so the names don't really matter, but it + * might make some debugging less confusing not to duplicate them. + */ +#define elf_format compat_elf_format +#define init_elf_binfmt init_compat_elf_binfmt +#define exit_elf_binfmt exit_compat_elf_binfmt + +/* + * We share all the actual code with the native (64-bit) version. 
+ */ +#include "binfmt_elf.c" diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c new file mode 100644 index 00000000..61abb638 --- /dev/null +++ b/fs/compat_ioctl.c @@ -0,0 +1,1656 @@ +/* + * ioctl32.c: Conversion between 32bit and 64bit native ioctls. + * + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) + * + * These routines maintain argument size conversion between 32bit and 64bit + * ioctls. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifdef CONFIG_BLOCK +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SPARC +#include +#endif + +static int w_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) +{ + mm_segment_t old_fs = get_fs(); + int err; + unsigned long val; + + set_fs (KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long)&val); + set_fs (old_fs); + if (!err && put_user(val, argp)) + return -EFAULT; + return err; +} + +struct compat_video_event { + int32_t type; + compat_time_t timestamp; + union { + video_size_t size; + unsigned int frame_rate; + } u; +}; + +static int do_video_get_event(unsigned int fd, unsigned int cmd, + struct compat_video_event __user *up) +{ + struct video_event kevent; + mm_segment_t old_fs = get_fs(); + int err; + + set_fs(KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long) &kevent); + set_fs(old_fs); + + if (!err) { + err = put_user(kevent.type, &up->type); + err |= put_user(kevent.timestamp, &up->timestamp); + err |= put_user(kevent.u.size.w, &up->u.size.w); + err |= put_user(kevent.u.size.h, &up->u.size.h); + err |= put_user(kevent.u.size.aspect_ratio, + &up->u.size.aspect_ratio); + if (err) + err = -EFAULT; + } + + return err; +} + +struct compat_video_still_picture { + compat_uptr_t iFrame; + int32_t size; +}; + +static int do_video_stillpicture(unsigned int fd, unsigned int cmd, + struct compat_video_still_picture __user *up) +{ + struct video_still_picture __user *up_native; + compat_uptr_t fp; + int32_t size; + int err; + + err = get_user(fp, &up->iFrame); + err |= get_user(size, &up->size); + if (err) + return -EFAULT; + + up_native = + compat_alloc_user_space(sizeof(struct video_still_picture)); + + err = put_user(compat_ptr(fp), &up_native->iFrame); + err |= put_user(size, &up_native->size); + if (err) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) up_native); + + return err; +} + +struct compat_video_spu_palette { + int length; + compat_uptr_t palette; +}; + +static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + struct compat_video_spu_palette __user *up) +{ + struct video_spu_palette __user *up_native; + compat_uptr_t 
palp; + int length, err; + + err = get_user(palp, &up->palette); + err |= get_user(length, &up->length); + + up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); + err = put_user(compat_ptr(palp), &up_native->palette); + err |= put_user(length, &up_native->length); + if (err) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) up_native); + + return err; +} + +#ifdef CONFIG_BLOCK +typedef struct sg_io_hdr32 { + compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ + compat_int_t dxfer_direction; /* [i] data transfer direction */ + unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ + unsigned char mx_sb_len; /* [i] max length to write to sbp */ + unsigned short iovec_count; /* [i] 0 implies no scatter gather */ + compat_uint_t dxfer_len; /* [i] byte count of data transfer */ + compat_uint_t dxferp; /* [i], [*io] points to data transfer memory + or scatter gather list */ + compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ + compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ + compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ + compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */ + compat_int_t pack_id; /* [i->o] unused internally (normally) */ + compat_uptr_t usr_ptr; /* [i->o] unused internally */ + unsigned char status; /* [o] scsi status */ + unsigned char masked_status; /* [o] shifted, masked scsi status */ + unsigned char msg_status; /* [o] messaging level data (optional) */ + unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ + unsigned short host_status; /* [o] errors from host adapter */ + unsigned short driver_status; /* [o] errors from software driver */ + compat_int_t resid; /* [o] dxfer_len - actual_transferred */ + compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ + compat_uint_t info; /* [o] auxiliary information */ +} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ + +typedef struct sg_iovec32 { + compat_uint_t iov_base; + compat_uint_t iov_len; +} sg_iovec32_t; + +static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) +{ + sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); + sg_iovec32_t __user *iov32 = dxferp; + int i; + + for (i = 0; i < iovec_count; i++) { + u32 base, len; + + if (get_user(base, &iov32[i].iov_base) || + get_user(len, &iov32[i].iov_len) || + put_user(compat_ptr(base), &iov[i].iov_base) || + put_user(len, &iov[i].iov_len)) + return -EFAULT; + } + + if (put_user(iov, &sgio->dxferp)) + return -EFAULT; + return 0; +} + +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, + sg_io_hdr32_t __user *sgio32) +{ + sg_io_hdr_t __user *sgio; + u16 iovec_count; + u32 data; + void __user *dxferp; + int err; + int interface_id; + + if (get_user(interface_id, &sgio32->interface_id)) + return -EFAULT; + if (interface_id != 'S') + return sys_ioctl(fd, cmd, (unsigned long)sgio32); + + if (get_user(iovec_count, &sgio32->iovec_count)) + return -EFAULT; + + { + void __user *top = compat_alloc_user_space(0); + void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + + (iovec_count * sizeof(sg_iovec_t))); + if (new > top) + return -EINVAL; + + sgio = new; + } + + /* Ok, now construct. 
*/ + if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, + (2 * sizeof(int)) + + (2 * sizeof(unsigned char)) + + (1 * sizeof(unsigned short)) + + (1 * sizeof(unsigned int)))) + return -EFAULT; + + if (get_user(data, &sgio32->dxferp)) + return -EFAULT; + dxferp = compat_ptr(data); + if (iovec_count) { + if (sg_build_iovec(sgio, dxferp, iovec_count)) + return -EFAULT; + } else { + if (put_user(dxferp, &sgio->dxferp)) + return -EFAULT; + } + + { + unsigned char __user *cmdp; + unsigned char __user *sbp; + + if (get_user(data, &sgio32->cmdp)) + return -EFAULT; + cmdp = compat_ptr(data); + + if (get_user(data, &sgio32->sbp)) + return -EFAULT; + sbp = compat_ptr(data); + + if (put_user(cmdp, &sgio->cmdp) || + put_user(sbp, &sgio->sbp)) + return -EFAULT; + } + + if (copy_in_user(&sgio->timeout, &sgio32->timeout, + 3 * sizeof(int))) + return -EFAULT; + + if (get_user(data, &sgio32->usr_ptr)) + return -EFAULT; + if (put_user(compat_ptr(data), &sgio->usr_ptr)) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) sgio); + + if (err >= 0) { + void __user *datap; + + if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, + sizeof(int)) || + get_user(datap, &sgio->usr_ptr) || + put_user((u32)(unsigned long)datap, + &sgio32->usr_ptr) || + copy_in_user(&sgio32->status, &sgio->status, + (4 * sizeof(unsigned char)) + + (2 * sizeof(unsigned short)) + + (3 * sizeof(int)))) + err = -EFAULT; + } + + return err; +} + +struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ + char req_state; + char orphan; + char sg_io_owned; + char problem; + int pack_id; + compat_uptr_t usr_ptr; + unsigned int duration; + int unused; +}; + +static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct + compat_sg_req_info __user *o) +{ + int err, i; + sg_req_info_t __user *r; + r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); + err = sys_ioctl(fd,cmd,(unsigned long)r); + if (err < 0) + return err; + for (i = 0; i < SG_MAX_QUEUE; i++) { + void __user *ptr; + int d; + + if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || + get_user(ptr, &r[i].usr_ptr) || + get_user(d, &r[i].duration) || + put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || + put_user(d, &o[i].duration)) + return -EFAULT; + } + return err; +} +#endif /* CONFIG_BLOCK */ + +struct sock_fprog32 { + unsigned short len; + compat_caddr_t filter; +}; + +#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) +#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) + +static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, + struct sock_fprog32 __user *u_fprog32) +{ + struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); + void __user *fptr64; + u32 fptr32; + u16 flen; + + if (get_user(flen, &u_fprog32->len) || + get_user(fptr32, &u_fprog32->filter)) + return -EFAULT; + + fptr64 = compat_ptr(fptr32); + + if (put_user(flen, &u_fprog64->len) || + put_user(fptr64, &u_fprog64->filter)) + return -EFAULT; + + if (cmd == PPPIOCSPASS32) + cmd = PPPIOCSPASS; + else + cmd = PPPIOCSACTIVE; + + return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); +} + +struct ppp_option_data32 { + compat_caddr_t ptr; + u32 length; + compat_int_t transmit; +}; +#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) + +struct ppp_idle32 { + compat_time_t xmit_idle; + compat_time_t recv_idle; +}; +#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) + +static int ppp_gidle(unsigned int fd, unsigned int cmd, + struct ppp_idle32 __user *idle32) +{ + struct ppp_idle 
__user *idle; + __kernel_time_t xmit, recv; + int err; + + idle = compat_alloc_user_space(sizeof(*idle)); + + err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + + if (!err) { + if (get_user(xmit, &idle->xmit_idle) || + get_user(recv, &idle->recv_idle) || + put_user(xmit, &idle32->xmit_idle) || + put_user(recv, &idle32->recv_idle)) + err = -EFAULT; + } + return err; +} + +static int ppp_scompress(unsigned int fd, unsigned int cmd, + struct ppp_option_data32 __user *odata32) +{ + struct ppp_option_data __user *odata; + __u32 data; + void __user *datap; + + odata = compat_alloc_user_space(sizeof(*odata)); + + if (get_user(data, &odata32->ptr)) + return -EFAULT; + + datap = compat_ptr(data); + if (put_user(datap, &odata->ptr)) + return -EFAULT; + + if (copy_in_user(&odata->length, &odata32->length, + sizeof(__u32) + sizeof(int))) + return -EFAULT; + + return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); +} + +#ifdef CONFIG_BLOCK +struct mtget32 { + compat_long_t mt_type; + compat_long_t mt_resid; + compat_long_t mt_dsreg; + compat_long_t mt_gstat; + compat_long_t mt_erreg; + compat_daddr_t mt_fileno; + compat_daddr_t mt_blkno; +}; +#define MTIOCGET32 _IOR('m', 2, struct mtget32) + +struct mtpos32 { + compat_long_t mt_blkno; +}; +#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) + +static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +{ + mm_segment_t old_fs = get_fs(); + struct mtget get; + struct mtget32 __user *umget32; + struct mtpos pos; + struct mtpos32 __user *upos32; + unsigned long kcmd; + void *karg; + int err = 0; + + switch(cmd) { + case MTIOCPOS32: + kcmd = MTIOCPOS; + karg = &pos; + break; + default: /* MTIOCGET32 */ + kcmd = MTIOCGET; + karg = &get; + break; + } + set_fs (KERNEL_DS); + err = sys_ioctl (fd, kcmd, (unsigned long)karg); + set_fs (old_fs); + if (err) + return err; + switch (cmd) { + case MTIOCPOS32: + upos32 = argp; + err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + break; + case MTIOCGET32: + umget32 = argp; + err = __put_user(get.mt_type, &umget32->mt_type); + err |= __put_user(get.mt_resid, &umget32->mt_resid); + err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); + err |= __put_user(get.mt_gstat, &umget32->mt_gstat); + err |= __put_user(get.mt_erreg, &umget32->mt_erreg); + err |= __put_user(get.mt_fileno, &umget32->mt_fileno); + err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + break; + } + return err ? 
-EFAULT: 0; +} + +#endif /* CONFIG_BLOCK */ + +/* Bluetooth ioctls */ +#define HCIUARTSETPROTO _IOW('U', 200, int) +#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTGETDEVICE _IOR('U', 202, int) +#define HCIUARTSETFLAGS _IOW('U', 203, int) +#define HCIUARTGETFLAGS _IOR('U', 204, int) + +#define BNEPCONNADD _IOW('B', 200, int) +#define BNEPCONNDEL _IOW('B', 201, int) +#define BNEPGETCONNLIST _IOR('B', 210, int) +#define BNEPGETCONNINFO _IOR('B', 211, int) + +#define CMTPCONNADD _IOW('C', 200, int) +#define CMTPCONNDEL _IOW('C', 201, int) +#define CMTPGETCONNLIST _IOR('C', 210, int) +#define CMTPGETCONNINFO _IOR('C', 211, int) + +#define HIDPCONNADD _IOW('H', 200, int) +#define HIDPCONNDEL _IOW('H', 201, int) +#define HIDPGETCONNLIST _IOR('H', 210, int) +#define HIDPGETCONNINFO _IOR('H', 211, int) + + +struct serial_struct32 { + compat_int_t type; + compat_int_t line; + compat_uint_t port; + compat_int_t irq; + compat_int_t flags; + compat_int_t xmit_fifo_size; + compat_int_t custom_divisor; + compat_int_t baud_base; + unsigned short close_delay; + char io_type; + char reserved_char[1]; + compat_int_t hub6; + unsigned short closing_wait; /* time to wait before closing */ + unsigned short closing_wait2; /* no longer used... */ + compat_uint_t iomem_base; + unsigned short iomem_reg_shift; + unsigned int port_high; + /* compat_ulong_t iomap_base FIXME */ + compat_int_t reserved[1]; +}; + +static int serial_struct_ioctl(unsigned fd, unsigned cmd, + struct serial_struct32 __user *ss32) +{ + typedef struct serial_struct SS; + typedef struct serial_struct32 SS32; + int err; + struct serial_struct ss; + mm_segment_t oldseg = get_fs(); + __u32 udata; + unsigned int base; + + if (cmd == TIOCSSERIAL) { + if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) + return -EFAULT; + if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) + return -EFAULT; + if (__get_user(udata, &ss32->iomem_base)) + return -EFAULT; + ss.iomem_base = compat_ptr(udata); + if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || + __get_user(ss.port_high, &ss32->port_high)) + return -EFAULT; + ss.iomap_base = 0UL; + } + set_fs(KERNEL_DS); + err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); + set_fs(oldseg); + if (cmd == TIOCGSERIAL && err >= 0) { + if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) + return -EFAULT; + if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + return -EFAULT; + base = (unsigned long)ss.iomem_base >> 32 ? 
+ 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; + if (__put_user(base, &ss32->iomem_base) || + __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || + __put_user(ss.port_high, &ss32->port_high)) + return -EFAULT; + } + return err; +} + +/* + * I2C layer ioctls + */ + +struct i2c_msg32 { + u16 addr; + u16 flags; + u16 len; + compat_caddr_t buf; +}; + +struct i2c_rdwr_ioctl_data32 { + compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ + u32 nmsgs; +}; + +struct i2c_smbus_ioctl_data32 { + u8 read_write; + u8 command; + u32 size; + compat_caddr_t data; /* union i2c_smbus_data *data */ +}; + +struct i2c_rdwr_aligned { + struct i2c_rdwr_ioctl_data cmd; + struct i2c_msg msgs[0]; +}; + +static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_rdwr_ioctl_data32 __user *udata) +{ + struct i2c_rdwr_aligned __user *tdata; + struct i2c_msg __user *tmsgs; + struct i2c_msg32 __user *umsgs; + compat_caddr_t datap; + int nmsgs, i; + + if (get_user(nmsgs, &udata->nmsgs)) + return -EFAULT; + if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS) + return -EINVAL; + + if (get_user(datap, &udata->msgs)) + return -EFAULT; + umsgs = compat_ptr(datap); + + tdata = compat_alloc_user_space(sizeof(*tdata) + + nmsgs * sizeof(struct i2c_msg)); + tmsgs = &tdata->msgs[0]; + + if (put_user(nmsgs, &tdata->cmd.nmsgs) || + put_user(tmsgs, &tdata->cmd.msgs)) + return -EFAULT; + + for (i = 0; i < nmsgs; i++) { + if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) + return -EFAULT; + if (get_user(datap, &umsgs[i].buf) || + put_user(compat_ptr(datap), &tmsgs[i].buf)) + return -EFAULT; + } + return sys_ioctl(fd, cmd, (unsigned long)tdata); +} + +static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_smbus_ioctl_data32 __user *udata) +{ + struct i2c_smbus_ioctl_data __user *tdata; + compat_caddr_t datap; + + tdata = compat_alloc_user_space(sizeof(*tdata)); + if (tdata == NULL) + return -ENOMEM; + if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) + return -EFAULT; + + if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) + return -EFAULT; + + if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8))) + return -EFAULT; + if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32))) + return -EFAULT; + if (__get_user(datap, &udata->data) || + __put_user(compat_ptr(datap), &tdata->data)) + return -EFAULT; + + return sys_ioctl(fd, cmd, (unsigned long)tdata); +} + +#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) +#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) +#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) +#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) + +static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +{ + mm_segment_t oldfs = get_fs(); + compat_ulong_t val32; + unsigned long kval; + int ret; + + switch (cmd) { + case RTC_IRQP_READ32: + case RTC_EPOCH_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? 
+ RTC_IRQP_READ : RTC_EPOCH_READ, + (unsigned long)&kval); + set_fs(oldfs); + if (ret) + return ret; + val32 = kval; + return put_user(val32, (unsigned int __user *)argp); + case RTC_IRQP_SET32: + return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + case RTC_EPOCH_SET32: + return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + } + + return -ENOIOCTLCMD; +} + +/* on ia32 l_start is on a 32-bit boundary */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +struct space_resv_32 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) +#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) + +/* just account for different alignment */ +static int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *p32) +{ + struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); + + if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || + copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || + copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || + copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || + copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || + copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || + copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) + return -EFAULT; + + return ioctl_preallocate(file, p); +} +#endif + +/* + * simple reversible transform to make our table more evenly + * distributed after sorting. + */ +#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) + +#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), +/* ioctl should not be warned about even if it's not implemented. + Valid reasons to use this: + - It is implemented with ->compat_ioctl on some device, but programs + call it on others too. + - The ioctl is not implemented in the native kernel, but programs + call it commonly anyways. + Most other reasons are not valid. 
*/ +#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) + +static unsigned int ioctl_pointer[] = { +/* compatible ioctls first */ +COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ +COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ + +/* Big T */ +COMPATIBLE_IOCTL(TCGETA) +COMPATIBLE_IOCTL(TCSETA) +COMPATIBLE_IOCTL(TCSETAW) +COMPATIBLE_IOCTL(TCSETAF) +COMPATIBLE_IOCTL(TCSBRK) +COMPATIBLE_IOCTL(TCXONC) +COMPATIBLE_IOCTL(TCFLSH) +COMPATIBLE_IOCTL(TCGETS) +COMPATIBLE_IOCTL(TCSETS) +COMPATIBLE_IOCTL(TCSETSW) +COMPATIBLE_IOCTL(TCSETSF) +COMPATIBLE_IOCTL(TIOCLINUX) +COMPATIBLE_IOCTL(TIOCSBRK) +COMPATIBLE_IOCTL(TIOCGDEV) +COMPATIBLE_IOCTL(TIOCCBRK) +COMPATIBLE_IOCTL(TIOCGSID) +COMPATIBLE_IOCTL(TIOCGICOUNT) +/* Little t */ +COMPATIBLE_IOCTL(TIOCGETD) +COMPATIBLE_IOCTL(TIOCSETD) +COMPATIBLE_IOCTL(TIOCEXCL) +COMPATIBLE_IOCTL(TIOCNXCL) +COMPATIBLE_IOCTL(TIOCCONS) +COMPATIBLE_IOCTL(TIOCGSOFTCAR) +COMPATIBLE_IOCTL(TIOCSSOFTCAR) +COMPATIBLE_IOCTL(TIOCSWINSZ) +COMPATIBLE_IOCTL(TIOCGWINSZ) +COMPATIBLE_IOCTL(TIOCMGET) +COMPATIBLE_IOCTL(TIOCMBIC) +COMPATIBLE_IOCTL(TIOCMBIS) +COMPATIBLE_IOCTL(TIOCMSET) +COMPATIBLE_IOCTL(TIOCPKT) +COMPATIBLE_IOCTL(TIOCNOTTY) +COMPATIBLE_IOCTL(TIOCSTI) +COMPATIBLE_IOCTL(TIOCOUTQ) +COMPATIBLE_IOCTL(TIOCSPGRP) +COMPATIBLE_IOCTL(TIOCGPGRP) +COMPATIBLE_IOCTL(TIOCGPTN) +COMPATIBLE_IOCTL(TIOCSPTLCK) +COMPATIBLE_IOCTL(TIOCSERGETLSR) +COMPATIBLE_IOCTL(TIOCSIG) +#ifdef TCGETS2 +COMPATIBLE_IOCTL(TCGETS2) +COMPATIBLE_IOCTL(TCSETS2) +COMPATIBLE_IOCTL(TCSETSW2) +COMPATIBLE_IOCTL(TCSETSF2) +#endif +/* Little f */ +COMPATIBLE_IOCTL(FIOCLEX) +COMPATIBLE_IOCTL(FIONCLEX) +COMPATIBLE_IOCTL(FIOASYNC) +COMPATIBLE_IOCTL(FIONBIO) +COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ +COMPATIBLE_IOCTL(FS_IOC_FIEMAP) +/* 0x00 */ +COMPATIBLE_IOCTL(FIBMAP) +COMPATIBLE_IOCTL(FIGETBSZ) +/* 'X' - originally XFS but some now in the VFS */ +COMPATIBLE_IOCTL(FIFREEZE) +COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(KDGETKEYCODE) +COMPATIBLE_IOCTL(KDSETKEYCODE) +COMPATIBLE_IOCTL(KDGKBTYPE) +COMPATIBLE_IOCTL(KDGETMODE) +COMPATIBLE_IOCTL(KDGKBMODE) +COMPATIBLE_IOCTL(KDGKBMETA) +COMPATIBLE_IOCTL(KDGKBENT) +COMPATIBLE_IOCTL(KDSKBENT) +COMPATIBLE_IOCTL(KDGKBSENT) +COMPATIBLE_IOCTL(KDSKBSENT) +COMPATIBLE_IOCTL(KDGKBDIACR) +COMPATIBLE_IOCTL(KDSKBDIACR) +COMPATIBLE_IOCTL(KDKBDREP) +COMPATIBLE_IOCTL(KDGKBLED) +COMPATIBLE_IOCTL(KDGETLED) +#ifdef CONFIG_BLOCK +/* Big S */ +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) +COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) +COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) +COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) +COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) +COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) +#endif +/* Big V (don't complain on serial console) */ +IGNORE_IOCTL(VT_OPENQRY) +IGNORE_IOCTL(VT_GETMODE) +/* Little p (/dev/rtc, /dev/envctrl, etc.) */ +COMPATIBLE_IOCTL(RTC_AIE_ON) +COMPATIBLE_IOCTL(RTC_AIE_OFF) +COMPATIBLE_IOCTL(RTC_UIE_ON) +COMPATIBLE_IOCTL(RTC_UIE_OFF) +COMPATIBLE_IOCTL(RTC_PIE_ON) +COMPATIBLE_IOCTL(RTC_PIE_OFF) +COMPATIBLE_IOCTL(RTC_WIE_ON) +COMPATIBLE_IOCTL(RTC_WIE_OFF) +COMPATIBLE_IOCTL(RTC_ALM_SET) +COMPATIBLE_IOCTL(RTC_ALM_READ) +COMPATIBLE_IOCTL(RTC_RD_TIME) +COMPATIBLE_IOCTL(RTC_SET_TIME) +COMPATIBLE_IOCTL(RTC_WKALM_SET) +COMPATIBLE_IOCTL(RTC_WKALM_RD) +/* + * These two are only for the sbus rtc driver, but + * hwclock tries them on every rtc device first when + * running on sparc. 
On other architectures the entries + * are useless but harmless. + */ +COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ +COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ +/* Little m */ +COMPATIBLE_IOCTL(MTIOCTOP) +/* Socket level stuff */ +COMPATIBLE_IOCTL(FIOQSIZE) +#ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) +/* md calls this on random blockdevs */ +IGNORE_IOCTL(RAID_VERSION) +/* SG stuff */ +COMPATIBLE_IOCTL(SG_SET_TIMEOUT) +COMPATIBLE_IOCTL(SG_GET_TIMEOUT) +COMPATIBLE_IOCTL(SG_EMULATED_HOST) +COMPATIBLE_IOCTL(SG_GET_TRANSFORM) +COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) +COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) +COMPATIBLE_IOCTL(SG_GET_SCSI_ID) +COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) +COMPATIBLE_IOCTL(SG_GET_LOW_DMA) +COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) +COMPATIBLE_IOCTL(SG_GET_PACK_ID) +COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) +COMPATIBLE_IOCTL(SG_SET_DEBUG) +COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) +COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) +COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) +COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) +COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) +COMPATIBLE_IOCTL(SG_SCSI_RESET) +COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) +COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) +COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) +#endif +/* PPP stuff */ +COMPATIBLE_IOCTL(PPPIOCGFLAGS) +COMPATIBLE_IOCTL(PPPIOCSFLAGS) +COMPATIBLE_IOCTL(PPPIOCGASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCGUNIT) +COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCGMRU) +COMPATIBLE_IOCTL(PPPIOCSMRU) +COMPATIBLE_IOCTL(PPPIOCSMAXCID) +COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCXFERUNIT) +/* PPPIOCSCOMPRESS is translated */ +COMPATIBLE_IOCTL(PPPIOCGNPMODE) +COMPATIBLE_IOCTL(PPPIOCSNPMODE) +COMPATIBLE_IOCTL(PPPIOCGDEBUG) +COMPATIBLE_IOCTL(PPPIOCSDEBUG) +/* PPPIOCSPASS is translated */ +/* PPPIOCSACTIVE is translated */ +/* PPPIOCGIDLE is translated */ +COMPATIBLE_IOCTL(PPPIOCNEWUNIT) +COMPATIBLE_IOCTL(PPPIOCATTACH) +COMPATIBLE_IOCTL(PPPIOCDETACH) +COMPATIBLE_IOCTL(PPPIOCSMRRU) +COMPATIBLE_IOCTL(PPPIOCCONNECT) +COMPATIBLE_IOCTL(PPPIOCDISCONN) +COMPATIBLE_IOCTL(PPPIOCATTCHAN) +COMPATIBLE_IOCTL(PPPIOCGCHAN) +/* PPPOX */ +COMPATIBLE_IOCTL(PPPOEIOCSFWD) +COMPATIBLE_IOCTL(PPPOEIOCDFWD) +/* ppdev */ +COMPATIBLE_IOCTL(PPSETMODE) +COMPATIBLE_IOCTL(PPRSTATUS) +COMPATIBLE_IOCTL(PPRCONTROL) +COMPATIBLE_IOCTL(PPWCONTROL) +COMPATIBLE_IOCTL(PPFCONTROL) +COMPATIBLE_IOCTL(PPRDATA) +COMPATIBLE_IOCTL(PPWDATA) +COMPATIBLE_IOCTL(PPCLAIM) +COMPATIBLE_IOCTL(PPRELEASE) +COMPATIBLE_IOCTL(PPYIELD) +COMPATIBLE_IOCTL(PPEXCL) +COMPATIBLE_IOCTL(PPDATADIR) +COMPATIBLE_IOCTL(PPNEGOT) +COMPATIBLE_IOCTL(PPWCTLONIRQ) +COMPATIBLE_IOCTL(PPCLRIRQ) +COMPATIBLE_IOCTL(PPSETPHASE) +COMPATIBLE_IOCTL(PPGETMODES) +COMPATIBLE_IOCTL(PPGETMODE) +COMPATIBLE_IOCTL(PPGETPHASE) +COMPATIBLE_IOCTL(PPGETFLAGS) +COMPATIBLE_IOCTL(PPSETFLAGS) +/* Big A */ +/* sparc only */ +/* Big Q for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET) +COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO) +COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT) +COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE) +COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR) +COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI) +COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES) +COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS) +COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS) +COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO) +COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL) 
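The entries collected in this table are commands whose argument is either a plain integer or a structure laid out identically for 32-bit and 64-bit callers, so the native handler can be reused unchanged. Anything that embeds a real pointer, such as struct i2c_msg handled by do_i2c_rdwr_ioctl() above, changes size and field offsets between the two ABIs and therefore needs an explicit translation helper instead of an entry here. A minimal user-space sketch of that layout difference, using hypothetical struct names modelled on i2c_msg/i2c_msg32:

#include <stdint.h>
#include <stdio.h>

/* native form: carries a real pointer, so its layout follows the ABI */
struct msg_native {
	uint16_t addr;
	uint16_t flags;
	uint16_t len;
	void    *buf;	/* 4 bytes in a 32-bit build, 8 bytes in a 64-bit build */
};

/* compat form: the pointer travels as a fixed 32-bit value (compat_caddr_t) */
struct msg_compat {
	uint16_t addr;
	uint16_t flags;
	uint16_t len;
	uint32_t buf;	/* same size and offsets regardless of the ABI */
};

int main(void)
{
	/* on a 64-bit build this typically prints 16 vs 12; on 32-bit, 12 vs 12 */
	printf("native=%zu compat=%zu\n",
	       sizeof(struct msg_native), sizeof(struct msg_compat));
	return 0;
}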
+COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE) +COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC) +COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE) +/* Big T for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE) +COMPATIBLE_IOCTL(SNDCTL_TMR_START) +COMPATIBLE_IOCTL(SNDCTL_TMR_STOP) +COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE) +COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO) +COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE) +COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME) +COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT) +/* Little m for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME) +COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE) +COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD) +/* Big P for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_DSP_RESET) +COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC) +COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED) +COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE) +COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS) +COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER) +COMPATIBLE_IOCTL(SNDCTL_DSP_POST) +COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE) +COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR) +/* SNDCTL_DSP_MAPINBUF, XXX needs translation */ +/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */ +COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY) +COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE) +COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE) +COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS) +COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS) +COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER) +/* Big C for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_COPR_RESET) +COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD) +COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA) +COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE) +COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA) +COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE) +COMPATIBLE_IOCTL(SNDCTL_COPR_RUN) +COMPATIBLE_IOCTL(SNDCTL_COPR_HALT) +COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG) +COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG) +/* Big M for sound/OSS */ +COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) +/* 
SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ +/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) +/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ +/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC) +COMPATIBLE_IOCTL(SOUND_MIXER_INFO) +COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO) +COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS) +COMPATIBLE_IOCTL(SOUND_MIXER_AGC) +COMPATIBLE_IOCTL(SOUND_MIXER_3DSE) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) +COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) +COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) +COMPATIBLE_IOCTL(OSS_GETVERSION) +/* Raw devices */ +COMPATIBLE_IOCTL(RAW_SETBIND) +COMPATIBLE_IOCTL(RAW_GETBIND) +/* Watchdog */ +COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) +COMPATIBLE_IOCTL(WDIOC_GETSTATUS) +COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) +COMPATIBLE_IOCTL(WDIOC_GETTEMP) +COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) +COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) +COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +/* Big R */ +COMPATIBLE_IOCTL(RNDGETENTCNT) +COMPATIBLE_IOCTL(RNDADDTOENTCNT) +COMPATIBLE_IOCTL(RNDGETPOOL) +COMPATIBLE_IOCTL(RNDADDENTROPY) +COMPATIBLE_IOCTL(RNDZAPENTCNT) +COMPATIBLE_IOCTL(RNDCLEARPOOL) +/* Bluetooth */ +COMPATIBLE_IOCTL(HCIDEVUP) +COMPATIBLE_IOCTL(HCIDEVDOWN) +COMPATIBLE_IOCTL(HCIDEVRESET) +COMPATIBLE_IOCTL(HCIDEVRESTAT) +COMPATIBLE_IOCTL(HCIGETDEVLIST) +COMPATIBLE_IOCTL(HCIGETDEVINFO) +COMPATIBLE_IOCTL(HCIGETCONNLIST) +COMPATIBLE_IOCTL(HCIGETCONNINFO) +COMPATIBLE_IOCTL(HCIGETAUTHINFO) +COMPATIBLE_IOCTL(HCISETRAW) +COMPATIBLE_IOCTL(HCISETSCAN) +COMPATIBLE_IOCTL(HCISETAUTH) +COMPATIBLE_IOCTL(HCISETENCRYPT) +COMPATIBLE_IOCTL(HCISETPTYPE) +COMPATIBLE_IOCTL(HCISETLINKPOL) +COMPATIBLE_IOCTL(HCISETLINKMODE) +COMPATIBLE_IOCTL(HCISETACLMTU) +COMPATIBLE_IOCTL(HCISETSCOMTU) +COMPATIBLE_IOCTL(HCIBLOCKADDR) +COMPATIBLE_IOCTL(HCIUNBLOCKADDR) +COMPATIBLE_IOCTL(HCIINQUIRY) +COMPATIBLE_IOCTL(HCIUARTSETPROTO) +COMPATIBLE_IOCTL(HCIUARTGETPROTO) +COMPATIBLE_IOCTL(RFCOMMCREATEDEV) 
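The HCIUART*, BNEP*, CMTP* and HIDP* commands listed below are the ones #defined near the top of this file with an int argument, so nothing needs translating. A small user-space sketch of how such a command number packs direction, type, number and size, using the generic _IOC_* accessors (the exact bit layout varies by architecture, which is why the accessors are used rather than hard-coded shifts):

#include <linux/ioctl.h>
#include <stdio.h>

#define HCIUARTSETPROTO _IOW('U', 200, int)	/* mirrors the define above */

int main(void)
{
	unsigned int cmd = HCIUARTSETPROTO;

	/* dir=write, type='U', nr=200, size=sizeof(int) on common layouts */
	printf("cmd=%#x dir=%u type='%c' nr=%u size=%u\n",
	       cmd, _IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
	return 0;
}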
+COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) +COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) +COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) +COMPATIBLE_IOCTL(RFCOMMSTEALDLC) +COMPATIBLE_IOCTL(BNEPCONNADD) +COMPATIBLE_IOCTL(BNEPCONNDEL) +COMPATIBLE_IOCTL(BNEPGETCONNLIST) +COMPATIBLE_IOCTL(BNEPGETCONNINFO) +COMPATIBLE_IOCTL(CMTPCONNADD) +COMPATIBLE_IOCTL(CMTPCONNDEL) +COMPATIBLE_IOCTL(CMTPGETCONNLIST) +COMPATIBLE_IOCTL(CMTPGETCONNINFO) +COMPATIBLE_IOCTL(HIDPCONNADD) +COMPATIBLE_IOCTL(HIDPCONNDEL) +COMPATIBLE_IOCTL(HIDPGETCONNLIST) +COMPATIBLE_IOCTL(HIDPGETCONNINFO) +/* CAPI */ +COMPATIBLE_IOCTL(CAPI_REGISTER) +COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) +COMPATIBLE_IOCTL(CAPI_GET_VERSION) +COMPATIBLE_IOCTL(CAPI_GET_SERIAL) +COMPATIBLE_IOCTL(CAPI_GET_PROFILE) +COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) +COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) +COMPATIBLE_IOCTL(CAPI_INSTALLED) +COMPATIBLE_IOCTL(CAPI_GET_FLAGS) +COMPATIBLE_IOCTL(CAPI_SET_FLAGS) +COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) +COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) +COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) +/* Siemens Gigaset */ +COMPATIBLE_IOCTL(GIGASET_REDIR) +COMPATIBLE_IOCTL(GIGASET_CONFIG) +COMPATIBLE_IOCTL(GIGASET_BRKCHARS) +COMPATIBLE_IOCTL(GIGASET_VERSION) +/* Misc. */ +COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ +COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ +COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) +COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) +COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) +COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) +/* NBD */ +COMPATIBLE_IOCTL(NBD_DO_IT) +COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) +COMPATIBLE_IOCTL(NBD_CLEAR_QUE) +COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) +COMPATIBLE_IOCTL(NBD_DISCONNECT) +/* i2c */ +COMPATIBLE_IOCTL(I2C_SLAVE) +COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) +COMPATIBLE_IOCTL(I2C_TENBIT) +COMPATIBLE_IOCTL(I2C_PEC) +COMPATIBLE_IOCTL(I2C_RETRIES) +COMPATIBLE_IOCTL(I2C_TIMEOUT) +/* hiddev */ +COMPATIBLE_IOCTL(HIDIOCGVERSION) +COMPATIBLE_IOCTL(HIDIOCAPPLICATION) +COMPATIBLE_IOCTL(HIDIOCGDEVINFO) +COMPATIBLE_IOCTL(HIDIOCGSTRING) +COMPATIBLE_IOCTL(HIDIOCINITREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORT) +COMPATIBLE_IOCTL(HIDIOCSREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) +COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) +COMPATIBLE_IOCTL(HIDIOCGUSAGE) +COMPATIBLE_IOCTL(HIDIOCSUSAGE) +COMPATIBLE_IOCTL(HIDIOCGUCODE) +COMPATIBLE_IOCTL(HIDIOCGFLAG) +COMPATIBLE_IOCTL(HIDIOCSFLAG) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) +/* dvb */ +COMPATIBLE_IOCTL(AUDIO_STOP) +COMPATIBLE_IOCTL(AUDIO_PLAY) +COMPATIBLE_IOCTL(AUDIO_PAUSE) +COMPATIBLE_IOCTL(AUDIO_CONTINUE) +COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) +COMPATIBLE_IOCTL(AUDIO_SET_MUTE) +COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) +COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) +COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) +COMPATIBLE_IOCTL(AUDIO_GET_STATUS) +COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) +COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) +COMPATIBLE_IOCTL(AUDIO_SET_ID) +COMPATIBLE_IOCTL(AUDIO_SET_MIXER) +COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) +COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID) +COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES) +COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE) +COMPATIBLE_IOCTL(DMX_START) +COMPATIBLE_IOCTL(DMX_STOP) +COMPATIBLE_IOCTL(DMX_SET_FILTER) +COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) +COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) +COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) +COMPATIBLE_IOCTL(DMX_GET_CAPS) +COMPATIBLE_IOCTL(DMX_SET_SOURCE) +COMPATIBLE_IOCTL(DMX_GET_STC) +COMPATIBLE_IOCTL(FE_GET_INFO) +COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) +COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD) +COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY) +COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST) 
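For reference, this is how the entries accumulated in this table are consulted at runtime: XFORM() spreads the command values, init_sys32_ioctl() sorts the table once at boot, and compat_ioctl_check_table() (further down) guesses a start index proportional to the transformed value and then searches linearly. A standalone sketch of that mechanism on a toy table; the command values here are arbitrary placeholders, not real ioctls:

#include <stdio.h>
#include <stdlib.h>

#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)

static int cmp_uint(const void *p, const void *q)
{
	unsigned int a = *(const unsigned int *)p;
	unsigned int b = *(const unsigned int *)q;

	return (a > b) - (a < b);
}

/* same probe as compat_ioctl_check_table(): proportional guess, then linear */
static int check_table(unsigned int xcmd, const unsigned int *tbl, int n)
{
	const int max = n - 1;
	int i = ((xcmd >> 16) * max) >> 16;

	while (tbl[i] < xcmd && i < max)
		i++;
	while (tbl[i] > xcmd && i > 0)
		i--;
	return tbl[i] == xcmd;
}

int main(void)
{
	unsigned int cmds[] = { 0x00005401u, 0x40045532u, 0x80086601u, 0xc0189436u };
	unsigned int tbl[4];
	int i, n = 4;

	for (i = 0; i < n; i++)
		tbl[i] = XFORM(cmds[i]);
	qsort(tbl, n, sizeof(tbl[0]), cmp_uint);

	printf("known:   %d\n", check_table(XFORM(cmds[2]), tbl, n));   /* 1 */
	printf("unknown: %d\n", check_table(XFORM(0x12345678u), tbl, n)); /* 0 */
	return 0;
}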
+COMPATIBLE_IOCTL(FE_SET_TONE) +COMPATIBLE_IOCTL(FE_SET_VOLTAGE) +COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE) +COMPATIBLE_IOCTL(FE_READ_STATUS) +COMPATIBLE_IOCTL(FE_READ_BER) +COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH) +COMPATIBLE_IOCTL(FE_READ_SNR) +COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS) +COMPATIBLE_IOCTL(FE_SET_FRONTEND) +COMPATIBLE_IOCTL(FE_GET_FRONTEND) +COMPATIBLE_IOCTL(FE_GET_EVENT) +COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD) +COMPATIBLE_IOCTL(VIDEO_STOP) +COMPATIBLE_IOCTL(VIDEO_PLAY) +COMPATIBLE_IOCTL(VIDEO_FREEZE) +COMPATIBLE_IOCTL(VIDEO_CONTINUE) +COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) +COMPATIBLE_IOCTL(VIDEO_SET_BLANK) +COMPATIBLE_IOCTL(VIDEO_GET_STATUS) +COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) +COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) +COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) +COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) +COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) +COMPATIBLE_IOCTL(VIDEO_SET_ID) +COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) +COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) +COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM) +COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT) +COMPATIBLE_IOCTL(VIDEO_SET_SPU) +COMPATIBLE_IOCTL(VIDEO_GET_NAVI) +COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES) +COMPATIBLE_IOCTL(VIDEO_GET_SIZE) +COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) + +/* joystick */ +COMPATIBLE_IOCTL(JSIOCGVERSION) +COMPATIBLE_IOCTL(JSIOCGAXES) +COMPATIBLE_IOCTL(JSIOCGBUTTONS) +COMPATIBLE_IOCTL(JSIOCGNAME(0)) + +#ifdef TIOCGLTC +COMPATIBLE_IOCTL(TIOCGLTC) +COMPATIBLE_IOCTL(TIOCSLTC) +#endif +#ifdef TIOCSTART +/* + * For these two we have definitions in ioctls.h and/or termios.h on + * some architectures but no actual implemention. Some applications + * like bash call them if they are defined in the headers, so we provide + * entries here to avoid syslog message spew. + */ +COMPATIBLE_IOCTL(TIOCSTART) +COMPATIBLE_IOCTL(TIOCSTOP) +#endif + +/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, + but we don't want warnings on other file systems. So declare + them as compatible here. */ +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) +IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) + +#ifdef CONFIG_SPARC +/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ +IGNORE_IOCTL(FBIOGTYPE) +IGNORE_IOCTL(FBIOSATTR) +IGNORE_IOCTL(FBIOGATTR) +IGNORE_IOCTL(FBIOSVIDEO) +IGNORE_IOCTL(FBIOGVIDEO) +IGNORE_IOCTL(FBIOSCURPOS) +IGNORE_IOCTL(FBIOGCURPOS) +IGNORE_IOCTL(FBIOGCURMAX) +IGNORE_IOCTL(FBIOPUTCMAP32) +IGNORE_IOCTL(FBIOGETCMAP32) +IGNORE_IOCTL(FBIOSCURSOR32) +IGNORE_IOCTL(FBIOGCURSOR32) +#endif +}; + +/* + * Convert common ioctl arguments based on their command number + * + * Please do not add any code in here. Instead, implement + * a compat_ioctl operation in the place that handleѕ the + * ioctl for the native case. 
+ */ +static long do_ioctl_trans(int fd, unsigned int cmd, + unsigned long arg, struct file *file) +{ + void __user *argp = compat_ptr(arg); + + switch (cmd) { + case PPPIOCGIDLE32: + return ppp_gidle(fd, cmd, argp); + case PPPIOCSCOMPRESS32: + return ppp_scompress(fd, cmd, argp); + case PPPIOCSPASS32: + case PPPIOCSACTIVE32: + return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); +#ifdef CONFIG_BLOCK + case SG_IO: + return sg_ioctl_trans(fd, cmd, argp); + case SG_GET_REQUEST_TABLE: + return sg_grt_trans(fd, cmd, argp); + case MTIOCGET32: + case MTIOCPOS32: + return mt_ioctl_trans(fd, cmd, argp); +#endif + /* Serial */ + case TIOCGSERIAL: + case TIOCSSERIAL: + return serial_struct_ioctl(fd, cmd, argp); + /* i2c */ + case I2C_FUNCS: + return w_long(fd, cmd, argp); + case I2C_RDWR: + return do_i2c_rdwr_ioctl(fd, cmd, argp); + case I2C_SMBUS: + return do_i2c_smbus_ioctl(fd, cmd, argp); + /* Not implemented in the native kernel */ + case RTC_IRQP_READ32: + case RTC_IRQP_SET32: + case RTC_EPOCH_READ32: + case RTC_EPOCH_SET32: + return rtc_ioctl(fd, cmd, argp); + + /* dvb */ + case VIDEO_GET_EVENT: + return do_video_get_event(fd, cmd, argp); + case VIDEO_STILLPICTURE: + return do_video_stillpicture(fd, cmd, argp); + case VIDEO_SET_SPU_PALETTE: + return do_video_set_spu_palette(fd, cmd, argp); + } + + /* + * These take an integer instead of a pointer as 'arg', + * so we must not do a compat_ptr() translation. + */ + switch (cmd) { + /* Big T */ + case TCSBRKP: + case TIOCMIWAIT: + case TIOCSCTTY: + /* RAID */ + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* Big K */ + case KDSIGACCEPT: + case KIOCSOUND: + case KDMKTONE: + case KDSETMODE: + case KDSKBMODE: + case KDSKBMETA: + case KDSKBLED: + case KDSETLED: + /* NBD */ + case NBD_SET_SOCK: + case NBD_SET_BLKSIZE: + case NBD_SET_SIZE: + case NBD_SET_SIZE_BLOCKS: + return do_vfs_ioctl(file, fd, cmd, arg); + } + + return -ENOIOCTLCMD; +} + +static void compat_ioctl_error(struct file *filp, unsigned int fd, + unsigned int cmd, unsigned long arg) +{ + char buf[10]; + char *fn = "?"; + char *path; + + /* find the name of the device. */ + path = (char *)__get_free_page(GFP_KERNEL); + if (path) { + fn = d_path(&filp->f_path, path, PAGE_SIZE); + if (IS_ERR(fn)) + fn = "?"; + } + + sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); + if (!isprint(buf[1])) + sprintf(buf, "%02x", buf[1]); + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", + current->comm, current->pid, + (int)fd, (unsigned int)cmd, buf, + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, + (unsigned int)arg, fn); + + if (path) + free_page((unsigned long)path); +} + +static int compat_ioctl_check_table(unsigned int xcmd) +{ + int i; + const int max = ARRAY_SIZE(ioctl_pointer) - 1; + + BUILD_BUG_ON(max >= (1 << 16)); + + /* guess initial offset into table, assuming a + normalized distribution */ + i = ((xcmd >> 16) * max) >> 16; + + /* do linear search up first, until greater or equal */ + while (ioctl_pointer[i] < xcmd && i < max) + i++; + + /* then do linear search down */ + while (ioctl_pointer[i] > xcmd && i > 0) + i--; + + return ioctl_pointer[i] == xcmd; +} + +asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + struct file *filp; + int error = -EBADF; + int fput_needed; + + filp = fget_light(fd, &fput_needed); + if (!filp) + goto out; + + /* RED-PEN how should LSM module know it's handling 32bit? 
*/ + error = security_file_ioctl(filp, cmd, arg); + if (error) + goto out_fput; + + /* + * To allow the compat_ioctl handlers to be self contained + * we need to check the common ioctls here first. + * Just handle them with the standard handlers below. + */ + switch (cmd) { + case FIOCLEX: + case FIONCLEX: + case FIONBIO: + case FIOASYNC: + case FIOQSIZE: + break; + +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(filp, compat_ptr(arg)); + goto out_fput; +#else + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + error = ioctl_preallocate(filp, compat_ptr(arg)); + goto out_fput; +#endif + + case FIBMAP: + case FIGETBSZ: + case FIONREAD: + if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) + break; + /*FALL THROUGH*/ + + default: + if (filp->f_op && filp->f_op->compat_ioctl) { + error = filp->f_op->compat_ioctl(filp, cmd, arg); + if (error != -ENOIOCTLCMD) + goto out_fput; + } + + if (!filp->f_op || !filp->f_op->unlocked_ioctl) + goto do_ioctl; + break; + } + + if (compat_ioctl_check_table(XFORM(cmd))) + goto found_handler; + + error = do_ioctl_trans(fd, cmd, arg, filp); + if (error == -ENOIOCTLCMD) { + static int count; + + if (++count <= 50) + compat_ioctl_error(filp, fd, cmd, arg); + error = -EINVAL; + } + + goto out_fput; + + found_handler: + arg = (unsigned long)compat_ptr(arg); + do_ioctl: + error = do_vfs_ioctl(filp, fd, cmd, arg); + out_fput: + fput_light(filp, fput_needed); + out: + return error; +} + +static int __init init_sys32_ioctl_cmp(const void *p, const void *q) +{ + unsigned int a, b; + a = *(unsigned int *)p; + b = *(unsigned int *)q; + if (a > b) + return 1; + if (a < b) + return -1; + return 0; +} + +static int __init init_sys32_ioctl(void) +{ + sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), + init_sys32_ioctl_cmp, NULL); + return 0; +} +__initcall(init_sys32_ioctl); diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig new file mode 100644 index 00000000..9febcdef --- /dev/null +++ b/fs/configfs/Kconfig @@ -0,0 +1,11 @@ +config CONFIGFS_FS + tristate "Userspace-driven configuration filesystem" + select SYSFS + help + configfs is a RAM-based filesystem that provides the converse + of sysfs's functionality. Where sysfs is a filesystem-based + view of kernel objects, configfs is a filesystem-based manager + of kernel objects, or config_items. + + Both sysfs and configfs can and should exist together on the + same system. One is not a replacement for the other. diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile new file mode 100644 index 00000000..00ffb278 --- /dev/null +++ b/fs/configfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the configfs virtual filesystem +# + +obj-$(CONFIG_CONFIGFS_FS) += configfs.o + +configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h new file mode 100644 index 00000000..82bda8fd --- /dev/null +++ b/fs/configfs/configfs_internal.h @@ -0,0 +1,161 @@ +/* -*- mode: c; c-basic-offset:8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs_internal.h - Internal stuff for configfs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include + +struct configfs_dirent { + atomic_t s_count; + int s_dependent_count; + struct list_head s_sibling; + struct list_head s_children; + struct list_head s_links; + void * s_element; + int s_type; + umode_t s_mode; + struct dentry * s_dentry; + struct iattr * s_iattr; +#ifdef CONFIG_LOCKDEP + int s_depth; +#endif +}; + +#define CONFIGFS_ROOT 0x0001 +#define CONFIGFS_DIR 0x0002 +#define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_LINK 0x0020 +#define CONFIGFS_USET_DIR 0x0040 +#define CONFIGFS_USET_DEFAULT 0x0080 +#define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_USET_IN_MKDIR 0x0200 +#define CONFIGFS_USET_CREATING 0x0400 +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) + +extern struct mutex configfs_symlink_mutex; +extern spinlock_t configfs_dirent_lock; + +extern struct vfsmount * configfs_mount; +extern struct kmem_cache *configfs_dir_cachep; + +extern int configfs_is_root(struct config_item *item); + +extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); +extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern int configfs_inode_init(void); +extern void configfs_inode_exit(void); + +extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_make_dirent(struct configfs_dirent *, + struct dentry *, void *, umode_t, int); +extern int configfs_dirent_is_ready(struct configfs_dirent *); + +extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); +extern void configfs_hash_and_remove(struct dentry * dir, const char * name); + +extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); +extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); + +extern int configfs_pin_fs(void); +extern void configfs_release_fs(void); + +extern struct rw_semaphore configfs_rename_sem; +extern struct super_block * configfs_sb; +extern const struct file_operations configfs_dir_operations; +extern const struct file_operations configfs_file_operations; +extern const struct file_operations bin_fops; +extern const struct inode_operations configfs_dir_inode_operations; +extern const struct inode_operations configfs_symlink_inode_operations; +extern const struct dentry_operations configfs_dentry_ops; + +extern int configfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int configfs_unlink(struct inode *dir, struct dentry *dentry); + +struct configfs_symlink { + struct list_head sl_list; + struct config_item *sl_target; +}; + +extern int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry); + +static inline struct config_item * to_item(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return 
((struct config_item *) sd->s_element); +} + +static inline struct configfs_attribute * to_attr(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct configfs_attribute *) sd->s_element); +} + +static inline struct config_item *configfs_get_config_item(struct dentry *dentry) +{ + struct config_item * item = NULL; + + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + struct configfs_dirent * sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_ITEM_LINK) { + struct configfs_symlink * sl = sd->s_element; + item = config_item_get(sl->sl_target); + } else + item = config_item_get(sd->s_element); + } + spin_unlock(&dentry->d_lock); + + return item; +} + +static inline void release_configfs_dirent(struct configfs_dirent * sd) +{ + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } +} + +static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} + +static inline void configfs_put(struct configfs_dirent * sd) +{ + WARN_ON(!atomic_read(&sd->s_count)); + if (atomic_dec_and_test(&sd->s_count)) + release_configfs_dirent(sd); +} + diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c new file mode 100644 index 00000000..9a37a9b6 --- /dev/null +++ b/fs/configfs/dir.c @@ -0,0 +1,1766 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c - Operations for configfs directories. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +DECLARE_RWSEM(configfs_rename_sem); +/* + * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Also protects mutations of symlinks linkage to target configfs_dirent + * Mutators of configfs_dirent linkage must *both* have the proper inode locked + * and configfs_dirent_lock locked, in that order. + * This allows one to safely traverse configfs_dirent trees and symlinks without + * having to lock inodes. 
+ * + * Protects setting of CONFIGFS_USET_DROPPING: checking the flag + * unlocked is not reliable unless in detach_groups() called from + * rmdir()/unregister() and from configfs_attach_group() + */ +DEFINE_SPINLOCK(configfs_dirent_lock); + +static void configfs_d_iput(struct dentry * dentry, + struct inode * inode) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + + if (sd) { + BUG_ON(sd->s_dentry != dentry); + /* Coordinate with configfs_readdir */ + spin_lock(&configfs_dirent_lock); + sd->s_dentry = NULL; + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + iput(inode); +} + +/* + * We _must_ delete our dentries on last dput, as the chain-to-parent + * behavior is required to clear the parents of default_groups. + */ +static int configfs_d_delete(const struct dentry *dentry) +{ + return 1; +} + +const struct dentry_operations configfs_dentry_ops = { + .d_iput = configfs_d_iput, + /* simple_delete_dentry() isn't exported */ + .d_delete = configfs_d_delete, +}; + +#ifdef CONFIG_LOCKDEP + +/* + * Helpers to make lockdep happy with our recursive locking of default groups' + * inodes (see configfs_attach_group() and configfs_detach_group()). + * We put default groups i_mutexes in separate classes according to their depth + * from the youngest non-default group ancestor. + * + * For a non-default group A having default groups A/B, A/C, and A/C/D, default + * groups A/B and A/C will have their inode's mutex in class + * default_group_class[0], and default group A/C/D will be in + * default_group_class[1]. + * + * The lock classes are declared and assigned in inode.c, according to the + * s_depth value. + * The s_depth value is initialized to -1, adjusted to >= 0 when attaching + * default groups, and reset to -1 when all default groups are attached. During + * attachment, if configfs_create() sees s_depth > 0, the lock class of the new + * inode's mutex is set to default_group_class[s_depth - 1]. + */ + +static void configfs_init_dirent_depth(struct configfs_dirent *sd) +{ + sd->s_depth = -1; +} + +static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, + struct configfs_dirent *sd) +{ + int parent_depth = parent_sd->s_depth; + + if (parent_depth >= 0) + sd->s_depth = parent_depth + 1; +} + +static void +configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) +{ + /* + * item's i_mutex class is already setup, so s_depth is now only + * used to set new sub-directories s_depth, which is always done + * with item's i_mutex locked. + */ + /* + * sd->s_depth == -1 iff we are a non default group. + * else (we are a default group) sd->s_depth > 0 (see + * create_dir()). + */ + if (sd->s_depth == -1) + /* + * We are a non default group and we are going to create + * default groups. + */ + sd->s_depth = 0; +} + +static void +configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) +{ + /* We will not create default groups anymore. 
*/ + sd->s_depth = -1; +} + +#else /* CONFIG_LOCKDEP */ + +static void configfs_init_dirent_depth(struct configfs_dirent *sd) +{ +} + +static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, + struct configfs_dirent *sd) +{ +} + +static void +configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) +{ +} + +static void +configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) +{ +} + +#endif /* CONFIG_LOCKDEP */ + +/* + * Allocates a new configfs_dirent and links it to the parent configfs_dirent + */ +static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, + void *element, int type) +{ + struct configfs_dirent * sd; + + sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); + if (!sd) + return ERR_PTR(-ENOMEM); + + atomic_set(&sd->s_count, 1); + INIT_LIST_HEAD(&sd->s_links); + INIT_LIST_HEAD(&sd->s_children); + sd->s_element = element; + sd->s_type = type; + configfs_init_dirent_depth(sd); + spin_lock(&configfs_dirent_lock); + if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + + return sd; +} + +/* + * + * Return -EEXIST if there is already a configfs element with the same + * name for the same parent. + * + * called with parent inode's i_mutex held + */ +static int configfs_dirent_exists(struct configfs_dirent *parent_sd, + const unsigned char *new) +{ + struct configfs_dirent * sd; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_element) { + const unsigned char *existing = configfs_get_name(sd); + if (strcmp(existing, new)) + continue; + else + return -EEXIST; + } + } + + return 0; +} + + +int configfs_make_dirent(struct configfs_dirent * parent_sd, + struct dentry * dentry, void * element, + umode_t mode, int type) +{ + struct configfs_dirent * sd; + + sd = configfs_new_dirent(parent_sd, element, type); + if (IS_ERR(sd)) + return PTR_ERR(sd); + + sd->s_mode = mode; + sd->s_dentry = dentry; + if (dentry) + dentry->d_fsdata = configfs_get(sd); + + return 0; +} + +static int init_dir(struct inode * inode) +{ + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + return 0; +} + +static int configfs_init_file(struct inode * inode) +{ + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; + return 0; +} + +static int init_symlink(struct inode * inode) +{ + inode->i_op = &configfs_symlink_inode_operations; + return 0; +} + +static int create_dir(struct config_item * k, struct dentry * p, + struct dentry * d) +{ + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + + error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); + if (!error) + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR | CONFIGFS_USET_CREATING); + if (!error) { + configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata); + error = configfs_create(d, mode, init_dir); + if (!error) { + inc_nlink(p->d_inode); + } else { + struct configfs_dirent *sd = d->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } + } + return error; +} + + +/** + * configfs_create_dir - create a directory for an config_item. 
+ * @item: config_itemwe're creating directory for. + * @dentry: config_item's dentry. + * + * Note: user-created entries won't be allowed under this new directory + * until it is validated by configfs_dir_set_ready() + */ + +static int configfs_create_dir(struct config_item * item, struct dentry *dentry) +{ + struct dentry * parent; + int error = 0; + + BUG_ON(!item); + + if (item->ci_parent) + parent = item->ci_parent->ci_dentry; + else if (configfs_mount && configfs_mount->mnt_sb) + parent = configfs_mount->mnt_sb->s_root; + else + return -EFAULT; + + error = create_dir(item,parent,dentry); + if (!error) + item->ci_dentry = dentry; + return error; +} + +/* + * Allow userspace to create new entries under a new directory created with + * configfs_create_dir(), and under all of its chidlren directories recursively. + * @sd configfs_dirent of the new directory to validate + * + * Caller must hold configfs_dirent_lock. + */ +static void configfs_dir_set_ready(struct configfs_dirent *sd) +{ + struct configfs_dirent *child_sd; + + sd->s_type &= ~CONFIGFS_USET_CREATING; + list_for_each_entry(child_sd, &sd->s_children, s_sibling) + if (child_sd->s_type & CONFIGFS_USET_CREATING) + configfs_dir_set_ready(child_sd); +} + +/* + * Check that a directory does not belong to a directory hierarchy being + * attached and not validated yet. + * @sd configfs_dirent of the directory to check + * + * @return non-zero iff the directory was validated + * + * Note: takes configfs_dirent_lock, so the result may change from false to true + * in two consecutive calls, but never from true to false. + */ +int configfs_dirent_is_ready(struct configfs_dirent *sd) +{ + int ret; + + spin_lock(&configfs_dirent_lock); + ret = !(sd->s_type & CONFIGFS_USET_CREATING); + spin_unlock(&configfs_dirent_lock); + + return ret; +} + +int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry) +{ + int err = 0; + umode_t mode = S_IFLNK | S_IRWXUGO; + + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); + if (!err) { + err = configfs_create(dentry, mode, init_symlink); + if (err) { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } + } + return err; +} + +static void remove_dir(struct dentry * d) +{ + struct dentry * parent = dget(d->d_parent); + struct configfs_dirent * sd; + + sd = d->d_fsdata; + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + if (d->d_inode) + simple_rmdir(parent->d_inode,d); + + pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); + + dput(parent); +} + +/** + * configfs_remove_dir - remove an config_item's directory. + * @item: config_item we're removing. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be configfs_rmdir() below, instead of calling separately. + * + * Caller holds the mutex of the item's inode + */ + +static void configfs_remove_dir(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + + if (!dentry) + return; + + remove_dir(dentry); + /** + * Drop reference from dget() on entrance. 
+ */ + dput(dentry); +} + + +/* attaches attribute's configfs_dirent to the dentry corresponding to the + * attribute file + */ +static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) +{ + struct configfs_attribute * attr = sd->s_element; + int error; + + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, + configfs_init_file); + if (error) { + configfs_put(sd); + return error; + } + + d_rehash(dentry); + + return 0; +} + +static struct dentry * configfs_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; + struct configfs_dirent * sd; + int found = 0; + int err; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + * + * This forbids userspace to read/write attributes of items which may + * not complete their initialization, since the dentries of the + * attributes won't be instantiated. + */ + err = -ENOENT; + if (!configfs_dirent_is_ready(parent_sd)) + goto out; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) { + const unsigned char * name = configfs_get_name(sd); + + if (strcmp(name, dentry->d_name.name)) + continue; + + found = 1; + err = configfs_attach_attr(sd, dentry); + break; + } + } + + if (!found) { + /* + * If it doesn't exist and it isn't a NOT_PINNED item, + * it must be negative. + */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; + } + +out: + return ERR_PTR(err); +} + +/* + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes and are removed by rmdir(). We recurse, setting + * CONFIGFS_USET_DROPPING on all children that are candidates for + * default detach. + * If there is an error, the caller will reset the flags via + * configfs_detach_rollback(). + */ +static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + int ret; + + /* Mark that we're trying to drop the group */ + parent_sd->s_type |= CONFIGFS_USET_DROPPING; + + ret = -EBUSY; + if (!list_empty(&parent_sd->s_links)) + goto out; + + ret = 0; + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + (sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + /* Abort if racing with mkdir() */ + if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { + if (wait_mutex) + *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + return -EAGAIN; + } + + /* + * Yup, recursive. If there's a problem, blame + * deep nesting of default_groups + */ + ret = configfs_detach_prep(sd->s_dentry, wait_mutex); + if (!ret) + continue; + } else + ret = -ENOTEMPTY; + + break; + } + +out: + return ret; +} + +/* + * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was + * set. 
+ */ +static void configfs_detach_rollback(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + + parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) + if (sd->s_type & CONFIGFS_USET_DEFAULT) + configfs_detach_rollback(sd->s_dentry); +} + +static void detach_attrs(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + struct configfs_dirent * parent_sd; + struct configfs_dirent * sd, * tmp; + + if (!dentry) + return; + + pr_debug("configfs %s: dropping attrs for dir\n", + dentry->d_name.name); + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dentry); + configfs_put(sd); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +static int populate_attrs(struct config_item *item) +{ + struct config_item_type *t = item->ci_type; + struct configfs_attribute *attr; + int error = 0; + int i; + + if (!t) + return -EINVAL; + if (t->ct_attrs) { + for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { + if ((error = configfs_create_file(item, attr))) + break; + } + } + + if (error) + detach_attrs(item); + + return error; +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry); +static void configfs_detach_group(struct config_item *item); + +static void detach_groups(struct config_group *group) +{ + struct dentry * dentry = dget(group->cg_item.ci_dentry); + struct dentry *child; + struct configfs_dirent *parent_sd; + struct configfs_dirent *sd, *tmp; + + if (!dentry) + return; + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + !(sd->s_type & CONFIGFS_USET_DEFAULT)) + continue; + + child = sd->s_dentry; + + mutex_lock(&child->d_inode->i_mutex); + + configfs_detach_group(sd->s_element); + child->d_inode->i_flags |= S_DEAD; + dont_mount(child); + + mutex_unlock(&child->d_inode->i_mutex); + + d_delete(child); + dput(child); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +/* + * This fakes mkdir(2) on a default_groups[] entry. It + * creates a dentry, attachs it, and then does fixup + * on the sd->s_type. + * + * We could, perhaps, tweak our parent's ->mkdir for a minute and + * try using vfs_mkdir. Just a thought. 
+ */ +static int create_default_group(struct config_group *parent_group, + struct config_group *group) +{ + int ret; + struct qstr name; + struct configfs_dirent *sd; + /* We trust the caller holds a reference to parent */ + struct dentry *child, *parent = parent_group->cg_item.ci_dentry; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + ret = -ENOMEM; + child = d_alloc(parent, &name); + if (child) { + d_add(child, NULL); + + ret = configfs_attach_group(&parent_group->cg_item, + &group->cg_item, child); + if (!ret) { + sd = child->d_fsdata; + sd->s_type |= CONFIGFS_USET_DEFAULT; + } else { + BUG_ON(child->d_inode); + d_drop(child); + dput(child); + } + } + + return ret; +} + +static int populate_groups(struct config_group *group) +{ + struct config_group *new_group; + int ret = 0; + int i; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + + ret = create_default_group(group, new_group); + if (ret) { + detach_groups(group); + break; + } + } + } + + return ret; +} + +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + */ + +static void unlink_obj(struct config_item *item) +{ + struct config_group *group; + + group = item->ci_group; + if (group) { + list_del_init(&item->ci_entry); + + item->ci_group = NULL; + item->ci_parent = NULL; + + /* Drop the reference for ci_entry */ + config_item_put(item); + + /* Drop the reference for ci_parent */ + config_group_put(group); + } +} + +static void link_obj(struct config_item *parent_item, struct config_item *item) +{ + /* + * Parent seems redundant with group, but it makes certain + * traversals much nicer. + */ + item->ci_parent = parent_item; + + /* + * We hold a reference on the parent for the child's ci_parent + * link. + */ + item->ci_group = config_group_get(to_config_group(parent_item)); + list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + + /* + * We hold a reference on the child for ci_entry on the parent's + * cg_children + */ + config_item_get(item); +} + +static void unlink_group(struct config_group *group) +{ + int i; + struct config_group *new_group; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + unlink_group(new_group); + } + } + + group->cg_subsys = NULL; + unlink_obj(&group->cg_item); +} + +static void link_group(struct config_group *parent_group, struct config_group *group) +{ + int i; + struct config_group *new_group; + struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ + + link_obj(&parent_group->cg_item, &group->cg_item); + + if (parent_group->cg_subsys) + subsys = parent_group->cg_subsys; + else if (configfs_is_root(&parent_group->cg_item)) + subsys = to_configfs_subsystem(group); + else + BUG(); + group->cg_subsys = subsys; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + link_group(group, new_group); + } + } +} + +/* + * The goal is that configfs_attach_item() (and + * configfs_attach_group()) can be called from either the VFS or this + * module. That is, they assume that the items have been created, + * the dentry allocated, and the dcache is all ready to go. + * + * If they fail, they must clean up after themselves as if they + * had never been called. 
The caller (VFS or local function) will + * handle cleaning up the dcache bits. + * + * configfs_detach_group() and configfs_detach_item() behave similarly on + * the way out. They assume that the proper semaphores are held, they + * clean up the configfs items, and they expect their callers will + * handle the dcache bits. + */ +static int configfs_attach_item(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + + ret = configfs_create_dir(item, dentry); + if (!ret) { + ret = populate_attrs(item); + if (ret) { + /* + * We are going to remove an inode and its dentry but + * the VFS may already have hit and used them. Thus, + * we must lock them as rmdir() would. + */ + mutex_lock(&dentry->d_inode->i_mutex); + configfs_remove_dir(item); + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + d_delete(dentry); + } + } + + return ret; +} + +/* Caller holds the mutex of the item's inode */ +static void configfs_detach_item(struct config_item *item) +{ + detach_attrs(item); + configfs_remove_dir(item); +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + struct configfs_dirent *sd; + + ret = configfs_attach_item(parent_item, item, dentry); + if (!ret) { + sd = dentry->d_fsdata; + sd->s_type |= CONFIGFS_USET_DIR; + + /* + * FYI, we're faking mkdir in populate_groups() + * We must lock the group's inode to avoid races with the VFS + * which can already hit the inode and try to add/remove entries + * under it. + * + * We must also lock the inode to remove it safely in case of + * error, as rmdir() would. + */ + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + configfs_adjust_dir_dirent_depth_before_populate(sd); + ret = populate_groups(to_config_group(item)); + if (ret) { + configfs_detach_item(item); + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + } + configfs_adjust_dir_dirent_depth_after_populate(sd); + mutex_unlock(&dentry->d_inode->i_mutex); + if (ret) + d_delete(dentry); + } + + return ret; +} + +/* Caller holds the mutex of the group's inode */ +static void configfs_detach_group(struct config_item *item) +{ + detach_groups(to_config_group(item)); + configfs_detach_item(item); +} + +/* + * After the item has been detached from the filesystem view, we are + * ready to tear it out of the hierarchy. Notify the client before + * we do that so they can perform any cleanup that requires + * navigating the hierarchy. A client does not need to provide this + * callback. The subsystem semaphore MUST be held by the caller, and + * references must be valid for both items. It also assumes the + * caller has validated ci_type. + */ +static void client_disconnect_notify(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) + type->ct_group_ops->disconnect_notify(to_config_group(parent_item), + item); +} + +/* + * Drop the initial reference from make_item()/make_group() + * This function assumes that reference is held on item + * and that item holds a valid reference to the parent. Also, it + * assumes the caller has validated ci_type. 
+ */ +static void client_drop_item(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + /* + * If ->drop_item() exists, it is responsible for the + * config_item_put(). + */ + if (type->ct_group_ops && type->ct_group_ops->drop_item) + type->ct_group_ops->drop_item(to_config_group(parent_item), + item); + else + config_item_put(item); +} + +#ifdef DEBUG +static void configfs_dump_one(struct configfs_dirent *sd, int level) +{ + printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + +#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); + type_print(CONFIGFS_ROOT); + type_print(CONFIGFS_DIR); + type_print(CONFIGFS_ITEM_ATTR); + type_print(CONFIGFS_ITEM_LINK); + type_print(CONFIGFS_USET_DIR); + type_print(CONFIGFS_USET_DEFAULT); + type_print(CONFIGFS_USET_DROPPING); +#undef type_print +} + +static int configfs_dump(struct configfs_dirent *sd, int level) +{ + struct configfs_dirent *child_sd; + int ret = 0; + + configfs_dump_one(sd, level); + + if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) + return 0; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + ret = configfs_dump(child_sd, level + 2); + if (ret) + break; + } + + return ret; +} +#endif + + +/* + * configfs_depend_item() and configfs_undepend_item() + * + * WARNING: Do not call these from a configfs callback! + * + * This describes these functions and their helpers. + * + * Allow another kernel system to depend on a config_item. If this + * happens, the item cannot go away until the dependent can live without + * it. The idea is to give client modules as simple an interface as + * possible. When a system asks them to depend on an item, they just + * call configfs_depend_item(). If the item is live and the client + * driver is in good shape, we'll happily do the work for them. + * + * Why is the locking complex? Because configfs uses the VFS to handle + * all locking, but this function is called outside the normal + * VFS->configfs path. So it must take VFS locks to prevent the + * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is + * why you can't call these functions underneath configfs callbacks. + * + * Note, btw, that this can be called at *any* time, even when a configfs + * subsystem isn't registered, or when configfs is loading or unloading. + * Just like configfs_register_subsystem(). So we take the same + * precautions. We pin the filesystem. We lock configfs_dirent_lock. + * If we can find the target item in the + * configfs tree, it must be part of the subsystem tree as well, so we + * do not need the subsystem semaphore. Holding configfs_dirent_lock helps + * locking out mkdir() and rmdir(), who might be racing us. + */ + +/* + * configfs_depend_prep() + * + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes. This is similar but not the same to configfs_detach_prep(). + * Note that configfs_detach_prep() expects the parent to be locked when it + * is called, but we lock the parent *inside* configfs_depend_prep(). We + * do that so we can unlock it if we find nothing. + * + * Here we do a depth-first search of the dentry hierarchy looking for + * our object. + * We deliberately ignore items tagged as dropping since they are virtually + * dead, as well as items in the middle of attachment since they virtually + * do not exist yet. This completes the locking out of racing mkdir() and + * rmdir(). 
+ * Note: subdirectories in the middle of attachment start with s_type = + * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When + * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of + * s_type is in configfs_new_dirent(), which has configfs_dirent_lock. + * + * If the target is not found, -ENOENT is bubbled up. + * + * This adds a requirement that all config_items be unique! + * + * This is recursive. There isn't + * much on the stack, though, so folks that need this function - be careful + * about your stack! Patches will be accepted to make it iterative. + */ +static int configfs_depend_prep(struct dentry *origin, + struct config_item *target) +{ + struct configfs_dirent *child_sd, *sd = origin->d_fsdata; + int ret = 0; + + BUG_ON(!origin || !sd); + + if (sd->s_element == target) /* Boo-yah */ + goto out; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + if ((child_sd->s_type & CONFIGFS_DIR) && + !(child_sd->s_type & CONFIGFS_USET_DROPPING) && + !(child_sd->s_type & CONFIGFS_USET_CREATING)) { + ret = configfs_depend_prep(child_sd->s_dentry, + target); + if (!ret) + goto out; /* Child path boo-yah */ + } + } + + /* We looped all our children and didn't find target */ + ret = -ENOENT; + +out: + return ret; +} + +int configfs_depend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + int ret; + struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct config_item *s_item = &subsys->su_group.cg_item; + + /* + * Pin the configfs filesystem. This means we can safely access + * the root of the configfs filesystem. + */ + ret = configfs_pin_fs(); + if (ret) + return ret; + + /* + * Next, lock the root directory. We're going to check that the + * subsystem is really registered, and so we need to lock out + * configfs_[un]register_subsystem(). + */ + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + + root_sd = configfs_sb->s_root->d_fsdata; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR) { + if (p->s_element == s_item) { + subsys_sd = p; + break; + } + } + } + + if (!subsys_sd) { + ret = -ENOENT; + goto out_unlock_fs; + } + + /* Ok, now we can trust subsys/s_item */ + + spin_lock(&configfs_dirent_lock); + /* Scan the tree, return 0 if found */ + ret = configfs_depend_prep(subsys_sd->s_dentry, target); + if (ret) + goto out_unlock_dirent_lock; + + /* + * We are sure that the item is not about to be removed by rmdir(), and + * not in the middle of attachment by mkdir(). + */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + +out_unlock_dirent_lock: + spin_unlock(&configfs_dirent_lock); +out_unlock_fs: + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + /* + * If we succeeded, the fs is pinned via other methods. If not, + * we're done with it anyway. So release_fs() is always right. + */ + configfs_release_fs(); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item); + +/* + * Release the dependent linkage. This is much simpler than + * configfs_depend_item() because we know that that the client driver is + * pinned, thus the subsystem is pinned, and therefore configfs is pinned. + */ +void configfs_undepend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + struct configfs_dirent *sd; + + /* + * Since we can trust everything is pinned, we just need + * configfs_dirent_lock. 
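+ *
+ * For reference, a dependent kernel system pairs the two calls roughly
+ * like this (an illustrative sketch only, with error handling and the
+ * caller's own locking elided):
+ *
+ *	ret = configfs_depend_item(subsys, item);
+ *	if (ret)
+ *		return ret;
+ *	...use the item; rmdir(2) on it now fails with -EBUSY...
+ *	configfs_undepend_item(subsys, item);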
+ */ + spin_lock(&configfs_dirent_lock); + + sd = target->ci_dentry->d_fsdata; + BUG_ON(sd->s_dependent_count < 1); + + sd->s_dependent_count -= 1; + + /* + * After this unlock, we cannot trust the item to stay alive! + * DO NOT REFERENCE item after this unlock. + */ + spin_unlock(&configfs_dirent_lock); +} +EXPORT_SYMBOL(configfs_undepend_item); + +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int ret = 0; + int module_got = 0; + struct config_group *group = NULL; + struct config_item *item = NULL; + struct config_item *parent_item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct config_item_type *type; + struct module *subsys_owner = NULL, *new_item_owner = NULL; + char *name; + + if (dentry->d_parent == configfs_sb->s_root) { + ret = -EPERM; + goto out; + } + + sd = dentry->d_parent->d_fsdata; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + if (!configfs_dirent_is_ready(sd)) { + ret = -ENOENT; + goto out; + } + + if (!(sd->s_type & CONFIGFS_USET_DIR)) { + ret = -EPERM; + goto out; + } + + /* Get a working ref for the duration of this function */ + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!type || !type->ct_group_ops || + (!type->ct_group_ops->make_group && + !type->ct_group_ops->make_item)) { + ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ + goto out_put; + } + + /* + * The subsystem may belong to a different module than the item + * being created. We don't want to safely pin the new item but + * fail to pin the subsystem it sits under. + */ + if (!subsys->su_group.cg_item.ci_type) { + ret = -EINVAL; + goto out_put; + } + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + if (!try_module_get(subsys_owner)) { + ret = -EINVAL; + goto out_put; + } + + name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); + if (!name) { + ret = -ENOMEM; + goto out_subsys_put; + } + + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); + + mutex_lock(&subsys->su_mutex); + if (type->ct_group_ops->make_group) { + group = type->ct_group_ops->make_group(to_config_group(parent_item), name); + if (!group) + group = ERR_PTR(-ENOMEM); + if (!IS_ERR(group)) { + link_group(to_config_group(parent_item), group); + item = &group->cg_item; + } else + ret = PTR_ERR(group); + } else { + item = type->ct_group_ops->make_item(to_config_group(parent_item), name); + if (!item) + item = ERR_PTR(-ENOMEM); + if (!IS_ERR(item)) + link_obj(parent_item, item); + else + ret = PTR_ERR(item); + } + mutex_unlock(&subsys->su_mutex); + + kfree(name); + if (ret) { + /* + * If ret != 0, then link_obj() was never called. + * There are no extra references to clean up. + */ + goto out_subsys_put; + } + + /* + * link_obj() has been called (via link_group() for groups). + * From here on out, errors must clean that up. + */ + + type = item->ci_type; + if (!type) { + ret = -EINVAL; + goto out_unlink; + } + + new_item_owner = type->ct_owner; + if (!try_module_get(new_item_owner)) { + ret = -EINVAL; + goto out_unlink; + } + + /* + * I hate doing it this way, but if there is + * an error, module_put() probably should + * happen after any cleanup. 
+ */ + module_got = 1; + + /* + * Make racing rmdir() fail if it did not tag parent with + * CONFIGFS_USET_DROPPING + * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will + * fail and let rmdir() terminate correctly + */ + spin_lock(&configfs_dirent_lock); + /* This will make configfs_detach_prep() fail */ + sd->s_type |= CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + + if (group) + ret = configfs_attach_group(parent_item, item, dentry); + else + ret = configfs_attach_item(parent_item, item, dentry); + + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; + if (!ret) + configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + +out_unlink: + if (ret) { + /* Tear down everything we built up */ + mutex_lock(&subsys->su_mutex); + + client_disconnect_notify(parent_item, item); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + + mutex_unlock(&subsys->su_mutex); + + if (module_got) + module_put(new_item_owner); + } + +out_subsys_put: + if (ret) + module_put(subsys_owner); + +out_put: + /* + * link_obj()/link_group() took a reference from child->parent, + * so the parent is safely pinned. We can drop our working + * reference. + */ + config_item_put(parent_item); + +out: + return ret; +} + +static int configfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct config_item *parent_item; + struct config_item *item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct module *subsys_owner = NULL, *dead_item_owner = NULL; + int ret; + + if (dentry->d_parent == configfs_sb->s_root) + return -EPERM; + + sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_USET_DEFAULT) + return -EPERM; + + /* Get a working ref until we have the child */ + parent_item = configfs_get_config_item(dentry->d_parent); + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!parent_item->ci_type) { + config_item_put(parent_item); + return -EINVAL; + } + + /* configfs_mkdir() shouldn't have allowed this */ + BUG_ON(!subsys->su_group.cg_item.ci_type); + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + + /* + * Ensure that no racing symlink() will make detach_prep() fail while + * the new link is temporarily attached + */ + do { + struct mutex *wait_mutex; + + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + /* + * Here's where we check for dependents. We're protected by + * configfs_dirent_lock. + * If no dependent, atomically tag the item as dropping. + */ + ret = sd->s_dependent_count ? -EBUSY : 0; + if (!ret) { + ret = configfs_detach_prep(dentry, &wait_mutex); + if (ret) + configfs_detach_rollback(dentry); + } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { + if (ret != -EAGAIN) { + config_item_put(parent_item); + return ret; + } + + /* Wait until the racing operation terminates */ + mutex_lock(wait_mutex); + mutex_unlock(wait_mutex); + } + } while (ret == -EAGAIN); + + /* Get a working ref for the duration of this function */ + item = configfs_get_config_item(dentry); + + /* Drop reference from above, item already holds one. 
*/ + config_item_put(parent_item); + + if (item->ci_type) + dead_item_owner = item->ci_type->ct_owner; + + if (sd->s_type & CONFIGFS_USET_DIR) { + configfs_detach_group(item); + + mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); + unlink_group(to_config_group(item)); + } else { + configfs_detach_item(item); + + mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); + unlink_obj(item); + } + + client_drop_item(parent_item, item); + mutex_unlock(&subsys->su_mutex); + + /* Drop our reference from above */ + config_item_put(item); + + module_put(dead_item_owner); + module_put(subsys_owner); + + return 0; +} + +const struct inode_operations configfs_dir_inode_operations = { + .mkdir = configfs_mkdir, + .rmdir = configfs_rmdir, + .symlink = configfs_symlink, + .unlink = configfs_unlink, + .lookup = configfs_lookup, + .setattr = configfs_setattr, +}; + +#if 0 +int configfs_rename_dir(struct config_item * item, const char *new_name) +{ + int error = 0; + struct dentry * new_dentry, * parent; + + if (!strcmp(config_item_name(item), new_name)) + return -EINVAL; + + if (!item->parent) + return -EINVAL; + + down_write(&configfs_rename_sem); + parent = item->parent->dentry; + + mutex_lock(&parent->d_inode->i_mutex); + + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + if (!IS_ERR(new_dentry)) { + if (!new_dentry->d_inode) { + error = config_item_set_name(item, "%s", new_name); + if (!error) { + d_add(new_dentry, NULL); + d_move(item->dentry, new_dentry); + } + else + d_delete(new_dentry); + } else + error = -EEXIST; + dput(new_dentry); + } + mutex_unlock(&parent->d_inode->i_mutex); + up_write(&configfs_rename_sem); + + return error; +} +#endif + +static int configfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_path.dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + int err; + + mutex_lock(&dentry->d_inode->i_mutex); + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + err = -ENOENT; + if (configfs_dirent_is_ready(parent_sd)) { + file->private_data = configfs_new_dirent(parent_sd, NULL, 0); + if (IS_ERR(file->private_data)) + err = PTR_ERR(file->private_data); + else + err = 0; + } + mutex_unlock(&dentry->d_inode->i_mutex); + + return err; +} + +static int configfs_dir_close(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_path.dentry; + struct configfs_dirent * cursor = file->private_data; + + mutex_lock(&dentry->d_inode->i_mutex); + spin_lock(&configfs_dirent_lock); + list_del_init(&cursor->s_sibling); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&dentry->d_inode->i_mutex); + + release_configfs_dirent(cursor); + + return 0; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct configfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + struct configfs_dirent *cursor = filp->private_data; + struct list_head *p, *q = &cursor->s_sibling; + ino_t ino = 0; + int i = filp->f_pos; + + switch (i) { + case 0: + ino = dentry->d_inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; 
+ i++; + /* fallthrough */ + default: + if (filp->f_pos == 2) { + spin_lock(&configfs_dirent_lock); + list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + } + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { + struct configfs_dirent *next; + const char * name; + int len; + struct inode *inode = NULL; + + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (!next->s_element) + continue; + + name = configfs_get_name(next); + len = strlen(name); + + /* + * We'll have a dentry and an inode for + * PINNED items and for open attribute + * files. We lock here to prevent a race + * with configfs_d_iput() clearing + * s_dentry before calling iput(). + * + * Why do we go to the trouble? If + * someone has an attribute file open, + * the inode number should match until + * they close it. Beyond that, we don't + * care. + */ + spin_lock(&configfs_dirent_lock); + dentry = next->s_dentry; + if (dentry) + inode = dentry->d_inode; + if (inode) + ino = inode->i_ino; + spin_unlock(&configfs_dirent_lock); + if (!inode) + ino = iunique(configfs_sb, 2); + + if (filldir(dirent, name, len, filp->f_pos, ino, + dt_type(next)) < 0) + return 0; + + spin_lock(&configfs_dirent_lock); + list_move(q, p); + spin_unlock(&configfs_dirent_lock); + p = q; + filp->f_pos++; + } + } + return 0; +} + +static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) +{ + struct dentry * dentry = file->f_path.dentry; + + mutex_lock(&dentry->d_inode->i_mutex); + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_dirent *cursor = file->private_data; + struct list_head *p; + loff_t n = file->f_pos - 2; + + spin_lock(&configfs_dirent_lock); + list_del(&cursor->s_sibling); + p = sd->s_children.next; + while (n && p != &sd->s_children) { + struct configfs_dirent *next; + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (next->s_element) + n--; + p = p->next; + } + list_add_tail(&cursor->s_sibling, p); + spin_unlock(&configfs_dirent_lock); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + return offset; +} + +const struct file_operations configfs_dir_operations = { + .open = configfs_dir_open, + .release = configfs_dir_close, + .llseek = configfs_dir_lseek, + .read = generic_read_dir, + .readdir = configfs_readdir, +}; + +int configfs_register_subsystem(struct configfs_subsystem *subsys) +{ + int err; + struct config_group *group = &subsys->su_group; + struct qstr name; + struct dentry *dentry; + struct configfs_dirent *sd; + + err = configfs_pin_fs(); + if (err) + return err; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + + sd = configfs_sb->s_root->d_fsdata; + link_group(to_config_group(sd->s_element), group); + + mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, + I_MUTEX_PARENT); + + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + err = -ENOMEM; + dentry = d_alloc(configfs_sb->s_root, &name); + if (dentry) { + d_add(dentry, NULL); + + err = configfs_attach_group(sd->s_element, &group->cg_item, + dentry); + if (err) { + BUG_ON(dentry->d_inode); + d_drop(dentry); + dput(dentry); + } else { + spin_lock(&configfs_dirent_lock); + 
configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + } + } + + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + if (err) { + unlink_group(group); + configfs_release_fs(); + } + + return err; +} + +void configfs_unregister_subsystem(struct configfs_subsystem *subsys) +{ + struct config_group *group = &subsys->su_group; + struct dentry *dentry = group->cg_item.ci_dentry; + + if (dentry->d_parent != configfs_sb->s_root) { + printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + return; + } + + mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, + I_MUTEX_PARENT); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + if (configfs_detach_prep(dentry, NULL)) { + printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + configfs_detach_group(&group->cg_item); + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + + d_delete(dentry); + + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + dput(dentry); + + unlink_group(group); + configfs_release_fs(); +} + +EXPORT_SYMBOL(configfs_register_subsystem); +EXPORT_SYMBOL(configfs_unregister_subsystem); diff --git a/fs/configfs/file.c b/fs/configfs/file.c new file mode 100644 index 00000000..2b6cb23d --- /dev/null +++ b/fs/configfs/file.c @@ -0,0 +1,344 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c - operations for regular (text) files. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* + * A simple attribute can only be 4096 characters. Why 4k? Because the + * original code limited it to PAGE_SIZE. That's a bad idea, though, + * because an attribute of 16k on ia64 won't work on x86. So we limit to + * 4k, our minimum common page size. + */ +#define SIMPLE_ATTR_SIZE 4096 + +struct configfs_buffer { + size_t count; + loff_t pos; + char * page; + struct configfs_item_operations * ops; + struct mutex mutex; + int needs_read_fill; +}; + + +/** + * fill_read_buffer - allocate and fill buffer from item. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * config_item's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read. 
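+ *
+ * The show() method in question is ct_item_ops->show_attribute() of the
+ * owning item's type.  A minimal implementation (sketch only; foo_item
+ * and its "value" member are invented names) just formats its state
+ * into the page and returns the length:
+ *
+ *	static ssize_t foo_show(struct config_item *item,
+ *				struct configfs_attribute *attr, char *page)
+ *	{
+ *		return sprintf(page, "%d\n", to_foo_item(item)->value);
+ *	}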
+ */ +static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + count = ops->show_attribute(item,attr,buffer->page); + buffer->needs_read_fill = 0; + BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); + if (count >= 0) + buffer->count = count; + else + ret = count; + return ret; +} + +/** + * configfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target item is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * item's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the item has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + mutex_lock(&buffer->mutex); + if (buffer->needs_read_fill) { + if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) + goto out; + } + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __func__, count, *ppos, buffer->page); + retval = simple_read_from_buffer(buf, count, ppos, buffer->page, + buffer->count); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @buf: data from user. + * @count: number of bytes in @userbuf. + * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. + */ + +static int +fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); + if (!buffer->page) + return -ENOMEM; + + if (count >= SIMPLE_ATTR_SIZE) + count = SIMPLE_ATTR_SIZE - 1; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + * so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to config_item. + * @dentry: dentry to the attribute + * @buffer: data buffer for file. + * @count: number of bytes + * + * Get the correct pointers for the config_item and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + + return ops->store_attribute(item,attr,buffer->page,count); +} + + +/** + * configfs_write_file - write an attribute. 
+ * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to configfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the config_item in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * the value you're changing, then write entire buffer back. + */ + +static ssize_t +configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t len; + + mutex_lock(&buffer->mutex); + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_path.dentry, buffer, count); + if (len > 0) + *ppos += len; + mutex_unlock(&buffer->mutex); + return len; +} + +static int check_perm(struct inode * inode, struct file * file) +{ + struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(file->f_path.dentry); + struct configfs_buffer * buffer; + struct configfs_item_operations * ops = NULL; + int error = 0; + + if (!item || !attr) + goto Einval; + + /* Grab the module reference for this attribute if we have one */ + if (!try_module_get(attr->ca_owner)) { + error = -ENODEV; + goto Done; + } + + if (item->ci_type) + ops = item->ci_type->ct_item_ops; + else + goto Eaccess; + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + + if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) + goto Eaccess; + + } + + /* File needs read support. + * The inode's perms must say it's ok, and we there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) + goto Eaccess; + } + + /* No error? Great, allocate a buffer for the file, and store it + * it in file->private_data for easy access. + */ + buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); + if (!buffer) { + error = -ENOMEM; + goto Enomem; + } + mutex_init(&buffer->mutex); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + goto Done; + + Einval: + error = -EINVAL; + goto Done; + Eaccess: + error = -EACCES; + Enomem: + module_put(attr->ca_owner); + Done: + if (error && item) + config_item_put(item); + return error; +} + +static int configfs_open_file(struct inode * inode, struct file * filp) +{ + return check_perm(inode,filp); +} + +static int configfs_release(struct inode * inode, struct file * filp) +{ + struct config_item * item = to_item(filp->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(filp->f_path.dentry); + struct module * owner = attr->ca_owner; + struct configfs_buffer * buffer = filp->private_data; + + if (item) + config_item_put(item); + /* After this point, attr should not be accessed. 
*/ + module_put(owner); + + if (buffer) { + if (buffer->page) + free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); + kfree(buffer); + } + return 0; +} + +const struct file_operations configfs_file_operations = { + .read = configfs_read_file, + .write = configfs_write_file, + .llseek = generic_file_llseek, + .open = configfs_open_file, + .release = configfs_release, +}; + + +int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) +{ + struct configfs_dirent * parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); + mutex_unlock(&dir->d_inode->i_mutex); + + return error; +} + + +/** + * configfs_create_file - create an attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. + */ + +int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) +{ + BUG_ON(!item || !item->ci_dentry || !attr); + + return configfs_add_file(item->ci_dentry, attr, + CONFIGFS_ITEM_ATTR); +} + diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c new file mode 100644 index 00000000..c83f4768 --- /dev/null +++ b/fs/configfs/inode.c @@ -0,0 +1,297 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c - basic inode and dentry operations. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see Documentation/filesystems/configfs.txt for more information. 
+ */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; +#endif + +extern struct super_block * configfs_sb; + +static const struct address_space_operations configfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + /* attributes were changed atleast once in past */ + + error = simple_setattr(dentry, iattr); + if (error) + return error; + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) +{ + struct inode * inode = new_inode(configfs_sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); + } + return inode; +} + +#ifdef CONFIG_LOCKDEP + +static void configfs_set_inode_lock_class(struct configfs_dirent *sd, + struct inode *inode) +{ + int depth = sd->s_depth; + + if (depth > 0) { + if (depth <= ARRAY_SIZE(default_group_class)) { + lockdep_set_class(&inode->i_mutex, + 
&default_group_class[depth - 1]); + } else { + /* + * In practice the maximum level of locking depth is + * already reached. Just inform about possible reasons. + */ + printk(KERN_INFO "configfs: Too many levels of inodes" + " for the locking correctness validator.\n"); + printk(KERN_INFO "Spurious warnings may appear.\n"); + } + } +} + +#else /* CONFIG_LOCKDEP */ + +static void configfs_set_inode_lock_class(struct configfs_dirent *sd, + struct inode *inode) +{ +} + +#endif /* CONFIG_LOCKDEP */ + +int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +{ + int error = 0; + struct inode * inode = NULL; + if (dentry) { + if (!dentry->d_inode) { + struct configfs_dirent *sd = dentry->d_fsdata; + if ((inode = configfs_new_inode(mode, sd))) { + if (dentry->d_parent && dentry->d_parent->d_inode) { + struct inode *p_inode = dentry->d_parent->d_inode; + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + } + configfs_set_inode_lock_class(sd, inode); + goto Proceed; + } + else + error = -ENOMEM; + } else + error = -EEXIST; + } else + error = -ENOENT; + goto Done; + + Proceed: + if (init) + error = init(inode); + if (!error) { + d_instantiate(dentry, inode); + if (S_ISDIR(mode) || S_ISLNK(mode)) + dget(dentry); /* pin link and directory dentries in core */ + } else + iput(inode); + Done: + return error; +} + +/* + * Get the name for corresponding element represented by the given configfs_dirent + */ +const unsigned char * configfs_get_name(struct configfs_dirent *sd) +{ + struct configfs_attribute *attr; + + BUG_ON(!sd || !sd->s_element); + + /* These always have a dentry, so use that */ + if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) + return sd->s_dentry->d_name.name; + + if (sd->s_type & CONFIGFS_ITEM_ATTR) { + attr = sd->s_element; + return attr->ca_name; + } + return NULL; +} + + +/* + * Unhashes the dentry corresponding to given configfs_dirent + * Called with parent inode's i_mutex held. 
+ */ +void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) +{ + struct dentry * dentry = sd->s_dentry; + + if (dentry) { + spin_lock(&dentry->d_lock); + if (!(d_unhashed(dentry) && dentry->d_inode)) { + dget_dlock(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + simple_unlink(parent->d_inode, dentry); + } else + spin_unlock(&dentry->d_lock); + } +} + +void configfs_hash_and_remove(struct dentry * dir, const char * name) +{ + struct configfs_dirent * sd; + struct configfs_dirent * parent_sd = dir->d_fsdata; + + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element) + continue; + if (!strcmp(configfs_get_name(sd), name)) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dir); + configfs_put(sd); + break; + } + } + mutex_unlock(&dir->d_inode->i_mutex); +} + +int __init configfs_inode_init(void) +{ + return bdi_init(&configfs_backing_dev_info); +} + +void __exit configfs_inode_exit(void) +{ + bdi_destroy(&configfs_backing_dev_info); +} diff --git a/fs/configfs/item.c b/fs/configfs/item.c new file mode 100644 index 00000000..76dc4c3e --- /dev/null +++ b/fs/configfs/item.c @@ -0,0 +1,216 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * item.c - library routines for handling generic config items + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on kobject: + * kobject is Copyright (c) 2002-2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see the file Documentation/filesystems/configfs.txt for + * critical information about using the config_item interface. + */ + +#include +#include +#include +#include + +#include + + +static inline struct config_item * to_item(struct list_head * entry) +{ + return container_of(entry,struct config_item,ci_entry); +} + +/* Evil kernel */ +static void config_item_release(struct kref *kref); + +/** + * config_item_init - initialize item. + * @item: item in question. + */ +void config_item_init(struct config_item * item) +{ + kref_init(&item->ci_kref); + INIT_LIST_HEAD(&item->ci_entry); +} + +/** + * config_item_set_name - Set the name of an item + * @item: item. + * @name: name. + * + * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a + * dynamically allocated string that @item->ci_name points to. + * Otherwise, use the static @item->ci_namebuf array. + */ +int config_item_set_name(struct config_item * item, const char * fmt, ...) 
+{ + int error = 0; + int limit = CONFIGFS_ITEM_NAME_LEN; + int need; + va_list args; + char * name; + + /* + * First, try the static array + */ + va_start(args,fmt); + need = vsnprintf(item->ci_namebuf,limit,fmt,args); + va_end(args); + if (need < limit) + name = item->ci_namebuf; + else { + /* + * Need more space? Allocate it and try again + */ + limit = need + 1; + name = kmalloc(limit,GFP_KERNEL); + if (!name) { + error = -ENOMEM; + goto Done; + } + va_start(args,fmt); + need = vsnprintf(name,limit,fmt,args); + va_end(args); + + /* Still? Give up. */ + if (need >= limit) { + kfree(name); + error = -EFAULT; + goto Done; + } + } + + /* Free the old name, if necessary. */ + if (item->ci_name && item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + + /* Now, set the new name */ + item->ci_name = name; + Done: + return error; +} + +EXPORT_SYMBOL(config_item_set_name); + +void config_item_init_type_name(struct config_item *item, + const char *name, + struct config_item_type *type) +{ + config_item_set_name(item, name); + item->ci_type = type; + config_item_init(item); +} +EXPORT_SYMBOL(config_item_init_type_name); + +void config_group_init_type_name(struct config_group *group, const char *name, + struct config_item_type *type) +{ + config_item_set_name(&group->cg_item, name); + group->cg_item.ci_type = type; + config_group_init(group); +} +EXPORT_SYMBOL(config_group_init_type_name); + +struct config_item * config_item_get(struct config_item * item) +{ + if (item) + kref_get(&item->ci_kref); + return item; +} + +static void config_item_cleanup(struct config_item * item) +{ + struct config_item_type * t = item->ci_type; + struct config_group * s = item->ci_group; + struct config_item * parent = item->ci_parent; + + pr_debug("config_item %s: cleaning up\n",config_item_name(item)); + if (item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + item->ci_name = NULL; + if (t && t->ct_item_ops && t->ct_item_ops->release) + t->ct_item_ops->release(item); + if (s) + config_group_put(s); + if (parent) + config_item_put(parent); +} + +static void config_item_release(struct kref *kref) +{ + config_item_cleanup(container_of(kref, struct config_item, ci_kref)); +} + +/** + * config_item_put - decrement refcount for item. + * @item: item. + * + * Decrement the refcount, and if 0, call config_item_cleanup(). + */ +void config_item_put(struct config_item * item) +{ + if (item) + kref_put(&item->ci_kref, config_item_release); +} + +/** + * config_group_init - initialize a group for use + * @k: group + */ +void config_group_init(struct config_group *group) +{ + config_item_init(&group->cg_item); + INIT_LIST_HEAD(&group->cg_children); +} + +/** + * config_group_find_item - search for item in group. + * @group: group we're looking in. + * @name: item's name. + * + * Iterate over @group->cg_list, looking for a matching config_item. + * If matching item is found take a reference and return the item. + * Caller must have locked group via @group->cg_subsys->su_mtx. 
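+ *
+ * A typical caller (sketch only) takes the subsystem mutex around the
+ * lookup and drops the returned reference with config_item_put() when
+ * it is done with the item:
+ *
+ *	mutex_lock(&group->cg_subsys->su_mutex);
+ *	item = config_group_find_item(group, "name");
+ *	mutex_unlock(&group->cg_subsys->su_mutex);
+ *	if (item) {
+ *		...use the item...
+ *		config_item_put(item);
+ *	}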
+ */ +struct config_item *config_group_find_item(struct config_group *group, + const char *name) +{ + struct list_head * entry; + struct config_item * ret = NULL; + + list_for_each(entry,&group->cg_children) { + struct config_item * item = to_item(entry); + if (config_item_name(item) && + !strcmp(config_item_name(item), name)) { + ret = config_item_get(item); + break; + } + } + return ret; +} + +EXPORT_SYMBOL(config_item_init); +EXPORT_SYMBOL(config_group_init); +EXPORT_SYMBOL(config_item_get); +EXPORT_SYMBOL(config_item_put); +EXPORT_SYMBOL(config_group_find_item); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c new file mode 100644 index 00000000..ecc62178 --- /dev/null +++ b/fs/configfs/mount.c @@ -0,0 +1,187 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mount.c - operations for initializing and mounting configfs. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Random magic number */ +#define CONFIGFS_MAGIC 0x62656570 + +struct vfsmount * configfs_mount = NULL; +struct super_block * configfs_sb = NULL; +struct kmem_cache *configfs_dir_cachep; +static int configfs_mnt_count = 0; + +static const struct super_operations configfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, +}; + +static struct config_group configfs_root_group = { + .cg_item = { + .ci_namebuf = "root", + .ci_name = configfs_root_group.cg_item.ci_namebuf, + }, +}; + +int configfs_is_root(struct config_item *item) +{ + return item == &configfs_root_group.cg_item; +} + +static struct configfs_dirent configfs_root = { + .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), + .s_children = LIST_HEAD_INIT(configfs_root.s_children), + .s_element = &configfs_root_group.cg_item, + .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, +}; + +static int configfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = CONFIGFS_MAGIC; + sb->s_op = &configfs_ops; + sb->s_time_gran = 1; + configfs_sb = sb; + + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root); + if (inode) { + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inc_nlink(inode); + } else { + pr_debug("configfs: could not get root inode\n"); + return -ENOMEM; + } + + root = d_alloc_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n",__func__); + iput(inode); + return -ENOMEM; + } + config_group_init(&configfs_root_group); + configfs_root_group.cg_item.ci_dentry = root; + root->d_fsdata = &configfs_root; + sb->s_root = root; + sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ + return 0; +} + +static struct dentry *configfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, configfs_fill_super); +} + +static struct file_system_type configfs_fs_type = { + .owner = THIS_MODULE, + .name = "configfs", + .mount = configfs_do_mount, + .kill_sb = kill_litter_super, +}; + +int configfs_pin_fs(void) +{ + return simple_pin_fs(&configfs_fs_type, &configfs_mount, + &configfs_mnt_count); +} + +void configfs_release_fs(void) +{ + simple_release_fs(&configfs_mount, &configfs_mnt_count); +} + + +static struct kobject *config_kobj; + +static int __init configfs_init(void) +{ + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL); + if (!configfs_dir_cachep) + goto out; + + config_kobj = kobject_create_and_add("config", kernel_kobj); + if (!config_kobj) { + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } + + err = register_filesystem(&configfs_fs_type); + if (err) { + printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + kobject_put(config_kobj); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } + + err = configfs_inode_init(); + if (err) { + unregister_filesystem(&configfs_fs_type); + kobject_put(config_kobj); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + } +out: + return err; +} + +static void __exit configfs_exit(void) +{ + unregister_filesystem(&configfs_fs_type); + kobject_put(config_kobj); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + configfs_inode_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.0.2"); +MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); + +module_init(configfs_init); +module_exit(configfs_exit); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c new file mode 100644 index 00000000..0f3eb41d --- /dev/null +++ b/fs/configfs/symlink.c @@ -0,0 +1,320 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.c - operations for configfs symlinks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Protects attachments of new symlinks */ +DEFINE_MUTEX(configfs_symlink_mutex); + +static int item_depth(struct config_item * item) +{ + struct config_item * p = item; + int depth = 0; + do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p)); + return depth; +} + +static int item_path_length(struct config_item * item) +{ + struct config_item * p = item; + int length = 1; + do { + length += strlen(config_item_name(p)) + 1; + p = p->ci_parent; + } while (p && !configfs_is_root(p)); + return length; +} + +static void fill_item_path(struct config_item * item, char * buffer, int length) +{ + struct config_item * p; + + --length; + for (p = item; p && !configfs_is_root(p); p = p->ci_parent) { + int cur = strlen(config_item_name(p)); + + /* back up enough to print this bus id with '/' */ + length -= cur; + strncpy(buffer + length,config_item_name(p),cur); + *(buffer + --length) = '/'; + } +} + +static int create_link(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; + struct configfs_symlink *sl; + int ret; + + ret = -ENOENT; + if (!configfs_dirent_is_ready(target_sd)) + goto out; + ret = -ENOMEM; + sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); + if (sl) { + sl->sl_target = config_item_get(item); + spin_lock(&configfs_dirent_lock); + if (target_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + return -ENOENT; + } + list_add(&sl->sl_list, &target_sd->s_links); + spin_unlock(&configfs_dirent_lock); + ret = configfs_create_link(sl, parent_item->ci_dentry, + dentry); + if (ret) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + } + } + +out: + return ret; +} + + +static int get_target(const char *symname, struct path *path, + struct config_item **target) +{ + int ret; + + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); + if (!ret) { + if (path->dentry->d_sb == configfs_sb) { + *target = configfs_get_config_item(path->dentry); + if (!*target) { + ret = -ENOENT; + path_put(path); + } + } else { + ret = -EPERM; + path_put(path); + } + } + + return ret; +} + + +int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int ret; + struct path path; + struct configfs_dirent *sd; + struct config_item *parent_item; + struct config_item *target_item = NULL; + struct config_item_type *type; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (dentry->d_parent == configfs_sb->s_root) + goto out; + + sd = dentry->d_parent->d_fsdata; + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + ret = -ENOENT; + if (!configfs_dirent_is_ready(sd)) + goto out; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + ret = -EPERM; + if (!type || !type->ct_item_ops || + !type->ct_item_ops->allow_link) + goto out_put; + + ret = get_target(symname, &path, &target_item); + if (ret) + goto out_put; + + ret = type->ct_item_ops->allow_link(parent_item, target_item); + if (!ret) { + mutex_lock(&configfs_symlink_mutex); + ret = create_link(parent_item, target_item, 
dentry); + mutex_unlock(&configfs_symlink_mutex); + if (ret && type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + target_item); + } + + config_item_put(target_item); + path_put(&path); + +out_put: + config_item_put(parent_item); + +out: + return ret; +} + +int configfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_symlink *sl; + struct config_item *parent_item; + struct config_item_type *type; + int ret; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (!(sd->s_type & CONFIGFS_ITEM_LINK)) + goto out; + + BUG_ON(dentry->d_parent == configfs_sb->s_root); + + sl = sd->s_element; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dentry->d_parent); + dput(dentry); + configfs_put(sd); + + /* + * drop_link() must be called before + * list_del_init(&sl->sl_list), so that the order of + * drop_link(this, target) and drop_item(target) is preserved. + */ + if (type && type->ct_item_ops && + type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + sl->sl_target); + + spin_lock(&configfs_dirent_lock); + list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); + + /* Put reference from create_link() */ + config_item_put(sl->sl_target); + kfree(sl); + + config_item_put(parent_item); + + ret = 0; + +out: + return ret; +} + +static int configfs_get_target_path(struct config_item * item, struct config_item * target, + char *path) +{ + char * s; + int depth, size; + + depth = item_depth(item); + size = item_path_length(target) + depth * 3 - 1; + if (size > PATH_MAX) + return -ENAMETOOLONG; + + pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size); + + for (s = path; depth--; s += 3) + strcpy(s,"../"); + + fill_item_path(target, path, size); + pr_debug("%s: path = '%s'\n", __func__, path); + + return 0; +} + +static int configfs_getlink(struct dentry *dentry, char * path) +{ + struct config_item *item, *target_item; + int error = 0; + + item = configfs_get_config_item(dentry->d_parent); + if (!item) + return -EINVAL; + + target_item = configfs_get_config_item(dentry); + if (!target_item) { + config_item_put(item); + return -EINVAL; + } + + down_read(&configfs_rename_sem); + error = configfs_get_target_path(item, target_item, path); + up_read(&configfs_rename_sem); + + config_item_put(item); + config_item_put(target_item); + return error; + +} + +static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + + if (page) { + error = configfs_getlink(dentry, (char *)page); + if (!error) { + nd_set_link(nd, (char *)page); + return (void *)page; + } + } + + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + if (cookie) { + unsigned long page = (unsigned long)cookie; + free_page(page); + } +} + +const struct inode_operations configfs_symlink_inode_operations = { + .follow_link = configfs_follow_link, + .readlink = generic_readlink, + .put_link = configfs_put_link, + .setattr = configfs_setattr, +}; + diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig new file mode 100644 index 00000000..cd06466f --- /dev/null +++ b/fs/cramfs/Kconfig @@ -0,0 +1,19 @@ +config CRAMFS + tristate "Compressed ROM file system 
support (cramfs)" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for CramFs (Compressed ROM File + System). CramFs is designed to be a simple, small, and compressed + file system for ROM based embedded systems. CramFs is read-only, + limited to 256MB file systems (with 16MB files), and doesn't support + 16/32 bits uid/gid, hard links and timestamps. + + See and + for further information. + + To compile this as a module, choose M here: the module will be called + cramfs. Note that the root file system (the one containing the + directory /) cannot be compiled as a module. + + If unsure, say N. diff --git a/fs/cramfs/Makefile b/fs/cramfs/Makefile new file mode 100644 index 00000000..92ebb464 --- /dev/null +++ b/fs/cramfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux cramfs routines. +# + +obj-$(CONFIG_CRAMFS) += cramfs.o + +cramfs-objs := inode.o uncompress.o diff --git a/fs/cramfs/README b/fs/cramfs/README new file mode 100644 index 00000000..445d1c2d --- /dev/null +++ b/fs/cramfs/README @@ -0,0 +1,168 @@ +Notes on Filesystem Layout +-------------------------- + +These notes describe what mkcramfs generates. Kernel requirements are +a bit looser, e.g. it doesn't care if the items are +swapped around (though it does care that directory entries (inodes) in +a given directory are contiguous, as this is used by readdir). + +All data is currently in host-endian format; neither mkcramfs nor the +kernel ever do swabbing. (See section `Block Size' below.) + +: + + + + +: struct cramfs_super (see cramfs_fs.h). + +: + For each file: + struct cramfs_inode (see cramfs_fs.h). + Filename. Not generally null-terminated, but it is + null-padded to a multiple of 4 bytes. + +The order of inode traversal is described as "width-first" (not to be +confused with breadth-first); i.e. like depth-first but listing all of +a directory's entries before recursing down its subdirectories: the +same order as `ls -AUR' (but without the /^\..*:$/ directory header +lines); put another way, the same order as `find -type d -exec +ls -AU1 {} \;'. + +Beginning in 2.4.7, directory entries are sorted. This optimization +allows cramfs_lookup to return more quickly when a filename does not +exist, speeds up user-space directory sorts, etc. + +: + One for each file that's either a symlink or a + regular file of non-zero st_size. + +: + nblocks * + (where nblocks = (st_size - 1) / blksize + 1) + nblocks * + padding to multiple of 4 bytes + +The i'th for a file stores the byte offset of the +*end* of the i'th (i.e. one past the last byte, which is the +same as the start of the (i+1)'th if there is one). The first + immediately follows the last for the file. +s are each 32 bits long. + +The order of 's is a depth-first descent of the directory +tree, i.e. the same order as `find -size +0 \( -type f -o -type l \) +-print'. + + +: The i'th is the output of zlib's compress function +applied to the i'th blksize-sized chunk of the input data. +(For the last of the file, the input may of course be smaller.) +Each may be a different size. (See above.) +s are merely byte-aligned, not generally u32-aligned. + + +Holes +----- + +This kernel supports cramfs holes (i.e. [efficient representation of] +blocks in uncompressed data consisting entirely of NUL bytes), but by +default mkcramfs doesn't test for & create holes, since cramfs in +kernels up to at least 2.3.39 didn't support holes. Run mkcramfs +with -z if you want it to create files that can have holes in them. 
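+
+As a rough illustration of the layout described above (this helper is
+not part of mkcramfs or the kernel; the name block_extent() and its
+arguments are invented for the example, and the kernel's u32 type is
+assumed), the extent of the i'th compressed block can be recovered
+from the block pointer table like this, since each pointer stores the
+end offset of its block and the first block starts immediately after
+the last pointer:
+
+	/*
+	 * blkptr[]     - the file's nblocks 32-bit end offsets
+	 * blkptr_start - byte offset of blkptr[0] within the image
+	 * A zero-length extent (*end == *start) is a hole of NUL bytes.
+	 */
+	static void block_extent(const u32 *blkptr, u32 blkptr_start,
+				 u32 nblocks, u32 i, u32 *start, u32 *end)
+	{
+		*start = i ? blkptr[i - 1]
+			   : blkptr_start + nblocks * sizeof(u32);
+		*end = blkptr[i];
+	}
+
+This is essentially the start_offset/compr_len computation that
+cramfs_readpage() performs for each page.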
+ + +Tools +----- + +The cramfs user-space tools, including mkcramfs and cramfsck, are +located at . + + +Future Development +================== + +Block Size +---------- + +(Block size in cramfs refers to the size of input data that is +compressed at a time. It's intended to be somewhere around +PAGE_CACHE_SIZE for cramfs_readpage's convenience.) + +The superblock ought to indicate the block size that the fs was +written for, since comments in indicate that +PAGE_CACHE_SIZE may grow in future (if I interpret the comment +correctly). + +Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that +for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in +turn is defined as PAGE_SIZE (which can be as large as 32KB on arm). +This discrepancy is a bug, though it's not clear which should be +changed. + +One option is to change mkcramfs to take its PAGE_CACHE_SIZE from +. Personally I don't like this option, but it does +require the least amount of change: just change `#define +PAGE_CACHE_SIZE (4096)' to `#include '. The disadvantage +is that the generated cramfs cannot always be shared between different +kernels, not even necessarily kernels of the same architecture if +PAGE_CACHE_SIZE is subject to change between kernel versions +(currently possible with arm and ia64). + +The remaining options try to make cramfs more sharable. + +One part of that is addressing endianness. The two options here are +`always use little-endian' (like ext2fs) or `writer chooses +endianness; kernel adapts at runtime'. Little-endian wins because of +code simplicity and little CPU overhead even on big-endian machines. + +The cost of swabbing is changing the code to use the le32_to_cpu +etc. macros as used by ext2fs. We don't need to swab the compressed +data, only the superblock, inodes and block pointers. + + +The other part of making cramfs more sharable is choosing a block +size. The options are: + + 1. Always 4096 bytes. + + 2. Writer chooses blocksize; kernel adapts but rejects blocksize > + PAGE_CACHE_SIZE. + + 3. Writer chooses blocksize; kernel adapts even to blocksize > + PAGE_CACHE_SIZE. + +It's easy enough to change the kernel to use a smaller value than +PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks. + +The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE +value don't get as good compression as they can. + +The cost of option 2 relative to option 1 is that the code uses +variables instead of #define'd constants. The gain is that people +with kernels having larger PAGE_CACHE_SIZE can make use of that if +they don't mind their cramfs being inaccessible to kernels with +smaller PAGE_CACHE_SIZE values. + +Option 3 is easy to implement if we don't mind being CPU-inefficient: +e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which +must be no larger than 32KB) and discard what it doesn't need. +Getting readpage to read into all the covered pages is harder. + +The main advantage of option 3 over 1, 2, is better compression. The +cost is greater complexity. Probably not worth it, but I hope someone +will disagree. (If it is implemented, then I'll re-use that code in +e2compr.) + + +Another cost of 2 and 3 over 1 is making mkcramfs use a different +block size, but that just means adding and parsing a -b option. + + +Inode Size +---------- + +Given that cramfs will probably be used for CDs etc. as well as just +silicon ROMs, it might make sense to expand the inode a little from +its current 12 bytes. 
Inodes other than the root inode are followed +by filename, so the expansion doesn't even have to be a multiple of 4 +bytes. diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c new file mode 100644 index 00000000..739fb59b --- /dev/null +++ b/fs/cramfs/inode.c @@ -0,0 +1,601 @@ +/* + * Compressed rom filesystem for Linux. + * + * Copyright (C) 1999 Linus Torvalds. + * + * This file is released under the GPL. + */ + +/* + * These are the VFS interfaces to the compressed rom filesystem. + * The actual compression is based on zlib, see the other files. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const struct super_operations cramfs_ops; +static const struct inode_operations cramfs_dir_inode_operations; +static const struct file_operations cramfs_directory_operations; +static const struct address_space_operations cramfs_aops; + +static DEFINE_MUTEX(read_mutex); + + +/* These macros may change in future, to provide better st_ino semantics. */ +#define OFFSET(x) ((x)->i_ino) + +static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) +{ + if (!cino->offset) + return offset + 1; + if (!cino->size) + return offset + 1; + + /* + * The file mode test fixes buggy mkcramfs implementations where + * cramfs_inode->offset is set to a non zero value for entries + * which did not contain data, like devices node and fifos. + */ + switch (cino->mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + return cino->offset << 2; + default: + break; + } + return offset + 1; +} + +static struct inode *get_cramfs_inode(struct super_block *sb, + const struct cramfs_inode *cramfs_inode, unsigned int offset) +{ + struct inode *inode; + static struct timespec zerotime; + + inode = iget_locked(sb, cramino(cramfs_inode, offset)); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + switch (cramfs_inode->mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + break; + case S_IFDIR: + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + break; + default: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); + } + + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_gid = cramfs_inode->gid; + + /* if the lower 2 bits are zero, the inode contains data */ + if (!(inode->i_ino & 3)) { + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + } + + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + + unlock_new_inode(inode); + + return inode; +} + +/* + * We have our own block cache: don't fill up the buffer cache + * with the rom-image, because the way the filesystem is set + * up the accesses should be fairly regular and cached in the + * page cache and dentry tree anyway.. + * + * This also acts as a way to guarantee contiguous areas of up to + * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to + * worry about end-of-buffer issues even when decompressing a full + * page cache. 
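+ *
+ * (With READ_BUFFERS = 2 and BLKS_PER_BUF_SHIFT = 2, defined below,
+ * each of the two buffers spans four page-cache pages, leaving room
+ * for a block that grew slightly under compression.)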
+ */ +#define READ_BUFFERS (2) +/* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ +#define NEXT_BUFFER(_ix) ((_ix) ^ 1) + +/* + * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed" + * data that takes up more space than the original and with unlucky + * alignment. + */ +#define BLKS_PER_BUF_SHIFT (2) +#define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) +#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_CACHE_SIZE) + +static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; +static unsigned buffer_blocknr[READ_BUFFERS]; +static struct super_block * buffer_dev[READ_BUFFERS]; +static int next_buffer; + +/* + * Returns a pointer to a buffer containing at least LEN bytes of + * filesystem starting at byte offset OFFSET into the filesystem. + */ +static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) +{ + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *pages[BLKS_PER_BUF]; + unsigned i, blocknr, buffer; + unsigned long devsize; + char *data; + + if (!len) + return NULL; + blocknr = offset >> PAGE_CACHE_SHIFT; + offset &= PAGE_CACHE_SIZE - 1; + + /* Check if an existing buffer already has the data.. */ + for (i = 0; i < READ_BUFFERS; i++) { + unsigned int blk_offset; + + if (buffer_dev[i] != sb) + continue; + if (blocknr < buffer_blocknr[i]) + continue; + blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT; + blk_offset += offset; + if (blk_offset + len > BUFFER_SIZE) + continue; + return read_buffers[i] + blk_offset; + } + + devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT; + + /* Ok, read in BLKS_PER_BUF pages completely first. */ + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = NULL; + + if (blocknr + i < devsize) { + page = read_mapping_page_async(mapping, blocknr + i, + NULL); + /* synchronous error? */ + if (IS_ERR(page)) + page = NULL; + } + pages[i] = page; + } + + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = pages[i]; + if (page) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + /* asynchronous error */ + page_cache_release(page); + pages[i] = NULL; + } + } + } + + buffer = next_buffer; + next_buffer = NEXT_BUFFER(buffer); + buffer_blocknr[buffer] = blocknr; + buffer_dev[buffer] = sb; + + data = read_buffers[buffer]; + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = pages[i]; + if (page) { + memcpy(data, kmap(page), PAGE_CACHE_SIZE); + kunmap(page); + page_cache_release(page); + } else + memset(data, 0, PAGE_CACHE_SIZE); + data += PAGE_CACHE_SIZE; + } + return read_buffers[buffer] + offset; +} + +static void cramfs_put_super(struct super_block *sb) +{ + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int cramfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static int cramfs_fill_super(struct super_block *sb, void *data, int silent) +{ + int i; + struct cramfs_super super; + unsigned long root_offset; + struct cramfs_sb_info *sbi; + struct inode *root; + + sb->s_flags |= MS_RDONLY; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Invalidate the read buffers on mount: think disk change.. 
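+ * (Setting buffer_blocknr[] to -1 guarantees that cramfs_read() will
+ * not match a stale block cached before a possible media change.)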
*/ + mutex_lock(&read_mutex); + for (i = 0; i < READ_BUFFERS; i++) + buffer_blocknr[i] = -1; + + /* Read the first block and get the superblock from it */ + memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super)); + mutex_unlock(&read_mutex); + + /* Do sanity checks on the superblock */ + if (super.magic != CRAMFS_MAGIC) { + /* check for wrong endianess */ + if (super.magic == CRAMFS_MAGIC_WEND) { + if (!silent) + printk(KERN_ERR "cramfs: wrong endianess\n"); + goto out; + } + + /* check at 512 byte offset */ + mutex_lock(&read_mutex); + memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super)); + mutex_unlock(&read_mutex); + if (super.magic != CRAMFS_MAGIC) { + if (super.magic == CRAMFS_MAGIC_WEND && !silent) + printk(KERN_ERR "cramfs: wrong endianess\n"); + else if (!silent) + printk(KERN_ERR "cramfs: wrong magic\n"); + goto out; + } + } + + /* get feature flags first */ + if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { + printk(KERN_ERR "cramfs: unsupported filesystem features\n"); + goto out; + } + + /* Check that the root inode is in a sane state */ + if (!S_ISDIR(super.root.mode)) { + printk(KERN_ERR "cramfs: root is not a directory\n"); + goto out; + } + /* correct strange, hard-coded permissions of mkcramfs */ + super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + + root_offset = super.root.offset << 2; + if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { + sbi->size=super.size; + sbi->blocks=super.fsid.blocks; + sbi->files=super.fsid.files; + } else { + sbi->size=1<<28; + sbi->blocks=0; + sbi->files=0; + } + sbi->magic=super.magic; + sbi->flags=super.flags; + if (root_offset == 0) + printk(KERN_INFO "cramfs: empty filesystem"); + else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && + ((root_offset != sizeof(struct cramfs_super)) && + (root_offset != 512 + sizeof(struct cramfs_super)))) + { + printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); + goto out; + } + + /* Set it all up.. */ + sb->s_op = &cramfs_ops; + root = get_cramfs_inode(sb, &super.root, 0); + if (IS_ERR(root)) + goto out; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + goto out; + } + return 0; +out: + kfree(sbi); + sb->s_fs_info = NULL; + return -EINVAL; +} + +static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = CRAMFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_blocks = CRAMFS_SB(sb)->blocks; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = CRAMFS_SB(sb)->files; + buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = CRAMFS_MAXPATHLEN; + return 0; +} + +/* + * Read a cramfs directory entry. + */ +static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + char *buf; + unsigned int offset; + int copied; + + /* Offset within the thing. 
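+ * (a byte offset into the directory's packed entries; it doubles as
+ * the cookie handed to filldir() below)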
*/ + offset = filp->f_pos; + if (offset >= inode->i_size) + return 0; + /* Directory entries are always 4-byte aligned */ + if (offset & 3) + return -EINVAL; + + buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + copied = 0; + while (offset < inode->i_size) { + struct cramfs_inode *de; + unsigned long nextoffset; + char *name; + ino_t ino; + mode_t mode; + int namelen, error; + + mutex_lock(&read_mutex); + de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); + name = (char *)(de+1); + + /* + * Namelengths on disk are shifted by two + * and the name padded out to 4-byte boundaries + * with zeroes. + */ + namelen = de->namelen << 2; + memcpy(buf, name, namelen); + ino = cramino(de, OFFSET(inode) + offset); + mode = de->mode; + mutex_unlock(&read_mutex); + nextoffset = offset + sizeof(*de) + namelen; + for (;;) { + if (!namelen) { + kfree(buf); + return -EIO; + } + if (buf[namelen-1]) + break; + namelen--; + } + error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); + if (error) + break; + + offset = nextoffset; + filp->f_pos = offset; + copied++; + } + kfree(buf); + return 0; +} + +/* + * Lookup and fill in the inode data.. + */ +static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + unsigned int offset = 0; + struct inode *inode = NULL; + int sorted; + + mutex_lock(&read_mutex); + sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS; + while (offset < dir->i_size) { + struct cramfs_inode *de; + char *name; + int namelen, retval; + int dir_off = OFFSET(dir) + offset; + + de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); + name = (char *)(de+1); + + /* Try to take advantage of sorted directories */ + if (sorted && (dentry->d_name.name[0] < name[0])) + break; + + namelen = de->namelen << 2; + offset += sizeof(*de) + namelen; + + /* Quick check that the name is roughly the right length */ + if (((dentry->d_name.len + 3) & ~3) != namelen) + continue; + + for (;;) { + if (!namelen) { + inode = ERR_PTR(-EIO); + goto out; + } + if (name[namelen-1]) + break; + namelen--; + } + if (namelen != dentry->d_name.len) + continue; + retval = memcmp(dentry->d_name.name, name, namelen); + if (retval > 0) + continue; + if (!retval) { + inode = get_cramfs_inode(dir->i_sb, de, dir_off); + break; + } + /* else (retval < 0) */ + if (sorted) + break; + } +out: + mutex_unlock(&read_mutex); + if (IS_ERR(inode)) + return ERR_CAST(inode); + d_add(dentry, inode); + return NULL; +} + +static int cramfs_readpage(struct file *file, struct page * page) +{ + struct inode *inode = page->mapping->host; + u32 maxblock; + int bytes_filled; + void *pgdata; + + maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + bytes_filled = 0; + pgdata = kmap(page); + + if (page->index < maxblock) { + struct super_block *sb = inode->i_sb; + u32 blkptr_offset = OFFSET(inode) + page->index*4; + u32 start_offset, compr_len; + + start_offset = OFFSET(inode) + maxblock*4; + mutex_lock(&read_mutex); + if (page->index) + start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, + 4); + compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - + start_offset); + mutex_unlock(&read_mutex); + + if (compr_len == 0) + ; /* hole */ + else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { + pr_err("cramfs: bad compressed blocksize %u\n", + compr_len); + goto err; + } else { + mutex_lock(&read_mutex); + bytes_filled = cramfs_uncompress_block(pgdata, + PAGE_CACHE_SIZE, + cramfs_read(sb, start_offset, 
compr_len), + compr_len); + mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; + } + } + + memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +err: + kunmap(page); + ClearPageUptodate(page); + SetPageError(page); + unlock_page(page); + return 0; +} + +static const struct address_space_operations cramfs_aops = { + .readpage = cramfs_readpage +}; + +/* + * Our operations: + */ + +/* + * A directory can only readdir + */ +static const struct file_operations cramfs_directory_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = cramfs_readdir, +}; + +static const struct inode_operations cramfs_dir_inode_operations = { + .lookup = cramfs_lookup, +}; + +static const struct super_operations cramfs_ops = { + .put_super = cramfs_put_super, + .remount_fs = cramfs_remount, + .statfs = cramfs_statfs, +}; + +static struct dentry *cramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); +} + +static struct file_system_type cramfs_fs_type = { + .owner = THIS_MODULE, + .name = "cramfs", + .mount = cramfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_cramfs_fs(void) +{ + int rv; + + rv = cramfs_uncompress_init(); + if (rv < 0) + return rv; + rv = register_filesystem(&cramfs_fs_type); + if (rv < 0) + cramfs_uncompress_exit(); + return rv; +} + +static void __exit exit_cramfs_fs(void) +{ + cramfs_uncompress_exit(); + unregister_filesystem(&cramfs_fs_type); +} + +module_init(init_cramfs_fs) +module_exit(exit_cramfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c new file mode 100644 index 00000000..02332980 --- /dev/null +++ b/fs/cramfs/uncompress.c @@ -0,0 +1,77 @@ +/* + * uncompress.c + * + * (C) Copyright 1999 Linus Torvalds + * + * cramfs interfaces to the uncompression library. There's really just + * three entrypoints: + * + * - cramfs_uncompress_init() - called to initialize the thing. + * - cramfs_uncompress_exit() - tell me when you're done + * - cramfs_uncompress_block() - uncompress a block. + * + * NOTE NOTE NOTE! The uncompression is entirely single-threaded. We + * only have one stream, and we'll initialize it only once even if it + * then is used by multiple filesystems. + */ + +#include +#include +#include +#include +#include + +static z_stream stream; +static int initialized; + +/* Returns length of decompressed data. 
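+ * (or -EIO if zlib_inflate() fails to reach Z_STREAM_END).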
*/ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) +{ + int err; + + stream.next_in = src; + stream.avail_in = srclen; + + stream.next_out = dst; + stream.avail_out = dstlen; + + err = zlib_inflateReset(&stream); + if (err != Z_OK) { + printk("zlib_inflateReset error %d\n", err); + zlib_inflateEnd(&stream); + zlib_inflateInit(&stream); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto err; + return stream.total_out; + +err: + printk("Error %d while decompressing!\n", err); + printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); + return -EIO; +} + +int cramfs_uncompress_init(void) +{ + if (!initialized++) { + stream.workspace = vmalloc(zlib_inflate_workspacesize()); + if ( !stream.workspace ) { + initialized = 0; + return -ENOMEM; + } + stream.next_in = NULL; + stream.avail_in = 0; + zlib_inflateInit(&stream); + } + return 0; +} + +void cramfs_uncompress_exit(void) +{ + if (!--initialized) { + zlib_inflateEnd(&stream); + vfree(stream.workspace); + } +} diff --git a/fs/dcache.c b/fs/dcache.c new file mode 100644 index 00000000..0b51cfc9 --- /dev/null +++ b/fs/dcache.c @@ -0,0 +1,3124 @@ +/* + * fs/dcache.c + * + * Complete reimplementation + * (C) 1997 Thomas Schoebel-Theuer, + * with heavy changes by Linus Torvalds + */ + +/* + * Notes on the allocation strategy: + * + * The dcache is a master of the icache - whenever a dcache entry + * exists, the inode will always exist. "iput()" is done either when + * the dcache entry is deleted or garbage collected. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Usage: + * dcache->d_inode->i_lock protects: + * - i_dentry, d_alias, d_inode of aliases + * dcache_hash_bucket lock protects: + * - the dcache hash table + * s_anon bl list spinlock protects: + * - the s_anon list (see __d_drop) + * dcache_lru_lock protects: + * - the dcache lru lists and counters + * d_lock protects: + * - d_flags + * - d_name + * - d_lru + * - d_count + * - d_unhashed() + * - d_parent and d_subdirs + * - childrens' d_child and d_parent + * - d_alias, d_inode + * + * Ordering: + * dentry->d_inode->i_lock + * dentry->d_lock + * dcache_lru_lock + * dcache_hash_bucket lock + * s_anon lock + * + * If there is an ancestor relationship: + * dentry->d_parent->...->d_parent->d_lock + * ... + * dentry->d_parent->d_lock + * dentry->d_lock + * + * If no ancestor relationship: + * if (dentry1 < dentry2) + * dentry1->d_lock + * dentry2->d_lock + */ +int sysctl_vfs_cache_pressure __read_mostly = 100; +EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); +__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); + +EXPORT_SYMBOL(rename_lock); + +static struct kmem_cache *dentry_cache __read_mostly; + +/* + * This is the single most critical data structure when it comes + * to the dcache: the hashtable for lookups. Somebody should try + * to make this good - I've just made it work. + * + * This hash-function tries to avoid losing too many bits of hash + * information, yet avoid using a prime hash-size or similar. 
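+ *
+ * (d_hash() below also folds the parent dentry pointer into the name
+ * hash, so identical names under different directories spread across
+ * different buckets.)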
+ */ +#define D_HASHBITS d_hash_shift +#define D_HASHMASK d_hash_mask + +static unsigned int d_hash_mask __read_mostly; +static unsigned int d_hash_shift __read_mostly; + +static struct hlist_bl_head *dentry_hashtable __read_mostly; + +static inline struct hlist_bl_head *d_hash(struct dentry *parent, + unsigned long hash) +{ + hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; + hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); + return dentry_hashtable + (hash & D_HASHMASK); +} + +/* Statistics gathering. */ +struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; + +static DEFINE_PER_CPU(unsigned int, nr_dentry); + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +static int get_nr_dentry(void) +{ + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry, i); + return sum < 0 ? 0 : sum; +} + +int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + dentry_stat.nr_dentry = get_nr_dentry(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + +static void __d_free(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + + WARN_ON(!list_empty(&dentry->d_alias)); + if (dname_external(dentry)) + kfree(dentry->d_name.name); + kmem_cache_free(dentry_cache, dentry); +} + +/* + * no locks, please. + */ +static void d_free(struct dentry *dentry) +{ + BUG_ON(dentry->d_count); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); +} + +/** + * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * @dentry: the target dentry + * After this call, in-progress rcu-walk path lookup will fail. This + * should be called after unhashing, and after changing d_inode (if + * the dentry has not already been unhashed). + */ +static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +{ + assert_spin_locked(&dentry->d_lock); + /* Go through a barrier */ + write_seqcount_barrier(&dentry->d_seq); +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. Dentry has no refcount + * and is unhashed. + */ +static void dentry_iput(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + if (inode) { + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } else { + spin_unlock(&dentry->d_lock); + } +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. dentry remains in-use. 
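+ * Unlike dentry_iput() above, the caller keeps its reference to the
+ * dentry; only the inode association is severed.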
+ */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + +/* + * dentry_lru_(add|del|move_tail) must be called with d_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + if (list_empty(&dentry->d_lru)) { + spin_lock(&dcache_lru_lock); + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; + spin_unlock(&dcache_lru_lock); + } +} + +static void __dentry_lru_del(struct dentry *dentry) +{ + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; +} + +static void dentry_lru_del(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + +static void dentry_lru_move_tail(struct dentry *dentry) +{ + spin_lock(&dcache_lru_lock); + if (list_empty(&dentry->d_lru)) { + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; + } else { + list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + } + spin_unlock(&dcache_lru_lock); +} + +/** + * d_kill - kill dentry and return parent + * @dentry: dentry to kill + * @parent: parent dentry + * + * The dentry must already be unhashed and removed from the LRU. + * + * If this is the root of the dentry tree, return NULL. + * + * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by + * d_kill. + */ +static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) + __releases(dentry->d_lock) + __releases(parent->d_lock) + __releases(dentry->d_inode->i_lock) +{ + list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + d_free(dentry); + return parent; +} + +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock. 
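+ * d_drop() below is the convenience wrapper that takes and releases
+ * that lock around __d_drop().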
+ */ +void __d_drop(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + + dentry_rcuwalk_barrier(dentry); + } +} +EXPORT_SYMBOL(__d_drop); + +void d_drop(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_drop); + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) + __releases(dentry->d_lock) +{ + struct inode *inode; + struct dentry *parent; + + inode = dentry->d_inode; + if (inode && !spin_trylock(&inode->i_lock)) { +relock: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ + } + if (IS_ROOT(dentry)) + parent = NULL; + else + parent = dentry->d_parent; + if (parent && !spin_trylock(&parent->d_lock)) { + if (inode) + spin_unlock(&inode->i_lock); + goto relock; + } + + if (ref) + dentry->d_count--; + /* if dentry was on the d_lru list delete it from there */ + dentry_lru_del(dentry); + /* if it was on the hash then remove it */ + __d_drop(dentry); + return d_kill(dentry, parent); +} + +/* + * This is dput + * + * This is complicated by the fact that we do not want to put + * dentries that are no longer on any hash chain on the unused + * list: we'd much rather just get rid of them immediately. + * + * However, that implies that we have to traverse the dentry + * tree upwards to the parents which might _also_ now be + * scheduled for deletion (it may have been only waiting for + * its last child to go away). + * + * This tail recursion is done by hand as we don't want to depend + * on the compiler to always get this right (gcc generally doesn't). + * Real recursion would eat up our stack space. + */ + +/* + * dput - release a dentry + * @dentry: dentry to release + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and + * releasing its resources. If the parent dentries were scheduled for release + * they too may now get deleted. + */ +void dput(struct dentry *dentry) +{ + if (!dentry) + return; + +repeat: + if (dentry->d_count == 1) + might_sleep(); + spin_lock(&dentry->d_lock); + BUG_ON(!dentry->d_count); + if (dentry->d_count > 1) { + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; + } + + if (dentry->d_flags & DCACHE_OP_DELETE) { + if (dentry->d_op->d_delete(dentry)) + goto kill_it; + } + + /* Unreachable? Get rid of it */ + if (d_unhashed(dentry)) + goto kill_it; + + /* Otherwise leave it cached and ensure it's on the LRU */ + dentry->d_flags |= DCACHE_REFERENCED; + dentry_lru_add(dentry); + + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; + +kill_it: + dentry = dentry_kill(dentry, 1); + if (dentry) + goto repeat; +} +EXPORT_SYMBOL(dput); + +/** + * d_invalidate - invalidate a dentry + * @dentry: dentry to invalidate + * + * Try to invalidate the dentry if it turns out to be + * possible. If there are other dentries that can be + * reached through this one we can't delete it and we + * return -EBUSY. 
On success we return 0. + * + * no dcache lock. + */ + +int d_invalidate(struct dentry * dentry) +{ + /* + * If it's already been dropped, return OK. + */ + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + return 0; + } + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. + */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + shrink_dcache_parent(dentry); + spin_lock(&dentry->d_lock); + } + + /* + * Somebody else still using it? + * + * If it's a directory, we can't drop it + * for fear of somebody re-populating it + * with children (even though dropping it + * would make it unreachable from the root, + * we might still populate it if it was a + * working directory or similar). + */ + if (dentry->d_count > 1) { + if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + spin_unlock(&dentry->d_lock); + return -EBUSY; + } + } + + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + return 0; +} +EXPORT_SYMBOL(d_invalidate); + +/* This must be called with d_lock held */ +static inline void __dget_dlock(struct dentry *dentry) +{ + dentry->d_count++; +} + +static inline void __dget(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); +} + +struct dentry *dget_parent(struct dentry *dentry) +{ + struct dentry *ret; + +repeat: + /* + * Don't need rcu_dereference because we re-check it was correct under + * the lock. + */ + rcu_read_lock(); + ret = dentry->d_parent; + if (!ret) { + rcu_read_unlock(); + goto out; + } + spin_lock(&ret->d_lock); + if (unlikely(ret != dentry->d_parent)) { + spin_unlock(&ret->d_lock); + rcu_read_unlock(); + goto repeat; + } + rcu_read_unlock(); + BUG_ON(!ret->d_count); + ret->d_count++; + spin_unlock(&ret->d_lock); +out: + return ret; +} +EXPORT_SYMBOL(dget_parent); + +/** + * d_find_alias - grab a hashed alias of inode + * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. + * + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem. + * + * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer + * any other hashed alias over that one unless @want_discon is set, + * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 
+ */ +static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +{ + struct dentry *alias, *discon_alias; + +again: + discon_alias = NULL; + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) { + discon_alias = alias; + } else if (!want_discon) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + } + spin_unlock(&alias->d_lock); + } + if (discon_alias) { + alias = discon_alias; + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + } + spin_unlock(&alias->d_lock); + goto again; + } + return NULL; +} + +struct dentry *d_find_alias(struct inode *inode) +{ + struct dentry *de = NULL; + + if (!list_empty(&inode->i_dentry)) { + spin_lock(&inode->i_lock); + de = __d_find_alias(inode, 0); + spin_unlock(&inode->i_lock); + } + return de; +} +EXPORT_SYMBOL(d_find_alias); + +/* + * Try to kill dentries associated with this inode. + * WARNING: you must own a reference to inode. + */ +void d_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; +restart: + spin_lock(&inode->i_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); + if (!dentry->d_count) { + __dget_dlock(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + dput(dentry); + goto restart; + } + spin_unlock(&dentry->d_lock); + } + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_prune_aliases); + +/* + * Try to throw away a dentry - free the inode, dput the parent. + * Requires dentry->d_lock is held, and dentry->d_count == 0. + * Releases dentry->d_lock. + * + * This may fail if locks cannot be acquired no problem, just try again. + */ +static void try_prune_one_dentry(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct dentry *parent; + + parent = dentry_kill(dentry, 0); + /* + * If dentry_kill returns NULL, we have nothing more to do. + * if it returns the same dentry, trylocks failed. In either + * case, just loop again. + * + * Otherwise, we need to prune ancestors too. This is necessary + * to prevent quadratic behavior of shrink_dcache_parent(), but + * is also expected to be beneficial in reducing dentry cache + * fragmentation. + */ + if (!parent) + return; + if (parent == dentry) + return; + + /* Prune ancestors. */ + dentry = parent; + while (dentry) { + spin_lock(&dentry->d_lock); + if (dentry->d_count > 1) { + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; + } + dentry = dentry_kill(dentry, 1); + } +} + +static void shrink_dentry_list(struct list_head *list) +{ + struct dentry *dentry; + + rcu_read_lock(); + for (;;) { + dentry = list_entry_rcu(list->prev, struct dentry, d_lru); + if (&dentry->d_lru == list) + break; /* empty */ + spin_lock(&dentry->d_lock); + if (dentry != list_entry(list->prev, struct dentry, d_lru)) { + spin_unlock(&dentry->d_lock); + continue; + } + + /* + * We found an inuse dentry which was not removed from + * the LRU because of laziness during lookup. Do not free + * it - just keep it off the LRU list. 
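+ * (dentry_lru_del() below takes it off the LRU without freeing it.)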
+ */ + if (dentry->d_count) { + dentry_lru_del(dentry); + spin_unlock(&dentry->d_lock); + continue; + } + + rcu_read_unlock(); + + try_prune_one_dentry(dentry); + + rcu_read_lock(); + } + rcu_read_unlock(); +} + +/** + * __shrink_dcache_sb - shrink the dentry LRU on a given superblock + * @sb: superblock to shrink dentry LRU. + * @count: number of entries to prune + * @flags: flags to control the dentry processing + * + * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + */ +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +{ + /* called from prune_dcache() and shrink_dcache_parent() */ + struct dentry *dentry; + LIST_HEAD(referenced); + LIST_HEAD(tmp); + int cnt = *count; + +relock: + spin_lock(&dcache_lru_lock); + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + if (!spin_trylock(&dentry->d_lock)) { + spin_unlock(&dcache_lru_lock); + cpu_relax(); + goto relock; + } + + /* + * If we are honouring the DCACHE_REFERENCED flag and the + * dentry has this flag set, don't free it. Clear the flag + * and put it back on the LRU. + */ + if (flags & DCACHE_REFERENCED && + dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + } else { + list_move_tail(&dentry->d_lru, &tmp); + dentry->d_flags |= DCACHE_SHRINK_LIST; + spin_unlock(&dentry->d_lock); + if (!--cnt) + break; + } + cond_resched_lock(&dcache_lru_lock); + } + if (!list_empty(&referenced)) + list_splice(&referenced, &sb->s_dentry_lru); + spin_unlock(&dcache_lru_lock); + + shrink_dentry_list(&tmp); + + *count = cnt; +} + +/** + * prune_dcache - shrink the dcache + * @count: number of entries to try to free + * + * Shrink the dcache. This is done when we need more memory, or simply when we + * need to unmount something (at which point we need to unuse all dentries). + * + * This function may fail to free any resources if all the dentries are in use. + */ +static void prune_dcache(int count) +{ + struct super_block *sb, *p = NULL; + int w_count; + int unused = dentry_stat.nr_unused; + int prune_ratio; + int pruned; + + if (unused == 0 || count == 0) + return; + if (count >= unused) + prune_ratio = 1; + else + prune_ratio = unused / count; + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_nr_dentry_unused == 0) + continue; + sb->s_count++; + /* Now, we reclaim unused dentrins with fairness. + * We reclaim them same percentage from each superblock. + * We calculate number of dentries to scan on this sb + * as follows, but the implementation is arranged to avoid + * overflows: + * number of dentries to scan on this sb = + * count * (number of dentries on this sb / + * number of dentries in the machine) + */ + spin_unlock(&sb_lock); + if (prune_ratio != 1) + w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; + else + w_count = sb->s_nr_dentry_unused; + pruned = w_count; + /* + * We need to be sure this filesystem isn't being unmounted, + * otherwise we could race with generic_shutdown_super(), and + * end up holding a reference to an inode while the filesystem + * is unmounted. So we try to get s_umount, and make sure + * s_root isn't NULL. 
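+ * (generic_shutdown_super() clears s_root while holding s_umount for
+ * write, so a non-NULL s_root seen under the read lock means the
+ * dentry tree has not been torn down yet.)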
+ */ + if (down_read_trylock(&sb->s_umount)) { + if ((sb->s_root != NULL) && + (!list_empty(&sb->s_dentry_lru))) { + __shrink_dcache_sb(sb, &w_count, + DCACHE_REFERENCED); + pruned -= w_count; + } + up_read(&sb->s_umount); + } + spin_lock(&sb_lock); + if (p) + __put_super(p); + count -= pruned; + p = sb; + /* more work left to do? */ + if (count <= 0) + break; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * shrink_dcache_sb - shrink dcache for a superblock + * @sb: superblock + * + * Shrink the dcache for the specified super block. This is used to free + * the dcache before unmounting a file system. + */ +void shrink_dcache_sb(struct super_block *sb) +{ + LIST_HEAD(tmp); + + spin_lock(&dcache_lru_lock); + while (!list_empty(&sb->s_dentry_lru)) { + list_splice_init(&sb->s_dentry_lru, &tmp); + spin_unlock(&dcache_lru_lock); + shrink_dentry_list(&tmp); + spin_lock(&dcache_lru_lock); + } + spin_unlock(&dcache_lru_lock); +} +EXPORT_SYMBOL(shrink_dcache_sb); + +/* + * destroy a single subtree of dentries for unmount + * - see the comments on shrink_dcache_for_umount() for a description of the + * locking + */ +static void shrink_dcache_for_umount_subtree(struct dentry *dentry) +{ + struct dentry *parent; + unsigned detached = 0; + + BUG_ON(!IS_ROOT(dentry)); + + /* detach this root from the system */ + spin_lock(&dentry->d_lock); + dentry_lru_del(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + + for (;;) { + /* descend to the first leaf in the current subtree */ + while (!list_empty(&dentry->d_subdirs)) { + struct dentry *loop; + + /* this is a branch with children - detach all of them + * from the system in one go */ + spin_lock(&dentry->d_lock); + list_for_each_entry(loop, &dentry->d_subdirs, + d_u.d_child) { + spin_lock_nested(&loop->d_lock, + DENTRY_D_LOCK_NESTED); + dentry_lru_del(loop); + __d_drop(loop); + spin_unlock(&loop->d_lock); + } + spin_unlock(&dentry->d_lock); + + /* move to the first child */ + dentry = list_entry(dentry->d_subdirs.next, + struct dentry, d_u.d_child); + } + + /* consume the dentries from this leaf up through its parents + * until we find one with children or run out altogether */ + do { + struct inode *inode; + + if (dentry->d_count != 0) { + printk(KERN_ERR + "BUG: Dentry %p{i=%lx,n=%s}" + " still in use (%d)" + " [unmount of %s %s]\n", + dentry, + dentry->d_inode ? 
+ dentry->d_inode->i_ino : 0UL, + dentry->d_name.name, + dentry->d_count, + dentry->d_sb->s_type->name, + dentry->d_sb->s_id); + BUG(); + } + + if (IS_ROOT(dentry)) { + parent = NULL; + list_del(&dentry->d_u.d_child); + } else { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + parent->d_count--; + list_del(&dentry->d_u.d_child); + spin_unlock(&parent->d_lock); + } + + detached++; + + inode = dentry->d_inode; + if (inode) { + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } + + d_free(dentry); + + /* finished when we fall off the top of the tree, + * otherwise we ascend to the parent and move to the + * next sibling if there is one */ + if (!parent) + return; + dentry = parent; + } while (list_empty(&dentry->d_subdirs)); + + dentry = list_entry(dentry->d_subdirs.next, + struct dentry, d_u.d_child); + } +} + +/* + * destroy the dentries attached to a superblock on unmounting + * - we don't need to use dentry->d_lock because: + * - the superblock is detached from all mountings and open files, so the + * dentry trees will not be rearranged by the VFS + * - s_umount is write-locked, so the memory pressure shrinker will ignore + * any dentries belonging to this superblock that it comes across + * - the filesystem itself is no longer permitted to rearrange the dentries + * in this superblock + */ +void shrink_dcache_for_umount(struct super_block *sb) +{ + struct dentry *dentry; + + if (down_read_trylock(&sb->s_umount)) + BUG(); + + dentry = sb->s_root; + sb->s_root = NULL; + spin_lock(&dentry->d_lock); + dentry->d_count--; + spin_unlock(&dentry->d_lock); + shrink_dcache_for_umount_subtree(dentry); + + while (!hlist_bl_empty(&sb->s_anon)) { + dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); + shrink_dcache_for_umount_subtree(dentry); + } +} + +/* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches. + */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + +/* + * Search for at least 1 mount point in the dentry's subdirs. + * We descend to the next level whenever the d_subdirs + * list is non-empty and continue searching. + */ + +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. 
+ * + * Return true if the parent or its subdirectories contain + * a mount point + */ +int have_submounts(struct dentry *parent) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = parent; + + if (d_mountpoint(parent)) + goto positive; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + /* Have we found a mount point ? */ + if (d_mountpoint(dentry)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&this_parent->d_lock); + goto positive; + } + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) + goto rename_retry; + next = child->d_u.d_child.next; + goto resume; + } + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return 0; /* No mount points found in tree */ +positive: + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return 1; + +rename_retry: + locked = 1; + write_seqlock(&rename_lock); + goto again; +} +EXPORT_SYMBOL(have_submounts); + +/* + * Search the dentry child list for the specified parent, + * and move any unused dentries to the end of the unused + * list for prune_dcache(). We descend to the next level + * whenever the d_subdirs list is non-empty and continue + * searching. + * + * It returns zero iff there are no unused children, + * otherwise it returns the number of children moved to + * the end of the unused list. This may not be the total + * number of unused children, because select_parent can + * drop the lock and return early due to latency + * constraints. + */ +static int select_parent(struct dentry * parent) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq; + int found = 0; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = parent; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + /* + * move only zero ref count dentries to the end + * of the unused list for prune_dcache + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. + */ + if (dentry->d_count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + dentry_lru_move_tail(dentry); + found++; + } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). 
We'll be coming back to find + * the rest. + */ + if (found && need_resched()) { + spin_unlock(&dentry->d_lock); + goto out; + } + + /* + * Descend a level if the d_subdirs list is non-empty. + */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) + goto rename_retry; + next = child->d_u.d_child.next; + goto resume; + } +out: + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return found; + +rename_retry: + if (found) + return found; + locked = 1; + write_seqlock(&rename_lock); + goto again; +} + +/** + * shrink_dcache_parent - prune dcache + * @parent: parent of entries to prune + * + * Prune the dcache to remove unused children of the parent dentry. + */ + +void shrink_dcache_parent(struct dentry * parent) +{ + struct super_block *sb = parent->d_sb; + int found; + + while ((found = select_parent(parent)) != 0) + __shrink_dcache_sb(sb, &found, 0); +} +EXPORT_SYMBOL(shrink_dcache_parent); + +/* + * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain. + * + * We need to avoid reentering the filesystem if the caller is performing a + * GFP_NOFS allocation attempt. One example deadlock is: + * + * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache-> + * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode-> + * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK. + * + * In this case we return -1 to tell the caller that we baled. + */ +static int shrink_dcache_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if (nr) { + if (!(gfp_mask & __GFP_FS)) + return -1; + prune_dcache(nr); + } + + return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker dcache_shrinker = { + .shrink = shrink_dcache_memory, + .seeks = DEFAULT_SEEKS, +}; + +/** + * d_alloc - allocate a dcache entry + * @parent: parent of entry to allocate + * @name: qstr of the name + * + * Allocates a dentry. It returns %NULL if there is insufficient memory + * available. On a success the dentry is returned. The name passed in is + * copied and the copy passed in may be reused after this call. 
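+ * Short names are copied into the embedded d_iname array; longer
+ * names get a separate kmalloc()ed buffer.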
+ */ + +struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) +{ + struct dentry *dentry; + char *dname; + + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) + return NULL; + + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) { + kmem_cache_free(dentry_cache, dentry); + return NULL; + } + } else { + dname = dentry->d_iname; + } + dentry->d_name.name = dname; + + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); + dname[name->len] = 0; + + dentry->d_count = 1; + dentry->d_flags = 0; + spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); + dentry->d_inode = NULL; + dentry->d_parent = NULL; + dentry->d_sb = NULL; + dentry->d_op = NULL; + dentry->d_fsdata = NULL; + INIT_HLIST_BL_NODE(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_subdirs); + INIT_LIST_HEAD(&dentry->d_alias); + INIT_LIST_HEAD(&dentry->d_u.d_child); + + if (parent) { + spin_lock(&parent->d_lock); + /* + * don't need child lock because it is not subject + * to concurrency here + */ + __dget_dlock(parent); + dentry->d_parent = parent; + dentry->d_sb = parent->d_sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); + spin_unlock(&parent->d_lock); + } + + this_cpu_inc(nr_dentry); + + return dentry; +} +EXPORT_SYMBOL(d_alloc); + +struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) +{ + struct dentry *dentry = d_alloc(NULL, name); + if (dentry) { + dentry->d_sb = sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); + dentry->d_parent = dentry; + dentry->d_flags |= DCACHE_DISCONNECTED; + } + return dentry; +} +EXPORT_SYMBOL(d_alloc_pseudo); + +struct dentry *d_alloc_name(struct dentry *parent, const char *name) +{ + struct qstr q; + + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len); + return d_alloc(parent, &q); +} +EXPORT_SYMBOL(d_alloc_name); + +void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + WARN_ON_ONCE(dentry->d_op); + WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | + DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | + DCACHE_OP_DELETE )); + dentry->d_op = op; + if (!op) + return; + if (op->d_hash) + dentry->d_flags |= DCACHE_OP_HASH; + if (op->d_compare) + dentry->d_flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + dentry->d_flags |= DCACHE_OP_REVALIDATE; + if (op->d_delete) + dentry->d_flags |= DCACHE_OP_DELETE; + +} +EXPORT_SYMBOL(d_set_d_op); + +static void __d_instantiate(struct dentry *dentry, struct inode *inode) +{ + spin_lock(&dentry->d_lock); + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; + list_add(&dentry->d_alias, &inode->i_dentry); + } + dentry->d_inode = inode; + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + fsnotify_d_instantiate(dentry, inode); +} + +/** + * d_instantiate - fill in inode information for a dentry + * @entry: dentry to complete + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. + * + * This turns negative dentries into productive full members + * of society. + * + * NOTE! This assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. 
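+ * Passing a %NULL @inode leaves @entry as a negative dentry.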
+ */ + +void d_instantiate(struct dentry *entry, struct inode * inode) +{ + BUG_ON(!list_empty(&entry->d_alias)); + if (inode) + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + if (inode) + spin_unlock(&inode->i_lock); + security_d_instantiate(entry, inode); +} +EXPORT_SYMBOL(d_instantiate); + +/** + * d_instantiate_unique - instantiate a non-aliased dentry + * @entry: dentry to instantiate + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. On success, it returns NULL. + * If an unhashed alias of "entry" already exists, then we return the + * aliased dentry instead and drop one reference to inode. + * + * Note that in order to avoid conflicts with rename() etc, the caller + * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. + */ +static struct dentry *__d_instantiate_unique(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + if (!inode) { + __d_instantiate(entry, NULL); + return NULL; + } + + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct qstr *qstr = &alias->d_name; + + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. + */ + if (qstr->hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (dentry_cmp(qstr->name, qstr->len, name, len)) + continue; + __dget(alias); + return alias; + } + + __d_instantiate(entry, inode); + return NULL; +} + +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *result; + + BUG_ON(!list_empty(&entry->d_alias)); + + if (inode) + spin_lock(&inode->i_lock); + result = __d_instantiate_unique(entry, inode); + if (inode) + spin_unlock(&inode->i_lock); + + if (!result) { + security_d_instantiate(entry, inode); + return NULL; + } + + BUG_ON(!d_unhashed(result)); + iput(inode); + return result; +} + +EXPORT_SYMBOL(d_instantiate_unique); + +/** + * d_alloc_root - allocate root dentry + * @root_inode: inode to allocate the root for + * + * Allocate a root ("/") dentry for the inode given. The inode is + * instantiated and returned. %NULL is returned if there is insufficient + * memory or the inode passed is %NULL. 
+ */ + +struct dentry * d_alloc_root(struct inode * root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = { .name = "/", .len = 1 }; + + res = d_alloc(NULL, &name); + if (res) { + res->d_sb = root_inode->i_sb; + d_set_d_op(res, res->d_sb->s_d_op); + res->d_parent = res; + d_instantiate(res, root_inode); + } + } + return res; +} +EXPORT_SYMBOL(d_alloc_root); + +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + __dget(alias); + return alias; +} + +static struct dentry * d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} + + +/** + * d_obtain_alias - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). + * + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and will be the error will be propagate to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_alias(struct inode *inode) +{ + static const struct qstr anonstring = { .name = "" }; + struct dentry *tmp; + struct dentry *res; + + if (!inode) + return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + res = d_find_any_alias(inode); + if (res) + goto out_iput; + + tmp = d_alloc(NULL, &anonstring); + if (!tmp) { + res = ERR_PTR(-ENOMEM); + goto out_iput; + } + tmp->d_parent = tmp; /* make sure dput doesn't croak */ + + + spin_lock(&inode->i_lock); + res = __d_find_any_alias(inode); + if (res) { + spin_unlock(&inode->i_lock); + dput(tmp); + goto out_iput; + } + + /* attach a disconnected dentry */ + spin_lock(&tmp->d_lock); + tmp->d_sb = inode->i_sb; + d_set_d_op(tmp, tmp->d_sb->s_d_op); + tmp->d_inode = inode; + tmp->d_flags |= DCACHE_DISCONNECTED; + list_add(&tmp->d_alias, &inode->i_dentry); + hlist_bl_lock(&tmp->d_sb->s_anon); + hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); + hlist_bl_unlock(&tmp->d_sb->s_anon); + spin_unlock(&tmp->d_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(tmp, inode); + + return tmp; + + out_iput: + if (res && !IS_ERR(res)) + security_d_instantiate(res, inode); + iput(inode); + return res; +} +EXPORT_SYMBOL(d_obtain_alias); + +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and + * DCACHE_DISCONNECTED), then d_move that in place of the given dentry + * and return it, else simply d_add the inode to the dentry and return NULL. 
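[Editor's aside] d_obtain_alias() above reports failure through its return pointer: a NULL inode becomes ERR_PTR(-ESTALE), an IS_ERR() inode is passed straight through, and allocation failure becomes ERR_PTR(-ENOMEM). A minimal userspace sketch of that pointer-encoded error convention (simplified stand-ins for the helpers in linux/err.h, not the kernel's own definitions):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in an (invalid) pointer value. */
static inline void *err_ptr(long error)     { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup_thing(int exists)
{
    static int thing = 42;

    if (!exists)
        return err_ptr(-ENOENT);      /* the error travels inside the pointer */
    return &thing;
}

int main(void)
{
    void *p = lookup_thing(0);

    if (is_err(p))
        printf("lookup failed: %ld\n", ptr_err(p));
    else
        printf("value: %d\n", *(int *)p);
    return 0;
}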
+ * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *new = NULL; + + if (inode && S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + new = __d_find_alias(inode, 1); + if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); + d_move(new, dentry); + iput(inode); + } else { + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(dentry, inode); + d_rehash(dentry); + } + } else + d_add(dentry, inode); + return new; +} +EXPORT_SYMBOL(d_splice_alias); + +/** + * d_add_ci - lookup or allocate new dentry with case-exact name + * @inode: the inode case-insensitive lookup has found + * @dentry: the negative dentry that was passed to the parent's lookup func + * @name: the case-exact name to be associated with the returned dentry + * + * This is to avoid filling the dcache with case-insensitive names to the + * same inode, only the actual correct case is stored in the dcache for + * case-insensitive filesystems. + * + * For a case-insensitive lookup match and if the the case-exact dentry + * already exists in in the dcache, use it and return it. + * + * If no entry exists with the exact case name, allocate new dentry with + * the exact case, and return the spliced entry. + */ +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, + struct qstr *name) +{ + int error; + struct dentry *found; + struct dentry *new; + + /* + * First check if a dentry matching the name already exists, + * if not go ahead and create it now. + */ + found = d_hash_and_lookup(dentry->d_parent, name); + if (!found) { + new = d_alloc(dentry->d_parent, name); + if (!new) { + error = -ENOMEM; + goto err_out; + } + + found = d_splice_alias(inode, new); + if (found) { + dput(new); + return found; + } + return new; + } + + /* + * If a matching dentry exists, and it's not negative use it. + * + * Decrement the reference count to balance the iget() done + * earlier on. + */ + if (found->d_inode) { + if (unlikely(found->d_inode != inode)) { + /* This can't happen because bad inodes are unhashed. */ + BUG_ON(!is_bad_inode(inode)); + BUG_ON(!is_bad_inode(found->d_inode)); + } + iput(inode); + return found; + } + + /* + * Negative dentry: instantiate it unless the inode is a directory and + * already has a dentry. + */ + spin_lock(&inode->i_lock); + if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { + __d_instantiate(found, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(found, inode); + return found; + } + + /* + * In case a directory already has a (disconnected) entry grab a + * reference to it, move it in place and use it. 
+ */ + new = list_entry(inode->i_dentry.next, struct dentry, d_alias); + __dget(new); + spin_unlock(&inode->i_lock); + security_d_instantiate(found, inode); + d_move(new, found); + iput(inode); + dput(found); + return new; + +err_out: + iput(inode); + return ERR_PTR(error); +} +EXPORT_SYMBOL(d_add_ci); + +/** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seq: returns d_seq value at the point where the dentry was found + * @inode: returns dentry->d_inode when the inode was found valid. + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. Thus, an interlocking stepping of sequence lock checks + * is formed, giving integrity down the path walk. + */ +struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Carefully use d_seq when comparing a candidate dentry, to avoid + * races with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. + */ + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + struct inode *i; + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + +seqretry: + *seq = read_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + i = dentry->d_inode; + prefetch(tname); + /* + * This seqcount check is required to ensure name and + * len are loaded atomically, so as not to walk off the + * edge of memory when walking. If we could load this + * atomically some other way, we could drop this check. + */ + if (read_seqcount_retry(&dentry->d_seq, *seq)) + goto seqretry; + if (parent->d_flags & DCACHE_OP_COMPARE) { + if (parent->d_op->d_compare(parent, *inode, + dentry, i, + tlen, tname, name)) + continue; + } else { + if (dentry_cmp(tname, tlen, str, len)) + continue; + } + /* + * No extra seqcount check is required after the name + * compare. 
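[Editor's aside] __d_lookup_rcu() above (like __d_lookup() further down) walks a single hash chain and rejects candidates cheaply by comparing the precomputed hash before doing the byte-wise name compare. A self-contained sketch of that hash-then-compare chain walk; the hash function is a stand-in, not the kernel's full_name_hash():

#include <stdio.h>
#include <string.h>

#define NBUCKETS 8                    /* power of two so the hash can be masked */

struct dent {
    struct dent *next;                /* hash chain link */
    unsigned int hash;                /* precomputed name hash */
    const char *name;
};

static struct dent *buckets[NBUCKETS];

static unsigned int name_hash(const char *s)
{
    unsigned int h = 0;

    while (*s)
        h = h * 31 + (unsigned char)*s++;   /* stand-in hash */
    return h;
}

static void insert(struct dent *d, const char *name)
{
    unsigned int b;

    d->name = name;
    d->hash = name_hash(name);
    b = d->hash & (NBUCKETS - 1);
    d->next = buckets[b];
    buckets[b] = d;
}

static struct dent *lookup(const char *name)
{
    unsigned int hash = name_hash(name);
    struct dent *d;

    for (d = buckets[hash & (NBUCKETS - 1)]; d; d = d->next) {
        if (d->hash != hash)          /* cheap rejection first */
            continue;
        if (strcmp(d->name, name))    /* full compare only on a hash match */
            continue;
        return d;
    }
    return NULL;
}

int main(void)
{
    struct dent a, b;

    insert(&a, "passwd");
    insert(&b, "hosts");
    printf("found: %s\n", lookup("hosts") ? lookup("hosts")->name : "(none)");
    return 0;
}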
The caller must perform a seqcount check in + * order to do anything useful with the returned dentry + * anyway. + */ + *inode = i; + return dentry; + } + return NULL; +} + +/** + * d_lookup - search for a dentry + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * d_lookup searches the children of the parent dentry for the name in + * question. If the dentry is found its reference count is incremented and the + * dentry is returned. The caller must use dput to free the entry when it has + * finished using it. %NULL is returned if the dentry does not exist. + */ +struct dentry *d_lookup(struct dentry *parent, struct qstr *name) +{ + struct dentry *dentry; + unsigned seq; + + do { + seq = read_seqbegin(&rename_lock); + dentry = __d_lookup(parent, name); + if (dentry) + break; + } while (read_seqretry(&rename_lock, seq)); + return dentry; +} +EXPORT_SYMBOL(d_lookup); + +/** + * __d_lookup - search for a dentry (racy) + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * __d_lookup is like d_lookup, however it may (rarely) return a + * false-negative result due to unrelated rename activity. + * + * __d_lookup is slightly faster by avoiding rename_lock read seqlock, + * however it must be used carefully, eg. with a following d_lookup in + * the case of failure. + * + * __d_lookup callers must be commented. + */ +struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *found = NULL; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Take d_lock when comparing a candidate dentry, to avoid races + * with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. + */ + rcu_read_lock(); + + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent != parent) + goto next; + if (d_unhashed(dentry)) + goto next; + + /* + * It is safe to compare names since d_move() cannot + * change the qstr (protected by d_lock). + */ + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + if (parent->d_flags & DCACHE_OP_COMPARE) { + if (parent->d_op->d_compare(parent, parent->d_inode, + dentry, dentry->d_inode, + tlen, tname, name)) + goto next; + } else { + if (dentry_cmp(tname, tlen, str, len)) + goto next; + } + + dentry->d_count++; + found = dentry; + spin_unlock(&dentry->d_lock); + break; +next: + spin_unlock(&dentry->d_lock); + } + rcu_read_unlock(); + + return found; +} + +/** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On hash failure or on lookup failure NULL is returned. 
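[Editor's aside] Both __d_lookup_rcu() above (via d_seq) and d_lookup() (via rename_lock) use the same read pattern: sample a sequence counter, copy what is needed, then re-check the counter and retry if a writer ran in between. A minimal C11 sketch of that seqcount-style read retry, simplified from the kernel's seqcount/seqlock primitives:

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

struct record {
    atomic_uint seq;                  /* even = stable, odd = write in progress */
    char name[32];
};

/* Writer side: make seq odd, update the payload, make it even again. */
static void record_set(struct record *r, const char *name)
{
    atomic_fetch_add(&r->seq, 1);
    strncpy(r->name, name, sizeof(r->name) - 1);
    r->name[sizeof(r->name) - 1] = '\0';
    atomic_fetch_add(&r->seq, 1);
}

/* Reader side: retry until an undisturbed snapshot has been copied. */
static void record_get(struct record *r, char *out, size_t outlen)
{
    unsigned int start;

    do {
        start = atomic_load(&r->seq);
        memcpy(out, r->name, outlen);
    } while ((start & 1) || start != atomic_load(&r->seq));
}

int main(void)
{
    struct record r = { .name = "alpha" };
    char buf[sizeof(r.name)];

    record_set(&r, "beta");
    record_get(&r, buf, sizeof(buf));
    printf("%s\n", buf);
    return 0;
}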
+ */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + struct dentry *dentry = NULL; + + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(name->name, name->len); + if (dir->d_flags & DCACHE_OP_HASH) { + if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) + goto out; + } + dentry = d_lookup(dir, name); +out: + return dentry; +} + +/** + * d_validate - verify dentry provided from insecure source (deprecated) + * @dentry: The dentry alleged to be valid child of @dparent + * @dparent: The parent dentry (known to be valid) + * + * An insecure source has sent us a dentry, here we verify it and dget() it. + * This is used by ncpfs in its readdir implementation. + * Zero is returned in the dentry is invalid. + * + * This function is slow for big directories, and deprecated, do not use it. + */ +int d_validate(struct dentry *dentry, struct dentry *dparent) +{ + struct dentry *child; + + spin_lock(&dparent->d_lock); + list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { + if (dentry == child) { + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&dparent->d_lock); + return 1; + } + } + spin_unlock(&dparent->d_lock); + + return 0; +} +EXPORT_SYMBOL(d_validate); + +/* + * When a file is deleted, we have two options: + * - turn this dentry into a negative dentry + * - unhash this dentry and free it. + * + * Usually, we want to just turn this into + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users + */ + +/** + * d_delete - delete a dentry + * @dentry: The dentry to delete + * + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later + */ + +void d_delete(struct dentry * dentry) +{ + struct inode *inode; + int isdir = 0; + /* + * Are we the only user? + */ +again: + spin_lock(&dentry->d_lock); + inode = dentry->d_inode; + isdir = S_ISDIR(inode->i_mode); + if (dentry->d_count == 1) { + if (inode && !spin_trylock(&inode->i_lock)) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + goto again; + } + dentry->d_flags &= ~DCACHE_CANT_MOUNT; + dentry_unlink_inode(dentry); + fsnotify_nameremove(dentry, isdir); + return; + } + + if (!d_unhashed(dentry)) + __d_drop(dentry); + + spin_unlock(&dentry->d_lock); + + fsnotify_nameremove(dentry, isdir); +} +EXPORT_SYMBOL(d_delete); + +static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) +{ + BUG_ON(!d_unhashed(entry)); + hlist_bl_lock(b); + entry->d_flags |= DCACHE_RCUACCESS; + hlist_bl_add_head_rcu(&entry->d_hash, b); + hlist_bl_unlock(b); +} + +static void _d_rehash(struct dentry * entry) +{ + __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); +} + +/** + * d_rehash - add an entry back to the hash + * @entry: dentry to add to the hash + * + * Adds a dentry to the hash according to its name. 
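[Editor's aside] d_delete() above takes d_lock first but also needs i_lock; rather than impose an ordering it uses spin_trylock() and, if the inode lock is contended, drops d_lock, calls cpu_relax() and starts over. A small pthreads sketch of that trylock-and-retry idiom (lock names are hypothetical):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/*
 * Take lock_a then lock_b without imposing a global ordering: if lock_b is
 * contended, back off completely and retry, so a thread that holds lock_b
 * and is waiting for lock_a can make progress.
 */
static void lock_both(void)
{
    for (;;) {
        pthread_mutex_lock(&lock_a);
        if (pthread_mutex_trylock(&lock_b) == 0)
            return;                   /* got both */
        pthread_mutex_unlock(&lock_a);
        sched_yield();                /* analogous to cpu_relax() */
    }
}

int main(void)
{
    lock_both();
    puts("holding both locks");
    pthread_mutex_unlock(&lock_b);
    pthread_mutex_unlock(&lock_a);
    return 0;
}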
+ */ + +void d_rehash(struct dentry * entry) +{ + spin_lock(&entry->d_lock); + _d_rehash(entry); + spin_unlock(&entry->d_lock); +} +EXPORT_SYMBOL(d_rehash); + +/** + * dentry_update_name_case - update case insensitive dentry with a new name + * @dentry: dentry to be updated + * @name: new name + * + * Update a case insensitive dentry with new case of name. + * + * dentry must have been returned by d_lookup with name @name. Old and new + * name lengths must match (ie. no d_compare which allows mismatched name + * lengths). + * + * Parent inode i_mutex must be held over d_lookup and into this call (to + * keep renames and concurrent inserts, and readdir(2) away). + */ +void dentry_update_name_case(struct dentry *dentry, struct qstr *name) +{ + BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ + + spin_lock(&dentry->d_lock); + write_seqcount_begin(&dentry->d_seq); + memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); + write_seqcount_end(&dentry->d_seq); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(dentry_update_name_case); + +static void switch_names(struct dentry *dentry, struct dentry *target) +{ + if (dname_external(target)) { + if (dname_external(dentry)) { + /* + * Both external: swap the pointers + */ + swap(target->d_name.name, dentry->d_name.name); + } else { + /* + * dentry:internal, target:external. Steal target's + * storage and make target internal. + */ + memcpy(target->d_iname, dentry->d_name.name, + dentry->d_name.len + 1); + dentry->d_name.name = target->d_name.name; + target->d_name.name = target->d_iname; + } + } else { + if (dname_external(dentry)) { + /* + * dentry:external, target:internal. Give dentry's + * storage to target and make dentry internal + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + target->d_name.name = dentry->d_name.name; + dentry->d_name.name = dentry->d_iname; + } else { + /* + * Both are internal. Just copy target to dentry + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + dentry->d_name.len = target->d_name.len; + return; + } + } + swap(dentry->d_name.len, target->d_name.len); +} + +static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) +{ + /* + * XXXX: do we really need to take target->d_lock? + */ + if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) + spin_lock(&target->d_parent->d_lock); + else { + if (d_ancestor(dentry->d_parent, target->d_parent)) { + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } else { + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&dentry->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + } + if (target < dentry) { + spin_lock_nested(&target->d_lock, 2); + spin_lock_nested(&dentry->d_lock, 3); + } else { + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); + } +} + +static void dentry_unlock_parents_for_move(struct dentry *dentry, + struct dentry *target) +{ + if (target->d_parent != dentry->d_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (target->d_parent != target) + spin_unlock(&target->d_parent->d_lock); +} + +/* + * When switching names, the actual string doesn't strictly have to + * be preserved in the target - because we're dropping the target + * anyway. As such, we can just do a simple memcpy() to copy over + * the new name before we switch. 
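[Editor's aside] dentry_lock_for_move() above has to lock two dentries with no natural hierarchy between them, so it picks an arbitrary but globally consistent order: the lower pointer is locked first. A brief pthreads sketch of address-ordered locking, reduced to two plain mutexes:

#include <pthread.h>
#include <stdio.h>

struct obj {
    pthread_mutex_t lock;
    int value;
};

/* Lock two objects in a globally consistent (address) order, as the kernel does. */
static void lock_pair(struct obj *a, struct obj *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
        return;
    }
    if (a > b) {                      /* normalize so a is the lower address */
        struct obj *tmp = a;
        a = b;
        b = tmp;
    }
    pthread_mutex_lock(&a->lock);
    pthread_mutex_lock(&b->lock);
}

int main(void)
{
    struct obj x = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct obj y = { PTHREAD_MUTEX_INITIALIZER, 2 };

    lock_pair(&y, &x);                /* the same order is taken regardless of argument order */
    printf("%d %d\n", x.value, y.value);
    pthread_mutex_unlock(&x.lock);
    pthread_mutex_unlock(&y.lock);
    return 0;
}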
+ * + * Note that we have to be a lot more careful about getting the hash + * switched - we have to switch the hash value properly even if it + * then no longer matches the actual (corrupted) string of the target. + * The hash value has to match the hash queue that the dentry is on.. + */ +/* + * __d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. Caller hold + * rename_lock. + */ +static void __d_move(struct dentry * dentry, struct dentry * target) +{ + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + + BUG_ON(d_ancestor(dentry, target)); + BUG_ON(d_ancestor(target, dentry)); + + dentry_lock_for_move(dentry, target); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&target->d_seq); + + /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ + + /* + * Move the dentry to the target hash queue. Don't bother checking + * for the same hash queue because of how unlikely it is. + */ + __d_drop(dentry); + __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); + + /* Unhash the target: dput() will then get rid of it */ + __d_drop(target); + + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); + + /* Switch the names.. */ + switch_names(dentry, target); + swap(dentry->d_name.hash, target->d_name.hash); + + /* ... and switch the parents */ + if (IS_ROOT(dentry)) { + dentry->d_parent = target->d_parent; + target->d_parent = target; + INIT_LIST_HEAD(&target->d_u.d_child); + } else { + swap(dentry->d_parent, target->d_parent); + + /* And add them back to the (new) parent lists */ + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); + } + + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + + dentry_unlock_parents_for_move(dentry, target); + spin_unlock(&target->d_lock); + fsnotify_d_move(dentry); + spin_unlock(&dentry->d_lock); +} + +/* + * d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. + */ +void d_move(struct dentry *dentry, struct dentry *target) +{ + write_seqlock(&rename_lock); + __d_move(dentry, target); + write_sequnlock(&rename_lock); +} +EXPORT_SYMBOL(d_move); + +/** + * d_ancestor - search for an ancestor + * @p1: ancestor dentry + * @p2: child dentry + * + * Returns the ancestor dentry of p2 which is a child of p1, if p1 is + * an ancestor of p2, else NULL. + */ +struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + for (p = p2; !IS_ROOT(p); p = p->d_parent) { + if (p->d_parent == p1) + return p; + } + return NULL; +} + +/* + * This helper attempts to cope with remotely renamed directories + * + * It assumes that the caller is already holding + * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * + * Note: If ever the locking in lock_rename() changes, then please + * remember to update this too... 
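[Editor's aside] d_ancestor() above answers "is p1 an ancestor of p2, and through which child" by walking p2's d_parent chain until it hits a root (a dentry that is its own parent). The same walk over a toy parent-linked tree:

#include <stdio.h>
#include <stddef.h>

struct node {
    struct node *parent;              /* a root points to itself, like IS_ROOT() */
    const char *name;
};

/* Return the child of @anc on the path down to @n, or NULL if @anc is not an ancestor. */
static struct node *ancestor_link(struct node *anc, struct node *n)
{
    struct node *p;

    for (p = n; p->parent != p; p = p->parent) {
        if (p->parent == anc)
            return p;
    }
    return NULL;
}

int main(void)
{
    struct node root = { &root, "/" };
    struct node usr  = { &root, "usr" };
    struct node bin  = { &usr,  "bin" };
    struct node *link = ancestor_link(&root, &bin);

    printf("child of / on the way to bin: %s\n", link ? link->name : "(none)");
    return 0;
}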
+ */ +static struct dentry *__d_unalias(struct inode *inode, + struct dentry *dentry, struct dentry *alias) +{ + struct mutex *m1 = NULL, *m2 = NULL; + struct dentry *ret; + + /* If alias and dentry share a parent, then no extra locks required */ + if (alias->d_parent == dentry->d_parent) + goto out_unalias; + + /* See lock_rename() */ + ret = ERR_PTR(-EBUSY); + if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) + goto out_err; + m1 = &dentry->d_sb->s_vfs_rename_mutex; + if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + goto out_err; + m2 = &alias->d_parent->d_inode->i_mutex; +out_unalias: + __d_move(alias, dentry); + ret = alias; +out_err: + spin_unlock(&inode->i_lock); + if (m2) + mutex_unlock(m2); + if (m1) + mutex_unlock(m1); + return ret; +} + +/* + * Prepare an anonymous dentry for life in the superblock's dentry tree as a + * named dentry in place of the dentry to be replaced. + * returns with anon->d_lock held! + */ +static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) +{ + struct dentry *dparent, *aparent; + + dentry_lock_for_move(anon, dentry); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&anon->d_seq); + + dparent = dentry->d_parent; + aparent = anon->d_parent; + + switch_names(dentry, anon); + swap(dentry->d_name.hash, anon->d_name.hash); + + dentry->d_parent = (aparent == anon) ? dentry : aparent; + list_del(&dentry->d_u.d_child); + if (!IS_ROOT(dentry)) + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&dentry->d_u.d_child); + + anon->d_parent = (dparent == dentry) ? anon : dparent; + list_del(&anon->d_u.d_child); + if (!IS_ROOT(anon)) + list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&anon->d_u.d_child); + + write_seqcount_end(&dentry->d_seq); + write_seqcount_end(&anon->d_seq); + + dentry_unlock_parents_for_move(anon, dentry); + spin_unlock(&dentry->d_lock); + + /* anon->d_lock still locked, returns locked */ + anon->d_flags &= ~DCACHE_DISCONNECTED; +} + +/** + * d_materialise_unique - introduce an inode into the tree + * @dentry: candidate dentry + * @inode: inode to bind to the dentry, to which aliases may be attached + * + * Introduces an dentry into the tree, substituting an extant disconnected + * root directory alias in its place if there is one + */ +struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) +{ + struct dentry *actual; + + BUG_ON(!d_unhashed(dentry)); + + if (!inode) { + actual = dentry; + __d_instantiate(dentry, NULL); + d_rehash(actual); + goto out_nolock; + } + + spin_lock(&inode->i_lock); + + if (S_ISDIR(inode->i_mode)) { + struct dentry *alias; + + /* Does an aliased dentry already exist? */ + alias = __d_find_alias(inode, 0); + if (alias) { + actual = alias; + write_seqlock(&rename_lock); + + if (d_ancestor(alias, dentry)) { + /* Check for loops */ + actual = ERR_PTR(-ELOOP); + spin_unlock(&inode->i_lock); + } else if (IS_ROOT(alias)) { + /* Is this an anonymous mountpoint that we + * could splice into our tree? */ + __d_materialise_dentry(dentry, alias); + write_sequnlock(&rename_lock); + __d_drop(alias); + goto found; + } else { + /* Nope, but we must(!) avoid directory + * aliasing. 
This drops inode->i_lock */ + actual = __d_unalias(inode, dentry, alias); + } + write_sequnlock(&rename_lock); + if (IS_ERR(actual)) + dput(alias); + goto out_nolock; + } + } + + /* Add a unique reference */ + actual = __d_instantiate_unique(dentry, inode); + if (!actual) + actual = dentry; + else + BUG_ON(!d_unhashed(actual)); + + spin_lock(&actual->d_lock); +found: + _d_rehash(actual); + spin_unlock(&actual->d_lock); + spin_unlock(&inode->i_lock); +out_nolock: + if (actual == dentry) { + security_d_instantiate(dentry, inode); + return NULL; + } + + iput(inode); + return actual; +} +EXPORT_SYMBOL_GPL(d_materialise_unique); + +static int prepend(char **buffer, int *buflen, const char *str, int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +static int prepend_name(char **buffer, int *buflen, struct qstr *name) +{ + return prepend(buffer, buflen, name->name, name->len); +} + +/** + * prepend_path - Prepend path string to a buffer + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buffer: pointer to the end of the buffer + * @buflen: pointer to buffer length + * + * Caller holds the rename_lock. + */ +static int prepend_path(const struct path *path, + const struct path *root, + char **buffer, int *buflen) +{ + struct dentry *dentry = path->dentry; + struct vfsmount *vfsmnt = path->mnt; + bool slash = false; + int error = 0; + + br_read_lock(vfsmount_lock); + while (dentry != root->dentry || vfsmnt != root->mnt) { + struct dentry * parent; + + if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + /* Global root? */ + if (vfsmnt->mnt_parent == vfsmnt) { + goto global_root; + } + dentry = vfsmnt->mnt_mountpoint; + vfsmnt = vfsmnt->mnt_parent; + continue; + } + parent = dentry->d_parent; + prefetch(parent); + spin_lock(&dentry->d_lock); + error = prepend_name(buffer, buflen, &dentry->d_name); + spin_unlock(&dentry->d_lock); + if (!error) + error = prepend(buffer, buflen, "/", 1); + if (error) + break; + + slash = true; + dentry = parent; + } + + if (!error && !slash) + error = prepend(buffer, buflen, "/", 1); + +out: + br_read_unlock(vfsmount_lock); + return error; + +global_root: + /* + * Filesystems needing to implement special "root names" + * should do so with ->d_dname() + */ + if (IS_ROOT(dentry) && + (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { + WARN(1, "Root dentry has weird name <%.*s>\n", + (int) dentry->d_name.len, dentry->d_name.name); + } + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; + goto out; +} + +/** + * __d_path - return the path of a dentry + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. + * + * Returns a pointer into the buffer or an error code if the + * path was too long. + * + * "buflen" should be positive. + * + * If the path is not reachable from the supplied root, return %NULL. 
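[Editor's aside] prepend_path() above builds the name back to front: the caller passes a pointer just past the end of the buffer, and each component plus a '/' is prepended while walking toward the root, so the finished string ends up right-aligned in the buffer. A minimal userspace sketch of that backwards construction (mount crossings and the rename_lock retry are omitted):

#include <stdio.h>
#include <string.h>

struct node {
    struct node *parent;              /* a root points to itself */
    const char *name;
};

static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
    *buflen -= namelen;
    if (*buflen < 0)
        return -1;                    /* would not fit: -ENAMETOOLONG in the kernel */
    *buffer -= namelen;
    memcpy(*buffer, str, namelen);
    return 0;
}

/* Build "/a/b/c" into the tail of @buf and return a pointer into it. */
static char *node_path(struct node *n, char *buf, int buflen)
{
    char *end = buf + buflen;
    int has_slash = 0;

    if (prepend(&end, &buflen, "", 1))            /* terminating NUL */
        return NULL;
    while (n->parent != n) {                      /* walk up until the root */
        if (prepend(&end, &buflen, n->name, strlen(n->name)) ||
            prepend(&end, &buflen, "/", 1))
            return NULL;
        has_slash = 1;
        n = n->parent;
    }
    if (!has_slash && prepend(&end, &buflen, "/", 1))   /* the root itself */
        return NULL;
    return end;                                   /* note: not the start of buf */
}

int main(void)
{
    struct node root = { &root, "" };
    struct node etc  = { &root, "etc" };
    struct node host = { &etc,  "hosts" };
    char buf[64];

    printf("%s\n", node_path(&host, buf, sizeof(buf)));   /* prints /etc/hosts */
    return 0;
}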
+ */ +char *__d_path(const struct path *path, + const struct path *root, + char *buf, int buflen) +{ + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) + return ERR_PTR(error); + return res; +} + +/* + * same as __d_path but appends "(deleted)" for unlinked files. + */ +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) +{ + prepend(buf, buflen, "\0", 1); + if (d_unlinked(path->dentry)) { + int error = prepend(buf, buflen, " (deleted)", 10); + if (error) + return error; + } + + return prepend_path(path, root, buf, buflen); +} + +static int prepend_unreachable(char **buffer, int *buflen) +{ + return prepend(buffer, buflen, "(unreachable)", 13); +} + +/** + * d_path - return the path of a dentry + * @path: path to report + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. If the entry has been deleted + * the string " (deleted)" is appended. Note that this is ambiguous. + * + * Returns a pointer into the buffer or an error code if the path was + * too long. Note: Callers should use the returned pointer, not the passed + * in buffer, to use the name! The implementation often starts at an offset + * into the buffer, and may leave 0 bytes at the start. + * + * "buflen" should be positive. + */ +char *d_path(const struct path *path, char *buf, int buflen) +{ + char *res = buf + buflen; + struct path root; + int error; + + /* + * We have various synthetic filesystems that never get mounted. On + * these filesystems dentries are never used for lookup purposes, and + * thus don't need to be hashed. They also don't need a name until a + * user wants to identify the object in /proc/pid/fd/. The little hack + * below allows us to generate a name for these objects on demand: + */ + if (path->dentry->d_op && path->dentry->d_op->d_dname) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + get_fs_root(current->fs, &root); + write_seqlock(&rename_lock); + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) + res = ERR_PTR(error); + write_sequnlock(&rename_lock); + path_put(&root); + return res; +} +EXPORT_SYMBOL(d_path); + +/** + * d_path_with_unreachable - return the path of a dentry + * @path: path to report + * @buf: buffer to return value in + * @buflen: buffer length + * + * The difference from d_path() is that this prepends "(unreachable)" + * to paths which are unreachable from the current process' root. 
+ */ +char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) +{ + char *res = buf + buflen; + struct path root; + int error; + + if (path->dentry->d_op && path->dentry->d_op->d_dname) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + get_fs_root(current->fs, &root); + write_seqlock(&rename_lock); + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) + error = prepend_unreachable(&res, &buflen); + write_sequnlock(&rename_lock); + path_put(&root); + if (error) + res = ERR_PTR(error); + + return res; +} + +/* + * Helper function for dentry_operations.d_dname() members + */ +char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, + const char *fmt, ...) +{ + va_list args; + char temp[64]; + int sz; + + va_start(args, fmt); + sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; + va_end(args); + + if (sz > sizeof(temp) || sz > buflen) + return ERR_PTR(-ENAMETOOLONG); + + buffer += buflen - sz; + return memcpy(buffer, temp, sz); +} + +/* + * Write full pathname from the root of the filesystem into the buffer. + */ +static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + + prepend(&end, &buflen, "\0", 1); + if (buflen < 1) + goto Elong; + /* Get '/' right */ + retval = end-1; + *retval = '/'; + + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; + int error; + + prefetch(parent); + spin_lock(&dentry->d_lock); + error = prepend_name(&end, &buflen, &dentry->d_name); + spin_unlock(&dentry->d_lock); + if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) + goto Elong; + + retval = end; + dentry = parent; + } + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +{ + char *retval; + + write_seqlock(&rename_lock); + retval = __dentry_path(dentry, buf, buflen); + write_sequnlock(&rename_lock); + + return retval; +} +EXPORT_SYMBOL(dentry_path_raw); + +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *p = NULL; + char *retval; + + write_seqlock(&rename_lock); + if (d_unlinked(dentry)) { + p = buf + buflen; + if (prepend(&p, &buflen, "//deleted", 10) != 0) + goto Elong; + buflen++; + } + retval = __dentry_path(dentry, buf, buflen); + write_sequnlock(&rename_lock); + if (!IS_ERR(retval) && p) + *p = '/'; /* restore '/' overriden with '\0' */ + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +/* + * NOTE! The user-level library version returns a + * character pointer. The kernel system call just + * returns the length of the buffer filled (which + * includes the ending '\0' character), or a negative + * error value. 
So libc would do something like + * + * char *getcwd(char * buf, size_t size) + * { + * int retval; + * + * retval = sys_getcwd(buf, size); + * if (retval >= 0) + * return buf; + * errno = -retval; + * return NULL; + * } + */ +SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) +{ + int error; + struct path pwd, root; + char *page = (char *) __get_free_page(GFP_USER); + + if (!page) + return -ENOMEM; + + get_fs_root_and_pwd(current->fs, &root, &pwd); + + error = -ENOENT; + write_seqlock(&rename_lock); + if (!d_unlinked(pwd.dentry)) { + unsigned long len; + char *cwd = page + PAGE_SIZE; + int buflen = PAGE_SIZE; + + prepend(&cwd, &buflen, "\0", 1); + error = prepend_path(&pwd, &root, &cwd, &buflen); + write_sequnlock(&rename_lock); + + if (error < 0) + goto out; + + /* Unreachable from current root */ + if (error > 0) { + error = prepend_unreachable(&cwd, &buflen); + if (error) + goto out; + } + + error = -ERANGE; + len = PAGE_SIZE + page - cwd; + if (len <= size) { + error = len; + if (copy_to_user(buf, cwd, len)) + error = -EFAULT; + } + } else { + write_sequnlock(&rename_lock); + } + +out: + path_put(&pwd); + path_put(&root); + free_page((unsigned long) page); + return error; +} + +/* + * Test whether new_dentry is a subdirectory of old_dentry. + * + * Trivially implemented using the dcache structure + */ + +/** + * is_subdir - is new dentry a subdirectory of old_dentry + * @new_dentry: new dentry + * @old_dentry: old dentry + * + * Returns 1 if new_dentry is a subdirectory of the parent (at any depth). + * Returns 0 otherwise. + * Caller must ensure that "new_dentry" is pinned before calling is_subdir() + */ + +int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +{ + int result; + unsigned seq; + + if (new_dentry == old_dentry) + return 1; + + do { + /* for restarting inner loop in case of seq retry */ + seq = read_seqbegin(&rename_lock); + /* + * Need rcu_readlock to protect against the d_parent trashing + * due to d_move + */ + rcu_read_lock(); + if (d_ancestor(old_dentry, new_dentry)) + result = 1; + else + result = 0; + rcu_read_unlock(); + } while (read_seqretry(&rename_lock, seq)); + + return result; +} + +int path_is_under(struct path *path1, struct path *path2) +{ + struct vfsmount *mnt = path1->mnt; + struct dentry *dentry = path1->dentry; + int res; + + br_read_lock(vfsmount_lock); + if (mnt != path2->mnt) { + for (;;) { + if (mnt->mnt_parent == mnt) { + br_read_unlock(vfsmount_lock); + return 0; + } + if (mnt->mnt_parent == path2->mnt) + break; + mnt = mnt->mnt_parent; + } + dentry = mnt->mnt_mountpoint; + } + res = is_subdir(dentry, path2->dentry); + br_read_unlock(vfsmount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +void d_genocide(struct dentry *root) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = root; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (d_unhashed(dentry) || !dentry->d_inode) { + spin_unlock(&dentry->d_lock); + continue; + } + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); 
+ goto repeat; + } + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_count--; + } + spin_unlock(&dentry->d_lock); + } + if (this_parent != root) { + struct dentry *child = this_parent; + if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { + this_parent->d_flags |= DCACHE_GENOCIDE; + this_parent->d_count--; + } + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) + goto rename_retry; + next = child->d_u.d_child.next; + goto resume; + } + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return; + +rename_retry: + locked = 1; + write_seqlock(&rename_lock); + goto again; +} + +/** + * find_inode_number - check for dentry with name + * @dir: directory to check + * @name: Name to find. + * + * Check whether a dentry already exists for the given name, + * and return the inode number if it has an inode. Otherwise + * 0 is returned. + * + * This routine is used to post-process directory listings for + * filesystems using synthetic inode numbers, and is necessary + * to keep getcwd() working. + */ + +ino_t find_inode_number(struct dentry *dir, struct qstr *name) +{ + struct dentry * dentry; + ino_t ino = 0; + + dentry = d_hash_and_lookup(dir, name); + if (dentry) { + if (dentry->d_inode) + ino = dentry->d_inode->i_ino; + dput(dentry); + } + return ino; +} +EXPORT_SYMBOL(find_inode_number); + +static __initdata unsigned long dhash_entries; +static int __init set_dhash_entries(char *str) +{ + if (!str) + return 0; + dhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("dhash_entries=", set_dhash_entries); + +static void __init dcache_init_early(void) +{ + int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + HASH_EARLY, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); +} + +static void __init dcache_init(void) +{ + int loop; + + /* + * A constructor could be added for stable state like the lists, + * but it is probably not worth it because of the cache nature + * of the dcache. 
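[Editor's aside] dcache_init_early()/dcache_init() above size the dentry hash with alloc_large_system_hash(), which hands back a power-of-two table described by d_hash_shift and d_hash_mask so lookups can mask rather than divide. A simplified sketch of deriving such a shift/mask pair from an entry-count hint (the real helper also scales the table with available memory):

#include <stdio.h>

/* Round an entry hint up to a power of two and report the shift and mask. */
static unsigned long hash_table_size(unsigned long hint, unsigned int *shift,
                                     unsigned long *mask)
{
    unsigned int s = 0;

    if (hint < 2)
        hint = 2;
    while ((1UL << s) < hint)
        s++;
    *shift = s;
    *mask = (1UL << s) - 1;
    return 1UL << s;
}

int main(void)
{
    unsigned int shift;
    unsigned long mask;
    unsigned long n = hash_table_size(5000, &shift, &mask);

    printf("%lu buckets, shift %u, mask %#lx\n", n, shift, mask);
    return 0;
}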
+ */ + dentry_cache = KMEM_CACHE(dentry, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + + register_shrinker(&dcache_shrinker); + + /* Hash may have been set up in dcache_init_early */ + if (!hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); +} + +/* SLAB cache for __getname() consumers */ +struct kmem_cache *names_cachep __read_mostly; +EXPORT_SYMBOL(names_cachep); + +EXPORT_SYMBOL(d_genocide); + +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + +void __init vfs_caches_init(unsigned long mempages) +{ + unsigned long reserve; + + /* Base hash sizes on available memory, with a reserve equal to + 150% of current kernel size */ + + reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); + mempages -= reserve; + + names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + dcache_init(); + inode_init(); + files_init(mempages); + mnt_init(); + bdev_cache_init(); + chrdev_init(); +} diff --git a/fs/dcookies.c b/fs/dcookies.c new file mode 100644 index 00000000..dda0dc70 --- /dev/null +++ b/fs/dcookies.c @@ -0,0 +1,345 @@ +/* + * dcookies.c + * + * Copyright 2002 John Levon + * + * Persistent cookie-path mappings. These are used by + * profilers to convert a per-task EIP value into something + * non-transitory that can be processed at a later date. + * This is done by locking the dentry/vfsmnt pair in the + * kernel until released by the tasks needing the persistent + * objects. The tag is simply an unsigned long that refers + * to the pair and can be looked up from userspace. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The dcookies are allocated from a kmem_cache and + * hashed onto a small number of lists. 
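[Editor's aside] The dcookies code beginning above hands userspace an opaque unsigned long that is simply the pinned dentry's address, remembered in a small hash table so it can later be validated and converted back into a path. A minimal userspace sketch of that pointer-as-cookie idea (bucket count and shift are hypothetical stand-ins for the values used in the code below):

#include <stdio.h>

#define COOKIE_BUCKETS 64             /* power of two, so we can mask */
#define ADDR_SHIFT 6                  /* stand-in for L1_CACHE_SHIFT */

struct cookie_entry {
    struct cookie_entry *next;
    void *object;                     /* the pinned object the cookie refers to */
};

static struct cookie_entry *table[COOKIE_BUCKETS];

static unsigned long make_cookie(void *object)
{
    return (unsigned long)object;     /* the address itself is the cookie */
}

static unsigned int cookie_hash(unsigned long cookie)
{
    return (cookie >> ADDR_SHIFT) & (COOKIE_BUCKETS - 1);
}

static void remember(struct cookie_entry *e, void *object)
{
    unsigned int h = cookie_hash(make_cookie(object));

    e->object = object;
    e->next = table[h];
    table[h] = e;
}

/* Validate a cookie from an untrusted source before dereferencing it. */
static void *lookup_cookie(unsigned long cookie)
{
    struct cookie_entry *e;

    for (e = table[cookie_hash(cookie)]; e; e = e->next)
        if (make_cookie(e->object) == cookie)
            return e->object;
    return NULL;
}

int main(void)
{
    static char payload[] = "/usr/bin/true";
    struct cookie_entry e;
    unsigned long cookie;

    remember(&e, payload);
    cookie = make_cookie(payload);
    printf("%s\n", (char *)lookup_cookie(cookie));
    return 0;
}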
None of the + * code here is particularly performance critical + */ +struct dcookie_struct { + struct path path; + struct list_head hash_list; +}; + +static LIST_HEAD(dcookie_users); +static DEFINE_MUTEX(dcookie_mutex); +static struct kmem_cache *dcookie_cache __read_mostly; +static struct list_head *dcookie_hashtable __read_mostly; +static size_t hash_size __read_mostly; + +static inline int is_live(void) +{ + return !(list_empty(&dcookie_users)); +} + + +/* The dentry is locked, its address will do for the cookie */ +static inline unsigned long dcookie_value(struct dcookie_struct * dcs) +{ + return (unsigned long)dcs->path.dentry; +} + + +static size_t dcookie_hash(unsigned long dcookie) +{ + return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1); +} + + +static struct dcookie_struct * find_dcookie(unsigned long dcookie) +{ + struct dcookie_struct *found = NULL; + struct dcookie_struct * dcs; + struct list_head * pos; + struct list_head * list; + + list = dcookie_hashtable + dcookie_hash(dcookie); + + list_for_each(pos, list) { + dcs = list_entry(pos, struct dcookie_struct, hash_list); + if (dcookie_value(dcs) == dcookie) { + found = dcs; + break; + } + } + + return found; +} + + +static void hash_dcookie(struct dcookie_struct * dcs) +{ + struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs)); + list_add(&dcs->hash_list, list); +} + + +static struct dcookie_struct *alloc_dcookie(struct path *path) +{ + struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, + GFP_KERNEL); + struct dentry *d; + if (!dcs) + return NULL; + + d = path->dentry; + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_COOKIE; + spin_unlock(&d->d_lock); + + dcs->path = *path; + path_get(path); + hash_dcookie(dcs); + return dcs; +} + + +/* This is the main kernel-side routine that retrieves the cookie + * value for a dentry/vfsmnt pair. + */ +int get_dcookie(struct path *path, unsigned long *cookie) +{ + int err = 0; + struct dcookie_struct * dcs; + + mutex_lock(&dcookie_mutex); + + if (!is_live()) { + err = -EINVAL; + goto out; + } + + if (path->dentry->d_flags & DCACHE_COOKIE) { + dcs = find_dcookie((unsigned long)path->dentry); + } else { + dcs = alloc_dcookie(path); + if (!dcs) { + err = -ENOMEM; + goto out; + } + } + + *cookie = dcookie_value(dcs); + +out: + mutex_unlock(&dcookie_mutex); + return err; +} + + +/* And here is where the userspace process can look up the cookie value + * to retrieve the path. + */ +SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) +{ + unsigned long cookie = (unsigned long)cookie64; + int err = -EINVAL; + char * kbuf; + char * path; + size_t pathlen; + struct dcookie_struct * dcs; + + /* we could leak path information to users + * without dir read permission without this + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dcookie_mutex); + + if (!is_live()) { + err = -EINVAL; + goto out; + } + + if (!(dcs = find_dcookie(cookie))) + goto out; + + err = -ENOMEM; + kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kbuf) + goto out; + + /* FIXME: (deleted) ? 
*/ + path = d_path(&dcs->path, kbuf, PAGE_SIZE); + + mutex_unlock(&dcookie_mutex); + + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_free; + } + + err = -ERANGE; + + pathlen = kbuf + PAGE_SIZE - path; + if (pathlen <= len) { + err = pathlen; + if (copy_to_user(buf, path, pathlen)) + err = -EFAULT; + } + +out_free: + kfree(kbuf); + return err; +out: + mutex_unlock(&dcookie_mutex); + return err; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len) +{ + return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len); +} +SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie); +#endif + +static int dcookie_init(void) +{ + struct list_head * d; + unsigned int i, hash_bits; + int err = -ENOMEM; + + dcookie_cache = kmem_cache_create("dcookie_cache", + sizeof(struct dcookie_struct), + 0, 0, NULL); + + if (!dcookie_cache) + goto out; + + dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!dcookie_hashtable) + goto out_kmem; + + err = 0; + + /* + * Find the power-of-two list-heads that can fit into the allocation.. + * We don't guarantee that "sizeof(struct list_head)" is necessarily + * a power-of-two. + */ + hash_size = PAGE_SIZE / sizeof(struct list_head); + hash_bits = 0; + do { + hash_bits++; + } while ((hash_size >> hash_bits) != 0); + hash_bits--; + + /* + * Re-calculate the actual number of entries and the mask + * from the number of bits we can fit. + */ + hash_size = 1UL << hash_bits; + + /* And initialize the newly allocated array */ + d = dcookie_hashtable; + i = hash_size; + do { + INIT_LIST_HEAD(d); + d++; + i--; + } while (i); + +out: + return err; +out_kmem: + kmem_cache_destroy(dcookie_cache); + goto out; +} + + +static void free_dcookie(struct dcookie_struct * dcs) +{ + struct dentry *d = dcs->path.dentry; + + spin_lock(&d->d_lock); + d->d_flags &= ~DCACHE_COOKIE; + spin_unlock(&d->d_lock); + + path_put(&dcs->path); + kmem_cache_free(dcookie_cache, dcs); +} + + +static void dcookie_exit(void) +{ + struct list_head * list; + struct list_head * pos; + struct list_head * pos2; + struct dcookie_struct * dcs; + size_t i; + + for (i = 0; i < hash_size; ++i) { + list = dcookie_hashtable + i; + list_for_each_safe(pos, pos2, list) { + dcs = list_entry(pos, struct dcookie_struct, hash_list); + list_del(&dcs->hash_list); + free_dcookie(dcs); + } + } + + kfree(dcookie_hashtable); + kmem_cache_destroy(dcookie_cache); +} + + +struct dcookie_user { + struct list_head next; +}; + +struct dcookie_user * dcookie_register(void) +{ + struct dcookie_user * user; + + mutex_lock(&dcookie_mutex); + + user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL); + if (!user) + goto out; + + if (!is_live() && dcookie_init()) + goto out_free; + + list_add(&user->next, &dcookie_users); + +out: + mutex_unlock(&dcookie_mutex); + return user; +out_free: + kfree(user); + user = NULL; + goto out; +} + + +void dcookie_unregister(struct dcookie_user * user) +{ + mutex_lock(&dcookie_mutex); + + list_del(&user->next); + kfree(user); + + if (!is_live()) + dcookie_exit(); + + mutex_unlock(&dcookie_mutex); +} + +EXPORT_SYMBOL_GPL(dcookie_register); +EXPORT_SYMBOL_GPL(dcookie_unregister); +EXPORT_SYMBOL_GPL(get_dcookie); diff --git a/fs/debugfs/Makefile b/fs/debugfs/Makefile new file mode 100644 index 00000000..840c4569 --- /dev/null +++ b/fs/debugfs/Makefile @@ -0,0 +1,4 @@ +debugfs-objs := inode.o file.o + +obj-$(CONFIG_DEBUG_FS) += debugfs.o + diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c new file mode 100644 index 00000000..90f76575 --- 
/dev/null +++ b/fs/debugfs/file.c @@ -0,0 +1,527 @@ +/* + * file.c - part of debugfs, a tiny little debug file system + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * debugfs is for people to use instead of /proc or /sys. + * See Documentation/DocBook/filesystems for more details. + * + */ + +#include +#include +#include +#include +#include + +static ssize_t default_read_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t default_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return count; +} + +static int default_open(struct inode *inode, struct file *file) +{ + if (inode->i_private) + file->private_data = inode->i_private; + + return 0; +} + +const struct file_operations debugfs_file_operations = { + .read = default_read_file, + .write = default_write_file, + .open = default_open, + .llseek = noop_llseek, +}; + +static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, dentry->d_inode->i_private); + return NULL; +} + +const struct inode_operations debugfs_link_operations = { + .readlink = generic_readlink, + .follow_link = debugfs_follow_link, +}; + +static int debugfs_u8_set(void *data, u64 val) +{ + *(u8 *)data = val; + return 0; +} +static int debugfs_u8_get(void *data, u64 *val) +{ + *val = *(u8 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); + +/** + * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. 
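[Editor's aside] debugfs_create_u8() documented above (and its u16/u32/u64 siblings below) is normally called from a driver's init path, keeping only the returned dentries for later removal. A hedged usage sketch as a minimal module; the directory name and exported variable are hypothetical, and errors are handled with the NULL check the comment recommends:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/stat.h>

static struct dentry *demo_dir;
static u8 demo_threshold = 16;        /* value exposed read/write via debugfs */

static int __init demo_init(void)
{
    demo_dir = debugfs_create_dir("demo", NULL);   /* lands in the debugfs root */
    if (!demo_dir)
        return -ENODEV;

    /* <debugfs>/demo/threshold, readable by all, writable by root */
    if (!debugfs_create_u8("threshold", S_IRUGO | S_IWUSR, demo_dir,
                           &demo_threshold)) {
        debugfs_remove_recursive(demo_dir);
        return -ENODEV;
    }
    return 0;
}

static void __exit demo_exit(void)
{
    debugfs_remove_recursive(demo_dir);   /* no automatic cleanup, as the comment warns */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");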
+ */ +struct dentry *debugfs_create_u8(const char *name, mode_t mode, + struct dentry *parent, u8 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u8); +} +EXPORT_SYMBOL_GPL(debugfs_create_u8); + +static int debugfs_u16_set(void *data, u64 val) +{ + *(u16 *)data = val; + return 0; +} +static int debugfs_u16_get(void *data, u64 *val) +{ + *val = *(u16 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); + +/** + * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u16(const char *name, mode_t mode, + struct dentry *parent, u16 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u16); +} +EXPORT_SYMBOL_GPL(debugfs_create_u16); + +static int debugfs_u32_set(void *data, u64 val) +{ + *(u32 *)data = val; + return 0; +} +static int debugfs_u32_get(void *data, u64 *val) +{ + *val = *(u32 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); + +/** + * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. 
If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u32(const char *name, mode_t mode, + struct dentry *parent, u32 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u32); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32); + +static int debugfs_u64_set(void *data, u64 val) +{ + *(u64 *)data = val; + return 0; +} + +static int debugfs_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); + +/** + * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. 
+ */ +struct dentry *debugfs_create_u64(const char *name, mode_t mode, + struct dentry *parent, u64 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u64); +} +EXPORT_SYMBOL_GPL(debugfs_create_u64); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); + +/* + * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value + * + * These functions are exactly the same as the above functions (but use a hex + * output for the decimal challenged). For details look at the above unsigned + * decimal functions. + */ + +/** + * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x8(const char *name, mode_t mode, + struct dentry *parent, u8 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x8); +} +EXPORT_SYMBOL_GPL(debugfs_create_x8); + +/** + * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. 
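/*
 * Editorial usage sketch, not part of the original patch: exporting a
 * hypothetical hardware status word in hex with the helpers above.
 * "mydrv_dir" and "mydrv_status" are made-up names.
 */
static u32 mydrv_status;

static void mydrv_debugfs_init(struct dentry *mydrv_dir)
{
	/* 0444 has no write bits set, so the read-only fops variant is used */
	debugfs_create_x32("status", 0444, mydrv_dir, &mydrv_status);
}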
+ */ +struct dentry *debugfs_create_x16(const char *name, mode_t mode, + struct dentry *parent, u16 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x16); +} +EXPORT_SYMBOL_GPL(debugfs_create_x16); + +/** + * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x32(const char *name, mode_t mode, + struct dentry *parent, u32 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x32); +} +EXPORT_SYMBOL_GPL(debugfs_create_x32); + +/** + * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x64(const char *name, mode_t mode, + struct dentry *parent, u64 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_x64); +} +EXPORT_SYMBOL_GPL(debugfs_create_x64); + + +static int debugfs_size_t_set(void *data, u64 val) +{ + *(size_t *)data = val; + return 0; +} +static int debugfs_size_t_get(void *data, u64 *val) +{ + *val = *(size_t *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ + +/** + * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. 
+ */ +struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *parent, size_t *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_size_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_size_t); + + +static ssize_t read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[3]; + u32 *val = file->private_data; + + if (*val) + buf[0] = 'Y'; + else + buf[0] = 'N'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + size_t buf_size; + bool bv; + u32 *val = file->private_data; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + if (strtobool(buf, &bv) == 0) + *val = bv; + + return count; +} + +static const struct file_operations fops_bool = { + .read = read_file_bool, + .write = write_file_bool, + .open = default_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_bool(const char *name, mode_t mode, + struct dentry *parent, u32 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_bool); +} +EXPORT_SYMBOL_GPL(debugfs_create_bool); + +static ssize_t read_file_blob(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct debugfs_blob_wrapper *blob = file->private_data; + return simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); +} + +static const struct file_operations fops_blob = { + .read = read_file_blob, + .open = default_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_blob - create a debugfs file that is used to read a binary blob + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer + * to the blob data and the size of the data. 
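/*
 * Editorial usage sketch, not part of the original patch: exposing a
 * version string as a read-only blob via struct debugfs_blob_wrapper.
 * All names below are hypothetical.
 */
static char example_fw_version[] = "fw-1.2.3";
static struct debugfs_blob_wrapper example_fw_blob = {
	.data = example_fw_version,
	.size = sizeof(example_fw_version) - 1,
};
/* then: debugfs_create_blob("fw_version", 0444, parent, &example_fw_blob); */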
+ * + * This function creates a file in debugfs with the given name that exports + * @blob->data as a binary blob. If the @mode variable is so set it can be + * read from. Writing is not supported. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_blob(const char *name, mode_t mode, + struct dentry *parent, + struct debugfs_blob_wrapper *blob) +{ + return debugfs_create_file(name, mode, parent, blob, &fops_blob); +} +EXPORT_SYMBOL_GPL(debugfs_create_blob); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c new file mode 100644 index 00000000..e7a7a2f0 --- /dev/null +++ b/fs/debugfs/inode.c @@ -0,0 +1,544 @@ +/* + * file.c - part of debugfs, a tiny little debug file system + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * debugfs is for people to use instead of /proc or /sys. + * See Documentation/DocBook/kernel-api for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct vfsmount *debugfs_mount; +static int debugfs_mount_count; +static bool debugfs_registered; + +static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, + void *data, const struct file_operations *fops) + +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; + break; + case S_IFLNK: + inode->i_op = &debugfs_link_operations; + inode->i_fop = fops; + inode->i_private = data; + break; + case S_IFDIR: + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = fops ? fops : &simple_dir_operations; + inode->i_private = data; + + /* directory inodes start off with i_nlink == 2 + * (for "." 
entry) */ + inc_nlink(inode); + break; + } + } + return inode; +} + +/* SMP-safe */ +static int debugfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t dev, void *data, + const struct file_operations *fops) +{ + struct inode *inode; + int error = -EPERM; + + if (dentry->d_inode) + return -EEXIST; + + inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); + if (inode) { + d_instantiate(dentry, inode); + dget(dentry); + error = 0; + } + return error; +} + +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) +{ + int res; + + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); + if (!res) { + inc_nlink(dir); + fsnotify_mkdir(dir, dentry); + } + return res; +} + +static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) +{ + mode = (mode & S_IALLUGO) | S_IFLNK; + return debugfs_mknod(dir, dentry, mode, 0, data, fops); +} + +static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) +{ + int res; + + mode = (mode & S_IALLUGO) | S_IFREG; + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); + if (!res) + fsnotify_create(dir, dentry); + return res; +} + +static inline int debugfs_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +static int debug_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr debug_files[] = {{""}}; + + return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); +} + +static struct dentry *debug_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_single(fs_type, flags, data, debug_fill_super); +} + +static struct file_system_type debug_fs_type = { + .owner = THIS_MODULE, + .name = "debugfs", + .mount = debug_mount, + .kill_sb = kill_litter_super, +}; + +static int debugfs_create_by_name(const char *name, mode_t mode, + struct dentry *parent, + struct dentry **dentry, + void *data, + const struct file_operations *fops) +{ + int error = 0; + + /* If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. + */ + if (!parent) + parent = debugfs_mount->mnt_sb->s_root; + + *dentry = NULL; + mutex_lock(&parent->d_inode->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) { + switch (mode & S_IFMT) { + case S_IFDIR: + error = debugfs_mkdir(parent->d_inode, *dentry, mode, + data, fops); + break; + case S_IFLNK: + error = debugfs_link(parent->d_inode, *dentry, mode, + data, fops); + break; + default: + error = debugfs_create(parent->d_inode, *dentry, mode, + data, fops); + break; + } + dput(*dentry); + } else + error = PTR_ERR(*dentry); + mutex_unlock(&parent->d_inode->i_mutex); + + return error; +} + +/** + * debugfs_create_file - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this paramater is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. 
The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_file(const char *name, mode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ + struct dentry *dentry = NULL; + int error; + + pr_debug("debugfs: creating file '%s'\n",name); + + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, + &debugfs_mount_count); + if (error) + goto exit; + + error = debugfs_create_by_name(name, mode, parent, &dentry, + data, fops); + if (error) { + dentry = NULL; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + goto exit; + } +exit: + return dentry; +} +EXPORT_SYMBOL_GPL(debugfs_create_file); + +/** + * debugfs_create_dir - create a directory in the debugfs filesystem + * @name: a pointer to a string containing the name of the directory to + * create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this paramater is NULL, then the + * directory will be created in the root of the debugfs filesystem. + * + * This function creates a directory in debugfs with the given name. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) +{ + return debugfs_create_file(name, + S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + parent, NULL, NULL); +} +EXPORT_SYMBOL_GPL(debugfs_create_dir); + +/** + * debugfs_create_symlink- create a symbolic link in the debugfs filesystem + * @name: a pointer to a string containing the name of the symbolic link to + * create. + * @parent: a pointer to the parent dentry for this symbolic link. This + * should be a directory dentry if set. If this paramater is NULL, + * then the symbolic link will be created in the root of the debugfs + * filesystem. + * @target: a pointer to a string containing the path to the target of the + * symbolic link. + * + * This function creates a symbolic link with the given name in debugfs that + * links to the given target path. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the symbolic + * link is to be removed (no automatic cleanup happens if your module is + * unloaded, you are responsible here.) If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. 
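/*
 * Editorial usage sketch, not part of the original patch: a caller-supplied
 * file created with debugfs_create_file(), using the seq_file single_open()
 * helpers (assumes <linux/seq_file.h> and <linux/module.h>).  All names
 * below are hypothetical.
 */
static int example_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "hello from debugfs\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	/* inode->i_private carries the @data pointer given at create time */
	return single_open(file, example_show, inode->i_private);
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
/* then: debugfs_create_file("example", 0444, parent, NULL, &example_fops); */
/* and on teardown: debugfs_remove() (or debugfs_remove_recursive() on the
 * parent directory) -- there is no automatic cleanup at module unload */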
+ */ +struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, + const char *target) +{ + struct dentry *result; + char *link; + + link = kstrdup(target, GFP_KERNEL); + if (!link) + return NULL; + + result = debugfs_create_file(name, S_IFLNK | S_IRWXUGO, parent, link, + NULL); + if (!result) + kfree(link); + return result; +} +EXPORT_SYMBOL_GPL(debugfs_create_symlink); + +static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (debugfs_positive(dentry)) { + if (dentry->d_inode) { + dget(dentry); + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: + ret = simple_rmdir(parent->d_inode, dentry); + break; + case S_IFLNK: + kfree(dentry->d_inode->i_private); + /* fall through */ + default: + simple_unlink(parent->d_inode, dentry); + break; + } + if (!ret) + d_delete(dentry); + dput(dentry); + } + } + return ret; +} + +/** + * debugfs_remove - removes a file or directory from the debugfs filesystem + * @dentry: a pointer to a the dentry of the file or directory to be + * removed. + * + * This function removes a file or directory in debugfs that was previously + * created with a call to another debugfs function (like + * debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove(struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + mutex_lock(&parent->d_inode->i_mutex); + ret = __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + if (!ret) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove); + +/** + * debugfs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in debugfs that + * was previously created with a call to another debugfs function + * (like debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child; + struct dentry *parent; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + parent = dentry; + mutex_lock(&parent->d_inode->i_mutex); + + while (1) { + /* + * When all dentries under "parent" has been removed, + * walk up the tree until we reach our starting point. + */ + if (list_empty(&parent->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + if (parent == dentry) + break; + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + } + child = list_entry(parent->d_subdirs.next, struct dentry, + d_u.d_child); + next_sibling: + + /* + * If "child" isn't empty, walk down the tree and + * remove all its descendants first. + */ + if (!list_empty(&child->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + parent = child; + mutex_lock(&parent->d_inode->i_mutex); + continue; + } + __debugfs_remove(child, parent); + if (parent->d_subdirs.next == &child->d_u.d_child) { + /* + * Try the next sibling. 
+ */ + if (child->d_u.d_child.next != &parent->d_subdirs) { + child = list_entry(child->d_u.d_child.next, + struct dentry, + d_u.d_child); + goto next_sibling; + } + + /* + * Avoid infinite loop if we fail to remove + * one dentry. + */ + mutex_unlock(&parent->d_inode->i_mutex); + break; + } + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + } + + parent = dentry->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove_recursive); + +/** + * debugfs_rename - rename a file/directory in the debugfs filesystem + * @old_dir: a pointer to the parent dentry for the renamed object. This + * should be a directory dentry. + * @old_dentry: dentry of an object to be renamed. + * @new_dir: a pointer to the parent dentry where the object should be + * moved. This should be a directory dentry. + * @new_name: a pointer to a string containing the target name. + * + * This function renames a file/directory in debugfs. The target must not + * exist for rename to succeed. + * + * This function will return a pointer to old_dentry (which is updated to + * reflect renaming) if it succeeds. If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, + struct dentry *new_dir, const char *new_name) +{ + int error; + struct dentry *dentry = NULL, *trap; + const char *old_name; + + trap = lock_rename(new_dir, old_dir); + /* Source or destination directories don't exist? */ + if (!old_dir->d_inode || !new_dir->d_inode) + goto exit; + /* Source does not exist, cyclic rename, or mountpoint? */ + if (!old_dentry->d_inode || old_dentry == trap || + d_mountpoint(old_dentry)) + goto exit; + dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); + /* Lookup failed, cyclic rename or target exists? */ + if (IS_ERR(dentry) || dentry == trap || dentry->d_inode) + goto exit; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + + error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, + dentry); + if (error) { + fsnotify_oldname_free(old_name); + goto exit; + } + d_move(old_dentry, dentry); + fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, + S_ISDIR(old_dentry->d_inode->i_mode), + NULL, old_dentry); + fsnotify_oldname_free(old_name); + unlock_rename(new_dir, old_dir); + dput(dentry); + return old_dentry; +exit: + if (dentry && !IS_ERR(dentry)) + dput(dentry); + unlock_rename(new_dir, old_dir); + return NULL; +} +EXPORT_SYMBOL_GPL(debugfs_rename); + +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + +static struct kobject *debug_kobj; + +static int __init debugfs_init(void) +{ + int retval; + + debug_kobj = kobject_create_and_add("debug", kernel_kobj); + if (!debug_kobj) + return -EINVAL; + + retval = register_filesystem(&debug_fs_type); + if (retval) + kobject_put(debug_kobj); + else + debugfs_registered = true; + + return retval; +} +core_initcall(debugfs_init); + diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile new file mode 100644 index 00000000..236696ef --- /dev/null +++ b/fs/devpts/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux /dev/pts virtual filesystem. 
+# + +obj-$(CONFIG_UNIX98_PTYS) += devpts.o + +devpts-$(CONFIG_UNIX98_PTYS) := inode.o diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c new file mode 100644 index 00000000..2f27e578 --- /dev/null +++ b/fs/devpts/inode.c @@ -0,0 +1,572 @@ +/* -*- linux-c -*- --------------------------------------------------------- * + * + * linux/fs/devpts/inode.c + * + * Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEVPTS_DEFAULT_MODE 0600 +/* + * ptmx is a new node in /dev/pts and will be unused in legacy (single- + * instance) mode. To prevent surprises in user space, set permissions of + * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful + * permissions. + */ +#define DEVPTS_DEFAULT_PTMX_MODE 0000 +#define PTMX_MINOR 2 + +extern int pty_limit; /* Config limit on Unix98 ptys */ +static DEFINE_MUTEX(allocated_ptys_lock); + +static struct vfsmount *devpts_mnt; + +struct pts_mount_opts { + int setuid; + int setgid; + uid_t uid; + gid_t gid; + umode_t mode; + umode_t ptmxmode; + int newinstance; +}; + +enum { + Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + {Opt_ptmxmode, "ptmxmode=%o"}, + {Opt_newinstance, "newinstance"}, +#endif + {Opt_err, NULL} +}; + +struct pts_fs_info { + struct ida allocated_ptys; + struct pts_mount_opts mount_opts; + struct dentry *ptmx_dentry; +}; + +static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct super_block *pts_sb_from_inode(struct inode *inode) +{ +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + return inode->i_sb; +#endif + return devpts_mnt->mnt_sb; +} + +#define PARSE_MOUNT 0 +#define PARSE_REMOUNT 1 + +/* + * parse_mount_options(): + * Set @opts to mount options specified in @data. If an option is not + * specified in @data, set it to its default value. The exception is + * 'newinstance' option which can only be set/cleared on a mount (i.e. + * cannot be changed during remount). + * + * Note: @data may be NULL (in which case all options are set to default). 
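/*
 * Editorial example, not part of the original patch: a mount such as
 *
 *	mount -t devpts -o newinstance,mode=620,ptmxmode=666 devpts /dev/pts
 *
 * reaches parse_mount_options() as data = "newinstance,mode=620,ptmxmode=666"
 * and is parsed, roughly, into:
 *
 *	opts->mode        = 0620 & S_IALLUGO;
 *	opts->ptmxmode    = 0666 & S_IALLUGO;	(multiple-instance builds only)
 *	opts->newinstance = 1;			(honoured only when op == PARSE_MOUNT)
 *
 * with uid/gid left at their defaults because no uid=/gid= option was given.
 */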
+ */ +static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +{ + char *p; + + opts->setuid = 0; + opts->setgid = 0; + opts->uid = 0; + opts->gid = 0; + opts->mode = DEVPTS_DEFAULT_MODE; + opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 0; + + while ((p = strsep(&data, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + int option; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->uid = option; + opts->setuid = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->gid = option; + opts->setgid = 1; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + case Opt_ptmxmode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->ptmxmode = option & S_IALLUGO; + break; + case Opt_newinstance: + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 1; + break; +#endif + default: + printk(KERN_ERR "devpts: called with bogus options\n"); + return -EINVAL; + } + } + + return 0; +} + +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int mknod_ptmx(struct super_block *sb) +{ + int mode; + int rc = -ENOMEM; + struct dentry *dentry; + struct inode *inode; + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + mutex_lock(&root->d_inode->i_mutex); + + /* If we have already created ptmx node, return */ + if (fsi->ptmx_dentry) { + rc = 0; + goto out; + } + + dentry = d_alloc_name(root, "ptmx"); + if (!dentry) { + printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n"); + goto out; + } + + /* + * Create a new 'ptmx' node in this mount of devpts. + */ + inode = new_inode(sb); + if (!inode) { + printk(KERN_ERR "Unable to alloc inode for ptmx node\n"); + dput(dentry); + goto out; + } + + inode->i_ino = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + mode = S_IFCHR|opts->ptmxmode; + init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); + + d_add(dentry, inode); + + fsi->ptmx_dentry = dentry; + rc = 0; +out: + mutex_unlock(&root->d_inode->i_mutex); + return rc; +} + +static void update_ptmx_mode(struct pts_fs_info *fsi) +{ + struct inode *inode; + if (fsi->ptmx_dentry) { + inode = fsi->ptmx_dentry->d_inode; + inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; + } +} +#else +static inline void update_ptmx_mode(struct pts_fs_info *fsi) +{ + return; +} +#endif + +static int devpts_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + err = parse_mount_options(data, PARSE_REMOUNT, opts); + + /* + * parse_mount_options() restores options to default values + * before parsing and may have changed ptmxmode. So, update the + * mode in the inode too. Bogus options don't fail the remount, + * so do this even on error return. 
+ */ + update_ptmx_mode(fsi); + + return err; +} + +static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + if (opts->setuid) + seq_printf(seq, ",uid=%u", opts->uid); + if (opts->setgid) + seq_printf(seq, ",gid=%u", opts->gid); + seq_printf(seq, ",mode=%03o", opts->mode); +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); +#endif + + return 0; +} + +static const struct super_operations devpts_sops = { + .statfs = simple_statfs, + .remount_fs = devpts_remount, + .show_options = devpts_show_options, +}; + +static void *new_pts_fs_info(void) +{ + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return NULL; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + return fsi; +} + +static int +devpts_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *inode; + + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = DEVPTS_SUPER_MAGIC; + s->s_op = &devpts_sops; + s->s_time_gran = 1; + + s->s_fs_info = new_pts_fs_info(); + if (!s->s_fs_info) + goto fail; + + inode = new_inode(s); + if (!inode) + goto free_fsi; + inode->i_ino = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; + + s->s_root = d_alloc_root(inode); + if (s->s_root) + return 0; + + printk(KERN_ERR "devpts: get root dentry failed\n"); + iput(inode); + +free_fsi: + kfree(s->s_fs_info); +fail: + return -ENOMEM; +} + +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int compare_init_pts_sb(struct super_block *s, void *p) +{ + if (devpts_mnt) + return devpts_mnt->mnt_sb == s; + return 0; +} + +/* + * devpts_mount() + * + * If the '-o newinstance' mount option was specified, mount a new + * (private) instance of devpts. PTYs created in this instance are + * independent of the PTYs in other devpts instances. + * + * If the '-o newinstance' option was not specified, mount/remount the + * initial kernel mount of devpts. This type of mount gives the + * legacy, single-instance semantics. + * + * The 'newinstance' option is needed to support multiple namespace + * semantics in devpts while preserving backward compatibility of the + * current 'single-namespace' semantics. i.e all mounts of devpts + * without the 'newinstance' mount option should bind to the initial + * kernel mount, like mount_single(). + * + * Mounts with 'newinstance' option create a new, private namespace. + * + * NOTE: + * + * For single-mount semantics, devpts cannot use mount_single(), + * because mount_single()/sget() find and use the super-block from + * the most recent mount of devpts. But that recent mount may be a + * 'newinstance' mount and mount_single() would pick the newinstance + * super-block instead of the initial super-block. 
+ */ +static struct dentry *devpts_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int error; + struct pts_mount_opts opts; + struct super_block *s; + + error = parse_mount_options(data, PARSE_MOUNT, &opts); + if (error) + return ERR_PTR(error); + + if (opts.newinstance) + s = sget(fs_type, NULL, set_anon_super, NULL); + else + s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + if (!s->s_root) { + s->s_flags = flags; + error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) + goto out_undo_sget; + s->s_flags |= MS_ACTIVE; + } + + memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); + + error = mknod_ptmx(s); + if (error) + goto out_undo_sget; + + return dget(s->s_root); + +out_undo_sget: + deactivate_locked_super(s); + return ERR_PTR(error); +} + +#else +/* + * This supports only the legacy single-instance semantics (no + * multiple-instance semantics) + */ +static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, devpts_fill_super); +} +#endif + +static void devpts_kill_sb(struct super_block *sb) +{ + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + kfree(fsi); + kill_litter_super(sb); +} + +static struct file_system_type devpts_fs_type = { + .name = "devpts", + .mount = devpts_mount, + .kill_sb = devpts_kill_sb, +}; + +/* + * The normal naming convention is simply /dev/pts/; this conforms + * to the System V naming convention + */ + +int devpts_new_index(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + int index; + int ida_ret; + +retry: + if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) + return -ENOMEM; + + mutex_lock(&allocated_ptys_lock); + ida_ret = ida_get_new(&fsi->allocated_ptys, &index); + if (ida_ret < 0) { + mutex_unlock(&allocated_ptys_lock); + if (ida_ret == -EAGAIN) + goto retry; + return -EIO; + } + + if (index >= pty_limit) { + ida_remove(&fsi->allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -EIO; + } + mutex_unlock(&allocated_ptys_lock); + return index; +} + +void devpts_kill_index(struct inode *ptmx_inode, int idx) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + mutex_lock(&allocated_ptys_lock); + ida_remove(&fsi->allocated_ptys, idx); + mutex_unlock(&allocated_ptys_lock); +} + +int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) +{ + /* tty layer puts index from devpts_new_index() in here */ + int number = tty->index; + struct tty_driver *driver = tty->driver; + dev_t device = MKDEV(driver->major, driver->minor_start+number); + struct dentry *dentry; + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct inode *inode = new_inode(sb); + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + int ret = 0; + char s[12]; + + /* We're supposed to be given the slave end of a pty */ + BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); + BUG_ON(driver->subtype != PTY_TYPE_SLAVE); + + if (!inode) + return -ENOMEM; + + inode->i_ino = number + 3; + inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); + inode->i_gid = opts->setgid ? 
opts->gid : current_fsgid(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + init_special_inode(inode, S_IFCHR|opts->mode, device); + inode->i_private = tty; + tty->driver_data = inode; + + sprintf(s, "%d", number); + + mutex_lock(&root->d_inode->i_mutex); + + dentry = d_alloc_name(root, s); + if (dentry) { + d_add(dentry, inode); + fsnotify_create(root->d_inode, dentry); + } else { + iput(inode); + ret = -ENOMEM; + } + + mutex_unlock(&root->d_inode->i_mutex); + + return ret; +} + +struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) +{ + struct dentry *dentry; + struct tty_struct *tty; + + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + /* Ensure dentry has not been deleted by devpts_pty_kill() */ + dentry = d_find_alias(pts_inode); + if (!dentry) + return NULL; + + tty = NULL; + if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + tty = (struct tty_struct *)pts_inode->i_private; + + dput(dentry); + + return tty; +} + +void devpts_pty_kill(struct tty_struct *tty) +{ + struct inode *inode = tty->driver_data; + struct super_block *sb = pts_sb_from_inode(inode); + struct dentry *root = sb->s_root; + struct dentry *dentry; + + BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + mutex_lock(&root->d_inode->i_mutex); + + dentry = d_find_alias(inode); + + inode->i_nlink--; + d_delete(dentry); + dput(dentry); /* d_alloc_name() in devpts_pty_new() */ + dput(dentry); /* d_find_alias above */ + + mutex_unlock(&root->d_inode->i_mutex); +} + +static int __init init_devpts_fs(void) +{ + int err = register_filesystem(&devpts_fs_type); + if (!err) { + devpts_mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(devpts_mnt)) { + err = PTR_ERR(devpts_mnt); + unregister_filesystem(&devpts_fs_type); + } + } + return err; +} +module_init(init_devpts_fs) diff --git a/fs/direct-io.c b/fs/direct-io.c new file mode 100644 index 00000000..ac5f1641 --- /dev/null +++ b/fs/direct-io.c @@ -0,0 +1,1256 @@ +/* + * fs/direct-io.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * O_DIRECT + * + * 04Jul2002 Andrew Morton + * Initial version + * 11Sep2002 janetinc@us.ibm.com + * added readv/writev support. + * 29Oct2002 Andrew Morton + * rewrote bio_add_page() support. + * 30Oct2002 pbadari@us.ibm.com + * added support for non-aligned IO. + * 06Nov2002 pbadari@us.ibm.com + * added asynchronous IO support. + * 21Jul2003 nathans@sgi.com + * added IO completion notifier. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * How many user pages to map in one call to get_user_pages(). This determines + * the size of a structure on the stack. + */ +#define DIO_PAGES 64 + +/* + * This code generally works in units of "dio_blocks". A dio_block is + * somewhere between the hard sector size and the filesystem block size. it + * is determined on a per-invocation basis. When talking to the filesystem + * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity + * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted + * to bio_block quantities by shifting left by blkfactor. + * + * If blkfactor is zero then the user's request was aligned to the filesystem's + * blocksize. 
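/*
 * Editorial worked example, not part of the original patch: with 512-byte
 * dio blocks (blkbits = 9) on a filesystem using 4096-byte blocks
 * (i_blkbits = 12), blkfactor is roughly i_blkbits - blkbits = 3, and the
 * conversions described above are plain shifts:
 *
 *	fs_block  = dio_block >> blkfactor;	// dio_blocks -> fs blocks
 *	dio_block = fs_block  << blkfactor;	// fs blocks  -> dio_blocks
 *
 * blkfactor == 0 means the request was already aligned to the filesystem
 * block size and no scaling is needed.
 */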
+ */ + +struct dio { + /* BIO submission state */ + struct bio *bio; /* bio under assembly */ + struct inode *inode; + int rw; + loff_t i_size; /* i_size when submitted */ + int flags; /* doesn't change */ + unsigned blkbits; /* doesn't change */ + unsigned blkfactor; /* When we're using an alignment which + is finer than the filesystem's soft + blocksize, this specifies how much + finer. blkfactor=2 means 1/4-block + alignment. Does not change */ + unsigned start_zero_done; /* flag: sub-blocksize zeroing has + been performed at the start of a + write */ + int pages_in_io; /* approximate total IO pages */ + size_t size; /* total request size (doesn't change)*/ + sector_t block_in_file; /* Current offset into the underlying + file in dio_block units. */ + unsigned blocks_available; /* At block_in_file. changes */ + sector_t final_block_in_request;/* doesn't change */ + unsigned first_block_in_page; /* doesn't change, Used only once */ + int boundary; /* prev block is at a boundary */ + int reap_counter; /* rate limit reaping */ + get_block_t *get_block; /* block mapping function */ + dio_iodone_t *end_io; /* IO completion function */ + dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ + sector_t final_block_in_bio; /* current final block in bio + 1 */ + sector_t next_block_for_io; /* next block to be put under IO, + in dio_blocks units */ + struct buffer_head map_bh; /* last get_block() result */ + + /* + * Deferred addition of a page to the dio. These variables are + * private to dio_send_cur_page(), submit_page_section() and + * dio_bio_add_page(). + */ + struct page *cur_page; /* The page */ + unsigned cur_page_offset; /* Offset into it, in bytes */ + unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ + sector_t cur_page_block; /* Where it starts */ + loff_t cur_page_fs_offset; /* Offset in file */ + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + ssize_t result; /* IO result */ + + /* + * Page fetching state. These variables belong to dio_refill_pages(). + */ + int curr_page; /* changes */ + int total_pages; /* doesn't change */ + unsigned long curr_user_address;/* changes */ + + /* + * Page queue. These variables belong to dio_refill_pages() and + * dio_get_page(). + */ + unsigned head; /* next page to process */ + unsigned tail; /* last valid page + 1 */ + int page_errors; /* errno from get_user_pages() */ + + /* + * pages[] (and any fields placed after it) are not zeroed out at + * allocation time. Don't add new fields after pages[] unless you + * wish that they not be zeroed. + */ + struct page *pages[DIO_PAGES]; /* page buffer */ +}; + +/* + * How many pages are in the queue? + */ +static inline unsigned dio_pages_present(struct dio *dio) +{ + return dio->tail - dio->head; +} + +/* + * Go grab and pin some userspace pages. Typically we'll get 64 at a time. + */ +static int dio_refill_pages(struct dio *dio) +{ + int ret; + int nr_pages; + + nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + ret = get_user_pages_fast( + dio->curr_user_address, /* Where from? */ + nr_pages, /* How many pages? 
*/ + dio->rw == READ, /* Write to memory? */ + &dio->pages[0]); /* Put results here */ + + if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + struct page *page = ZERO_PAGE(0); + /* + * A memory fault, but the filesystem has some outstanding + * mapped blocks. We need to use those blocks up to avoid + * leaking stale data in the file. + */ + if (dio->page_errors == 0) + dio->page_errors = ret; + page_cache_get(page); + dio->pages[0] = page; + dio->head = 0; + dio->tail = 1; + ret = 0; + goto out; + } + + if (ret >= 0) { + dio->curr_user_address += ret * PAGE_SIZE; + dio->curr_page += ret; + dio->head = 0; + dio->tail = ret; + ret = 0; + } +out: + return ret; +} + +/* + * Get another userspace page. Returns an ERR_PTR on error. Pages are + * buffered inside the dio so that we can call get_user_pages() against a + * decent number of pages, less frequently. To provide nicer use of the + * L1 cache. + */ +static struct page *dio_get_page(struct dio *dio) +{ + if (dio_pages_present(dio) == 0) { + int ret; + + ret = dio_refill_pages(dio); + if (ret) + return ERR_PTR(ret); + BUG_ON(dio_pages_present(dio) == 0); + } + return dio->pages[dio->head++]; +} + +/** + * dio_complete() - called when all DIO BIO I/O has been completed + * @offset: the byte offset in the file of the completed operation + * + * This releases locks as dictated by the locking type, lets interested parties + * know that a DIO operation has completed, and calculates the resulting return + * code for the operation. + * + * It lets the filesystem know if it registered an interest earlier via + * get_block. Pass the private field of the map buffer_head so that + * filesystems can use it to hold additional state between get_block calls and + * dio_complete. + */ +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +{ + ssize_t transferred = 0; + + /* + * AIO submission can race with bio completion to get here while + * expecting to have the last io completed by bio completion. + * In that case -EIOCBQUEUED is in fact not an error we want + * to preserve through this call. + */ + if (ret == -EIOCBQUEUED) + ret = 0; + + if (dio->result) { + transferred = dio->result; + + /* Check for short read case */ + if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + transferred = dio->i_size - offset; + } + + if (ret == 0) + ret = dio->page_errors; + if (ret == 0) + ret = dio->io_error; + if (ret == 0) + ret = transferred; + + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + + return ret; +} + +static int dio_bio_complete(struct dio *dio, struct bio *bio); +/* + * Asynchronous IO callback. + */ +static void dio_bio_end_aio(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + unsigned long remaining; + unsigned long flags; + + /* cleanup the bio */ + dio_bio_complete(dio, bio); + + spin_lock_irqsave(&dio->bio_lock, flags); + remaining = --dio->refcount; + if (remaining == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (remaining == 0) { + dio_complete(dio, dio->iocb->ki_pos, 0, true); + kfree(dio); + } +} + +/* + * The BIO completion handler simply queues the BIO up for the process-context + * handler. + * + * During I/O bi_private points at the dio. 
After I/O, bi_private is used to + * implement a singly-linked list of completed BIOs, at dio->bio_list. + */ +static void dio_bio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&dio->bio_lock, flags); + bio->bi_private = dio->bio_list; + dio->bio_list = bio; + if (--dio->refcount == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); +} + +/** + * dio_end_io - handle the end io action for the given bio + * @bio: The direct io bio thats being completed + * @error: Error if there was one + * + * This is meant to be called by any filesystem that uses their own dio_submit_t + * so that the DIO specific endio actions are dealt with after the filesystem + * has done it's completion work. + */ +void dio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + + if (dio->is_async) + dio_bio_end_aio(bio, error); + else + dio_bio_end_io(bio, error); +} +EXPORT_SYMBOL_GPL(dio_end_io); + +static void +dio_bio_alloc(struct dio *dio, struct block_device *bdev, + sector_t first_sector, int nr_vecs) +{ + struct bio *bio; + + /* + * bio_alloc() is guaranteed to return a bio when called with + * __GFP_WAIT and we request a valid number of vectors. + */ + bio = bio_alloc(GFP_KERNEL, nr_vecs); + + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + if (dio->is_async) + bio->bi_end_io = dio_bio_end_aio; + else + bio->bi_end_io = dio_bio_end_io; + + dio->bio = bio; + dio->logical_offset_in_bio = dio->cur_page_fs_offset; +} + +/* + * In the AIO read case we speculatively dirty the pages before starting IO. + * During IO completion, any of these pages which happen to have been written + * back will be redirtied by bio_check_pages_dirty(). + * + * bios hold a dio reference between submit_bio and ->end_io. + */ +static void dio_bio_submit(struct dio *dio) +{ + struct bio *bio = dio->bio; + unsigned long flags; + + bio->bi_private = dio; + + spin_lock_irqsave(&dio->bio_lock, flags); + dio->refcount++; + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (dio->is_async && dio->rw == READ) + bio_set_pages_dirty(bio); + + if (dio->submit_io) + dio->submit_io(dio->rw, bio, dio->inode, + dio->logical_offset_in_bio); + else + submit_bio(dio->rw, bio); + + dio->bio = NULL; + dio->boundary = 0; + dio->logical_offset_in_bio = 0; +} + +/* + * Release any resources in case of a failure + */ +static void dio_cleanup(struct dio *dio) +{ + while (dio_pages_present(dio)) + page_cache_release(dio_get_page(dio)); +} + +/* + * Wait for the next BIO to complete. Remove it and return it. NULL is + * returned once all BIOs have been completed. This must only be called once + * all bios have been issued so that dio->refcount can only decrease. This + * requires that that the caller hold a reference on the dio. + */ +static struct bio *dio_await_one(struct dio *dio) +{ + unsigned long flags; + struct bio *bio = NULL; + + spin_lock_irqsave(&dio->bio_lock, flags); + + /* + * Wait as long as the list is empty and there are bios in flight. bio + * completion drops the count, maybe adds to the list, and wakes while + * holding the bio_lock so we don't need set_current_state()'s barrier + * and can call it after testing our condition. 
+ */ + while (dio->refcount > 1 && dio->bio_list == NULL) { + __set_current_state(TASK_UNINTERRUPTIBLE); + dio->waiter = current; + spin_unlock_irqrestore(&dio->bio_lock, flags); + io_schedule(); + /* wake up sets us TASK_RUNNING */ + spin_lock_irqsave(&dio->bio_lock, flags); + dio->waiter = NULL; + } + if (dio->bio_list) { + bio = dio->bio_list; + dio->bio_list = bio->bi_private; + } + spin_unlock_irqrestore(&dio->bio_lock, flags); + return bio; +} + +/* + * Process one completed BIO. No locks are held. + */ +static int dio_bio_complete(struct dio *dio, struct bio *bio) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec; + int page_no; + + if (!uptodate) + dio->io_error = -EIO; + + if (dio->is_async && dio->rw == READ) { + bio_check_pages_dirty(bio); /* transfers ownership */ + } else { + for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { + struct page *page = bvec[page_no].bv_page; + + if (dio->rw == READ && !PageCompound(page)) + set_page_dirty_lock(page); + page_cache_release(page); + } + bio_put(bio); + } + return uptodate ? 0 : -EIO; +} + +/* + * Wait on and process all in-flight BIOs. This must only be called once + * all bios have been issued so that the refcount can only decrease. + * This just waits for all bios to make it through dio_bio_complete. IO + * errors are propagated through dio->io_error and should be propagated via + * dio_complete(). + */ +static void dio_await_completion(struct dio *dio) +{ + struct bio *bio; + do { + bio = dio_await_one(dio); + if (bio) + dio_bio_complete(dio, bio); + } while (bio); +} + +/* + * A really large O_DIRECT read or write can generate a lot of BIOs. So + * to keep the memory consumption sane we periodically reap any completed BIOs + * during the BIO generation phase. + * + * This also helps to limit the peak amount of pinned userspace memory. + */ +static int dio_bio_reap(struct dio *dio) +{ + int ret = 0; + + if (dio->reap_counter++ >= 64) { + while (dio->bio_list) { + unsigned long flags; + struct bio *bio; + int ret2; + + spin_lock_irqsave(&dio->bio_lock, flags); + bio = dio->bio_list; + dio->bio_list = bio->bi_private; + spin_unlock_irqrestore(&dio->bio_lock, flags); + ret2 = dio_bio_complete(dio, bio); + if (ret == 0) + ret = ret2; + } + dio->reap_counter = 0; + } + return ret; +} + +/* + * Call into the fs to map some more disk blocks. We record the current number + * of available blocks at dio->blocks_available. These are in units of the + * fs blocksize, (1 << inode->i_blkbits). + * + * The fs is allowed to map lots of blocks at once. If it wants to do that, + * it uses the passed inode-relative block number as the file offset, as usual. + * + * get_block() is passed the number of i_blkbits-sized blocks which direct_io + * has remaining to do. The fs should not map more than this number of blocks. + * + * If the fs has mapped a lot of blocks, it should populate bh->b_size to + * indicate how much contiguous disk space has been made available at + * bh->b_blocknr. + * + * If *any* of the mapped blocks are new, then the fs must set buffer_new(). + * This isn't very efficient... + * + * In the case of filesystem holes: the fs may return an arbitrarily-large + * hole by returning an appropriate value in b_size and by clearing + * buffer_mapped(). However the direct-io code will only process holes one + * block at a time - it will repeatedly call get_block() as it walks the hole. 
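/*
 * Editorial worked example, not part of the original patch: if a request
 * still has 20 dio_blocks left to map and blkfactor = 3 (512-byte dio
 * blocks on a 4k filesystem), the filesystem is asked for
 *
 *	fs_count = 20 >> 3 = 2, rounded up to 3 because 20 is not a
 *	multiple of 8 (the partial fs block at the end still needs mapping)
 *
 * and it reports how much contiguous space it actually mapped back
 * through map_bh->b_size, as described above.
 */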
+ */ +static int get_more_blocks(struct dio *dio) +{ + int ret; + struct buffer_head *map_bh = &dio->map_bh; + sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + unsigned long fs_count; /* Number of filesystem-sized blocks */ + unsigned long dio_count;/* Number of dio_block-sized blocks */ + unsigned long blkmask; + int create; + + /* + * If there was a memory error and we've overwritten all the + * mapped blocks then we can now return that memory error + */ + ret = dio->page_errors; + if (ret == 0) { + BUG_ON(dio->block_in_file >= dio->final_block_in_request); + fs_startblk = dio->block_in_file >> dio->blkfactor; + dio_count = dio->final_block_in_request - dio->block_in_file; + fs_count = dio_count >> dio->blkfactor; + blkmask = (1 << dio->blkfactor) - 1; + if (dio_count & blkmask) + fs_count++; + + map_bh->b_state = 0; + map_bh->b_size = fs_count << dio->inode->i_blkbits; + + /* + * For writes inside i_size on a DIO_SKIP_HOLES filesystem we + * forbid block creations: only overwrites are permitted. + * We will return early to the caller once we see an + * unmapped buffer head returned, and the caller will fall + * back to buffered I/O. + * + * Otherwise the decision is left to the get_blocks method, + * which may decide to handle it or also return an unmapped + * buffer head. + */ + create = dio->rw & WRITE; + if (dio->flags & DIO_SKIP_HOLES) { + if (dio->block_in_file < (i_size_read(dio->inode) >> + dio->blkbits)) + create = 0; + } + + ret = (*dio->get_block)(dio->inode, fs_startblk, + map_bh, create); + } + return ret; +} + +/* + * There is no bio. Make one now. + */ +static int dio_new_bio(struct dio *dio, sector_t start_sector) +{ + sector_t sector; + int ret, nr_pages; + + ret = dio_bio_reap(dio); + if (ret) + goto out; + sector = start_sector << (dio->blkbits - 9); + nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(nr_pages, BIO_MAX_PAGES); + BUG_ON(nr_pages <= 0); + dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); + dio->boundary = 0; +out: + return ret; +} + +/* + * Attempt to put the current chunk of 'cur_page' into the current BIO. If + * that was successful then update final_block_in_bio and take a ref against + * the just-added page. + * + * Return zero on success. Non-zero means the caller needs to start a new BIO. + */ +static int dio_bio_add_page(struct dio *dio) +{ + int ret; + + ret = bio_add_page(dio->bio, dio->cur_page, + dio->cur_page_len, dio->cur_page_offset); + if (ret == dio->cur_page_len) { + /* + * Decrement count only, if we are done with this page + */ + if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) + dio->pages_in_io--; + page_cache_get(dio->cur_page); + dio->final_block_in_bio = dio->cur_page_block + + (dio->cur_page_len >> dio->blkbits); + ret = 0; + } else { + ret = 1; + } + return ret; +} + +/* + * Put cur_page under IO. The section of cur_page which is described by + * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page + * starts on-disk at cur_page_block. + * + * We take a ref against the page here (on behalf of its presence in the bio). + * + * The caller of this function is responsible for removing cur_page from the + * dio, and for dropping the refcount which came from that presence. 
+ */ +static int dio_send_cur_page(struct dio *dio) +{ + int ret = 0; + + if (dio->bio) { + loff_t cur_offset = dio->cur_page_fs_offset; + loff_t bio_next_offset = dio->logical_offset_in_bio + + dio->bio->bi_size; + + /* + * See whether this new request is contiguous with the old. + * + * Btrfs cannot handle having logically non-contiguous requests + * submitted. For example if you have + * + * Logical: [0-4095][HOLE][8192-12287] + * Physical: [0-4095] [4096-8191] + * + * We cannot submit those pages together as one BIO. So if our + * current logical offset in the file does not equal what would + * be the next logical offset in the bio, submit the bio we + * have. + */ + if (dio->final_block_in_bio != dio->cur_page_block || + cur_offset != bio_next_offset) + dio_bio_submit(dio); + /* + * Submit now if the underlying fs is about to perform a + * metadata read + */ + else if (dio->boundary) + dio_bio_submit(dio); + } + + if (dio->bio == NULL) { + ret = dio_new_bio(dio, dio->cur_page_block); + if (ret) + goto out; + } + + if (dio_bio_add_page(dio) != 0) { + dio_bio_submit(dio); + ret = dio_new_bio(dio, dio->cur_page_block); + if (ret == 0) { + ret = dio_bio_add_page(dio); + BUG_ON(ret != 0); + } + } +out: + return ret; +} + +/* + * An autonomous function to put a chunk of a page under deferred IO. + * + * The caller doesn't actually know (or care) whether this piece of page is in + * a BIO, or is under IO or whatever. We just take care of all possible + * situations here. The separation between the logic of do_direct_IO() and + * that of submit_page_section() is important for clarity. Please don't break. + * + * The chunk of page starts on-disk at blocknr. + * + * We perform deferred IO, by recording the last-submitted page inside our + * private part of the dio structure. If possible, we just expand the IO + * across that page here. + * + * If that doesn't work out then we put the old page into the bio and add this + * page to the dio instead. + */ +static int +submit_page_section(struct dio *dio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr) +{ + int ret = 0; + + if (dio->rw & WRITE) { + /* + * Read accounting is performed in submit_bio() + */ + task_io_account_write(len); + } + + /* + * Can we just grow the current page's presence in the dio? + */ + if ( (dio->cur_page == page) && + (dio->cur_page_offset + dio->cur_page_len == offset) && + (dio->cur_page_block + + (dio->cur_page_len >> dio->blkbits) == blocknr)) { + dio->cur_page_len += len; + + /* + * If dio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (dio->boundary) { + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + } + goto out; + } + + /* + * If there's a deferred page already there then send it. + */ + if (dio->cur_page) { + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + if (ret) + goto out; + } + + page_cache_get(page); /* It is in dio */ + dio->cur_page = page; + dio->cur_page_offset = offset; + dio->cur_page_len = len; + dio->cur_page_block = blocknr; + dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; +out: + return ret; +} + +/* + * Clean any dirty buffers in the blockdev mapping which alias newly-created + * file blocks. 
Only called for S_ISREG files - blockdevs do not set + * buffer_new + */ +static void clean_blockdev_aliases(struct dio *dio) +{ + unsigned i; + unsigned nblocks; + + nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + + for (i = 0; i < nblocks; i++) { + unmap_underlying_metadata(dio->map_bh.b_bdev, + dio->map_bh.b_blocknr + i); + } +} + +/* + * If we are not writing the entire block and get_block() allocated + * the block for us, we need to fill-in the unused portion of the + * block with zeros. This happens only if user-buffer, fileoffset or + * io length is not filesystem block-size multiple. + * + * `end' is zero if we're doing the start of the IO, 1 at the end of the + * IO. + */ +static void dio_zero_block(struct dio *dio, int end) +{ + unsigned dio_blocks_per_fs_block; + unsigned this_chunk_blocks; /* In dio_blocks */ + unsigned this_chunk_bytes; + struct page *page; + + dio->start_zero_done = 1; + if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + return; + + dio_blocks_per_fs_block = 1 << dio->blkfactor; + this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + + if (!this_chunk_blocks) + return; + + /* + * We need to zero out part of an fs block. It is either at the + * beginning or the end of the fs block. + */ + if (end) + this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; + + this_chunk_bytes = this_chunk_blocks << dio->blkbits; + + page = ZERO_PAGE(0); + if (submit_page_section(dio, page, 0, this_chunk_bytes, + dio->next_block_for_io)) + return; + + dio->next_block_for_io += this_chunk_blocks; +} + +/* + * Walk the user pages, and the file, mapping blocks to disk and generating + * a sequence of (page,offset,len,block) mappings. These mappings are injected + * into submit_page_section(), which takes care of the next stage of submission + * + * Direct IO against a blockdev is different from a file. Because we can + * happily perform page-sized but 512-byte aligned IOs. It is important that + * blockdev IO be able to have fine alignment and large sizes. + * + * So what we do is to permit the ->get_block function to populate bh.b_size + * with the size of IO which is permitted at this offset and this i_blkbits. + * + * For best results, the blockdev should be set up with 512-byte i_blkbits and + * it should set b_size to PAGE_SIZE or more inside get_block(). This gives + * fine alignment but still allows this function to work in PAGE_SIZE units. 
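 *
 * Editor's note (illustrative sketch, not part of the original patch): for
 * the blockdev case described above, get_block() only has to report that the
 * range up to the end of the device is mapped 1:1, capping b_size at what
 * was asked for. example_blkdev_get_block() is an invented name, not the
 * helper this patch actually uses for block devices; a request entirely past
 * the end of the device is left unmapped for reads and fails with -EIO for
 * writes.
 *
 *	static int example_blkdev_get_block(struct inode *inode, sector_t iblock,
 *					    struct buffer_head *bh, int create)
 *	{
 *		sector_t end_block = i_size_read(inode) >> inode->i_blkbits;
 *		unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
 *
 *		if (iblock >= end_block)
 *			return create ? -EIO : 0;
 *		if (max_blocks > end_block - iblock)
 *			max_blocks = end_block - iblock;
 *
 *		bh->b_bdev = I_BDEV(inode);
 *		bh->b_blocknr = iblock;
 *		bh->b_size = max_blocks << inode->i_blkbits;
 *		set_buffer_mapped(bh);
 *		return 0;
 *	}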
+ */ +static int do_direct_IO(struct dio *dio) +{ + const unsigned blkbits = dio->blkbits; + const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + struct page *page; + unsigned block_in_page; + struct buffer_head *map_bh = &dio->map_bh; + int ret = 0; + + /* The I/O can start at any block offset within the first page */ + block_in_page = dio->first_block_in_page; + + while (dio->block_in_file < dio->final_block_in_request) { + page = dio_get_page(dio); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + while (block_in_page < blocks_per_page) { + unsigned offset_in_page = block_in_page << blkbits; + unsigned this_chunk_bytes; /* # of bytes mapped */ + unsigned this_chunk_blocks; /* # of blocks */ + unsigned u; + + if (dio->blocks_available == 0) { + /* + * Need to go and map some more disk + */ + unsigned long blkmask; + unsigned long dio_remainder; + + ret = get_more_blocks(dio); + if (ret) { + page_cache_release(page); + goto out; + } + if (!buffer_mapped(map_bh)) + goto do_holes; + + dio->blocks_available = + map_bh->b_size >> dio->blkbits; + dio->next_block_for_io = + map_bh->b_blocknr << dio->blkfactor; + if (buffer_new(map_bh)) + clean_blockdev_aliases(dio); + + if (!dio->blkfactor) + goto do_holes; + + blkmask = (1 << dio->blkfactor) - 1; + dio_remainder = (dio->block_in_file & blkmask); + + /* + * If we are at the start of IO and that IO + * starts partway into a fs-block, + * dio_remainder will be non-zero. If the IO + * is a read then we can simply advance the IO + * cursor to the first block which is to be + * read. But if the IO is a write and the + * block was newly allocated we cannot do that; + * the start of the fs block must be zeroed out + * on-disk + */ + if (!buffer_new(map_bh)) + dio->next_block_for_io += dio_remainder; + dio->blocks_available -= dio_remainder; + } +do_holes: + /* Handle holes */ + if (!buffer_mapped(map_bh)) { + loff_t i_size_aligned; + + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw & WRITE) { + page_cache_release(page); + return -ENOTBLK; + } + + /* + * Be sure to account for a partial block as the + * last block in the file + */ + i_size_aligned = ALIGN(i_size_read(dio->inode), + 1 << blkbits); + if (dio->block_in_file >= + i_size_aligned >> blkbits) { + /* We hit eof */ + page_cache_release(page); + goto out; + } + zero_user(page, block_in_page << blkbits, + 1 << blkbits); + dio->block_in_file++; + block_in_page++; + goto next_block; + } + + /* + * If we're performing IO which has an alignment which + * is finer than the underlying fs, go check to see if + * we must zero out the start of this block. 
+ */ + if (unlikely(dio->blkfactor && !dio->start_zero_done)) + dio_zero_block(dio, 0); + + /* + * Work out, in this_chunk_blocks, how much disk we + * can add to this page + */ + this_chunk_blocks = dio->blocks_available; + u = (PAGE_SIZE - offset_in_page) >> blkbits; + if (this_chunk_blocks > u) + this_chunk_blocks = u; + u = dio->final_block_in_request - dio->block_in_file; + if (this_chunk_blocks > u) + this_chunk_blocks = u; + this_chunk_bytes = this_chunk_blocks << blkbits; + BUG_ON(this_chunk_bytes == 0); + + dio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, page, offset_in_page, + this_chunk_bytes, dio->next_block_for_io); + if (ret) { + page_cache_release(page); + goto out; + } + dio->next_block_for_io += this_chunk_blocks; + + dio->block_in_file += this_chunk_blocks; + block_in_page += this_chunk_blocks; + dio->blocks_available -= this_chunk_blocks; +next_block: + BUG_ON(dio->block_in_file > dio->final_block_in_request); + if (dio->block_in_file == dio->final_block_in_request) + break; + } + + /* Drop the ref which was taken in get_user_pages() */ + page_cache_release(page); + block_in_page = 0; + } +out: + return ret; +} + +/* + * Releases both i_mutex and i_alloc_sem + */ +static ssize_t +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, struct dio *dio) +{ + unsigned long user_addr; + unsigned long flags; + int seg; + ssize_t ret = 0; + ssize_t ret2; + size_t bytes; + + dio->inode = inode; + dio->rw = rw; + dio->blkbits = blkbits; + dio->blkfactor = inode->i_blkbits - blkbits; + dio->block_in_file = offset >> blkbits; + + dio->get_block = get_block; + dio->end_io = end_io; + dio->submit_io = submit_io; + dio->final_block_in_bio = -1; + dio->next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(dio->blkfactor)) + dio->pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + dio->pages_in_io += + ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE + - user_addr/PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + dio->size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + dio->final_block_in_request = dio->block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + dio->head = 0; + dio->tail = 0; + dio->curr_page = 0; + + dio->total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + dio->total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + dio->curr_user_address = user_addr; + + ret = do_direct_IO(dio); + + dio->result += iov[seg].iov_len - + ((dio->final_block_in_request - dio->block_in_file) << + blkbits); + + if (ret) { + dio_cleanup(dio); + break; + } + } /* end iovec loop */ + + if (ret == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + ret = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. 
+ */ + dio_zero_block(dio, 1); + + if (dio->cur_page) { + ret2 = dio_send_cur_page(dio); + if (ret == 0) + ret = ret2; + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + } + if (dio->bio) + dio_bio_submit(dio); + + /* + * It is possible that we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that it has achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write has been set up. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(ret == -EIOCBQUEUED); + if (dio->is_async && ret == 0 && dio->result && + ((rw & READ) || (dio->result == dio->size))) + ret = -EIOCBQUEUED; + + if (ret != -EIOCBQUEUED) + dio_await_completion(dio); + + /* + * Sync will always be dropping the final ref and completing the + * operation. AIO can do so if it was a broken operation as described + * above, or in fact if all the bios race to complete before we get + * here. In that case dio_complete() translates the EIOCBQUEUED into + * the proper return code that the caller will hand to aio_complete(). + * + * This is managed by the bio_lock instead of being an atomic_t so that + * completion paths can drop their ref and use the remaining count to + * decide to wake the submission path atomically. + */ + spin_lock_irqsave(&dio->bio_lock, flags); + ret2 = --dio->refcount; + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (ret2 == 0) { + ret = dio_complete(dio, offset, ret, false); + kfree(dio); + } else + BUG_ON(ret != -EIOCBQUEUED); + + return ret; +} + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held; for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken.
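 *
 * Editor's note (illustrative sketch, not part of the original patch): a
 * filesystem's ->direct_IO address_space operation typically reaches this
 * function through a one-line wrapper along the lines of the hypothetical
 * example below; example_direct_IO() and example_get_block() are invented
 * names, and DIO_LOCKING | DIO_SKIP_HOLES selects the "dumb filesystem"
 * locking scheme described above (compare the blockdev_direct_IO() helper
 * in linux/fs.h, which wraps this call in much the same way).
 *
 *	static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
 *					 const struct iovec *iov, loff_t offset,
 *					 unsigned long nr_segs)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 *					    iov, offset, nr_segs,
 *					    example_get_block, NULL, NULL,
 *					    DIO_LOCKING | DIO_SKIP_HOLES);
 *	}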
+ */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + int seg; + size_t size; + unsigned long addr; + unsigned blkbits = inode->i_blkbits; + unsigned bdev_blkbits = 0; + unsigned blocksize_mask = (1 << blkbits) - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + struct dio *dio; + + if (rw & WRITE) + rw = WRITE_ODIRECT; + + if (bdev) + bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + + if (offset & blocksize_mask) { + if (bdev) + blkbits = bdev_blkbits; + blocksize_mask = (1 << blkbits) - 1; + if (offset & blocksize_mask) + goto out; + } + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (bdev) + blkbits = bdev_blkbits; + blocksize_mask = (1 << blkbits) - 1; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } + } + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + /* + * Believe it or not, zeroing out the page array caused a .5% + * performance regression in a database benchmark. So, we take + * care to only zero out what's needed. + */ + memset(dio, 0, offsetof(struct dio, pages)); + + dio->flags = flags; + if (dio->flags & DIO_LOCKING) { + /* watch out for a 0 len io from a tricksy fs */ + if (rw == READ && end > offset) { + struct address_space *mapping = + iocb->ki_filp->f_mapping; + + /* will be released by direct_io_worker */ + mutex_lock(&inode->i_mutex); + + retval = filemap_write_and_wait_range(mapping, offset, + end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + kfree(dio); + goto out; + } + } + + /* + * Will be released at I/O completion, possibly in a + * different thread. + */ + down_read_non_owner(&inode->i_alloc_sem); + } + + /* + * For file extending writes updating i_size before data + * writeouts complete can expose uninitialized blocks. So + * even for AIO, we need to wait for i/o to complete before + * returning in this case. + */ + dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && + (end > i_size_read(inode))); + + retval = direct_io_worker(rw, iocb, inode, iov, offset, + nr_segs, blkbits, get_block, end_io, + submit_io, dio); + +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig new file mode 100644 index 00000000..1897eb1b --- /dev/null +++ b/fs/dlm/Kconfig @@ -0,0 +1,16 @@ +menuconfig DLM + tristate "Distributed Lock Manager (DLM)" + depends on EXPERIMENTAL && INET + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) + select IP_SCTP + help + A general purpose distributed lock manager for kernel or userspace + applications. + +config DLM_DEBUG + bool "DLM debugging" + depends on DLM + help + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. 
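Editor's note (illustration only, not part of the original patch): for the in-kernel use mentioned in the DLM help text above, a client creates a lockspace and requests locks roughly as sketched below. The dlm_new_lockspace() and dlm_lock() prototypes are assumed to be those declared in include/linux/dlm.h of this vintage and should be checked against that header; example_take_lock(), example_ast() and the "demo" names are invented for the example.

	#include <linux/dlm.h>

	static struct dlm_lksb example_lksb;

	/* completion AST: runs from the DLM callback context when the request finishes */
	static void example_ast(void *astarg)
	{
		pr_info("dlm demo: lock status %d\n", example_lksb.sb_status);
	}

	static int example_take_lock(void)
	{
		dlm_lockspace_t *ls;
		int error;

		/* create (or join) a lockspace named "demo" with a 64-byte LVB */
		error = dlm_new_lockspace("demo", strlen("demo"), &ls, 0, 64);
		if (error)
			return error;

		/* request an exclusive lock on the resource "demo-resource" */
		return dlm_lock(ls, DLM_LOCK_EX, &example_lksb, 0,
				"demo-resource", strlen("demo-resource"), 0,
				example_ast, NULL, NULL);
	}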
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile new file mode 100644 index 00000000..ca1c9124 --- /dev/null +++ b/fs/dlm/Makefile @@ -0,0 +1,21 @@ +obj-$(CONFIG_DLM) += dlm.o +dlm-y := ast.o \ + config.o \ + dir.o \ + lock.o \ + lockspace.o \ + main.o \ + member.o \ + memory.o \ + midcomms.o \ + netlink.o \ + lowcomms.o \ + plock.o \ + rcom.o \ + recover.o \ + recoverd.o \ + requestqueue.o \ + user.o \ + util.o +dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o + diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c new file mode 100644 index 00000000..abc49f29 --- /dev/null +++ b/fs/dlm/ast.c @@ -0,0 +1,346 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lock.h" +#include "user.h" +#include "ast.h" + +#define WAKE_ASTS 0 + +static uint64_t ast_seq_count; +static struct list_head ast_queue; +static spinlock_t ast_queue_lock; +static struct task_struct * astd_task; +static unsigned long astd_wakeflags; +static struct mutex astd_running; + + +static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +{ + int i; + + log_print("last_bast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_bast.seq, + lkb->lkb_last_bast.flags, + lkb->lkb_last_bast.mode, + lkb->lkb_last_bast.sb_status, + lkb->lkb_last_bast.sb_flags); + + log_print("last_cast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.flags, + lkb->lkb_last_cast.mode, + lkb->lkb_last_cast.sb_status, + lkb->lkb_last_cast.sb_flags); + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + log_print("cb %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_callbacks[i].seq, + lkb->lkb_callbacks[i].flags, + lkb->lkb_callbacks[i].mode, + lkb->lkb_callbacks[i].sb_status, + lkb->lkb_callbacks[i].sb_flags); + } +} + +void dlm_del_ast(struct dlm_lkb *lkb) +{ + spin_lock(&ast_queue_lock); + if (!list_empty(&lkb->lkb_astqueue)) + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&ast_queue_lock); +} + +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t prev_seq; + int prev_mode; + int i; + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (lkb->lkb_callbacks[i].seq) + continue; + + /* + * Suppress some redundant basts here, do more on removal. + * Don't even add a bast if the callback just before it + * is a bast for the same mode or a more restrictive mode. 
+ * (the addional > PR check is needed for PR/CW inversion) + */ + + if ((i > 0) && (flags & DLM_CB_BAST) && + (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { + + prev_seq = lkb->lkb_callbacks[i-1].seq; + prev_mode = lkb->lkb_callbacks[i-1].mode; + + if ((prev_mode == mode) || + (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { + + log_debug(ls, "skip %x add bast %llu mode %d " + "for bast %llu mode %d", + lkb->lkb_id, + (unsigned long long)seq, + mode, + (unsigned long long)prev_seq, + prev_mode); + return 0; + } + } + + lkb->lkb_callbacks[i].seq = seq; + lkb->lkb_callbacks[i].flags = flags; + lkb->lkb_callbacks[i].mode = mode; + lkb->lkb_callbacks[i].sb_status = status; + lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); + break; + } + + if (i == DLM_CALLBACKS_SIZE) { + log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, (unsigned long long)seq, + flags, mode, status, sbflags); + dlm_dump_lkb_callbacks(lkb); + return -1; + } + + return 0; +} + +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid) +{ + int i; + + *resid = 0; + + if (!lkb->lkb_callbacks[0].seq) + return -ENOENT; + + /* oldest undelivered cb is callbacks[0] */ + + memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); + + /* shift others down */ + + for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { + if (!lkb->lkb_callbacks[i].seq) + break; + memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], + sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); + (*resid)++; + } + + /* if cb is a bast, it should be skipped if the blocking mode is + compatible with the last granted mode */ + + if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { + if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { + cb->flags |= DLM_CB_SKIP; + + log_debug(ls, "skip %x bast %llu mode %d " + "for cast %llu mode %d", + lkb->lkb_id, + (unsigned long long)cb->seq, + cb->mode, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.mode); + return 0; + } + } + + if (cb->flags & DLM_CB_CAST) { + memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_cast_time = ktime_get(); + } + + if (cb->flags & DLM_CB_BAST) { + memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_bast_time = ktime_get(); + } + + return 0; +} + +void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) +{ + uint64_t seq; + int rv; + + spin_lock(&ast_queue_lock); + + seq = ++ast_seq_count; + + if (lkb->lkb_flags & DLM_IFL_USER) { + spin_unlock(&ast_queue_lock); + dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq); + return; + } + + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&ast_queue_lock); + return; + } + + if (list_empty(&lkb->lkb_astqueue)) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_astqueue, &ast_queue); + } + spin_unlock(&ast_queue_lock); + + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up_process(astd_task); +} + +static void process_asts(void) +{ + struct dlm_ls *ls = NULL; + struct dlm_rsb *r = NULL; + struct dlm_lkb *lkb; + void (*castfn) (void *astparam); + void (*bastfn) (void *astparam, int mode); + struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; + int i, rv, resid; + +repeat: + spin_lock(&ast_queue_lock); + list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { + r = lkb->lkb_resource; + ls = r->res_ls; 
+ + if (dlm_locking_stopped(ls)) + continue; + + /* we remove from astqueue list and remove everything in + lkb_callbacks before releasing the spinlock so empty + lkb_astqueue is always consistent with empty lkb_callbacks */ + + list_del_init(&lkb->lkb_astqueue); + + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + memset(&callbacks, 0, sizeof(callbacks)); + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); + if (rv < 0) + break; + } + spin_unlock(&ast_queue_lock); + + if (resid) { + /* shouldn't happen, for loop should have removed all */ + log_error(ls, "callback resid %d lkb %x", + resid, lkb->lkb_id); + } + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (!callbacks[i].seq) + break; + if (callbacks[i].flags & DLM_CB_SKIP) { + continue; + } else if (callbacks[i].flags & DLM_CB_BAST) { + bastfn(lkb->lkb_astparam, callbacks[i].mode); + } else if (callbacks[i].flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = callbacks[i].sb_status; + lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + castfn(lkb->lkb_astparam); + } + } + + /* removes ref for ast_queue, may cause lkb to be freed */ + dlm_put_lkb(lkb); + + cond_resched(); + goto repeat; + } + spin_unlock(&ast_queue_lock); +} + +static inline int no_asts(void) +{ + int ret; + + spin_lock(&ast_queue_lock); + ret = list_empty(&ast_queue); + spin_unlock(&ast_queue_lock); + return ret; +} + +static int dlm_astd(void *data) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(WAKE_ASTS, &astd_wakeflags)) + schedule(); + set_current_state(TASK_RUNNING); + + mutex_lock(&astd_running); + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) + process_asts(); + mutex_unlock(&astd_running); + } + return 0; +} + +void dlm_astd_wake(void) +{ + if (!no_asts()) { + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up_process(astd_task); + } +} + +int dlm_astd_start(void) +{ + struct task_struct *p; + int error = 0; + + INIT_LIST_HEAD(&ast_queue); + spin_lock_init(&ast_queue_lock); + mutex_init(&astd_running); + + p = kthread_run(dlm_astd, NULL, "dlm_astd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + astd_task = p; + return error; +} + +void dlm_astd_stop(void) +{ + kthread_stop(astd_task); +} + +void dlm_astd_suspend(void) +{ + mutex_lock(&astd_running); +} + +void dlm_astd_resume(void) +{ + mutex_unlock(&astd_running); +} + diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h new file mode 100644 index 00000000..8aa89c9b --- /dev/null +++ b/fs/dlm/ast.h @@ -0,0 +1,31 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __ASTD_DOT_H__ +#define __ASTD_DOT_H__ + +void dlm_del_ast(struct dlm_lkb *lkb); +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid); +void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); + +void dlm_astd_wake(void); +int dlm_astd_start(void); +void dlm_astd_stop(void); +void dlm_astd_suspend(void); +void dlm_astd_resume(void); + +#endif + diff --git a/fs/dlm/config.c b/fs/dlm/config.c new file mode 100644 index 00000000..9b026ea8 --- /dev/null +++ b/fs/dlm/config.c @@ -0,0 +1,1010 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "lowcomms.h" + +/* + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight + * /config/dlm/<cluster>/comms/<comm>/nodeid + * /config/dlm/<cluster>/comms/<comm>/local + * /config/dlm/<cluster>/comms/<comm>/addr + * The <cluster> level is useless, but I haven't figured out how to avoid it.
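 *
 * Editor's note (worked example, not part of the original patch): with a
 * cluster called "alpha", one lockspace "ls1" and two nodes, the tree that
 * userspace (normally the cluster's dlm control daemon) writes under
 * configfs would look like:
 *
 *	/config/dlm/alpha/spaces/ls1/nodes/1/nodeid	<- "1"
 *	/config/dlm/alpha/spaces/ls1/nodes/1/weight	<- "1" (the default)
 *	/config/dlm/alpha/spaces/ls1/nodes/2/nodeid	<- "2"
 *	/config/dlm/alpha/comms/1/nodeid		<- "1"
 *	/config/dlm/alpha/comms/1/local			<- "1" on the local node only
 *	/config/dlm/alpha/comms/1/addr			<- binary struct sockaddr_storage
 *
 * The names "alpha", "ls1" and the node numbers are made up for the example.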
+ */ + +static struct config_group *space_list; +static struct config_group *comm_list; +static struct dlm_comm *local_comm; + +struct dlm_clusters; +struct dlm_cluster; +struct dlm_spaces; +struct dlm_space; +struct dlm_comms; +struct dlm_comm; +struct dlm_nodes; +struct dlm_node; + +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); +static struct config_group *make_space(struct config_group *, const char *); +static void drop_space(struct config_group *, struct config_item *); +static void release_space(struct config_item *); +static struct config_item *make_comm(struct config_group *, const char *); +static void drop_comm(struct config_group *, struct config_item *); +static void release_comm(struct config_item *); +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); + +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); + +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len); +static ssize_t node_weight_read(struct dlm_node *nd, char *buf); +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len); + +struct dlm_cluster { + struct config_group group; + unsigned int cl_tcp_port; + unsigned int cl_buffer_size; + unsigned int cl_rsbtbl_size; + unsigned int cl_lkbtbl_size; + unsigned int cl_dirtbl_size; + unsigned int cl_recover_timer; + unsigned int cl_toss_secs; + unsigned int cl_scan_secs; + unsigned int cl_log_debug; + unsigned int cl_protocol; + unsigned int cl_timewarn_cs; + unsigned int cl_waitwarn_us; +}; + +enum { + CLUSTER_ATTR_TCP_PORT = 0, + CLUSTER_ATTR_BUFFER_SIZE, + CLUSTER_ATTR_RSBTBL_SIZE, + CLUSTER_ATTR_LKBTBL_SIZE, + CLUSTER_ATTR_DIRTBL_SIZE, + CLUSTER_ATTR_RECOVER_TIMER, + CLUSTER_ATTR_TOSS_SECS, + CLUSTER_ATTR_SCAN_SECS, + CLUSTER_ATTR_LOG_DEBUG, + CLUSTER_ATTR_PROTOCOL, + CLUSTER_ATTR_TIMEWARN_CS, + CLUSTER_ATTR_WAITWARN_US, +}; + +struct cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_cluster *, char *); + ssize_t (*store)(struct dlm_cluster *, const char *, size_t); +}; + +static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, + int *info_field, int check_zero, + const char *buf, size_t len) +{ + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return 
-EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + *cl_field = x; + *info_field = x; + + return len; +} + +#define CLUSTER_ATTR(name, check_zero) \ +static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \ +{ \ + return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ + check_zero, buf, len); \ +} \ +static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ +} \ +static struct cluster_attribute cluster_attr_##name = \ +__CONFIGFS_ATTR(name, 0644, name##_read, name##_write) + +CLUSTER_ATTR(tcp_port, 1); +CLUSTER_ATTR(buffer_size, 1); +CLUSTER_ATTR(rsbtbl_size, 1); +CLUSTER_ATTR(lkbtbl_size, 1); +CLUSTER_ATTR(dirtbl_size, 1); +CLUSTER_ATTR(recover_timer, 1); +CLUSTER_ATTR(toss_secs, 1); +CLUSTER_ATTR(scan_secs, 1); +CLUSTER_ATTR(log_debug, 0); +CLUSTER_ATTR(protocol, 0); +CLUSTER_ATTR(timewarn_cs, 1); +CLUSTER_ATTR(waitwarn_us, 0); + +static struct configfs_attribute *cluster_attrs[] = { + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, + [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr, + [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, + [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, + [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, + [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, + [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, + NULL, +}; + +enum { + COMM_ATTR_NODEID = 0, + COMM_ATTR_LOCAL, + COMM_ATTR_ADDR, +}; + +struct comm_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_comm *, char *); + ssize_t (*store)(struct dlm_comm *, const char *, size_t); +}; + +static struct comm_attribute comm_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_nodeid_read, + .store = comm_nodeid_write, +}; + +static struct comm_attribute comm_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_local_read, + .store = comm_local_write, +}; + +static struct comm_attribute comm_attr_addr = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr", + .ca_mode = S_IRUGO | S_IWUSR }, + .store = comm_addr_write, +}; + +static struct configfs_attribute *comm_attrs[] = { + [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, + [COMM_ATTR_LOCAL] = &comm_attr_local.attr, + [COMM_ATTR_ADDR] = &comm_attr_addr.attr, + NULL, +}; + +enum { + NODE_ATTR_NODEID = 0, + NODE_ATTR_WEIGHT, +}; + +struct node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_node *, char *); + ssize_t (*store)(struct dlm_node *, const char *, size_t); +}; + +static struct node_attribute node_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_nodeid_read, + .store = node_nodeid_write, +}; + +static struct node_attribute node_attr_weight = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "weight", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_weight_read, + .store = node_weight_write, +}; + +static struct configfs_attribute *node_attrs[] = { + 
[NODE_ATTR_NODEID] = &node_attr_nodeid.attr, + [NODE_ATTR_WEIGHT] = &node_attr_weight.attr, + NULL, +}; + +struct dlm_clusters { + struct configfs_subsystem subsys; +}; + +struct dlm_spaces { + struct config_group ss_group; +}; + +struct dlm_space { + struct config_group group; + struct list_head members; + struct mutex members_lock; + int members_count; +}; + +struct dlm_comms { + struct config_group cs_group; +}; + +struct dlm_comm { + struct config_item item; + int nodeid; + int local; + int addr_count; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +struct dlm_nodes { + struct config_group ns_group; +}; + +struct dlm_node { + struct config_item item; + struct list_head list; /* space->members */ + int nodeid; + int weight; + int new; +}; + +static struct configfs_group_operations clusters_ops = { + .make_group = make_cluster, + .drop_item = drop_cluster, +}; + +static struct configfs_item_operations cluster_ops = { + .release = release_cluster, + .show_attribute = show_cluster, + .store_attribute = store_cluster, +}; + +static struct configfs_group_operations spaces_ops = { + .make_group = make_space, + .drop_item = drop_space, +}; + +static struct configfs_item_operations space_ops = { + .release = release_space, +}; + +static struct configfs_group_operations comms_ops = { + .make_item = make_comm, + .drop_item = drop_comm, +}; + +static struct configfs_item_operations comm_ops = { + .release = release_comm, + .show_attribute = show_comm, + .store_attribute = store_comm, +}; + +static struct configfs_group_operations nodes_ops = { + .make_item = make_node, + .drop_item = drop_node, +}; + +static struct configfs_item_operations node_ops = { + .release = release_node, + .show_attribute = show_node, + .store_attribute = store_node, +}; + +static struct config_item_type clusters_type = { + .ct_group_ops = &clusters_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type cluster_type = { + .ct_item_ops = &cluster_ops, + .ct_attrs = cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type spaces_type = { + .ct_group_ops = &spaces_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type space_type = { + .ct_item_ops = &space_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comms_type = { + .ct_group_ops = &comms_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comm_type = { + .ct_item_ops = &comm_ops, + .ct_attrs = comm_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type nodes_type = { + .ct_group_ops = &nodes_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type node_type = { + .ct_item_ops = &node_ops, + .ct_attrs = node_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_cluster, group) : + NULL; +} + +static struct dlm_space *config_item_to_space(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_space, group) : + NULL; +} + +static struct dlm_comm *config_item_to_comm(struct config_item *i) +{ + return i ? container_of(i, struct dlm_comm, item) : NULL; +} + +static struct dlm_node *config_item_to_node(struct config_item *i) +{ + return i ? 
container_of(i, struct dlm_node, item) : NULL; +} + +static struct config_group *make_cluster(struct config_group *g, + const char *name) +{ + struct dlm_cluster *cl = NULL; + struct dlm_spaces *sps = NULL; + struct dlm_comms *cms = NULL; + void *gps = NULL; + + cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); + gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); + cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); + + if (!cl || !gps || !sps || !cms) + goto fail; + + config_group_init_type_name(&cl->group, name, &cluster_type); + config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); + config_group_init_type_name(&cms->cs_group, "comms", &comms_type); + + cl->group.default_groups = gps; + cl->group.default_groups[0] = &sps->ss_group; + cl->group.default_groups[1] = &cms->cs_group; + cl->group.default_groups[2] = NULL; + + cl->cl_tcp_port = dlm_config.ci_tcp_port; + cl->cl_buffer_size = dlm_config.ci_buffer_size; + cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; + cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size; + cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; + cl->cl_recover_timer = dlm_config.ci_recover_timer; + cl->cl_toss_secs = dlm_config.ci_toss_secs; + cl->cl_scan_secs = dlm_config.ci_scan_secs; + cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_protocol = dlm_config.ci_protocol; + cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; + cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; + + space_list = &sps->ss_group; + comm_list = &cms->cs_group; + return &cl->group; + + fail: + kfree(cl); + kfree(gps); + kfree(sps); + kfree(cms); + return ERR_PTR(-ENOMEM); +} + +static void drop_cluster(struct config_group *g, struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct config_item *tmp; + int j; + + for (j = 0; cl->group.default_groups[j]; j++) { + tmp = &cl->group.default_groups[j]->cg_item; + cl->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + space_list = NULL; + comm_list = NULL; + + config_item_put(i); +} + +static void release_cluster(struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + kfree(cl->group.default_groups); + kfree(cl); +} + +static struct config_group *make_space(struct config_group *g, const char *name) +{ + struct dlm_space *sp = NULL; + struct dlm_nodes *nds = NULL; + void *gps = NULL; + + sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); + gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); + + if (!sp || !gps || !nds) + goto fail; + + config_group_init_type_name(&sp->group, name, &space_type); + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); + + sp->group.default_groups = gps; + sp->group.default_groups[0] = &nds->ns_group; + sp->group.default_groups[1] = NULL; + + INIT_LIST_HEAD(&sp->members); + mutex_init(&sp->members_lock); + sp->members_count = 0; + return &sp->group; + + fail: + kfree(sp); + kfree(gps); + kfree(nds); + return ERR_PTR(-ENOMEM); +} + +static void drop_space(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(i); + struct config_item *tmp; + int j; + + /* assert list_empty(&sp->members) */ + + for (j = 0; sp->group.default_groups[j]; j++) { + tmp = &sp->group.default_groups[j]->cg_item; + sp->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + config_item_put(i); +} + +static void release_space(struct config_item *i) +{ + struct dlm_space *sp = 
config_item_to_space(i); + kfree(sp->group.default_groups); + kfree(sp); +} + +static struct config_item *make_comm(struct config_group *g, const char *name) +{ + struct dlm_comm *cm; + + cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); + if (!cm) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&cm->item, name, &comm_type); + cm->nodeid = -1; + cm->local = 0; + cm->addr_count = 0; + return &cm->item; +} + +static void drop_comm(struct config_group *g, struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + if (local_comm == cm) + local_comm = NULL; + dlm_lowcomms_close(cm->nodeid); + while (cm->addr_count--) + kfree(cm->addr[cm->addr_count]); + config_item_put(i); +} + +static void release_comm(struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + kfree(cm); +} + +static struct config_item *make_node(struct config_group *g, const char *name) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd; + + nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); + if (!nd) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&nd->item, name, &node_type); + nd->nodeid = -1; + nd->weight = 1; /* default weight of 1 if none is set */ + nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ + + mutex_lock(&sp->members_lock); + list_add(&nd->list, &sp->members); + sp->members_count++; + mutex_unlock(&sp->members_lock); + + return &nd->item; +} + +static void drop_node(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd = config_item_to_node(i); + + mutex_lock(&sp->members_lock); + list_del(&nd->list); + sp->members_count--; + mutex_unlock(&sp->members_lock); + + config_item_put(i); +} + +static void release_node(struct config_item *i) +{ + struct dlm_node *nd = config_item_to_node(i); + kfree(nd); +} + +static struct dlm_clusters clusters_root = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "dlm", + .ci_type = &clusters_type, + }, + }, + }, +}; + +int __init dlm_config_init(void) +{ + config_group_init(&clusters_root.subsys.su_group); + mutex_init(&clusters_root.subsys.su_mutex); + return configfs_register_subsystem(&clusters_root.subsys); +} + +void dlm_config_exit(void) +{ + configfs_unregister_subsystem(&clusters_root.subsys); +} + +/* + * Functions for user space to read/write attributes + */ + +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->show ? cla->show(cl, buf) : 0; +} + +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->store ? cla->store(cl, buf, len) : -EINVAL; +} + +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_comm *cm = config_item_to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->show ? 
cma->show(cm, buf) : 0; +} + +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_comm *cm = config_item_to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->store ? cma->store(cm, buf, len) : -EINVAL; +} + +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->nodeid); +} + +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len) +{ + cm->nodeid = simple_strtol(buf, NULL, 0); + return len; +} + +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->local); +} + +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len) +{ + cm->local= simple_strtol(buf, NULL, 0); + if (cm->local && !local_comm) + local_comm = cm; + return len; +} + +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) +{ + struct sockaddr_storage *addr; + + if (len != sizeof(struct sockaddr_storage)) + return -EINVAL; + + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) + return -ENOSPC; + + addr = kzalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + return -ENOMEM; + + memcpy(addr, buf, len); + cm->addr[cm->addr_count++] = addr; + return len; +} + +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_node *nd = config_item_to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->show ? nda->show(nd, buf) : 0; +} + +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_node *nd = config_item_to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->store ? 
nda->store(nd, buf, len) : -EINVAL; +} + +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->nodeid); +} + +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len) +{ + nd->nodeid = simple_strtol(buf, NULL, 0); + return len; +} + +static ssize_t node_weight_read(struct dlm_node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->weight); +} + +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len) +{ + nd->weight = simple_strtol(buf, NULL, 0); + return len; +} + +/* + * Functions for the dlm to get the info that's been configured + */ + +static struct dlm_space *get_space(char *name) +{ + struct config_item *i; + + if (!space_list) + return NULL; + + mutex_lock(&space_list->cg_subsys->su_mutex); + i = config_group_find_item(space_list, name); + mutex_unlock(&space_list->cg_subsys->su_mutex); + + return config_item_to_space(i); +} + +static void put_space(struct dlm_space *sp) +{ + config_item_put(&sp->group.cg_item); +} + +static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) +{ + struct config_item *i; + struct dlm_comm *cm = NULL; + int found = 0; + + if (!comm_list) + return NULL; + + mutex_lock(&clusters_root.subsys.su_mutex); + + list_for_each_entry(i, &comm_list->cg_children, ci_entry) { + cm = config_item_to_comm(i); + + if (nodeid) { + if (cm->nodeid != nodeid) + continue; + found = 1; + config_item_get(i); + break; + } else { + if (!cm->addr_count || !addr_compare(cm->addr[0], addr)) + continue; + found = 1; + config_item_get(i); + break; + } + } + mutex_unlock(&clusters_root.subsys.su_mutex); + + if (!found) + cm = NULL; + return cm; +} + +static void put_comm(struct dlm_comm *cm) +{ + config_item_put(&cm->item); +} + +/* caller must free mem */ +int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, + int **new_out, int *new_count_out) +{ + struct dlm_space *sp; + struct dlm_node *nd; + int i = 0, rv = 0, ids_count = 0, new_count = 0; + int *ids, *new; + + sp = get_space(lsname); + if (!sp) + return -EEXIST; + + mutex_lock(&sp->members_lock); + if (!sp->members_count) { + rv = -EINVAL; + printk(KERN_ERR "dlm: zero members_count\n"); + goto out; + } + + ids_count = sp->members_count; + + ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); + if (!ids) { + rv = -ENOMEM; + goto out; + } + + list_for_each_entry(nd, &sp->members, list) { + ids[i++] = nd->nodeid; + if (nd->new) + new_count++; + } + + if (ids_count != i) + printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i); + + if (!new_count) + goto out_ids; + + new = kcalloc(new_count, sizeof(int), GFP_NOFS); + if (!new) { + kfree(ids); + rv = -ENOMEM; + goto out; + } + + i = 0; + list_for_each_entry(nd, &sp->members, list) { + if (nd->new) { + new[i++] = nd->nodeid; + nd->new = 0; + } + } + *new_count_out = 
new_count; + *new_out = new; + + out_ids: + *ids_count_out = ids_count; + *ids_out = ids; + out: + mutex_unlock(&sp->members_lock); + put_space(sp); + return rv; +} + +int dlm_node_weight(char *lsname, int nodeid) +{ + struct dlm_space *sp; + struct dlm_node *nd; + int w = -EEXIST; + + sp = get_space(lsname); + if (!sp) + goto out; + + mutex_lock(&sp->members_lock); + list_for_each_entry(nd, &sp->members, list) { + if (nd->nodeid != nodeid) + continue; + w = nd->weight; + break; + } + mutex_unlock(&sp->members_lock); + put_space(sp); + out: + return w; +} + +int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) +{ + struct dlm_comm *cm = get_comm(nodeid, NULL); + if (!cm) + return -EEXIST; + if (!cm->addr_count) + return -ENOENT; + memcpy(addr, cm->addr[0], sizeof(*addr)); + put_comm(cm); + return 0; +} + +int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +{ + struct dlm_comm *cm = get_comm(0, addr); + if (!cm) + return -EEXIST; + *nodeid = cm->nodeid; + put_comm(cm); + return 0; +} + +int dlm_our_nodeid(void) +{ + return local_comm ? local_comm->nodeid : 0; +} + +/* num 0 is first addr, num 1 is second addr */ +int dlm_our_addr(struct sockaddr_storage *addr, int num) +{ + if (!local_comm) + return -1; + if (num + 1 > local_comm->addr_count) + return -1; + memcpy(addr, local_comm->addr[num], sizeof(*addr)); + return 0; +} + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_BUFFER_SIZE 4096 +#define DEFAULT_RSBTBL_SIZE 1024 +#define DEFAULT_LKBTBL_SIZE 1024 +#define DEFAULT_DIRTBL_SIZE 1024 +#define DEFAULT_RECOVER_TIMER 5 +#define DEFAULT_TOSS_SECS 10 +#define DEFAULT_SCAN_SECS 5 +#define DEFAULT_LOG_DEBUG 0 +#define DEFAULT_PROTOCOL 0 +#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ +#define DEFAULT_WAITWARN_US 0 + +struct dlm_config_info dlm_config = { + .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, + .ci_recover_timer = DEFAULT_RECOVER_TIMER, + .ci_toss_secs = DEFAULT_TOSS_SECS, + .ci_scan_secs = DEFAULT_SCAN_SECS, + .ci_log_debug = DEFAULT_LOG_DEBUG, + .ci_protocol = DEFAULT_PROTOCOL, + .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, + .ci_waitwarn_us = DEFAULT_WAITWARN_US +}; + diff --git a/fs/dlm/config.h b/fs/dlm/config.h new file mode 100644 index 00000000..dd0ce24d --- /dev/null +++ b/fs/dlm/config.h @@ -0,0 +1,47 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __CONFIG_DOT_H__ +#define __CONFIG_DOT_H__ + +#define DLM_MAX_ADDR_COUNT 3 + +struct dlm_config_info { + int ci_tcp_port; + int ci_buffer_size; + int ci_rsbtbl_size; + int ci_lkbtbl_size; + int ci_dirtbl_size; + int ci_recover_timer; + int ci_toss_secs; + int ci_scan_secs; + int ci_log_debug; + int ci_protocol; + int ci_timewarn_cs; + int ci_waitwarn_us; +}; + +extern struct dlm_config_info dlm_config; + +int dlm_config_init(void); +void dlm_config_exit(void); +int dlm_node_weight(char *lsname, int nodeid); +int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, + int **new_out, int *new_count_out); +int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); +int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); +int dlm_our_nodeid(void); +int dlm_our_addr(struct sockaddr_storage *addr, int num); + +#endif /* __CONFIG_DOT_H__ */ + diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c new file mode 100644 index 00000000..59779237 --- /dev/null +++ b/fs/dlm/debug_fs.c @@ -0,0 +1,731 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lock.h" + +#define DLM_DEBUG_BUF_LEN 4096 +static char debug_buf[DLM_DEBUG_BUF_LEN]; +static struct mutex debug_buf_lock; + +static struct dentry *dlm_root; + +static char *print_lockmode(int mode) +{ + switch (mode) { + case DLM_LOCK_IV: + return "--"; + case DLM_LOCK_NL: + return "NL"; + case DLM_LOCK_CR: + return "CR"; + case DLM_LOCK_CW: + return "CW"; + case DLM_LOCK_PR: + return "PR"; + case DLM_LOCK_PW: + return "PW"; + case DLM_LOCK_EX: + return "EX"; + default: + return "??"; + } +} + +static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) +{ + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); + + if (lkb->lkb_status == DLM_LKSTS_CONVERT || + lkb->lkb_status == DLM_LKSTS_WAITING) + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); + + if (lkb->lkb_nodeid) { + if (lkb->lkb_nodeid != res->res_nodeid) + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, + lkb->lkb_remid); + else + seq_printf(s, " Master: %08x", lkb->lkb_remid); + } + + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + + return seq_printf(s, "\n"); +} + +static int print_format1(struct dlm_rsb *res, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; + int rv; + + lock_rsb(res); + + rv = seq_printf(s, "\nResource %p Name (len=%d) \"", + res, res->res_length); + if (rv) + goto out; + + for (i = 0; i < res->res_length; i++) { + if (isprint(res->res_name[i])) + seq_printf(s, "%c", res->res_name[i]); + else + seq_printf(s, "%c", '.'); + } + + if (res->res_nodeid > 0) + rv = seq_printf(s, "\" \nLocal 
Copy, Master is node %d\n", + res->res_nodeid); + else if (res->res_nodeid == 0) + rv = seq_printf(s, "\" \nMaster Copy\n"); + else if (res->res_nodeid == -1) + rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n", + res->res_first_lkid); + else + rv = seq_printf(s, "\" \nInvalid master %d\n", + res->res_nodeid); + if (rv) + goto out; + + /* Print the LVB: */ + if (res->res_lvbptr) { + seq_printf(s, "LVB: "); + for (i = 0; i < lvblen; i++) { + if (i == lvblen / 2) + seq_printf(s, "\n "); + seq_printf(s, "%02x ", + (unsigned char) res->res_lvbptr[i]); + } + if (rsb_flag(res, RSB_VALNOTVALID)) + seq_printf(s, " (INVALID)"); + rv = seq_printf(s, "\n"); + if (rv) + goto out; + } + + root_list = !list_empty(&res->res_root_list); + recover_list = !list_empty(&res->res_recover_list); + + if (root_list || recover_list) { + rv = seq_printf(s, "Recovery: root %d recover %d flags %lx " + "count %d\n", root_list, recover_list, + res->res_flags, res->res_recover_locks_count); + if (rv) + goto out; + } + + /* Print the locks attached to this resource */ + seq_printf(s, "Granted Queue\n"); + list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { + rv = print_format1_lock(s, lkb, res); + if (rv) + goto out; + } + + seq_printf(s, "Conversion Queue\n"); + list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { + rv = print_format1_lock(s, lkb, res); + if (rv) + goto out; + } + + seq_printf(s, "Waiting Queue\n"); + list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { + rv = print_format1_lock(s, lkb, res); + if (rv) + goto out; + } + + if (list_empty(&res->res_lookup)) + goto out; + + seq_printf(s, "Lookup Queue\n"); + list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { + rv = seq_printf(s, "%08x %s", lkb->lkb_id, + print_lockmode(lkb->lkb_rqmode)); + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + rv = seq_printf(s, "\n"); + } + out: + unlock_rsb(res); + return rv; +} + +static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) +{ + u64 xid = 0; + u64 us; + int rv; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + /* microseconds since lkb was added to current queue */ + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); + + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us + r_nodeid r_len r_name */ + + rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + (unsigned long long)us, + r->res_nodeid, + r->res_length, + r->res_name); + return rv; +} + +static int print_format2(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int rv = 0; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + rv = print_format2_lock(s, lkb, r); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + rv = print_format2_lock(s, lkb, r); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + rv = print_format2_lock(s, lkb, r); + if (rv) + goto out; + } + out: + unlock_rsb(r); + return rv; +} + +static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, + int rsb_lookup) +{ + u64 xid = 0; + int rv; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + rv = 
seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_last_bast.mode, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); + return rv; +} + +static int print_format3(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = r->res_ls->ls_lvblen; + int print_name = 1; + int rv; + + lock_rsb(r); + + rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + if (rv) + goto out; + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_printf(s, "%s", print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + rv = seq_printf(s, "\n"); + if (rv) + goto out; + + if (!r->res_lvbptr) + goto do_locks; + + seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); + + for (i = 0; i < lvblen; i++) + seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); + rv = seq_printf(s, "\n"); + if (rv) + goto out; + + do_locks: + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + rv = print_format3_lock(s, lkb, 0); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + rv = print_format3_lock(s, lkb, 0); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + rv = print_format3_lock(s, lkb, 0); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) { + rv = print_format3_lock(s, lkb, 1); + if (rv) + goto out; + } + out: + unlock_rsb(r); + return rv; +} + +struct rsbtbl_iter { + struct dlm_rsb *rsb; + unsigned bucket; + int format; + int header; +}; + +/* seq_printf returns -1 if the buffer is full, and 0 otherwise. + If the buffer is full, seq_printf can be called again, but it + does nothing and just returns -1. So, the these printing routines + periodically check the return value to avoid wasting too much time + trying to print to a full buffer. 
*/ + +static int table_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + int rv = 0; + + switch (ri->format) { + case 1: + rv = print_format1(ri->rsb, seq); + break; + case 2: + if (ri->header) { + seq_printf(seq, "id nodeid remid pid xid exflags " + "flags sts grmode rqmode time_ms " + "r_nodeid r_len r_name\n"); + ri->header = 0; + } + rv = print_format2(ri->rsb, seq); + break; + case 3: + if (ri->header) { + seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + ri->header = 0; + } + rv = print_format3(ri->rsb, seq); + break; + } + + return rv; +} + +static const struct seq_operations format1_seq_ops; +static const struct seq_operations format2_seq_ops; +static const struct seq_operations format3_seq_ops; + +static void *table_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri; + struct dlm_rsb *r; + loff_t n = *pos; + unsigned bucket, entry; + + bucket = n >> 32; + entry = n & ((1LL << 32) - 1); + + if (bucket >= ls->ls_rsbtbl_size) + return NULL; + + ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); + if (!ri) + return NULL; + if (n == 0) + ri->header = 1; + if (seq->op == &format1_seq_ops) + ri->format = 1; + if (seq->op == &format2_seq_ops) + ri->format = 2; + if (seq->op == &format3_seq_ops) + ri->format = 3; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { + list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, + res_hashchain) { + if (!entry--) { + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return ri; + } + } + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { + r = list_first_entry(&ls->ls_rsbtbl[bucket].list, + struct dlm_rsb, res_hashchain); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) +{ + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri = iter_ptr; + struct list_head *next; + struct dlm_rsb *r, *rp; + loff_t n = *pos; + unsigned bucket; + + bucket = n >> 32; + + /* + * move to the next rsb in the same bucket + */ + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + rp = ri->rsb; + next = rp->res_hashchain.next; + + if (next != &ls->ls_rsbtbl[bucket].list) { + r = list_entry(next, struct dlm_rsb, res_hashchain); + dlm_hold_rsb(r); + ri->rsb = r; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + ++*pos; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { + r = list_first_entry(&ls->ls_rsbtbl[bucket].list, + struct dlm_rsb, res_hashchain); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + 
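A note on the iterators above: the seq_file position is a single loff_t that packs the rsb table bucket into the upper 32 bits and the entry index within that bucket into the lower 32 bits, which is why the bucket-advance loops add 1LL << 32 and clear the low half. A tiny sketch of that encoding; the helper names are illustrative only and do not appear in the patch:

/* Illustrative helpers showing how table_seq_start()/table_seq_next()
 * interpret *pos; not part of the patch. */
static inline loff_t rsbtbl_pos(unsigned bucket, unsigned entry)
{
        return ((loff_t)bucket << 32) | entry;
}

static inline void rsbtbl_pos_decode(loff_t pos, unsigned *bucket, unsigned *entry)
{
        *bucket = pos >> 32;
        *entry = pos & ((1LL << 32) - 1);
}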
spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void table_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + + if (ri) { + dlm_put_rsb(ri->rsb); + kfree(ri); + } +} + +static const struct seq_operations format1_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format2_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format3_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct file_operations format1_fops; +static const struct file_operations format2_fops; +static const struct file_operations format3_fops; + +static int table_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret = -1; + + if (file->f_op == &format1_fops) + ret = seq_open(file, &format1_seq_ops); + else if (file->f_op == &format2_fops) + ret = seq_open(file, &format2_seq_ops); + else if (file->f_op == &format3_fops) + ret = seq_open(file, &format3_seq_ops); + + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static const struct file_operations format1_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format2_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format3_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * dump lkb's on the ls_waiters list + */ + +static int waiters_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t waiters_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + struct dlm_lkb *lkb; + size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv; + + mutex_lock(&debug_buf_lock); + mutex_lock(&ls->ls_waiters_mutex); + memset(debug_buf, 0, sizeof(debug_buf)); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n", + lkb->lkb_id, lkb->lkb_wait_type, + lkb->lkb_nodeid, lkb->lkb_resource->res_name); + if (ret >= len - pos) + break; + pos += ret; + } + mutex_unlock(&ls->ls_waiters_mutex); + + rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos); + mutex_unlock(&debug_buf_lock); + return rv; +} + +static const struct file_operations waiters_fops = { + .owner = THIS_MODULE, + .open = waiters_open, + .read = waiters_read, + .llseek = default_llseek, +}; + +void dlm_delete_debug_file(struct dlm_ls *ls) +{ + if (ls->ls_debug_rsb_dentry) + debugfs_remove(ls->ls_debug_rsb_dentry); + if (ls->ls_debug_waiters_dentry) + debugfs_remove(ls->ls_debug_waiters_dentry); + if (ls->ls_debug_locks_dentry) + debugfs_remove(ls->ls_debug_locks_dentry); + if (ls->ls_debug_all_dentry) + debugfs_remove(ls->ls_debug_all_dentry); +} + +int dlm_create_debug_file(struct dlm_ls *ls) +{ + char name[DLM_LOCKSPACE_LEN+8]; + + /* format 1 */ + + ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format1_fops); + if 
(!ls->ls_debug_rsb_dentry) + goto fail; + + /* format 2 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + + ls->ls_debug_locks_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format2_fops); + if (!ls->ls_debug_locks_dentry) + goto fail; + + /* format 3 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + + ls->ls_debug_all_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format3_fops); + if (!ls->ls_debug_all_dentry) + goto fail; + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + + ls->ls_debug_waiters_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &waiters_fops); + if (!ls->ls_debug_waiters_dentry) + goto fail; + + return 0; + + fail: + dlm_delete_debug_file(ls); + return -ENOMEM; +} + +int __init dlm_register_debugfs(void) +{ + mutex_init(&debug_buf_lock); + dlm_root = debugfs_create_dir("dlm", NULL); + return dlm_root ? 0 : -ENOMEM; +} + +void dlm_unregister_debugfs(void) +{ + debugfs_remove(dlm_root); +} + diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c new file mode 100644 index 00000000..7b84c1db --- /dev/null +++ b/fs/dlm/dir.c @@ -0,0 +1,441 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "rcom.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "util.h" +#include "lock.h" +#include "dir.h" + + +static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de) +{ + spin_lock(&ls->ls_recover_list_lock); + list_add(&de->list, &ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); +} + +static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) +{ + int found = 0; + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry(de, &ls->ls_recover_list, list) { + if (de->length == len) { + list_del(&de->list); + de->master_nodeid = 0; + memset(de->name, 0, len); + found = 1; + break; + } + } + spin_unlock(&ls->ls_recover_list_lock); + + if (!found) + de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS); + return de; +} + +void dlm_clear_free_entries(struct dlm_ls *ls) +{ + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + while (!list_empty(&ls->ls_recover_list)) { + de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, + list); + list_del(&de->list); + kfree(de); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +/* + * We use the upper 16 bits of the hash value to select the directory node. + * Low bits are used for distribution of rsb's among hash buckets on each node. + * + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of + * num_nodes to the hash value. 
This value in the desired range is used as an + * offset into the sorted list of nodeid's to give the particular nodeid. + */ + +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) +{ + struct list_head *tmp; + struct dlm_member *memb = NULL; + uint32_t node, n = 0; + int nodeid; + + if (ls->ls_num_nodes == 1) { + nodeid = dlm_our_nodeid(); + goto out; + } + + if (ls->ls_node_array) { + node = (hash >> 16) % ls->ls_total_weight; + nodeid = ls->ls_node_array[node]; + goto out; + } + + /* make_member_array() failed to kmalloc ls_node_array... */ + + node = (hash >> 16) % ls->ls_num_nodes; + + list_for_each(tmp, &ls->ls_nodes) { + if (n++ != node) + continue; + memb = list_entry(tmp, struct dlm_member, list); + break; + } + + DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n", + ls->ls_num_nodes, n, node);); + nodeid = memb->nodeid; + out: + return nodeid; +} + +int dlm_dir_nodeid(struct dlm_rsb *r) +{ + return dlm_hash2nodeid(r->res_ls, r->res_hash); +} + +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) +{ + uint32_t val; + + val = jhash(name, len, 0); + val &= (ls->ls_dirtbl_size - 1); + + return val; +} + +static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de) +{ + uint32_t bucket; + + bucket = dir_hash(ls, de->name, de->length); + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); +} + +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name, + int namelen, uint32_t bucket) +{ + struct dlm_direntry *de; + + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) { + if (de->length == namelen && !memcmp(name, de->name, namelen)) + goto out; + } + de = NULL; + out: + return de; +} + +void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen) +{ + struct dlm_direntry *de; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + spin_lock(&ls->ls_dirtbl[bucket].lock); + + de = search_bucket(ls, name, namelen, bucket); + + if (!de) { + log_error(ls, "remove fr %u none", nodeid); + goto out; + } + + if (de->master_nodeid != nodeid) { + log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid); + goto out; + } + + list_del(&de->list); + kfree(de); + out: + spin_unlock(&ls->ls_dirtbl[bucket].lock); +} + +void dlm_dir_clear(struct dlm_ls *ls) +{ + struct list_head *head; + struct dlm_direntry *de; + int i; + + DLM_ASSERT(list_empty(&ls->ls_recover_list), ); + + for (i = 0; i < ls->ls_dirtbl_size; i++) { + spin_lock(&ls->ls_dirtbl[i].lock); + head = &ls->ls_dirtbl[i].list; + while (!list_empty(head)) { + de = list_entry(head->next, struct dlm_direntry, list); + list_del(&de->list); + put_free_de(ls, de); + } + spin_unlock(&ls->ls_dirtbl[i].lock); + } +} + +int dlm_recover_directory(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_direntry *de; + char *b, *last_name = NULL; + int error = -ENOMEM, last_len, count = 0; + uint16_t namelen; + + log_debug(ls, "dlm_recover_directory"); + + if (dlm_no_directory(ls)) + goto out_status; + + dlm_dir_clear(ls); + + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); + if (!last_name) + goto out; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memset(last_name, 0, DLM_RESNAME_MAXLEN); + last_len = 0; + + for (;;) { + int left; + error = dlm_recovery_stopped(ls); + if (error) + goto out_free; + + error = dlm_rcom_names(ls, memb->nodeid, + last_name, last_len); + if (error) + goto out_free; + + schedule(); + + /* + * pick namelen/name pairs out of received buffer + */ + + b = ls->ls_recover_buf->rc_buf; + left = 
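To make the directory-node selection in dlm_hash2nodeid() concrete, here is the arithmetic worked through once. The hash value and member count are made-up numbers, and the example follows the fallback path that walks ls_nodes when the weighted ls_node_array is unavailable:

/*
 * Example (illustrative values only):
 *   res_hash        = 0xABCD1234
 *   res_hash >> 16  = 0xABCD = 43981
 *   ls_num_nodes    = 3
 *   43981 % 3       = 1
 * so the rsb's directory node is the member at offset 1 in the sorted
 * list of nodeids, i.e. the second-lowest nodeid in the lockspace.
 */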
ls->ls_recover_buf->rc_header.h_length; + left -= sizeof(struct dlm_rcom); + + for (;;) { + __be16 v; + + error = -EINVAL; + if (left < sizeof(__be16)) + goto out_free; + + memcpy(&v, b, sizeof(__be16)); + namelen = be16_to_cpu(v); + b += sizeof(__be16); + left -= sizeof(__be16); + + /* namelen of 0xFFFFF marks end of names for + this node; namelen of 0 marks end of the + buffer */ + + if (namelen == 0xFFFF) + goto done; + if (!namelen) + break; + + if (namelen > left) + goto out_free; + + if (namelen > DLM_RESNAME_MAXLEN) + goto out_free; + + error = -ENOMEM; + de = get_free_de(ls, namelen); + if (!de) + goto out_free; + + de->master_nodeid = memb->nodeid; + de->length = namelen; + last_len = namelen; + memcpy(de->name, b, namelen); + memcpy(last_name, b, namelen); + b += namelen; + left -= namelen; + + add_entry_to_hash(ls, de); + count++; + } + } + done: + ; + } + + out_status: + error = 0; + dlm_set_recover_status(ls, DLM_RS_DIR); + log_debug(ls, "dlm_recover_directory %d entries", count); + out_free: + kfree(last_name); + out: + dlm_clear_free_entries(ls); + return error; +} + +static int get_entry(struct dlm_ls *ls, int nodeid, char *name, + int namelen, int *r_nodeid) +{ + struct dlm_direntry *de, *tmp; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + spin_lock(&ls->ls_dirtbl[bucket].lock); + de = search_bucket(ls, name, namelen, bucket); + if (de) { + *r_nodeid = de->master_nodeid; + spin_unlock(&ls->ls_dirtbl[bucket].lock); + if (*r_nodeid == nodeid) + return -EEXIST; + return 0; + } + + spin_unlock(&ls->ls_dirtbl[bucket].lock); + + if (namelen > DLM_RESNAME_MAXLEN) + return -EINVAL; + + de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS); + if (!de) + return -ENOMEM; + + de->master_nodeid = nodeid; + de->length = namelen; + memcpy(de->name, name, namelen); + + spin_lock(&ls->ls_dirtbl[bucket].lock); + tmp = search_bucket(ls, name, namelen, bucket); + if (tmp) { + kfree(de); + de = tmp; + } else { + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); + } + *r_nodeid = de->master_nodeid; + spin_unlock(&ls->ls_dirtbl[bucket].lock); + return 0; +} + +int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, + int *r_nodeid) +{ + return get_entry(ls, nodeid, name, namelen, r_nodeid); +} + +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) { + up_read(&ls->ls_root_sem); + return r; + } + } + up_read(&ls->ls_root_sem); + return NULL; +} + +/* Find the rsb where we left off (or start again), then send rsb names + for rsb's we're master of and whose directory node matches the requesting + node. 
inbuf is the rsb name last sent, inlen is the name's length */ + +void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid) +{ + struct list_head *list; + struct dlm_rsb *r; + int offset = 0, dir_nodeid; + __be16 be_namelen; + + down_read(&ls->ls_root_sem); + + if (inlen > 1) { + r = find_rsb_root(ls, inbuf, inlen); + if (!r) { + inbuf[inlen - 1] = '\0'; + log_error(ls, "copy_master_names from %d start %d %s", + nodeid, inlen, inbuf); + goto out; + } + list = r->res_root_list.next; + } else { + list = ls->ls_root_list.next; + } + + for (offset = 0; list != &ls->ls_root_list; list = list->next) { + r = list_entry(list, struct dlm_rsb, res_root_list); + if (r->res_nodeid) + continue; + + dir_nodeid = dlm_dir_nodeid(r); + if (dir_nodeid != nodeid) + continue; + + /* + * The block ends when we can't fit the following in the + * remaining buffer space: + * namelen (uint16_t) + + * name (r->res_length) + + * end-of-block record 0x0000 (uint16_t) + */ + + if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { + /* Write end-of-block record */ + be_namelen = cpu_to_be16(0); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + goto out; + } + + be_namelen = cpu_to_be16(r->res_length); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + memcpy(outbuf + offset, r->res_name, r->res_length); + offset += r->res_length; + } + + /* + * If we've reached the end of the list (and there's room) write a + * terminating record. + */ + + if ((list == &ls->ls_root_list) && + (offset + sizeof(uint16_t) <= outlen)) { + be_namelen = cpu_to_be16(0xFFFF); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + } + + out: + up_read(&ls->ls_root_sem); +} + diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h new file mode 100644 index 00000000..0b0eb126 --- /dev/null +++ b/fs/dlm/dir.h @@ -0,0 +1,30 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
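dlm_copy_master_names() above and the parsing loop in dlm_recover_directory() agree on a simple wire format: each record is a big-endian 16-bit length followed by that many bytes of resource name, a zero length ends the current buffer (the requester then asks for the next chunk), and a length of 0xFFFF ends the whole listing for that node. A compact sketch of a reader for that stream under those rules; example_walk_names is an illustrative name, not code from the patch:

/* Illustrative reader for the namelen/name stream built by
 * dlm_copy_master_names(): 0x0000 = end of this buffer, 0xFFFF = end of
 * all names for the node. */
static int example_walk_names(char *buf, int left)
{
        __be16 v;
        uint16_t namelen;
        int count = 0;

        while (left >= (int)sizeof(__be16)) {
                memcpy(&v, buf, sizeof(__be16));
                namelen = be16_to_cpu(v);
                buf += sizeof(__be16);
                left -= sizeof(__be16);

                if (namelen == 0xFFFF)          /* listing complete */
                        break;
                if (!namelen)                   /* buffer exhausted, ask again */
                        break;
                if (namelen > left || namelen > DLM_RESNAME_MAXLEN)
                        return -EINVAL;

                /* namelen bytes of resource name follow */
                buf += namelen;
                left -= namelen;
                count++;
        }
        return count;
}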
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + + +int dlm_dir_nodeid(struct dlm_rsb *rsb); +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); +void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len); +void dlm_dir_clear(struct dlm_ls *ls); +void dlm_clear_free_entries(struct dlm_ls *ls); +int dlm_recover_directory(struct dlm_ls *ls); +int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, + int *r_nodeid); +void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); + +#endif /* __DIR_DOT_H__ */ + diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h new file mode 100644 index 00000000..0262451e --- /dev/null +++ b/fs/dlm/dlm_internal.h @@ -0,0 +1,616 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_INTERNAL_DOT_H__ +#define __DLM_INTERNAL_DOT_H__ + +/* + * This is the main header file to be included in each DLM source file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "config.h" + +/* Size of the temp buffer midcomms allocates on the stack. + We try to make this large enough so most messages fit. + FIXME: should sctp make this unnecessary? */ + +#define DLM_INBUF_LEN 148 + +struct dlm_ls; +struct dlm_lkb; +struct dlm_rsb; +struct dlm_member; +struct dlm_lkbtable; +struct dlm_rsbtable; +struct dlm_dirtable; +struct dlm_direntry; +struct dlm_recover; +struct dlm_header; +struct dlm_message; +struct dlm_rcom; +struct dlm_mhandle; + +#define log_print(fmt, args...) \ + printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_error(ls, fmt, args...) \ + printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) + +#define log_debug(ls, fmt, args...) 
\ +do { \ + if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define DLM_ASSERT(x, do) \ +{ \ + if (!(x)) \ + { \ + printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \ + "DLM: assertion: \"%s\"\n" \ + "DLM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + BUG(); \ + panic("DLM: Record message above and reboot.\n"); \ + } \ +} + + +struct dlm_direntry { + struct list_head list; + uint32_t master_nodeid; + uint16_t length; + char name[1]; +}; + +struct dlm_dirtable { + struct list_head list; + spinlock_t lock; +}; + +struct dlm_rsbtable { + struct list_head list; + struct list_head toss; + spinlock_t lock; +}; + +struct dlm_lkbtable { + struct list_head list; + rwlock_t lock; + uint16_t counter; +}; + +/* + * Lockspace member (per node in a ls) + */ + +struct dlm_member { + struct list_head list; + int nodeid; + int weight; +}; + +/* + * Save and manage recovery state for a lockspace. + */ + +struct dlm_recover { + struct list_head list; + int *nodeids; /* nodeids of all members */ + int node_count; + int *new; /* nodeids of new members */ + int new_count; + uint64_t seq; +}; + +/* + * Pass input args to second stage locking function. + */ + +struct dlm_args { + uint32_t flags; + void (*astfn) (void *astparam); + void *astparam; + void (*bastfn) (void *astparam, int mode); + int mode; + struct dlm_lksb *lksb; + unsigned long timeout; +}; + + +/* + * Lock block + * + * A lock can be one of three types: + * + * local copy lock is mastered locally + * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set) + * process copy lock is mastered on a remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set) + * master copy master node's copy of a lock owned by remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set) + * + * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or + * dlm_unlock. The dlm does not modify these or use any private flags in + * this field; it only contains DLM_LKF_ flags from dlm.h. These flags + * are sent as-is to the remote master when the lock is remote. + * + * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h. + * Some internal flags are shared between the master and process nodes; + * these shared flags are kept in the lower two bytes. One of these + * flags set on the master copy will be propagated to the process copy + * and v.v. Other internal flags are private to the master or process + * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes. + * + * lkb_sbflags: status block flags. These flags are copied directly into + * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion + * ast. All defined in dlm.h with DLM_SBF_ prefix. + * + * lkb_status: the lock status indicates which rsb queue the lock is + * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT + * + * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a + * reply is needed. Only set when the lkb is on the lockspace waiters + * list awaiting a reply from a remote node. + * + * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb + * is a master copy, nodeid specifies the remote lock holder, when the + * lkb is a process copy, the nodeid specifies the lock master. 
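Put more compactly, the three lkb types above are distinguished by just two fields: a local copy has lkb_nodeid == 0 with the MSTCPY flag clear, a process copy has a non-zero lkb_nodeid with MSTCPY clear, and a master copy has a non-zero lkb_nodeid with MSTCPY set; is_process_copy() and is_master_copy() in lock.c below encode exactly these tests using DLM_IFL_MSTCPY.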
+ */ + +/* lkb_status */ + +#define DLM_LKSTS_WAITING 1 +#define DLM_LKSTS_GRANTED 2 +#define DLM_LKSTS_CONVERT 3 + +/* lkb_flags */ + +#define DLM_IFL_MSTCPY 0x00010000 +#define DLM_IFL_RESEND 0x00020000 +#define DLM_IFL_DEAD 0x00040000 +#define DLM_IFL_OVERLAP_UNLOCK 0x00080000 +#define DLM_IFL_OVERLAP_CANCEL 0x00100000 +#define DLM_IFL_ENDOFLIFE 0x00200000 +#define DLM_IFL_WATCH_TIMEWARN 0x00400000 +#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 +#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_USER 0x00000001 +#define DLM_IFL_ORPHAN 0x00000002 + +#define DLM_CALLBACKS_SIZE 6 + +#define DLM_CB_CAST 0x00000001 +#define DLM_CB_BAST 0x00000002 +#define DLM_CB_SKIP 0x00000004 + +struct dlm_callback { + uint64_t seq; + uint32_t flags; /* DLM_CBF_ */ + int sb_status; /* copy to lksb status */ + uint8_t sb_flags; /* copy to lksb flags */ + int8_t mode; /* rq mode of bast, gr mode of cast */ +}; + +struct dlm_lkb { + struct dlm_rsb *lkb_resource; /* the rsb */ + struct kref lkb_ref; + int lkb_nodeid; /* copied from rsb */ + int lkb_ownpid; /* pid of lock owner */ + uint32_t lkb_id; /* our lock ID */ + uint32_t lkb_remid; /* lock ID on remote partner */ + uint32_t lkb_exflags; /* external flags from caller */ + uint32_t lkb_sbflags; /* lksb flags */ + uint32_t lkb_flags; /* internal flags */ + uint32_t lkb_lvbseq; /* lvb sequence number */ + + int8_t lkb_status; /* granted, waiting, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ + int8_t lkb_highbast; /* highest mode bast sent for */ + + int8_t lkb_wait_type; /* type of reply waiting for */ + int8_t lkb_wait_count; + int lkb_wait_nodeid; /* for debugging */ + + struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ + struct list_head lkb_statequeue; /* rsb g/c/w list */ + struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ + struct list_head lkb_wait_reply; /* waiting for remote reply */ + struct list_head lkb_astqueue; /* need ast to be sent */ + struct list_head lkb_ownqueue; /* list of locks for a process */ + struct list_head lkb_time_list; + ktime_t lkb_timestamp; + ktime_t lkb_wait_time; + unsigned long lkb_timeout_cs; + + struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; + struct dlm_callback lkb_last_cast; + struct dlm_callback lkb_last_bast; + ktime_t lkb_last_cast_time; /* for debugging */ + ktime_t lkb_last_bast_time; /* for debugging */ + + char *lkb_lvbptr; + struct dlm_lksb *lkb_lksb; /* caller's status block */ + void (*lkb_astfn) (void *astparam); + void (*lkb_bastfn) (void *astparam, int mode); + union { + void *lkb_astparam; /* caller's ast arg */ + struct dlm_user_args *lkb_ua; + }; +}; + + +struct dlm_rsb { + struct dlm_ls *res_ls; /* the lockspace */ + struct kref res_ref; + struct mutex res_mutex; + unsigned long res_flags; + int res_length; /* length of rsb name */ + int res_nodeid; + uint32_t res_lvbseq; + uint32_t res_hash; + uint32_t res_bucket; /* rsbtbl */ + unsigned long res_toss_time; + uint32_t res_first_lkid; + struct list_head res_lookup; /* lkbs waiting on first */ + struct list_head res_hashchain; /* rsbtbl */ + struct list_head res_grantqueue; + struct list_head res_convertqueue; + struct list_head res_waitqueue; + + struct list_head res_root_list; /* used for recovery */ + struct list_head res_recover_list; /* used for recovery */ + int res_recover_locks_count; + + char *res_lvbptr; + char res_name[1]; +}; + +/* find_rsb() flags */ + +#define R_MASTER 1 /* only 
return rsb if it's a master */ +#define R_CREATE 2 /* create/add rsb if not found */ + +/* rsb_flags */ + +enum rsb_flags { + RSB_MASTER_UNCERTAIN, + RSB_VALNOTVALID, + RSB_VALNOTVALID_PREV, + RSB_NEW_MASTER, + RSB_NEW_MASTER2, + RSB_RECOVER_CONVERT, + RSB_LOCKS_PURGED, +}; + +static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __set_bit(flag, &r->res_flags); +} + +static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __clear_bit(flag, &r->res_flags); +} + +static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + return test_bit(flag, &r->res_flags); +} + + +/* dlm_header is first element of all structs sent between nodes */ + +#define DLM_HEADER_MAJOR 0x00030000 +#define DLM_HEADER_MINOR 0x00000000 + +#define DLM_MSG 1 +#define DLM_RCOM 2 + +struct dlm_header { + uint32_t h_version; + uint32_t h_lockspace; + uint32_t h_nodeid; /* nodeid of sender */ + uint16_t h_length; + uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ + uint8_t h_pad; +}; + + +#define DLM_MSG_REQUEST 1 +#define DLM_MSG_CONVERT 2 +#define DLM_MSG_UNLOCK 3 +#define DLM_MSG_CANCEL 4 +#define DLM_MSG_REQUEST_REPLY 5 +#define DLM_MSG_CONVERT_REPLY 6 +#define DLM_MSG_UNLOCK_REPLY 7 +#define DLM_MSG_CANCEL_REPLY 8 +#define DLM_MSG_GRANT 9 +#define DLM_MSG_BAST 10 +#define DLM_MSG_LOOKUP 11 +#define DLM_MSG_REMOVE 12 +#define DLM_MSG_LOOKUP_REPLY 13 +#define DLM_MSG_PURGE 14 + +struct dlm_message { + struct dlm_header m_header; + uint32_t m_type; /* DLM_MSG_ */ + uint32_t m_nodeid; + uint32_t m_pid; + uint32_t m_lkid; /* lkid on sender */ + uint32_t m_remid; /* lkid on receiver */ + uint32_t m_parent_lkid; + uint32_t m_parent_remid; + uint32_t m_exflags; + uint32_t m_sbflags; + uint32_t m_flags; + uint32_t m_lvbseq; + uint32_t m_hash; + int m_status; + int m_grmode; + int m_rqmode; + int m_bastmode; + int m_asts; + int m_result; /* 0 or -EXXX */ + char m_extra[0]; /* name or lvb */ +}; + + +#define DLM_RS_NODES 0x00000001 +#define DLM_RS_NODES_ALL 0x00000002 +#define DLM_RS_DIR 0x00000004 +#define DLM_RS_DIR_ALL 0x00000008 +#define DLM_RS_LOCKS 0x00000010 +#define DLM_RS_LOCKS_ALL 0x00000020 +#define DLM_RS_DONE 0x00000040 +#define DLM_RS_DONE_ALL 0x00000080 + +#define DLM_RCOM_STATUS 1 +#define DLM_RCOM_NAMES 2 +#define DLM_RCOM_LOOKUP 3 +#define DLM_RCOM_LOCK 4 +#define DLM_RCOM_STATUS_REPLY 5 +#define DLM_RCOM_NAMES_REPLY 6 +#define DLM_RCOM_LOOKUP_REPLY 7 +#define DLM_RCOM_LOCK_REPLY 8 + +struct dlm_rcom { + struct dlm_header rc_header; + uint32_t rc_type; /* DLM_RCOM_ */ + int rc_result; /* multi-purpose */ + uint64_t rc_id; /* match reply with request */ + uint64_t rc_seq; /* sender's ls_recover_seq */ + uint64_t rc_seq_reply; /* remote ls_recover_seq */ + char rc_buf[0]; +}; + +union dlm_packet { + struct dlm_header header; /* common to other two */ + struct dlm_message message; + struct dlm_rcom rcom; +}; + +struct rcom_config { + __le32 rf_lvblen; + __le32 rf_lsflags; + __le64 rf_unused; +}; + +struct rcom_lock { + __le32 rl_ownpid; + __le32 rl_lkid; + __le32 rl_remid; + __le32 rl_parent_lkid; + __le32 rl_parent_remid; + __le32 rl_exflags; + __le32 rl_flags; + __le32 rl_lvbseq; + __le32 rl_result; + int8_t rl_rqmode; + int8_t rl_grmode; + int8_t rl_status; + int8_t rl_asts; + __le16 rl_wait_type; + __le16 rl_namelen; + char rl_name[DLM_RESNAME_MAXLEN]; + char rl_lvb[0]; +}; + +struct dlm_ls { + struct list_head ls_list; /* list of lockspaces */ + dlm_lockspace_t *ls_local_handle; + uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_exflags; + int 
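Because every structure exchanged between nodes begins with struct dlm_header, a receiver can inspect h_cmd and then view the same buffer through union dlm_packet as either a locking message or a recovery message. A minimal dispatch sketch, assuming the buffer has already been received and length-checked elsewhere; example_dispatch is a hypothetical name, and a real receiver would also validate h_version and h_length before trusting the payload:

/* Illustrative h_cmd dispatch over union dlm_packet; not the actual
 * receive path. */
static void example_dispatch(union dlm_packet *p)
{
        switch (p->header.h_cmd) {
        case DLM_MSG:
                /* p->message is the valid view: a normal locking message */
                printk(KERN_DEBUG "dlm: msg type %u from nodeid %u\n",
                       p->message.m_type, p->header.h_nodeid);
                break;
        case DLM_RCOM:
                /* p->rcom is the valid view: a recovery message */
                printk(KERN_DEBUG "dlm: rcom type %u from nodeid %u\n",
                       p->rcom.rc_type, p->header.h_nodeid);
                break;
        }
}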
ls_lvblen; + int ls_count; /* refcount of processes in + the dlm using this ls */ + int ls_create_count; /* create/release refcount */ + unsigned long ls_flags; /* LSFL_ */ + unsigned long ls_scan_time; + struct kobject ls_kobj; + + struct dlm_rsbtable *ls_rsbtbl; + uint32_t ls_rsbtbl_size; + + struct dlm_lkbtable *ls_lkbtbl; + uint32_t ls_lkbtbl_size; + + struct dlm_dirtable *ls_dirtbl; + uint32_t ls_dirtbl_size; + + struct mutex ls_waiters_mutex; + struct list_head ls_waiters; /* lkbs needing a reply */ + + struct mutex ls_orphans_mutex; + struct list_head ls_orphans; + + struct mutex ls_timeout_mutex; + struct list_head ls_timeout; + + struct list_head ls_nodes; /* current nodes in ls */ + struct list_head ls_nodes_gone; /* dead node list, recovery */ + int ls_num_nodes; /* number of nodes in ls */ + int ls_low_nodeid; + int ls_total_weight; + int *ls_node_array; + + struct dlm_rsb ls_stub_rsb; /* for returning errors */ + struct dlm_lkb ls_stub_lkb; /* for returning errors */ + struct dlm_message ls_stub_ms; /* for faking a reply */ + + struct dentry *ls_debug_rsb_dentry; /* debugfs */ + struct dentry *ls_debug_waiters_dentry; /* debugfs */ + struct dentry *ls_debug_locks_dentry; /* debugfs */ + struct dentry *ls_debug_all_dentry; /* debugfs */ + + wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ + int ls_uevent_result; + struct completion ls_members_done; + int ls_members_result; + + struct miscdevice ls_device; + + /* recovery related */ + + struct timer_list ls_timer; + struct task_struct *ls_recoverd_task; + struct mutex ls_recoverd_active; + spinlock_t ls_recover_lock; + unsigned long ls_recover_begin; /* jiffies timestamp */ + uint32_t ls_recover_status; /* DLM_RS_ */ + uint64_t ls_recover_seq; + struct dlm_recover *ls_recover_args; + struct rw_semaphore ls_in_recovery; /* block local requests */ + struct rw_semaphore ls_recv_active; /* block dlm_recv */ + struct list_head ls_requestqueue;/* queue remote requests */ + struct mutex ls_requestqueue_mutex; + struct dlm_rcom *ls_recover_buf; + int ls_recover_nodeid; /* for debugging */ + uint64_t ls_rcom_seq; + spinlock_t ls_rcom_spin; + struct list_head ls_recover_list; + spinlock_t ls_recover_list_lock; + int ls_recover_list_count; + wait_queue_head_t ls_wait_general; + struct mutex ls_clear_proc_locks; + + struct list_head ls_root_list; /* root resources */ + struct rw_semaphore ls_root_sem; /* protect root_list */ + + int ls_namelen; + char ls_name[1]; +}; + +#define LSFL_WORK 0 +#define LSFL_RUNNING 1 +#define LSFL_RECOVERY_STOP 2 +#define LSFL_RCOM_READY 3 +#define LSFL_RCOM_WAIT 4 +#define LSFL_UEVENT_WAIT 5 +#define LSFL_TIMEWARN 6 + +/* much of this is just saving user space pointers associated with the + lock that we pass back to the user lib with an ast */ + +struct dlm_user_args { + struct dlm_user_proc *proc; /* each process that opens the lockspace + device has private data + (dlm_user_proc) on the struct file, + the process's locks point back to it*/ + struct dlm_lksb lksb; + struct dlm_lksb __user *user_lksb; + void __user *castparam; + void __user *castaddr; + void __user *bastparam; + void __user *bastaddr; + uint64_t xid; +}; + +#define DLM_PROC_FLAGS_CLOSING 1 +#define DLM_PROC_FLAGS_COMPAT 2 + +/* locks list is kept so we can remove all a process's locks when it + exits (or orphan those that are persistent) */ + +struct dlm_user_proc { + dlm_lockspace_t *lockspace; + unsigned long flags; /* DLM_PROC_FLAGS */ + struct list_head asts; + spinlock_t asts_spin; + struct list_head locks; + spinlock_t 
locks_spin; + struct list_head unlocking; + wait_queue_head_t wait; +}; + +static inline int dlm_locking_stopped(struct dlm_ls *ls) +{ + return !test_bit(LSFL_RUNNING, &ls->ls_flags); +} + +static inline int dlm_recovery_stopped(struct dlm_ls *ls) +{ + return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); +} + +static inline int dlm_no_directory(struct dlm_ls *ls) +{ + return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; +} + +int dlm_netlink_init(void); +void dlm_netlink_exit(void); +void dlm_timeout_warn(struct dlm_lkb *lkb); +int dlm_plock_init(void); +void dlm_plock_exit(void); + +#ifdef CONFIG_DLM_DEBUG +int dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +int dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +#else +static inline int dlm_register_debugfs(void) { return 0; } +static inline void dlm_unregister_debugfs(void) { } +static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +#endif + +#endif /* __DLM_INTERNAL_DOT_H__ */ + diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c new file mode 100644 index 00000000..f71d0b5a --- /dev/null +++ b/fs/dlm/lock.c @@ -0,0 +1,5115 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* Central locking logic has four stages: + + dlm_lock() + dlm_unlock() + + request_lock(ls, lkb) + convert_lock(ls, lkb) + unlock_lock(ls, lkb) + cancel_lock(ls, lkb) + + _request_lock(r, lkb) + _convert_lock(r, lkb) + _unlock_lock(r, lkb) + _cancel_lock(r, lkb) + + do_request(r, lkb) + do_convert(r, lkb) + do_unlock(r, lkb) + do_cancel(r, lkb) + + Stage 1 (lock, unlock) is mainly about checking input args and + splitting into one of the four main operations: + + dlm_lock = request_lock + dlm_lock+CONVERT = convert_lock + dlm_unlock = unlock_lock + dlm_unlock+CANCEL = cancel_lock + + Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is + provided to the next stage. + + Stage 3, _xxxx_lock(), determines if the operation is local or remote. + When remote, it calls send_xxxx(), when local it calls do_xxxx(). + + Stage 4, do_xxxx(), is the guts of the operation. It manipulates the + given rsb and lkb and queues callbacks. + + For remote operations, send_xxxx() results in the corresponding do_xxxx() + function being executed on the remote node. 
The connecting send/receive + calls on local (L) and remote (R) nodes: + + L: send_xxxx() -> R: receive_xxxx() + R: do_xxxx() + L: receive_xxxx_reply() <- R: send_xxxx_reply() +*/ +#include +#include +#include "dlm_internal.h" +#include +#include "memory.h" +#include "lowcomms.h" +#include "requestqueue.h" +#include "util.h" +#include "dir.h" +#include "member.h" +#include "lockspace.h" +#include "ast.h" +#include "lock.h" +#include "rcom.h" +#include "recover.h" +#include "lvb_table.h" +#include "user.h" +#include "config.h" + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode); +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_remove(struct dlm_rsb *r); +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms); +static int receive_extralen(struct dlm_message *ms); +static void do_purge(struct dlm_ls *ls, int nodeid, int pid); +static void del_timeout(struct dlm_lkb *lkb); + +/* + * Lock compatibilty matrix - thanks Steve + * UN = Unlocked state. Not really a state, used as a flag + * PD = Padding. Used to make the matrix a nice power of two in size + * Other states are the same as the VMS DLM. + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) + */ + +static const int __dlm_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * This defines the direction of transfer of LVB data. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + * 1 = LVB is returned to the caller + * 0 = LVB is written to the resource + * -1 = nothing happens to the LVB + */ + +const int dlm_lvb_operations[8][8] = { + /* UN NL CR CW PR PW EX PD*/ + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ +}; + +#define modes_compat(gr, rq) \ + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] + +int dlm_modes_compat(int mode1, int mode2) +{ + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; +} + +/* + * Compatibility matrix for conversions with QUECVT set. + * Granted mode is the row; requested mode is the column. 
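Following the four-stage description above, the full round trip for one remote operation looks like this trace of the naming convention with xxxx = request; the layout is editorial, the function names follow the stages listed above:

  node L (caller)                           node R (master)

  dlm_lock()
    request_lock()
      _request_lock()        rsb is mastered remotely, so:
        send_request()    ------------------->  receive_request()
                                                  do_request()
  receive_request_reply() <-----------------  send_request_reply()

The result then reaches the caller through its completion ast (queue_cast() below).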
+ * Usage: matrix[grmode+1][rqmode+1] + */ + +static const int __quecvt_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +void dlm_print_lkb(struct dlm_lkb *lkb) +{ + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" + " status %d rqmode %d grmode %d wait_type %d\n", + lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, + lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, + lkb->lkb_grmode, lkb->lkb_wait_type); +} + +static void dlm_print_rsb(struct dlm_rsb *r) +{ + printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n", + r->res_nodeid, r->res_flags, r->res_first_lkid, + r->res_recover_locks_count, r->res_name); +} + +void dlm_dump_rsb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb; + + dlm_print_rsb(r); + + printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n", + list_empty(&r->res_root_list), list_empty(&r->res_recover_list)); + printk(KERN_ERR "rsb lookup list\n"); + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb grant queue:\n"); + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb convert queue:\n"); + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb wait queue:\n"); + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + dlm_print_lkb(lkb); +} + +/* Threads cannot use the lockspace while it's being recovered */ + +static inline void dlm_lock_recovery(struct dlm_ls *ls) +{ + down_read(&ls->ls_in_recovery); +} + +void dlm_unlock_recovery(struct dlm_ls *ls) +{ + up_read(&ls->ls_in_recovery); +} + +int dlm_lock_recovery_try(struct dlm_ls *ls) +{ + return down_read_trylock(&ls->ls_in_recovery); +} + +static inline int can_be_queued(struct dlm_lkb *lkb) +{ + return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE); +} + +static inline int force_blocking_asts(struct dlm_lkb *lkb) +{ + return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST); +} + +static inline int is_demoted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); +} + +static inline int is_altmode(struct dlm_lkb *lkb) +{ + return (lkb->lkb_sbflags & DLM_SBF_ALTMODE); +} + +static inline int is_granted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_status == DLM_LKSTS_GRANTED); +} + +static inline int is_remote(struct dlm_rsb *r) +{ + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r);); + return !!r->res_nodeid; +} + +static inline int is_process_copy(struct dlm_lkb *lkb) +{ + return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY)); +} + +static inline int is_master_copy(struct dlm_lkb *lkb) +{ + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb);); + return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 
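A few concrete lookups in __dlm_compat_matrix defined above may help when reading the grant logic later; the +1 offsets exist because the unlocked/invalid mode sits at -1 and maps onto the UN row and column. Values read directly off the table:

  dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) == 1   /* concurrent read locks coexist */
  dlm_modes_compat(DLM_LOCK_PW, DLM_LOCK_PR) == 0   /* PW row, PR column */
  dlm_modes_compat(DLM_LOCK_CR, DLM_LOCK_EX) == 0   /* CR row, EX column */
  dlm_modes_compat(DLM_LOCK_NL, DLM_LOCK_EX) == 1   /* NL is compatible with every granted mode */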
1 : 0; +} + +static inline int middle_conversion(struct dlm_lkb *lkb) +{ + if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) || + (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW)) + return 1; + return 0; +} + +static inline int down_conversion(struct dlm_lkb *lkb) +{ + return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode); +} + +static inline int is_overlap_unlock(struct dlm_lkb *lkb) +{ + return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK; +} + +static inline int is_overlap_cancel(struct dlm_lkb *lkb) +{ + return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL; +} + +static inline int is_overlap(struct dlm_lkb *lkb) +{ + return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK | + DLM_IFL_OVERLAP_CANCEL)); +} + +static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + if (is_master_copy(lkb)) + return; + + del_timeout(lkb); + + DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); + + /* if the operation was a cancel, then return -DLM_ECANCEL, if a + timeout caused the cancel then return -ETIMEDOUT */ + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; + rv = -ETIMEDOUT; + } + + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; + rv = -EDEADLK; + } + + dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); +} + +static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + queue_cast(r, lkb, + is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL); +} + +static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) +{ + if (is_master_copy(lkb)) { + send_bast(r, lkb, rqmode); + } else { + dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0); + } +} + +/* + * Basic operations on rsb's and lkb's + */ + +static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r; + + r = dlm_allocate_rsb(ls, len); + if (!r) + return NULL; + + r->res_ls = ls; + r->res_length = len; + memcpy(r->res_name, name, len); + mutex_init(&r->res_mutex); + + INIT_LIST_HEAD(&r->res_lookup); + INIT_LIST_HEAD(&r->res_grantqueue); + INIT_LIST_HEAD(&r->res_convertqueue); + INIT_LIST_HEAD(&r->res_waitqueue); + INIT_LIST_HEAD(&r->res_root_list); + INIT_LIST_HEAD(&r->res_recover_list); + + return r; +} + +static int search_rsb_list(struct list_head *head, char *name, int len, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r; + int error = 0; + + list_for_each_entry(r, head, res_hashchain) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) + goto found; + } + *r_ret = NULL; + return -EBADR; + + found: + if (r->res_nodeid && (flags & R_MASTER)) + error = -ENOTBLK; + *r_ret = r; + return error; +} + +static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r; + int error; + + error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); + if (!error) { + kref_get(&r->res_ref); + goto out; + } + error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); + if (error) + goto out; + + list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); + + if (dlm_no_directory(ls)) + goto out; + + if (r->res_nodeid == -1) { + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } else if (r->res_nodeid > 0) { + rsb_set_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } else { + DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r);); + 
DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),); + } + out: + *r_ret = r; + return error; +} + +static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, + unsigned int flags, struct dlm_rsb **r_ret) +{ + int error; + spin_lock(&ls->ls_rsbtbl[b].lock); + error = _search_rsb(ls, name, len, b, flags, r_ret); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return error; +} + +/* + * Find rsb in rsbtbl and potentially create/add one + * + * Delaying the release of rsb's has a similar benefit to applications keeping + * NL locks on an rsb, but without the guarantee that the cached master value + * will still be valid when the rsb is reused. Apps aren't always smart enough + * to keep NL locks on an rsb that they may lock again shortly; this can lead + * to excessive master lookups and removals if we don't delay the release. + * + * Searching for an rsb means looking through both the normal list and toss + * list. When found on the toss list the rsb is moved to the normal list with + * ref count of 1; when found on normal list the ref count is incremented. + */ + +static int find_rsb(struct dlm_ls *ls, char *name, int namelen, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r = NULL, *tmp; + uint32_t hash, bucket; + int error = -EINVAL; + + if (namelen > DLM_RESNAME_MAXLEN) + goto out; + + if (dlm_no_directory(ls)) + flags |= R_CREATE; + + error = 0; + hash = jhash(name, namelen, 0); + bucket = hash & (ls->ls_rsbtbl_size - 1); + + error = search_rsb(ls, name, namelen, bucket, flags, &r); + if (!error) + goto out; + + if (error == -EBADR && !(flags & R_CREATE)) + goto out; + + /* the rsb was found but wasn't a master copy */ + if (error == -ENOTBLK) + goto out; + + error = -ENOMEM; + r = create_rsb(ls, name, namelen); + if (!r) + goto out; + + r->res_hash = hash; + r->res_bucket = bucket; + r->res_nodeid = -1; + kref_init(&r->res_ref); + + /* With no directory, the master can be set immediately */ + if (dlm_no_directory(ls)) { + int nodeid = dlm_dir_nodeid(r); + if (nodeid == dlm_our_nodeid()) + nodeid = 0; + r->res_nodeid = nodeid; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); + if (!error) { + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_free_rsb(r); + r = tmp; + goto out; + } + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + error = 0; + out: + *r_ret = r; + return error; +} + +/* This is only called to add a reference when the code already holds + a valid reference to the rsb, so there's no need for locking. */ + +static inline void hold_rsb(struct dlm_rsb *r) +{ + kref_get(&r->res_ref); +} + +void dlm_hold_rsb(struct dlm_rsb *r) +{ + hold_rsb(r); +} + +static void toss_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + struct dlm_ls *ls = r->res_ls; + + DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); + kref_init(&r->res_ref); + list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); + r->res_toss_time = jiffies; + if (r->res_lvbptr) { + dlm_free_lvb(r->res_lvbptr); + r->res_lvbptr = NULL; + } +} + +/* When all references to the rsb are gone it's transferred to + the tossed list for later disposal. 
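find_rsb() above derives the hash-table bucket by masking the Jenkins hash of the resource name, which relies on ls_rsbtbl_size being a power of two. A minimal userspace sketch of that bucket selection, with a stand-in hash value instead of the kernel's jhash():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t hash = 0x9e3779b9;        /* stand-in for jhash(name, namelen, 0) */
	uint32_t rsbtbl_size = 1024;       /* assumed power of two, as the mask requires */
	uint32_t bucket = hash & (rsbtbl_size - 1);

	printf("bucket %u of %u\n", bucket, rsbtbl_size);
	return 0;
}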
*/ + +static void put_rsb(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + uint32_t bucket = r->res_bucket; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + kref_put(&r->res_ref, toss_rsb); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); +} + +void dlm_put_rsb(struct dlm_rsb *r) +{ + put_rsb(r); +} + +/* See comment for unhold_lkb */ + +static void unhold_rsb(struct dlm_rsb *r) +{ + int rv; + rv = kref_put(&r->res_ref, toss_rsb); + DLM_ASSERT(!rv, dlm_dump_rsb(r);); +} + +static void kill_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the remove and free. */ + + DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r);); +} + +/* Attaching/detaching lkb's from rsb's is for rsb reference counting. + The rsb must exist as long as any lkb's for it do. */ + +static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + hold_rsb(r); + lkb->lkb_resource = r; +} + +static void detach_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_resource) { + put_rsb(lkb->lkb_resource); + lkb->lkb_resource = NULL; + } +} + +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb, *tmp; + uint32_t lkid = 0; + uint16_t bucket; + + lkb = dlm_allocate_lkb(ls); + if (!lkb) + return -ENOMEM; + + lkb->lkb_nodeid = -1; + lkb->lkb_grmode = DLM_LOCK_IV; + kref_init(&lkb->lkb_ref); + INIT_LIST_HEAD(&lkb->lkb_ownqueue); + INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); + INIT_LIST_HEAD(&lkb->lkb_time_list); + INIT_LIST_HEAD(&lkb->lkb_astqueue); + + get_random_bytes(&bucket, sizeof(bucket)); + bucket &= (ls->ls_lkbtbl_size - 1); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + + /* counter can roll over so we must verify lkid is not in use */ + + while (lkid == 0) { + lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++; + + list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list, + lkb_idtbl_list) { + if (tmp->lkb_id != lkid) + continue; + lkid = 0; + break; + } + } + + lkb->lkb_id = lkid; + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + + *lkb_ret = lkb; + return 0; +} + +static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid) +{ + struct dlm_lkb *lkb; + uint16_t bucket = (lkid >> 16); + + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) { + if (lkb->lkb_id == lkid) + return lkb; + } + return NULL; +} + +static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb; + uint16_t bucket = (lkid >> 16); + + if (bucket >= ls->ls_lkbtbl_size) + return -EBADSLT; + + read_lock(&ls->ls_lkbtbl[bucket].lock); + lkb = __find_lkb(ls, lkid); + if (lkb) + kref_get(&lkb->lkb_ref); + read_unlock(&ls->ls_lkbtbl[bucket].lock); + + *lkb_ret = lkb; + return lkb ? 
0 : -ENOENT; +} + +static void kill_lkb(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the detach_lkb */ + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); +} + +/* __put_lkb() is used when an lkb may not have an rsb attached to + it so we need to provide the lockspace explicitly */ + +static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + uint16_t bucket = (lkb->lkb_id >> 16); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + if (kref_put(&lkb->lkb_ref, kill_lkb)) { + list_del(&lkb->lkb_idtbl_list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + + detach_lkb(lkb); + + /* for local/process lkbs, lvbptr points to caller's lksb */ + if (lkb->lkb_lvbptr && is_master_copy(lkb)) + dlm_free_lvb(lkb->lkb_lvbptr); + dlm_free_lkb(lkb); + return 1; + } else { + write_unlock(&ls->ls_lkbtbl[bucket].lock); + return 0; + } +} + +int dlm_put_lkb(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls; + + DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb);); + DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb);); + + ls = lkb->lkb_resource->res_ls; + return __put_lkb(ls, lkb); +} + +/* This is only called to add a reference when the code already holds + a valid reference to the lkb, so there's no need for locking. */ + +static inline void hold_lkb(struct dlm_lkb *lkb) +{ + kref_get(&lkb->lkb_ref); +} + +/* This is called when we need to remove a reference and are certain + it's not the last ref. e.g. del_lkb is always called between a + find_lkb/put_lkb and is always the inverse of a previous add_lkb. + put_lkb would work fine, but would involve unnecessary locking */ + +static inline void unhold_lkb(struct dlm_lkb *lkb) +{ + int rv; + rv = kref_put(&lkb->lkb_ref, kill_lkb); + DLM_ASSERT(!rv, dlm_print_lkb(lkb);); +} + +static void lkb_add_ordered(struct list_head *new, struct list_head *head, + int mode) +{ + struct dlm_lkb *lkb = NULL; + + list_for_each_entry(lkb, head, lkb_statequeue) + if (lkb->lkb_rqmode < mode) + break; + + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); +} + +/* add/remove lkb to rsb's grant/convert/wait queue */ + +static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) +{ + kref_get(&lkb->lkb_ref); + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); + + lkb->lkb_timestamp = ktime_get(); + + lkb->lkb_status = status; + + switch (status) { + case DLM_LKSTS_WAITING: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); + else + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); + break; + case DLM_LKSTS_GRANTED: + /* convention says granted locks kept in order of grmode */ + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, + lkb->lkb_grmode); + break; + case DLM_LKSTS_CONVERT: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); + else + list_add_tail(&lkb->lkb_statequeue, + &r->res_convertqueue); + break; + default: + DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status);); + } +} + +static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + lkb->lkb_status = 0; + list_del(&lkb->lkb_statequeue); + unhold_lkb(lkb); +} + +static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts) +{ + hold_lkb(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, sts); + unhold_lkb(lkb); +} + +static int msg_reply_type(int mstype) +{ + switch (mstype) { + case DLM_MSG_REQUEST: + return DLM_MSG_REQUEST_REPLY; 
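The lock id handed out by create_lkb() above packs the lkbtbl bucket into the high 16 bits and the per-bucket counter into the low 16 bits, which is why find_lkb() and __put_lkb() recover the bucket with a plain right shift. A small standalone sketch of that layout; the helper names are illustrative, not part of the DLM API:

#include <stdint.h>
#include <stdio.h>

/* mirrors lkid = (bucket << 16) | counter from create_lkb() */
static uint32_t lkid_make(uint16_t bucket, uint16_t counter)
{
	return ((uint32_t)bucket << 16) | counter;
}

/* mirrors bucket = (lkid >> 16) from find_lkb()/__put_lkb() */
static uint16_t lkid_bucket(uint32_t lkid)
{
	return (uint16_t)(lkid >> 16);
}

int main(void)
{
	uint32_t lkid = lkid_make(0x002a, 0x0001);

	/* prints: lkid 002a0001 bucket 42 */
	printf("lkid %08x bucket %u\n", lkid, (unsigned)lkid_bucket(lkid));
	return 0;
}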
+	case DLM_MSG_CONVERT:
+		return DLM_MSG_CONVERT_REPLY;
+	case DLM_MSG_UNLOCK:
+		return DLM_MSG_UNLOCK_REPLY;
+	case DLM_MSG_CANCEL:
+		return DLM_MSG_CANCEL_REPLY;
+	case DLM_MSG_LOOKUP:
+		return DLM_MSG_LOOKUP_REPLY;
+	}
+	return -1;
+}
+
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+	int i;
+
+	for (i = 0; i < num_nodes; i++) {
+		if (!warned[i]) {
+			warned[i] = nodeid;
+			return 0;
+		}
+		if (warned[i] == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	ktime_t zero = ktime_set(0, 0);
+	s64 us;
+	s64 debug_maxus = 0;
+	u32 debug_scanned = 0;
+	u32 debug_expired = 0;
+	int num_nodes = 0;
+	int *warned = NULL;
+
+	if (!dlm_config.ci_waitwarn_us)
+		return;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (ktime_equal(lkb->lkb_wait_time, zero))
+			continue;
+
+		debug_scanned++;
+
+		us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+		if (us < dlm_config.ci_waitwarn_us)
+			continue;
+
+		lkb->lkb_wait_time = zero;
+
+		debug_expired++;
+		if (us > debug_maxus)
+			debug_maxus = us;
+
+		if (!num_nodes) {
+			num_nodes = ls->ls_num_nodes;
+			/* kmalloc takes the size first, then the gfp flags */
+			warned = kmalloc(num_nodes * sizeof(int), GFP_KERNEL);
+			if (warned)
+				memset(warned, 0, num_nodes * sizeof(int));
+		}
+		if (!warned)
+			continue;
+		if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+			continue;
+
+		log_error(ls, "waitwarn %x %lld %d us check connection to "
+			  "node %d", lkb->lkb_id, (long long)us,
+			  dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	if (warned)
+		kfree(warned);
+
+	if (debug_expired)
+		log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+			  debug_scanned, debug_expired,
+			  dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+   a reply from a remote node */
+
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error = 0;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	if (is_overlap_unlock(lkb) ||
+	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
+		switch (mstype) {
+		case DLM_MSG_UNLOCK:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			break;
+		case DLM_MSG_CANCEL:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			break;
+		default:
+			error = -EBUSY;
+			goto out;
+		}
+		lkb->lkb_wait_count++;
+		hold_lkb(lkb);
+
+		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
+			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
+			  lkb->lkb_wait_count, lkb->lkb_flags);
+		goto out;
+	}
+
+	DLM_ASSERT(!lkb->lkb_wait_count,
+		   dlm_print_lkb(lkb);
+		   printk("wait_count %d\n", lkb->lkb_wait_count););
+
+	lkb->lkb_wait_count++;
+	lkb->lkb_wait_type = mstype;
+	lkb->lkb_wait_time = ktime_get();
+	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
+	hold_lkb(lkb);
+	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+	if (error)
+		log_error(ls, "addwait error %x %d flags %x %d %d %s",
+			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
+			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
+	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
+/* We clear the RESEND flag because we might be taking an lkb off the waiters
+   list as part of process_requestqueue (e.g.
a lookup that has an optimized + request reply on the requestqueue) between dlm_recover_waiters_pre() which + set RESEND and dlm_recover_waiters_post() */ + +static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, + struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int overlap_done = 0; + + if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) { + log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + overlap_done = 1; + goto out_del; + } + + if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) { + log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + overlap_done = 1; + goto out_del; + } + + /* Cancel state was preemptively cleared by a successful convert, + see next comment, nothing to do. */ + + if ((mstype == DLM_MSG_CANCEL_REPLY) && + (lkb->lkb_wait_type != DLM_MSG_CANCEL)) { + log_debug(ls, "remwait %x cancel_reply wait_type %d", + lkb->lkb_id, lkb->lkb_wait_type); + return -1; + } + + /* Remove for the convert reply, and premptively remove for the + cancel reply. A convert has been granted while there's still + an outstanding cancel on it (the cancel is moot and the result + in the cancel reply should be 0). We preempt the cancel reply + because the app gets the convert result and then can follow up + with another op, like convert. This subsequent op would see the + lingering state of the cancel and fail with -EBUSY. */ + + if ((mstype == DLM_MSG_CONVERT_REPLY) && + (lkb->lkb_wait_type == DLM_MSG_CONVERT) && + is_overlap_cancel(lkb) && ms && !ms->m_result) { + log_debug(ls, "remwait %x convert_reply zap overlap_cancel", + lkb->lkb_id); + lkb->lkb_wait_type = 0; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_wait_count--; + goto out_del; + } + + /* N.B. type of reply may not always correspond to type of original + msg due to lookup->request optimization, verify others? */ + + if (lkb->lkb_wait_type) { + lkb->lkb_wait_type = 0; + goto out_del; + } + + log_error(ls, "remwait error %x reply %d flags %x no wait_type", + lkb->lkb_id, mstype, lkb->lkb_flags); + return -1; + + out_del: + /* the force-unlock/cancel has completed and we haven't recvd a reply + to the op that was in progress prior to the unlock/cancel; we + give up on any reply to the earlier op. FIXME: not sure when/how + this would happen */ + + if (overlap_done && lkb->lkb_wait_type) { + log_error(ls, "remwait error %x reply %d wait_type %d overlap", + lkb->lkb_id, mstype, lkb->lkb_wait_type); + lkb->lkb_wait_count--; + lkb->lkb_wait_type = 0; + } + + DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb);); + + lkb->lkb_flags &= ~DLM_IFL_RESEND; + lkb->lkb_wait_count--; + if (!lkb->lkb_wait_count) + list_del_init(&lkb->lkb_wait_reply); + unhold_lkb(lkb); + return 0; +} + +static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, mstype, NULL); + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* Handles situations where we might be processing a "fake" or "stub" reply in + which we can't try to take waiters_mutex again. 
*/ + +static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + if (ms->m_flags != DLM_IFL_STUB_MS) + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, ms->m_type, ms); + if (ms->m_flags != DLM_IFL_STUB_MS) + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +static void dir_remove(struct dlm_rsb *r) +{ + int to_nodeid; + + if (dlm_no_directory(r->res_ls)) + return; + + to_nodeid = dlm_dir_nodeid(r); + if (to_nodeid != dlm_our_nodeid()) + send_remove(r); + else + dlm_dir_remove_entry(r->res_ls, to_nodeid, + r->res_name, r->res_length); +} + +/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is + found since they are in order of newest to oldest? */ + +static int shrink_bucket(struct dlm_ls *ls, int b) +{ + struct dlm_rsb *r; + int count = 0, found; + + for (;;) { + found = 0; + spin_lock(&ls->ls_rsbtbl[b].lock); + list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, + res_hashchain) { + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) + continue; + found = 1; + break; + } + + if (!found) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + break; + } + + if (kref_put(&r->res_ref, kill_rsb)) { + list_del(&r->res_hashchain); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + if (is_master(r)) + dir_remove(r); + dlm_free_rsb(r); + count++; + } else { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "tossed rsb in use %s", r->res_name); + } + } + + return count; +} + +void dlm_scan_rsbs(struct dlm_ls *ls) +{ + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + shrink_bucket(ls, i); + if (dlm_locking_stopped(ls)) + break; + cond_resched(); + } +} + +static void add_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + if (is_master_copy(lkb)) + return; + + if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && + !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN; + goto add_it; + } + if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) + goto add_it; + return; + + add_it: + DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); + mutex_lock(&ls->ls_timeout_mutex); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); + mutex_unlock(&ls->ls_timeout_mutex); +} + +static void del_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + mutex_lock(&ls->ls_timeout_mutex); + if (!list_empty(&lkb->lkb_time_list)) { + list_del_init(&lkb->lkb_time_list); + unhold_lkb(lkb); + } + mutex_unlock(&ls->ls_timeout_mutex); +} + +/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and + lkb_lksb_timeout without lock_rsb? Note: we can't lock timeout_mutex + and then lock rsb because of lock ordering in add_timeout. We may need + to specify some special timeout-related bits in the lkb that are just to + be accessed under the timeout_mutex. 
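The timeout bookkeeping in this file is kept in centiseconds (lkb_timeout_cs, ci_timewarn_cs), while dlm_scan_timeout() below compares elapsed time in microseconds, hence the factor of 10000. A trivial standalone check of that unit conversion:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int64_t timeout_cs = 500;                  /* e.g. a 5 second timeout */
	int64_t timeout_us = timeout_cs * 10000;   /* same scaling as dlm_scan_timeout() */

	assert(timeout_us == 5000000);             /* 5 s == 5,000,000 us */
	return 0;
}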
*/ + +void dlm_scan_timeout(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int do_cancel, do_warn; + s64 wait_us; + + for (;;) { + if (dlm_locking_stopped(ls)) + break; + + do_cancel = 0; + do_warn = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + + wait_us = ktime_to_us(ktime_sub(ktime_get(), + lkb->lkb_timestamp)); + + if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && + wait_us >= (lkb->lkb_timeout_cs * 10000)) + do_cancel = 1; + + if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && + wait_us >= dlm_config.ci_timewarn_cs * 10000) + do_warn = 1; + + if (!do_cancel && !do_warn) + continue; + hold_lkb(lkb); + break; + } + mutex_unlock(&ls->ls_timeout_mutex); + + if (!do_cancel && !do_warn) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + if (do_warn) { + /* clear flag so we only warn once */ + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT)) + del_timeout(lkb); + dlm_timeout_warn(lkb); + } + + if (do_cancel) { + log_debug(ls, "timeout cancel %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; + del_timeout(lkb); + _cancel_lock(r, lkb); + } + + unlock_rsb(r); + unhold_rsb(r); + dlm_put_lkb(lkb); + } +} + +/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping + dlm_recoverd before checking/setting ls_recover_begin. */ + +void dlm_adjust_timeouts(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); + + ls->ls_recover_begin = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) + lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); + mutex_unlock(&ls->ls_timeout_mutex); + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_to_us(lkb->lkb_wait_time)) + lkb->lkb_wait_time = ktime_get(); + } + mutex_unlock(&ls->ls_waiters_mutex); +} + +/* lkb is master or local copy */ + +static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int b, len = r->res_ls->ls_lvblen; + + /* b=1 lvb returned to caller + b=0 lvb written to rsb or invalidated + b=-1 do nothing */ + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + + if (b == 1) { + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + return; + + memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len); + lkb->lkb_lvbseq = r->res_lvbseq; + + } else if (b == 0) { + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len); + r->res_lvbseq++; + lkb->lkb_lvbseq = r->res_lvbseq; + rsb_clear_flag(r, RSB_VALNOTVALID); + } + + if (rsb_flag(r, RSB_VALNOTVALID)) + lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID; +} + +static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode < DLM_LOCK_PW) + return; + + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = 
dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + r->res_lvbseq++; + rsb_clear_flag(r, RSB_VALNOTVALID); +} + +/* lkb is process copy (pc) */ + +static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + int b; + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + if (b == 1) { + int len = receive_extralen(ms); + if (len > DLM_RESNAME_MAXLEN) + len = DLM_RESNAME_MAXLEN; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + lkb->lkb_lvbseq = ms->m_lvbseq; + } +} + +/* Manipulate lkb's on rsb's convert/granted/waiting queues + remove_lock -- used for unlock, removes lkb from granted + revert_lock -- used for cancel, moves lkb from convert to granted + grant_lock -- used for request and convert, adds lkb to granted or + moves lkb from convert or waiting to granted + + Each of these is used for master or local copy lkb's. There is + also a _pc() variation used to make the corresponding change on + a process copy (pc) lkb. */ + +static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); +} + +static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_unlock(r, lkb); + _remove_lock(r, lkb); +} + +static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + _remove_lock(r, lkb); +} + +/* returns: 0 did nothing + 1 moved lock to granted + -1 removed lock */ + +static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int rv = 0; + + lkb->lkb_rqmode = DLM_LOCK_IV; + + switch (lkb->lkb_status) { + case DLM_LKSTS_GRANTED: + break; + case DLM_LKSTS_CONVERT: + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + rv = 1; + break; + case DLM_LKSTS_WAITING: + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); + rv = -1; + break; + default: + log_print("invalid status for revert %d", lkb->lkb_status); + } + return rv; +} + +static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return revert_lock(r, lkb); +} + +static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode != lkb->lkb_rqmode) { + lkb->lkb_grmode = lkb->lkb_rqmode; + if (lkb->lkb_status) + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + else + add_lkb(r, lkb, DLM_LKSTS_GRANTED); + } + + lkb->lkb_rqmode = DLM_LOCK_IV; +} + +static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_lock(r, lkb); + _grant_lock(r, lkb); + lkb->lkb_highbast = 0; +} + +static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + set_lvb_lock_pc(r, lkb, ms); + _grant_lock(r, lkb); +} + +/* called by grant_pending_locks() which means an async grant message must + be sent to the requesting node in addition to granting the lock if the + lkb belongs to a remote node. */ + +static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + grant_lock(r, lkb); + if (is_master_copy(lkb)) + send_grant(r, lkb); + else + queue_cast(r, lkb, 0); +} + +/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to + change the granted/requested modes. We're munging things accordingly in + the process copy. 
+ CONVDEADLK: our grmode may have been forced down to NL to resolve a + conversion deadlock + ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become + compatible with other granted locks */ + +static void munge_demoted(struct dlm_lkb *lkb) +{ + if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { + log_print("munge_demoted %x invalid modes gr %d rq %d", + lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); + return; + } + + lkb->lkb_grmode = DLM_LOCK_NL; +} + +static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + if (ms->m_type != DLM_MSG_REQUEST_REPLY && + ms->m_type != DLM_MSG_GRANT) { + log_print("munge_altmode %x invalid reply type %d", + lkb->lkb_id, ms->m_type); + return; + } + + if (lkb->lkb_exflags & DLM_LKF_ALTPR) + lkb->lkb_rqmode = DLM_LOCK_PR; + else if (lkb->lkb_exflags & DLM_LKF_ALTCW) + lkb->lkb_rqmode = DLM_LOCK_CW; + else { + log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags); + dlm_print_lkb(lkb); + } +} + +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) +{ + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, + lkb_statequeue); + if (lkb->lkb_id == first->lkb_id) + return 1; + + return 0; +} + +/* Check if the given lkb conflicts with another lkb on the queue. */ + +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, head, lkb_statequeue) { + if (this == lkb) + continue; + if (!modes_compat(this, lkb)) + return 1; + } + return 0; +} + +/* + * "A conversion deadlock arises with a pair of lock requests in the converting + * queue for one resource. The granted mode of each lock blocks the requested + * mode of the other lock." + * + * Part 2: if the granted mode of lkb is preventing an earlier lkb in the + * convert queue from being granted, then deadlk/demote lkb. + * + * Example: + * Granted Queue: empty + * Convert Queue: NL->EX (first lock) + * PR->EX (second lock) + * + * The first lock can't be granted because of the granted mode of the second + * lock and the second lock can't be granted because it's not first in the + * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we + * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK + * flag set and return DEMOTED in the lksb flags. + * + * Originally, this function detected conv-deadlk in a more limited scope: + * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or + * - if lkb1 was the first entry in the queue (not just earlier), and was + * blocked by the granted mode of lkb2, and there was nothing on the + * granted queue preventing lkb1 from being granted immediately, i.e. + * lkb2 was the only thing preventing lkb1 from being granted. + * + * That second condition meant we'd only say there was conv-deadlk if + * resolving it (by demotion) would lead to the first lock on the convert + * queue being granted right away. It allowed conversion deadlocks to exist + * between locks on the convert queue while they couldn't be granted anyway. + * + * Now, we detect and take action on conversion deadlocks immediately when + * they're created, even if they may not be immediately consequential. If + * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted + * mode that would prevent lkb1's conversion from being granted, we do a + * deadlk/demote on lkb2 right away and don't let it onto the convert queue. 
+ * I think this means that the lkb_is_ahead condition below should always + * be zero, i.e. there will never be conv-deadlk between two locks that are + * both already on the convert queue. + */ + +static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2) +{ + struct dlm_lkb *lkb1; + int lkb_is_ahead = 0; + + list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) { + if (lkb1 == lkb2) { + lkb_is_ahead = 1; + continue; + } + + if (!lkb_is_ahead) { + if (!modes_compat(lkb2, lkb1)) + return 1; + } else { + if (!modes_compat(lkb2, lkb1) && + !modes_compat(lkb1, lkb2)) + return 1; + } + } + return 0; +} + +/* + * Return 1 if the lock can be granted, 0 otherwise. + * Also detect and resolve conversion deadlocks. + * + * lkb is the lock to be granted + * + * now is 1 if the function is being called in the context of the + * immediate request, it is 0 if called later, after the lock has been + * queued. + * + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis + */ + +static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) +{ + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); + + /* + * 6-10: Version 5.4 introduced an option to address the phenomenon of + * a new request for a NL mode lock being blocked. + * + * 6-11: If the optional EXPEDITE flag is used with the new NL mode + * request, then it would be granted. In essence, the use of this flag + * tells the Lock Manager to expedite theis request by not considering + * what may be in the CONVERTING or WAITING queues... As of this + * writing, the EXPEDITE flag can be used only with new requests for NL + * mode locks. This flag is not valid for conversion requests. + * + * A shortcut. Earlier checks return an error if EXPEDITE is used in a + * conversion or used with a non-NL requested mode. We also know an + * EXPEDITE request is always granted immediately, so now must always + * be 1. The full condition to grant an expedite request: (now && + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can + * therefore be shortened to just checking the flag. + */ + + if (lkb->lkb_exflags & DLM_LKF_EXPEDITE) + return 1; + + /* + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be + * added to the remaining conditions. + */ + + if (queue_conflict(&r->res_grantqueue, lkb)) + goto out; + + /* + * 6-3: By default, a conversion request is immediately granted if the + * requested mode is compatible with the modes of all other granted + * locks + */ + + if (queue_conflict(&r->res_convertqueue, lkb)) + goto out; + + /* + * 6-5: But the default algorithm for deciding whether to grant or + * queue conversion requests does not by itself guarantee that such + * requests are serviced on a "first come first serve" basis. This, in + * turn, can lead to a phenomenon known as "indefinate postponement". + * + * 6-7: This issue is dealt with by using the optional QUECVT flag with + * the system service employed to request a lock conversion. This flag + * forces certain conversion requests to be queued, even if they are + * compatible with the granted modes of other locks on the same + * resource. Thus, the use of this flag results in conversion requests + * being ordered on a "first come first servce" basis. + * + * DCT: This condition is all about new conversions being able to occur + * "in place" while the lock remains on the granted queue (assuming + * nothing else conflicts.) 
IOW if QUECVT isn't set, a conversion + * doesn't _have_ to go onto the convert queue where it's processed in + * order. The "now" variable is necessary to distinguish converts + * being received and processed for the first time now, because once a + * convert is moved to the conversion queue the condition below applies + * requiring fifo granting. + */ + + if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT)) + return 1; + + /* + * The NOORDER flag is set to avoid the standard vms rules on grant + * order. + */ + + if (lkb->lkb_exflags & DLM_LKF_NOORDER) + return 1; + + /* + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be + * granted until all other conversion requests ahead of it are granted + * and/or canceled. + */ + + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) + return 1; + + /* + * 6-4: By default, a new request is immediately granted only if all + * three of the following conditions are satisfied when the request is + * issued: + * - The queue of ungranted conversion requests for the resource is + * empty. + * - The queue of ungranted new requests for the resource is empty. + * - The mode of the new request is compatible with the most + * restrictive mode of all granted locks on the resource. + */ + + if (now && !conv && list_empty(&r->res_convertqueue) && + list_empty(&r->res_waitqueue)) + return 1; + + /* + * 6-4: Once a lock request is in the queue of ungranted new requests, + * it cannot be granted until the queue of ungranted conversion + * requests is empty, all ungranted new requests ahead of it are + * granted and/or canceled, and it is compatible with the granted mode + * of the most restrictive lock granted on the resource. + */ + + if (!now && !conv && list_empty(&r->res_convertqueue) && + first_in_list(lkb, &r->res_waitqueue)) + return 1; + out: + return 0; +} + +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int *err) +{ + int rv; + int8_t alt = 0, rqmode = lkb->lkb_rqmode; + int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV); + + if (err) + *err = 0; + + rv = _can_be_granted(r, lkb, now); + if (rv) + goto out; + + /* + * The CONVDEADLK flag is non-standard and tells the dlm to resolve + * conversion deadlocks by demoting grmode to NL, otherwise the dlm + * cancels one of the locks. + */ + + if (is_convert && can_be_queued(lkb) && + conversion_deadlock_detect(r, lkb)) { + if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { + lkb->lkb_grmode = DLM_LOCK_NL; + lkb->lkb_sbflags |= DLM_SBF_DEMOTED; + } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + if (err) + *err = -EDEADLK; + else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); + } + } + goto out; + } + + /* + * The ALTPR and ALTCW flags are non-standard and tell the dlm to try + * to grant a request in a mode other than the normal rqmode. It's a + * simple way to provide a big optimization to applications that can + * use them. + */ + + if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR)) + alt = DLM_LOCK_PR; + else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW)) + alt = DLM_LOCK_CW; + + if (alt) { + lkb->lkb_rqmode = alt; + rv = _can_be_granted(r, lkb, now); + if (rv) + lkb->lkb_sbflags |= DLM_SBF_ALTMODE; + else + lkb->lkb_rqmode = rqmode; + } + out: + return rv; +} + +/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock + for locks pending on the convert list. 
Once verified (watch for these + log_prints), we should be able to just call _can_be_granted() and not + bother with the demote/deadlk cases here (and there's no easy way to deal + with a deadlk here, we'd have to generate something like grant_lock with + the deadlk error.) */ + +/* Returns the highest requested mode of all blocked conversions; sets + cw if there's a blocked conversion to DLM_LOCK_CW. */ + +static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) +{ + struct dlm_lkb *lkb, *s; + int hi, demoted, quit, grant_restart, demote_restart; + int deadlk; + + quit = 0; + restart: + grant_restart = 0; + demote_restart = 0; + hi = DLM_LOCK_IV; + + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { + demoted = is_demoted(lkb); + deadlk = 0; + + if (can_be_granted(r, lkb, 0, &deadlk)) { + grant_lock_pending(r, lkb); + grant_restart = 1; + continue; + } + + if (!demoted && is_demoted(lkb)) { + log_print("WARN: pending demoted %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + demote_restart = 1; + continue; + } + + if (deadlk) { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + dlm_dump_rsb(r); + continue; + } + + hi = max_t(int, lkb->lkb_rqmode, hi); + + if (cw && lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + + if (grant_restart) + goto restart; + if (demote_restart && !quit) { + quit = 1; + goto restart; + } + + return max_t(int, high, hi); +} + +static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw) +{ + struct dlm_lkb *lkb, *s; + + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { + if (can_be_granted(r, lkb, 0, NULL)) + grant_lock_pending(r, lkb); + else { + high = max_t(int, lkb->lkb_rqmode, high); + if (lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + } + + return high; +} + +/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked + on either the convert or waiting queue. + high is the largest rqmode of all locks blocked on the convert or + waiting queue. */ + +static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw) +{ + if (gr->lkb_grmode == DLM_LOCK_PR && cw) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < high && + !__dlm_compat_matrix[gr->lkb_grmode+1][high+1]) + return 1; + return 0; +} + +static void grant_pending_locks(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *s; + int high = DLM_LOCK_IV; + int cw = 0; + + DLM_ASSERT(is_master(r), dlm_dump_rsb(r);); + + high = grant_pending_convert(r, high, &cw); + high = grant_pending_wait(r, high, &cw); + + if (high == DLM_LOCK_IV) + return; + + /* + * If there are locks left on the wait/convert queue then send blocking + * ASTs to granted locks based on the largest requested mode (high) + * found above. 
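The "+1" in lock_requires_bast() above (and in the __quecvt_compat_matrix usage note earlier) exists because DLM_LOCK_IV is -1, so adding one maps "no mode" to row and column 0 and the named modes NL..EX to indices 1..6. A standalone sketch of that indexing, using an illustrative compatibility table rather than the kernel's __dlm_compat_matrix:

#include <stdio.h>

/* mode numbering as in the DLM headers: IV is -1, so "+1" indexing
   puts it in row/column 0 */
enum { LOCK_IV = -1, LOCK_NL = 0, LOCK_CR, LOCK_CW, LOCK_PR, LOCK_PW, LOCK_EX };

/* illustrative compatibility table (textbook VMS DLM rules), indexed
   [grmode + 1][rqmode + 1]; row/col 0 is the "no mode" entry */
static const int compat[7][7] = {
	/*        UN NL CR CW PR PW EX */
	/* UN */ { 1, 1, 1, 1, 1, 1, 1 },
	/* NL */ { 1, 1, 1, 1, 1, 1, 1 },
	/* CR */ { 1, 1, 1, 1, 1, 1, 0 },
	/* CW */ { 1, 1, 1, 1, 0, 0, 0 },
	/* PR */ { 1, 1, 1, 0, 1, 0, 0 },
	/* PW */ { 1, 1, 1, 0, 0, 0, 0 },
	/* EX */ { 1, 1, 0, 0, 0, 0, 0 },
};

int main(void)
{
	int grmode = LOCK_PR, rqmode = LOCK_CW;

	printf("PR vs CW compatible: %d\n", compat[grmode + 1][rqmode + 1]); /* 0 */
	return 0;
}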
+ */ + + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { + if (cw && high == DLM_LOCK_PR && + lkb->lkb_grmode == DLM_LOCK_PR) + queue_bast(r, lkb, DLM_LOCK_CW); + else + queue_bast(r, lkb, high); + lkb->lkb_highbast = high; + } + } +} + +static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq) +{ + if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) || + (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq)) + return 1; + return 0; +} + +static void send_bast_queue(struct dlm_rsb *r, struct list_head *head, + struct dlm_lkb *lkb) +{ + struct dlm_lkb *gr; + + list_for_each_entry(gr, head, lkb_statequeue) { + /* skip self when sending basts to convertqueue */ + if (gr == lkb) + continue; + if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { + queue_bast(r, gr, lkb->lkb_rqmode); + gr->lkb_highbast = lkb->lkb_rqmode; + } + } +} + +static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); +} + +static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); + send_bast_queue(r, &r->res_convertqueue, lkb); +} + +/* set_master(r, lkb) -- set the master nodeid of a resource + + The purpose of this function is to set the nodeid field in the given + lkb using the nodeid field in the given rsb. If the rsb's nodeid is + known, it can just be copied to the lkb and the function will return + 0. If the rsb's nodeid is _not_ known, it needs to be looked up + before it can be copied to the lkb. + + When the rsb nodeid is being looked up remotely, the initial lkb + causing the lookup is kept on the ls_waiters list waiting for the + lookup reply. Other lkb's waiting for the same rsb lookup are kept + on the rsb's res_lookup list until the master is verified. + + Return values: + 0: nodeid is set in rsb/lkb and the caller should go ahead and use it + 1: the rsb master is not available and the lkb has been placed on + a wait queue +*/ + +static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = r->res_ls; + int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + + if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = lkb->lkb_id; + lkb->lkb_nodeid = r->res_nodeid; + return 0; + } + + if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) { + list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup); + return 1; + } + + if (r->res_nodeid == 0) { + lkb->lkb_nodeid = 0; + return 0; + } + + if (r->res_nodeid > 0) { + lkb->lkb_nodeid = r->res_nodeid; + return 0; + } + + DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r);); + + dir_nodeid = dlm_dir_nodeid(r); + + if (dir_nodeid != our_nodeid) { + r->res_first_lkid = lkb->lkb_id; + send_lookup(r, lkb); + return 1; + } + + for (i = 0; i < 2; i++) { + /* It's possible for dlm_scand to remove an old rsb for + this same resource from the toss list, us to create + a new one, look up the master locally, and find it + already exists just before dlm_scand does the + dir_remove() on the previous rsb. 
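A convention worth keeping in mind while reading set_master(): res_nodeid is -1 while the master is unknown, 0 when this node is the master, and a positive nodeid when the master is remote, which is also what is_remote() earlier keys off. A tiny illustrative helper (not part of the DLM code) that spells that out:

#include <stdio.h>

/* mirrors the res_nodeid convention used by set_master()/is_remote():
   -1 = unknown, 0 = locally mastered, >0 = mastered on that nodeid */
static const char *master_state(int res_nodeid)
{
	if (res_nodeid < 0)
		return "unknown";
	return res_nodeid ? "remote" : "local";
}

int main(void)
{
	printf("%s %s %s\n", master_state(-1), master_state(0), master_state(7));
	return 0;
}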
*/ + + error = dlm_dir_lookup(ls, our_nodeid, r->res_name, + r->res_length, &ret_nodeid); + if (!error) + break; + log_debug(ls, "dir_lookup error %d %s", error, r->res_name); + schedule(); + } + if (error && error != -EEXIST) + return error; + + if (ret_nodeid == our_nodeid) { + r->res_first_lkid = 0; + r->res_nodeid = 0; + lkb->lkb_nodeid = 0; + } else { + r->res_first_lkid = lkb->lkb_id; + r->res_nodeid = ret_nodeid; + lkb->lkb_nodeid = ret_nodeid; + } + return 0; +} + +static void process_lookup_list(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { + list_del_init(&lkb->lkb_rsb_lookup); + _request_lock(r, lkb); + schedule(); + } +} + +/* confirm_master -- confirm (or deny) an rsb's master nodeid */ + +static void confirm_master(struct dlm_rsb *r, int error) +{ + struct dlm_lkb *lkb; + + if (!r->res_first_lkid) + return; + + switch (error) { + case 0: + case -EINPROGRESS: + r->res_first_lkid = 0; + process_lookup_list(r); + break; + + case -EAGAIN: + case -EBADR: + case -ENOTBLK: + /* the remote request failed and won't be retried (it was + a NOQUEUE, or has been canceled/unlocked); make a waiting + lkb the first_lkid */ + + r->res_first_lkid = 0; + + if (!list_empty(&r->res_lookup)) { + lkb = list_entry(r->res_lookup.next, struct dlm_lkb, + lkb_rsb_lookup); + list_del_init(&lkb->lkb_rsb_lookup); + r->res_first_lkid = lkb->lkb_id; + _request_lock(r, lkb); + } + break; + + default: + log_error(r->res_ls, "confirm_master unknown error %d", error); + } +} + +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, unsigned long timeout_cs, + void (*ast) (void *astparam), + void *astparam, + void (*bast) (void *astparam, int mode), + struct dlm_args *args) +{ + int rv = -EINVAL; + + /* check for invalid arg usage */ + + if (mode < 0 || mode > DLM_LOCK_EX) + goto out; + + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) + goto out; + + if (flags & DLM_LKF_CANCEL) + goto out; + + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL) + goto out; + + if (!ast || !lksb) + goto out; + + if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr) + goto out; + + if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) + goto out; + + /* these args will be copied to the lkb in validate_lock_args, + it cannot be done now because when converting locks, fields in + an active lkb cannot be modified before locking the rsb */ + + args->flags = flags; + args->astfn = ast; + args->astparam = astparam; + args->bastfn = bast; + args->timeout = timeout_cs; + args->mode = mode; + args->lksb = lksb; + rv = 0; + out: + return rv; +} + +static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) +{ + if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK | + DLM_LKF_FORCEUNLOCK)) + return -EINVAL; + + if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK) + return -EINVAL; + + args->flags = flags; + args->astparam = astarg; + return 0; +} + +static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + 
int rv = -EINVAL; + + if (args->flags & DLM_LKF_CONVERT) { + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; + + rv = -EBUSY; + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + goto out; + + if (lkb->lkb_wait_type) + goto out; + + if (is_overlap(lkb)) + goto out; + } + + lkb->lkb_exflags = args->flags; + lkb->lkb_sbflags = 0; + lkb->lkb_astfn = args->astfn; + lkb->lkb_astparam = args->astparam; + lkb->lkb_bastfn = args->bastfn; + lkb->lkb_rqmode = args->mode; + lkb->lkb_lksb = args->lksb; + lkb->lkb_lvbptr = args->lksb->sb_lvbptr; + lkb->lkb_ownpid = (int) current->pid; + lkb->lkb_timeout_cs = args->timeout; + rv = 0; + out: + if (rv) + log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s", + rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + return rv; +} + +/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0 + for success */ + +/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here + because there may be a lookup in progress and it's valid to do + cancel/unlockf on it */ + +static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int rv = -EINVAL; + + if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + cancel, unlock or failed noqueue request; an app can't use these + locks; return same error as if the lkid had not been found at all */ + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; + goto out; + } + + /* an lkb may be waiting for an rsb lookup to complete where the + lookup was initiated by another lock */ + + if (!list_empty(&lkb->lkb_rsb_lookup)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); + list_del_init(&lkb->lkb_rsb_lookup); + queue_cast(lkb->lkb_resource, lkb, + args->flags & DLM_LKF_CANCEL ? + -DLM_ECANCEL : -DLM_EUNLOCK); + unhold_lkb(lkb); /* undoes create_lkb() */ + } + /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ + rv = -EBUSY; + goto out; + } + + /* cancel not allowed with another cancel/unlock in progress */ + + if (args->flags & DLM_LKF_CANCEL) { + if (lkb->lkb_exflags & DLM_LKF_CANCEL) + goto out; + + if (is_overlap(lkb)) + goto out; + + /* don't let scand try to do a cancel */ + del_timeout(lkb); + + if (lkb->lkb_flags & DLM_IFL_RESEND) { + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + rv = -EBUSY; + goto out; + } + + /* there's nothing to cancel */ + if (lkb->lkb_status == DLM_LKSTS_GRANTED && + !lkb->lkb_wait_type) { + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + goto out; + } + /* add_to_waiters() will set OVERLAP_CANCEL */ + goto out_ok; + } + + /* do we need to allow a force-unlock if there's a normal unlock + already in progress? in what conditions could the normal unlock + fail such that we'd want to send a force-unlock to be sure? 
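For unlock and cancel, set_unlock_args() above accepts only CANCEL, VALBLK, IVVALBLK and FORCEUNLOCK, and rejects CANCEL combined with FORCEUNLOCK, so a caller must pick one way of aborting an in-flight operation. A standalone restatement of that check; the flag values below are illustrative placeholders, not the real DLM_LKF_* constants:

#include <stdint.h>
#include <stdio.h>

/* illustrative stand-ins for the DLM_LKF_* unlock flags */
#define LKF_CANCEL      0x1
#define LKF_VALBLK      0x2
#define LKF_IVVALBLK    0x4
#define LKF_FORCEUNLOCK 0x8

/* same rule as set_unlock_args(): unknown flags or CANCEL+FORCEUNLOCK are rejected */
static int check_unlock_flags(uint32_t flags)
{
	if (flags & ~(LKF_CANCEL | LKF_VALBLK | LKF_IVVALBLK | LKF_FORCEUNLOCK))
		return -1;
	if ((flags & LKF_CANCEL) && (flags & LKF_FORCEUNLOCK))
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", check_unlock_flags(LKF_CANCEL));                    /*  0 */
	printf("%d\n", check_unlock_flags(LKF_CANCEL | LKF_FORCEUNLOCK));  /* -1 */
	return 0;
}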
*/ + + if (args->flags & DLM_LKF_FORCEUNLOCK) { + if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK) + goto out; + + if (is_overlap_unlock(lkb)) + goto out; + + /* don't let scand try to do a cancel */ + del_timeout(lkb); + + if (lkb->lkb_flags & DLM_IFL_RESEND) { + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + goto out; + } + /* add_to_waiters() will set OVERLAP_UNLOCK */ + goto out_ok; + } + + /* normal unlock not allowed if there's any op in progress */ + rv = -EBUSY; + if (lkb->lkb_wait_type || lkb->lkb_wait_count) + goto out; + + out_ok: + /* an overlapping op shouldn't blow away exflags from other op */ + lkb->lkb_exflags |= args->flags; + lkb->lkb_sbflags = 0; + lkb->lkb_astparam = args->astparam; + rv = 0; + out: + if (rv) + log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv, + lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + return rv; +} + +/* + * Four stage 4 varieties: + * do_request(), do_convert(), do_unlock(), do_cancel() + * These are called on the master node for the given lock and + * from the central locking logic. + */ + +static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + + if (can_be_granted(r, lkb, 1, NULL)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + add_lkb(r, lkb, DLM_LKSTS_WAITING); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + int deadlk = 0; + + /* changing an existing lock may allow others to be granted */ + + if (can_be_granted(r, lkb, 1, &deadlk)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + /* can_be_granted() detected that this lock would block in a conversion + deadlock, so we leave it on the granted queue and return EDEADLK in + the ast for the convert. */ + + if (deadlk) { + /* it's left on the granted queue */ + log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s", + lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status, + lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name); + revert_lock(r, lkb); + queue_cast(r, lkb, -EDEADLK); + error = -EDEADLK; + goto out; + } + + /* is_demoted() means the can_be_granted() above set the grmode + to NL, and left us on the granted queue. This auto-demotion + (due to CONVDEADLK) might mean other locks, and/or this lock, are + now grantable. We have to try to grant other converting locks + before we try again to grant this one. 
*/ + + if (is_demoted(lkb)) { + grant_pending_convert(r, DLM_LOCK_IV, NULL); + if (_can_be_granted(r, lkb, 1)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + /* else fall through and move to convert queue */ + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case 0: + grant_pending_locks(r); + /* grant_pending_locks also sends basts */ + break; + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + remove_lock(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + return -DLM_EUNLOCK; +} + +static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + grant_pending_locks(r); +} + +/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ + +static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = revert_lock(r, lkb); + if (error) { + queue_cast(r, lkb, -DLM_ECANCEL); + return -DLM_ECANCEL; + } + return 0; +} + +static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + if (error) + grant_pending_locks(r); +} + +/* + * Four stage 3 varieties: + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() + */ + +/* add a new lkb to a possibly new rsb, called by requesting process */ + +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + /* set_master: sets lkb nodeid from r */ + + error = set_master(r, lkb); + if (error < 0) + goto out; + if (error) { + error = 0; + goto out; + } + + if (is_remote(r)) { + /* receive_request() calls do_request() on remote node */ + error = send_request(r, lkb); + } else { + error = do_request(r, lkb); + /* for remote locks the request_reply is sent + between do_request and do_request_effects */ + do_request_effects(r, lkb, error); + } + out: + return error; +} + +/* change some property of an existing lkb, e.g. 
mode */ + +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_convert() calls do_convert() on remote node */ + error = send_convert(r, lkb); + } else { + error = do_convert(r, lkb); + /* for remote locks the convert_reply is sent + between do_convert and do_convert_effects */ + do_convert_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the granted queue */ + +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_unlock() calls do_unlock() on remote node */ + error = send_unlock(r, lkb); + } else { + error = do_unlock(r, lkb); + /* for remote locks the unlock_reply is sent + between do_unlock and do_unlock_effects */ + do_unlock_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the convert or wait queue */ + +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_cancel() calls do_cancel() on remote node */ + error = send_cancel(r, lkb); + } else { + error = do_cancel(r, lkb); + /* for remote locks the cancel_reply is sent + between do_cancel and do_cancel_effects */ + do_cancel_effects(r, lkb, error); + } + + return error; +} + +/* + * Four stage 2 varieties: + * request_lock(), convert_lock(), unlock_lock(), cancel_lock() + */ + +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name, + int len, struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + error = validate_lock_args(ls, lkb, args); + if (error) + goto out; + + error = find_rsb(ls, name, len, R_CREATE, &r); + if (error) + goto out; + + lock_rsb(r); + + attach_lkb(r, lkb); + lkb->lkb_lksb->sb_lkid = lkb->lkb_id; + + error = _request_lock(r, lkb); + + unlock_rsb(r); + put_rsb(r); + + out: + return error; +} + +static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_lock_args(ls, lkb, args); + if (error) + goto out; + + error = _convert_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _unlock_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _cancel_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +/* + * Two stage 1 varieties: dlm_lock() and dlm_unlock() + */ + +int dlm_lock(dlm_lockspace_t *lockspace, + int mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent_lkid, + void (*ast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode)) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error, convert = flags & DLM_LKF_CONVERT; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + if (convert) + error = find_lkb(ls, lksb->sb_lkid, &lkb); + else + error = 
create_lkb(ls, &lkb); + + if (error) + goto out; + + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, + astarg, bast, &args); + if (error) + goto out_put; + + if (convert) + error = convert_lock(ls, lkb, &args); + else + error = request_lock(ls, lkb, name, namelen, &args); + + if (error == -EINPROGRESS) + error = 0; + out_put: + if (convert || error) + __put_lkb(ls, lkb); + if (error == -EAGAIN || error == -EDEADLK) + error = 0; + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +int dlm_unlock(dlm_lockspace_t *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + error = set_unlock_args(flags, astarg, &args); + if (error) + goto out_put; + + if (flags & DLM_LKF_CANCEL) + error = cancel_lock(ls, lkb, &args); + else + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL) + error = 0; + if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +/* + * send/receive routines for remote operations and replies + * + * send_args + * send_common + * send_request receive_request + * send_convert receive_convert + * send_unlock receive_unlock + * send_cancel receive_cancel + * send_grant receive_grant + * send_bast receive_bast + * send_lookup receive_lookup + * send_remove receive_remove + * + * send_common_reply + * receive_request_reply send_request_reply + * receive_convert_reply send_convert_reply + * receive_unlock_reply send_unlock_reply + * receive_cancel_reply send_cancel_reply + * receive_lookup_reply send_lookup_reply + */ + +static int _create_message(struct dlm_ls *ls, int mb_len, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + char *mb; + + /* get_buffer gives us a message handle (mh) that we need to + pass into lowcomms_commit and a message buffer (mb) that we + write our data into */ + + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + + memset(mb, 0, mb_len); + + ms = (struct dlm_message *) mb; + + ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + ms->m_header.h_lockspace = ls->ls_global_id; + ms->m_header.h_nodeid = dlm_our_nodeid(); + ms->m_header.h_length = mb_len; + ms->m_header.h_cmd = DLM_MSG; + + ms->m_type = mstype; + + *mh_ret = mh; + *ms_ret = ms; + return 0; +} + +static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret) +{ + int mb_len = sizeof(struct dlm_message); + + switch (mstype) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + case DLM_MSG_REMOVE: + mb_len += r->res_length; + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (lkb && lkb->lkb_lvbptr) + mb_len += r->res_ls->ls_lvblen; + break; + } + + return _create_message(r->res_ls, mb_len, to_nodeid, mstype, + ms_ret, mh_ret); +} + +/* further lowcomms enhancements or alternate implementations may make + the return value from this function useful at some point */ + +static 
int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +{ + dlm_message_out(ms); + dlm_lowcomms_commit_buffer(mh); + return 0; +} + +static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + ms->m_nodeid = lkb->lkb_nodeid; + ms->m_pid = lkb->lkb_ownpid; + ms->m_lkid = lkb->lkb_id; + ms->m_remid = lkb->lkb_remid; + ms->m_exflags = lkb->lkb_exflags; + ms->m_sbflags = lkb->lkb_sbflags; + ms->m_flags = lkb->lkb_flags; + ms->m_lvbseq = lkb->lkb_lvbseq; + ms->m_status = lkb->lkb_status; + ms->m_grmode = lkb->lkb_grmode; + ms->m_rqmode = lkb->lkb_rqmode; + ms->m_hash = r->res_hash; + + /* m_result and m_bastmode are set from function args, + not from lkb fields */ + + if (lkb->lkb_bastfn) + ms->m_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + ms->m_asts |= DLM_CB_CAST; + + /* compare with switch in create_message; send_remove() doesn't + use send_args() */ + + switch (ms->m_type) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + memcpy(ms->m_extra, r->res_name, r->res_length); + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (!lkb->lkb_lvbptr) + break; + memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + break; + } +} + +static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = r->res_nodeid; + + error = add_to_waiters(lkb, mstype, to_nodeid); + if (error) + return error; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, msg_reply_type(mstype)); + return error; +} + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_REQUEST); +} + +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = send_common(r, lkb, DLM_MSG_CONVERT); + + /* down conversions go without a reply from the master */ + if (!error && down_conversion(lkb)) { + remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; + r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; + r->res_ls->ls_stub_ms.m_result = 0; + __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); + } + + return error; +} + +/* FIXME: if this lkb is the only lock we hold on the rsb, then set + MASTER_UNCERTAIN to force the next request on the rsb to confirm + that the master is still correct. 
*/ + +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_UNLOCK); +} + +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_CANCEL); +} + +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = 0; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_bastmode = mode; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); + if (error) + return error; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + return error; +} + +static int send_remove(struct dlm_rsb *r) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + if (error) + goto out; + + memcpy(ms->m_extra, r->res_name, r->res_length); + ms->m_hash = r->res_hash; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + int mstype, int rv) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = rv; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv); +} + +static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv); +} + +static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv); +} + +static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); +} + +static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, + int ret_nodeid, int rv) +{ + struct dlm_rsb *r = &ls->ls_stub_rsb; + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error, nodeid = ms_in->m_header.h_nodeid; + + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); + if (error) + goto out; + + ms->m_lkid = ms_in->m_lkid; + ms->m_result = rv; + ms->m_nodeid = ret_nodeid; + + error = send_message(mh, ms); + out: + return error; +} + +/* which args we save from a received message depends heavily on the type + of message, 
unlike the send side where we can safely send everything about + the lkb for any type of message */ + +static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + lkb->lkb_exflags = ms->m_exflags; + lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | + (ms->m_flags & 0x0000FFFF); +} + +static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + if (ms->m_flags == DLM_IFL_STUB_MS) + return; + + lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | + (ms->m_flags & 0x0000FFFF); +} + +static int receive_extralen(struct dlm_message *ms) +{ + return (ms->m_header.h_length - sizeof(struct dlm_message)); +} + +static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + int len; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + if (!lkb->lkb_lvbptr) + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + len = receive_extralen(ms); + if (len > DLM_RESNAME_MAXLEN) + len = DLM_RESNAME_MAXLEN; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + } + return 0; +} + +static void fake_bastfn(void *astparam, int mode) +{ + log_print("fake_bastfn should not be called"); +} + +static void fake_astfn(void *astparam) +{ + log_print("fake_astfn should not be called"); +} + +static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + lkb->lkb_nodeid = ms->m_header.h_nodeid; + lkb->lkb_ownpid = ms->m_pid; + lkb->lkb_remid = ms->m_lkid; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_rqmode = ms->m_rqmode; + + lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + /* lkb was just created so there won't be an lvb yet */ + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + } + + return 0; +} + +static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + return -EBUSY; + + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + + lkb->lkb_rqmode = ms->m_rqmode; + lkb->lkb_lvbseq = ms->m_lvbseq; + + return 0; +} + +static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + return 0; +} + +/* We fill in the stub-lkb fields with the info that send_xxxx_reply() + uses to send a reply and that the remote end uses to process the reply. */ + +static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb = &ls->ls_stub_lkb; + lkb->lkb_nodeid = ms->m_header.h_nodeid; + lkb->lkb_remid = ms->m_lkid; +} + +/* This is called after the rsb is locked so that we can safely inspect + fields in the lkb. 
*/ + +static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + int from = ms->m_header.h_nodeid; + int error = 0; + + switch (ms->m_type) { + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_UNLOCK_REPLY: + case DLM_MSG_CANCEL_REPLY: + case DLM_MSG_GRANT: + case DLM_MSG_BAST: + if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_REQUEST_REPLY: + if (!is_process_copy(lkb)) + error = -EINVAL; + else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + default: + error = -EINVAL; + } + + if (error) + log_error(lkb->lkb_resource->res_ls, + "ignore invalid message %d from %d %x %x %x %d", + ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, + lkb->lkb_flags, lkb->lkb_nodeid); + return error; +} + +static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, namelen; + + error = create_lkb(ls, &lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + lkb->lkb_flags |= DLM_IFL_MSTCPY; + error = receive_request_args(ls, lkb, ms); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + namelen = receive_extralen(ms); + + error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + lock_rsb(r); + + attach_lkb(r, lkb); + error = do_request(r, lkb); + send_request_reply(r, lkb, error); + do_request_effects(r, lkb, error); + + unlock_rsb(r); + put_rsb(r); + + if (error == -EINPROGRESS) + error = 0; + if (error) + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, reply = 1; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_convert_args(ls, lkb, ms); + if (error) { + send_convert_reply(r, lkb, error); + goto out; + } + + reply = !down_conversion(lkb); + + error = do_convert(r, lkb); + if (reply) + send_convert_reply(r, lkb, error); + do_convert_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_unlock_args(ls, lkb, ms); + if (error) { + send_unlock_reply(r, lkb, error); + goto out; + } + + error = do_unlock(r, lkb); + send_unlock_reply(r, lkb, error); + do_unlock_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, 
&lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + error = do_cancel(r, lkb); + send_cancel_reply(r, lkb, error); + do_cancel_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_grant from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags_reply(lkb, ms); + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_bast from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + queue_bast(r, lkb, ms->m_bastmode); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) +{ + int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid; + + from_nodeid = ms->m_header.h_nodeid; + our_nodeid = dlm_our_nodeid(); + + len = receive_extralen(ms); + + dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); + if (dir_nodeid != our_nodeid) { + log_error(ls, "lookup dir_nodeid %d from %d", + dir_nodeid, from_nodeid); + error = -EINVAL; + ret_nodeid = -1; + goto out; + } + + error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid); + + /* Optimization: we're master so treat lookup as a request */ + if (!error && ret_nodeid == our_nodeid) { + receive_request(ls, ms); + return; + } + out: + send_lookup_reply(ls, ms, ret_nodeid, error); +} + +static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) +{ + int len, dir_nodeid, from_nodeid; + + from_nodeid = ms->m_header.h_nodeid; + + len = receive_extralen(ms); + + dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); + if (dir_nodeid != dlm_our_nodeid()) { + log_error(ls, "remove dir entry dir_nodeid %d from %d", + dir_nodeid, from_nodeid); + return; + } + + dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len); +} + +static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) +{ + do_purge(ls, ms->m_nodeid, ms->m_pid); +} + +static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, mstype, result; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_request_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + mstype = lkb->lkb_wait_type; + error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); + if (error) + goto out; + + /* Optimization: the dir node was also the master, so it took our + lookup as a 
request and sent request reply instead of lookup reply */ + if (mstype == DLM_MSG_LOOKUP) { + r->res_nodeid = ms->m_header.h_nodeid; + lkb->lkb_nodeid = r->res_nodeid; + } + + /* this is the value returned from do_request() on the master */ + result = ms->m_result; + + switch (result) { + case -EAGAIN: + /* request would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + confirm_master(r, -EAGAIN); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + + case -EINPROGRESS: + case 0: + /* request was queued or granted on remote master */ + receive_flags_reply(lkb, ms); + lkb->lkb_remid = ms->m_lkid; + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + if (result) { + add_lkb(r, lkb, DLM_LKSTS_WAITING); + add_timeout(lkb); + } else { + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + } + confirm_master(r, result); + break; + + case -EBADR: + case -ENOTBLK: + /* find_rsb failed to find rsb or rsb wasn't master */ + log_debug(ls, "receive_request_reply %x %x master diff %d %d", + lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result); + r->res_nodeid = -1; + lkb->lkb_nodeid = -1; + + if (is_overlap(lkb)) { + /* we'll ignore error in cancel/unlock reply */ + queue_cast_overlap(r, lkb); + confirm_master(r, result); + unhold_lkb(lkb); /* undoes create_lkb() */ + } else + _request_lock(r, lkb); + break; + + default: + log_error(ls, "receive_request_reply %x error %d", + lkb->lkb_id, result); + } + + if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) { + log_debug(ls, "receive_request_reply %x result %d unlock", + lkb->lkb_id, result); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + send_unlock(r, lkb); + } else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) { + log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + send_cancel(r, lkb); + } else { + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + } + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + /* this is the value returned from do_convert() on the master */ + switch (ms->m_result) { + case -EAGAIN: + /* convert would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + break; + + case -EDEADLK: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -EDEADLK); + break; + + case -EINPROGRESS: + /* convert was queued on remote master */ + receive_flags_reply(lkb, ms); + if (is_demoted(lkb)) + munge_demoted(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); + break; + + case 0: + /* convert was granted on remote master */ + receive_flags_reply(lkb, ms); + if (is_demoted(lkb)) + munge_demoted(lkb); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + break; + + default: + log_error(r->res_ls, "receive_convert_reply %x error %d", + lkb->lkb_id, ms->m_result); + } +} + +static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + __receive_convert_reply(r, lkb, ms); + out: + unlock_rsb(r); + put_rsb(r); +} + +static void 
receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_convert_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + _receive_convert_reply(lkb, ms); + dlm_put_lkb(lkb); +} + +static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + /* this is the value returned from do_unlock() on the master */ + + switch (ms->m_result) { + case -DLM_EUNLOCK: + receive_flags_reply(lkb, ms); + remove_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + break; + case -ENOENT: + break; + default: + log_error(r->res_ls, "receive_unlock_reply %x error %d", + lkb->lkb_id, ms->m_result); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_unlock_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + _receive_unlock_reply(lkb, ms); + dlm_put_lkb(lkb); +} + +static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + /* this is the value returned from do_cancel() on the master */ + + switch (ms->m_result) { + case -DLM_ECANCEL: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_ECANCEL); + break; + case 0: + break; + default: + log_error(r->res_ls, "receive_cancel_reply %x error %d", + lkb->lkb_id, ms->m_result); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_cancel_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + _receive_cancel_reply(lkb, ms); + dlm_put_lkb(lkb); +} + +static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, ret_nodeid; + + error = find_lkb(ls, ms->m_lkid, &lkb); + if (error) { + log_error(ls, "receive_lookup_reply no lkb"); + return; + } + + /* ms->m_result is the value returned by dlm_dir_lookup on dir node + FIXME: will a non-zero error ever be returned? 
*/ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + if (error) + goto out; + + ret_nodeid = ms->m_nodeid; + if (ret_nodeid == dlm_our_nodeid()) { + r->res_nodeid = 0; + ret_nodeid = 0; + r->res_first_lkid = 0; + } else { + /* set_master() will copy res_nodeid to lkb_nodeid */ + r->res_nodeid = ret_nodeid; + } + + if (is_overlap(lkb)) { + log_debug(ls, "receive_lookup_reply %x unlock %x", + lkb->lkb_id, lkb->lkb_flags); + queue_cast_overlap(r, lkb); + unhold_lkb(lkb); /* undoes create_lkb() */ + goto out_list; + } + + _request_lock(r, lkb); + + out_list: + if (!ret_nodeid) + process_lookup_list(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) +{ + if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { + log_debug(ls, "ignore non-member message %d from %d %x %x %d", + ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_remid, ms->m_result); + return; + } + + switch (ms->m_type) { + + /* messages sent to a master node */ + + case DLM_MSG_REQUEST: + receive_request(ls, ms); + break; + + case DLM_MSG_CONVERT: + receive_convert(ls, ms); + break; + + case DLM_MSG_UNLOCK: + receive_unlock(ls, ms); + break; + + case DLM_MSG_CANCEL: + receive_cancel(ls, ms); + break; + + /* messages sent from a master node (replies to above) */ + + case DLM_MSG_REQUEST_REPLY: + receive_request_reply(ls, ms); + break; + + case DLM_MSG_CONVERT_REPLY: + receive_convert_reply(ls, ms); + break; + + case DLM_MSG_UNLOCK_REPLY: + receive_unlock_reply(ls, ms); + break; + + case DLM_MSG_CANCEL_REPLY: + receive_cancel_reply(ls, ms); + break; + + /* messages sent from a master node (only two types of async msg) */ + + case DLM_MSG_GRANT: + receive_grant(ls, ms); + break; + + case DLM_MSG_BAST: + receive_bast(ls, ms); + break; + + /* messages sent to a dir node */ + + case DLM_MSG_LOOKUP: + receive_lookup(ls, ms); + break; + + case DLM_MSG_REMOVE: + receive_remove(ls, ms); + break; + + /* messages sent from a dir node (remove has no reply) */ + + case DLM_MSG_LOOKUP_REPLY: + receive_lookup_reply(ls, ms); + break; + + /* other messages */ + + case DLM_MSG_PURGE: + receive_purge(ls, ms); + break; + + default: + log_error(ls, "unknown message type %d", ms->m_type); + } + + dlm_astd_wake(); +} + +/* If the lockspace is in recovery mode (locking stopped), then normal + messages are saved on the requestqueue for processing after recovery is + done. When not in recovery mode, we wait for dlm_recoverd to drain saved + messages off the requestqueue before we process new ones. This occurs right + after recovery completes when we transition from saving all messages on + requestqueue, to processing all the saved messages, to processing new + messages as they arrive. */ + +static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, + int nodeid) +{ + if (dlm_locking_stopped(ls)) { + dlm_add_requestqueue(ls, nodeid, ms); + } else { + dlm_wait_requestqueue(ls); + _receive_message(ls, ms); + } +} + +/* This is called by dlm_recoverd to process messages that were saved on + the requestqueue. */ + +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms) +{ + _receive_message(ls, ms); +} + +/* This is called by the midcomms layer when something is received for + the lockspace. It could be either a MSG (normal message sent as part of + standard locking activity) or an RCOM (recovery message sent as part of + lockspace recovery). 
*/ + +void dlm_receive_buffer(union dlm_packet *p, int nodeid) +{ + struct dlm_header *hd = &p->header; + struct dlm_ls *ls; + int type = 0; + + switch (hd->h_cmd) { + case DLM_MSG: + dlm_message_in(&p->message); + type = p->message.m_type; + break; + case DLM_RCOM: + dlm_rcom_in(&p->rcom); + type = p->rcom.rc_type; + break; + default: + log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid); + return; + } + + if (hd->h_nodeid != nodeid) { + log_print("invalid h_nodeid %d from %d lockspace %x", + hd->h_nodeid, nodeid, hd->h_lockspace); + return; + } + + ls = dlm_find_lockspace_global(hd->h_lockspace); + if (!ls) { + if (dlm_config.ci_log_debug) + log_print("invalid lockspace %x from %d cmd %d type %d", + hd->h_lockspace, nodeid, hd->h_cmd, type); + + if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) + dlm_send_ls_not_ready(nodeid, &p->rcom); + return; + } + + /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to + be inactive (in this ls) before transitioning to recovery mode */ + + down_read(&ls->ls_recv_active); + if (hd->h_cmd == DLM_MSG) + dlm_receive_message(ls, &p->message, nodeid); + else + dlm_receive_rcom(ls, &p->rcom, nodeid); + up_read(&ls->ls_recv_active); + + dlm_put_lockspace(ls); +} + +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms_stub) +{ + if (middle_conversion(lkb)) { + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CONVERT_REPLY; + ms_stub->m_result = -EINPROGRESS; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_convert_reply(lkb, ms_stub); + + /* Same special case as in receive_rcom_lock_args() */ + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT); + unhold_lkb(lkb); + + } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) { + lkb->lkb_flags |= DLM_IFL_RESEND; + } + + /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down + conversions are async; there's no reply from the remote master */ +} + +/* A waiting lkb needs recovery if the master node has failed, or + the master node is changing (only when no directory is used) */ + +static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + if (dlm_is_removed(ls, lkb->lkb_nodeid)) + return 1; + + if (!dlm_no_directory(ls)) + return 0; + + if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid) + return 1; + + return 0; +} + +/* Recovery for locks that are waiting for replies from nodes that are now + gone. We can just complete unlocks and cancels by faking a reply from the + dead node. Requests and up-conversions we flag to be resent after + recovery. Down-conversions can just be completed with a fake reply like + unlocks. Conversions between PR and CW need special attention. 
*/ + +void dlm_recover_waiters_pre(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_message *ms_stub; + int wait_type, stub_unlock_result, stub_cancel_result; + + ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); + if (!ms_stub) { + log_error(ls, "dlm_recover_waiters_pre no mem"); + return; + } + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { + + /* exclude debug messages about unlocks because there can be so + many and they aren't very interesting */ + + if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { + log_debug(ls, "recover_waiter %x nodeid %d " + "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_wait_type, lkb->lkb_wait_nodeid); + } + + /* all outstanding lookups, regardless of destination will be + resent after recovery is done */ + + if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) { + lkb->lkb_flags |= DLM_IFL_RESEND; + continue; + } + + if (!waiter_needs_recovery(ls, lkb)) + continue; + + wait_type = lkb->lkb_wait_type; + stub_unlock_result = -DLM_EUNLOCK; + stub_cancel_result = -DLM_ECANCEL; + + /* Main reply may have been received leaving a zero wait_type, + but a reply for the overlapping op may not have been + received. In that case we need to fake the appropriate + reply for the overlap op. */ + + if (!wait_type) { + if (is_overlap_cancel(lkb)) { + wait_type = DLM_MSG_CANCEL; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_cancel_result = 0; + } + if (is_overlap_unlock(lkb)) { + wait_type = DLM_MSG_UNLOCK; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_unlock_result = -ENOENT; + } + + log_debug(ls, "rwpre overlap %x %x %d %d %d", + lkb->lkb_id, lkb->lkb_flags, wait_type, + stub_cancel_result, stub_unlock_result); + } + + switch (wait_type) { + + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_RESEND; + break; + + case DLM_MSG_CONVERT: + recover_convert_waiter(ls, lkb, ms_stub); + break; + + case DLM_MSG_UNLOCK: + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; + ms_stub->m_result = stub_unlock_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_unlock_reply(lkb, ms_stub); + dlm_put_lkb(lkb); + break; + + case DLM_MSG_CANCEL: + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CANCEL_REPLY; + ms_stub->m_result = stub_cancel_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_cancel_reply(lkb, ms_stub); + dlm_put_lkb(lkb); + break; + + default: + log_error(ls, "invalid lkb wait_type %d %d", + lkb->lkb_wait_type, wait_type); + } + schedule(); + } + mutex_unlock(&ls->ls_waiters_mutex); + kfree(ms_stub); +} + +static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + int found = 0; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (lkb->lkb_flags & DLM_IFL_RESEND) { + hold_lkb(lkb); + found = 1; + break; + } + } + mutex_unlock(&ls->ls_waiters_mutex); + + if (!found) + lkb = NULL; + return lkb; +} + +/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the + master or dir-node for r. Processing the lkb may result in it being placed + back on waiters. */ + +/* We do this after normal locking has been enabled and any saved messages + (in requestqueue) have been processed. We should be confident that at + this point we won't get or process a reply to any of these waiting + operations.
But, new ops may be coming in on the rsbs/locks here from + userspace or remotely. */ + +/* there may have been an overlap unlock/cancel prior to recovery or after + recovery. if before, the lkb may still have a pos wait_count; if after, the + overlap flag would just have been set and nothing new sent. we can be + confident here than any replies to either the initial op or overlap ops + prior to recovery have been received. */ + +int dlm_recover_waiters_post(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error = 0, mstype, err, oc, ou; + + while (1) { + if (dlm_locking_stopped(ls)) { + log_debug(ls, "recover_waiters_post aborted"); + error = -EINTR; + break; + } + + lkb = find_resend_waiter(ls); + if (!lkb) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + mstype = lkb->lkb_wait_type; + oc = is_overlap_cancel(lkb); + ou = is_overlap_unlock(lkb); + err = 0; + + log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", + lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); + + /* At this point we assume that we won't get a reply to any + previous op or overlap op on this lock. First, do a big + remove_from_waiters() for all previous ops. */ + + lkb->lkb_flags &= ~DLM_IFL_RESEND; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_wait_type = 0; + lkb->lkb_wait_count = 0; + mutex_lock(&ls->ls_waiters_mutex); + list_del_init(&lkb->lkb_wait_reply); + mutex_unlock(&ls->ls_waiters_mutex); + unhold_lkb(lkb); /* for waiters list */ + + if (oc || ou) { + /* do an unlock or cancel instead of resending */ + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + queue_cast(r, lkb, ou ? -DLM_EUNLOCK : + -DLM_ECANCEL); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + case DLM_MSG_CONVERT: + if (oc) { + queue_cast(r, lkb, -DLM_ECANCEL); + } else { + lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK; + _unlock_lock(r, lkb); + } + break; + default: + err = 1; + } + } else { + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + _request_lock(r, lkb); + if (is_master(r)) + confirm_master(r, 0); + break; + case DLM_MSG_CONVERT: + _convert_lock(r, lkb); + break; + default: + err = 1; + } + } + + if (err) + log_error(ls, "recover_waiters_post %x %d %x %d %d", + lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou); + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + } + + return error; +} + +static void purge_queue(struct dlm_rsb *r, struct list_head *queue, + int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb)) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { + if (test(ls, lkb)) { + rsb_set_flag(r, RSB_LOCKS_PURGED); + del_lkb(r, lkb); + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged lkb not released"); + } + } +} + +static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid)); +} + +static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + return is_master_copy(lkb); +} + +static void purge_dead_locks(struct dlm_rsb *r) +{ + purge_queue(r, &r->res_grantqueue, &purge_dead_test); + purge_queue(r, &r->res_convertqueue, &purge_dead_test); + purge_queue(r, &r->res_waitqueue, &purge_dead_test); +} + +void dlm_purge_mstcpy_locks(struct dlm_rsb *r) +{ + purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test); + purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test); + purge_queue(r, 
&r->res_waitqueue, &purge_mstcpy_test); +} + +/* Get rid of locks held by nodes that are gone. */ + +int dlm_purge_locks(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + + log_debug(ls, "dlm_purge_locks"); + + down_write(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + hold_rsb(r); + lock_rsb(r); + if (is_master(r)) + purge_dead_locks(r); + unlock_rsb(r); + unhold_rsb(r); + + schedule(); + } + up_write(&ls->ls_root_sem); + + return 0; +} + +static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) +{ + struct dlm_rsb *r, *r_ret = NULL; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { + if (!rsb_flag(r, RSB_LOCKS_PURGED)) + continue; + hold_rsb(r); + rsb_clear_flag(r, RSB_LOCKS_PURGED); + r_ret = r; + break; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return r_ret; +} + +void dlm_grant_after_purge(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int bucket = 0; + + while (1) { + r = find_purged_rsb(ls, bucket); + if (!r) { + if (bucket == ls->ls_rsbtbl_size - 1) + break; + bucket++; + continue; + } + lock_rsb(r); + if (is_master(r)) { + grant_pending_locks(r); + confirm_master(r, 0); + } + unlock_rsb(r); + put_rsb(r); + schedule(); + } +} + +static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) + return lkb; + } + return NULL; +} + +static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + lkb = search_remid_list(&r->res_grantqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_convertqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_waitqueue, nodeid, remid); + if (lkb) + return lkb; + return NULL; +} + +/* needs at least dlm_rcom + rcom_lock */ +static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_rsb *r, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + + lkb->lkb_nodeid = rc->rc_header.h_nodeid; + lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid); + lkb->lkb_remid = le32_to_cpu(rl->rl_lkid); + lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags); + lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF; + lkb->lkb_flags |= DLM_IFL_MSTCPY; + lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq); + lkb->lkb_rqmode = rl->rl_rqmode; + lkb->lkb_grmode = rl->rl_grmode; + /* don't set lkb_status because add_lkb wants to itself */ + + lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - + sizeof(struct rcom_lock); + if (lvblen > ls->ls_lvblen) + return -EINVAL; + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen); + } + + /* Conversions between PR and CW (middle modes) need special handling. 
+ The real granted mode of these converting locks cannot be determined + until all locks have been rebuilt on the rsb (recover_conversion) */ + + if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) && + middle_conversion(lkb)) { + rl->rl_status = DLM_LKSTS_CONVERT; + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(r, RSB_RECOVER_CONVERT); + } + + return 0; +} + +/* This lkb may have been recovered in a previous aborted recovery so we need + to check if the rsb already has an lkb with the given remote nodeid/lkid. + If so we just send back a standard reply. If not, we create a new lkb with + the given values and send back our lkid. We send back our lkid by sending + back the rcom_lock struct we got but with the remid field filled in. */ + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int error; + + if (rl->rl_parent_lkid) { + error = -EOPNOTSUPP; + goto out; + } + + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), + R_MASTER, &r); + if (error) + goto out; + + lock_rsb(r); + + lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid)); + if (lkb) { + error = -EEXIST; + goto out_remid; + } + + error = create_lkb(ls, &lkb); + if (error) + goto out_unlock; + + error = receive_rcom_lock_args(ls, lkb, r, rc); + if (error) { + __put_lkb(ls, lkb); + goto out_unlock; + } + + attach_lkb(r, lkb); + add_lkb(r, lkb, rl->rl_status); + error = 0; + + out_remid: + /* this is the new value returned to the lock holder for + saving in its process-copy lkb */ + rl->rl_remid = cpu_to_le32(lkb->lkb_id); + + out_unlock: + unlock_rsb(r); + put_rsb(r); + out: + if (error) + log_debug(ls, "recover_master_copy %d %x", error, + le32_to_cpu(rl->rl_lkid)); + rl->rl_result = cpu_to_le32(error); + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb); + if (error) { + log_error(ls, "recover_process_copy no lkid %x", + le32_to_cpu(rl->rl_lkid)); + return error; + } + + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); + + error = le32_to_cpu(rl->rl_result); + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + switch (error) { + case -EBADR: + /* There's a chance the new master received our lock before + dlm_recover_master_reply(), this wouldn't happen if we did + a barrier between recover_masters and recover_locks. 
*/ + log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id, + (unsigned long)r, r->res_name); + dlm_send_rcom_lock(r, lkb); + goto out; + case -EEXIST: + log_debug(ls, "master copy exists %x", lkb->lkb_id); + /* fall through */ + case 0: + lkb->lkb_remid = le32_to_cpu(rl->rl_remid); + break; + default: + log_error(ls, "dlm_recover_process_copy unknown error %d %x", + error, lkb->lkb_id); + } + + /* an ack for dlm_recover_locks() which waits for replies from + all the locks it sends to new masters */ + dlm_recovered_lock(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + + return 0; +} + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + dlm_lock_recovery(ls); + + error = create_lkb(ls, &lkb); + if (error) { + kfree(ua); + goto out; + } + + if (flags & DLM_LKF_VALBLK) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + kfree(ua); + __put_lkb(ls, lkb); + error = -ENOMEM; + goto out; + } + } + + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + + error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, + fake_astfn, ua, fake_bastfn, &args); + lkb->lkb_flags |= DLM_IFL_USER; + + if (error) { + __put_lkb(ls, lkb); + goto out; + } + + error = request_lock(ls, lkb, name, namelen, &args); + + switch (error) { + case 0: + break; + case -EINPROGRESS: + error = 0; + break; + case -EAGAIN: + error = 0; + /* fall through */ + default: + __put_lkb(ls, lkb); + goto out; + } + + /* add this new lkb to the per-process list of locks */ + spin_lock(&ua->proc->locks_spin); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + dlm_unlock_recovery(ls); + return error; +} + +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + /* user can change the params on its lock when it converts it, or + add an lvb that didn't exist before */ + + ua = lkb->lkb_ua; + + if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + error = -ENOMEM; + goto out_put; + } + } + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, + fake_astfn, ua, fake_bastfn, &args); + if (error) + goto out_put; + + error = convert_lock(ls, lkb, &args); + + if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args 
*ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK)) + error = 0; + if (error) + goto out_put; + + spin_lock(&ua->proc->locks_spin); + /* dlm_user_add_ast() may have already taken lkb off the proc list */ + if (!list_empty(&lkb->lkb_ownqueue)) + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); + spin_unlock(&ua->proc->locks_spin); + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = cancel_lock(ls, lkb, &args); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + struct dlm_rsb *r; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, &args); + if (error) + goto out_r; + lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL; + + error = _cancel_lock(r, lkb); + out_r: + unlock_rsb(r); + put_rsb(r); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + return error; +} + +/* lkb's that are removed from the waiters list by revert are just left on the + orphans list with the granted orphan locks, to be freed by purge */ + +static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + hold_lkb(lkb); + mutex_lock(&ls->ls_orphans_mutex); + list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); + mutex_unlock(&ls->ls_orphans_mutex); + + set_unlock_args(0, lkb->lkb_ua, &args); + + error = cancel_lock(ls, lkb, &args); + if (error == -DLM_ECANCEL) + error = 0; + return error; +} + +/* The force flag allows the unlock to go ahead even if the lkb isn't granted. + Regardless of what rsb queue the lock is on, it's removed and freed. 
*/ + +static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); + + error = unlock_lock(ls, lkb, &args); + if (error == -DLM_EUNLOCK) + error = 0; + return error; +} + +/* We have to release clear_proc_locks mutex before calling unlock_proc_lock() + (which does lock_rsb) due to deadlock with receiving a message that does + lock_rsb followed by dlm_user_add_ast() */ + +static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, + struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb = NULL; + + mutex_lock(&ls->ls_clear_proc_locks); + if (list_empty(&proc->locks)) + goto out; + + lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + lkb->lkb_flags |= DLM_IFL_ORPHAN; + else + lkb->lkb_flags |= DLM_IFL_DEAD; + out: + mutex_unlock(&ls->ls_clear_proc_locks); + return lkb; +} + +/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which + 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, + which we clear here. */ + +/* proc CLOSING flag is set so no more device_reads should look at proc->asts + list, and no more device_writes should add lkb's to proc->locks list; so we + shouldn't need to take asts_spin or locks_spin here. this assumes that + device reads/writes/closes are serialized -- FIXME: we may need to serialize + them ourself. */ + +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + dlm_lock_recovery(ls); + + while (1) { + lkb = del_proc_lock(ls, proc); + if (!lkb) + break; + del_timeout(lkb); + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + orphan_proc_lock(ls, lkb); + else + unlock_proc_lock(ls, lkb); + + /* this removes the reference for the proc->locks list + added by dlm_user_request, it may result in the lkb + being freed */ + + dlm_put_lkb(lkb); + } + + mutex_lock(&ls->ls_clear_proc_locks); + + /* in-progress unlocks */ + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_astqueue); + dlm_put_lkb(lkb); + } + + mutex_unlock(&ls->ls_clear_proc_locks); + dlm_unlock_recovery(ls); +} + +static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + while (1) { + lkb = NULL; + spin_lock(&proc->locks_spin); + if (!list_empty(&proc->locks)) { + lkb = list_entry(proc->locks.next, struct dlm_lkb, + lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + } + spin_unlock(&proc->locks_spin); + + if (!lkb) + break; + + lkb->lkb_flags |= DLM_IFL_DEAD; + unlock_proc_lock(ls, lkb); + dlm_put_lkb(lkb); /* ref from proc->locks list */ + } + + spin_lock(&proc->locks_spin); + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + + spin_lock(&proc->asts_spin); + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_astqueue); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->asts_spin); +} + +/* pid of 0 means 
purge all orphans */ + +static void do_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_lkb *lkb, *safe; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) { + if (pid && lkb->lkb_ownpid != pid) + continue; + unlock_proc_lock(ls, lkb); + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + mutex_unlock(&ls->ls_orphans_mutex); +} + +static int send_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error; + + error = _create_message(ls, sizeof(struct dlm_message), nodeid, + DLM_MSG_PURGE, &ms, &mh); + if (error) + return error; + ms->m_nodeid = nodeid; + ms->m_pid = pid; + + return send_message(mh, ms); +} + +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid) +{ + int error = 0; + + if (nodeid != dlm_our_nodeid()) { + error = send_purge(ls, nodeid, pid); + } else { + dlm_lock_recovery(ls); + if (pid == current->pid) + purge_proc_locks(ls, proc); + else + do_purge(ls, nodeid, pid); + dlm_unlock_recovery(ls); + } + return error; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h new file mode 100644 index 00000000..265017a7 --- /dev/null +++ b/fs/dlm/lock.h @@ -0,0 +1,70 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCK_DOT_H__ +#define __LOCK_DOT_H__ + +void dlm_dump_rsb(struct dlm_rsb *r); +void dlm_print_lkb(struct dlm_lkb *lkb); +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); +void dlm_receive_buffer(union dlm_packet *p, int nodeid); +int dlm_modes_compat(int mode1, int mode2); +void dlm_put_rsb(struct dlm_rsb *r); +void dlm_hold_rsb(struct dlm_rsb *r); +int dlm_put_lkb(struct dlm_lkb *lkb); +void dlm_scan_rsbs(struct dlm_ls *ls); +int dlm_lock_recovery_try(struct dlm_ls *ls); +void dlm_unlock_recovery(struct dlm_ls *ls); +void dlm_scan_waiters(struct dlm_ls *ls); +void dlm_scan_timeout(struct dlm_ls *ls); +void dlm_adjust_timeouts(struct dlm_ls *ls); + +int dlm_purge_locks(struct dlm_ls *ls); +void dlm_purge_mstcpy_locks(struct dlm_rsb *r); +void dlm_grant_after_purge(struct dlm_ls *ls); +int dlm_recover_waiters_post(struct dlm_ls *ls); +void dlm_recover_waiters_pre(struct dlm_ls *ls); +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs); +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in); +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid); +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid); 
+int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); + +static inline int is_master(struct dlm_rsb *r) +{ + return !r->res_nodeid; +} + +static inline void lock_rsb(struct dlm_rsb *r) +{ + mutex_lock(&r->res_mutex); +} + +static inline void unlock_rsb(struct dlm_rsb *r) +{ + mutex_unlock(&r->res_mutex); +} + +#endif + diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c new file mode 100644 index 00000000..14cbf409 --- /dev/null +++ b/fs/dlm/lockspace.c @@ -0,0 +1,844 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "ast.h" +#include "dir.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "recover.h" +#include "requestqueue.h" +#include "user.h" + +static int ls_count; +static struct mutex ls_lock; +static struct list_head lslist; +static spinlock_t lslist_lock; +static struct task_struct * scand_task; + + +static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int n = simple_strtol(buf, NULL, 0); + + ls = dlm_find_lockspace_local(ls->ls_local_handle); + if (!ls) + return -EINVAL; + + switch (n) { + case 0: + dlm_ls_stop(ls); + break; + case 1: + dlm_ls_start(ls); + break; + default: + ret = -EINVAL; + } + dlm_put_lockspace(ls); + return ret; +} + +static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ls->ls_uevent_result = simple_strtol(buf, NULL, 0); + set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); + wake_up(&ls->ls_uevent_wait); + return len; +} + +static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id); +} + +static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ls->ls_global_id = simple_strtoul(buf, NULL, 0); + return len; +} + +static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) +{ + uint32_t status = dlm_recover_status(ls); + return snprintf(buf, PAGE_SIZE, "%x\n", status); +} + +static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid); +} + +struct dlm_attr { + struct attribute attr; + ssize_t (*show)(struct dlm_ls *, char *); + ssize_t (*store)(struct dlm_ls *, const char *, size_t); +}; + +static struct dlm_attr dlm_attr_control = { + .attr = {.name = "control", .mode = S_IWUSR}, + .store = dlm_control_store +}; + +static struct dlm_attr dlm_attr_event = { + .attr = {.name = "event_done", .mode = S_IWUSR}, + .store = dlm_event_store +}; + +static struct dlm_attr dlm_attr_id = { + .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_id_show, + .store = dlm_id_store +}; + +static struct dlm_attr dlm_attr_recover_status = { + .attr = {.name = "recover_status", 
.mode = S_IRUGO}, + .show = dlm_recover_status_show +}; + +static struct dlm_attr dlm_attr_recover_nodeid = { + .attr = {.name = "recover_nodeid", .mode = S_IRUGO}, + .show = dlm_recover_nodeid_show +}; + +static struct attribute *dlm_attrs[] = { + &dlm_attr_control.attr, + &dlm_attr_event.attr, + &dlm_attr_id.attr, + &dlm_attr_recover_status.attr, + &dlm_attr_recover_nodeid.attr, + NULL, +}; + +static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->show ? a->show(ls, buf) : 0; +} + +static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->store ? a->store(ls, buf, len) : len; +} + +static void lockspace_kobj_release(struct kobject *k) +{ + struct dlm_ls *ls = container_of(k, struct dlm_ls, ls_kobj); + kfree(ls); +} + +static const struct sysfs_ops dlm_attr_ops = { + .show = dlm_attr_show, + .store = dlm_attr_store, +}; + +static struct kobj_type dlm_ktype = { + .default_attrs = dlm_attrs, + .sysfs_ops = &dlm_attr_ops, + .release = lockspace_kobj_release, +}; + +static struct kset *dlm_kset; + +static int do_uevent(struct dlm_ls *ls, int in) +{ + int error; + + if (in) + kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); + else + kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + + log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); + + /* dlm_controld will see the uevent, do the necessary group management + and then write to sysfs to wake us */ + + error = wait_event_interruptible(ls->ls_uevent_wait, + test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); + + log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); + + if (error) + goto out; + + error = ls->ls_uevent_result; + out: + if (error) + log_error(ls, "group %s failed %d %d", in ? 
"join" : "leave", + error, ls->ls_uevent_result); + return error; +} + +static int dlm_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + + add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name); + return 0; +} + +static struct kset_uevent_ops dlm_uevent_ops = { + .uevent = dlm_uevent, +}; + +int __init dlm_lockspace_init(void) +{ + ls_count = 0; + mutex_init(&ls_lock); + INIT_LIST_HEAD(&lslist); + spin_lock_init(&lslist_lock); + + dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj); + if (!dlm_kset) { + printk(KERN_WARNING "%s: can not create kset\n", __func__); + return -ENOMEM; + } + return 0; +} + +void dlm_lockspace_exit(void) +{ + kset_unregister(dlm_kset); +} + +static struct dlm_ls *find_ls_to_scan(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (time_after_eq(jiffies, ls->ls_scan_time + + dlm_config.ci_scan_secs * HZ)) { + spin_unlock(&lslist_lock); + return ls; + } + } + spin_unlock(&lslist_lock); + return NULL; +} + +static int dlm_scand(void *data) +{ + struct dlm_ls *ls; + + while (!kthread_should_stop()) { + ls = find_ls_to_scan(); + if (ls) { + if (dlm_lock_recovery_try(ls)) { + ls->ls_scan_time = jiffies; + dlm_scan_rsbs(ls); + dlm_scan_timeout(ls); + dlm_scan_waiters(ls); + dlm_unlock_recovery(ls); + } else { + ls->ls_scan_time += HZ; + } + continue; + } + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); + } + return 0; +} + +static int dlm_scand_start(void) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_scand, NULL, "dlm_scand"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + scand_task = p; + return error; +} + +static void dlm_scand_stop(void) +{ + kthread_stop(scand_task); +} + +struct dlm_ls *dlm_find_lockspace_global(uint32_t id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_global_id == id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_local_handle == lockspace) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_device(int minor) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_device.minor == minor) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +void dlm_put_lockspace(struct dlm_ls *ls) +{ + spin_lock(&lslist_lock); + ls->ls_count--; + spin_unlock(&lslist_lock); +} + +static void remove_lockspace(struct dlm_ls *ls) +{ + for (;;) { + spin_lock(&lslist_lock); + if (ls->ls_count == 0) { + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + return; + } + spin_unlock(&lslist_lock); + ssleep(1); + } +} + +static int threads_start(void) +{ + int error; + + /* Thread which process lock requests for all lockspace's */ + error = dlm_astd_start(); + if (error) { + log_print("cannot start dlm_astd thread %d", error); + goto fail; + } + + error = dlm_scand_start(); + if (error) { + log_print("cannot start dlm_scand thread %d", error); + goto astd_fail; + } + + /* Thread for sending/receiving messages for all lockspace's 
*/ + error = dlm_lowcomms_start(); + if (error) { + log_print("cannot start dlm lowcomms %d", error); + goto scand_fail; + } + + return 0; + + scand_fail: + dlm_scand_stop(); + astd_fail: + dlm_astd_stop(); + fail: + return error; +} + +static void threads_stop(void) +{ + dlm_scand_stop(); + dlm_lowcomms_stop(); + dlm_astd_stop(); +} + +static int new_lockspace(const char *name, int namelen, void **lockspace, + uint32_t flags, int lvblen) +{ + struct dlm_ls *ls; + int i, size, error; + int do_unreg = 0; + + if (namelen > DLM_LOCKSPACE_LEN) + return -EINVAL; + + if (!lvblen || (lvblen % 8)) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + if (!dlm_user_daemon_available()) { + module_put(THIS_MODULE); + return -EUNATCH; + } + + error = 0; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + WARN_ON(ls->ls_create_count <= 0); + if (ls->ls_namelen != namelen) + continue; + if (memcmp(ls->ls_name, name, namelen)) + continue; + if (flags & DLM_LSFL_NEWEXCL) { + error = -EEXIST; + break; + } + ls->ls_create_count++; + *lockspace = ls; + error = 1; + break; + } + spin_unlock(&lslist_lock); + + if (error) + goto out; + + error = -ENOMEM; + + ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + if (!ls) + goto out; + memcpy(ls->ls_name, name, namelen); + ls->ls_namelen = namelen; + ls->ls_lvblen = lvblen; + ls->ls_count = 0; + ls->ls_flags = 0; + ls->ls_scan_time = jiffies; + + if (flags & DLM_LSFL_TIMEWARN) + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + + /* ls_exflags are forced to match among nodes, and we don't + need to require all nodes to have some flags set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | + DLM_LSFL_NEWEXCL)); + + size = dlm_config.ci_rsbtbl_size; + ls->ls_rsbtbl_size = size; + + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); + spin_lock_init(&ls->ls_rsbtbl[i].lock); + } + + size = dlm_config.ci_lkbtbl_size; + ls->ls_lkbtbl_size = size; + + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); + if (!ls->ls_lkbtbl) + goto out_rsbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); + rwlock_init(&ls->ls_lkbtbl[i].lock); + ls->ls_lkbtbl[i].counter = 1; + } + + size = dlm_config.ci_dirtbl_size; + ls->ls_dirtbl_size = size; + + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); + if (!ls->ls_dirtbl) + goto out_lkbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); + spin_lock_init(&ls->ls_dirtbl[i].lock); + } + + INIT_LIST_HEAD(&ls->ls_waiters); + mutex_init(&ls->ls_waiters_mutex); + INIT_LIST_HEAD(&ls->ls_orphans); + mutex_init(&ls->ls_orphans_mutex); + INIT_LIST_HEAD(&ls->ls_timeout); + mutex_init(&ls->ls_timeout_mutex); + + INIT_LIST_HEAD(&ls->ls_nodes); + INIT_LIST_HEAD(&ls->ls_nodes_gone); + ls->ls_num_nodes = 0; + ls->ls_low_nodeid = 0; + ls->ls_total_weight = 0; + ls->ls_node_array = NULL; + + memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb)); + ls->ls_stub_rsb.res_ls = ls; + + ls->ls_debug_rsb_dentry = NULL; + ls->ls_debug_waiters_dentry = NULL; + + init_waitqueue_head(&ls->ls_uevent_wait); + ls->ls_uevent_result = 0; + init_completion(&ls->ls_members_done); + ls->ls_members_result = -1; + + ls->ls_recoverd_task = NULL; + mutex_init(&ls->ls_recoverd_active); + spin_lock_init(&ls->ls_recover_lock); + 
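The resource, lock and directory tables initialised above (ls_rsbtbl, ls_lkbtbl, ls_dirtbl) share one layout: an array of buckets sized from dlm_config, each bucket owning its own list head and lock so lookups in different buckets never contend. A minimal user-space sketch of that layout, with pthreads standing in for the kernel locks and a made-up bucket count:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned int key;
	struct node *next;
};

struct bucket {
	struct node *head;		/* chained entries, like ls_rsbtbl[i].list */
	pthread_mutex_t lock;		/* per-bucket lock, like ls_rsbtbl[i].lock */
};

static struct bucket *table;
static unsigned int table_size;		/* stands in for dlm_config.ci_rsbtbl_size */

static int table_init(unsigned int size)
{
	unsigned int i;

	table = calloc(size, sizeof(*table));
	if (!table)
		return -1;
	table_size = size;
	for (i = 0; i < size; i++)
		pthread_mutex_init(&table[i].lock, NULL);
	return 0;
}

static void table_insert(struct node *n)
{
	struct bucket *b = &table[n->key % table_size];

	/* only this bucket is held; other buckets stay usable */
	pthread_mutex_lock(&b->lock);
	n->next = b->head;
	b->head = n;
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct node n = { .key = 42 };

	if (table_init(256))
		return 1;
	table_insert(&n);
	printf("key %u landed in bucket %u\n", n.key, n.key % table_size);
	return 0;
}

Keeping the lock inside the bucket, as the kernel tables do, lets contention scale down with the configured table size instead of serialising on one global lock.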
spin_lock_init(&ls->ls_rcom_spin); + get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); + ls->ls_recover_status = 0; + ls->ls_recover_seq = 0; + ls->ls_recover_args = NULL; + init_rwsem(&ls->ls_in_recovery); + init_rwsem(&ls->ls_recv_active); + INIT_LIST_HEAD(&ls->ls_requestqueue); + mutex_init(&ls->ls_requestqueue_mutex); + mutex_init(&ls->ls_clear_proc_locks); + + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (!ls->ls_recover_buf) + goto out_dirfree; + + INIT_LIST_HEAD(&ls->ls_recover_list); + spin_lock_init(&ls->ls_recover_list_lock); + ls->ls_recover_list_count = 0; + ls->ls_local_handle = ls; + init_waitqueue_head(&ls->ls_wait_general); + INIT_LIST_HEAD(&ls->ls_root_list); + init_rwsem(&ls->ls_root_sem); + + down_write(&ls->ls_in_recovery); + + spin_lock(&lslist_lock); + ls->ls_create_count = 1; + list_add(&ls->ls_list, &lslist); + spin_unlock(&lslist_lock); + + /* needs to find ls in lslist */ + error = dlm_recoverd_start(ls); + if (error) { + log_error(ls, "can't start dlm_recoverd %d", error); + goto out_delist; + } + + ls->ls_kobj.kset = dlm_kset; + error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, + "%s", ls->ls_name); + if (error) + goto out_stop; + kobject_uevent(&ls->ls_kobj, KOBJ_ADD); + + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; + + /* This uevent triggers dlm_controld in userspace to add us to the + group of nodes that are members of this lockspace (managed by the + cluster infrastructure.) Once it's done that, it tells us who the + current lockspace members are (via configfs) and then tells the + lockspace to start running (via sysfs) in dlm_ls_start(). */ + + error = do_uevent(ls, 1); + if (error) + goto out_stop; + + wait_for_completion(&ls->ls_members_done); + error = ls->ls_members_result; + if (error) + goto out_members; + + dlm_create_debug_file(ls); + + log_debug(ls, "join complete"); + *lockspace = ls; + return 0; + + out_members: + do_uevent(ls, 0); + dlm_clear_members(ls); + kfree(ls->ls_node_array); + out_stop: + dlm_recoverd_stop(ls); + out_delist: + spin_lock(&lslist_lock); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + kfree(ls->ls_recover_buf); + out_dirfree: + kfree(ls->ls_dirtbl); + out_lkbfree: + kfree(ls->ls_lkbtbl); + out_rsbfree: + kfree(ls->ls_rsbtbl); + out_lsfree: + if (do_unreg) + kobject_put(&ls->ls_kobj); + else + kfree(ls); + out: + module_put(THIS_MODULE); + return error; +} + +int dlm_new_lockspace(const char *name, int namelen, void **lockspace, + uint32_t flags, int lvblen) +{ + int error = 0; + + mutex_lock(&ls_lock); + if (!ls_count) + error = threads_start(); + if (error) + goto out; + + error = new_lockspace(name, namelen, lockspace, flags, lvblen); + if (!error) + ls_count++; + if (error > 0) + error = 0; + if (!ls_count) + threads_stop(); + out: + mutex_unlock(&ls_lock); + return error; +} + +/* Return 1 if the lockspace still has active remote locks, + * 2 if the lockspace still has active local locks. + */ +static int lockspace_busy(struct dlm_ls *ls) +{ + int i, lkb_found = 0; + struct dlm_lkb *lkb; + + /* NOTE: We check the lockidtbl here rather than the resource table. 
+ This is because there may be LKBs queued as ASTs that have been + unlinked from their RSBs and are pending deletion once the AST has + been delivered */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + read_lock(&ls->ls_lkbtbl[i].lock); + if (!list_empty(&ls->ls_lkbtbl[i].list)) { + lkb_found = 1; + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, + lkb_idtbl_list) { + if (!lkb->lkb_nodeid) { + read_unlock(&ls->ls_lkbtbl[i].lock); + return 2; + } + } + } + read_unlock(&ls->ls_lkbtbl[i].lock); + } + return lkb_found; +} + +static int release_lockspace(struct dlm_ls *ls, int force) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + struct list_head *head; + int i, busy, rv; + + busy = lockspace_busy(ls); + + spin_lock(&lslist_lock); + if (ls->ls_create_count == 1) { + if (busy > force) + rv = -EBUSY; + else { + /* remove_lockspace takes ls off lslist */ + ls->ls_create_count = 0; + rv = 0; + } + } else if (ls->ls_create_count > 1) { + rv = --ls->ls_create_count; + } else { + rv = -EINVAL; + } + spin_unlock(&lslist_lock); + + if (rv) { + log_debug(ls, "release_lockspace no remove %d", rv); + return rv; + } + + dlm_device_deregister(ls); + + if (force < 3 && dlm_user_daemon_available()) + do_uevent(ls, 0); + + dlm_recoverd_stop(ls); + + remove_lockspace(ls); + + dlm_delete_debug_file(ls); + + dlm_astd_suspend(); + + kfree(ls->ls_recover_buf); + + /* + * Free direntry structs. + */ + + dlm_dir_clear(ls); + kfree(ls->ls_dirtbl); + + /* + * Free all lkb's on lkbtbl[] lists. + */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + head = &ls->ls_lkbtbl[i].list; + while (!list_empty(head)) { + lkb = list_entry(head->next, struct dlm_lkb, + lkb_idtbl_list); + + list_del(&lkb->lkb_idtbl_list); + + dlm_del_ast(lkb); + + if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) + dlm_free_lvb(lkb->lkb_lvbptr); + + dlm_free_lkb(lkb); + } + } + dlm_astd_resume(); + + kfree(ls->ls_lkbtbl); + + /* + * Free all rsb's on rsbtbl[] lists + */ + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + head = &ls->ls_rsbtbl[i].list; + while (!list_empty(head)) { + rsb = list_entry(head->next, struct dlm_rsb, + res_hashchain); + + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } + + head = &ls->ls_rsbtbl[i].toss; + while (!list_empty(head)) { + rsb = list_entry(head->next, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } + } + + kfree(ls->ls_rsbtbl); + + /* + * Free structures on any other lists + */ + + dlm_purge_requestqueue(ls); + kfree(ls->ls_recover_args); + dlm_clear_free_entries(ls); + dlm_clear_members(ls); + dlm_clear_members_gone(ls); + kfree(ls->ls_node_array); + log_debug(ls, "release_lockspace final free"); + kobject_put(&ls->ls_kobj); + /* The ls structure will be freed when the kobject is done with */ + + module_put(THIS_MODULE); + return 0; +} + +/* + * Called when a system has released all its locks and is not going to use the + * lockspace any longer. We free everything we're managing for this lockspace. + * Remaining nodes will go through the recovery process as if we'd died. The + * lockspace must continue to function as usual, participating in recoveries, + * until this returns. 
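+ *
+ * Whether removal is allowed is decided against lockspace_busy() above: it
+ * returns 0 when no LKBs remain, 1 when only remote LKBs remain and 2 when
+ * local LKBs remain, and release_lockspace() refuses with -EBUSY whenever
+ * busy > force.  So, for example, dlm_release_lockspace(ls, 2) proceeds even
+ * while local LKBs are still held.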
+ * + * Force has 4 possible values: + * 0 - don't destroy locksapce if it has any LKBs + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs + * 2 - destroy lockspace regardless of LKBs + * 3 - destroy lockspace as part of a forced shutdown + */ + +int dlm_release_lockspace(void *lockspace, int force) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + dlm_put_lockspace(ls); + + mutex_lock(&ls_lock); + error = release_lockspace(ls, force); + if (!error) + ls_count--; + if (!ls_count) + threads_stop(); + mutex_unlock(&ls_lock); + + return error; +} + +void dlm_stop_lockspaces(void) +{ + struct dlm_ls *ls; + + restart: + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + continue; + spin_unlock(&lslist_lock); + log_error(ls, "no userland control daemon, stopping lockspace"); + dlm_ls_stop(ls); + goto restart; + } + spin_unlock(&lslist_lock); +} + diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h new file mode 100644 index 00000000..f879f879 --- /dev/null +++ b/fs/dlm/lockspace.h @@ -0,0 +1,26 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKSPACE_DOT_H__ +#define __LOCKSPACE_DOT_H__ + +int dlm_lockspace_init(void); +void dlm_lockspace_exit(void); +struct dlm_ls *dlm_find_lockspace_global(uint32_t id); +struct dlm_ls *dlm_find_lockspace_local(void *id); +struct dlm_ls *dlm_find_lockspace_device(int minor); +void dlm_put_lockspace(struct dlm_ls *ls); +void dlm_stop_lockspaces(void); + +#endif /* __LOCKSPACE_DOT_H__ */ + diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c new file mode 100644 index 00000000..5e2c71f0 --- /dev/null +++ b/fs/dlm/lowcomms.c @@ -0,0 +1,1572 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is its + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. 
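+ *
+ * The nodeid <-> address mapping itself is supplied from user space:
+ * dlm_controld writes each node's id and address into configfs, and this
+ * layer looks them up through dlm_nodeid_to_addr() and dlm_addr_to_nodeid()
+ * (see nodeid_to_addr() below).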
+ * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * lowcomms will choose to use either TCP or SCTP as its transport layer + * depending on the configuration variable 'protocol'. This should be set + * to 0 (default) for TCP or 1 for SCTP. It should be configured using a + * cluster-wide mechanism as it must be the same on all nodes of the cluster + * for the DLM to function. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +#define NEEDED_RMEM (4*1024*1024) +#define CONN_HASH_SIZE 32 + +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 + +struct cbuf { + unsigned int base; + unsigned int len; + unsigned int mask; +}; + +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} + +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} + +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} + +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} + +static bool cbuf_empty(struct cbuf *cb) +{ + return cb->len == 0; +} + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct mutex sock_mutex; + unsigned long flags; +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_INIT_PENDING 4 +#define CF_IS_OTHERCON 5 +#define CF_CLOSE 6 +#define CF_APP_LIMITED 7 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + void (*connect_action) (struct connection *); /* What to do to connect */ + struct page *rx_page; + struct cbuf cb; + int retries; +#define MAX_CONNECT_RETRIES 3 + int sctp_assoc; + struct hlist_node list; + struct connection *othercon; + struct work_struct rwork; /* Receive workqueue */ + struct work_struct swork; /* Send workqueue */ +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static int dlm_local_count; + +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; + +static struct hlist_head connection_hash[CONN_HASH_SIZE]; +static DEFINE_MUTEX(connections_lock); +static struct kmem_cache *con_cache; + +static void process_recv_sockets(struct work_struct *work); +static void process_send_sockets(struct work_struct *work); + + +/* This is deliberately very simple because most clusters have simple + sequential nodeids, so we should be able to go 
straight to a connection + struct in the array */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} + +static struct connection *__find_con(int nodeid) +{ + int r; + struct hlist_node *h; + struct connection *con; + + r = nodeid_hash(nodeid); + + hlist_for_each_entry(con, h, &connection_hash[r], list) { + if (con->nodeid == nodeid) + return con; + } + return NULL; +} + +/* + * If 'allocation' is zero then we don't attempt to create a new + * connection structure for this node. + */ +static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +{ + struct connection *con = NULL; + int r; + + con = __find_con(nodeid); + if (con || !alloc) + return con; + + con = kmem_cache_zalloc(con_cache, alloc); + if (!con) + return NULL; + + r = nodeid_hash(nodeid); + hlist_add_head(&con->list, &connection_hash[r]); + + con->nodeid = nodeid; + mutex_init(&con->sock_mutex); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); + + /* Setup action pointers for child sockets */ + if (con->nodeid) { + struct connection *zerocon = __find_con(0); + + con->connect_action = zerocon->connect_action; + if (!con->rx_action) + con->rx_action = zerocon->rx_action; + } + + return con; +} + +/* Loop round all connections */ +static void foreach_conn(void (*conn_func)(struct connection *c)) +{ + int i; + struct hlist_node *h, *n; + struct connection *con; + + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){ + conn_func(con); + } + } +} + +static struct connection *nodeid2con(int nodeid, gfp_t allocation) +{ + struct connection *con; + + mutex_lock(&connections_lock); + con = __nodeid2con(nodeid, allocation); + mutex_unlock(&connections_lock); + + return con; +} + +/* This is a bit drastic, but only called when things go wrong */ +static struct connection *assoc2con(int assoc_id) +{ + int i; + struct hlist_node *h; + struct connection *con; + + mutex_lock(&connections_lock); + + for (i = 0 ; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry(con, h, &connection_hash[i], list) { + if (con->sctp_assoc == assoc_id) { + mutex_unlock(&connections_lock); + return con; + } + } + } + mutex_unlock(&connections_lock); + return NULL; +} + +static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) +{ + struct sockaddr_storage addr; + int error; + + if (!dlm_local_count) + return -1; + + error = dlm_nodeid_to_addr(nodeid, &addr); + if (error) + return error; + + if (dlm_local_addr[0]->ss_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; + struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; + ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr); + } + + return 0; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + struct connection *con = sock2con(sk); + if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (!con) + return; + + clear_bit(SOCK_NOSPACE, &con->sock->flags); + + if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { + 
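+		/* send_to_sock() set CF_APP_LIMITED and bumped sk_write_pending
+		   when a send hit EAGAIN while the socket reported no space;
+		   space is back, so undo both before queueing more work. */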
con->sock->sk->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + } + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_bit(CF_CLOSE, &con->flags)) + return; + if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); +} + +static void lowcomms_state_change(struct sock *sk) +{ + if (sk->sk_state == TCP_ESTABLISHED) + lowcomms_write_space(sk); +} + +int dlm_lowcomms_connect_node(int nodeid) +{ + struct connection *con; + + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + + if (nodeid == dlm_our_nodeid()) + return 0; + + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) + return -ENOMEM; + lowcomms_connect_sock(con); + return 0; +} + +/* Make a socket active */ +static int add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + con->sock->sk->sk_user_data = con; + con->sock->sk->sk_allocation = GFP_NOFS; + return 0; +} + +/* Add the port number to an IPv6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + saddr->ss_family = dlm_local_addr[0]->ss_family; + if (saddr->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); + } else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } + memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, bool and_other) +{ + mutex_lock(&con->sock_mutex); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + if (con->othercon && and_other) { + /* Will only re-enter once. 
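+	   The nested call passes and_other = false, so it stops there.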
*/ + close_connection(con->othercon, false); + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + con->retries = 0; + mutex_unlock(&con->sock_mutex); +} + +/* We only send shutdown messages to nodes that are not part of the cluster */ +static void sctp_send_shutdown(sctp_assoc_t associd) +{ + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int ret; + struct connection *con; + + con = nodeid2con(0,0); + BUG_ON(con == NULL); + + outmessage.msg_name = NULL; + outmessage.msg_namelen = 0; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + outmessage.msg_controllen = cmsg->cmsg_len; + sinfo = CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + + sinfo->sinfo_flags |= MSG_EOF; + sinfo->sinfo_assoc_id = associd; + + ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0); + + if (ret != 0) + log_print("send EOF to node failed: %d", ret); +} + +static void sctp_init_failed_foreach(struct connection *con) +{ + con->sctp_assoc = 0; + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } +} + +/* INIT failed but we don't know which node... + restart INIT on all pending nodes */ +static void sctp_init_failed(void) +{ + mutex_lock(&connections_lock); + + foreach_conn(sctp_init_failed_foreach); + + mutex_unlock(&connections_lock); +} + +/* Something happened to an association */ +static void process_sctp_notification(struct connection *con, + struct msghdr *msg, char *buf) +{ + union sctp_notification *sn = (union sctp_notification *)buf; + + if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { + switch (sn->sn_assoc_change.sac_state) { + + case SCTP_COMM_UP: + case SCTP_RESTART: + { + /* Check that the new node is in the lockspace */ + struct sctp_prim prim; + int nodeid; + int prim_len, ret; + int addr_len; + struct connection *new_con; + sctp_peeloff_arg_t parg; + int parglen = sizeof(parg); + int err; + + /* + * We get this before any data for an association. + * We verify that the node is in the cluster and + * then peel off a socket for it. 
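+			 * The association id arrives in the SCTP_ASSOC_CHANGE
+			 * notification; the peer's primary address is then fetched
+			 * with kernel_getsockopt(SCTP_PRIMARY_ADDR) and mapped back
+			 * to a nodeid with dlm_addr_to_nodeid() before peeling.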
+ */ + if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { + log_print("COMM_UP for invalid assoc ID %d", + (int)sn->sn_assoc_change.sac_assoc_id); + sctp_init_failed(); + return; + } + memset(&prim, 0, sizeof(struct sctp_prim)); + prim_len = sizeof(struct sctp_prim); + prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; + + ret = kernel_getsockopt(con->sock, + IPPROTO_SCTP, + SCTP_PRIMARY_ADDR, + (char*)&prim, + &prim_len); + if (ret < 0) { + log_print("getsockopt/sctp_primary_addr on " + "new assoc %d failed : %d", + (int)sn->sn_assoc_change.sac_assoc_id, + ret); + + /* Retry INIT later */ + new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id); + if (new_con) + clear_bit(CF_CONNECT_PENDING, &con->flags); + return; + } + make_sockaddr(&prim.ssp_addr, 0, &addr_len); + if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + int i; + unsigned char *b=(unsigned char *)&prim.ssp_addr; + log_print("reject connect from unknown addr"); + for (i=0; isn_assoc_change.sac_assoc_id; + ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, + SCTP_SOCKOPT_PEELOFF, + (void *)&parg, &parglen); + if (ret < 0) { + log_print("Can't peel off a socket for " + "connection %d to node %d: err=%d", + parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; + } + add_sock(new_con->sock, new_con); + sockfd_put(new_con->sock); + + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); + + /* Send any pending writes */ + clear_bit(CF_CONNECT_PENDING, &new_con->flags); + clear_bit(CF_INIT_PENDING, &con->flags); + if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { + queue_work(send_workqueue, &new_con->swork); + } + if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags)) + queue_work(recv_workqueue, &new_con->rwork); + } + break; + + case SCTP_COMM_LOST: + case SCTP_SHUTDOWN_COMP: + { + con = assoc2con(sn->sn_assoc_change.sac_assoc_id); + if (con) { + con->sctp_assoc = 0; + } + } + break; + + /* We don't know which INIT failed, so clear the PENDING flags + * on them all. if assoc_id is zero then it will then try + * again */ + + case SCTP_CANT_STR_ASSOC: + { + log_print("Can't start SCTP association - retrying"); + sctp_init_failed(); + } + break; + + default: + log_print("unexpected SCTP assoc change id=%d state=%d", + (int)sn->sn_assoc_change.sac_assoc_id, + sn->sn_assoc_change.sac_state); + } + } +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg = {}; + struct kvec iov[2]; + unsigned len; + int r; + int call_again_soon = 0; + int nvec; + char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + + mutex_lock(&con->sock_mutex); + + if (con->sock == NULL) { + ret = -EAGAIN; + goto out_close; + } + + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + cbuf_init(&con->cb, PAGE_CACHE_SIZE); + } + + /* Only SCTP needs these really */ + memset(&incmsg, 0, sizeof(incmsg)); + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. 
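+	 *
+	 * For example, with a 4096-byte page (mask 4095): base 3000, len 500
+	 * puts cbuf_data() at 3500, so iov[0] covers 3500..4095 and iov[1]
+	 * (set up below) covers 0..2999; with base 3000, len 1500, cbuf_data()
+	 * wraps to 404 and the single segment spans offsets 404..2999.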
+ */ + iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); + iov[1].iov_len = 0; + nvec = 1; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (cbuf_data(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + nvec = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + if (ret <= 0) + goto out_close; + + /* Process SCTP notifications */ + if (msg.msg_flags & MSG_NOTIFICATION) { + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + + process_sctp_notification(con, &msg, + page_address(con->rx_page) + con->cb.base); + mutex_unlock(&con->sock_mutex); + return 0; + } + BUG_ON(con->nodeid == 0); + + if (ret == len) + call_again_soon = 1; + cbuf_add(&con->cb, ret); + ret = dlm_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + log_print("lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + cbuf_eat(&con->cb, ret); + + if (cbuf_empty(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + if (call_again_soon) + goto out_resched; + mutex_unlock(&con->sock_mutex); + return 0; + +out_resched: + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); + mutex_unlock(&con->sock_mutex); + return -EAGAIN; + +out_close: + mutex_unlock(&con->sock_mutex); + if (ret != -EAGAIN) { + close_connection(con, false); + /* Reconnect when there is something to send */ + } + /* Don't return success if we really got EOF */ + if (ret == 0) + ret = -EAGAIN; + + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int tcp_accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_storage peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + struct connection *addcon; + + memset(&peeraddr, 0, sizeof(peeraddr)); + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &newsock); + if (result < 0) + return -ENOMEM; + + mutex_lock_nested(&con->sock_mutex, 0); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + memset(&peeraddr, 0, sizeof(peeraddr)); + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + make_sockaddr(&peeraddr, 0, &len); + if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { + log_print("connect from non cluster node"); + sock_release(newsock); + mutex_unlock(&con->sock_mutex); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. 
+ * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_NOFS); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + mutex_lock_nested(&newcon->sock_mutex, 1); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + if (!othercon) { + log_print("failed to allocate incoming socket"); + mutex_unlock(&newcon->sock_mutex); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); + set_bit(CF_IS_OTHERCON, &othercon->flags); + } + if (!othercon->sock) { + newcon->othercon = othercon; + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + addcon = othercon; + } + else { + printk("Extra connection from node %d attempted\n", nodeid); + result = -EAGAIN; + mutex_unlock(&newcon->sock_mutex); + goto accept_err; + } + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + addcon = newcon; + } + + mutex_unlock(&newcon->sock_mutex); + + /* + * Add it to the active queue in case we got data + * between processing the accept adding the socket + * to the read_sockets list + */ + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); + + return 0; + +accept_err: + mutex_unlock(&con->sock_mutex); + sock_release(newsock); + + if (result != -EAGAIN) + log_print("error accepting connection from node: %d", result); + return result; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Initiate an SCTP association. + This is a special case of send_to_sock() in that we don't yet have a + peeled-off socket for this association, so we use the listening socket + and add the primary IP address of the remote node. 
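+   Only the first writequeue entry is sent here, tagged with our own nodeid
+   in sinfo_ppid; once SCTP_COMM_UP is processed above and the socket is
+   peeled off, send_to_sock() drains the rest of the queue as usual.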
+ */ +static void sctp_init_assoc(struct connection *con) +{ + struct sockaddr_storage rem_addr; + char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + struct connection *base_con; + struct writequeue_entry *e; + int len, offset; + int ret; + int addrlen; + struct kvec iov[1]; + + if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) + return; + + if (con->retries++ > MAX_CONNECT_RETRIES) + return; + + if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { + log_print("no address for nodeid %d", con->nodeid); + return; + } + base_con = nodeid2con(0, 0); + BUG_ON(base_con == NULL); + + make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); + + outmessage.msg_name = &rem_addr; + outmessage.msg_namelen = addrlen; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + spin_lock(&con->writequeue_lock); + + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); + len = e->len; + offset = e->offset; + spin_unlock(&con->writequeue_lock); + + /* Send the first block off the write queue */ + iov[0].iov_base = page_address(e->page)+offset; + iov[0].iov_len = len; + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + sinfo = CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); + outmessage.msg_controllen = cmsg->cmsg_len; + + ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); + if (ret < 0) { + log_print("Send first packet to node %d failed: %d", + con->nodeid, ret); + + /* Try again later */ + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_INIT_PENDING, &con->flags); + } + else { + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); + } +} + +/* Connect a new socket to its peer */ +static void tcp_connect_to_sock(struct connection *con) +{ + int result = -EHOSTUNREACH; + struct sockaddr_storage saddr, src_addr; + int addr_len; + struct socket *sock = NULL; + int one = 1; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return; + } + + mutex_lock(&con->sock_mutex); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + /* Some odd races can cause double-connects, ignore them */ + if (con->sock) { + result = 0; + goto out; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + goto out_err; + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + con->connect_action = tcp_connect_to_sock; + add_sock(sock, con); + + /* Bind to our cluster-known address connecting to avoid + routing problems */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, + addr_len); + if (result < 0) { + log_print("could not bind for connect: %d", result); + /* This *may* not 
indicate a critical error */ + } + + make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); + + log_print("connecting to %d", con->nodeid); + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + + result = + sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result == 0) + goto out; + +out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } else if (sock) { + sock_release(sock); + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && result != -ENETUNREACH && + result != -ENETDOWN && result != -EINVAL + && result != -EPROTONOSUPPORT) { + lowcomms_connect_sock(con); + result = 0; + } +out: + mutex_unlock(&con->sock_mutex); + return; +} + +static struct socket *tcp_create_listen_sock(struct connection *con, + struct sockaddr_storage *saddr) +{ + struct socket *sock = NULL; + int result = 0; + int one = 1; + int addr_len; + + if (dlm_local_addr[0]->ss_family == AF_INET) + addr_len = sizeof(struct sockaddr_in); + else + addr_len = sizeof(struct sockaddr_in6); + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) { + log_print("Can't create listening comms socket"); + goto create_out; + } + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + + result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); + + if (result < 0) { + log_print("Failed to set SO_REUSEADDR on socket: %d", result); + } + sock->sk->sk_user_data = con; + con->rx_action = tcp_accept_from_sock; + con->connect_action = tcp_connect_to_sock; + con->sock = sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + log_print("Can't bind to port %d", dlm_config.ci_tcp_port); + sock_release(sock); + sock = NULL; + con->sock = NULL; + goto create_out; + } + result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&one, sizeof(one)); + if (result < 0) { + log_print("Set keepalive failed: %d", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't listen on port %d", dlm_config.ci_tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + +create_out: + return sock; +} + +/* Get local addresses */ +static void init_local(void) +{ + struct sockaddr_storage sas, *addr; + int i; + + dlm_local_count = 0; + for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) { + if (dlm_our_addr(&sas, i)) + break; + + addr = kmalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + break; + memcpy(addr, &sas, sizeof(*addr)); + dlm_local_addr[dlm_local_count++] = addr; + } +} + +/* Bind to an IP address. 
SCTP allows multiple address so it can do + multi-homing */ +static int add_sctp_bind_addr(struct connection *sctp_con, + struct sockaddr_storage *addr, + int addr_len, int num) +{ + int result = 0; + + if (num == 1) + result = kernel_bind(sctp_con->sock, + (struct sockaddr *) addr, + addr_len); + else + result = kernel_setsockopt(sctp_con->sock, SOL_SCTP, + SCTP_SOCKOPT_BINDX_ADD, + (char *)addr, addr_len); + + if (result < 0) + log_print("Can't bind to port %d addr number %d", + dlm_config.ci_tcp_port, num); + + return result; +} + +/* Initialise SCTP socket and bind to all interfaces */ +static int sctp_listen_for_all(void) +{ + struct socket *sock = NULL; + struct sockaddr_storage localaddr; + struct sctp_event_subscribe subscribe; + int result = -EINVAL, num = 1, i, addr_len; + struct connection *con = nodeid2con(0, GFP_NOFS); + int bufsize = NEEDED_RMEM; + + if (!con) + return -ENOMEM; + + log_print("Using SCTP for communications"); + + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, + IPPROTO_SCTP, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + /* Listen for events */ + memset(&subscribe, 0, sizeof(subscribe)); + subscribe.sctp_data_io_event = 1; + subscribe.sctp_association_event = 1; + subscribe.sctp_send_failure_event = 1; + subscribe.sctp_shutdown_event = 1; + subscribe.sctp_partial_delivery_event = 1; + + result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, + (char *)&bufsize, sizeof(bufsize)); + if (result) + log_print("Error increasing buffer space on socket %d", result); + + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS, + (char *)&subscribe, sizeof(subscribe)); + if (result < 0) { + log_print("Failed to set SCTP_EVENTS on socket: result=%d", + result); + goto create_delsock; + } + + /* Init con struct */ + sock->sk->sk_user_data = con; + con->sock = sock; + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->rx_action = receive_from_sock; + con->connect_action = sctp_init_assoc; + + /* Bind to all interfaces. 
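+   The first local address is attached with kernel_bind(); each further
+   address is added to the same socket with SCTP_SOCKOPT_BINDX_ADD (see
+   add_sctp_bind_addr() above), which is what provides the multi-homing.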
*/ + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); + + result = add_sctp_bind_addr(con, &localaddr, addr_len, num); + if (result) + goto create_delsock; + ++num; + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't set socket listening"); + goto create_delsock; + } + + return 0; + +create_delsock: + sock_release(sock); + con->sock = NULL; +out: + return result; +} + +static int tcp_listen_for_all(void) +{ + struct socket *sock = NULL; + struct connection *con = nodeid2con(0, GFP_NOFS); + int result = -EINVAL; + + if (!con) + return -ENOMEM; + + /* We don't support multi-homed hosts */ + if (dlm_local_addr[1] != NULL) { + log_print("TCP protocol can't handle multi-homed hosts, " + "try SCTP"); + return -EINVAL; + } + + log_print("Using TCP for communications"); + + sock = tcp_create_listen_sock(con, dlm_local_addr[0]); + if (sock) { + add_sock(sock, con); + result = 0; + } + else { + result = -EADDRINUSE; + } + + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection *con, + gfp_t allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct connection *con; + struct writequeue_entry *e; + int offset = 0; + int users = 0; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + spin_lock(&con->writequeue_lock); + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if ((&e->list == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + users = e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void dlm_lowcomms_commit_buffer(void *mh) +{ + struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct connection *con = e->con; + int users; + + spin_lock(&con->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + spin_unlock(&con->writequeue_lock); + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { + queue_work(send_workqueue, &con->swork); + } + return; + +out: + spin_unlock(&con->writequeue_lock); + return; +} + +/* Send a message */ +static void send_to_sock(struct connection *con) +{ + int ret = 0; + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + int count = 0; + + mutex_lock(&con->sock_mutex); + if (con->sock == NULL) + goto out_connect; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if 
(len) { + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } + cond_resched(); + goto out; + } + if (ret <= 0) + goto send_error; + } + + /* Don't starve people filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { + cond_resched(); + count = 0; + } + + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + continue; + } + } + spin_unlock(&con->writequeue_lock); +out: + mutex_unlock(&con->sock_mutex); + return; + +send_error: + mutex_unlock(&con->sock_mutex); + close_connection(con, false); + lowcomms_connect_sock(con); + return; + +out_connect: + mutex_unlock(&con->sock_mutex); + if (!test_bit(CF_INIT_PENDING, &con->flags)) + lowcomms_connect_sock(con); + return; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct writequeue_entry *e, *safe; + + spin_lock(&con->writequeue_lock); + list_for_each_entry_safe(e, safe, &con->writequeue, list) { + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int dlm_lowcomms_close(int nodeid) +{ + struct connection *con; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + set_bit(CF_CLOSE, &con->flags); + if (cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", nodeid); + if (cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", nodeid); + clean_one_writequeue(con); + close_connection(con, true); + } + return 0; +} + +/* Receive workqueue function */ +static void process_recv_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, rwork); + int err; + + clear_bit(CF_READ_PENDING, &con->flags); + do { + err = con->rx_action(con); + } while (!err); +} + +/* Send workqueue function */ +static void process_send_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, swork); + + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + con->connect_action(con); + set_bit(CF_WRITE_PENDING, &con->flags); + } + if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + send_to_sock(con); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + foreach_conn(clean_one_writequeue); +} + +static void work_stop(void) +{ + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); +} + +static int work_start(void) +{ + recv_workqueue = alloc_workqueue("dlm_recv", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + if (!recv_workqueue) { + log_print("can't start dlm_recv"); + return -ENOMEM; + } + + send_workqueue = alloc_workqueue("dlm_send", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + if (!send_workqueue) { + log_print("can't start dlm_send"); + destroy_workqueue(recv_workqueue); + return -ENOMEM; + } + + return 0; +} + +static void stop_conn(struct connection *con) +{ + con->flags |= 0x0F; + if (con->sock && con->sock->sk) + con->sock->sk->sk_user_data = NULL; +} + +static void free_conn(struct 
connection *con) +{ + close_connection(con, true); + if (con->othercon) + kmem_cache_free(con_cache, con->othercon); + hlist_del(&con->list); + kmem_cache_free(con_cache, con); +} + +void dlm_lowcomms_stop(void) +{ + /* Set all the flags to prevent any + socket activity. + */ + mutex_lock(&connections_lock); + foreach_conn(stop_conn); + mutex_unlock(&connections_lock); + + work_stop(); + + mutex_lock(&connections_lock); + clean_writequeues(); + + foreach_conn(free_conn); + + mutex_unlock(&connections_lock); + kmem_cache_destroy(con_cache); +} + +int dlm_lowcomms_start(void) +{ + int error = -EINVAL; + struct connection *con; + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + init_local(); + if (!dlm_local_count) { + error = -ENOTCONN; + log_print("no local IP address has been set"); + goto out; + } + + error = -ENOMEM; + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, + NULL); + if (!con_cache) + goto out; + + /* Start listening */ + if (dlm_config.ci_protocol == 0) + error = tcp_listen_for_all(); + else + error = sctp_listen_for_all(); + if (error) + goto fail_unlisten; + + error = work_start(); + if (error) + goto fail_unlisten; + + return 0; + +fail_unlisten: + con = nodeid2con(0,0); + if (con) { + close_connection(con, false); + kmem_cache_free(con_cache, con); + } + kmem_cache_destroy(con_cache); + +out: + return error; +} diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h new file mode 100644 index 00000000..1311e642 --- /dev/null +++ b/fs/dlm/lowcomms.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOWCOMMS_DOT_H__ +#define __LOWCOMMS_DOT_H__ + +int dlm_lowcomms_start(void); +void dlm_lowcomms_stop(void); +int dlm_lowcomms_close(int nodeid); +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); +void dlm_lowcomms_commit_buffer(void *mh); +int dlm_lowcomms_connect_node(int nodeid); + +#endif /* __LOWCOMMS_DOT_H__ */ + diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h new file mode 100644 index 00000000..cc3e92f3 --- /dev/null +++ b/fs/dlm/lvb_table.h @@ -0,0 +1,18 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __LVB_TABLE_DOT_H__ +#define __LVB_TABLE_DOT_H__ + +extern const int dlm_lvb_operations[8][8]; + +#endif diff --git a/fs/dlm/main.c b/fs/dlm/main.c new file mode 100644 index 00000000..5a59efa0 --- /dev/null +++ b/fs/dlm/main.c @@ -0,0 +1,95 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "user.h" +#include "memory.h" +#include "config.h" + +static int __init init_dlm(void) +{ + int error; + + error = dlm_memory_init(); + if (error) + goto out; + + error = dlm_lockspace_init(); + if (error) + goto out_mem; + + error = dlm_config_init(); + if (error) + goto out_lockspace; + + error = dlm_register_debugfs(); + if (error) + goto out_config; + + error = dlm_user_init(); + if (error) + goto out_debug; + + error = dlm_netlink_init(); + if (error) + goto out_user; + + error = dlm_plock_init(); + if (error) + goto out_netlink; + + printk("DLM installed\n"); + + return 0; + + out_netlink: + dlm_netlink_exit(); + out_user: + dlm_user_exit(); + out_debug: + dlm_unregister_debugfs(); + out_config: + dlm_config_exit(); + out_lockspace: + dlm_lockspace_exit(); + out_mem: + dlm_memory_exit(); + out: + return error; +} + +static void __exit exit_dlm(void) +{ + dlm_plock_exit(); + dlm_netlink_exit(); + dlm_user_exit(); + dlm_config_exit(); + dlm_memory_exit(); + dlm_lockspace_exit(); + dlm_unregister_debugfs(); +} + +module_init(init_dlm); +module_exit(exit_dlm); + +MODULE_DESCRIPTION("Distributed Lock Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL_GPL(dlm_new_lockspace); +EXPORT_SYMBOL_GPL(dlm_release_lockspace); +EXPORT_SYMBOL_GPL(dlm_lock); +EXPORT_SYMBOL_GPL(dlm_unlock); + diff --git a/fs/dlm/member.c b/fs/dlm/member.c new file mode 100644 index 00000000..b12532e5 --- /dev/null +++ b/fs/dlm/member.c @@ -0,0 +1,390 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "recover.h" +#include "rcom.h" +#include "config.h" +#include "lowcomms.h" + +static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) +{ + struct dlm_member *memb = NULL; + struct list_head *tmp; + struct list_head *newlist = &new->list; + struct list_head *head = &ls->ls_nodes; + + list_for_each(tmp, head) { + memb = list_entry(tmp, struct dlm_member, list); + if (new->nodeid < memb->nodeid) + break; + } + + if (!memb) + list_add_tail(newlist, head); + else { + /* FIXME: can use list macro here */ + newlist->prev = tmp->prev; + newlist->next = tmp; + tmp->prev->next = newlist; + tmp->prev = newlist; + } +} + +static int dlm_add_member(struct dlm_ls *ls, int nodeid) +{ + struct dlm_member *memb; + int w, error; + + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + + w = dlm_node_weight(ls->ls_name, nodeid); + if (w < 0) { + kfree(memb); + return w; + } + + error = dlm_lowcomms_connect_node(nodeid); + if (error < 0) { + kfree(memb); + return error; + } + + memb->nodeid = nodeid; + memb->weight = w; + add_ordered_member(ls, memb); + ls->ls_num_nodes++; + return 0; +} + +static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) +{ + list_move(&memb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + struct dlm_member *memb; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == nodeid) + return 1; + } + return 0; +} + +int dlm_is_removed(struct dlm_ls *ls, int nodeid) +{ + struct dlm_member *memb; + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + if (memb->nodeid == nodeid) + return 1; + } + return 0; +} + +static void clear_memb_list(struct list_head *head) +{ + struct dlm_member *memb; + + while (!list_empty(head)) { + memb = list_entry(head->next, struct dlm_member, list); + list_del(&memb->list); + kfree(memb); + } +} + +void dlm_clear_members(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes); + ls->ls_num_nodes = 0; +} + +void dlm_clear_members_gone(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes_gone); +} + +static void make_member_array(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int i, w, x = 0, total = 0, all_zero = 0, *array; + + kfree(ls->ls_node_array); + ls->ls_node_array = NULL; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->weight) + total += memb->weight; + } + + /* all nodes revert to weight of 1 if all have weight 0 */ + + if (!total) { + total = ls->ls_num_nodes; + all_zero = 1; + } + + ls->ls_total_weight = total; + + array = kmalloc(sizeof(int) * total, GFP_NOFS); + if (!array) + return; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!all_zero && !memb->weight) + continue; + + if (all_zero) + w = 1; + else + w = memb->weight; + + DLM_ASSERT(x < total, printk("total %d x %d\n", total, x);); + + for (i = 0; i < w; i++) + array[x++] = memb->nodeid; + } + + ls->ls_node_array = array; +} + +/* send a status request to all members just to establish comms connections */ + +static int ping_members(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int error = 0; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + error = dlm_recovery_stopped(ls); + if (error) + break; + error = dlm_rcom_status(ls, 
memb->nodeid); + if (error) + break; + } + if (error) + log_debug(ls, "ping_members aborted %d last nodeid %d", + error, ls->ls_recover_nodeid); + return error; +} + +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) +{ + struct dlm_member *memb, *safe; + int i, error, found, pos = 0, neg = 0, low = -1; + + /* previously removed members that we've not finished removing need to + count as a negative change so the "neg" recovery steps will happen */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + log_debug(ls, "prev removed member %d", memb->nodeid); + neg++; + } + + /* move departed members from ls_nodes to ls_nodes_gone */ + + list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { + found = 0; + for (i = 0; i < rv->node_count; i++) { + if (memb->nodeid == rv->nodeids[i]) { + found = 1; + break; + } + } + + if (!found) { + neg++; + dlm_remove_member(ls, memb); + log_debug(ls, "remove member %d", memb->nodeid); + } + } + + /* Add an entry to ls_nodes_gone for members that were removed and + then added again, so that previous state for these nodes will be + cleared during recovery. */ + + for (i = 0; i < rv->new_count; i++) { + if (!dlm_is_member(ls, rv->new[i])) + continue; + log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); + + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + memb->nodeid = rv->new[i]; + list_add_tail(&memb->list, &ls->ls_nodes_gone); + neg++; + } + + /* add new members to ls_nodes */ + + for (i = 0; i < rv->node_count; i++) { + if (dlm_is_member(ls, rv->nodeids[i])) + continue; + dlm_add_member(ls, rv->nodeids[i]); + pos++; + log_debug(ls, "add member %d", rv->nodeids[i]); + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (low == -1 || memb->nodeid < low) + low = memb->nodeid; + } + ls->ls_low_nodeid = low; + + make_member_array(ls); + dlm_set_recover_status(ls, DLM_RS_NODES); + *neg_out = neg; + + error = ping_members(ls); + if (!error || error == -EPROTO) { + /* new_lockspace() may be waiting to know if the config + is good or bad */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); + } + if (error) + goto out; + + error = dlm_recover_members_wait(ls); + out: + log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error); + return error; +} + +/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before + dlm_ls_start() is called on any of them to start the new recovery. */ + +int dlm_ls_stop(struct dlm_ls *ls) +{ + int new; + + /* + * Prevent dlm_recv from being in the middle of something when we do + * the stop. This includes ensuring dlm_recv isn't processing a + * recovery message (rcom), while dlm_recoverd is aborting and + * resetting things from an in-progress recovery. i.e. we want + * dlm_recoverd to abort its recovery without worrying about dlm_recv + * processing an rcom at the same time. Stopping dlm_recv also makes + * it easy for dlm_receive_message() to check locking stopped and add a + * message to the requestqueue without races. + */ + + down_write(&ls->ls_recv_active); + + /* + * Abort any recovery that's in progress (see RECOVERY_STOP, + * dlm_recovery_stopped()) and tell any other threads running in the + * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). 
+ */ + + spin_lock(&ls->ls_recover_lock); + set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); + ls->ls_recover_seq++; + spin_unlock(&ls->ls_recover_lock); + + /* + * Let dlm_recv run again, now any normal messages will be saved on the + * requestqueue for later. + */ + + up_write(&ls->ls_recv_active); + + /* + * This in_recovery lock does two things: + * 1) Keeps this function from returning until all threads are out + * of locking routines and locking is truly stopped. + * 2) Keeps any new requests from being processed until it's unlocked + * when recovery is complete. + */ + + if (new) + down_write(&ls->ls_in_recovery); + + /* + * The recoverd suspend/resume makes sure that dlm_recoverd (if + * running) has noticed RECOVERY_STOP above and quit processing the + * previous recovery. + */ + + dlm_recoverd_suspend(ls); + ls->ls_recover_status = 0; + dlm_recoverd_resume(ls); + + if (!ls->ls_recover_begin) + ls->ls_recover_begin = jiffies; + return 0; +} + +int dlm_ls_start(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL, *rv_old; + int *ids = NULL, *new = NULL; + int error, ids_count = 0, new_count = 0; + + rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + if (!rv) + return -ENOMEM; + + error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, + &new, &new_count); + if (error < 0) + goto fail; + + spin_lock(&ls->ls_recover_lock); + + /* the lockspace needs to be stopped before it can be started */ + + if (!dlm_locking_stopped(ls)) { + spin_unlock(&ls->ls_recover_lock); + log_error(ls, "start ignored: lockspace running"); + error = -EINVAL; + goto fail; + } + + rv->nodeids = ids; + rv->node_count = ids_count; + rv->new = new; + rv->new_count = new_count; + rv->seq = ++ls->ls_recover_seq; + rv_old = ls->ls_recover_args; + ls->ls_recover_args = rv; + spin_unlock(&ls->ls_recover_lock); + + if (rv_old) { + log_error(ls, "unused recovery %llx %d", + (unsigned long long)rv_old->seq, rv_old->node_count); + kfree(rv_old->nodeids); + kfree(rv_old->new); + kfree(rv_old); + } + + dlm_recoverd_kick(ls); + return 0; + + fail: + kfree(rv); + kfree(ids); + kfree(new); + return error; +} + diff --git a/fs/dlm/member.h b/fs/dlm/member.h new file mode 100644 index 00000000..7a26fca1 --- /dev/null +++ b/fs/dlm/member.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMBER_DOT_H__ +#define __MEMBER_DOT_H__ + +int dlm_ls_stop(struct dlm_ls *ls); +int dlm_ls_start(struct dlm_ls *ls); +void dlm_clear_members(struct dlm_ls *ls); +void dlm_clear_members_gone(struct dlm_ls *ls); +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); +int dlm_is_removed(struct dlm_ls *ls, int nodeid); +int dlm_is_member(struct dlm_ls *ls, int nodeid); + +#endif /* __MEMBER_DOT_H__ */ + diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c new file mode 100644 index 00000000..8e0d00db --- /dev/null +++ b/fs/dlm/memory.c @@ -0,0 +1,92 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "config.h" +#include "memory.h" + +static struct kmem_cache *lkb_cache; + + +int __init dlm_memory_init(void) +{ + int ret = 0; + + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), + __alignof__(struct dlm_lkb), 0, NULL); + if (!lkb_cache) + ret = -ENOMEM; + return ret; +} + +void dlm_memory_exit(void) +{ + if (lkb_cache) + kmem_cache_destroy(lkb_cache); +} + +char *dlm_allocate_lvb(struct dlm_ls *ls) +{ + char *p; + + p = kzalloc(ls->ls_lvblen, GFP_NOFS); + return p; +} + +void dlm_free_lvb(char *p) +{ + kfree(p); +} + +/* FIXME: have some minimal space built-in to rsb for the name and + kmalloc a separate name if needed, like dentries are done */ + +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) +{ + struct dlm_rsb *r; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + + r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); + return r; +} + +void dlm_free_rsb(struct dlm_rsb *r) +{ + if (r->res_lvbptr) + dlm_free_lvb(r->res_lvbptr); + kfree(r); +} + +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + + lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); + return lkb; +} + +void dlm_free_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_flags & DLM_IFL_USER) { + struct dlm_user_args *ua; + ua = lkb->lkb_ua; + if (ua) { + if (ua->lksb.sb_lvbptr) + kfree(ua->lksb.sb_lvbptr); + kfree(ua); + } + } + kmem_cache_free(lkb_cache, lkb); +} + diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h new file mode 100644 index 00000000..485fb291 --- /dev/null +++ b/fs/dlm/memory.h @@ -0,0 +1,27 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMORY_DOT_H__ +#define __MEMORY_DOT_H__ + +int dlm_memory_init(void); +void dlm_memory_exit(void); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); +void dlm_free_rsb(struct dlm_rsb *r); +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); +void dlm_free_lkb(struct dlm_lkb *l); +char *dlm_allocate_lvb(struct dlm_ls *ls); +void dlm_free_lvb(char *l); + +#endif /* __MEMORY_DOT_H__ */ + diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c new file mode 100644 index 00000000..f3396c62 --- /dev/null +++ b/fs/dlm/midcomms.c @@ -0,0 +1,137 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * midcomms.c + * + * This is the appallingly named "mid-level" comms layer. + * + * Its purpose is to take packets from the "real" comms layer, + * split them up into packets and pass them to the interested + * part of the locking mechanism. + * + * It also takes messages from the locking layer, formats them + * into packets and sends them to the comms layer. + */ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" +#include "lock.h" +#include "midcomms.h" + + +static void copy_from_cb(void *dst, const void *base, unsigned offset, + unsigned len, unsigned limit) +{ + unsigned copy = len; + + if ((copy + offset) > limit) + copy = limit - offset; + memcpy(dst, base + offset, copy); + len -= copy; + if (len) + memcpy(dst + copy, base, len); +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + * + * Only complete messages are processed here, any "spare" bytes from + * the end of a buffer are saved and tacked onto the front of the next + * message that comes in. I doubt this will happen very often but we + * need to be able to cope with it and I don't want the task to be waiting + * for packets to come in when there is useful work to be done. + */ + +int dlm_process_incoming_buffer(int nodeid, const void *base, + unsigned offset, unsigned len, unsigned limit) +{ + union { + unsigned char __buf[DLM_INBUF_LEN]; + /* this is to force proper alignment on some arches */ + union dlm_packet p; + } __tmp; + union dlm_packet *p = &__tmp.p; + int ret = 0; + int err = 0; + uint16_t msglen; + uint32_t lockspace; + + while (len > sizeof(struct dlm_header)) { + + /* Copy just the header to check the total length. The + message may wrap around the end of the buffer back to the + start, so we need to use a temp buffer and copy_from_cb. 
*/ + + copy_from_cb(p, base, offset, sizeof(struct dlm_header), + limit); + + msglen = le16_to_cpu(p->header.h_length); + lockspace = p->header.h_lockspace; + + err = -EINVAL; + if (msglen < sizeof(struct dlm_header)) + break; + if (p->header.h_cmd == DLM_MSG) { + if (msglen < sizeof(struct dlm_message)) + break; + } else { + if (msglen < sizeof(struct dlm_rcom)) + break; + } + err = -E2BIG; + if (msglen > dlm_config.ci_buffer_size) { + log_print("message size %d from %d too big, buf len %d", + msglen, nodeid, len); + break; + } + err = 0; + + /* If only part of the full message is contained in this + buffer, then do nothing and wait for lowcomms to call + us again later with more data. We return 0 meaning + we've consumed none of the input buffer. */ + + if (msglen > len) + break; + + /* Allocate a larger temp buffer if the full message won't fit + in the buffer on the stack (which should work for most + ordinary messages). */ + + if (msglen > sizeof(__tmp) && p == &__tmp.p) { + p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (p == NULL) + return ret; + } + + copy_from_cb(p, base, offset, msglen, limit); + + BUG_ON(lockspace != p->header.h_lockspace); + + ret += msglen; + offset += msglen; + offset &= (limit - 1); + len -= msglen; + + dlm_receive_buffer(p, nodeid); + } + + if (p != &__tmp.p) + kfree(p); + + return err ? err : ret; +} + diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h new file mode 100644 index 00000000..95852a5f --- /dev/null +++ b/fs/dlm/midcomms.h @@ -0,0 +1,21 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MIDCOMMS_DOT_H__ +#define __MIDCOMMS_DOT_H__ + +int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, + unsigned len, unsigned limit); + +#endif /* __MIDCOMMS_DOT_H__ */ + diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c new file mode 100644 index 00000000..ef17e016 --- /dev/null +++ b/fs/dlm/netlink.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ */ + +#include <net/genetlink.h> +#include <linux/dlm.h> +#include <linux/dlm_netlink.h> +#include <linux/gfp.h> + +#include "dlm_internal.h" + +static uint32_t dlm_nl_seqnum; +static uint32_t listener_nlpid; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, +}; + +static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) +{ + struct sk_buff *skb; + void *data; + + skb = genlmsg_new(size, GFP_NOFS); + if (!skb) + return -ENOMEM; + + /* add the message headers */ + data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd); + if (!data) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + return 0; +} + +static struct dlm_lock_data *mk_data(struct sk_buff *skb) +{ + struct nlattr *ret; + + ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data)); + if (!ret) + return NULL; + return nla_data(ret); +} + +static int send_data(struct sk_buff *skb) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *data = genlmsg_data(genlhdr); + int rv; + + rv = genlmsg_end(skb, data); + if (rv < 0) { + nlmsg_free(skb); + return rv; + } + + return genlmsg_unicast(&init_net, skb, listener_nlpid); +} + +static int user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + listener_nlpid = info->snd_pid; + printk("user_cmd nlpid %u\n", listener_nlpid); + return 0; +} + +static struct genl_ops dlm_nl_ops = { + .cmd = DLM_CMD_HELLO, + .doit = user_cmd, +}; + +int __init dlm_netlink_init(void) +{ + return genl_register_family_with_ops(&family, &dlm_nl_ops, 1); +} + +void dlm_netlink_exit(void) +{ + genl_unregister_family(&family); +} + +static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) +{ + struct dlm_rsb *r = lkb->lkb_resource; + + memset(data, 0, sizeof(struct dlm_lock_data)); + + data->version = DLM_LOCK_DATA_VERSION; + data->nodeid = lkb->lkb_nodeid; + data->ownpid = lkb->lkb_ownpid; + data->id = lkb->lkb_id; + data->remid = lkb->lkb_remid; + data->status = lkb->lkb_status; + data->grmode = lkb->lkb_grmode; + data->rqmode = lkb->lkb_rqmode; + if (lkb->lkb_ua) + data->xid = lkb->lkb_ua->xid; + if (r) { + data->lockspace_id = r->res_ls->ls_global_id; + data->resource_namelen = r->res_length; + memcpy(data->resource_name, r->res_name, r->res_length); + } +} + +void dlm_timeout_warn(struct dlm_lkb *lkb) +{ + struct sk_buff *uninitialized_var(send_skb); + struct dlm_lock_data *data; + size_t size; + int rv; + + size = nla_total_size(sizeof(struct dlm_lock_data)) + + nla_total_size(0); /* why this? */ + + rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size); + if (rv < 0) + return; + + data = mk_data(send_skb); + if (!data) { + nlmsg_free(send_skb); + return; + } + + fill_data(data, lkb); + + send_data(send_skb); +} + diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c new file mode 100644 index 00000000..e2b87800 --- /dev/null +++ b/fs/dlm/plock.c @@ -0,0 +1,503 @@ +/* + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/poll.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> +#include <linux/slab.h> + +#include "dlm_internal.h" +#include "lockspace.h" + +static spinlock_t ops_lock; +static struct list_head send_list; +static struct list_head recv_list; +static wait_queue_head_t send_wq; +static wait_queue_head_t recv_wq; + +struct plock_op { + struct list_head list; + int done; + struct dlm_plock_info info; +}; + +struct plock_xop { + struct plock_op xop; + void *callback; + void *fl; + void *file; + struct file_lock flc; +}; + + +static inline void set_version(struct dlm_plock_info *info) +{ + info->version[0] = DLM_PLOCK_VERSION_MAJOR; + info->version[1] = DLM_PLOCK_VERSION_MINOR; + info->version[2] = DLM_PLOCK_VERSION_PATCH; +} + +static int check_version(struct dlm_plock_info *info) +{ + if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (DLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_print("plock device version mismatch: " + "kernel (%u.%u.%u), user (%u.%u.%u)", + DLM_PLOCK_VERSION_MAJOR, + DLM_PLOCK_VERSION_MINOR, + DLM_PLOCK_VERSION_PATCH, + info->version[0], + info->version[1], + info->version[2]); + return -EINVAL; + } + return 0; +} + +static void send_op(struct plock_op *op) +{ + set_version(&op->info); + INIT_LIST_HEAD(&op->list); + spin_lock(&ops_lock); + list_add_tail(&op->list, &send_list); + spin_unlock(&ops_lock); + wake_up(&send_wq); +} + +/* If a process was killed while waiting for the only plock on a file, + locks_remove_posix will not see any lock on the file so it won't + send an unlock-close to us to pass on to userspace to clean up the + abandoned waiter. So, we have to insert the unlock-close when the + lock call is interrupted. */ + +static void do_unlock_close(struct dlm_ls *ls, u64 number, + struct file *file, struct file_lock *fl) +{ + struct plock_op *op; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) + return; + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = 0; + op->info.end = OFFSET_MAX; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); +} + +int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + int cmd, struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + struct plock_xop *xop; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + xop = kzalloc(sizeof(*xop), GFP_NOFS); + if (!xop) { + rv = -ENOMEM; + goto out; + } + + op = &xop->xop; + op->info.optype = DLM_PLOCK_OP_LOCK; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.wait = IS_SETLKW(cmd); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* fl_owner is lockd which doesn't distinguish + processes on the nfs client */ + op->info.owner = (__u64) fl->fl_pid; + xop->callback = fl->fl_lmops->fl_grant; + locks_init_lock(&xop->flc); + locks_copy_lock(&xop->flc, fl); + xop->fl = fl; + xop->file = file; + } else { + op->info.owner = (__u64)(long) fl->fl_owner; + xop->callback = NULL; + } + + send_op(op); + + if (xop->callback == NULL) { + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + log_debug(ls, "dlm_posix_lock: wait killed %llx", + (unsigned long long)number); + spin_lock(&ops_lock); + list_del(&op->list); 
+ spin_unlock(&ops_lock); + kfree(xop); + do_unlock_close(ls, number, file, fl); + goto out; + } + } else { + rv = FILE_LOCK_DEFERRED; + goto out; + } + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_lock: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (!rv) { + if (posix_lock_file_wait(file, fl) < 0) + log_error(ls, "dlm_posix_lock: vfs lock error %llx", + (unsigned long long)number); + } + + kfree(xop); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_lock); + +/* Returns failure iff a successful lock operation should be canceled */ +static int dlm_plock_callback(struct plock_op *op) +{ + struct file *file; + struct file_lock *fl; + struct file_lock *flc; + int (*notify)(void *, void *, int) = NULL; + struct plock_xop *xop = (struct plock_xop *)op; + int rv = 0; + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_print("dlm_plock_callback: op on list %llx", + (unsigned long long)op->info.number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + /* check if the following 2 are still valid or make a copy */ + file = xop->file; + flc = &xop->flc; + fl = xop->fl; + notify = xop->callback; + + if (op->info.rv) { + notify(fl, NULL, op->info.rv); + goto out; + } + + /* got fs lock; bookkeep locally as well: */ + flc->fl_flags &= ~FL_SLEEP; + if (posix_lock_file(file, flc, NULL)) { + /* + * This can only happen in the case of kmalloc() failure. + * The filesystem's own lock is the authoritative lock, + * so a failure to get the lock locally is not a disaster. + * As long as the fs cannot reliably cancel locks (especially + * in a low-memory situation), we're better off ignoring + * this failure than trying to recover. 
+ */ + log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p", + (unsigned long long)op->info.number, file, fl); + } + + rv = notify(fl, NULL, 0); + if (rv) { + /* XXX: We need to cancel the fs lock here: */ + log_print("dlm_plock_callback: lock granted after lock request " + "failed; dangling lock!\n"); + goto out; + } + +out: + kfree(xop); + return rv; +} + +int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + if (posix_lock_file_wait(file, fl) < 0) + log_error(ls, "dlm_posix_unlock: vfs unlock error %llx", + (unsigned long long)number); + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + if (fl->fl_flags & FL_CLOSE) { + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); + rv = 0; + goto out; + } + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_unlock: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (rv == -ENOENT) + rv = 0; + + kfree(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_unlock); + +int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + op->info.optype = DLM_PLOCK_OP_GET; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_get: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + /* info.rv from userspace is 1 for conflict, 0 for no-conflict, + -ENOENT if there are no locks on the file */ + + rv = op->info.rv; + + fl->fl_type = F_UNLCK; + if (rv == -ENOENT) + rv = 0; + else if (rv > 0) { + locks_init_lock(fl); + fl->fl_type = (op->info.ex) ? 
F_WRLCK : F_RDLCK; + fl->fl_flags = FL_POSIX; + fl->fl_pid = op->info.pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + rv = 0; + } + + kfree(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_get); + +/* a read copies out one plock request from the send list */ +static ssize_t dev_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op = NULL; + + if (count < sizeof(info)) + return -EINVAL; + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + op = list_entry(send_list.next, struct plock_op, list); + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + list_del(&op->list); + else + list_move(&op->list, &recv_list); + memcpy(&info, &op->info, sizeof(info)); + } + spin_unlock(&ops_lock); + + if (!op) + return -EAGAIN; + + /* there is no need to get a reply from userspace for unlocks + that were generated by the vfs cleaning up for a close + (the process did not make an unlock call). */ + + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + kfree(op); + + if (copy_to_user(u, &info, sizeof(info))) + return -EFAULT; + return sizeof(info); +} + +/* a write copies in one plock result that should match a plock_op + on the recv list */ +static ssize_t dev_write(struct file *file, const char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op; + int found = 0, do_callback = 0; + + if (count != sizeof(info)) + return -EINVAL; + + if (copy_from_user(&info, u, sizeof(info))) + return -EFAULT; + + if (check_version(&info)) + return -EINVAL; + + spin_lock(&ops_lock); + list_for_each_entry(op, &recv_list, list) { + if (op->info.fsid == info.fsid && + op->info.number == info.number && + op->info.owner == info.owner) { + struct plock_xop *xop = (struct plock_xop *)op; + list_del_init(&op->list); + memcpy(&op->info, &info, sizeof(info)); + if (xop->callback) + do_callback = 1; + else + op->done = 1; + found = 1; + break; + } + } + spin_unlock(&ops_lock); + + if (found) { + if (do_callback) + dlm_plock_callback(op); + else + wake_up(&recv_wq); + } else + log_print("dev_write no op %x %llx", info.fsid, + (unsigned long long)info.number); + return count; +} + +static unsigned int dev_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = 0; + + poll_wait(file, &send_wq, wait); + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&ops_lock); + + return mask; +} + +static const struct file_operations dev_fops = { + .read = dev_read, + .write = dev_write, + .poll = dev_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice plock_dev_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DLM_PLOCK_MISC_NAME, + .fops = &dev_fops +}; + +int dlm_plock_init(void) +{ + int rv; + + spin_lock_init(&ops_lock); + INIT_LIST_HEAD(&send_list); + INIT_LIST_HEAD(&recv_list); + init_waitqueue_head(&send_wq); + init_waitqueue_head(&recv_wq); + + rv = misc_register(&plock_dev_misc); + if (rv) + log_print("dlm_plock_init: misc_register failed %d", rv); + return rv; +} + +void dlm_plock_exit(void) +{ + if (misc_deregister(&plock_dev_misc) < 0) + log_print("dlm_plock_exit: misc_deregister failed"); +} + diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c new file mode 100644 index 00000000..f10a50f2 --- /dev/null +++ b/fs/dlm/rcom.c @@ -0,0 +1,511 @@ +/****************************************************************************** +******************************************************************************* 
+** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "rcom.h" +#include "recover.h" +#include "dir.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "util.h" + + +static int rcom_response(struct dlm_ls *ls) +{ + return test_bit(LSFL_RCOM_READY, &ls->ls_flags); +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + len; + + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); + return -ENOBUFS; + } + memset(mb, 0, mb_len); + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_lockspace = ls->ls_global_id; + rc->rc_header.h_nodeid = dlm_our_nodeid(); + rc->rc_header.h_length = mb_len; + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = type; + + spin_lock(&ls->ls_recover_lock); + rc->rc_seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + + *mh_ret = mh; + *rc_ret = rc; + return 0; +} + +static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, + struct dlm_rcom *rc) +{ + dlm_rcom_out(rc); + dlm_lowcomms_commit_buffer(mh); +} + +/* When replying to a status request, a node also sends back its + configuration values. The requesting node then checks that the remote + node is configured the same way as itself. 
*/ + +static void make_config(struct dlm_ls *ls, struct rcom_config *rf) +{ + rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); + rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); +} + +static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; + size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); + + if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { + log_error(ls, "version mismatch: %x nodeid %d: %x", + DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, + rc->rc_header.h_version); + return -EPROTO; + } + + if (rc->rc_header.h_length < conf_size) { + log_error(ls, "config too short: %d nodeid %d", + rc->rc_header.h_length, nodeid); + return -EPROTO; + } + + if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || + le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { + log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", + ls->ls_lvblen, ls->ls_exflags, nodeid, + le32_to_cpu(rf->rf_lvblen), + le32_to_cpu(rf->rf_lsflags)); + return -EPROTO; + } + return 0; +} + +static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq) +{ + spin_lock(&ls->ls_rcom_spin); + *new_seq = ++ls->ls_rcom_seq; + set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +static void disallow_sync_reply(struct dlm_ls *ls) +{ + spin_lock(&ls->ls_rcom_spin); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error = 0; + + ls->ls_recover_nodeid = nodeid; + + if (nodeid == dlm_our_nodeid()) { + rc = ls->ls_recover_buf; + rc->rc_result = dlm_recover_status(ls); + goto out; + } + + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); + if (error) + goto out; + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + + send_rcom(ls, mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + if (error) + goto out; + + rc = ls->ls_recover_buf; + + if (rc->rc_result == -ESRCH) { + /* we pretend the remote lockspace exists with 0 status */ + log_debug(ls, "remote node %d not ready", nodeid); + rc->rc_result = 0; + } else + error = check_config(ls, rc, nodeid); + /* the caller looks at rc_result for the remote recovery status */ + out: + return error; +} + +static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, nodeid = rc_in->rc_header.h_nodeid; + + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, + sizeof(struct rcom_config), &rc, &mh); + if (error) + return; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = dlm_recover_status(ls); + make_config(ls, (struct rcom_config *) rc->rc_buf); + + send_rcom(ls, mh, rc); +} + +static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + spin_lock(&ls->ls_rcom_spin); + if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || + rc_in->rc_id != ls->ls_rcom_seq) { + log_debug(ls, "reject reply %d from %d seq %llx expect %llx", + rc_in->rc_type, rc_in->rc_header.h_nodeid, + (unsigned long long)rc_in->rc_id, + (unsigned long long)ls->ls_rcom_seq); + goto out; + } + memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); + set_bit(LSFL_RCOM_READY, &ls->ls_flags); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + wake_up(&ls->ls_wait_general); + out: + 
spin_unlock(&ls->ls_rcom_spin); +} + +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error = 0; + int max_size = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); + + ls->ls_recover_nodeid = nodeid; + + if (nodeid == dlm_our_nodeid()) { + ls->ls_recover_buf->rc_header.h_length = + dlm_config.ci_buffer_size; + dlm_copy_master_names(ls, last_name, last_len, + ls->ls_recover_buf->rc_buf, + max_size, nodeid); + goto out; + } + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, last_name, last_len); + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + + send_rcom(ls, mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + out: + return error; +} + +static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, inlen, outlen, nodeid; + + nodeid = rc_in->rc_header.h_nodeid; + inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); + if (error) + return; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, + nodeid); + send_rcom(ls, mh, rc); +} + +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct dlm_ls *ls = r->res_ls; + int error; + + error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, + &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, r->res_name, r->res_length); + rc->rc_id = (unsigned long) r; + + send_rcom(ls, mh, rc); + out: + return error; +} + +static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid; + int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + if (error) + return; + + error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid); + if (error) + ret_nodeid = error; + rc->rc_result = ret_nodeid; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(ls, mh, rc); +} + +static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + dlm_recover_master_reply(ls, rc_in); +} + +static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct rcom_lock *rl) +{ + memset(rl, 0, sizeof(*rl)); + + rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid); + rl->rl_lkid = cpu_to_le32(lkb->lkb_id); + rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags); + rl->rl_flags = cpu_to_le32(lkb->lkb_flags); + rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); + rl->rl_rqmode = lkb->lkb_rqmode; + rl->rl_grmode = lkb->lkb_grmode; + rl->rl_status = lkb->lkb_status; + rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); + + if (lkb->lkb_bastfn) + rl->rl_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + rl->rl_asts |= DLM_CB_CAST; + + rl->rl_namelen = cpu_to_le16(r->res_length); + memcpy(rl->rl_name, r->res_name, r->res_length); + + /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ? + If so, receive_rcom_lock_args() won't take this copy. 
*/ + + if (lkb->lkb_lvbptr) + memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); +} + +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct rcom_lock *rl; + int error, len = sizeof(struct rcom_lock); + + if (lkb->lkb_lvbptr) + len += ls->ls_lvblen; + + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh); + if (error) + goto out; + + rl = (struct rcom_lock *) rc->rc_buf; + pack_rcom_lock(r, lkb, rl); + rc->rc_id = (unsigned long) r; + + send_rcom(ls, mh, rc); + out: + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, nodeid = rc_in->rc_header.h_nodeid; + + dlm_recover_master_copy(ls, rc_in); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, + sizeof(struct rcom_lock), &rc, &mh); + if (error) + return; + + /* We send back the same rcom_lock struct we received, but + dlm_recover_master_copy() has filled in rl_remid and rl_result */ + + memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(ls, mh, rc); +} + +/* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct rcom_config *rf; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); + + mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + memset(mb, 0, mb_len); + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace; + rc->rc_header.h_nodeid = dlm_our_nodeid(); + rc->rc_header.h_length = mb_len; + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = DLM_RCOM_STATUS_REPLY; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = -ESRCH; + + rf = (struct rcom_config *) rc->rc_buf; + rf->rf_lvblen = cpu_to_le32(~0U); + + dlm_rcom_out(rc); + dlm_lowcomms_commit_buffer(mh); + + return 0; +} + +static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + uint64_t seq; + int rv = 0; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS_REPLY: + case DLM_RCOM_NAMES_REPLY: + case DLM_RCOM_LOOKUP_REPLY: + case DLM_RCOM_LOCK_REPLY: + spin_lock(&ls->ls_recover_lock); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + if (rc->rc_seq_reply != seq) { + log_debug(ls, "ignoring old reply %x from %d " + "seq_reply %llx expect %llx", + rc->rc_type, rc->rc_header.h_nodeid, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq); + rv = 1; + } + } + return rv; +} + +/* Called by dlm_recv; corresponds to dlm_receive_message() but special + recovery-only comms are sent through here. 
*/ + +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + + if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { + log_debug(ls, "ignoring recovery message %x from %d", + rc->rc_type, nodeid); + goto out; + } + + if (is_old_reply(ls, rc)) + goto out; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS: + receive_rcom_status(ls, rc); + break; + + case DLM_RCOM_NAMES: + receive_rcom_names(ls, rc); + break; + + case DLM_RCOM_LOOKUP: + receive_rcom_lookup(ls, rc); + break; + + case DLM_RCOM_LOCK: + if (rc->rc_header.h_length < lock_size) + goto Eshort; + receive_rcom_lock(ls, rc); + break; + + case DLM_RCOM_STATUS_REPLY: + receive_sync_reply(ls, rc); + break; + + case DLM_RCOM_NAMES_REPLY: + receive_sync_reply(ls, rc); + break; + + case DLM_RCOM_LOOKUP_REPLY: + receive_rcom_lookup_reply(ls, rc); + break; + + case DLM_RCOM_LOCK_REPLY: + if (rc->rc_header.h_length < lock_size) + goto Eshort; + dlm_recover_process_copy(ls, rc); + break; + + default: + log_error(ls, "receive_rcom bad type %d", rc->rc_type); + } +out: + return; +Eshort: + log_error(ls, "recovery message %x from %d is too short", + rc->rc_type, nodeid); +} + diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h new file mode 100644 index 00000000..b09abd29 --- /dev/null +++ b/fs/dlm/rcom.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RCOM_DOT_H__ +#define __RCOM_DOT_H__ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); + +#endif + diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c new file mode 100644 index 00000000..14638235 --- /dev/null +++ b/fs/dlm/recover.c @@ -0,0 +1,789 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "dir.h" +#include "config.h" +#include "ast.h" +#include "memory.h" +#include "rcom.h" +#include "lock.h" +#include "lowcomms.h" +#include "member.h" +#include "recover.h" + + +/* + * Recovery waiting routines: these functions wait for a particular reply from + * a remote node, or for the remote node to report a certain status. They need + * to abort if the lockspace is stopped indicating a node has failed (perhaps + * the one being waited for). + */ + +/* + * Wait until given function returns non-zero or lockspace is stopped + * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another + * function thinks it could have completed the waited-on task, they should wake + * up ls_wait_general to get an immediate response rather than waiting for the + * timer to detect the result. A timer wakes us up periodically while waiting + * to see if we should abort due to a node failure. This should only be called + * by the dlm_recoverd thread. + */ + +static void dlm_wait_timer_fn(unsigned long data) +{ + struct dlm_ls *ls = (struct dlm_ls *) data; + mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ)); + wake_up(&ls->ls_wait_general); +} + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) +{ + int error = 0; + + init_timer(&ls->ls_timer); + ls->ls_timer.function = dlm_wait_timer_fn; + ls->ls_timer.data = (long) ls; + ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ); + add_timer(&ls->ls_timer); + + wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls)); + del_timer_sync(&ls->ls_timer); + + if (dlm_recovery_stopped(ls)) { + log_debug(ls, "dlm_wait_function aborted"); + error = -EINTR; + } + return error; +} + +/* + * An efficient way for all nodes to wait for all others to have a certain + * status. The node with the lowest nodeid polls all the others for their + * status (wait_status_all) and all the others poll the node with the low id + * for its accumulated result (wait_status_low). When all nodes have set + * status flag X, then status flag X_ALL will be set on the low nodeid. 
+ */ + +uint32_t dlm_recover_status(struct dlm_ls *ls) +{ + uint32_t status; + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + spin_unlock(&ls->ls_recover_lock); + return status; +} + +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + spin_lock(&ls->ls_recover_lock); + ls->ls_recover_status |= status; + spin_unlock(&ls->ls_recover_lock); +} + +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + struct dlm_member *memb; + int error = 0, delay; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + delay = 0; + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, memb->nodeid); + if (error) + goto out; + + if (rc->rc_result & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + } + out: + return error; +} + +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; + + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, nodeid); + if (error) + break; + + if (rc->rc_result & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + out: + return error; +} + +static int wait_status(struct dlm_ls *ls, uint32_t status) +{ + uint32_t status_all = status << 1; + int error; + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, status); + if (!error) + dlm_set_recover_status(ls, status_all); + } else + error = wait_status_low(ls, status_all); + + return error; +} + +int dlm_recover_members_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_NODES); +} + +int dlm_recover_directory_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_DIR); +} + +int dlm_recover_locks_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_LOCKS); +} + +int dlm_recover_done_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_DONE); +} + +/* + * The recover_list contains all the rsb's for which we've requested the new + * master nodeid. As replies are returned from the resource directories the + * rsb's are removed from the list. When the list is empty we're done. + * + * The recover_list is later similarly used for all rsb's for which we've sent + * new lkb's and need to receive new corresponding lkid's. + * + * We use the address of the rsb struct as a simple local identifier for the + * rsb so we can match an rcom reply with the rsb it was sent for. 
+ */ + +static int recover_list_empty(struct dlm_ls *ls) +{ + int empty; + + spin_lock(&ls->ls_recover_list_lock); + empty = list_empty(&ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); + + return empty; +} + +static void recover_list_add(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + if (list_empty(&r->res_recover_list)) { + list_add_tail(&r->res_recover_list, &ls->ls_recover_list); + ls->ls_recover_list_count++; + dlm_hold_rsb(r); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +static void recover_list_del(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + list_del_init(&r->res_recover_list); + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_list_lock); + + dlm_put_rsb(r); +} + +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id) +{ + struct dlm_rsb *r = NULL; + + spin_lock(&ls->ls_recover_list_lock); + + list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) { + if (id == (unsigned long) r) + goto out; + } + r = NULL; + out: + spin_unlock(&ls->ls_recover_list_lock); + return r; +} + +static void recover_list_clear(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *s; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) { + list_del_init(&r->res_recover_list); + r->res_recover_locks_count = 0; + dlm_put_rsb(r); + ls->ls_recover_list_count--; + } + + if (ls->ls_recover_list_count != 0) { + log_error(ls, "warning: recover_list_count %d", + ls->ls_recover_list_count); + ls->ls_recover_list_count = 0; + } + spin_unlock(&ls->ls_recover_list_lock); +} + + +/* Master recovery: find new master node for rsb's that were + mastered on nodes that have been removed. + + dlm_recover_masters + recover_master + dlm_send_rcom_lookup -> receive_rcom_lookup + dlm_dir_lookup + receive_rcom_lookup_reply <- + dlm_recover_master_reply + set_new_master + set_master_lkbs + set_lock_master +*/ + +/* + * Set the lock master for all LKBs in a lock queue + * If we are the new master of the rsb, we may have received new + * MSTCPY locks from other nodes already which we need to ignore + * when setting the new nodeid. + */ + +static void set_lock_master(struct list_head *queue, int nodeid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, queue, lkb_statequeue) + if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) + lkb->lkb_nodeid = nodeid; +} + +static void set_master_lkbs(struct dlm_rsb *r) +{ + set_lock_master(&r->res_grantqueue, r->res_nodeid); + set_lock_master(&r->res_convertqueue, r->res_nodeid); + set_lock_master(&r->res_waitqueue, r->res_nodeid); +} + +/* + * Propagate the new master nodeid to locks + * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. + * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which + * rsb's to consider. + */ + +static void set_new_master(struct dlm_rsb *r, int nodeid) +{ + lock_rsb(r); + r->res_nodeid = nodeid; + set_master_lkbs(r); + rsb_set_flag(r, RSB_NEW_MASTER); + rsb_set_flag(r, RSB_NEW_MASTER2); + unlock_rsb(r); +} + +/* + * We do async lookups on rsb's that need new masters. The rsb's + * waiting for a lookup reply are kept on the recover_list. 
+ */ + +static int recover_master(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + + dir_nodeid = dlm_dir_nodeid(r); + + if (dir_nodeid == our_nodeid) { + error = dlm_dir_lookup(ls, our_nodeid, r->res_name, + r->res_length, &ret_nodeid); + if (error) + log_error(ls, "recover dir lookup error %d", error); + + if (ret_nodeid == our_nodeid) + ret_nodeid = 0; + set_new_master(r, ret_nodeid); + } else { + recover_list_add(r); + error = dlm_send_rcom_lookup(r, dir_nodeid); + } + + return error; +} + +/* + * When not using a directory, most resource names will hash to a new static + * master nodeid and the resource will need to be remastered. + */ + +static int recover_master_static(struct dlm_rsb *r) +{ + int master = dlm_dir_nodeid(r); + + if (master == dlm_our_nodeid()) + master = 0; + + if (r->res_nodeid != master) { + if (is_master(r)) + dlm_purge_mstcpy_locks(r); + set_new_master(r, master); + return 1; + } + return 0; +} + +/* + * Go through local root resources and for each rsb which has a master which + * has departed, get the new master nodeid from the directory. The dir will + * assign mastery to the first node to look up the new master. That means + * we'll discover in this lookup if we're the new master of any rsb's. + * + * We fire off all the dir lookup requests individually and asynchronously to + * the correct dir node. + */ + +int dlm_recover_masters(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int error = 0, count = 0; + + log_debug(ls, "dlm_recover_masters"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (dlm_recovery_stopped(ls)) { + up_read(&ls->ls_root_sem); + error = -EINTR; + goto out; + } + + if (dlm_no_directory(ls)) + count += recover_master_static(r); + else if (!is_master(r) && + (dlm_is_removed(ls, r->res_nodeid) || + rsb_flag(r, RSB_NEW_MASTER))) { + recover_master(r); + count++; + } + + schedule(); + } + up_read(&ls->ls_root_sem); + + log_debug(ls, "dlm_recover_masters %d resources", count); + + error = dlm_wait_function(ls, &recover_list_empty); + out: + if (error) + recover_list_clear(ls); + return error; +} + +int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_rsb *r; + int nodeid; + + r = recover_list_find(ls, rc->rc_id); + if (!r) { + log_error(ls, "dlm_recover_master_reply no id %llx", + (unsigned long long)rc->rc_id); + goto out; + } + + nodeid = rc->rc_result; + if (nodeid == dlm_our_nodeid()) + nodeid = 0; + + set_new_master(r, nodeid); + recover_list_del(r); + + if (recover_list_empty(ls)) + wake_up(&ls->ls_wait_general); + out: + return 0; +} + + +/* Lock recovery: rebuild the process-copy locks we hold on a + remastered rsb on the new rsb master. 
+ + dlm_recover_locks + recover_locks + recover_locks_queue + dlm_send_rcom_lock -> receive_rcom_lock + dlm_recover_master_copy + receive_rcom_lock_reply <- + dlm_recover_process_copy +*/ + + +/* + * keep a count of the number of lkb's we send to the new master; when we get + * an equal number of replies then recovery for the rsb is done + */ + +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) +{ + struct dlm_lkb *lkb; + int error = 0; + + list_for_each_entry(lkb, head, lkb_statequeue) { + error = dlm_send_rcom_lock(r, lkb); + if (error) + break; + r->res_recover_locks_count++; + } + + return error; +} + +static int recover_locks(struct dlm_rsb *r) +{ + int error = 0; + + lock_rsb(r); + + DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); + + error = recover_locks_queue(r, &r->res_grantqueue); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_convertqueue); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_waitqueue); + if (error) + goto out; + + if (r->res_recover_locks_count) + recover_list_add(r); + else + rsb_clear_flag(r, RSB_NEW_MASTER); + out: + unlock_rsb(r); + return error; +} + +int dlm_recover_locks(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int error, count = 0; + + log_debug(ls, "dlm_recover_locks"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (is_master(r)) { + rsb_clear_flag(r, RSB_NEW_MASTER); + continue; + } + + if (!rsb_flag(r, RSB_NEW_MASTER)) + continue; + + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + up_read(&ls->ls_root_sem); + goto out; + } + + error = recover_locks(r); + if (error) { + up_read(&ls->ls_root_sem); + goto out; + } + + count += r->res_recover_locks_count; + } + up_read(&ls->ls_root_sem); + + log_debug(ls, "dlm_recover_locks %d locks", count); + + error = dlm_wait_function(ls, &recover_list_empty); + out: + if (error) + recover_list_clear(ls); + else + dlm_set_recover_status(ls, DLM_RS_LOCKS); + return error; +} + +void dlm_recovered_lock(struct dlm_rsb *r) +{ + DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r);); + + r->res_recover_locks_count--; + if (!r->res_recover_locks_count) { + rsb_clear_flag(r, RSB_NEW_MASTER); + recover_list_del(r); + } + + if (recover_list_empty(r->res_ls)) + wake_up(&r->res_ls->ls_wait_general); +} + +/* + * The lvb needs to be recovered on all master rsb's. This includes setting + * the VALNOTVALID flag if necessary, and determining the correct lvb contents + * based on the lvb's of the locks held on the rsb. + * + * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it + * was already set prior to recovery, it's not cleared, regardless of locks. + * + * The LVB contents are only considered for changing when this is a new master + * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with + * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken + * from the lkb with the largest lvb sequence number. 
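Editor's note: recover_lvb() below picks the lvb source by lock mode first and, failing that, by the highest lvb sequence number, using a subtraction-based comparison so that a wrapped 32-bit counter still orders correctly. A standalone sketch of that comparison (written in the cleaner unsigned-subtraction form; illustrative only):

#include <assert.h>
#include <stdint.h>

/* "a is at least as new as b" for a free-running 32-bit sequence counter.
 * The signed difference handles wraparound as long as the two values are
 * within 2^31 of each other. */
static int seq_newer_or_equal(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	assert(seq_newer_or_equal(5, 5));
	assert(seq_newer_or_equal(6, 5));
	assert(!seq_newer_or_equal(5, 6));
	/* wraparound: 0x00000002 was generated after 0xfffffffe */
	assert(seq_newer_or_equal(0x00000002u, 0xfffffffeu));
	return 0;
}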
+ */ + +static void recover_lvb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *high_lkb = NULL; + uint32_t high_seq = 0; + int lock_lvb_exists = 0; + int big_lock_exists = 0; + int lvblen = r->res_ls->ls_lvblen; + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (lkb->lkb_grmode > DLM_LOCK_CR) { + big_lock_exists = 1; + goto setflag; + } + + if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = lkb; + high_seq = lkb->lkb_lvbseq; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (lkb->lkb_grmode > DLM_LOCK_CR) { + big_lock_exists = 1; + goto setflag; + } + + if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = lkb; + high_seq = lkb->lkb_lvbseq; + } + } + + setflag: + if (!lock_lvb_exists) + goto out; + + if (!big_lock_exists) + rsb_set_flag(r, RSB_VALNOTVALID); + + /* don't mess with the lvb unless we're the new master */ + if (!rsb_flag(r, RSB_NEW_MASTER2)) + goto out; + + if (!r->res_lvbptr) { + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + if (!r->res_lvbptr) + goto out; + } + + if (big_lock_exists) { + r->res_lvbseq = lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen); + } else if (high_lkb) { + r->res_lvbseq = high_lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen); + } else { + r->res_lvbseq = 0; + memset(r->res_lvbptr, 0, lvblen); + } + out: + return; +} + +/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks + converting PR->CW or CW->PR need to have their lkb_grmode set. */ + +static void recover_conversion(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb; + int grmode = -1; + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_grmode == DLM_LOCK_PR || + lkb->lkb_grmode == DLM_LOCK_CW) { + grmode = lkb->lkb_grmode; + break; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (lkb->lkb_grmode != DLM_LOCK_IV) + continue; + if (grmode == -1) + lkb->lkb_grmode = lkb->lkb_rqmode; + else + lkb->lkb_grmode = grmode; + } +} + +/* We've become the new master for this rsb and waiting/converting locks may + need to be granted in dlm_grant_after_purge() due to locks that may have + existed from a removed node. 
*/ + +static void set_locks_purged(struct dlm_rsb *r) +{ + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_LOCKS_PURGED); +} + +void dlm_recover_rsbs(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int count = 0; + + log_debug(ls, "dlm_recover_rsbs"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + lock_rsb(r); + if (is_master(r)) { + if (rsb_flag(r, RSB_RECOVER_CONVERT)) + recover_conversion(r); + if (rsb_flag(r, RSB_NEW_MASTER2)) + set_locks_purged(r); + recover_lvb(r); + count++; + } + rsb_clear_flag(r, RSB_RECOVER_CONVERT); + rsb_clear_flag(r, RSB_NEW_MASTER2); + unlock_rsb(r); + } + up_read(&ls->ls_root_sem); + + log_debug(ls, "dlm_recover_rsbs %d rsbs", count); +} + +/* Create a single list of all root rsb's to be used during recovery */ + +int dlm_create_root_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int i, error = 0; + + down_write(&ls->ls_root_sem); + if (!list_empty(&ls->ls_root_list)) { + log_error(ls, "root list not empty"); + error = -EINVAL; + goto out; + } + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } + + /* If we're using a directory, add tossed rsbs to the root + list; they'll have entries created in the new directory, + but no other recovery steps should do anything with them. */ + + if (dlm_no_directory(ls)) { + spin_unlock(&ls->ls_rsbtbl[i].lock); + continue; + } + + list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } + out: + up_write(&ls->ls_root_sem); + return error; +} + +void dlm_release_root_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *safe; + + down_write(&ls->ls_root_sem); + list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) { + list_del_init(&r->res_root_list); + dlm_put_rsb(r); + } + up_write(&ls->ls_root_sem); +} + +/* If not using a directory, clear the entire toss list, there's no benefit to + caching the master value since it's fixed. If we are using a dir, keep the + rsb's we're the master of. Recovery will add them to the root list and from + there they'll be entered in the rebuilt directory. */ + +void dlm_clear_toss_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *safe; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, + res_hashchain) { + if (dlm_no_directory(ls) || !is_master(r)) { + list_del(&r->res_hashchain); + dlm_free_rsb(r); + } + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } +} + diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h new file mode 100644 index 00000000..ebd0363f --- /dev/null +++ b/fs/dlm/recover.h @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVER_DOT_H__ +#define __RECOVER_DOT_H__ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); +uint32_t dlm_recover_status(struct dlm_ls *ls); +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); +int dlm_recover_members_wait(struct dlm_ls *ls); +int dlm_recover_directory_wait(struct dlm_ls *ls); +int dlm_recover_locks_wait(struct dlm_ls *ls); +int dlm_recover_done_wait(struct dlm_ls *ls); +int dlm_recover_masters(struct dlm_ls *ls); +int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls); +void dlm_recovered_lock(struct dlm_rsb *r); +int dlm_create_root_list(struct dlm_ls *ls); +void dlm_release_root_list(struct dlm_ls *ls); +void dlm_clear_toss_list(struct dlm_ls *ls); +void dlm_recover_rsbs(struct dlm_ls *ls); + +#endif /* __RECOVER_DOT_H__ */ + diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c new file mode 100644 index 00000000..fd677c8c --- /dev/null +++ b/fs/dlm/recoverd.c @@ -0,0 +1,323 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "dir.h" +#include "ast.h" +#include "recover.h" +#include "lowcomms.h" +#include "lock.h" +#include "requestqueue.h" +#include "recoverd.h" + + +/* If the start for which we're re-enabling locking (seq) has been superseded + by a newer stop (ls_recover_seq), we need to leave locking disabled. + + We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees + locking stopped and b) adds a message to the requestqueue, but dlm_recoverd + enables locking and clears the requestqueue between a and b. */ + +static int enable_locking(struct dlm_ls *ls, uint64_t seq) +{ + int error = -EINTR; + + down_write(&ls->ls_recv_active); + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_recover_seq == seq) { + set_bit(LSFL_RUNNING, &ls->ls_flags); + /* unblocks processes waiting to enter the dlm */ + up_write(&ls->ls_in_recovery); + error = 0; + } + spin_unlock(&ls->ls_recover_lock); + + up_write(&ls->ls_recv_active); + return error; +} + +static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) +{ + unsigned long start; + int error, neg = 0; + + log_debug(ls, "recover %llx", (unsigned long long)rv->seq); + + mutex_lock(&ls->ls_recoverd_active); + + /* + * Suspending and resuming dlm_astd ensures that no lkb's from this ls + * will be processed by dlm_astd during recovery. + */ + + dlm_astd_suspend(); + dlm_astd_resume(); + + /* + * Free non-master tossed rsb's. Master rsb's are kept on toss + * list and put on root list to be included in resdir recovery. + */ + + dlm_clear_toss_list(ls); + + /* + * This list of root rsb's will be the basis of most of the recovery + * routines. 
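Editor's note: enable_locking() above only re-enables locking if the start it is finishing (rv->seq) has not been superseded by a newer stop, and it makes that test and the flag set atomically under ls_recover_lock so a concurrent dlm_ls_stop() cannot slip in between them. A userspace sketch of the same guard, with a pthread mutex standing in for the spinlock and all names invented:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t recover_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t recover_seq;     /* bumped by every "stop" event   */
static int running;              /* 1 once locking is re-enabled   */

/* A stop event supersedes any recovery that started earlier. */
static void stop_event(void)
{
	pthread_mutex_lock(&recover_lock);
	recover_seq++;
	running = 0;
	pthread_mutex_unlock(&recover_lock);
}

/* Re-enable locking only if no newer stop arrived while we recovered. */
static int enable_if_current(uint64_t seq_we_recovered)
{
	int ok = 0;

	pthread_mutex_lock(&recover_lock);
	if (recover_seq == seq_we_recovered) {
		running = 1;
		ok = 1;
	}
	pthread_mutex_unlock(&recover_lock);
	return ok;
}

int main(void)
{
	uint64_t seq;

	pthread_mutex_lock(&recover_lock);
	seq = ++recover_seq;                   /* the "start" we work on        */
	pthread_mutex_unlock(&recover_lock);

	stop_event();                          /* a newer stop supersedes it    */
	printf("enabled: %d (expect 0)\n", enable_if_current(seq));
	return 0;
}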
+ */ + + dlm_create_root_list(ls); + + /* + * Add or remove nodes from the lockspace's ls_nodes list. + * Also waits for all nodes to complete dlm_recover_members. + */ + + error = dlm_recover_members(ls, rv, &neg); + if (error) { + log_debug(ls, "recover_members failed %d", error); + goto fail; + } + start = jiffies; + + /* + * Rebuild our own share of the directory by collecting from all other + * nodes their master rsb names that hash to us. + */ + + error = dlm_recover_directory(ls); + if (error) { + log_debug(ls, "recover_directory failed %d", error); + goto fail; + } + + /* + * Wait for all nodes to complete directory rebuild. + */ + + error = dlm_recover_directory_wait(ls); + if (error) { + log_debug(ls, "recover_directory_wait failed %d", error); + goto fail; + } + + /* + * We may have outstanding operations that are waiting for a reply from + * a failed node. Mark these to be resent after recovery. Unlock and + * cancel ops can just be completed. + */ + + dlm_recover_waiters_pre(ls); + + error = dlm_recovery_stopped(ls); + if (error) + goto fail; + + if (neg || dlm_no_directory(ls)) { + /* + * Clear lkb's for departed nodes. + */ + + dlm_purge_locks(ls); + + /* + * Get new master nodeid's for rsb's that were mastered on + * departed nodes. + */ + + error = dlm_recover_masters(ls); + if (error) { + log_debug(ls, "recover_masters failed %d", error); + goto fail; + } + + /* + * Send our locks on remastered rsb's to the new masters. + */ + + error = dlm_recover_locks(ls); + if (error) { + log_debug(ls, "recover_locks failed %d", error); + goto fail; + } + + error = dlm_recover_locks_wait(ls); + if (error) { + log_debug(ls, "recover_locks_wait failed %d", error); + goto fail; + } + + /* + * Finalize state in master rsb's now that all locks can be + * checked. This includes conversion resolution and lvb + * settings. + */ + + dlm_recover_rsbs(ls); + } else { + /* + * Other lockspace members may be going through the "neg" steps + * while also adding us to the lockspace, in which case they'll + * be doing the recover_locks (RS_LOCKS) barrier. + */ + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls); + if (error) { + log_debug(ls, "recover_locks_wait failed %d", error); + goto fail; + } + } + + dlm_release_root_list(ls); + + /* + * Purge directory-related requests that are saved in requestqueue. + * All dir requests from before recovery are invalid now due to the dir + * rebuild and will be resent by the requesting nodes. 
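Editor's note: dlm_purge_requestqueue(), called just below and implemented later in this patch, walks the saved-message queue and drops every entry that a predicate declares stale. The sketch below shows that unlink-while-walking pattern in standalone form; the entry layout, message type value, and predicate are illustrative, not the kernel structures.

#include <stdlib.h>

struct saved_msg {
	struct saved_msg *next;
	int nodeid;
	int type;
};

/* Walk a singly linked list and drop every entry the predicate rejects. */
static void purge_queue(struct saved_msg **head,
			int (*should_purge)(const struct saved_msg *))
{
	struct saved_msg **pp = head;

	while (*pp) {
		if (should_purge(*pp)) {
			struct saved_msg *dead = *pp;
			*pp = dead->next;     /* unlink, then free */
			free(dead);
		} else {
			pp = &(*pp)->next;
		}
	}
}

/* Example predicate: drop directory lookups, which will be resent once
 * the directory is rebuilt. */
static int is_stale_dir_request(const struct saved_msg *m)
{
	return m->type == 42;   /* hypothetical "lookup" message type */
}

int main(void)
{
	struct saved_msg *a = malloc(sizeof(*a));
	struct saved_msg *q;

	if (!a)
		return 1;
	a->next = NULL;
	a->nodeid = 2;
	a->type = 42;
	q = a;

	purge_queue(&q, is_stale_dir_request);
	return q == NULL ? 0 : 1;       /* the stale lookup was dropped */
}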
+ */ + + dlm_purge_requestqueue(ls); + + dlm_set_recover_status(ls, DLM_RS_DONE); + error = dlm_recover_done_wait(ls); + if (error) { + log_debug(ls, "recover_done_wait failed %d", error); + goto fail; + } + + dlm_clear_members_gone(ls); + + dlm_adjust_timeouts(ls); + + error = enable_locking(ls, rv->seq); + if (error) { + log_debug(ls, "enable_locking failed %d", error); + goto fail; + } + + error = dlm_process_requestqueue(ls); + if (error) { + log_debug(ls, "process_requestqueue failed %d", error); + goto fail; + } + + error = dlm_recover_waiters_post(ls); + if (error) { + log_debug(ls, "recover_waiters_post failed %d", error); + goto fail; + } + + dlm_grant_after_purge(ls); + + dlm_astd_wake(); + + log_debug(ls, "recover %llx done: %u ms", + (unsigned long long)rv->seq, + jiffies_to_msecs(jiffies - start)); + mutex_unlock(&ls->ls_recoverd_active); + + return 0; + + fail: + dlm_release_root_list(ls); + log_debug(ls, "recover %llx error %d", + (unsigned long long)rv->seq, error); + mutex_unlock(&ls->ls_recoverd_active); + return error; +} + +/* The dlm_ls_start() that created the rv we take here may already have been + stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP + flag set. */ + +static void do_ls_recovery(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL; + + spin_lock(&ls->ls_recover_lock); + rv = ls->ls_recover_args; + ls->ls_recover_args = NULL; + if (rv && ls->ls_recover_seq == rv->seq) + clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + if (rv) { + ls_recover(ls, rv); + kfree(rv->nodeids); + kfree(rv->new); + kfree(rv); + } +} + +static int dlm_recoverd(void *arg) +{ + struct dlm_ls *ls; + + ls = dlm_find_lockspace_local(arg); + if (!ls) { + log_print("dlm_recoverd: no lockspace %p", arg); + return -1; + } + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(LSFL_WORK, &ls->ls_flags)) + schedule(); + set_current_state(TASK_RUNNING); + + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) + do_ls_recovery(ls); + } + + dlm_put_lockspace(ls); + return 0; +} + +void dlm_recoverd_kick(struct dlm_ls *ls) +{ + set_bit(LSFL_WORK, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); +} + +int dlm_recoverd_start(struct dlm_ls *ls) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_recoverd, ls, "dlm_recoverd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + ls->ls_recoverd_task = p; + return error; +} + +void dlm_recoverd_stop(struct dlm_ls *ls) +{ + kthread_stop(ls->ls_recoverd_task); +} + +void dlm_recoverd_suspend(struct dlm_ls *ls) +{ + wake_up(&ls->ls_wait_general); + mutex_lock(&ls->ls_recoverd_active); +} + +void dlm_recoverd_resume(struct dlm_ls *ls) +{ + mutex_unlock(&ls->ls_recoverd_active); +} + diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h new file mode 100644 index 00000000..866657c5 --- /dev/null +++ b/fs/dlm/recoverd.h @@ -0,0 +1,24 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVERD_DOT_H__ +#define __RECOVERD_DOT_H__ + +void dlm_recoverd_kick(struct dlm_ls *ls); +void dlm_recoverd_stop(struct dlm_ls *ls); +int dlm_recoverd_start(struct dlm_ls *ls); +void dlm_recoverd_suspend(struct dlm_ls *ls); +void dlm_recoverd_resume(struct dlm_ls *ls); + +#endif /* __RECOVERD_DOT_H__ */ + diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c new file mode 100644 index 00000000..a44fa228 --- /dev/null +++ b/fs/dlm/requestqueue.c @@ -0,0 +1,188 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "member.h" +#include "lock.h" +#include "dir.h" +#include "config.h" +#include "requestqueue.h" + +struct rq_entry { + struct list_head list; + int nodeid; + struct dlm_message request; +}; + +/* + * Requests received while the lockspace is in recovery get added to the + * request queue and processed when recovery is complete. This happens when + * the lockspace is suspended on some nodes before it is on others, or the + * lockspace is enabled on some while still suspended on others. + */ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) +{ + struct rq_entry *e; + int length = ms->m_header.h_length - sizeof(struct dlm_message); + + e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); + if (!e) { + log_print("dlm_add_requestqueue: out of memory len %d", length); + return; + } + + e->nodeid = nodeid; + memcpy(&e->request, ms, ms->m_header.h_length); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_add_tail(&e->list, &ls->ls_requestqueue); + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +/* + * Called by dlm_recoverd to process normal messages saved while recovery was + * happening. Normal locking has been enabled before this is called. dlm_recv + * upon receiving a message, will wait for all saved messages to be drained + * here before processing the message it got. If a new dlm_ls_stop() arrives + * while we're processing these saved messages, it may block trying to suspend + * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that + * case, we don't abort since locking_stopped is still 0. If dlm_recv is not + * waiting for us, then this processing may be aborted due to locking_stopped. 
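Editor's note: dlm_add_requestqueue() above copies the whole wire message, including the variable-length tail beyond struct dlm_message, into a single allocation sized sizeof(entry) + extra. The same idiom in standalone form, using a C99 flexible array member instead of relying on the embedded struct's trailing data; names are invented for the example.

#include <stdlib.h>
#include <string.h>

/* Queue entry that owns a private copy of a variable-length message. */
struct queued_msg {
	struct queued_msg *next;
	int nodeid;
	size_t len;
	unsigned char data[];    /* flexible array member: the message copy */
};

static struct queued_msg *save_message(int nodeid, const void *msg, size_t len)
{
	struct queued_msg *e = malloc(sizeof(*e) + len);

	if (!e)
		return NULL;
	e->next = NULL;
	e->nodeid = nodeid;
	e->len = len;
	memcpy(e->data, msg, len);   /* copy survives after the caller's
					receive buffer is reused */
	return e;
}

int main(void)
{
	const char wire[] = "header+body";
	struct queued_msg *e = save_message(7, wire, sizeof(wire));

	free(e);
	return 0;
}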
+ */ + +int dlm_process_requestqueue(struct dlm_ls *ls) +{ + struct rq_entry *e; + int error = 0; + + mutex_lock(&ls->ls_requestqueue_mutex); + + for (;;) { + if (list_empty(&ls->ls_requestqueue)) { + mutex_unlock(&ls->ls_requestqueue_mutex); + error = 0; + break; + } + e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); + mutex_unlock(&ls->ls_requestqueue_mutex); + + dlm_receive_message_saved(ls, &e->request); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_del(&e->list); + kfree(e); + + if (dlm_locking_stopped(ls)) { + log_debug(ls, "process_requestqueue abort running"); + mutex_unlock(&ls->ls_requestqueue_mutex); + error = -EINTR; + break; + } + schedule(); + } + + return error; +} + +/* + * After recovery is done, locking is resumed and dlm_recoverd takes all the + * saved requests and processes them as they would have been by dlm_recv. At + * the same time, dlm_recv will start receiving new requests from remote nodes. + * We want to delay dlm_recv processing new requests until dlm_recoverd has + * finished processing the old saved requests. We don't check for locking + * stopped here because dlm_ls_stop won't stop locking until it's suspended us + * (dlm_recv). + */ + +void dlm_wait_requestqueue(struct dlm_ls *ls) +{ + for (;;) { + mutex_lock(&ls->ls_requestqueue_mutex); + if (list_empty(&ls->ls_requestqueue)) + break; + mutex_unlock(&ls->ls_requestqueue_mutex); + schedule(); + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) +{ + uint32_t type = ms->m_type; + + /* the ls is being cleaned up and freed by release_lockspace */ + if (!ls->ls_count) + return 1; + + if (dlm_is_removed(ls, nodeid)) + return 1; + + /* directory operations are always purged because the directory is + always rebuilt during recovery and the lookups resent */ + + if (type == DLM_MSG_REMOVE || + type == DLM_MSG_LOOKUP || + type == DLM_MSG_LOOKUP_REPLY) + return 1; + + if (!dlm_no_directory(ls)) + return 0; + + /* with no directory, the master is likely to change as a part of + recovery; requests to/from the defunct master need to be purged */ + + switch (type) { + case DLM_MSG_REQUEST: + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + /* we're no longer the master of this resource, the sender + will resend to the new master (see waiter_needs_recovery) */ + + if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid()) + return 1; + break; + + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_UNLOCK_REPLY: + case DLM_MSG_CANCEL_REPLY: + case DLM_MSG_GRANT: + /* this reply is from the former master of the resource, + we'll resend to the new master if needed */ + + if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid) + return 1; + break; + } + + return 0; +} + +void dlm_purge_requestqueue(struct dlm_ls *ls) +{ + struct dlm_message *ms; + struct rq_entry *e, *safe; + + mutex_lock(&ls->ls_requestqueue_mutex); + list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) { + ms = &e->request; + + if (purge_request(ls, ms, e->nodeid)) { + list_del(&e->list); + kfree(e); + } + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h new file mode 100644 index 00000000..10ce449b --- /dev/null +++ b/fs/dlm/requestqueue.h @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, 
Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __REQUESTQUEUE_DOT_H__ +#define __REQUESTQUEUE_DOT_H__ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); +int dlm_process_requestqueue(struct dlm_ls *ls); +void dlm_wait_requestqueue(struct dlm_ls *ls); +void dlm_purge_requestqueue(struct dlm_ls *ls); + +#endif + diff --git a/fs/dlm/user.c b/fs/dlm/user.c new file mode 100644 index 00000000..e96bf3e9 --- /dev/null +++ b/fs/dlm/user.c @@ -0,0 +1,1011 @@ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "lvb_table.h" +#include "user.h" +#include "ast.h" + +static const char name_prefix[] = "dlm"; +static const struct file_operations device_fops; +static atomic_t dlm_monitor_opened; +static int dlm_monitor_unused = 1; + +#ifdef CONFIG_COMPAT + +struct dlm_lock_params32 { + __u8 mode; + __u8 namelen; + __u16 unused; + __u32 flags; + __u32 lkid; + __u32 parent; + __u64 xid; + __u64 timeout; + __u32 castparam; + __u32 castaddr; + __u32 bastparam; + __u32 bastaddr; + __u32 lksb; + char lvb[DLM_USER_LVB_LEN]; + char name[0]; +}; + +struct dlm_write_request32 { + __u32 version[3]; + __u8 cmd; + __u8 is64bit; + __u8 unused[2]; + + union { + struct dlm_lock_params32 lock; + struct dlm_lspace_params lspace; + struct dlm_purge_params purge; + } i; +}; + +struct dlm_lksb32 { + __u32 sb_status; + __u32 sb_lkid; + __u8 sb_flags; + __u32 sb_lvbptr; +}; + +struct dlm_lock_result32 { + __u32 version[3]; + __u32 length; + __u32 user_astaddr; + __u32 user_astparam; + __u32 user_lksb; + struct dlm_lksb32 lksb; + __u8 bast_mode; + __u8 unused[3]; + /* Offsets may be zero if no data is present */ + __u32 lvb_offset; +}; + +static void compat_input(struct dlm_write_request *kb, + struct dlm_write_request32 *kb32, + int namelen) +{ + kb->version[0] = kb32->version[0]; + kb->version[1] = kb32->version[1]; + kb->version[2] = kb32->version[2]; + + kb->cmd = kb32->cmd; + kb->is64bit = kb32->is64bit; + if (kb->cmd == DLM_USER_CREATE_LOCKSPACE || + kb->cmd == DLM_USER_REMOVE_LOCKSPACE) { + kb->i.lspace.flags = kb32->i.lspace.flags; + kb->i.lspace.minor = kb32->i.lspace.minor; + memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen); + } else if (kb->cmd == DLM_USER_PURGE) { + kb->i.purge.nodeid = kb32->i.purge.nodeid; + kb->i.purge.pid = kb32->i.purge.pid; + } else { + kb->i.lock.mode = kb32->i.lock.mode; + kb->i.lock.namelen = kb32->i.lock.namelen; + kb->i.lock.flags = kb32->i.lock.flags; + kb->i.lock.lkid = kb32->i.lock.lkid; + kb->i.lock.parent = kb32->i.lock.parent; + kb->i.lock.xid = kb32->i.lock.xid; + kb->i.lock.timeout = kb32->i.lock.timeout; + kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam; + kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr; + kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam; 
+ kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; + kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; + memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); + memcpy(kb->i.lock.name, kb32->i.lock.name, namelen); + } +} + +static void compat_output(struct dlm_lock_result *res, + struct dlm_lock_result32 *res32) +{ + res32->version[0] = res->version[0]; + res32->version[1] = res->version[1]; + res32->version[2] = res->version[2]; + + res32->user_astaddr = (__u32)(long)res->user_astaddr; + res32->user_astparam = (__u32)(long)res->user_astparam; + res32->user_lksb = (__u32)(long)res->user_lksb; + res32->bast_mode = res->bast_mode; + + res32->lvb_offset = res->lvb_offset; + res32->length = res->length; + + res32->lksb.sb_status = res->lksb.sb_status; + res32->lksb.sb_flags = res->lksb.sb_flags; + res32->lksb.sb_lkid = res->lksb.sb_lkid; + res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr; +} +#endif + +/* Figure out if this lock is at the end of its life and no longer + available for the application to use. The lkb still exists until + the final ast is read. A lock becomes EOL in three situations: + 1. a noqueue request fails with EAGAIN + 2. an unlock completes with EUNLOCK + 3. a cancel of a waiting request completes with ECANCEL/EDEADLK + An EOL lock needs to be removed from the process's list of locks. + And we can't allow any new operation on an EOL lock. This is + not related to the lifetime of the lkb struct which is managed + entirely by refcount. */ + +static int lkb_is_endoflife(int mode, int status) +{ + switch (status) { + case -DLM_EUNLOCK: + return 1; + case -DLM_ECANCEL: + case -ETIMEDOUT: + case -EDEADLK: + case -EAGAIN: + if (mode == DLM_LOCK_IV) + return 1; + break; + } + return 0; +} + +/* we could possibly check if the cancel of an orphan has resulted in the lkb + being removed and then remove that lkb from the orphans list and free it */ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + struct dlm_user_proc *proc; + int rv; + + if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + return; + + ls = lkb->lkb_resource->res_ls; + mutex_lock(&ls->ls_clear_proc_locks); + + /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast + can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed + lkb->ua so we can't try to use it. This second check is necessary + for cases where a completion ast is received for an operation that + began before clear_proc_locks did its cancel/unlock. */ + + if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + goto out; + + DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb);); + ua = lkb->lkb_ua; + proc = ua->proc; + + if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL) + goto out; + + if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) + lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; + + spin_lock(&proc->asts_spin); + + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&proc->asts_spin); + goto out; + } + + if (list_empty(&lkb->lkb_astqueue)) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_astqueue, &proc->asts); + wake_up_interruptible(&proc->wait); + } + spin_unlock(&proc->asts_spin); + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + /* N.B. 
spin_lock locks_spin, not asts_spin */ + spin_lock(&proc->locks_spin); + if (!list_empty(&lkb->lkb_ownqueue)) { + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + } + out: + mutex_unlock(&ls->ls_clear_proc_locks); +} + +static int device_user_lock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + if (!params->castaddr || !params->lksb) { + error = -EINVAL; + goto out; + } + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + ua->bastparam = params->bastparam; + ua->bastaddr = params->bastaddr; + ua->xid = params->xid; + + if (params->flags & DLM_LKF_CONVERT) + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb, + (unsigned long) params->timeout); + else { + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + (unsigned long) params->timeout); + if (!error) + error = ua->lksb.sb_lkid; + } + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_unlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + + if (params->flags & DLM_LKF_CANCEL) + error = dlm_user_cancel(ls, ua, params->flags, params->lkid); + else + error = dlm_user_unlock(ls, ua, params->flags, params->lkid, + params->lvb); + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_deadlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_deadlock(ls, params->flags, params->lkid); + + dlm_put_lockspace(ls); + return error; +} + +static int dlm_device_register(struct dlm_ls *ls, char *name) +{ + int error, len; + + /* The device is already registered. This happens when the + lockspace is created multiple times from userspace. */ + if (ls->ls_device.name) + return 0; + + error = -ENOMEM; + len = strlen(name) + strlen(name_prefix) + 2; + ls->ls_device.name = kzalloc(len, GFP_NOFS); + if (!ls->ls_device.name) + goto fail; + + snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, + name); + ls->ls_device.fops = &device_fops; + ls->ls_device.minor = MISC_DYNAMIC_MINOR; + + error = misc_register(&ls->ls_device); + if (error) { + kfree(ls->ls_device.name); + } +fail: + return error; +} + +int dlm_device_deregister(struct dlm_ls *ls) +{ + int error; + + /* The device is not registered. This happens when the lockspace + was never used from userspace, or when device_create_lockspace() + calls dlm_release_lockspace() after the register fails. 
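Editor's note: dlm_device_register() above sizes the misc device name as strlen(prefix) + strlen(name) + 2, the extra two bytes covering the '_' separator and the terminating NUL. A quick standalone check of that arithmetic (illustrative only):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "<prefix>_<name>"; the +2 covers the '_' and the trailing NUL. */
static char *build_device_name(const char *prefix, const char *name)
{
	size_t len = strlen(prefix) + strlen(name) + 2;
	char *buf = calloc(1, len);

	if (buf)
		snprintf(buf, len, "%s_%s", prefix, name);
	return buf;
}

int main(void)
{
	char *n = build_device_name("dlm", "mylockspace");

	assert(n && strcmp(n, "dlm_mylockspace") == 0);
	printf("%s\n", n);
	free(n);
	return 0;
}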
*/ + if (!ls->ls_device.name) + return 0; + + error = misc_deregister(&ls->ls_device); + if (!error) + kfree(ls->ls_device.name); + return error; +} + +static int device_user_purge(struct dlm_user_proc *proc, + struct dlm_purge_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_purge(ls, proc, params->nodeid, params->pid); + + dlm_put_lockspace(ls); + return error; +} + +static int device_create_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = dlm_new_lockspace(params->name, strlen(params->name), + &lockspace, params->flags, DLM_USER_LVB_LEN); + if (error) + return error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -ENOENT; + + error = dlm_device_register(ls, params->name); + dlm_put_lockspace(ls); + + if (error) + dlm_release_lockspace(lockspace, 0); + else + error = ls->ls_device.minor; + + return error; +} + +static int device_remove_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error, force = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ls = dlm_find_lockspace_device(params->minor); + if (!ls) + return -ENOENT; + + if (params->flags & DLM_USER_LSFLG_FORCEFREE) + force = 2; + + lockspace = ls->ls_local_handle; + dlm_put_lockspace(ls); + + /* The final dlm_release_lockspace waits for references to go to + zero, so all processes will need to close their device for the + ls before the release will proceed. release also calls the + device_deregister above. Converting a positive return value + from release to zero means that userspace won't know when its + release was the final one, but it shouldn't need to know. 
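Editor's note: check_version() further down accepts a request only when the caller's major version matches the kernel's exactly and the caller's minor version is not newer than the kernel's; the patch level plays no part in the decision. A compact sketch of that rule, with placeholder constants rather than the real DLM_DEVICE_VERSION_* values:

#include <assert.h>

#define KERNEL_MAJOR 6   /* placeholder values for the sketch */
#define KERNEL_MINOR 0

/* Accept: same major, and the caller's minor is not newer than ours. */
static int version_ok(unsigned major, unsigned minor)
{
	if (major != KERNEL_MAJOR)
		return 0;
	if (minor > KERNEL_MINOR)
		return 0;
	return 1;
}

int main(void)
{
	assert(version_ok(6, 0));
	assert(!version_ok(5, 0));   /* different major: rejected            */
	assert(!version_ok(6, 1));   /* minor newer than the kernel: rejected */
	return 0;
}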
*/ + + error = dlm_release_lockspace(lockspace, force); + if (error > 0) + error = 0; + return error; +} + +/* Check the user's version matches ours */ +static int check_version(struct dlm_write_request *req) +{ + if (req->version[0] != DLM_DEVICE_VERSION_MAJOR || + (req->version[0] == DLM_DEVICE_VERSION_MAJOR && + req->version[1] > DLM_DEVICE_VERSION_MINOR)) { + + printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " + "user (%d.%d.%d) kernel (%d.%d.%d)\n", + current->comm, + task_pid_nr(current), + req->version[0], + req->version[1], + req->version[2], + DLM_DEVICE_VERSION_MAJOR, + DLM_DEVICE_VERSION_MINOR, + DLM_DEVICE_VERSION_PATCH); + return -EINVAL; + } + return 0; +} + +/* + * device_write + * + * device_user_lock + * dlm_user_request -> request_lock + * dlm_user_convert -> convert_lock + * + * device_user_unlock + * dlm_user_unlock -> unlock_lock + * dlm_user_cancel -> cancel_lock + * + * device_create_lockspace + * dlm_new_lockspace + * + * device_remove_lockspace + * dlm_release_lockspace + */ + +/* a write to a lockspace device is a lock or unlock request, a write + to the control device is to create/remove a lockspace */ + +static ssize_t device_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_write_request *kbuf; + sigset_t tmpsig, allsigs; + int error; + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_write_request32)) +#else + if (count < sizeof(struct dlm_write_request)) +#endif + return -EINVAL; + + kbuf = kzalloc(count + 1, GFP_NOFS); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, buf, count)) { + error = -EFAULT; + goto out_free; + } + + if (check_version(kbuf)) { + error = -EBADE; + goto out_free; + } + +#ifdef CONFIG_COMPAT + if (!kbuf->is64bit) { + struct dlm_write_request32 *k32buf; + int namelen = 0; + + if (count > sizeof(struct dlm_write_request32)) + namelen = count - sizeof(struct dlm_write_request32); + + k32buf = (struct dlm_write_request32 *)kbuf; + + /* add 1 after namelen so that the name string is terminated */ + kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, + GFP_NOFS); + if (!kbuf) { + kfree(k32buf); + return -ENOMEM; + } + + if (proc) + set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); + + compat_input(kbuf, k32buf, namelen); + kfree(k32buf); + } +#endif + + /* do we really need this? can a write happen after a close? 
*/ + if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && + (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { + error = -EINVAL; + goto out_free; + } + + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + error = -EINVAL; + + switch (kbuf->cmd) + { + case DLM_USER_LOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_lock(proc, &kbuf->i.lock); + break; + + case DLM_USER_UNLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_unlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_DEADLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_deadlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_CREATE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_sig; + } + error = device_create_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_REMOVE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_sig; + } + error = device_remove_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_PURGE: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_purge(proc, &kbuf->i.purge); + break; + + default: + log_print("Unknown command passed to DLM device : %d\n", + kbuf->cmd); + } + + out_sig: + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + out_free: + kfree(kbuf); + return error; +} + +/* Every process that opens the lockspace device has its own "proc" structure + hanging off the open file that's used to keep track of locks owned by the + process and asts that need to be delivered to the process. */ + +static int device_open(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc; + struct dlm_ls *ls; + + ls = dlm_find_lockspace_device(iminor(inode)); + if (!ls) + return -ENOENT; + + proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); + if (!proc) { + dlm_put_lockspace(ls); + return -ENOMEM; + } + + proc->lockspace = ls->ls_local_handle; + INIT_LIST_HEAD(&proc->asts); + INIT_LIST_HEAD(&proc->locks); + INIT_LIST_HEAD(&proc->unlocking); + spin_lock_init(&proc->asts_spin); + spin_lock_init(&proc->locks_spin); + init_waitqueue_head(&proc->wait); + file->private_data = proc; + + return 0; +} + +static int device_close(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_ls *ls; + sigset_t tmpsig, allsigs; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); + + dlm_clear_proc_locks(ls, proc); + + /* at this point no more lkb's should exist for this lockspace, + so there's no chance of dlm_user_add_ast() being called and + looking for lkb->ua->proc */ + + kfree(proc); + file->private_data = NULL; + + dlm_put_lockspace(ls); + dlm_put_lockspace(ls); /* for the find in device_open() */ + + /* FIXME: AUTOFREE: if this ls is no longer used do + device_remove_lockspace() */ + + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + recalc_sigpending(); + + return 0; +} + +static int copy_result_to_user(struct dlm_user_args *ua, int compat, + uint32_t flags, int mode, int copy_lvb, + char __user *buf, size_t count) +{ +#ifdef CONFIG_COMPAT + struct dlm_lock_result32 result32; +#endif + struct dlm_lock_result result; + void *resultptr; + int error=0; + int len; + int struct_len; + + 
memset(&result, 0, sizeof(struct dlm_lock_result)); + result.version[0] = DLM_DEVICE_VERSION_MAJOR; + result.version[1] = DLM_DEVICE_VERSION_MINOR; + result.version[2] = DLM_DEVICE_VERSION_PATCH; + memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb)); + result.user_lksb = ua->user_lksb; + + /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated + in a conversion unless the conversion is successful. See code + in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though, + notes that a new blocking AST address and parameter are set even if + the conversion fails, so maybe we should just do that. */ + + if (flags & DLM_CB_BAST) { + result.user_astaddr = ua->bastaddr; + result.user_astparam = ua->bastparam; + result.bast_mode = mode; + } else { + result.user_astaddr = ua->castaddr; + result.user_astparam = ua->castparam; + } + +#ifdef CONFIG_COMPAT + if (compat) + len = sizeof(struct dlm_lock_result32); + else +#endif + len = sizeof(struct dlm_lock_result); + struct_len = len; + + /* copy lvb to userspace if there is one, it's been updated, and + the user buffer has space for it */ + + if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) { + if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, + DLM_USER_LVB_LEN)) { + error = -EFAULT; + goto out; + } + + result.lvb_offset = len; + len += DLM_USER_LVB_LEN; + } + + result.length = len; + resultptr = &result; +#ifdef CONFIG_COMPAT + if (compat) { + compat_output(&result, &result32); + resultptr = &result32; + } +#endif + + if (copy_to_user(buf, resultptr, struct_len)) + error = -EFAULT; + else + error = len; + out: + return error; +} + +static int copy_version_to_user(char __user *buf, size_t count) +{ + struct dlm_device_version ver; + + memset(&ver, 0, sizeof(struct dlm_device_version)); + ver.version[0] = DLM_DEVICE_VERSION_MAJOR; + ver.version[1] = DLM_DEVICE_VERSION_MINOR; + ver.version[2] = DLM_DEVICE_VERSION_PATCH; + + if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version))) + return -EFAULT; + return sizeof(struct dlm_device_version); +} + +/* a read returns a single ast described in a struct dlm_lock_result */ + +static ssize_t device_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_lkb *lkb; + DECLARE_WAITQUEUE(wait, current); + struct dlm_callback cb; + int rv, resid, copy_lvb = 0; + + if (count == sizeof(struct dlm_device_version)) { + rv = copy_version_to_user(buf, count); + return rv; + } + + if (!proc) { + log_print("non-version read from control device %zu", count); + return -EINVAL; + } + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_lock_result32)) +#else + if (count < sizeof(struct dlm_lock_result)) +#endif + return -EINVAL; + + try_another: + + /* do we really need this? can a read happen after a close? 
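Editor's note: copy_result_to_user() above lays the reply out as a fixed header followed, optionally, by the LVB; lvb_offset records where the LVB starts and length covers the whole buffer. The sketch below packs and unpacks that kind of layout in userspace, with the struct trimmed to the two fields that matter and all names and sizes chosen for the example.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define LVB_LEN 32   /* placeholder for DLM_USER_LVB_LEN */

struct reply_hdr {
	uint32_t length;       /* header plus any trailing lvb */
	uint32_t lvb_offset;   /* 0 when no lvb is appended    */
};

/* Pack the header, then append the lvb (if any) and record its offset. */
static size_t pack_reply(unsigned char *buf, size_t bufsize, const char *lvb)
{
	struct reply_hdr hdr = { .length = sizeof(hdr), .lvb_offset = 0 };

	if (lvb && bufsize >= sizeof(hdr) + LVB_LEN) {
		memcpy(buf + sizeof(hdr), lvb, LVB_LEN);
		hdr.lvb_offset = sizeof(hdr);
		hdr.length += LVB_LEN;
	}
	memcpy(buf, &hdr, sizeof(hdr));
	return hdr.length;
}

int main(void)
{
	unsigned char buf[256];
	char lvb[LVB_LEN] = "value block";
	struct reply_hdr hdr;
	size_t n;

	n = pack_reply(buf, sizeof(buf), lvb);
	memcpy(&hdr, buf, sizeof(hdr));
	assert(n == hdr.length && hdr.lvb_offset == sizeof(hdr));
	assert(memcmp(buf + hdr.lvb_offset, lvb, LVB_LEN) == 0);
	return 0;
}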
*/ + if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) + return -EINVAL; + + spin_lock(&proc->asts_spin); + if (list_empty(&proc->asts)) { + if (file->f_flags & O_NONBLOCK) { + spin_unlock(&proc->asts_spin); + return -EAGAIN; + } + + add_wait_queue(&proc->wait, &wait); + + repeat: + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&proc->asts) && !signal_pending(current)) { + spin_unlock(&proc->asts_spin); + schedule(); + spin_lock(&proc->asts_spin); + goto repeat; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&proc->wait, &wait); + + if (signal_pending(current)) { + spin_unlock(&proc->asts_spin); + return -ERESTARTSYS; + } + } + + /* if we empty lkb_callbacks, we don't want to unlock the spinlock + without removing lkb_astqueue; so empty lkb_astqueue is always + consistent with empty lkb_callbacks */ + + lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); + + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); + if (rv < 0) { + /* this shouldn't happen; lkb should have been removed from + list when resid was zero */ + log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&proc->asts_spin); + /* removes ref for proc->asts, may cause lkb to be freed */ + dlm_put_lkb(lkb); + goto try_another; + } + if (!resid) + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&proc->asts_spin); + + if (cb.flags & DLM_CB_SKIP) { + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + goto try_another; + } + + if (cb.flags & DLM_CB_CAST) { + int old_mode, new_mode; + + old_mode = lkb->lkb_last_cast.mode; + new_mode = cb.mode; + + if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + dlm_lvb_operations[old_mode + 1][new_mode + 1]) + copy_lvb = 1; + + lkb->lkb_lksb->sb_status = cb.sb_status; + lkb->lkb_lksb->sb_flags = cb.sb_flags; + } + + rv = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + cb.flags, cb.mode, copy_lvb, buf, count); + + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + + return rv; +} + +static unsigned int device_poll(struct file *file, poll_table *wait) +{ + struct dlm_user_proc *proc = file->private_data; + + poll_wait(file, &proc->wait, wait); + + spin_lock(&proc->asts_spin); + if (!list_empty(&proc->asts)) { + spin_unlock(&proc->asts_spin); + return POLLIN | POLLRDNORM; + } + spin_unlock(&proc->asts_spin); + return 0; +} + +int dlm_user_daemon_available(void) +{ + /* dlm_controld hasn't started (or, has started, but not + properly populated configfs) */ + + if (!dlm_our_nodeid()) + return 0; + + /* This is to deal with versions of dlm_controld that don't + know about the monitor device. We assume that if the + dlm_controld was started (above), but the monitor device + was never opened, that it's an old version. dlm_controld + should open the monitor device before populating configfs. */ + + if (dlm_monitor_unused) + return 1; + + return atomic_read(&dlm_monitor_opened) ? 
1 : 0; +} + +static int ctl_device_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return 0; +} + +static int ctl_device_close(struct inode *inode, struct file *file) +{ + return 0; +} + +static int monitor_device_open(struct inode *inode, struct file *file) +{ + atomic_inc(&dlm_monitor_opened); + dlm_monitor_unused = 0; + return 0; +} + +static int monitor_device_close(struct inode *inode, struct file *file) +{ + if (atomic_dec_and_test(&dlm_monitor_opened)) + dlm_stop_lockspaces(); + return 0; +} + +static const struct file_operations device_fops = { + .open = device_open, + .release = device_close, + .read = device_read, + .write = device_write, + .poll = device_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static const struct file_operations ctl_device_fops = { + .open = ctl_device_open, + .release = ctl_device_close, + .read = device_read, + .write = device_write, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ctl_device = { + .name = "dlm-control", + .fops = &ctl_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +static const struct file_operations monitor_device_fops = { + .open = monitor_device_open, + .release = monitor_device_close, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice monitor_device = { + .name = "dlm-monitor", + .fops = &monitor_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +int __init dlm_user_init(void) +{ + int error; + + atomic_set(&dlm_monitor_opened, 0); + + error = misc_register(&ctl_device); + if (error) { + log_print("misc_register failed for control device"); + goto out; + } + + error = misc_register(&monitor_device); + if (error) { + log_print("misc_register failed for monitor device"); + misc_deregister(&ctl_device); + } + out: + return error; +} + +void dlm_user_exit(void) +{ + misc_deregister(&ctl_device); + misc_deregister(&monitor_device); +} + diff --git a/fs/dlm/user.h b/fs/dlm/user.h new file mode 100644 index 00000000..00499ab8 --- /dev/null +++ b/fs/dlm/user.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __USER_DOT_H__ +#define __USER_DOT_H__ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_user_init(void); +void dlm_user_exit(void); +int dlm_device_deregister(struct dlm_ls *ls); +int dlm_user_daemon_available(void); + +#endif diff --git a/fs/dlm/util.c b/fs/dlm/util.c new file mode 100644 index 00000000..e36520af --- /dev/null +++ b/fs/dlm/util.c @@ -0,0 +1,154 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
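To make the monitor handshake above concrete, here is a minimal userspace sketch of what a control daemon does with it: open the monitor device and keep the descriptor open for its lifetime, so the final close (including an unexpected exit) drops the last reference and the kernel stops all lockspaces. The node path /dev/dlm-monitor is an assumption; the actual path depends on how udev exposes misc devices on the system.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/dlm-monitor", O_RDONLY);   /* path is illustrative */

        if (fd < 0) {
                perror("open dlm-monitor");
                return 1;
        }
        /* ... normal daemon work; the open descriptor is what tells the
         * kernel the daemon is alive.  Closing it (or crashing) releases
         * the last reference and lockspaces are stopped. */
        pause();
        close(fd);
        return 0;
}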
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "rcom.h" +#include "util.h" + +#define DLM_ERRNO_EDEADLK 35 +#define DLM_ERRNO_EBADR 53 +#define DLM_ERRNO_EBADSLT 57 +#define DLM_ERRNO_EPROTO 71 +#define DLM_ERRNO_EOPNOTSUPP 95 +#define DLM_ERRNO_ETIMEDOUT 110 +#define DLM_ERRNO_EINPROGRESS 115 + +static void header_out(struct dlm_header *hd) +{ + hd->h_version = cpu_to_le32(hd->h_version); + hd->h_lockspace = cpu_to_le32(hd->h_lockspace); + hd->h_nodeid = cpu_to_le32(hd->h_nodeid); + hd->h_length = cpu_to_le16(hd->h_length); +} + +static void header_in(struct dlm_header *hd) +{ + hd->h_version = le32_to_cpu(hd->h_version); + hd->h_lockspace = le32_to_cpu(hd->h_lockspace); + hd->h_nodeid = le32_to_cpu(hd->h_nodeid); + hd->h_length = le16_to_cpu(hd->h_length); +} + +/* higher errno values are inconsistent across architectures, so select + one set of values for on the wire */ + +static int to_dlm_errno(int err) +{ + switch (err) { + case -EDEADLK: + return -DLM_ERRNO_EDEADLK; + case -EBADR: + return -DLM_ERRNO_EBADR; + case -EBADSLT: + return -DLM_ERRNO_EBADSLT; + case -EPROTO: + return -DLM_ERRNO_EPROTO; + case -EOPNOTSUPP: + return -DLM_ERRNO_EOPNOTSUPP; + case -ETIMEDOUT: + return -DLM_ERRNO_ETIMEDOUT; + case -EINPROGRESS: + return -DLM_ERRNO_EINPROGRESS; + } + return err; +} + +static int from_dlm_errno(int err) +{ + switch (err) { + case -DLM_ERRNO_EDEADLK: + return -EDEADLK; + case -DLM_ERRNO_EBADR: + return -EBADR; + case -DLM_ERRNO_EBADSLT: + return -EBADSLT; + case -DLM_ERRNO_EPROTO: + return -EPROTO; + case -DLM_ERRNO_EOPNOTSUPP: + return -EOPNOTSUPP; + case -DLM_ERRNO_ETIMEDOUT: + return -ETIMEDOUT; + case -DLM_ERRNO_EINPROGRESS: + return -EINPROGRESS; + } + return err; +} + +void dlm_message_out(struct dlm_message *ms) +{ + header_out(&ms->m_header); + + ms->m_type = cpu_to_le32(ms->m_type); + ms->m_nodeid = cpu_to_le32(ms->m_nodeid); + ms->m_pid = cpu_to_le32(ms->m_pid); + ms->m_lkid = cpu_to_le32(ms->m_lkid); + ms->m_remid = cpu_to_le32(ms->m_remid); + ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid); + ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid); + ms->m_exflags = cpu_to_le32(ms->m_exflags); + ms->m_sbflags = cpu_to_le32(ms->m_sbflags); + ms->m_flags = cpu_to_le32(ms->m_flags); + ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq); + ms->m_hash = cpu_to_le32(ms->m_hash); + ms->m_status = cpu_to_le32(ms->m_status); + ms->m_grmode = cpu_to_le32(ms->m_grmode); + ms->m_rqmode = cpu_to_le32(ms->m_rqmode); + ms->m_bastmode = cpu_to_le32(ms->m_bastmode); + ms->m_asts = cpu_to_le32(ms->m_asts); + ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); +} + +void dlm_message_in(struct dlm_message *ms) +{ + header_in(&ms->m_header); + + ms->m_type = le32_to_cpu(ms->m_type); + ms->m_nodeid = le32_to_cpu(ms->m_nodeid); + ms->m_pid = le32_to_cpu(ms->m_pid); + ms->m_lkid = le32_to_cpu(ms->m_lkid); + ms->m_remid = le32_to_cpu(ms->m_remid); + ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid); + ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid); + ms->m_exflags = le32_to_cpu(ms->m_exflags); + ms->m_sbflags = le32_to_cpu(ms->m_sbflags); + ms->m_flags = le32_to_cpu(ms->m_flags); + ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq); + ms->m_hash = le32_to_cpu(ms->m_hash); + ms->m_status = le32_to_cpu(ms->m_status); + ms->m_grmode = le32_to_cpu(ms->m_grmode); + ms->m_rqmode = le32_to_cpu(ms->m_rqmode); + ms->m_bastmode = le32_to_cpu(ms->m_bastmode); 
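The to_dlm_errno()/from_dlm_errno() pair above exists because the higher errno values differ between architectures, so one fixed set is chosen for the wire. Below is an abbreviated userspace sketch of the same idea (only three of the mapped values are reproduced); the property being illustrated is that the translation round-trips, so a receiver recovers the sender's errno regardless of either side's native numbering.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

#define DLM_ERRNO_EDEADLK      35
#define DLM_ERRNO_ETIMEDOUT   110
#define DLM_ERRNO_EINPROGRESS 115

static int to_dlm_errno(int err)
{
        switch (err) {
        case -EDEADLK:     return -DLM_ERRNO_EDEADLK;
        case -ETIMEDOUT:   return -DLM_ERRNO_ETIMEDOUT;
        case -EINPROGRESS: return -DLM_ERRNO_EINPROGRESS;
        }
        return err;
}

static int from_dlm_errno(int err)
{
        switch (err) {
        case -DLM_ERRNO_EDEADLK:     return -EDEADLK;
        case -DLM_ERRNO_ETIMEDOUT:   return -ETIMEDOUT;
        case -DLM_ERRNO_EINPROGRESS: return -EINPROGRESS;
        }
        return err;
}

int main(void)
{
        int errs[] = { -EDEADLK, -ETIMEDOUT, -EINPROGRESS, -ENOMEM };

        for (unsigned i = 0; i < sizeof(errs) / sizeof(errs[0]); i++)
                assert(from_dlm_errno(to_dlm_errno(errs[i])) == errs[i]);
        puts("wire errno mapping round-trips");
        return 0;
}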
+ ms->m_asts = le32_to_cpu(ms->m_asts); + ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); +} + +void dlm_rcom_out(struct dlm_rcom *rc) +{ + header_out(&rc->rc_header); + + rc->rc_type = cpu_to_le32(rc->rc_type); + rc->rc_result = cpu_to_le32(rc->rc_result); + rc->rc_id = cpu_to_le64(rc->rc_id); + rc->rc_seq = cpu_to_le64(rc->rc_seq); + rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); +} + +void dlm_rcom_in(struct dlm_rcom *rc) +{ + header_in(&rc->rc_header); + + rc->rc_type = le32_to_cpu(rc->rc_type); + rc->rc_result = le32_to_cpu(rc->rc_result); + rc->rc_id = le64_to_cpu(rc->rc_id); + rc->rc_seq = le64_to_cpu(rc->rc_seq); + rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); +} diff --git a/fs/dlm/util.h b/fs/dlm/util.h new file mode 100644 index 00000000..2b259151 --- /dev/null +++ b/fs/dlm/util.h @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +void dlm_message_out(struct dlm_message *ms); +void dlm_message_in(struct dlm_message *ms); +void dlm_rcom_out(struct dlm_rcom *rc); +void dlm_rcom_in(struct dlm_rcom *rc); + +#endif + diff --git a/fs/drop_caches.c b/fs/drop_caches.c new file mode 100644 index 00000000..c00e055b --- /dev/null +++ b/fs/drop_caches.c @@ -0,0 +1,67 @@ +/* + * Implement the manual drop-all-pagecache function + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* A global variable is a bit ugly, but it keeps the code simple */ +int sysctl_drop_caches; + +static void drop_pagecache_sb(struct super_block *sb, void *unused) +{ + struct inode *inode, *toput_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + iput(toput_inode); + toput_inode = inode; + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(toput_inode); +} + +static void drop_slab(void) +{ + int nr_objects; + struct shrink_control shrink = { + .gfp_mask = GFP_KERNEL, + }; + + do { + nr_objects = shrink_slab(&shrink, 1000, 1000); + } while (nr_objects > 10); +} + +int drop_caches_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + if (write) { + if (sysctl_drop_caches & 1) + iterate_supers(drop_pagecache_sb, NULL); + if (sysctl_drop_caches & 2) + drop_slab(); + } + return 0; +} diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig new file mode 100644 index 00000000..1cd6d9d3 --- /dev/null +++ b/fs/ecryptfs/Kconfig @@ -0,0 +1,14 @@ +config ECRYPT_FS + tristate "eCrypt filesystem layer support (EXPERIMENTAL)" + depends on 
EXPERIMENTAL && KEYS && CRYPTO + select CRYPTO_ECB + select CRYPTO_CBC + select CRYPTO_MD5 + help + Encrypted filesystem that operates on the VFS layer. See + to learn more about + eCryptfs. Userspace components are required and can be + obtained from . + + To compile this file system support as a module, choose M here: the + module will be called ecryptfs. diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile new file mode 100644 index 00000000..2cc9ee4a --- /dev/null +++ b/fs/ecryptfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux 2.6 eCryptfs +# + +obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o + +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c new file mode 100644 index 00000000..c6602d24 --- /dev/null +++ b/fs/ecryptfs/crypto.c @@ -0,0 +1,2238 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static int +ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv); +static int +ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv); + +/** + * ecryptfs_to_hex + * @dst: Buffer to take hex character representation of contents of + * src; must be at least of size (src_size * 2) + * @src: Buffer to be converted to a hex string respresentation + * @src_size: number of bytes to convert + */ +void ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + int x; + + for (x = 0; x < src_size; x++) + sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); +} + +/** + * ecryptfs_from_hex + * @dst: Buffer to take the bytes from src hex; must be at least of + * size (src_size / 2) + * @src: Buffer to be converted from a hex string respresentation to raw value + * @dst_size: size of dst buffer, or number of hex characters pairs to convert + */ +void ecryptfs_from_hex(char *dst, char *src, int dst_size) +{ + int x; + char tmp[3] = { 0, }; + + for (x = 0; x < dst_size; x++) { + tmp[0] = src[x * 2]; + tmp[1] = src[x * 2 + 1]; + dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16); + } +} + +/** + * ecryptfs_calculate_md5 - calculates the md5 of @src + * @dst: Pointer to 16 bytes of allocated memory + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @src: Data to be md5'd + * @len: Length of @src + * + * Uses the allocated crypto context that crypt_stat references to + * generate the MD5 sum of the contents of src. 
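ecryptfs_to_hex() and ecryptfs_from_hex() above are simple enough to exercise outside the kernel. The following sketch is a lightly adapted userspace copy (simple_strtol() replaced by strtol(), minor signature cleanups) that round-trips a few bytes through their hex form.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void to_hex(char *dst, const char *src, size_t src_size)
{
        for (size_t x = 0; x < src_size; x++)
                sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
}

static void from_hex(char *dst, const char *src, int dst_size)
{
        char tmp[3] = { 0, };

        for (int x = 0; x < dst_size; x++) {
                tmp[0] = src[x * 2];
                tmp[1] = src[x * 2 + 1];
                dst[x] = (unsigned char)strtol(tmp, NULL, 16);
        }
}

int main(void)
{
        const char key[] = { 0x00, 0x1f, (char)0xab, (char)0xff };
        char hex[2 * sizeof(key) + 1];
        char back[sizeof(key)];

        to_hex(hex, key, sizeof(key));
        from_hex(back, hex, sizeof(back));
        printf("%s -> round-trips: %d\n", hex, !memcmp(key, back, sizeof(key)));
        return 0;
}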
+ */ +static int ecryptfs_calculate_md5(char *dst, + struct ecryptfs_crypt_stat *crypt_stat, + char *src, int len) +{ + struct scatterlist sg; + struct hash_desc desc = { + .tfm = crypt_stat->hash_tfm, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + mutex_lock(&crypt_stat->cs_hash_tfm_mutex); + sg_init_one(&sg, (u8 *)src, len); + if (!desc.tfm) { + desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) { + rc = PTR_ERR(desc.tfm); + ecryptfs_printk(KERN_ERR, "Error attempting to " + "allocate crypto context; rc = [%d]\n", + rc); + goto out; + } + crypt_stat->hash_tfm = desc.tfm; + } + rc = crypto_hash_init(&desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } + rc = crypto_hash_update(&desc, &sg, len); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } + rc = crypto_hash_final(&desc, dst); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } +out: + mutex_unlock(&crypt_stat->cs_hash_tfm_mutex); + return rc; +} + +static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, + char *cipher_name, + char *chaining_modifier) +{ + int cipher_name_len = strlen(cipher_name); + int chaining_modifier_len = strlen(chaining_modifier); + int algified_name_len; + int rc; + + algified_name_len = (chaining_modifier_len + cipher_name_len + 3); + (*algified_name) = kmalloc(algified_name_len, GFP_KERNEL); + if (!(*algified_name)) { + rc = -ENOMEM; + goto out; + } + snprintf((*algified_name), algified_name_len, "%s(%s)", + chaining_modifier, cipher_name); + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_derive_iv + * @iv: destination for the derived iv vale + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @offset: Offset of the extent whose IV we are to derive + * + * Generate the initialization vector from the given root IV and page + * offset. + * + * Returns zero on success; non-zero on error. + */ +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + char src[ECRYPTFS_MAX_IV_BYTES + 16]; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "root iv:\n"); + ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes); + } + /* TODO: It is probably secure to just cast the least + * significant bits of the root IV into an unsigned long and + * add the offset to that rather than go through all this + * hashing business. -Halcrow */ + memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes); + memset((src + crypt_stat->iv_bytes), 0, 16); + snprintf((src + crypt_stat->iv_bytes), 16, "%lld", offset); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "source:\n"); + ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, src, + (crypt_stat->iv_bytes + 16)); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating IV for a page\n"); + goto out; + } + memcpy(iv, dst, crypt_stat->iv_bytes); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); + ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); + } +out: + return rc; +} + +/** + * ecryptfs_init_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Initialize the crypt_stat structure. 
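ecryptfs_derive_iv() above hashes the root IV concatenated with a 16-byte, NUL-padded decimal rendering of the extent offset, then takes the leading iv_bytes of the MD5 digest as the extent IV. The sketch below only builds that hash source buffer (the MD5 step is left out so the example stays dependency-free); the layout mirrors the memcpy/memset/snprintf sequence in the function, and the 16-byte IV length is an illustrative stand-in for crypt_stat->iv_bytes.

#include <stdio.h>
#include <string.h>

#define IV_BYTES 16     /* illustrative; crypt_stat->iv_bytes in the driver */

static void build_iv_source(char *src, const char *root_iv, long long offset)
{
        memcpy(src, root_iv, IV_BYTES);
        memset(src + IV_BYTES, 0, 16);
        snprintf(src + IV_BYTES, 16, "%lld", offset);   /* decimal, NUL padded */
}

int main(void)
{
        char root_iv[IV_BYTES];
        char src[IV_BYTES + 16];

        memset(root_iv, 'R', sizeof(root_iv));          /* stand-in root IV */
        build_iv_source(src, root_iv, 42);              /* extent offset 42 */
        for (size_t i = 0; i < sizeof(src); i++)
                printf("%02x%c", (unsigned char)src[i],
                       (i + 1) % 16 ? ' ' : '\n');
        return 0;
}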
+ */ +void +ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); + INIT_LIST_HEAD(&crypt_stat->keysig_list); + mutex_init(&crypt_stat->keysig_list_mutex); + mutex_init(&crypt_stat->cs_mutex); + mutex_init(&crypt_stat->cs_tfm_mutex); + mutex_init(&crypt_stat->cs_hash_tfm_mutex); + crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; +} + +/** + * ecryptfs_destroy_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Releases all memory associated with a crypt_stat struct. + */ +void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + struct ecryptfs_key_sig *key_sig, *key_sig_tmp; + + if (crypt_stat->tfm) + crypto_free_blkcipher(crypt_stat->tfm); + if (crypt_stat->hash_tfm) + crypto_free_hash(crypt_stat->hash_tfm); + list_for_each_entry_safe(key_sig, key_sig_tmp, + &crypt_stat->keysig_list, crypt_stat_list) { + list_del(&key_sig->crypt_stat_list); + kmem_cache_free(ecryptfs_key_sig_cache, key_sig); + } + memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); +} + +void ecryptfs_destroy_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *auth_tok, *auth_tok_tmp; + + if (!(mount_crypt_stat->flags & ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED)) + return; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry_safe(auth_tok, auth_tok_tmp, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + list_del(&auth_tok->mount_crypt_stat_list); + if (auth_tok->global_auth_tok_key + && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) + key_put(auth_tok->global_auth_tok_key); + kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok); + } + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); +} + +/** + * virt_to_scatterlist + * @addr: Virtual address + * @size: Size of data; should be an even multiple of the block size + * @sg: Pointer to scatterlist array; set to NULL to obtain only + * the number of scatterlist structs required in array + * @sg_size: Max array size + * + * Fills in a scatterlist array with page references for a passed + * virtual address. + * + * Returns the number of scatterlist structs in array used + */ +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size) +{ + int i = 0; + struct page *pg; + int offset; + int remainder_of_page; + + sg_init_table(sg, sg_size); + + while (size > 0 && i < sg_size) { + pg = virt_to_page(addr); + offset = offset_in_page(addr); + if (sg) + sg_set_page(&sg[i], pg, 0, offset); + remainder_of_page = PAGE_CACHE_SIZE - offset; + if (size >= remainder_of_page) { + if (sg) + sg[i].length = remainder_of_page; + addr += remainder_of_page; + size -= remainder_of_page; + } else { + if (sg) + sg[i].length = size; + addr += size; + size = 0; + } + i++; + } + if (size > 0) + return -ENOMEM; + return i; +} + +/** + * encrypt_scatterlist + * @crypt_stat: Pointer to the crypt_stat struct to initialize. 
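virt_to_scatterlist() above walks a virtual range and emits one scatterlist entry per page-bounded segment, so no entry ever crosses a page boundary and each one can be described by (page, offset, length). A standalone sketch of that walk; the page size and addresses are illustrative.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        uintptr_t addr = 0x1000 + 3000;   /* hypothetical virtual address */
        long size = 6000;                 /* bytes to describe */
        int i = 0;

        while (size > 0) {
                unsigned long offset = addr & (PAGE_SIZE - 1);
                unsigned long remainder_of_page = PAGE_SIZE - offset;
                unsigned long len = size >= (long)remainder_of_page ?
                                    remainder_of_page : (unsigned long)size;

                printf("sg[%d]: page %#lx offset %lu length %lu\n",
                       i++, (unsigned long)(addr & ~(PAGE_SIZE - 1)), offset, len);
                addr += len;
                size -= len;
        }
        return 0;
}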
+ * @dest_sg: Destination of encrypted data + * @src_sg: Data to be encrypted + * @size: Length of data to be encrypted + * @iv: iv to use during encryption + * + * Returns the number of bytes encrypted; negative value on error + */ +static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dest_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv) +{ + struct blkcipher_desc desc = { + .tfm = crypt_stat->tfm, + .info = iv, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + BUG_ON(!crypt_stat || !crypt_stat->tfm + || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } + /* Consider doing this once, when the file is opened */ + mutex_lock(&crypt_stat->cs_tfm_mutex); + if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { + rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + crypt_stat->flags |= ECRYPTFS_KEY_SET; + } + if (rc) { + ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); + crypto_blkcipher_encrypt_iv(&desc, dest_sg, src_sg, size); + mutex_unlock(&crypt_stat->cs_tfm_mutex); +out: + return rc; +} + +/** + * ecryptfs_lower_offset_for_extent + * + * Convert an eCryptfs page index into a lower byte offset + */ +static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, + struct ecryptfs_crypt_stat *crypt_stat) +{ + (*offset) = ecryptfs_lower_header_size(crypt_stat) + + (crypt_stat->extent_size * extent_num); +} + +/** + * ecryptfs_encrypt_extent + * @enc_extent_page: Allocated page into which to encrypt the data in + * @page + * @crypt_stat: crypt_stat containing cryptographic context for the + * encryption operation + * @page: Page containing plaintext data extent to encrypt + * @extent_offset: Page extent offset for use in generating IV + * + * Encrypts one extent of data. + * + * Return zero on success; non-zero otherwise + */ +static int ecryptfs_encrypt_extent(struct page *enc_extent_page, + struct ecryptfs_crypt_stat *crypt_stat, + struct page *page, + unsigned long extent_offset) +{ + loff_t extent_base; + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + int rc; + + extent_base = (((loff_t)page->index) + * (PAGE_CACHE_SIZE / crypt_stat->extent_size)); + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (extent_base + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); + goto out; + } + rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, + page, (extent_offset + * crypt_stat->extent_size), + crypt_stat->extent_size, extent_iv); + if (rc < 0) { + printk(KERN_ERR "%s: Error attempting to encrypt page with " + "page->index = [%ld], extent_offset = [%ld]; " + "rc = [%d]\n", __func__, page->index, extent_offset, + rc); + goto out; + } + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_encrypt_page + * @page: Page mapped from the eCryptfs inode for the file; contains + * decrypted content that needs to be encrypted (to a temporary + * page; not in place) and written out to the lower file + * + * Encrypt an eCryptfs page. This is done on a per-extent basis. 
Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_encrypt_page(struct page *page) +{ + struct inode *ecryptfs_inode; + struct ecryptfs_crypt_stat *crypt_stat; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; + loff_t extent_offset; + int rc = 0; + + ecryptfs_inode = page->mapping->host; + crypt_stat = + &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error allocating memory for " + "encrypted extent\n"); + goto out; + } + enc_extent_virt = kmap(enc_extent_page); + for (extent_offset = 0; + extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset++) { + loff_t offset; + + rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page, + extent_offset); + if (rc) { + printk(KERN_ERR "%s: Error encrypting extent; " + "rc = [%d]\n", __func__, rc); + goto out; + } + ecryptfs_lower_offset_for_extent( + &offset, ((((loff_t)page->index) + * (PAGE_CACHE_SIZE + / crypt_stat->extent_size)) + + extent_offset), crypt_stat); + rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, + offset, crypt_stat->extent_size); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Error attempting " + "to write lower page; rc = [%d]" + "\n", rc); + goto out; + } + } + rc = 0; +out: + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } + return rc; +} + +static int ecryptfs_decrypt_extent(struct page *page, + struct ecryptfs_crypt_stat *crypt_stat, + struct page *enc_extent_page, + unsigned long extent_offset) +{ + loff_t extent_base; + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + int rc; + + extent_base = (((loff_t)page->index) + * (PAGE_CACHE_SIZE / crypt_stat->extent_size)); + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (extent_base + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); + goto out; + } + rc = ecryptfs_decrypt_page_offset(crypt_stat, page, + (extent_offset + * crypt_stat->extent_size), + enc_extent_page, 0, + crypt_stat->extent_size, extent_iv); + if (rc < 0) { + printk(KERN_ERR "%s: Error attempting to decrypt to page with " + "page->index = [%ld], extent_offset = [%ld]; " + "rc = [%d]\n", __func__, page->index, extent_offset, + rc); + goto out; + } + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_decrypt_page + * @page: Page mapped from the eCryptfs inode for the file; data read + * and decrypted from the lower file will be written into this + * page + * + * Decrypt an eCryptfs page. This is done on a per-extent basis. 
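The per-extent loop above maps an eCryptfs page to lower-file byte offsets as header size plus extent_size times a global extent number, where that number is page->index * (PAGE_CACHE_SIZE / extent_size) + extent_offset. A small worked example of the arithmetic; the 8192-byte page, 4096-byte extent and 8192-byte header are illustrative values chosen to match the straddling scenario described in the comments.

#include <stdio.h>

int main(void)
{
        const long long extent_size = 4096;
        const long long header_size = 8192;   /* ecryptfs_lower_header_size() */
        const long long page_size = 8192;     /* host with larger pages */
        long long page_index = 1;

        long long extents_per_page = page_size / extent_size;
        for (long long eo = 0; eo < extents_per_page; eo++) {
                long long extent_num = page_index * extents_per_page + eo;
                long long lower_off = header_size + extent_size * extent_num;

                printf("page %lld, extent %lld -> lower offset %lld\n",
                       page_index, extent_num, lower_off);
        }
        return 0;
}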
Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_decrypt_page(struct page *page) +{ + struct inode *ecryptfs_inode; + struct ecryptfs_crypt_stat *crypt_stat; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; + unsigned long extent_offset; + int rc = 0; + + ecryptfs_inode = page->mapping->host; + crypt_stat = + &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error allocating memory for " + "encrypted extent\n"); + goto out; + } + enc_extent_virt = kmap(enc_extent_page); + for (extent_offset = 0; + extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset++) { + loff_t offset; + + ecryptfs_lower_offset_for_extent( + &offset, ((page->index * (PAGE_CACHE_SIZE + / crypt_stat->extent_size)) + + extent_offset), crypt_stat); + rc = ecryptfs_read_lower(enc_extent_virt, offset, + crypt_stat->extent_size, + ecryptfs_inode); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Error attempting " + "to read lower page; rc = [%d]" + "\n", rc); + goto out; + } + rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page, + extent_offset); + if (rc) { + printk(KERN_ERR "%s: Error encrypting extent; " + "rc = [%d]\n", __func__, rc); + goto out; + } + } +out: + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } + return rc; +} + +/** + * decrypt_scatterlist + * @crypt_stat: Cryptographic context + * @dest_sg: The destination scatterlist to decrypt into + * @src_sg: The source scatterlist to decrypt from + * @size: The number of bytes to decrypt + * @iv: The initialization vector to use for the decryption + * + * Returns the number of bytes decrypted; negative value on error + */ +static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dest_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv) +{ + struct blkcipher_desc desc = { + .tfm = crypt_stat->tfm, + .info = iv, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + /* Consider doing this once, when the file is opened */ + mutex_lock(&crypt_stat->cs_tfm_mutex); + rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size); + rc = crypto_blkcipher_decrypt_iv(&desc, dest_sg, src_sg, size); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n", + rc); + goto out; + } + rc = size; +out: + return rc; +} + +/** + * ecryptfs_encrypt_page_offset + * @crypt_stat: The cryptographic context + * @dst_page: The page to encrypt into + * @dst_offset: The offset in the page to encrypt into + * @src_page: The page to encrypt from + * @src_offset: The offset in the page to encrypt from + * @size: The number of bytes to encrypt + * @iv: The 
initialization vector to use for the encryption + * + * Returns the number of bytes encrypted + */ +static int +ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv) +{ + struct scatterlist src_sg, dst_sg; + + sg_init_table(&src_sg, 1); + sg_init_table(&dst_sg, 1); + + sg_set_page(&src_sg, src_page, size, src_offset); + sg_set_page(&dst_sg, dst_page, size, dst_offset); + return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); +} + +/** + * ecryptfs_decrypt_page_offset + * @crypt_stat: The cryptographic context + * @dst_page: The page to decrypt into + * @dst_offset: The offset in the page to decrypt into + * @src_page: The page to decrypt from + * @src_offset: The offset in the page to decrypt from + * @size: The number of bytes to decrypt + * @iv: The initialization vector to use for the decryption + * + * Returns the number of bytes decrypted + */ +static int +ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv) +{ + struct scatterlist src_sg, dst_sg; + + sg_init_table(&src_sg, 1); + sg_set_page(&src_sg, src_page, size, src_offset); + + sg_init_table(&dst_sg, 1); + sg_set_page(&dst_sg, dst_page, size, dst_offset); + + return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); +} + +#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 + +/** + * ecryptfs_init_crypt_ctx + * @crypt_stat: Uninitialized crypt stats structure + * + * Initialize the crypto context. + * + * TODO: Performance: Keep a cache of initialized cipher contexts; + * only init if needed + */ +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) +{ + char *full_alg_name; + int rc = -EINVAL; + + if (!crypt_stat->cipher) { + ecryptfs_printk(KERN_ERR, "No cipher specified\n"); + goto out; + } + ecryptfs_printk(KERN_DEBUG, + "Initializing cipher [%s]; strlen = [%d]; " + "key_size_bits = [%zd]\n", + crypt_stat->cipher, (int)strlen(crypt_stat->cipher), + crypt_stat->key_size << 3); + if (crypt_stat->tfm) { + rc = 0; + goto out; + } + mutex_lock(&crypt_stat->cs_tfm_mutex); + rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, + crypt_stat->cipher, "cbc"); + if (rc) + goto out_unlock; + crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0, + CRYPTO_ALG_ASYNC); + kfree(full_alg_name); + if (IS_ERR(crypt_stat->tfm)) { + rc = PTR_ERR(crypt_stat->tfm); + crypt_stat->tfm = NULL; + ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " + "Error initializing cipher [%s]\n", + crypt_stat->cipher); + goto out_unlock; + } + crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + rc = 0; +out_unlock: + mutex_unlock(&crypt_stat->cs_tfm_mutex); +out: + return rc; +} + +static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat) +{ + int extent_size_tmp; + + crypt_stat->extent_mask = 0xFFFFFFFF; + crypt_stat->extent_shift = 0; + if (crypt_stat->extent_size == 0) + return; + extent_size_tmp = crypt_stat->extent_size; + while ((extent_size_tmp & 0x01) == 0) { + extent_size_tmp >>= 1; + crypt_stat->extent_mask <<= 1; + crypt_stat->extent_shift++; + } +} + +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) +{ + /* Default values; may be overwritten as we are parsing the + * packets. 
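Two of the helpers above are easy to demonstrate on their own: ecryptfs_crypto_api_algify_cipher_name() composes the crypto API template name as "chaining(cipher)" (for example "cbc(aes)"), and set_extent_mask_and_shift() derives a mask and shift from a power-of-two extent size. A standalone sketch of both:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *algify(const char *chaining, const char *cipher)
{
        size_t len = strlen(chaining) + strlen(cipher) + 3;  /* "()" + NUL */
        char *name = malloc(len);

        if (name)
                snprintf(name, len, "%s(%s)", chaining, cipher);
        return name;
}

int main(void)
{
        char *name = algify("cbc", "aes");
        unsigned int extent_size = 4096, tmp = extent_size;
        unsigned int mask = 0xFFFFFFFF, shift = 0;

        while ((tmp & 0x01) == 0) {     /* same walk as set_extent_mask_and_shift() */
                tmp >>= 1;
                mask <<= 1;
                shift++;
        }
        printf("alg=%s extent_size=%u mask=%#x shift=%u\n",
               name ? name : "(oom)", extent_size, mask, shift);
        free(name);
        return 0;
}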
*/ + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; + set_extent_mask_and_shift(crypt_stat); + crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + else { + if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) + crypt_stat->metadata_size = + ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + else + crypt_stat->metadata_size = PAGE_CACHE_SIZE; + } +} + +/** + * ecryptfs_compute_root_iv + * @crypt_stats + * + * On error, sets the root IV to all 0's. + */ +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + + BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); + BUG_ON(crypt_stat->iv_bytes <= 0); + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Session key not valid; " + "cannot generate root IV\n"); + goto out; + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating root IV\n"); + goto out; + } + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); +out: + if (rc) { + memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); + crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + } + return rc; +} + +static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) +{ + get_random_bytes(crypt_stat->key, crypt_stat->key_size); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + ecryptfs_compute_root_iv(crypt_stat); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +} + +/** + * ecryptfs_copy_mount_wide_flags_to_inode_flags + * @crypt_stat: The inode's cryptographic context + * @mount_crypt_stat: The mount point's cryptographic context + * + * This function propagates the mount-wide flags to individual inode + * flags. 
+ */ +static void ecryptfs_copy_mount_wide_flags_to_inode_flags( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; + else if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; + } +} + +static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + int rc = 0; + + mutex_lock(&crypt_stat->keysig_list_mutex); + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + + list_for_each_entry(global_auth_tok, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; + rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); + if (rc) { + printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); + goto out; + } + } + +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + mutex_unlock(&crypt_stat->keysig_list_mutex); + return rc; +} + +/** + * ecryptfs_set_default_crypt_stat_vals + * @crypt_stat: The inode's cryptographic context + * @mount_crypt_stat: The mount point's cryptographic context + * + * Default values in the event that policy does not override them. + */ +static void ecryptfs_set_default_crypt_stat_vals( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + ecryptfs_set_default_sizes(crypt_stat); + strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER); + crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES; + crypt_stat->flags &= ~(ECRYPTFS_KEY_VALID); + crypt_stat->file_version = ECRYPTFS_FILE_VERSION; + crypt_stat->mount_crypt_stat = mount_crypt_stat; +} + +/** + * ecryptfs_new_file_context + * @ecryptfs_dentry: The eCryptfs dentry + * + * If the crypto context for the file has not yet been established, + * this is where we do that. Establishing a new crypto context + * involves the following decisions: + * - What cipher to use? + * - What set of authentication tokens to use? + * Here we just worry about getting enough information into the + * authentication tokens so that we know that they are available. + * We associate the available authentication tokens with the new file + * via the set of signatures in the crypt_stat struct. Later, when + * the headers are actually written out, we may again defer to + * userspace to perform the encryption of the session key; for the + * foreseeable future, this will be the case with public key packets. 
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + int cipher_name_len; + int rc = 0; + + ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat); + crypt_stat->flags |= (ECRYPTFS_ENCRYPTED | ECRYPTFS_KEY_VALID); + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + rc = ecryptfs_copy_mount_wide_sigs_to_inode_sigs(crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "Error attempting to copy mount-wide key sigs " + "to the inode key sigs; rc = [%d]\n", rc); + goto out; + } + cipher_name_len = + strlen(mount_crypt_stat->global_default_cipher_name); + memcpy(crypt_stat->cipher, + mount_crypt_stat->global_default_cipher_name, + cipher_name_len); + crypt_stat->cipher[cipher_name_len] = '\0'; + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + ecryptfs_generate_new_key(crypt_stat); + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + ecryptfs_printk(KERN_ERR, "Error initializing cryptographic " + "context for cipher [%s]: rc = [%d]\n", + crypt_stat->cipher, rc); +out: + return rc; +} + +/** + * ecryptfs_validate_marker - check for the ecryptfs marker + * @data: The data block in which to check + * + * Returns zero if marker found; -EINVAL if not found + */ +static int ecryptfs_validate_marker(char *data) +{ + u32 m_1, m_2; + + m_1 = get_unaligned_be32(data); + m_2 = get_unaligned_be32(data + 4); + if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) + return 0; + ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " + "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, + MAGIC_ECRYPTFS_MARKER); + ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " + "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); + return -EINVAL; +} + +struct ecryptfs_flag_map_elem { + u32 file_flag; + u32 local_flag; +}; + +/* Add support for additional flags by adding elements here. 
*/ +static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { + {0x00000001, ECRYPTFS_ENABLE_HMAC}, + {0x00000002, ECRYPTFS_ENCRYPTED}, + {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, + {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} +}; + +/** + * ecryptfs_process_flags + * @crypt_stat: The cryptographic context + * @page_virt: Source data to be parsed + * @bytes_read: Updated with the number of bytes read + * + * Returns zero on success; non-zero if the flag set is invalid + */ +static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, + char *page_virt, int *bytes_read) +{ + int rc = 0; + int i; + u32 flags; + + flags = get_unaligned_be32(page_virt); + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (flags & ecryptfs_flag_map[i].file_flag) { + crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; + } else + crypt_stat->flags &= ~(ecryptfs_flag_map[i].local_flag); + /* Version is in top 8 bits of the 32-bit flag vector */ + crypt_stat->file_version = ((flags >> 24) & 0xFF); + (*bytes_read) = 4; + return rc; +} + +/** + * write_ecryptfs_marker + * @page_virt: The pointer to in a page to begin writing the marker + * @written: Number of bytes written + * + * Marker = 0x3c81b7f5 + */ +static void write_ecryptfs_marker(char *page_virt, size_t *written) +{ + u32 m_1, m_2; + + get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); + put_unaligned_be32(m_1, page_virt); + page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2); + put_unaligned_be32(m_2, page_virt); + (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; +} + +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 flags = 0; + int i; + + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) + flags |= ecryptfs_flag_map[i].file_flag; + /* Version is in top 8 bits of the 32-bit flag vector */ + flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); + put_unaligned_be32(flags, page_virt); + (*written) = 4; +} + +struct ecryptfs_cipher_code_str_map_elem { + char cipher_str[16]; + u8 cipher_code; +}; + +/* Add support for additional ciphers by adding elements here. The + * cipher_code is whatever OpenPGP applicatoins use to identify the + * ciphers. List in order of probability. 
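The marker and flags handling above can be shown in a few lines: the on-disk marker is a random 32-bit value m_1 followed by m_1 ^ MAGIC_ECRYPTFS_MARKER, and the 32-bit flags word keeps the file format version in its top 8 bits alongside per-file flag bits such as 0x00000002 for "encrypted". The version value and the use of rand() below are purely illustrative; the driver uses get_random_bytes().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5u

int main(void)
{
        uint32_t m_1 = (uint32_t)rand();                /* driver: get_random_bytes() */
        uint32_t m_2 = m_1 ^ MAGIC_ECRYPTFS_MARKER;
        uint32_t flags = 0x00000002;                    /* "encrypted" file flag */
        uint8_t version = 3;                            /* illustrative version */

        flags |= ((uint32_t)version << 24) & 0xFF000000;

        printf("marker valid: %d\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2);
        printf("flags=%#010x -> version=%u encrypted=%u\n",
               (unsigned)flags, (unsigned)((flags >> 24) & 0xFF),
               (flags & 0x00000002) ? 1 : 0);
        return 0;
}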
*/ +static struct ecryptfs_cipher_code_str_map_elem +ecryptfs_cipher_code_str_map[] = { + {"aes",RFC2440_CIPHER_AES_128 }, + {"blowfish", RFC2440_CIPHER_BLOWFISH}, + {"des3_ede", RFC2440_CIPHER_DES3_EDE}, + {"cast5", RFC2440_CIPHER_CAST_5}, + {"twofish", RFC2440_CIPHER_TWOFISH}, + {"cast6", RFC2440_CIPHER_CAST_6}, + {"aes", RFC2440_CIPHER_AES_192}, + {"aes", RFC2440_CIPHER_AES_256} +}; + +/** + * ecryptfs_code_for_cipher_string + * @cipher_name: The string alias for the cipher + * @key_bytes: Length of key in bytes; used for AES code selection + * + * Returns zero on no match, or the cipher code on match + */ +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) +{ + int i; + u8 code = 0; + struct ecryptfs_cipher_code_str_map_elem *map = + ecryptfs_cipher_code_str_map; + + if (strcmp(cipher_name, "aes") == 0) { + switch (key_bytes) { + case 16: + code = RFC2440_CIPHER_AES_128; + break; + case 24: + code = RFC2440_CIPHER_AES_192; + break; + case 32: + code = RFC2440_CIPHER_AES_256; + } + } else { + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (strcmp(cipher_name, map[i].cipher_str) == 0) { + code = map[i].cipher_code; + break; + } + } + return code; +} + +/** + * ecryptfs_cipher_code_to_string + * @str: Destination to write out the cipher name + * @cipher_code: The code to convert to cipher name string + * + * Returns zero on success + */ +int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code) +{ + int rc = 0; + int i; + + str[0] = '\0'; + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code) + strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str); + if (str[0] == '\0') { + ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: " + "[%d]\n", cipher_code); + rc = -EINVAL; + } + return rc; +} + +int ecryptfs_read_and_validate_header_region(struct inode *inode) +{ + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; + int rc; + + rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES, + inode); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); + return rc; +} + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = (u32)crypt_stat->extent_size; + num_header_extents_at_front = + (u16)(crypt_stat->metadata_size / crypt_stat->extent_size); + put_unaligned_be32(header_extent_size, virt); + virt += 4; + put_unaligned_be16(num_header_extents_at_front, virt); + (*written) = 6; +} + +struct kmem_cache *ecryptfs_header_cache; + +/** + * ecryptfs_write_headers_virt + * @page_virt: The virtual address to write the headers to + * @max: The size of memory allocated at page_virt + * @size: Set to the number of bytes written by this function + * @crypt_stat: The cryptographic context + * @ecryptfs_dentry: The eCryptfs dentry + * + * Format version: 1 + * + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? 
+ * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + * Data Extent 0: + * Lower data (CBC encrypted) + * Data Extent 1: + * Lower data (CBC encrypted) + * ... + * + * Returns zero on success + */ +static int ecryptfs_write_headers_virt(char *page_virt, size_t max, + size_t *size, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry) +{ + int rc; + size_t written; + size_t offset; + + offset = ECRYPTFS_FILE_SIZE_BYTES; + write_ecryptfs_marker((page_virt + offset), &written); + offset += written; + ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat, + &written); + offset += written; + ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, + &written); + offset += written; + rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat, + ecryptfs_dentry, &written, + max - offset); + if (rc) + ecryptfs_printk(KERN_WARNING, "Error generating key packet " + "set; rc = [%d]\n", rc); + if (size) { + offset += written; + *size = offset; + } + return rc; +} + +static int +ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, + char *virt, size_t virt_len) +{ + int rc; + + rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + 0, virt_len); + if (rc < 0) + printk(KERN_ERR "%s: Error attempting to write header " + "information to lower file; rc = [%d]\n", __func__, rc); + else + rc = 0; + return rc; +} + +static int +ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + char *page_virt, size_t size) +{ + int rc; + + rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, + size, 0); + return rc; +} + +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +/** + * ecryptfs_write_metadata + * @ecryptfs_dentry: The eCryptfs dentry + * + * Write the file headers out. This will likely involve a userspace + * callout, in which the session key is encrypted with one or more + * public keys and/or the passphrase necessary to do the encryption is + * retrieved via a prompt. Exactly what happens at this point should + * be policy-dependent. 
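ecryptfs_write_header_metadata() above emits six bytes: a big-endian u32 header extent size followed by a big-endian u16 count of header extents at the front of the file; parse_header_metadata() later multiplies the two to obtain metadata_size. A standalone sketch of that encode/decode (the two-extent, 8K header matches the example used in the comments above):

#include <stdint.h>
#include <stdio.h>

static void put_be32(unsigned char *p, uint32_t v)
{
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

static void put_be16(unsigned char *p, uint16_t v)
{
        p[0] = v >> 8; p[1] = v;
}

int main(void)
{
        unsigned char virt[6];
        uint32_t header_extent_size = 4096;
        uint16_t num_header_extents_at_front = 2;       /* an 8K header */

        put_be32(virt, header_extent_size);
        put_be16(virt + 4, num_header_extents_at_front);

        uint32_t sz = ((uint32_t)virt[0] << 24) | ((uint32_t)virt[1] << 16) |
                      ((uint32_t)virt[2] << 8) | virt[3];
        uint16_t n = (uint16_t)((virt[4] << 8) | virt[5]);

        printf("metadata_size = %u bytes\n", (unsigned)(sz * n));
        return 0;
}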
+ * + * Returns zero on success; non-zero on error + */ +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + unsigned int order; + char *virt; + size_t virt_len; + size_t size = 0; + int rc = 0; + + if (likely(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + printk(KERN_ERR "Key is invalid; bailing out\n"); + rc = -EINVAL; + goto out; + } + } else { + printk(KERN_WARNING "%s: Encrypted flag not set\n", + __func__); + rc = -EINVAL; + goto out; + } + virt_len = crypt_stat->metadata_size; + order = get_order(virt_len); + /* Released in this function */ + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); + if (!virt) { + printk(KERN_ERR "%s: Out of memory\n", __func__); + rc = -ENOMEM; + goto out; + } + /* Zeroed page ensures the in-header unencrypted i_size is set to 0 */ + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); + if (unlikely(rc)) { + printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", + __func__, rc); + goto out_free; + } + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); + else + rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + virt_len); + if (rc) { + printk(KERN_ERR "%s: Error writing metadata out to lower file; " + "rc = [%d]\n", __func__, rc); + goto out_free; + } +out_free: + free_pages((unsigned long)virt, order); +out: + return rc; +} + +#define ECRYPTFS_DONT_VALIDATE_HEADER_SIZE 0 +#define ECRYPTFS_VALIDATE_HEADER_SIZE 1 +static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, + char *virt, int *bytes_read, + int validate_header_size) +{ + int rc = 0; + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = get_unaligned_be32(virt); + virt += sizeof(__be32); + num_header_extents_at_front = get_unaligned_be16(virt); + crypt_stat->metadata_size = (((size_t)num_header_extents_at_front + * (size_t)header_extent_size)); + (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); + if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) + && (crypt_stat->metadata_size + < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { + rc = -EINVAL; + printk(KERN_WARNING "Invalid header size: [%zd]\n", + crypt_stat->metadata_size); + } + return rc; +} + +/** + * set_default_header_data + * @crypt_stat: The cryptographic context + * + * For version 0 file format; this function is only for backwards + * compatibility for files created with the prior versions of + * eCryptfs. 
+ */ +static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) +{ + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; +} + +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; + u64 file_size; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = + &ecryptfs_superblock_to_private(inode->i_sb)->mount_crypt_stat; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { + file_size = i_size_read(ecryptfs_inode_to_lower(inode)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + file_size += crypt_stat->metadata_size; + } else + file_size = get_unaligned_be64(page_virt); + i_size_write(inode, (loff_t)file_size); + crypt_stat->flags |= ECRYPTFS_I_SIZE_INITIALIZED; +} + +/** + * ecryptfs_read_headers_virt + * @page_virt: The virtual address into which to read the headers + * @crypt_stat: The cryptographic context + * @ecryptfs_dentry: The eCryptfs dentry + * @validate_header_size: Whether to validate the header size while reading + * + * Read/parse the header data. The header format is detailed in the + * comment block for the ecryptfs_write_headers_virt() function. + * + * Returns zero on success + */ +static int ecryptfs_read_headers_virt(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + int validate_header_size) +{ + int rc = 0; + int offset; + int bytes_read; + + ecryptfs_set_default_sizes(crypt_stat); + crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + offset = ECRYPTFS_FILE_SIZE_BYTES; + rc = ecryptfs_validate_marker(page_virt + offset); + if (rc) + goto out; + if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) + ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); + offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; + rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), + &bytes_read); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error processing flags\n"); + goto out; + } + if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) { + ecryptfs_printk(KERN_WARNING, "File version is [%d]; only " + "file version [%d] is supported by this " + "version of eCryptfs\n", + crypt_stat->file_version, + ECRYPTFS_SUPPORTED_FILE_VERSION); + rc = -EINVAL; + goto out; + } + offset += bytes_read; + if (crypt_stat->file_version >= 1) { + rc = parse_header_metadata(crypt_stat, (page_virt + offset), + &bytes_read, validate_header_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error reading header " + "metadata; rc = [%d]\n", rc); + } + offset += bytes_read; + } else + set_default_header_data(crypt_stat); + rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset), + ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_read_xattr_region + * @page_virt: The vitual address into which to read the xattr data + * @ecryptfs_inode: The eCryptfs inode + * + * Attempts to read the crypto metadata from the extended attribute + * region of the lower file. 
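ecryptfs_i_size_init() above recovers the plaintext file size from the first eight header octets with get_unaligned_be64(). A dependency-free sketch of that decode, using a hypothetical header whose recorded size is 65536 bytes:

#include <stdint.h>
#include <stdio.h>

static uint64_t get_be64(const unsigned char *p)
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | p[i];
        return v;
}

int main(void)
{
        /* Octets 0-7 of a hypothetical header: plaintext size 65536. */
        unsigned char page_virt[8] = { 0, 0, 0, 0, 0, 1, 0, 0 };

        printf("i_size = %llu\n", (unsigned long long)get_be64(page_virt));
        return 0;
}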
+ * + * Returns zero on success; non-zero on error + */ +int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode) +{ + struct dentry *lower_dentry = + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; + ssize_t size; + int rc = 0; + + size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME, + page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE); + if (size < 0) { + if (unlikely(ecryptfs_verbosity > 0)) + printk(KERN_INFO "Error attempting to read the [%s] " + "xattr from the lower file; return value = " + "[%zd]\n", ECRYPTFS_XATTR_NAME, size); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode) +{ + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; + int rc; + + rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), + ECRYPTFS_XATTR_NAME, file_size, + ECRYPTFS_SIZE_AND_MARKER_BYTES); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); + return rc; +} + +/** + * ecryptfs_read_metadata + * + * Common entry point for reading file metadata. From here, we could + * retrieve the header information from the header region of the file, + * the xattr region of the file, or some other repostory that is + * stored separately from the file itself. The current implementation + * supports retrieving the metadata information from the file contents + * and from the xattr region. + * + * Returns zero if valid headers found and parsed; non-zero otherwise + */ +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) +{ + int rc = 0; + char *page_virt = NULL; + struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + /* Read the first page from the underlying file */ + page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); + if (!page_virt) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Unable to allocate page_virt\n", + __func__); + goto out; + } + rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, + ecryptfs_inode); + if (rc >= 0) + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_VALIDATE_HEADER_SIZE); + if (rc) { + memset(page_virt, 0, PAGE_CACHE_SIZE); + rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file header region or xattr region, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file xattr region either, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + } + if (crypt_stat->mount_crypt_stat->flags + & ECRYPTFS_XATTR_METADATA_ENABLED) { + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } else { + printk(KERN_WARNING "Attempt to access file with " + "crypto metadata only in the extended attribute " + "region, but eCryptfs was mounted without " + "xattr support enabled. 
eCryptfs will not treat " + "this like an encrypted file, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + } + } +out: + if (page_virt) { + memset(page_virt, 0, PAGE_CACHE_SIZE); + kmem_cache_free(ecryptfs_header_cache, page_virt); + } + return rc; +} + +/** + * ecryptfs_encrypt_filename - encrypt filename + * + * CBC-encrypts the filename. We do not want to encrypt the same + * filename with the same key and IV, which may happen with hard + * links, so we prepend random bits to each filename. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + int rc = 0; + + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + size_t packet_size; + size_t remaining_bytes; + + rc = ecryptfs_write_tag_70_packet( + NULL, NULL, + &filename->encrypted_filename_size, + mount_crypt_stat, NULL, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to get packet " + "size for tag 72; rc = [%d]\n", __func__, + rc); + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename = + kmalloc(filename->encrypted_filename_size, GFP_KERNEL); + if (!filename->encrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + filename->encrypted_filename_size); + rc = -ENOMEM; + goto out; + } + remaining_bytes = filename->encrypted_filename_size; + rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, + &remaining_bytes, + &packet_size, + mount_crypt_stat, + filename->filename, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to generate " + "tag 70 packet; rc = [%d]\n", __func__, + rc); + kfree(filename->encrypted_filename); + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename_size = packet_size; + } else { + printk(KERN_ERR "%s: No support for requested filename " + "encryption method in this release\n", __func__); + rc = -EOPNOTSUPP; + goto out; + } +out: + return rc; +} + +static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, + const char *name, size_t name_size) +{ + int rc = 0; + + (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL); + if (!(*copied_name)) { + rc = -ENOMEM; + goto out; + } + memcpy((void *)(*copied_name), (void *)name, name_size); + (*copied_name)[(name_size)] = '\0'; /* Only for convenience + * in printing out the + * string in debug + * messages */ + (*copied_name_size) = name_size; +out: + return rc; +} + +/** + * ecryptfs_process_key_cipher - Perform key cipher initialization. + * @key_tfm: Crypto context for key material, set by this function + * @cipher_name: Name of the cipher + * @key_size: Size of the key in bytes + * + * Returns zero on success. Any crypto_tfm structs allocated here + * should be released by other functions, such as on a superblock put + * event, regardless of whether this function succeeds for fails. 
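When the header region does not validate, ecryptfs_read_metadata() above falls back to the metadata copy kept in the lower file's ECRYPTFS_XATTR_NAME ("user.ecryptfs") extended attribute. That copy is an ordinary xattr on the lower filesystem, so it can also be inspected from userspace. The sketch below is an illustrative aside rather than part of this patch, and it assumes the lower file's path is known to the caller.

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

/* Report how much crypto metadata sits in the lower file's "user.ecryptfs"
 * xattr -- the same data ecryptfs_read_xattr_region() reads in-kernel. */
static void sketch_show_xattr_metadata(const char *lower_path)
{
	char buf[4096];	/* one ECRYPTFS_DEFAULT_EXTENT_SIZE worth */
	ssize_t n;

	n = getxattr(lower_path, "user.ecryptfs", buf, sizeof(buf));
	if (n < 0)
		perror("getxattr");
	else
		printf("user.ecryptfs holds %zd bytes of metadata\n", n);
}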
+ */ +static int +ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, + char *cipher_name, size_t *key_size) +{ + char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; + char *full_alg_name = NULL; + int rc; + + *key_tfm = NULL; + if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { + rc = -EINVAL; + printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " + "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); + goto out; + } + rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, cipher_name, + "ecb"); + if (rc) + goto out; + *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*key_tfm)) { + rc = PTR_ERR(*key_tfm); + printk(KERN_ERR "Unable to allocate crypto cipher with name " + "[%s]; rc = [%d]\n", full_alg_name, rc); + goto out; + } + crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); + if (*key_size == 0) { + struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm); + + *key_size = alg->max_keysize; + } + get_random_bytes(dummy_key, *key_size); + rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); + if (rc) { + printk(KERN_ERR "Error attempting to set key of size [%zd] for " + "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, + rc); + rc = -EINVAL; + goto out; + } +out: + kfree(full_alg_name); + return rc; +} + +struct kmem_cache *ecryptfs_key_tfm_cache; +static struct list_head key_tfm_list; +struct mutex key_tfm_list_mutex; + +int __init ecryptfs_init_crypto(void) +{ + mutex_init(&key_tfm_list_mutex); + INIT_LIST_HEAD(&key_tfm_list); + return 0; +} + +/** + * ecryptfs_destroy_crypto - free all cached key_tfms on key_tfm_list + * + * Called only at module unload time + */ +int ecryptfs_destroy_crypto(void) +{ + struct ecryptfs_key_tfm *key_tfm, *key_tfm_tmp; + + mutex_lock(&key_tfm_list_mutex); + list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list, + key_tfm_list) { + list_del(&key_tfm->key_tfm_list); + if (key_tfm->key_tfm) + crypto_free_blkcipher(key_tfm->key_tfm); + kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm); + } + mutex_unlock(&key_tfm_list_mutex); + return 0; +} + +int +ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, + size_t key_size) +{ + struct ecryptfs_key_tfm *tmp_tfm; + int rc = 0; + + BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); + + tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL); + if (key_tfm != NULL) + (*key_tfm) = tmp_tfm; + if (!tmp_tfm) { + rc = -ENOMEM; + printk(KERN_ERR "Error attempting to allocate from " + "ecryptfs_key_tfm_cache\n"); + goto out; + } + mutex_init(&tmp_tfm->key_tfm_mutex); + strncpy(tmp_tfm->cipher_name, cipher_name, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + tmp_tfm->key_size = key_size; + rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm, + tmp_tfm->cipher_name, + &tmp_tfm->key_size); + if (rc) { + printk(KERN_ERR "Error attempting to initialize key TFM " + "cipher with name = [%s]; rc = [%d]\n", + tmp_tfm->cipher_name, rc); + kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm); + if (key_tfm != NULL) + (*key_tfm) = NULL; + goto out; + } + list_add(&tmp_tfm->key_tfm_list, &key_tfm_list); +out: + return rc; +} + +/** + * ecryptfs_tfm_exists - Search for existing tfm for cipher_name. 
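The list maintained by ecryptfs_add_new_key_tfm() above, and searched by ecryptfs_tfm_exists() and ecryptfs_get_tfm_and_mutex_for_cipher_name() just below, lets callers reuse one crypto context per cipher name instead of allocating a fresh one each time. The following sketch shows the expected consumer side; the surrounding function is hypothetical and assumes the usual ecryptfs_kernel.h, linux/crypto.h and linux/mutex.h includes, but the helpers it calls are the ones defined in this file.

/* Fetch (or lazily create) the shared per-cipher TFM, then serialize use of
 * it with the mutex that travels alongside it on the key_tfm list. */
static int sketch_use_cached_tfm(char *cipher_name, unsigned char *key,
				 size_t key_size)
{
	struct crypto_blkcipher *tfm;
	struct mutex *tfm_mutex;
	int rc;

	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
							cipher_name);
	if (rc)
		return rc;
	mutex_lock(tfm_mutex);
	rc = crypto_blkcipher_setkey(tfm, key, key_size);
	mutex_unlock(tfm_mutex);
	return rc;
}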
+ * @cipher_name: the name of the cipher to search for + * @key_tfm: set to corresponding tfm if found + * + * Searches for cached key_tfm matching @cipher_name + * Must be called with &key_tfm_list_mutex held + * Returns 1 if found, with @key_tfm set + * Returns 0 if not found, with @key_tfm set to NULL + */ +int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm) +{ + struct ecryptfs_key_tfm *tmp_key_tfm; + + BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); + + list_for_each_entry(tmp_key_tfm, &key_tfm_list, key_tfm_list) { + if (strcmp(tmp_key_tfm->cipher_name, cipher_name) == 0) { + if (key_tfm) + (*key_tfm) = tmp_key_tfm; + return 1; + } + } + if (key_tfm) + (*key_tfm) = NULL; + return 0; +} + +/** + * ecryptfs_get_tfm_and_mutex_for_cipher_name + * + * @tfm: set to cached tfm found, or new tfm created + * @tfm_mutex: set to mutex for cached tfm found, or new tfm created + * @cipher_name: the name of the cipher to search for and/or add + * + * Sets pointers to @tfm & @tfm_mutex matching @cipher_name. + * Searches for cached item first, and creates new if not found. + * Returns 0 on success, non-zero if adding new cipher failed + */ +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, + struct mutex **tfm_mutex, + char *cipher_name) +{ + struct ecryptfs_key_tfm *key_tfm; + int rc = 0; + + (*tfm) = NULL; + (*tfm_mutex) = NULL; + + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(cipher_name, &key_tfm)) { + rc = ecryptfs_add_new_key_tfm(&key_tfm, cipher_name, 0); + if (rc) { + printk(KERN_ERR "Error adding new key_tfm to list; " + "rc = [%d]\n", rc); + goto out; + } + } + (*tfm) = key_tfm->key_tfm; + (*tfm_mutex) = &key_tfm->key_tfm_mutex; +out: + mutex_unlock(&key_tfm_list_mutex); + return rc; +} + +/* 64 characters forming a 6-bit target field */ +static unsigned char *portable_filename_chars = ("-.0123456789ABCD" + "EFGHIJKLMNOPQRST" + "UVWXYZabcdefghij" + "klmnopqrstuvwxyz"); + +/* We could either offset on every reverse map or just pad some 0x00's + * at the front here */ +static const unsigned char filename_rev_map[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ + 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ + 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ + 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ + 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ + 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ + 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ + 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ +}; + +/** + * ecryptfs_encode_for_filename + * @dst: Destination location for encoded filename + * @dst_size: Size of the encoded filename in bytes + * @src: Source location for the filename to encode + * @src_size: Size of the source in bytes + */ +void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, + unsigned char *src, size_t src_size) +{ + size_t num_blocks; + size_t block_num = 0; + size_t dst_offset = 0; + unsigned char last_block[3]; + + if (src_size == 0) 
{ + (*dst_size) = 0; + goto out; + } + num_blocks = (src_size / 3); + if ((src_size % 3) == 0) { + memcpy(last_block, (&src[src_size - 3]), 3); + } else { + num_blocks++; + last_block[2] = 0x00; + switch (src_size % 3) { + case 1: + last_block[0] = src[src_size - 1]; + last_block[1] = 0x00; + break; + case 2: + last_block[0] = src[src_size - 2]; + last_block[1] = src[src_size - 1]; + } + } + (*dst_size) = (num_blocks * 4); + if (!dst) + goto out; + while (block_num < num_blocks) { + unsigned char *src_block; + unsigned char dst_block[4]; + + if (block_num == (num_blocks - 1)) + src_block = last_block; + else + src_block = &src[block_num * 3]; + dst_block[0] = ((src_block[0] >> 2) & 0x3F); + dst_block[1] = (((src_block[0] << 4) & 0x30) + | ((src_block[1] >> 4) & 0x0F)); + dst_block[2] = (((src_block[1] << 2) & 0x3C) + | ((src_block[2] >> 6) & 0x03)); + dst_block[3] = (src_block[2] & 0x3F); + dst[dst_offset++] = portable_filename_chars[dst_block[0]]; + dst[dst_offset++] = portable_filename_chars[dst_block[1]]; + dst[dst_offset++] = portable_filename_chars[dst_block[2]]; + dst[dst_offset++] = portable_filename_chars[dst_block[3]]; + block_num++; + } +out: + return; +} + +/** + * ecryptfs_decode_from_filename + * @dst: If NULL, this function only sets @dst_size and returns. If + * non-NULL, this function decodes the encoded octets in @src + * into the memory that @dst points to. + * @dst_size: Set to the size of the decoded string. + * @src: The encoded set of octets to decode. + * @src_size: The size of the encoded set of octets to decode. + */ +static void +ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, + const unsigned char *src, size_t src_size) +{ + u8 current_bit_offset = 0; + size_t src_byte_offset = 0; + size_t dst_byte_offset = 0; + + if (dst == NULL) { + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + (*dst_size) = (((src_size + 1) * 3) / 4); + goto out; + } + while (src_byte_offset < src_size) { + unsigned char src_byte = + filename_rev_map[(int)src[src_byte_offset]]; + + switch (current_bit_offset) { + case 0: + dst[dst_byte_offset] = (src_byte << 2); + current_bit_offset = 6; + break; + case 6: + dst[dst_byte_offset++] |= (src_byte >> 4); + dst[dst_byte_offset] = ((src_byte & 0xF) + << 4); + current_bit_offset = 4; + break; + case 4: + dst[dst_byte_offset++] |= (src_byte >> 2); + dst[dst_byte_offset] = (src_byte << 6); + current_bit_offset = 2; + break; + case 2: + dst[dst_byte_offset++] |= (src_byte); + dst[dst_byte_offset] = 0; + current_bit_offset = 0; + break; + } + src_byte_offset++; + } + (*dst_size) = dst_byte_offset; +out: + return; +} + +/** + * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text + * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @name: The plaintext name + * @length: The length of the plaintext + * @encoded_name: The encypted name + * + * Encrypts and encodes a filename into something that constitutes a + * valid filename for a filesystem, with printable characters. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. 
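The encoder above packs every 3 input bytes into 4 output characters, each drawn from the 64-character set "-.0123456789A-Za-z", and filename_rev_map undoes that mapping on decode. Restated below as a stand-alone helper for a single block (illustrative only, not part of this patch): for example, the padded input bytes {0x01, 0x02, 0x00} yield the 6-bit indices 0, 16, 8, 0 and therefore the characters "-E6-".

static const char sketch_chars[] =
	"-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

/* Pack one 3-byte block into four 6-bit indices, exactly as the dst_block[]
 * arithmetic in ecryptfs_encode_for_filename() does. */
static void sketch_encode_block(const unsigned char src[3], char dst[4])
{
	dst[0] = sketch_chars[(src[0] >> 2) & 0x3F];
	dst[1] = sketch_chars[((src[0] << 4) & 0x30) | ((src[1] >> 4) & 0x0F)];
	dst[2] = sketch_chars[((src[1] << 2) & 0x3C) | ((src[2] >> 6) & 0x03)];
	dst[3] = sketch_chars[src[2] & 0x3F];
}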
+ * + * Returns zero on success; non-zero on otherwise + */ +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size) +{ + size_t encoded_name_no_prefix_size; + int rc = 0; + + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + struct ecryptfs_filename *filename; + + filename = kzalloc(sizeof(*filename), GFP_KERNEL); + if (!filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + sizeof(*filename)); + rc = -ENOMEM; + goto out; + } + filename->filename = (char *)name; + filename->filename_size = name_size; + rc = ecryptfs_encrypt_filename(filename, crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt " + "filename; rc = [%d]\n", __func__, rc); + kfree(filename); + goto out; + } + ecryptfs_encode_for_filename( + NULL, &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + else + (*encoded_name_size) = + (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); + if (!(*encoded_name)) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + (*encoded_name_size)); + rc = -ENOMEM; + kfree(filename->encrypted_filename); + kfree(filename); + goto out; + } + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + memcpy((*encoded_name), + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); + ecryptfs_encode_for_filename( + ((*encoded_name) + + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), + &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name)[(*encoded_name_size)] = '\0'; + } else { + rc = -EOPNOTSUPP; + } + if (rc) { + printk(KERN_ERR "%s: Error attempting to encode " + "encrypted filename; rc = [%d]\n", __func__, + rc); + kfree((*encoded_name)); + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + } + kfree(filename->encrypted_filename); + kfree(filename); + } else { + rc = ecryptfs_copy_filename(encoded_name, + encoded_name_size, + name, name_size); + } +out: + return rc; +} + +/** + * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext + * @plaintext_name: The plaintext name + * @plaintext_name_size: The plaintext name size + * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @name: The filename in cipher text + * @name_size: The cipher text name size + * + * Decrypts and decodes the filename. 
+ * + * Returns zero on error; non-zero otherwise + */ +int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, + size_t *plaintext_name_size, + struct dentry *ecryptfs_dir_dentry, + const char *name, size_t name_size) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; + char *decoded_name; + size_t decoded_name_size; + size_t packet_size; + int rc = 0; + + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { + const char *orig_name = name; + size_t orig_name_size = name_size; + + name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + ecryptfs_decode_from_filename(NULL, &decoded_name_size, + name, name_size); + decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); + if (!decoded_name) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + decoded_name_size); + rc = -ENOMEM; + goto out; + } + ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, + name, name_size); + rc = ecryptfs_parse_tag_70_packet(plaintext_name, + plaintext_name_size, + &packet_size, + mount_crypt_stat, + decoded_name, + decoded_name_size); + if (rc) { + printk(KERN_INFO "%s: Could not parse tag 70 packet " + "from filename; copying through filename " + "as-is\n", __func__); + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + orig_name, orig_name_size); + goto out_free; + } + } else { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } +out_free: + kfree(decoded_name); +out: + return rc; +} diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c new file mode 100644 index 00000000..3d2bdf54 --- /dev/null +++ b/fs/ecryptfs/debug.c @@ -0,0 +1,121 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Functions only useful for debugging. + * + * Copyright (C) 2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_dump_auth_tok - debug function to print auth toks + * + * This function will print the contents of an ecryptfs authentication + * token. 
+ */ +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) +{ + char salt[ECRYPTFS_SALT_SIZE * 2 + 1]; + char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; + + ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n", + auth_tok); + if (auth_tok->flags & ECRYPTFS_PRIVATE_KEY) { + ecryptfs_printk(KERN_DEBUG, " * private key type\n"); + } else { + ecryptfs_printk(KERN_DEBUG, " * passphrase type\n"); + ecryptfs_to_hex(salt, auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + salt[ECRYPTFS_SALT_SIZE * 2] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt); + if (auth_tok->token.password.flags & + ECRYPTFS_PERSISTENT_PASSWORD) { + ecryptfs_printk(KERN_DEBUG, " * persistent\n"); + } + memcpy(sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE_HEX); + sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig); + } + ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n", + auth_tok->session_key.flags); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace decrypt request set\n"); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace encrypt request set\n"); + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.decrypted_key_size = [0x%x]\n", + auth_tok->session_key.decrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Decrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.decrypted_key, + ECRYPTFS_DEFAULT_KEY_BYTES); + } + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.encrypted_key_size = [0x%x]\n", + auth_tok->session_key.encrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Encrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.encrypted_key, + auth_tok->session_key. + encrypted_key_size); + } +} + +/** + * ecryptfs_dump_hex - debug hex printer + * @data: string of bytes to be printed + * @bytes: number of bytes to print + * + * Dump hexadecimal representation of char array + */ +void ecryptfs_dump_hex(char *data, int bytes) +{ + int i = 0; + int add_newline = 1; + + if (ecryptfs_verbosity < 1) + return; + if (bytes != 0) { + printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]); + i++; + } + while (i < bytes) { + printk("0x%.2x.", (unsigned char)data[i]); + i++; + if (i % 16 == 0) { + printk("\n"); + add_newline = 0; + } else + add_newline = 1; + } + if (add_newline) + printk("\n"); +} + diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c new file mode 100644 index 00000000..534c1d46 --- /dev/null +++ b/fs/ecryptfs/dentry.c @@ -0,0 +1,105 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_d_revalidate - revalidate an ecryptfs dentry + * @dentry: The ecryptfs dentry + * @nd: The associated nameidata + * + * Called when the VFS needs to revalidate a dentry. This + * is called whenever a name lookup finds a dentry in the + * dcache. Most filesystems leave this as NULL, because all their + * dentries in the dcache are valid. + * + * Returns 1 if valid, 0 otherwise. + * + */ +static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + struct dentry *dentry_save = NULL; + struct vfsmount *vfsmount_save = NULL; + int rc = 1; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) + goto out; + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + } + rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + } + if (dentry->d_inode) { + struct inode *lower_inode = + ecryptfs_inode_to_lower(dentry->d_inode); + + fsstack_copy_attr_all(dentry->d_inode, lower_inode); + } +out: + return rc; +} + +struct kmem_cache *ecryptfs_dentry_info_cache; + +/** + * ecryptfs_d_release + * @dentry: The ecryptfs dentry + * + * Called when a dentry is really deallocated. + */ +static void ecryptfs_d_release(struct dentry *dentry) +{ + if (ecryptfs_dentry_to_private(dentry)) { + if (ecryptfs_dentry_to_lower(dentry)) { + dput(ecryptfs_dentry_to_lower(dentry)); + mntput(ecryptfs_dentry_to_lower_mnt(dentry)); + } + kmem_cache_free(ecryptfs_dentry_info_cache, + ecryptfs_dentry_to_private(dentry)); + } + return; +} + +const struct dentry_operations ecryptfs_dops = { + .d_revalidate = ecryptfs_d_revalidate, + .d_release = ecryptfs_d_release, +}; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h new file mode 100644 index 00000000..43c7c43b --- /dev/null +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -0,0 +1,771 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Kernel declarations. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Trevor S. Highland + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#ifndef ECRYPTFS_KERNEL_H +#define ECRYPTFS_KERNEL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Version verification for shared data structures w/ userspace */ +#define ECRYPTFS_VERSION_MAJOR 0x00 +#define ECRYPTFS_VERSION_MINOR 0x04 +#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 +/* These flags indicate which features are supported by the kernel + * module; userspace tools such as the mount helper read + * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine + * how to behave. */ +#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 +#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 +#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 +#define ECRYPTFS_VERSIONING_POLICY 0x00000008 +#define ECRYPTFS_VERSIONING_XATTR 0x00000010 +#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 +#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 +#define ECRYPTFS_VERSIONING_HMAC 0x00000080 +#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 +#define ECRYPTFS_VERSIONING_GCM 0x00000200 +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_PUBKEY \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) +#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 +#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH +#define ECRYPTFS_SALT_SIZE 8 +#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) +/* The original signature size is only for what is stored on disk; all + * in-memory representations are expanded hex, so it better adapted to + * be passed around or referenced on the command line */ +#define ECRYPTFS_SIG_SIZE 8 +#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) +#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX +#define ECRYPTFS_MAX_KEY_BYTES 64 +#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 +#define ECRYPTFS_DEFAULT_IV_BYTES 16 +#define ECRYPTFS_FILE_VERSION 0x03 +#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 +#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 +#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 +#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ +#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) +#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 +#define ECRYPTFS_DEFAULT_NUM_USERS 4 +#define ECRYPTFS_MAX_NUM_USERS 32768 +#define ECRYPTFS_XATTR_NAME "user.ecryptfs" + +#define RFC2440_CIPHER_DES3_EDE 0x02 +#define RFC2440_CIPHER_CAST_5 0x03 +#define RFC2440_CIPHER_BLOWFISH 0x04 +#define RFC2440_CIPHER_AES_128 0x07 +#define RFC2440_CIPHER_AES_192 0x08 +#define RFC2440_CIPHER_AES_256 0x09 +#define RFC2440_CIPHER_TWOFISH 0x0a +#define RFC2440_CIPHER_CAST_6 0x0b + +#define RFC2440_CIPHER_RSA 0x01 + +/** + * For convenience, we may need to pass around the encrypted session + * key between kernel and userspace because the authentication token + * may not be extractable. For example, the TPM may not release the + * private key, instead requiring the encrypted data and returning the + * decrypted data. 
+ */ +struct ecryptfs_session_key { +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 +#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 +#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 + u32 flags; + u32 encrypted_key_size; + u32 decrypted_key_size; + u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; + u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; +}; + +struct ecryptfs_password { + u32 password_bytes; + s32 hash_algo; + u32 hash_iterations; + u32 session_key_encryption_key_bytes; +#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 +#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 + u32 flags; + /* Iterated-hash concatenation of salt and passphrase */ + u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + /* Always in expanded hex */ + u8 salt[ECRYPTFS_SALT_SIZE]; +}; + +enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; + +struct ecryptfs_private_key { + u32 key_size; + u32 data_len; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; + u8 data[]; +}; + +/* May be a password or a private key */ +struct ecryptfs_auth_tok { + u16 version; /* 8-bit major and 8-bit minor */ + u16 token_type; +#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 + u32 flags; + struct ecryptfs_session_key session_key; + u8 reserved[32]; + union { + struct ecryptfs_password password; + struct ecryptfs_private_key private_key; + } token; +} __attribute__ ((packed)); + +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); +extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); + +struct ecryptfs_key_record { + unsigned char type; + size_t enc_key_size; + unsigned char sig[ECRYPTFS_SIG_SIZE]; + unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; +}; + +struct ecryptfs_auth_tok_list { + struct ecryptfs_auth_tok *auth_tok; + struct list_head list; +}; + +struct ecryptfs_crypt_stat; +struct ecryptfs_mount_crypt_stat; + +struct ecryptfs_page_crypt_context { + struct page *page; +#define ECRYPTFS_PREPARE_COMMIT_MODE 0 +#define ECRYPTFS_WRITEPAGE_MODE 1 + unsigned int mode; + union { + struct file *lower_file; + struct writeback_control *wbc; + } param; +}; + +static inline struct ecryptfs_auth_tok * +ecryptfs_get_key_payload_data(struct key *key) +{ + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload*)key->payload.data)->data); +} + +#define ECRYPTFS_MAX_KEYSET_SIZE 1024 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 +#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ +#define ECRYPTFS_SALT_BYTES 2 +#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 +#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ +#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) +#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \ + + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) +#define ECRYPTFS_DEFAULT_CIPHER "aes" +#define ECRYPTFS_DEFAULT_KEY_BYTES 16 +#define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH +#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 +#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C +#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED +#define ECRYPTFS_TAG_64_PACKET_TYPE 0x40 +#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 +#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 +#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 +#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename + * as dentry name */ +#define 
ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in + * metadata */ +#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as + * dentry name */ +#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as + * metadata */ +/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= + * ECRYPTFS_MAX_IV_BYTES */ +#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 +#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ +#define MD5_DIGEST_SIZE 16 +#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 +#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) + +struct ecryptfs_key_sig { + struct list_head crypt_stat_list; + char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +struct ecryptfs_filename { + struct list_head crypt_stat_list; +#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 + u32 flags; + u32 seq_no; + char *filename; + char *encrypted_filename; + size_t filename_size; + size_t encrypted_filename_size; + char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; + char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; +}; + +/** + * This is the primary struct associated with each encrypted file. + * + * TODO: cache align/pack? + */ +struct ecryptfs_crypt_stat { +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_ENCRYPTED 0x00000004 +#define ECRYPTFS_SECURITY_WARNING 0x00000008 +#define ECRYPTFS_ENABLE_HMAC 0x00000010 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020 +#define ECRYPTFS_KEY_VALID 0x00000040 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000080 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100 +#define ECRYPTFS_KEY_SET 0x00000200 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800 +#define ECRYPTFS_ENCFN_USE_FEK 0x00001000 +#define ECRYPTFS_UNLINK_SIGS 0x00002000 +#define ECRYPTFS_I_SIZE_INITIALIZED 0x00004000 + u32 flags; + unsigned int file_version; + size_t iv_bytes; + size_t metadata_size; + size_t extent_size; /* Data extent size; default is 4096 */ + size_t key_size; + size_t extent_shift; + unsigned int extent_mask; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct crypto_blkcipher *tfm; + struct crypto_hash *hash_tfm; /* Crypto context for generating + * the initialization vectors */ + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; + unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; + struct list_head keysig_list; + struct mutex keysig_list_mutex; + struct mutex cs_tfm_mutex; + struct mutex cs_hash_tfm_mutex; + struct mutex cs_mutex; +}; + +/* inode private data. */ +struct ecryptfs_inode_info { + struct inode vfs_inode; + struct inode *wii_inode; + struct mutex lower_file_mutex; + atomic_t lower_file_count; + struct file *lower_file; + struct ecryptfs_crypt_stat crypt_stat; +}; + +/* dentry private data. Each dentry must keep track of a lower + * vfsmount too. */ +struct ecryptfs_dentry_info { + struct path lower_path; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/** + * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint + * @flags: Status flags + * @mount_crypt_stat_list: These auth_toks hang off the mount-wide + * cryptographic context. 
Every time a new + * inode comes into existence, eCryptfs copies + * the auth_toks on that list to the set of + * auth_toks on the inode's crypt_stat + * @global_auth_tok_key: The key from the user's keyring for the sig + * @global_auth_tok: The key contents + * @sig: The key identifier + * + * ecryptfs_global_auth_tok structs refer to authentication token keys + * in the user keyring that apply to newly created files. A list of + * these objects hangs off of the mount_crypt_stat struct for any + * given eCryptfs mount. This struct maintains a reference to both the + * key contents and the key itself so that the key can be put on + * unmount. + */ +struct ecryptfs_global_auth_tok { +#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 + u32 flags; + struct list_head mount_crypt_stat_list; + struct key *global_auth_tok_key; + unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/** + * ecryptfs_key_tfm - Persistent key tfm + * @key_tfm: crypto API handle to the key + * @key_size: Key size in bytes + * @key_tfm_mutex: Mutex to ensure only one operation in eCryptfs is + * using the persistent TFM at any point in time + * @key_tfm_list: Handle to hang this off the module-wide TFM list + * @cipher_name: String name for the cipher for this TFM + * + * Typically, eCryptfs will use the same ciphers repeatedly throughout + * the course of its operations. In order to avoid unnecessarily + * destroying and initializing the same cipher repeatedly, eCryptfs + * keeps a list of crypto API contexts around to use when needed. + */ +struct ecryptfs_key_tfm { + struct crypto_blkcipher *key_tfm; + size_t key_size; + struct mutex key_tfm_mutex; + struct list_head key_tfm_list; + unsigned char cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; +}; + +extern struct mutex key_tfm_list_mutex; + +/** + * This struct is to enable a mount-wide passphrase/salt combo. This + * is more or less a stopgap to provide similar functionality to other + * crypto filesystems like EncFS or CFS until full policy support is + * implemented in eCryptfs. + */ +struct ecryptfs_mount_crypt_stat { + /* Pointers to memory we do not own, do not free these */ +#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001 +#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 +#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 +#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 +#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 +#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 +#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 +#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080 + u32 flags; + struct list_head global_auth_tok_list; + struct mutex global_auth_tok_list_mutex; + size_t global_default_cipher_key_size; + size_t global_default_fn_cipher_key_bytes; + unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + + 1]; + unsigned char global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/* superblock private data. */ +struct ecryptfs_sb_info { + struct super_block *wsi_sb; + struct ecryptfs_mount_crypt_stat mount_crypt_stat; + struct backing_dev_info bdi; +}; + +/* file private data. 
*/ +struct ecryptfs_file_info { + struct file *wfi_file; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/* auth_tok <=> encrypted_session_key mappings */ +struct ecryptfs_auth_tok_list_item { + unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES]; + struct list_head list; + struct ecryptfs_auth_tok auth_tok; +}; + +struct ecryptfs_message { + /* Can never be greater than ecryptfs_message_buf_len */ + /* Used to find the parent msg_ctx */ + /* Inherits from msg_ctx->index */ + u32 index; + u32 data_len; + u8 data[]; +}; + +struct ecryptfs_msg_ctx { +#define ECRYPTFS_MSG_CTX_STATE_FREE 0x01 +#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x02 +#define ECRYPTFS_MSG_CTX_STATE_DONE 0x03 +#define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04 + u8 state; +#define ECRYPTFS_MSG_HELO 100 +#define ECRYPTFS_MSG_QUIT 101 +#define ECRYPTFS_MSG_REQUEST 102 +#define ECRYPTFS_MSG_RESPONSE 103 + u8 type; + u32 index; + /* Counter converts to a sequence number. Each message sent + * out for which we expect a response has an associated + * sequence number. The response must have the same sequence + * number as the counter for the msg_stc for the message to be + * valid. */ + u32 counter; + size_t msg_size; + struct ecryptfs_message *msg; + struct task_struct *task; + struct list_head node; + struct list_head daemon_out_list; + struct mutex mux; +}; + +struct ecryptfs_daemon; + +struct ecryptfs_daemon { +#define ECRYPTFS_DAEMON_IN_READ 0x00000001 +#define ECRYPTFS_DAEMON_IN_POLL 0x00000002 +#define ECRYPTFS_DAEMON_ZOMBIE 0x00000004 +#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 + u32 flags; + u32 num_queued_msg_ctx; + struct pid *pid; + uid_t euid; + struct user_namespace *user_ns; + struct task_struct *task; + struct mutex mux; + struct list_head msg_ctx_out_queue; + wait_queue_head_t wait; + struct hlist_node euid_chain; +}; + +extern struct mutex ecryptfs_daemon_hash_mux; + +static inline size_t +ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return 0; + return crypt_stat->metadata_size; +} + +static inline struct ecryptfs_file_info * +ecryptfs_file_to_private(struct file *file) +{ + return file->private_data; +} + +static inline void +ecryptfs_set_file_private(struct file *file, + struct ecryptfs_file_info *file_info) +{ + file->private_data = file_info; +} + +static inline struct file *ecryptfs_file_to_lower(struct file *file) +{ + return ((struct ecryptfs_file_info *)file->private_data)->wfi_file; +} + +static inline void +ecryptfs_set_file_lower(struct file *file, struct file *lower_file) +{ + ((struct ecryptfs_file_info *)file->private_data)->wfi_file = + lower_file; +} + +static inline struct ecryptfs_inode_info * +ecryptfs_inode_to_private(struct inode *inode) +{ + return container_of(inode, struct ecryptfs_inode_info, vfs_inode); +} + +static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode) +{ + return ecryptfs_inode_to_private(inode)->wii_inode; +} + +static inline void +ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode) +{ + ecryptfs_inode_to_private(inode)->wii_inode = lower_inode; +} + +static inline struct ecryptfs_sb_info * +ecryptfs_superblock_to_private(struct super_block *sb) +{ + return (struct ecryptfs_sb_info *)sb->s_fs_info; +} + +static inline void +ecryptfs_set_superblock_private(struct super_block *sb, + struct ecryptfs_sb_info *sb_info) +{ + sb->s_fs_info = sb_info; +} + +static inline struct super_block * +ecryptfs_superblock_to_lower(struct super_block *sb) +{ + return 
((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb; +} + +static inline void +ecryptfs_set_superblock_lower(struct super_block *sb, + struct super_block *lower_sb) +{ + ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; +} + +static inline struct ecryptfs_dentry_info * +ecryptfs_dentry_to_private(struct dentry *dentry) +{ + return (struct ecryptfs_dentry_info *)dentry->d_fsdata; +} + +static inline void +ecryptfs_set_dentry_private(struct dentry *dentry, + struct ecryptfs_dentry_info *dentry_info) +{ + dentry->d_fsdata = dentry_info; +} + +static inline struct dentry * +ecryptfs_dentry_to_lower(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; +} + +static inline void +ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry) +{ + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry = + lower_dentry; +} + +static inline struct vfsmount * +ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; +} + +static inline void +ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) +{ + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt = + lower_mnt; +} + +#define ecryptfs_printk(type, fmt, arg...) \ + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); +__attribute__ ((format(printf, 1, 2))) +void __ecryptfs_printk(const char *fmt, ...); + +extern const struct file_operations ecryptfs_main_fops; +extern const struct file_operations ecryptfs_dir_fops; +extern const struct inode_operations ecryptfs_main_iops; +extern const struct inode_operations ecryptfs_dir_iops; +extern const struct inode_operations ecryptfs_symlink_iops; +extern const struct super_operations ecryptfs_sops; +extern const struct dentry_operations ecryptfs_dops; +extern const struct address_space_operations ecryptfs_aops; +extern int ecryptfs_verbosity; +extern unsigned int ecryptfs_message_buf_len; +extern signed long ecryptfs_message_wait_timeout; +extern unsigned int ecryptfs_number_of_users; + +extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; +extern struct kmem_cache *ecryptfs_file_info_cache; +extern struct kmem_cache *ecryptfs_dentry_info_cache; +extern struct kmem_cache *ecryptfs_inode_info_cache; +extern struct kmem_cache *ecryptfs_sb_info_cache; +extern struct kmem_cache *ecryptfs_header_cache; +extern struct kmem_cache *ecryptfs_xattr_cache; +extern struct kmem_cache *ecryptfs_key_record_cache; +extern struct kmem_cache *ecryptfs_key_sig_cache; +extern struct kmem_cache *ecryptfs_global_auth_tok_cache; +extern struct kmem_cache *ecryptfs_key_tfm_cache; +extern struct kmem_cache *ecryptfs_open_req_cache; + +struct ecryptfs_open_req { +#define ECRYPTFS_REQ_PROCESSED 0x00000001 +#define ECRYPTFS_REQ_DROPPED 0x00000002 +#define ECRYPTFS_REQ_ZOMBIE 0x00000004 + u32 flags; + struct file **lower_file; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + wait_queue_head_t wait; + struct mutex mux; + struct list_head kthread_ctl_list; +}; + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb); +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); +int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct dentry *ecryptfs_dentry, + const char *name, size_t name_size); +int ecryptfs_fill_zeros(struct file *file, loff_t new_length); +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, 
+ size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size); +struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); +void ecryptfs_dump_hex(char *data, int bytes); +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size); +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_rotate_iv(unsigned char *iv); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destroy_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); +int ecryptfs_encrypt_page(struct page *page); +int ecryptfs_decrypt_page(struct page *page); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); +int ecryptfs_read_and_validate_header_region(struct inode *inode); +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode); +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); +int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + size_t *len, size_t max); +int +ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, struct dentry *ecryptfs_dentry); +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); +ssize_t +ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, + void *value, size_t size); +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq); +int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx); +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg); +int ecryptfs_init_messaging(void); +void ecryptfs_release_messaging(void); + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); +int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); +int +ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig, u32 global_auth_tok_flags); +int ecryptfs_get_global_auth_tok_for_sig( + struct ecryptfs_global_auth_tok **global_auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); +int +ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, + size_t key_size); +int ecryptfs_init_crypto(void); +int ecryptfs_destroy_crypto(void); +int 
ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, + struct mutex **tfm_mutex, + char *cipher_name); +int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + char *sig); +int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, + loff_t offset, size_t size); +int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + struct page *page_for_lower, + size_t offset_in_page, size_t size); +int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); +int ecryptfs_read_lower(char *data, loff_t offset, size_t size, + struct inode *ecryptfs_inode); +int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, + pgoff_t page_index, + size_t offset_in_page, size_t size, + struct inode *ecryptfs_inode); +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns); +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size); +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length); +int ecryptfs_init_ecryptfs_miscdev(void); +void ecryptfs_destroy_ecryptfs_miscdev(void); +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon); +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_init_kthread(void); +void ecryptfs_destroy_kthread(void); +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt, + const struct cred *cred); +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode); +void ecryptfs_put_lower_file(struct inode *inode); +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size); +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size); +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); + +#endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c new file mode 100644 index 00000000..0c1a6527 --- /dev/null +++ b/fs/ecryptfs/file.c @@ -0,0 +1,379 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_read_update_atime + * + * generic_file_read updates the atime of upper layer inode. But, it + * doesn't give us a chance to update the atime of the lower layer + * inode. This function is a wrapper to generic_file_read. It + * updates the atime of the lower level inode if generic_file_read + * returns without any errors. This is to be used only for file reads. + * The function to be used for directory reads is ecryptfs_read. + */ +static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t rc; + struct dentry *lower_dentry; + struct vfsmount *lower_vfsmount; + struct file *file = iocb->ki_filp; + + rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + /* + * Even though this is a async interface, we need to wait + * for IO to finish to update atime + */ + if (-EIOCBQUEUED == rc) + rc = wait_on_sync_kiocb(iocb); + if (rc >= 0) { + lower_dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); + lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); + touch_atime(lower_vfsmount, lower_dentry); + } + return rc; +} + +struct ecryptfs_getdents_callback { + void *dirent; + struct dentry *dentry; + filldir_t filldir; + int filldir_called; + int entries_written; +}; + +/* Inspired by generic filldir in fs/readdir.c */ +static int +ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ecryptfs_getdents_callback *buf = + (struct ecryptfs_getdents_callback *)dirent; + size_t name_size; + char *name; + int rc; + + buf->filldir_called++; + rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->dentry, lower_name, + lower_namelen); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and decrypt " + "filename [%s]; rc = [%d]\n", __func__, lower_name, + rc); + goto out; + } + rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + kfree(name); + if (rc >= 0) + buf->entries_written++; +out: + return rc; +} + +/** + * ecryptfs_readdir + * @file: The eCryptfs directory file + * @dirent: Directory entry handle + * @filldir: The filldir callback function + */ +static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int rc; + struct file *lower_file; + struct inode *inode; + struct ecryptfs_getdents_callback buf; + + lower_file = ecryptfs_file_to_lower(file); + lower_file->f_pos = file->f_pos; + inode = file->f_path.dentry->d_inode; + memset(&buf, 0, sizeof(buf)); + buf.dirent = dirent; + buf.dentry = file->f_path.dentry; + buf.filldir = filldir; + buf.filldir_called = 0; + buf.entries_written = 0; + rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); + file->f_pos = lower_file->f_pos; + if (rc < 0) + goto out; + if (buf.filldir_called && !buf.entries_written) + goto out; + if (rc >= 0) + fsstack_copy_attr_atime(inode, + lower_file->f_path.dentry->d_inode); +out: + return rc; +} + +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops 
= { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + +struct kmem_cache *ecryptfs_file_info_cache; + +/** + * ecryptfs_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_open(struct inode *inode, struct file *file) +{ + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct dentry *lower_dentry; + struct ecryptfs_file_info *file_info; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) + || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) + || (file->f_flags & O_APPEND))) { + printk(KERN_WARNING "Mount has encrypted view enabled; " + "files may only be read\n"); + rc = -EPERM; + goto out; + } + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (!file_info) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mutex_lock(&crypt_stat->cs_mutex); + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { + ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n"); + /* Policy code enabled in future release */ + crypt_stat->flags |= (ECRYPTFS_POLICY_APPLIED + | ECRYPTFS_ENCRYPTED); + } + mutex_unlock(&crypt_stat->cs_mutex); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free; + } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) + == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out_put; + } + ecryptfs_set_file_lower( + file, ecryptfs_inode_to_private(inode)->lower_file); + if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + mutex_lock(&crypt_stat->cs_mutex); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + rc = 0; + goto out; + } + mutex_lock(&crypt_stat->cs_mutex); + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + rc = ecryptfs_read_metadata(ecryptfs_dentry); + if (rc) { + ecryptfs_printk(KERN_DEBUG, + "Valid headers not found\n"); + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Either the lower file " + "is not in a valid eCryptfs format, " + "or the key could not be retrieved. 
" + "Plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out_put; + } + rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + } + mutex_unlock(&crypt_stat->cs_mutex); + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned long long)i_size_read(inode)); + goto out; +out_put: + ecryptfs_put_lower_file(inode); +out_free: + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); +out: + return rc; +} + +static int ecryptfs_flush(struct file *file, fl_owner_t td) +{ + return file->f_mode & FMODE_WRITE + ? filemap_write_and_wait(file->f_mapping) : 0; +} + +static int ecryptfs_release(struct inode *inode, struct file *file) +{ + ecryptfs_put_lower_file(inode); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static int +ecryptfs_fsync(struct file *file, int datasync) +{ + int rc = 0; + + rc = generic_file_fsync(file, datasync); + if (rc) + goto out; + rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync); +out: + return rc; +} + +static int ecryptfs_fasync(int fd, struct file *file, int flag) +{ + int rc = 0; + struct file *lower_file = NULL; + + lower_file = ecryptfs_file_to_lower(file); + if (lower_file->f_op && lower_file->f_op->fasync) + rc = lower_file->f_op->fasync(fd, lower_file, flag); + return rc; +} + +static long +ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOTTY; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl) + rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + return rc; +} + +#ifdef CONFIG_COMPAT +static long +ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOIOCTLCMD; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl) + rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + return rc; +} +#endif + +const struct file_operations ecryptfs_dir_fops = { + .readdir = ecryptfs_readdir, + .read = generic_read_dir, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .splice_read = generic_file_splice_read, + .llseek = default_llseek, +}; + +const struct file_operations ecryptfs_main_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = ecryptfs_read_update_atime, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .readdir = ecryptfs_readdir, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif + .mmap = ecryptfs_file_mmap, + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .splice_read = generic_file_splice_read, +}; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c new file mode 100644 index 00000000..27173293 --- /dev/null +++ b/fs/ecryptfs/inode.c @@ -0,0 +1,1228 @@ +/** + * eCryptfs: Linux 
filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompsion + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *dir; + + dir = dget_parent(dentry); + mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT); + return dir; +} + +static void unlock_dir(struct dentry *dir) +{ + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); +} + +static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) +{ + if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) + return 1; + return 0; +} + +static int ecryptfs_inode_set(struct inode *inode, void *opaque) +{ + struct inode *lower_inode = opaque; + + ecryptfs_set_inode_lower(inode, lower_inode); + fsstack_copy_attr_all(inode, lower_inode); + /* i_size will be overwritten for encrypted regular files */ + fsstack_copy_inode_size(inode, lower_inode); + inode->i_ino = lower_inode->i_ino; + inode->i_version++; + inode->i_mapping->a_ops = &ecryptfs_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; + + if (S_ISLNK(inode->i_mode)) + inode->i_op = &ecryptfs_symlink_iops; + else if (S_ISDIR(inode->i_mode)) + inode->i_op = &ecryptfs_dir_iops; + else + inode->i_op = &ecryptfs_main_iops; + + if (S_ISDIR(inode->i_mode)) + inode->i_fop = &ecryptfs_dir_fops; + else if (special_file(inode->i_mode)) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + else + inode->i_fop = &ecryptfs_main_fops; + + return 0; +} + +static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode; + + if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) + return ERR_PTR(-EXDEV); + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); + inode = iget5_locked(sb, (unsigned long)lower_inode, + ecryptfs_inode_test, ecryptfs_inode_set, + lower_inode); + if (!inode) { + iput(lower_inode); + return ERR_PTR(-EACCES); + } + if (!(inode->i_state & I_NEW)) + iput(lower_inode); + + return inode; +} + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); + + if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + + return inode; +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * + * Interposes upper and lower dentries. 
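+ * "Interposing" means binding an eCryptfs inode to the eCryptfs
+ * dentry: the inode is obtained (or created) from the lower inode
+ * via ecryptfs_get_inode() and attached with d_instantiate(), so
+ * that VFS operations on the upper dentry are translated into
+ * operations on the lower dentry and inode.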
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_interpose(struct dentry *lower_dentry, + struct dentry *dentry, struct super_block *sb) +{ + struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + + return 0; +} + +/** + * ecryptfs_create_underlying_file + * @lower_dir_inode: inode of the parent in the lower fs of the new file + * @dentry: New file's dentry + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the file in the lower file system. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create_underlying_file(struct inode *lower_dir_inode, + struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *dentry_save; + struct vfsmount *vfsmount_save; + unsigned int flags_save; + int rc; + + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + flags_save = nd->flags; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + nd->flags &= ~LOOKUP_OPEN; + } + rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + nd->flags = flags_save; + } + return rc; +} + +/** + * ecryptfs_do_create + * @directory_inode: inode of the new file's dentry's parent in ecryptfs + * @ecryptfs_dentry: New file's dentry in ecryptfs + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the underlying file and the eCryptfs inode which will link to + * it. It will also update the eCryptfs directory inode to mimic the + * stat of the lower directory inode. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_do_create(struct inode *directory_inode, + struct dentry *ecryptfs_dentry, int mode, + struct nameidata *nd) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + if (IS_ERR(lower_dir_dentry)) { + ecryptfs_printk(KERN_ERR, "Error locking directory of " + "dentry\n"); + rc = PTR_ERR(lower_dir_dentry); + goto out; + } + rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, + ecryptfs_dentry, mode, nd); + if (rc) { + printk(KERN_ERR "%s: Failure to create dentry in lower fs; " + "rc = [%d]\n", __func__, rc); + goto out_lock; + } + rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, + directory_inode->i_sb); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + goto out_lock; + } + fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); +out_lock: + unlock_dir(lower_dir_dentry); +out: + return rc; +} + +/** + * ecryptfs_initialize_file + * + * Cause the file to be changed from a basic empty file to an ecryptfs + * file with a header and first data page. 
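+ * Concretely: a new crypto context is generated for the inode, the
+ * lower file is grabbed, and the eCryptfs metadata header (crypto
+ * parameters plus the encrypted file encryption key) is written out
+ * via ecryptfs_write_metadata(). Directories are skipped and simply
+ * marked as not encrypted.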
+ * + * Returns zero on success + */ +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + int rc = 0; + + if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); + rc = ecryptfs_new_file_context(ecryptfs_dentry); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error creating new file " + "context; rc = [%d]\n", rc); + goto out; + } + rc = ecryptfs_get_lower_file(ecryptfs_dentry, + ecryptfs_dentry->d_inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + rc = ecryptfs_write_metadata(ecryptfs_dentry); + if (rc) + printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); + ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); +out: + return rc; +} + +/** + * ecryptfs_create + * @dir: The inode of the directory in which to create the file. + * @dentry: The eCryptfs dentry + * @mode: The mode of the new file. + * @nd: nameidata + * + * Creates a new file. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, + int mode, struct nameidata *nd) +{ + int rc; + + /* ecryptfs_do_create() calls ecryptfs_interpose() */ + rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); + if (unlikely(rc)) { + ecryptfs_printk(KERN_WARNING, "Failed to create file in" + "lower filesystem\n"); + goto out; + } + /* At this point, a file exists on "disk"; we need to make sure + * that this on disk file is prepared to be an ecryptfs file */ + rc = ecryptfs_initialize_file(ecryptfs_dentry); +out: + return rc; +} + +static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + int rc; + + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + return rc; + } + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + + rc = ecryptfs_read_and_validate_header_region(inode); + ecryptfs_put_lower_file(inode); + if (rc) { + rc = ecryptfs_read_and_validate_xattr_region(dentry, inode); + if (!rc) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } + + /* Must return 0 to allow non-eCryptfs files to be looked up, too */ + return 0; +} + +/** + * ecryptfs_lookup_interpose - Dentry interposition for a lookup + */ +static int ecryptfs_lookup_interpose(struct dentry *dentry, + struct dentry *lower_dentry, + struct inode *dir_inode) +{ + struct inode *inode, *lower_inode = lower_dentry->d_inode; + struct ecryptfs_dentry_info *dentry_info; + struct vfsmount *lower_mnt; + int rc = 0; + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + BUG_ON(!lower_dentry->d_count); + + dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + ecryptfs_set_dentry_private(dentry, dentry_info); + if (!dentry_info) { + 
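+ /*
+  * Allocation failed: log it, drop the dentry and mount references
+  * taken above, unhash the eCryptfs dentry, and bail out with
+  * -ENOMEM.
+  */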
printk(KERN_ERR "%s: Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n", + __func__); + dput(lower_dentry); + mntput(lower_mnt); + d_drop(dentry); + return -ENOMEM; + } + ecryptfs_set_dentry_lower(dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + + if (!lower_dentry->d_inode) { + /* We want to add because we couldn't find in lower */ + d_add(dentry, NULL); + return 0; + } + inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb); + if (IS_ERR(inode)) { + printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n", + __func__, PTR_ERR(inode)); + return PTR_ERR(inode); + } + if (S_ISREG(inode->i_mode)) { + rc = ecryptfs_i_size_read(dentry, inode); + if (rc) { + make_bad_inode(inode); + return rc; + } + } + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + d_add(dentry, inode); + + return rc; +} + +/** + * ecryptfs_lookup + * @ecryptfs_dir_inode: The eCryptfs directory inode + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @ecryptfs_nd: nameidata; may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. + */ +static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, + struct dentry *ecryptfs_dentry, + struct nameidata *ecryptfs_nd) +{ + char *encrypted_and_encoded_name = NULL; + size_t encrypted_and_encoded_name_size; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct dentry *lower_dir_dentry, *lower_dentry; + int rc = 0; + + if ((ecryptfs_dentry->d_name.len == 1 + && !strcmp(ecryptfs_dentry->d_name.name, ".")) + || (ecryptfs_dentry->d_name.len == 2 + && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { + goto out_d_drop; + } + lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } + if (lower_dentry->d_inode) + goto interpose; + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + goto interpose; + dput(lower_dentry); + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + goto out_d_drop; + } + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } +interpose: + rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, + ecryptfs_dir_inode); + goto out; +out_d_drop: + d_drop(ecryptfs_dentry); +out: + kfree(encrypted_and_encoded_name); + return 
ERR_PTR(rc); +} + +static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_dir_dentry; + u64 file_size_save; + int rc; + + file_size_save = i_size_read(old_dentry->d_inode); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_dir_dentry = lock_parent(lower_new_dentry); + rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + lower_new_dentry); + if (rc || !lower_new_dentry->d_inode) + goto out_lock; + rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); + if (rc) + goto out_lock; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + old_dentry->d_inode->i_nlink = + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + i_size_write(new_dentry->d_inode, file_size_save); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); + struct dentry *lower_dir_dentry; + + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry); + if (rc) { + printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); + goto out_unlock; + } + fsstack_copy_attr_times(dir, lower_dir_inode); + dentry->d_inode->i_nlink = + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + dentry->d_inode->i_ctime = dir->i_ctime; + d_drop(dentry); +out_unlock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + return rc; +} + +static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + char *encoded_symname; + size_t encoded_symlen; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + mount_crypt_stat = &ecryptfs_superblock_to_private( + dir->i_sb)->mount_crypt_stat; + rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, + &encoded_symlen, + NULL, + mount_crypt_stat, symname, + strlen(symname)); + if (rc) + goto out_lock; + rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, + encoded_symname); + kfree(encoded_symname); + if (rc || !lower_dentry->d_inode) + goto out_lock; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out_lock; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); + if (rc || !lower_dentry->d_inode) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + 
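+ /*
+  * Likewise keep the eCryptfs directory's size and link count in
+  * sync with the lower directory (its timestamps were copied just
+  * above), since vfs_mkdir() has changed all three.
+  */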
fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; +out: + unlock_dir(lower_dir_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + int rc; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + dget(lower_dentry); + rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + dput(lower_dentry); + if (!rc && dentry->d_inode) + clear_nlink(dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + unlock_dir(lower_dir_dentry); + if (!rc) + d_drop(dentry); + dput(dentry); + return rc; +} + +static int +ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); + if (rc || !lower_dentry->d_inode) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); +out: + unlock_dir(lower_dir_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int +ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int rc; + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_old_dir_dentry; + struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; + + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_old_dir_dentry = dget_parent(lower_old_dentry); + lower_new_dir_dentry = dget_parent(lower_new_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + rc = -EINVAL; + goto out_lock; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + rc = -ENOTEMPTY; + goto out_lock; + } + rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, + lower_new_dir_dentry->d_inode, lower_new_dentry); + if (rc) + goto out_lock; + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + if (new_dir != old_dir) + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); +out_lock: + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dput(lower_new_dir_dentry); + dput(lower_old_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, + size_t *bufsiz) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + char *lower_buf; + size_t lower_bufsiz = PATH_MAX; + mm_segment_t old_fs; + int rc; + + lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); + if (!lower_buf) { + rc = -ENOMEM; + goto out; + } + old_fs = get_fs(); + set_fs(get_ds()); + rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, + (char __user *)lower_buf, + lower_bufsiz); + set_fs(old_fs); + if (rc < 0) + goto out; + lower_bufsiz = rc; + rc = ecryptfs_decode_and_decrypt_filename(buf, 
bufsiz, dentry, + lower_buf, lower_bufsiz); +out: + kfree(lower_buf); + return rc; +} + +static int +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + char *kbuf; + size_t kbufsiz, copied; + int rc; + + rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz); + if (rc) + goto out; + copied = min_t(size_t, bufsiz, kbufsiz); + rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied; + kfree(kbuf); + fsstack_copy_attr_atime(dentry->d_inode, + ecryptfs_dentry_to_lower(dentry)->d_inode); +out: + return rc; +} + +static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *buf; + int len = PAGE_SIZE, rc; + mm_segment_t old_fs; + + /* Released in ecryptfs_put_link(); only release here on error */ + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + buf = ERR_PTR(-ENOMEM); + goto out; + } + old_fs = get_fs(); + set_fs(get_ds()); + rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); + set_fs(old_fs); + if (rc < 0) { + kfree(buf); + buf = ERR_PTR(rc); + } else + buf[rc] = '\0'; +out: + nd_set_link(nd, buf); + return NULL; +} + +static void +ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) +{ + char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) { + /* Free the char* */ + kfree(buf); + } +} + +/** + * upper_size_to_lower_size + * @crypt_stat: Crypt_stat associated with file + * @upper_size: Size of the upper file + * + * Calculate the required size of the lower file based on the + * specified size of the upper file. This calculation is based on the + * number of headers in the underlying file and the extent size. + * + * Returns Calculated size of the lower file. + */ +static loff_t +upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, + loff_t upper_size) +{ + loff_t lower_size; + + lower_size = ecryptfs_lower_header_size(crypt_stat); + if (upper_size != 0) { + loff_t num_extents; + + num_extents = upper_size >> crypt_stat->extent_shift; + if (upper_size & ~crypt_stat->extent_mask) + num_extents++; + lower_size += (num_extents * crypt_stat->extent_size); + } + return lower_size; +} + +/** + * truncate_upper + * @dentry: The ecryptfs layer dentry + * @ia: Address of the ecryptfs inode's attributes + * @lower_ia: Address of the lower inode's attributes + * + * Function to handle truncations modifying the size of the file. Note + * that the file sizes are interpolated. When expanding, we are simply + * writing strings of 0's out. When truncating, we truncate the upper + * inode and update the lower_ia according to the page index + * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return, + * the caller must use lower_ia in a call to notify_change() to perform + * the truncation of the lower inode. 
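+ * In short: when growing, a single zero byte is written at the new
+ * last position, which makes the write path fill in everything
+ * between the old and new end of file; when shrinking, the tail of
+ * the last remaining page is zeroed, the upper i_size is cut with
+ * truncate_setsize(), the new size is written into the metadata, and
+ * lower_ia gets ATTR_SIZE only if the lower file itself must shrink.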
+ * + * Returns zero on success; non-zero otherwise + */ +static int truncate_upper(struct dentry *dentry, struct iattr *ia, + struct iattr *lower_ia) +{ + int rc = 0; + struct inode *inode = dentry->d_inode; + struct ecryptfs_crypt_stat *crypt_stat; + loff_t i_size = i_size_read(inode); + loff_t lower_size_before_truncate; + loff_t lower_size_after_truncate; + + if (unlikely((ia->ia_size == i_size))) { + lower_ia->ia_valid &= ~ATTR_SIZE; + return 0; + } + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) + return rc; + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + /* Switch on growing or shrinking file */ + if (ia->ia_size > i_size) { + char zero[] = { 0x00 }; + + lower_ia->ia_valid &= ~ATTR_SIZE; + /* Write a single 0 at the last position of the file; + * this triggers code that will fill in 0's throughout + * the intermediate portion of the previous end of the + * file and the new and of the file */ + rc = ecryptfs_write(inode, zero, + (ia->ia_size - 1), 1); + } else { /* ia->ia_size < i_size_read(inode) */ + /* We're chopping off all the pages down to the page + * in which ia->ia_size is located. Fill in the end of + * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to + * PAGE_CACHE_SIZE with zeros. */ + size_t num_zeros = (PAGE_CACHE_SIZE + - (ia->ia_size & ~PAGE_CACHE_MASK)); + + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + truncate_setsize(inode, ia->ia_size); + lower_ia->ia_size = ia->ia_size; + lower_ia->ia_valid |= ATTR_SIZE; + goto out; + } + if (num_zeros) { + char *zeros_virt; + + zeros_virt = kzalloc(num_zeros, GFP_KERNEL); + if (!zeros_virt) { + rc = -ENOMEM; + goto out; + } + rc = ecryptfs_write(inode, zeros_virt, + ia->ia_size, num_zeros); + kfree(zeros_virt); + if (rc) { + printk(KERN_ERR "Error attempting to zero out " + "the remainder of the end page on " + "reducing truncate; rc = [%d]\n", rc); + goto out; + } + } + truncate_setsize(inode, ia->ia_size); + rc = ecryptfs_write_inode_size_to_metadata(inode); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out; + } + /* We are reducing the size of the ecryptfs file, and need to + * know if we need to reduce the size of the lower file. */ + lower_size_before_truncate = + upper_size_to_lower_size(crypt_stat, i_size); + lower_size_after_truncate = + upper_size_to_lower_size(crypt_stat, ia->ia_size); + if (lower_size_after_truncate < lower_size_before_truncate) { + lower_ia->ia_size = lower_size_after_truncate; + lower_ia->ia_valid |= ATTR_SIZE; + } else + lower_ia->ia_valid &= ~ATTR_SIZE; + } +out: + ecryptfs_put_lower_file(inode); + return rc; +} + +static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset) +{ + struct ecryptfs_crypt_stat *crypt_stat; + loff_t lower_oldsize, lower_newsize; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + lower_oldsize = upper_size_to_lower_size(crypt_stat, + i_size_read(inode)); + lower_newsize = upper_size_to_lower_size(crypt_stat, offset); + if (lower_newsize > lower_oldsize) { + /* + * The eCryptfs inode and the new *lower* size are mixed here + * because we may not have the lower i_mutex held and/or it may + * not be appropriate to call inode_newsize_ok() with inodes + * from other filesystems. 
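+ * The check is still meaningful: inode_newsize_ok() enforces the
+ * caller's RLIMIT_FSIZE and the superblock's s_maxbytes, and the
+ * lower size (upper size plus header and extent rounding) is at
+ * least as large as the upper size, so passing it is the
+ * conservative test.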
+ */ + return inode_newsize_ok(inode, lower_newsize); + } + + return 0; +} + +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Simple function that handles the truncation of an eCryptfs inode and + * its corresponding lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length }; + struct iattr lower_ia = { .ia_valid = 0 }; + int rc; + + rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + if (rc) + return rc; + + rc = truncate_upper(dentry, &ia, &lower_ia); + if (!rc && lower_ia.ia_valid & ATTR_SIZE) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); + } + return rc; +} + +static int +ecryptfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return inode_permission(ecryptfs_inode_to_lower(inode), mask); +} + +/** + * ecryptfs_setattr + * @dentry: dentry handle to the inode to modify + * @ia: Structure with flags of what to change and values + * + * Updates the metadata of an inode. If the update is to the size + * i.e. truncation, then ecryptfs_truncate will handle the size modification + * of both the ecryptfs inode and the lower inode. + * + * All other metadata changes will be passed right to the lower filesystem, + * and we will just update our inode to look like the lower. + */ +static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int rc = 0; + struct dentry *lower_dentry; + struct iattr lower_ia; + struct inode *inode; + struct inode *lower_inode; + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); + inode = dentry->d_inode; + lower_inode = ecryptfs_inode_to_lower(inode); + lower_dentry = ecryptfs_dentry_to_lower(dentry); + mutex_lock(&crypt_stat->cs_mutex); + if (S_ISDIR(dentry->d_inode->i_mode)) + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + else if (S_ISREG(dentry->d_inode->i_mode) + && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + rc = ecryptfs_read_metadata(dentry); + ecryptfs_put_lower_file(inode); + if (rc) { + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Either the lower file " + "is not in a valid eCryptfs format, " + "or the key could not be retrieved. 
" + "Plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + } + } + mutex_unlock(&crypt_stat->cs_mutex); + + rc = inode_change_ok(inode, ia); + if (rc) + goto out; + if (ia->ia_valid & ATTR_SIZE) { + rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size); + if (rc) + goto out; + } + + if (S_ISREG(inode->i_mode)) { + rc = filemap_write_and_wait(inode->i_mapping); + if (rc) + goto out; + fsstack_copy_attr_all(inode, lower_inode); + } + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); + if (ia->ia_valid & ATTR_SIZE) { + rc = truncate_upper(dentry, ia, &lower_ia); + if (rc < 0) + goto out; + } + + /* + * mode change is for clearing setuid/setgid bits. Allow lower fs + * to interpret this in its own way. + */ + if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + lower_ia.ia_valid &= ~ATTR_MODE; + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + fsstack_copy_attr_all(inode, lower_inode); + return rc; +} + +int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + int rc = 0; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + generic_fillattr(dentry->d_inode, stat); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + char *target; + size_t targetsiz; + + rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); + if (!rc) { + kfree(target); + stat->size = targetsiz; + } + } + return rc; +} + +int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), + ecryptfs_dentry_to_lower(dentry), &lower_stat); + if (!rc) { + fsstack_copy_attr_all(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)); + generic_fillattr(dentry->d_inode, stat); + stat->blocks = lower_stat.blocks; + } + return rc; +} + +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->setxattr) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); + if (!rc) + fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); +out: + return rc; +} + +ssize_t +ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, + void *value, size_t size) +{ + int rc = 0; + + if (!lower_dentry->d_inode->i_op->getxattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value, + size); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static ssize_t +ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name, + value, size); +} + +static ssize_t +ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->listxattr) { + 
rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static int ecryptfs_removexattr(struct dentry *dentry, const char *name) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->removexattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +const struct inode_operations ecryptfs_symlink_iops = { + .readlink = ecryptfs_readlink, + .follow_link = ecryptfs_follow_link, + .put_link = ecryptfs_put_link, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr_link, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +const struct inode_operations ecryptfs_dir_iops = { + .create = ecryptfs_create, + .lookup = ecryptfs_lookup, + .link = ecryptfs_link, + .unlink = ecryptfs_unlink, + .symlink = ecryptfs_symlink, + .mkdir = ecryptfs_mkdir, + .rmdir = ecryptfs_rmdir, + .mknod = ecryptfs_mknod, + .rename = ecryptfs_rename, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +const struct inode_operations ecryptfs_main_iops = { + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c new file mode 100644 index 00000000..89dc18e7 --- /dev/null +++ b/fs/ecryptfs/keystore.c @@ -0,0 +1,2530 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * In-kernel key management code. Includes functions to parse and + * write authentication token-related packets with the underlying + * file. + * + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * Trevor S. Highland + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * request_key returned an error instead of a valid key address; + * determine the type of error, make appropriate log entries, and + * return an error code. 
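+ * The mapping used below is: -ENOKEY becomes -ENOENT, -EKEYEXPIRED
+ * becomes -ETIME, and -EKEYREVOKED or anything unrecognized becomes
+ * -EINVAL.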
+ */ +static int process_request_key_err(long err_code) +{ + int rc = 0; + + switch (err_code) { + case -ENOKEY: + ecryptfs_printk(KERN_WARNING, "No key\n"); + rc = -ENOENT; + break; + case -EKEYEXPIRED: + ecryptfs_printk(KERN_WARNING, "Key expired\n"); + rc = -ETIME; + break; + case -EKEYREVOKED: + ecryptfs_printk(KERN_WARNING, "Key revoked\n"); + rc = -EINVAL; + break; + default: + ecryptfs_printk(KERN_WARNING, "Unknown error code: " + "[0x%.16lx]\n", err_code); + rc = -EINVAL; + } + return rc; +} + +static int process_find_global_auth_tok_for_sig_err(int err_code) +{ + int rc = err_code; + + switch (err_code) { + case -ENOENT: + ecryptfs_printk(KERN_WARNING, "Missing auth tok\n"); + break; + case -EINVAL: + ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n"); + break; + default: + rc = process_request_key_err(err_code); + break; + } + return rc; +} + +/** + * ecryptfs_parse_packet_length + * @data: Pointer to memory containing length at offset + * @size: This function writes the decoded size to this memory + * address; zero on error + * @length_size: The number of bytes occupied by the encoded length + * + * Returns zero on success; non-zero on error + */ +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size) +{ + int rc = 0; + + (*length_size) = 0; + (*size) = 0; + if (data[0] < 192) { + /* One-byte length */ + (*size) = (unsigned char)data[0]; + (*length_size) = 1; + } else if (data[0] < 224) { + /* Two-byte length */ + (*size) = (((unsigned char)(data[0]) - 192) * 256); + (*size) += ((unsigned char)(data[1]) + 192); + (*length_size) = 2; + } else if (data[0] == 255) { + /* Five-byte length; we're not supposed to see this */ + ecryptfs_printk(KERN_ERR, "Five-byte packet length not " + "supported\n"); + rc = -EINVAL; + goto out; + } else { + ecryptfs_printk(KERN_ERR, "Error parsing packet length\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_write_packet_length + * @dest: The byte array target into which to write the length. Must + * have at least 5 bytes allocated. + * @size: The length to write. + * @packet_size_length: The number of bytes used to encode the packet + * length is written to this address. + * + * Returns zero on success; non-zero on error. 
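+ * The encoding mirrors ecryptfs_parse_packet_length() above: a size
+ * below 192 occupies one byte; a size from 192 to 65535 occupies two
+ * bytes, dest[0] = ((size - 192) / 256) + 192 and
+ * dest[1] = (size - 192) % 256. For example, a size of 1000 is
+ * written as the bytes 195 and 40, and the parser recovers it as
+ * (195 - 192) * 256 + (40 + 192) = 1000.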
+ */ +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length) +{ + int rc = 0; + + if (size < 192) { + dest[0] = size; + (*packet_size_length) = 1; + } else if (size < 65536) { + dest[0] = (((size - 192) / 256) + 192); + dest[1] = ((size - 192) % 256); + (*packet_size_length) = 2; + } else { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, + "Unsupported packet size: [%zd]\n", size); + } + return rc; +} + +static int +write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, + char **packet, size_t *packet_len) +{ + size_t i = 0; + size_t data_len; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 64 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + + session_key->encrypted_key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + rc = ecryptfs_write_packet_length(&message[i], + session_key->encrypted_key_size, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], session_key->encrypted_key, + session_key->encrypted_key_size); + i += session_key->encrypted_key_size; + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t m_size; + size_t message_len; + u16 checksum = 0; + u16 expected_checksum = 0; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + if (message_len < 4) { + rc = -EIO; + goto out; + } + if (data[i++] != ECRYPTFS_TAG_65_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_65\n"); + rc = -EIO; + goto out; + } + if (data[i++]) { + ecryptfs_printk(KERN_ERR, "Status indicator has non-zero value " + "[%d]\n", data[i-1]); + rc = -EIO; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[i], &m_size, &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + m_size)) { + ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd " + "is shorter than expected\n"); + rc = -EIO; + goto out; + } + if (m_size < 3) { + ecryptfs_printk(KERN_ERR, + "The decrypted key is not long enough to " + "include a cipher code and checksum\n"); + rc = -EIO; + goto out; + } + *cipher_code = data[i++]; + /* The decrypted key includes 1 byte cipher code and 2 byte checksum */ + session_key->decrypted_key_size = m_size - 3; + if 
(session_key->decrypted_key_size > ECRYPTFS_MAX_KEY_BYTES) { + ecryptfs_printk(KERN_ERR, "key_size [%d] larger than " + "the maximum key size [%d]\n", + session_key->decrypted_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + rc = -EIO; + goto out; + } + memcpy(session_key->decrypted_key, &data[i], + session_key->decrypted_key_size); + i += session_key->decrypted_key_size; + expected_checksum += (unsigned char)(data[i++]) << 8; + expected_checksum += (unsigned char)(data[i++]); + for (i = 0; i < session_key->decrypted_key_size; i++) + checksum += session_key->decrypted_key[i]; + if (expected_checksum != checksum) { + ecryptfs_printk(KERN_ERR, "Invalid checksum for file " + "encryption key; expected [%x]; calculated " + "[%x]\n", expected_checksum, checksum); + rc = -EIO; + } +out: + return rc; +} + + +static int +write_tag_66_packet(char *signature, u8 cipher_code, + struct ecryptfs_crypt_stat *crypt_stat, char **packet, + size_t *packet_len) +{ + size_t i = 0; + size_t j; + size_t data_len; + size_t checksum = 0; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 66 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + /* The encrypted key includes 1 byte cipher code and 2 byte checksum */ + rc = ecryptfs_write_packet_length(&message[i], crypt_stat->key_size + 3, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + message[i++] = cipher_code; + memcpy(&message[i], crypt_stat->key, crypt_stat->key_size); + i += crypt_stat->key_size; + for (j = 0; j < crypt_stat->key_size; j++) + checksum += crypt_stat->key[j]; + message[i++] = (checksum / 256) % 256; + message[i++] = (checksum % 256); + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_67_packet(struct ecryptfs_key_record *key_rec, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t message_len; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + /* verify that everything through the encrypted FEK size is present */ + if (message_len < 4) { + rc = -EIO; + printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " + "message length is [%d]\n", __func__, message_len, 4); + goto out; + } + if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) { + rc = -EIO; + printk(KERN_ERR "%s: Type should be ECRYPTFS_TAG_67\n", + __func__); + goto out; + } + if (data[i++]) { + rc = -EIO; + printk(KERN_ERR "%s: Status indicator has non zero " + "value 
[%d]\n", __func__, data[i-1]); + + goto out; + } + rc = ecryptfs_parse_packet_length(&data[i], &key_rec->enc_key_size, + &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + key_rec->enc_key_size)) { + rc = -EIO; + printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", + __func__, message_len, (i + key_rec->enc_key_size)); + goto out; + } + if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + rc = -EIO; + printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " + "the maximum key size [%d]\n", __func__, + key_rec->enc_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + goto out; + } + memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size); +out: + return rc; +} + +/** + * ecryptfs_verify_version + * @version: The version number to confirm + * + * Returns zero on good version; non-zero otherwise + */ +static int ecryptfs_verify_version(u16 version) +{ + int rc = 0; + unsigned char major; + unsigned char minor; + + major = ((version >> 8) & 0xFF); + minor = (version & 0xFF); + if (major != ECRYPTFS_VERSION_MAJOR) { + ecryptfs_printk(KERN_ERR, "Major version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MAJOR, major); + rc = -EINVAL; + goto out; + } + if (minor != ECRYPTFS_VERSION_MINOR) { + ecryptfs_printk(KERN_ERR, "Minor version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MINOR, minor); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_verify_auth_tok_from_key + * @auth_tok_key: key containing the authentication token + * @auth_tok: authentication token + * + * Returns zero on valid auth tok; -EINVAL otherwise + */ +static int +ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok **auth_tok) +{ + int rc = 0; + + (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key); + if (ecryptfs_verify_version((*auth_tok)->version)) { + printk(KERN_ERR "Data structure version mismatch. 
Userspace " + "tools must match eCryptfs kernel module with major " + "version [%d] and minor version [%d]\n", + ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR); + rc = -EINVAL; + goto out; + } + if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD + && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { + printk(KERN_ERR "Invalid auth_tok structure " + "returned from key query\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +static int +ecryptfs_find_global_auth_tok_for_sig( + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) +{ + struct ecryptfs_global_auth_tok *walker; + int rc = 0; + + (*auth_tok_key) = NULL; + (*auth_tok) = NULL; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX)) + continue; + + if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) { + rc = -EINVAL; + goto out; + } + + rc = key_validate(walker->global_auth_tok_key); + if (rc) { + if (rc == -EKEYEXPIRED) + goto out; + goto out_invalid_auth_tok; + } + + down_write(&(walker->global_auth_tok_key->sem)); + rc = ecryptfs_verify_auth_tok_from_key( + walker->global_auth_tok_key, auth_tok); + if (rc) + goto out_invalid_auth_tok_unlock; + + (*auth_tok_key) = walker->global_auth_tok_key; + key_get(*auth_tok_key); + goto out; + } + rc = -ENOENT; + goto out; +out_invalid_auth_tok_unlock: + up_write(&(walker->global_auth_tok_key->sem)); +out_invalid_auth_tok: + printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig); + walker->flags |= ECRYPTFS_AUTH_TOK_INVALID; + key_put(walker->global_auth_tok_key); + walker->global_auth_tok_key = NULL; +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + return rc; +} + +/** + * ecryptfs_find_auth_tok_for_sig + * @auth_tok: Set to the matching auth_tok; NULL if not found + * @crypt_stat: inode crypt_stat crypto context + * @sig: Sig of auth_tok to find + * + * For now, this function simply looks at the registered auth_tok's + * linked off the mount_crypt_stat, so all the auth_toks that can be + * used must be registered at mount time. This function could + * potentially try a lot harder to find auth_tok's (e.g., by calling + * out to ecryptfsd to dynamically retrieve an auth_tok object) so + * that static registration of auth_tok's will no longer be necessary. + * + * Returns zero on no error; non-zero on error + */ +static int +ecryptfs_find_auth_tok_for_sig( + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig) +{ + int rc = 0; + + rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok, + mount_crypt_stat, sig); + if (rc == -ENOENT) { + /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the + * mount_crypt_stat structure, we prevent to use auth toks that + * are not inserted through the ecryptfs_add_global_auth_tok + * function. + */ + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + return -EINVAL; + + rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, + sig); + } + return rc; +} + +/** + * write_tag_70_packet can gobble a lot of stack space. We stuff most + * of the function's parameters in a kmalloc'd struct to help reduce + * eCryptfs' overall stack usage. 
+ */ +struct ecryptfs_write_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + size_t j; + size_t num_rand_bytes; + struct mutex *tfm_mutex; + char *block_aligned_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; + struct blkcipher_desc desc; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + struct hash_desc hash_desc; + struct scatterlist hash_sg; +}; + +/** + * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK + * @filename: NULL-terminated filename string + * + * This is the simplest mechanism for achieving filename encryption in + * eCryptfs. It encrypts the given filename with the mount-wide + * filename encryption key (FNEK) and stores it in a packet to @dest, + * which the callee will encode and write directly into the dentry + * name. + */ +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size) +{ + struct ecryptfs_write_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; + int rc = 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + (*packet_size) = 0; + rc = ecryptfs_find_auth_tok_for_sig( + &auth_tok_key, + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( + &s->desc.tfm, + &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + /* Plus one for the \0 separator between the random prefix + * and the plaintext filename */ + s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); + s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); + if ((s->block_aligned_filename_size % s->block_size) != 0) { + s->num_rand_bytes += (s->block_size + - (s->block_aligned_filename_size + % s->block_size)); + s->block_aligned_filename_size = (s->num_rand_bytes + + filename_size); + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random characters, a \0 + * separator, and then the filename */ + s->max_packet_size = (1 /* Tag 70 identifier */ + + 3 /* Max Tag 70 packet size */ + + ECRYPTFS_SIG_SIZE /* FNEK sig */ + + 1 /* Cipher identifier */ + + s->block_aligned_filename_size); + if (dest == NULL) { + (*packet_size) = s->max_packet_size; + goto out_unlock; + } + if (s->max_packet_size > (*remaining_bytes)) { + 
printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " + "[%zd] available\n", __func__, s->max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out_unlock; + } + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->block_aligned_filename) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kzalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + s->i = 0; + dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[s->i], + (ECRYPTFS_SIG_SIZE + + 1 /* Cipher code */ + + s->block_aligned_filename_size), + &s->packet_size_len); + if (rc) { + printk(KERN_ERR "%s: Error generating tag 70 packet " + "header; cannot generate packet length; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + s->i += s->packet_size_len; + ecryptfs_from_hex(&dest[s->i], + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_SIG_SIZE); + s->i += ECRYPTFS_SIG_SIZE; + s->cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (s->cipher_code == 0) { + printk(KERN_WARNING "%s: Unable to generate code for " + "cipher [%s] with key bytes [%zd]\n", __func__, + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + rc = -EINVAL; + goto out_free_unlock; + } + dest[s->i++] = s->cipher_code; + /* TODO: Support other key modules than passphrase for + * filename encryption */ + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } + sg_init_one( + &s->hash_sg, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes); + s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(s->hash_desc.tfm)) { + rc = PTR_ERR(s->hash_desc.tfm); + printk(KERN_ERR "%s: Error attempting to " + "allocate hash crypto context; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update( + &s->hash_desc, &s->hash_sg, + s->auth_tok->token.password.session_key_encryption_key_bytes); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { + s->block_aligned_filename[s->j] = + s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; + if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) + == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { + sg_init_one(&s->hash_sg, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, + ECRYPTFS_TAG_70_DIGEST_SIZE); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; " + "rc = [%d]\n", __func__, 
rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + memcpy(s->hash, s->tmp_hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + } + if (s->block_aligned_filename[s->j] == '\0') + s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; + } + memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, + filename_size); + rc = virt_to_scatterlist(s->block_aligned_filename, + s->block_aligned_filename_size, s->src_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert filename memory to scatterlist; rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, + s->dst_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_release_free_unlock; + } + /* The characters in the first block effectively do the job + * of the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_release_free_unlock; + } + rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + s->i += s->block_aligned_filename_size; + (*packet_size) = s->i; + (*remaining_bytes) -= (*packet_size); +out_release_free_unlock: + crypto_free_hash(s->hash_desc.tfm); +out_free_unlock: + kzfree(s->block_aligned_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + kfree(s); + return rc; +} + +struct ecryptfs_parse_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t parsed_tag_70_packet_size; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + struct mutex *tfm_mutex; + char *decrypted_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; + struct blkcipher_desc desc; + char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; +}; + +/** + * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * @filename: This function kmalloc's the memory for the filename + * @filename_size: This function sets this to the amount of memory + * kmalloc'd for the filename + * @packet_size: This function sets this to the the 
number of octets + * in the packet parsed + * @mount_crypt_stat: The mount-wide cryptographic context + * @data: The memory location containing the start of the tag 70 + * packet + * @max_packet_size: The maximum legal size of the packet to be parsed + * from @data + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size) +{ + struct ecryptfs_parse_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; + int rc = 0; + + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " + "at least [%d]\n", __func__, max_packet_size, + (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + rc = -EINVAL; + goto out; + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random numbers, a \0 + * separator, and then the filename */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { + printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " + "tag [0x%.2x]\n", __func__, + data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], + &s->parsed_tag_70_packet_size, + &s->packet_size_len); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out; + } + s->block_aligned_filename_size = (s->parsed_tag_70_packet_size + - ECRYPTFS_SIG_SIZE - 1); + if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) + > max_packet_size) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " + "size is [%zd]\n", __func__, max_packet_size, + (1 + s->packet_size_len + 1 + + s->block_aligned_filename_size)); + rc = -EINVAL; + goto out; + } + (*packet_size) += s->packet_size_len; + ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], + ECRYPTFS_SIG_SIZE); + s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + (*packet_size) += ECRYPTFS_SIG_SIZE; + s->cipher_code = data[(*packet_size)++]; + rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); + if (rc) { + printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", + __func__, s->cipher_code); + goto out; + } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + &s->tfm_mutex, + s->cipher_string); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + s->cipher_string, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + rc = virt_to_scatterlist(&data[(*packet_size)], + s->block_aligned_filename_size, 
s->src_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_unlock; + } + (*packet_size) += s->block_aligned_filename_size; + s->decrypted_filename = kmalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->decrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + rc = virt_to_scatterlist(s->decrypted_filename, + s->block_aligned_filename_size, s->dst_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert decrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_free_unlock; + } + /* The characters in the first block effectively do the job of + * the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + /* TODO: Support other key modules than passphrase for + * filename encryption */ + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. 
s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_free_unlock; + } + rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_free_unlock; + } + s->i = 0; + while (s->decrypted_filename[s->i] != '\0' + && s->i < s->block_aligned_filename_size) + s->i++; + if (s->i == s->block_aligned_filename_size) { + printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " + "find valid separator between random characters and " + "the filename\n", __func__); + rc = -EINVAL; + goto out_free_unlock; + } + s->i++; + (*filename_size) = (s->block_aligned_filename_size - s->i); + if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { + printk(KERN_WARNING "%s: Filename size is [%zd], which is " + "invalid\n", __func__, (*filename_size)); + rc = -EINVAL; + goto out_free_unlock; + } + (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); + if (!(*filename)) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + ((*filename_size) + 1)); + rc = -ENOMEM; + goto out_free_unlock; + } + memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); + (*filename)[(*filename_size)] = '\0'; +out_free_unlock: + kfree(s->decrypted_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (rc) { + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + } + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + kfree(s); + return rc; +} + +static int +ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) +{ + int rc = 0; + + (*sig) = NULL; + switch (auth_tok->token_type) { + case ECRYPTFS_PASSWORD: + (*sig) = auth_tok->token.password.signature; + break; + case ECRYPTFS_PRIVATE_KEY: + (*sig) = auth_tok->token.private_key.signature; + break; + default: + printk(KERN_ERR "Cannot get sig for auth_tok of type [%d]\n", + auth_tok->token_type); + rc = -EINVAL; + } + return rc; +} + +/** + * decrypt_pki_encrypted_session_key - Decrypt the session key with the given auth_tok. + * @auth_tok: The key authentication token used to decrypt the session key + * @crypt_stat: The cryptographic context + * + * Returns zero on success; non-zero error otherwise. 
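+ *
+ * Decryption of the encrypted session key is delegated to userspace:
+ * this routine writes a tag 64 packet naming the key signature, sends
+ * it to the daemon via ecryptfs_send_message(), and parses the tag 65
+ * response that carries the decrypted session key back into the
+ * auth_tok.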
+ */ +static int +decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + u8 cipher_code = 0; + struct ecryptfs_msg_ctx *msg_ctx; + struct ecryptfs_message *msg = NULL; + char *auth_tok_sig; + char *payload; + size_t payload_len; + int rc; + + rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); + if (rc) { + printk(KERN_ERR "Unrecognized auth tok type: [%d]\n", + auth_tok->token_type); + goto out; + } + rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), + &payload, &payload_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); + goto out; + } + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 65 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_65_packet(&(auth_tok->session_key), + &cipher_code, msg); + if (rc) { + printk(KERN_ERR "Failed to parse tag 65 packet; rc = [%d]\n", + rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->key_size = auth_tok->session_key.decrypted_key_size; + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code); + if (rc) { + ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n", + cipher_code); + goto out; + } + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + if (msg) + kfree(msg); + return rc; +} + +static void wipe_auth_tok_list(struct list_head *auth_tok_list_head) +{ + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; + + list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp, + auth_tok_list_head, list) { + list_del(&auth_tok_list_item->list); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + } +} + +struct kmem_cache *ecryptfs_auth_tok_list_item_cache; + +/** + * parse_tag_1_packet + * @crypt_stat: The cryptographic context to modify based on packet contents + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the + * end of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: The maximum allowable packet size + * + * Returns zero on success; non-zero on error.
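+ *
+ * A tag 1 packet carries a session key encrypted with a public key;
+ * the resulting token is typed ECRYPTFS_PRIVATE_KEY so that the key is
+ * later decrypted through the userspace daemon (see
+ * decrypt_pki_encrypted_session_key()).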
+ */ +static int +parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + /** + * This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 + * + * Tag 1 identifier (1 byte) + * Max Tag 1 packet size (max 3 bytes) + * Version (1 byte) + * Key identifier (8 bytes; ECRYPTFS_SIG_SIZE) + * Cipher identifier (1 byte) + * Encrypted key size (arbitrary) + * + * 12 bytes minimum packet size + */ + if (unlikely(max_packet_size < 12)) { + printk(KERN_ERR "Invalid max packet size; must be >=12\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_1_PACKET_TYPE) { + printk(KERN_ERR "Enter w/ first byte != 0x%.2x\n", + ECRYPTFS_TAG_1_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, + GFP_KERNEL); + if (!auth_tok_list_item) { + printk(KERN_ERR "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out_free; + } + if (unlikely(body_size < (ECRYPTFS_SIG_SIZE + 2))) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + printk(KERN_WARNING "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + if (unlikely(data[(*packet_size)++] != 0x03)) { + printk(KERN_WARNING "Unknown version number [%d]\n", + data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + ecryptfs_to_hex((*new_auth_tok)->token.private_key.signature, + &data[(*packet_size)], ECRYPTFS_SIG_SIZE); + *packet_size += ECRYPTFS_SIG_SIZE; + /* This byte is skipped because the kernel does not need to + * know which public key encryption algorithm was used */ + (*packet_size)++; + (*new_auth_tok)->session_key.encrypted_key_size = + body_size - (ECRYPTFS_SIG_SIZE + 2); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 1 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + rc = -EINVAL; + goto out; + } + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2))); + (*packet_size) += (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token_type = ECRYPTFS_PRIVATE_KEY; + (*new_auth_tok)->flags = 0; + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + 
auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_3_packet + * @crypt_stat: The cryptographic context to modify based on packet + * contents. + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the end + * of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + /** + *This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 + * + * Tag 3 identifier (1 byte) + * Max Tag 3 packet size (max 3 bytes) + * Version (1 byte) + * Cipher code (1 byte) + * S2K specifier (1 byte) + * Hash identifier (1 byte) + * Salt (ECRYPTFS_SALT_SIZE) + * Hash iterations (1 byte) + * Encrypted key (arbitrary) + * + * (ECRYPTFS_SALT_SIZE + 7) minimum packet size + */ + if (max_packet_size < (ECRYPTFS_SALT_SIZE + 7)) { + printk(KERN_ERR "Max packet size too large\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) { + printk(KERN_ERR "First byte != 0x%.2x; invalid packet\n", + ECRYPTFS_TAG_3_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL); + if (!auth_tok_list_item) { + printk(KERN_ERR "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Error parsing packet length; rc = [%d]\n", + rc); + goto out_free; + } + if (unlikely(body_size < (ECRYPTFS_SALT_SIZE + 5))) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + printk(KERN_ERR "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + (*new_auth_tok)->session_key.encrypted_key_size = + (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } + if (unlikely(data[(*packet_size)++] != 0x04)) { + printk(KERN_WARNING "Unknown version number [%d]\n", + data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, + (u16)data[(*packet_size)]); + if (rc) + goto out_free; + /* A little extra work to differentiate among the AES key + * sizes; see RFC2440 */ + switch(data[(*packet_size)++]) { + case 
RFC2440_CIPHER_AES_192: + crypt_stat->key_size = 24; + break; + default: + crypt_stat->key_size = + (*new_auth_tok)->session_key.encrypted_key_size; + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + goto out_free; + if (unlikely(data[(*packet_size)++] != 0x03)) { + printk(KERN_WARNING "Only S2K ID 3 is currently supported\n"); + rc = -ENOSYS; + goto out_free; + } + /* TODO: finish the hash mapping */ + switch (data[(*packet_size)++]) { + case 0x01: /* See RFC2440 for these numbers and their mappings */ + /* Choose MD5 */ + memcpy((*new_auth_tok)->token.password.salt, + &data[(*packet_size)], ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; + /* This conversion was taken straight from RFC2440 */ + (*new_auth_tok)->token.password.hash_iterations = + ((u32) 16 + (data[(*packet_size)] & 15)) + << ((data[(*packet_size)] >> 4) + 6); + (*packet_size)++; + /* Friendly reminder: + * (*new_auth_tok)->session_key.encrypted_key_size = + * (body_size - (ECRYPTFS_SALT_SIZE + 5)); */ + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], + (*new_auth_tok)->session_key.encrypted_key_size); + (*packet_size) += + (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token.password.hash_algo = 0x01; /* MD5 */ + break; + default: + ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: " + "[%d]\n", data[(*packet_size) - 1]); + rc = -ENOSYS; + goto out_free; + } + (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD; + /* TODO: Parametarize; we might actually want userspace to + * decrypt the session key. */ + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_11_packet + * @data: The raw bytes of the packet + * @contents: This function writes the data contents of the literal + * packet into this memory location + * @max_contents_bytes: The maximum number of bytes that this function + * is allowed to write into contents + * @tag_11_contents_size: This function writes the size of the parsed + * contents into this memory location; zero on + * error + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_11_packet(unsigned char *data, unsigned char *contents, + size_t max_contents_bytes, size_t *tag_11_contents_size, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 11 + * + * Tag 11 identifier (1 byte) + * Max Tag 11 packet size (max 3 bytes) + * Binary format specifier (1 byte) + * Filename length (1 byte) + * Filename ("_CONSOLE") (8 bytes) + * Modification date (4 bytes) + * Literal data (arbitrary) + * + * We need at least 16 bytes of data for the packet to even be + * valid. 
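+ * (1 tag byte + 1 length byte + 1 format byte + 1 filename-length byte
+ * + 8 filename bytes + 4 date bytes = 16, assuming the smallest
+ * possible length field.)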
+ */ + if (max_packet_size < 16) { + printk(KERN_ERR "Maximum packet size too small\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) { + printk(KERN_WARNING "Invalid tag 11 packet format\n"); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Invalid tag 11 packet format\n"); + goto out; + } + if (body_size < 14) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out; + } + (*packet_size) += length_size; + (*tag_11_contents_size) = (body_size - 14); + if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) { + printk(KERN_ERR "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != 0x62) { + printk(KERN_WARNING "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != 0x08) { + printk(KERN_WARNING "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + (*packet_size) += 12; /* Ignore filename and modification date */ + memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size)); + (*packet_size) += (*tag_11_contents_size); +out: + if (rc) { + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + } + return rc; +} + +int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + char *sig) +{ + int rc = 0; + + (*auth_tok_key) = request_key(&key_type_user, sig, NULL); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } + down_write(&(*auth_tok_key)->sem); + rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); + if (rc) { + up_write(&(*auth_tok_key)->sem); + key_put(*auth_tok_key); + (*auth_tok_key) = NULL; + goto out; + } +out: + return rc; +} + +/** + * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 
+ * @auth_tok: The passphrase authentication token to use to encrypt the FEK + * @crypt_stat: The cryptographic context + * + * Returns zero on success; non-zero error otherwise + */ +static int +decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + struct scatterlist dst_sg[2]; + struct scatterlist src_sg[2]; + struct mutex *tfm_mutex; + struct blkcipher_desc desc = { + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk( + KERN_DEBUG, "Session key encryption key (size [%d]):\n", + auth_tok->token.password.session_key_encryption_key_bytes); + ecryptfs_dump_hex( + auth_tok->token.password.session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key_bytes); + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + crypt_stat->cipher); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + goto out; + } + rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size, + src_sg, 2); + if (rc < 1 || rc > 2) { + printk(KERN_ERR "Internal error whilst attempting to convert " + "auth_tok->session_key.encrypted_key to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "auth_tok->session_key.encrypted_key_size = [%d]\n", rc, + auth_tok->session_key.encrypted_key_size); + goto out; + } + auth_tok->session_key.decrypted_key_size = + auth_tok->session_key.encrypted_key_size; + rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size, + dst_sg, 2); + if (rc < 1 || rc > 2) { + printk(KERN_ERR "Internal error whilst attempting to convert " + "auth_tok->session_key.decrypted_key to scatterlist; " + "expected rc = 1; got rc = [%d]\n", rc); + goto out; + } + mutex_lock(tfm_mutex); + rc = crypto_blkcipher_setkey( + desc.tfm, auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + if (unlikely(rc < 0)) { + mutex_unlock(tfm_mutex); + printk(KERN_ERR "Error setting key for crypto context\n"); + rc = -EINVAL; + goto out; + } + rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg, + auth_tok->session_key.encrypted_key_size); + mutex_unlock(tfm_mutex); + if (unlikely(rc)) { + printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + return rc; +} + +/** + * ecryptfs_parse_packet_set + * @crypt_stat: The cryptographic context + * @src: Virtual address of region of memory containing the packets + * @ecryptfs_dentry: The eCryptfs dentry associated with the packet set + * + * Get crypt_stat to have the file's session key if the requisite key + * is available to decrypt the session key. + * + * Returns Zero if a valid authentication token was retrieved and + * processed; negative value for file not encrypted or for error + * conditions. 
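+ *
+ * Each tag 3 (passphrase) or tag 1 (public key) packet found in the
+ * header becomes a candidate authentication token; the candidates are
+ * then tried in turn until one of them successfully decrypts the
+ * session key.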
+ */ +int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, + struct dentry *ecryptfs_dentry) +{ + size_t i = 0; + size_t found_auth_tok; + size_t next_packet_is_auth_tok_packet; + struct list_head auth_tok_list; + struct ecryptfs_auth_tok *matching_auth_tok; + struct ecryptfs_auth_tok *candidate_auth_tok; + char *candidate_auth_tok_sig; + size_t packet_size; + struct ecryptfs_auth_tok *new_auth_tok; + unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE]; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t tag_11_contents_size; + size_t tag_11_packet_size; + struct key *auth_tok_key = NULL; + int rc = 0; + + INIT_LIST_HEAD(&auth_tok_list); + /* Parse the header to find as many packets as we can; these will be + * added the our &auth_tok_list */ + next_packet_is_auth_tok_packet = 1; + while (next_packet_is_auth_tok_packet) { + size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i); + + switch (src[i]) { + case ECRYPTFS_TAG_3_PACKET_TYPE: + rc = parse_tag_3_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + rc = parse_tag_11_packet((unsigned char *)&src[i], + sig_tmp_space, + ECRYPTFS_SIG_SIZE, + &tag_11_contents_size, + &tag_11_packet_size, + max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "No valid " + "(ecryptfs-specific) literal " + "packet containing " + "authentication token " + "signature found after " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += tag_11_packet_size; + if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { + ecryptfs_printk(KERN_ERR, "Expected " + "signature of size [%d]; " + "read size [%zd]\n", + ECRYPTFS_SIG_SIZE, + tag_11_contents_size); + rc = -EIO; + goto out_wipe_list; + } + ecryptfs_to_hex(new_auth_tok->token.password.signature, + sig_tmp_space, tag_11_contents_size); + new_auth_tok->token.password.signature[ + ECRYPTFS_PASSWORD_SIG_SIZE] = '\0'; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_1_PACKET_TYPE: + rc = parse_tag_1_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 1 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_11_PACKET_TYPE: + ecryptfs_printk(KERN_WARNING, "Invalid packet set " + "(Tag 11 not allowed by itself)\n"); + rc = -EIO; + goto out_wipe_list; + break; + default: + ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " + "of the file header; hex value of " + "character is [0x%.2x]\n", i, src[i]); + next_packet_is_auth_tok_packet = 0; + } + } + if (list_empty(&auth_tok_list)) { + printk(KERN_ERR "The lower file appears to be a non-encrypted " + "eCryptfs file; this is not supported in this version " + "of the eCryptfs kernel module\n"); + rc = -EINVAL; + goto out; + } + /* auth_tok_list contains the set of authentication tokens + * parsed from the metadata. We need to find a matching + * authentication token that has the secret component(s) + * necessary to decrypt the EFEK in the auth_tok parsed from + * the metadata. There may be several potential matches, but + * just one will be sufficient to decrypt to get the FEK. 
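+ * If decryption with one candidate fails, that candidate is removed
+ * from the list and the search below restarts with the next match.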
*/ +find_next_matching_auth_tok: + found_auth_tok = 0; + list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { + candidate_auth_tok = &auth_tok_list_item->auth_tok; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, + "Considering cadidate auth tok:\n"); + ecryptfs_dump_auth_tok(candidate_auth_tok); + } + rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig, + candidate_auth_tok); + if (rc) { + printk(KERN_ERR + "Unrecognized candidate auth tok type: [%d]\n", + candidate_auth_tok->token_type); + rc = -EINVAL; + goto out_wipe_list; + } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &matching_auth_tok, + crypt_stat->mount_crypt_stat, + candidate_auth_tok_sig); + if (!rc) { + found_auth_tok = 1; + goto found_matching_auth_tok; + } + } + if (!found_auth_tok) { + ecryptfs_printk(KERN_ERR, "Could not find a usable " + "authentication token\n"); + rc = -EIO; + goto out_wipe_list; + } +found_matching_auth_tok: + if (candidate_auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + memcpy(&(candidate_auth_tok->token.private_key), + &(matching_auth_tok->token.private_key), + sizeof(struct ecryptfs_private_key)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, + crypt_stat); + } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { + memcpy(&(candidate_auth_tok->token.password), + &(matching_auth_tok->token.password), + sizeof(struct ecryptfs_password)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = decrypt_passphrase_encrypted_session_key( + candidate_auth_tok, crypt_stat); + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = -EINVAL; + } + if (rc) { + struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; + + ecryptfs_printk(KERN_WARNING, "Error decrypting the " + "session key for authentication token with sig " + "[%.*s]; rc = [%d]. 
Removing auth tok " + "candidate from the list and searching for " + "the next match.\n", ECRYPTFS_SIG_SIZE_HEX, + candidate_auth_tok_sig, rc); + list_for_each_entry_safe(auth_tok_list_item, + auth_tok_list_item_tmp, + &auth_tok_list, list) { + if (candidate_auth_tok + == &auth_tok_list_item->auth_tok) { + list_del(&auth_tok_list_item->list); + kmem_cache_free( + ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + goto find_next_matching_auth_tok; + } + } + BUG(); + } + rc = ecryptfs_compute_root_iv(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error computing " + "the root IV\n"); + goto out_wipe_list; + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error initializing crypto " + "context for cipher [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + } +out_wipe_list: + wipe_auth_tok_list(&auth_tok_list); +out: + return rc; +} + +static int +pki_encrypt_session_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec) +{ + struct ecryptfs_msg_ctx *msg_ctx = NULL; + char *payload = NULL; + size_t payload_len; + struct ecryptfs_message *msg; + int rc; + + rc = write_tag_66_packet(auth_tok->token.private_key.signature, + ecryptfs_code_for_cipher_string( + crypt_stat->cipher, + crypt_stat->key_size), + crypt_stat, &payload, &payload_len); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); + goto out; + } + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 67 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_67_packet(key_rec, msg); + if (rc) + ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); + kfree(msg); +out: + kfree(payload); + return rc; +} +/** + * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet + * @dest: Buffer into which to write the packet + * @remaining_bytes: Maximum number of bytes that can be writtn + * @auth_tok_key: The authentication token key to unlock and put when done with + * @auth_tok + * @auth_tok: The authentication token used for generating the tag 1 packet + * @crypt_stat: The cryptographic context + * @key_rec: The key record struct for the tag 1 packet + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. 
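+ *
+ * If the session key has not already been encrypted, it is handed to
+ * the userspace key module via pki_encrypt_session_key() before the
+ * packet is assembled.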
+ */ +static int +write_tag_1_packet(char *dest, size_t *remaining_bytes, + struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + size_t packet_size_length; + size_t max_packet_size; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.private_key.signature, + ECRYPTFS_SIG_SIZE); + encrypted_session_key_valid = 0; + for (i = 0; i < crypt_stat->key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + goto encrypted_session_key_set; + } + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + auth_tok->token.private_key.key_size; + rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat, + key_rec); + if (rc) { + printk(KERN_ERR "Failed to encrypt session key via a key " + "module; rc = [%d]\n", rc); + goto out; + } + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Encrypted key:\n"); + ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); + } +encrypted_session_key_set: + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 */ + max_packet_size = (1 /* Tag 1 identifier */ + + 3 /* Max Tag 1 packet size */ + + 1 /* Version */ + + ECRYPTFS_SIG_SIZE /* Key identifier */ + + 1 /* Cipher identifier */ + + key_rec->enc_key_size); /* Encrypted key size */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet length larger than maximum allowable; " + "need up to [%td] bytes, but there are only [%td] " + "available\n", max_packet_size, (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet " + "header; cannot generate packet length\n"); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x03; /* version 3 */ + memcpy(&dest[(*packet_size)], key_rec->sig, ECRYPTFS_SIG_SIZE); + (*packet_size) += ECRYPTFS_SIG_SIZE; + dest[(*packet_size)++] = RFC2440_CIPHER_RSA; + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + else + (*remaining_bytes) -= (*packet_size); + return rc; +} + +/** + * write_tag_11_packet + * @dest: Target into which Tag 11 packet is to be written + * @remaining_bytes: Maximum packet length + * @contents: Byte array of contents to copy in + * @contents_length: Number of bytes in contents + * @packet_length: Length of the Tag 11 packet written; zero on error + * + * Returns zero on success; non-zero on error. 
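+ *
+ * Tag 11 is an OpenPGP literal data packet; eCryptfs uses it to carry
+ * the signature of the authentication token that follows a tag 3
+ * packet, with "_CONSOLE" as the filename and a zeroed modification
+ * date.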
+ */ +static int +write_tag_11_packet(char *dest, size_t *remaining_bytes, char *contents, + size_t contents_length, size_t *packet_length) +{ + size_t packet_size_length; + size_t max_packet_size; + int rc = 0; + + (*packet_length) = 0; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 11 */ + max_packet_size = (1 /* Tag 11 identifier */ + + 3 /* Max Tag 11 packet size */ + + 1 /* Binary format specifier */ + + 1 /* Filename length */ + + 8 /* Filename ("_CONSOLE") */ + + 4 /* Modification date */ + + contents_length); /* Literal data */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet length larger than maximum allowable; " + "need up to [%td] bytes, but there are only [%td] " + "available\n", max_packet_size, (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[(*packet_length)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + printk(KERN_ERR "Error generating tag 11 packet header; cannot " + "generate packet length. rc = [%d]\n", rc); + goto out; + } + (*packet_length) += packet_size_length; + dest[(*packet_length)++] = 0x62; /* binary data format specifier */ + dest[(*packet_length)++] = 8; + memcpy(&dest[(*packet_length)], "_CONSOLE", 8); + (*packet_length) += 8; + memset(&dest[(*packet_length)], 0x00, 4); + (*packet_length) += 4; + memcpy(&dest[(*packet_length)], contents, contents_length); + (*packet_length) += contents_length; + out: + if (rc) + (*packet_length) = 0; + else + (*remaining_bytes) -= (*packet_length); + return rc; +} + +/** + * write_tag_3_packet + * @dest: Buffer into which to write the packet + * @remaining_bytes: Maximum number of bytes that can be written + * @auth_tok: Authentication token + * @crypt_stat: The cryptographic context + * @key_rec: encrypted key + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. 
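+ *
+ * The FEK held in crypt_stat->key is encrypted with the session key
+ * encryption key derived from the passphrase, and the result is
+ * written out along with the cipher code, S2K specifier, hash
+ * identifier, salt, and hash-iteration byte.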
+ */ +static int +write_tag_3_packet(char *dest, size_t *remaining_bytes, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + struct scatterlist dst_sg[2]; + struct scatterlist src_sg[2]; + struct mutex *tfm_mutex = NULL; + u8 cipher_code; + size_t packet_size_length; + size_t max_packet_size; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + crypt_stat->mount_crypt_stat; + struct blkcipher_desc desc = { + .tfm = NULL, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE); + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + crypt_stat->cipher); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + goto out; + } + if (mount_crypt_stat->global_default_cipher_key_size == 0) { + struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm); + + printk(KERN_WARNING "No key size specified at mount; " + "defaulting to [%d]\n", alg->max_keysize); + mount_crypt_stat->global_default_cipher_key_size = + alg->max_keysize; + } + if (crypt_stat->key_size == 0) + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + crypt_stat->key_size; + if (crypt_stat->key_size == 24 + && strcmp("aes", crypt_stat->cipher) == 0) { + memset((crypt_stat->key + 24), 0, 8); + auth_tok->session_key.encrypted_key_size = 32; + } else + auth_tok->session_key.encrypted_key_size = crypt_stat->key_size; + key_rec->enc_key_size = + auth_tok->session_key.encrypted_key_size; + encrypted_session_key_valid = 0; + for (i = 0; i < auth_tok->session_key.encrypted_key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " + "using auth_tok->session_key.encrypted_key, " + "where key_rec->enc_key_size = [%zd]\n", + key_rec->enc_key_size); + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + key_rec->enc_key_size); + goto encrypted_session_key_set; + } + if (auth_tok->token.password.flags & + ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET) { + ecryptfs_printk(KERN_DEBUG, "Using previously generated " + "session key encryption key of size [%d]\n", + auth_tok->token.password. + session_key_encryption_key_bytes); + memcpy(session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + ecryptfs_printk(KERN_DEBUG, + "Cached session key " "encryption key: \n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n"); + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size, + src_sg, 2); + if (rc < 1 || rc > 2) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat session key; expected rc = 1; " + "got rc = [%d]. 
key_rec->enc_key_size = [%zd]\n", + rc, key_rec->enc_key_size); + rc = -ENOMEM; + goto out; + } + rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size, + dst_sg, 2); + if (rc < 1 || rc > 2) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat encrypted session key; " + "expected rc = 1; got rc = [%d]. " + "key_rec->enc_key_size = [%zd]\n", rc, + key_rec->enc_key_size); + rc = -ENOMEM; + goto out; + } + mutex_lock(tfm_mutex); + rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key, + crypt_stat->key_size); + if (rc < 0) { + mutex_unlock(tfm_mutex); + ecryptfs_printk(KERN_ERR, "Error setting key for crypto " + "context; rc = [%d]\n", rc); + goto out; + } + rc = 0; + ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", + crypt_stat->key_size); + rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, + (*key_rec).enc_key_size); + mutex_unlock(tfm_mutex); + if (rc) { + printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc); + goto out; + } + ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n", + key_rec->enc_key_size); + ecryptfs_dump_hex(key_rec->enc_key, + key_rec->enc_key_size); + } +encrypted_session_key_set: + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 */ + max_packet_size = (1 /* Tag 3 identifier */ + + 3 /* Max Tag 3 packet size */ + + 1 /* Version */ + + 1 /* Cipher code */ + + 1 /* S2K specifier */ + + 1 /* Hash identifier */ + + ECRYPTFS_SALT_SIZE /* Salt */ + + 1 /* Hash iterations */ + + key_rec->enc_key_size); /* Encrypted key size */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet too large; need up to [%td] bytes, but " + "there are only [%td] available\n", max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE; + /* Chop off the Tag 3 identifier(1) and Tag 3 packet size(3) + * to get the number of octets in the actual Tag 3 packet */ + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + printk(KERN_ERR "Error generating tag 3 packet header; cannot " + "generate packet length. 
rc = [%d]\n", rc); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x04; /* version 4 */ + /* TODO: Break from RFC2440 so that arbitrary ciphers can be + * specified with strings */ + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, + crypt_stat->key_size); + if (cipher_code == 0) { + ecryptfs_printk(KERN_WARNING, "Unable to generate code for " + "cipher [%s]\n", crypt_stat->cipher); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = cipher_code; + dest[(*packet_size)++] = 0x03; /* S2K */ + dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */ + memcpy(&dest[(*packet_size)], auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */ + dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */ + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + else + (*remaining_bytes) -= (*packet_size); + return rc; +} + +struct kmem_cache *ecryptfs_key_record_cache; + +/** + * ecryptfs_generate_key_packet_set + * @dest_base: Virtual address from which to write the key record set + * @crypt_stat: The cryptographic context from which the + * authentication tokens will be retrieved + * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat + * for the global parameters + * @len: The amount written + * @max: The maximum amount of data allowed to be written + * + * Generates a key packet set and writes it to the virtual address + * passed in. + * + * Returns zero on success; non-zero on error. + */ +int +ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, size_t *len, + size_t max) +{ + struct ecryptfs_auth_tok *auth_tok; + struct key *auth_tok_key = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + size_t written; + struct ecryptfs_key_record *key_rec; + struct ecryptfs_key_sig *key_sig; + int rc = 0; + + (*len) = 0; + mutex_lock(&crypt_stat->keysig_list_mutex); + key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL); + if (!key_rec) { + rc = -ENOMEM; + goto out; + } + list_for_each_entry(key_sig, &crypt_stat->keysig_list, + crypt_stat_list) { + memset(key_rec, 0, sizeof(*key_rec)); + rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key, + &auth_tok, + mount_crypt_stat, + key_sig->keysig); + if (rc) { + printk(KERN_WARNING "Unable to retrieve auth tok with " + "sig = [%s]\n", key_sig->keysig); + rc = process_find_global_auth_tok_for_sig_err(rc); + goto out_free; + } + if (auth_tok->token_type == ECRYPTFS_PASSWORD) { + rc = write_tag_3_packet((dest_base + (*len)), + &max, auth_tok, + crypt_stat, key_rec, + &written); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing tag 3 packet\n"); + goto out_free; + } + (*len) += written; + /* Write auth tok signature packet */ + rc = write_tag_11_packet((dest_base + (*len)), &max, + key_rec->sig, + ECRYPTFS_SIG_SIZE, &written); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error writing " + "auth tok signature packet\n"); + goto out_free; + } + (*len) += written; + } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + rc = write_tag_1_packet(dest_base + (*len), &max, + auth_tok_key, auth_tok, + crypt_stat, key_rec, &written); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing 
tag 1 packet\n"); + goto out_free; + } + (*len) += written; + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + ecryptfs_printk(KERN_WARNING, "Unsupported " + "authentication token type\n"); + rc = -EINVAL; + goto out_free; + } + } + if (likely(max > 0)) { + dest_base[(*len)] = 0x00; + } else { + ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); + rc = -EIO; + } +out_free: + kmem_cache_free(ecryptfs_key_record_cache, key_rec); +out: + if (rc) + (*len) = 0; + mutex_unlock(&crypt_stat->keysig_list_mutex); + return rc; +} + +struct kmem_cache *ecryptfs_key_sig_cache; + +int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) +{ + struct ecryptfs_key_sig *new_key_sig; + + new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); + if (!new_key_sig) { + printk(KERN_ERR + "Error allocating from ecryptfs_key_sig_cache\n"); + return -ENOMEM; + } + memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + /* Caller must hold keysig_list_mutex */ + list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); + + return 0; +} + +struct kmem_cache *ecryptfs_global_auth_tok_cache; + +int +ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig, u32 global_auth_tok_flags) +{ + struct ecryptfs_global_auth_tok *new_auth_tok; + int rc = 0; + + new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache, + GFP_KERNEL); + if (!new_auth_tok) { + rc = -ENOMEM; + printk(KERN_ERR "Error allocating from " + "ecryptfs_global_auth_tok_cache\n"); + goto out; + } + memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; + new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_add(&new_auth_tok->mount_crypt_stat_list, + &mount_crypt_stat->global_auth_tok_list); + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); +out: + return rc; +} + diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c new file mode 100644 index 00000000..69f994a7 --- /dev/null +++ b/fs/ecryptfs/kthread.c @@ -0,0 +1,197 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_open_req_cache; + +static struct ecryptfs_kthread_ctl { +#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 + u32 flags; + struct mutex mux; + struct list_head req_list; + wait_queue_head_t wait; +} ecryptfs_kthread_ctl; + +static struct task_struct *ecryptfs_kthread; + +/** + * ecryptfs_threadfn + * @ignored: ignored + * + * The eCryptfs kernel thread that has the responsibility of getting + * the lower file with RW permissions. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_threadfn(void *ignored) +{ + set_freezable(); + while (1) { + struct ecryptfs_open_req *req; + + wait_event_freezable( + ecryptfs_kthread_ctl.wait, + (!list_empty(&ecryptfs_kthread_ctl.req_list) + || kthread_should_stop())); + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + mutex_unlock(&ecryptfs_kthread_ctl.mux); + goto out; + } + while (!list_empty(&ecryptfs_kthread_ctl.req_list)) { + req = list_first_entry(&ecryptfs_kthread_ctl.req_list, + struct ecryptfs_open_req, + kthread_ctl_list); + mutex_lock(&req->mux); + list_del(&req->kthread_ctl_list); + if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) { + dget(req->lower_dentry); + mntget(req->lower_mnt); + (*req->lower_file) = dentry_open( + req->lower_dentry, req->lower_mnt, + (O_RDWR | O_LARGEFILE), current_cred()); + req->flags |= ECRYPTFS_REQ_PROCESSED; + } + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + } +out: + return 0; +} + +int __init ecryptfs_init_kthread(void) +{ + int rc = 0; + + mutex_init(&ecryptfs_kthread_ctl.mux); + init_waitqueue_head(&ecryptfs_kthread_ctl.wait); + INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list); + ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL, + "ecryptfs-kthread"); + if (IS_ERR(ecryptfs_kthread)) { + rc = PTR_ERR(ecryptfs_kthread); + printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]" + "\n", __func__, rc); + } + return rc; +} + +void ecryptfs_destroy_kthread(void) +{ + struct ecryptfs_open_req *req; + + mutex_lock(&ecryptfs_kthread_ctl.mux); + ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; + list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { + mutex_lock(&req->mux); + req->flags |= ECRYPTFS_REQ_ZOMBIE; + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + kthread_stop(ecryptfs_kthread); + wake_up(&ecryptfs_kthread_ctl.wait); +} + +/** + * ecryptfs_privileged_open + * @lower_file: Result of dentry_open by root on lower dentry + * @lower_dentry: Lower dentry for file to open + * @lower_mnt: Lower vfsmount for file to open + * + * This function gets a r/w file opened againt the lower dentry. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt, + const struct cred *cred) +{ + struct ecryptfs_open_req *req; + int flags = O_LARGEFILE; + int rc = 0; + + /* Corresponding dput() and mntput() are done when the + * lower file is fput() when all eCryptfs files for the inode are + * released. */ + dget(lower_dentry); + mntget(lower_mnt); + flags |= IS_RDONLY(lower_dentry->d_inode) ? 
O_RDONLY : O_RDWR; + (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred); + if (!IS_ERR(*lower_file)) + goto out; + if (flags & O_RDONLY) { + rc = PTR_ERR((*lower_file)); + goto out; + } + req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + goto out; + } + mutex_init(&req->mux); + req->lower_file = lower_file; + req->lower_dentry = lower_dentry; + req->lower_mnt = lower_mnt; + init_waitqueue_head(&req->wait); + req->flags = 0; + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + rc = -EIO; + mutex_unlock(&ecryptfs_kthread_ctl.mux); + printk(KERN_ERR "%s: We are in the middle of shutting down; " + "aborting privileged request to open lower file\n", + __func__); + goto out_free; + } + list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + mutex_unlock(&ecryptfs_kthread_ctl.mux); + wake_up(&ecryptfs_kthread_ctl.wait); + wait_event(req->wait, (req->flags != 0)); + mutex_lock(&req->mux); + BUG_ON(req->flags == 0); + if (req->flags & ECRYPTFS_REQ_DROPPED + || req->flags & ECRYPTFS_REQ_ZOMBIE) { + rc = -EIO; + printk(KERN_WARNING "%s: Privileged open request dropped\n", + __func__); + goto out_unlock; + } + if (IS_ERR(*req->lower_file)) + rc = PTR_ERR(*req->lower_file); +out_unlock: + mutex_unlock(&req->mux); +out_free: + kmem_cache_free(ecryptfs_open_req_cache, req); +out: + return rc; +} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c new file mode 100644 index 00000000..b4a6befb --- /dev/null +++ b/fs/ecryptfs/main.c @@ -0,0 +1,867 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * Module parameter that defines the ecryptfs_verbosity level. + */ +int ecryptfs_verbosity = 0; + +module_param(ecryptfs_verbosity, int, 0); +MODULE_PARM_DESC(ecryptfs_verbosity, + "Initial verbosity level (0 or 1; defaults to " + "0, which is Quiet)"); + +/** + * Module parameter that defines the number of message buffer elements + */ +unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; + +module_param(ecryptfs_message_buf_len, uint, 0); +MODULE_PARM_DESC(ecryptfs_message_buf_len, + "Number of message buffer elements"); + +/** + * Module parameter that defines the maximum guaranteed amount of time to wait + * for a response from ecryptfsd. 
The actual sleep time will be, more than + * likely, a small amount greater than this specified value, but only less if + * the message successfully arrives. + */ +signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; + +module_param(ecryptfs_message_wait_timeout, long, 0); +MODULE_PARM_DESC(ecryptfs_message_wait_timeout, + "Maximum number of seconds that an operation will " + "sleep while waiting for a message response from " + "userspace"); + +/** + * Module parameter that is an estimate of the maximum number of users + * that will be concurrently using eCryptfs. Set this to the right + * value to balance performance and memory use. + */ +unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; + +module_param(ecryptfs_number_of_users, uint, 0); +MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " + "concurrent users of eCryptfs"); + +void __ecryptfs_printk(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + if (fmt[1] == '7') { /* KERN_DEBUG */ + if (ecryptfs_verbosity >= 1) + vprintk(fmt, args); + } else + vprintk(fmt, args); + va_end(args); +} + +/** + * ecryptfs_init_lower_file + * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with + * the lower dentry and the lower mount set + * + * eCryptfs only ever keeps a single open file for every lower + * inode. All I/O operations to the lower inode occur through that + * file. When the first eCryptfs dentry that interposes with the first + * lower dentry for that inode is created, this function creates the + * lower file struct and associates it with the eCryptfs + * inode. When all eCryptfs files associated with the inode are released, the + * file is closed. + * + * The lower file will be opened with read/write permissions, if + * possible. Otherwise, it is opened read-only. + * + * This function does nothing if a lower file is already + * associated with the eCryptfs inode. 
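+ * The open itself goes through ecryptfs_privileged_open(), so if the
+ * caller's credentials cannot open the lower file read/write, the
+ * request is handed off to the eCryptfs kernel thread (kthread.c
+ * above), which retries the open with the kernel thread's credentials.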
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_lower_file(struct dentry *dentry, + struct file **lower_file) +{ + const struct cred *cred = current_cred(); + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + int rc; + + rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, + cred); + if (rc) { + printk(KERN_ERR "Error opening lower file " + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", lower_dentry, lower_mnt, rc); + (*lower_file) = NULL; + } + return rc; +} + +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + int count, rc = 0; + + inode_info = ecryptfs_inode_to_private(inode); + mutex_lock(&inode_info->lower_file_mutex); + count = atomic_inc_return(&inode_info->lower_file_count); + if (WARN_ON_ONCE(count < 1)) + rc = -EINVAL; + else if (count == 1) { + rc = ecryptfs_init_lower_file(dentry, + &inode_info->lower_file); + if (rc) + atomic_set(&inode_info->lower_file_count, 0); + } + mutex_unlock(&inode_info->lower_file_mutex); + return rc; +} + +void ecryptfs_put_lower_file(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, + &inode_info->lower_file_mutex)) { + fput(inode_info->lower_file); + inode_info->lower_file = NULL; + mutex_unlock(&inode_info->lower_file_mutex); + } +} + +enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, + ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, + ecryptfs_opt_ecryptfs_key_bytes, + ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, + ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, + ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, + ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, + ecryptfs_opt_err }; + +static const match_table_t tokens = { + {ecryptfs_opt_sig, "sig=%s"}, + {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, + {ecryptfs_opt_cipher, "cipher=%s"}, + {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, + {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, + {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, + {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, + {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, + {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, + {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, + {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, + {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, + {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, + {ecryptfs_opt_err, NULL} +}; + +static int ecryptfs_init_global_auth_toks( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + struct ecryptfs_auth_tok *auth_tok; + int rc = 0; + + list_for_each_entry(global_auth_tok, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + rc = ecryptfs_keyring_auth_tok_for_sig( + &global_auth_tok->global_auth_tok_key, &auth_tok, + global_auth_tok->sig); + if (rc) { + printk(KERN_ERR "Could not find valid key in user " + "session keyring for sig specified in mount " + "option: [%s]\n", global_auth_tok->sig); + global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; + goto out; + } else { + global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; + 
up_write(&(global_auth_tok->global_auth_tok_key)->sem); + } + } +out: + return rc; +} + +static void ecryptfs_init_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + memset((void *)mount_crypt_stat, 0, + sizeof(struct ecryptfs_mount_crypt_stat)); + INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); + mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); + mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; +} + +/** + * ecryptfs_parse_options + * @sb: The ecryptfs super block + * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid + * + * Parse mount options: + * debug=N - ecryptfs_verbosity level for debug output + * sig=XXX - description(signature) of the key to use + * + * Returns the dentry object of the lower-level (lower/interposed) + * directory; We want to mount our stackable file system on top of + * that lower directory. + * + * The signature of the key to use must be the description of a key + * already in the keyring. Mounting will fail if the key can not be + * found. + * + * Returns zero on success; non-zero on error + */ +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) +{ + char *p; + int rc = 0; + int sig_set = 0; + int cipher_name_set = 0; + int fn_cipher_name_set = 0; + int cipher_key_bytes; + int cipher_key_bytes_set = 0; + int fn_cipher_key_bytes; + int fn_cipher_key_bytes_set = 0; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &sbi->mount_crypt_stat; + substring_t args[MAX_OPT_ARGS]; + int token; + char *sig_src; + char *cipher_name_dst; + char *cipher_name_src; + char *fn_cipher_name_dst; + char *fn_cipher_name_src; + char *fnek_dst; + char *fnek_src; + char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; + + *check_ruid = 0; + + if (!options) { + rc = -EINVAL; + goto out; + } + ecryptfs_init_mount_crypt_stat(mount_crypt_stat); + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + token = match_token(p, tokens, args); + switch (token) { + case ecryptfs_opt_sig: + case ecryptfs_opt_ecryptfs_sig: + sig_src = args[0].from; + rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, + sig_src, 0); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global sig; rc = [%d]\n", rc); + goto out; + } + sig_set = 1; + break; + case ecryptfs_opt_cipher: + case ecryptfs_opt_ecryptfs_cipher: + cipher_name_src = args[0].from; + cipher_name_dst = + mount_crypt_stat-> + global_default_cipher_name; + strncpy(cipher_name_dst, cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + cipher_name_set = 1; + break; + case ecryptfs_opt_ecryptfs_key_bytes: + cipher_key_bytes_src = args[0].from; + cipher_key_bytes = + (int)simple_strtol(cipher_key_bytes_src, + &cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_cipher_key_size = + cipher_key_bytes; + cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_passthrough: + mount_crypt_stat->flags |= + ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; + break; + case ecryptfs_opt_xattr_metadata: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + break; + case ecryptfs_opt_encrypted_view: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + mount_crypt_stat->flags |= + ECRYPTFS_ENCRYPTED_VIEW_ENABLED; + break; + case ecryptfs_opt_fnek_sig: + fnek_src = args[0].from; + fnek_dst = + mount_crypt_stat->global_default_fnek_sig; + strncpy(fnek_dst, fnek_src, 
ECRYPTFS_SIG_SIZE_HEX); + mount_crypt_stat->global_default_fnek_sig[ + ECRYPTFS_SIG_SIZE_HEX] = '\0'; + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, + rc); + goto out; + } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case ecryptfs_opt_fn_cipher: + fn_cipher_name_src = args[0].from; + fn_cipher_name_dst = + mount_crypt_stat->global_default_fn_cipher_name; + strncpy(fn_cipher_name_dst, fn_cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + mount_crypt_stat->global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + fn_cipher_name_set = 1; + break; + case ecryptfs_opt_fn_cipher_key_bytes: + fn_cipher_key_bytes_src = args[0].from; + fn_cipher_key_bytes = + (int)simple_strtol(fn_cipher_key_bytes_src, + &fn_cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_fn_cipher_key_bytes = + fn_cipher_key_bytes; + fn_cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; + case ecryptfs_opt_mount_auth_tok_only: + mount_crypt_stat->flags |= + ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; + break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; + case ecryptfs_opt_err: + default: + printk(KERN_WARNING + "%s: eCryptfs: unrecognized option [%s]\n", + __func__, p); + } + } + if (!sig_set) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "You must supply at least one valid " + "auth tok signature as a mount " + "parameter; see the eCryptfs README\n"); + goto out; + } + if (!cipher_name_set) { + int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + + BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + strcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER); + } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_name_set) + strcpy(mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_cipher_name); + if (!cipher_key_bytes_set) + mount_crypt_stat->global_default_cipher_key_size = 0; + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, + NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !ecryptfs_tfm_exists( + mount_crypt_stat->global_default_fn_cipher_name, NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, 
+ mount_crypt_stat->global_default_fn_cipher_key_bytes, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + mutex_unlock(&key_tfm_list_mutex); + rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); + if (rc) + printk(KERN_WARNING "One or more global auth toks could not " + "properly register; rc = [%d]\n", rc); +out: + return rc; +} + +struct kmem_cache *ecryptfs_sb_info_cache; +static struct file_system_type ecryptfs_fs_type; + +/** + * ecryptfs_get_sb + * @fs_type + * @flags + * @dev_name: The path to mount over + * @raw_data: The options passed into the kernel + */ +static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct super_block *s; + struct ecryptfs_sb_info *sbi; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; + struct inode *inode; + struct path path; + uid_t check_ruid; + int rc; + + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + rc = -ENOMEM; + goto out; + } + + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); + if (rc) { + err = "Error parsing options"; + goto out; + } + + s = sget(fs_type, NULL, set_anon_super, NULL); + if (IS_ERR(s)) { + rc = PTR_ERR(s); + goto out; + } + + s->s_flags = flags; + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + if (rc) + goto out1; + + ecryptfs_set_superblock_private(s, sbi); + s->s_bdi = &sbi->bdi; + + /* ->kill_sb() will take care of sbi after that point */ + sbi = NULL; + s->s_op = &ecryptfs_sops; + s->s_d_op = &ecryptfs_dops; + + err = "Reading sb failed"; + rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + if (rc) { + ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); + goto out1; + } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } + + if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + path.dentry->d_inode->i_uid, current_uid()); + goto out_free; + } + + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; + + inode = ecryptfs_get_inode(path.dentry->d_inode, s); + rc = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free; + + s->s_root = d_alloc_root(inode); + if (!s->s_root) { + iput(inode); + rc = -ENOMEM; + goto out_free; + } + + rc = -ENOMEM; + root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!root_info) + goto out_free; + + /* ->kill_sb() will take care of root_info */ + ecryptfs_set_dentry_private(s->s_root, root_info); + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt); + + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); + +out_free: + path_put(&path); +out1: + deactivate_locked_super(s); +out: + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); + return ERR_PTR(rc); +} + +/** + * ecryptfs_kill_block_super + * @sb: The ecryptfs super block + * + * Used to bring the superblock down and free the private data. 
+ */ +static void ecryptfs_kill_block_super(struct super_block *sb) +{ + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + kill_anon_super(sb); + if (!sb_info) + return; + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); +} + +static struct file_system_type ecryptfs_fs_type = { + .owner = THIS_MODULE, + .name = "ecryptfs", + .mount = ecryptfs_mount, + .kill_sb = ecryptfs_kill_block_super, + .fs_flags = 0 +}; + +/** + * inode_info_init_once + * + * Initializes the ecryptfs_inode_info_cache when it is created + */ +static void +inode_info_init_once(void *vptr) +{ + struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; + + inode_init_once(&ei->vfs_inode); +} + +static struct ecryptfs_cache_info { + struct kmem_cache **cache; + const char *name; + size_t size; + void (*ctor)(void *obj); +} ecryptfs_cache_infos[] = { + { + .cache = &ecryptfs_auth_tok_list_item_cache, + .name = "ecryptfs_auth_tok_list_item", + .size = sizeof(struct ecryptfs_auth_tok_list_item), + }, + { + .cache = &ecryptfs_file_info_cache, + .name = "ecryptfs_file_cache", + .size = sizeof(struct ecryptfs_file_info), + }, + { + .cache = &ecryptfs_dentry_info_cache, + .name = "ecryptfs_dentry_info_cache", + .size = sizeof(struct ecryptfs_dentry_info), + }, + { + .cache = &ecryptfs_inode_info_cache, + .name = "ecryptfs_inode_cache", + .size = sizeof(struct ecryptfs_inode_info), + .ctor = inode_info_init_once, + }, + { + .cache = &ecryptfs_sb_info_cache, + .name = "ecryptfs_sb_cache", + .size = sizeof(struct ecryptfs_sb_info), + }, + { + .cache = &ecryptfs_header_cache, + .name = "ecryptfs_headers", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_xattr_cache, + .name = "ecryptfs_xattr_cache", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_key_record_cache, + .name = "ecryptfs_key_record_cache", + .size = sizeof(struct ecryptfs_key_record), + }, + { + .cache = &ecryptfs_key_sig_cache, + .name = "ecryptfs_key_sig_cache", + .size = sizeof(struct ecryptfs_key_sig), + }, + { + .cache = &ecryptfs_global_auth_tok_cache, + .name = "ecryptfs_global_auth_tok_cache", + .size = sizeof(struct ecryptfs_global_auth_tok), + }, + { + .cache = &ecryptfs_key_tfm_cache, + .name = "ecryptfs_key_tfm_cache", + .size = sizeof(struct ecryptfs_key_tfm), + }, + { + .cache = &ecryptfs_open_req_cache, + .name = "ecryptfs_open_req_cache", + .size = sizeof(struct ecryptfs_open_req), + }, +}; + +static void ecryptfs_free_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + if (*(info->cache)) + kmem_cache_destroy(*(info->cache)); + } +} + +/** + * ecryptfs_init_kmem_caches + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + *(info->cache) = kmem_cache_create(info->name, info->size, + 0, SLAB_HWCACHE_ALIGN, info->ctor); + if (!*(info->cache)) { + ecryptfs_free_kmem_caches(); + ecryptfs_printk(KERN_WARNING, "%s: " + "kmem_cache_create failed\n", + info->name); + return -ENOMEM; + } + } + return 0; +} + +static struct kobject *ecryptfs_kobj; + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); +} + 
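version_show() above, together with the version_attr and the "ecryptfs" kobject that do_sysfs_registration() below creates under fs_kobj, exposes ECRYPTFS_VERSIONING_MASK to userspace; the attribute should therefore appear as /sys/fs/ecryptfs/version. The following is only an illustrative userspace sketch, not something this patch adds: the path and the plain-decimal format are inferred from version_show() and kobject_create_and_add("ecryptfs", fs_kobj), and the meaning of the individual bits is defined by the versioning flags behind ECRYPTFS_VERSIONING_MASK in ecryptfs_kernel.h.

    /* Sketch: read the eCryptfs capability mask exposed via sysfs.
     * Assumes sysfs is mounted at /sys, so fs_kobj maps to /sys/fs. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/fs/ecryptfs/version", "r");
            int mask;

            if (!f) {
                    perror("fopen /sys/fs/ecryptfs/version");
                    return 1;
            }
            /* version_show() emits the mask as a plain decimal integer */
            if (fscanf(f, "%d", &mask) != 1) {
                    fclose(f);
                    return 1;
            }
            fclose(f);
            printf("eCryptfs versioning mask: 0x%08x\n", (unsigned int)mask);
            return 0;
    }
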
+static struct kobj_attribute version_attr = __ATTR_RO(version); + +static struct attribute *attributes[] = { + &version_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attributes, +}; + +static int do_sysfs_registration(void) +{ + int rc; + + ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); + if (!ecryptfs_kobj) { + printk(KERN_ERR "Unable to create ecryptfs kset\n"); + rc = -ENOMEM; + goto out; + } + rc = sysfs_create_group(ecryptfs_kobj, &attr_group); + if (rc) { + printk(KERN_ERR + "Unable to create ecryptfs version attributes\n"); + kobject_put(ecryptfs_kobj); + } +out: + return rc; +} + +static void do_sysfs_unregistration(void) +{ + sysfs_remove_group(ecryptfs_kobj, &attr_group); + kobject_put(ecryptfs_kobj); +} + +static int __init ecryptfs_init(void) +{ + int rc; + + if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " + "larger than the host's page size, and so " + "eCryptfs cannot run on this system. The " + "default eCryptfs extent size is [%u] bytes; " + "the page size is [%lu] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, + (unsigned long)PAGE_CACHE_SIZE); + goto out; + } + rc = ecryptfs_init_kmem_caches(); + if (rc) { + printk(KERN_ERR + "Failed to allocate one or more kmem_cache objects\n"); + goto out; + } + rc = register_filesystem(&ecryptfs_fs_type); + if (rc) { + printk(KERN_ERR "Failed to register filesystem\n"); + goto out_free_kmem_caches; + } + rc = do_sysfs_registration(); + if (rc) { + printk(KERN_ERR "sysfs registration failed\n"); + goto out_unregister_filesystem; + } + rc = ecryptfs_init_kthread(); + if (rc) { + printk(KERN_ERR "%s: kthread initialization failed; " + "rc = [%d]\n", __func__, rc); + goto out_do_sysfs_unregistration; + } + rc = ecryptfs_init_messaging(); + if (rc) { + printk(KERN_ERR "Failure occurred while attempting to " + "initialize the communications channel to " + "ecryptfsd\n"); + goto out_destroy_kthread; + } + rc = ecryptfs_init_crypto(); + if (rc) { + printk(KERN_ERR "Failure whilst attempting to init crypto; " + "rc = [%d]\n", rc); + goto out_release_messaging; + } + if (ecryptfs_verbosity > 0) + printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values " + "will be written to the syslog!\n", ecryptfs_verbosity); + + goto out; +out_release_messaging: + ecryptfs_release_messaging(); +out_destroy_kthread: + ecryptfs_destroy_kthread(); +out_do_sysfs_unregistration: + do_sysfs_unregistration(); +out_unregister_filesystem: + unregister_filesystem(&ecryptfs_fs_type); +out_free_kmem_caches: + ecryptfs_free_kmem_caches(); +out: + return rc; +} + +static void __exit ecryptfs_exit(void) +{ + int rc; + + rc = ecryptfs_destroy_crypto(); + if (rc) + printk(KERN_ERR "Failure whilst attempting to destroy crypto; " + "rc = [%d]\n", rc); + ecryptfs_release_messaging(); + ecryptfs_destroy_kthread(); + do_sysfs_unregistration(); + unregister_filesystem(&ecryptfs_fs_type); + ecryptfs_free_kmem_caches(); +} + +MODULE_AUTHOR("Michael A. Halcrow "); +MODULE_DESCRIPTION("eCryptfs"); + +MODULE_LICENSE("GPL"); + +module_init(ecryptfs_init) +module_exit(ecryptfs_exit) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c new file mode 100644 index 00000000..ab224809 --- /dev/null +++ b/fs/ecryptfs/messaging.c @@ -0,0 +1,578 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2004-2008 International Business Machines Corp. + * Author(s): Michael A. 
Halcrow + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static LIST_HEAD(ecryptfs_msg_ctx_free_list); +static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); +static struct mutex ecryptfs_msg_ctx_lists_mux; + +static struct hlist_head *ecryptfs_daemon_hash; +struct mutex ecryptfs_daemon_hash_mux; +static int ecryptfs_hash_bits; +#define ecryptfs_uid_hash(uid) \ + hash_long((unsigned long)uid, ecryptfs_hash_bits) + +static u32 ecryptfs_msg_counter; +static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; + +/** + * ecryptfs_acquire_free_msg_ctx + * @msg_ctx: The context that was acquired from the free list + * + * Acquires a context element from the free list and locks the mutex + * on the context. Sets the msg_ctx task to current. Returns zero on + * success; non-zero on error or upon failure to acquire a free + * context element. Must be called with ecryptfs_msg_ctx_lists_mux + * held. + */ +static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) +{ + struct list_head *p; + int rc; + + if (list_empty(&ecryptfs_msg_ctx_free_list)) { + printk(KERN_WARNING "%s: The eCryptfs free " + "context list is empty. It may be helpful to " + "specify the ecryptfs_message_buf_len " + "parameter to be greater than the current " + "value of [%d]\n", __func__, ecryptfs_message_buf_len); + rc = -ENOMEM; + goto out; + } + list_for_each(p, &ecryptfs_msg_ctx_free_list) { + *msg_ctx = list_entry(p, struct ecryptfs_msg_ctx, node); + if (mutex_trylock(&(*msg_ctx)->mux)) { + (*msg_ctx)->task = current; + rc = 0; + goto out; + } + } + rc = -ENOMEM; +out: + return rc; +} + +/** + * ecryptfs_msg_ctx_free_to_alloc + * @msg_ctx: The context to move from the free list to the alloc list + * + * Must be called with ecryptfs_msg_ctx_lists_mux held. + */ +static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&msg_ctx->node, &ecryptfs_msg_ctx_alloc_list); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_PENDING; + msg_ctx->counter = ++ecryptfs_msg_counter; +} + +/** + * ecryptfs_msg_ctx_alloc_to_free + * @msg_ctx: The context to move from the alloc list to the free list + * + * Must be called with ecryptfs_msg_ctx_lists_mux held. + */ +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); + if (msg_ctx->msg) + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; +} + +/** + * ecryptfs_find_daemon_by_euid + * @euid: The effective user id which maps to the desired daemon id + * @user_ns: The namespace in which @euid applies + * @daemon: If return value is zero, points to the desired daemon pointer + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Search the hash list for the given user id. 
+ * + * Returns zero if the user id exists in the list; non-zero otherwise. + */ +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns) +{ + struct hlist_node *elem; + int rc; + + hlist_for_each_entry(*daemon, elem, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)], + euid_chain) { + if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) { + rc = 0; + goto out; + } + } + rc = -EINVAL; +out: + return rc; +} + +/** + * ecryptfs_spawn_daemon - Create and initialize a new daemon struct + * @daemon: Pointer to set to newly allocated daemon struct + * @euid: Effective user id for the daemon + * @user_ns: The namespace in which @euid applies + * @pid: Process id for the daemon + * + * Must be called ceremoniously while in possession of + * ecryptfs_sacred_daemon_hash_mux + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid) +{ + int rc = 0; + + (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); + if (!(*daemon)) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " + "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); + goto out; + } + (*daemon)->euid = euid; + (*daemon)->user_ns = get_user_ns(user_ns); + (*daemon)->pid = get_pid(pid); + (*daemon)->task = current; + mutex_init(&(*daemon)->mux); + INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); + init_waitqueue_head(&(*daemon)->wait); + (*daemon)->num_queued_msg_ctx = 0; + hlist_add_head(&(*daemon)->euid_chain, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]); +out: + return rc; +} + +/** + * ecryptfs_exorcise_daemon - Destroy the daemon struct + * + * Must be called ceremoniously while in possession of + * ecryptfs_daemon_hash_mux and the daemon's own mux. + */ +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) +{ + struct ecryptfs_msg_ctx *msg_ctx, *msg_ctx_tmp; + int rc = 0; + + mutex_lock(&daemon->mux); + if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) + || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { + rc = -EBUSY; + printk(KERN_WARNING "%s: Attempt to destroy daemon with pid " + "[0x%p], but it is in the midst of a read or a poll\n", + __func__, daemon->pid); + mutex_unlock(&daemon->mux); + goto out; + } + list_for_each_entry_safe(msg_ctx, msg_ctx_tmp, + &daemon->msg_ctx_out_queue, daemon_out_list) { + list_del(&msg_ctx->daemon_out_list); + daemon->num_queued_msg_ctx--; + printk(KERN_WARNING "%s: Warning: dropping message that is in " + "the out queue of a dying daemon\n", __func__); + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + } + hlist_del(&daemon->euid_chain); + if (daemon->task) + wake_up_process(daemon->task); + if (daemon->pid) + put_pid(daemon->pid); + if (daemon->user_ns) + put_user_ns(daemon->user_ns); + mutex_unlock(&daemon->mux); + kzfree(daemon); +out: + return rc; +} + +/** + * ecryptfs_process_quit + * @euid: The user ID owner of the message + * @user_ns: The namespace in which @euid applies + * @pid: The process ID for the userspace program that sent the + * message + * + * Deletes the corresponding daemon for the given euid and pid, if + * it is the registered that is requesting the deletion. Returns zero + * after deleting the desired daemon; non-zero otherwise. 
+ */ +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) +{ + struct ecryptfs_daemon *daemon; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns); + if (rc || !daemon) { + rc = -EINVAL; + printk(KERN_ERR "Received request from user [%d] to " + "unregister unrecognized daemon [0x%p]\n", euid, pid); + goto out_unlock; + } + rc = ecryptfs_exorcise_daemon(daemon); +out_unlock: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_process_reponse + * @msg: The ecryptfs message received; the caller should sanity check + * msg->data_len and free the memory + * @pid: The process ID of the userspace application that sent the + * message + * @seq: The sequence number of the message; must match the sequence + * number for the existing message context waiting for this + * response + * + * Processes a response message after sending an operation request to + * userspace. Some other process is awaiting this response. Before + * sending out its first communications, the other process allocated a + * msg_ctx from the ecryptfs_msg_ctx_arr at a particular index. The + * response message contains this index so that we can copy over the + * response message into the msg_ctx that the process holds a + * reference to. The other process is going to wake up, check to see + * that msg_ctx->state == ECRYPTFS_MSG_CTX_STATE_DONE, and then + * proceed to read off and process the response message. Returns zero + * upon delivery to desired context element; non-zero upon delivery + * failure or error. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq) +{ + struct ecryptfs_daemon *uninitialized_var(daemon); + struct ecryptfs_msg_ctx *msg_ctx; + size_t msg_size; + struct nsproxy *nsproxy; + struct user_namespace *tsk_user_ns; + uid_t ctx_euid; + int rc; + + if (msg->index >= ecryptfs_message_buf_len) { + rc = -EINVAL; + printk(KERN_ERR "%s: Attempt to reference " + "context buffer at index [%d]; maximum " + "allowable is [%d]\n", __func__, msg->index, + (ecryptfs_message_buf_len - 1)); + goto out; + } + msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; + mutex_lock(&msg_ctx->mux); + mutex_lock(&ecryptfs_daemon_hash_mux); + rcu_read_lock(); + nsproxy = task_nsproxy(msg_ctx->task); + if (nsproxy == NULL) { + rc = -EBADMSG; + printk(KERN_ERR "%s: Receiving process is a zombie. 
Dropping " + "message.\n", __func__); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto wake_up; + } + tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns; + ctx_euid = task_euid(msg_ctx->task); + rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (rc) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: User [%d] received a " + "message response from process [0x%p] but does " + "not have a registered daemon\n", __func__, + ctx_euid, pid); + goto wake_up; + } + if (ctx_euid != euid) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: Received message from user " + "[%d]; expected message from user [%d]\n", __func__, + euid, ctx_euid); + goto unlock; + } + if (tsk_user_ns != user_ns) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: Received message from user_ns " + "[0x%p]; expected message from user_ns [0x%p]\n", + __func__, user_ns, tsk_user_ns); + goto unlock; + } + if (daemon->pid != pid) { + rc = -EBADMSG; + printk(KERN_ERR "%s: User [%d] sent a message response " + "from an unrecognized process [0x%p]\n", + __func__, ctx_euid, pid); + goto unlock; + } + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { + rc = -EINVAL; + printk(KERN_WARNING "%s: Desired context element is not " + "pending a response\n", __func__); + goto unlock; + } else if (msg_ctx->counter != seq) { + rc = -EINVAL; + printk(KERN_WARNING "%s: Invalid message sequence; " + "expected [%d]; received [%d]\n", __func__, + msg_ctx->counter, seq); + goto unlock; + } + msg_size = (sizeof(*msg) + msg->data_len); + msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " + "GFP_KERNEL memory\n", __func__, msg_size); + goto unlock; + } + memcpy(msg_ctx->msg, msg, msg_size); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; + rc = 0; +wake_up: + wake_up_process(msg_ctx->task); +unlock: + mutex_unlock(&msg_ctx->mux); +out: + return rc; +} + +/** + * ecryptfs_send_message_locked + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx) +{ + struct ecryptfs_daemon *daemon; + uid_t euid = current_euid(); + int rc; + + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + if (rc || !daemon) { + rc = -ENOTCONN; + printk(KERN_ERR "%s: User [%d] does not have a daemon " + "registered\n", __func__, euid); + goto out; + } + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_acquire_free_msg_ctx(msg_ctx); + if (rc) { + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + printk(KERN_WARNING "%s: Could not claim a free " + "context element\n", __func__); + goto out; + } + ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); + mutex_unlock(&(*msg_ctx)->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0, + daemon); + if (rc) + printk(KERN_ERR "%s: Error attempting to send message to " + "userspace daemon; rc = [%d]\n", __func__, rc); +out: + return rc; +} + +/** + * ecryptfs_send_message + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Grabs ecryptfs_daemon_hash_mux. 
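+ * Internally this is a locking wrapper around
+ * ecryptfs_send_message_locked(), which claims a free message context
+ * and hands the request to the registered daemon over the miscdev
+ * channel.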
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST, + msg_ctx); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_wait_for_response + * @msg_ctx: The context that was assigned when sending a message + * @msg: The incoming message from userspace; not set if rc != 0 + * + * Sleeps until awaken by ecryptfs_receive_message or until the amount + * of time exceeds ecryptfs_message_wait_timeout. If zero is + * returned, msg will point to a valid message from userspace; a + * non-zero value is returned upon failure to receive a message or an + * error occurs. Callee must free @msg on success. + */ +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **msg) +{ + signed long timeout = ecryptfs_message_wait_timeout * HZ; + int rc = 0; + +sleep: + timeout = schedule_timeout_interruptible(timeout); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_DONE) { + if (timeout) { + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + goto sleep; + } + rc = -ENOMSG; + } else { + *msg = msg_ctx->msg; + msg_ctx->msg = NULL; + } + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + return rc; +} + +int __init ecryptfs_init_messaging(void) +{ + int i; + int rc = 0; + + if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) { + ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS; + printk(KERN_WARNING "%s: Specified number of users is " + "too large, defaulting to [%d] users\n", __func__, + ecryptfs_number_of_users); + } + mutex_init(&ecryptfs_daemon_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; + ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); + if (!ecryptfs_daemon_hash) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto out; + } + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) + INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); + mutex_unlock(&ecryptfs_daemon_hash_mux); + ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) + * ecryptfs_message_buf_len), + GFP_KERNEL); + if (!ecryptfs_msg_ctx_arr) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + goto out; + } + mutex_init(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + ecryptfs_msg_counter = 0; + for (i = 0; i < ecryptfs_message_buf_len; i++) { + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node); + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].daemon_out_list); + mutex_init(&ecryptfs_msg_ctx_arr[i].mux); + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + ecryptfs_msg_ctx_arr[i].index = i; + ecryptfs_msg_ctx_arr[i].state = ECRYPTFS_MSG_CTX_STATE_FREE; + ecryptfs_msg_ctx_arr[i].counter = 0; + ecryptfs_msg_ctx_arr[i].task = NULL; + ecryptfs_msg_ctx_arr[i].msg = NULL; + list_add_tail(&ecryptfs_msg_ctx_arr[i].node, + &ecryptfs_msg_ctx_free_list); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(); +out: + 
return rc; +} + +void ecryptfs_release_messaging(void) +{ + if (ecryptfs_msg_ctx_arr) { + int i; + + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + for (i = 0; i < ecryptfs_message_buf_len; i++) { + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + if (ecryptfs_msg_ctx_arr[i].msg) + kfree(ecryptfs_msg_ctx_arr[i].msg); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + kfree(ecryptfs_msg_ctx_arr); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + } + if (ecryptfs_daemon_hash) { + struct hlist_node *elem; + struct ecryptfs_daemon *daemon; + int i; + + mutex_lock(&ecryptfs_daemon_hash_mux); + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { + int rc; + + hlist_for_each_entry(daemon, elem, + &ecryptfs_daemon_hash[i], + euid_chain) { + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) + printk(KERN_ERR "%s: Error whilst " + "attempting to destroy daemon; " + "rc = [%d]. Dazed and confused, " + "but trying to continue.\n", + __func__, rc); + } + } + kfree(ecryptfs_daemon_hash); + mutex_unlock(&ecryptfs_daemon_hash_mux); + } + ecryptfs_destroy_ecryptfs_miscdev(); + return; +} diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c new file mode 100644 index 00000000..0dc5a3d5 --- /dev/null +++ b/fs/ecryptfs/miscdev.c @@ -0,0 +1,547 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static atomic_t ecryptfs_num_miscdev_opens; + +/** + * ecryptfs_miscdev_poll + * @file: dev file (ignored) + * @pt: dev poll table (ignored) + * + * Returns the poll mask + */ +static unsigned int +ecryptfs_miscdev_poll(struct file *file, poll_table *pt) +{ + struct ecryptfs_daemon *daemon; + unsigned int mask = 0; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? 
*/ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + printk(KERN_WARNING "%s: Attempt to poll on zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) + goto out_unlock_daemon; + if (daemon->flags & ECRYPTFS_DAEMON_IN_POLL) + goto out_unlock_daemon; + daemon->flags |= ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + poll_wait(file, &daemon->wait, pt); + mutex_lock(&daemon->mux); + if (!list_empty(&daemon->msg_ctx_out_queue)) + mask |= POLLIN | POLLRDNORM; +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + return mask; +} + +/** + * ecryptfs_miscdev_open + * @inode: inode of miscdev handle (ignored) + * @file: file for miscdev handle (ignored) + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_open(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = try_module_get(THIS_MODULE); + if (rc == 0) { + rc = -EIO; + printk(KERN_ERR "%s: Error attempting to increment module use " + "count; rc = [%d]\n", __func__, rc); + goto out_unlock_daemon_list; + } + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + if (rc || !daemon) { + rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(), + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to spawn daemon; " + "rc = [%d]\n", __func__, rc); + goto out_module_put_unlock_daemon_list; + } + } + mutex_lock(&daemon->mux); + if (daemon->pid != task_pid(current)) { + rc = -EINVAL; + printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], " + "but pid [0x%p] has attempted to open the handle " + "instead\n", __func__, daemon->pid, daemon->euid, + task_pid(current)); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { + rc = -EBUSY; + printk(KERN_ERR "%s: Miscellaneous device handle may only be " + "opened once per daemon; pid [0x%p] already has this " + "handle open\n", __func__, daemon->pid); + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_inc(&ecryptfs_num_miscdev_opens); +out_unlock_daemon: + mutex_unlock(&daemon->mux); +out_module_put_unlock_daemon_list: + if (rc) + module_put(THIS_MODULE); +out_unlock_daemon_list: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_miscdev_release + * @inode: inode of fs/ecryptfs/euid handle (ignored) + * @file: file for fs/ecryptfs/euid handle (ignored) + * + * This keeps the daemon registered until the daemon sends another + * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. 
+ * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_release(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + BUG_ON(daemon->pid != task_pid(current)); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); + mutex_unlock(&daemon->mux); + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) { + printk(KERN_CRIT "%s: Fatal error whilst attempting to " + "shut down daemon; rc = [%d]. Please report this " + "bug.\n", __func__, rc); + BUG(); + } + module_put(THIS_MODULE); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_send_miscdev + * @data: Data to send to daemon; may be NULL + * @data_size: Amount of data to send to daemon + * @msg_ctx: Message context, which is used to handle the reply. If + * this is NULL, then we do not expect a reply. + * @msg_type: Type of message + * @msg_flags: Flags for message + * @daemon: eCryptfs daemon object + * + * Add msg_ctx to queue and then, if it exists, notify the blocked + * miscdevess about the data being available. Must be called with + * ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) +{ + int rc = 0; + + mutex_lock(&msg_ctx->mux); + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + msg_ctx->type = msg_type; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); + mutex_lock(&daemon->mux); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); +out_unlock: + mutex_unlock(&msg_ctx->mux); + return rc; +} + +/** + * ecryptfs_miscdev_read - format and send message from queue + * @file: fs/ecryptfs/euid miscdevfs handle (ignored) + * @buf: User buffer into which to copy the next message on the daemon queue + * @count: Amount of space available in @buf + * @ppos: Offset in file (ignored) + * + * Pulls the most recent message from the daemon queue, formats it for + * being sent via a miscdevfs handle, and copies it into @buf + * + * Returns the number of bytes copied into the user buffer + */ +static ssize_t +ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct ecryptfs_daemon *daemon; + struct ecryptfs_msg_ctx *msg_ctx; + size_t packet_length_size; + char packet_length[3]; + size_t i; + size_t total_length; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? 
*/ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); + printk(KERN_WARNING "%s: Attempt to read from zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { + rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto out_unlock_daemon; + } + /* This daemon will not go away so long as this flag is set */ + daemon->flags |= ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&ecryptfs_daemon_hash_mux); +check_list: + if (list_empty(&daemon->msg_ctx_out_queue)) { + mutex_unlock(&daemon->mux); + rc = wait_event_interruptible( + daemon->wait, !list_empty(&daemon->msg_ctx_out_queue)); + mutex_lock(&daemon->mux); + if (rc < 0) { + rc = 0; + goto out_unlock_daemon; + } + } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + goto out_unlock_daemon; + } + if (list_empty(&daemon->msg_ctx_out_queue)) { + /* Something else jumped in since the + * wait_event_interruptable() and removed the + * message from the queue; try again */ + goto check_list; + } + BUG_ON(euid != daemon->euid); + BUG_ON(current_user_ns() != daemon->user_ns); + BUG_ON(task_pid(current) != daemon->pid); + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->msg) { + rc = ecryptfs_write_packet_length(packet_length, + msg_ctx->msg_size, + &packet_length_size); + if (rc) { + rc = 0; + printk(KERN_WARNING "%s: Error writing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + } else { + packet_length_size = 0; + msg_ctx->msg_size = 0; + } + /* miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Octets 5-N1 not written if the packet type does not + * include a message */ + total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size); + if (count < total_length) { + rc = 0; + printk(KERN_WARNING "%s: Only given user buffer of " + "size [%zd], but we need [%zd] to read the " + "pending message\n", __func__, count, total_length); + goto out_unlock_msg_ctx; + } + rc = -EFAULT; + if (put_user(msg_ctx->type, buf)) + goto out_unlock_msg_ctx; + if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) + goto out_unlock_msg_ctx; + i = 5; + if (msg_ctx->msg) { + if (copy_to_user(&buf[i], packet_length, packet_length_size)) + goto out_unlock_msg_ctx; + i += packet_length_size; + if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size)) + goto out_unlock_msg_ctx; + i += msg_ctx->msg_size; + } + rc = i; + list_del(&msg_ctx->daemon_out_list); + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + /* We do not expect a reply from the userspace daemon for any + * message type other than ECRYPTFS_MSG_REQUEST */ + if (msg_ctx->type != ECRYPTFS_MSG_REQUEST) + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); +out_unlock_msg_ctx: + mutex_unlock(&msg_ctx->mux); +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&daemon->mux); + return rc; +} + +/** + * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon + * @data: Bytes comprising struct ecryptfs_message + * @data_size: sizeof(struct ecryptfs_message) + data len + * @euid: Effective user id of miscdevess sending 
the miscdev response + * @user_ns: The namespace in which @euid applies + * @pid: Miscdevess id of miscdevess sending the miscdev response + * @seq: Sequence number for miscdev response packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_response(char *data, size_t data_size, + uid_t euid, struct user_namespace *user_ns, + struct pid *pid, u32 seq) +{ + struct ecryptfs_message *msg = (struct ecryptfs_message *)data; + int rc; + + if ((sizeof(*msg) + msg->data_len) != data_size) { + printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " + "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, + (sizeof(*msg) + msg->data_len), data_size); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq); + if (rc) + printk(KERN_ERR + "Error processing response message; rc = [%d]\n", rc); +out: + return rc; +} + +/** + * ecryptfs_miscdev_write - handle write to daemon miscdev handle + * @file: File for misc dev handle (ignored) + * @buf: Buffer containing user data + * @count: Amount of data in @buf + * @ppos: Pointer to offset in file (ignored) + * + * miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter (0's for non-response) + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Returns the number of bytes read from @buf + */ +static ssize_t +ecryptfs_miscdev_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + __be32 counter_nbo; + u32 seq; + size_t packet_size, packet_size_length, i; + ssize_t sz = 0; + char *data; + uid_t euid = current_euid(); + unsigned char packet_size_peek[3]; + int rc; + + if (count == 0) { + goto out; + } else if (count == (1 + 4)) { + /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */ + goto memdup; + } else if (count < (1 + 4 + 1) + || count > (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4 + + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)) { + printk(KERN_WARNING "%s: Acceptable packet size range is " + "[%d-%lu], but amount of data written is [%zu].", + __func__, (1 + 4 + 1), + (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4 + + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES), count); + return -EINVAL; + } + + if (copy_from_user(packet_size_peek, (buf + 1 + 4), + sizeof(packet_size_peek))) { + printk(KERN_WARNING "%s: Error while inspecting packet size\n", + __func__); + return -EFAULT; + } + + rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + return rc; + } + + if ((1 + 4 + packet_size_length + packet_size) != count) { + printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__, + packet_size); + return -EINVAL; + } + +memdup: + data = memdup_user(buf, count); + if (IS_ERR(data)) { + printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", + __func__, PTR_ERR(data)); + goto out; + } + sz = count; + i = 0; + switch (data[i++]) { + case ECRYPTFS_MSG_RESPONSE: + if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { + printk(KERN_WARNING "%s: Minimum acceptable packet " + "size is [%zd], but amount of data written is " + "only [%zd]. 
Discarding response packet.\n", + __func__, + (1 + 4 + 1 + sizeof(struct ecryptfs_message)), + count); + goto out_free; + } + memcpy(&counter_nbo, &data[i], 4); + seq = be32_to_cpu(counter_nbo); + i += 4 + packet_size_length; + rc = ecryptfs_miscdev_response(&data[i], packet_size, + euid, current_user_ns(), + task_pid(current), seq); + if (rc) + printk(KERN_WARNING "%s: Failed to deliver miscdev " + "response to requesting operation; rc = [%d]\n", + __func__, rc); + break; + case ECRYPTFS_MSG_HELO: + case ECRYPTFS_MSG_QUIT: + break; + default: + ecryptfs_printk(KERN_WARNING, "Dropping miscdev " + "message of unrecognized type [%d]\n", + data[0]); + break; + } +out_free: + kfree(data); +out: + return sz; +} + + +static const struct file_operations ecryptfs_miscdev_fops = { + .open = ecryptfs_miscdev_open, + .poll = ecryptfs_miscdev_poll, + .read = ecryptfs_miscdev_read, + .write = ecryptfs_miscdev_write, + .release = ecryptfs_miscdev_release, + .llseek = noop_llseek, +}; + +static struct miscdevice ecryptfs_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ecryptfs", + .fops = &ecryptfs_miscdev_fops +}; + +/** + * ecryptfs_init_ecryptfs_miscdev + * + * Messages sent to the userspace daemon from the kernel are placed on + * a queue associated with the daemon. The next read against the + * miscdev handle by that daemon will return the oldest message placed + * on the message queue for the daemon. + * + * Returns zero on success; non-zero otherwise + */ +int __init ecryptfs_init_ecryptfs_miscdev(void) +{ + int rc; + + atomic_set(&ecryptfs_num_miscdev_opens, 0); + rc = misc_register(&ecryptfs_miscdev); + if (rc) + printk(KERN_ERR "%s: Failed to register miscellaneous device " + "for communications with userspace daemons; rc = [%d]\n", + __func__, rc); + return rc; +} + +/** + * ecryptfs_destroy_ecryptfs_miscdev + * + * All of the daemons must be exorcised prior to calling this + * function. + */ +void ecryptfs_destroy_ecryptfs_miscdev(void) +{ + BUG_ON(atomic_read(&ecryptfs_num_miscdev_opens) != 0); + misc_deregister(&ecryptfs_miscdev); +} diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c new file mode 100644 index 00000000..6a44148c --- /dev/null +++ b/fs/ecryptfs/mmap.c @@ -0,0 +1,566 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * This is where eCryptfs coordinates the symmetric encryption and + * decryption of the file data as it passes between the lower + * encrypted file and the upper decrypted file. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_get_locked_page + * + * Get one page from cache or lower f/s, return error otherwise. + * + * Returns locked and up-to-date page (if ok), with increased + * refcnt. + */ +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) +{ + struct page *page = read_mapping_page(inode->i_mapping, index, NULL); + if (!IS_ERR(page)) + lock_page(page); + return page; +} + +/** + * ecryptfs_writepage + * @page: Page that is locked before this call is made + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc; + + /* + * Refuse to write the page out if we are called from reclaim context + * since our writepage() path may potentially allocate memory when + * calling into the lower fs vfs_write() which may in turn invoke + * us again. + */ + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + rc = 0; + goto out; + } + + rc = ecryptfs_encrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting " + "page (upper index [0x%.16lx])\n", page->index); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); +out: + unlock_page(page); + return rc; +} + +static void strip_xattr_flag(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + size_t written; + + crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR; + ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat, + &written); + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } +} + +/** + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? + * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + */ + +/** + * ecryptfs_copy_up_encrypted_with_header + * @page: Sort of a ``virtual'' representation of the encrypted lower + * file. The actual lower file does not have the metadata in + * the header. This is locked. + * @crypt_stat: The eCryptfs inode's cryptographic context + * + * The ``view'' is the version of the file that userspace winds up + * seeing, with the header information inserted. 
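+ * + * A worked example (illustrative numbers only): assuming 4096-byte pages and 4096-byte extents with crypt_stat->metadata_size == 8192, view extents 0 and 1 are synthesized header extents, while view extent 2 maps to lower offset (2 * 4096) - 8192 == 0, i.e. the first encrypted data extent of the lower file.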
+ */ +static int +ecryptfs_copy_up_encrypted_with_header(struct page *page, + struct ecryptfs_crypt_stat *crypt_stat) +{ + loff_t extent_num_in_page = 0; + loff_t num_extents_per_page = (PAGE_CACHE_SIZE + / crypt_stat->extent_size); + int rc = 0; + + while (extent_num_in_page < num_extents_per_page) { + loff_t view_extent_num = ((((loff_t)page->index) + * num_extents_per_page) + + extent_num_in_page); + size_t num_header_extents_at_front = + (crypt_stat->metadata_size / crypt_stat->extent_size); + + if (view_extent_num < num_header_extents_at_front) { + /* This is a header extent */ + char *page_virt; + + page_virt = kmap_atomic(page, KM_USER0); + memset(page_virt, 0, PAGE_CACHE_SIZE); + /* TODO: Support more than one header extent */ + if (view_extent_num == 0) { + size_t written; + + rc = ecryptfs_read_xattr_region( + page_virt, page->mapping->host); + strip_xattr_flag(page_virt + 16, crypt_stat); + ecryptfs_write_header_metadata(page_virt + 20, + crypt_stat, + &written); + } + kunmap_atomic(page_virt, KM_USER0); + flush_dcache_page(page); + if (rc) { + printk(KERN_ERR "%s: Error reading xattr " + "region; rc = [%d]\n", __func__, rc); + goto out; + } + } else { + /* This is an encrypted data extent */ + loff_t lower_offset = + ((view_extent_num * crypt_stat->extent_size) + - crypt_stat->metadata_size); + + rc = ecryptfs_read_lower_page_segment( + page, (lower_offset >> PAGE_CACHE_SHIFT), + (lower_offset & ~PAGE_CACHE_MASK), + crypt_stat->extent_size, page->mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error attempting to read " + "extent at offset [%lld] in the lower " + "file; rc = [%d]\n", __func__, + lower_offset, rc); + goto out; + } + } + extent_num_in_page++; + } +out: + return rc; +} + +/** + * ecryptfs_readpage + * @file: An eCryptfs file + * @page: Page from eCryptfs inode mapping into which to stick the read data + * + * Read in a page, decrypting if necessary. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_readpage(struct file *file, struct page *page) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; + int rc = 0; + + if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_read_lower_page_segment(page, page->index, 0, + PAGE_CACHE_SIZE, + page->mapping->host); + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + rc = ecryptfs_copy_up_encrypted_with_header(page, + crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to copy " + "the encrypted content from the lower " + "file whilst inserting the metadata " + "from the xattr into the header; rc = " + "[%d]\n", __func__, rc); + goto out; + } + + } else { + rc = ecryptfs_read_lower_page_segment( + page, page->index, 0, PAGE_CACHE_SIZE, + page->mapping->host); + if (rc) { + printk(KERN_ERR "Error reading page; rc = " + "[%d]\n", rc); + goto out; + } + } + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting page; " + "rc = [%d]\n", rc); + goto out; + } + } +out: + if (rc) + ClearPageUptodate(page); + else + SetPageUptodate(page); + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", + page->index); + unlock_page(page); + return rc; +} + +/** + * Called with lower inode mutex held. 
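+ * + * For example (illustrative values, assuming PAGE_CACHE_SIZE == 4096): with i_size == 5000 and page->index == 1, end_byte_in_page is 5000 % 4096 == 904; a write ending at to == 600 zeroes bytes 904..4095 of the page, while to == 2000 zeroes bytes 2000..4095 instead.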
+ */ +static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +{ + struct inode *inode = page->mapping->host; + int end_byte_in_page; + + if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index) + goto out; + end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; + if (to > end_byte_in_page) + end_byte_in_page = to; + zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE); +out: + return 0; +} + +/** + * ecryptfs_write_begin + * @file: The eCryptfs file + * @mapping: The eCryptfs object + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused) + * + * This function must zero any hole we create + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + loff_t prev_page_end_size; + int rc = 0; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); + if (!PageUptodate(page)) { + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(mapping->host)->crypt_stat; + + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_read_lower_page_segment( + page, index, 0, PAGE_CACHE_SIZE, mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error attemping to read " + "lower page segment; rc = [%d]\n", + __func__, rc); + ClearPageUptodate(page); + goto out; + } else + SetPageUptodate(page); + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + rc = ecryptfs_copy_up_encrypted_with_header( + page, crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting " + "to copy the encrypted content " + "from the lower file whilst " + "inserting the metadata from " + "the xattr into the header; rc " + "= [%d]\n", __func__, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } else { + rc = ecryptfs_read_lower_page_segment( + page, index, 0, PAGE_CACHE_SIZE, + mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error reading " + "page; rc = [%d]\n", + __func__, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } + } else { + if (prev_page_end_size + >= i_size_read(page->mapping->host)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + printk(KERN_ERR "%s: Error decrypting " + "page at index [%ld]; " + "rc = [%d]\n", + __func__, page->index, rc); + ClearPageUptodate(page); + goto out; + } + } + SetPageUptodate(page); + } + } + /* If creating a page or more of holes, zero them out via truncate. + * Note, this will increase i_size. */ + if (index != 0) { + if (prev_page_end_size > i_size_read(page->mapping->host)) { + rc = ecryptfs_truncate(file->f_path.dentry, + prev_page_end_size); + if (rc) { + printk(KERN_ERR "%s: Error on attempt to " + "truncate to (higher) offset [%lld];" + " rc = [%d]\n", __func__, + prev_page_end_size, rc); + goto out; + } + } + } + /* Writing to a new page, and creating a small hole from start + * of page? Zero it out. 
*/ + if ((i_size_read(mapping->host) == prev_page_end_size) + && (pos != 0)) + zero_user(page, 0, PAGE_CACHE_SIZE); +out: + if (unlikely(rc)) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + } + return rc; +} + +/** + * ecryptfs_write_inode_size_to_header + * + * Writes the lower file size to the first 8 bytes of the header. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) +{ + char *file_size_virt; + int rc; + + file_size_virt = kmalloc(sizeof(u64), GFP_KERNEL); + if (!file_size_virt) { + rc = -ENOMEM; + goto out; + } + put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt); + rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, + sizeof(u64)); + kfree(file_size_virt); + if (rc < 0) + printk(KERN_ERR "%s: Error writing file size to header; " + "rc = [%d]\n", __func__, rc); + else + rc = 0; +out: + return rc; +} + +struct kmem_cache *ecryptfs_xattr_cache; + +static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) +{ + ssize_t size; + void *xattr_virt; + struct dentry *lower_dentry = + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; + struct inode *lower_inode = lower_dentry->d_inode; + int rc; + + if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { + printk(KERN_WARNING + "No support for setting xattr in lower filesystem\n"); + rc = -ENOSYS; + goto out; + } + xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); + if (!xattr_virt) { + printk(KERN_ERR "Out of memory whilst attempting to write " + "inode size to xattr\n"); + rc = -ENOMEM; + goto out; + } + mutex_lock(&lower_inode->i_mutex); + size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + xattr_virt, PAGE_CACHE_SIZE); + if (size < 0) + size = 8; + put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); + rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + xattr_virt, size, 0); + mutex_unlock(&lower_inode->i_mutex); + if (rc) + printk(KERN_ERR "Error whilst attempting to write inode size " + "to lower file xattr; rc = [%d]\n", rc); + kmem_cache_free(ecryptfs_xattr_cache, xattr_virt); +out: + return rc; +} + +int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); + else + return ecryptfs_write_inode_size_to_header(ecryptfs_inode); +} + +/** + * ecryptfs_write_end + * @file: The eCryptfs file object + * @mapping: The eCryptfs object + * @pos: The file position + * @len: The length of the data (unused) + * @copied: The amount of data copied + * @page: The eCryptfs page + * @fsdata: The fsdata (unused) + * + * This is where we encrypt the data and pass the encrypted data to + * the lower filesystem. In OpenPGP-compatible mode, we operate on + * entire underlying packets. 
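+ * + * In rough outline (a sketch of the flow, not a contract): zeros are filled to the end of the page where needed, the page is marked dirty so that a later ->writepage() encrypts it into the lower file, i_size is grown when the write extends the file, and the new size is pushed into the header or xattr metadata; on success the number of bytes accepted (@copied) is returned, matching the usual ->write_end convention.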
+ */ +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + copied; + struct inode *ecryptfs_inode = mapping->host; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + int rc; + int need_unlock_page = 1; + + ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, + to); + if (!rc) { + rc = copied; + fsstack_copy_inode_size(ecryptfs_inode, + ecryptfs_inode_to_lower(ecryptfs_inode)); + } + goto out; + } + /* Fills in zeros if 'to' goes beyond inode size */ + rc = fill_zeros_to_end_of_page(page, to); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to fill " + "zeros in page with index = [0x%.16lx]\n", index); + goto out; + } + set_page_dirty(page); + unlock_page(page); + need_unlock_page = 0; + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); + ecryptfs_printk(KERN_DEBUG, "Expanded file size to " + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); + balance_dirty_pages_ratelimited(mapping); + rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); + if (rc) { + printk(KERN_ERR "Error writing inode size to metadata; " + "rc = [%d]\n", rc); + goto out; + } + } + rc = copied; +out: + if (need_unlock_page) + unlock_page(page); + page_cache_release(page); + return rc; +} + +static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) +{ + int rc = 0; + struct inode *inode; + struct inode *lower_inode; + + inode = (struct inode *)mapping->host; + lower_inode = ecryptfs_inode_to_lower(inode); + if (lower_inode->i_mapping->a_ops->bmap) + rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping, + block); + return rc; +} + +const struct address_space_operations ecryptfs_aops = { + .writepage = ecryptfs_writepage, + .readpage = ecryptfs_readpage, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, + .bmap = ecryptfs_bmap, +}; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c new file mode 100644 index 00000000..608c1c3f --- /dev/null +++ b/fs/ecryptfs/read_write.c @@ -0,0 +1,357 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_write_lower + * @ecryptfs_inode: The eCryptfs inode + * @data: Data to write + * @offset: Byte offset in the lower file to which to write the data + * @size: Number of bytes from @data to write at @offset in the lower + * file + * + * Write data to the lower file. + * + * Returns bytes written on success; less than zero on error + */ +int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, + loff_t offset, size_t size) +{ + struct file *lower_file; + mm_segment_t fs_save; + ssize_t rc; + + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; + fs_save = get_fs(); + set_fs(get_ds()); + rc = vfs_write(lower_file, data, size, &offset); + set_fs(fs_save); + mark_inode_dirty_sync(ecryptfs_inode); + return rc; +} + +/** + * ecryptfs_write_lower_page_segment + * @ecryptfs_inode: The eCryptfs inode + * @page_for_lower: The page containing the data to be written to the + * lower file + * @offset_in_page: The offset in the @page_for_lower from which to + * start writing the data + * @size: The amount of data from @page_for_lower to write to the + * lower file + * + * Determines the byte offset in the file for the given page and + * offset within the page, maps the page, and makes the call to write + * the contents of @page_for_lower to the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + struct page *page_for_lower, + size_t offset_in_page, size_t size) +{ + char *virt; + loff_t offset; + int rc; + + offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT) + + offset_in_page); + virt = kmap(page_for_lower); + rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); + if (rc > 0) + rc = 0; + kunmap(page_for_lower); + return rc; +} + +/** + * ecryptfs_write + * @ecryptfs_inode: The eCryptfs file into which to write + * @data: Virtual address where data to write is located + * @offset: Offset in the eCryptfs file at which to begin writing the + * data from @data + * @size: The number of bytes to write from @data + * + * Write an arbitrary amount of data to an arbitrary location in the + * eCryptfs inode page cache. This is done on a page-by-page, and then + * by an extent-by-extent, basis; individual extents are encrypted and + * written to the lower page cache (via VFS writes). This function + * takes care of all the address translation to locations in the lower + * filesystem; it also handles truncate events, writing out zeros + * where necessary. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, + size_t size) +{ + struct page *ecryptfs_page; + struct ecryptfs_crypt_stat *crypt_stat; + char *ecryptfs_page_virt; + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); + loff_t data_offset = 0; + loff_t pos; + int rc = 0; + + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + /* + * if we are writing beyond current size, then start pos + * at the current size - we'll fill in zeros from there. 
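+ * For example (illustrative numbers): with i_size == 4000 and a request of offset == 10000, size == 100, pos starts at 4000, bytes 4000..9999 are zero-filled page by page, the 100 data bytes are then copied in at offset 10000, and i_size is finally updated to 10100.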
+ */ + if (offset > ecryptfs_file_size) + pos = ecryptfs_file_size; + else + pos = offset; + while (pos < (offset + size)) { + pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); + size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); + size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); + loff_t total_remaining_bytes = ((offset + size) - pos); + + if (fatal_signal_pending(current)) { + rc = -EINTR; + break; + } + + if (num_bytes > total_remaining_bytes) + num_bytes = total_remaining_bytes; + if (pos < offset) { + /* remaining zeros to write, up to destination offset */ + loff_t total_remaining_zeros = (offset - pos); + + if (num_bytes > total_remaining_zeros) + num_bytes = total_remaining_zeros; + } + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, + ecryptfs_page_idx); + if (IS_ERR(ecryptfs_page)) { + rc = PTR_ERR(ecryptfs_page); + printk(KERN_ERR "%s: Error getting page at " + "index [%ld] from eCryptfs inode " + "mapping; rc = [%d]\n", __func__, + ecryptfs_page_idx, rc); + goto out; + } + ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); + + /* + * pos: where we're now writing, offset: where the request was + * If current pos is before request, we are filling zeros + * If we are at or beyond request, we are writing the *data* + * If we're in a fresh page beyond eof, zero it in either case + */ + if (pos < offset || !start_offset_in_page) { + /* We are extending past the previous end of the file. + * Fill in zero values to the end of the page */ + memset(((char *)ecryptfs_page_virt + + start_offset_in_page), 0, + PAGE_CACHE_SIZE - start_offset_in_page); + } + + /* pos >= offset, we are now writing the data request */ + if (pos >= offset) { + memcpy(((char *)ecryptfs_page_virt + + start_offset_in_page), + (data + data_offset), num_bytes); + data_offset += num_bytes; + } + kunmap_atomic(ecryptfs_page_virt, KM_USER0); + flush_dcache_page(ecryptfs_page); + SetPageUptodate(ecryptfs_page); + unlock_page(ecryptfs_page); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) + rc = ecryptfs_encrypt_page(ecryptfs_page); + else + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + ecryptfs_page, + start_offset_in_page, + data_offset); + page_cache_release(ecryptfs_page); + if (rc) { + printk(KERN_ERR "%s: Error encrypting " + "page; rc = [%d]\n", __func__, rc); + goto out; + } + pos += num_bytes; + } + if (pos > ecryptfs_file_size) { + i_size_write(ecryptfs_inode, pos); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { + int rc2; + + rc2 = ecryptfs_write_inode_size_to_metadata( + ecryptfs_inode); + if (rc2) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc2); + if (!rc) + rc = rc2; + goto out; + } + } + } +out: + return rc; +} + +/** + * ecryptfs_read_lower + * @data: The read data is stored here by this function + * @offset: Byte offset in the lower file from which to read the data + * @size: Number of bytes to read from @offset of the lower file and + * store into @data + * @ecryptfs_inode: The eCryptfs inode + * + * Read @size bytes of data at byte offset @offset from the lower + * inode into memory location @data. 
+ * + * Returns bytes read on success; 0 on EOF; less than zero on error + */ +int ecryptfs_read_lower(char *data, loff_t offset, size_t size, + struct inode *ecryptfs_inode) +{ + struct file *lower_file; + mm_segment_t fs_save; + ssize_t rc; + + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; + fs_save = get_fs(); + set_fs(get_ds()); + rc = vfs_read(lower_file, data, size, &offset); + set_fs(fs_save); + return rc; +} + +/** + * ecryptfs_read_lower_page_segment + * @page_for_ecryptfs: The page into which data for eCryptfs will be + * written + * @offset_in_page: Offset in @page_for_ecryptfs from which to start + * writing + * @size: The number of bytes to write into @page_for_ecryptfs + * @ecryptfs_inode: The eCryptfs inode + * + * Determines the byte offset in the file for the given page and + * offset within the page, maps the page, and makes the call to read + * the contents of @page_for_ecryptfs from the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, + pgoff_t page_index, + size_t offset_in_page, size_t size, + struct inode *ecryptfs_inode) +{ + char *virt; + loff_t offset; + int rc; + + offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); + virt = kmap(page_for_ecryptfs); + rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); + if (rc > 0) + rc = 0; + kunmap(page_for_ecryptfs); + flush_dcache_page(page_for_ecryptfs); + return rc; +} + +#if 0 +/** + * ecryptfs_read + * @data: The virtual address into which to write the data read (and + * possibly decrypted) from the lower file + * @offset: The offset in the decrypted view of the file from which to + * read into @data + * @size: The number of bytes to read into @data + * @ecryptfs_file: The eCryptfs file from which to read + * + * Read an arbitrary amount of data from an arbitrary location in the + * eCryptfs page cache. This is done on an extent-by-extent basis; + * individual extents are decrypted and read from the lower page + * cache (via VFS reads). This function takes care of all the + * address translation to locations in the lower filesystem. 
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_read(char *data, loff_t offset, size_t size, + struct file *ecryptfs_file) +{ + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; + struct page *ecryptfs_page; + char *ecryptfs_page_virt; + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); + loff_t data_offset = 0; + loff_t pos; + int rc = 0; + + if ((offset + size) > ecryptfs_file_size) { + rc = -EINVAL; + printk(KERN_ERR "%s: Attempt to read data past the end of the " + "file; offset = [%lld]; size = [%td]; " + "ecryptfs_file_size = [%lld]\n", + __func__, offset, size, ecryptfs_file_size); + goto out; + } + pos = offset; + while (pos < (offset + size)) { + pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); + size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); + size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); + size_t total_remaining_bytes = ((offset + size) - pos); + + if (num_bytes > total_remaining_bytes) + num_bytes = total_remaining_bytes; + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, + ecryptfs_page_idx); + if (IS_ERR(ecryptfs_page)) { + rc = PTR_ERR(ecryptfs_page); + printk(KERN_ERR "%s: Error getting page at " + "index [%ld] from eCryptfs inode " + "mapping; rc = [%d]\n", __func__, + ecryptfs_page_idx, rc); + goto out; + } + ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); + memcpy((data + data_offset), + ((char *)ecryptfs_page_virt + start_offset_in_page), + num_bytes); + kunmap_atomic(ecryptfs_page_virt, KM_USER0); + flush_dcache_page(ecryptfs_page); + SetPageUptodate(ecryptfs_page); + unlock_page(ecryptfs_page); + page_cache_release(ecryptfs_page); + pos += num_bytes; + data_offset += num_bytes; + } +out: + return rc; +} +#endif /* 0 */ diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c new file mode 100644 index 00000000..dbd52d40 --- /dev/null +++ b/fs/ecryptfs/super.c @@ -0,0 +1,181 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_inode_info_cache; + +/** + * ecryptfs_alloc_inode - allocate an ecryptfs inode + * @sb: Pointer to the ecryptfs super block + * + * Called to bring an inode into existence. + * + * Only handle allocation, setting up structures should be done in + * ecryptfs_read_inode. This is because the kernel, between now and + * then, will 0 out the private data pointer. 
+ * + * Returns a pointer to a newly allocated inode, NULL otherwise + */ +static struct inode *ecryptfs_alloc_inode(struct super_block *sb) +{ + struct ecryptfs_inode_info *inode_info; + struct inode *inode = NULL; + + inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL); + if (unlikely(!inode_info)) + goto out; + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); + mutex_init(&inode_info->lower_file_mutex); + atomic_set(&inode_info->lower_file_count, 0); + inode_info->lower_file = NULL; + inode = &inode_info->vfs_inode; +out: + return inode; +} + +static void ecryptfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ecryptfs_inode_info *inode_info; + inode_info = ecryptfs_inode_to_private(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + +/** + * ecryptfs_destroy_inode + * @inode: The ecryptfs inode + * + * This is used during the final destruction of the inode. All + * allocation of memory related to the inode, including allocated + * memory in the crypt_stat struct, will be released here. + * There should be no chance that this deallocation will be missed. + */ +static void ecryptfs_destroy_inode(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + BUG_ON(inode_info->lower_file); + ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); + call_rcu(&inode->i_rcu, ecryptfs_i_callback); +} + +/** + * ecryptfs_statfs + * @dentry: The ecryptfs dentry + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. Currently, we let this pass right through + * to the lower filesystem and take no action ourselves. + */ +static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + if (!lower_dentry->d_sb->s_op->statfs) + return -ENOSYS; + return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); +} + +/** + * ecryptfs_evict_inode + * @inode: The ecryptfs inode + * + * Called by iput() when the inode reference count reached zero + * and the inode is not hashed anywhere. Used to clear anything + * that needs to be, before the inode is completely destroyed and put + * on the inode free list. We use this to drop our reference to the + * lower inode. + */ +static void ecryptfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + iput(ecryptfs_inode_to_lower(inode)); +} + +/** + * ecryptfs_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail.
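+ * + * The emitted options look, for example (hypothetical signature and defaults), like ",ecryptfs_sig=0123456789abcdef,ecryptfs_cipher=aes,ecryptfs_key_bytes=16", with the flag-style options (",ecryptfs_xattr_metadata" and friends) appended only when the corresponding mount flags are set.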
+ */ +static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + struct ecryptfs_global_auth_tok *walker; + + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK) + seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig); + else + seq_printf(m, ",ecryptfs_sig=%s", walker->sig); + } + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + + seq_printf(m, ",ecryptfs_cipher=%s", + mount_crypt_stat->global_default_cipher_name); + + if (mount_crypt_stat->global_default_cipher_key_size) + seq_printf(m, ",ecryptfs_key_bytes=%zd", + mount_crypt_stat->global_default_cipher_key_size); + if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) + seq_printf(m, ",ecryptfs_passthrough"); + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + seq_printf(m, ",ecryptfs_xattr_metadata"); + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + seq_printf(m, ",ecryptfs_encrypted_view"); + if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) + seq_printf(m, ",ecryptfs_unlink_sigs"); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + seq_printf(m, ",ecryptfs_mount_auth_tok_only"); + + return 0; +} + +const struct super_operations ecryptfs_sops = { + .alloc_inode = ecryptfs_alloc_inode, + .destroy_inode = ecryptfs_destroy_inode, + .drop_inode = generic_drop_inode, + .statfs = ecryptfs_statfs, + .remount_fs = NULL, + .evict_inode = ecryptfs_evict_inode, + .show_options = ecryptfs_show_options +}; diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig new file mode 100644 index 00000000..6ebfc1c2 --- /dev/null +++ b/fs/efs/Kconfig @@ -0,0 +1,14 @@ +config EFS_FS + tristate "EFS file system support (read only) (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + EFS is an older file system used for non-ISO9660 CD-ROMs and hard + disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer + uses the XFS file system for hard disk partitions however). + + This implementation only offers read-only access. If you don't know + what all this is about, it's safe to say N. For more information + about EFS see its home page at . + + To compile the EFS file system support as a module, choose M here: the + module will be called efs. diff --git a/fs/efs/Makefile b/fs/efs/Makefile new file mode 100644 index 00000000..963543d4 --- /dev/null +++ b/fs/efs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux efs-filesystem routines. 
+# + +obj-$(CONFIG_EFS_FS) += efs.o + +efs-objs := super.o inode.o namei.o dir.o file.o symlink.o diff --git a/fs/efs/dir.c b/fs/efs/dir.c new file mode 100644 index 00000000..7ee6f7e3 --- /dev/null +++ b/fs/efs/dir.c @@ -0,0 +1,110 @@ +/* + * dir.c + * + * Copyright (c) 1999 Al Smith + */ + +#include +#include "efs.h" + +static int efs_readdir(struct file *, void *, filldir_t); + +const struct file_operations efs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = efs_readdir, +}; + +const struct inode_operations efs_dir_inode_operations = { + .lookup = efs_lookup, +}; + +static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { + struct inode *inode = filp->f_path.dentry->d_inode; + struct buffer_head *bh; + + struct efs_dir *dirblock; + struct efs_dentry *dirslot; + efs_ino_t inodenum; + efs_block_t block; + int slot, namelen; + char *nameptr; + + if (inode->i_size & (EFS_DIRBSIZE-1)) + printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); + + /* work out where this entry can be found */ + block = filp->f_pos >> EFS_DIRBSIZE_BITS; + + /* each block contains at most 256 slots */ + slot = filp->f_pos & 0xff; + + /* look at all blocks */ + while (block < inode->i_blocks) { + /* read the dir block */ + bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); + + if (!bh) { + printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); + break; + } + + dirblock = (struct efs_dir *) bh->b_data; + + if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { + printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); + brelse(bh); + break; + } + + while (slot < dirblock->slots) { + if (dirblock->space[slot] == 0) { + slot++; + continue; + } + + dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); + + inodenum = be32_to_cpu(dirslot->inode); + namelen = dirslot->namelen; + nameptr = dirslot->name; + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); +#endif + if (namelen > 0) { + /* found the next entry */ + filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; + + /* copy filename and data in dirslot */ + filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); + + /* sanity check */ + if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { + printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + slot++; + continue; + } + + /* store position of next slot */ + if (++slot == dirblock->slots) { + slot = 0; + block++; + } + brelse(bh); + filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; + goto out; + } + slot++; + } + brelse(bh); + + slot = 0; + block++; + } + + filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; +out: + return 0; +} + diff --git a/fs/efs/efs.h b/fs/efs/efs.h new file mode 100644 index 00000000..d8305b58 --- /dev/null +++ b/fs/efs/efs.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. 
+ * Portions derived from IRIX header files (c) 1988 Silicon Graphics + */ +#ifndef _EFS_EFS_H_ +#define _EFS_EFS_H_ + +#include +#include + +#define EFS_VERSION "1.0a" + +static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith "; + + +/* 1 block is 512 bytes */ +#define EFS_BLOCKSIZE_BITS 9 +#define EFS_BLOCKSIZE (1 << EFS_BLOCKSIZE_BITS) + +typedef int32_t efs_block_t; +typedef uint32_t efs_ino_t; + +#define EFS_DIRECTEXTENTS 12 + +/* + * layout of an extent, in memory and on disk. 8 bytes exactly. + */ +typedef union extent_u { + unsigned char raw[8]; + struct extent_s { + unsigned int ex_magic:8; /* magic # (zero) */ + unsigned int ex_bn:24; /* basic block */ + unsigned int ex_length:8; /* numblocks in this extent */ + unsigned int ex_offset:24; /* logical offset into file */ + } cooked; +} efs_extent; + +typedef struct edevs { + __be16 odev; + __be32 ndev; +} efs_devs; + +/* + * extent based filesystem inode as it appears on disk. The efs inode + * is exactly 128 bytes long. + */ +struct efs_dinode { + __be16 di_mode; /* mode and type of file */ + __be16 di_nlink; /* number of links to file */ + __be16 di_uid; /* owner's user id */ + __be16 di_gid; /* owner's group id */ + __be32 di_size; /* number of bytes in file */ + __be32 di_atime; /* time last accessed */ + __be32 di_mtime; /* time last modified */ + __be32 di_ctime; /* time created */ + __be32 di_gen; /* generation number */ + __be16 di_numextents; /* # of extents */ + u_char di_version; /* version of inode */ + u_char di_spare; /* spare - used by AFS */ + union di_addr { + efs_extent di_extents[EFS_DIRECTEXTENTS]; + efs_devs di_dev; /* device for IFCHR/IFBLK */ + } di_u; +}; + +/* efs inode storage in memory */ +struct efs_inode_info { + int numextents; + int lastextent; + + efs_extent extents[EFS_DIRECTEXTENTS]; + struct inode vfs_inode; +}; + +#include + +#define EFS_DIRBSIZE_BITS EFS_BLOCKSIZE_BITS +#define EFS_DIRBSIZE (1 << EFS_DIRBSIZE_BITS) + +struct efs_dentry { + __be32 inode; + unsigned char namelen; + char name[3]; +}; + +#define EFS_DENTSIZE (sizeof(struct efs_dentry) - 3 + 1) +#define EFS_MAXNAMELEN ((1 << (sizeof(char) * 8)) - 1) + +#define EFS_DIRBLK_HEADERSIZE 4 +#define EFS_DIRBLK_MAGIC 0xbeef /* moo */ + +struct efs_dir { + __be16 magic; + unsigned char firstused; + unsigned char slots; + + unsigned char space[EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE]; +}; + +#define EFS_MAXENTS \ + ((EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE) / \ + (EFS_DENTSIZE + sizeof(char))) + +#define EFS_SLOTAT(dir, slot) EFS_REALOFF((dir)->space[slot]) + +#define EFS_REALOFF(offset) ((offset << 1)) + + +static inline struct efs_inode_info *INODE_INFO(struct inode *inode) +{ + return container_of(inode, struct efs_inode_info, vfs_inode); +} + +static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb) +{ + return sb->s_fs_info; +} + +struct statfs; +struct fid; + +extern const struct inode_operations efs_dir_inode_operations; +extern const struct file_operations efs_dir_operations; +extern const struct address_space_operations efs_symlink_aops; + +extern struct inode *efs_iget(struct super_block *, unsigned long); +extern efs_block_t efs_map_block(struct inode *, efs_block_t); +extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int); + +extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid 
*fid, + int fh_len, int fh_type); +extern struct dentry *efs_get_parent(struct dentry *); +extern int efs_bmap(struct inode *, int); + +#endif /* _EFS_EFS_H_ */ diff --git a/fs/efs/file.c b/fs/efs/file.c new file mode 100644 index 00000000..1ccb364f --- /dev/null +++ b/fs/efs/file.c @@ -0,0 +1,60 @@ +/* + * file.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include "efs.h" + +int efs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int error = -EROFS; + long phys; + + if (create) + return error; + if (iblock >= inode->i_blocks) { +#ifdef DEBUG + /* + * i have no idea why this happens as often as it does + */ + printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", + block, + inode->i_blocks, + inode->i_size); +#endif + return 0; + } + phys = efs_map_block(inode, iblock); + if (phys) + map_bh(bh_result, inode->i_sb, phys); + return 0; +} + +int efs_bmap(struct inode *inode, efs_block_t block) { + + if (block < 0) { + printk(KERN_WARNING "EFS: bmap(): block < 0\n"); + return 0; + } + + /* are we about to read past the end of a file ? */ + if (!(block < inode->i_blocks)) { +#ifdef DEBUG + /* + * i have no idea why this happens as often as it does + */ + printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", + block, + inode->i_blocks, + inode->i_size); +#endif + return 0; + } + + return efs_map_block(inode, block); +} diff --git a/fs/efs/inode.c b/fs/efs/inode.c new file mode 100644 index 00000000..9c13412e --- /dev/null +++ b/fs/efs/inode.c @@ -0,0 +1,313 @@ +/* + * inode.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang, + * and from work (c) 1998 Mike Shaver. + */ + +#include +#include +#include +#include "efs.h" +#include + +static int efs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,efs_get_block); +} +static sector_t _efs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,efs_get_block); +} +static const struct address_space_operations efs_aops = { + .readpage = efs_readpage, + .bmap = _efs_bmap +}; + +static inline void extent_copy(efs_extent *src, efs_extent *dst) { + /* + * this is slightly evil. it doesn't just copy + * efs_extent from src to dst, it also mangles + * the bits so that dst ends up in cpu byte-order. 
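+ * + * the raw on-disk layout being unpacked is (see efs_extent above): byte 0 = ex_magic, bytes 1-3 = ex_bn (24 bits, most significant byte first), byte 4 = ex_length, bytes 5-7 = ex_offset (24 bits, most significant byte first). for example, raw[] = { 0x00, 0x00, 0x01, 0x02, 0x08, 0x00, 0x00, 0x10 } decodes to ex_magic 0, ex_bn 0x102, ex_length 8 and ex_offset 0x10.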
+ */ + + dst->cooked.ex_magic = (unsigned int) src->raw[0]; + dst->cooked.ex_bn = ((unsigned int) src->raw[1] << 16) | + ((unsigned int) src->raw[2] << 8) | + ((unsigned int) src->raw[3] << 0); + dst->cooked.ex_length = (unsigned int) src->raw[4]; + dst->cooked.ex_offset = ((unsigned int) src->raw[5] << 16) | + ((unsigned int) src->raw[6] << 8) | + ((unsigned int) src->raw[7] << 0); + return; +} + +struct inode *efs_iget(struct super_block *super, unsigned long ino) +{ + int i, inode_index; + dev_t device; + u32 rdev; + struct buffer_head *bh; + struct efs_sb_info *sb = SUPER_INFO(super); + struct efs_inode_info *in; + efs_block_t block, offset; + struct efs_dinode *efs_inode; + struct inode *inode; + + inode = iget_locked(super, ino); + if (IS_ERR(inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + in = INODE_INFO(inode); + + /* + ** EFS layout: + ** + ** | cylinder group | cylinder group | cylinder group ..etc + ** |inodes|data |inodes|data |inodes|data ..etc + ** + ** work out the inode block index, (considering initially that the + ** inodes are stored as consecutive blocks). then work out the block + ** number of that inode given the above layout, and finally the + ** offset of the inode within that block. + */ + + inode_index = inode->i_ino / + (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); + + block = sb->fs_start + sb->first_block + + (sb->group_size * (inode_index / sb->inode_blocks)) + + (inode_index % sb->inode_blocks); + + offset = (inode->i_ino % + (EFS_BLOCKSIZE / sizeof(struct efs_dinode))) * + sizeof(struct efs_dinode); + + bh = sb_bread(inode->i_sb, block); + if (!bh) { + printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); + goto read_inode_error; + } + + efs_inode = (struct efs_dinode *) (bh->b_data + offset); + + inode->i_mode = be16_to_cpu(efs_inode->di_mode); + inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); + inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); + inode->i_size = be32_to_cpu(efs_inode->di_size); + inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); + inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); + inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + + /* this is the number of blocks in the file */ + if (inode->i_size == 0) { + inode->i_blocks = 0; + } else { + inode->i_blocks = ((inode->i_size - 1) >> EFS_BLOCKSIZE_BITS) + 1; + } + + rdev = be16_to_cpu(efs_inode->di_u.di_dev.odev); + if (rdev == 0xffff) { + rdev = be32_to_cpu(efs_inode->di_u.di_dev.ndev); + if (sysv_major(rdev) > 0xfff) + device = 0; + else + device = MKDEV(sysv_major(rdev), sysv_minor(rdev)); + } else + device = old_decode_dev(rdev); + + /* get the number of extents for this object */ + in->numextents = be16_to_cpu(efs_inode->di_numextents); + in->lastextent = 0; + + /* copy the extents contained within the inode to memory */ + for(i = 0; i < EFS_DIRECTEXTENTS; i++) { + extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); + if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { + printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); + brelse(bh); + goto read_inode_error; + } + } + + brelse(bh); + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n", + inode->i_ino, in->numextents, inode->i_mode); +#endif + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = 
&efs_dir_inode_operations; + inode->i_fop = &efs_dir_operations; + break; + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &efs_aops; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &efs_symlink_aops; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + init_special_inode(inode, inode->i_mode, device); + break; + default: + printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); + goto read_inode_error; + break; + } + + unlock_new_inode(inode); + return inode; + +read_inode_error: + printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static inline efs_block_t +efs_extent_check(efs_extent *ptr, efs_block_t block, struct efs_sb_info *sb) { + efs_block_t start; + efs_block_t length; + efs_block_t offset; + + /* + * given an extent and a logical block within a file, + * can this block be found within this extent ? + */ + start = ptr->cooked.ex_bn; + length = ptr->cooked.ex_length; + offset = ptr->cooked.ex_offset; + + if ((block >= offset) && (block < offset+length)) { + return(sb->fs_start + start + block - offset); + } else { + return 0; + } +} + +efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { + struct efs_sb_info *sb = SUPER_INFO(inode->i_sb); + struct efs_inode_info *in = INODE_INFO(inode); + struct buffer_head *bh = NULL; + + int cur, last, first = 1; + int ibase, ioffset, dirext, direxts, indext, indexts; + efs_block_t iblock, result = 0, lastblock = 0; + efs_extent ext, *exts; + + last = in->lastextent; + + if (in->numextents <= EFS_DIRECTEXTENTS) { + /* first check the last extent we returned */ + if ((result = efs_extent_check(&in->extents[last], block, sb))) + return result; + + /* if we only have one extent then nothing can be found */ + if (in->numextents == 1) { + printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); + return 0; + } + + direxts = in->numextents; + + /* + * check the stored extents in the inode + * start with next extent and check forwards + */ + for(dirext = 1; dirext < direxts; dirext++) { + cur = (last + dirext) % in->numextents; + if ((result = efs_extent_check(&in->extents[cur], block, sb))) { + in->lastextent = cur; + return result; + } + } + + printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); + return 0; + } + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); +#endif + direxts = in->extents[0].cooked.ex_offset; + indexts = in->numextents; + + for(indext = 0; indext < indexts; indext++) { + cur = (last + indext) % indexts; + + /* + * work out which direct extent contains `cur'. + * + * also compute ibase: i.e. the number of the first + * indirect extent contained within direct extent `cur'. 
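+ * + * for example (using the sizes from efs.h: EFS_BLOCKSIZE == 512 and 8-byte extent records, so each block holds 64 of them): a direct extent with ex_length == 2 covers indirect extent records 0..127, so for an indirect extent `cur' of 130 the matching direct extent is the next one and its ibase is 128.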
+ * + */ + ibase = 0; + for(dirext = 0; cur < ibase && dirext < direxts; dirext++) { + ibase += in->extents[dirext].cooked.ex_length * + (EFS_BLOCKSIZE / sizeof(efs_extent)); + } + + if (dirext == direxts) { + /* should never happen */ + printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); + if (bh) brelse(bh); + return 0; + } + + /* work out block number and offset of this indirect extent */ + iblock = sb->fs_start + in->extents[dirext].cooked.ex_bn + + (cur - ibase) / + (EFS_BLOCKSIZE / sizeof(efs_extent)); + ioffset = (cur - ibase) % + (EFS_BLOCKSIZE / sizeof(efs_extent)); + + if (first || lastblock != iblock) { + if (bh) brelse(bh); + + bh = sb_bread(inode->i_sb, iblock); + if (!bh) { + printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); + return 0; + } +#ifdef DEBUG + printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); +#endif + first = 0; + lastblock = iblock; + } + + exts = (efs_extent *) bh->b_data; + + extent_copy(&(exts[ioffset]), &ext); + + if (ext.cooked.ex_magic != 0) { + printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); + if (bh) brelse(bh); + return 0; + } + + if ((result = efs_extent_check(&ext, block, sb))) { + if (bh) brelse(bh); + in->lastextent = cur; + return result; + } + } + if (bh) brelse(bh); + printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); + return 0; +} + +MODULE_LICENSE("GPL"); diff --git a/fs/efs/namei.c b/fs/efs/namei.c new file mode 100644 index 00000000..1511bf9e --- /dev/null +++ b/fs/efs/namei.c @@ -0,0 +1,118 @@ +/* + * namei.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include "efs.h" + + +static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { + struct buffer_head *bh; + + int slot, namelen; + char *nameptr; + struct efs_dir *dirblock; + struct efs_dentry *dirslot; + efs_ino_t inodenum; + efs_block_t block; + + if (inode->i_size & (EFS_DIRBSIZE-1)) + printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); + + for(block = 0; block < inode->i_blocks; block++) { + + bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); + if (!bh) { + printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); + return 0; + } + + dirblock = (struct efs_dir *) bh->b_data; + + if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { + printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); + brelse(bh); + return(0); + } + + for(slot = 0; slot < dirblock->slots; slot++) { + dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); + + namelen = dirslot->namelen; + nameptr = dirslot->name; + + if ((namelen == len) && (!memcmp(name, nameptr, len))) { + inodenum = be32_to_cpu(dirslot->inode); + brelse(bh); + return(inodenum); + } + } + brelse(bh); + } + return(0); +} + +struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { + efs_ino_t inodenum; + struct inode * inode = NULL; + + inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (inodenum) { + inode = efs_iget(dir->i_sb, inodenum); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); +} + +static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, + u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = 
efs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + efs_nfs_get_inode); +} + +struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + efs_nfs_get_inode); +} + +struct dentry *efs_get_parent(struct dentry *child) +{ + struct dentry *parent = ERR_PTR(-ENOENT); + efs_ino_t ino; + + ino = efs_find_entry(child->d_inode, "..", 2); + if (ino) + parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); + + return parent; +} diff --git a/fs/efs/super.c b/fs/efs/super.c new file mode 100644 index 00000000..0f31acb0 --- /dev/null +++ b/fs/efs/super.c @@ -0,0 +1,359 @@ +/* + * super.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include +#include +#include + +#include "efs.h" +#include +#include + +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int efs_fill_super(struct super_block *s, void *d, int silent); + +static struct dentry *efs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); +} + +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .mount = efs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct pt_types sgi_pt_types[] = { + {0x00, "SGI vh"}, + {0x01, "SGI trkrepl"}, + {0x02, "SGI secrepl"}, + {0x03, "SGI raw"}, + {0x04, "SGI bsd"}, + {SGI_SYSV, "SGI sysv"}, + {0x06, "SGI vol"}, + {SGI_EFS, "SGI efs"}, + {0x08, "SGI lv"}, + {0x09, "SGI rlv"}, + {0x0A, "SGI xfs"}, + {0x0B, "SGI xfslog"}, + {0x0C, "SGI xlv"}, + {0x82, "Linux swap"}, + {0x83, "Linux native"}, + {0, NULL} +}; + + +static struct kmem_cache * efs_inode_cachep; + +static struct inode *efs_alloc_inode(struct super_block *sb) +{ + struct efs_inode_info *ei; + ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void efs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); +} + +static void efs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, efs_i_callback); +} + +static void init_once(void *foo) +{ + struct efs_inode_info *ei = (struct efs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + efs_inode_cachep = kmem_cache_create("efs_inode_cache", + sizeof(struct efs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (efs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(efs_inode_cachep); +} + +static void efs_put_super(struct super_block *s) +{ + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +static int efs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations efs_superblock_operations = { + .alloc_inode = efs_alloc_inode, + .destroy_inode = 
efs_destroy_inode, + .put_super = efs_put_super, + .statfs = efs_statfs, + .remount_fs = efs_remount, +}; + +static const struct export_operations efs_export_ops = { + .fh_to_dentry = efs_fh_to_dentry, + .fh_to_parent = efs_fh_to_parent, + .get_parent = efs_get_parent, +}; + +static int __init init_efs_fs(void) { + int err; + printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&efs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_efs_fs(void) { + unregister_filesystem(&efs_fs_type); + destroy_inodecache(); +} + +module_init(init_efs_fs) +module_exit(exit_efs_fs) + +static efs_block_t efs_validate_vh(struct volume_header *vh) { + int i; + __be32 cs, *ui; + int csum; + efs_block_t sblock = 0; /* shuts up gcc */ + struct pt_types *pt_entry; + int pt_type, slice = -1; + + if (be32_to_cpu(vh->vh_magic) != VHMAGIC) { + /* + * assume that we're dealing with a partition and allow + * read_super() to try and detect a valid superblock + * on the next block. + */ + return 0; + } + + ui = ((__be32 *) (vh + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) vh);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if (csum) { + printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); + return 0; + } + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); + + for(i = 0; i < NVDIR; i++) { + int j; + char name[VDNAMESIZE+1]; + + for(j = 0; j < VDNAMESIZE; j++) { + name[j] = vh->vh_vd[i].vd_name[j]; + } + name[j] = (char) 0; + + if (name[0]) { + printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", + name, + (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), + (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); + } + } +#endif + + for(i = 0; i < NPARTAB; i++) { + pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type); + for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) { + if (pt_type == pt_entry->pt_type) break; + } +#ifdef DEBUG + if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { + printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", + i, + (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), + (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), + pt_type, + (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); + } +#endif + if (IS_EFS(pt_type)) { + sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn); + slice = i; + } + } + + if (slice == -1) { + printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n"); +#ifdef DEBUG + } else { + printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", + slice, + (pt_entry->pt_name) ? 
pt_entry->pt_name : "unknown", + sblock); +#endif + } + return sblock; +} + +static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { + + if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) + return -1; + + sb->fs_magic = be32_to_cpu(super->fs_magic); + sb->total_blocks = be32_to_cpu(super->fs_size); + sb->first_block = be32_to_cpu(super->fs_firstcg); + sb->group_size = be32_to_cpu(super->fs_cgfsize); + sb->data_free = be32_to_cpu(super->fs_tfree); + sb->inode_free = be32_to_cpu(super->fs_tinode); + sb->inode_blocks = be16_to_cpu(super->fs_cgisize); + sb->total_groups = be16_to_cpu(super->fs_ncg); + + return 0; +} + +static int efs_fill_super(struct super_block *s, void *d, int silent) +{ + struct efs_sb_info *sb; + struct buffer_head *bh; + struct inode *root; + int ret = -EINVAL; + + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + if (!sb) + return -ENOMEM; + s->s_fs_info = sb; + + s->s_magic = EFS_SUPER_MAGIC; + if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { + printk(KERN_ERR "EFS: device does not support %d byte blocks\n", + EFS_BLOCKSIZE); + goto out_no_fs_ul; + } + + /* read the vh (volume header) block */ + bh = sb_bread(s, 0); + + if (!bh) { + printk(KERN_ERR "EFS: cannot read volume header\n"); + goto out_no_fs_ul; + } + + /* + * if this returns zero then we didn't find any partition table. + * this isn't (yet) an error - just assume for the moment that + * the device is valid and go on to search for a superblock. + */ + sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data); + brelse(bh); + + if (sb->fs_start == -1) { + goto out_no_fs_ul; + } + + bh = sb_bread(s, sb->fs_start + EFS_SUPER); + if (!bh) { + printk(KERN_ERR "EFS: cannot read superblock\n"); + goto out_no_fs_ul; + } + + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { +#ifdef DEBUG + printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); +#endif + brelse(bh); + goto out_no_fs_ul; + } + brelse(bh); + + if (!(s->s_flags & MS_RDONLY)) { +#ifdef DEBUG + printk(KERN_INFO "EFS: forcing read-only mode\n"); +#endif + s->s_flags |= MS_RDONLY; + } + s->s_op = &efs_superblock_operations; + s->s_export_op = &efs_export_ops; + root = efs_iget(s, EFS_ROOTINODE); + if (IS_ERR(root)) { + printk(KERN_ERR "EFS: get root inode failed\n"); + ret = PTR_ERR(root); + goto out_no_fs; + } + + s->s_root = d_alloc_root(root); + if (!(s->s_root)) { + printk(KERN_ERR "EFS: get root dentry failed\n"); + iput(root); + ret = -ENOMEM; + goto out_no_fs; + } + + return 0; + +out_no_fs_ul: +out_no_fs: + s->s_fs_info = NULL; + kfree(sb); + return ret; +} + +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + struct efs_sb_info *sbi = SUPER_INFO(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ + buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ + buf->f_blocks = sbi->total_groups * /* total data blocks */ + (sbi->group_size - sbi->inode_blocks); + buf->f_bfree = sbi->data_free; /* free data blocks */ + buf->f_bavail = sbi->data_free; /* free blocks for non-root */ + buf->f_files = sbi->total_groups * /* total inodes */ + sbi->inode_blocks * + (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); + buf->f_ffree = sbi->inode_free; /* free inodes */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ + + return 0; +} + diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c new file mode 100644 
index 00000000..75117d0d --- /dev/null +++ b/fs/efs/symlink.c @@ -0,0 +1,54 @@ +/* + * symlink.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include "efs.h" + +static int efs_symlink_readpage(struct file *file, struct page *page) +{ + char *link = kmap(page); + struct buffer_head * bh; + struct inode * inode = page->mapping->host; + efs_block_t size = inode->i_size; + int err; + + err = -ENAMETOOLONG; + if (size > 2 * EFS_BLOCKSIZE) + goto fail; + + /* read first 512 bytes of link target */ + err = -EIO; + bh = sb_bread(inode->i_sb, efs_bmap(inode, 0)); + if (!bh) + goto fail; + memcpy(link, bh->b_data, (size > EFS_BLOCKSIZE) ? EFS_BLOCKSIZE : size); + brelse(bh); + if (size > EFS_BLOCKSIZE) { + bh = sb_bread(inode->i_sb, efs_bmap(inode, 1)); + if (!bh) + goto fail; + memcpy(link + EFS_BLOCKSIZE, bh->b_data, size - EFS_BLOCKSIZE); + brelse(bh); + } + link[size] = '\0'; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations efs_symlink_aops = { + .readpage = efs_symlink_readpage +}; diff --git a/fs/eventfd.c b/fs/eventfd.c new file mode 100644 index 00000000..d9a59177 --- /dev/null +++ b/fs/eventfd.c @@ -0,0 +1,439 @@ +/* + * fs/eventfd.c + * + * Copyright (C) 2007 Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct eventfd_ctx { + struct kref kref; + wait_queue_head_t wqh; + /* + * Every time that a write(2) is performed on an eventfd, the + * value of the __u64 being written is added to "count" and a + * wakeup is performed on "wqh". A read(2) will return the "count" + * value to userspace, and will reset "count" to zero. The kernel + * side eventfd_signal() also, adds to the "count" counter and + * issue a wakeup. + */ + __u64 count; + unsigned int flags; +}; + +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returining a POLLERR + * to poll(2). + * + * Returns @n in case of success, a non-negative number lower than @n in case + * of overflow, or the following error codes: + * + * -EINVAL : The value of @n is negative. + */ +int eventfd_signal(struct eventfd_ctx *ctx, int n) +{ + unsigned long flags; + + if (n < 0) + return -EINVAL; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ULLONG_MAX - ctx->count < n) + n = (int) (ULLONG_MAX - ctx->count); + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLIN); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; +} +EXPORT_SYMBOL_GPL(eventfd_signal); + +static void eventfd_free_ctx(struct eventfd_ctx *ctx) +{ + kfree(ctx); +} + +static void eventfd_free(struct kref *kref) +{ + struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); + + eventfd_free_ctx(ctx); +} + +/** + * eventfd_ctx_get - Acquires a reference to the internal eventfd context. + * @ctx: [in] Pointer to the eventfd context. 
+ * + * Returns: In case of success, returns a pointer to the eventfd context. + */ +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx) +{ + kref_get(&ctx->kref); + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_get); + +/** + * eventfd_ctx_put - Releases a reference to the internal eventfd context. + * @ctx: [in] Pointer to eventfd context. + * + * The eventfd context reference must have been previously acquired either + * with eventfd_ctx_get() or eventfd_ctx_fdget(). + */ +void eventfd_ctx_put(struct eventfd_ctx *ctx) +{ + kref_put(&ctx->kref, eventfd_free); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_put); + +static int eventfd_release(struct inode *inode, struct file *file) +{ + struct eventfd_ctx *ctx = file->private_data; + + wake_up_poll(&ctx->wqh, POLLHUP); + eventfd_ctx_put(ctx); + return 0; +} + +static unsigned int eventfd_poll(struct file *file, poll_table *wait) +{ + struct eventfd_ctx *ctx = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &ctx->wqh, wait); + + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->count > 0) + events |= POLLIN; + if (ctx->count == ULLONG_MAX) + events |= POLLERR; + if (ULLONG_MAX - 1 > ctx->count) + events |= POLLOUT; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return events; +} + +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64-bit counter value. + * + * Returns %0 if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64-bit counter value. + * + * Returns %0 if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was non-zero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. 
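+ *
+ * As a userspace analogy of the counter semantics (illustrative only,
+ * using the eventfd(2) syscall rather than this kernel-internal helper):
+ *
+ *   int efd = eventfd(0, 0);
+ *   uint64_t v = 3;
+ *   write(efd, &v, sizeof(v));      counter is now 3
+ *   write(efd, &v, sizeof(v));      counter is now 6
+ *   read(efd, &v, sizeof(v));       v == 6, counter resets to 0
+ *
+ * With EFD_SEMAPHORE set, each read() would instead return 1 and only
+ * decrement the counter by one, mirroring eventfd_ctx_do_read() above.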
+ */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) +{ + ssize_t res; + DECLARE_WAITQUEUE(wait, current); + + spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; + res = -EAGAIN; + if (ctx->count > 0) + res = 0; + else if (!no_wait) { + __add_wait_queue(&ctx->wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ctx->count > 0) { + res = 0; + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(&ctx->wqh.lock); + schedule(); + spin_lock_irq(&ctx->wqh.lock); + } + __remove_wait_queue(&ctx->wqh, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + } + spin_unlock_irq(&ctx->wqh.lock); + + return res; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} + +static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 ucnt; + DECLARE_WAITQUEUE(wait, current); + + if (count < sizeof(ucnt)) + return -EINVAL; + if (copy_from_user(&ucnt, buf, sizeof(ucnt))) + return -EFAULT; + if (ucnt == ULLONG_MAX) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); + res = -EAGAIN; + if (ULLONG_MAX - ctx->count > ucnt) + res = sizeof(ucnt); + else if (!(file->f_flags & O_NONBLOCK)) { + __add_wait_queue(&ctx->wqh, &wait); + for (res = 0;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ULLONG_MAX - ctx->count > ucnt) { + res = sizeof(ucnt); + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(&ctx->wqh.lock); + schedule(); + spin_lock_irq(&ctx->wqh.lock); + } + __remove_wait_queue(&ctx->wqh, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res > 0)) { + ctx->count += ucnt; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLIN); + } + spin_unlock_irq(&ctx->wqh.lock); + + return res; +} + +static const struct file_operations eventfd_fops = { + .release = eventfd_release, + .poll = eventfd_poll, + .read = eventfd_read, + .write = eventfd_write, + .llseek = noop_llseek, +}; + +/** + * eventfd_fget - Acquire a reference of an eventfd file descriptor. + * @fd: [in] Eventfd file descriptor. + * + * Returns a pointer to the eventfd file structure in case of success, or the + * following error pointer: + * + * -EBADF : Invalid @fd file descriptor. + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct file *eventfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &eventfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} +EXPORT_SYMBOL_GPL(eventfd_fget); + +/** + * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. + * @fd: [in] Eventfd file descriptor. 
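+ *
+ * Typical in-kernel usage (a minimal sketch, not taken from any
+ * particular driver): resolve a userspace-supplied fd once, keep the
+ * context, and signal it later from a path that must not sleep:
+ *
+ *   struct eventfd_ctx *trigger = eventfd_ctx_fdget(fd);
+ *   if (IS_ERR(trigger))
+ *           return PTR_ERR(trigger);
+ *   ...
+ *   eventfd_signal(trigger, 1);
+ *   ...
+ *   eventfd_ctx_put(trigger);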
+ * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointers returned by the following functions: + * + * eventfd_fget + */ +struct eventfd_ctx *eventfd_ctx_fdget(int fd) +{ + struct file *file; + struct eventfd_ctx *ctx; + + file = eventfd_fget(fd); + if (IS_ERR(file)) + return (struct eventfd_ctx *) file; + ctx = eventfd_ctx_get(file->private_data); + fput(file); + + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); + +/** + * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. + * @file: [in] Eventfd file pointer. + * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointer: + * + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) +{ + if (file->f_op != &eventfd_fops) + return ERR_PTR(-EINVAL); + + return eventfd_ctx_get(file->private_data); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); + +/** + * eventfd_file_create - Creates an eventfd file pointer. + * @count: Initial eventfd counter value. + * @flags: Flags for the eventfd file. + * + * This function creates an eventfd file pointer, w/out installing it into + * the fd table. This is useful when the eventfd file is used during the + * initialization of data structures that require extra setup after the eventfd + * creation. So the eventfd creation is split into the file pointer creation + * phase, and the file descriptor installation phase. + * In this way races with userspace closing the newly installed file descriptor + * can be avoided. + * Returns an eventfd file pointer, or a proper error pointer. + */ +struct file *eventfd_file_create(unsigned int count, int flags) +{ + struct file *file; + struct eventfd_ctx *ctx; + + /* Check the EFD_* constants for consistency. */ + BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~EFD_FLAGS_SET) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + kref_init(&ctx->kref); + init_waitqueue_head(&ctx->wqh); + ctx->count = count; + ctx->flags = flags; + + file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + eventfd_free_ctx(ctx); + + return file; +} + +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = eventfd_file_create(count, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} + +SYSCALL_DEFINE1(eventfd, unsigned int, count) +{ + return sys_eventfd2(count, 0); +} + diff --git a/fs/eventpoll.c b/fs/eventpoll.c new file mode 100644 index 00000000..35a852a2 --- /dev/null +++ b/fs/eventpoll.c @@ -0,0 +1,1818 @@ +/* + * fs/eventpoll.c (Efficient event retrieval implementation) + * Copyright (C) 2001,...,2009 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * There are three level of locking required by epoll : + * + * 1) epmutex (mutex) + * 2) ep->mtx (mutex) + * 3) ep->lock (spinlock) + * + * The acquire order is the one listed above, from 1 to 3. + * We need a spinlock (ep->lock) because we manipulate objects + * from inside the poll callback, that might be triggered from + * a wake_up() that in turn might be called from IRQ context. + * So we can't sleep inside the poll callback and hence we need + * a spinlock. During the event transfer loop (from kernel to + * user space) we could end up sleeping due a copy_to_user(), so + * we need a lock that will allow us to sleep. This lock is a + * mutex (ep->mtx). It is acquired during the event transfer loop, + * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). + * Then we also need a global mutex to serialize eventpoll_release_file() + * and ep_free(). + * This mutex is acquired by ep_free() during the epoll file + * cleanup path and it is also acquired by eventpoll_release_file() + * if a file has been pushed inside an epoll set and it is then + * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. + * It is possible to drop the "ep->mtx" and to use the global + * mutex "epmutex" (together with "ep->lock") to have it working, + * but having "ep->mtx" will make the interface more scalable. + * Events that require holding "epmutex" are very rare, while for + * normal operations the epoll private "ep->mtx" will guarantee + * a better scalability. + */ + +/* Epoll private bits inside the event mask */ +#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) + +/* Maximum number of nesting allowed inside epoll sets */ +#define EP_MAX_NESTS 4 + +#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) + +#define EP_UNACTIVE_PTR ((void *) -1L) + +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + +struct epoll_filefd { + struct file *file; + int fd; +}; + +/* + * Structure used to track possible nested calls, for too deep recursions + * and loop cycles. + */ +struct nested_call_node { + struct list_head llink; + void *cookie; + void *ctx; +}; + +/* + * This structure is used as collector for nested calls, to check for + * maximum recursion dept and loop cycles. 
+ */ +struct nested_calls { + struct list_head tasks_call_list; + spinlock_t lock; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + */ +struct epitem { + /* RB tree node used to link this structure to the eventpoll RB tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* + * Works together "struct eventpoll"->ovflist in keeping the + * single linked chain of items. + */ + struct epitem *next; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queue attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* The structure that describe the interested events and the source fd */ + struct epoll_event event; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and represents the main data structure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protect the access to this structure */ + spinlock_t lock; + + /* + * This mutex is used to ensure that files are not removed + * while epoll is using them. This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. + */ + struct mutex mtx; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB tree root used to store monitored fd structs */ + struct rb_root rbr; + + /* + * This is a single linked list that chains all the "struct epitem" that + * happened while transferring ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; +}; + +/* Wait structure used by the poll hooks */ +struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ + struct list_head llink; + + /* The "base" pointer is set to the container "struct epitem" */ + struct epitem *base; + + /* + * Wait queue item that will be linked to the target file wait + * queue head. + */ + wait_queue_t wait; + + /* The wait queue head that linked the "wait" wait queue item */ + wait_queue_head_t *whead; +}; + +/* Wrapper struct used by poll queueing */ +struct ep_pqueue { + poll_table pt; + struct epitem *epi; +}; + +/* Used by the ep_send_events() function as callback private data */ +struct ep_send_events_data { + int maxevents; + struct epoll_event __user *events; +}; + +/* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll watched descriptors, per user */ +static long max_user_watches __read_mostly; + +/* + * This mutex is used to serialize ep_free() and eventpoll_release_file(). 
+ */ +static DEFINE_MUTEX(epmutex); + +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + +/* Used for safe wake up implementation */ +static struct nested_calls poll_safewake_ncalls; + +/* Used to call file's f_op->poll() under the nested calls boundaries */ +static struct nested_calls poll_readywalk_ncalls; + +/* Slab cache used to allocate "struct epitem" */ +static struct kmem_cache *epi_cache __read_mostly; + +/* Slab cache used to allocate "struct eppoll_entry" */ +static struct kmem_cache *pwq_cache __read_mostly; + +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + +#ifdef CONFIG_SYSCTL + +#include + +static long zero; +static long long_max = LONG_MAX; + +ctl_table epoll_table[] = { + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(max_user_watches), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero, + .extra2 = &long_max, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} + +/* Setup the structure that is used as key for the RB tree */ +static inline void ep_set_ffd(struct epoll_filefd *ffd, + struct file *file, int fd) +{ + ffd->file = file; + ffd->fd = fd; +} + +/* Compare RB tree keys */ +static inline int ep_cmp_ffd(struct epoll_filefd *p1, + struct epoll_filefd *p2) +{ + return (p1->file > p2->file ? +1: + (p1->file < p2->file ? -1 : p1->fd - p2->fd)); +} + +/* Tells us if the item is currently linked */ +static inline int ep_is_linked(struct list_head *p) +{ + return !list_empty(p); +} + +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + +/* Get the "struct epitem" from a wait queue pointer */ +static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait)->base; +} + +/* Get the "struct epitem" from an epoll queue wrapper */ +static inline struct epitem *ep_item_from_epqueue(poll_table *p) +{ + return container_of(p, struct ep_pqueue, pt)->epi; +} + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_has_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + +/* Initialize the poll safe wake up structure */ +static void ep_nested_calls_init(struct nested_calls *ncalls) +{ + INIT_LIST_HEAD(&ncalls->tasks_call_list); + spin_lock_init(&ncalls->lock); +} + +/** + * ep_events_available - Checks if ready events might be available. + * + * @ep: Pointer to the eventpoll context. + * + * Returns: Returns a value different than zero if ready events are available, + * or zero otherwise. + */ +static inline int ep_events_available(struct eventpoll *ep) +{ + return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; +} + +/** + * ep_call_nested - Perform a bound (possibly) nested call, by checking + * that the recursion limit is not exceeded, and that + * the same nested call (by the meaning of same cookie) is + * no re-entered. + * + * @ncalls: Pointer to the nested_calls structure to be used for this call. + * @max_nests: Maximum number of allowed nesting calls. 
+ * @nproc: Nested call core function pointer. + * @priv: Opaque data to be passed to the @nproc callback. + * @cookie: Cookie to be used to identify this nested call. + * @ctx: This instance context. + * + * Returns: Returns the code returned by the @nproc callback, or -1 if + * the maximum recursion limit has been exceeded. + */ +static int ep_call_nested(struct nested_calls *ncalls, int max_nests, + int (*nproc)(void *, void *, int), void *priv, + void *cookie, void *ctx) +{ + int error, call_nests = 0; + unsigned long flags; + struct list_head *lsthead = &ncalls->tasks_call_list; + struct nested_call_node *tncur; + struct nested_call_node tnode; + + spin_lock_irqsave(&ncalls->lock, flags); + + /* + * Try to see if the current task is already inside this wakeup call. + * We use a list here, since the population inside this set is always + * very much limited. + */ + list_for_each_entry(tncur, lsthead, llink) { + if (tncur->ctx == ctx && + (tncur->cookie == cookie || ++call_nests > max_nests)) { + /* + * Ops ... loop detected or maximum nest level reached. + * We abort this wake by breaking the cycle itself. + */ + error = -1; + goto out_unlock; + } + } + + /* Add the current task and cookie to the list */ + tnode.ctx = ctx; + tnode.cookie = cookie; + list_add(&tnode.llink, lsthead); + + spin_unlock_irqrestore(&ncalls->lock, flags); + + /* Call the nested function */ + error = (*nproc)(priv, cookie, call_nests); + + /* Remove the current task from the list */ + spin_lock_irqsave(&ncalls->lock, flags); + list_del(&tnode.llink); +out_unlock: + spin_unlock_irqrestore(&ncalls->lock, flags); + + return error; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); + wake_up_locked_poll(wqueue, events); + spin_unlock_irqrestore(&wqueue->lock, flags); +} +#else +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + wake_up_poll(wqueue, events); +} +#endif + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) +{ + ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, + 1 + call_nests); + return 0; +} + +/* + * Perform a safe wake up of the poll wait list. The problem is that + * with the new callback'd wake up system, it is possible that the + * poll callback is reentered from inside the call to wake_up() done + * on the poll wait queue head. The rule is that we cannot reenter the + * wake up code from the same task more than EP_MAX_NESTS times, + * and we cannot reenter the same wait queue head at all. This will + * enable to have a hierarchy of epoll file descriptor of no more than + * EP_MAX_NESTS deep. + */ +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + int this_cpu = get_cpu(); + + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + + put_cpu(); +} + +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + +/* + * This function unregisters poll callbacks from the associated file + * descriptor. Must be called with "mtx" held (or "epmutex" if called from + * ep_free). 
+ */ +static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) +{ + struct list_head *lsthead = &epi->pwqlist; + struct eppoll_entry *pwq; + + while (!list_empty(lsthead)) { + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); + + list_del(&pwq->llink); + ep_remove_wait_queue(pwq); + kmem_cache_free(pwq_cache, pwq); + } +} + +/** + * ep_scan_ready_list - Scans the ready list in a way that makes possible for + * the scan code, to call f_op->poll(). Also allows for + * O(NumReady) performance. + * + * @ep: Pointer to the epoll private data structure. + * @sproc: Pointer to the scan callback. + * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. + * + * Returns: The same integer error code returned by the @sproc callback. + */ +static int ep_scan_ready_list(struct eventpoll *ep, + int (*sproc)(struct eventpoll *, + struct list_head *, void *), + void *priv, + int depth) +{ + int error, pwake = 0; + unsigned long flags; + struct epitem *epi, *nepi; + LIST_HEAD(txlist); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() and epoll_ctl(). + */ + mutex_lock_nested(&ep->mtx, depth); + + /* + * Steal the ready list, and re-init the original one to the + * empty list. Also, set ep->ovflist to NULL so that events + * happening while looping w/out locks, are not lost. We cannot + * have the poll callback to queue directly on ep->rdllist, + * because we want the "sproc" callback to be able to do it + * in a lockless way. + */ + spin_lock_irqsave(&ep->lock, flags); + list_splice_init(&ep->rdllist, &txlist); + ep->ovflist = NULL; + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Now call the callback function. + */ + error = (*sproc)(ep, &txlist, priv); + + spin_lock_irqsave(&ep->lock, flags); + /* + * During the time we spent inside the "sproc" callback, some + * other events might have been queued by the poll callback. + * We re-insert them inside the main ready-list here. + */ + for (nepi = ep->ovflist; (epi = nepi) != NULL; + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + /* + * We need to check if the item is already in the list. + * During the "sproc" callback execution time, items are + * queued into ->ovflist but the "txlist" might already + * contain them, and the list_splice() below takes care of them. + */ + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + } + /* + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after + * releasing the lock, events will be queued in the normal way inside + * ep->rdllist. + */ + ep->ovflist = EP_UNACTIVE_PTR; + + /* + * Quickly re-inject items left on "txlist". + */ + list_splice(&txlist, &ep->rdllist); + + if (!list_empty(&ep->rdllist)) { + /* + * Wake up (if active) both the eventpoll wait list and + * the ->poll() wait list (delayed after we release the lock). + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irqrestore(&ep->lock, flags); + + mutex_unlock(&ep->mtx); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return error; +} + +/* + * Removes a "struct epitem" from the eventpoll RB tree and deallocates + * all the associated resources. Must be called with "mtx" held. 
+ */ +static int ep_remove(struct eventpoll *ep, struct epitem *epi) +{ + unsigned long flags; + struct file *file = epi->ffd.file; + + /* + * Removes poll wait queue hooks. We _have_ to do this without holding + * the "ep->lock" otherwise a deadlock might occur. This because of the + * sequence of the lock acquisition. Here we do "ep->lock" then the wait + * queue head lock when unregistering the wait queue. The wakeup callback + * will run by holding the wait queue head lock and will call our callback + * that will try to get "ep->lock". + */ + ep_unregister_pollwait(ep, epi); + + /* Remove the current item from the list of epoll hooks */ + spin_lock(&file->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&file->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); + + /* At this point it is safe to free the eventpoll item */ + kmem_cache_free(epi_cache, epi); + + atomic_long_dec(&ep->user->epoll_watches); + + return 0; +} + +static void ep_free(struct eventpoll *ep) +{ + struct rb_node *rbp; + struct epitem *epi; + + /* We need to release all tasks waiting for these file */ + if (waitqueue_active(&ep->poll_wait)) + ep_poll_safewake(&ep->poll_wait); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() while we're freeing the "struct eventpoll". + * We do not need to hold "ep->mtx" here because the epoll file + * is on the way to be removed and no one has references to it + * anymore. The only hit might come from eventpoll_release_file() but + * holding "epmutex" is sufficient here. + */ + mutex_lock(&epmutex); + + /* + * Walks through the whole tree by unregistering poll callbacks. + */ + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + + ep_unregister_pollwait(ep, epi); + } + + /* + * Walks through the whole tree by freeing each "struct epitem". At this + * point we are sure no poll callbacks will be lingering around, and also by + * holding "epmutex" we can be sure that no file cleanup code will hit + * us during this operation. So we can avoid the lock on "ep->lock". + */ + while ((rbp = rb_first(&ep->rbr)) != NULL) { + epi = rb_entry(rbp, struct epitem, rbn); + ep_remove(ep, epi); + } + + mutex_unlock(&epmutex); + mutex_destroy(&ep->mtx); + free_uid(ep->user); + kfree(ep); +} + +static int ep_eventpoll_release(struct inode *inode, struct file *file) +{ + struct eventpoll *ep = file->private_data; + + if (ep) + ep_free(ep); + + return 0; +} + +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct epitem *epi, *tmp; + + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events) + return POLLIN | POLLRDNORM; + else { + /* + * Item has been dropped into the ready list by the poll + * callback, but it's not actually ready, as far as + * caller requested events goes. We can remove it here. 
+ */ + list_del_init(&epi->rdllink); + } + } + + return 0; +} + +static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) +{ + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); +} + +static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) +{ + int pollflags; + struct eventpoll *ep = file->private_data; + + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + + /* + * Proceed to find out if wanted events are really available inside + * the ready list. This need to be done under ep_call_nested() + * supervision, since the call to f_op->poll() done on listed files + * could re-enter here. + */ + pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, + ep_poll_readyevents_proc, ep, ep, current); + + return pollflags != -1 ? pollflags : 0; +} + +/* File callbacks that implement the eventpoll file behaviour */ +static const struct file_operations eventpoll_fops = { + .release = ep_eventpoll_release, + .poll = ep_eventpoll_poll, + .llseek = noop_llseek, +}; + +/* + * This is called from eventpoll_release() to unlink files from the eventpoll + * interface. We need to have this facility to cleanup correctly files that are + * closed without being removed from the eventpoll interface. + */ +void eventpoll_release_file(struct file *file) +{ + struct list_head *lsthead = &file->f_ep_links; + struct eventpoll *ep; + struct epitem *epi; + + /* + * We don't want to get "file->f_lock" because it is not + * necessary. It is not necessary because we're in the "struct file" + * cleanup path, and this means that no one is using this file anymore. + * So, for example, epoll_ctl() cannot hit here since if we reach this + * point, the file counter already went to zero and fget() would fail. + * The only hit might come from ep_free() but by holding the mutex + * will correctly serialize the operation. We do need to acquire + * "ep->mtx" after "epmutex" because ep_remove() requires it when called + * from anywhere but ep_free(). + * + * Besides, ep_remove() acquires the lock, so we can't hold it here. + */ + mutex_lock(&epmutex); + + while (!list_empty(lsthead)) { + epi = list_first_entry(lsthead, struct epitem, fllink); + + ep = epi->ep; + list_del_init(&epi->fllink); + mutex_lock_nested(&ep->mtx, 0); + ep_remove(ep, epi); + mutex_unlock(&ep->mtx); + } + + mutex_unlock(&epmutex); +} + +static int ep_alloc(struct eventpoll **pep) +{ + int error; + struct user_struct *user; + struct eventpoll *ep; + + user = get_current_user(); + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; + + spin_lock_init(&ep->lock); + mutex_init(&ep->mtx); + init_waitqueue_head(&ep->wq); + init_waitqueue_head(&ep->poll_wait); + INIT_LIST_HEAD(&ep->rdllist); + ep->rbr = RB_ROOT; + ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; + + *pep = ep; + + return 0; + +free_uid: + free_uid(user); + return error; +} + +/* + * Search the file inside the eventpoll tree. The RB tree operations + * are protected by the "mtx" mutex, and ep_find() must be called with + * "mtx" held. 
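+ *
+ * The tree is keyed by ep_cmp_ffd(): items are ordered first by the
+ * struct file pointer value and then by the file descriptor number,
+ * so a lookup has to use the same (file, fd) pair that was stored by
+ * ep_rbtree_insert() when the descriptor was added.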
+ */ +static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +{ + int kcmp; + struct rb_node *rbp; + struct epitem *epi, *epir = NULL; + struct epoll_filefd ffd; + + ep_set_ffd(&ffd, file, fd); + for (rbp = ep->rbr.rb_node; rbp; ) { + epi = rb_entry(rbp, struct epitem, rbn); + kcmp = ep_cmp_ffd(&ffd, &epi->ffd); + if (kcmp > 0) + rbp = rbp->rb_right; + else if (kcmp < 0) + rbp = rbp->rb_left; + else { + epir = epi; + break; + } + } + + return epir; +} + +/* + * This is the callback that is passed to the wait queue wakeup + * mechanism. It is called by the stored file descriptors when they + * have events to report. + */ +static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int pwake = 0; + unsigned long flags; + struct epitem *epi = ep_item_from_wait(wait); + struct eventpoll *ep = epi->ep; + + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ + list_del_init(&wait->task_list); + } + + spin_lock_irqsave(&ep->lock, flags); + + /* + * If the event mask does not contain any poll(2) event, we consider the + * descriptor to be disabled. This condition is likely the effect of the + * EPOLLONESHOT bit that disables the descriptor when an event is received, + * until the next EPOLL_CTL_MOD will be issued. + */ + if (!(epi->event.events & ~EP_PRIVATE_BITS)) + goto out_unlock; + + /* + * Check the events coming with the callback. At this stage, not + * every device reports the events in the "key" parameter of the + * callback. We need to be able to handle both cases here, hence the + * test for "key" != NULL before the event match test. + */ + if (key && !((unsigned long) key & epi->event.events)) + goto out_unlock; + + /* + * If we are transferring events to userspace, we can hold no locks + * (because we're accessing user memory, and because of linux f_op->poll() + * semantics). All the events that happen during that period of time are + * chained in ep->ovflist and requeued later on. + */ + if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = ep->ovflist; + ep->ovflist = epi; + } + goto out_unlock; + } + + /* If this file is already in the ready list we exit soon */ + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* + * Wake up ( if active ) both the eventpoll wait list and the ->poll() + * wait list. + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + +out_unlock: + spin_unlock_irqrestore(&ep->lock, flags); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 1; +} + +/* + * This is the callback that is used to add our wait queue to the + * target file wakeup lists. 
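+ *
+ * The path that leads here is, roughly: ep_insert() initializes a
+ * struct ep_pqueue with init_poll_funcptr(&epq.pt, ep_ptable_queue_proc)
+ * and calls tfile->f_op->poll(tfile, &epq.pt); the target file's poll
+ * method then calls poll_wait() on its own wait queue head, which
+ * dispatches to this callback so that ep_poll_callback() can be hooked
+ * onto that wait queue.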
+ */ +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt) +{ + struct epitem *epi = ep_item_from_epqueue(pt); + struct eppoll_entry *pwq; + + if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { + init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); + pwq->whead = whead; + pwq->base = epi; + add_wait_queue(whead, &pwq->wait); + list_add_tail(&pwq->llink, &epi->pwqlist); + epi->nwait++; + } else { + /* We have to signal that an error occurred */ + epi->nwait = -1; + } +} + +static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) +{ + int kcmp; + struct rb_node **p = &ep->rbr.rb_node, *parent = NULL; + struct epitem *epic; + + while (*p) { + parent = *p; + epic = rb_entry(parent, struct epitem, rbn); + kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); + if (kcmp > 0) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&epi->rbn, parent, p); + rb_insert_color(&epi->rbn, &ep->rbr); +} + + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + /* Allow an arbitrary number of depth 1 paths */ + if (nests == 0) + return 0; + + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. 
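+ *
+ * In other words: for each file on tfile_check_list, the check walks
+ * backwards through every epoll file that watches it, and recursively
+ * through epoll files watching those, up to EP_MAX_NESTS levels,
+ * counting one entry in path_count[] per route found; the pending
+ * EPOLL_CTL_ADD is rejected as soon as path_count_inc() reports that a
+ * depth has exceeded its limit in path_limits[].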
+ */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + +/* + * Must be called with "mtx" held. + */ +static int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd) +{ + int error, revents, pwake = 0; + unsigned long flags; + long user_watches; + struct epitem *epi; + struct ep_pqueue epq; + + user_watches = atomic_long_read(&ep->user->epoll_watches); + if (unlikely(user_watches >= max_user_watches)) + return -ENOSPC; + if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) + return -ENOMEM; + + /* Item initialization follow here ... */ + INIT_LIST_HEAD(&epi->rdllink); + INIT_LIST_HEAD(&epi->fllink); + INIT_LIST_HEAD(&epi->pwqlist); + epi->ep = ep; + ep_set_ffd(&epi->ffd, tfile, fd); + epi->event = *event; + epi->nwait = 0; + epi->next = EP_UNACTIVE_PTR; + + /* Initialize the poll table using the queue callback */ + epq.epi = epi; + init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); + + /* + * Attach the item to the poll hooks and get current event bits. + * We can safely use the file* here because its usage count has + * been increased by the caller of this function. Note that after + * this operation completes, the poll callback can start hitting + * the new item. + */ + revents = tfile->f_op->poll(tfile, &epq.pt); + + /* + * We have to check if something went wrong during the poll wait queue + * install process. Namely an allocation for a wait queue failed due + * high memory pressure. + */ + error = -ENOMEM; + if (epi->nwait < 0) + goto error_unregister; + + /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + + /* We have to drop the new item inside our item list to keep track of it */ + spin_lock_irqsave(&ep->lock, flags); + + /* If the file is already "ready" we drop it inside the ready list */ + if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + + spin_unlock_irqrestore(&ep->lock, flags); + + atomic_long_inc(&ep->user->epoll_watches); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 0; + +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + +error_unregister: + ep_unregister_pollwait(ep, epi); + + /* + * We need to do this because an event could have been arrived on some + * allocated wait queue. Note that we don't care about the ep->ovflist + * list, since that is used/cleaned only inside a section bound by "mtx". 
+ * And ep_insert() is called with "mtx" held. + */ + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); + + kmem_cache_free(epi_cache, epi); + + return error; +} + +/* + * Modify the interest event mask by dropping an event if the new mask + * has a match in the current file status. Must be called with "mtx" held. + */ +static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event) +{ + int pwake = 0; + unsigned int revents; + + /* + * Set the new event interest mask before calling f_op->poll(); + * otherwise we might miss an event that happens between the + * f_op->poll() call and the new event set registering. + */ + epi->event.events = event->events; + epi->event.data = event->data; /* protected by mtx */ + + /* + * Get current event bits. We can safely use the file* here because + * its usage count has been increased by the caller of this function. + */ + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); + + /* + * If the item is "hot" and it is not registered inside the ready + * list, push it inside. + */ + if (revents & event->events) { + spin_lock_irq(&ep->lock); + if (!ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irq(&ep->lock); + } + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 0; +} + +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct ep_send_events_data *esed = priv; + int eventcnt; + unsigned int revents; + struct epitem *epi; + struct epoll_event __user *uevent; + + /* + * We can loop without lock because we are passed a task private list. + * Items cannot vanish during the loop because ep_scan_ready_list() is + * holding "mtx" during this call. + */ + for (eventcnt = 0, uevent = esed->events; + !list_empty(head) && eventcnt < esed->maxevents;) { + epi = list_first_entry(head, struct epitem, rdllink); + + list_del_init(&epi->rdllink); + + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events; + + /* + * If the event mask intersect the caller-requested one, + * deliver the event to userspace. Again, ep_scan_ready_list() + * is holding "mtx", so no operations coming from userspace + * can change the item. + */ + if (revents) { + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + return eventcnt ? eventcnt : -EFAULT; + } + eventcnt++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, no one can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. 
+ */ + list_add_tail(&epi->rdllink, &ep->rdllist); + } + } + } + + return eventcnt; +} + +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + struct ep_send_events_data esed; + + esed.maxevents = maxevents; + esed.events = events; + + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); +} + +static inline struct timespec ep_set_mstimeout(long ms) +{ + struct timespec now, ts = { + .tv_sec = ms / MSEC_PER_SEC, + .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), + }; + + ktime_get_ts(&now); + return timespec_add_safe(now, ts); +} + +/** + * ep_poll - Retrieves ready events, and delivers them to the caller supplied + * event buffer. + * + * @ep: Pointer to the eventpoll context. + * @events: Pointer to the userspace buffer where the ready events should be + * stored. + * @maxevents: Size (in terms of number of events) of the caller event buffer. + * @timeout: Maximum timeout for the ready events fetch operation, in + * milliseconds. If the @timeout is zero, the function will not block, + * while if the @timeout is less than zero, the function will block + * until at least one event has been retrieved (or an error + * occurred). + * + * Returns: Returns the number of ready events which have been fetched, or an + * error code, in case of error. + */ +static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, + int maxevents, long timeout) +{ + int res = 0, eavail, timed_out = 0; + unsigned long flags; + long slack = 0; + wait_queue_t wait; + ktime_t expires, *to = NULL; + + if (timeout > 0) { + struct timespec end_time = ep_set_mstimeout(timeout); + + slack = select_estimate_accuracy(&end_time); + to = &expires; + *to = timespec_to_ktime(end_time); + } else if (timeout == 0) { + /* + * Avoid the unnecessary trip to the wait queue loop, if the + * caller specified a non blocking operation. + */ + timed_out = 1; + spin_lock_irqsave(&ep->lock, flags); + goto check_events; + } + +fetch_events: + spin_lock_irqsave(&ep->lock, flags); + + if (!ep_events_available(ep)) { + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * ep_poll_callback() when events will become available. + */ + init_waitqueue_entry(&wait, current); + __add_wait_queue_exclusive(&ep->wq, &wait); + + for (;;) { + /* + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (ep_events_available(ep) || timed_out) + break; + if (signal_pending(current)) { + res = -EINTR; + break; + } + + spin_unlock_irqrestore(&ep->lock, flags); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; + + spin_lock_irqsave(&ep->lock, flags); + } + __remove_wait_queue(&ep->wq, &wait); + + set_current_state(TASK_RUNNING); + } +check_events: + /* Is it worth to try to dig for events ? */ + eavail = ep_events_available(ep); + + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. 
+ */ + if (!res && eavail && + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) + goto fetch_events; + + return res; +} + +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); + if (error != 0) + break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); +} + +/* + * Open an eventpoll file descriptor. + */ +SYSCALL_DEFINE1(epoll_create1, int, flags) +{ + int error, fd; + struct eventpoll *ep = NULL; + struct file *file; + + /* Check the EPOLL_* constant for consistency. 
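+	 *
+	 * For context, a minimal sketch of how this object is driven from
+	 * userspace (illustrative only; "sock", the "events" array and all
+	 * error handling are assumed):
+	 *
+	 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
+	 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = sock };
+	 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev);
+	 *	int n = epoll_wait(epfd, events, 64, -1);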
*/ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + /* + * Create the internal data structure ("struct eventpoll"). + */ + error = ep_alloc(&ep); + if (error < 0) + return error; + /* + * Creates all the items needed to setup an eventpoll file. That is, + * a file structure and a free file descriptor. + */ + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, + O_RDWR | (flags & O_CLOEXEC)); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); + return error; +} + +SYSCALL_DEFINE1(epoll_create, int, size) +{ + if (size <= 0) + return -EINVAL; + + return sys_epoll_create1(0); +} + +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + int error; + int did_lock_epmutex = 0; + struct file *file, *tfile; + struct eventpoll *ep; + struct epitem *epi; + struct epoll_event epds; + + error = -EFAULT; + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + goto error_return; + + /* Get the "struct file *" for the eventpoll file */ + error = -EBADF; + file = fget(epfd); + if (!file) + goto error_return; + + /* Get the "struct file *" for the target file */ + tfile = fget(fd); + if (!tfile) + goto error_fput; + + /* The target file descriptor must support poll */ + error = -EPERM; + if (!tfile->f_op || !tfile->f_op->poll) + goto error_tgt_fput; + + /* + * We have to check that the file structure underneath the file descriptor + * the user passed to us _is_ an eventpoll file. And also we do not permit + * adding an epoll file descriptor inside itself. + */ + error = -EINVAL; + if (file == tfile || !is_file_epoll(file)) + goto error_tgt_fput; + + /* + * At this point it is safe to assume that the "private_data" contains + * our own data structure. + */ + ep = file->private_data; + + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. + * + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. + */ + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + } + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } + + mutex_lock_nested(&ep->mtx, 0); + + /* + * Try to lookup the file inside our RB tree, Since we grabbed "mtx" + * above, we can be sure to be able to use the item looked up by + * ep_find() till we release the mutex. 
+ */
+	epi = ep_find(ep, tfile, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		if (!epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_insert(ep, &epds, tfile, fd);
+		} else
+			error = -EEXIST;
+		clear_tfile_check_list();
+		break;
+	case EPOLL_CTL_DEL:
+		if (epi)
+			error = ep_remove(ep, epi);
+		else
+			error = -ENOENT;
+		break;
+	case EPOLL_CTL_MOD:
+		if (epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_modify(ep, epi, &epds);
+		} else
+			error = -ENOENT;
+		break;
+	}
+	mutex_unlock(&ep->mtx);
+
+error_tgt_fput:
+	if (did_lock_epmutex)
+		mutex_unlock(&epmutex);
+
+	fput(tfile);
+error_fput:
+	fput(file);
+error_return:
+
+	return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	int error;
+	struct file *file;
+	struct eventpoll *ep;
+
+	/* The maximum number of events must be greater than zero */
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+		return -EINVAL;
+
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
+		error = -EFAULT;
+		goto error_return;
+	}
+
+	/* Get the "struct file *" for the eventpoll file */
+	error = -EBADF;
+	file = fget(epfd);
+	if (!file)
+		goto error_return;
+
+	/*
+	 * We have to check that the file structure underneath the fd
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!is_file_epoll(file))
+		goto error_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	/* Time to fish for events ... */
+	error = ep_poll(ep, events, maxevents, timeout);
+
+error_fput:
+	fput(file);
+error_return:
+
+	return error;
+}
+
+#ifdef HAVE_SET_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+			       sizeof(sigsaved));
+			set_restore_sigmask();
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
+}
+
+#endif /* HAVE_SET_RESTORE_SIGMASK */
+
+static int __init eventpoll_init(void)
+{
+	struct sysinfo si;
+
+	si_meminfo(&si);
+	/*
+	 * Allows top 4% of lowmem to be allocated for epoll watches (per user).
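+	 *
+	 * As a rough worked example (the exact figure depends on
+	 * EP_ITEM_COST, roughly the size of an epitem plus its wait queue
+	 * entry): with 1 GiB of lowmem, 1 GiB / 25 is about 43 MB, and at
+	 * ~200 bytes per watch that allows on the order of 200,000 watches
+	 * per user.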
+ */ + max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / + EP_ITEM_COST; + BUG_ON(max_user_watches < 0); + + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + + /* Initialize the structure used to perform safe poll wait head wake ups */ + ep_nested_calls_init(&poll_safewake_ncalls); + + /* Initialize the structure used to perform file's f_op->poll() calls */ + ep_nested_calls_init(&poll_readywalk_ncalls); + + /* Allocates slab cache used to allocate "struct epitem" items */ + epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* Allocates slab cache used to allocate "struct eppoll_entry" */ + pwq_cache = kmem_cache_create("eventpoll_pwq", + sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + + return 0; +} +fs_initcall(eventpoll_init); diff --git a/fs/exec.c b/fs/exec.c new file mode 100644 index 00000000..044c13ff --- /dev/null +++ b/fs/exec.c @@ -0,0 +1,2247 @@ +/* + * linux/fs/exec.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * #!-checking implemented by tytso. + */ +/* + * Demand-loading implemented 01.12.91 - no need to read anything but + * the header into memory. The inode of the executable is put into + * "current->executable", and page faults do the actual loading. Clean. + * + * Once more I can proudly say that linux stood up to being changed: it + * was less than 2 hours work to get demand-loading completely implemented. + * + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary + * formats. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +int core_uses_pid; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +unsigned int core_pipe_limit; +int suid_dumpable = 0; + +struct core_name { + char *corename; + int used, size; +}; +static atomic_t call_count = ATOMIC_INIT(1); + +/* The maximal length of core_pattern is also specified in sysctl.c */ + +static LIST_HEAD(formats); +static DEFINE_RWLOCK(binfmt_lock); + +int __register_binfmt(struct linux_binfmt * fmt, int insert) +{ + if (!fmt) + return -EINVAL; + write_lock(&binfmt_lock); + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); + write_unlock(&binfmt_lock); + return 0; +} + +EXPORT_SYMBOL(__register_binfmt); + +void unregister_binfmt(struct linux_binfmt * fmt) +{ + write_lock(&binfmt_lock); + list_del(&fmt->lh); + write_unlock(&binfmt_lock); +} + +EXPORT_SYMBOL(unregister_binfmt); + +static inline void put_binfmt(struct linux_binfmt * fmt) +{ + module_put(fmt->module); +} + +/* + * Note that a shared library must be both readable and executable due to + * security reasons. + * + * Also note that we take the address to load from from the file itself. 
+ */ +SYSCALL_DEFINE1(uselib, const char __user *, library) +{ + struct file *file; + char *tmp = getname(library); + int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; + + if (IS_ERR(tmp)) + goto out; + + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); + putname(tmp); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -EINVAL; + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + goto exit; + + error = -EACCES; + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + fsnotify_open(file); + + error = -ENOEXEC; + if(file->f_op) { + struct linux_binfmt * fmt; + + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!fmt->load_shlib) + continue; + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + error = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (error != -ENOEXEC) + break; + } + read_unlock(&binfmt_lock); + } +exit: + fput(file); +out: + return error; +} + +#ifdef CONFIG_MMU +/* + * The nascent bprm->mm is not visible until exec_mmap() but it can + * use a lot of memory, account these pages in current->mm temporary + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we + * change the counter back via acct_arg_size(0). + */ +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + int ret; + +#ifdef CONFIG_STACK_GROWSUP + if (write) { + ret = expand_downwards(bprm->vma, pos); + if (ret < 0) + return NULL; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, write, 1, &page, NULL); + if (ret <= 0) + return NULL; + + if (write) { + unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + struct rlimit *rlim; + + acct_arg_size(bprm, size / PAGE_SIZE); + + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + if (size <= ARG_MAX) + return page; + + /* + * Limit to 1/4-th the stack size for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. 
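+		 *
+		 * As a concrete example: with a typical RLIMIT_STACK soft
+		 * limit of 8 MiB, the argv+env strings are capped at
+		 * roughly 2 MiB by the check below.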
+ */ + rlim = current->signal->rlim; + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { + put_page(page); + return NULL; + } + } + + return page; +} + +static void put_arg_page(struct page *page) +{ + put_page(page); +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ + flush_cache_page(bprm->vma, pos, page_to_pfn(page)); +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct vm_area_struct *vma = NULL; + struct mm_struct *mm = bprm->mm; + + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return -ENOMEM; + + down_write(&mm->mmap_sem); + vma->vm_mm = mm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. + */ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (err) + goto err; + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); + return 0; +err: + up_write(&mm->mmap_sem); + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); + return err; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= MAX_ARG_STRLEN; +} + +#else + +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + + page = bprm->page[pos / PAGE_SIZE]; + if (!page && write) { + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + if (!page) + return NULL; + bprm->page[pos / PAGE_SIZE] = page; + } + + return page; +} + +static void put_arg_page(struct page *page) +{ +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ + if (bprm->page[i]) { + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) + free_arg_page(bprm, i); +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); + return 0; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= bprm->p; +} + +#endif /* CONFIG_MMU */ + +/* + * Create a new mm_struct and populate it with a temporary stack + * vm_area_struct. We don't have enough context at this point to set the stack + * flags, permissions, and offset, so we use temporary values. We'll update + * them later in setup_arg_pages(). 
+ */ +int bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct mm_struct *mm = NULL; + + bprm->mm = mm = mm_alloc(); + err = -ENOMEM; + if (!mm) + goto err; + + err = init_new_context(current, mm); + if (err) + goto err; + + err = __bprm_mm_init(bprm); + if (err) + goto err; + + return 0; + +err: + if (mm) { + bprm->mm = NULL; + mmdrop(mm); + } + + return err; +} + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + compat_uptr_t __user *compat; +#endif + } ptr; +}; + +static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) +{ + const char __user *native; + +#ifdef CONFIG_COMPAT + if (unlikely(argv.is_compat)) { + compat_uptr_t compat; + + if (get_user(compat, argv.ptr.compat + nr)) + return ERR_PTR(-EFAULT); + + return compat_ptr(compat); + } +#endif + + if (get_user(native, argv.ptr.native + nr)) + return ERR_PTR(-EFAULT); + + return native; +} + +/* + * count() counts the number of strings in array ARGV. + */ +static int count(struct user_arg_ptr argv, int max) +{ + int i = 0; + + if (argv.ptr.native != NULL) { + for (;;) { + const char __user *p = get_user_arg_ptr(argv, i); + + if (!p) + break; + + if (IS_ERR(p)) + return -EFAULT; + + if (i++ >= max) + return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + } + return i; +} + +/* + * 'copy_strings()' copies argument/environment strings from the old + * processes's memory to the new process's stack. The call to get_user_pages() + * ensures the destination page is created and not swapped out. + */ +static int copy_strings(int argc, struct user_arg_ptr argv, + struct linux_binprm *bprm) +{ + struct page *kmapped_page = NULL; + char *kaddr = NULL; + unsigned long kpos = 0; + int ret; + + while (argc-- > 0) { + const char __user *str; + int len; + unsigned long pos; + + ret = -EFAULT; + str = get_user_arg_ptr(argv, argc); + if (IS_ERR(str)) + goto out; + + len = strnlen_user(str, MAX_ARG_STRLEN); + if (!len) + goto out; + + ret = -E2BIG; + if (!valid_arg_len(bprm, len)) + goto out; + + /* We're going to work our way backwords. */ + pos = bprm->p; + str += len; + bprm->p -= len; + + while (len > 0) { + int offset, bytes_to_copy; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + + offset = pos % PAGE_SIZE; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + + page = get_arg_page(bprm, pos, 1); + if (!page) { + ret = -E2BIG; + goto out; + } + + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + kmapped_page = page; + kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_arg_page(bprm, kpos, kmapped_page); + } + if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { + ret = -EFAULT; + goto out; + } + } + } + ret = 0; +out: + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + return ret; +} + +/* + * Like copy_strings, but get argv and its values from kernel memory. 
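+ *
+ * For example, do_execve_common() below uses it to copy the executable's
+ * filename onto the new stack:
+ *
+ *	copy_strings_kernel(1, &bprm->filename, bprm);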
+ */ +int copy_strings_kernel(int argc, const char *const *__argv, + struct linux_binprm *bprm) +{ + int r; + mm_segment_t oldfs = get_fs(); + struct user_arg_ptr argv = { + .ptr.native = (const char __user *const __user *)__argv, + }; + + set_fs(KERNEL_DS); + r = copy_strings(argc, argv, bprm); + set_fs(oldfs); + + return r; +} +EXPORT_SYMBOL(copy_strings_kernel); + +#ifdef CONFIG_MMU + +/* + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != find_vma(mm, new_start)) + return -EFAULT; + + /* + * cover the whole range: [new_start, old_end) + */ + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length)) + return -ENOMEM; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, 0); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } + tlb_finish_mmu(&tlb, new_end, old_end); + + /* + * Shrink the vma to just the new range. Always succeeds. + */ + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + + return 0; +} + +/* + * Finalizes the stack vm_area_struct. The flags and permissions are updated, + * the stack is optionally relocated, and some extra space is added. + */ +int setup_arg_pages(struct linux_binprm *bprm, + unsigned long stack_top, + int executable_stack) +{ + unsigned long ret; + unsigned long stack_shift; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = bprm->vma; + struct vm_area_struct *prev = NULL; + unsigned long vm_flags; + unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; + +#ifdef CONFIG_STACK_GROWSUP + /* Limit stack size to 1GB */ + stack_base = rlimit_max(RLIMIT_STACK); + if (stack_base > (1 << 30)) + stack_base = 1 << 30; + + /* Make sure we didn't let the argument array grow too large. 
*/ + if (vma->vm_end - vma->vm_start > stack_base) + return -ENOMEM; + + stack_base = PAGE_ALIGN(stack_top - stack_base); + + stack_shift = vma->vm_start - stack_base; + mm->arg_start = bprm->p - stack_shift; + bprm->p = vma->vm_end - stack_shift; +#else + stack_top = arch_align_stack(stack_top); + stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + + stack_shift = vma->vm_end - stack_top; + + bprm->p -= stack_shift; + mm->arg_start = bprm->p; +#endif + + if (bprm->loader) + bprm->loader -= stack_shift; + bprm->exec -= stack_shift; + + down_write(&mm->mmap_sem); + vm_flags = VM_STACK_FLAGS; + + /* + * Adjust stack execute permissions; explicitly enable for + * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone + * (arch default) otherwise. + */ + if (unlikely(executable_stack == EXSTACK_ENABLE_X)) + vm_flags |= VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + vm_flags &= ~VM_EXEC; + vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; + + ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + vm_flags); + if (ret) + goto out_unlock; + BUG_ON(prev != vma); + + /* Move stack pages down in memory. */ + if (stack_shift) { + ret = shift_arg_pages(vma, stack_shift); + if (ret) + goto out_unlock; + } + + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; +#ifdef CONFIG_STACK_GROWSUP + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; +#else + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; +#endif + current->mm->start_stack = bprm->p; + ret = expand_stack(vma, stack_base); + if (ret) + ret = -EFAULT; + +out_unlock: + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(setup_arg_pages); + +#endif /* CONFIG_MMU */ + +struct file *open_exec(const char *name) +{ + struct file *file; + int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; + + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); + if (IS_ERR(file)) + goto out; + + err = -EACCES; + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + goto exit; + + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + fsnotify_open(file); + + err = deny_write_access(file); + if (err) + goto exit; + +out: + return file; + +exit: + fput(file); + return ERR_PTR(err); +} +EXPORT_SYMBOL(open_exec); + +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) +{ + mm_segment_t old_fs; + loff_t pos = offset; + int result; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + result = vfs_read(file, (void __user *)addr, count, &pos); + set_fs(old_fs); + return result; +} + +EXPORT_SYMBOL(kernel_read); + +static int exec_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk; + struct mm_struct * old_mm, *active_mm; + + /* Notify parent that we're no longer interested in the old VM 
*/ + tsk = current; + old_mm = current->mm; + sync_mm_rss(tsk, old_mm); + mm_release(tsk, old_mm); + + if (old_mm) { + /* + * Make sure that if there is a core dump in progress + * for the old mm, we get out and die instead of going + * through with the exec. We must hold mmap_sem around + * checking core_state and changing tsk->mm. + */ + down_read(&old_mm->mmap_sem); + if (unlikely(old_mm->core_state)) { + up_read(&old_mm->mmap_sem); + return -EINTR; + } + } + task_lock(tsk); + active_mm = tsk->active_mm; + tsk->mm = mm; + tsk->active_mm = mm; + activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } + task_unlock(tsk); + arch_pick_mmap_layout(mm); + if (old_mm) { + up_read(&old_mm->mmap_sem); + BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); + mmput(old_mm); + return 0; + } + mmdrop(active_mm); + return 0; +} + +/* + * This function makes sure the current process has its own signal table, + * so that flush_signal_handlers can later reset the handlers without + * disturbing other processes. (Other processes might share the signal + * table via the CLONE_SIGHAND option to clone().) + */ +static int de_thread(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *oldsighand = tsk->sighand; + spinlock_t *lock = &oldsighand->siglock; + + if (thread_group_empty(tsk)) + goto no_thread_group; + + /* + * Kill all other threads in the thread group. + */ + spin_lock_irq(lock); + if (signal_group_exit(sig)) { + /* + * Another group action in progress, just + * return so that the signal is processed. + */ + spin_unlock_irq(lock); + return -EAGAIN; + } + + sig->group_exit_task = tsk; + sig->notify_count = zap_other_threads(tsk); + if (!thread_group_leader(tsk)) + sig->notify_count--; + + while (sig->notify_count) { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(lock); + schedule(); + spin_lock_irq(lock); + } + spin_unlock_irq(lock); + + /* + * At this point all other threads have exited, all we have to + * do is to wait for the thread group leader to become inactive, + * and to assume its PID: + */ + if (!thread_group_leader(tsk)) { + struct task_struct *leader = tsk->group_leader; + + sig->notify_count = -1; /* for exit_notify() */ + for (;;) { + write_lock_irq(&tasklist_lock); + if (likely(leader->exit_state)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + write_unlock_irq(&tasklist_lock); + schedule(); + } + + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + tsk->start_time = leader->start_time; + + BUG_ON(!same_thread_group(leader, tsk)); + BUG_ON(has_group_leader_pid(tsk)); + /* + * An exec() starts a new thread group with the + * TGID of the previous thread group. Rehash the + * two threads with a switched PID, and release + * the former thread group leader: + */ + + /* Become a process group leader with the old leader's pid. + * The old leader becomes a thread of the this thread group. + * Note: The old leader also uses this pid until release_task + * is called. 
Odd but simple and correct. + */ + detach_pid(tsk, PIDTYPE_PID); + tsk->pid = leader->pid; + attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); + transfer_pid(leader, tsk, PIDTYPE_PGID); + transfer_pid(leader, tsk, PIDTYPE_SID); + + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); + + tsk->group_leader = tsk; + leader->group_leader = tsk; + + tsk->exit_signal = SIGCHLD; + + BUG_ON(leader->exit_state != EXIT_ZOMBIE); + leader->exit_state = EXIT_DEAD; + write_unlock_irq(&tasklist_lock); + + release_task(leader); + } + + sig->group_exit_task = NULL; + sig->notify_count = 0; + +no_thread_group: + if (current->mm) + setmax_mm_hiwater_rss(&sig->maxrss, current->mm); + + exit_itimers(sig); + flush_itimer_signals(); + + if (atomic_read(&oldsighand->count) != 1) { + struct sighand_struct *newsighand; + /* + * This ->sighand is shared with the CLONE_SIGHAND + * but not CLONE_THREAD task, switch to the new one. + */ + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + if (!newsighand) + return -ENOMEM; + + atomic_set(&newsighand->count, 1); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); + + write_lock_irq(&tasklist_lock); + spin_lock(&oldsighand->siglock); + rcu_assign_pointer(tsk->sighand, newsighand); + spin_unlock(&oldsighand->siglock); + write_unlock_irq(&tasklist_lock); + + __cleanup_sighand(oldsighand); + } + + BUG_ON(!thread_group_leader(tsk)); + return 0; +} + +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ +static void flush_old_files(struct files_struct * files) +{ + long j = -1; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + for (;;) { + unsigned long set, i; + + j++; + i = j * __NFDBITS; + fdt = files_fdtable(files); + if (i >= fdt->max_fds) + break; + set = fdt->close_on_exec->fds_bits[j]; + if (!set) + continue; + fdt->close_on_exec->fds_bits[j] = 0; + spin_unlock(&files->file_lock); + for ( ; set ; i++,set >>= 1) { + if (set & 1) { + sys_close(i); + } + } + spin_lock(&files->file_lock); + + } + spin_unlock(&files->file_lock); +} + +char *get_task_comm(char *buf, struct task_struct *tsk) +{ + /* buf must be at least sizeof(tsk->comm) in size */ + task_lock(tsk); + strncpy(buf, tsk->comm, sizeof(tsk->comm)); + task_unlock(tsk); + return buf; +} +EXPORT_SYMBOL_GPL(get_task_comm); + +void set_task_comm(struct task_struct *tsk, char *buf) +{ + task_lock(tsk); + + /* + * Threads may access current->comm without holding + * the task lock, so write the string carefully. + * Readers without a lock may see incomplete new + * names but are safe from non-terminating string reads. + */ + memset(tsk->comm, 0, TASK_COMM_LEN); + wmb(); + strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + task_unlock(tsk); + perf_event_comm(tsk); +} + +int flush_old_exec(struct linux_binprm * bprm) +{ + int retval; + + /* + * Make sure we have a private signal table and that + * we are unassociated from the previous thread group. 
+ */
+	retval = de_thread(current);
+	if (retval)
+		goto out;
+
+	set_mm_exe_file(bprm->mm, bprm->file);
+
+	/*
+	 * Release all of the old mmap stuff
+	 */
+	acct_arg_size(bprm, 0);
+	retval = exec_mmap(bprm->mm);
+	if (retval)
+		goto out;
+
+	bprm->mm = NULL;		/* We're using it now */
+
+	set_fs(USER_DS);
+	current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+	flush_thread();
+	current->personality &= ~bprm->per_clear;
+
+	return 0;
+
+out:
+	return retval;
+}
+EXPORT_SYMBOL(flush_old_exec);
+
+void setup_new_exec(struct linux_binprm * bprm)
+{
+	int i, ch;
+	const char *name;
+	char tcomm[sizeof(current->comm)];
+
+	arch_pick_mmap_layout(current->mm);
+
+	/* This is the point of no return */
+	current->sas_ss_sp = current->sas_ss_size = 0;
+
+	if (current_euid() == current_uid() && current_egid() == current_gid())
+		set_dumpable(current->mm, 1);
+	else
+		set_dumpable(current->mm, suid_dumpable);
+
+	name = bprm->filename;
+
+	/* Copies the binary name from after last slash */
+	for (i=0; (ch = *(name++)) != '\0';) {
+		if (ch == '/')
+			i = 0; /* overwrite what we wrote */
+		else
+			if (i < (sizeof(tcomm) - 1))
+				tcomm[i++] = ch;
+	}
+	tcomm[i] = '\0';
+	set_task_comm(current, tcomm);
+
+	/* Set the new mm task size. We have to do that late because it may
+	 * depend on TIF_32BIT which is only updated in flush_thread() on
+	 * some architectures like powerpc
+	 */
+	current->mm->task_size = TASK_SIZE;
+
+	/* install the new credentials */
+	if (bprm->cred->uid != current_euid() ||
+	    bprm->cred->gid != current_egid()) {
+		current->pdeath_signal = 0;
+	} else if (file_permission(bprm->file, MAY_READ) ||
+		   bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
+		set_dumpable(current->mm, suid_dumpable);
+	}
+
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_event_exit_task(current);
+
+	/* An exec changes our domain. We are no longer part of the thread
+	   group */
+
+	current->self_exec_id++;
+
+	flush_signal_handlers(current, 0);
+	flush_old_files(current->files);
+}
+EXPORT_SYMBOL(setup_new_exec);
+
+/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	bprm->cred = prepare_exec_creds();
+	if (likely(bprm->cred))
+		return 0;
+
+	mutex_unlock(&current->signal->cred_guard_mutex);
+	return -ENOMEM;
+}
+
+void free_bprm(struct linux_binprm *bprm)
+{
+	free_arg_pages(bprm);
+	if (bprm->cred) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		abort_creds(bprm->cred);
+	}
+	kfree(bprm);
+}
+
+/*
+ * install the new credentials for this executable
+ */
+void install_exec_creds(struct linux_binprm *bprm)
+{
+	security_bprm_committing_creds(bprm);
+
+	commit_creds(bprm->cred);
+	bprm->cred = NULL;
+	/*
+	 * cred_guard_mutex must be held at least to this point to prevent
+	 * ptrace_attach() from altering our determination of the task's
+	 * credentials; any time after this it may be unlocked.
+ */
+	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->cred_guard_mutex);
+}
+EXPORT_SYMBOL(install_exec_creds);
+
+/*
+ * determine how safe it is to execute the proposed program
+ * - the caller must hold ->cred_guard_mutex to protect against
+ *   PTRACE_ATTACH
+ */
+int check_unsafe_exec(struct linux_binprm *bprm)
+{
+	struct task_struct *p = current, *t;
+	unsigned n_fs;
+	int res = 0;
+
+	bprm->unsafe = tracehook_unsafe_exec(p);
+
+	n_fs = 1;
+	spin_lock(&p->fs->lock);
+	rcu_read_lock();
+	for (t = next_thread(p); t != p; t = next_thread(t)) {
+		if (t->fs == p->fs)
+			n_fs++;
+	}
+	rcu_read_unlock();
+
+	if (p->fs->users > n_fs) {
+		bprm->unsafe |= LSM_UNSAFE_SHARE;
+	} else {
+		res = -EAGAIN;
+		if (!p->fs->in_exec) {
+			p->fs->in_exec = 1;
+			res = 1;
+		}
+	}
+	spin_unlock(&p->fs->lock);
+
+	return res;
+}
+
+/*
+ * Fill the binprm structure from the inode.
+ * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ *
+ * This may be called multiple times for binary chains (scripts for example).
+ */
+int prepare_binprm(struct linux_binprm *bprm)
+{
+	umode_t mode;
+	struct inode * inode = bprm->file->f_path.dentry->d_inode;
+	int retval;
+
+	mode = inode->i_mode;
+	if (bprm->file->f_op == NULL)
+		return -EACCES;
+
+	/* clear any previous set[ug]id data from a previous binary */
+	bprm->cred->euid = current_euid();
+	bprm->cred->egid = current_egid();
+
+	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
+		/* Set-uid? */
+		if (mode & S_ISUID) {
+			bprm->per_clear |= PER_CLEAR_ON_SETID;
+			bprm->cred->euid = inode->i_uid;
+		}
+
+		/* Set-gid? */
+		/*
+		 * If setgid is set but no group execute bit then this
+		 * is a candidate for mandatory locking, not a setgid
+		 * executable.
+		 */
+		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+			bprm->per_clear |= PER_CLEAR_ON_SETID;
+			bprm->cred->egid = inode->i_gid;
+		}
+	}
+
+	/* fill in binprm security blob */
+	retval = security_bprm_set_creds(bprm);
+	if (retval)
+		return retval;
+	bprm->cred_prepared = 1;
+
+	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
+	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
+}
+
+EXPORT_SYMBOL(prepare_binprm);
+
+/*
+ * Arguments are '\0' separated strings found at the location bprm->p
+ * points to; chop off the first by relocating bprm->p to right after
+ * the first '\0' encountered.
+ */ +int remove_arg_zero(struct linux_binprm *bprm) +{ + int ret = 0; + unsigned long offset; + char *kaddr; + struct page *page; + + if (!bprm->argc) + return 0; + + do { + offset = bprm->p & ~PAGE_MASK; + page = get_arg_page(bprm, bprm->p, 0); + if (!page) { + ret = -EFAULT; + goto out; + } + kaddr = kmap_atomic(page, KM_USER0); + + for (; offset < PAGE_SIZE && kaddr[offset]; + offset++, bprm->p++) + ; + + kunmap_atomic(kaddr, KM_USER0); + put_arg_page(page); + + if (offset == PAGE_SIZE) + free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); + } while (offset == PAGE_SIZE); + + bprm->p++; + bprm->argc--; + ret = 0; + +out: + return ret; +} +EXPORT_SYMBOL(remove_arg_zero); + +/* + * cycle the list of binary formats handler, until one recognizes the image + */ +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) +{ + unsigned int depth = bprm->recursion_depth; + int try,retval; + struct linux_binfmt *fmt; + + retval = security_bprm_check(bprm); + if (retval) + return retval; + + retval = audit_bprm(bprm); + if (retval) + return retval; + + retval = -ENOENT; + for (try=0; try<2; try++) { + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; + if (!fn) + continue; + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + retval = fn(bprm, regs); + /* + * Restore the depth counter to its starting value + * in this call, so we don't have to rely on every + * load_binary function to restore it on return. + */ + bprm->recursion_depth = depth; + if (retval >= 0) { + if (depth == 0) + tracehook_report_exec(fmt, bprm, regs); + put_binfmt(fmt); + allow_write_access(bprm->file); + if (bprm->file) + fput(bprm->file); + bprm->file = NULL; + current->did_exec = 1; + proc_exec_connector(current); + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC || bprm->mm == NULL) + break; + if (!bprm->file) { + read_unlock(&binfmt_lock); + return retval; + } + } + read_unlock(&binfmt_lock); + if (retval != -ENOEXEC || bprm->mm == NULL) { + break; +#ifdef CONFIG_MODULES + } else { +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) + if (printable(bprm->buf[0]) && + printable(bprm->buf[1]) && + printable(bprm->buf[2]) && + printable(bprm->buf[3])) + break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ + request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); +#endif + } + } + return retval; +} + +EXPORT_SYMBOL(search_binary_handler); + +/* + * sys_execve() executes a new program. 
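+ *
+ * Reached from userspace via execve(2); a minimal sketch of a caller
+ * (illustrative only, the path and strings are placeholders):
+ *
+ *	char *argv[] = { "/bin/ls", "-l", NULL };
+ *	char *envp[] = { "HOME=/", NULL };
+ *	execve("/bin/ls", argv, envp);
+ *
+ * execve() returns to its caller only if it fails.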
+ */ +static int do_execve_common(const char *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + struct pt_regs *regs) +{ + struct linux_binprm *bprm; + struct file *file; + struct files_struct *displaced; + bool clear_in_exec; + int retval; + + retval = unshare_files(&displaced); + if (retval) + goto out_ret; + + retval = -ENOMEM; + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + goto out_files; + + retval = prepare_bprm_creds(bprm); + if (retval) + goto out_free; + + retval = check_unsafe_exec(bprm); + if (retval < 0) + goto out_free; + clear_in_exec = retval; + current->in_execve = 1; + + file = open_exec(filename); + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_unmark; + + sched_exec(); + + bprm->file = file; + bprm->filename = filename; + bprm->interp = filename; + + retval = bprm_mm_init(bprm); + if (retval) + goto out_file; + + bprm->argc = count(argv, MAX_ARG_STRINGS); + if ((retval = bprm->argc) < 0) + goto out; + + bprm->envc = count(envp, MAX_ARG_STRINGS); + if ((retval = bprm->envc) < 0) + goto out; + + retval = prepare_binprm(bprm); + if (retval < 0) + goto out; + + retval = copy_strings_kernel(1, &bprm->filename, bprm); + if (retval < 0) + goto out; + + bprm->exec = bprm->p; + retval = copy_strings(bprm->envc, envp, bprm); + if (retval < 0) + goto out; + + retval = copy_strings(bprm->argc, argv, bprm); + if (retval < 0) + goto out; + + retval = search_binary_handler(bprm,regs); + if (retval < 0) + goto out; + + /* execve succeeded */ + current->fs->in_exec = 0; + current->in_execve = 0; + acct_update_integrals(current); + free_bprm(bprm); + if (displaced) + put_files_struct(displaced); + return retval; + +out: + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } + +out_file: + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } + +out_unmark: + if (clear_in_exec) + current->fs->in_exec = 0; + current->in_execve = 0; + +out_free: + free_bprm(bprm); + +out_files: + if (displaced) + reset_files_struct(displaced); +out_ret: + return retval; +} + +int do_execve(const char *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execve_common(filename, argv, envp, regs); +} + +#ifdef CONFIG_COMPAT +int compat_do_execve(char *filename, + compat_uptr_t __user *__argv, + compat_uptr_t __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execve_common(filename, argv, envp, regs); +} +#endif + +void set_binfmt(struct linux_binfmt *new) +{ + struct mm_struct *mm = current->mm; + + if (mm->binfmt) + module_put(mm->binfmt->module); + + mm->binfmt = new; + if (new) + __module_get(new->module); +} + +EXPORT_SYMBOL(set_binfmt); + +static int expand_corename(struct core_name *cn) +{ + char *old_corename = cn->corename; + + cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); + cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + + if (!cn->corename) { + kfree(old_corename); + return -ENOMEM; + } + + return 0; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) 
+{ + char *cur; + int need; + int ret; + va_list arg; + + va_start(arg, fmt); + need = vsnprintf(NULL, 0, fmt, arg); + va_end(arg); + + if (likely(need < cn->size - cn->used - 1)) + goto out_printf; + + ret = expand_corename(cn); + if (ret) + goto expand_fail; + +out_printf: + cur = cn->corename + cn->used; + va_start(arg, fmt); + vsnprintf(cur, need + 1, fmt, arg); + va_end(arg); + cn->used += need; + return 0; + +expand_fail: + return ret; +} + +static int cn_print_exe_file(struct core_name *cn) +{ + struct file *exe_file; + char *pathbuf, *path, *p; + int ret; + + exe_file = get_mm_exe_file(current->mm); + if (!exe_file) + return cn_printf(cn, "(unknown)"); + + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + if (!pathbuf) { + ret = -ENOMEM; + goto put_exe_file; + } + + path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto free_buf; + } + + for (p = path; *p; p++) + if (*p == '/') + *p = '!'; + + ret = cn_printf(cn, "%s", path); + +free_buf: + kfree(pathbuf); +put_exe_file: + fput(exe_file); + return ret; +} + +/* format_corename will inspect the pattern parameter, and output a + * name into corename, which must have space for at least + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. + */ +static int format_corename(struct core_name *cn, long signr) +{ + const struct cred *cred = current_cred(); + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); + int pid_in_pattern = 0; + int err = 0; + + cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); + cn->corename = kmalloc(cn->size, GFP_KERNEL); + cn->used = 0; + + if (!cn->corename) + return -ENOMEM; + + /* Repeat as long as we have more pattern to process and more output + space */ + while (*pat_ptr) { + if (*pat_ptr != '%') { + if (*pat_ptr == 0) + goto out; + err = cn_printf(cn, "%c", *pat_ptr++); + } else { + switch (*++pat_ptr) { + /* single % at the end, drop that */ + case 0: + goto out; + /* Double percent, output one percent */ + case '%': + err = cn_printf(cn, "%c", '%'); + break; + /* pid */ + case 'p': + pid_in_pattern = 1; + err = cn_printf(cn, "%d", + task_tgid_vnr(current)); + break; + /* uid */ + case 'u': + err = cn_printf(cn, "%d", cred->uid); + break; + /* gid */ + case 'g': + err = cn_printf(cn, "%d", cred->gid); + break; + /* signal that caused the coredump */ + case 's': + err = cn_printf(cn, "%ld", signr); + break; + /* UNIX time of coredump */ + case 't': { + struct timeval tv; + do_gettimeofday(&tv); + err = cn_printf(cn, "%lu", tv.tv_sec); + break; + } + /* hostname */ + case 'h': + down_read(&uts_sem); + err = cn_printf(cn, "%s", + utsname()->nodename); + up_read(&uts_sem); + break; + /* executable */ + case 'e': + err = cn_printf(cn, "%s", current->comm); + break; + case 'E': + err = cn_print_exe_file(cn); + break; + /* core limit size */ + case 'c': + err = cn_printf(cn, "%lu", + rlimit(RLIMIT_CORE)); + break; + default: + break; + } + ++pat_ptr; + } + + if (err) + return err; + } + + /* Backward compatibility with core_uses_pid: + * + * If core_pattern does not include a %p (as is the default) + * and core_uses_pid is set, then .%pid will be appended to + * the filename. Do not do this for piped commands. 
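+	 *
+	 * For example, a pattern such as "core.%e.%p.%t" (typically written
+	 * to /proc/sys/kernel/core_pattern) is expanded by the code above to
+	 * something like "core.myprog.1234.1400000000", while a leading '|'
+	 * as in "|/usr/bin/coredump-helper %p" marks a piped command instead.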
*/ + if (!ispipe && !pid_in_pattern && core_uses_pid) { + err = cn_printf(cn, ".%d", task_tgid_vnr(current)); + if (err) + return err; + } +out: + return ispipe; +} + +static int zap_process(struct task_struct *start, int exit_code) +{ + struct task_struct *t; + int nr = 0; + + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_exit_code = exit_code; + start->signal->group_stop_count = 0; + + t = start; + do { + task_clear_group_stop_pending(t); + if (t != current && t->mm) { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + nr++; + } + } while_each_thread(start, t); + + return nr; +} + +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int nr = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; + nr = zap_process(tsk, exit_code); + } + spin_unlock_irq(&tsk->sighand->siglock); + if (unlikely(nr < 0)) + return nr; + + if (atomic_read(&mm->mm_users) == nr + 1) + goto done; + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into ->nr_threads. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). 
+ */ + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + if (g->flags & PF_KTHREAD) + continue; + p = g; + do { + if (p->mm) { + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code); + unlock_task_sighand(p, &flags); + } + break; + } + } while_each_thread(g, p); + } + rcu_read_unlock(); +done: + atomic_set(&core_state->nr_threads, nr); + return nr; +} + +static int coredump_wait(int exit_code, struct core_state *core_state) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion *vfork_done; + int core_waiters = -EBUSY; + + init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; + + down_write(&mm->mmap_sem); + if (!mm->core_state) + core_waiters = zap_threads(tsk, mm, core_state, exit_code); + up_write(&mm->mmap_sem); + + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + + if (core_waiters) + wait_for_completion(&core_state->startup); +fail: + return core_waiters; +} + +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + +/* + * set_dumpable converts traditional three-value dumpable to two flags and + * stores them into mm->flags. It modifies lower two bits of mm->flags, but + * these bits are not changed atomically. So get_dumpable can observe the + * intermediate state. To avoid doing unexpected behavior, get get_dumpable + * return either old dumpable or new one by paying attention to the order of + * modifying the bits. + * + * dumpable | mm->flags (binary) + * old new | initial interim final + * ---------+----------------------- + * 0 1 | 00 01 01 + * 0 2 | 00 10(*) 11 + * 1 0 | 01 00 00 + * 1 2 | 01 11 11 + * 2 0 | 11 10(*) 00 + * 2 1 | 11 11 01 + * + * (*) get_dumpable regards interim value of 10 as 11. + */ +void set_dumpable(struct mm_struct *mm, int value) +{ + switch (value) { + case 0: + clear_bit(MMF_DUMPABLE, &mm->flags); + smp_wmb(); + clear_bit(MMF_DUMP_SECURELY, &mm->flags); + break; + case 1: + set_bit(MMF_DUMPABLE, &mm->flags); + smp_wmb(); + clear_bit(MMF_DUMP_SECURELY, &mm->flags); + break; + case 2: + set_bit(MMF_DUMP_SECURELY, &mm->flags); + smp_wmb(); + set_bit(MMF_DUMPABLE, &mm->flags); + break; + } +} + +static int __get_dumpable(unsigned long mm_flags) +{ + int ret; + + ret = mm_flags & MMF_DUMPABLE_MASK; + return (ret >= 2) ? 
2 : ret; +} + +int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe; + + pipe = file->f_path.dentry->d_inode->i_pipe; + + pipe_lock(pipe); + pipe->readers++; + pipe->writers--; + + while ((pipe->readers > 1) && (!signal_pending(current))) { + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_wait(pipe); + } + + pipe->readers--; + pipe->writers++; + pipe_unlock(pipe); + +} + + +/* + * umh_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct file *rp, *wp; + struct fdtable *fdt; + struct coredump_params *cp = (struct coredump_params *)info->data; + struct files_struct *cf = current->files; + + wp = create_write_pipe(0); + if (IS_ERR(wp)) + return PTR_ERR(wp); + + rp = create_read_pipe(wp, 0); + if (IS_ERR(rp)) { + free_write_pipe(wp); + return PTR_ERR(rp); + } + + cp->file = wp; + + sys_close(0); + fd_install(0, rp); + spin_lock(&cf->file_lock); + fdt = files_fdtable(cf); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&cf->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return 0; +} + +void do_coredump(long signr, int exit_code, struct pt_regs *regs) +{ + struct core_state core_state; + struct core_name cn; + struct mm_struct *mm = current->mm; + struct linux_binfmt * binfmt; + const struct cred *old_cred; + struct cred *cred; + int retval = 0; + int flag = 0; + int ispipe; + static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, + }; + + audit_core_dumps(signr); + + binfmt = mm->binfmt; + if (!binfmt || !binfmt->core_dump) + goto fail; + if (!__get_dumpable(cprm.mm_flags)) + goto fail; + + cred = prepare_creds(); + if (!cred) + goto fail; + /* + * We cannot trust fsuid as being the "true" uid of the + * process nor do we know its entire history. We only know it + * was tainted so we dump it as root in mode 2. + */ + if (__get_dumpable(cprm.mm_flags) == 2) { + /* Setuid core dump mode */ + flag = O_EXCL; /* Stop rewrite attacks */ + cred->fsuid = 0; /* Dump root private */ + } + + retval = coredump_wait(exit_code, &core_state); + if (retval < 0) + goto fail_creds; + + old_cred = override_creds(cred); + + /* + * Clear any false indication of pending signals that might + * be seen by the filesystem code called to write the core file. 
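The transition table above boils down to a two-bit encoding that __get_dumpable() reads back. A stand-alone sketch of that encoding, assuming MMF_DUMPABLE is bit 0 and MMF_DUMP_SECURELY is bit 1 (the layout this kernel's mm flags use):

/* Stand-alone sketch of the set_dumpable()/__get_dumpable() encoding. */
#include <stdio.h>

#define MMF_DUMPABLE_MASK 3   /* assumed: bit 0 = MMF_DUMPABLE, bit 1 = MMF_DUMP_SECURELY */

static int sketch_get_dumpable(unsigned long flags)
{
    int ret = flags & MMF_DUMPABLE_MASK;

    return (ret >= 2) ? 2 : ret;   /* the interim value 10 is read as "suid-safe" (2) */
}

int main(void)
{
    /* value 0 -> bits 00, value 1 -> bits 01, value 2 -> bits 11 */
    printf("%d %d %d %d\n",
           sketch_get_dumpable(0),   /* 0 */
           sketch_get_dumpable(1),   /* 1 */
           sketch_get_dumpable(2),   /* 2: interim 10 state */
           sketch_get_dumpable(3));  /* 2 */
    return 0;
}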
+ */
+ clear_thread_flag(TIF_SIGPENDING);
+
+ ispipe = format_corename(&cn, signr);
+
+ if (ispipe == -ENOMEM) {
+ printk(KERN_WARNING "format_corename failed\n");
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_corename;
+ }
+
+ if (ispipe) {
+ int dump_count;
+ char **helper_argv;
+
+ if (cprm.limit == 1) {
+ /*
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * cprm.limit of 1 here as a special value. Any
+ * non-1 limit gets set to RLIM_INFINITY below, but
+ * a limit of 0 skips the dump. This is a consistent
+ * way to catch recursive crashes. We can still crash
+ * if the core_pattern binary sets RLIM_CORE = !1
+ * but it runs as root, and can do lots of stupid things.
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ printk(KERN_WARNING
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+ cprm.limit = RLIM_INFINITY;
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_dropcount;
+ }
+
+ helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+ if (!helper_argv) {
+ printk(KERN_WARNING "%s failed to allocate memory\n",
+ __func__);
+ goto fail_dropcount;
+ }
+
+ retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+ NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+ NULL, &cprm);
+ argv_free(helper_argv);
+ if (retval) {
+ printk(KERN_INFO "Core dump to %s pipe failed\n",
+ cn.corename);
+ goto close_fail;
+ }
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
+ cprm.file = filp_open(cn.corename,
+ O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ 0600);
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
+
+ inode = cprm.file->f_path.dentry->d_inode;
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+ * AK: actually I see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+ * Don't allow local users to get cute and trick others into
+ * dumping core into their pre-created files.
+ */
+ if (inode->i_uid != current_fsuid())
+ goto close_fail;
+ if (!cprm.file->f_op || !cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+
+ retval = binfmt->core_dump(&cprm);
+ if (retval)
+ current->signal->group_exit_code |= 0x80;
+
+ if (ispipe && core_pipe_limit)
+ wait_for_dump_helpers(cprm.file);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
+fail_dropcount:
+ if (ispipe)
+ atomic_dec(&core_dump_count);
+fail_unlock:
+ kfree(cn.corename);
+fail_corename:
+ coredump_finish(mm);
+ revert_creds(old_cred);
+fail_creds:
+ put_cred(cred);
+fail:
+ return;
+}
+
+/*
+ * Core dumping helper functions. These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
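Since umh_pipe_setup() above installs the read end of the pipe as fd 0 of the usermode helper, a core_pattern such as "|/usr/local/bin/corecatch %p %e" hands the helper the core image on stdin and the expanded specifiers as arguments. A hypothetical helper (path and naming scheme invented for this example) could look like:

/* Hypothetical user-space core_pattern pipe helper: argv[1] = %p, argv[2] = %e. */
#include <stdio.h>

int main(int argc, char **argv)
{
    char path[256], buf[8192];
    size_t n;
    FILE *out;

    if (argc < 3)
        return 1;
    snprintf(path, sizeof(path), "/var/cores/core.%s.%s", argv[2], argv[1]);
    out = fopen(path, "w");
    if (!out)
        return 1;
    while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)   /* the core arrives on stdin */
        fwrite(buf, 1, n, out);
    fclose(out);
    return 0;
}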
+ */ +int dump_write(struct file *file, const void *addr, int nr) +{ + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ + int ret = 1; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) { + ret = 0; + break; + } + off -= n; + } + free_page((unsigned long)buf); + } + return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS new file mode 100644 index 00000000..1b2d4c63 --- /dev/null +++ b/fs/exofs/BUGS @@ -0,0 +1,3 @@ +- Out-of-space may cause a severe problem if the object (and directory entry) + were written, but the inode attributes failed. Then if the filesystem was + unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild new file mode 100644 index 00000000..2d0f757f --- /dev/null +++ b/fs/exofs/Kbuild @@ -0,0 +1,16 @@ +# +# Kbuild for the EXOFS module +# +# Copyright (C) 2008 Panasas Inc. All rights reserved. +# +# Authors: +# Boaz Harrosh +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 +# +# Kbuild - Gets included from the Kernels Makefile and build system +# + +exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o +obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig new file mode 100644 index 00000000..86194b2f --- /dev/null +++ b/fs/exofs/Kconfig @@ -0,0 +1,13 @@ +config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. + +# Debugging-related stuff +config EXOFS_DEBUG + bool "Enable debugging" + depends on EXOFS_FS + help + This option enables EXOFS debug prints. diff --git a/fs/exofs/common.h b/fs/exofs/common.h new file mode 100644 index 00000000..3bbd4695 --- /dev/null +++ b/fs/exofs/common.h @@ -0,0 +1,262 @@ +/* + * common.h - Common definitions for both Kernel and user-mode utilities + * + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include
+
+#include
+#include
+#include
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
+#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
+#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
+#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+/* Inode attrs */
+# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA 1
+# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
+# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
+/* Partition attrs */
+# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
+# define EXOFS_ATTR_SB_STATS 1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number. This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+ EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+ (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+ EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT 12
+#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC 0x5DF5
+
+/*
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
+ * This is where the in-memory superblock is stored on disk.
+ */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
+struct exofs_fscb {
+ __le64 s_nextid; /* Only used after mkfs */
+ __le64 s_numfiles; /* Only used after mkfs */
+ __le32 s_version; /* == EXOFS_FSCB_VER */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_newfs; /* Non-zero if this is a new fs */
+
+ /* From here on it's a static part, only written by mkexofs */
+ __le64 s_dev_table_oid; /* Reserved, not used */
+ __le64 s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+
+/*
+ * This struct is set on the FS partition's attributes.
+ * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
+ * with the create command, to atomically persist the sb writeable information.
+ */
+struct exofs_sb_stats {
+ __le64 s_nextid; /* Highest object ID used */
+ __le64 s_numfiles; /* Number of files on fs */
+} __packed;
+
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This here is taken from the pNFS-objects definition. In exofs we
+ * use one raid policy throughout the filesystem. (NOTE: the funny
+ * alignment at beginning. We take care of it at exofs_device_table.)
+ */
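Putting the NOTE above in concrete terms: the root directory lives in object EXOFS_ROOT_ID (0x10002), so its inode number is 0x10002 - EXOFS_OBJ_OFF = 2, and exofs_oi_objno() in exofs.h performs the reverse mapping. A tiny stand-alone sketch of the arithmetic:

/* Sketch of the inode <-> object-ID mapping implied by the defines above. */
#include <stdio.h>

#define EXOFS_OBJ_OFF 0x10000ULL
#define EXOFS_ROOT_ID 0x10002ULL

int main(void)
{
    unsigned long long ino = EXOFS_ROOT_ID - EXOFS_OBJ_OFF;  /* root inode number */
    unsigned long long oid = ino + EXOFS_OBJ_OFF;            /* back to the OSD object ID */

    printf("root: ino=%llu oid=0x%llx\n", ino, oid);          /* ino=2 oid=0x10002 */
    return 0;
}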
+struct exofs_dt_data_map {
+ __le32 cb_num_comps;
+ __le64 cb_stripe_unit;
+ __le32 cb_group_width;
+ __le32 cb_group_depth;
+ __le32 cb_mirror_cnt;
+ __le32 cb_raid_algorithm;
+} __packed;
+
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+ __le32 systemid_len;
+ u8 systemid[OSD_SYSTEMID_LEN];
+ __le64 long_name_offset; /* If !0 then offset-in-file */
+ __le32 osdname_len; /* */
+ u8 osdname[44]; /* Embedded, usually an ASCII uuid */
+} __packed;
+
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multi-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+ __le32 dt_version; /* == EXOFS_DT_VER */
+ struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
+
+ /* Reserved space for future use. Total including this:
+ * (8 * sizeof(le64))
+ */
+ __le64 __Resurved[4];
+
+ __le64 dt_num_devices; /* Array size */
+ struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
+} __packed;
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA 5
+
+/*
+ * The file control block - stored in an object's attributes. This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+ __le64 i_size; /* Size of the file */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+ __le32 i_uid; /* Owner Uid */
+ __le32 i_gid; /* Group Id */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_flags; /* File flags (unused for now)*/
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+ EXOFS_APAGE_FS_DATA,
+ EXOFS_ATTR_INODE_DATA,
+ EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN 255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+ __le64 inode_no; /* inode number */
+ __le16 rec_len; /* directory entry length */
+ u8 name_len; /* name length */
+ u8 file_type; /* umm...file type */
+ char name[EXOFS_NAME_LEN]; /* file name */
+};
+
+enum {
+ EXOFS_FT_UNKNOWN,
+ EXOFS_FT_REG_FILE,
+ EXOFS_FT_DIR,
+ EXOFS_FT_CHRDEV,
+ EXOFS_FT_BLKDEV,
+ EXOFS_FT_FIFO,
+ EXOFS_FT_SOCK,
+ EXOFS_FT_SYMLINK,
+ EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD 4
+#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+ (((name_len) + offsetof(struct exofs_dir_entry, name) + \
+ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*
+ * The on-disk (optional) layout structure.
+ * It sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
+ * attribute, attached to any inode, usually to a directory.
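As a worked example of EXOFS_DIR_REC_LEN() above: the fixed part of exofs_dir_entry before name[] is 8 + 2 + 1 + 1 = 12 bytes, so a 5-character name needs 17 bytes, padded up to the next multiple of 4, i.e. 20. A stand-alone sketch with the header size hard-coded under that assumption:

/* Worked example of the EXOFS_DIR_REC_LEN() rounding. */
#include <stdio.h>

#define HDR 12   /* assumed: offsetof(struct exofs_dir_entry, name) */
#define ROUND 3  /* EXOFS_DIR_PAD - 1 */
#define REC_LEN(name_len) (((name_len) + HDR + ROUND) & ~ROUND)

int main(void)
{
    printf("%d %d %d\n", REC_LEN(1), REC_LEN(5), REC_LEN(255));  /* 16 20 268 */
    return 0;
}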
+ */ + +enum exofs_inode_layout_gen_functions { + LAYOUT_MOVING_WINDOW = 0, + LAYOUT_IMPLICT = 1, +}; + +struct exofs_on_disk_inode_layout { + __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ + __le16 pad; + union { + /* gen_func == LAYOUT_MOVING_WINDOW (default) */ + struct exofs_layout_sliding_window { + __le32 num_devices; /* first n devices in global-table*/ + } sliding_window __packed; + + /* gen_func == LAYOUT_IMPLICT */ + struct exofs_layout_implict_list { + struct exofs_dt_data_map data_map; + /* Variable array of size data_map.cb_num_comps. These + * are device indexes of the devices in the global table + */ + __le32 dev_indexes[]; + } implict __packed; + }; +} __packed; + +static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) +{ + return sizeof(struct exofs_on_disk_inode_layout) + + max_devs * sizeof(__le32); +} + +#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c new file mode 100644 index 00000000..d0941c6a --- /dev/null +++ b/fs/exofs/dir.c @@ -0,0 +1,673 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline unsigned exofs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void exofs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* Accesses dir's inode->i_size must be called under inode lock */ +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) +{ + loff_t last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + + if (!PageUptodate(page)) + SetPageUptodate(page); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + set_page_dirty(page); + + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + + return err; +} + +static void exofs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + unsigned chunk_size = exofs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct exofs_dir_entry *p; + char *error; + + /* if the page is the last one in the directory */ + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct exofs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < EXOFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + +Ebadsize: + EXOFS_ERR("ERROR [exofs_check_page]: " + "size of directory(0x%lx) is not a multiple of chunk size\n", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +bad_entry: + EXOFS_ERR( + "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " + "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + dir->i_ino, error, (page->index<inode_no)), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct exofs_dir_entry *)(kaddr + offs); + EXOFS_ERR("ERROR [exofs_check_page]: " + "entry in directory(0x%lx) spans the page boundary" + "offset=%lu, inode=0x%llx\n", + dir->i_ino, (page->index<inode_no))); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *exofs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if 
(!PageChecked(page)) + exofs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + exofs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline int exofs_match(int len, const unsigned char *name, + struct exofs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode_no) + return 0; + return !memcmp(name, de->name, len); +} + +static inline +struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) +{ + return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static inline unsigned +exofs_validate_entry(char *base, unsigned offset, unsigned mask) +{ + struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); + struct exofs_dir_entry *p = + (struct exofs_dir_entry *)(base + (offset&mask)); + while ((char *)p < (char *)de) { + if (p->rec_len == 0) + break; + p = exofs_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { + [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, + [EXOFS_FT_REG_FILE] = DT_REG, + [EXOFS_FT_DIR] = DT_DIR, + [EXOFS_FT_CHRDEV] = DT_CHR, + [EXOFS_FT_BLKDEV] = DT_BLK, + [EXOFS_FT_FIFO] = DT_FIFO, + [EXOFS_FT_SOCK] = DT_SOCK, + [EXOFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, +}; + +static inline +void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static int +exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = (filp->f_version != inode->i_version); + + if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) + return 0; + + types = exofs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct exofs_dir_entry *de; + struct page *page = exofs_get_page(inode, n); + + if (IS_ERR(page)) { + EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = exofs_validate_entry(kaddr, offset, + chunk_mask); + filp->f_pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct exofs_dir_entry *)(kaddr + offset); + limit = kaddr + exofs_last_byte(inode, n) - + EXOFS_DIR_REC_LEN(1); + for (; (char *)de <= limit; de = exofs_next_entry(de)) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: " + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + exofs_put_page(page); + return -EIO; + } + if (de->inode_no) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXOFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<inode_no), + d_type); + if (over) { + exofs_put_page(page); + return 0; + } + } + filp->f_pos += 
le16_to_cpu(de->rec_len); + } + exofs_put_page(page); + } + + return 0; +} + +struct exofs_dir_entry *exofs_find_entry(struct inode *dir, + struct dentry *dentry, struct page **res_page) +{ + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct exofs_i_info *oi = exofs_i(dir); + struct exofs_dir_entry *de; + + if (npages == 0) + goto out; + + *res_page = NULL; + + start = oi->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = exofs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct exofs_dir_entry *) kaddr; + kaddr += exofs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: zero-length entry in " + "directory(0x%lx)\n", + dir->i_ino); + exofs_put_page(page); + goto out; + } + if (exofs_match(namelen, name, de)) + goto found; + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + oi->i_dir_start_lookup = n; + return de; +} + +struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = exofs_get_page(dir, 0); + struct exofs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = exofs_next_entry( + (struct exofs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t exofs_parent_ino(struct dentry *child) +{ + struct page *page; + struct exofs_dir_entry *de; + ino_t ino; + + de = exofs_dotdot(child->d_inode, &page); + if (!de) + return 0; + + ino = le64_to_cpu(de->inode_no); + exofs_put_page(page); + return ino; +} + +ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct exofs_dir_entry *de; + struct page *page; + + de = exofs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode_no); + exofs_put_page(page); + } + return res; +} + +int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = le16_to_cpu(de->rec_len); + int err; + + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, len, + AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (err) + EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", + err); + + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + if (likely(!err)) + err = exofs_commit_chunk(page, pos, len); + exofs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + return err; +} + +int exofs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = exofs_chunk_size(dir); + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = exofs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + 
dir_end = kaddr + exofs_last_byte(dir, n); + de = (struct exofs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode_no = 0; + goto got_it; + } + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_add_link: " + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (exofs_match(namelen, name, de)) + goto out_unlock; + name_len = EXOFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode_no && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct exofs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + exofs_put_page(page); + } + + EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n", + dentry, inode->i_ino); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char *)de - (char *)page_address(page); + err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, + &page, NULL); + if (err) + goto out_unlock; + if (de->inode_no) { + struct exofs_dir_entry *de1 = + (struct exofs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + err = exofs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + sbi->s_numfiles++; + +out_put: + exofs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + loff_t pos; + struct exofs_dir_entry *pde = NULL; + struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); + int err; + + while (de < dir) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_delete_entry:" + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + err = -EIO; + goto out; + } + pde = de; + de = exofs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, + &page, NULL); + if (err) + EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n", + err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode_no = 0; + if (likely(!err)) + err = exofs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); + sbi->s_numfiles--; +out: + exofs_put_page(page); + return err; +} + +/* kept aligned on 4 bytes */ +#define THIS_DIR ".\0\0" +#define PARENT_DIR "..\0" + +int exofs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = exofs_chunk_size(inode); + struct exofs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, + &page, NULL); + 
if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page, KM_USER0); + de = (struct exofs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); + memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + + de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); + de->inode_no = cpu_to_le64(parent->i_ino); + memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); + exofs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = exofs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +int exofs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct exofs_dir_entry *de; + page = exofs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct exofs_dir_entry *)kaddr; + kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_empty_dir: " + "zero-length directory entry" + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode_no != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (le64_to_cpu(de->inode_no) != + inode->i_ino) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + return 1; + +not_empty: + exofs_put_page(page); + return 0; +} + +const struct file_operations exofs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = exofs_readdir, +}; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h new file mode 100644 index 00000000..c965806c --- /dev/null +++ b/fs/exofs/exofs.h @@ -0,0 +1,308 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __EXOFS_H__ +#define __EXOFS_H__ + +#include +#include +#include +#include "common.h" + +/* FIXME: Remove once pnfs hits mainline + * #include + */ +#include "pnfs.h" + +#define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define EXOFS_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define EXOFS_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +struct exofs_layout { + osd_id s_pid; /* partition ID of file system*/ + + /* Our way of looking at the data_map */ + unsigned stripe_unit; + unsigned mirrors_p1; + + unsigned group_width; + u64 group_depth; + unsigned group_count; + + enum exofs_inode_layout_gen_functions lay_func; + + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[0]; /* Variable length */ +}; + +/* + * our extension to the in-memory superblock + */ +struct exofs_sb_info { + struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ + int s_timeout; /* timeout for OSD operations */ + uint64_t s_nextid; /* highest object ID used */ + uint32_t s_numfiles; /* number of files on fs */ + spinlock_t s_next_gen_lock; /* spinlock for gen # update */ + u32 s_next_generation; /* next gen # to use */ + atomic_t s_curr_pending; /* number of pending commands */ + uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + struct backing_dev_info bdi; /* register our bdi with VFS */ + + struct pnfs_osd_data_map data_map; /* Default raid to use + * FIXME: Needed ? + */ +/* struct exofs_layout dir_layout;*/ /* Default dir layout */ + struct exofs_layout layout; /* Default files layout, + * contains the variable osd_dev + * array. Keep last */ + struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ +}; + +/* + * our extension to the in-memory inode + */ +struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ + uint64_t i_commit_size; /* the object's written length */ + uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ +}; + +static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) +{ + return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; +} + +struct exofs_io_state; +typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); + +struct exofs_io_state { + struct kref kref; + + void *private; + exofs_io_done_fn done; + + struct exofs_layout *layout; + struct osd_obj_id obj; + u8 *cred; + + /* Global read/write IO*/ + loff_t offset; + unsigned long length; + void *kern_buff; + + struct page **pages; + unsigned nr_pages; + unsigned pgbase; + unsigned pages_consumed; + + /* Attributes */ + unsigned in_attr_len; + struct osd_attr *in_attr; + unsigned out_attr_len; + struct osd_attr *out_attr; + + /* Variable array of size numdevs */ + unsigned numdevs; + struct exofs_per_dev_state { + struct osd_request *or; + struct bio *bio; + loff_t offset; + unsigned length; + unsigned dev; + } per_dev[]; +}; + +static inline unsigned exofs_io_state_size(unsigned numdevs) +{ + return sizeof(struct exofs_io_state) + + sizeof(struct exofs_per_dev_state) * numdevs; +} + +/* + * our inode flags + */ +#define OBJ_2BCREATED 0 /* object will be created soon*/ +#define OBJ_CREATED 1 /* object has been created on the osd*/ + +static inline int obj_2bcreated(struct exofs_i_info *oi) +{ + return test_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline void 
set_obj_2bcreated(struct exofs_i_info *oi) +{ + set_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline int obj_created(struct exofs_i_info *oi) +{ + return test_bit(OBJ_CREATED, &oi->i_flags); +} + +static inline void set_obj_created(struct exofs_i_info *oi) +{ + set_bit(OBJ_CREATED, &oi->i_flags); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi); +static inline int wait_obj_created(struct exofs_i_info *oi) +{ + if (likely(obj_created(oi))) + return 0; + + return __exofs_wait_obj_created(oi); +} + +/* + * get to our inode from the vfs inode + */ +static inline struct exofs_i_info *exofs_i(struct inode *inode) +{ + return container_of(inode, struct exofs_i_info, vfs_inode); +} + +/* + * Given a layout, object_number and stripe_index return the associated global + * dev_index + */ +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index); +/* + * Maximum count of links to a file + */ +#define EXOFS_LINK_MAX 32000 + +/************************* + * function declarations * + *************************/ + +/* ios.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length); + +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **ios); +void exofs_put_io_state(struct exofs_io_state *ios); + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid); + +int exofs_sbi_create(struct exofs_io_state *ios); +int exofs_sbi_remove(struct exofs_io_state *ios); +int exofs_sbi_write(struct exofs_io_state *ios); +int exofs_sbi_read(struct exofs_io_state *ios); + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); +static inline int exofs_oi_write(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_write(ios); +} + +static inline int exofs_oi_read(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_read(ios); +} + +/* inode.c */ +unsigned exofs_max_io_pages(struct exofs_layout *layout, + unsigned expected_pages); +int exofs_setattr(struct dentry *, struct iattr *); +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern struct inode *exofs_iget(struct super_block *, unsigned long); +struct inode *exofs_new_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); +extern void exofs_evict_inode(struct inode *); + +/* dir.c: */ +int exofs_add_link(struct dentry *, struct inode *); +ino_t exofs_inode_by_name(struct inode *, struct dentry *); +int exofs_delete_entry(struct exofs_dir_entry *, struct page *); +int exofs_make_empty(struct inode *, struct inode *); +struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, + struct page **); +int exofs_empty_dir(struct inode *); +struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); +ino_t exofs_parent_ino(struct dentry *child); +int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, + struct inode *); + +/* super.c */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi); + +/********************* + * operation vectors * + *********************/ +/* dir.c: */ +extern const struct 
file_operations exofs_dir_operations; + +/* file.c */ +extern const struct inode_operations exofs_file_inode_operations; +extern const struct file_operations exofs_file_operations; + +/* inode.c */ +extern const struct address_space_operations exofs_aops; +extern const struct osd_attr g_attr_logical_length; + +/* namei.c */ +extern const struct inode_operations exofs_dir_inode_operations; +extern const struct inode_operations exofs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations exofs_symlink_inode_operations; +extern const struct inode_operations exofs_fast_symlink_inode_operations; + +#endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c new file mode 100644 index 00000000..45ca323d --- /dev/null +++ b/fs/exofs/file.c @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "exofs.h" + +static int exofs_release_file(struct inode *inode, struct file *filp) +{ + return 0; +} + +/* exofs_file_fsync - flush the inode to disk + * + * Note, in exofs all metadata is written as part of inode, regardless. 
+ * The writeout is synchronous + */ +static int exofs_file_fsync(struct file *filp, int datasync) +{ + int ret; + + ret = sync_inode_metadata(filp->f_mapping->host, 1); + return ret; +} + +static int exofs_flush(struct file *file, fl_owner_t id) +{ + int ret = vfs_fsync(file, 0); + /* TODO: Flush the OSD target */ + return ret; +} + +const struct file_operations exofs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = exofs_release_file, + .fsync = exofs_file_fsync, + .flush = exofs_flush, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations exofs_file_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c new file mode 100644 index 00000000..8472c098 --- /dev/null +++ b/fs/exofs/inode.c @@ -0,0 +1,1374 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "exofs.h" + +#define EXOFS_DBGMSG2(M...) do {} while (0) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), + MAX_PAGES_KMALLOC = + PAGE_SIZE / sizeof(struct page *), +}; + +unsigned exofs_max_io_pages(struct exofs_layout *layout, + unsigned expected_pages) +{ + unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); + + /* TODO: easily support bio chaining */ + pages = min_t(unsigned, pages, + layout->group_width * BIO_MAX_PAGES_KMALLOC); + return pages; +} + +struct page_collect { + struct exofs_sb_info *sbi; + struct inode *inode; + unsigned expected_pages; + struct exofs_io_state *ios; + + struct page **pages; + unsigned alloc_pages; + unsigned nr_pages; + unsigned long length; + loff_t pg_first; /* keep 64bit also in 32-arches */ + bool read_4_write; /* This means two things: that the read is sync + * And the pages should not be unlocked. 
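For a sense of scale behind the two limits above: with 4K pages and 8-byte pointers, MAX_PAGES_KMALLOC is 4096 / 8 = 512, i.e. the pages[] array of one page_collect fits in a single kmalloc'd page, while BIO_MAX_PAGES_KMALLOC is however many bio_vecs fit in the rest of a page after struct bio. A stand-alone sketch of the capping that exofs_max_io_pages() performs (the per-bio figure below is made up for illustration):

/* Sketch of the page-count capping; 512 and 251 are assumed/illustrative values. */
#include <stdio.h>

#define MAX_PAGES_KMALLOC 512   /* assumed: PAGE_SIZE / sizeof(struct page *) */

static unsigned max_io_pages(unsigned expected, unsigned group_width,
                             unsigned bio_max_pages_kmalloc)
{
    unsigned pages = expected < MAX_PAGES_KMALLOC ? expected : MAX_PAGES_KMALLOC;
    unsigned cap = group_width * bio_max_pages_kmalloc;

    return pages < cap ? pages : cap;
}

int main(void)
{
    /* e.g. readahead asks for 1024 pages on a single-device layout */
    printf("%u\n", max_io_pages(1024, 1, 251));   /* 251 */
    return 0;
}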
+ */ +}; + +static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, + struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + + pcol->sbi = sbi; + pcol->inode = inode; + pcol->expected_pages = expected_pages; + + pcol->ios = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + pcol->read_4_write = false; +} + +static void _pcol_reset(struct page_collect *pcol) +{ + pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); + + pcol->pages = NULL; + pcol->alloc_pages = 0; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + pcol->ios = NULL; + + /* this is probably the end of the loop but in writes + * it might not end here. don't be left with nothing + */ + if (!pcol->expected_pages) + pcol->expected_pages = MAX_PAGES_KMALLOC; +} + +static int pcol_try_alloc(struct page_collect *pcol) +{ + unsigned pages; + + if (!pcol->ios) { /* First time allocate io_state */ + int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); + + if (ret) + return ret; + } + + /* TODO: easily support bio chaining */ + pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); + + for (; pages; pages >>= 1) { + pcol->pages = kmalloc(pages * sizeof(struct page *), + GFP_KERNEL); + if (likely(pcol->pages)) { + pcol->alloc_pages = pages; + return 0; + } + } + + EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", + pcol->expected_pages); + return -ENOMEM; +} + +static void pcol_free(struct page_collect *pcol) +{ + kfree(pcol->pages); + pcol->pages = NULL; + + if (pcol->ios) { + exofs_put_io_state(pcol->ios); + pcol->ios = NULL; + } +} + +static int pcol_add_page(struct page_collect *pcol, struct page *page, + unsigned len) +{ + if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) + return -ENOMEM; + + pcol->pages[pcol->nr_pages++] = page; + pcol->length += len; + return 0; +} + +static int update_read_page(struct page *page, int ret) +{ + if (ret == 0) { + /* Everything is OK */ + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } else if (ret == -EFAULT) { + /* In this case we were trying to read something that wasn't on + * disk yet - return a page full of zeroes. This should be OK, + * because the object should be empty (if there was a write + * before this read, the read would be waiting with the page + * locked */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + ret = 0; /* recovered error */ + EXOFS_DBGMSG("recovered read error\n"); + } else /* Error */ + SetPageError(page); + + return ret; +} + +static void update_write_page(struct page *page, int ret) +{ + if (ret) { + mapping_set_error(page->mapping, ret); + SetPageError(page); + } + end_page_writeback(page); +} + +/* Called at the end of reads, to optionally unlock pages and update their + * status. 
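update_read_page() above treats -EFAULT from the OSD as "this range was never written": the page is zero-filled and still reported as uptodate, and only other errors mark the page bad. A stand-alone sketch of that policy (EFAULT hard-coded as -14 for the example):

/* Sketch of the per-page status decision made by update_read_page(). */
#include <stdio.h>

static int page_status(int osd_err)
{
    if (osd_err == 0)
        return 0;          /* uptodate */
    if (osd_err == -14)
        return 0;          /* hole in the object: zero-fill, still uptodate */
    return osd_err;        /* real I/O error: page is marked bad */
}

int main(void)
{
    printf("%d %d %d\n", page_status(0), page_status(-14), page_status(-5));  /* 0 0 -5 */
    return 0;
}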
+ */ +static int __readpages_done(struct page_collect *pcol) +{ + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + int ret = exofs_check_io(pcol->ios, &resid); + + if (likely(!ret)) + good_bytes = pcol->length; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages at end */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", + inode->i_ino, page->index, + page_stat ? "bad_bytes" : "good_bytes"); + + ret = update_read_page(page, page_stat); + if (!pcol->read_4_write) + unlock_page(page); + length += PAGE_SIZE; + } + + pcol_free(pcol); + EXOFS_DBGMSG2("readpages_done END\n"); + return ret; +} + +/* callback of async reads */ +static void readpages_done(struct exofs_io_state *ios, void *p) +{ + struct page_collect *pcol = p; + + __readpages_done(pcol); + atomic_dec(&pcol->sbi->s_curr_pending); + kfree(pcol); +} + +static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) +{ + int i; + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + + if (rw == READ) + update_read_page(page, ret); + else + update_write_page(page, ret); + + unlock_page(page); + } +} + +static int read_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct exofs_io_state *ios = pcol->ios; + struct page_collect *pcol_copy = NULL; + int ret; + + if (!pcol->pages) + return 0; + + ios->pages = pcol->pages; + ios->nr_pages = pcol->nr_pages; + ios->length = pcol->length; + ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; + + if (pcol->read_4_write) { + exofs_oi_read(oi, pcol->ios); + return __readpages_done(pcol); + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + ios->done = readpages_done; + ios->private = pcol_copy; + ret = exofs_oi_read(oi, ios); + if (unlikely(ret)) + goto err; + + atomic_inc(&pcol->sbi->s_curr_pending); + + EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", + ios->obj.id, _LLU(ios->offset), pcol->length); + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + if (!pcol->read_4_write) + _unlock_pcol_pages(pcol, ret, READ); + + pcol_free(pcol); + + kfree(pcol_copy); + return ret; +} + +/* readpage_strip is called either directly from readpage() or by the VFS from + * within read_cache_pages(), to add one more page to be read. It will try to + * collect as many contiguous pages as posible. If a discontinuity is + * encountered, or it runs out of resources, it will submit the previous segment + * and will start a new collection. Eventually caller must submit the last + * segment if present. 
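+ * (The callers in this file do exactly that: exofs_readpages() and _readpage() both finish with a final read_exec(&pcol).)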
+ */ +static int readpage_strip(void *data, struct page *page) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + /* FIXME: Just for debugging, will be removed */ + if (PageUptodate(page)) + EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, + page->index); + + if (page->index < end_index) + len = PAGE_CACHE_SIZE; + else if (page->index == end_index) + len = i_size & ~PAGE_CACHE_MASK; + else + len = 0; + + if (!len || !obj_created(oi)) { + /* this will be out of bounds, or doesn't exist yet. + * Current page is cleared and the request is split + */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + + if (!pcol->read_4_write) + unlock_page(page); + EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " + "read_4_write=%d index=0x%lx end_index=0x%lx " + "splitting\n", inode->i_ino, len, + pcol->read_4_write, page->index, end_index); + + return read_exec(pcol); + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = read_exec(pcol); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->pages) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + if (len != PAGE_CACHE_SIZE) + zero_user(page, len, PAGE_CACHE_SIZE - len); + + EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (ret) { + EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " + "this_len=0x%zx nr_pages=%u length=0x%lx\n", + page, len, pcol->nr_pages, pcol->length); + + /* split the request, and start again with current page */ + ret = read_exec(pcol); + if (unlikely(ret)) + goto fail; + + goto try_again; + } + + return 0; + +fail: + /* SetPageError(page); ??? */ + unlock_page(page); + return ret; +} + +static int exofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, nr_pages, mapping->host); + + ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); + if (ret) { + EXOFS_ERR("read_cache_pages => %d\n", ret); + return ret; + } + + return read_exec(&pcol); +} + +static int _readpage(struct page *page, bool read_4_write) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + pcol.read_4_write = read_4_write; + ret = readpage_strip(&pcol, page); + if (ret) { + EXOFS_ERR("_readpage => %d\n", ret); + return ret; + } + + return read_exec(&pcol); +} + +/* + * We don't need the file + */ +static int exofs_readpage(struct file *file, struct page *page) +{ + return _readpage(page, false); +} + +/* Callback for osd_write. 
All writes are asynchronous */ +static void writepages_done(struct exofs_io_state *ios, void *p) +{ + struct page_collect *pcol = p; + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + int ret = exofs_check_io(ios, &resid); + + atomic_dec(&pcol->sbi->s_curr_pending); + + if (likely(!ret)) + good_bytes = pcol->length; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages to a bio */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + update_write_page(page, page_stat); + unlock_page(page); + EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", + inode->i_ino, page->index, page_stat); + + length += PAGE_SIZE; + } + + pcol_free(pcol); + kfree(pcol); + EXOFS_DBGMSG2("writepages_done END\n"); +} + +static int write_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct exofs_io_state *ios = pcol->ios; + struct page_collect *pcol_copy = NULL; + int ret; + + if (!pcol->pages) + return 0; + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + + ios->pages = pcol_copy->pages; + ios->nr_pages = pcol_copy->nr_pages; + ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; + ios->length = pcol_copy->length; + ios->done = writepages_done; + ios->private = pcol_copy; + + ret = exofs_oi_write(oi, ios); + if (unlikely(ret)) { + EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); + goto err; + } + + atomic_inc(&pcol->sbi->s_curr_pending); + EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", + pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), + pcol->length); + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + _unlock_pcol_pages(pcol, ret, WRITE); + pcol_free(pcol); + kfree(pcol_copy); + + return ret; +} + +/* writepage_strip is called either directly from writepage() or by the VFS from + * within write_cache_pages(), to add one more page to be written to storage. + * It will try to collect as many contiguous pages as possible. If a + * discontinuity is encountered or it runs out of resources it will submit the + * previous segment and will start a new collection. + * Eventually caller must submit the last segment if present. 
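+ * (Here exofs_writepages() and exofs_writepage() submit that last segment with a final write_exec(&pcol).)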
+ */ +static int writepage_strip(struct page *page, + struct writeback_control *wbc_unused, void *data) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + BUG_ON(!PageLocked(page)); + + ret = wait_obj_created(oi); + if (unlikely(ret)) + goto fail; + + if (page->index < end_index) + /* in this case, the page is within the limits of the file */ + len = PAGE_CACHE_SIZE; + else { + len = i_size & ~PAGE_CACHE_MASK; + + if (page->index > end_index || !len) { + /* in this case, the page is outside the limits + * (truncate in progress) + */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + if (PageError(page)) + ClearPageError(page); + unlock_page(page); + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " + "outside the limits\n", + inode->i_ino, page->index); + return 0; + } + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", + inode->i_ino, page->index); + goto try_again; + } + + if (!pcol->pages) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (unlikely(ret)) { + EXOFS_DBGMSG2("Failed pcol_add_page " + "nr_pages=%u total_length=0x%lx\n", + pcol->nr_pages, pcol->length); + + /* split the request, next loop will start again */ + ret = write_exec(pcol); + if (unlikely(ret)) { + EXOFS_DBGMSG("write_exec failed => %d", ret); + goto fail; + } + + goto try_again; + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + return 0; + +fail: + EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", + inode->i_ino, page->index, ret); + set_bit(AS_EIO, &page->mapping->flags); + unlock_page(page); + return ret; +} + +static int exofs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct page_collect pcol; + long start, end, expected_pages; + int ret; + + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = (wbc->range_end == LLONG_MAX) ? 
+ start + mapping->nrpages : + wbc->range_end >> PAGE_CACHE_SHIFT; + + if (start || end) + expected_pages = end - start + 1; + else + expected_pages = mapping->nrpages; + + if (expected_pages < 32L) + expected_pages = 32L; + + EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", + mapping->host->i_ino, wbc->range_start, wbc->range_end, + mapping->nrpages, start, end, expected_pages); + + _pcol_init(&pcol, expected_pages, mapping->host); + + ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); + if (ret) { + EXOFS_ERR("write_cache_pages => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +static int exofs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + ret = writepage_strip(page, NULL, &pcol); + if (ret) { + EXOFS_ERR("exofs_writepage => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +/* i_mutex held using inode->i_size directly */ +static void _write_failed(struct inode *inode, loff_t to) +{ + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret = 0; + struct page *page; + + page = *pagep; + if (page == NULL) { + ret = simple_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); + if (ret) { + EXOFS_DBGMSG("simple_write_begin failed\n"); + goto out; + } + + page = *pagep; + } + + /* read modify write */ + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + loff_t i_size = i_size_read(mapping->host); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t rlen; + + if (page->index < end_index) + rlen = PAGE_CACHE_SIZE; + else if (page->index == end_index) + rlen = i_size & ~PAGE_CACHE_MASK; + else + rlen = 0; + + if (!rlen) { + clear_highpage(page); + SetPageUptodate(page); + goto out; + } + + ret = _readpage(page, true); + if (ret) { + /*SetPageError was done by _readpage. 
Is it ok?*/ + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } + } +out: + if (unlikely(ret)) + _write_failed(mapping->host, pos + len); + + return ret; +} + +static int exofs_write_begin_export(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + + return exofs_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); +} + +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (unlikely(ret)) + _write_failed(inode, pos + len); + + /* TODO: once simple_write_end marks inode dirty remove */ + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + +static int exofs_releasepage(struct page *page, gfp_t gfp) +{ + EXOFS_DBGMSG("page 0x%lx\n", page->index); + WARN_ON(1); + return 0; +} + +static void exofs_invalidatepage(struct page *page, unsigned long offset) +{ + EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + WARN_ON(1); +} + +const struct address_space_operations exofs_aops = { + .readpage = exofs_readpage, + .readpages = exofs_readpages, + .writepage = exofs_writepage, + .writepages = exofs_writepages, + .write_begin = exofs_write_begin_export, + .write_end = exofs_write_end, + .releasepage = exofs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .invalidatepage = exofs_invalidatepage, + + /* Not implemented Yet */ + .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ + .direct_IO = NULL, /* TODO: Should be trivial to do */ + + /* With these NULL has special meaning or default is not exported */ + .get_xip_mem = NULL, + .migratepage = NULL, + .launder_page = NULL, + .is_partially_uptodate = NULL, + .error_remove_page = NULL, +}; + +/****************************************************************************** + * INODE OPERATIONS + *****************************************************************************/ + +/* + * Test whether an inode is a fast symlink. + */ +static inline int exofs_inode_is_fast_symlink(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + + return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); +} + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +static int _do_truncate(struct inode *inode, loff_t newsize) +{ + struct exofs_i_info *oi = exofs_i(inode); + int ret; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ret = exofs_oi_truncate(oi, (u64)newsize); + if (likely(!ret)) + truncate_setsize(inode, newsize); + + EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + inode->i_ino, newsize, ret); + return ret; +} + +/* + * Set inode attributes - update size attribute on OSD if needed, + * otherwise just call generic functions. 
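+ * A size change is pushed to the OSD via _do_truncate()/exofs_oi_truncate(); all other attributes are applied in-core with setattr_copy() and mark_inode_dirty().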
+ */ +int exofs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + /* if we are about to modify an object, and it hasn't been + * created yet, wait + */ + error = wait_obj_created(exofs_i(inode)); + if (unlikely(error)) + return error; + + error = inode_change_ok(inode, iattr); + if (unlikely(error)) + return error; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + error = _do_truncate(inode, iattr->ia_size); + if (unlikely(error)) + return error; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + return 0; +} + +static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_FILE_LAYOUT, + 0); +static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DIR_LAYOUT, + 0); + +/* + * Read the Linux inode info from the OSD, and return it as is. In exofs the + * inode info is in an application specific page/attribute of the osd-object. + */ +static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, + struct exofs_fcb *inode) +{ + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_attr attrs[] = { + [0] = g_attr_inode_data, + [1] = g_attr_inode_file_layout, + [2] = g_attr_inode_dir_layout, + }; + struct exofs_io_state *ios; + struct exofs_on_disk_inode_layout *layout; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); + ios->cred = oi->i_cred; + + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", + _LLU(ios->obj.id), ret); + memset(inode, 0, sizeof(*inode)); + inode->i_mode = 0040000 | (0777 & ~022); + /* If object is lost on target we might as well enable it's + * delete. 
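+ * (-ENOENT and -EINVAL are therefore cleared below, so the zeroed placeholder inode can still be instantiated and later unlinked.)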
+ */ + if ((ret == -ENOENT) || (ret == -EINVAL)) + ret = 0; + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); + + ret = extract_attr_from_ios(ios, &attrs[1]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[1].len) { + layout = attrs[1].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported files layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + ret = extract_attr_from_ios(ios, &attrs[2]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[2].len) { + layout = attrs[2].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported meta-data layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + +out: + exofs_put_io_state(ios); + return ret; +} + +static void __oi_init(struct exofs_i_info *oi) +{ + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; +} +/* + * Fill in an inode read from the OSD and set it up for use + */ +struct inode *exofs_iget(struct super_block *sb, unsigned long ino) +{ + struct exofs_i_info *oi; + struct exofs_fcb fcb; + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + oi = exofs_i(inode); + __oi_init(oi); + + /* read the inode from the osd */ + ret = exofs_get_inode(sb, oi, &fcb); + if (ret) + goto bad_inode; + + set_obj_created(oi); + + /* copy stuff from on-disk struct to in-memory struct */ + inode->i_mode = le16_to_cpu(fcb.i_mode); + inode->i_uid = le32_to_cpu(fcb.i_uid); + inode->i_gid = le32_to_cpu(fcb.i_gid); + inode->i_nlink = le16_to_cpu(fcb.i_links_count); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); + inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); + inode->i_ctime.tv_nsec = + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; + oi->i_commit_size = le64_to_cpu(fcb.i_size); + i_size_write(inode, oi->i_commit_size); + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_generation = le32_to_cpu(fcb.i_generation); + + oi->i_dir_start_lookup = 0; + + if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (fcb.i_data[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(fcb.i_data[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(fcb.i_data[1])); + } else { + memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); + } + + inode->i_mapping->backing_dev_info = sb->s_bdi; + if (S_ISREG(inode->i_mode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (exofs_inode_is_fast_symlink(inode)) + inode->i_op = &exofs_fast_symlink_inode_operations; + else { + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + } + } else { + inode->i_op = &exofs_special_inode_operations; + if (fcb.i_data[0]) 
+ init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(fcb.i_data[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(fcb.i_data[1]))); + } + + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi) +{ + if (!obj_created(oi)) { + EXOFS_DBGMSG("!obj_created\n"); + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + EXOFS_DBGMSG("wait_event done\n"); + } + return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; +} + +/* + * Callback function from exofs_new_inode(). The important thing is that we + * set the obj_created flag so that other methods know that the object exists on + * the OSD. + */ +static void create_done(struct exofs_io_state *ios, void *p) +{ + struct inode *inode = p; + struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + int ret; + + ret = exofs_check_io(ios, NULL); + exofs_put_io_state(ios); + + atomic_dec(&sbi->s_curr_pending); + + if (unlikely(ret)) { + EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", + _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); + /*TODO: When FS is corrupted creation can fail, object already + * exist. Get rid of this asynchronous creation, if exist + * increment the obj counter and try the next object. Until we + * succeed. All these dangling objects will be made into lost + * files by chkfs.exofs + */ + } + + set_obj_created(oi); + + wake_up(&oi->i_wq); +} + +/* + * Set up a new inode and create an object for it on the OSD + */ +struct inode *exofs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb; + struct inode *inode; + struct exofs_i_info *oi; + struct exofs_sb_info *sbi; + struct exofs_io_state *ios; + int ret; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + oi = exofs_i(inode); + __oi_init(oi); + + set_obj_2bcreated(oi); + + sbi = sb->s_fs_info; + + inode->i_mapping->backing_dev_info = sb->s_bdi; + inode_init_owner(inode, dir, mode); + inode->i_ino = sbi->s_nextid++; + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + oi->i_commit_size = inode->i_size = 0; + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ + + mark_inode_dirty(inode); + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + return ERR_PTR(ret); + } + + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); + + ios->done = create_done; + ios->private = inode; + ios->cred = oi->i_cred; + ret = exofs_sbi_create(ios); + if (ret) { + exofs_put_io_state(ios); + return ERR_PTR(ret); + } + atomic_inc(&sbi->s_curr_pending); + + return inode; +} + +/* + * struct to pass two arguments to update_inode's callback + */ +struct updatei_args { + struct exofs_sb_info *sbi; + struct exofs_fcb fcb; +}; + +/* + * Callback function from exofs_update_inode(). + */ +static void updatei_done(struct exofs_io_state *ios, void *p) +{ + struct updatei_args *args = p; + + exofs_put_io_state(ios); + + atomic_dec(&args->sbi->s_curr_pending); + + kfree(args); +} + +/* + * Write the inode to the OSD. 
Just fill up the struct, and set the attribute + * synchronously or asynchronously depending on the do_sync flag. + */ +static int exofs_update_inode(struct inode *inode, int do_sync) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct exofs_io_state *ios; + struct osd_attr attr; + struct exofs_fcb *fcb; + struct updatei_args *args; + int ret; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) { + EXOFS_DBGMSG("Failed kzalloc of args\n"); + return -ENOMEM; + } + + fcb = &args->fcb; + + fcb->i_mode = cpu_to_le16(inode->i_mode); + fcb->i_uid = cpu_to_le32(inode->i_uid); + fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_links_count = cpu_to_le16(inode->i_nlink); + fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + oi->i_commit_size = i_size_read(inode); + fcb->i_size = cpu_to_le64(oi->i_commit_size); + fcb->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + fcb->i_data[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + fcb->i_data[1] = 0; + } else { + fcb->i_data[0] = 0; + fcb->i_data[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + fcb->i_data[2] = 0; + } + } else + memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + goto free_args; + } + + attr = g_attr_inode_data; + attr.val_ptr = fcb; + ios->out_attr_len = 1; + ios->out_attr = &attr; + + wait_obj_created(oi); + + if (!do_sync) { + args->sbi = sbi; + ios->done = updatei_done; + ios->private = args; + } + + ret = exofs_oi_write(oi, ios); + if (!do_sync && !ret) { + atomic_inc(&sbi->s_curr_pending); + goto out; /* deallocation in updatei_done */ + } + + exofs_put_io_state(ios); +free_args: + kfree(args); +out: + EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", + inode->i_ino, do_sync, ret); + return ret; +} + +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ + return exofs_update_inode(inode, 1); +} + +/* + * Callback function from exofs_delete_inode() - don't have much cleaning up to + * do. + */ +static void delete_done(struct exofs_io_state *ios, void *p) +{ + struct exofs_sb_info *sbi = p; + + exofs_put_io_state(ios); + + atomic_dec(&sbi->s_curr_pending); +} + +/* + * Called when the refcount of an inode reaches zero. We remove the object + * from the OSD here. We make sure the object was created before we try and + * delete it. + */ +void exofs_evict_inode(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct exofs_io_state *ios; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + + /* TODO: should do better here */ + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + inode->i_size = 0; + end_writeback(inode); + + /* if we are deleting an obj that hasn't been created yet, wait. + * This also makes sure that create_done cannot be called with an + * already evicted inode. 
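+ * The asynchronous remove queued below bumps sbi->s_curr_pending; delete_done() drops it again when the OSD completes.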
+ */ + wait_obj_created(oi); + /* ignore the error, attempt a remove anyway */ + + /* Now Remove the OSD objects */ + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); + return; + } + + ios->obj.id = exofs_oi_objno(oi); + ios->done = delete_done; + ios->private = sbi; + ios->cred = oi->i_cred; + ret = exofs_sbi_remove(ios); + if (ret) { + EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); + exofs_put_io_state(ios); + return; + } + atomic_inc(&sbi->s_curr_pending); + + return; + +no_delete: + end_writeback(inode); +} diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c new file mode 100644 index 00000000..f74a2ec0 --- /dev/null +++ b/fs/exofs/ios.c @@ -0,0 +1,803 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include + +#include "exofs.h" + +#define EXOFS_DBGMSG2(M...) 
do {} while (0) +/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ + +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + return ret; +} + +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **pios) +{ + struct exofs_io_state *ios; + + /*TODO: Maybe use kmem_cach per sbi of size + * exofs_io_state_size(layout->s_numdevs) + */ + ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); + if (unlikely(!ios)) { + EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", + exofs_io_state_size(layout->s_numdevs)); + *pios = NULL; + return -ENOMEM; + } + + ios->layout = layout; + ios->obj.partition = layout->s_pid; + *pios = ios; + return 0; +} + +void exofs_put_io_state(struct exofs_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + kfree(ios); + } +} + +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index) +{ +/* switch (layout->lay_func) { + case LAYOUT_MOVING_WINDOW: + {*/ + unsigned dev_mod = obj_no; + + return (layout_index + dev_mod * layout->mirrors_p1) % + layout->s_numdevs; +/* } + case LAYOUT_FUNC_IMPLICT: + return layout->devs[layout_index]; + }*/ +} + +static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, + unsigned layout_index) +{ + return ios->layout->s_ods[ + exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; +} + +static void _sync_done(struct exofs_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct exofs_io_state *ios = container_of( + kref, struct exofs_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct exofs_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static int exofs_io_execute(struct exofs_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, ios->cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + 
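+ /* one reference per request submitted below; each _done_io() drops one, and the final put runs ios->done via _last_io() */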
kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = exofs_check_io(ios, NULL); + } + return ret; +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct osd_request *or = ios->per_dev[i].or; + int ret; + + if (unlikely(!or)) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + _clear_bio(ios->per_dev[i].bio); + EXOFS_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(ios->per_dev[i].offset), + _LLU(ios->per_dev[i].length)); + + continue; /* we recovered */ + } + + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + /* TODO: raid specific residual calculations */ + if (resid) { + if (likely(!acumulated_lin_err)) + *resid = 0; + else + *resid = ios->length; + } + + return acumulated_lin_err; +} + +/* + * L - logical offset into the file + * + * U - The number of bytes in a stripe within a group + * + * U = stripe_unit * group_width + * + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) + * + * T = stripe_unit * group_width * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * + * S = stripe_unit * group_width * group_depth * group_count + * + * M - The "major" (i.e., across all components) stripe number + * + * M = L / S + * + * G - Counts the groups from the beginning of the major stripe + * + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * + * N = H / U + * + * C - The component index coresponding to L + * + * C = (H - (N * U)) / stripe_unit + G * group_width + * [or (L % U) / stripe_unit + G * group_width] + * + * O - The component offset coresponding to L + * + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + */ +struct _striping_info { + u64 obj_offset; + u64 group_length; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, + struct _striping_info *si) +{ + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + + u32 U = stripe_unit * group_width; + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; + + div_u64_rem(file_offset, 
stripe_unit, &si->unit_off); + + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + si->group_length = T - H; +} + +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct exofs_per_dev_state *per_dev, + int cur_len) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(exofs_ios_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / + ios->layout->group_width; + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", + bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + BUG_ON(ios->nr_pages <= pg); + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) + return -ENOMEM; + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + *cur_pg = pg; + return 0; +} + +static int _prepare_one_group(struct exofs_io_state *ios, u64 length, + struct _striping_info *si) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = ios->pages_consumed; + int ret = 0; + + while (length) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + + if (max_comp < dev) + max_comp = dev; + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + } +out: + ios->numdevs = max_comp + mirrors_p1; + ios->pages_consumed = cur_pg; + return ret; +} + +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + u64 length = ios->length; + u64 offset = ios->offset; + struct _striping_info si; + int ret = 0; + + if (!ios->pages) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + + _calc_stripe_info(ios, ios->offset, &si); + per_dev->offset = si.obj_offset; + per_dev->dev = si.dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si.unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + while (length) { + _calc_stripe_info(ios, offset, &si); + + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si); + if (unlikely(ret)) + goto out; + + offset += si.group_length; + length -= 
si.group_length; + } + +out: + return ret; +} + +int exofs_sbi_create(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->layout->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_remove(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->layout->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) +{ + struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; + + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + per_dev->or = or; + per_dev->offset = master_dev->offset; + + if (ios->pages) { + struct bio *bio; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_KERNEL, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + EXOFS_DBGMSG( + "Failed to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto out; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; + } else { + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, &ios->obj, per_dev->offset, bio, + per_dev->length); + EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(per_dev->length), dev); + } else if (ios->kern_buff) { + ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, + ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; + EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), dev); + } else { + osd_req_set_attributes(or, &ios->obj); + EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->out_attr_len, dev); + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + +out: + return ret; +} + +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +static int _sbi_read_mirror(struct exofs_io_state *ios, 
unsigned cur_comp) +{ + struct osd_request *or; + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + unsigned first_dev = (unsigned)ios->obj.id; + + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; + or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + if (ios->pages) { + osd_req_read(or, &ios->obj, per_dev->offset, + per_dev->bio, per_dev->length); + EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(ios->obj.id), + _LLU(per_dev->offset), _LLU(per_dev->length), + first_dev); + } else if (ios->kern_buff) { + int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, + ios->kern_buff, ios->length); + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d ret=>%d\n", + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), first_dev, ret); + if (unlikely(ret)) + return ret; + } else { + osd_req_get_attributes(or, &ios->obj); + EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->in_attr_len, first_dev); + } + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); + + return 0; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) +{ + struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; + struct exofs_io_state *ios; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + struct _striping_info si; + int i, ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + return ret; + + size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } + + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + + ios->numdevs = ios->layout->s_numdevs; + _calc_stripe_info(ios, size, &si); + + for (i = 0; i < 
ios->layout->group_width; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; + + if (i < si.dev) + obj_size = si.obj_offset + + ios->layout->stripe_unit - si.unit_off; + else if (i == si.dev) + obj_size = si.obj_offset; + else /* i > si.dev */ + obj_size = si.obj_offset - si.unit_off; + + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; + } + ret = exofs_io_execute(ios); + +out: + kfree(size_attrs); + exofs_put_io_state(ios); + return ret; +} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c new file mode 100644 index 00000000..4d70db11 --- /dev/null +++ b/fs/exofs/namei.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = exofs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > EXOFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = exofs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = exofs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode = exofs_new_inode(dir, mode); + int err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + inode = exofs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct super_block *sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode *inode; + struct exofs_i_info *oi; + + if (l > sb->s_blocksize) + goto out; + + inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + oi = exofs_i(inode); + if (l > sizeof(oi->i_data)) { + /* slow symlink */ + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + memset(oi->i_data, 0, sizeof(oi->i_data)); + + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &exofs_fast_symlink_inode_operations; + memcpy(oi->i_data, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = exofs_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int exofs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= EXOFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + ihold(inode); + + return exofs_add_nondir(dentry, inode); +} + +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int err = -EMLINK; + + if (dir->i_nlink >= EXOFS_LINK_MAX) + goto out; + + inode_inc_link_count(dir); + + inode = exofs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + + inode_inc_link_count(inode); + + 
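+ /* the new directory needs nlink == 2 (its name plus '.'); the parent was already bumped above for the '..' it will contain */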
err = exofs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = exofs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int exofs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct exofs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = exofs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + err = exofs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int exofs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (exofs_empty_dir(inode)) { + err = exofs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct exofs_dir_entry *dir_de = NULL; + struct page *old_page; + struct exofs_dir_entry *old_de; + int err = -ENOENT; + + old_de = exofs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = exofs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct exofs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !exofs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = exofs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + err = exofs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + if (err) + goto out_dir; + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= EXOFS_LINK_MAX) + goto out_dir; + } + err = exofs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + old_inode->i_ctime = CURRENT_TIME; + + exofs_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + if (err) + goto out_dir; + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations exofs_dir_inode_operations = { + .create = exofs_create, + .lookup = exofs_lookup, + .link = exofs_link, + .unlink = exofs_unlink, + .symlink = exofs_symlink, + .mkdir = exofs_mkdir, + .rmdir = exofs_rmdir, + .mknod = exofs_mknod, + .rename = exofs_rename, + .setattr = exofs_setattr, +}; + +const struct inode_operations exofs_special_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h new file mode 100644 index 00000000..c52e9888 --- /dev/null +++ b/fs/exofs/pnfs.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * This file is part of exofs. 
+ *
+ * exofs is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ */
+
+/* FIXME: Remove this file once pnfs hits mainline */
+
+#ifndef __EXOFS_PNFS_H__
+#define __EXOFS_PNFS_H__
+
+#if ! defined(__PNFS_OSD_XDR_H__)
+
+enum pnfs_iomode {
+	IOMODE_READ = 1,
+	IOMODE_RW = 2,
+	IOMODE_ANY = 3,
+};
+
+/* Layout Structure */
+enum pnfs_osd_raid_algorithm4 {
+	PNFS_OSD_RAID_0 = 1,
+	PNFS_OSD_RAID_4 = 2,
+	PNFS_OSD_RAID_5 = 3,
+	PNFS_OSD_RAID_PQ = 4	/* Reed-Solomon P+Q */
+};
+
+struct pnfs_osd_data_map {
+	u32 odm_num_comps;
+	u64 odm_stripe_unit;
+	u32 odm_group_width;
+	u32 odm_group_depth;
+	u32 odm_mirror_cnt;
+	u32 odm_raid_algorithm;
+};
+
+#endif /* ! defined(__PNFS_OSD_XDR_H__) */
+
+#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 00000000..c57beddc
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+	bool is_osdname;
+	const char *dev_name;
+	uint64_t pid;
+	int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_name, Opt_pid, Opt_to, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+	{Opt_name, "osdname=%s"},
+	{Opt_pid, "pid=%u"},
+	{Opt_to, "to=%u"},
+	{Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
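+ *
+ * For illustration only (the device path and option values below are
+ * hypothetical, not taken from this code), a mount using these options
+ * could look like:
+ *   mount -t exofs -o pid=65536,to=20 /dev/osd0 /mnt/exofs
+ * where "osdname=<name>" may be given instead of an OSD device path.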
+ */ +static int parse_options(char *options, struct exofs_mountopt *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + bool s_pid = false; + + EXOFS_DBGMSG("parse_options %s\n", options); + /* defaults */ + memset(opts, 0, sizeof(*opts)); + opts->timeout = BLK_DEFAULT_SG_TIMEOUT; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + char str[32]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_name: + opts->dev_name = match_strdup(&args[0]); + if (unlikely(!opts->dev_name)) { + EXOFS_ERR("Error allocating dev_name"); + return -ENOMEM; + } + opts->is_osdname = true; + break; + case Opt_pid: + if (0 == match_strlcpy(str, &args[0], sizeof(str))) + return -EINVAL; + opts->pid = simple_strtoull(str, NULL, 0); + if (opts->pid < EXOFS_MIN_PID) { + EXOFS_ERR("Partition ID must be >= %u", + EXOFS_MIN_PID); + return -EINVAL; + } + s_pid = 1; + break; + case Opt_to: + if (match_int(&args[0], &option)) + return -EINVAL; + if (option <= 0) { + EXOFS_ERR("Timout must be > 0"); + return -EINVAL; + } + opts->timeout = option * HZ; + break; + } + } + + if (!s_pid) { + EXOFS_ERR("Need to specify the following options:\n"); + EXOFS_ERR(" -o pid=pid_no_to_use\n"); + return -EINVAL; + } + + return 0; +} + +/****************************************************************************** + * INODE CACHE + *****************************************************************************/ + +/* + * Our inode cache. Isn't it pretty? + */ +static struct kmem_cache *exofs_inode_cachep; + +/* + * Allocate an inode in the cache + */ +static struct inode *exofs_alloc_inode(struct super_block *sb) +{ + struct exofs_i_info *oi; + + oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + oi->vfs_inode.i_version = 1; + return &oi->vfs_inode; +} + +static void exofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + +/* + * Remove an inode from the cache + */ +static void exofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, exofs_i_callback); +} + +/* + * Initialize the inode + */ +static void exofs_init_once(void *foo) +{ + struct exofs_i_info *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +/* + * Create and initialize the inode cache + */ +static int init_inodecache(void) +{ + exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", + sizeof(struct exofs_i_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exofs_init_once); + if (exofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +/* + * Destroy the inode cache + */ +static void destroy_inodecache(void) +{ + kmem_cache_destroy(exofs_inode_cachep); +} + +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + +static const struct osd_attr g_attr_sb_stats = ATTR_DEF( + EXOFS_APAGE_SB_DATA, + EXOFS_ATTR_SB_STATS, + sizeof(struct exofs_sb_stats)); + +static int __sbi_read_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct exofs_io_state *ios; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + 
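+	/* read back the single sb-stats attribute using the superblock credentials */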
+ ios->cred = sbi->s_cred; + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("Error reading super_block stats => %d\n", ret); + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); + goto out; + } + if (attrs[0].len) { + struct exofs_sb_stats *ess; + + if (unlikely(attrs[0].len != sizeof(*ess))) { + EXOFS_ERR("%s: Wrong version of exofs_sb_stats " + "size(%d) != expected(%zd)\n", + __func__, attrs[0].len, sizeof(*ess)); + goto out; + } + + ess = attrs[0].val_ptr; + sbi->s_nextid = le64_to_cpu(ess->s_nextid); + sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); + } + +out: + exofs_put_io_state(ios); + return ret; +} + +static void stats_done(struct exofs_io_state *ios, void *p) +{ + exofs_put_io_state(ios); + /* Good thanks nothing to do anymore */ +} + +/* Asynchronously write the stats attribute */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct exofs_io_state *ios; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + + sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); + sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); + attrs[0].val_ptr = &sbi->s_ess; + + ios->cred = sbi->s_cred; + ios->done = stats_done; + ios->private = sbi; + ios->out_attr = attrs; + ios->out_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_write(ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + exofs_put_io_state(ios); + } + + return ret; +} + +/* + * Write the superblock to the OSD + */ +int exofs_sync_fs(struct super_block *sb, int wait) +{ + struct exofs_sb_info *sbi; + struct exofs_fscb *fscb; + struct exofs_io_state *ios; + int ret = -ENOMEM; + + fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); + if (unlikely(!fscb)) + return -ENOMEM; + + sbi = sb->s_fs_info; + + /* NOTE: We no longer dirty the super_block anywhere in exofs. The + * reason we write the fscb here on unmount is so we can stay backwards + * compatible with fscb->s_version == 1. (What we are not compatible + * with is if a new version FS crashed and then we try to mount an old + * version). Otherwise the exofs_fscb is read-only from mkfs time. All + * the writeable info is set in exofs_sbi_write_stats() above. 
+ */ + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + goto out; + + lock_super(sb); + + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); + memset(fscb, 0, ios->length); + fscb->s_nextid = cpu_to_le64(sbi->s_nextid); + fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_magic = cpu_to_le16(sb->s_magic); + fscb->s_newfs = 0; + fscb->s_version = EXOFS_FSCB_VER; + + ios->obj.id = EXOFS_SUPER_ID; + ios->offset = 0; + ios->kern_buff = fscb; + ios->cred = sbi->s_cred; + + ret = exofs_sbi_write(ios); + if (unlikely(ret)) + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + else + sb->s_dirt = 0; + + + unlock_super(sb); +out: + EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); + exofs_put_io_state(ios); + kfree(fscb); + return ret; +} + +static void exofs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + exofs_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static void _exofs_print_device(const char *msg, const char *dev_path, + struct osd_dev *od, u64 pid) +{ + const struct osd_dev_info *odi = osduld_device_info(od); + + printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", + msg, dev_path ?: "", odi->osdname, _LLU(pid)); +} + +void exofs_free_sbi(struct exofs_sb_info *sbi) +{ + while (sbi->layout.s_numdevs) { + int i = --sbi->layout.s_numdevs; + struct osd_dev *od = sbi->layout.s_ods[i]; + + if (od) { + sbi->layout.s_ods[i] = NULL; + osduld_put_device(od); + } + } + kfree(sbi); +} + +/* + * This function is called when the vfs is freeing the superblock. We just + * need to free our own part. + */ +static void exofs_put_super(struct super_block *sb) +{ + int num_pend; + struct exofs_sb_info *sbi = sb->s_fs_info; + + /* make sure there are no pending commands */ + for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; + num_pend = atomic_read(&sbi->s_curr_pending)) { + wait_queue_head_t wq; + + printk(KERN_NOTICE "%s: !!Pending operations in flight. " + "This is a BUG. please report to osd-dev@open-osd.org\n", + __func__); + init_waitqueue_head(&wq); + wait_event_timeout(wq, + (atomic_read(&sbi->s_curr_pending) == 0), + msecs_to_jiffies(100)); + } + + _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], + sbi->layout.s_pid); + + bdi_destroy(&sbi->bdi); + exofs_free_sbi(sbi); + sb->s_fs_info = NULL; +} + +static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_device_table *dt) +{ + u64 stripe_length; + + sbi->data_map.odm_num_comps = + le32_to_cpu(dt->dt_data_map.cb_num_comps); + sbi->data_map.odm_stripe_unit = + le64_to_cpu(dt->dt_data_map.cb_stripe_unit); + sbi->data_map.odm_group_width = + le32_to_cpu(dt->dt_data_map.cb_group_width); + sbi->data_map.odm_group_depth = + le32_to_cpu(dt->dt_data_map.cb_group_depth); + sbi->data_map.odm_mirror_cnt = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); + sbi->data_map.odm_raid_algorithm = + le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); + +/* FIXME: Only raid0 for now. 
if not so, do not mount */ + if (sbi->data_map.odm_num_comps != numdevs) { + EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", + sbi->data_map.odm_num_comps, numdevs); + return -EINVAL; + } + if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + EXOFS_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { + EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", + numdevs, sbi->data_map.odm_mirror_cnt); + return -EINVAL; + } + + if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + EXOFS_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + return -EINVAL; + } + + sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; + sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; + + if (sbi->data_map.odm_group_width) { + sbi->layout.group_width = sbi->data_map.odm_group_width; + sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (!sbi->layout.group_depth) { + EXOFS_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + sbi->layout.group_count = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1 / + sbi->data_map.odm_group_width; + } else { + if (sbi->data_map.odm_group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %d\n", + sbi->data_map.odm_group_depth); + sbi->data_map.odm_group_depth = 0; + } + sbi->layout.group_width = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + } + + stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } + + return 0; +} + +static unsigned __ra_pages(struct exofs_layout *layout) +{ + const unsigned _MIN_RA = 32; /* min 128K read-ahead */ + unsigned ra_pages = layout->group_width * layout->stripe_unit / + PAGE_SIZE; + unsigned max_io_pages = exofs_max_io_pages(layout, ~0); + + ra_pages *= 2; /* two stripes */ + if (ra_pages < _MIN_RA) + ra_pages = roundup(_MIN_RA, ra_pages / 2); + + if (ra_pages > max_io_pages) + ra_pages = max_io_pages; + + return ra_pages; +} + +/* @odi is valid only as long as @fscb_dev is valid */ +static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, + struct osd_dev_info *odi) +{ + odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); + memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + + odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); + odi->osdname = dt_dev->osdname; + + /* FIXME support long names. Will need a _put function */ + if (dt_dev->long_name_offset) + return -EINVAL; + + /* Make sure osdname is printable! + * mkexofs should give us space for a null-terminator else the + * device-table is invalid. 
+ */ + if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) + odi->osdname_len = sizeof(dt_dev->osdname) - 1; + dt_dev->osdname[odi->osdname_len] = 0; + + /* If it's all zeros something is bad we read past end-of-obj */ + return !(odi->systemid_len || odi->osdname_len); +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, + unsigned table_count) +{ + struct exofs_sb_info *sbi = *psbi; + struct osd_dev *fscb_od; + struct osd_obj_id obj = {.partition = sbi->layout.s_pid, + .id = EXOFS_DEVTABLE_ID}; + struct exofs_device_table *dt; + unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + + sizeof(*dt); + unsigned numdevs, i; + int ret; + + dt = kmalloc(table_bytes, GFP_KERNEL); + if (unlikely(!dt)) { + EXOFS_ERR("ERROR: allocating %x bytes for device table\n", + table_bytes); + return -ENOMEM; + } + + fscb_od = sbi->layout.s_ods[0]; + sbi->layout.s_ods[0] = NULL; + sbi->layout.s_numdevs = 0; + ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: reading device table\n"); + goto out; + } + + numdevs = le64_to_cpu(dt->dt_num_devices); + if (unlikely(!numdevs)) { + ret = -EINVAL; + goto out; + } + WARN_ON(table_count != numdevs); + + ret = _read_and_match_data_map(sbi, numdevs, dt); + if (unlikely(ret)) + goto out; + + if (likely(numdevs > 1)) { + unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); + + sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); + if (unlikely(!sbi)) { + ret = -ENOMEM; + goto out; + } + memset(&sbi->layout.s_ods[1], 0, + size - sizeof(sbi->layout.s_ods[0])); + *psbi = sbi; + } + + for (i = 0; i < numdevs; i++) { + struct exofs_fscb fscb; + struct osd_dev_info odi; + struct osd_dev *od; + + if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { + EXOFS_ERR("ERROR: Read all-zeros device entry\n"); + ret = -EINVAL; + goto out; + } + + printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", + i, odi.osdname); + + /* On all devices the device table is identical. The user can + * specify any one of the participating devices on the command + * line. We always keep them in device-table order. + */ + if (fscb_od && osduld_device_same(fscb_od, &odi)) { + sbi->layout.s_ods[i] = fscb_od; + ++sbi->layout.s_numdevs; + fscb_od = NULL; + continue; + } + + od = osduld_info_lookup(&odi); + if (IS_ERR(od)) { + ret = PTR_ERR(od); + EXOFS_ERR("ERROR: device requested is not found " + "osd_name-%s =>%d\n", odi.osdname, ret); + goto out; + } + + sbi->layout.s_ods[i] = od; + ++sbi->layout.s_numdevs; + + /* Read the fscb of the other devices to make sure the FS + * partition is there. + */ + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + sizeof(fscb)); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: Malformed participating device " + "error reading fscb osd_name-%s\n", + odi.osdname); + goto out; + } + + /* TODO: verify other information is correct and FS-uuid + * matches. Benny what did you say about device table + * generation and old devices? 
+ */ + } + +out: + kfree(dt); + if (unlikely(!ret && fscb_od)) { + EXOFS_ERR( + "ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + ret = -EINVAL; + } + + return ret; +} + +/* + * Read the superblock from the OSD and fill in the fields + */ +static int exofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root; + struct exofs_mountopt *opts = data; + struct exofs_sb_info *sbi; /*extended info */ + struct osd_dev *od; /* Master device */ + struct exofs_fscb fscb; /*on-disk superblock info */ + struct osd_obj_id obj; + unsigned table_count; + int ret; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) + goto free_bdi; + + /* use mount options to fill superblock */ + if (opts->is_osdname) { + struct osd_dev_info odi = {.systemid_len = 0}; + + odi.osdname_len = strlen(opts->dev_name); + odi.osdname = (u8 *)opts->dev_name; + od = osduld_info_lookup(&odi); + } else { + od = osduld_path_lookup(opts->dev_name); + } + if (IS_ERR(od)) { + ret = -EINVAL; + goto free_sbi; + } + + /* Default layout in case we do not have a device-table */ + sbi->layout.stripe_unit = PAGE_SIZE; + sbi->layout.mirrors_p1 = 1; + sbi->layout.group_width = 1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + sbi->layout.s_ods[0] = od; + sbi->layout.s_numdevs = 1; + sbi->layout.s_pid = opts->pid; + sbi->s_timeout = opts->timeout; + + /* fill in some other data by hand */ + memset(sb->s_id, 0, sizeof(sb->s_id)); + strcpy(sb->s_id, "exofs"); + sb->s_blocksize = EXOFS_BLKSIZE; + sb->s_blocksize_bits = EXOFS_BLKSHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_set(&sbi->s_curr_pending, 0); + sb->s_bdev = NULL; + sb->s_dev = 0; + + obj.partition = sbi->layout.s_pid; + obj.id = EXOFS_SUPER_ID; + exofs_make_credential(sbi->s_cred, &obj); + + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) + goto free_sbi; + + sb->s_magic = le16_to_cpu(fscb.s_magic); + /* NOTE: we read below to be backward compatible with old versions */ + sbi->s_nextid = le64_to_cpu(fscb.s_nextid); + sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); + + /* make sure what we read from the object store is correct */ + if (sb->s_magic != EXOFS_SUPER_MAGIC) { + if (!silent) + EXOFS_ERR("ERROR: Bad magic value\n"); + ret = -EINVAL; + goto free_sbi; + } + if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { + EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", + EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); + ret = -EINVAL; + goto free_sbi; + } + + /* start generation numbers from a random point */ + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + table_count = le64_to_cpu(fscb.s_dev_table_count); + if (table_count) { + ret = exofs_read_lookup_dev_table(&sbi, table_count); + if (unlikely(ret)) + goto free_sbi; + } + + __sbi_read_stats(sbi); + + /* set up operation vectors */ + sbi->bdi.ra_pages = __ra_pages(&sbi->layout); + sb->s_bdi = &sbi->bdi; + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); + ret = PTR_ERR(root); + goto free_sbi; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + EXOFS_ERR("ERROR: get root inode failed\n"); + ret = -ENOMEM; + goto free_sbi; + } + + if (!S_ISDIR(root->i_mode)) { + 
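+		/* the root object must be a directory; anything else means the FS is corrupt */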
dput(sb->s_root); + sb->s_root = NULL; + EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", + root->i_mode); + ret = -EINVAL; + goto free_sbi; + } + + _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], + sbi->layout.s_pid); + if (opts->is_osdname) + kfree(opts->dev_name); + return 0; + +free_sbi: + bdi_destroy(&sbi->bdi); +free_bdi: + EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", + opts->dev_name, sbi->layout.s_pid, ret); + exofs_free_sbi(sbi); + if (opts->is_osdname) + kfree(opts->dev_name); + return ret; +} + +/* + * Set up the superblock (calls exofs_fill_super eventually) + */ +static struct dentry *exofs_mount(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + struct exofs_mountopt opts; + int ret; + + ret = parse_options(data, &opts); + if (ret) + return ERR_PTR(ret); + + if (!opts.dev_name) + opts.dev_name = dev_name; + return mount_nodev(type, flags, &opts, exofs_fill_super); +} + +/* + * Return information about the file system state in the buffer. This is used + * by the 'df' command, for example. + */ +static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct exofs_io_state *ios; + struct osd_attr attrs[] = { + ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, + OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), + ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, + OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), + }; + uint64_t capacity = ULLONG_MAX; + uint64_t used = ULLONG_MAX; + uint8_t cred_a[OSD_CAP_LEN]; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (ret) { + EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + return ret; + } + + exofs_make_credential(cred_a, &ios->obj); + ios->cred = sbi->s_cred; + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) + goto out; + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (likely(!ret)) { + capacity = get_unaligned_be64(attrs[0].val_ptr); + if (unlikely(!capacity)) + capacity = ULLONG_MAX; + } else + EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); + + ret = extract_attr_from_ios(ios, &attrs[1]); + if (likely(!ret)) + used = get_unaligned_be64(attrs[1].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); + + /* fill in the stats buffer */ + buf->f_type = EXOFS_SUPER_MAGIC; + buf->f_bsize = EXOFS_BLKSIZE; + buf->f_blocks = capacity >> 9; + buf->f_bfree = (capacity - used) >> 9; + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_numfiles; + buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; + buf->f_namelen = EXOFS_NAME_LEN; + +out: + exofs_put_io_state(ios); + return ret; +} + +static const struct super_operations exofs_sops = { + .alloc_inode = exofs_alloc_inode, + .destroy_inode = exofs_destroy_inode, + .write_inode = exofs_write_inode, + .evict_inode = exofs_evict_inode, + .put_super = exofs_put_super, + .write_super = exofs_write_super, + .sync_fs = exofs_sync_fs, + .statfs = exofs_statfs, +}; + +/****************************************************************************** + * EXPORT OPERATIONS + *****************************************************************************/ + +struct dentry *exofs_get_parent(struct dentry *child) +{ + unsigned long ino = exofs_parent_ino(child); + + if (!ino) + return ERR_PTR(-ESTALE); + + return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); +} + +static struct inode *exofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 
generation) +{ + struct inode *inode; + + inode = exofs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *exofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static struct dentry *exofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static const struct export_operations exofs_export_ops = { + .fh_to_dentry = exofs_fh_to_dentry, + .fh_to_parent = exofs_fh_to_parent, + .get_parent = exofs_get_parent, +}; + +/****************************************************************************** + * INSMOD/RMMOD + *****************************************************************************/ + +/* + * struct that describes this file system + */ +static struct file_system_type exofs_type = { + .owner = THIS_MODULE, + .name = "exofs", + .mount = exofs_mount, + .kill_sb = generic_shutdown_super, +}; + +static int __init init_exofs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out; + + err = register_filesystem(&exofs_type); + if (err) + goto out_d; + + return 0; +out_d: + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_exofs(void) +{ + unregister_filesystem(&exofs_type); + destroy_inodecache(); +} + +MODULE_AUTHOR("Avishay Traeger "); +MODULE_DESCRIPTION("exofs"); +MODULE_LICENSE("GPL"); + +module_init(init_exofs) +module_exit(exit_exofs) diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c new file mode 100644 index 00000000..4dd687c3 --- /dev/null +++ b/fs/exofs/symlink.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+	nd_set_link(nd, (char *)oi->i_data);
+	return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = exofs_follow_link,
+};
diff --git a/fs/exportfs/Makefile b/fs/exportfs/Makefile
new file mode 100644
index 00000000..d7c5d4dd
--- /dev/null
+++ b/fs/exportfs/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the filesystem export support routines.
+
+obj-$(CONFIG_EXPORTFS) += exportfs.o
+
+exportfs-objs := expfs.o
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
new file mode 100644
index 00000000..b05acb79
--- /dev/null
+++ b/fs/exportfs/expfs.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) Neil Brown 2002
+ * Copyright (C) Christoph Hellwig 2007
+ *
+ * This file contains the code mapping from inodes to NFS file handles,
+ * and for mapping back from file handles to dentries.
+ *
+ * For details on why we do all the strange and hairy things in here
+ * take a look at Documentation/filesystems/nfs/Exporting.
+ */
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#define dprintk(fmt, args...) do{}while(0)
+
+
+static int get_name(struct vfsmount *mnt, struct dentry *dentry, char *name,
+		struct dentry *child);
+
+
+static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir,
+		char *name, struct dentry *child)
+{
+	const struct export_operations *nop = dir->d_sb->s_export_op;
+
+	if (nop->get_name)
+		return nop->get_name(dir, name, child);
+	else
+		return get_name(mnt, dir, name, child);
+}
+
+/*
+ * Check if the dentry or any of its aliases is acceptable.
+ */
+static struct dentry *
+find_acceptable_alias(struct dentry *result,
+		int (*acceptable)(void *context, struct dentry *dentry),
+		void *context)
+{
+	struct dentry *dentry, *toput = NULL;
+	struct inode *inode;
+
+	if (acceptable(context, result))
+		return result;
+
+	inode = result->d_inode;
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		dget(dentry);
+		spin_unlock(&inode->i_lock);
+		if (toput)
+			dput(toput);
+		if (dentry != result && acceptable(context, dentry)) {
+			dput(result);
+			return dentry;
+		}
+		spin_lock(&inode->i_lock);
+		toput = dentry;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (toput)
+		dput(toput);
+	return NULL;
+}
+
+/*
+ * Find root of a disconnected subtree and return a reference to it.
+ */
+static struct dentry *
+find_disconnected_root(struct dentry *dentry)
+{
+	dget(dentry);
+	while (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
+
+		if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
+			dput(parent);
+			break;
+		}
+
+		dput(dentry);
+		dentry = parent;
+	}
+	return dentry;
+}
+
+/*
+ * Make sure target_dir is fully connected to the dentry tree.
+ *
+ * It may already be, as the flag isn't always updated when connection happens.
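+ *
+ * In outline, the loop below repeatedly takes the root of the still
+ * disconnected subtree, asks the filesystem for its parent (->get_parent)
+ * and for the child's name within that parent (exportfs_get_name), then
+ * looks that name up under the parent's i_mutex to splice the subtree into
+ * the dcache, until target_dir no longer has DCACHE_DISCONNECTED set.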
+ */ +static int +reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) +{ + int noprogress = 0; + int err = -ESTALE; + + /* + * It is possible that a confused file system might not let us complete + * the path to the root. For example, if get_parent returns a directory + * in which we cannot find a name for the child. While this implies a + * very sick filesystem we don't want it to cause knfsd to spin. Hence + * the noprogress counter. If we go through the loop 10 times (2 is + * probably enough) without getting anywhere, we just give up + */ + while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { + struct dentry *pd = find_disconnected_root(target_dir); + + if (!IS_ROOT(pd)) { + /* must have found a connected parent - great */ + spin_lock(&pd->d_lock); + pd->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&pd->d_lock); + noprogress = 0; + } else if (pd == mnt->mnt_sb->s_root) { + printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n"); + spin_lock(&pd->d_lock); + pd->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&pd->d_lock); + noprogress = 0; + } else { + /* + * We have hit the top of a disconnected path, try to + * find parent and connect. + * + * Racing with some other process renaming a directory + * isn't much of a problem here. If someone renames + * the directory, it will end up properly connected, + * which is what we want + * + * Getting the parent can't be supported generically, + * the locking is too icky. + * + * Instead we just return EACCES. If server reboots + * or inodes get flushed, you lose + */ + struct dentry *ppd = ERR_PTR(-EACCES); + struct dentry *npd; + + mutex_lock(&pd->d_inode->i_mutex); + if (mnt->mnt_sb->s_export_op->get_parent) + ppd = mnt->mnt_sb->s_export_op->get_parent(pd); + mutex_unlock(&pd->d_inode->i_mutex); + + if (IS_ERR(ppd)) { + err = PTR_ERR(ppd); + dprintk("%s: get_parent of %ld failed, err %d\n", + __func__, pd->d_inode->i_ino, err); + dput(pd); + break; + } + + dprintk("%s: find name of %lu in %lu\n", __func__, + pd->d_inode->i_ino, ppd->d_inode->i_ino); + err = exportfs_get_name(mnt, ppd, nbuf, pd); + if (err) { + dput(ppd); + dput(pd); + if (err == -ENOENT) + /* some race between get_parent and + * get_name? just try again + */ + continue; + break; + } + dprintk("%s: found name: %s\n", __func__, nbuf); + mutex_lock(&ppd->d_inode->i_mutex); + npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); + mutex_unlock(&ppd->d_inode->i_mutex); + if (IS_ERR(npd)) { + err = PTR_ERR(npd); + dprintk("%s: lookup failed: %d\n", + __func__, err); + dput(ppd); + dput(pd); + break; + } + /* we didn't really want npd, we really wanted + * a side-effect of the lookup. + * hopefully, npd == pd, though it isn't really + * a problem if it isn't + */ + if (npd == pd) + noprogress = 0; + else + printk("%s: npd != pd\n", __func__); + dput(npd); + dput(ppd); + if (IS_ROOT(pd)) { + /* something went wrong, we have to give up */ + dput(pd); + break; + } + } + dput(pd); + } + + if (target_dir->d_flags & DCACHE_DISCONNECTED) { + /* something went wrong - oh-well */ + if (!err) + err = -ESTALE; + return err; + } + + return 0; +} + +struct getdents_callback { + char *name; /* name that was found. It already points to a + buffer NAME_MAX+1 is size */ + unsigned long ino; /* the inum we are looking for */ + int found; /* inode matched? */ + int sequence; /* sequence counter */ +}; + +/* + * A rather strange filldir function to capture + * the name matching the specified inode number. 
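+ *
+ * It returns a non-zero value once the matching name has been copied so
+ * that vfs_readdir() stops the walk early; callers then check buf->found
+ * rather than the return value.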
+ */ +static int filldir_one(void * __buf, const char * name, int len, + loff_t pos, u64 ino, unsigned int d_type) +{ + struct getdents_callback *buf = __buf; + int result = 0; + + buf->sequence++; + if (buf->ino == ino) { + memcpy(buf->name, name, len); + buf->name[len] = '\0'; + buf->found = 1; + result = -1; + } + return result; +} + +/** + * get_name - default export_operations->get_name function + * @dentry: the directory in which to find a name + * @name: a pointer to a %NAME_MAX+1 char buffer to store the name + * @child: the dentry for the child directory. + * + * calls readdir on the parent until it finds an entry with + * the same inode number as the child, and returns that. + */ +static int get_name(struct vfsmount *mnt, struct dentry *dentry, + char *name, struct dentry *child) +{ + const struct cred *cred = current_cred(); + struct inode *dir = dentry->d_inode; + int error; + struct file *file; + struct getdents_callback buffer; + + error = -ENOTDIR; + if (!dir || !S_ISDIR(dir->i_mode)) + goto out; + error = -EINVAL; + if (!dir->i_fop) + goto out; + /* + * Open the directory ... + */ + file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY, cred); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -EINVAL; + if (!file->f_op->readdir) + goto out_close; + + buffer.name = name; + buffer.ino = child->d_inode->i_ino; + buffer.found = 0; + buffer.sequence = 0; + while (1) { + int old_seq = buffer.sequence; + + error = vfs_readdir(file, filldir_one, &buffer); + if (buffer.found) { + error = 0; + break; + } + + if (error < 0) + break; + + error = -ENOENT; + if (old_seq == buffer.sequence) + break; + } + +out_close: + fput(file); +out: + return error; +} + +/** + * export_encode_fh - default export_operations->encode_fh function + * @dentry: the dentry to encode + * @fh: where to store the file handle fragment + * @max_len: maximum length to store there + * @connectable: whether to store parent information + * + * This default encode_fh function assumes that the 32 inode number + * is suitable for locating an inode, and that the generation number + * can be used to check that it is still valid. It places them in the + * filehandle fragment where export_decode_fh expects to find them. 
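+ *
+ * The resulting fragment is two 32-bit words (ino, gen) for FILEID_INO32_GEN,
+ * or four words (ino, gen, parent_ino, parent_gen) for
+ * FILEID_INO32_GEN_PARENT when a connectable handle is requested for a
+ * non-directory.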
+ */ +static int export_encode_fh(struct dentry *dentry, struct fid *fid, + int *max_len, int connectable) +{ + struct inode * inode = dentry->d_inode; + int len = *max_len; + int type = FILEID_INO32_GEN; + + if (connectable && (len < 4)) { + *max_len = 4; + return 255; + } else if (len < 2) { + *max_len = 2; + return 255; + } + + len = 2; + fid->i32.ino = inode->i_ino; + fid->i32.gen = inode->i_generation; + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + fid->i32.parent_ino = parent->i_ino; + fid->i32.parent_gen = parent->i_generation; + spin_unlock(&dentry->d_lock); + len = 4; + type = FILEID_INO32_GEN_PARENT; + } + *max_len = len; + return type; +} + +int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, + int connectable) +{ + const struct export_operations *nop = dentry->d_sb->s_export_op; + int error; + + if (nop->encode_fh) + error = nop->encode_fh(dentry, fid->raw, max_len, connectable); + else + error = export_encode_fh(dentry, fid, max_len, connectable); + + return error; +} +EXPORT_SYMBOL_GPL(exportfs_encode_fh); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, + int fh_len, int fileid_type, + int (*acceptable)(void *, struct dentry *), void *context) +{ + const struct export_operations *nop = mnt->mnt_sb->s_export_op; + struct dentry *result, *alias; + char nbuf[NAME_MAX+1]; + int err; + + /* + * Try to get any dentry for the given file handle from the filesystem. + */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); + result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); + if (!result) + result = ERR_PTR(-ESTALE); + if (IS_ERR(result)) + return result; + + if (S_ISDIR(result->d_inode->i_mode)) { + /* + * This request is for a directory. + * + * On the positive side there is only one dentry for each + * directory inode. On the negative side this implies that we + * to ensure our dentry is connected all the way up to the + * filesystem root. + */ + if (result->d_flags & DCACHE_DISCONNECTED) { + err = reconnect_path(mnt, result, nbuf); + if (err) + goto err_result; + } + + if (!acceptable(context, result)) { + err = -EACCES; + goto err_result; + } + + return result; + } else { + /* + * It's not a directory. Life is a little more complicated. + */ + struct dentry *target_dir, *nresult; + + /* + * See if either the dentry we just got from the filesystem + * or any alias for it is acceptable. This is always true + * if this filesystem is exported without the subtreecheck + * option. If the filesystem is exported with the subtree + * check option there's a fair chance we need to look at + * the parent directory in the file handle and make sure + * it's connected to the filesystem root. + */ + alias = find_acceptable_alias(result, acceptable, context); + if (alias) + return alias; + + /* + * Try to extract a dentry for the parent directory from the + * file handle. If this fails we'll have to give up. + */ + err = -ESTALE; + if (!nop->fh_to_parent) + goto err_result; + + target_dir = nop->fh_to_parent(mnt->mnt_sb, fid, + fh_len, fileid_type); + if (!target_dir) + goto err_result; + err = PTR_ERR(target_dir); + if (IS_ERR(target_dir)) + goto err_result; + + /* + * And as usual we need to make sure the parent directory is + * connected to the filesystem root. The VFS really doesn't + * like disconnected directories.. 
+ */ + err = reconnect_path(mnt, target_dir, nbuf); + if (err) { + dput(target_dir); + goto err_result; + } + + /* + * Now that we've got both a well-connected parent and a + * dentry for the inode we're after, make sure that our + * inode is actually connected to the parent. + */ + err = exportfs_get_name(mnt, target_dir, nbuf, result); + if (!err) { + mutex_lock(&target_dir->d_inode->i_mutex); + nresult = lookup_one_len(nbuf, target_dir, + strlen(nbuf)); + mutex_unlock(&target_dir->d_inode->i_mutex); + if (!IS_ERR(nresult)) { + if (nresult->d_inode) { + dput(result); + result = nresult; + } else + dput(nresult); + } + } + + /* + * At this point we are done with the parent, but it's pinned + * by the child dentry anyway. + */ + dput(target_dir); + + /* + * And finally make sure the dentry is actually acceptable + * to NFSD. + */ + alias = find_acceptable_alias(result, acceptable, context); + if (!alias) { + err = -EACCES; + goto err_result; + } + + return alias; + } + + err_result: + dput(result); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(exportfs_decode_fh); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig new file mode 100644 index 00000000..14a6780f --- /dev/null +++ b/fs/ext2/Kconfig @@ -0,0 +1,55 @@ +config EXT2_FS + tristate "Second extended fs support" + help + Ext2 is a standard Linux file system for hard disks. + + To compile this file system support as a module, choose M here: the + module will be called ext2. + + If unsure, say Y. + +config EXT2_FS_XATTR + bool "Ext2 extended attributes" + depends on EXT2_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config EXT2_FS_POSIX_ACL + bool "Ext2 POSIX Access Control Lists" + depends on EXT2_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT2_FS_SECURITY + bool "Ext2 Security Labels" + depends on EXT2_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT2_FS_XIP + bool "Ext2 execute in place support" + depends on EXT2_FS && MMU + help + Execute in place can be used on memory-backed block devices. If you + enable this option, you can select to mount block devices which are + capable of this feature without using the page cache. + + If you do not use a block device that is capable of using this, + or if unsure, say N. diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile new file mode 100644 index 00000000..f42af45c --- /dev/null +++ b/fs/ext2/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the linux ext2-filesystem routines. 
+# + +obj-$(CONFIG_EXT2_FS) += ext2.o + +ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + +ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o +ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o +ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c new file mode 100644 index 00000000..abea5a17 --- /dev/null +++ b/fs/ext2/acl.c @@ -0,0 +1,445 @@ +/* + * linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext2_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext2_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext2_acl_header *)value)->a_version != + cpu_to_le32(EXT2_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext2_acl_header); + count = ext2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext2_acl_entry *entry = + (ext2_acl_entry *)value; + if ((char *)value + sizeof(ext2_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext2_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext2_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
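+ *
+ * The on-disk format written here is an ext2_acl_header (version word)
+ * followed by one entry per ACL entry; ACL_USER and ACL_GROUP entries carry
+ * an explicit e_id, while the other tag types use the short form without it.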
+ */ +static void * +ext2_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext2_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext2_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count * + sizeof(ext2_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext2_acl_header); + for (n=0; n < acl->a_count; n++) { + ext2_acl_entry *entry = (ext2_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext2_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext2_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext2_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext2_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext2_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext2_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * inode->i_mutex: down + */ +static int +ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext2_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +int +ext2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +/* + * Initialize the ACLs of a new inode. Called from ext2_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext2_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext2_set_acl(inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. 
+ * + * inode->i_mutex: down + */ +int +ext2_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + error = ext2_set_acl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + return error; +} + +/* + * Extended attribut handlers + */ +static size_t +ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext2_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(dentry->d_inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + + error = ext2_set_acl(dentry->d_inode, type, acl); + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler ext2_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ext2_xattr_list_acl_access, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, +}; + +const struct xattr_handler ext2_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ext2_xattr_list_acl_default, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, +}; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h new file mode 100644 index 00000000..c939b7b1 --- /dev/null +++ b/fs/ext2/acl.h @@ -0,0 +1,78 @@ +/* + File: fs/ext2/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT2_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext2_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext2_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext2_acl_header; + +static inline size_t ext2_acl_size(int count) +{ + if (count <= 4) { + return 
sizeof(ext2_acl_header) +
+		       count * sizeof(ext2_acl_entry_short);
+	} else {
+		return sizeof(ext2_acl_header) +
+		       4 * sizeof(ext2_acl_entry_short) +
+		       (count - 4) * sizeof(ext2_acl_entry);
+	}
+}
+
+static inline int ext2_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext2_acl_header);
+	s = size - 4 * sizeof(ext2_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext2_acl_entry_short))
+			return -1;
+		return size / sizeof(ext2_acl_entry_short);
+	} else {
+		if (s % sizeof(ext2_acl_entry))
+			return -1;
+		return s / sizeof(ext2_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+
+/* acl.c */
+extern int ext2_check_acl (struct inode *, int, unsigned int);
+extern int ext2_acl_chmod (struct inode *);
+extern int ext2_init_acl (struct inode *, struct inode *);
+
+#else
+#include <linux/posix_acl.h>
+#define ext2_check_acl NULL
+#define ext2_get_acl NULL
+#define ext2_set_acl NULL
+
+static inline int
+ext2_acl_chmod (struct inode *inode)
+{
+	return 0;
+}
+
+static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
+
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
new file mode 100644
index 00000000..8f44cef1
--- /dev/null
+++ b/fs/ext2/balloc.c
@@ -0,0 +1,1555 @@
+/*
+ * linux/fs/ext2/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ *       David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include "ext2.h"
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/buffer_head.h>
+#include <linux/capability.h>
+
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext2_fill_super).
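+ *
+ * Worked example (hypothetical 4 KiB block size, not a value taken from this
+ * file): 4096 / sizeof(struct ext2_group_desc) = 128 descriptors fit in one
+ * block, so block group 300 is found in descriptor block 300 >> 7 = 2 at
+ * offset 300 & 127 = 44, exactly the computation done in
+ * ext2_get_group_desc() below.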
+ */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext2_group_desc * desc; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext2_error (sb, "ext2_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + + group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext2_error (sb, "ext2_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +static int ext2_valid_block_bitmap(struct super_block *sb, + struct ext2_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext2_grpblk_t offset; + ext2_grpblk_t next_zero_bit; + ext2_fsblk_t bitmap_blk; + ext2_fsblk_t group_first_block; + + group_first_block = ext2_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext2_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext2_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_table); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext2_find_next_zero_bit(bh->b_data, + offset + EXT2_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT2_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext2_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %lu", + block_group, bitmap_blk); + return 0; +} + +/* + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext2_group_desc * desc; + struct buffer_head * bh = NULL; + ext2_fsblk_t bitmap_blk; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext2_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + if (likely(bh_uptodate_or_lock(bh))) + return bh; + + if (bh_submit_read(bh) < 0) { + brelse(bh); + ext2_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + + ext2_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ + return bh; +} + +static void release_blocks(struct super_block *sb, int count) +{ + if (count) { + struct ext2_sb_info *sbi = EXT2_SB(sb); + + percpu_counter_add(&sbi->s_freeblocks_counter, count); + sb->s_dirt = 1; + } +} + +static void group_adjust_blocks(struct super_block *sb, int group_no, + struct ext2_group_desc *desc, struct buffer_head *bh, int count) +{ + if (count) { + struct ext2_sb_info *sbi = EXT2_SB(sb); + unsigned free_blocks; + + spin_lock(sb_bgl_lock(sbi, group_no)); + free_blocks = le16_to_cpu(desc->bg_free_blocks_count); + desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); + spin_unlock(sb_bgl_lock(sbi, group_no)); + sb->s_dirt = 1; + mark_buffer_dirty(bh); + } +} + +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. 
+ */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext2_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + BUG_ON(bad); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __func__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext2_fsblk_t group_first_block, group_last_block; + + group_first_block = ext2_group_first_block_no(sb, group); + group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1; + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext2_reserve_window_node * +search_reserve_window(struct rb_root *root, ext2_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext2_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + } + return rsv; +} + +/* + * ext2_rsv_window_add() -- Insert a window to the block reservation rb tree. 
+ * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock held. + */ +void ext2_rsv_window_add(struct super_block *sb, + struct ext2_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT2_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext2_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext2_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext2_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock held. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext2_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT2_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT2_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext2_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return (rsv->_rsv_end == EXT2_RESERVE_WINDOW_NOT_ALLOCATED); +} + +/** + * ext2_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext2 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext2 inode the first time the open file + * needs a new block. So, before every ext2_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext2_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to calling this function. 
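/*
 * A userspace sketch of the lookup rule implemented by
 * search_reserve_window() above: return the window that contains the goal
 * block, or the closest window that starts before it.  A sorted array
 * stands in for the per-filesystem red-black tree, and the window values
 * are invented example data.
 */
#include <stdio.h>

struct win { unsigned long start, end; };

static const struct win *find_window(const struct win *w, int n,
				     unsigned long goal)
{
	const struct win *best = NULL;

	for (int i = 0; i < n; i++) {
		if (w[i].start <= goal && goal <= w[i].end)
			return &w[i];		/* goal inside this window */
		if (w[i].start < goal)
			best = &w[i];		/* best window entirely before the goal */
	}
	return best;				/* NULL: no window at or before the goal */
}

int main(void)
{
	struct win w[] = { {100, 107}, {200, 231}, {512, 600} };
	unsigned long goals[] = { 50, 105, 199, 300 };

	for (int i = 0; i < 4; i++) {
		const struct win *r = find_window(w, 3, goals[i]);
		if (r)
			printf("goal %lu -> window [%lu,%lu]\n",
			       goals[i], r->start, r->end);
		else
			printf("goal %lu -> no window at or before goal\n", goals[i]);
	}
	return 0;
}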
+ */ +void ext2_init_block_alloc_info(struct inode *inode) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT2_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext2_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext2_release_file(): last writer closes the file + * ext2_clear_inode(): last iput(), when nobody links to this file. + * ext2_truncate(): when the block indirect map is about to change. + */ +void ext2_discard_reservation(struct inode *inode) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT2_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/** + * ext2_free_blocks_sb() -- Free given blocks and update quota and i_blocks + * @inode: inode + * @block: start physcial block to free + * @count: number of blocks to free + */ +void ext2_free_blocks (struct inode * inode, unsigned long block, + unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head * bh2; + unsigned long block_group; + unsigned long bit; + unsigned long i; + unsigned long overflow; + struct super_block * sb = inode->i_sb; + struct ext2_sb_info * sbi = EXT2_SB(sb); + struct ext2_group_desc * desc; + struct ext2_super_block * es = sbi->s_es; + unsigned freed = 0, group_freed; + + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext2_error (sb, "ext2_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext2_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT2_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT2_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
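/*
 * A standalone sketch of the arithmetic at the top of ext2_free_blocks():
 * a filesystem-wide block number is split into (group, bit-in-bitmap), and
 * a range that crosses a group boundary is trimmed so the remainder is
 * handled in a second pass.  first_data_block = 1 and 8192 blocks per
 * group are assumed example values (typical for 1 KiB blocks).
 */
#include <stdio.h>

int main(void)
{
	unsigned long first_data_block = 1;	/* assumed */
	unsigned long blocks_per_group = 8192;	/* assumed */
	unsigned long block = 8000, count = 400;/* range crossing a group boundary */

	while (count) {
		unsigned long group = (block - first_data_block) / blocks_per_group;
		unsigned long bit   = (block - first_data_block) % blocks_per_group;
		unsigned long overflow = 0;

		/* same boundary test as in ext2_free_blocks() */
		if (bit + count > blocks_per_group)
			overflow = bit + count - blocks_per_group;

		printf("free %lu block(s) in group %lu starting at bit %lu\n",
		       count - overflow, group, bit);

		block += count - overflow;	/* continue with the spill-over, if any */
		count = overflow;
	}
	return 0;
}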
+ */ + if (bit + count > EXT2_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT2_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + desc = ext2_get_group_desc (sb, block_group, &bh2); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) { + ext2_error (sb, "ext2_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + goto error_return; + } + + for (i = 0, group_freed = 0; i < count; i++) { + if (!ext2_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + ext2_error(sb, __func__, + "bit already cleared for block %lu", block + i); + } else { + group_freed++; + } + } + + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + group_adjust_blocks(sb, block_group, desc, bh2, group_freed); + freed += group_freed; + + if (overflow) { + block += count; + count = overflow; + goto do_more; + } +error_return: + brelse(bitmap_bh); + release_blocks(sb, freed); + dquot_free_block_nodirty(inode, freed); +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward through the actual bitmap on disk until + * we find a bit free. + */ +static ext2_grpblk_t +bitmap_search_next_usable_block(ext2_grpblk_t start, struct buffer_head *bh, + ext2_grpblk_t maxblocks) +{ + ext2_grpblk_t next; + + next = ext2_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + return next; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; + * then for any free bit in the bitmap. + */ +static ext2_grpblk_t +find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) +{ + ext2_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT2_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. 
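/*
 * A simplified userspace sketch of the search order described above for
 * find_next_usable_block(): look near the goal first (up to the next
 * 64-bit boundary), then scan for a wholly free byte, then fall back to
 * any free bit.  The bitmap contents below are invented; 1 = in use.
 */
#include <stdio.h>
#include <string.h>

static int test_bit(const unsigned char *map, int nr)
{
	return (map[nr >> 3] >> (nr & 7)) & 1;
}

static int find_free(const unsigned char *map, int start, int maxblocks)
{
	/* 1: near the goal, up to the next 64-bit boundary */
	int end_goal = (start + 63) & ~63;
	if (end_goal > maxblocks)
		end_goal = maxblocks;
	for (int i = start; i < end_goal; i++)
		if (!test_bit(map, i))
			return i;

	/* 2: first completely free byte (8 free blocks in a row) */
	for (int byte = start >> 3; byte < (maxblocks + 7) >> 3; byte++)
		if (map[byte] == 0)
			return byte << 3;

	/* 3: any free bit at all */
	for (int i = start; i < maxblocks; i++)
		if (!test_bit(map, i))
			return i;
	return -1;
}

int main(void)
{
	unsigned char map[16];

	memset(map, 0xff, sizeof(map));	/* everything allocated ... */
	map[10] = 0;			/* ... except blocks 80-87 */

	printf("first usable block from goal 5: %d\n", find_free(map, 5, 128));
	return 0;
}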
+ */ + ext2_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext2_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal) + return here; + ext2_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = ((char *)bh->b_data) + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - ((char *)bh->b_data)) << 3; + + if (next < maxblocks && next >= here) + return next; + + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * ext2_try_to_allocate() + * @sb: superblock + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) + * from the file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, + * ends at the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. + */ +static int +ext2_try_to_allocate(struct super_block *sb, int group, + struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal, + unsigned long *count, + struct ext2_reserve_window *my_rsv) +{ + ext2_fsblk_t group_first_block; + ext2_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext2_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT2_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT2_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT2_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + !ext2_test_bit(grp_goal - 1, + bitmap_bh->b_data); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, + bitmap_bh->b_data)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), + grp_goal, bitmap_bh->b_data)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable space within the given range. 
+ * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @size: the target new reservation window size + * + * @group_first_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext2_reserve_window_node *search_head, + struct ext2_reserve_window_node *my_rsv, + struct super_block * sb, + ext2_fsblk_t start_block, + ext2_fsblk_t last_block) +{ + struct rb_node *next; + struct ext2_reserve_window_node *rsv, *prev; + ext2_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext2_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole available window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. 
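/*
 * A compact sketch of the gap search performed by
 * find_next_reservable_window() above: starting from a goal block, walk
 * the existing (sorted, non-overlapping) windows and return the first
 * point where a new window of the requested size fits.  The window list
 * and block numbers are invented example data.
 */
#include <stdio.h>

struct win { unsigned long start, end; };

static long find_gap(const struct win *w, int n, unsigned long cur,
		     unsigned long size, unsigned long last_block)
{
	for (int i = 0; i < n; i++) {
		if (cur <= w[i].end)
			cur = w[i].end + 1;	/* step past an overlapping window */
		if (cur > last_block)
			return -1;		/* ran off the end of the group */
		if (i + 1 == n || cur + size <= w[i + 1].start)
			return (long)cur;	/* gap after w[i] is big enough */
	}
	return cur > last_block ? -1 : (long)cur;	/* no windows at all */
}

int main(void)
{
	struct win w[] = { {100, 131}, {140, 147}, {400, 463} };

	/* ask for an 8-block window starting no earlier than block 100 */
	printf("new window could start at block %ld\n",
	       find_gap(w, 3, 100, 8, 8191));
	return 0;
}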
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext2_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @rsv: the reservation + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a goal(goal >0 ), then start from there, + * no goal(goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, + ext2_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext2_reserve_window_node *search_head; + ext2_fsblk_t group_first_block, group_end_block, start_block; + ext2_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT2_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; + + group_first_block = ext2_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. 
+ */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT2_MAX_RESERVE_BLOCKS) + size = EXT2_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * Search the first free bit on the block bitmap. Search starts from + * the start block of the reservable space we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + return 0; /* success */ + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext2_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext2_new_blocks() tries to allocate blocks. 
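/*
 * A toy illustration of the sizing heuristic quoted above from
 * alloc_new_reservation(): when more than half of the previous window was
 * actually used, the next window size is doubled, up to a cap.  The cap
 * value used here (1027) is an assumption standing in for
 * EXT2_MAX_RESERVE_BLOCKS.
 */
#include <stdio.h>

#define MAX_RESERVE_BLOCKS 1027	/* assumed cap for the example */

static unsigned next_window_size(unsigned cur_size, unsigned window_len,
				 unsigned alloc_hit)
{
	if (alloc_hit > window_len / 2) {	/* used more than half: grow */
		cur_size *= 2;
		if (cur_size > MAX_RESERVE_BLOCKS)
			cur_size = MAX_RESERVE_BLOCKS;
	}
	return cur_size;
}

int main(void)
{
	/* 8-block goal size, 8-block old window, 6 blocks allocated from it */
	printf("next goal size: %u\n", next_window_size(8, 8, 6));
	/* same window, only 2 blocks used: size stays the same */
	printf("next goal size: %u\n", next_window_size(8, 8, 2));
	return 0;
}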
+ */ +static void try_to_extend_reservation(struct ext2_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext2_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext2_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext2_try_to_allocate_with_rsv() + * @sb: superblock + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + */ +static ext2_grpblk_t +ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, + struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal, + struct ext2_reserve_window_node * my_rsv, + unsigned long *count) +{ + ext2_fsblk_t group_first_block, group_last_block; + ext2_grpblk_t ret = 0; + unsigned long num = *count; + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL) { + return ext2_try_to_allocate(sb, group, bitmap_bh, + grp_goal, count, NULL); + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT2_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext2_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. 
+ */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal, + &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } + return ret; +} + +/** + * ext2_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. + */ +static int ext2_has_free_blocks(struct ext2_sb_info *sbi) +{ + ext2_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current_fsuid() && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/* + * ext2_new_blocks() -- core block(s) allocation function + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext2_new_blocks uses a goal block to assist allocation. If the goal is + * free, or there is a free block within 32 blocks of the goal, that block + * is allocated. Otherwise a forward search is made for a free block; within + * each block group the search first looks for an entire free byte in the block + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, + unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext2_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext2_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ + ext2_fsblk_t ret_block; /* filesyetem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int performed_allocation = 0; + ext2_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; + struct ext2_sb_info *sbi; + struct ext2_reserve_window_node *my_rsv = NULL; + struct ext2_block_alloc_info *block_i; + unsigned short windowsz = 0; + unsigned long ngroups; + unsigned long num = *count; + int ret; + + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk("ext2_new_blocks: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block. 
+ */ + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; + return 0; + } + + sbi = EXT2_SB(sb); + es = EXT2_SB(sb)->s_es; + ext2_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation(default,-o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (One could use ioctl + * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off + * reservation on that particular file) + */ + block_i = EXT2_I(inode)->i_block_alloc_info; + if (block_i) { + windowsz = block_i->rsv_window_node.rsv_goal_size; + if (windowsz > 0) + my_rsv = &block_i->rsv_window_node; + } + + if (!ext2_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT2_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext2_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there is not enough free blocks to make a new resevation + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT2_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no, + bitmap_bh, grp_target_blk, + my_rsv, &num); + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT2_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * group_no and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext2_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (my_rsv && (free_blocks <= (windowsz/2))) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal(-1). + */ + grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no, + bitmap_bh, -1, my_rsv, &num); + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up a bogus earlier ENOSPC error due to + * filesystem is "full" of reservations, but + * there maybe indeed free blocks available on disk + * In this case, we just forget about the reservations + * just do block allocation as without reservations. 
+ */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext2_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + ret_block = grp_alloc_blk + ext2_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT2_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT2_SB(sb)->s_itb_per_group)) { + ext2_error(sb, "ext2_new_blocks", + "Allocating block in system zone - " + "blocks from "E2FSBLK", length %lu", + ret_block, num); + /* + * ext2_try_to_allocate marked the blocks we allocated as in + * use. So we may want to selectively mark some of the blocks + * as free + */ + goto retry_alloc; + } + + performed_allocation = 1; + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext2_error(sb, "ext2_new_blocks", + "block("E2FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + group_adjust_blocks(sb, group_no, gdp, gdp_bh, -num); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + *errp = 0; + brelse(bitmap_bh); + dquot_free_block_nodirty(inode, *count-num); + mark_inode_dirty(inode); + *count = num; + return ret_block; + +io_error: + *errp = -EIO; +out: + /* + * Undo the block allocation + */ + if (!performed_allocation) { + dquot_free_block_nodirty(inode, *count); + mark_inode_dirty(inode); + } + brelse(bitmap_bh); + return 0; +} + +ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp) +{ + unsigned long count = 1; + + return ext2_new_blocks(inode, goal, &count, errp); +} + +#ifdef EXT2FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} + +#endif /* EXT2FS_DEBUG */ + +unsigned long ext2_count_free_blocks (struct super_block * sb) +{ + struct ext2_group_desc * desc; + unsigned long desc_count = 0; + int i; +#ifdef EXT2FS_DEBUG + unsigned long bitmap_count, x; + struct ext2_super_block *es; + + es = EXT2_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + desc = NULL; + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + struct buffer_head *bitmap_bh; + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); + bitmap_bh = read_block_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext2_count_free(bitmap_bh, sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(desc->bg_free_blocks_count), x); + bitmap_count += x; + brelse(bitmap_bh); + } + printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n", + (long)le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += 
le16_to_cpu(desc->bg_free_blocks_count); + } + return desc_count; +#endif +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext2_group_sparse(int group) +{ + if (group <= 1) + return 1; + return (test_root(group, 3) || test_root(group, 5) || + test_root(group, 7)); +} + +/** + * ext2_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext2_bg_has_super(struct super_block *sb, int group) +{ + if (EXT2_HAS_RO_COMPAT_FEATURE(sb,EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext2_group_sparse(group)) + return 0; + return 1; +} + +/** + * ext2_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext2_bg_num_gdb(struct super_block *sb, int group) +{ + return ext2_bg_has_super(sb, group) ? EXT2_SB(sb)->s_gdb_count : 0; +} + diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c new file mode 100644 index 00000000..47cda410 --- /dev/null +++ b/fs/ext2/dir.c @@ -0,0 +1,733 @@ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include "ext2.h" +#include +#include +#include + +typedef struct ext2_dir_entry_2 ext2_dirent; + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext2_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT2_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext2_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT2_MAX_REC_LEN); + else + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + +/* + * ext2 uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned ext2_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void ext2_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. 
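/*
 * A standalone sketch of the sparse-superblock rule implemented by
 * test_root()/ext2_group_sparse()/ext2_bg_has_super() above: with the
 * SPARSE_SUPER feature, only groups 0, 1 and powers of 3, 5 and 7 carry a
 * superblock backup.
 */
#include <stdio.h>

static int is_power_of(int a, int b)
{
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
}

static int group_has_super(int group)
{
	if (group <= 1)
		return 1;
	return is_power_of(group, 3) || is_power_of(group, 5) ||
	       is_power_of(group, 7);
}

int main(void)
{
	printf("groups with a superblock backup, 0..50:");
	for (int g = 0; g <= 50; g++)
		if (group_has_super(g))
			printf(" %d", g);
	printf("\n");	/* expect 0 1 3 5 7 9 25 27 49 */
	return 0;
}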
+ */ +static unsigned +ext2_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + + if (IS_DIRSYNC(dir)) { + err = write_one_page(page, 1); + if (!err) + err = sync_inode_metadata(dir, 1); + } else { + unlock_page(page); + } + + return err; +} + +static void ext2_check_page(struct page *page, int quiet) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = ext2_chunk_size(dir); + char *kaddr = page_address(page); + u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + ext2_dirent *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) { + p = (ext2_dirent *)(kaddr + offs); + rec_len = ext2_rec_len_from_disk(p->rec_len); + + if (unlikely(rec_len < EXT2_DIR_REC_LEN(1))) + goto Eshort; + if (unlikely(rec_len & 3)) + goto Ealign; + if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len))) + goto Enamelen; + if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))) + goto Espan; + if (unlikely(le32_to_cpu(p->inode) > max_inumber)) + goto Einumber; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + if (!quiet) + ext2_error(sb, __func__, + "size of directory #%lu is not a multiple " + "of chunk size", dir->i_ino); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + if (!quiet) + ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<inode), + rec_len, p->name_len); + goto fail; +Eend: + if (!quiet) { + p = (ext2_dirent *)(kaddr + offs); + ext2_error(sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<inode)); + } +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page * ext2_get_page(struct inode *dir, unsigned long n, + int quiet) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + ext2_check_page(page, quiet); + if (PageError(page)) + goto fail; + } + return page; + +fail: + ext2_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, ext2_match returns 1 for success, 0 for failure. + * + * len <= EXT2_NAME_LEN and de != NULL are guaranteed by caller. 
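/*
 * A small userspace sketch of the rec_len encoding handled by
 * ext2_rec_len_from_disk()/ext2_rec_len_to_disk() above: a directory
 * chunk of 65536 bytes cannot be stored in the 16-bit rec_len field, so a
 * sentinel value stands in for it on disk when 64 KiB blocks are possible.
 * The sentinel used here (65535) is an assumption mirroring
 * EXT2_MAX_REC_LEN; byte-order conversion is omitted.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_REC_LEN 65535u	/* assumed sentinel */

static uint16_t rec_len_to_disk(unsigned len)
{
	if (len == (1u << 16))
		return MAX_REC_LEN;
	return (uint16_t)len;	/* caller guarantees len < 65536 otherwise */
}

static unsigned rec_len_from_disk(uint16_t dlen)
{
	if (dlen == MAX_REC_LEN)
		return 1u << 16;
	return dlen;
}

int main(void)
{
	unsigned lens[] = { 12, 4096, 1u << 16 };

	for (int i = 0; i < 3; i++)
		printf("%u -> on disk %u -> back to %u\n", lens[i],
		       rec_len_to_disk(lens[i]),
		       rec_len_from_disk(rec_len_to_disk(lens[i])));
	return 0;
}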
+ */ +static inline int ext2_match (int len, const char * const name, + struct ext2_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static inline ext2_dirent *ext2_next_entry(ext2_dirent *p) +{ + return (ext2_dirent *)((char *)p + + ext2_rec_len_from_disk(p->rec_len)); +} + +static inline unsigned +ext2_validate_entry(char *base, unsigned offset, unsigned mask) +{ + ext2_dirent *de = (ext2_dirent*)(base + offset); + ext2_dirent *p = (ext2_dirent*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->rec_len == 0) + break; + p = ext2_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { + [EXT2_FT_UNKNOWN] = DT_UNKNOWN, + [EXT2_FT_REG_FILE] = DT_REG, + [EXT2_FT_DIR] = DT_DIR, + [EXT2_FT_CHRDEV] = DT_CHR, + [EXT2_FT_BLKDEV] = DT_BLK, + [EXT2_FT_FIFO] = DT_FIFO, + [EXT2_FT_SOCK] = DT_SOCK, + [EXT2_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK, +}; + +static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + else + de->file_type = 0; +} + +static int +ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = filp->f_version != inode->i_version; + + if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) + return 0; + + if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) + types = ext2_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + ext2_dirent *de; + struct page *page = ext2_get_page(inode, n, 0); + + if (IS_ERR(page)) { + ext2_error(sb, __func__, + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ext2_validate_entry(kaddr, offset, chunk_mask); + filp->f_pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (ext2_dirent *)(kaddr+offset); + limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ext2_next_entry(de)) { + if (de->rec_len == 0) { + ext2_error(sb, __func__, + "zero-length directory entry"); + ext2_put_page(page); + return -EIO; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXT2_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<inode), d_type); + if (over) { + ext2_put_page(page); + return 0; + } + } + filp->f_pos += ext2_rec_len_from_disk(de->rec_len); + } + ext2_put_page(page); + } + 
return 0; +} + +/* + * ext2_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, + struct qstr *child, struct page ** res_page) +{ + const char *name = child->name; + int namelen = child->len; + unsigned reclen = EXT2_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct ext2_inode_info *ei = EXT2_I(dir); + ext2_dirent * de; + int dir_has_error = 0; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ext2_get_page(dir, n, dir_has_error); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (ext2_dirent *) kaddr; + kaddr += ext2_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + ext2_error(dir->i_sb, __func__, + "zero-length directory entry"); + ext2_put_page(page); + goto out; + } + if (ext2_match (namelen, name, de)) + goto found; + de = ext2_next_entry(de); + } + ext2_put_page(page); + } else + dir_has_error = 1; + + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + ext2_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = ext2_get_page(dir, 0, 0); + ext2_dirent *de = NULL; + + if (!IS_ERR(page)) { + de = ext2_next_entry((ext2_dirent *) page_address(page)); + *p = page; + } + return de; +} + +ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) +{ + ino_t res = 0; + struct ext2_dir_entry_2 *de; + struct page *page; + + de = ext2_find_entry (dir, child, &page); + if (de) { + res = le32_to_cpu(de->inode); + ext2_put_page(page); + } + return res; +} + +static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, ext2_get_block); +} + +/* Releases the page */ +void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, struct inode *inode, int update_times) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = ext2_rec_len_from_disk(de->rec_len); + int err; + + lock_page(page); + err = ext2_prepare_chunk(page, pos, len); + BUG_ON(err); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type(de, inode); + err = ext2_commit_chunk(page, pos, len); + ext2_put_page(page); + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); +} + +/* + * Parent is locked. 
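/*
 * A toy sketch of the slot-finding rule in ext2_add_link() below: an
 * existing entry can host a new one if it is unused and big enough, or if
 * its rec_len has enough slack beyond the space its own name needs.  The
 * 8-byte header and 4-byte alignment mirror EXT2_DIR_REC_LEN(), but the
 * inode numbers and lengths are illustrative only.
 */
#include <stdio.h>

#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)	/* header + name, 4-byte aligned */

static int entry_can_host(unsigned inode, unsigned rec_len,
			  unsigned name_len, unsigned new_name_len)
{
	unsigned needed = REC_LEN(new_name_len);
	unsigned used = REC_LEN(name_len);

	if (!inode && rec_len >= needed)
		return 1;			/* unused slot, big enough */
	return rec_len >= used + needed;	/* enough slack after the live name */
}

int main(void)
{
	/* live 4-char name padded out to a 1024-byte rec_len: plenty of slack */
	printf("can host a 10-char name: %d\n", entry_can_host(12, 1024, 4, 10));
	/* live 4-char name tightly packed into 12 bytes: no room */
	printf("can host a 10-char name: %d\n", entry_can_host(13, 12, 4, 10));
	return 0;
}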
+ */ +int ext2_add_link (struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = ext2_chunk_size(dir); + unsigned reclen = EXT2_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + ext2_dirent * de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ext2_get_page(dir, n, 0); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ext2_last_byte(dir, n); + de = (ext2_dirent *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = ext2_rec_len_to_disk(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + ext2_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ext2_match (namelen, name, de)) + goto out_unlock; + name_len = EXT2_DIR_REC_LEN(de->name_len); + rec_len = ext2_rec_len_from_disk(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (ext2_dirent *) ((char *) de + rec_len); + } + unlock_page(page); + ext2_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + err = ext2_prepare_chunk(page, pos, rec_len); + if (err) + goto out_unlock; + if (de->inode) { + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len); + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len); + de->rec_len = ext2_rec_len_to_disk(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + err = ext2_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ext2_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * ext2_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. 
+ */ +int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) +{ + struct inode *inode = page->mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + + ext2_rec_len_from_disk(dir->rec_len); + loff_t pos; + ext2_dirent * pde = NULL; + ext2_dirent * de = (ext2_dirent *) (kaddr + from); + int err; + + while ((char*)de < (char*)dir) { + if (de->rec_len == 0) { + ext2_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = ext2_next_entry(de); + } + if (pde) + from = (char*)pde - (char*)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = ext2_prepare_chunk(page, pos, to - from); + BUG_ON(err); + if (pde) + pde->rec_len = ext2_rec_len_to_disk(to - from); + dir->inode = 0; + err = ext2_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(inode); +out: + ext2_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int ext2_make_empty(struct inode *inode, struct inode *parent) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + unsigned chunk_size = ext2_chunk_size(inode); + struct ext2_dir_entry_2 * de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = ext2_prepare_chunk(page, 0, chunk_size); + if (err) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct ext2_dir_entry_2 *)kaddr; + de->name_len = 1; + de->rec_len = ext2_rec_len_to_disk(EXT2_DIR_REC_LEN(1)); + memcpy (de->name, ".\0\0", 4); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + + de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = ext2_rec_len_to_disk(chunk_size - EXT2_DIR_REC_LEN(1)); + de->inode = cpu_to_le32(parent->i_ino); + memcpy (de->name, "..\0", 4); + ext2_set_de_type (de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = ext2_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ext2_empty_dir (struct inode * inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + int dir_has_error = 0; + + for (i = 0; i < npages; i++) { + char *kaddr; + ext2_dirent * de; + page = ext2_get_page(inode, i, dir_has_error); + + if (IS_ERR(page)) { + dir_has_error = 1; + continue; + } + + kaddr = page_address(page); + de = (ext2_dirent *)kaddr; + kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + ext2_error(inode->i_sb, __func__, + "zero-length directory entry"); + printk("kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. 
*/ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le32(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = ext2_next_entry(de); + } + ext2_put_page(page); + } + return 1; + +not_empty: + ext2_put_page(page); + return 0; +} + +const struct file_operations ext2_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext2_readdir, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .fsync = ext2_fsync, +}; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h new file mode 100644 index 00000000..645be9e7 --- /dev/null +++ b/fs/ext2/ext2.h @@ -0,0 +1,182 @@ +#include +#include + +/* + * ext2 mount options + */ +struct ext2_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; +}; + +/* + * second extended file system inode data in memory + */ +struct ext2_inode_info { + __le32 i_data[15]; + __u32 i_flags; + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; + __u16 i_state; + __u32 i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + + /* block reservation info */ + struct ext2_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT2_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + rwlock_t i_meta_lock; + + /* + * truncate_mutex is for serialising ext2_truncate() against + * ext2_getblock(). It also protects the internals of the inode's + * reservation data structures: ext2_reserve_window and + * ext2_reserve_window_node. + */ + struct mutex truncate_mutex; + struct inode vfs_inode; + struct list_head i_orphan; /* unlinked but open inodes */ +}; + +/* + * Inode dynamic state flags + */ +#define EXT2_STATE_NEW 0x00000001 /* inode is newly created */ + + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext2 source programs needs to include it so they are duplicated here. 
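The EXT2_I() helper just below recovers the ext2-private inode from the VFS inode embedded in struct ext2_inode_info above. A small self-contained sketch of that container_of() idiom; the demo_* types are stand-ins, not the kernel structures.

#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_vfs_inode { unsigned long i_ino; };

struct demo_ext2_inode {
        unsigned int i_flags;
        struct demo_vfs_inode vfs_inode;   /* embedded generic part */
};

static struct demo_ext2_inode *DEMO_EXT2_I(struct demo_vfs_inode *inode)
{
        return demo_container_of(inode, struct demo_ext2_inode, vfs_inode);
}

int main(void)
{
        struct demo_ext2_inode ei = { .i_flags = 0x1, .vfs_inode = { 42 } };
        struct demo_vfs_inode *inode = &ei.vfs_inode;

        /* Pointer arithmetic with offsetof() recovers the container. */
        printf("ino %lu has flags %#x\n",
               inode->i_ino, DEMO_EXT2_I(inode)->i_flags);
        return 0;
}

Embedding the VFS inode inside the filesystem-private structure means a single allocation backs both objects, so every EXT2_I() call is just constant pointer arithmetic.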
+ */ + +static inline struct ext2_inode_info *EXT2_I(struct inode *inode) +{ + return container_of(inode, struct ext2_inode_info, vfs_inode); +} + +/* balloc.c */ +extern int ext2_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); +extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); +extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, + unsigned long *, int *); +extern void ext2_free_blocks (struct inode *, unsigned long, + unsigned long); +extern unsigned long ext2_count_free_blocks (struct super_block *); +extern unsigned long ext2_count_dirs (struct super_block *); +extern void ext2_check_blocks_bitmap (struct super_block *); +extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern void ext2_discard_reservation (struct inode *); +extern int ext2_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext2_init_block_alloc_info(struct inode *); +extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv); + +/* dir.c */ +extern int ext2_add_link (struct dentry *, struct inode *); +extern ino_t ext2_inode_by_name(struct inode *, struct qstr *); +extern int ext2_make_empty(struct inode *, struct inode *); +extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **); +extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); +extern int ext2_empty_dir (struct inode *); +extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); +extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); + +/* ialloc.c */ +extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *); +extern void ext2_free_inode (struct inode *); +extern unsigned long ext2_count_free_inodes (struct super_block *); +extern void ext2_check_inodes_bitmap (struct super_block *); +extern unsigned long ext2_count_free (struct buffer_head *, unsigned); + +/* inode.c */ +extern struct inode *ext2_iget (struct super_block *, unsigned long); +extern int ext2_write_inode (struct inode *, struct writeback_control *); +extern void ext2_evict_inode(struct inode *); +extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int ext2_setattr (struct dentry *, struct iattr *); +extern void ext2_set_inode_flags(struct inode *inode); +extern void ext2_get_inode_flags(struct ext2_inode_info *); +extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +struct dentry *ext2_get_parent(struct dentry *child); + +/* super.c */ +extern void ext2_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext2_msg(struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); +extern void ext2_update_dynamic_rev (struct super_block *sb); +extern void ext2_write_super (struct super_block *); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext2_dir_operations; + +/* file.c */ +extern int ext2_fsync(struct file *file, int datasync); +extern const struct inode_operations ext2_file_inode_operations; +extern const struct file_operations ext2_file_operations; +extern const struct file_operations ext2_xip_file_operations; + +/* inode.c */ +extern const struct address_space_operations ext2_aops; +extern const struct address_space_operations ext2_aops_xip; +extern const struct address_space_operations ext2_nobh_aops; + +/* namei.c */ +extern const struct inode_operations ext2_dir_inode_operations; +extern const struct inode_operations ext2_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext2_fast_symlink_inode_operations; +extern const struct inode_operations ext2_symlink_inode_operations; + +static inline ext2_fsblk_t +ext2_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); +} + +#define ext2_set_bit __test_and_set_bit_le +#define ext2_clear_bit __test_and_clear_bit_le +#define ext2_test_bit test_bit_le +#define ext2_find_first_zero_bit find_first_zero_bit_le +#define ext2_find_next_zero_bit find_next_zero_bit_le diff --git a/fs/ext2/file.c b/fs/ext2/file.c new file mode 100644 index 00000000..49eec945 --- /dev/null +++ b/fs/ext2/file.c @@ -0,0 +1,107 @@ +/* + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when filp is released. This happens when all file descriptors + * for a single struct file are closed. Note that different open() calls + * for the same file yield different struct file structures. + */ +static int ext2_release_file (struct inode * inode, struct file * filp) +{ + if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&EXT2_I(inode)->truncate_mutex); + ext2_discard_reservation(inode); + mutex_unlock(&EXT2_I(inode)->truncate_mutex); + } + return 0; +} + +int ext2_fsync(struct file *file, int datasync) +{ + int ret; + struct super_block *sb = file->f_mapping->host->i_sb; + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + + ret = generic_file_fsync(file, datasync); + if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + /* We don't really know where the IO error happened... */ + ext2_error(sb, __func__, + "detected IO error when writing metadata buffers"); + ret = -EIO; + } + return ret; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the ext2 filesystem. 
+ */ +const struct file_operations ext2_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = dquot_file_open, + .release = ext2_release_file, + .fsync = ext2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +#ifdef CONFIG_EXT2_FS_XIP +const struct file_operations ext2_xip_file_operations = { + .llseek = generic_file_llseek, + .read = xip_file_read, + .write = xip_file_write, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .mmap = xip_file_mmap, + .open = dquot_file_open, + .release = ext2_release_file, + .fsync = ext2_fsync, +}; +#endif + +const struct inode_operations ext2_file_inode_operations = { +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .check_acl = ext2_check_acl, + .fiemap = ext2_fiemap, +}; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c new file mode 100644 index 00000000..ee9ed319 --- /dev/null +++ b/fs/ext2/ialloc.c @@ -0,0 +1,674 @@ +/* + * linux/fs/ext2/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. 
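The allocation and free paths below (ext2_free_inode(), ext2_new_inode()) convert between an inode number and a (block group, bitmap bit) pair with the same (ino - 1) / per-group and (ino - 1) % per-group split. A worked example with an invented inodes-per-group value:

#include <stdio.h>

int main(void)
{
        unsigned long inodes_per_group = 2048;   /* assumed s_inodes_per_group */
        unsigned long ino = 5000;                /* inode numbers start at 1   */

        unsigned long group = (ino - 1) / inodes_per_group;   /* group 2 */
        unsigned long bit   = (ino - 1) % inodes_per_group;   /* bit 903 */

        printf("inode %lu -> group %lu, bitmap bit %lu\n", ino, group, bit);

        /* reverse mapping, used once a free bit has been found */
        printf("group %lu, bit %lu -> inode %lu\n",
               group, bit, group * inodes_per_group + bit + 1);
        return 0;
}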
+ */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext2_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext2_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +static void ext2_release_inode(struct super_block *sb, int group, int dir) +{ + struct ext2_group_desc * desc; + struct buffer_head *bh; + + desc = ext2_get_group_desc(sb, group, &bh); + if (!desc) { + ext2_error(sb, "ext2_release_inode", + "can't get descriptor for group %d", group); + return; + } + + spin_lock(sb_bgl_lock(EXT2_SB(sb), group)); + le16_add_cpu(&desc->bg_free_inodes_count, 1); + if (dir) + le16_add_cpu(&desc->bg_used_dirs_count, -1); + spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); + if (dir) + percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); + sb->s_dirt = 1; + mark_buffer_dirty(bh); +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext2_free_inode (struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh; + unsigned long block_group; + unsigned long bit; + struct ext2_super_block * es; + + ino = inode->i_ino; + ext2_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + /* Quota is already initialized in iput() */ + ext2_xattr_delete_inode(inode); + dquot_free_inode(inode); + dquot_drop(inode); + + es = EXT2_SB(sb)->s_es; + is_directory = S_ISDIR(inode->i_mode); + + if (ino < EXT2_FIRST_INO(sb) || + ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_free_inode", + "reserved or nonexistent inode %lu", ino); + return; + } + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), + bit, (void *) bitmap_bh->b_data)) + ext2_error (sb, "ext2_free_inode", + "bit already cleared for inode %lu", ino); + else + ext2_release_inode(sb, block_group, is_directory); + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + brelse(bitmap_bh); +} + +/* + * We perform asynchronous prereading of the new inode's inode block when + * we create the inode, in the expectation that the inode will be written + * back soon. 
There are two reasons: + * + * - When creating a large number of files, the async prereads will be + * nicely merged into large reads + * - When writing out a large number of inodes, we don't need to keep on + * stalling the writes while we read the inode block. + * + * FIXME: ext2_get_group_desc() needs to be simplified. + */ +static void ext2_preread_inode(struct inode *inode) +{ + unsigned long block_group; + unsigned long offset; + unsigned long block; + struct ext2_group_desc * gdp; + struct backing_dev_info *bdi; + + bdi = inode->i_mapping->backing_dev_info; + if (bdi_read_congested(bdi)) + return; + if (bdi_write_congested(bdi)) + return; + + block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); + if (gdp == NULL) + return; + + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + EXT2_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); + sb_breadahead(inode->i_sb, block); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT2_SB(sb)->s_groups_count; + int avefreei = ext2_count_free_inodes(sb) / ngroups; + struct ext2_group_desc *desc, *best_desc = NULL; + int group, best_group = -1; + + for (group = 0; group < ngroups; group++) { + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + if (!best_desc) + return -1; + + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. 
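A sketch of the threshold arithmetic that find_group_orlov() below derives from the per-filesystem averages before accepting a group for a new directory. All input numbers are invented example values; only the formulas mirror the code.

#include <stdio.h>

int main(void)
{
        /* invented filesystem-wide numbers */
        int ngroups = 8, ndirs = 120;
        int inodes_per_group = 2048, blocks_per_group = 8192;
        int free_inodes = 9000, free_blocks = 40000, blocks_count = 65536;
        const int inode_cost = 64, block_cost = 256;   /* INODE_COST/BLOCK_COST */

        int avefreei = free_inodes / ngroups;
        int avefreeb = free_blocks / ngroups;
        int blocks_per_dir = (blocks_count - free_blocks) / ndirs;

        int max_dirs   = ndirs / ngroups + inodes_per_group / 16;
        int min_inodes = avefreei - inodes_per_group / 4;
        int min_blocks = avefreeb - blocks_per_group / 4;

        int max_debt = blocks_per_group /
                       (blocks_per_dir > block_cost ? blocks_per_dir : block_cost);
        if (max_debt * inode_cost > inodes_per_group)
                max_debt = inodes_per_group / inode_cost;
        if (max_debt > 255)
                max_debt = 255;
        if (max_debt == 0)
                max_debt = 1;

        printf("max_dirs=%d min_inodes=%d min_blocks=%d max_debt=%d\n",
               max_dirs, min_inodes, min_blocks, max_debt);
        return 0;
}

A group is rejected when it already holds max_dirs directories, has fewer than min_inodes free inodes or min_blocks free blocks, or has accumulated max_debt of allocation debt.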
+ */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT2_I(parent)->i_block_group; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT2_INODES_PER_GROUP(sb); + int freei; + int avefreei; + int free_blocks; + int avefreeb; + int blocks_per_dir; + int ndirs; + int max_debt, max_dirs, min_blocks, min_inodes; + int group = -1, i; + struct ext2_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = free_blocks / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { + struct ext2_group_desc *best_desc = NULL; + int best_ndir = inodes_per_group; + int best_group = -1; + + get_random_bytes(&group, sizeof(group)); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_desc = desc; + } + if (best_group >= 0) { + desc = best_desc; + group = best_group; + goto found; + } + goto fallback; + } + + if (ndirs == 0) + ndirs = 1; /* percpu_counters are approximate... 
*/ + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count)-free_blocks) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT2_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT2_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (sbi->s_debts[group] >= max_debt) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + goto found; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + goto found; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; + +found: + return group; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT2_I(parent)->i_block_group; + int ngroups = EXT2_SB(sb)->s_groups_count; + struct ext2_group_desc *desc; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + goto found; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some + * free blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + goto found; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. 
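A sketch of the two probing passes in find_group_other() here: after the parent's own group is rejected, the search takes quadratic steps of 1, 2, 4, ... and then falls back to the plain linear scan that follows. Group count and starting group are arbitrary example values, and the i_ino-based offset of the starting point in the real code is omitted.

#include <stdio.h>

int main(void)
{
        int ngroups = 16, parent_group = 5;
        int group, i;

        /* quadratic hash pass */
        group = parent_group;
        for (i = 1; i < ngroups; i <<= 1) {
                group += i;
                if (group >= ngroups)
                        group -= ngroups;
                printf("quadratic probe: group %d\n", group);
        }

        /* linear fallback pass */
        group = parent_group;
        for (i = 0; i < ngroups; i++) {
                if (++group >= ngroups)
                        group = 0;
                printf("linear probe: group %d\n", group);
        }
        return 0;
}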
+ */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + goto found; + } + + return -1; + +found: + return group; +} + +struct inode *ext2_new_inode(struct inode *dir, int mode, + const struct qstr *qstr) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group, i; + ino_t ino = 0; + struct inode * inode; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; + struct ext2_inode_info *ei; + struct ext2_sb_info *sbi; + int err; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + ei = EXT2_I(inode); + sbi = EXT2_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt(sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + if (group == -1) { + err = -ENOSPC; + goto fail; + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext2_get_group_desc(sb, group, &bh2); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto fail; + } + ino = 0; + +repeat_in_this_group: + ino = ext2_find_next_zero_bit((unsigned long *)bitmap_bh->b_data, + EXT2_INODES_PER_GROUP(sb), ino); + if (ino >= EXT2_INODES_PER_GROUP(sb)) { + /* + * Rare race: find_group_xx() decided that there were + * free inodes in this group, but by the time we tried + * to allocate one, they're all gone. This can also + * occur because the counters which find_group_orlov() + * uses are approximate. So just go and search the + * next block group. + */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + if (ext2_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we lost this inode */ + if (++ino >= EXT2_INODES_PER_GROUP(sb)) { + /* this group is exhausted, try next group */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + /* try to find free inode in the same group */ + goto repeat_in_this_group; + } + goto got; + } + + /* + * Scanned all blockgroups. 
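A user-space stand-in for the bitmap scan inside the allocation loop above, where ext2_find_next_zero_bit() looks for a free inode bit starting at a given position. The kernel helpers operate on little-endian long words; this demo uses plain byte addressing and invented data.

#include <stdio.h>

static long demo_find_next_zero_bit(const unsigned char *bitmap,
                                    long size, long start)
{
        long i;

        for (i = start; i < size; i++)
                if (!(bitmap[i / 8] & (1u << (i % 8))))
                        return i;
        return size;             /* "nothing found" is reported as size */
}

int main(void)
{
        unsigned char bitmap[4] = { 0xff, 0xff, 0x7f, 0x00 };  /* bit 23 free */
        long bit = demo_find_next_zero_bit(bitmap, 32, 0);

        if (bit < 32)
                printf("first free bit: %ld\n", bit);
        else
                printf("group is full\n");
        return 0;
}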
+ */ + err = -ENOSPC; + goto fail; +got: + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + brelse(bitmap_bh); + + ino += group * EXT2_INODES_PER_GROUP(sb) + 1; + if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d,inode=%lu", group, + (unsigned long) ino); + err = -EIO; + goto fail; + } + + percpu_counter_add(&sbi->s_freeinodes_counter, -1); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + if (sbi->s_debts[group] < 255) + sbi->s_debts[group]++; + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } else { + if (sbi->s_debts[group]) + sbi->s_debts[group]--; + } + spin_unlock(sb_bgl_lock(sbi, group)); + + sb->s_dirt = 1; + mark_buffer_dirty(bh2); + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + ei->i_dir_start_lookup = 0; + ei->i_state = EXT2_STATE_NEW; + ext2_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } + + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext2_init_acl(inode, dir); + if (err) + goto fail_free_drop; + + err = ext2_init_security(inode, dir, qstr); + if (err) + goto fail_free_drop; + + mark_inode_dirty(inode); + ext2_debug("allocating inode %lu\n", inode->i_ino); + ext2_preread_inode(inode); + return inode; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); + +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +unsigned long ext2_count_free_inodes (struct super_block * sb) +{ + struct ext2_group_desc *desc; + unsigned long desc_count = 0; + int i; + +#ifdef EXT2FS_DEBUG + struct ext2_super_block *es; + unsigned long bitmap_count = 0; + struct buffer_head *bitmap_bh = NULL; + + es = EXT2_SB(sb)->s_es; + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + unsigned x; + + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext2_count_free(bitmap_bh, EXT2_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %u\n", + i, le16_to_cpu(desc->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter), + desc_count, bitmap_count); + return desc_count; +#else + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = 
ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext2_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + struct ext2_group_desc *gdp = ext2_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c new file mode 100644 index 00000000..788e09a0 --- /dev/null +++ b/fs/ext2/inode.c @@ -0,0 +1,1552 @@ +/* + * linux/fs/ext2/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@dcs.ed.ac.uk), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "acl.h" +#include "xip.h" + +MODULE_AUTHOR("Remy Card and others"); +MODULE_DESCRIPTION("Second Extended Filesystem"); +MODULE_LICENSE("GPL"); + +static int __ext2_write_inode(struct inode *inode, int do_sync); + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ext2_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT2_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && + inode->i_blocks - ea_blocks == 0); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset); + +static void ext2_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + ext2_truncate_blocks(inode, inode->i_size); + } +} + +/* + * Called at the last iput() if i_nlink is zero. 
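ext2_inode_is_fast_symlink() above distinguishes "fast" symlinks, whose target is stored inline in the inode's i_data[] area rather than in a data block. A hedged sketch of the size test, assuming the usual 60 bytes of inline space (15 * sizeof(__le32)); the demo names and limit handling are illustrative, not kernel code.

#include <stdio.h>
#include <string.h>

#define DEMO_INLINE_MAX (15 * 4)       /* assumed bytes available in i_data[] */

static int demo_symlink_is_fast(const char *target)
{
        /* target plus its terminating NUL must fit in the inline area */
        return strlen(target) + 1 <= DEMO_INLINE_MAX;
}

int main(void)
{
        const char *short_t = "/etc/passwd";
        const char *long_t =
                "/a/very/long/path/that/definitely/exceeds/sixty/characters/total";

        printf("%s -> %s\n", short_t,
               demo_symlink_is_fast(short_t) ? "fast (inline)" : "slow (block)");
        printf("%s -> %s\n", long_t,
               demo_symlink_is_fast(long_t) ? "fast (inline)" : "slow (block)");
        return 0;
}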
+ */ +void ext2_evict_inode(struct inode * inode) +{ + struct ext2_block_alloc_info *rsv; + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + want_delete = 1; + dquot_initialize(inode); + } else { + dquot_drop(inode); + } + + truncate_inode_pages(&inode->i_data, 0); + + if (want_delete) { + /* set dtime */ + EXT2_I(inode)->i_dtime = get_seconds(); + mark_inode_dirty(inode); + __ext2_write_inode(inode, inode_needs_sync(inode)); + /* truncate to 0 */ + inode->i_size = 0; + if (inode->i_blocks) + ext2_truncate_blocks(inode, 0); + } + + invalidate_inode_buffers(inode); + end_writeback(inode); + + ext2_discard_reservation(inode); + rsv = EXT2_I(inode)->i_block_alloc_info; + EXT2_I(inode)->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); + + if (want_delete) + ext2_free_inode(inode); +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext2_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * To store the locations of file's data ext2 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
+ */ + +static int ext2_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT2_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block < 0", __func__); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT2_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT2_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT2_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block is too big", __func__); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + + return n; +} + +/** + * ext2_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
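A user-space mirror of the mapping ext2_block_to_path() above performs: split a logical block number into offsets through the direct slots and the single, double, and triple indirect trees. The 12 direct slots match ext2's layout; 256 addresses per indirect block corresponds to an assumed 1 KiB block size, and the demo uses division instead of the kernel's shift arithmetic.

#include <stdio.h>

static int demo_block_to_path(long i_block, int offsets[4])
{
        const long direct = 12;          /* EXT2_NDIR_BLOCKS             */
        const long ptrs = 256;           /* addresses per indirect block */
        int n = 0;

        if (i_block < 0) {
                /* out of range: leave depth 0 */
        } else if (i_block < direct) {
                offsets[n++] = (int)i_block;
        } else if ((i_block -= direct) < ptrs) {
                offsets[n++] = 12;                        /* EXT2_IND_BLOCK  */
                offsets[n++] = (int)i_block;
        } else if ((i_block -= ptrs) < ptrs * ptrs) {
                offsets[n++] = 13;                        /* EXT2_DIND_BLOCK */
                offsets[n++] = (int)(i_block / ptrs);
                offsets[n++] = (int)(i_block % ptrs);
        } else if ((i_block -= ptrs * ptrs) < ptrs * ptrs * ptrs) {
                offsets[n++] = 14;                        /* EXT2_TIND_BLOCK */
                offsets[n++] = (int)(i_block / (ptrs * ptrs));
                offsets[n++] = (int)((i_block / ptrs) % ptrs);
                offsets[n++] = (int)(i_block % ptrs);
        }
        return n;                        /* path depth, 0 if out of range */
}

int main(void)
{
        int off[4], depth, i;

        depth = demo_block_to_path(50000, off);   /* lands in the double indirect tree */
        printf("depth %d:", depth);
        for (i = 0; i < depth; i++)
                printf(" %d", off[i]);
        printf("\n");
        return 0;
}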
+ */ +static Indirect *ext2_get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[4], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + read_lock(&EXT2_I(inode)->i_meta_lock); + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + read_unlock(&EXT2_I(inode)->i_meta_lock); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + read_unlock(&EXT2_I(inode)->i_meta_lock); + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext2_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ + +static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext2_fsblk_t bg_start; + ext2_fsblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) + if (*p) + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred from inode itself? OK, just put it into + * the same cylinder group then. + */ + bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext2_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Returns preferred place for a block (the goal). + */ + +static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block, + Indirect *partial) +{ + struct ext2_block_alloc_info *block_i; + + block_i = EXT2_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext2_find_near(inode, partial); +} + +/** + * ext2_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. 
+ * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int +ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now don't hanel cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary + && le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext2_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext2_alloc_blocks(struct inode *inode, + ext2_fsblk_t goal, int indirect_blks, int blks, + ext2_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext2_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext2_new_blocks(inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext2_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + +static int ext2_alloc_branch(struct inode *inode, + int indirect_blks, int *blks, ext2_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext2_fsblk_t new_blocks[4]; + ext2_fsblk_t current_block; + + num = ext2_alloc_blocks(inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. 
+ */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + /* We used to sync bh here if IS_SYNC(inode). + * But we now rely upon generic_write_sync() + * and b_inode_buffers. But not for directories. + */ + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + sync_dirty_buffer(bh); + } + *blks = num; + return err; +} + +/** + * ext2_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static void ext2_splice_branch(struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + struct ext2_block_alloc_info *block_i; + ext2_fsblk_t current_block; + + block_i = EXT2_I(inode)->i_block_alloc_info; + + /* XXX LOCKING probably should have i_meta_lock ?*/ + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + /* had we spliced it onto indirect block? */ + if (where->bh) + mark_buffer_dirty_inode(where->bh, inode); + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. 
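A toy illustration of the return convention documented above, and of how ext2_get_block() further below turns a multi-block result into a buffer_head mapping: positive means "this many contiguous blocks mapped", zero means a hole, negative is an error. demo_get_blocks() and its fake extent are invented stand-ins, not the real mapping code.

#include <stdio.h>

/* pretend mapper: logical blocks 0..7 sit contiguously on disk at 1000.. */
static int demo_get_blocks(long iblock, long maxblocks, unsigned long *phys)
{
        long n;

        if (iblock < 0)
                return -5;               /* error, in the style of -EIO */
        if (iblock >= 8)
                return 0;                /* hole: nothing mapped        */
        *phys = 1000 + (unsigned long)iblock;
        n = 8 - iblock;                  /* how far the extent continues */
        if (n > maxblocks)
                n = maxblocks;
        return (int)n;
}

int main(void)
{
        unsigned long phys = 0;
        unsigned blkbits = 10;           /* assumed 1 KiB block size */
        int ret = demo_get_blocks(2, 16, &phys);

        if (ret > 0)                     /* ext2_get_block() does the same scaling */
                printf("mapped %d blocks at %lu, b_size = %u bytes\n",
                       ret, phys, (unsigned)ret << blkbits);
        else if (ret == 0)
                printf("hole\n");
        else
                printf("error %d\n", ret);
        return 0;
}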
+ */ +static int ext2_get_blocks(struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext2_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext2_inode_info *ei = EXT2_I(inode); + int count = 0; + ext2_fsblk_t first_block = 0; + + depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + return (err); + + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); /* What's this do? */ + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext2_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now, go to reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + /* + * If the indirect block is missing while we are reading + * the chain(ext2_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext2_init_block_alloc_info(inode); + + goal = ext2_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext2_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * XXX ???? 
Block out ext2_truncate while we alter the tree + */ + err = ext2_alloc_branch(inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } + + if (ext2_use_xip(inode->i_sb)) { + /* + * we need to clear the block + */ + err = ext2_clear_xip_target (inode, + le32_to_cpu(chain[depth-1].key)); + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } + } + + ext2_splice_branch(inode, iblock, partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + return err; +} + +int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int ret = ext2_get_blocks(inode, iblock, max_blocks, + bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; + +} + +int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext2_get_block); +} + +static int ext2_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, ext2_get_block, wbc); +} + +static int ext2_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext2_get_block); +} + +static int +ext2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); +} + +static int +ext2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int +ext2_nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_nobh_writepage(struct page *page, + struct writeback_control *wbc) +{ + return nobh_writepage(page, ext2_get_block, wbc); +} + +static sector_t ext2_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,ext2_get_block); +} + +static ssize_t +ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, 
nr_segs, ext2_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + return ret; +} + +static int +ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, ext2_get_block); +} + +const struct address_space_operations ext2_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_writepage, + .write_begin = ext2_write_begin, + .write_end = ext2_write_end, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +const struct address_space_operations ext2_aops_xip = { + .bmap = ext2_bmap, + .get_xip_mem = ext2_get_xip_mem, +}; + +const struct address_space_operations ext2_nobh_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_nobh_writepage, + .write_begin = ext2_nobh_write_begin, + .write_end = nobh_write_end, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, + .error_remove_page = generic_error_remove_page, +}; + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext2_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext2_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext2_truncate(). + * + * When we do truncate() we may have to clean the ends of several indirect + * blocks but leave the blocks themselves alive. Block is partially + * truncated if some data below the new i_size is referred from it (and + * it is on the path to the first completely truncated data block, indeed). + * We have to free the top of that path along with everything to the right + * of the path. Since no allocation past the truncation point is possible + * until ext2_truncate() finishes, we may safely do the latter, but top + * of branch may require special attention - pageout below the truncation + * point might try to populate it. + * + * We atomically detach the top of branch from the tree, store the block + * number of its root in *@top, pointers to buffer_heads of partially + * truncated blocks - in @chain[].bh and pointers to their last elements + * that should not be removed - in @chain[].p. Return value is the pointer + * to last filled element of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0].p + * (no partially truncated stuff there). 
+ */ + +static Indirect *ext2_find_shared(struct inode *inode, + int depth, + int offsets[4], + Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext2_get_branch(inode, k, offsets, chain, &err); + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + write_lock(&EXT2_I(inode)->i_meta_lock); + if (!partial->key && *partial->p) { + write_unlock(&EXT2_I(inode)->i_meta_lock); + goto no_top; + } + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&EXT2_I(inode)->i_meta_lock); + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/** + * ext2_free_data - free a list of data blocks + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) +{ + unsigned long block_to_free = 0, count = 0; + unsigned long nr; + + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (nr) { + *p = 0; + /* accumulate blocks to free if they're contiguous */ + if (count == 0) + goto free_this; + else if (block_to_free == nr - count) + count++; + else { + ext2_free_blocks (inode, block_to_free, count); + mark_inode_dirty(inode); + free_this: + block_to_free = nr; + count = 1; + } + } + } + if (count > 0) { + ext2_free_blocks (inode, block_to_free, count); + mark_inode_dirty(inode); + } +} + +/** + * ext2_free_branches - free an array of branches + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = sb_bread(inode->i_sb, nr); + /* + * A read failure? Report error and clear slot + * (should be rare). 
+ */ + if (!bh) { + ext2_error(inode->i_sb, "ext2_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + ext2_free_branches(inode, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + bforget(bh); + ext2_free_blocks(inode, nr, 1); + mark_inode_dirty(inode); + } + } else + ext2_free_data(inode, p, q); +} + +static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + __le32 *i_data = EXT2_I(inode)->i_data; + struct ext2_inode_info *ei = EXT2_I(inode); + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long iblock; + unsigned blocksize; + blocksize = inode->i_sb->s_blocksize; + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); + + n = ext2_block_to_path(inode, iblock, offsets, NULL); + if (n == 0) + return; + + /* + * From here we block out all ext2_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { + ext2_free_data(inode, i_data+offsets[0], + i_data + EXT2_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext2_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext2_free_branches(inode, + partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT2_IND_BLOCK]; + if (nr) { + i_data[EXT2_IND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 1); + } + case EXT2_IND_BLOCK: + nr = i_data[EXT2_DIND_BLOCK]; + if (nr) { + i_data[EXT2_DIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 2); + } + case EXT2_DIND_BLOCK: + nr = i_data[EXT2_TIND_BLOCK]; + if (nr) { + i_data[EXT2_TIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 3); + } + case EXT2_TIND_BLOCK: + ; + } + + ext2_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + /* + * XXX: it seems like a bug here that we don't allow + * IS_APPEND inode to have blocks-past-i_size trimmed off. + * review and fix this. + * + * Also would be nice to be able to handle IO errors and such, + * but that's probably too much to ask. 
+ */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext2_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ext2_truncate_blocks(inode, offset); +} + +static int ext2_setsize(struct inode *inode, loff_t newsize) +{ + int error; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (ext2_inode_is_fast_symlink(inode)) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + if (mapping_is_xip(inode->i_mapping)) + error = xip_truncate_page(inode->i_mapping, newsize); + else if (test_opt(inode->i_sb, NOBH)) + error = nobh_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + else + error = block_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + if (error) + return error; + + truncate_setsize(inode, newsize); + __ext2_truncate_blocks(inode, newsize); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (inode_needs_sync(inode)) { + sync_mapping_buffers(inode->i_mapping); + sync_inode_metadata(inode, 1); + } else { + mark_inode_dirty(inode); + } + + return 0; +} + +static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, + struct buffer_head **p) +{ + struct buffer_head * bh; + unsigned long block_group; + unsigned long block; + unsigned long offset; + struct ext2_group_desc * gdp; + + *p = NULL; + if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) || + ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) + goto Einval; + + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); + gdp = ext2_get_group_desc(sb, block_group, NULL); + if (!gdp) + goto Egdp; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(sb)); + if (!(bh = sb_bread(sb, block))) + goto Eio; + + *p = bh; + offset &= (EXT2_BLOCK_SIZE(sb) - 1); + return (struct ext2_inode *) (bh->b_data + offset); + +Einval: + ext2_error(sb, "ext2_get_inode", "bad inode number: %lu", + (unsigned long) ino); + return ERR_PTR(-EINVAL); +Eio: + ext2_error(sb, "ext2_get_inode", + "unable to read inode block - inode=%lu, block=%lu", + (unsigned long) ino, block); +Egdp: + return ERR_PTR(-EIO); +} + +void ext2_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT2_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT2_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ +void ext2_get_inode_flags(struct ext2_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| + EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT2_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT2_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT2_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT2_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT2_DIRSYNC_FL; +} + +struct inode *ext2_iget (struct super_block *sb, unsigned long ino) +{ + struct ext2_inode_info *ei; + struct buffer_head 
* bh; + struct ext2_inode *raw_inode; + struct inode *inode; + long ret = -EIO; + int n; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT2_I(inode); + ei->i_block_alloc_info = NULL; + + raw_inode = ext2_get_inode(inode->i_sb, ino, &bh); + if (IS_ERR(raw_inode)) { + ret = PTR_ERR(raw_inode); + goto bad_inode; + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) { + /* this inode is deleted */ + brelse (bh); + ret = -ESTALE; + goto bad_inode; + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ei->i_dir_acl = 0; + if (S_ISREG(inode->i_mode)) + inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + else + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + ei->i_dtime = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_state = 0; + ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + ei->i_dir_start_lookup = 0; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! 
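+ * (The raw __le32 values are copied as-is; a block number is only + * converted with le32_to_cpu() at the point of use, e.g. in + * ext2_get_branch() and ext2_free_data().)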
+ */ + for (n = 0; n < EXT2_N_BLOCKS; n++) + ei->i_data[n] = raw_inode->i_block[n]; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (ext2_inode_is_fast_symlink(inode)) { + inode->i_op = &ext2_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext2_symlink_inode_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + } + } else { + inode->i_op = &ext2_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (bh); + ext2_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +static int __ext2_write_inode(struct inode *inode, int do_sync) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + uid_t uid = inode->i_uid; + gid_t gid = inode->i_gid; + struct buffer_head * bh; + struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); + int n; + int err = 0; + + if (IS_ERR(raw_inode)) + return -EIO; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ei->i_state & EXT2_STATE_NEW) + memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); + + ext2_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); +/* + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + else { + raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); + if (inode->i_size > 0x7fffffffULL) { + if (!EXT2_HAS_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT2_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT2_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + spin_lock(&EXT2_SB(sb)->s_lock); + ext2_update_dynamic_rev(sb); + EXT2_SET_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE); + spin_unlock(&EXT2_SB(sb)->s_lock); + ext2_write_super(sb); + } + } + } + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (n = 0; n < EXT2_N_BLOCKS; n++) + raw_inode->i_block[n] = ei->i_data[n]; + mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing ext2 inode [%s:%08lx]\n", + sb->s_id, (unsigned long) ino); + err = -EIO; + } + } + ei->i_state &= ~EXT2_STATE_NEW; + brelse (bh); + return err; +} + +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int ext2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; + } + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { + error = ext2_setsize(inode, iattr->ia_size); + if (error) + return error; + } + setattr_copy(inode, iattr); + if (iattr->ia_valid & ATTR_MODE) + error = ext2_acl_chmod(inode); + mark_inode_dirty(inode); + + return error; +} diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c new file mode 100644 index 00000000..f81e250a --- 
/dev/null +++ b/fs/ext2/ioctl.c @@ -0,0 +1,178 @@ +/* + * linux/fs/ext2/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include "ext2.h" +#include +#include +#include +#include +#include +#include +#include + + +long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext2_inode_info *ei = EXT2_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + int ret; + + ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT2_IOC_GETFLAGS: + ext2_get_inode_flags(ei); + flags = ei->i_flags & EXT2_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT2_IOC_SETFLAGS: { + unsigned int oldflags; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto setflags_out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } + + flags = ext2_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto setflags_out; + } + oldflags = ei->i_flags; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto setflags_out; + } + } + + flags = flags & EXT2_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; + ei->i_flags = flags; + mutex_unlock(&inode->i_mutex); + + ext2_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return ret; + } + case EXT2_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *) arg); + case EXT2_IOC_SETVERSION: + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + if (get_user(inode->i_generation, (int __user *) arg)) { + ret = -EFAULT; + } else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + mnt_drop_write(filp->f_path.mnt); + return ret; + case EXT2_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT2_IOC_SETRSVSZ: { + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(rsv_window_size, (int __user *)arg)) + return -EFAULT; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT2_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + /* + * XXX What lock should protect the rsv_goal_size? + * Accessed in ext2_get_block only. ext3 uses i_truncate. 
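+ * Here rsv_goal_size is updated under truncate_mutex (taken just + * below), the same mutex ext2_get_block() holds across block + * allocation, so the reader and this writer cannot race.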
+ */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext2_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); + mnt_drop_write(filp->f_path.mnt); + return 0; + } + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT2_IOC32_GETFLAGS: + cmd = EXT2_IOC_GETFLAGS; + break; + case EXT2_IOC32_SETFLAGS: + cmd = EXT2_IOC_SETFLAGS; + break; + case EXT2_IOC32_GETVERSION: + cmd = EXT2_IOC_GETVERSION; + break; + case EXT2_IOC32_SETVERSION: + cmd = EXT2_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + return ext2_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c new file mode 100644 index 00000000..ed5c5d49 --- /dev/null +++ b/fs/ext2/namei.c @@ -0,0 +1,427 @@ +/* + * linux/fs/ext2/namei.c + * + * Rewrite to pagecache. Almost all code had been changed, so blame me + * if the things go wrong. Please, send bug reports to + * viro@parcelfarce.linux.theplanet.co.uk + * + * Stuff here is basically a glue between the VFS and generic UNIXish + * filesystem that keeps everything in pagecache. All knowledge of the + * directory layout is in fs/ext2/dir.c - it turned out to be easily separatable + * and it's easier to debug that way. In principle we might want to + * generalize that a bit and turn it into a library. Or not. + * + * The only non-static object here is ext2_dir_inode_operations. + * + * TODO: get rid of kmap() use, add readahead. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" +#include "xip.h" + +static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. 
+ */ + +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode; + ino_t ino; + + if (dentry->d_name.len > EXT2_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = ext2_inode_by_name(dir, &dentry->d_name); + inode = NULL; + if (ino) { + inode = ext2_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ESTALE) { + ext2_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + (unsigned long) ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } + } + return d_splice_alias(inode, dentry); +} + +struct dentry *ext2_get_parent(struct dentry *child) +{ + struct qstr dotdot = {.name = "..", .len = 2}; + unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino)); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +{ + struct inode *inode; + + dquot_initialize(dir); + + inode = ext2_new_inode(dir, mode, &dentry->d_name); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); +} + +static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode * inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + inode = ext2_new_inode (dir, mode, &dentry->d_name); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT2_FS_XATTR + inode->i_op = &ext2_special_inode_operations; +#endif + mark_inode_dirty(inode); + err = ext2_add_nondir(dentry, inode); + } + return err; +} + +static int ext2_symlink (struct inode * dir, struct dentry * dentry, + const char * symname) +{ + struct super_block * sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode * inode; + + if (l > sb->s_blocksize) + goto out; + + dquot_initialize(dir); + + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + if (l > sizeof (EXT2_I(inode)->i_data)) { + /* slow symlink */ + inode->i_op = &ext2_symlink_inode_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &ext2_fast_symlink_inode_operations; + memcpy((char*)(EXT2_I(inode)->i_data),symname,l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = ext2_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput (inode); + goto out; +} + +static int ext2_link (struct dentry * old_dentry, struct inode * 
dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int err; + + if (inode->i_nlink >= EXT2_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= EXT2_LINK_MAX) + goto out; + + dquot_initialize(dir); + + inode_inc_link_count(dir); + + inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + + inode_inc_link_count(inode); + + err = ext2_make_empty(inode, dir); + if (err) + goto out_fail; + + err = ext2_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int ext2_unlink(struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + struct ext2_dir_entry_2 * de; + struct page * page; + int err = -ENOENT; + + dquot_initialize(dir); + + de = ext2_find_entry (dir, &dentry->d_name, &page); + if (!de) + goto out; + + err = ext2_delete_entry (de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int ext2_rmdir (struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (ext2_empty_dir(inode)) { + err = ext2_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry ) +{ + struct inode * old_inode = old_dentry->d_inode; + struct inode * new_inode = new_dentry->d_inode; + struct page * dir_page = NULL; + struct ext2_dir_entry_2 * dir_de = NULL; + struct page * old_page; + struct ext2_dir_entry_2 * old_de; + int err = -ENOENT; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = ext2_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct ext2_dir_entry_2 *new_de; + + err = -ENOTEMPTY; + if (dir_de && !ext2_empty_dir (new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + ext2_set_link(new_dir, new_de, new_page, old_inode, 1); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= EXT2_LINK_MAX) + goto out_dir; + } + err = ext2_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + 
inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); + + ext2_delete_entry (old_de, old_page); + + if (dir_de) { + if (old_dir != new_dir) + ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } + inode_dec_link_count(old_dir); + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations ext2_dir_inode_operations = { + .create = ext2_create, + .lookup = ext2_lookup, + .link = ext2_link, + .unlink = ext2_unlink, + .symlink = ext2_symlink, + .mkdir = ext2_mkdir, + .rmdir = ext2_rmdir, + .mknod = ext2_mknod, + .rename = ext2_rename, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .check_acl = ext2_check_acl, +}; + +const struct inode_operations ext2_special_inode_operations = { +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .check_acl = ext2_check_acl, +}; diff --git a/fs/ext2/super.c b/fs/ext2/super.c new file mode 100644 index 00000000..1dd62ed3 --- /dev/null +++ b/fs/ext2/super.c @@ -0,0 +1,1525 @@ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" +#include "xip.h" + +static void ext2_sync_super(struct super_block *sb, + struct ext2_super_block *es, int wait); +static int ext2_remount (struct super_block * sb, int * flags, char * data); +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext2_sync_fs(struct super_block *sb, int wait); + +void ext2_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + + if (!(sb->s_flags & MS_RDONLY)) { + spin_lock(&sbi->s_lock); + sbi->s_mount_state |= EXT2_ERROR_FS; + es->s_state |= cpu_to_le16(EXT2_ERROR_FS); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT2-fs: panic from previous error\n"); + if (test_opt(sb, ERRORS_RO)) { + ext2_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } +} + +void ext2_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + +/* + * This must be called with sbi->s_lock held. + */ +void ext2_update_dynamic_rev(struct super_block *sb) +{ + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) + return; + + ext2_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT2_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT2_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT2_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +static void ext2_put_super (struct super_block * sb) +{ + int db_count; + int i; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + if (sb->s_dirt) + ext2_write_super(sb); + + ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = sbi->s_es; + + spin_lock(&sbi->s_lock); + es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); + } + db_count = sbi->s_gdb_count; + for (i = 0; i < db_count; i++) + if (sbi->s_group_desc[i]) + brelse (sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + kfree(sbi->s_debts); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse (sbi->s_sbh); + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache * ext2_inode_cachep; + +static struct inode *ext2_alloc_inode(struct super_block *sb) +{ + struct ext2_inode_info *ei; + ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ext2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); +} + +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + +static void init_once(void *foo) +{ + struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; + + rwlock_init(&ei->i_meta_lock); +#ifdef CONFIG_EXT2_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", + sizeof(struct ext2_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext2_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext2_inode_cachep); +} + +static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + unsigned long def_mount_opts; + + spin_lock(&sbi->s_lock); + def_mount_opts = 
le32_to_cpu(es->s_default_mount_opts); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%lu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT2_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT2_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + int def_errors = le16_to_cpu(es->s_errors); + + if (def_errors == EXT2_ERRORS_PANIC || + def_errors == EXT2_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); + +#ifdef CONFIG_EXT2_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT2_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif + +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT2_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + + if (test_opt(sb, NOBH)) + seq_puts(seq, ",nobh"); + +#if defined(CONFIG_QUOTA) + if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + +#if defined(CONFIG_EXT2_FS_XIP) + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) + seq_puts(seq, ",xip"); +#endif + + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + + spin_unlock(&sbi->s_lock); + return 0; +} + +#ifdef CONFIG_QUOTA +static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); +static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +#endif + +static const struct super_operations ext2_sops = { + .alloc_inode = ext2_alloc_inode, + .destroy_inode = ext2_destroy_inode, + .write_inode = ext2_write_inode, + .evict_inode = ext2_evict_inode, + .put_super = ext2_put_super, + .write_super = ext2_write_super, + .sync_fs = ext2_sync_fs, + .statfs = ext2_statfs, + .remount_fs = ext2_remount, + .show_options = ext2_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext2_quota_read, + .quota_write = ext2_quota_write, +#endif +}; + +static struct inode *ext2_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * ext2_read_inode currently does appropriate checks, but + * it might be "neater" to call ext2_get_inode first and check + * if the inode is valid..... + */ + inode = ext2_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
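+ * the on-disk inode slot has been re-used since the file handle + * was generated, so treat the handle as stale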
*/ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ext2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext2_nfs_get_inode); +} + +static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext2_nfs_get_inode); +} + +/* Yes, most of these are left as NULL!! + * A NULL value implies the default, which works with ext2-like file + * systems, but can be improved upon. + * Currently only get_parent is required. + */ +static const struct export_operations ext2_export_ops = { + .fh_to_dentry = ext2_fh_to_dentry, + .fh_to_parent = ext2_fh_to_parent, + .get_parent = ext2_get_parent, +}; + +static unsigned long get_sb_block(void **data) +{ + unsigned long sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk("EXT2-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "check=none"}, + {Opt_nocheck, "nocheck"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_nobh, "nobh"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_xip, "xip"}, + {Opt_grpquota, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb) +{ + char *p; + struct ext2_sb_info *sbi = EXT2_SB(sb); + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; 
+ case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_nobh: + set_opt (sbi->s_mount_opt, NOBH); + break; +#ifdef CONFIG_EXT2_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext2_msg(sb, KERN_INFO, "(no)user_xattr options" + "not supported"); + break; +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext2_msg(sb, KERN_INFO, + "(no)acl options not supported"); + break; +#endif + case Opt_xip: +#ifdef CONFIG_EXT2_FS_XIP + set_opt (sbi->s_mount_opt, XIP); +#else + ext2_msg(sb, KERN_INFO, "xip option not supported"); +#endif + break; + +#if defined(CONFIG_QUOTA) + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + + case Opt_grpquota: + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext2_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif + + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations ON"); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations OFF"); + break; + case Opt_ignore: + break; + default: + return 0; + } + } + return 1; +} + +static int ext2_setup_super (struct super_block * sb, + struct ext2_super_block * es, + int read_only) +{ + int res = 0; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT2_VALID_FS)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT2_ERROR_FS)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext2_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext2_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = 
cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + if (test_opt (sb, DEBUG)) + ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", + EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, + sbi->s_frag_size, + sbi->s_groups_count, + EXT2_BLOCKS_PER_GROUP(sb), + EXT2_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + return res; +} + +static int ext2_check_descriptors(struct super_block *sb) +{ + int i; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + ext2_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL); + ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i); + ext2_fsblk_t last_block; + + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > + last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + } + return 1; +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. 
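+ * For example: with 1 KiB blocks the indirect-tree geometry is the + * binding limit and files top out around 16 GiB, while with 4 KiB + * blocks the 2^32-sector i_blocks limit dominates and caps files at + * roughly 2 TiB.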
+ */ +static loff_t ext2_max_size(int bits) +{ + loff_t res = EXT2_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + + /* This is calculated to be the largest file size for a + * dense file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32 - 1 (the __u32 i_blocks field + * represents the total number of + * 512-byte blocks of the file) + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* triple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static unsigned long descriptor_loc(struct super_block *sb, + unsigned long logic_sb_block, + int nr) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext2_bg_has_super(sb, bg)) + has_super = 1; + + return ext2_group_first_block_no(sb, bg) + has_super; +} + +static int ext2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext2_sb_info * sbi; + struct ext2_super_block * es; + struct inode *root; + unsigned long block; + unsigned long sb_block = get_sb_block(&data); + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long def_mount_opts; + long ret = -EINVAL; + int blocksize = BLOCK_SIZE; + int db_count; + int i, j; + __le32 features; + int err; + + err = -ENOMEM; + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto failed_unlock; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto failed_unlock; + } + sb->s_fs_info = sbi; + sbi->s_sb_block = sb_block; + + spin_lock_init(&sbi->s_lock); + + /* + * See what the current blocksize for the device is, and + * use that as the blocksize. Otherwise (or if the blocksize + * is smaller than the default) use the default. + * This is important for devices that have a hardware + * sectorsize that is larger than the default. + */ + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + ext2_msg(sb, KERN_ERR, "error: unable to set blocksize"); + goto failed_sbi; + } + + /* + * If the superblock doesn't start on a hardware sector boundary, + * calculate the offset.
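+ * (e.g. if the device forces a 4 KiB block size, the superblock that + * nominally lives at byte 1024 is read from logical block 0 at + * offset 1024 within it)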
+ */ + if (blocksize != BLOCK_SIZE) { + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + ext2_msg(sb, KERN_ERR, "error: unable to read superblock"); + goto failed_sbi; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext2 macro-instructions depend on its value + */ + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT2_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT2_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT2_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT2_FS_XATTR + if (def_mount_opts & EXT2_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (def_mount_opts & EXT2_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options((char *) data, sb)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); + + ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset + EXT2_MOUNT_XIP if not */ + + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && + (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext2_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); + if (features) { + ext2_msg(sb, KERN_ERR, "error: couldn't mount because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); + goto failed_mount; + } + if (!(sb->s_flags & MS_RDONLY) && + (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); + goto failed_mount; + } + + blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { + if (!silent) + ext2_msg(sb, KERN_ERR, + "error: unsupported blocksize for xip"); + goto failed_mount; + } + + /* If the blocksize doesn't match, re-read the thing.. 
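+ * the real block size from s_log_block_size can be larger than the + * minimal size probed above, so switch the device to it and re-read + * the superblock from its recomputed logical block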
*/ + if (sb->s_blocksize != blocksize) { + brelse(bh); + + if (!sb_set_blocksize(sb, blocksize)) { + ext2_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); + goto failed_sbi; + } + + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if(!bh) { + ext2_msg(sb, KERN_ERR, "error: couldn't read" + "superblock on 2nd try"); + goto failed_sbi; + } + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { + ext2_msg(sb, KERN_ERR, "error: magic mismatch"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) { + sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT2_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || + !is_power_of_2(sbi->s_inode_size) || + (sbi->s_inode_size > blocksize)) { + ext2_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + } + + sbi->s_frag_size = EXT2_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (sbi->s_frag_size == 0) + goto cantfind_ext2; + sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + if (EXT2_INODE_SIZE(sb) == 0) + goto cantfind_ext2; + sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) + goto cantfind_ext2; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = sb->s_blocksize / + sizeof (struct ext2_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = + ilog2 (EXT2_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = + ilog2 (EXT2_DESC_PER_BLOCK(sb)); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; + + if (sb->s_blocksize != bh->b_size) { + if (!silent) + ext2_msg(sb, KERN_ERR, "error: unsupported blocksize"); + goto failed_mount; + } + + if (sb->s_blocksize != sbi->s_frag_size) { + ext2_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %lu" + "(not supported yet)", + sbi->s_frag_size, sb->s_blocksize); + goto failed_mount; + } + + if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + if (EXT2_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext2; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / + EXT2_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + 
ext2_msg(sb, KERN_ERR, "error: not enough memory"); + goto failed_mount; + } + bgl_lock_init(sbi->s_blockgroup_lock); + sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); + if (!sbi->s_debts) { + ext2_msg(sb, KERN_ERR, "error: not enough memory"); + goto failed_mount_group_desc; + } + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + for (j = 0; j < i; j++) + brelse (sbi->s_group_desc[j]); + ext2_msg(sb, KERN_ERR, + "error: unable to read group descriptors"); + goto failed_mount_group_desc; + } + } + if (!ext2_check_descriptors (sb)) { + ext2_msg(sb, KERN_ERR, "group descriptors corrupted"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* + * Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. + */ + sbi->s_rsv_window_head.rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); + + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext2_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext2_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext2_count_dirs(sb)); + } + if (err) { + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); + goto failed_mount3; + } + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext2_sops; + sb->s_export_op = &ext2_export_ops; + sb->s_xattr = ext2_xattr_handlers; + +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif + + root = ext2_iget(sb, EXT2_ROOT_INO); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + ext2_msg(sb, KERN_ERR, "error: get root inode failed"); + ret = -ENOMEM; + goto failed_mount3; + } + if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting ext3 filesystem as ext2"); + if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + ext2_write_super(sb); + return 0; + +cantfind_ext2: + if (!silent) + ext2_msg(sb, KERN_ERR, + "error: can't find an ext2 filesystem on dev %s.", + sb->s_id); + goto failed_mount; +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +failed_mount_group_desc: + kfree(sbi->s_group_desc); + kfree(sbi->s_debts); +failed_mount: + brelse(bh); +failed_sbi: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +failed_unlock: + return ret; +} + +static void 
ext2_clear_super_error(struct super_block *sb) +{ + struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext2_msg(sb, KERN_ERR, + "previous I/O error to superblock detected\n"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } +} + +static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) +{ + ext2_clear_super_error(sb); + spin_lock(&EXT2_SB(sb)->s_lock); + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); + es->s_wtime = cpu_to_le32(get_seconds()); + /* unlock before we do IO */ + spin_unlock(&EXT2_SB(sb)->s_lock); + mark_buffer_dirty(EXT2_SB(sb)->s_sbh); + if (wait) + sync_dirty_buffer(EXT2_SB(sb)->s_sbh); + sb->s_dirt = 0; +} + +/* + * In the second extended file system, it is not necessary to + * write the super block since we use a mapping of the + * disk super block in a buffer. + * + * However, this function is still used to set the fs valid + * flags to 0. We need to set this flag to 0 since the fs + * may have been checked while mounted and e2fsck may have + * set s_state to EXT2_VALID_FS after some corrections. + */ +static int ext2_sync_fs(struct super_block *sb, int wait) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + + spin_lock(&sbi->s_lock); + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { + ext2_debug("setting valid to 0\n"); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); + } + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, wait); + return 0; +} + + +void ext2_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ext2_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static int ext2_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext2_sb_info * sbi = EXT2_SB(sb); + struct ext2_super_block * es; + unsigned long old_mount_opt = sbi->s_mount_opt; + struct ext2_mount_options old_opts; + unsigned long old_sb_flags; + int err; + + spin_lock(&sbi->s_lock); + + /* Store the old options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); + + ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset + EXT2_MOUNT_XIP if not */ + + if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { + ext2_msg(sb, KERN_WARNING, + "warning: unsupported blocksize for xip"); + err = -EINVAL; + goto restore_opts; + } + + es = sbi->s_es; + if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + ext2_msg(sb, KERN_WARNING, "warning: refusing change of " + "xip flag with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + spin_unlock(&sbi->s_lock); + return 0; + } + if (*flags & MS_RDONLY) { + if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || + !(sbi->s_mount_state & EXT2_VALID_FS)) { + spin_unlock(&sbi->s_lock); + return 0; + } + + /* + * OK, we are remounting a valid rw partition rdonly, so set + * the rdonly flag and then mark the partition as valid again. + */ + es->s_state = cpu_to_le16(sbi->s_mount_state); + es->s_mtime = cpu_to_le32(get_seconds()); + spin_unlock(&sbi->s_lock); + + err = dquot_suspend(sb, -1); + if (err < 0) { + spin_lock(&sbi->s_lock); + goto restore_opts; + } + + ext2_sync_super(sb, es, 1); + } else { + __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, + ~EXT2_FEATURE_RO_COMPAT_SUPP); + if (ret) { + ext2_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR because of " + "unsupported optional features (%x).", + le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by e2fsck since we originally mounted the partition.) + */ + sbi->s_mount_state = le16_to_cpu(es->s_state); + if (!ext2_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + spin_unlock(&sbi->s_lock); + + ext2_write_super(sb); + + dquot_resume(sb, -1); + } + + return 0; +restore_opts: + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sb->s_flags = old_sb_flags; + spin_unlock(&sbi->s_lock); + return err; +} + +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + u64 fsid; + + spin_lock(&sbi->s_lock); + + if (test_opt (sb, MINIX_DF)) + sbi->s_overhead_last = 0; + else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long i, overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < sbi->s_groups_count; i++) + overhead += ext2_bg_has_super(sb, i) + + ext2_bg_num_gdb(sb, i); + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. 
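The overhead sum being assembled here reduces to a few additions. A user-space sketch with made-up numbers, where bg_has_super() and bg_num_gdb() are toy stand-ins for ext2_bg_has_super() and ext2_bg_num_gdb():

#include <stdio.h>

/* Toy stand-ins: pretend only the first two groups carry a superblock copy. */
static int bg_has_super(unsigned long group) { return group == 0 || group == 1; }
static int bg_num_gdb(unsigned long group)   { return bg_has_super(group) ? 1 : 0; }

int main(void)
{
	unsigned long first_data_block = 0;   /* 4 KiB blocks: data starts at block 0 */
	unsigned long groups = 4, itb_per_group = 512;   /* assumed example values */
	unsigned long i, overhead = first_data_block;

	/* superblock backups and group descriptor blocks, where present */
	for (i = 0; i < groups; i++)
		overhead += bg_has_super(i) + bg_num_gdb(i);

	/* every group also carries two bitmaps and its inode table */
	overhead += groups * (2 + itb_per_group);

	printf("overhead = %lu blocks\n", overhead);
	return 0;
}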
+ */ + overhead += (sbi->s_groups_count * + (2 + sbi->s_itb_per_group)); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); + } + + buf->f_type = EXT2_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; + buf->f_bfree = ext2_count_free_blocks(sb); + es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = ext2_count_free_inodes(sb); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); + buf->f_namelen = EXT2_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + spin_unlock(&sbi->s_lock); + return 0; +} + +static struct dentry *ext2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; + err = ext2_get_block(inode, blk, &tmp_bh, 0); + if (err < 0) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t ext2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? 
+ sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + err = ext2_get_block(inode, blk, &tmp_bh, 1); + if (err < 0) + goto out; + if (offset || tocopy != EXT2_BLOCK_SIZE(sb)) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext2_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ext2_fs(void) +{ + int err = init_ext2_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext2_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext2_xattr(); + return err; +} + +static void __exit exit_ext2_fs(void) +{ + unregister_filesystem(&ext2_fs_type); + destroy_inodecache(); + exit_ext2_xattr(); +} + +module_init(init_ext2_fs) +module_exit(exit_ext2_fs) diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c new file mode 100644 index 00000000..565cf817 --- /dev/null +++ b/fs/ext2/symlink.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext2/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 symlink handling code + */ + +#include "ext2.h" +#include "xattr.h" +#include + +static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext2_inode_info *ei = EXT2_I(dentry->d_inode); + nd_set_link(nd, (char *)ei->i_data); + return NULL; +} + +const struct inode_operations ext2_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext2_setattr, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext2_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext2_follow_link, + .setattr = ext2_setattr, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c new file mode 100644 index 00000000..52997061 --- /dev/null +++ b/fs/ext2/xattr.c @@ -0,0 +1,1031 @@ +/* + * linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher + * + * Fix by Harrison Xing . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. 
+ * + */ + +/* + * Extended attributes are stored on disk blocks allocated outside of + * any inode. The i_file_acl field is then made to point to this allocated + * block. If all extended attributes of an inode are identical, these + * inodes may share the same extended attribute block. Such situations + * are automatically detected by keeping a cache of recent attribute block + * numbers and hashes over the block's contents in memory. + * + * + * Extended attribute block layout: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The block header is followed by multiple entry descriptors. These entry + * descriptors are variable in size, and aligned to EXT2_XATTR_PAD + * byte boundaries. The entry descriptors are sorted by attribute name, + * so that two extended attribute blocks can be compared efficiently. + * + * Attribute values are aligned to the end of the block, stored in + * no specific order. They are also padded to EXT2_XATTR_PAD byte + * boundaries. No additional gaps are left between them. + * + * Locking strategy + * ---------------- + * EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count will change. Multiple writers to an EA block are synchronized + * by the bh lock. No more than a single bh lock is held at any time + * to avoid deadlocks. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) +#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT2_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%ld: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) 
+#endif + +static int ext2_xattr_set2(struct inode *, struct buffer_head *, + struct ext2_xattr_header *); + +static int ext2_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext2_xattr_cache_find(struct inode *, + struct ext2_xattr_header *); +static void ext2_xattr_rehash(struct ext2_xattr_header *, + struct ext2_xattr_entry *); + +static struct mb_cache *ext2_xattr_cache; + +static const struct xattr_handler *ext2_xattr_handler_map[] = { + [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, +#ifdef CONFIG_EXT2_FS_POSIX_ACL + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, +#endif + [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, +#ifdef CONFIG_EXT2_FS_SECURITY + [EXT2_XATTR_INDEX_SECURITY] = &ext2_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext2_xattr_handlers[] = { + &ext2_xattr_user_handler, + &ext2_xattr_trusted_handler, +#ifdef CONFIG_EXT2_FS_POSIX_ACL + &ext2_xattr_acl_access_handler, + &ext2_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT2_FS_SECURITY + &ext2_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext2_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) + handler = ext2_xattr_handler_map[name_index]; + return handler; +} + +/* + * ext2_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext2_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext2_xattr_entry *entry; + size_t name_len, size; + char *end; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + if (name == NULL) + return -EINVAL; + down_read(&EXT2_I(inode)->xattr_sem); + error = -ENODATA; + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); + end = bh->b_data + bh->b_size; + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* find named attribute */ + name_len = strlen(name); + + error = -ERANGE; + if (name_len > 255) + goto cleanup; + entry = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(entry)) { + struct ext2_xattr_entry *next = + EXT2_XATTR_NEXT(entry); + if ((char *)next >= end) + goto bad_block; + if (name_index == entry->e_name_index && + name_len == entry->e_name_len && + memcmp(name, entry->e_name, name_len) == 0) + goto found; + entry = next; + } + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + error = -ENODATA; + goto cleanup; +found: + /* check the buffer size */ + if (entry->e_value_block != 0) + goto bad_block; + size = le32_to_cpu(entry->e_value_size); + if (size > inode->i_sb->s_blocksize || + 
le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) + goto bad_block; + + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + /* return value of attribute */ + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * ext2_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + struct ext2_xattr_entry *entry; + char *end; + size_t rest = buffer_size; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + down_read(&EXT2_I(inode)->xattr_sem); + error = 0; + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); + end = bh->b_data + bh->b_size; + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + + /* check the on-disk data structure */ + entry = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(entry)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(entry); + + if ((char *)next >= end) + goto bad_block; + entry = next; + } + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + + /* list the attribute names */ + for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); + entry = EXT2_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext2_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + buffer += size; + } + rest -= size; + } + } + error = buffer_size - rest; /* total size */ + +cleanup: + brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext2_xattr_list(dentry, buffer, size); +} + +/* + * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext2_xattr_update_super_block(struct super_block *sb) +{ + if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) + return; + + spin_lock(&EXT2_SB(sb)->s_lock); + EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); + spin_unlock(&EXT2_SB(sb)->s_lock); + sb->s_dirt = 1; + mark_buffer_dirty(EXT2_SB(sb)->s_sbh); +} + +/* + * ext2_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. 
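The "buffer == NULL computes the required size" convention documented for ext2_xattr_list() earlier in this file is what listxattr(2) exposes to user space. A typical two-pass caller looks like the sketch below; the path is an assumed example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ext2/file";   /* assumed example path */
	ssize_t size, i;
	char *names;

	/* First pass: size == 0 asks only for the buffer size required. */
	size = listxattr(path, NULL, 0);
	if (size < 0)
		return 1;

	names = malloc(size);
	if (!names)
		return 1;

	/* Second pass: fill the buffer with NUL-separated attribute names. */
	size = listxattr(path, names, size);
	for (i = 0; i < size; i += strlen(names + i) + 1)
		printf("%s\n", names + i);

	free(names);
	return 0;
}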
Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext2_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; + struct ext2_xattr_header *header = NULL; + struct ext2_xattr_entry *here, *last; + size_t name_len, free, min_offs = sb->s_blocksize; + int not_found = 1, error; + char *end; + + /* + * header -- Points either into bh, or to a temporarily + * allocated buffer. + * here -- The named entry found, or the place for inserting, within + * the block pointed to by header. + * last -- Points right after the last named entry within the block + * pointed to by header. + * min_offs -- The offset of the first value (values are aligned + * towards the end of the block). + * end -- Points right after the block pointed to by header. + */ + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + name_index, name, value, (long)value_len); + + if (value == NULL) + value_len = 0; + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + if (name_len > 255 || value_len > sb->s_blocksize) + return -ERANGE; + down_write(&EXT2_I(inode)->xattr_sem); + if (EXT2_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bh = sb_bread(sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), + le32_to_cpu(HDR(bh)->h_refcount)); + header = HDR(bh); + end = bh->b_data + bh->b_size; + if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + header->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(sb, "ext2_xattr_set", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + here = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(here)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); + if ((char *)next >= end) + goto bad_block; + if (!here->e_value_block && here->e_value_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + not_found = name_index - here->e_name_index; + if (!not_found) + not_found = name_len - here->e_name_len; + if (!not_found) + not_found = memcmp(name, here->e_name,name_len); + if (not_found <= 0) + break; + here = next; + } + last = here; + /* We still need to compute min_offs and last. */ + while (!IS_LAST_ENTRY(last)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); + if ((char *)next >= end) + goto bad_block; + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + last = next; + } + + /* Check whether we have enough space left. */ + free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); + } else { + /* We will use a new extended attribute block. */ + free = sb->s_blocksize - + sizeof(struct ext2_xattr_header) - sizeof(__u32); + here = last = NULL; /* avoid gcc uninitialized warning. */ + } + + if (not_found) { + /* Request to remove a nonexistent attribute? 
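From user space, the create/replace distinction documented above for ext2_xattr_set() is selected through the flags argument of setxattr(2). The path and attribute value below are made-up examples:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ext2/file";   /* assumed example path */
	const char *val = "example";

	/* Fails with EEXIST if user.comment is already present. */
	if (setxattr(path, "user.comment", val, strlen(val), XATTR_CREATE) != 0)
		perror("XATTR_CREATE");

	/* Fails with ENODATA if user.comment does not exist yet. */
	if (setxattr(path, "user.comment", val, strlen(val), XATTR_REPLACE) != 0)
		perror("XATTR_REPLACE");

	return 0;
}

With flags == 0 the attribute is created if missing and replaced otherwise, which is the default path through the handler.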
*/ + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (value == NULL) + goto cleanup; + } else { + /* Request to create an existing attribute? */ + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + if (!here->e_value_block && here->e_value_size) { + size_t size = le32_to_cpu(here->e_value_size); + + if (le16_to_cpu(here->e_value_offs) + size > + sb->s_blocksize || size > sb->s_blocksize) + goto bad_block; + free += EXT2_XATTR_SIZE(size); + } + free += EXT2_XATTR_LEN(name_len); + } + error = -ENOSPC; + if (free < EXT2_XATTR_LEN(name_len) + EXT2_XATTR_SIZE(value_len)) + goto cleanup; + + /* Here we know that we can set the new attribute. */ + + if (header) { + struct mb_cache_entry *ce; + + /* assert(header == HDR(bh)); */ + ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, + bh->b_blocknr); + lock_buffer(bh); + if (header->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "modifying in-place"); + if (ce) + mb_cache_entry_free(ce); + /* keep the buffer locked while modifying it. */ + } else { + int offset; + + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); + ea_bdebug(bh, "cloning"); + header = kmalloc(bh->b_size, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + memcpy(header, HDR(bh), bh->b_size); + header->h_refcount = cpu_to_le32(1); + + offset = (char *)here - bh->b_data; + here = ENTRY((char *)header + offset); + offset = (char *)last - bh->b_data; + last = ENTRY((char *)header + offset); + } + } else { + /* Allocate a buffer where we construct the new block. */ + header = kzalloc(sb->s_blocksize, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + end = (char *)header + sb->s_blocksize; + header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); + header->h_blocks = header->h_refcount = cpu_to_le32(1); + last = here = ENTRY(header+1); + } + + /* Iff we are modifying the block in-place, bh is locked here. */ + + if (not_found) { + /* Insert the new name. */ + size_t size = EXT2_XATTR_LEN(name_len); + size_t rest = (char *)last - (char *)here; + memmove((char *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = name_index; + here->e_name_len = name_len; + memcpy(here->e_name, name, name_len); + } else { + if (!here->e_value_block && here->e_value_size) { + char *first_val = (char *)header + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + char *val = (char *)header + offs; + size_t size = EXT2_XATTR_SIZE( + le32_to_cpu(here->e_value_size)); + + if (size == EXT2_XATTR_SIZE(value_len)) { + /* The old and the new value have the same + size. Just replace. */ + here->e_value_size = cpu_to_le32(value_len); + memset(val + size - EXT2_XATTR_PAD, 0, + EXT2_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, value, value_len); + goto skip_replace; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = ENTRY(header+1); + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT2_XATTR_NEXT(last); + } + } + if (value == NULL) { + /* Remove the old name. */ + size_t size = EXT2_XATTR_LEN(name_len); + last = ENTRY((char *)last - size); + memmove(here, (char*)here + size, + (char*)last - (char*)here); + memset(last, 0, size); + } + } + + if (value != NULL) { + /* Insert the new value. 
*/ + here->e_value_size = cpu_to_le32(value_len); + if (value_len) { + size_t size = EXT2_XATTR_SIZE(value_len); + char *val = (char *)header + min_offs - size; + here->e_value_offs = + cpu_to_le16((char *)val - (char *)header); + memset(val + size - EXT2_XATTR_PAD, 0, + EXT2_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, value, value_len); + } + } + +skip_replace: + if (IS_LAST_ENTRY(ENTRY(header+1))) { + /* This block is now empty. */ + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ + error = ext2_xattr_set2(inode, bh, NULL); + } else { + ext2_xattr_rehash(header, here); + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ + error = ext2_xattr_set2(inode, bh, header); + } + +cleanup: + brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); + up_write(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * Second half of ext2_xattr_set(): Update the file system. + */ +static int +ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + struct ext2_xattr_header *header) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + int error; + + if (header) { + new_bh = ext2_xattr_cache_find(inode, header); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == old_bh) { + ea_bdebug(new_bh, "keeping this block"); + } else { + /* The old block is released after updating + the inode. */ + ea_bdebug(new_bh, "reusing block"); + + error = dquot_alloc_block(inode, 1); + if (error) { + unlock_buffer(new_bh); + goto cleanup; + } + le32_add_cpu(&HDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "refcount now=%d", + le32_to_cpu(HDR(new_bh)->h_refcount)); + } + unlock_buffer(new_bh); + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. No need to lock the block as we + don't need to change the reference count. */ + new_bh = old_bh; + get_bh(new_bh); + ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + ext2_fsblk_t goal = ext2_group_first_block_no(sb, + EXT2_I(inode)->i_block_group); + int block = ext2_new_block(inode, goal, &error); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { + ext2_free_blocks(inode, block, 1); + mark_inode_dirty(inode); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + memcpy(new_bh->b_data, header, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } + mark_buffer_dirty(new_bh); + if (IS_SYNC(inode)) { + sync_dirty_buffer(new_bh); + error = -EIO; + if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) + goto cleanup; + } + } + + /* Update the inode. */ + EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) { + error = sync_inode_metadata(inode, 1); + /* In case sync failed due to ENOSPC the inode was actually + * written (only some dirty data were not) so we just proceed + * as if nothing happened and cleanup the unused block */ + if (error && error != -ENOSPC) { + if (new_bh && new_bh != old_bh) { + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + } + goto cleanup; + } + } else + mark_inode_dirty(inode); + + error = 0; + if (old_bh && old_bh != new_bh) { + struct mb_cache_entry *ce; + + /* + * If there was an old block and we are no longer using it, + * release the old block. 
+ */ + ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, + old_bh->b_blocknr); + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + /* Free the old block. */ + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); + ext2_free_blocks(inode, old_bh->b_blocknr, 1); + mark_inode_dirty(inode); + /* We let our caller release old_bh, so we + * need to duplicate the buffer before. */ + get_bh(old_bh); + bforget(old_bh); + } else { + /* Decrement the refcount only. */ + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + mark_buffer_dirty(old_bh); + ea_bdebug(old_bh, "refcount now=%d", + le32_to_cpu(HDR(old_bh)->h_refcount)); + } + unlock_buffer(old_bh); + } + +cleanup: + brelse(new_bh); + + return error; +} + +/* + * ext2_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. + */ +void +ext2_xattr_delete_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + struct mb_cache_entry *ce; + + down_write(&EXT2_I(inode)->xattr_sem); + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: block %d read error", inode->i_ino, + EXT2_I(inode)->i_file_acl); + goto cleanup; + } + ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + goto cleanup; + } + ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); + lock_buffer(bh); + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); + ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); + get_bh(bh); + bforget(bh); + unlock_buffer(bh); + } else { + le32_add_cpu(&HDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); + mark_buffer_dirty(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + dquot_free_block_nodirty(inode, 1); + } + EXT2_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); + up_write(&EXT2_I(inode)->xattr_sem); +} + +/* + * ext2_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext2_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + + +/* + * ext2_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. 
+ */ +static int +ext2_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); + if (!ce) + return -ENOMEM; + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache (%d cache entries)", + atomic_read(&ext2_xattr_cache->c_entry_count)); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, + atomic_read(&ext2_xattr_cache->c_entry_count)); + mb_cache_entry_release(ce); + } + return error; +} + +/* + * ext2_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext2_xattr_cmp(struct ext2_xattr_header *header1, + struct ext2_xattr_header *header2) +{ + struct ext2_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT2_XATTR_NEXT(entry1); + entry2 = EXT2_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext2_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a locked buffer head to the block found, or NULL if such + * a block was not found or an error occurred. + */ +static struct buffer_head * +ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_cache_find", + "inode %ld: block %ld read error", + inode->i_ino, (unsigned long) ce->e_block); + } else { + lock_buffer(bh); + if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, + le32_to_cpu(HDR(bh)->h_refcount), + EXT2_XATTR_REFCOUNT_MAX); + } else if (!ext2_xattr_cmp(header, HDR(bh))) { + ea_bdebug(bh, "b_count=%d", + atomic_read(&(bh->b_count))); + mb_cache_entry_release(ce); + return bh; + } + unlock_buffer(bh); + brelse(bh); + } + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. 
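The name half of the hash computed by the function that follows is a 5-bit rotate-and-xor over the name bytes. A user-space sketch of just that name part, for reference (the attribute name is an assumed example, and the kernel code additionally folds in the value words):

#include <stdio.h>

#define NAME_HASH_SHIFT 5

/* Rotate left by 5 and xor in each name byte, mirroring the name loop below. */
static unsigned int name_hash_sketch(const char *name, int len)
{
	unsigned int hash = 0;
	int n;

	for (n = 0; n < len; n++)
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (unsigned char)name[n];
	return hash;
}

int main(void)
{
	const char *name = "comment";   /* e_name of a user.comment attribute */
	printf("name hash = %#x\n", name_hash_sketch(name, 7));
	return 0;
}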
+ */ +static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, + struct ext2_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext2_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext2_xattr_rehash(struct ext2_xattr_header *header, + struct ext2_xattr_entry *entry) +{ + struct ext2_xattr_entry *here; + __u32 hash = 0; + + ext2_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT2_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext2_xattr(void) +{ + ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); + if (!ext2_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext2_xattr(void) +{ + mb_cache_destroy(ext2_xattr_cache); +} diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h new file mode 100644 index 00000000..5e41cccf --- /dev/null +++ b/fs/ext2/xattr.h @@ -0,0 +1,127 @@ +/* + File: linux/ext2_xattr.h + + On-disk format of extended attributes for the ext2 filesystem. 
+ + (C) 2001 Andreas Gruenbacher, +*/ + +#include +#include + +/* Magic value in attribute blocks */ +#define EXT2_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT2_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT2_XATTR_INDEX_USER 1 +#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT2_XATTR_INDEX_TRUSTED 4 +#define EXT2_XATTR_INDEX_LUSTRE 5 +#define EXT2_XATTR_INDEX_SECURITY 6 + +struct ext2_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext2_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT2_XATTR_PAD_BITS 2 +#define EXT2_XATTR_PAD (1<e_name_len)) ) +#define EXT2_XATTR_SIZE(size) \ + (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) + +# ifdef CONFIG_EXT2_FS_XATTR + +extern const struct xattr_handler ext2_xattr_user_handler; +extern const struct xattr_handler ext2_xattr_trusted_handler; +extern const struct xattr_handler ext2_xattr_acl_access_handler; +extern const struct xattr_handler ext2_xattr_acl_default_handler; +extern const struct xattr_handler ext2_xattr_security_handler; + +extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); + +extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + +extern void ext2_xattr_delete_inode(struct inode *); +extern void ext2_xattr_put_super(struct super_block *); + +extern int init_ext2_xattr(void); +extern void exit_ext2_xattr(void); + +extern const struct xattr_handler *ext2_xattr_handlers[]; + +# else /* CONFIG_EXT2_FS_XATTR */ + +static inline int +ext2_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline int +ext2_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext2_xattr_delete_inode(struct inode *inode) +{ +} + +static inline void +ext2_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext2_xattr(void) +{ + return 0; +} + +static inline void +exit_ext2_xattr(void) +{ +} + +#define ext2_xattr_handlers NULL + +# endif /* CONFIG_EXT2_FS_XATTR */ + +#ifdef CONFIG_EXT2_FS_SECURITY +extern int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); +#else +static inline int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c new file mode 100644 index 00000000..5d979b43 --- /dev/null +++ b/fs/ext2/xattr_security.c @@ -0,0 +1,76 @@ +/* + * linux/fs/ext2/xattr_security.c + * Handler for storing security labels as extended attributes. 
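The EXT2_XATTR_PAD rounding used by the entry and size macros in xattr.h above works out to 4-byte alignment. The sketch below is illustrative rather than a copy of those macros: it assumes the usual 16-byte fixed part of struct ext2_xattr_entry and shows the on-disk cost of a short attribute.

#include <stdio.h>

#define EXT2_XATTR_PAD_BITS 2
#define EXT2_XATTR_PAD      (1 << EXT2_XATTR_PAD_BITS)   /* 4-byte alignment */
#define EXT2_XATTR_ROUND    (EXT2_XATTR_PAD - 1)

/* Assumed fixed size of struct ext2_xattr_entry before the name bytes. */
#define ENTRY_HEADER_SIZE   16

/* Entry descriptor: header plus name, rounded up to a 4-byte boundary. */
#define XATTR_LEN(name_len) \
	(((name_len) + ENTRY_HEADER_SIZE + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)

/* Value storage: the value itself, rounded up to a 4-byte boundary. */
#define XATTR_SIZE(size) \
	(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)

int main(void)
{
	/* e.g. user.comment: name "comment" (7 bytes), value "hello" (5 bytes) */
	printf("entry descriptor: %d bytes\n", XATTR_LEN(7));   /* 24 */
	printf("value storage:    %d bytes\n", XATTR_SIZE(5));  /* 8  */
	return 0;
}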
+ */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int +ext2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + value, size, flags); +} + +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +const struct xattr_handler ext2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext2_xattr_security_list, + .get = ext2_xattr_security_get, + .set = ext2_xattr_security_set, +}; diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c new file mode 100644 index 00000000..667e46a8 --- /dev/null +++ b/fs/ext2/xattr_trusted.c @@ -0,0 +1,58 @@ +/* + * linux/fs/ext2/xattr_trusted.c + * Handler for trusted extended attributes. 
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int +ext2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +const struct xattr_handler ext2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext2_xattr_trusted_list, + .get = ext2_xattr_trusted_get, + .set = ext2_xattr_trusted_set, +}; diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c new file mode 100644 index 00000000..099d20f4 --- /dev/null +++ b/fs/ext2/xattr_user.c @@ -0,0 +1,62 @@ +/* + * linux/fs/ext2/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include "ext2.h" +#include "xattr.h" + +static size_t +ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext2_xattr_user_list, + .get = ext2_xattr_user_get, + .set = ext2_xattr_user_set, +}; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c new file mode 100644 index 00000000..322a56b2 --- /dev/null +++ b/fs/ext2/xip.c @@ -0,0 +1,92 @@ +/* + * linux/fs/ext2/xip.c + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte (cotte@de.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xip.h" + +static inline int +__inode_direct_access(struct inode *inode, sector_t block, + void **kaddr, 
unsigned long *pfn) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + const struct block_device_operations *ops = bdev->bd_disk->fops; + sector_t sector; + + sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ + + BUG_ON(!ops->direct_access); + return ops->direct_access(bdev, sector, kaddr, pfn); +} + +static inline int +__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, + sector_t *result) +{ + struct buffer_head tmp; + int rc; + + memset(&tmp, 0, sizeof(struct buffer_head)); + rc = ext2_get_block(inode, pgoff, &tmp, create); + *result = tmp.b_blocknr; + + /* did we get a sparse block (hole in the file)? */ + if (!tmp.b_blocknr && !rc) { + BUG_ON(create); + rc = -ENODATA; + } + + return rc; +} + +int +ext2_clear_xip_target(struct inode *inode, sector_t block) +{ + void *kaddr; + unsigned long pfn; + int rc; + + rc = __inode_direct_access(inode, block, &kaddr, &pfn); + if (!rc) + clear_page(kaddr); + return rc; +} + +void ext2_xip_verify_sb(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && + !sb->s_bdev->bd_disk->fops->direct_access) { + sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); + ext2_msg(sb, KERN_WARNING, + "warning: ignoring xip option - " + "not supported by bdev"); + } +} + +int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, + void **kmem, unsigned long *pfn) +{ + int rc; + sector_t block; + + /* first, retrieve the sector number */ + rc = __ext2_get_block(mapping->host, pgoff, create, &block); + if (rc) + return rc; + + /* retrieve address of the target data */ + rc = __inode_direct_access(mapping->host, block, kmem, pfn); + return rc; +} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h new file mode 100644 index 00000000..18b34d2f --- /dev/null +++ b/fs/ext2/xip.h @@ -0,0 +1,26 @@ +/* + * linux/fs/ext2/xip.h + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte (cotte@de.ibm.com) + */ + +#ifdef CONFIG_EXT2_FS_XIP +extern void ext2_xip_verify_sb (struct super_block *); +extern int ext2_clear_xip_target (struct inode *, sector_t); + +static inline int ext2_use_xip (struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + return (sbi->s_mount_opt & EXT2_MOUNT_XIP); +} +int ext2_get_xip_mem(struct address_space *, pgoff_t, int, + void **, unsigned long *); +#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) +#else +#define mapping_is_xip(map) 0 +#define ext2_xip_verify_sb(sb) do { } while (0) +#define ext2_use_xip(sb) 0 +#define ext2_clear_xip_target(inode, chain) 0 +#define ext2_get_xip_mem NULL +#endif diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig new file mode 100644 index 00000000..e8c6ba0e --- /dev/null +++ b/fs/ext3/Kconfig @@ -0,0 +1,89 @@ +config EXT3_FS + tristate "Ext3 journalling file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. 
It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at ). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3" + depends on EXT3_FS + default y + help + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile new file mode 100644 index 00000000..e77766a8 --- /dev/null +++ b/fs/ext3/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux ext3-filesystem routines. 
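+# The conditional objects below mirror the EXT3_FS_XATTR, EXT3_FS_POSIX_ACL
+# and EXT3_FS_SECURITY Kconfig options: the xattr, ACL and security-label
+# handlers are only linked into ext3.o when the matching option is enabled.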
+# + +obj-$(CONFIG_EXT3_FS) += ext3.o + +ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o + +ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c new file mode 100644 index 00000000..9d021c0d --- /dev/null +++ b/fs/ext3/acl.c @@ -0,0 +1,481 @@ +/* + * linux/fs/ext3/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext3_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext3_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext3_acl_header *)value)->a_version != + cpu_to_le32(EXT3_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext3_acl_header); + count = ext3_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext3_acl_entry *entry = + (ext3_acl_entry *)value; + if ((char *)value + sizeof(ext3_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext3_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext3_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext3_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count * + sizeof(ext3_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext3_acl_header); + for (n=0; n < acl->a_count; n++) { + ext3_acl_entry *entry = (ext3_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext3_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext3_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). 
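+ *
+ * Reads the raw ACL from the inode's POSIX ACL extended attribute, converts
+ * it from the on-disk layout with ext3_acl_from_disk() and caches the result
+ * on the inode, so later permission checks can avoid re-reading the xattr.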
+ * + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext3_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + + retval = ext3_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext3_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext3_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext3_new_inode + */ +static int +ext3_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext3_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext3_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext3_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +/* + * Initialize the ACLs of a new inode. Called from ext3_new_inode. 
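+ *
+ * If the parent directory has a default ACL, the new inode inherits it: a new
+ * directory gets it copied as its own default ACL, and the access ACL is
+ * derived from it via posix_acl_create_masq() instead of applying the umask.
+ * Without a default ACL, the usual mode &= ~umask behaviour applies.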
+ * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext3_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext3_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_mutex: down + */ +int +ext3_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext3_journal_start(inode, + EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext3_std_error(inode->i_sb, error); + goto out; + } + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + 
if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext3_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext3_set_acl(handle, inode, type, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler ext3_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ext3_xattr_list_acl_access, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, +}; + +const struct xattr_handler ext3_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ext3_xattr_list_acl_default, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, +}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h new file mode 100644 index 00000000..5faf8048 --- /dev/null +++ b/fs/ext3/acl.h @@ -0,0 +1,77 @@ +/* + File: fs/ext3/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT3_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext3_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext3_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext3_acl_header; + +static inline size_t ext3_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext3_acl_header) + + count * sizeof(ext3_acl_entry_short); + } else { + return sizeof(ext3_acl_header) + + 4 * sizeof(ext3_acl_entry_short) + + (count - 4) * sizeof(ext3_acl_entry); + } +} + +static inline int ext3_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext3_acl_header); + s = size - 4 * sizeof(ext3_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext3_acl_entry_short)) + return -1; + return size / sizeof(ext3_acl_entry_short); + } else { + if (s % sizeof(ext3_acl_entry)) + return -1; + return s / sizeof(ext3_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT3_FS_POSIX_ACL + +/* acl.c */ +extern int ext3_check_acl (struct inode *, int, unsigned int); +extern int ext3_acl_chmod (struct inode *); +extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT3_FS_POSIX_ACL */ +#include +#define ext3_check_acl NULL + +static inline int +ext3_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT3_FS_POSIX_ACL */ + diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c new file mode 100644 index 00000000..fe52297e --- /dev/null +++ b/fs/ext3/balloc.c @@ -0,0 +1,2156 @@ +/* 
+ * linux/fs/ext3/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_fill_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + +/** + * ext3_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext3_group_desc * desc; + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + smp_rmb(); + + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +static int ext3_valid_block_bitmap(struct super_block *sb, + struct ext3_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext3_grpblk_t offset; + ext3_grpblk_t next_zero_bit; + ext3_fsblk_t bitmap_blk; + ext3_fsblk_t group_first_block; + + group_first_block = ext3_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext3_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); + offset = bitmap_blk - 
group_first_block; + if (!ext3_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_table); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext3_find_next_zero_bit(bh->b_data, + offset + EXT3_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext3_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %lu", + block_group, bitmap_blk); + return 0; +} + +/** + * read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. + */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext3_group_desc * desc; + struct buffer_head * bh = NULL; + ext3_fsblk_t bitmap_blk; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext3_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + if (likely(bh_uptodate_or_lock(bh))) + return bh; + + if (bh_submit_read(bh) < 0) { + brelse(bh); + ext3_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + ext3_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ + return bh; +} +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. 
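+ *
+ * A window is reported as bad when its start is at or past its end, or when
+ * it overlaps the previous window in the tree; if any bad window is seen the
+ * walk is restarted in verbose mode and the function ends in BUG_ON().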
+ */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext3_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + BUG_ON(bad); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __func__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext3_fsblk_t group_first_block, group_last_block; + + group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext3_reserve_window_node * +search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext3_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + } + return rsv; +} + +/** + * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree. 
+ * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock hold. + */ +void ext3_rsv_window_add(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext3_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext3_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock hold. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext3_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED; +} + +/** + * ext3_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext3 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext3 inode the first time the open file + * needs a new block. So, before every ext3_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext3_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to call this function. 
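+ *
+ * The initial goal window size is EXT3_DEFAULT_RESERVE_BLOCKS, or 0 when the
+ * filesystem is mounted with reservations disabled; it can later be grown by
+ * alloc_new_reservation() or changed through the EXT3_IOC_SETRSVSZ ioctl.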
+ */ +void ext3_init_block_alloc_info(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext3_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext3_release_file(): last writer close the file + * ext3_clear_inode(): last iput(), when nobody link to this file. + * ext3_truncate(): when the block indirect map is about to change. + * + */ +void ext3_discard_reservation(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/** + * ext3_free_blocks_sb() -- Free given blocks and update quota + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to free + * @count: number of blocks to free + * @pdquot_freed_blocks: pointer to quota + */ +void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + unsigned long block_group; + ext3_grpblk_t bit; + unsigned long i; + unsigned long overflow; + struct ext3_group_desc * desc; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int err = 0, ret; + ext3_grpblk_t group_freed; + + *pdquot_freed_blocks = 0; + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = "E3FSBLK", count = %lu", block, count); + goto error_return; + } + + ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
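+ *
+ * Example (assuming 32768 blocks per group, the 4KiB-block case): freeing
+ * 100 blocks starting at group-relative bit 32700 frees 68 blocks in this
+ * group, then jumps back to do_more to free the remaining 32 blocks at the
+ * start of the following group.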
+ */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = "E3FSBLK", count = %lu", + block, count); + goto error_return; + } + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + jbd_lock_bh_state(bitmap_bh); + + for (i = 0, group_freed = 0; i < count; i++) { + /* + * An HJ special. This is expensive... + */ +#ifdef CONFIG_JBD_DEBUG + jbd_unlock_bh_state(bitmap_bh); + { + struct buffer_head *debug_bh; + debug_bh = sb_find_get_block(sb, block + i); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No committed data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); +#endif + if (need_resched()) { + jbd_unlock_bh_state(bitmap_bh); + cond_resched(); + jbd_lock_bh_state(bitmap_bh); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, + bh2jh(bitmap_bh)->b_committed_data); + + /* + * We clear the bit in the bitmap after setting the committed + * data bit, because this is the reverse order to that which + * the allocator uses. 
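+ * (claim_block() works in the opposite order: it sets the bit in the live
+ * bitmap first and only then looks at b_committed_data, so an allocation
+ * racing with this free either sees the bit still set here or sees the
+ * committed-data bit we set above, and backs off in both cases.)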
+ */ + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + jbd_unlock_bh_state(bitmap_bh); + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + block + i); + jbd_lock_bh_state(bitmap_bh); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + group_freed++; + } + } + jbd_unlock_bh_state(bitmap_bh); + + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&desc->bg_free_blocks_count, group_freed); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_add(&sbi->s_freeblocks_counter, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + *pdquot_freed_blocks += group_freed; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +/** + * ext3_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + */ +void ext3_free_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count) +{ + struct super_block * sb; + unsigned long dquot_freed_blocks; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); + if (dquot_freed_blocks) + dquot_free_block(inode, dquot_freed_blocks); + return; +} + +/** + * ext3_test_allocatable() + * @nr: given allocation block group + * @bh: bufferhead contains the bitmap of the given block group + * + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ +static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) +{ + int ret; + struct journal_head *jh = bh2jh(bh); + + if (ext3_test_bit(nr, bh->b_data)) + return 0; + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) + ret = 1; + else + ret = !ext3_test_bit(nr, jh->b_committed_data); + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward alternately through the actual + * bitmap on disk and the last-committed copy in journal, until we find a + * bit free in both bitmaps. 
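+ *
+ * Returns the group-relative bit number of a block that is free in both
+ * copies, or -1 if no such block exists before @maxblocks.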
+ */ +static ext3_grpblk_t +bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) +{ + ext3_grpblk_t next; + struct journal_head *jh = bh2jh(bh); + + while (start < maxblocks) { + next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + jbd_lock_bh_state(bh); + if (jh->b_committed_data) + start = ext3_find_next_zero_bit(jh->b_committed_data, + maxblocks, next); + jbd_unlock_bh_state(bh); + } + return -1; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We honor both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. + */ +static ext3_grpblk_t +find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) +{ + ext3_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + ext3_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3_test_allocatable(here, bh)) + return here; + ext3_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = bh->b_data + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - bh->b_data) << 3; + + if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) + return next; + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * claim_block() + * @lock: the spin lock for this block group + * @block: the free block (group relative) to allocate + * @bh: the buffer_head contains the block group bitmap + * + * We think we can allocate this block in this bitmap. Try to set the bit. + * If that succeeds then check that nobody has allocated and then freed the + * block since we saw that is was not marked in b_committed_data. If it _was_ + * allocated and freed then clear the bit in the bitmap again and return + * zero (failure). 
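+ *
+ * The claim itself is an atomic test-and-set on the bitmap buffer; only the
+ * check against the last-committed copy needs the bh_state lock.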
+ */ +static inline int +claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + int ret; + + if (ext3_set_bit_atomic(lock, block, bh->b_data)) + return 0; + jbd_lock_bh_state(bh); + if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) { + ext3_clear_bit_atomic(lock, block, bh->b_data); + ret = 0; + } else { + ret = 1; + } + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * ext3_try_to_allocate() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) from the + * file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, ends at + * the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. In that case we must release write access to the old one via + * ext3_journal_release_buffer(), else we'll run out of credits. + */ +static ext3_grpblk_t +ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, + struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal, + unsigned long *count, struct ext3_reserve_window *my_rsv) +{ + ext3_fsblk_t group_first_block; + ext3_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext3_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT3_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT3_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT3_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + ext3_test_allocatable(grp_goal - 1, + bitmap_bh); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && ext3_test_allocatable(grp_goal, bitmap_bh) + && claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable 
space within the given range. + * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @my_rsv: the reservation window + * + * @sb: the super block + * + * @start_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext3_reserve_window_node *search_head, + struct ext3_reserve_window_node *my_rsv, + struct super_block * sb, + ext3_fsblk_t start_block, + ext3_fsblk_t last_block) +{ + struct rb_node *next; + struct ext3_reserve_window_node *rsv, *prev; + ext3_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole available window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. 
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @my_rsv: the reservation window + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a grp_goal(grp_goal >0 ), then start from there, + * no grp_goal(grp_goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, + ext3_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext3_reserve_window_node *search_head; + ext3_fsblk_t group_first_block, group_end_block, start_block; + ext3_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + + group_first_block = ext3_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. 
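+ *
+ * Returning -1 in that case is not an out-of-space signal by itself: the
+ * caller keeps the existing window and simply retries in the next group.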
+ */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT3_MAX_RESERVE_BLOCKS) + size = EXT3_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * searching the first free bit on the block bitmap and copy of + * last committed bitmap alternatively, until we found a allocatable + * block. Search start from the start block of the reservable space + * we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + return 0; /* success */ + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext3_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. 
To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext3_new_blocks() tries to allocate blocks, + */ +static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext3_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext3_try_to_allocate_with_rsv() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @my_rsv: reservation window + * @count: target number of blocks to allocate + * @errp: pointer to store the error code + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + * + */ +static ext3_grpblk_t +ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, + unsigned int group, struct buffer_head *bitmap_bh, + ext3_grpblk_t grp_goal, + struct ext3_reserve_window_node * my_rsv, + unsigned long *count, int *errp) +{ + ext3_fsblk_t group_first_block, group_last_block; + ext3_grpblk_t ret = 0; + int fatal; + unsigned long num = *count; + + *errp = 0; + + /* + * Make sure we use undo access for the bitmap, because it is critical + * that we do the frozen_data COW on bitmap buffers in all cases even + * if the buffer is in BJ_Forget state in the committing transaction. 
+ */ + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + fatal = ext3_journal_get_undo_access(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL ) { + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, count, NULL); + goto out; + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. + */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } +out: + if (ret >= 0) { + BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " + "bitmap block"); + fatal = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + return ret; + } + + BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); + ext3_journal_release_buffer(handle, bitmap_bh); + return ret; +} + +/** + * ext3_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. 
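+ *
+ * Blocks reserved for the superuser (s_r_blocks_count) only count as free
+ * for CAP_SYS_RESOURCE tasks or for the uid/gid given by the resuid/resgid
+ * mount options, so ordinary users see an almost-full filesystem as full.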
+ */ +static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +{ + ext3_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current_fsuid() && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/** + * ext3_should_retry_alloc() + * @sb: super block + * @retries: number of attempts that have been made + * + * ext3_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or committing transaction to complete, and then + * return TRUE. + * + * If the total number of retries exceeds three, return FALSE. + */ +int ext3_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return journal_force_commit_nested(EXT3_SB(sb)->s_journal); +} + +/** + * ext3_new_blocks() -- core block(s) allocation function + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block (filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext3_new_blocks uses a goal block to assist allocation. It tries to + * allocate block(s) from the block group that contains the goal block first. + * If that fails, it will try to allocate block(s) from other block groups + * without any specific goal block. + * + */ +ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block */ + ext3_fsblk_t ret_block; /* filesystem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int fatal = 0, err; + int performed_allocation = 0; + ext3_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct ext3_sb_info *sbi; + struct ext3_reserve_window_node *my_rsv = NULL; + struct ext3_block_alloc_info *block_i; + unsigned short windowsz = 0; +#ifdef EXT3FS_DEBUG + static int goal_hits, goal_attempts; +#endif + unsigned long ngroups; + unsigned long num = *count; + + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk("ext3_new_block: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block.
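+ * The full request of *count blocks is charged against the quota up front; + * whatever ends up not being allocated is handed back to the quota with + * dquot_free_block() before returning.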
+ */ + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; + return 0; + } + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + ext3_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation (default, -o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (one can use the ioctl + * command EXT3_IOC_SETRSVSZ to set the window size to 0 and turn off + * reservation on that particular file) + */ + block_i = EXT3_I(inode)->i_block_alloc_info; + if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) + my_rsv = &block_i->rsv_window_node; + + if (!ext3_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there are not enough free blocks to make a new reservation, + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, grp_target_blk, + my_rsv, &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * group_no and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (my_rsv && (free_blocks <= (windowsz/2))) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal (-1). + */ + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, -1, my_rsv, + &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up with a bogus earlier ENOSPC error due to the + * filesystem being "full" of reservations, while there may in fact + * be free blocks available on disk. In that case we just forget + * about the reservations and do the block allocation as if there + * were no reservations.
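+ * The retry below drops the window (my_rsv = NULL, windowsz = 0) and + * rescans every group starting from goal_group again, this time taking + * free bits straight from the bitmaps.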
+ */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext3_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + BUFFER_TRACE(gdp_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, gdp_bh); + if (fatal) + goto out; + + ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) { + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "blocks from "E3FSBLK", length %lu", + ret_block, num); + /* + * claim_block() marked the blocks we allocated as in use. So we + * may want to selectively mark some of the blocks as free. + */ + goto retry_alloc; + } + + performed_allocation = 1; + +#ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = sb_find_get_block(sb, ret_block); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); + brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); + spin_lock(sb_bgl_lock(sbi, group_no)); + if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { + int i; + + for (i = 0; i < num; i++) { + if (ext3_test_bit(grp_alloc_blk+i, + bh2jh(bitmap_bh)->b_committed_data)) { + printk("%s: block was unexpectedly set in " + "b_committed_data\n", __func__); + } + } + } + ext3_debug("found bit %d\n", grp_alloc_blk); + spin_unlock(sb_bgl_lock(sbi, group_no)); + jbd_unlock_bh_state(bitmap_bh); +#endif + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext3_error(sb, "ext3_new_block", + "block("E3FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + ext3_debug("allocating block %lu. Goal hits %d of %d.\n", + ret_block, goal_hits, goal_attempts); + + spin_lock(sb_bgl_lock(sbi, group_no)); + le16_add_cpu(&gdp->bg_free_blocks_count, -num); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); + err = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!fatal) + fatal = err; + + if (fatal) + goto out; + + *errp = 0; + brelse(bitmap_bh); + dquot_free_block(inode, *count-num); + *count = num; + return ret_block; + +io_error: + *errp = -EIO; +out: + if (fatal) { + *errp = fatal; + ext3_std_error(sb, fatal); + } + /* + * Undo the block allocation + */ + if (!performed_allocation) + dquot_free_block(inode, *count); + brelse(bitmap_bh); + return 0; +} + +ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + + return ext3_new_blocks(handle, inode, goal, &count, errp); +} + +/** + * ext3_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. 
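+ * + * With EXT3FS_DEBUG the count is also recomputed from the block bitmaps, + * both totals are printed for comparison and the bitmap count is returned; + * otherwise the group descriptor counts are simply summed up.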
+ */ +ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) +{ + ext3_fsblk_t desc_count; + struct ext3_group_desc *gdp; + int i; + unsigned long ngroups = EXT3_SB(sb)->s_groups_count; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + ext3_fsblk_t bitmap_count; + unsigned long x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext3_count_free(bitmap_bh, sb->s_blocksize); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_blocks: stored = "E3FSBLK + ", computed = "E3FSBLK", "E3FSBLK"\n", + le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + } + + return desc_count; +#endif +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext3_group_sparse(int group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext3_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext3_bg_has_super(struct super_block *sb, int group) +{ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext3_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group) +{ + unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); + unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb); + unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group) +{ + return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0; +} + +/** + * ext3_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext3_bg_num_gdb_nometa(sb,group); + + return ext3_bg_num_gdb_meta(sb,group); + +} + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. 
group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start < max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next < max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + free_blocks -= next - start; + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + 
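/* Give up if the caller was interrupted; otherwise yield the CPU + * before scanning for the next free extent. + */ +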
if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if (free_blocks < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block, free_blocks; + unsigned long first_group, last_group; + unsigned long group, ngroups; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, len, minlen, trimmed; + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = (range->start >> sb->s_blocksize_bits) + + le32_to_cpu(es->s_first_data_block); + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; + + if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start >= max_blks) + goto out; + if (start + len > max_blks) + len = max_blks - start; + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT3_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks < minlen) + continue; + + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case first_block + + * len < EXT3_BLOCKS_PER_GROUP(sb). 
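+ * + * For example, assuming 32768 blocks per group and block 0 as the first + * data block: a request to trim blocks 40960 through 73727 (after the + * byte range has been converted to blocks) gives first_group 1 with + * first_block 8192 and last_group 2; group 1 is then trimmed up to the + * end of the group and the remaining 8192 blocks are trimmed from the + * start of group 2.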
+ */ + if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) + last_block = first_block + len; + len -= last_block - first_block; + + ret = ext3_trim_all_free(sb, group, first_block, + last_block, minlen); + if (ret < 0) + break; + + trimmed += ret; + first_block = 0; + } + + if (ret >= 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c new file mode 100644 index 00000000..6afc39d8 --- /dev/null +++ b/fs/ext3/bitmap.c @@ -0,0 +1,32 @@ +/* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include + +#ifdef EXT3FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} + +#endif /* EXT3FS_DEBUG */ + diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c new file mode 100644 index 00000000..34f0a072 --- /dev/null +++ b/fs/ext3/dir.c @@ -0,0 +1,519 @@ +/* + * linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include +#include +#include + +static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext3_readdir(struct file *, void *, filldir_t); +static int ext3_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir); +static int ext3_release_dir (struct inode * inode, + struct file * filp); + +const struct file_operations ext3_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, /* we take BKL. 
needed?*/ + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, /* BKL held */ + .release = ext3_release_dir, +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3_FT_MAX)) + return DT_UNKNOWN; + + return (ext3_filetype_table[filetype]); +} + + +int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char * error_msg = NULL; + const int rlen = ext3_rec_len_from_disk(de->rec_len); + + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) + error_msg = "directory entry across blocks"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + + if (unlikely(error_msg != NULL)) + ext3_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + + return error_msg == NULL ? 1 : 0; +} + +static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset; + int i, stored; + struct ext3_dir_entry_2 *de; + struct super_block *sb; + int err; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + int dir_has_error = 0; + + sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext3_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + } + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + struct buffer_head *bh = NULL; + + map_bh.b_state = 0; + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); + if (err > 0) { + pgoff_t index = map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&filp->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, filp, + index, 1); + filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext3_bread(NULL, inode, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, filp->f_pos); + dir_has_error = 1; + } + /* corrupt size? 
Maybe no more blocks to read */ + if (filp->f_pos > inode->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext3_rec_len_from_disk(de->rec_len) < + EXT3_DIR_REC_LEN(1)) + break; + i += ext3_rec_len_from_disk(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); + if (!ext3_check_dir_entry ("ext3_readdir", inode, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + ret = stored; + goto out; + } + offset += ext3_rec_len_from_disk(de->rec_len); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += ext3_rec_len_from_disk(de->rec_len); + } + offset = 0; + brelse (bh); + } +out: + return ret; +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. 
Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname * old = fname; + fname = fname->next; + kfree (old); + } + if (!parent) + *root = RB_ROOT; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } +} + + +static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + return p; +} + +void ext3_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname * fname, *new_fn; + struct dir_private_info *info; + int len; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext3_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) 
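+ * + * All entries on the chain share the same major/minor hash and thus the + * same f_pos, so if filldir rejects one of them the rest of the chain is + * remembered in info->extra_fname and emitted first on the next call.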
+ */ +static int call_filldir(struct file * filp, void * dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block * sb; + int error; + + sb = inode->i_sb; + + if (!fname) { + printk("call_filldir: called with null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext3_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = ext3_htree_create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT3_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. 
+ */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext3_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT3_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext3_release_dir (struct inode * inode, struct file * filp) +{ + if (filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c new file mode 100644 index 00000000..d401f148 --- /dev/null +++ b/fs/ext3/ext3_jbd.c @@ -0,0 +1,59 @@ +/* + * Interface between ext3 and JBD + */ + +#include + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_write_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_forget(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh) +{ + int err = journal_revoke(handle, blocknr, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_get_create_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_metadata(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} diff --git a/fs/ext3/file.c b/fs/ext3/file.c new file mode 100644 index 00000000..f55df0e6 --- /dev/null +++ b/fs/ext3/file.c @@ -0,0 +1,85 @@ +/* + * linux/fs/ext3/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include +#include 
"xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext3_release_file (struct inode * inode, struct file * filp) +{ + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { + filemap_flush(inode->i_mapping); + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + { + mutex_lock(&EXT3_I(inode)->truncate_mutex); + ext3_discard_reservation(inode); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + } + if (is_dx(inode) && filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} + +const struct file_operations ext3_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = dquot_file_open, + .release = ext3_release_file, + .fsync = ext3_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations ext3_file_inode_operations = { + .truncate = ext3_truncate, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext3_check_acl, + .fiemap = ext3_fiemap, +}; + diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c new file mode 100644 index 00000000..09b13bb3 --- /dev/null +++ b/fs/ext3/fsync.c @@ -0,0 +1,95 @@ +/* + * linux/fs/ext3/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * akpm: A new design for ext3_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. 
+ */ + +int ext3_sync_file(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + int ret, needs_barrier = 0; + tid_t commit_tid; + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + + J_ASSERT(ext3_journal_current_handle() == NULL); + + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext3_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext3_should_journal_data(inode)) + return ext3_force_commit(inode->i_sb); + + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); + + if (test_opt(inode->i_sb, BARRIER) && + !journal_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = 1; + log_start_commit(journal, commit_tid); + ret = log_wait_commit(journal, commit_tid); + + /* + * In case we didn't commit a transaction, we have to flush + * disk caches manually so that data really is on persistent + * storage + */ + if (needs_barrier) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return ret; +} diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c new file mode 100644 index 00000000..7d215b4d --- /dev/null +++ b/fs/ext3/hash.c @@ -0,0 +1,208 @@ +/* + * linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. 
+ */ + +#include +#include +#include +#include + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
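+ * + * Note the deliberate fall-throughs in the switch below: the + * DX_HASH_HALF_MD4_UNSIGNED and DX_HASH_TEA_UNSIGNED cases only switch + * the string-to-buffer helper to str2hashbuf_unsigned and then share the + * hashing loop of their signed counterparts.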
+ */ +int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i=0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT3_HTREE_EOF << 1)) + hash = (EXT3_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c new file mode 100644 index 00000000..0b3da7cc --- /dev/null +++ b/fs/ext3/ialloc.c @@ -0,0 +1,768 @@ +/* + * linux/fs/ext3/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext3_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext3_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +/* + * NOTE! 
When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext3_free_inode (handle_t *handle, struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + unsigned long block_group; + unsigned long bit; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int fatal = 0, err; + + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3_free_inode: inode on nonexistent device\n"); + return; + } + sbi = EXT3_SB(sb); + + ino = inode->i_ino; + ext3_debug ("freeing inode %lu\n", ino); + + is_directory = S_ISDIR(inode->i_mode); + + es = EXT3_SB(sb)->s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit, bitmap_bh->b_data)) + ext3_error (sb, "ext3_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + if (gdp) { + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&gdp->bg_free_inodes_count, 1); + if (is_directory) + le16_add_cpu(&gdp->bg_used_dirs_count, -1); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + + } + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!fatal) + fatal = err; + +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. 
+ * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT3_SB(sb)->s_groups_count; + unsigned int freei, avefreei; + struct ext3_group_desc *desc, *best_desc = NULL; + int group, best_group = -1; + + freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are block groups with both free inode and free block counts + * no worse than average, we return the one with the smallest directory count. + * Otherwise we simply return a random group. + * + * The remaining rules are as follows: + * + * It's OK to put a directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large a debt (max_debt). + * The parent's group is preferred; if it doesn't satisfy these + * conditions, we search cyclically through the rest. If none + * of the groups looks good, we just look for a group with more + * free inodes than average (starting at the parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255.
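+ * + * As a rough illustration, assuming 32768 blocks and 16384 inodes per + * group with directories averaging about 100 allocated blocks each: + * max_debt = 32768 / max(100, BLOCK_COST) = 128, which already satisfies + * the 128 * INODE_COST <= 16384 test and the [1, 255] clamp. Note that + * in this version max_debt is computed but the selection loop only + * checks max_dirs, min_inodes and min_blocks.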
+ */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT3_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext3_fsblk_t freeb, avefreeb; + ext3_fsblk_t blocks_per_dir; + unsigned int ndirs; + int max_debt, max_dirs, min_inodes; + ext3_grpblk_t min_blocks; + int group = -1, i; + struct ext3_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { + int best_ndir = inodes_per_group; + int best_group = -1; + + get_random_bytes(&group, sizeof(group)); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + } + if (best_group >= 0) + return best_group; + goto fallback; + } + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + return group; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + return group; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + int ngroups = EXT3_SB(sb)->s_groups_count; + struct ext3_group_desc *desc; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + + /* + * We're going to place this inode in a different 
blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + return group; + } + + return -1; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, int mode) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group; + unsigned long ino = 0; + struct inode * inode; + struct ext3_group_desc * gdp = NULL; + struct ext3_super_block * es; + struct ext3_inode_info *ei; + struct ext3_sb_info *sbi; + int err = 0; + struct inode *ret; + int i; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3_I(inode); + + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + err = -ENOSPC; + if (group == -1) + goto out; + + for (i = 0; i < sbi->s_groups_count; i++) { + err = -EIO; + + gdp = ext3_get_group_desc(sb, group, &bh2); + if (!gdp) + goto fail; + + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = 0; + +repeat_in_this_group: + ino = ext3_find_next_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); + if (ino < EXT3_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto fail; + + if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + goto got; + } + /* we lost it */ + journal_release_buffer(handle, bitmap_bh); + + if (++ino < EXT3_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. 
We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == sbi->s_groups_count) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + ino += group * EXT3_INODES_PER_GROUP(sb) + 1; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d, inode=%lu", group, ino); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } + spin_unlock(sb_bgl_lock(sbi, group)); + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; +#endif + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + + ext3_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + handle->h_sync = 1; + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. 
+ */ + err = -EIO; + goto fail; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state_flags = 0; + ext3_set_inode_state(inode, EXT3_STATE_NEW); + + /* See comment in ext3_iget for explanation */ + if (ino >= EXT3_FIRST_INO(sb) + 1 && + EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = + sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; + } else { + ei->i_extra_isize = 0; + } + + ret = inode; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext3_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext3_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); + goto fail_free_drop; + } + + ext3_debug("allocating inode %lu\n", inode->i_ino); + goto really_out; +fail: + ext3_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + unlock_new_inode(inode); + iput(inode); + brelse(bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3_warning(sb, __func__, + "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext3_warning(sb, __func__, + "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext3_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext3_warning(sb, __func__, + "bad orphan inode %lu! 
e2fsck was run?", ino); + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext3_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext3_count_free_inodes (struct super_block * sb) +{ + unsigned long desc_count; + struct ext3_group_desc *gdp; + int i; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext3_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c new file mode 100644 index 00000000..db9ba1a3 --- /dev/null +++ b/fs/ext3/inode.c @@ -0,0 +1,3516 @@ +/* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +static int ext3_writepage_trans_blocks(struct inode *inode); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext3_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT3_I(inode)->i_file_acl ? 
+ (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + return ext3_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3_journal_revoke"); + err = ext3_journal_revoke(handle, blocknr, bh); + if (err) + ext3_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext3 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + + return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. 
+ */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) +{ + int ret; + + jbd_debug(2, "restarting handle %p\n", handle); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; +} + +/* + * Called at inode eviction from icache + */ +void ext3_evict_inode (struct inode *inode) +{ + struct ext3_block_alloc_info *rsv; + handle_t *handle; + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + want_delete = 1; + } + + truncate_inode_pages(&inode->i_data, 0); + + ext3_discard_reservation(inode); + rsv = EXT3_I(inode)->i_block_alloc_info; + EXT3_I(inode)->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); + + if (!want_delete) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext3_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record which ext3_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); + EXT3_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. 
+ */ + if (ext3_mark_inode_dirty(handle, inode)) { + /* If that failed, just dquot_drop() and be done with that */ + dquot_drop(inode); + end_writeback(inode); + } else { + ext3_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + end_writeback(inode); + ext3_free_inode(handle, inode); + } + ext3_journal_stop(handle); + return; +no_delete: + end_writeback(inode); + dquot_drop(inode); +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
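+ *
+ * A worked example, assuming a 1KB block size (12 direct slots in the
+ * inode and 256 pointers per indirect block): logical block 5 is direct,
+ * so the path is simply {5}; block 200 maps to {EXT3_IND_BLOCK, 188}
+ * because 200 - 12 = 188 fits within one indirect block; block 70000
+ * needs the triple indirect tree and maps to {EXT3_TIND_BLOCK, 0, 16, 100}
+ * because 70000 - 12 - 256 - 65536 = 4196 = 16*256 + 100.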
+ */ + +static int ext3_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
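+ *
+ * As an illustration of the layout described above: a block reached
+ * through a double indirect block comes back as a chain of depth 3.
+ * chain[0].p points into the inode's i_data[] (so chain[0].bh is NULL),
+ * chain[1] and chain[2] describe the two indirect blocks read from disk
+ * (their bh fields hold the corresponding buffer_heads), and chain[2].key
+ * holds the little-endian number of the data block itself.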
+ */ +static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext3_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + */ + +static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, + Indirect *partial) +{ + struct ext3_block_alloc_info *block_i; + + block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext3_find_near(inode, partial); +} + +/** + * ext3_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. 
+ * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext3_alloc_blocks - multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: owner + * @goal: preferred place for allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of blocks need to allocated for direct blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: here we store the error value + * + * return the number of direct blocks allocated + */ +static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext3_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext3_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext3_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. 
+ */ +static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int indirect_blks, int *blks, ext3_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext3_fsblk_t new_blocks[4]; + ext3_fsblk_t current_block; + + num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + for (i = 1; i <= n ; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext3_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + int err = 0; + struct ext3_block_alloc_info *block_i; + ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); + + block_i = ei->i_block_alloc_info; + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); + } + ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. 
+ */ +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext3_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext3_inode_info *ei = EXT3_I(inode); + int count = 0; + ext3_fsblk_t first_block = 0; + + + J_ASSERT(handle != NULL || create == 0); + depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext3_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + + /* + * If the indirect block is missing while we are reading + * the chain(ext3_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext3_init_block_alloc_info(inode); + + goal = ext3_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext3_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * Block out ext3_truncate while we alter the tree + */ + err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. 
--sct + */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 +/* + * Number of credits we need for writing DIO_MAX_BLOCKS: + * We need sb + group descriptor + bitmap + inode -> 4 + * For B blocks with A block pointers per block we need: + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. + */ +#define DIO_CREDITS 25 + +static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = ext3_journal_current_handle(); + int ret = 0, started = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + if (create && !handle) { /* Direct IO write... */ + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + handle = ext3_journal_start(inode, DIO_CREDITS + + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; + } + + ret = ext3_get_blocks_handle(handle, inode, iblock, + max_blocks, bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + if (started) + ext3_journal_stop(handle); +out: + return ret; +} + +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + err = ext3_get_blocks_handle(handle, inode, block, 1, + &dummy, create); + /* + * ext3_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (!bh) { + *errp = -EIO; + goto err; + } + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext3_get_block instead, so it's not a + * problem. 
+ */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data,0,inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } +err: + return NULL; +} + +struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext3_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. 
+ */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; +} + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + +static int ext3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + unlock_page(page); + page_cache_release(page); + ret = PTR_ERR(handle); + goto out; + } + ret = __block_write_begin(page, pos, len, ext3_get_block); + if (ret) + goto write_begin_failed; + + if (ext3_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } +write_begin_failed: + if (ret) { + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + ext3_journal_abort_handle(__func__, __func__, + bh, handle, err); + return err; +} + +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext3_journal_dirty_metadata(handle, bh); +} + +/* + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. 
+ */ +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) +{ + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; + mark_inode_dirty(inode); + } +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext3 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext3_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = file->f_mapping->host; + unsigned from, to; + int ret = 0, ret2; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, journal_dirty_data_fn); + + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +static int ext3_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = file->f_mapping->host; + int ret; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +static int ext3_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from + copied, to); + to = from + copied; + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. 
+ */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + if (inode->i_size > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext3 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext3_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT3_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); + journal = EXT3_JOURNAL(inode); + journal_lock_updates(journal); + err = journal_flush(journal); + journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping,block,ext3_get_block); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +/* + * Note that we always start a transaction even if we're not journalling + * data. This is to preserve ordering: any hole instantiation within + * __block_write_full_page -> ext3_get_block() should be journalled + * along with the data so we don't crash and then get metadata which + * refers to old data. + * + * In all journalling modes block_write_full_page() will start the I/O. + * + * Problem: + * + * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext3_writepage() + * + * Similar for: + * + * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * + * Same applies to ext3_get_block(). We will deadlock on various things like + * lock_journal and i_truncate_mutex. + * + * Setting PF_MEMALLOC here doesn't work - too many internal memory + * allocations fail. 
+ * + * 16May01: If we're reentered then journal_current_handle() will be + * non-zero. We simply *return*. + * + * 1 July 2001: @@@ FIXME: + * In journalled data mode, a data buffer may be metadata against the + * current transaction. But the same file is part of a shared mapping + * and someone does a writepage() on it. + * + * We will move the buffer onto the async_data list, but *after* it has + * been dirtied. So there's a small window where we have dirty data on + * BJ_Metadata. + * + * Note that this only applies to the last partial page in the file. The + * bit which block_write_full_page() uses prepare/commit for. (That's + * broken code anyway: it's wrong for msync()). + * + * It's a rare case: affects the final partial page, for journalled data + * where the file is subject to bith write() and writepage() in the same + * transction. To fix it we'll need a custom block_write_full_page(). + * We'll probably need that anyway for journalling writepage() output. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + * + * AKPM2: if all the page's buffers are mapped to disk and !data=journal, + * we don't need to open a transaction here. + */ +static int ext3_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (ext3_journal_current_handle()) + goto out_fail; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + + ret = block_write_full_page(page, ext3_get_block, wbc); + + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_bufs is + * safe due to elevated refcount. + */ + + /* + * And attach them to the current transaction. But only if + * block_write_full_page() succeeded. Otherwise they are unmapped, + * and generally junk. 
+ */ + if (ret == 0) { + err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, journal_dirty_data_fn); + if (!ret) + ret = err; + } + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + if (ext3_journal_current_handle()) + goto out_fail; + + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + ret = block_write_full_page(page, ext3_get_block, wbc); + + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } + + if (!page_has_buffers(page) || PageChecked(page)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); + if (ret != 0) { + ext3_journal_stop(handle); + goto out_unlock; + } + ret = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + unlock_page(page); + } else { + /* + * It may be a page full of checkpoint-mode buffers. We don't + * really know unless we go poke around in the buffer_heads. + * But block_write_full_page will do the right thing. 
+ */ + ret = block_write_full_page(page, ext3_get_block, wbc); + } + err = ext3_journal_stop(handle); + if (!ret) + ret = err; +out: + return ret; + +no_write: + redirty_page_for_writepage(wbc, page); +out_unlock: + unlock_page(page); + goto out; +} + +static int ext3_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext3_get_block); +} + +static int +ext3_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); +} + +static void ext3_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + journal_invalidatepage(journal, page, offset); +} + +static int ext3_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + return journal_try_to_free_buffers(journal, page, wait); +} + +/* + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext3_orphan_add(handle, inode); + if (ret) { + ext3_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext3_journal_stop(handle); + } + } + +retry: + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext3_get_block, NULL); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... 
*/ + ext3_truncate(inode); + ret = PTR_ERR(handle); + goto out; + } + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext3_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext3_mark_inode_dirty(handle, inode); + } + } + err = ext3_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext3's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext3_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext3_ordered_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext3_writeback_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext3_journalled_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext3_set_aops(struct inode *inode) +{ + if (ext3_should_order_data(inode)) + inode->i_mapping->a_ops = &ext3_ordered_aops; + else if (ext3_should_writeback_data(inode)) + inode->i_mapping->a_ops = &ext3_writeback_aops; + else + inode->i_mapping->a_ops = &ext3_journalled_aops; +} + +/* + * ext3_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
+ */ +static int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) +{ + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext3_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext3_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3_should_journal_data(inode)) { + err = ext3_journal_dirty_metadata(handle, bh); + } else { + if (ext3_should_order_data(inode)) + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext3_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. 
Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext3_find_shared(struct inode *inode, int depth, + int offsets[4], Indirect chain[4], __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while(partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void ext3_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) +{ + __le32 *p; + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + if (ext3_journal_dirty_metadata(handle, bh)) + return; + } + ext3_mark_inode_dirty(handle, inode); + truncate_restart_transaction(handle, inode); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + if (ext3_journal_get_write_access(handle, bh)) + return; + } + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! 
+ */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = sb_find_get_block(inode->i_sb, nr); + ext3_forget(handle, 0, inode, bh, nr); + } + } + + ext3_free_blocks(handle, inode, block_to_free, count); +} + +/** + * ext3_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext3_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext3_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); + } +} + +/** + * ext3_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. 
+ */ +static void ext3_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext3_fsblk_t nr; + __le32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3_error(inode->i_sb, "ext3_free_branches", + "Read failure, inode=%lu, block="E3FSBLK, + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext3_free_branches(handle, inode, bh, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3_mark_inode_dirty(handle, inode); + truncate_restart_transaction(handle, inode); + } + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + + ext3_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. 
*/ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3_free_data(handle, inode, parent_bh, first, last); + } +} + +int ext3_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext3_truncate() + * + * We block out ext3_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ +void ext3_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long last_block; + unsigned blocksize = inode->i_sb->s_blocksize; + struct page *page; + + if (!ext3_can_truncate(inode)) + goto out_notrans; + + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + + /* + * We have to lock the EOF page here, because lock_page() nests + * outside journal_start(). + */ + if ((inode->i_size & (blocksize - 1)) == 0) { + /* Block boundary? Nothing to do */ + page = NULL; + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + goto out_notrans; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } + goto out_notrans; + } + + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + if (page) + ext3_block_truncate_page(handle, page, mapping, inode->i_size); + + n = ext3_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. 
+ * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + + ext3_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_evict_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle); + return; +out_notrans: + /* + * Delete the inode from orphan list so that it doesn't stay there + * forever and trigger assertion on umount. 
+ */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); +} + +static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, + unsigned long ino, struct ext3_iloc *iloc) +{ + unsigned long block_group; + unsigned long offset; + ext3_fsblk_t block; + struct ext3_group_desc *gdp; + + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ + return 0; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + gdp = ext3_get_group_desc(sb, block_group, NULL); + if (!gdp) + return 0; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * + EXT3_INODE_SIZE(sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); + + iloc->block_group = block_group; + iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); + return block; +} + +/* + * ext3_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) +{ + ext3_fsblk_t block; + struct buffer_head *bh; + + block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); + if (!block) + return -EIO; + + bh = sb_getblk(inode->i_sb, block); + if (!bh) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + struct ext3_group_desc *desc; + int inodes_per_buffer; + int inode_offset, i; + int block_group; + int start; + + block_group = (inode->i_ino - 1) / + EXT3_INODES_PER_GROUP(inode->i_sb); + inodes_per_buffer = bh->b_size / + EXT3_INODE_SIZE(inode->i_sb); + inode_offset = ((inode->i_ino - 1) % + EXT3_INODES_PER_GROUP(inode->i_sb)); + start = inode_offset & ~(inodes_per_buffer - 1); + + /* Is the inode bitmap in cache? */ + desc = ext3_get_group_desc(inode->i_sb, + block_group, NULL); + if (!desc) + goto make_io; + + bitmap_bh = sb_getblk(inode->i_sb, + le32_to_cpu(desc->bg_inode_bitmap)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. 
+ */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_buffer; i++) { + if (i == inode_offset) + continue; + if (ext3_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_buffer) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext3_error(inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext3_get_inode_loc(inode, iloc, + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); +} + +void ext3_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT3_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT3_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ +void ext3_get_inode_flags(struct ext3_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| + EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; +} + +struct inode *ext3_iget(struct super_block *sb, unsigned long ino) +{ + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct ext3_inode_info *ei; + struct buffer_head *bh; + struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; + long ret; + int block; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT3_I(inode); + ei->i_block_alloc_info = NULL; + + ret = __ext3_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + bh = iloc.bh; + raw_inode = ext3_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); + 
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; + + ei->i_state_flags = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; +#endif + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && + EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { + /* + * When mke2fs creates big inodes it does not zero out + * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, + * so ignore those first few inodes. + */ + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT3_INODE_SIZE(inode->i_sb)) { + brelse (bh); + ret = -EIO; + goto bad_inode; + } + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. 
*/ + ei->i_extra_isize = sizeof(struct ext3_inode) - + EXT3_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT3_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) + ext3_set_inode_state(inode, EXT3_STATE_XATTR); + } + } else + ei->i_extra_isize = 0; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) { + inode->i_op = &ext3_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + } + } else { + inode->i_op = &ext3_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (iloc.bh); + ext3_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) +{ + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + + ext3_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); +#ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; +#endif + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(ei->i_disksize >> 32); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + unlock_buffer(bh); + err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh); + if (err) + goto out_brelse; + + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; + } + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + if (ei->i_extra_isize) + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); +out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. 
+ * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + if (current->flags & PF_MEMALLOC) + return 0; + + if (ext3_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (wbc->sync_mode != WB_SYNC_ALL) + return 0; + + return ext3_force_commit(inode->i_sb); +} + +/* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Called with inode->sem down. + */ +int ext3_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? 
- but truncate inode update has it) */ + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext3_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3_orphan_add(handle, inode); + EXT3_I(inode)->i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext3_journal_stop(handle); + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + rc = vmtruncate(inode, attr->ia_size); + if (rc) + goto err_out; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (ia_valid & ATTR_MODE) + rc = ext3_acl_chmod(inode); + +err_out: + ext3_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + + +/* + * How many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files + * + * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + +static int ext3_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + indirects + 2; + +#ifdef CONFIG_QUOTA + /* We know that structure was already allocated during dquot_initialize so + * we will be updating only the data blocks + inodes */ + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); +#endif + + return ret; +} + +/* + * The caller must have previously called ext3_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext3_iloc *iloc) +{ + int err = 0; + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext3_do_update_inode() does journal_dirty_metadata */ + err = ext3_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. 
+ */ + +int +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + int err; + + might_sleep(); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, dquot_alloc_space() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext3_dirty_inode(struct inode *inode, int flags) +{ + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG "%s: transactions do not match!\n", + __func__); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. 
+ */ +static int ext3_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext3_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; + else + EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; + ext3_set_aops(inode); + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle); + ext3_std_error(inode->i_sb, err); + + return err; +} diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c new file mode 100644 index 00000000..f4090bd2 --- /dev/null +++ b/fs/ext3/ioctl.c @@ -0,0 +1,352 @@ +/* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + ext3_get_inode_flags(ei); + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + flags = ext3_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + /* Is it quota file? 
Do not allow user to mess with it */ + err = -EPERM; + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext3_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext3_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT3_IOC_SETVERSION: + case EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + } + ext3_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. 
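+ *
+ * The block below parks the caller on ro_wait_queue and only sleeps
+ * while turn_ro_timer is still pending.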
+ */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT3_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT3_IOC_SETRSVSZ: { + int err; + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setrsvsz_out; + } + + if (get_user(rsv_window_size, (int __user *)arg)) { + err = -EFAULT; + goto setrsvsz_out; + } + + if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext3_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); +setrsvsz_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT3_IOC_GROUP_EXTEND: { + ext3_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); + journal_lock_updates(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; +group_extend_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT3_IOC_GROUP_ADD: { + struct ext3_new_group_data input; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } + + err = ext3_group_add(sb, &input); + journal_lock_updates(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; +group_add_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + case FITRIM: { + + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + 
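+ /*
+ * Each EXT3_IOC32_* command is renumbered to its native counterpart
+ * below and forwarded to ext3_ioctl(), with compat_ptr() widening the
+ * 32-bit user pointer. EXT3_IOC_GROUP_ADD needs no renumbering and is
+ * passed through unchanged; anything else is rejected with
+ * -ENOIOCTLCMD.
+ */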
switch (cmd) { + case EXT3_IOC32_GETFLAGS: + cmd = EXT3_IOC_GETFLAGS; + break; + case EXT3_IOC32_SETFLAGS: + cmd = EXT3_IOC_SETFLAGS; + break; + case EXT3_IOC32_GETVERSION: + cmd = EXT3_IOC_GETVERSION; + break; + case EXT3_IOC32_SETVERSION: + cmd = EXT3_IOC_SETVERSION; + break; + case EXT3_IOC32_GROUP_EXTEND: + cmd = EXT3_IOC_GROUP_EXTEND; + break; + case EXT3_IOC32_GETVERSION_OLD: + cmd = EXT3_IOC_GETVERSION_OLD; + break; + case EXT3_IOC32_SETVERSION_OLD: + cmd = EXT3_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC32_WAIT_FOR_READONLY: + cmd = EXT3_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT3_IOC32_GETRSVSZ: + cmd = EXT3_IOC_GETRSVSZ; + break; + case EXT3_IOC32_SETRSVSZ: + cmd = EXT3_IOC_SETRSVSZ; + break; + case EXT3_IOC_GROUP_ADD: + break; + default: + return -ENOIOCTLCMD; + } + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c new file mode 100644 index 00000000..e5a71111 --- /dev/null +++ b/fs/ext3/namei.c @@ -0,0 +1,2550 @@ +/* + * linux/fs/ext3/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "namei.h" +#include "xattr.h" +#include "acl.h" + +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext3_bread(handle, inode, *block, 1, err); + if (bh) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3_I(inode)->i_disksize = inode->i_size; + *err = ext3_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +static inline unsigned dx_get_block (struct dx_entry *entry); +static void dx_set_block (struct dx_entry *entry, unsigned value); +static inline unsigned dx_get_hash (struct dx_entry *entry); +static void dx_set_hash (struct dx_entry *entry, unsigned value); +static unsigned dx_get_count (struct dx_entry *entries); +static unsigned dx_get_limit (struct dx_entry *entries); +static void dx_set_count (struct dx_entry *entries, unsigned value); +static void dx_set_limit (struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +static unsigned dx_node_limit (struct inode *dir); +static struct dx_frame *dx_probe(struct qstr *entry, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release (struct dx_frame *frames); +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); +static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err); +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext3_dir_entry_2 * +ext3_next_entry(struct ext3_dir_entry_2 *p) +{ + return (struct ext3_dir_entry_2 *)((char *)p + + ext3_rec_len_from_disk(p->rec_len)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. 
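+ *
+ * dx_get_block() below therefore returns only the low 24 bits of the
+ * on-disk block field.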
+ */ + +static inline unsigned dx_get_block (struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block (struct dx_entry *entry, unsigned value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash (struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - + EXT3_DIR_REC_LEN(2) - infosize; + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit (struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index (char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk("%s index ", label); + for (i = 0; i < n; i++) + { + printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext3fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT3_DIR_REC_LEN(de->name_len); + names++; + } + de = ext3_next_entry(de); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? 
+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse (bh); + } + if (bcount) + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(struct qstr *entry, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __func__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; + if (entry) + ext3fs_dirhash(entry->name, entry->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext3_warning(dir->i_sb, __func__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext3_warning(dir->i_sb, __func__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + dxtrace (printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: no count or count > limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: limit != node limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + frame++; + frame->bh = NULL; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + if (*err == ERR_BAD_DX_DIR) + ext3_warning(dir->i_sb, __func__, + "Corrupt dir inode %ld, running e2fsck is " + "recommended.", dir->i_ino); + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. 
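+ *
+ * Entries that hash before the requested (start_hash, start_minor_hash)
+ * position, and deleted entries (inode == 0), are not added to the tree.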
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				   struct inode *dir, int block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext3_dir_entry_2 *de, *top;
+	int err, count = 0;
+
+	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+	if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
+		return err;
+
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	top = (struct ext3_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT3_DIR_REC_LEN(0));
+	for (; de < top; de = ext3_next_entry(de)) {
+		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+						+((char *)de - bh->b_data))) {
+			/* On error, skip the f_pos to the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			brelse (bh);
+			return count;
+		}
+		ext3fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if ((err = ext3_htree_store_dirent(dir_file,
+			   hinfo->hash, hinfo->minor_hash, de)) != 0) {
+			brelse(bh);
+			return err;
+		}
+		count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory. We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext3_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	int block, err;
+	int count = 0;
+	int ret;
+	__u32 hashval;
+
+	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+		       start_minor_hash));
+	dir = dir_file->f_path.dentry->d_inode;
+	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT3_SB(dir->i_sb)->s_hash_unsigned;
+		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+
+	/* Add '.' and '..'
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + de = ext3_next_entry(de); + if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) + { + if (de->name_len && de->inode) { + ext3fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u16) ((char *) de - base); + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext3_next_entry(de); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) + { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) + { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext3_update_dx_flag(struct inode *inode) +{ + if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX)) + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; +} + +/* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * + * `len <= EXT3_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. 
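+ *
+ * An entry whose inode field is zero is treated as deleted and never
+ * matches.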
+ */ +static inline int ext3_match (int len, const char * const name, + struct ext3_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct qstr *child, + unsigned long offset, + struct ext3_dir_entry_2 ** res_dir) +{ + struct ext3_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = child->name; + int namelen = child->len; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext3_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext3_rec_len_from_disk(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head *ext3_find_entry(struct inode *dir, + struct qstr *entry, + struct ext3_dir_entry_2 **res_dir) +{ + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + const u8 *name = entry->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = entry->len; + if (namelen > EXT3_NAME_LEN) + return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext3_dx_find_entry(dir, entry, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); + } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ_META, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + ext3_error(sb, __func__, "reading directory #%lu " + "offset %lu", dir->i_ino, block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err) +{ + struct super_block *sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + unsigned long block; + int retval; + + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; + do { + block = dx_get_block(frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); + return bh; + } + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext3_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext3_warning(sb, __func__, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk("%s not found\n", name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext3_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + if (!ext3_valid_inum(dir->i_sb, ino)) { + ext3_error(dir->i_sb, "ext3_lookup", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + inode = ext3_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ESTALE) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext3_get_parent(struct dentry *child) +{ + unsigned long ino; + struct qstr dotdot = {.name = "..", .len = 2}; + struct ext3_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext3_find_entry(child->d_inode, &dotdot, &de); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if 
(!ext3_valid_inum(child->d_inode->i_sb, ino)) { + ext3_error(child->d_inode->i_sb, "ext3_get_parent", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK, +}; + +static inline void ext3_set_de_type(struct super_block *sb, + struct ext3_dir_entry_2 *de, + umode_t mode) { + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext3_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = + ext3_rec_len_to_disk(rec_len); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext3_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. + */ +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; + unsigned rec_len = 0; + + prev = to = de; + while ((char *)de < base + blocksize) { + next = ext3_next_entry(de); + if (de->inode && de->name_len) { + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext3_rec_len_to_disk(rec_len); + prev = to; + to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. 
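+ * On failure *bh is released and set to NULL, *error is set to a
+ * negative errno and NULL is returned.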
+ */ +static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + u32 newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext3_dir_entry_2 *de = NULL, *de2; + int err = 0, i; + + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map ((struct ext3_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map (map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); + de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block (frame, hash2 + continued, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; + err = ext3_journal_dirty_metadata (handle, frame->bh); + if (err) + goto journal_error; + brelse (bh2); + dxtrace(dx_show_index ("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext3_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. 
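+ * On success the directory block is journalled, the parent's mtime,
+ * ctime and i_version are updated, and 0 is returned.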
+ */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext3_dir_entry_2 *de, + struct buffer_head * bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext3_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext3_check_dir_entry("ext3_add_entry", dir, de, + bh, offset)) { + brelse (bh); + return -EIO; + } + if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; + } + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = ext3_rec_len_from_disk(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext3_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) { + ext3_std_error(dir->i_sb, err); + brelse(bh); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = ext3_rec_len_from_disk(de->rec_len); + if (de->inode) { + struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext3_rec_len_to_disk(rlen - nlen); + de->rec_len = ext3_rec_len_to_disk(nlen); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + dir->i_version++; + ext3_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + ext3_std_error(dir->i_sb, err); + brelse(bh); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. 
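+ * Block 0 is rewritten as the dx_root (keeping only the "." and ".."
+ * entries), the remaining entries are copied into a newly appended
+ * leaf block, and do_split() then allocates the third block.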
+ */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext3_journal_get_write_access(handle, bh); + if (retval) { + ext3_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + + ext3_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext3_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext3_next_entry(de)) < top) + de = de2; + de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext3_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; + } + dx_release(frames); + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * ext3_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
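+ *
+ * If the directory is indexed but ext3_dx_add_entry() fails with
+ * ERR_BAD_DX_DIR, the EXT3_INDEX_FL flag is cleared and the plain
+ * linear scan below is used as a fallback.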
+ */ +static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + u32 block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; + dx_fallback++; + ext3_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + return retval; + + if (blocks == 1 && !dx_fallback && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext3_rec_len_to_disk(blocksize); + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) { + bh = NULL; + goto cleanup; + } + + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? 
*/ + if (dx_get_count(entries) == dx_get_limit(entries)) { + u32 newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __func__, + "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block (frames + 0, hash2, newblock); + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + bh = NULL; + goto cleanup; + +journal_error: + ext3_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +{ + struct ext3_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + int err; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + if (pde) + pde->rec_len = ext3_rec_len_to_disk( + 
ext3_rec_len_from_disk(pde->rec_len) + + ext3_rec_len_from_disk(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } + return 0; + } + i += ext3_rec_len_from_disk(de->rec_len); + pde = de; + de = ext3_next_entry(de); + } + return -ENOENT; +} + +static int ext3_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + drop_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode * inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mknod (struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT3_FS_XATTR + inode->i_op = &ext3_special_inode_operations; +#endif + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block = NULL; + struct ext3_dir_entry_2 * de; + int err, retries = 0; + + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = 
&ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) + goto out_clear_inode; + + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + + de = (struct ext3_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len)); + strcpy (de->name, "."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext3_next_entry(de); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize - + EXT3_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + + if (err) { +out_clear_inode: + inode->i_nlink = 0; + unlock_new_inode(inode); + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + inc_nlink(dir); + ext3_update_dx_flag(dir); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out_stop: + brelse(dir_block); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir (struct inode * inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || + !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + if (err) + ext3_error(inode->i_sb, __func__, + "error %d reading directory #%lu offset 0", + err, inode->i_ino); + else + ext3_warning(inode->i_sb, __func__, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de1 = ext3_next_entry(de); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' 
or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = ext3_rec_len_from_disk(de->rec_len) + + ext3_rec_len_from_disk(de1->rec_len); + de = ext3_next_entry(de1); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + err = 0; + brelse (bh); + bh = ext3_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + if (err) + ext3_error(sb, __func__, + "error %d reading directory" + " #%lu offset %lu", + err, inode->i_ino, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) { + de = (struct ext3_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse (bh); + return 0; + } + offset += ext3_rec_len_from_disk(de->rec_len); + de = ext3_next_entry(de); + } + brelse (bh); + return 1; +} + +/* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ +int ext3_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; + + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on s_orphan_lock), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. 
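+ *
+ * As an illustration (inode numbers made up): the superblock's
+ * s_last_orphan is the head of the chain and each orphan's
+ * NEXT_ORPHAN slot points at the next one, so adding inode 12 to a
+ * list that already held inode 7 leaves
+ *
+ *	s_last_orphan = 12, NEXT_ORPHAN(12) = 7, NEXT_ORPHAN(7) = 0
+ *
+ * and ext3_orphan_cleanup() just walks the chain until it hits zero.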
*/ + if (!err) + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext3_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + unsigned long ino_next; + struct ext3_iloc iloc; + int err = 0; + + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %lu\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); + err = ext3_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext3_std_error(inode->i_sb, err); +out: + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext3_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3_warning (inode->i_sb, "ext3_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * 
recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + drop_nlink(dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +end_rmdir: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_unlink(struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) +{ + handle_t *handle; + struct inode * inode; + int l, err, retries = 0; + int credits; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + dquot_initialize(dir); + + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } +retry: + handle = ext3_journal_start(dir, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > EXT3_N_BLOCKS * 4) { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + /* + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. 
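+ *
+ * So the sequence below is: drop the link count and put the inode on
+ * the orphan list, stop the handle, write the symlink body with
+ * __page_symlink(), then start a fresh handle, take the inode back
+ * off the orphan list and finally add the directory entry. A crash
+ * between the two transactions leaves an orphaned, zero-link inode
+ * that orphan cleanup will simply delete.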
+ */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext3_orphan_del(handle, inode); + if (err) { + ext3_journal_stop(handle); + drop_nlink(inode); + goto err_drop_inode; + } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } + EXT3_I(inode)->i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); +out_stop: + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; +} + +static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME_SEC; + inc_nlink(inode); + ihold(inode); + + err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer) \ + (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode) + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir,struct dentry *new_dentry) +{ + handle_t *handle; + struct inode * old_inode, * new_inode; + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval, flush_file = 0; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + dquot_initialize(new_dentry->d_inode); + handle = ext3_journal_start(old_dir, 2 * + EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir (new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= EXT3_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext3_add_entry (handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT3_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, new_dir); + BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext3_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. 
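+ * Either the cached old_de no longer matches the dentry, or deleting
+ * it returned -ENOENT; in both cases we redo the lookup below and
+ * delete whatever entry ext3_find_entry() hands back now.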
*/ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + + old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, + &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext3_warning(old_dir->i_sb, "ext3_rename", + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + drop_nlink(new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } + drop_nlink(old_dir); + if (new_inode) { + drop_nlink(new_inode); + } else { + inc_nlink(new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } + ext3_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext3_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext3_orphan_add(handle, new_inode); + if (ext3_should_writeback_data(new_inode)) + flush_file = 1; + } + retval = 0; + +end_rename: + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); + ext3_journal_stop(handle); + if (retval == 0 && flush_file) + filemap_flush(old_inode->i_mapping); + return retval; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext3_dir_inode_operations = { + .create = ext3_create, + .lookup = ext3_lookup, + .link = ext3_link, + .unlink = ext3_unlink, + .symlink = ext3_symlink, + .mkdir = ext3_mkdir, + .rmdir = ext3_rmdir, + .mknod = ext3_mknod, + .rename = ext3_rename, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext3_check_acl, +}; + +const struct inode_operations ext3_special_inode_operations = { + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext3_check_acl, +}; diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h new file mode 100644 index 00000000..f2ce2b00 --- /dev/null +++ b/fs/ext3/namei.h @@ -0,0 +1,8 @@ +/* linux/fs/ext3/namei.h + * + * Copyright (C) 2005 Simtec Electronics + * Ben Dooks + * +*/ + +extern struct dentry *ext3_get_parent(struct dentry *child); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c new file mode 100644 index 00000000..7916e4ce --- /dev/null +++ b/fs/ext3/resize.c @@ -0,0 +1,1120 @@ +/* + * linux/fs/ext3/resize.c + * + * Support for resizing an ext3 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. 
+ */ + + +#define EXT3FS_DEBUG + +#include + +#include +#include + + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); + ext3_fsblk_t end = start + input->blocks_count; + unsigned group = input->group; + ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext3_bg_has_super(sb, group) ? + (1 + ext3_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext3_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext3_grpblk_t free_blocks_count; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext3_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + if (group != sbi->s_groups_count) + ext3_warning(sb, __func__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if ((start - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)) + ext3_warning(sb, __func__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext3_warning(sb, __func__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext3_warning(sb, __func__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext3_warning(sb, __func__, + "Cannot read last block ("E3FSBLK")", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext3_warning(sb, __func__, + "Block bitmap not in group (block %u)", + input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext3_warning(sb, __func__, + "Inode bitmap not in group (block %u)", + input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext3_warning(sb, __func__, + "Inode table not in group (blocks %u-"E3FSBLK")", + input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext3_warning(sb, __func__, + "Block bitmap same as inode bitmap (%u)", + input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext3_warning(sb, __func__, + "Block bitmap (%u) in inode table (%u-"E3FSBLK")", + input->block_bitmap, input->inode_table, itend-1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext3_warning(sb, __func__, + "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", + input->inode_bitmap, input->inode_table, itend-1); + else if (inside(input->block_bitmap, start, metaend)) + ext3_warning(sb, __func__, + "Block bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->block_bitmap, start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext3_warning(sb, __func__, + "Inode bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->inode_bitmap, start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext3_warning(sb, __func__, + "Inode table (%u-"E3FSBLK") overlaps" + "GDT table ("E3FSBLK"-"E3FSBLK")", + 
input->inode_table, itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext3_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (!bh) + return ERR_PTR(-EIO); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext3_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* + * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. + */ +static int extend_or_restart_transaction(handle_t *handle, int thresh, + struct buffer_head *bh) +{ + int err; + + if (handle->h_buffer_credits >= thresh) + return 0; + + err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA); + if (err) + return err; + err = ext3_journal_get_write_access(handle, bh); + if (err) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group); + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? 
+ le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext3_fsblk_t block; + ext3_grpblk_t bit; + int i; + int err = 0, err2; + + /* This transaction may be extended/restarted along the way */ + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext3_bg_has_super(sb, input->group)) { + ext3_debug("mark backup superblock %#04lx (+0)\n", start); + ext3_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("update backup group %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto exit_bh; + } + if ((err = ext3_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(gdb); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(gdb); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, + input->block_bitmap - start); + ext3_set_bit(input->block_bitmap - start, bh->b_data); + ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext3_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext3_debug("clear inode block %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } + brelse(it); + ext3_set_bit(bit, bh->b_data); + } + + err = extend_or_restart_transaction(handle, 2, bh); + if (err) + goto exit_bh; + + mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; + brelse(bh); + + /* Mark unused entries in inode bitmap used */ + ext3_debug("clear inode bitmap %#04x (+%ld)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + 
goto exit_journal; + } + + mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + err = ext3_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext3_fsblk_t blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __func__, + "reserved GDT "E3FSBLK + " missing grp %d ("E3FSBLK")", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; + } + if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. 
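+ *
+ * The block being activated is simply the next slot after the GDT
+ * blocks already in use: for example, with 4 KB blocks (128
+ * descriptors per block), adding group 256 gives gdb_num = 2, and
+ * with the superblock in block 0 and GDT blocks in blocks 1 and 2
+ * the reserved block brought into use is block 3.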
+ */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext3_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT3_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { + ext3_warning(sb, __func__, + "won't resize using backup superblock at %llu", + (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext3_warning(sb, __func__, + "new group %u GDT block "E3FSBLK" not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext3_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext3_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext3_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext3_warning (sb, __func__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). 
+ */ + data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; + brelse(dind); + dind = NULL; + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; + memset((*primary)->b_data, 0, sb->s_blocksize); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; + + o_group_desc = EXT3_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT3_SB(sb)->s_group_desc = n_group_desc; + EXT3_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; + + return 0; + +exit_group_desc: + kfree(n_group_desc); +exit_inode: + //ext3_journal_release_buffer(handle, iloc.bh); + brelse(iloc.bh); +exit_dindj: + //ext3_journal_release_buffer(handle, dind); +exit_primary: + //ext3_journal_release_buffer(handle, *primary); +exit_sbh: + //ext3_journal_release_buffer(handle, *primary); +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext3_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. 
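+ *
+ * Concretely, for the reserved primary GDT block at block P the code
+ * below records the new group's copy of it, which lies at the same
+ * relative offset inside the new group, i.e. at block
+ * input->group * EXT3_BLOCKS_PER_GROUP(sb) + P.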
+ */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext3_iloc iloc; + ext3_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % + EXT3_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext3_warning(sb, __func__, + "reserved block "E3FSBLK + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext3_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext3_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext3_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext3 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. 
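+ *
+ * Each backup is written at the same offset into its group as the
+ * primary copy has in group 0, i.e. at block
+ * group * EXT3_BLOCKS_PER_GROUP(sb) + blk_off; with 4 KB blocks
+ * (32768 blocks per group), for instance, group 3's superblock
+ * backup lands in block 98304.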
+ */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && + (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext3_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + break; + } + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + if (err) + break; + } + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext3_warning(sb, __func__, + "can't update backup for group %d (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? 
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext3_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext3_warning(sb, __func__, + "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < + le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __func__, "blocks_count overflow\n"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext3_warning(sb, __func__, "inodes_count overflow\n"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT3_HAS_COMPAT_FEATURE(sb, + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext3_warning(sb, __func__, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext3_iget(sb, EXT3_RESIZE_INO); + if (IS_ERR(inode)) { + ext3_warning(sb, __func__, + "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext3_journal_start_sb(sb, + ext3_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + ext3_warning(sb, __func__, + "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext3_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * We do not lock all allocations via s_resize_lock + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. 
+ * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + /* Update group descriptor block for new group */ + gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; + + gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); + gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); + gdp->bg_inode_table = cpu_to_le32(input->inode_table); + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + le32_add_cpu(&es->s_blocks_count, input->blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold s_resize_lock + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold s_resize_lock over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; + + /* Update the reserved block counts only once the new group is + * active. */ + le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT3_INODES_PER_GROUP(sb)); + + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext3_group_add */ + +/* Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext3_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. 
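+ *
+ * In practice that means either
+ *
+ *	mount -o remount,resize=<new block count> <mountpoint>
+ *
+ * or the EXT3_IOC_GROUP_EXTEND ioctl that online resizers such as
+ * resize2fs issue; in both cases only the tail of the last group is
+ * filled in, and whole new groups go through ext3_group_add().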
+ */ +int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count) +{ + ext3_fsblk_t o_blocks_count; + ext3_grpblk_t last; + ext3_grpblk_t add; + struct buffer_head * bh; + handle_t *handle; + int err; + unsigned long freed_blocks; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking the s_resize_lock below. */ + o_blocks_count = le32_to_cpu(es->s_blocks_count); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK + " up to "E3FSBLK" blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3-fs: filesystem on %s:" + " too large to resize to "E3FSBLK" blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext3_warning(sb, __func__, + "CONFIG_LBDAF not enabled\n"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __func__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + + if (last == 0) { + ext3_warning(sb, __func__, + "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT3_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext3_warning(sb, __func__, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext3_warning(sb, __func__, + "will only finish group ("E3FSBLK + " blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add -1); + if (!bh) { + ext3_warning(sb, __func__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext3_free_blocks(). 
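+ * That is one journal credit for each of those three buffers, hence
+ * the 3 passed to ext3_journal_start_sb() below.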
+ */ + handle = ext3_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext3_warning(sb, __func__, "error %d on journal start",err); + goto exit_put; + } + + mutex_lock(&EXT3_SB(sb)->s_resize_lock); + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __func__, + "multiple resizers run on filesystem!"); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh))) { + ext3_warning(sb, __func__, + "error %d on journal write access", err); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); + goto exit_put; + } + es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } + ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); + ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); + if ((err = ext3_journal_stop(handle))) + goto exit_put; + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", + le32_to_cpu(es->s_blocks_count)); + update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); +exit_put: + return err; +} /* ext3_group_extend */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c new file mode 100644 index 00000000..aad153ef --- /dev/null +++ b/fs/ext3/super.c @@ -0,0 +1,3084 @@ +/* + * linux/fs/ext3/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" +#include "namei.h" + +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + +static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); +static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + unsigned int); +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, + int sync); +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); +static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); +static int ext3_sync_fs(struct super_block *sb, int wait); +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]); +static int ext3_remount (struct super_block * sb, int * flags, char * data); +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext3_unfreeze(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); + +/* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ + journal = EXT3_SB(sb)->s_journal; + if (is_journal_aborted(journal)) { + ext3_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + + return journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext3_journal_stop(const char *where, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext3_std_error(sb, where, err); + return err; +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext3_decode_error(NULL, err, nbuf); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); + + journal_abort_handle(handle); +} + +void ext3_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext3, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + */ + +static void ext3_handle_error(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt (sb, ERRORS_CONT)) { + journal_t *journal = EXT3_SB(sb)->s_journal; + + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + if (journal) + journal_abort(journal, -EIO); + } + if (test_opt (sb, ERRORS_RO)) { + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } + ext3_commit_super(sb, es, 1); + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs (%s): panic forced after error\n", + sb->s_id); +} + +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + ext3_handle_error(sb); +} + +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext3_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext3_std_error (struct super_block * sb, const char * function, + int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext3_decode_error(sb, errno, nbuf); + ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); + + ext3_handle_error(sb); +} + +/* + * ext3_abort is a much stronger failure handler than ext3_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. 
+ * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs: panic from previous error\n"); + + if (sb->s_flags & MS_RDONLY) + return; + + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + sb->s_flags |= MS_RDONLY; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + if (EXT3_SB(sb)->s_journal) + journal_abort(EXT3_SB(sb)->s_journal, -EIO); +} + +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); +} + +void ext3_update_dynamic_rev(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) + return; + + ext3_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT3_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. 
+ */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext3_msg(sb, "error: failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + + return NULL; +} + +/* + * Release the journal device + */ +static int ext3_blkdev_put(struct block_device *bdev) +{ + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext3_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) +{ + struct list_head *l; + + ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + ext3_msg(sb, KERN_ERR, " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext3_put_super (struct super_block * sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int i, err; + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext3_xattr_put_super(sb); + err = journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext3_abort(sb, __func__, "Couldn't clean up the journal"); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. 
+ */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext3_blkdev_remove(sbi); + } + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache *ext3_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext3_alloc_inode(struct super_block *sb) +{ + struct ext3_inode_info *ei; + + ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); + return &ei->vfs_inode; +} + +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + +static void ext3_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT3_I(inode)->i_orphan))) { + printk("EXT3 Inode %p: orphan list check failed!\n", + EXT3_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT3_I(inode), sizeof(struct ext3_inode_info), + false); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext3_i_callback); +} + +static void init_once(void *foo) +{ + struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT3_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", + sizeof(struct ext3_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext3_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext3_inode_cachep); +} + +static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + unsigned long def_mount_opts; + + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%lu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & 
EXT3_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT3_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT3_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + int def_errors = le16_to_cpu(es->s_errors); + + if (def_errors == EXT3_ERRORS_PANIC || + def_errors == EXT3_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); +#ifdef CONFIG_EXT3_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT3_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + if (sbi->s_commit_interval) { + seq_printf(seq, ",commit=%u", + (unsigned) (sbi->s_commit_interval / HZ)); + } + + /* + * Always display barrier state so it's clear what the status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + + ext3_show_quota_options(seq, sb); + + return 0; +} + + +static struct inode *ext3_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext3_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext3_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext3_nfs_get_inode); +} + +static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext3_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. 
+ */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") +#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext3_write_dquot(struct dquot *dquot); +static int ext3_acquire_dquot(struct dquot *dquot); +static int ext3_release_dquot(struct dquot *dquot); +static int ext3_mark_dquot_dirty(struct dquot *dquot); +static int ext3_write_info(struct super_block *sb, int type); +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext3_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static const struct dquot_operations ext3_quota_operations = { + .write_dquot = ext3_write_dquot, + .acquire_dquot = ext3_acquire_dquot, + .release_dquot = ext3_release_dquot, + .mark_dirty = ext3_mark_dquot_dirty, + .write_info = ext3_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops ext3_qctl_operations = { + .quota_on = ext3_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif + +static const struct super_operations ext3_sops = { + .alloc_inode = ext3_alloc_inode, + .destroy_inode = ext3_destroy_inode, + .write_inode = ext3_write_inode, + .dirty_inode = ext3_dirty_inode, + .evict_inode = ext3_evict_inode, + .put_super = ext3_put_super, + .sync_fs = ext3_sync_fs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .show_options = ext3_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext3_quota_read, + .quota_write = ext3_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct export_operations ext3_export_ops = { + .fh_to_dentry = ext3_fh_to_dentry, + .fh_to_parent = ext3_fh_to_parent, + .get_parent = ext3_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, 
"resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "nocheck"}, + {Opt_nocheck, "check=none"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_resize, "resize"}, + {Opt_err, NULL}, +}; + +static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) +{ + ext3_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + /*todo: use simple_strtoll with >32bit ext3 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + ext3_msg(sb, "error: invalid sb specification: %s", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +static int parse_options (char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext3_fsblk_t *n_blocks_count, 
int is_remount) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qfmt; +#endif + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; +#ifdef CONFIG_EXT3_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext3_msg(sb, KERN_INFO, + "(no)user_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext3_msg(sb, KERN_INFO, + "(no)acl options not supported"); + break; +#endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. 
*/ + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + break; + case Opt_journal_inum: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *inum = option; + break; + case Opt_journal_dev: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_data_journal: + data_opt = EXT3_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT3_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT3_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if (test_opt(sb, DATA_FLAGS) == data_opt) + break; + ext3_msg(sb, KERN_ERR, + "error: cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.", + data_mode_string(test_opt(sb, + DATA_FLAGS)), + data_mode_string(data_opt)); + return 0; + } else { + clear_opt(sbi->s_mount_opt, DATA_FLAGS); + sbi->s_mount_opt |= data_opt; + } + break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + if (!set_qf_name(sb, USRQUOTA, &args[0])) + return 0; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) + return 0; + break; + case Opt_offusrjquota: + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; + case Opt_offgrpjquota: + if (!clear_qf_name(sb, GRPQUOTA)) + return 0; + break; + case Opt_jqfmt_vfsold: + qfmt = QFMT_VFS_OLD; + goto set_qf_format; + case Opt_jqfmt_vfsv0: + qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; +set_qf_format: + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != qfmt) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "journaled quota options when " + "quota turned on."); + return 0; + } + sbi->s_jquota_fmt = qfmt; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_loaded(sb)) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "quota options when quota turned on."); + return 0; + } + clear_opt(sbi->s_mount_opt, QUOTA); + clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext3_msg(sb, KERN_ERR, + "error: quota options not supported."); + break; + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + ext3_msg(sb, KERN_ERR, + "error: journaled quota options not " + "supported."); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + set_opt(sbi->s_mount_opt, ABORT); + break; + case Opt_nobarrier: + 
clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_barrier: + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + ext3_msg(sb, KERN_ERR, + "error: resize option only available " + "for remount"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); + break; + case Opt_bh: + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); + break; + default: + ext3_msg(sb, KERN_ERR, + "error: unrecognized mount option \"%s\" " + "or missing value", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi->s_mount_opt, USRQUOTA); + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi->s_mount_opt, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext3_msg(sb, KERN_ERR, "error: old and new quota " + "format mixing."); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "not specified."); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "specified with no journaling " + "enabled."); + return 0; + } + } +#endif + return 1; +} + +static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, + int read_only) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { + ext3_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3_VALID_FS)) + ext3_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT3_ERROR_FS)) + ext3_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + le16_to_cpu(es->s_max_mnt_count)) + ext3_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext3_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); +#if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! 
:) */ + es->s_state &= cpu_to_le16(~EXT3_VALID_FS); +#endif + if (!le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + + ext3_commit_super(sb, es, 1); + if (test_opt(sb, DEBUG)) + ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", + sb->s_blocksize, + sbi->s_groups_count, + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { + char b[BDEVNAME_SIZE]; + ext3_msg(sb, KERN_INFO, "using external journal on %s", + bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); + } else { + ext3_msg(sb, KERN_INFO, "using internal journal"); + } + cleancache_init_fs(sb); + return res; +} + +/* Called at mount-time, super-block is locked */ +static int ext3_check_descriptors(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int i; + + ext3_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); + ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); + ext3_fsblk_t last_block; + + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > + last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + } + + sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); + return 1; +} + + +/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). 
The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, skipping orphan cleanup."); + return; + } + + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT3_SB(sb)->s_qf_names[i]) { + int ret = ext3_quota_on_mount(sb, i); + if (ret < 0) + ext3_msg(sb, KERN_ERR, + "error: cannot turn on journaled " + "quota: %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + printk(KERN_DEBUG + "%s: truncating inode %lu to %Ld bytes\n", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG + "%s: deleting unreferenced inode %lu\n", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. 
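[Illustrative aside, not part of the patch] The arithmetic in ext3_max_size() below can be reproduced in a few lines of user-space C to make the limits concrete. This is only a sketch: EXT3_NDIR_BLOCKS is taken as 12 direct blocks and the kernel's final MAX_LFS_FILESIZE clamp is omitted. It shows that with 1 KiB blocks the indirect-block tree is the limiting factor (about 16 GiB), while with 4 KiB blocks the 2^32 sector limit in i_blocks dominates (about 2 TiB).

#include <stdio.h>

#define NDIR_BLOCKS 12                       /* direct blocks in the inode */

static long long max_size(int bits)          /* bits = log2(block size) */
{
        long long res = NDIR_BLOCKS;
        long long upper = (1LL << 32) - 1;   /* i_blocks: 2^32-1 512-byte sectors */
        long long meta;

        upper >>= (bits - 9);                /* sectors -> filesystem blocks */
        meta  = 1;                                                   /* indirect */
        meta += 1 + (1LL << (bits - 2));                             /* double indirect */
        meta += 1 + (1LL << (bits - 2)) + (1LL << (2 * (bits - 2))); /* triple indirect */
        upper = (upper - meta) << bits;      /* data bytes allowed by the sector limit */

        res += 1LL << (bits - 2);            /* blocks reachable via single indirection */
        res += 1LL << (2 * (bits - 2));      /* ... double indirection */
        res += 1LL << (3 * (bits - 2));      /* ... triple indirection */
        res <<= bits;                        /* blocks -> bytes */
        return res < upper ? res : upper;    /* whichever limit is hit first */
}

int main(void)
{
        int bits;

        for (bits = 10; bits <= 12; bits++)  /* 1 KiB, 2 KiB, 4 KiB blocks */
                printf("block size %d: max file size %lld bytes\n",
                       1 << bits, max_size(bits));
        return 0;
}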
+ */ +static loff_t ext3_max_size(int bits) +{ + loff_t res = EXT3_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + + /* This is calculated to be the largest file size for a + * dense, file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32 -1 + * __u32 i_blocks representing the total number of + * 512 bytes blocks of the file + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext3_fsblk_t descriptor_loc(struct super_block *sb, + ext3_fsblk_t logic_sb_block, + int nr) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext3_bg_has_super(sb, bg)) + has_super = 1; + return (has_super + ext3_group_first_block_no(sb, bg)); +} + + +static int ext3_fill_super (struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext3_super_block *es = NULL; + struct ext3_sb_info *sbi; + ext3_fsblk_t block; + ext3_fsblk_t sb_block = get_sb_block(&data, sb); + ext3_fsblk_t logic_sb_block; + unsigned long offset = 0; + unsigned int journal_inum = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + int ret = -EINVAL; + __le32 features; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; + sbi->s_sb_block = sb_block; + + blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { + ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); + goto out_fail; + } + + /* + * The ext3 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. 
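[Illustrative aside, not part of the patch] A quick numeric check of the offset calculation that follows, assuming the default superblock location sb_block = 1 and a 4 KiB filesystem block size: the superblock always starts at byte 1024 * sb_block on disk (EXT3_MIN_BLOCK_SIZE is 1024), so here it lands in logical block 0 at byte offset 1024 within that block.

#include <stdio.h>

int main(void)
{
        unsigned long sb_block = 1;                 /* default superblock location */
        unsigned long blocksize = 4096;             /* example: 4 KiB blocks */
        unsigned long logic_sb_block = (sb_block * 1024) / blocksize;
        unsigned long offset = (sb_block * 1024) % blocksize;

        /* prints "superblock in block 0 at offset 1024" */
        printf("superblock in block %lu at offset %lu\n", logic_sb_block, offset);
        return 0;
}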
+ */ + if (blocksize != EXT3_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3 macro-instructions depend on its value + */ + es = (struct ext3_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3_SUPER_MAGIC) + goto cantfind_ext3; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT3_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT3_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT3_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT3_FS_XATTR + if (def_mount_opts & EXT3_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (def_mount_opts & EXT3_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && + (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext3_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. 
+ */ + features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); + if (features) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "optional features (%x)", le32_to_cpu(features)); + goto failed_mount; + } + features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); + if (!(sb->s_flags & MS_RDONLY) && features) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount RDWR because of unsupported " + "optional features (%x)", le32_to_cpu(features)); + goto failed_mount; + } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT3_MIN_BLOCK_SIZE || + blocksize > EXT3_MAX_BLOCK_SIZE) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "filesystem blocksize %d", blocksize); + goto failed_mount; + } + + hblock = bdev_logical_block_size(sb->s_bdev); + if (sb->s_blocksize != blocksize) { + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. + */ + if (blocksize < hblock) { + ext3_msg(sb, KERN_ERR, + "error: fsblocksize %d too small for " + "hardware sectorsize %d", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + if (!sb_set_blocksize(sb, blocksize)) { + ext3_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); + goto out_fail; + } + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if (!bh) { + ext3_msg(sb, KERN_ERR, + "error: can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext3_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { + ext3_msg(sb, KERN_ERR, + "error: magic mismatch"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext3_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + ext3_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %u (unsupported)", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext3; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = 
le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + + if (sbi->s_blocks_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { + ext3_msg(sb, KERN_ERR, + "error: filesystem is too large to mount safely"); + if (sizeof(sector_t) < 8) + ext3_msg(sb, KERN_ERR, + "error: CONFIG_LBDAF not enabled"); + ret = err; + goto failed_mount; + } + + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT3_BLOCKS_PER_GROUP(sb)) + 1; + db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext3_msg(sb, KERN_ERR, + "error: not enough memory"); + ret = -ENOMEM; + goto failed_mount; + } + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + ext3_msg(sb, KERN_ERR, + "error: can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3_check_descriptors (sb)) { + ext3_msg(sb, KERN_ERR, + "error: group descriptors corrupted"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. */ + sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext3_rsv_window_add(sb, &sbi->s_rsv_window_head); + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3_sops; + sb->s_export_op = &ext3_export_ops; + sb->s_xattr = ext3_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext3_qctl_operations; + sb->dq_op = &ext3_quota_operations; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! 
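[Illustrative aside, not part of the patch] The branch taken immediately below decides where the journal comes from. A minimal user-space sketch of that decision, with made-up names, reads as follows: prefer an existing journal unless "noload" was given, fall back to creating one when a journal inode number was passed as a mount option, and otherwise fail because the filesystem is probably plain ext2.

#include <stdio.h>

enum journal_source { JOURNAL_NONE, JOURNAL_LOAD, JOURNAL_CREATE };

/* Hypothetical helper mirroring the if/else chain below. */
static enum journal_source pick_journal(int has_journal_feature, int noload,
                                        unsigned int journal_inum_opt)
{
        if (!noload && has_journal_feature)
                return JOURNAL_LOAD;     /* replay the existing journal */
        if (journal_inum_opt)
                return JOURNAL_CREATE;   /* journal=<inum> given at mount time */
        return JOURNAL_NONE;             /* no journal: refuse the ext3 mount */
}

int main(void)
{
        /* e.g. an ext2 image mounted with journal=8: prints 2 (JOURNAL_CREATE) */
        printf("%d\n", pick_journal(0, 0, 8));
        return 0;
}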
+ */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) + goto failed_mount2; + } else { + if (!silent) + ext3_msg(sb, KERN_ERR, + "error: no journal found. " + "mounting ext3 over ext2?"); + goto failed_mount2; + } + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + } + if (err) { + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); + ret = err; + goto failed_mount3; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3_MOUNT_ORDERED_DATA: + case EXT3_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + ext3_msg(sb, KERN_ERR, + "error: journal does not support " + "requested data journaling mode"); + goto failed_mount3; + } + default: + break; + } + + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = ext3_iget(sb, EXT3_ROOT_INO); + if (IS_ERR(root)) { + ext3_msg(sb, KERN_ERR, "error: get root inode failed"); + ret = PTR_ERR(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); + iput(root); + ret = -ENOMEM; + goto failed_mount3; + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + ext3_orphan_cleanup(sb, es); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + if (needs_recovery) + ext3_msg(sb, KERN_INFO, "recovery complete"); + ext3_mark_recovery_complete(sb, es); + ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + return 0; + +cantfind_ext3: + if (!silent) + ext3_msg(sb, KERN_INFO, + "error: can't find ext3 filesystem on dev %s.", + sb->s_id); + goto failed_mount; + +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + journal_destroy(sbi->s_journal); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext3_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); + return ret; +} + +/* + * Setup any per-fs journal parameters now. 
We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_commit_interval) + journal->j_commit_interval = sbi->s_commit_interval; + /* We could also set up an ext3-specific default for the commit + * interval here, but for now we'll just fall back to the jbd + * default. */ + + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; + spin_unlock(&journal->j_state_lock); +} + +static journal_t *ext3_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = ext3_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext3_msg(sb, KERN_ERR, "error: no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) { + ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext3_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext3_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head * bh; + journal_t *journal; + ext3_fsblk_t start; + ext3_fsblk_t len; + int hblock, blocksize; + ext3_fsblk_t sb_block; + unsigned long offset; + struct ext3_super_block * es; + struct block_device *bdev; + + bdev = ext3_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext3_msg(sb, KERN_ERR, + "error: blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; + offset = EXT3_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext3_super_block *) (bh->b_data + offset); + if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext3_msg(sb, KERN_ERR, "error: external journal has " + "bad superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + 
ext3_msg(sb, KERN_ERR, + "error: failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext3_msg(sb, KERN_ERR, + "error: external journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; + ext3_init_journal_params(sb, journal); + return journal; +out_journal: + journal_destroy(journal); +out_bdev: + ext3_blkdev_put(bdev); + return NULL; +} + +static int ext3_load_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + ext3_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_INFO, + "recovery required on readonly filesystem"); + if (really_read_only) { + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext3_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " + "and inode journals"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JFS_BARRIER)) + printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + ext3_msg(sb, KERN_ERR, "error updating journal"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) + err = journal_wipe(journal, !really_read_only); + if (!err) + err = journal_load(journal); + + if (err) { + ext3_msg(sb, KERN_ERR, "error loading journal"); + journal_destroy(journal); + return err; + } + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. 
*/ + ext3_commit_super(sb, es, 1); + } + + return 0; +} + +static int ext3_create_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned int journal_inum) +{ + journal_t *journal; + int err; + + if (sb->s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_ERR, + "error: readonly filesystem when trying to " + "create journal"); + return -EROFS; + } + + journal = ext3_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + + ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", + journal_inum); + + err = journal_create(journal); + if (err) { + ext3_msg(sb, KERN_ERR, "error creating journal"); + journal_destroy(journal); + return -EIO; + } + + EXT3_SB(sb)->s_journal = journal; + + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + + return 0; +} + +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, + int sync) +{ + struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; + + if (!sbh) + return error; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext3_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + ext3_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. 
+ */ +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + journal_lock_updates(journal); + if (journal_flush(journal) < 0) + goto out; + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, es, 1); + } + +out: + journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext3_clear_journal_err(struct super_block *sb, + struct ext3_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3_error() or ext3_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext3_decode_error(sb, j_errno, nbuf); + ext3_warning(sb, __func__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3_warning(sb, __func__, "Marking fs in need of " + "filesystem check."); + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + + journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext3_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3_SB(sb)->s_journal; + ret = ext3_journal_force_commit(journal); + return ret; +} + +static int ext3_sync_fs(struct super_block *sb, int wait) +{ + tid_t target; + + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } + return 0; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ +static int ext3_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (!(sb->s_flags & MS_RDONLY)) { + journal = EXT3_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + error = journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; + } + return 0; + +out: + journal_unlock_updates(journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext3_unfreeze(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) { + lock_super(sb); + /* Reser the needs_recovery flag before the fs is unlocked. 
*/ + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + unlock_super(sb); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } + return 0; +} + +static int ext3_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); + ext3_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext3_mount_options old_opts; + int enable_quota = 0; + int err; +#ifdef CONFIG_QUOTA + int i; +#endif + + /* Store the original options */ + lock_super(sb); + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (test_opt(sb, ABORT)) + ext3_abort(sb, __func__, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + ext3_init_journal_params(sb, sbi->s_journal); + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > le32_to_cpu(es->s_blocks_count)) { + if (test_opt(sb, ABORT)) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && + (sbi->s_mount_state & EXT3_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3_mark_recovery_complete(sb, es); + } else { + __le32 ret; + if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3_FEATURE_RO_COMPAT_SUPP))) { + ext3_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR " + "because of unsupported optional " + "features (%x)", le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + ext3_msg(sb, KERN_WARNING, "warning: couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead."); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) 
+ */ + ext3_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext3_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext3_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; + } + } +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif + unlock_super(sb); + + if (enable_quota) + dquot_resume(sb, -1); + return 0; +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + unlock_super(sb); + return err; +} + +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + u64 fsid; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long ngroups = sbi->s_groups_count, i; + ext3_fsblk_t overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); + } + + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT3_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction before quota file + * is locked for write. 
Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext3_create() quota_sync() + * journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext3_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext3_journal_start(inode, + EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext3_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext3_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext3_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext3_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext3_msg(sb, KERN_WARNING, + "warning: Quota file not on filesystem root. " + "Journaled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... 
+ */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + err = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext3_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (!handle) { + ext3_msg(sb, KERN_WARNING, + "warning: quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. 
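+ * As a concrete numerical illustration (the values are hypothetical): with
+ * a 1024-byte block size, a write of len = 8 at off = 1020 yields
+ * offset = 1020, so sb->s_blocksize - offset = 4 < len and the request
+ * would straddle two blocks; it is rejected with -EIO below. A write of
+ * len = 4 at the same offset still fits within the block and proceeds.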
+ */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); +out: + if (err) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT3_I(inode)->i_disksize = inode->i_size; + } + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + mutex_unlock(&inode->i_mutex); + return len; +} + +#endif + +static struct dentry *ext3_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); +} + +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext3_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ext3_fs(void) +{ + int err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext3_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext3_xattr(); + return err; +} + +static void __exit exit_ext3_fs(void) +{ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +MODULE_LICENSE("GPL"); +module_init(init_ext3_fs) +module_exit(exit_ext3_fs) diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c new file mode 100644 index 00000000..7c489820 --- /dev/null +++ b/fs/ext3/symlink.c @@ -0,0 +1,56 @@ +/* + * linux/fs/ext3/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 symlink handling code + */ + +#include +#include +#include +#include +#include "xattr.h" + +static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); + nd_set_link(nd, (char*)ei->i_data); + return NULL; +} + +const struct inode_operations ext3_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext3_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext3_follow_link, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c new file mode 100644 index 00000000..d565759d --- /dev/null +++ b/fs/ext3/xattr.c @@ -0,0 +1,1336 @@ +/* + * linux/fs/ext3/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext3 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. 
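+ *
+ * Rough worked example of the space accounting (the 4-character name and
+ * 10-byte value are purely illustrative): an entry descriptor occupies
+ * sizeof(struct ext3_xattr_entry) = 16 bytes plus the name, rounded up to
+ * a 4-byte boundary, so EXT3_XATTR_LEN(4) = (4 + 3 + 16) & ~3 = 20 bytes
+ * near the top of the block, while the value takes
+ * EXT3_XATTR_SIZE(10) = (10 + 3) & ~3 = 12 bytes packed at the end.
+ * ext3_xattr_set_entry() treats the gap between the last descriptor and
+ * the lowest value offset (minus the four terminating null bytes) as the
+ * free space available for new attributes.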
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define IHDR(inode, raw_inode) \ + ((struct ext3_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT3_GOOD_OLD_INODE_SIZE + \ + EXT3_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1)) + +#ifdef EXT3_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext3_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext3_xattr_cache_find(struct inode *, + struct ext3_xattr_header *, + struct mb_cache_entry **); +static void ext3_xattr_rehash(struct ext3_xattr_header *, + struct ext3_xattr_entry *); +static int ext3_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext3_xattr_cache; + +static const struct xattr_handler *ext3_xattr_handler_map[] = { + [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, +#endif + [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_SECURITY + [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext3_xattr_handlers[] = { + &ext3_xattr_user_handler, + &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + &ext3_xattr_acl_access_handler, + &ext3_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT3_FS_SECURITY + &ext3_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext3_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) + handler = ext3_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext3_xattr_list(dentry, buffer, size); +} + +static int +ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext3_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 
0; +} + +static int +ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext3_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext3_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { +bad_block: ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_entry *entry; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) + return -ENODATA; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext3_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
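+ *
+ * A minimal caller-side sketch of that convention (the attribute name
+ * "foo" is purely illustrative; error handling is omitted):
+ *
+ *	len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, "foo", NULL, 0);
+ *	if (len > 0) {
+ *		buf = kmalloc(len, GFP_NOFS);
+ *		if (buf)
+ *			len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER,
+ *					     "foo", buf, len);
+ *	}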
+ */ +int +ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT3_I(inode)->xattr_sem); + error = ext3_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext3_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT3_I(inode)->xattr_sem); + return error; +} + +static int +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext3_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + void *end; + int error; + + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) + return 0; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext3_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
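+ *
+ * The filled buffer is a concatenation of null-terminated full attribute
+ * names, e.g. "user.foo\0security.selinux\0" (names illustrative only);
+ * as with ext3_xattr_get(), passing a NULL buffer simply returns the
+ * number of bytes such a list would require.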
+ */ +static int +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int i_error, b_error; + + down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); + if (i_error < 0) { + b_error = 0; + } else { + if (buffer) { + buffer += i_error; + buffer_size -= i_error; + } + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); + if (b_error < 0) + i_error = 0; + } + up_read(&EXT3_I(dentry->d_inode)->xattr_sem); + return i_error + b_error; +} + +/* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext3_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext3_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext3_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + ext3_free_blocks(handle, inode, bh->b_blocknr, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + error = ext3_journal_dirty_metadata(handle, bh); + if (IS_SYNC(inode)) + handle->h_sync = 1; + dquot_free_block(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + if (ce) + mb_cache_entry_release(ce); + } + unlock_buffer(bh); +out: + ext3_std_error(inode->i_sb, error); + return; +} + +struct ext3_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext3_xattr_search { + struct ext3_xattr_entry *first; + void *base; + void *end; + struct ext3_xattr_entry *here; + int not_found; +}; + +static int +ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) +{ + struct ext3_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT3_XATTR_SIZE(size); + } + free += EXT3_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT3_XATTR_SIZE(i->value_len) || + free < EXT3_XATTR_LEN(name_len) + + EXT3_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. 
*/ + size_t size = EXT3_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT3_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT3_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT3_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT3_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext3_xattr_block_find { + struct ext3_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext3_xattr_check_block(bs->bh)) { + ext3_error(sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. 
*/ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext3_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext3_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext3_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext3_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext3_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext3_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), + s->here); + ext3_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext3_journal_dirty_metadata(handle, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + journal_release_buffer(handle, bs->bh); + + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext3_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. 
*/ + error = dquot_alloc_block(inode, 1); + if (error) + goto cleanup; + error = ext3_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext3_journal_dirty_metadata(handle, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext3_fsblk_t goal = ext3_group_first_block_no(sb, + EXT3_I(inode)->i_block_group); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext3_free_blocks(handle, inode, block, 1); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext3_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext3_xattr_cache_insert(new_bh); + error = ext3_journal_dirty_metadata(handle, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext3_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, 1); + goto cleanup; + +bad_block: + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext3_xattr_ibody_find { + struct ext3_xattr_search s; + struct ext3_iloc iloc; +}; + +static int +ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext3_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { + error = ext3_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. 
*/ + error = ext3_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_search *s = &is->s; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext3_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext3_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + ext3_set_inode_state(inode, EXT3_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); + } + return 0; +} + +/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext3_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext3_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext3_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT3_I(inode)->xattr_sem); + error = ext3_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { + struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + } + + error = ext3_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT3_I(inode)->i_file_acl && !bs.s.base) { + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext3_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext3_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext3_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = 
CURRENT_TIME_SEC; + error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext3_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + up_write(&EXT3_I(inode)->xattr_sem); + return error; +} + +/* + * ext3_xattr_set() + * + * Like ext3_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext3_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * ext3_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) { + ext3_error(inode->i_sb, __func__, + "inode %lu: block "E3FSBLK" read error", inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + ext3_xattr_release_block(handle, inode, bh); + EXT3_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext3_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext3_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext3_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext3_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext3_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. 
+ */ +static int +ext3_xattr_cmp(struct ext3_xattr_header *header1, + struct ext3_xattr_header *header2) +{ + struct ext3_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT3_XATTR_NEXT(entry1); + entry2 = EXT3_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext3_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext3_error(inode->i_sb, __func__, + "inode %lu: block %lu read error", + inode->i_ino, (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT3_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT3_XATTR_REFCOUNT_MAX); + } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext3_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext3_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. 
+ */ +static void ext3_xattr_rehash(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + struct ext3_xattr_entry *here; + __u32 hash = 0; + + ext3_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT3_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext3_xattr(void) +{ + ext3_xattr_cache = mb_cache_create("ext3_xattr", 6); + if (!ext3_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext3_xattr(void) +{ + if (ext3_xattr_cache) + mb_cache_destroy(ext3_xattr_cache); + ext3_xattr_cache = NULL; +} diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h new file mode 100644 index 00000000..2be4f69b --- /dev/null +++ b/fs/ext3/xattr.h @@ -0,0 +1,138 @@ +/* + File: fs/ext3/xattr.h + + On-disk format of extended attributes for the ext3 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT3_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT3_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT3_XATTR_INDEX_USER 1 +#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT3_XATTR_INDEX_TRUSTED 4 +#define EXT3_XATTR_INDEX_LUSTRE 5 +#define EXT3_XATTR_INDEX_SECURITY 6 + +struct ext3_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext3_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext3_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT3_XATTR_PAD_BITS 2 +#define EXT3_XATTR_PAD (1<e_name_len)) ) +#define EXT3_XATTR_SIZE(size) \ + (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) + +# ifdef CONFIG_EXT3_FS_XATTR + +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_acl_access_handler; +extern const struct xattr_handler ext3_xattr_acl_default_handler; +extern const struct xattr_handler ext3_xattr_security_handler; + +extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); + +extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext3_xattr_delete_inode(handle_t *, struct inode *); +extern void ext3_xattr_put_super(struct super_block *); + +extern int init_ext3_xattr(void); +extern void exit_ext3_xattr(void); + +extern const struct xattr_handler *ext3_xattr_handlers[]; + +# else /* CONFIG_EXT3_FS_XATTR */ + +static inline int 
+ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext3_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext3_xattr(void) +{ + return 0; +} + +static inline void +exit_ext3_xattr(void) +{ +} + +#define ext3_xattr_handlers NULL + +# endif /* CONFIG_EXT3_FS_XATTR */ + +#ifdef CONFIG_EXT3_FS_SECURITY +extern int ext3_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext3_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c new file mode 100644 index 00000000..b8d9f83a --- /dev/null +++ b/fs/ext3/xattr_security.c @@ -0,0 +1,78 @@ +/* + * linux/fs/ext3/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +const struct xattr_handler ext3_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext3_xattr_security_list, + .get = ext3_xattr_security_get, + .set = ext3_xattr_security_set, +}; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c new file mode 100644 index 00000000..dc8edda9 --- /dev/null +++ b/fs/ext3/xattr_trusted.c @@ -0,0 +1,59 @@ +/* + * linux/fs/ext3/xattr_trusted.c + * Handler for trusted extended attributes. 
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +const struct xattr_handler ext3_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext3_xattr_trusted_list, + .get = ext3_xattr_trusted_get, + .set = ext3_xattr_trusted_set, +}; diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c new file mode 100644 index 00000000..7a321974 --- /dev/null +++ b/fs/ext3/xattr_user.c @@ -0,0 +1,62 @@ +/* + * linux/fs/ext3/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext3_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext3_xattr_user_list, + .get = ext3_xattr_user_get, + .set = ext3_xattr_user_set, +}; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 00000000..9ed1bb1f --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,85 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + help + This is the next generation of the ext3 filesystem. 
+ + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formatting a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile new file mode 100644 index 00000000..04109460 --- /dev/null +++ b/fs/ext4/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for the linux ext4-filesystem routines. 
+# + +obj-$(CONFIG_EXT4_FS) += ext4.o + +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o + +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c new file mode 100644 index 00000000..21eacd7b --- /dev/null +++ b/fs/ext4/acl.c @@ -0,0 +1,479 @@ +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext4_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext4_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext4_acl_header *)value)->a_version != + cpu_to_le32(EXT4_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext4_acl_header); + count = ext4_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext4_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext4_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * + sizeof(ext4_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); + for (n = 0; n < acl->a_count; n++) { + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). 
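A small user-space sketch of the size arithmetic behind ext4_acl_to_disk() above (and ext4_acl_size() in acl.h): the owner/group/mask/other entries are stored in the 4-byte short form, while named ACL_USER/ACL_GROUP entries carry an extra 4-byte e_id. The entry counts below are made up for illustration.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t header        = 4;  /* ext4_acl_header: one __le32 a_version   */
    size_t short_entry   = 4;  /* __le16 e_tag + __le16 e_perm            */
    size_t full_entry    = 8;  /* short entry plus __le32 e_id            */
    size_t base_entries  = 4;  /* USER_OBJ, GROUP_OBJ, MASK, OTHER        */
    size_t named_entries = 2;  /* e.g. one ACL_USER and one ACL_GROUP     */

    printf("on-disk ACL xattr value: %zu bytes\n",
           header + base_entries * short_entry + named_entries * full_entry);
    return 0;
}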
+ * + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext4_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext4_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext4_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext4_new_inode + */ +static int +ext4_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext4_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext4_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +/* + * Initialize the ACLs of a new inode. Called from ext4_new_inode. 
+ * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext4_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext4_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_mutex: down + */ +int +ext4_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext4_std_error(inode->i_sb, error); + goto out; + } + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + 
if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext4_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext4_set_acl(handle, inode, type, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler ext4_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ext4_xattr_list_acl_access, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, +}; + +const struct xattr_handler ext4_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ext4_xattr_list_acl_default, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, +}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h new file mode 100644 index 00000000..dec82116 --- /dev/null +++ b/fs/ext4/acl.h @@ -0,0 +1,77 @@ +/* + File: fs/ext4/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT4_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext4_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext4_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext4_acl_header; + +static inline size_t ext4_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext4_acl_header) + + count * sizeof(ext4_acl_entry_short); + } else { + return sizeof(ext4_acl_header) + + 4 * sizeof(ext4_acl_entry_short) + + (count - 4) * sizeof(ext4_acl_entry); + } +} + +static inline int ext4_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext4_acl_header); + s = size - 4 * sizeof(ext4_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext4_acl_entry_short)) + return -1; + return size / sizeof(ext4_acl_entry_short); + } else { + if (s % sizeof(ext4_acl_entry)) + return -1; + return s / sizeof(ext4_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT4_FS_POSIX_ACL + +/* acl.c */ +extern int ext4_check_acl(struct inode *, int, unsigned int); +extern int ext4_acl_chmod(struct inode *); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT4_FS_POSIX_ACL */ +#include +#define ext4_check_acl NULL + +static inline int +ext4_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ + diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c new file mode 100644 index 00000000..264f6949 --- /dev/null +++ b/fs/ext4/balloc.c @@ -0,0 +1,622 @@ +/* + * 
linux/fs/ext4/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "mballoc.h" + +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * Calculate the block group number and offset, given a block number + */ +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + if (actual_group == block_group) + return 1; + return 0; +} + +static int ext4_group_used_meta_blocks(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + ext4_fsblk_t tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* block bitmap, inode bitmap, and inode table blocks */ + int used_blocks = sbi->s_itb_per_group + 2; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), + block_group)) + used_blocks--; + + if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), + block_group)) + used_blocks--; + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!ext4_block_in_group(sb, tmp, block_group)) + used_blocks -= 1; + } + } + return used_blocks; +} + +/* Initializes an uninitialized block bitmap if given, and returns the + * number of blocks free in the group. */ +unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, struct ext4_group_desc *gdp) +{ + int bit, bit_max; + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned free_blocks, group_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (bh) { + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. 
*/ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", + block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + memset(bh->b_data, 0, sb->s_blocksize); + } + + /* Check for superblock and gdt backups in this group */ + bit_max = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (bit_max) { + bit_max += ext4_bg_num_gdb(sb, block_group); + bit_max += + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + bit_max += ext4_bg_num_gdb(sb, block_group); + } + + if (block_group == ngroups - 1) { + /* + * Even though mke2fs always initialize first and last group + * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need + * to make sure we calculate the right free blocks + */ + group_blocks = ext4_blocks_count(sbi->s_es) - + ext4_group_first_block_no(sb, ngroups - 1); + } else { + group_blocks = EXT4_BLOCKS_PER_GROUP(sb); + } + + free_blocks = group_blocks - bit_max; + + if (bh) { + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); + + start = ext4_group_first_block_no(sb, block_group); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; + + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || + ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + } + /* + * Also if the number of blocks within the group is + * less than the blocksize * 8 ( which is the size + * of bitmap ), set rest of the block bitmap to 1 + */ + ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, + bh->b_data); + } + return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); +} + + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext4_fill_super). 
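A user-space sketch of the block-group split described above and performed by ext4_get_group_no_and_offset(): a filesystem-wide block number minus s_first_data_block is divided into a group index and an in-group offset. The 4 KiB block size (hence 32768 blocks per group) and first_data_block of 0 are assumptions for the example.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t blocknr          = 1000000;  /* arbitrary example block        */
    uint64_t first_data_block = 0;        /* s_first_data_block on 4 KiB fs */
    uint64_t blocks_per_group = 32768;    /* 8 bits per byte * 4096 bytes   */
    uint64_t rel = blocknr - first_data_block;

    printf("block %llu -> group %llu, offset %llu\n",
           (unsigned long long)blocknr,
           (unsigned long long)(rel / blocks_per_group),
           (unsigned long long)(rel % blocks_per_group));
    return 0;
}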
+ */ + +/** + * ext4_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head **bh) +{ + unsigned int group_desc; + unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); + + return NULL; + } + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext4_group_desc *)( + (__u8 *)sbi->s_group_desc[group_desc]->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc; +} + +static int ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t offset; + ext4_grpblk_t next_zero_bit; + ext4_fsblk_t bitmap_blk; + ext4_fsblk_t group_first_block; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + /* with FLEX_BG, the inode/block bitmaps and itable + * blocks may not be in the group at all + * so the bitmap validation will be skipped for those groups + * or it has to also read the block group where the bitmaps + * are located to verify they are set. + */ + return 1; + } + group_first_block = ext4_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = ext4_block_bitmap(sb, desc); + offset = bitmap_blk - group_first_block; + if (!ext4_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = ext4_inode_bitmap(sb, desc); + offset = bitmap_blk - group_first_block; + if (!ext4_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = ext4_inode_table(sb, desc); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, + offset + EXT4_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", + block_group, bitmap_blk); + return 0; +} +/** + * ext4_read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = ext4_block_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + trace_ext4_read_block_bitmap_load(sb, block_group); + set_bitmap_uptodate(bh); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + ext4_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, + * continue with corrupt bitmap + */ + return bh; +} + +/** + * ext4_has_free_blocks() + * @sbi: in-core super block structure. + * @nblocks: number of needed blocks + * + * Check if filesystem has nblocks free & available for allocation. + * On success return 1, return 0 on failure. + */ +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags) +{ + s64 free_blocks, dirty_blocks, root_blocks; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); + root_blocks = ext4_r_blocks_count(sbi->s_es); + + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum_positive(fbc); + dirty_blocks = percpu_counter_sum_positive(dbc); + } + /* Check whether we have space after + * accounting for current dirty blocks & root reserved blocks. + */ + if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + return 1; + + /* Hm, nope. Are (enough) root reserved blocks available? 
*/ + if (sbi->s_resuid == current_fsuid() || + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + capable(CAP_SYS_RESOURCE) || + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + + if (free_blocks >= (nblocks + dirty_blocks)) + return 1; + } + + return 0; +} + +int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags) +{ + if (ext4_has_free_blocks(sbi, nblocks, flags)) { + percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + return 0; + } else + return -ENOSPC; +} + +/** + * ext4_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * + * ext4_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or committing transaction to complete, and then + * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. + */ +int ext4_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: pointer to total number of blocks needed + * @errp: error code + * + * Return 1st allocated block number on success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + ar.flags = flags; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. + */ + if (!(*errp) && + ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += ar.len; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + dquot_alloc_block_nofail(inode, ar.len); + } + return ret; +} + +/** + * ext4_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. 
+ */ +ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +{ + ext4_fsblk_t desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; + ext4_group_t ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + ext4_fsblk_t bitmap_count; + unsigned int x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_blks_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext4_count_free(bitmap_bh, sb->s_blocksize); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_blks_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" + ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_blks_count(sb, gdp); + } + + return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext4_group_sparse(ext4_group_t group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext4_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. 
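A user-space sketch of the sparse_super placement rule implemented by ext4_group_sparse() above: besides groups 0 and 1, only groups whose number is a power of 3, 5 or 7 carry superblock/descriptor backups. Printing the first few groups makes the rule concrete.

#include <stdio.h>

static int test_root(unsigned a, unsigned b)
{
    unsigned num = b;

    while (a > num)
        num *= b;
    return num == a;
}

static int group_sparse(unsigned group)
{
    if (group <= 1)
        return 1;
    if (!(group & 1))
        return 0;
    return test_root(group, 7) || test_root(group, 5) ||
           test_root(group, 3);
}

int main(void)
{
    unsigned g;

    printf("groups with a backup superblock (first 64): ");
    for (g = 0; g < 64; g++)
        if (group_sparse(g))
            printf("%u ", g);
    printf("\n");   /* prints 0 1 3 5 7 9 25 27 49 */
    return 0;
}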
+ */ +unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext4_bg_num_gdb_nometa(sb, group); + + return ext4_bg_num_gdb_meta(sb,group); + +} + diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c new file mode 100644 index 00000000..fa3af81a --- /dev/null +++ b/fs/ext4/bitmap.c @@ -0,0 +1,31 @@ +/* + * linux/fs/ext4/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include "ext4.h" + +#ifdef EXT4FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) +{ + unsigned int i, sum = 0; + + if (!map) + return 0; + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return sum; +} + +#endif /* EXT4FS_DEBUG */ + diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 00000000..fac90f3f --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,248 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init ext4_init_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. 
+ */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? 
"" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + ext4_bg_num_gdb(sb, i) + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; + struct rb_node *parent; + struct ext4_system_zone *entry; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + entry = rb_entry(n, struct ext4_system_zone, node); + kmem_cache_free(ext4_system_zone_cachep, entry); + if (!parent) + EXT4_SB(sb)->system_blks = RB_ROOT; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + EXT4_SB(sb)->system_blks = RB_ROOT; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c new file mode 100644 index 00000000..164c5609 --- /dev/null +++ b/fs/ext4/dir.c @@ -0,0 +1,542 @@ +/* + * linux/fs/ext4/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include +#include +#include "ext4.h" + +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext4_readdir(struct file *, void *, filldir_t); +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir); +static int ext4_release_dir(struct inode *inode, + struct file *filp); + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_llseek, + .read = generic_read_dir, + .readdir = ext4_readdir, /* we take BKL. needed?*/ + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return (ext4_filetype_table[filetype]); +} + +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + */ +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, struct file *filp, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned int offset) +{ + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + else + return 0; + + if (filp) + ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; +} + +static int ext4_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + int error = 0; + unsigned int offset; + int i, stored; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int err; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + int dir_has_error = 0; + + sb = inode->i_sb; + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext4_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. 
+ */ + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + EXT4_INODE_INDEX); + } + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + struct ext4_map_blocks map; + struct buffer_head *bh = NULL; + + map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err > 0) { + pgoff_t index = map.m_pblk >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&filp->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, filp, + index, 1); + filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + if (!dir_has_error) { + EXT4_ERROR_FILE(filp, 0, + "directory contains a " + "hole at offset %llu", + (unsigned long long) filp->f_pos); + dir_has_error = 1; + } + /* corrupt size? Maybe no more blocks to read */ + if (filp->f_pos > inode->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext4_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { + /* + * On error, skip the f_pos to the next block + */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse(bh); + ret = stored; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored++; + } + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = 0; + brelse(bh); + } +out: + return ret; +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. 
This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } + if (!parent) + *root = RB_ROOT; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } +} + + +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + return p; +} + +void ext4_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... 
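 *
 * (Colliding entries are chained through fname->next and are later
 * emitted back to back by call_filldir(), all under the same f_pos
 * cookie, since the cookie encodes only the hash value.)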
+ */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext4_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file *filp, void *dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb; + int error; + + sb = inode->i_sb; + + if (!fname) { + printk(KERN_ERR "EXT4-fs: call_filldir: called with " + "null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = ext4_htree_create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT4_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. 
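 *
 * ext4_htree_fill_tree() repopulates the tree starting from
 * info->curr_hash; a return value of 0 means there is nothing left
 * to read and is turned into EXT4_HTREE_EOF below.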
+ */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext4_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT4_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext4_release_dir(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 00000000..1a34c1c8 --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,2236 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(f, a...) do {} while (0) +#endif + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. 
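 *
 * A minimal lookup (sketch only, as in ext4_readdir()): fill in the
 * request fields, then read back the result fields:
 *
 *	struct ext4_map_blocks map;
 *
 *	map.m_lblk = lblk;
 *	map.m_len = 1;
 *	err = ext4_map_blocks(NULL, inode, &map, 0);
 *	if (err > 0)
 *		map.m_pblk and map.m_flags describe the mapping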
+ */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_UNINIT (1 << BH_Uninit) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_UNINIT) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * For delayed allocation tracking + */ +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 + +struct ext4_io_page { + struct page *p_page; + atomic_t p_count; +}; + +#define MAX_IO_PAGES 128 + +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + struct inode *inode; /* file being written to */ + unsigned int flag; /* unwritten or not */ + struct page *page; /* page struct for buffer write */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ + struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ + int num_io_pages; + struct ext4_io_page *pages[MAX_IO_PAGES]; +} ext4_io_end_t; + +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + struct ext4_io_page *io_page; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __u32 bg_reserved2[3]; +}; + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic_t free_inodes; + atomic_t free_blocks; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... 
*/ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path, + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Punch out blocks of an extent */ +#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 
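/*
 * Hypothetical user-space sketch (not built as part of the kernel) of
 * how the generic commands aliased above are usually driven.  It
 * assumes FS_IOC_GETFLAGS and FS_IOC_GETVERSION come from <linux/fs.h>
 * and that the kernel fills in an int, as e2fsprogs does.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int dump_attrs(const char *path)
{
	int fd = open(path, O_RDONLY);
	int flags = 0, generation = 0;

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)	/* EXT4_IOC_GETFLAGS */
		printf("flags: 0x%x\n", flags);
	if (ioctl(fd, FS_IOC_GETVERSION, &generation) == 0) /* EXT4_IOC_GETVERSION_OLD */
		printf("generation: %d\n", generation);
	close(fd);
	return 0;
}
#endif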
+#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file 
descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define 
i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * storage for cached extent + * If ec_len == 0, then the cache is invalid. + * If ec_start == 0, then the cache represents a gap (null mapping) + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. 
+ */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* completed IOs that might need unwritten extents handling */ + struct list_head i_completed_io_list; + spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; + atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* 
Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_set_bit __test_and_set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_first_zero_bit find_first_zero_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... 
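 *
 * The read-only compatible feature set sits in between: a filesystem
 * with ro_compat bits the kernel does not recognize may still be
 * mounted, but only read-only.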
+ */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad; + __le16 s_reserved_pad; + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; 
/* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_reserved[112]; /* Padding to the end of the block */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + /* ext4 extent cache stats */ + unsigned long extent_cache_hits; + unsigned long extent_cache_misses; + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; 
+ spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? 
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + 
EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
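+ * + * (Illustrative example, not part of the original comment: with a 64 KiB + * block, a directory entry spanning the whole block cannot store 65536 in a + * 16-bit field, so it is written as EXT4_MAX_REC_LEN (65535) and decoded + * back to the block size; for hypothetical still-larger blocks the code + * below folds bits 16-17 of the length into the two low bits, which are + * otherwise always zero because rec_len is a multiple of 4.)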
+ */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT4_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +/* + * Timeout and state flag for lazy initialization inode thread. 
+ */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. 
+ */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(struct buffer_head *, unsigned); + +/* balloc.c */ +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, int); +extern int ext4_flush_completed_IO(struct inode *); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, + const struct qstr *qstr, __u32 goal); +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_add_groupinfo(struct 
super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + +/* inode.c */ +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); + +/* namei.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern void ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); +extern void ext4_error_file(struct file *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) 
+ __attribute__ ((format (printf, 5, 6))); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) +extern void ext4_msg(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ + __LINE__, msg) +extern void __ext4_grp_locked_error(const char *, unsigned int, \ + struct super_block *, ext4_group_t, \ + unsigned long, ext4_fsblk_t, \ + const char *, ...) + __attribute__ ((format (printf, 7, 8))); +#define ext4_grp_locked_error(sb, grp, message...) \ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + 
le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch blocks in their local + * counters. So we need to make sure we have free blocks more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. 
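+ * + * (Worked example with assumed values, not from the original: with a + * percpu_counter batch of 32 and nr_cpu_ids = 8, the watermark below is + * 4 * 32 * 8 = 1024 free blocks.)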
+ */ +#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREEBLOCKS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
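+ * The counter is capped at EXT4_MAX_CONTENTION here and decremented + * again whenever the trylock above succeeds, so ext4_fs_is_busy() reads + * it as a rough, self-decaying measure of recent contention.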
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +static inline void ext4_mark_super_dirty(struct super_block *sb) +{ + if (EXT4_SB(sb)->s_journal == NULL) + sb->s_dirt = 1; +} + +/* + * Inode and file operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); + +/* extents.c */ +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern void ext4_ext_truncate(struct inode *); +extern int ext4_ext_punch_hole(struct file *file, loff_t offset, + loff_t length); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +/* move_extent.c */ +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern int ext4_end_io_nolock(ext4_io_end_t *io); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized.
With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h new file mode 100644 index 00000000..095c36f3 --- /dev/null +++ b/fs/ext4/ext4_extents.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(a...) printk(a) +#else +#define ext_debug(a...) +#endif + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These number will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + */ + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is index on-disk structure. + * It's used at all the levels except the bottom. 
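+ * Like struct ext4_extent above it is 12 bytes on disk, so index and leaf + * blocks hold the same number of entries for a given block size.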
+ */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an uninitialized (i.e. + * preallocated). + * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an + * uninitialized extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * uninitialized one. In other words, if MSB of ee_len is set, it is an + * uninitialized extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an uninitialized extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). 
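+ * + * (Illustrative example: an uninitialized extent of 300 blocks is stored + * with ee_len = 0x8000 + 300 = 0x812c; ext4_ext_is_uninitialized() sees a + * value above 0x8000, and ext4_ext_get_actual_len() recovers 300. A stored + * ee_len of exactly 0x8000 is the special case described above: an + * initialized extent of 32768 blocks.)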
+ */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void +ext4_ext_invalidate_cache(struct inode *inode) +{ + EXT4_I(inode)->i_cached_extent.ec_len = 0; +} + +static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +{ + /* We can not have an uninitialized extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? 
+ le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c new file mode 100644 index 00000000..f5240aa1 --- /dev/null +++ b/fs/ext4/ext4_jbd2.c @@ -0,0 +1,152 @@ +/* + * Interface between ext4 and JBD + */ + +#include "ext4_jbd2.h" + +#include + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } + return err; +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. 
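+ * (Note: in the code below the no-journal case is in fact handled with a + * plain bforget() and an early return, since there is nothing to revoke.)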
+ */ +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { + bforget(bh); + return 0; + } + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + return err; + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); + return err; +} + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } + return err; +} + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else { + if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); + err = -EIO; + } + } + } + return err; +} + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + sb->s_dirt = 1; + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 00000000..95af6f87 --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,327 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include +#include +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree + root which are stored in the inode. */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? 
(DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. + */ +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); + +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); + +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. 
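+ * In no-journal mode the handle value is effectively a small per-task + * reference count rather than a real pointer (hence + * EXT4_NOJOURNAL_MAX_REF_COUNT above), so anything below that threshold is + * treated as not using a journal.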
*/ +static inline int ext4_handle_valid(handle_t *handle) +{ + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +{ + return ext4_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__func__, __LINE__, (handle)) + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + if (journal) + return jbd2_journal_force_commit(journal); + return 0; +} + +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return 0; +} + +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; +} + +static 
inline int ext4_should_order_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; +} + +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. + */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c new file mode 100644 index 00000000..611647b2 --- /dev/null +++ b/fs/ext4/extents.c @@ -0,0 +1,4364 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +/* + * Extents support for EXT4 + * + * TODO: + * - ext4*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +#include + +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags); + +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) +{ + int err; + + if (!ext4_handle_valid(handle)) + return 0; + if (handle->h_buffer_credits > needed) + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + err = ext4_truncate_restart_trans(handle, inode, needed); + if (err == 0) + err = -EAGAIN; + + return err; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext4_ext_get_access(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext4_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int ext4_ext_dirty(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + if (path->p_bh) { + /* path points to block */ + err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext4_mark_inode_dirty(handle, inode); + } + return err; +} + +static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + int depth; + + if (path) { + struct ext4_extent *ex; + depth = path->p_depth; + + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especially if the latter case turns out to be + * common. + */ + ex = path[depth].p_ext; + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } + + /* it looks like index is empty; + * try to find starting block from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. 
use inode's group */ + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour + block; +} + +/* + * Allocation for a meta data block + */ +static ext4_fsblk_t +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex, int *err, unsigned int flags) +{ + ext4_fsblk_t goal, newblock; + + goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); + return newblock; +} + +static inline int ext4_ext_space_block(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 6) + size = 6; +#endif + } + return size; +} + +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 5) + size = 5; +#endif + } + return size; +} + +static inline int ext4_ext_space_root(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 3) + size = 3; +#endif + } + return size; +} + +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent_idx); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 4) + size = 4; +#endif + } + return size; +} + +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs, num = 0; + + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); + + /* + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At ldxs**2 blocks, we need + * an additional index block, and at ldxs**3 blocks, yet + * another index blocks. 
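As a rough illustration of the counting above (an editorial sketch, not part of the patch): with an assumed 4 KiB block size, an index block holds idxs = (4096 - 12) / 12 = 340 entries, since the extent header and each ext4_extent_idx are 12 bytes. The stand-alone program below, with a made-up helper name, mirrors the modulo cascade applied to i_da_metadata_calc_len:

    #include <stdio.h>

    /*
     * Illustrative only: count the extra index blocks charged when a
     * contiguous delayed-allocation run reaches a length that is a
     * multiple of idxs, idxs^2 or idxs^3, mirroring the checks above.
     */
    static int extra_index_blocks(unsigned long len, unsigned long idxs)
    {
        int num = 0;

        if (len % idxs == 0)                    /* another leaf-level index block */
            num++;
        if (len % (idxs * idxs) == 0)           /* another second-level index block */
            num++;
        if (len % (idxs * idxs * idxs) == 0)    /* another third-level index block */
            num++;
        return num;
    }

    int main(void)
    {
        unsigned long idxs = (4096 - 12) / 12;  /* 340 entries per index block */

        printf("at block %lu: +%d metadata blocks\n",
               idxs, extra_index_blocks(idxs, idxs));
        printf("at block %lu: +%d metadata blocks\n",
               idxs * idxs, extra_index_blocks(idxs * idxs, idxs));
        return 0;
    }

So in this configuration roughly one extra metadata block is reserved per 340 contiguous data blocks, with higher-level index blocks charged only every 340^2 and 340^3 blocks.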
+ */ + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } + + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; +} + +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode, 1); + else + max = ext4_ext_space_root_idx(inode, 1); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode, 1); + else + max = ext4_ext_space_block_idx(inode, 1); + } + + return max; +} + +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext4_ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + + if (len == 0) + return 0; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + struct ext4_extent *ext; + struct ext4_extent_idx *ext_idx; + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth) +{ + const char *error_msg; + int max = 0; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } + return 0; + +corrupted: + ext4_error_inode(inode, function, line, 0, + "bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + + return -EIO; +} + +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, __LINE__, inode, eh, depth) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} + +#ifdef 
EXT_DEBUG +static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug("path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(" %d:[%d]%d:%llu ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_get_actual_len(path->p_ext), + ext4_ext_pblock(path->p_ext)); + } else + ext_debug(" []"); + } + ext_debug("\n"); +} + +static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext4_extent_header *eh; + struct ext4_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); + } + ext_debug("\n"); +} + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + +#else +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) +#endif + +void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * ext4_ext_binsearch_idx: + * binary search for the closest index of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch_idx(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent_idx *r, *l, *m; + + + ext_debug("binsearch for %u(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_LAST_INDEX(eh); + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), + m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); + } + + path->p_idx = l - 1; + ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) 
+ <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * ext4_ext_binsearch: + * binary search for closest extent of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent *r, *l, *m; + + if (eh->eh_entries == 0) { + /* + * this leaf is empty: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug("binsearch for %u: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_LAST_EXTENT(eh); + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), + m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); + } + + path->p_ext = l - 1; + ext_debug(" -> %d:%llu:[%d]%d ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_pblock(path->p_ext), + ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_get_actual_len(path->p_ext)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext4_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + ext4_mark_inode_dirty(handle, inode); + ext4_ext_invalidate_cache(inode); + return 0; +} + +struct ext4_ext_path * +ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0, alloc = 0; + + eh = ext_inode_hdr(inode); + depth = ext_depth(inode); + + /* account possible depth increase */ + if (!path) { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + alloc = 1; + } + path[0].p_hdr = eh; + path[0].p_bh = NULL; + + i = depth; + /* walk through the tree */ + while (i) { + int need_to_validate = 0; + + ext_debug("depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext4_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) + goto err; + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, block, + path[ppos].p_block); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } + eh = ext_block_hdr(bh); + ppos++; + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + goto err; + } + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (need_to_validate && ext4_ext_check(inode, eh, i)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if 
(path[ppos].p_ext) + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + + ext4_ext_show_path(inode, path); + + return path; + +err: + ext4_ext_drop_refs(path); + if (alloc) + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * ext4_ext_insert_index: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) +{ + struct ext4_extent_idx *ix; + int len, err; + + err = ext4_ext_get_access(handle, inode, curp); + if (err) + return err; + + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { + len = (len - 1) * sizeof(struct ext4_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d after: %llu. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext4_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d before: %llu. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = cpu_to_le32(logical); + ext4_idx_store_pblock(ix, ptr); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } + + err = ext4_ext_dirty(handle, inode, curp); + ext4_std_error(inode->i_sb, err); + + return err; +} + +/* + * ext4_ext_split: + * inserts new subtree into the path, using free index entry + * at depth @at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extents and index entries (right to the split point) + * into the newly allocated blocks + * - initializes subtree + */ +static int ext4_ext_split(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + int i = at, k, m, a; + ext4_fsblk_t newblock, oldblock; + __le32 border; + ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now decision is simplest: at current extent */ + + /* if current leaf will be split, then we should use + * border from split point */ + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug("leaf will be split." 
+ " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug("leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * If error occurs, then we break processing + * and mark filesystem read-only. index won't + * be inserted and tree will be in consistent + * state. Next mount will repair buffers too. + */ + + /* + * Get array to track all allocated blocks. + * We need this to handle errors and free blocks + * upon them. + */ + ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + + /* allocate all needed blocks */ + ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_depth = 0; + + /* move remainder of path[depth] to the new leaf */ + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } + /* start copy from next extent */ + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); + if (m) { + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + le16_add_cpu(&neh->eh_entries, m); + } + + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } + if (k) + ext_debug("create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext4_idx_store_pblock(fidx, oldblock); + + ext_debug("int.index at %d (block %llu): %u -> %llu\n", + i, newblock, le32_to_cpu(border), oldblock); + + /* move remainder of path[i] to the new index block */ + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 
+ EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); + if (m) { + memmove(++fidx, path[i].p_idx, + sizeof(struct ext4_extent_idx) * m); + le16_add_cpu(&neh->eh_entries, m); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + err = ext4_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); + } + } + kfree(ablocks); + + return err; +} + +/* + * ext4_ext_grow_indepth: + * implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initializes new top-level, creating index that points to the + * just created block + */ +static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp = path; + struct ext4_extent_header *neh; + struct buffer_head *bh; + ext4_fsblk_t newblock; + int err = 0; + + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext4_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + else + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto out; + + /* create index in new top-level index: num,max,pointer */ + err = ext4_ext_get_access(handle, inode, curp); + if (err) + goto out; + + curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; + curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + curp->p_hdr->eh_entries = cpu_to_le16(1); + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + + if (path[0].p_hdr->eh_depth) + curp->p_idx->ei_block = + EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; + else + curp->p_idx->ei_block = + EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + ext4_idx_store_pblock(curp->p_idx, newblock); + + neh = ext_inode_hdr(inode); + ext_debug("new root: num %d(%d), lblock %d, 
ptr %llu\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); + + neh->eh_depth = cpu_to_le16(path->p_depth + 1); + err = ext4_ext_dirty(handle, inode, curp); +out: + brelse(bh); + + return err; +} + +/* + * ext4_ext_create_new_leaf: + * finds empty index and adds new leaf. + * if no free index is found, then it requests in-depth growing. + */ +static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block, + * so subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext4_ext_split(handle, inode, flags, path, newext, i); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, flags, + path, newext); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth, ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? ix->ei_block : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
+ EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + depth); + return -EIO; + } + } + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; + return 0; +} + +/* + * search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + ext4_fsblk_t block; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } + } + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + return 0; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + goto got_index; + } + + /* we've gone up to the root and found no index to the right */ + return 0; + +got_index: + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + block = ext4_idx_pblock(ix); + while (++depth < path->p_depth) { + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ix = EXT_FIRST_INDEX(eh); + block = ext4_idx_pblock(ix); + put_bh(bh); + } + + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ex = EXT_FIRST_EXTENT(eh); + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + put_bh(bh); + return 0; +} + +/* + * ext4_ext_next_allocated_block: + * returns allocated block in subsequent extent or 
EXT_MAX_BLOCKS. + * NOTE: it considers block number from index entry as + * allocated block. Thus, index entries have to be consistent + * with leaves. + */ +static ext4_lblk_t +ext4_ext_next_allocated_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCKS; + + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_ext[1].ee_block); + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + } + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_next_leaf_block: + * returns first allocated block from next leaf or EXT_MAX_BLOCKS + */ +static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, + struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCKS; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return (ext4_lblk_t) + le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_correct_indexes: + * if leaf gets modified and modified extent is first in the leaf, + * then we have to correct all indexes above. + * TODO: do we need to correct tree in all cases? + */ +static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + int depth = ext_depth(inode); + struct ext4_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller than current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + break; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + break; + } + + return err; +} + +int +ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, + struct ext4_extent *ex2) +{ + unsigned short ext1_ee_len, ext2_ee_len, max_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. 
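In other words, two extents are mergeable only when they share the same initialized/uninitialized state, are logically and physically adjacent, and their combined length still fits in ee_len. A minimal user-space sketch of the same conditions (editorial illustration, not part of the patch; the struct ext and can_merge names are invented, and the fields are plain host-endian values rather than the on-disk little-endian types):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ext {
        uint32_t lblock;   /* first logical block   */
        uint64_t pblock;   /* first physical block  */
        uint16_t len;      /* number of blocks      */
        bool     uninit;   /* uninitialized extent? */
    };

    /* Same state, logically adjacent, combined length in range,
     * physically adjacent: only then can two extents be merged. */
    static bool can_merge(const struct ext *a, const struct ext *b)
    {
        unsigned max_len = a->uninit ? 32767 : 32768;

        if (a->uninit != b->uninit)
            return false;
        if (a->lblock + a->len != b->lblock)
            return false;
        if ((unsigned)a->len + b->len > max_len)
            return false;
        return a->pblock + a->len == b->pblock;
    }

    int main(void)
    {
        struct ext a = { 0,  1000, 8, false };
        struct ext b = { 8,  1008, 4, false };   /* adjacent: merges        */
        struct ext c = { 12, 2000, 4, false };   /* physical gap: no merge  */

        printf("a+b: %d, b+c: %d\n", can_merge(&a, &b), can_merge(&b, &c));
        return 0;
    }

The 32768/32767 limits correspond to EXT_INIT_MAX_LEN and EXT_UNINIT_MAX_LEN in ext4_extents.h.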
+ */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + if (ext4_ext_is_uninitialized(ex1)) + max_len = EXT_UNINIT_MAX_LEN; + else + max_len = EXT_INIT_MAX_LEN; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != + le32_to_cpu(ex2->ee_block)) + return 0; + + /* + * To allow future support for preallocated extents to be added + * as an RO_COMPAT feature, refuse to merge to extents if + * this can result in the top bit of ee_len being set. + */ + if (ext1_ee_len + ext2_ee_len > max_len) + return 0; +#ifdef AGGRESSIVE_TEST + if (ext1_ee_len >= 4) + return 0; +#endif + + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) + return 1; + return 0; +} + +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +static int ext4_ext_try_to_merge_right(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0; + int uninitialized = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + le16_add_cpu(&eh->eh_entries, -1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); + } + + return merge_done; +} + +/* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. + */ +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + int ret = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + ret = ext4_ext_try_to_merge_right(inode, path, ex); + + return ret; +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * + * If there is an overlap discovered, it updates the length of the newext + * such that there will be no overlap, and then returns 1. + * If there is no overlap found, it returns 0. 
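For example, if newext covers logical blocks 100..109 (b1 = 100, len1 = 10) and the leaf already holds an extent starting at block 105 (b2 = 105), then b1 + len1 = 110 > b2, so newext->ee_len is trimmed to b2 - b1 = 5 and the function returns 1, leaving only the non-overlapping blocks 100..104 in newext.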
+ */ +static unsigned int ext4_ext_check_overlap(struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) +{ + ext4_lblk_t b1, b2; + unsigned int depth, len1; + unsigned int ret = 0; + + b1 = le32_to_cpu(newext->ee_block); + len1 = ext4_ext_get_actual_len(newext); + depth = ext_depth(inode); + if (!path[depth].p_ext) + goto out; + b2 = le32_to_cpu(path[depth].p_ext->ee_block); + + /* + * get the next allocated block if the extent in the path + * is before the requested block(s) + */ + if (b2 < b1) { + b2 = ext4_ext_next_allocated_block(path); + if (b2 == EXT_MAX_BLOCKS) + goto out; + } + + /* check for wrap through zero on extent logical start block*/ + if (b1 + len1 < b1) { + len1 = EXT_MAX_BLOCKS - b1; + newext->ee_len = cpu_to_le16(len1); + ret = 1; + } + + /* check for overlap */ + if (b1 + len1 > b2) { + newext->ee_len = cpu_to_le16(b2 - b1); + ret = 1; + } +out: + return ret; +} + +/* + * ext4_ext_insert_extent: + * tries to merge requsted extent into the existing extent or + * inserts requested extent as new one into the tree, + * creating new leaf in the no-space case. + */ +int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int flag) +{ + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; + struct ext4_extent *nearex; /* nearest extent */ + struct ext4_ext_path *npath = NULL; + int depth, len, err; + ext4_lblk_t next; + unsigned uninitialized = 0; + int flags = 0; + + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + + /* try to insert block into found extent and return */ + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) + && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check only one of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +repeat: + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? 
*/ + fex = EXT_LAST_EXTENT(eh); + next = ext4_ext_next_leaf_block(inode, path); + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) + && next != EXT_MAX_BLOCKS) { + ext_debug("next leaf block - %d\n", next); + BUG_ON(npath != NULL); + npath = ext4_ext_find_extent(inode, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug("next leaf isn't full(%d)\n", + le16_to_cpu(eh->eh_entries)); + path = npath; + goto repeat; + } + ext_debug("next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + } + + /* + * There is no free space in the found leaf. + * We're gonna add a new leaf in the tree. + */ + if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) + flags = EXT4_MB_USE_ROOT_BLOCKS; + err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (err) + goto cleanup; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext)); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { +/* BUG_ON(newext->ee_block == nearex->ee_block); */ + if (nearex != EXT_LAST_EXTENT(eh)) { + len = EXT_MAX_EXTENT(eh) - nearex; + len = (len - 1) * sizeof(struct ext4_extent); + len = len < 0 ? 0 : len; + ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 2, nearex + 1, len); + } + path[depth].p_ext = nearex + 1; + } else { + BUG_ON(newext->ee_block == nearex->ee_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); + len = len < 0 ? 
0 : len; + ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } + + le16_add_cpu(&eh->eh_entries, 1); + nearex = path[depth].p_ext; + nearex->ee_block = newext->ee_block; + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); + nearex->ee_len = newext->ee_len; + +merge: + /* try to merge extents to the right */ + if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, nearex); + + /* try to merge extents to the left */ + + /* time to correct all indexes above */ + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto cleanup; + + err = ext4_ext_dirty(handle, inode, path + depth); + +cleanup: + if (npath) { + ext4_ext_drop_refs(npath); + kfree(npath); + } + ext4_ext_invalidate_cache(inode); + return err; +} + +static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCKS) { + num = last - block; + /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); + path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext4_ext_pblock(ex); + } + + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + err = -EIO; + break; + } + err = func(inode, next, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + +static void +ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, + __u32 len, ext4_fsblk_t start) +{ + struct ext4_ext_cache *cex; + BUG_ON(len == 0); + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + cex = &EXT4_I(inode)->i_cached_extent; + cex->ec_block = block; + cex->ec_len = len; + cex->ec_start = start; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t block) +{ + int depth = ext_depth(inode); + unsigned long len; + ext4_lblk_t lblock; + struct ext4_extent *ex; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; + ext_debug("cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; + len = le32_to_cpu(ex->ee_block) - block; + ext_debug("cache gap(before): %u [%u:%u]", + block, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; + lblock = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + + next = ext4_ext_next_allocated_block(path); + ext_debug("cache gap(after): [%u:%u] %u", + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + block); + BUG_ON(next == lblock); + len = next - lblock; + } else { + lblock = len = 0; + BUG(); + } + + ext_debug(" -> %u:%lu\n", lblock, len); + ext4_ext_put_in_cache(inode, lblock, len, 0); +} + +/* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * cache extent pointer. If the cached extent is a hole, + * this routine should be used instead of + * ext4_ext_in_cache if the calling function needs to + * know the size of the hole. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_cache *ex){ + struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; + int ret = 0; + + /* + * We borrow i_block_reservation_lock to protect i_cached_extent + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); + + /* has cache valid data? */ + if (cex->ec_len == 0) + goto errout; + + if (in_range(block, cex->ec_block, cex->ec_len)) { + memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ext_debug("%u cached by %u:%u:%llu\n", + block, + cex->ec_block, cex->ec_len, cex->ec_start); + ret = 1; + } +errout: + if (!ret) + sbi->extent_cache_misses++; + else + sbi->extent_cache_hits++; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return ret; +} + +/* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * extent pointer. 
+ * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ + struct ext4_ext_cache cex; + int ret = 0; + + if (ext4_ext_check_cache(inode, block, &cex)) { + ex->ee_block = cpu_to_le32(cex.ec_block); + ext4_ext_store_pblock(ex, cex.ec_start); + ex->ee_len = cpu_to_le16(cex.ec_len); + ret = 1; + } + + return ret; +} + + +/* + * ext4_ext_rm_idx: + * removes index from the index block. + * It's used in truncate case only, thus all requests are for + * last index in the block only. + */ +static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + ext4_fsblk_t leaf; + + /* free index block */ + path--; + leaf = ext4_idx_pblock(path->p_idx); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } + err = ext4_ext_get_access(handle, inode, path); + if (err) + return err; + le16_add_cpu(&path->p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path); + if (err) + return err; + ext_debug("index is empty, remove it, free block %llu\n", leaf); + ext4_free_blocks(handle, inode, NULL, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return err; +} + +/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. + */ +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, + struct ext4_ext_path *path) +{ + if (path) { + int depth = ext_depth(inode); + int ret = 0; + + /* probably there is space in leaf? */ + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } + } + + return ext4_chunk_trans_blocks(inode, nrblocks); +} + +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. 
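For example, with a tree of depth 2 this charges 2 * 2 = 4 index/leaf blocks when nrblocks form a single contiguous chunk, and 2 * 3 = 6 when they are discontiguous, the extra block per level covering the additional splits a discontiguous allocation can trigger.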
+ */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); + + if (chunk) + index = depth * 2; + else + index = depth * 3; + + return index; +} + +static int ext4_remove_blocks(handle_t *handle, struct inode *inode, + struct ext4_extent *ex, + ext4_lblk_t from, ext4_lblk_t to) +{ + unsigned short ee_len = ext4_ext_get_actual_len(ex); + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; +#ifdef EXTENTS_STATS + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); + } +#endif + if (from >= le32_to_cpu(ex->ee_block) + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* tail removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, NULL, start, num, flags); + } else if (from == le32_to_cpu(ex->ee_block) + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* head removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = to - from; + start = ext4_ext_pblock(ex); + + ext_debug("free first %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, 0, start, num, flags); + + } else { + printk(KERN_INFO "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); + } + return 0; +} + + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @start: The first block to remove + * @end: The last block to remove + */ +static int +ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t start, + ext4_lblk_t end) +{ + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits; + struct ext4_extent_header *eh; + ext4_lblk_t a, b, block; + unsigned num; + ext4_lblk_t ex_ee_block; + unsigned short ex_ee_len; + unsigned uninitialized = 0; + struct ext4_extent *ex; + struct ext4_map_blocks map; + + /* the header must be checked already in ext4_ext_remove_space() */ + ext_debug("truncate since %u in leaf\n", start); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + /* find where to start removing */ + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + else + uninitialized = 0; + + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + uninitialized, ex_ee_len); + path[depth].p_ext = ex; + + a = ex_ee_block > start ? ex_ee_block : start; + b = ex_ee_block+ex_ee_len - 1 < end ? 
+ ex_ee_block+ex_ee_len - 1 : end; + + ext_debug(" border %u:%u\n", a, b); + + /* If this extent is beyond the end of the hole, skip it */ + if (end <= ex_ee_block) { + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (a != ex_ee_block && + b != ex_ee_block + ex_ee_len - 1) { + /* + * If this is a truncate, then this condition should + * never happen because at least one of the end points + * needs to be on the edge of the extent. + */ + if (end == EXT_MAX_BLOCKS - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + block = 0; + num = 0; + err = -EIO; + goto out; + } + /* + * else this is a hole punch, so the extent needs to + * be split since neither edge of the hole is on the + * extent edge + */ + else{ + map.m_pblk = ext4_ext_pblock(ex); + map.m_lblk = ex_ee_block; + map.m_len = b - ex_ee_block; + + err = ext4_split_extent(handle, + inode, path, &map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (err < 0) + goto out; + + ex_ee_len = ext4_ext_get_actual_len(ex); + + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; + + /* Then remove tail of this extent */ + block = ex_ee_block; + num = a - block; + } + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + block = ex_ee_block; + num = a - block; + } else if (b != ex_ee_block + ex_ee_len - 1) { + /* remove head of the extent */ + block = b; + num = ex_ee_block + ex_ee_len - b; + + /* + * If this is a truncate, this condition + * should never happen + */ + if (end == EXT_MAX_BLOCKS - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + } else { + /* remove whole extent: excellent! */ + block = ex_ee_block; + num = 0; + if (a != ex_ee_block) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + + if (b != ex_ee_block + ex_ee_len - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + } + + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext4_remove_blocks(handle, inode, ex, a, b); + if (err) + goto out; + + if (num == 0) { + /* this extent is removed; mark slot entirely unused */ + ext4_ext_store_pblock(ex, 0); + } else if (block != ex_ee_block) { + /* + * If this was a head removal, then we need to update + * the physical block since it is now at a different + * location + */ + ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); + } + + ex->ee_block = cpu_to_le32(block); + ex->ee_len = cpu_to_le16(num); + /* + * Do not mark uninitialized if all the blocks in the + * extent have been removed. 
+ */ + if (uninitialized && num) + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCKS - 1) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } + + ext_debug("new extent: %u:%u:%llu\n", block, num, + ext4_ext_pblock(ex)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + } + + if (correct_index && eh->eh_entries) + err = ext4_ext_correct_indexes(handle, inode, path); + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext4_ext_rm_idx(handle, inode, path + depth); + +out: + return err; +} + +/* + * ext4_ext_more_to_rm: + * returns 1 if current index has to be freed (even partial) + */ +static int +ext4_ext_more_to_rm(struct ext4_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened, it wasn't partial, + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct super_block *sb = inode->i_sb; + int depth = ext_depth(inode); + struct ext4_ext_path *path; + handle_t *handle; + int i, err; + + ext_debug("truncate since %u\n", start); + + /* probably first extent we're gonna free will be last in block */ + handle = ext4_journal_start(inode, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +again: + ext4_ext_invalidate_cache(inode); + + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. 
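When an extent in the middle of a leaf is fully released during a hole punch, the remaining entries are scooted down with memmove() and the freed slot at the end is cleared. The same pattern in plain userspace C, with an invented struct, looks like this:

#include <string.h>

struct entry {
	unsigned int block;
	unsigned int len;
};

/*
 * Remove element i from a densely packed array of nr entries by moving the
 * later entries down one slot and clearing the now-unused last slot, the
 * same memmove()/memset() pair used for hole punching in ext4_ext_rm_leaf().
 */
static void remove_entry(struct entry *arr, unsigned int *nr, unsigned int i)
{
	memmove(&arr[i], &arr[i + 1], (*nr - i - 1) * sizeof(arr[0]));
	memset(&arr[*nr - 1], 0, sizeof(arr[0]));
	(*nr)--;
}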
+ */ + depth = ext_depth(inode); + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + err = -EIO; + goto out; + } + i = err = 0; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext4_ext_rm_leaf(handle, inode, path, + start, end); + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug("initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't been touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug("init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we were already here, see at next index */ + path[i].p_idx--; + } + + ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug("move to level %d (block %llu)\n", + i + 1, ext4_idx_pblock(path[i].p_idx)); + memset(path + i + 1, 0, sizeof(*path)); + bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); + if (!bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } + if (WARN_ON(i + 1 > depth)) { + err = -EIO; + break; + } + if (ext4_ext_check(inode, ext_block_hdr(bh), + depth - i - 1)) { + err = -EIO; + break; + } + path[i + 1].p_bh = bh; + + /* save actual number of indexes since this + * number is changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finished processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it; + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext4_ext_rm_idx(handle, inode, path + i); + } + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + ext_debug("return to level %d\n", i); + } + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root(inode, 0)); + err = ext4_ext_dirty(handle, inode, path); + } + } +out: + ext4_ext_drop_refs(path); + kfree(path); + if (err == -EAGAIN) + goto again; + ext4_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext4_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled"); +#ifdef AGGRESSIVE_TEST + printk(", aggressive tests"); +#endif +#ifdef CHECK_BINSEARCH + printk(", check binsearch"); +#endif +#ifdef EXTENTS_STATS + printk(", stats"); +#endif + printk("\n"); +#endif +#ifdef EXTENTS_STATS + spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); + EXT4_SB(sb)->s_ext_min = 1 << 30; + 
EXT4_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext4_ext_release(struct super_block *sb) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return; + +#ifdef EXTENTS_STATS + if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / sbi->s_ext_extents); + printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + int ret; + + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or uninit) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. + */ +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags) +{ + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. 
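ext4_split_extent_at() does its real work with a small piece of arithmetic: the second half of the split keeps the original extent's physical run, shifted by the number of logical blocks kept in the first half. A self-contained sketch of that arithmetic, with an invented struct, is below.

struct simple_extent {
	unsigned int lblk;		/* first logical block */
	unsigned long long pblk;	/* first physical block */
	unsigned int len;		/* length in blocks */
};

/*
 * Split one mapping at logical block 'split' (which must lie strictly
 * inside it). The right half starts at pblk + (split - lblk), which is
 * the "newblock" computed in ext4_split_extent_at().
 */
static void split_at(const struct simple_extent *ex, unsigned int split,
		     struct simple_extent *left, struct simple_extent *right)
{
	left->lblk = ex->lblk;
	left->pblk = ex->pblk;
	left->len = split - ex->lblk;

	right->lblk = split;
	right->pblk = ex->pblk + (split - ex->lblk);
	right->len = ex->len - left->len;
}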
+ */ + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNINIT1) + ext4_ext_mark_uninitialized(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex2); + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (upto three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags) +{ + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int err = 0; + int uninitialized; + int split_flag1, flags1; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + uninitialized = ext4_ext_is_uninitialized(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) + goto out; + } + + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) + return PTR_ERR(path); + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (split_flag & EXT4_EXT_MARK_UNINIT2) + split_flag1 |= EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk, split_flag1, flags); + if (err) + goto out; + } + + ext4_ext_show_leaf(inode, path); +out: + return err ? 
err : map->m_len; +} + +#define EXT4_EXT_ZERO_LEN 7 +/* + * This function is called by ext4_ext_map_blocks() if someone tries to write + * to an uninitialized extent. It may result in splitting the uninitialized + * extent into multiple extents (up to three - one initialized and two + * uninitialized). + * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + */ +static int ext4_ext_convert_to_initialized(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex; + ext4_lblk_t ee_block, eof_block; + unsigned int allocated, ee_len, depth; + int err = 0; + int split_flag = 0; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (map->m_lblk - ee_block); + + WARN_ON(map->m_lblk < ee_block); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, ex); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. 
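The first decision in ext4_ext_convert_to_initialized() is whether to bother splitting at all: an extent of at most 2 * EXT4_EXT_ZERO_LEN blocks is simply zeroed out and marked initialized, because writing a handful of zeroed blocks is cheaper than growing the extent tree. A compact restatement of that rule; the constant is copied from the code above, everything else is invented for illustration.

#define ZERO_LEN 7	/* EXT4_EXT_ZERO_LEN in the patch */

enum convert_action {
	ZEROOUT_WHOLE,	/* zero the whole extent, mark it initialized */
	SPLIT_EXTENT,	/* split around the written range instead */
};

static enum convert_action pick_action(unsigned int ee_len, int may_zeroout)
{
	if (may_zeroout && ee_len <= 2 * ZERO_LEN)
		return ZEROOUT_WHOLE;
	return SPLIT_EXTENT;
}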
+ */ + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (allocated > map->m_len) { + if (allocated <= EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if ((map->m_lblk - ee_block + map->m_len < + EXT4_EXT_ZERO_LEN) && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; + } + } + + allocated = ext4_split_extent(handle, inode, path, + &split_map, split_flag, 0); + if (allocated < 0) + err = allocated; + +out: + return err ? err : allocated; +} + +/* + * This function is called by ext4_ext_map_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitialized extent may result in splitting the uninitialized + * extent into multiple /initialized uninitialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the uninitialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. The uninitialized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + int flags) +{ + ext4_lblk_t eof_block; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len; + int split_flag = 0, depth; + + ext_debug("ext4_split_unwritten_extents: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + split_flag |= ee_block + ee_len <= eof_block ? 
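Both conversion paths derive eof_block by rounding i_size up to a block boundary; zeroout is only considered safe when the extent ends at or before that block. The rounding, spelled out as a standalone helper with an example (names are illustrative):

/*
 * Number of the first logical block past EOF: i_size rounded up to a
 * block boundary and shifted down, exactly the eof_block expression above.
 */
static unsigned long long size_to_eof_block(unsigned long long i_size,
					    unsigned int blkbits)
{
	unsigned long long blocksize = 1ULL << blkbits;

	return (i_size + blocksize - 1) >> blkbits;
}

/* e.g. size_to_eof_block(5000, 12) == 2 with 4 KiB blocks: blocks 0 and 1 hold data */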
EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= EXT4_EXT_MARK_UNINIT2; + + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); +} + +static int ext4_convert_unwritten_extents_endio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + struct ext4_extent_header *eh; + int depth; + int err = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + ext4_lblk_t lblk, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " + "EOFBLOCKS_FL set"); + return -EIO; + } + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. 
+ */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* get_block() before submit the IO, split the extent */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + ret = ext4_split_unwritten_extents(handle, inode, map, + path, flags); + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to conversion to written when IO is + * completed + */ + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + if (ext4_should_dioread_nolock(inode)) + map->m_flags |= EXT4_MAP_UNINIT; + goto out; + } + /* IO end_io complete, convert the filled extent to written */ + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } + +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + map->m_flags |= EXT4_MAP_NEW; + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > map->m_len) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; + } + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. 
So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 0); + +map_out: + map->m_flags |= EXT4_MAP_MAPPED; +out1: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_pblk = newblock; + map->m_len = allocated; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} + +/* + * Block allocation/map/preallocation routine for extents based files + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. + */ +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent newex, *ex; + ext4_fsblk_t newblock = 0; + int err = 0, depth, ret; + unsigned int allocated = 0; + unsigned int punched_out = 0; + unsigned int result = 0; + struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + struct ext4_map_blocks punch_map; + + ext_debug("blocks %u/%u requested for inode %lu\n", + map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + + /* check in cache */ + if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && + ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * block isn't allocated yet and + * user doesn't want to allocate it + */ + goto out2; + } + /* we should allocate requested block */ + } else { + /* block is already allocated */ + newblock = map->m_lblk + - le32_to_cpu(newex.ee_block) + + ext4_ext_pblock(&newex); + /* number of remaining blocks in the extent */ + allocated = ext4_ext_get_actual_len(&newex) - + (map->m_lblk - le32_to_cpu(newex.ee_block)); + goto out; + } + } + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty; + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_ext_find_extent() + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); + err = -EIO; + goto out2; + } + + ex = path[depth].p_ext; + if (ex) { + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len; + + /* + * Uninitialized extents are treated as holes, except that + * we split out initialized portions during a write. 
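When the lookup lands inside an existing extent, the physical block and the number of blocks still covered fall out of simple offset arithmetic. A standalone version of that hit test, with invented names:

struct extent_hit {
	unsigned long long pblk;	/* physical block backing lblk */
	unsigned int remaining;		/* blocks mapped from lblk onwards */
};

/*
 * If lblk falls inside [ee_block, ee_block + ee_len), return the physical
 * block for it and how much of the request the extent still covers,
 * mirroring the newblock/allocated computation in ext4_ext_map_blocks().
 */
static int lookup_in_extent(unsigned int lblk, unsigned int ee_block,
			    unsigned int ee_len, unsigned long long ee_start,
			    struct extent_hit *hit)
{
	if (lblk < ee_block || lblk >= ee_block + ee_len)
		return 0;
	hit->pblk = ee_start + (lblk - ee_block);
	hit->remaining = ee_len - (lblk - ee_block);
	return 1;
}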
+ */ + ee_len = ext4_ext_get_actual_len(ex); + /* if found extent covers block, simply return it */ + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; + /* number of remaining blocks in the extent */ + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); + + if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { + /* + * Do not put uninitialized extent + * in the cache + */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; + } + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; + } + + /* + * Punch out the map length, but only to the + * end of the extent + */ + punched_out = allocated < map->m_len ? + allocated : map->m_len; + + /* + * Sense extents need to be converted to + * uninitialized, they must fit in an + * uninitialized extent + */ + if (punched_out > EXT_UNINIT_MAX_LEN) + punched_out = EXT_UNINIT_MAX_LEN; + + punch_map.m_lblk = map->m_lblk; + punch_map.m_pblk = newblock; + punch_map.m_len = punched_out; + punch_map.m_flags = 0; + + /* Check to see if the extent needs to be split */ + if (punch_map.m_len != ee_len || + punch_map.m_lblk != ee_block) { + + ret = ext4_split_extent(handle, inode, + path, &punch_map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (ret < 0) { + err = ret; + goto out2; + } + /* + * find extent for the block at + * the start of the hole + */ + ext4_ext_drop_refs(path); + kfree(path); + + path = ext4_ext_find_extent(inode, + map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + + } + + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_remove_space(inode, map->m_lblk, + map->m_lblk + punched_out); + + goto out2; + } + } + + /* + * requested block isn't allocated yet; + * we couldn't try to create block if create flag is zero + */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * put just found gap into cache to speed up + * subsequent requests + */ + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + goto out2; + } + /* + * Okay, we need to do block allocation. + */ + + /* find neighbour allocated blocks */ + ar.lleft = map->m_lblk; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out2; + ar.lright = map->m_lblk; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + if (err) + goto out2; + + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is + * EXT_UNINIT_MAX_LEN. 
+ */ + if (map->m_len > EXT_INIT_MAX_LEN && + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNINIT_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + map->m_len = EXT_UNINIT_MAX_LEN; + + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_block = cpu_to_le32(map->m_lblk); + newex.ee_len = cpu_to_le16(map->m_len); + err = ext4_ext_check_overlap(inode, &newex, path); + if (err) + allocated = ext4_ext_get_actual_len(&newex); + else + allocated = map->m_len; + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; + ar.len = allocated; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out2; + ext_debug("allocate new block: goal %llu, found %llu/%u\n", + ar.goal, newblock, allocated); + + /* try to insert new extent into found leaf and return */ + ext4_ext_store_pblock(&newex, newblock); + newex.ee_len = cpu_to_le16(ar.len); + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ + ext4_ext_mark_uninitialized(&newex); + /* + * io_end structure was created for every IO write to an + * uninitialized extent. To avoid unnecessary conversion, + * here we flag the IO that really needs the conversion. + * For non asycn direct IO case, flag the inode state + * that we need to perform conversion when IO is done. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (ext4_should_dioread_nolock(inode)) + map->m_flags |= EXT4_MAP_UNINIT; + } + + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); + if (err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ + ext4_discard_preallocations(inode); + ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), fb_flags); + goto out2; + } + + /* previous routine could use block we allocated */ + newblock = ext4_ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 1); + + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. 
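The request length is clamped before allocation because an on-disk extent cannot describe arbitrarily long runs, and an uninitialized extent can hold one block fewer than an initialized one since the uninitialized flag lives in the top bit of ee_len. A sketch of the clamp; the numeric values are the usual ext4 limits and should be read as assumptions rather than taken from this patch.

#define INIT_MAX_LEN	32768U			/* assumed EXT_INIT_MAX_LEN, 2^15 blocks */
#define UNINIT_MAX_LEN	(INIT_MAX_LEN - 1)	/* assumed EXT_UNINIT_MAX_LEN */

static unsigned int clamp_map_len(unsigned int len, int want_uninit)
{
	unsigned int max = want_uninit ? UNINIT_MAX_LEN : INIT_MAX_LEN;

	return len > max ? max : len;
}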
+ */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); +out: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); + + result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? + punched_out : allocated; + + return err ? err : result; +} + +void ext4_ext_truncate(struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + ext4_lblk_t last_block; + handle_t *handle; + int err = 0; + + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + + /* + * probably first extent we're gonna free will be last in block + */ + err = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, err); + if (IS_ERR(handle)) + return; + + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + + ext4_discard_preallocations(inode); + + /* + * TODO: optimization is possible here. + * Probably we need not scan at all, + * because page truncation is enough. + */ + + /* we have to know where to truncate from in crash case */ + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous. + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +} + +static void ext4_falloc_update_inode(struct inode *inode, + int mode, loff_t new_size, int update_ctime) +{ + struct timespec now; + + if (update_ctime) { + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + /* + * Update only when preallocation was requested beyond + * the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + +} + +/* + * preallocate space for a file. 
This implements ext4's fallocate file + * operation, which gets called from sys_fallocate system call. + * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + handle_t *handle; + loff_t new_size; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(file, offset, len); + + trace_ext4_fallocate_enter(inode, offset, len, mode); + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - map.m_lblk; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) { + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; + } +retry: + while (ret >= 0 && ret < max_blocks) { + map.m_lblk = map.m_lblk + ret; + map.m_len = max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_NO_NORMALIZE); + if (ret <= 0) { +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_map_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, map.m_lblk, max_blocks); +#endif + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + blkbits) >> blkbits)) + new_size = offset + len; + else + new_size = (map.m_lblk + ret) << blkbits; + + ext4_falloc_update_inode(inode, mode, new_size, + (map.m_flags & EXT4_MAP_NEW)); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, + ret > 0 ? ret2 : ret); + return ret > 0 ? ret2 : ret; +} + +/* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. 
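From userspace, the two modes handled by ext4_fallocate() are reached through the fallocate(2) system call. The snippet below preallocates unwritten blocks past EOF and then punches a hole; it is an illustration of the interface only, error handling is trimmed, and the path and sizes are arbitrary.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int fallocate_demo(const char *path)
{
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return -1;

	/* reserve 1 MiB of unwritten blocks without changing i_size */
	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);

	/* punch a 128 KiB hole starting at 256 KiB */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		  256 << 10, 128 << 10);

	close(fd);
	return 0;
}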
+ */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len) +{ + handle_t *handle; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_IO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_map_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, map.m_lblk, map.m_len); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} + +/* + * Callback function called for each extent to gather FIEMAP information. + */ +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; + + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_start == 0) { + /* + * No extent in extent-tree contains block @newex->ec_start, + * then the block may stay in 1)a hole or 2)delayed-extent. + * + * Holes or delayed-extents are processed as follows. + * 1. lookup dirty pages with specified range in pagecache. + * If no page is got, then there is no delayed-extent and + * return with EXT_CONTINUE. + * 2. find the 1st mapped buffer, + * 3. check if the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed-extent, + * then return. + * 4. a delayed-extent is found, the extent will be collected. + */ + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + pgoff_t start_index = 0; + struct page **pages = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + offset = logical >> PAGE_SHIFT; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } + index = 0; + +next_page: + /* Try to find the 1st mapped buffer. */ + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[index])) + goto out; + head = page_buffers(pages[index]); + if (!head) + goto out; + + index++; + bh = head; + do { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. 
+ */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; + /* get the 1st mapped buffer. */ + goto found_mapped_buffer; + } + + bh = bh->b_this_page; + end++; + } while (bh != head); + + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. + */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; + } else { + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; + index = 1; + start_index = 0; + } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + + if (pages[index]->index != + pages[start_index]->index + index + - start_index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. */ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; + } + + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + if (next == EXT_MAX_BLOCKS) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (ret < 0) + return ret; + if (ret == 1) + return EXT_BREAK; + return EXT_CONTINUE; +} + +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? 
*/ + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +/* + * ext4_ext_punch_hole + * + * Punches a hole of "length" bytes in a file starting + * at byte "offset" + * + * @inode: The inode of the file to punch a hole in + * @offset: The starting byte offset of the hole + * @length: The length of the hole + * + * Returns the number of blocks removed or negative on err + */ +int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_ext_cache cache_ex; + ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + struct address_space *mapping = inode->i_mapping; + struct ext4_map_blocks map; + handle_t *handle; + loff_t first_block_offset, last_block_offset, block_len; + loff_t first_page, last_page, first_page_offset, last_page_offset; + int ret, credits, blocks_released, err = 0; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); + last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + first_page_offset == 0 ? 0 : first_page_offset-1, + last_page_offset); + + if (err) + return err; + } + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(mapping, first_page_offset, + last_page_offset-1); + } + + /* finish any pending end_io work */ + ext4_flush_completed_IO(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err) + goto out; + + /* + * Now we need to zero out the un block aligned data. 
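Only whole filesystem blocks inside the requested range can actually be freed; the partial blocks at either end are zeroed in place instead, which is what the first_block/last_block rounding above sets up. The rounding on its own, with invented names:

/*
 * first_block rounds the start of the hole up to a block boundary,
 * last_block rounds the end down. If first_block >= last_block, the hole
 * lies entirely inside one block and only page-level zeroing is needed.
 */
static void hole_block_range(unsigned long long offset,
			     unsigned long long length,
			     unsigned int blkbits,
			     unsigned long long *first_block,
			     unsigned long long *last_block)
{
	unsigned long long blocksize = 1ULL << blkbits;

	*first_block = (offset + blocksize - 1) >> blkbits;
	*last_block = (offset + length) >> blkbits;
}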
+ * If the file is smaller than a block, just + * zero out the middle + */ + if (first_block > last_block) + ext4_block_zero_page_range(handle, mapping, offset, length); + else { + /* zero out the head of the hole before the first block */ + block_len = first_block_offset - offset; + if (block_len > 0) + ext4_block_zero_page_range(handle, mapping, + offset, block_len); + + /* zero out the tail of the hole after the last block */ + block_len = offset + length - last_block_offset; + if (block_len > 0) { + ext4_block_zero_page_range(handle, mapping, + last_block_offset, block_len); + } + } + + /* If there are no blocks to remove, return now */ + if (first_block >= last_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + + /* + * Loop over all the blocks and identify blocks + * that need to be punched out + */ + iblock = first_block; + blocks_released = 0; + while (iblock < last_block) { + max_blocks = last_block - iblock; + num_blocks = 1; + memset(&map, 0, sizeof(map)); + map.m_lblk = iblock; + map.m_len = max_blocks; + ret = ext4_ext_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (ret > 0) { + blocks_released += ret; + num_blocks = ret; + } else if (ret == 0) { + /* + * If map blocks could not find the block, + * then it is in a hole. If the hole was + * not already cached, then map blocks should + * put it in the cache. So we can get the hole + * out of the cache + */ + memset(&cache_ex, 0, sizeof(cache_ex)); + if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && + !cache_ex.ec_start) { + + /* The hole is cached */ + num_blocks = cache_ex.ec_block + + cache_ex.ec_len - iblock; + + } else { + /* The block could not be identified */ + err = -EIO; + break; + } + } else { + /* Map blocks error */ + err = ret; + break; + } + + if (num_blocks == 0) { + /* This condition should never happen */ + ext_debug("Block lookup failed"); + err = -EIO; + break; + } + + iblock += num_blocks; + } + + if (blocks_released > 0) { + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + ext4_orphan_del(handle, inode); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + return err; +} +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + ext4_lblk_t len_blks; + __u64 last_blk; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. 
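The extent data gathered by ext4_fiemap() is what userspace sees through the FS_IOC_FIEMAP ioctl. A minimal caller that asks for the first extent of a file is shown below; real callers size the extent array properly and loop until FIEMAP_EXTENT_LAST, and the error handling here is deliberately thin.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int print_first_extent(const char *path)
{
	char buf[sizeof(struct fiemap) + sizeof(struct fiemap_extent)];
	struct fiemap *fm = (struct fiemap *)buf;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	memset(buf, 0, sizeof(buf));
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 1;	/* room for a single extent */

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
		printf("logical %llu physical %llu len %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[0].fe_logical,
		       (unsigned long long)fm->fm_extents[0].fe_physical,
		       (unsigned long long)fm->fm_extents[0].fe_length,
		       fm->fm_extents[0].fe_flags);
	close(fd);
	return 0;
}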
+ */ + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + } + + return error; +} diff --git a/fs/ext4/file.c b/fs/ext4/file.c new file mode 100644 index 00000000..2c097232 --- /dev/null +++ b/fs/ext4/file.c @@ -0,0 +1,286 @@ +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) + { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +static void ext4_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + size_t count = iov_length(iov, nr_segs); + loff_t final_size = pos + count; + + if (pos >= inode->i_size) + return 0; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + + return 0; +} + +static ssize_t +ext4_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int unaligned_aio = 0; + int ret; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. 
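The alignment test in ext4_unaligned_aio() reduces to masking both ends of the byte range with the block size, with writes at or past EOF exempted. Stated as a standalone predicate (names invented, block size assumed to be a power of two):

static int write_is_unaligned(unsigned long long pos, unsigned long long count,
			      unsigned long long i_size, unsigned int blocksize)
{
	unsigned long long mask = blocksize - 1;

	if (pos >= i_size)	/* purely extending writes need no serialization */
		return 0;
	return (pos & mask) || ((pos + count) & mask);
}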
+ */ + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + size_t length = iov_length(iov, nr_segs); + + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) + return -EFBIG; + + if (pos + length > sbi->s_bitmap_maxbytes) { + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, + sbi->s_bitmap_maxbytes - pos); + } + } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && + !is_sync_kiocb(iocb))) { + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); + } + + /* Unaligned direct AIO must be serialized; see comment above */ + if (unaligned_aio) { + static unsigned long unaligned_warn_time; + + /* Warn about this once per day */ + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) + ext4_msg(inode->i_sb, KERN_WARNING, + "Unaligned AIO/DIO on inode %ld by %s; " + "performance will be poor.", + inode->i_ino, current->comm); + mutex_lock(ext4_aio_mutex(inode)); + ext4_aiodio_wait(inode); + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (unaligned_aio) + mutex_unlock(ext4_aio_mutex(inode)); + + return ret; +} + +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + if (!IS_ERR(cp)) { + memcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_mark_super_dirty(sb); + } + } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } + return dquot_file_open(inode, filp); +} + +/* + * ext4_llseek() copied from generic_file_llseek() to handle both + * block-mapped and extent-mapped maxbytes values. This should + * otherwise be identical with generic_file_llseek(). 
+ */ +loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + mutex_unlock(&inode->i_mutex); + return file->f_pos; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > maxbytes) { + mutex_unlock(&inode->i_mutex); + return -EINVAL; + } + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + + return offset; +} + +const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext4_file_write, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, +}; + +const struct inode_operations ext4_file_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext4_check_acl, + .fiemap = ext4_fiemap, +}; + diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c new file mode 100644 index 00000000..ce66d2fe --- /dev/null +++ b/fs/ext4/fsync.c @@ -0,0 +1,225 @@ +/* + * linux/fs/ext4/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" + +#include + +static void dump_completed_IO(struct inode * inode) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); +#endif +} + +/* + * This function is called from ext4_sync_file(). + * + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success. + */ +extern int ext4_flush_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret = 0; + int ret2 = 0; + + if (list_empty(&ei->i_completed_io_list)) + return ret; + + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_io_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * about to flush the work corresponding to this io structure. + * It will be upset if it founds the io structure related + * to the work-to-be schedule is freed. + * + * Thus we need to keep the io structure still valid here after + * conversion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + */ + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + return (ret2 < 0) ? ret2 : 0; +} + +/* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. 
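The application-level analogue of this rule is the familiar "fsync the file, then fsync its directory" pattern. The sketch below (standalone userspace C with made-up paths, not part of this patch) shows what ext4_sync_parent() spares no-journal users from having to do by hand.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * After creating and fsync()ing a new file, also fsync() the containing
 * directory so the directory entry itself is durable across a crash.
 * Paths are purely illustrative.
 */
static int create_durably(const char *dir, const char *name, const char *data)
{
    char path[4096];
    snprintf(path, sizeof(path), "%s/%s", dir, name);

    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return -1;
    if (write(fd, data, strlen(data)) < 0 || fsync(fd) < 0) {
        close(fd);
        return -1;
    }
    close(fd);

    /* Now make the new directory entry itself durable. */
    int dfd = open(dir, O_RDONLY | O_DIRECTORY);
    if (dfd < 0)
        return -1;
    int ret = fsync(dfd);
    close(dfd);
    return ret;
}

int main(void)
{
    return create_durably("/tmp", "example.txt", "hello\n") == 0 ? 0 : 1;
}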
+ */ +static int ext4_sync_parent(struct inode *inode) +{ + struct writeback_control wbc; + struct dentry *dentry = NULL; + int ret = 0; + + while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + break; + inode = dentry->d_parent->d_inode; + ret = sync_mapping_buffers(inode->i_mapping); + if (ret) + break; + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.nr_to_write = 0; /* only write out the inode */ + ret = sync_inode(inode, &wbc); + if (ret) + break; + } + return ret; +} + +/* + * akpm: A new design for ext4_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + * + * i_mutex lock is held when entering and exiting this function + */ + +int ext4_sync_file(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; + + J_ASSERT(ext4_journal_current_handle() == NULL); + + trace_ext4_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + + ret = ext4_flush_completed_IO(inode); + if (ret < 0) + goto out; + + if (!journal) { + ret = generic_file_fsync(file, datasync); + if (!ret && !list_empty(&inode->i_dentry)) + ret = ext4_sync_parent(inode); + goto out; + } + + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for proper transaction to + * commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext4_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } + + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + jbd2_log_start_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); + if (needs_barrier) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + out: + trace_ext4_sync_file_exit(inode, ret); + return ret; +} diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c new file mode 100644 index 00000000..ac8f168c --- /dev/null +++ b/fs/ext4/hash.c @@ -0,0 +1,208 @@ +/* + * linux/fs/ext4/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. 
+ */ + +#include +#include +#include +#include "ext4.h" + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
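One detail worth illustrating is why both signed and unsigned variants of the legacy hash are kept below: the original implementation used plain char, whose signedness is architecture dependent, so names containing bytes >= 0x80 hashed differently on different machines. The standalone sketch below (userspace C, not part of this patch) restates dx_hack_hash with an explicit signedness switch to show the divergence.

#include <stdio.h>
#include <stdint.h>

/*
 * For pure-ASCII names the two variants agree; a byte >= 0x80 makes
 * them diverge, which is why DX_HASH_LEGACY and DX_HASH_LEGACY_UNSIGNED
 * both have to be supported for existing on-disk indexes.
 */
static uint32_t dx_hack_hash(const char *name, int len, int treat_unsigned)
{
    uint32_t hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

    while (len--) {
        int c = treat_unsigned ? (unsigned char)*name : (signed char)*name;
        name++;
        hash = hash1 + (hash0 ^ (c * 7152373));
        if (hash & 0x80000000)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

int main(void)
{
    const char ascii[] = "linux";
    const char high[]  = "caf\xe9";   /* contains the byte 0xe9 */

    printf("ascii: signed=%08x unsigned=%08x\n",
           (unsigned)dx_hack_hash(ascii, 5, 0), (unsigned)dx_hack_hash(ascii, 5, 1));
    printf("high : signed=%08x unsigned=%08x\n",
           (unsigned)dx_hack_hash(high, 4, 0), (unsigned)dx_hack_hash(high, 4, 1));
    return 0;
}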
+ */ +int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT4_HTREE_EOF << 1)) + hash = (EXT4_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c new file mode 100644 index 00000000..412469b2 --- /dev/null +++ b/fs/ext4/ialloc.c @@ -0,0 +1,1338 @@ +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +#include + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. 
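A minimal userspace restatement of that two-phase padding, with non-atomic bit setting and an arbitrary 64-bit example bitmap, may make the split clearer; it is illustrative only and not part of this patch.

#include <stdio.h>
#include <string.h>

/*
 * Set individual bits up to the next byte boundary, then memset whole
 * 0xff bytes for the remainder. In the kernel the per-bit phase uses
 * ext4_set_bit() so it stays safe against concurrent bit operations in
 * the same byte.
 */
static void mark_bitmap_end(int start_bit, int end_bit, unsigned char *bitmap)
{
    int i;

    if (start_bit >= end_bit)
        return;
    for (i = start_bit; i < ((start_bit + 7) & ~7); i++)
        bitmap[i >> 3] |= 1u << (i & 7);
    if (i < end_bit)
        memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

int main(void)
{
    unsigned char bitmap[8] = { 0 };

    /* Mark bits 20..63 as in use: bits 20-23 one by one, bytes 3..7 by memset. */
    mark_bitmap_end(20, 64, bitmap);
    for (int i = 0; i < 8; i++)
        printf("%02x ", bitmap[i]);
    printf("\n");   /* expected: 00 00 f0 ff ff ff ff ff */
    return 0;
}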
+ */ +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* Initializes an uninitialized inode bitmap */ +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks and inodes use to prevent + * allocation, essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + + return EXT4_INODES_PER_GROUP(sb); +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + trace_ext4_load_inode_bitmap(sb, block_group); + set_bitmap_uptodate(bh); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. 
Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err, count, cleared; + + if (atomic_read(&inode->i_count) > 1) { + printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk(KERN_ERR "ext4_free_inode: inode on " + "nonexistent device\n"); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + dquot_initialize(inode); + ext4_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + ext4_clear_inode(inode); + + es = EXT4_SB(sb)->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bh2); + } + ext4_lock_group(sb, block_group); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + ext4_mark_super_dirty(sb); + } else + ext4_error(sb, "bit already cleared for inode %lu", ino); + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. 
If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned int freei, avefreei; + struct ext4_group_desc *desc, *best_desc = NULL; + ext4_group_t group; + int ret = -1; + + freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext4_get_group_desc(sb, group, NULL); + if (!desc || !ext4_free_inodes_count(sb, desc)) + continue; + if (ext4_free_inodes_count(sb, desc) < avefreei) + continue; + if (!best_desc || + (ext4_free_blks_count(sb, desc) > + ext4_free_blks_count(sb, best_desc))) { + *best_group = group; + best_desc = desc; + ret = 0; + } + } + return ret; +} + +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = ext4_get_groups_count(sb); + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (atomic_read(&flex_group[best_flex].free_inodes) && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + (atomic_read(&flex_group[i].free_inodes))) { + best_flex = i; + goto found_flexbg; + } + + if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || + ((atomic_read(&flex_group[i].free_blocks) > + atomic_read(&flex_group[best_flex].free_blocks)) && + atomic_read(&flex_group[i].free_inodes))) + best_flex = i; + } + + if (!atomic_read(&flex_group[best_flex].free_inodes) || + !atomic_read(&flex_group[best_flex].free_blocks)) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (ext4_free_inodes_count(sb, desc)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + +struct orlov_stats { + __u32 free_inodes; + __u32 free_blocks; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. 
If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; + + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_blocks = 0; + stats->used_dirs = 0; + } +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group, int mode, + const struct qstr *qstr) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t real_ngroups = ext4_get_groups_count(sb); + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext4_fsblk_t freeb, avefreeb; + unsigned int ndirs; + int max_dirs, min_inodes; + ext4_grpblk_t min_blocks; + ext4_group_t i, grp, g, ngroups; + struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb; + do_div(avefreeb, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + get_random_bytes(&grp, sizeof(grp)); + parent_group = (unsigned)grp % ngroups; + for (i = 0; i < ngroups; i++) { + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) + continue; + if (stats.used_dirs >= best_ndir) + continue; + if (stats.free_inodes < avefreei) + continue; + if (stats.free_blocks < avefreeb) + continue; + grp = g; + ret = 0; + best_ndir = stats.used_dirs; + } + if (ret) + goto 
fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } + + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) + continue; + if (stats.free_inodes < min_inodes) + continue; + if (stats.free_blocks < min_blocks) + continue; + goto found_flex_bg; + } + +fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_inodes_count(sb, desc) >= avefreei) { + *group = grp; + return 0; + } + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback_retry; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group, int mode) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. 
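Two of the search orders used by find_group_other() are easier to see with concrete numbers. The sketch below (standalone C, example values only, and omitting the i_ino mixing the real code applies before its quadratic probe) prints the flex-group window that is scanned first and the power-of-two probe sequence used as a later fallback.

#include <stdio.h>

/*
 * Example values: flex_size = 16 (s_log_groups_per_flex = 4),
 * 128 block groups, parent directory in group 37.
 */
int main(void)
{
    unsigned int flex_size = 16, ngroups = 128, parent_group = 37;

    /* Mask off the low bits to find the start of the parent's flex group. */
    unsigned int first = parent_group & ~(flex_size - 1);
    printf("flex window: groups %u..%u\n", first, first + flex_size - 1);

    /* Quadratic probing: strides 1, 2, 4, ... from the starting group. */
    unsigned int group = parent_group;
    printf("quadratic probes:");
    for (unsigned int i = 1; i < ngroups; i <<= 1) {
        group = (group + i) % ngroups;
        printf(" %u", group);
    }
    printf("\n");   /* 38 40 44 52 68 100 36 (wraps mod 128) */
    return 0;
}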
+ */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) + return 0; + } + + return -1; +} + +/* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the groups's ext4_group_lock + * and clear the uninit flag. The inode bitmap update + * and group desc uninit flag clear should be done + * after holding ext4_group_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0, count; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + /* + * We have to be sure that new inode allocation does not race with + * inode table initialization, because otherwise we may end up + * allocating and writing new inode right before sb_issue_zeroout + * takes place and overwriting our new inode with zeroes. So we + * take alloc_sem to prevent it. + */ + down_read(&grp->alloc_sem); + ext4_lock_group(sb, group); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); + ext4_error(sb, "reserved inode or inode > inodes count - " + "block_group = %u, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. 
+ * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + } + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); + if (S_ISDIR(mode)) { + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); + return retval; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, + const struct qstr *qstr, __u32 goal) +{ + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err = 0; + struct inode *ret; + ext4_group_t i; + int free = 0; + static int once = 1; + ext4_group_t flex_group; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + sbi = EXT4_SB(sb); + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { + ret2 = find_group_flex(sb, dir, &group); + if (ret2 == -1) { + ret2 = find_group_other(sb, dir, &group, mode); + if (ret2 == 0 && once) { + once = 0; + printk(KERN_NOTICE "ext4: find_group_flex " + "failed, fallback succeeded dir %lu\n", + dir->i_ino); + } + } + goto got_group; + } + + if (S_ISDIR(mode)) { + if (test_opt(sb, OLDALLOC)) + ret2 = find_group_dir(sb, dir, &group); + else + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + } else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; + err = -ENOSPC; + if (ret2 == -1) + goto out; + + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto fail; + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) + goto fail; + +repeat_in_this_group: + ino = ext4_find_next_zero_bit((unsigned long *) + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + + if (ino < 
EXT4_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + group_desc_bh); + if (err) + goto fail; + if (!ext4_claim_inode(sb, inode_bitmap_bh, + ino, group, mode)) { + /* we won it */ + BUFFER_TRACE(inode_bitmap_bh, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, + NULL, + inode_bitmap_bh); + if (err) + goto fail; + /* zero bit is inode number 1*/ + ino++; + goto got; + } + /* we lost it */ + ext4_handle_release_buffer(handle, inode_bitmap_bh); + ext4_handle_release_buffer(handle, group_desc_bh); + + if (++ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == ngroups) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + /* We may have to initialize the block bitmap if it isn't already */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); + if (err) { + brelse(block_bitmap_bh); + goto fail; + } + + free = 0; + ext4_lock_group(sb, group); + /* recheck and clear flag under lock if we still need to */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + free = ext4_free_blocks_after_init(sb, group, gdp); + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_blks_set(sb, gdp, free); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, + gdp); + } + ext4_unlock_group(sb, group); + + /* Don't need to dirty bitmap block if we didn't change it */ + if (free) { + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, + NULL, block_bitmap_bh); + } + + brelse(block_bitmap_bh); + if (err) + goto fail; + } + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) + goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + ext4_mark_super_dirty(sb); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* + * Don't inherit extent flag from directory, amongst others. 
We set + * extent flag on newly created directory and file only if -o extent + * mount option is specified + */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + + ret = inode; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_ext4_allocate_inode(inode, dir, mode); + goto really_out; +fail: + ext4_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(inode_bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + unlock_new_inode(inode); + iput(inode); + brelse(inode_bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. 
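A small worked example of the inode-number arithmetic used throughout this file may help: ext4_new_inode() above composes the 1-based i_ino from a zero-based bitmap bit and its group, while ext4_free_inode() and ext4_orphan_get() invert it. The snippet below is standalone C with an example group size, not part of this patch.

#include <stdio.h>

int main(void)
{
    const unsigned long inodes_per_group = 8192;   /* example value */
    unsigned long group = 2, bit = 3614;           /* zero-based bit in bitmap */

    /* Compose the on-disk inode number: bit 0 of group 0 is inode 1. */
    unsigned long ino = (bit + 1) + group * inodes_per_group;
    printf("group %lu, bit %lu -> ino %lu\n", group, bit, ino);

    /* ...and back again, as done when freeing or loading an orphan. */
    printf("ino %lu -> group %lu, bit %lu\n", ino,
           (ino - 1) / inodes_per_group, (ino - 1) % inodes_per_group);
    return 0;
}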
+ */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += ext4_used_dirs_count(sb, gdp); + } + return count; +} + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_claim_inode until we are finished. 
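The arithmetic that decides how much of the table still needs zeroing is easy to follow with example numbers. The sketch below (standalone C, made-up geometry, mirroring the used_blks calculation in the function that follows) shows a group where most inodes are still unused.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/*
 * Example geometry: 8192 inodes per group, 256-byte inodes, 4KiB blocks.
 * Only the itable blocks past the ones that may already hold live
 * inodes get zeroed.
 */
int main(void)
{
    unsigned int inodes_per_group = 8192;
    unsigned int inode_size = 256, block_size = 4096;
    unsigned int inodes_per_block = block_size / inode_size;         /* 16  */
    unsigned int itb_per_group = inodes_per_group / inodes_per_block; /* 512 */

    unsigned int itable_unused = 8000;   /* as read from the group descriptor */
    unsigned int used_blks =
        DIV_ROUND_UP(inodes_per_group - itable_unused, inodes_per_block);

    printf("skip %u used itable blocks, zero the remaining %u\n",
           used_blks, itb_per_group - used_blks);   /* skip 12, zero 500 */
    return 0;
}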
+ */ +extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u\n" + "Used itable blocks: %d" + "itable unused count: %u\n", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c new file mode 100644 index 00000000..c1e6a726 --- /dev/null +++ b/fs/ext4/inode.c @@ -0,0 +1,5957 @@ +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +#include + +#define MPAGE_DA_EXTENT_TAIL 0x01 + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext4_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT4_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. 
+ * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) +{ + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + jbd_debug(2, "restarting handle %p\n", handle); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, nblocks); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + return ret; +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext4_evict_inode(struct inode *inode) +{ + handle_t *handle; + int err; + + trace_ext4_evict_inode(inode); + + ext4_ioend_wait(inode); + + if (inode->i_nlink) { + truncate_inode_pages(&inode->i_data, 0); + goto no_delete; + } + + if (!is_bad_inode(inode)) + dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext4_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } + if (inode->i_blocks) + ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (!ext4_handle_has_enough_credits(handle, 3)) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + goto no_delete; + } + } + + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. 
+ * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + return; +no_delete: + ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
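+ *
+ * (Worked example, assuming 4 KiB blocks: EXT4_ADDR_PER_BLOCK = 1024 and
+ * EXT4_NDIR_BLOCKS = 12, so blocks 0-11 are direct, 12-1035 go through the
+ * single indirect block and 1036-1049611 through the double indirect one.
+ * For i_block = 5000: 5000 - 12 = 4988 >= 1024, then 4988 - 1024 = 3964,
+ * which is < 1024^2, so the returned path is
+ * { EXT4_DIND_BLOCK, 3964 >> 10 = 3, 3964 & 1023 = 892 } with depth 3.)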
+ */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +static int __ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, + __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + + +#define ext4_check_indirect_blockref(inode, bh) \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_check_inode_blockref(inode) \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
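+ *
+ * (Illustrative shape of the result for a depth-3, double-indirect lookup:
+ * chain[0].p points into EXT4_I(inode)->i_data and chain[0].bh is NULL,
+ * chain[1] and chain[2] reference the two indirect blocks through their
+ * buffer_heads, and the data block number is le32_to_cpu(chain[2].key).)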
+ * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. 
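+ * (Illustrative example of the ext4_find_near() colouring used here,
+ * assuming 32768 blocks per group and delalloc disabled: a caller with
+ * pid 1234 gets colour = (1234 % 16) * (32768 / 16) = 2 * 2048 = 4096
+ * blocks past the start of the chosen group.)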
+ * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. 
That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). 
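+ * (Illustrative result for indirect_blks = 2: branch[0].key holds the
+ * block number that ext4_splice_branch() will later store where the old
+ * chain ended, branch[1] and branch[2] carry the two freshly zeroed
+ * indirect blocks via their buffer_heads, and the deepest one is seeded
+ * with the new direct block numbers before being marked dirty.)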
Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
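+ *
+ * (Illustrative note: the splice itself is the single store
+ * "*where->p = where->key"; when num == 0 and blks > 1 the remaining
+ * direct block numbers are filled in right after it, so a 4-block
+ * allocation spliced into the inode writes where->key plus the three
+ * following consecutive entries.)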
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. 
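+ *
+ * (Illustrative example of the return convention: for map->m_lblk = 100
+ * and map->m_len = 8 on a fully mapped file, the lookup walks the chain
+ * once and keeps extending the mapping while the on-disk block numbers
+ * stay contiguous, so it can return 8 with EXT4_MAP_MAPPED set; a hole
+ * looked up with create == 0 returns 0 instead.)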
+ */ +static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. 
--sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) +{ + return &EXT4_I(inode)->i_reserved_quota; +} +#endif + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, + sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a block located at @lblock + */ +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ext_calc_metadata_amount(inode, lblock); + + return ext4_indirect_calc_metadata_amount(inode, lblock); +} + +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks\n", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + used + ei->i_allocated_meta_blocks); + ei->i_allocated_meta_blocks = 0; + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. 
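+ *
+ * (Illustrative accounting sketch: if 8 data blocks were reserved and all
+ * 8 have now been written, i_reserved_data_blocks drops to 0, the unused
+ * metadata reservation is subtracted back out of s_dirtyblocks_counter,
+ * and the 8 blocks are either claimed against quota (quota_claim != 0) or
+ * have their quota reservation released in the fallocate writeback case.)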
+ */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + /* Update quota subsystem for data blocks */ + if (quota_claim) + dquot_claim_block(inode, used); + else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not re-claim the quota for fallocated blocks. + */ + dquot_release_reservation_block(inode, used); + } + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); +} + +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, + struct ext4_map_blocks *map) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); + return -EIO; + } + return 0; +} + +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + +/* + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. + */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) { + done = 1; + break; + } + } + pagevec_release(&pvec); + } + return num; +} + +/* + * The ext4_map_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. + * + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocate. + * if create==0 and the blocks are pre-allocated and uninitialized block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that casem, buffer head is unmapped + * + * It returns the error in case of allocation failure. 
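+ *
+ * (Typical caller sketch, for illustration only:
+ *
+ *      struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
+ *      ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ *
+ * on ret > 0 the first physical block is map.m_pblk and map.m_flags has
+ * EXT4_MAP_MAPPED set, plus EXT4_MAP_NEW if blocks were just allocated.)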
+ */ +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + int retval; + + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, 0); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, 0); + } + up_read((&EXT4_I(inode)->i_data_sem)); + + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + + /* If it is only a block(s) look up */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) + return retval; + + /* + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns th create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + return retval; + + /* + * When we call get_blocks without the create flag, the + * BH_Unwritten flag could have gotten set if the blocks + * requested were part of a uninitialized extent. We need to + * clear this flag now that we are committed to convert all or + * part of the uninitialized extent to be an initialized + * extent. This is because we need to avoid the combination + * of BH_Unwritten and BH_Mapped flags being simultaneously + * set on the buffer_head. + */ + map->m_flags &= ~EXT4_MAP_UNWRITTEN; + + /* + * New blocks allocate and/or writing to uninitialized extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_blocks() + * with create == 1 flag. + */ + down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* + * We need to check for EXT4 here because migrate + * could have changed the inode type in between + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + return retval; +} + +/* Maximum number of blocks we map for direct IO at once. 
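+ * With 4 KiB blocks this caps a single mapping call at 4096 * 4 KiB =
+ * 16 MiB, which in turn bounds the journal credits requested per chunk
+ * through ext4_chunk_trans_blocks().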
*/ +#define DIO_MAX_BLOCKS 4096 + +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) +{ + handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; + int ret = 0, started = 0; + int dio_credits; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + if (flags && !handle) { + /* Direct IO write... */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + started = 1; + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret > 0) { + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + bh->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } + if (started) + ext4_journal_stop(handle); + return ret; +} + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *errp) +{ + struct ext4_map_blocks map; + struct buffer_head *bh; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + if (err < 0) + *errp = err; + if (err <= 0) + return NULL; + *errp = 0; + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!bh) { + *errp = -EIO; + return NULL; + } + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. 
+ */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *err) +{ + struct buffer_head *bh; + + bh = ext4_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the commit_write(). So doing the jbd2_journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext4_writepage() -> + * block_write_full_page(). In that case, we *know* that ext4_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext4 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that jbd2_journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_write_begin() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_write_begin() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. 
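+ *
+ * (Illustrative use: in data=journal mode ext4_write_begin() below runs
+ * this callback over the page's buffers in [from, to) via
+ * walk_page_buffers(), so every buffer touched by the write has journal
+ * write access before the data is copied in.)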
+ */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; +} + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret, needed_blocks; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + + trace_ext4_write_begin(inode, pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, pos, len, ext4_get_block_write); + else + ret = __block_write_begin(page, pos, len, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + /* + * __block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before + * truncate finishes + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. 
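+ *
+ * (Illustrative flow note: after this cleanup an -ENOSPC result jumps
+ * back to the "retry:" label and starts a fresh handle for as long as
+ * ext4_should_retry_alloc() keeps returning true.)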
+ */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + +static int ext4_generic_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i_size_changed = 0; + struct inode *inode = mapping->host; + handle_t *handle = ext4_journal_current_handle(); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos + copied > inode->i_size) { + i_size_write(inode, pos + copied); + i_size_changed = 1; + } + + if (pos + copied > EXT4_I(inode)->i_disksize) { + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_update_i_disksize(inode, (pos + copied)); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + + return copied; +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + trace_ext4_ordered_write_end(inode, pos, len, copied); + ret = ext4_jbd2_file_inode(handle, inode); + + if (ret == 0) { + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + if (ret2 < 0) + ret = ret2; + } + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + + return ret ? 
ret : copied; +} + +static int ext4_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + trace_ext4_writeback_write_end(inode, pos, len, copied); + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + if (ret2 < 0) + ret = ret2; + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; +} + +static int ext4_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + loff_t new_i_size; + + trace_ext4_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + BUG_ON(!ext4_handle_valid(handle)); + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + new_i_size = pos + copied; + if (new_i_size > inode->i_size) + i_size_write(inode, pos+copied); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? 
ret : copied; +} + +/* + * Reserve a single block located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long md_needed; + int ret; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + md_needed = ext4_calc_metadata_amount(inode, lblock); + trace_ext4_da_reserve_space(inode, md_needed); + spin_unlock(&ei->i_block_reservation_lock); + + /* + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. + */ + ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { + dquot_release_reservation_block(inode, 1); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } + return -ENOSPC; + } + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + trace_ext4_da_release_space(inode, to_free); + if (unlikely(to_free > ei->i_reserved_data_blocks)) { + /* + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. + */ + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks\n", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; + } + ei->i_reserved_data_blocks -= to_free; + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. 
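+ *
+ * (Illustrative caller: ext4_da_page_release_reservation() below counts
+ * the still-delayed buffers of a page being invalidated and passes that
+ * count here, so truncating one fully delayed 4 KiB page on a 4 KiB-block
+ * filesystem releases exactly one data block reservation.)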
+ */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } + + /* update fs dirty data blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + dquot_release_reservation_block(inode, to_free); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, to_release); +} + +/* + * Delayed allocation stuff + */ + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with writepage() call back + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) +{ + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + loff_t size = i_size_read(inode); + unsigned int len, block_start; + struct buffer_head *bh, *page_bufs = NULL; + int journal_data = ext4_should_journal_data(inode); + sector_t pblock = 0, cur_logical = 0; + struct ext4_io_submit io_submit; + + BUG_ON(mpd->next_page <= mpd->first_page); + memset(&io_submit, 0, sizeof(io_submit)); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->b_blocknr we would only be looking + * at the currently mapped buffer_heads. + */ + index = mpd->first_page; + end = mpd->next_page - 1; + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + int commit_write = 0, skip_page = 0; + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + + if (index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + if (map) { + cur_logical = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + pblock = map->m_pblk + (cur_logical - + map->m_lblk); + } + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + /* + * If the page does not have buffers (for + * whatever reason), try to create them using + * __block_write_begin. If this fails, + * skip the page and move on. 
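+ *
+ * (Illustrative mapping math: with map->m_lblk = 200, map->m_pblk = 9000
+ * and a page whose first block is cur_logical = 204, the buffers get
+ * pblock = 9000 + (204 - 200) = 9004 onwards as their delayed bits are
+ * cleared.)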
+ */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + skip_page: + unlock_page(page); + continue; + } + commit_write = 1; + } + + bh = page_bufs = page_buffers(page); + block_start = 0; + do { + if (!bh) + goto skip_page; + if (map && (cur_logical >= map->m_lblk) && + (cur_logical <= (map->m_lblk + + (map->m_len - 1)))) { + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } + if (buffer_unwritten(bh) || + buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + if (map->m_flags & EXT4_MAP_UNINIT) + set_buffer_uninit(bh); + clear_buffer_unwritten(bh); + } + + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) + skip_page = 1; + bh = bh->b_this_page; + block_start += bh->b_size; + cur_logical++; + pblock++; + } while (bh != page_bufs); + + if (skip_page) + goto skip_page; + + if (commit_write) + /* mark the buffer_heads as dirty & uptodate */ + block_commit_write(page, 0, len); + + clear_page_dirty_for_io(page); + /* + * Delalloc doesn't support data journalling, + * but eventually maybe we'll lift this + * restriction. + */ + if (unlikely(journal_data && PageChecked(page))) + err = __ext4_journalled_writepage(page, len); + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) + err = ext4_bio_write_page(&io_submit, page, + len, mpd->wbc); + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); + + if (!err) + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + ext4_io_submit(&io_submit); + return ret; +} + +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + index = mpd->first_page; + end = mpd->next_page - 1; + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + if (page->index > end) + break; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + block_invalidatepage(page, 0); + ClearPageUptodate(page); + unlock_page(page); + } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); + } + return; +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + printk(KERN_CRIT "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_CRIT "Free/Dirty block details\n"); + printk(KERN_CRIT "free_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_CRIT "dirty_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_CRIT "Block reservation details\n"); + printk(KERN_CRIT "i_reserved_data_blocks=%u\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + EXT4_I(inode)->i_reserved_meta_blocks); + return; +} + +/* + * mpage_da_map_and_submit - go through given space, map them + * if necessary, and then submit them for I/O + * + * @mpd - bh describing space + * + * The function skips 
space we know is already mapped to disk blocks. + * + */ +static void mpage_da_map_and_submit(struct mpage_da_data *mpd) +{ + int err, blks, get_blocks_flags; + struct ext4_map_blocks map, *mapp = NULL; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; + + /* + * If the blocks are mapped already, or we couldn't accumulate + * any blocks, then proceed immediately to the submission stage. + */ + if ((mpd->b_size == 0) || + ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten)))) + goto submit_io; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + + /* + * Call ext4_map_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. This flag exists primarily because we don't + * want to change *many* call functions, so ext4_map_blocks() + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. + */ + map.m_lblk = next; + map.m_len = max_blocks; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); + if (blks < 0) { + struct super_block *sb = mpd->inode->i_sb; + + err = blks; + /* + * If get block returns EAGAIN or ENOSPC and there + * appears to be free blocks we will just let + * mpage_da_submit_io() unlock all of the pages. + */ + if (err == -EAGAIN) + goto submit_io; + + if (err == -ENOSPC && + ext4_count_free_blocks(sb)) { + mpd->retval = err; + goto submit_io; + } + + /* + * get block failure will cause us to loop in + * writepages, because a_ops->writepage won't be able + * to make progress. The page will be redirtied by + * writepage and writepages will again try to write + * the same. + */ + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_msg(sb, KERN_CRIT, + "delayed block allocation failed for inode %lu " + "at logical offset %llu with max blocks %zd " + "with error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! 
Data will be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(mpd->inode); + } + /* invalidate all the pages */ + ext4_da_block_invalidatepages(mpd); + + /* Mark this page range as having been completed */ + mpd->io_done = 1; + return; + } + BUG_ON(blks == 0); + + mapp = ↦ + if (map.m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = mpd->inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map.m_len; i++) + unmap_underlying_metadata(bdev, map.m_pblk + i); + } + + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + /* This only happens if the journal is aborted */ + return; + } + + /* + * Update on-disk size along with block allocation. + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + err = ext4_mark_inode_dirty(handle, mpd->inode); + if (err) + ext4_error(mpd->inode->i_sb, + "Failed to mark inode %lu dirty", + mpd->inode->i_ino); + } + +submit_io: + mpage_da_submit_io(mpd, mapp); + mpd->io_done = 1; +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, size_t b_size, + unsigned long b_state) +{ + sector_t next; + int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + + /* + * XXX Don't go larger than mballoc is willing to allocate + * This is a stopgap solution. We eventually need to fold + * mpage_da_submit_io() into this function and then call + * ext4_map_blocks() multiple times in a loop + */ + if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + goto flush_it; + + /* check if thereserved journal credits might overflow */ + if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } + /* + * First block in the extent + */ + if (mpd->b_size == 0) { + mpd->b_blocknr = logical; + mpd->b_size = b_size; + mpd->b_state = b_state & BH_FLAGS; + return; + } + + next = mpd->b_blocknr + nrblocks; + /* + * Can we merge the block to our big extent? 
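+ *
+ * A merge requires both logical contiguity (logical == next) and an
+ * identical buffer state (b_state & BH_FLAGS); otherwise the extent
+ * collected so far is flushed below and a new one is started.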
+ */ + if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { + mpd->b_size += b_size; + return; + } + +flush_it: + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_and_submit(mpd); + return; +} + +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +{ + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +} + +/* + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct ext4_map_blocks map; + int ret = 0; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + BUG_ON(create == 0); + BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + + map.m_lblk = iblock; + map.m_len = 1; + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) { + if (buffer_delay(bh)) + return 0; /* Not sure this could or should happen */ + /* + * XXX: __block_write_begin() unmaps passed block, is it OK? + */ + ret = ext4_da_reserve_space(inode, iblock); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + + if (buffer_unwritten(bh)) { + /* A delayed write to unwritten bh should be marked + * new and mapped. Mapped ensures that we don't do + * get_block multiple times when we write to the same + * offset and new ensures that we do proper zero out + * for partial write. + */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; +} + +/* + * This function is used as a standard get_block_t calback function + * when there is no desire to allocate any blocks. It is used as a + * callback function for block_write_begin() and block_write_full_page(). + * These functions should only try to map a single block at a time. + * + * Since this function doesn't do block allocations even if the caller + * requests it by passing in create=1, it is critically important that + * any caller checks to make sure that any buffer heads are returned + * by this function are either all already mapped or marked for + * delayed allocation before calling block_write_full_page(). Otherwise, + * b_blocknr could be left unitialized, and the page write functions will + * be taken by surprise. 
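+ *
+ * That is why the create argument is ignored and _ext4_get_block()
+ * is always called with no allocation flags here.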
+ */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + return _ext4_get_block(inode, iblock, bh_result, 0); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + ClearPageChecked(page); + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + BUG_ON(!ext4_handle_valid(handle)); + + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + if (ret == 0) + ret = err; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); +out: + return ret; +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + +/* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), no one guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * + * This function can get called via... + * - ext4_da_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via pdflush (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. 
+ * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. + */ +static int ext4_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0, commit_write = 0; + loff_t size; + unsigned int len; + struct buffer_head *page_bufs = NULL; + struct inode *inode = page->mapping->host; + + trace_ext4_writepage(page); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + /* + * If the page does not have buffers (for whatever reason), + * try to create them using __block_write_begin. If this + * fails, redirty the page and move on. + */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + redirty_page: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + commit_write = 1; + } + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + /* + * We don't want to do block allocation, so redirty + * the page and return. We may reach here when we do + * a journal commit via journal_submit_inode_data_buffers. + * We can also reach here via shrink_page_list + */ + goto redirty_page; + } + if (commit_write) + /* now mark the buffer_heads as dirty and uptodate */ + block_commit_write(page, 0, len); + + if (PageChecked(page) && ext4_should_journal_data(inode)) + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + return __ext4_journalled_writepage(page, len); + + if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + ret = block_write_full_page_endio(page, noalloc_get_block_write, + wbc, ext4_end_io_buffer_write); + } else + ret = block_write_full_page(page, noalloc_get_block_write, + wbc); + + return ret; +} + +/* + * This is called via ext4_da_writepages() to + * calculate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. + */ + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} + +/* + * write_cache_pages_da - walk the list of dirty pages of the given + * address space and accumulate pages that need writing, and call + * mpage_da_map_and_submit to map a single contiguous memory region + * and then write them. 
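+ *
+ * This is modelled on write_cache_pages(), except that the pages are
+ * left locked so that mpage_da_map_and_submit() can later write the
+ * whole accumulated extent in one batch.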
+ */ +static int write_cache_pages_da(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd, + pgoff_t *done_index) +{ + struct buffer_head *bh, *head; + struct inode *inode = mapping->host; + struct pagevec pvec; + unsigned int nr_pages; + sector_t logical; + pgoff_t index, end; + long nr_to_write = wbc->nr_to_write; + int i, tag, ret = 0; + + memset(mpd, 0, sizeof(struct mpage_da_data)); + mpd->wbc = wbc; + mpd->inode = inode; + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + *done_index = index; + while (index <= end) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + return 0; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) + goto out; + + *done_index = page->index + 1; + + /* + * If we can't merge this page, and we have + * accumulated an contiguous region, write it + */ + if ((mpd->next_page != page->index) && + (mpd->next_page != mpd->first_page)) { + mpage_da_map_and_submit(mpd); + goto ret_extent_tail; + } + + lock_page(page); + + /* + * If the page is no longer dirty, or its + * mapping no longer corresponds to inode we + * are writing (which means it has been + * truncated or invalidated), or the page is + * already under writeback and we are not + * doing a data integrity writeback, skip the page + */ + if (!PageDirty(page) || + (PageWriteback(page) && + (wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); + + if (mpd->next_page != page->index) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + mpage_add_bh_to_extent(mpd, logical, + PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + if (mpd->io_done) + goto ret_extent_tail; + } else { + /* + * Page with regular buffer heads, + * just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_writepage + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need + * to update the b_state + * because we look at b_state + * in mpage_da_map_blocks. We + * don't update b_size because + * if we find an unmapped + * buffer_head later we need to + * use the b_state flag of that + * buffer_head. 
+ */ + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + goto out; + } + } + pagevec_release(&pvec); + cond_resched(); + } + return 0; +ret_extent_tail: + ret = MPAGE_DA_EXTENT_TAIL; +out: + pagevec_release(&pvec); + cond_resched(); + return ret; +} + + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t index; + int range_whole = 0; + handle_t *handle = NULL; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int pages_written = 0; + unsigned int max_pages; + int range_cyclic, cycled = 1, io_done = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t done_index = 0; + pgoff_t end; + + trace_ext4_da_writepages(inode, wbc); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; + + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { + index = mapping->writeback_index; + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } + + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. 
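+ *
+ * For example, with 4K pages (PAGE_CACHE_SHIFT == 12) and the usual
+ * 128MB setting of max_writeback_mb_bump, max_pages below works out
+ * to 128 << 8 = 32768 pages, i.e. 128MB worth of pages per call.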
+ */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) { + if (wbc->nr_to_write == LONG_MAX) + desired_nr_to_write = wbc->nr_to_write; + else + desired_nr_to_write = wbc->nr_to_write * 8; + } else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + + while (!ret && wbc->nr_to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + goto out_writepages; + } + + /* + * Now call write_cache_pages_da() to find the next + * contiguous region of logical blocks that need + * blocks to be allocated by ext4 and submit them. + */ + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); + /* + * If we have a contiguous extent of pages and we + * haven't done the I/O yet, map the blocks and submit + * them for I/O. + */ + if (!mpd.io_done && mpd.next_page != mpd.first_page) { + mpage_da_map_and_submit(&mpd); + ret = MPAGE_DA_EXTENT_TAIL; + } + trace_ext4_da_write_pages(inode, &mpd); + wbc->nr_to_write -= mpd.pages_written; + + ext4_journal_stop(handle); + + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + pages_written += mpd.pages_written; + ret = 0; + io_done = 1; + } else if (wbc->nr_to_write) + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + break; + } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } + + /* Update index */ + wbc->range_cyclic = range_cyclic; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = done_index; + +out_writepages: + wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + return ret; +} + +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. 
So switch + * to non delalloc when we are near to error range. + */ + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + /* + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark + */ + return 1; + } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb); + + return 0; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; + trace_ext4_da_write_begin(inode, pos, len, flags); +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. 
+ */ + if (pos + len > inode->i_size) + ext4_truncate_failed_write(inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + return ext4_ordered_write_end(file, mapping, pos, + len, copied, page, fsdata); + case EXT4_INODE_WRITEBACK_DATA_MODE: + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + default: + BUG(); + } + } + + trace_ext4_da_write_end(inode, pos, len, copied); + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + trace_ext4_alloc_da_blocks(inode); + + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; + + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). 
However, to do otherwise + * would require replicating code paths in: + * + * ext4_da_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writepage() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them because we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. 
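+ *
+ * Flushing the journal below forces any journalled data blocks out
+ * to their final location on disk, so the block number returned by
+ * generic_block_bmap() afterwards is actually meaningful.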
+ */ + + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping, block, ext4_get_block); +} + +static int ext4_readpage(struct file *file, struct page *page) +{ + trace_ext4_readpage(page); + return mpage_readpage(page, ext4_get_block); +} + +static int +ext4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_invalidatepage(page, offset); + + /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); +} + +static int ext4_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_releasepage(page); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. 
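+ *
+ * The orphan handling is visible below: the inode is put on the
+ * orphan list before an extending write starts, and removed again
+ * once i_size has been safely updated or the write has failed.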
+ */ +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + } else { + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * ext4_get_block used when preparing for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after the IO is complete. 
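+ *
+ * This is achieved by passing EXT4_GET_BLOCKS_IO_CREATE_EXT down to
+ * _ext4_get_block(), which keeps freshly allocated extents
+ * uninitialized until the end_io path converts them.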
+ */ +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private, int ret, + bool is_async) +{ + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + unsigned long flags; + struct ext4_inode_info *ei; + + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + goto out; + + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + + /* if not aio dio with unwritten extents, just free io and return */ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); + return; + } + + io_end->offset = offset; + io_end->size = size; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + + /* Add the io_end to per-inode completed aio dio list*/ + ei = EXT4_I(io_end->inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); + iocb->private = NULL; +} + +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ + ext4_io_end_t *io_end = bh->b_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + + if (!test_clear_buffer_uninit(bh) || !io_end) + goto out; + + if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { + printk("sb umounted, discard end_io request for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + goto out; + } + + /* + * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, + * but being more careful is always safe for the future change. + */ + inode = io_end->inode; + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +out: + bh->b_private = NULL; + bh->b_end_io = NULL; + clear_buffer_uninit(bh); + end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; + size_t size = bh->b_size; + +retry: + io_end = ext4_init_io_end(inode, GFP_ATOMIC); + if (!io_end) { + pr_warn_ratelimited("%s: allocation fail\n", __func__); + schedule(); + goto retry; + } + io_end->offset = offset; + io_end->size = size; + /* + * We need to hold a reference to the page to make sure it + * doesn't get evicted before ext4_end_io_work() has a chance + * to convert the extent from written to unwritten. 
+ */ + io_end->page = page; + get_page(io_end->page); + + bh->b_private = io_end; + bh->b_end_io = ext4_end_io_buffer_write; + return 0; +} + +/* + * For ext4 extent files, ext4 will do direct-io write to holes, + * preallocated extents, and those write extend the file, no need to + * fall back to buffered IO. + * + * For holes, we fallocate those blocks, mark them as uninitialized + * If those blocks were preallocated, we mark sure they are splited, but + * still keep the range to write as uninitialized. + * + * The unwrritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the conversion + * when async direct IO completed. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + */ +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + size_t count = iov_length(iov, nr_segs); + + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized + * to prevent parallel buffered read to expose the stale data + * before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. + * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. + */ + iocb->private = NULL; + EXT4_I(inode)->cur_aio_dio = NULL; + if (!is_sync_kiocb(iocb)) { + iocb->private = ext4_init_io_end(inode, GFP_NOFS); + if (!iocb->private) + return -ENOMEM; + /* + * we save the io structure for current async + * direct IO, so that later ext4_map_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + EXT4_I(inode)->cur_aio_dio = iocb->private; + } + + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_write, + ext4_end_io_dio); + if (iocb->private) + EXT4_I(inode)->cur_aio_dio = NULL; + /* + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. 
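+ *
+ * (A return of -EIOCBQUEUED means the request went asynchronous; in
+ * that case the completion callback ext4_end_io_dio() is responsible
+ * for the io_end instead.)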
+ */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } + return ret; + } + + /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + else + ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + trace_ext4_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext4's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. 
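+ *
+ * "Pending dirty" is the PageChecked flag set below; ext4_writepage()
+ * tests it and journals the whole page via
+ * __ext4_journalled_writepage() when it is set.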
+ */ +static int ext4_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext4_ordered_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_writeback_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_journalled_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_da_writepages, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext4_set_aops(struct inode *inode) +{ + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: + inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } +} + +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
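+ *
+ * For example, with a 4K block size, truncating to offset 1000
+ * zeroes bytes 1000..4095 of the block containing the new EOF.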
+ */ +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, max, pos; + ext4_lblk_t iblock; + struct inode *inode = mapping->host; + struct buffer_head *bh; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -EINVAL; + + blocksize = inode->i_sb->s_blocksize; + max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + zero_user(page, offset, length); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. 
+ * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. 
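+ *
+ * The caller (ext4_free_data()) stops accumulating runs on any
+ * non-zero return, and on a fatal error bails out without dirtying
+ * the indirect block.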
+ */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. 
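The kernel-doc above describes how ext4_free_data() coalesces contiguous block numbers into runs so that each run costs only one free call. A toy model of that coalescing (plain userspace C, with made-up block numbers; zero entries stand for holes and are simply skipped, as in the real loop):

/* Toy model of the contiguous-run coalescing performed by ext4_free_data(). */
#include <stdio.h>

int main(void)
{
    unsigned long blocks[] = { 100, 101, 102, 0, 200, 201, 50, 51, 52 };
    size_t n = sizeof(blocks) / sizeof(blocks[0]);
    unsigned long start = 0;
    unsigned long count = 0;

    for (size_t i = 0; i < n; i++) {
        unsigned long nr = blocks[i];

        if (!nr)
            continue;                 /* a hole: nothing to free */
        if (count == 0) {
            start = nr;               /* first block of a new run */
            count = 1;
        } else if (nr == start + count) {
            count++;                  /* still contiguous: extend the run */
        } else {
            printf("free %lu blocks starting at %lu\n", count, start);
            start = nr;               /* begin a new run */
            count = 1;
        }
    }
    if (count)
        printf("free %lu blocks starting at %lu\n", count, start);
    return 0;
}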
*/ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. 
+ */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +int ext4_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * associated with the given offset and length + * + * @inode: File inode + * @offset: The offset where the hole will begin + * @len: The length of the hole + * + * Returns: 0 on sucess or negative on failure + */ + +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + if (!S_ISREG(inode->i_mode)) + return -ENOTSUPP; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + /* TODO: Add support for non extent hole punching */ + return -ENOTSUPP; + } + + return ext4_ext_punch_hole(file, offset, length); +} + +/* + * ext4_truncate() + * + * We block out ext4_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext4_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext4_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext4 filesystem. 
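For context, ext4_punch_hole() above is reached from userspace through fallocate(2) with FALLOC_FL_PUNCH_HOLE, and (as the ENOTSUPP checks show) only extent-mapped regular files are supported at this point. A minimal caller might look like the sketch below; the file name is made up and error handling is abbreviated:

/* Minimal hole-punching example. FALLOC_FL_PUNCH_HOLE must be combined
 * with FALLOC_FL_KEEP_SIZE; the kernel forwards the request to the
 * filesystem, i.e. to ext4_punch_hole() above on ext4. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("testfile", O_RDWR);   /* "testfile" is just an example */
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Deallocate 1 MiB starting at offset 4 MiB without changing i_size. */
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  4 * 1024 * 1024, 1 * 1024 * 1024) != 0)
        perror("fallocate(FALLOC_FL_PUNCH_HOLE)");
    close(fd);
    return 0;
}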
But + * that's fine - as long as they are linked from the inode, the post-crash + * ext4_truncate() run will find them and release them. + */ +void ext4_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) + return; + + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_ext_truncate(inode); + trace_ext4_truncate_exit(inode); + return; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. 
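The block rounding at the top of ext4_truncate() above decides how many blocks survive the truncate and whether the tail of the final block needs zeroing. A tiny standalone model with example numbers (not kernel values):

/* Model of the rounding in ext4_truncate(): how many blocks must be
 * kept, and does the last one need its tail zeroed? */
#include <stdio.h>

int main(void)
{
    unsigned long long i_size    = 70000;  /* new size after truncate */
    unsigned long long blocksize = 4096;   /* stands in for s_blocksize */
    unsigned int blkbits         = 12;     /* log2(blocksize) */

    unsigned long long last_block = (i_size + blocksize - 1) >> blkbits;
    int partial_tail = (i_size & (blocksize - 1)) != 0;

    printf("keep %llu blocks%s\n", last_block,
           partial_tail ? ", zero the tail of the last one" : "");
    return 0;
}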
+ */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + +/* + * ext4_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext4_get_inode_loc(struct inode *inode, + struct ext4_iloc *iloc, int in_mem) +{ + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = NULL; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; + + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) + return -EIO; + + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. 
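__ext4_get_inode_loc() above turns an inode number into a (block group, inode-table block, byte offset) triple. The same arithmetic as a self-contained sketch; the geometry numbers are typical examples, not values read from a real superblock:

/* Model of the inode-location arithmetic in __ext4_get_inode_loc(). */
#include <stdio.h>

int main(void)
{
    unsigned long ino               = 8193;  /* example inode number  */
    unsigned long inodes_per_group  = 8192;  /* s_inodes_per_group    */
    unsigned long inodes_per_block  = 16;    /* 4096-byte block / 256-byte inode */
    unsigned long inode_size        = 256;   /* s_inode_size          */
    unsigned long long itable_start = 1234;  /* inode table block of that group (example) */

    unsigned long group       = (ino - 1) / inodes_per_group;
    unsigned long index       = (ino - 1) % inodes_per_group;
    unsigned long long block  = itable_start + index / inodes_per_block;
    unsigned long byte_offset = (index % inodes_per_block) * inode_size;

    printf("inode %lu: group %lu, itable block %llu, offset %lu bytes\n",
           ino, group, block, byte_offset);
    return 0;
}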
+ */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; + + start = inode_offset & ~(inodes_per_block - 1); + + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + if (table > b) + b = table; + end = b + EXT4_SB(sb)->s_inode_readahead_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + trace_ext4_load_inode(inode); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. 
*/ + return __ext4_get_inode_loc(inode, iloc, + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); +} + +void ext4_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT4_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT4_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT4_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT4_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ +void ext4_get_inode_flags(struct ext4_inode_info *ei) +{ + unsigned int vfs_fl; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| + EXT4_DIRSYNC_FL); + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; + if (vfs_fl & S_IMMUTABLE) + new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); +} + +static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + blkcnt_t i_blocks ; + struct inode *inode = &(ei->vfs_inode); + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + /* we are using combined 48 bit field */ + i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | + le32_to_cpu(raw_inode->i_blocks_lo); + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { + /* i_blocks represent file system block size */ + return i_blocks << (inode->i_blkbits - 9); + } else { + return i_blocks; + } + } else { + return le32_to_cpu(raw_inode->i_blocks_lo); + } +} + +struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +{ + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; + struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + int block; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT4_I(inode); + iloc.bh = NULL; + + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. 
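ext4_inode_blocks() above stitches the 48-bit block count back together from the two on-disk fields and, for HUGE_FILE inodes, converts filesystem blocks into 512-byte units. A sketch of that reconstruction with invented field values:

/* Model of the 48-bit i_blocks reconstruction in ext4_inode_blocks(). */
#include <stdio.h>

int main(void)
{
    unsigned int i_blocks_lo     = 0x89ABCDEF; /* low 32 bits (example)  */
    unsigned int i_blocks_high   = 0x0123;     /* high 16 bits (example) */
    int          huge_file_inode = 1;          /* EXT4_INODE_HUGE_FILE set? */
    unsigned int blkbits         = 12;         /* 4 KiB filesystem blocks */

    unsigned long long blocks =
        ((unsigned long long)i_blocks_high << 32) | i_blocks_lo;

    /* With the HUGE_FILE inode flag the on-disk count is in filesystem
     * blocks and must be converted to 512-byte units for i_blocks. */
    if (huge_file_inode)
        blocks <<= (blkbits - 9);

    printf("i_blocks = %llu sectors\n", blocks);
    return 0;
}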
+ * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + /* this inode is deleted */ + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + inode->i_blocks = ext4_inode_blocks(raw_inode, ei); + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; + inode->i_size = ext4_isize(raw_inode); + ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT4_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + ret = -EIO; + goto bad_inode; + } + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. 
*/ + ei->i_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } + } else + ei->i_extra_isize = 0; + + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } + + ret = 0; + if (ei->i_file_acl && + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ei->i_file_acl); + ret = -EIO; + goto bad_inode; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_check_inode_blockref(inode); + } + if (ret) + goto bad_inode; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext4_inode_is_fast_symlink(inode)) { + inode->i_op = &ext4_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + } + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &ext4_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else { + ret = -EIO; + EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + goto bad_inode; + } + brelse(iloc.bh); + ext4_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + brelse(iloc.bh); + iget_failed(inode); + return ERR_PTR(ret); +} + +static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = inode->i_blocks; + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represnted in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = 
cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + if (ext4_inode_blocks_set(handle, raw_inode, ei)) + goto out_brelse; + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT4_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT4_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. 
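The uid/gid handling above splits each 32-bit id into 16-bit low and high halves so that old 16-bit-uid kernels still see a sensible value, and zeroes the high halves for deleted inodes so a reused slot does not inherit stale bits. A trivial model of the split (values are examples):

/* Sketch of the low_16_bits()/high_16_bits() split used when the
 * on-disk inode is written. */
#include <stdio.h>

int main(void)
{
    unsigned int uid      = 100000;        /* a uid that needs 32 bits */
    unsigned int uid_low  = uid & 0xFFFF;  /* what a 16-bit kernel sees */
    unsigned int uid_high = uid >> 16;     /* stored in a separate field */

    printf("uid %u -> low %u, high %u\n", uid, uid_low, uid_high);
    return 0;
}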
+ */ + err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + ext4_handle_sync(handle); + err = ext4_handle_dirty_metadata(handle, NULL, + EXT4_SB(sb)->s_sbh); + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); + if (!err) + err = rc; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + + ext4_update_inode_fsync_trans(handle, inode, 0); +out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err; + + if (current->flags & PF_MEMALLOC) + return 0; + + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (wbc->sync_mode != WB_SYNC_ALL) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; + + err = __ext4_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + if (wbc->sync_mode == WB_SYNC_ALL) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); + err = -EIO; + } + brelse(iloc.bh); + } + return err; +} + +/* + * ext4_setattr() + * + * Called from notify_change. 
+ * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. + */ +int ext4_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + int orphan = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext4_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + if (attr->ia_valid & ATTR_SIZE) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; + } + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size)) { + handle_t *handle; + + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + orphan = 0; + ext4_journal_stop(handle); + goto err_out; + } + } + } + + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext4_truncate(inode); + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + ext4_truncate(inode); + } + + if (!rc) { + 
setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + + /* + * If the call to ext4_truncate failed to get a transaction handle at + * all, we need to clean up the in-core orphan list manually. + */ + if (orphan && inode->i_nlink) + ext4_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = ext4_acl_chmod(inode); + +err_out: + ext4_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} + +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); +} + +/* + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiuguous, with flexbg, + * they could still across block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks + */ +static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? 
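ext4_indirect_trans_blocks() above estimates the worst-case number of index blocks a modification can dirty. The same estimate as a standalone sketch; the addr_per_block value of 1024 assumes 4 KiB blocks (blocksize / 4) and is only an example:

/* Model of the worst-case index-block estimate for indirect-mapped files. */
#include <stdio.h>

static unsigned indirect_trans_blocks(unsigned nrblocks, int contiguous,
                                      unsigned addr_per_block)
{
    if (contiguous)
        /* one run: at most N/addr_per_block + 1 indirect blocks,
         * 2 double-indirect blocks and 1 triple-indirect block */
        return (nrblocks + addr_per_block - 1) / addr_per_block + 4;

    /* discontiguous: every block may sit under its own indirect and
     * double-indirect block, plus one triple-indirect block */
    return nrblocks * 2 + 1;
}

int main(void)
{
    printf("contiguous 2048 blocks -> %u index blocks\n",
           indirect_trans_blocks(2048, 1, 1024));
    printf("scattered 16 blocks    -> %u index blocks\n",
           indirect_trans_blocks(16, 0, 1024));
    return 0;
}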
+ * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > ngroups) + groups = ngroups; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calculate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin() + * + * We need to consider the worse case, when + * one new block per extent. + */ +int ext4_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + int ret; + + ret = ext4_meta_trans_blocks(inode, bpp, 0); + + /* Account for data blocks for journalled mode */ + if (ext4_should_journal_data(inode)) + ret += bpp; + return ret; +} + +/* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* + * The caller must have previously called ext4_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext4_iloc *iloc) +{ + int err = 0; + + if (test_opt(inode->i_sb, I_VERSION)) + inode_inc_iversion(inode); + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ + err = ext4_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc) +{ + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. 
+ */ +static int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + raw_inode = ext4_raw_inode(&iloc); + + header = IHDR(inode, raw_inode); + + /* No extended attributes present */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + return ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + int err, ret; + + might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if ((jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + ret = ext4_expand_extra_isize(inode, + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); + if (mnt_count != + le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, + "Unable to expand inode %lu. Delete" + " some EAs or run e2fsck.", + inode->i_ino); + mnt_count = + le16_to_cpu(sbi->s_es->s_mnt_count); + } + } + } + } + if (!err) + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext4_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. 
+ * + * Also, dquot_alloc_block() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext4_dirty_inode(struct inode *inode, int flags) +{ + handle_t *handle; + + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + + ext4_mark_inode_dirty(handle, inode); + + ext4_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext4_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext4_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + + int err = 0; + if (handle) { + err = ext4_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = jbd2_journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext4_handle_dirty_metadata(handle, + NULL, + iloc.bh); + brelse(iloc.bh); + } + } + ext4_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext4_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; + if (is_journal_aborted(journal)) + return -EROFS; + + jbd2_journal_lock_updates(journal); + jbd2_journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_mark_inode_dirty(handle, inode); + ext4_handle_sync(handle); + ext4_journal_stop(handle); + ext4_std_error(inode->i_sb, err); + + return err; +} + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + loff_t size; + unsigned long len; + int ret = -EINVAL; + void *fsdata; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. 
+ */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + + lock_page(page); + wait_on_page_writeback(page); + if (PageMappedToDisk(page)) { + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; + } + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + /* + * return if we have all the buffers mapped. This avoid + * the need to call write_begin/write_end which does a + * journal_start/journal_stop which can block and take + * long time + */ + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) { + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; + } + } + unlock_page(page); + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding + * inode.i__mutex here. That allow * parallel write_begin, + * write_end call. lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, fsdata); + if (ret < 0) + goto out_unlock; + ret = 0; + + /* + * write_begin/end might have created a dirty page and someone + * could wander in and start the IO. Make sure that hasn't + * happened. + */ + lock_page(page); + wait_on_page_writeback(page); + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; +out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + up_read(&inode->i_alloc_sem); + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c new file mode 100644 index 00000000..4cbe1c2c --- /dev/null +++ b/fs/ext4/ioctl.c @@ -0,0 +1,442 @@ +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int flags; + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT4_IOC_GETFLAGS: + ext4_get_inode_flags(ei); + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT4_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + flags = ext4_mask_flags(inode->i_mode, flags); + + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. 
Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if (oldflags & EXT4_EXTENTS_FL) { + /* We don't support clearning extent flags */ + if (!(flags & EXT4_EXTENTS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (flags & EXT4_EXTENTS_FL) { + /* migrate the file */ + migrate = 1; + flags &= ~EXT4_EXTENTS_FL; + } + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) + err = ext4_ext_migrate(inode); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = ext4_current_time(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } +#ifdef CONFIG_JBD2_DEBUG + case EXT4_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. 
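For reference, the GETFLAGS/SETFLAGS pair handled above is the same interface that lsattr(1) and chattr(1) use; userspace normally reaches it through the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS names from <linux/fs.h>, which share these ioctl numbers. A minimal sketch that sets the append-only flag; the file name is an example and error handling is terse:

/* Set the append-only attribute via the flags ioctl. The handler above
 * transfers an int-sized flags word. Changing APPEND/IMMUTABLE needs
 * CAP_LINUX_IMMUTABLE, as the capability check above shows. */
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    int fd = open("logfile", O_RDONLY);   /* "logfile" is just an example */
    int flags;

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0) {
        perror("FS_IOC_GETFLAGS");
        return 1;
    }
    flags |= FS_APPEND_FL;
    if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
        perror("FS_IOC_SETFLAGS");
    close(fd);
    return 0;
}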
+ */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err, err2=0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); + + return err; + } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct file *donor_filp; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + me.moved_len = 0; + + donor_filp = fget(me.donor_fd); + if (!donor_filp) + return -EBADF; + + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; + } + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; +mext_out: + fput(donor_filp); + return err; + } + + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + struct super_block *sb = inode->i_sb; + int err, err2=0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + err = ext4_group_add(sb, &input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); + + return err; + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. 
+ */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write(filp->f_path.mnt); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write(filp->f_path.mnt); + return err; + } + + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETFLAGS: + cmd = EXT4_IOC_GETFLAGS; + break; + case EXT4_IOC32_SETFLAGS: + cmd = EXT4_IOC_SETFLAGS; + break; + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD2_DEBUG + case EXT4_IOC32_WAIT_FOR_READONLY: + cmd = EXT4_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_input input; + mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: + case FITRIM: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c new file mode 100644 index 00000000..b6adf68a --- /dev/null +++ b/fs/ext4/mballoc.c @@ -0,0 +1,4958 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
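The FITRIM case earlier in ioctl.c copies a struct fstrim_range in from userspace, raises range.minlen to the device's discard granularity, runs ext4_trim_fs(), and copies the range back. A minimal userspace sketch, assuming FITRIM and struct fstrim_range come from <linux/fs.h> and that fd may refer to any file or directory on the mounted ext4 filesystem:

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* ask to trim the whole filesystem */
	range.minlen = 0;		/* the kernel clamps this to the discard granularity */

	if (ioctl(fd, FITRIM, &range) < 0) {	/* needs CAP_SYS_ADMIN, as checked above */
		perror("FITRIM");
		return 1;
	}
	/* range is copied back by the handler; len is assumed to hold the trimmed byte count */
	printf("trimmed %llu bytes\n", (unsigned long long) range.len);
	return 0;
}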
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include "mballoc.h" +#include +#include +#include + +/* + * MUSTDO: + * - test ext4_ext_search_left() and ext4_ext_search_right() + * - search for metadata in few groups + * + * TODO v4: + * - normalization should take into account whether file is still open + * - discard preallocations if no free space left (policy?) + * - don't normalize tails + * - quota + * - reservation for superuser + * + * TODO v3: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - tree of groups sorted by number of free blocks + * - error handling + */ + +/* + * The allocation request involve request for multiple number of blocks + * near to the goal(block) value specified. + * + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4//mb_stream_req. The value is represented in + * terms of number of blocks. + * + * The main motivation for having small file use group preallocation is to + * ensure that we have small files closer together on the disk. + * + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: + * + * pa_lstart -> the logical start block for this prealloc space + * pa_pstart -> the physical start block for this prealloc space + * pa_len -> length for this prealloc space + * pa_free -> free space available in this prealloc space + * + * The inode preallocation space is used looking at the _logical_ start + * block. If only the logical file block falls within the range of prealloc + * space we will consume the particular prealloc space. This make sure that + * that the we have contiguous physical blocks representing the file blocks + * + * The important thing to be noted in case of inode prealloc space is that + * we don't modify the values associated to inode prealloc space except + * pa_free. + * + * If we are not able to find blocks in the inode prealloc space and if we + * have the group allocation flag set then we look at the locality group + * prealloc space. These are per CPU prealloc list repreasented as + * + * ext4_sb_info.s_locality_groups[smp_processor_id()] + * + * The reason for having a per cpu locality group is to reduce the contention + * between CPUs. It is possible to get scheduled at this point. + * + * The locality group prealloc space is used looking at whether we have + * enough free space (pa_free) within the prealloc space. + * + * If we can't allocate blocks via inode prealloc or/and locality group + * prealloc then we look at the buddy cache. 
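The inode preallocation described above can be pictured with a small self-contained model (a hypothetical toy_pa, not the kernel's struct ext4_prealloc_space): a PA is selected purely by the logical block range it covers, and consuming from it only decrements pa_free.

struct toy_pa {
	unsigned long long pa_lstart;	/* logical start block covered by the PA */
	unsigned long long pa_pstart;	/* physical start block of the reserved space */
	unsigned int	   pa_len;	/* length of the preallocated space, in blocks */
	unsigned int	   pa_free;	/* blocks of it still unused */
};

/* map one logical block through the PA; returns 0 if the PA cannot serve it */
static unsigned long long toy_use_inode_pa(struct toy_pa *pa,
					   unsigned long long lblock)
{
	if (lblock < pa->pa_lstart || lblock >= pa->pa_lstart + pa->pa_len)
		return 0;		/* logical block outside the preallocated range */
	if (pa->pa_free == 0)
		return 0;		/* preallocation exhausted */
	pa->pa_free--;			/* pa_free is the only field that ever changes */
	return pa->pa_pstart + (lblock - pa->pa_lstart);
}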
The buddy cache is represented + * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets + * mapped to the buddy and bitmap information regarding different + * groups. The buddy information is attached to buddy cache inode so that + * we can access them through the page cache. The information regarding + * each group is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are stored in the + * inode as: + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. So for each group we + * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * blocksize) blocks. So it can have information regarding groups_per_page + * which is blocks_per_page/2 + * + * The buddy cache inode is not stored on disk. The inode is thrown + * away when the filesystem is unmounted. + * + * We look for count number of blocks in the buddy cache. If we were able + * to locate that many free blocks we return with additional information + * regarding rest of the contiguous physical block available + * + * Before allocating blocks via buddy cache we normalize the request + * blocks. This ensure we ask for more blocks that we needed. The extra + * blocks that we get after allocation is added to the respective prealloc + * list. In case of inode preallocation we follow a list of heuristics + * based on file size. This can be found in ext4_mb_normalize_request. If + * we are doing a group prealloc we try to normalize the request to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * 512 blocks. This can be tuned via + * /sys/fs/ext4/ option the group prealloc request is normalized to the + * stripe value (sbi->s_stripe) + * + * The regular allocator(using the buddy cache) supports few tunables. + * + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req + * + * The regular allocator uses buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The + * value of s_mb_order2_reqs can be tuned via + * /sys/fs/ext4//mb_order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contiguous block in + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. + * min_to_scan indicate how long the mballoc __must__ look for a best + * extent and max_to_scan indicates how long the mballoc __can__ look for a + * best extent in the found extents. Searching for the blocks starts with + * the group specified as the goal value in allocation context via + * ac_g_ex. Each group is first checked based on the criteria whether it + * can used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the + * subsequent request. + */ + +/* + * mballoc operates on the following data: + * - on-disk bitmap + * - in-core buddy (actually includes buddy and bitmap) + * - preallocation descriptors (PAs) + * + * there are two types of preallocations: + * - inode + * assiged to specific inode and can be used for this inode only. 
+ * it describes part of inode's space preallocated to specific + * physical blocks. any block from that preallocated can be used + * independent. the descriptor just tracks number of blocks left + * unused. so, before taking some block from descriptor, one must + * make sure corresponded logical block isn't allocated yet. this + * also means that freeing any block within descriptor's range + * must discard all preallocated blocks. + * - locality group + * assigned to specific locality group which does not translate to + * permanent set of inodes: inode can join and leave group. space + * from this type of preallocation can be used for any inode. thus + * it's consumed from the beginning to the end. + * + * relation between them can be expressed as: + * in-core buddy = on-disk bitmap + preallocation descriptors + * + * this mean blocks mballoc considers used are: + * - allocated blocks (persistent) + * - preallocated blocks (non-persistent) + * + * consistency in mballoc world means that at any time a block is either + * free or used in ALL structures. notice: "any time" should not be read + * literally -- time is discrete and delimited by locks. + * + * to keep it simple, we don't use block numbers, instead we count number of + * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. + * + * all operations can be expressed as: + * - init buddy: buddy = on-disk + PAs + * - new PA: buddy += N; PA = N + * - use inode PA: on-disk += N; PA -= N + * - discard inode PA buddy -= on-disk - PA; PA = 0 + * - use locality group PA on-disk += N; PA -= N + * - discard locality group PA buddy -= PA; PA = 0 + * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap + * is used in real operation because we can't know actual used + * bits from PA, only from on-disk bitmap + * + * if we follow this strict logic, then all operations above should be atomic. + * given some of them can block, we'd have to use something like semaphores + * killing performance on high-end SMP hardware. let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. 
+ * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. 
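The block accounting spelled out above (in-core buddy = on-disk bitmap + preallocation descriptors) reduces to one invariant. A toy sketch with hypothetical names, counting blocks rather than tracking bitmaps and using the simpler locality-group form of discard:

struct toy_counts {
	unsigned int ondisk;	/* blocks marked used in the on-disk bitmap */
	unsigned int pa;	/* blocks still held by preallocation descriptors */
	unsigned int buddy;	/* blocks marked used in the in-core buddy */
};

static void toy_init_buddy(struct toy_counts *c)		{ c->buddy = c->ondisk + c->pa; }
static void toy_new_pa(struct toy_counts *c, unsigned int n)	{ c->buddy += n; c->pa += n; }
static void toy_use_pa(struct toy_counts *c, unsigned int n)	{ c->ondisk += n; c->pa -= n; }
static void toy_discard_pa(struct toy_counts *c)		{ c->buddy -= c->pa; c->pa = 0; }

/* holds after every operation above: the buddy covers on-disk blocks plus live PAs */
static int toy_consistent(const struct toy_counts *c)
{
	return c->buddy == c->ondisk + c->pa;
}

Each operation preserves toy_consistent(), which is why the comment can relax strict atomicity to the three observations that follow it.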
There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); + +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline int mb_test_bit(int bit, void *addr) +{ + /* + * ext4_test_bit on architecture like powerpc + * needs unsigned long aligned address + */ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +{ + char *bb; + + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(max == NULL); + + if (order > e4b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return EXT4_MB_BITMAP(e4b); + } + + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef DOUBLE_CHECK +static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int i; + struct super_block *sb = e4b->bd_sb; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += first + i; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); + } + mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) +{ + int i; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); + mb_set_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { + unsigned char *b1, *b2; + int i; + b1 = (unsigned char *) e4b->bd_info->bb_bitmap; + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { + printk(KERN_ERR "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc\n", + e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } + } +} + +#else +static inline void mb_free_blocks_double(struct inode *inode, + struct ext4_buddy *e4b, int first, int count) +{ + return; +} +static inline void mb_mark_used_double(struct ext4_buddy *e4b, + int first, int count) +{ + return; +} +static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + return; +} +#endif + +#ifdef AGGRESSIVE_CHECK + +#define MB_CHECK_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + function, file, line, # assert); \ + BUG(); \ + } \ +} while (0) + +static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, + const char *function, int line) +{ + struct super_block *sb = e4b->bd_sb; + int order = e4b->bd_blkbits + 1; + int max; + int max2; + int i; + int j; + int k; + int count; + struct ext4_group_info *grp; + int fragments = 0; + int fstart; + struct list_head *cur; + void *buddy; + void *buddy2; + + { + static int mb_check_counter; + if (mb_check_counter++ % 100 != 0) + return 0; + } + + while (order > 1) { + buddy = mb_find_buddy(e4b, order, &max); + MB_CHECK_ASSERT(buddy); + buddy2 = mb_find_buddy(e4b, order - 1, &max2); + MB_CHECK_ASSERT(buddy2); + MB_CHECK_ASSERT(buddy != buddy2); + MB_CHECK_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit((i<<1)+1, buddy2)); + } else if (!mb_test_bit((i << 1) + 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit(i << 1, buddy2)); + } + continue; + } + + /* both bits in buddy2 must be 0 */ + MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); + MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + MB_CHECK_ASSERT( + !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + } + count++; + } + MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e4b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); + if (fstart == -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e4b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + } + } + MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); + 
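/* the free-extent count accumulated by the order-0 scan above must match the cached bb_fragments */ +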
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); + MB_CHECK_ASSERT(groupnr == e4b->bd_group); + for (i = 0; i < pa->pa_len; i++) + MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); + } + return 0; +} +#undef MB_CHECK_ASSERT +#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ + __FILE__, __func__, __LINE__) +#else +#define mb_check_buddy(e4b) +#endif + +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; + unsigned short border; + + BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fls(len) - 1; + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) + mb_clear_bit(first >> min, + buddy + sbi->s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + +/* + * Cache the order of the largest free extent we have available in this block + * group. 
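+ * The cached value is consulted later by ext4_mb_good_group(): at criteria 0 a
+ * group whose bb_largest_free_order is below the requested order is rejected
+ * without loading its buddy.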
+ */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; + unsigned free = 0; + unsigned fragments = 0; + unsigned long long period = get_cycles(); + + /* initialize buddy from bitmap which is aggregation + * of on-disk bitmap and preallocations */ + i = mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = mb_find_next_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext4_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { + ext4_grp_locked_error(sb, group, 0, 0, + "%u blocks in bitmap, %u in gd", + free, grp->bb_free); + /* + * If we intent to continue, we consider group descritor + * corrupt and update bb_free using bitmap value + */ + grp->bb_free = free; + } + mb_set_largest_free_order(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + spin_lock(&EXT4_SB(sb)->s_bal_lock); + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); +} + +/* The buddy information is attached the buddy cache inode + * for convenience. The information regarding each group + * is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are + * stored in the inode as + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. + * So for each group we take up 2 blocks. A page can + * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. + * So it can have information regarding groups_per_page which + * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! 
+ */ + +static int ext4_mb_init_cache(struct page *page, char *incore) +{ + ext4_group_t ngroups; + int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh; + struct inode *inode; + char *data; + char *bitmap; + struct ext4_group_info *grinfo; + + mb_debug(1, "init page %lu\n", page->index); + + inode = page->mapping->host; + sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + err = -ENOMEM; + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kzalloc(i, GFP_NOFS); + if (bh == NULL) + goto out; + } else + bh = &bhs; + + first_group = page->index * blocks_per_page / 2; + + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext4_group_desc *desc; + + if (first_group + i >= ngroups) + break; + + grinfo = ext4_get_group_info(sb, first_group + i); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + + err = -EIO; + desc = ext4_get_group_desc(sb, first_group + i, NULL); + if (desc == NULL) + goto out; + + err = -ENOMEM; + bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); + if (bh[i] == NULL) + goto out; + + if (bitmap_uptodate(bh[i])) + continue; + + lock_buffer(bh[i]); + if (bitmap_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } + ext4_lock_group(sb, first_group + i); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + ext4_init_block_bitmap(sb, bh[i], + first_group + i, desc); + set_bitmap_uptodate(bh[i]); + set_buffer_uptodate(bh[i]); + ext4_unlock_group(sb, first_group + i); + unlock_buffer(bh[i]); + continue; + } + ext4_unlock_group(sb, first_group + i); + if (buffer_uptodate(bh[i])) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh[i]); + unlock_buffer(bh[i]); + continue; + } + get_bh(bh[i]); + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. 
+ */ + set_bitmap_uptodate(bh[i]); + bh[i]->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh[i]); + mb_debug(1, "read bitmap for group %u\n", first_group + i); + } + + /* wait for I/O completion */ + for (i = 0; i < groups_per_page; i++) + if (bh[i]) + wait_on_buffer(bh[i]); + + err = -EIO; + for (i = 0; i < groups_per_page; i++) + if (bh[i] && !buffer_uptodate(bh[i])) + goto out; + + err = 0; + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { + int group; + + group = (first_block + i) >> 1; + if (group >= ngroups) + break; + + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + + /* + * data carry information regarding this + * particular group in the format specified + * above + * + */ + data = page_address(page) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + /* + * We place the buddy block and bitmap block + * close together + */ + if ((first_block + i) & 1) { + /* this is block of buddy */ + BUG_ON(incore == NULL); + mb_debug(1, "put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo = ext4_get_group_info(sb, group); + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); + ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { + /* this is block of bitmap */ + BUG_ON(incore != NULL); + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); + + /* see comments in ext4_mb_put_pa() */ + ext4_lock_group(sb, group); + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ + ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be + * generated using this + */ + incore = data; + } + } + SetPageUptodate(page); + +out: + if (bh) { + for (i = 0; i < groups_per_page; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. 
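+ * Illustration (assuming 4k pages): with a 4k block size blocks_per_page is 1,
+ * so group 7's bitmap block lands in page 14 and its buddy block in page 15;
+ * with a 1k block size blocks_per_page is 4 and both land in page 3, at
+ * offsets 2 and 3 within that page.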
+ */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + + mb_debug(1, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + + if (e4b.bd_buddy_page == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! 
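+ * On success the caller must balance this with ext4_mb_unload_buddy() to drop
+ * the bitmap and buddy page references taken here.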
+ */ +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + int blocks_per_page; + int block; + int pnum; + int poff; + struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + + mb_debug(1, "load group %u\n", group); + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_sb = sb; + e4b->bd_group = group; + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + } + + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + mb_cmp_bitmaps(e4b, page_address(page) + + (poff * sb->s_blocksize)); + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_buddy_page = page; + e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + return 0; + +err: + if (page) + page_cache_release(page); + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; + return ret; +} + +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); +} + + +static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) +{ + int order = 1; + void *bb; + + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(block >= (1 << 
(e4b->bd_blkbits + 3))); + + bb = EXT4_MB_BUDDY(e4b); + while (order <= e4b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e4b->bd_blkbits - order); + order++; + } + return 0; +} + +static void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit(cur, bm); + cur++; + } +} + +static void mb_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit(cur, bm); + cur++; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int block = 0; + int max = 0; + int order; + void *buddy; + void *buddy2; + struct super_block *sb = e4b->bd_sb; + + BUG_ON(first + count > (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + + /* let's maintain fragments counter */ + if (first != 0) + block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); + if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) + max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); + if (block && max) + e4b->bd_info->bb_fragments--; + else if (!block && !max) + e4b->bd_info->bb_fragments++; + + /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += block; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); + } + mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); + e4b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e4b, order, &max); + + do { + block &= ~1UL; + if (mb_test_bit(block, buddy) || + mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ + buddy2 = mb_find_buddy(e4b, order + 1, &max); + + if (!buddy2) + break; + + if (order > 0) { + /* for special purposes, we don't set + * free bits in bitmap */ + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } + e4b->bd_info->bb_counters[order]--; + e4b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; + e4b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; + } while (1); + } + mb_set_largest_free_order(sb, e4b->bd_info); + mb_check_buddy(e4b); +} + +static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, + int needed, struct ext4_free_extent *ex) +{ + int next = block; + int max; + int ord; + void *buddy; + + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + BUG_ON(ex == NULL); + + buddy = mb_find_buddy(e4b, order, &max); + BUG_ON(buddy == NULL); + BUG_ON(block >= max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + /* FIXME dorp order completely ? 
*/ + if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; + } + + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_group = e4b->bd_group; + + /* calc difference from given start */ + next = next - ex->fe_start; + ex->fe_len -= next; + ex->fe_start += next; + + while (needed > ex->fe_len && + (buddy = mb_find_buddy(e4b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + break; + + ord = mb_find_order_for_block(e4b, next); + + order = ord; + block = next >> order; + ex->fe_len += 1 << order; + } + + BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + int ord; + int mlen = 0; + int max = 0; + int cur; + int start = ex->fe_start; + int len = ex->fe_len; + unsigned ret = 0; + int len0 = len; + void *buddy; + + BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); + BUG_ON(e4b->bd_group != ex->fe_group); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_mark_used_double(e4b, start, len); + + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; + + /* let's maintain fragments counter */ + if (start != 0) + mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + if (mlen && max) + e4b->bd_info->bb_fragments++; + else if (!mlen && !max) + e4b->bd_info->bb_fragments--; + + /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e4b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e4b, ord, &max); + BUG_ON((start >> ord) >= max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + BUG_ON(len < 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + /* we have to split large buddy */ + BUG_ON(ord <= 0); + buddy = mb_find_buddy(e4b, ord, &max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e4b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e4b->bd_info->bb_counters[ord]++; + e4b->bd_info->bb_counters[ord]++; + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_check_buddy(e4b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int ret; + + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; + ret = mb_mark_used(e4b, &ac->ac_b_ex); + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* + * take the page reference. 
We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ + ac->ac_bitmap_page = e4b->bd_bitmap_page; + get_page(ac->ac_bitmap_page); + ac->ac_buddy_page = e4b->bd_buddy_page; + get_page(ac->ac_buddy_page); + /* store last allocated for subsequent stream allocation */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + spin_lock(&sbi->s_md_lock); + sbi->s_mb_last_group = ac->ac_f_ex.fe_group; + sbi->s_mb_last_start = ac->ac_f_ex.fe_start; + spin_unlock(&sbi->s_md_lock); + } +} + +/* + * regular allocator, for general purposes allocation + */ + +static void ext4_mb_check_limits(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b, + int finish_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + struct ext4_free_extent ex; + int max; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + ac->ac_status = AC_STATUS_BREAK; + return; + } + + /* + * Haven't found good chunk so far, let's continue + */ + if (bex->fe_len < gex->fe_len) + return; + + if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) + && bex->fe_group == e4b->bd_group) { + /* recheck chunk's availability - we don't know + * when it was found (within this lock-unlock + * period or not) */ + max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + if (max >= gex->fe_len) { + ext4_mb_use_best_found(ac, e4b); + return; + } + } +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * FIXME: real allocation policy is to be designed yet! 
+ */ +static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, + struct ext4_free_extent *ex, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + BUG_ON(ex->fe_len <= 0); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + if (ex->fe_len < bex->fe_len) + *bex = *ex; + } + + ext4_mb_check_limits(ac, e4b, 0); +} + +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex = ac->ac_b_ex; + ext4_group_t group = ex.fe_group; + int max; + int err; + + BUG_ON(ex.fe_len <= 0); + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + ext4_group_t group = ac->ac_g_ex.fe_group; + int max; + int err; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent ex; + + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return 0; + + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + ext4_fsblk_t start; + + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; + /* use do_div to get remainder (would be 64-bit modulo) */ + if (do_div(start, sbi->s_stripe) == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + ext4_unlock_group(ac->ac_sb, group); + 
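/* group lock dropped; release the page references taken by ext4_mb_load_buddy() */ +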
ext4_mb_unload_buddy(e4b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_group_info *grp = e4b->bd_info; + void *buddy; + int i; + int k; + int max; + + BUG_ON(ac->ac_2order <= 0); + for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e4b, i, &max); + BUG_ON(buddy == NULL); + + k = mb_find_next_zero_bit(buddy, max, 0); + BUG_ON(k >= max); + + ac->ac_found++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e4b->bd_group; + + ext4_mb_use_best_found(ac, e4b); + + BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + + if (EXT4_SB(sb)->s_mb_stats) + atomic_inc(&EXT4_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. + */ +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT4_MB_BITMAP(e4b); + struct ext4_free_extent ex; + int i; + int free; + + free = e4b->bd_info->bb_free; + BUG_ON(free <= 0); + + i = e4b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, + EXT4_BLOCKS_PER_GROUP(sb), i); + if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + /* + * IF we have corrupt bitmap, we won't find any + * free blocks even though group info says we + * we have free blocks + */ + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " + "group info. But bitmap says 0", + free); + break; + } + + mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + BUG_ON(ex.fe_len <= 0); + if (free < ex.fe_len) { + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " + "group info. But got %d blocks", + free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. 
+ */ + break; + } + + ext4_mb_measure_extent(ac, &ex, e4b); + + i += ex.fe_len; + free -= ex.fe_len; + } + + ext4_mb_check_limits(ac, e4b, 1); +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size-multiple requests + */ +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + void *bitmap = EXT4_MB_BITMAP(e4b); + struct ext4_free_extent ex; + ext4_fsblk_t first_group_block; + ext4_fsblk_t a; + ext4_grpblk_t i; + int max; + + BUG_ON(sbi->s_stripe == 0); + + /* find first stripe-aligned block in group */ + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + + a = first_group_block + sbi->s_stripe - 1; + do_div(a, sbi->s_stripe); + i = (a * sbi->s_stripe) - first_group_block; + + while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + if (max >= sbi->s_stripe) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + break; + } + } + i += sbi->s_stripe; + } +} + +/* This is now called BEFORE we load the buddy bitmap. */ +static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + unsigned free, fragments; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < 0 || cr >= 4); + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + free = grp->bb_free; + fragments = grp->bb_fragments; + if (free == 0) + return 0; + if (fragments == 0) + return 0; + + switch (cr) { + case 0: + BUG_ON(ac->ac_2order == 0); + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + + return 1; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; + break; + case 3: + return 1; + default: + BUG(); + } + + return 0; +} + +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +{ + ext4_group_t ngroups, group, i; + int cr; + int err = 0; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ + err = ext4_mb_find_by_goal(ac, &e4b); + if (err || ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + /* + * ac->ac2_order is set only if the fe_len is a power of 2 + * if ac2_order is set we also set criteria to 0 so that we + * try exact allocation using buddy. 
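+ * (fls() below returns the position of the highest set bit in fe_len; + * masking that bit off and checking for zero is the power-of-two test)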
+ */ + i = fls(ac->ac_g_ex.fe_len); + ac->ac_2order = 0; + /* + * We search using buddy data only if the order of the request + * is greater than equal to the sbi_s_mb_order2_reqs + * You can tune it via /sys/fs/ext4//mb_order2_req + */ + if (i >= sbi->s_mb_order2_reqs) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = i - 1; + } + + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); + ac->ac_g_ex.fe_group = sbi->s_mb_last_group; + ac->ac_g_ex.fe_start = sbi->s_mb_last_start; + spin_unlock(&sbi->s_md_lock); + } + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything + */ +repeat: + for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + + for (i = 0; i < ngroups; group++, i++) { + if (group == ngroups) + group = 0; + + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) + continue; + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) + goto out; + + ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ + if (!ext4_mb_good_group(ac, group, cr)) { + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ac->ac_groups_scanned++; + if (cr == 0) + ext4_mb_simple_scan_group(ac, &e4b); + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + ext4_mb_scan_aligned(ac, &e4b); + else + ext4_mb_complex_scan_group(ac, &e4b); + + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + + ext4_mb_try_best_found(ac, &e4b); + if (ac->ac_status != AC_STATUS_FOUND) { + /* + * Someone more lucky has already allocated it. 
+ * The only thing we can do is just take first + * found block(s) + printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); + */ + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + cr = 3; + atomic_inc(&sbi->s_mb_lost_chunks); + goto repeat; + } + } +out: + return err; +} + +static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + ++*pos; + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; + ext4_grpblk_t counters[16]; + } sg; + + group--; + if (group == 0) + seq_printf(seq, "#%-5s: %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", + "group", "free", "frags", "first", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext4_group_info); + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, + sg.info.bb_fragments, sg.info.bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); + seq_printf(seq, " ]\n"); + + return 0; +} + +static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_mb_seq_groups_ops = { + .start = ext4_mb_seq_groups_start, + .next = ext4_mb_seq_groups_next, + .stop = ext4_mb_seq_groups_stop, + .show = ext4_mb_seq_groups_show, +}; + +static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE(inode)->data; + int rc; + + rc = seq_open(file, &ext4_mb_seq_groups_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = sb; + } + return rc; + +} + +static const struct file_operations ext4_mb_seq_groups_fops = { + .owner = THIS_MODULE, + .open = ext4_mb_seq_groups_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* Create and initialize ext4_group_info data for the given group. 
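Called for each group from ext4_mb_init_backend().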
*/ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + memset(meta_group_info[i], 0, kmem_cache_size(cachep)); + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_blks_count(sb, desc); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +static int ext4_mb_init_backend(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; + struct ext4_group_desc *desc; + struct kmem_cache *cachep; + + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. 
+ * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). + */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; + /* An 8TB filesystem with 64-bit pointers requires a 4096 byte + * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. + * So a two level scheme suffices for now. */ + sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + goto err_freesgi; + } + sbi->s_buddy_cache->i_ino = get_next_ino(); + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR + "EXT4-fs: can't read descriptor %u\n", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; + } + + return 0; + +err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = num_meta_group_infos; + while (i-- > 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); +err_freesgi: + kfree(sbi->s_group_info); + return -ENOMEM; +} + +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + ext4_groupinfo_caches[cache_index] = cachep; + + return 0; +} + +int ext4_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned i, j; + unsigned offset; + unsigned max; + int ret; + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { + ret = -ENOMEM; + goto out; + } + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == 
NULL) { + ret = -ENOMEM; + goto out; + } + + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + + /* order 0 is regular bitmap */ + sbi->s_mb_maxs[0] = sb->s_blocksize << 3; + sbi->s_mb_offsets[0] = 0; + + i = 1; + offset = 0; + max = sb->s_blocksize << 2; + do { + sbi->s_mb_offsets[i] = offset; + sbi->s_mb_maxs[i] = max; + offset += 1 << (sb->s_blocksize_bits - i); + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); + + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + goto out; + } + + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + ret = -ENOMEM; + goto out; + } + for_each_possible_cpu(i) { + struct ext4_locality_group *lg; + lg = per_cpu_ptr(sbi->s_locality_groups, i); + mutex_init(&lg->lg_mutex); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); + spin_lock_init(&lg->lg_prealloc_lock); + } + + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); + + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; +out: + if (ret) { + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + } + return ret; +} + +/* need to called with the ext4 group lock held */ +static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +{ + struct ext4_prealloc_space *pa; + struct list_head *cur, *tmp; + int count = 0; + + list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + list_del(&pa->pa_group_list); + count++; + kmem_cache_free(ext4_pspace_cachep, pa); + } + if (count) + mb_debug(1, "mballoc: %u PAs left\n", count); + +} + +int ext4_mb_release(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + int num_meta_group_infos; + struct ext4_group_info *grinfo; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { + grinfo = ext4_get_group_info(sb, i); +#ifdef DOUBLE_CHECK + kfree(grinfo->bb_bitmap); +#endif + ext4_lock_group(sb, i); + ext4_mb_cleanup_pa(grinfo); + ext4_unlock_group(sb, i); + kmem_cache_free(cachep, grinfo); + } + num_meta_group_infos = (ngroups + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); + kfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + if (sbi->s_buddy_cache) + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { + printk(KERN_INFO + "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + printk(KERN_INFO + "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost\n", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + 
atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + printk(KERN_INFO + "EXT4-fs: mballoc: %lu generated and it took %Lu\n", + sbi->s_mb_buddies_generated++, + sbi->s_mb_generation_time); + printk(KERN_INFO + "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } + + free_percpu(sbi->s_locality_groups); + + return 0; +} + +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t block, int count) +{ + ext4_fsblk_t discard_block; + + discard_block = block + ext4_group_first_block_no(sb, block_group); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); +} + +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + struct list_head *l, *ltmp; + + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); + + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->count, entry->group, entry); + + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); + + err = ext4_mb_load_buddy(sb, entry->group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->count; + count2++; + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->group); + kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_mb_unload_buddy(&e4b); + } + + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +} + +#ifdef CONFIG_EXT4_DEBUG +u8 mb_enable_debug __read_mostly; + +static struct dentry *debugfs_dir; +static struct dentry *debugfs_debug; + +static void __init ext4_create_debugfs_entry(void) +{ + debugfs_dir = debugfs_create_dir("ext4", NULL); + if (debugfs_dir) + debugfs_debug = debugfs_create_u8("mballoc-debug", + S_IRUGO | S_IWUSR, + debugfs_dir, + &mb_enable_debug); +} + +static void ext4_remove_debugfs_entry(void) +{ + debugfs_remove(debugfs_debug); + debugfs_remove(debugfs_dir); +} + +#else + +static void __init ext4_create_debugfs_entry(void) +{ +} + +static void ext4_remove_debugfs_entry(void) +{ +} + +#endif + +int __init ext4_init_mballoc(void) +{ + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); + if (ext4_pspace_cachep == NULL) + return -ENOMEM; + + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); + if (ext4_ac_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + return -ENOMEM; + } + + ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_ext_cachep == NULL) { + 
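/* creating the third cache failed: tear down the two caches created above */ +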
kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } + ext4_create_debugfs_entry(); + return 0; +} + +void ext4_exit_mballoc(void) +{ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); + ext4_groupinfo_destroy_slabs(); + ext4_remove_debugfs_entry(); +} + + +/* + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps + * Returns 0 if success or error code + */ +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_blks) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block; + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + + err = -EIO; + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); + if (!bitmap_bh) + goto out_err; + + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto out_err; + + err = -EIO; + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_blks_count(sb, gdp)); + + err = ext4_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + + len = ac->ac_b_ex.fe_len; + if (!ext4_data_block_valid(sbi, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata\n", block, block+len); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; + } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < ac->ac_b_ex.fe_len; i++) { + BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, + bitmap_bh->b_data)); + } + } +#endif + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); + } + len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_blks_set(sb, gdp, len); + gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. 
Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_blocks); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + +out_err: + ext4_mark_super_dirty(sb); + brelse(bitmap_bh); + return err; +} + +/* + * here we normalize request for locality group + * Group request are normalized to s_strip size if we set the same via mount + * option. If not we set it to s_mb_group_prealloc which can be configured via + * /sys/fs/ext4//mb_group_prealloc + * + * XXX: should we try to preallocate more than the group has now? + */ +static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); + if (EXT4_SB(sb)->s_stripe) + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; + else + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(1, "#%u: goal %u blocks for locality group\n", + current->pid, ac->ac_g_ex.fe_len); +} + +/* + * Normalization means making request better in terms of + * size and alignment + */ +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + int bsbits, max; + ext4_lblk_t end; + loff_t size, orig_size, start_off; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests + do not need preallocation */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + /* sometime caller may want exact blocks */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + /* caller may indicate that preallocation isn't + * required (it's a tail, for example) */ + if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) + return; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { + ext4_mb_normalize_group_request(ac); + return ; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + + /* first, let's learn actual file size + * given current request is allocated */ + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + orig_size = size; + + /* max size of free chunks */ + max = 2 << bsbits; + +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) + + /* first, try to predict filesize */ + /* XXX: should this table be tunable? 
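Each branch below rounds the predicted size up to the next bucket (16K through 1M), or, for larger requests, picks a 2M/4M/8M chunk with start_off aligned to the chunk size.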
*/ + start_off = 0; + if (size <= 16 * 1024) { + size = 16 * 1024; + } else if (size <= 32 * 1024) { + size = 32 * 1024; + } else if (size <= 64 * 1024) { + size = 64 * 1024; + } else if (size <= 128 * 1024) { + size = 128 * 1024; + } else if (size <= 256 * 1024) { + size = 256 * 1024; + } else if (size <= 512 * 1024) { + size = 512 * 1024; + } else if (size <= 1024 * 1024) { + size = 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (22 - bsbits)) << 22; + size = 4 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + (8<<20)>>bsbits, max, 8 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (23 - bsbits)) << 23; + size = 8 * 1024 * 1024; + } else { + start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; + size = ac->ac_o_ex.fe_len << bsbits; + } + size = size >> bsbits; + start = start_off >> bsbits; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { + size -= ar->lleft + 1 - start; + start = ar->lleft + 1; + } + if (ar->pright && start + size - 1 >= ar->lright) + size -= start + size - ar->lright; + + end = start + size; + + /* check we don't cross already preallocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + + if (pa->pa_deleted) + continue; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + pa_end = pa->pa_lstart + pa->pa_len; + + /* PA must not overlap original request */ + BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || + ac->ac_o_ex.fe_logical < pa->pa_lstart)); + + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { + spin_unlock(&pa->pa_lock); + continue; + } + BUG_ON(pa->pa_lstart <= start && pa_end >= end); + + /* adjust start or end to be adjacent to this pa */ + if (pa_end <= ac->ac_o_ex.fe_logical) { + BUG_ON(pa_end < start); + start = pa_end; + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + BUG_ON(pa->pa_lstart > end); + end = pa->pa_lstart; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + size = end - start; + + /* XXX: extra loop to check we really don't overlap preallocations */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0) { + pa_end = pa->pa_lstart + pa->pa_len; + BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + if (start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical) { + printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + + /* XXX: is it better to align blocks WRT to logical + * placement or satisfy big request as is */ + ac->ac_g_ex.fe_logical = start; + ac->ac_g_ex.fe_len = size; + + /* define goal start in order to merge */ + if (ar->pright && (ar->lright == (start + size))) { + /* merge to the right */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - 
size, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + if (ar->pleft && (ar->lleft + 1 == start)) { + /* merge to the left */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, + (unsigned) orig_size, (unsigned) start); +} + +static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) + atomic_inc(&sbi->s_bal_breaks); + } + + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); +} + +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + +/* + * use blocks preallocated to inode + */ +static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + ext4_fsblk_t start; + ext4_fsblk_t end; + int len; + + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); + len = end - start; + ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + BUG_ON(start < pa->pa_pstart); + BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(pa->pa_free < len); + pa->pa_free -= len; + + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); +} + +/* + * use blocks preallocated to locality group + */ +static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + unsigned int len = ac->ac_o_ex.fe_len; + + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, + &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + /* we don't correct pa_pstart or pa_plen here to avoid + * possible race when the group is being loaded concurrently + * instead we correct pa later, after blocks are marked + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex + */ + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. 
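+ * A reference is held on whichever pa is returned; when a closer pa + * replaces the previous candidate, the old reference is dropped.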
+ */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance <= new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* + * search goal blocks in preallocated space + */ +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +{ + int order, i; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; + + /* only data can be preallocated */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return 0; + + /* first, try per-file preallocation */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + + /* all fields in this condition don't change, + * so we can skip locking for them */ + if (ac->ac_o_ex.fe_logical < pa->pa_lstart || + ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + continue; + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + continue; + + /* found preallocated blocks, use them */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && pa->pa_free) { + atomic_inc(&pa->pa_count); + ext4_mb_use_inode_pa(ac, pa); + spin_unlock(&pa->pa_lock); + ac->ac_criteria = 10; + rcu_read_unlock(); + return 1; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + /* can we use group allocation? */ + if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) + return 0; + + /* inode may have no locality group for some reason */ + lg = ac->ac_lg; + if (lg == NULL) + return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } + return 0; +} + +/* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. 
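+ * (such blocks sit on the group's bb_free_root rb-tree until the + * transaction that freed them commits)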
+ * buddy must be generated from this bitmap + * Need to be called with the ext4 group lock held + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, node); + mb_set_bits(bitmap, entry->start_blk, entry->count); + n = rb_next(n); + } + return; +} + +/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; + int len; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here + * notice we do NOT ignore preallocations with pa_deleted + * otherwise we could leave used blocks available for + * allocation in buddy when concurrent ext4_mb_put_pa() + * is dropping preallocation + */ + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); + if (unlikely(len == 0)) + continue; + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; + count++; + } + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); +} + +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + kmem_cache_free(ext4_pspace_cachep, pa); +} + +/* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed + */ +static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + struct super_block *sb, struct ext4_prealloc_space *pa) +{ + ext4_group_t grp; + ext4_fsblk_t grp_blk; + + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) + return; + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } + + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + + /* + * possible race: + * + * P1 (buddy init) P2 (regular allocation) + * find block B in PA + * copy on-disk bitmap to buddy + * mark B in on-disk bitmap + * drop PA from group + * mark all PAs in buddy + * + * thus, P1 initializes buddy with B available. 
to prevent this + * we make "copy" and "mark all PAs" atomic and serialize "drop PA" + * against that pair + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +} + +/* + * creates new preallocated space for given inode + */ +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + struct ext4_inode_info *ei; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + int winl; + int wins; + int win; + int offs; + + /* we can't allocate as much as normalizer wants. + * so, found space must get proper lstart + * to cover original request */ + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); + + /* we're limited by original request in that + * logical block must be covered any way + * winl is window we can move our chunk within */ + winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + + /* also, we should cover whole original request */ + wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + + /* the smallest one defines real window */ + win = min(winl, wins); + + offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + if (offs && offs < win) + win = offs; + + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + } + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_lstart = ac->ac_b_ex.fe_logical; + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); + + ext4_mb_use_inode_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + + pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_obj_lock); + + return 0; +} + +/* + * creates new preallocated space for locality group inodes belongs to + */ +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status 
!= AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_lstart = pa->pa_pstart; + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; + + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); + + ext4_mb_use_group_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + lg = ac->ac_lg; + BUG_ON(lg == NULL); + + pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_inode = NULL; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ + return 0; +} + +static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +{ + int err; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + err = ext4_mb_new_group_pa(ac); + else + err = ext4_mb_new_inode_pa(ac); + return err; +} + +/* + * finds all unused blocks in on-disk bitmap, frees them in + * in-core bitmap and buddy. + * @pa must be unlinked from inode and group lists, so that + * nobody else can find/use it. + * the caller MUST hold group/inode locks. + * TODO: optimize the case when there are no in-core structures yet + */ +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int end; + unsigned int next; + ext4_group_t group; + ext4_grpblk_t bit; + unsigned long long grp_blk_start; + int err = 0; + int free = 0; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + end = bit + pa->pa_len; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); + if (bit >= end) + break; + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); + mb_debug(1, " free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); + free += next - bit; + + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + next - bit); + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } + if (free != pa->pa_free) { + printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. 
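+ * (the mismatch itself was reported by ext4_grp_locked_error() above)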
+ */ + } + atomic_add(free, &sbi->s_mb_discarded); + + return err; +} + +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + ext4_group_t group; + ext4_grpblk_t bit; + + trace_ext4_mb_release_group_pa(pa); + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); + atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); + + return 0; +} + +/* + * releases all preallocations in given group + * + * first, we need to decide discard policy: + * - when do we discard + * 1) ENOSPC + * - how many do we discard + * 1) how many requested + */ +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, + ext4_group_t group, int needed) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + struct list_head list; + struct ext4_buddy e4b; + int err; + int busy = 0; + int free = 0; + + mb_debug(1, "discard preallocation for group %u\n", group); + + if (list_empty(&grp->bb_prealloc_list)) + return 0; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", group); + return 0; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", group); + put_bh(bitmap_bh); + return 0; + } + + if (needed == 0) + needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + + INIT_LIST_HEAD(&list); +repeat: + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); + busy = 1; + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + + /* we can trust pa_free ... */ + free += pa->pa_free; + + spin_unlock(&pa->pa_lock); + + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } + + /* if we still need more blocks and some PAs were used, try again */ + if (free < needed && busy) { + busy = 0; + ext4_unlock_group(sb, group); + /* + * Yield the CPU here so that we don't get soft lockup + * in non preempt case. + */ + yield(); + goto repeat; + } + + /* found anything to free? */ + if (list_empty(&list)) { + BUG_ON(free != 0); + goto out; + } + + /* now free all selected PAs */ + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + + /* remove from object (inode or locality group) */ + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + if (pa->pa_type == MB_GROUP_PA) + ext4_mb_release_group_pa(&e4b, pa); + else + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + return free; +} + +/* + * releases all non-used preallocated blocks for given inode + * + * It's important to discard preallocations under i_data_sem + * We don't want another block to be served from the prealloc + * space when we are discarding the inode prealloc space. + * + * FIXME!! 
Make sure it is valid at all the call sites + */ +void ext4_discard_preallocations(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct super_block *sb = inode->i_sb; + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + ext4_group_t group = 0; + struct list_head list; + struct ext4_buddy e4b; + int err; + + if (!S_ISREG(inode->i_mode)) { + /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ + return; + } + + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + trace_ext4_discard_preallocations(inode); + + INIT_LIST_HEAD(&list); + +repeat: + /* first, collect all pa's in the inode */ + spin_lock(&ei->i_prealloc_lock); + while (!list_empty(&ei->i_prealloc_list)) { + pa = list_entry(ei->i_prealloc_list.next, + struct ext4_prealloc_space, pa_inode_list); + BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* this shouldn't happen often - nobody should + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + printk(KERN_ERR "uh-oh! used pa while discarding\n"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; + + } + if (pa->pa_deleted == 0) { + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &list); + continue; + } + + /* someone is deleting pa right now */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + + /* we have to wait here because pa_deleted + * doesn't mean pa is already unlinked from + * the list. as we might be called from + * ->clear_inode() the inode will get freed + * and concurrent thread which is unlinking + * pa from inode's list may access already + * freed memory, bad-bad-bad */ + + /* XXX: if this happens too often, we can + * add a flag to force wait only in case + * of ->clear_inode(), but not in case of + * regular truncate */ + schedule_timeout_uninterruptible(HZ); + goto repeat; + } + spin_unlock(&ei->i_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + BUG_ON(pa->pa_type != MB_INODE_PA); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +#ifdef CONFIG_EXT4_DEBUG +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + ext4_group_t ngroups, i; + + if (!mb_enable_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + printk(KERN_ERR "EXT4-fs: Can't allocate:" + " Allocation context details:\n"); + printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ac->ac_status, ac->ac_flags); + printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d\n", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned 
long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, + ac->ac_found); + printk(KERN_ERR "EXT4-fs: groups: \n"); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); + } + ext4_unlock_group(sb, i); + + if (grp->bb_free == 0) + continue; + printk(KERN_ERR "%u: %d/%d \n", + i, grp->bb_free, grp->bb_fragments); + } + printk(KERN_ERR "\n"); +} +#else +static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + return; +} +#endif + +/* + * We use locality group preallocation for small size file. The size of the + * file is determined by the current size or the resulting size after + * allocation which ever is larger + * + * One can tune this size via /sys/fs/ext4//mb_stream_req + */ +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + + /* don't use group allocation for large files */ + size = max(size, isize); + if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having + * per cpu locality group is to reduce the contention between block + * request from multiple CPUs. 
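+ * (__this_cpu_ptr() below selects the locality group of the CPU running + * this allocation)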
+ */ + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); + + /* we're going to use group allocation */ + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; + + /* serialize all allocations in the group */ + mutex_lock(&ac->ac_lg->lg_mutex); +} + +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t group; + unsigned int len; + ext4_fsblk_t goal; + ext4_grpblk_t block; + + /* we can't allocate > group size */ + len = ar->len; + + /* just a dirty hack to filter too big requests */ + if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) + len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + + /* start searching from the goal */ + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + ext4_get_group_no_and_offset(sb, goal, &group, &block); + + /* set up allocation goals */ + memset(ac, 0, sizeof(struct ext4_allocation_context)); + ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_group = group; + ac->ac_o_ex.fe_start = block; + ac->ac_o_ex.fe_len = len; + ac->ac_g_ex.fe_logical = ar->logical; + ac->ac_g_ex.fe_group = group; + ac->ac_g_ex.fe_start = block; + ac->ac_g_ex.fe_len = len; + ac->ac_flags = ar->flags; + + /* we have to define context: we'll we work with a file or + * locality group. this is a policy, actually */ + ext4_mb_group_or_file(ac); + + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + "left: %u/%u, right %u/%u to %swritable\n", + (unsigned) ar->len, (unsigned) ar->logical, + (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, + (unsigned) ar->lleft, (unsigned) ar->pleft, + (unsigned) ar->lright, (unsigned) ar->pright, + atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + return 0; + +} + +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(1, "discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. 
+ */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + rcu_read_unlock(); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + +/* + * release all resource we used in allocation + */ +static int ext4_mb_release_context(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { + /* see comment in ext4_mb_use_group_pa() */ + spin_lock(&pa->pa_lock); + pa->pa_pstart += ac->ac_b_ex.fe_len; + pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } + } + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. 
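As a side note, the bucket index used for lg_prealloc_list above is simply the position of the highest set bit of the free-block count, capped at the table size. A self-contained sketch of that bucketing (illustrative names, plain C, not the kernel's fls() helper):

#include <stdio.h>

#define PREALLOC_TB_SIZE 10   /* size of the lg_prealloc_list hash */

/* Portable stand-in for fls(): position of the highest set bit,
 * counting from 1; returns 0 for an all-zero value. */
static int fls_u32(unsigned int x)
{
        int r = 0;
        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

/* A preallocation with pa_free free blocks lands in bucket fls(pa_free)-1,
 * clamped so very large preallocations share the last bucket. */
static int pa_bucket(unsigned int pa_free)
{
        int order = fls_u32(pa_free) - 1;
        if (order < 0)
                order = 0;
        if (order > PREALLOC_TB_SIZE - 1)
                order = PREALLOC_TB_SIZE - 1;
        return order;
}

int main(void)
{
        printf("%d %d %d\n", pa_bucket(1), pa_bucket(200), pa_bucket(4096));
        /* prints "0 7 9" -- 4096 would be order 12 but is capped at 9 */
        return 0;
}

Each bucket's list is then allowed to grow to eight entries before ext4_mb_discard_lg_preallocations trims it back towards five, so the discard path is not entered on every insertion.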
+ */ + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } + if (ac->ac_bitmap_page) + page_cache_release(ac->ac_bitmap_page); + if (ac->ac_buddy_page) + page_cache_release(ac->ac_buddy_page); + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + mutex_unlock(&ac->ac_lg->lg_mutex); + ext4_mb_collect_stats(ac); + return 0; +} + +static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) +{ + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int ret; + int freed = 0; + + trace_ext4_mb_discard_preallocations(sb, needed); + for (i = 0; i < ngroups && needed > 0; i++) { + ret = ext4_mb_discard_group_preallocations(sb, i, needed); + freed += ret; + needed -= ret; + } + + return freed; +} + +/* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back + * to usual allocation + */ +ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_blks = 0; + + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); + + trace_ext4_request_blocks(ar); + + /* + * For delayed allocation, we could skip the ENOSPC and + * EDQUOT check, as blocks and quotas have been already + * reserved when data being copied into pagecache. + */ + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + else { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. 
+ */ + while (ar->len && + ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + yield(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_blks = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, ar->len); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, ar->len)) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } + } + + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (!ac) { + ar->len = 0; + *errp = -ENOMEM; + goto out; + } + + *errp = ext4_mb_initialize_context(ac, ar); + if (*errp) { + ar->len = 0; + goto out; + } + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); +repeat: + /* allocate space in core */ + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto errout; + + /* as we've just preallocated more space than + * user requested orinally, we store allocated + * space in a special descriptor */ + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) + errout: + ext4_discard_allocated_blocks(ac); + else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } + } else { + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + goto repeat; + *errp = -ENOSPC; + } + + if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } + ext4_mb_release_context(ac); +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, inquota - ar->len); + if (!ar->len) { + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + reserv_blks); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); + + return block; +} + +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. 
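Before the helper itself, the merge rule can be shown with a tiny stand-alone model (simplified struct and names, for illustration only): two pending free extents coalesce only when they were freed in the same transaction, belong to the same block group, and are physically adjacent.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct ext4_free_data. */
struct pending_free {
        unsigned int tid;     /* transaction that freed the blocks  */
        unsigned int group;   /* block group the extent belongs to  */
        unsigned int start;   /* first block, relative to the group */
        unsigned int count;   /* number of blocks                   */
};

/* Mirror of the rule: same tid, same group, physically contiguous. */
static bool mergeable(const struct pending_free *a, const struct pending_free *b)
{
        return a->tid == b->tid &&
               a->group == b->group &&
               a->start + a->count == b->start;
}

int main(void)
{
        struct pending_free a = { 7, 3, 100, 16 };
        struct pending_free b = { 7, 3, 116, 8 };
        struct pending_free c = { 8, 3, 124, 4 };   /* different transaction */
        printf("%d %d\n", mergeable(&a, &b), mergeable(&b, &c));   /* 1 0 */
        return 0;
}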
+ */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; +} + +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) +{ + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t block; + struct ext4_free_data *entry; + struct ext4_group_info *db = e4b->bd_info; + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + BUG_ON(!ext4_handle_valid(handle)); + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + new_node = &new_entry->node; + block = new_entry->start_blk; + + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + block, + "Block already on to-be-freed list"); + return 0; + } + } + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); + } + } + + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); + } + } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); + return 0; +} + +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @flags: flags used by ext4_free_blocks + */ +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; + unsigned long freed = 0; + unsigned int overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; + ext4_group_t block_group; + struct ext4_sb_info *sbi; + struct ext4_buddy e4b; + int err = 0; + int ret; + + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } + + sbi = EXT4_SB(sb); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + 
!ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); + goto error_return; + } + + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + if (unlikely(!tbh)) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + +do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + gdp = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!gdp) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, gdp), block, count) || + in_range(ext4_inode_bitmap(sb, gdp), block, count) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { + + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count; i++) + BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + trace_ext4_mballoc_free(sb, inode, block_group, bit, count); + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + if (!new_entry) { + ext4_mb_unload_buddy(&e4b); + err = -ENOMEM; + goto error_return; + } + new_entry->start_blk = bit; + new_entry->group = block_group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + ext4_mb_free_metadata(handle, &e4b, new_entry); + } else { + /* need to update group_info->bb_free and bitmap + * with group lock held. 
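The do_more loop above handles the case where one free request straddles a block-group boundary: the part that overflows into the next group is carried over and freed on the following pass. A compact userspace model of that splitting (illustrative constant and types, not the kernel code):

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768ULL   /* illustrative; the real value comes from the sb */

/* Walk a (block, count) range and emit one chunk per block group,
 * the same way the do_more loop re-enters with the overflow part. */
static void free_range(unsigned long long block, unsigned long long count)
{
        while (count) {
                unsigned long long group = block / BLOCKS_PER_GROUP;
                unsigned long long bit   = block % BLOCKS_PER_GROUP;
                unsigned long long chunk = count;

                if (bit + chunk > BLOCKS_PER_GROUP)
                        chunk = BLOCKS_PER_GROUP - bit;   /* overflow goes to the next pass */

                printf("group %llu: free bit %llu, %llu blocks\n", group, bit, chunk);

                block += chunk;
                count -= chunk;
        }
}

int main(void)
{
        free_range(32760, 20);   /* 8 blocks in group 0, then 12 blocks in group 1 */
        return 0;
}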
generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(inode, &e4b, bit, count); + } + + ret = ext4_free_blks_count(sb, gdp) + count; + ext4_free_blks_set(sb, gdp, ret); + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, count); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + } + + ext4_mb_unload_buddy(&e4b); + + freed += count; + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + put_bh(bitmap_bh); + goto do_more; + } + ext4_mark_super_dirty(sb); +error_return: + if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, freed); + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** + * ext4_add_groupblocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + goto error_return; + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. 
Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static void ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex; + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @e4b: ext4 buddy + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. 
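The scan described here is essentially "find the next run of clear bits at least minblocks long". A minimal sketch of that search over a byte-array bitmap (illustrative helpers, not the kernel's mb_find_next_* functions):

#include <stdio.h>

/* Return 1 if bit 'n' is set in a byte-array bitmap. */
static int test_bit(const unsigned char *map, unsigned int n)
{
        return (map[n / 8] >> (n % 8)) & 1;
}

/* Walk [start, max) and report every run of clear (free) bits whose
 * length is at least minlen -- the candidates that would be trimmed. */
static void scan_free_runs(const unsigned char *map, unsigned int start,
                           unsigned int max, unsigned int minlen)
{
        while (start < max) {
                unsigned int run_start, run_len;

                while (start < max && test_bit(map, start))
                        start++;                        /* skip used blocks     */
                run_start = start;
                while (start < max && !test_bit(map, start))
                        start++;                        /* measure the free run */
                run_len = start - run_start;
                if (run_len >= minlen)
                        printf("trim candidate: start %u, len %u\n",
                               run_start, run_len);
        }
}

int main(void)
{
        /* bits 0-3 used, 4-11 free, 12 used, 13-15 free */
        unsigned char map[2] = { 0x0F, 0x10 };
        scan_free_runs(map, 0, 16, 4);   /* reports only the 8-block run */
        return 0;
}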
Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned. + */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0; + struct ext4_buddy e4b; + int ret; + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + + while (start < max) { + start = mb_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = mb_find_next_bit(bitmap, max, start); + + if ((next - start) >= minblocks) { + ext4_trim_extent(sb, start, + next - start, group, &e4b); + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b.bd_info->bb_free - count) < minblocks) + break; + } + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + return count; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. + */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_group_info *grp; + ext4_group_t first_group, last_group; + ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_grpblk_t cnt = 0, first_block, last_block; + uint64_t start, len, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + + if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start < first_data_blk) { + len -= first_data_blk - start; + start = first_data_blk; + } + + /* Determine first and last group to examine based on start and len */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_block); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT4_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; + } + + /* + * For all the groups except the last one, last block will + * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case start + + * len < EXT4_BLOCKS_PER_GROUP(sb). 
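For orientation, the conversion from an fstrim_range given in bytes to block numbers and group indices can be sketched as below (illustrative constants; the kernel uses sb->s_blocksize_bits and ext4_get_group_no_and_offset, and additionally clamps the start to s_first_data_block):

#include <stdio.h>

#define BLOCK_BITS        12          /* 4 KiB blocks, illustrative */
#define BLOCKS_PER_GROUP  32768ULL    /* illustrative               */

int main(void)
{
        unsigned long long start_byte = 1ULL << 30;   /* trim from 1 GiB ... */
        unsigned long long len_byte   = 1ULL << 30;   /* ... for 1 GiB       */

        unsigned long long start = start_byte >> BLOCK_BITS;
        unsigned long long len   = len_byte >> BLOCK_BITS;

        unsigned long long first_group = start / BLOCKS_PER_GROUP;
        unsigned long long first_block = start % BLOCKS_PER_GROUP;
        unsigned long long last_group  = (start + len) / BLOCKS_PER_GROUP;

        printf("groups %llu..%llu, first in-group block %llu\n",
               first_group, last_group, first_block);   /* groups 8..16, block 0 */
        return 0;
}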
+ */ + if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) + last_block = first_block + len; + len -= last_block - first_block; + + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_block, + last_block, minlen); + if (cnt < 0) { + ret = cnt; + break; + } + } + trimmed += cnt; + first_block = 0; + } + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 00000000..20b5e7bf --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,222 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#ifdef CONFIG_EXT4_DEBUG +extern u8 mb_enable_debug; + +#define mb_debug(n, fmt, a...) \ + do { \ + if ((n) <= mb_enable_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __func__); \ + printk(fmt, ## a); \ + } \ + } while (0) +#else +#define mb_debug(n, fmt, a...) +#endif + +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 0 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4//stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + + +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; + + /* this links the free block information from ext4_sb_info */ + struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; +}; + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. 
inode or group */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; + ext4_group_t fe_group; + ext4_grpblk_t fe_len; +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (normalized ac_o_ex) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the bext found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; +#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) +#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; +} +#endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c new file mode 100644 index 00000000..b57b98fb --- /dev/null +++ b/fs/ext4/migrate.c @@ -0,0 +1,634 @@ +/* + * Copyright IBM Corporation, 2007 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +/* + * The contiguous blocks details which can be + * represented by a single extent + */ +struct list_blocks_struct { + ext4_lblk_t first_block, last_block; + ext4_fsblk_t first_pblock, last_pblock; +}; + +static int finish_range(handle_t *handle, struct inode *inode, + struct list_blocks_struct *lb) + +{ + int retval = 0, needed; + struct ext4_extent newext; + struct ext4_ext_path *path; + if (lb->first_pblock == 0) + return 0; + + /* Add the extent to temp inode*/ + newext.ee_block = cpu_to_le32(lb->first_block); + newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); + ext4_ext_store_pblock(&newext, lb->first_pblock); + path = ext4_ext_find_extent(inode, lb->first_block, NULL); + + if (IS_ERR(path)) { + retval = PTR_ERR(path); + path = NULL; + goto err_out; + } + + /* + * Calculate the credit needed to inserting this extent + * Since we are doing this in loop we may accumalate extra + * credit. But below we try to not accumalate too much + * of them by restarting the journal. + */ + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); + + /* + * Make sure the credit we accumalated is not really high + */ + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } else if (needed) { + retval = ext4_journal_extend(handle, needed); + if (retval) { + /* + * IF not able to extend the journal restart the journal + */ + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } + } + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); +err_out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + lb->first_pblock = 0; + return retval; +} + +static int update_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t blk_num, + struct list_blocks_struct *lb) +{ + int retval; + /* + * See if we can add on to the existing range (if it exists) + */ + if (lb->first_pblock && + (lb->last_pblock+1 == pblock) && + (lb->last_block+1 == blk_num)) { + lb->last_pblock = pblock; + lb->last_block = blk_num; + return 0; + } + /* + * Start a new range. 
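The two helpers above implement a classic run-coalescing pattern: consecutive (logical, physical) block pairs are accumulated into one range, and the range is flushed as a single extent as soon as contiguity breaks. A self-contained sketch of the same pattern (illustrative names, not the migrate code itself):

#include <stdio.h>

struct range {
        unsigned int first_lblk, last_lblk;
        unsigned long long first_pblk, last_pblk;
        int valid;
};

static void flush_range(struct range *r)
{
        if (!r->valid)
                return;
        printf("extent: lblk %u, pblk %llu, len %u\n",
               r->first_lblk, r->first_pblk, r->last_lblk - r->first_lblk + 1);
        r->valid = 0;
}

/* Extend the current range if the new mapping is contiguous in both the
 * logical and the physical space, otherwise flush it and start anew. */
static void add_mapping(struct range *r, unsigned int lblk, unsigned long long pblk)
{
        if (r->valid && r->last_pblk + 1 == pblk && r->last_lblk + 1 == lblk) {
                r->last_pblk = pblk;
                r->last_lblk = lblk;
                return;
        }
        flush_range(r);
        r->first_lblk = r->last_lblk = lblk;
        r->first_pblk = r->last_pblk = pblk;
        r->valid = 1;
}

int main(void)
{
        struct range r = { 0 };
        unsigned long long pblks[] = { 100, 101, 102, 500, 501 };

        for (unsigned int i = 0; i < 5; i++)
                add_mapping(&r, i, pblks[i]);
        flush_range(&r);        /* the migrate code calls finish_range() for this */
        return 0;
}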
+ */ + retval = finish_range(handle, inode, lb); + lb->first_pblock = lb->last_pblock = pblock; + lb->first_block = lb->last_block = blk_num; + + return retval; +} + +static int update_ind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries; + return 0; + } + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++, blk_count++) { + if (i_data[i]) { + retval = update_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + blk_count, lb); + if (retval) + break; + } + } + + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int update_dind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries * max_entries; + return 0; + } + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_ind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + &blk_count, lb); + if (retval) + break; + } else { + /* Only update the file block number */ + blk_count += max_entries; + } + } + + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int update_tind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries * max_entries * max_entries; + return 0; + } + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_dind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + &blk_count, lb); + if (retval) + break; + } else + /* Only update the file block number */ + blk_count += max_entries * max_entries; + } + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) +{ + int retval = 0, needed; + + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + /* + * We are freeing a blocks. During this we touch + * superblock, group descriptor and block bitmap. + * So allocate a credit of 3. We may update + * quota (user and group). 
+ */ + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + if (ext4_journal_extend(handle, needed) != 0) + retval = ext4_journal_restart(handle, needed); + + return retval; +} + +static int free_dind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_tind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i, retval = 0; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + retval = free_dind_blocks(handle, + inode, tmp_idata[i]); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) +{ + int retval; + + /* ei->i_data[EXT4_IND_BLOCK] */ + if (i_data[0]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + + /* ei->i_data[EXT4_DIND_BLOCK] */ + if (i_data[1]) { + retval = free_dind_blocks(handle, inode, i_data[1]); + if (retval) + return retval; + } + + /* ei->i_data[EXT4_TIND_BLOCK] */ + if (i_data[2]) { + retval = free_tind_blocks(handle, inode, i_data[2]); + if (retval) + return retval; + } + return 0; +} + +static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, + struct inode *tmp_inode) +{ + int retval; + __le32 i_data[3]; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); + + /* + * One credit accounted for writing the + * i_data field of the original inode + */ + retval = ext4_journal_extend(handle, 1); + if (retval) { + retval = ext4_journal_restart(handle, 1); + if (retval) + goto err_out; + } + + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; + + down_write(&EXT4_I(inode)->i_data_sem); + /* + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + /* + * We have the extent map build with the tmp inode. 
+ * Now copy the i_data across + */ + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); + + /* + * Update i_blocks with the new blocks that got + * allocated while adding extents for extent index + * blocks. + * + * While converting to extents we need not + * update the orignal inode i_blocks for extent blocks + * via quota APIs. The quota update happened via tmp_inode already. + */ + spin_lock(&inode->i_lock); + inode->i_blocks += tmp_inode->i_blocks; + spin_unlock(&inode->i_lock); + up_write(&EXT4_I(inode)->i_data_sem); + + /* + * We mark the inode dirty after, because we decrement the + * i_blocks when freeing the indirect meta-data blocks + */ + retval = free_ind_block(handle, inode, i_data); + ext4_mark_inode_dirty(handle, inode); + +err_out: + return retval; +} + +static int free_ext_idx(handle_t *handle, struct inode *inode, + struct ext4_extent_idx *ix) +{ + int i, retval = 0; + ext4_fsblk_t block; + struct buffer_head *bh; + struct ext4_extent_header *eh; + + block = ext4_idx_pblock(ix); + bh = sb_bread(inode->i_sb, block); + if (!bh) + return -EIO; + + eh = (struct ext4_extent_header *)bh->b_data; + if (eh->eh_depth != 0) { + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + break; + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return retval; +} + +/* + * Free the extent meta data blocks only + */ +static int free_ext_block(handle_t *handle, struct inode *inode) +{ + int i, retval = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; + struct ext4_extent_idx *ix; + if (eh->eh_depth == 0) + /* + * No extra blocks allocated for extent meta data + */ + return 0; + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + return retval; + } + return retval; + +} + +int ext4_ext_migrate(struct inode *inode) +{ + handle_t *handle; + int retval = 0, i; + __le32 *i_data; + ext4_lblk_t blk_count = 0; + struct ext4_inode_info *ei; + struct inode *tmp_inode = NULL; + struct list_blocks_struct lb; + unsigned long max_entries; + __u32 goal; + + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) + /* + * don't migrate fast symlink + */ + return retval; + + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + 1); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + return retval; + } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, NULL, goal); + if (IS_ERR(tmp_inode)) { + retval = -ENOMEM; + ext4_journal_stop(handle); + return retval; + } + i_size_write(tmp_inode, i_size_read(inode)); + /* + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. 
+ */ + tmp_inode->i_nlink = 0; + + ext4_ext_tree_init(handle, tmp_inode); + ext4_orphan_add(handle, tmp_inode); + ext4_journal_stop(handle); + + /* + * start with one credit accounted for + * superblock modification. + * + * For the tmp_inode we already have committed the + * trascation that created the inode. Later as and + * when we add extents we extent the journal + */ + /* + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just rollback in-core changes and live other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } + + ei = EXT4_I(inode); + i_data = ei->i_data; + memset(&lb, 0, sizeof(lb)); + + /* 32 bit block address 4 bytes */ + max_entries = inode->i_sb->s_blocksize >> 2; + for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + if (i_data[i]) { + retval = update_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[i]), + blk_count, &lb); + if (retval) + goto err_out; + } + } + if (i_data[EXT4_IND_BLOCK]) { + retval = update_ind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_IND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } else + blk_count += max_entries; + if (i_data[EXT4_DIND_BLOCK]) { + retval = update_dind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } else + blk_count += max_entries * max_entries; + if (i_data[EXT4_TIND_BLOCK]) { + retval = update_tind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } + /* + * Build the last extent + */ + retval = finish_range(handle, tmp_inode, &lb); +err_out: + if (retval) + /* + * Failure case delete the extent information with the + * tmp_inode + */ + free_ext_block(handle, tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } + + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ + if (ext4_journal_extend(handle, 1) != 0) + ext4_journal_restart(handle, 1); + + /* + * Mark the tmp_inode as of size zero + */ + i_size_write(tmp_inode, 0); + + /* + * set the i_blocks count to zero + * so that the ext4_delete_inode does the + * right job + * + * We don't need to take the i_lock because + * the inode is not visible to user space. + */ + tmp_inode->i_blocks = 0; + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); + ext4_journal_stop(handle); +out: + unlock_new_inode(tmp_inode); + iput(tmp_inode); + + return retval; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c new file mode 100644 index 00000000..9bdef3f5 --- /dev/null +++ b/fs/ext4/mmp.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include + +#include "ext4.h" + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. 
+ */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (!*bh) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->sysname, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. 
+ */ + if (retval && (failed_writes % 60) == 0) { + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. 
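The multi-mount protection that follows boils down to a small protocol: read the sequence number, wait longer than the writer's update interval, read it again, and refuse the mount if it changed, because a live kmmpd on another node must have updated it. (The real code additionally writes a new random sequence and repeats the check before starting its own kmmpd.) A stand-alone sketch of the core check, with stubbed-out I/O and an illustrative clean-unmount marker:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define MMP_SEQ_CLEAN 0xFF4D4D50u   /* clean-unmount marker (value illustrative) */

/* Stub standing in for reading mmp_seq from the MMP block on disk;
 * here it changes on every call, as if another node were updating it. */
static unsigned int read_mmp_seq(void)
{
        static unsigned int seq = 42;
        return seq++;
}

/* Return true if it looks safe to mount: either the fs was unmounted
 * cleanly, or nobody touched the sequence while we waited. */
static bool mmp_mount_allowed(unsigned int wait_seconds)
{
        unsigned int seq = read_mmp_seq();

        if (seq == MMP_SEQ_CLEAN)
                return true;            /* clean unmount, no writer active  */

        sleep(wait_seconds);            /* longer than mmp_check_interval   */
        return read_mmp_seq() == seq;   /* changed => another node is live  */
}

int main(void)
{
        printf("mount %s\n", mmp_mount_allowed(1) ? "allowed" : "refused");
        return 0;
}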
+ */ + if (mmp->mmp_check_interval > mmp_check_interval) + mmp_check_interval = mmp->mmp_check_interval; + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 00000000..f57455a1 --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "ext4.h" + +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. 
+ */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_uninitialized(src)) + ext4_ext_mark_uninitialized(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. + */ +static int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + struct ext4_extent_header *eh; + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + ext4_idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext4_ext_pblock(path[leaf_ppos].p_ext); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * mext_check_null_inode - NULL check for two inodes + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
+ */ +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function, unsigned int line) +{ + int ret = 0; + + if (inode1 == NULL) { + __ext4_error(inode2->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + __ext4_error(inode1->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; + } + return ret; +} + +/** + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. + */ +static void +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); +} + +/** + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). + */ +static void +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. 
+ */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. + */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext, 0)) + goto out; + } + + if (end_flag) { + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext, 0)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. 
+ */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. + */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = orig_path->p_depth; + orig_path += depth; + eh = orig_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, orig_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = mext_insert_across_blocks(handle, orig_inode, o_start, + o_end, start_ext, new_ext, end_ext); + if (ret < 0) + return ret; + } else + mext_insert_inside_block(o_start, o_end, start_ext, new_ext, + end_ext, eh, range_to_move); + + if (depth) { + ret = ext4_handle_dirty_metadata(handle, orig_inode, + orig_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, orig_inode); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * mext_leaf_block - Move one leaf extent block into the inode. 
+ * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @dext: donor extent + * @from: start offset on the target file + * + * In order to insert extents into the leaf block, we must divide the extent + * in the leaf block into three extents. The one is located to be inserted + * extents, and the others are located around it. + * + * Therefore, this function creates structures to save extents of the leaf + * block, and inserts extents by calling mext_insert_extents() with + * created extents. Return 0 on success, or a negative error value on failure. + */ +static int +mext_leaf_block(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, struct ext4_extent *dext, + ext4_lblk_t *from) +{ + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; + + start_ext.ee_block = end_ext.ee_block = 0; + o_start = o_end = oext = orig_path[depth].p_ext; + oext_alen = ext4_ext_get_actual_len(oext); + start_ext.ee_len = end_ext.ee_len = 0; + + new_ext.ee_block = cpu_to_le32(*from); + ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; + + /* + * Case: original extent is first + * oext |--------| + * new_ext |--| + * start_ext |--| + */ + if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; + /* + * We can merge new_ext into previous extent, + * if these are contiguous and same extent type. 
+ */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + start_ext.ee_block = oext->ee_block; + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + EXT4_ERROR_INODE(orig_inode, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); +out: + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximum length of extents + * + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. 
*/ + diff = donor_off - le32_to_cpu(tmp_dext->ee_block); + + ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); + tmp_dext->ee_block = + cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + + if (max_count < ext4_ext_get_actual_len(tmp_dext)) + tmp_dext->ee_len = cpu_to_le16(max_count); + + orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); + ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); + + /* Adjust extent length if donor extent is larger than orig */ + if (ext4_ext_get_actual_len(tmp_dext) > + ext4_ext_get_actual_len(tmp_oext) - orig_diff) + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - + orig_diff); + + tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); + + return 0; +} + +/** + * mext_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @orig_inode: original inode + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced + * @err: pointer to save return value + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: + * 1. Save the block information of original and donor inodes into + * dummy extents. + * 2. Change the block information of original inode to point at the + * donor inode blocks. + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * + * Return replaced block count. + */ +static int +mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, + ext4_lblk_t count, int *err) +{ + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; + int depth; + int replaced_count = 0; + int dext_alen; + + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + + /* Get the donor extent for the head */ + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); + if (*err) + goto out; + + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. 
*/ + if (!dext) { + EXT4_ERROR_INODE(donor_inode, + "The extent for donor must be found"); + *err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + EXT4_ERROR_INODE(donor_inode, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + *err = -EIO; + goto out; + } + + /* Set donor extent to orig extent */ + *err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); + if (*err) + goto out; + + /* Set orig extent to donor extent */ + *err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); + if (*err) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); + replaced_count += dext_alen; + donor_off += dext_alen; + orig_off += dext_alen; + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (orig_path) + ext4_ext_drop_refs(orig_path); + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (*err) + goto out; + } + +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (donor_path) { + ext4_ext_drop_refs(donor_path); + kfree(donor_path); + } + + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling mext_replace_branches(). + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. + */ +static int +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, + int block_len_in_page, int uninit, int *err) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct address_space *mapping = orig_inode->i_mapping; + struct buffer_head *bh; + struct page *page = NULL; + const struct address_space_operations *a_ops = mapping->a_ops; + handle_t *handle; + ext4_lblk_t orig_blk_offset; + long long offs = orig_page_offset << PAGE_CACHE_SHIFT; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int w_flags = 0; + unsigned int tmp_data_size, data_size, replaced_size; + void *fsdata; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. 
+ */ + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, jblocks); + if (IS_ERR(handle)) { + *err = PTR_ERR(handle); + return 0; + } + + if (segment_eq(get_fs(), KERNEL_DS)) + w_flags |= AOP_FLAG_UNINTERRUPTIBLE; + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + /* + * If orig extent is uninitialized one, + * it's not necessary force the page into memory + * and then force it to be written out again. + * Just swap data blocks between orig and donor. + */ + if (uninit) { + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); + goto out2; + } + + offs = (long long)orig_blk_offset << orig_inode->i_blkbits; + + /* Calculate data_size */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_size = orig_inode->i_size & (blocksize - 1); + /* + * If data_size equal zero, it shows data_size is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_size == 0) + tmp_data_size = blocksize; + + data_size = tmp_data_size + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; + + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, + &page, &fsdata); + if (unlikely(*err < 0)) + goto out; + + if (!PageUptodate(page)) { + mapping->a_ops->readpage(o_filp, page); + lock_page(page); + } + + /* + * try_to_release_page() doesn't call releasepage in writeback mode. + * We should care about the order of writing to the same file + * by multiple move extent processes. + * It needs to call wait_on_page_writeback() to wait for the + * writeback of the page. + */ + wait_on_page_writeback(page); + + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + + bh = page_buffers(page); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, + (sector_t)(orig_blk_offset + i), bh, 0); + if (*err < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, + page, fsdata); + page = NULL; + +out: + if (unlikely(page)) { + if (PageLocked(page)) + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + } +out2: + ext4_journal_stop(handle); + + if (err2) + *err = err2; + + return replaced_count; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. 
+ */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be swapfile [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Files should be in the same ext4 FS */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if (orig_start != donor_start) { + ext4_debug("ext4 move extent: orig and donor's start " + "offset are not same [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_inode->i_size > donor_inode->i_size) { + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; + /* TODO: eliminate this artificial restriction */ + if (orig_start >= donor_blocks) { + ext4_debug("ext4 move extent: orig start offset " + "[%llu] should be less than donor file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, donor_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* TODO: eliminate this artificial restriction */ + if (orig_start + *len > donor_blocks) { + ext4_debug("ext4 move extent: End offset [%llu] should " + "be less than donor file blocks [%u]." 
+ "So adjust length from %llu to %llu " + "[ino:orig %lu, donor %lu]\n", + orig_start + *len, donor_blocks, + *len, donor_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = donor_blocks - orig_start; + } + } else { + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; + if (orig_start >= orig_blocks) { + ext4_debug("ext4 move extent: start offset [%llu] " + "should be less than original file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, orig_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > orig_blocks) { + ext4_debug("ext4 move extent: Adjust length " + "from %llu to %llu. Because it should be " + "less than original file blocks " + "[ino:orig %lu, donor %lu]\n", + *len, orig_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = orig_blocks - orig_start; + } + } + + if (!*len) { + ext4_debug("ext4 move extent: len should not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * + * Lock two inodes' i_mutex by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ +static int +mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +{ + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); + if (ret < 0) + goto out; + + if (inode1 == inode2) { + mutex_lock(&inode1->i_mutex); + goto out; + } + + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } + +out: + return ret; +} + +/** + * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ + +static int +mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +{ + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); + if (ret < 0) + goto out; + + if (inode1) + mutex_unlock(&inode1->i_mutex); + + if (inode2 && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); + +out: + return ret; +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_start: start offset in block for orig + * @donor_start: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and moved block length is set in moved_len + * if succeed, otherwise returns error value. + * + * Note: ext4_move_extents() proceeds the following order. + * 1:ext4_move_extents() calculates the last block number of moving extent + * function by the start block number (orig_start) and the number of blocks + * to be moved (len) specified as arguments. + * If the {orig, donor}_start points a hole, the extent's start offset + * pointed by ext_cur (current extent), holecheck_path, orig_path are set + * after hole behind. 
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent + * or the ext_cur exceeds the block_end which is last logical block number. + * 3:To get the length of continues area, call mext_next_extent() + * specified with the ext_cur (initial value is holecheck_path) re-cursive, + * until find un-continuous extent, the start logical block number exceeds + * the block_end or the extent points to the last extent. + * 4:Exchange the original inode data with donor inode data + * from orig_page_offset to seq_end_page. + * The start indexes of data are specified as arguments. + * That of the original inode is orig_page_offset, + * and the donor inode is also orig_page_offset + * (To easily handle blocksize != pagesize case, the offset for the + * donor inode is block unit). + * 5:Update holecheck_path and orig_path to points a next proceeding extent, + * then returns to step 2. + * 6:Release holecheck_path, orig_path and set the len to moved_len + * which shows the number of moved blocks. + * The moved_len is useful for the command to calculate the file offset + * for starting next move extent ioctl. + * 7:Return 0 on success, or a negative error value on failure. + */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 orig_start, __u64 donor_start, __u64 len, + __u64 *moved_len) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; + struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; + ext4_lblk_t block_start = orig_start; + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; + int ret1, ret2, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Protect orig and donor inodes against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; + + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len); + if (ret1) + goto out; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; + block_end = block_start + len - 1; + if (file_end < block_end) + len -= block_end - file_end; + + ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (ret1) + goto out; + + /* Get path structure to check the hole */ + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret1) + goto out; + + depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; + + /* + * Get proper starting location of block replacement if block_start was + * within the hole. 
+ */ + if (le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + /* + * The hole exists between extents or the tail of + * original file. + */ + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { + ret1 = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { + ret1 = last_extent; + goto out; + } + seq_start = le32_to_cpu(ext_cur->ee_block); + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) + /* The hole exists at the beginning of original file. */ + seq_start = le32_to_cpu(ext_cur->ee_block); + else + seq_start = block_start; + + /* No blocks within the specified range. */ + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); + ret1 = -EINVAL; + goto out; + } + + /* Adjust start blocks */ + add_blocks = min(le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + + /* Adjust tail blocks */ + if (seq_start + seq_blocks - 1 > block_end) + seq_blocks = block_end - seq_start + 1; + + ext_prev = ext_cur; + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { + ret1 = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); + + /* + * Extend the length of contiguous block (seq_blocks) + * if extents are contiguous. + */ + if (ext4_can_extents_be_merged(orig_inode, + ext_prev, ext_cur) && + block_end >= le32_to_cpu(ext_cur->ee_block) && + !last_extent) + continue; + + /* Is original extent is uninitialized */ + uninit = ext4_ext_is_uninitialized(ext_prev); + + data_offset_in_page = seq_start % blocks_per_page; + + /* + * Calculate data blocks count that should be swapped + * at the first page. + */ + if (data_offset_in_page + seq_blocks > blocks_per_page) { + /* Swapped blocks are across pages */ + block_len_in_page = + blocks_per_page - data_offset_in_page; + } else { + /* Swapped blocks are in a page */ + block_len_in_page = seq_blocks; + } + + orig_page_offset = seq_start >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_end_page = (seq_start + seq_blocks - 1) >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_start = le32_to_cpu(ext_cur->ee_block); + rest_blocks = seq_blocks; + + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); + + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit, + &ret1); + + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; + if (ret1 < 0) + break; + if (*moved_len > len) { + EXT4_ERROR_INODE(orig_inode, + "We replaced blocks too much! 
" + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; + break; + } + + orig_page_offset++; + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) + block_len_in_page = blocks_per_page; + else + block_len_in_page = rest_blocks; + } + + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret1) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret1) + break; + + ext_cur = holecheck_path[depth].p_ext; + add_blocks = ext4_ext_get_actual_len(ext_cur); + seq_blocks = 0; + + } +out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (holecheck_path) { + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } + double_up_write_data_sem(orig_inode, donor_inode); + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret1) + return ret1; + else if (ret2) + return ret2; + + return 0; +} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c new file mode 100644 index 00000000..3d36d5a1 --- /dev/null +++ b/fs/ext4/namei.c @@ -0,0 +1,2612 @@ +/* + * linux/fs/ext4/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" + +#include "xattr.h" +#include "acl.h" + +#include +/* + * define how far ahead to read directories while searching them. 
+ */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, 1, err); + if (bh) { + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + *err = ext4_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); +static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, + struct dx_map_entry *offsets, int count, unsigned blocksize); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); +static void dx_insert_block(struct dx_frame *frame, + u32 hash, ext4_lblk_t block); +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *err); +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * p 
is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len, blocksize)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash(struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - + EXT4_DIR_REC_LEN(2) - infosize; + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit(struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index(char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk(KERN_DEBUG "%s index ", label); + for (i = 0; i < n; i++) { + printk("%x->%lu ", i ? dx_get_hash(entries + i) : + 0, (unsigned long)dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext4fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT4_DIR_REC_LEN(de->name_len); + names++; + } + de = ext4_next_entry(de, size); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + ext4_lblk_t block = dx_get_block(entries); + ext4_lblk_t hash = i ? dx_get_hash(entries): 0; + u32 range = i < count - 1? 
(dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse(bh); + } + if (bcount) + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(const struct qstr *d_name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + dxtrace(printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext4_warning(dir->i_sb, + "dx entry: limit != node limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + frame++; + frame->bh = NULL; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + if (*err == ERR_BAD_DX_DIR) + ext4_warning(dir->i_sb, + "Corrupt dir inode %ld, running e2fsck is " + "recommended.", dir->i_ino); + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse(p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. 
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *top;
+	int err, count = 0;
+
+	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+							(unsigned long)block));
+	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+		return err;
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	top = (struct ext4_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT4_DIR_REC_LEN(0));
+	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+					+ ((char *)de - bh->b_data))) {
+			/* On error, skip the f_pos to the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			brelse(bh);
+			return count;
+		}
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if ((err = ext4_htree_store_dirent(dir_file,
+			   hinfo->hash, hinfo->minor_hash, de)) != 0) {
+			brelse(bh);
+			return err;
+		}
+		count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory. We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext4_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	ext4_lblk_t block;
+	int count = 0;
+	int ret, err;
+	__u32 hashval;
+
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+		       start_hash, start_minor_hash));
+	dir = dir_file->f_path.dentry->d_inode;
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
+		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT4_SB(dir->i_sb)->s_hash_unsigned;
+		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+
+	/* Add '.' and '..'
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + de = ext4_next_entry(de, dir->i_sb->s_blocksize); + if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) { + if (de->name_len && de->inode) { + ext4fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = ((char *) de - base)>>2; + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext4_next_entry(de, blocksize); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} + +/* + * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * + * `len <= EXT4_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. 
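
dx_make_map and dx_sort_map above build a (hash, offset, size) map at the tail of a scratch block and sort it by hash with a comb-sort pass that hands off to bubble sort once the gap is small. The stand-alone sketch below shows the same two-phase sort on a plain array; the struct and function names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdint.h>

struct map_entry { uint32_t hash; uint16_t offs; uint16_t size; };

static void swap_entry(struct map_entry *a, struct map_entry *b)
{
    struct map_entry t = *a; *a = *b; *b = t;
}

static void sort_map(struct map_entry *map, unsigned count)
{
    unsigned gap = count;
    int more;

    /* Comb sort: shrink the gap until a bubble sort can finish cheaply. */
    while (gap > 2) {
        gap = gap * 10 / 13;            /* shrink factor of about 1.3 */
        if (gap == 9 || gap == 10)      /* skip the known-bad gap values */
            gap = 11;
        for (unsigned i = count - 1; i >= gap; i--)
            if (map[i].hash < map[i - gap].hash)
                swap_entry(&map[i], &map[i - gap]);
    }
    /* Garden-variety bubble sort to mop up. */
    do {
        more = 0;
        for (unsigned i = 1; i < count; i++)
            if (map[i].hash < map[i - 1].hash) {
                swap_entry(&map[i], &map[i - 1]);
                more = 1;
            }
    } while (more);
}

int main(void)
{
    struct map_entry m[] = { {5, 0, 12}, {1, 12, 16}, {9, 28, 12}, {3, 40, 20} };
    sort_map(m, 4);
    for (unsigned i = 0; i < 4; i++)
        printf("%u ", m[i].hash);       /* prints: 1 3 5 9 */
    printf("\n");
    return 0;
}
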
+ */ +static inline int ext4_match (int len, const char * const name, + struct ext4_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 ** res_dir) +{ + struct ext4_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = d_name->name; + int namelen = d_name->len; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext4_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext4_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 ** res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block, b; + const u8 *name = d_name->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + ext4_lblk_t nblocks; + int i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = d_name->len; + if (namelen > EXT4_NAME_LEN) + return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
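
search_dirblock and the readahead loop here both depend on the classic ext2/3/4 record layout: a directory block is a chain of variable-length records, and rec_len is the only link from one record to the next, which is why a bogus (zero or negative) rec_len has to abort the walk. Below is a stand-alone sketch of that walk over a fake block; the struct is a simplified stand-in for ext4_dir_entry_2, and the 8-byte header plus 4-byte rounding is meant to mirror what EXT4_DIR_REC_LEN computes.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BLOCKSIZE 1024
#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)   /* header + name, 4-byte aligned */

struct dirent2 {                 /* simplified stand-in for ext4_dir_entry_2 */
    uint32_t inode;              /* 0 means "unused slot" */
    uint16_t rec_len;            /* distance to the next record */
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];             /* not NUL-terminated on disk */
};

/* Append one record, letting it provisionally own everything up to block end. */
static struct dirent2 *add_entry(char *block, char *at_pos, uint32_t ino, const char *name)
{
    struct dirent2 *at = (struct dirent2 *)at_pos;
    at->inode = ino;
    at->name_len = (uint8_t)strlen(name);
    at->file_type = 0;
    memcpy(at->name, name, at->name_len);
    at->rec_len = (uint16_t)(block + BLOCKSIZE - at_pos);
    return at;
}

/* Walk the block the way search_dirblock does, trusting only rec_len. */
static int find_entry(const char *block, const char *name)
{
    const char *p = block, *limit = block + BLOCKSIZE;
    size_t len = strlen(name);

    while (p < limit) {
        const struct dirent2 *de = (const struct dirent2 *)p;
        if (de->rec_len < REC_LEN(0))
            return -1;                 /* corrupt rec_len: refuse to spin forever */
        if (de->inode && de->name_len == len && !memcmp(de->name, name, len))
            return (int)de->inode;
        p += de->rec_len;              /* rec_len is the only link to the next record */
    }
    return 0;                          /* not found */
}

int main(void)
{
    union { char bytes[BLOCKSIZE]; uint32_t align; } blk;
    memset(blk.bytes, 0, sizeof(blk.bytes));

    struct dirent2 *de = add_entry(blk.bytes, blk.bytes, 11, "alpha");
    de->rec_len = (uint16_t)REC_LEN(de->name_len);     /* shrink: make room for one more */
    add_entry(blk.bytes, blk.bytes + de->rec_len, 12, "beta");

    printf("alpha=%d beta=%d gamma=%d\n",
           find_entry(blk.bytes, "alpha"),
           find_entry(blk.bytes, "beta"),
           find_entry(blk.bytes, "gamma"));            /* 11 12 0 */
    return 0;
}
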
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext4_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ_META, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT4_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) +{ + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; + + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; + do { + block = dx_get_block(frame->at); + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) + goto errout; + + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { /* Success! 
*/ + dx_release(frames); + return bh; + } + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; + + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext4_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); + brelse(bh); + if (!ext4_valid_inum(dir->i_sb, ino)) { + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); + return ERR_PTR(-EIO); + } + inode = ext4_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ESTALE) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext4_get_parent(struct dentry *child) +{ + __u32 ino; + static const struct qstr dotdot = { + .name = "..", + .len = 2, + }; + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext4_find_entry(child->d_inode, &dotdot, &de); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { + EXT4_ERROR_INODE(child->d_inode, + "bad parent inode number: %u", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext4_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); + rec_len = EXT4_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext4_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. 
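
ext4_type_by_mode above is indexed by (mode & S_IFMT) >> 12 because the file-type bits of i_mode live in the top nibble of the low 16 bits; the table collapses them into the small per-dirent type codes so readdir can report a type without reading each inode. A quick stand-alone illustration (the FT_* values below follow the ext4 on-disk codes, but the enum names are local to this sketch):

#include <stdio.h>
#include <sys/stat.h>

/* On-disk dirent file-type codes, in the order ext4 defines them. */
enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV, FT_FIFO, FT_SOCK, FT_SYMLINK };

static unsigned char type_by_mode[S_IFMT >> 12] = {
    [S_IFREG  >> 12] = FT_REG_FILE,
    [S_IFDIR  >> 12] = FT_DIR,
    [S_IFCHR  >> 12] = FT_CHRDEV,
    [S_IFBLK  >> 12] = FT_BLKDEV,
    [S_IFIFO  >> 12] = FT_FIFO,
    [S_IFSOCK >> 12] = FT_SOCK,
    [S_IFLNK  >> 12] = FT_SYMLINK,
};

int main(void)
{
    mode_t m = S_IFDIR | 0755;
    printf("dir  -> %d\n", type_by_mode[(m & S_IFMT) >> 12]);   /* 2 (FT_DIR) */
    m = S_IFLNK | 0777;
    printf("link -> %d\n", type_by_mode[(m & S_IFMT) >> 12]);   /* 7 (FT_SYMLINK) */
    return 0;
}
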
+ */ +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { + rec_len = EXT4_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + prev = to; + to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. + */ +static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + ext4_lblk_t newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext4_dir_entry_2 *de = NULL, *de2; + int err = 0, i; + + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map((struct ext4_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map(map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", + (unsigned long)dx_get_block(frame->at), + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + blocksize); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? 
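
The split-point loop in do_split above walks the hash-sorted map from the largest hashes down, claiming entries for the new block until roughly half of the block's bytes have been claimed; 'continued' records whether the boundary hash also appears on the other side of the cut, in which case the new index entry gets the low continuation bit. A stand-alone sketch of just that computation, with illustrative types:

#include <stdio.h>

struct map_entry { unsigned hash; unsigned size; };   /* already sorted by hash */

/*
 * Decide where a hash-sorted run of dirents should be cut so the two halves
 * of the split are roughly equal by byte count.  Returns the index of the
 * first entry that moves to the new block; *continued is set when the hash
 * at the boundary straddles both blocks (a collision), which is what forces
 * the continuation bit on the new index entry.
 */
static int pick_split(const struct map_entry *map, int count,
                      unsigned blocksize, int *continued)
{
    unsigned size = 0;
    int move = 0;

    for (int i = count - 1; i >= 0; i--) {
        /* stop once more than half of this entry would land in the 2nd half */
        if (size + map[i].size / 2 > blocksize / 2)
            break;
        size += map[i].size;
        move++;
    }
    int split = count - move;
    *continued = (split > 0) && (map[split].hash == map[split - 1].hash);
    return split;
}

int main(void)
{
    struct map_entry m[] = { {10, 300}, {20, 200}, {20, 300}, {40, 200} };
    int cont, split = pick_split(m, 4, 1024, &cont);
    printf("split at %d, continued=%d\n", split, cont);   /* split at 2, continued=1 */
    return 0;
}
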
*/ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + err = ext4_handle_dirty_metadata(handle, dir, frame->bh); + if (err) + goto journal_error; + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext4_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext4_dir_entry_2 *de, + struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT4_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + blocksize - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_std_error(dir->i_sb, err); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (err) + ext4_std_error(dir->i_sb, err); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. 
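
add_dirent_to_buf above reuses slack space: a record whose on-disk rec_len (rlen) is larger than the space its own name actually needs (nlen) can be trimmed back to nlen, and a new record carved out of the remainder inherits whatever is left. A stand-alone sketch of that carve step, using the same simplified record layout as the earlier sketch; the names are illustrative:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)   /* 8-byte header, 4-byte rounding */

struct dirent2 {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];
};

/* Carve a new record out of the slack behind an existing one. */
static struct dirent2 *carve(struct dirent2 *de, uint32_t ino, const char *name)
{
    uint16_t nlen = (uint16_t)REC_LEN(de->name_len);  /* space the old record really needs */
    uint16_t rlen = de->rec_len;                      /* space it currently owns           */
    struct dirent2 *new_de = (struct dirent2 *)((char *)de + nlen);

    new_de->rec_len = rlen - nlen;                    /* new record inherits the slack     */
    de->rec_len = nlen;                               /* old record keeps only what it uses */
    new_de->inode = ino;
    new_de->name_len = (uint8_t)strlen(name);
    new_de->file_type = 0;
    memcpy(new_de->name, name, new_de->name_len);
    return new_de;
}

int main(void)
{
    union { char bytes[128]; uint32_t align; } blk = {{0}};
    struct dirent2 *de = (struct dirent2 *)blk.bytes;

    de->inode = 11;
    de->name_len = 3;
    memcpy(de->name, "foo", 3);
    de->rec_len = 128;                     /* last record owns the rest of the block */

    struct dirent2 *nd = carve(de, 12, "barbaz");
    printf("old rec_len=%u, new at offset %ld with rec_len=%u\n",
           (unsigned)de->rec_len, (long)((char *)nd - blk.bytes), (unsigned)nd->rec_len);
    /* old rec_len=12, new at offset 12 with rec_len=116 */
    return 0;
}
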
+ */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext4_journal_get_write_access(handle, bh); + if (retval) { + ext4_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len, blocksize)); + if ((char *) de >= (((char *) root) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext4_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + de = de2; + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext4_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + if (!de) { + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; + } + dx_release(frames); + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; +} + +/* + * ext4_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext4_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
+ */ +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + ext4_lblk_t block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext4_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext4_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) { + brelse(bh); + return retval; + } + + if (blocks == 1 && !dx_fallback && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext4_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? 
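
dx_insert_block, defined earlier, makes room in an index block by shifting the tail of the dx_entry array and bumping the stored count; the split code that follows here only runs once that count has reached the block's limit. A small stand-alone sketch of the shift-and-insert, with a plain struct standing in for the packed on-disk dx_entry:

#include <stdio.h>
#include <string.h>

struct dx_entry { unsigned hash; unsigned block; };

/*
 * Insert (hash, block) immediately after position 'at' in an index node that
 * currently holds 'count' of 'limit' entries.  Returns the new count, or 0 if
 * the node is full and has to be split first.
 */
static unsigned insert_after(struct dx_entry *entries, unsigned count,
                             unsigned limit, unsigned at,
                             unsigned hash, unsigned block)
{
    if (count >= limit)
        return 0;                         /* caller must split the index node */
    struct dx_entry *slot = entries + at + 1;
    memmove(slot + 1, slot, (count - at - 1) * sizeof(*slot));
    slot->hash = hash;
    slot->block = block;
    return count + 1;
}

int main(void)
{
    struct dx_entry idx[4] = { {0, 1}, {100, 2}, {200, 3} };
    unsigned count = insert_after(idx, 3, 4, 1, 150, 7);   /* goes between 100 and 200 */
    for (unsigned i = 0; i < count; i++)
        printf("{%u -> blk %u} ", idx[i].hash, idx[i].block);
    printf("\n");   /* {0 -> blk 1} {100 -> blk 2} {150 -> blk 7} {200 -> blk 3} */
    return 0;
}
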
*/ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext4_warning(sb, "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext4_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup; + +journal_error: + ext4_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; + int i, err; + + i = 0; + pde = NULL; + de = (struct ext4_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + 
if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } + if (pde) + pde->rec_len = ext4_rec_len_to_disk( + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } + return 0; + } + i += ext4_rec_len_from_disk(de->rec_len, blocksize); + pde = de; + de = ext4_next_entry(de, blocksize); + } + return -ENOENT; +} + +/* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. + */ +static void ext4_inc_count(handle_t *handle, struct inode *inode) +{ + inc_nlink(inode); + if (is_dx(inode) && inode->i_nlink > 1) { + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { + inode->i_nlink = 1; + EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + } + } +} + +/* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +static void ext4_dec_count(handle_t *handle, struct inode *inode) +{ + drop_nlink(inode); + if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) + inc_nlink(inode); +} + + +static int ext4_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + drop_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
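
ext4_delete_entry above never compacts the block: it either folds the victim's rec_len into the previous record or, when the victim is the first record in the block, simply clears its inode number so the slot reads as unused. A stand-alone sketch of that merge, again on a simplified record layout with illustrative names:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent2 {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];
};

/* Remove 'victim' from a block by merging it into the previous record. */
static void delete_entry(char *block, unsigned blocksize, struct dirent2 *victim)
{
    struct dirent2 *de = (struct dirent2 *)block, *prev = NULL;

    while ((char *)de < block + blocksize) {
        if (de == victim) {
            if (prev)
                prev->rec_len += de->rec_len;   /* previous record swallows the hole */
            else
                de->inode = 0;                  /* first record: just mark it unused */
            return;
        }
        prev = de;
        de = (struct dirent2 *)((char *)de + de->rec_len);
    }
}

int main(void)
{
    union { char bytes[64]; uint32_t align; } blk = {{0}};
    struct dirent2 *a = (struct dirent2 *)blk.bytes;
    struct dirent2 *b = (struct dirent2 *)(blk.bytes + 16);
    struct dirent2 *c = (struct dirent2 *)(blk.bytes + 32);

    a->inode = 1; a->rec_len = 16;
    b->inode = 2; b->rec_len = 16;
    c->inode = 3; c->rec_len = 32;          /* last record runs to the end of the block */

    delete_entry(blk.bytes, 64, b);
    printf("a->rec_len=%u (now covers b's slot)\n", (unsigned)a->rec_len);   /* 32 */
    return 0;
}
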
+ */ +static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT4_FS_XATTR + inode->i_op = &ext4_special_inode_operations; +#endif + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + handle_t *handle; + struct inode *inode; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + unsigned int blocksize = dir->i_sb->s_blocksize; + int err, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, + &dentry->d_name, 0); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!dir_block) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), + blocksize); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = 
ext4_handle_dirty_metadata(handle, inode, dir_block); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); + if (err) { +out_clear_inode: + clear_nlink(inode); + unlock_new_inode(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; + } + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out_stop: + brelse(dir_block); + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir(struct inode *inode) +{ + unsigned int offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { + if (err) + EXT4_ERROR_INODE(inode, + "error %d reading directory lblock 0", err); + else + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + de1 = ext4_next_entry(de, sb->s_blocksize); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse(bh); + return 1; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); + de = ext4_next_entry(de1, sb->s_blocksize); + while (offset < inode->i_size) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + unsigned int lblock; + err = 0; + brelse(bh); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_bread(NULL, inode, lblock, 0, &err); + if (!bh) { + if (err) + EXT4_ERROR_INODE(inode, + "error %d reading directory " + "lblock %u", err, lblock); + offset += sb->s_blocksize; + continue; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + } + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { + de = (struct ext4_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + } + brelse(bh); + return 1; +} + +/* ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_iloc iloc; + int err = 0, rc; + + if (!ext4_handle_valid(handle)) + return 0; + + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT4_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. 
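
The on-disk orphan list maintained here is a singly linked list threaded through the inodes themselves: the superblock's s_last_orphan field names the head, and each orphaned inode stores the next inode number in the slot NEXT_ORPHAN() resolves to (which normally holds the deletion time). Insertion is always at the head, which keeps the transaction small. A toy model, with plain integers standing in for the on-disk fields:

#include <stdio.h>

#define MAX_INO 16

struct toy_fs {
    unsigned s_last_orphan;        /* superblock: head of the orphan chain, 0 = empty */
    unsigned next_orphan[MAX_INO]; /* per-inode: next inode number in the chain */
};

/* Push an inode onto the head of the orphan chain, as ext4_orphan_add does on disk. */
static void orphan_add(struct toy_fs *fs, unsigned ino)
{
    fs->next_orphan[ino] = fs->s_last_orphan;
    fs->s_last_orphan = ino;
}

/* Walk the chain, the way orphan cleanup does after an unclean shutdown. */
static void orphan_walk(const struct toy_fs *fs)
{
    for (unsigned ino = fs->s_last_orphan; ino; ino = fs->next_orphan[ino])
        printf("would clean up inode %u\n", ino);
}

int main(void)
{
    struct toy_fs fs = {0};
    orphan_add(&fs, 12);   /* an unlinked-but-still-open file */
    orphan_add(&fs, 7);    /* a file in the middle of a truncate */
    orphan_walk(&fs);      /* prints 7 then 12 */
    return 0;
}
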
*/ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext4_orphan_add(). We block + * here (on s_orphan_lock), so race with ext4_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? + */ + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) + goto mem_insert; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); + EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ +mem_insert: + if (!err) + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi; + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + /* ext4_handle_valid() assumes a valid handle_t pointer */ + if (handle && !ext4_handle_valid(handle)) + return 0; + + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT4_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
*/ + if (sbi->s_journal && !handle) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext4_std_error(inode->i_sb, err); +out: + mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir(inode)) + goto end_rmdir; + + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (!EXT4_DIR_LINK_EMPTY(inode)) + ext4_warning(inode->i_sb, + "empty directory has too many links (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. 
*/ + inode->i_size = 0; + ext4_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_dec_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + +end_rmdir: + ext4_journal_stop(handle); + brelse(bh); + return retval; +} + +static int ext4_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle; + + trace_ext4_unlink_enter(dir, dentry); + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext4_warning(inode->i_sb, + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext4_orphan_add(handle, inode); + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext4_journal_stop(handle); + brelse(bh); + trace_ext4_unlink_exit(dentry, retval); + return retval; +} + +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + handle_t *handle; + struct inode *inode; + int l, err, retries = 0; + int credits; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + dquot_initialize(dir); + + if (l > EXT4_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } +retry: + handle = ext4_journal_start(dir, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > EXT4_N_BLOCKS * 4) { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + /* + * We cannot call page_symlink() with transaction started + * because it calls into ext4_write_begin() which can wait + * for transaction commit if we are running out of space + * and thus we deadlock. So we have to stop transaction now + * and restart it when symlink contents is written. 
+ * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. + */ + drop_nlink(inode); + err = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS + * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext4_journal_start(dir, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext4_orphan_del(handle, inode); + if (err) { + ext4_journal_stop(handle); + clear_nlink(inode); + goto err_drop_inode; + } + } else { + /* clear the extent format for fast symlink */ + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + inode->i_op = &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); + inode->i_size = l-1; + } + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_add_nondir(handle, dentry, inode); +out_stop: + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; +} + +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (inode->i_nlink >= EXT4_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode->i_ctime = ext4_current_time(inode); + ext4_inc_count(handle, inode); + ihold(inode); + + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer, size) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle; + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh, *dir_bh; + struct ext4_dir_entry_2 *old_de, *new_de; + int retval, force_da_alloc = 0; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + dquot_initialize(new_dentry->d_inode); + handle = ext4_journal_start(old_dir, 2 * + EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + ext4_handle_sync(handle); + + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. 
+ * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); + if (new_bh) { + if (!new_inode) { + brelse(new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir(new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data, + old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir != old_dir && + EXT4_DIR_LINK_MAX(new_dir)) + goto end_rename; + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext4_journal_get_write_access(handle, dir_bh); + if (retval) + goto end_rename; + } + if (!new_bh) { + retval = ext4_add_entry(handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + retval = ext4_journal_get_write_access(handle, new_bh); + if (retval) + goto end_rename; + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = + ext4_current_time(new_dir); + ext4_mark_inode_dirty(handle, new_dir); + BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); + retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = ext4_current_time(old_inode); + ext4_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext4_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. 
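
The PARENT_INO macro used above relies on the convention that a directory's first block begins with the "." record followed immediately by the ".." record, so stepping over one record with ext4_next_entry lands on the parent pointer that a cross-directory rename of a directory has to rewrite. A stand-alone sketch of that layout and rewrite, with a simplified record struct and illustrative inode numbers:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent2 {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];
};

/* Step to the record after 'de', which is all ext4_next_entry does. */
static struct dirent2 *next_entry(struct dirent2 *de)
{
    return (struct dirent2 *)((char *)de + de->rec_len);
}

/* "..": the second record of a directory's first block. */
static uint32_t *parent_ino(char *block)
{
    return &next_entry((struct dirent2 *)block)->inode;
}

int main(void)
{
    union { char bytes[64]; uint32_t align; } blk = {{0}};
    struct dirent2 *dot = (struct dirent2 *)blk.bytes;

    dot->inode = 100; dot->name_len = 1; dot->rec_len = 12;
    memcpy(dot->name, ".", 1);
    struct dirent2 *dotdot = next_entry(dot);
    dotdot->inode = 2;  dotdot->name_len = 2; dotdot->rec_len = 52;
    memcpy(dotdot->name, "..", 2);

    /* A cross-directory rename of this directory rewrites ".." like so: */
    *parent_ino(blk.bytes) = 200;
    printf("parent is now inode %u\n", (unsigned)*parent_ino(blk.bytes));   /* 200 */
    return 0;
}
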
*/ + struct buffer_head *old_bh2; + struct ext4_dir_entry_2 *old_de2; + + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); + if (old_bh2) { + retval = ext4_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext4_warning(old_dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + ext4_dec_count(handle, new_inode); + new_inode->i_ctime = ext4_current_time(new_inode); + } + old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); + ext4_update_dx_flag(old_dir); + if (dir_bh) { + PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = + cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); + if (retval) { + ext4_std_error(old_dir->i_sb, retval); + goto end_rename; + } + ext4_dec_count(handle, old_dir); + if (new_inode) { + /* checked empty_dir above, can't have another parent, + * ext4_dec_count() won't work for many-linked dirs */ + new_inode->i_nlink = 0; + } else { + ext4_inc_count(handle, new_dir); + ext4_update_dx_flag(new_dir); + ext4_mark_inode_dirty(handle, new_dir); + } + } + ext4_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext4_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext4_orphan_add(handle, new_inode); + if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + force_da_alloc = 1; + } + retval = 0; + +end_rename: + brelse(dir_bh); + brelse(old_bh); + brelse(new_bh); + ext4_journal_stop(handle); + if (retval == 0 && force_da_alloc) + ext4_alloc_da_blocks(old_inode); + return retval; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext4_dir_inode_operations = { + .create = ext4_create, + .lookup = ext4_lookup, + .link = ext4_link, + .unlink = ext4_unlink, + .symlink = ext4_symlink, + .mkdir = ext4_mkdir, + .rmdir = ext4_rmdir, + .mknod = ext4_mknod, + .rename = ext4_rename, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext4_check_acl, + .fiemap = ext4_fiemap, +}; + +const struct inode_operations ext4_special_inode_operations = { + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext4_check_acl, +}; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c new file mode 100644 index 00000000..d99d74ac --- /dev/null +++ b/fs/ext4/page-io.c @@ -0,0 +1,447 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +static struct kmem_cache *io_page_cachep, *io_end_cachep; + +int __init ext4_init_pageio(void) +{ + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) + return -ENOMEM; + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_end_cachep == NULL) { + kmem_cache_destroy(io_page_cachep); + return -ENOMEM; + } + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_page_cachep); +} + +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + +void ext4_free_io_end(ext4_io_end_t *io) +{ + int i; + wait_queue_head_t *wq; + + BUG_ON(!io); + if (io->page) + put_page(io->page); + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); + io->num_io_pages = 0; + wq = ext4_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); + kmem_cache_free(io_end_cachep, io); +} + +/* + * check a range of space and convert unwritten extents to written. + */ +int ext4_end_io_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + wait_queue_head_t *wq; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) + return ret; + + ret = ext4_convert_unwritten_extents(inode, offset, size); + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten " + "extents to written extents, error is %d " + "io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + if (io->iocb) + aio_complete(io->iocb, io->result, 0); + /* clear the DIO AIO unwritten flag */ + if (io->flag & EXT4_IO_END_UNWRITTEN) { + io->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + wq = ext4_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && + waitqueue_active(wq)) { + wake_up_all(wq); + } + } + + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; + + if (!mutex_trylock(&inode->i_mutex)) { + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. 
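
put_io_page above is a plain refcount drop: every in-flight user of an ext4_io_page holds a reference, and whoever drops the last one ends writeback on the page and frees the tracking structure. A stand-alone sketch of that pattern using C11 atomics; the names are illustrative and atomic_fetch_sub stands in for the kernel's atomic_dec_and_test:

#include <stdio.h>
#include <stdatomic.h>

struct io_page {
    atomic_int count;   /* one reference per in-flight user of the page */
    int page_id;
};

/* Drop a reference; the last dropper ends writeback and frees the tracking struct. */
static void put_io_page(struct io_page *p)
{
    if (atomic_fetch_sub(&p->count, 1) == 1)
        printf("page %d: end_page_writeback + free\n", p->page_id);
}

int main(void)
{
    struct io_page pg;
    atomic_init(&pg.count, 2);   /* the submitter holds one ref, the bio completion the other */
    pg.page_id = 42;

    put_io_page(&pg);            /* completion path: still referenced, nothing happens */
    put_io_page(&pg);            /* submitter's put: last reference, writeback is ended */
    return 0;
}
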
+ */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; + } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); + if (io) { + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; + INIT_WORK(&io->work, ext4_end_io_work); + INIT_LIST_HEAD(&io->list); + } + return io; +} + +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... + */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + int i; + sector_t bi_sector = bio->bi_sector; + + BUG_ON(!io_end); + bio->bi_private = NULL; + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + bio_put(bio); + + for (i = 0; i < io_end->num_io_pages; i++) { + struct page *page = io_end->pages[i]->p_page; + struct buffer_head *bh, *head; + loff_t offset; + loff_t io_end_offset; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + head = page_buffers(page); + BUG_ON(!head); + + io_end_offset = io_end->offset + io_end->size; + + offset = (sector_t) page->index << PAGE_CACHE_SHIFT; + bh = head; + do { + if ((offset >= io_end->offset) && + (offset+bh->b_size <= io_end_offset)) + buffer_io_error(bh); + + offset += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + put_io_page(io_end->pages[i]); + } + io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bi_sector >> (inode->i_blkbits - 9)); + } + + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + return; + } + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = NULL; + io->io_op = 0; + io->io_end = NULL; +} + +static int io_submit_init(struct ext4_io_submit *io, + 
struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) + return -ENOMEM; + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (bio == NULL); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_private = io->io_end = io_end; + bio->bi_end_io = ext4_end_bio; + + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + + io->io_bio = bio; + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_next_block = bh->b_blocknr; + return 0; +} + +static int io_submit_add_bh(struct ext4_io_submit *io, + struct ext4_io_page *io_page, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + int ret; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (!buffer_mapped(bh) || buffer_delay(bh)) { + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + return 0; + } + + if (io->io_bio && bh->b_blocknr != io->io_next_block) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + ret = io_submit_init(io, inode, wbc, bh); + if (ret) + return ret; + } + io_end = io->io_end; + if ((io_end->num_io_pages >= MAX_IO_PAGES) && + (io_end->pages[io_end->num_io_pages-1] != io_page)) + goto submit_and_retry; + if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } + io->io_end->size += bh->b_size; + io->io_next_block++; + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; + if ((io_end->num_io_pages == 0) || + (io_end->pages[io_end->num_io_pages-1] != io_page)) { + io_end->pages[io_end->num_io_pages++] = io_page; + atomic_inc(&io_page->p_count); + } + return 0; +} + +int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + struct ext4_io_page *io_page; + struct buffer_head *bh, *head; + int ret = 0; + + blocksize = 1 << inode->i_blkbits; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); + if (!io_page) { + set_page_dirty(page); + unlock_page(page); + return -ENOMEM; + } + io_page->p_page = page; + atomic_set(&io_page->p_count, 1); + get_page(page); + set_page_writeback(page); + ClearPageError(page); + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + block_end = block_start + blocksize; + if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. "A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." 
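+ *
+ * As an illustrative example (numbers assumed here, not taken from
+ * the original comment): with a 4096-byte page, 1024-byte blocks and
+ * i_size ending 1000 bytes into this page, len is 1000. The buffer
+ * at block_start 0 is submitted for I/O, while the buffers starting
+ * at 1024, 2048 and 3072 satisfy block_start >= len and are simply
+ * zeroed and marked uptodate below.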
+ */ + zero_user_segment(page, block_start, block_end); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + clear_buffer_dirty(bh); + ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + set_page_dirty(page); + break; + } + } + unlock_page(page); + /* + * If the page was truncated before we could do the writeback, + * or we had a memory allocation error while trying to write + * the first buffer head, we won't have submitted any pages for + * I/O. In that case we need to make sure we've cleared the + * PageWriteback bit from the page to prevent the system from + * wedging later on. + */ + put_io_page(io_page); + return ret; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c new file mode 100644 index 00000000..80bbc9c6 --- /dev/null +++ b/fs/ext4/resize.c @@ -0,0 +1,1076 @@ +/* + * linux/fs/ext4/resize.c + * + * Support for resizing an ext4 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + + +#define EXT4FS_DEBUG + +#include +#include + +#include "ext4_jbd2.h" + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t start = ext4_blocks_count(es); + ext4_fsblk_t end = start + input->blocks_count; + ext4_group_t group = input->group; + ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext4_bg_has_super(sb, group) ? + (1 + ext4_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext4_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext4_grpblk_t free_blocks_count, offset; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext4_bg_has_super(sb, input->group) ? 
"normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + ext4_get_group_no_and_offset(sb, start, NULL, &offset); + if (group != sbi->s_groups_count) + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + else if (offset != 0) + ext4_warning(sb, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext4_warning(sb, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext4_warning(sb, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext4_warning(sb, "Cannot read last block (%llu)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext4_warning(sb, "Block bitmap not in group (block %llu)", + (unsigned long long)input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext4_warning(sb, "Inode bitmap not in group (block %llu)", + (unsigned long long)input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", + (unsigned long long)input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", + (unsigned long long)input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->block_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->inode_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->block_bitmap, start, metaend)) + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->block_bitmap, + start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", + (unsigned long long)input->inode_table, + itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext4_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (!bh) + return ERR_PTR(-EIO); + if ((err = ext4_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. 
+ */ +static int extend_or_restart_transaction(handle_t *handle, int thresh, + struct buffer_head *bh) +{ + int err; + + if (ext4_handle_has_enough_credits(handle, thresh)) + return 0; + + err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + return err; + if ((err = ext4_journal_get_write_access(handle, bh))) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext4_fsblk_t block; + ext4_grpblk_t bit; + int i; + int err = 0, err2; + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", start); + ext4_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx (+%d)\n", block, bit); + + if ((err = extend_or_restart_transaction(handle, 1, bh))) + goto exit_bh; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto exit_bh; + } + if ((err = ext4_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(gdb); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(gdb); + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto exit_bh; + } + ext4_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, + GFP_NOFS); + if (err) + goto exit_bh; + for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) + ext4_set_bit(bit, bh->b_data); + + ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, + input->block_bitmap - start); + ext4_set_bit(input->block_bitmap - start, bh->b_data); + ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext4_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + block = input->inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); 
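+ /*
+  * Illustrative sketch (assuming a group that carries a superblock
+  * backup) of what this function marks in the new group's block
+  * bitmap; offsets are relative to "start":
+  *
+  *	+0					superblock backup
+  *	+1 .. +gdblocks				GDT backup blocks
+  *	+gdblocks+1 .. +gdblocks+reserved_gdb	reserved GDT blocks
+  *	input->block_bitmap - start		block bitmap
+  *	input->inode_bitmap - start		inode bitmap
+  *	input->inode_table - start ...		inode table (s_itb_per_group blocks)
+  */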
+ err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); + if (err) + goto exit_bh; + for (i = 0, bit = input->inode_table - start; + i < sbi->s_itb_per_group; i++, bit++) + ext4_set_bit(bit, bh->b_data); + + if ((err = extend_or_restart_transaction(handle, 2, bh))) + goto exit_bh; + + ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, + bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_bh; + } + brelse(bh); + /* Mark unused entries in inode bitmap used */ + ext4_debug("clear inode bitmap %#04llx (+%llu)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); +exit_bh: + brelse(bh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext4_fsblk_t blk = primary->b_blocknr; + const ext4_group_t end = EXT4_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != + grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ + ext4_warning(sb, "reserved GDT %llu" + " missing grp %d (%llu)", + blk, grp, + grp * + (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + blk); + return -EINVAL; + } + if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. 
+ * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext4_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext4_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext4_warning(sb, "new group %u GDT block %llu not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) + goto exit_dind; + + err = ext4_journal_get_write_access(handle, *primary); + if (unlikely(err)) + goto exit_sbh; + + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); + + /* ext4_reserve_inode_write() gets a reference on the iloc */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). 
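+ *
+ * As a worked example (typical values assumed: 4k blocks and 32-byte
+ * descriptors, so EXT4_DESC_PER_BLOCK(sb) == 128 and
+ * EXT4_ADDR_PER_BLOCK(sb) == 1024): adding group 128 gives
+ * gdb_num == 1, the new primary GDT block sits at
+ * s_sbh->b_blocknr + 2, and the entry cleared below is data[1] of
+ * the resize inode's double-indirect block.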
+ */ + data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + memset((*primary)->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, *primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); + + return err; + +exit_inode: + /* ext4_handle_release_buffer(handle, iloc.bh); */ + brelse(iloc.bh); +exit_dindj: + /* ext4_handle_release_buffer(handle, dind); */ +exit_sbh: + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext4_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. 
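+ *
+ * What the final loop below boils down to for each reserved primary
+ * GDT block (illustrative), with blk being the first block of the
+ * new group:
+ *
+ *	data = (__le32 *)primary[i]->b_data;
+ *	data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+ *
+ * i.e. entry "gdbackups" of each reserved primary GDT block records
+ * where that block's backup copy lives inside the new group.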
+ */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext4_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext4_iloc iloc; + ext4_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext4_warning(sb, "reserved block %llu" + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext4_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext4_handle_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext4 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. 
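+ *
+ * For reference (illustrative, as produced by ext4_list_backups()):
+ * with the sparse_super feature the backup groups visited are
+ *
+ *	1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, ...
+ *
+ * i.e. the merged increasing sequence of powers of 3, 5 and 7;
+ * without sparse_super every group 1, 2, 3, ... receives a copy.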
+ */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const ext4_group_t last = sbi->s_groups_count; + const int bpg = EXT4_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + ext4_group_t group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && + ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && + (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext4_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext4_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); + brelse(bh); + } + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext4_warning(sb, "can't update backup for group %u (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT4_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext4_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext4_warning(sb, "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, "inodes_count overflow"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT4_HAS_COMPAT_FEATURE(sb, + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext4_journal_start_sb(sb, + ext4_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + ext4_warning(sb, "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext4_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * We do not lock all allocations via s_resize_lock + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. 
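+ *
+ * Illustrative summary of that ordering as implemented below:
+ *
+ *	1. fill in the new group descriptor and mballoc group info;
+ *	2. grow ext4_blocks_count() and s_inodes_count (the totals);
+ *	3. smp_wmb(); sbi->s_groups_count++ (the group becomes visible);
+ *	4. update the free block/inode counters (space becomes usable).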
+ */ + + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)primary->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ + ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ + ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ + ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + ext4_blocks_count_set(es, ext4_blocks_count(es) + + input->blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold s_resize_lock + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold s_resize_lock over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + + err = ext4_handle_dirty_metadata(handle, NULL, primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_journal; + } + + /* Update the reserved block counts only once the new group is + * active. 
*/ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb)); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, input->group); + atomic_add(input->free_blocks_count, + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_INODES_PER_GROUP(sb), + &sbi->s_flex_groups[flex_group].free_inodes); + } + + ext4_handle_dirty_super(handle, sb); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext4_group_add */ + +/* + * Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext4_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count) +{ + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_grpblk_t add; + struct buffer_head *bh; + handle_t *handle; + int err; + ext4_group_t group; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking the s_resize_lock below. */ + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT4-fs: filesystem on %s:" + " too large to resize to %llu blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext4_warning(sb, "CONFIG_LBDAF not enabled"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. 
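+ *
+ * For example (illustrative numbers, assuming 32768 blocks per
+ * group): if o_blocks_count ends 1000 blocks into the last group,
+ * "last" is 1000 and at most add = 32768 - 1000 = 31768 blocks can
+ * be added here; growing past the end of this group needs
+ * ext4_group_add() instead.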
*/ + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + + if (last == 0) { + ext4_warning(sb, "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT4_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_free_blocks(). + */ + handle = ext4_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + goto exit_put; + } + + mutex_lock(&EXT4_SB(sb)->s_resize_lock); + if (o_blocks_count != ext4_blocks_count(es)) { + ext4_warning(sb, "multiple resizers run on filesystem!"); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); + ext4_journal_stop(handle); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh))) { + ext4_warning(sb, "error %d on journal write access", err); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); + ext4_journal_stop(handle); + goto exit_put; + } + ext4_blocks_count_set(es, o_blocks_count + add); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + if ((err = ext4_journal_stop(handle))) + goto exit_put; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", + ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); +exit_put: + return err; +} /* ext4_group_extend */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c new file mode 100644 index 00000000..113b1076 --- /dev/null +++ b/fs/ext4/super.c @@ -0,0 +1,5027 @@ +/* + * linux/fs/ext4/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "mballoc.h" + +#define CREATE_TRACE_POINTS +#include + +static struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; + +static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); +static int ext4_commit_super(struct super_block *sb, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_sync_fs(struct super_block *sb, int wait); +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static int ext4_unfreeze(struct super_block *sb); +static void ext4_write_super(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#else +#define IS_EXT3_SB(sb) (0) +#endif + +ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_block_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_table_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +__u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + +void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_table_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); +} + +void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = 
handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + * + * To avoid j_barrier hold in userspace when a user calls freeze(), + * ext4 prevents a new handle from being started by s_frozen, which + * is in an upper layer. + */ +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + handle_t *handle; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + journal = EXT4_SB(sb)->s_journal; + handle = ext4_journal_current_handle(); + + /* + * If a handle has been started, it should be allowed to + * finish, otherwise deadlock could happen between freeze + * and others(e.g. truncate) due to the restart of the + * journal handle if the filesystem is forzen and active + * handles are not stopped. + */ + if (!handle) + vfs_check_frozen(sb, SB_FREEZE_TRANS); + + if (!journal) + return ext4_get_nojournal(); + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2_journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * jbd2_journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + es->s_error_count 
= cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); +} + +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext4, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the jbd2_journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + */ + +static void ext4_handle_error(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt(sb, ERRORS_CONT)) { + journal_t *journal = EXT4_SB(sb)->s_journal; + + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (journal) + jbd2_journal_abort(journal, -EIO); + } + if (test_opt(sb, ERRORS_RO)) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); +} + +void __ext4_error(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + save_error_info(sb, function, line); + + ext4_handle_error(sb); +} + +void ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + save_error_info(inode->i_sb, function, line); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +void ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) 
+{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es; + struct inode *inode = file->f_dentry->d_inode; + char pathname[80], *path; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + save_error_info(inode->i_sb, function, line); + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext4_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(sb, function, line); + + ext4_handle_error(sb); +} + +/* + * ext4_abort is a much stronger failure handler than ext4_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + va_list args; + + save_error_info(sb, function, line); + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs panic from previous error\n"); +} + +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + +void __ext4_warning(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); + va_end(args); +} + +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + struct va_format vaf; + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + + if (test_opt(sb, ERRORS_CONT)) { + ext4_commit_super(sb, 0); + return; + } + + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish between the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + +void ext4_update_dynamic_rev(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) + return; + + ext4_warning(sb, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT4_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. 
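+ *
+ * (For reference, the "good old" defaults written just above are the
+ * pre-dynamic-rev layout: first non-reserved inode 11 and a fixed
+ * 128-byte on-disk inode, i.e. EXT4_GOOD_OLD_FIRST_INO and
+ * EXT4_GOOD_OLD_INODE_SIZE.)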
+ */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static int ext4_blkdev_put(struct block_device *bdev) +{ + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext4_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) +{ + struct list_head *l; + + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext4_put_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i, err; + + ext4_unregister_li_request(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + + lock_super(sb); + if (sb->s_dirt) + ext4_commit_super(sb, 1); + + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, "Couldn't clean up the journal"); + } + + del_timer(&sbi->s_err_report); + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + ext4_commit_super(sb, 1); + } + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. 
We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + unlock_super(sb); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache *ext4_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext4_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + ei->jinode = NULL; + INIT_LIST_HEAD(&ei->i_completed_io_list); + spin_lock_init(&ei->i_completed_io_lock); + ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); + atomic_set(&ei->i_aiodio_unwritten, 0); + + return &ei->vfs_inode; +} + +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + +static void ext4_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext4_i_callback); +} + +static void init_once(void *foo) +{ + struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT4_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext4_inode_cachep); +} + +void ext4_clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + end_writeback(inode); + dquot_drop(inode); + ext4_discard_preallocations(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } +} + +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = 
EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + int def_errors; + unsigned long def_mount_opts; + struct super_block *sb = vfs->mnt_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + def_errors = le16_to_cpu(es->s_errors); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%llu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + if (def_errors == EXT4_ERRORS_PANIC || + def_errors == EXT4_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); +#ifdef CONFIG_EXT4_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER)) + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { + seq_printf(seq, ",commit=%u", + (unsigned) (sbi->s_commit_interval / HZ)); + } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_max_batch_time); + } + + /* + * We're changing the default of barrier mount option, so + * let's always display its mount state so it's clear what its + * status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ?
"1" : "0"); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + seq_puts(seq, ",journal_async_commit"); + else if (test_opt(sb, JOURNAL_CHECKSUM)) + seq_puts(seq, ",journal_checksum"); + if (test_opt(sb, I_VERSION)) + seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC) && + !(def_mount_opts & EXT4_DEFM_NODELALLOC)) + seq_puts(seq, ",nodelalloc"); + + if (!test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",nomblk_io_submit"); + if (sbi->s_stripe) + seq_printf(seq, ",stripe=%lu", sbi->s_stripe); + /* + * journal mode get enabled in different ways + * So just print the value even if we didn't specify it + */ + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + seq_puts(seq, ",data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + seq_puts(seq, ",data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + seq_puts(seq, ",data=writeback"); + + if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + seq_printf(seq, ",inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + if (test_opt(sb, NO_AUTO_DA_ALLOC)) + seq_puts(seq, ",noauto_da_alloc"); + + if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) + seq_puts(seq, ",discard"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + + if (test_opt(sb, DIOREAD_NOLOCK)) + seq_puts(seq, ",dioread_nolock"); + + if (test_opt(sb, BLOCK_VALIDITY) && + !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) + seq_puts(seq, ",block_validity"); + + if (!test_opt(sb, INIT_INODE_TABLE)) + seq_puts(seq, ",noinit_itable"); + else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) + seq_printf(seq, ",init_itable=%u", + (unsigned) sbi->s_li_wait_mult); + + ext4_show_quota_options(seq, sb); + + return 0; +} + +static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext4_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. 
+ */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext4_write_dquot(struct dquot *dquot); +static int ext4_acquire_dquot(struct dquot *dquot); +static int ext4_release_dquot(struct dquot *dquot); +static int ext4_mark_dquot_dirty(struct dquot *dquot); +static int ext4_write_info(struct super_block *sb, int type); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext4_quota_off(struct super_block *sb, int type); +static int ext4_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static const struct dquot_operations ext4_quota_operations = { + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops ext4_qctl_operations = { + .quota_on = ext4_quota_on, + .quota_off = ext4_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif + +static const struct super_operations ext4_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .put_super = ext4_put_super, + .sync_fs = ext4_sync_fs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct export_operations ext4_export_ops = { + .fh_to_dentry = ext4_fh_to_dentry, + .fh_to_parent = ext4_fh_to_parent, + .get_parent = ext4_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_auto_da_alloc, 
Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_dev, + Opt_journal_checksum, Opt_journal_async_commit, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_checksum, "journal_checksum"}, + {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_i_version, "i_version"}, + {Opt_stripe, "stripe=%u"}, + {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, + {Opt_err, NULL}, +}; + +static ext4_fsblk_t get_sb_block(void **data) +{ + ext4_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; 
/* Default location */ + + options += 3; + /* TODO: use simple_strtoll with >32bit ext4 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + + return sb_block; +} + +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sb, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + ext4_fsblk_t *n_blocks_count, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qfmt; +#endif + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. 
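+ *
+ * For example, both "barrier" and "barrier=0" match Opt_barrier in
+ * the token table; with the bare form args[0].from stays NULL and the
+ * handler below assumes a value of 1, while "barrier=0" is parsed via
+ * match_int() and clears the option.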
+ */ + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + clear_opt(sb, MINIX_DF); + break; + case Opt_minix_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + set_opt(sb, MINIX_DF); + + break; + case Opt_grpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + set_opt(sb, GRPID); + + break; + case Opt_nogrpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + clear_opt(sb, GRPID); + + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + set_opt(sb, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt(sb, ERRORS_RO); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt(sb, NO_UID32); + break; + case Opt_debug: + set_opt(sb, DEBUG); + break; + case Opt_oldalloc: + set_opt(sb, OLDALLOC); + break; + case Opt_orlov: + clear_opt(sb, OLDALLOC); + break; +#ifdef CONFIG_EXT4_FS_XATTR + case Opt_user_xattr: + set_opt(sb, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sb, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + case Opt_acl: + set_opt(sb, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sb, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); + break; +#endif + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. 
*/ + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return 0; + } + set_opt(sb, UPDATE_JOURNAL); + break; + case Opt_journal_dev: + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_journal_checksum: + set_opt(sb, JOURNAL_CHECKSUM); + break; + case Opt_journal_async_commit: + set_opt(sb, JOURNAL_ASYNC_COMMIT); + set_opt(sb, JOURNAL_CHECKSUM); + break; + case Opt_noload: + set_opt(sb, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; + case Opt_data_journal: + data_opt = EXT4_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT4_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT4_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if (test_opt(sb, DATA_FLAGS) != data_opt) { + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); + return 0; + } + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= data_opt; + } + break; + case Opt_data_err_abort: + set_opt(sb, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sb, DATA_ERR_ABORT); + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + if (!set_qf_name(sb, USRQUOTA, &args[0])) + return 0; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) + return 0; + break; + case Opt_offusrjquota: + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; + case Opt_offgrpjquota: + if (!clear_qf_name(sb, GRPQUOTA)) + return 0; + break; + + case Opt_jqfmt_vfsold: + qfmt = QFMT_VFS_OLD; + goto set_qf_format; + case Opt_jqfmt_vfsv0: + qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; +set_qf_format: + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != qfmt) { + ext4_msg(sb, KERN_ERR, "Cannot change " + "journaled quota options when " + "quota turned on"); + return 0; + } + sbi->s_jquota_fmt = qfmt; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sb, QUOTA); + set_opt(sb, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sb, QUOTA); + set_opt(sb, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return 0; + } + clear_opt(sb, QUOTA); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext4_msg(sb, KERN_ERR, + "quota options not supported"); + break; + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + ext4_msg(sb, KERN_ERR, + "journaled quota options not supported"); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + break; + case Opt_nobarrier: + clear_opt(sb, BARRIER); + break; + case Opt_barrier: + if (args[0].from) { + if 
(match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + set_opt(sb, BARRIER); + else + clear_opt(sb, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + ext4_msg(sb, KERN_ERR, + "resize option only available " + "for remount"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated nobh option"); + break; + case Opt_bh: + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated bh option"); + break; + case Opt_i_version: + set_opt(sb, I_VERSION); + sb->s_flags |= MS_I_VERSION; + break; + case Opt_nodelalloc: + clear_opt(sb, DELALLOC); + break; + case Opt_mblk_io_submit: + set_opt(sb, MBLK_IO_SUBMIT); + break; + case Opt_nomblk_io_submit: + clear_opt(sb, MBLK_IO_SUBMIT); + break; + case Opt_stripe: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_stripe = option; + break; + case Opt_delalloc: + set_opt(sb, DELALLOC); + break; + case Opt_block_validity: + set_opt(sb, BLOCK_VALIDITY); + break; + case Opt_noblock_validity: + clear_opt(sb, BLOCK_VALIDITY); + break; + case Opt_inode_readahead_blks: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > (1 << 30)) + return 0; + if (option && !is_power_of_2(option)) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); + return 0; + } + sbi->s_inode_readahead_blks = option; + break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; + case Opt_noauto_da_alloc: + set_opt(sb, NO_AUTO_DA_ALLOC); + break; + case Opt_auto_da_alloc: + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + clear_opt(sb, NO_AUTO_DA_ALLOC); + else + set_opt(sb,NO_AUTO_DA_ALLOC); + break; + case Opt_discard: + set_opt(sb, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sb, DISCARD); + break; + case Opt_dioread_nolock: + set_opt(sb, DIOREAD_NOLOCK); + break; + case Opt_dioread_lock: + clear_opt(sb, DIOREAD_NOLOCK); + break; + case Opt_init_itable: + set_opt(sb, INIT_INODE_TABLE); + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = EXT4_DEF_LI_WAIT_MULT; + if (option < 0) + return 0; + sbi->s_li_wait_mult = option; + break; + case Opt_noinit_itable: + clear_opt(sb, INIT_INODE_TABLE); + break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sb, USRQUOTA); + + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sb, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "specified with no journaling " + "enabled"); + return 0; + } + } +#endif + return 1; +} + +static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, + int read_only) +{ + struct 
ext4_sb_info *sbi = EXT4_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT4_VALID_FS)) + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext4_update_dynamic_rev(sb); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + + ext4_commit_super(sb, 1); + if (test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT4_BLOCKS_PER_GROUP(sb), + EXT4_INODES_PER_GROUP(sb), + sbi->s_mount_opt, sbi->s_mount_opt2); + + cleancache_init_fs(sb); + return res; +} + +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + unsigned int groups_per_flex = 0; + size_t size; + int i; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + sbi->s_flex_groups = vzalloc(size); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, + "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic_add(ext4_free_blks_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); + } + + return 1; +failed: + return 0; +} + +__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + __u16 crc = 0; + + if (sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int offset = offsetof(struct ext4_group_desc, bg_checksum); + 
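+ /*
+ * The crc16 below covers, in order: the filesystem UUID, the
+ * little-endian group number, and the descriptor itself up to
+ * (but not including) bg_checksum; when the 64BIT feature
+ * enlarges the descriptor past the checksum field, the bytes
+ * after it are folded in as well.
+ */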
__le32 le_group = cpu_to_le32(block_group); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + } + + return cpu_to_le16(crc); +} + +int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && + (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + return 0; + + return 1; +} + +/* Called at mount-time, super-block is locked */ +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext4_fsblk_t last_block; + ext4_fsblk_t block_bitmap; + ext4_fsblk_t inode_bitmap; + ext4_fsblk_t inode_table; + int flexbg_flag = 0; + ext4_group_t i, grp = sbi->s_groups_count; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flexbg_flag = 1; + + ext4_debug("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + + if (i == sbi->s_groups_count - 1 || flexbg_flag) + last_block = ext4_blocks_count(sbi->s_es) - 1; + else + last_block = first_block + + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + + block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap < first_block || block_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u not in group " + "(block %llu)!", i, block_bitmap); + return 0; + } + inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap < first_block || inode_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u not in group " + "(block %llu)!", i, inode_bitmap); + return 0; + } + inode_table = ext4_inode_table(sb, gdp); + if (inode_table < first_block || + inode_table + sbi->s_itb_per_group - 1 > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u not in group " + "(block %llu)!", i, inode_table); + return 0; + } + ext4_lock_group(sb, i); + if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); + if (!(sb->s_flags & MS_RDONLY)) { + ext4_unlock_group(sb, i); + return 0; + } + } + ext4_unlock_group(sb, i); + if (!flexbg_flag) + first_block += EXT4_BLOCKS_PER_GROUP(sb); + } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; + + ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); + return 1; +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. 
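+ * (The on-disk list head is es->s_last_orphan; each inode's link to
+ * the next orphan is stashed in what is normally its i_dtime field,
+ * which is what the NEXT_ORPHAN() helper above reads.)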
We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + if (ret < 0) + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + ext4_truncate(inode); + nr_truncates++; + } else { + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal extent format file size. + * Resulting logical blkno at s_maxbytes must fit in our on-disk + * extent format containers, within a sector_t, and within i_blocks + * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, + * so that won't be a limiting factor. + * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * + * Note, this does *not* consider any metadata overhead for vfs i_blocks. + */ +static loff_t ext4_max_size(int blkbits, int has_huge_files) +{ + loff_t res; + loff_t upper_limit = MAX_LFS_FILESIZE; + + /* small i_blocks in vfs inode? */ + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * CONFIG_LBDAF is not enabled implies the inode + * i_block represent total blocks in 512 bytes + * 32 == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (blkbits - 9); + upper_limit <<= blkbits; + } + + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; + res <<= blkbits; + + /* Sanity check against vm- & vfs- imposed limits */ + if (res > upper_limit) + res = upper_limit; + + return res; +} + +/* + * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^48 sector limit. + */ +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) +{ + loff_t res = EXT4_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. 
+ */ + + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * !has_huge_files or CONFIG_LBDAF not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + } else { + /* + * We use 48 bit ext4_inode i_blocks + * With EXT4_HUGE_FILE_FL set the i_blocks + * represent total number of blocks in + * file system block size + */ + upper_limit = (1LL << 48) - 1; + + } + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* triple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext4_fsblk_t descriptor_loc(struct super_block *sb, + ext4_fsblk_t logical_sb_block, int nr) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return logical_sb_block + nr + 1; + bg = sbi->s_desc_per_block * nr; + if (ext4_bg_has_super(sb, bg)) + has_super = 1; + + return (has_super + ext4_group_first_block_no(sb, bg)); +} + +/** + * ext4_get_stripe_size: Get the stripe size. + * @sbi: In memory super block info + * + * If we have specified it via mount option, then use the mount option + * value. If the value specified at mount time is greater than the blocks + * per group, use the super block value. + * If the super block value is also greater than blocks per group, return 0. + * The allocator needs it to be less than blocks per group.
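+ *
+ * For example, with 32768 blocks per group and a RAID set whose
+ * superblock reports s_raid_stripe_width == 256, a mount without an
+ * explicit "stripe=" option ends up using a 256-block stripe.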
+ * + */ +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) +{ + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); + unsigned long stripe_width = + le32_to_cpu(sbi->s_es->s_raid_stripe_width); + + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) + return sbi->s_stripe; + + if (stripe_width <= sbi->s_blocks_per_group) + return stripe_width; + + if (stride <= sbi->s_blocks_per_group) + return stride; + + return 0; +} + +/* sysfs support */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + int offset; +}; + +static int parse_strtoul(const char *buf, + unsigned long max, unsigned long *value) +{ + char *endp; + + *value = simple_strtoul(skip_spaces(buf), &endp, 0); + endp = skip_spaces(endp); + if (*endp || *value > max) + return -EINVAL; + + return 0; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t extent_cache_hits_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); +} + +static ssize_t extent_cache_misses_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + + if (parse_strtoul(buf, 0x40000000, &t)) + return -EINVAL; + + if (t && !is_power_of_2(t)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned long t; + + if (parse_strtoul(buf, 0xffffffff, &t)) + return -EINVAL; + *ui = t; + return count; +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct ext4_sb_info, _elname), \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + 
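+/*
+ * For illustration, EXT4_ATTR_OFFSET(inode_readahead_blks, 0644,
+ * sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks)
+ * below expands to a "struct ext4_attr ext4_attr_inode_readahead_blks"
+ * whose ->offset records the position of s_inode_readahead_blks inside
+ * struct ext4_sb_info, which is how the generic sbi_ui_show() helper
+ * above finds the field without a dedicated show routine.
+ */
+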
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RO_ATTR(extent_cache_hits); +EXT4_RO_ATTR(extent_cache_misses); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(extent_cache_hits), + ATTR_LIST(extent_cache_misses), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. 
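+ *
+ * Roughly: an unsupported INCOMPAT feature refuses the mount outright,
+ * while an unsupported RO_COMPAT feature (or huge_file on a kernel
+ * whose block counts are only 32 bits wide) only refuses a read-write
+ * mount; a read-only mount is still allowed in those cases.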
+ */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + return 1; +} + +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + ext4_msg(sb, KERN_NOTICE, "error count: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group == ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 
0 : 1); + if (elr->lr_timeout == 0) { + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request structure. Should be called with li_list_mtx held + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); + return; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); +} + +static struct task_struct *ext4_lazyinit_task; + +/* + * This is the function where ext4lazyinit thread lives. It walks + * through the request list searching for next scheduled filesystem. + * When such a fs is found, run the lazy initialization request + * (ext4_rn_li_request) and keep track of the time spend in this + * function. Based on that time we compute next schedule time of + * the request. When walking through the list is complete, compute + * next waking time and put itself into sleep. + */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + if (freezing(current)) + refrigerator(); + + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { + cond_resched(); + continue; + } + + schedule_timeout_interruptible(next_wakeup - cur); + + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * new one. 
+ */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); + ext4_clear_request_list(); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + return 0; +} + +/* + * Check whether it make sense to run itable init. thread or not. + * If there is at least one uninitialized inode table, return + * corresponding group number, else the loop goes through all + * groups and return total number of groups. + */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + unsigned long rnd; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + get_random_bytes(&rnd, sizeof(rnd)); + elr->lr_next_sched = jiffies + (unsigned long)rnd % + (EXT4_DEF_LI_MAX_START_DELAY * HZ); + + return elr; +} + +static int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. 
+ */ + sbi->s_li_request->lr_timeout = 0; + return 0; + } + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) + return 0; + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) + return -ENOMEM; + + mutex_lock(&ext4_li_mtx); + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. + */ + if (!ext4_li_info || !ext4_lazyinit_task) + return; + + kthread_stop(ext4_lazyinit_task); +} + +static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + __acquires(kernel_lock) +{ + char *orig_data = kstrdup(data, GFP_KERNEL); + struct buffer_head *bh; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi; + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + char *cp; + const char *descr; + int ret = -ENOMEM; + int blocksize; + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files; + __u64 blocks_count; + int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out_free_orig; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto out_free_orig; + } + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT4_DEF_RESUID; + sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sb_block = sb_block; + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + + ret = -EINVAL; + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. 
+ */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logical_sb_block))) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) + goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); + if (def_mount_opts & EXT4_DEFM_DEBUG) + set_opt(sb, DEBUG); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { + ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", + "2.6.38"); + set_opt(sb, GRPID); + } + if (def_mount_opts & EXT4_DEFM_UID16) + set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ +#ifdef CONFIG_EXT4_FS_XATTR + set_opt(sb, XATTR_USER); +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + set_opt(sb, POSIX_ACL); +#endif + set_opt(sb, MBLK_IO_SUBMIT); + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) + set_opt(sb, JOURNAL_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) + set_opt(sb, ORDERED_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) + set_opt(sb, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + set_opt(sb, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + set_opt(sb, ERRORS_CONT); + else + set_opt(sb, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sb, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sb, DISCARD); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sb, BARRIER); + + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + if (!IS_EXT3_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) + set_opt(sb, DELALLOC); + + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, NULL, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && + (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + goto failed_mount; + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); + goto failed_mount; + } + + if (sb->s_blocksize != blocksize) { + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + goto failed_mount; + } + + brelse(bh); + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = sb_bread(sb, logical_sb_block); + if (!bh) { + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); + goto failed_mount; + } + } + + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + } + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + goto failed_mount; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 
0) + goto cantfind_ext4; + + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext4; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } + + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + if (sizeof(sector_t) < 8) + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; + goto failed_mount; + } + + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. 
+ */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + goto failed_mount; + } + +#ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); +#endif + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + goto failed_mount2; + } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount2; + } + + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + + sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; + + /* + * set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + sb->s_export_op = &ext4_export_ops; + sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext4_qctl_operations; + sb->dq_op = &ext4_quota_operations; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + + if 
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext4_load_journal(sb, es, journal_devnum)) + goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); + goto failed_mount_wq; + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; + } + + if (ext4_blocks_count(es) > 0xffffffffULL && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto failed_mount_wq; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + set_opt(sb, ORDERED_DATA); + else + set_opt(sb, JOURNAL_DATA); + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto failed_mount_wq; + } + default: + break; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); + +no_journal: + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->dio_unwritten_wq = + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. 
+ */ + + root = ext4_iget(sb, EXT4_ROOT_INO); + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + ret = PTR_ERR(root); + root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + goto failed_mount4; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); + ret = -ENOMEM; + goto failed_mount4; + } + + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); + } + + if (test_opt(sb, DELALLOC) && + (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " + "requested data journaling mode"); + clear_opt(sb, DELALLOC); + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - requested data journaling mode"); + clear_opt(sb, DIOREAD_NOLOCK); + } + if (sb->s_blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - block size is too small"); + clear_opt(sb, DIOREAD_NOLOCK); + } + } + + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount4; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount4; + + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) { + ext4_mb_release(sb); + ext4_ext_release(sb); + goto failed_mount4; + }; + + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); + EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + if (needs_recovery) { + ext4_msg(sb, KERN_INFO, "recovery complete"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? 
"; " : "", orig_data); + + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + + kfree(orig_data); + return 0; + +cantfind_ext4: + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + +failed_mount4: + iput(root); + sb->s_root = NULL; + ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: + ext4_release_system_zone(sb); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } +failed_mount3: + del_timer(&sbi->s_err_report); + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); + } +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext4_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +out_free_orig: + kfree(orig_data); + return ret; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; + + write_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + write_unlock(&journal->j_state_lock); +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. 
*/ + + journal_inode = ext4_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext4_msg(sb, KERN_ERR, "no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = jbd2_journal_init_inode(journal_inode); + if (!journal) { + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext4_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head *bh; + journal_t *journal; + ext4_fsblk_t start; + ext4_fsblk_t len; + int hblock, blocksize; + ext4_fsblk_t sb_block; + unsigned long offset; + struct ext4_super_block *es; + struct block_device *bdev; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + bdev = ext4_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; + offset = EXT4_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "bad superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = ext4_blocks_count(es); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = jbd2_journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + ext4_msg(sb, KERN_ERR, "failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT4_SB(sb)->journal_bdev = bdev; + ext4_init_journal_params(sb, journal); + return journal; + +out_journal: + jbd2_journal_destroy(journal); +out_bdev: + ext4_blkdev_put(bdev); + return NULL; +} + +static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + if (journal_devnum && + journal_devnum != 
le32_to_cpu(es->s_journal_dev)) { + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); + if (really_read_only) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext4_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext4_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JBD2_BARRIER)) + ext4_msg(sb, KERN_INFO, "barriers disabled"); + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = jbd2_journal_update_format(journal); + if (err) { + ext4_msg(sb, KERN_ERR, "error updating journal"); + jbd2_journal_destroy(journal); + return err; + } + } + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + err = jbd2_journal_wipe(journal, !really_read_only); + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); + err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } + + if (err) { + ext4_msg(sb, KERN_ERR, "error loading journal"); + jbd2_journal_destroy(journal); + return err; + } + + EXT4_SB(sb)->s_journal = journal; + ext4_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb, 1); + } + + return 0; +} + +static int ext4_commit_super(struct super_block *sb, int sync) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; + + if (!sbh) + return error; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. 
This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + sb->s_dirt = 0; + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } + jbd2_journal_lock_updates(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + } + +out: + jbd2_journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + journal = EXT4_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext4_error() or ext4_abort() + */ + + j_errno = jbd2_journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext4_decode_error(sb, j_errno, nbuf); + ext4_warning(sb, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext4_warning(sb, "Marking fs in need of filesystem check."); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, 1); + + jbd2_journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. 
+ */ +int ext4_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret = 0; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + if (journal) { + vfs_check_frozen(sb, SB_FREEZE_TRANS); + ret = ext4_journal_force_commit(journal); + } + + return ret; +} + +static void ext4_write_super(struct super_block *sb) +{ + lock_super(sb); + ext4_commit_super(sb, 1); + unlock_super(sb); +} + +static int ext4_sync_fs(struct super_block *sb, int wait) +{ + int ret = 0; + tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(sbi->s_journal, target); + } + return ret; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently, because ext4 prevents a new handle from being started + * by @sb->s_frozen, which stays in an upper layer. It thus needs help from + * the upper layer. + */ +static int ext4_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); +out: + /* we rely on s_frozen to stop further updates */ + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext4_unfreeze(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. 
*/ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); + return 0; +} + +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +static int ext4_remount(struct super_block *sb, int *flags, char *data) +{ + struct ext4_super_block *es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + int enable_quota = 0; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + int err = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + char *orig_data = kstrdup(data, GFP_KERNEL); + + /* Store the original options */ + lock_super(sb); + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_abort(sb, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > ext4_blocks_count(es)) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && + (sbi->s_mount_state & EXT4_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + if (sbi->s_journal) + ext4_mark_recovery_complete(sb, es); + } else { + /* Make sure we can mount this feature set readwrite */ + if (!ext4_feature_set_ok(sb, 0)) { + err = -EROFS; + goto restore_opts; + } + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to remount r/w. 
+ */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + ext4_msg(sb, KERN_WARNING, "Couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead"); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext4_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } + enable_quota = 1; + } + } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + + ext4_setup_system_zone(sb); + if (sbi->s_journal == NULL) + ext4_commit_super(sb, 1); + +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif + unlock_super(sb); + if (enable_quota) + dquot_resume(sb, -1); + + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); + return 0; + +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + unlock_super(sb); + kfree(orig_data); + return err; +} + +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + u64 fsid; + s64 bfree; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. 
+ */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext4_bg_has_super(sb, i) + + ext4_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = ext4_blocks_count(es); + } + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; + bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = max_t(s64, bfree, 0); + buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); + if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT4_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext4_create() quota_sync() + * jbd2_journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) jbd2_journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext4_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext4_journal_start(inode, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? 
*/ + if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext4_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext4_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext4_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +static int ext4_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) + sync_filesystem(sb); + + if (!inode) + goto out; + + /* Update modification times of quota files when userspace can + * start looking at them */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out: + return dquot_quota_off(sb, type); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? 
+ sb->s_blocksize - offset : toread; + bh = ext4_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (EXT4_SB(sb)->s_journal && !handle) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); +out: + if (err) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + } + mutex_unlock(&inode->i_mutex); + return len; +} + +#endif + +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); +} + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +MODULE_ALIAS("ext2"); +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if 
(EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +MODULE_ALIAS("ext3"); +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init ext4_init_feat_adverts(void) +{ + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; + + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +static int __init ext4_init_fs(void) +{ + int i, err; + + ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + + err = ext4_init_pageio(); + if (err) + return err; + err = ext4_init_system_zone(); + if (err) + goto out7; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) + goto out6; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + if (!ext4_proc_root) + goto out5; + + err = ext4_init_feat_adverts(); + if (err) + goto out4; + + err = ext4_init_mballoc(); + if (err) + goto out3; + + err = ext4_init_xattr(); + if (err) + goto out2; + err = init_inodecache(); + if (err) + goto out1; + register_as_ext3(); + register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); + return 0; +out: + unregister_as_ext2(); + unregister_as_ext3(); + destroy_inodecache(); +out1: + ext4_exit_xattr(); +out2: + ext4_exit_mballoc(); +out3: + ext4_exit_feat_adverts(); +out4: + remove_proc_entry("fs/ext4", NULL); +out5: + kset_unregister(ext4_kset); +out6: + ext4_exit_system_zone(); +out7: + ext4_exit_pageio(); + return err; +} + +static void __exit ext4_exit_fs(void) +{ + ext4_destroy_lazyinit_thread(); + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + ext4_exit_xattr(); + ext4_exit_mballoc(); + ext4_exit_feat_adverts(); + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); + ext4_exit_system_zone(); + ext4_exit_pageio(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); +MODULE_LICENSE("GPL"); +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c new file mode 100644 index 00000000..ed9354af --- /dev/null +++ b/fs/ext4/symlink.c @@ -0,0 +1,56 @@ +/* + * linux/fs/ext4/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
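ext4_init_fs() above is a textbook case of staged initialisation with reverse-order unwinding: every setup step gets its own error label, and a failure tears down only what was already built. (As shown, the kset_create_and_add() and proc_mkdir() failure paths reach the unwind chain with err still holding zero, so those failures would not be reported to the module loader.) A distilled, self-contained sketch of the idiom; step_a/b/c are stand-in names for steps such as pageio, mballoc and the inode cache, not real functions:

    /* illustrative only: each step is a trivial stub here */
    static int step_a_init(void) { return 0; }
    static void step_a_exit(void) { }
    static int step_b_init(void) { return 0; }
    static void step_b_exit(void) { }
    static int step_c_init(void) { return 0; }

    static int example_init(void)
    {
            int err;

            err = step_a_init();
            if (err)
                    return err;
            err = step_b_init();
            if (err)
                    goto out_a;
            err = step_c_init();
            if (err)
                    goto out_b;
            return 0;

    out_b:
            step_b_exit();          /* undo in reverse order of setup */
    out_a:
            step_a_exit();
            return err;
    }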
AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 symlink handling code + */ + +#include +#include +#include +#include "ext4.h" +#include "xattr.h" + +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); + nd_set_link(nd, (char *) ei->i_data); + return NULL; +} + +const struct inode_operations ext4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext4_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext4_follow_link, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c new file mode 100644 index 00000000..c2865cc3 --- /dev/null +++ b/fs/ext4/xattr.c @@ -0,0 +1,1612 @@ +/* + * linux/fs/ext4/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext4 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. 
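A minimal user-space sketch of the layout just described (descriptors grow downwards, values grow upwards, and the descriptor list ends with four zero bytes). The struct and macro names here (xentry, XLEN, XNEXT, XLAST) are illustrative stand-ins, not the kernel's definitions, and the descriptor only loosely mirrors struct ext4_xattr_entry:

    #include <stdint.h>
    #include <stddef.h>

    struct xentry {                     /* simplified entry descriptor */
            uint8_t  name_len;
            uint8_t  name_index;
            uint16_t value_offs;        /* value location, from block start */
            uint32_t value_block;       /* unused: the value is in this block */
            uint32_t value_size;
            uint32_t hash;
            char     name[];            /* not NUL-terminated */
    };

    /* descriptors are 4-byte aligned; the list ends with four zero bytes */
    #define XPAD       4
    #define XLEN(nlen) ((sizeof(struct xentry) + (nlen) + XPAD - 1) & ~(XPAD - 1))
    #define XNEXT(e)   ((struct xentry *)((char *)(e) + XLEN((e)->name_len)))
    #define XLAST(e)   (*(const uint32_t *)(e) == 0)

    /* walk every entry in a block; each value sits at value_offs */
    static void walk(void *block, struct xentry *first)
    {
            for (struct xentry *e = first; !XLAST(e); e = XNEXT(e)) {
                    char *value = (char *)block + e->value_offs;
                    (void)value;        /* value_size bytes of payload */
            }
    }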
+ */ + +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT4_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext4_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); +static int ext4_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext4_xattr_cache; + +static const struct xattr_handler *ext4_xattr_handler_map[] = { + [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, +#endif + [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext4_xattr_handlers[] = { + &ext4_xattr_user_handler, + &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + &ext4_xattr_acl_access_handler, + &ext4_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT4_FS_SECURITY + &ext4_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext4_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) + handler = ext4_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext4_xattr_list(dentry, buffer, size); +} + +static int +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext4_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext4_xattr_entry *entry; + size_t name_len; + int 
cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext4_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
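The calling convention the comment describes is a two-pass one: call with a NULL buffer to learn the size, allocate, then call again to copy the value. A sketch of a hypothetical in-kernel caller; the attribute name "example" and the GFP_NOFS allocation are illustrative assumptions, not something this patch defines:

    /* hypothetical caller, not part of this patch */
    static int example_read_user_xattr(struct inode *inode)
    {
            char *buf;
            int size, err;

            size = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,
                                  "example", NULL, 0);
            if (size < 0)
                    return size;            /* -ENODATA, -EIO, ... */

            buf = kmalloc(size, GFP_NOFS);
            if (!buf)
                    return -ENOMEM;

            err = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,
                                 "example", buf, size);
            kfree(buf);
            return err < 0 ? err : 0;
    }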
+ */ +int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT4_I(inode)->xattr_sem); + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +static int +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext4_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext4_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret, ret2; + + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; + } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + return ret; +} + +/* + * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. 
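Each handler's ->list callback packs a prefixed, NUL-terminated attribute name into the caller's buffer and returns the length it needs, which is how the two *_list helpers above concatenate the in-inode and block entries. A stand-alone sketch of that packing scheme; the names "user.comment" and "user.origin" are made up for the demonstration:

    #include <stdio.h>
    #include <string.h>

    /* append prefix + name + '\0' to buf, returning the bytes needed */
    static size_t emit_name(char *buf, size_t rest,
                            const char *prefix, const char *name)
    {
            size_t plen = strlen(prefix), nlen = strlen(name);
            size_t total = plen + nlen + 1;

            if (buf && total <= rest) {
                    memcpy(buf, prefix, plen);
                    memcpy(buf + plen, name, nlen);
                    buf[plen + nlen] = '\0';
            }
            return total;
    }

    int main(void)
    {
            char buf[64];
            size_t used = 0;

            used += emit_name(buf + used, sizeof(buf) - used, "user.", "comment");
            used += emit_name(buf + used, sizeof(buf) - used, "user.", "origin");
            printf("listxattr buffer uses %zu bytes\n", used);
            return 0;
    }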
+ */ +static void ext4_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + ext4_handle_dirty_super(handle, sb); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + get_bh(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + dquot_free_block(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } +out: + ext4_std_error(inode->i_sb, error); + return; +} + +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. + */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + *total += EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +static int +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +{ + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT4_XATTR_SIZE(i->value_len) || + free < EXT4_XATTR_LEN(name_len) + + EXT4_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. 
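The space check in ext4_xattr_set_entry() reduces to simple arithmetic: the gap between the end of the descriptor list (plus its four-byte terminator) and the lowest value offset must hold both the new padded descriptor and the new padded value. A small worked example with invented sizes, assuming a 4 KB block, a 32-byte block header and a 16-byte descriptor before the name:

    #include <stdio.h>

    #define PAD(x)  (((x) + 3) & ~3u)               /* 4-byte alignment */
    #define DESC_SZ 16                              /* descriptor without name */

    int main(void)
    {
            unsigned int entries_end = 32 + PAD(DESC_SZ + 7); /* header + 1 entry */
            unsigned int min_offs = 4096 - PAD(29);           /* one 29-byte value */
            unsigned int avail = min_offs - entries_end - 4;  /* 4: terminator */

            unsigned int name_len = 9, value_len = 200;
            unsigned int need = PAD(DESC_SZ + name_len) + PAD(value_len);

            printf("avail=%u need=%u -> %s\n", avail, need,
                   need <= avail ? "fits" : "-ENOSPC");
            return 0;
    }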
*/ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT4_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext4_xattr_block_find { + struct ext4_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT4_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext4_xattr_check_block(bs->bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. 
*/ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); + ext4_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + ext4_handle_release_buffer(handle, bs->bh); + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext4_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. 
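ext4_xattr_block_set() only edits the EA block in place when h_refcount is 1; a shared block is cloned into private memory, modified there, and the inode is later pointed at another block while the old one simply loses a reference. A schematic of that copy-on-write decision with simplified types standing in for buffer heads; struct ea_block and get_writable() are invented for the sketch:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct ea_block {
            uint32_t refcount;
            char     data[4096];
    };

    /* return a block we may modify; *cloned is set when we had to copy */
    static struct ea_block *get_writable(struct ea_block *blk, int *cloned)
    {
            struct ea_block *copy;

            if (blk->refcount == 1) {       /* exclusive: edit in place */
                    *cloned = 0;
                    return blk;
            }

            copy = malloc(sizeof(*copy));   /* shared: clone before editing */
            if (!copy)
                    return NULL;
            memcpy(copy, blk, sizeof(*copy));
            copy->refcount = 1;
            *cloned = 1;
            /* the old block keeps its data; one reference on it is dropped
             * only after the inode has been switched to the new block */
            return copy;
    }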
*/ + error = dquot_alloc_block(inode, 1); + if (error) + goto cleanup; + error = ext4_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); + if (error) + goto cleanup; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext4_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext4_xattr_cache_insert(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext4_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, 1); + goto cleanup; + +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +static int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + error = ext4_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. 
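The "inserted:" path above is block-level deduplication: the candidate block's hash is looked up in the cache and, when a byte-identical block already exists on disk, its reference count is bumped instead of allocating a fresh block. A toy version of that lookup; the linear-scan cache here is invented, whereas the kernel keys an mb_cache by the block hash:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    struct cached_block {
            uint32_t hash;
            uint32_t refcount;
            char     data[4096];
    };

    /* find a reusable identical block, bumping its refcount on success */
    static struct cached_block *
    dedup_find(struct cached_block *cache, size_t n,
               const char *data, uint32_t hash)
    {
            size_t i;

            for (i = 0; i < n; i++) {
                    if (cache[i].hash != hash)
                            continue;               /* cheap reject */
                    if (memcmp(cache[i].data, data, sizeof(cache[i].data)))
                            continue;               /* hash collision */
                    cache[i].refcount++;            /* share this block */
                    return &cache[i];
            }
            return NULL;                            /* caller allocates anew */
    }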
*/ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +/* + * ext4_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext4_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + unsigned long no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT4_I(inode)->xattr_sem); + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + } + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = 
ext4_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = ext4_current_time(inode); + if (!value) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + if (no_expand == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + +/* + * ext4_xattr_set() + * + * Like ext4_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext4_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. + */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n, int blocksize) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) + > blocksize); + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry, *last, *first; + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t min_offs, free; + int total_ino, total_blk; + void *base, *start, *end; + int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + + down_write(&EXT4_I(inode)->xattr_sem); +retry: + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + } + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. 
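ext4_xattr_set() above wraps the handle-based variant in its own transaction and retries when the allocator reports -ENOSPC, a common ext4 idiom. A compact sketch of that pattern; some_journalled_operation() is a placeholder for whatever work needs the handle, not a real function:

    /* placeholder for the journalled work done under the handle */
    static int some_journalled_operation(handle_t *handle, struct inode *inode);

    static int do_with_retries(struct inode *inode)
    {
            handle_t *handle;
            int error, error2, retries = 0;

    retry:
            handle = ext4_journal_start(inode,
                                        EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
            if (IS_ERR(handle))
                    return PTR_ERR(handle);

            error = some_journalled_operation(handle, inode);
            error2 = ext4_journal_stop(handle);

            /* a failed allocation may succeed after the journal commits */
            if (error == -ENOSPC &&
                ext4_should_retry_alloc(inode->i_sb, &retries))
                    goto retry;
            return error ? error : error2;
    }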
+ */ + + base = start = entry; + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + min_offs = end - base; + last = entry; + total_ino = sizeof(struct ext4_xattr_ibody_header); + + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (free >= new_extra_isize) { + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, + inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + error = 0; + goto cleanup; + } + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. + */ + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + if (ext4_xattr_check_block(bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + base = BHDR(bh); + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + free = ext4_xattr_free_space(first, &min_offs, base, + &total_blk); + if (free < new_extra_isize) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } else { + free = inode->i_sb->s_blocksize; + } + + while (new_extra_isize > 0) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ + unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + if (!is || !bs) { + error = -ENOMEM; + goto cleanup; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + entry = NULL; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= free && total_size < min_total_size) { + if (total_size < new_extra_isize) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry) { + entry = small_entry; + } else { + if (!tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + error = -1; + goto cleanup; + } + } + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!buffer || !b_entry_name) { + error = -ENOMEM; + goto cleanup; + } + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + offs, + EXT4_XATTR_SIZE(size)); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto cleanup; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto 
cleanup; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; + + entry = IFIRST(header); + if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) + shift_bytes = new_extra_isize; + else + shift_bytes = entry_size + size; + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - + shift_bytes, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + (void *)header, total_ino - entry_size, + inode->i_sb->s_blocksize); + + extra_isize += shift_bytes; + new_extra_isize -= shift_bytes; + EXT4_I(inode)->i_extra_isize = extra_isize; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto cleanup; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto cleanup; + kfree(b_entry_name); + kfree(buffer); + b_entry_name = NULL; + buffer = NULL; + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + } + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + +cleanup: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + + + +/* + * ext4_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext4_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext4_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext4_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext4_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. 
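ext4_xattr_cache_insert() above treats -EBUSY from the cache as success, since another task may already have inserted the same block. The convention is easy to get wrong, so here it is isolated; xcache and xcache_insert() are invented names for the sketch, not the mbcache API:

    #include <errno.h>
    #include <stdint.h>

    struct xcache;                              /* opaque, illustrative */
    int xcache_insert(struct xcache *c, uint64_t block, uint32_t hash);

    /* insert unless an identical entry is already cached */
    static int insert_unless_present(struct xcache *c, uint64_t block,
                                     uint32_t hash)
    {
            int err = xcache_insert(c, block, hash);

            if (err == -EBUSY)
                    return 0;       /* someone else cached it first: fine */
            return err;             /* 0 on success, -ENOMEM propagated */
    }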
+ */ +static int +ext4_xattr_cmp(struct ext4_xattr_header *header1, + struct ext4_xattr_header *header2) +{ + struct ext4_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT4_XATTR_NEXT(entry1); + entry2 = EXT4_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext4_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT4_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT4_XATTR_REFCOUNT_MAX); + } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext4_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n = 0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext4_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. 
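The entry hash computed below is a rotate-and-xor scheme: a 5-bit rotation over the name bytes, then a 16-bit rotation over the value words, and the block hash folds the per-entry hashes together with the same 16-bit shift. A stand-alone version of the name part; the attribute name "selinux" is only sample input, and bytes are treated as unsigned, which matches ASCII names:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NAME_HASH_SHIFT 5

    static uint32_t name_hash(const char *name, size_t len)
    {
            uint32_t hash = 0;
            size_t n;

            for (n = 0; n < len; n++)
                    hash = (hash << NAME_HASH_SHIFT) ^
                           (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
                           (uint8_t)name[n];
            return hash;
    }

    int main(void)
    {
            const char *name = "selinux";       /* sample attribute name */

            printf("name hash = %#x\n", (unsigned int)name_hash(name, strlen(name)));
            return 0;
    }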
+ */ +static void ext4_xattr_rehash(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_entry *here; + __u32 hash = 0; + + ext4_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT4_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +ext4_init_xattr(void) +{ + ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); + if (!ext4_xattr_cache) + return -ENOMEM; + return 0; +} + +void +ext4_exit_xattr(void) +{ + if (ext4_xattr_cache) + mb_cache_destroy(ext4_xattr_cache); + ext4_xattr_cache = NULL; +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h new file mode 100644 index 00000000..25b7387f --- /dev/null +++ b/fs/ext4/xattr.h @@ -0,0 +1,155 @@ +/* + File: fs/ext4/xattr.h + + On-disk format of extended attributes for the ext4 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT4_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT4_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT4_XATTR_INDEX_USER 1 +#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT4_XATTR_INDEX_TRUSTED 4 +#define EXT4_XATTR_INDEX_LUSTRE 5 +#define EXT4_XATTR_INDEX_SECURITY 6 + +struct ext4_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext4_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT4_XATTR_PAD_BITS 2 +#define EXT4_XATTR_PAD (1<e_name_len))) +#define EXT4_XATTR_SIZE(size) \ + (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) + +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + +# ifdef CONFIG_EXT4_FS_XATTR + +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_acl_access_handler; +extern const struct xattr_handler ext4_xattr_acl_default_handler; +extern const struct xattr_handler ext4_xattr_security_handler; + +extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); + +extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern void ext4_xattr_put_super(struct 
super_block *); + +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); + +extern int __init ext4_init_xattr(void); +extern void ext4_exit_xattr(void); + +extern const struct xattr_handler *ext4_xattr_handlers[]; + +# else /* CONFIG_EXT4_FS_XATTR */ + +static inline int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext4_xattr_put_super(struct super_block *sb) +{ +} + +static __init inline int +ext4_init_xattr(void) +{ + return 0; +} + +static inline void +ext4_exit_xattr(void) +{ +} + +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + +#define ext4_xattr_handlers NULL + +# endif /* CONFIG_EXT4_FS_XATTR */ + +#ifdef CONFIG_EXT4_FS_SECURITY +extern int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c new file mode 100644 index 00000000..007c3bfb --- /dev/null +++ b/fs/ext4/xattr_security.c @@ -0,0 +1,78 @@ +/* + * linux/fs/ext4/xattr_security.c + * Handler for storing security labels as extended attributes. 
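The second half of xattr.h, just above, uses the usual compile-out idiom: with CONFIG_EXT4_FS_XATTR disabled, every entry point collapses to a static inline stub returning -EOPNOTSUPP (or doing nothing), so call sites build without any #ifdefs of their own. A distilled, self-contained version of the pattern with an invented feature name:

    #include <errno.h>

    struct myfs_inode;                          /* opaque for this sketch */

    #ifdef CONFIG_MYFS_FEATURE
    int myfs_feature_op(struct myfs_inode *inode, int arg);
    #else
    static inline int myfs_feature_op(struct myfs_inode *inode, int arg)
    {
            return -EOPNOTSUPP;         /* uniform "not supported" answer */
    }
    #endif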
+ */ + +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +const struct xattr_handler ext4_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext4_xattr_security_list, + .get = ext4_xattr_security_get, + .set = ext4_xattr_security_set, +}; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c new file mode 100644 index 00000000..37e6ebca --- /dev/null +++ b/fs/ext4/xattr_trusted.c @@ -0,0 +1,59 @@ +/* + * linux/fs/ext4/xattr_trusted.c + * Handler for trusted extended attributes. 
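ext4_init_security() above expects to run while the inode-creation transaction is still open, so the label supplied by the LSM is written under the same handle as the rest of the create; -EOPNOTSUPP from the security layer is swallowed because "no label" is not an error. A sketch of a hypothetical call site (the real callers live in ext4's create/mkdir paths, which are not part of this file):

    /* hypothetical call site: `handle` comes from the create transaction */
    static int example_create_tail(handle_t *handle, struct inode *inode,
                                   struct inode *dir, const struct qstr *qstr)
    {
            int err;

            err = ext4_init_security(handle, inode, dir, qstr);
            /* -EOPNOTSUPP was already filtered out inside ext4_init_security,
             * so any error here is real and should abort the create */
            return err;
    }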
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext4_xattr_trusted_list, + .get = ext4_xattr_trusted_get, + .set = ext4_xattr_trusted_set, +}; diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c new file mode 100644 index 00000000..98c37535 --- /dev/null +++ b/fs/ext4/xattr_user.c @@ -0,0 +1,62 @@ +/* + * linux/fs/ext4/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext4_xattr_user_list, + .get = ext4_xattr_user_get, + .set = ext4_xattr_user_set, +}; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig new file mode 100644 index 00000000..182f9ffe --- /dev/null +++ b/fs/fat/Kconfig @@ -0,0 +1,100 @@ +config FAT_FS + tristate + select NLS + help + If you want to use one of the FAT-based file systems (the MS-DOS and + VFAT (Windows 95) file systems), then you must say Y or M here + to include FAT support. 
You will then be able to mount partitions or + diskettes with FAT-based file systems and transparently access the + files on them, i.e. MSDOS files will look and behave just like all + other Unix files. + + This FAT support is not a file system in itself, it only provides + the foundation for the other file systems. You will have to say Y or + M to at least one of "MSDOS fs support" or "VFAT fs support" in + order to make use of it. + + Another way to read and write MSDOS floppies and hard drive + partitions from within Linux (but not transparently) is with the + mtools ("man mtools") program suite. You don't need to say Y here in + order to do that. + + If you need to move large files on floppies between a DOS and a + Linux box, say Y here, mount the floppy under Linux with an MSDOS + file system and use GNU tar's M option. GNU tar is a program + available for Unix and DOS ("man tar" or "info tar"). + + The FAT support will enlarge your kernel by about 37 KB. If unsure, + say Y. + + To compile this as a module, choose M here: the module will be called + fat. Note that if you compile the FAT support as a module, you + cannot compile any of the FAT-based file systems into the kernel + -- they will have to be modules as well. + +config MSDOS_FS + tristate "MSDOS fs support" + select FAT_FS + help + This allows you to mount MSDOS partitions of your hard drive (unless + they are compressed; to access compressed MSDOS partitions under + Linux, you can either use the DOS emulator DOSEMU, described in the + DOSEMU-HOWTO, available from + , or try dmsdosfs in + . If you + intend to use dosemu with a non-compressed MSDOS partition, say Y + here) and MSDOS floppies. This means that file access becomes + transparent, i.e. the MSDOS files look and behave just like all + other Unix files. + + If you have Windows 95 or Windows NT installed on your MSDOS + partitions, you should use the VFAT file system (say Y to "VFAT fs + support" below), or you will not be able to see the long filenames + generated by Windows 95 / Windows NT. + + This option will enlarge your kernel by about 7 KB. If unsure, + answer Y. This will only work if you said Y to "DOS FAT fs support" + as well. To compile this as a module, choose M here: the module will + be called msdos. + +config VFAT_FS + tristate "VFAT (Windows-95) fs support" + select FAT_FS + help + This option provides support for normal Windows file systems with + long filenames. That includes non-compressed FAT-based file systems + used by Windows 95, Windows 98, Windows NT 4.0, and the Unix + programs from the mtools package. + + The VFAT support enlarges your kernel by about 10 KB and it only + works if you said Y to the "DOS FAT fs support" above. Please read + the file for details. If + unsure, say Y. + + To compile this as a module, choose M here: the module will be called + vfat. + +config FAT_DEFAULT_CODEPAGE + int "Default codepage for FAT" + depends on MSDOS_FS || VFAT_FS + default 437 + help + This option should be set to the codepage of your FAT filesystems. + It can be overridden with the "codepage" mount option. + See for more information. + +config FAT_DEFAULT_IOCHARSET + string "Default iocharset for FAT" + depends on VFAT_FS + default "iso8859-1" + help + Set this to the default input/output character set you'd + like FAT to use. It should probably match the character set + that most of your FAT filesystems use, and can be overridden + with the "iocharset" mount option for FAT filesystems. + Note that "utf8" is not recommended for FAT filesystems. 
+ If unsure, you shouldn't set "utf8" here. + See for more information. + + Enable any character sets you need in File Systems/Native Language + Support. diff --git a/fs/fat/Makefile b/fs/fat/Makefile new file mode 100644 index 00000000..e0619032 --- /dev/null +++ b/fs/fat/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Linux fat filesystem support. +# + +obj-$(CONFIG_FAT_FS) += fat.o +obj-$(CONFIG_VFAT_FS) += vfat.o +obj-$(CONFIG_MSDOS_FS) += msdos.o + +fat-y := cache.o dir.o fatent.o file.o inode.o misc.o +vfat-y := namei_vfat.o +msdos-y := namei_msdos.o diff --git a/fs/fat/cache.c b/fs/fat/cache.c new file mode 100644 index 00000000..1cc7038e --- /dev/null +++ b/fs/fat/cache.c @@ -0,0 +1,351 @@ +/* + * linux/fs/fat/cache.c + * + * Written 1992,1993 by Werner Almesberger + * + * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead + * of inode number. + * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. + */ + +#include +#include +#include +#include "fat.h" + +/* this must be > 0. */ +#define FAT_MAX_CACHE 8 + +struct fat_cache { + struct list_head cache_list; + int nr_contig; /* number of contiguous clusters */ + int fcluster; /* cluster number in the file. */ + int dcluster; /* cluster number on disk. */ +}; + +struct fat_cache_id { + unsigned int id; + int nr_contig; + int fcluster; + int dcluster; +}; + +static inline int fat_max_cache(struct inode *inode) +{ + return FAT_MAX_CACHE; +} + +static struct kmem_cache *fat_cache_cachep; + +static void init_once(void *foo) +{ + struct fat_cache *cache = (struct fat_cache *)foo; + + INIT_LIST_HEAD(&cache->cache_list); +} + +int __init fat_cache_init(void) +{ + fat_cache_cachep = kmem_cache_create("fat_cache", + sizeof(struct fat_cache), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (fat_cache_cachep == NULL) + return -ENOMEM; + return 0; +} + +void fat_cache_destroy(void) +{ + kmem_cache_destroy(fat_cache_cachep); +} + +static inline struct fat_cache *fat_cache_alloc(struct inode *inode) +{ + return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); +} + +static inline void fat_cache_free(struct fat_cache *cache) +{ + BUG_ON(!list_empty(&cache->cache_list)); + kmem_cache_free(fat_cache_cachep, cache); +} + +static inline void fat_cache_update_lru(struct inode *inode, + struct fat_cache *cache) +{ + if (MSDOS_I(inode)->cache_lru.next != &cache->cache_list) + list_move(&cache->cache_list, &MSDOS_I(inode)->cache_lru); +} + +static int fat_cache_lookup(struct inode *inode, int fclus, + struct fat_cache_id *cid, + int *cached_fclus, int *cached_dclus) +{ + static struct fat_cache nohit = { .fcluster = 0, }; + + struct fat_cache *hit = &nohit, *p; + int offset = -1; + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { + /* Find the cache of "fclus" or nearest cache. 
*/ + if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { + hit = p; + if ((hit->fcluster + hit->nr_contig) < fclus) { + offset = hit->nr_contig; + } else { + offset = fclus - hit->fcluster; + break; + } + } + } + if (hit != &nohit) { + fat_cache_update_lru(inode, hit); + + cid->id = MSDOS_I(inode)->cache_valid_id; + cid->nr_contig = hit->nr_contig; + cid->fcluster = hit->fcluster; + cid->dcluster = hit->dcluster; + *cached_fclus = cid->fcluster + offset; + *cached_dclus = cid->dcluster + offset; + } + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + + return offset; +} + +static struct fat_cache *fat_cache_merge(struct inode *inode, + struct fat_cache_id *new) +{ + struct fat_cache *p; + + list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { + /* Find the same part as "new" in cluster-chain. */ + if (p->fcluster == new->fcluster) { + BUG_ON(p->dcluster != new->dcluster); + if (new->nr_contig > p->nr_contig) + p->nr_contig = new->nr_contig; + return p; + } + } + return NULL; +} + +static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) +{ + struct fat_cache *cache, *tmp; + + if (new->fcluster == -1) /* dummy cache */ + return; + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + if (new->id != FAT_CACHE_VALID && + new->id != MSDOS_I(inode)->cache_valid_id) + goto out; /* this cache was invalidated */ + + cache = fat_cache_merge(inode, new); + if (cache == NULL) { + if (MSDOS_I(inode)->nr_caches < fat_max_cache(inode)) { + MSDOS_I(inode)->nr_caches++; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + + tmp = fat_cache_alloc(inode); + if (!tmp) { + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + MSDOS_I(inode)->nr_caches--; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + return; + } + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + cache = fat_cache_merge(inode, new); + if (cache != NULL) { + MSDOS_I(inode)->nr_caches--; + fat_cache_free(tmp); + goto out_update_lru; + } + cache = tmp; + } else { + struct list_head *p = MSDOS_I(inode)->cache_lru.prev; + cache = list_entry(p, struct fat_cache, cache_list); + } + cache->fcluster = new->fcluster; + cache->dcluster = new->dcluster; + cache->nr_contig = new->nr_contig; + } +out_update_lru: + fat_cache_update_lru(inode, cache); +out: + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); +} + +/* + * Cache invalidation occurs rarely, thus the LRU chain is not updated. It + * fixes itself after a while. + */ +static void __fat_cache_inval_inode(struct inode *inode) +{ + struct msdos_inode_info *i = MSDOS_I(inode); + struct fat_cache *cache; + + while (!list_empty(&i->cache_lru)) { + cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list); + list_del_init(&cache->cache_list); + i->nr_caches--; + fat_cache_free(cache); + } + /* Update. The copy of caches before this id is discarded. 
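Each fat_cache entry above caches one contiguous run of clusters: file cluster fcluster maps to disk cluster dcluster, and the next nr_contig file clusters follow consecutively on disk. The lookup picks the cached run whose start is closest below the requested cluster and resumes the FAT chain walk from there. A standalone sketch of that mapping, assuming a single cached run and illustrative field names:

#include <stdio.h>

/* One cached contiguous run: file clusters [fcluster, fcluster + nr_contig]
 * map to disk clusters [dcluster, dcluster + nr_contig]. */
struct run { int fcluster, dcluster, nr_contig; };

/* Return the file/disk cluster pair to resume the chain walk from.
 * Assumes want >= r->fcluster, as the lookup above guarantees.
 * If the wanted cluster lies inside the run, the answer is exact. */
static void resume_point(const struct run *r, int want, int *fclus, int *dclus)
{
	int offset = want - r->fcluster;

	if (offset > r->nr_contig)
		offset = r->nr_contig;   /* past the run: start at its end */
	*fclus = r->fcluster + offset;
	*dclus = r->dcluster + offset;
}

int main(void)
{
	struct run r = { .fcluster = 10, .dcluster = 200, .nr_contig = 4 };
	int f, d;

	resume_point(&r, 12, &f, &d);    /* inside the run: exact hit */
	printf("want 12 -> file %d disk %d\n", f, d);
	resume_point(&r, 20, &f, &d);    /* beyond the run: walk on from its end */
	printf("want 20 -> file %d disk %d\n", f, d);
	return 0;
}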
*/ + i->cache_valid_id++; + if (i->cache_valid_id == FAT_CACHE_VALID) + i->cache_valid_id++; +} + +void fat_cache_inval_inode(struct inode *inode) +{ + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + __fat_cache_inval_inode(inode); + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); +} + +static inline int cache_contiguous(struct fat_cache_id *cid, int dclus) +{ + cid->nr_contig++; + return ((cid->dcluster + cid->nr_contig) == dclus); +} + +static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) +{ + cid->id = FAT_CACHE_VALID; + cid->fcluster = fclus; + cid->dcluster = dclus; + cid->nr_contig = 0; +} + +int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) +{ + struct super_block *sb = inode->i_sb; + const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits; + struct fat_entry fatent; + struct fat_cache_id cid; + int nr; + + BUG_ON(MSDOS_I(inode)->i_start == 0); + + *fclus = 0; + *dclus = MSDOS_I(inode)->i_start; + if (cluster == 0) + return 0; + + if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0) { + /* + * dummy, always not contiguous + * This is reinitialized by cache_init(), later. + */ + cache_init(&cid, -1, -1); + } + + fatent_init(&fatent); + while (*fclus < cluster) { + /* prevent the infinite loop of cluster chain */ + if (*fclus > limit) { + fat_fs_error_ratelimit(sb, + "%s: detected the cluster chain loop" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); + nr = -EIO; + goto out; + } + + nr = fat_ent_read(inode, &fatent, *dclus); + if (nr < 0) + goto out; + else if (nr == FAT_ENT_FREE) { + fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); + nr = -EIO; + goto out; + } else if (nr == FAT_ENT_EOF) { + fat_cache_add(inode, &cid); + goto out; + } + (*fclus)++; + *dclus = nr; + if (!cache_contiguous(&cid, *dclus)) + cache_init(&cid, *fclus, *dclus); + } + nr = 0; + fat_cache_add(inode, &cid); +out: + fatent_brelse(&fatent); + return nr; +} + +static int fat_bmap_cluster(struct inode *inode, int cluster) +{ + struct super_block *sb = inode->i_sb; + int ret, fclus, dclus; + + if (MSDOS_I(inode)->i_start == 0) + return 0; + + ret = fat_get_cluster(inode, cluster, &fclus, &dclus); + if (ret < 0) + return ret; + else if (ret == FAT_ENT_EOF) { + fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); + return -EIO; + } + return dclus; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + sector_t last_block; + int cluster, offset; + + *phys = 0; + *mapped_blocks = 0; + if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { + if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { + *phys = sector + sbi->dir_start; + *mapped_blocks = 1; + } + return 0; + } + + last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= last_block) { + if (!create) + return 0; + + /* + * ->mmu_private can access on only allocation path. 
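fat_get_cluster() above resolves the n-th cluster of a file by walking the on-disk FAT chain link by link, and bails out if the walk exceeds the longest chain any legal file could have, since a corrupted FAT can form a cycle. A standalone sketch of the same walk over an in-memory FAT array; the EOF/FREE markers and walk_chain() are illustrative, not the kernel's:

#include <stdio.h>
#include <errno.h>

#define ENT_EOF  (-1)   /* illustrative end-of-chain marker */
#define ENT_FREE 0      /* illustrative free-cluster marker */

/* Follow the chain 'steps' links from 'start'. 'limit' bounds the walk so a
 * cyclic (corrupted) chain cannot loop forever. Returns the reached cluster,
 * or a negative errno-style value on corruption. */
static int walk_chain(const int *fat, int start, int steps, int limit)
{
	int clus = start;

	for (int i = 0; i < steps; i++) {
		if (i > limit)
			return -EIO;    /* chain longer than any legal file */
		int next = fat[clus];
		if (next == ENT_FREE)
			return -EIO;    /* chain runs into a free cluster */
		if (next == ENT_EOF)
			return clus;    /* shorter than requested: stop at EOF */
		clus = next;
	}
	return clus;
}

int main(void)
{
	/* fat[c] = next cluster of c; chain 2 -> 3 -> 5 -> EOF. */
	int fat[8] = { 0, 0, 3, 5, 0, ENT_EOF, 0, 0 };
	printf("2 links after cluster 2: %d\n", walk_chain(fat, 2, 2, 8));
	return 0;
}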
+ * (caller must hold ->i_mutex) + */ + last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= last_block) + return 0; + } + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *phys = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + return 0; +} diff --git a/fs/fat/dir.c b/fs/fat/dir.c new file mode 100644 index 00000000..dc563788 --- /dev/null +++ b/fs/fat/dir.c @@ -0,0 +1,1371 @@ +/* + * linux/fs/fat/dir.c + * + * directory handling functions for fat-based filesystems + * + * Written 1992,1993 by Werner Almesberger + * + * Hidden files 1995 by Albert Cahalan + * + * VFAT extensions by Gordon Chaffee + * Merged with msdos fs by Henrik Storner + * Rewritten for constant inumbers. Plugged buffer overrun in readdir(). AV + * Short name translation 1999, 2001 by Wolfram Pienkoss + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + +static inline loff_t fat_make_i_pos(struct super_block *sb, + struct buffer_head *bh, + struct msdos_dir_entry *de) +{ + return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits) + | (de - (struct msdos_dir_entry *)bh->b_data); +} + +static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, + sector_t phys) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + int sec; + + /* This is not a first sector of cluster, or sec_per_clus == 1 */ + if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1) + return; + /* root dir of FAT12/FAT16 */ + if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) + return; + + bh = sb_find_get_block(sb, phys); + if (bh == NULL || !buffer_uptodate(bh)) { + for (sec = 0; sec < sbi->sec_per_clus; sec++) + sb_breadahead(sb, phys + sec); + } + brelse(bh); +} + +/* Returns the inode number of the directory entry at offset pos. If bh is + non-NULL, it is brelse'd before. Pos is incremented. The buffer header is + returned in bh. + AV. Most often we do it item-by-item. Makes sense to optimize. + AV. OK, there we go: if both bh and de are non-NULL we assume that we just + AV. want the next entry (took one explicit de=NULL in vfat/namei.c). + AV. It's done in fat_get_entry() (inlined), here the slow case lives. + AV. Additionally, when we return -1 (i.e. reached the end of directory) + AV. we make bh NULL. 
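fat_bmap() above converts a file-relative sector into a device block in two steps: split the sector into a cluster index within the file plus a sector offset within that cluster, then resolve the file cluster to an on-disk cluster and add the offset. A standalone sketch of the arithmetic with made-up geometry (4 sectors per cluster, data area starting at sector 100; FAT numbers its data clusters from 2):

#include <stdio.h>

#define SEC_PER_CLUS   4     /* sectors per cluster (power of two) */
#define DATA_START     100   /* first device sector of the data area */
#define FAT_START_ENT  2     /* cluster numbering starts at 2 */

/* File-relative sector -> (file cluster index, sector offset in cluster). */
static void split_sector(unsigned sector, unsigned *fclus, unsigned *offset)
{
	*fclus = sector / SEC_PER_CLUS;        /* the kernel uses a shift, same idea */
	*offset = sector & (SEC_PER_CLUS - 1);
}

/* On-disk cluster number -> first device sector of that cluster. */
static unsigned clus_to_blknr(unsigned dclus)
{
	return (dclus - FAT_START_ENT) * SEC_PER_CLUS + DATA_START;
}

int main(void)
{
	unsigned fclus, offset;

	split_sector(9, &fclus, &offset);      /* sector 9 of the file */
	/* Pretend the FAT chain says that file cluster lives in disk cluster 7. */
	unsigned dclus = 7;
	printf("file sector 9 -> file cluster %u + %u -> device block %u\n",
	       fclus, offset, clus_to_blknr(dclus) + offset);
	return 0;
}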
+ */ +static int fat__get_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, struct msdos_dir_entry **de) +{ + struct super_block *sb = dir->i_sb; + sector_t phys, iblock; + unsigned long mapped_blocks; + int err, offset; + +next: + if (*bh) + brelse(*bh); + + *bh = NULL; + iblock = *pos >> sb->s_blocksize_bits; + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + if (err || !phys) + return -1; /* beyond EOF or error */ + + fat_dir_readahead(dir, iblock, phys); + + *bh = sb_bread(sb, phys); + if (*bh == NULL) { + fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed", + (llu)phys); + /* skip this block */ + *pos = (iblock + 1) << sb->s_blocksize_bits; + goto next; + } + + offset = *pos & (sb->s_blocksize - 1); + *pos += sizeof(struct msdos_dir_entry); + *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); + + return 0; +} + +static inline int fat_get_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + /* Fast stuff first */ + if (*bh && *de && + (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) { + *pos += sizeof(struct msdos_dir_entry); + (*de)++; + return 0; + } + return fat__get_entry(dir, pos, bh, de); +} + +/* + * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII. + * If uni_xlate is enabled and we can't get a 1:1 conversion, use a + * colon as an escape character since it is normally invalid on the vfat + * filesystem. The following four characters are the hexadecimal digits + * of Unicode value. This lets us do a full dump and restore of Unicode + * filenames. We could get into some trouble with long Unicode names, + * but ignore that right now. + * Ahem... Stack smashing in ring 0 isn't fun. Fixed. + */ +static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, + const wchar_t *uni, int len, struct nls_table *nls) +{ + int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate; + const wchar_t *ip; + wchar_t ec; + unsigned char *op; + int charlen; + + ip = uni; + op = ascii; + + while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { + ec = *ip++; + if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { + op += charlen; + len -= charlen; + } else { + if (uni_xlate == 1) { + *op++ = ':'; + op = pack_hex_byte(op, ec >> 8); + op = pack_hex_byte(op, ec); + len -= 5; + } else { + *op++ = '?'; + len--; + } + } + } + + if (unlikely(*ip)) { + fat_msg(sb, KERN_WARNING, "filename was truncated while " + "converting."); + } + + *op = 0; + return (op - ascii); +} + +static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, + unsigned char *buf, int size) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + if (sbi->options.utf8) + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); + else + return uni16_to_x8(sb, buf, uni, size, sbi->nls_io); +} + +static inline int +fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) +{ + int charlen; + + charlen = t->char2uni(c, clen, uni); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } + return charlen; +} + +static inline int +fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) +{ + int charlen; + wchar_t wc; + + charlen = t->char2uni(c, clen, &wc); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } else if (charlen <= 1) { + unsigned char nc = t->charset2lower[*c]; + + if (!nc) + nc = *c; + + if ( (charlen = t->char2uni(&nc, 1, uni)) < 0) { + *uni = 0x003f; /* a 
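When uni16_to_x8() above cannot represent a 16-bit code unit in the target charset, the uni_xlate option escapes it as a colon followed by four hex digits so the name can be recovered losslessly later. A standalone sketch of that escape; the to_charset() mappability test is a stand-in for the real nls uni2char conversion:

#include <stdio.h>
#include <wchar.h>

/* Stand-in for nls->uni2char(): pretend only ASCII is representable. */
static int to_charset(wchar_t wc, char *out)
{
	if (wc < 0x80) {
		*out = (char)wc;
		return 1;
	}
	return -1;                          /* not representable */
}

/* Emit 'wc' into 'op', escaping unmappable units as ":XXXX" (hex). */
static char *emit(char *op, wchar_t wc)
{
	char c;

	if (to_charset(wc, &c) > 0)
		*op++ = c;
	else
		op += sprintf(op, ":%02x%02x",
			      (unsigned)(wc >> 8) & 0xff, (unsigned)wc & 0xff);
	return op;
}

int main(void)
{
	char buf[32], *op = buf;
	const wchar_t name[] = { 'a', 0x00e9, 0 };   /* "a" plus e-acute */

	for (int i = 0; name[i]; i++)
		op = emit(op, name[i]);
	*op = '\0';
	printf("%s\n", buf);                /* prints a:00e9 */
	return 0;
}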
question mark */ + charlen = 1; + } + } else + *uni = wc; + + return charlen; +} + +static inline int +fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, + wchar_t *uni_buf, unsigned short opt, int lower) +{ + int len = 0; + + if (opt & VFAT_SFN_DISPLAY_LOWER) + len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); + else if (opt & VFAT_SFN_DISPLAY_WIN95) + len = fat_short2uni(nls, buf, buf_size, uni_buf); + else if (opt & VFAT_SFN_DISPLAY_WINNT) { + if (lower) + len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); + else + len = fat_short2uni(nls, buf, buf_size, uni_buf); + } else + len = fat_short2uni(nls, buf, buf_size, uni_buf); + + return len; +} + +static inline int fat_name_match(struct msdos_sb_info *sbi, + const unsigned char *a, int a_len, + const unsigned char *b, int b_len) +{ + if (a_len != b_len) + return 0; + + if (sbi->options.name_check != 's') + return !nls_strnicmp(sbi->nls_io, a, b, a_len); + else + return !memcmp(a, b, a_len); +} + +enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; + +/** + * fat_parse_long - Parse extended directory entry. + * + * This function returns zero on success, negative value on error, or one of + * the following: + * + * %PARSE_INVALID - Directory entry is invalid. + * %PARSE_NOT_LONGNAME - Directory entry does not contain longname. + * %PARSE_EOF - Directory has no more entries. + */ +static int fat_parse_long(struct inode *dir, loff_t *pos, + struct buffer_head **bh, struct msdos_dir_entry **de, + wchar_t **unicode, unsigned char *nr_slots) +{ + struct msdos_dir_slot *ds; + unsigned char id, slot, slots, alias_checksum; + + if (!*unicode) { + *unicode = __getname(); + if (!*unicode) { + brelse(*bh); + return -ENOMEM; + } + } +parse_long: + slots = 0; + ds = (struct msdos_dir_slot *)*de; + id = ds->id; + if (!(id & 0x40)) + return PARSE_INVALID; + slots = id & ~0x40; + if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */ + return PARSE_INVALID; + *nr_slots = slots; + alias_checksum = ds->alias_checksum; + + slot = slots; + while (1) { + int offset; + + slot--; + offset = slot * 13; + fat16_towchar(*unicode + offset, ds->name0_4, 5); + fat16_towchar(*unicode + offset + 5, ds->name5_10, 6); + fat16_towchar(*unicode + offset + 11, ds->name11_12, 2); + + if (ds->id & 0x40) + (*unicode)[offset + 13] = 0; + if (fat_get_entry(dir, pos, bh, de) < 0) + return PARSE_EOF; + if (slot == 0) + break; + ds = (struct msdos_dir_slot *)*de; + if (ds->attr != ATTR_EXT) + return PARSE_NOT_LONGNAME; + if ((ds->id & ~0x40) != slot) + goto parse_long; + if (ds->alias_checksum != alias_checksum) + goto parse_long; + } + if ((*de)->name[0] == DELETED_FLAG) + return PARSE_INVALID; + if ((*de)->attr == ATTR_EXT) + goto parse_long; + if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME)) + return PARSE_INVALID; + if (fat_checksum((*de)->name) != alias_checksum) + *nr_slots = 0; + + return 0; +} + +/* + * Return values: negative -> error, 0 -> not found, positive -> found, + * value is the total amount of slots, including the shortname entry. 
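fat_parse_long() above reassembles a VFAT long name from directory slots that each hold 13 UTF-16 units (in three fields of 5, 6 and 2), stored on disk ahead of the 8.3 entry with the highest-numbered slot first, bit 0x40 marking that final piece, and every slot carrying a checksum of the short alias. A standalone sketch of the slot bookkeeping for a given name length (the name length is an example value):

#include <stdio.h>

#define UNITS_PER_SLOT 13   /* 5 + 6 + 2 UTF-16 units per long-name slot */

int main(void)
{
	int name_len = 30;      /* UTF-16 units, example */
	int slots = (name_len + UNITS_PER_SLOT - 1) / UNITS_PER_SLOT;

	printf("%d units -> %d slot(s)\n", name_len, slots);

	/* Slots precede the short entry on disk, highest id first; the first
	 * slot on disk carries id | 0x40 to flag the end of the name. */
	for (int id = slots; id >= 1; id--) {
		int flag = (id == slots) ? 0x40 : 0;
		int offset = (id - 1) * UNITS_PER_SLOT;   /* position in the name */
		printf("slot id 0x%02x covers units %d..%d\n",
		       id | flag, offset, offset + UNITS_PER_SLOT - 1);
	}
	return 0;
}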
+ */ +int fat_search_long(struct inode *inode, const unsigned char *name, + int name_len, struct fat_slot_info *sinfo) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de; + struct nls_table *nls_disk = sbi->nls_disk; + unsigned char nr_slots; + wchar_t bufuname[14]; + wchar_t *unicode = NULL; + unsigned char work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE]; + unsigned short opt_shortname = sbi->options.shortname; + loff_t cpos = 0; + int chl, i, j, last_u, err, len; + + err = -ENOENT; + while (1) { + if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + goto end_of_dir; +parse_record: + nr_slots = 0; + if (de->name[0] == DELETED_FLAG) + continue; + if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) + continue; + if (de->attr != ATTR_EXT && IS_FREE(de->name)) + continue; + if (de->attr == ATTR_EXT) { + int status = fat_parse_long(inode, &cpos, &bh, &de, + &unicode, &nr_slots); + if (status < 0) { + err = status; + goto end_of_dir; + } else if (status == PARSE_INVALID) + continue; + else if (status == PARSE_NOT_LONGNAME) + goto parse_record; + else if (status == PARSE_EOF) + goto end_of_dir; + } + + memcpy(work, de->name, sizeof(de->name)); + /* see namei.c, msdos_format_name */ + if (work[0] == 0x05) + work[0] = 0xE5; + for (i = 0, j = 0, last_u = 0; i < 8;) { + if (!work[i]) + break; + chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_BASE); + if (chl <= 1) { + if (work[i] != ' ') + last_u = j; + } else { + last_u = j; + } + i += chl; + } + j = last_u; + fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); + for (i = 8; i < MSDOS_NAME;) { + if (!work[i]) + break; + chl = fat_shortname2uni(nls_disk, &work[i], + MSDOS_NAME - i, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_EXT); + if (chl <= 1) { + if (work[i] != ' ') + last_u = j; + } else { + last_u = j; + } + i += chl; + } + if (!last_u) + continue; + + /* Compare shortname */ + bufuname[last_u] = 0x0000; + len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + + /* Compare longname */ + len = fat_uni_to_x8(sb, unicode, longname, size); + if (fat_name_match(sbi, name, name_len, longname, len)) + goto found; + } + } + +found: + nr_slots++; /* include the de */ + sinfo->slot_off = cpos - nr_slots * sizeof(*de); + sinfo->nr_slots = nr_slots; + sinfo->de = de; + sinfo->bh = bh; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + err = 0; +end_of_dir: + if (unicode) + __putname(unicode); + + return err; +} + +EXPORT_SYMBOL_GPL(fat_search_long); + +struct fat_ioctl_filldir_callback { + void __user *dirent; + int result; + /* for dir ioctl */ + const char *longname; + int long_len; + const char *shortname; + int short_len; +}; + +static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, + filldir_t filldir, int short_only, int both) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct msdos_dir_entry *de; + struct nls_table *nls_disk = sbi->nls_disk; + unsigned char nr_slots; + wchar_t bufuname[14]; + wchar_t *unicode = NULL; + unsigned char c, work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; + unsigned short opt_shortname = sbi->options.shortname; + int 
isvfat = sbi->options.isvfat; + int nocase = sbi->options.nocase; + const char *fill_name = NULL; + unsigned long inum; + unsigned long lpos, dummy, *furrfu = &lpos; + loff_t cpos; + int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; + int ret = 0; + + lock_super(sb); + + cpos = filp->f_pos; + /* Fake . and .. for the root directory. */ + if (inode->i_ino == MSDOS_ROOT_INO) { + while (cpos < 2) { + if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0) + goto out; + cpos++; + filp->f_pos++; + } + if (cpos == 2) { + dummy = 2; + furrfu = &dummy; + cpos = 0; + } + } + if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { + ret = -ENOENT; + goto out; + } + + bh = NULL; +get_new: + if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + goto end_of_dir; +parse_record: + nr_slots = 0; + /* + * Check for long filename entry, but if short_only, we don't + * need to parse long filename. + */ + if (isvfat && !short_only) { + if (de->name[0] == DELETED_FLAG) + goto record_end; + if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) + goto record_end; + if (de->attr != ATTR_EXT && IS_FREE(de->name)) + goto record_end; + } else { + if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) + goto record_end; + } + + if (isvfat && de->attr == ATTR_EXT) { + int status = fat_parse_long(inode, &cpos, &bh, &de, + &unicode, &nr_slots); + if (status < 0) { + filp->f_pos = cpos; + ret = status; + goto out; + } else if (status == PARSE_INVALID) + goto record_end; + else if (status == PARSE_NOT_LONGNAME) + goto parse_record; + else if (status == PARSE_EOF) + goto end_of_dir; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + int len = fat_uni_to_x8(sb, unicode, longname, size); + + fill_name = longname; + fill_len = len; + /* !both && !short_only, so we don't need shortname. */ + if (!both) + goto start_filldir; + } + } + + if (sbi->options.dotsOK) { + ptname = bufname; + dotoffset = 0; + if (de->attr & ATTR_HIDDEN) { + *ptname++ = '.'; + dotoffset = 1; + } + } + + memcpy(work, de->name, sizeof(de->name)); + /* see namei.c, msdos_format_name */ + if (work[0] == 0x05) + work[0] = 0xE5; + for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { + if (!(c = work[i])) + break; + chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_BASE); + if (chl <= 1) { + ptname[i++] = (!nocase && c>='A' && c<='Z') ? c+32 : c; + if (c != ' ') { + last = i; + last_u = j; + } + } else { + last_u = j; + for (chi = 0; chi < chl && i < 8; chi++) { + ptname[i] = work[i]; + i++; last = i; + } + } + } + i = last; + j = last_u; + fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); + ptname[i++] = '.'; + for (i2 = 8; i2 < MSDOS_NAME;) { + if (!(c = work[i2])) + break; + chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_EXT); + if (chl <= 1) { + i2++; + ptname[i++] = (!nocase && c>='A' && c<='Z') ? 
c+32 : c; + if (c != ' ') { + last = i; + last_u = j; + } + } else { + last_u = j; + for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) { + ptname[i++] = work[i2++]; + last = i; + } + } + } + if (!last) + goto record_end; + + i = last + dotoffset; + j = last_u; + + if (isvfat) { + bufuname[j] = 0x0000; + i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); + } + if (nr_slots) { + /* hack for fat_ioctl_filldir() */ + struct fat_ioctl_filldir_callback *p = dirent; + + p->longname = fill_name; + p->long_len = fill_len; + p->shortname = bufname; + p->short_len = i; + fill_name = NULL; + fill_len = 0; + } else { + fill_name = bufname; + fill_len = i; + } + +start_filldir: + lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) + inum = inode->i_ino; + else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + inum = parent_ino(filp->f_path.dentry); + } else { + loff_t i_pos = fat_make_i_pos(sb, bh, de); + struct inode *tmp = fat_iget(sb, i_pos); + if (tmp) { + inum = tmp->i_ino; + iput(tmp); + } else + inum = iunique(sb, MSDOS_ROOT_INO); + } + + if (filldir(dirent, fill_name, fill_len, *furrfu, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) + goto fill_failed; + +record_end: + furrfu = &lpos; + filp->f_pos = cpos; + goto get_new; +end_of_dir: + filp->f_pos = cpos; +fill_failed: + brelse(bh); + if (unicode) + __putname(unicode); +out: + unlock_super(sb); + return ret; +} + +static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + return __fat_readdir(inode, filp, dirent, filldir, 0, 0); +} + +#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ +static int func(void *__buf, const char *name, int name_len, \ + loff_t offset, u64 ino, unsigned int d_type) \ +{ \ + struct fat_ioctl_filldir_callback *buf = __buf; \ + struct dirent_type __user *d1 = buf->dirent; \ + struct dirent_type __user *d2 = d1 + 1; \ + \ + if (buf->result) \ + return -EINVAL; \ + buf->result++; \ + \ + if (name != NULL) { \ + /* dirent has only short name */ \ + if (name_len >= sizeof(d1->d_name)) \ + name_len = sizeof(d1->d_name) - 1; \ + \ + if (put_user(0, d2->d_name) || \ + put_user(0, &d2->d_reclen) || \ + copy_to_user(d1->d_name, name, name_len) || \ + put_user(0, d1->d_name + name_len) || \ + put_user(name_len, &d1->d_reclen)) \ + goto efault; \ + } else { \ + /* dirent has short and long name */ \ + const char *longname = buf->longname; \ + int long_len = buf->long_len; \ + const char *shortname = buf->shortname; \ + int short_len = buf->short_len; \ + \ + if (long_len >= sizeof(d1->d_name)) \ + long_len = sizeof(d1->d_name) - 1; \ + if (short_len >= sizeof(d1->d_name)) \ + short_len = sizeof(d1->d_name) - 1; \ + \ + if (copy_to_user(d2->d_name, longname, long_len) || \ + put_user(0, d2->d_name + long_len) || \ + put_user(long_len, &d2->d_reclen) || \ + put_user(ino, &d2->d_ino) || \ + put_user(offset, &d2->d_off) || \ + copy_to_user(d1->d_name, shortname, short_len) || \ + put_user(0, d1->d_name + short_len) || \ + put_user(short_len, &d1->d_reclen)) \ + goto efault; \ + } \ + return 0; \ +efault: \ + buf->result = -EFAULT; \ + return -EFAULT; \ +} + +FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) + +static int fat_ioctl_readdir(struct inode *inode, struct file *filp, + void __user *dirent, filldir_t filldir, + int short_only, int both) +{ + struct fat_ioctl_filldir_callback buf; + int ret; + + buf.dirent = dirent; + buf.result = 0; + mutex_lock(&inode->i_mutex); + 
ret = -ENOENT; + if (!IS_DEADDIR(inode)) { + ret = __fat_readdir(inode, filp, &buf, filldir, + short_only, both); + } + mutex_unlock(&inode->i_mutex); + if (ret >= 0) + ret = buf.result; + return ret; +} + +static int fat_ioctl_volume_id(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + return sbi->vol_id; +} + +static long fat_dir_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; + int short_only, both; + + switch (cmd) { + case VFAT_IOCTL_READDIR_SHORT: + short_only = 1; + both = 0; + break; + case VFAT_IOCTL_READDIR_BOTH: + short_only = 0; + both = 1; + break; + case VFAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_volume_id(inode); + default: + return fat_generic_ioctl(filp, cmd, arg); + } + + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) + return -EFAULT; + /* + * Yes, we don't need this put_user() absolutely. However old + * code didn't return the right value. So, app use this value, + * in order to check whether it is EOF. + */ + if (put_user(0, &d1->d_reclen)) + return -EFAULT; + + return fat_ioctl_readdir(inode, filp, d1, fat_ioctl_filldir, + short_only, both); +} + +#ifdef CONFIG_COMPAT +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent) + +static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, + unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct compat_dirent __user *d1 = compat_ptr(arg); + int short_only, both; + + switch (cmd) { + case VFAT_IOCTL_READDIR_SHORT32: + short_only = 1; + both = 0; + break; + case VFAT_IOCTL_READDIR_BOTH32: + short_only = 0; + both = 1; + break; + default: + return fat_generic_ioctl(filp, cmd, (unsigned long)arg); + } + + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) + return -EFAULT; + /* + * Yes, we don't need this put_user() absolutely. However old + * code didn't return the right value. So, app use this value, + * in order to check whether it is EOF. + */ + if (put_user(0, &d1->d_reclen)) + return -EFAULT; + + return fat_ioctl_readdir(inode, filp, d1, fat_compat_ioctl_filldir, + short_only, both); +} +#endif /* CONFIG_COMPAT */ + +const struct file_operations fat_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = fat_readdir, + .unlocked_ioctl = fat_dir_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_compat_dir_ioctl, +#endif + .fsync = fat_file_fsync, +}; + +static int fat_get_short_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + while (fat_get_entry(dir, pos, bh, de) >= 0) { + /* free entry or long name entry or volume label */ + if (!IS_FREE((*de)->name) && !((*de)->attr & ATTR_VOLUME)) + return 0; + } + return -ENOENT; +} + +/* + * The ".." entry can not provide the "struct fat_slot_info" informations + * for inode. So, this function provide the some informations only. 
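fat_dir_ioctl() above exposes the short/long name pair through VFAT_IOCTL_READDIR_BOTH, which fills a two-element __fat_dirent array per call: element 0 is the 8.3 name, element 1 the long name when one exists. A hedged userspace sketch of driving it; the mount point path is hypothetical, the struct and ioctl come from the uapi header, and, per the comment above, a zero d_reclen in the first element is what applications check for end of directory:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>   /* struct __fat_dirent, VFAT_IOCTL_READDIR_BOTH */

int main(void)
{
	struct __fat_dirent ent[2];   /* [0] = short (8.3) name, [1] = long name */
	int fd = open("/mnt/vfat", O_RDONLY | O_DIRECTORY);  /* hypothetical mount */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Each ioctl yields one entry pair; d_reclen == 0 in ent[0] means EOF. */
	while (ioctl(fd, VFAT_IOCTL_READDIR_BOTH, ent) >= 0 && ent[0].d_reclen) {
		printf("short=%s long=%s\n", ent[0].d_name,
		       ent[1].d_reclen ? ent[1].d_name : ent[0].d_name);
	}
	close(fd);
	return 0;
}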
+ */ +int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, + struct msdos_dir_entry **de, loff_t *i_pos) +{ + loff_t offset; + + offset = 0; + *bh = NULL; + while (fat_get_short_entry(dir, &offset, bh, de) >= 0) { + if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) { + *i_pos = fat_make_i_pos(dir->i_sb, *bh, *de); + return 0; + } + } + return -ENOENT; +} + +EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); + +/* See if directory is empty */ +int fat_dir_empty(struct inode *dir) +{ + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t cpos; + int result = 0; + + bh = NULL; + cpos = 0; + while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { + if (strncmp(de->name, MSDOS_DOT , MSDOS_NAME) && + strncmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + result = -ENOTEMPTY; + break; + } + } + brelse(bh); + return result; +} + +EXPORT_SYMBOL_GPL(fat_dir_empty); + +/* + * fat_subdirs counts the number of sub-directories of dir. It can be run + * on directories being created. + */ +int fat_subdirs(struct inode *dir) +{ + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t cpos; + int count = 0; + + bh = NULL; + cpos = 0; + while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { + if (de->attr & ATTR_DIR) + count++; + } + brelse(bh); + return count; +} + +/* + * Scans a directory for a given file (name points to its formatted name). + * Returns an error code or zero. + */ +int fat_scan(struct inode *dir, const unsigned char *name, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (!strncmp(sinfo->de->name, name, MSDOS_NAME)) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} + +EXPORT_SYMBOL_GPL(fat_scan); + +static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct msdos_dir_entry *de, *endp; + int err = 0, orig_slots; + + while (nr_slots) { + bh = NULL; + if (fat_get_entry(dir, &pos, &bh, &de) < 0) { + err = -EIO; + break; + } + + orig_slots = nr_slots; + endp = (struct msdos_dir_entry *)(bh->b_data + sb->s_blocksize); + while (nr_slots && de < endp) { + de->name[0] = DELETED_FLAG; + de++; + nr_slots--; + } + mark_buffer_dirty_inode(bh, dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bh); + brelse(bh); + if (err) + break; + + /* pos is *next* de's position, so this does `- sizeof(de)' */ + pos += ((orig_slots - nr_slots) * sizeof(*de)) - sizeof(*de); + } + + return err; +} + +int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + struct msdos_dir_entry *de; + struct buffer_head *bh; + int err = 0, nr_slots; + + /* + * First stage: Remove the shortname. By this, the directory + * entry is removed. + */ + nr_slots = sinfo->nr_slots; + de = sinfo->de; + sinfo->de = NULL; + bh = sinfo->bh; + sinfo->bh = NULL; + while (nr_slots && de >= (struct msdos_dir_entry *)bh->b_data) { + de->name[0] = DELETED_FLAG; + de--; + nr_slots--; + } + mark_buffer_dirty_inode(bh, dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bh); + brelse(bh); + if (err) + return err; + dir->i_version++; + + if (nr_slots) { + /* + * Second stage: remove the remaining longname slots. 
+ * (This directory entry is already removed, and so return + * the success) + */ + err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); + if (err) { + fat_msg(sb, KERN_WARNING, + "Couldn't remove the long name slots"); + } + } + + dir->i_mtime = dir->i_atime = CURRENT_TIME_SEC; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); + + return 0; +} + +EXPORT_SYMBOL_GPL(fat_remove_entries); + +static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, + struct buffer_head **bhs, int nr_bhs) +{ + struct super_block *sb = dir->i_sb; + sector_t last_blknr = blknr + MSDOS_SB(sb)->sec_per_clus; + int err, i, n; + + /* Zeroing the unused blocks on this cluster */ + blknr += nr_used; + n = nr_used; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto error; + } + memset(bhs[n]->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); + + n++; + blknr++; + if (n == nr_bhs) { + if (IS_DIRSYNC(dir)) { + err = fat_sync_bhs(bhs, n); + if (err) + goto error; + } + for (i = 0; i < n; i++) + brelse(bhs[i]); + n = 0; + } + } + if (IS_DIRSYNC(dir)) { + err = fat_sync_bhs(bhs, n); + if (err) + goto error; + } + for (i = 0; i < n; i++) + brelse(bhs[i]); + + return 0; + +error: + for (i = 0; i < n; i++) + bforget(bhs[i]); + return err; +} + +int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + struct msdos_dir_entry *de; + sector_t blknr; + __le16 date, time; + u8 time_cs; + int err, cluster; + + err = fat_alloc_clusters(dir, &cluster, 1); + if (err) + goto error; + + blknr = fat_clus_to_blknr(sbi, cluster); + bhs[0] = sb_getblk(sb, blknr); + if (!bhs[0]) { + err = -ENOMEM; + goto error_free; + } + + fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); + + de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* filling the new directory slots ("." and ".." 
entries) */ + memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); + memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); + de->attr = de[1].attr = ATTR_DIR; + de[0].lcase = de[1].lcase = 0; + de[0].time = de[1].time = time; + de[0].date = de[1].date = date; + if (sbi->options.isvfat) { + /* extra timestamps */ + de[0].ctime = de[1].ctime = time; + de[0].ctime_cs = de[1].ctime_cs = time_cs; + de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date; + } else { + de[0].ctime = de[1].ctime = 0; + de[0].ctime_cs = de[1].ctime_cs = 0; + de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0; + } + de[0].start = cpu_to_le16(cluster); + de[0].starthi = cpu_to_le16(cluster >> 16); + de[1].start = cpu_to_le16(MSDOS_I(dir)->i_logstart); + de[1].starthi = cpu_to_le16(MSDOS_I(dir)->i_logstart >> 16); + de[0].size = de[1].size = 0; + memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); + set_buffer_uptodate(bhs[0]); + mark_buffer_dirty_inode(bhs[0], dir); + + err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); + if (err) + goto error_free; + + return cluster; + +error_free: + fat_free_clusters(dir, cluster); +error: + return err; +} + +EXPORT_SYMBOL_GPL(fat_alloc_new_dir); + +static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, + int *nr_cluster, struct msdos_dir_entry **de, + struct buffer_head **bh, loff_t *i_pos) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + sector_t blknr, start_blknr, last_blknr; + unsigned long size, copy; + int err, i, n, offset, cluster[2]; + + /* + * The minimum cluster size is 512bytes, and maximum entry + * size is 32*slots (672bytes). So, iff the cluster size is + * 512bytes, we may need two clusters. + */ + size = nr_slots * sizeof(struct msdos_dir_entry); + *nr_cluster = (size + (sbi->cluster_size - 1)) >> sbi->cluster_bits; + BUG_ON(*nr_cluster > 2); + + err = fat_alloc_clusters(dir, cluster, *nr_cluster); + if (err) + goto error; + + /* + * First stage: Fill the directory entry. NOTE: This cluster + * is not referenced from any inode yet, so updates order is + * not important. 
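A FAT directory entry stores the first cluster of a file as two 16-bit little-endian halves, start and (on FAT32) starthi, which is why the code above writes cpu_to_le16(cluster) and cpu_to_le16(cluster >> 16) separately. A standalone sketch of splitting and reassembling the 32-bit cluster number (byte order is ignored here; the kernel additionally converts each half to little endian):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t cluster = 0x00012345;          /* example first cluster */

	/* Split into the two on-disk fields. */
	uint16_t start   = (uint16_t)(cluster & 0xffff);
	uint16_t starthi = (uint16_t)(cluster >> 16);

	/* Reassemble when reading the entry back. */
	uint32_t again = ((uint32_t)starthi << 16) | start;

	printf("start=0x%04x starthi=0x%04x -> 0x%08x\n", start, starthi, again);
	return 0;
}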
+ */ + i = n = copy = 0; + do { + start_blknr = blknr = fat_clus_to_blknr(sbi, cluster[i]); + last_blknr = start_blknr + sbi->sec_per_clus; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto error_nomem; + } + + /* fill the directory entry */ + copy = min(size, sb->s_blocksize); + memcpy(bhs[n]->b_data, slots, copy); + slots += copy; + size -= copy; + set_buffer_uptodate(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); + if (!size) + break; + n++; + blknr++; + } + } while (++i < *nr_cluster); + + memset(bhs[n]->b_data + copy, 0, sb->s_blocksize - copy); + offset = copy - sizeof(struct msdos_dir_entry); + get_bh(bhs[n]); + *bh = bhs[n]; + *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); + *i_pos = fat_make_i_pos(sb, *bh, *de); + + /* Second stage: clear the rest of cluster, and write outs */ + err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); + if (err) + goto error_free; + + return cluster[0]; + +error_free: + brelse(*bh); + *bh = NULL; + n = 0; +error_nomem: + for (i = 0; i < n; i++) + bforget(bhs[i]); + fat_free_clusters(dir, cluster[0]); +error: + return err; +} + +int fat_add_entries(struct inode *dir, void *slots, int nr_slots, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ + struct msdos_dir_entry *de; + int err, free_slots, i, nr_bhs; + loff_t pos, i_pos; + + sinfo->nr_slots = nr_slots; + + /* First stage: search free direcotry entries */ + free_slots = nr_bhs = 0; + bh = prev = NULL; + pos = 0; + err = -ENOSPC; + while (fat_get_entry(dir, &pos, &bh, &de) > -1) { + /* check the maximum size of directory */ + if (pos >= FAT_MAX_DIR_SIZE) + goto error; + + if (IS_FREE(de->name)) { + if (prev != bh) { + get_bh(bh); + bhs[nr_bhs] = prev = bh; + nr_bhs++; + } + free_slots++; + if (free_slots == nr_slots) + goto found; + } else { + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + prev = NULL; + free_slots = nr_bhs = 0; + } + } + if (dir->i_ino == MSDOS_ROOT_INO) { + if (sbi->fat_bits != 32) + goto error; + } else if (MSDOS_I(dir)->i_start == 0) { + fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", + MSDOS_I(dir)->i_pos); + err = -EIO; + goto error; + } + +found: + err = 0; + pos -= free_slots * sizeof(*de); + nr_slots -= free_slots; + if (free_slots) { + /* + * Second stage: filling the free entries with new entries. + * NOTE: If this slots has shortname, first, we write + * the long name slots, then write the short name. + */ + int size = free_slots * sizeof(*de); + int offset = pos & (sb->s_blocksize - 1); + int long_bhs = nr_bhs - (nr_slots == 0); + + /* Fill the long name slots. */ + for (i = 0; i < long_bhs; i++) { + int copy = min_t(int, sb->s_blocksize - offset, size); + memcpy(bhs[i]->b_data + offset, slots, copy); + mark_buffer_dirty_inode(bhs[i], dir); + offset = 0; + slots += copy; + size -= copy; + } + if (long_bhs && IS_DIRSYNC(dir)) + err = fat_sync_bhs(bhs, long_bhs); + if (!err && i < nr_bhs) { + /* Fill the short name slot. */ + int copy = min_t(int, sb->s_blocksize - offset, size); + memcpy(bhs[i]->b_data + offset, slots, copy); + mark_buffer_dirty_inode(bhs[i], dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bhs[i]); + } + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + if (err) + goto error_remove; + } + + if (nr_slots) { + int cluster, nr_cluster; + + /* + * Third stage: allocate the cluster for new entries. 
+ * And initialize the cluster with new entries, then + * add the cluster to dir. + */ + cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, + &de, &bh, &i_pos); + if (cluster < 0) { + err = cluster; + goto error_remove; + } + err = fat_chain_add(dir, cluster, nr_cluster); + if (err) { + fat_free_clusters(dir, cluster); + goto error_remove; + } + if (dir->i_size & (sbi->cluster_size - 1)) { + fat_fs_error(sb, "Odd directory size"); + dir->i_size = (dir->i_size + sbi->cluster_size - 1) + & ~((loff_t)sbi->cluster_size - 1); + } + dir->i_size += nr_cluster << sbi->cluster_bits; + MSDOS_I(dir)->mmu_private += nr_cluster << sbi->cluster_bits; + } + sinfo->slot_off = pos; + sinfo->de = de; + sinfo->bh = bh; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + + return 0; + +error: + brelse(bh); + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + return err; + +error_remove: + brelse(bh); + if (free_slots) + __fat_remove_entries(dir, pos, free_slots); + return err; +} + +EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/fs/fat/fat.h b/fs/fat/fat.h new file mode 100644 index 00000000..74fe5d3e --- /dev/null +++ b/fs/fat/fat.h @@ -0,0 +1,351 @@ +#ifndef _FAT_H +#define _FAT_H + +#include +#include +#include +#include +#include +#include +#include + +/* + * vfat shortname flags + */ +#define VFAT_SFN_DISPLAY_LOWER 0x0001 /* convert to lowercase for display */ +#define VFAT_SFN_DISPLAY_WIN95 0x0002 /* emulate win95 rule for display */ +#define VFAT_SFN_DISPLAY_WINNT 0x0004 /* emulate winnt rule for display */ +#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ +#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ + +#define FAT_ERRORS_CONT 1 /* ignore error and continue */ +#define FAT_ERRORS_PANIC 2 /* panic on error */ +#define FAT_ERRORS_RO 3 /* remount r/o on error */ + +struct fat_mount_options { + uid_t fs_uid; + gid_t fs_gid; + unsigned short fs_fmask; + unsigned short fs_dmask; + unsigned short codepage; /* Codepage for shortname conversions */ + char *iocharset; /* Charset used for filename input/display */ + unsigned short shortname; /* flags for shortname display/create rule */ + unsigned char name_check; /* r = relaxed, n = normal, s = strict */ + unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned short allow_utime;/* permission for setting the [am]time */ + unsigned quiet:1, /* set = fake successful chmods and chowns */ + showexec:1, /* set = only set x bit for com/exe/bat */ + sys_immutable:1, /* set = system files are immutable */ + dotsOK:1, /* set = hidden and system files are named '.filename' */ + isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ + utf8:1, /* Use of UTF-8 character set (Default) */ + unicode_xlate:1, /* create escape sequences for unhandled Unicode */ + numtail:1, /* Does first alias have a numeric '~1' type tail? */ + flush:1, /* write things quickly */ + nocase:1, /* Does this need case conversion? 
0=need case conversion*/ + usefree:1, /* Use free_clusters for FAT32 */ + tz_utc:1, /* Filesystem timestamps are in UTC */ + rodir:1, /* allow ATTR_RO for directory */ + discard:1; /* Issue discard requests on deletions */ +}; + +#define FAT_HASH_BITS 8 +#define FAT_HASH_SIZE (1UL << FAT_HASH_BITS) + +/* + * MS-DOS file system in-core superblock data + */ +struct msdos_sb_info { + unsigned short sec_per_clus; /* sectors/cluster */ + unsigned short cluster_bits; /* log2(cluster_size) */ + unsigned int cluster_size; /* cluster size */ + unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */ + unsigned short fat_start; + unsigned long fat_length; /* FAT start & length (sec.) */ + unsigned long dir_start; + unsigned short dir_entries; /* root dir start & entries */ + unsigned long data_start; /* first data sector */ + unsigned long max_cluster; /* maximum cluster number */ + unsigned long root_cluster; /* first cluster of the root directory */ + unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ + struct mutex fat_lock; + unsigned int prev_free; /* previously allocated cluster number */ + unsigned int free_clusters; /* -1 if undefined */ + unsigned int free_clus_valid; /* is free_clusters valid? */ + struct fat_mount_options options; + struct nls_table *nls_disk; /* Codepage used on disk */ + struct nls_table *nls_io; /* Charset used for input and display */ + const void *dir_ops; /* Opaque; default directory operations */ + int dir_per_block; /* dir entries per block */ + int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned long vol_id; /* volume ID */ + + int fatent_shift; + struct fatent_operations *fatent_ops; + struct inode *fat_inode; + + struct ratelimit_state ratelimit; + + spinlock_t inode_hash_lock; + struct hlist_head inode_hashtable[FAT_HASH_SIZE]; +}; + +#define FAT_CACHE_VALID 0 /* special case for valid cache */ + +/* + * MS-DOS file system inode data in memory + */ +struct msdos_inode_info { + spinlock_t cache_lru_lock; + struct list_head cache_lru; + int nr_caches; + /* for avoiding the race between fat_free() and fat_get_cluster() */ + unsigned int cache_valid_id; + + /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ + loff_t mmu_private; /* physically allocated size */ + + int i_start; /* first cluster or 0 */ + int i_logstart; /* logical first cluster */ + int i_attrs; /* unused attribute bits */ + loff_t i_pos; /* on-disk position of directory entry or 0 */ + struct hlist_node i_fat_hash; /* hash by i_location */ + struct inode vfs_inode; +}; + +struct fat_slot_info { + loff_t i_pos; /* on-disk position of directory entry */ + loff_t slot_off; /* offset for slot or de start */ + int nr_slots; /* number of slots + 1(de) in filename */ + struct msdos_dir_entry *de; + struct buffer_head *bh; +}; + +static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) +{ + return container_of(inode, struct msdos_inode_info, vfs_inode); +} + +/* + * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to + * save ATTR_RO instead of ->i_mode. + * + * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only + * bit, it's just used as flag for app. 
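MSDOS_I() above recovers the filesystem-private inode from the embedded VFS inode with container_of(): struct inode lives inside struct msdos_inode_info, so subtracting the member's offset from the member pointer yields the containing object. A standalone sketch of the same idiom with toy types (the struct names here are illustrative, not the kernel's):

#include <stdio.h>
#include <stddef.h>

/* Minimal container_of: map a pointer to a member back to its containing
 * struct (same idea as the kernel macro, simplified). */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { long ino; };            /* stand-in for struct inode */

struct fs_inode_info {                     /* stand-in for msdos_inode_info */
	int first_cluster;
	struct vfs_inode vfs_inode;        /* embedded generic part */
};

static struct fs_inode_info *FS_I(struct vfs_inode *inode)
{
	return container_of(inode, struct fs_inode_info, vfs_inode);
}

int main(void)
{
	struct fs_inode_info info = { .first_cluster = 7, .vfs_inode = { .ino = 42 } };
	struct vfs_inode *generic = &info.vfs_inode;   /* what the VFS hands around */

	printf("ino %ld -> first cluster %d\n",
	       generic->ino, FS_I(generic)->first_cluster);
	return 0;
}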
+ */ +static inline int fat_mode_can_hold_ro(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + mode_t mask; + + if (S_ISDIR(inode->i_mode)) { + if (!sbi->options.rodir) + return 0; + mask = ~sbi->options.fs_dmask; + } else + mask = ~sbi->options.fs_fmask; + + if (!(mask & S_IWUGO)) + return 0; + return 1; +} + +/* Convert attribute bits and a mask to the UNIX mode. */ +static inline mode_t fat_make_mode(struct msdos_sb_info *sbi, + u8 attrs, mode_t mode) +{ + if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) + mode &= ~S_IWUGO; + + if (attrs & ATTR_DIR) + return (mode & ~sbi->options.fs_dmask) | S_IFDIR; + else + return (mode & ~sbi->options.fs_fmask) | S_IFREG; +} + +/* Return the FAT attribute byte for this inode */ +static inline u8 fat_make_attrs(struct inode *inode) +{ + u8 attrs = MSDOS_I(inode)->i_attrs; + if (S_ISDIR(inode->i_mode)) + attrs |= ATTR_DIR; + if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO)) + attrs |= ATTR_RO; + return attrs; +} + +static inline void fat_save_attrs(struct inode *inode, u8 attrs) +{ + if (fat_mode_can_hold_ro(inode)) + MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED; + else + MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO); +} + +static inline unsigned char fat_checksum(const __u8 *name) +{ + unsigned char s = name[0]; + s = (s<<7) + (s>>1) + name[1]; s = (s<<7) + (s>>1) + name[2]; + s = (s<<7) + (s>>1) + name[3]; s = (s<<7) + (s>>1) + name[4]; + s = (s<<7) + (s>>1) + name[5]; s = (s<<7) + (s>>1) + name[6]; + s = (s<<7) + (s>>1) + name[7]; s = (s<<7) + (s>>1) + name[8]; + s = (s<<7) + (s>>1) + name[9]; s = (s<<7) + (s>>1) + name[10]; + return s; +} + +static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) +{ + return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus + + sbi->data_start; +} + +static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) +{ +#ifdef __BIG_ENDIAN + while (len--) { + *dst++ = src[0] | (src[1] << 8); + src += 2; + } +#else + memcpy(dst, src, len * 2); +#endif +} + +static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) +{ +#ifdef __BIG_ENDIAN + while (len--) { + dst[0] = *src & 0x00FF; + dst[1] = (*src & 0xFF00) >> 8; + dst += 2; + src++; + } +#else + memcpy(dst, src, len * 2); +#endif +} + +/* fat/cache.c */ +extern void fat_cache_inval_inode(struct inode *inode); +extern int fat_get_cluster(struct inode *inode, int cluster, + int *fclus, int *dclus); +extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create); + +/* fat/dir.c */ +extern const struct file_operations fat_dir_operations; +extern int fat_search_long(struct inode *inode, const unsigned char *name, + int name_len, struct fat_slot_info *sinfo); +extern int fat_dir_empty(struct inode *dir); +extern int fat_subdirs(struct inode *dir); +extern int fat_scan(struct inode *dir, const unsigned char *name, + struct fat_slot_info *sinfo); +extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, + struct msdos_dir_entry **de, loff_t *i_pos); +extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); +extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, + struct fat_slot_info *sinfo); +extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); + +/* fat/fatent.c */ +struct fat_entry { + int entry; + union { + u8 *ent12_p[2]; + __le16 *ent16_p; + __le32 *ent32_p; + } u; + int nr_bhs; + struct buffer_head *bhs[2]; + 
struct inode *fat_inode; +}; + +static inline void fatent_init(struct fat_entry *fatent) +{ + fatent->nr_bhs = 0; + fatent->entry = 0; + fatent->u.ent32_p = NULL; + fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; +} + +static inline void fatent_set_entry(struct fat_entry *fatent, int entry) +{ + fatent->entry = entry; + fatent->u.ent32_p = NULL; +} + +static inline void fatent_brelse(struct fat_entry *fatent) +{ + int i; + fatent->u.ent32_p = NULL; + for (i = 0; i < fatent->nr_bhs; i++) + brelse(fatent->bhs[i]); + fatent->nr_bhs = 0; + fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; +} + +extern void fat_ent_access_init(struct super_block *sb); +extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, + int entry); +extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent, + int new, int wait); +extern int fat_alloc_clusters(struct inode *inode, int *cluster, + int nr_cluster); +extern int fat_free_clusters(struct inode *inode, int cluster); +extern int fat_count_free_clusters(struct super_block *sb); + +/* fat/file.c */ +extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); +extern const struct file_operations fat_file_operations; +extern const struct inode_operations fat_file_inode_operations; +extern int fat_setattr(struct dentry * dentry, struct iattr * attr); +extern void fat_truncate_blocks(struct inode *inode, loff_t offset); +extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern int fat_file_fsync(struct file *file, int datasync); + +/* fat/inode.c */ +extern void fat_attach(struct inode *inode, loff_t i_pos); +extern void fat_detach(struct inode *inode); +extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); +extern struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, loff_t i_pos); +extern int fat_sync_inode(struct inode *inode); +extern int fat_fill_super(struct super_block *sb, void *data, int silent, + int isvfat, void (*setup)(struct super_block *)); + +extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2); +/* fat/misc.c */ +extern void +__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(sb, fmt, args...) \ + __fat_fs_error(sb, 1, fmt , ## args) +#define fat_fs_error_ratelimit(sb, fmt, args...) \ + __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +extern int fat_clusters_flush(struct super_block *sb); +extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); +extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 __time, __le16 __date, u8 time_cs); +extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 *time, __le16 *date, u8 *time_cs); +extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); + +int fat_cache_init(void); +void fat_cache_destroy(void); + +/* helper for printk */ +typedef unsigned long long llu; + +#endif /* !_FAT_H */ diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c new file mode 100644 index 00000000..2e81ac0d --- /dev/null +++ b/fs/fat/fatent.c @@ -0,0 +1,685 @@ +/* + * Copyright (C) 2004, OGAWA Hirofumi + * Released under GPL v2. 
+ */ + +#include +#include +#include +#include +#include "fat.h" + +struct fatent_operations { + void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); + void (*ent_set_ptr)(struct fat_entry *, int); + int (*ent_bread)(struct super_block *, struct fat_entry *, + int, sector_t); + int (*ent_get)(struct fat_entry *); + void (*ent_put)(struct fat_entry *, int); + int (*ent_next)(struct fat_entry *); +}; + +static DEFINE_SPINLOCK(fat12_entry_lock); + +static void fat12_ent_blocknr(struct super_block *sb, int entry, + int *offset, sector_t *blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int bytes = entry + (entry >> 1); + WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + *offset = bytes & (sb->s_blocksize - 1); + *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); +} + +static void fat_ent_blocknr(struct super_block *sb, int entry, + int *offset, sector_t *blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int bytes = (entry << sbi->fatent_shift); + WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + *offset = bytes & (sb->s_blocksize - 1); + *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); +} + +static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + struct buffer_head **bhs = fatent->bhs; + if (fatent->nr_bhs == 1) { + WARN_ON(offset >= (bhs[0]->b_size - 1)); + fatent->u.ent12_p[0] = bhs[0]->b_data + offset; + fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1); + } else { + WARN_ON(offset != (bhs[0]->b_size - 1)); + fatent->u.ent12_p[0] = bhs[0]->b_data + offset; + fatent->u.ent12_p[1] = bhs[1]->b_data; + } +} + +static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + WARN_ON(offset & (2 - 1)); + fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset); +} + +static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + WARN_ON(offset & (4 - 1)); + fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset); +} + +static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct buffer_head **bhs = fatent->bhs; + + WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + + bhs[0] = sb_bread(sb, blocknr); + if (!bhs[0]) + goto err; + + if ((offset + 1) < sb->s_blocksize) + fatent->nr_bhs = 1; + else { + /* This entry is block boundary, it needs the next block */ + blocknr++; + bhs[1] = sb_bread(sb, blocknr); + if (!bhs[1]) + goto err_brelse; + fatent->nr_bhs = 2; + } + fat12_ent_set_ptr(fatent, offset); + return 0; + +err_brelse: + brelse(bhs[0]); +err: + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + return -EIO; +} + +static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + + WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + fatent->bhs[0] = sb_bread(sb, blocknr); + if (!fatent->bhs[0]) { + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); + return -EIO; + } + fatent->nr_bhs = 1; + ops->ent_set_ptr(fatent, offset); + return 0; +} + +static int fat12_ent_get(struct fat_entry *fatent) +{ + u8 **ent12_p = fatent->u.ent12_p; + int next; + + spin_lock(&fat12_entry_lock); + if (fatent->entry & 1) + next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); + else + next = (*ent12_p[1] << 8) | *ent12_p[0]; + spin_unlock(&fat12_entry_lock); + + next &= 0x0fff; + if (next >= BAD_FAT12) + 
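+		/* values at or above BAD_FAT12 cover both the bad-cluster and
+		 * end-of-chain markers; report them all as EOF */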
next = FAT_ENT_EOF; + return next; +} + +static int fat16_ent_get(struct fat_entry *fatent) +{ + int next = le16_to_cpu(*fatent->u.ent16_p); + WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1)); + if (next >= BAD_FAT16) + next = FAT_ENT_EOF; + return next; +} + +static int fat32_ent_get(struct fat_entry *fatent) +{ + int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff; + WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1)); + if (next >= BAD_FAT32) + next = FAT_ENT_EOF; + return next; +} + +static void fat12_ent_put(struct fat_entry *fatent, int new) +{ + u8 **ent12_p = fatent->u.ent12_p; + + if (new == FAT_ENT_EOF) + new = EOF_FAT12; + + spin_lock(&fat12_entry_lock); + if (fatent->entry & 1) { + *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); + *ent12_p[1] = new >> 4; + } else { + *ent12_p[0] = new & 0xff; + *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); + } + spin_unlock(&fat12_entry_lock); + + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + if (fatent->nr_bhs == 2) + mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); +} + +static void fat16_ent_put(struct fat_entry *fatent, int new) +{ + if (new == FAT_ENT_EOF) + new = EOF_FAT16; + + *fatent->u.ent16_p = cpu_to_le16(new); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); +} + +static void fat32_ent_put(struct fat_entry *fatent, int new) +{ + if (new == FAT_ENT_EOF) + new = EOF_FAT32; + + WARN_ON(new & 0xf0000000); + new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; + *fatent->u.ent32_p = cpu_to_le32(new); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); +} + +static int fat12_ent_next(struct fat_entry *fatent) +{ + u8 **ent12_p = fatent->u.ent12_p; + struct buffer_head **bhs = fatent->bhs; + u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1); + + fatent->entry++; + if (fatent->nr_bhs == 1) { + WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); + WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); + if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { + ent12_p[0] = nextp - 1; + ent12_p[1] = nextp; + return 1; + } + } else { + WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); + WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); + ent12_p[0] = nextp - 1; + ent12_p[1] = nextp; + brelse(bhs[0]); + bhs[0] = bhs[1]; + fatent->nr_bhs = 1; + return 1; + } + ent12_p[0] = NULL; + ent12_p[1] = NULL; + return 0; +} + +static int fat16_ent_next(struct fat_entry *fatent) +{ + const struct buffer_head *bh = fatent->bhs[0]; + fatent->entry++; + if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) { + fatent->u.ent16_p++; + return 1; + } + fatent->u.ent16_p = NULL; + return 0; +} + +static int fat32_ent_next(struct fat_entry *fatent) +{ + const struct buffer_head *bh = fatent->bhs[0]; + fatent->entry++; + if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) { + fatent->u.ent32_p++; + return 1; + } + fatent->u.ent32_p = NULL; + return 0; +} + +static struct fatent_operations fat12_ops = { + .ent_blocknr = fat12_ent_blocknr, + .ent_set_ptr = fat12_ent_set_ptr, + .ent_bread = fat12_ent_bread, + .ent_get = fat12_ent_get, + .ent_put = fat12_ent_put, + .ent_next = fat12_ent_next, +}; + +static struct fatent_operations fat16_ops = { + .ent_blocknr = fat_ent_blocknr, + .ent_set_ptr = fat16_ent_set_ptr, + .ent_bread = fat_ent_bread, + .ent_get = fat16_ent_get, + .ent_put = fat16_ent_put, + .ent_next = fat16_ent_next, +}; + +static struct fatent_operations fat32_ops = { + .ent_blocknr = fat_ent_blocknr, + .ent_set_ptr = 
fat32_ent_set_ptr, + .ent_bread = fat_ent_bread, + .ent_get = fat32_ent_get, + .ent_put = fat32_ent_put, + .ent_next = fat32_ent_next, +}; + +static inline void lock_fat(struct msdos_sb_info *sbi) +{ + mutex_lock(&sbi->fat_lock); +} + +static inline void unlock_fat(struct msdos_sb_info *sbi) +{ + mutex_unlock(&sbi->fat_lock); +} + +void fat_ent_access_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + mutex_init(&sbi->fat_lock); + + switch (sbi->fat_bits) { + case 32: + sbi->fatent_shift = 2; + sbi->fatent_ops = &fat32_ops; + break; + case 16: + sbi->fatent_shift = 1; + sbi->fatent_ops = &fat16_ops; + break; + case 12: + sbi->fatent_shift = -1; + sbi->fatent_ops = &fat12_ops; + break; + } +} + +static inline int fat_ent_update_ptr(struct super_block *sb, + struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct buffer_head **bhs = fatent->bhs; + + /* Is this fatent's blocks including this entry? */ + if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr) + return 0; + if (sbi->fat_bits == 12) { + if ((offset + 1) < sb->s_blocksize) { + /* This entry is on bhs[0]. */ + if (fatent->nr_bhs == 2) { + brelse(bhs[1]); + fatent->nr_bhs = 1; + } + } else { + /* This entry needs the next block. */ + if (fatent->nr_bhs != 2) + return 0; + if (bhs[1]->b_blocknr != (blocknr + 1)) + return 0; + } + } + ops->ent_set_ptr(fatent, offset); + return 1; +} + +int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fatent_operations *ops = sbi->fatent_ops; + int err, offset; + sector_t blocknr; + + if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { + fatent_brelse(fatent); + fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); + return -EIO; + } + + fatent_set_entry(fatent, entry); + ops->ent_blocknr(sb, entry, &offset, &blocknr); + + if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) { + fatent_brelse(fatent); + err = ops->ent_bread(sb, fatent, offset, blocknr); + if (err) + return err; + } + return ops->ent_get(fatent); +} + +/* FIXME: We can write the blocks as more big chunk. 
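+ * (As it stands, fat_mirror_bhs() below copies each dirty FAT block to every
+ * backup FAT one buffer at a time.)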
*/ +static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, + int nr_bhs) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *c_bh; + int err, n, copy; + + err = 0; + for (copy = 1; copy < sbi->fats; copy++) { + sector_t backup_fat = sbi->fat_length * copy; + + for (n = 0; n < nr_bhs; n++) { + c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr); + if (!c_bh) { + err = -ENOMEM; + goto error; + } + memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); + set_buffer_uptodate(c_bh); + mark_buffer_dirty_inode(c_bh, sbi->fat_inode); + if (sb->s_flags & MS_SYNCHRONOUS) + err = sync_dirty_buffer(c_bh); + brelse(c_bh); + if (err) + goto error; + } + } +error: + return err; +} + +int fat_ent_write(struct inode *inode, struct fat_entry *fatent, + int new, int wait) +{ + struct super_block *sb = inode->i_sb; + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + int err; + + ops->ent_put(fatent, new); + if (wait) { + err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs); + if (err) + return err; + } + return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs); +} + +static inline int fat_ent_next(struct msdos_sb_info *sbi, + struct fat_entry *fatent) +{ + if (sbi->fatent_ops->ent_next(fatent)) { + if (fatent->entry < sbi->max_cluster) + return 1; + } + return 0; +} + +static inline int fat_ent_read_block(struct super_block *sb, + struct fat_entry *fatent) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + sector_t blocknr; + int offset; + + fatent_brelse(fatent); + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + return ops->ent_bread(sb, fatent, offset, blocknr); +} + +static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs, + struct fat_entry *fatent) +{ + int n, i; + + for (n = 0; n < fatent->nr_bhs; n++) { + for (i = 0; i < *nr_bhs; i++) { + if (fatent->bhs[n] == bhs[i]) + break; + } + if (i == *nr_bhs) { + get_bh(fatent->bhs[n]); + bhs[i] = fatent->bhs[n]; + (*nr_bhs)++; + } + } +} + +int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent, prev_ent; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int i, count, err, nr_bhs, idx_clus; + + BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */ + + lock_fat(sbi); + if (sbi->free_clusters != -1 && sbi->free_clus_valid && + sbi->free_clusters < nr_cluster) { + unlock_fat(sbi); + return -ENOSPC; + } + + err = nr_bhs = idx_clus = 0; + count = FAT_START_ENT; + fatent_init(&prev_ent); + fatent_init(&fatent); + fatent_set_entry(&fatent, sbi->prev_free + 1); + while (count < sbi->max_cluster) { + if (fatent.entry >= sbi->max_cluster) + fatent.entry = FAT_START_ENT; + fatent_set_entry(&fatent, fatent.entry); + err = fat_ent_read_block(sb, &fatent); + if (err) + goto out; + + /* Find the free entries in a block */ + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) { + int entry = fatent.entry; + + /* make the cluster chain */ + ops->ent_put(&fatent, FAT_ENT_EOF); + if (prev_ent.nr_bhs) + ops->ent_put(&prev_ent, entry); + + fat_collect_bhs(bhs, &nr_bhs, &fatent); + + sbi->prev_free = entry; + if (sbi->free_clusters != -1) + sbi->free_clusters--; + sb->s_dirt = 1; + + cluster[idx_clus] = entry; + idx_clus++; + if (idx_clus == nr_cluster) + goto out; + + /* + * fat_collect_bhs() gets ref-count of bhs, + * so we can still use the prev_ent. 
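+				 * (prev_ent keeps pointing at the previously
+				 * allocated entry, so the next free entry found
+				 * can still be chained onto it via
+				 * ops->ent_put(&prev_ent, entry), even after
+				 * fatent has moved on to another block.)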
+ */ + prev_ent = fatent; + } + count++; + if (count == sbi->max_cluster) + break; + } while (fat_ent_next(sbi, &fatent)); + } + + /* Couldn't allocate the free entries */ + sbi->free_clusters = 0; + sbi->free_clus_valid = 1; + sb->s_dirt = 1; + err = -ENOSPC; + +out: + unlock_fat(sbi); + fatent_brelse(&fatent); + if (!err) { + if (inode_needs_sync(inode)) + err = fat_sync_bhs(bhs, nr_bhs); + if (!err) + err = fat_mirror_bhs(sb, bhs, nr_bhs); + } + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + if (err && idx_clus) + fat_free_clusters(inode, cluster[0]); + + return err; +} + +int fat_free_clusters(struct inode *inode, int cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int i, err, nr_bhs; + int first_cl = cluster; + + nr_bhs = 0; + fatent_init(&fatent); + lock_fat(sbi); + do { + cluster = fat_ent_read(inode, &fatent, cluster); + if (cluster < 0) { + err = cluster; + goto error; + } else if (cluster == FAT_ENT_FREE) { + fat_fs_error(sb, "%s: deleting FAT entry beyond EOF", + __func__); + err = -EIO; + goto error; + } + + if (sbi->options.discard) { + /* + * Issue discard for the sectors we no longer + * care about, batching contiguous clusters + * into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, + fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus, + GFP_NOFS, 0); + + first_cl = cluster; + } + } + + ops->ent_put(&fatent, FAT_ENT_FREE); + if (sbi->free_clusters != -1) { + sbi->free_clusters++; + sb->s_dirt = 1; + } + + if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { + if (sb->s_flags & MS_SYNCHRONOUS) { + err = fat_sync_bhs(bhs, nr_bhs); + if (err) + goto error; + } + err = fat_mirror_bhs(sb, bhs, nr_bhs); + if (err) + goto error; + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + nr_bhs = 0; + } + fat_collect_bhs(bhs, &nr_bhs, &fatent); + } while (cluster != FAT_ENT_EOF); + + if (sb->s_flags & MS_SYNCHRONOUS) { + err = fat_sync_bhs(bhs, nr_bhs); + if (err) + goto error; + } + err = fat_mirror_bhs(sb, bhs, nr_bhs); +error: + fatent_brelse(&fatent); + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + unlock_fat(sbi); + + return err; +} + +EXPORT_SYMBOL_GPL(fat_free_clusters); + +/* 128kb is the whole sectors for FAT12 and FAT16 */ +#define FAT_READA_SIZE (128 * 1024) + +static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, + unsigned long reada_blocks) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + sector_t blocknr; + int i, offset; + + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + + for (i = 0; i < reada_blocks; i++) + sb_breadahead(sb, blocknr + i); +} + +int fat_count_free_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + unsigned long reada_blocks, reada_mask, cur_block; + int err = 0, free; + + lock_fat(sbi); + if (sbi->free_clusters != -1 && sbi->free_clus_valid) + goto out; + + reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; + reada_mask = reada_blocks - 1; + cur_block = 0; + + free = 0; + fatent_init(&fatent); + fatent_set_entry(&fatent, FAT_START_ENT); + while (fatent.entry < sbi->max_cluster) { + /* readahead of fat blocks */ + if ((cur_block & reada_mask) == 0) { + unsigned long rest = sbi->fat_length - cur_block; + fat_ent_reada(sb, &fatent, 
min(reada_blocks, rest)); + } + cur_block++; + + err = fat_ent_read_block(sb, &fatent); + if (err) + goto out; + + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) + free++; + } while (fat_ent_next(sbi, &fatent)); + } + sbi->free_clusters = free; + sbi->free_clus_valid = 1; + sb->s_dirt = 1; + fatent_brelse(&fatent); +out: + unlock_fat(sbi); + return err; +} diff --git a/fs/fat/file.c b/fs/fat/file.c new file mode 100644 index 00000000..7018e1d8 --- /dev/null +++ b/fs/fat/file.c @@ -0,0 +1,446 @@ +/* + * linux/fs/fat/file.c + * + * Written 1992,1993 by Werner Almesberger + * + * regular file handling primitives for fat-based filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) +{ + u32 attr; + + mutex_lock(&inode->i_mutex); + attr = fat_make_attrs(inode); + mutex_unlock(&inode->i_mutex); + + return put_user(attr, user_attr); +} + +static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int is_dir = S_ISDIR(inode->i_mode); + u32 attr, oldattr; + struct iattr ia; + int err; + + err = get_user(attr, user_attr); + if (err) + goto out; + + mutex_lock(&inode->i_mutex); + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out_unlock_inode; + + /* + * ATTR_VOLUME and ATTR_DIR cannot be changed; this also + * prevents the user from turning us into a VFAT + * longname entry. Also, we obviously can't set + * any of the NTFS attributes in the high 24 bits. + */ + attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); + /* Merge in ATTR_VOLUME and ATTR_DIR */ + attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | + (is_dir ? ATTR_DIR : 0); + oldattr = fat_make_attrs(inode); + + /* Equivalent to a chmod() */ + ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_fs_time(inode->i_sb); + if (is_dir) + ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); + else { + ia.ia_mode = fat_make_mode(sbi, attr, + S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); + } + + /* The root directory has no attributes */ + if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { + err = -EINVAL; + goto out_drop_write; + } + + if (sbi->options.sys_immutable && + ((attr | oldattr) & ATTR_SYS) && + !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_drop_write; + } + + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(file->f_path.dentry, &ia); + if (err) + goto out_drop_write; + + /* This MUST be done before doing anything irreversible... 
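+	 * (i.e. call fat_setattr(), which may still fail with e.g. -EPERM,
+	 * before fsnotify_change(), the S_IMMUTABLE flip and fat_save_attrs()
+	 * below, since those are not rolled back.)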
*/ + err = fat_setattr(file->f_path.dentry, &ia); + if (err) + goto out_drop_write; + + fsnotify_change(file->f_path.dentry, ia.ia_valid); + if (sbi->options.sys_immutable) { + if (attr & ATTR_SYS) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + } + + fat_save_attrs(inode, attr); + mark_inode_dirty(inode); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out_unlock_inode: + mutex_unlock(&inode->i_mutex); +out: + return err; +} + +long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + u32 __user *user_attr = (u32 __user *)arg; + + switch (cmd) { + case FAT_IOCTL_GET_ATTRIBUTES: + return fat_ioctl_get_attributes(inode, user_attr); + case FAT_IOCTL_SET_ATTRIBUTES: + return fat_ioctl_set_attributes(filp, user_attr); + default: + return -ENOTTY; /* Inappropriate ioctl for device */ + } +} + +#ifdef CONFIG_COMPAT +static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) + +{ + return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static int fat_file_release(struct inode *inode, struct file *filp) +{ + if ((filp->f_mode & FMODE_WRITE) && + MSDOS_SB(inode->i_sb)->options.flush) { + fat_flush_inodes(inode->i_sb, inode, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + return 0; +} + +int fat_file_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int res, err; + + res = generic_file_fsync(filp, datasync); + err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + + return res ? res : err; +} + + +const struct file_operations fat_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .release = fat_file_release, + .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_generic_compat_ioctl, +#endif + .fsync = fat_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand_simple(inode, size); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } +out: + return err; +} + +/* Free all clusters after the skip'th cluster. */ +static int fat_free(struct inode *inode, int skip) +{ + struct super_block *sb = inode->i_sb; + int err, wait, free_start, i_start, i_logstart; + + if (MSDOS_I(inode)->i_start == 0) + return 0; + + fat_cache_inval_inode(inode); + + wait = IS_DIRSYNC(inode); + i_start = free_start = MSDOS_I(inode)->i_start; + i_logstart = MSDOS_I(inode)->i_logstart; + + /* First, we write the new file size. 
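+	 * (i_start/i_logstart are cleared and, for DIRSYNC inodes, synced to
+	 * disk before any cluster is freed; presumably so that a crash in
+	 * between leaks clusters at worst, rather than leaving the directory
+	 * entry pointing at freed clusters.)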
*/ + if (!skip) { + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + } + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + if (wait) { + err = fat_sync_inode(inode); + if (err) { + MSDOS_I(inode)->i_start = i_start; + MSDOS_I(inode)->i_logstart = i_logstart; + return err; + } + } else + mark_inode_dirty(inode); + + /* Write a new EOF, and get the remaining cluster chain for freeing. */ + if (skip) { + struct fat_entry fatent; + int ret, fclus, dclus; + + ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus); + if (ret < 0) + return ret; + else if (ret == FAT_ENT_EOF) + return 0; + + fatent_init(&fatent); + ret = fat_ent_read(inode, &fatent, dclus); + if (ret == FAT_ENT_EOF) { + fatent_brelse(&fatent); + return 0; + } else if (ret == FAT_ENT_FREE) { + fat_fs_error(sb, + "%s: invalid cluster chain (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); + ret = -EIO; + } else if (ret > 0) { + err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); + if (err) + ret = err; + } + fatent_brelse(&fatent); + if (ret < 0) + return ret; + + free_start = ret; + } + inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); + + /* Freeing the remained cluster chain */ + return fat_free_clusters(inode, free_start); +} + +void fat_truncate_blocks(struct inode *inode, loff_t offset) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + const unsigned int cluster_size = sbi->cluster_size; + int nr_clusters; + + /* + * This protects against truncating a file bigger than it was then + * trying to write into the hole. + */ + if (MSDOS_I(inode)->mmu_private > offset) + MSDOS_I(inode)->mmu_private = offset; + + nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; + + fat_free(inode, nr_clusters); + fat_flush_inodes(inode->i_sb, inode, NULL); +} + +int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + return 0; +} +EXPORT_SYMBOL_GPL(fat_getattr); + +static int fat_sanitize_mode(const struct msdos_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) +{ + mode_t mask, perm; + + /* + * Note, the basic check is already done by a caller of + * (attr->ia_mode & ~FAT_VALID_MODE) + */ + + if (S_ISREG(inode->i_mode)) + mask = sbi->options.fs_fmask; + else + mask = sbi->options.fs_dmask; + + perm = *mode_ptr & ~(S_IFMT | mask); + + /* + * Of the r and x bits, all (subject to umask) must be present. Of the + * w bits, either all (subject to umask) or none must be present. + * + * If fat_mode_can_hold_ro(inode) is false, can't change w bits. 
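+	 * Worked example (assuming fmask=022 and a regular file currently mode
+	 * 0755): after masking, the only surviving modes are 0755 and 0555; a
+	 * request that changes the r/x bits is refused (the caller simply drops
+	 * the mode change), and picking 0555 is what later maps to ATTR_RO via
+	 * fat_make_attrs().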
+ */ + if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) + return -EPERM; + if (fat_mode_can_hold_ro(inode)) { + if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) + return -EPERM; + } else { + if ((perm & S_IWUGO) != (S_IWUGO & ~mask)) + return -EPERM; + } + + *mode_ptr &= S_IFMT | perm; + + return 0; +} + +static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) +{ + mode_t allow_utime = sbi->options.allow_utime; + + if (current_fsuid() != inode->i_uid) { + if (in_group_p(inode->i_gid)) + allow_utime >>= 3; + if (allow_utime & MAY_WRITE) + return 1; + } + + /* use a default check */ + return 0; +} + +#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +/* valid file mode bits */ +#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) + +int fat_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); + struct inode *inode = dentry->d_inode; + unsigned int ia_valid; + int error; + + /* Check for setting the inode time. */ + ia_valid = attr->ia_valid; + if (ia_valid & TIMES_SET_FLAGS) { + if (fat_allow_set_time(sbi, inode)) + attr->ia_valid &= ~TIMES_SET_FLAGS; + } + + error = inode_change_ok(inode, attr); + attr->ia_valid = ia_valid; + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. XXX: this is no longer true with new truncate + * sequence. + */ + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size > inode->i_size) { + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; + } + } + + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != sbi->options.fs_uid)) || + ((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != sbi->options.fs_gid)) || + ((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & ~FAT_VALID_MODE))) + error = -EPERM; + + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. + */ + if (attr->ia_valid & ATTR_MODE) { + if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } + + if (attr->ia_valid & ATTR_SIZE) { + truncate_setsize(inode, attr->ia_size); + fat_truncate_blocks(inode, attr->ia_size); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); +out: + return error; +} +EXPORT_SYMBOL_GPL(fat_setattr); + +const struct inode_operations fat_file_inode_operations = { + .setattr = fat_setattr, + .getattr = fat_getattr, +}; diff --git a/fs/fat/inode.c b/fs/fat/inode.c new file mode 100644 index 00000000..9836839e --- /dev/null +++ b/fs/fat/inode.c @@ -0,0 +1,1619 @@ +/* + * linux/fs/fat/inode.c + * + * Written 1992,1993 by Werner Almesberger + * VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner + * Rewritten for the constant inumbers support by Al Viro + * + * Fixes: + * + * Max Cohan: Fixed invalid FSINFO offset when info_sector is 0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +#ifndef CONFIG_FAT_DEFAULT_IOCHARSET +/* if user don't select VFAT, this is undefined. 
*/ +#define CONFIG_FAT_DEFAULT_IOCHARSET "" +#endif + +static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; +static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; + + +static int fat_add_cluster(struct inode *inode) +{ + int err, cluster; + + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) + return err; + /* FIXME: this cluster should be added after data of this + * cluster is writed */ + err = fat_chain_add(inode, cluster, 1); + if (err) + fat_free_clusters(inode, cluster); + return err; +} + +static inline int __fat_get_block(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + unsigned long mapped_blocks; + sector_t phys; + int err, offset; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + if (err) + return err; + if (phys) { + map_bh(bh_result, sb, phys); + *max_blocks = min(mapped_blocks, *max_blocks); + return 0; + } + if (!create) + return 0; + + if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { + fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)", + MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); + return -EIO; + } + + offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); + if (!offset) { + /* TODO: multiple cluster allocation would be desirable. */ + err = fat_add_cluster(inode); + if (err) + return err; + } + /* available blocks on this cluster */ + mapped_blocks = sbi->sec_per_clus - offset; + + *max_blocks = min(mapped_blocks, *max_blocks); + MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + if (err) + return err; + + BUG_ON(!phys); + BUG_ON(*max_blocks != mapped_blocks); + set_buffer_new(bh_result); + map_bh(bh_result, sb, phys); + + return 0; +} + +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + + err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create); + if (err) + return err; + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + +static int fat_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, fat_get_block, wbc); +} + +static int fat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, fat_get_block); +} + +static int fat_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, fat_get_block); +} + +static int fat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, fat_get_block); +} + +static void fat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + fat_truncate_blocks(inode, inode->i_size); + } +} + +static int fat_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int err; + + *pagep = NULL; + err = cont_write_begin(file, mapping, pos, len, flags, + pagep, fsdata, fat_get_block, + &MSDOS_I(mapping->host)->mmu_private); + if (err < 0) + fat_write_failed(mapping, pos + len); + 
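+	/* fat_write_failed() above trims blocks instantiated past i_size, so
+	 * the error can simply be returned */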
return err; +} + +static int fat_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + fat_write_failed(mapping, pos + len); + if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + mark_inode_dirty(inode); + } + return err; +} + +static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), + * so we need to update the ->mmu_private to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->mmu_private. + * + * Return 0, and fallback to normal buffered write. + */ + loff_t size = offset + iov_length(iov, nr_segs); + if (MSDOS_I(inode)->mmu_private < size) + return 0; + } + + /* + * FAT need to use the DIO_LOCKING for avoiding the race + * condition of fat_get_block() and ->truncate(). + */ + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, fat_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + + return ret; +} + +static sector_t _fat_bmap(struct address_space *mapping, sector_t block) +{ + sector_t blocknr; + + /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ + down_read(&mapping->host->i_alloc_sem); + blocknr = generic_block_bmap(mapping, block, fat_get_block); + up_read(&mapping->host->i_alloc_sem); + + return blocknr; +} + +static const struct address_space_operations fat_aops = { + .readpage = fat_readpage, + .readpages = fat_readpages, + .writepage = fat_writepage, + .writepages = fat_writepages, + .write_begin = fat_write_begin, + .write_end = fat_write_end, + .direct_IO = fat_direct_IO, + .bmap = _fat_bmap +}; + +/* + * New FAT inode stuff. We do the following: + * a) i_ino is constant and has nothing with on-disk location. + * b) FAT manages its own cache of directory entries. + * c) *This* cache is indexed by on-disk location. + * d) inode has an associated directory entry, all right, but + * it may be unhashed. + * e) currently entries are stored within struct inode. That should + * change. + * f) we deal with races in the following way: + * 1. readdir() and lookup() do FAT-dir-cache lookup. + * 2. rename() unhashes the F-d-c entry and rehashes it in + * a new place. + * 3. unlink() and rmdir() unhash F-d-c entry. + * 4. fat_write_inode() checks whether the thing is unhashed. + * If it is we silently return. If it isn't we do bread(), + * check if the location is still valid and retry if it + * isn't. Otherwise we do changes. + * 5. Spinlock is used to protect hash/unhash/location check/lookup + * 6. fat_evict_inode() unhashes the F-d-c entry. + * 7. lookup() and readdir() do igrab() if they find a F-d-c entry + * and consider negative result as cache miss. 
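+ * In short: i_pos (the on-disk location of the directory entry) is the cache
+ * key, inode_hashtable[] the index, and inode_hash_lock the lock protecting
+ * fat_attach()/fat_detach()/fat_iget().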
+ */ + +static void fat_hash_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int i; + + spin_lock_init(&sbi->inode_hash_lock); + for (i = 0; i < FAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); +} + +static inline unsigned long fat_hash(loff_t i_pos) +{ + return hash_32(i_pos, FAT_HASH_BITS); +} + +void fat_attach(struct inode *inode, loff_t i_pos) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); + + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = i_pos; + hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); + spin_unlock(&sbi->inode_hash_lock); +} +EXPORT_SYMBOL_GPL(fat_attach); + +void fat_detach(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = 0; + hlist_del_init(&MSDOS_I(inode)->i_fat_hash); + spin_unlock(&sbi->inode_hash_lock); +} +EXPORT_SYMBOL_GPL(fat_detach); + +struct inode *fat_iget(struct super_block *sb, loff_t i_pos) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); + struct hlist_node *_p; + struct msdos_inode_info *i; + struct inode *inode = NULL; + + spin_lock(&sbi->inode_hash_lock); + hlist_for_each_entry(i, _p, head, i_fat_hash) { + BUG_ON(i->vfs_inode.i_sb != sb); + if (i->i_pos != i_pos) + continue; + inode = igrab(&i->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->inode_hash_lock); + return inode; +} + +static int is_exec(unsigned char *extension) +{ + unsigned char *exe_extensions = "EXECOMBAT", *walk; + + for (walk = exe_extensions; *walk; walk += 3) + if (!strncmp(extension, walk, 3)) + return 1; + return 0; +} + +static int fat_calc_dir_size(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int ret, fclus, dclus; + + inode->i_size = 0; + if (MSDOS_I(inode)->i_start == 0) + return 0; + + ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); + if (ret < 0) + return ret; + inode->i_size = (fclus + 1) << sbi->cluster_bits; + + return 0; +} + +/* doesn't deal with root inode */ +static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int error; + + MSDOS_I(inode)->i_pos = 0; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version++; + inode->i_generation = get_seconds(); + + if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { + inode->i_generation &= ~1; + inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO); + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + + MSDOS_I(inode)->i_start = le16_to_cpu(de->start); + if (sbi->fat_bits == 32) + MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16); + + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + error = fat_calc_dir_size(inode); + if (error < 0) + return error; + MSDOS_I(inode)->mmu_private = inode->i_size; + + inode->i_nlink = fat_subdirs(inode); + } else { /* not a directory */ + inode->i_generation |= 1; + inode->i_mode = fat_make_mode(sbi, de->attr, + ((sbi->options.showexec && !is_exec(de->name + 8)) + ? 
S_IRUGO|S_IWUGO : S_IRWXUGO)); + MSDOS_I(inode)->i_start = le16_to_cpu(de->start); + if (sbi->fat_bits == 32) + MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16); + + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + inode->i_size = le32_to_cpu(de->size); + inode->i_op = &fat_file_inode_operations; + inode->i_fop = &fat_file_operations; + inode->i_mapping->a_ops = &fat_aops; + MSDOS_I(inode)->mmu_private = inode->i_size; + } + if (de->attr & ATTR_SYS) { + if (sbi->options.sys_immutable) + inode->i_flags |= S_IMMUTABLE; + } + fat_save_attrs(inode, de->attr); + + inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) + & ~((loff_t)sbi->cluster_size - 1)) >> 9; + + fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + if (sbi->options.isvfat) { + fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, + de->cdate, de->ctime_cs); + fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + } else + inode->i_ctime = inode->i_atime = inode->i_mtime; + + return 0; +} + +struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, loff_t i_pos) +{ + struct inode *inode; + int err; + + inode = fat_iget(sb, i_pos); + if (inode) + goto out; + inode = new_inode(sb); + if (!inode) { + inode = ERR_PTR(-ENOMEM); + goto out; + } + inode->i_ino = iunique(sb, MSDOS_ROOT_INO); + inode->i_version = 1; + err = fat_fill_inode(inode, de); + if (err) { + iput(inode); + inode = ERR_PTR(err); + goto out; + } + fat_attach(inode, i_pos); + insert_inode_hash(inode); +out: + return inode; +} + +EXPORT_SYMBOL_GPL(fat_build_inode); + +static void fat_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + if (!inode->i_nlink) { + inode->i_size = 0; + fat_truncate_blocks(inode, 0); + } + invalidate_inode_buffers(inode); + end_writeback(inode); + fat_cache_inval_inode(inode); + fat_detach(inode); +} + +static void fat_write_super(struct super_block *sb) +{ + lock_super(sb); + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) + fat_clusters_flush(sb); + unlock_super(sb); +} + +static int fat_sync_fs(struct super_block *sb, int wait) +{ + int err = 0; + + if (sb->s_dirt) { + lock_super(sb); + sb->s_dirt = 0; + err = fat_clusters_flush(sb); + unlock_super(sb); + } + + return err; +} + +static void fat_put_super(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + if (sb->s_dirt) + fat_write_super(sb); + + iput(sbi->fat_inode); + + unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + + sb->s_fs_info = NULL; + kfree(sbi); +} + +static struct kmem_cache *fat_inode_cachep; + +static struct inode *fat_alloc_inode(struct super_block *sb) +{ + struct msdos_inode_info *ei; + ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void fat_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); +} + +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + +static void init_once(void *foo) +{ + struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; + + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = FAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); + INIT_HLIST_NODE(&ei->i_fat_hash); + inode_init_once(&ei->vfs_inode); +} + +static int __init 
fat_init_inodecache(void) +{ + fat_inode_cachep = kmem_cache_create("fat_inode_cache", + sizeof(struct msdos_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (fat_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void __exit fat_destroy_inodecache(void) +{ + kmem_cache_destroy(fat_inode_cachep); +} + +static int fat_remount(struct super_block *sb, int *flags, char *data) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + return 0; +} + +static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + /* If the count of free cluster is still unknown, counts it here. */ + if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { + int err = fat_count_free_clusters(dentry->d_sb); + if (err) + return err; + } + + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = sbi->cluster_size; + buf->f_blocks = sbi->max_cluster - FAT_START_ENT; + buf->f_bfree = sbi->free_clusters; + buf->f_bavail = sbi->free_clusters; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = + (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; + + return 0; +} + +static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, + struct inode *inode) +{ + loff_t i_pos; +#if BITS_PER_LONG == 32 + spin_lock(&sbi->inode_hash_lock); +#endif + i_pos = MSDOS_I(inode)->i_pos; +#if BITS_PER_LONG == 32 + spin_unlock(&sbi->inode_hash_lock); +#endif + return i_pos; +} + +static int __fat_write_inode(struct inode *inode, int wait) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct msdos_dir_entry *raw_entry; + loff_t i_pos; + int err; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + +retry: + i_pos = fat_i_pos_read(sbi, inode); + if (!i_pos) + return 0; + + bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); + if (!bh) { + fat_msg(sb, KERN_ERR, "unable to read inode block " + "for updating (i_pos %lld)", i_pos); + return -EIO; + } + spin_lock(&sbi->inode_hash_lock); + if (i_pos != MSDOS_I(inode)->i_pos) { + spin_unlock(&sbi->inode_hash_lock); + brelse(bh); + goto retry; + } + + raw_entry = &((struct msdos_dir_entry *) (bh->b_data)) + [i_pos & (sbi->dir_per_block - 1)]; + if (S_ISDIR(inode->i_mode)) + raw_entry->size = 0; + else + raw_entry->size = cpu_to_le32(inode->i_size); + raw_entry->attr = fat_make_attrs(inode); + raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); + raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); + fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + &raw_entry->date, NULL); + if (sbi->options.isvfat) { + __le16 atime; + fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); + fat_time_unix2fat(sbi, &inode->i_atime, &atime, + &raw_entry->adate, NULL); + } + spin_unlock(&sbi->inode_hash_lock); + mark_buffer_dirty(bh); + err = 0; + if (wait) + err = sync_dirty_buffer(bh); + brelse(bh); + return err; +} + +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int fat_sync_inode(struct inode *inode) +{ + return __fat_write_inode(inode, 1); +} + +EXPORT_SYMBOL_GPL(fat_sync_inode); + +static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); +static const 
struct super_operations fat_sops = { + .alloc_inode = fat_alloc_inode, + .destroy_inode = fat_destroy_inode, + .write_inode = fat_write_inode, + .evict_inode = fat_evict_inode, + .put_super = fat_put_super, + .write_super = fat_write_super, + .sync_fs = fat_sync_fs, + .statfs = fat_statfs, + .remount_fs = fat_remount, + + .show_options = fat_show_options, +}; + +/* + * a FAT file handle with fhtype 3 is + * 0/ i_ino - for fast, reliable lookup if still in the cache + * 1/ i_generation - to see if i_ino is still valid + * bit 0 == 0 iff directory + * 2/ i_pos(8-39) - if ino has changed, but still in cache + * 3/ i_pos(4-7)|i_logstart - to semi-verify inode found at i_pos + * 4/ i_pos(0-3)|parent->i_logstart - maybe used to hunt for the file on disc + * + * Hack for NFSv2: Maximum FAT entry number is 28bits and maximum + * i_pos is 40bits (blocknr(32) + dir offset(8)), so two 4bits + * of i_logstart is used to store the directory entry offset. + */ + +static struct dentry *fat_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct inode *inode = NULL; + u32 *fh = fid->raw; + + if (fh_len < 5 || fh_type != 3) + return NULL; + + inode = ilookup(sb, fh[0]); + if (!inode || inode->i_generation != fh[1]) { + if (inode) + iput(inode); + inode = NULL; + } + if (!inode) { + loff_t i_pos; + int i_logstart = fh[3] & 0x0fffffff; + + i_pos = (loff_t)fh[2] << 8; + i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28); + + /* try 2 - see if i_pos is in F-d-c + * require i_logstart to be the same + * Will fail if you truncate and then re-write + */ + + inode = fat_iget(sb, i_pos); + if (inode && MSDOS_I(inode)->i_logstart != i_logstart) { + iput(inode); + inode = NULL; + } + } + + /* + * For now, do nothing if the inode is not found. + * + * What we could do is: + * + * - follow the file starting at fh[4], and record the ".." entry, + * and the name of the fh[2] entry. + * - then follow the ".." file finding the next step up. + * + * This way we build a path to the root of the tree. If this works, we + * lookup the path and so get this inode into the cache. Finally try + * the fat_iget lookup again. If that fails, then we are totally out + * of luck. 
But all that is for another day + */ + return d_obtain_alias(inode); +} + +static int +fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) +{ + int len = *lenp; + struct inode *inode = de->d_inode; + u32 ipos_h, ipos_m, ipos_l; + + if (len < 5) { + *lenp = 5; + return 255; /* no room */ + } + + ipos_h = MSDOS_I(inode)->i_pos >> 8; + ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; + ipos_l = (MSDOS_I(inode)->i_pos & 0x0f) << 28; + *lenp = 5; + fh[0] = inode->i_ino; + fh[1] = inode->i_generation; + fh[2] = ipos_h; + fh[3] = ipos_m | MSDOS_I(inode)->i_logstart; + spin_lock(&de->d_lock); + fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart; + spin_unlock(&de->d_lock); + return 3; +} + +static struct dentry *fat_get_parent(struct dentry *child) +{ + struct super_block *sb = child->d_sb; + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t i_pos; + struct dentry *parent; + struct inode *inode; + int err; + + lock_super(sb); + + err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); + if (err) { + parent = ERR_PTR(err); + goto out; + } + inode = fat_build_inode(sb, de, i_pos); + brelse(bh); + + parent = d_obtain_alias(inode); +out: + unlock_super(sb); + + return parent; +} + +static const struct export_operations fat_export_ops = { + .encode_fh = fat_encode_fh, + .fh_to_dentry = fat_fh_to_dentry, + .get_parent = fat_get_parent, +}; + +static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct msdos_sb_info *sbi = MSDOS_SB(mnt->mnt_sb); + struct fat_mount_options *opts = &sbi->options; + int isvfat = opts->isvfat; + + if (opts->fs_uid != 0) + seq_printf(m, ",uid=%u", opts->fs_uid); + if (opts->fs_gid != 0) + seq_printf(m, ",gid=%u", opts->fs_gid); + seq_printf(m, ",fmask=%04o", opts->fs_fmask); + seq_printf(m, ",dmask=%04o", opts->fs_dmask); + if (opts->allow_utime) + seq_printf(m, ",allow_utime=%04o", opts->allow_utime); + if (sbi->nls_disk) + seq_printf(m, ",codepage=%s", sbi->nls_disk->charset); + if (isvfat) { + if (sbi->nls_io) + seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); + + switch (opts->shortname) { + case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=win95"); + break; + case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT: + seq_puts(m, ",shortname=winnt"); + break; + case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=mixed"); + break; + case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=lower"); + break; + default: + seq_puts(m, ",shortname=unknown"); + break; + } + } + if (opts->name_check != 'n') + seq_printf(m, ",check=%c", opts->name_check); + if (opts->usefree) + seq_puts(m, ",usefree"); + if (opts->quiet) + seq_puts(m, ",quiet"); + if (opts->showexec) + seq_puts(m, ",showexec"); + if (opts->sys_immutable) + seq_puts(m, ",sys_immutable"); + if (!isvfat) { + if (opts->dotsOK) + seq_puts(m, ",dotsOK=yes"); + if (opts->nocase) + seq_puts(m, ",nocase"); + } else { + if (opts->utf8) + seq_puts(m, ",utf8"); + if (opts->unicode_xlate) + seq_puts(m, ",uni_xlate"); + if (!opts->numtail) + seq_puts(m, ",nonumtail"); + if (opts->rodir) + seq_puts(m, ",rodir"); + } + if (opts->flush) + seq_puts(m, ",flush"); + if (opts->tz_utc) + seq_puts(m, ",tz=UTC"); + if (opts->errors == FAT_ERRORS_CONT) + seq_puts(m, ",errors=continue"); + else if (opts->errors == FAT_ERRORS_PANIC) + seq_puts(m, ",errors=panic"); + else + seq_puts(m, ",errors=remount-ro"); + if (opts->discard) + seq_puts(m, ",discard"); + + return 0; +} + +enum { + Opt_check_n, 
Opt_check_r, Opt_check_s, Opt_uid, Opt_gid, + Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, + Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, + Opt_immutable, Opt_dots, Opt_nodots, + Opt_charset, Opt_shortname_lower, Opt_shortname_win95, + Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, + Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, + Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, +}; + +static const match_table_t fat_tokens = { + {Opt_check_r, "check=relaxed"}, + {Opt_check_s, "check=strict"}, + {Opt_check_n, "check=normal"}, + {Opt_check_r, "check=r"}, + {Opt_check_s, "check=s"}, + {Opt_check_n, "check=n"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, + {Opt_allow_utime, "allow_utime=%o"}, + {Opt_codepage, "codepage=%u"}, + {Opt_usefree, "usefree"}, + {Opt_nocase, "nocase"}, + {Opt_quiet, "quiet"}, + {Opt_showexec, "showexec"}, + {Opt_debug, "debug"}, + {Opt_immutable, "sys_immutable"}, + {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_discard, "discard"}, + {Opt_obsolate, "conv=binary"}, + {Opt_obsolate, "conv=text"}, + {Opt_obsolate, "conv=auto"}, + {Opt_obsolate, "conv=b"}, + {Opt_obsolate, "conv=t"}, + {Opt_obsolate, "conv=a"}, + {Opt_obsolate, "fat=%u"}, + {Opt_obsolate, "blocksize=%u"}, + {Opt_obsolate, "cvf_format=%20s"}, + {Opt_obsolate, "cvf_options=%100s"}, + {Opt_obsolate, "posix"}, + {Opt_err, NULL}, +}; +static const match_table_t msdos_tokens = { + {Opt_nodots, "nodots"}, + {Opt_nodots, "dotsOK=no"}, + {Opt_dots, "dots"}, + {Opt_dots, "dotsOK=yes"}, + {Opt_err, NULL} +}; +static const match_table_t vfat_tokens = { + {Opt_charset, "iocharset=%s"}, + {Opt_shortname_lower, "shortname=lower"}, + {Opt_shortname_win95, "shortname=win95"}, + {Opt_shortname_winnt, "shortname=winnt"}, + {Opt_shortname_mixed, "shortname=mixed"}, + {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */ + {Opt_utf8_no, "utf8=no"}, + {Opt_utf8_no, "utf8=false"}, + {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */ + {Opt_utf8_yes, "utf8=yes"}, + {Opt_utf8_yes, "utf8=true"}, + {Opt_utf8_yes, "utf8"}, + {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */ + {Opt_uni_xl_no, "uni_xlate=no"}, + {Opt_uni_xl_no, "uni_xlate=false"}, + {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */ + {Opt_uni_xl_yes, "uni_xlate=yes"}, + {Opt_uni_xl_yes, "uni_xlate=true"}, + {Opt_uni_xl_yes, "uni_xlate"}, + {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */ + {Opt_nonumtail_no, "nonumtail=no"}, + {Opt_nonumtail_no, "nonumtail=false"}, + {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */ + {Opt_nonumtail_yes, "nonumtail=yes"}, + {Opt_nonumtail_yes, "nonumtail=true"}, + {Opt_nonumtail_yes, "nonumtail"}, + {Opt_rodir, "rodir"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options, int is_vfat, + int silent, int *debug, struct fat_mount_options *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *iocharset; + + opts->isvfat = is_vfat; + + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask = opts->fs_dmask = current_umask(); + opts->allow_utime = -1; + opts->codepage = fat_default_codepage; + opts->iocharset = fat_default_iocharset; + if (is_vfat) { + opts->shortname = 
VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; + opts->rodir = 0; + } else { + opts->shortname = 0; + opts->rodir = 1; + } + opts->name_check = 'n'; + opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; + opts->utf8 = opts->unicode_xlate = 0; + opts->numtail = 1; + opts->usefree = opts->nocase = 0; + opts->tz_utc = 0; + opts->errors = FAT_ERRORS_RO; + *debug = 0; + + if (!options) + goto out; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, fat_tokens, args); + if (token == Opt_err) { + if (is_vfat) + token = match_token(p, vfat_tokens, args); + else + token = match_token(p, msdos_tokens, args); + } + switch (token) { + case Opt_check_s: + opts->name_check = 's'; + break; + case Opt_check_r: + opts->name_check = 'r'; + break; + case Opt_check_n: + opts->name_check = 'n'; + break; + case Opt_usefree: + opts->usefree = 1; + break; + case Opt_nocase: + if (!is_vfat) + opts->nocase = 1; + else { + /* for backward compatibility */ + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + } + break; + case Opt_quiet: + opts->quiet = 1; + break; + case Opt_showexec: + opts->showexec = 1; + break; + case Opt_debug: + *debug = 1; + break; + case Opt_immutable: + opts->sys_immutable = 1; + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_gid = option; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + opts->fs_fmask = opts->fs_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + opts->fs_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + opts->fs_fmask = option; + break; + case Opt_allow_utime: + if (match_octal(&args[0], &option)) + return 0; + opts->allow_utime = option & (S_IWGRP | S_IWOTH); + break; + case Opt_codepage: + if (match_int(&args[0], &option)) + return 0; + opts->codepage = option; + break; + case Opt_flush: + opts->flush = 1; + break; + case Opt_tz_utc: + opts->tz_utc = 1; + break; + case Opt_err_cont: + opts->errors = FAT_ERRORS_CONT; + break; + case Opt_err_panic: + opts->errors = FAT_ERRORS_PANIC; + break; + case Opt_err_ro: + opts->errors = FAT_ERRORS_RO; + break; + + /* msdos specific */ + case Opt_dots: + opts->dotsOK = 1; + break; + case Opt_nodots: + opts->dotsOK = 0; + break; + + /* vfat specific */ + case Opt_charset: + if (opts->iocharset != fat_default_iocharset) + kfree(opts->iocharset); + iocharset = match_strdup(&args[0]); + if (!iocharset) + return -ENOMEM; + opts->iocharset = iocharset; + break; + case Opt_shortname_lower: + opts->shortname = VFAT_SFN_DISPLAY_LOWER + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_shortname_win95: + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_shortname_winnt: + opts->shortname = VFAT_SFN_DISPLAY_WINNT + | VFAT_SFN_CREATE_WINNT; + break; + case Opt_shortname_mixed: + opts->shortname = VFAT_SFN_DISPLAY_WINNT + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_utf8_no: /* 0 or no or false */ + opts->utf8 = 0; + break; + case Opt_utf8_yes: /* empty or 1 or yes or true */ + opts->utf8 = 1; + break; + case Opt_uni_xl_no: /* 0 or no or false */ + opts->unicode_xlate = 0; + break; + case Opt_uni_xl_yes: /* empty or 1 or yes or true */ + opts->unicode_xlate = 1; + break; + case Opt_nonumtail_no: /* 0 or no or false */ + opts->numtail = 1; /* negated option */ + break; + 
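/*
+		 * "nonumtail" is a negated option: any of "nonumtail",
+		 * "nonumtail=1", "nonumtail=yes" or "nonumtail=true" from the
+		 * vfat token table lands in the case below and clears numtail,
+		 * so the generated 8.3 alias is first tried without a
+		 * "~1"-style numeric tail.
+		 */
+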
case Opt_nonumtail_yes: /* empty or 1 or yes or true */ + opts->numtail = 0; /* negated option */ + break; + case Opt_rodir: + opts->rodir = 1; + break; + case Opt_discard: + opts->discard = 1; + break; + + /* obsolete mount options */ + case Opt_obsolate: + fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " + "not supported now", p); + break; + /* unknown option */ + default: + if (!silent) { + fat_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); + } + return -EINVAL; + } + } + +out: + /* UTF-8 doesn't provide FAT semantics */ + if (!strcmp(opts->iocharset, "utf8")) { + fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" + " for FAT filesystems, filesystem will be " + "case sensitive!\n"); + } + + /* If user doesn't specify allow_utime, it's initialized from dmask. */ + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); + if (opts->unicode_xlate) + opts->utf8 = 0; + + return 0; +} + +static int fat_read_root(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int error; + + MSDOS_I(inode)->i_pos = 0; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version++; + inode->i_generation = 0; + inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + if (sbi->fat_bits == 32) { + MSDOS_I(inode)->i_start = sbi->root_cluster; + error = fat_calc_dir_size(inode); + if (error < 0) + return error; + } else { + MSDOS_I(inode)->i_start = 0; + inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); + } + inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) + & ~((loff_t)sbi->cluster_size - 1)) >> 9; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->mmu_private = inode->i_size; + + fat_save_attrs(inode, ATTR_DIR); + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; + inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode->i_nlink = fat_subdirs(inode)+2; + + return 0; +} + +/* + * Read the super block of an MS-DOS FS. + */ +int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, + void (*setup)(struct super_block *)) +{ + struct inode *root_inode = NULL, *fat_inode = NULL; + struct buffer_head *bh; + struct fat_boot_sector *b; + struct fat_boot_bsx *bsx; + struct msdos_sb_info *sbi; + u16 logical_sector_size; + u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; + int debug; + unsigned int media; + long error; + char buf[50]; + + /* + * GFP_KERNEL is ok here, because while we do hold the + * supeblock lock, memory pressure can't call back into + * the filesystem, since we're only just about to mount + * it and have no inodes etc active! 
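+	 * (Once the mount has completed and inodes are live, allocations on
+	 * paths that memory reclaim could re-enter would typically want
+	 * GFP_NOFS instead; at this point there is nothing for reclaim to
+	 * recurse into.)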
+ */ + sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + sb->s_flags |= MS_NODIRATIME; + sb->s_magic = MSDOS_SUPER_MAGIC; + sb->s_op = &fat_sops; + sb->s_export_op = &fat_export_ops; + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); + if (error) + goto out_fail; + + setup(sb); /* flavour-specific stuff that needs options */ + + error = -EIO; + sb_min_blocksize(sb, 512); + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector"); + goto out_fail; + } + + b = (struct fat_boot_sector *) bh->b_data; + if (!b->reserved) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); + brelse(bh); + goto out_invalid; + } + if (!b->fats) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + brelse(bh); + goto out_invalid; + } + + /* + * Earlier we checked here that b->secs_track and b->head are nonzero, + * but it turns out valid FAT filesystems can have zero there. + */ + + media = b->media; + if (!fat_valid_media(media)) { + if (!silent) + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", + media); + brelse(bh); + goto out_invalid; + } + logical_sector_size = get_unaligned_le16(&b->sector_size); + if (!is_power_of_2(logical_sector_size) + || (logical_sector_size < 512) + || (logical_sector_size > 4096)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", + logical_sector_size); + brelse(bh); + goto out_invalid; + } + sbi->sec_per_clus = b->sec_per_clus; + if (!is_power_of_2(sbi->sec_per_clus)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", + sbi->sec_per_clus); + brelse(bh); + goto out_invalid; + } + + if (logical_sector_size < sb->s_blocksize) { + fat_msg(sb, KERN_ERR, "logical sector size too small for device" + " (logical sector size = %u)", logical_sector_size); + brelse(bh); + goto out_fail; + } + if (logical_sector_size > sb->s_blocksize) { + brelse(bh); + + if (!sb_set_blocksize(sb, logical_sector_size)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize %u", + logical_sector_size); + goto out_fail; + } + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector" + " (logical sector size = %lu)", + sb->s_blocksize); + goto out_fail; + } + b = (struct fat_boot_sector *) bh->b_data; + } + + sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; + sbi->cluster_bits = ffs(sbi->cluster_size) - 1; + sbi->fats = b->fats; + sbi->fat_bits = 0; /* Don't know yet */ + sbi->fat_start = le16_to_cpu(b->reserved); + sbi->fat_length = le16_to_cpu(b->fat_length); + sbi->root_cluster = 0; + sbi->free_clusters = -1; /* Don't know yet */ + sbi->free_clus_valid = 0; + sbi->prev_free = FAT_START_ENT; + + if (!sbi->fat_length && b->fat32_length) { + struct fat_boot_fsinfo *fsinfo; + struct buffer_head *fsinfo_bh; + + /* Must be FAT32 */ + sbi->fat_bits = 32; + sbi->fat_length = le32_to_cpu(b->fat32_length); + sbi->root_cluster = le32_to_cpu(b->root_cluster); + + sb->s_maxbytes = 0xffffffff; + + /* MC - if info_sector is 0, don't multiply by 0 */ + sbi->fsinfo_sector = le16_to_cpu(b->info_sector); + if (sbi->fsinfo_sector == 0) + sbi->fsinfo_sector = 1; + + fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); + if (fsinfo_bh == NULL) { + fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" + " (sector = %lu)", sbi->fsinfo_sector); + brelse(bh); + goto out_fail; + } + + bsx = 
(struct fat_boot_bsx *)(bh->b_data + FAT32_BSX_OFFSET); + + fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; + if (!IS_FSINFO(fsinfo)) { + fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", + le32_to_cpu(fsinfo->signature1), + le32_to_cpu(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + if (sbi->options.usefree) + sbi->free_clus_valid = 1; + sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters); + sbi->prev_free = le32_to_cpu(fsinfo->next_cluster); + } + + brelse(fsinfo_bh); + } else { + bsx = (struct fat_boot_bsx *)(bh->b_data + FAT16_BSX_OFFSET); + } + + /* interpret volume ID as a little endian 32 bit integer */ + sbi->vol_id = (((u32)bsx->vol_id[0]) | ((u32)bsx->vol_id[1] << 8) | + ((u32)bsx->vol_id[2] << 16) | ((u32)bsx->vol_id[3] << 24)); + + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); + sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; + + sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; + sbi->dir_entries = get_unaligned_le16(&b->dir_entries); + if (sbi->dir_entries & (sbi->dir_per_block - 1)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus directroy-entries per block" + " (%u)", sbi->dir_entries); + brelse(bh); + goto out_invalid; + } + + rootdir_sectors = sbi->dir_entries + * sizeof(struct msdos_dir_entry) / sb->s_blocksize; + sbi->data_start = sbi->dir_start + rootdir_sectors; + total_sectors = get_unaligned_le16(&b->sectors); + if (total_sectors == 0) + total_sectors = le32_to_cpu(b->total_sect); + + total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; + + if (sbi->fat_bits != 32) + sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; + + /* check that FAT table does not overflow */ + fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; + total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); + if (total_clusters > MAX_FAT(sb)) { + if (!silent) + fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", + total_clusters); + brelse(bh); + goto out_invalid; + } + + sbi->max_cluster = total_clusters + FAT_START_ENT; + /* check the free_clusters, it's not necessarily correct */ + if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters) + sbi->free_clusters = -1; + /* check the prev_free, it's not necessarily correct */ + sbi->prev_free %= sbi->max_cluster; + if (sbi->prev_free < FAT_START_ENT) + sbi->prev_free = FAT_START_ENT; + + brelse(bh); + + /* set up enough so that it can read an inode */ + fat_hash_init(sb); + fat_ent_access_init(sb); + + /* + * The low byte of FAT's first entry must have same value with + * media-field. But in real world, too many devices is + * writing wrong value. So, removed that validity check. 
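+	 * (For example, a fixed-disk media byte of 0xf8 would normally be
+	 * mirrored as a first FAT entry of 0xff8, 0xfff8 or 0x0ffffff8 on
+	 * FAT12, FAT16 and FAT32 respectively, which is what the dropped
+	 * FAT_FIRST_ENT() comparison used to verify.)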
+ * + * if (FAT_FIRST_ENT(sb, media) != first) + */ + + error = -EINVAL; + sprintf(buf, "cp%d", sbi->options.codepage); + sbi->nls_disk = load_nls(buf); + if (!sbi->nls_disk) { + fat_msg(sb, KERN_ERR, "codepage %s not found", buf); + goto out_fail; + } + + /* FIXME: utf8 is using iocharset for upper/lower conversion */ + if (sbi->options.isvfat) { + sbi->nls_io = load_nls(sbi->options.iocharset); + if (!sbi->nls_io) { + fat_msg(sb, KERN_ERR, "IO charset %s not found", + sbi->options.iocharset); + goto out_fail; + } + } + + error = -ENOMEM; + fat_inode = new_inode(sb); + if (!fat_inode) + goto out_fail; + MSDOS_I(fat_inode)->i_pos = 0; + sbi->fat_inode = fat_inode; + root_inode = new_inode(sb); + if (!root_inode) + goto out_fail; + root_inode->i_ino = MSDOS_ROOT_INO; + root_inode->i_version = 1; + error = fat_read_root(root_inode); + if (error < 0) + goto out_fail; + error = -ENOMEM; + insert_inode_hash(root_inode); + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + fat_msg(sb, KERN_ERR, "get root inode failed"); + goto out_fail; + } + + return 0; + +out_invalid: + error = -EINVAL; + if (!silent) + fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); + +out_fail: + if (fat_inode) + iput(fat_inode); + if (root_inode) + iput(root_inode); + unload_nls(sbi->nls_io); + unload_nls(sbi->nls_disk); + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + sb->s_fs_info = NULL; + kfree(sbi); + return error; +} + +EXPORT_SYMBOL_GPL(fat_fill_super); + +/* + * helper function for fat_flush_inodes. This writes both the inode + * and the file data blocks, waiting for in flight data blocks before + * the start of the call. It does not wait for any io started + * during the call + */ +static int writeback_inode(struct inode *inode) +{ + + int ret; + struct address_space *mapping = inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 0, + }; + /* if we used WB_SYNC_ALL, sync_inode waits for the io for the + * inode to finish. So WB_SYNC_NONE is sent down to sync_inode + * and filemap_fdatawrite is used for the data blocks + */ + ret = sync_inode(inode, &wbc); + if (!ret) + ret = filemap_fdatawrite(mapping); + return ret; +} + +/* + * write data and metadata corresponding to i1 and i2. The io is + * started but we do not wait for any of it to finish. 
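+ * Note this is a no-op unless the "flush" mount option is set (typically
+ * used for removable media); callers such as msdos_create() and
+ * msdos_unlink() invoke fat_flush_inodes() once their directory update
+ * is finished.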
+ * + * filemap_flush is used for the block device, so if there is a dirty + * page for a block already in flight, we will not wait and start the + * io over again + */ +int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) +{ + int ret = 0; + if (!MSDOS_SB(sb)->options.flush) + return 0; + if (i1) + ret = writeback_inode(i1); + if (!ret && i2) + ret = writeback_inode(i2); + if (!ret) { + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + ret = filemap_flush(mapping); + } + return ret; +} +EXPORT_SYMBOL_GPL(fat_flush_inodes); + +static int __init init_fat_fs(void) +{ + int err; + + err = fat_cache_init(); + if (err) + return err; + + err = fat_init_inodecache(); + if (err) + goto failed; + + return 0; + +failed: + fat_cache_destroy(); + return err; +} + +static void __exit exit_fat_fs(void) +{ + fat_cache_destroy(); + fat_destroy_inodecache(); +} + +module_init(init_fat_fs) +module_exit(exit_fat_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/fat/misc.c b/fs/fat/misc.c new file mode 100644 index 00000000..6d93360c --- /dev/null +++ b/fs/fat/misc.c @@ -0,0 +1,278 @@ +/* + * linux/fs/fat/misc.c + * + * Written 1992,1993 by Werner Almesberger + * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 + * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) + */ + +#include +#include +#include +#include +#include "fat.h" + +/* + * fat_fs_error reports a file system problem that might indicate fa data + * corruption/inconsistency. Depending on 'errors' mount option the + * panic() is called, or error message is printed FAT and nothing is done, + * or filesystem is remounted read-only (default behavior). + * In case the file system is remounted read-only, it can be made writable + * again by remounting it. + */ +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) +{ + struct fat_mount_options *opts = &MSDOS_SB(sb)->options; + va_list args; + struct va_format vaf; + + if (report) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + va_end(args); + } + + if (opts->errors == FAT_ERRORS_PANIC) + panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); + else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { + sb->s_flags |= MS_RDONLY; + printk(KERN_ERR "FAT-fs (%s): Filesystem has been " + "set read-only\n", sb->s_id); + } +} +EXPORT_SYMBOL_GPL(__fat_fs_error); + +/** + * fat_msg() - print preformated FAT specific messages. Every thing what is + * not fat_fs_error() should be fat_msg(). + */ +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +/* Flushes the number of free clusters on FAT32 */ +/* XXX: Need to write one per FSINFO block. 
Currently only writes 1 */ +int fat_clusters_flush(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct fat_boot_fsinfo *fsinfo; + + if (sbi->fat_bits != 32) + return 0; + + bh = sb_bread(sb, sbi->fsinfo_sector); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); + return -EIO; + } + + fsinfo = (struct fat_boot_fsinfo *)bh->b_data; + /* Sanity check */ + if (!IS_FSINFO(fsinfo)) { + fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", + le32_to_cpu(fsinfo->signature1), + le32_to_cpu(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + if (sbi->free_clusters != -1) + fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters); + if (sbi->prev_free != -1) + fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); + mark_buffer_dirty(bh); + } + brelse(bh); + + return 0; +} + +/* + * fat_chain_add() adds a new cluster to the chain of clusters represented + * by inode. + */ +int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int ret, new_fclus, last; + + /* + * We must locate the last cluster of the file to add this new + * one (new_dclus) to the end of the link list (the FAT). + */ + last = new_fclus = 0; + if (MSDOS_I(inode)->i_start) { + int fclus, dclus; + + ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); + if (ret < 0) + return ret; + new_fclus = fclus + 1; + last = dclus; + } + + /* add new one to the last of the cluster chain */ + if (last) { + struct fat_entry fatent; + + fatent_init(&fatent); + ret = fat_ent_read(inode, &fatent, last); + if (ret >= 0) { + int wait = inode_needs_sync(inode); + ret = fat_ent_write(inode, &fatent, new_dclus, wait); + fatent_brelse(&fatent); + } + if (ret < 0) + return ret; +// fat_cache_add(inode, new_fclus, new_dclus); + } else { + MSDOS_I(inode)->i_start = new_dclus; + MSDOS_I(inode)->i_logstart = new_dclus; + /* + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. + */ + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { + ret = fat_sync_inode(inode); + if (ret) + return ret; + } else + mark_inode_dirty(inode); + } + if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { + fat_fs_error(sb, "clusters badly computed (%d != %llu)", + new_fclus, + (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); + fat_cache_inval_inode(inode); + } + inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); + + return 0; +} + +extern struct timezone sys_tz; + +/* + * The epoch of FAT timestamp is 1980. + * : bits : value + * date: 0 - 4: day (1 - 31) + * date: 5 - 8: month (1 - 12) + * date: 9 - 15: year (0 - 127) from 1980 + * time: 0 - 4: sec (0 - 29) 2sec counts + * time: 5 - 10: min (0 - 59) + * time: 11 - 15: hour (0 - 23) + */ +#define SECS_PER_MIN 60 +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) +/* days between 1.1.70 and 1.1.80 (2 leap days) */ +#define DAYS_DELTA (365 * 10 + 2) +/* 120 (2100 - 1980) isn't leap year */ +#define YEAR_2100 120 +#define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) + +/* Linear day numbers of the respective 1sts in non-leap years. */ +static time_t days_in_year[] = { + /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ + 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, +}; + +/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). 
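+   For example, with tz_utc set, a date word of 0x0021 (day 1, month 1,
+   year 0 = 1980) and a time word of 0x6000 (12:00:00) give
+   3652 * 86400 + 43200 = 315576000 seconds.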
*/ +void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 __time, __le16 __date, u8 time_cs) +{ + u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); + time_t second, day, leap_day, month, year; + + year = date >> 9; + month = max(1, (date >> 5) & 0xf); + day = max(1, date & 0x1f) - 1; + + leap_day = (year + 3) / 4; + if (year > YEAR_2100) /* 2100 isn't leap year */ + leap_day--; + if (IS_LEAP_YEAR(year) && month > 2) + leap_day++; + + second = (time & 0x1f) << 1; + second += ((time >> 5) & 0x3f) * SECS_PER_MIN; + second += (time >> 11) * SECS_PER_HOUR; + second += (year * 365 + leap_day + + days_in_year[month] + day + + DAYS_DELTA) * SECS_PER_DAY; + + if (!sbi->options.tz_utc) + second += sys_tz.tz_minuteswest * SECS_PER_MIN; + + if (time_cs) { + ts->tv_sec = second + (time_cs / 100); + ts->tv_nsec = (time_cs % 100) * 10000000; + } else { + ts->tv_sec = second; + ts->tv_nsec = 0; + } +} + +/* Convert linear UNIX date to a FAT time/date pair. */ +void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 *time, __le16 *date, u8 *time_cs) +{ + struct tm tm; + time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 : + -sys_tz.tz_minuteswest * 60, &tm); + + /* FAT can only support year between 1980 to 2107 */ + if (tm.tm_year < 1980 - 1900) { + *time = 0; + *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); + if (time_cs) + *time_cs = 0; + return; + } + if (tm.tm_year > 2107 - 1900) { + *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); + *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); + if (time_cs) + *time_cs = 199; + return; + } + + /* from 1900 -> from 1980 */ + tm.tm_year -= 80; + /* 0~11 -> 1~12 */ + tm.tm_mon++; + /* 0~59 -> 0~29(2sec counts) */ + tm.tm_sec >>= 1; + + *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec); + *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday); + if (time_cs) + *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; +} +EXPORT_SYMBOL_GPL(fat_time_unix2fat); + +int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) +{ + int i, err = 0; + + for (i = 0; i < nr_bhs; i++) + write_dirty_buffer(bhs[i], WRITE); + + for (i = 0; i < nr_bhs; i++) { + wait_on_buffer(bhs[i]); + if (!err && !buffer_uptodate(bhs[i])) + err = -EIO; + } + return err; +} diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c new file mode 100644 index 00000000..3b222daf --- /dev/null +++ b/fs/fat/namei_msdos.c @@ -0,0 +1,702 @@ +/* + * linux/fs/msdos/namei.c + * + * Written 1992,1993 by Werner Almesberger + * Hidden files 1995 by Albert Cahalan + * Rewritten for constant inumbers 1999 by Al Viro + */ + +#include +#include +#include +#include "fat.h" + +/* Characters that are undesirable in an MS-DOS file name */ +static unsigned char bad_chars[] = "*?<>|\""; +static unsigned char bad_if_strict[] = "+=,; "; + +/***** Formats an MS-DOS file name. Rejects invalid names. */ +static int msdos_format_name(const unsigned char *name, int len, + unsigned char *res, struct fat_mount_options *opts) + /* + * name is the proposed name, len is its length, res is + * the resulting name, opts->name_check is either (r)elaxed, + * (n)ormal or (s)trict, opts->dotsOK allows dots at the + * beginning of name (for hidden files) + */ +{ + unsigned char *walk; + unsigned char c; + int space; + + if (name[0] == '.') { /* dotfile because . and .. 
already done */ + if (opts->dotsOK) { + /* Get rid of dot - test for it elsewhere */ + name++; + len--; + } else + return -EINVAL; + } + /* + * disallow names that _really_ start with a dot + */ + space = 1; + c = 0; + for (walk = res; len && walk - res < 8; walk++) { + c = *name++; + len--; + if (opts->name_check != 'r' && strchr(bad_chars, c)) + return -EINVAL; + if (opts->name_check == 's' && strchr(bad_if_strict, c)) + return -EINVAL; + if (c >= 'A' && c <= 'Z' && opts->name_check == 's') + return -EINVAL; + if (c < ' ' || c == ':' || c == '\\') + return -EINVAL; + /* + * 0xE5 is legal as a first character, but we must substitute + * 0x05 because 0xE5 marks deleted files. Yes, DOS really + * does this. + * It seems that Microsoft hacked DOS to support non-US + * characters after the 0xE5 character was already in use to + * mark deleted files. + */ + if ((res == walk) && (c == 0xE5)) + c = 0x05; + if (c == '.') + break; + space = (c == ' '); + *walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c; + } + if (space) + return -EINVAL; + if (opts->name_check == 's' && len && c != '.') { + c = *name++; + len--; + if (c != '.') + return -EINVAL; + } + while (c != '.' && len--) + c = *name++; + if (c == '.') { + while (walk - res < 8) + *walk++ = ' '; + while (len > 0 && walk - res < MSDOS_NAME) { + c = *name++; + len--; + if (opts->name_check != 'r' && strchr(bad_chars, c)) + return -EINVAL; + if (opts->name_check == 's' && + strchr(bad_if_strict, c)) + return -EINVAL; + if (c < ' ' || c == ':' || c == '\\') + return -EINVAL; + if (c == '.') { + if (opts->name_check == 's') + return -EINVAL; + break; + } + if (c >= 'A' && c <= 'Z' && opts->name_check == 's') + return -EINVAL; + space = c == ' '; + if (!opts->nocase && c >= 'a' && c <= 'z') + *walk++ = c - 32; + else + *walk++ = c; + } + if (space) + return -EINVAL; + if (opts->name_check == 's' && len) + return -EINVAL; + } + while (walk - res < MSDOS_NAME) + *walk++ = ' '; + + return 0; +} + +/***** Locates a directory entry. Uses unformatted name. */ +static int msdos_find(struct inode *dir, const unsigned char *name, int len, + struct fat_slot_info *sinfo) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + unsigned char msdos_name[MSDOS_NAME]; + int err; + + err = msdos_format_name(name, len, msdos_name, &sbi->options); + if (err) + return -ENOENT; + + err = fat_scan(dir, msdos_name, sinfo); + if (!err && sbi->options.dotsOK) { + if (name[0] == '.') { + if (!(sinfo->de->attr & ATTR_HIDDEN)) + err = -ENOENT; + } else { + if (sinfo->de->attr & ATTR_HIDDEN) + err = -ENOENT; + } + if (err) + brelse(sinfo->bh); + } + return err; +} + +/* + * Compute the hash for the msdos name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The msdos fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int msdos_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; + unsigned char msdos_name[MSDOS_NAME]; + int error; + + error = msdos_format_name(qstr->name, qstr->len, msdos_name, options); + if (!error) + qstr->hash = full_name_hash(msdos_name, MSDOS_NAME); + return 0; +} + +/* + * Compare two msdos names. If either of the names are invalid, + * we fall back to doing the standard name comparison. 
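+ * For example, "foo.bar" and "FOO.bar" both format to the same 11-byte
+ * "FOO     BAR" entry (unless "nocase" is set), so they compare equal
+ * here.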
+ */ +static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; + unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; + int error; + + error = msdos_format_name(name->name, name->len, a_msdos_name, options); + if (error) + goto old_compare; + error = msdos_format_name(str, len, b_msdos_name, options); + if (error) + goto old_compare; + error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); +out: + return error; + +old_compare: + error = 1; + if (name->len == len) + error = memcmp(name->name, str, len); + goto out; +} + +static const struct dentry_operations msdos_dentry_operations = { + .d_hash = msdos_hash, + .d_compare = msdos_cmp, +}; + +/* + * AV. Wrappers for FAT sb operations. Is it wise? + */ + +/***** Get inode using directory and name */ +static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + int err; + + lock_super(sb); + + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto error; + } + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } +out: + unlock_super(sb); + return d_splice_alias(inode, dentry); + +error: + unlock_super(sb); + return ERR_PTR(err); +} + +/***** Creates a directory entry (name is already formatted). */ +static int msdos_add_entry(struct inode *dir, const unsigned char *name, + int is_dir, int is_hid, int cluster, + struct timespec *ts, struct fat_slot_info *sinfo) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + struct msdos_dir_entry de; + __le16 time, date; + int err; + + memcpy(de.name, name, MSDOS_NAME); + de.attr = is_dir ? ATTR_DIR : ATTR_ARCH; + if (is_hid) + de.attr |= ATTR_HIDDEN; + de.lcase = 0; + fat_time_unix2fat(sbi, ts, &time, &date, NULL); + de.cdate = de.adate = 0; + de.ctime = 0; + de.ctime_cs = 0; + de.time = time; + de.date = date; + de.start = cpu_to_le16(cluster); + de.starthi = cpu_to_le16(cluster >> 16); + de.size = 0; + + err = fat_add_entries(dir, &de, 1, sinfo); + if (err) + return err; + + dir->i_ctime = dir->i_mtime = *ts; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); + + return 0; +} + +/***** Create a file */ +static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = NULL; + struct fat_slot_info sinfo; + struct timespec ts; + unsigned char msdos_name[MSDOS_NAME]; + int err, is_hid; + + lock_super(sb); + + err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, + msdos_name, &MSDOS_SB(sb)->options); + if (err) + goto out; + is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.'); + /* Have to do it due to foo vs. 
.foo conflicts */ + if (!fat_scan(dir, msdos_name, &sinfo)) { + brelse(sinfo.bh); + err = -EINVAL; + goto out; + } + + ts = CURRENT_TIME_SEC; + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); + if (err) + goto out; + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + return err; +} + +/***** Remove a directory */ +static int msdos_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = dentry->d_inode; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + /* + * Check whether the directory is not in use, then check + * whether it is empty. + */ + err = fat_dir_empty(inode); + if (err) + goto out; + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + drop_nlink(dir); + + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + + return err; +} + +/***** Make a directory */ +static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + unsigned char msdos_name[MSDOS_NAME]; + struct timespec ts; + int err, is_hid, cluster; + + lock_super(sb); + + err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, + msdos_name, &MSDOS_SB(sb)->options); + if (err) + goto out; + is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.'); + /* foo vs .foo situation */ + if (!fat_scan(dir, msdos_name, &sinfo)) { + brelse(sinfo.bh); + err = -EINVAL; + goto out; + } + + ts = CURRENT_TIME_SEC; + cluster = fat_alloc_new_dir(dir, &ts); + if (cluster < 0) { + err = cluster; + goto out; + } + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); + if (err) + goto out_free; + inc_nlink(dir); + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + /* the directory was completed, just return a error */ + goto out; + } + inode->i_nlink = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ + + d_instantiate(dentry, inode); + + unlock_super(sb); + fat_flush_inodes(sb, dir, inode); + return 0; + +out_free: + fat_free_clusters(dir, cluster); +out: + unlock_super(sb); + return err; +} + +/***** Unlink a file */ +static int msdos_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb= inode->i_sb; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + + return err; +} + +static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, + struct dentry *old_dentry, + struct inode *new_dir, unsigned char *new_name, + struct dentry *new_dentry, int is_hid) +{ + struct buffer_head *dotdot_bh; + struct msdos_dir_entry *dotdot_de; + struct inode *old_inode, *new_inode; + struct fat_slot_info old_sinfo, sinfo; + struct timespec ts; + loff_t dotdot_i_pos, new_i_pos; + int err, old_attrs, is_dir, update_dotdot, corrupt = 0; + + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + + err = fat_scan(old_dir, old_name, &old_sinfo); + if (err) { + err = -EIO; + goto out; + } + + is_dir = S_ISDIR(old_inode->i_mode); + update_dotdot = (is_dir && old_dir != new_dir); + if (update_dotdot) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, + &dotdot_i_pos) < 0) { + err = -EIO; + goto out; + } + } + + old_attrs = MSDOS_I(old_inode)->i_attrs; + err = fat_scan(new_dir, new_name, &sinfo); + if (!err) { + if (!new_inode) { + /* "foo" -> ".foo" case. 
just change the ATTR_HIDDEN */ + if (sinfo.de != old_sinfo.de) { + err = -EINVAL; + goto out; + } + if (is_hid) + MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN; + else + MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN; + if (IS_DIRSYNC(old_dir)) { + err = fat_sync_inode(old_inode); + if (err) { + MSDOS_I(old_inode)->i_attrs = old_attrs; + goto out; + } + } else + mark_inode_dirty(old_inode); + + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + goto out; + } + } + + ts = CURRENT_TIME_SEC; + if (new_inode) { + if (err) + goto out; + if (is_dir) { + err = fat_dir_empty(new_inode); + if (err) + goto out; + } + new_i_pos = MSDOS_I(new_inode)->i_pos; + fat_detach(new_inode); + } else { + err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0, + &ts, &sinfo); + if (err) + goto out; + new_i_pos = sinfo.i_pos; + } + new_dir->i_version++; + + fat_detach(old_inode); + fat_attach(old_inode, new_i_pos); + if (is_hid) + MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN; + else + MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN; + if (IS_DIRSYNC(new_dir)) { + err = fat_sync_inode(old_inode); + if (err) + goto error_inode; + } else + mark_inode_dirty(old_inode); + + if (update_dotdot) { + int start = MSDOS_I(new_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + if (IS_DIRSYNC(new_dir)) { + err = sync_dirty_buffer(dotdot_bh); + if (err) + goto error_dotdot; + } + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + err = fat_remove_entries(old_dir, &old_sinfo); /* and releases bh */ + old_sinfo.bh = NULL; + if (err) + goto error_dotdot; + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = ts; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + drop_nlink(new_inode); + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = ts; + } +out: + brelse(sinfo.bh); + brelse(dotdot_bh); + brelse(old_sinfo.bh); + return err; + +error_dotdot: + /* data cluster is shared, serious corruption */ + corrupt = 1; + + if (update_dotdot) { + int start = MSDOS_I(old_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + corrupt |= sync_dirty_buffer(dotdot_bh); + } +error_inode: + fat_detach(old_inode); + fat_attach(old_inode, old_sinfo.i_pos); + MSDOS_I(old_inode)->i_attrs = old_attrs; + if (new_inode) { + fat_attach(new_inode, new_i_pos); + if (corrupt) + corrupt |= fat_sync_inode(new_inode); + } else { + /* + * If new entry was not sharing the data cluster, it + * shouldn't be serious corruption. 
+ */ + int err2 = fat_remove_entries(new_dir, &sinfo); + if (corrupt) + corrupt |= err2; + sinfo.bh = NULL; + } + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld)", + __func__, sinfo.i_pos); + } + goto out; +} + +/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ +static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; + int err, is_hid; + + lock_super(sb); + + err = msdos_format_name(old_dentry->d_name.name, + old_dentry->d_name.len, old_msdos_name, + &MSDOS_SB(old_dir->i_sb)->options); + if (err) + goto out; + err = msdos_format_name(new_dentry->d_name.name, + new_dentry->d_name.len, new_msdos_name, + &MSDOS_SB(new_dir->i_sb)->options); + if (err) + goto out; + + is_hid = + (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.'); + + err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, + new_dir, new_msdos_name, new_dentry, is_hid); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, old_dir, new_dir); + return err; +} + +static const struct inode_operations msdos_dir_inode_operations = { + .create = msdos_create, + .lookup = msdos_lookup, + .unlink = msdos_unlink, + .mkdir = msdos_mkdir, + .rmdir = msdos_rmdir, + .rename = msdos_rename, + .setattr = fat_setattr, + .getattr = fat_getattr, +}; + +static void setup(struct super_block *sb) +{ + MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; + sb->s_d_op = &msdos_dentry_operations; + sb->s_flags |= MS_NOATIME; +} + +static int msdos_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, 0, setup); +} + +static struct dentry *msdos_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super); +} + +static struct file_system_type msdos_fs_type = { + .owner = THIS_MODULE, + .name = "msdos", + .mount = msdos_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_msdos_fs(void) +{ + return register_filesystem(&msdos_fs_type); +} + +static void __exit exit_msdos_fs(void) +{ + unregister_filesystem(&msdos_fs_type); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Werner Almesberger"); +MODULE_DESCRIPTION("MS-DOS filesystem support"); + +module_init(init_msdos_fs) +module_exit(exit_msdos_fs) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c new file mode 100644 index 00000000..20b4ea53 --- /dev/null +++ b/fs/fat/namei_vfat.c @@ -0,0 +1,1110 @@ +/* + * linux/fs/vfat/namei.c + * + * Written 1992,1993 by Werner Almesberger + * + * Windows95/Windows NT compatible extended MSDOS filesystem + * by Gordon Chaffee Copyright (C) 1995. Send bug reports for the + * VFAT filesystem to . Specify + * what file operation caused you trouble and if you can duplicate + * the problem, send a script that demonstrates it. + * + * Short name translation 1999, 2001 by Wolfram Pienkoss + * + * Support Multibyte characters and cleanup by + * OGAWA Hirofumi + */ + +#include +#include +#include +#include +#include +#include +#include "fat.h" + +/* + * If new entry was created in the parent, it could create the 8.3 + * alias (the shortname of logname). So, the parent may have the + * negative-dentry which matches the created 8.3 alias. 
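+ * (For example, creating "A Long Name.txt" may also create the 8.3
+ * alias "ALONGN~1.TXT"; an earlier failed lookup of that alias would
+ * have left a stale negative dentry in the parent directory.)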
+ * + * If it happened, the negative dentry isn't actually negative + * anymore. So, drop it. + */ +static int vfat_revalidate_shortname(struct dentry *dentry) +{ + int ret = 1; + spin_lock(&dentry->d_lock); + if (dentry->d_time != dentry->d_parent->d_inode->i_version) + ret = 0; + spin_unlock(&dentry->d_lock); + return ret; +} + +static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + /* This is not negative dentry. Always valid. */ + if (dentry->d_inode) + return 1; + return vfat_revalidate_shortname(dentry); +} + +static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) +{ + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (dentry->d_inode) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + + return vfat_revalidate_shortname(dentry); +} + +/* returns the length of a struct qstr, ignoring trailing dots */ +static unsigned int __vfat_striptail_len(unsigned int len, const char *name) +{ + while (len && name[len - 1] == '.') + len--; + return len; +} + +static unsigned int vfat_striptail_len(const struct qstr *qstr) +{ + return __vfat_striptail_len(qstr->len, qstr->name); +} + +/* + * Compute the hash for the vfat name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The vfat fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int vfat_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); + return 0; +} + +/* + * Compute the hash for the vfat name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The vfat fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; + const unsigned char *name; + unsigned int len; + unsigned long hash; + + name = qstr->name; + len = vfat_striptail_len(qstr); + + hash = init_name_hash(); + while (len--) + hash = partial_name_hash(nls_tolower(t, *name++), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Case insensitive compare of two vfat names. + */ +static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; + unsigned int alen, blen; + + /* A filename cannot end in '.' 
or we treat it like it has none */ + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (nls_strnicmp(t, name->name, str, alen) == 0) + return 0; + } + return 1; +} + +/* + * Case sensitive compare of two vfat names. + */ +static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + unsigned int alen, blen; + + /* A filename cannot end in '.' or we treat it like it has none */ + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (strncmp(name->name, str, alen) == 0) + return 0; + } + return 1; +} + +static const struct dentry_operations vfat_ci_dentry_ops = { + .d_revalidate = vfat_revalidate_ci, + .d_hash = vfat_hashi, + .d_compare = vfat_cmpi, +}; + +static const struct dentry_operations vfat_dentry_ops = { + .d_revalidate = vfat_revalidate, + .d_hash = vfat_hash, + .d_compare = vfat_cmp, +}; + +/* Characters that are undesirable in an MS-DOS file name */ + +static inline wchar_t vfat_bad_char(wchar_t w) +{ + return (w < 0x0020) + || (w == '*') || (w == '?') || (w == '<') || (w == '>') + || (w == '|') || (w == '"') || (w == ':') || (w == '/') + || (w == '\\'); +} + +static inline wchar_t vfat_replace_char(wchar_t w) +{ + return (w == '[') || (w == ']') || (w == ';') || (w == ',') + || (w == '+') || (w == '='); +} + +static wchar_t vfat_skip_char(wchar_t w) +{ + return (w == '.') || (w == ' '); +} + +static inline int vfat_is_used_badchars(const wchar_t *s, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (vfat_bad_char(s[i])) + return -EINVAL; + + if (s[i - 1] == ' ') /* last character cannot be space */ + return -EINVAL; + + return 0; +} + +static int vfat_find_form(struct inode *dir, unsigned char *name) +{ + struct fat_slot_info sinfo; + int err = fat_scan(dir, name, &sinfo); + if (err) + return -ENOENT; + brelse(sinfo.bh); + return 0; +} + +/* + * 1) Valid characters for the 8.3 format alias are any combination of + * letters, uppercase alphabets, digits, any of the + * following special characters: + * $ % ' ` - @ { } ~ ! # ( ) & _ ^ + * In this case Longfilename is not stored in disk. + * + * WinNT's Extension: + * File name and extension name is contain uppercase/lowercase + * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT. + * + * 2) File name is 8.3 format, but it contain the uppercase and + * lowercase char, muliti bytes char, etc. In this case numtail is not + * added, but Longfilename is stored. + * + * 3) When the one except for the above, or the following special + * character are contained: + * . [ ] ; , + = + * numtail is added, and Longfilename must be stored in disk . 
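+ * For example, "foo+bar.txt" falls under 3): '+' is replaced by '_',
+ * a numeric tail is appended, typically giving "FOO_BA~1.TXT", and the
+ * long name is stored in LFN slots as well.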
+ */ +struct shortname_info { + unsigned char lower:1, + upper:1, + valid:1; +}; +#define INIT_SHORTNAME_INFO(x) do { \ + (x)->lower = 1; \ + (x)->upper = 1; \ + (x)->valid = 1; \ +} while (0) + +static inline int to_shortname_char(struct nls_table *nls, + unsigned char *buf, int buf_size, + wchar_t *src, struct shortname_info *info) +{ + int len; + + if (vfat_skip_char(*src)) { + info->valid = 0; + return 0; + } + if (vfat_replace_char(*src)) { + info->valid = 0; + buf[0] = '_'; + return 1; + } + + len = nls->uni2char(*src, buf, buf_size); + if (len <= 0) { + info->valid = 0; + buf[0] = '_'; + len = 1; + } else if (len == 1) { + unsigned char prev = buf[0]; + + if (buf[0] >= 0x7F) { + info->lower = 0; + info->upper = 0; + } + + buf[0] = nls_toupper(nls, buf[0]); + if (isalpha(buf[0])) { + if (buf[0] == prev) + info->lower = 0; + else + info->upper = 0; + } + } else { + info->lower = 0; + info->upper = 0; + } + + return len; +} + +/* + * Given a valid longname, create a unique shortname. Make sure the + * shortname does not exist + * Returns negative number on error, 0 for a normal + * return, and 1 for valid shortname + */ +static int vfat_create_shortname(struct inode *dir, struct nls_table *nls, + wchar_t *uname, int ulen, + unsigned char *name_res, unsigned char *lcase) +{ + struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; + wchar_t *ip, *ext_start, *end, *name_start; + unsigned char base[9], ext[4], buf[5], *p; + unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; + int chl, chi; + int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; + int is_shortname; + struct shortname_info base_info, ext_info; + + is_shortname = 1; + INIT_SHORTNAME_INFO(&base_info); + INIT_SHORTNAME_INFO(&ext_info); + + /* Now, we need to create a shortname from the long name */ + ext_start = end = &uname[ulen]; + while (--ext_start >= uname) { + if (*ext_start == 0x002E) { /* is `.' */ + if (ext_start == end - 1) { + sz = ulen; + ext_start = NULL; + } + break; + } + } + + if (ext_start == uname - 1) { + sz = ulen; + ext_start = NULL; + } else if (ext_start) { + /* + * Names which start with a dot could be just + * an extension eg. "...test". In this case Win95 + * uses the extension as the name and sets no extension. 
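+		 * A name such as "...test" therefore ends up as base "TEST"
+		 * with no extension, plus a numeric tail (e.g. "TEST~1"),
+		 * since the skipped dots make it an invalid plain short name.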
+ */ + name_start = &uname[0]; + while (name_start < ext_start) { + if (!vfat_skip_char(*name_start)) + break; + name_start++; + } + if (name_start != ext_start) { + sz = ext_start - uname; + ext_start++; + } else { + sz = ulen; + ext_start = NULL; + } + } + + numtail_baselen = 6; + numtail2_baselen = 2; + for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) { + chl = to_shortname_char(nls, charbuf, sizeof(charbuf), + ip, &base_info); + if (chl == 0) + continue; + + if (baselen < 2 && (baselen + chl) > 2) + numtail2_baselen = baselen; + if (baselen < 6 && (baselen + chl) > 6) + numtail_baselen = baselen; + for (chi = 0; chi < chl; chi++) { + *p++ = charbuf[chi]; + baselen++; + if (baselen >= 8) + break; + } + if (baselen >= 8) { + if ((chi < chl - 1) || (ip + 1) - uname < sz) + is_shortname = 0; + break; + } + } + if (baselen == 0) { + return -EINVAL; + } + + extlen = 0; + if (ext_start) { + for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) { + chl = to_shortname_char(nls, charbuf, sizeof(charbuf), + ip, &ext_info); + if (chl == 0) + continue; + + if ((extlen + chl) > 3) { + is_shortname = 0; + break; + } + for (chi = 0; chi < chl; chi++) { + *p++ = charbuf[chi]; + extlen++; + } + if (extlen >= 3) { + if (ip + 1 != end) + is_shortname = 0; + break; + } + } + } + ext[extlen] = '\0'; + base[baselen] = '\0'; + + /* Yes, it can happen. ".\xe5" would do it. */ + if (base[0] == DELETED_FLAG) + base[0] = 0x05; + + /* OK, at this point we know that base is not longer than 8 symbols, + * ext is not longer than 3, base is nonempty, both don't contain + * any bad symbols (lowercase transformed to uppercase). + */ + + memset(name_res, ' ', MSDOS_NAME); + memcpy(name_res, base, baselen); + memcpy(name_res + 8, ext, extlen); + *lcase = 0; + if (is_shortname && base_info.valid && ext_info.valid) { + if (vfat_find_form(dir, name_res) == 0) + return -EEXIST; + + if (opts->shortname & VFAT_SFN_CREATE_WIN95) { + return (base_info.upper && ext_info.upper); + } else if (opts->shortname & VFAT_SFN_CREATE_WINNT) { + if ((base_info.upper || base_info.lower) && + (ext_info.upper || ext_info.lower)) { + if (!base_info.upper && base_info.lower) + *lcase |= CASE_LOWER_BASE; + if (!ext_info.upper && ext_info.lower) + *lcase |= CASE_LOWER_EXT; + return 1; + } + return 0; + } else { + BUG(); + } + } + + if (opts->numtail == 0) + if (vfat_find_form(dir, name_res) < 0) + return 0; + + /* + * Try to find a unique extension. This used to + * iterate through all possibilities sequentially, + * but that gave extremely bad performance. Windows + * only tries a few cases before using random + * values for part of the base. 
+ */ + + if (baselen > 6) { + baselen = numtail_baselen; + name_res[7] = ' '; + } + name_res[baselen] = '~'; + for (i = 1; i < 10; i++) { + name_res[baselen + 1] = i + '0'; + if (vfat_find_form(dir, name_res) < 0) + return 0; + } + + i = jiffies; + sz = (jiffies >> 16) & 0x7; + if (baselen > 2) { + baselen = numtail2_baselen; + name_res[7] = ' '; + } + name_res[baselen + 4] = '~'; + name_res[baselen + 5] = '1' + sz; + while (1) { + snprintf(buf, sizeof(buf), "%04X", i & 0xffff); + memcpy(&name_res[baselen], buf, 4); + if (vfat_find_form(dir, name_res) < 0) + break; + i -= 11; + } + return 0; +} + +/* Translate a string, including coded sequences into Unicode */ +static int +xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, + int *longlen, int *outlen, int escape, int utf8, + struct nls_table *nls) +{ + const unsigned char *ip; + unsigned char nc; + unsigned char *op; + unsigned int ec; + int i, k, fill; + int charlen; + + if (utf8) { + *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + if (*outlen < 0) + return *outlen; + else if (*outlen > FAT_LFN_LEN) + return -ENAMETOOLONG; + + op = &outname[*outlen * sizeof(wchar_t)]; + } else { + if (nls) { + for (i = 0, ip = name, op = outname, *outlen = 0; + i < len && *outlen <= FAT_LFN_LEN; + *outlen += 1) + { + if (escape && (*ip == ':')) { + if (i > len - 5) + return -EINVAL; + ec = 0; + for (k = 1; k < 5; k++) { + nc = ip[k]; + ec <<= 4; + if (nc >= '0' && nc <= '9') { + ec |= nc - '0'; + continue; + } + if (nc >= 'a' && nc <= 'f') { + ec |= nc - ('a' - 10); + continue; + } + if (nc >= 'A' && nc <= 'F') { + ec |= nc - ('A' - 10); + continue; + } + return -EINVAL; + } + *op++ = ec & 0xFF; + *op++ = ec >> 8; + ip += 5; + i += 5; + } else { + if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0) + return -EINVAL; + ip += charlen; + i += charlen; + op += 2; + } + } + if (i < len) + return -ENAMETOOLONG; + } else { + for (i = 0, ip = name, op = outname, *outlen = 0; + i < len && *outlen <= FAT_LFN_LEN; + i++, *outlen += 1) + { + *op++ = *ip++; + *op++ = 0; + } + if (i < len) + return -ENAMETOOLONG; + } + } + + *longlen = *outlen; + if (*outlen % 13) { + *op++ = 0; + *op++ = 0; + *outlen += 1; + if (*outlen % 13) { + fill = 13 - (*outlen % 13); + for (i = 0; i < fill; i++) { + *op++ = 0xff; + *op++ = 0xff; + } + *outlen += fill; + } + } + + return 0; +} + +static int vfat_build_slots(struct inode *dir, const unsigned char *name, + int len, int is_dir, int cluster, + struct timespec *ts, + struct msdos_dir_slot *slots, int *nr_slots) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + struct fat_mount_options *opts = &sbi->options; + struct msdos_dir_slot *ps; + struct msdos_dir_entry *de; + unsigned char cksum, lcase; + unsigned char msdos_name[MSDOS_NAME]; + wchar_t *uname; + __le16 time, date; + u8 time_cs; + int err, ulen, usize, i; + loff_t offset; + + *nr_slots = 0; + + uname = __getname(); + if (!uname) + return -ENOMEM; + + err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize, + opts->unicode_xlate, opts->utf8, sbi->nls_io); + if (err) + goto out_free; + + err = vfat_is_used_badchars(uname, ulen); + if (err) + goto out_free; + + err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen, + msdos_name, &lcase); + if (err < 0) + goto out_free; + else if (err == 1) { + de = (struct msdos_dir_entry *)slots; + err = 0; + goto shortname; + } + + /* build the entry of long file name */ + cksum = fat_checksum(msdos_name); + + *nr_slots = usize / 13; + for (ps = slots, i = *nr_slots; i > 0; 
i--, ps++) { + ps->id = i; + ps->attr = ATTR_EXT; + ps->reserved = 0; + ps->alias_checksum = cksum; + ps->start = 0; + offset = (i - 1) * 13; + fatwchar_to16(ps->name0_4, uname + offset, 5); + fatwchar_to16(ps->name5_10, uname + offset + 5, 6); + fatwchar_to16(ps->name11_12, uname + offset + 11, 2); + } + slots[0].id |= 0x40; + de = (struct msdos_dir_entry *)ps; + +shortname: + /* build the entry of 8.3 alias name */ + (*nr_slots)++; + memcpy(de->name, msdos_name, MSDOS_NAME); + de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; + de->lcase = lcase; + fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); + de->time = de->ctime = time; + de->date = de->cdate = de->adate = date; + de->ctime_cs = time_cs; + de->start = cpu_to_le16(cluster); + de->starthi = cpu_to_le16(cluster >> 16); + de->size = 0; +out_free: + __putname(uname); + return err; +} + +static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, + int cluster, struct timespec *ts, + struct fat_slot_info *sinfo) +{ + struct msdos_dir_slot *slots; + unsigned int len; + int err, nr_slots; + + len = vfat_striptail_len(qname); + if (len == 0) + return -ENOENT; + + slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); + if (slots == NULL) + return -ENOMEM; + + err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts, + slots, &nr_slots); + if (err) + goto cleanup; + + err = fat_add_entries(dir, slots, nr_slots, sinfo); + if (err) + goto cleanup; + + /* update timestamp */ + dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +cleanup: + kfree(slots); + return err; +} + +static int vfat_find(struct inode *dir, struct qstr *qname, + struct fat_slot_info *sinfo) +{ + unsigned int len = vfat_striptail_len(qname); + if (len == 0) + return -ENOENT; + return fat_search_long(dir, qname->name, len, sinfo); +} + +/* + * (nfsd's) anonymous disconnected dentry? + * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job). + */ +static int vfat_d_anon_disconn(struct dentry *dentry) +{ + return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); +} + +static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + struct dentry *alias; + int err; + + lock_super(sb); + + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto error; + } + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + + alias = d_find_alias(inode); + if (alias && !vfat_d_anon_disconn(alias)) { + /* + * This inode has non anonymous-DCACHE_DISCONNECTED + * dentry. This means, the user did ->lookup() by an + * another name (longname vs 8.3 alias of it) in past. + * + * Switch to new one for reason of locality if possible. 
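+ * d_move() below transplants the existing alias onto the name used
+ * for this lookup, so later lookups under that name hit the dcache.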
+ */ + BUG_ON(d_unhashed(alias)); + if (!S_ISDIR(inode->i_mode)) + d_move(alias, dentry); + iput(inode); + unlock_super(sb); + return alias; + } else + dput(alias); + +out: + unlock_super(sb); + dentry->d_time = dentry->d_parent->d_inode->i_version; + dentry = d_splice_alias(inode, dentry); + if (dentry) + dentry->d_time = dentry->d_parent->d_inode->i_version; + return dentry; + +error: + unlock_super(sb); + return ERR_PTR(err); +} + +static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct fat_slot_info sinfo; + struct timespec ts; + int err; + + lock_super(sb); + + ts = CURRENT_TIME_SEC; + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); + if (err) + goto out; + dir->i_version++; + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + inode->i_version++; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + dentry->d_time = dentry->d_parent->d_inode->i_version; + d_instantiate(dentry, inode); +out: + unlock_super(sb); + return err; +} + +static int vfat_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + + err = fat_dir_empty(inode); + if (err) + goto out; + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + drop_nlink(dir); + + clear_nlink(inode); + inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + + return err; +} + +static int vfat_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + clear_nlink(inode); + inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + + return err; +} + +static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct fat_slot_info sinfo; + struct timespec ts; + int err, cluster; + + lock_super(sb); + + ts = CURRENT_TIME_SEC; + cluster = fat_alloc_new_dir(dir, &ts); + if (cluster < 0) { + err = cluster; + goto out; + } + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); + if (err) + goto out_free; + dir->i_version++; + inc_nlink(dir); + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + /* the directory was completed, just return a error */ + goto out; + } + inode->i_version++; + inode->i_nlink = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. 
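+ * The same ts was stored in the on-disk entry by vfat_add_entry(),
+ * so the in-core timestamps already match the media.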
*/ + + dentry->d_time = dentry->d_parent->d_inode->i_version; + d_instantiate(dentry, inode); + + unlock_super(sb); + return 0; + +out_free: + fat_free_clusters(dir, cluster); +out: + unlock_super(sb); + return err; +} + +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *dotdot_bh; + struct msdos_dir_entry *dotdot_de; + struct inode *old_inode, *new_inode; + struct fat_slot_info old_sinfo, sinfo; + struct timespec ts; + loff_t dotdot_i_pos, new_i_pos; + int err, is_dir, update_dotdot, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + lock_super(sb); + err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); + if (err) + goto out; + + is_dir = S_ISDIR(old_inode->i_mode); + update_dotdot = (is_dir && old_dir != new_dir); + if (update_dotdot) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, + &dotdot_i_pos) < 0) { + err = -EIO; + goto out; + } + } + + ts = CURRENT_TIME_SEC; + if (new_inode) { + if (is_dir) { + err = fat_dir_empty(new_inode); + if (err) + goto out; + } + new_i_pos = MSDOS_I(new_inode)->i_pos; + fat_detach(new_inode); + } else { + err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, + &ts, &sinfo); + if (err) + goto out; + new_i_pos = sinfo.i_pos; + } + new_dir->i_version++; + + fat_detach(old_inode); + fat_attach(old_inode, new_i_pos); + if (IS_DIRSYNC(new_dir)) { + err = fat_sync_inode(old_inode); + if (err) + goto error_inode; + } else + mark_inode_dirty(old_inode); + + if (update_dotdot) { + int start = MSDOS_I(new_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + if (IS_DIRSYNC(new_dir)) { + err = sync_dirty_buffer(dotdot_bh); + if (err) + goto error_dotdot; + } + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + err = fat_remove_entries(old_dir, &old_sinfo); /* and releases bh */ + old_sinfo.bh = NULL; + if (err) + goto error_dotdot; + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = ts; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + drop_nlink(new_inode); + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = ts; + } +out: + brelse(sinfo.bh); + brelse(dotdot_bh); + brelse(old_sinfo.bh); + unlock_super(sb); + + return err; + +error_dotdot: + /* data cluster is shared, serious corruption */ + corrupt = 1; + + if (update_dotdot) { + int start = MSDOS_I(old_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + corrupt |= sync_dirty_buffer(dotdot_bh); + } +error_inode: + fat_detach(old_inode); + fat_attach(old_inode, old_sinfo.i_pos); + if (new_inode) { + fat_attach(new_inode, new_i_pos); + if (corrupt) + corrupt |= fat_sync_inode(new_inode); + } else { + /* + * If new entry was not sharing the data cluster, it + * shouldn't be serious corruption. 
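+ * Just remove the entry we created in new_dir and report the
+ * original error to the caller.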
+ */ + int err2 = fat_remove_entries(new_dir, &sinfo); + if (corrupt) + corrupt |= err2; + sinfo.bh = NULL; + } + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld)", + __func__, sinfo.i_pos); + } + goto out; +} + +static const struct inode_operations vfat_dir_inode_operations = { + .create = vfat_create, + .lookup = vfat_lookup, + .unlink = vfat_unlink, + .mkdir = vfat_mkdir, + .rmdir = vfat_rmdir, + .rename = vfat_rename, + .setattr = fat_setattr, + .getattr = fat_getattr, +}; + +static void setup(struct super_block *sb) +{ + MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations; + if (MSDOS_SB(sb)->options.name_check != 's') + sb->s_d_op = &vfat_ci_dentry_ops; + else + sb->s_d_op = &vfat_dentry_ops; +} + +static int vfat_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, 1, setup); +} + +static struct dentry *vfat_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super); +} + +static struct file_system_type vfat_fs_type = { + .owner = THIS_MODULE, + .name = "vfat", + .mount = vfat_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_vfat_fs(void) +{ + return register_filesystem(&vfat_fs_type); +} + +static void __exit exit_vfat_fs(void) +{ + unregister_filesystem(&vfat_fs_type); +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VFAT filesystem support"); +MODULE_AUTHOR("Gordon Chaffee"); + +module_init(init_vfat_fs) +module_exit(exit_vfat_fs) diff --git a/fs/fcntl.c b/fs/fcntl.c new file mode 100644 index 00000000..22764c7c --- /dev/null +++ b/fs/fcntl.c @@ -0,0 +1,853 @@ +/* + * linux/fs/fcntl.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void set_close_on_exec(unsigned int fd, int flag) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (flag) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + +static int get_close_on_exec(unsigned int fd) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + int res; + rcu_read_lock(); + fdt = files_fdtable(files); + res = FD_ISSET(fd, fdt->close_on_exec); + rcu_read_unlock(); + return res; +} + +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + int err = -EBADF; + struct file * file, *tofree; + struct files_struct * files = current->files; + struct fdtable *fdt; + + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; + + if (unlikely(oldfd == newfd)) + return -EINVAL; + + spin_lock(&files->file_lock); + err = expand_files(files, newfd); + file = fcheck(oldfd); + if (unlikely(!file)) + goto Ebadf; + if (unlikely(err < 0)) { + if (err == -EMFILE) + goto Ebadf; + goto out_unlock; + } + /* + * We need to detect attempts to do dup2() over allocated but still + * not finished descriptor. NB: OpenBSD avoids that at the price of + * extra work in their equivalent of fget() - they insert struct + * file immediately after grabbing descriptor, mark it larval if + * more work (e.g. actual opening) is needed and make sure that + * fget() treats larval files as absent. 
Potentially interesting, + * but while extra work in fget() is trivial, locking implications + * and amount of surgery on open()-related paths in VFS are not. + * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" + * deadlocks in rather amusing ways, AFAICS. All of that is out of + * scope of POSIX or SUS, since neither considers shared descriptor + * tables and this condition does not arise without those. + */ + err = -EBUSY; + fdt = files_fdtable(files); + tofree = fdt->fd[newfd]; + if (!tofree && FD_ISSET(newfd, fdt->open_fds)) + goto out_unlock; + get_file(file); + rcu_assign_pointer(fdt->fd[newfd], file); + FD_SET(newfd, fdt->open_fds); + if (flags & O_CLOEXEC) + FD_SET(newfd, fdt->close_on_exec); + else + FD_CLR(newfd, fdt->close_on_exec); + spin_unlock(&files->file_lock); + + if (tofree) + filp_close(tofree, files); + + return newfd; + +Ebadf: + err = -EBADF; +out_unlock: + spin_unlock(&files->file_lock); + return err; +} + +SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) +{ + if (unlikely(newfd == oldfd)) { /* corner case */ + struct files_struct *files = current->files; + int retval = oldfd; + + rcu_read_lock(); + if (!fcheck_files(files, oldfd)) + retval = -EBADF; + rcu_read_unlock(); + return retval; + } + return sys_dup3(oldfd, newfd, 0); +} + +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + int ret = -EBADF; + struct file *file = fget_raw(fildes); + + if (file) { + ret = get_unused_fd(); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + } + return ret; +} + +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) + +static int setfl(int fd, struct file * filp, unsigned long arg) +{ + struct inode * inode = filp->f_path.dentry->d_inode; + int error = 0; + + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. + */ + if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) + return -EPERM; + + /* O_NOATIME can only be set by the owner or superuser */ + if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* required for strict SunOS emulation */ + if (O_NONBLOCK != O_NDELAY) + if (arg & O_NDELAY) + arg |= O_NONBLOCK; + + if (arg & O_DIRECT) { + if (!filp->f_mapping || !filp->f_mapping->a_ops || + !filp->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + + if (filp->f_op && filp->f_op->check_flags) + error = filp->f_op->check_flags(arg); + if (error) + return error; + + /* + * ->fasync() is responsible for setting the FASYNC bit. 
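+ * Drivers normally do that via fasync_helper(), which adds or
+ * removes the fasync entry and flips FASYNC in filp->f_flags.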
+ */ + if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op && + filp->f_op->fasync) { + error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); + if (error < 0) + goto out; + if (error > 0) + error = 0; + } + spin_lock(&filp->f_lock); + filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + spin_unlock(&filp->f_lock); + + out: + return error; +} + +static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +{ + write_lock_irq(&filp->f_owner.lock); + if (force || !filp->f_owner.pid) { + put_pid(filp->f_owner.pid); + filp->f_owner.pid = get_pid(pid); + filp->f_owner.pid_type = type; + + if (pid) { + const struct cred *cred = current_cred(); + filp->f_owner.uid = cred->uid; + filp->f_owner.euid = cred->euid; + } + } + write_unlock_irq(&filp->f_owner.lock); +} + +int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +{ + int err; + + err = security_file_set_fowner(filp); + if (err) + return err; + + f_modown(filp, pid, type, force); + return 0; +} +EXPORT_SYMBOL(__f_setown); + +int f_setown(struct file *filp, unsigned long arg, int force) +{ + enum pid_type type; + struct pid *pid; + int who = arg; + int result; + type = PIDTYPE_PID; + if (who < 0) { + type = PIDTYPE_PGID; + who = -who; + } + rcu_read_lock(); + pid = find_vpid(who); + result = __f_setown(filp, pid, type, force); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(f_setown); + +void f_delown(struct file *filp) +{ + f_modown(filp, NULL, PIDTYPE_PID, 1); +} + +pid_t f_getown(struct file *filp) +{ + pid_t pid; + read_lock(&filp->f_owner.lock); + pid = pid_vnr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + read_unlock(&filp->f_owner.lock); + return pid; +} + +static int f_setown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex owner; + struct pid *pid; + int type; + int ret; + + ret = copy_from_user(&owner, owner_p, sizeof(owner)); + if (ret) + return -EFAULT; + + switch (owner.type) { + case F_OWNER_TID: + type = PIDTYPE_MAX; + break; + + case F_OWNER_PID: + type = PIDTYPE_PID; + break; + + case F_OWNER_PGRP: + type = PIDTYPE_PGID; + break; + + default: + return -EINVAL; + } + + rcu_read_lock(); + pid = find_vpid(owner.pid); + if (owner.pid && !pid) + ret = -ESRCH; + else + ret = __f_setown(filp, pid, type, 1); + rcu_read_unlock(); + + return ret; +} + +static int f_getown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex owner; + int ret = 0; + + read_lock(&filp->f_owner.lock); + owner.pid = pid_vnr(filp->f_owner.pid); + switch (filp->f_owner.pid_type) { + case PIDTYPE_MAX: + owner.type = F_OWNER_TID; + break; + + case PIDTYPE_PID: + owner.type = F_OWNER_PID; + break; + + case PIDTYPE_PGID: + owner.type = F_OWNER_PGRP; + break; + + default: + WARN_ON(1); + ret = -EINVAL; + break; + } + read_unlock(&filp->f_owner.lock); + + if (!ret) { + ret = copy_to_user(owner_p, &owner, sizeof(owner)); + if (ret) + ret = -EFAULT; + } + return ret; +} + +static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, + struct file *filp) +{ + long err = -EINVAL; + + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + if (arg >= rlimit(RLIMIT_NOFILE)) + break; + err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); + if (err >= 0) { + get_file(filp); + fd_install(err, filp); + } + break; + case F_GETFD: + err = get_close_on_exec(fd) ? 
FD_CLOEXEC : 0; + break; + case F_SETFD: + err = 0; + set_close_on_exec(fd, arg & FD_CLOEXEC); + break; + case F_GETFL: + err = filp->f_flags; + break; + case F_SETFL: + err = setfl(fd, filp, arg); + break; + case F_GETLK: + err = fcntl_getlk(filp, (struct flock __user *) arg); + break; + case F_SETLK: + case F_SETLKW: + err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + break; + case F_GETOWN: + /* + * XXX If f_owner is a process group, the + * negative return value will get converted + * into an error. Oops. If we keep the + * current syscall conventions, the only way + * to fix this will be in libc. + */ + err = f_getown(filp); + force_successful_syscall_return(); + break; + case F_SETOWN: + err = f_setown(filp, arg, 1); + break; + case F_GETOWN_EX: + err = f_getown_ex(filp, arg); + break; + case F_SETOWN_EX: + err = f_setown_ex(filp, arg); + break; + case F_GETSIG: + err = filp->f_owner.signum; + break; + case F_SETSIG: + /* arg == 0 restores default behaviour. */ + if (!valid_signal(arg)) { + break; + } + err = 0; + filp->f_owner.signum = arg; + break; + case F_GETLEASE: + err = fcntl_getlease(filp); + break; + case F_SETLEASE: + err = fcntl_setlease(fd, filp, arg); + break; + case F_NOTIFY: + err = fcntl_dirnotify(fd, filp, arg); + break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; + default: + break; + } + return err; +} + +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + +SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + struct file *filp; + long err = -EBADF; + + filp = fget_raw(fd); + if (!filp) + goto out; + + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + + err = security_file_fcntl(filp, cmd, arg); + if (err) { + fput(filp); + return err; + } + + err = do_fcntl(fd, cmd, arg, filp); + + fput(filp); +out: + return err; +} + +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + unsigned long, arg) +{ + struct file * filp; + long err; + + err = -EBADF; + filp = fget_raw(fd); + if (!filp) + goto out; + + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + + err = security_file_fcntl(filp, cmd, arg); + if (err) { + fput(filp); + return err; + } + err = -EBADF; + + switch (cmd) { + case F_GETLK64: + err = fcntl_getlk64(filp, (struct flock64 __user *) arg); + break; + case F_SETLK64: + case F_SETLKW64: + err = fcntl_setlk64(fd, filp, cmd, + (struct flock64 __user *) arg); + break; + default: + err = do_fcntl(fd, cmd, arg, filp); + break; + } + fput(filp); +out: + return err; +} +#endif + +/* Table to convert sigio signal codes into poll band bitmaps */ + +static const long band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; + +static inline int sigio_perm(struct task_struct *p, + struct fown_struct *fown, int sig) +{ + const struct cred *cred; + int ret; + + rcu_read_lock(); + cred = __task_cred(p); + ret = ((fown->euid == 0 || + fown->euid == cred->suid || fown->euid == cred->uid || + fown->uid == cred->suid || fown->uid == cred->uid) && + !security_file_send_sigiotask(p, fown, sig)); + 
rcu_read_unlock(); + return ret; +} + +static void send_sigio_to_task(struct task_struct *p, + struct fown_struct *fown, + int fd, int reason, int group) +{ + /* + * F_SETSIG can change ->signum lockless in parallel, make + * sure we read it once and use the same value throughout. + */ + int signum = ACCESS_ONCE(fown->signum); + + if (!sigio_perm(p, fown, signum)) + return; + + switch (signum) { + siginfo_t si; + default: + /* Queue a rt signal with the appropriate fd as its + value. We use SI_SIGIO as the source, not + SI_KERNEL, since kernel signals always get + delivered even if we can't queue. Failure to + queue in this case _should_ be reported; we fall + back to SIGIO in that case. --sct */ + si.si_signo = signum; + si.si_errno = 0; + si.si_code = reason; + /* Make sure we are called with one of the POLL_* + reasons, otherwise we could leak kernel stack into + userspace. */ + BUG_ON((reason & __SI_MASK) != __SI_POLL); + if (reason - POLL_IN >= NSIGPOLL) + si.si_band = ~0L; + else + si.si_band = band_table[reason - POLL_IN]; + si.si_fd = fd; + if (!do_send_sig_info(signum, &si, p, group)) + break; + /* fall-through: fall back on the old plain SIGIO signal */ + case 0: + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group); + } +} + +void send_sigio(struct fown_struct *fown, int fd, int band) +{ + struct task_struct *p; + enum pid_type type; + struct pid *pid; + int group = 1; + + read_lock(&fown->lock); + + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; + + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigio_to_task(p, fown, fd, band, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + out_unlock_fown: + read_unlock(&fown->lock); +} + +static void send_sigurg_to_task(struct task_struct *p, + struct fown_struct *fown, int group) +{ + if (sigio_perm(p, fown, SIGURG)) + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group); +} + +int send_sigurg(struct fown_struct *fown) +{ + struct task_struct *p; + enum pid_type type; + struct pid *pid; + int group = 1; + int ret = 0; + + read_lock(&fown->lock); + + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; + + ret = 1; + + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigurg_to_task(p, fown, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + out_unlock_fown: + read_unlock(&fown->lock); + return ret; +} + +static DEFINE_SPINLOCK(fasync_lock); +static struct kmem_cache *fasync_cache __read_mostly; + +static void fasync_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(fasync_cache, + container_of(head, struct fasync_struct, fa_rcu)); +} + +/* + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". 
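+ * __fput() checks FASYNC to decide whether ->fasync() must be
+ * called to tear the entry down when the file goes away.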
+ * + */ +int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *fa, **fp; + int result = 0; + + spin_lock(&filp->f_lock); + spin_lock(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_file = NULL; + spin_unlock_irq(&fa->fa_lock); + + *fp = fa->fa_next; + call_rcu(&fa->fa_rcu, fasync_free_rcu); + filp->f_flags &= ~FASYNC; + result = 1; + break; + } + spin_unlock(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +struct fasync_struct *fasync_alloc(void) +{ + return kmem_cache_alloc(fasync_cache, GFP_KERNEL); +} + +/* + * NOTE! This can be used only for unused fasync entries: + * entries that actually got inserted on the fasync list + * need to be released by rcu - see fasync_remove_entry. + */ +void fasync_free(struct fasync_struct *new) +{ + kmem_cache_free(fasync_cache, new); +} + +/* + * Insert a new entry into the fasync list. Return the pointer to the + * old one if we didn't use the new one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) +{ + struct fasync_struct *fa, **fp; + + spin_lock(&filp->f_lock); + spin_lock(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_fd = fd; + spin_unlock_irq(&fa->fa_lock); + goto out; + } + + spin_lock_init(&new->fa_lock); + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + rcu_assign_pointer(*fapp, new); + filp->f_flags |= FASYNC; + +out: + spin_unlock(&fasync_lock); + spin_unlock(&filp->f_lock); + return fa; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new; + + new = fasync_alloc(); + if (!new) + return -ENOMEM; + + /* + * fasync_insert_entry() returns the old (update) entry if + * it existed. + * + * So free the (unused) new entry and return 0 to let the + * caller know that we didn't add any new fasync entries. + */ + if (fasync_insert_entry(fd, filp, fapp, new)) { + fasync_free(new); + return 0; + } + + return 1; +} + +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + +EXPORT_SYMBOL(fasync_helper); + +/* + * rcu_read_lock() is held + */ +static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) +{ + while (fa) { + struct fown_struct *fown; + unsigned long flags; + + if (fa->magic != FASYNC_MAGIC) { + printk(KERN_ERR "kill_fasync: bad magic number in " + "fasync_struct!\n"); + return; + } + spin_lock_irqsave(&fa->fa_lock, flags); + if (fa->fa_file) { + fown = &fa->fa_file->f_owner; + /* Don't send SIGURG to processes which have not set a + queued signum: SIGURG has its own default signalling + mechanism. 
*/ + if (!(sig == SIGURG && fown->signum == 0)) + send_sigio(fown, fa->fa_fd, band); + } + spin_unlock_irqrestore(&fa->fa_lock, flags); + fa = rcu_dereference(fa->fa_next); + } +} + +void kill_fasync(struct fasync_struct **fp, int sig, int band) +{ + /* First a quick test without locking: usually + * the list is empty. + */ + if (*fp) { + rcu_read_lock(); + kill_fasync_rcu(rcu_dereference(*fp), sig, band); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(kill_fasync); + +static int __init fcntl_init(void) +{ + /* + * Please add new bits here to ensure allocation uniqueness. + * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY + * is defined as O_NONBLOCK on some platforms and not on others. + */ + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + O_RDONLY | O_WRONLY | O_RDWR | + O_CREAT | O_EXCL | O_NOCTTY | + O_TRUNC | O_APPEND | /* O_NONBLOCK | */ + __O_SYNC | O_DSYNC | FASYNC | + O_DIRECT | O_LARGEFILE | O_DIRECTORY | + O_NOFOLLOW | O_NOATIME | O_CLOEXEC | + __FMODE_EXEC | O_PATH + )); + + fasync_cache = kmem_cache_create("fasync_cache", + sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + return 0; +} + +module_init(fcntl_init) diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 00000000..6b088641 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need t make sure wether the file system + * support decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. 
+ * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. 
+ * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/fifo.c b/fs/fifo.c new file mode 100644 index 00000000..b1a524d7 --- /dev/null +++ b/fs/fifo.c @@ -0,0 +1,154 @@ +/* + * linux/fs/fifo.c + * + * written by Paul H. Hargrove + * + * Fixes: + * 10-06-1999, AV: fixed OOM handling in fifo_open(), moved + * initialization there, switched to external + * allocation of pipe_inode_info. + */ + +#include +#include +#include +#include + +static void wait_for_partner(struct inode* inode, unsigned int *cnt) +{ + int cur = *cnt; + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) + break; + } +} + +static void wake_up_partner(struct inode* inode) +{ + wake_up_interruptible(&inode->i_pipe->wait); +} + +static int fifo_open(struct inode *inode, struct file *filp) +{ + struct pipe_inode_info *pipe; + int ret; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + if (!pipe) { + ret = -ENOMEM; + pipe = alloc_pipe_info(inode); + if (!pipe) + goto err_nocleanup; + inode->i_pipe = pipe; + } + filp->f_version = 0; + + /* We can only do regular read/write on fifos */ + filp->f_mode &= (FMODE_READ | FMODE_WRITE); + + switch (filp->f_mode) { + case FMODE_READ: + /* + * O_RDONLY + * POSIX.1 says that O_NONBLOCK means return with the FIFO + * opened, even when there is no process writing the FIFO. 
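+ * The blocking case instead sleeps in wait_for_partner() until
+ * w_counter changes, i.e. until a writer opens the FIFO.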
+ */ + filp->f_op = &read_pipefifo_fops; + pipe->r_counter++; + if (pipe->readers++ == 0) + wake_up_partner(inode); + + if (!pipe->writers) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { + wait_for_partner(inode, &pipe->w_counter); + if(signal_pending(current)) + goto err_rd; + } + } + break; + + case FMODE_WRITE: + /* + * O_WRONLY + * POSIX.1 says that O_NONBLOCK means return -1 with + * errno=ENXIO when there is no process reading the FIFO. + */ + ret = -ENXIO; + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) + goto err; + + filp->f_op = &write_pipefifo_fops; + pipe->w_counter++; + if (!pipe->writers++) + wake_up_partner(inode); + + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); + if (signal_pending(current)) + goto err_wr; + } + break; + + case FMODE_READ | FMODE_WRITE: + /* + * O_RDWR + * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. + * This implementation will NEVER block on a O_RDWR open, since + * the process can at least talk to itself. + */ + filp->f_op = &rdwr_pipefifo_fops; + + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) + wake_up_partner(inode); + break; + + default: + ret = -EINVAL; + goto err; + } + + /* Ok! */ + mutex_unlock(&inode->i_mutex); + return 0; + +err_rd: + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err_wr: + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err: + if (!pipe->readers && !pipe->writers) + free_pipe_info(inode); + +err_nocleanup: + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * Dummy default file-operations: the only thing this does + * is contain the open that then fills in the correct operations + * depending on the access mode of the file... + */ +const struct file_operations def_fifo_fops = { + .open = fifo_open, /* will set read_ or write_pipefifo_fops */ + .llseek = noop_llseek, +}; diff --git a/fs/file.c b/fs/file.c new file mode 100644 index 00000000..4c6992d8 --- /dev/null +++ b/fs/file.c @@ -0,0 +1,486 @@ +/* + * linux/fs/file.c + * + * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes + * + * Manage the dynamic fd arrays in the process files_struct. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct fdtable_defer { + spinlock_t lock; + struct work_struct wq; + struct fdtable *next; +}; + +int sysctl_nr_open __read_mostly = 1024*1024; +int sysctl_nr_open_min = BITS_PER_LONG; +int sysctl_nr_open_max = 1024 * 1024; /* raised later */ + +/* + * We use this list to defer free fdtables that have vmalloced + * sets/arrays. By keeping a per-cpu list, we avoid having to embed + * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in + * this per-task structure. + */ +static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); + +static void *alloc_fdmem(unsigned int size) +{ + /* + * Very large allocations can stress page reclaim, so fall back to + * vmalloc() if the allocation size will be considered "large" by the VM. + */ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (data != NULL) + return data; + } + return vmalloc(size); +} + +static void free_fdmem(void *ptr) +{ + is_vmalloc_addr(ptr) ? 
vfree(ptr) : kfree(ptr); +} + +static void __free_fdtable(struct fdtable *fdt) +{ + free_fdmem(fdt->fd); + free_fdmem(fdt->open_fds); + kfree(fdt); +} + +static void free_fdtable_work(struct work_struct *work) +{ + struct fdtable_defer *f = + container_of(work, struct fdtable_defer, wq); + struct fdtable *fdt; + + spin_lock_bh(&f->lock); + fdt = f->next; + f->next = NULL; + spin_unlock_bh(&f->lock); + while(fdt) { + struct fdtable *next = fdt->next; + + __free_fdtable(fdt); + fdt = next; + } +} + +void free_fdtable_rcu(struct rcu_head *rcu) +{ + struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); + struct fdtable_defer *fddef; + + BUG_ON(!fdt); + + if (fdt->max_fds <= NR_OPEN_DEFAULT) { + /* + * This fdtable is embedded in the files structure and that + * structure itself is getting destroyed. + */ + kmem_cache_free(files_cachep, + container_of(fdt, struct files_struct, fdtab)); + return; + } + if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { + kfree(fdt->fd); + kfree(fdt->open_fds); + kfree(fdt); + } else { + fddef = &get_cpu_var(fdtable_defer_list); + spin_lock(&fddef->lock); + fdt->next = fddef->next; + fddef->next = fdt; + /* vmallocs are handled from the workqueue context */ + schedule_work(&fddef->wq); + spin_unlock(&fddef->lock); + put_cpu_var(fdtable_defer_list); + } +} + +/* + * Expand the fdset in the files_struct. Called with the files spinlock + * held for write. + */ +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) +{ + unsigned int cpy, set; + + BUG_ON(nfdt->max_fds < ofdt->max_fds); + + cpy = ofdt->max_fds * sizeof(struct file *); + set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); + memcpy(nfdt->fd, ofdt->fd, cpy); + memset((char *)(nfdt->fd) + cpy, 0, set); + + cpy = ofdt->max_fds / BITS_PER_BYTE; + set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)(nfdt->open_fds) + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)(nfdt->close_on_exec) + cpy, 0, set); +} + +static struct fdtable * alloc_fdtable(unsigned int nr) +{ + struct fdtable *fdt; + char *data; + + /* + * Figure out how many fds we actually want to support in this fdtable. + * Allocation steps are keyed to the size of the fdarray, since it + * grows far faster than any of the other dynamic data. We try to fit + * the fdarray into comfortable page-tuned chunks: starting at 1024B + * and growing in powers of two from there on. + */ + nr /= (1024 / sizeof(struct file *)); + nr = roundup_pow_of_two(nr + 1); + nr *= (1024 / sizeof(struct file *)); + /* + * Note that this can drive nr *below* what we had passed if sysctl_nr_open + * had been set lower between the check in expand_files() and here. Deal + * with that in caller, it's cheaper that way. + * + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... 
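+ * The clamp below rounds sysctl_nr_open up to a multiple of
+ * BITS_PER_LONG for exactly that reason.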
+ */ + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + if (!fdt) + goto out; + fdt->max_fds = nr; + data = alloc_fdmem(nr * sizeof(struct file *)); + if (!data) + goto out_fdt; + fdt->fd = (struct file **)data; + data = alloc_fdmem(max_t(unsigned int, + 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + if (!data) + goto out_arr; + fdt->open_fds = (fd_set *)data; + data += nr / BITS_PER_BYTE; + fdt->close_on_exec = (fd_set *)data; + fdt->next = NULL; + + return fdt; + +out_arr: + free_fdmem(fdt->fd); +out_fdt: + kfree(fdt); +out: + return NULL; +} + +/* + * Expand the file descriptor table. + * This function will allocate a new fdtable and both fd array and fdset, of + * the given size. + * Return <0 error code on error; 1 on successful completion. + * The files->file_lock should be held on entry, and will be held on exit. + */ +static int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) +{ + struct fdtable *new_fdt, *cur_fdt; + + spin_unlock(&files->file_lock); + new_fdt = alloc_fdtable(nr); + spin_lock(&files->file_lock); + if (!new_fdt) + return -ENOMEM; + /* + * extremely unlikely race - sysctl_nr_open decreased between the check in + * caller and alloc_fdtable(). Cheaper to catch it here... + */ + if (unlikely(new_fdt->max_fds <= nr)) { + __free_fdtable(new_fdt); + return -EMFILE; + } + /* + * Check again since another task may have expanded the fd table while + * we dropped the lock + */ + cur_fdt = files_fdtable(files); + if (nr >= cur_fdt->max_fds) { + /* Continue as planned */ + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + free_fdtable(cur_fdt); + } else { + /* Somebody else expanded, so undo our attempt */ + __free_fdtable(new_fdt); + } + return 1; +} + +/* + * Expand files. + * This function will expand the file structures, if the requested size exceeds + * the current capacity and there is room for expansion. + * Return <0 error code on error; 0 when nothing done; 1 when files were + * expanded and execution may have blocked. + * The files->file_lock should be held on entry, and will be held on exit. + */ +int expand_files(struct files_struct *files, int nr) +{ + struct fdtable *fdt; + + fdt = files_fdtable(files); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + if (nr >= rlimit(RLIMIT_NOFILE)) + return -EMFILE; + + /* Do we need to expand? */ + if (nr < fdt->max_fds) + return 0; + + /* Can we expand? */ + if (nr >= sysctl_nr_open) + return -EMFILE; + + /* All good, so we try */ + return expand_fdtable(files, nr); +} + +static int count_open_files(struct fdtable *fdt) +{ + int size = fdt->max_fds; + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (fdt->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; +} + +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. 
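+ * The copy may have to retry with a larger fdtable if the parent
+ * grows its table while we drop oldf->file_lock.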
+ */ +struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +{ + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i; + struct fdtable *old_fdt, *new_fdt; + + *errorp = -ENOMEM; + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + new_fdt = &newf->fdtab; + new_fdt->max_fds = NR_OPEN_DEFAULT; + new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->fd = &newf->fd_array[0]; + new_fdt->next = NULL; + + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + + /* + * Check whether we need to allocate a larger fd array and fd set. + */ + while (unlikely(open_files > new_fdt->max_fds)) { + spin_unlock(&oldf->file_lock); + + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + + new_fdt = alloc_fdtable(open_files - 1); + if (!new_fdt) { + *errorp = -ENOMEM; + goto out_release; + } + + /* beyond sysctl_nr_open; nothing to do */ + if (unlikely(new_fdt->max_fds < open_files)) { + __free_fdtable(new_fdt); + *errorp = -EMFILE; + goto out_release; + } + + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + } + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; + + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. 
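+ * FD_CLR() below clears the bit in the child's open_fds so the
+ * slot can be reallocated in the new process.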
+ */ + FD_CLR(open_files - i, new_fdt->open_fds); + } + rcu_assign_pointer(*new_fds++, f); + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + } + + rcu_assign_pointer(newf->fdt, new_fdt); + + return newf; + +out_release: + kmem_cache_free(files_cachep, newf); +out: + return NULL; +} + +static void __devinit fdtable_defer_list_init(int cpu) +{ + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); + spin_lock_init(&fddef->lock); + INIT_WORK(&fddef->wq, free_fdtable_work); + fddef->next = NULL; +} + +void __init files_defer_init(void) +{ + int i; + for_each_possible_cpu(i) + fdtable_defer_list_init(i); + sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; +} + +struct files_struct init_files = { + .count = ATOMIC_INIT(1), + .fdt = &init_files.fdtab, + .fdtab = { + .max_fds = NR_OPEN_DEFAULT, + .fd = &init_files.fd_array[0], + .close_on_exec = (fd_set *)&init_files.close_on_exec_init, + .open_fds = (fd_set *)&init_files.open_fds_init, + }, + .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), +}; + +/* + * allocate a file descriptor, mark it busy. + */ +int alloc_fd(unsigned start, unsigned flags) +{ + struct files_struct *files = current->files; + unsigned int fd; + int error; + struct fdtable *fdt; + + spin_lock(&files->file_lock); +repeat: + fdt = files_fdtable(files); + fd = start; + if (fd < files->next_fd) + fd = files->next_fd; + + if (fd < fdt->max_fds) + fd = find_next_zero_bit(fdt->open_fds->fds_bits, + fdt->max_fds, fd); + + error = expand_files(files, fd); + if (error < 0) + goto out; + + /* + * If we needed to expand the fs array we + * might have blocked - try again. + */ + if (error) + goto repeat; + + if (start <= files->next_fd) + files->next_fd = fd + 1; + + FD_SET(fd, fdt->open_fds); + if (flags & O_CLOEXEC) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); + error = fd; +#if 1 + /* Sanity check */ + if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { + printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); + rcu_assign_pointer(fdt->fd[fd], NULL); + } +#endif + +out: + spin_unlock(&files->file_lock); + return error; +} + +int get_unused_fd(void) +{ + return alloc_fd(0, 0); +} +EXPORT_SYMBOL(get_unused_fd); diff --git a/fs/file_table.c b/fs/file_table.c new file mode 100644 index 00000000..01e4c1e8 --- /dev/null +++ b/fs/file_table.c @@ -0,0 +1,554 @@ +/* + * linux/fs/file_table.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +/* sysctl tunables... 
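+ * files_stat.max_files is the system-wide cap enforced in
+ * get_empty_filp() for unprivileged tasks.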
*/ +struct files_stat_struct files_stat = { + .max_files = NR_FILE +}; + +DECLARE_LGLOCK(files_lglock); +DEFINE_LGLOCK(files_lglock); + +/* SLAB cache for file structures */ +static struct kmem_cache *filp_cachep __read_mostly; + +static struct percpu_counter nr_files __cacheline_aligned_in_smp; + +static inline void file_free_rcu(struct rcu_head *head) +{ + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + + put_cred(f->f_cred); + kmem_cache_free(filp_cachep, f); +} + +static inline void file_free(struct file *f) +{ + percpu_counter_dec(&nr_files); + file_check_state(f); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); +} + +/* + * Return the total number of open files in the system + */ +static long get_nr_files(void) +{ + return percpu_counter_read_positive(&nr_files); +} + +/* + * Return the maximum number of open files in the system + */ +unsigned long get_max_files(void) +{ + return files_stat.max_files; +} +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + +/* Find an unused file structure and return a pointer to it. + * Returns NULL, if there are no more free file structures or + * we run out of memory. + * + * Be very careful using this. You are responsible for + * getting write access to any mount that you might assign + * to this filp, if it is opened for write. If this is not + * done, you will imbalance int the mount's writer count + * and a warning at __fput() time. + */ +struct file *get_empty_filp(void) +{ + const struct cred *cred = current_cred(); + static long old_max; + struct file * f; + + /* + * Privileged users can go above max_files + */ + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) + goto over; + } + + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (f == NULL) + goto fail; + + percpu_counter_inc(&nr_files); + f->f_cred = get_cred(cred); + if (security_file_alloc(f)) + goto fail_sec; + + INIT_LIST_HEAD(&f->f_u.fu_list); + atomic_long_set(&f->f_count, 1); + rwlock_init(&f->f_owner.lock); + spin_lock_init(&f->f_lock); + eventpoll_init_file(f); + /* f->f_version: 0 */ + return f; + +over: + /* Ran out of filps - report that */ + if (get_nr_files() > old_max) { + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); + old_max = get_nr_files(); + } + goto fail; + +fail_sec: + file_free(f); +fail: + return NULL; +} + +/** + * alloc_file - allocate and initialize a 'struct file' + * @mnt: the vfsmount on which the file will reside + * @dentry: the dentry representing the new file + * @mode: the mode with which the new file will be opened + * @fop: the 'struct file_operations' for the new file + * + * Use this instead of get_empty_filp() to get a new + * 'struct file'. Do so because of the same initialization + * pitfalls reasons listed for init_file(). This is a + * preferred interface to using init_file(). + * + * If all the callers of init_file() are eliminated, its + * code should be moved into this function. 
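+ * For FMODE_WRITE opens of non-special files the new file is also
+ * accounted as a writer via file_take_write()/mnt_clone_write().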
+ */ +struct file *alloc_file(struct path *path, fmode_t mode, + const struct file_operations *fop) +{ + struct file *file; + + file = get_empty_filp(); + if (!file) + return NULL; + + file->f_path = *path; + file->f_mapping = path->dentry->d_inode->i_mapping; + file->f_mode = mode; + file->f_op = fop; + + /* + * These mounts don't really matter in practice + * for r/o bind mounts. They aren't userspace- + * visible. We do this for consistency, and so + * that we can do debugging checks at __fput() + */ + if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { + file_take_write(file); + WARN_ON(mnt_clone_write(path->mnt)); + } + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(path->dentry->d_inode); + return file; +} +EXPORT_SYMBOL(alloc_file); + +/** + * drop_file_write_access - give up ability to write to a file + * @file: the file to which we will stop writing + * + * This is a central place which will give up the ability + * to write to @file, along with access to write through + * its vfsmount. + */ +void drop_file_write_access(struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + put_write_access(inode); + + if (special_file(inode->i_mode)) + return; + if (file_check_writeable(file) != 0) + return; + mnt_drop_write(mnt); + file_release_write(file); +} +EXPORT_SYMBOL_GPL(drop_file_write_access); + +/* the real guts of fput() - releasing the last reference to file + */ +static void __fput(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = dentry->d_inode; + + might_sleep(); + + fsnotify_close(file); + /* + * The function eventpoll_release() should be the first called + * in the file cleanup chain. 
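+ * It unhooks the file from any epoll interest lists before locks
+ * are removed and the driver's ->release() runs.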
+ */ + eventpoll_release(file); + locks_remove_flock(file); + + if (unlikely(file->f_flags & FASYNC)) { + if (file->f_op && file->f_op->fasync) + file->f_op->fasync(-1, file, 0); + } + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + security_file_free(file); + ima_file_free(file); + if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && + !(file->f_mode & FMODE_PATH))) { + cdev_put(inode->i_cdev); + } + fops_put(file->f_op); + put_pid(file->f_owner.pid); + file_sb_list_del(file); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); + if (file->f_mode & FMODE_WRITE) + drop_file_write_access(file); + file->f_path.dentry = NULL; + file->f_path.mnt = NULL; + file_free(file); + dput(dentry); + mntput(mnt); +} + +void fput(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) + __fput(file); +} + +EXPORT_SYMBOL(fput); + +struct file *fget(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget); + +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + +/* + * Lightweight file lookup - no refcnt increment if fd table isn't shared. + * + * You can use this instead of fget if you satisfy all of the following + * conditions: + * 1) You must call fput_light before exiting the syscall and returning control + * to userspace (i.e. you cannot remember the returned struct file * after + * returning to userspace). + * 2) You must not call filp_close on the returned struct file * in between + * calls to fget_light and fput_light. + * 3) You must not clone the current task in between the calls to fget_light + * and fput_light. + * + * The fput_needed flag returned by fget_light should be passed to the + * corresponding fput_light. 
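+ *
+ * Example (illustrative calling pattern, using the fput_light() helper
+ * from <linux/file.h>):
+ *
+ *     int fput_needed;
+ *     struct file *file = fget_light(fd, &fput_needed);
+ *
+ *     if (!file)
+ *             return -EBADF;
+ *     ... use the file within this syscall only ...
+ *     fput_light(file, fput_needed);
+ *
+ * i.e. the reference (if one was taken) is dropped before returning
+ * to userspace.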
+ */ +struct file *fget_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +void put_filp(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) { + security_file_free(file); + file_sb_list_del(file); + file_free(file); + } +} + +static inline int file_list_cpu(struct file *file) +{ +#ifdef CONFIG_SMP + return file->f_sb_list_cpu; +#else + return smp_processor_id(); +#endif +} + +/* helper for file_sb_list_add to reduce ifdefs */ +static inline void __file_sb_list_add(struct file *file, struct super_block *sb) +{ + struct list_head *list; +#ifdef CONFIG_SMP + int cpu; + cpu = smp_processor_id(); + file->f_sb_list_cpu = cpu; + list = per_cpu_ptr(sb->s_files, cpu); +#else + list = &sb->s_files; +#endif + list_add(&file->f_u.fu_list, list); +} + +/** + * file_sb_list_add - add a file to the sb's file list + * @file: file to add + * @sb: sb to add it to + * + * Use this function to associate a file with the superblock of the inode it + * refers to. + */ +void file_sb_list_add(struct file *file, struct super_block *sb) +{ + lg_local_lock(files_lglock); + __file_sb_list_add(file, sb); + lg_local_unlock(files_lglock); +} + +/** + * file_sb_list_del - remove a file from the sb's file list + * @file: file to remove + * @sb: sb to remove it from + * + * Use this function to remove a file from its superblock. + */ +void file_sb_list_del(struct file *file) +{ + if (!list_empty(&file->f_u.fu_list)) { + lg_local_lock_cpu(files_lglock, file_list_cpu(file)); + list_del_init(&file->f_u.fu_list); + lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); + } +} + +#ifdef CONFIG_SMP + +/* + * These macros iterate all files on all CPUs for a given superblock. + * files_lglock must be held globally. + */ +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + int i; \ + for_each_possible_cpu(i) { \ + struct list_head *list; \ + list = per_cpu_ptr((__sb)->s_files, i); \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ + } \ +} + +#else + +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + struct list_head *list; \ + list = &(sb)->s_files; \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ +} + +#endif + +int fs_may_remount_ro(struct super_block *sb) +{ + struct file *file; + /* Check that no files are currently opened for writing. */ + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, file) { + struct inode *inode = file->f_path.dentry->d_inode; + + /* File with pending delete? 
*/ + if (inode->i_nlink == 0) + goto too_bad; + + /* Writeable file? */ + if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) + goto too_bad; + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); + return 1; /* Tis' cool bro. */ +too_bad: + lg_global_unlock(files_lglock); + return 0; +} + +/** + * mark_files_ro - mark all files read-only + * @sb: superblock in question + * + * All files are marked read-only. We don't care about pending + * delete files so this should be used in 'force' mode only. + */ +void mark_files_ro(struct super_block *sb) +{ + struct file *f; + +retry: + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, f) { + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + spin_lock(&f->f_lock); + f->f_mode &= ~FMODE_WRITE; + spin_unlock(&f->f_lock); + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + /* This can sleep, so we can't hold the spinlock. */ + lg_global_unlock(files_lglock); + mnt_drop_write(mnt); + mntput(mnt); + goto retry; + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); +} + +void __init files_init(unsigned long mempages) +{ + unsigned long n; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* + * One file with associated inode and dcache is very roughly 1K. + * Per default don't use more than 10% of our memory for files. + */ + + n = (mempages * (PAGE_SIZE / 1024)) / 10; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); + files_defer_init(); + lg_lock_init(files_lglock); + percpu_counter_init(&nr_files, 0); +} diff --git a/fs/filesystems.c b/fs/filesystems.c new file mode 100644 index 00000000..0845f84f --- /dev/null +++ b/fs/filesystems.c @@ -0,0 +1,287 @@ +/* + * linux/fs/filesystems.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * table of configured filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Handling of filesystem drivers list. + * Rules: + * Inclusion to/removals from/scanning of list are protected by spinlock. + * During the unload module must call unregister_filesystem(). + * We can access the fields of list element if: + * 1) spinlock is held or + * 2) we hold the reference to the module. + * The latter can be guaranteed by call of try_module_get(); if it + * returned 0 we must skip the element, otherwise we got the reference. + * Once the reference is obtained we can drop the spinlock. + */ + +static struct file_system_type *file_systems; +static DEFINE_RWLOCK(file_systems_lock); + +/* WARNING: This can be used only if we _already_ own a reference */ +void get_filesystem(struct file_system_type *fs) +{ + __module_get(fs->owner); +} + +void put_filesystem(struct file_system_type *fs) +{ + module_put(fs->owner); +} + +static struct file_system_type **find_filesystem(const char *name, unsigned len) +{ + struct file_system_type **p; + for (p=&file_systems; *p; p=&(*p)->next) + if (strlen((*p)->name) == len && + strncmp((*p)->name, name, len) == 0) + break; + return p; +} + +/** + * register_filesystem - register a new filesystem + * @fs: the file system structure + * + * Adds the file system passed to the list of file systems the kernel + * is aware of for mount and other syscalls. Returns 0 on success, + * or a negative errno code on an error. 
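+ *
+ * Example (illustrative; "examplefs" and example_mount() are
+ * hypothetical names):
+ *
+ *     static struct file_system_type example_fs_type = {
+ *             .owner   = THIS_MODULE,
+ *             .name    = "examplefs",
+ *             .mount   = example_mount,
+ *             .kill_sb = kill_block_super,
+ *     };
+ *
+ *     static int __init example_init(void)
+ *     {
+ *             return register_filesystem(&example_fs_type);
+ *     }
+ *
+ * with a matching unregister_filesystem() call in the module exit path.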
+ * + * The &struct file_system_type that is passed is linked into the kernel + * structures and must not be freed until the file system has been + * unregistered. + */ + +int register_filesystem(struct file_system_type * fs) +{ + int res = 0; + struct file_system_type ** p; + + BUG_ON(strchr(fs->name, '.')); + if (fs->next) + return -EBUSY; + INIT_LIST_HEAD(&fs->fs_supers); + write_lock(&file_systems_lock); + p = find_filesystem(fs->name, strlen(fs->name)); + if (*p) + res = -EBUSY; + else + *p = fs; + write_unlock(&file_systems_lock); + return res; +} + +EXPORT_SYMBOL(register_filesystem); + +/** + * unregister_filesystem - unregister a file system + * @fs: filesystem to unregister + * + * Remove a file system that was previously successfully registered + * with the kernel. An error is returned if the file system is not found. + * Zero is returned on a success. + * + * Once this function has returned the &struct file_system_type structure + * may be freed or reused. + */ + +int unregister_filesystem(struct file_system_type * fs) +{ + struct file_system_type ** tmp; + + write_lock(&file_systems_lock); + tmp = &file_systems; + while (*tmp) { + if (fs == *tmp) { + *tmp = fs->next; + fs->next = NULL; + write_unlock(&file_systems_lock); + synchronize_rcu(); + return 0; + } + tmp = &(*tmp)->next; + } + write_unlock(&file_systems_lock); + + return -EINVAL; +} + +EXPORT_SYMBOL(unregister_filesystem); + +static int fs_index(const char __user * __name) +{ + struct file_system_type * tmp; + char * name; + int err, index; + + name = getname(__name); + err = PTR_ERR(name); + if (IS_ERR(name)) + return err; + + err = -EINVAL; + read_lock(&file_systems_lock); + for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + if (strcmp(tmp->name,name) == 0) { + err = index; + break; + } + } + read_unlock(&file_systems_lock); + putname(name); + return err; +} + +static int fs_name(unsigned int index, char __user * buf) +{ + struct file_system_type * tmp; + int len, res; + + read_lock(&file_systems_lock); + for (tmp = file_systems; tmp; tmp = tmp->next, index--) + if (index <= 0 && try_module_get(tmp->owner)) + break; + read_unlock(&file_systems_lock); + if (!tmp) + return -EINVAL; + + /* OK, we got the reference, so we can safely block */ + len = strlen(tmp->name) + 1; + res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; + put_filesystem(tmp); + return res; +} + +static int fs_maxindex(void) +{ + struct file_system_type * tmp; + int index; + + read_lock(&file_systems_lock); + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) + ; + read_unlock(&file_systems_lock); + return index; +} + +/* + * Whee.. Weird sysv syscall. + */ +SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) +{ + int retval = -EINVAL; + + switch (option) { + case 1: + retval = fs_index((const char __user *) arg1); + break; + + case 2: + retval = fs_name(arg1, (char __user *) arg2); + break; + + case 3: + retval = fs_maxindex(); + break; + } + return retval; +} + +int __init get_filesystem_list(char *buf) +{ + int len = 0; + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp && len < PAGE_SIZE - 80) { + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return len; +} + +#ifdef CONFIG_PROC_FS +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp) { + seq_printf(m, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return 0; +} + +static int filesystems_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, filesystems_proc_show, NULL); +} + +static const struct file_operations filesystems_proc_fops = { + .open = filesystems_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_filesystems_init(void) +{ + proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + return 0; +} +module_init(proc_filesystems_init); +#endif + +static struct file_system_type *__get_fs_type(const char *name, int len) +{ + struct file_system_type *fs; + + read_lock(&file_systems_lock); + fs = *(find_filesystem(name, len)); + if (fs && !try_module_get(fs->owner)) + fs = NULL; + read_unlock(&file_systems_lock); + return fs; +} + +struct file_system_type *get_fs_type(const char *name) +{ + struct file_system_type *fs; + const char *dot = strchr(name, '.'); + int len = dot ? dot - name : strlen(name); + + fs = __get_fs_type(name, len); + if (!fs && (request_module("%.*s", len, name) == 0)) + fs = __get_fs_type(name, len); + + if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { + put_filesystem(fs); + fs = NULL; + } + return fs; +} + +EXPORT_SYMBOL(get_fs_type); diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig new file mode 100644 index 00000000..8dc1cd5c --- /dev/null +++ b/fs/freevxfs/Kconfig @@ -0,0 +1,16 @@ +config VXFS_FS + tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" + depends on BLOCK + help + FreeVxFS is a file system driver that support the VERITAS VxFS(TM) + file system format. VERITAS VxFS(TM) is the standard file system + of SCO UnixWare (and possibly others) and optionally available + for Sunsoft Solaris, HP-UX and many other operating systems. + Currently only readonly access is supported. + + NOTE: the file system type as used by mount(1), mount(2) and + fstab(5) is 'vxfs' as it describes the file system format, not + the actual driver. + + To compile this as a module, choose M here: the module will be + called freevxfs. If unsure, say N. diff --git a/fs/freevxfs/Makefile b/fs/freevxfs/Makefile new file mode 100644 index 00000000..87ad0974 --- /dev/null +++ b/fs/freevxfs/Makefile @@ -0,0 +1,8 @@ +# +# VxFS Makefile +# + +obj-$(CONFIG_VXFS_FS) += freevxfs.o + +freevxfs-objs := vxfs_bmap.o vxfs_fshead.o vxfs_immed.o vxfs_inode.o \ + vxfs_lookup.o vxfs_olt.o vxfs_subr.o vxfs_super.o diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h new file mode 100644 index 00000000..c8a92652 --- /dev/null +++ b/fs/freevxfs/vxfs.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_SUPER_H_ +#define _VXFS_SUPER_H_ + +/* + * Veritas filesystem driver - superblock structure. + * + * This file contains the definition of the disk and core + * superblocks of the Veritas Filesystem. + */ +#include + + +/* + * Data types for use with the VxFS ondisk format. + */ +typedef int32_t vx_daddr_t; +typedef int32_t vx_ino_t; + +/* + * Superblock magic number (vxfs_super->vs_magic). + */ +#define VXFS_SUPER_MAGIC 0xa501FCF5 + +/* + * The root inode. + */ +#define VXFS_ROOT_INO 2 + +/* + * Num of entries in free extent array + */ +#define VXFS_NEFREE 32 + + +/* + * VxFS superblock (disk). + */ +struct vxfs_sb { + /* + * Readonly fields for the version 1 superblock. + * + * Lots of this fields are no more used by version 2 + * and never filesystems. + */ + u_int32_t vs_magic; /* Magic number */ + int32_t vs_version; /* VxFS version */ + u_int32_t vs_ctime; /* create time - secs */ + u_int32_t vs_cutime; /* create time - usecs */ + int32_t __unused1; /* unused */ + int32_t __unused2; /* unused */ + vx_daddr_t vs_old_logstart; /* obsolete */ + vx_daddr_t vs_old_logend; /* obsolete */ + int32_t vs_bsize; /* block size */ + int32_t vs_size; /* number of blocks */ + int32_t vs_dsize; /* number of data blocks */ + u_int32_t vs_old_ninode; /* obsolete */ + int32_t vs_old_nau; /* obsolete */ + int32_t __unused3; /* unused */ + int32_t vs_old_defiextsize; /* obsolete */ + int32_t vs_old_ilbsize; /* obsolete */ + int32_t vs_immedlen; /* size of immediate data area */ + int32_t vs_ndaddr; /* number of direct extentes */ + vx_daddr_t vs_firstau; /* address of first AU */ + vx_daddr_t vs_emap; /* offset of extent map in AU */ + vx_daddr_t vs_imap; /* offset of inode map in AU */ + vx_daddr_t vs_iextop; /* offset of ExtOp. 
map in AU */ + vx_daddr_t vs_istart; /* offset of inode list in AU */ + vx_daddr_t vs_bstart; /* offset of fdblock in AU */ + vx_daddr_t vs_femap; /* aufirst + emap */ + vx_daddr_t vs_fimap; /* aufirst + imap */ + vx_daddr_t vs_fiextop; /* aufirst + iextop */ + vx_daddr_t vs_fistart; /* aufirst + istart */ + vx_daddr_t vs_fbstart; /* aufirst + bstart */ + int32_t vs_nindir; /* number of entries in indir */ + int32_t vs_aulen; /* length of AU in blocks */ + int32_t vs_auimlen; /* length of imap in blocks */ + int32_t vs_auemlen; /* length of emap in blocks */ + int32_t vs_auilen; /* length of ilist in blocks */ + int32_t vs_aupad; /* length of pad in blocks */ + int32_t vs_aublocks; /* data blocks in AU */ + int32_t vs_maxtier; /* log base 2 of aublocks */ + int32_t vs_inopb; /* number of inodes per blk */ + int32_t vs_old_inopau; /* obsolete */ + int32_t vs_old_inopilb; /* obsolete */ + int32_t vs_old_ndiripau; /* obsolete */ + int32_t vs_iaddrlen; /* size of indirect addr ext. */ + int32_t vs_bshift; /* log base 2 of bsize */ + int32_t vs_inoshift; /* log base 2 of inobp */ + int32_t vs_bmask; /* ~( bsize - 1 ) */ + int32_t vs_boffmask; /* bsize - 1 */ + int32_t vs_old_inomask; /* old_inopilb - 1 */ + int32_t vs_checksum; /* checksum of V1 data */ + + /* + * Version 1, writable + */ + int32_t vs_free; /* number of free blocks */ + int32_t vs_ifree; /* number of free inodes */ + int32_t vs_efree[VXFS_NEFREE]; /* number of free extents by size */ + int32_t vs_flags; /* flags ?!? */ + u_int8_t vs_mod; /* filesystem has been changed */ + u_int8_t vs_clean; /* clean FS */ + u_int16_t __unused4; /* unused */ + u_int32_t vs_firstlogid; /* mount time log ID */ + u_int32_t vs_wtime; /* last time written - sec */ + u_int32_t vs_wutime; /* last time written - usec */ + u_int8_t vs_fname[6]; /* FS name */ + u_int8_t vs_fpack[6]; /* FS pack name */ + int32_t vs_logversion; /* log format version */ + int32_t __unused5; /* unused */ + + /* + * Version 2, Read-only + */ + vx_daddr_t vs_oltext[2]; /* OLT extent and replica */ + int32_t vs_oltsize; /* OLT extent size */ + int32_t vs_iauimlen; /* size of inode map */ + int32_t vs_iausize; /* size of IAU in blocks */ + int32_t vs_dinosize; /* size of inode in bytes */ + int32_t vs_old_dniaddr; /* indir levels per inode */ + int32_t vs_checksum2; /* checksum of V2 RO */ + + /* + * Actually much more... + */ +}; + + +/* + * In core superblock filesystem private data for VxFS. + */ +struct vxfs_sb_info { + struct vxfs_sb *vsi_raw; /* raw (on disk) superblock */ + struct buffer_head *vsi_bp; /* buffer for raw superblock*/ + struct inode *vsi_fship; /* fileset header inode */ + struct inode *vsi_ilist; /* inode list inode */ + struct inode *vsi_stilist; /* structural inode list inode */ + u_long vsi_iext; /* initial inode list */ + ino_t vsi_fshino; /* fileset header inode */ + daddr_t vsi_oltext; /* OLT extent */ + daddr_t vsi_oltsize; /* OLT size */ +}; + + +/* + * File modes. File types above 0xf000 are vxfs internal only, they should + * not be passed back to higher levels of the system. vxfs file types must + * never have one of the regular file type bits set. 
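+ *
+ * For example, a user-readable regular file carries a mode of
+ * (VXFS_IFREG | VXFS_IREAD) on disk; the VXFS_IS_TYPE()/VXFS_ISREG()
+ * macros below compare only the bits covered by VXFS_TYPE_MASK, so the
+ * permission bits are ignored when classifying an inode.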
+ */ +enum vxfs_mode { + VXFS_ISUID = 0x00000800, /* setuid */ + VXFS_ISGID = 0x00000400, /* setgid */ + VXFS_ISVTX = 0x00000200, /* sticky bit */ + VXFS_IREAD = 0x00000100, /* read */ + VXFS_IWRITE = 0x00000080, /* write */ + VXFS_IEXEC = 0x00000040, /* exec */ + + VXFS_IFIFO = 0x00001000, /* Named pipe */ + VXFS_IFCHR = 0x00002000, /* Character device */ + VXFS_IFDIR = 0x00004000, /* Directory */ + VXFS_IFNAM = 0x00005000, /* Xenix device ?? */ + VXFS_IFBLK = 0x00006000, /* Block device */ + VXFS_IFREG = 0x00008000, /* Regular file */ + VXFS_IFCMP = 0x00009000, /* Compressed file ?!? */ + VXFS_IFLNK = 0x0000a000, /* Symlink */ + VXFS_IFSOC = 0x0000c000, /* Socket */ + + /* VxFS internal */ + VXFS_IFFSH = 0x10000000, /* Fileset header */ + VXFS_IFILT = 0x20000000, /* Inode list */ + VXFS_IFIAU = 0x30000000, /* Inode allocation unit */ + VXFS_IFCUT = 0x40000000, /* Current usage table */ + VXFS_IFATT = 0x50000000, /* Attr. inode */ + VXFS_IFLCT = 0x60000000, /* Link count table */ + VXFS_IFIAT = 0x70000000, /* Indirect attribute file */ + VXFS_IFEMR = 0x80000000, /* Extent map reorg file */ + VXFS_IFQUO = 0x90000000, /* BSD quota file */ + VXFS_IFPTI = 0xa0000000, /* "Pass through" inode */ + VXFS_IFLAB = 0x11000000, /* Device label file */ + VXFS_IFOLT = 0x12000000, /* OLT file */ + VXFS_IFLOG = 0x13000000, /* Log file */ + VXFS_IFEMP = 0x14000000, /* Extent map file */ + VXFS_IFEAU = 0x15000000, /* Extent AU file */ + VXFS_IFAUS = 0x16000000, /* Extent AU summary file */ + VXFS_IFDEV = 0x17000000, /* Device config file */ + +}; + +#define VXFS_TYPE_MASK 0xfffff000 + +#define VXFS_IS_TYPE(ip,type) (((ip)->vii_mode & VXFS_TYPE_MASK) == (type)) +#define VXFS_ISFIFO(x) VXFS_IS_TYPE((x),VXFS_IFIFO) +#define VXFS_ISCHR(x) VXFS_IS_TYPE((x),VXFS_IFCHR) +#define VXFS_ISDIR(x) VXFS_IS_TYPE((x),VXFS_IFDIR) +#define VXFS_ISNAM(x) VXFS_IS_TYPE((x),VXFS_IFNAM) +#define VXFS_ISBLK(x) VXFS_IS_TYPE((x),VXFS_IFBLK) +#define VXFS_ISLNK(x) VXFS_IS_TYPE((x),VXFS_IFLNK) +#define VXFS_ISREG(x) VXFS_IS_TYPE((x),VXFS_IFREG) +#define VXFS_ISCMP(x) VXFS_IS_TYPE((x),VXFS_IFCMP) +#define VXFS_ISSOC(x) VXFS_IS_TYPE((x),VXFS_IFSOC) + +#define VXFS_ISFSH(x) VXFS_IS_TYPE((x),VXFS_IFFSH) +#define VXFS_ISILT(x) VXFS_IS_TYPE((x),VXFS_IFILT) + +/* + * Inmode organisation types. + */ +enum { + VXFS_ORG_NONE = 0, /* Inode has *no* format ?!? */ + VXFS_ORG_EXT4 = 1, /* Ext4 */ + VXFS_ORG_IMMED = 2, /* All data stored in inode */ + VXFS_ORG_TYPED = 3, /* Typed extents */ +}; + +#define VXFS_IS_ORG(ip,org) ((ip)->vii_orgtype == (org)) +#define VXFS_ISNONE(ip) VXFS_IS_ORG((ip), VXFS_ORG_NONE) +#define VXFS_ISEXT4(ip) VXFS_IS_ORG((ip), VXFS_ORG_EXT4) +#define VXFS_ISIMMED(ip) VXFS_IS_ORG((ip), VXFS_ORG_IMMED) +#define VXFS_ISTYPED(ip) VXFS_IS_ORG((ip), VXFS_ORG_TYPED) + + +/* + * Get filesystem private data from VFS inode. + */ +#define VXFS_INO(ip) \ + ((struct vxfs_inode_info *)(ip)->i_private) + +/* + * Get filesystem private data from VFS superblock. + */ +#define VXFS_SBI(sbp) \ + ((struct vxfs_sb_info *)(sbp)->s_fs_info) + +#endif /* _VXFS_SUPER_H_ */ diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c new file mode 100644 index 00000000..f86fd3ca --- /dev/null +++ b/fs/freevxfs/vxfs_bmap.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - filesystem to disk block mapping. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + + +#ifdef DIAGNOSTIC +static void +vxfs_typdump(struct vxfs_typed *typ) +{ + printk(KERN_DEBUG "type=%Lu ", typ->vt_hdr >> VXFS_TYPED_TYPESHIFT); + printk("offset=%Lx ", typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + printk("block=%x ", typ->vt_block); + printk("size=%x\n", typ->vt_size); +} +#endif + +/** + * vxfs_bmap_ext4 - do bmap for ext4 extents + * @ip: pointer to the inode we do bmap for + * @iblock: logical block. + * + * Description: + * vxfs_bmap_ext4 performs the bmap operation for inodes with + * ext4-style extents (which are much like the traditional UNIX + * inode organisation). + * + * Returns: + * The physical block number on success, else Zero. + */ +static daddr_t +vxfs_bmap_ext4(struct inode *ip, long bn) +{ + struct super_block *sb = ip->i_sb; + struct vxfs_inode_info *vip = VXFS_INO(ip); + unsigned long bsize = sb->s_blocksize; + u32 indsize = vip->vii_ext4.ve4_indsize; + int i; + + if (indsize > sb->s_blocksize) + goto fail_size; + + for (i = 0; i < VXFS_NDADDR; i++) { + struct direct *d = vip->vii_ext4.ve4_direct + i; + if (bn >= 0 && bn < d->size) + return (bn + d->extent); + bn -= d->size; + } + + if ((bn / (indsize * indsize * bsize / 4)) == 0) { + struct buffer_head *buf; + daddr_t bno; + u32 *indir; + + buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]); + if (!buf || !buffer_mapped(buf)) + goto fail_buf; + + indir = (u32 *)buf->b_data; + bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize); + + brelse(buf); + return bno; + } else + printk(KERN_WARNING "no matching indir?"); + + return 0; + +fail_size: + printk("vxfs: indirect extent too big!\n"); +fail_buf: + return 0; +} + +/** + * vxfs_bmap_indir - recursion for vxfs_bmap_typed + * @ip: pointer to the inode we do bmap for + * @indir: indirect block we start reading at + * @size: size of the typed area to search + * @block: partially result from further searches + * + * Description: + * vxfs_bmap_indir reads a &struct vxfs_typed at @indir + * and performs the type-defined action. + * + * Return Value: + * The physical block number on success, else Zero. + * + * Note: + * Kernelstack is rare. Unrecurse? 
+ */ +static daddr_t +vxfs_bmap_indir(struct inode *ip, long indir, int size, long block) +{ + struct buffer_head *bp = NULL; + daddr_t pblock = 0; + int i; + + for (i = 0; i < size * VXFS_TYPED_PER_BLOCK(ip->i_sb); i++) { + struct vxfs_typed *typ; + int64_t off; + + bp = sb_bread(ip->i_sb, + indir + (i / VXFS_TYPED_PER_BLOCK(ip->i_sb))); + if (!bp || !buffer_mapped(bp)) + return 0; + + typ = ((struct vxfs_typed *)bp->b_data) + + (i % VXFS_TYPED_PER_BLOCK(ip->i_sb)); + off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + + if (block < off) { + brelse(bp); + continue; + } + + switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + case VXFS_TYPED_INDIRECT: + pblock = vxfs_bmap_indir(ip, typ->vt_block, + typ->vt_size, block - off); + if (pblock == -2) + break; + goto out; + case VXFS_TYPED_DATA: + if ((block - off) >= typ->vt_size) + break; + pblock = (typ->vt_block + block - off); + goto out; + case VXFS_TYPED_INDIRECT_DEV4: + case VXFS_TYPED_DATA_DEV4: { + struct vxfs_typed_dev4 *typ4 = + (struct vxfs_typed_dev4 *)typ; + + printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); + printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", + (unsigned long long) typ4->vd4_block, + (unsigned long long) typ4->vd4_size, + typ4->vd4_dev); + goto fail; + } + default: + BUG(); + } + brelse(bp); + } + +fail: + pblock = 0; +out: + brelse(bp); + return (pblock); +} + +/** + * vxfs_bmap_typed - bmap for typed extents + * @ip: pointer to the inode we do bmap for + * @iblock: logical block + * + * Description: + * Performs the bmap operation for typed extents. + * + * Return Value: + * The physical block number on success, else Zero. + */ +static daddr_t +vxfs_bmap_typed(struct inode *ip, long iblock) +{ + struct vxfs_inode_info *vip = VXFS_INO(ip); + daddr_t pblock = 0; + int i; + + for (i = 0; i < VXFS_NTYPED; i++) { + struct vxfs_typed *typ = vip->vii_org.typed + i; + int64_t off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + +#ifdef DIAGNOSTIC + vxfs_typdump(typ); +#endif + if (iblock < off) + continue; + switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + case VXFS_TYPED_INDIRECT: + pblock = vxfs_bmap_indir(ip, typ->vt_block, + typ->vt_size, iblock - off); + if (pblock == -2) + break; + return (pblock); + case VXFS_TYPED_DATA: + if ((iblock - off) < typ->vt_size) + return (typ->vt_block + iblock - off); + break; + case VXFS_TYPED_INDIRECT_DEV4: + case VXFS_TYPED_DATA_DEV4: { + struct vxfs_typed_dev4 *typ4 = + (struct vxfs_typed_dev4 *)typ; + + printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); + printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", + (unsigned long long) typ4->vd4_block, + (unsigned long long) typ4->vd4_size, + typ4->vd4_dev); + return 0; + } + default: + BUG(); + } + } + + return 0; +} + +/** + * vxfs_bmap1 - vxfs-internal bmap operation + * @ip: pointer to the inode we do bmap for + * @iblock: logical block + * + * Description: + * vxfs_bmap1 perfoms a logical to physical block mapping + * for vxfs-internal purposes. + * + * Return Value: + * The physical block number on success, else Zero. 
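+ *
+ * Example (illustrative get_block-style caller; the buffer_head
+ * plumbing lives elsewhere in the driver and is sketched here only
+ * for orientation):
+ *
+ *     daddr_t pblock = vxfs_bmap1(inode, iblock);
+ *
+ *     if (pblock)
+ *             map_bh(bh, inode->i_sb, pblock);
+ *
+ * A return value of zero simply leaves the buffer unmapped.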
+ */ +daddr_t +vxfs_bmap1(struct inode *ip, long iblock) +{ + struct vxfs_inode_info *vip = VXFS_INO(ip); + + if (VXFS_ISEXT4(vip)) + return vxfs_bmap_ext4(ip, iblock); + if (VXFS_ISTYPED(vip)) + return vxfs_bmap_typed(ip, iblock); + if (VXFS_ISNONE(vip)) + goto unsupp; + if (VXFS_ISIMMED(vip)) + goto unsupp; + + printk(KERN_WARNING "vxfs: inode %ld has no valid orgtype (%x)\n", + ip->i_ino, vip->vii_orgtype); + BUG(); + +unsupp: + printk(KERN_WARNING "vxfs: inode %ld has an unsupported orgtype (%x)\n", + ip->i_ino, vip->vii_orgtype); + return 0; +} diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h new file mode 100644 index 00000000..aaf1fb09 --- /dev/null +++ b/fs/freevxfs/vxfs_dir.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_DIR_H_ +#define _VXFS_DIR_H_ + +/* + * Veritas filesystem driver - directory structure. + * + * This file contains the definition of the vxfs directory format. + */ + + +/* + * VxFS directory block header. + * + * This entry is the head of every filesystem block in a directory. + * It is used for free space management and additionally includes + * a hash for speeding up directory search (lookup). + * + * The hash may be empty and in fact we do not use it all in the + * Linux driver for now. + */ +struct vxfs_dirblk { + u_int16_t d_free; /* free space in dirblock */ + u_int16_t d_nhash; /* no of hash chains */ + u_int16_t d_hash[1]; /* hash chain */ +}; + +/* + * VXFS_NAMELEN is the maximum length of the d_name field + * of an VxFS directory entry. + */ +#define VXFS_NAMELEN 256 + +/* + * VxFS directory entry. + */ +struct vxfs_direct { + vx_ino_t d_ino; /* inode number */ + u_int16_t d_reclen; /* record length */ + u_int16_t d_namelen; /* d_name length */ + u_int16_t d_hashnext; /* next hash entry */ + char d_name[VXFS_NAMELEN]; /* name */ +}; + +/* + * VXFS_DIRPAD defines the directory entry boundaries, is _must_ be + * a multiple of four. + * VXFS_NAMEMIN is the length of a directory entry with a NULL d_name. + * VXFS_DIRROUND is an internal macros that rounds a length to a value + * usable for directory sizes. 
+ * VXFS_DIRLEN calculates the directory entry size for an entry with + * a d_name with size len. + */ +#define VXFS_DIRPAD 4 +#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name) +#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) +#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) + +/* + * VXFS_DIRBLKOV is the overhead of a specific dirblock. + */ +#define VXFS_DIRBLKOV(dbp) ((sizeof(short) * dbp->d_nhash) + 4) + +#endif /* _VXFS_DIR_H_ */ diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h new file mode 100644 index 00000000..881aa3d2 --- /dev/null +++ b/fs/freevxfs/vxfs_extern.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_EXTERN_H_ +#define _VXFS_EXTERN_H_ + +/* + * Veritas filesystem driver - external prototypes. + * + * This file contains prototypes for all vxfs functions used + * outside their respective source files. 
+ */ + + +struct kmem_cache; +struct super_block; +struct vxfs_inode_info; +struct inode; + + +/* vxfs_bmap.c */ +extern daddr_t vxfs_bmap1(struct inode *, long); + +/* vxfs_fshead.c */ +extern int vxfs_read_fshead(struct super_block *); + +/* vxfs_immed.c */ +extern const struct inode_operations vxfs_immed_symlink_iops; + +/* vxfs_inode.c */ +extern const struct address_space_operations vxfs_immed_aops; +extern struct kmem_cache *vxfs_inode_cachep; +extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); +extern struct inode * vxfs_get_fake_inode(struct super_block *, + struct vxfs_inode_info *); +extern void vxfs_put_fake_inode(struct inode *); +extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); +extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); +extern struct inode * vxfs_iget(struct super_block *, ino_t); +extern void vxfs_evict_inode(struct inode *); + +/* vxfs_lookup.c */ +extern const struct inode_operations vxfs_dir_inode_ops; +extern const struct file_operations vxfs_dir_operations; + +/* vxfs_olt.c */ +extern int vxfs_read_olt(struct super_block *, u_long); + +/* vxfs_subr.c */ +extern const struct address_space_operations vxfs_aops; +extern struct page * vxfs_get_page(struct address_space *, u_long); +extern void vxfs_put_page(struct page *); +extern struct buffer_head * vxfs_bread(struct inode *, int); + +#endif /* _VXFS_EXTERN_H_ */ diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c new file mode 100644 index 00000000..c9a6a94e --- /dev/null +++ b/fs/freevxfs/vxfs_fshead.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - fileset header routines. 
+ */ +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" +#include "vxfs_fshead.h" + + +#ifdef DIAGNOSTIC +static void +vxfs_dumpfsh(struct vxfs_fsh *fhp) +{ + printk("\n\ndumping fileset header:\n"); + printk("----------------------------\n"); + printk("version: %u\n", fhp->fsh_version); + printk("fsindex: %u\n", fhp->fsh_fsindex); + printk("iauino: %u\tninodes:%u\n", + fhp->fsh_iauino, fhp->fsh_ninodes); + printk("maxinode: %u\tlctino: %u\n", + fhp->fsh_maxinode, fhp->fsh_lctino); + printk("nau: %u\n", fhp->fsh_nau); + printk("ilistino[0]: %u\tilistino[1]: %u\n", + fhp->fsh_ilistino[0], fhp->fsh_ilistino[1]); +} +#endif + +/** + * vxfs_getfsh - read fileset header into memory + * @ip: the (fake) fileset header inode + * @which: 0 for the structural, 1 for the primary fsh. + * + * Description: + * vxfs_getfsh reads either the structural or primary fileset header + * described by @ip into memory. + * + * Returns: + * The fileset header structure on success, else Zero. + */ +static struct vxfs_fsh * +vxfs_getfsh(struct inode *ip, int which) +{ + struct buffer_head *bp; + + bp = vxfs_bread(ip, which); + if (bp) { + struct vxfs_fsh *fhp; + + if (!(fhp = kmalloc(sizeof(*fhp), GFP_KERNEL))) + goto out; + memcpy(fhp, bp->b_data, sizeof(*fhp)); + + put_bh(bp); + return (fhp); + } +out: + brelse(bp); + return NULL; +} + +/** + * vxfs_read_fshead - read the fileset headers + * @sbp: superblock to which the fileset belongs + * + * Description: + * vxfs_read_fshead will fill the inode and structural inode list in @sb. + * + * Returns: + * Zero on success, else a negative error code (-EINVAL). + */ +int +vxfs_read_fshead(struct super_block *sbp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct vxfs_fsh *pfp, *sfp; + struct vxfs_inode_info *vip, *tip; + + vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); + if (!vip) { + printk(KERN_ERR "vxfs: unable to read fsh inode\n"); + return -EINVAL; + } + if (!VXFS_ISFSH(vip)) { + printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n", + vip->vii_mode & VXFS_TYPE_MASK); + goto out_free_fship; + } + + +#ifdef DIAGNOSTIC + printk("vxfs: fsh inode dump:\n"); + vxfs_dumpi(vip, infp->vsi_fshino); +#endif + + infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); + if (!infp->vsi_fship) { + printk(KERN_ERR "vxfs: unable to get fsh inode\n"); + goto out_free_fship; + } + + sfp = vxfs_getfsh(infp->vsi_fship, 0); + if (!sfp) { + printk(KERN_ERR "vxfs: unable to get structural fsh\n"); + goto out_iput_fship; + } + +#ifdef DIAGNOSTIC + vxfs_dumpfsh(sfp); +#endif + + pfp = vxfs_getfsh(infp->vsi_fship, 1); + if (!pfp) { + printk(KERN_ERR "vxfs: unable to get primary fsh\n"); + goto out_free_sfp; + } + +#ifdef DIAGNOSTIC + vxfs_dumpfsh(pfp); +#endif + + tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]); + if (!tip) + goto out_free_pfp; + + infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); + if (!infp->vsi_stilist) { + printk(KERN_ERR "vxfs: unable to get structural list inode\n"); + kfree(tip); + goto out_free_pfp; + } + if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { + printk(KERN_ERR "vxfs: structural list inode is of wrong type (%x)\n", + VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK); + goto out_iput_stilist; + } + + tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]); + if (!tip) + goto out_iput_stilist; + infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); + if (!infp->vsi_ilist) { + printk(KERN_ERR "vxfs: unable to get inode list inode\n"); + kfree(tip); + goto 
out_iput_stilist; + } + if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) { + printk(KERN_ERR "vxfs: inode list inode is of wrong type (%x)\n", + VXFS_INO(infp->vsi_ilist)->vii_mode & VXFS_TYPE_MASK); + goto out_iput_ilist; + } + + return 0; + + out_iput_ilist: + iput(infp->vsi_ilist); + out_iput_stilist: + iput(infp->vsi_stilist); + out_free_pfp: + kfree(pfp); + out_free_sfp: + kfree(sfp); + out_iput_fship: + iput(infp->vsi_fship); + return -EINVAL; + out_free_fship: + kfree(vip); + return -EINVAL; +} diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h new file mode 100644 index 00000000..ead0d640 --- /dev/null +++ b/fs/freevxfs/vxfs_fshead.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_FSHEAD_H_ +#define _VXFS_FSHEAD_H_ + +/* + * Veritas filesystem driver - fileset header structures. + * + * This file contains the physical structure of the VxFS + * fileset header. + */ + + +/* + * Fileset header + */ +struct vxfs_fsh { + u_int32_t fsh_version; /* fileset header version */ + u_int32_t fsh_fsindex; /* fileset index */ + u_int32_t fsh_time; /* modification time - sec */ + u_int32_t fsh_utime; /* modification time - usec */ + u_int32_t fsh_extop; /* extop flags */ + vx_ino_t fsh_ninodes; /* allocated inodes */ + u_int32_t fsh_nau; /* number of IAUs */ + u_int32_t fsh_old_ilesize; /* old size of ilist */ + u_int32_t fsh_dflags; /* flags */ + u_int32_t fsh_quota; /* quota limit */ + vx_ino_t fsh_maxinode; /* maximum inode number */ + vx_ino_t fsh_iauino; /* IAU inode */ + vx_ino_t fsh_ilistino[2]; /* ilist inodes */ + vx_ino_t fsh_lctino; /* link count table inode */ + + /* + * Slightly more fields follow, but they + * a) are not of any interest for us, and + * b) differ a lot in different vxfs versions/ports + */ +}; + +#endif /* _VXFS_FSHEAD_H_ */ diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c new file mode 100644 index 00000000..c36aeaf9 --- /dev/null +++ b/fs/freevxfs/vxfs_immed.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - support for 'immed' inodes. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_extern.h" +#include "vxfs_inode.h" + + +static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *); + +static int vxfs_immed_readpage(struct file *, struct page *); + +/* + * Inode operations for immed symlinks. + * + * Unliked all other operations we do not go through the pagecache, + * but do all work directly on the inode. + */ +const struct inode_operations vxfs_immed_symlink_iops = { + .readlink = generic_readlink, + .follow_link = vxfs_immed_follow_link, +}; + +/* + * Address space operations for immed files and directories. + */ +const struct address_space_operations vxfs_immed_aops = { + .readpage = vxfs_immed_readpage, +}; + +/** + * vxfs_immed_follow_link - follow immed symlink + * @dp: dentry for the link + * @np: pathname lookup data for the current path walk + * + * Description: + * vxfs_immed_follow_link restarts the pathname lookup with + * the data obtained from @dp. + * + * Returns: + * Zero on success, else a negative error code. + */ +static void * +vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) +{ + struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode); + nd_set_link(np, vip->vii_immed.vi_immed); + return NULL; +} + +/** + * vxfs_immed_readpage - read part of an immed inode into pagecache + * @file: file context (unused) + * @page: page frame to fill in. + * + * Description: + * vxfs_immed_readpage reads a part of the immed area of the + * file that hosts @pp into the pagecache. + * + * Returns: + * Zero on success, else a negative error code. + * + * Locking status: + * @page is locked and will be unlocked. 
+ */ +static int +vxfs_immed_readpage(struct file *fp, struct page *pp) +{ + struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + caddr_t kaddr; + + kaddr = kmap(pp); + memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); + kunmap(pp); + + flush_dcache_page(pp); + SetPageUptodate(pp); + unlock_page(pp); + + return 0; +} diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c new file mode 100644 index 00000000..1a431143 --- /dev/null +++ b/fs/freevxfs/vxfs_inode.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - inode routines. + */ +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + + +struct kmem_cache *vxfs_inode_cachep; + + +#ifdef DIAGNOSTIC +/* + * Dump inode contents (partially). + */ +void +vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino) +{ + printk(KERN_DEBUG "\n\n"); + if (ino) + printk(KERN_DEBUG "dumping vxfs inode %ld\n", ino); + else + printk(KERN_DEBUG "dumping unknown vxfs inode\n"); + + printk(KERN_DEBUG "---------------------------\n"); + printk(KERN_DEBUG "mode is %x\n", vip->vii_mode); + printk(KERN_DEBUG "nlink:%u, uid:%u, gid:%u\n", + vip->vii_nlink, vip->vii_uid, vip->vii_gid); + printk(KERN_DEBUG "size:%Lx, blocks:%u\n", + vip->vii_size, vip->vii_blocks); + printk(KERN_DEBUG "orgtype:%u\n", vip->vii_orgtype); +} +#endif + + +/** + * vxfs_blkiget - find inode based on extent # + * @sbp: superblock of the filesystem we search in + * @extent: number of the extent to search + * @ino: inode number to search + * + * Description: + * vxfs_blkiget searches inode @ino in the filesystem described by + * @sbp in the extent @extent. + * Returns the matching VxFS inode on success, else a NULL pointer. + * + * NOTE: + * While __vxfs_iget uses the pagecache vxfs_blkiget uses the + * buffercache. This function should not be used outside the + * read_super() method, otherwise the data may be incoherent. 
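+ *
+ * For example, with a 1024-byte block size and VXFS_ISIZE of 256
+ * bytes, inode 5 of extent E is read from block E + (5 * 256) / 1024,
+ * i.e. E + 1, at byte offset (5 % 4) * 256 = 256 within that block.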
+ */ +struct vxfs_inode_info * +vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino) +{ + struct buffer_head *bp; + u_long block, offset; + + block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize); + offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE); + bp = sb_bread(sbp, block); + + if (bp && buffer_mapped(bp)) { + struct vxfs_inode_info *vip; + struct vxfs_dinode *dip; + + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) + goto fail; + dip = (struct vxfs_dinode *)(bp->b_data + offset); + memcpy(vip, dip, sizeof(*vip)); +#ifdef DIAGNOSTIC + vxfs_dumpi(vip, ino); +#endif + brelse(bp); + return (vip); + } + +fail: + printk(KERN_WARNING "vxfs: unable to read block %ld\n", block); + brelse(bp); + return NULL; +} + +/** + * __vxfs_iget - generic find inode facility + * @sbp: VFS superblock + * @ino: inode number + * @ilistp: inode list + * + * Description: + * Search the for inode number @ino in the filesystem + * described by @sbp. Use the specified inode table (@ilistp). + * Returns the matching VxFS inode on success, else an error code. + */ +static struct vxfs_inode_info * +__vxfs_iget(ino_t ino, struct inode *ilistp) +{ + struct page *pp; + u_long offset; + + offset = (ino % (PAGE_SIZE / VXFS_ISIZE)) * VXFS_ISIZE; + pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE); + + if (!IS_ERR(pp)) { + struct vxfs_inode_info *vip; + struct vxfs_dinode *dip; + caddr_t kaddr = (char *)page_address(pp); + + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) + goto fail; + dip = (struct vxfs_dinode *)(kaddr + offset); + memcpy(vip, dip, sizeof(*vip)); +#ifdef DIAGNOSTIC + vxfs_dumpi(vip, ino); +#endif + vxfs_put_page(pp); + return (vip); + } + + printk(KERN_WARNING "vxfs: error on page %p\n", pp); + return ERR_CAST(pp); + +fail: + printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino); + vxfs_put_page(pp); + return ERR_PTR(-ENOMEM); +} + +/** + * vxfs_stiget - find inode using the structural inode list + * @sbp: VFS superblock + * @ino: inode # + * + * Description: + * Find inode @ino in the filesystem described by @sbp using + * the structural inode list. + * Returns the matching VxFS inode on success, else a NULL pointer. + */ +struct vxfs_inode_info * +vxfs_stiget(struct super_block *sbp, ino_t ino) +{ + struct vxfs_inode_info *vip; + + vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist); + return IS_ERR(vip) ? NULL : vip; +} + +/** + * vxfs_transmod - mode for a VxFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_transmod returns a Linux mode_t for a given + * VxFS inode structure. + */ +static __inline__ mode_t +vxfs_transmod(struct vxfs_inode_info *vip) +{ + mode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; + + if (VXFS_ISFIFO(vip)) + ret |= S_IFIFO; + if (VXFS_ISCHR(vip)) + ret |= S_IFCHR; + if (VXFS_ISDIR(vip)) + ret |= S_IFDIR; + if (VXFS_ISBLK(vip)) + ret |= S_IFBLK; + if (VXFS_ISLNK(vip)) + ret |= S_IFLNK; + if (VXFS_ISREG(vip)) + ret |= S_IFREG; + if (VXFS_ISSOC(vip)) + ret |= S_IFSOCK; + + return (ret); +} + +/** + * vxfs_iinit- helper to fill inode fields + * @ip: VFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_instino is a helper function to fill in all relevant + * fields in @ip from @vip. 
+ */ +static void +vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) +{ + + ip->i_mode = vxfs_transmod(vip); + ip->i_uid = (uid_t)vip->vii_uid; + ip->i_gid = (gid_t)vip->vii_gid; + + ip->i_nlink = vip->vii_nlink; + ip->i_size = vip->vii_size; + + ip->i_atime.tv_sec = vip->vii_atime; + ip->i_ctime.tv_sec = vip->vii_ctime; + ip->i_mtime.tv_sec = vip->vii_mtime; + ip->i_atime.tv_nsec = 0; + ip->i_ctime.tv_nsec = 0; + ip->i_mtime.tv_nsec = 0; + + ip->i_blocks = vip->vii_blocks; + ip->i_generation = vip->vii_gen; + + ip->i_private = vip; + +} + +/** + * vxfs_get_fake_inode - get fake inode structure + * @sbp: filesystem superblock + * @vip: fspriv inode + * + * Description: + * vxfs_fake_inode gets a fake inode (not in the inode hash) for a + * superblock, vxfs_inode pair. + * Returns the filled VFS inode. + */ +struct inode * +vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) +{ + struct inode *ip = NULL; + + if ((ip = new_inode(sbp))) { + ip->i_ino = get_next_ino(); + vxfs_iinit(ip, vip); + ip->i_mapping->a_ops = &vxfs_aops; + } + return (ip); +} + +/** + * vxfs_put_fake_inode - free faked inode + * *ip: VFS inode + * + * Description: + * vxfs_put_fake_inode frees all data associated with @ip. + */ +void +vxfs_put_fake_inode(struct inode *ip) +{ + iput(ip); +} + +/** + * vxfs_iget - get an inode + * @sbp: the superblock to get the inode for + * @ino: the number of the inode to get + * + * Description: + * vxfs_read_inode creates an inode, reads the disk inode for @ino and fills + * in all relevant fields in the new inode. + */ +struct inode * +vxfs_iget(struct super_block *sbp, ino_t ino) +{ + struct vxfs_inode_info *vip; + const struct address_space_operations *aops; + struct inode *ip; + + ip = iget_locked(sbp, ino); + if (!ip) + return ERR_PTR(-ENOMEM); + if (!(ip->i_state & I_NEW)) + return ip; + + vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist); + if (IS_ERR(vip)) { + iget_failed(ip); + return ERR_CAST(vip); + } + + vxfs_iinit(ip, vip); + + if (VXFS_ISIMMED(vip)) + aops = &vxfs_immed_aops; + else + aops = &vxfs_aops; + + if (S_ISREG(ip->i_mode)) { + ip->i_fop = &generic_ro_fops; + ip->i_mapping->a_ops = aops; + } else if (S_ISDIR(ip->i_mode)) { + ip->i_op = &vxfs_dir_inode_ops; + ip->i_fop = &vxfs_dir_operations; + ip->i_mapping->a_ops = aops; + } else if (S_ISLNK(ip->i_mode)) { + if (!VXFS_ISIMMED(vip)) { + ip->i_op = &page_symlink_inode_operations; + ip->i_mapping->a_ops = &vxfs_aops; + } else { + ip->i_op = &vxfs_immed_symlink_iops; + vip->vii_immed.vi_immed[ip->i_size] = '\0'; + } + } else + init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); + + unlock_new_inode(ip); + return ip; +} + +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(vxfs_inode_cachep, inode->i_private); +} + +/** + * vxfs_evict_inode - remove inode from main memory + * @ip: inode to discard. + * + * Description: + * vxfs_evict_inode() is called on the final iput and frees the private + * inode area. + */ +void +vxfs_evict_inode(struct inode *ip) +{ + truncate_inode_pages(&ip->i_data, 0); + end_writeback(ip); + call_rcu(&ip->i_rcu, vxfs_i_callback); +} diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h new file mode 100644 index 00000000..240aeb11 --- /dev/null +++ b/fs/freevxfs/vxfs_inode.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_INODE_H_ +#define _VXFS_INODE_H_ + +/* + * Veritas filesystem driver - inode structure. + * + * This file contains the definition of the disk and core + * inodes of the Veritas Filesystem. + */ + + +#define VXFS_ISIZE 0x100 /* Inode size */ + +#define VXFS_NDADDR 10 /* Number of direct addrs in inode */ +#define VXFS_NIADDR 2 /* Number of indirect addrs in inode */ +#define VXFS_NIMMED 96 /* Size of immediate data in inode */ +#define VXFS_NTYPED 6 /* Num of typed extents */ + +#define VXFS_TYPED_OFFSETMASK (0x00FFFFFFFFFFFFFFULL) +#define VXFS_TYPED_TYPEMASK (0xFF00000000000000ULL) +#define VXFS_TYPED_TYPESHIFT 56 + +#define VXFS_TYPED_PER_BLOCK(sbp) \ + ((sbp)->s_blocksize / sizeof(struct vxfs_typed)) + +/* + * Possible extent descriptor types for %VXFS_ORG_TYPED extents. + */ +enum { + VXFS_TYPED_INDIRECT = 1, + VXFS_TYPED_DATA = 2, + VXFS_TYPED_INDIRECT_DEV4 = 3, + VXFS_TYPED_DATA_DEV4 = 4, +}; + +/* + * Data stored immediately in the inode. + */ +struct vxfs_immed { + u_int8_t vi_immed[VXFS_NIMMED]; +}; + +struct vxfs_ext4 { + u_int32_t ve4_spare; /* ?? */ + u_int32_t ve4_indsize; /* Indirect extent size */ + vx_daddr_t ve4_indir[VXFS_NIADDR]; /* Indirect extents */ + struct direct { /* Direct extents */ + vx_daddr_t extent; /* Extent number */ + int32_t size; /* Size of extent */ + } ve4_direct[VXFS_NDADDR]; +}; + +struct vxfs_typed { + u_int64_t vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + vx_daddr_t vt_block; /* Extent block */ + int32_t vt_size; /* Size in blocks */ +}; + +struct vxfs_typed_dev4 { + u_int64_t vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + u_int64_t vd4_block; /* Extent block */ + u_int64_t vd4_size; /* Size in blocks */ + int32_t vd4_dev; /* Device ID */ + u_int32_t __pad1; +}; + +/* + * The inode as contained on the physical device. 
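The vt_hdr field of struct vxfs_typed packs the descriptor type into the top byte and the logical offset into the low 56 bits, as the three VXFS_TYPED_* constants above describe. A small stand-alone sketch of splitting such a header; the sample value is made up, only the masks and shift are taken from this header.

#include <stdio.h>
#include <stdint.h>

#define VXFS_TYPED_OFFSETMASK	(0x00FFFFFFFFFFFFFFULL)
#define VXFS_TYPED_TYPEMASK	(0xFF00000000000000ULL)
#define VXFS_TYPED_TYPESHIFT	56

int main(void)
{
	/* hypothetical header: type 2 (VXFS_TYPED_DATA), logical offset 0x1234 */
	uint64_t hdr = ((uint64_t)2 << VXFS_TYPED_TYPESHIFT) | 0x1234;

	unsigned type   = (unsigned)((hdr & VXFS_TYPED_TYPEMASK) >> VXFS_TYPED_TYPESHIFT);
	uint64_t offset = hdr & VXFS_TYPED_OFFSETMASK;

	printf("type %u, offset %llu\n", type, (unsigned long long)offset);
	return 0;
}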
+ */ +struct vxfs_dinode { + int32_t vdi_mode; + u_int32_t vdi_nlink; /* Link count */ + u_int32_t vdi_uid; /* UID */ + u_int32_t vdi_gid; /* GID */ + u_int64_t vdi_size; /* Inode size in bytes */ + u_int32_t vdi_atime; /* Last time accessed - sec */ + u_int32_t vdi_autime; /* Last time accessed - usec */ + u_int32_t vdi_mtime; /* Last modify time - sec */ + u_int32_t vdi_mutime; /* Last modify time - usec */ + u_int32_t vdi_ctime; /* Create time - sec */ + u_int32_t vdi_cutime; /* Create time - usec */ + u_int8_t vdi_aflags; /* Allocation flags */ + u_int8_t vdi_orgtype; /* Organisation type */ + u_int16_t vdi_eopflags; + u_int32_t vdi_eopdata; + union { + u_int32_t rdev; + u_int32_t dotdot; + struct { + u_int32_t reserved; + u_int32_t fixextsize; + } i_regular; + struct { + u_int32_t matchino; + u_int32_t fsetindex; + } i_vxspec; + u_int64_t align; + } vdi_ftarea; + u_int32_t vdi_blocks; /* How much blocks does inode occupy */ + u_int32_t vdi_gen; /* Inode generation */ + u_int64_t vdi_version; /* Version */ + union { + struct vxfs_immed immed; + struct vxfs_ext4 ext4; + struct vxfs_typed typed[VXFS_NTYPED]; + } vdi_org; + u_int32_t vdi_iattrino; +}; + +#define vdi_rdev vdi_ftarea.rdev +#define vdi_dotdot vdi_ftarea.dotdot +#define vdi_fixextsize vdi_ftarea.regular.fixextsize +#define vdi_matchino vdi_ftarea.vxspec.matchino +#define vdi_fsetindex vdi_ftarea.vxspec.fsetindex + +#define vdi_immed vdi_org.immed +#define vdi_ext4 vdi_org.ext4 +#define vdi_typed vdi_org.typed + + +/* + * The inode as represented in the main memory. + * + * TBD: This should become a separate structure... + */ +#define vxfs_inode_info vxfs_dinode + +#define vii_mode vdi_mode +#define vii_uid vdi_uid +#define vii_gid vdi_gid +#define vii_nlink vdi_nlink +#define vii_size vdi_size +#define vii_atime vdi_atime +#define vii_ctime vdi_ctime +#define vii_mtime vdi_mtime +#define vii_blocks vdi_blocks +#define vii_org vdi_org +#define vii_orgtype vdi_orgtype +#define vii_gen vdi_gen + +#define vii_rdev vdi_ftarea.rdev +#define vii_dotdot vdi_ftarea.dotdot +#define vii_fixextsize vdi_ftarea.regular.fixextsize +#define vii_matchino vdi_ftarea.vxspec.matchino +#define vii_fsetindex vdi_ftarea.vxspec.fsetindex + +#define vii_immed vdi_org.immed +#define vii_ext4 vdi_org.ext4 +#define vii_typed vdi_org.typed + +#endif /* _VXFS_INODE_H_ */ diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c new file mode 100644 index 00000000..3360f1e6 --- /dev/null +++ b/fs/freevxfs/vxfs_lookup.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - lookup and other directory related code. + */ +#include +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_dir.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + +/* + * Number of VxFS blocks per page. + */ +#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) + + +static struct dentry * vxfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static int vxfs_readdir(struct file *, void *, filldir_t); + +const struct inode_operations vxfs_dir_inode_ops = { + .lookup = vxfs_lookup, +}; + +const struct file_operations vxfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = vxfs_readdir, +}; + + +static inline u_long +dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static inline u_long +dir_blocks(struct inode *ip) +{ + u_long bsize = ip->i_sb->s_blocksize; + return (ip->i_size + bsize - 1) & ~(bsize - 1); +} + +/* + * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure. + * + * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller. + */ +static inline int +vxfs_match(int len, const char * const name, struct vxfs_direct *de) +{ + if (len != de->d_namelen) + return 0; + if (!de->d_ino) + return 0; + return !memcmp(name, de->d_name, len); +} + +static inline struct vxfs_direct * +vxfs_next_entry(struct vxfs_direct *de) +{ + return ((struct vxfs_direct *)((char*)de + de->d_reclen)); +} + +/** + * vxfs_find_entry - find a mathing directory entry for a dentry + * @ip: directory inode + * @dp: dentry for which we want to find a direct + * @ppp: gets filled with the page the return value sits in + * + * Description: + * vxfs_find_entry finds a &struct vxfs_direct for the VFS directory + * cache entry @dp. @ppp will be filled with the page the return + * value resides in. + * + * Returns: + * The wanted direct on success, else a NULL pointer. 
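The directory helpers above (vxfs_match(), vxfs_next_entry()) and vxfs_find_entry(), whose body follows, all rely on the same walk over variable-length records: stop on a zero d_reclen, skip entries with a zero d_ino, compare the length before the bytes. The sketch below reproduces that walk in user space under assumed types; struct demo_direct is a simplified stand-in for struct vxfs_direct (the real on-disk layout lives in vxfs_dir.h, which is not shown here), and demo_lookup() is invented for illustration.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* simplified stand-in for struct vxfs_direct */
struct demo_direct {
	uint32_t d_ino;		/* 0 means unused slot */
	uint16_t d_reclen;	/* distance to the next record */
	uint16_t d_namelen;
	char     d_name[32];
};

/* walk one directory block the way vxfs_find_entry() does */
static uint32_t demo_lookup(char *block, size_t bsize,
			    const char *name, size_t len)
{
	char *p = block, *limit = block + bsize;

	while (p + sizeof(struct demo_direct) <= limit) {
		struct demo_direct *de = (struct demo_direct *)p;

		if (!de->d_reclen)
			break;
		if (de->d_ino && len == de->d_namelen &&
		    !memcmp(name, de->d_name, len))
			return de->d_ino;
		p += de->d_reclen;
	}
	return 0;
}

int main(void)
{
	struct demo_direct block[4] = { { 0 } };

	block[0].d_ino = 17;
	block[0].d_reclen = sizeof(struct demo_direct);
	block[0].d_namelen = 5;
	memcpy(block[0].d_name, "hello", 5);

	printf("inode of \"hello\": %u\n",
	       demo_lookup((char *)block, sizeof(block), "hello", 5));
	return 0;
}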
+ */ +static struct vxfs_direct * +vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp) +{ + u_long npages, page, nblocks, pblocks, block; + u_long bsize = ip->i_sb->s_blocksize; + const char *name = dp->d_name.name; + int namelen = dp->d_name.len; + + npages = dir_pages(ip); + nblocks = dir_blocks(ip); + pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb); + + for (page = 0; page < npages; page++) { + caddr_t kaddr; + struct page *pp; + + pp = vxfs_get_page(ip->i_mapping, page); + if (IS_ERR(pp)) + continue; + kaddr = (caddr_t)page_address(pp); + + for (block = 0; block <= nblocks && block <= pblocks; block++) { + caddr_t baddr, limit; + struct vxfs_dirblk *dbp; + struct vxfs_direct *de; + + baddr = kaddr + (block * bsize); + limit = baddr + bsize - VXFS_DIRLEN(1); + + dbp = (struct vxfs_dirblk *)baddr; + de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp)); + + for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { + if (!de->d_reclen) + break; + if (!de->d_ino) + continue; + if (vxfs_match(namelen, name, de)) { + *ppp = pp; + return (de); + } + } + } + vxfs_put_page(pp); + } + + return NULL; +} + +/** + * vxfs_inode_by_name - find inode number for dentry + * @dip: directory to search in + * @dp: dentry we search for + * + * Description: + * vxfs_inode_by_name finds out the inode number of + * the path component described by @dp in @dip. + * + * Returns: + * The wanted inode number on success, else Zero. + */ +static ino_t +vxfs_inode_by_name(struct inode *dip, struct dentry *dp) +{ + struct vxfs_direct *de; + struct page *pp; + ino_t ino = 0; + + de = vxfs_find_entry(dip, dp, &pp); + if (de) { + ino = de->d_ino; + kunmap(pp); + page_cache_release(pp); + } + + return (ino); +} + +/** + * vxfs_lookup - lookup pathname component + * @dip: dir in which we lookup + * @dp: dentry we lookup + * @nd: lookup nameidata + * + * Description: + * vxfs_lookup tries to lookup the pathname component described + * by @dp in @dip. + * + * Returns: + * A NULL-pointer on success, else an negative error code encoded + * in the return pointer. + */ +static struct dentry * +vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd) +{ + struct inode *ip = NULL; + ino_t ino; + + if (dp->d_name.len > VXFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = vxfs_inode_by_name(dip, dp); + if (ino) { + ip = vxfs_iget(dip->i_sb, ino); + if (IS_ERR(ip)) + return ERR_CAST(ip); + } + d_add(dp, ip); + return NULL; +} + +/** + * vxfs_readdir - read a directory + * @fp: the directory to read + * @retp: return buffer + * @filler: filldir callback + * + * Description: + * vxfs_readdir fills @retp with directory entries from @fp + * using the VFS supplied callback @filler. + * + * Returns: + * Zero. 
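vxfs_readdir(), whose body follows, encodes the directory position as ((page << PAGE_CACHE_SHIFT) | offset) + 2, with positions 0 and 1 reserved for "." and "..". The sketch below reproduces that encoding and its decoding in user space, assuming 4 KiB pages; the DEMO_* macros are illustrative stand-ins for the kernel's page constants.

#include <stdio.h>

#define DEMO_PAGE_SHIFT	12			/* assume 4 KiB pages */
#define DEMO_PAGE_SIZE	(1UL << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))

int main(void)
{
	/* entry at byte 0x180 of directory page 3; 0 and 1 are "." and ".." */
	unsigned long pos = ((3UL << DEMO_PAGE_SHIFT) | 0x180) + 2;

	/* decoding, as done at the top of vxfs_readdir() */
	unsigned long raw    = pos - 2;
	unsigned long page   = raw >> DEMO_PAGE_SHIFT;
	unsigned long offset = raw & ~DEMO_PAGE_MASK;

	printf("pos %lu -> page %lu, offset %#lx\n", pos, page, offset);
	return 0;
}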
+ */ +static int +vxfs_readdir(struct file *fp, void *retp, filldir_t filler) +{ + struct inode *ip = fp->f_path.dentry->d_inode; + struct super_block *sbp = ip->i_sb; + u_long bsize = sbp->s_blocksize; + u_long page, npages, block, pblocks, nblocks, offset; + loff_t pos; + + switch ((long)fp->f_pos) { + case 0: + if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) + goto out; + fp->f_pos++; + /* fallthrough */ + case 1: + if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0) + goto out; + fp->f_pos++; + /* fallthrough */ + } + + pos = fp->f_pos - 2; + + if (pos > VXFS_DIRROUND(ip->i_size)) + return 0; + + npages = dir_pages(ip); + nblocks = dir_blocks(ip); + pblocks = VXFS_BLOCK_PER_PAGE(sbp); + + page = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; + + for (; page < npages; page++, block = 0) { + caddr_t kaddr; + struct page *pp; + + pp = vxfs_get_page(ip->i_mapping, page); + if (IS_ERR(pp)) + continue; + kaddr = (caddr_t)page_address(pp); + + for (; block <= nblocks && block <= pblocks; block++) { + caddr_t baddr, limit; + struct vxfs_dirblk *dbp; + struct vxfs_direct *de; + + baddr = kaddr + (block * bsize); + limit = baddr + bsize - VXFS_DIRLEN(1); + + dbp = (struct vxfs_dirblk *)baddr; + de = (struct vxfs_direct *) + (offset ? + (kaddr + offset) : + (baddr + VXFS_DIRBLKOV(dbp))); + + for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { + int over; + + if (!de->d_reclen) + break; + if (!de->d_ino) + continue; + + offset = (caddr_t)de - kaddr; + over = filler(retp, de->d_name, de->d_namelen, + ((page << PAGE_CACHE_SHIFT) | offset) + 2, + de->d_ino, DT_UNKNOWN); + if (over) { + vxfs_put_page(pp); + goto done; + } + } + offset = 0; + } + vxfs_put_page(pp); + offset = 0; + } + +done: + fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; +out: + return 0; +} diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c new file mode 100644 index 00000000..04950084 --- /dev/null +++ b/fs/freevxfs/vxfs_olt.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Veritas filesystem driver - object location table support. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_olt.h" +#include "vxfs_extern.h" + + +static inline void +vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) +{ + BUG_ON(infp->vsi_fshino); + infp->vsi_fshino = fshp->olt_fsino[0]; +} + +static inline void +vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) +{ + BUG_ON(infp->vsi_iext); + infp->vsi_iext = ilistp->olt_iext[0]; +} + +static inline u_long +vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) +{ + BUG_ON(sbp->s_blocksize % bsize); + return (block * (sbp->s_blocksize / bsize)); +} + + +/** + * vxfs_read_olt - read olt + * @sbp: superblock of the filesystem + * @bsize: blocksize of the filesystem + * + * Description: + * vxfs_read_olt reads the olt of the filesystem described by @sbp + * into main memory and does some basic setup. + * + * Returns: + * Zero on success, else a negative error code. + */ +int +vxfs_read_olt(struct super_block *sbp, u_long bsize) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct buffer_head *bp; + struct vxfs_olt *op; + char *oaddr, *eaddr; + + + bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize)); + if (!bp || !bp->b_data) + goto fail; + + op = (struct vxfs_olt *)bp->b_data; + if (op->olt_magic != VXFS_OLT_MAGIC) { + printk(KERN_NOTICE "vxfs: ivalid olt magic number\n"); + goto fail; + } + + /* + * It is in theory possible that vsi_oltsize is > 1. + * I've not seen any such filesystem yet and I'm lazy.. --hch + */ + if (infp->vsi_oltsize > 1) { + printk(KERN_NOTICE "vxfs: oltsize > 1 detected.\n"); + printk(KERN_NOTICE "vxfs: please notify hch@infradead.org\n"); + goto fail; + } + + oaddr = bp->b_data + op->olt_size; + eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize); + + while (oaddr < eaddr) { + struct vxfs_oltcommon *ocp = + (struct vxfs_oltcommon *)oaddr; + + switch (ocp->olt_type) { + case VXFS_OLT_FSHEAD: + vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp); + break; + case VXFS_OLT_ILIST: + vxfs_get_ilist((struct vxfs_oltilist *)oaddr, infp); + break; + } + + oaddr += ocp->olt_size; + } + + brelse(bp); + return 0; + +fail: + brelse(bp); + return -EINVAL; +} diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h new file mode 100644 index 00000000..b7b3af50 --- /dev/null +++ b/fs/freevxfs/vxfs_olt.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
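vxfs_read_olt() above walks the OLT extent as a sequence of variable-sized records, each starting with a type/size pair, and advances by olt_size. A user-space sketch of that traversal under assumed types; struct demo_oltcommon and demo_walk_olt() are illustrative stand-ins, and the zero-size guard is an addition not present in the driver.

#include <stdio.h>
#include <stdint.h>

/* stand-in for the on-disk record header declared in vxfs_olt.h */
struct demo_oltcommon {
	uint32_t olt_type;
	uint32_t olt_size;
};

enum { DEMO_OLT_FSHEAD = 2, DEMO_OLT_ILIST = 4 };

static void demo_walk_olt(const char *buf, size_t len)
{
	const char *p = buf, *end = buf + len;

	while (p + sizeof(struct demo_oltcommon) <= end) {
		const struct demo_oltcommon *oc = (const void *)p;

		if (!oc->olt_size)
			break;	/* guard against a malformed record */
		switch (oc->olt_type) {
		case DEMO_OLT_FSHEAD:
			printf("fileset header record, %u bytes\n", oc->olt_size);
			break;
		case DEMO_OLT_ILIST:
			printf("initial inode list record, %u bytes\n", oc->olt_size);
			break;
		default:
			printf("record type %u, %u bytes\n", oc->olt_type, oc->olt_size);
		}
		p += oc->olt_size;
	}
}

int main(void)
{
	uint32_t buf[16] = { DEMO_OLT_FSHEAD, 16 };	/* one record, rest zero */

	demo_walk_olt((const char *)buf, sizeof(buf));
	return 0;
}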
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_OLT_H_ +#define _VXFS_OLT_H_ + +/* + * Veritas filesystem driver - Object Location Table data structures. + * + * This file contains definitions for the Object Location Table used + * by the Veritas Filesystem version 2 and newer. + */ + + +/* + * OLT magic number (vxfs_olt->olt_magic). + */ +#define VXFS_OLT_MAGIC 0xa504FCF5 + +/* + * VxFS OLT entry types. + */ +enum { + VXFS_OLT_FREE = 1, + VXFS_OLT_FSHEAD = 2, + VXFS_OLT_CUT = 3, + VXFS_OLT_ILIST = 4, + VXFS_OLT_DEV = 5, + VXFS_OLT_SB = 6 +}; + +/* + * VxFS OLT header. + * + * The Object Location Table header is placed at the beginning of each + * OLT extent. It is used to fing certain filesystem-wide metadata, e.g. + * the initial inode list, the fileset header or the device configuration. + */ +struct vxfs_olt { + u_int32_t olt_magic; /* magic number */ + u_int32_t olt_size; /* size of this entry */ + u_int32_t olt_checksum; /* checksum of extent */ + u_int32_t __unused1; /* ??? */ + u_int32_t olt_mtime; /* time of last mod. (sec) */ + u_int32_t olt_mutime; /* time of last mod. (usec) */ + u_int32_t olt_totfree; /* free space in OLT extent */ + vx_daddr_t olt_extents[2]; /* addr of this extent, replica */ + u_int32_t olt_esize; /* size of this extent */ + vx_daddr_t olt_next[2]; /* addr of next extent, replica */ + u_int32_t olt_nsize; /* size of next extent */ + u_int32_t __unused2; /* align to 8 byte boundary */ +}; + +/* + * VxFS common OLT entry (on disk). + */ +struct vxfs_oltcommon { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ +}; + +/* + * VxFS free OLT entry (on disk). + */ +struct vxfs_oltfree { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_fsize; /* size of this free record */ +}; + +/* + * VxFS initial-inode list (on disk). + */ +struct vxfs_oltilist { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_iext[2]; /* initial inode list, replica */ +}; + +/* + * Current Usage Table + */ +struct vxfs_oltcut { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_cutino; /* inode of current usage table */ + u_int32_t __pad; /* unused, 8 byte align */ +}; + +/* + * Inodes containing Superblock, Intent log and OLTs + */ +struct vxfs_oltsb { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_sbino; /* inode of superblock file */ + u_int32_t __unused1; /* ??? 
*/ + vx_ino_t olt_logino[2]; /* inode of log file,replica */ + vx_ino_t olt_oltino[2]; /* inode of OLT, replica */ +}; + +/* + * Inode containing device configuration + it's replica + */ +struct vxfs_oltdev { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_devino[2]; /* inode of device config files */ +}; + +/* + * Fileset header + */ +struct vxfs_oltfshead { + u_int32_t olt_type; /* type number */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_fsino[2]; /* inodes of fileset header */ +}; + +#endif /* _VXFS_OLT_H_ */ diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c new file mode 100644 index 00000000..5d318c44 --- /dev/null +++ b/fs/freevxfs/vxfs_subr.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - shared subroutines. + */ +#include +#include +#include +#include + +#include "vxfs_extern.h" + + +static int vxfs_readpage(struct file *, struct page *); +static sector_t vxfs_bmap(struct address_space *, sector_t); + +const struct address_space_operations vxfs_aops = { + .readpage = vxfs_readpage, + .bmap = vxfs_bmap, +}; + +inline void +vxfs_put_page(struct page *pp) +{ + kunmap(pp); + page_cache_release(pp); +} + +/** + * vxfs_get_page - read a page into memory. + * @ip: inode to read from + * @n: page number + * + * Description: + * vxfs_get_page reads the @n th page of @ip into the pagecache. + * + * Returns: + * The wanted page on success, else a NULL pointer. + */ +struct page * +vxfs_get_page(struct address_space *mapping, u_long n) +{ + struct page * pp; + + pp = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(pp)) { + kmap(pp); + /** if (!PageChecked(pp)) **/ + /** vxfs_check_page(pp); **/ + if (PageError(pp)) + goto fail; + } + + return (pp); + +fail: + vxfs_put_page(pp); + return ERR_PTR(-EIO); +} + +/** + * vxfs_bread - read buffer for a give inode,block tuple + * @ip: inode + * @block: logical block + * + * Description: + * The vxfs_bread function reads block no @block of + * @ip into the buffercache. 
+ * + * Returns: + * The resulting &struct buffer_head. + */ +struct buffer_head * +vxfs_bread(struct inode *ip, int block) +{ + struct buffer_head *bp; + daddr_t pblock; + + pblock = vxfs_bmap1(ip, block); + bp = sb_bread(ip->i_sb, pblock); + + return (bp); +} + +/** + * vxfs_get_block - locate buffer for given inode,block tuple + * @ip: inode + * @iblock: logical block + * @bp: buffer skeleton + * @create: %TRUE if blocks may be newly allocated. + * + * Description: + * The vxfs_get_block function fills @bp with the right physical + * block and device number to perform a lowlevel read/write on + * it. + * + * Returns: + * Zero on success, else a negativ error code (-EIO). + */ +static int +vxfs_getblk(struct inode *ip, sector_t iblock, + struct buffer_head *bp, int create) +{ + daddr_t pblock; + + pblock = vxfs_bmap1(ip, iblock); + if (pblock != 0) { + map_bh(bp, ip->i_sb, pblock); + return 0; + } + + return -EIO; +} + +/** + * vxfs_readpage - read one page synchronously into the pagecache + * @file: file context (unused) + * @page: page frame to fill in. + * + * Description: + * The vxfs_readpage routine reads @page synchronously into the + * pagecache. + * + * Returns: + * Zero on success, else a negative error code. + * + * Locking status: + * @page is locked and will be unlocked. + */ +static int +vxfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, vxfs_getblk); +} + +/** + * vxfs_bmap - perform logical to physical block mapping + * @mapping: logical to physical mapping to use + * @block: logical block (relative to @mapping). + * + * Description: + * Vxfs_bmap find out the corresponding phsical block to the + * @mapping, @block pair. + * + * Returns: + * Physical block number on success, else Zero. + * + * Locking status: + * We are under the bkl. + */ +static sector_t +vxfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, vxfs_getblk); +} diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c new file mode 100644 index 00000000..9d1c9955 --- /dev/null +++ b/fs/freevxfs/vxfs_super.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
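The vxfs_bmap() address_space operation above is what ends up servicing the legacy FIBMAP ioctl from user space. A small sketch of querying it; the file path is only an example and the call typically requires root (CAP_SYS_RAWIO).

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIBMAP */

int main(void)
{
	int fd = open("/mnt/vxfs/somefile", O_RDONLY);	/* example path */
	int block = 0;					/* logical block 0 */

	if (fd < 0)
		return 1;
	/* FIBMAP maps a logical block to a physical block via a_ops->bmap */
	if (ioctl(fd, FIBMAP, &block) == 0)
		printf("logical block 0 -> physical block %d\n", block);
	close(fd);
	return 0;
}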
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - superblock related routines. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_extern.h" +#include "vxfs_dir.h" +#include "vxfs_inode.h" + + +MODULE_AUTHOR("Christoph Hellwig"); +MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ + + +static void vxfs_put_super(struct super_block *); +static int vxfs_statfs(struct dentry *, struct kstatfs *); +static int vxfs_remount(struct super_block *, int *, char *); + +static const struct super_operations vxfs_super_ops = { + .evict_inode = vxfs_evict_inode, + .put_super = vxfs_put_super, + .statfs = vxfs_statfs, + .remount_fs = vxfs_remount, +}; + +/** + * vxfs_put_super - free superblock resources + * @sbp: VFS superblock. + * + * Description: + * vxfs_put_super frees all resources allocated for @sbp + * after the last instance of the filesystem is unmounted. + */ + +static void +vxfs_put_super(struct super_block *sbp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); + + brelse(infp->vsi_bp); + kfree(infp); +} + +/** + * vxfs_statfs - get filesystem information + * @dentry: VFS dentry to locate superblock + * @bufp: output buffer + * + * Description: + * vxfs_statfs fills the statfs buffer @bufp with information + * about the filesystem described by @dentry. + * + * Returns: + * Zero. + * + * Locking: + * No locks held. + * + * Notes: + * This is everything but complete... + */ +static int +vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); + + bufp->f_type = VXFS_SUPER_MAGIC; + bufp->f_bsize = dentry->d_sb->s_blocksize; + bufp->f_blocks = infp->vsi_raw->vs_dsize; + bufp->f_bfree = infp->vsi_raw->vs_free; + bufp->f_bavail = 0; + bufp->f_files = 0; + bufp->f_ffree = infp->vsi_raw->vs_ifree; + bufp->f_namelen = VXFS_NAMELEN; + + return 0; +} + +static int vxfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +/** + * vxfs_read_super - read superblock into memory and initialize filesystem + * @sbp: VFS superblock (to fill) + * @dp: fs private mount data + * @silent: do not complain loudly when sth is wrong + * + * Description: + * We are called on the first mount of a filesystem to read the + * superblock into memory and do some basic setup. + * + * Returns: + * The superblock on success, else %NULL. + * + * Locking: + * We are under @sbp->s_lock. 
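vxfs_statfs() above fills only a handful of fields; from user space the result is visible through statfs(2). A minimal sketch, with the mount point path as an example only:

#include <stdio.h>
#include <sys/vfs.h>	/* statfs() */

int main(void)
{
	struct statfs st;

	if (statfs("/mnt/vxfs", &st) != 0)	/* example mount point */
		return 1;

	printf("magic 0x%lx, bsize %ld, blocks %llu, free %llu, namelen %ld\n",
	       (unsigned long)st.f_type, (long)st.f_bsize,
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree, (long)st.f_namelen);
	return 0;
}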
+ */ +static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) +{ + struct vxfs_sb_info *infp; + struct vxfs_sb *rsbp; + struct buffer_head *bp = NULL; + u_long bsize; + struct inode *root; + int ret = -EINVAL; + + sbp->s_flags |= MS_RDONLY; + + infp = kzalloc(sizeof(*infp), GFP_KERNEL); + if (!infp) { + printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); + return -ENOMEM; + } + + bsize = sb_min_blocksize(sbp, BLOCK_SIZE); + if (!bsize) { + printk(KERN_WARNING "vxfs: unable to set blocksize\n"); + goto out; + } + + bp = sb_bread(sbp, 1); + if (!bp || !buffer_mapped(bp)) { + if (!silent) { + printk(KERN_WARNING + "vxfs: unable to read disk superblock\n"); + } + goto out; + } + + rsbp = (struct vxfs_sb *)bp->b_data; + if (rsbp->vs_magic != VXFS_SUPER_MAGIC) { + if (!silent) + printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); + goto out; + } + + if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) { + printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", + rsbp->vs_version); + goto out; + } + +#ifdef DIAGNOSTIC + printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version); + printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize); +#endif + + sbp->s_magic = rsbp->vs_magic; + sbp->s_fs_info = infp; + + infp->vsi_raw = rsbp; + infp->vsi_bp = bp; + infp->vsi_oltext = rsbp->vs_oltext[0]; + infp->vsi_oltsize = rsbp->vs_oltsize; + + if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) { + printk(KERN_WARNING "vxfs: unable to set final block size\n"); + goto out; + } + + if (vxfs_read_olt(sbp, bsize)) { + printk(KERN_WARNING "vxfs: unable to read olt\n"); + goto out; + } + + if (vxfs_read_fshead(sbp)) { + printk(KERN_WARNING "vxfs: unable to read fshead\n"); + goto out; + } + + sbp->s_op = &vxfs_super_ops; + root = vxfs_iget(sbp, VXFS_ROOT_INO); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + sbp->s_root = d_alloc_root(root); + if (!sbp->s_root) { + iput(root); + printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); + goto out_free_ilist; + } + + return 0; + +out_free_ilist: + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); +out: + brelse(bp); + kfree(infp); + return ret; +} + +/* + * The usual module blurb. + */ +static struct dentry *vxfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); +} + +static struct file_system_type vxfs_fs_type = { + .owner = THIS_MODULE, + .name = "vxfs", + .mount = vxfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init +vxfs_init(void) +{ + int rv; + + vxfs_inode_cachep = kmem_cache_create("vxfs_inode", + sizeof(struct vxfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!vxfs_inode_cachep) + return -ENOMEM; + rv = register_filesystem(&vxfs_fs_type); + if (rv < 0) + kmem_cache_destroy(vxfs_inode_cachep); + return rv; +} + +static void __exit +vxfs_cleanup(void) +{ + unregister_filesystem(&vxfs_fs_type); + kmem_cache_destroy(vxfs_inode_cachep); +} + +module_init(vxfs_init); +module_exit(vxfs_cleanup); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c new file mode 100644 index 00000000..539d0d25 --- /dev/null +++ b/fs/fs-writeback.c @@ -0,0 +1,1373 @@ +/* + * fs/fs-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. 
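Since vxfs_fill_super() above forces MS_RDONLY, a mount only needs the filesystem type and a read-only request. A sketch of the equivalent mount(2) call; the device and mount point are examples.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of: mount -t vxfs -o ro /dev/sdb1 /mnt/vxfs */
	if (mount("/dev/sdb1", "/mnt/vxfs", "vxfs", MS_RDONLY, NULL) != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}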
+ * + * Contains all the functions related to writing back and waiting + * upon dirty inodes against superblocks, and writing back dirty + * pages against inodes. ie: data writeback. Writeout of the + * inode itself is not handled here. + * + * 10Apr2002 Andrew Morton + * Split out of fs/inode.c + * Additions for address_space-based writeback + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_work { + long nr_pages; + struct super_block *sb; + enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + + struct list_head list; /* pending work list */ + struct completion *done; /* set if the caller waits */ +}; + +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + +/** + * writeback_in_progress - determine whether there is writeback in progress + * @bdi: the device's backing_dev_info structure. + * + * Determine whether there is writeback waiting to be handled against a + * backing device. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_writeback_running, &bdi->state); +} + +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (strcmp(sb->s_type->name, "bdev") == 0) + return inode->i_mapping->backing_dev_info; + + return sb->s_bdi; +} + +static inline struct inode *wb_inode(struct list_head *head) +{ + return list_entry(head, struct inode, i_wb_list); +} + +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) +{ + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ + wake_up_process(default_backing_dev_info.wb.task); + } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); +} + +static void +__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + bool range_cyclic) +{ + struct wb_writeback_work *work; + + /* + * This is WB_SYNC_NONE writeback, so if allocation fails just + * wakeup the thread for old dirty data writeback + */ + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + if (bdi->wb.task) { + trace_writeback_nowork(bdi); + wake_up_process(bdi->wb.task); + } + return; + } + + work->sync_mode = WB_SYNC_NONE; + work->nr_pages = nr_pages; + work->range_cyclic = range_cyclic; + + bdi_queue_work(bdi, work); +} + +/** + * bdi_start_writeback - start writeback + * @bdi: the backing device to write from + * @nr_pages: the number of pages to write + * + * Description: + * This does WB_SYNC_NONE opportunistic writeback. 
The IO is only + * started when this function returns, we make no guarantees on + * completion. Caller need not hold sb s_umount semaphore. + * + */ +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +{ + __bdi_start_writeback(bdi, nr_pages, true); +} + +/** + * bdi_start_background_writeback - start background writeback + * @bdi: the backing device to write from + * + * Description: + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. + */ +void bdi_start_background_writeback(struct backing_dev_info *bdi) +{ + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + +/* + * Redirty an inode: set its when-it-was dirtied timestamp and move it to the + * furthest end of its superblock's dirty-inode list. + * + * Before stamping the inode's ->dirtied_when, we check to see whether it is + * already the most-recently-dirtied inode on the b_dirty list. If that is + * the case then the inode must have been redirtied while it was being written + * out and we don't reset its dirtied_when. + */ +static void redirty_tail(struct inode *inode) +{ + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + assert_spin_locked(&inode_wb_list_lock); + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; + + tail = wb_inode(wb->b_dirty.next); + if (time_before(inode->dirtied_when, tail->dirtied_when)) + inode->dirtied_when = jiffies; + } + list_move(&inode->i_wb_list, &wb->b_dirty); +} + +/* + * requeue inode for re-scanning after bdi->b_io list is exhausted. + */ +static void requeue_io(struct inode *inode) +{ + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + assert_spin_locked(&inode_wb_list_lock); + list_move(&inode->i_wb_list, &wb->b_more_io); +} + +static void inode_sync_complete(struct inode *inode) +{ + /* + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); + */ + + smp_mb(); + wake_up_bit(&inode->i_state, __I_SYNC); +} + +static bool inode_dirtied_after(struct inode *inode, unsigned long t) +{ + bool ret = time_after(inode->dirtied_when, t); +#ifndef CONFIG_64BIT + /* + * For inodes being constantly redirtied, dirtied_when can get stuck. + * It _appears_ to be in the future, but is actually in distant past. + * This test is necessary to prevent such wrapped-around relative times + * from permanently stopping the whole bdi writeback. + */ + ret = ret && time_before_eq(inode->dirtied_when, jiffies); +#endif + return ret; +} + +/* + * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 
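inode_dirtied_after() above guards against dirtied_when appearing to lie "in the future" once jiffies wraps on 32-bit machines. The kernel's time_after()/time_before_eq() do this with a signed-difference comparison; a stand-alone sketch of that idiom, with the macros mirroring include/linux/jiffies.h:

#include <stdio.h>

/* signed-difference comparisons, as in include/linux/jiffies.h */
#define demo_time_after(a, b)		((long)((b) - (a)) < 0)
#define demo_time_before_eq(a, b)	((long)((a) - (b)) <= 0)

int main(void)
{
	unsigned long jiffies = 100;
	unsigned long dirtied_when = (unsigned long)-50;	/* dirtied just before a wrap */

	/* a plain ">" wrongly claims the inode was dirtied in the future (prints 1) */
	printf("naive comparison:     %d\n", dirtied_when > jiffies);
	/* the wrap-safe comparison correctly reports "not after" (prints 0) */
	printf("wrap-safe comparison: %d\n", demo_time_after(dirtied_when, jiffies));
	return 0;
}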
+ */ +static void move_expired_inodes(struct list_head *delaying_queue, + struct list_head *dispatch_queue, + unsigned long *older_than_this) +{ + LIST_HEAD(tmp); + struct list_head *pos, *node; + struct super_block *sb = NULL; + struct inode *inode; + int do_sb_sort = 0; + + while (!list_empty(delaying_queue)) { + inode = wb_inode(delaying_queue->prev); + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) + break; + if (sb && sb != inode->i_sb) + do_sb_sort = 1; + sb = inode->i_sb; + list_move(&inode->i_wb_list, &tmp); + } + + /* just one sb in list, splice to dispatch_queue and we're done */ + if (!do_sb_sort) { + list_splice(&tmp, dispatch_queue); + return; + } + + /* Move inodes from one superblock together */ + while (!list_empty(&tmp)) { + sb = wb_inode(tmp.prev)->i_sb; + list_for_each_prev_safe(pos, node, &tmp) { + inode = wb_inode(pos); + if (inode->i_sb == sb) + list_move(&inode->i_wb_list, dispatch_queue); + } + } +} + +/* + * Queue all expired dirty inodes for io, eldest first. + * Before + * newly dirtied b_dirty b_io b_more_io + * =============> gf edc BA + * After + * newly dirtied b_dirty b_io b_more_io + * =============> g fBAedc + * | + * +--> dequeue for IO + */ +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +{ + assert_spin_locked(&inode_wb_list_lock); + list_splice_init(&wb->b_more_io, &wb->b_io); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); +} + +static int write_inode(struct inode *inode, struct writeback_control *wbc) +{ + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, wbc); + return 0; +} + +/* + * Wait for writeback on an inode to complete. + */ +static void inode_wait_for_writeback(struct inode *inode) +{ + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + } +} + +/* + * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * inode->i_lock. Either the caller has an active reference on the inode or + * the inode has I_WILL_FREE set. + * + * If `wait' is set, wait on the writeout. + * + * The whole writeout design is quite complex and fragile. We want to avoid + * starvation of particular inodes when others are being redirtied, prevent + * livelocks, etc. + */ +static int +writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct address_space *mapping = inode->i_mapping; + unsigned dirty; + int ret; + + assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&inode->i_lock); + + if (!atomic_read(&inode->i_count)) + WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + else + WARN_ON(inode->i_state & I_WILL_FREE); + + if (inode->i_state & I_SYNC) { + /* + * If this inode is locked for writeback and we are not doing + * writeback-for-data-integrity, move it to b_more_io so that + * writeback can proceed with the other inodes on s_io. + * + * We'll have another go at writing back this inode when we + * completed a full scan of b_io. + */ + if (wbc->sync_mode != WB_SYNC_ALL) { + requeue_io(inode); + return 0; + } + + /* + * It's a data-integrity sync. We must wait. 
+ */ + inode_wait_for_writeback(inode); + } + + BUG_ON(inode->i_state & I_SYNC); + + /* Set I_SYNC, reset I_DIRTY_PAGES */ + inode->i_state |= I_SYNC; + inode->i_state &= ~I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + + ret = do_writepages(mapping, wbc); + + /* + * Make sure to wait on the data before writing out the metadata. + * This is important for filesystems that modify metadata on data + * I/O completion. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + int err = filemap_fdatawait(mapping); + if (ret == 0) + ret = err; + } + + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode->i_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode->i_lock); + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + int err = write_inode(inode, wbc); + if (ret == 0) + ret = err; + } + + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + inode->i_state &= ~I_SYNC; + if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + /* + * We didn't write back all the pages. nfs_writepages() + * sometimes bales out without doing anything. + */ + inode->i_state |= I_DIRTY_PAGES; + if (wbc->nr_to_write <= 0) { + /* + * slice used up: queue for next turn + */ + requeue_io(inode); + } else { + /* + * Writeback blocked by something other than + * congestion. Delay the inode for some time to + * avoid spinning on the CPU (100% iowait) + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. + */ + redirty_tail(inode); + } + } else if (inode->i_state & I_DIRTY) { + /* + * Filesystems can dirty the inode during writeback + * operations, such as delayed allocation during + * submission or metadata updates after data IO + * completion. + */ + redirty_tail(inode); + } else { + /* + * The inode is clean. At this point we either have + * a reference to the inode or it's on it's way out. + * No need to add it back to the LRU. + */ + list_del_init(&inode->i_wb_list); + } + } + inode_sync_complete(inode); + return ret; +} + +/* + * For background writeback the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. + */ +static bool pin_sb_for_writeback(struct super_block *sb) +{ + spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + + sb->s_count++; + spin_unlock(&sb_lock); + + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) + return true; + up_read(&sb->s_umount); + } + + put_super(sb); + return false; +} + +/* + * Write a portion of b_io inodes which belong to @sb. + * + * If @only_this_sb is true, then find and write all such + * inodes. Otherwise write only ones which go sequentially + * in reverse order. + * + * Return 1, if the caller writeback routine should be + * interrupted. Otherwise return 0. 
+ */ +static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, + struct writeback_control *wbc, bool only_this_sb) +{ + while (!list_empty(&wb->b_io)) { + long pages_skipped; + struct inode *inode = wb_inode(wb->b_io.prev); + + if (inode->i_sb != sb) { + if (only_this_sb) { + /* + * We only want to write back data for this + * superblock, move all inodes not belonging + * to it back onto the dirty list. + */ + redirty_tail(inode); + continue; + } + + /* + * The inode belongs to a different superblock. + * Bounce back to the caller to unpin this and + * pin the next superblock. + */ + return 0; + } + + /* + * Don't bother with new inodes or inodes beeing freed, first + * kind does not need peridic writeout yet, and for the latter + * kind writeout is handled by the freer. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + requeue_io(inode); + continue; + } + + /* + * Was this inode dirtied after sync_sb_inodes was called? + * This keeps sync from extra jobs and livelock. + */ + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); + return 1; + } + + __iget(inode); + + pages_skipped = wbc->pages_skipped; + writeback_single_inode(inode, wbc); + if (wbc->pages_skipped != pages_skipped) { + /* + * writeback is not making progress due to locked + * buffers. Skip this inode for now. + */ + redirty_tail(inode); + } + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + iput(inode); + cond_resched(); + spin_lock(&inode_wb_list_lock); + if (wbc->nr_to_write <= 0) { + wbc->more_io = 1; + return 1; + } + if (!list_empty(&wb->b_more_io)) + wbc->more_io = 1; + } + /* b_io is empty */ + return 1; +} + +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; + + if (!wbc->wb_start) + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_wb_list_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = wb_inode(wb->b_io.prev); + struct super_block *sb = inode->i_sb; + + if (!pin_sb_for_writeback(sb)) { + requeue_io(inode); + continue; + } + ret = writeback_sb_inodes(sb, wb, wbc, false); + drop_super(sb); + + if (ret) + break; + } + spin_unlock(&inode_wb_list_lock); + /* Leave any unwritten inodes on b_io */ +} + +static void __writeback_inodes_sb(struct super_block *sb, + struct bdi_writeback *wb, struct writeback_control *wbc) +{ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_wb_list_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + writeback_sb_inodes(sb, wb, wbc, true); + spin_unlock(&inode_wb_list_lock); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + global_dirty_limits(&background_thresh, &dirty_thresh); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. 
+ * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .older_than_this = NULL, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + }; + unsigned long oldest_jif; + long wrote = 0; + long write_chunk = MAX_WRITEBACK_PAGES; + struct inode *inode; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + if (!wbc.range_cyclic) { + wbc.range_start = 0; + wbc.range_end = LLONG_MAX; + } + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) + write_chunk = LONG_MAX; + + wbc.wb_start = jiffies; /* livelock avoidance */ + for (;;) { + /* + * Stop writeback when nr_pages has been consumed + */ + if (work->nr_pages <= 0) + break; + + /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* + * For background writeout, stop when we are below the + * background dirty threshold + */ + if (work->for_background && !over_bground_thresh()) + break; + + wbc.more_io = 0; + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); + if (work->sb) + __writeback_inodes_sb(work->sb, wb, &wbc); + else + writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + + /* + * If we consumed everything, see if we have more + */ + if (wbc.nr_to_write <= 0) + continue; + /* + * Didn't write everything and we don't have more IO, bail + */ + if (!wbc.more_io) + break; + /* + * Did we write something? Try for more + */ + if (wbc.nr_to_write < write_chunk) + continue; + /* + * Nothing written. Wait for some inode to + * become available for writeback. Otherwise + * we'll just busyloop. 
+ */ + spin_lock(&inode_wb_list_lock); + if (!list_empty(&wb->b_more_io)) { + inode = wb_inode(wb->b_more_io.prev); + trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); + inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); + } + spin_unlock(&inode_wb_list_lock); + } + + return wrote; +} + +/* + * Return the next wb_writeback_work struct that hasn't been processed yet. + */ +static struct wb_writeback_work * +get_next_work_item(struct backing_dev_info *bdi) +{ + struct wb_writeback_work *work = NULL; + + spin_lock_bh(&bdi->wb_lock); + if (!list_empty(&bdi->work_list)) { + work = list_entry(bdi->work_list.next, + struct wb_writeback_work, list); + list_del_init(&work->list); + } + spin_unlock_bh(&bdi->wb_lock); + return work; +} + +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = get_nr_dirty_pages(); + + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct wb_writeback_work *work; + long wrote = 0; + + set_bit(BDI_writeback_running, &wb->bdi->state); + while ((work = get_next_work_item(bdi)) != NULL) { + /* + * Override sync mode, in case we must wait for completion + * because this thread is exiting now. + */ + if (force_wait) + work->sync_mode = WB_SYNC_ALL; + + trace_writeback_exec(bdi, work); + + wrote += wb_writeback(wb, work); + + /* + * Notify the caller of completion if this is a synchronous + * work item, otherwise just free it. + */ + if (work->done) + complete(work->done); + else + kfree(work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); + clear_bit(BDI_writeback_running, &wb->bdi->state); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. 
+ */
+int bdi_writeback_thread(void *data)
+{
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
+	long pages_written;
+
+	current->flags |= PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
+	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the periodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
+		pages_written = wb_do_writeback(wb, 0);
+
+		trace_writeback_pages_written(pages_written);
+
+		if (pages_written)
+			wb->last_active = jiffies;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
+			/*
+			 * We have nothing to do, so we can sleep without any
+			 * timeout and save power.  When a work item is queued
+			 * or something is made dirty, we will be woken up.
+			 */
+			schedule();
+		}
+
+		try_to_freeze();
+	}
+
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
+	return 0;
+}
+
+
+/*
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+	struct backing_dev_info *bdi;
+
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (!bdi_has_dirty_io(bdi))
+			continue;
+		__bdi_start_writeback(bdi, nr_pages, false);
+	}
+	rcu_read_unlock();
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
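+ *
+ * As an illustration of how this function is typically reached:
+ * mark_inode_dirty_sync(inode) expands to __mark_inode_dirty(inode,
+ * I_DIRTY_SYNC), mark_inode_dirty(inode) to __mark_inode_dirty(inode,
+ * I_DIRTY), and the page-dirtying paths generally end up calling
+ * __mark_inode_dirty(mapping->host, I_DIRTY_PAGES).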
+ */ +void __mark_inode_dirty(struct inode *inode, int flags) +{ + struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode, flags); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump > 1)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode->i_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out_unlock_inode; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (inode_unhashed(inode)) + goto out_unlock_inode; + } + if (inode->i_state & I_FREEING) + goto out_unlock_inode; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + bool wakeup_bdi = false; + bdi = inode_to_bdi(inode); + if (!bdi) + goto out_unlock_inode; + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. + */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; + } + + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); + inode->dirtied_when = jiffies; + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; + } + } +out_unlock_inode: + spin_unlock(&inode->i_lock); + +} +EXPORT_SYMBOL(__mark_inode_dirty); + +/* + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. + * + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. + * + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. + */ +static void wait_sb_inodes(struct super_block *sb) +{ + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_sb_list_lock); + + /* + * Data integrity sync. 
Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		struct address_space *mapping = inode->i_mapping;
+
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from the s_inodes list while we dropped the
+		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock.  So we keep the reference and iput it
+		 * later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+}
+
+/**
+ * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * writeback_inodes_sb_if_idle - start writeback if none underway
+ * @sb: the superblock
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_if_idle(struct super_block *sb)
+{
+	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
+		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+
+/**
+ * writeback_inodes_sb_nr_if_idle - start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
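+ *
+ * As a rough usage sketch (the names free_blocks, watermark and nr_to_flush
+ * are illustrative only, not symbols defined in this file), a filesystem
+ * running short of space for delayed allocation might do:
+ *
+ *	if (free_blocks < watermark)
+ *		writeback_inodes_sb_nr_if_idle(sb, nr_to_flush);
+ *
+ * so that background writeback is only kicked when no flush is already in
+ * progress.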
+ */ +int writeback_inodes_sb_nr_if_idle(struct super_block *sb, + unsigned long nr) +{ + if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); + writeback_inodes_sb_nr(sb, nr); + up_read(&sb->s_umount); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. + */ +void sync_inodes_sb(struct super_block *sb) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + .done = &done, + }; + + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); + + wait_sb_inodes(sb); +} +EXPORT_SYMBOL(sync_inodes_sb); + +/** + * write_inode_now - write an inode to disk + * @inode: inode to write to disk + * @sync: whether the write should be synchronous or not + * + * This function commits an inode to disk immediately if it is dirty. This is + * primarily needed by knfsd. + * + * The caller must either have a ref on the inode or must have set I_WILL_FREE. + */ +int write_inode_now(struct inode *inode, int sync) +{ + int ret; + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + ret = writeback_single_inode(inode, &wbc); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + if (sync) + inode_sync_wait(inode); + return ret; +} +EXPORT_SYMBOL(write_inode_now); + +/** + * sync_inode - write an inode and its pages to disk. + * @inode: the inode to sync + * @wbc: controls the writeback mode + * + * sync_inode() will write an inode and its pages to disk. It will also + * correctly update the inode on its superblock's dirty inode lists and will + * update inode->i_state. + * + * The caller must have a ref on the inode. + */ +int sync_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + ret = writeback_single_inode(inode, wbc); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + return ret; +} +EXPORT_SYMBOL(sync_inode); + +/** + * sync_inode_metadata - write an inode to disk + * @inode: the inode to sync + * @wait: wait for I/O to complete. + * + * Write an inode to disk and adjust its dirty state after completion. + * + * Note: only writes the actual inode, no associated data or other metadata. + */ +int sync_inode_metadata(struct inode *inode, int wait) +{ + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .nr_to_write = 0, /* metadata-only */ + }; + + return sync_inode(inode, &wbc); +} +EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 00000000..78b519c1 --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static inline void path_get_longterm(struct path *path) +{ + path_get(path); + mnt_make_longterm(path->mnt); +} + +static inline void path_put_longterm(struct path *path) +{ + mnt_make_shortterm(path->mnt); + path_put(path); +} + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. 
Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_root = fs->root; + fs->root = *path; + path_get_longterm(path); + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + if (old_root.dentry) + path_put_longterm(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get_longterm(path); + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put_longterm(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get_longterm(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get_longterm(new_root); + fs->pwd = *new_root; + count++; + } + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put_longterm(old_root); +} + +void free_fs_struct(struct fs_struct *fs) +{ + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); + kmem_cache_free(fs_cachep, fs); +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct *fs = tsk->fs; + + if (fs) { + int kill; + task_lock(tsk); + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + tsk->fs = NULL; + kill = !--fs->users; + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + task_unlock(tsk); + if (kill) + free_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + fs->users = 1; + fs->in_exec = 0; + spin_lock_init(&fs->lock); + seqcount_init(&fs->seq); + fs->umask = old->umask; + + spin_lock(&old->lock); + fs->root = old->root; + path_get_longterm(&fs->root); + fs->pwd = old->pwd; + path_get_longterm(&fs->pwd); + spin_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) + return -ENOMEM; + + task_lock(current); + spin_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + spin_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .users = 1, + .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), + .seq = SEQCNT_ZERO, + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); + + spin_lock(&init_fs.lock); + init_fs.users++; + spin_unlock(&init_fs.lock); + + spin_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + spin_unlock(&fs->lock); + + 
task_unlock(current);
+		if (kill)
+			free_fs_struct(fs);
+	}
+}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 00000000..3f6dfa98
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,61 @@
+
+config FSCACHE
+	tristate "General filesystem local caching manager"
+	help
+	  This option enables a generic filesystem caching manager that can be
+	  used by various network and other filesystems to cache data locally.
+	  Different sorts of caches can be plugged in, depending on the
+	  resources available.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_STATS
+	bool "Gather statistical information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes statistical information to be gathered on local
+	  caching and exported through the file:
+
+		/proc/fs/fscache/stats
+
+	  The gathering of statistics adds a certain amount of overhead to
+	  execution as there are quite a few stats gathered, and on a
+	  multi-CPU system these may be on cachelines that keep bouncing
+	  between CPUs.  On the other hand, the stats are very useful for
+	  debugging purposes.  Saying 'Y' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_HISTOGRAM
+	bool "Gather latency information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes latency information to be gathered on local
+	  caching and exported through the file:
+
+		/proc/fs/fscache/histogram
+
+	  The generation of this histogram adds a certain amount of overhead to
+	  execution as there are a number of points at which data is gathered,
+	  and on a multi-CPU system these may be on cachelines that keep
+	  bouncing between CPUs.  On the other hand, the histogram may be
+	  useful for debugging purposes.  Saying 'N' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_DEBUG
+	bool "Debug FS-Cache"
+	depends on FSCACHE
+	help
+	  This permits debugging to be dynamically enabled in the local caching
+	  management module.  If this is set, the debugging output may be
+	  enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_OBJECT_LIST
+	bool "Maintain global object list for debugging purposes"
+	depends on FSCACHE && PROC_FS
+	help
+	  Maintain a global list of active fscache objects that can be
+	  retrieved through /proc/fs/fscache/objects for debugging purposes.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 00000000..6d561531
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for general filesystem caching code
+#
+
+fscache-y := \
+	cache.o \
+	cookie.o \
+	fsdef.o \
+	main.o \
+	netfs.o \
+	object.o \
+	operation.o \
+	page.o
+
+fscache-$(CONFIG_PROC_FS) += proc.o
+fscache-$(CONFIG_FSCACHE_STATS) += stats.o
+fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
+
+obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 00000000..6a3c48ab
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,420 @@
+/* FS-Cache cache handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_cache_cleared_wq); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (object->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if 
(!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} + +/** + * fscache_init_cache - Initialise a cache record + * @cache: The cache record to be initialised + * @ops: The cache operations to be installed in that record + * @idfmt: Format string to define identifier + * @...: sprintf-style arguments + * + * Initialise a record of a cache and fill in the name. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) +{ + va_list va; + + memset(cache, 0, sizeof(*cache)); + + cache->ops = ops; + + va_start(va, idfmt); + vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); + va_end(va); + + INIT_WORK(&cache->op_gc, fscache_operation_gc); + INIT_LIST_HEAD(&cache->link); + INIT_LIST_HEAD(&cache->object_list); + INIT_LIST_HEAD(&cache->op_gc_list); + spin_lock_init(&cache->object_list_lock); + spin_lock_init(&cache->op_gc_list_lock); +} +EXPORT_SYMBOL(fscache_init_cache); + +/** + * fscache_add_cache - Declare a cache as being open for business + * @cache: The record describing the cache + * @ifsdef: The record of the cache object describing the top-level index + * @tagname: The tag describing this cache + * + * Add a cache to the system, making it available for netfs's to use. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. 
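+ *
+ * As a rough sketch of backend usage (the names my_cache, my_cache_ops and
+ * my_root_object are illustrative only), bringing a cache online might look
+ * like:
+ *
+ *	fscache_init_cache(&my_cache, &my_cache_ops, "mycache");
+ *	ret = fscache_add_cache(&my_cache, my_root_object, "mytag");
+ *
+ * after which netfs cookies may be backed by objects in this cache.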
+ */ +int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *ifsdef, + const char *tagname) +{ + struct fscache_cache_tag *tag; + + BUG_ON(!cache->ops); + BUG_ON(!ifsdef); + + cache->flags = 0; + ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + ifsdef->state = FSCACHE_OBJECT_ACTIVE; + + if (!tagname) + tagname = cache->identifier; + + BUG_ON(!tagname[0]); + + _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); + + /* we use the cache tag to uniquely identify caches */ + tag = __fscache_lookup_cache_tag(tagname); + if (IS_ERR(tag)) + goto nomem; + + if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) + goto tag_in_use; + + cache->kobj = kobject_create_and_add(tagname, fscache_root); + if (!cache->kobj) + goto error; + + ifsdef->cookie = &fscache_fsdef_index; + ifsdef->cache = cache; + cache->fsdef = ifsdef; + + down_write(&fscache_addremove_sem); + + tag->cache = cache; + cache->tag = tag; + + /* add the cache to the list */ + list_add(&cache->link, &fscache_cache_list); + + /* add the cache's netfs definition index object to the cache's + * list */ + spin_lock(&cache->object_list_lock); + list_add_tail(&ifsdef->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + fscache_objlist_add(ifsdef); + + /* add the cache's netfs definition index object to the top level index + * cookie as a known backing object */ + spin_lock(&fscache_fsdef_index.lock); + + hlist_add_head(&ifsdef->cookie_link, + &fscache_fsdef_index.backing_objects); + + atomic_inc(&fscache_fsdef_index.usage); + + /* done */ + spin_unlock(&fscache_fsdef_index.lock); + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); + kobject_uevent(cache->kobj, KOBJ_ADD); + + _leave(" = 0 [%s]", cache->identifier); + return 0; + +tag_in_use: + printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + __fscache_release_cache_tag(tag); + _leave(" = -EXIST"); + return -EEXIST; + +error: + __fscache_release_cache_tag(tag); + _leave(" = -EINVAL"); + return -EINVAL; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(fscache_add_cache); + +/** + * fscache_io_error - Note a cache I/O error + * @cache: The record describing the cache + * + * Note that an I/O error occurred in a cache and that it should no longer be + * used for anything. This also reports the error into the kernel log. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. 
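+ *
+ * A backend would typically call this from its I/O paths when the backing
+ * filesystem returns an unrecoverable error, for instance (illustratively,
+ * my_cache being the backend's cache record):
+ *
+ *	if (ret == -EIO)
+ *		fscache_io_error(&my_cache);
+ *
+ * after which the cache is normally withdrawn from service.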
+ */ +void fscache_io_error(struct fscache_cache *cache) +{ + set_bit(FSCACHE_IOERROR, &cache->flags); + + printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", + cache->ops->name); +} +EXPORT_SYMBOL(fscache_io_error); + +/* + * request withdrawal of all the objects in a cache + * - all the objects being withdrawn are moved onto the supplied list + */ +static void fscache_withdraw_all_objects(struct fscache_cache *cache, + struct list_head *dying_objects) +{ + struct fscache_object *object; + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); + + _debug("withdraw %p", object->cookie); + + spin_lock(&object->lock); + spin_unlock(&cache->object_list_lock); + fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); + spin_unlock(&object->lock); + + cond_resched(); + spin_lock(&cache->object_list_lock); + } + + spin_unlock(&cache->object_list_lock); +} + +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The record describing the cache + * + * Withdraw a cache from service, unbinding all its cache objects from the + * netfs cookies they're currently representing. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_withdraw_cache(struct fscache_cache *cache) +{ + LIST_HEAD(dying_objects); + + _enter(""); + + printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", + cache->tag->name); + + /* make the cache unavailable for cookie acquisition */ + if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) + BUG(); + + down_write(&fscache_addremove_sem); + list_del_init(&cache->link); + cache->tag->cache = NULL; + up_write(&fscache_addremove_sem); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disk */ + fscache_stat(&fscache_n_cop_sync_cache); + cache->ops->sync_cache(cache); + fscache_stat_d(&fscache_n_cop_sync_cache); + + /* dissociate all the netfs pages backed by this cache from the block + * mappings in the cache */ + fscache_stat(&fscache_n_cop_dissociate_pages); + cache->ops->dissociate_pages(cache); + fscache_stat_d(&fscache_n_cop_dissociate_pages); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + _debug("destroy"); + + fscache_withdraw_all_objects(cache, &dying_objects); + + /* wait for all extant objects to finish their outstanding operations + * and go away */ + _debug("wait for finish"); + wait_event(fscache_cache_cleared_wq, + atomic_read(&cache->object_count) == 0); + _debug("wait for clearance"); + wait_event(fscache_cache_cleared_wq, + list_empty(&cache->object_list)); + _debug("cleared"); + ASSERT(list_empty(&dying_objects)); + + kobject_put(cache->kobj); + + clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); + fscache_release_cache_tag(cache->tag); + cache->tag = NULL; + + _leave(""); +} +EXPORT_SYMBOL(fscache_withdraw_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c new file mode 100644 index 00000000..99053507 --- /dev/null +++ b/fs/fscache/cookie.c @@ -0,0 +1,514 @@ +/* netfs cookie management + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for more information on
+ * the netfs API.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include
+#include
+#include "internal.h"
+
+struct kmem_cache *fscache_cookie_jar;
+
+static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+static int fscache_alloc_object(struct fscache_cache *cache,
+				struct fscache_cookie *cookie);
+static int fscache_attach_object(struct fscache_cookie *cookie,
+				 struct fscache_object *object);
+
+/*
+ * initialise a cookie jar slab element prior to any use
+ */
+void fscache_cookie_init_once(void *_cookie)
+{
+	struct fscache_cookie *cookie = _cookie;
+
+	memset(cookie, 0, sizeof(*cookie));
+	spin_lock_init(&cookie->lock);
+	spin_lock_init(&cookie->stores_lock);
+	INIT_HLIST_HEAD(&cookie->backing_objects);
+}
+
+/*
+ * request a cookie to represent an object (index, datafile, xattr, etc.)
+ * - parent specifies the parent object
+ *   - the top level index cookie for each netfs is stored in the fscache_netfs
+ *     struct upon registration
+ * - def points to the definition
+ * - the netfs_data will be passed to the functions pointed to in *def
+ * - all attached caches will be searched to see if they contain this object
+ * - index objects aren't stored on disk until there's a dependent file that
+ *   needs storing
+ * - other objects are stored in a selected cache immediately, and all the
+ *   indices forming the path to it are instantiated if necessary
+ * - we never let on to the netfs about errors
+ *   - we may set a negative cookie pointer, but that's okay
+ */
+struct fscache_cookie *__fscache_acquire_cookie(
+	struct fscache_cookie *parent,
+	const struct fscache_cookie_def *def,
+	void *netfs_data)
+{
+	struct fscache_cookie *cookie;
+
+	BUG_ON(!def);
+
+	_enter("{%s},{%s},%p",
+	       parent ?
(char *) parent->def->name : "", + def->name, netfs_data); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = 0; + + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + /* if the object is an index then we need do nothing more here - we + * create indices on disk when we need them as an index may exist in + * multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) < 0) { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + cookie->flags = + (1 << FSCACHE_COOKIE_LOOKING_UP) | + (1 << FSCACHE_COOKIE_CREATING) | + (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = 
hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_enqueue_object(object); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_n; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, _n, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + fscache_stat(&fscache_n_cop_alloc_object); + object = cache->ops->alloc_object(cache, cookie); + fscache_stat_d(&fscache_n_cop_alloc_object); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) { + fscache_stat(&fscache_n_cop_put_object); + cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); + } + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (object->state >= FSCACHE_OBJECT_DYING) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + fscache_stat(&fscache_n_cop_put_object); + cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + struct hlist_node *_n; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + 
spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + + fscache_objlist_add(object); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_p; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + /* update the index entry on disk in each cache backing this cookie */ + hlist_for_each_entry(object, _p, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ + struct fscache_cache *cache; + struct fscache_object *object; + unsigned long event; + + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p},%d", + cookie, cookie->def->name, cookie->netfs_data, retire); + + if (atomic_read(&cookie->n_children) != 0) { + printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + /* wait for the cookie to finish being instantiated (or to fail) */ + if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { + fscache_stat(&fscache_n_relinquishes_waitcrt); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + + event = retire ? 
FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + + spin_lock(&cookie->lock); + + /* break links with all the active objects */ + while (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + + _debug("RELEASE OBJ%x", object->debug_id); + + /* detach each cache object from the object cookie */ + spin_lock(&object->lock); + hlist_del_init(&object->cookie_link); + + cache = object->cache; + object->cookie = NULL; + fscache_raise_event(object, event); + spin_unlock(&object->lock); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + } + + /* detach pointers back to the netfs */ + cookie->netfs_data = NULL; + cookie->def = NULL; + + spin_unlock(&cookie->lock); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* finally dispose of the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + +/* + * destroy a cookie + */ +void __fscache_cookie_put(struct fscache_cookie *cookie) +{ + struct fscache_cookie *parent; + + _enter("%p", cookie); + + for (;;) { + _debug("FREE COOKIE %p", cookie); + parent = cookie->parent; + BUG_ON(!hlist_empty(&cookie->backing_objects)); + kmem_cache_free(fscache_cookie_jar, cookie); + + if (!parent) + break; + + cookie = parent; + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (!atomic_dec_and_test(&cookie->usage)) + break; + } + + _leave(""); +} diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c new file mode 100644 index 00000000..f5b4baee --- /dev/null +++ b/fs/fscache/fsdef.c @@ -0,0 +1,144 @@ +/* Filesystem index definition + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include "internal.h" + +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static +enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + +/* + * The root index is owned by FS-Cache itself. + * + * When a netfs requests caching facilities, FS-Cache will, if one doesn't + * already exist, create an entry in the root index with the key being the name + * of the netfs ("AFS" for example), and the auxiliary data holding the index + * structure version supplied by the netfs: + * + * FSDEF + * | + * +-----------+ + * | | + * NFS AFS + * [v=1] [v=1] + * + * If an entry with the appropriate name does already exist, the version is + * compared. If the version is different, the entire subtree from that entry + * will be discarded and a new entry created. + * + * The new entry will be an index, and a cookie referring to it will be passed + * to the netfs. This is then the root handle by which the netfs accesses the + * cache. It can create whatever objects it likes in that index, including + * further indices. 
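+ *
+ * A netfs would typically obtain its entry under FSDEF by declaring
+ * something like (illustrative only):
+ *
+ *	static struct fscache_netfs my_netfs = {
+ *		.name		= "myfs",
+ *		.version	= 1,
+ *	};
+ *
+ * and calling fscache_register_netfs(&my_netfs) at initialisation time,
+ * which creates (or reuses) the matching index entry described above.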
+ */ +static struct fscache_cookie_def fscache_fsdef_index_def = { + .name = ".FS-Cache", + .type = FSCACHE_COOKIE_TYPE_INDEX, +}; + +struct fscache_cookie fscache_fsdef_index = { + .usage = ATOMIC_INIT(1), + .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), + .backing_objects = HLIST_HEAD_INIT, + .def = &fscache_fsdef_index_def, +}; +EXPORT_SYMBOL(fscache_fsdef_index); + +/* + * Definition of an entry in the root index. Each entry is an index, keyed to + * a specific netfs and only applicable to a particular version of the index + * structure used by that netfs. + */ +struct fscache_cookie_def fscache_fsdef_netfs_def = { + .name = "FSDEF.netfs", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = fscache_fsdef_netfs_get_key, + .get_aux = fscache_fsdef_netfs_get_aux, + .check_aux = fscache_fsdef_netfs_check_aux, +}; + +/* + * get the key data for an FSDEF index record - this is the name of the netfs + * for which this entry is created + */ +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned klen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + klen = strlen(netfs->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, netfs->name, klen); + return klen; +} + +/* + * get the auxiliary data for an FSDEF index record - this is the index + * structure version number of the netfs for which this version is created + */ +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned dlen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + dlen = sizeof(uint32_t); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &netfs->version, dlen); + return dlen; +} + +/* + * check that the index structure version number stored in the auxiliary data + * matches the one the netfs gave us + */ +static enum fscache_checkaux fscache_fsdef_netfs_check_aux( + void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct fscache_netfs *netfs = cookie_netfs_data; + uint32_t version; + + _enter("{%s},,%hu", netfs->name, datalen); + + if (datalen != sizeof(version)) { + _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + memcpy(&version, data, sizeof(version)); + if (version != netfs->version) { + _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c new file mode 100644 index 00000000..bad49674 --- /dev/null +++ b/fs/fscache/histogram.c @@ -0,0 +1,109 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " + " RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========" + " ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h new file mode 100644 index 00000000..f6aad48d --- /dev/null +++ b/fs/fscache/internal.h @@ -0,0 +1,438 @@ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * Lock order, in the order in which multiple locks should be obtained: + * - fscache_addremove_sem + * - cookie->lock + * - cookie->parent->lock + * - cache->object_list_lock + * - object->lock + * - object->parent->lock + * - cookie->stores_lock + * - fscache_thread_lock + * + */ + +#include +#include + +#define FSCACHE_MIN_THREADS 4 +#define FSCACHE_MAX_THREADS 32 + +/* + * cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + +/* + * cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; + +extern void fscache_cookie_init_once(void *); +extern void __fscache_cookie_put(struct fscache_cookie *); + +/* + * fsdef.c + */ +extern struct fscache_cookie fscache_fsdef_index; +extern struct fscache_cookie_def fscache_fsdef_netfs_def; + +/* + * histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + +/* + * main.c + */ +extern unsigned fscache_defer_lookup; +extern unsigned fscache_defer_create; +extern unsigned fscache_debug; +extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} + +extern int fscache_wait_bit(void *); +extern int fscache_wait_bit_interruptible(void *); + +/* + * object.c + */ +extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; + +extern void fscache_withdrawing_object(struct fscache_cache *, + struct fscache_object *); +extern void fscache_enqueue_object(struct fscache_object *); + +/* + * object-list.c + */ +#ifdef CONFIG_FSCACHE_OBJECT_LIST +extern const struct file_operations fscache_objlist_fops; + +extern void fscache_objlist_add(struct fscache_object *); +#else +#define fscache_objlist_add(object) do {} while(0) +#endif + +/* + * operation.c + */ +extern int fscache_submit_exclusive_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_submit_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *); +extern void fscache_abort_object(struct fscache_object *); +extern void fscache_start_operations(struct fscache_object *); +extern void fscache_operation_gc(struct work_struct *); + +/* + * proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t 
fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; +extern atomic_t fscache_n_op_cancelled; +extern atomic_t fscache_n_op_rejected; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_allocs_intr; +extern atomic_t fscache_n_allocs_object_dead; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrievals_object_dead; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t fscache_n_store_calls; +extern atomic_t fscache_n_store_pages; +extern atomic_t fscache_n_store_radix_deletes; +extern atomic_t fscache_n_store_pages_over_limit; + +extern atomic_t fscache_n_store_vmscan_not_storing; +extern atomic_t fscache_n_store_vmscan_gone; +extern atomic_t fscache_n_store_vmscan_busy; +extern atomic_t fscache_n_store_vmscan_cancelled; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; +extern atomic_t fscache_n_relinquishes_retire; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_lookups_timed_out; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +extern atomic_t fscache_n_cop_alloc_object; +extern atomic_t fscache_n_cop_lookup_object; +extern atomic_t fscache_n_cop_lookup_complete; +extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_update_object; +extern atomic_t fscache_n_cop_drop_object; +extern atomic_t fscache_n_cop_put_object; +extern atomic_t fscache_n_cop_sync_cache; +extern 
atomic_t fscache_n_cop_attr_changed; +extern atomic_t fscache_n_cop_read_or_alloc_page; +extern atomic_t fscache_n_cop_read_or_alloc_pages; +extern atomic_t fscache_n_cop_allocate_page; +extern atomic_t fscache_n_cop_allocate_pages; +extern atomic_t fscache_n_cop_write_page; +extern atomic_t fscache_n_cop_uncache_page; +extern atomic_t fscache_n_cop_dissociate_pages; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +extern const struct file_operations fscache_stats_fops; +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) +#endif + +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + if (!test_and_set_bit(event, &object->events) && + test_bit(event, &object->event_mask)) + fscache_enqueue_object(object); +} + +/* + * drop a reference to a cookie + */ +static inline void fscache_cookie_put(struct fscache_cookie *cookie) +{ + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (atomic_dec_and_test(&cookie->usage)) + __fscache_cookie_put(cookie); +} + +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + +#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) + +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_FSCACHE_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (__do_kdebug(ENTER)) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (__do_kdebug(LEAVE)) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (__do_kdebug(DEBUG)) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) 
no_printk(FMT, ##__VA_ARGS__) +#endif + +/* + * determine whether a particular optional debugging point should be logged + * - we need to go through three steps to persuade cpp to correctly join the + * shorthand in FSCACHE_DEBUG_LEVEL with its prefix + */ +#define ____do_kdebug(LEVEL, POINT) \ + unlikely((fscache_debug & \ + (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) +#define ___do_kdebug(LEVEL, POINT) \ + ____do_kdebug(LEVEL, POINT) +#define __do_kdebug(POINT) \ + ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) + +#define FSCACHE_DEBUG_CACHE 0 +#define FSCACHE_DEBUG_COOKIE 1 +#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OPERATION 3 + +#define FSCACHE_POINT_ENTER 1 +#define FSCACHE_POINT_LEAVE 2 +#define FSCACHE_POINT_DEBUG 4 + +#ifndef FSCACHE_DEBUG_LEVEL +#define FSCACHE_DEBUG_LEVEL CACHE +#endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c new file mode 100644 index 00000000..f9d85677 --- /dev/null +++ b/fs/fscache/main.c @@ -0,0 +1,218 @@ +/* General filesystem local caching manager + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
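/*
 * A worked example of the debug-point selection above: each source file
 * defines FSCACHE_DEBUG_LEVEL (CACHE, COOKIE, PAGE or OPERATION) and the
 * fscache_debug mask reserves three bits per level (ENTER=1, LEAVE=2,
 * DEBUG=4).  With FSCACHE_DEBUG_LEVEL COOKIE, _enter() tests the bit
 * FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3) = 1 << 3 = 0x08, so on a
 * CONFIG_FSCACHE_DEBUG kernel writing 8 to the "debug" module parameter set
 * up in main.c below (/sys/module/fscache/parameters/debug) enables entry
 * tracing in the cookie-level files, and 0x38 (56) enables all three point
 * types at that level.
 *
 * The six bare includes that open main.c are likely linux/module.h,
 * linux/init.h, linux/sched.h, linux/completion.h, linux/slab.h and
 * linux/seq_file.h, as in the upstream tree.
 */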
+ */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("FS Cache Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned fscache_defer_lookup = 1; +module_param_named(defer_lookup, fscache_defer_lookup, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_lookup, + "Defer cookie lookup to background thread"); + +unsigned fscache_defer_create = 1; +module_param_named(defer_create, fscache_defer_create, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_create, + "Defer cookie creation to background thread"); + +unsigned fscache_debug; +module_param_named(debug, fscache_debug, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_debug, + "FS-Cache debugging mask"); + +struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = "object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif + +/* + * initialise the fs caching module + */ +static int __init fscache_init(void) +{ + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; + int ret; + + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", + sizeof(struct fscache_cookie), + 0, + 0, + fscache_cookie_init_once); + if (!fscache_cookie_jar) { + printk(KERN_NOTICE + "FS-Cache: Failed to allocate a cookie jar\n"); + ret = -ENOMEM; + goto error_cookie_jar; + } + + fscache_root = 
kobject_create_and_add("fscache", kernel_kobj); + if (!fscache_root) + goto error_kobj; + + printk(KERN_NOTICE "FS-Cache: Loaded\n"); + return 0; + +error_kobj: + kmem_cache_destroy(fscache_cookie_jar); +error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif + fscache_proc_cleanup(); +error_proc: + destroy_workqueue(fscache_op_wq); +error_op_wq: + destroy_workqueue(fscache_object_wq); +error_object_wq: + return ret; +} + +fs_initcall(fscache_init); + +/* + * clean up on module removal + */ +static void __exit fscache_exit(void) +{ + _enter(""); + + kobject_put(fscache_root); + kmem_cache_destroy(fscache_cookie_jar); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +#endif + fscache_proc_cleanup(); + destroy_workqueue(fscache_op_wq); + destroy_workqueue(fscache_object_wq); + printk(KERN_NOTICE "FS-Cache: Unloaded\n"); +} + +module_exit(fscache_exit); + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +int fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} +EXPORT_SYMBOL(fscache_wait_bit); + +/* + * wait_on_bit() sleep function for interruptible waiting + */ +int fscache_wait_bit_interruptible(void *flags) +{ + schedule(); + return signal_pending(current); +} +EXPORT_SYMBOL(fscache_wait_bit_interruptible); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c new file mode 100644 index 00000000..e028b8eb --- /dev/null +++ b/fs/fscache/netfs.c @@ -0,0 +1,103 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
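/*
 * The registration interface implemented below is normally reached through
 * the fscache_register_netfs()/fscache_unregister_netfs() wrappers declared
 * in linux/fscache.h (not part of this hunk); the two bare includes at the
 * top of netfs.c are linux/module.h and linux/slab.h upstream.  A minimal
 * sketch of the netfs side follows, with an illustrative structure name,
 * netfs name and version: the name becomes the FSDEF index key and the
 * version its auxiliary data, as handled by fsdef.c earlier.
 */
static struct fscache_netfs example_cache_netfs = {
	.name		= "examplefs",
	.version	= 1,
};

static int __init example_fs_init(void)
{
	/* creates the netfs's primary index cookie under the FSDEF index */
	return fscache_register_netfs(&example_cache_netfs);
}

static void __exit example_fs_exit(void)
{
	/* all other cookies must have been relinquished before this */
	fscache_unregister_netfs(&example_cache_netfs);
}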
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", + netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c new file mode 100644 index 00000000..ebe29c58 --- /dev/null +++ b/fs/fscache/object-list.c @@ -0,0 +1,432 @@ +/* Global fscache object list maintainer and viewer + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
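/*
 * The list maintained below keys each fscache_object by its address in an
 * rbtree and is read back through /proc/fs/fscache/objects.  The five bare
 * includes that open this file are linux/module.h, linux/seq_file.h,
 * linux/slab.h, linux/key.h and keys/user-type.h in the upstream tree; the
 * last two support the key-driven filtering implemented further down.
 */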
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include +#include +#include +#include "internal.h" + +static struct rb_root fscache_object_list; +static DEFINE_RWLOCK(fscache_object_list_lock); + +struct fscache_objlist_data { + unsigned long config; /* display configuration */ +#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ +#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ +#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ +#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ +#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ +#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ +#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ +#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ +#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ +#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ +#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ +#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ + + u8 buf[512]; /* key and aux data buffer */ +}; + +/* + * Add an object to the object list + * - we use the address of the fscache_object structure as the key into the + * tree + */ +void fscache_objlist_add(struct fscache_object *obj) +{ + struct fscache_object *xobj; + struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + + write_lock(&fscache_object_list_lock); + + while (*p) { + parent = *p; + xobj = rb_entry(parent, struct fscache_object, objlist_link); + + if (obj < xobj) + p = &(*p)->rb_left; + else if (obj > xobj) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&obj->objlist_link, parent, p); + rb_insert_color(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} + +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. 
+ */ +void fscache_object_destroy(struct fscache_object *obj) +{ + write_lock(&fscache_object_list_lock); + + BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); + rb_erase(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} +EXPORT_SYMBOL(fscache_object_destroy); + +/* + * find the object in the tree on or after the specified index + */ +static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) +{ + struct fscache_object *pobj, *obj = NULL, *minobj = NULL; + struct rb_node *p; + unsigned long pos; + + if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) + return NULL; + pos = *_pos; + + /* banners (can't represent line 0 by pos 0 as that would involve + * returning a NULL pointer) */ + if (pos == 0) + return (struct fscache_object *)(long)++(*_pos); + if (pos < 3) + return (struct fscache_object *)pos; + + pobj = (struct fscache_object *)pos; + p = fscache_object_list.rb_node; + while (p) { + obj = rb_entry(p, struct fscache_object, objlist_link); + if (pobj < obj) { + if (!minobj || minobj > obj) + minobj = obj; + p = p->rb_left; + } else if (pobj > obj) { + p = p->rb_right; + } else { + minobj = obj; + break; + } + obj = NULL; + } + + if (!minobj) + *_pos = (unsigned long) ERR_PTR(-ENOENT); + else if (minobj != obj) + *_pos = (unsigned long) minobj; + return minobj; +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_object_list_lock) +{ + read_lock(&fscache_object_list_lock); + return fscache_objlist_lookup(_pos); +} + +/* + * move to the next line + */ +static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) +{ + (*_pos)++; + return fscache_objlist_lookup(_pos); +} + +/* + * clean up after reading + */ +static void fscache_objlist_stop(struct seq_file *m, void *v) + __releases(&fscache_object_list_lock) +{ + read_unlock(&fscache_object_list_lock); +} + +/* + * display an object + */ +static int fscache_objlist_show(struct seq_file *m, void *v) +{ + struct fscache_objlist_data *data = m->private; + struct fscache_object *obj = v; + unsigned long config = data->config; + uint16_t keylen, auxlen; + char _type[3], *type; + bool no_cookie; + u8 *buf = data->buf, *p; + + if ((unsigned long) v == 1) { + seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" + " EM EV F S" + " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " "); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_puts(m, "OBJECT_KEY"); + if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) == + (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, ", "); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + seq_puts(m, "AUX_DATA"); + seq_puts(m, "\n"); + return 0; + } + + if ((unsigned long) v == 2) { + seq_puts(m, "======== ======== ==== ===== === === === == =====" + " == == = =" + " | ================ == == ================"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " ================"); + seq_puts(m, "\n"); + return 0; + } + + /* filter out any unwanted objects */ +#define FILTER(criterion, _yes, _no) \ + do { \ + unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ + unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ + if (criterion) { \ + if (!(config & yes)) \ + return 0; \ + } else { \ + if (!(config & no)) \ + return 0; \ + } \ + } while(0) + + if (~config) { + FILTER(obj->cookie, + 
COOKIE, NOCOOKIE); + FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || + obj->n_ops != 0 || + obj->n_obj_ops != 0 || + obj->flags || + !list_empty(&obj->dependents), + BUSY, IDLE); + FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), + PENDWR, NOPENDWR); + FILTER(atomic_read(&obj->n_reads), + READS, NOREADS); + FILTER(obj->events & obj->event_mask, + EVENTS, NOEVENTS); + FILTER(work_busy(&obj->work), WORK, NOWORK); + } + + seq_printf(m, + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", + obj->debug_id, + obj->parent ? obj->parent->debug_id : -1, + fscache_object_states_short[obj->state], + obj->n_children, + obj->n_ops, + obj->n_obj_ops, + obj->n_in_progress, + obj->n_exclusive, + atomic_read(&obj->n_reads), + obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, + obj->events, + obj->flags, + work_busy(&obj->work)); + + no_cookie = true; + keylen = auxlen = 0; + if (obj->cookie) { + spin_lock(&obj->lock); + if (obj->cookie) { + switch (obj->cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", + obj->cookie->def->type); + type = _type; + break; + } + + seq_printf(m, "%-16s %s %2lx %16p", + obj->cookie->def->name, + type, + obj->cookie->flags, + obj->cookie->netfs_data); + + if (obj->cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = obj->cookie->def->get_key( + obj->cookie->netfs_data, + buf, 400); + + if (obj->cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = obj->cookie->def->get_aux( + obj->cookie->netfs_data, + buf + keylen, 512 - keylen); + + no_cookie = false; + } + spin_unlock(&obj->lock); + + if (!no_cookie && (keylen > 0 || auxlen > 0)) { + seq_printf(m, " "); + for (p = buf; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_printf(m, ", "); + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + } + + if (no_cookie) + seq_printf(m, "\n"); + else + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations fscache_objlist_ops = { + .start = fscache_objlist_start, + .stop = fscache_objlist_stop, + .next = fscache_objlist_next, + .show = fscache_objlist_show, +}; + +/* + * get the configuration for filtering the list + */ +static void fscache_objlist_config(struct fscache_objlist_data *data) +{ +#ifdef CONFIG_KEYS + struct user_key_payload *confkey; + unsigned long config; + struct key *key; + const char *buf; + int len; + + key = request_key(&key_type_user, "fscache:objlist", NULL); + if (IS_ERR(key)) + goto no_config; + + config = 0; + rcu_read_lock(); + + confkey = key->payload.data; + buf = confkey->data; + + for (len = confkey->datalen - 1; len >= 0; len--) { + switch (buf[len]) { + case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; + case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; + case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; + case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; + case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; + case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; + case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; + case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; + case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; + case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; + case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; + case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; + } + } + + rcu_read_unlock(); + key_put(key); + + if (!(config & 
(FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) + config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) + config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) + config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; + if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) + config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) + config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) + config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; + + data->config = config; + return; + +no_config: +#endif + data->config = ULONG_MAX; +} + +/* + * open "/proc/fs/fscache/objects" to provide a list of active objects + * - can be configured by a user-defined key added to the caller's keyrings + */ +static int fscache_objlist_open(struct inode *inode, struct file *file) +{ + struct fscache_objlist_data *data; + struct seq_file *m; + int ret; + + ret = seq_open(file, &fscache_objlist_ops); + if (ret < 0) + return ret; + + m = file->private_data; + + /* buffer for key extraction */ + data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL); + if (!data) { + seq_release(inode, file); + return -ENOMEM; + } + + /* get the configuration key */ + fscache_objlist_config(data); + + m->private = data; + return 0; +} + +/* + * clean up on close + */ +static int fscache_objlist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + kfree(m->private); + m->private = NULL; + return seq_release(inode, file); +} + +const struct file_operations fscache_objlist_fops = { + .owner = THIS_MODULE, + .open = fscache_objlist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fscache_objlist_release, +}; diff --git a/fs/fscache/object.c b/fs/fscache/object.c new file mode 100644 index 00000000..b6b897c5 --- /dev/null +++ b/fs/fscache/object.c @@ -0,0 +1,892 @@ +/* FS-Cache object state machine handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/object.txt for a description of the + * object state machine and the in-kernel representations. 
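/*
 * Usage note for the filtering above: fscache_objlist_config() looks for a
 * user-type key described "fscache:objlist" in the reading process's
 * keyrings, and each character of its payload sets one CONFIG flag.  For
 * example, creating the key with payload "KB" before opening the proc file
 * (roughly: keyctl add user fscache:objlist "KB" @s) restricts the listing
 * to busy objects and appends each object's key bytes; with no such key
 * present nothing is filtered and both key and auxiliary data are dumped
 * for every object.  (The single bare include opening object.c below is
 * linux/module.h upstream.)
 */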
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include "internal.h" + +const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { + [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", + [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", + [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", + [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", + [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", + [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", + [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", + [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", + [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", + [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", + [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", + [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", +}; +EXPORT_SYMBOL(fscache_object_states); + +const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { + [FSCACHE_OBJECT_INIT] = "INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", + [FSCACHE_OBJECT_CREATING] = "CRTN", + [FSCACHE_OBJECT_AVAILABLE] = "AVBL", + [FSCACHE_OBJECT_ACTIVE] = "ACTV", + [FSCACHE_OBJECT_UPDATING] = "UPDT", + [FSCACHE_OBJECT_DYING] = "DYNG", + [FSCACHE_OBJECT_LC_DYING] = "LCDY", + [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", + [FSCACHE_OBJECT_RELEASING] = "RELS", + [FSCACHE_OBJECT_RECYCLING] = "RCYC", + [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", + [FSCACHE_OBJECT_DEAD] = "DEAD", +}; + +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); +static void fscache_initialise_object(struct fscache_object *); +static void fscache_lookup_object(struct fscache_object *); +static void fscache_object_available(struct fscache_object *); +static void fscache_release_object(struct fscache_object *); +static void fscache_withdraw_object(struct fscache_object *); +static void fscache_enqueue_dependents(struct fscache_object *); +static void fscache_dequeue_object(struct fscache_object *); + +/* + * we need to notify the parent when an op completes that we had outstanding + * upon it + */ +static inline void fscache_done_parent_op(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + + _enter("OBJ%x {OBJ%x,%x}", + object->debug_id, parent->debug_id, parent->n_ops); + + spin_lock_nested(&parent->lock, 1); + parent->n_ops--; + parent->n_obj_ops--; + if (parent->n_ops == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); +} + +/* + * process events that have been sent to an object's state machine + * - initiates parent lookup + * - does object lookup + * - does object creation + * - does object recycling and retirement + * - does object withdrawal + */ +static void fscache_object_state_machine(struct fscache_object *object) +{ + enum fscache_object_state new_state; + struct fscache_cookie *cookie; + + ASSERT(object != NULL); + + _enter("{OBJ%x,%s,%lx}", + object->debug_id, fscache_object_states[object->state], + object->events); + + switch (object->state) { + /* wait for the parent object to become ready */ + case FSCACHE_OBJECT_INIT: + object->event_mask = + ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + fscache_initialise_object(object); + goto done; + + /* look up the object metadata on disk */ + case FSCACHE_OBJECT_LOOKING_UP: + fscache_lookup_object(object); + goto lookup_transit; + + /* create the object metadata on disk */ + case FSCACHE_OBJECT_CREATING: + fscache_lookup_object(object); + goto lookup_transit; + + /* handle an object becoming available; start pending + * operations and queue 
dependent operations for processing */ + case FSCACHE_OBJECT_AVAILABLE: + fscache_object_available(object); + goto active_transit; + + /* normal running state */ + case FSCACHE_OBJECT_ACTIVE: + goto active_transit; + + /* update the object metadata on disk */ + case FSCACHE_OBJECT_UPDATING: + clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); + goto active_transit; + + /* handle an object dying during lookup or creation */ + case FSCACHE_OBJECT_LC_DYING: + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); + + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DYING; + cookie = object->cookie; + if (cookie) { + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_LOOKING_UP); + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_CREATING); + } + spin_unlock(&object->lock); + + fscache_done_parent_op(object); + + /* wait for completion of all active operations on this object + * and the death of all child objects of this object */ + case FSCACHE_OBJECT_DYING: + dying: + clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); + spin_lock(&object->lock); + _debug("dying OBJ%x {%d,%d}", + object->debug_id, object->n_ops, object->n_children); + if (object->n_ops == 0 && object->n_children == 0) { + object->event_mask &= + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + object->event_mask |= + (1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR); + } else { + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + object->event_mask |= + 1 << FSCACHE_OBJECT_EV_CLEARED; + } + spin_unlock(&object->lock); + fscache_enqueue_dependents(object); + fscache_start_operations(object); + goto terminal_transit; + + /* handle an abort during initialisation */ + case FSCACHE_OBJECT_ABORT_INIT: + _debug("handle abort init %lx", object->events); + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + + spin_lock(&object->lock); + fscache_dequeue_object(object); + + object->state = FSCACHE_OBJECT_DYING; + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_CREATING); + spin_unlock(&object->lock); + goto dying; + + /* handle the netfs releasing an object and possibly marking it + * obsolete too */ + case FSCACHE_OBJECT_RELEASING: + case FSCACHE_OBJECT_RECYCLING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + fscache_release_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* handle the parent cache of this object being withdrawn from + * active service */ + case FSCACHE_OBJECT_WITHDRAWING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << 
FSCACHE_OBJECT_EV_ERROR)); + fscache_withdraw_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* complain about the object being woken up once it is + * deceased */ + case FSCACHE_OBJECT_DEAD: + printk(KERN_ERR "FS-Cache:" + " Unexpected event in dead state %lx\n", + object->events & object->event_mask); + BUG(); + + default: + printk(KERN_ERR "FS-Cache: Unknown object state %u\n", + object->state); + BUG(); + } + + /* determine the transition from a lookup state */ +lookup_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_LC_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_REQUEUE: + goto done; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from an active state */ +active_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_UPDATE: + new_state = FSCACHE_OBJECT_UPDATING; + goto change_state; + case -1: + new_state = FSCACHE_OBJECT_ACTIVE; + goto change_state; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from a terminal state */ +terminal_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_RETIRE: + new_state = FSCACHE_OBJECT_RECYCLING; + goto change_state; + case FSCACHE_OBJECT_EV_RELEASE: + new_state = FSCACHE_OBJECT_RELEASING; + goto change_state; + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_CLEARED: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + +change_state: + spin_lock(&object->lock); + object->state = new_state; + spin_unlock(&object->lock); + +done: + _leave(" [->%s]", fscache_object_states[object->state]); + return; + +unsupported_event: + printk(KERN_ERR "FS-Cache:" + " Unsupported event %lx [mask %lx] in state %s\n", + object->events, object->event_mask, + fscache_object_states[object->state]); + BUG(); +} + +/* + * execute an object + */ +void fscache_object_work_func(struct work_struct *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + unsigned long start; + + _enter("{OBJ%x}", object->debug_id); + + start = jiffies; + fscache_object_state_machine(object); + fscache_hist(fscache_objs_histogram, start); + if (object->events & object->event_mask) + fscache_enqueue_object(object); + clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); +} +EXPORT_SYMBOL(fscache_object_work_func); + +/* + * initialise an object + * - check the specified object's parent to see if we can make use of it + * immediately to do a creation + * - we may need to start the process of creating a parent and we need to wait + * for the parent's lookup and creation to complete if it's not there yet + * - an object's cookie is pinned until we clear 
FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_initialise_object(struct fscache_object *object) +{ + struct fscache_object *parent; + + _enter(""); + ASSERT(object->cookie != NULL); + ASSERT(object->cookie->parent != NULL); + + if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { + _debug("abort init %lx", object->events); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_ABORT_INIT; + spin_unlock(&object->lock); + return; + } + + spin_lock(&object->cookie->lock); + spin_lock_nested(&object->cookie->parent->lock, 1); + + parent = object->parent; + if (!parent) { + _debug("no parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else { + spin_lock(&object->lock); + spin_lock_nested(&parent->lock, 1); + _debug("parent %s", fscache_object_states[parent->state]); + + if (parent->state >= FSCACHE_OBJECT_DYING) { + _debug("bad parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { + _debug("wait"); + + /* we may get woken up in this state by child objects + * binding on to us, so we need to make sure we don't + * add ourself to the list multiple times */ + if (list_empty(&object->dep_link)) { + fscache_stat(&fscache_n_cop_grab_object); + object->cache->ops->grab_object(object); + fscache_stat_d(&fscache_n_cop_grab_object); + list_add(&object->dep_link, + &parent->dependents); + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + if (parent->state == FSCACHE_OBJECT_INIT) + fscache_enqueue_object(parent); + } + } else { + _debug("go"); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + object->state = FSCACHE_OBJECT_LOOKING_UP; + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + + spin_unlock(&parent->lock); + spin_unlock(&object->lock); + } + + spin_unlock(&object->cookie->parent->lock); + spin_unlock(&object->cookie->lock); + _leave(""); +} + +/* + * look an object up in the cache from which it was allocated + * - we hold an "access lock" on the parent object, so the parent object cannot + * be withdrawn by either party till we've finished + * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_lookup_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_object *parent; + int ret; + + _enter(""); + + parent = object->parent; + ASSERT(parent != NULL); + ASSERTCMP(parent->n_ops, >, 0); + ASSERTCMP(parent->n_obj_ops, >, 0); + + /* make sure the parent is still available */ + ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); + + if (parent->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + _debug("unavailable"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + _leave(""); + return; + } + + _debug("LOOKUP \"%s/%s\" in \"%s\"", + parent->cookie->def->name, cookie->def->name, + object->cache->tag->name); + + fscache_stat(&fscache_n_object_lookups); + fscache_stat(&fscache_n_cop_lookup_object); + ret = object->cache->ops->lookup_object(object); + fscache_stat_d(&fscache_n_cop_lookup_object); + + if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + if (ret == -ETIMEDOUT) { + /* probably stuck behind 
another object, so move this one to + * the back of the queue */ + fscache_stat(&fscache_n_object_lookups_timed_out); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + + _leave(""); +} + +/** + * fscache_object_lookup_negative - Note negative cookie lookup + * @object: Object pointing to cookie to mark + * + * Note negative lookup, permitting those waiting to read data from an already + * existing backing object to continue as there's no data for them to read. + */ +void fscache_object_lookup_negative(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_negative); + + /* transit here to allow write requests to begin stacking up + * and read requests to begin returning ENODATA */ + object->state = FSCACHE_OBJECT_CREATING; + spin_unlock(&object->lock); + + set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + _debug("wake up lookup %p", &cookie->flags); + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + spin_unlock(&object->lock); + } + + _leave(""); +} +EXPORT_SYMBOL(fscache_object_lookup_negative); + +/** + * fscache_obtained_object - Note successful object lookup or creation + * @object: Object pointing to cookie to mark + * + * Note successful lookup and/or creation, permitting those waiting to write + * data to a backing object to continue. + * + * Note that after calling this, an object's cookie may be relinquished by the + * netfs, and so must be accessed with object lock held. 
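/*
 * fscache_object_lookup_negative() and fscache_obtained_object() are the
 * two calls by which a cache backend's ->lookup_object() operation reports
 * back into this state machine.  A rough sketch of such a backend routine
 * (the helpers are hypothetical; cachefiles follows the same pattern):
 */
static int example_lookup_object(struct fscache_object *object)
{
	if (!example_backing_file_present(object)) {
		/* nothing on disk yet: readers start seeing ENODATA while
		 * the backing object is created */
		fscache_object_lookup_negative(object);
		example_create_backing_file(object);
	}

	/* lookup or creation complete: pending writes may now proceed */
	fscache_obtained_object(object);
	return 0;
}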
+ */ +void fscache_obtained_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + /* if we were still looking up, then we must have a positive lookup + * result, in which case there may be data available */ + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_positive); + + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + fscache_stat(&fscache_n_object_created); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + smp_wmb(); + } + + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); + + _leave(""); +} +EXPORT_SYMBOL(fscache_obtained_object); + +/* + * handle an object that has just become available + */ +static void fscache_object_available(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + spin_lock(&object->lock); + + if (object->cookie && + test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); + + fscache_done_parent_op(object); + if (object->n_in_progress == 0) { + if (object->n_ops > 0) { + ASSERTCMP(object->n_ops, >=, object->n_obj_ops); + ASSERTIF(object->n_ops > object->n_obj_ops, + !list_empty(&object->pending_ops)); + fscache_start_operations(object); + } else { + ASSERT(list_empty(&object->pending_ops)); + } + } + spin_unlock(&object->lock); + + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); + fscache_enqueue_dependents(object); + + fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); + fscache_stat(&fscache_n_object_avail); + + _leave(""); +} + +/* + * drop an object's attachments + */ +static void fscache_drop_object(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + struct fscache_cache *cache = object->cache; + + _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + + ASSERTCMP(object->cookie, ==, NULL); + ASSERT(hlist_unhashed(&object->cookie_link)); + + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); + + fscache_stat(&fscache_n_cop_drop_object); + cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); + + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); + + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; + } + + /* this just shifts the object release to the work processor */ + fscache_put_object(object); + + _leave(""); +} + +/* + * release or recycle an object that the netfs has discarded + */ +static void fscache_release_object(struct fscache_object *object) +{ + _enter(""); + + fscache_drop_object(object); +} + +/* + * withdraw 
an object from active service + */ +static void fscache_withdraw_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie; + bool detached; + + _enter(""); + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* need to get the cookie lock before the object lock, starting + * from the object pointer */ + atomic_inc(&cookie->usage); + spin_unlock(&object->lock); + + detached = false; + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (object->cookie == cookie) { + hlist_del_init(&object->cookie_link); + object->cookie = NULL; + detached = true; + } + spin_unlock(&cookie->lock); + fscache_cookie_put(cookie); + if (detached) + fscache_cookie_put(cookie); + } + + spin_unlock(&object->lock); + + fscache_drop_object(object); +} + +/* + * withdraw an object from active service at the behest of the cache + * - need break the links to a cached object cookie + * - called under two situations: + * (1) recycler decides to reclaim an in-use object + * (2) a cache is unmounted + * - have to take care as the cookie can be being relinquished by the netfs + * simultaneously + * - the object is pinned by the caller holding a refcount on it + */ +void fscache_withdrawing_object(struct fscache_cache *cache, + struct fscache_object *object) +{ + bool enqueue = false; + + _enter(",OBJ%x", object->debug_id); + + spin_lock(&object->lock); + if (object->state < FSCACHE_OBJECT_WITHDRAWING) { + object->state = FSCACHE_OBJECT_WITHDRAWING; + enqueue = true; + } + spin_unlock(&object->lock); + + if (enqueue) + fscache_enqueue_object(object); + + _leave(""); +} + +/* + * get a ref on an object + */ +static int fscache_get_object(struct fscache_object *object) +{ + int ret; + + fscache_stat(&fscache_n_cop_grab_object); + ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + fscache_stat_d(&fscache_n_cop_grab_object); + return ret; +} + +/* + * discard a ref on a work item + */ +static void fscache_put_object(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cop_put_object); + object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); +} + +/* + * enqueue an object for metadata-type processing + */ +void fscache_enqueue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. 
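/*
 * A schematic caller of the helper below, following the contract stated
 * above (the wait queue and condition are hypothetical; the caller arms its
 * own wake-up event and sleep state before sleeping here):
 *
 *	signed long timeout = 2 * HZ;
 *	DEFINE_WAIT(my_wait);
 *
 *	add_wait_queue(&example_waitq, &my_wait);
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	if (!example_condition_met())
 *		fscache_object_sleep_till_congested(&timeout);
 *	finish_wait(&example_waitq, &my_wait);
 */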
+ */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); +} +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); + +/* + * enqueue the dependents of an object for metadata-type processing + * - the caller must hold the object's lock + * - this may cause an already locked object to wind up being processed again + */ +static void fscache_enqueue_dependents(struct fscache_object *object) +{ + struct fscache_object *dep; + + _enter("{OBJ%x}", object->debug_id); + + if (list_empty(&object->dependents)) + return; + + spin_lock(&object->lock); + + while (!list_empty(&object->dependents)) { + dep = list_entry(object->dependents.next, + struct fscache_object, dep_link); + list_del_init(&dep->dep_link); + + + /* sort onto appropriate lists */ + fscache_enqueue_object(dep); + fscache_put_object(dep); + + if (!list_empty(&object->dependents)) + cond_resched_lock(&object->lock); + } + + spin_unlock(&object->lock); +} + +/* + * remove an object from whatever queue it's waiting on + * - the caller must hold object->lock + */ +void fscache_dequeue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (!list_empty(&object->dep_link)) { + spin_lock(&object->parent->lock); + list_del_init(&object->dep_link); + spin_unlock(&object->parent->lock); + } + + _leave(""); +} + +/** + * fscache_check_aux - Ask the netfs whether an object on disk is still valid + * @object: The object to ask about + * @data: The auxiliary data for the object + * @datalen: The size of the auxiliary data + * + * This function consults the netfs about the coherency state of an object + */ +enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, uint16_t datalen) +{ + enum fscache_checkaux result; + + if (!object->cookie->def->check_aux) { + fscache_stat(&fscache_n_checkaux_none); + return FSCACHE_CHECKAUX_OKAY; + } + + result = object->cookie->def->check_aux(object->cookie->netfs_data, + data, datalen); + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + fscache_stat(&fscache_n_checkaux_okay); + break; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + fscache_stat(&fscache_n_checkaux_update); + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + fscache_stat(&fscache_n_checkaux_obsolete); + break; + + default: + BUG(); + } + + return result; +} +EXPORT_SYMBOL(fscache_check_aux); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c new file mode 100644 index 00000000..30afdfa7 --- /dev/null +++ b/fs/fscache/operation.c @@ -0,0 +1,463 @@ +/* FS-Cache worker operation management routines + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
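/*
 * The three bare includes that open operation.c below are likely
 * linux/module.h, linux/seq_file.h and linux/slab.h, matching the upstream
 * tree.
 */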
+ * + * See Documentation/filesystems/caching/operations.txt + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +atomic_t fscache_op_debug_id; +EXPORT_SYMBOL(fscache_op_debug_id); + +/** + * fscache_enqueue_operation - Enqueue an operation for processing + * @op: The operation to enqueue + * + * Enqueue an operation for processing by the FS-Cache thread pool. + * + * This will get its own ref on the object. + */ +void fscache_enqueue_operation(struct fscache_operation *op) +{ + _enter("{OBJ%x OP%x,%u}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(list_empty(&op->pend_link)); + ASSERT(op->processor != NULL); + ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + fscache_stat(&fscache_n_op_enqueue); + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_ASYNC: + _debug("queue async"); + atomic_inc(&op->usage); + if (!queue_work(fscache_op_wq, &op->work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + printk(KERN_ERR "FS-Cache: Unexpected op type %lx", + op->flags); + BUG(); + break; + } +} +EXPORT_SYMBOL(fscache_enqueue_operation); + +/* + * start an op running + */ +static void fscache_run_op(struct fscache_object *object, + struct fscache_operation *op) +{ + object->n_in_progress++; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + fscache_stat(&fscache_n_op_run); +} + +/* + * submit an exclusive operation for an object + * - other ops are excluded from running simultaneously with this one + * - this gets any extra refs it needs on an op + */ +int fscache_submit_exclusive_op(struct fscache_object *object, + struct fscache_operation *op) +{ + int ret; + + _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); + + ret = -ENOBUFS; + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + + if (object->n_ops > 1) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_in_progress, ==, 0); + fscache_run_op(object, op); + } + + /* need to issue a new write op after this */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else { + /* not allowed to submit ops in any other state */ + BUG(); + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + unsigned long ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) 
+ return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, + fscache_object_states[object->state]); + kdebug("objstate=%s [%s]", + fscache_object_states[object->state], + fscache_object_states[ostate]); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* + * submit an operation for an object + * - objects may be submitted only in the following states: + * - during object creation (write ops may be submitted) + * - whilst the object is active + * - after an I/O error incurred in one of the two above states (op rejected) + * - this gets any extra refs it needs on an op + */ +int fscache_submit_op(struct fscache_object *object, + struct fscache_operation *op) +{ + unsigned long ostate; + int ret; + + _enter("{OBJ%x OP%x},{%u}", + object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); + + ostate = object->state; + smp_rmb(); + + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + + if (object->n_exclusive > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_exclusive, ==, 0); + fscache_run_op(object, op); + } + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else if (object->state == FSCACHE_OBJECT_DYING || + object->state == FSCACHE_OBJECT_LC_DYING || + object->state == FSCACHE_OBJECT_WITHDRAWING) { + fscache_stat(&fscache_n_op_rejected); + ret = -ENOBUFS; + } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + fscache_report_unexpected_submission(object, op, ostate); + ASSERT(!fscache_object_is_active(object)); + ret = -ENOBUFS; + } else { + ret = -ENOBUFS; + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * queue an object for withdrawal on error, aborting all following asynchronous + * operations + */ +void fscache_abort_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +} + +/* + * jump start the operation processing on an object + * - caller must hold object->lock + */ +void fscache_start_operations(struct fscache_object *object) +{ + struct fscache_operation *op; + bool stop = false; + + while (!list_empty(&object->pending_ops) && !stop) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + if (object->n_in_progress > 0) + break; + stop = true; + } + list_del_init(&op->pend_link); + 
fscache_run_op(object, op); + + /* the pending queue was holding a ref on the object */ + fscache_put_operation(op); + } + + ASSERTCMP(object->n_in_progress, <=, object->n_ops); + + _debug("woke %d ops on OBJ%x", + object->n_in_progress, object->debug_id); +} + +/* + * cancel an operation that's pending on an object + */ +int fscache_cancel_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + + spin_lock(&object->lock); + + ret = -EBUSY; + if (!list_empty(&op->pend_link)) { + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + object->n_ops--; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + ret = 0; + } + + spin_unlock(&object->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * release an operation + * - queues pending ops if this is the last in-progress op + */ +void fscache_put_operation(struct fscache_operation *op) +{ + struct fscache_object *object; + struct fscache_cache *cache; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (!atomic_dec_and_test(&op->usage)) + return; + + _debug("PUT OP"); + if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) + BUG(); + + fscache_stat(&fscache_n_op_release); + + if (op->release) { + op->release(op); + op->release = NULL; + } + + object = op->object; + + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + kfree(op); + _leave(" [done]"); +} +EXPORT_SYMBOL(fscache_put_operation); + +/* + * garbage collect operations that have had their release deferred + */ +void fscache_operation_gc(struct work_struct *work) +{ + struct fscache_operation *op; + struct fscache_object *object; + struct fscache_cache *cache = + container_of(work, struct fscache_cache, op_gc); + int count = 0; + + _enter(""); + + do { + spin_lock(&cache->op_gc_list_lock); + if (list_empty(&cache->op_gc_list)) { + spin_unlock(&cache->op_gc_list_lock); + break; + } + + op = list_entry(cache->op_gc_list.next, + struct fscache_operation, pend_link); + list_del(&op->pend_link); + spin_unlock(&cache->op_gc_list_lock); + + object = op->object; + + _debug("GC DEFERRED REL OBJ%x OP%x", + object->debug_id, op->debug_id); + fscache_stat(&fscache_n_op_gc); + + ASSERTCMP(atomic_read(&op->usage), ==, 0); + + spin_lock(&object->lock); + if 
(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + } while (count++ < 20); + + if (!list_empty(&cache->op_gc_list)) + schedule_work(&cache->op_gc); + + _leave(""); +} + +/* + * execute an operation using fs_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one + */ +void fscache_op_work_func(struct work_struct *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, work); + unsigned long start; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + start = jiffies; + op->processor(op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); + + _leave(""); +} diff --git a/fs/fscache/page.c b/fs/fscache/page.c new file mode 100644 index 00000000..3f7a59bf --- /dev/null +++ b/fs/fscache/page.c @@ -0,0 +1,996 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * decide whether a page can be released, possibly by cancelling a store to it + * - we're allowed to sleep if __GFP_WAIT is flagged + */ +bool __fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct page *xpage; + void *val; + + _enter("%p,%p,%x", cookie, page, gfp); + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + if (!val) { + rcu_read_unlock(); + fscache_stat(&fscache_n_store_vmscan_not_storing); + __fscache_uncache_page(cookie, page); + return true; + } + + /* see if the page is actually undergoing storage - if so we can't get + * rid of it till the cache has finished with it */ + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + rcu_read_unlock(); + goto page_busy; + } + + /* the page is pending storage, so we attempt to cancel the store and + * discard the store request so that the page can be reclaimed */ + spin_lock(&cookie->stores_lock); + rcu_read_unlock(); + + if (radix_tree_tag_get(&cookie->stores, page->index, 
+ FSCACHE_COOKIE_STORING_TAG)) { + /* the page started to undergo storage whilst we were looking, + * so now we can only wait or return */ + spin_unlock(&cookie->stores_lock); + goto page_busy; + } + + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + + if (xpage) { + fscache_stat(&fscache_n_store_vmscan_cancelled); + fscache_stat(&fscache_n_store_radix_deletes); + ASSERTCMP(xpage, ==, page); + } else { + fscache_stat(&fscache_n_store_vmscan_gone); + } + + wake_up_bit(&cookie->flags, 0); + if (xpage) + page_cache_release(xpage); + __fscache_uncache_page(cookie, page); + return true; + +page_busy: + /* we might want to wait here, but that could deadlock the allocator as + * the work threads writing to the cache may all end up sleeping + * on memory allocation */ + fscache_stat(&fscache_n_store_vmscan_busy); + return false; +} +EXPORT_SYMBOL(__fscache_maybe_release_page); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_object *object, + struct page *page) +{ + struct fscache_cookie *cookie; + struct page *xpage = NULL; + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* delete the page from the tree if it is now no longer + * pending */ + spin_lock(&cookie->stores_lock); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + if (!radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG)) { + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + } + spin_unlock(&cookie->stores_lock); + wake_up_bit(&cookie->flags, 0); + } + spin_unlock(&object->lock); + if (xpage) + page_cache_release(xpage); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object)) { + fscache_stat(&fscache_n_cop_attr_changed); + ret = object->cache->ops->attr_changed(object); + fscache_stat_d(&fscache_n_cop_attr_changed); + if (ret < 0) + fscache_abort_object(object); + } + + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * release a retrieval op 
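__fscache_maybe_release_page() is normally reached from a netfs ->releasepage() hook through the fscache_maybe_release_page() inline wrapper in include/linux/fscache.h. A minimal sketch, assuming a my_i_cookie() accessor that returns the inode's cookie (both the hook and the accessor are illustrative, not part of this patch):

/* illustrative netfs ->releasepage() hook (not part of this patch) */
static int my_release_page(struct page *page, gfp_t gfp)
{
	if (PageFsCache(page)) {
		struct fscache_cookie *cookie = my_i_cookie(page->mapping->host);

		/* keep the page while the cache is still writing it out */
		if (!fscache_maybe_release_page(cookie, page, gfp))
			return 0;
	}
	return 1;	/* the page may be released */
}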
reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + fscache_hist(fscache_retrieval_histogram, op->start_time); + if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * wait for an object to become active (or dead) + */ +static int fscache_wait_for_retrieval_activation(struct fscache_object *object, + struct fscache_retrieval *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead) +{ + int ret; + + if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) + goto check_if_dead; + + _debug(">>> WT"); + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) + return -ERESTARTSYS; + + /* it's been removed from the pending queue by another party, + * so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + _debug("<<< GO"); + +check_if_dead: + if (unlikely(fscache_object_is_dead(object))) { + fscache_stat(stat_object_dead); + return -ENOBUFS; + } + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, 
FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_page); + ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); + if (ret == 0) + ret = -ENODATA; + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_page); + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_page); + } + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + 
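__fscache_read_or_alloc_page() backs the netfs-facing fscache_read_or_alloc_page() wrapper. A minimal sketch of a ->readpage() built on it, assuming my_i_cookie() and my_readpage_from_server() as netfs helpers (neither is part of this patch); the switch mirrors the return codes documented above:

/* illustrative netfs read path (not part of this patch) */
static void my_read_complete(struct page *page, void *context, int error)
{
	/* a real netfs would fall back to a server read on error */
	if (!error)
		SetPageUptodate(page);
	unlock_page(page);
}

static int my_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	ret = fscache_read_or_alloc_page(my_i_cookie(inode), page,
					 my_read_complete, NULL, GFP_KERNEL);
	switch (ret) {
	case 0:			/* read dispatched to the cache */
		return 0;
	case -ENOBUFS:		/* no backing object available */
	case -ENODATA:		/* block not yet in the cache */
		return my_readpage_from_server(file, page);
	default:
		unlock_page(page);
		return ret;
	}
}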
ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(mapping, end_io_func, context); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_pages); + ret = object->cache->ops->allocate_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_allocate_pages); + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_pages); + ret = object->cache->ops->read_or_alloc_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); + } + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_alloc_op_waits), + __fscache_stat(&fscache_n_allocs_object_dead)); + if 
(ret < 0) + goto error; + + /* ask the cache to honour the operation */ + fscache_stat(&fscache_n_cop_allocate_page); + ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); + +error: + if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_allocs_intr); + else if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&object->lock); + cookie = object->cookie; + + if (!fscache_object_is_active(object) || !cookie) { + spin_unlock(&object->lock); + _leave(""); + return; + } + + spin_lock(&cookie->stores_lock); + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > op->store_limit) { + fscache_stat(&fscache_n_store_pages_over_limit); + goto superseded; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + + fscache_stat(&fscache_n_store_pages); + fscache_stat(&fscache_n_cop_write_page); + ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); + fscache_end_page_write(object, page); + if (ret < 0) { + fscache_abort_object(object); + } else { + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + spin_unlock(&cookie->stores_lock); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred + * fill op) + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * (FSCACHE_COOKIE_INITIAL_FILL is set) + * + * (a) fill point not yet reached this page (mark page 
for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); + + ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + spin_lock(&cookie->stores_lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + if (fscache_submit_op(object, &op->op) < 0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + spin_lock(&cookie->stores_lock); + radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); 
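__fscache_write_page() pairs with __fscache_uncache_page() in the usual netfs pattern: hand a freshly filled page to the cache and, if the store cannot be queued, uncache it again so that reclaim and truncation never wait on a write that will not happen. A minimal sketch via the inline wrappers in include/linux/fscache.h, assuming a my_i_cookie() helper (illustrative, not part of this patch):

/* illustrative "store a page after reading it from the server" helper */
static void my_readpage_to_cache(struct inode *inode, struct page *page)
{
	int ret;

	ret = fscache_write_page(my_i_cookie(inode), page, GFP_KERNEL);
	if (ret != 0)
		/* dropping the store also clears PG_fscache on the page */
		fscache_uncache_page(my_i_cookie(inode), page);
}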
+ + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + fscache_stat(&fscache_n_cop_uncache_page); + object->cache->ops->uncache_page(object, page); + fscache_stat_d(&fscache_n_cop_uncache_page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + unsigned long loop; + +#ifdef CONFIG_FSCACHE_STATS + atomic_add(pagevec->nr, &fscache_n_marks); +#endif + + for (loop = 0; loop < pagevec->nr; loop++) { + struct page *page = pagevec->pages[loop]; + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + } + + if (cookie->def->mark_pages_cached) + cookie->def->mark_pages_cached(cookie->netfs_data, + op->mapping, pagevec); + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); + +/* + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. + */ +void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t next; + int i; + + _enter("%p,%p", cookie, inode); + + if (!mapping || mapping->nrpages == 0) { + _leave(" [no pages]"); + return; + } + + pagevec_init(&pvec, 0); + next = 0; + do { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + next = page->index; + if (PageFsCache(page)) { + __fscache_wait_on_page_write(cookie, page); + __fscache_uncache_page(cookie, page); + } + } + pagevec_release(&pvec); + cond_resched(); + } while (++next); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c new file mode 100644 index 00000000..1d9e4951 --- /dev/null +++ b/fs/fscache/proc.c @@ -0,0 +1,81 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + +#ifdef CONFIG_FSCACHE_OBJECT_LIST + if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, + &fscache_objlist_fops)) + goto error_objects; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_OBJECT_LIST +error_objects: +#endif +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_OBJECT_LIST + remove_proc_entry("fs/fscache/objects", NULL); +#endif +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c new file mode 100644 index 00000000..4765190d --- /dev/null +++ b/fs/fscache/stats.c @@ -0,0 +1,280 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; +atomic_t fscache_n_op_cancelled; +atomic_t fscache_n_op_rejected; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_allocs_intr; +atomic_t fscache_n_allocs_object_dead; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrievals_object_dead; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; +atomic_t fscache_n_store_pages; +atomic_t fscache_n_store_radix_deletes; +atomic_t fscache_n_store_pages_over_limit; + +atomic_t fscache_n_store_vmscan_not_storing; +atomic_t fscache_n_store_vmscan_gone; +atomic_t fscache_n_store_vmscan_busy; +atomic_t fscache_n_store_vmscan_cancelled; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; +atomic_t fscache_n_relinquishes_retire; + +atomic_t fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_lookups_timed_out; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +atomic_t fscache_n_cop_alloc_object; +atomic_t fscache_n_cop_lookup_object; +atomic_t fscache_n_cop_lookup_complete; +atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_update_object; +atomic_t fscache_n_cop_drop_object; +atomic_t fscache_n_cop_put_object; +atomic_t fscache_n_cop_sync_cache; +atomic_t fscache_n_cop_attr_changed; +atomic_t fscache_n_cop_read_or_alloc_page; +atomic_t fscache_n_cop_read_or_alloc_pages; +atomic_t fscache_n_cop_allocate_page; +atomic_t fscache_n_cop_allocate_pages; +atomic_t fscache_n_cop_write_page; +atomic_t fscache_n_cop_uncache_page; +atomic_t fscache_n_cop_dissociate_pages; + +/* + * display the 
general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created), + atomic_read(&fscache_n_object_lookups_timed_out)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt), + atomic_read(&fscache_n_relinquishes_retire)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs), + atomic_read(&fscache_n_allocs_intr)); + seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits), + atomic_read(&fscache_n_allocs_object_dead)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits), + atomic_read(&fscache_n_retrievals_object_dead)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", + atomic_read(&fscache_n_store_ops), + 
atomic_read(&fscache_n_store_calls), + atomic_read(&fscache_n_store_pages), + atomic_read(&fscache_n_store_radix_deletes), + atomic_read(&fscache_n_store_pages_over_limit)); + + seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", + atomic_read(&fscache_n_store_vmscan_not_storing), + atomic_read(&fscache_n_store_vmscan_gone), + atomic_read(&fscache_n_store_vmscan_busy), + atomic_read(&fscache_n_store_vmscan_cancelled)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue), + atomic_read(&fscache_n_op_cancelled), + atomic_read(&fscache_n_op_rejected)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + + seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", + atomic_read(&fscache_n_cop_alloc_object), + atomic_read(&fscache_n_cop_lookup_object), + atomic_read(&fscache_n_cop_lookup_complete), + atomic_read(&fscache_n_cop_grab_object)); + seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", + atomic_read(&fscache_n_cop_update_object), + atomic_read(&fscache_n_cop_drop_object), + atomic_read(&fscache_n_cop_put_object), + atomic_read(&fscache_n_cop_attr_changed), + atomic_read(&fscache_n_cop_sync_cache)); + seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", + atomic_read(&fscache_n_cop_read_or_alloc_page), + atomic_read(&fscache_n_cop_read_or_alloc_pages), + atomic_read(&fscache_n_cop_allocate_page), + atomic_read(&fscache_n_cop_allocate_pages), + atomic_read(&fscache_n_cop_write_page), + atomic_read(&fscache_n_cop_uncache_page), + atomic_read(&fscache_n_cop_dissociate_pages)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig new file mode 100644 index 00000000..0cf160a9 --- /dev/null +++ b/fs/fuse/Kconfig @@ -0,0 +1,15 @@ +config FUSE_FS + tristate "FUSE (Filesystem in Userspace) support" + help + With FUSE it is possible to implement a fully functional filesystem + in a userspace program. + + There's also companion library: libfuse. This library along with + utilities is available from the FUSE homepage: + + + See for more information. + See for needed library/utility version. + + If you want to develop a userspace FS, or if you want to use + a filesystem based on FUSE, answer Y or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile new file mode 100644 index 00000000..e95eeb44 --- /dev/null +++ b/fs/fuse/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the FUSE filesystem. +# + +obj-$(CONFIG_FUSE_FS) += fuse.o +obj-$(CONFIG_CUSE) += cuse.o + +fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c new file mode 100644 index 00000000..85542a7d --- /dev/null +++ b/fs/fuse/control.c @@ -0,0 +1,359 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. 
+*/ + +#include "fuse_i.h" + +#include +#include + +#define FUSE_CTL_SUPER_MAGIC 0x65735543 + +/* + * This is non-NULL when the single instance of the control filesystem + * exists. Protected by fuse_mutex + */ +static struct super_block *fuse_control_sb; + +static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) +{ + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); + fc = file->f_path.dentry->d_inode->i_private; + if (fc) + fc = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + return fc; +} + +static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fuse_abort_conn(fc); + fuse_conn_put(fc); + } + return count; +} + +static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[32]; + size_t size; + + if (!*ppos) { + long value; + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; + fuse_conn_put(fc); + } + size = sprintf(tmp, "%ld\n", (long)file->private_data); + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos, unsigned val) +{ + char tmp[32]; + size_t size = sprintf(tmp, "%u\n", val); + + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, unsigned *val, + unsigned global_limit) +{ + unsigned long t; + char tmp[32]; + unsigned limit = (1 << 16) - 1; + int err; + + if (*ppos || count >= sizeof(tmp) - 1) + return -EINVAL; + + if (copy_from_user(tmp, buf, count)) + return -EINVAL; + + tmp[count] = '\0'; + + err = strict_strtoul(tmp, 0, &t); + if (err) + return err; + + if (!capable(CAP_SYS_ADMIN)) + limit = min(limit, global_limit); + + if (t > limit) + return -EINVAL; + + *val = t; + + return count; +} + +static ssize_t fuse_conn_max_background_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->max_background; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_max_background_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_bgreq); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->max_background = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static ssize_t fuse_conn_congestion_threshold_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->congestion_threshold; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_congestion_threshold_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_congthresh); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->congestion_threshold = val; + fuse_conn_put(fc); + } + } + + return 
ret; +} + +static const struct file_operations fuse_ctl_abort_ops = { + .open = nonseekable_open, + .write = fuse_conn_abort_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_ctl_waiting_ops = { + .open = nonseekable_open, + .read = fuse_conn_waiting_read, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_max_background_ops = { + .open = nonseekable_open, + .read = fuse_conn_max_background_read, + .write = fuse_conn_max_background_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_congestion_threshold_ops = { + .open = nonseekable_open, + .read = fuse_conn_congestion_threshold_read, + .write = fuse_conn_congestion_threshold_write, + .llseek = no_llseek, +}; + +static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, + struct fuse_conn *fc, + const char *name, + int mode, int nlink, + const struct inode_operations *iop, + const struct file_operations *fop) +{ + struct dentry *dentry; + struct inode *inode; + + BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + inode = new_inode(fuse_control_sb); + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = fc->user_id; + inode->i_gid = fc->group_id; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* setting ->i_op to NULL is not allowed */ + if (iop) + inode->i_op = iop; + inode->i_fop = fop; + inode->i_nlink = nlink; + inode->i_private = fc; + d_add(dentry, inode); + return dentry; +} + +/* + * Add a connection to the control filesystem (if it exists). Caller + * must hold fuse_mutex + */ +int fuse_ctl_add_conn(struct fuse_conn *fc) +{ + struct dentry *parent; + char name[32]; + + if (!fuse_control_sb) + return 0; + + parent = fuse_control_sb->s_root; + inc_nlink(parent->d_inode); + sprintf(name, "%u", fc->dev); + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + &simple_dir_inode_operations, + &simple_dir_operations); + if (!parent) + goto err; + + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + NULL, &fuse_ctl_waiting_ops) || + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + NULL, &fuse_ctl_abort_ops) || + !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, + 1, NULL, &fuse_conn_max_background_ops) || + !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", + S_IFREG | 0600, 1, NULL, + &fuse_conn_congestion_threshold_ops)) + goto err; + + return 0; + + err: + fuse_ctl_remove_conn(fc); + return -ENOMEM; +} + +/* + * Remove a connection from the control filesystem (if it exists). 
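The per-connection files created by fuse_ctl_add_conn() above are driven entirely from userspace. As a rough illustration (the fusectl mount point, typically /sys/fs/fuse/connections, and the connection number are assumptions, not dictated by this patch), aborting a connection just means writing to its "abort" file:

/* illustrative userspace helper; any write to "abort" ends up in
 * fuse_conn_abort_write() above */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int abort_fuse_conn(const char *conn_dir /* e.g. "/sys/fs/fuse/connections/42" */)
{
	char path[256];
	int fd;

	snprintf(path, sizeof(path), "%s/abort", conn_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}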
+ * Caller must hold fuse_mutex + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc) +{ + int i; + + if (!fuse_control_sb) + return; + + for (i = fc->ctl_ndents - 1; i >= 0; i--) { + struct dentry *dentry = fc->ctl_dentry[i]; + dentry->d_inode->i_private = NULL; + d_drop(dentry); + dput(dentry); + } + drop_nlink(fuse_control_sb->s_root->d_inode); +} + +static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct tree_descr empty_descr = {""}; + struct fuse_conn *fc; + int err; + + err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); + if (err) + return err; + + mutex_lock(&fuse_mutex); + BUG_ON(fuse_control_sb); + fuse_control_sb = sb; + list_for_each_entry(fc, &fuse_conn_list, entry) { + err = fuse_ctl_add_conn(fc); + if (err) { + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + return err; + } + } + mutex_unlock(&fuse_mutex); + + return 0; +} + +static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super); +} + +static void fuse_ctl_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc; + + mutex_lock(&fuse_mutex); + fuse_control_sb = NULL; + list_for_each_entry(fc, &fuse_conn_list, entry) + fc->ctl_ndents = 0; + mutex_unlock(&fuse_mutex); + + kill_litter_super(sb); +} + +static struct file_system_type fuse_ctl_fs_type = { + .owner = THIS_MODULE, + .name = "fusectl", + .mount = fuse_ctl_mount, + .kill_sb = fuse_ctl_kill_sb, +}; + +int __init fuse_ctl_init(void) +{ + return register_filesystem(&fuse_ctl_fs_type); +} + +void fuse_ctl_cleanup(void) +{ + unregister_filesystem(&fuse_ctl_fs_type); +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c new file mode 100644 index 00000000..b6cca47f --- /dev/null +++ b/fs/fuse/cuse.c @@ -0,0 +1,620 @@ +/* + * CUSE: Character device in Userspace + * + * Copyright (C) 2008-2009 SUSE Linux Products GmbH + * Copyright (C) 2008-2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * CUSE enables character devices to be implemented from userland much + * like FUSE allows filesystems. On initialization /dev/cuse is + * created. By opening the file and replying to the CUSE_INIT request + * userland CUSE server can create a character device. After that the + * operation is very similar to FUSE. + * + * A CUSE instance involves the following objects. + * + * cuse_conn : contains fuse_conn and serves as bonding structure + * channel : file handle connected to the userland CUSE server + * cdev : the implemented character device + * dev : generic device for cdev + * + * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with + * devices, it's called 'channel' to reduce confusion. + * + * channel determines when the character device dies. When channel is + * closed, everything begins to destruct. The cuse_conn is taken off + * the lookup table preventing further access from cdev, cdev and + * generic device are removed and the base reference of cuse_conn is + * put. + * + * On each open, the matching cuse_conn is looked up and if found an + * additional reference is taken which is released when the file is + * closed. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fuse_i.h" + +#define CUSE_CONNTBL_LEN 64 + +struct cuse_conn { + struct list_head list; /* linked on cuse_conntbl */ + struct fuse_conn fc; /* fuse connection */ + struct cdev *cdev; /* associated character device */ + struct device *dev; /* device representing @cdev */ + + /* init parameters, set once during initialization */ + bool unrestricted_ioctl; +}; + +static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; +static struct class *cuse_class; + +static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) +{ + return container_of(fc, struct cuse_conn, fc); +} + +static struct list_head *cuse_conntbl_head(dev_t devt) +{ + return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; +} + + +/************************************************************************** + * CUSE frontend operations + * + * These are file operations for the character device. + * + * On open, CUSE opens a file from the FUSE mnt and stores it to + * private_data of the open file. All other ops call FUSE ops on the + * FUSE file. + */ + +static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = 0; + + return fuse_direct_io(file, buf, count, &pos, 0); +} + +static ssize_t cuse_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos = 0; + /* + * No locking or generic_write_checks(), the server is + * responsible for locking and sanity checks. + */ + return fuse_direct_io(file, buf, count, &pos, 1); +} + +static int cuse_open(struct inode *inode, struct file *file) +{ + dev_t devt = inode->i_cdev->dev; + struct cuse_conn *cc = NULL, *pos; + int rc; + + /* look up and get the connection */ + spin_lock(&cuse_lock); + list_for_each_entry(pos, cuse_conntbl_head(devt), list) + if (pos->dev->devt == devt) { + fuse_conn_get(&pos->fc); + cc = pos; + break; + } + spin_unlock(&cuse_lock); + + /* dead? */ + if (!cc) + return -ENODEV; + + /* + * Generic permission check is already done against the chrdev + * file, proceed to open. 
+ */ + rc = fuse_do_open(&cc->fc, 0, file, 0); + if (rc) + fuse_conn_put(&cc->fc); + return rc; +} + +static int cuse_release(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_sync_release(ff, file->f_flags); + fuse_conn_put(fc); + + return 0; +} + +static long cuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = 0; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = FUSE_IOCTL_COMPAT; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static const struct file_operations cuse_frontend_fops = { + .owner = THIS_MODULE, + .read = cuse_read, + .write = cuse_write, + .open = cuse_open, + .release = cuse_release, + .unlocked_ioctl = cuse_file_ioctl, + .compat_ioctl = cuse_file_compat_ioctl, + .poll = fuse_file_poll, + .llseek = noop_llseek, +}; + + +/************************************************************************** + * CUSE channel initialization and destruction + */ + +struct cuse_devinfo { + const char *name; +}; + +/** + * cuse_parse_one - parse one key=value pair + * @pp: i/o parameter for the current position + * @end: points to one past the end of the packed string + * @keyp: out parameter for key + * @valp: out parameter for value + * + * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends + * at @end - 1. This function parses one pair and set *@keyp to the + * start of the key and *@valp to the start of the value. Note that + * the original string is modified such that the key string is + * terminated with '\0'. *@pp is updated to point to the next string. + * + * RETURNS: + * 1 on successful parse, 0 on EOF, -errno on failure. + */ +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) +{ + char *p = *pp; + char *key, *val; + + while (p < end && *p == '\0') + p++; + if (p == end) + return 0; + + if (end[-1] != '\0') { + printk(KERN_ERR "CUSE: info not properly terminated\n"); + return -EINVAL; + } + + key = val = p; + p += strlen(p); + + if (valp) { + strsep(&val, "="); + if (!val) + val = key + strlen(key); + key = strstrip(key); + val = strstrip(val); + } else + key = strstrip(key); + + if (!strlen(key)) { + printk(KERN_ERR "CUSE: zero length info key specified\n"); + return -EINVAL; + } + + *pp = p; + *keyp = key; + if (valp) + *valp = val; + + return 1; +} + +/** + * cuse_parse_dev_info - parse device info + * @p: device info string + * @len: length of device info string + * @devinfo: out parameter for parsed device info + * + * Parse @p to extract device info and store it into @devinfo. String + * pointed to by @p is modified by parsing and @devinfo points into + * them, so @p shouldn't be freed while @devinfo is in use. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) +{ + char *end = p + len; + char *key, *val; + int rc; + + while (true) { + rc = cuse_parse_one(&p, end, &key, &val); + if (rc < 0) + return rc; + if (!rc) + break; + if (strcmp(key, "DEVNAME") == 0) + devinfo->name = val; + else + printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n", + key); + } + + if (!devinfo->name || !strlen(devinfo->name)) { + printk(KERN_ERR "CUSE: DEVNAME unspecified\n"); + return -EINVAL; + } + + return 0; +} + +static void cuse_gendev_release(struct device *dev) +{ + kfree(dev); +} + +/** + * cuse_process_init_reply - finish initializing CUSE channel + * + * This function creates the character device and sets up all the + * required data structures for it. Please read the comment at the + * top of this file for high level overview. + */ +static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_init_out *arg = req->out.args[0].value; + struct page *page = req->pages[0]; + struct cuse_devinfo devinfo = { }; + struct device *dev; + struct cdev *cdev; + dev_t devt; + int rc; + + if (req->out.h.error || + arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + goto err; + } + + fc->minor = arg->minor; + fc->max_read = max_t(unsigned, arg->max_read, 4096); + fc->max_write = max_t(unsigned, arg->max_write, 4096); + + /* parse init reply */ + cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; + + rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + &devinfo); + if (rc) + goto err; + + /* determine and reserve devt */ + devt = MKDEV(arg->dev_major, arg->dev_minor); + if (!MAJOR(devt)) + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); + else + rc = register_chrdev_region(devt, 1, devinfo.name); + if (rc) { + printk(KERN_ERR "CUSE: failed to register chrdev region\n"); + goto err; + } + + /* devt determined, create device */ + rc = -ENOMEM; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto err_region; + + device_initialize(dev); + dev_set_uevent_suppress(dev, 1); + dev->class = cuse_class; + dev->devt = devt; + dev->release = cuse_gendev_release; + dev_set_drvdata(dev, cc); + dev_set_name(dev, "%s", devinfo.name); + + rc = device_add(dev); + if (rc) + goto err_device; + + /* register cdev */ + rc = -ENOMEM; + cdev = cdev_alloc(); + if (!cdev) + goto err_device; + + cdev->owner = THIS_MODULE; + cdev->ops = &cuse_frontend_fops; + + rc = cdev_add(cdev, devt, 1); + if (rc) + goto err_cdev; + + cc->dev = dev; + cc->cdev = cdev; + + /* make the device available */ + spin_lock(&cuse_lock); + list_add(&cc->list, cuse_conntbl_head(devt)); + spin_unlock(&cuse_lock); + + /* announce device availability */ + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + kfree(arg); + __free_page(page); + return; + +err_cdev: + cdev_del(cdev); +err_device: + put_device(dev); +err_region: + unregister_chrdev_region(devt, 1); +err: + fc->conn_error = 1; + goto out; +} + +static int cuse_send_init(struct cuse_conn *cc) +{ + int rc; + struct fuse_req *req; + struct page *page; + struct fuse_conn *fc = &cc->fc; + struct cuse_init_in *arg; + void *outarg; + + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto err; + } + + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto err_put_req; + + outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); + if 
(!outarg) + goto err_free_page; + + arg = &req->misc.cuse_init_in; + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->flags |= CUSE_UNRESTRICTED_IOCTL; + req->in.h.opcode = CUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct cuse_init_in); + req->in.args[0].value = arg; + req->out.numargs = 2; + req->out.args[0].size = sizeof(struct cuse_init_out); + req->out.args[0].value = outarg; + req->out.args[1].size = CUSE_INIT_INFO_MAX; + req->out.argvar = 1; + req->out.argpages = 1; + req->pages[0] = page; + req->num_pages = 1; + req->end = cuse_process_init_reply; + fuse_request_send_background(fc, req); + + return 0; + +err_free_page: + __free_page(page); +err_put_req: + fuse_put_request(fc, req); +err: + return rc; +} + +static void cuse_fc_release(struct fuse_conn *fc) +{ + struct cuse_conn *cc = fc_to_cc(fc); + kfree(cc); +} + +/** + * cuse_channel_open - open method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being opened + * + * Userland CUSE server can create a CUSE device by opening /dev/cuse + * and replying to the initialization request kernel sends. This + * function is responsible for handling CUSE device initialization. + * Because the fd opened by this function is used during + * initialization, this function only creates cuse_conn and sends + * init. The rest is delegated to a kthread. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_open(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc; + int rc; + + /* set up cuse_conn */ + cc = kzalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + + fuse_conn_init(&cc->fc); + + INIT_LIST_HEAD(&cc->list); + cc->fc.release = cuse_fc_release; + + cc->fc.connected = 1; + cc->fc.blocked = 0; + rc = cuse_send_init(cc); + if (rc) { + fuse_conn_put(&cc->fc); + return rc; + } + file->private_data = &cc->fc; /* channel owns base reference to cc */ + + return 0; +} + +/** + * cuse_channel_release - release method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being closed + * + * Disconnect the channel, deregister CUSE device and initiate + * destruction by putting the default reference. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_release(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc = fc_to_cc(file->private_data); + int rc; + + /* remove from the conntbl, no more access from this point on */ + spin_lock(&cuse_lock); + list_del_init(&cc->list); + spin_unlock(&cuse_lock); + + /* remove device */ + if (cc->dev) + device_unregister(cc->dev); + if (cc->cdev) { + unregister_chrdev_region(cc->cdev->dev, 1); + cdev_del(cc->cdev); + } + + /* kill connection and shutdown channel */ + fuse_conn_kill(&cc->fc); + rc = fuse_dev_release(inode, file); /* puts the base reference */ + + return rc; +} + +static struct file_operations cuse_channel_fops; /* initialized during init */ + + +/************************************************************************** + * Misc stuff and module initializatiion + * + * CUSE exports the same set of attributes to sysfs as fusectl. 
+ */ + +static ssize_t cuse_class_waiting_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); +} + +static ssize_t cuse_class_abort_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + fuse_abort_conn(&cc->fc); + return count; +} + +static struct device_attribute cuse_class_dev_attrs[] = { + __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), + __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), + { } +}; + +static struct miscdevice cuse_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cuse", + .fops = &cuse_channel_fops, +}; + +static int __init cuse_init(void) +{ + int i, rc; + + /* init conntbl */ + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + + /* inherit and extend fuse_dev_operations */ + cuse_channel_fops = fuse_dev_operations; + cuse_channel_fops.owner = THIS_MODULE; + cuse_channel_fops.open = cuse_channel_open; + cuse_channel_fops.release = cuse_channel_release; + + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); + + cuse_class->dev_attrs = cuse_class_dev_attrs; + + rc = misc_register(&cuse_miscdev); + if (rc) { + class_destroy(cuse_class); + return rc; + } + + return 0; +} + +static void __exit cuse_exit(void) +{ + misc_deregister(&cuse_miscdev); + class_destroy(cuse_class); +} + +module_init(cuse_init); +module_exit(cuse_exit); + +MODULE_AUTHOR("Tejun Heo "); +MODULE_DESCRIPTION("Character device in Userspace"); +MODULE_LICENSE("GPL"); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c new file mode 100644 index 00000000..c858b5c8 --- /dev/null +++ b/fs/fuse/dev.c @@ -0,0 +1,2049 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); + +static struct kmem_cache *fuse_req_cachep; + +static struct fuse_conn *fuse_get_conn(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. 
+ */ + return file->private_data; +} + +static void fuse_request_init(struct fuse_req *req) +{ + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->list); + INIT_LIST_HEAD(&req->intr_entry); + init_waitqueue_head(&req->waitq); + atomic_set(&req->count, 1); +} + +struct fuse_req *fuse_request_alloc(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); + if (req) + fuse_request_init(req); + return req; +} +EXPORT_SYMBOL_GPL(fuse_request_alloc); + +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + +void fuse_request_free(struct fuse_req *req) +{ + kmem_cache_free(fuse_req_cachep, req); +} + +static void block_sigs(sigset_t *oldset) +{ + sigset_t mask; + + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} + +static void __fuse_get_request(struct fuse_req *req) +{ + atomic_inc(&req->count); +} + +/* Must be called with > 1 refcount */ +static void __fuse_put_request(struct fuse_req *req) +{ + BUG_ON(atomic_read(&req->count) < 2); + atomic_dec(&req->count); +} + +static void fuse_req_init_context(struct fuse_req *req) +{ + req->in.h.uid = current_fsuid(); + req->in.h.gid = current_fsgid(); + req->in.h.pid = current->pid; +} + +struct fuse_req *fuse_get_req(struct fuse_conn *fc) +{ + struct fuse_req *req; + sigset_t oldset; + int intr; + int err; + + atomic_inc(&fc->num_waiting); + block_sigs(&oldset); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + restore_sigs(&oldset); + err = -EINTR; + if (intr) + goto out; + + err = -ENOTCONN; + if (!fc->connected) + goto out; + + req = fuse_request_alloc(); + err = -ENOMEM; + if (!req) + goto out; + + fuse_req_init_context(req); + req->waiting = 1; + return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(fuse_get_req); + +/* + * Return request in fuse_file->reserved_req. However that may + * currently be in use. If that is the case, wait for it to become + * available. + */ +static struct fuse_req *get_reserved_req(struct fuse_conn *fc, + struct file *file) +{ + struct fuse_req *req = NULL; + struct fuse_file *ff = file->private_data; + + do { + wait_event(fc->reserved_req_waitq, ff->reserved_req); + spin_lock(&fc->lock); + if (ff->reserved_req) { + req = ff->reserved_req; + ff->reserved_req = NULL; + get_file(file); + req->stolen_file = file; + } + spin_unlock(&fc->lock); + } while (!req); + + return req; +} + +/* + * Put stolen request back into fuse_file->reserved_req + */ +static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct file *file = req->stolen_file; + struct fuse_file *ff = file->private_data; + + spin_lock(&fc->lock); + fuse_request_init(req); + BUG_ON(ff->reserved_req); + ff->reserved_req = req; + wake_up_all(&fc->reserved_req_waitq); + spin_unlock(&fc->lock); + fput(file); +} + +/* + * Gets a requests for a file operation, always succeeds + * + * This is used for sending the FLUSH request, which must get to + * userspace, due to POSIX locks which may need to be unlocked. + * + * If allocation fails due to OOM, use the reserved request in + * fuse_file. + * + * This is very unlikely to deadlock accidentally, since the + * filesystem should not have it's own file open. If deadlock is + * intentional, it can still be broken by "aborting" the filesystem. 
+ */ +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +{ + struct fuse_req *req; + + atomic_inc(&fc->num_waiting); + wait_event(fc->blocked_waitq, !fc->blocked); + req = fuse_request_alloc(); + if (!req) + req = get_reserved_req(fc, file); + + fuse_req_init_context(req); + req->waiting = 1; + return req; +} + +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + if (req->waiting) + atomic_dec(&fc->num_waiting); + + if (req->stolen_file) + put_reserved_req(fc, req); + else + fuse_request_free(req); + } +} +EXPORT_SYMBOL_GPL(fuse_put_request); + +static unsigned len_args(unsigned numargs, struct fuse_arg *args) +{ + unsigned nbytes = 0; + unsigned i; + + for (i = 0; i < numargs; i++) + nbytes += args[i].size; + + return nbytes; +} + +static u64 fuse_get_unique(struct fuse_conn *fc) +{ + fc->reqctr++; + /* zero is special */ + if (fc->reqctr == 0) + fc->reqctr = 1; + + return fc->reqctr; +} + +static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.len = sizeof(struct fuse_in_header) + + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + list_add_tail(&req->list, &fc->pending); + req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; + + spin_lock(&fc->lock); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } + spin_unlock(&fc->lock); +} + +static void flush_bg_queue(struct fuse_conn *fc) +{ + while (fc->active_background < fc->max_background && + !list_empty(&fc->bg_queue)) { + struct fuse_req *req; + + req = list_entry(fc->bg_queue.next, struct fuse_req, list); + list_del(&req->list); + fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + } +} + +/* + * This function is called when a request is finished. Either a reply + * has arrived or it was aborted (and not yet sent) or some error + * occurred during communication with userspace, or the device file + * was closed. 
The requester thread is woken up (if still waiting), + * the 'end' callback is called if given, else the reference to the + * request is released + * + * Called with fc->lock, unlocks it + */ +static void request_end(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +{ + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + list_del(&req->list); + list_del(&req->intr_entry); + req->state = FUSE_REQ_FINISHED; + if (req->background) { + if (fc->num_background == fc->max_background) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + if (fc->num_background == fc->congestion_threshold && + fc->connected && fc->bdi_initialized) { + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + fc->num_background--; + fc->active_background--; + flush_bg_queue(fc); + } + spin_unlock(&fc->lock); + wake_up(&req->waitq); + if (end) + end(fc, req); + fuse_put_request(fc, req); +} + +static void wait_answer_interruptible(struct fuse_conn *fc, + struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (signal_pending(current)) + return; + + spin_unlock(&fc->lock); + wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); +} + +static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +{ + list_add_tail(&req->intr_entry, &fc->interrupts); + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (!fc->no_interrupt) { + /* Any signal may interrupt this */ + wait_answer_interruptible(fc, req); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + req->interrupted = 1; + if (req->state == FUSE_REQ_SENT) + queue_interrupt(fc, req); + } + + if (!req->force) { + sigset_t oldset; + + /* Only fatal signals may interrupt this */ + block_sigs(&oldset); + wait_answer_interruptible(fc, req); + restore_sigs(&oldset); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + /* Request is not yet in userspace, bail out */ + if (req->state == FUSE_REQ_PENDING) { + list_del(&req->list); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return; + } + } + + /* + * Either request is already in userspace, or it was forced. + * Wait it out. + */ + spin_unlock(&fc->lock); + + while (req->state != FUSE_REQ_FINISHED) + wait_event_freezable(req->waitq, + req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); + + if (!req->aborted) + return; + + aborted: + BUG_ON(req->state != FUSE_REQ_FINISHED); + if (req->locked) { + /* This is uninterruptible sleep, because data is + being copied to/from the buffers of req. During + locked state, there mustn't be any filesystem + operation (e.g. 
page fault), since that could lead + to deadlock */ + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + spin_lock(&fc->lock); + } +} + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + spin_lock(&fc->lock); + if (!fc->connected) + req->out.h.error = -ENOTCONN; + else if (fc->conn_error) + req->out.h.error = -ECONNREFUSED; + else { + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + /* acquire extra reference, since request is still needed + after request_end() */ + __fuse_get_request(req); + + request_wait_answer(fc, req); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_request_send); + +static void fuse_request_send_nowait_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->background = 1; + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && + fc->bdi_initialized) { + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); +} + +static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fuse_request_send_nowait_locked(fc, req); + spin_unlock(&fc->lock); + } else { + req->out.h.error = -ENOTCONN; + request_end(fc, req); + } +} + +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait(fc, req); +} +EXPORT_SYMBOL_GPL(fuse_request_send_background); + +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait_locked(fc, req); +} + +/* + * Lock the request. Up to the next unlock_request() there mustn't be + * anything that could cause a page-fault. If the request was already + * aborted bail out. + */ +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + int err = 0; + if (req) { + spin_lock(&fc->lock); + if (req->aborted) + err = -ENOENT; + else + req->locked = 1; + spin_unlock(&fc->lock); + } + return err; +} + +/* + * Unlock request. If it was aborted during being locked, the + * requester thread is currently waiting for it to be unlocked, so + * wake it up. 
+ */ +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (req) { + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) + wake_up(&req->waitq); + spin_unlock(&fc->lock); + } +} + +struct fuse_copy_state { + struct fuse_conn *fc; + int write; + struct fuse_req *req; + const struct iovec *iov; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + unsigned long seglen; + unsigned long addr; + struct page *pg; + void *mapaddr; + void *buf; + unsigned len; + unsigned move_pages:1; +}; + +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, + const struct iovec *iov, unsigned long nr_segs) +{ + memset(cs, 0, sizeof(*cs)); + cs->fc = fc; + cs->write = write; + cs->iov = iov; + cs->nr_segs = nr_segs; +} + +/* Unmap and put previous page of userspace buffer */ +static void fuse_copy_finish(struct fuse_copy_state *cs) +{ + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (!cs->write) { + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + } else { + kunmap(buf->page); + buf->len = PAGE_SIZE - cs->len; + } + cs->currbuf = NULL; + cs->mapaddr = NULL; + } else if (cs->mapaddr) { + kunmap(cs->pg); + if (cs->write) { + flush_dcache_page(cs->pg); + set_page_dirty_lock(cs->pg); + } + put_page(cs->pg); + cs->mapaddr = NULL; + } +} + +/* + * Get another pagefull of userspace buffer, and map it to kernel + * address space, and lock request + */ +static int fuse_copy_fill(struct fuse_copy_state *cs) +{ + unsigned long offset; + int err; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->len = buf->len; + cs->buf = cs->mapaddr + buf->offset; + cs->pipebufs++; + cs->nr_segs--; + } else { + struct page *page; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->mapaddr = kmap(page); + cs->buf = cs->mapaddr; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + if (!cs->seglen) { + BUG_ON(!cs->nr_segs); + cs->seglen = cs->iov[0].iov_len; + cs->addr = (unsigned long) cs->iov[0].iov_base; + cs->iov++; + cs->nr_segs--; + } + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + if (err < 0) + return err; + BUG_ON(err != 1); + offset = cs->addr % PAGE_SIZE; + cs->mapaddr = kmap(cs->pg); + cs->buf = cs->mapaddr + offset; + cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->seglen -= cs->len; + cs->addr += cs->len; + } + + return lock_request(cs->fc, cs->req); +} + +/* Do as much copy to/from userspace buffer as we can */ +static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) +{ + unsigned ncpy = min(*size, cs->len); + if (val) { + if (cs->write) + memcpy(cs->buf, *val, ncpy); + else + memcpy(*val, cs->buf, ncpy); + *val += ncpy; + } + *size -= ncpy; + cs->len -= ncpy; + cs->buf += ncpy; + return ncpy; +} + +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << 
PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + struct address_space *mapping; + pgoff_t index; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (WARN_ON(!PageUptodate(newpage))) + return -EIO; + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + mapping = oldpage->mapping; + index = oldpage->index; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); + if (err) { + unlock_page(newpage); + return err; + } + + page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->buf = cs->mapaddr + buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + +/* + * Copy a page in the request to/from the userspace buffer. 
Must be + * done atomically + */ +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, + unsigned offset, unsigned count, int zeroing) +{ + int err; + struct page *page = *pagep; + + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + + while (count) { + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } + } + if (page) { + void *mapaddr = kmap_atomic(page, KM_USER0); + void *buf = mapaddr + offset; + offset += fuse_copy_do(cs, &buf, &count); + kunmap_atomic(mapaddr, KM_USER0); + } else + offset += fuse_copy_do(cs, NULL, &count); + } + if (page && !cs->write) + flush_dcache_page(page); + return 0; +} + +/* Copy pages in the request to/from userspace buffer */ +static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) +{ + unsigned i; + struct fuse_req *req = cs->req; + unsigned offset = req->page_offset; + unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); + + for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + int err; + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); + if (err) + return err; + + nbytes -= count; + count = min(nbytes, (unsigned) PAGE_SIZE); + offset = 0; + } + return 0; +} + +/* Copy a single argument in the request to/from userspace buffer */ +static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) +{ + while (size) { + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } + fuse_copy_do(cs, &val, &size); + } + return 0; +} + +/* Copy request arguments to/from userspace buffer */ +static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) +{ + int err = 0; + unsigned i; + + for (i = 0; !err && i < numargs; i++) { + struct fuse_arg *arg = &args[i]; + if (i == numargs - 1 && argpages) + err = fuse_copy_pages(cs, arg->size, zeroing); + else + err = fuse_copy_one(cs, arg->value, arg->size); + } + return err; +} + +static int forget_pending(struct fuse_conn *fc) +{ + return fc->forget_list_head.next != NULL; +} + +static int request_pending(struct fuse_conn *fc) +{ + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || + forget_pending(fc); +} + +/* Wait until a request is available on the pending list */ +static void request_wait(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&fc->waitq, &wait); + while (fc->connected && !request_pending(fc)) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + + spin_unlock(&fc->lock); + schedule(); + spin_lock(&fc->lock); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&fc->waitq, &wait); +} + +/* + * Transfer an interrupt request to userspace + * + * Unlike other requests this is assembled on demand, without a need + * to allocate a separate fuse_req structure. 
+ * + * Called with fc->lock held, releases it + */ +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) +__releases(fc->lock) +{ + struct fuse_in_header ih; + struct fuse_interrupt_in arg; + unsigned reqsize = sizeof(ih) + sizeof(arg); + int err; + + list_del_init(&req->intr_entry); + req->intr_unique = fuse_get_unique(fc); + memset(&ih, 0, sizeof(ih)); + memset(&arg, 0, sizeof(arg)); + ih.len = reqsize; + ih.opcode = FUSE_INTERRUPT; + ih.unique = req->intr_unique; + arg.unique = req->in.h.unique; + + spin_unlock(&fc->lock); + if (nbytes < reqsize) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + return err ? err : reqsize; +} + +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) +{ + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; + if (fc->forget_list_head.next == NULL) + fc->forget_list_tail = &fc->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + int err; + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fc->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + +/* + * Read a single request into the userspace filesystem's buffer. 
This + * function waits until a request is available, then removes it from + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling + * request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_in *in; + unsigned reqsize; + + restart: + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + !request_pending(fc)) + goto err_unlock; + + request_wait(fc); + err = -ENODEV; + if (!fc->connected) + goto err_unlock; + err = -ERESTARTSYS; + if (!request_pending(fc)) + goto err_unlock; + + if (!list_empty(&fc->interrupts)) { + req = list_entry(fc->interrupts.next, struct fuse_req, + intr_entry); + return fuse_read_interrupt(fc, cs, nbytes, req); + } + + if (forget_pending(fc)) { + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) + return fuse_read_forget(fc, cs, nbytes); + + if (fc->forget_batch <= -8) + fc->forget_batch = 16; + } + + req = list_entry(fc->pending.next, struct fuse_req, list); + req->state = FUSE_REQ_READING; + list_move(&req->list, &fc->io); + + in = &req->in; + reqsize = in->h.len; + /* If request is too large, reply with an error and restart the read */ + if (nbytes < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + request_end(fc, req); + goto restart; + } + spin_unlock(&fc->lock); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + if (!err) + err = fuse_copy_args(cs, in->numargs, in->argpages, + (struct fuse_arg *) in->args, 0); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) { + request_end(fc, req); + return -ENODEV; + } + if (err) { + req->out.h.error = -EIO; + request_end(fc, req); + return err; + } + if (!req->isreply) + request_end(fc, req); + else { + req->state = FUSE_REQ_SENT; + list_move_tail(&req->list, &fc->processing); + if (req->interrupted) + queue_interrupt(fc, req); + spin_unlock(&fc->lock); + } + return reqsize; + + err_unlock: + spin_unlock(&fc->lock); + return err; +} + +static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 1, iov, nr_segs); + + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); +} + +static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_dev_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * 
sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL, 0); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; + goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + buf->ops = &fuse_dev_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + fuse_copy_finish(cs); + return fuse_notify_poll_wakeup(fc, &outarg); + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_inode_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + err = fuse_reverse_inval_inode(fc->sb, outarg.ino, + outarg.off, outarg.len); + } + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_entry_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) 
+ goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + release_pages(req->pages, req->num_pages, 0); +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len = 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + num -= this_num; + total_len += this_num; + index++; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, 
fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + case FUSE_NOTIFY_INVAL_INODE: + return fuse_notify_inval_inode(fc, size, cs); + + case FUSE_NOTIFY_INVAL_ENTRY: + return fuse_notify_inval_entry(fc, size, cs); + + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + + default: + fuse_copy_finish(cs); + return -EINVAL; + } +} + +/* Look up request on processing list by unique ID */ +static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +{ + struct list_head *entry; + + list_for_each(entry, &fc->processing) { + struct fuse_req *req; + req = list_entry(entry, struct fuse_req, list); + if (req->in.h.unique == unique || req->intr_unique == unique) + return req; + } + return NULL; +} + +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + unsigned nbytes) +{ + unsigned reqsize = sizeof(struct fuse_out_header); + + if (out->h.error) + return nbytes != reqsize ? -EINVAL : 0; + + reqsize += len_args(out->numargs, out->args); + + if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + return -EINVAL; + else if (reqsize > nbytes) { + struct fuse_arg *lastarg = &out->args[out->numargs-1]; + unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) + return -EINVAL; + lastarg->size -= diffsize; + } + return fuse_copy_args(cs, out->numargs, out->argpages, out->args, + out->page_zeroing); +} + +/* + * Write a single reply to a request. First the header is copied from + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. + * The request is finished by calling request_end() + */ +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_out_header oh; + + if (nbytes < sizeof(struct fuse_out_header)) + return -EINVAL; + + err = fuse_copy_one(cs, &oh, sizeof(oh)); + if (err) + goto err_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto err_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); + return err ? err : nbytes; + } + + err = -EINVAL; + if (oh.error <= -1000 || oh.error > 0) + goto err_finish; + + spin_lock(&fc->lock); + err = -ENOENT; + if (!fc->connected) + goto err_unlock; + + req = request_find(fc, oh.unique); + if (!req) + goto err_unlock; + + if (req->aborted) { + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + request_end(fc, req); + return -ENOENT; + } + /* Is it an interrupt reply? 
*/ + if (req->intr_unique == oh.unique) { + err = -EINVAL; + if (nbytes != sizeof(struct fuse_out_header)) + goto err_unlock; + + if (oh.error == -ENOSYS) + fc->no_interrupt = 1; + else if (oh.error == -EAGAIN) + queue_interrupt(fc, req); + + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + return nbytes; + } + + req->state = FUSE_REQ_WRITING; + list_move(&req->list, &fc->io); + req->out.h = oh; + req->locked = 1; + cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; + spin_unlock(&fc->lock); + + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); + + spin_lock(&fc->lock); + req->locked = 0; + if (!err) { + if (req->aborted) + err = -ENOENT; + } else if (!req->aborted) + req->out.h.error = -EIO; + request_end(fc, req); + + return err ? err : nbytes; + + err_unlock: + spin_unlock(&fc->lock); + err_finish: + fuse_copy_finish(cs); + return err; +} + +static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 0, iov, nr_segs); + + return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, fc, 0, NULL, nbuf); + cs.pipebufs = bufs; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + +static unsigned fuse_dev_poll(struct file *file, poll_table *wait) +{ + unsigned mask = POLLOUT | POLLWRNORM; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return POLLERR; + + poll_wait(file, &fc->waitq, wait); + + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (request_pending(fc)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); + + return mask; +} + +/* + * Abort all requests on the given list (pending or processing) + * + * This function releases and reacquires fc->lock + */ +static void end_requests(struct fuse_conn *fc, struct list_head *head) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(head)) { + struct fuse_req *req; + req = 
list_entry(head->next, struct fuse_req, list); + req->out.h.error = -ECONNABORTED; + request_end(fc, req); + spin_lock(&fc->lock); + } +} + +/* + * Abort requests under I/O + * + * The requests are set to aborted and finished, and the request + * waiter is woken up. This will make request_wait_answer() wait + * until the request is unlocked and then return. + * + * If the request is asynchronous, then the end function needs to be + * called after waiting for the request to be unlocked (if it was + * locked). + */ +static void end_io_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(&fc->io)) { + struct fuse_req *req = + list_entry(fc->io.next, struct fuse_req, list); + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + + req->aborted = 1; + req->out.h.error = -ECONNABORTED; + req->state = FUSE_REQ_FINISHED; + list_del_init(&req->list); + wake_up(&req->waitq); + if (end) { + req->end = NULL; + __fuse_get_request(req); + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + end(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); + } + } +} + +static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); + while (forget_pending(fc)) + kfree(dequeue_forget(fc, 1, NULL)); +} + +static void end_polls(struct fuse_conn *fc) +{ + struct rb_node *p; + + p = rb_first(&fc->polled_files); + + while (p) { + struct fuse_file *ff; + ff = rb_entry(p, struct fuse_file, polled_node); + wake_up_interruptible_all(&ff->poll_wait); + + p = rb_next(p); + } +} + +/* + * Abort all requests. + * + * Emergency exit in case of a malicious or accidental deadlock, or + * just a hung filesystem. + * + * The same effect is usually achievable through killing the + * filesystem daemon and all users of the filesystem. The exception + * is the combination of an asynchronous request and the tricky + * deadlock (see Documentation/filesystems/fuse.txt). + * + * During the aborting, progression of requests from the pending and + * processing lists onto the io list, and progression of new requests + * onto the pending list is prevented by req->connected being false. + * + * Progression of requests under I/O to the processing list is + * prevented by the req->aborted flag being true for these requests. + * For this reason requests on the io list must be aborted first. 
+ */ +void fuse_abort_conn(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fc->connected = 0; + fc->blocked = 0; + end_io_requests(fc); + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_abort_conn); + +int fuse_dev_release(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (fc) { + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->blocked_waitq); + spin_unlock(&fc->lock); + fuse_conn_put(fc); + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_dev_release); + +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + +const struct file_operations fuse_dev_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = fuse_dev_read, + .splice_read = fuse_dev_splice_read, + .write = do_sync_write, + .aio_write = fuse_dev_write, + .splice_write = fuse_dev_splice_write, + .poll = fuse_dev_poll, + .release = fuse_dev_release, + .fasync = fuse_dev_fasync, +}; +EXPORT_SYMBOL_GPL(fuse_dev_operations); + +static struct miscdevice fuse_miscdevice = { + .minor = FUSE_MINOR, + .name = "fuse", + .fops = &fuse_dev_operations, +}; + +int __init fuse_dev_init(void) +{ + int err = -ENOMEM; + fuse_req_cachep = kmem_cache_create("fuse_request", + sizeof(struct fuse_req), + 0, 0, NULL); + if (!fuse_req_cachep) + goto out; + + err = misc_register(&fuse_miscdevice); + if (err) + goto out_cache_clean; + + return 0; + + out_cache_clean: + kmem_cache_destroy(fuse_req_cachep); + out: + return err; +} + +void fuse_dev_cleanup(void) +{ + misc_deregister(&fuse_miscdevice); + kmem_cache_destroy(fuse_req_cachep); +} diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c new file mode 100644 index 00000000..c04a025c --- /dev/null +++ b/fs/fuse/dir.c @@ -0,0 +1,1638 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include + +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return entry->d_time; +} +#else +/* + * On 32 bit archs store the high 32 bits of time in d_fsdata + */ +static void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; + entry->d_fsdata = (void *) (unsigned long) (time >> 32); +} + +static u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64) entry->d_time + + ((u64) (unsigned long) entry->d_fsdata << 32); +} +#endif + +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_time and fuse_inode->i_time respectively. 
+ */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ +static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +{ + if (sec || nsec) { + struct timespec ts = {sec, nsec}; + return get_jiffies_64() + timespec_to_jiffies(&ts); + } else + return 0; +} + +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ +static void fuse_change_entry_timeout(struct dentry *entry, + struct fuse_entry_out *o) +{ + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); +} + +static u64 attr_timeout(struct fuse_attr_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +static u64 entry_attr_timeout(struct fuse_entry_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ +void fuse_invalidate_attr(struct inode *inode) +{ + get_fuse_inode(inode)->i_time = 0; +} + +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ +void fuse_invalidate_entry_cache(struct dentry *entry) +{ + fuse_dentry_settime(entry, 0); +} + +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); +} + +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, + u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg) +{ + memset(outarg, 0, sizeof(struct fuse_entry_out)); + req->in.h.opcode = FUSE_LOOKUP; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = name->len + 1; + req->in.args[0].value = name->name; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(struct fuse_entry_out); + req->out.args[0].value = outarg; +} + +u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + u64 curr_version; + + /* + * The spin lock isn't actually needed on 64bit archs, but we + * don't yet care too much about such optimizations. + */ + spin_lock(&fc->lock); + curr_version = fc->attr_version; + spin_unlock(&fc->lock); + + return curr_version; +} + +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + * positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. 
+ */ +static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) +{ + struct inode *inode; + + inode = ACCESS_ONCE(entry->d_inode); + if (inode && is_bad_inode(inode)) + return 0; + else if (fuse_dentry_time(entry) < get_jiffies_64()) { + int err; + struct fuse_entry_out outarg; + struct fuse_conn *fc; + struct fuse_req *req; + struct fuse_forget_link *forget; + struct dentry *parent; + u64 attr_version; + + /* For negative dentries, always do a fresh lookup */ + if (!inode) + return 0; + + if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + fc = get_fuse_conn(inode); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return 0; + + forget = fuse_alloc_forget(); + if (!forget) { + fuse_put_request(fc, req); + return 0; + } + + attr_version = fuse_get_attr_version(fc); + + parent = dget_parent(entry); + fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + &entry->d_name, &outarg); + fuse_request_send(fc, req); + dput(parent); + err = req->out.h.error; + fuse_put_request(fc, req); + /* Zero nodeid is same as -ENOENT */ + if (!err && !outarg.nodeid) + err = -ENOENT; + if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); + if (outarg.nodeid != get_node_id(inode)) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return 0; + } + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + } + kfree(forget); + if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + return 0; + + fuse_change_attributes(inode, &outarg.attr, + entry_attr_timeout(&outarg), + attr_version); + fuse_change_entry_timeout(entry, &outarg); + } + return 1; +} + +static int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + +const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, +}; + +int fuse_valid_type(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + +/* + * Add a directory inode to a dentry, ensuring that no other dentry + * refers to this inode. Called with fc->inst_mutex. 
+ */ +static struct dentry *fuse_d_add_directory(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { + /* This tries to shrink the subtree below alias */ + fuse_invalidate_entry(alias); + dput(alias); + if (!list_empty(&inode->i_dentry)) + return ERR_PTR(-EBUSY); + } else { + dput(alias); + } + return d_splice_alias(inode, entry); +} + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_req *req; + struct fuse_forget_link *forget; + u64 attr_version; + int err; + + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) { + fuse_put_request(fc, req); + goto out; + } + + attr_version = fuse_get_attr_version(fc); + + fuse_lookup_init(fc, req, nodeid, name, outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_queue_forget(fc, forget, outarg->nodeid, 1); + goto out; + } + err = 0; + + out_put_forget: + kfree(forget); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + struct fuse_conn *fc = get_fuse_conn(dir); + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; + + if (inode && S_ISDIR(inode->i_mode)) { + mutex_lock(&fc->inst_mutex); + newent = fuse_d_add_directory(entry, inode); + mutex_unlock(&fc->inst_mutex); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_iput; + } else { + newent = d_splice_alias(inode, entry); + } + + entry = newent ? newent : entry; + if (outarg_valid) + fuse_change_entry_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); + + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. 
+ */ +static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, + struct nameidata *nd) +{ + int err; + struct inode *inode; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req; + struct fuse_forget_link *forget; + struct fuse_create_in inarg; + struct fuse_open_out outopen; + struct fuse_entry_out outentry; + struct fuse_file *ff; + struct file *file; + int flags = nd->intent.open.flags - 1; + + if (fc->no_create) + return -ENOSYS; + + if (flags & O_DIRECT) + return -EINVAL; + + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out_put_forget_req; + + err = -ENOMEM; + ff = fuse_file_alloc(fc); + if (!ff) + goto out_put_request; + + if (!fc->dont_mask) + mode &= ~current_umask(); + + flags &= ~O_NOCTTY; + memset(&inarg, 0, sizeof(inarg)); + memset(&outentry, 0, sizeof(outentry)); + inarg.flags = flags; + inarg.mode = mode; + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_CREATE; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 2; + req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : + sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + req->out.numargs = 2; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(outentry); + req->out.args[0].value = &outentry; + req->out.args[1].size = sizeof(outopen); + req->out.args[1].value = &outopen; + fuse_request_send(fc, req); + err = req->out.h.error; + if (err) { + if (err == -ENOSYS) + fc->no_create = 1; + goto out_free_ff; + } + + err = -EIO; + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + goto out_free_ff; + + fuse_put_request(fc, req); + ff->fh = outopen.fh; + ff->nodeid = outentry.nodeid; + ff->open_flags = outopen.open_flags; + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, + &outentry.attr, entry_attr_timeout(&outentry), 0); + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(ff, flags); + fuse_queue_forget(fc, forget, outentry.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + d_instantiate(entry, inode); + fuse_change_entry_timeout(entry, &outentry); + fuse_invalidate_attr(dir); + file = lookup_instantiate_filp(nd, entry, generic_file_open); + if (IS_ERR(file)) { + fuse_sync_release(ff, flags); + return PTR_ERR(file); + } + file->private_data = fuse_file_get(ff); + fuse_finish_open(inode, file); + return 0; + + out_free_ff: + fuse_file_free(ff); + out_put_request: + fuse_put_request(fc, req); + out_put_forget_req: + kfree(forget); + return err; +} + +/* + * Code shared between mknod, mkdir, symlink and link + */ +static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, + struct inode *dir, struct dentry *entry, + int mode) +{ + struct fuse_entry_out outarg; + struct inode *inode; + int err; + struct fuse_forget_link *forget; + + forget = fuse_alloc_forget(); + if (!forget) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + memset(&outarg, 0, sizeof(outarg)); + req->in.h.nodeid = get_node_id(dir); + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err) + goto out_put_forget_req; + + err = -EIO; + if 
(invalid_nodeid(outarg.nodeid)) + goto out_put_forget_req; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_forget_req; + + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, + &outarg.attr, entry_attr_timeout(&outarg), 0); + if (!inode) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + + if (S_ISDIR(inode->i_mode)) { + struct dentry *alias; + mutex_lock(&fc->inst_mutex); + alias = d_find_alias(inode); + if (alias) { + /* New directory must have moved since mkdir */ + mutex_unlock(&fc->inst_mutex); + dput(alias); + iput(inode); + return -EBUSY; + } + d_instantiate(entry, inode); + mutex_unlock(&fc->inst_mutex); + } else + d_instantiate(entry, inode); + + fuse_change_entry_timeout(entry, &outarg); + fuse_invalidate_attr(dir); + return 0; + + out_put_forget_req: + kfree(forget); + return err; +} + +static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, + dev_t rdev) +{ + struct fuse_mknod_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.rdev = new_encode_dev(rdev); + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_MKNOD; + req->in.numargs = 2; + req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : + sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + return create_new_entry(fc, req, dir, entry, mode); +} + +static int fuse_create(struct inode *dir, struct dentry *entry, int mode, + struct nameidata *nd) +{ + if (nd && (nd->flags & LOOKUP_OPEN)) { + int err = fuse_create_open(dir, entry, mode, nd); + if (err != -ENOSYS) + return err; + /* Fall back on mknod */ + } + return fuse_mknod(dir, entry, mode, 0); +} + +static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) +{ + struct fuse_mkdir_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_MKDIR; + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + return create_new_entry(fc, req, dir, entry, S_IFDIR); +} + +static int fuse_symlink(struct inode *dir, struct dentry *entry, + const char *link) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + unsigned len = strlen(link) + 1; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_SYMLINK; + req->in.numargs = 2; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + req->in.args[1].size = len; + req->in.args[1].value = link; + return create_new_entry(fc, req, dir, entry, S_IFLNK); +} + +static int fuse_unlink(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_UNLINK; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 1; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + 
fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + struct inode *inode = entry->d_inode; + + /* + * Set nlink to zero so the inode can be cleared, if the inode + * does have more links this will be discovered at the next + * lookup/getattr. + */ + clear_nlink(inode); + fuse_invalidate_attr(inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rmdir(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_RMDIR; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 1; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + clear_nlink(entry->d_inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + int err; + struct fuse_rename_in inarg; + struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_req *req = fuse_get_req(fc); + + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.newdir = get_node_id(newdir); + req->in.h.opcode = FUSE_RENAME; + req->in.h.nodeid = get_node_id(olddir); + req->in.numargs = 3; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = oldent->d_name.len + 1; + req->in.args[1].value = oldent->d_name.name; + req->in.args[2].size = newent->d_name.len + 1; + req->in.args[2].value = newent->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + /* ctime changes */ + fuse_invalidate_attr(oldent->d_inode); + + fuse_invalidate_attr(olddir); + if (olddir != newdir) + fuse_invalidate_attr(newdir); + + /* newent will end up negative */ + if (newent->d_inode) { + fuse_invalidate_attr(newent->d_inode); + fuse_invalidate_entry_cache(newent); + } + } else if (err == -EINTR) { + /* If request was interrupted, DEITY only knows if the + rename actually took place. If the invalidation + fails (e.g. some process has CWD under the renamed + directory), then there can be inconsistency between + the dcache and the real filesystem. Tough luck. */ + fuse_invalidate_entry(oldent); + if (newent->d_inode) + fuse_invalidate_entry(newent); + } + + return err; +} + +static int fuse_link(struct dentry *entry, struct inode *newdir, + struct dentry *newent) +{ + int err; + struct fuse_link_in inarg; + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.oldnodeid = get_node_id(inode); + req->in.h.opcode = FUSE_LINK; + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = newent->d_name.len + 1; + req->in.args[1].value = newent->d_name.name; + err = create_new_entry(fc, req, newdir, newent, inode->i_mode); + /* Contrary to "normal" filesystems it can happen that link + makes two "logical" inodes point to the same "physical" + inode. 
We invalidate the attributes of the old one, so it + will reflect changes in the backing inode (link count, + etc.) + */ + if (!err || err == -EINTR) + fuse_invalidate_attr(inode); + return err; +} + +static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, + struct kstat *stat) +{ + stat->dev = inode->i_sb->s_dev; + stat->ino = attr->ino; + stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + stat->nlink = attr->nlink; + stat->uid = attr->uid; + stat->gid = attr->gid; + stat->rdev = inode->i_rdev; + stat->atime.tv_sec = attr->atime; + stat->atime.tv_nsec = attr->atimensec; + stat->mtime.tv_sec = attr->mtime; + stat->mtime.tv_nsec = attr->mtimensec; + stat->ctime.tv_sec = attr->ctime; + stat->ctime.tv_nsec = attr->ctimensec; + stat->size = attr->size; + stat->blocks = attr->blocks; + stat->blksize = (1 << inode->i_blkbits); +} + +static int fuse_do_getattr(struct inode *inode, struct kstat *stat, + struct file *file) +{ + int err; + struct fuse_getattr_in inarg; + struct fuse_attr_out outarg; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + attr_version = fuse_get_attr_version(fc); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + req->in.h.opcode = FUSE_GETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + } else { + fuse_change_attributes(inode, &outarg.attr, + attr_timeout(&outarg), + attr_version); + if (stat) + fuse_fillattr(inode, &outarg.attr, stat); + } + } + return err; +} + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + bool r; + + if (fi->i_time < get_jiffies_64()) { + r = true; + err = fuse_do_getattr(inode, stat, file); + } else { + r = false; + err = 0; + if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; + } + } + + if (refreshed != NULL) + *refreshed = r; + + return err; +} + +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + struct qstr *name) +{ + int err = -ENOTDIR; + struct inode *parent; + struct dentry *dir; + struct dentry *entry; + + parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + if (!parent) + return -ENOENT; + + mutex_lock(&parent->i_mutex); + if (!S_ISDIR(parent->i_mode)) + goto unlock; + + err = -ENOENT; + dir = d_find_alias(parent); + if (!dir) + goto unlock; + + entry = d_lookup(dir, name); + dput(dir); + if (!entry) + goto unlock; + + fuse_invalidate_attr(parent); + fuse_invalidate_entry(entry); + dput(entry); + err = 0; + + unlock: + mutex_unlock(&parent->i_mutex); + iput(parent); + return err; +} + +/* + * Calling into a user-controlled filesystem gives the filesystem + * daemon ptrace-like capabilities 
over the requester process. This + * means, that the filesystem daemon is able to record the exact + * filesystem operations performed, and can also control the behavior + * of the requester process in otherwise impossible ways. For example + * it can delay the operation for arbitrary length of time allowing + * DoS against the requester. + * + * For this reason only those processes can call into the filesystem, + * for which the owner of the mount has ptrace privilege. This + * excludes processes started by other users, suid or sgid processes. + */ +int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +{ + const struct cred *cred; + int ret; + + if (fc->flags & FUSE_ALLOW_OTHER) + return 1; + + rcu_read_lock(); + ret = 0; + cred = __task_cred(task); + if (cred->euid == fc->user_id && + cred->suid == fc->user_id && + cred->uid == fc->user_id && + cred->egid == fc->group_id && + cred->sgid == fc->group_id && + cred->gid == fc->group_id) + ret = 1; + rcu_read_unlock(); + + return ret; +} + +static int fuse_access(struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_access_in inarg; + int err; + + if (fc->no_access) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); + req->in.h.opcode = FUSE_ACCESS; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_access = 1; + err = 0; + } + return err; +} + +static int fuse_perm_getattr(struct inode *inode, int flags) +{ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + return fuse_do_getattr(inode, NULL, NULL); +} + +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * modell. + * + * 2) "Remote" access checking, where server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ +static int fuse_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + bool refreshed = false; + int err = 0; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + /* + * If attributes are needed, refresh them before proceeding + */ + if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { + struct fuse_inode *fi = get_fuse_inode(inode); + + if (fi->i_time < get_jiffies_64()) { + refreshed = true; + + err = fuse_perm_getattr(inode, flags); + if (err) + return err; + } + } + + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + err = generic_permission(inode, mask, flags, NULL); + + /* If permission is denied, try to refresh file + attributes. This is also needed, because the root + node will at first have no permissions */ + if (err == -EACCES && !refreshed) { + err = fuse_perm_getattr(inode, flags); + if (!err) + err = generic_permission(inode, mask, + flags, NULL); + } + + /* Note: the opposite of the above test does not + exist. 
So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = fuse_access(inode, mask); + } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { + if (!(inode->i_mode & S_IXUGO)) { + if (refreshed) + return -EACCES; + + err = fuse_perm_getattr(inode, flags); + if (!err && !(inode->i_mode & S_IXUGO)) + return -EACCES; + } + } + return err; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + int over; + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, dirent->type); + if (over) + break; + + buf += reclen; + nbytes -= reclen; + file->f_pos = dirent->off; + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int err; + size_t nbytes; + struct page *page; + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + fuse_request_send(fc, req); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = parse_dirfile(page_address(page), nbytes, file, dstbuf, + filldir); + + __free_page(page); + fuse_invalidate_attr(inode); /* atime changed */ + return err; +} + +static char *read_link(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = fuse_get_req(fc); + char *link; + + if (IS_ERR(req)) + return ERR_CAST(req); + + link = (char *) __get_free_page(GFP_KERNEL); + if (!link) { + link = ERR_PTR(-ENOMEM); + goto out; + } + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; + req->out.args[0].value = link; + fuse_request_send(fc, req); + if (req->out.h.error) { + free_page((unsigned long) link); + link = ERR_PTR(req->out.h.error); + } else + link[req->out.args[0].size] = '\0'; + out: + fuse_put_request(fc, req); + fuse_invalidate_attr(inode); /* atime changed */ + return link; +} + +static void free_link(char *link) +{ + if (!IS_ERR(link)) + free_page((unsigned long) link); +} + +static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, read_link(dentry)); + return NULL; +} + +static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + free_link(nd_get_link(nd)); +} + +static int fuse_dir_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, true); +} + +static int fuse_dir_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASEDIR); + + return 0; +} + +static int fuse_dir_fsync(struct file *file, int datasync) +{ + return fuse_fsync_common(file, datasync, 1); +} + 
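+/*
+ * Rough illustration (a sketch, assuming the usual VFS callers of
+ * notify_change(); the exact masks depend on the caller) of how common
+ * syscalls map onto the FUSE_SETATTR arguments built below:
+ *
+ *   ftruncate(fd, len):  ATTR_SIZE|ATTR_MTIME|ATTR_CTIME|ATTR_FILE
+ *                        -> FATTR_SIZE|FATTR_FH|FATTR_LOCKOWNER;
+ *                           mtime is left for the server to update
+ *   utimes(path, times): ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET
+ *                        -> FATTR_ATIME|FATTR_MTIME with explicit values
+ *   utimes(path, NULL):  ATTR_ATIME|ATTR_MTIME (no *_SET)
+ *                        -> FATTR_ATIME|FATTR_ATIME_NOW|
+ *                           FATTR_MTIME|FATTR_MTIME_NOW
+ */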
+static bool update_mtime(unsigned ivalid) +{ + /* Always update if mtime is explicitly set */ + if (ivalid & ATTR_MTIME_SET) + return true; + + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ + if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) + return false; + + /* In all other cases update */ + return true; +} + +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +{ + unsigned ivalid = iattr->ia_valid; + + if (ivalid & ATTR_MODE) + arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; + if (ivalid & ATTR_UID) + arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; + if (ivalid & ATTR_GID) + arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; + if (ivalid & ATTR_SIZE) + arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; + if (ivalid & ATTR_ATIME) { + arg->valid |= FATTR_ATIME; + arg->atime = iattr->ia_atime.tv_sec; + arg->atimensec = iattr->ia_atime.tv_nsec; + if (!(ivalid & ATTR_ATIME_SET)) + arg->valid |= FATTR_ATIME_NOW; + } + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + arg->valid |= FATTR_MTIME; + arg->mtime = iattr->ia_mtime.tv_sec; + arg->mtimensec = iattr->ia_mtime.tv_nsec; + if (!(ivalid & ATTR_MTIME_SET)) + arg->valid |= FATTR_MTIME_NOW; + } +} + +/* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!mutex_is_locked(&inode->i_mutex)); + + spin_lock(&fc->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fc->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + spin_lock(&fc->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fc->lock); +} + +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case, so do the rlimit checking + * and the actual truncation by hand. 
+ */ +static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, + struct file *file) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + bool is_truncate = false; + loff_t oldsize; + int err; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + attr->ia_valid |= ATTR_FORCE; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } + + if (attr->ia_valid & ATTR_SIZE) + is_truncate = true; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (is_truncate) + fuse_set_nowrite(inode); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + iattr_to_fattr(attr, &inarg); + if (file) { + struct fuse_file *ff = file->private_data; + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + if (attr->ia_valid & ATTR_SIZE) { + /* For mandatory locking in truncate */ + inarg.valid |= FATTR_LOCKOWNER; + inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + } + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err) { + if (err == -EINTR) + fuse_invalidate_attr(inode); + goto error; + } + + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + goto error; + } + + spin_lock(&fc->lock); + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg)); + oldsize = inode->i_size; + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fc->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fc->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 
+ */ + if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + truncate_pagecache(inode, oldsize, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); + } + + return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + return err; +} + +static int fuse_setattr(struct dentry *entry, struct iattr *attr) +{ + if (attr->ia_valid & ATTR_FILE) + return fuse_do_setattr(entry, attr, attr->ia_file); + else + return fuse_do_setattr(entry, attr, NULL); +} + +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + return fuse_update_attributes(inode, stat, NULL, NULL); +} + +static int fuse_setxattr(struct dentry *entry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + req->in.h.opcode = FUSE_SETXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 3; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = strlen(name) + 1; + req->in.args[1].value = name; + req->in.args[2].size = size; + req->in.args[2].value = value; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static ssize_t fuse_getxattr(struct dentry *entry, const char *name, + void *value, size_t size) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + req->in.h.opcode = FUSE_GETXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = strlen(name) + 1; + req->in.args[1].value = name; + /* This is really two different operations rolled into one */ + req->out.numargs = 1; + if (size) { + req->out.argvar = 1; + req->out.args[0].size = size; + req->out.args[0].value = value; + } else { + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + } + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret) + ret = size ? 
req->out.args[0].size : outarg.size; + else { + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + } + fuse_put_request(fc, req); + return ret; +} + +static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + req->in.h.opcode = FUSE_LISTXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + req->out.numargs = 1; + if (size) { + req->out.argvar = 1; + req->out.args[0].size = size; + req->out.args[0].value = list; + } else { + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + } + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret) + ret = size ? req->out.args[0].size : outarg.size; + else { + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + } + fuse_put_request(fc, req); + return ret; +} + +static int fuse_removexattr(struct dentry *entry, const char *name) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_REMOVEXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = strlen(name) + 1; + req->in.args[0].value = name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static const struct inode_operations fuse_dir_inode_operations = { + .lookup = fuse_lookup, + .mkdir = fuse_mkdir, + .symlink = fuse_symlink, + .unlink = fuse_unlink, + .rmdir = fuse_rmdir, + .rename = fuse_rename, + .link = fuse_link, + .setattr = fuse_setattr, + .create = fuse_create, + .mknod = fuse_mknod, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct file_operations fuse_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = fuse_readdir, + .open = fuse_dir_open, + .release = fuse_dir_release, + .fsync = fuse_dir_fsync, +}; + +static const struct inode_operations fuse_common_inode_operations = { + .setattr = fuse_setattr, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct inode_operations fuse_symlink_inode_operations = { + .setattr = fuse_setattr, + .follow_link = fuse_follow_link, + .put_link = fuse_put_link, + .readlink = generic_readlink, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +void fuse_init_common(struct inode *inode) +{ + inode->i_op = &fuse_common_inode_operations; +} + +void 
fuse_init_dir(struct inode *inode) +{ + inode->i_op = &fuse_dir_inode_operations; + inode->i_fop = &fuse_dir_operations; +} + +void fuse_init_symlink(struct inode *inode) +{ + inode->i_op = &fuse_symlink_inode_operations; +} diff --git a/fs/fuse/file.c b/fs/fuse/file.c new file mode 100644 index 00000000..82a66466 --- /dev/null +++ b/fs/fuse/file.c @@ -0,0 +1,2186 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include + +static const struct file_operations fuse_direct_io_file_operations; + +static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + int opcode, struct fuse_open_out *outargp) +{ + struct fuse_open_in inarg; + struct fuse_req *req; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fc->atomic_o_trunc) + inarg.flags &= ~O_TRUNC; + req->in.h.opcode = opcode; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(*outargp); + req->out.args[0].value = outargp; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +{ + struct fuse_file *ff; + + ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + if (unlikely(!ff)) + return NULL; + + ff->fc = fc; + ff->reserved_req = fuse_request_alloc(); + if (unlikely(!ff->reserved_req)) { + kfree(ff); + return NULL; + } + + INIT_LIST_HEAD(&ff->write_entry); + atomic_set(&ff->count, 0); + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); + + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); + + return ff; +} + +void fuse_file_free(struct fuse_file *ff) +{ + fuse_request_free(ff->reserved_req); + kfree(ff); +} + +struct fuse_file *fuse_file_get(struct fuse_file *ff) +{ + atomic_inc(&ff->count); + return ff; +} + +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + +static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +{ + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. 
+ */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } +} + +static void fuse_file_put(struct fuse_file *ff, bool sync) +{ + if (atomic_dec_and_test(&ff->count)) { + struct fuse_req *req = ff->reserved_req; + + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } + kfree(ff); + } +} + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_open_out outarg; + struct fuse_file *ff; + int err; + int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + + ff = fuse_file_alloc(fc); + if (!ff) + return -ENOMEM; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (err) { + fuse_file_free(ff); + return err; + } + + if (isdir) + outarg.open_flags &= ~FOPEN_DIRECT_IO; + + ff->fh = outarg.fh; + ff->nodeid = nodeid; + ff->open_flags = outarg.open_flags; + file->private_data = fuse_file_get(ff); + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_do_open); + +void fuse_finish_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (ff->open_flags & FOPEN_DIRECT_IO) + file->f_op = &fuse_direct_io_file_operations; + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } +} + +int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + /* VFS checks this, but only _after_ ->open() */ + if (file->f_flags & O_DIRECT) + return -EINVAL; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (err) + return err; + + fuse_finish_open(inode, file); + + return 0; +} + +static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +{ + struct fuse_conn *fc = ff->fc; + struct fuse_req *req = ff->reserved_req; + struct fuse_release_in *inarg = &req->misc.release.in; + + spin_lock(&fc->lock); + list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); + spin_unlock(&fc->lock); + + wake_up_interruptible_all(&ff->poll_wait); + + inarg->fh = ff->fh; + inarg->flags = flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_release_in); + req->in.args[0].value = inarg; +} + +void fuse_release_common(struct file *file, int opcode) +{ + struct fuse_file *ff; + struct fuse_req *req; + + ff = file->private_data; + if (unlikely(!ff)) + return; + + req = ff->reserved_req; + fuse_prepare_release(ff, file->f_flags, opcode); + + /* Hold vfsmount and dentry until release is finished */ + path_get(&file->f_path); + req->misc.release.path = file->f_path; + + /* + * Normally this will send the RELEASE request, however if + * some asynchronous READ or WRITE requests are outstanding, + * the sending will be delayed. 
+ * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. + */ + fuse_file_put(ff, ff->fc->destroy_req != NULL); +} + +static int fuse_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, false); +} + +static int fuse_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASE); + + /* return value is ignored by VFS */ + return 0; +} + +void fuse_sync_release(struct fuse_file *ff, int flags) +{ + WARN_ON(atomic_read(&ff->count) > 1); + fuse_prepare_release(ff, flags, FUSE_RELEASE); + ff->reserved_req->force = 1; + fuse_request_send(ff->fc, ff->reserved_req); + fuse_put_request(ff->fc, ff->reserved_req); + kfree(ff); +} +EXPORT_SYMBOL_GPL(fuse_sync_release); + +/* + * Scramble the ID space with XTEA, so that the value of the files_struct + * pointer is not exposed to userspace. + */ +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) +{ + u32 *k = fc->scramble_key; + u64 v = (unsigned long) id; + u32 v0 = v; + u32 v1 = v >> 32; + u32 sum = 0; + int i; + + for (i = 0; i < 32; i++) { + v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); + sum += 0x9E3779B9; + v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); + } + + return (u64) v0 + ((u64) v1 << 32); +} + +/* + * Check if page is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + bool found = false; + + spin_lock(&fc->lock); + list_for_each_entry(req, &fi->writepages, writepages_entry) { + pgoff_t curr_index; + + BUG_ON(req->inode != inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index == index) { + found = true; + break; + } + } + spin_unlock(&fc->lock); + + return found; +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + return 0; +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_flush_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if (fc->no_flush) + return 0; + + req = fuse_get_req_nofail(fc, file); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fc, id); + req->in.h.opcode = FUSE_FLUSH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->force = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_flush = 1; + err = 0; + } + return err; +} + +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. 
+ * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + +int fuse_fsync_common(struct file *file, int datasync, int isdir) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_fsync_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + return 0; + + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. + */ + err = write_inode_now(inode, 0); + if (err) + return err; + + fuse_sync_writes(inode); + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.fsync_flags = datasync ? 1 : 0; + req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + if (isdir) + fc->no_fsyncdir = 1; + else + fc->no_fsync = 1; + err = 0; + } + return err; +} + +static int fuse_fsync(struct file *file, int datasync) +{ + return fuse_fsync_common(file, datasync, 0); +} + +void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_read_in *inarg = &req->misc.read.in; + struct fuse_file *ff = file->private_data; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + inarg->flags = file->f_flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_read_in); + req->in.args[0].value = inarg; + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = count; +} + +static size_t fuse_send_read(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_read_fill(req, file, pos, count, FUSE_READ); + if (owner != NULL) { + struct fuse_read_in *inarg = &req->misc.read.in; + + inarg->read_flags |= FUSE_READ_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + fuse_request_send(fc, req); + return req->out.args[0].size; +} + +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + if (attr_ver == fi->attr_version && size < inode->i_size) { + fi->attr_version = ++fc->attr_version; + i_size_write(inode, size); + } + spin_unlock(&fc->lock); +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + size_t num_read; + loff_t pos = page_offset(page); + size_t count = PAGE_CACHE_SIZE; + u64 attr_ver; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. 
+ */ + fuse_wait_on_page_writeback(inode, page->index); + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + attr_ver = fuse_get_attr_version(fc); + + req->out.page_zeroing = 1; + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + num_read = fuse_send_read(req, file, pos, count, NULL); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) { + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (num_read < count) + fuse_read_update_size(inode, pos + num_read, attr_ver); + + SetPageUptodate(page); + } + + fuse_invalidate_attr(inode); /* atime changed */ + out: + unlock_page(page); + return err; +} + +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + size_t count = req->misc.read.in.size; + size_t num_read = req->out.args[0].size; + struct address_space *mapping = NULL; + + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos; + + pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, + req->misc.read.attr_ver); + } + fuse_invalidate_attr(inode); /* atime changed */ + } + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (!req->out.h.error) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + page_cache_release(page); + } + if (req->ff) + fuse_file_put(req->ff, false); +} + +static void fuse_send_readpages(struct fuse_req *req, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + loff_t pos = page_offset(req->pages[0]); + size_t count = req->num_pages << PAGE_CACHE_SHIFT; + + req->out.argpages = 1; + req->out.page_zeroing = 1; + req->out.page_replace = 1; + fuse_read_fill(req, file, pos, count, FUSE_READ); + req->misc.read.attr_ver = fuse_get_attr_version(fc); + if (fc->async_read) { + req->ff = fuse_file_get(ff); + req->end = fuse_readpages_end; + fuse_request_send_background(fc, req); + } else { + fuse_request_send(fc, req); + fuse_readpages_end(fc, req); + fuse_put_request(fc, req); + } +} + +struct fuse_fill_data { + struct fuse_req *req; + struct file *file; + struct inode *inode; +}; + +static int fuse_readpages_fill(void *_data, struct page *page) +{ + struct fuse_fill_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + fuse_wait_on_page_writeback(inode, page->index); + + if (req->num_pages && + (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || + req->pages[req->num_pages - 1]->index + 1 != page->index)) { + fuse_send_readpages(req, data->file); + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { + unlock_page(page); + return PTR_ERR(req); + } + } + page_cache_get(page); + req->pages[req->num_pages] = page; + req->num_pages++; + return 0; +} + +static int fuse_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_data data; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.file = file; + data.inode = inode; + data.req = fuse_get_req(fc); + err = PTR_ERR(data.req); + if 
(IS_ERR(data.req)) + goto out; + + err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file); + else + fuse_put_request(fc, data.req); + } +out: + return err; +} + +static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + + if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) { + int err; + /* + * If trying to read past EOF, make sure the i_size + * attribute is up-to-date. + */ + err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + if (err) + return err; + } + + return generic_file_aio_read(iocb, iov, nr_segs, pos); +} + +static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_out *outarg = &req->misc.write.out; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + req->in.h.opcode = FUSE_WRITE; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 2; + if (ff->fc->minor < 9) + req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + req->in.args[0].size = sizeof(struct fuse_write_in); + req->in.args[0].value = inarg; + req->in.args[1].size = count; + req->out.numargs = 1; + req->out.args[0].size = sizeof(struct fuse_write_out); + req->out.args[0].value = outarg; +} + +static size_t fuse_send_write(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_write_in *inarg = &req->misc.write.in; + + fuse_write_fill(req, ff, pos, count); + inarg->flags = file->f_flags; + if (owner != NULL) { + inarg->write_flags |= FUSE_WRITE_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + fuse_request_send(fc, req); + return req->misc.write.out.size; +} + +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + *pagep = grab_cache_page_write_begin(mapping, index, flags); + if (!*pagep) + return -ENOMEM; + return 0; +} + +void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + +static int fuse_buffered_write(struct file *file, struct inode *inode, + loff_t pos, unsigned count, struct page *page) +{ + int err; + size_t nres; + struct fuse_conn *fc = get_fuse_conn(inode); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + /* + * Make sure writepages on the same page are not mixed up with + * plain writes. 
+ */ + fuse_wait_on_page_writeback(inode, page->index); + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_offset = offset; + nres = fuse_send_write(req, file, pos, count, NULL); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err && !nres) + err = -EIO; + if (!err) { + pos += nres; + fuse_write_update_size(inode, pos); + if (count == PAGE_CACHE_SIZE) + SetPageUptodate(page); + } + fuse_invalidate_attr(inode); + return err ? err : nres; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int res = 0; + + if (copied) + res = fuse_buffered_write(file, inode, pos, copied, page); + + unlock_page(page); + page_cache_release(page); + return res; +} + +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (count > PAGE_CACHE_SIZE - offset) + count -= PAGE_CACHE_SIZE - offset; + else + count = 0; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->in.argpages = 1; + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + if (!fc->big_writes) + break; + } while (iov_iter_count(ii) && count < fc->max_write && + req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? 
count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + fuse_put_request(fc, req); + } while (!err && iov_iter_count(ii)); + + if (res > 0) + fuse_write_update_size(inode, pos); + + fuse_invalidate_attr(inode); + + return res > 0 ? res : err; +} + +static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count = 0; + ssize_t written = 0; + struct inode *inode = mapping->host; + ssize_t err; + struct iov_iter i; + + WARN_ON(iocb->ki_pos != pos); + + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + file_update_time(file); + + iov_iter_init(&i, iov, nr_segs, count, 0); + written = fuse_perform_write(file, mapping, &i, pos); + if (written >= 0) + iocb->ki_pos = pos + written; + +out: + current->backing_dev_info = NULL; + mutex_unlock(&inode->i_mutex); + + return written ? 
written : err; +} + +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ + unsigned i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (write) + set_page_dirty_lock(page); + put_page(page); + } +} + +static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, + size_t *nbytesp, int write) +{ + size_t nbytes = *nbytesp; + unsigned long user_addr = (unsigned long) buf; + unsigned offset = user_addr & ~PAGE_MASK; + int npages; + + /* Special case for kernel I/O: can copy directly into the buffer */ + if (segment_eq(get_fs(), KERNEL_DS)) { + if (write) + req->in.args[1].value = (void *) user_addr; + else + req->out.args[0].value = (void *) user_addr; + + return 0; + } + + nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); + npages = get_user_pages_fast(user_addr, npages, !write, req->pages); + if (npages < 0) + return npages; + + req->num_pages = npages; + req->page_offset = offset; + + if (write) + req->in.argpages = 1; + else + req->out.argpages = 1; + + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + *nbytesp = min(*nbytesp, nbytes); + + return 0; +} + +ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + size_t nmax = write ? fc->max_write : fc->max_read; + loff_t pos = *ppos; + ssize_t res = 0; + struct fuse_req *req; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + while (count) { + size_t nres; + fl_owner_t owner = current->files; + size_t nbytes = min(count, nmax); + int err = fuse_get_user_pages(req, buf, &nbytes, write); + if (err) { + res = err; + break; + } + + if (write) + nres = fuse_send_write(req, file, pos, nbytes, owner); + else + nres = fuse_send_read(req, file, pos, nbytes, owner); + + fuse_release_user_pages(req, !write); + if (req->out.h.error) { + if (!res) + res = req->out.h.error; + break; + } else if (nres > nbytes) { + res = -EIO; + break; + } + count -= nres; + res += nres; + pos += nres; + buf += nres; + if (nres != nbytes) + break; + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } + } + if (!IS_ERR(req)) + fuse_put_request(fc, req); + if (res > 0) + *ppos = pos; + + return res; +} +EXPORT_SYMBOL_GPL(fuse_direct_io); + +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t res; + struct inode *inode = file->f_path.dentry->d_inode; + + if (is_bad_inode(inode)) + return -EIO; + + res = fuse_direct_io(file, buf, count, ppos, 0); + + fuse_invalidate_attr(inode); + + return res; +} + +static ssize_t fuse_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + ssize_t res; + + if (is_bad_inode(inode)) + return -EIO; + + /* Don't allow parallel writes to the same file */ + mutex_lock(&inode->i_mutex); + res = generic_write_checks(file, ppos, &count, 0); + if (!res) { + res = fuse_direct_io(file, buf, count, ppos, 1); + if (res > 0) + fuse_write_update_size(inode, *ppos); + } + mutex_unlock(&inode->i_mutex); + + fuse_invalidate_attr(inode); + + return res; +} + +static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +{ + __free_page(req->pages[0]); + fuse_file_put(req->ff, false); 
+} + +static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + + list_del(&req->writepages_entry); + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + wake_up(&fi->page_waitq); +} + +/* Called under fc->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_inode *fi = get_fuse_inode(req->inode); + loff_t size = i_size_read(req->inode); + struct fuse_write_in *inarg = &req->misc.write.in; + + if (!fc->connected) + goto out_free; + + if (inarg->offset + PAGE_CACHE_SIZE <= size) { + inarg->size = PAGE_CACHE_SIZE; + } else if (inarg->offset < size) { + inarg->size = size & (PAGE_CACHE_SIZE - 1); + } else { + /* Got truncated off completely */ + goto out_free; + } + + req->in.args[1].size = inarg->size; + fi->writectr++; + fuse_request_send_background_locked(fc, req); + return; + + out_free: + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. + * + * Called with fc->lock + */ +void fuse_flush_writepages(struct inode *inode) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + req = list_entry(fi->queued_writes.next, struct fuse_req, list); + list_del_init(&req->list); + fuse_send_writepage(fc, req); + } +} + +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + mapping_set_error(inode->i_mapping, req->out.h.error); + spin_lock(&fc->lock); + fi->writectr--; + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + struct fuse_file *ff; + struct page *tmp_page; + + set_page_writeback(page); + + req = fuse_request_alloc_nofs(); + if (!req) + goto err; + + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + spin_lock(&fc->lock); + BUG_ON(list_empty(&fi->write_files)); + ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); + req->ff = fuse_file_get(ff); + spin_unlock(&fc->lock); + + fuse_write_fill(req, ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->in.argpages = 1; + req->num_pages = 1; + req->pages[0] = tmp_page; + req->page_offset = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + end_page_writeback(page); + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + 
return 0; + +err_free: + fuse_request_free(req); +err: + end_page_writeback(page); + return -ENOMEM; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +static int fuse_launder_page(struct page *page) +{ + int err = 0; + if (clear_page_dirty_for_io(page)) { + struct inode *inode = page->mapping->host; + err = fuse_writepage_locked(page); + if (!err) + fuse_wait_on_page_writeback(inode, page->index); + } + return err; +} + +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void fuse_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. + * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + /* + * Don't use page->mapping as it may become NULL from a + * concurrent truncate. + */ + struct inode *inode = vma->vm_file->f_mapping->host; + + fuse_wait_on_page_writeback(inode, page->index); + return 0; +} + +static const struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); + } + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; + return 0; +} + +static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); +} + +static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, + struct file_lock *fl) +{ + switch (ffl->type) { + case F_UNLCK: + break; + + case F_RDLCK: + case F_WRLCK: + if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || + ffl->end < ffl->start) + return -EIO; + + fl->fl_start = ffl->start; + fl->fl_end = ffl->end; + fl->fl_pid = ffl->pid; + break; + + default: + return -EIO; + } + fl->fl_type = ffl->type; + return 0; +} + +static void fuse_lk_fill(struct fuse_req *req, struct file *file, + const struct file_lock *fl, int opcode, pid_t pid, + int flock) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_lk_in *arg = &req->misc.lk_in; + + arg->fh = ff->fh; + arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + 
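/* copy the byte range, lock type and pid from the VFS file_lock into the request body */ +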
arg->lk.start = fl->fl_start; + arg->lk.end = fl->fl_end; + arg->lk.type = fl->fl_type; + arg->lk.pid = pid; + if (flock) + arg->lk_flags |= FUSE_LK_FLOCK; + req->in.h.opcode = opcode; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; +} + +static int fuse_getlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_lk_out outarg; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = convert_fuse_file_lock(&outarg.lk, fl); + + return err; +} + +static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + int err; + + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + + /* Unlock on close is handled by the flush method */ + if (fl->fl_flags & FL_CLOSE) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, opcode, pid, flock); + fuse_request_send(fc, req); + err = req->out.h.error; + /* locking is restartable */ + if (err == -EINTR) + err = -ERESTARTSYS; + fuse_put_request(fc, req); + return err; +} + +static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { + if (fc->no_lock) { + posix_test_lock(file, fl); + err = 0; + } else + err = fuse_getlk(file, fl); + } else { + if (fc->no_lock) + err = posix_lock_file(file, fl, NULL); + else + err = fuse_setlk(file, fl, 0); + } + return err; +} + +static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fc->no_lock) { + err = flock_lock_file_wait(file, fl); + } else { + /* emulate flock with POSIX locks */ + fl->fl_owner = (fl_owner_t) file; + err = fuse_setlk(file, fl, 1); + } + + return err; +} + +static sector_t fuse_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_bmap_in inarg; + struct fuse_bmap_out outarg; + int err; + + if (!inode->i_sb->s_bdev || fc->no_bmap) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.block = block; + inarg.blocksize = inode->i_sb->s_blocksize; + req->in.h.opcode = FUSE_BMAP; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, 
req); + if (err == -ENOSYS) + fc->no_bmap = 1; + + return err ? 0 : outarg.block; +} + +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + struct inode *inode = file->f_path.dentry->d_inode; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (retval) + goto exit; + offset += i_size_read(inode); + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } +exit: + mutex_unlock(&inode->i_mutex); + return retval; +} + +static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, + unsigned int nr_segs, size_t bytes, bool to_user) +{ + struct iov_iter ii; + int page_idx = 0; + + if (!bytes) + return 0; + + iov_iter_init(&ii, iov, nr_segs, bytes, 0); + + while (iov_iter_count(&ii)) { + struct page *page = pages[page_idx++]; + size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); + void *kaddr; + + kaddr = kmap(page); + + while (todo) { + char __user *uaddr = ii.iov->iov_base + ii.iov_offset; + size_t iov_len = ii.iov->iov_len - ii.iov_offset; + size_t copy = min(todo, iov_len); + size_t left; + + if (!to_user) + left = copy_from_user(kaddr, uaddr, copy); + else + left = copy_to_user(uaddr, kaddr, copy); + + if (unlikely(left)) + return -EFAULT; + + iov_iter_advance(&ii, copy); + todo -= copy; + kaddr += copy; + } + + kunmap(page); + } + + return 0; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? 
*/ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct fuse_req *req = NULL; + struct page **pages = NULL; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; + size_t in_size, out_size, transferred; + int err; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!pages || !iov_page) + goto out; + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. 
+ */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > FUSE_MAX_PAGES_PER_REQ) + goto out; + while (num_pages < max_pages) { + pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!pages[num_pages]) + goto out; + num_pages++; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); + req->num_pages = num_pages; + + /* okay, let's send it to the client */ + req->in.h.opcode = FUSE_IOCTL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + if (in_size) { + req->in.numargs++; + req->in.args[1].size = in_size; + req->in.argpages = 1; + + err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, + false); + if (err) + goto out; + } + + req->out.numargs = 2; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + req->out.args[1].size = out_size; + req->out.argpages = 1; + req->out.argvar = 1; + + fuse_request_send(fc, req); + err = req->out.h.error; + transferred = req->out.args[1].size; + fuse_put_request(fc, req); + req = NULL; + if (err) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_atomic(pages[0], KM_USER0); + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_atomic(vaddr, KM_USER0); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + out: + if (req) + fuse_put_request(fc, req); + free_page((unsigned long) iov_page); + while (num_pages) + __free_page(pages[--num_pages]); + kfree(pages); + + return err ? 
err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (is_bad_inode(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long fuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_ioctl_common(file, cmd, arg, 0); +} + +static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *parent; + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +unsigned fuse_file_poll(struct file *file, poll_table *wait) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + struct fuse_req *req; + int err; + + if (fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fc, ff); + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return POLLERR; + + req->in.h.opcode = FUSE_POLL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + return outarg.revents; + if (err == -ENOSYS) { + fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return POLLERR; +} +EXPORT_SYMBOL_GPL(fuse_file_poll); + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. 
+ */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + +static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read = do_sync_read, + .aio_read = fuse_file_aio_read, + .write = do_sync_write, + .aio_write = fuse_file_aio_write, + .mmap = fuse_file_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, +}; + +static const struct file_operations fuse_direct_io_file_operations = { + .llseek = fuse_file_llseek, + .read = fuse_direct_read, + .write = fuse_direct_write, + .mmap = fuse_direct_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + /* no splice_read */ +}; + +static const struct address_space_operations fuse_file_aops = { + .readpage = fuse_readpage, + .writepage = fuse_writepage, + .launder_page = fuse_launder_page, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, + .readpages = fuse_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .bmap = fuse_bmap, +}; + +void fuse_init_file_inode(struct inode *inode) +{ + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; +} diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h new file mode 100644 index 00000000..f6215501 --- /dev/null +++ b/fs/fuse/fuse_i.h @@ -0,0 +1,769 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** Max number of pages that can be used in a single read request */ +#define FUSE_MAX_PAGES_PER_REQ 32 + +/** Bias for fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 5 + +/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem + module will check permissions based on the file mode. 
Otherwise no + permission checking is done in the kernel */ +#define FUSE_DEFAULT_PERMISSIONS (1 << 0) + +/** If the FUSE_ALLOW_OTHER flag is given, then not only the user + doing the mount will be allowed to access the filesystem */ +#define FUSE_ALLOW_OTHER (1 << 1) + +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; + +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + +/** FUSE inode */ +struct fuse_inode { + /** Inode data */ + struct inode inode; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** Number of lookups on this inode */ + u64 nlookup; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; + + /** Time in jiffies until the file attributes are valid */ + u64 i_time; + + /** The sticky bit in inode->i_mode may have been removed, so + preserve the original mode */ + mode_t orig_i_mode; + + /** 64 bit inode number */ + u64 orig_ino; + + /** Version of last attribute change */ + u64 attr_version; + + /** Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; + + /** Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /** Number of sent writes, a negative bias (FUSE_NOWRITE) + * means more writes are blocked */ + int writectr; + + /** Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /** List of writepage requests (pending or sent) */ + struct list_head writepages; +}; + +struct fuse_conn; + +/** FUSE specific file data */ +struct fuse_file { + /** Fuse connection for this file */ + struct fuse_conn *fc; + + /** Request reserved for flush and release */ + struct fuse_req *reserved_req; + + /** Kernel file handle guaranteed to be unique */ + u64 kh; + + /** File handle used by userspace */ + u64 fh; + + /** Node id of this file */ + u64 nodeid; + + /** Refcount */ + atomic_t count; + + /** FOPEN_* flags returned by open */ + u32 open_flags; + + /** Entry on inode's write_files list */ + struct list_head write_entry; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; +}; + +/** One input argument of a request */ +struct fuse_in_arg { + unsigned size; + const void *value; +}; + +/** The request input */ +struct fuse_in { + /** The request header */ + struct fuse_in_header h; + + /** True if the data for the last argument is in req->pages */ + unsigned argpages:1; + + /** Number of arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_in_arg args[3]; +}; + +/** One output argument of a request */ +struct fuse_arg { + unsigned size; + void *value; +}; + +/** The request output */ +struct fuse_out { + /** Header returned from userspace */ + struct fuse_out_header h; + + /* + * The following bitfields are not changed during the request + * processing + */ + + /** Last argument is variable length (can be shorter than + arg->size) */ + unsigned argvar:1; + + /** Last argument is a list of pages to copy data to */ + unsigned argpages:1; + + /** Zero partially or not copied pages */ + unsigned page_zeroing:1; + + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + + /** 
Number of arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_arg args[3]; +}; + +/** The request state */ +enum fuse_req_state { + FUSE_REQ_INIT = 0, + FUSE_REQ_PENDING, + FUSE_REQ_READING, + FUSE_REQ_SENT, + FUSE_REQ_WRITING, + FUSE_REQ_FINISHED +}; + +/** + * A request to the client + */ +struct fuse_req { + /** This can be on either the pending, processing or io lists in + fuse_conn */ + struct list_head list; + + /** Entry on the interrupts list */ + struct list_head intr_entry; + + /** refcount */ + atomic_t count; + + /** Unique ID for the interrupt request */ + u64 intr_unique; + + /* + * The following bitfields are either set once before the + * request is queued or setting/clearing them is protected by + * fuse_conn->lock + */ + + /** True if the request has a reply */ + unsigned isreply:1; + + /** Force sending of the request even if interrupted */ + unsigned force:1; + + /** The request was aborted */ + unsigned aborted:1; + + /** Request is sent in the background */ + unsigned background:1; + + /** The request has been interrupted */ + unsigned interrupted:1; + + /** Data is being copied to/from the request */ + unsigned locked:1; + + /** Request is counted as "waiting" */ + unsigned waiting:1; + + /** State of the request */ + enum fuse_req_state state; + + /** The request input */ + struct fuse_in in; + + /** The request output */ + struct fuse_out out; + + /** Used to wake up the task waiting for completion of request */ + wait_queue_head_t waitq; + + /** Data for asynchronous requests */ + union { + struct { + union { + struct fuse_release_in in; + struct work_struct work; + }; + struct path path; + } release; + struct fuse_init_in init_in; + struct fuse_init_out init_out; + struct cuse_init_in cuse_init_in; + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + struct fuse_notify_retrieve_in retrieve_in; + struct fuse_lk_in lk_in; + } misc; + + /** page vector */ + struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + + /** number of pages in vector */ + unsigned num_pages; + + /** offset of data on first page */ + unsigned page_offset; + + /** File used in the request (or NULL) */ + struct fuse_file *ff; + + /** Inode used in the request or NULL */ + struct inode *inode; + + /** Link on fi->writepages */ + struct list_head writepages_entry; + + /** Request completion callback */ + void (*end)(struct fuse_conn *, struct fuse_req *); + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; +}; + +/** + * A Fuse connection. + * + * This structure is created when the filesystem is mounted, and is + * destroyed when the client device is closed and the filesystem is + * unmounted. 
+ */ +struct fuse_conn { + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + + /** Mutex protecting against directory alias creation */ + struct mutex inst_mutex; + + /** Refcount */ + atomic_t count; + + /** The user id for this mount */ + uid_t user_id; + + /** The group id for this mount */ + gid_t group_id; + + /** The fuse mount flags for this mount */ + unsigned flags; + + /** Maximum read size */ + unsigned max_read; + + /** Maximum write size */ + unsigned max_write; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The list of pending requests */ + struct list_head pending; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; + + /** The next unique kernel file handle */ + u64 khctr; + + /** rbtree of fuse_files waiting for poll events indexed by kh */ + struct rb_root polled_files; + + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + + /** Number of requests currently in the background */ + unsigned num_background; + + /** Number of background requests currently queued for userspace */ + unsigned active_background; + + /** The list of background requests set aside for later queuing */ + struct list_head bg_queue; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstanding background requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + + /** waitq for reserved requests */ + wait_queue_head_t reserved_req_waitq; + + /** The next unique request id */ + u64 reqctr; + + /** Connection established, cleared on umount, connection + abort and device release */ + unsigned connected; + + /** Connection failed (version mismatch). Cannot race with + setting other bitfields since it is only set once in INIT + reply, before any other request, and never cleared */ + unsigned conn_error:1; + + /** Connection successful. Only set in INIT */ + unsigned conn_init:1; + + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read:1; + + /** Do not send separate SETATTR request before open(O_TRUNC) */ + unsigned atomic_o_trunc:1; + + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support:1; + + /** Set if bdi is valid */ + unsigned bdi_initialized:1; + + /* + * The following bitfields are only for optimization purposes + * and hence races in setting them will not cause malfunction + */ + + /** Is fsync not implemented by fs? */ + unsigned no_fsync:1; + + /** Is fsyncdir not implemented by fs? */ + unsigned no_fsyncdir:1; + + /** Is flush not implemented by fs? */ + unsigned no_flush:1; + + /** Is setxattr not implemented by fs? */ + unsigned no_setxattr:1; + + /** Is getxattr not implemented by fs? */ + unsigned no_getxattr:1; + + /** Is listxattr not implemented by fs? */ + unsigned no_listxattr:1; + + /** Is removexattr not implemented by fs? */ + unsigned no_removexattr:1; + + /** Are file locking primitives not implemented by fs? 
*/ + unsigned no_lock:1; + + /** Is access not implemented by fs? */ + unsigned no_access:1; + + /** Is create not implemented by fs? */ + unsigned no_create:1; + + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt:1; + + /** Is bmap not implemented by fs? */ + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; + + /** Do multi-page cached writes */ + unsigned big_writes:1; + + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + + /** The number of requests waiting for completion */ + atomic_t num_waiting; + + /** Negotiated minor version */ + unsigned minor; + + /** Backing dev info */ + struct backing_dev_info bdi; + + /** Entry on the fuse_conn_list */ + struct list_head entry; + + /** Device ID from super block */ + dev_t dev; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; + + /** Reserved request for the DESTROY message */ + struct fuse_req *destroy_req; + + /** Version counter for attribute changes */ + u64 attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); + + /** Super block for this connection. */ + struct super_block *sb; + + /** Read/write semaphore to hold when accessing sb. */ + struct rw_semaphore killsb; +}; + +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct fuse_conn *get_fuse_conn(struct inode *inode) +{ + return get_fuse_conn_super(inode->i_sb); +} + +static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +{ + return container_of(inode, struct fuse_inode, inode); +} + +static inline u64 get_node_id(struct inode *inode) +{ + return get_fuse_inode(inode)->nodeid; +} + +/** Device operations */ +extern const struct file_operations fuse_dev_operations; + +extern const struct dentry_operations fuse_dentry_operations; + +/** + * Inode to nodeid comparison. 
+ */ +int fuse_inode_eq(struct inode *inode, void *_nodeidp); + +/** + * Get a filled in inode + */ +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + +/** + * Send FORGET command + */ +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +/** + * Initialize READ or READDIR request + */ +void fuse_read_fill(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, int opcode); + +/** + * Send OPEN or OPENDIR request + */ +int fuse_open_common(struct inode *inode, struct file *file, bool isdir); + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_get(struct fuse_file *ff); +void fuse_file_free(struct fuse_file *ff); +void fuse_finish_open(struct inode *inode, struct file *file); + +void fuse_sync_release(struct fuse_file *ff, int flags); + +/** + * Send RELEASE or RELEASEDIR request + */ +void fuse_release_common(struct file *file, int opcode); + +/** + * Send FSYNC or FSYNCDIR request + */ +int fuse_fsync_common(struct file *file, int datasync, int isdir); + +/** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** + * Initialize file operations on a regular file + */ +void fuse_init_file_inode(struct inode *inode); + +/** + * Initialize inode operations on regular files and special files + */ +void fuse_init_common(struct inode *inode); + +/** + * Initialize inode and file operations on a directory + */ +void fuse_init_dir(struct inode *inode); + +/** + * Initialize inode operations on a symlink + */ +void fuse_init_symlink(struct inode *inode); + +/** + * Change attributes of an inode + */ +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +/** + * Initialize the client device + */ +int fuse_dev_init(void); + +/** + * Cleanup the client device + */ +void fuse_dev_cleanup(void); + +int fuse_ctl_init(void); +void fuse_ctl_cleanup(void); + +/** + * Allocate a request + */ +struct fuse_req *fuse_request_alloc(void); + +struct fuse_req *fuse_request_alloc_nofs(void); + +/** + * Free a request + */ +void fuse_request_free(struct fuse_req *req); + +/** + * Get a request, may fail with -ENOMEM + */ +struct fuse_req *fuse_get_req(struct fuse_conn *fc); + +/** + * Gets a requests for a file operation, always succeeds + */ +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); + +/** + * Decrement reference count of a request. If count goes to zero free + * the request. 
+ */ +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request (synchronous) + */ +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request in the background + */ +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); + +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req); + +/* Abort all requests */ +void fuse_abort_conn(struct fuse_conn *fc); + +/** + * Invalidate inode attributes + */ +void fuse_invalidate_attr(struct inode *inode); + +void fuse_invalidate_entry_cache(struct dentry *entry); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +void fuse_conn_kill(struct fuse_conn *fc); + +/** + * Initialize fuse_conn + */ +void fuse_conn_init(struct fuse_conn *fc); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); + +/** + * Is file type valid? + */ +int fuse_valid_type(int m); + +/** + * Is task allowed to perform filesystem operation? + */ +int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); + +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +u64 fuse_get_attr_version(struct fuse_conn *fc); + +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. + */ +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + struct qstr *name); + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir); +ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write); +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags); +unsigned fuse_file_poll(struct file *file, poll_table *wait); +int fuse_dev_release(struct inode *inode, struct file *file); + +void fuse_write_update_size(struct inode *inode, loff_t pos); + +#endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c new file mode 100644 index 00000000..69a1e0f0 --- /dev/null +++ b/fs/fuse/inode.c @@ -0,0 +1,1263 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. 
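fuse_get_req()/fuse_put_request() and fuse_conn_get()/fuse_conn_put() follow the usual last-put-frees convention. A minimal user-space sketch of that convention, with invented names and a C11 atomic counter standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
	atomic_int count;
	void (*release)(struct conn *);	/* called on the final put */
};

static struct conn *conn_get(struct conn *c)
{
	atomic_fetch_add(&c->count, 1);
	return c;
}

static void conn_put(struct conn *c)
{
	/* fetch_sub returns the old value, so 1 means we dropped the last ref */
	if (atomic_fetch_sub(&c->count, 1) == 1)
		c->release(c);
}

static void conn_free(struct conn *c)
{
	printf("released\n");
	free(c);
}

int main(void)
{
	struct conn *c = malloc(sizeof(*c));

	if (!c)
		return 1;
	atomic_init(&c->count, 1);
	c->release = conn_free;

	conn_get(c);	/* a second user takes a reference */
	conn_put(c);	/* ...and drops it again */
	conn_put(c);	/* the initial reference goes away: object is freed */
	return 0;
}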
+*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Filesystem in Userspace"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *fuse_inode_cachep; +struct list_head fuse_conn_list; +DEFINE_MUTEX(fuse_mutex); + +static int set_global_limit(const char *val, struct kernel_param *kp); + +unsigned max_user_bgreq; +module_param_call(max_user_bgreq, set_global_limit, param_get_uint, + &max_user_bgreq, 0644); +__MODULE_PARM_TYPE(max_user_bgreq, "uint"); +MODULE_PARM_DESC(max_user_bgreq, + "Global limit for the maximum number of backgrounded requests an " + "unprivileged user can set"); + +unsigned max_user_congthresh; +module_param_call(max_user_congthresh, set_global_limit, param_get_uint, + &max_user_congthresh, 0644); +__MODULE_PARM_TYPE(max_user_congthresh, "uint"); +MODULE_PARM_DESC(max_user_congthresh, + "Global limit for the maximum congestion threshold an " + "unprivileged user can set"); + +#define FUSE_SUPER_MAGIC 0x65735546 + +#define FUSE_DEFAULT_BLKSIZE 512 + +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +struct fuse_mount_data { + int fd; + unsigned rootmode; + unsigned user_id; + unsigned group_id; + unsigned fd_present:1; + unsigned rootmode_present:1; + unsigned user_id_present:1; + unsigned group_id_present:1; + unsigned flags; + unsigned max_read; + unsigned blksize; +}; + +struct fuse_forget_link *fuse_alloc_forget() +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} + +static struct inode *fuse_alloc_inode(struct super_block *sb) +{ + struct inode *inode; + struct fuse_inode *fi; + + inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + if (!inode) + return NULL; + + fi = get_fuse_inode(inode); + fi->i_time = 0; + fi->nodeid = 0; + fi->nlookup = 0; + fi->attr_version = 0; + fi->writectr = 0; + fi->orig_ino = 0; + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { + kmem_cache_free(fuse_inode_cachep, inode); + return NULL; + } + + return inode; +} + +static void fuse_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(fuse_inode_cachep, inode); +} + +static void fuse_destroy_inode(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); + kfree(fi->forget); + call_rcu(&inode->i_rcu, fuse_i_callback); +} + +static void fuse_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (inode->i_sb->s_flags & MS_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; + } +} + +static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +{ + if (*flags & MS_MANDLOCK) + return -EINVAL; + + return 0; +} + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
+ */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->attr_version = ++fc->attr_version; + fi->i_time = attr_valid; + + inode->i_ino = fuse_squash_ino(attr->ino); + inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + inode->i_nlink = attr->nlink; + inode->i_uid = attr->uid; + inode->i_gid = attr->gid; + inode->i_blocks = attr->blocks; + inode->i_atime.tv_sec = attr->atime; + inode->i_atime.tv_nsec = attr->atimensec; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + + if (attr->blksize != 0) + inode->i_blkbits = ilog2(attr->blksize); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + /* + * Don't set the sticky bit in i_mode, unless we want the VFS + * to check permissions. This prevents failures due to the + * check in may_delete(). + */ + fi->orig_i_mode = inode->i_mode; + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t oldsize; + + spin_lock(&fc->lock); + if (attr_version != 0 && fi->attr_version > attr_version) { + spin_unlock(&fc->lock); + return; + } + + fuse_change_attributes_common(inode, attr, attr_valid); + + oldsize = inode->i_size; + i_size_write(inode, attr->size); + spin_unlock(&fc->lock); + + if (S_ISREG(inode->i_mode) && oldsize != attr->size) { + truncate_pagecache(inode, oldsize, attr->size); + invalidate_inode_pages2(inode->i_mapping); + } +} + +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +{ + inode->i_mode = attr->mode & S_IFMT; + inode->i_size = attr->size; + if (S_ISREG(inode->i_mode)) { + fuse_init_common(inode); + fuse_init_file_inode(inode); + } else if (S_ISDIR(inode->i_mode)) + fuse_init_dir(inode); + else if (S_ISLNK(inode->i_mode)) + fuse_init_symlink(inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + fuse_init_common(inode); + init_special_inode(inode, inode->i_mode, + new_decode_dev(attr->rdev)); + } else + BUG(); +} + +int fuse_inode_eq(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + if (get_node_id(inode) == nodeid) + return 1; + else + return 0; +} + +static int fuse_inode_set(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + get_fuse_inode(inode)->nodeid = nodeid; + return 0; +} + +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + retry: + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_generation = generation; + inode->i_data.backing_dev_info = &fc->bdi; + fuse_init_inode(inode, attr); + unlock_new_inode(inode); + } else 
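The squashing above only matters when ino_t is narrower than the 64-bit FUSE node id; in that case the high half is folded into the low half with XOR. A small stand-alone sketch of the same folding, using fixed-width types for illustration:

#include <stdint.h>
#include <stdio.h>

/* Fold a 64-bit node id into 32 bits, as done when sizeof(ino_t) < 8. */
static uint32_t squash_ino(uint64_t ino64)
{
	return (uint32_t)ino64 ^ (uint32_t)(ino64 >> 32);
}

int main(void)
{
	uint64_t ino64 = 0x123456789abcdef0ULL;

	printf("0x%llx -> 0x%x\n",
	       (unsigned long long)ino64, (unsigned)squash_ino(ino64));
	return 0;
}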
if ((inode->i_mode ^ attr->mode) & S_IFMT) { + /* Inode has changed type, any I/O on the old should fail */ + make_bad_inode(inode); + iput(inode); + goto retry; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + fuse_change_attributes(inode, attr, attr_valid, attr_version); + + return inode; +} + +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len) +{ + struct inode *inode; + pgoff_t pg_start; + pgoff_t pg_end; + + inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + return -ENOENT; + + fuse_invalidate_attr(inode); + if (offset >= 0) { + pg_start = offset >> PAGE_CACHE_SHIFT; + if (len <= 0) + pg_end = -1; + else + pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + } + iput(inode); + return 0; +} + +static void fuse_umount_begin(struct super_block *sb) +{ + fuse_abort_conn(get_fuse_conn_super(sb)); +} + +static void fuse_send_destroy(struct fuse_conn *fc) +{ + struct fuse_req *req = fc->destroy_req; + if (req && fc->conn_init) { + fc->destroy_req = NULL; + req->in.h.opcode = FUSE_DESTROY; + req->force = 1; + fuse_request_send(fc, req); + fuse_put_request(fc, req); + } +} + +static void fuse_bdi_destroy(struct fuse_conn *fc) +{ + if (fc->bdi_initialized) + bdi_destroy(&fc->bdi); +} + +void fuse_conn_kill(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + spin_unlock(&fc->lock); + /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + wake_up_all(&fc->reserved_req_waitq); + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + fuse_bdi_destroy(fc); +} +EXPORT_SYMBOL_GPL(fuse_conn_kill); + +static void fuse_put_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fuse_send_destroy(fc); + fuse_conn_kill(fc); + fuse_conn_put(fc); +} + +static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) +{ + stbuf->f_type = FUSE_SUPER_MAGIC; + stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; + stbuf->f_blocks = attr->blocks; + stbuf->f_bfree = attr->bfree; + stbuf->f_bavail = attr->bavail; + stbuf->f_files = attr->files; + stbuf->f_ffree = attr->ffree; + stbuf->f_namelen = attr->namelen; + /* fsid is left zero */ +} + +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_req *req; + struct fuse_statfs_out outarg; + int err; + + if (!fuse_allow_task(fc, current)) { + buf->f_type = FUSE_SUPER_MAGIC; + return 0; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&outarg, 0, sizeof(outarg)); + req->in.numargs = 0; + req->in.h.opcode = FUSE_STATFS; + req->in.h.nodeid = get_node_id(dentry->d_inode); + req->out.numargs = 1; + req->out.args[0].size = + fc->minor < 4 ? 
FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + if (!err) + convert_fuse_statfs(buf, &outarg.st); + fuse_put_request(fc, req); + return err; +} + +enum { + OPT_FD, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, + OPT_DEFAULT_PERMISSIONS, + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, + OPT_ERR +}; + +static const match_table_t tokens = { + {OPT_FD, "fd=%u"}, + {OPT_ROOTMODE, "rootmode=%o"}, + {OPT_USER_ID, "user_id=%u"}, + {OPT_GROUP_ID, "group_id=%u"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_ALLOW_OTHER, "allow_other"}, + {OPT_MAX_READ, "max_read=%u"}, + {OPT_BLKSIZE, "blksize=%u"}, + {OPT_ERR, NULL} +}; + +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +{ + char *p; + memset(d, 0, sizeof(struct fuse_mount_data)); + d->max_read = ~0; + d->blksize = FUSE_DEFAULT_BLKSIZE; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + int value; + substring_t args[MAX_OPT_ARGS]; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case OPT_FD: + if (match_int(&args[0], &value)) + return 0; + d->fd = value; + d->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (match_octal(&args[0], &value)) + return 0; + if (!fuse_valid_type(value)) + return 0; + d->rootmode = value; + d->rootmode_present = 1; + break; + + case OPT_USER_ID: + if (match_int(&args[0], &value)) + return 0; + d->user_id = value; + d->user_id_present = 1; + break; + + case OPT_GROUP_ID: + if (match_int(&args[0], &value)) + return 0; + d->group_id = value; + d->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + d->flags |= FUSE_DEFAULT_PERMISSIONS; + break; + + case OPT_ALLOW_OTHER: + d->flags |= FUSE_ALLOW_OTHER; + break; + + case OPT_MAX_READ: + if (match_int(&args[0], &value)) + return 0; + d->max_read = value; + break; + + case OPT_BLKSIZE: + if (!is_bdev || match_int(&args[0], &value)) + return 0; + d->blksize = value; + break; + + default: + return 0; + } + } + + if (!d->fd_present || !d->rootmode_present || + !d->user_id_present || !d->group_id_present) + return 0; + + return 1; +} + +static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb); + + seq_printf(m, ",user_id=%u", fc->user_id); + seq_printf(m, ",group_id=%u", fc->group_id); + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + seq_puts(m, ",default_permissions"); + if (fc->flags & FUSE_ALLOW_OTHER) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (mnt->mnt_sb->s_bdev && + mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize); + return 0; +} + +void fuse_conn_init(struct fuse_conn *fc) +{ + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + mutex_init(&fc->inst_mutex); + init_rwsem(&fc->killsb); + atomic_set(&fc->count, 1); + init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); + INIT_LIST_HEAD(&fc->pending); + INIT_LIST_HEAD(&fc->processing); + INIT_LIST_HEAD(&fc->io); + INIT_LIST_HEAD(&fc->interrupts); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + fc->forget_list_tail = &fc->forget_list_head; + atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; + fc->khctr = 0; + fc->polled_files = RB_ROOT; + fc->reqctr = 
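parse_fuse_opt() above walks the comma-separated mount options with strsep() and match_token(), and refuses the mount unless fd, rootmode, user_id and group_id are all present. A user-space sketch of the same shape, with sscanf() standing in for match_token() (the helper names are illustrative):

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

struct mount_data {
	int fd;
	unsigned rootmode, user_id, group_id;
};

static int parse_opts(char *opt, struct mount_data *d)
{
	char *p;
	int seen = 0;

	while ((p = strsep(&opt, ",")) != NULL) {
		if (!*p)
			continue;
		if (sscanf(p, "fd=%d", &d->fd) == 1 ||
		    sscanf(p, "rootmode=%o", &d->rootmode) == 1 ||
		    sscanf(p, "user_id=%u", &d->user_id) == 1 ||
		    sscanf(p, "group_id=%u", &d->group_id) == 1)
			seen++;
		else
			return 0;	/* unknown option: reject the mount */
	}
	/* the kernel tracks each option with a *_present bit; a count is
	 * enough for this sketch */
	return seen == 4;
}

int main(void)
{
	char opts[] = "fd=7,rootmode=40000,user_id=1000,group_id=1000";
	struct mount_data d = { 0 };

	if (parse_opts(opts, &d))
		printf("fd=%d rootmode=%o uid=%u gid=%u\n",
		       d.fd, d.rootmode, d.user_id, d.group_id);
	return 0;
}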
0; + fc->blocked = 1; + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); +} +EXPORT_SYMBOL_GPL(fuse_conn_init); + +void fuse_conn_put(struct fuse_conn *fc) +{ + if (atomic_dec_and_test(&fc->count)) { + if (fc->destroy_req) + fuse_request_free(fc->destroy_req); + mutex_destroy(&fc->inst_mutex); + fc->release(fc); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_put); + +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) +{ + atomic_inc(&fc->count); + return fc; +} +EXPORT_SYMBOL_GPL(fuse_conn_get); + +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +{ + struct fuse_attr attr; + memset(&attr, 0, sizeof(attr)); + + attr.mode = mode; + attr.ino = FUSE_ROOT_ID; + attr.nlink = 1; + return fuse_iget(sb, 1, 0, &attr, 0, 0); +} + +struct fuse_inode_handle { + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_obtain_alias(inode); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(entry); + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + bool encode_parent = connectable && !S_ISDIR(inode->i_mode); + int len = encode_parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) { + *max_len = len; + return 255; + } + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (encode_parent) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + spin_unlock(&dentry->d_lock); + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return encode_parent ? 
0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = child->d_inode; + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err) { + if (err == -ENOENT) + return ERR_PTR(-ESTALE); + return ERR_PTR(err); + } + + parent = d_obtain_alias(inode); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(parent); + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + +static const struct super_operations fuse_super_operations = { + .alloc_inode = fuse_alloc_inode, + .destroy_inode = fuse_destroy_inode, + .evict_inode = fuse_evict_inode, + .drop_inode = generic_delete_inode, + .remount_fs = fuse_remount_fs, + .put_super = fuse_put_super, + .umount_begin = fuse_umount_begin, + .statfs = fuse_statfs, + .show_options = fuse_show_options, +}; + +static void sanitize_global_limit(unsigned *limit) +{ + if (*limit == 0) + *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + sizeof(struct fuse_req); + + if (*limit >= 1 << 16) + *limit = (1 << 16) - 1; +} + +static int set_global_limit(const char *val, struct kernel_param *kp) +{ + int rv; + + rv = param_set_uint(val, kp); + if (rv) + return rv; + + sanitize_global_limit((unsigned *)kp->arg); + + return 0; +} + +static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) +{ + int cap_sys_admin = capable(CAP_SYS_ADMIN); + + if (arg->minor < 13) + return; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + if (arg->max_background) { + fc->max_background = arg->max_background; + + if (!cap_sys_admin && fc->max_background > max_user_bgreq) + fc->max_background = max_user_bgreq; + } + if (arg->congestion_threshold) { + fc->congestion_threshold = arg->congestion_threshold; + + if (!cap_sys_admin && + fc->congestion_threshold > max_user_congthresh) + fc->congestion_threshold = max_user_congthresh; + } +} + +static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_out *arg = &req->misc.init_out; + + if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) + fc->conn_error = 1; + else { + unsigned long ra_pages; + + process_init_limits(fc, arg); + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + 
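fuse_encode_fh() and fuse_fh_to_dentry() above store the 64-bit node id as two 32-bit words followed by the inode generation. A stand-alone sketch of that packing and the matching decode (types and helper names are invented for the example):

#include <stdint.h>
#include <stdio.h>

struct handle {
	uint64_t nodeid;
	uint32_t generation;
};

static void encode_fh(uint32_t fh[3], const struct handle *h)
{
	fh[0] = (uint32_t)(h->nodeid >> 32);
	fh[1] = (uint32_t)(h->nodeid & 0xffffffff);
	fh[2] = h->generation;
}

static struct handle decode_fh(const uint32_t fh[3])
{
	struct handle h;

	h.nodeid = ((uint64_t)fh[0] << 32) | fh[1];
	h.generation = fh[2];
	return h;
}

int main(void)
{
	struct handle in = { .nodeid = 0x0000000100000002ULL, .generation = 3 };
	struct handle out;
	uint32_t fh[3];

	encode_fh(fh, &in);
	out = decode_fh(fh);
	printf("nodeid=0x%llx gen=%u\n",
	       (unsigned long long)out.nodeid, (unsigned)out.generation);
	return 0;
}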
fc->async_read = 1; + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_lock = 1; + if (arg->flags & FUSE_ATOMIC_O_TRUNC) + fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } + if (arg->flags & FUSE_BIG_WRITES) + fc->big_writes = 1; + if (arg->flags & FUSE_DONT_MASK) + fc->dont_mask = 1; + } else { + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + fc->no_lock = 1; + } + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = max_t(unsigned, 4096, fc->max_write); + fc->conn_init = 1; + } + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); +} + +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_in *arg = &req->misc.init_in; + + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; + req->in.h.opcode = FUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; + req->out.numargs = 1; + /* Variable length argument used for backward compatibility + with interface version < 7.5. Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + req->out.argvar = 1; + req->out.args[0].size = sizeof(struct fuse_init_out); + req->out.args[0].value = &req->misc.init_out; + req->end = process_init_reply; + fuse_request_send_background(fc, req); +} + +static void fuse_free_conn(struct fuse_conn *fc) +{ + kfree(fc); +} + +static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) +{ + int err; + + fc->bdi.name = "fuse"; + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + + err = bdi_init(&fc->bdi); + if (err) + return err; + + fc->bdi_initialized = 1; + + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } + + if (err) + return err; + + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. 
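process_init_reply() keeps only the features that the kernel advertised in FUSE_INIT and the userspace server echoed back; anything missing becomes a no_* fallback on the connection. A simplified sketch of that intersection, with made-up flag values rather than the real protocol constants:

#include <stdio.h>

/* Illustrative flag values, not the real FUSE_* protocol constants. */
#define WANT_ASYNC_READ  0x1u
#define WANT_POSIX_LOCKS 0x2u
#define WANT_BIG_WRITES  0x4u

struct conn_caps {
	int async_read;
	int no_lock;	/* fall back to local file locking */
	int big_writes;
};

static void apply_init_reply(struct conn_caps *c, unsigned advertised,
			     unsigned replied)
{
	unsigned agreed = advertised & replied;

	c->async_read = !!(agreed & WANT_ASYNC_READ);
	c->no_lock    = !(agreed & WANT_POSIX_LOCKS);
	c->big_writes = !!(agreed & WANT_BIG_WRITES);
}

int main(void)
{
	struct conn_caps caps;

	/* The server supports async reads and big writes, but no POSIX locks. */
	apply_init_reply(&caps,
			 WANT_ASYNC_READ | WANT_POSIX_LOCKS | WANT_BIG_WRITES,
			 WANT_ASYNC_READ | WANT_BIG_WRITES);
	printf("async_read=%d no_lock=%d big_writes=%d\n",
	       caps.async_read, caps.no_lock, caps.big_writes);
	return 0;
}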
+ * + * Privileged users can raise it by writing to + * + * /sys/class/bdi//max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); + + return 0; +} + +static int fuse_fill_super(struct super_block *sb, void *data, int silent) +{ + struct fuse_conn *fc; + struct inode *root; + struct fuse_mount_data d; + struct file *file; + struct dentry *root_dentry; + struct fuse_req *init_req; + int err; + int is_bdev = sb->s_bdev != NULL; + + err = -EINVAL; + if (sb->s_flags & MS_MANDLOCK) + goto err; + + sb->s_flags &= ~MS_NOSEC; + + if (!parse_fuse_opt((char *) data, &d, is_bdev)) + goto err; + + if (is_bdev) { +#ifdef CONFIG_BLOCK + err = -EINVAL; + if (!sb_set_blocksize(sb, d.blksize)) + goto err; +#endif + } else { + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + } + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_export_op = &fuse_export_operations; + + file = fget(d.fd); + err = -EINVAL; + if (!file) + goto err; + + if (file->f_op != &fuse_dev_operations) + goto err_fput; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc); + + fc->dev = sb->s_dev; + fc->sb = sb; + err = fuse_bdi_init(fc, sb); + if (err) + goto err_put_conn; + + sb->s_bdi = &fc->bdi; + + /* Handle umasking inside the fuse code */ + if (sb->s_flags & MS_POSIXACL) + fc->dont_mask = 1; + sb->s_flags |= MS_POSIXACL; + + fc->release = fuse_free_conn; + fc->flags = d.flags; + fc->user_id = d.user_id; + fc->group_id = d.group_id; + fc->max_read = max_t(unsigned, 4096, d.max_read); + + /* Used by get_root_inode() */ + sb->s_fs_info = fc; + + err = -ENOMEM; + root = fuse_get_root_inode(sb, d.rootmode); + if (!root) + goto err_put_conn; + + root_dentry = d_alloc_root(root); + if (!root_dentry) { + iput(root); + goto err_put_conn; + } + /* only now - we want root dentry with NULL ->d_op */ + sb->s_d_op = &fuse_dentry_operations; + + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + + if (is_bdev) { + fc->destroy_req = fuse_request_alloc(); + if (!fc->destroy_req) + goto err_free_init_req; + } + + mutex_lock(&fuse_mutex); + err = -EINVAL; + if (file->private_data) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); + if (err) + goto err_unlock; + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; + fc->connected = 1; + file->private_data = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); + + fuse_send_init(fc, init_req); + + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + err_free_init_req: + fuse_request_free(init_req); + err_put_root: + dput(root_dentry); + err_put_conn: + fuse_bdi_destroy(fc); + fuse_conn_put(fc); + err_fput: + fput(file); + err: + return err; +} + +static struct dentry *fuse_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_anon(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_anon_super(sb); +} + +static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, + .name = "fuse", + .fs_flags = FS_HAS_SUBTYPE, + .mount = fuse_mount, + .kill_sb = fuse_kill_sb_anon, +}; + +#ifdef 
CONFIG_BLOCK +static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_blk(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_block_super(sb); +} + +static struct file_system_type fuseblk_fs_type = { + .owner = THIS_MODULE, + .name = "fuseblk", + .mount = fuse_mount_blk, + .kill_sb = fuse_kill_sb_blk, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, +}; + +static inline int register_fuseblk(void) +{ + return register_filesystem(&fuseblk_fs_type); +} + +static inline void unregister_fuseblk(void) +{ + unregister_filesystem(&fuseblk_fs_type); +} +#else +static inline int register_fuseblk(void) +{ + return 0; +} + +static inline void unregister_fuseblk(void) +{ +} +#endif + +static void fuse_inode_init_once(void *foo) +{ + struct inode *inode = foo; + + inode_init_once(inode); +} + +static int __init fuse_fs_init(void) +{ + int err; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out; + + err = register_fuseblk(); + if (err) + goto out_unreg; + + fuse_inode_cachep = kmem_cache_create("fuse_inode", + sizeof(struct fuse_inode), + 0, SLAB_HWCACHE_ALIGN, + fuse_inode_init_once); + err = -ENOMEM; + if (!fuse_inode_cachep) + goto out_unreg2; + + return 0; + + out_unreg2: + unregister_fuseblk(); + out_unreg: + unregister_filesystem(&fuse_fs_type); + out: + return err; +} + +static void fuse_fs_cleanup(void) +{ + unregister_filesystem(&fuse_fs_type); + unregister_fuseblk(); + kmem_cache_destroy(fuse_inode_cachep); +} + +static struct kobject *fuse_kobj; +static struct kobject *connections_kobj; + +static int fuse_sysfs_init(void) +{ + int err; + + fuse_kobj = kobject_create_and_add("fuse", fs_kobj); + if (!fuse_kobj) { + err = -ENOMEM; + goto out_err; + } + + connections_kobj = kobject_create_and_add("connections", fuse_kobj); + if (!connections_kobj) { + err = -ENOMEM; + goto out_fuse_unregister; + } + + return 0; + + out_fuse_unregister: + kobject_put(fuse_kobj); + out_err: + return err; +} + +static void fuse_sysfs_cleanup(void) +{ + kobject_put(connections_kobj); + kobject_put(fuse_kobj); +} + +static int __init fuse_init(void) +{ + int res; + + printk(KERN_INFO "fuse init (API version %i.%i)\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + + INIT_LIST_HEAD(&fuse_conn_list); + res = fuse_fs_init(); + if (res) + goto err; + + res = fuse_dev_init(); + if (res) + goto err_fs_cleanup; + + res = fuse_sysfs_init(); + if (res) + goto err_dev_cleanup; + + res = fuse_ctl_init(); + if (res) + goto err_sysfs_cleanup; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + return 0; + + err_sysfs_cleanup: + fuse_sysfs_cleanup(); + err_dev_cleanup: + fuse_dev_cleanup(); + err_fs_cleanup: + fuse_fs_cleanup(); + err: + return res; +} + +static void __exit fuse_exit(void) +{ + printk(KERN_DEBUG "fuse exit\n"); + + fuse_ctl_cleanup(); + fuse_sysfs_cleanup(); + fuse_fs_cleanup(); + fuse_dev_cleanup(); +} + +module_init(fuse_init); +module_exit(fuse_exit); diff --git a/fs/generic_acl.c b/fs/generic_acl.c new file mode 100644 index 00000000..8f26d1a5 --- /dev/null +++ b/fs/generic_acl.c @@ -0,0 +1,225 @@ +/* + * (C) 2005 Andreas Gruenbacher + * + * This file is released under the GPL. + * + * Generic ACL support for in-memory filesystems. 
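Both fuse_fill_super() and fuse_init() above rely on goto-based unwinding: resources are acquired in order and, on any failure, released in reverse by falling through labelled cleanup steps. A minimal user-space illustration of the idiom (the resources here are just heap allocations, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
	char *a, *b, *c;
	int err = -1;

	a = malloc(16);
	if (!a)
		goto err;
	b = malloc(16);
	if (!b)
		goto err_free_a;
	c = malloc(16);
	if (!c)
		goto err_free_b;

	printf("all resources acquired\n");
	/* a real setup() would keep these; the sketch just tears them down */
	free(c);
	free(b);
	free(a);
	return 0;

 err_free_b:
	free(b);
 err_free_a:
	free(a);
 err:
	return err;
}

int main(void)
{
	return setup();
}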
+ */ + +#include +#include +#include +#include +#include +#include + + +static size_t +generic_acl_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + struct posix_acl *acl; + const char *xname; + size_t size; + + acl = get_cached_acl(dentry->d_inode, type); + if (!acl) + return 0; + posix_acl_release(acl); + + switch (type) { + case ACL_TYPE_ACCESS: + xname = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xname = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return 0; + } + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +static int +generic_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = get_cached_acl(dentry->d_inode, type); + if (!acl) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +generic_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (acl) { + mode_t mode; + + error = posix_acl_valid(acl); + if (error) + goto failed; + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + goto failed; + inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME; + if (error == 0) { + posix_acl_release(acl); + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + error = -EINVAL; + goto failed; + } + break; + } + } + set_cached_acl(inode, type, acl); + error = 0; +failed: + posix_acl_release(acl); + return error; +} + +/** + * generic_acl_init - Take care of acl inheritance at @inode create time + * + * Files created inside a directory with a default ACL inherit the + * directory's default ACL. + */ +int +generic_acl_init(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + mode_t mode = inode->i_mode; + int error; + + inode->i_mode = mode & ~current_umask(); + if (!S_ISLNK(inode->i_mode)) + acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); + if (acl) { + struct posix_acl *clone; + + if (S_ISDIR(inode->i_mode)) { + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); + posix_acl_release(clone); + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + } + posix_acl_release(clone); + } + error = 0; + +cleanup: + posix_acl_release(acl); + return error; +} + +/** + * generic_acl_chmod - change the access acl of @inode upon chmod() + * + * A chmod also changes the permissions of the owner, group/mask, and + * other ACL entries. 
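generic_acl_init() implements the create-time rule that a default ACL on the parent directory, when present, takes over the role of the umask. A reduced sketch of that rule, with the ACL collapsed to a boolean for illustration:

#include <stdio.h>
#include <sys/types.h>

static mode_t creation_mode(mode_t requested, mode_t umask_bits,
			    int dir_has_default_acl)
{
	if (!dir_has_default_acl)
		return requested & ~umask_bits;
	/* with a default ACL, posix_acl_create_masq() masks the mode instead */
	return requested;
}

int main(void)
{
	printf("%04o\n", (unsigned)creation_mode(0666, 022, 0));	/* 0644 */
	printf("%04o\n", (unsigned)creation_mode(0666, 022, 1));	/* 0666 */
	return 0;
}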
+ */ +int +generic_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + } + return error; +} + +int +generic_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + } else { + struct posix_acl *acl; + + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + } + return -EAGAIN; +} + +const struct xattr_handler generic_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; + +const struct xattr_handler generic_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig new file mode 100644 index 00000000..c465ae06 --- /dev/null +++ b/fs/gfs2/Kconfig @@ -0,0 +1,38 @@ +config GFS2_FS + tristate "GFS2 file system support" + depends on (64BIT || LBDAF) + select DLM if GFS2_FS_LOCKING_DLM + select CONFIGFS_FS if GFS2_FS_LOCKING_DLM + select SYSFS if GFS2_FS_LOCKING_DLM + select IP_SCTP if DLM_SCTP + select FS_POSIX_ACL + select CRC32 + select QUOTACTL + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster + + The "nolock" lock module is now built in to GFS2 by default. If + you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 + networking. + +config GFS2_FS_LOCKING_DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG + help + Multiple node locking module for GFS2 + + Most users of GFS2 will require this. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. 
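generic_acl_access_handler and generic_acl_default_handler plug into the VFS as prefix-keyed xattr handlers. A stand-alone sketch of that table-dispatch pattern, with an invented handler table and payload:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct xattr_handler {
	const char *prefix;
	int (*get)(const char *rest, char *buf, size_t size);
};

static int acl_access_get(const char *rest, char *buf, size_t size)
{
	(void)rest;
	return snprintf(buf, size, "rwx");	/* placeholder payload */
}

static const struct xattr_handler handlers[] = {
	{ "system.posix_acl_access", acl_access_get },
	{ NULL, NULL }
};

static int do_getxattr(const char *name, char *buf, size_t size)
{
	const struct xattr_handler *h;

	for (h = handlers; h->prefix; h++) {
		size_t n = strlen(h->prefix);

		if (strncmp(name, h->prefix, n) == 0)
			return h->get(name + n, buf, size);
	}
	return -1;	/* the kernel would return -EOPNOTSUPP here */
}

int main(void)
{
	char buf[16];

	if (do_getxattr("system.posix_acl_access", buf, sizeof(buf)) > 0)
		printf("%s\n", buf);
	return 0;
}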
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile new file mode 100644 index 00000000..86128202 --- /dev/null +++ b/fs/gfs2/Makefile @@ -0,0 +1,10 @@ +ccflags-y := -I$(src) +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ + glops.o log.o lops.o main.o meta_io.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o inode.o quota.o \ + recovery.o rgrp.o super.o sys.o trans.o util.o + +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o + diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c new file mode 100644 index 00000000..cbc07155 --- /dev/null +++ b/fs/gfs2/acl.c @@ -0,0 +1,354 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +static const char *gfs2_acl_name(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return GFS2_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return GFS2_POSIX_ACL_DEFAULT; + } + return NULL; +} + +static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) +{ + struct posix_acl *acl; + const char *name; + char *data; + int len; + + if (!ip->i_eattr) + return NULL; + + acl = get_cached_acl(&ip->i_inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + name = gfs2_acl_name(type); + if (name == NULL) + return ERR_PTR(-EINVAL); + + len = gfs2_xattr_acl_get(ip, name, &data); + if (len < 0) + return ERR_PTR(len); + if (len == 0) + return NULL; + + acl = posix_acl_from_xattr(data, len); + kfree(data); + return acl; +} + +/** + * gfs2_check_acl - Check an ACL to see if we're allowed to do something + * @inode: the file we want to do something to + * @mask: what we want to do + * + * Returns: errno + */ + +int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + int error; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +static int gfs2_set_mode(struct inode *inode, mode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE; + iattr.ia_mode = mode; + + error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + } + + return error; +} + +static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) +{ + int error; + int len; + char *data; + const char *name = gfs2_acl_name(type); + + BUG_ON(name == NULL); + len = posix_acl_to_xattr(acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(acl, data, len); + if (error < 0) + goto out; + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (!error) + set_cached_acl(inode, type, acl); +out: + kfree(data); + return error; +} + +int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) +{ + struct 
gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct posix_acl *acl, *clone; + mode_t mode = inode->i_mode; + int error = 0; + + if (!sdp->sd_args.ar_posix_acl) + return 0; + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) { + mode &= ~current_umask(); + if (mode != inode->i_mode) + error = gfs2_set_mode(inode, mode); + return error; + } + + if (S_ISDIR(inode->i_mode)) { + error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); + if (error) + goto out; + } + + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out; + if (error == 0) + goto munge; + + error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); + if (error) + goto out; +munge: + error = gfs2_set_mode(inode, mode); +out: + posix_acl_release(acl); + return error; +} + +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) +{ + struct posix_acl *acl, *clone; + char *data; + unsigned int len; + int error; + + acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + return gfs2_setattr_simple(ip, attr); + + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + error = posix_acl_chmod_masq(acl, attr->ia_mode); + if (!error) { + len = posix_acl_to_xattr(acl, NULL, 0); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_acl_chmod(ip, attr, data); + kfree(data); + set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); + } + +out: + posix_acl_release(acl); + return error; +} + +static int gfs2_acl_type(const char *name) +{ + if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) + return ACL_TYPE_ACCESS; + if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) + return ACL_TYPE_DEFAULT; + return -EINVAL; +} + +static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int xtype) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct posix_acl *acl; + int type; + int error; + + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + + type = gfs2_acl_type(name); + if (type < 0) + return type; + + acl = gfs2_acl_get(GFS2_I(inode), type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int xtype) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct posix_acl *acl = NULL; + int error = 0, type; + + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + + type = gfs2_acl_type(name); + if (type < 0) + return type; + if (flags & XATTR_CREATE) + return -EINVAL; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!value) + goto set_acl; + + acl = posix_acl_from_xattr(value, size); + if (!acl) { + /* + * acl_set_file(3) may request that we set default ACLs with + * zero length -- defend (gracefully) against that here. 
+ */ + goto out; + } + if (IS_ERR(acl)) { + error = PTR_ERR(acl); + goto out; + } + + error = posix_acl_valid(acl); + if (error) + goto out_release; + + error = -EINVAL; + if (acl->a_count > GFS2_ACL_MAX_ENTRIES) + goto out_release; + + if (type == ACL_TYPE_ACCESS) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + posix_acl_release(acl); + acl = NULL; + + if (error < 0) + return error; + } + + error = gfs2_set_mode(inode, mode); + if (error) + goto out_release; + } + +set_acl: + error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); + if (!error) { + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); + } +out_release: + posix_acl_release(acl); +out: + return error; +} + +const struct xattr_handler gfs2_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .flags = GFS2_EATYPE_SYS, + .get = gfs2_xattr_system_get, + .set = gfs2_xattr_system_set, +}; + diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h new file mode 100644 index 00000000..a93907c8 --- /dev/null +++ b/fs/gfs2/acl.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __ACL_DOT_H__ +#define __ACL_DOT_H__ + +#include "incore.h" + +#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" +#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" +#define GFS2_ACL_MAX_ENTRIES 25 + +extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int); +extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); +extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); +extern const struct xattr_handler gfs2_xattr_system_handler; + +#endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c new file mode 100644 index 00000000..f9fbbe96 --- /dev/null +++ b/fs/gfs2/aops.c @@ -0,0 +1,1182 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" +#include "glops.h" + + +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); + gfs2_trans_add_bh(ip->i_gl, bh, 0); + } +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, bh_result, 0); + if (error) + return error; + if (!buffer_mapped(bh_result)) + return -EIO; + return 0; +} + +static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, bh_result, 0); +} + +/** + * gfs2_writepage_common - Common bits of writepage + * @page: The page to be written + * @wbc: The writeback control + * + * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. + */ + +static int gfs2_writepage_common(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (current->journal_info) + goto redirty; + /* Is the page fully outside i_size? 
(truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + goto out; + } + return 1; +redirty: + redirty_page_for_writepage(wbc, page); +out: + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepage - Write page for writeback mappings + * @page: The page + * @wbc: The writeback control + * + */ + +static int gfs2_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_ordered_writepage - Write page for ordered data files + * @page: The page to write + * @wbc: The writeback control + * + */ + +static int gfs2_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * __gfs2_jdata_writepage - The core of jdata writepage + * @page: The page to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * PageChecked will have been set and the transaction will have + * already been started before this is called. + */ + +static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (PageChecked(page)) { + ClearPageChecked(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + } + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + int ret; + int done_trans = 0; + + if (PageChecked(page)) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out_ignore; + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) + goto out_ignore; + done_trans = 1; + } + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); + if (done_trans) + gfs2_trans_end(sdp); + return ret; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * For the data=writeback case we can already ignore buffer heads + * and write whole extents at once. This is a big reduction in the + * number of I/O requests we send and the bmap calls we make in this case. 
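The writepage paths above drop pages that lie entirely beyond i_size (a truncate may be in progress): the last page that can still hold data is i_size >> PAGE_SHIFT, and a zero offset within it means even that page is empty. A small stand-alone sketch of the test (PAGE_SHIFT and the sample sizes are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int page_beyond_eof(unsigned long index, unsigned long long i_size)
{
	unsigned long end_index = (unsigned long)(i_size >> PAGE_SHIFT);
	unsigned long offset = (unsigned long)(i_size & (PAGE_SIZE - 1));

	return index > end_index || (index == end_index && !offset);
}

int main(void)
{
	unsigned long long i_size = 2 * PAGE_SIZE;	/* EOF on a page boundary */

	printf("page 1 beyond EOF: %d\n", page_beyond_eof(1, i_size));	/* 0 */
	printf("page 2 beyond EOF: %d\n", page_beyond_eof(2, i_size));	/* 1 */
	return 0;
}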
+ */ +static int gfs2_writeback_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +} + +/** + * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * @mapping: The mapping + * @wbc: The writeback control + * @writepage: The writepage function to call for each page + * @pvec: The vector of pages + * @nr_pages: The number of pages to write + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_pagevec(struct address_space *mapping, + struct writeback_control *wbc, + struct pagevec *pvec, + int nr_pages, pgoff_t end) +{ + struct inode *inode = mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset = i_size & (PAGE_CACHE_SIZE-1); + unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + int i; + int ret; + + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); + if (ret < 0) + return ret; + + for(i = 0; i < nr_pages; i++) { + struct page *page = pvec->pages[i]; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + ret = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + /* Is the page fully outside i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + continue; + } + + ret = __gfs2_jdata_writepage(page, wbc); + + if (ret || (--(wbc->nr_to_write) <= 0)) + ret = 1; + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * @writepage: The writepage function to call + * @data: The data to pass to writepage + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. 
+ */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; + int scanned = 0; + int range_whole = 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } + +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + scanned = 1; + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + + pagevec_release(&pvec); + cond_resched(); + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ + +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); + void *kaddr; + int error; + + /* + * Due to the order of unstuffing files and ->fault(), we can be + * asked for a zero page in the case of a stuffed file being extended, + * so we need to supply one here. It doesn't happen often. + */ + if (unlikely(page->index)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page, KM_USER0); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + brelse(dibh); + SetPageUptodate(page); + + return 0; +} + + +/** + * __gfs2_readpage - readpage + * @file: The file to read a page for + * @page: The page to read + * + * This is the core of gfs2's readpage. Its used by the internal file + * reading code as in that case we already hold the glock. Also its + * called by gfs2_readpage() once the required lock has been granted. 
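stuffed_readpage() above handles "stuffed" files, whose data lives inline in the dinode block: the inline bytes are copied into the page and the remainder of the page is zero-filled. A user-space sketch of that copy-and-pad step (sizes and names are illustrative):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static void fill_page_from_inline(char *page, const char *inline_data,
				  size_t dsize)
{
	if (dsize > PAGE_SIZE)
		dsize = PAGE_SIZE;	/* inline data never exceeds one page */
	memcpy(page, inline_data, dsize);
	memset(page + dsize, 0, PAGE_SIZE - dsize);	/* pad with zeroes */
}

int main(void)
{
	static char page[PAGE_SIZE];
	const char data[] = "hello, stuffed file";

	fill_page_from_inline(page, data, sizeof(data) - 1);
	printf("%s\n", page);
	return 0;
}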
+ * + */ + +static int __gfs2_readpage(void *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + int error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else { + error = mpage_readpage(page, gfs2_block_map); + } + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return error; +} + +/** + * gfs2_readpage - read a page of a file + * @file: The file to read + * @page: The page of the file + * + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder gh; + int error; + + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out; + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @ra_state: The readahead state (or NULL for no readahead) + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos / PAGE_CACHE_SIZE; + unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + void *p; + + do { + amt = size - copied; + if (offset + size > PAGE_CACHE_SIZE) + amt = PAGE_CACHE_SIZE - offset; + page = read_cache_page(mapping, index, __gfs2_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + p = kmap_atomic(page, KM_USER0); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p, KM_USER0); + mark_page_accessed(page); + page_cache_release(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We don't handle stuffed files here we let readpage do the honours. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
+ */ + +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (unlikely(ret)) + goto out_uninit; + if (!gfs2_is_stuffed(ip)) + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +} + +/** + * gfs2_write_begin - Begin to write to a file + * @file: The file to write to + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) + * + * Returns: errno + */ + +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + int alloc_required; + int error = 0; + struct gfs2_alloc *al = NULL; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + struct page *page; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } + + alloc_required = gfs2_write_alloc_required(ip, pos, len); + + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(al); + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = -ENOMEM; + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + + if (gfs2_is_stuffed(ip)) { + error = 0; + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) { + error = stuffed_readpage(ip, page); + } + goto out; + } + +prepare_write: + error = __block_write_begin(page, from, len, gfs2_block_map); +out: + if (error == 0) + return 0; + + unlock_page(page); + page_cache_release(page); + + gfs2_trans_end(sdp); + if (pos + len > ip->i_inode.i_size) + gfs2_trim_blocks(&ip->i_inode); + goto out_trans_fail; + +out_endtrans: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); +} + +/** + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. 
+ * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (copied) { + if (inode->i_size < to) + i_size_write(inode, to); + gfs2_dinode_out(ip, di); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + ip->i_gh.gh_flags |= GL_NOCACHE; + } + + brelse(dibh); + gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end + * @file: The file to write to + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. + * + * Returns: errno + */ + +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; + + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + ip->i_gh.gh_flags |= GL_NOCACHE; + } + + brelse(dibh); +failed: + gfs2_trans_end(sdp); + if (al) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return ret; +} + +/** + * gfs2_set_page_dirty - Page dirtying function + * @page: The page to dirty + * + * Returns: 1 if it dirtyed the page, or 0 otherwise + */ + +static int gfs2_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return 
__set_page_dirty_buffers(page); +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + clear_buffer_dirty(bh); + bd = bh->b_private; + if (bd) { + if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + gfs2_remove_from_journal(bh, current->journal_info, 0); + } + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned long offset) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct buffer_head *bh, *head; + unsigned long pos = 0; + + BUG_ON(!PageLocked(page)); + if (offset == 0) + ClearPageChecked(page); + if (!page_has_buffers(page)) + goto out; + + bh = head = page_buffers(page); + do { + if (offset <= pos) + gfs2_discard(sdp, bh); + pos += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +out: + if (offset == 0) + try_to_release_page(page, 0); +} + +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @rw: READ or WRITE + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a stuffed file makes any sense. For now we'll silently fall + * back to buffered I/O + */ + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset >= i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + +static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + /* + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. 
+ */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); + if (rv) + return rv; + rv = gfs2_ok_for_dio(ip, rw, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, gfs2_get_block_direct, + NULL, NULL, 0); +out: + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + return rv; +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. + * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct address_space *mapping = page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + + if (!page_has_buffers(page)) + return 0; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + head = bh = page_buffers(page); + do { + if (atomic_read(&bh->b_count)) + goto cannot_release; + bd = bh->b_private; + if (bd && bd->bd_ail) + goto cannot_release; + if (buffer_pinned(bh) || buffer_dirty(bh)) + goto not_possible; + bh = bh->b_this_page; + } while(bh != head); + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + head = bh = page_buffers(page); + do { + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); + if (!list_empty(&bd->bd_le.le_list)) { + if (!buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + bd = NULL; + } + if (bd) + bd->bd_bh = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + if (bd) + kmem_cache_free(gfs2_bufdata_cachep, bd); + + bh = bh->b_this_page; + } while (bh != head); + + return try_to_free_buffers(page); + +not_possible: /* Should never happen */ + WARN_ON(buffer_dirty(bh)); + WARN_ON(buffer_pinned(bh)); +cannot_release: + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + return 0; +} + +static const struct address_space_operations gfs2_writeback_aops = { + .writepage = gfs2_writeback_writepage, + .writepages = gfs2_writeback_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_ordered_aops = { + .writepage = gfs2_ordered_writepage, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + 
.invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void gfs2_set_aops(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_writeback(ip)) + inode->i_mapping->a_ops = &gfs2_writeback_aops; + else if (gfs2_is_ordered(ip)) + inode->i_mapping->a_ops = &gfs2_ordered_aops; + else if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + BUG(); +} + diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c new file mode 100644 index 00000000..e65493a8 --- /dev/null +++ b/fs/gfs2/bmap.c @@ -0,0 +1,1297 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "dir.h" +#include "util.h" +#include "trace_gfs2.h" + +/* This doesn't need to be that large as max 64 bit pointers in a 4k + * block is 512, so __u16 is fine for that. It saves stack space to + * keep it small. + */ +struct metapath { + struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; + __u16 mp_list[GFS2_MAX_META_HEIGHT]; +}; + +typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, __be64 *top, + __be64 *bottom, unsigned int height, + void *data); + +struct strip_mine { + int sm_first; + unsigned int sm_height; +}; + +/** + * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * @ip: the inode + * @dibh: the dinode buffer + * @block: the block number that was allocated + * @page: The (optional) page. This is looked up if @page is NULL + * + * Returns: errno + */ + +static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct page *page) +{ + struct inode *inode = &ip->i_inode; + struct buffer_head *bh; + int release = 0; + + if (!page || page->index) { + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + release = 1; + } + + if (!PageUptodate(page)) { + void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); + + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap(page); + + SetPageUptodate(page); + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Uptodate)); + + bh = page_buffers(page); + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); + + set_buffer_uptodate(bh); + if (!gfs2_is_jdata(ip)) + mark_buffer_dirty(bh); + if (!gfs2_is_writeback(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + if (release) { + unlock_page(page); + page_cache_release(page); + } + + return 0; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * @page: The (optional) page. 
This is looked up if the @page is NULL + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *bh, *dibh; + struct gfs2_dinode *di; + u64 block = 0; + int isdir = gfs2_is_dir(ip); + int error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (i_size_read(&ip->i_inode)) { + /* Get a free block, fill it with the stuffed data, + and write it out to disk */ + + unsigned int n = 1; + error = gfs2_alloc_block(ip, &block, &n); + if (error) + goto out_brelse; + if (isdir) { + gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); + error = gfs2_dir_get_new_buffer(ip, block, &bh); + if (error) + goto out_brelse; + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(bh); + } else { + error = gfs2_unstuffer_page(ip, dibh, block, page); + if (error) + goto out_brelse; + } + } + + /* Set up the pointer to the new block */ + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + if (i_size_read(&ip->i_inode)) { + *(__be64 *)(di + 1) = cpu_to_be64(block); + gfs2_add_inode_blocks(&ip->i_inode, 1); + di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + } + + ip->i_height = 1; + di->di_height = cpu_to_be16(1); + +out_brelse: + brelse(dibh); +out: + up_write(&ip->i_rw_mutex); + return error; +} + + +/** + * find_metapath - Find path through the metadata tree + * @sdp: The superblock + * @mp: The metapath to return the result in + * @block: The disk block to look up + * @height: The pre-calculated height of the metadata tree + * + * This routine returns a struct metapath structure that defines a path + * through the metadata of inode "ip" to get to block "block". + * + * Example: + * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a + * filesystem with a blocksize of 4096. + * + * find_metapath() would return a struct metapath structure set to: + * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, + * and mp_list[2] = 165. + * + * That means that in order to get to the block containing the byte at + * offset 101342453, we would load the indirect block pointed to by pointer + * 0 in the dinode. We would then load the indirect block pointed to by + * pointer 48 in that indirect block. We would then load the data block + * pointed to by pointer 165 in that indirect block. 
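+ *
+ * (A quick check of those numbers, using the 4096-byte blocksize from the
+ * example above: an indirect block holds 512 eight-byte pointers, so byte
+ * offset 101342453 falls in logical block 101342453 / 4096 = 24741, and
+ * 24741 = 0*512*512 + 48*512 + 165, matching mp_list[0..2] = {0, 48, 165}.)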
+ * + * ---------------------------------------- + * | Dinode | | + * | | 4| + * | |0 1 2 3 4 5 9| + * | | 6| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 5| + * | 4 4 4 4 4 5 5 1| + * |0 5 6 7 8 9 0 1 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 1 1 1 1 1 5| + * | 6 6 6 6 6 1| + * |0 3 4 5 6 7 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Data block containing offset | + * | 101342453 | + * | | + * | | + * ---------------------------------------- + * + */ + +static void find_metapath(const struct gfs2_sbd *sdp, u64 block, + struct metapath *mp, unsigned int height) +{ + unsigned int i; + + for (i = height; i--;) + mp->mp_list[i] = do_div(block, sdp->sd_inptrs); + +} + +static inline unsigned int metapath_branch_start(const struct metapath *mp) +{ + if (mp->mp_list[0] == 0) + return 2; + return 1; +} + +/** + * metapointer - Return pointer to start of metadata in a buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + * + * Return a pointer to the block number of the next height of the metadata + * tree given a buffer containing the pointer to the current height of the + * metadata tree. + */ + +static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + unsigned int head_size = (height > 0) ? + sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); + return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; +} + +/** + * lookup_metapath - Walk the metadata tree to a specific point + * @ip: The inode + * @mp: The metapath + * + * Assumes that the inode's buffer has already been looked up and + * hooked onto mp->mp_bh[0] and that the metapath has been initialised + * by find_metapath(). + * + * If this function encounters part of the tree which has not been + * allocated, it returns the current height of the tree at the point + * at which it found the unallocated block. Blocks which are found are + * added to the mp->mp_bh[] list. + * + * Returns: error or height of metadata tree + */ + +static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) +{ + unsigned int end_of_metadata = ip->i_height - 1; + unsigned int x; + __be64 *ptr; + u64 dblock; + int ret; + + for (x = 0; x < end_of_metadata; x++) { + ptr = metapointer(x, mp); + dblock = be64_to_cpu(*ptr); + if (!dblock) + return x + 1; + + ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); + if (ret) + return ret; + } + + return ip->i_height; +} + +static inline void release_metapath(struct metapath *mp) +{ + int i; + + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + } +} + +/** + * gfs2_extent_length - Returns length of an extent of blocks + * @start: Start of the buffer + * @len: Length of the buffer in bytes + * @ptr: Current position in the buffer + * @limit: Max extent length to return (0 = unlimited) + * @eob: Set to 1 if we hit "end of block" + * + * If the first block is zero (unallocated) it will return the number of + * unallocated blocks in the extent, otherwise it will return the number + * of contiguous blocks in the extent. 
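+ * For example (hypothetical pointer values): if @ptr points at a run of
+ * pointers 100, 101, 102, 0 then the returned length is 3; if it points
+ * at a run of zeros, the number of consecutive zero pointers is returned
+ * instead.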
+ * + * Returns: The length of the extent (minimum of one block) + */ + +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +{ + const __be64 *end = (start + len); + const __be64 *first = ptr; + u64 d = be64_to_cpu(*ptr); + + *eob = 0; + do { + ptr++; + if (ptr >= end) + break; + if (limit && --limit == 0) + break; + if (d) + d++; + } while(be64_to_cpu(*ptr) == d); + if (ptr >= end) + *eob = 1; + return (ptr - first); +} + +static inline void bmap_lock(struct gfs2_inode *ip, int create) +{ + if (create) + down_write(&ip->i_rw_mutex); + else + down_read(&ip->i_rw_mutex); +} + +static inline void bmap_unlock(struct gfs2_inode *ip, int create) +{ + if (create) + up_write(&ip->i_rw_mutex); + else + up_read(&ip->i_rw_mutex); +} + +static inline __be64 *gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) +{ + __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + + ((i > 1) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode))); + BUG_ON(i < 1); + BUG_ON(mp->mp_bh[i] != NULL); + mp->mp_bh[i] = gfs2_meta_new(gl, bn); + gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); + gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); + ptr += offset; + *ptr = cpu_to_be64(bn); + return ptr; +} + +enum alloc_state { + ALLOC_DATA = 0, + ALLOC_GROW_DEPTH = 1, + ALLOC_GROW_HEIGHT = 2, + /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ +}; + +/** + * gfs2_bmap_alloc - Build a metadata tree of the requested height + * @inode: The GFS2 inode + * @lblock: The logical starting block of the extent + * @bh_map: This is used to return the mapping details + * @mp: The metapath + * @sheight: The starting height (i.e. whats already mapped) + * @height: The height to build to + * @maxlen: The max number of data blocks to alloc + * + * In this routine we may have to alloc: + * i) Indirect blocks to grow the metadata tree height + * ii) Indirect blocks to fill in lower part of the metadata tree + * iii) Data blocks + * + * The function is in two parts. The first part works out the total + * number of blocks which we need. The second part does the actual + * allocation asking for an extent at a time (if enough contiguous free + * blocks are available, there will only be one request per bmap call) + * and uses the state machine to initialise the blocks in order. 
+ * + * Returns: errno on error + */ + +static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, + struct buffer_head *bh_map, struct metapath *mp, + const unsigned int sheight, + const unsigned int height, + const unsigned int maxlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh = mp->mp_bh[0]; + u64 bn, dblock = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; + unsigned dblks = 0; + unsigned ptrs_per_blk; + const unsigned end_of_metadata = height - 1; + int eob = 0; + enum alloc_state state; + __be64 *ptr; + __be64 zero_bn = 0; + + BUG_ON(sheight < 1); + BUG_ON(dibh == NULL); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (height == sheight) { + struct buffer_head *bh; + /* Bottom indirect block exists, find unalloced extent size */ + ptr = metapointer(end_of_metadata, mp); + bh = mp->mp_bh[end_of_metadata]; + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, + &eob); + BUG_ON(dblks < 1); + state = ALLOC_DATA; + } else { + /* Need to allocate indirect blocks */ + ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + if (height == ip->i_height) { + /* Writing into existing tree, extend tree down */ + iblks = height - sheight; + state = ALLOC_GROW_DEPTH; + } else { + /* Building up tree height */ + state = ALLOC_GROW_HEIGHT; + iblks = height - ip->i_height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); + } + } + + /* start of the second part of the function (state machine) */ + + blks = dblks + iblks; + i = sheight; + do { + int error; + n = blks - alloced; + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return error; + alloced += n; + if (state != ALLOC_DATA || gfs2_is_jdata(ip)) + gfs2_trans_add_unrevoke(sdp, bn, n); + switch (state) { + /* Growing height of tree */ + case ALLOC_GROW_HEIGHT: + if (i == 1) { + ptr = (__be64 *)(dibh->b_data + + sizeof(struct gfs2_dinode)); + zero_bn = *ptr; + } + for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); + if (i - 1 == height - ip->i_height) { + i--; + gfs2_buffer_copy_tail(mp->mp_bh[i], + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + gfs2_buffer_clear_tail(dibh, + sizeof(struct gfs2_dinode) + + sizeof(__be64)); + ptr = (__be64 *)(mp->mp_bh[i]->b_data + + sizeof(struct gfs2_meta_header)); + *ptr = zero_bn; + state = ALLOC_GROW_DEPTH; + for(i = branch_start; i < height; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } + i = branch_start; + } + if (n == 0) + break; + /* Branching from existing tree */ + case ALLOC_GROW_DEPTH: + if (i > 1 && i < height) + gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); + for (; i < height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, + mp->mp_list[i-1], bn++); + if (i == height) + state = ALLOC_DATA; + if (n == 0) + break; + /* Tree complete, adding data blocks */ + case ALLOC_DATA: + BUG_ON(n > dblks); + BUG_ON(mp->mp_bh[end_of_metadata] == NULL); + gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); + dblks = n; + ptr = metapointer(end_of_metadata, mp); + dblock = bn; + while (n-- > 0) + *ptr++ = cpu_to_be64(bn++); + break; + } + } while ((state != ALLOC_DATA) || !dblock); + + ip->i_height = height; + gfs2_add_inode_blocks(&ip->i_inode, alloced); + gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); + map_bh(bh_map, inode->i_sb, dblock); + 
	bh_map->b_size = dblks << inode->i_blkbits;
+	set_buffer_new(bh_map);
+	return 0;
+}
+
+/**
+ * gfs2_block_map - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @bh_map: The bh to be mapped
+ * @create: True if it's ok to allocate blocks to satisfy the request
+ *
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
+ *
+ * Returns: errno
+ */
+
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+		   struct buffer_head *bh_map, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned int bsize = sdp->sd_sb.sb_bsize;
+	const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+	const u64 *arr = sdp->sd_heightsize;
+	__be64 *ptr;
+	u64 size;
+	struct metapath mp;
+	int ret;
+	int eob;
+	unsigned int len;
+	struct buffer_head *bh;
+	u8 height;
+
+	BUG_ON(maxlen == 0);
+
+	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	bmap_lock(ip, create);
+	clear_buffer_mapped(bh_map);
+	clear_buffer_new(bh_map);
+	clear_buffer_boundary(bh_map);
+	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
+	if (gfs2_is_dir(ip)) {
+		bsize = sdp->sd_jbsize;
+		arr = sdp->sd_jheightsize;
+	}
+
+	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+	if (ret)
+		goto out;
+
+	height = ip->i_height;
+	size = (lblock + 1) * bsize;
+	while (size > arr[height])
+		height++;
+	find_metapath(sdp, lblock, &mp, height);
+	ret = 1;
+	if (height > ip->i_height || gfs2_is_stuffed(ip))
+		goto do_alloc;
+	ret = lookup_metapath(ip, &mp);
+	if (ret < 0)
+		goto out;
+	if (ret != ip->i_height)
+		goto do_alloc;
+	ptr = metapointer(ip->i_height - 1, &mp);
+	if (*ptr == 0)
+		goto do_alloc;
+	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+	bh = mp.mp_bh[ip->i_height - 1];
+	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
+	bh_map->b_size = (len << inode->i_blkbits);
+	if (eob)
+		set_buffer_boundary(bh_map);
+	ret = 0;
+out:
+	release_metapath(&mp);
+	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
+	bmap_unlock(ip, create);
+	return ret;
+
+do_alloc:
+	/* All allocations are done here, firstly check create flag */
+	if (!create) {
+		BUG_ON(gfs2_is_stuffed(ip));
+		ret = 0;
+		goto out;
+	}
+
+	/* At this point ret is the tree depth of already allocated blocks */
+	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
+	goto out;
+}
+
+/*
+ * Deprecated: do not use in new code
+ */
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
+	int ret;
+	int create = *new;
+
+	BUG_ON(!extlen);
+	BUG_ON(!dblock);
+	BUG_ON(!new);
+
+	bh.b_size = 1 << (inode->i_blkbits + (create ?
0 : 5)); + ret = gfs2_block_map(inode, lblock, &bh, create); + *extlen = bh.b_size >> inode->i_blkbits; + *dblock = bh.b_blocknr; + if (buffer_new(&bh)) + *new = 1; + else + *new = 0; + return ret; +} + +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @bc: the call to make for each piece of metadata + * @data: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, block_call_t bc, + void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = bc(ip, dibh, bh, top, bottom, height, data); + if (error) + goto out; + + if (height < ip->i_height - 1) + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, bc, data); + if (error) + break; + } + +out: + brelse(bh); + return error; +} + +/** + * do_strip - Look for a layer a particular layer of the file and strip it off + * @ip: the inode + * @dibh: the dinode buffer + * @bh: A buffer of pointers + * @top: The first pointer in the buffer + * @bottom: One more than the last pointer + * @height: the height this buffer is at + * @data: a pointer to a struct strip_mine + * + * Returns: errno + */ + +static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, __be64 *top, __be64 *bottom, + unsigned int height, void *data) +{ + struct strip_mine *sm = data; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + u64 bn, bstart; + u32 blen, btotal; + __be64 *p; + unsigned int rg_blocks = 0; + int metadata; + unsigned int revokes = 0; + int x; + int error = 0; + + if (!*top) + sm->sm_first = 0; + + if (height != sm->sm_height) + return 0; + + if (sm->sm_first) { + top++; + sm->sm_first = 0; + } + + metadata = (height != ip->i_height - 1); + if (metadata) + revokes = (height) ? 
sdp->sd_inptrs : sdp->sd_diptrs; + else if (ip->i_depth) + revokes = sdp->sd_inptrs; + + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + else if (!sdp->sd_rgrps) + error = gfs2_ri_update(ip); + + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + + bstart = bn; + blen = 1; + } + } + + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; /* Nothing to do */ + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + + RES_INDIRECT + RES_STATFS + RES_QUOTA, + revokes); + if (error) + goto out_rg_gunlock; + + down_write(&ip->i_rw_mutex); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + bstart = 0; + blen = 0; + btotal = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) { + if (metadata) + __gfs2_free_meta(ip, bstart, blen); + else + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; + } + + bstart = bn; + blen = 1; + } + + *p = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) { + if (metadata) + __gfs2_free_meta(ip, bstart, blen); + else + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; + } + + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + + gfs2_dinode_out(ip, dibh->b_data); + + up_write(&ip->i_rw_mutex); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); +out: + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); + return error; +} + +/** + * gfs2_block_truncate_page - Deal with zeroing out data for truncate + * + * This is partly borrowed from ext3. + */ +static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct buffer_head *bh; + struct page *page; + int err; + + page = grab_cache_page(mapping, index); + if (!page) + return 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. 
Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + err = 0; + } + + if (!gfs2_is_writeback(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + zero_user(page, offset, length); + mark_buffer_dirty(bh); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct buffer_head *dibh; + int journaled = gfs2_is_jdata(ip); + int error; + + error = gfs2_trans_begin(sdp, + RES_DINODE + (journaled ? RES_JDATA : 0), 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); + } else { + if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { + error = gfs2_block_truncate_page(mapping, newsize); + if (error) + goto out_brelse; + } + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; + } + + i_size_write(inode, newsize); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + truncate_pagecache(inode, oldsize, newsize); +out_brelse: + brelse(dibh); +out: + gfs2_trans_end(sdp); + return error; +} + +static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int height = ip->i_height; + u64 lblock; + struct metapath mp; + int error; + + if (!size) + lblock = 0; + else + lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; + + find_metapath(sdp, lblock, &mp, ip->i_height); + if (!gfs2_alloc_get(ip)) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + while (height--) { + struct strip_mine sm; + sm.sm_first = !!size; + sm.sm_height = height; + + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + if (error) + break; + } + + gfs2_quota_unhold(ip); + +out: + gfs2_alloc_put(ip); + return error; +} + +static int trunc_end(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (!i_size_read(&ip->i_inode)) { + ip->i_height = 0; + ip->i_goal = ip->i_no_addr; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + } + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +out: + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + return error; +} + +/** + * do_shrink - make a file smaller + * @inode: the inode + * @oldsize: the current inode size + * @newsize: the size to make the file + * + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. 
+ * + * Returns: errno + */ + +static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = trunc_start(inode, oldsize, newsize); + if (error < 0) + return error; + if (gfs2_is_stuffed(ip)) + return 0; + + error = trunc_dealloc(ip, newsize); + if (error == 0) + error = trunc_end(ip); + + return error; +} + +void gfs2_trim_blocks(struct inode *inode) +{ + u64 size = inode->i_size; + int ret; + + ret = do_shrink(inode, size, size); + WARN_ON(ret != 0); +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occurring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = NULL; + int error; + + if (gfs2_is_stuffed(ip) && + (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + al = gfs2_alloc_get(ip); + if (al == NULL) + return -ENOMEM; + + error = gfs2_quota_lock_check(ip); + if (error) + goto do_grow_alloc_put; + + al->al_requested = 1; + error = gfs2_inplace_reserve(ip); + if (error) + goto do_grow_qunlock; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); + if (error) + goto do_grow_release; + + if (al) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto do_end_trans; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto do_end_trans; + + i_size_write(inode, size); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +do_end_trans: + gfs2_trans_end(sdp); +do_grow_release: + if (al) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); +do_grow_alloc_put: + gfs2_alloc_put(ip); + } + return error; +} + +/** + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file + * + * The file size can grow, shrink, or stay the same size. This + * is called holding i_mutex and an exclusive glock on the inode + * in question. 
+ * + * Returns: errno + */ + +int gfs2_setattr_size(struct inode *inode, u64 newsize) +{ + int ret; + u64 oldsize; + + BUG_ON(!S_ISREG(inode->i_mode)); + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + oldsize = inode->i_size; + if (newsize >= oldsize) + return do_grow(inode, newsize); + + return do_shrink(inode, oldsize, newsize); +} + +int gfs2_truncatei_resume(struct gfs2_inode *ip) +{ + int error; + error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); + if (!error) + error = trunc_end(ip); + return error; +} + +int gfs2_file_dealloc(struct gfs2_inode *ip) +{ + return trunc_dealloc(ip, 0); +} + +/** + * gfs2_write_alloc_required - figure out if a write will require an allocation + * @ip: the file being written to + * @offset: the offset to write to + * @len: the number of bytes being written + * + * Returns: 1 if an alloc is required, 0 otherwise + */ + +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head bh; + unsigned int shift; + u64 lblock, lblock_stop, size; + u64 end_of_file; + + if (!len) + return 0; + + if (gfs2_is_stuffed(ip)) { + if (offset + len > + sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return 1; + return 0; + } + + shift = sdp->sd_sb.sb_bsize_shift; + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) + return 1; + + size = (lblock_stop - lblock) << shift; + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(&ip->i_inode, lblock, &bh, 0); + if (!buffer_mapped(&bh)) + return 1; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + return 0; +} + diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h new file mode 100644 index 00000000..42fea03e --- /dev/null +++ b/fs/gfs2/bmap.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __BMAP_DOT_H__ +#define __BMAP_DOT_H__ + +#include "inode.h" + +struct inode; +struct gfs2_inode; +struct page; + + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, + u64 *dblock, unsigned *extlen); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern void gfs2_trim_blocks(struct inode *inode); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); + +#endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c new file mode 100644 index 00000000..0da8da2c --- /dev/null +++ b/fs/gfs2/dentry.c @@ -0,0 +1,141 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "super.h" +#include "util.h" +#include "inode.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @nd: + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. 
+ * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip = NULL; + int error; + int had_lock = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(parent->d_inode); + dip = GFS2_I(parent->d_inode); + inode = dentry->d_inode; + + if (inode) { + if (is_bad_inode(inode)) + goto invalid; + ip = GFS2_I(inode); + } + + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto valid; + + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } + + error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + +valid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +valid: + dput(parent); + return 1; + +invalid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +invalid: + if (inode && S_ISDIR(inode->i_mode)) { + if (have_submounts(dentry)) + goto valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + dput(parent); + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); +fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +static int gfs2_dentry_delete(const struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (!dentry->d_inode) + return 0; + + ginode = GFS2_I(dentry->d_inode); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + +const struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, +}; + diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c new file mode 100644 index 00000000..091ee477 --- /dev/null +++ b/fs/gfs2/dir.c @@ -0,0 +1,2006 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Implements Extendible Hashing as described in: + * "Extendible Hashing" by Fagin, et al in + * __ACM Trans. on Database Systems__, Sept 1979. + * + * + * Here's the layout of dirents which is essentially the same as that of ext2 + * within a single block. The field de_name_len is the number of bytes + * actually required for the name (no null terminator). The field de_rec_len + * is the number of bytes allocated to the dirent. The offset of the next + * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is + * deleted, the preceding dirent inherits its allocated space, ie + * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained + * by adding de_rec_len to the current dirent, this essentially causes the + * deleted dirent to get jumped over when iterating through all the dirents. 
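+ *
+ * For example (hypothetical entries): if a block holds dirents A, B and C
+ * in that order and B is deleted, then A->de_rec_len grows by B's
+ * de_rec_len, so stepping by de_rec_len goes straight from A to C, and
+ * B's bytes can later be handed out again by splitting A's now over-sized
+ * record when a new dirent is allocated.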
+ * + * When deleting the first dirent in a block, there is no previous dirent so + * the field de_ino is set to zero to designate it as deleted. When allocating + * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the + * first dirent has (de_ino == 0) and de_rec_len is large enough, this first + * dirent is allocated. Otherwise it must go through all the 'used' dirents + * searching for one in which the amount of total space minus the amount of + * used space will provide enough space for the new dirent. + * + * There are two types of blocks in which dirents reside. In a stuffed dinode, + * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of + * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the + * beginning of the leaf block. The dirents reside in leaves when + * + * dip->i_diskflags & GFS2_DIF_EXHASH is true + * + * Otherwise, the dirents are "linear", within a single stuffed dinode block. + * + * When the dirents are in leaves, the actual contents of the directory file are + * used as an array of 64-bit block pointers pointing to the leaf blocks. The + * dirents are NOT in the directory file itself. There can be more than one + * block pointer in the array that points to the same leaf. In fact, when a + * directory is first converted from linear to exhash, all of the pointers + * point to the same leaf. + * + * When a leaf is completely full, the size of the hash table can be + * doubled unless it is already at the maximum size which is hard coded into + * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, + * but never before the maximum hash table size has been reached. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "bmap.h" +#include "util.h" + +#define IS_LEAF 1 /* Hashed (leaf) directory */ +#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ + +#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) + +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + +typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, + const struct qstr *name, void *opaque); + +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + *bhp = bh; + return 0; +} + +static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + int error; + + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + if (error) + return error; + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { + brelse(bh); + return -EIO; + } + *bhp = bh; + return 0; +} + +static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, + unsigned int offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); + if (ip->i_inode.i_size < offset + size) + 
i_size_write(&ip->i_inode, offset + size); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + brelse(dibh); + + return size; +} + + + +/** + * gfs2_dir_write_data - Write directory information to the inode + * @ip: The GFS2 inode + * @buf: The buffer containing information to be written + * @offset: The file offset to start writing at + * @size: The amount of data to write + * + * Returns: The number of bytes correctly written or error code + */ +static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, + u64 offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + int new = 0; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip) && + offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + return error; + } + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 1; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error) + goto fail; + error = -EIO; + if (gfs2_assert_withdraw(sdp, dblock)) + goto fail; + } + + if (amount == sdp->sd_jbsize || new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + + if (error) + goto fail; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + memcpy(bh->b_data + o, buf, amount); + brelse(bh); + + buf += amount; + copied += amount; + lblock++; + dblock++; + extlen--; + + o = sizeof(struct gfs2_meta_header); + } + +out: + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + + return copied; +fail: + if (copied) + goto out; + return error; +} + +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, + u64 offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + offset += sizeof(struct gfs2_dinode); + memcpy(buf, dibh->b_data + offset, size); + brelse(dibh); + } + + return (error) ? 
error : size; +} + + +/** + * gfs2_dir_read_data - Read a data from a directory inode + * @ip: The GFS2 Inode + * @buf: The buffer to place result into + * @offset: File offset to begin jdata_readng from + * @size: Amount of data to transfer + * + * Returns: The amount of data actually copied or the error + */ +static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, + unsigned int size, unsigned ra) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + u64 disksize = i_size_read(&ip->i_inode); + + if (offset >= disksize) + return 0; + + if (offset + size > disksize) + size = disksize - offset; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip)) + return gfs2_dir_read_stuffed(ip, buf, offset, size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 0; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error || !dblock) + goto fail; + BUG_ON(extlen < 1); + if (!ra) + extlen = 1; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + } else { + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + if (error) + goto fail; + } + error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD); + if (error) { + brelse(bh); + goto fail; + } + dblock++; + extlen--; + memcpy(buf, bh->b_data + o, amount); + brelse(bh); + buf += amount; + copied += amount; + lblock++; + o = sizeof(struct gfs2_meta_header); + } + + return copied; +fail: + return (copied) ? copied : error; +} + +static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) +{ + return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; +} + +static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, int ret) +{ + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) == name->hash && + be16_to_cpu(dent->de_name_len) == name->len && + memcmp(dent+1, name->name, name->len) == 0) + return ret; + return 0; +} + +static int gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 1); +} + +static int gfs2_dirent_prev(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 2); +} + +/* + * name->name holds ptr to start of block. + * name->len holds size of block. 
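+ * This lets gfs2_dirent_last() identify the final dirent: the one whose
+ * de_rec_len runs exactly up to the end of the block.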
+ */ +static int gfs2_dirent_last(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + const char *start = name->name; + const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len); + if (name->len == (end - start)) + return 1; + return 0; +} + +static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (totlen - actual >= required) + return 1; + return 0; +} + +struct dirent_gather { + const struct gfs2_dirent **pdent; + unsigned offset; +}; + +static int gfs2_dirent_gather(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + struct dirent_gather *g = opaque; + if (!gfs2_dirent_sentinel(dent)) { + g->pdent[g->offset++] = dent; + } + return 0; +} + +/* + * Other possible things to check: + * - Inode located within filesystem size (and on valid block) + * - Valid directory entry type + * Not sure how heavy-weight we want to make this... could also check + * hash is correct for example, but that would take a lot of extra time. + * For now the most important thing is to check that the various sizes + * are correct. + */ +static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, + unsigned int size, unsigned int len, int first) +{ + const char *msg = "gfs2_dirent too small"; + if (unlikely(size < sizeof(struct gfs2_dirent))) + goto error; + msg = "gfs2_dirent misaligned"; + if (unlikely(offset & 0x7)) + goto error; + msg = "gfs2_dirent points beyond end of block"; + if (unlikely(offset + size > len)) + goto error; + msg = "zero inode number"; + if (unlikely(!first && gfs2_dirent_sentinel(dent))) + goto error; + msg = "name length is greater than space in dirent"; + if (!gfs2_dirent_sentinel(dent) && + unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) > + size)) + goto error; + return 0; +error: + printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, + first ? 
"first in block" : "not first in block"); + return -EIO; +} + +static int gfs2_dirent_offset(const void *buf) +{ + const struct gfs2_meta_header *h = buf; + int offset; + + BUG_ON(buf == NULL); + + switch(be32_to_cpu(h->mh_type)) { + case GFS2_METATYPE_LF: + offset = sizeof(struct gfs2_leaf); + break; + case GFS2_METATYPE_DI: + offset = sizeof(struct gfs2_dinode); + break; + default: + goto wrong_type; + } + return offset; +wrong_type: + printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", + be32_to_cpu(h->mh_type)); + return -1; +} + +static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, + unsigned int len, gfs2_dscan_t scan, + const struct qstr *name, + void *opaque) +{ + struct gfs2_dirent *dent, *prev; + unsigned offset; + unsigned size; + int ret = 0; + + ret = gfs2_dirent_offset(buf); + if (ret < 0) + goto consist_inode; + + offset = ret; + prev = NULL; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 1)) + goto consist_inode; + do { + ret = scan(dent, name, opaque); + if (ret) + break; + offset += size; + if (offset == len) + break; + prev = dent; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 0)) + goto consist_inode; + } while(1); + + switch(ret) { + case 0: + return NULL; + case 1: + return dent; + case 2: + return prev ? prev : dent; + default: + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + +consist_inode: + gfs2_consist_inode(GFS2_I(inode)); + return ERR_PTR(-EIO); +} + +static int dirent_check_reclen(struct gfs2_inode *dip, + const struct gfs2_dirent *d, const void *end_p) +{ + const void *ptr = d; + u16 rec_len = be16_to_cpu(d->de_rec_len); + + if (unlikely(rec_len < sizeof(struct gfs2_dirent))) + goto broken; + ptr += rec_len; + if (ptr < end_p) + return rec_len; + if (ptr == end_p) + return -ENOENT; +broken: + gfs2_consist_inode(dip); + return -EIO; +} + +/** + * dirent_next - Next dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * Returns: 0 on success, error code otherwise + */ + +static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_dirent *cur = *dent, *tmp; + char *bh_end = bh->b_data + bh->b_size; + int ret; + + ret = dirent_check_reclen(dip, cur, bh_end); + if (ret < 0) + return ret; + + tmp = (void *)cur + ret; + ret = dirent_check_reclen(dip, tmp, bh_end); + if (ret == -EIO) + return ret; + + /* Only the first dent could ever have de_inum.no_addr == 0 */ + if (gfs2_dirent_sentinel(tmp)) { + gfs2_consist_inode(dip); + return -EIO; + } + + *dent = tmp; + return 0; +} + +/** + * dirent_del - Delete a dirent + * @dip: The GFS2 inode + * @bh: The buffer + * @prev: The previous dirent + * @cur: The current dirent + * + */ + +static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur) +{ + u16 cur_rec_len, prev_rec_len; + + if (gfs2_dirent_sentinel(cur)) { + gfs2_consist_inode(dip); + return; + } + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + + /* If there is no prev entry, this is the first entry in the block. + The de_rec_len is already as big as it needs to be. Just zero + out the inode number and return. */ + + if (!prev) { + cur->de_inum.no_addr = 0; + cur->de_inum.no_formal_ino = 0; + return; + } + + /* Combine this dentry with the previous one. 
*/ + + prev_rec_len = be16_to_cpu(prev->de_rec_len); + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)prev + prev_rec_len != (char *)cur) + gfs2_consist_inode(dip); + if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size) + gfs2_consist_inode(dip); + + prev_rec_len += cur_rec_len; + prev->de_rec_len = cpu_to_be16(prev_rec_len); +} + +/* + * Takes a dent from which to grab space as an argument. Returns the + * newly created dent. + */ +static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned offset = 0, totlen; + + if (!gfs2_dirent_sentinel(dent)) + offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + +static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name) +{ + struct gfs2_dirent *dent; + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_find_space, name, NULL); + if (!dent || IS_ERR(dent)) + return dent; + return gfs2_init_dirent(inode, dent, name, bh); +} + +static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, + struct buffer_head **bhp) +{ + int error; + + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { + /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + error = -EIO; + } + + return error; +} + +/** + * get_leaf_nr - Get a leaf number associated with the index + * @dip: The GFS2 inode + * @index: + * @leaf_out: + * + * Returns: 0 on success, error code otherwise + */ + +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, + u64 *leaf_out) +{ + __be64 leaf_no; + int error; + + error = gfs2_dir_read_data(dip, (char *)&leaf_no, + index * sizeof(__be64), + sizeof(__be64), 0); + if (error != sizeof(u64)) + return (error < 0) ? error : -EIO; + + *leaf_out = be64_to_cpu(leaf_no); + + return 0; +} + +static int get_first_leaf(struct gfs2_inode *dip, u32 index, + struct buffer_head **bh_out) +{ + u64 leaf_no; + int error; + + error = get_leaf_nr(dip, index, &leaf_no); + if (!error) + error = get_leaf(dip, leaf_no, bh_out); + + return error; +} + +static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, + const struct qstr *name, + gfs2_dscan_t scan, + struct buffer_head **pbh) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf; + unsigned hsize = 1 << ip->i_depth; + unsigned index; + u64 ln; + if (hsize * sizeof(u64) != i_size_read(inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &bh); + if (error) + return ERR_PTR(error); + do { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + scan, name, NULL); + if (dent) + goto got_dent; + leaf = (struct gfs2_leaf *)bh->b_data; + ln = be64_to_cpu(leaf->lf_next); + brelse(bh); + if (!ln) + break; + + error = get_leaf(ip, ln, &bh); + } while(!error); + + return error ? 
ERR_PTR(error) : NULL; + } + + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return ERR_PTR(error); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); +got_dent: + if (unlikely(dent == NULL || IS_ERR(dent))) { + brelse(bh); + bh = NULL; + } + *pbh = bh; + return dent; +} + +static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int n = 1; + u64 bn; + int error; + struct buffer_head *bh; + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + struct qstr name = { .name = "", .len = 0, .hash = 0 }; + + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); + if (!bh) + return NULL; + + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_depth = cpu_to_be16(depth); + leaf->lf_entries = 0; + leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); + leaf->lf_next = 0; + memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + dent = (struct gfs2_dirent *)(leaf+1); + gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); + *pbh = bh; + return leaf; +} + +/** + * dir_make_exhash - Convert a stuffed directory into an ExHash directory + * @dip: The GFS2 inode + * + * Returns: 0 on success, error code otherwise + */ + +static int dir_make_exhash(struct inode *inode) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_dirent *dent; + struct qstr args; + struct buffer_head *bh, *dibh; + struct gfs2_leaf *leaf; + int y; + u32 x; + __be64 *lp; + u64 bn; + int error; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + /* Turn over a new leaf */ + + leaf = new_leaf(inode, &bh, 0); + if (!leaf) + return -ENOSPC; + bn = bh->b_blocknr; + + gfs2_assert(sdp, dip->i_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); + + /* Copy dirents */ + + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh, + sizeof(struct gfs2_dinode)); + + /* Find last entry */ + + x = 0; + args.len = bh->b_size - sizeof(struct gfs2_dinode) + + sizeof(struct gfs2_leaf); + args.name = bh->b_data; + dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size, + gfs2_dirent_last, &args, NULL); + if (!dent) { + brelse(bh); + brelse(dibh); + return -EIO; + } + if (IS_ERR(dent)) { + brelse(bh); + brelse(dibh); + return PTR_ERR(dent); + } + + /* Adjust the last dirent's record length + (Remember that dent still points to the last entry.) */ + + dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) + + sizeof(struct gfs2_dinode) - + sizeof(struct gfs2_leaf)); + + brelse(bh); + + /* We're done with the new leaf block, now setup the new + hash table. 
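+ Every slot in the fresh hash table points at the one leaf we just built;
+ splitting that leaf later on will spread the pointers across more leaves.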
*/ + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); + + for (x = sdp->sd_hash_ptrs; x--; lp++) + *lp = cpu_to_be64(bn); + + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); + gfs2_add_inode_blocks(&dip->i_inode, 1); + dip->i_diskflags |= GFS2_DIF_EXHASH; + + for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; + dip->i_depth = y; + + gfs2_dinode_out(dip, dibh->b_data); + + brelse(dibh); + + return 0; +} + +/** + * dir_split_leaf - Split a leaf block into two + * @dip: The GFS2 inode + * @index: + * @leaf_no: + * + * Returns: 0 on success, error code on failure + */ + +static int dir_split_leaf(struct inode *inode, const struct qstr *name) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct buffer_head *nbh, *obh, *dibh; + struct gfs2_leaf *nleaf, *oleaf; + struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new; + u32 start, len, half_len, divider; + u64 bn, leaf_no; + __be64 *lp; + u32 index; + int x, moved = 0; + int error; + + index = name->hash >> (32 - dip->i_depth); + error = get_leaf_nr(dip, index, &leaf_no); + if (error) + return error; + + /* Get the old leaf block */ + error = get_leaf(dip, leaf_no, &obh); + if (error) + return error; + + oleaf = (struct gfs2_leaf *)obh->b_data; + if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { + brelse(obh); + return 1; /* can't split */ + } + + gfs2_trans_add_bh(dip->i_gl, obh, 1); + + nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); + if (!nleaf) { + brelse(obh); + return -ENOSPC; + } + bn = nbh->b_blocknr; + + /* Compute the start and len of leaf pointers in the hash table. */ + len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + half_len = len >> 1; + if (!half_len) { + printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); + gfs2_consist_inode(dip); + error = -EIO; + goto fail_brelse; + } + + start = (index & ~(len - 1)); + + /* Change the pointers. + Don't bother distinguishing stuffed from non-stuffed. + This code is complicated enough already. 
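+ The first half of the hash table slots covering the old leaf are rewritten
+ to point at the new leaf, and every entry whose hash falls below the
+ divider is then moved across to it.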
*/ + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + + /* Change the pointers */ + for (x = 0; x < half_len; x++) + lp[x] = cpu_to_be64(bn); + + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), + half_len * sizeof(u64)); + if (error != half_len * sizeof(u64)) { + if (error >= 0) + error = -EIO; + goto fail_lpfree; + } + + kfree(lp); + + /* Compute the divider */ + divider = (start + half_len) << (32 - dip->i_depth); + + /* Copy the entries */ + dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf)); + + do { + next = dent; + if (dirent_next(dip, obh, &next)) + next = NULL; + + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) < divider) { + struct qstr str; + str.name = (char*)(dent+1); + str.len = be16_to_cpu(dent->de_name_len); + str.hash = be32_to_cpu(dent->de_hash); + new = gfs2_dirent_alloc(inode, nbh, &str); + if (IS_ERR(new)) { + error = PTR_ERR(new); + break; + } + + new->de_inum = dent->de_inum; /* No endian worries */ + new->de_type = dent->de_type; /* No endian worries */ + be16_add_cpu(&nleaf->lf_entries, 1); + + dirent_del(dip, obh, prev, dent); + + if (!oleaf->lf_entries) + gfs2_consist_inode(dip); + be16_add_cpu(&oleaf->lf_entries, -1); + + if (!prev) + prev = dent; + + moved = 1; + } else { + prev = dent; + } + dent = next; + } while (dent); + + oleaf->lf_depth = nleaf->lf_depth; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_add_inode_blocks(&dip->i_inode, 1); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + } + + brelse(obh); + brelse(nbh); + + return error; + +fail_lpfree: + kfree(lp); + +fail_brelse: + brelse(obh); + brelse(nbh); + return error; +} + +/** + * dir_double_exhash - Double size of ExHash table + * @dip: The GFS2 dinode + * + * Returns: 0 on success, error code on failure + */ + +static int dir_double_exhash(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *dibh; + u32 hsize; + u64 *buf; + u64 *from, *to; + u64 block; + u64 disksize = i_size_read(&dip->i_inode); + int x; + int error = 0; + + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != disksize) { + gfs2_consist_inode(dip); + return -EIO; + } + + /* Allocate both the "from" and "to" buffers in one big chunk */ + + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); + if (!buf) + return -ENOMEM; + + for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { + error = gfs2_dir_read_data(dip, (char *)buf, + block * sdp->sd_hash_bsize, + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + + from = buf; + to = (u64 *)((char *)buf + sdp->sd_hash_bsize); + + for (x = sdp->sd_hash_ptrs; x--; from++) { + *to++ = *from; /* No endianess worries */ + *to++ = *from; + } + + error = gfs2_dir_write_data(dip, + (char *)buf + sdp->sd_hash_bsize, + block * sdp->sd_sb.sb_bsize, + sdp->sd_sb.sb_bsize); + if (error != sdp->sd_sb.sb_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + } + + kfree(buf); + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(sdp, !error)) { + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + } + + return error; + +fail: + kfree(buf); + return error; +} + +/** + * compare_dents - compare directory entries by hash value + * @a: first dent + * @b: second dent + * + * When comparing the hash entries of @a to 
@b: + * gt: returns 1 + * lt: returns -1 + * eq: returns 0 + */ + +static int compare_dents(const void *a, const void *b) +{ + const struct gfs2_dirent *dent_a, *dent_b; + u32 hash_a, hash_b; + int ret = 0; + + dent_a = *(const struct gfs2_dirent **)a; + hash_a = be32_to_cpu(dent_a->de_hash); + + dent_b = *(const struct gfs2_dirent **)b; + hash_b = be32_to_cpu(dent_b->de_hash); + + if (hash_a > hash_b) + ret = 1; + else if (hash_a < hash_b) + ret = -1; + else { + unsigned int len_a = be16_to_cpu(dent_a->de_name_len); + unsigned int len_b = be16_to_cpu(dent_b->de_name_len); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else + ret = memcmp(dent_a + 1, dent_b + 1, len_a); + } + + return ret; +} + +/** + * do_filldir_main - read out directory entries + * @dip: The GFS2 inode + * @offset: The offset in the file to read from + * @opaque: opaque data to pass to filldir + * @filldir: The function to pass entries to + * @darr: an array of struct gfs2_dirent pointers to read + * @entries: the number of entries in darr + * @copied: pointer to int that's non-zero if a entry has been copied out + * + * Jump through some hoops to make sure that if there are hash collsions, + * they are read out at the beginning of a buffer. We want to minimize + * the possibility that they will fall into different readdir buffers or + * that someone will want to seek to that location. + * + * Returns: errno, >0 on exception from filldir + */ + +static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, + void *opaque, filldir_t filldir, + const struct gfs2_dirent **darr, u32 entries, + int *copied) +{ + const struct gfs2_dirent *dent, *dent_next; + u64 off, off_next; + unsigned int x, y; + int run = 0; + int error = 0; + + sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + + dent_next = darr[0]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + for (x = 0, y = 1; x < entries; x++, y++) { + dent = dent_next; + off = off_next; + + if (y < entries) { + dent_next = darr[y]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + if (off < *offset) + continue; + *offset = off; + + if (off_next == off) { + if (*copied && !run) + return 1; + run = 1; + } else + run = 0; + } else { + if (off < *offset) + continue; + *offset = off; + } + + error = filldir(opaque, (const char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + off, be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type)); + if (error) + return 1; + + *copied = 1; + } + + /* Increment the *offset by one, so the next time we come into the + do_filldir fxn, we get the next entry instead of the last one in the + current leaf */ + + (*offset)++; + + return 0; +} + +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + +static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir, int *copied, unsigned *depth, + u64 leaf_no) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_leaf *lf; + unsigned entries = 0, entries2 = 0; + unsigned leaves = 0; + const struct gfs2_dirent **darr, *dent; + struct dirent_gather g; + struct 
buffer_head **larr; + int leaf = 0; + int error, i; + u64 lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out; + lf = (struct gfs2_leaf *)bh->b_data; + if (leaves == 0) + *depth = be16_to_cpu(lf->lf_depth); + entries += be16_to_cpu(lf->lf_entries); + leaves++; + lfn = be64_to_cpu(lf->lf_next); + brelse(bh); + } while(lfn); + + if (!entries) + return 0; + + error = -ENOMEM; + /* + * The extra 99 entries are not normally used, but are a buffer + * zone in case the number of entries in the leaf is corrupt. + * 99 is the maximum number of entries that can fit in a single + * leaf block. + */ + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); + if (!larr) + goto out; + darr = (const struct gfs2_dirent **)(larr + leaves); + g.pdent = darr; + g.offset = 0; + lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out_free; + lf = (struct gfs2_leaf *)bh->b_data; + lfn = be64_to_cpu(lf->lf_next); + if (lf->lf_entries) { + entries2 += be16_to_cpu(lf->lf_entries); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_gather, NULL, &g); + error = PTR_ERR(dent); + if (IS_ERR(dent)) + goto out_free; + if (entries2 != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir " + "leaf %llu, entries2 (%u) != " + "g.offset (%u)\n", + (unsigned long long)bh->b_blocknr, + entries2, g.offset); + + error = -EIO; + goto out_free; + } + error = 0; + larr[leaf++] = bh; + } else { + brelse(bh); + } + } while(lfn); + + BUG_ON(entries2 != entries); + error = do_filldir_main(ip, offset, opaque, filldir, darr, + entries, copied); +out_free: + for(i = 0; i < leaf; i++) + brelse(larr[i]); + gfs2_free_sort_buffer(larr); +out: + return error; +} + +/** + * dir_e_read - Reads the entries from a directory into a filldir buffer + * @dip: dinode pointer + * @offset: the hash of the last entry read shifted to the right once + * @opaque: buffer for the filldir function to fill + * @filldir: points to the filldir function to use + * + * Returns: errno + */ + +static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u32 hsize, len = 0; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 hash, index; + __be64 *lp; + int copied = 0; + int error = 0; + unsigned depth = 0; + + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != i_size_read(inode)) { + gfs2_consist_inode(dip); + return -EIO; + } + + hash = gfs2_dir_offset2hash(*offset); + index = hash >> (32 - dip->i_depth); + + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(__be64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + &copied, &depth, + be64_to_cpu(lp[lp_offset])); + if (error) + break; + + len = 1 << (dip->i_depth - depth); + index = (index & ~(len - 1)) + len; + } + +out: + kfree(lp); + if (error > 0) + error = 0; + return error; +} + +int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct dirent_gather g; + const struct 
gfs2_dirent **darr, *dent; + struct buffer_head *dibh; + int copied = 0; + int error; + + if (!dip->i_entries) + return 0; + + if (dip->i_diskflags & GFS2_DIF_EXHASH) + return dir_e_read(inode, offset, opaque, filldir); + + if (!gfs2_is_stuffed(dip)) { + gfs2_consist_inode(dip); + return -EIO; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + error = -ENOMEM; + /* 96 is max number of dirents which can be stuffed into an inode */ + darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); + if (darr) { + g.pdent = darr; + g.offset = 0; + dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, + gfs2_dirent_gather, NULL, &g); + if (IS_ERR(dent)) { + error = PTR_ERR(dent); + goto out; + } + if (dip->i_entries != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir %llu, " + "ip->i_entries (%u) != g.offset (%u)\n", + (unsigned long long)dip->i_no_addr, + dip->i_entries, + g.offset); + error = -EIO; + goto out; + } + error = do_filldir_main(dip, offset, opaque, filldir, darr, + dip->i_entries, &copied); +out: + kfree(darr); + } + + if (error > 0) + error = 0; + + brelse(dibh); + + return error; +} + +/** + * gfs2_dir_search - Search a directory + * @dip: The GFS2 inode + * @filename: + * @inode: + * + * This routine searches a directory for a file or another directory. + * Assumes a glock is held on dip. + * + * Returns: errno + */ + +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct inode *inode; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return ERR_CAST(dent); + inode = gfs2_inode_lookup(dir->i_sb, + be16_to_cpu(dent->de_type), + be64_to_cpu(dent->de_inum.no_addr), + be64_to_cpu(dent->de_inum.no_formal_ino), 0); + brelse(bh); + return inode; + } + return ERR_PTR(-ENOENT); +} + +int gfs2_dir_check(struct inode *dir, const struct qstr *name, + const struct gfs2_inode *ip) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int ret = -ENOENT; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + if (ip) { + if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr) + goto out; + if (be64_to_cpu(dent->de_inum.no_formal_ino) != + ip->i_no_formal_ino) + goto out; + if (unlikely(IF2DT(ip->i_inode.i_mode) != + be16_to_cpu(dent->de_type))) { + gfs2_consist_inode(GFS2_I(dir)); + ret = -EIO; + goto out; + } + } + ret = 0; +out: + brelse(bh); + } + return ret; +} + +static int dir_new_leaf(struct inode *inode, const struct qstr *name) +{ + struct buffer_head *bh, *obh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_leaf *leaf, *oleaf; + int error; + u32 index; + u64 bn; + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &obh); + if (error) + return error; + do { + oleaf = (struct gfs2_leaf *)obh->b_data; + bn = be64_to_cpu(oleaf->lf_next); + if (!bn) + break; + brelse(obh); + error = get_leaf(ip, bn, &obh); + if (error) + return error; + } while(1); + + gfs2_trans_add_bh(ip->i_gl, obh, 1); + + leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); + if (!leaf) { + brelse(obh); + return -ENOSPC; + } + oleaf->lf_next = cpu_to_be64(bh->b_blocknr); + brelse(bh); + brelse(obh); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_add_inode_blocks(&ip->i_inode, 1); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + return 0; +} + +/** 
+ * gfs2_dir_add - Add new filename into directory + * @dip: The GFS2 inode + * @filename: The new name + * @inode: The inode number of the entry + * @type: The type of the entry + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_add(struct inode *inode, const struct qstr *name, + const struct gfs2_inode *nip) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_leaf *leaf; + int error; + + while(1) { + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, + &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + dent = gfs2_init_dirent(inode, dent, name, bh); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + leaf = (struct gfs2_leaf *)bh->b_data; + be16_add_cpu(&leaf->lf_entries, 1); + } + brelse(bh); + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + break; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_entries++; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + if (S_ISDIR(nip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + error = 0; + break; + } + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = dir_make_exhash(inode); + if (error) + break; + continue; + } + error = dir_split_leaf(inode, name); + if (error == 0) + continue; + if (error < 0) + break; + if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { + error = dir_double_exhash(ip); + if (error) + break; + error = dir_split_leaf(inode, name); + if (error < 0) + break; + if (error == 0) + continue; + } + error = dir_new_leaf(inode, name); + if (!error) + continue; + error = -ENOSPC; + break; + } + return error; +} + + +/** + * gfs2_dir_del - Delete a directory entry + * @dip: The GFS2 inode + * @filename: The filename + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) +{ + const struct qstr *name = &dentry->d_name; + struct gfs2_dirent *dent, *prev = NULL; + struct buffer_head *bh; + int error; + + /* Returns _either_ the entry (if its first in block) or the + previous entry otherwise */ + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) { + gfs2_consist_inode(dip); + return PTR_ERR(dent); + } + /* If not first in block, adjust pointers accordingly */ + if (gfs2_dirent_find(dent, name, NULL) == 0) { + prev = dent; + dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len)); + } + + dirent_del(dip, bh, prev, dent); + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + u16 entries = be16_to_cpu(leaf->lf_entries); + if (!entries) + gfs2_consist_inode(dip); + leaf->lf_entries = cpu_to_be16(--entries); + } + brelse(bh); + + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + + if (!dip->i_entries) + gfs2_consist_inode(dip); + gfs2_trans_add_bh(dip->i_gl, bh, 1); + dip->i_entries--; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + if (S_ISDIR(dentry->d_inode->i_mode)) + drop_nlink(&dip->i_inode); + gfs2_dinode_out(dip, bh->b_data); + brelse(bh); + mark_inode_dirty(&dip->i_inode); + + return error; +} + +/** + * gfs2_dir_mvino - Change inode number of directory entry + * @dip: The GFS2 inode + * @filename: + * @new_inode: + * + * This routine changes the inode number of a directory entry. 
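+ * Only the inum and type fields of the entry are rewritten in place; the
+ * name and its position in the directory are left untouched.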
It's used + * by rename to change ".." when a directory is moved. + * Assumes a glock is held on dvp. + * + * Returns: errno + */ + +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int error; + + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(new_type); + + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + brelse(bh); + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + gfs2_trans_add_bh(dip->i_gl, bh, 1); + } + + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(dip, bh->b_data); + brelse(bh); + return 0; +} + +/** + * leaf_dealloc - Deallocate a directory leaf + * @dip: the directory + * @index: the hash table offset in the directory + * @len: the number of pointers to this leaf + * @leaf_no: the leaf number + * @leaf_bh: buffer_head for the starting leaf + * last_dealloc: 1 if this is the final dealloc for the leaf, else 0 + * + * Returns: errno + */ + +static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, + u64 leaf_no, struct buffer_head *leaf_bh, + int last_dealloc) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_leaf *tmp_leaf; + struct gfs2_rgrp_list rlist; + struct buffer_head *bh, *dibh; + u64 blk, nblk; + unsigned int rg_blocks = 0, l_blocks = 0; + char *ht; + unsigned int x, size = len * sizeof(u64); + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + ht = kzalloc(size, GFP_NOFS); + if (!ht) + return -ENOMEM; + + if (!gfs2_alloc_get(dip)) { + error = -ENOMEM; + goto out; + } + + error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_put; + + error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); + if (error) + goto out_qs; + + /* Count the number of leaves */ + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_rlist_add(sdp, &rlist, blk); + l_blocks++; + } + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, + rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + + RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks); + if (error) + goto out_rg_gunlock; + + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_free_meta(dip, blk, 1); + gfs2_add_inode_blocks(&dip->i_inode, -1); + } + + error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); + if (error != size) { + if (error >= 0) + error = -EIO; + goto out_end_trans; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + /* On the last dealloc, 
make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + if (last_dealloc) + dip->i_inode.i_mode = S_IFREG; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + +out_end_trans: + gfs2_trans_end(sdp); +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); + gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); +out_qs: + gfs2_quota_unhold(dip); +out_put: + gfs2_alloc_put(dip); +out: + kfree(ht); + return error; +} + +/** + * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory + * @dip: the directory + * + * Dealloc all on-disk directory leaves to FREEMETA state + * Change on-disk inode type to "regular file" + * + * Returns: errno + */ + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *bh; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 index = 0, next_index; + __be64 *lp; + u64 leaf_no; + int error = 0, last; + + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { + gfs2_consist_inode(dip); + return -EIO; + } + + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(__be64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + leaf_no = be64_to_cpu(lp[lp_offset]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + + next_index = (index & ~(len - 1)) + len; + last = ((next_index >= hsize) ? 1 : 0); + error = leaf_dealloc(dip, index, len, leaf_no, bh, + last); + brelse(bh); + if (error) + goto out; + index = next_index; + } else + index++; + } + + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + kfree(lp); + + return error; +} + +/** + * gfs2_diradd_alloc_required - find if adding entry will require an allocation + * @ip: the file being written to + * @filname: the filename that's going to be added + * + * Returns: 1 if alloc required, 0 if not, -ve on error + */ + +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +{ + struct gfs2_dirent *dent; + struct buffer_head *bh; + + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + if (!dent) { + return 1; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + brelse(bh); + return 0; +} + diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h new file mode 100644 index 00000000..e686af11 --- /dev/null +++ b/fs/gfs2/dir.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +#include +#include + +struct inode; +struct gfs2_inode; +struct gfs2_inum; + +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); + +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); + +static inline u32 gfs2_disk_hash(const char *data, int len) +{ + return crc32_le((u32)~0, data, len) ^ (u32)~0; +} + + +static inline void gfs2_str2qstr(struct qstr *name, const char *fname) +{ + name->name = fname; + name->len = strlen(fname); + name->hash = gfs2_disk_hash(name->name, name->len); +} + +/* N.B. This probably ought to take inum & type as args as well */ +static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) +{ + dent->de_inum.no_addr = cpu_to_be64(0); + dent->de_inum.no_formal_ino = cpu_to_be64(0); + dent->de_hash = cpu_to_be32(name->hash); + dent->de_rec_len = cpu_to_be16(reclen); + dent->de_name_len = cpu_to_be16(name->len); + dent->de_type = cpu_to_be16(0); + memset(dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent + 1, name->name, name->len); +} + +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + +#endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c new file mode 100644 index 00000000..fe9945f2 --- /dev/null +++ b/fs/gfs2/export.c @@ -0,0 +1,206 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "super.h" +#include "rgrp.h" +#include "util.h" + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 + +static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, + int connectable) +{ + __be32 *fh = (__force __be32 *)p; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; + return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } + + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_SMALL_FH_SIZE; + + if (!connectable || inode == sb->s_root->d_inode) + return *len; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + ip = GFS2_I(inode); + igrab(inode); + spin_unlock(&dentry->d_lock); + + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_LARGE_FH_SIZE; + + iput(inode); + + return *len; +} + +struct get_name_filldir { + struct gfs2_inum_host inum; + char *name; +}; + +static int get_name_filldir(void *opaque, const char *name, int length, + loff_t offset, u64 inum, unsigned int type) +{ + struct get_name_filldir *gnfd = opaque; + + if (inum != gnfd->inum.no_addr) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = parent->d_inode; + struct inode *inode = child->d_inode; + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd; + struct gfs2_holder gh; + u64 offset = 0; + int error; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; + gnfd.name = name; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, + struct gfs2_inum_host *inum) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct inode *inode; + + inode = gfs2_ilookup(sb, inum->no_addr, 0); + if (inode) { + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + +out_inode: + return d_obtain_alias(inode); +} + +static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host this; + __be32 *fh = (__force __be32 *)fid->raw; + + switch 
(fh_type) { + case GFS2_SMALL_FH_SIZE: + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this.no_addr |= be32_to_cpu(fh[3]); + return gfs2_get_dentry(sb, &this); + default: + return NULL; + } +} + +static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host parent; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + return gfs2_get_dentry(sb, &parent); + default: + return NULL; + } +} + +const struct export_operations gfs2_export_ops = { + .encode_fh = gfs2_encode_fh, + .fh_to_dentry = gfs2_fh_to_dentry, + .fh_to_parent = gfs2_fh_to_parent, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, +}; + diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c new file mode 100644 index 00000000..a9f5cbe4 --- /dev/null +++ b/fs/gfs2/file.c @@ -0,0 +1,1112 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. 
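+ * SEEK_SET and SEEK_CUR need no lock and go straight to
+ * generic_file_llseek_unlocked().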
+ * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + if (origin == 2) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek_unlocked(file, offset, origin); + gfs2_glock_dq_uninit(&i_gh); + } + } else + error = generic_file_llseek_unlocked(file, offset, origin); + + return error; +} + +/** + * gfs2_readdir - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + u64 offset = file->f_pos; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + return error; + } + + error = gfs2_dir_read(dir, &offset, dirent, filldir); + + gfs2_glock_dq_uninit(&d_gh); + + file->f_pos = offset; + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. + * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * gfs2_set_flags - set flags on an inode + * @inode: The inode + * @flags: 
The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = mnt_want_write(filp->f_path.mnt); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + + error = -EACCES; + if (!inode_owner_or_capable(inode)) + goto out; + + error = 0; + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(inode, MAY_WRITE, 0); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write(filp->f_path.mnt); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + u32 fsflags, gfsflags; + + if (get_user(fsflags, ptr)) + return -EFAULT; + + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + if (!S_ISDIR(inode->i_mode)) { + if (gfsflags & GFS2_DIF_INHERIT_JDATA) + gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~0); + } + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + } + return -ENOTTY; +} + +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. 
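+ * Each pass asks gfs2_block_map() to map (and, where needed, allocate)
+ * as much of the remaining range as it can, then advances past whatever
+ * was mapped and repeats until the whole page is backed.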
+ */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @page: The page which is about to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned long last_index; + u64 pos = page->index << PAGE_CACHE_SHIFT; + unsigned int data_blocks, ind_blocks, rblocks; + struct gfs2_holder gh; + struct gfs2_alloc *al; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out; + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) + goto out_unlock; + ret = -ENOMEM; + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_unlock; + + ret = gfs2_quota_lock_check(ip); + if (ret) + goto out_alloc_put; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) { + rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(al); + } + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) + goto out_unlock_page; + ret = 0; + if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) + goto out_unlock_page; + if (gfs2_is_stuffed(ip)) { + ret = gfs2_unstuff_dinode(ip, page); + if (ret) + goto out_unlock_page; + } + ret = gfs2_allocate_page_backing(page); + +out_unlock_page: + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) + ret = VM_FAULT_SIGBUS; + return ret; +} + +static const struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = gfs2_page_mkwrite, +}; + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). 
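+ * The shared glock taken below covers only the file_accessed() call;
+ * the write fault path (gfs2_page_mkwrite()) takes its own exclusive
+ * glock when it runs.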
+ * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error == 0) { + file_accessed(file); + gfs2_glock_dq(&i_gh); + } + gfs2_holder_uninit(&i_gh); + if (error) + return error; + } + vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct gfs2_file *fp; + int error; + + fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + goto fail; + + if (!(file->f_flags & O_LARGEFILE) && + i_size_read(inode) > MAX_NON_LFS) { + error = -EOVERFLOW; + goto fail_gunlock; + } + + gfs2_glock_dq_uninit(&i_gh); + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + file->private_data = NULL; + kfree(fp); + return error; +} + +/** + * gfs2_close - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_close(struct inode *inode, struct file *file) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_file *fp; + + fp = file->private_data; + file->private_data = NULL; + + if (gfs2_assert_warn(sdp, fp)) + return -EIO; + + kfree(fp); + + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry (we ignore this) + * @datasync: set if we can ignore timestamp changes + * + * The VFS will flush data for us. We only need to worry + * about metadata here. + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + if (datasync) + sync_state &= ~I_DIRTY_SYNC; + + if (sync_state) { + ret = sync_inode_metadata(inode, 1); + if (ret) + return ret; + gfs2_ail_flush(ip->i_gl); + } + + return 0; +} + +/** + * gfs2_file_aio_write - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. 
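+ * Acquiring and then dropping the shared glock is intended to bring the
+ * cached inode size up to date with any extension made on another node,
+ * so that generic_file_aio_write() starts from the correct end-of-file
+ * offset.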
+ * + */ + +static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + + if (file->f_flags & O_APPEND) { + struct dentry *dentry = file->f_dentry; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder gh; + int ret; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + +static int empty_write_end(struct page *page, unsigned from, + unsigned to, int mode) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + unsigned offset, blksize = 1 << inode->i_blkbits; + pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + + zero_user(page, from, to-from); + mark_page_accessed(page); + + if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); + return 0; + } + + offset = 0; + bh = page_buffers(page); + while (offset < to) { + if (offset >= from) { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + clear_buffer_new(bh); + write_dirty_buffer(bh, WRITE); + } + offset += blksize; + bh = bh->b_this_page; + } + + offset = 0; + bh = page_buffers(page); + while (offset < to) { + if (offset >= from) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + } + offset += blksize; + bh = bh->b_this_page; + } + return 0; +} + +static int needs_empty_write(sector_t block, struct inode *inode) +{ + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + bh_map.b_size = 1 << inode->i_blkbits; + error = gfs2_block_map(inode, block, &bh_map, 0); + if (unlikely(error)) + return error; + return !buffer_mapped(&bh_map); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to, + int mode) +{ + struct inode *inode = page->mapping->host; + unsigned start, end, next, blksize; + sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + int ret; + + blksize = 1 << inode->i_blkbits; + next = end = 0; + while (next < from) { + next += blksize; + block++; + } + start = next; + do { + next += blksize; + ret = needs_empty_write(block, inode); + if (unlikely(ret < 0)) + return ret; + if (ret == 0) { + if (end) { + ret = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(ret)) + return ret; + ret = empty_write_end(page, start, end, mode); + if (unlikely(ret)) + return ret; + end = 0; + } + start = next; + } + else + end = next; + block++; + } while (next < to); + + if (end) { + ret = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(ret)) + return ret; + ret = empty_write_end(page, start, end, mode); + if (unlikely(ret)) + return ret; + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, 
dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to, mode); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset &= bsize_mask; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if 
(gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_setlease - acquire/release a file lease + * @file: the file pointer + * @arg: lease type + * @fl: file lock + * + * We don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * + * Locking: called under lock_flocks + * + * Returns: errno + */ + +static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) +{ + return -EINVAL; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if (cmd == F_CANCELLK) { + /* Hack: */ + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 
0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + } + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) { + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_uninit(fl_gh); + } + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = gfs2_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .llseek = default_llseek, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = gfs2_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = generic_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .llseek = default_llseek, +}; + diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h new file mode 100644 index 00000000..ef606e3a --- /dev/null +++ b/fs/gfs2/gfs2.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +enum { + NO_CREATE = 0, + CREATE = 1, +}; + +enum { + NO_FORCE = 0, + FORCE = 1, +}; + +#define GFS2_FAST_NAME_SIZE 8 + +#endif /* __GFS2_DOT_H__ */ + diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c new file mode 100644 index 00000000..1c1336e7 --- /dev/null +++ b/fs/gfs2/glock.c @@ -0,0 +1,1870 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "super.h" +#include "util.h" +#include "bmap.h" +#define CREATE_TRACE_POINTS +#include "trace_gfs2.h" + +struct gfs2_glock_iter { + int hash; /* hash bucket index */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + char string[512]; /* scratch space */ +}; + +typedef void (*glock_examiner) (struct gfs2_glock * gl); + +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); + +static struct dentry *gfs2_root; +static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); + +#define GFS2_GL_HASH_SHIFT 15 +#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) + +static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; +static struct dentry *gfs2_root; + +/** + * gl_hash() - Turn glock number into hash bucket number + * @lock: The glock number + * + * Returns: The number of the corresponding hash bucket + */ + +static unsigned int gl_hash(const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + unsigned int h; + + h = jhash(&name->ln_number, sizeof(u64), 0); + h = jhash(&name->ln_type, sizeof(unsigned int), h); + h = jhash(&sdp, sizeof(struct gfs2_sbd *), h); + h &= GFS2_GL_HASH_MASK; + + return h; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&gl_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&gl_hash_table[hash]); +} + +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + if (gl->gl_ops->go_flags & GLOF_ASPACE) + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + else + kmem_cache_free(gfs2_glock_cachep, gl); +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + 
wake_up(&sdp->sd_glock_wait); +} + +/** + * gfs2_glock_hold() - increment reference count on glock + * @gl: The glock to hold + * + */ + +void gfs2_glock_hold(struct gfs2_glock *gl) +{ + GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); + atomic_inc(&gl->gl_ref); +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + + +void gfs2_glock_add_to_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + + list_add_tail(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); + spin_unlock(&lru_lock); +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); + } + spin_unlock(&lru_lock); +} + +/** + * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * @gl: the glock + * + * If the glock is demotable, then we add it (or move it) to the end + * of the glock LRU list. + */ + +static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + if (demote_ok(gl)) + gfs2_glock_add_to_lru(gl); +} + +/** + * gfs2_glock_put_nolock() - Decrement reference count on glock + * @gl: The glock to put + * + * This function should only be used if the caller has its own reference + * to the glock, in addition to the one it is dropping. + */ + +void gfs2_glock_put_nolock(struct gfs2_glock *gl) +{ + if (atomic_dec_and_test(&gl->gl_ref)) + GLOCK_BUG_ON(gl, 1); +} + +/** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); + + if (atomic_dec_and_test(&gl->gl_ref)) { + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + gfs2_glock_remove_from_lru(gl); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); + } +} + +/** + * search_bucket() - Find struct gfs2_glock by lock number + * @bucket: the bucket to search + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + struct gfs2_glock *gl; + struct hlist_bl_node *h; + + hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { + if (!lm_name_equal(&gl->gl_name, name)) + continue; + if (gl->gl_sbd != sdp) + continue; + if (atomic_inc_not_zero(&gl->gl_ref)) + return gl; + } + + return NULL; +} + +/** + * may_grant - check if its ok to grant a new lock + * @gl: The glock + * @gh: The lock request which we wish to grant + * + * Returns: true if its ok to grant the lock + */ + +static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) +{ + const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + if ((gh->gh_state == LM_ST_EXCLUSIVE 
|| + gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) + return 0; + if (gl->gl_state == gh->gh_state) + return 1; + if (gh->gh_flags & GL_EXACT) + return 0; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) + return 1; + if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) + return 1; + } + if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) + return 1; + return 0; +} + +static void gfs2_holder_wake(struct gfs2_holder *gh) +{ + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_clear_bit(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +/** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns: 1 if there is a blocked holder at the head of the list, or 2 + * if a type specific operation is underway. + */ + +static int do_promote(struct gfs2_glock *gl) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh, *tmp; + int ret; + +restart: + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (may_grant(gl, gh)) { + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_spin); + /* FIXME: eliminate this eventually */ + ret = glops->go_lock(gh); + spin_lock(&gl->gl_spin); + if (ret) { + if (ret == 1) + return 2; + gh->gh_error = ret; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 1); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 0); + gfs2_holder_wake(gh); + continue; + } + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + do_error(gl, 0); + break; + } + return 0; +} + +/** + * find_first_waiter - find the first gh that's waiting for the glock + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + if (held2) + gfs2_glock_hold(gl); + else + gfs2_glock_put_nolock(gl); + } + if (held1 && held2 && list_empty(&gl->gl_holders)) + clear_bit(GLF_QUEUED, &gl->gl_flags); + + gl->gl_state = new_state; + gl->gl_tchange = jiffies; +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + gl->gl_demote_state = LM_ST_EXCLUSIVE; + clear_bit(GLF_DEMOTE, 
&gl->gl_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +/** + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * @ret: The status from the DLM + * + */ + +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + int rv; + + spin_lock(&gl->gl_spin); + trace_gfs2_glock_state_change(gl, state); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_spin); + return; + } + + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { + spin_unlock(&gl->gl_spin); + rv = glops->go_xmote_bh(gl, gh); + spin_lock(&gl->gl_spin); + if (rv) { + do_error(gl, rv); + goto out; + } + } + rv = do_promote(gl); + if (rv == 2) + goto out_locked; + } +out: + clear_bit(GLF_LOCK, &gl->gl_flags); +out_locked: + spin_unlock(&gl->gl_spin); +} + +/** + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state + * + */ + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int lck_flags = gh ? gh->gh_flags : 0; + int ret; + + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | + LM_FLAG_PRIORITY); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + do_error(gl, 0); /* Fail queued try locks */ + } + gl->gl_req = target; + spin_unlock(&gl->gl_spin); + if (glops->go_xmote_th) + glops->go_xmote_th(gl); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + glops->go_inval(gl, target == LM_ST_DEFERRED ? 
0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + + gfs2_glock_hold(gl); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + GLOCK_BUG_ON(gl, ret); + } else { /* lock_nolock */ + finish_xmote(gl, target); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + + spin_lock(&gl->gl_spin); +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue + * + */ + +static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + struct gfs2_holder *gh = NULL; + int ret; + + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out_unlock; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); + gl->gl_target = gl->gl_demote_state; + } else { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + ret = do_promote(gl); + if (ret == 0) + goto out_unlock; + if (ret == 2) + goto out; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + } + do_xmote(gl, gh, gl->gl_target); +out: + return; + +out_sched: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + return; + +out_unlock: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); + return; +} + +static void delete_work_func(struct work_struct *work) +{ + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip; + struct inode *inode; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ + + if (ip) + inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); + } + gfs2_glock_put(gl); +} + +static void glock_work_func(struct work_struct *work) +{ + unsigned long delay = 0; + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + int drop_ref = 0; + + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { + finish_xmote(gl, gl->gl_reply); + drop_ref = 1; + } + spin_lock(&gl->gl_spin); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { + unsigned long holdtime, now = jiffies; + + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + + if (!delay) { + 
clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } + } + run_queue(gl, 0); + spin_unlock(&gl->gl_spin); + if (!delay || + queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); + if (drop_ref) + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one. + * + * Returns: errno + */ + +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) +{ + struct super_block *s = sdp->sd_vfs; + struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; + struct gfs2_glock *gl, *tmp; + unsigned int hash = gl_hash(sdp, &name); + struct address_space *mapping; + struct kmem_cache *cachep; + + rcu_read_lock(); + gl = search_bucket(hash, sdp, &name); + rcu_read_unlock(); + + *glp = gl; + if (gl) + return 0; + if (!create) + return -ENOENT; + + if (glops->go_flags & GLOF_ASPACE) + cachep = gfs2_glock_aspace_cachep; + else + cachep = gfs2_glock_cachep; + gl = kmem_cache_alloc(cachep, GFP_KERNEL); + if (!gl) + return -ENOMEM; + + atomic_inc(&sdp->sd_glock_disposal); + gl->gl_flags = 0; + gl->gl_name = name; + atomic_set(&gl->gl_ref, 1); + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; + gl->gl_demote_state = LM_ST_EXCLUSIVE; + gl->gl_hash = hash; + gl->gl_ops = glops; + snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_lksb.sb_lvbptr = gl->gl_lvb; + gl->gl_tchange = jiffies; + gl->gl_object = NULL; + gl->gl_sbd = sdp; + INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); + + mapping = gfs2_glock2aspace(gl); + if (mapping) { + mapping->a_ops = &gfs2_meta_aops; + mapping->host = s->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = s->s_bdi; + mapping->writeback_index = 0; + } + + spin_lock_bucket(hash); + tmp = search_bucket(hash, sdp, &name); + if (tmp) { + spin_unlock_bucket(hash); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); + gl = tmp; + } else { + hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); + spin_unlock_bucket(hash); + } + + *glp = gl; + + return 0; +} + +/** + * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + */ + +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh) +{ + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gl; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_owner_pid = get_pid(task_pid(current)); + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_error = 0; + gh->gh_iflags = 0; + gfs2_glock_hold(gl); +} + +/** + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Don't mess with the glock. 
+ * + */ + +void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +{ + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_iflags = 0; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + if (gh->gh_owner_pid) + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); +} + +/** + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure + * + */ + +void gfs2_holder_uninit(struct gfs2_holder *gh) +{ + put_pid(gh->gh_owner_pid); + gfs2_glock_put(gh->gh_gl); + gh->gh_gl = NULL; + gh->gh_ip = 0; +} + +/** + * gfs2_glock_holder_wait + * @word: unused + * + * This function and gfs2_glock_demote_wait both show up in the WCHAN + * field. Thus I've separated these otherwise identical functions in + * order to be more informative to the user. + */ + +static int gfs2_glock_holder_wait(void *word) +{ + schedule(); + return 0; +} + +static int gfs2_glock_demote_wait(void *word) +{ + schedule(); + return 0; +} + +static void wait_on_holder(struct gfs2_holder *gh) +{ + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); +} + +static void wait_on_demote(struct gfs2_glock *gl) +{ + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); +} + +/** + * handle_callback - process a demote request + * @gl: the glock + * @state: the state the caller wants us to change to + * + * There are only two requests that we are going to see in actual + * practise: LM_ST_SHARED and LM_ST_UNLOCKED + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay) +{ + int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; + + set_bit(bit, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { + gl->gl_demote_state = state; + gl->gl_demote_time = jiffies; + } else if (gl->gl_demote_state != LM_ST_UNLOCKED && + gl->gl_demote_state != state) { + gl->gl_demote_state = LM_ST_UNLOCKED; + } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl); + trace_gfs2_demote_rq(gl); +} + +/** + * gfs2_glock_wait - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) +{ + wait_on_holder(gh); + return gh->gh_error; +} + +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + if (seq) { + struct gfs2_glock_iter *gi = seq->private; + vsprintf(gi->string, fmt, args); + seq_printf(seq, gi->string); + } else { + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR " %pV", &vaf); + } + + va_end(args); +} + +/** + * add_to_queue - Add a holder to the wait queue (but look for recursion) + * @gh: the holder structure to add + * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. 
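+ * The same walk over the holder list is also used to fail queued
+ * "try" locks immediately (GLR_TRYFAILED) when they cannot be granted,
+ * and to find the insertion point for LM_FLAG_PRIORITY requests.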
+ * + */ + +static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_lock = 0; + + BUG_ON(gh->gh_owner_pid == NULL); + if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) + BUG(); + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + try_lock = 1; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + goto trap_recursive; + if (try_lock && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !may_grant(gl, gh)) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; + } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) + insert_pt = &gh2->gh_list; + } + set_bit(GLF_QUEUED, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 1); + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) + goto do_cancel; + return; + } + list_add_tail(&gh->gh_list, insert_pt); +do_cancel: + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { + spin_unlock(&gl->gl_spin); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_spin); + } + return; + +trap_recursive: + print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + __dump_glock(NULL, gl); + BUG(); +} + +/** + * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) + * @gh: the holder structure + * + * if (gh->gh_flags & GL_ASYNC), this never returns an error + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_nq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + int error = 0; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + if (test_bit(GLF_LRU, &gl->gl_flags)) + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_spin); + add_to_queue(gh); + if ((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); + + if (!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); + + return error; +} + +/** + * gfs2_glock_poll - poll to see if an async request has been completed + * @gh: the holder + * + * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on + */ + +int gfs2_glock_poll(struct gfs2_holder *gh) +{ + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 
0 : 1; +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ + +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned delay = 0; + int fast_path = 0; + + spin_lock(&gl->gl_spin); + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0); + + list_del_init(&gh->gh_list); + if (find_first_holder(gl) == NULL) { + if (glops->go_unlock) { + GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); + spin_unlock(&gl->gl_spin); + glops->go_unlock(gh); + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + } + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; + } + if (!test_bit(GLF_LFLUSH, &gl->gl_flags)) + __gfs2_glock_schedule_for_reclaim(gl); + trace_gfs2_glock_queue(gh, 0); + spin_unlock(&gl->gl_spin); + if (likely(fast_path)) + return; + + gfs2_glock_hold(gl); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); +} + +void gfs2_glock_dq_wait(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + gfs2_glock_dq(gh); + wait_on_demote(gl); +} + +/** + * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it + * @gh: the holder structure + * + */ + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_dq(gh); + gfs2_holder_uninit(gh); +} + +/** + * gfs2_glock_nq_num - acquire a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the acquisition + * @gh: the struct gfs2_holder + * + * Returns: errno + */ + +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh) +{ + struct gfs2_glock *gl; + int error; + + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + error = gfs2_glock_nq_init(gl, state, flags, gh); + gfs2_glock_put(gl); + } + + return error; +} + +/** + * glock_compare - Compare two struct gfs2_glock structures for sorting + * @arg_a: the first structure + * @arg_b: the second structure + * + */ + +static int glock_compare(const void *arg_a, const void *arg_b) +{ + const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; + const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; + const struct lm_lockname *a = &gh_a->gh_gl->gl_name; + const struct lm_lockname *b = &gh_b->gh_gl->gl_name; + + if (a->ln_number > b->ln_number) + return 1; + if (a->ln_number < b->ln_number) + return -1; + BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); + return 0; +} + +/** + * nq_m_sync - synchonously acquire more than one glock in deadlock free order + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, + struct gfs2_holder **p) +{ + unsigned int x; + int error = 0; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); 
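+	/*
+	 * With the holders sorted by lock number, every caller acquires
+	 * any overlapping set of glocks in the same global order (e.g.
+	 * inode 7 always before inode 19), which is what keeps the loop
+	 * below deadlock free.
+	 */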
+ + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + error = gfs2_glock_nq(p[x]); + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + return error; +} + +/** + * gfs2_glock_nq_m - acquire multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_holder *tmp[4]; + struct gfs2_holder **pph = tmp; + int error = 0; + + switch(num_gh) { + case 0: + return 0; + case 1: + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + return gfs2_glock_nq(ghs); + default: + if (num_gh <= 4) + break; + pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS); + if (!pph) + return -ENOMEM; + } + + error = nq_m_sync(num_gh, ghs, pph); + + if (pph != tmp) + kfree(pph); + + return error; +} + +/** + * gfs2_glock_dq_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); +} + +/** + * gfs2_glock_dq_uninit_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + while (num_gh--) + gfs2_glock_dq_uninit(&ghs[num_gh]); +} + +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) +{ + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = jiffies; + + gfs2_glock_hold(gl); + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags)) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + } + + spin_lock(&gl->gl_spin); + handle_callback(gl, state, delay); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); +} + +/** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm + * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. 
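+ * If DFL_BLOCK_LOCKS is set the reply may instead leave the glock
+ * frozen (GLF_FROZEN) rather than queueing work; such glocks are
+ * processed later via gfs2_glock_thaw().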
+ */ + +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + + spin_lock(&gl->gl_spin); + gl->gl_reply = ret; + + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + if (gfs2_should_freeze(gl)) { + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + return; + } + } + + spin_unlock(&gl->gl_spin); + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + smp_wmb(); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + + +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct gfs2_glock *gl; + int may_demote; + int nr_skipped = 0; + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + LIST_HEAD(skipped); + + if (nr == 0) + goto out; + + if (!(gfp_mask & __GFP_FS)) + return -1; + + spin_lock(&lru_lock); + while(nr && !list_empty(&lru_list)) { + gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); + atomic_dec(&lru_count); + + /* Test for being demotable */ + if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + gfs2_glock_hold(gl); + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + may_demote = demote_ok(gl); + if (may_demote) { + handle_callback(gl, LM_ST_UNLOCKED, 0); + nr--; + } + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + spin_lock(&lru_lock); + continue; + } + nr_skipped++; + list_add(&gl->gl_lru, &skipped); + set_bit(GLF_LRU, &gl->gl_flags); + } + list_splice(&skipped, &lru_list); + atomic_add(nr_skipped, &lru_count); + spin_unlock(&lru_lock); +out: + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker glock_shrinker = { + .shrink = gfs2_shrink_glock_memory, + .seeks = DEFAULT_SEEKS, +}; + +/** + * examine_bucket - Call a function for glock in a hash bucket + * @examiner: the function + * @sdp: the filesystem + * @bucket: the bucket + * + */ + +static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, + unsigned int hash) +{ + struct gfs2_glock *gl; + struct hlist_bl_head *head = &gl_hash_table[hash]; + struct hlist_bl_node *pos; + + rcu_read_lock(); + hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { + if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref)) + examiner(gl); + } + rcu_read_unlock(); + cond_resched(); +} + +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(examiner, sdp, x); +} + + +/** + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw + * + * N.B. When we freeze a glock, we leave a ref to the glock outstanding, + * so this has to result in the ref count being dropped by one. 
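+ * The gfs2_glock_hold() below covers the queued work item; the
+ * outstanding "frozen" reference is dropped when glock_work_func()
+ * handles the now pending reply.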
+ */ + +static void thaw_glock(struct gfs2_glock *gl) +{ + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + return; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + +/** + * clear_glock - look at a glock and see if we can free it from glock cache + * @gl: the glock to look at + * + */ + +static void clear_glock(struct gfs2_glock *gl) +{ + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_spin); + if (gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0); + spin_unlock(&gl->gl_spin); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block + * + */ + +void gfs2_glock_thaw(struct gfs2_sbd *sdp) +{ + glock_hash_walk(thaw_glock, sdp); +} + +static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = __dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); + return ret; +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl); +} + +/** + * gfs2_gl_hash_clear - Empty out the glock hash table + * @sdp: the filesystem + * @wait: wait until it's all gone + * + * Called when unmounting the filesystem. + */ + +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) +{ + glock_hash_walk(clear_glock, sdp); + flush_workqueue(glock_workqueue); + wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + glock_hash_walk(dump_glock_func, sdp); +} + +void gfs2_glock_finish_truncate(struct gfs2_inode *ip) +{ + struct gfs2_glock *gl = ip->i_gl; + int ret; + + ret = gfs2_truncatei_resume(ip); + gfs2_assert_withdraw(gl->gl_sbd, ret == 0); + + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); +} + +static const char *state2str(unsigned state) +{ + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'A'; + if (flags & LM_FLAG_PRIORITY) + *p++ = 'p'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (test_bit(HIF_FIRST, &iflags)) + *p++ = 'F'; + *p = 0; + return buf; +} + +/** + * dump_holder - print information about a glock holder + * @seq: the seq_file struct + * @gh: the glock holder + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +{ + struct task_struct *gh_owner = NULL; + char flags_buf[32]; + + if (gh->gh_owner_pid) + gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? 
gh_owner->comm : "(ended)", + (void *)gh->gh_ip); + return 0; +} + +static const char *gflags2str(char *buf, const struct gfs2_glock *gl) +{ + const unsigned long *gflags = &gl->gl_flags; + char *p = buf; + + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'I'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; + if (test_bit(GLF_QUEUED, gflags)) + *p++ = 'q'; + if (test_bit(GLF_LRU, gflags)) + *p++ = 'L'; + if (gl->gl_object) + *p++ = 'o'; + *p = 0; + return buf; +} + +/** + * __dump_glock - print information about a glock + * @seq: The seq_file struct + * @gl: the glock + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + int error = 0; + + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n", + state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + atomic_read(&gl->gl_ref)); + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + error = dump_holder(seq, gh); + if (error) + goto out; + } + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + error = glops->go_dump(seq, gl); +out: + return error; +} + + + + +int __init gfs2_glock_init(void) +{ + unsigned i; + for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { + INIT_HLIST_BL_HEAD(&gl_hash_table[i]); + } + + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZABLE, 0); + if (IS_ERR(glock_workqueue)) + return PTR_ERR(glock_workqueue); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZABLE, + 0); + if (IS_ERR(gfs2_delete_workqueue)) { + destroy_workqueue(glock_workqueue); + return PTR_ERR(gfs2_delete_workqueue); + } + + register_shrinker(&glock_shrinker); + + return 0; +} + +void gfs2_glock_exit(void) +{ + unregister_shrinker(&glock_shrinker); + destroy_workqueue(glock_workqueue); + destroy_workqueue(gfs2_delete_workqueue); +} + +static inline struct gfs2_glock *glock_hash_chain(unsigned hash) +{ + return 
hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), + struct gfs2_glock, gl_list); +} + +static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) +{ + return hlist_bl_entry(rcu_dereference(gl->gl_list.next), + struct gfs2_glock, gl_list); +} + +static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) +{ + struct gfs2_glock *gl; + + do { + gl = gi->gl; + if (gl) { + gi->gl = glock_hash_next(gl); + } else { + gi->gl = glock_hash_chain(gi->hash); + } + while (gi->gl == NULL) { + gi->hash++; + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + } + /* Skip entries for other sb and dead entries */ + } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); + + return 0; +} + +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + loff_t n = *pos; + + gi->hash = 0; + rcu_read_lock(); + + do { + if (gfs2_glock_iter_next(gi)) + return NULL; + } while (n--); + + return gi->gl; +} + +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + + (*pos)++; + + if (gfs2_glock_iter_next(gi)) + return NULL; + + return gi->gl; +} + +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock_iter *gi = seq->private; + + if (gi->gl) + rcu_read_unlock(); + gi->gl = NULL; +} + +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) +{ + return dump_glock(seq, iter_ptr); +} + +static const struct seq_operations gfs2_glock_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glock_seq_show, +}; + +static int gfs2_debugfs_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glock_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; +} + +static const struct file_operations gfs2_debug_fops = { + .owner = THIS_MODULE, + .open = gfs2_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) +{ + sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); + if (!sdp->debugfs_dir) + return -ENOMEM; + sdp->debugfs_dentry_glocks = debugfs_create_file("glocks", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_debug_fops); + if (!sdp->debugfs_dentry_glocks) + return -ENOMEM; + + return 0; +} + +void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) +{ + if (sdp && sdp->debugfs_dir) { + if (sdp->debugfs_dentry_glocks) { + debugfs_remove(sdp->debugfs_dentry_glocks); + sdp->debugfs_dentry_glocks = NULL; + } + debugfs_remove(sdp->debugfs_dir); + sdp->debugfs_dir = NULL; + } +} + +int gfs2_register_debugfs(void) +{ + gfs2_root = debugfs_create_dir("gfs2", NULL); + return gfs2_root ? 0 : -ENOMEM; +} + +void gfs2_unregister_debugfs(void) +{ + debugfs_remove(gfs2_root); + gfs2_root = NULL; +} diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h new file mode 100644 index 00000000..6b2f757b --- /dev/null +++ b/fs/gfs2/glock.h @@ -0,0 +1,244 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOCK_DOT_H__ +#define __GLOCK_DOT_H__ + +#include +#include +#include "incore.h" + +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. + */ + +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_NOCACHE 0x00000400 + +/* + * lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. 
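+ *
+ * LM_OUT_ERROR
+ * An error was returned for the locking request.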
+ * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ERROR 0x00000004 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + +#define GLR_TRYFAILED 13 + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct gfs2_glock *gl); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +extern struct workqueue_struct *gfs2_delete_workqueue; +static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + struct pid *pid; + + /* Look in glock's list of holders for one with current task as owner */ + spin_lock(&gl->gl_spin); + pid = task_pid(current); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; + if (gh->gh_owner_pid == pid) + goto out; + } + gh = NULL; +out: + spin_unlock(&gl->gl_spin); + + return gh; +} + +static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_EXCLUSIVE; +} + +static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_DEFERRED; +} + +static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_SHARED; +} + +static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) +{ + if (gl->gl_ops->go_flags & GLOF_ASPACE) + return (struct address_space *)(gl + 1); + return NULL; +} + +int gfs2_glock_get(struct gfs2_sbd *sdp, + u64 number, const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put_nolock(struct gfs2_glock *gl); +void gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq(struct gfs2_holder *gh); +int gfs2_glock_poll(struct gfs2_holder *gh); +int gfs2_glock_wait(struct gfs2_holder *gh); +void gfs2_glock_dq(struct gfs2_holder *gh); +void gfs2_glock_dq_wait(struct gfs2_holder *gh); + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, + u64 number, const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh); + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +__attribute__ ((format(printf, 2, 3))) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); + +/** + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Returns: 0, GLR_*, or errno + */ + +static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, + unsigned int state, int flags, + struct gfs2_holder *gh) +{ + int error; + + gfs2_holder_init(gl, state, flags, gh); + + error = gfs2_glock_nq(gh); + if (error) + gfs2_holder_uninit(gh); + + return error; +} + 
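+/*
+ * A minimal usage sketch; "gl" stands for any glock the caller already
+ * holds a reference to:
+ *
+ *   struct gfs2_holder gh;
+ *   int error;
+ *
+ *   error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
+ *   if (error)
+ *           return error;
+ *   ... access the object under the shared lock ...
+ *   gfs2_glock_dq_uninit(&gh);
+ */
+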
+extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); + +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); + +extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern int gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); + +extern const struct lm_lockops gfs2_dlm_ops; + +#endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c new file mode 100644 index 00000000..2cca2931 --- /dev/null +++ b/fs/gfs2/glops.c @@ -0,0 +1,605 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "util.h" +#include "trans.h" + +/** + * __gfs2_ail_flush - remove all buffers for a given lock from the AIL + * @gl: the glock + * + * None of the buffers should be dirty, locked, or pinned. + */ + +static void __gfs2_ail_flush(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *head = &gl->gl_ail_list; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, + bd_ail_gl_list); + bh = bd->bd_bh; + gfs2_remove_from_ail(bd); + bd->bd_bh = NULL; + bh->b_private = NULL; + spin_unlock(&sdp->sd_ail_lock); + + bd->bd_blkno = bh->b_blocknr; + gfs2_log_lock(sdp); + gfs2_assert_withdraw(sdp, !buffer_busy(bh)); + gfs2_trans_add_revoke(sdp, bd); + gfs2_log_unlock(sdp); + + spin_lock(&sdp->sd_ail_lock); + } + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + spin_unlock(&sdp->sd_ail_lock); +} + + +static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_trans tr; + + memset(&tr, 0, sizeof(tr)); + tr.tr_revokes = atomic_read(&gl->gl_ail_count); + + if (!tr.tr_revokes) + return; + + /* A shortened, inline version of gfs2_trans_begin() */ + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_ip = (unsigned long)__builtin_return_address(0); + INIT_LIST_HEAD(&tr.tr_list_buf); + gfs2_log_reserve(sdp, tr.tr_reserved); + BUG_ON(current->journal_info); + current->journal_info = &tr; + + __gfs2_ail_flush(gl); + + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +void gfs2_ail_flush(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int revokes = atomic_read(&gl->gl_ail_count); + int ret; + + if (!revokes) + return; + + ret = gfs2_trans_begin(sdp, 0, revokes); + if (ret) + return; + __gfs2_ail_flush(gl); + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +/** + * rgrp_go_sync - sync out the metadata for this glock + * @gl: the 
glock + * + * Called when demoting or unlocking an EX glock. We must flush + * to disk all dirty buffers/pages relating to this glock, and must not + * not return to caller to demote/unlock the glock until I/O is complete. + */ + +static void rgrp_go_sync(struct gfs2_glock *gl) +{ + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_sbd, gl); + filemap_fdatawrite(metamapping); + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); +} + +/** + * rgrp_go_inval - invalidate the metadata for this glock + * @gl: the glock + * @flags: + * + * We never used LM_ST_DEFERRED with resource groups, so that we + * should always see the metadata flag set here. + * + */ + +static void rgrp_go_inval(struct gfs2_glock *gl, int flags) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + + BUG_ON(!(flags & DIO_METADATA)); + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages(mapping, 0); + + if (gl->gl_object) { + struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + } +} + +/** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * + */ + +static void inode_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + if (ip && !S_ISREG(ip->i_inode.i_mode)) + ip = NULL; + if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + + BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_sbd, gl); + filemap_fdatawrite(metamapping); + if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + mapping_set_error(mapping, error); + } + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_clear_bit(); + clear_bit(GLF_DIRTY, &gl->gl_flags); +} + +/** + * inode_go_inval - prepare a inode glock to be released + * @gl: the glock + * @flags: + * + * Normally we invlidate everything, but if we are moving into + * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we + * can keep hold of the metadata, since it won't have changed. 
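+ * The DIO_METADATA flag controls whether the metadata address space is
+ * truncated and GIF_INVALID set; data pages of a regular file are always
+ * truncated when an inode is attached to the glock.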
+ * + */ + +static void inode_go_inval(struct gfs2_glock *gl, int flags) +{ + struct gfs2_inode *ip = gl->gl_object; + + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + + if (flags & DIO_METADATA) { + struct address_space *mapping = gfs2_glock2aspace(gl); + truncate_inode_pages(mapping, 0); + if (ip) { + set_bit(GIF_INVALID, &ip->i_flags); + forget_all_cached_acls(&ip->i_inode); + } + } + + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL); + gl->gl_sbd->sd_rindex_uptodate = 0; + } + if (ip && S_ISREG(ip->i_inode.i_mode)) + truncate_inode_pages(ip->i_inode.i_mapping, 0); +} + +/** + * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int inode_go_demote_ok(const struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_holder *gh; + + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (gh->gh_list.next != &gl->gl_holders) + return 0; + } + + return 1; +} + +/** + * gfs2_set_nlink - Set the inode's link count based on on-disk info + * @inode: The inode in question + * @nlink: The link count + * + * If the link count has hit zero, it must never be raised, whatever the + * on-disk inode might say. When new struct inodes are created the link + * count is set to 1, so that we can safely use this test even when reading + * in on disk information for the first time. + */ + +static void gfs2_set_nlink(struct inode *inode, u32 nlink) +{ + /* + * We will need to review setting the nlink count here in the + * light of the forthcoming ro bind mount work. This is a reminder + * to do that. 
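+ *
+ * For now the count is only changed when it differs from the on-disk
+ * value and has not already reached zero.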
+ */ + if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { + if (nlink == 0) + clear_nlink(inode); + else + inode->i_nlink = nlink; + } +} + +static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +{ + const struct gfs2_dinode *str = buf; + struct timespec atime; + u16 height, depth; + + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); + ip->i_inode.i_mode = be32_to_cpu(str->di_mode); + ip->i_inode.i_rdev = 0; + switch (ip->i_inode.i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + }; + + ip->i_inode.i_uid = be32_to_cpu(str->di_uid); + ip->i_inode.i_gid = be32_to_cpu(str->di_gid); + gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; + ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + + ip->i_goal = be64_to_cpu(str->di_goal_meta); + ip->i_generation = be64_to_cpu(str->di_generation); + + ip->i_diskflags = be32_to_cpu(str->di_flags); + gfs2_set_inode_flags(&ip->i_inode); + height = be16_to_cpu(str->di_height); + if (unlikely(height > GFS2_MAX_META_HEIGHT)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; + ip->i_entries = be32_to_cpu(str->di_entries); + + ip->i_eattr = be64_to_cpu(str->di_eattr); + if (S_ISREG(ip->i_inode.i_mode)) + gfs2_set_aops(&ip->i_inode); + + return 0; +corrupt: + gfs2_consist_inode(ip); + return -EIO; +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { + brelse(dibh); + return -EIO; + } + + error = gfs2_dinode_in(ip, dibh->b_data); + brelse(dibh); + clear_bit(GIF_INVALID, &ip->i_flags); + + return error; +} + +/** + * inode_go_lock - operation done after an inode lock is locked by a process + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int inode_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + int error = 0; + + if (!ip || (gh->gh_flags & GL_SKIP)) + return 0; + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + return error; + } + + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && + (gl->gl_state == LM_ST_EXCLUSIVE) && + (gh->gh_state == LM_ST_EXCLUSIVE)) { + spin_lock(&sdp->sd_trunc_lock); + if (list_empty(&ip->i_trunc_list)) + list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + wake_up(&sdp->sd_quota_wait); + return 1; + } + + return error; +} + +/** + * inode_go_dump - print information about an inode + * @seq: The 
iterator + * @ip: the inode + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_inode *ip = gl->gl_object; + if (ip == NULL) + return 0; + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(ip->i_inode.i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)i_size_read(&ip->i_inode)); + return 0; +} + +/** + * rgrp_go_lock - operation done after an rgrp lock is locked by + * a first holder on this node. + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int rgrp_go_lock(struct gfs2_holder *gh) +{ + return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); +} + +/** + * rgrp_go_unlock - operation done before an rgrp lock is unlocked by + * a last holder on this node. + * @gl: the glock + * @flags: + * + */ + +static void rgrp_go_unlock(struct gfs2_holder *gh) +{ + gfs2_rgrp_bh_put(gh->gh_gl->gl_object); +} + +/** + * trans_go_sync - promote/demote the transaction glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void trans_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (gl->gl_state != LM_ST_UNLOCKED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + } +} + +/** + * trans_go_xmote_bh - After promoting/demoting the transaction glock + * @gl: the glock + * + */ + +static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header_host head; + int error; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + gfs2_consist(sdp); + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) + gfs2_consist(sdp); + + /* Initialize some head of the log stuff */ + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + } + } + return 0; +} + +/** + * trans_go_demote_ok + * @gl: the glock + * + * Always returns 0 + */ + +static int trans_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && ip) { + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put_nolock(gl); + } +} + +const struct gfs2_glock_operations gfs2_meta_glops = { + .go_type = LM_TYPE_META, +}; + +const struct gfs2_glock_operations gfs2_inode_glops = { + .go_xmote_th = inode_go_sync, + .go_inval = inode_go_inval, + .go_demote_ok = inode_go_demote_ok, + .go_lock = inode_go_lock, + .go_dump = inode_go_dump, + .go_type = LM_TYPE_INODE, + .go_min_hold_time = HZ / 5, + .go_flags = GLOF_ASPACE, +}; + +const struct gfs2_glock_operations gfs2_rgrp_glops = { + .go_xmote_th = rgrp_go_sync, + .go_inval = rgrp_go_inval, + .go_lock = 
rgrp_go_lock, + .go_unlock = rgrp_go_unlock, + .go_dump = gfs2_rgrp_dump, + .go_type = LM_TYPE_RGRP, + .go_min_hold_time = HZ / 5, + .go_flags = GLOF_ASPACE, +}; + +const struct gfs2_glock_operations gfs2_trans_glops = { + .go_xmote_th = trans_go_sync, + .go_xmote_bh = trans_go_xmote_bh, + .go_demote_ok = trans_go_demote_ok, + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_iopen_glops = { + .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, +}; + +const struct gfs2_glock_operations gfs2_flock_glops = { + .go_type = LM_TYPE_FLOCK, +}; + +const struct gfs2_glock_operations gfs2_nondisk_glops = { + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_quota_glops = { + .go_type = LM_TYPE_QUOTA, +}; + +const struct gfs2_glock_operations gfs2_journal_glops = { + .go_type = LM_TYPE_JOURNAL, +}; + +const struct gfs2_glock_operations *gfs2_glops_list[] = { + [LM_TYPE_META] = &gfs2_meta_glops, + [LM_TYPE_INODE] = &gfs2_inode_glops, + [LM_TYPE_RGRP] = &gfs2_rgrp_glops, + [LM_TYPE_IOPEN] = &gfs2_iopen_glops, + [LM_TYPE_FLOCK] = &gfs2_flock_glops, + [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, + [LM_TYPE_QUOTA] = &gfs2_quota_glops, + [LM_TYPE_JOURNAL] = &gfs2_journal_glops, +}; + diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h new file mode 100644 index 00000000..6fce409b --- /dev/null +++ b/fs/gfs2/glops.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOPS_DOT_H__ +#define __GLOPS_DOT_H__ + +#include "incore.h" + +extern const struct gfs2_glock_operations gfs2_meta_glops; +extern const struct gfs2_glock_operations gfs2_inode_glops; +extern const struct gfs2_glock_operations gfs2_rgrp_glops; +extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_iopen_glops; +extern const struct gfs2_glock_operations gfs2_flock_glops; +extern const struct gfs2_glock_operations gfs2_nondisk_glops; +extern const struct gfs2_glock_operations gfs2_quota_glops; +extern const struct gfs2_glock_operations gfs2_journal_glops; +extern const struct gfs2_glock_operations *gfs2_glops_list[]; + +extern void gfs2_ail_flush(struct gfs2_glock *gl); + +#endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h new file mode 100644 index 00000000..81206e70 --- /dev/null +++ b/fs/gfs2/incore.h @@ -0,0 +1,686 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIO_WAIT 0x00000010 +#define DIO_METADATA 0x00000020 + +struct gfs2_log_operations; +struct gfs2_log_element; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_quota_data; +struct gfs2_trans; +struct gfs2_ail; +struct gfs2_jdesc; +struct gfs2_sbd; +struct lm_lockops; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +struct gfs2_log_header_host { + u64 lh_sequence; /* Sequence number of this transaction */ + u32 lh_flags; /* GFS2_LOG_HEAD_... */ + u32 lh_tail; /* Block number of log tail */ + u32 lh_blkno; + u32 lh_hash; +}; + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); + void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + const char *lo_name; +}; + +struct gfs2_log_element { + struct list_head le_list; + const struct gfs2_log_operations *le_ops; +}; + +#define GBF_FULL 1 + +struct gfs2_bitmap { + struct buffer_head *bi_bh; + char *bi_clone; + unsigned long bi_flags; + u32 bi_offset; + u32 bi_start; + u32 bi_len; +}; + +struct gfs2_rgrpd { + struct list_head rd_list; /* Link with superblock */ + struct list_head rd_list_mru; + struct gfs2_glock *rd_gl; /* Glock for this rgrp */ + u64 rd_addr; /* grp block disk address */ + u64 rd_data0; /* first data location */ + u32 rd_length; /* length of rgrp header in fs blocks */ + u32 rd_data; /* num of data blocks in rgrp */ + u32 rd_bitbytes; /* number of bytes in data bitmaps */ + u32 rd_free; + u32 rd_free_clone; + u32 rd_dinodes; + u64 rd_igeneration; + struct gfs2_bitmap *rd_bits; + struct mutex rd_mutex; + struct gfs2_log_element rd_le; + struct gfs2_sbd *rd_sbd; + unsigned int rd_bh_count; + u32 rd_last_alloc; + u32 rd_flags; +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ +}; + +enum gfs2_state_bits { + BH_Pinned = BH_PrivateStart, + BH_Escaped = BH_PrivateStart + 1, +}; + +BUFFER_FNS(Pinned, pinned) +TAS_BUFFER_FNS(Pinned, pinned) +BUFFER_FNS(Escaped, escaped) +TAS_BUFFER_FNS(Escaped, escaped) + +struct gfs2_bufdata { + struct buffer_head *bd_bh; + struct gfs2_glock *bd_gl; + + union { + struct list_head list_tr; + u64 blkno; + } u; +#define bd_list_tr u.list_tr +#define bd_blkno u.blkno + + struct gfs2_log_element bd_le; + + struct gfs2_ail *bd_ail; + struct list_head bd_ail_st_list; + struct list_head bd_ail_gl_list; +}; + +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. 
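+ *
+ * DFL_BLOCK_LOCKS is consulted by gfs2_glock_complete() to decide whether
+ * an incoming dlm reply should be frozen (GLF_FROZEN) rather than
+ * processed immediately, typically while the lock module is recovering.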
+ */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +enum { + DFL_BLOCK_LOCKS = 0, +}; + +struct lm_lockname { + u64 ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) + + +struct gfs2_glock_operations { + void (*go_xmote_th) (struct gfs2_glock *gl); + int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); + void (*go_inval) (struct gfs2_glock *gl, int flags); + int (*go_demote_ok) (const struct gfs2_glock *gl); + int (*go_lock) (struct gfs2_holder *gh); + void (*go_unlock) (struct gfs2_holder *gh); + int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback) (struct gfs2_glock *gl); + const int go_type; + const unsigned long go_min_hold_time; + const unsigned long go_flags; +#define GLOF_ASPACE 1 +}; + +enum { + /* States */ + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ + HIF_FIRST = 7, + HIF_WAIT = 10, +}; + +struct gfs2_holder { + struct list_head gh_list; + + struct gfs2_glock *gh_gl; + struct pid *gh_owner_pid; + unsigned int gh_state; + unsigned gh_flags; + + int gh_error; + unsigned long gh_iflags; /* HIF_... */ + unsigned long gh_ip; +}; + +enum { + GLF_LOCK = 1, + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, + GLF_QUEUED = 12, + GLF_LRU = 13, + GLF_OBJECT = 14, /* Used only for tracing */ +}; + +struct gfs2_glock { + struct hlist_bl_node gl_list; + unsigned long gl_flags; /* GLF_... */ + struct lm_lockname gl_name; + atomic_t gl_ref; + + spinlock_t gl_spin; + + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + + unsigned int gl_hash; + unsigned long gl_demote_time; /* time of first demote request */ + struct list_head gl_holders; + + const struct gfs2_glock_operations *gl_ops; + char gl_strname[GDLM_STRNAME_BYTES]; + struct dlm_lksb gl_lksb; + char gl_lvb[32]; + unsigned long gl_tchange; + void *gl_object; + + struct list_head gl_lru; + + struct gfs2_sbd *gl_sbd; + + struct list_head gl_ail_list; + atomic_t gl_ail_count; + atomic_t gl_revokes; + struct delayed_work gl_work; + struct work_struct gl_delete; + struct rcu_head gl_rcu; +}; + +#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ + +struct gfs2_alloc { + /* Quota stuff */ + + struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; + struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; + unsigned int al_qd_num; + + u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + u32 al_alloced; /* Filled in by gfs2_alloc_*() */ + + /* Filled in by gfs2_inplace_reserve() */ + + unsigned int al_line; + char *al_file; + struct gfs2_holder al_ri_gh; + struct gfs2_holder al_rgd_gh; + struct gfs2_rgrpd *al_rgd; + +}; + +enum { + GIF_INVALID = 0, + GIF_QD_LOCKED = 1, + GIF_SW_PAGED = 3, +}; + + +struct gfs2_inode { + struct inode i_inode; + u64 i_no_addr; + u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; + unsigned long i_flags; /* GIF_... */ + struct gfs2_glock *i_gl; /* Move into i_gh? 
*/ + struct gfs2_holder i_iopen_gh; + struct gfs2_holder i_gh; /* for prepare/commit_write only */ + struct gfs2_alloc *i_alloc; + u64 i_goal; /* goal block for allocations */ + struct rw_semaphore i_rw_mutex; + struct list_head i_trunc_list; + u32 i_entries; + u32 i_diskflags; + u8 i_height; + u8 i_depth; +}; + +/* + * Since i_inode is the first element of struct gfs2_inode, + * this is effectively a cast. + */ +static inline struct gfs2_inode *GFS2_I(struct inode *inode) +{ + return container_of(inode, struct gfs2_inode, i_inode); +} + +static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +struct gfs2_file { + struct mutex f_fl_mutex; + struct gfs2_holder f_fl_gh; +}; + +struct gfs2_revoke_replay { + struct list_head rr_list; + u64 rr_blkno; + unsigned int rr_where; +}; + +enum { + QDF_USER = 0, + QDF_CHANGE = 1, + QDF_LOCKED = 2, + QDF_REFRESH = 3, +}; + +struct gfs2_quota_data { + struct list_head qd_list; + struct list_head qd_reclaim; + + atomic_t qd_count; + + u32 qd_id; + unsigned long qd_flags; /* QDF_... */ + + s64 qd_change; + s64 qd_change_sync; + + unsigned int qd_slot; + unsigned int qd_slot_count; + + struct buffer_head *qd_bh; + struct gfs2_quota_change *qd_bh_qc; + unsigned int qd_bh_count; + + struct gfs2_glock *qd_gl; + struct gfs2_quota_lvb qd_qb; + + u64 qd_sync_gen; + unsigned long qd_last_warn; +}; + +struct gfs2_trans { + unsigned long tr_ip; + + unsigned int tr_blocks; + unsigned int tr_revokes; + unsigned int tr_reserved; + + struct gfs2_holder tr_t_gh; + + int tr_touched; + + unsigned int tr_num_buf; + unsigned int tr_num_buf_new; + unsigned int tr_num_databuf_new; + unsigned int tr_num_buf_rm; + unsigned int tr_num_databuf_rm; + struct list_head tr_list_buf; + + unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; +}; + +struct gfs2_ail { + struct list_head ai_list; + + unsigned int ai_first; + struct list_head ai_ail1_list; + struct list_head ai_ail2_list; +}; + +struct gfs2_journal_extent { + struct list_head extent_list; + + unsigned int lblock; /* First logical block */ + u64 dblock; /* First disk block */ + u64 blocks; +}; + +struct gfs2_jdesc { + struct list_head jd_list; + struct list_head extent_list; + struct work_struct jd_work; + struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 + unsigned int jd_jid; + unsigned int jd_blocks; +}; + +struct gfs2_statfs_change_host { + s64 sc_total; + s64 sc_free; + s64 sc_dinodes; +}; + +#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF +#define GFS2_QUOTA_OFF 0 +#define GFS2_QUOTA_ACCOUNT 1 +#define GFS2_QUOTA_ON 2 + +#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED +#define GFS2_DATA_WRITEBACK 1 +#define GFS2_DATA_ORDERED 2 + +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + +struct gfs2_args { + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int ar_data:2; /* 
ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ + unsigned int ar_nobarrier:1; /* do not send barriers */ + int ar_commit; /* Commit interval */ + int ar_statfs_quantum; /* The fast statfs interval */ + int ar_quota_quantum; /* The quota interval */ + int ar_statfs_percent; /* The % change to force sync */ +}; + +struct gfs2_tune { + spinlock_t gt_spin; + + unsigned int gt_logd_secs; + + unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ + unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ + unsigned int gt_quota_scale_num; /* Numerator */ + unsigned int gt_quota_scale_den; /* Denominator */ + unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ + unsigned int gt_new_files_jdata; + unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ + unsigned int gt_complain_secs; + unsigned int gt_statfs_quantum; + unsigned int gt_statfs_slow; +}; + +enum { + SDF_JOURNAL_CHECKED = 0, + SDF_JOURNAL_LIVE = 1, + SDF_SHUTDOWN = 2, + SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, +}; + +#define GFS2_FSNAME_LEN 256 + +struct gfs2_inum_host { + u64 no_formal_ino; + u64 no_addr; +}; + +struct gfs2_sb_host { + u32 sb_magic; + u32 sb_type; + u32 sb_format; + + u32 sb_fs_format; + u32 sb_multihost_format; + u32 sb_bsize; + u32 sb_bsize_shift; + + struct gfs2_inum_host sb_master_dir; + struct gfs2_inum_host sb_root_dir; + + char sb_lockproto[GFS2_LOCKNAME_LEN]; + char sb_locktable[GFS2_LOCKNAME_LEN]; +}; + +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + int ls_jid; + unsigned int ls_first; + unsigned int ls_first_done; + unsigned int ls_nodir; + const struct lm_lockops *ls_ops; + unsigned long ls_flags; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid_done; + int ls_recover_jid_status; +}; + +struct gfs2_sbd { + struct super_block *sd_vfs; + struct kobject sd_kobj; + unsigned long sd_flags; /* SDF_... 
*/ + struct gfs2_sb_host sd_sb; + + /* Constants computed on mount */ + + u32 sd_fsb2bb; + u32 sd_fsb2bb_shift; + u32 sd_diptrs; /* Number of pointers in a dinode */ + u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_jbsize; /* Size of a journaled data block */ + u32 sd_hash_bsize; /* sizeof(exhash block) */ + u32 sd_hash_bsize_shift; + u32 sd_hash_ptrs; /* Number of pointers in a hash block */ + u32 sd_qc_per_block; + u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ + u32 sd_max_height; /* Max height of a file's metadata tree */ + u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; + u32 sd_max_jheight; /* Max height of journaled file's meta tree */ + u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; + + struct gfs2_args sd_args; /* Mount arguments */ + struct gfs2_tune sd_tune; /* Filesystem tuning structure */ + + /* Lock Stuff */ + + struct lm_lockstruct sd_lockstruct; + struct gfs2_holder sd_live_gh; + struct gfs2_glock *sd_rename_gl; + struct gfs2_glock *sd_trans_gl; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; + struct completion sd_locking_init; + + /* Inode Stuff */ + + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + + struct inode *sd_jindex; + struct inode *sd_statfs_inode; + struct inode *sd_sc_inode; + struct inode *sd_qc_inode; + struct inode *sd_rindex; + struct inode *sd_quota_inode; + + /* StatFS stuff */ + + spinlock_t sd_statfs_spin; + struct gfs2_statfs_change_host sd_statfs_master; + struct gfs2_statfs_change_host sd_statfs_local; + int sd_statfs_force_sync; + + /* Resource group stuff */ + + int sd_rindex_uptodate; + spinlock_t sd_rindex_spin; + struct mutex sd_rindex_mutex; + struct list_head sd_rindex_list; + struct list_head sd_rindex_mru_list; + struct gfs2_rgrpd *sd_rindex_forward; + unsigned int sd_rgrps; + unsigned int sd_max_rg_data; + + /* Journal index stuff */ + + struct list_head sd_jindex_list; + spinlock_t sd_jindex_spin; + struct mutex sd_jindex_mutex; + unsigned int sd_journals; + + struct gfs2_jdesc *sd_jdesc; + struct gfs2_holder sd_journal_gh; + struct gfs2_holder sd_jinode_gh; + + struct gfs2_holder sd_sc_gh; + struct gfs2_holder sd_qc_gh; + + /* Daemon stuff */ + + struct task_struct *sd_logd_process; + struct task_struct *sd_quotad_process; + + /* Quota stuff */ + + struct list_head sd_quota_list; + atomic_t sd_quota_count; + struct mutex sd_quota_mutex; + wait_queue_head_t sd_quota_wait; + struct list_head sd_trunc_list; + spinlock_t sd_trunc_lock; + + unsigned int sd_quota_slots; + unsigned int sd_quota_chunks; + unsigned char **sd_quota_bitmap; + + u64 sd_quota_sync_gen; + + /* Log stuff */ + + spinlock_t sd_log_lock; + + unsigned int sd_log_blks_reserved; + unsigned int sd_log_commited_buf; + unsigned int sd_log_commited_databuf; + int sd_log_commited_revoke; + + atomic_t sd_log_pinned; + unsigned int sd_log_num_buf; + unsigned int sd_log_num_revoke; + unsigned int sd_log_num_rg; + unsigned int sd_log_num_databuf; + + struct list_head sd_log_le_buf; + struct list_head sd_log_le_revoke; + struct list_head sd_log_le_rg; + struct list_head sd_log_le_databuf; + struct list_head sd_log_le_ordered; + + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; + atomic_t sd_log_blks_free; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; + + u64 sd_log_sequence; + unsigned int sd_log_head; + unsigned int sd_log_tail; + int sd_log_idle; + + struct rw_semaphore sd_log_flush_lock; + atomic_t sd_log_in_flight; + wait_queue_head_t sd_log_flush_wait; + + unsigned int 
sd_log_flush_head; + u64 sd_log_flush_wrapped; + + spinlock_t sd_ail_lock; + struct list_head sd_ail1_list; + struct list_head sd_ail2_list; + + /* Replay stuff */ + + struct list_head sd_revoke_list; + unsigned int sd_replay_tail; + + unsigned int sd_found_blocks; + unsigned int sd_found_revokes; + unsigned int sd_replayed_blocks; + + /* For quiescing the filesystem */ + + struct gfs2_holder sd_freeze_gh; + struct mutex sd_freeze_lock; + unsigned int sd_freeze_count; + + char sd_fsname[GFS2_FSNAME_LEN]; + char sd_table_name[GFS2_FSNAME_LEN]; + char sd_proto_name[GFS2_FSNAME_LEN]; + + /* Debugging crud */ + + unsigned long sd_last_warning; + struct dentry *debugfs_dir; /* debugfs directory */ + struct dentry *debugfs_dentry_glocks; /* for debugfs */ +}; + +#endif /* __INCORE_DOT_H__ */ + diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c new file mode 100644 index 00000000..03e0c529 --- /dev/null +++ b/fs/gfs2/inode.c @@ -0,0 +1,1892 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "super.h" +#include "glops.h" + +struct gfs2_skip_data { + u64 no_addr; + int skipped; + int non_block; +}; + +static int iget_test(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_skip_data *data = opaque; + + if (ip->i_no_addr == data->no_addr) { + if (data->non_block && + inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + data->skipped = 1; + return 0; + } + return 1; + } + return 0; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_skip_data *data = opaque; + + if (data->skipped) + return -ENOENT; + inode->i_ino = (unsigned long)(data->no_addr); + ip->i_no_addr = data->no_addr; + return 0; +} + +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block) +{ + unsigned long hash = (unsigned long)no_addr; + struct gfs2_skip_data data; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return ilookup5(sb, hash, iget_test, &data); +} + +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr, + int non_block) +{ + struct gfs2_skip_data data; + unsigned long hash = (unsigned long)no_addr; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return iget5_locked(sb, hash, iget_test, iget_set, &data); +} + +/** + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in + * + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). 
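+ * Based on i_mode, the inode is given the file, directory or symlink
+ * operation tables; anything else is handled via init_special_inode().
+ * When localflocks is in use, the *_nolock file operations are selected
+ * instead.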
+ */ + +static void gfs2_set_iop(struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + umode_t mode = inode->i_mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_file_fops_nolock; + else + inode->i_fop = &gfs2_file_fops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_dir_fops_nolock; + else + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_file_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } +} + +/** + * gfs2_inode_lookup - Lookup an inode + * @sb: The super block + * @no_addr: The inode number + * @type: The type of the inode + * non_block: Can we block on inodes that are being freed? + * + * Returns: A VFS inode, or an error + */ + +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino, int non_block) +{ + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_glock *io_gl = NULL; + int error; + + inode = gfs2_iget(sb, no_addr, non_block); + ip = GFS2_I(inode); + + if (!inode) + return ERR_PTR(-ENOBUFS); + + if (inode->i_state & I_NEW) { + struct gfs2_sbd *sdp = GFS2_SB(inode); + ip->i_no_formal_ino = no_formal_ino; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + set_bit(GIF_INVALID, &ip->i_flags); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (unlikely(error)) + goto fail_iopen; + + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + io_gl = NULL; + + if (type == DT_UNKNOWN) { + /* Inode glock must be locked already */ + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_refresh; + } else { + inode->i_mode = DT2IF(type); + } + + gfs2_set_iop(inode); + unlock_new_inode(inode); + } + + return inode; + +fail_refresh: + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); +fail_iopen: + if (io_gl) + gfs2_glock_put(io_gl); +fail_put: + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + iget_failed(inode); + return ERR_PTR(error); +} + +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; + struct inode *inode = NULL; + int error; + + /* Must not read in block until block type is verified */ + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) + goto fail; + + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); + if (IS_ERR(inode)) + goto fail; + + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; + + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; + + error = 0; + } + +fail: + gfs2_glock_dq_uninit(&i_gh); + return error ? 
ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; +} + + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +{ + struct qstr qstr; + struct inode *inode; + gfs2_str2qstr(&qstr, name); + inode = gfs2_lookupi(dip, &qstr, 1); + /* gfs2_lookupi has inconsistent callers: vfs + * related routines expect NULL for no entry found, + * gfs2_lookup_simple callers expect ENOENT + * and do not check for NULL. + */ + if (inode == NULL) + return ERR_PTR(-ENOENT); + else + return inode; +} + + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @d_gh: An initialized holder for the directory glock + * @name: The name of the inode to look for + * @is_root: If 1, ignore the caller's permissions + * @i_gh: An uninitialized holder for the new inode glock + * + * This can be called via the VFS filldir function when NFS is doing + * a readdirplus and the inode which its intending to stat isn't + * already in cache. In this case we must not take the directory glock + * again, since the readdir call will have already taken that lock. + * + * Returns: errno + */ + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root) +{ + struct super_block *sb = dir->i_sb; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error = 0; + struct inode *inode = NULL; + int unlock = 0; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || + (name->len == 2 && memcmp(name->name, "..", 2) == 0 && + dir == sb->s_root->d_inode)) { + igrab(dir); + return dir; + } + + if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + unlock = 1; + } + + if (!is_root) { + error = gfs2_permission(dir, MAY_EXEC, 0); + if (error) + goto out; + } + + inode = gfs2_dir_search(dir, name); + if (IS_ERR(inode)) + error = PTR_ERR(inode); +out: + if (unlock) + gfs2_glock_dq_uninit(&d_gh); + if (error == -ENOENT) + return NULL; + return inode ? inode : ERR_PTR(error); +} + +/** + * create_ok - OK to create a new on-disk inode here? 
+ * @dip: Directory in which dinode is to be created + * @name: Name of new dinode + * @mode: + * + * Returns: errno + */ + +static int create_ok(struct gfs2_inode *dip, const struct qstr *name, + unsigned int mode) +{ + int error; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + if (error) + return error; + + /* Don't create entries in an unlinked directory */ + if (!dip->i_inode.i_nlink) + return -ENOENT; + + error = gfs2_dir_check(&dip->i_inode, name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + return -EEXIST; + default: + return error; + } + + if (dip->i_entries == (u32)-1) + return -EFBIG; + if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) + return -EMLINK; + + return 0; +} + +static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, + unsigned int *uid, unsigned int *gid) +{ + if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && + (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { + if (S_ISDIR(*mode)) + *mode |= S_ISUID; + else if (dip->i_inode.i_uid != current_fsuid()) + *mode &= ~07111; + *uid = dip->i_inode.i_uid; + } else + *uid = current_fsuid(); + + if (dip->i_inode.i_mode & S_ISGID) { + if (S_ISDIR(*mode)) + *mode |= S_ISGID; + *gid = dip->i_inode.i_gid; + } else + *gid = current_fsgid(); +} + +static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + int error; + + if (gfs2_alloc_get(dip) == NULL) + return -ENOMEM; + + dip->i_alloc->al_requested = RES_DINODE; + error = gfs2_inplace_reserve(dip); + if (error) + goto out; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + if (error) + goto out_ipreserv; + + error = gfs2_alloc_di(dip, no_addr, generation); + + gfs2_trans_end(sdp); + +out_ipreserv: + gfs2_inplace_release(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +static void gfs2_init_dir(struct buffer_head *dibh, + const struct gfs2_inode *parent) +{ + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_inum_out(parent, dent); + dent->de_type = cpu_to_be16(DT_DIR); + +} + +/** + * init_dinode - Fill in a new dinode structure + * @dip: The directory this inode is being created in + * @gl: The glock covering the new inode + * @inum: The inode number + * @mode: The file permissions + * @uid: The uid of the new inode + * @gid: The gid of the new inode + * @generation: The generation number of the new inode + * @dev: The device number (if a device node) + * @symname: The symlink destination (if a symlink) + * @size: The inode size (ignored for directories) + * @bhp: The buffer head (returned to caller) + * + */ + +static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + const struct gfs2_inum_host *inum, unsigned int mode, + unsigned int uid, unsigned int gid, + const u64 *generation, dev_t dev, const char *symname, + unsigned size, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_dinode *di; + struct buffer_head *dibh; + struct timespec tv = CURRENT_TIME; + + dibh = gfs2_meta_new(gl, inum->no_addr); + gfs2_trans_add_bh(gl, dibh, 1); + 
gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + di = (struct gfs2_dinode *)dibh->b_data; + + di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); + di->di_num.no_addr = cpu_to_be64(inum->no_addr); + di->di_mode = cpu_to_be32(mode); + di->di_uid = cpu_to_be32(uid); + di->di_gid = cpu_to_be32(gid); + di->di_nlink = 0; + di->di_size = cpu_to_be64(size); + di->di_blocks = cpu_to_be64(1); + di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); + di->di_major = cpu_to_be32(MAJOR(dev)); + di->di_minor = cpu_to_be32(MINOR(dev)); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); + di->di_generation = cpu_to_be64(*generation); + di->di_flags = 0; + di->__pad1 = 0; + di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); + di->di_height = 0; + di->__pad2 = 0; + di->__pad3 = 0; + di->di_depth = 0; + di->di_entries = 0; + memset(&di->__pad4, 0, sizeof(di->__pad4)); + di->di_eattr = 0; + di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); + di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); + di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); + memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + break; + case S_IFDIR: + di->di_flags |= cpu_to_be32(dip->i_diskflags & + GFS2_DIF_INHERIT_JDATA); + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); + di->di_entries = cpu_to_be32(2); + gfs2_init_dir(dibh, dip); + break; + case S_IFLNK: + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); + break; + } + + set_buffer_uptodate(dibh); + + *bhp = dibh; +} + +static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + unsigned int mode, const struct gfs2_inum_host *inum, + const u64 *generation, dev_t dev, const char *symname, + unsigned int size, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + unsigned int uid, gid; + int error; + + munge_mode_uid_gid(dip, &mode, &uid, &gid); + if (!gfs2_alloc_get(dip)) + return -ENOMEM; + + error = gfs2_quota_lock(dip, uid, gid); + if (error) + goto out; + + error = gfs2_quota_check(dip, uid, gid); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); + if (error) + goto out_quota; + + init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); + gfs2_quota_change(dip, +1, uid, gid); + gfs2_trans_end(sdp); + +out_quota: + gfs2_quota_unlock(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al; + int alloc_required; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(dip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto fail; + + error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); + if (alloc_required < 0) + goto fail_quota_locks; + if (alloc_required) { + error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); + if (error) + goto fail_quota_locks; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto fail_quota_locks; + + error = gfs2_trans_begin(sdp, 
sdp->sd_max_dirres + + al->al_rgd->rd_length + + 2 * RES_DINODE + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto fail_ipreserv; + } else { + error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0); + if (error) + goto fail_quota_locks; + } + + error = gfs2_dir_add(&dip->i_inode, name, ip); + if (error) + goto fail_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_end_trans; + inc_nlink(&ip->i_inode); + if (S_ISDIR(ip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + return 0; + +fail_end_trans: + gfs2_trans_end(sdp); + +fail_ipreserv: + if (dip->i_alloc->al_rgd) + gfs2_inplace_release(dip); + +fail_quota_locks: + gfs2_quota_unlock(dip); + +fail: + gfs2_alloc_put(dip); + return error; +} + +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, + &name, &value, &len); + + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, + GFS2_EATYPE_SECURITY); + kfree(value); + kfree(name); + + return err; +} + +/** + * gfs2_create_inode - Create a new inode + * @dir: The parent directory + * @dentry: The new dentry + * @mode: The permissions on the new inode + * @dev: For device nodes, this is the device number + * @symname: For symlinks, this is the link destination + * @size: The initial size of the inode (ignored for directories) + * + * Returns: 0 on success, or error code + */ + +static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + unsigned int mode, dev_t dev, const char *symname, + unsigned int size) +{ + const struct qstr *name = &dentry->d_name; + struct gfs2_holder ghs[2]; + struct inode *inode = NULL; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; + int error; + u64 generation; + struct buffer_head *bh = NULL; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return -ENAMETOOLONG; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (error) + goto fail; + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock; + + error = alloc_dinode(dip, &inum.no_addr, &generation); + if (error) + goto fail_gunlock; + inum.no_formal_ino = generation; + + error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, + LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock; + + error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); + if (error) + goto fail_gunlock2; + + inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, + inum.no_formal_ino, 0); + if (IS_ERR(inode)) + goto fail_gunlock2; + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_gunlock2; + + error = gfs2_acl_create(dip, inode); + if (error) + goto fail_gunlock2; + + error = gfs2_security_init(dip, GFS2_I(inode), name); + if (error) + goto fail_gunlock2; + + error = link_dinode(dip, name, GFS2_I(inode)); + if (error) + goto fail_gunlock2; + + if (bh) + brelse(bh); + + gfs2_trans_end(sdp); + if (dip->i_alloc->al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + gfs2_glock_dq_uninit_m(2, ghs); + mark_inode_dirty(inode); + d_instantiate(dentry, inode); + return 0; + +fail_gunlock2: + 
gfs2_glock_dq_uninit(ghs + 1); + if (inode && !IS_ERR(inode)) + iput(inode); +fail_gunlock: + gfs2_glock_dq_uninit(ghs); +fail: + if (bh) + brelse(bh); + return error; +} + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct inode *inode; + int ret; + + for (;;) { + ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); + if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) + return ret; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode) { + if (!IS_ERR(inode)) + break; + return PTR_ERR(inode); + } + } + + d_instantiate(dentry, inode); + return 0; +} + +/** + * gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @nd: passed from Linux VFS, ignored by us + * + * Called by the VFS layer. Lock dir and call gfs2_lookupi() + * + * Returns: errno + */ + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode && IS_ERR(inode)) + return ERR_CAST(inode); + + if (inode) { + struct gfs2_glock *gl = GFS2_I(inode)->i_gl; + struct gfs2_holder gh; + int error; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + gfs2_glock_dq_uninit(&gh); + return d_splice_alias(inode, dentry); + } + d_add(dentry, inode); + + return NULL; +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". 
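+ *
+ * As a rough sketch of what happens below: if gfs2_diradd_alloc_required()
+ * reports that the directory must grow to take the new entry, the
+ * transaction is sized as
+ *
+ *	sdp->sd_max_dirres + gfs2_rg_blocks(al) +
+ *	2 * RES_DINODE + RES_STATFS + RES_QUOTA
+ *
+ * blocks (after a quota lock and an in-place reservation); otherwise a
+ * small 2 * RES_DINODE + RES_LEAF transaction is sufficient.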
+ * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = old_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + struct buffer_head *dibh; + int alloc_required; + int error; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_gunlock; + + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(dir, &dentry->d_name, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_inode.i_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_inode.i_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_inode.i_nlink == (u32)-1) + goto out_gunlock; + + alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(dip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + + error = gfs2_quota_lock_check(dip); + if (error) + goto out_alloc; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + gfs2_rg_blocks(al) + + 2 * RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(dir, &dentry->d_name, ip); + if (error) + goto out_brelse; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + inc_nlink(&ip->i_inode); + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(&ip->i_inode); + +out_brelse: + brelse(dibh); +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (alloc_required) + gfs2_inplace_release(dip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(dip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(dip); +out_gunlock: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + if (!error) { + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + } + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. 
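+ *
+ * For example, when the sticky bit (S_ISVTX) is set on @dip, the check
+ * below amounts to (names simplified for illustration):
+ *
+ *	if (dip_uid != current_fsuid() && ip_uid != current_fsuid() &&
+ *	    !capable(CAP_FOWNER))
+ *		return -EPERM;
+ *
+ * i.e. only the file's owner, the directory's owner, or a CAP_FOWNER
+ * holder may remove the entry.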
+ * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + dip->i_inode.i_uid != current_fsuid() && + ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + if (error) + return error; + + error = gfs2_dir_check(&dip->i_inode, name, ip); + if (error) + return error; + + return 0; +} + +/** + * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it + * @dip: The parent directory + * @name: The name of the entry in the parent directory + * @bh: The inode buffer for the inode to be removed + * @inode: The inode to be removed + * + * Called with all the locks and in a transaction. This will only be + * called for a directory after it has been checked to ensure it is empty. + * + * Returns: 0 on success, or an error + */ + +static int gfs2_unlink_inode(struct gfs2_inode *dip, + const struct dentry *dentry, + struct buffer_head *bh) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = gfs2_dir_del(dip, dentry); + if (error) + return error; + + ip->i_entries = 0; + inode->i_ctime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else + drop_nlink(inode); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + mark_inode_dirty(inode); + if (inode->i_nlink == 0) + gfs2_unlink_di(inode); + return 0; +} + + +/** + * gfs2_unlink - Unlink an inode (this does rmdir as well) + * @dir: The inode of the directory containing the inode to unlink + * @dentry: The file itself + * + * This routine uses the type of the inode as a flag to figure out + * whether this is an unlink or an rmdir. 
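+ *
+ * Both directory operations are wired to this function in gfs2_dir_iops
+ * at the end of this file:
+ *
+ *	.unlink = gfs2_unlink,
+ *	.rmdir  = gfs2_unlink,
+ *
+ * and the S_ISDIR() tests below and in gfs2_unlink_inode() supply the
+ * rmdir-specific behaviour (the -ENOTEMPTY check, clear_nlink() rather
+ * than drop_nlink()).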
+ * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; + int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_rgrp; + + if (S_ISDIR(inode->i_mode)) { + error = -ENOTEMPTY; + if (ip->i_entries > 2 || inode->i_nlink > 2) + goto out_rgrp; + } + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_end_trans; + + error = gfs2_unlink_inode(dip, dentry, bh); + brelse(bh); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq(ghs + 2); +out_rgrp: + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_glock_dq_uninit(&ri_gh); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned int size; + + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @dev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t dev) +{ + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. 
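+ *
+ * For example, an attempt to rename "/a/b" into "/a/b/c" must fail: the
+ * loop below starts at "c" and repeatedly looks up gfs2_qdotdot (".."),
+ * reaching "b" == @this before it reaches the root, so -EINVAL is
+ * returned.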
+ * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + int error = 0; + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_rgrpd *nrgd; + unsigned int num_gh; + int dir_rename = 0; + int alloc_required = 0; + unsigned int x; + int error; + + if (ndentry->d_inode) { + nip = GFS2_I(ndentry->d_inode); + if (ip == nip) + return 0; + } + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); + } + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (ip->i_inode.i_nlink == 0) + goto out_gunlock; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (nip->i_inode.i_nlink == 0) { + error = -EAGAIN; + goto out_gunlock; + } + + if (S_ISDIR(nip->i_inode.i_mode)) { + if (nip->i_entries < 2) { + gfs2_consist_inode(nip); + error = -EIO; + goto out_gunlock; + } + if (nip->i_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_inode.i_nlink) { + error = -ENOENT; + goto out_gunlock; + } + if 
(ndip->i_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_inode.i_mode) && + ndip->i_inode.i_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); + if (error) + goto out_gunlock; + } + + if (nip == NULL) + alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + error = alloc_required; + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(ndip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + + error = gfs2_quota_lock_check(ndip); + if (error) + goto out_alloc; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve_ri(ndip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + gfs2_rg_blocks(al) + + 4 * RES_DINODE + 4 * RES_LEAF + + RES_STATFS + RES_QUOTA + 4, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF + 4, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) { + struct buffer_head *bh; + error = gfs2_meta_inode_buffer(nip, &bh); + if (error) + goto out_end_trans; + error = gfs2_unlink_inode(ndip, ndentry, bh); + brelse(bh); + } + + if (dir_rename) { + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, odentry); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (alloc_required) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(ndip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(ndip); +out_gunlock: + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + gfs2_glock_dq_uninit(&ri_gh); + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. 
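+ *
+ * A short sketch of the data path (illustrative, error handling omitted):
+ * the link target lives in the dinode block itself, so after taking a
+ * shared glock the body below essentially does
+ *
+ *	buf = kzalloc(size + 1, GFP_NOFS);
+ *	memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
+ *	nd_set_link(nd, buf);
+ *
+ * and the buffer is freed again later by gfs2_put_link().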
+ * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int size; + char *buf; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + nd_set_link(nd, ERR_PTR(error)); + return NULL; + } + + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { + gfs2_consist_inode(ip); + buf = ERR_PTR(-EIO); + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) { + buf = ERR_PTR(error); + goto out; + } + + buf = kzalloc(size + 1, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + nd_set_link(nd, buf); + return NULL; +} + +static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +/** + * gfs2_permission - + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. + * + * Returns: errno + */ + +int gfs2_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + int error; + int unlock = 0; + + + ip = GFS2_I(inode); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } + + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask, flags, gfs2_check_acl); + if (unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + struct inode *inode = &ip->i_inode; + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + return 0; +} + +/** + * gfs2_setattr_simple - + * @ip: + * @attr: + * + * Returns: errno + */ + +int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(ip, attr); + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + if (error) + return error; + + error = __gfs2_setattr_simple(ip, attr); + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u32 ouid, ogid, nuid, ngid; + int error; + + ouid = inode->i_uid; + ogid = inode->i_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) + ouid = nuid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) + ogid = ngid = NO_QUOTA_CHANGE; + + if (!gfs2_alloc_get(ip)) + return -ENOMEM; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out_alloc; + + if (ouid != 
NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + error = gfs2_quota_check(ip, nuid, ngid); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_setattr_simple(ip, attr); + if (error) + goto out_end_trans; + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); + gfs2_quota_change(ip, -blocks, ouid, ogid); + gfs2_quota_change(ip, blocks, nuid, ngid); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock_q: + gfs2_quota_unlock(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = gfs2_setattr_size(inode, attr->ia_size); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) + error = gfs2_acl_chmod(ip, attr); + else + error = gfs2_setattr_simple(ip, attr); + +out: + gfs2_glock_dq_uninit(&i_gh); + if (!error) + mark_inode_dirty(inode); + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @mnt: The vfsmount the inode is being accessed from + * @dentry: The dentry to stat + * @stat: The inode's stats + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. Note that its the NFS + * readdirplus operation which causes this to be called (from filldir) + * with the glock already held. 
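+ *
+ * In other words, the shared glock is only acquired below when
+ * gfs2_glock_is_locked_by_me() says the caller does not already hold it;
+ * on the readdirplus path generic_fillattr() simply runs under the
+ * caller's existing lock rather than trying to take the glock a second
+ * time.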
+ * + * Returns: errno + */ + +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + int unlock = 0; + + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } + + generic_fillattr(inode, stat); + if (unlock) + gfs2_glock_dq_uninit(&gh); + + return 0; +} + +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +const struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + +const struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_unlink, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + 
.listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + +const struct inode_operations gfs2_symlink_iops = { + .readlink = generic_readlink, + .follow_link = gfs2_follow_link, + .put_link = gfs2_put_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h new file mode 100644 index 00000000..31606076 --- /dev/null +++ b/fs/gfs2/inode.h @@ -0,0 +1,143 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __INODE_DOT_H__ +#define __INODE_DOT_H__ + +#include +#include +#include +#include "util.h" + +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to); +extern void gfs2_set_aops(struct inode *inode); + +static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) +{ + return !ip->i_height; +} + +static inline int gfs2_is_jdata(const struct gfs2_inode *ip) +{ + return ip->i_diskflags & GFS2_DIF_JDATA; +} + +static inline int gfs2_is_writeback(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_ordered(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_dir(const struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_inode.i_mode); +} + +static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) +{ + inode->i_blocks = blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline u64 gfs2_get_inode_blocks(const struct inode *inode) +{ + return inode->i_blocks >> + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) +{ + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); + change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + inode->i_blocks += change; +} + +static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, + u64 no_formal_ino) +{ + return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; +} + +static inline void gfs2_inum_out(const struct gfs2_inode *ip, + struct gfs2_dirent *dent) +{ + dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); +} + +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & ((1 << inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} + +extern struct inode *gfs2_inode_lookup(struct super_block *sb, 
unsigned type, + u64 no_addr, u64 no_formal_ino, + int non_block); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); +extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); + +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +#endif /* __INODE_DOT_H__ */ + diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c new file mode 100644 index 00000000..98c80d8c --- /dev/null +++ b/fs/gfs2/lock_dlm.c @@ -0,0 +1,235 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include + +#include "incore.h" +#include "glock.h" +#include "util.h" + + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + gfs2_glock_free(gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ + goto out; + case -ETIMEDOUT: /* Canceled due to timeout */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_req; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_req == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_req == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + printk(KERN_ERR "unknown bast mode %d", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + printk(KERN_ERR "unknown LM state %d", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (lkid != 0) + lkf |= DLM_LKF_CONVERT; + + lkf |= DLM_LKF_VALBLK; + + return lkf; +} + +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int req; + u32 lkf; + + req = make_mode(req_state); + lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); + + /* + * Submit the actual lock request. 
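+ * The call is asynchronous: the DLM reports completion through
+ * gdlm_ast() and blocking callbacks through gdlm_bast(), both of which
+ * receive @gl as their argument, and the resource name passed in is the
+ * pre-formatted gl->gl_strname.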
+ */ + + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); +} + +static void gdlm_put_lock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (gl->gl_lksb.sb_lkid == 0) { + gfs2_glock_free(gl); + return; + } + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error) { + printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + return; + } +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (fsname == NULL) { + fs_info(sdp, "no fsname found\n"); + return -EINVAL; + } + + error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, + DLM_LSFL_FS | DLM_LSFL_NEWEXCL | + (ls->ls_nodir ? DLM_LSFL_NODIR : 0), + GDLM_LVB_SIZE); + if (error) + printk(KERN_ERR "dlm_new_lockspace error %d", error); + + return error; +} + +static void gdlm_unmount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (ls->ls_dlm) { + dlm_release_lockspace(ls->ls_dlm, 2); + ls->ls_dlm = NULL; + } +} + +static const match_table_t dlm_tokens = { + { Opt_jid, "jid=%d"}, + { Opt_id, "id=%d"}, + { Opt_first, "first=%d"}, + { Opt_nodir, "nodir=%d"}, + { Opt_err, NULL }, +}; + +const struct lm_lockops gfs2_dlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_unmount = gdlm_unmount, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_cancel = gdlm_cancel, + .lm_tokens = &dlm_tokens, +}; + diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c new file mode 100644 index 00000000..85c62923 --- /dev/null +++ b/fs/gfs2/log.c @@ -0,0 +1,972 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "util.h" +#include "dir.h" +#include "trace_gfs2.h" + +#define PULL 1 + +/** + * gfs2_struct2blk - compute stuff + * @sdp: the filesystem + * @nstruct: the number of structures + * @ssize: the size of the structures + * + * Compute the number of log descriptor blocks needed to hold a certain number + * of structures of a certain size. 
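+ *
+ * For example (assuming a 4096-byte block size), revoke entries are
+ * sizeof(u64) == 8 bytes each, so the first descriptor block holds
+ * (4096 - sizeof(struct gfs2_log_descriptor)) / 8 entries and each
+ * continuation block holds (4096 - sizeof(struct gfs2_meta_header)) / 8;
+ * whatever does not fit in the first block is spread over further blocks
+ * with DIV_ROUND_UP() below.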
+ * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize) +{ + unsigned int blks; + unsigned int first, second; + + blks = 1; + first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + + if (nstruct > first) { + second = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / ssize; + blks += DIV_ROUND_UP(nstruct - first, second); + } + + return blks; +} + +/** + * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters + * @mapping: The associated mapping (maybe NULL) + * @bd: The gfs2_bufdata to remove + * + * The ail lock _must_ be held when calling this function + * + */ + +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +{ + bd->bd_ail = NULL; + list_del_init(&bd->bd_ail_st_list); + list_del_init(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bd->bd_bh); +} + +/** + * gfs2_ail1_start_one - Start I/O on a part of the AIL + * @sdp: the filesystem + * @wbc: The writeback control structure + * @ai: The ail structure + * + */ + +static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, + struct writeback_control *wbc, + struct gfs2_ail *ai) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) +{ + struct gfs2_glock *gl = NULL; + struct address_space *mapping; + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_ail == ai); + + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + continue; + } + + if (!buffer_dirty(bh)) + continue; + if (gl == bd->bd_gl) + continue; + gl = bd->bd_gl; + list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + mapping = bh->b_page->mapping; + if (!mapping) + continue; + spin_unlock(&sdp->sd_ail_lock); + generic_writepages(mapping, wbc); + spin_lock(&sdp->sd_ail_lock); + if (wbc->nr_to_write <= 0) + break; + return 1; + } + + return 0; +} + + +/** + * gfs2_ail1_flush - start writeback of some ail1 entries + * @sdp: The super block + * @wbc: The writeback control structure + * + * Writes back some ail1 entries, according to the limits in the + * writeback control structure + */ + +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) +{ + struct list_head *head = &sdp->sd_ail1_list; + struct gfs2_ail *ai; + + trace_gfs2_ail_flush(sdp, wbc, 1); + spin_lock(&sdp->sd_ail_lock); +restart: + list_for_each_entry_reverse(ai, head, ai_list) { + if (wbc->nr_to_write <= 0) + break; + if (gfs2_ail1_start_one(sdp, wbc, ai)) + goto restart; + } + spin_unlock(&sdp->sd_ail_lock); + trace_gfs2_ail_flush(sdp, wbc, 0); +} + +/** + * gfs2_ail1_start - start writeback of all ail1 entries + * @sdp: The superblock + */ + +static void gfs2_ail1_start(struct gfs2_sbd *sdp) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return gfs2_ail1_flush(sdp, &wbc); +} + +/** + * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + gfs2_assert(sdp, bd->bd_ail == ai); + 
if (buffer_busy(bh)) + continue; + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + } + +} + +/** + * gfs2_ail1_empty - Try to empty the ail1 lists + * @sdp: The superblock + * + * Tries to empty the ail1 lists, starting with the oldest first + */ + +static int gfs2_ail1_empty(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai, *s; + int ret; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { + gfs2_ail1_empty_one(sdp, ai); + if (list_empty(&ai->ai_ail1_list)) + list_move(&ai->ai_list, &sdp->sd_ail2_list); + else + break; + } + ret = list_empty(&sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + return ret; +} + +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { + list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &ai->ai_ail2_list; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->prev, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_remove_from_ail(bd); + } +} + +static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + struct gfs2_ail *ai, *safe; + unsigned int old_tail = sdp->sd_log_tail; + int wrap = (new_tail < old_tail); + int a, b, rm; + + spin_lock(&sdp->sd_ail_lock); + + list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { + a = (old_tail <= ai->ai_first); + b = (ai->ai_first < new_tail); + rm = (wrap) ? (a || b) : (a && b); + if (!rm) + continue; + + gfs2_ail2_empty_one(sdp, ai); + list_del(&ai->ai_list); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); + kfree(ai); + } + + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * + * Note that we never give out the last few blocks of the journal. Thats + * due to the fact that there is a small number of header blocks + * associated with each log flush. The exact number can't be known until + * flush time, so we ensure that we have just enough free blocks at all + * times to avoid running out during a log flush. + * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. 
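+ *
+ * As a concrete illustration (assuming 4096-byte blocks), reserved_blks
+ * below works out to 6, so a request for N blocks only proceeds once
+ * strictly more than N + 6 journal blocks are free; until then the
+ * caller waits exclusively on sd_log_waitq and wakes the logd thread
+ * through sd_logd_waitq to flush the log.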
+ * + * Returns: errno + */ + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +{ + unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; + + if (gfs2_assert_warn(sdp, blks) || + gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); + } + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) + goto retry; + trace_gfs2_log_blocks(sdp, -blks); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); + + down_read(&sdp->sd_log_flush_lock); + + return 0; +} + +static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) +{ + struct gfs2_journal_extent *je; + + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { + if (lbn >= je->lblock && lbn < je->lblock + je->blocks) + return je->dblock + lbn - je->lblock; + } + + return -1; +} + +/** + * log_distance - Compute distance between two journal blocks + * @sdp: The GFS2 superblock + * @newer: The most recent journal block of the pair + * @older: The older journal block of the pair + * + * Compute the distance (in the journal direction) between two + * blocks in the journal + * + * Returns: the distance in blocks + */ + +static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer, + unsigned int older) +{ + int dist; + + dist = newer - older; + if (dist < 0) + dist += sdp->sd_jdesc->jd_blocks; + + return dist; +} + +/** + * calc_reserved - Calculate the number of blocks to reserve when + * refunding a transaction's unused buffers. + * @sdp: The GFS2 superblock + * + * This is complex. We need to reserve room for all our currently used + * metadata buffers (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data buffers for journaled files (e.g. files in the + * meta_fs like rindex, or files for which chattr +j was done.) + * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush + * will count it as free space (sd_log_blks_free) and corruption will follow. + * + * We can have metadata bufs and jdata bufs in the same journal. So each + * type gets its own log header, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one header + * in cases where we have more buffers than will fit on a journal page. + * Metadata journal entries take up half the space of journaled buffer entries. + * Thus, metadata entries have buf_limit (502) and journaled buffers have + * databuf_limit (251) before they cause a wrap around. + * + * Also, we need to reserve blocks for revoke journal entries and one for an + * overall header for the lot. 
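+ * For example, using the limits quoted above, committing 600 metadata
+ * buffers needs DIV_ROUND_UP(600, 502) = 2 descriptor headers and 100
+ * journaled data buffers need DIV_ROUND_UP(100, 251) = 1, giving
+ * 600 + 2 + 100 + 1 plus any revoke blocks plus one overall header.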
+ * + * Returns: the number of blocks reserved + */ +static unsigned int calc_reserved(struct gfs2_sbd *sdp) +{ + unsigned int reserved = 0; + unsigned int mbuf_limit, metabufhdrs_needed; + unsigned int dbuf_limit, databufhdrs_needed; + unsigned int revokes = 0; + + mbuf_limit = buf_limit(sdp); + metabufhdrs_needed = (sdp->sd_log_commited_buf + + (mbuf_limit - 1)) / mbuf_limit; + dbuf_limit = databuf_limit(sdp); + databufhdrs_needed = (sdp->sd_log_commited_databuf + + (dbuf_limit - 1)) / dbuf_limit; + + if (sdp->sd_log_commited_revoke > 0) + revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(u64)); + + reserved = sdp->sd_log_commited_buf + metabufhdrs_needed + + sdp->sd_log_commited_databuf + databufhdrs_needed + + revokes; + /* One for the overall header */ + if (reserved) + reserved++; + return reserved; +} + +static unsigned int current_tail(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + unsigned int tail; + + spin_lock(&sdp->sd_ail_lock); + + if (list_empty(&sdp->sd_ail1_list)) { + tail = sdp->sd_log_head; + } else { + ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); + tail = ai->ai_first; + } + + spin_unlock(&sdp->sd_ail_lock); + + return tail; +} + +void gfs2_log_incr_head(struct gfs2_sbd *sdp) +{ + if (sdp->sd_log_flush_head == sdp->sd_log_tail) + BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } +} + +/** + * gfs2_log_write_endio - End of I/O for a log buffer + * @bh: The buffer head + * @uptodate: I/O Status + * + */ + +static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) +{ + struct gfs2_sbd *sdp = bh->b_private; + bh->b_private = NULL; + + end_buffer_write_sync(bh, uptodate); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_get_buf - Get and initialize a buffer to use for log control data + * @sdp: The GFS2 superblock + * + * Returns: the buffer_head + */ + +struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + gfs2_log_incr_head(sdp); + atomic_inc(&sdp->sd_log_in_flight); + bh->b_private = sdp; + bh->b_end_io = gfs2_log_write_endio; + + return bh; +} + +/** + * gfs2_fake_write_endio - + * @bh: The buffer head + * @uptodate: The I/O Status + * + */ + +static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) +{ + struct buffer_head *real_bh = bh->b_private; + struct gfs2_bufdata *bd = real_bh->b_private; + struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; + + end_buffer_write_sync(bh, uptodate); + free_buffer_head(bh); + unlock_buffer(real_bh); + brelse(real_bh); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * @sdp: the filesystem + * @data: the data the buffer_head should point to + * + * Returns: the log buffer descriptor + */ + +struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + + bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); + atomic_set(&bh->b_count, 1); + bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); + 
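+	/* The fake buffer head shares page data with the real buffer but is
+	 * mapped to the journal block being written, so the metadata can be
+	 * copied into the log without redirtying the real buffer; the real
+	 * buffer stays locked until gfs2_fake_write_endio() completes. */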
set_bh_page(bh, real->b_page, bh_offset(real)); + bh->b_blocknr = blkno; + bh->b_size = sdp->sd_sb.sb_bsize; + bh->b_bdev = sdp->sd_vfs->s_bdev; + bh->b_private = real; + bh->b_end_io = gfs2_fake_write_endio; + + gfs2_log_incr_head(sdp); + atomic_inc(&sdp->sd_log_in_flight); + + return bh; +} + +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + + ail2_empty(sdp, new_tail); + + atomic_add(dist, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, dist); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + + sdp->sd_log_tail = new_tail; +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + struct gfs2_log_header *lh; + unsigned int tail; + u32 hash; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + + gfs2_ail1_empty(sdp); + tail = current_tail(sdp); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); + hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + submit_bh(WRITE_SYNC | REQ_META, bh); + else + submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail); + else + gfs2_assert_withdraw(sdp, !pull); + + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); +} + +static void log_flush_commit(struct gfs2_sbd *sdp) +{ + DEFINE_WAIT(wait); + + if (atomic_read(&sdp->sd_log_in_flight)) { + do { + prepare_to_wait(&sdp->sd_log_flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_in_flight)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_in_flight)); + finish_wait(&sdp->sd_log_flush_wait, &wait); + } + + log_write_header(sdp, 0, 0); +} + +static void gfs2_ordered_write(struct gfs2_sbd *sdp) +{ + struct gfs2_bufdata *bd; + struct buffer_head *bh; + LIST_HEAD(written); + + gfs2_log_lock(sdp); + while (!list_empty(&sdp->sd_log_le_ordered)) { + bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list); + list_move(&bd->bd_le.le_list, &written); + bh = bd->bd_bh; + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + gfs2_log_unlock(sdp); + lock_buffer(bh); + if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE_SYNC, bh); + } else { + unlock_buffer(bh); + brelse(bh); + } + gfs2_log_lock(sdp); + } + list_splice(&written, &sdp->sd_log_le_ordered); + gfs2_log_unlock(sdp); +} + +static void 
gfs2_ordered_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + gfs2_log_lock(sdp); + while (!list_empty(&sdp->sd_log_le_ordered)) { + bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list); + bh = bd->bd_bh; + if (buffer_locked(bh)) { + get_bh(bh); + gfs2_log_unlock(sdp); + wait_on_buffer(bh); + brelse(bh); + gfs2_log_lock(sdp); + continue; + } + list_del_init(&bd->bd_le.le_list); + } + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_flush - flush incore transaction(s) + * @sdp: the filesystem + * @gl: The glock structure to flush. If NULL, flush the whole incore log + * + */ + +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +{ + struct gfs2_ail *ai; + + down_write(&sdp->sd_log_flush_lock); + + /* Log might have been flushed while we waited for the flush lock */ + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { + up_write(&sdp->sd_log_flush_lock); + return; + } + trace_gfs2_log_flush(sdp, 1); + + ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&ai->ai_ail1_list); + INIT_LIST_HEAD(&ai->ai_ail2_list); + + if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { + printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, + sdp->sd_log_commited_buf); + gfs2_assert_withdraw(sdp, 0); + } + if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) { + printk(KERN_INFO "GFS2: log databuf %u %u\n", + sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf); + gfs2_assert_withdraw(sdp, 0); + } + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + ai->ai_first = sdp->sd_log_flush_head; + + gfs2_ordered_write(sdp); + lops_before_commit(sdp); + gfs2_ordered_wait(sdp); + + if (sdp->sd_log_head != sdp->sd_log_flush_head) + log_flush_commit(sdp); + else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ + gfs2_log_lock(sdp); + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + gfs2_log_unlock(sdp); + log_write_header(sdp, 0, PULL); + } + lops_after_commit(sdp, ai); + + gfs2_log_lock(sdp); + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_blks_reserved = 0; + sdp->sd_log_commited_buf = 0; + sdp->sd_log_commited_databuf = 0; + sdp->sd_log_commited_revoke = 0; + + spin_lock(&sdp->sd_ail_lock); + if (!list_empty(&ai->ai_ail1_list)) { + list_add(&ai->ai_list, &sdp->sd_ail1_list); + ai = NULL; + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + trace_gfs2_log_flush(sdp, 0); + up_write(&sdp->sd_log_flush_lock); + + kfree(ai); +} + +static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int reserved; + unsigned int unused; + + gfs2_log_lock(sdp); + + sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; + sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - + tr->tr_num_databuf_rm; + gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || + (((int)sdp->sd_log_commited_databuf) >= 0)); + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; + reserved = calc_reserved(sdp); + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); + unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; + atomic_add(unused, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, unused); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + 
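+	/* Only the blocks calc_reserved() says are still needed remain
+	 * reserved; the surplus was returned to sd_log_blks_free above. */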
sdp->sd_log_blks_reserved = reserved; + + gfs2_log_unlock(sdp); +} + +static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &tr->tr_list_buf; + struct gfs2_bufdata *bd; + + gfs2_log_lock(sdp); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); + list_del_init(&bd->bd_list_tr); + tr->tr_num_buf--; + } + gfs2_log_unlock(sdp); + gfs2_assert_warn(sdp, !tr->tr_num_buf); +} + +/** + * gfs2_log_commit - Commit a transaction to the log + * @sdp: the filesystem + * @tr: the transaction + * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * + * Returns: errno + */ + +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + log_refund(sdp, tr); + buf_lo_incore_commit(sdp, tr); + + up_read(&sdp->sd_log_flush_lock); + + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); +} + +/** + * gfs2_log_shutdown - write a shutdown header into a journal + * @sdp: the filesystem + * + */ + +void gfs2_log_shutdown(struct gfs2_sbd *sdp) +{ + down_write(&sdp->sd_log_flush_lock); + + gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, + (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); + + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); + gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); + gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); + + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_tail = sdp->sd_log_head; + + up_write(&sdp->sd_log_flush_lock); +} + + +/** + * gfs2_meta_syncfs - sync all the buffers in a filesystem + * @sdp: the filesystem + * + */ + +void gfs2_meta_syncfs(struct gfs2_sbd *sdp) +{ + gfs2_log_flush(sdp, NULL); + for (;;) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) + break; + } + gfs2_log_flush(sdp, NULL); +} + +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @sdp: Pointer to GFS2 superblock + * + * Also, periodically check to make sure that we're using the most recent + * journal index. 
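+ * The two triggers used below are gfs2_jrnl_flush_reqd(), which fires once
+ * the number of pinned blocks reaches sd_log_thresh1, and
+ * gfs2_ail_flush_reqd(), which fires once the journal blocks in use reach
+ * sd_log_thresh2 (1/3 and 2/3 of the journal size at mount time).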
+ */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t = 1; + DEFINE_WAIT(wait); + unsigned preflush; + + while (!kthread_should_stop()) { + + preflush = atomic_read(&sdp->sd_log_pinned); + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL); + } + + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL); + } + + if (!gfs2_ail_flush_reqd(sdp)) + wake_up(&sdp->sd_log_waitq); + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + if (freezing(current)) + refrigerator(); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); + } + + return 0; +} + diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h new file mode 100644 index 00000000..ab062169 --- /dev/null +++ b/fs/gfs2/log.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOG_DOT_H__ +#define __LOG_DOT_H__ + +#include +#include +#include +#include "incore.h" + +/** + * gfs2_log_lock - acquire the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) +{ + spin_lock(&sdp->sd_log_lock); +} + +/** + * gfs2_log_unlock - release the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) +{ + spin_unlock(&sdp->sd_log_lock); +} + +static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + unsigned int value) +{ + if (++value == sdp->sd_jdesc->jd_blocks) { + value = 0; + } + sdp->sd_log_head = sdp->sd_log_tail = value; +} + +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize); + +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); + +extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real); +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); + +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); + +#endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c new file mode 100644 index 00000000..05bbb124 --- /dev/null +++ b/fs/gfs2/lops.c @@ -0,0 +1,795 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "inode.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +/** + * gfs2_pin - Pin a buffer in memory + * @sdp: The superblock + * @bh: The buffer to be pinned + * + * The log lock must be held when calling this function + */ +static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + BUG_ON(!current->journal_info); + + clear_buffer_dirty(bh); + if (test_set_buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + bd = bh->b_private; + /* If this buffer is in the AIL and it has already been written + * to in-place disk block, remove it from the AIL. + */ + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_ail) + list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + spin_unlock(&sdp->sd_ail_lock); + get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); + trace_gfs2_pin(bd, 1); +} + +/** + * gfs2_unpin - Unpin a buffer + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to unpin + * @ai: + * @flags: The inode dirty flags + * + */ + +static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd = bh->b_private; + + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!buffer_pinned(bh)); + + lock_buffer(bh); + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); + + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_ail) { + list_del(&bd->bd_ail_st_list); + brelse(bh); + } else { + struct gfs2_glock *gl = bd->bd_gl; + list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); + atomic_inc(&gl->gl_ail_count); + } + bd->bd_ail = ai; + list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + trace_gfs2_pin(bd, 0); + unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); +} + + +static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh) +{ + return (struct gfs2_log_descriptor *)bh->b_data; +} + +static inline __be64 *bh_log_ptr(struct buffer_head *bh) +{ + struct gfs2_log_descriptor *ld = bh_log_desc(bh); + return (__force __be64 *)(ld + 1); +} + +static inline __be64 *bh_ptr_end(struct buffer_head *bh) +{ + return (__force __be64 *)(bh->b_data + bh->b_size); +} + + +static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) +{ + struct buffer_head *bh = gfs2_log_get_buf(sdp); + struct gfs2_log_descriptor *ld = bh_log_desc(bh); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(ld_type); + ld->ld_length = 0; + ld->ld_data1 = 0; + ld->ld_data2 = 0; + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + return bh; +} + +static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + + lock_buffer(bd->bd_bh); + gfs2_log_lock(sdp); + if (!list_empty(&bd->bd_list_tr)) + goto out; + tr = current->journal_info; 
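+	/* Account for the buffer in the running transaction first; if it is
+	 * not yet on the log's metadata list, pin it, stamp the journal id
+	 * into its header and queue it on sd_log_le_buf for the next commit. */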
+ tr->tr_touched = 1; + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + if (!list_empty(&le->le_list)) + goto out; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_meta_check(sdp, bd->bd_bh); + gfs2_pin(sdp, bd->bd_bh); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + sdp->sd_log_num_buf++; + list_add(&le->le_list, &sdp->sd_log_le_buf); + tr->tr_num_buf_new++; +out: + gfs2_log_unlock(sdp); + unlock_buffer(bd->bd_bh); +} + +static void buf_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd1 = NULL, *bd2; + unsigned int total; + unsigned int limit; + unsigned int num; + unsigned n; + __be64 *ptr; + + limit = buf_limit(sdp); + /* for 4k blocks, limit = 503 */ + + gfs2_log_lock(sdp); + total = sdp->sd_log_num_buf; + bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); + while(total) { + num = total; + if (total > limit) + num = limit; + gfs2_log_unlock(sdp); + bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA); + gfs2_log_lock(sdp); + ld = bh_log_desc(bh); + ptr = bh_log_ptr(bh); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + + n = 0; + list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, + bd_le.le_list) { + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (++n >= num) + break; + } + + gfs2_log_unlock(sdp); + submit_bh(WRITE_SYNC, bh); + gfs2_log_lock(sdp); + + n = 0; + list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, + bd_le.le_list) { + get_bh(bd2->bd_bh); + gfs2_log_unlock(sdp); + lock_buffer(bd2->bd_bh); + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); + submit_bh(WRITE_SYNC, bh); + gfs2_log_lock(sdp); + if (++n >= num) + break; + } + + BUG_ON(total < num); + total -= num; + } + gfs2_log_unlock(sdp); +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_buf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_buf--; + + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); +} + +static void buf_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_blocks = 0; + sdp->sd_replayed_blocks = 0; +} + +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + if (gfs2_meta_check(sdp, bh_ip)) + error = -EIO; + else + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + 
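+		/* The journal copy has been transferred into the in-place
+		 * buffer; release that buffer too and, unless the metadata
+		 * check failed, count the block as replayed. */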
brelse(bh_ip); + + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + tr->tr_num_revoke++; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&le->le_list, &sdp->sd_log_le_revoke); +} + +static void revoke_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct gfs2_log_descriptor *ld; + struct gfs2_meta_header *mh; + struct buffer_head *bh; + unsigned int offset; + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + + if (!sdp->sd_log_num_revoke) + return; + + bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE); + ld = bh_log_desc(bh); + ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, + sizeof(u64))); + ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); + offset = sizeof(struct gfs2_log_descriptor); + + list_for_each_entry(bd, head, bd_le.le_list) { + sdp->sd_log_num_revoke--; + + if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { + submit_bh(WRITE_SYNC, bh); + + bh = gfs2_log_get_buf(sdp); + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); + mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); + offset = sizeof(struct gfs2_meta_header); + } + + *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno); + offset += sizeof(u64); + } + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + + submit_bh(WRITE_SYNC, bh); +} + +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + gl = bd->bd_gl; + atomic_dec(&gl->gl_revokes); + clear_bit(GLF_LFLUSH, &gl->gl_flags); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +static void revoke_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_revokes = 0; + sdp->sd_replay_tail = head->lh_tail; +} + +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh; + unsigned int offset; + u64 blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + + if (!first) + gfs2_metatype_check(sdp, 
bh, GFS2_METATYPE_LB); + + while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + + error = gfs2_revoke_add(sdp, blkno, start); + if (error < 0) { + brelse(bh); + return error; + } + else if (error) + sdp->sd_found_revokes++; + + if (!--revokes) + break; + offset += sizeof(u64); + } + + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + + return 0; +} + +static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_revoke_clean(sdp); + return; + } + if (pass != 1) + return; + + fs_info(sdp, "jid=%u: Found %u revoke tags\n", + jd->jd_jid, sdp->sd_found_revokes); + + gfs2_revoke_clean(sdp); +} + +static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr = current->journal_info; + + tr->tr_touched = 1; + + rgd = container_of(le, struct gfs2_rgrpd, rd_le); + + gfs2_log_lock(sdp); + if (!list_empty(&le->le_list)){ + gfs2_log_unlock(sdp); + return; + } + gfs2_rgrp_bh_hold(rgd); + sdp->sd_log_num_rg++; + list_add(&le->le_list, &sdp->sd_log_le_rg); + gfs2_log_unlock(sdp); +} + +static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_rg; + struct gfs2_rgrpd *rgd; + + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); + list_del_init(&rgd->rd_le.le_list); + sdp->sd_log_num_rg--; + + gfs2_rgrp_repolish_clones(rgd); + gfs2_rgrp_bh_put(rgd); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); +} + +/** + * databuf_lo_add - Add a databuf to the transaction. + * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. 
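+ * This is reflected in databuf_limit() in lops.h, where each journaled
+ * data block consumes two __be64 tag slots (block number plus escape
+ * flag), so roughly half as many entries fit per log descriptor as for
+ * ordinary metadata blocks.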
+ */ +static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr = current->journal_info; + struct address_space *mapping = bd->bd_bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + + lock_buffer(bd->bd_bh); + gfs2_log_lock(sdp); + if (tr) { + if (!list_empty(&bd->bd_list_tr)) + goto out; + tr->tr_touched = 1; + if (gfs2_is_jdata(ip)) { + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + } + } + if (!list_empty(&le->le_list)) + goto out; + + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + if (gfs2_is_jdata(ip)) { + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + sdp->sd_log_num_databuf++; + list_add_tail(&le->le_list, &sdp->sd_log_le_databuf); + } else { + list_add_tail(&le->le_list, &sdp->sd_log_le_ordered); + } +out: + gfs2_log_unlock(sdp); + unlock_buffer(bd->bd_bh); +} + +static void gfs2_check_magic(struct buffer_head *bh) +{ + void *kaddr; + __be32 *ptr; + + clear_buffer_escaped(bh); + kaddr = kmap_atomic(bh->b_page, KM_USER0); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + set_buffer_escaped(bh); + kunmap_atomic(kaddr, KM_USER0); +} + +static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct list_head *list, struct list_head *done, + unsigned int n) +{ + struct buffer_head *bh1; + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd; + __be64 *ptr; + + if (!bh) + return; + + ld = bh_log_desc(bh); + ld->ld_length = cpu_to_be32(n + 1); + ld->ld_data1 = cpu_to_be32(n); + + ptr = bh_log_ptr(bh); + + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + gfs2_log_lock(sdp); + while(!list_empty(list)) { + bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); + list_move_tail(&bd->bd_le.le_list, done); + get_bh(bd->bd_bh); + while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) { + gfs2_log_incr_head(sdp); + ptr += 2; + } + gfs2_log_unlock(sdp); + lock_buffer(bd->bd_bh); + if (buffer_escaped(bd->bd_bh)) { + void *kaddr; + bh1 = gfs2_log_get_buf(sdp); + kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0); + memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh), + bh1->b_size); + kunmap_atomic(kaddr, KM_USER0); + *(__be32 *)bh1->b_data = 0; + clear_buffer_escaped(bd->bd_bh); + unlock_buffer(bd->bd_bh); + brelse(bd->bd_bh); + } else { + bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); + } + submit_bh(WRITE_SYNC, bh1); + gfs2_log_lock(sdp); + ptr += 2; + } + gfs2_log_unlock(sdp); + brelse(bh); +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * + */ + +static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct gfs2_bufdata *bd = NULL; + struct buffer_head *bh = NULL; + unsigned int n = 0; + __be64 *ptr = NULL, *end = NULL; + LIST_HEAD(processed); + LIST_HEAD(in_progress); + + gfs2_log_lock(sdp); + while (!list_empty(&sdp->sd_log_le_databuf)) { + if (ptr == end) { + gfs2_log_unlock(sdp); + gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); + n = 0; + bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA); + ptr = bh_log_ptr(bh); + end = bh_ptr_end(bh) - 1; + gfs2_log_lock(sdp); + continue; + } + bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list); + list_move_tail(&bd->bd_le.le_list, &in_progress); + gfs2_check_magic(bd->bd_bh); + *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr); + *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 
1 : 0); + n++; + } + gfs2_log_unlock(sdp); + gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); + gfs2_log_lock(sdp); + list_splice(&processed, &sdp->sd_log_le_databuf); + gfs2_log_unlock(sdp); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + u64 esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. */ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + /* data sync? */ + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_databuf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_databuf--; + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); +} + + +const struct gfs2_log_operations gfs2_buf_lops = { + .lo_add = buf_lo_add, + .lo_before_commit = buf_lo_before_commit, + .lo_after_commit = buf_lo_after_commit, + .lo_before_scan = buf_lo_before_scan, + .lo_scan_elements = buf_lo_scan_elements, + .lo_after_scan = buf_lo_after_scan, + .lo_name = "buf", +}; + +const struct gfs2_log_operations gfs2_revoke_lops = { + .lo_add = revoke_lo_add, + .lo_before_commit = revoke_lo_before_commit, + .lo_after_commit = revoke_lo_after_commit, + .lo_before_scan = revoke_lo_before_scan, + .lo_scan_elements = revoke_lo_scan_elements, + .lo_after_scan = revoke_lo_after_scan, + .lo_name = "revoke", +}; + +const struct gfs2_log_operations gfs2_rg_lops = { + .lo_add = rg_lo_add, + .lo_after_commit = rg_lo_after_commit, + .lo_name = "rg", +}; + +const struct gfs2_log_operations gfs2_databuf_lops = { + .lo_add = databuf_lo_add, + .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, + .lo_name = "databuf", +}; + +const struct gfs2_log_operations *gfs2_log_ops[] = { + &gfs2_databuf_lops, + &gfs2_buf_lops, + &gfs2_rg_lops, + &gfs2_revoke_lops, + NULL, +}; + diff --git a/fs/gfs2/lops.h 
b/fs/gfs2/lops.h new file mode 100644 index 00000000..3c0b2737 --- /dev/null +++ b/fs/gfs2/lops.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOPS_DOT_H__ +#define __LOPS_DOT_H__ + +#include +#include "incore.h" + +#define BUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \ + ~(sizeof(__be64) - 1)) +#define DATABUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ + ~(2 * sizeof(__be64) - 1)) + +extern const struct gfs2_log_operations gfs2_glock_lops; +extern const struct gfs2_log_operations gfs2_buf_lops; +extern const struct gfs2_log_operations gfs2_revoke_lops; +extern const struct gfs2_log_operations gfs2_rg_lops; +extern const struct gfs2_log_operations gfs2_databuf_lops; + +extern const struct gfs2_log_operations *gfs2_log_ops[]; + +static inline unsigned int buf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64); + return limit; +} + +static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64)); + return limit; +} + +static inline void lops_init_le(struct gfs2_log_element *le, + const struct gfs2_log_operations *lops) +{ + INIT_LIST_HEAD(&le->le_list); + le->le_ops = lops; +} + +static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + if (le->le_ops->lo_add) + le->le_ops->lo_add(sdp, le); +} + +static inline void lops_before_commit(struct gfs2_sbd *sdp) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_commit) + gfs2_log_ops[x]->lo_before_commit(sdp); +} + +static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_commit) + gfs2_log_ops[x]->lo_after_commit(sdp, ai); +} + +static inline void lops_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_before_scan(jd, head, pass); +} + +static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, + unsigned int pass) +{ + int x, error; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_scan_elements) { + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, + ld, ptr, pass); + if (error) + return error; + } + + return 0; +} + +static inline void lops_after_scan(struct gfs2_jdesc *jd, int error, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_after_scan(jd, error, pass); +} + +#endif /* __LOPS_DOT_H__ */ + diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c new file mode 100644 index 00000000..c2b34cd2 --- /dev/null +++ b/fs/gfs2/main.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "glock.h" +#include "quota.h" +#include "recovery.h" +#include "dir.h" + +static struct shrinker qd_shrinker = { + .shrink = gfs2_shrink_qd_memory, + .seeks = DEFAULT_SEEKS, +}; + +static void gfs2_init_inode_once(void *foo) +{ + struct gfs2_inode *ip = foo; + + inode_init_once(&ip->i_inode); + init_rwsem(&ip->i_rw_mutex); + INIT_LIST_HEAD(&ip->i_trunc_list); + ip->i_alloc = NULL; +} + +static void gfs2_init_glock_once(void *foo) +{ + struct gfs2_glock *gl = foo; + + INIT_HLIST_BL_NODE(&gl->gl_list); + spin_lock_init(&gl->gl_spin); + INIT_LIST_HEAD(&gl->gl_holders); + INIT_LIST_HEAD(&gl->gl_lru); + INIT_LIST_HEAD(&gl->gl_ail_list); + atomic_set(&gl->gl_ail_count, 0); + atomic_set(&gl->gl_revokes, 0); +} + +static void gfs2_init_gl_aspace_once(void *foo) +{ + struct gfs2_glock *gl = foo; + struct address_space *mapping = (struct address_space *)(gl + 1); + + gfs2_init_glock_once(gl); + address_space_init_once(mapping); +} + +/** + * init_gfs2_fs - Register GFS2 as a filesystem + * + * Returns: 0 on success, error code on failure + */ + +static int __init init_gfs2_fs(void) +{ + int error; + + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + + error = gfs2_sys_init(); + if (error) + return error; + + error = gfs2_glock_init(); + if (error) + goto fail; + + error = -ENOMEM; + gfs2_glock_cachep = kmem_cache_create("gfs2_glock", + sizeof(struct gfs2_glock), + 0, 0, + gfs2_init_glock_once); + if (!gfs2_glock_cachep) + goto fail; + + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", + sizeof(struct gfs2_glock) + + sizeof(struct address_space), + 0, 0, gfs2_init_gl_aspace_once); + + if (!gfs2_glock_aspace_cachep) + goto fail; + + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", + sizeof(struct gfs2_inode), + 0, SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD, + gfs2_init_inode_once); + if (!gfs2_inode_cachep) + goto fail; + + gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", + sizeof(struct gfs2_bufdata), + 0, 0, NULL); + if (!gfs2_bufdata_cachep) + goto fail; + + gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", + sizeof(struct gfs2_rgrpd), + 0, 0, NULL); + if (!gfs2_rgrpd_cachep) + goto fail; + + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, 0, NULL); + if (!gfs2_quotad_cachep) + goto fail; + + register_shrinker(&qd_shrinker); + + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_unregister; + + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + if (!gfs_recovery_wq) + goto fail_wq; + + gfs2_register_debugfs(); + + printk("GFS2 installed\n"); + + return 0; + +fail_wq: + unregister_filesystem(&gfs2meta_fs_type); +fail_unregister: + unregister_filesystem(&gfs2_fs_type); +fail: + unregister_shrinker(&qd_shrinker); + gfs2_glock_exit(); + + if (gfs2_quotad_cachep) + kmem_cache_destroy(gfs2_quotad_cachep); + + if (gfs2_rgrpd_cachep) + kmem_cache_destroy(gfs2_rgrpd_cachep); + + if (gfs2_bufdata_cachep) + kmem_cache_destroy(gfs2_bufdata_cachep); + + if 
(gfs2_inode_cachep) + kmem_cache_destroy(gfs2_inode_cachep); + + if (gfs2_glock_aspace_cachep) + kmem_cache_destroy(gfs2_glock_aspace_cachep); + + if (gfs2_glock_cachep) + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); + return error; +} + +/** + * exit_gfs2_fs - Unregister the file system + * + */ + +static void __exit exit_gfs2_fs(void) +{ + unregister_shrinker(&qd_shrinker); + gfs2_glock_exit(); + gfs2_unregister_debugfs(); + unregister_filesystem(&gfs2_fs_type); + unregister_filesystem(&gfs2meta_fs_type); + destroy_workqueue(gfs_recovery_wq); + + rcu_barrier(); + + kmem_cache_destroy(gfs2_quotad_cachep); + kmem_cache_destroy(gfs2_rgrpd_cachep); + kmem_cache_destroy(gfs2_bufdata_cachep); + kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_aspace_cachep); + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); +} + +MODULE_DESCRIPTION("Global File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_gfs2_fs); +module_exit(exit_gfs2_fs); + diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c new file mode 100644 index 00000000..747238cd --- /dev/null +++ b/fs/gfs2/meta_io.c @@ -0,0 +1,459 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = REQ_META | + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. 
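+	 * Only buffers marked for async write above are submitted in the loop
+	 * below; if none of them were dirty, nr_underway stays zero and
+	 * writeback on the page is ended immediately.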
+ */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +} + +const struct address_space_operations gfs2_meta_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * + */ + +void gfs2_meta_sync(struct gfs2_glock *gl) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + int error; + + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + +/** + * gfs2_getbuf - Get a buffer with a given address space + * @gl: the glock + * @blkno: the block number (filesystem scope) + * @create: 1 if the buffer should be created + * + * Returns: the buffer + */ + +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct page *page; + struct buffer_head *bh; + unsigned int shift; + unsigned long index; + unsigned int bufnum; + + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; + index = blkno >> shift; /* convert block to page */ + bufnum = blkno - (index << shift); /* block buf index within page */ + + if (create) { + for (;;) { + page = grab_cache_page(mapping, index); + if (page) + break; + yield(); + } + } else { + page = find_lock_page(mapping, index); + if (!page) + return NULL; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + + if (!buffer_mapped(bh)) + map_bh(bh, sdp->sd_vfs, blkno); + + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + return bh; +} + +static void meta_prep_new(struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); +} + +/** + * gfs2_meta_new - Get a block + * @gl: The glock associated with this block + * @blkno: The block number + * + * Returns: The buffer + */ + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + bh = gfs2_getbuf(gl, blkno, CREATE); + meta_prep_new(bh); + return bh; +} + +/** + * gfs2_meta_read - Read a block from disk + * @gl: The glock covering the block + * @blkno: The block number + * @flags: flags + * @bhp: the place where the buffer is returned (NULL on failure) + * + * Returns: errno + */ + +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | REQ_META, bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + 
gfs2_io_error_bh(sdp, bh); + brelse(bh); + return -EIO; + } + + return 0; +} + +/** + * gfs2_meta_wait - Reread a block from disk + * @sdp: the filesystem + * @bh: The block to wait for + * + * Returns: errno + */ + +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + return -EIO; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return 0; +} + +/** + * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer + * @gl: the glock the buffer belongs to + * @bh: The buffer to be attached to + * @meta: Flag to indicate whether its metadata or not + */ + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta) +{ + struct gfs2_bufdata *bd; + + if (meta) + lock_page(bh->b_page); + + if (bh->b_private) { + if (meta) + unlock_page(bh->b_page); + return; + } + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + + INIT_LIST_HEAD(&bd->bd_list_tr); + if (meta) + lops_init_le(&bd->bd_le, &gfs2_buf_lops); + else + lops_init_le(&bd->bd_le, &gfs2_databuf_lops); + bh->b_private = bd; + + if (meta) + unlock_page(bh->b_page); +} + +void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) +{ + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct gfs2_bufdata *bd = bh->b_private; + + if (test_clear_buffer_pinned(bh)) { + trace_gfs2_pin(bd, 0); + atomic_dec(&sdp->sd_log_pinned); + list_del_init(&bd->bd_le.le_list); + if (meta) { + gfs2_assert_warn(sdp, sdp->sd_log_num_buf); + sdp->sd_log_num_buf--; + tr->tr_num_buf_rm++; + } else { + gfs2_assert_warn(sdp, sdp->sd_log_num_databuf); + sdp->sd_log_num_databuf--; + tr->tr_num_databuf_rm++; + } + tr->tr_touched = 1; + brelse(bh); + } + if (bd) { + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_ail) { + gfs2_remove_from_ail(bd); + bh->b_private = NULL; + bd->bd_bh = NULL; + bd->bd_blkno = bh->b_blocknr; + gfs2_trans_add_revoke(sdp, bd); + } + spin_unlock(&sdp->sd_ail_lock); + } + clear_buffer_dirty(bh); + clear_buffer_uptodate(bh); +} + +/** + * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * @ip: the inode who owns the buffers + * @bstart: the first buffer in the run + * @blen: the number of buffers in the run + * + */ + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + + while (blen) { + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (bh) { + lock_buffer(bh); + gfs2_log_lock(sdp); + gfs2_remove_from_journal(bh, current->journal_info, 1); + gfs2_log_unlock(sdp); + unlock_buffer(bh); + brelse(bh); + } + + bstart++; + blen--; + } +} + +/** + * gfs2_meta_indirect_buffer - Get a metadata buffer + * @ip: The GFS2 inode + * @height: The level of this buf in the metadata (indir addr) tree (if any) + * @num: The block number (device relative) of the buffer + * @new: Non-zero if we may create a new buffer + * @bhp: the buffer is returned here + * + * Returns: errno + */ + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + int new, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head 
*bh; + int ret = 0; + + if (new) { + BUG_ON(height == 0); + bh = gfs2_meta_new(gl, num); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + } else { + u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; + } + } + *bhp = bh; + return ret; +} + +/** + * gfs2_meta_ra - start readahead on an extent of a file + * @gl: the glock the blocks belong to + * @dblock: the starting disk block + * @extlen: the number of blocks in the extent + * + * returns: the first buffer in the extent + */ + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *first_bh, *bh; + u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> + sdp->sd_sb.sb_bsize_shift; + + BUG_ON(!extlen); + + if (max_ra < 1) + max_ra = 1; + if (extlen > max_ra) + extlen = max_ra; + + first_bh = gfs2_getbuf(gl, dblock, CREATE); + + if (buffer_uptodate(first_bh)) + goto out; + if (!buffer_locked(first_bh)) + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + + dblock++; + extlen--; + + while (extlen) { + bh = gfs2_getbuf(gl, dblock, CREATE); + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) + ll_rw_block(READA, 1, &bh); + brelse(bh); + dblock++; + extlen--; + if (!buffer_locked(first_bh) && buffer_uptodate(first_bh)) + goto out; + } + + wait_on_buffer(first_bh); +out: + return first_bh; +} + diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h new file mode 100644 index 00000000..22c52659 --- /dev/null +++ b/fs/gfs2/meta_io.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __DIO_DOT_H__ +#define __DIO_DOT_H__ + +#include +#include +#include "incore.h" + +static inline void gfs2_buffer_clear(struct buffer_head *bh) +{ + memset(bh->b_data, 0, bh->b_size); +} + +static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head) +{ + BUG_ON(head > bh->b_size); + memset(bh->b_data + head, 0, bh->b_size - head); +} + +static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, + int to_head, + struct buffer_head *from_bh, + int from_head) +{ + BUG_ON(from_head < to_head); + memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head, + from_bh->b_size - from_head); + memset(to_bh->b_data + to_bh->b_size + to_head - from_head, + 0, from_head - to_head); +} + +extern const struct address_space_operations gfs2_meta_aops; + +static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + if (mapping->a_ops == &gfs2_meta_aops) + return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else + return inode->i_sb->s_fs_info; +} + +void gfs2_meta_sync(struct gfs2_glock *gl); + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, + int flags, struct buffer_head **bhp); +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta); + +void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, + int meta); + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + int new, struct buffer_head **bhp); + +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, + struct buffer_head **bhp) +{ + return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp); +} + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); + +#define buffer_busy(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) + +#endif /* __DIO_DOT_H__ */ + diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c new file mode 100644 index 00000000..fa780e66 --- /dev/null +++ b/fs/gfs2/ops_fstype.c @@ -0,0 +1,1407 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "log.h" +#include "quota.h" +#include "dir.h" +#include "trace_gfs2.h" + +#define DO 0 +#define UNDO 1 + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_quota_simul_sync = 64; + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = 1 << 18; + gt->gt_complain_secs = 10; +} + +static struct gfs2_sbd *init_sbd(struct super_block *sb) +{ + struct gfs2_sbd *sdp; + + sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); + if (!sdp) + return NULL; + + sb->s_fs_info = sdp; + sdp->sd_vfs = sb; + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); + gfs2_tune_init(&sdp->sd_tune); + + init_waitqueue_head(&sdp->sd_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); + spin_lock_init(&sdp->sd_statfs_spin); + + spin_lock_init(&sdp->sd_rindex_spin); + mutex_init(&sdp->sd_rindex_mutex); + INIT_LIST_HEAD(&sdp->sd_rindex_list); + INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + + INIT_LIST_HEAD(&sdp->sd_jindex_list); + spin_lock_init(&sdp->sd_jindex_spin); + mutex_init(&sdp->sd_jindex_mutex); + + INIT_LIST_HEAD(&sdp->sd_quota_list); + mutex_init(&sdp->sd_quota_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + INIT_LIST_HEAD(&sdp->sd_trunc_list); + spin_lock_init(&sdp->sd_trunc_lock); + + spin_lock_init(&sdp->sd_log_lock); + atomic_set(&sdp->sd_log_pinned, 0); + INIT_LIST_HEAD(&sdp->sd_log_le_buf); + INIT_LIST_HEAD(&sdp->sd_log_le_revoke); + INIT_LIST_HEAD(&sdp->sd_log_le_rg); + INIT_LIST_HEAD(&sdp->sd_log_le_databuf); + INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); + INIT_LIST_HEAD(&sdp->sd_ail1_list); + INIT_LIST_HEAD(&sdp->sd_ail2_list); + + init_rwsem(&sdp->sd_log_flush_lock); + atomic_set(&sdp->sd_log_in_flight, 0); + init_waitqueue_head(&sdp->sd_log_flush_wait); + + INIT_LIST_HEAD(&sdp->sd_revoke_list); + + mutex_init(&sdp->sd_freeze_lock); + + return sdp; +} + + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { + if (!silent) + printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. 
*/ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + + return -EINVAL; +} + +static void end_bio_io_page(struct bio *bio, int error) +{ + struct page *page = bio->bi_private; + + if (!error) + SetPageUptodate(page); + else + printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + unlock_page(page); +} + +static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + struct super_block *s = sdp->sd_vfs; + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); + memcpy(s->s_uuid, str->sb_uuid, 16); +} + +/** + * gfs2_read_super - Read the gfs2 super block from disk + * @sdp: The GFS2 super block + * @sector: The location of the super block + * @error: The error code to return + * + * This uses the bio functions to read the super block from disk + * because we want to be 100% sure that we never read cached data. + * A super block is read twice only during each GFS2 mount and is + * never written to by the filesystem. The first time its read no + * locks are held, and the only details which are looked at are those + * relating to the locking protocol. Once locking is up and working, + * the sb is read again under the lock to establish the location of + * the master directory (contains pointers to journals etc) and the + * root directory. 
+ * + * Returns: 0 on success or error + */ + +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOBUFS; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | REQ_META, bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(sdp, p); + kunmap(page); + __free_page(page); + return gfs2_check_sb(sdp, silent); +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; +} + +static int init_names(struct gfs2_sbd *sdp, int silent) +{ + char *proto, *table; 
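+	/* The lock protocol and table names come from the mount arguments;
+	 * empty values are filled in from the on-disk superblock below. */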
+ int error = 0; + + proto = sdp->sd_args.ar_lockproto; + table = sdp->sd_args.ar_locktable; + + /* Try to autodetect */ + + if (!proto[0] || !table[0]) { + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) + return error; + + if (!proto[0]) + proto = sdp->sd_sb.sb_lockproto; + if (!table[0]) + table = sdp->sd_sb.sb_locktable; + } + + if (!table[0]) + table = sdp->sd_vfs->s_id; + + strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); + strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + + table = sdp->sd_table_name; + while ((table = strchr(table, '/'))) + *table = '_'; + + return error; +} + +static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, + int undo) +{ + int error = 0; + + if (undo) + goto fail_trans; + + error = gfs2_glock_nq_num(sdp, + GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + mount_gh); + if (error) { + fs_err(sdp, "can't acquire mount glock: %d\n", error); + goto fail; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_LIVE_LOCK, &gfs2_nondisk_glops, + LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_live_gh); + if (error) { + fs_err(sdp, "can't acquire live glock: %d\n", error); + goto fail_mount; + } + + error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops, + CREATE, &sdp->sd_rename_gl); + if (error) { + fs_err(sdp, "can't create rename glock: %d\n", error); + goto fail_live; + } + + error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, + CREATE, &sdp->sd_trans_gl); + if (error) { + fs_err(sdp, "can't create transaction glock: %d\n", error); + goto fail_rename; + } + + return 0; + +fail_trans: + gfs2_glock_put(sdp->sd_trans_gl); +fail_rename: + gfs2_glock_put(sdp->sd_rename_gl); +fail_live: + gfs2_glock_dq_uninit(&sdp->sd_live_gh); +fail_mount: + gfs2_glock_dq_uninit(mount_gh); +fail: + return error; +} + +static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, + u64 no_addr, const char *name) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct dentry *dentry; + struct inode *inode; + + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + if (IS_ERR(inode)) { + fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_alloc_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + iput(inode); + return -ENOMEM; + } + *dptr = dentry; + return 0; +} + +static int init_sb(struct gfs2_sbd *sdp, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder sb_gh; + u64 no_addr; + int ret; + + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; + } + + ret = gfs2_read_sb(sdp, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); + goto out; + } + + /* Set up the buffer cache and SB for real */ + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too small for device " + "block size (%u)\n", + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); + goto out; + } + if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too big for machine " + "page size (%u)\n", + sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); + goto out; + } + sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + + /* Get the root inode */ + no_addr = sdp->sd_sb.sb_root_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, 
"root"); + if (ret) + goto out; + + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir); +out: + gfs2_glock_dq_uninit(&sb_gh); + return ret; +} + +/** + * map_journal_extents - create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * TODO: This should be done in bigger chunks rather than one block at a time, + * but since it's only done at mount time, I'm not worried about the + * time it takes. + */ +static int map_journal_extents(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd = sdp->sd_jdesc; + unsigned int lb; + u64 db, prev_db; /* logical block, disk block, prev disk block */ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_journal_extent *jext = NULL; + struct buffer_head bh; + int rc = 0; + + prev_db = 0; + + for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = 1 << ip->i_inode.i_blkbits; + rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0); + db = bh.b_blocknr; + if (rc || !db) { + printk(KERN_INFO "GFS2 journal mapping error %d: lb=" + "%u db=%llu\n", rc, lb, (unsigned long long)db); + break; + } + if (!prev_db || db != prev_db + 1) { + jext = kzalloc(sizeof(struct gfs2_journal_extent), + GFP_KERNEL); + if (!jext) { + printk(KERN_INFO "GFS2 error: out of memory " + "mapping journal extents.\n"); + rc = -ENOMEM; + break; + } + jext->dblock = db; + jext->lblock = lb; + jext->blocks = 1; + list_add_tail(&jext->extent_list, &jd->extent_list); + } else { + jext->blocks++; + } + prev_db = db; + } + return rc; +} + +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) +{ + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_first_done = 1; + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); +} + +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * Returns: errno + */ + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct 
gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + INIT_WORK(&jd->jd_work, gfs2_recover_func); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +static int init_journal(struct gfs2_sbd *sdp, int undo) +{ + struct inode *master = sdp->sd_master_dir->d_inode; + struct gfs2_holder ji_gh; + struct gfs2_inode *ip; + int jindex = 1; + int error = 0; + + if (undo) { + jindex = 0; + goto fail_jinode_gh; + } + + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); + if (IS_ERR(sdp->sd_jindex)) { + fs_err(sdp, "can't lookup journal index: %d\n", error); + return PTR_ERR(sdp->sd_jindex); + } + ip = GFS2_I(sdp->sd_jindex); + + /* Load in the journal index special file */ + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) { + fs_err(sdp, "can't read journal index: %d\n", error); + goto fail; + } + + error = -EUSERS; + if (!gfs2_jindex_size(sdp)) { + fs_err(sdp, "no journals!\n"); + goto fail_jindex; + } + + if (sdp->sd_args.ar_spectator) { + sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + } else { + if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { + fs_err(sdp, "can't mount journal #%u\n", + sdp->sd_lockstruct.ls_jid); + fs_err(sdp, "there are only %u journals (0 - %u)\n", + gfs2_jindex_size(sdp), + gfs2_jindex_size(sdp) - 1); + goto fail_jindex; + } + sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid); + + error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, + &gfs2_journal_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP, + &sdp->sd_journal_gh); + if (error) { + fs_err(sdp, "can't acquire journal glock: %d\n", error); + goto fail_jindex; + } + + ip = GFS2_I(sdp->sd_jdesc->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + &sdp->sd_jinode_gh); + if (error) { + fs_err(sdp, "can't acquire journal inode glock: %d\n", + error); + goto fail_journal_gh; + } + + error = gfs2_jdesc_check(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "my journal (%u) is bad: %d\n", + sdp->sd_jdesc->jd_jid, error); + goto fail_jinode_gh; + } + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + + /* Map the extents for this journal's blocks */ + map_journal_extents(sdp); + } + trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + + if (sdp->sd_lockstruct.ls_first) { + unsigned int x; + for (x = 0; x < sdp->sd_journals; x++) { + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x), + true); + if (error) { + fs_err(sdp, "error recovering journal %u: %d\n", + x, error); + goto fail_jinode_gh; + } + } + + gfs2_others_may_mount(sdp); + } else if (!sdp->sd_args.ar_spectator) { + error = gfs2_recover_journal(sdp->sd_jdesc, true); + if (error) { + fs_err(sdp, "error recovering my journal: %d\n", error); + goto fail_jinode_gh; + } + } + + set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); + 
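+	/* Journal recovery is complete; drop the jindex glock before returning. */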
gfs2_glock_dq_uninit(&ji_gh); + jindex = 0; + + return 0; + +fail_jinode_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); +fail_journal_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); +fail_jindex: + gfs2_jindex_free(sdp); + if (jindex) + gfs2_glock_dq_uninit(&ji_gh); +fail: + iput(sdp->sd_jindex); + return error; +} + + +static int init_inodes(struct gfs2_sbd *sdp, int undo) +{ + int error = 0; + struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; + + if (undo) + goto fail_qinode; + + error = init_journal(sdp, undo); + if (error) + goto fail; + + /* Read in the master statfs inode */ + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail_journal; + } + + /* Read in the resource index inode */ + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); + if (IS_ERR(sdp->sd_rindex)) { + error = PTR_ERR(sdp->sd_rindex); + fs_err(sdp, "can't get resource index inode: %d\n", error); + goto fail_statfs; + } + ip = GFS2_I(sdp->sd_rindex); + sdp->sd_rindex_uptodate = 0; + + /* Read in the quota inode */ + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); + if (IS_ERR(sdp->sd_quota_inode)) { + error = PTR_ERR(sdp->sd_quota_inode); + fs_err(sdp, "can't get quota file inode: %d\n", error); + goto fail_rindex; + } + return 0; + +fail_qinode: + iput(sdp->sd_quota_inode); +fail_rindex: + gfs2_clear_rgrpd(sdp); + iput(sdp->sd_rindex); +fail_statfs: + iput(sdp->sd_statfs_inode); +fail_journal: + init_journal(sdp, UNDO); +fail: + return error; +} + +static int init_per_node(struct gfs2_sbd *sdp, int undo) +{ + struct inode *pn = NULL; + char buf[30]; + int error = 0; + struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; + + if (sdp->sd_args.ar_spectator) + return 0; + + if (undo) + goto fail_qc_gh; + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + return error; + } + + sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_sc_inode)) { + error = PTR_ERR(sdp->sd_sc_inode); + fs_err(sdp, "can't find local \"sc\" file: %d\n", error); + goto fail; + } + + sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_qc_inode)) { + error = PTR_ERR(sdp->sd_qc_inode); + fs_err(sdp, "can't find local \"qc\" file: %d\n", error); + goto fail_ut_i; + } + + iput(pn); + pn = NULL; + + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto fail_qc_i; + } + + ip = GFS2_I(sdp->sd_qc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_qc_gh); + if (error) { + fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); + goto fail_ut_gh; + } + + return 0; + +fail_qc_gh: + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); +fail_ut_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); +fail_qc_i: + iput(sdp->sd_qc_inode); +fail_ut_i: + iput(sdp->sd_sc_inode); +fail: + if (pn) + iput(pn); + return error; +} + +static int init_threads(struct gfs2_sbd *sdp, int undo) +{ + struct task_struct *p; + int error = 0; + + if (undo) + goto fail_quotad; + + p = kthread_run(gfs2_logd, sdp, 
"gfs2_logd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + + return 0; + + +fail_quotad: + kthread_stop(sdp->sd_quotad_process); +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d\n", }, + { Opt_err, NULL }, +}; + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = gfs2_glock_free, + .lm_tokens = &nolock_tokens, +}; + +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguments + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + const char *fsname; + char *o, *options; + int ret; + + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); + return -ENOENT; + } + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + ls->ls_ops = lm; + ls->ls_first = 1; + + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; + break; + case Opt_id: + /* Obsolete, but left for backward compat purposes */ + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_first = option; + break; + case Opt_nodir: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_nodir = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, + sdp->sd_lockstruct.ls_jid); + + fsname = strchr(table, ':'); + if (fsname) + fsname++; + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return 0; + } + ret = lm->lm_mount(sdp, fsname); + if (ret == 0) + fs_info(sdp, "Joined cluster. 
Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return ret; +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + lm->lm_unmount) + lm->lm_unmount(sdp); +} + +static int gfs2_journalid_wait(void *word) +{ + if (signal_pending(current)) + return -EINTR; + schedule(); + return 0; +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); +} + +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + +/** + * fill_super - Read in superblock + * @sb: The VFS superblock + * @data: Mount options + * @silent: Don't complain if it's not a GFS2 filesystem + * + * Returns: errno + */ + +static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +{ + struct gfs2_sbd *sdp; + struct gfs2_holder mount_gh; + int error; + + sdp = init_sbd(sb); + if (!sdp) { + printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + return -ENOMEM; + } + sdp->sd_args = *args; + + if (sdp->sd_args.ar_spectator) { + sb->s_flags |= MS_RDONLY; + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + } + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. */ + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; + if (sdp->sd_args.ar_statfs_quantum) { + sdp->sd_tune.gt_statfs_slow = 0; + sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; + } else { + sdp->sd_tune.gt_statfs_slow = 1; + sdp->sd_tune.gt_statfs_quantum = 30; + } + + error = init_names(sdp, silent); + if (error) + goto fail; + + gfs2_create_debugfs_file(sdp); + + error = gfs2_sys_fs_add(sdp); + if (error) + goto fail; + + error = gfs2_lm_mount(sdp, silent); + if (error) + goto fail_sys; + + error = init_locking(sdp, &mount_gh, DO); + if (error) + goto fail_lm; + + error = init_sb(sdp, silent); + if (error) + goto fail_locking; + + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). 
We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + + error = init_inodes(sdp, DO); + if (error) + goto fail_sb; + + error = init_per_node(sdp, DO); + if (error) + goto fail_inodes; + + error = gfs2_statfs_init(sdp); + if (error) { + fs_err(sdp, "can't initialize statfs subsystem: %d\n", error); + goto fail_per_node; + } + + error = init_threads(sdp, DO); + if (error) + goto fail_per_node; + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_threads; + } + } + + gfs2_glock_dq_uninit(&mount_gh); + gfs2_online_uevent(sdp); + return 0; + +fail_threads: + init_threads(sdp, UNDO); +fail_per_node: + init_per_node(sdp, UNDO); +fail_inodes: + init_inodes(sdp, UNDO); +fail_sb: + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + if (sb->s_root) + dput(sb->s_root); + sb->s_root = NULL; +fail_locking: + init_locking(sdp, &mount_gh, UNDO); +fail_lm: + gfs2_gl_hash_clear(sdp); + gfs2_lm_unmount(sdp); +fail_sys: + gfs2_sys_fs_del(sdp); +fail: + gfs2_delete_debugfs_file(sdp); + kfree(sdp); + sb->s_fs_info = NULL; + return error; +} + +static int set_gfs2_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; +} + +static int test_gfs2_super(struct super_block *s, void *ptr) +{ + struct block_device *bdev = ptr; + return (bdev == s->s_bdev); +} + +/** + * gfs2_mount - Get the GFS2 superblock + * @fs_type: The GFS2 filesystem type + * @flags: Mount flags + * @dev_name: The name of the device + * @data: The mount arguments + * + * Q. Why not use get_sb_bdev() ? + * A. 
We need to select one of two root directories to mount, independent + * of whether this is the initial, or subsequent, mount of this sb + * + * Returns: 0 or -ve on error + */ + +static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error; + struct gfs2_args args; + struct gfs2_sbd *sdp; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = PTR_ERR(s); + if (IS_ERR(s)) + goto error_bdev; + + if (s->s_root) + blkdev_put(bdev, mode); + + memset(&args, 0, sizeof(args)); + args.ar_quota = GFS2_QUOTA_DEFAULT; + args.ar_data = GFS2_DATA_DEFAULT; + args.ar_commit = 30; + args.ar_statfs_quantum = 30; + args.ar_quota_quantum = 60; + args.ar_errors = GFS2_ERRORS_DEFAULT; + + error = gfs2_mount_args(&args, data); + if (error) { + printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + goto error_super; + } + + if (s->s_root) { + error = -EBUSY; + if ((flags ^ s->s_flags) & MS_RDONLY) + goto error_super; + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, &args, flags & MS_SILENT ? 
1 : 0); + if (error) + goto error_super; + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + sdp = s->s_fs_info; + if (args.ar_meta) + return dget(sdp->sd_master_dir); + else + return dget(sdp->sd_root_dir); + +error_super: + deactivate_locked_super(s); + return ERR_PTR(error); +error_bdev: + blkdev_put(bdev, mode); + return ERR_PTR(error); +} + +static int set_meta_super(struct super_block *s, void *ptr) +{ + return -EINVAL; +} + +static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct super_block *s; + struct gfs2_sbd *sdp; + struct path path; + int error; + + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + if (error) { + printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", + dev_name, error); + return ERR_PTR(error); + } + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, + path.dentry->d_inode->i_sb->s_bdev); + path_put(&path); + if (IS_ERR(s)) { + printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); + return ERR_CAST(s); + } + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + return ERR_PTR(-EBUSY); + } + sdp = s->s_fs_info; + return dget(sdp->sd_master_dir); +} + +static void gfs2_kill_sb(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + if (sdp == NULL) { + kill_block_super(sb); + return; + } + + gfs2_meta_syncfs(sdp); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; + shrink_dcache_sb(sb); + kill_block_super(sb); + gfs2_delete_debugfs_file(sdp); + kfree(sdp); +} + +struct file_system_type gfs2_fs_type = { + .name = "gfs2", + .fs_flags = FS_REQUIRES_DEV, + .mount = gfs2_mount, + .kill_sb = gfs2_kill_sb, + .owner = THIS_MODULE, +}; + +struct file_system_type gfs2meta_fs_type = { + .name = "gfs2meta", + .fs_flags = FS_REQUIRES_DEV, + .mount = gfs2_mount_meta, + .owner = THIS_MODULE, +}; + diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c new file mode 100644 index 00000000..42e8d23b --- /dev/null +++ b/fs/gfs2/quota.c @@ -0,0 +1,1644 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Quota change tags are associated with each transaction that allocates or + * deallocates space. Those changes are accumulated locally to each node (in a + * per-node file) and then are periodically synced to the quota file. This + * avoids the bottleneck of constantly touching the quota file, but introduces + * fuzziness in the current usage value of IDs that are being used on different + * nodes in the cluster simultaneously. So, it is possible for a user on + * multiple nodes to overrun their quota, but that overrun is controlable. + * Since quota tags are part of transactions, there is no need for a quota check + * program to be run on node crashes or anything like that. + * + * There are couple of knobs that let the administrator manage the quota + * fuzziness. "quota_quantum" sets the maximum time a quota change can be + * sitting on one node before being synced to the quota file. (The default is + * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency + * of quota file syncs increases as the user moves closer to their limit. 
The + * more frequent the syncs, the more accurate the quota enforcement, but that + * means that there is more contention between the nodes for the quota file. + * The default value is one. This sets the maximum theoretical quota overrun + * (with infinite node with infinite bandwidth) to twice the user's limit. (In + * practice, the maximum overrun you see should be much less.) A "quota_scale" + * number greater than one makes quota syncs more frequent and reduces the + * maximum overrun. Numbers less than one (but greater than zero) make quota + * syncs less frequent. + * + * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of + * the quota file, so it is not being constantly read. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "inode.h" +#include "util.h" + +#define QUOTA_USER 1 +#define QUOTA_GROUP 0 + +struct gfs2_quota_change_host { + u64 qc_change; + u32 qc_flags; /* GFS2_QCF_... */ + u32 qc_id; +}; + +static LIST_HEAD(qd_lru_list); +static atomic_t qd_lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(qd_lru_lock); + +int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) +{ + struct gfs2_quota_data *qd; + struct gfs2_sbd *sdp; + int nr_to_scan = sc->nr_to_scan; + + if (nr_to_scan == 0) + goto out; + + if (!(sc->gfp_mask & __GFP_FS)) + return -1; + + spin_lock(&qd_lru_lock); + while (nr_to_scan && !list_empty(&qd_lru_list)) { + qd = list_entry(qd_lru_list.next, + struct gfs2_quota_data, qd_reclaim); + sdp = qd->qd_gl->gl_sbd; + + /* Free from the filesystem-specific list */ + list_del(&qd->qd_list); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + atomic_dec(&sdp->sd_quota_count); + + /* Delete it from the common reclaim list */ + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + spin_unlock(&qd_lru_lock); + kmem_cache_free(gfs2_quotad_cachep, qd); + spin_lock(&qd_lru_lock); + nr_to_scan--; + } + spin_unlock(&qd_lru_lock); + +out: + return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; +} + +static u64 qd2offset(struct gfs2_quota_data *qd) +{ + u64 offset; + + offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset *= sizeof(struct gfs2_quota); + + return offset; +} + +static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); + if (!qd) + return -ENOMEM; + + atomic_set(&qd->qd_count, 1); + qd->qd_id = id; + if (user) + set_bit(QDF_USER, &qd->qd_flags); + qd->qd_slot = -1; + INIT_LIST_HEAD(&qd->qd_reclaim); + + error = gfs2_glock_get(sdp, 2 * (u64)id + !user, + &gfs2_quota_glops, CREATE, &qd->qd_gl); + if (error) + goto fail; + + *qdp = qd; + + return 0; + +fail: + kmem_cache_free(gfs2_quotad_cachep, qd); + return error; +} + +static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL, *new_qd = NULL; + int error, found; + + *qdp = NULL; + + for (;;) { + found = 0; + spin_lock(&qd_lru_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, 
qd_list) { + if (qd->qd_id == id && + !test_bit(QDF_USER, &qd->qd_flags) == !user) { + if (!atomic_read(&qd->qd_count) && + !list_empty(&qd->qd_reclaim)) { + /* Remove it from reclaim list */ + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + } + atomic_inc(&qd->qd_count); + found = 1; + break; + } + } + + if (!found) + qd = NULL; + + if (!qd && new_qd) { + qd = new_qd; + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + new_qd = NULL; + } + + spin_unlock(&qd_lru_lock); + + if (qd) { + if (new_qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + *qdp = qd; + return 0; + } + + error = qd_alloc(sdp, user, id, &new_qd); + if (error) + return error; + } +} + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + gfs2_assert(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) { + /* Add to the reclaim list */ + list_add_tail(&qd->qd_reclaim, &qd_lru_list); + atomic_inc(&qd_lru_count); + spin_unlock(&qd_lru_lock); + } +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + unsigned int c, o = 0, b; + unsigned char byte = 0; + + spin_lock(&qd_lru_lock); + + if (qd->qd_slot_count++) { + spin_unlock(&qd_lru_lock); + return 0; + } + + for (c = 0; c < sdp->sd_quota_chunks; c++) + for (o = 0; o < PAGE_SIZE; o++) { + byte = sdp->sd_quota_bitmap[c][o]; + if (byte != 0xFF) + goto found; + } + + goto fail; + +found: + for (b = 0; b < 8; b++) + if (!(byte & (1 << b))) + break; + qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; + + if (qd->qd_slot >= sdp->sd_quota_slots) + goto fail; + + sdp->sd_quota_bitmap[c][o] |= 1 << b; + + spin_unlock(&qd_lru_lock); + + return 0; + +fail: + qd->qd_slot_count--; + spin_unlock(&qd_lru_lock); + return -ENOSPC; +} + +static void slot_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&qd_lru_lock); + gfs2_assert(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + spin_unlock(&qd_lru_lock); +} + +static void slot_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&qd_lru_lock); + gfs2_assert(sdp, qd->qd_slot_count); + if (!--qd->qd_slot_count) { + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + qd->qd_slot = -1; + } + spin_unlock(&qd_lru_lock); +} + +static int bh_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int block, offset; + struct buffer_head *bh; + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + mutex_lock(&sdp->sd_quota_mutex); + + if (qd->qd_bh_count++) { + mutex_unlock(&sdp->sd_quota_mutex); + return 0; + } + + block = qd->qd_slot / sdp->sd_qc_per_block; + offset = qd->qd_slot % sdp->sd_qc_per_block; + + bh_map.b_size = 1 << ip->i_inode.i_blkbits; + error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); + if (error) + goto fail; + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) + goto fail_brelse; + + qd->qd_bh = bh; + qd->qd_bh_qc = (struct gfs2_quota_change *) + (bh->b_data + sizeof(struct gfs2_meta_header) + + offset * sizeof(struct gfs2_quota_change)); + + mutex_unlock(&sdp->sd_quota_mutex); + + return 0; + +fail_brelse: + 
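+	/* Error unwind: release the buffer read above, drop the bh reference
+	 * count taken at the top, and let go of the quota mutex. */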
brelse(bh); +fail: + qd->qd_bh_count--; + mutex_unlock(&sdp->sd_quota_mutex); + return error; +} + +static void bh_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_assert(sdp, qd->qd_bh_count); + if (!--qd->qd_bh_count) { + brelse(qd->qd_bh); + qd->qd_bh = NULL; + qd->qd_bh_qc = NULL; + } + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL; + int error; + int found = 0; + + *qdp = NULL; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&qd_lru_lock); + + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + qd->qd_sync_gen >= sdp->sd_quota_sync_gen) + continue; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + found = 1; + + break; + } + + if (!found) + qd = NULL; + + spin_unlock(&qd_lru_lock); + + if (qd) { + gfs2_assert_warn(sdp, qd->qd_change_sync); + error = bh_get(qd); + if (error) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; + } + } + + *qdp = qd; + + return 0; +} + +static int qd_trylock(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&qd_lru_lock); + + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) { + spin_unlock(&qd_lru_lock); + return 0; + } + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + + spin_unlock(&qd_lru_lock); + + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return 0; + } + + return 1; +} + +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_gl->gl_sbd, + test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + int error; + + error = qd_get(sdp, user, id, qdp); + if (error) + return error; + + error = slot_get(*qdp); + if (error) + goto fail; + + error = bh_get(*qdp); + if (error) + goto fail_slot; + + return 0; + +fail_slot: + slot_put(*qdp); +fail: + qd_put(*qdp); + return error; +} + +static void qdsb_put(struct gfs2_quota_data *qd) +{ + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data **qd = al->al_qd; + int error; + + if (gfs2_assert_warn(sdp, !al->al_qd_num) || + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) + return -EIO; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + + error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); + if (error) + goto out; + 
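+	/* The inode's group quota is now held; count it next to the user quota above. */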
al->al_qd_num++; + qd++; + + if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { + error = qdsb_get(sdp, QUOTA_USER, uid, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + + if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { + error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + +out: + if (error) + gfs2_quota_unhold(ip); + return error; +} + +void gfs2_quota_unhold(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + unsigned int x; + + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); + + for (x = 0; x < al->al_qd_num; x++) { + qdsb_put(al->al_qd[x]); + al->al_qd[x] = NULL; + } + al->al_qd_num = 0; +} + +static int sort_qd(const void *a, const void *b) +{ + const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; + const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; + + if (!test_bit(QDF_USER, &qd_a->qd_flags) != + !test_bit(QDF_USER, &qd_b->qd_flags)) { + if (test_bit(QDF_USER, &qd_a->qd_flags)) + return -1; + else + return 1; + } + if (qd_a->qd_id < qd_b->qd_id) + return -1; + if (qd_a->qd_id > qd_b->qd_id) + return 1; + + return 0; +} + +static void do_qc(struct gfs2_quota_data *qd, s64 change) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct gfs2_quota_change *qc = qd->qd_bh_qc; + s64 x; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + + if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { + qc->qc_change = 0; + qc->qc_flags = 0; + if (test_bit(QDF_USER, &qd->qd_flags)) + qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); + qc->qc_id = cpu_to_be32(qd->qd_id); + } + + x = be64_to_cpu(qc->qc_change) + change; + qc->qc_change = cpu_to_be64(x); + + spin_lock(&qd_lru_lock); + qd->qd_change = x; + spin_unlock(&qd_lru_lock); + + if (!x) { + gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); + clear_bit(QDF_CHANGE, &qd->qd_flags); + qc->qc_flags = 0; + qc->qc_id = 0; + slot_put(qd); + qd_put(qd); + } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + + mutex_unlock(&sdp->sd_quota_mutex); +} + +/** + * gfs2_adjust_quota - adjust record of current block usage + * @ip: The quota inode + * @loc: Offset of the entry in the quota file + * @change: The amount of usage change to record + * @qd: The quota data + * @fdq: The updated limits to record + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + * + * Returns: 0 or -ve on error + */ + +static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, + s64 change, struct gfs2_quota_data *qd, + struct fs_disk_quota *fdq) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long index = loc >> PAGE_CACHE_SHIFT; + unsigned offset = loc & (PAGE_CACHE_SIZE - 1); + unsigned blocksize, iblock, pos; + struct buffer_head *bh, *dibh; + struct page *page; + void *kaddr, *ptr; + struct gfs2_quota q, *qp; + int err, nbytes; + u64 size; + + if (gfs2_is_stuffed(ip)) + gfs2_unstuff_dinode(ip, NULL); + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + qp = &q; + qp->qu_value = be64_to_cpu(qp->qu_value); + qp->qu_value += change; + qp->qu_value = cpu_to_be64(qp->qu_value); + 
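+	/* Mirror the adjusted (big-endian) usage value into the cached quota LVB. */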
qd->qd_qb.qb_value = qp->qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = qp->qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = qp->qu_limit; + } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = qp->qu_value; + } + } + + /* Write the quota into the quota file on disk */ + ptr = qp; + nbytes = sizeof(struct gfs2_quota); +get_a_page: + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, iblock, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); + } + + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + /* Update the disk inode timestamp and size (if extended) */ + err = gfs2_meta_inode_buffer(ip, &dibh); + if (err) + goto out; + + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode->i_atime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + mark_inode_dirty(inode); + +out: + return err; +unlock_out: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) +{ + struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + unsigned int data_blocks, ind_blocks; + struct gfs2_holder *ghs, i_gh; + unsigned int qx, x; + struct gfs2_quota_data *qd; + loff_t offset; + unsigned int nalloc = 0, blocks; + struct gfs2_alloc *al = NULL; + int error; + + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); + if (!ghs) + return -ENOMEM; + + sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA); + for (qx = 0; qx < num_qd; qx++) { + error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &ghs[qx]); + if (error) + goto out; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if 
(error) + goto out; + + for (x = 0; x < num_qd; x++) { + offset = qd2offset(qda[x]); + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) + nalloc++; + } + + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + /* + * 1 blk for unstuffing inode if stuffed. We add this extra + * block to the reservation unconditionally. If the inode + * doesn't need unstuffing, the block will be released to the + * rgrp since it won't be allocated during the transaction + */ + al->al_requested = 1; + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; + + if (nalloc) + al->al_requested += nalloc * (data_blocks + ind_blocks); + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + + if (nalloc) + blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto out_ipres; + + for (x = 0; x < num_qd; x++) { + qd = qda[x]; + offset = qd2offset(qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); + if (error) + goto out_end_trans; + + do_qc(qd, -qd->qd_change_sync); + set_bit(QDF_REFRESH, &qd->qd_flags); + } + + error = 0; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + gfs2_inplace_release(ip); +out_alloc: + gfs2_alloc_put(ip); +out_gunlock: + gfs2_glock_dq_uninit(&i_gh); +out: + while (qx--) + gfs2_glock_dq_uninit(&ghs[qx]); + mutex_unlock(&ip->i_inode.i_mutex); + kfree(ghs); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + return error; +} + +static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota q; + struct gfs2_quota_lvb *qlvb; + loff_t pos; + int error; + + memset(&q, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q)); + if (error < 0) + return error; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; + qd->qd_qb = *qlvb; + + return 0; +} + +static int do_glock(struct gfs2_quota_data *qd, int force_refresh, + struct gfs2_holder *q_gh) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_holder i_gh; + int error; + +restart: + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); + if (error) + return error; + + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + + if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { + gfs2_glock_dq_uninit(q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, q_gh); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + if (error) + goto fail; + + error = update_qd(sdp, qd); + if (error) + goto fail_gunlock; + + gfs2_glock_dq_uninit(&i_gh); + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + gfs2_glock_dq_uninit(q_gh); + return error; +} + +int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; + unsigned int x; + int error = 0; + + 
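	/*
+	 * gfs2_quota_hold() pins the quota data entries for this inode; when
+	 * quota enforcement is on we then take a shared glock on each one.
+	 * The entries are sorted first (see sort_qd: user entries before
+	 * group entries, then by id) so that any two lockers acquire them in
+	 * the same order.
+	 */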
+	gfs2_quota_hold(ip, uid, gid);
+
+	if (capable(CAP_SYS_RESOURCE) ||
+	    sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+
+	sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
+	     sort_qd, NULL);
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		int force = NO_FORCE;
+		qd = al->al_qd[x];
+		if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+			force = FORCE;
+		error = do_glock(qd, force, &al->al_qd_ghs[x]);
+		if (error)
+			break;
+	}
+
+	if (!error)
+		set_bit(GIF_QD_LOCKED, &ip->i_flags);
+	else {
+		while (x--)
+			gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+		gfs2_quota_unhold(ip);
+	}
+
+	return error;
+}
+
+static int need_sync(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	s64 value;
+	unsigned int num, den;
+	int do_sync = 1;
+
+	if (!qd->qd_qb.qb_limit)
+		return 0;
+
+	spin_lock(&qd_lru_lock);
+	value = qd->qd_change;
+	spin_unlock(&qd_lru_lock);
+
+	spin_lock(&gt->gt_spin);
+	num = gt->gt_quota_scale_num;
+	den = gt->gt_quota_scale_den;
+	spin_unlock(&gt->gt_spin);
+
+	if (value < 0)
+		do_sync = 0;
+	else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+		 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+		do_sync = 0;
+	else {
+		value *= gfs2_jindex_size(sdp) * num;
+		value = div_s64(value, den);
+		value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+			do_sync = 0;
+	}
+
+	return do_sync;
+}
+
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_quota_data *qda[4];
+	unsigned int count = 0;
+	unsigned int x;
+
+	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
+		goto out;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		struct gfs2_quota_data *qd;
+		int sync;
+
+		qd = al->al_qd[x];
+		sync = need_sync(qd);
+
+		gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+
+		if (sync && qd_trylock(qd))
+			qda[count++] = qd;
+	}
+
+	if (count) {
+		do_sync(count, qda);
+		for (x = 0; x < count; x++)
+			qd_unlock(qda[x]);
+	}
+
+out:
+	gfs2_quota_unhold(ip);
+}
+
+#define MAX_LINE 256
+
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
+	       sdp->sd_fsname, type,
+	       (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
+	       qd->qd_id);
+
+	return 0;
+}
+
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_quota_data *qd;
+	s64 value;
+	unsigned int x;
+	int error = 0;
+
+	if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
+		return 0;
+
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		qd = al->al_qd[x];
+
+		if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+		      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+			continue;
+
+		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		spin_lock(&qd_lru_lock);
+		value += qd->qd_change;
+		spin_unlock(&qd_lru_lock);
+
+		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
+			print_message(qd, "exceeded");
+			quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+ USRQUOTA : GRPQUOTA, qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); + + error = -EDQUOT; + break; + } else if (be64_to_cpu(qd->qd_qb.qb_warn) && + (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && + time_after_eq(jiffies, qd->qd_last_warn + + gfs2_tune_get(sdp, + gt_quota_warn_period) * HZ)) { + quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? + USRQUOTA : GRPQUOTA, qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); + error = print_message(qd, "warning"); + qd->qd_last_warn = jiffies; + } + } + + return error; +} + +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid) +{ + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; + unsigned int x; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + return; + if (ip->i_diskflags & GFS2_DIF_SYSTEM) + return; + + for (x = 0; x < al->al_qd_num; x++) { + qd = al->al_qd[x]; + + if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || + (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + do_qc(qd, change); + } + } +} + +int gfs2_quota_sync(struct super_block *sb, int type, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_data **qda; + unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int num_qd; + unsigned int x; + int error = 0; + + sdp->sd_quota_sync_gen++; + + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); + if (!qda) + return -ENOMEM; + + do { + num_qd = 0; + + for (;;) { + error = qd_fish(sdp, qda + num_qd); + if (error || !qda[num_qd]) + break; + if (++num_qd == max_qd) + break; + } + + if (num_qd) { + if (!error) + error = do_sync(num_qd, qda); + if (!error) + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = + sdp->sd_quota_sync_gen; + + for (x = 0; x < num_qd; x++) + qd_unlock(qda[x]); + } + } while (!error && num_qd == max_qd); + + kfree(qda); + + return error; +} + +static int gfs2_quota_sync_timeo(struct super_block *sb, int type) +{ + return gfs2_quota_sync(sb, type, 0); +} + +int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + error = qd_get(sdp, user, id, &qd); + if (error) + return error; + + error = do_glock(qd, FORCE, &q_gh); + if (!error) + gfs2_glock_dq_uninit(&q_gh); + + qd_put(qd); + return error; +} + +static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) +{ + const struct gfs2_quota_change *str = buf; + + qc->qc_change = be64_to_cpu(str->qc_change); + qc->qc_flags = be32_to_cpu(str->qc_flags); + qc->qc_id = be32_to_cpu(str->qc_id); +} + +int gfs2_quota_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; + unsigned int x, slot = 0; + unsigned int found = 0; + u64 dblock; + u32 extlen = 0; + int error; + + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) + return -EIO; + + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); + + error = -ENOMEM; + + sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, + sizeof(unsigned char *), GFP_NOFS); + if (!sdp->sd_quota_bitmap) + return error; + + for (x = 0; x < sdp->sd_quota_chunks; x++) { + sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); + if (!sdp->sd_quota_bitmap[x]) + goto fail; + } + + for (x = 0; x < blocks; x++) { + struct buffer_head *bh; + unsigned int y; + + if (!extlen) { + int new = 
0; + error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen); + if (error) + goto fail; + } + error = -EIO; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + if (!bh) + goto fail; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) { + brelse(bh); + goto fail; + } + + for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; + y++, slot++) { + struct gfs2_quota_change_host qc; + struct gfs2_quota_data *qd; + + gfs2_quota_change_in(&qc, bh->b_data + + sizeof(struct gfs2_meta_header) + + y * sizeof(struct gfs2_quota_change)); + if (!qc.qc_change) + continue; + + error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), + qc.qc_id, &qd); + if (error) { + brelse(bh); + goto fail; + } + + set_bit(QDF_CHANGE, &qd->qd_flags); + qd->qd_change = qc.qc_change; + qd->qd_slot = slot; + qd->qd_slot_count = 1; + + spin_lock(&qd_lru_lock); + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + spin_unlock(&qd_lru_lock); + + found++; + } + + brelse(bh); + dblock++; + extlen--; + } + + if (found) + fs_info(sdp, "found %u quota changes\n", found); + + return 0; + +fail: + gfs2_quota_cleanup(sdp); + return error; +} + +void gfs2_quota_cleanup(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_quota_list; + struct gfs2_quota_data *qd; + unsigned int x; + + spin_lock(&qd_lru_lock); + while (!list_empty(head)) { + qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); + + if (atomic_read(&qd->qd_count) > 1 || + (atomic_read(&qd->qd_count) && + !test_bit(QDF_CHANGE, &qd->qd_flags))) { + list_move(&qd->qd_list, head); + spin_unlock(&qd_lru_lock); + schedule(); + spin_lock(&qd_lru_lock); + continue; + } + + list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ + if (!list_empty(&qd->qd_reclaim)) { + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + } + atomic_dec(&sdp->sd_quota_count); + spin_unlock(&qd_lru_lock); + + if (!atomic_read(&qd->qd_count)) { + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + } else + gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, qd); + + spin_lock(&qd_lru_lock); + } + spin_unlock(&qd_lru_lock); + + gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + + if (sdp->sd_quota_bitmap) { + for (x = 0; x < sdp->sd_quota_chunks; x++) + kfree(sdp->sd_quota_bitmap[x]); + kfree(sdp->sd_quota_bitmap); + } +} + +static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) +{ + if (error == 0 || error == -EROFS) + return; + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); +} + +static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, + int (*fxn)(struct super_block *sb, int type), + unsigned long t, unsigned long *timeo, + unsigned int *new_timeo) +{ + if (t >= *timeo) { + int error = fxn(sdp->sd_vfs, 0); + quotad_error(sdp, msg, error); + *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; + } else { + *timeo -= t; + } +} + +static void quotad_check_trunc_list(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + while(1) { + ip = NULL; + spin_lock(&sdp->sd_trunc_lock); + if (!list_empty(&sdp->sd_trunc_list)) { + ip = list_entry(sdp->sd_trunc_list.next, + struct gfs2_inode, i_trunc_list); + list_del_init(&ip->i_trunc_list); + } + spin_unlock(&sdp->sd_trunc_lock); + if (ip == NULL) + return; + 
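		/*
+		 * ip was pulled off sd_trunc_list above; finish its
+		 * interrupted truncate here, after sd_trunc_lock has been
+		 * dropped.
+		 */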
+		gfs2_glock_finish_truncate(ip);
+	}
+}
+
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
+	if (!sdp->sd_statfs_force_sync) {
+		sdp->sd_statfs_force_sync = 1;
+		wake_up(&sdp->sd_quota_wait);
+	}
+}
+
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_tune *tune = &sdp->sd_tune;
+	unsigned long statfs_timeo = 0;
+	unsigned long quotad_timeo = 0;
+	unsigned long t = 0;
+	DEFINE_WAIT(wait);
+	int empty;
+
+	while (!kthread_should_stop()) {
+
+		/* Update the master statfs file */
+		if (sdp->sd_statfs_force_sync) {
+			int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+			quotad_error(sdp, "statfs", error);
+			statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+		}
+		else
+			quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+					   &statfs_timeo,
+					   &tune->gt_statfs_quantum);
+
+		/* Update quota file */
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+				   &quotad_timeo, &tune->gt_quota_quantum);
+
+		/* Check for & recover partially truncated inodes */
+		quotad_check_trunc_list(sdp);
+
+		if (freezing(current))
+			refrigerator();
+		t = min(quotad_timeo, statfs_timeo);
+
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_lock(&sdp->sd_trunc_lock);
+		empty = list_empty(&sdp->sd_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (empty && !sdp->sd_statfs_force_sync)
+			t -= schedule_timeout(t);
+		else
+			t = 0;
+		finish_wait(&sdp->sd_quota_wait, &wait);
+	}
+
+	return 0;
+}
+
+static int gfs2_quota_get_xstate(struct super_block *sb,
+				 struct fs_quota_stat *fqs)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	memset(fqs, 0, sizeof(struct fs_quota_stat));
+	fqs->qs_version = FS_QSTAT_VERSION;
+
+	switch (sdp->sd_args.ar_quota) {
+	case GFS2_QUOTA_ON:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
+		/*FALLTHRU*/
+	case GFS2_QUOTA_ACCOUNT:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
+		break;
+	case GFS2_QUOTA_OFF:
+		break;
+	}
+
+	if (sdp->sd_quota_inode) {
+		fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+		fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+	}
+	fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
+	fqs->qs_gquota = fqs->qs_uquota; /* it's the same inode in both cases */
+	fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+	return 0;
+}
+
+static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_quota_lvb *qlvb;
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh;
+	int error;
+
+	memset(fdq, 0, sizeof(struct fs_disk_quota));
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return -ESRCH; /* Crazy XFS error code */
+
+	if (type == USRQUOTA)
+		type = QUOTA_USER;
+	else if (type == GRPQUOTA)
+		type = QUOTA_GROUP;
+	else
+		return -EINVAL;
+
+	error = qd_get(sdp, type, id, &qd);
+	if (error)
+		return error;
+	error = do_glock(qd, FORCE, &q_gh);
+	if (error)
+		goto out;
+
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	fdq->d_version = FS_DQUOT_VERSION;
+	fdq->d_flags = (type == QUOTA_USER) ?
FS_USER_QUOTA : FS_GROUP_QUOTA; + fdq->d_id = id; + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; + + gfs2_glock_dq_uninit(&q_gh); +out: + qd_put(qd); + return error; +} + +/* GFS2 only supports a subset of the XFS fields */ +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) + +static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh, i_gh; + unsigned int data_blocks, ind_blocks; + unsigned int blocks = 0; + int alloc_required; + struct gfs2_alloc *al; + loff_t offset; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + switch(type) { + case USRQUOTA: + type = QUOTA_USER; + if (fdq->d_flags != FS_USER_QUOTA) + return -EINVAL; + break; + case GRPQUOTA: + type = QUOTA_GROUP; + if (fdq->d_flags != FS_GROUP_QUOTA) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (fdq->d_fieldmask & ~GFS2_FIELDMASK) + return -EINVAL; + if (fdq->d_id != id) + return -EINVAL; + + error = qd_get(sdp, type, id, &qd); + if (error) + return error; + + mutex_lock(&ip->i_inode.i_mutex); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); + if (error) + goto out_put; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_q; + + /* Check for existing entry, if none then alloc new blocks */ + error = update_qd(sdp, qd); + if (error) + goto out_i; + + /* If nothing has changed, this is a no-op */ + if ((fdq->d_fieldmask & FS_DQ_BSOFT) && + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= FS_DQ_BSOFT; + + if ((fdq->d_fieldmask & FS_DQ_BHARD) && + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + + if (fdq->d_fieldmask == 0) + goto out_i; + + offset = qd2offset(qd); + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_i; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + blocks = al->al_requested = 1 + data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + blocks += gfs2_rg_blocks(al); + } + + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); + if (error) + goto out_release; + + /* Apply changes */ + error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + + gfs2_trans_end(sdp); +out_release: + if (alloc_required) { + gfs2_inplace_release(ip); +out_alloc: + gfs2_alloc_put(ip); + } +out_i: + gfs2_glock_dq_uninit(&i_gh); +out_q: + gfs2_glock_dq_uninit(&q_gh); +out_put: + mutex_unlock(&ip->i_inode.i_mutex); + qd_put(qd); + return error; +} + +const struct quotactl_ops gfs2_quotactl_ops = { + .quota_sync = gfs2_quota_sync, + .get_xstate = gfs2_quota_get_xstate, + 
.get_dqblk = gfs2_get_dqblk, + .set_dqblk = gfs2_set_dqblk, +}; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h new file mode 100644 index 00000000..90bf1c30 --- /dev/null +++ b/fs/gfs2/quota.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __QUOTA_DOT_H__ +#define __QUOTA_DOT_H__ + +struct gfs2_inode; +struct gfs2_sbd; +struct shrink_control; + +#define NO_QUOTA_CHANGE ((u32)-1) + +extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); + +extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); + +extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid); + +extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); + +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); + +extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); + +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int ret; + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (ret) + return ret; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (ret) + gfs2_quota_unlock(ip); + return ret; +} + +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, + struct shrink_control *sc); +extern const struct quotactl_ops gfs2_quotactl_ops; + +#endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c new file mode 100644 index 00000000..f2a02edc --- /dev/null +++ b/fs/gfs2/recovery.c @@ -0,0 +1,610 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "super.h" +#include "util.h" +#include "dir.h" + +struct workqueue_struct *gfs_recovery_wq; + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + int new = 0; + u64 dblock; + u32 extlen; + int error; + + error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + *bh = gfs2_meta_ra(gl, dblock, extlen); + + return error; +} + +int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + int found = 0; + + list_for_each_entry(rr, head, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (found) { + rr->rr_where = where; + return 0; + } + + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + list_add(&rr->rr_list, head); + + return 1; +} + +int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +{ + struct gfs2_revoke_replay *rr; + int wrap, a, b, revoke; + int found = 0; + + list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (!found) + return 0; + + wrap = (rr->rr_where < sdp->sd_replay_tail); + a = (sdp->sd_replay_tail < where); + b = (where < rr->rr_where); + revoke = (wrap) ? (a || b) : (a && b); + + return revoke; +} + +void gfs2_revoke_clean(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + + while (!list_empty(head)) { + rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list); + list_del(&rr->rr_list); + kfree(rr); + } +} + +static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) +{ + const struct gfs2_log_header *str = buf; + + if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH)) + return 1; + + lh->lh_sequence = be64_to_cpu(str->lh_sequence); + lh->lh_flags = be32_to_cpu(str->lh_flags); + lh->lh_tail = be32_to_cpu(str->lh_tail); + lh->lh_blkno = be32_to_cpu(str->lh_blkno); + lh->lh_hash = be32_to_cpu(str->lh_hash); + return 0; +} + +/** + * get_log_header - read the log header for a given segment + * @jd: the journal + * @blk: the block to look at + * @lh: the log header to return + * + * Read the log header for a given segement in a given journal. Do a few + * sanity checks on it. 
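+ * The checks are: the header magic and type must be valid (see
+ * gfs2_log_header_in), lh_blkno must match the block that was read, and
+ * lh_hash must match a CRC32 of the header computed with the hash field
+ * itself taken as zero.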
+ *
+ * Returns: 0 on success,
+ *          1 if the header was invalid or incomplete,
+ *          errno on error
+ */
+
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+			  struct gfs2_log_header_host *head)
+{
+	struct buffer_head *bh;
+	struct gfs2_log_header_host uninitialized_var(lh);
+	const u32 nothing = 0;
+	u32 hash;
+	int error;
+
+	error = gfs2_replay_read_block(jd, blk, &bh);
+	if (error)
+		return error;
+
+	hash = crc32_le((u32)~0, bh->b_data, sizeof(struct gfs2_log_header) -
+					     sizeof(u32));
+	hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing));
+	hash ^= (u32)~0;
+	error = gfs2_log_header_in(&lh, bh->b_data);
+	brelse(bh);
+
+	if (error || lh.lh_blkno != blk || lh.lh_hash != hash)
+		return 1;
+
+	*head = lh;
+
+	return 0;
+}
+
+/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @lh: the log header to fill in
+ * @forward: if true search forward in the log, else search backward
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, either scan forward or backward until we find a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+			struct gfs2_log_header_host *head)
+{
+	unsigned int orig_blk = *blk;
+	int error;
+
+	for (;;) {
+		error = get_log_header(jd, *blk, head);
+		if (error <= 0)
+			return error;
+
+		if (++*blk == jd->jd_blocks)
+			*blk = 0;
+
+		if (*blk == orig_blk) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+	}
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before.  Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+	unsigned int blk = head->lh_blkno;
+	struct gfs2_log_header_host lh;
+	int error;
+
+	for (;;) {
+		if (++blk == jd->jd_blocks)
+			blk = 0;
+
+		error = get_log_header(jd, blk, &lh);
+		if (error < 0)
+			return error;
+		if (error == 1)
+			continue;
+
+		if (lh.lh_sequence == head->lh_sequence) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+		if (lh.lh_sequence < head->lh_sequence)
+			break;
+
+		*head = lh;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number.  (i.e.
the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_log_header_host lh_1, lh_m; + u32 blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** + * foreach_descriptor - go through the active part of the log + * @jd: the journal + * @start: the first log header in the active region + * @end: the last log header (don't process the contents of this entry)) + * + * Call a given function once for every log descriptor in the active + * portion of the log. + * + * Returns: errno + */ + +static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, + unsigned int end, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + int error = 0; + u32 length; + __be64 *ptr; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + offset += sizeof(__be64) - 1; + offset &= ~(sizeof(__be64) - 1); + + while (start != end) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + if (gfs2_meta_check(sdp, bh)) { + brelse(bh); + return -EIO; + } + ld = (struct gfs2_log_descriptor *)bh->b_data; + length = be32_to_cpu(ld->ld_length); + + if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { + struct gfs2_log_header_host lh; + error = get_log_header(jd, start, &lh); + if (!error) { + gfs2_replay_incr_blk(sdp, &start); + brelse(bh); + continue; + } + if (error == 1) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + error = -EIO; + } + brelse(bh); + return error; + } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { + brelse(bh); + return -EIO; + } + ptr = (__be64 *)(bh->b_data + offset); + error = lops_scan_elements(jd, start, ld, ptr, pass); + if (error) { + brelse(bh); + return error; + } + + while (length--) + gfs2_replay_incr_blk(sdp, &start); + + brelse(bh); + } + + return 0; +} + +/** + * clean_journal - mark a dirty journal as being clean + * @sdp: the filesystem + * @jd: the journal + * @gl: the journal's glock + * @head: the head journal to start from + * + * Returns: errno + */ + +static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int lblock; + struct gfs2_log_header *lh; + u32 hash; + struct buffer_head *bh; + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + lblock = head->lh_blkno; + gfs2_replay_incr_blk(sdp, &lblock); + bh_map.b_size = 1 << ip->i_inode.i_blkbits; + error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); + if (error) + return error; + if (!bh_map.b_blocknr) { + gfs2_consist_inode(ip); + return -EIO; + } + + bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = 
cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); + lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); + lh->lh_blkno = cpu_to_be32(lblock); + hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + return error; +} + + +static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) +{ + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; + ls->ls_recover_jid_status = message; + sprintf(env_jid, "JID=%d", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); +} + +void gfs2_recover_func(struct work_struct *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host head; + struct gfs2_holder j_gh, ji_gh, t_gh; + unsigned long t; + int ro = 0; + unsigned int pass; + int error; + int jlocked = 0; + + if (sdp->sd_args.ar_spectator || + (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { + fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", + jd->jd_jid); + jlocked = 1; + /* Acquire the journal lock so we can do recovery */ + + error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, + &j_gh); + switch (error) { + case 0: + break; + + case GLR_TRYFAILED: + fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); + error = 0; + + default: + goto fail; + }; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); + if (error) + goto fail_gunlock_j; + } else { + fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); + } + + fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); + + error = gfs2_jdesc_check(jd); + if (error) + goto fail_gunlock_ji; + + error = gfs2_find_jhead(jd, &head); + if (error) + goto fail_gunlock_ji; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", + jd->jd_jid); + + t = jiffies; + + /* Acquire a shared hold on the transaction lock */ + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY | + GL_NOCACHE, &t_gh); + if (error) + goto fail_gunlock_ji; + + if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + ro = 1; + } else { + if (sdp->sd_vfs->s_flags & MS_RDONLY) { + /* check if device itself is read-only */ + ro = bdev_read_only(sdp->sd_vfs->s_bdev); + if (!ro) { + fs_info(sdp, "recovery required on " + "read-only filesystem.\n"); + fs_info(sdp, "write access will be " + "enabled during recovery.\n"); + } + } + } + + if (ro) { + fs_warn(sdp, "jid=%u: Can't replay: read-only block " + "device\n", jd->jd_jid); + error = -EROFS; + goto fail_gunlock_tr; + } + + fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); + + for (pass = 0; pass < 2; pass++) { + lops_before_scan(jd, &head, pass); + error = foreach_descriptor(jd, head.lh_tail, + head.lh_blkno, pass); + 
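			/*
+			 * foreach_descriptor() walks the active region of the
+			 * log, from head.lh_tail up to (but not including)
+			 * head.lh_blkno, handing each log descriptor to the
+			 * per-type scan hooks via lops_scan_elements().
+			 * Broadly, the first pass collects revokes and the
+			 * second replays the logged blocks.
+			 */
+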
lops_after_scan(jd, error, pass); + if (error) + goto fail_gunlock_tr; + } + + error = clean_journal(jd, &head); + if (error) + goto fail_gunlock_tr; + + gfs2_glock_dq_uninit(&t_gh); + t = DIV_ROUND_UP(jiffies - t, HZ); + fs_info(sdp, "jid=%u: Journal replayed in %lus\n", + jd->jd_jid, t); + } + + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); + goto done; + +fail_gunlock_tr: + gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_ji: + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); +fail_gunlock_j: + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); +fail: + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +static int gfs2_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) +{ + int rv; + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + TASK_UNINTERRUPTIBLE); + + return 0; +} + diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h new file mode 100644 index 00000000..2226136c --- /dev/null +++ b/fs/gfs2/recovery.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RECOVERY_DOT_H__ +#define __RECOVERY_DOT_H__ + +#include "incore.h" + +extern struct workqueue_struct *gfs_recovery_wq; + +static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +{ + if (++*blk == sdp->sd_jdesc->jd_blocks) + *blk = 0; +} + +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh); + +extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); + +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); + +#endif /* __RECOVERY_DOT_H__ */ + diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c new file mode 100644 index 00000000..9b780df3 --- /dev/null +++ b/fs/gfs2/rgrp.c @@ -0,0 +1,1885 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "log.h" +#include "inode.h" +#include "trace_gfs2.h" + +#define BFITNOENT ((u32)~0) +#define NO_BLOCK ((u64)~0) + +#if BITS_PER_LONG == 32 +#define LBITMASK (0x55555555UL) +#define LBITSKIP55 (0x55555555UL) +#define LBITSKIP00 (0x00000000UL) +#else +#define LBITMASK (0x5555555555555555UL) +#define LBITSKIP55 (0x5555555555555555UL) +#define LBITSKIP00 (0x0000000000000000UL) +#endif + +/* + * These routines are used by the resource group routines (rgrp.c) + * to keep track of block allocation. Each block is represented by two + * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks. + * + * 0 = Free + * 1 = Used (not metadata) + * 2 = Unlinked (still in use) inode + * 3 = Used (metadata) + */ + +static const char valid_change[16] = { + /* current */ + /* n */ 0, 1, 1, 1, + /* e */ 1, 0, 0, 0, + /* w */ 0, 0, 0, 1, + 1, 0, 0, 0 +}; + +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, + unsigned char old_state, unsigned char new_state, + unsigned int *n); + +/** + * gfs2_setbit - Set a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to set + * @new_state: the new state of the block + * + */ + +static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, + unsigned char *buf2, unsigned int offset, + struct gfs2_bitmap *bi, u32 block, + unsigned char new_state) +{ + unsigned char *byte1, *byte2, *end, cur_state; + unsigned int buflen = bi->bi_len; + const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + + byte1 = buf1 + offset + (block / GFS2_NBBY); + end = buf1 + offset + buflen; + + BUG_ON(byte1 >= end); + + cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; + + if (unlikely(!valid_change[new_state * 4 + cur_state])) { + printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, " + "new_state=%d\n", + (unsigned long long)block, cur_state, new_state); + printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n", + (unsigned long long)rgd->rd_addr, + (unsigned long)bi->bi_start); + printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n", + (unsigned long)bi->bi_offset, + (unsigned long)bi->bi_len); + dump_stack(); + gfs2_consist_rgrpd(rgd); + return; + } + *byte1 ^= (cur_state ^ new_state) << bit; + + if (buf2) { + byte2 = buf2 + offset + (block / GFS2_NBBY); + cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; + *byte2 ^= (cur_state ^ new_state) << bit; + } +} + +/** + * gfs2_testbit - test a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to read + * + */ + +static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, + const unsigned char *buffer, + unsigned int buflen, u32 block) +{ + const unsigned char *byte, *end; + unsigned char cur_state; + unsigned int bit; + + byte = buffer + (block / GFS2_NBBY); + bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + + gfs2_assert(rgd->rd_sbd, byte < end); + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + + return cur_state; +} + +/** + * gfs2_bit_search + * @ptr: Pointer to bitmap data + * @mask: Mask to use (normally 0x55555.... 
but adjusted for search start)
+ * @state: The state we are searching for
+ *
+ * We xor the bitmap data with a pattern which is the bitwise opposite
+ * of what we are looking for, this gives rise to a pattern of ones
+ * wherever there is a match. Since we have two bits per entry, we
+ * take this pattern, shift it down by one place and then and it with
+ * the original. All the even bit positions (0,2,4, etc) then represent
+ * successful matches, so we mask with 0x55555..... to remove the unwanted
+ * odd bit positions.
+ *
+ * This allows searching of a whole u64 at once (32 blocks) with a
+ * single test (on 64 bit arches).
+ */
+
+static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
+{
+	u64 tmp;
+	static const u64 search[] = {
+		[0] = 0xffffffffffffffffULL,
+		[1] = 0xaaaaaaaaaaaaaaaaULL,
+		[2] = 0x5555555555555555ULL,
+		[3] = 0x0000000000000000ULL,
+	};
+	tmp = le64_to_cpu(*ptr) ^ search[state];
+	tmp &= (tmp >> 1);
+	tmp &= mask;
+	return tmp;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ * a block in a given allocation state.
+ * @buffer: the buffer that holds the bitmaps
+ * @len: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @state: GFS2_BLKST_XXX the state of the block we're looking for.
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem.  @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures, but
+ * headers are always a multiple of 64 bits long so that the buffer is
+ * always aligned to a 64 bit boundary.
+ *
+ * The size of the buffer is in bytes, but it is assumed that it is
+ * always ok to read a complete multiple of 64 bits at the end
+ * of the block in case the end is not aligned to a natural boundary.
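+ *
+ * For example, with blocks packed LSB first, the byte 0xE4 (binary
+ * 11100100) holds blocks 0..3 in states 0, 1, 2 and 3.  A search for
+ * state 2 uses search[2] = 0x55...: 0xE4 ^ 0x55 = 0xB1, then
+ * 0xB1 & (0xB1 >> 1) & 0x55 = 0x10, and __ffs(0x10) / 2 = 2, so block 2
+ * is reported.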
+ * + * Return: the block number (bitmap buffer scope) that was found + */ + +static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, + u32 goal, u8 state) +{ + u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1); + const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); + const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64))); + u64 tmp; + u64 mask = 0x5555555555555555ULL; + u32 bit; + + BUG_ON(state > 3); + + /* Mask off bits we don't care about at the start of the search */ + mask <<= spoint; + tmp = gfs2_bit_search(ptr, mask, state); + ptr++; + while(tmp == 0 && ptr < end) { + tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state); + ptr++; + } + /* Mask off any bits which are more than len bytes from the start */ + if (ptr == end && (len & (sizeof(u64) - 1))) + tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1)))); + /* Didn't find anything, so return */ + if (tmp == 0) + return BFITNOENT; + ptr--; + bit = __ffs64(tmp); + bit /= 2; /* two bits per entry in the bitmap */ + return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; +} + +/** + * gfs2_bitcount - count the number of bits in a certain state + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @state: the state of the block we're looking for + * + * Returns: The number of bits + */ + +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, + unsigned int buflen, u8 state) +{ + const u8 *byte = buffer; + const u8 *end = buffer + buflen; + const u8 state1 = state << 2; + const u8 state2 = state << 4; + const u8 state3 = state << 6; + u32 count = 0; + + for (; byte < end; byte++) { + if (((*byte) & 0x03) == state) + count++; + if (((*byte) & 0x0C) == state1) + count++; + if (((*byte) & 0x30) == state2) + count++; + if (((*byte) & 0xC0) == state3) + count++; + } + + return count; +} + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @sdp: the filesystem + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi = NULL; + u32 length = rgd->rd_length; + u32 count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(u32)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + for (x = 0; x < 4; x++) + count[x] += gfs2_bitcount(rgd, + bi->bi_bh->b_data + + bi->bi_offset, + bi->bi_len, x); + } + + if (count[0] != rgd->rd_free) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_free); + return; + } + + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; + if (count[1] != tmp) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + return; + } + + if (count[2] + count[3] != rgd->rd_dinodes) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used metadata mismatch: %u != %u\n", + count[2] + count[3], rgd->rd_dinodes); + return; + } +} + +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @n: The data block number + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + + list_for_each_entry(rgd, 
&sdp->sd_rindex_mru_list, rd_list_mru) { + if (rgrp_contains_block(rgd, blk)) { + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; + } + } + + spin_unlock(&sdp->sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); + return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: A RG + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + return NULL; + return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); +} + +static void clear_rgrpdi(struct gfs2_sbd *sdp) +{ + struct list_head *head; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = NULL; + spin_unlock(&sdp->sd_rindex_spin); + + head = &sdp->sd_rindex_list; + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + gl = rgd->rd_gl; + + list_del(&rgd->rd_list); + list_del(&rgd->rd_list_mru); + + if (gl) { + gl->gl_object = NULL; + gfs2_glock_add_to_lru(gl); + gfs2_glock_put(gl); + } + + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_rindex_mutex); + clear_rgrpdi(sdp); + mutex_unlock(&sdp->sd_rindex_mutex); +} + +static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) +{ + printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); + printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); + printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); +} + +/** + * gfs2_compute_bitstructs - Compute the bitmap sizes + * @rgd: The resource group descriptor + * + * Calculates bitmap descriptors, one for each block that contains bitmap data + * + * Returns: errno + */ + +static int compute_bitstructs(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi; + u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */ + u32 bytes_left, bytes; + int x; + + if (!length) + return -EINVAL; + + rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS); + if (!rgd->rd_bits) + return -ENOMEM; + + bytes_left = rgd->rd_bitbytes; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + + bi->bi_flags = 0; + /* small rgrp; bitmap stored completely in header block */ + if (length == 1) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* header block */ + } else if (x == 0) { + bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* last block */ + } else if (x + 1 == length) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + bi->bi_len = bytes; + /* other blocks */ + } else { + bytes = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + 
bi->bi_len = bytes; + } + + bytes_left -= bytes; + } + + if (bytes_left) { + gfs2_consist_rgrpd(rgd); + return -EIO; + } + bi = rgd->rd_bits + (length - 1); + if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) { + if (gfs2_consist_rgrpd(rgd)) { + gfs2_rindex_print(rgd); + fs_err(sdp, "start=%u len=%u offset=%u\n", + bi->bi_start, bi->bi_len, bi->bi_offset); + } + return -EIO; + } + + return 0; +} + +/** + * gfs2_ri_total - Total up the file system space, according to the rindex. + * + */ +u64 gfs2_ri_total(struct gfs2_sbd *sdp) +{ + u64 total_data = 0; + struct inode *inode = sdp->sd_rindex; + struct gfs2_inode *ip = GFS2_I(inode); + char buf[sizeof(struct gfs2_rindex)]; + struct file_ra_state ra_state; + int error, rgrps; + + mutex_lock(&sdp->sd_rindex_mutex); + file_ra_state_init(&ra_state, inode->i_mapping); + for (rgrps = 0;; rgrps++) { + loff_t pos = rgrps * sizeof(struct gfs2_rindex); + + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) + break; + error = gfs2_internal_read(ip, &ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (error != sizeof(struct gfs2_rindex)) + break; + total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); + } + mutex_unlock(&sdp->sd_rindex_mutex); + return total_data; +} + +static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) +{ + const struct gfs2_rindex *str = buf; + + rgd->rd_addr = be64_to_cpu(str->ri_addr); + rgd->rd_length = be32_to_cpu(str->ri_length); + rgd->rd_data0 = be64_to_cpu(str->ri_data0); + rgd->rd_data = be32_to_cpu(str->ri_data); + rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); +} + +/** + * read_rindex_entry - Pull in a new resource index entry from the disk + * @gl: The glock covering the rindex inode + * + * Returns: 0 on success, error code otherwise + */ + +static int read_rindex_entry(struct gfs2_inode *ip, + struct file_ra_state *ra_state) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + char buf[sizeof(struct gfs2_rindex)]; + int error; + struct gfs2_rgrpd *rgd; + + error = gfs2_internal_read(ip, ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (!error) + return 0; + if (error != sizeof(struct gfs2_rindex)) { + if (error > 0) + error = -EIO; + return error; + } + + rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); + error = -ENOMEM; + if (!rgd) + return error; + + mutex_init(&rgd->rd_mutex); + lops_init_le(&rgd->rd_le, &gfs2_rg_lops); + rgd->rd_sbd = sdp; + + list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); + list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + + gfs2_rindex_in(rgd, buf); + error = compute_bitstructs(rgd); + if (error) + return error; + + error = gfs2_glock_get(sdp, rgd->rd_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + return error; + + rgd->rd_gl->gl_object = rgd; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + return error; +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @ip: pointer to the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ + +int gfs2_ri_update(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct file_ra_state ra_state; + u64 rgrp_count = i_size_read(inode); + struct gfs2_rgrpd *rgd; + unsigned int max_data = 0; + int error; + + do_div(rgrp_count, sizeof(struct gfs2_rindex)); + clear_rgrpdi(sdp); + + file_ra_state_init(&ra_state, inode->i_mapping); + for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { + 
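		/*
+		 * Each iteration reads one struct gfs2_rindex record from the
+		 * rindex file and builds the in-core rgrpd for it: its bitmap
+		 * layout (compute_bitstructs) and its resource group glock.
+		 */
+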
error = read_rindex_entry(ip, &ra_state); + if (error) { + clear_rgrpdi(sdp); + return error; + } + } + + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) + if (rgd->rd_data > max_data) + max_data = rgd->rd_data; + sdp->sd_max_rg_data = max_data; + sdp->sd_rindex_uptodate = 1; + return 0; +} + +/** + * gfs2_rindex_hold - Grab a lock on the rindex + * @sdp: The GFS2 superblock + * @ri_gh: the glock holder + * + * We grab a lock on the rindex inode to make sure that it doesn't + * change whilst we are performing an operation. We keep this lock + * for quite long periods of time compared to other locks. This + * doesn't matter, since it is shared and it is very, very rarely + * accessed in the exclusive mode (i.e. only when expanding the filesystem). + * + * This makes sure that we're using the latest copy of the resource index + * special file, which might have been updated if someone expanded the + * filesystem (via gfs2_grow utility), which adds new resource groups. + * + * Returns: 0 on success, error code otherwise + */ + +int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + struct gfs2_glock *gl = ip->i_gl; + int error; + + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); + if (error) + return error; + + /* Read new copy from disk if we don't have the latest */ + if (!sdp->sd_rindex_uptodate) { + mutex_lock(&sdp->sd_rindex_mutex); + if (!sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + gfs2_glock_dq_uninit(ri_gh); + } + mutex_unlock(&sdp->sd_rindex_mutex); + } + + return error; +} + +static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + u32 rg_flags; + + rg_flags = be32_to_cpu(str->rg_flags); + rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; + rgd->rd_free = be32_to_cpu(str->rg_free); + rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); + rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); +} + +static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) +{ + struct gfs2_rgrp *str = buf; + + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); + str->rg_free = cpu_to_be32(rgd->rd_free); + str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); + str->__pad = cpu_to_be32(0); + str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); +} + +/** + * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + * Read in all of a Resource Group's header and bitmap blocks. + * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * + * Returns: errno + */ + +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl = rgd->rd_gl; + unsigned int length = rgd->rd_length; + struct gfs2_bitmap *bi; + unsigned int x, y; + int error; + + mutex_lock(&rgd->rd_mutex); + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_bh_count) { + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + mutex_unlock(&rgd->rd_mutex); + return 0; + } + spin_unlock(&sdp->sd_rindex_spin); + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + if (error) + goto fail; + } + + for (y = length; y--;) { + bi = rgd->rd_bits + y; + error = gfs2_meta_wait(sdp, bi->bi_bh); + if (error) + goto fail; + if (gfs2_metatype_check(sdp, bi->bi_bh, y ? 
GFS2_METATYPE_RB : + GFS2_METATYPE_RG)) { + error = -EIO; + goto fail; + } + } + + if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); + gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + + mutex_unlock(&rgd->rd_mutex); + + return 0; + +fail: + while (x--) { + bi = rgd->rd_bits + x; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + gfs2_assert_warn(sdp, !bi->bi_clone); + } + mutex_unlock(&rgd->rd_mutex); + + return error; +} + +void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + */ + +void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int x, length = rgd->rd_length; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + if (--rgd->rd_bh_count) { + spin_unlock(&sdp->sd_rindex_spin); + return; + } + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } + + spin_unlock(&sdp->sd_rindex_spin); +} + +static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + const struct gfs2_bitmap *bi) +{ + struct super_block *sb = sdp->sd_vfs; + struct block_device *bdev = sb->s_bdev; + const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / + bdev_logical_block_size(sb->s_bdev); + u64 blk; + sector_t start = 0; + sector_t nr_sects = 0; + int rv; + unsigned int x; + + for (x = 0; x < bi->bi_len; x++) { + const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *clone = bi->bi_clone + bi->bi_offset + x; + u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + diff &= 0x55; + if (diff == 0) + continue; + blk = offset + ((bi->bi_start + x) * GFS2_NBBY); + blk *= sects_per_blk; /* convert to sectors */ + while(diff) { + if (diff & 1) { + if (nr_sects == 0) + goto start_new_extent; + if ((start + nr_sects) != blk) { + rv = blkdev_issue_discard(bdev, start, + nr_sects, GFP_NOFS, + 0); + if (rv) + goto fail; + nr_sects = 0; +start_new_extent: + start = blk; + } + nr_sects += sects_per_blk; + } + diff >>= 2; + blk += sects_per_blk; + } + } + if (nr_sects) { + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); + if (rv) + goto fail; + } + return; +fail: + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); + sdp->sd_args.ar_discard = 0; +} + +void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int length = rgd->rd_length; + unsigned int x; + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + if (!bi->bi_clone) + continue; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); + clear_bit(GBF_FULL, &bi->bi_flags); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_free; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_alloc_get - get the 
struct gfs2_alloc structure for an inode + * @ip: the incore GFS2 inode structure + * + * Returns: the struct gfs2_alloc + */ + +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_alloc != NULL); + ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); + return ip->i_alloc; +} + +/** + * try_rgrp_fit - See if a given reservation will fit in a given RG + * @rgd: the RG data + * @al: the struct gfs2_alloc structure describing the reservation + * + * If there's room for the requested blocks to be allocated from the RG: + * Sets the $al_rgd field in @al. + * + * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) + */ + +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int ret = 0; + + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + return 0; + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_free_clone >= al->al_requested) { + al->al_rgd = rgd; + ret = 1; + } + spin_unlock(&sdp->sd_rindex_spin); + + return ret; +} + +/** + * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes + * @rgd: The rgrp + * + * Returns: 0 if no error + * The inode, if one has been found, in inode. + */ + +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) +{ + u32 goal = 0, block; + u64 no_addr; + struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int n; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; + + while (goal < rgd->rd_data) { + down_write(&sdp->sd_log_flush_lock); + n = 1; + block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, + GFS2_BLKST_UNLINKED, &n); + up_write(&sdp->sd_log_flush_lock); + if (block == BFITNOENT) + break; + /* rgblk_search can return a block < goal, so we need to + keep it marching forward. */ + no_addr = block + rgd->rd_data0; + goal = max(block + 1, goal + 1); + if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) + continue; + if (no_addr == skip) + continue; + *last_unlinked = no_addr; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. 
+ */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > NR_CPUS) + return; + } + + rgd->rd_flags &= ~GFS2_RDF_CHECK; + return; +} + +/** + * recent_rgrp_next - get next RG from "recent" list + * @cur_rgd: current rgrp + * + * Returns: The next rgrp in the recent list + */ + +static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) +{ + struct gfs2_sbd *sdp = cur_rgd->rd_sbd; + struct list_head *head; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + head = &sdp->sd_rindex_mru_list; + if (unlikely(cur_rgd->rd_list_mru.next == head)) { + spin_unlock(&sdp->sd_rindex_spin); + return NULL; + } + rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +/** + * forward_rgrp_get - get an rgrp to try next from full list + * @sdp: The GFS2 superblock + * + * Returns: The rgrp to try next + */ + +static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd; + unsigned int journals = gfs2_jindex_size(sdp); + unsigned int rg = 0, x; + + spin_lock(&sdp->sd_rindex_spin); + + rgd = sdp->sd_rindex_forward; + if (!rgd) { + if (sdp->sd_rgrps >= journals) + rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; + + for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; + x++, rgd = gfs2_rgrpd_get_next(rgd)) + /* Do Nothing */; + + sdp->sd_rindex_forward = rgd; + } + + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; +} + +/** + * forward_rgrp_set - set the forward rgrp pointer + * @sdp: the filesystem + * @rgd: The new forward rgrp + * + */ + +static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) +{ + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = rgd; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * get_local_rgrp - Choose and lock a rgrp for allocation + * @ip: the inode to reserve space for + * @rgp: the chosen and locked rgrp + * + * Try to acquire rgrp in way which avoids contending with others. 
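As a side note on the allocation policy above: forward_rgrp_get() spreads the starting point of the full-list scan across cluster nodes by journal id, so different nodes tend to begin allocating in different resource groups. Below is a minimal userspace sketch of that starting-point calculation only; the function and variable names are illustrative, not kernel symbols, and made-up counts stand in for the real superblock fields.

#include <stdio.h>

/* Illustration only: pick a starting resource group for a node so that
 * nodes with different journal ids begin their scans in different parts
 * of the filesystem, mirroring the rg calculation in forward_rgrp_get(). */
static unsigned int first_rgrp_for_node(unsigned int total_rgrps,
                                        unsigned int journal_id,
                                        unsigned int journals)
{
        if (journals == 0 || total_rgrps < journals)
                return 0;               /* fall back to the first rgrp */
        return total_rgrps * journal_id / journals;
}

int main(void)
{
        unsigned int rgrps = 100, journals = 4;

        for (unsigned int jid = 0; jid < journals; jid++)
                printf("journal %u starts scanning at rgrp %u\n",
                       jid, first_rgrp_for_node(rgrps, jid, journals));
        return 0;       /* prints starting rgrps 0, 25, 50, 75 */
}

Starting each node a fixed fraction of the way through the rgrp list keeps them from all contending for the same resource group glocks when the filesystem fills up.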
+ * + * Returns: errno + */ + +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd, *begin = NULL; + struct gfs2_alloc *al = ip->i_alloc; + int flags = LM_FLAG_TRY; + int skipped = 0; + int loops = 0; + int error, rg_locked; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); + + while (rgd) { + rg_locked = 0; + + if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + rg_locked = 1; + error = 0; + } else { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_TRY, &al->al_rgd_gh); + } + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked) + gfs2_glock_dq_uninit(&al->al_rgd_gh); + /* fall through */ + case GLR_TRYFAILED: + rgd = recent_rgrp_next(rgd); + break; + + default: + return error; + } + } + + /* Go through full list of rgrps */ + + begin = rgd = forward_rgrp_get(sdp); + + for (;;) { + rg_locked = 0; + + if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + rg_locked = 1; + error = 0; + } else { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, + &al->al_rgd_gh); + } + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked) + gfs2_glock_dq_uninit(&al->al_rgd_gh); + break; + + case GLR_TRYFAILED: + skipped++; + break; + + default: + return error; + } + + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + + if (rgd == begin) { + if (++loops >= 3) + return -ENOSPC; + if (!skipped) + loops++; + flags = 0; + if (loops == 2) + gfs2_log_flush(sdp, NULL); + } + } + +out: + if (begin) { + spin_lock(&sdp->sd_rindex_spin); + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + forward_rgrp_set(sdp, rgd); + } + + return 0; +} + +/** + * gfs2_inplace_reserve_i - Reserve space in the filesystem + * @ip: the inode to reserve space for + * + * Returns: errno + */ + +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + int error = 0; + u64 last_unlinked = NO_BLOCK; + int tries = 0; + + if (gfs2_assert_warn(sdp, al->al_requested)) + return -EINVAL; + + if (hold_rindex) { + /* We need to hold the rindex unless the inode we're using is + the rindex itself, in which case it's already held. */ + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + else if (!sdp->sd_rgrps) /* We may not have the rindex read + in, so: */ + error = gfs2_ri_update(ip); + if (error) + return error; + } + +try_again: + do { + error = get_local_rgrp(ip, &last_unlinked); + /* If there is no space, flushing the log may release some */ + if (error) { + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + goto try_again; + } + gfs2_log_flush(sdp, NULL); + } + } while (error && tries++ < 3); + + if (error) { + if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&al->al_ri_gh); + return error; + } + + /* no error, so we have the rgrp set in the inode's allocation. 
*/ + al->al_file = file; + al->al_line = line; + + return 0; +} + +/** + * gfs2_inplace_release - release an inplace reservation + * @ip: the inode the reservation was taken out on + * + * Release a reservation made by gfs2_inplace_reserve(). + */ + +void gfs2_inplace_release(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + + if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) + fs_warn(sdp, "al_alloced = %u, al_requested = %u " + "al_file = %s, al_line = %u\n", + al->al_alloced, al->al_requested, al->al_file, + al->al_line); + + al->al_rgd = NULL; + if (al->al_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) + gfs2_glock_dq_uninit(&al->al_ri_gh); +} + +/** + * gfs2_get_block_type - Check a block in a RG is of given type + * @rgd: the resource group holding the block + * @block: the block number + * + * Returns: The block type (GFS2_BLKST_*) + */ + +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +{ + struct gfs2_bitmap *bi = NULL; + u32 length, rgrp_block, buf_block; + unsigned int buf; + unsigned char type; + + length = rgd->rd_length; + rgrp_block = block - rgd->rd_data0; + + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; + + type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, buf_block); + + return type; +} + +/** + * rgblk_search - find a block in @old_state, change allocation + * state to @new_state + * @rgd: the resource group descriptor + * @goal: the goal block within the RG (start here to search for avail block) + * @old_state: GFS2_BLKST_XXX the before-allocation state to find + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * @n: The extent length + * + * Walk rgrp's bitmap to find bits that represent a block in @old_state. + * Add the found bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + * + * This function never fails, because we wouldn't call it unless we + * know (from reservation results, etc.) that a block is available. + * + * Scope of @goal and returned block is just within rgrp, not the whole + * filesystem. + * + * Returns: the block number allocated + */ + +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, + unsigned char old_state, unsigned char new_state, + unsigned int *n) +{ + struct gfs2_bitmap *bi = NULL; + const u32 length = rgd->rd_length; + u32 blk = BFITNOENT; + unsigned int buf, x; + const unsigned int elen = *n; + const u8 *buffer = NULL; + + *n = 0; + /* Find bitmap block that contains bits for goal block */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { + goal -= bi->bi_start * GFS2_NBBY; + goto do_search; + } + } + buf = 0; + goal = 0; + +do_search: + /* Search (up to entire) bitmap in this rgrp for allocatable block. + "x <= length", instead of "x < length", because we typically start + the search in the middle of a bit block, but if we can't find an + allocatable block anywhere else, we want to be able wrap around and + search in the first part of our first-searched bit block. 
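For reference while reading the bitmap search below: GFS2 packs four block states into each bitmap byte (GFS2_NBBY), two bits per block, with the states numbered free = 0, used = 1, unlinked = 2 and dinode = 3. The following is a standalone userspace sketch of reading one block's state out of such a buffer, in the spirit of gfs2_testbit(); the constants and names are repeated locally for illustration rather than taken from the on-disk format headers.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_NBBY     4       /* blocks represented per bitmap byte */
#define SKETCH_BIT_SIZE 2       /* bits per block state */

/* Return the two-bit state of block "block" within "buf". Block 0 lives
 * in the least significant bits of the first byte, block 3 in the most
 * significant bits, and so on through the buffer. */
static unsigned int sketch_testbit(const uint8_t *buf, uint32_t block)
{
        const uint8_t byte = buf[block / SKETCH_NBBY];
        const unsigned int shift = (block % SKETCH_NBBY) * SKETCH_BIT_SIZE;

        return (byte >> shift) & 0x3;
}

int main(void)
{
        /* 0xE4 = 0b11100100: blocks 0..3 carry states 0, 1, 2, 3 */
        uint8_t bitmap[1] = { 0xE4 };
        static const char *names[] = { "free", "used", "unlinked", "dinode" };

        for (uint32_t b = 0; b < 4; b++)
                printf("block %u is %s\n", b, names[sketch_testbit(bitmap, b)]);
        return 0;
}

This two-bit encoding is also what the discard code above relies on when it masks the bitmap difference with 0x55 to pick out blocks that became free.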
*/ + for (x = 0; x <= length; x++) { + bi = rgd->rd_bits + buf; + + if (test_bit(GBF_FULL, &bi->bi_flags) && + (old_state == GFS2_BLKST_FREE)) + goto skip; + + /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone + bitmaps, so we must search the originals for that. */ + buffer = bi->bi_bh->b_data + bi->bi_offset; + if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; + + blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state); + if (blk != BFITNOENT) + break; + + if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + set_bit(GBF_FULL, &bi->bi_flags); + + /* Try next bitmap block (wrap back to rgrp header if at end) */ +skip: + buf++; + buf %= length; + goal = 0; + } + + if (blk == BFITNOENT) + return blk; + *n = 1; + if (old_state == new_state) + goto out; + + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi, blk, new_state); + goal = blk; + while (*n < elen) { + goal++; + if (goal >= (bi->bi_len * GFS2_NBBY)) + break; + if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != + GFS2_BLKST_FREE) + break; + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi, goal, new_state); + (*n)++; + } +out: + return (bi->bi_start * GFS2_NBBY) + blk; +} + +/** + * rgblk_free - Change alloc state of given block(s) + * @sdp: the filesystem + * @bstart: the start of a run of blocks to free + * @blen: the length of the block run (all must lie within ONE RG!) + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Returns: Resource group containing the block(s) + */ + +static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, + u32 blen, unsigned char new_state) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_bitmap *bi = NULL; + u32 length, rgrp_blk, buf_blk; + unsigned int buf; + + rgd = gfs2_blk2rgrpd(sdp, bstart); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); + return NULL; + } + + length = rgd->rd_length; + + rgrp_blk = bstart - rgd->rd_data0; + + while (blen--) { + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + + buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY; + rgrp_blk++; + + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, + bi, buf_blk, new_state); + } + + return rgd; +} + +/** + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); + return 0; +} + +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + +/** + * gfs2_alloc_block - Allocate one or more blocks + * @ip: the 
inode to allocate the block for + * @bn: Used to return the starting block number + * @n: requested number of blocks/extent length (value/result) + * + * Returns: 0 or error + */ + +int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_rgrpd *rgd; + u32 goal, blk; + u64 block; + int error; + + /* Only happens if there is a bug in gfs2, return something distinctive + * to ensure that it is noticed. + */ + if (al == NULL) + return -ECANCELED; + + rgd = al->al_rgd; + + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal - rgd->rd_data0; + else + goal = rgd->rd_last_alloc; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + + rgd->rd_last_alloc = blk; + block = rgd->rd_data0 + blk; + ip->i_goal = block; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); + brelse(dibh); + } + if (rgd->rd_free < *n) + goto rgrp_error; + + rgd->rd_free -= *n; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced += *n; + + gfs2_statfs_change(sdp, 0, -(s64)*n, 0); + gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone -= *n; + spin_unlock(&sdp->sd_rindex_spin); + trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; +} + +/** + * gfs2_alloc_di - Allocate a dinode + * @dip: the directory that the inode is going in + * @bn: the block number which is allocated + * @generation: the generation number of the inode + * + * Returns: 0 on success or error + */ + +int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al = dip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + u32 blk; + u64 block; + unsigned int n = 1; + + blk = rgblk_search(rgd, rgd->rd_last_alloc, + GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + + rgd->rd_last_alloc = blk; + block = rgd->rd_data0 + blk; + if (rgd->rd_free == 0) + goto rgrp_error; + + rgd->rd_free--; + rgd->rd_dinodes++; + *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, +1); + gfs2_trans_add_unrevoke(sdp, block, 1); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; +} + +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = 
GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); + rgd->rd_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + + /* Directories keep their data in the metadata address space */ + if (ip->i_depth) + gfs2_meta_wipe(ip, bstart, blen); +} + +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_data(ip, bstart, blen); + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); + rgd->rd_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + gfs2_meta_wipe(ip, bstart, blen); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_meta(ip, bstart, blen); + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); +} + +void gfs2_unlink_di(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_rgrpd *rgd; + u64 blkno = ip->i_no_addr; + + rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_trans_add_rg(rgd); +} + +static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *tmp_rgd; + + tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE); + if (!tmp_rgd) + return; + gfs2_assert_withdraw(sdp, rgd == tmp_rgd); + + if (!rgd->rd_dinodes) + gfs2_consist_rgrpd(rgd); + rgd->rd_dinodes--; + rgd->rd_free++; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_statfs_change(sdp, 0, +1, -1); + gfs2_trans_add_rg(rgd); +} + + +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +{ + gfs2_free_uninit_di(rgd, ip->i_no_addr); + trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE); + gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); + gfs2_meta_wipe(ip, ip->i_no_addr, 1); +} + +/** + * gfs2_check_blk_type - Check the type of a 
block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rgd_gh; + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + int ri_locked = 0; + int error; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + ri_locked = 1; + } + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail_rindex: + if (ri_locked) + gfs2_glock_dq_uninit(&ri_gh); +fail: + return error; +} + +/** + * gfs2_rlist_add - add a RG to a list of RGs + * @sdp: the filesystem + * @rlist: the list of resource groups + * @block: the block + * + * Figure out what RG a block belongs to and add that RG to the list + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd **tmp; + unsigned int new_space; + unsigned int x; + + if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) + return; + + rgd = gfs2_blk2rgrpd(sdp, block); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)block); + return; + } + + for (x = 0; x < rlist->rl_rgrps; x++) + if (rlist->rl_rgd[x] == rgd) + return; + + if (rlist->rl_rgrps == rlist->rl_space) { + new_space = rlist->rl_space + 10; + + tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS | __GFP_NOFAIL); + + if (rlist->rl_rgd) { + memcpy(tmp, rlist->rl_rgd, + rlist->rl_space * sizeof(struct gfs2_rgrpd *)); + kfree(rlist->rl_rgd); + } + + rlist->rl_space = new_space; + rlist->rl_rgd = tmp; + } + + rlist->rl_rgd[rlist->rl_rgrps++] = rgd; +} + +/** + * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate + * and initialize an array of glock holders for them + * @rlist: the list of resource groups + * @state: the lock state to acquire the RG lock in + * @flags: the modifier flags for the holder structures + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) +{ + unsigned int x; + + rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, + state, 0, + &rlist->rl_ghs[x]); +} + +/** + * gfs2_rlist_free - free a resource group list + * @list: the list of resource groups + * + */ + +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) +{ + unsigned int x; + + kfree(rlist->rl_rgd); + + if (rlist->rl_ghs) { + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_uninit(&rlist->rl_ghs[x]); + kfree(rlist->rl_ghs); + } +} + diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h new file mode 100644 index 00000000..a80e3034 --- /dev/null +++ b/fs/gfs2/rgrp.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
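One small pattern worth calling out from gfs2_rlist_add() above: the rgrp pointer array is grown in steps of ten entries rather than one at a time, copying the old contents into the larger allocation. Here is a self-contained userspace sketch of the same grow-in-chunks pattern, using plain calloc/free and an int payload instead of kcalloc and struct gfs2_rgrpd pointers; struct int_list and its helpers are illustrative names only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct int_list {
        unsigned int count;     /* entries in use              */
        unsigned int space;     /* entries currently allocated */
        int *items;             /* grows in fixed-size chunks  */
};

/* Append "value" unless it is already present, growing the backing array
 * by 10 slots at a time, as gfs2_rlist_add() does for resource groups. */
static int int_list_add(struct int_list *list, int value)
{
        for (unsigned int x = 0; x < list->count; x++)
                if (list->items[x] == value)
                        return 0;               /* already listed */

        if (list->count == list->space) {
                unsigned int new_space = list->space + 10;
                int *tmp = calloc(new_space, sizeof(*tmp));

                if (!tmp)
                        return -1;      /* the kernel code avoids this with __GFP_NOFAIL */
                if (list->items) {
                        memcpy(tmp, list->items, list->space * sizeof(*tmp));
                        free(list->items);
                }
                list->space = new_space;
                list->items = tmp;
        }

        list->items[list->count++] = value;
        return 0;
}

int main(void)
{
        struct int_list list = { 0, 0, NULL };

        for (int i = 0; i < 25; i++)
                int_list_add(&list, i % 7);     /* duplicates are dropped */
        printf("%u unique entries, %u slots allocated\n", list.count, list.space);
        free(list.items);
        return 0;       /* prints: 7 unique entries, 10 slots allocated */
}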
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RGRP_DOT_H__ +#define __RGRP_DOT_H__ + +#include + +struct gfs2_rgrpd; +struct gfs2_sbd; +struct gfs2_holder; + +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); + +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); + +extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); + +extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); + +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +static inline void gfs2_alloc_put(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_alloc == NULL); + kfree(ip->i_alloc); + ip->i_alloc = NULL; +} + +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line); +#define gfs2_inplace_reserve(ip) \ + gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) +#define gfs2_inplace_reserve_ri(ip) \ + gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) + +extern void gfs2_inplace_release(struct gfs2_inode *ip); + +extern int gfs2_ri_update(struct gfs2_inode *ip); +extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); +extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); + +extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); + +struct gfs2_rgrp_list { + unsigned int rl_rgrps; + unsigned int rl_space; + struct gfs2_rgrpd **rl_rgd; + struct gfs2_holder *rl_ghs; +}; + +extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); + +#endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c new file mode 100644 index 00000000..fb0edf73 --- /dev/null +++ b/fs/gfs2/super.c @@ -0,0 +1,1585 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
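The gfs2_inplace_reserve() macros declared in rgrp.h above record the caller's file and line so that gfs2_inplace_release() can report where an over-allocation originated. A tiny standalone sketch of that wrapper-macro pattern follows; reserve_blocks() and reserve_blocks_i() are hypothetical names used only to show the shape of the technique.

#include <stdio.h>

/* The _i variant takes the call site explicitly; the macro fills it in,
 * the same way gfs2_inplace_reserve() forwards __FILE__ and __LINE__ to
 * gfs2_inplace_reserve_i(). */
static int reserve_blocks_i(unsigned int blocks, const char *file,
                            unsigned int line)
{
        printf("reserving %u blocks for %s:%u\n", blocks, file, line);
        return 0;
}

#define reserve_blocks(blocks) \
        reserve_blocks_i((blocks), __FILE__, __LINE__)

int main(void)
{
        return reserve_blocks(16);      /* prints this file name and line number */
}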
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "sys.h" +#include "xattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_err_withdraw, + Opt_err_panic, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_nobarrier, + Opt_error, +}; + +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_spectator, "norecovery"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, + {Opt_statfs_quantum, "statfs_quantum=%d"}, + {Opt_statfs_percent, "statfs_percent=%d"}, + {Opt_quota_quantum, "quota_quantum=%d"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_error, NULL} +}; + +/** + * gfs2_mount_args - Parse mount options + * @args: The structure into which the parsed options will be written + * @options: The options to parse + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_args *args, char *options) +{ + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; + + /* Split the options into tokens with the "," character and + process them */ + + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " + "are 
mutually exclusive.\n"); + return -EINVAL; + } + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_quantum: + rv = match_int(&tmp[0], &args->ar_statfs_quantum); + if (rv || args->ar_statfs_quantum < 0) { + printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_quota_quantum: + rv = match_int(&tmp[0], &args->ar_quota_quantum); + if (rv || args->ar_quota_quantum <= 0) { + printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_percent: + rv = match_int(&tmp[0], &args->ar_statfs_percent); + if (rv || args->ar_statfs_percent < 0 || + args->ar_statfs_percent > 100) { + printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); + return rv ? 
rv : -EINVAL; + } + break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; + case Opt_barrier: + args->ar_nobarrier = 0; + break; + case Opt_nobarrier: + args->ar_nobarrier = 1; + break; + case Opt_error: + default: + printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); + return -EINVAL; + } + } + + return 0; +} + +/** + * gfs2_jindex_free - Clear all the journal index information + * @sdp: The GFS2 superblock + * + */ + +void gfs2_jindex_free(struct gfs2_sbd *sdp) +{ + struct list_head list, *head; + struct gfs2_jdesc *jd; + struct gfs2_journal_extent *jext; + + spin_lock(&sdp->sd_jindex_spin); + list_add(&list, &sdp->sd_jindex_list); + list_del_init(&sdp->sd_jindex_list); + sdp->sd_journals = 0; + spin_unlock(&sdp->sd_jindex_spin); + + while (!list_empty(&list)) { + jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + head = &jd->extent_list; + while (!list_empty(head)) { + jext = list_entry(head->next, + struct gfs2_journal_extent, + extent_list); + list_del(&jext->extent_list); + kfree(jext); + } + list_del(&jd->jd_list); + iput(jd->jd_inode); + kfree(jd); + } +} + +static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) +{ + struct gfs2_jdesc *jd; + int found = 0; + + list_for_each_entry(jd, head, jd_list) { + if (jd->jd_jid == jid) { + found = 1; + break; + } + } + + if (!found) + jd = NULL; + + return jd; +} + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + spin_unlock(&sdp->sd_jindex_spin); + + return jd; +} + +int gfs2_jdesc_check(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); + + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + return -EIO; + + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; + + if (gfs2_write_alloc_required(ip, 0, size)) { + gfs2_consist_inode(ip); + return -EIO; + } + + return 0; +} + +/** + * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_holder t_gh; + struct gfs2_log_header_host head; + int error; + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + if (error) + return error; + + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + goto fail; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + gfs2_consist(sdp); + error = -EIO; + goto fail; + } + + /* Initialize some head of the log stuff */ + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + + error = gfs2_quota_init(sdp); + if (error) + goto fail; + + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + gfs2_glock_dq_uninit(&t_gh); + + return 0; + +fail: + t_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&t_gh); + + return error; +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +{ + const struct gfs2_statfs_change *str = buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free 
= be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +{ + struct gfs2_statfs_change *str = buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + +int gfs2_statfs_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + if (sdp->sd_args.ar_spectator) { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + } else { + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_m_bh; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_in(l_sc, l_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + } + +out_m_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return 0; +} + +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes) +{ + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct buffer_head *l_bh; + s64 x, y; + int need_sync = 0; + int error; + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + return; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + l_sc->sc_total += total; + l_sc->sc_free += free; + l_sc->sc_dinodes += dinodes; + gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + if (sdp->sd_args.ar_statfs_percent) { + x = 100 * l_sc->sc_free; + y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; + if (x >= y || x <= -y) + need_sync = 1; + } + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + if (need_sync) + gfs2_wake_up_statfs(sdp); +} + +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); +} + +int gfs2_statfs_sync(struct super_block *sb, int type) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct 
gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder gh; + struct buffer_head *m_bh, *l_bh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { + spin_unlock(&sdp->sd_statfs_spin); + goto out_bh; + } + spin_unlock(&sdp->sd_statfs_spin); + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_bh; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out_bh2; + + update_statfs(sdp, m_bh, l_bh); + sdp->sd_statfs_force_sync = 0; + + gfs2_trans_end(sdp); + +out_bh2: + brelse(l_bh); +out_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + +/** + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * @state: the state to put the transaction lock into + * @t_gh: the hold on the transaction lock + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, + struct gfs2_holder *t_gh) +{ + struct gfs2_inode *ip; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header_host lh; + int error; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, + GL_NOCACHE, t_gh); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (error) + gfs2_glock_dq_uninit(t_gh); + +out: + while (!list_empty(&list)) { + lfcc = list_entry(list.next, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + return error; +} + +/** + * gfs2_freeze_fs - freezes the file system + * @sdp: the file system + * + * This function flushes data and meta data for all machines by + * acquiring the transaction log exclusively. All journals are + * ensured to be in a clean state as well. + * + * Returns: errno + */ + +int gfs2_freeze_fs(struct gfs2_sbd *sdp) +{ + int error = 0; + + mutex_lock(&sdp->sd_freeze_lock); + + if (!sdp->sd_freeze_count++) { + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (error) + sdp->sd_freeze_count--; + } + + mutex_unlock(&sdp->sd_freeze_lock); + + return error; +} + +/** + * gfs2_unfreeze_fs - unfreezes the file system + * @sdp: the file system + * + * This function allows the file system to proceed by unlocking + * the exclusively held transaction lock. Other GFS2 nodes are + * now free to acquire the lock shared and go on with their lives. 
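gfs2_freeze_fs() and gfs2_unfreeze_fs() above use a mutex-protected counter so that only the first freeze actually takes the transaction lock and only the last unfreeze drops it; nested freezes just bump the count. The following is a self-contained userspace sketch of that counting pattern using POSIX threads primitives, with the real lock acquisition replaced by stub functions.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int freeze_count;

static int take_exclusive_lock(void)  { printf("filesystem frozen\n"); return 0; }
static void drop_exclusive_lock(void) { printf("filesystem thawed\n"); }

/* Only the first caller actually freezes; later callers just nest. */
static int freeze(void)
{
        int error = 0;

        pthread_mutex_lock(&freeze_lock);
        if (!freeze_count++) {
                error = take_exclusive_lock();
                if (error)
                        freeze_count--;         /* undo the count on failure */
        }
        pthread_mutex_unlock(&freeze_lock);
        return error;
}

/* Only the last caller actually thaws. */
static void unfreeze(void)
{
        pthread_mutex_lock(&freeze_lock);
        if (freeze_count && !--freeze_count)
                drop_exclusive_lock();
        pthread_mutex_unlock(&freeze_lock);
}

int main(void)
{
        freeze();       /* prints "filesystem frozen" */
        freeze();       /* nested: no output          */
        unfreeze();     /* still frozen: no output    */
        unfreeze();     /* prints "filesystem thawed" */
        return 0;
}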
+ * + */ + +void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_freeze_lock); + + if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + + mutex_unlock(&sdp->sd_freeze_lock); +} + +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(ip->i_inode.i_mode); + str->di_uid = cpu_to_be32(ip->i_inode.i_uid); + str->di_gid = cpu_to_be32(ip->i_inode.i_gid); + str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); + str->di_generation = cpu_to_be64(ip->i_generation); + + str->di_flags = cpu_to_be32(ip->i_diskflags); + str->di_height = cpu_to_be16(ip->i_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(ip->i_depth); + str->di_entries = cpu_to_be32(ip->i_entries); + + str->di_eattr = cpu_to_be64(ip->i_eattr); + str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); +} + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @wbc: The writeback control structure + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = -EAGAIN; + int unlock_required = 0; + + /* Skip timestamp update, if this is from a memalloc */ + if (current->flags & PF_MEMALLOC) + goto do_flush; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + unlock_required = 1; + } + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); + } + gfs2_trans_end(sdp); +do_unlock: + if (unlock_required) + gfs2_glock_dq_uninit(&gh); +do_flush: + if (wbc->sync_mode == WB_SYNC_ALL) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + filemap_fdatawrite(metamapping); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + 
return ret; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + flush_workqueue(gfs2_delete_workqueue); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); + gfs2_statfs_sync(sdp->sd_vfs, 0); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. 
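gfs2_dinode_out() earlier in super.c, like the *_in/*_out helpers elsewhere in this patch, translates between in-core host-endian structures and the big-endian on-disk format. Below is a portable userspace sketch of one such field conversion, written without the kernel's cpu_to_be64()/be64_to_cpu() helpers; put_be64() and get_be64() are illustrative names, not kernel functions.

#include <stdio.h>
#include <stdint.h>

/* Store a host-endian 64-bit value in big-endian (on-disk) byte order. */
static void put_be64(uint8_t *dst, uint64_t v)
{
        for (int i = 7; i >= 0; i--) {
                dst[i] = (uint8_t)(v & 0xff);
                v >>= 8;
        }
}

/* Read a big-endian 64-bit on-disk value back into host byte order. */
static uint64_t get_be64(const uint8_t *src)
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | src[i];
        return v;
}

int main(void)
{
        uint8_t disk[8];
        uint64_t inode_size = 0x123456789abcdef0ULL;

        put_be64(disk, inode_size);             /* like cpu_to_be64() on write-out */
        printf("round trip ok: %d\n", get_be64(disk) == inode_size);
        printf("first on-disk byte: 0x%02x\n", disk[0]);  /* 0x12: most significant first */
        return 0;
}

Writing the most significant byte first keeps the on-disk layout identical regardless of the endianness of the host that mounted the filesystem.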
+ */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + if (wait && sdp) + gfs2_log_flush(sdp, NULL); + return 0; +} + +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return -EINVAL; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + return 0; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + gfs2_unfreeze_fs(sb->s_fs_info); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if 
(sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; + int error; + + spin_lock(&gt->gt_spin); + args.ar_commit = gt->gt_logd_secs; + args.ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + args.ar_statfs_quantum = 0; + else + args.ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(&gt->gt_spin); + error = gfs2_mount_args(&args, data); + if (error) + return error; + + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); + if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(&gt->gt_spin); + gt->gt_logd_secs = args.ar_commit; + gt->gt_quota_quantum = args.ar_quota_quantum; + if (args.ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = args.ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(&gt->gt_spin); + + gfs2_online_uevent(sdp); + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then it's because a + remote node tried to deallocate the inode but failed due to this node + still having the inode open. Here we mark the link count zero + since we know that it must have reached zero if the GLF_DEMOTE flag + is set on the iopen glock. If we didn't do a disk read since the + remote node removed the final link then we might otherwise miss + this event.
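+ * (A demote request on a held iopen glock is the only hint this node gets that the last link was removed remotely.)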
This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static int gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + return generic_drop_inode(inode); +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int val; + + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_printf(s, ",discard"); + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) + seq_printf(s, ",commit=%d", val); + val = sdp->sd_tune.gt_statfs_quantum; + if (val != 30) + seq_printf(s, ",statfs_quantum=%d", val); + val = sdp->sd_tune.gt_quota_quantum; + if (val != 60) + seq_printf(s, ",quota_quantum=%d", val); + if (args->ar_statfs_percent) + seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + seq_printf(s, ",nobarrier"); + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_printf(s, ",demote_interface_used"); + return 0; +} + +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +static int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = 
GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct gfs2_rgrpd *rgd; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; + } + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + goto out_qs; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_rindex_relse; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + goto out_rindex_relse; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); +out_rindex_relse: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_qs: + gfs2_quota_unhold(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. 
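+ * + * The body below takes the inode glock in EX (GL_SKIP), re-requests the iopen glock with LM_FLAG_TRY_1CB | GL_NOCACHE, and only then walks the case 1 deallocation paths.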
+ */ + +static void gfs2_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) + goto out; + + /* Must not read inode block until block type has been verified */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + goto out_truncate; + } + + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + + /* Case 1 starts here */ + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + goto out_unlock; + +out_truncate: + /* Case 2 starts here */ + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + /* Error path for case 1 */ + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", error); +out: + /* Case 3 starts here */ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + ip->i_gl->gl_object = NULL; + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + } + return &ip->i_inode; +} + +static void gfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(gfs2_inode_cachep, inode); +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .evict_inode = gfs2_evict_inode, + .put_super = gfs2_put_super, + .sync_fs = gfs2_sync_fs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h new file mode 100644 index 00000000..a0464680 --- /dev/null +++ b/fs/gfs2/super.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SUPER_DOT_H__ +#define __SUPER_DOT_H__ + +#include +#include +#include "incore.h" + +extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); + +static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) +{ + unsigned int x; + spin_lock(&sdp->sd_jindex_spin); + x = sdp->sd_journals; + spin_unlock(&sdp->sd_jindex_spin); + return x; +} + +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); + +extern int gfs2_mount_args(struct gfs2_args *args, char *data); + +extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); + +extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); +extern int gfs2_statfs_init(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); +extern int gfs2_statfs_sync(struct super_block *sb, int type); + +extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); +extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern const struct dentry_operations gfs2_dops; +extern const struct xattr_handler *gfs2_xattr_handlers[]; + +#endif /* __SUPER_DOT_H__ */ + diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c new file mode 100644 index 00000000..443cabcf --- /dev/null +++ b/fs/gfs2/sys.c @@ -0,0 +1,653 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "sys.h" +#include "super.h" +#include "glock.h" +#include "quota.h" +#include "util.h" +#include "glops.h" +#include "recovery.h" + +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? 
a->store(sdp, buf, len) : len; +} + +static const struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + +static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u:%u\n", + MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev)); +} + +static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); +} + +static int gfs2_uuid_valid(const u8 *uuid) +{ + int i; + + for (i = 0; i < 16; i++) { + if (uuid[i]) + return 1; + } + return 0; +} + +static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + buf[0] = '\0'; + if (!gfs2_uuid_valid(uuid)) + return 0; + return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int count; + + mutex_lock(&sdp->sd_freeze_lock); + count = sdp->sd_freeze_count; + mutex_unlock(&sdp->sd_freeze_lock); + + return snprintf(buf, PAGE_SIZE, "%u\n", count); +} + +static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int error = 0; + int n = simple_strtol(buf, NULL, 0); + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + switch (n) { + case 0: + gfs2_unfreeze_fs(sdp); + break; + case 1: + error = gfs2_freeze_fs(sdp); + break; + default: + ret = -EINVAL; + } + + if (error) + fs_warn(sdp, "freeze %d error %d", n, error); + + return ret; +} + +static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags); + return snprintf(buf, PAGE_SIZE, "%u\n", b); +} + +static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: withdrawing from cluster at user's request\n", + sdp->sd_fsname); + return len; +} + +static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_statfs_sync(sdp->sd_vfs, 0); + return len; +} + +static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_quota_sync(sdp->sd_vfs, 0, 1); + return len; +} + +static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + error = gfs2_quota_refresh(sdp, 1, id); + return error ? error : len; +} + +static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + error = gfs2_quota_refresh(sdp, 0, id); + return error ? 
error : len; +} + +static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct gfs2_glock *gl; + const struct gfs2_glock_operations *glops; + unsigned int glmode; + unsigned int gltype; + unsigned long long glnum; + char mode[16]; + int rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, + mode); + if (rv != 3) + return -EINVAL; + + if (strcmp(mode, "EX") == 0) + glmode = LM_ST_UNLOCKED; + else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0)) + glmode = LM_ST_DEFERRED; + else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0)) + glmode = LM_ST_SHARED; + else + return -EINVAL; + + if (gltype > LM_TYPE_JOURNAL) + return -EINVAL; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) + glops = &gfs2_trans_glops; + else + glops = gfs2_glops_list[gltype]; + if (glops == NULL) + return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); + rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); + if (rv) + return rv; + gfs2_glock_cb(gl, glmode); + gfs2_glock_put(gl); + return len; +} + + +#define GFS2_ATTR(name, mode, show, store) \ +static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) + +GFS2_ATTR(id, 0444, id_show, NULL); +GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(uuid, 0444, uuid_show, NULL); +GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); +GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); +GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); +GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); +GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); +GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); + +static struct attribute *gfs2_attrs[] = { + &gfs2_attr_id.attr, + &gfs2_attr_fsname.attr, + &gfs2_attr_uuid.attr, + &gfs2_attr_freeze.attr, + &gfs2_attr_withdraw.attr, + &gfs2_attr_statfs_sync.attr, + &gfs2_attr_quota_sync.attr, + &gfs2_attr_quota_refresh_user.attr, + &gfs2_attr_quota_refresh_group.attr, + &gfs2_attr_demote_rq.attr, + NULL, +}; + +static struct kobj_type gfs2_ktype = { + .default_attrs = gfs2_attrs, + .sysfs_ops = &gfs2_attr_ops, +}; + + +/* + * lock_module. 
Originally from lock_dlm + */ + +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} + +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + smp_mb__after_clear_bit(); + gfs2_glock_thaw(sdp); + } else { + ret = -EINVAL; + } + return ret; +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first_done); +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + struct gfs2_jdesc *jd; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = -ESHUTDOWN; + spin_lock(&sdp->sd_jindex_spin); + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + goto out; + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + rv = gfs2_recover_journal(jd, false); + break; + } +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? 
rv : len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); +} + +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int jid; + int rv; + + rv = sscanf(buf, "%d", &jid); + if (rv != 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; + sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *lock_module_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, + NULL, +}; + +/* + * get and set struct gfs2_tune fields + */ + +static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u %u\n", + sdp->sd_tune.gt_quota_scale_num, + sdp->sd_tune.gt_quota_scale_den); +} + +static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x, y; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) + return -EINVAL; + + spin_lock(&gt->gt_spin); + gt->gt_quota_scale_num = x; + gt->gt_quota_scale_den = y; + spin_unlock(&gt->gt_spin); + return len; +} + +static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, + int check_zero, const char *buf, size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + spin_lock(&gt->gt_spin); + *field = x; + spin_unlock(&gt->gt_spin); + return len; +} + +#define TUNE_ATTR_3(name, show, store) \ +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) + +#define TUNE_ATTR_2(name, store) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \ +} \ +TUNE_ATTR_3(name, name##_show,
store) + +#define TUNE_ATTR(name, check_zero) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \ +} \ +TUNE_ATTR_2(name, name##_store) + +TUNE_ATTR(quota_warn_period, 0); +TUNE_ATTR(quota_quantum, 0); +TUNE_ATTR(max_readahead, 0); +TUNE_ATTR(complain_secs, 0); +TUNE_ATTR(statfs_slow, 0); +TUNE_ATTR(new_files_jdata, 0); +TUNE_ATTR(quota_simul_sync, 1); +TUNE_ATTR(statfs_quantum, 1); +TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); + +static struct attribute *tune_attrs[] = { + &tune_attr_quota_warn_period.attr, + &tune_attr_quota_quantum.attr, + &tune_attr_max_readahead.attr, + &tune_attr_complain_secs.attr, + &tune_attr_statfs_slow.attr, + &tune_attr_quota_simul_sync.attr, + &tune_attr_statfs_quantum.attr, + &tune_attr_quota_scale.attr, + &tune_attr_new_files_jdata.attr, + NULL, +}; + +static struct attribute_group tune_group = { + .name = "tune", + .attrs = tune_attrs, +}; + +static struct attribute_group lock_module_group = { + .name = "lock_module", + .attrs = lock_module_attrs, +}; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + + sdp->sd_kobj.kset = gfs2_kset; + error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, + "%s", sdp->sd_table_name); + if (error) + goto fail; + + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); + if (error) + goto fail_tune; + + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); + return 0; + +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); +fail_reg: + kobject_put(&sdp->sd_kobj); +fail: + fs_err(sdp, "error %d adding sysfs files", error); + return error; +} + +void gfs2_sys_fs_del(struct gfs2_sbd *sdp) +{ + sysfs_remove_link(&sdp->sd_kobj, "device"); + sysfs_remove_group(&sdp->sd_kobj, &tune_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); + kobject_put(&sdp->sd_kobj); +} + +static int gfs2_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); + if (gfs2_uuid_valid(uuid)) + add_uevent_var(env, "UUID=%pUB", uuid); + return 0; +} + +static const struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + +int gfs2_sys_init(void) +{ + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); + if (!gfs2_kset) + return -ENOMEM; + return 0; +} + +void gfs2_sys_uninit(void) +{ + kset_unregister(gfs2_kset); +} + diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h new file mode 100644 index 00000000..e94560e8 --- /dev/null +++ b/fs/gfs2/sys.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) Sistina Software, Inc. 
1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SYS_DOT_H__ +#define __SYS_DOT_H__ + +#include +struct gfs2_sbd; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp); +void gfs2_sys_fs_del(struct gfs2_sbd *sdp); + +int gfs2_sys_init(void); +void gfs2_sys_uninit(void); + +#endif /* __SYS_DOT_H__ */ + diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h new file mode 100644 index 00000000..5d07609e --- /dev/null +++ b/fs/gfs2/trace_gfs2.h @@ -0,0 +1,438 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + +#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_GFS2_H + +#include + +#include +#include +#include +#include +#include +#include "incore.h" +#include "glock.h" + +#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } +#define glock_trace_name(x) __print_symbolic(x, \ + dlm_state_name(IV), \ + dlm_state_name(NL), \ + dlm_state_name(CR), \ + dlm_state_name(CW), \ + dlm_state_name(PR), \ + dlm_state_name(PW), \ + dlm_state_name(EX)) + +#define block_state_name(x) __print_symbolic(x, \ + { GFS2_BLKST_FREE, "free" }, \ + { GFS2_BLKST_USED, "used" }, \ + { GFS2_BLKST_DINODE, "dinode" }, \ + { GFS2_BLKST_UNLINKED, "unlinked" }) + +#define show_glock_flags(flags) __print_flags(flags, "", \ + {(1UL << GLF_LOCK), "l" }, \ + {(1UL << GLF_DEMOTE), "D" }, \ + {(1UL << GLF_PENDING_DEMOTE), "d" }, \ + {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ + {(1UL << GLF_DIRTY), "y" }, \ + {(1UL << GLF_LFLUSH), "f" }, \ + {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_REPLY_PENDING), "r" }, \ + {(1UL << GLF_INITIAL), "I" }, \ + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_QUEUED), "q" }, \ + {(1UL << GLF_LRU), "L" }, \ + {(1UL << GLF_OBJECT), "o" }) + +#ifndef NUMPTY +#define NUMPTY +static inline u8 glock_trace_state(unsigned int state) +{ + switch(state) { + case LM_ST_SHARED: + return DLM_LOCK_PR; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + } + return DLM_LOCK_NL; +} +#endif + +/* Section 1 - Locking + * + * Objectives: + * Latency: Remote demote request to state change + * Latency: Local lock request to state change + * Latency: State change to lock grant + * Correctness: Ordering of local lock state vs. I/O requests + * Correctness: Responses to remote demote requests + */ + +/* General glock state change (DLM lock request completes) */ +TRACE_EVENT(gfs2_glock_state_change, + + TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state), + + TP_ARGS(gl, new_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, new_state ) + __field( u8, dmt_state ) + __field( u8, tgt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->new_state = glock_trace_state(new_state); + __entry->tgt_state = glock_trace_state(gl->gl_target); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? 
(1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %u:%llu state %s to %s tgt:%s dmt:%s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->new_state), + glock_trace_name(__entry->tgt_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) +); + +/* State change -> unlocked, glock is being deallocated */ +TRACE_EVENT(gfs2_glock_put, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %u:%llu state %s => %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->gltype, (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(DLM_LOCK_IV), + show_glock_flags(__entry->flags)) + +); + +/* Callback (local or remote) requesting lock demotion */ +TRACE_EVENT(gfs2_demote_rq, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, dmt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %u:%llu demote %s to %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) + +); + +/* Promotion/grant of a glock */ +TRACE_EVENT(gfs2_promote, + + TP_PROTO(const struct gfs2_holder *gh, int first), + + TP_ARGS(gh, first), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, first ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->first = first; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu promote %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->first ? "first": "other", + glock_trace_name(__entry->state)) +); + +/* Queue/dequeue a lock request */ +TRACE_EVENT(gfs2_glock_queue, + + TP_PROTO(const struct gfs2_holder *gh, int queue), + + TP_ARGS(gh, queue), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, queue ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->queue = queue; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu %squeue %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->queue ?
"" : "de", + glock_trace_name(__entry->state)) +); + +/* Section 2 - Log/journal + * + * Objectives: + * Latency: Log flush time + * Correctness: pin/unpin vs. disk I/O ordering + * Performance: Log usage stats + */ + +/* Pin/unpin a block in the log */ +TRACE_EVENT(gfs2_pin, + + TP_PROTO(const struct gfs2_bufdata *bd, int pin), + + TP_ARGS(bd, pin), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, pin ) + __field( u32, len ) + __field( sector_t, block ) + __field( u64, ino ) + ), + + TP_fast_assign( + __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev; + __entry->pin = pin; + __entry->len = bd->bd_bh->b_size; + __entry->block = bd->bd_bh->b_blocknr; + __entry->ino = bd->bd_gl->gl_name.ln_number; + ), + + TP_printk("%u,%u log %s %llu/%lu inode %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->pin ? "pin" : "unpin", + (unsigned long long)__entry->block, + (unsigned long)__entry->len, + (unsigned long long)__entry->ino) +); + +/* Flushing the log */ +TRACE_EVENT(gfs2_log_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, int start), + + TP_ARGS(sdp, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( u64, log_seq ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->log_seq = sdp->sd_log_sequence; + ), + + TP_printk("%u,%u log flush %s %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start ? "start" : "end", + (unsigned long long)__entry->log_seq) +); + +/* Reserving/releasing blocks in the log */ +TRACE_EVENT(gfs2_log_blocks, + + TP_PROTO(const struct gfs2_sbd *sdp, int blocks), + + TP_ARGS(sdp, blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, blocks ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->blocks = blocks; + ), + + TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks) +); + +/* Writing back the AIL */ +TRACE_EVENT(gfs2_ail_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start), + + TP_ARGS(sdp, wbc, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( int, sync_mode ) + __field( long, nr_to_write ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->sync_mode = wbc->sync_mode; + __entry->nr_to_write = wbc->nr_to_write; + ), + + TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->start ? "start" : "end", + __entry->sync_mode == WB_SYNC_ALL ? "all" : "none", + __entry->nr_to_write) +); + +/* Section 3 - bmap + * + * Objectives: + * Latency: Bmap request time + * Performance: Block allocator tracing + * Correctness: Test of disard generation vs. blocks allocated + */ + +/* Map an extent of blocks, possibly a new allocation */ +TRACE_EVENT(gfs2_bmap, + + TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh, + sector_t lblock, int create, int errno), + + TP_ARGS(ip, bh, lblock, create, errno), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, lblock ) + __field( sector_t, pblock ) + __field( u64, inum ) + __field( unsigned long, state ) + __field( u32, len ) + __field( int, create ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->lblock = lblock; + __entry->pblock = buffer_mapped(bh) ? 
bh->b_blocknr : 0; + __entry->inum = ip->i_no_addr; + __entry->state = bh->b_state; + __entry->len = bh->b_size; + __entry->create = create; + __entry->errno = errno; + ), + + TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->lblock, + (unsigned long)__entry->len, + (unsigned long long)__entry->pblock, + __entry->state, __entry->create ? "create " : "nocreate", + __entry->errno) +); + +/* Keep track of blocks as they are allocated/freed */ +TRACE_EVENT(gfs2_block_alloc, + + TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len, + u8 block_state), + + TP_ARGS(ip, block, len, block_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, start ) + __field( u64, inum ) + __field( u32, len ) + __field( u8, block_state ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->start = block; + __entry->inum = ip->i_no_addr; + __entry->len = len; + __entry->block_state = block_state; + ), + + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long)__entry->len, + block_state_name(__entry->block_state)) +); + +#endif /* _TRACE_GFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 +#include + diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c new file mode 100644 index 00000000..9ec73a85 --- /dev/null +++ b/fs/gfs2/trans.c @@ -0,0 +1,192 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + + BUG_ON(current->journal_info); + BUG_ON(blocks == 0 && revokes == 0); + + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + if (!tr) + return -ENOMEM; + + tr->tr_ip = (unsigned long)__builtin_return_address(0); + tr->tr_blocks = blocks; + tr->tr_revokes = revokes; + tr->tr_reserved = 1; + if (blocks) + tr->tr_reserved += 6 + blocks; + if (revokes) + tr->tr_reserved += gfs2_struct2blk(sdp, revokes, + sizeof(u64)); + INIT_LIST_HEAD(&tr->tr_list_buf); + + gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); + + error = gfs2_glock_nq(&tr->tr_t_gh); + if (error) + goto fail_holder_uninit; + + error = gfs2_log_reserve(sdp, tr->tr_reserved); + if (error) + goto fail_gunlock; + + current->journal_info = tr; + + return 0; + +fail_gunlock: + gfs2_glock_dq(&tr->tr_t_gh); + +fail_holder_uninit: + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + + return error; +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + +void gfs2_trans_end(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!tr); + current->journal_info = NULL; + + if (!tr->tr_touched) { + gfs2_log_release(sdp, tr->tr_reserved); + if (tr->tr_t_gh.gh_gl) { + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + } + return; + } + + if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) { + fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ", + tr->tr_num_buf, tr->tr_blocks); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) { + fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ", + tr->tr_num_revoke, tr->tr_revokes); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + + gfs2_log_commit(sdp, tr); + if (tr->tr_t_gh.gh_gl) { + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + } + + if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + gfs2_log_flush(sdp, NULL); +} + +/** + * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction + * @gl: the glock the buffer belongs to + * @bh: The buffer to add + * @meta: True in the case of adding metadata + * + */ + +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_bufdata *bd; + + bd = bh->b_private; + if (bd) + gfs2_assert(sdp, bd->bd_gl == gl); + else { + gfs2_attach_bufdata(gl, bh, meta); + bd = bh->b_private; + } + lops_add(sdp, &bd->bd_le); +} + +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + BUG_ON(!list_empty(&bd->bd_le.le_list)); + BUG_ON(!list_empty(&bd->bd_ail_st_list)); + 
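+ /* a revoked buffer must also already be off the glock's AIL list */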
BUG_ON(!list_empty(&bd->bd_ail_gl_list)); + lops_init_le(&bd->bd_le, &gfs2_revoke_lops); + lops_add(sdp, &bd->bd_le); +} + +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) +{ + struct gfs2_bufdata *bd, *tmp; + struct gfs2_trans *tr = current->journal_info; + unsigned int n = len; + + gfs2_log_lock(sdp); + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) { + if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { + list_del_init(&bd->bd_le.le_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + kmem_cache_free(gfs2_bufdata_cachep, bd); + tr->tr_num_revoke_rm++; + if (--n == 0) + break; + } + } + gfs2_log_unlock(sdp); +} + +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) +{ + lops_add(rgd->rd_sbd, &rgd->rd_le); +} + diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h new file mode 100644 index 00000000..fb56b783 --- /dev/null +++ b/fs/gfs2/trans.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __TRANS_DOT_H__ +#define __TRANS_DOT_H__ + +#include +struct gfs2_sbd; +struct gfs2_rgrpd; +struct gfs2_glock; + +#define RES_DINODE 1 +#define RES_INDIRECT 1 +#define RES_JDATA 1 +#define RES_DATA 1 +#define RES_LEAF 1 +#define RES_RG_HDR 1 +#define RES_RG_BIT 2 +#define RES_EATTR 1 +#define RES_STATFS 1 +#define RES_QUOTA 2 + +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) +{ + return (al->al_requested < al->al_rgd->rd_length)? + al->al_requested + 1 : al->al_rgd->rd_length; +} + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +void gfs2_trans_end(struct gfs2_sbd *sdp); + +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); + +#endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c new file mode 100644 index 00000000..53511291 --- /dev/null +++ b/fs/gfs2/util.c @@ -0,0 +1,285 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "util.h" + +struct kmem_cache *gfs2_glock_cachep __read_mostly; +struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly; +struct kmem_cache *gfs2_inode_cachep __read_mostly; +struct kmem_cache *gfs2_bufdata_cachep __read_mostly; +struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; + +void gfs2_assert_i(struct gfs2_sbd *sdp) +{ + printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", + sdp->sd_fsname); +} + +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; + va_list args; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); + + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); + } + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); + + return -1; +} + +/** + * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + dump_stack(); + return (me) ? 
-1 : -2; +} + +/** + * gfs2_assert_warn_i - Print a message to the console if @assertion is false + * Returns: -1 if we printed something + * -2 if we didn't + */ + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + if (time_before(jiffies, + sdp->sd_last_warning + + gfs2_tune_get(sdp, gt_complain_secs) * HZ)) + return -2; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + if (sdp->sd_args.ar_debug) + BUG(); + else + dump_stack(); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + sdp->sd_last_warning = jiffies; + + return -1; +} + +/** + * gfs2_consist_i - Flag a filesystem consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, + char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_inode_i - Flag an inode consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: inode = %llu %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: RG = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)rgd->rd_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, char *file, + unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (%s)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, + sdp->sd_fsname, function, file, line); + return (me) ? 
-1 : -2; +} + +/** + * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, + sdp->sd_fsname, function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_io_error_i - Flag an I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: block = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, + sdp->sd_fsname, function, file, line); + return rv; +} + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value) +{ + unsigned int c, o, b = bit; + int old_value; + + c = b / (8 * PAGE_SIZE); + b %= 8 * PAGE_SIZE; + o = b / 8; + b %= 8; + + old_value = (bitmap[c][o] & (1 << b)); + gfs2_assert_withdraw(sdp, !old_value != !new_value); + + if (new_value) + bitmap[c][o] |= 1 << b; + else + bitmap[c][o] &= ~(1 << b); +} + diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h new file mode 100644 index 00000000..b432e046 --- /dev/null +++ b/fs/gfs2/util.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +#include "incore.h" + +#define fs_printk(level, fs, fmt, arg...) \ + printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) + +#define fs_info(fs, fmt, arg...) \ + fs_printk(KERN_INFO , fs , fmt , ## arg) + +#define fs_warn(fs, fmt, arg...) \ + fs_printk(KERN_WARNING , fs , fmt , ## arg) + +#define fs_err(fs, fmt, arg...) \ + fs_printk(KERN_ERR, fs , fmt , ## arg) + + +void gfs2_assert_i(struct gfs2_sbd *sdp); + +#define gfs2_assert(sdp, assertion) \ +do { \ + if (unlikely(!(assertion))) { \ + gfs2_assert_i(sdp); \ + BUG(); \ + } \ +} while (0) + + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_withdraw(sdp, assertion) \ +((likely(assertion)) ? 
0 : gfs2_assert_withdraw_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__)) + + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_warn(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__)) + + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist(sdp) \ +gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) + + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_inode(ip) \ +gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) + + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_rgrpd(rgd) \ +gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) + + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, + char *file, unsigned int line); + +static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = be32_to_cpu(mh->mh_magic); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + return 0; +} + +#define gfs2_meta_check(sdp, bh) \ +gfs2_meta_check_i((sdp), (bh), __func__, __FILE__, __LINE__) + + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); + +static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + u16 type, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = be32_to_cpu(mh->mh_magic); + u16 t = be32_to_cpu(mh->mh_type); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + if (unlikely(t != type)) + return gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return 0; +} + +#define gfs2_metatype_check(sdp, bh, type) \ +gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__) + +static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, + u16 format) +{ + struct gfs2_meta_header *mh; + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_type = cpu_to_be32(type); + mh->mh_format = cpu_to_be32(format); +} + + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); + +#define gfs2_io_error(sdp) \ +gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); + + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line); + +#define gfs2_io_error_bh(sdp, bh) \ +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); + + +extern struct kmem_cache *gfs2_glock_cachep; +extern struct kmem_cache *gfs2_glock_aspace_cachep; +extern struct kmem_cache *gfs2_inode_cachep; +extern struct kmem_cache *gfs2_bufdata_cachep; +extern struct kmem_cache *gfs2_rgrpd_cachep; +extern struct kmem_cache *gfs2_quotad_cachep; + +static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, + unsigned int *p) +{ + unsigned int x; + 
spin_lock(&gt->gt_spin); + x = *p; + spin_unlock(&gt->gt_spin); + return x; +} + +#define gfs2_tune_get(sdp, field) \ +gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value); +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); + +#endif /* __UTIL_DOT_H__ */ + diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c new file mode 100644 index 00000000..439b61c0 --- /dev/null +++ b/fs/gfs2/xattr.c @@ -0,0 +1,1549 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * ea_calc_size - returns the actual number of bytes the request will take up + * (not counting any unstuffed data blocks) + * @sdp: + * @er: + * @size: + * + * Returns: 1 if the EA should be stuffed + */ + +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, + unsigned int *size) +{ + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) + return 1; + + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); + + return 0; +} + +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) +{ + unsigned int size; + + if (dsize > GFS2_EA_MAX_DATA_LEN) + return -ERANGE; + + ea_calc_size(sdp, nsize, dsize, &size); + + /* This can only happen with 512 byte blocks */ + if (size > sdp->sd_jbsize) + return -ERANGE; + + return 0; +} + +typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private); + +static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, + ea_call_t ea_call, void *data) +{ + struct gfs2_ea_header *ea, *prev = NULL; + int error = 0; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA)) + return -EIO; + + for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) { + if (!GFS2_EA_REC_LEN(ea)) + goto fail; + if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= + bh->b_data + bh->b_size)) + goto fail; + if (!GFS2_EATYPE_VALID(ea->ea_type)) + goto fail; + + error = ea_call(ip, bh, ea, prev, data); + if (error) + return error; + + if (GFS2_EA_IS_LAST(ea)) { + if ((char *)GFS2_EA2NEXT(ea) != + bh->b_data + bh->b_size) + goto fail; + break; + } + } + + return error; + +fail: + gfs2_consist_inode(ip); + return -EIO; +} + +static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) +{ + struct buffer_head *bh, *eabh; + __be64 *eablk, *end; + int error; + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + if (error) + return error; + + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { + error = ea_foreach_i(ip, bh, ea_call, data); + goto out; + } + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64
*)(bh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + if (error) + break; + error = ea_foreach_i(ip, eabh, ea_call, data); + brelse(eabh); + if (error) + break; + } +out: + brelse(bh); + return error; +} + +struct ea_find { + int type; + const char *name; + size_t namel; + struct gfs2_ea_location *ef_el; +}; + +static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_find *ef = private; + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { + struct gfs2_ea_location *el = ef->ef_el; + get_bh(bh); + el->el_bh = bh; + el->el_ea = ea; + el->el_prev = prev; + return 1; + } + } + + return 0; +} + +static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el) +{ + struct ea_find ef; + int error; + + ef.type = type; + ef.name = name; + ef.namel = strlen(name); + ef.ef_el = el; + + memset(el, 0, sizeof(struct gfs2_ea_location)); + + error = ea_foreach(ip, ea_find_i, &ef); + if (error > 0) + return 0; + + return error; +} + +/** + * ea_dealloc_unstuffed - + * @ip: + * @bh: + * @ea: + * @prev: + * @private: + * + * Take advantage of the fact that all unstuffed blocks are + * allocated from the same RG. But watch, this may not always + * be true. + * + * Returns: errno + */ + +static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private) +{ + int *leave = private; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder rg_gh; + struct buffer_head *dibh; + __be64 *dataptrs; + u64 bn = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + if (GFS2_EA_IS_STUFFED(ea)) + return 0; + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (*dataptrs) { + blks++; + bn = be64_to_cpu(*dataptrs); + } + } + if (!blks) + return 0; + + rgd = gfs2_blk2rgrpd(sdp, bn); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE + + RES_EATTR + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (!*dataptrs) + break; + bn = be64_to_cpu(*dataptrs); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *dataptrs = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + if (prev && !leave) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_num_ptrs = 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, 
dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&rg_gh); + return error; +} + +static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, int leave) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); + + gfs2_glock_dq_uninit(&al->al_ri_gh); + +out_quota: + gfs2_quota_unhold(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +struct ea_list { + struct gfs2_ea_request *ei_er; + unsigned int ei_size; +}; + +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + +static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_list *ei = private; + struct gfs2_ea_request *er = ei->ei_er; + unsigned int ea_size = gfs2_ea_strlen(ea); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (er->er_data_len) { + char *prefix = NULL; + unsigned int l = 0; + char c = 0; + + if (ei->ei_size + ea_size > er->er_data_len) + return -ERANGE; + + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + prefix = "user."; + l = 5; + break; + case GFS2_EATYPE_SYS: + prefix = "system."; + l = 7; + break; + case GFS2_EATYPE_SECURITY: + prefix = "security."; + l = 9; + break; + } + + BUG_ON(l == 0); + + memcpy(er->er_data + ei->ei_size, prefix, l); + memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), + ea->ea_name_len); + memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); + } + + ei->ei_size += ea_size; + + return 0; +} + +/** + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer + * + * Returns: actual size of data on success, -errno on error + */ + +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_request er; + struct gfs2_holder i_gh; + int error; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + + if (ip->i_eattr) { + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; + + error = ea_foreach(ip, ea_list_i, &ei); + if (!error) + error = ei.ei_size; + } + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_get_unstuffed - actually copies the unstuffed data into the + * request buffer + * @ip: The GFS2 inode + * @ea: The extended attribute header structure + * @data: The data to be copied + * + * Returns: errno + */ + +static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + __be64 
*dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error = 0; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); + if (!bh) + return -ENOMEM; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto out; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto out; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto out; + } + + memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header), + (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + +out: + kfree(bh); + return error; +} + +static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size) +{ + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = ea_get_unstuffed(ip, el->el_ea, data); + if (ret < 0) + return ret; + return len; +} + +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) +{ + struct gfs2_ea_location el; + int error; + int len; + char *data; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el); + if (error) + return error; + if (!el.el_ea) + goto out; + if (!GFS2_EA_DATA_LEN(el.el_ea)) + goto out; + + len = GFS2_EA_DATA_LEN(el.el_ea); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + + error = gfs2_ea_get_copy(ip, &el, data, len); + if (error == 0) + error = len; + *ppdata = data; +out: + brelse(el.el_bh); + return error; +} + +/** + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer + * @type: The type of extended attribute + * + * Returns: actual size of data on success, -errno on error + */ +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_location el; + int error; + + if (!ip->i_eattr) + return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else + error = GFS2_EA_DATA_LEN(el.el_ea); + brelse(el.el_bh); + + return error; +} + +/** + * ea_alloc_blk - allocates a new block for extended attributes. 
+ * @ip: A pointer to the inode that's getting extended attributes + * @bhp: Pointer to pointer to a struct buffer_head + * + * Returns: errno + */ + +static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_header *ea; + unsigned int n = 1; + u64 block; + int error; + + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); + *bhp = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(*bhp); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + ea->ea_num_ptrs = 0; + + gfs2_add_inode_blocks(&ip->i_inode, 1); + + return 0; +} + +/** + * ea_write - writes the request info to an ea, creating new blocks if + * necessary + * @ip: inode that is being modified + * @ea: the location of the new ea in a block + * @er: the write request + * + * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags + * + * returns : errno + */ + +static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + struct gfs2_ea_request *er) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; + + ea->ea_data_len = cpu_to_be32(er->er_data_len); + ea->ea_name_len = er->er_name_len; + ea->ea_type = er->er_type; + ea->__pad = 0; + + memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len); + + if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) { + ea->ea_num_ptrs = 0; + memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len); + } else { + __be64 *dataptr = GFS2_EA2DATAPTRS(ea); + const char *data = er->er_data; + unsigned int data_len = er->er_data_len; + unsigned int copy; + unsigned int x; + + ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize); + for (x = 0; x < ea->ea_num_ptrs; x++) { + struct buffer_head *bh; + u64 block; + int mh_size = sizeof(struct gfs2_meta_header); + unsigned int n = 1; + + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); + + gfs2_add_inode_blocks(&ip->i_inode, 1); + + copy = data_len > sdp->sd_jbsize ? 
sdp->sd_jbsize : + data_len; + memcpy(bh->b_data + mh_size, data, copy); + if (copy < sdp->sd_jbsize) + memset(bh->b_data + mh_size + copy, 0, + sdp->sd_jbsize - copy); + + *dataptr++ = cpu_to_be64(bh->b_blocknr); + data += copy; + data_len -= copy; + + brelse(bh); + } + + gfs2_assert_withdraw(sdp, !data_len); + } + + return 0; +} + +typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private); + +static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, + unsigned int blks, + ea_skeleton_call_t skeleton_call, void *private) +{ + struct gfs2_alloc *al; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_lock_check(ip); + if (error) + goto out; + + al->al_requested = blks; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), + blks + gfs2_rg_blocks(al) + + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + error = skeleton_call(ip, er, private); + if (error) + goto out_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + +out_end_trans: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); +out_ipres: + gfs2_inplace_release(ip); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct buffer_head *bh; + int error; + + error = ea_alloc_blk(ip, &bh); + if (error) + return error; + + ip->i_eattr = bh->b_blocknr; + error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); + + brelse(bh); + + return error; +} + +/** + * ea_init - initializes a new eattr block + * @ip: + * @er: + * + * Returns: errno + */ + +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) +{ + struct gfs2_ea_request er; + unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; + unsigned int blks = 1; + + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); + + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); +} + +static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) +{ + u32 ea_size = GFS2_EA_SIZE(ea); + struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + + ea_size); + u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size; + int last = ea->ea_flags & GFS2_EAFLAG_LAST; + + ea->ea_rec_len = cpu_to_be32(ea_size); + ea->ea_flags ^= last; + + new->ea_rec_len = cpu_to_be32(new_size); + new->ea_flags = last; + + return new; +} + +static void ea_set_remove_stuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + u32 len; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (!prev || !GFS2_EA_IS_STUFFED(ea)) { + ea->ea_type = GFS2_EATYPE_UNUSED; + return; + } else if (GFS2_EA2NEXT(prev) != ea) { + prev = GFS2_EA2NEXT(prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea); + } + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; +} + +struct ea_set { + int ea_split; + + 
struct gfs2_ea_request *es_er; + struct gfs2_ea_location *es_el; + + struct buffer_head *es_bh; + struct gfs2_ea_header *es_ea; +}; + +static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct ea_set *es) +{ + struct gfs2_ea_request *er = es->es_er; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + ea_write(ip, ea, er); + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); +out: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + +static int ea_set_simple_alloc(struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private) +{ + struct ea_set *es = private; + struct gfs2_ea_header *ea = es->es_ea; + int error; + + gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + error = ea_write(ip, ea, er); + if (error) + return error; + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + return 0; +} + +static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_set *es = private; + unsigned int size; + int stuffed; + int error; + + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) { + if (GFS2_EA_REC_LEN(ea) < size) + return 0; + if (!GFS2_EA_IS_STUFFED(ea)) { + error = ea_remove_unstuffed(ip, bh, ea, prev, 1); + if (error) + return error; + } + es->ea_split = 0; + } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) + es->ea_split = 1; + else + return 0; + + if (stuffed) { + error = ea_set_simple_noalloc(ip, bh, ea, es); + if (error) + return error; + } else { + unsigned int blks; + + es->es_bh = bh; + es->es_ea = ea; + blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len, + GFS2_SB(&ip->i_inode)->sd_jbsize); + + error = ea_alloc_skeleton(ip, es->es_er, blks, + ea_set_simple_alloc, es); + if (error) + return error; + } + + return 1; +} + +static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *indbh, *newbh; + __be64 *eablk; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + __be64 *end; + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(indbh->b_data + mh_size); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) + if (!*eablk) + break; + + if (eablk == end) { + error = -ENOSPC; + goto out; + } + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + } else { + u64 blk; + unsigned int n = 1; + error = gfs2_alloc_block(ip, &blk, &n); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, blk, 1); + indbh = gfs2_meta_new(ip->i_gl, blk); + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(indbh, mh_size); + + eablk = (__be64 *)(indbh->b_data + mh_size); + *eablk = 
cpu_to_be64(ip->i_eattr); + ip->i_eattr = blk; + ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; + gfs2_add_inode_blocks(&ip->i_inode, 1); + + eablk++; + } + + error = ea_alloc_blk(ip, &newbh); + if (error) + goto out; + + *eablk = cpu_to_be64((u64)newbh->b_blocknr); + error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er); + brelse(newbh); + if (error) + goto out; + + if (private) + ea_set_remove_stuffed(ip, private); + +out: + brelse(indbh); + return error; +} + +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) +{ + struct gfs2_ea_request er; + struct ea_set es; + unsigned int blks = 2; + int error; + + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + + memset(&es, 0, sizeof(struct ea_set)); + es.es_er = &er; + es.es_el = el; + + error = ea_foreach(ip, ea_set_simple, &es); + if (error > 0) + return 0; + if (error) + return error; + + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) + blks++; + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); +} + +static int ea_set_remove_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { + el->el_prev = GFS2_EA2NEXT(el->el_prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + GFS2_EA2NEXT(el->el_prev) == el->el_ea); + } + + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); +} + +static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (prev) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @ip: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. 
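+ * The attribute is looked up with gfs2_ea_find(); a stuffed attribute is + * then cleared in place by ea_remove_stuffed(), while an unstuffed one also + * has its data blocks freed by ea_remove_unstuffed().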
+ * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) + error = ea_remove_stuffed(ip, &el); + else + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); + + brelse(el.el_bh); + + return error; +} + +/** + * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @ip: The inode + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace + * @type: The type of the extended attribute + * + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure + */ + +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); + int error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; + + if (value == NULL) + return gfs2_xattr_remove(ip, type, name); + + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } + + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); + + return error; +} + +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(dentry->d_inode, name, value, + size, flags, type); +} + +static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_header *ea, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); + if (!bh) + return -ENOMEM; + + error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (error) + goto out; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto fail; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto fail; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto fail; + } + + gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + + memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data, + (sdp->sd_jbsize > amount) ? 
amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + +out: + kfree(bh); + return error; + +fail: + gfs2_trans_end(sdp); + kfree(bh); + return error; +} + +int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_location el; + int error; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); + if (error) + return error; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) { + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { + error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); + } + + brelse(el.el_bh); + if (error) + return error; + + error = gfs2_setattr_simple(ip, attr); + gfs2_trans_end(sdp); + return error; +} + +static int ea_dealloc_indirect(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + struct buffer_head *indbh, *dibh; + __be64 *eablk, *end; + unsigned int rg_blocks = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + bstart = bn; + blen = 1; + } + blks++; + } + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist_free; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + + eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + bstart = 0; + blen = 0; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *eablk = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist_free: + gfs2_rlist_free(&rlist); +out: + brelse(indbh); + return error; +} + +static int ea_dealloc_block(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_rgrpd *rgd; + struct buffer_head *dibh; + int error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = 
gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS + + RES_QUOTA, 1); + if (error) + goto out_gunlock; + + gfs2_free_meta(ip, ip->i_eattr, 1); + + ip->i_eattr = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); + return error; +} + +/** + * gfs2_ea_dealloc - deallocate the extended attribute fork + * @ip: the inode + * + * Returns: errno + */ + +int gfs2_ea_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); + if (error) + goto out_rindex; + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_rindex; + } + + error = ea_dealloc_block(ip); + +out_rindex: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_quota: + gfs2_quota_unhold(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +static const struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +static const struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +const struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &gfs2_xattr_system_handler, + NULL, +}; + diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h new file mode 100644 index 00000000..d392f835 --- /dev/null +++ b/fs/gfs2/xattr.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __EATTR_DOT_H__ +#define __EATTR_DOT_H__ + +struct gfs2_inode; +struct iattr; + +#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len) +#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len) + +#define GFS2_EA_SIZE(ea) \ +ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ + ((GFS2_EA_IS_STUFFED(ea)) ? 
GFS2_EA_DATA_LEN(ea) : \ + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + +#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) +#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) + +#define GFS2_EAREQ_SIZE_STUFFED(er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) + +#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) +#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) + +#define GFS2_EA2DATAPTRS(ea) \ +((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8))) + +#define GFS2_EA2NEXT(ea) \ +((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea))) + +#define GFS2_EA_BH2FIRST(bh) \ +((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) + +struct gfs2_ea_request { + const char *er_name; + char *er_data; + unsigned int er_name_len; + unsigned int er_data_len; + unsigned int er_type; /* GFS2_EATYPE_... */ +}; + +struct gfs2_ea_location { + struct buffer_head *el_bh; + struct gfs2_ea_header *el_ea; + struct gfs2_ea_header *el_prev; +}; + +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); + +/* Exported to acl.c */ + +extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); + +#endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig new file mode 100644 index 00000000..b77c5bc2 --- /dev/null +++ b/fs/hfs/Kconfig @@ -0,0 +1,12 @@ +config HFS_FS + tristate "Apple Macintosh file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select NLS + help + If you say Y here, you will be able to mount Macintosh-formatted + floppy disks and hard drive partitions with full read-write access. + Please read to learn about + the available mount options. + + To compile this file system support as a module, choose M here: the + module will be called hfs. diff --git a/fs/hfs/Makefile b/fs/hfs/Makefile new file mode 100644 index 00000000..c41f5a85 --- /dev/null +++ b/fs/hfs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux hfs filesystem routines. 
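+# The hfs.o module is linked from the objects listed in hfs-objs below and is +# built only when CONFIG_HFS_FS is enabled (built in or as a module).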
+# + +obj-$(CONFIG_HFS_FS) += hfs.o + +hfs-objs := bitmap.o bfind.o bnode.o brec.o btree.o \ + catalog.o dir.o extent.o inode.o attr.o mdb.o \ + part_tbl.o string.o super.o sysdep.o trans.o + diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c new file mode 100644 index 00000000..e057ec54 --- /dev/null +++ b/fs/hfs/attr.c @@ -0,0 +1,121 @@ +/* + * linux/fs/hfs/attr.c + * + * (C) 2003 Ardis Technologies + * + * Export hfs data via xattr + */ + + +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +int hfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct hfs_cat_file *file; + int res; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + fd.search_key->cat = HFS_I(inode)->cat_key; + res = hfs_brec_find(&fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + file = &rec.file; + + if (!strcmp(name, "hfs.type")) { + if (size == 4) + memcpy(&file->UsrWds.fdType, value, 4); + else + res = -ERANGE; + } else if (!strcmp(name, "hfs.creator")) { + if (size == 4) + memcpy(&file->UsrWds.fdCreator, value, 4); + else + res = -ERANGE; + } else + res = -EOPNOTSUPP; + if (!res) + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); +out: + hfs_find_exit(&fd); + return res; +} + +ssize_t hfs_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct hfs_cat_file *file; + ssize_t res = 0; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (size) { + res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + fd.search_key->cat = HFS_I(inode)->cat_key; + res = hfs_brec_find(&fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } + file = &rec.file; + + if (!strcmp(name, "hfs.type")) { + if (size >= 4) { + memcpy(value, &file->UsrWds.fdType, 4); + res = 4; + } else + res = size ? -ERANGE : 4; + } else if (!strcmp(name, "hfs.creator")) { + if (size >= 4) { + memcpy(value, &file->UsrWds.fdCreator, 4); + res = 4; + } else + res = size ? 
-ERANGE : 4; + } else + res = -ENODATA; +out: + if (size) + hfs_find_exit(&fd); + return res; +} + +#define HFS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) + +ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!buffer || !size) + return HFS_ATTRLIST_SIZE; + if (size < HFS_ATTRLIST_SIZE) + return -ERANGE; + strcpy(buffer, "hfs.type"); + strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); + + return HFS_ATTRLIST_SIZE; +} diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c new file mode 100644 index 00000000..571abe97 --- /dev/null +++ b/fs/hfs/bfind.c @@ -0,0 +1,222 @@ +/* + * linux/fs/hfs/bfind.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Search routines for btrees + */ + +#include +#include "btree.h" + +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) +{ + void *ptr; + + fd->tree = tree; + fd->bnode = NULL; + ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + fd->search_key = ptr; + fd->key = ptr + tree->max_key_len + 2; + dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); + mutex_lock(&tree->tree_lock); + return 0; +} + +void hfs_find_exit(struct hfs_find_data *fd) +{ + hfs_bnode_put(fd->bnode); + kfree(fd->search_key); + dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); + mutex_unlock(&fd->tree->tree_lock); + fd->tree = NULL; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +{ + int cmpval; + u16 off, len, keylen; + int rec; + int b, e; + int res; + + b = 0; + e = bnode->num_recs - 1; + res = -ENOENT; + do { + rec = (e + b) / 2; + len = hfs_brec_lenoff(bnode, rec, &off); + keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + e = rec; + res = 0; + goto done; + } + if (cmpval < 0) + b = rec + 1; + else + e = rec - 1; + } while (b <= e); + if (rec != e && e >= 0) { + len = hfs_brec_lenoff(bnode, e, &off); + keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + } +done: + fd->record = e; + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; +fail: + return res; +} + +/* Traverse a B*Tree from the root to a leaf finding best fit to key */ +/* Return allocated copy of node found, set recnum to best record */ +int hfs_brec_find(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + u32 nidx, parent; + __be32 data; + int height, res; + + tree = fd->tree; + if (fd->bnode) + hfs_bnode_put(fd->bnode); + fd->bnode = NULL; + nidx = tree->root; + if (!nidx) + return -ENOENT; + height = tree->depth; + res = 0; + parent = 0; + for (;;) { + bnode = hfs_bnode_find(tree, nidx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + break; + } + if (bnode->height != height) + goto invalid; + if (bnode->type != (--height ? 
HFS_NODE_INDEX : HFS_NODE_LEAF)) + goto invalid; + bnode->parent = parent; + + res = __hfs_brec_find(bnode, fd); + if (!height) + break; + if (fd->record < 0) + goto release; + + parent = nidx; + hfs_bnode_read(bnode, &data, fd->entryoffset, 4); + nidx = be32_to_cpu(data); + hfs_bnode_put(bnode); + } + fd->bnode = bnode; + return res; + +invalid: + printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); + res = -EIO; +release: + hfs_bnode_put(bnode); + return res; +} + +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +{ + int res; + + res = hfs_brec_find(fd); + if (res) + return res; + if (fd->entrylength > rec_len) + return -EINVAL; + hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); + return 0; +} + +int hfs_brec_goto(struct hfs_find_data *fd, int cnt) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int idx, res = 0; + u16 off, len, keylen; + + bnode = fd->bnode; + tree = bnode->tree; + + if (cnt < 0) { + cnt = -cnt; + while (cnt > fd->record) { + cnt -= fd->record + 1; + fd->record = bnode->num_recs - 1; + idx = bnode->prev; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record -= cnt; + } else { + while (cnt >= bnode->num_recs - fd->record) { + cnt -= bnode->num_recs - fd->record; + fd->record = 0; + idx = bnode->next; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record += cnt; + } + + len = hfs_brec_lenoff(bnode, fd->record, &off); + keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + hfs_bnode_read(bnode, fd->key, off, keylen); +out: + fd->bnode = bnode; + return res; +} diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c new file mode 100644 index 00000000..c6e97366 --- /dev/null +++ b/fs/hfs/bitmap.c @@ -0,0 +1,243 @@ +/* + * linux/fs/hfs/bitmap.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * Based on GPLed code Copyright (C) 1995 Michael Dreher + * + * This file contains the code to modify the volume bitmap: + * search/set/clear bits. + */ + +#include "hfs_fs.h" + +/* + * hfs_find_zero_bit() + * + * Description: + * Given a block of memory, its length in bits, and a starting bit number, + * determine the number of the first zero bits (in left-to-right ordering) + * in that range. + * + * Returns >= 'size' if no zero bits are found in the range. + * + * Accesses memory in 32-bit aligned chunks of 32-bits and thus + * may read beyond the 'size'th bit. 
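+ * + * Besides locating the first run of zero bits at or after the starting bit, + * this helper also sets the bits it finds and updates *max to the number of + * bits actually claimed, which may be fewer than requested.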
+ */ +static u32 hfs_find_set_zero_bits(__be32 *bitmap, u32 size, u32 offset, u32 *max) +{ + __be32 *curr, *end; + u32 mask, start, len, n; + __be32 val; + int i; + + len = *max; + if (!len) + return size; + + curr = bitmap + (offset / 32); + end = bitmap + ((size + 31) / 32); + + /* scan the first partial u32 for zero bits */ + val = *curr; + if (~val) { + n = be32_to_cpu(val); + i = offset % 32; + mask = (1U << 31) >> i; + for (; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + + /* scan complete u32s for the first zero bit */ + while (++curr < end) { + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = 1 << 31; + for (i = 0; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + } + return size; + +found: + start = (curr - bitmap) * 32 + i; + if (start >= size) + return start; + /* do any partial u32 at the start */ + len = min(size - start, len); + while (1) { + n |= mask; + if (++i >= 32) + break; + mask >>= 1; + if (!--len || n & mask) + goto done; + } + if (!--len) + goto done; + *curr++ = cpu_to_be32(n); + /* do full u32s */ + while (1) { + n = be32_to_cpu(*curr); + if (len < 32) + break; + if (n) { + len = 32; + break; + } + *curr++ = cpu_to_be32(0xffffffff); + len -= 32; + } + /* do any partial u32 at end */ + mask = 1U << 31; + for (i = 0; i < len; i++) { + if (n & mask) + break; + n |= mask; + mask >>= 1; + } +done: + *curr = cpu_to_be32(n); + *max = (curr - bitmap) * 32 + i - start; + return start; +} + +/* + * hfs_vbm_search_free() + * + * Description: + * Search for 'num_bits' consecutive cleared bits in the bitmap blocks of + * the hfs MDB. 'mdb' had better be locked or the returned range + * may be no longer free, when this functions returns! + * XXX Currently the search starts from bit 0, but it should start with + * the bit number stored in 's_alloc_ptr' of the MDB. + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * u16 *num_bits: Pointer to the number of cleared bits + * to search for + * Output Variable(s): + * u16 *num_bits: The number of consecutive clear bits of the + * returned range. If the bitmap is fragmented, this will be less than + * requested and it will be zero, when the disk is full. + * Returns: + * The number of the first bit of the range of cleared bits which has been + * found. When 'num_bits' is zero, this is invalid! + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * 'num_bits' points to a variable of type (u16), which contains + * the number of cleared bits to find. + * Postconditions: + * 'num_bits' is set to the length of the found sequence. 
+ */ +u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) +{ + void *bitmap; + u32 pos; + + /* make sure we have actual work to perform */ + if (!*num_bits) + return 0; + + mutex_lock(&HFS_SB(sb)->bitmap_lock); + bitmap = HFS_SB(sb)->bitmap; + + pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); + if (pos >= HFS_SB(sb)->fs_ablocks) { + if (goal) + pos = hfs_find_set_zero_bits(bitmap, goal, 0, num_bits); + if (pos >= HFS_SB(sb)->fs_ablocks) { + *num_bits = pos = 0; + goto out; + } + } + + dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + HFS_SB(sb)->free_ablocks -= *num_bits; + hfs_bitmap_dirty(sb); +out: + mutex_unlock(&HFS_SB(sb)->bitmap_lock); + return pos; +} + + +/* + * hfs_clear_vbm_bits() + * + * Description: + * Clear the requested bits in the volume bitmap of the hfs filesystem + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * u16 start: The offset of the first bit + * u16 count: The number of bits + * Output Variable(s): + * None + * Returns: + * 0: no error + * -1: One of the bits was already clear. This is a strange + * error and when it happens, the filesystem must be repaired! + * -2: One or more of the bits are out of range of the bitmap. + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * Postconditions: + * Starting with bit number 'start', 'count' bits in the volume bitmap + * are cleared. The affected bitmap blocks are marked "dirty", the free + * block count of the MDB is updated and the MDB is marked dirty. + */ +int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) +{ + __be32 *curr; + u32 mask; + int i, len; + + /* is there any actual work to be done? */ + if (!count) + return 0; + + dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count); + /* are all of the bits in range? */ + if ((start + count) > HFS_SB(sb)->fs_ablocks) + return -2; + + mutex_lock(&HFS_SB(sb)->bitmap_lock); + /* bitmap is always on a 32-bit boundary */ + curr = HFS_SB(sb)->bitmap + (start / 32); + len = count; + + /* do any partial u32 at the start */ + i = start % 32; + if (i) { + int j = 32 - i; + mask = 0xffffffffU << j; + if (j > count) { + mask |= 0xffffffffU >> (i + count); + *curr &= cpu_to_be32(mask); + goto out; + } + *curr++ &= cpu_to_be32(mask); + count -= j; + } + + /* do full u32s */ + while (count >= 32) { + *curr++ = 0; + count -= 32; + } + /* do any partial u32 at end */ + if (count) { + mask = 0xffffffffU >> count; + *curr &= cpu_to_be32(mask); + } +out: + HFS_SB(sb)->free_ablocks += len; + mutex_unlock(&HFS_SB(sb)->bitmap_lock); + hfs_bitmap_dirty(sb); + + return 0; +} diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c new file mode 100644 index 00000000..cdb41a1f --- /dev/null +++ b/fs/hfs/bnode.c @@ -0,0 +1,478 @@ +/* + * linux/fs/hfs/bnode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle basic btree node operations + */ + +#include +#include +#include + +#include "btree.h" + +void hfs_bnode_read(struct hfs_bnode *node, void *buf, + int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memcpy(buf, kmap(page) + off, len); + kunmap(page); +} + +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +{ + __be16 data; + // optimize later... + hfs_bnode_read(node, &data, off, 2); + return be16_to_cpu(data); +} + +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +{ + u8 data; + // optimize later... 
+ hfs_bnode_read(node, &data, off, 1); + return data; +} + +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +{ + struct hfs_btree *tree; + int key_len; + + tree = node->tree; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = hfs_bnode_read_u8(node, off) + 1; + else + key_len = tree->max_key_len + 1; + + hfs_bnode_read(node, key, off, key_len); +} + +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memcpy(kmap(page) + off, buf, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +{ + __be16 v = cpu_to_be16(data); + // optimize later... + hfs_bnode_write(node, &v, off, 2); +} + +void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data) +{ + // optimize later... + hfs_bnode_write(node, &data, off, 1); +} + +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memset(kmap(page) + off, 0, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len) +{ + struct hfs_btree *tree; + struct page *src_page, *dst_page; + + dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + tree = src_node->tree; + src += src_node->page_offset; + dst += dst_node->page_offset; + src_page = src_node->page[0]; + dst_page = dst_node->page[0]; + + memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); + kunmap(src_page); + kunmap(dst_page); + set_page_dirty(dst_page); +} + +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +{ + struct page *page; + void *ptr; + + dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + src += node->page_offset; + dst += node->page_offset; + page = node->page[0]; + ptr = kmap(page); + memmove(ptr + dst, ptr + src, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_dump(struct hfs_bnode *node) +{ + struct hfs_bnode_desc desc; + __be32 cnid; + int i, off, key_off; + + dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_bnode_read(node, &desc, 0, sizeof(desc)); + dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + be32_to_cpu(desc.next), be32_to_cpu(desc.prev), + desc.type, desc.height, be16_to_cpu(desc.num_recs)); + + off = node->tree->node_size - 2; + for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { + key_off = hfs_bnode_read_u16(node, off); + dprint(DBG_BNODE_MOD, " %d", key_off); + if (i && node->type == HFS_NODE_INDEX) { + int tmp; + + if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; + else + tmp = node->tree->max_key_len + 1; + dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); + hfs_bnode_read(node, &cnid, key_off + tmp, 4); + dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + } else if (i && node->type == HFS_NODE_LEAF) { + int tmp; + + tmp = hfs_bnode_read_u8(node, key_off); + dprint(DBG_BNODE_MOD, " (%d)", tmp); + } + } + dprint(DBG_BNODE_MOD, "\n"); +} + +void hfs_bnode_unlink(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct hfs_bnode *tmp; + __be32 cnid; + + tree = node->tree; + if (node->prev) { + tmp = hfs_bnode_find(tree, node->prev); + if (IS_ERR(tmp)) + return; + tmp->next = node->next; + cnid = cpu_to_be32(tmp->next); + hfs_bnode_write(tmp, &cnid, 
offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_head = node->next; + + if (node->next) { + tmp = hfs_bnode_find(tree, node->next); + if (IS_ERR(tmp)) + return; + tmp->prev = node->prev; + cnid = cpu_to_be32(tmp->prev); + hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_tail = node->prev; + + // move down? + if (!node->prev && !node->next) { + printk(KERN_DEBUG "hfs_btree_del_level\n"); + } + if (!node->parent) { + tree->root = 0; + tree->depth = 0; + } + set_bit(HFS_BNODE_DELETED, &node->flags); +} + +static inline int hfs_bnode_hash(u32 num) +{ + num = (num >> 16) + num; + num += num >> 8; + return num & (NODE_HASH_SIZE - 1); +} + +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) +{ + struct hfs_bnode *node; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + return NULL; + } + + for (node = tree->node_hash[hfs_bnode_hash(cnid)]; + node; node = node->next_hash) { + if (node->this == cnid) { + return node; + } + } + return NULL; +} + +static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) +{ + struct super_block *sb; + struct hfs_bnode *node, *node2; + struct address_space *mapping; + struct page *page; + int size, block, i, hash; + loff_t off; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + return NULL; + } + + sb = tree->inode->i_sb; + size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * + sizeof(struct page *); + node = kzalloc(size, GFP_KERNEL); + if (!node) + return NULL; + node->tree = tree; + node->this = cnid; + set_bit(HFS_BNODE_NEW, &node->flags); + atomic_set(&node->refcnt, 1); + dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); + init_waitqueue_head(&node->lock_wq); + spin_lock(&tree->hash_lock); + node2 = hfs_bnode_findhash(tree, cnid); + if (!node2) { + hash = hfs_bnode_hash(cnid); + node->next_hash = tree->node_hash[hash]; + tree->node_hash[hash] = node; + tree->node_hash_cnt++; + } else { + spin_unlock(&tree->hash_lock); + kfree(node); + wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); + return node2; + } + spin_unlock(&tree->hash_lock); + + mapping = tree->inode->i_mapping; + off = (loff_t)cnid * tree->node_size; + block = off >> PAGE_CACHE_SHIFT; + node->page_offset = off & ~PAGE_CACHE_MASK; + for (i = 0; i < tree->pages_per_bnode; i++) { + page = read_mapping_page(mapping, block++, NULL); + if (IS_ERR(page)) + goto fail; + if (PageError(page)) { + page_cache_release(page); + goto fail; + } + page_cache_release(page); + node->page[i] = page; + } + + return node; +fail: + set_bit(HFS_BNODE_ERROR, &node->flags); + return node; +} + +void hfs_bnode_unhash(struct hfs_bnode *node) +{ + struct hfs_bnode **p; + + dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; + *p && *p != node; p = &(*p)->next_hash) + ; + BUG_ON(!*p); + *p = node->next_hash; + node->tree->node_hash_cnt--; +} + +/* Load a particular node out of a tree */ +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct hfs_bnode_desc *desc; + int i, rec_off, off, next_off; + int entry_size, key_size; + + spin_lock(&tree->hash_lock); + node = 
hfs_bnode_findhash(tree, num); + if (node) { + hfs_bnode_get(node); + spin_unlock(&tree->hash_lock); + wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + return node; + } + spin_unlock(&tree->hash_lock); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + if (!test_bit(HFS_BNODE_NEW, &node->flags)) + return node; + + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + node->prev = be32_to_cpu(desc->prev); + node->next = be32_to_cpu(desc->next); + node->num_recs = be16_to_cpu(desc->num_recs); + node->type = desc->type; + node->height = desc->height; + kunmap(node->page[0]); + + switch (node->type) { + case HFS_NODE_HEADER: + case HFS_NODE_MAP: + if (node->height != 0) + goto node_error; + break; + case HFS_NODE_LEAF: + if (node->height != 1) + goto node_error; + break; + case HFS_NODE_INDEX: + if (node->height <= 1 || node->height > tree->depth) + goto node_error; + break; + default: + goto node_error; + } + + rec_off = tree->node_size - 2; + off = hfs_bnode_read_u16(node, rec_off); + if (off != sizeof(struct hfs_bnode_desc)) + goto node_error; + for (i = 1; i <= node->num_recs; off = next_off, i++) { + rec_off -= 2; + next_off = hfs_bnode_read_u16(node, rec_off); + if (next_off <= off || + next_off > tree->node_size || + next_off & 1) + goto node_error; + entry_size = next_off - off; + if (node->type != HFS_NODE_INDEX && + node->type != HFS_NODE_LEAF) + continue; + key_size = hfs_bnode_read_u8(node, off) + 1; + if (key_size >= entry_size /*|| key_size & 1*/) + goto node_error; + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + return node; + +node_error: + set_bit(HFS_BNODE_ERROR, &node->flags); + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + hfs_bnode_put(node); + return ERR_PTR(-EIO); +} + +void hfs_bnode_free(struct hfs_bnode *node) +{ + //int i; + + //for (i = 0; i < node->tree->pages_per_bnode; i++) + // if (node->page[i]) + // page_cache_release(node->page[i]); + kfree(node); +} + +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct page **pagep; + int i; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + spin_unlock(&tree->hash_lock); + BUG_ON(node); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + + pagep = node->page; + memset(kmap(*pagep) + node->page_offset, 0, + min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + set_page_dirty(*pagep); + kunmap(*pagep); + for (i = 1; i < tree->pages_per_bnode; i++) { + memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + set_page_dirty(*pagep); + kunmap(*pagep); + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + + return node; +} + +void hfs_bnode_get(struct hfs_bnode *node) +{ + if (node) { + atomic_inc(&node->refcnt); + dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + } +} + +/* Dispose of resources used by a node */ +void hfs_bnode_put(struct hfs_bnode *node) +{ + if (node) { + struct hfs_btree *tree = node->tree; + int i; + + dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + BUG_ON(!atomic_read(&node->refcnt)); + if 
(!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) + return; + for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; + mark_page_accessed(node->page[i]); + } + + if (test_bit(HFS_BNODE_DELETED, &node->flags)) { + hfs_bnode_unhash(node); + spin_unlock(&tree->hash_lock); + hfs_bmap_free(node); + hfs_bnode_free(node); + return; + } + spin_unlock(&tree->hash_lock); + } +} diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c new file mode 100644 index 00000000..92fb358c --- /dev/null +++ b/fs/hfs/brec.c @@ -0,0 +1,519 @@ +/* + * linux/fs/hfs/brec.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle individual btree records + */ + +#include "btree.h" + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); +static int hfs_brec_update_parent(struct hfs_find_data *fd); +static int hfs_btree_inc_height(struct hfs_btree *tree); + +/* Get the length and offset of the given record in the given node */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) +{ + __be16 retval[2]; + u16 dataoff; + + dataoff = node->tree->node_size - (rec + 2) * 2; + hfs_bnode_read(node, retval, dataoff, 4); + *off = be16_to_cpu(retval[1]); + return be16_to_cpu(retval[0]) - *off; +} + +/* Get the length of the key from a keyed record */ +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) +{ + u16 retval, recoff; + + if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) + return 0; + + if ((node->type == HFS_NODE_INDEX) && + !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + if (node->tree->attributes & HFS_TREE_BIGKEYS) + retval = node->tree->max_key_len + 2; + else + retval = node->tree->max_key_len + 1; + } else { + recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); + if (!recoff) + return 0; + if (node->tree->attributes & HFS_TREE_BIGKEYS) { + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } + } else { + retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; + if (retval > node->tree->max_key_len + 1) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } + } + } + return retval; +} + +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node; + int size, key_len, rec; + int data_off, end_off; + int idx_rec_off, data_rec_off, end_rec_off; + __be32 cnid; + + tree = fd->tree; + if (!fd->bnode) { + if (!tree->root) + hfs_btree_inc_height(tree); + fd->bnode = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(fd->bnode)) + return PTR_ERR(fd->bnode); + fd->record = -1; + } + new_node = NULL; + key_len = (fd->search_key->key_len | 1) + 1; +again: + /* new record idx and complete record size */ + rec = fd->record + 1; + size = key_len + entry_len; + + node = fd->bnode; + hfs_bnode_dump(node); + /* get last offset */ + end_rec_off = tree->node_size - (node->num_recs + 1) * 2; + end_off = hfs_bnode_read_u16(node, end_rec_off); + end_rec_off -= 2; + dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); + if (size > end_rec_off - end_off) { + if (new_node) + panic("not enough room!\n"); + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + goto again; + } + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count++; + mark_inode_dirty(tree->inode); + } + node->num_recs++; + /* write new last offset */ + 
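/* + * Record offsets are kept in a table that grows down from the end of + * the node: the slot for record N sits at node_size - (N + 1) * 2 and + * the slot past the last record holds the start of the free area, so + * bumping num_recs and rewriting the free-space offset below reserves + * room for the new record. + */ + 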
hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + hfs_bnode_write_u16(node, end_rec_off, end_off + size); + data_off = end_off; + data_rec_off = end_rec_off + 2; + idx_rec_off = tree->node_size - (rec + 1) * 2; + if (idx_rec_off == data_rec_off) + goto skip; + /* move all following entries */ + do { + data_off = hfs_bnode_read_u16(node, data_rec_off + 2); + hfs_bnode_write_u16(node, data_rec_off, data_off + size); + data_rec_off += 2; + } while (data_rec_off < idx_rec_off); + + /* move data away */ + hfs_bnode_move(node, data_off + size, data_off, + end_off - data_off); + +skip: + hfs_bnode_write(node, fd->search_key, data_off, key_len); + hfs_bnode_write(node, entry, data_off + key_len, entry_len); + hfs_bnode_dump(node); + + if (new_node) { + /* update parent key if we inserted a key + * at the start of the first node + */ + if (!rec && new_node != node) + hfs_brec_update_parent(fd); + + hfs_bnode_put(fd->bnode); + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } + fd->bnode = hfs_bnode_find(tree, new_node->parent); + + /* create index data entry */ + cnid = cpu_to_be32(new_node->this); + entry = &cnid; + entry_len = sizeof(cnid); + + /* get index key */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + __hfs_brec_find(fd->bnode, fd); + + hfs_bnode_put(new_node); + new_node = NULL; + + if (tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = fd->search_key->key_len + 1; + else { + fd->search_key->key_len = tree->max_key_len; + key_len = tree->max_key_len + 1; + } + goto again; + } + + if (!rec) + hfs_brec_update_parent(fd); + + return 0; +} + +int hfs_brec_remove(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *parent; + int end_off, rec_off, data_off, size; + + tree = fd->tree; + node = fd->bnode; +again: + rec_off = tree->node_size - (fd->record + 2) * 2; + end_off = tree->node_size - (node->num_recs + 1) * 2; + + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count--; + mark_inode_dirty(tree->inode); + } + hfs_bnode_dump(node); + dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); + if (!--node->num_recs) { + hfs_bnode_unlink(node); + if (!node->parent) + return 0; + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + hfs_bnode_put(node); + node = fd->bnode = parent; + + __hfs_brec_find(node, fd); + goto again; + } + hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + + if (rec_off == end_off) + goto skip; + size = fd->keylength + fd->entrylength; + + do { + data_off = hfs_bnode_read_u16(node, rec_off); + hfs_bnode_write_u16(node, rec_off + 2, data_off - size); + rec_off -= 2; + } while (rec_off >= end_off); + + /* fill hole */ + hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, + data_off - fd->keyoffset - size); +skip: + hfs_bnode_dump(node); + if (!fd->record) + hfs_brec_update_parent(fd); + return 0; +} + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *next_node; + struct hfs_bnode_desc node_desc; + int num_recs, new_rec_off, new_off, old_rec_off; + int data_start, data_end, size; + + tree = fd->tree; + node = fd->bnode; + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) + return new_node; + hfs_bnode_get(node); + dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + node->this, new_node->this, node->next); + new_node->next = node->next; + 
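/* + * The freshly allocated node is chained in right after the node being + * split and inherits its parent, type and height; the upper half of + * the records gets copied into it further down. + */ + 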
new_node->prev = node->this; + new_node->parent = node->parent; + new_node->type = node->type; + new_node->height = node->height; + + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + + size = tree->node_size / 2 - node->num_recs * 2 - 14; + old_rec_off = tree->node_size - 4; + num_recs = 1; + for (;;) { + data_start = hfs_bnode_read_u16(node, old_rec_off); + if (data_start > size) + break; + old_rec_off -= 2; + if (++num_recs < node->num_recs) + continue; + /* panic? */ + hfs_bnode_put(node); + hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); + return ERR_PTR(-ENOSPC); + } + + if (fd->record + 1 < num_recs) { + /* new record is in the lower half, + * so leave some more space there + */ + old_rec_off += 2; + num_recs--; + data_start = hfs_bnode_read_u16(node, old_rec_off); + } else { + hfs_bnode_put(node); + hfs_bnode_get(new_node); + fd->bnode = new_node; + fd->record -= num_recs; + fd->keyoffset -= data_start - 14; + fd->entryoffset -= data_start - 14; + } + new_node->num_recs = node->num_recs - num_recs; + node->num_recs = num_recs; + + new_rec_off = tree->node_size - 2; + new_off = 14; + size = data_start - new_off; + num_recs = new_node->num_recs; + data_end = data_start; + while (num_recs) { + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + old_rec_off -= 2; + new_rec_off -= 2; + data_end = hfs_bnode_read_u16(node, old_rec_off); + new_off = data_end - size; + num_recs--; + } + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); + + /* update new bnode header */ + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + /* update previous bnode header */ + node->next = new_node->this; + hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); + node_desc.next = cpu_to_be32(node->next); + node_desc.num_recs = cpu_to_be16(node->num_recs); + hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); + + /* update next bnode header */ + if (next_node) { + next_node->prev = new_node->this; + hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); + node_desc.prev = cpu_to_be32(next_node->prev); + hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); + hfs_bnode_put(next_node); + } else if (node->this == tree->leaf_tail) { + /* if there is no next node, this might be the new tail */ + tree->leaf_tail = new_node->this; + mark_inode_dirty(tree->inode); + } + + hfs_bnode_dump(node); + hfs_bnode_dump(new_node); + hfs_bnode_put(node); + + return new_node; +} + +static int hfs_brec_update_parent(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *parent; + int newkeylen, diff; + int rec, rec_off, end_rec_off; + int start_off, end_off; + + tree = fd->tree; + node = fd->bnode; + new_node = NULL; + if (!node->parent) + return 0; + +again: + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + __hfs_brec_find(parent, fd); + hfs_bnode_dump(parent); + rec = fd->record; + + /* size difference between old and new key */ + if (tree->attributes & HFS_TREE_VARIDXKEYS) + newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 
1; + else + fd->keylength = newkeylen = tree->max_key_len + 1; + dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); + + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + diff = newkeylen - fd->keylength; + if (!diff) + goto skip; + if (diff > 0) { + end_off = hfs_bnode_read_u16(parent, end_rec_off); + if (end_rec_off - end_off < diff) { + + printk(KERN_DEBUG "hfs: splitting index node...\n"); + fd->bnode = parent; + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + parent = fd->bnode; + rec = fd->record; + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + } + } + + end_off = start_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, start_off + diff); + start_off -= 4; /* move previous cnid too */ + + while (rec_off > end_rec_off) { + rec_off -= 2; + end_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, end_off + diff); + } + hfs_bnode_move(parent, start_off + diff, start_off, + end_off - start_off); +skip: + hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) + hfs_bnode_write_u8(parent, fd->keyoffset, newkeylen - 1); + hfs_bnode_dump(parent); + + hfs_bnode_put(node); + node = parent; + + if (new_node) { + __be32 cnid; + + fd->bnode = hfs_bnode_find(tree, new_node->parent); + /* create index key and entry */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + cnid = cpu_to_be32(new_node->this); + + __hfs_brec_find(fd->bnode, fd); + hfs_brec_insert(fd, &cnid, sizeof(cnid)); + hfs_bnode_put(fd->bnode); + hfs_bnode_put(new_node); + + if (!rec) { + if (new_node == node) + goto out; + /* restore search_key */ + hfs_bnode_read_key(node, fd->search_key, 14); + } + } + + if (!rec && node->parent) + goto again; +out: + fd->bnode = node; + return 0; +} + +static int hfs_btree_inc_height(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *new_node; + struct hfs_bnode_desc node_desc; + int key_size, rec; + __be32 cnid; + + node = NULL; + if (tree->root) { + node = hfs_bnode_find(tree, tree->root); + if (IS_ERR(node)) + return PTR_ERR(node); + } + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) { + hfs_bnode_put(node); + return PTR_ERR(new_node); + } + + tree->root = new_node->this; + if (!tree->depth) { + tree->leaf_head = tree->leaf_tail = new_node->this; + new_node->type = HFS_NODE_LEAF; + new_node->num_recs = 0; + } else { + new_node->type = HFS_NODE_INDEX; + new_node->num_recs = 1; + } + new_node->parent = 0; + new_node->next = 0; + new_node->prev = 0; + new_node->height = ++tree->depth; + + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + rec = tree->node_size - 2; + hfs_bnode_write_u16(new_node, rec, 14); + + if (node) { + /* insert old root idx into new root */ + node->parent = tree->root; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_size = hfs_bnode_read_u8(node, 14) + 1; + else + key_size = tree->max_key_len + 1; + hfs_bnode_copy(new_node, 14, node, 14, key_size); + + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + key_size = tree->max_key_len + 1; + hfs_bnode_write_u8(new_node, 
14, tree->max_key_len); + } + key_size = (key_size + 1) & -2; + cnid = cpu_to_be32(node->this); + hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); + + rec -= 2; + hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); + + hfs_bnode_put(node); + } + hfs_bnode_put(new_node); + mark_inode_dirty(tree->inode); + + return 0; +} diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c new file mode 100644 index 00000000..1cbdeea1 --- /dev/null +++ b/fs/hfs/btree.c @@ -0,0 +1,366 @@ +/* + * linux/fs/hfs/btree.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle opening/closing btree + */ + +#include +#include +#include + +#include "btree.h" + +/* Get a reference to a B*Tree and do some initial checks */ +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp keycmp) +{ + struct hfs_btree *tree; + struct hfs_btree_header_rec *head; + struct address_space *mapping; + struct page *page; + unsigned int size; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) + return NULL; + + mutex_init(&tree->tree_lock); + spin_lock_init(&tree->hash_lock); + /* Set the correct compare function */ + tree->sb = sb; + tree->cnid = id; + tree->keycmp = keycmp; + + tree->inode = iget_locked(sb, id); + if (!tree->inode) + goto free_tree; + BUG_ON(!(tree->inode->i_state & I_NEW)); + { + struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + HFS_I(tree->inode)->flags = 0; + mutex_init(&HFS_I(tree->inode)->extents_lock); + switch (id) { + case HFS_EXT_CNID: + hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, + mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; + break; + case HFS_CAT_CNID: + hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, + mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; + break; + default: + BUG(); + } + } + unlock_new_inode(tree->inode); + + mapping = tree->inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + goto free_inode; + + /* Load the header */ + head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + tree->root = be32_to_cpu(head->root); + tree->leaf_count = be32_to_cpu(head->leaf_count); + tree->leaf_head = be32_to_cpu(head->leaf_head); + tree->leaf_tail = be32_to_cpu(head->leaf_tail); + tree->node_count = be32_to_cpu(head->node_count); + tree->free_nodes = be32_to_cpu(head->free_nodes); + tree->attributes = be32_to_cpu(head->attributes); + tree->node_size = be16_to_cpu(head->node_size); + tree->max_key_len = be16_to_cpu(head->max_key_len); + tree->depth = be16_to_cpu(head->depth); + + size = tree->node_size; + if (!is_power_of_2(size)) + goto fail_page; + if (!tree->node_count) + goto fail_page; + switch (id) { + case HFS_EXT_CNID: + if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { + printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + break; + case HFS_CAT_CNID: + if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { + printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + break; + 
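/* + * Plain HFS has only the extents and catalog B*-trees, each with a + * fixed maximum key length, so a mismatched max_key_len above means a + * damaged header and any other CNID is a caller bug (hence BUG()). + */ + 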
default: + BUG(); + } + + tree->node_size_shift = ffs(size) - 1; + tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + kunmap(page); + page_cache_release(page); + return tree; + +fail_page: + page_cache_release(page); +free_inode: + tree->inode->i_mapping->a_ops = &hfs_aops; + iput(tree->inode); +free_tree: + kfree(tree); + return NULL; +} + +/* Release resources used by a btree */ +void hfs_btree_close(struct hfs_btree *tree) +{ + struct hfs_bnode *node; + int i; + + if (!tree) + return; + + for (i = 0; i < NODE_HASH_SIZE; i++) { + while ((node = tree->node_hash[i])) { + tree->node_hash[i] = node->next_hash; + if (atomic_read(&node->refcnt)) + printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_bnode_free(node); + tree->node_hash_cnt--; + } + } + iput(tree->inode); + kfree(tree); +} + +void hfs_btree_write(struct hfs_btree *tree) +{ + struct hfs_btree_header_rec *head; + struct hfs_bnode *node; + struct page *page; + + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + /* panic? */ + return; + /* Load the header */ + page = node->page[0]; + head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + + head->root = cpu_to_be32(tree->root); + head->leaf_count = cpu_to_be32(tree->leaf_count); + head->leaf_head = cpu_to_be32(tree->leaf_head); + head->leaf_tail = cpu_to_be32(tree->leaf_tail); + head->node_count = cpu_to_be32(tree->node_count); + head->free_nodes = cpu_to_be32(tree->free_nodes); + head->attributes = cpu_to_be32(tree->attributes); + head->depth = cpu_to_be16(tree->depth); + + kunmap(page); + set_page_dirty(page); + hfs_bnode_put(node); +} + +static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) +{ + struct hfs_btree *tree = prev->tree; + struct hfs_bnode *node; + struct hfs_bnode_desc desc; + __be32 cnid; + + node = hfs_bnode_create(tree, idx); + if (IS_ERR(node)) + return node; + + if (!tree->free_nodes) + panic("FIXME!!!"); + tree->free_nodes--; + prev->next = idx; + cnid = cpu_to_be32(idx); + hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + + node->type = HFS_NODE_MAP; + node->num_recs = 1; + hfs_bnode_clear(node, 0, tree->node_size); + desc.next = 0; + desc.prev = 0; + desc.type = HFS_NODE_MAP; + desc.height = 0; + desc.num_recs = cpu_to_be16(1); + desc.reserved = 0; + hfs_bnode_write(node, &desc, 0, sizeof(desc)); + hfs_bnode_write_u16(node, 14, 0x8000); + hfs_bnode_write_u16(node, tree->node_size - 2, 14); + hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); + + return node; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i; + + while (!tree->free_nodes) { + struct inode *inode = tree->inode; + u32 count; + int res; + + res = hfs_extend_file(inode); + if (res) + return ERR_PTR(res); + HFS_I(inode)->phys_size = inode->i_size = + (loff_t)HFS_I(inode)->alloc_blocks * + HFS_SB(tree->sb)->alloc_blksz; + HFS_I(inode)->fs_blocks = inode->i_size >> + tree->sb->s_blocksize_bits; + inode_set_bytes(inode, inode->i_size); + count = inode->i_size >> tree->node_size_shift; + tree->free_nodes = count - tree->node_count; + tree->node_count = count; + } + + nidx = 0; + node = hfs_bnode_find(tree, nidx); + if (IS_ERR(node)) + return node; + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; + + off += node->page_offset; + 
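/* + * Each map record is a plain bitmap of allocated node numbers; scan it + * a byte at a time, claim the first clear bit and hand back a freshly + * initialised bnode for that index. + */ + 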
pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + idx = 0; + + for (;;) { + while (len) { + byte = data[off]; + if (byte != 0xff) { + for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { + if (!(byte & m)) { + idx += i; + data[off] |= m; + set_page_dirty(*pagep); + kunmap(*pagep); + tree->free_nodes--; + mark_inode_dirty(tree->inode); + hfs_bnode_put(node); + return hfs_bnode_create(tree, idx); + } + } + } + if (++off >= PAGE_CACHE_SIZE) { + kunmap(*pagep); + data = kmap(*++pagep); + off = 0; + } + idx += 8; + len--; + } + kunmap(*pagep); + nidx = node->next; + if (!nidx) { + printk(KERN_DEBUG "hfs: create new bmap node...\n"); + next_node = hfs_bmap_new_bmap(node, idx); + } else + next_node = hfs_bnode_find(tree, nidx); + hfs_bnode_put(node); + if (IS_ERR(next_node)) + return next_node; + node = next_node; + + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + } +} + +void hfs_bmap_free(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct page *page; + u16 off, len; + u32 nidx; + u8 *data, byte, m; + + dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + tree = node->tree; + nidx = node->this; + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + return; + len = hfs_brec_lenoff(node, 2, &off); + while (nidx >= len * 8) { + u32 i; + + nidx -= len * 8; + i = node->next; + hfs_bnode_put(node); + if (!i) { + /* panic */; + printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); + return; + } + node = hfs_bnode_find(tree, i); + if (IS_ERR(node)) + return; + if (node->type != HFS_NODE_MAP) { + /* panic */; + printk(KERN_CRIT "hfs: invalid bmap found! 
(%u,%d)\n", node->this, node->type); + hfs_bnode_put(node); + return; + } + len = hfs_brec_lenoff(node, 0, &off); + } + off += node->page_offset + nidx / 8; + page = node->page[off >> PAGE_CACHE_SHIFT]; + data = kmap(page); + off &= ~PAGE_CACHE_MASK; + m = 1 << (~nidx & 7); + byte = data[off]; + if (!(byte & m)) { + printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); + kunmap(page); + hfs_bnode_put(node); + return; + } + data[off] = byte & ~m; + set_page_dirty(page); + kunmap(page); + hfs_bnode_put(node); + tree->free_nodes++; + mark_inode_dirty(tree->inode); +} diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h new file mode 100644 index 00000000..2a1d712f --- /dev/null +++ b/fs/hfs/btree.h @@ -0,0 +1,168 @@ +/* + * linux/fs/hfs/btree.h + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + */ + +#include "hfs_fs.h" + +typedef int (*btree_keycmp)(const btree_key *, const btree_key *); + +#define NODE_HASH_SIZE 256 + +/* A HFS BTree held in memory */ +struct hfs_btree { + struct super_block *sb; + struct inode *inode; + btree_keycmp keycmp; + + u32 cnid; + u32 root; + u32 leaf_count; + u32 leaf_head; + u32 leaf_tail; + u32 node_count; + u32 free_nodes; + u32 attributes; + + unsigned int node_size; + unsigned int node_size_shift; + unsigned int max_key_len; + unsigned int depth; + + //unsigned int map1_size, map_size; + struct mutex tree_lock; + + unsigned int pages_per_bnode; + spinlock_t hash_lock; + struct hfs_bnode *node_hash[NODE_HASH_SIZE]; + int node_hash_cnt; +}; + +/* A HFS BTree node in memory */ +struct hfs_bnode { + struct hfs_btree *tree; + + u32 prev; + u32 this; + u32 next; + u32 parent; + + u16 num_recs; + u8 type; + u8 height; + + struct hfs_bnode *next_hash; + unsigned long flags; + wait_queue_head_t lock_wq; + atomic_t refcnt; + unsigned int page_offset; + struct page *page[0]; +}; + +#define HFS_BNODE_ERROR 0 +#define HFS_BNODE_NEW 1 +#define HFS_BNODE_DELETED 2 + +struct hfs_find_data { + btree_key *key; + btree_key *search_key; + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int record; + int keyoffset, keylength; + int entryoffset, entrylength; +}; + + +/* btree.c */ +extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp); +extern void hfs_btree_close(struct hfs_btree *); +extern void hfs_btree_write(struct hfs_btree *); +extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *); +extern void hfs_bmap_free(struct hfs_bnode *node); + +/* bnode.c */ +extern void hfs_bnode_read(struct hfs_bnode *, void *, int, int); +extern u16 hfs_bnode_read_u16(struct hfs_bnode *, int); +extern u8 hfs_bnode_read_u8(struct hfs_bnode *, int); +extern void hfs_bnode_read_key(struct hfs_bnode *, void *, int); +extern void hfs_bnode_write(struct hfs_bnode *, void *, int, int); +extern void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); +extern void hfs_bnode_write_u8(struct hfs_bnode *, int, u8); +extern void hfs_bnode_clear(struct hfs_bnode *, int, int); +extern void hfs_bnode_copy(struct hfs_bnode *, int, + struct hfs_bnode *, int, int); +extern void hfs_bnode_move(struct hfs_bnode *, int, int, int); +extern void hfs_bnode_dump(struct hfs_bnode *); +extern void hfs_bnode_unlink(struct hfs_bnode *); +extern struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); +extern struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); +extern void hfs_bnode_unhash(struct hfs_bnode *); +extern void hfs_bnode_free(struct hfs_bnode *); +extern struct hfs_bnode *hfs_bnode_create(struct 
hfs_btree *, u32); +extern void hfs_bnode_get(struct hfs_bnode *); +extern void hfs_bnode_put(struct hfs_bnode *); + +/* brec.c */ +extern u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); +extern u16 hfs_brec_keylen(struct hfs_bnode *, u16); +extern int hfs_brec_insert(struct hfs_find_data *, void *, int); +extern int hfs_brec_remove(struct hfs_find_data *); + +/* bfind.c */ +extern int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); +extern void hfs_find_exit(struct hfs_find_data *); +extern int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); +extern int hfs_brec_find(struct hfs_find_data *); +extern int hfs_brec_read(struct hfs_find_data *, void *, int); +extern int hfs_brec_goto(struct hfs_find_data *, int); + + +struct hfs_bnode_desc { + __be32 next; /* (V) Number of the next node at this level */ + __be32 prev; /* (V) Number of the prev node at this level */ + u8 type; /* (F) The type of node */ + u8 height; /* (F) The level of this node (leaves=1) */ + __be16 num_recs; /* (V) The number of records in this node */ + u16 reserved; +} __packed; + +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +struct hfs_btree_header_rec { + __be16 depth; /* (V) The number of levels in this B-tree */ + __be32 root; /* (V) The node number of the root node */ + __be32 leaf_count; /* (V) The number of leaf records */ + __be32 leaf_head; /* (V) The number of the first leaf node */ + __be32 leaf_tail; /* (V) The number of the last leaf node */ + __be16 node_size; /* (F) The number of bytes in a node (=512) */ + __be16 max_key_len; /* (F) The length of a key in an index node */ + __be32 node_count; /* (V) The total number of nodes */ + __be32 free_nodes; /* (V) The number of unused nodes */ + u16 reserved1; + __be32 clump_size; /* (F) clump size. not usually used. */ + u8 btree_type; /* (F) BTree type */ + u8 reserved2; + __be32 attributes; /* (F) attributes */ + u32 reserved3[16]; +} __packed; + +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not + used by hfsplus. */ +#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. + used by hfsplus. */ +#define HFS_TREE_VARIDXKEYS 0x00000004 /* variable key length instead of + max key length. use din catalog + b-tree but not in extents + b-tree (hfsplus). */ diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c new file mode 100644 index 00000000..424b0337 --- /dev/null +++ b/fs/hfs/catalog.c @@ -0,0 +1,356 @@ +/* + * linux/fs/hfs/catalog.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the functions related to the catalog B-tree. + * + * Cache code shamelessly stolen from + * linux/fs/inode.c Copyright (C) 1991, 1992 Linus Torvalds + * re-shamelessly stolen Copyright (C) 1997 Linus Torvalds + */ + +#include "hfs_fs.h" +#include "btree.h" + +/* + * hfs_cat_build_key() + * + * Given the ID of the parent and the name build a search key. 
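+ * Passing a NULL name zeroes CName and sets key_len to the 6-byte fixed + * part only, which is how thread records are looked up.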
+ */ +void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name) +{ + key->cat.reserved = 0; + key->cat.ParID = cpu_to_be32(parent); + if (name) { + hfs_asc2mac(sb, &key->cat.CName, name); + key->key_len = 6 + key->cat.CName.len; + } else { + memset(&key->cat.CName, 0, sizeof(struct hfs_name)); + key->key_len = 6; + } +} + +static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode) +{ + __be32 mtime = hfs_mtime(); + + memset(rec, 0, sizeof(*rec)); + if (S_ISDIR(inode->i_mode)) { + rec->type = HFS_CDR_DIR; + rec->dir.DirID = cpu_to_be32(cnid); + rec->dir.CrDat = mtime; + rec->dir.MdDat = mtime; + rec->dir.BkDat = 0; + rec->dir.UsrInfo.frView = cpu_to_be16(0xff); + return sizeof(struct hfs_cat_dir); + } else { + /* init some fields for the file record */ + rec->type = HFS_CDR_FIL; + rec->file.Flags = HFS_FIL_USED | HFS_FIL_THD; + if (!(inode->i_mode & S_IWUSR)) + rec->file.Flags |= HFS_FIL_LOCK; + rec->file.FlNum = cpu_to_be32(cnid); + rec->file.CrDat = mtime; + rec->file.MdDat = mtime; + rec->file.BkDat = 0; + rec->file.UsrWds.fdType = HFS_SB(inode->i_sb)->s_type; + rec->file.UsrWds.fdCreator = HFS_SB(inode->i_sb)->s_creator; + return sizeof(struct hfs_cat_file); + } +} + +static int hfs_cat_build_thread(struct super_block *sb, + hfs_cat_rec *rec, int type, + u32 parentid, struct qstr *name) +{ + rec->type = type; + memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved)); + rec->thread.ParID = cpu_to_be32(parentid); + hfs_asc2mac(sb, &rec->thread.CName, name); + return sizeof(struct hfs_cat_thread); +} + +/* + * create_entry() + * + * Add a new file or directory to the catalog B-tree and + * return a (struct hfs_cat_entry) for it in '*result'. + */ +int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) +{ + struct hfs_find_data fd; + struct super_block *sb; + union hfs_cat_rec entry; + int entry_size; + int err; + + dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); + if (dir->i_size >= HFS_MAX_VALENCE) + return -ENOSPC; + + sb = dir->i_sb; + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? + HFS_CDR_THD : HFS_CDR_FTH, + dir->i_ino, str); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto err2; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err2; + + hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); + entry_size = hfs_cat_build_record(&entry, cnid, inode); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + /* panic? */ + if (!err) + err = -EEXIST; + goto err1; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err1; + + dir->i_size++; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + hfs_find_exit(&fd); + return 0; + +err1: + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + if (!hfs_brec_find(&fd)) + hfs_brec_remove(&fd); +err2: + hfs_find_exit(&fd); + return err; +} + +/* + * hfs_cat_compare() + * + * Description: + * This is the comparison function used for the catalog B-tree. In + * comparing catalog B-tree entries, the parent id is the most + * significant field (compared as unsigned ints). 
The name field is + * the least significant (compared in "Macintosh lexical order", + * see hfs_strcmp() in string.c) + * Input Variable(s): + * struct hfs_cat_key *key1: pointer to the first key to compare + * struct hfs_cat_key *key2: pointer to the second key to compare + * Output Variable(s): + * NONE + * Returns: + * int: negative if key1key2, and 0 if key1==key2 + * Preconditions: + * key1 and key2 point to "valid" (struct hfs_cat_key)s. + * Postconditions: + * This function has no side-effects + */ +int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) +{ + int retval; + + retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID); + if (!retval) + retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, + key2->cat.CName.name, key2->cat.CName.len); + + return retval; +} + +/* Try to get a catalog entry for given catalog id */ +// move to read_super??? +int hfs_cat_find_brec(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd) +{ + hfs_cat_rec rec; + int res, len, type; + + hfs_cat_build_key(sb, fd->search_key, cnid, NULL); + res = hfs_brec_read(fd, &rec, sizeof(rec)); + if (res) + return res; + + type = rec.type; + if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { + printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + return -EIO; + } + + fd->search_key->cat.ParID = rec.thread.ParID; + len = fd->search_key->cat.CName.len = rec.thread.CName.len; + if (len > HFS_NAMELEN) { + printk(KERN_ERR "hfs: bad catalog namelength\n"); + return -EIO; + } + memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); + return hfs_brec_find(fd); +} + + +/* + * hfs_cat_delete() + * + * Delete the indicated file or directory. + * The associated thread is also removed unless ('with_thread'==0). + */ +int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) +{ + struct super_block *sb; + struct hfs_find_data fd; + struct list_head *pos; + int res, type; + + dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + sb = dir->i_sb; + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + + hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); + res = hfs_brec_find(&fd); + if (res) + goto out; + + type = hfs_bnode_read_u8(fd.bnode, fd.entryoffset); + if (type == HFS_CDR_FIL) { + struct hfs_cat_file file; + hfs_bnode_read(fd.bnode, &file, fd.entryoffset, sizeof(file)); + if (be32_to_cpu(file.FlNum) == cnid) { +#if 0 + hfs_free_fork(sb, &file, HFS_FK_DATA); +#endif + hfs_free_fork(sb, &file, HFS_FK_RSRC); + } + } + + list_for_each(pos, &HFS_I(dir)->open_dir_list) { + struct hfs_readdir_data *rd = + list_entry(pos, struct hfs_readdir_data, list); + if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) + rd->file->f_pos--; + } + + res = hfs_brec_remove(&fd); + if (res) + goto out; + + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + res = hfs_brec_find(&fd); + if (!res) { + res = hfs_brec_remove(&fd); + if (res) + goto out; + } + + dir->i_size--; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + res = 0; +out: + hfs_find_exit(&fd); + + return res; +} + +/* + * hfs_cat_move() + * + * Rename a file or directory, possibly to a new directory. + * If the destination exists it is removed and a + * (struct hfs_cat_entry) for it is returned in '*result'. 
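+ * The catalog record itself is copied to the new key unchanged; any + * thread record is deleted and re-created under the destination + * directory afterwards.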
+ */ +int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name) +{ + struct super_block *sb; + struct hfs_find_data src_fd, dst_fd; + union hfs_cat_rec entry; + int entry_size, type; + int err; + + dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, + dst_dir->i_ino, dst_name->name); + sb = src_dir->i_sb; + hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + dst_fd = src_fd; + + /* find the old dir entry and read the data */ + hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, + src_fd.entrylength); + + /* create new dir entry with the data from the old entry */ + hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + + err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); + if (err) + goto out; + dst_dir->i_size++; + dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dst_dir); + + /* finally remove the old entry */ + hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + src_dir->i_size--; + src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(src_dir); + + type = entry.type; + if (type == HFS_CDR_FIL && !(entry.file.Flags & HFS_FIL_THD)) + goto out; + + /* remove old thread entry */ + hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + + /* create new thread entry */ + hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD, + dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + err = hfs_brec_insert(&dst_fd, &entry, entry_size); +out: + hfs_bnode_put(dst_fd.bnode); + hfs_find_exit(&src_fd); + return err; +} diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c new file mode 100644 index 00000000..b4d70b13 --- /dev/null +++ b/fs/hfs/dir.c @@ -0,0 +1,316 @@ +/* + * linux/fs/hfs/dir.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains directory-related functions independent of which + * scheme is being used to represent forks. 
+ * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include "hfs_fs.h" +#include "btree.h" + +/* + * hfs_lookup() + */ +static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + hfs_cat_rec rec; + struct hfs_find_data fd; + struct inode *inode = NULL; + int res; + + hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); + res = hfs_brec_read(&fd, &rec, sizeof(rec)); + if (res) { + hfs_find_exit(&fd); + if (res == -ENOENT) { + /* No such entry */ + inode = NULL; + goto done; + } + return ERR_PTR(res); + } + inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); + if (!inode) + return ERR_PTR(-EACCES); +done: + d_add(dentry, inode); + return NULL; +} + +/* + * hfs_readdir + */ +static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + int len, err; + char strbuf[HFS_MAX_NAMELEN]; + union hfs_cat_rec entry; + struct hfs_find_data fd; + struct hfs_readdir_data *rd; + u16 type; + + if (filp->f_pos >= inode->i_size) + return 0; + + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + switch ((u32)filp->f_pos) { + case 0: + /* This is completely artificial... */ + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + if (entry.type != HFS_CDR_THD) { + printk(KERN_ERR "hfs: bad catalog folder thread\n"); + err = -EIO; + goto out; + } + //if (fd.entrylength < HFS_MIN_THREAD_SZ) { + // printk(KERN_ERR "hfs: truncated catalog thread\n"); + // err = -EIO; + // goto out; + //} + if (filldir(dirent, "..", 2, 1, + be32_to_cpu(entry.thread.ParID), DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + default: + if (filp->f_pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, filp->f_pos - 1); + if (err) + goto out; + } + + for (;;) { + if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { + printk(KERN_ERR "hfs: walked past end of dir\n"); + err = -EIO; + goto out; + } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + type = entry.type; + len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); + if (type == HFS_CDR_DIR) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) { + printk(KERN_ERR "hfs: small dir entry\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.dir.DirID), DT_DIR)) + break; + } else if (type == HFS_CDR_FIL) { + if (fd.entrylength < sizeof(struct hfs_cat_file)) { + printk(KERN_ERR "hfs: small file entry\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.file.FlNum), DT_REG)) + break; + } else { + printk(KERN_ERR "hfs: bad catalog entry type %d\n", type); + err = -EIO; + goto out; + } + filp->f_pos++; + if (filp->f_pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, 1); + if (err) + goto out; + } + rd = filp->private_data; + if (!rd) { + rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); + if (!rd) { + err = -ENOMEM; + goto out; + } + 
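/* + * Register this reader on the inode's open_dir_list so that catalog + * deletions (see hfs_cat_delete) can pull back the f_pos of concurrent + * readers rather than letting them skip an entry. + */ + 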
filp->private_data = rd; + rd->file = filp; + list_add(&rd->list, &HFS_I(inode)->open_dir_list); + } + memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); +out: + hfs_find_exit(&fd); + return err; +} + +static int hfs_dir_release(struct inode *inode, struct file *file) +{ + struct hfs_readdir_data *rd = file->private_data; + if (rd) { + list_del(&rd->list); + kfree(rd); + } + return 0; +} + +/* + * hfs_create() + * + * This is the create() entry in the inode_operations structure for + * regular HFS directories. The purpose is to create a new file in + * a directory and return a corresponding inode, given the inode for + * the directory and the name (and its length) of the new file. + */ +static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + int res; + + inode = hfs_new_inode(dir, &dentry->d_name, mode); + if (!inode) + return -ENOSPC; + + res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + inode->i_nlink = 0; + hfs_delete_inode(inode); + iput(inode); + return res; + } + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_mkdir() + * + * This is the mkdir() entry in the inode_operations structure for + * regular HFS directories. The purpose is to create a new directory + * in a directory, given the inode for the parent directory and the + * name (and its length) of the new directory. + */ +static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int res; + + inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); + if (!inode) + return -ENOSPC; + + res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + inode->i_nlink = 0; + hfs_delete_inode(inode); + iput(inode); + return res; + } + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_remove() + * + * This serves as both unlink() and rmdir() in the inode_operations + * structure for regular HFS directories. The purpose is to delete + * an existing child, given the inode for the parent directory and + * the name (and its length) of the existing directory. + * + * HFS does not have hardlinks, so both rmdir and unlink set the + * link count to 0. The only difference is the emptiness check. + */ +static int hfs_remove(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int res; + + if (S_ISDIR(inode->i_mode) && inode->i_size != 2) + return -ENOTEMPTY; + res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); + if (res) + return res; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + hfs_delete_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_rename() + * + * This is the rename() entry in the inode_operations structure for + * regular HFS directories. The purpose is to rename an existing + * file or directory, given the inode for the current directory and + * the name (and its length) of the existing file/directory and the + * inode for the new directory and the name (and its length) of the + * new file/directory. + * XXX: how do you handle must_be dir? 
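+ * An existing destination is unlinked first; on success the cached + * cat_key of the moved inode is rebuilt for the new parent and name.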
+ */ +static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int res; + + /* Unlink destination if it already exists */ + if (new_dentry->d_inode) { + res = hfs_remove(new_dir, new_dentry); + if (res) + return res; + } + + res = hfs_cat_move(old_dentry->d_inode->i_ino, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + if (!res) + hfs_cat_build_key(old_dir->i_sb, + (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key, + new_dir->i_ino, &new_dentry->d_name); + return res; +} + +const struct file_operations hfs_dir_operations = { + .read = generic_read_dir, + .readdir = hfs_readdir, + .llseek = generic_file_llseek, + .release = hfs_dir_release, +}; + +const struct inode_operations hfs_dir_inode_operations = { + .create = hfs_create, + .lookup = hfs_lookup, + .unlink = hfs_remove, + .mkdir = hfs_mkdir, + .rmdir = hfs_remove, + .rename = hfs_rename, + .setattr = hfs_inode_setattr, +}; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c new file mode 100644 index 00000000..2c16316d --- /dev/null +++ b/fs/hfs/extent.c @@ -0,0 +1,525 @@ +/* + * linux/fs/hfs/extent.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the functions related to the extents B-tree. + */ + +#include + +#include "hfs_fs.h" +#include "btree.h" + +/*================ File-local functions ================*/ + +/* + * build_key + */ +static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type) +{ + key->key_len = 7; + key->ext.FkType = type; + key->ext.FNum = cpu_to_be32(cnid); + key->ext.FABN = cpu_to_be16(block); +} + +/* + * hfs_ext_compare() + * + * Description: + * This is the comparison function used for the extents B-tree. In + * comparing extent B-tree entries, the file id is the most + * significant field (compared as unsigned ints); the fork type is + * the second most significant field (compared as unsigned chars); + * and the allocation block number field is the least significant + * (compared as unsigned ints). + * Input Variable(s): + * struct hfs_ext_key *key1: pointer to the first key to compare + * struct hfs_ext_key *key2: pointer to the second key to compare + * Output Variable(s): + * NONE + * Returns: + * int: negative if key1key2, and 0 if key1==key2 + * Preconditions: + * key1 and key2 point to "valid" (struct hfs_ext_key)s. + * Postconditions: + * This function has no side-effects */ +int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) +{ + __be32 fnum1, fnum2; + __be16 block1, block2; + + fnum1 = key1->ext.FNum; + fnum2 = key2->ext.FNum; + if (fnum1 != fnum2) + return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1; + if (key1->ext.FkType != key2->ext.FkType) + return key1->ext.FkType < key2->ext.FkType ? -1 : 1; + + block1 = key1->ext.FABN; + block2 = key2->ext.FABN; + if (block1 == block2) + return 0; + return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1; +} + +/* + * hfs_ext_find_block + * + * Find a block within an extent record + */ +static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) +{ + int i; + u16 count; + + for (i = 0; i < 3; ext++, i++) { + count = be16_to_cpu(ext->count); + if (off < count) + return be16_to_cpu(ext->block) + off; + off -= count; + } + /* panic? 
*/ + return 0; +} + +static int hfs_ext_block_count(struct hfs_extent *ext) +{ + int i; + u16 count = 0; + + for (i = 0; i < 3; ext++, i++) + count += be16_to_cpu(ext->count); + return count; +} + +static u16 hfs_ext_lastblock(struct hfs_extent *ext) +{ + int i; + + ext += 2; + for (i = 0; i < 2; ext--, i++) + if (ext->count) + break; + return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); +} + +static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +{ + int res; + + hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start, + HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); + res = hfs_brec_find(fd); + if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { + if (res != -ENOENT) + return; + hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + } else { + if (res) + return; + hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); + HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; + } +} + +void hfs_ext_write_extent(struct inode *inode) +{ + struct hfs_find_data fd; + + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + __hfs_ext_write_extent(inode, &fd); + hfs_find_exit(&fd); + } +} + +static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, + u32 cnid, u32 block, u8 type) +{ + int res; + + hfs_ext_build_key(fd->search_key, cnid, block, type); + fd->key->ext.FNum = 0; + res = hfs_brec_find(fd); + if (res && res != -ENOENT) + return res; + if (fd->key->ext.FNum != fd->search_key->ext.FNum || + fd->key->ext.FkType != fd->search_key->ext.FkType) + return -ENOENT; + if (fd->entrylength != sizeof(hfs_extent_rec)) + return -EIO; + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec)); + return 0; +} + +static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) +{ + int res; + + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) + __hfs_ext_write_extent(inode, fd); + + res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, + block, HFS_IS_RSRC(inode) ? 
HFS_FK_RSRC : HFS_FK_DATA); + if (!res) { + HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN); + HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents); + } else { + HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + } + return res; +} + +static int hfs_ext_read_extent(struct inode *inode, u16 block) +{ + struct hfs_find_data fd; + int res; + + if (block >= HFS_I(inode)->cached_start && + block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) + return 0; + + hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + res = __hfs_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + return res; +} + +static void hfs_dump_extent(struct hfs_extent *extent) +{ + int i; + + dprint(DBG_EXTENT, " "); + for (i = 0; i < 3; i++) + dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + dprint(DBG_EXTENT, "\n"); +} + +static int hfs_add_extent(struct hfs_extent *extent, u16 offset, + u16 alloc_block, u16 block_count) +{ + u16 count, start; + int i; + + hfs_dump_extent(extent); + for (i = 0; i < 3; extent++, i++) { + count = be16_to_cpu(extent->count); + if (offset == count) { + start = be16_to_cpu(extent->block); + if (alloc_block != start + count) { + if (++i >= 3) + return -ENOSPC; + extent++; + extent->block = cpu_to_be16(alloc_block); + } else + block_count += count; + extent->count = cpu_to_be16(block_count); + return 0; + } else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +} + +static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent, + u16 offset, u16 block_nr) +{ + u16 count, start; + int i; + + hfs_dump_extent(extent); + for (i = 0; i < 3; extent++, i++) { + count = be16_to_cpu(extent->count); + if (offset == count) + goto found; + else if (offset < count) + break; + offset -= count; + } + /* panic? 
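+	 * (the offset is not covered by this extent record, so return -EIO
+	 * rather than guess which blocks to free)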
*/ + return -EIO; +found: + for (;;) { + start = be16_to_cpu(extent->block); + if (count <= block_nr) { + hfs_clear_vbm_bits(sb, start, count); + extent->block = 0; + extent->count = 0; + block_nr -= count; + } else { + count -= block_nr; + hfs_clear_vbm_bits(sb, start + count, block_nr); + extent->count = cpu_to_be16(count); + block_nr = 0; + } + if (!block_nr || !i) + return 0; + i--; + extent--; + count = be16_to_cpu(extent->count); + } +} + +int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) +{ + struct hfs_find_data fd; + u32 total_blocks, blocks, start; + u32 cnid = be32_to_cpu(file->FlNum); + struct hfs_extent *extent; + int res, i; + + if (type == HFS_FK_DATA) { + total_blocks = be32_to_cpu(file->PyLen); + extent = file->ExtRec; + } else { + total_blocks = be32_to_cpu(file->RPyLen); + extent = file->RExtRec; + } + total_blocks /= HFS_SB(sb)->alloc_blksz; + if (!total_blocks) + return 0; + + blocks = 0; + for (i = 0; i < 3; extent++, i++) + blocks += be16_to_cpu(extent[i].count); + + res = hfs_free_extents(sb, extent, blocks, blocks); + if (res) + return res; + if (total_blocks == blocks) + return 0; + + hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + do { + res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); + if (res) + break; + start = be16_to_cpu(fd.key->ext.FABN); + hfs_free_extents(sb, extent, total_blocks - start, total_blocks); + hfs_brec_remove(&fd); + total_blocks = start; + } while (total_blocks > blocks); + hfs_find_exit(&fd); + + return res; +} + +/* + * hfs_get_block + */ +int hfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb; + u16 dblock, ablock; + int res; + + sb = inode->i_sb; + /* Convert inode block to disk allocation block */ + ablock = (u32)block / HFS_SB(sb)->fs_div; + + if (block >= HFS_I(inode)->fs_blocks) { + if (block > HFS_I(inode)->fs_blocks || !create) + return -EIO; + if (ablock >= HFS_I(inode)->alloc_blocks) { + res = hfs_extend_file(inode); + if (res) + return res; + } + } else + create = 0; + + if (ablock < HFS_I(inode)->first_blocks) { + dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock); + goto done; + } + + mutex_lock(&HFS_I(inode)->extents_lock); + res = hfs_ext_read_extent(inode, ablock); + if (!res) + dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, + ablock - HFS_I(inode)->cached_start); + else { + mutex_unlock(&HFS_I(inode)->extents_lock); + return -EIO; + } + mutex_unlock(&HFS_I(inode)->extents_lock); + +done: + map_bh(bh_result, sb, HFS_SB(sb)->fs_start + + dblock * HFS_SB(sb)->fs_div + + (u32)block % HFS_SB(sb)->fs_div); + + if (create) { + set_buffer_new(bh_result); + HFS_I(inode)->phys_size += sb->s_blocksize; + HFS_I(inode)->fs_blocks++; + inode_add_bytes(inode, sb->s_blocksize); + mark_inode_dirty(inode); + } + return 0; +} + +int hfs_extend_file(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u32 start, len, goal; + int res; + + mutex_lock(&HFS_I(inode)->extents_lock); + if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) + goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); + else { + res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks); + if (res) + goto out; + goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents); + } + + len = HFS_I(inode)->clump_blocks; + start = hfs_vbm_search_free(sb, goal, &len); + if (!len) { + res = -ENOSPC; + goto out; + } + + dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + if (HFS_I(inode)->alloc_blocks == 
HFS_I(inode)->first_blocks) { + if (!HFS_I(inode)->first_blocks) { + dprint(DBG_EXTENT, "first extents\n"); + /* no extents yet */ + HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); + HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); + res = 0; + } else { + /* try to append to extents in inode */ + res = hfs_add_extent(HFS_I(inode)->first_extents, + HFS_I(inode)->alloc_blocks, + start, len); + if (res == -ENOSPC) + goto insert_extent; + } + if (!res) { + hfs_dump_extent(HFS_I(inode)->first_extents); + HFS_I(inode)->first_blocks += len; + } + } else { + res = hfs_add_extent(HFS_I(inode)->cached_extents, + HFS_I(inode)->alloc_blocks - + HFS_I(inode)->cached_start, + start, len); + if (!res) { + hfs_dump_extent(HFS_I(inode)->cached_extents); + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; + HFS_I(inode)->cached_blocks += len; + } else if (res == -ENOSPC) + goto insert_extent; + } +out: + mutex_unlock(&HFS_I(inode)->extents_lock); + if (!res) { + HFS_I(inode)->alloc_blocks += len; + mark_inode_dirty(inode); + if (inode->i_ino < HFS_FIRSTUSER_CNID) + set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; + } + return res; + +insert_extent: + dprint(DBG_EXTENT, "insert new extent\n"); + hfs_ext_write_extent(inode); + + memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); + HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); + HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len); + hfs_dump_extent(HFS_I(inode)->cached_extents); + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW; + HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks; + HFS_I(inode)->cached_blocks = len; + + res = 0; + goto out; +} + +void hfs_file_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + u16 blk_cnt, alloc_cnt, start; + u32 size; + int res; + + dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, + (long long)HFS_I(inode)->phys_size, inode->i_size); + if (inode->i_size > HFS_I(inode)->phys_size) { + struct address_space *mapping = inode->i_mapping; + void *fsdata; + struct page *page; + int res; + + /* XXX: Can use generic_cont_expand? 
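+		 * The zero-length pagecache_write_begin/write_end pair below
+		 * appears to have the same effect, relying on cont_write_begin()
+		 * to zero-fill the range between the old phys_size and the new
+		 * i_size.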
*/ + size = inode->i_size - 1; + res = pagecache_write_begin(NULL, mapping, size+1, 0, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (!res) { + res = pagecache_write_end(NULL, mapping, size+1, 0, 0, + page, fsdata); + } + if (res) + inode->i_size = HFS_I(inode)->phys_size; + return; + } else if (inode->i_size == HFS_I(inode)->phys_size) + return; + size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; + blk_cnt = size / HFS_SB(sb)->alloc_blksz; + alloc_cnt = HFS_I(inode)->alloc_blocks; + if (blk_cnt == alloc_cnt) + goto out; + + mutex_lock(&HFS_I(inode)->extents_lock); + hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + while (1) { + if (alloc_cnt == HFS_I(inode)->first_blocks) { + hfs_free_extents(sb, HFS_I(inode)->first_extents, + alloc_cnt, alloc_cnt - blk_cnt); + hfs_dump_extent(HFS_I(inode)->first_extents); + HFS_I(inode)->first_blocks = blk_cnt; + break; + } + res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt); + if (res) + break; + start = HFS_I(inode)->cached_start; + hfs_free_extents(sb, HFS_I(inode)->cached_extents, + alloc_cnt - start, alloc_cnt - blk_cnt); + hfs_dump_extent(HFS_I(inode)->cached_extents); + if (blk_cnt > start) { + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; + break; + } + alloc_cnt = start; + HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + hfs_brec_remove(&fd); + } + hfs_find_exit(&fd); + mutex_unlock(&HFS_I(inode)->extents_lock); + + HFS_I(inode)->alloc_blocks = blk_cnt; +out: + HFS_I(inode)->phys_size = inode->i_size; + HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); + mark_inode_dirty(inode); +} diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h new file mode 100644 index 00000000..6f194d07 --- /dev/null +++ b/fs/hfs/hfs.h @@ -0,0 +1,289 @@ +/* + * linux/fs/hfs/hfs.h + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + */ + +#ifndef _HFS_H +#define _HFS_H + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +/* various FIXED size parameters */ +#define HFS_SECTOR_SIZE 512 /* size of an HFS sector */ +#define HFS_SECTOR_SIZE_BITS 9 /* log_2(HFS_SECTOR_SIZE) */ +#define HFS_NAMELEN 31 /* maximum length of an HFS filename */ +#define HFS_MAX_NAMELEN 128 +#define HFS_MAX_VALENCE 32767U + +/* Meanings of the drAtrb field of the MDB, + * Reference: _Inside Macintosh: Files_ p. 
2-61 + */ +#define HFS_SB_ATTRIB_HLOCK (1 << 7) +#define HFS_SB_ATTRIB_UNMNT (1 << 8) +#define HFS_SB_ATTRIB_SPARED (1 << 9) +#define HFS_SB_ATTRIB_INCNSTNT (1 << 11) +#define HFS_SB_ATTRIB_SLOCK (1 << 15) + +/* Some special File ID numbers */ +#define HFS_POR_CNID 1 /* Parent Of the Root */ +#define HFS_ROOT_CNID 2 /* ROOT directory */ +#define HFS_EXT_CNID 3 /* EXTents B-tree */ +#define HFS_CAT_CNID 4 /* CATalog B-tree */ +#define HFS_BAD_CNID 5 /* BAD blocks file */ +#define HFS_ALLOC_CNID 6 /* ALLOCation file (HFS+) */ +#define HFS_START_CNID 7 /* STARTup file (HFS+) */ +#define HFS_ATTR_CNID 8 /* ATTRibutes file (HFS+) */ +#define HFS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFS_FIRSTUSER_CNID 16 + +/* values for hfs_cat_rec.cdrType */ +#define HFS_CDR_DIR 0x01 /* folder (directory) */ +#define HFS_CDR_FIL 0x02 /* file */ +#define HFS_CDR_THD 0x03 /* folder (directory) thread */ +#define HFS_CDR_FTH 0x04 /* file thread */ + +/* legal values for hfs_ext_key.FkType and hfs_file.fork */ +#define HFS_FK_DATA 0x00 +#define HFS_FK_RSRC 0xFF + +/* bits in hfs_fil_entry.Flags */ +#define HFS_FIL_LOCK 0x01 /* locked */ +#define HFS_FIL_THD 0x02 /* file thread */ +#define HFS_FIL_DOPEN 0x04 /* data fork open */ +#define HFS_FIL_ROPEN 0x08 /* resource fork open */ +#define HFS_FIL_DIR 0x10 /* directory (always clear) */ +#define HFS_FIL_NOCOPY 0x40 /* copy-protected file */ +#define HFS_FIL_USED 0x80 /* open */ + +/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */ +#define HFS_DIR_LOCK 0x01 /* locked */ +#define HFS_DIR_THD 0x02 /* directory thread */ +#define HFS_DIR_INEXPFOLDER 0x04 /* in a shared area */ +#define HFS_DIR_MOUNTED 0x08 /* mounted */ +#define HFS_DIR_DIR 0x10 /* directory (always set) */ +#define HFS_DIR_EXPFOLDER 0x20 /* share point */ + +/* bits hfs_finfo.fdFlags */ +#define HFS_FLG_INITED 0x0100 +#define HFS_FLG_LOCKED 0x1000 +#define HFS_FLG_INVISIBLE 0x4000 + +/*======== HFS structures as they appear on the disk ========*/ + +/* Pascal-style string of up to 31 characters */ +struct hfs_name { + u8 len; + u8 name[HFS_NAMELEN]; +} __packed; + +struct hfs_point { + __be16 v; + __be16 h; +} __packed; + +struct hfs_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + +struct hfs_finfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfs_point fdLocation; + __be16 fdFldr; +} __packed; + +struct hfs_fxinfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +struct hfs_dinfo { + struct hfs_rect frRect; + __be16 frFlags; + struct hfs_point frLocation; + __be16 frView; +} __packed; + +struct hfs_dxinfo { + struct hfs_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +union hfs_finder_info { + struct { + struct hfs_finfo finfo; + struct hfs_fxinfo fxinfo; + } file; + struct { + struct hfs_dinfo dinfo; + struct hfs_dxinfo dxinfo; + } dir; +} __packed; + +/* Cast to a pointer to a generic bkey */ +#define HFS_BKEY(X) (((void)((X)->KeyLen)), ((struct hfs_bkey *)(X))) + +/* The key used in the catalog b-tree: */ +struct hfs_cat_key { + u8 key_len; /* number of bytes in the key */ + u8 reserved; /* padding */ + __be32 ParID; /* CNID of the parent dir */ + struct hfs_name CName; /* The filename of the entry */ +} __packed; + +/* The key used in the extents b-tree: */ +struct hfs_ext_key { + u8 key_len; /* number of bytes in the key */ + u8 FkType; /* HFS_FK_{DATA,RSRC} */ + __be32 FNum; /* The File ID of the file */ + __be16 
FABN; /* allocation blocks number*/ +} __packed; + +typedef union hfs_btree_key { + u8 key_len; /* number of bytes in the key */ + struct hfs_cat_key cat; + struct hfs_ext_key ext; +} hfs_btree_key; + +#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8)) +#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8)) + +typedef union hfs_btree_key btree_key; + +struct hfs_extent { + __be16 block; + __be16 count; +}; +typedef struct hfs_extent hfs_extent_rec[3]; + +/* The catalog record for a file */ +struct hfs_cat_file { + s8 type; /* The type of entry */ + u8 reserved; + u8 Flags; /* Flags such as read-only */ + s8 Typ; /* file version number = 0 */ + struct hfs_finfo UsrWds; /* data used by the Finder */ + __be32 FlNum; /* The CNID */ + __be16 StBlk; /* obsolete */ + __be32 LgLen; /* The logical EOF of the data fork*/ + __be32 PyLen; /* The physical EOF of the data fork */ + __be16 RStBlk; /* obsolete */ + __be32 RLgLen; /* The logical EOF of the rsrc fork */ + __be32 RPyLen; /* The physical EOF of the rsrc fork */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modified date */ + __be32 BkDat; /* The last backup date */ + struct hfs_fxinfo FndrInfo; /* more data for the Finder */ + __be16 ClpSize; /* number of bytes to allocate + when extending files */ + hfs_extent_rec ExtRec; /* first extent record + for the data fork */ + hfs_extent_rec RExtRec; /* first extent record + for the resource fork */ + u32 Resrv; /* reserved by Apple */ +} __packed; + +/* the catalog record for a directory */ +struct hfs_cat_dir { + s8 type; /* The type of entry */ + u8 reserved; + __be16 Flags; /* flags */ + __be16 Val; /* Valence: number of files and + dirs in the directory */ + __be32 DirID; /* The CNID */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modification date */ + __be32 BkDat; /* The last backup date */ + struct hfs_dinfo UsrInfo; /* data used by the Finder */ + struct hfs_dxinfo FndrInfo; /* more data used by Finder */ + u8 Resrv[16]; /* reserved by Apple */ +} __packed; + +/* the catalog record for a thread */ +struct hfs_cat_thread { + s8 type; /* The type of entry */ + u8 reserved[9]; /* reserved by Apple */ + __be32 ParID; /* CNID of parent directory */ + struct hfs_name CName; /* The name of this entry */ +} __packed; + +/* A catalog tree record */ +typedef union hfs_cat_rec { + s8 type; /* The type of entry */ + struct hfs_cat_file file; + struct hfs_cat_dir dir; + struct hfs_cat_thread thread; +} hfs_cat_rec; + +struct hfs_mdb { + __be16 drSigWord; /* Signature word indicating fs type */ + __be32 drCrDate; /* fs creation date/time */ + __be32 drLsMod; /* fs modification date/time */ + __be16 drAtrb; /* fs attributes */ + __be16 drNmFls; /* number of files in root directory */ + __be16 drVBMSt; /* location (in 512-byte blocks) + of the volume bitmap */ + __be16 drAllocPtr; /* location (in allocation blocks) + to begin next allocation search */ + __be16 drNmAlBlks; /* number of allocation blocks */ + __be32 drAlBlkSiz; /* bytes in an allocation block */ + __be32 drClpSiz; /* clumpsize, the number of bytes to + allocate when extending a file */ + __be16 drAlBlSt; /* location (in 512-byte blocks) + of the first allocation block */ + __be32 drNxtCNID; /* CNID to assign to the next + file or directory created */ + __be16 drFreeBks; /* number of free allocation blocks */ + u8 drVN[28]; /* the volume label */ + __be32 drVolBkUp; /* fs backup date/time */ + __be16 drVSeqNum; /* backup sequence number */ + __be32 drWrCnt; /* fs write count 
*/ + __be32 drXTClpSiz; /* clumpsize for the extents B-tree */ + __be32 drCTClpSiz; /* clumpsize for the catalog B-tree */ + __be16 drNmRtDirs; /* number of directories in + the root directory */ + __be32 drFilCnt; /* number of files in the fs */ + __be32 drDirCnt; /* number of directories in the fs */ + u8 drFndrInfo[32]; /* data used by the Finder */ + __be16 drEmbedSigWord; /* embedded volume signature */ + __be32 drEmbedExtent; /* starting block number (xdrStABN) + and number of allocation blocks + (xdrNumABlks) occupied by embedded + volume */ + __be32 drXTFlSize; /* bytes in the extents B-tree */ + hfs_extent_rec drXTExtRec; /* extents B-tree's first 3 extents */ + __be32 drCTFlSize; /* bytes in the catalog B-tree */ + hfs_extent_rec drCTExtRec; /* catalog B-tree's first 3 extents */ +} __packed; + +/*======== Data structures kept in memory ========*/ + +struct hfs_readdir_data { + struct list_head list; + struct file *file; + struct hfs_cat_key key; +}; + +#endif diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h new file mode 100644 index 00000000..ad97c2d5 --- /dev/null +++ b/fs/hfs/hfs_fs.h @@ -0,0 +1,276 @@ +/* + * linux/fs/hfs/hfs_fs.h + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + */ + +#ifndef _LINUX_HFS_FS_H +#define _LINUX_HFS_FS_H + +#include +#include +#include +#include +#include + +#include +#include + +#include "hfs.h" + +#define DBG_BNODE_REFS 0x00000001 +#define DBG_BNODE_MOD 0x00000002 +#define DBG_CAT_MOD 0x00000004 +#define DBG_INODE 0x00000008 +#define DBG_SUPER 0x00000010 +#define DBG_EXTENT 0x00000020 +#define DBG_BITMAP 0x00000040 + +//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) +//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#define DBG_MASK (0) + +#define dprint(flg, fmt, args...) 
\ + if (flg & DBG_MASK) printk(fmt , ## args) + +/* + * struct hfs_inode_info + * + * The HFS-specific part of a Linux (struct inode) + */ +struct hfs_inode_info { + atomic_t opencnt; + + unsigned int flags; + + /* to deal with localtime ugliness */ + int tz_secondswest; + + struct hfs_cat_key cat_key; + + struct list_head open_dir_list; + struct inode *rsrc_inode; + + struct mutex extents_lock; + + u16 alloc_blocks, clump_blocks; + sector_t fs_blocks; + /* Allocation extents from catlog record or volume header */ + hfs_extent_rec first_extents; + u16 first_blocks; + hfs_extent_rec cached_extents; + u16 cached_start, cached_blocks; + + loff_t phys_size; + struct inode vfs_inode; +}; + +#define HFS_FLG_RSRC 0x0001 +#define HFS_FLG_EXT_DIRTY 0x0002 +#define HFS_FLG_EXT_NEW 0x0004 + +#define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) + +/* + * struct hfs_sb_info + * + * The HFS-specific part of a Linux (struct super_block) + */ +struct hfs_sb_info { + struct buffer_head *mdb_bh; /* The hfs_buffer + holding the real + superblock (aka VIB + or MDB) */ + struct hfs_mdb *mdb; + struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding + the alternate superblock */ + struct hfs_mdb *alt_mdb; + __be32 *bitmap; /* The page holding the + allocation bitmap */ + struct hfs_btree *ext_tree; /* Information about + the extents b-tree */ + struct hfs_btree *cat_tree; /* Information about + the catalog b-tree */ + u32 file_count; /* The number of + regular files in + the filesystem */ + u32 folder_count; /* The number of + directories in the + filesystem */ + u32 next_id; /* The next available + file id number */ + u32 clumpablks; /* The number of allocation + blocks to try to add when + extending a file */ + u32 fs_start; /* The first 512-byte + block represented + in the bitmap */ + u32 part_start; + u16 root_files; /* The number of + regular + (non-directory) + files in the root + directory */ + u16 root_dirs; /* The number of + directories in the + root directory */ + u16 fs_ablocks; /* The number of + allocation blocks + in the filesystem */ + u16 free_ablocks; /* the number of unused + allocation blocks + in the filesystem */ + u32 alloc_blksz; /* The size of an + "allocation block" */ + int s_quiet; /* Silent failure when + changing owner or mode? 
*/ + __be32 s_type; /* Type for new files */ + __be32 s_creator; /* Creator for new files */ + umode_t s_file_umask; /* The umask applied to the + permissions on all files */ + umode_t s_dir_umask; /* The umask applied to the + permissions on all dirs */ + uid_t s_uid; /* The uid of all files */ + gid_t s_gid; /* The gid of all files */ + + int session, part; + + struct nls_table *nls_io, *nls_disk; + + struct mutex bitmap_lock; + + unsigned long flags; + + u16 blockoffset; + + int fs_div; +}; + +#define HFS_FLG_BITMAP_DIRTY 0 +#define HFS_FLG_MDB_DIRTY 1 +#define HFS_FLG_ALT_MDB_DIRTY 2 + +/* bitmap.c */ +extern u32 hfs_vbm_search_free(struct super_block *, u32, u32 *); +extern int hfs_clear_vbm_bits(struct super_block *, u16, u16); + +/* catalog.c */ +extern int hfs_cat_keycmp(const btree_key *, const btree_key *); +struct hfs_find_data; +extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *); +extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *); +extern int hfs_cat_delete(u32, struct inode *, struct qstr *); +extern int hfs_cat_move(u32, struct inode *, struct qstr *, + struct inode *, struct qstr *); +extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *); + +/* dir.c */ +extern const struct file_operations hfs_dir_operations; +extern const struct inode_operations hfs_dir_inode_operations; + +/* extent.c */ +extern int hfs_ext_keycmp(const btree_key *, const btree_key *); +extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); +extern void hfs_ext_write_extent(struct inode *); +extern int hfs_extend_file(struct inode *); +extern void hfs_file_truncate(struct inode *); + +extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + +/* inode.c */ +extern const struct address_space_operations hfs_aops; +extern const struct address_space_operations hfs_btree_aops; + +extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); +extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); +extern int hfs_write_inode(struct inode *, struct writeback_control *); +extern int hfs_inode_setattr(struct dentry *, struct iattr *); +extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, + __be32 log_size, __be32 phys_size, u32 clump_size); +extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); +extern void hfs_evict_inode(struct inode *); +extern void hfs_delete_inode(struct inode *); + +/* attr.c */ +extern int hfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* mdb.c */ +extern int hfs_mdb_get(struct super_block *); +extern void hfs_mdb_commit(struct super_block *); +extern void hfs_mdb_close(struct super_block *); +extern void hfs_mdb_put(struct super_block *); + +/* part_tbl.c */ +extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); + +/* string.c */ +extern const struct dentry_operations hfs_dentry_operations; + +extern int hfs_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); +extern int hfs_strcmp(const unsigned char *, unsigned int, + const unsigned char *, unsigned int); +extern int hfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const 
struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +/* trans.c */ +extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); +extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); + +extern struct timezone sys_tz; + +/* + * There are two time systems. Both are based on seconds since + * a particular time/date. + * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970 + * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 + * + */ +#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) +#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) + +#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) + +#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } +#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) +#define hfs_mtime() __hfs_u_to_mtime(get_seconds()) + +static inline const char *hfs_mdb_name(struct super_block *sb) +{ + return sb->s_id; +} + +static inline void hfs_bitmap_dirty(struct super_block *sb) +{ + set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; +} + +#define sb_bread512(sb, sec, data) ({ \ + struct buffer_head *__bh; \ + sector_t __block; \ + loff_t __start; \ + int __offset; \ + \ + __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ + __block = __start >> (sb)->s_blocksize_bits; \ + __offset = __start & ((sb)->s_blocksize - 1); \ + __bh = sb_bread((sb), __block); \ + if (likely(__bh != NULL)) \ + data = (void *)(__bh->b_data + __offset);\ + else \ + data = NULL; \ + __bh; \ +}) + +#endif diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c new file mode 100644 index 00000000..fff16c96 --- /dev/null +++ b/fs/hfs/inode.c @@ -0,0 +1,673 @@ +/* + * linux/fs/hfs/inode.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains inode-related functions which do not depend on + * which scheme is being used to represent forks. 
+ * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +static const struct file_operations hfs_file_operations; +static const struct inode_operations hfs_file_inode_operations; + +/*================ Variable-like macros ================*/ + +#define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) + +static int hfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hfs_get_block, wbc); +} + +static int hfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, hfs_get_block); +} + +static int hfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hfs_get_block, + &HFS_I(mapping->host)->phys_size); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t hfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, hfs_get_block); +} + +static int hfs_releasepage(struct page *page, gfp_t mask) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct hfs_btree *tree; + struct hfs_bnode *node; + u32 nidx; + int i, res = 1; + + switch (inode->i_ino) { + case HFS_EXT_CNID: + tree = HFS_SB(sb)->ext_tree; + break; + case HFS_CAT_CNID: + tree = HFS_SB(sb)->cat_tree; + break; + default: + BUG(); + return 0; + } + + if (!tree) + return 0; + + if (tree->node_size >= PAGE_CACHE_SIZE) { + nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, nidx); + if (!node) + ; + else if (atomic_read(&node->refcnt)) + res = 0; + if (res && node) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } + spin_unlock(&tree->hash_lock); + } else { + nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + spin_lock(&tree->hash_lock); + do { + node = hfs_bnode_findhash(tree, nidx++); + if (!node) + continue; + if (atomic_read(&node->refcnt)) { + res = 0; + break; + } + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } while (--i && nidx < tree->node_count); + spin_unlock(&tree->hash_lock); + } + return res ? try_to_free_buffers(page) : 0; +} + +static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, hfs_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
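+	 * (blockdev_direct_IO() may have called hfs_get_block() with create
+	 * set for blocks past the old EOF; vmtruncate() back to i_size lets
+	 * ->truncate release them again.)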
+ */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +static int hfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hfs_get_block); +} + +const struct address_space_operations hfs_btree_aops = { + .readpage = hfs_readpage, + .writepage = hfs_writepage, + .write_begin = hfs_write_begin, + .write_end = generic_write_end, + .bmap = hfs_bmap, + .releasepage = hfs_releasepage, +}; + +const struct address_space_operations hfs_aops = { + .readpage = hfs_readpage, + .writepage = hfs_writepage, + .write_begin = hfs_write_begin, + .write_end = generic_write_end, + .bmap = hfs_bmap, + .direct_IO = hfs_direct_IO, + .writepages = hfs_writepages, +}; + +/* + * hfs_new_inode + */ +struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + + mutex_init(&HFS_I(inode)->extents_lock); + INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); + inode->i_ino = HFS_SB(sb)->next_id++; + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_nlink = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + HFS_I(inode)->flags = 0; + HFS_I(inode)->rsrc_inode = NULL; + HFS_I(inode)->fs_blocks = 0; + if (S_ISDIR(mode)) { + inode->i_size = 2; + HFS_SB(sb)->folder_count++; + if (dir->i_ino == HFS_ROOT_CNID) + HFS_SB(sb)->root_dirs++; + inode->i_op = &hfs_dir_inode_operations; + inode->i_fop = &hfs_dir_operations; + inode->i_mode |= S_IRWXUGO; + inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; + } else if (S_ISREG(mode)) { + HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; + HFS_SB(sb)->file_count++; + if (dir->i_ino == HFS_ROOT_CNID) + HFS_SB(sb)->root_files++; + inode->i_op = &hfs_file_inode_operations; + inode->i_fop = &hfs_file_operations; + inode->i_mapping->a_ops = &hfs_aops; + inode->i_mode |= S_IRUGO|S_IXUGO; + if (mode & S_IWUSR) + inode->i_mode |= S_IWUGO; + inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; + HFS_I(inode)->phys_size = 0; + HFS_I(inode)->alloc_blocks = 0; + HFS_I(inode)->first_blocks = 0; + HFS_I(inode)->cached_start = 0; + HFS_I(inode)->cached_blocks = 0; + memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); + memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); + } + insert_inode_hash(inode); + mark_inode_dirty(inode); + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; + + return inode; +} + +void hfs_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); + if (S_ISDIR(inode->i_mode)) { + HFS_SB(sb)->folder_count--; + if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) + HFS_SB(sb)->root_dirs--; + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; + return; + } + HFS_SB(sb)->file_count--; + if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) + HFS_SB(sb)->root_files--; + if (S_ISREG(inode->i_mode)) { + if (!inode->i_nlink) { + inode->i_size = 0; + hfs_file_truncate(inode); + } + } + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; +} + +void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, + __be32 __log_size, 
__be32 phys_size, u32 clump_size) +{ + struct super_block *sb = inode->i_sb; + u32 log_size = be32_to_cpu(__log_size); + u16 count; + int i; + + memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); + for (count = 0, i = 0; i < 3; i++) + count += be16_to_cpu(ext[i].count); + HFS_I(inode)->first_blocks = count; + + inode->i_size = HFS_I(inode)->phys_size = log_size; + HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); + HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / + HFS_SB(sb)->alloc_blksz; + HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; + if (!HFS_I(inode)->clump_blocks) + HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; +} + +struct hfs_iget_data { + struct hfs_cat_key *key; + hfs_cat_rec *rec; +}; + +static int hfs_test_inode(struct inode *inode, void *data) +{ + struct hfs_iget_data *idata = data; + hfs_cat_rec *rec; + + rec = idata->rec; + switch (rec->type) { + case HFS_CDR_DIR: + return inode->i_ino == be32_to_cpu(rec->dir.DirID); + case HFS_CDR_FIL: + return inode->i_ino == be32_to_cpu(rec->file.FlNum); + default: + BUG(); + return 1; + } +} + +/* + * hfs_read_inode + */ +static int hfs_read_inode(struct inode *inode, void *data) +{ + struct hfs_iget_data *idata = data; + struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); + hfs_cat_rec *rec; + + HFS_I(inode)->flags = 0; + HFS_I(inode)->rsrc_inode = NULL; + mutex_init(&HFS_I(inode)->extents_lock); + INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + + /* Initialize the inode */ + inode->i_uid = hsb->s_uid; + inode->i_gid = hsb->s_gid; + inode->i_nlink = 1; + + if (idata->key) + HFS_I(inode)->cat_key = *idata->key; + else + HFS_I(inode)->flags |= HFS_FLG_RSRC; + HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; + + rec = idata->rec; + switch (rec->type) { + case HFS_CDR_FIL: + if (!HFS_IS_RSRC(inode)) { + hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, + rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); + } else { + hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, + rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); + } + + inode->i_ino = be32_to_cpu(rec->file.FlNum); + inode->i_mode = S_IRUGO | S_IXUGO; + if (!(rec->file.Flags & HFS_FIL_LOCK)) + inode->i_mode |= S_IWUGO; + inode->i_mode &= ~hsb->s_file_umask; + inode->i_mode |= S_IFREG; + inode->i_ctime = inode->i_atime = inode->i_mtime = + hfs_m_to_utime(rec->file.MdDat); + inode->i_op = &hfs_file_inode_operations; + inode->i_fop = &hfs_file_operations; + inode->i_mapping->a_ops = &hfs_aops; + break; + case HFS_CDR_DIR: + inode->i_ino = be32_to_cpu(rec->dir.DirID); + inode->i_size = be16_to_cpu(rec->dir.Val) + 2; + HFS_I(inode)->fs_blocks = 0; + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); + inode->i_ctime = inode->i_atime = inode->i_mtime = + hfs_m_to_utime(rec->dir.MdDat); + inode->i_op = &hfs_dir_inode_operations; + inode->i_fop = &hfs_dir_operations; + break; + default: + make_bad_inode(inode); + } + return 0; +} + +/* + * __hfs_iget() + * + * Given the MDB for a HFS filesystem, a 'key' and an 'entry' in + * the catalog B-tree and the 'type' of the desired file return the + * inode for that file/directory or NULL. Note that 'type' indicates + * whether we want the actual file or directory, or the corresponding + * metadata (AppleDouble header file or CAP metadata file). 
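+ *
+ * Note: in this version there is no 'type' argument; the kind of inode
+ * built is taken from rec->type, and resource-fork inodes are instead
+ * created separately in hfs_file_lookup().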
+ */ +struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) +{ + struct hfs_iget_data data = { key, rec }; + struct inode *inode; + u32 cnid; + + switch (rec->type) { + case HFS_CDR_DIR: + cnid = be32_to_cpu(rec->dir.DirID); + break; + case HFS_CDR_FIL: + cnid = be32_to_cpu(rec->file.FlNum); + break; + default: + return NULL; + } + inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); + if (inode && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + return inode; +} + +void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, + __be32 *log_size, __be32 *phys_size) +{ + memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); + + if (log_size) + *log_size = cpu_to_be32(inode->i_size); + if (phys_size) + *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * + HFS_SB(inode->i_sb)->alloc_blksz); +} + +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct inode *main_inode = inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + + dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_ext_write_extent(inode); + + if (inode->i_ino < HFS_FIRSTUSER_CNID) { + switch (inode->i_ino) { + case HFS_ROOT_CNID: + break; + case HFS_EXT_CNID: + hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); + return 0; + case HFS_CAT_CNID: + hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); + return 0; + default: + BUG(); + return -EIO; + } + } + + if (HFS_IS_RSRC(inode)) + main_inode = HFS_I(inode)->rsrc_inode; + + if (!main_inode->i_nlink) + return 0; + + if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) + /* panic? */ + return -EIO; + + fd.search_key->cat = HFS_I(main_inode)->cat_key; + if (hfs_brec_find(&fd)) + /* panic? */ + goto out; + + if (S_ISDIR(main_inode->i_mode)) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) + /* panic? */; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + if (rec.type != HFS_CDR_DIR || + be32_to_cpu(rec.dir.DirID) != inode->i_ino) { + } + + rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.dir.Val = cpu_to_be16(inode->i_size - 2); + + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + } else if (HFS_IS_RSRC(inode)) { + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + hfs_inode_write_fork(inode, rec.file.RExtRec, + &rec.file.RLgLen, &rec.file.RPyLen); + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } else { + if (fd.entrylength < sizeof(struct hfs_cat_file)) + /* panic? 
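+			 * (an undersized record would indicate catalog
+			 * corruption; as written the code falls through and
+			 * still reads and rewrites a full struct hfs_cat_file)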
*/; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + if (rec.type != HFS_CDR_FIL || + be32_to_cpu(rec.file.FlNum) != inode->i_ino) { + } + + if (inode->i_mode & S_IWUSR) + rec.file.Flags &= ~HFS_FIL_LOCK; + else + rec.file.Flags |= HFS_FIL_LOCK; + hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); + rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); + + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } +out: + hfs_find_exit(&fd); + return 0; +} + +static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + hfs_cat_rec rec; + struct hfs_find_data fd; + int res; + + if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) + goto out; + + inode = HFS_I(dir)->rsrc_inode; + if (inode) + goto out; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + fd.search_key->cat = HFS_I(dir)->cat_key; + res = hfs_brec_read(&fd, &rec, sizeof(rec)); + if (!res) { + struct hfs_iget_data idata = { NULL, &rec }; + hfs_read_inode(inode, &idata); + } + hfs_find_exit(&fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } + HFS_I(inode)->rsrc_inode = dir; + HFS_I(dir)->rsrc_inode = inode; + igrab(dir); + hlist_add_fake(&inode->i_hash); + mark_inode_dirty(inode); +out: + d_add(dentry, inode); + return NULL; +} + +void hfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { + HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFS_I(inode)->rsrc_inode); + } +} + +static int hfs_file_open(struct inode *inode, struct file *file) +{ + if (HFS_IS_RSRC(inode)) + inode = HFS_I(inode)->rsrc_inode; + atomic_inc(&HFS_I(inode)->opencnt); + return 0; +} + +static int hfs_file_release(struct inode *inode, struct file *file) +{ + //struct super_block *sb = inode->i_sb; + + if (HFS_IS_RSRC(inode)) + inode = HFS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { + mutex_lock(&inode->i_mutex); + hfs_file_truncate(inode); + //if (inode->i_flags & S_DEAD) { + // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); + // hfs_delete_inode(inode); + //} + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +/* + * hfs_notify_change() + * + * Based very closely on fs/msdos/inode.c by Werner Almesberger + * + * This is the notify_change() field in the super_operations structure + * for HFS file systems. The purpose is to take that changes made to + * an inode and apply then in a filesystem-dependent manner. In this + * case the process has a few of tasks to do: + * 1) prevent changes to the i_uid and i_gid fields. + * 2) map file permissions to the closest allowable permissions + * 3) Since multiple Linux files can share the same on-disk inode under + * HFS (for instance the data and resource forks of a file) a change + * to permissions must be applied to all other in-core inodes which + * correspond to the same HFS file. 
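+ *
+ * For example, a plain chmod on a regular file only ever changes the
+ * write bits, and changes them for user, group and other together:
+ *	attr->ia_mode = (inode->i_mode | S_IWUGO) & ~hsb->s_file_umask;
+ * when S_IWUSR is requested, and with S_IWUGO cleared otherwise.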
+ */ + +int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) +{ + struct inode *inode = dentry->d_inode; + struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); + int error; + + error = inode_change_ok(inode, attr); /* basic permission checks */ + if (error) + return error; + + /* no uig/gid changes and limit which mode bits can be set */ + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != hsb->s_uid)) || + ((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != hsb->s_gid)) || + ((attr->ia_valid & ATTR_MODE) && + ((S_ISDIR(inode->i_mode) && + (attr->ia_mode != inode->i_mode)) || + (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { + return hsb->s_quiet ? 0 : error; + } + + if (attr->ia_valid & ATTR_MODE) { + /* Only the 'w' bits can ever change and only all together. */ + if (attr->ia_mode & S_IWUSR) + attr->ia_mode = inode->i_mode | S_IWUGO; + else + attr->ia_mode = inode->i_mode & ~S_IWUGO; + attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int hfs_file_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + struct super_block * sb; + int ret, err; + + /* sync the inode to buffers */ + ret = write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + if (sb->s_dirt) { + lock_super(sb); + sb->s_dirt = 0; + if (!(sb->s_flags & MS_RDONLY)) + hfs_mdb_commit(sb); + unlock_super(sb); + } + /* .. finally sync the buffers to disk */ + err = sync_blockdev(sb->s_bdev); + if (!ret) + ret = err; + return ret; +} + +static const struct file_operations hfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .fsync = hfs_file_fsync, + .open = hfs_file_open, + .release = hfs_file_release, +}; + +static const struct inode_operations hfs_file_inode_operations = { + .lookup = hfs_file_lookup, + .truncate = hfs_file_truncate, + .setattr = hfs_inode_setattr, + .setxattr = hfs_setxattr, + .getxattr = hfs_getxattr, + .listxattr = hfs_listxattr, +}; diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c new file mode 100644 index 00000000..1563d5ce --- /dev/null +++ b/fs/hfs/mdb.c @@ -0,0 +1,354 @@ +/* + * linux/fs/hfs/mdb.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains functions for reading/writing the MDB. + */ + +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +/*================ File-local data types ================*/ + +/* + * The HFS Master Directory Block (MDB). + * + * Also known as the Volume Information Block (VIB), this structure is + * the HFS equivalent of a superblock. 
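+ * The in-core copy stays mapped through HFS_SB(sb)->mdb / mdb_bh for the
+ * life of the mount; an alternate (backup) copy is looked for in the
+ * second-to-last 512-byte block of the partition.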
+ * + * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 + * + * modified for HFS Extended + */ + +static int hfs_get_last_session(struct super_block *sb, + sector_t *start, sector_t *size) +{ + struct cdrom_multisession ms_info; + struct cdrom_tocentry te; + int res; + + /* default values */ + *start = 0; + *size = sb->s_bdev->bd_inode->i_size >> 9; + + if (HFS_SB(sb)->session >= 0) { + te.cdte_track = HFS_SB(sb)->session; + te.cdte_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); + if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { + *start = (sector_t)te.cdte_addr.lba << 2; + return 0; + } + printk(KERN_ERR "hfs: invalid session number or type of track\n"); + return -EINVAL; + } + ms_info.addr_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + if (!res && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; + return 0; +} + +/* + * hfs_mdb_get() + * + * Build the in-core MDB for a filesystem, including + * the B-trees and the volume bitmap. + */ +int hfs_mdb_get(struct super_block *sb) +{ + struct buffer_head *bh; + struct hfs_mdb *mdb, *mdb2; + unsigned int block; + char *ptr; + int off2, len, size, sect; + sector_t part_start, part_size; + loff_t off; + __be16 attrib; + + /* set the device driver to 512-byte blocks */ + size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); + if (!size) + return -EINVAL; + + if (hfs_get_last_session(sb, &part_start, &part_size)) + return -EINVAL; + while (1) { + /* See if this is an HFS filesystem */ + bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); + if (!bh) + goto out; + + if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) + break; + brelse(bh); + + /* check for a partition block + * (should do this only for cdrom/loop though) + */ + if (hfs_part_find(sb, &part_start, &part_size)) + goto out; + } + + HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); + if (!size || (size & (HFS_SECTOR_SIZE - 1))) { + printk(KERN_ERR "hfs: bad allocation block size %d\n", size); + goto out_bh; + } + + size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); + /* size must be a multiple of 512 */ + while (size & (size - 1)) + size -= HFS_SECTOR_SIZE; + sect = be16_to_cpu(mdb->drAlBlSt) + part_start; + /* align block size to first sector */ + while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) + size >>= 1; + /* align block size to weird alloc size */ + while (HFS_SB(sb)->alloc_blksz & (size - 1)) + size >>= 1; + brelse(bh); + if (!sb_set_blocksize(sb, size)) { + printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size); + goto out; + } + + bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); + if (!bh) + goto out; + if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) + goto out_bh; + + HFS_SB(sb)->mdb_bh = bh; + HFS_SB(sb)->mdb = mdb; + + /* These parameters are read from the MDB, and never written */ + HFS_SB(sb)->part_start = part_start; + HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); + HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; + HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / + HFS_SB(sb)->alloc_blksz; + if (!HFS_SB(sb)->clumpablks) + HFS_SB(sb)->clumpablks = 1; + HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> + (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); + + /* These parameters are read from and written to the MDB */ + HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); + HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); + HFS_SB(sb)->root_files = 
be16_to_cpu(mdb->drNmFls); + HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); + HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); + HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); + + /* TRY to get the alternate (backup) MDB. */ + sect = part_start + part_size - 2; + bh = sb_bread512(sb, sect, mdb2); + if (bh) { + if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { + HFS_SB(sb)->alt_mdb_bh = bh; + HFS_SB(sb)->alt_mdb = mdb2; + } else + brelse(bh); + } + + if (!HFS_SB(sb)->alt_mdb) { + printk(KERN_WARNING "hfs: unable to locate alternate MDB\n"); + printk(KERN_WARNING "hfs: continuing without an alternate MDB\n"); + } + + HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0); + if (!HFS_SB(sb)->bitmap) + goto out; + + /* read in the bitmap */ + block = be16_to_cpu(mdb->drVBMSt) + part_start; + off = (loff_t)block << HFS_SECTOR_SIZE_BITS; + size = (HFS_SB(sb)->fs_ablocks + 8) / 8; + ptr = (u8 *)HFS_SB(sb)->bitmap; + while (size) { + bh = sb_bread(sb, off >> sb->s_blocksize_bits); + if (!bh) { + printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + goto out; + } + off2 = off & (sb->s_blocksize - 1); + len = min((int)sb->s_blocksize - off2, size); + memcpy(ptr, bh->b_data + off2, len); + brelse(bh); + ptr += len; + off += len; + size -= len; + } + + HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); + if (!HFS_SB(sb)->ext_tree) { + printk(KERN_ERR "hfs: unable to open extent tree\n"); + goto out; + } + HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); + if (!HFS_SB(sb)->cat_tree) { + printk(KERN_ERR "hfs: unable to open catalog tree\n"); + goto out; + } + + attrib = mdb->drAtrb; + if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { + printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " + "running fsck.hfs is recommended. mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { + printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + if (!(sb->s_flags & MS_RDONLY)) { + /* Mark the volume uncleanly unmounted in case we crash */ + attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); + attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); + mdb->drAtrb = attrib; + be32_add_cpu(&mdb->drWrCnt, 1); + mdb->drLsMod = hfs_mtime(); + + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->mdb_bh); + } + + return 0; + +out_bh: + brelse(bh); +out: + hfs_mdb_put(sb); + return -EIO; +} + +/* + * hfs_mdb_commit() + * + * Description: + * This updates the MDB on disk (look also at hfs_write_super()). + * It does not check, if the superblock has been modified, or + * if the filesystem has been mounted read-only. It is mainly + * called by hfs_write_super() and hfs_btree_extend(). + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * int backup; + * Output Variable(s): + * NONE + * Returns: + * void + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * Postconditions: + * The HFS MDB and on disk will be updated, by copying the possibly + * modified fields from the in memory MDB (in native byte order) to + * the disk block buffer. + * If 'backup' is non-zero then the alternate MDB is also written + * and the function doesn't return until it is actually on disk. 
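+ *
+ * Note: in this version the function takes the super_block and the
+ * 'backup' decision is driven by the HFS_FLG_ALT_MDB_DIRTY bit; the MDB
+ * proper, the alternate MDB and the volume bitmap are each written back
+ * only when their respective dirty flag is set.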
+ */ +void hfs_mdb_commit(struct super_block *sb) +{ + struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + + if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { + /* These parameters may have been modified, so write them back */ + mdb->drLsMod = hfs_mtime(); + mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); + mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); + mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); + mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); + mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); + mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); + + /* write MDB to disk */ + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); + } + + /* write the backup MDB, not returning until it is written. + * we only do this when either the catalog or extents overflow + * files grow. */ + if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && + HFS_SB(sb)->alt_mdb) { + hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, + &mdb->drXTFlSize, NULL); + hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, + &mdb->drCTFlSize, NULL); + memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); + HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); + HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); + } + + if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { + struct buffer_head *bh; + sector_t block; + char *ptr; + int off, size, len; + + block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; + off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); + block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; + size = (HFS_SB(sb)->fs_ablocks + 7) / 8; + ptr = (u8 *)HFS_SB(sb)->bitmap; + while (size) { + bh = sb_bread(sb, block); + if (!bh) { + printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + break; + } + len = min((int)sb->s_blocksize - off, size); + memcpy(bh->b_data + off, ptr, len); + mark_buffer_dirty(bh); + brelse(bh); + block++; + off = 0; + ptr += len; + size -= len; + } + } +} + +void hfs_mdb_close(struct super_block *sb) +{ + /* update volume attributes */ + if (sb->s_flags & MS_RDONLY) + return; + HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); + HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); +} + +/* + * hfs_mdb_put() + * + * Release the resources associated with the in-core MDB. */ +void hfs_mdb_put(struct super_block *sb) +{ + if (!HFS_SB(sb)) + return; + /* free the B-trees */ + hfs_btree_close(HFS_SB(sb)->ext_tree); + hfs_btree_close(HFS_SB(sb)->cat_tree); + + /* free the buffers holding the primary and alternate MDBs */ + brelse(HFS_SB(sb)->mdb_bh); + brelse(HFS_SB(sb)->alt_mdb_bh); + + unload_nls(HFS_SB(sb)->nls_io); + unload_nls(HFS_SB(sb)->nls_disk); + + free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); + kfree(HFS_SB(sb)); + sb->s_fs_info = NULL; +} diff --git a/fs/hfs/part_tbl.c b/fs/hfs/part_tbl.c new file mode 100644 index 00000000..36add537 --- /dev/null +++ b/fs/hfs/part_tbl.c @@ -0,0 +1,117 @@ +/* + * linux/fs/hfs/part_tbl.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * Original code to handle the new style Mac partition table based on + * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). 
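+ *
+ * hfs_part_find() below is called from hfs_mdb_get() when the block at
+ * HFS_MDB_BLK does not carry the HFS signature; it adjusts *part_start
+ * and *part_size in place so the MDB probe can be retried inside the
+ * selected partition.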
+ */ + +#include "hfs_fs.h" + +/* + * The new style Mac partition map + * + * For each partition on the media there is a physical block (512-byte + * block) containing one of these structures. These blocks are + * contiguous starting at block 1. + */ +struct new_pmap { + __be16 pmSig; /* signature */ + __be16 reSigPad; /* padding */ + __be32 pmMapBlkCnt; /* partition blocks count */ + __be32 pmPyPartStart; /* physical block start of partition */ + __be32 pmPartBlkCnt; /* physical block count of partition */ + u8 pmPartName[32]; /* (null terminated?) string + giving the name of this + partition */ + u8 pmPartType[32]; /* (null terminated?) string + giving the type of this + partition */ + /* a bunch more stuff we don't need */ +} __packed; + +/* + * The old style Mac partition map + * + * The partition map consists for a 2-byte signature followed by an + * array of these structures. The map is terminated with an all-zero + * one of these. + */ +struct old_pmap { + __be16 pdSig; /* Signature bytes */ + struct old_pmap_entry { + __be32 pdStart; + __be32 pdSize; + __be32 pdFSID; + } pdEntry[42]; +} __packed; + +/* + * hfs_part_find() + * + * Parse the partition map looking for the + * start and length of the 'part'th HFS partition. + */ +int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size) +{ + struct buffer_head *bh; + __be16 *data; + int i, size, res; + + res = -ENOENT; + bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); + if (!bh) + return -EIO; + + switch (be16_to_cpu(*data)) { + case HFS_OLD_PMAP_MAGIC: + { + struct old_pmap *pm; + struct old_pmap_entry *p; + + pm = (struct old_pmap *)bh->b_data; + p = pm->pdEntry; + size = 42; + for (i = 0; i < size; p++, i++) { + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + res = 0; + } + } + break; + } + case HFS_NEW_PMAP_MAGIC: + { + struct new_pmap *pm; + + pm = (struct new_pmap *)bh->b_data; + size = be32_to_cpu(pm->pmMapBlkCnt); + for (i = 0; i < size;) { + if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && + (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + res = 0; + break; + } + brelse(bh); + bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm); + if (!bh) + return -EIO; + if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC)) + break; + } + break; + } + } + brelse(bh); + + return res; +} diff --git a/fs/hfs/string.c b/fs/hfs/string.c new file mode 100644 index 00000000..495a976a --- /dev/null +++ b/fs/hfs/string.c @@ -0,0 +1,116 @@ +/* + * linux/fs/hfs/string.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the string comparison function for the + * Macintosh character set. + * + * The code in this file is derived from code which is copyright + * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI) + * It is used here by the permission of ARDI's president Cliff Matthews. 
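hfs_part_find() above walks either the old-style or the new-style Apple partition map, starting at physical block HFS_PMAP_BLK. A rough user-space illustration of telling the two map formats apart; the magic values are assumptions matching the usual HFS_OLD_PMAP_MAGIC/HFS_NEW_PMAP_MAGIC definitions and the code is not part of this patch:

#include <stdint.h>

enum pmap_kind { PMAP_NONE, PMAP_OLD, PMAP_NEW };

/* Classify one 512-byte partition-map block by its big-endian signature:
 * 0x504D ("PM") marks a new-style entry, 0x5453 ("TS") an old-style map. */
static enum pmap_kind classify_pmap(const unsigned char blk[512])
{
	uint16_t sig = (uint16_t)((blk[0] << 8) | blk[1]);

	if (sig == 0x504D)
		return PMAP_NEW;
	if (sig == 0x5453)
		return PMAP_OLD;
	return PMAP_NONE;
}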
+ */ + +#include "hfs_fs.h" +#include + +/*================ File-local variables ================*/ + +/* + * unsigned char caseorder[] + * + * Defines the lexical ordering of characters on the Macintosh + * + * Composition of the 'casefold' and 'order' tables from ARDI's code + * with the entry for 0x20 changed to match that for 0xCA to remove + * special case for those two characters. + */ +static unsigned char caseorder[256] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, + 0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36, + 0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46, + 0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, + 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD, + 0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, + 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3, + 0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63, + 0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98, + 0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81, + 0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81, + 0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82, + 0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, + 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, + 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF +}; + +/*================ Global functions ================*/ + +/* + * Hash a string to an integer in a case-independent way + */ +int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) +{ + const unsigned char *name = this->name; + unsigned int hash, len = this->len; + + if (len > HFS_NAMELEN) + len = HFS_NAMELEN; + + hash = init_name_hash(); + for (; len; len--) + hash = partial_name_hash(caseorder[*name++], hash); + this->hash = end_name_hash(hash); + return 0; +} + +/* + * Compare two strings in the HFS filename character ordering + * Returns positive, negative, or zero, not just 0 or (+/-)1 + * + * Equivalent to ARDI's call: + * ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0]) + */ +int hfs_strcmp(const unsigned char *s1, unsigned int len1, + const unsigned char *s2, unsigned int len2) +{ + int len, tmp; + + len = (len1 > len2) ? len2 : len1; + + while (len--) { + tmp = (int)caseorder[*(s1++)] - (int)caseorder[*(s2++)]; + if (tmp) + return tmp; + } + return len1 - len2; +} + +/* + * Test for equality of two strings in the HFS filename character ordering. 
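The caseorder[] table above both folds case and fixes the Macintosh lexical ordering used by hfs_strcmp() and the dentry hash, so names differing only in case collate identically. A standalone sketch of that idea (illustrative only; 'fold' stands in for caseorder[]):

#include <stddef.h>

/* Return 1 when two names of equal length compare equal under the folding
 * table, e.g. "ReadMe" and "README" with a case-folding table. */
static int example_names_equal(const unsigned char *a, const unsigned char *b,
			       size_t len, const unsigned char fold[256])
{
	size_t i;

	for (i = 0; i < len; i++)
		if (fold[a[i]] != fold[b[i]])
			return 0;
	return 1;
}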
+ * return 1 on failure and 0 on success + */ +int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + const unsigned char *n1, *n2; + + if (len >= HFS_NAMELEN) { + if (name->len < HFS_NAMELEN) + return 1; + len = HFS_NAMELEN; + } else if (len != name->len) + return 1; + + n1 = str; + n2 = name->name; + while (len--) { + if (caseorder[*n1++] != caseorder[*n2++]) + return 1; + } + return 0; +} diff --git a/fs/hfs/super.c b/fs/hfs/super.c new file mode 100644 index 00000000..1b55f704 --- /dev/null +++ b/fs/hfs/super.c @@ -0,0 +1,493 @@ +/* + * linux/fs/hfs/super.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains hfs_read_super(), some of the super_ops and + * init_hfs_fs() and exit_hfs_fs(). The remaining super_ops are in + * inode.c since they deal with inodes. + * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +static struct kmem_cache *hfs_inode_cachep; + +MODULE_LICENSE("GPL"); + +/* + * hfs_write_super() + * + * Description: + * This function is called by the VFS only. When the filesystem + * is mounted r/w it updates the MDB on disk. + * Input Variable(s): + * struct super_block *sb: Pointer to the hfs superblock + * Output Variable(s): + * NONE + * Returns: + * void + * Preconditions: + * 'sb' points to a "valid" (struct super_block). + * Postconditions: + * The MDB is marked 'unsuccessfully unmounted' by clearing bit 8 of drAtrb + * (hfs_put_super() must set this flag!). Some MDB fields are updated + * and the MDB buffer is written to disk by calling hfs_mdb_commit(). + */ +static void hfs_write_super(struct super_block *sb) +{ + lock_super(sb); + sb->s_dirt = 0; + + /* sync everything to the buffers */ + if (!(sb->s_flags & MS_RDONLY)) + hfs_mdb_commit(sb); + unlock_super(sb); +} + +static int hfs_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + hfs_mdb_commit(sb); + sb->s_dirt = 0; + unlock_super(sb); + + return 0; +} + +/* + * hfs_put_super() + * + * This is the put_super() entry in the super_operations structure for + * HFS filesystems. The purpose is to release the resources + * associated with the superblock sb. + */ +static void hfs_put_super(struct super_block *sb) +{ + if (sb->s_dirt) + hfs_write_super(sb); + hfs_mdb_close(sb); + /* release the MDB's resources */ + hfs_mdb_put(sb); +} + +/* + * hfs_statfs() + * + * This is the statfs() entry in the super_operations structure for + * HFS filesystems. The purpose is to return various data about the + * filesystem. + * + * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. 
+ */ +static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = HFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; + buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div; + buf->f_bavail = buf->f_bfree; + buf->f_files = HFS_SB(sb)->fs_ablocks; + buf->f_ffree = HFS_SB(sb)->free_ablocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = HFS_NAMELEN; + + return 0; +} + +static int hfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (!(*flags & MS_RDONLY)) { + if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { + printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " + "running fsck.hfs is recommended. leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { + printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } + } + return 0; +} + +static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb); + + if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) + seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) + seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid); + if (sbi->s_file_umask != 0133) + seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); + if (sbi->s_dir_umask != 0022) + seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask); + if (sbi->part >= 0) + seq_printf(seq, ",part=%u", sbi->part); + if (sbi->session >= 0) + seq_printf(seq, ",session=%u", sbi->session); + if (sbi->nls_disk) + seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); + if (sbi->nls_io) + seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); + if (sbi->s_quiet) + seq_printf(seq, ",quiet"); + return 0; +} + +static struct inode *hfs_alloc_inode(struct super_block *sb) +{ + struct hfs_inode_info *i; + + i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); + return i ? 
&i->vfs_inode : NULL; +} + +static void hfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); +} + +static void hfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfs_i_callback); +} + +static const struct super_operations hfs_super_operations = { + .alloc_inode = hfs_alloc_inode, + .destroy_inode = hfs_destroy_inode, + .write_inode = hfs_write_inode, + .evict_inode = hfs_evict_inode, + .put_super = hfs_put_super, + .write_super = hfs_write_super, + .sync_fs = hfs_sync_fs, + .statfs = hfs_statfs, + .remount_fs = hfs_remount, + .show_options = hfs_show_options, +}; + +enum { + opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, + opt_part, opt_session, opt_type, opt_creator, opt_quiet, + opt_codepage, opt_iocharset, + opt_err +}; + +static const match_table_t tokens = { + { opt_uid, "uid=%u" }, + { opt_gid, "gid=%u" }, + { opt_umask, "umask=%o" }, + { opt_file_umask, "file_umask=%o" }, + { opt_dir_umask, "dir_umask=%o" }, + { opt_part, "part=%u" }, + { opt_session, "session=%u" }, + { opt_type, "type=%s" }, + { opt_creator, "creator=%s" }, + { opt_quiet, "quiet" }, + { opt_codepage, "codepage=%s" }, + { opt_iocharset, "iocharset=%s" }, + { opt_err, NULL } +}; + +static inline int match_fourchar(substring_t *arg, u32 *result) +{ + if (arg->to - arg->from != 4) + return -EINVAL; + memcpy(result, arg->from, 4); + return 0; +} + +/* + * parse_options() + * + * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger + * This function is called by hfs_read_super() to parse the mount options. + */ +static int parse_options(char *options, struct hfs_sb_info *hsb) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int tmp, token; + + /* initialize the sb with defaults */ + hsb->s_uid = current_uid(); + hsb->s_gid = current_gid(); + hsb->s_file_umask = 0133; + hsb->s_dir_umask = 0022; + hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' 
*/ + hsb->s_quiet = 0; + hsb->part = -1; + hsb->session = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_uid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: uid requires an argument\n"); + return 0; + } + hsb->s_uid = (uid_t)tmp; + break; + case opt_gid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: gid requires an argument\n"); + return 0; + } + hsb->s_gid = (gid_t)tmp; + break; + case opt_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: umask requires a value\n"); + return 0; + } + hsb->s_file_umask = (umode_t)tmp; + hsb->s_dir_umask = (umode_t)tmp; + break; + case opt_file_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: file_umask requires a value\n"); + return 0; + } + hsb->s_file_umask = (umode_t)tmp; + break; + case opt_dir_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: dir_umask requires a value\n"); + return 0; + } + hsb->s_dir_umask = (umode_t)tmp; + break; + case opt_part: + if (match_int(&args[0], &hsb->part)) { + printk(KERN_ERR "hfs: part requires an argument\n"); + return 0; + } + break; + case opt_session: + if (match_int(&args[0], &hsb->session)) { + printk(KERN_ERR "hfs: session requires an argument\n"); + return 0; + } + break; + case opt_type: + if (match_fourchar(&args[0], &hsb->s_type)) { + printk(KERN_ERR "hfs: type requires a 4 character value\n"); + return 0; + } + break; + case opt_creator: + if (match_fourchar(&args[0], &hsb->s_creator)) { + printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + return 0; + } + break; + case opt_quiet: + hsb->s_quiet = 1; + break; + case opt_codepage: + if (hsb->nls_disk) { + printk(KERN_ERR "hfs: unable to change codepage\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + hsb->nls_disk = load_nls(p); + if (!hsb->nls_disk) { + printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); + kfree(p); + return 0; + } + kfree(p); + break; + case opt_iocharset: + if (hsb->nls_io) { + printk(KERN_ERR "hfs: unable to change iocharset\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + hsb->nls_io = load_nls(p); + if (!hsb->nls_io) { + printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); + kfree(p); + return 0; + } + kfree(p); + break; + default: + return 0; + } + } + + if (hsb->nls_disk && !hsb->nls_io) { + hsb->nls_io = load_nls_default(); + if (!hsb->nls_io) { + printk(KERN_ERR "hfs: unable to load default iocharset\n"); + return 0; + } + } + hsb->s_dir_umask &= 0777; + hsb->s_file_umask &= 0577; + + return 1; +} + +/* + * hfs_read_super() + * + * This is the function that is responsible for mounting an HFS + * filesystem. It performs all the tasks necessary to get enough data + * from the disk to read the root inode. This includes parsing the + * mount options, dealing with Macintosh partitions, reading the + * superblock and the allocation bitmap blocks, calling + * hfs_btree_init() to get the necessary data about the extents and + * catalog B-trees and, finally, reading the root inode into memory. 
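parse_options() above accepts numeric, octal and four-character options; match_fourchar() stores type/creator codes as the raw four bytes with no byte swapping. An illustrative standalone equivalent, plus a hypothetical mount invocation (device path and option values are examples only, not part of this patch):

#include <stdint.h>
#include <string.h>

/* Pack a four-character code the way match_fourchar() does: copy the raw
 * bytes so "TEXT" keeps its on-disk representation. */
static uint32_t example_fourchar(const char code[4])
{
	uint32_t v;

	memcpy(&v, code, 4);
	return v;
}

/* A typical mount line using these options might look like:
 *   mount -t hfs -o uid=1000,type=TEXT,creator=ttxt /dev/sdb1 /mnt
 */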
+ */ +static int hfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct hfs_sb_info *sbi; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct inode *root_inode; + int res; + + sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + res = -EINVAL; + if (!parse_options((char *)data, sbi)) { + printk(KERN_ERR "hfs: unable to parse mount options.\n"); + goto bail; + } + + sb->s_op = &hfs_super_operations; + sb->s_flags |= MS_NODIRATIME; + mutex_init(&sbi->bitmap_lock); + + res = hfs_mdb_get(sb); + if (res) { + if (!silent) + printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n", + hfs_mdb_name(sb)); + res = -EINVAL; + goto bail; + } + + /* try to get the root inode */ + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; + goto bail; + } + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } + if (res) { + hfs_find_exit(&fd); + goto bail_no_root; + } + res = -EINVAL; + root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); + if (!root_inode) + goto bail_no_root; + + sb->s_d_op = &hfs_dentry_operations; + res = -ENOMEM; + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto bail_iput; + + /* everything's okay */ + return 0; + +bail_iput: + iput(root_inode); +bail_no_root: + printk(KERN_ERR "hfs: get root inode failed.\n"); +bail: + hfs_mdb_put(sb); + return res; +} + +static struct dentry *hfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); +} + +static struct file_system_type hfs_fs_type = { + .owner = THIS_MODULE, + .name = "hfs", + .mount = hfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void hfs_init_once(void *p) +{ + struct hfs_inode_info *i = p; + + inode_init_once(&i->vfs_inode); +} + +static int __init init_hfs_fs(void) +{ + int err; + + hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", + sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, + hfs_init_once); + if (!hfs_inode_cachep) + return -ENOMEM; + err = register_filesystem(&hfs_fs_type); + if (err) + kmem_cache_destroy(hfs_inode_cachep); + return err; +} + +static void __exit exit_hfs_fs(void) +{ + unregister_filesystem(&hfs_fs_type); + kmem_cache_destroy(hfs_inode_cachep); +} + +module_init(init_hfs_fs) +module_exit(exit_hfs_fs) diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c new file mode 100644 index 00000000..19cf291e --- /dev/null +++ b/fs/hfs/sysdep.c @@ -0,0 +1,45 @@ +/* + * linux/fs/hfs/sysdep.c + * + * Copyright (C) 1996 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the code to do various system dependent things. 
+ */ + +#include +#include "hfs_fs.h" + +/* dentry case-handling: just lowercase everything */ + +static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + int diff; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if(!inode) + return 1; + + /* fix up inode on a timezone change */ + diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; + if (diff) { + inode->i_ctime.tv_sec += diff; + inode->i_atime.tv_sec += diff; + inode->i_mtime.tv_sec += diff; + HFS_I(inode)->tz_secondswest += diff; + } + return 1; +} + +const struct dentry_operations hfs_dentry_operations = +{ + .d_revalidate = hfs_revalidate_dentry, + .d_hash = hfs_hash_dentry, + .d_compare = hfs_compare_dentry, +}; + diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c new file mode 100644 index 00000000..b1ce4c7a --- /dev/null +++ b/fs/hfs/trans.c @@ -0,0 +1,150 @@ +/* + * linux/fs/hfs/trans.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains routines for converting between the Macintosh + * character set and various other encodings. This includes dealing + * with ':' vs. '/' as the path-element separator. + */ + +#include +#include + +#include "hfs_fs.h" + +/*================ Global functions ================*/ + +/* + * hfs_mac2asc() + * + * Given a 'Pascal String' (a string preceded by a length byte) in + * the Macintosh character set produce the corresponding filename using + * the 'trivial' name-mangling scheme, returning the length of the + * mangled filename. Note that the output string is not NULL + * terminated. + * + * The name-mangling works as follows: + * The character '/', which is illegal in Linux filenames is replaced + * by ':' which never appears in HFS filenames. All other characters + * are passed unchanged from input to output. + */ +int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) +{ + struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; + struct nls_table *nls_io = HFS_SB(sb)->nls_io; + const char *src; + char *dst; + int srclen, dstlen, size; + + src = in->name; + srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; + dst = out; + dstlen = HFS_MAX_NAMELEN; + if (nls_io) { + wchar_t ch; + + while (srclen > 0) { + if (nls_disk) { + size = nls_disk->char2uni(src, srclen, &ch); + if (size <= 0) { + ch = '?'; + size = 1; + } + src += size; + srclen -= size; + } else { + ch = *src++; + srclen--; + } + if (ch == '/') + ch = ':'; + size = nls_io->uni2char(ch, dst, dstlen); + if (size < 0) { + if (size == -ENAMETOOLONG) + goto out; + *dst = '?'; + size = 1; + } + dst += size; + dstlen -= size; + } + } else { + char ch; + + while (--srclen >= 0) + *dst++ = (ch = *src++) == '/' ? ':' : ch; + } +out: + return dst - out; +} + +/* + * hfs_asc2mac() + * + * Given an ASCII string (not null-terminated) and its length, + * generate the corresponding filename in the Macintosh character set + * using the 'trivial' name-mangling scheme, returning the length of + * the mangled filename. Note that the output string is not NULL + * terminated. + * + * This routine is a inverse to hfs_mac2triv(). + * A ':' is replaced by a '/'. 
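When no NLS tables are loaded, the conversion described above degenerates to the 'trivial' scheme: '/' (illegal in Linux names) and ':' (illegal in HFS names) simply swap. A minimal sketch of the Mac-to-Linux direction (illustrative only, not part of this patch):

/* Mac-to-Linux direction of the trivial scheme: '/' in the Macintosh name
 * becomes ':' in the Linux name; every other byte passes through unchanged. */
static void example_mac2asc_trivial(char *dst, const char *src, int len)
{
	while (len-- > 0) {
		char c = *src++;

		*dst++ = (c == '/') ? ':' : c;
	}
}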
+ */ +void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in) +{ + struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; + struct nls_table *nls_io = HFS_SB(sb)->nls_io; + const char *src; + char *dst; + int srclen, dstlen, size; + + src = in->name; + srclen = in->len; + dst = out->name; + dstlen = HFS_NAMELEN; + if (nls_io) { + wchar_t ch; + + while (srclen > 0) { + size = nls_io->char2uni(src, srclen, &ch); + if (size < 0) { + ch = '?'; + size = 1; + } + src += size; + srclen -= size; + if (ch == ':') + ch = '/'; + if (nls_disk) { + size = nls_disk->uni2char(ch, dst, dstlen); + if (size < 0) { + if (size == -ENAMETOOLONG) + goto out; + *dst = '?'; + size = 1; + } + dst += size; + dstlen -= size; + } else { + *dst++ = ch > 0xff ? '?' : ch; + dstlen--; + } + } + } else { + char ch; + + if (dstlen > srclen) + dstlen = srclen; + while (--dstlen >= 0) + *dst++ = (ch = *src++) == ':' ? '/' : ch; + } +out: + out->len = dst - (char *)out->name; + dstlen = HFS_NAMELEN - out->len; + while (--dstlen >= 0) + *dst++ = 0; +} diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig new file mode 100644 index 00000000..a6337181 --- /dev/null +++ b/fs/hfsplus/Kconfig @@ -0,0 +1,13 @@ +config HFSPLUS_FS + tristate "Apple Extended HFS file system support" + depends on BLOCK + select NLS + select NLS_UTF8 + help + If you say Y here, you will be able to mount extended format + Macintosh-formatted hard drive partitions with full read-write access. + + This file system is often called HFS+ and was introduced with + MacOS 8. It includes all Mac specific filesystem data such as + data forks and creator codes, but it also has several UNIX + style features such as file ownership and permissions. diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile new file mode 100644 index 00000000..3cc0df73 --- /dev/null +++ b/fs/hfsplus/Makefile @@ -0,0 +1,9 @@ +# +## Makefile for the linux hfsplus filesystem routines. 
+# + +obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o + +hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ + bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o + diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c new file mode 100644 index 00000000..5d799c13 --- /dev/null +++ b/fs/hfsplus/bfind.c @@ -0,0 +1,224 @@ +/* + * linux/fs/hfsplus/bfind.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Search routines for btrees + */ + +#include +#include "hfsplus_fs.h" + +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) +{ + void *ptr; + + fd->tree = tree; + fd->bnode = NULL; + ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + fd->search_key = ptr; + fd->key = ptr + tree->max_key_len + 2; + dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); + mutex_lock(&tree->tree_lock); + return 0; +} + +void hfs_find_exit(struct hfs_find_data *fd) +{ + hfs_bnode_put(fd->bnode); + kfree(fd->search_key); + dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); + mutex_unlock(&fd->tree->tree_lock); + fd->tree = NULL; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +{ + int cmpval; + u16 off, len, keylen; + int rec; + int b, e; + int res; + + b = 0; + e = bnode->num_recs - 1; + res = -ENOENT; + do { + rec = (e + b) / 2; + len = hfs_brec_lenoff(bnode, rec, &off); + keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + e = rec; + res = 0; + goto done; + } + if (cmpval < 0) + b = rec + 1; + else + e = rec - 1; + } while (b <= e); + if (rec != e && e >= 0) { + len = hfs_brec_lenoff(bnode, e, &off); + keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + } +done: + fd->record = e; + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; +fail: + return res; +} + +/* Traverse a B*Tree from the root to a leaf finding best fit to key */ +/* Return allocated copy of node found, set recnum to best record */ +int hfs_brec_find(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + u32 nidx, parent; + __be32 data; + int height, res; + + tree = fd->tree; + if (fd->bnode) + hfs_bnode_put(fd->bnode); + fd->bnode = NULL; + nidx = tree->root; + if (!nidx) + return -ENOENT; + height = tree->depth; + res = 0; + parent = 0; + for (;;) { + bnode = hfs_bnode_find(tree, nidx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + break; + } + if (bnode->height != height) + goto invalid; + if (bnode->type != (--height ? 
HFS_NODE_INDEX : HFS_NODE_LEAF)) + goto invalid; + bnode->parent = parent; + + res = __hfs_brec_find(bnode, fd); + if (!height) + break; + if (fd->record < 0) + goto release; + + parent = nidx; + hfs_bnode_read(bnode, &data, fd->entryoffset, 4); + nidx = be32_to_cpu(data); + hfs_bnode_put(bnode); + } + fd->bnode = bnode; + return res; + +invalid: + printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); + res = -EIO; +release: + hfs_bnode_put(bnode); + return res; +} + +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +{ + int res; + + res = hfs_brec_find(fd); + if (res) + return res; + if (fd->entrylength > rec_len) + return -EINVAL; + hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); + return 0; +} + +int hfs_brec_goto(struct hfs_find_data *fd, int cnt) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int idx, res = 0; + u16 off, len, keylen; + + bnode = fd->bnode; + tree = bnode->tree; + + if (cnt < 0) { + cnt = -cnt; + while (cnt > fd->record) { + cnt -= fd->record + 1; + fd->record = bnode->num_recs - 1; + idx = bnode->prev; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record -= cnt; + } else { + while (cnt >= bnode->num_recs - fd->record) { + cnt -= bnode->num_recs - fd->record; + fd->record = 0; + idx = bnode->next; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record += cnt; + } + + len = hfs_brec_lenoff(bnode, fd->record, &off); + keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + hfs_bnode_read(bnode, fd->key, off, keylen); +out: + fd->bnode = bnode; + return res; +} diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c new file mode 100644 index 00000000..1cad80c7 --- /dev/null +++ b/fs/hfsplus/bitmap.c @@ -0,0 +1,235 @@ +/* + * linux/fs/hfsplus/bitmap.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of allocation file + */ + +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8) + +int hfsplus_block_allocate(struct super_block *sb, u32 size, + u32 offset, u32 *max) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct page *page; + struct address_space *mapping; + __be32 *pptr, *curr, *end; + u32 mask, start, len, n; + __be32 val; + int i; + + len = *max; + if (!len) + return size; + + dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + pptr = kmap(page); + curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; + i = offset % 32; + offset &= ~(PAGE_CACHE_BITS - 1); + if ((size ^ offset) / PAGE_CACHE_BITS) + end = pptr + PAGE_CACHE_BITS / 32; + else + end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; + + /* scan the first partial u32 for zero bits */ + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = (1U << 31) >> i; + for (; i < 32; mask >>= 1, i++) { + if 
(!(n & mask)) + goto found; + } + } + curr++; + + /* scan complete u32s for the first zero bit */ + while (1) { + while (curr < end) { + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = 1 << 31; + for (i = 0; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + curr++; + } + kunmap(page); + offset += PAGE_CACHE_BITS; + if (offset >= size) + break; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + curr = pptr = kmap(page); + if ((size ^ offset) / PAGE_CACHE_BITS) + end = pptr + PAGE_CACHE_BITS / 32; + else + end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; + } + dprint(DBG_BITMAP, "bitmap full\n"); + start = size; + goto out; + +found: + start = offset + (curr - pptr) * 32 + i; + if (start >= size) { + dprint(DBG_BITMAP, "bitmap full\n"); + goto out; + } + /* do any partial u32 at the start */ + len = min(size - start, len); + while (1) { + n |= mask; + if (++i >= 32) + break; + mask >>= 1; + if (!--len || n & mask) + goto done; + } + if (!--len) + goto done; + *curr++ = cpu_to_be32(n); + /* do full u32s */ + while (1) { + while (curr < end) { + n = be32_to_cpu(*curr); + if (len < 32) + goto last; + if (n) { + len = 32; + goto last; + } + *curr++ = cpu_to_be32(0xffffffff); + len -= 32; + } + set_page_dirty(page); + kunmap(page); + offset += PAGE_CACHE_BITS; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + pptr = kmap(page); + curr = pptr; + end = pptr + PAGE_CACHE_BITS / 32; + } +last: + /* do any partial u32 at end */ + mask = 1U << 31; + for (i = 0; i < len; i++) { + if (n & mask) + break; + n |= mask; + mask >>= 1; + } +done: + *curr = cpu_to_be32(n); + set_page_dirty(page); + kunmap(page); + *max = offset + (curr - pptr) * 32 + i - start; + sbi->free_blocks -= *max; + sb->s_dirt = 1; + dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); +out: + mutex_unlock(&sbi->alloc_mutex); + return start; +} + +int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct page *page; + struct address_space *mapping; + __be32 *pptr, *curr, *end; + u32 mask, len, pnr; + int i; + + /* is there any actual work to be done? */ + if (!count) + return 0; + + dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); + /* are all of the bits in range? 
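hfsplus_block_allocate() above scans the allocation file one big-endian 32-bit word at a time, looking for the first clear bit starting from the most significant end. A standalone sketch of that inner scan (the word is assumed to have already been converted with be32_to_cpu; not part of this patch):

#include <stdint.h>

/* Return the offset of the first zero bit in 'word', counting from the
 * most significant bit, or -1 if every block in this word is allocated. */
static int example_first_zero_bit(uint32_t word)
{
	uint32_t mask = 1U << 31;
	int i;

	for (i = 0; i < 32; i++, mask >>= 1)
		if (!(word & mask))
			return i;	/* 0 means the most significant bit */
	return -1;
}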
*/ + if ((offset + count) > sbi->total_blocks) + return -2; + + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; + pnr = offset / PAGE_CACHE_BITS; + page = read_mapping_page(mapping, pnr, NULL); + pptr = kmap(page); + curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; + end = pptr + PAGE_CACHE_BITS / 32; + len = count; + + /* do any partial u32 at the start */ + i = offset % 32; + if (i) { + int j = 32 - i; + mask = 0xffffffffU << j; + if (j > count) { + mask |= 0xffffffffU >> (i + count); + *curr++ &= cpu_to_be32(mask); + goto out; + } + *curr++ &= cpu_to_be32(mask); + count -= j; + } + + /* do full u32s */ + while (1) { + while (curr < end) { + if (count < 32) + goto done; + *curr++ = 0; + count -= 32; + } + if (!count) + break; + set_page_dirty(page); + kunmap(page); + page = read_mapping_page(mapping, ++pnr, NULL); + pptr = kmap(page); + curr = pptr; + end = pptr + PAGE_CACHE_BITS / 32; + } +done: + /* do any partial u32 at end */ + if (count) { + mask = 0xffffffffU >> count; + *curr &= cpu_to_be32(mask); + } +out: + set_page_dirty(page); + kunmap(page); + sbi->free_blocks += len; + sb->s_dirt = 1; + mutex_unlock(&sbi->alloc_mutex); + + return 0; +} diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c new file mode 100644 index 00000000..1c42cc5b --- /dev/null +++ b/fs/hfsplus/bnode.c @@ -0,0 +1,656 @@ +/* + * linux/fs/hfsplus/bnode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle basic btree node operations + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Copy a specified range of bytes from the raw data of a node */ +void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min(len, (int)PAGE_CACHE_SIZE - off); + memcpy(buf, kmap(*pagep) + off, l); + kunmap(*pagep); + + while ((len -= l) != 0) { + buf += l; + l = min(len, (int)PAGE_CACHE_SIZE); + memcpy(buf, kmap(*++pagep), l); + kunmap(*pagep); + } +} + +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +{ + __be16 data; + /* TODO: optimize later... */ + hfs_bnode_read(node, &data, off, 2); + return be16_to_cpu(data); +} + +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +{ + u8 data; + /* TODO: optimize later... */ + hfs_bnode_read(node, &data, off, 1); + return data; +} + +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +{ + struct hfs_btree *tree; + int key_len; + + tree = node->tree; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = hfs_bnode_read_u16(node, off) + 2; + else + key_len = tree->max_key_len + 2; + + hfs_bnode_read(node, key, off, key_len); +} + +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min(len, (int)PAGE_CACHE_SIZE - off); + memcpy(kmap(*pagep) + off, buf, l); + set_page_dirty(*pagep); + kunmap(*pagep); + + while ((len -= l) != 0) { + buf += l; + l = min(len, (int)PAGE_CACHE_SIZE); + memcpy(kmap(*++pagep), buf, l); + set_page_dirty(*pagep); + kunmap(*pagep); + } +} + +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +{ + __be16 v = cpu_to_be16(data); + /* TODO: optimize later... 
*/ + hfs_bnode_write(node, &v, off, 2); +} + +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min(len, (int)PAGE_CACHE_SIZE - off); + memset(kmap(*pagep) + off, 0, l); + set_page_dirty(*pagep); + kunmap(*pagep); + + while ((len -= l) != 0) { + l = min(len, (int)PAGE_CACHE_SIZE); + memset(kmap(*++pagep), 0, l); + set_page_dirty(*pagep); + kunmap(*pagep); + } +} + +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len) +{ + struct hfs_btree *tree; + struct page **src_page, **dst_page; + int l; + + dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + tree = src_node->tree; + src += src_node->page_offset; + dst += dst_node->page_offset; + src_page = src_node->page + (src >> PAGE_CACHE_SHIFT); + src &= ~PAGE_CACHE_MASK; + dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT); + dst &= ~PAGE_CACHE_MASK; + + if (src == dst) { + l = min(len, (int)PAGE_CACHE_SIZE - src); + memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + + while ((len -= l) != 0) { + l = min(len, (int)PAGE_CACHE_SIZE); + memcpy(kmap(*++dst_page), kmap(*++src_page), l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) { + l = PAGE_CACHE_SIZE - src; + src = 0; + dst += l; + } else { + l = PAGE_CACHE_SIZE - dst; + src += l; + dst = 0; + } + l = min(len, l); + memcpy(dst_ptr, src_ptr, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (!dst) + dst_page++; + else + src_page++; + } while ((len -= l)); + } +} + +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +{ + struct page **src_page, **dst_page; + int l; + + dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + src += node->page_offset; + dst += node->page_offset; + if (dst > src) { + src += len - 1; + src_page = node->page + (src >> PAGE_CACHE_SHIFT); + src = (src & ~PAGE_CACHE_MASK) + 1; + dst += len - 1; + dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); + dst = (dst & ~PAGE_CACHE_MASK) + 1; + + if (src == dst) { + while (src < len) { + memmove(kmap(*dst_page), kmap(*src_page), src); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + len -= src; + src = PAGE_CACHE_SIZE; + src_page--; + dst_page--; + } + src -= len; + memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, len); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (src < dst) { + l = src; + src = PAGE_CACHE_SIZE; + dst -= l; + } else { + l = dst; + src -= l; + dst = PAGE_CACHE_SIZE; + } + l = min(len, l); + memmove(dst_ptr - l, src_ptr - l, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (dst == PAGE_CACHE_SIZE) + dst_page--; + else + src_page--; + } while ((len -= l)); + } + } else { + src_page = node->page + (src >> PAGE_CACHE_SHIFT); + src &= ~PAGE_CACHE_MASK; + dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); + dst &= ~PAGE_CACHE_MASK; + + if (src == dst) { + l = min(len, (int)PAGE_CACHE_SIZE - src); + 
memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + + while ((len -= l) != 0) { + l = min(len, (int)PAGE_CACHE_SIZE); + memmove(kmap(*++dst_page), + kmap(*++src_page), l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (PAGE_CACHE_SIZE - src < + PAGE_CACHE_SIZE - dst) { + l = PAGE_CACHE_SIZE - src; + src = 0; + dst += l; + } else { + l = PAGE_CACHE_SIZE - dst; + src += l; + dst = 0; + } + l = min(len, l); + memmove(dst_ptr, src_ptr, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (!dst) + dst_page++; + else + src_page++; + } while ((len -= l)); + } + } +} + +void hfs_bnode_dump(struct hfs_bnode *node) +{ + struct hfs_bnode_desc desc; + __be32 cnid; + int i, off, key_off; + + dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_bnode_read(node, &desc, 0, sizeof(desc)); + dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + be32_to_cpu(desc.next), be32_to_cpu(desc.prev), + desc.type, desc.height, be16_to_cpu(desc.num_recs)); + + off = node->tree->node_size - 2; + for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { + key_off = hfs_bnode_read_u16(node, off); + dprint(DBG_BNODE_MOD, " %d", key_off); + if (i && node->type == HFS_NODE_INDEX) { + int tmp; + + if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + tmp = hfs_bnode_read_u16(node, key_off) + 2; + else + tmp = node->tree->max_key_len + 2; + dprint(DBG_BNODE_MOD, " (%d", tmp); + hfs_bnode_read(node, &cnid, key_off + tmp, 4); + dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + } else if (i && node->type == HFS_NODE_LEAF) { + int tmp; + + tmp = hfs_bnode_read_u16(node, key_off); + dprint(DBG_BNODE_MOD, " (%d)", tmp); + } + } + dprint(DBG_BNODE_MOD, "\n"); +} + +void hfs_bnode_unlink(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct hfs_bnode *tmp; + __be32 cnid; + + tree = node->tree; + if (node->prev) { + tmp = hfs_bnode_find(tree, node->prev); + if (IS_ERR(tmp)) + return; + tmp->next = node->next; + cnid = cpu_to_be32(tmp->next); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_head = node->next; + + if (node->next) { + tmp = hfs_bnode_find(tree, node->next); + if (IS_ERR(tmp)) + return; + tmp->prev = node->prev; + cnid = cpu_to_be32(tmp->prev); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_tail = node->prev; + + /* move down? 
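hfs_bnode_copy() and hfs_bnode_move() above must handle ranges whose source and destination start at different offsets within their pages, so each step copies only up to the nearer page boundary before advancing that side. A simplified standalone model of that stepping (plain byte arrays instead of kmap'd pages; the 4096-byte page size and all names are assumptions, not part of this patch):

#include <stddef.h>
#include <string.h>

#define EX_PAGE_SIZE 4096

/* Copy 'len' bytes between two arrays of pages; both offsets must start
 * below EX_PAGE_SIZE.  Each iteration stops at whichever page boundary
 * comes first, then moves that side on to its next page. */
static void example_copy_unaligned(unsigned char **dst_pages, size_t dst_off,
				   unsigned char **src_pages, size_t src_off,
				   size_t len)
{
	while (len) {
		size_t edge = src_off > dst_off ? src_off : dst_off;
		size_t l = EX_PAGE_SIZE - edge;

		if (l > len)
			l = len;
		memcpy(*dst_pages + dst_off, *src_pages + src_off, l);
		src_off += l;
		dst_off += l;
		if (src_off == EX_PAGE_SIZE) {
			src_pages++;
			src_off = 0;
		}
		if (dst_off == EX_PAGE_SIZE) {
			dst_pages++;
			dst_off = 0;
		}
		len -= l;
	}
}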
*/ + if (!node->prev && !node->next) + dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n"); + if (!node->parent) { + tree->root = 0; + tree->depth = 0; + } + set_bit(HFS_BNODE_DELETED, &node->flags); +} + +static inline int hfs_bnode_hash(u32 num) +{ + num = (num >> 16) + num; + num += num >> 8; + return num & (NODE_HASH_SIZE - 1); +} + +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) +{ + struct hfs_bnode *node; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node " + "%d in B*Tree\n", + cnid); + return NULL; + } + + for (node = tree->node_hash[hfs_bnode_hash(cnid)]; + node; node = node->next_hash) + if (node->this == cnid) + return node; + return NULL; +} + +static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) +{ + struct super_block *sb; + struct hfs_bnode *node, *node2; + struct address_space *mapping; + struct page *page; + int size, block, i, hash; + loff_t off; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node " + "%d in B*Tree\n", + cnid); + return NULL; + } + + sb = tree->inode->i_sb; + size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * + sizeof(struct page *); + node = kzalloc(size, GFP_KERNEL); + if (!node) + return NULL; + node->tree = tree; + node->this = cnid; + set_bit(HFS_BNODE_NEW, &node->flags); + atomic_set(&node->refcnt, 1); + dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); + init_waitqueue_head(&node->lock_wq); + spin_lock(&tree->hash_lock); + node2 = hfs_bnode_findhash(tree, cnid); + if (!node2) { + hash = hfs_bnode_hash(cnid); + node->next_hash = tree->node_hash[hash]; + tree->node_hash[hash] = node; + tree->node_hash_cnt++; + } else { + spin_unlock(&tree->hash_lock); + kfree(node); + wait_event(node2->lock_wq, + !test_bit(HFS_BNODE_NEW, &node2->flags)); + return node2; + } + spin_unlock(&tree->hash_lock); + + mapping = tree->inode->i_mapping; + off = (loff_t)cnid << tree->node_size_shift; + block = off >> PAGE_CACHE_SHIFT; + node->page_offset = off & ~PAGE_CACHE_MASK; + for (i = 0; i < tree->pages_per_bnode; block++, i++) { + page = read_mapping_page(mapping, block, NULL); + if (IS_ERR(page)) + goto fail; + if (PageError(page)) { + page_cache_release(page); + goto fail; + } + page_cache_release(page); + node->page[i] = page; + } + + return node; +fail: + set_bit(HFS_BNODE_ERROR, &node->flags); + return node; +} + +void hfs_bnode_unhash(struct hfs_bnode *node) +{ + struct hfs_bnode **p; + + dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; + *p && *p != node; p = &(*p)->next_hash) + ; + BUG_ON(!*p); + *p = node->next_hash; + node->tree->node_hash_cnt--; +} + +/* Load a particular node out of a tree */ +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct hfs_bnode_desc *desc; + int i, rec_off, off, next_off; + int entry_size, key_size; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + if (node) { + hfs_bnode_get(node); + spin_unlock(&tree->hash_lock); + wait_event(node->lock_wq, + !test_bit(HFS_BNODE_NEW, &node->flags)); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + return node; + } + spin_unlock(&tree->hash_lock); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + if (!test_bit(HFS_BNODE_NEW, 
&node->flags)) + return node; + + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + + node->page_offset); + node->prev = be32_to_cpu(desc->prev); + node->next = be32_to_cpu(desc->next); + node->num_recs = be16_to_cpu(desc->num_recs); + node->type = desc->type; + node->height = desc->height; + kunmap(node->page[0]); + + switch (node->type) { + case HFS_NODE_HEADER: + case HFS_NODE_MAP: + if (node->height != 0) + goto node_error; + break; + case HFS_NODE_LEAF: + if (node->height != 1) + goto node_error; + break; + case HFS_NODE_INDEX: + if (node->height <= 1 || node->height > tree->depth) + goto node_error; + break; + default: + goto node_error; + } + + rec_off = tree->node_size - 2; + off = hfs_bnode_read_u16(node, rec_off); + if (off != sizeof(struct hfs_bnode_desc)) + goto node_error; + for (i = 1; i <= node->num_recs; off = next_off, i++) { + rec_off -= 2; + next_off = hfs_bnode_read_u16(node, rec_off); + if (next_off <= off || + next_off > tree->node_size || + next_off & 1) + goto node_error; + entry_size = next_off - off; + if (node->type != HFS_NODE_INDEX && + node->type != HFS_NODE_LEAF) + continue; + key_size = hfs_bnode_read_u16(node, off) + 2; + if (key_size >= entry_size || key_size & 1) + goto node_error; + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + return node; + +node_error: + set_bit(HFS_BNODE_ERROR, &node->flags); + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + hfs_bnode_put(node); + return ERR_PTR(-EIO); +} + +void hfs_bnode_free(struct hfs_bnode *node) +{ +#if 0 + int i; + + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); +#endif + kfree(node); +} + +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct page **pagep; + int i; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + spin_unlock(&tree->hash_lock); + if (node) { + printk(KERN_CRIT "new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + + pagep = node->page; + memset(kmap(*pagep) + node->page_offset, 0, + min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + set_page_dirty(*pagep); + kunmap(*pagep); + for (i = 1; i < tree->pages_per_bnode; i++) { + memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + set_page_dirty(*pagep); + kunmap(*pagep); + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + + return node; +} + +void hfs_bnode_get(struct hfs_bnode *node) +{ + if (node) { + atomic_inc(&node->refcnt); + dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + } +} + +/* Dispose of resources used by a node */ +void hfs_bnode_put(struct hfs_bnode *node) +{ + if (node) { + struct hfs_btree *tree = node->tree; + int i; + + dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + BUG_ON(!atomic_read(&node->refcnt)); + if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) + return; + for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; + mark_page_accessed(node->page[i]); + } + + if (test_bit(HFS_BNODE_DELETED, &node->flags)) { + hfs_bnode_unhash(node); + spin_unlock(&tree->hash_lock); + hfs_bmap_free(node); + hfs_bnode_free(node); + return; + } + spin_unlock(&tree->hash_lock); + } 
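hfs_bnode_get()/hfs_bnode_put() above use the classic atomic_dec_and_lock() pattern: only the caller that drops the last reference acquires the hash lock and is responsible for tearing the node down. A minimal standalone sketch of that pattern (function and parameter names are invented; not part of this patch):

#include <linux/atomic.h>
#include <linux/spinlock.h>

static void example_put(atomic_t *refcnt, spinlock_t *lock)
{
	/* Nonzero only when the count hit zero, in which case 'lock' is held. */
	if (!atomic_dec_and_lock(refcnt, lock))
		return;
	/* Last reference dropped: safe to unhash/free the object here. */
	spin_unlock(lock);
}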
+} + diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c new file mode 100644 index 00000000..2312de34 --- /dev/null +++ b/fs/hfsplus/brec.c @@ -0,0 +1,516 @@ +/* + * linux/fs/hfsplus/brec.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle individual btree records + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); +static int hfs_brec_update_parent(struct hfs_find_data *fd); +static int hfs_btree_inc_height(struct hfs_btree *); + +/* Get the length and offset of the given record in the given node */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) +{ + __be16 retval[2]; + u16 dataoff; + + dataoff = node->tree->node_size - (rec + 2) * 2; + hfs_bnode_read(node, retval, dataoff, 4); + *off = be16_to_cpu(retval[1]); + return be16_to_cpu(retval[0]) - *off; +} + +/* Get the length of the key from a keyed record */ +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) +{ + u16 retval, recoff; + + if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) + return 0; + + if ((node->type == HFS_NODE_INDEX) && + !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + retval = node->tree->max_key_len + 2; + } else { + recoff = hfs_bnode_read_u16(node, + node->tree->node_size - (rec + 1) * 2); + if (!recoff) + return 0; + + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } + } + return retval; +} + +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node; + int size, key_len, rec; + int data_off, end_off; + int idx_rec_off, data_rec_off, end_rec_off; + __be32 cnid; + + tree = fd->tree; + if (!fd->bnode) { + if (!tree->root) + hfs_btree_inc_height(tree); + fd->bnode = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(fd->bnode)) + return PTR_ERR(fd->bnode); + fd->record = -1; + } + new_node = NULL; + key_len = be16_to_cpu(fd->search_key->key_len) + 2; +again: + /* new record idx and complete record size */ + rec = fd->record + 1; + size = key_len + entry_len; + + node = fd->bnode; + hfs_bnode_dump(node); + /* get last offset */ + end_rec_off = tree->node_size - (node->num_recs + 1) * 2; + end_off = hfs_bnode_read_u16(node, end_rec_off); + end_rec_off -= 2; + dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); + if (size > end_rec_off - end_off) { + if (new_node) + panic("not enough room!\n"); + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + goto again; + } + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count++; + mark_inode_dirty(tree->inode); + } + node->num_recs++; + /* write new last offset */ + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); + hfs_bnode_write_u16(node, end_rec_off, end_off + size); + data_off = end_off; + data_rec_off = end_rec_off + 2; + idx_rec_off = tree->node_size - (rec + 1) * 2; + if (idx_rec_off == data_rec_off) + goto skip; + /* move all following entries */ + do { + data_off = hfs_bnode_read_u16(node, data_rec_off + 2); + hfs_bnode_write_u16(node, data_rec_off, data_off + size); + data_rec_off += 2; + } while (data_rec_off < idx_rec_off); + + /* move data away */ + hfs_bnode_move(node, data_off + size, data_off, + end_off - data_off); + +skip: + hfs_bnode_write(node, fd->search_key, data_off, 
key_len); + hfs_bnode_write(node, entry, data_off + key_len, entry_len); + hfs_bnode_dump(node); + + if (new_node) { + /* update parent key if we inserted a key + * at the start of the first node + */ + if (!rec && new_node != node) + hfs_brec_update_parent(fd); + + hfs_bnode_put(fd->bnode); + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } + fd->bnode = hfs_bnode_find(tree, new_node->parent); + + /* create index data entry */ + cnid = cpu_to_be32(new_node->this); + entry = &cnid; + entry_len = sizeof(cnid); + + /* get index key */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + __hfs_brec_find(fd->bnode, fd); + + hfs_bnode_put(new_node); + new_node = NULL; + + if (tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = be16_to_cpu(fd->search_key->key_len) + 2; + else { + fd->search_key->key_len = + cpu_to_be16(tree->max_key_len); + key_len = tree->max_key_len + 2; + } + goto again; + } + + if (!rec) + hfs_brec_update_parent(fd); + + return 0; +} + +int hfs_brec_remove(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *parent; + int end_off, rec_off, data_off, size; + + tree = fd->tree; + node = fd->bnode; +again: + rec_off = tree->node_size - (fd->record + 2) * 2; + end_off = tree->node_size - (node->num_recs + 1) * 2; + + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count--; + mark_inode_dirty(tree->inode); + } + hfs_bnode_dump(node); + dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); + if (!--node->num_recs) { + hfs_bnode_unlink(node); + if (!node->parent) + return 0; + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + hfs_bnode_put(node); + node = fd->bnode = parent; + + __hfs_brec_find(node, fd); + goto again; + } + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); + + if (rec_off == end_off) + goto skip; + size = fd->keylength + fd->entrylength; + + do { + data_off = hfs_bnode_read_u16(node, rec_off); + hfs_bnode_write_u16(node, rec_off + 2, data_off - size); + rec_off -= 2; + } while (rec_off >= end_off); + + /* fill hole */ + hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, + data_off - fd->keyoffset - size); +skip: + hfs_bnode_dump(node); + if (!fd->record) + hfs_brec_update_parent(fd); + return 0; +} + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *next_node; + struct hfs_bnode_desc node_desc; + int num_recs, new_rec_off, new_off, old_rec_off; + int data_start, data_end, size; + + tree = fd->tree; + node = fd->bnode; + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) + return new_node; + hfs_bnode_get(node); + dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + node->this, new_node->this, node->next); + new_node->next = node->next; + new_node->prev = node->this; + new_node->parent = node->parent; + new_node->type = node->type; + new_node->height = node->height; + + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + + size = tree->node_size / 2 - node->num_recs * 2 - 14; + old_rec_off = tree->node_size - 4; + num_recs = 1; + for (;;) { + data_start = hfs_bnode_read_u16(node, old_rec_off); + if (data_start > size) + break; + old_rec_off -= 2; + if (++num_recs < node->num_recs) + continue; + /* panic? 
*/ + hfs_bnode_put(node); + hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); + return ERR_PTR(-ENOSPC); + } + + if (fd->record + 1 < num_recs) { + /* new record is in the lower half, + * so leave some more space there + */ + old_rec_off += 2; + num_recs--; + data_start = hfs_bnode_read_u16(node, old_rec_off); + } else { + hfs_bnode_put(node); + hfs_bnode_get(new_node); + fd->bnode = new_node; + fd->record -= num_recs; + fd->keyoffset -= data_start - 14; + fd->entryoffset -= data_start - 14; + } + new_node->num_recs = node->num_recs - num_recs; + node->num_recs = num_recs; + + new_rec_off = tree->node_size - 2; + new_off = 14; + size = data_start - new_off; + num_recs = new_node->num_recs; + data_end = data_start; + while (num_recs) { + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + old_rec_off -= 2; + new_rec_off -= 2; + data_end = hfs_bnode_read_u16(node, old_rec_off); + new_off = data_end - size; + num_recs--; + } + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); + + /* update new bnode header */ + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + /* update previous bnode header */ + node->next = new_node->this; + hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); + node_desc.next = cpu_to_be32(node->next); + node_desc.num_recs = cpu_to_be16(node->num_recs); + hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); + + /* update next bnode header */ + if (next_node) { + next_node->prev = new_node->this; + hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); + node_desc.prev = cpu_to_be32(next_node->prev); + hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); + hfs_bnode_put(next_node); + } else if (node->this == tree->leaf_tail) { + /* if there is no next node, this might be the new tail */ + tree->leaf_tail = new_node->this; + mark_inode_dirty(tree->inode); + } + + hfs_bnode_dump(node); + hfs_bnode_dump(new_node); + hfs_bnode_put(node); + + return new_node; +} + +static int hfs_brec_update_parent(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *parent; + int newkeylen, diff; + int rec, rec_off, end_rec_off; + int start_off, end_off; + + tree = fd->tree; + node = fd->bnode; + new_node = NULL; + if (!node->parent) + return 0; + +again: + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + __hfs_brec_find(parent, fd); + hfs_bnode_dump(parent); + rec = fd->record; + + /* size difference between old and new key */ + if (tree->attributes & HFS_TREE_VARIDXKEYS) + newkeylen = hfs_bnode_read_u16(node, 14) + 2; + else + fd->keylength = newkeylen = tree->max_key_len + 2; + dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); + + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + diff = newkeylen - fd->keylength; + if (!diff) + goto skip; + if (diff > 0) { + end_off = hfs_bnode_read_u16(parent, end_rec_off); + if (end_rec_off - end_off < diff) { + + dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n"); + fd->bnode = parent; + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + parent = fd->bnode; + rec 
= fd->record; + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - + (parent->num_recs + 1) * 2; + } + } + + end_off = start_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, start_off + diff); + start_off -= 4; /* move previous cnid too */ + + while (rec_off > end_rec_off) { + rec_off -= 2; + end_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, end_off + diff); + } + hfs_bnode_move(parent, start_off + diff, start_off, + end_off - start_off); +skip: + hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); + hfs_bnode_dump(parent); + + hfs_bnode_put(node); + node = parent; + + if (new_node) { + __be32 cnid; + + fd->bnode = hfs_bnode_find(tree, new_node->parent); + /* create index key and entry */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + cnid = cpu_to_be32(new_node->this); + + __hfs_brec_find(fd->bnode, fd); + hfs_brec_insert(fd, &cnid, sizeof(cnid)); + hfs_bnode_put(fd->bnode); + hfs_bnode_put(new_node); + + if (!rec) { + if (new_node == node) + goto out; + /* restore search_key */ + hfs_bnode_read_key(node, fd->search_key, 14); + } + } + + if (!rec && node->parent) + goto again; +out: + fd->bnode = node; + return 0; +} + +static int hfs_btree_inc_height(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *new_node; + struct hfs_bnode_desc node_desc; + int key_size, rec; + __be32 cnid; + + node = NULL; + if (tree->root) { + node = hfs_bnode_find(tree, tree->root); + if (IS_ERR(node)) + return PTR_ERR(node); + } + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) { + hfs_bnode_put(node); + return PTR_ERR(new_node); + } + + tree->root = new_node->this; + if (!tree->depth) { + tree->leaf_head = tree->leaf_tail = new_node->this; + new_node->type = HFS_NODE_LEAF; + new_node->num_recs = 0; + } else { + new_node->type = HFS_NODE_INDEX; + new_node->num_recs = 1; + } + new_node->parent = 0; + new_node->next = 0; + new_node->prev = 0; + new_node->height = ++tree->depth; + + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + rec = tree->node_size - 2; + hfs_bnode_write_u16(new_node, rec, 14); + + if (node) { + /* insert old root idx into new root */ + node->parent = tree->root; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_size = hfs_bnode_read_u16(node, 14) + 2; + else + key_size = tree->max_key_len + 2; + hfs_bnode_copy(new_node, 14, node, 14, key_size); + + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + key_size = tree->max_key_len + 2; + hfs_bnode_write_u16(new_node, 14, tree->max_key_len); + } + cnid = cpu_to_be32(node->this); + hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); + + rec -= 2; + hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); + + hfs_bnode_put(node); + } + hfs_bnode_put(new_node); + mark_inode_dirty(tree->inode); + + return 0; +} diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c new file mode 100644 index 00000000..21023d9f --- /dev/null +++ b/fs/hfsplus/btree.c @@ -0,0 +1,377 @@ +/* + * linux/fs/hfsplus/btree.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle opening/closing btree + */ + +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + + +/* Get a 
reference to a B*Tree and do some initial checks */ +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) +{ + struct hfs_btree *tree; + struct hfs_btree_header_rec *head; + struct address_space *mapping; + struct inode *inode; + struct page *page; + unsigned int size; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) + return NULL; + + mutex_init(&tree->tree_lock); + spin_lock_init(&tree->hash_lock); + tree->sb = sb; + tree->cnid = id; + inode = hfsplus_iget(sb, id); + if (IS_ERR(inode)) + goto free_tree; + tree->inode = inode; + + if (!HFSPLUS_I(tree->inode)->first_blocks) { + printk(KERN_ERR + "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + + mapping = tree->inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + goto free_inode; + + /* Load the header */ + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); + tree->root = be32_to_cpu(head->root); + tree->leaf_count = be32_to_cpu(head->leaf_count); + tree->leaf_head = be32_to_cpu(head->leaf_head); + tree->leaf_tail = be32_to_cpu(head->leaf_tail); + tree->node_count = be32_to_cpu(head->node_count); + tree->free_nodes = be32_to_cpu(head->free_nodes); + tree->attributes = be32_to_cpu(head->attributes); + tree->node_size = be16_to_cpu(head->node_size); + tree->max_key_len = be16_to_cpu(head->max_key_len); + tree->depth = be16_to_cpu(head->depth); + + /* Verify the tree and set the correct compare function */ + switch (id) { + case HFSPLUS_EXT_CNID: + if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (tree->attributes & HFS_TREE_VARIDXKEYS) { + printk(KERN_ERR "hfs: invalid extent btree flag\n"); + goto fail_page; + } + + tree->keycmp = hfsplus_ext_cmp_key; + break; + case HFSPLUS_CAT_CNID: + if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + goto fail_page; + } + + if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && + (head->key_type == HFSPLUS_KEY_BINARY)) + tree->keycmp = hfsplus_cat_bin_cmp_key; + else { + tree->keycmp = hfsplus_cat_case_cmp_key; + set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + } + break; + default: + printk(KERN_ERR "hfs: unknown B*Tree requested\n"); + goto fail_page; + } + + if (!(tree->attributes & HFS_TREE_BIGKEYS)) { + printk(KERN_ERR "hfs: invalid btree flag\n"); + goto fail_page; + } + + size = tree->node_size; + if (!is_power_of_2(size)) + goto fail_page; + if (!tree->node_count) + goto fail_page; + + tree->node_size_shift = ffs(size) - 1; + + tree->pages_per_bnode = + (tree->node_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + kunmap(page); + page_cache_release(page); + return tree; + + fail_page: + page_cache_release(page); + free_inode: + tree->inode->i_mapping->a_ops = &hfsplus_aops; + iput(tree->inode); + free_tree: + kfree(tree); + return NULL; +} + +/* Release resources used by a btree */ +void hfs_btree_close(struct hfs_btree *tree) +{ + struct hfs_bnode *node; + int i; + + if (!tree) + return; + + for (i = 0; i < NODE_HASH_SIZE; i++) { + while ((node = tree->node_hash[i])) { + tree->node_hash[i] = node->next_hash; + if (atomic_read(&node->refcnt)) + printk(KERN_CRIT "hfs: node %d:%d " + "still has %d user(s)!\n", + node->tree->cnid, 
node->this, + atomic_read(&node->refcnt)); + hfs_bnode_free(node); + tree->node_hash_cnt--; + } + } + iput(tree->inode); + kfree(tree); +} + +void hfs_btree_write(struct hfs_btree *tree) +{ + struct hfs_btree_header_rec *head; + struct hfs_bnode *node; + struct page *page; + + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + /* panic? */ + return; + /* Load the header */ + page = node->page[0]; + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); + + head->root = cpu_to_be32(tree->root); + head->leaf_count = cpu_to_be32(tree->leaf_count); + head->leaf_head = cpu_to_be32(tree->leaf_head); + head->leaf_tail = cpu_to_be32(tree->leaf_tail); + head->node_count = cpu_to_be32(tree->node_count); + head->free_nodes = cpu_to_be32(tree->free_nodes); + head->attributes = cpu_to_be32(tree->attributes); + head->depth = cpu_to_be16(tree->depth); + + kunmap(page); + set_page_dirty(page); + hfs_bnode_put(node); +} + +static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) +{ + struct hfs_btree *tree = prev->tree; + struct hfs_bnode *node; + struct hfs_bnode_desc desc; + __be32 cnid; + + node = hfs_bnode_create(tree, idx); + if (IS_ERR(node)) + return node; + + tree->free_nodes--; + prev->next = idx; + cnid = cpu_to_be32(idx); + hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + + node->type = HFS_NODE_MAP; + node->num_recs = 1; + hfs_bnode_clear(node, 0, tree->node_size); + desc.next = 0; + desc.prev = 0; + desc.type = HFS_NODE_MAP; + desc.height = 0; + desc.num_recs = cpu_to_be16(1); + desc.reserved = 0; + hfs_bnode_write(node, &desc, 0, sizeof(desc)); + hfs_bnode_write_u16(node, 14, 0x8000); + hfs_bnode_write_u16(node, tree->node_size - 2, 14); + hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); + + return node; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i; + + while (!tree->free_nodes) { + struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int res; + + res = hfsplus_file_extend(inode); + if (res) + return ERR_PTR(res); + hip->phys_size = inode->i_size = + (loff_t)hip->alloc_blocks << + HFSPLUS_SB(tree->sb)->alloc_blksz_shift; + hip->fs_blocks = + hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; + inode_set_bytes(inode, inode->i_size); + count = inode->i_size >> tree->node_size_shift; + tree->free_nodes = count - tree->node_count; + tree->node_count = count; + } + + nidx = 0; + node = hfs_bnode_find(tree, nidx); + if (IS_ERR(node)) + return node; + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + idx = 0; + + for (;;) { + while (len) { + byte = data[off]; + if (byte != 0xff) { + for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { + if (!(byte & m)) { + idx += i; + data[off] |= m; + set_page_dirty(*pagep); + kunmap(*pagep); + tree->free_nodes--; + mark_inode_dirty(tree->inode); + hfs_bnode_put(node); + return hfs_bnode_create(tree, + idx); + } + } + } + if (++off >= PAGE_CACHE_SIZE) { + kunmap(*pagep); + data = kmap(*++pagep); + off = 0; + } + idx += 8; + len--; + } + kunmap(*pagep); + nidx = node->next; + if (!nidx) { + dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n"); + next_node = hfs_bmap_new_bmap(node, idx); + } else + next_node = 
hfs_bnode_find(tree, nidx); + hfs_bnode_put(node); + if (IS_ERR(next_node)) + return next_node; + node = next_node; + + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + } +} + +void hfs_bmap_free(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct page *page; + u16 off, len; + u32 nidx; + u8 *data, byte, m; + + dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + BUG_ON(!node->this); + tree = node->tree; + nidx = node->this; + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + return; + len = hfs_brec_lenoff(node, 2, &off); + while (nidx >= len * 8) { + u32 i; + + nidx -= len * 8; + i = node->next; + hfs_bnode_put(node); + if (!i) { + /* panic */; + printk(KERN_CRIT "hfs: unable to free bnode %u. " + "bmap not found!\n", + node->this); + return; + } + node = hfs_bnode_find(tree, i); + if (IS_ERR(node)) + return; + if (node->type != HFS_NODE_MAP) { + /* panic */; + printk(KERN_CRIT "hfs: invalid bmap found! " + "(%u,%d)\n", + node->this, node->type); + hfs_bnode_put(node); + return; + } + len = hfs_brec_lenoff(node, 0, &off); + } + off += node->page_offset + nidx / 8; + page = node->page[off >> PAGE_CACHE_SHIFT]; + data = kmap(page); + off &= ~PAGE_CACHE_MASK; + m = 1 << (~nidx & 7); + byte = data[off]; + if (!(byte & m)) { + printk(KERN_CRIT "hfs: trying to free free bnode " + "%u(%d)\n", + node->this, node->type); + kunmap(page); + hfs_bnode_put(node); + return; + } + data[off] = byte & ~m; + set_page_dirty(page); + kunmap(page); + hfs_bnode_put(node); + tree->free_nodes++; + mark_inode_dirty(tree->inode); +} diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c new file mode 100644 index 00000000..408073ae --- /dev/null +++ b/fs/hfsplus/catalog.c @@ -0,0 +1,425 @@ +/* + * linux/fs/hfsplus/catalog.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of catalog records + */ + + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1p, k2p; + + k1p = k1->cat.parent; + k2p = k2->cat.parent; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfsplus_strcasecmp(&k1->cat.name, &k2->cat.name); +} + +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1p, k2p; + + k1p = k1->cat.parent; + k2p = k2->cat.parent; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? 
-1 : 1; + + return hfsplus_strcmp(&k1->cat.name, &k2->cat.name); +} + +void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 parent, struct qstr *str) +{ + int len; + + key->cat.parent = cpu_to_be32(parent); + if (str) { + hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len); + len = be16_to_cpu(key->cat.name.length); + } else { + key->cat.name.length = 0; + len = 0; + } + key->key_len = cpu_to_be16(6 + 2 * len); +} + +static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, + struct hfsplus_unistr *name) +{ + int ustrlen; + + ustrlen = be16_to_cpu(name->length); + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = cpu_to_be16(ustrlen); + ustrlen *= 2; + memcpy(key->cat.name.unicode, name->unicode, ustrlen); + key->key_len = cpu_to_be16(6 + ustrlen); +} + +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) +{ + if (inode->i_flags & S_IMMUTABLE) + perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; + else + perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE; + if (inode->i_flags & S_APPEND) + perms->rootflags |= HFSPLUS_FLG_APPEND; + else + perms->rootflags &= ~HFSPLUS_FLG_APPEND; + + perms->userflags = HFSPLUS_I(inode)->userflags; + perms->mode = cpu_to_be16(inode->i_mode); + perms->owner = cpu_to_be32(inode->i_uid); + perms->group = cpu_to_be32(inode->i_gid); + + if (S_ISREG(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_nlink); + else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_rdev); + else + perms->dev = 0; +} + +static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, + u32 cnid, struct inode *inode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + + if (S_ISDIR(inode->i_mode)) { + struct hfsplus_cat_folder *folder; + + folder = &entry->folder; + memset(folder, 0, sizeof(*folder)); + folder->type = cpu_to_be16(HFSPLUS_FOLDER); + folder->id = cpu_to_be32(inode->i_ino); + HFSPLUS_I(inode)->create_date = + folder->create_date = + folder->content_mod_date = + folder->attribute_mod_date = + folder->access_date = hfsp_now2mt(); + hfsplus_cat_set_perms(inode, &folder->permissions); + if (inode == sbi->hidden_dir) + /* invisible and namelocked */ + folder->user_info.frFlags = cpu_to_be16(0x5000); + return sizeof(*folder); + } else { + struct hfsplus_cat_file *file; + + file = &entry->file; + memset(file, 0, sizeof(*file)); + file->type = cpu_to_be16(HFSPLUS_FILE); + file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); + file->id = cpu_to_be32(cnid); + HFSPLUS_I(inode)->create_date = + file->create_date = + file->content_mod_date = + file->attribute_mod_date = + file->access_date = hfsp_now2mt(); + if (cnid == inode->i_ino) { + hfsplus_cat_set_perms(inode, &file->permissions); + if (S_ISLNK(inode->i_mode)) { + file->user_info.fdType = + cpu_to_be32(HFSP_SYMLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_SYMLINK_CREATOR); + } else { + file->user_info.fdType = + cpu_to_be32(sbi->type); + file->user_info.fdCreator = + cpu_to_be32(sbi->creator); + } + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= + cpu_to_be16(HFSPLUS_FILE_LOCKED); + } else { + file->user_info.fdType = + cpu_to_be32(HFSP_HARDLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_HFSPLUS_CREATOR); + file->user_info.fdFlags = + cpu_to_be16(0x100); + file->create_date = + HFSPLUS_I(sbi->hidden_dir)->create_date; + file->permissions.dev = + cpu_to_be32(HFSPLUS_I(inode)->linkid); + } + return sizeof(*file); + } +} + 
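/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * The catalog code in this file builds keys and thread records with
 * hard-coded length arithmetic.  From the code visible here,
 * hfsplus_cat_build_key() sets key_len to "6 + 2 * len": 4 bytes of
 * parent CNID plus the 2-byte Unicode name-length field, followed by
 * the UTF-16 name payload (2 bytes per unit).  Likewise,
 * hfsplus_fill_cat_thread() below returns "10 + nodeName.length * 2",
 * which appears to cover the 2-byte record type, 2 reserved bytes, the
 * 4-byte parent CNID and the 2-byte name-length field ahead of the
 * name itself (the exact on-disk layout lives in hfsplus_raw.h, which
 * is not shown here).  The helper below is a hypothetical, stand-alone
 * restatement of that arithmetic, added only to make the magic numbers
 * easier to follow; it is not used by the filesystem code.
 */
static inline unsigned int hfsplus_cat_key_len_example(unsigned int name_units)
{
	/* parent CNID (4) + name length field (2) + UTF-16 name payload */
	return 4 + 2 + 2 * name_units;
}

static inline unsigned int hfsplus_cat_thread_len_example(unsigned int name_units)
{
	/* type (2) + reserved (2) + parent CNID (4) + name length field (2)
	 * + UTF-16 name payload
	 */
	return 2 + 2 + 4 + 2 + 2 * name_units;
}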
+static int hfsplus_fill_cat_thread(struct super_block *sb, + hfsplus_cat_entry *entry, int type, + u32 parentid, struct qstr *str) +{ + entry->type = cpu_to_be16(type); + entry->thread.reserved = 0; + entry->thread.parentID = cpu_to_be32(parentid); + hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len); + return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; +} + +/* Try to get a catalog entry for given catalog id */ +int hfsplus_find_cat(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd) +{ + hfsplus_cat_entry tmp; + int err; + u16 type; + + hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL); + err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry)); + if (err) + return err; + + type = be16_to_cpu(tmp.type); + if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) { + printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + return -EIO; + } + + if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { + printk(KERN_ERR "hfs: catalog name length corrupted\n"); + return -EIO; + } + + hfsplus_cat_build_key_uni(fd->search_key, + be32_to_cpu(tmp.thread.parentID), + &tmp.thread.nodeName); + return hfs_brec_find(fd); +} + +int hfsplus_create_cat(u32 cnid, struct inode *dir, + struct qstr *str, struct inode *inode) +{ + struct super_block *sb = dir->i_sb; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + int entry_size; + int err; + + dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + entry_size = hfsplus_fill_cat_thread(sb, &entry, + S_ISDIR(inode->i_mode) ? + HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, + dir->i_ino, str); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto err2; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err2; + + hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + entry_size = hfsplus_cat_build_record(&entry, cnid, inode); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + /* panic? */ + if (!err) + err = -EEXIST; + goto err1; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err1; + + dir->i_size++; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + + hfs_find_exit(&fd); + return 0; + +err1: + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + if (!hfs_brec_find(&fd)) + hfs_brec_remove(&fd); +err2: + hfs_find_exit(&fd); + return err; +} + +int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) +{ + struct super_block *sb = dir->i_sb; + struct hfs_find_data fd; + struct hfsplus_fork_raw fork; + struct list_head *pos; + int err, off; + u16 type; + + dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", + str ? 
str->name : NULL, cnid); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + + if (!str) { + int len; + + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + off = fd.entryoffset + + offsetof(struct hfsplus_cat_thread, nodeName); + fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.length, off, 2); + len = be16_to_cpu(fd.search_key->cat.name.length) * 2; + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.unicode, + off + 2, len); + fd.search_key->key_len = cpu_to_be16(6 + len); + } else + hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + + err = hfs_brec_find(&fd); + if (err) + goto out; + + type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + if (type == HFSPLUS_FILE) { +#if 0 + off = fd.entryoffset + offsetof(hfsplus_cat_file, data_fork); + hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); + hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); +#endif + + off = fd.entryoffset + + offsetof(struct hfsplus_cat_file, rsrc_fork); + hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); + hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); + } + + list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { + struct hfsplus_readdir_data *rd = + list_entry(pos, struct hfsplus_readdir_data, list); + if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) + rd->file->f_pos--; + } + + err = hfs_brec_remove(&fd); + if (err) + goto out; + + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + err = hfs_brec_remove(&fd); + if (err) + goto out; + + dir->i_size--; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); +out: + hfs_find_exit(&fd); + + return err; +} + +int hfsplus_rename_cat(u32 cnid, + struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name) +{ + struct super_block *sb = src_dir->i_sb; + struct hfs_find_data src_fd, dst_fd; + hfsplus_cat_entry entry; + int entry_size, type; + int err = 0; + + dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, + dst_dir->i_ino, dst_name->name); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + dst_fd = src_fd; + + /* find the old dir entry and read the data */ + hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, + src_fd.entrylength); + + /* create new dir entry with the data from the old entry */ + hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + + err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); + if (err) + goto out; + dst_dir->i_size++; + dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + + /* finally remove the old entry */ + hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + src_dir->i_size--; + src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + + /* remove old thread entry */ + hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + type = 
hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset); + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + + /* create new thread entry */ + hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + entry_size = hfsplus_fill_cat_thread(sb, &entry, type, + dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + err = hfs_brec_insert(&dst_fd, &entry, entry_size); + + hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY); + hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY); +out: + hfs_bnode_put(dst_fd.bnode); + hfs_find_exit(&src_fd); + return err; +} diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c new file mode 100644 index 00000000..159f5ebf --- /dev/null +++ b/fs/hfsplus/dir.c @@ -0,0 +1,516 @@ +/* + * linux/fs/hfsplus/dir.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of directories + */ + +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static inline void hfsplus_instantiate(struct dentry *dentry, + struct inode *inode, u32 cnid) +{ + dentry->d_fsdata = (void *)(unsigned long)cnid; + d_instantiate(dentry, inode); +} + +/* Find the entry inside dir named dentry->d_name */ +static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + struct hfs_find_data fd; + struct super_block *sb; + hfsplus_cat_entry entry; + int err; + u32 cnid, linkid = 0; + u16 type; + + sb = dir->i_sb; + + dentry->d_fsdata = NULL; + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); +again: + err = hfs_brec_read(&fd, &entry, sizeof(entry)); + if (err) { + if (err == -ENOENT) { + hfs_find_exit(&fd); + /* No such entry */ + inode = NULL; + goto out; + } + goto fail; + } + type = be16_to_cpu(entry.type); + if (type == HFSPLUS_FOLDER) { + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { + err = -EIO; + goto fail; + } + cnid = be32_to_cpu(entry.folder.id); + dentry->d_fsdata = (void *)(unsigned long)cnid; + } else if (type == HFSPLUS_FILE) { + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + err = -EIO; + goto fail; + } + cnid = be32_to_cpu(entry.file.id); + if (entry.file.user_info.fdType == + cpu_to_be32(HFSP_HARDLINK_TYPE) && + entry.file.user_info.fdCreator == + cpu_to_be32(HFSP_HFSPLUS_CREATOR) && + (entry.file.create_date == + HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> + create_date || + entry.file.create_date == + HFSPLUS_I(sb->s_root->d_inode)-> + create_date) && + HFSPLUS_SB(sb)->hidden_dir) { + struct qstr str; + char name[32]; + + if (dentry->d_fsdata) { + /* + * We found a link pointing to another link, + * so ignore it and treat it as regular file. 
+ */ + cnid = (unsigned long)dentry->d_fsdata; + linkid = 0; + } else { + dentry->d_fsdata = (void *)(unsigned long)cnid; + linkid = + be32_to_cpu(entry.file.permissions.dev); + str.len = sprintf(name, "iNode%d", linkid); + str.name = name; + hfsplus_cat_build_key(sb, fd.search_key, + HFSPLUS_SB(sb)->hidden_dir->i_ino, + &str); + goto again; + } + } else if (!dentry->d_fsdata) + dentry->d_fsdata = (void *)(unsigned long)cnid; + } else { + printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n"); + err = -EIO; + goto fail; + } + hfs_find_exit(&fd); + inode = hfsplus_iget(dir->i_sb, cnid); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (S_ISREG(inode->i_mode)) + HFSPLUS_I(inode)->linkid = linkid; +out: + d_add(dentry, inode); + return NULL; +fail: + hfs_find_exit(&fd); + return ERR_PTR(err); +} + +static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + int len, err; + char strbuf[HFSPLUS_MAX_STRLEN + 1]; + hfsplus_cat_entry entry; + struct hfs_find_data fd; + struct hfsplus_readdir_data *rd; + u16 type; + + if (filp->f_pos >= inode->i_size) + return 0; + + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + switch ((u32)filp->f_pos) { + case 0: + /* This is completely artificial... */ + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); + if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { + printk(KERN_ERR "hfs: bad catalog folder thread\n"); + err = -EIO; + goto out; + } + if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { + printk(KERN_ERR "hfs: truncated catalog thread\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, "..", 2, 1, + be32_to_cpu(entry.thread.parentID), DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + default: + if (filp->f_pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, filp->f_pos - 1); + if (err) + goto out; + } + + for (;;) { + if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { + printk(KERN_ERR "hfs: walked past end of dir\n"); + err = -EIO; + goto out; + } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); + type = be16_to_cpu(entry.type); + len = HFSPLUS_MAX_STRLEN; + err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + if (err) + goto out; + if (type == HFSPLUS_FOLDER) { + if (fd.entrylength < + sizeof(struct hfsplus_cat_folder)) { + printk(KERN_ERR "hfs: small dir entry\n"); + err = -EIO; + goto out; + } + if (HFSPLUS_SB(sb)->hidden_dir && + HFSPLUS_SB(sb)->hidden_dir->i_ino == + be32_to_cpu(entry.folder.id)) + goto next; + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.folder.id), DT_DIR)) + break; + } else if (type == HFSPLUS_FILE) { + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + printk(KERN_ERR "hfs: small file entry\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.file.id), DT_REG)) + break; + } else { + printk(KERN_ERR "hfs: bad catalog entry type\n"); + err = -EIO; + goto out; + } +next: + filp->f_pos++; + if (filp->f_pos >= inode->i_size) + goto out; + 
err = hfs_brec_goto(&fd, 1); + if (err) + goto out; + } + rd = filp->private_data; + if (!rd) { + rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); + if (!rd) { + err = -ENOMEM; + goto out; + } + filp->private_data = rd; + rd->file = filp; + list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); + } + memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); +out: + hfs_find_exit(&fd); + return err; +} + +static int hfsplus_dir_release(struct inode *inode, struct file *file) +{ + struct hfsplus_readdir_data *rd = file->private_data; + if (rd) { + mutex_lock(&inode->i_mutex); + list_del(&rd->list); + mutex_unlock(&inode->i_mutex); + kfree(rd); + } + return 0; +} + +static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, + struct dentry *dst_dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); + struct inode *inode = src_dentry->d_inode; + struct inode *src_dir = src_dentry->d_parent->d_inode; + struct qstr str; + char name[32]; + u32 cnid, id; + int res; + + if (HFSPLUS_IS_RSRC(inode)) + return -EPERM; + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { + for (;;) { + get_random_bytes(&id, sizeof(cnid)); + id &= 0x3fffffff; + str.name = name; + str.len = sprintf(name, "iNode%d", id); + res = hfsplus_rename_cat(inode->i_ino, + src_dir, &src_dentry->d_name, + sbi->hidden_dir, &str); + if (!res) + break; + if (res != -EEXIST) + goto out; + } + HFSPLUS_I(inode)->linkid = id; + cnid = sbi->next_cnid++; + src_dentry->d_fsdata = (void *)(unsigned long)cnid; + res = hfsplus_create_cat(cnid, src_dir, + &src_dentry->d_name, inode); + if (res) + /* panic? */ + goto out; + sbi->file_count++; + } + cnid = sbi->next_cnid++; + res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); + if (res) + goto out; + + inc_nlink(inode); + hfsplus_instantiate(dst_dentry, inode, cnid); + ihold(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + sbi->file_count++; + dst_dir->i_sb->s_dirt = 1; +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct qstr str; + char name[32]; + u32 cnid; + int res; + + if (HFSPLUS_IS_RSRC(inode)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + cnid = (u32)(unsigned long)dentry->d_fsdata; + if (inode->i_ino == cnid && + atomic_read(&HFSPLUS_I(inode)->opencnt)) { + str.name = name; + str.len = sprintf(name, "temp%lu", inode->i_ino); + res = hfsplus_rename_cat(inode->i_ino, + dir, &dentry->d_name, + sbi->hidden_dir, &str); + if (!res) { + inode->i_flags |= S_DEAD; + drop_nlink(inode); + } + goto out; + } + res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); + if (res) + goto out; + + if (inode->i_nlink > 0) + drop_nlink(inode); + if (inode->i_ino == cnid) + clear_nlink(inode); + if (!inode->i_nlink) { + if (inode->i_ino != cnid) { + sbi->file_count--; + if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { + res = hfsplus_delete_cat(inode->i_ino, + sbi->hidden_dir, + NULL); + if (!res) + hfsplus_delete_inode(inode); + } else + inode->i_flags |= S_DEAD; + } else + hfsplus_delete_inode(inode); + } else + sbi->file_count--; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct hfsplus_sb_info *sbi = 
HFSPLUS_SB(dir->i_sb); + struct inode *inode = dentry->d_inode; + int res; + + if (inode->i_size != 2) + return -ENOTEMPTY; + + mutex_lock(&sbi->vh_mutex); + res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + if (res) + goto out; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + hfsplus_delete_inode(inode); + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode; + int res = -ENOSPC; + + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + if (!inode) + goto out; + + res = page_symlink(inode, symname, strlen(symname) + 1); + if (res) + goto out_err; + + res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) + goto out_err; + + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); + goto out; + +out_err: + inode->i_nlink = 0; + hfsplus_delete_inode(inode); + iput(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode; + int res = -ENOSPC; + + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, mode); + if (!inode) + goto out; + + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) + init_special_inode(inode, mode, rdev); + + res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + inode->i_nlink = 0; + hfsplus_delete_inode(inode); + iput(inode); + goto out; + } + + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return hfsplus_mknod(dir, dentry, mode, 0); +} + +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); +} + +static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int res; + + /* Unlink destination if it already exists */ + if (new_dentry->d_inode) { + if (S_ISDIR(new_dentry->d_inode->i_mode)) + res = hfsplus_rmdir(new_dir, new_dentry); + else + res = hfsplus_unlink(new_dir, new_dentry); + if (res) + return res; + } + + res = hfsplus_rename_cat((u32)(unsigned long)old_dentry->d_fsdata, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + if (!res) + new_dentry->d_fsdata = old_dentry->d_fsdata; + return res; +} + +const struct inode_operations hfsplus_dir_inode_operations = { + .lookup = hfsplus_lookup, + .create = hfsplus_create, + .link = hfsplus_link, + .unlink = hfsplus_unlink, + .mkdir = hfsplus_mkdir, + .rmdir = hfsplus_rmdir, + .symlink = hfsplus_symlink, + .mknod = hfsplus_mknod, + .rename = hfsplus_rename, +}; + +const struct file_operations hfsplus_dir_operations = { + .fsync = hfsplus_file_fsync, + .read = generic_read_dir, + .readdir = hfsplus_readdir, + .unlocked_ioctl = hfsplus_ioctl, + .llseek = generic_file_llseek, + .release = hfsplus_dir_release, +}; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c new file mode 100644 index 00000000..b1991a2a --- /dev/null +++ b/fs/hfsplus/extents.c @@ -0,0 +1,561 @@ +/* + * linux/fs/hfsplus/extents.c + * + * 
Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of Extents both in catalog and extents overflow trees + */ + +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Compare two extents keys, returns 0 on same, pos/neg for difference */ +int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1id, k2id; + __be32 k1s, k2s; + + k1id = k1->ext.cnid; + k2id = k2->ext.cnid; + if (k1id != k2id) + return be32_to_cpu(k1id) < be32_to_cpu(k2id) ? -1 : 1; + + if (k1->ext.fork_type != k2->ext.fork_type) + return k1->ext.fork_type < k2->ext.fork_type ? -1 : 1; + + k1s = k1->ext.start_block; + k2s = k2->ext.start_block; + if (k1s == k2s) + return 0; + return be32_to_cpu(k1s) < be32_to_cpu(k2s) ? -1 : 1; +} + +static void hfsplus_ext_build_key(hfsplus_btree_key *key, u32 cnid, + u32 block, u8 type) +{ + key->key_len = cpu_to_be16(HFSPLUS_EXT_KEYLEN - 2); + key->ext.cnid = cpu_to_be32(cnid); + key->ext.start_block = cpu_to_be32(block); + key->ext.fork_type = type; + key->ext.pad = 0; +} + +static u32 hfsplus_ext_find_block(struct hfsplus_extent *ext, u32 off) +{ + int i; + u32 count; + + for (i = 0; i < 8; ext++, i++) { + count = be32_to_cpu(ext->block_count); + if (off < count) + return be32_to_cpu(ext->start_block) + off; + off -= count; + } + /* panic? */ + return 0; +} + +static int hfsplus_ext_block_count(struct hfsplus_extent *ext) +{ + int i; + u32 count = 0; + + for (i = 0; i < 8; ext++, i++) + count += be32_to_cpu(ext->block_count); + return count; +} + +static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) +{ + int i; + + ext += 7; + for (i = 0; i < 7; ext--, i++) + if (ext->block_count) + break; + return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); +} + +static void __hfsplus_ext_write_extent(struct inode *inode, + struct hfs_find_data *fd) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res; + + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, + HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + + res = hfs_brec_find(fd); + if (hip->extent_state & HFSPLUS_EXT_NEW) { + if (res != -ENOENT) + return; + hfs_brec_insert(fd, hip->cached_extents, + sizeof(hfsplus_extent_rec)); + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + } else { + if (res) + return; + hfs_bnode_write(fd->bnode, hip->cached_extents, + fd->entryoffset, fd->entrylength); + hip->extent_state &= ~HFSPLUS_EXT_DIRTY; + } + + /* + * We can't just use hfsplus_mark_inode_dirty here, because we + * also get called from hfsplus_write_inode, which should not + * redirty the inode. Instead the callers have to be careful + * to explicily mark the inode dirty, too. 
+ */ + set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); +} + +static void hfsplus_ext_write_extent_locked(struct inode *inode) +{ + if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { + struct hfs_find_data fd; + + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + __hfsplus_ext_write_extent(inode, &fd); + hfs_find_exit(&fd); + } +} + +void hfsplus_ext_write_extent(struct inode *inode) +{ + mutex_lock(&HFSPLUS_I(inode)->extents_lock); + hfsplus_ext_write_extent_locked(inode); + mutex_unlock(&HFSPLUS_I(inode)->extents_lock); +} + +static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, + struct hfsplus_extent *extent, + u32 cnid, u32 block, u8 type) +{ + int res; + + hfsplus_ext_build_key(fd->search_key, cnid, block, type); + fd->key->ext.cnid = 0; + res = hfs_brec_find(fd); + if (res && res != -ENOENT) + return res; + if (fd->key->ext.cnid != fd->search_key->ext.cnid || + fd->key->ext.fork_type != fd->search_key->ext.fork_type) + return -ENOENT; + if (fd->entrylength != sizeof(hfsplus_extent_rec)) + return -EIO; + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, + sizeof(hfsplus_extent_rec)); + return 0; +} + +static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, + struct inode *inode, u32 block) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res; + + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + if (hip->extent_state & HFSPLUS_EXT_DIRTY) + __hfsplus_ext_write_extent(inode, fd); + + res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, + block, HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : + HFSPLUS_TYPE_DATA); + if (!res) { + hip->cached_start = be32_to_cpu(fd->key->ext.start_block); + hip->cached_blocks = + hfsplus_ext_block_count(hip->cached_extents); + } else { + hip->cached_start = hip->cached_blocks = 0; + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + } + return res; +} + +static int hfsplus_ext_read_extent(struct inode *inode, u32 block) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfs_find_data fd; + int res; + + if (block >= hip->cached_start && + block < hip->cached_start + hip->cached_blocks) + return 0; + + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + res = __hfsplus_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + return res; +} + +/* Get a block at iblock for inode, possibly allocating if create */ +int hfsplus_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res = -EIO; + u32 ablock, dblock, mask; + int was_dirty = 0; + int shift; + + /* Convert inode block to disk allocation block */ + shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + ablock = iblock >> sbi->fs_shift; + + if (iblock >= hip->fs_blocks) { + if (iblock > hip->fs_blocks || !create) + return -EIO; + if (ablock >= hip->alloc_blocks) { + res = hfsplus_file_extend(inode); + if (res) + return res; + } + } else + create = 0; + + if (ablock < hip->first_blocks) { + dblock = hfsplus_ext_find_block(hip->first_extents, ablock); + goto done; + } + + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + + mutex_lock(&hip->extents_lock); + + /* + * hfsplus_ext_read_extent will write out a cached extent into + * the extents btree. In that case we may have to mark the inode + * dirty even for a pure read of an extent here. 
+ */ + was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY); + res = hfsplus_ext_read_extent(inode, ablock); + if (res) { + mutex_unlock(&hip->extents_lock); + return -EIO; + } + dblock = hfsplus_ext_find_block(hip->cached_extents, + ablock - hip->cached_start); + mutex_unlock(&hip->extents_lock); + +done: + dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", + inode->i_ino, (long long)iblock, dblock); + mask = (1 << sbi->fs_shift) - 1; + map_bh(bh_result, sb, + (dblock << sbi->fs_shift) + sbi->blockoffset + + (iblock & mask)); + if (create) { + set_buffer_new(bh_result); + hip->phys_size += sb->s_blocksize; + hip->fs_blocks++; + inode_add_bytes(inode, sb->s_blocksize); + } + if (create || was_dirty) + mark_inode_dirty(inode); + return 0; +} + +static void hfsplus_dump_extent(struct hfsplus_extent *extent) +{ + int i; + + dprint(DBG_EXTENT, " "); + for (i = 0; i < 8; i++) + dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + dprint(DBG_EXTENT, "\n"); +} + +static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, + u32 alloc_block, u32 block_count) +{ + u32 count, start; + int i; + + hfsplus_dump_extent(extent); + for (i = 0; i < 8; extent++, i++) { + count = be32_to_cpu(extent->block_count); + if (offset == count) { + start = be32_to_cpu(extent->start_block); + if (alloc_block != start + count) { + if (++i >= 8) + return -ENOSPC; + extent++; + extent->start_block = cpu_to_be32(alloc_block); + } else + block_count += count; + extent->block_count = cpu_to_be32(block_count); + return 0; + } else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +} + +static int hfsplus_free_extents(struct super_block *sb, + struct hfsplus_extent *extent, + u32 offset, u32 block_nr) +{ + u32 count, start; + int i; + + hfsplus_dump_extent(extent); + for (i = 0; i < 8; extent++, i++) { + count = be32_to_cpu(extent->block_count); + if (offset == count) + goto found; + else if (offset < count) + break; + offset -= count; + } + /* panic? 
*/ + return -EIO; +found: + for (;;) { + start = be32_to_cpu(extent->start_block); + if (count <= block_nr) { + hfsplus_block_free(sb, start, count); + extent->block_count = 0; + extent->start_block = 0; + block_nr -= count; + } else { + count -= block_nr; + hfsplus_block_free(sb, start + count, block_nr); + extent->block_count = cpu_to_be32(count); + block_nr = 0; + } + if (!block_nr || !i) + return 0; + i--; + extent--; + count = be32_to_cpu(extent->block_count); + } +} + +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type) +{ + struct hfs_find_data fd; + hfsplus_extent_rec ext_entry; + u32 total_blocks, blocks, start; + int res, i; + + total_blocks = be32_to_cpu(fork->total_blocks); + if (!total_blocks) + return 0; + + blocks = 0; + for (i = 0; i < 8; i++) + blocks += be32_to_cpu(fork->extents[i].block_count); + + res = hfsplus_free_extents(sb, fork->extents, blocks, blocks); + if (res) + return res; + if (total_blocks == blocks) + return 0; + + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + do { + res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, + total_blocks, type); + if (res) + break; + start = be32_to_cpu(fd.key->ext.start_block); + hfsplus_free_extents(sb, ext_entry, + total_blocks - start, + total_blocks); + hfs_brec_remove(&fd); + total_blocks = start; + } while (total_blocks > blocks); + hfs_find_exit(&fd); + + return res; +} + +int hfsplus_file_extend(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 start, len, goal; + int res; + + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { + /* extend alloc file */ + printk(KERN_ERR "hfs: extend alloc file! 
" + "(%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); + return -ENOSPC; + } + + mutex_lock(&hip->extents_lock); + if (hip->alloc_blocks == hip->first_blocks) + goal = hfsplus_ext_lastblock(hip->first_extents); + else { + res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); + if (res) + goto out; + goal = hfsplus_ext_lastblock(hip->cached_extents); + } + + len = hip->clump_blocks; + start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); + if (start >= sbi->total_blocks) { + start = hfsplus_block_allocate(sb, goal, 0, &len); + if (start >= goal) { + res = -ENOSPC; + goto out; + } + } + + dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + + if (hip->alloc_blocks <= hip->first_blocks) { + if (!hip->first_blocks) { + dprint(DBG_EXTENT, "first extents\n"); + /* no extents yet */ + hip->first_extents[0].start_block = cpu_to_be32(start); + hip->first_extents[0].block_count = cpu_to_be32(len); + res = 0; + } else { + /* try to append to extents in inode */ + res = hfsplus_add_extent(hip->first_extents, + hip->alloc_blocks, + start, len); + if (res == -ENOSPC) + goto insert_extent; + } + if (!res) { + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks += len; + } + } else { + res = hfsplus_add_extent(hip->cached_extents, + hip->alloc_blocks - hip->cached_start, + start, len); + if (!res) { + hfsplus_dump_extent(hip->cached_extents); + hip->extent_state |= HFSPLUS_EXT_DIRTY; + hip->cached_blocks += len; + } else if (res == -ENOSPC) + goto insert_extent; + } +out: + mutex_unlock(&hip->extents_lock); + if (!res) { + hip->alloc_blocks += len; + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); + } + return res; + +insert_extent: + dprint(DBG_EXTENT, "insert new extent\n"); + hfsplus_ext_write_extent_locked(inode); + + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_extents[0].start_block = cpu_to_be32(start); + hip->cached_extents[0].block_count = cpu_to_be32(len); + hfsplus_dump_extent(hip->cached_extents); + hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW; + hip->cached_start = hip->alloc_blocks; + hip->cached_blocks = len; + + res = 0; + goto out; +} + +void hfsplus_file_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfs_find_data fd; + u32 alloc_cnt, blk_cnt, start; + int res; + + dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, + inode->i_size); + + if (inode->i_size > hip->phys_size) { + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + u32 size = inode->i_size; + int res; + + res = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (res) + return; + res = pagecache_write_end(NULL, mapping, size, + 0, 0, page, fsdata); + if (res < 0) + return; + mark_inode_dirty(inode); + return; + } else if (inode->i_size == hip->phys_size) + return; + + blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> + HFSPLUS_SB(sb)->alloc_blksz_shift; + alloc_cnt = hip->alloc_blocks; + if (blk_cnt == alloc_cnt) + goto out; + + mutex_lock(&hip->extents_lock); + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + while (1) { + if (alloc_cnt == hip->first_blocks) { + hfsplus_free_extents(sb, hip->first_extents, + alloc_cnt, alloc_cnt - blk_cnt); + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks = blk_cnt; + break; + } + res = __hfsplus_ext_cache_extent(&fd, inode, 
alloc_cnt); + if (res) + break; + start = hip->cached_start; + hfsplus_free_extents(sb, hip->cached_extents, + alloc_cnt - start, alloc_cnt - blk_cnt); + hfsplus_dump_extent(hip->cached_extents); + if (blk_cnt > start) { + hip->extent_state |= HFSPLUS_EXT_DIRTY; + break; + } + alloc_cnt = start; + hip->cached_start = hip->cached_blocks = 0; + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + hfs_brec_remove(&fd); + } + hfs_find_exit(&fd); + mutex_unlock(&hip->extents_lock); + + hip->alloc_blocks = blk_cnt; +out: + hip->phys_size = inode->i_size; + hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); +} diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h new file mode 100644 index 00000000..4e7f64b7 --- /dev/null +++ b/fs/hfsplus/hfsplus_fs.h @@ -0,0 +1,463 @@ +/* + * linux/include/linux/hfsplus_fs.h + * + * Copyright (C) 1999 + * Brad Boyer (flar@pants.nu) + * (C) 2003 Ardis Technologies + * + */ + +#ifndef _LINUX_HFSPLUS_FS_H +#define _LINUX_HFSPLUS_FS_H + +#include +#include +#include +#include +#include "hfsplus_raw.h" + +#define DBG_BNODE_REFS 0x00000001 +#define DBG_BNODE_MOD 0x00000002 +#define DBG_CAT_MOD 0x00000004 +#define DBG_INODE 0x00000008 +#define DBG_SUPER 0x00000010 +#define DBG_EXTENT 0x00000020 +#define DBG_BITMAP 0x00000040 + +#if 0 +#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) +#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#endif +#define DBG_MASK (0) + +#define dprint(flg, fmt, args...) \ + if (flg & DBG_MASK) \ + printk(fmt , ## args) + +/* Runtime config options */ +#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' 
*/ + +#define HFSPLUS_TYPE_DATA 0x00 +#define HFSPLUS_TYPE_RSRC 0xFF + +typedef int (*btree_keycmp)(const hfsplus_btree_key *, + const hfsplus_btree_key *); + +#define NODE_HASH_SIZE 256 + +/* An HFS+ BTree held in memory */ +struct hfs_btree { + struct super_block *sb; + struct inode *inode; + btree_keycmp keycmp; + + u32 cnid; + u32 root; + u32 leaf_count; + u32 leaf_head; + u32 leaf_tail; + u32 node_count; + u32 free_nodes; + u32 attributes; + + unsigned int node_size; + unsigned int node_size_shift; + unsigned int max_key_len; + unsigned int depth; + + struct mutex tree_lock; + + unsigned int pages_per_bnode; + spinlock_t hash_lock; + struct hfs_bnode *node_hash[NODE_HASH_SIZE]; + int node_hash_cnt; +}; + +struct page; + +/* An HFS+ BTree node in memory */ +struct hfs_bnode { + struct hfs_btree *tree; + + u32 prev; + u32 this; + u32 next; + u32 parent; + + u16 num_recs; + u8 type; + u8 height; + + struct hfs_bnode *next_hash; + unsigned long flags; + wait_queue_head_t lock_wq; + atomic_t refcnt; + unsigned int page_offset; + struct page *page[0]; +}; + +#define HFS_BNODE_LOCK 0 +#define HFS_BNODE_ERROR 1 +#define HFS_BNODE_NEW 2 +#define HFS_BNODE_DIRTY 3 +#define HFS_BNODE_DELETED 4 + +/* + * HFS+ superblock info (built from Volume Header on disk) + */ + +struct hfsplus_vh; +struct hfs_btree; + +struct hfsplus_sb_info { + void *s_vhdr_buf; + struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; + struct hfsplus_vh *s_backup_vhdr; + struct hfs_btree *ext_tree; + struct hfs_btree *cat_tree; + struct hfs_btree *attr_tree; + struct inode *alloc_file; + struct inode *hidden_dir; + struct nls_table *nls; + + /* Runtime variables */ + u32 blockoffset; + sector_t part_start; + sector_t sect_count; + int fs_shift; + + /* immutable data from the volume header */ + u32 alloc_blksz; + int alloc_blksz_shift; + u32 total_blocks; + u32 data_clump_blocks, rsrc_clump_blocks; + + /* mutable data from the volume header, protected by alloc_mutex */ + u32 free_blocks; + struct mutex alloc_mutex; + + /* mutable data from the volume header, protected by vh_mutex */ + u32 next_cnid; + u32 file_count; + u32 folder_count; + struct mutex vh_mutex; + + /* Config options */ + u32 creator; + u32 type; + + umode_t umask; + uid_t uid; + gid_t gid; + + int part, session; + + unsigned long flags; +}; + +#define HFSPLUS_SB_WRITEBACKUP 0 +#define HFSPLUS_SB_NODECOMPOSE 1 +#define HFSPLUS_SB_FORCE 2 +#define HFSPLUS_SB_HFSX 3 +#define HFSPLUS_SB_CASEFOLD 4 +#define HFSPLUS_SB_NOBARRIER 5 + +static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +struct hfsplus_inode_info { + atomic_t opencnt; + + /* + * Extent allocation information, protected by extents_lock. + */ + u32 first_blocks; + u32 clump_blocks; + u32 alloc_blocks; + u32 cached_start; + u32 cached_blocks; + hfsplus_extent_rec first_extents; + hfsplus_extent_rec cached_extents; + unsigned int extent_state; + struct mutex extents_lock; + + /* + * Immutable data. + */ + struct inode *rsrc_inode; + __be32 create_date; + + /* + * Protected by sbi->vh_mutex. + */ + u32 linkid; + + /* + * Accessed using atomic bitops. + */ + unsigned long flags; + + /* + * Protected by i_mutex. 
+ */ + sector_t fs_blocks; + u8 userflags; /* BSD user file flags */ + struct list_head open_dir_list; + loff_t phys_size; + + struct inode vfs_inode; +}; + +#define HFSPLUS_EXT_DIRTY 0x0001 +#define HFSPLUS_EXT_NEW 0x0002 + +#define HFSPLUS_I_RSRC 0 /* represents a resource fork */ +#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ +#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ +#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ + +#define HFSPLUS_IS_RSRC(inode) \ + test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) + +static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) +{ + return list_entry(inode, struct hfsplus_inode_info, vfs_inode); +} + +/* + * Mark an inode dirty, and also mark the btree in which the + * specific type of metadata is stored. + * For data or metadata that gets written back by into the catalog btree + * by hfsplus_write_inode a plain mark_inode_dirty call is enough. + */ +static inline void hfsplus_mark_inode_dirty(struct inode *inode, + unsigned int flag) +{ + set_bit(flag, &HFSPLUS_I(inode)->flags); + mark_inode_dirty(inode); +} + +struct hfs_find_data { + /* filled by caller */ + hfsplus_btree_key *search_key; + hfsplus_btree_key *key; + /* filled by find */ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + /* filled by findrec */ + int record; + int keyoffset, keylength; + int entryoffset, entrylength; +}; + +struct hfsplus_readdir_data { + struct list_head list; + struct file *file; + struct hfsplus_cat_key key; +}; + +/* + * Find minimum acceptible I/O size for an hfsplus sb. + */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + +#define hfs_btree_open hfsplus_btree_open +#define hfs_btree_close hfsplus_btree_close +#define hfs_btree_write hfsplus_btree_write +#define hfs_bmap_alloc hfsplus_bmap_alloc +#define hfs_bmap_free hfsplus_bmap_free +#define hfs_bnode_read hfsplus_bnode_read +#define hfs_bnode_read_u16 hfsplus_bnode_read_u16 +#define hfs_bnode_read_u8 hfsplus_bnode_read_u8 +#define hfs_bnode_read_key hfsplus_bnode_read_key +#define hfs_bnode_write hfsplus_bnode_write +#define hfs_bnode_write_u16 hfsplus_bnode_write_u16 +#define hfs_bnode_clear hfsplus_bnode_clear +#define hfs_bnode_copy hfsplus_bnode_copy +#define hfs_bnode_move hfsplus_bnode_move +#define hfs_bnode_dump hfsplus_bnode_dump +#define hfs_bnode_unlink hfsplus_bnode_unlink +#define hfs_bnode_findhash hfsplus_bnode_findhash +#define hfs_bnode_find hfsplus_bnode_find +#define hfs_bnode_unhash hfsplus_bnode_unhash +#define hfs_bnode_free hfsplus_bnode_free +#define hfs_bnode_create hfsplus_bnode_create +#define hfs_bnode_get hfsplus_bnode_get +#define hfs_bnode_put hfsplus_bnode_put +#define hfs_brec_lenoff hfsplus_brec_lenoff +#define hfs_brec_keylen hfsplus_brec_keylen +#define hfs_brec_insert hfsplus_brec_insert +#define hfs_brec_remove hfsplus_brec_remove +#define hfs_find_init hfsplus_find_init +#define hfs_find_exit hfsplus_find_exit +#define __hfs_brec_find __hplusfs_brec_find +#define hfs_brec_find hfsplus_brec_find +#define hfs_brec_read hfsplus_brec_read +#define hfs_brec_goto hfsplus_brec_goto +#define hfs_part_find hfsplus_part_find + +/* + * definitions for ext2 flag ioctls (linux really needs a generic + * interface for this). 
+ */ + +/* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support + * chattr/lsattr */ +#define HFSPLUS_IOC_EXT2_GETFLAGS FS_IOC_GETFLAGS +#define HFSPLUS_IOC_EXT2_SETFLAGS FS_IOC_SETFLAGS + + +/* + * Functions in any *.c used in other files + */ + +/* bitmap.c */ +int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *); +int hfsplus_block_free(struct super_block *, u32, u32); + +/* btree.c */ +struct hfs_btree *hfs_btree_open(struct super_block *, u32); +void hfs_btree_close(struct hfs_btree *); +void hfs_btree_write(struct hfs_btree *); +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *); +void hfs_bmap_free(struct hfs_bnode *); + +/* bnode.c */ +void hfs_bnode_read(struct hfs_bnode *, void *, int, int); +u16 hfs_bnode_read_u16(struct hfs_bnode *, int); +u8 hfs_bnode_read_u8(struct hfs_bnode *, int); +void hfs_bnode_read_key(struct hfs_bnode *, void *, int); +void hfs_bnode_write(struct hfs_bnode *, void *, int, int); +void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); +void hfs_bnode_clear(struct hfs_bnode *, int, int); +void hfs_bnode_copy(struct hfs_bnode *, int, + struct hfs_bnode *, int, int); +void hfs_bnode_move(struct hfs_bnode *, int, int, int); +void hfs_bnode_dump(struct hfs_bnode *); +void hfs_bnode_unlink(struct hfs_bnode *); +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); +void hfs_bnode_unhash(struct hfs_bnode *); +void hfs_bnode_free(struct hfs_bnode *); +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32); +void hfs_bnode_get(struct hfs_bnode *); +void hfs_bnode_put(struct hfs_bnode *); + +/* brec.c */ +u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); +u16 hfs_brec_keylen(struct hfs_bnode *, u16); +int hfs_brec_insert(struct hfs_find_data *, void *, int); +int hfs_brec_remove(struct hfs_find_data *); + +/* bfind.c */ +int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); +void hfs_find_exit(struct hfs_find_data *); +int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); +int hfs_brec_find(struct hfs_find_data *); +int hfs_brec_read(struct hfs_find_data *, void *, int); +int hfs_brec_goto(struct hfs_find_data *, int); + +/* catalog.c */ +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +void hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *, u32, struct qstr *); +int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); +int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); +int hfsplus_delete_cat(u32, struct inode *, struct qstr *); +int hfsplus_rename_cat(u32, struct inode *, struct qstr *, + struct inode *, struct qstr *); +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); + +/* dir.c */ +extern const struct inode_operations hfsplus_dir_inode_operations; +extern const struct file_operations hfsplus_dir_operations; + +/* extents.c */ +int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); +void hfsplus_ext_write_extent(struct inode *); +int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); +int hfsplus_free_fork(struct super_block *, u32, + struct hfsplus_fork_raw *, int); +int hfsplus_file_extend(struct inode *); +void hfsplus_file_truncate(struct inode *); + +/* inode.c */ +extern const struct address_space_operations hfsplus_aops; +extern const struct address_space_operations 
hfsplus_btree_aops; +extern const struct dentry_operations hfsplus_dentry_operations; + +void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); +void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); +int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); +int hfsplus_cat_write_inode(struct inode *); +struct inode *hfsplus_new_inode(struct super_block *, int); +void hfsplus_delete_inode(struct inode *); +int hfsplus_file_fsync(struct file *file, int datasync); + +/* ioctl.c */ +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* options.c */ +int hfsplus_parse_options(char *, struct hfsplus_sb_info *); +int hfsplus_parse_options_remount(char *input, int *force); +void hfsplus_fill_defaults(struct hfsplus_sb_info *); +int hfsplus_show_options(struct seq_file *, struct vfsmount *); + +/* super.c */ +struct inode *hfsplus_iget(struct super_block *, unsigned long); +int hfsplus_sync_fs(struct super_block *sb, int wait); + +/* tables.c */ +extern u16 hfsplus_case_fold_table[]; +extern u16 hfsplus_decompose_table[]; +extern u16 hfsplus_compose_table[]; + +/* unicode.c */ +int hfsplus_strcasecmp(const struct hfsplus_unistr *, + const struct hfsplus_unistr *); +int hfsplus_strcmp(const struct hfsplus_unistr *, + const struct hfsplus_unistr *); +int hfsplus_uni2asc(struct super_block *, + const struct hfsplus_unistr *, char *, int *); +int hfsplus_asc2uni(struct super_block *, + struct hfsplus_unistr *, const char *, int); +int hfsplus_hash_dentry(const struct dentry *dentry, + const struct inode *inode, struct qstr *str); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +/* wrapper.c */ +int hfsplus_read_wrapper(struct super_block *); +int hfs_part_find(struct super_block *, sector_t *, sector_t *); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw); + +/* time macros */ +#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) +#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U)) + +/* compatibility */ +#define hfsp_mt2ut(t) (struct timespec){ .tv_sec = __hfsp_mt2ut(t) } +#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) +#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) + +#endif diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h new file mode 100644 index 00000000..927cdd6d --- /dev/null +++ b/fs/hfsplus/hfsplus_raw.h @@ -0,0 +1,337 @@ +/* + * linux/include/linux/hfsplus_raw.h + * + * Copyright (C) 1999 + * Brad Boyer (flar@pants.nu) + * (C) 2003 Ardis Technologies + * + * Format of structures on disk + * Information taken from Apple Technote #1150 (HFS Plus Volume Format) + * + */ + +#ifndef _LINUX_HFSPLUS_RAW_H +#define _LINUX_HFSPLUS_RAW_H + +#include + +/* Some constants */ +#define HFSPLUS_SECTOR_SIZE 512 +#define HFSPLUS_SECTOR_SHIFT 9 +#define HFSPLUS_VOLHEAD_SECTOR 2 +#define HFSPLUS_VOLHEAD_SIG 0x482b +#define HFSPLUS_VOLHEAD_SIGX 0x4858 +#define HFSPLUS_SUPER_MAGIC 0x482b +#define HFSPLUS_MIN_VERSION 4 +#define HFSPLUS_CURRENT_VERSION 5 + +#define HFSP_WRAP_MAGIC 0x4244 +#define HFSP_WRAP_ATTRIB_SLOCK 0x8000 
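As a brief aside on the constants just defined: the primary volume header lives HFSPLUS_VOLHEAD_SECTOR 512-byte sectors into the (possibly embedded) volume, and a backup copy lives two sectors before its end; hfsplus_sync_fs() later in this patch writes to exactly those locations. The helpers below are a hedged editorial sketch, not part of the patch, and their names are invented for illustration.

/*
 * Editorial sketch (not in the patch): sector of the primary volume
 * header and of its backup copy, for a volume of 'sect_count'
 * 512-byte sectors starting at 'part_start'.
 */
static inline sector_t hfsp_vhdr_sector(sector_t part_start)
{
	/* HFSPLUS_VOLHEAD_SECTOR == 2, i.e. 1024 bytes into the volume */
	return part_start + HFSPLUS_VOLHEAD_SECTOR;
}

static inline sector_t hfsp_backup_vhdr_sector(sector_t part_start,
					       sector_t sect_count)
{
	/* the backup header sits 1024 bytes before the end of the volume */
	return part_start + sect_count - 2;
}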
+#define HFSP_WRAP_ATTRIB_SPARED 0x0200 + +#define HFSP_WRAPOFF_SIG 0x00 +#define HFSP_WRAPOFF_ATTRIB 0x0A +#define HFSP_WRAPOFF_ABLKSIZE 0x14 +#define HFSP_WRAPOFF_ABLKSTART 0x1C +#define HFSP_WRAPOFF_EMBEDSIG 0x7C +#define HFSP_WRAPOFF_EMBEDEXT 0x7E + +#define HFSP_HIDDENDIR_NAME \ + "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" + +#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ +#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ + +#define HFSP_SYMLINK_TYPE 0x736c6e6b /* 'slnk' */ +#define HFSP_SYMLINK_CREATOR 0x72686170 /* 'rhap' */ + +#define HFSP_MOUNT_VERSION 0x482b4c78 /* 'H+Lx' */ + +/* Structures used on disk */ + +typedef __be32 hfsplus_cnid; +typedef __be16 hfsplus_unichr; + +/* A "string" as used in filenames, etc. */ +struct hfsplus_unistr { + __be16 length; + hfsplus_unichr unicode[255]; +} __packed; + +#define HFSPLUS_MAX_STRLEN 255 + +/* POSIX permissions */ +struct hfsplus_perm { + __be32 owner; + __be32 group; + u8 rootflags; + u8 userflags; + __be16 mode; + __be32 dev; +} __packed; + +#define HFSPLUS_FLG_NODUMP 0x01 +#define HFSPLUS_FLG_IMMUTABLE 0x02 +#define HFSPLUS_FLG_APPEND 0x04 + +/* A single contiguous area of a file */ +struct hfsplus_extent { + __be32 start_block; + __be32 block_count; +} __packed; +typedef struct hfsplus_extent hfsplus_extent_rec[8]; + +/* Information for a "Fork" in a file */ +struct hfsplus_fork_raw { + __be64 total_size; + __be32 clump_size; + __be32 total_blocks; + hfsplus_extent_rec extents; +} __packed; + +/* HFS+ Volume Header */ +struct hfsplus_vh { + __be16 signature; + __be16 version; + __be32 attributes; + __be32 last_mount_vers; + u32 reserved; + + __be32 create_date; + __be32 modify_date; + __be32 backup_date; + __be32 checked_date; + + __be32 file_count; + __be32 folder_count; + + __be32 blocksize; + __be32 total_blocks; + __be32 free_blocks; + + __be32 next_alloc; + __be32 rsrc_clump_sz; + __be32 data_clump_sz; + hfsplus_cnid next_cnid; + + __be32 write_count; + __be64 encodings_bmp; + + u8 finder_info[32]; + + struct hfsplus_fork_raw alloc_file; + struct hfsplus_fork_raw ext_file; + struct hfsplus_fork_raw cat_file; + struct hfsplus_fork_raw attr_file; + struct hfsplus_fork_raw start_file; +} __packed; + +/* HFS+ volume attributes */ +#define HFSPLUS_VOL_UNMNT (1 << 8) +#define HFSPLUS_VOL_SPARE_BLK (1 << 9) +#define HFSPLUS_VOL_NOCACHE (1 << 10) +#define HFSPLUS_VOL_INCNSTNT (1 << 11) +#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) +#define HFSPLUS_VOL_JOURNALED (1 << 13) +#define HFSPLUS_VOL_SOFTLOCK (1 << 15) + +/* HFS+ BTree node descriptor */ +struct hfs_bnode_desc { + __be32 next; + __be32 prev; + s8 type; + u8 height; + __be16 num_recs; + u16 reserved; +} __packed; + +/* HFS+ BTree node types */ +#define HFS_NODE_INDEX 0x00 +#define HFS_NODE_HEADER 0x01 +#define HFS_NODE_MAP 0x02 +#define HFS_NODE_LEAF 0xFF + +/* HFS+ BTree header */ +struct hfs_btree_header_rec { + __be16 depth; + __be32 root; + __be32 leaf_count; + __be32 leaf_head; + __be32 leaf_tail; + __be16 node_size; + __be16 max_key_len; + __be32 node_count; + __be32 free_nodes; + u16 reserved1; + __be32 clump_size; + u8 btree_type; + u8 key_type; + __be32 attributes; + u32 reserved3[16]; +} __packed; + +/* BTree attributes */ +#define HFS_TREE_BIGKEYS 2 +#define HFS_TREE_VARIDXKEYS 4 + +/* HFS+ BTree misc info */ +#define HFSPLUS_TREE_HEAD 0 +#define HFSPLUS_NODE_MXSZ 32768 + +/* Some special File ID numbers (stolen from hfs.h) */ +#define HFSPLUS_POR_CNID 1 /* Parent Of the Root */ +#define HFSPLUS_ROOT_CNID 2 /* ROOT directory */ 
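The hfsplus_extent_rec type defined above is simply an array of eight big-endian {start_block, block_count} pairs; a fork whose data does not fit in eight extents overflows into the extents B-tree. As a hedged illustration only (the real mapping is done via hfsplus_get_block() in extents.c, and the helper name here is invented), translating a block within a fork into an allocation block using one such record looks like this:

/*
 * Editorial sketch (not in the patch): map a file block to an
 * allocation block using one hfsplus_extent_rec.  Returns 0 when the
 * block is not covered by this record.
 */
static inline u32 hfsp_extent_rec_lookup(const hfsplus_extent_rec rec,
					 u32 file_block)
{
	u32 off = 0;
	int i;

	for (i = 0; i < 8; i++) {
		u32 count = be32_to_cpu(rec[i].block_count);

		if (file_block < off + count)
			return be32_to_cpu(rec[i].start_block) +
				(file_block - off);
		off += count;
	}
	return 0;
}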
+#define HFSPLUS_EXT_CNID 3 /* EXTents B-tree */ +#define HFSPLUS_CAT_CNID 4 /* CATalog B-tree */ +#define HFSPLUS_BAD_CNID 5 /* BAD blocks file */ +#define HFSPLUS_ALLOC_CNID 6 /* ALLOCation file */ +#define HFSPLUS_START_CNID 7 /* STARTup file */ +#define HFSPLUS_ATTR_CNID 8 /* ATTRibutes file */ +#define HFSPLUS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFSPLUS_FIRSTUSER_CNID 16 /* first available user id */ + +/* btree key type */ +#define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ +#define HFSPLUS_KEY_BINARY 0xBC /* case-sensitive */ + +/* HFS+ catalog entry key */ +struct hfsplus_cat_key { + __be16 key_len; + hfsplus_cnid parent; + struct hfsplus_unistr name; +} __packed; + +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) + +/* Structs from hfs.h */ +struct hfsp_point { + __be16 v; + __be16 h; +} __packed; + +struct hfsp_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + + +/* HFS directory info (stolen from hfs.h */ +struct DInfo { + struct hfsp_rect frRect; + __be16 frFlags; + struct hfsp_point frLocation; + __be16 frView; +} __packed; + +struct DXInfo { + struct hfsp_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +/* HFS+ folder data (part of an hfsplus_cat_entry) */ +struct hfsplus_cat_folder { + __be16 type; + __be16 flags; + __be32 valence; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct DInfo user_info; + struct DXInfo finder_info; + __be32 text_encoding; + u32 reserved; +} __packed; + +/* HFS file info (stolen from hfs.h) */ +struct FInfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfsp_point fdLocation; + __be16 fdFldr; +} __packed; + +struct FXInfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +/* HFS+ file data (part of a cat_entry) */ +struct hfsplus_cat_file { + __be16 type; + __be16 flags; + u32 reserved1; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct FInfo user_info; + struct FXInfo finder_info; + __be32 text_encoding; + u32 reserved2; + + struct hfsplus_fork_raw data_fork; + struct hfsplus_fork_raw rsrc_fork; +} __packed; + +/* File attribute bits */ +#define HFSPLUS_FILE_LOCKED 0x0001 +#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 + +/* HFS+ catalog thread (part of a cat_entry) */ +struct hfsplus_cat_thread { + __be16 type; + s16 reserved; + hfsplus_cnid parentID; + struct hfsplus_unistr nodeName; +} __packed; + +#define HFSPLUS_MIN_THREAD_SZ 10 + +/* A data record in the catalog tree */ +typedef union { + __be16 type; + struct hfsplus_cat_folder folder; + struct hfsplus_cat_file file; + struct hfsplus_cat_thread thread; +} __packed hfsplus_cat_entry; + +/* HFS+ catalog entry type */ +#define HFSPLUS_FOLDER 0x0001 +#define HFSPLUS_FILE 0x0002 +#define HFSPLUS_FOLDER_THREAD 0x0003 +#define HFSPLUS_FILE_THREAD 0x0004 + +/* HFS+ extents tree key */ +struct hfsplus_ext_key { + __be16 key_len; + u8 fork_type; + u8 pad; + hfsplus_cnid cnid; + __be32 start_block; +} __packed; + +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) + +/* HFS+ generic BTree key */ +typedef union { + __be16 key_len; + struct hfsplus_cat_key cat; + struct hfsplus_ext_key ext; +} __packed hfsplus_btree_key; + +#endif diff --git 
a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c new file mode 100644 index 00000000..b248a6cf --- /dev/null +++ b/fs/hfsplus/inode.c @@ -0,0 +1,617 @@ +/* + * linux/fs/hfsplus/inode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Inode handling routines + */ + +#include +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static int hfsplus_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, hfsplus_get_block); +} + +static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hfsplus_get_block, wbc); +} + +static int hfsplus_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hfsplus_get_block, + &HFSPLUS_I(mapping->host)->phys_size); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, hfsplus_get_block); +} + +static int hfsplus_releasepage(struct page *page, gfp_t mask) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct hfs_btree *tree; + struct hfs_bnode *node; + u32 nidx; + int i, res = 1; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + tree = HFSPLUS_SB(sb)->ext_tree; + break; + case HFSPLUS_CAT_CNID: + tree = HFSPLUS_SB(sb)->cat_tree; + break; + case HFSPLUS_ATTR_CNID: + tree = HFSPLUS_SB(sb)->attr_tree; + break; + default: + BUG(); + return 0; + } + if (!tree) + return 0; + if (tree->node_size >= PAGE_CACHE_SIZE) { + nidx = page->index >> + (tree->node_size_shift - PAGE_CACHE_SHIFT); + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, nidx); + if (!node) + ; + else if (atomic_read(&node->refcnt)) + res = 0; + if (res && node) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } + spin_unlock(&tree->hash_lock); + } else { + nidx = page->index << + (PAGE_CACHE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + spin_lock(&tree->hash_lock); + do { + node = hfs_bnode_findhash(tree, nidx++); + if (!node) + continue; + if (atomic_read(&node->refcnt)) { + res = 0; + break; + } + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } while (--i && nidx < tree->node_count); + spin_unlock(&tree->hash_lock); + } + return res ? try_to_free_buffers(page) : 0; +} + +static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, hfsplus_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +static int hfsplus_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hfsplus_get_block); +} + +const struct address_space_operations hfsplus_btree_aops = { + .readpage = hfsplus_readpage, + .writepage = hfsplus_writepage, + .write_begin = hfsplus_write_begin, + .write_end = generic_write_end, + .bmap = hfsplus_bmap, + .releasepage = hfsplus_releasepage, +}; + +const struct address_space_operations hfsplus_aops = { + .readpage = hfsplus_readpage, + .writepage = hfsplus_writepage, + .write_begin = hfsplus_write_begin, + .write_end = generic_write_end, + .bmap = hfsplus_bmap, + .direct_IO = hfsplus_direct_IO, + .writepages = hfsplus_writepages, +}; + +const struct dentry_operations hfsplus_dentry_operations = { + .d_hash = hfsplus_hash_dentry, + .d_compare = hfsplus_compare_dentry, +}; + +static struct dentry *hfsplus_file_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct hfs_find_data fd; + struct super_block *sb = dir->i_sb; + struct inode *inode = NULL; + struct hfsplus_inode_info *hip; + int err; + + if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) + goto out; + + inode = HFSPLUS_I(dir)->rsrc_inode; + if (inode) + goto out; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + hip = HFSPLUS_I(inode); + inode->i_ino = dir->i_ino; + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + hip->extent_state = 0; + hip->flags = 0; + set_bit(HFSPLUS_I_RSRC, &hip->flags); + + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfsplus_find_cat(sb, dir->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + if (err) { + iput(inode); + return ERR_PTR(err); + } + hip->rsrc_inode = dir; + HFSPLUS_I(dir)->rsrc_inode = inode; + igrab(dir); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want resource fork inodes in the regular inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. + */ + hlist_add_fake(&inode->i_hash); + + mark_inode_dirty(inode); +out: + d_add(dentry, inode); + return NULL; +} + +static void hfsplus_get_perms(struct inode *inode, + struct hfsplus_perm *perms, int dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + u16 mode; + + mode = be16_to_cpu(perms->mode); + + inode->i_uid = be32_to_cpu(perms->owner); + if (!inode->i_uid && !mode) + inode->i_uid = sbi->uid; + + inode->i_gid = be32_to_cpu(perms->group); + if (!inode->i_gid && !mode) + inode->i_gid = sbi->gid; + + if (dir) { + mode = mode ? 
(mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); + mode |= S_IFDIR; + } else if (!mode) + mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); + inode->i_mode = mode; + + HFSPLUS_I(inode)->userflags = perms->userflags; + if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (perms->rootflags & HFSPLUS_FLG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; +} + +static int hfsplus_file_open(struct inode *inode, struct file *file) +{ + if (HFSPLUS_IS_RSRC(inode)) + inode = HFSPLUS_I(inode)->rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; + atomic_inc(&HFSPLUS_I(inode)->opencnt); + return 0; +} + +static int hfsplus_file_release(struct inode *inode, struct file *file) +{ + struct super_block *sb = inode->i_sb; + + if (HFSPLUS_IS_RSRC(inode)) + inode = HFSPLUS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { + mutex_lock(&inode->i_mutex); + hfsplus_file_truncate(inode); + if (inode->i_flags & S_DEAD) { + hfsplus_delete_cat(inode->i_ino, + HFSPLUS_SB(sb)->hidden_dir, NULL); + hfsplus_delete_inode(inode); + } + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +int hfsplus_file_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + int error = 0, error2; + + /* + * Sync inode metadata into the catalog and extent trees. + */ + sync_inode_metadata(inode, 1); + + /* + * And explicitly write out the btrees. 
+ */ + if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + + if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { + error2 = + filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + } + + if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + } + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + return error; +} + +static const struct inode_operations hfsplus_file_inode_operations = { + .lookup = hfsplus_file_lookup, + .truncate = hfsplus_file_truncate, + .setattr = hfsplus_setattr, + .setxattr = hfsplus_setxattr, + .getxattr = hfsplus_getxattr, + .listxattr = hfsplus_listxattr, +}; + +static const struct file_operations hfsplus_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .fsync = hfsplus_file_fsync, + .open = hfsplus_file_open, + .release = hfsplus_file_release, + .unlocked_ioctl = hfsplus_ioctl, +}; + +struct inode *hfsplus_new_inode(struct super_block *sb, int mode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct inode *inode = new_inode(sb); + struct hfsplus_inode_info *hip; + + if (!inode) + return NULL; + + inode->i_ino = sbi->next_cnid++; + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_nlink = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + hip = HFSPLUS_I(inode); + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + atomic_set(&hip->opencnt, 0); + hip->extent_state = 0; + hip->flags = 0; + memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->alloc_blocks = 0; + hip->first_blocks = 0; + hip->cached_start = 0; + hip->cached_blocks = 0; + hip->phys_size = 0; + hip->fs_blocks = 0; + hip->rsrc_inode = NULL; + if (S_ISDIR(inode->i_mode)) { + inode->i_size = 2; + sbi->folder_count++; + inode->i_op = &hfsplus_dir_inode_operations; + inode->i_fop = &hfsplus_dir_operations; + } else if (S_ISREG(inode->i_mode)) { + sbi->file_count++; + inode->i_op = &hfsplus_file_inode_operations; + inode->i_fop = &hfsplus_file_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + hip->clump_blocks = sbi->data_clump_blocks; + } else if (S_ISLNK(inode->i_mode)) { + sbi->file_count++; + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + hip->clump_blocks = 1; + } else + sbi->file_count++; + insert_inode_hash(inode); + mark_inode_dirty(inode); + sb->s_dirt = 1; + + return inode; +} + +void hfsplus_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (S_ISDIR(inode->i_mode)) { + HFSPLUS_SB(sb)->folder_count--; + sb->s_dirt = 1; + return; + } + HFSPLUS_SB(sb)->file_count--; + if (S_ISREG(inode->i_mode)) { + if (!inode->i_nlink) { + inode->i_size = 0; + hfsplus_file_truncate(inode); + } + } else if (S_ISLNK(inode->i_mode)) { + inode->i_size = 0; + hfsplus_file_truncate(inode); + } + sb->s_dirt = 1; +} + +void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = 
HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int i; + + memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); + for (count = 0, i = 0; i < 8; i++) + count += be32_to_cpu(fork->extents[i].block_count); + hip->first_blocks = count; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_start = 0; + hip->cached_blocks = 0; + + hip->alloc_blocks = be32_to_cpu(fork->total_blocks); + hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); + hip->fs_blocks = + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hip->clump_blocks = + be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; + if (!hip->clump_blocks) { + hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? + sbi->rsrc_clump_blocks : + sbi->data_clump_blocks; + } +} + +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork) +{ + memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, + sizeof(hfsplus_extent_rec)); + fork->total_size = cpu_to_be64(inode->i_size); + fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); +} + +int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) +{ + hfsplus_cat_entry entry; + int res = 0; + u16 type; + + type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); + + HFSPLUS_I(inode)->linkid = 0; + if (type == HFSPLUS_FOLDER) { + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) + /* panic? */; + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_get_perms(inode, &folder->permissions, 1); + inode->i_nlink = 1; + inode->i_size = 2 + be32_to_cpu(folder->valence); + inode->i_atime = hfsp_mt2ut(folder->access_date); + inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); + inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + HFSPLUS_I(inode)->create_date = folder->create_date; + HFSPLUS_I(inode)->fs_blocks = 0; + inode->i_op = &hfsplus_dir_inode_operations; + inode->i_fop = &hfsplus_dir_operations; + } else if (type == HFSPLUS_FILE) { + struct hfsplus_cat_file *file = &entry.file; + + if (fd->entrylength < sizeof(struct hfsplus_cat_file)) + /* panic? */; + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_file)); + + hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 
+ &file->rsrc_fork : &file->data_fork); + hfsplus_get_perms(inode, &file->permissions, 0); + inode->i_nlink = 1; + if (S_ISREG(inode->i_mode)) { + if (file->permissions.dev) + inode->i_nlink = + be32_to_cpu(file->permissions.dev); + inode->i_op = &hfsplus_file_inode_operations; + inode->i_fop = &hfsplus_file_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + } else { + init_special_inode(inode, inode->i_mode, + be32_to_cpu(file->permissions.dev)); + } + inode->i_atime = hfsp_mt2ut(file->access_date); + inode->i_mtime = hfsp_mt2ut(file->content_mod_date); + inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + HFSPLUS_I(inode)->create_date = file->create_date; + } else { + printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); + res = -EIO; + } + return res; +} + +int hfsplus_cat_write_inode(struct inode *inode) +{ + struct inode *main_inode = inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + + if (HFSPLUS_IS_RSRC(inode)) + main_inode = HFSPLUS_I(inode)->rsrc_inode; + + if (!main_inode->i_nlink) + return 0; + + if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) + /* panic? */ + return -EIO; + + if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) + /* panic? */ + goto out; + + if (S_ISDIR(main_inode->i_mode)) { + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) + /* panic? */; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + /* simple node checks? */ + hfsplus_cat_set_perms(inode, &folder->permissions); + folder->access_date = hfsp_ut2mt(inode->i_atime); + folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); + folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + folder->valence = cpu_to_be32(inode->i_size - 2); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + } else if (HFSPLUS_IS_RSRC(inode)) { + struct hfsplus_cat_file *file = &entry.file; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->rsrc_fork); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } else { + struct hfsplus_cat_file *file = &entry.file; + + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) + /* panic? 
*/; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->data_fork); + hfsplus_cat_set_perms(inode, &file->permissions); + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); + else + file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); + file->access_date = hfsp_ut2mt(inode->i_atime); + file->content_mod_date = hfsp_ut2mt(inode->i_mtime); + file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } + + set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); +out: + hfs_find_exit(&fd); + return 0; +} diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c new file mode 100644 index 00000000..fbaa6690 --- /dev/null +++ b/fs/hfsplus/ioctl.c @@ -0,0 +1,221 @@ +/* + * linux/fs/hfsplus/ioctl.c + * + * Copyright (C) 2003 + * Ethan Benson + * partially derived from linux/fs/ext2/ioctl.c + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * hfsplus ioctls + */ + +#include +#include +#include +#include +#include +#include +#include "hfsplus_fs.h" + +static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags = 0; + + if (inode->i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (inode->i_flags & S_APPEND) + flags |= FS_APPEND_FL; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + flags |= FS_NODUMP_FL; + + return put_user(flags, user_flags); +} + +static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags; + int err = 0; + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto out_drop_write; + } + + if (get_user(flags, user_flags)) { + err = -EFAULT; + goto out_drop_write; + } + + mutex_lock(&inode->i_mutex); + + if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || + inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; + } + } + + /* don't silently ignore unsupported ext2 flags */ + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto out_unlock_inode; + } + + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + + if (flags & FS_NODUMP_FL) + hip->userflags |= HFSPLUS_FLG_NODUMP; + else + hip->userflags &= ~HFSPLUS_FLG_NODUMP; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +out_unlock_inode: + mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out: + return err; +} + +long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + + switch (cmd) { + case HFSPLUS_IOC_EXT2_GETFLAGS: + return hfsplus_ioctl_getflags(file, argp); + case HFSPLUS_IOC_EXT2_SETFLAGS: + return hfsplus_ioctl_setflags(file, argp); + default: + return -ENOTTY; + } +} + +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void 
*value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + struct hfsplus_cat_file *file; + int res; + + if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + file = &entry.file; + + if (!strcmp(name, "hfs.type")) { + if (size == 4) + memcpy(&file->user_info.fdType, value, 4); + else + res = -ERANGE; + } else if (!strcmp(name, "hfs.creator")) { + if (size == 4) + memcpy(&file->user_info.fdCreator, value, 4); + else + res = -ERANGE; + } else + res = -EOPNOTSUPP; + if (!res) { + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } +out: + hfs_find_exit(&fd); + return res; +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + struct hfsplus_cat_file *file; + ssize_t res = 0; + + if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (size) { + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } + file = &entry.file; + + if (!strcmp(name, "hfs.type")) { + if (size >= 4) { + memcpy(value, &file->user_info.fdType, 4); + res = 4; + } else + res = size ? -ERANGE : 4; + } else if (!strcmp(name, "hfs.creator")) { + if (size >= 4) { + memcpy(value, &file->user_info.fdCreator, 4); + res = 4; + } else + res = size ? 
-ERANGE : 4; + } else + res = -EOPNOTSUPP; +out: + if (size) + hfs_find_exit(&fd); + return res; +} + +#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + + if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!buffer || !size) + return HFSPLUS_ATTRLIST_SIZE; + if (size < HFSPLUS_ATTRLIST_SIZE) + return -ERANGE; + strcpy(buffer, "hfs.type"); + strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); + + return HFSPLUS_ATTRLIST_SIZE; +} diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c new file mode 100644 index 00000000..bb62a588 --- /dev/null +++ b/fs/hfsplus/options.c @@ -0,0 +1,230 @@ +/* + * linux/fs/hfsplus/options.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Option parsing + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hfsplus_fs.h" + +enum { + opt_creator, opt_type, + opt_umask, opt_uid, opt_gid, + opt_part, opt_session, opt_nls, + opt_nodecompose, opt_decompose, + opt_barrier, opt_nobarrier, + opt_force, opt_err +}; + +static const match_table_t tokens = { + { opt_creator, "creator=%s" }, + { opt_type, "type=%s" }, + { opt_umask, "umask=%o" }, + { opt_uid, "uid=%u" }, + { opt_gid, "gid=%u" }, + { opt_part, "part=%u" }, + { opt_session, "session=%u" }, + { opt_nls, "nls=%s" }, + { opt_decompose, "decompose" }, + { opt_nodecompose, "nodecompose" }, + { opt_barrier, "barrier" }, + { opt_nobarrier, "nobarrier" }, + { opt_force, "force" }, + { opt_err, NULL } +}; + +/* Initialize an options object to reasonable defaults */ +void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) +{ + if (!opts) + return; + + opts->creator = HFSPLUS_DEF_CR_TYPE; + opts->type = HFSPLUS_DEF_CR_TYPE; + opts->umask = current_umask(); + opts->uid = current_uid(); + opts->gid = current_gid(); + opts->part = -1; + opts->session = -1; +} + +/* convert a "four byte character" to a 32 bit int with error checks */ +static inline int match_fourchar(substring_t *arg, u32 *result) +{ + if (arg->to - arg->from != 4) + return -EINVAL; + memcpy(result, arg->from, 4); + return 0; +} + +int hfsplus_parse_options_remount(char *input, int *force) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!input) + return 0; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_force: + *force = 1; + break; + default: + break; + } + } + + return 1; +} + +/* Parse options from mount. 
Returns 0 on failure */ +/* input is the options passed to mount() as a string */ +int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int tmp, token; + + if (!input) + goto done; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_creator: + if (match_fourchar(&args[0], &sbi->creator)) { + printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + return 0; + } + break; + case opt_type: + if (match_fourchar(&args[0], &sbi->type)) { + printk(KERN_ERR "hfs: type requires a 4 character value\n"); + return 0; + } + break; + case opt_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: umask requires a value\n"); + return 0; + } + sbi->umask = (umode_t)tmp; + break; + case opt_uid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: uid requires an argument\n"); + return 0; + } + sbi->uid = (uid_t)tmp; + break; + case opt_gid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: gid requires an argument\n"); + return 0; + } + sbi->gid = (gid_t)tmp; + break; + case opt_part: + if (match_int(&args[0], &sbi->part)) { + printk(KERN_ERR "hfs: part requires an argument\n"); + return 0; + } + break; + case opt_session: + if (match_int(&args[0], &sbi->session)) { + printk(KERN_ERR "hfs: session requires an argument\n"); + return 0; + } + break; + case opt_nls: + if (sbi->nls) { + printk(KERN_ERR "hfs: unable to change nls mapping\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + sbi->nls = load_nls(p); + if (!sbi->nls) { + printk(KERN_ERR "hfs: unable to load " + "nls mapping \"%s\"\n", + p); + kfree(p); + return 0; + } + kfree(p); + break; + case opt_decompose: + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_nodecompose: + set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_barrier: + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_nobarrier: + set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_force: + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); + break; + default: + return 0; + } + } + +done: + if (!sbi->nls) { + /* try utf8 first, as this is the old default behaviour */ + sbi->nls = load_nls("utf8"); + if (!sbi->nls) + sbi->nls = load_nls_default(); + if (!sbi->nls) + return 0; + } + + return 1; +} + +int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); + + if (sbi->creator != HFSPLUS_DEF_CR_TYPE) + seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + if (sbi->type != HFSPLUS_DEF_CR_TYPE) + seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, + sbi->uid, sbi->gid); + if (sbi->part >= 0) + seq_printf(seq, ",part=%u", sbi->part); + if (sbi->session >= 0) + seq_printf(seq, ",session=%u", sbi->session); + if (sbi->nls) + seq_printf(seq, ",nls=%s", sbi->nls->charset); + if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) + seq_printf(seq, ",nodecompose"); + if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + seq_printf(seq, ",nobarrier"); + return 0; +} diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c new file mode 100644 index 00000000..eb355d81 --- /dev/null +++ b/fs/hfsplus/part_tbl.c @@ -0,0 +1,157 @@ +/* + * linux/fs/hfsplus/part_tbl.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * This file may be distributed under the terms of + * the GNU General Public License. 
+ * + * Original code to handle the new style Mac partition table based on + * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). + * + * In function preconditions the term "valid" applied to a pointer to + * a structure means that the pointer is non-NULL and the structure it + * points to has all fields initialized to consistent values. + * + */ + +#include +#include "hfsplus_fs.h" + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +/* + * The new style Mac partition map + * + * For each partition on the media there is a physical block (512-byte + * block) containing one of these structures. These blocks are + * contiguous starting at block 1. + */ +struct new_pmap { + __be16 pmSig; /* signature */ + __be16 reSigPad; /* padding */ + __be32 pmMapBlkCnt; /* partition blocks count */ + __be32 pmPyPartStart; /* physical block start of partition */ + __be32 pmPartBlkCnt; /* physical block count of partition */ + u8 pmPartName[32]; /* (null terminated?) string + giving the name of this + partition */ + u8 pmPartType[32]; /* (null terminated?) string + giving the type of this + partition */ + /* a bunch more stuff we don't need */ +} __packed; + +/* + * The old style Mac partition map + * + * The partition map consists for a 2-byte signature followed by an + * array of these structures. The map is terminated with an all-zero + * one of these. + */ +struct old_pmap { + __be16 pdSig; /* Signature bytes */ + struct old_pmap_entry { + __be32 pdStart; + __be32 pdSize; + __be32 pdFSID; + } pdEntry[42]; +} __packed; + +static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, + sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int i; + + for (i = 0; i < 42; i++) { + struct old_pmap_entry *p = &pm->pdEntry[i]; + + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + return 0; + } + } + + return -ENOENT; +} + +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); + int res; + int i = 0; + + do { + if (!memcmp(pm->pmPartType, "Apple_HFS", 9) && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + return 0; + } + + if (++i >= size) + return -ENOENT; + + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } + } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); + + return -ENOENT; +} + +/* + * Parse the partition map looking for the start and length of a + * HFS/HFS+ partition. 
+ */ +int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size) +{ + void *buf, *data; + int res; + + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); + if (res) + goto out; + + switch (be16_to_cpu(*((__be16 *)data))) { + case HFS_OLD_PMAP_MAGIC: + res = hfs_parse_old_pmap(sb, data, part_start, part_size); + break; + case HFS_NEW_PMAP_MAGIC: + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); + break; + default: + res = -ENOENT; + break; + } +out: + kfree(buf); + return res; +} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c new file mode 100644 index 00000000..c3a76fd2 --- /dev/null +++ b/fs/hfsplus/super.c @@ -0,0 +1,593 @@ +/* + * linux/fs/hfsplus/super.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct inode *hfsplus_alloc_inode(struct super_block *sb); +static void hfsplus_destroy_inode(struct inode *inode); + +#include "hfsplus_fs.h" + +static int hfsplus_system_read_inode(struct inode *inode) +{ + struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + hfsplus_inode_read_fork(inode, &vhdr->ext_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + case HFSPLUS_CAT_CNID: + hfsplus_inode_read_fork(inode, &vhdr->cat_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + case HFSPLUS_ALLOC_CNID: + hfsplus_inode_read_fork(inode, &vhdr->alloc_file); + inode->i_mapping->a_ops = &hfsplus_aops; + break; + case HFSPLUS_START_CNID: + hfsplus_inode_read_fork(inode, &vhdr->start_file); + break; + case HFSPLUS_ATTR_CNID: + hfsplus_inode_read_fork(inode, &vhdr->attr_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + default: + return -EIO; + } + + return 0; +} + +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +{ + struct hfs_find_data fd; + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + mutex_init(&HFSPLUS_I(inode)->extents_lock); + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->extent_state = 0; + HFSPLUS_I(inode)->rsrc_inode = NULL; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) { + hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } else { + err = hfsplus_system_read_inode(inode); + } + + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + +static int hfsplus_system_write_inode(struct inode *inode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + struct hfsplus_fork_raw *fork; + struct hfs_btree *tree = NULL; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + fork = &vhdr->ext_file; + tree = sbi->ext_tree; + break; + case HFSPLUS_CAT_CNID: + fork = &vhdr->cat_file; + tree = sbi->cat_tree; + break; + case HFSPLUS_ALLOC_CNID: + fork = &vhdr->alloc_file; + break; + case HFSPLUS_START_CNID: + fork = &vhdr->start_file; + break; + case HFSPLUS_ATTR_CNID: + fork = 
&vhdr->attr_file; + tree = sbi->attr_tree; + default: + return -EIO; + } + + if (fork->total_size != cpu_to_be64(inode->i_size)) { + set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); + inode->i_sb->s_dirt = 1; + } + hfsplus_inode_write_fork(inode, fork); + if (tree) + hfs_btree_write(tree); + return 0; +} + +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + + hfsplus_ext_write_extent(inode); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) + return hfsplus_cat_write_inode(inode); + else + return hfsplus_system_write_inode(inode); +} + +static void hfsplus_evict_inode(struct inode *inode) +{ + dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (HFSPLUS_IS_RSRC(inode)) { + HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFSPLUS_I(inode)->rsrc_inode); + } +} + +int hfsplus_sync_fs(struct super_block *sb, int wait) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + int write_backup = 0; + int error, error2; + + if (!wait) + return 0; + + dprint(DBG_SUPER, "hfsplus_write_super\n"); + + sb->s_dirt = 0; + + /* + * Explicitly write out the special metadata inodes. + * + * While these special inodes are marked as hashed and written + * out peridocically by the flusher threads we redirty them + * during writeout of normal inodes, and thus the life lock + * prevents us from getting the latest state to disk. + */ + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + + mutex_lock(&sbi->vh_mutex); + mutex_lock(&sbi->alloc_mutex); + vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); + vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); + vhdr->folder_count = cpu_to_be32(sbi->folder_count); + vhdr->file_count = cpu_to_be32(sbi->file_count); + + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { + memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); + write_backup = 1; + } + + error2 = hfsplus_submit_bio(sb, + sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, NULL, WRITE_SYNC); + if (!error) + error = error2; + if (!write_backup) + goto out; + + error2 = hfsplus_submit_bio(sb, + sbi->part_start + sbi->sect_count - 2, + sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); + if (!error) + error2 = error; +out: + mutex_unlock(&sbi->alloc_mutex); + mutex_unlock(&sbi->vh_mutex); + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + + return error; +} + +static void hfsplus_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + hfsplus_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static void hfsplus_put_super(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + + dprint(DBG_SUPER, "hfsplus_put_super\n"); + + if (!sb->s_fs_info) + return; + + if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { + struct hfsplus_vh *vhdr = sbi->s_vhdr; + + vhdr->modify_date = hfsp_now2mt(); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); + + hfsplus_sync_fs(sb, 1); + } + + hfs_btree_close(sbi->cat_tree); + hfs_btree_close(sbi->ext_tree); + iput(sbi->alloc_file); + 
iput(sbi->hidden_dir); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); + unload_nls(sbi->nls); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = HFSPLUS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->total_blocks << sbi->fs_shift; + buf->f_bfree = sbi->free_blocks << sbi->fs_shift; + buf->f_bavail = buf->f_bfree; + buf->f_files = 0xFFFFFFFF; + buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = HFSPLUS_MAX_STRLEN; + + return 0; +} + +static int hfsplus_remount(struct super_block *sb, int *flags, char *data) +{ + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (!(*flags & MS_RDONLY)) { + struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; + int force = 0; + + if (!hfsplus_parse_options_remount(data, &force)) + return -EINVAL; + + if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { + printk(KERN_WARNING "hfs: filesystem was " + "not cleanly unmounted, " + "running fsck.hfsplus is recommended. " + "leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (force) { + /* nothing */ + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + printk(KERN_WARNING "hfs: filesystem is marked locked, " + "leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + printk(KERN_WARNING "hfs: filesystem is " + "marked journaled, " + "leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } + } + return 0; +} + +static const struct super_operations hfsplus_sops = { + .alloc_inode = hfsplus_alloc_inode, + .destroy_inode = hfsplus_destroy_inode, + .write_inode = hfsplus_write_inode, + .evict_inode = hfsplus_evict_inode, + .put_super = hfsplus_put_super, + .write_super = hfsplus_write_super, + .sync_fs = hfsplus_sync_fs, + .statfs = hfsplus_statfs, + .remount_fs = hfsplus_remount, + .show_options = hfsplus_show_options, +}; + +static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) +{ + struct hfsplus_vh *vhdr; + struct hfsplus_sb_info *sbi; + hfsplus_cat_entry entry; + struct hfs_find_data fd; + struct inode *root, *inode; + struct qstr str; + struct nls_table *nls = NULL; + int err; + + err = -EINVAL; + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out; + + sb->s_fs_info = sbi; + mutex_init(&sbi->alloc_mutex); + mutex_init(&sbi->vh_mutex); + hfsplus_fill_defaults(sbi); + + err = -EINVAL; + if (!hfsplus_parse_options(data, sbi)) { + printk(KERN_ERR "hfs: unable to parse mount options\n"); + goto out_unload_nls; + } + + /* temporarily use utf8 to correctly find the hidden dir below */ + nls = sbi->nls; + sbi->nls = load_nls("utf8"); + if (!sbi->nls) { + printk(KERN_ERR "hfs: unable to load nls for utf8\n"); + goto out_unload_nls; + } + + /* Grab the volume header */ + if (hfsplus_read_wrapper(sb)) { + if (!silent) + printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); + goto out_unload_nls; + } + vhdr = sbi->s_vhdr; + + /* Copy parts of the volume header into the superblock */ + sb->s_magic = HFSPLUS_VOLHEAD_SIG; + if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || + be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { + 
printk(KERN_ERR "hfs: wrong filesystem version\n"); + goto out_free_vhdr; + } + sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); + sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); + sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); + sbi->file_count = be32_to_cpu(vhdr->file_count); + sbi->folder_count = be32_to_cpu(vhdr->folder_count); + sbi->data_clump_blocks = + be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->data_clump_blocks) + sbi->data_clump_blocks = 1; + sbi->rsrc_clump_blocks = + be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->rsrc_clump_blocks) + sbi->rsrc_clump_blocks = 1; + + /* Set up operations so we can load metadata */ + sb->s_op = &hfsplus_sops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { + printk(KERN_WARNING "hfs: Filesystem was " + "not cleanly unmounted, " + "running fsck.hfsplus is recommended. " + "mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { + /* nothing */ + } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING "hfs: write access to " + "a journaled filesystem is not supported, " + "use the force option at your own risk, " + "mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + + /* Load metadata objects (B*Trees) */ + sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); + if (!sbi->ext_tree) { + printk(KERN_ERR "hfs: failed to load extents file\n"); + goto out_free_vhdr; + } + sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); + if (!sbi->cat_tree) { + printk(KERN_ERR "hfs: failed to load catalog file\n"); + goto out_close_ext_tree; + } + + inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); + if (IS_ERR(inode)) { + printk(KERN_ERR "hfs: failed to load allocation file\n"); + err = PTR_ERR(inode); + goto out_close_cat_tree; + } + sbi->alloc_file = inode; + + /* Load the root directory */ + root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); + if (IS_ERR(root)) { + printk(KERN_ERR "hfs: failed to load root directory\n"); + err = PTR_ERR(root); + goto out_put_alloc_file; + } + + str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; + str.name = HFSP_HIDDENDIR_NAME; + hfs_find_init(sbi->cat_tree, &fd); + hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { + hfs_find_exit(&fd); + if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) + goto out_put_root; + inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_put_root; + } + sbi->hidden_dir = inode; + } else + hfs_find_exit(&fd); + + if (!(sb->s_flags & MS_RDONLY)) { + /* + * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused + * all three are registered with Apple for our use + */ + vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); + vhdr->modify_date = hfsp_now2mt(); + be32_add_cpu(&vhdr->write_count, 1); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); + hfsplus_sync_fs(sb, 1); + + if (!sbi->hidden_dir) { + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, + sbi->hidden_dir); + mutex_unlock(&sbi->vh_mutex); + + 
hfsplus_mark_inode_dirty(sbi->hidden_dir, + HFSPLUS_I_CAT_DIRTY); + } + } + + sb->s_d_op = &hfsplus_dentry_operations; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_put_hidden_dir; + } + + unload_nls(sbi->nls); + sbi->nls = nls; + return 0; + +out_put_hidden_dir: + iput(sbi->hidden_dir); +out_put_root: + iput(root); +out_put_alloc_file: + iput(sbi->alloc_file); +out_close_cat_tree: + hfs_btree_close(sbi->cat_tree); +out_close_ext_tree: + hfs_btree_close(sbi->ext_tree); +out_free_vhdr: + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); +out_unload_nls: + unload_nls(sbi->nls); + unload_nls(nls); + kfree(sbi); +out: + return err; +} + +MODULE_AUTHOR("Brad Boyer"); +MODULE_DESCRIPTION("Extended Macintosh Filesystem"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *hfsplus_inode_cachep; + +static struct inode *hfsplus_alloc_inode(struct super_block *sb) +{ + struct hfsplus_inode_info *i; + + i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL); + return i ? &i->vfs_inode : NULL; +} + +static void hfsplus_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); +} + +static void hfsplus_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfsplus_i_callback); +} + +#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) + +static struct dentry *hfsplus_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); +} + +static struct file_system_type hfsplus_fs_type = { + .owner = THIS_MODULE, + .name = "hfsplus", + .mount = hfsplus_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void hfsplus_init_once(void *p) +{ + struct hfsplus_inode_info *i = p; + + inode_init_once(&i->vfs_inode); +} + +static int __init init_hfsplus_fs(void) +{ + int err; + + hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + hfsplus_init_once); + if (!hfsplus_inode_cachep) + return -ENOMEM; + err = register_filesystem(&hfsplus_fs_type); + if (err) + kmem_cache_destroy(hfsplus_inode_cachep); + return err; +} + +static void __exit exit_hfsplus_fs(void) +{ + unregister_filesystem(&hfsplus_fs_type); + kmem_cache_destroy(hfsplus_inode_cachep); +} + +module_init(init_hfsplus_fs) +module_exit(exit_hfsplus_fs) diff --git a/fs/hfsplus/tables.c b/fs/hfsplus/tables.c new file mode 100644 index 00000000..1b911730 --- /dev/null +++ b/fs/hfsplus/tables.c @@ -0,0 +1,3245 @@ +/* + * linux/fs/hfsplus/tables.c + * + * Various data tables + */ + +#include "hfsplus_fs.h" + +/* + * Unicode case folding table taken from Apple Technote #1150 + * (HFS Plus Volume Format) + */ + +u16 hfsplus_case_fold_table[] = { +/* + * The lower case table consists of a 256-entry high-byte table followed by + * some number of 256-entry subtables. The high-byte table contains either an + * offset to the subtable for characters with that high byte or zero, which + * means that there are no case mappings or ignored characters in that block. + * Ignored characters are mapped to zero. 
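Before the raw data, it may help to see how a lookup through the two-level layout just described proceeds; the driver performs this step when folding names for case-insensitive comparison (its helper lives in unicode.c). A minimal, self-contained sketch using a toy table of the same shape; toy_fold_table and toy_fold are illustrative names, not part of this patch:

#include <stdint.h>
#include <stdio.h>

/*
 * Miniature table with the same two-level layout as hfsplus_case_fold_table:
 * a 256-entry high-byte index (here only high byte 0x00 gets a subtable, at
 * offset 0x0100) followed by one 256-entry subtable holding the folded value
 * for each low byte.  In the index, 0 means "no mappings or ignorables in
 * this block"; in a subtable, 0 means "ignore the character".
 */
static uint16_t toy_fold_table[0x100 + 0x100];

static uint16_t toy_fold(uint16_t c)
{
	uint16_t off = toy_fold_table[c >> 8];

	if (!off)			/* no subtable: folds to itself */
		return c;
	return toy_fold_table[off + (c & 0xff)];
}

int main(void)
{
	int c;

	toy_fold_table[0x00] = 0x0100;	/* high byte 0x00 -> subtable at 0x0100 */
	for (c = 0; c < 0x100; c++)	/* 'A'..'Z' fold to 'a'..'z', rest to itself */
		toy_fold_table[0x0100 + c] =
			(c >= 'A' && c <= 'Z') ? c + 0x20 : c;

	printf("fold('Q') = '%c'\n", toy_fold('Q'));		/* prints q */
	printf("fold(0x3042) = 0x%04x\n", (unsigned)toy_fold(0x3042));	/* block unmapped: unchanged */
	return 0;
}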
+ */ + + // High-byte indices ( == 0 iff no case mapping and no ignorables ) + + + /* 0 */ 0x0100, 0x0200, 0x0000, 0x0300, 0x0400, 0x0500, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x0600, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 2 */ 0x0700, 0x0800, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 3 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 4 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 5 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 6 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 9 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* A */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* B */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* C */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* D */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* E */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* F */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0900, 0x0A00, + + // Table 1 (for high byte 0x00) + + /* 0 */ 0xFFFF, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, + /* 1 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, + /* 2 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, + /* 3 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, + /* 4 */ 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 5 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, + /* 6 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 7 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + /* 8 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 
0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + /* 9 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + /* A */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + /* B */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, + 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + /* C */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00E6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + /* D */ 0x00F0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00F8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00FE, 0x00DF, + /* E */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + /* F */ 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, + + // Table 2 (for high byte 0x01) + + /* 0 */ 0x0100, 0x0101, 0x0102, 0x0103, 0x0104, 0x0105, 0x0106, 0x0107, + 0x0108, 0x0109, 0x010A, 0x010B, 0x010C, 0x010D, 0x010E, 0x010F, + /* 1 */ 0x0111, 0x0111, 0x0112, 0x0113, 0x0114, 0x0115, 0x0116, 0x0117, + 0x0118, 0x0119, 0x011A, 0x011B, 0x011C, 0x011D, 0x011E, 0x011F, + /* 2 */ 0x0120, 0x0121, 0x0122, 0x0123, 0x0124, 0x0125, 0x0127, 0x0127, + 0x0128, 0x0129, 0x012A, 0x012B, 0x012C, 0x012D, 0x012E, 0x012F, + /* 3 */ 0x0130, 0x0131, 0x0133, 0x0133, 0x0134, 0x0135, 0x0136, 0x0137, + 0x0138, 0x0139, 0x013A, 0x013B, 0x013C, 0x013D, 0x013E, 0x0140, + /* 4 */ 0x0140, 0x0142, 0x0142, 0x0143, 0x0144, 0x0145, 0x0146, 0x0147, + 0x0148, 0x0149, 0x014B, 0x014B, 0x014C, 0x014D, 0x014E, 0x014F, + /* 5 */ 0x0150, 0x0151, 0x0153, 0x0153, 0x0154, 0x0155, 0x0156, 0x0157, + 0x0158, 0x0159, 0x015A, 0x015B, 0x015C, 0x015D, 0x015E, 0x015F, + /* 6 */ 0x0160, 0x0161, 0x0162, 0x0163, 0x0164, 0x0165, 0x0167, 0x0167, + 0x0168, 0x0169, 0x016A, 0x016B, 0x016C, 0x016D, 0x016E, 0x016F, + /* 7 */ 0x0170, 0x0171, 0x0172, 0x0173, 0x0174, 0x0175, 0x0176, 0x0177, + 0x0178, 0x0179, 0x017A, 0x017B, 0x017C, 0x017D, 0x017E, 0x017F, + /* 8 */ 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188, + 0x0188, 0x0256, 0x0257, 0x018C, 0x018C, 0x018D, 0x01DD, 0x0259, + /* 9 */ 0x025B, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268, + 0x0199, 0x0199, 0x019A, 0x019B, 0x026F, 0x0272, 0x019E, 0x0275, + /* A */ 0x01A0, 0x01A1, 0x01A3, 0x01A3, 0x01A5, 0x01A5, 0x01A6, 0x01A8, + 0x01A8, 0x0283, 0x01AA, 0x01AB, 0x01AD, 0x01AD, 0x0288, 0x01AF, + /* B */ 0x01B0, 0x028A, 0x028B, 0x01B4, 0x01B4, 0x01B6, 0x01B6, 0x0292, + 0x01B9, 0x01B9, 0x01BA, 0x01BB, 0x01BD, 0x01BD, 0x01BE, 0x01BF, + /* C */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C6, 0x01C6, 0x01C6, 0x01C9, + 0x01C9, 0x01C9, 0x01CC, 0x01CC, 0x01CC, 0x01CD, 0x01CE, 0x01CF, + /* D */ 0x01D0, 0x01D1, 0x01D2, 0x01D3, 0x01D4, 0x01D5, 0x01D6, 0x01D7, + 0x01D8, 0x01D9, 0x01DA, 0x01DB, 0x01DC, 0x01DD, 0x01DE, 0x01DF, + /* E */ 0x01E0, 0x01E1, 0x01E2, 0x01E3, 0x01E5, 0x01E5, 0x01E6, 0x01E7, + 0x01E8, 0x01E9, 0x01EA, 0x01EB, 0x01EC, 0x01ED, 0x01EE, 0x01EF, + /* F */ 0x01F0, 0x01F3, 0x01F3, 0x01F3, 0x01F4, 0x01F5, 0x01F6, 0x01F7, + 0x01F8, 0x01F9, 0x01FA, 0x01FB, 0x01FC, 0x01FD, 0x01FE, 0x01FF, + + // Table 3 (for high byte 0x03) + + /* 0 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, + /* 1 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 
0x0315, 0x0316, 0x0317, + 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, + /* 2 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, + /* 3 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, + /* 4 */ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, + 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, + /* 5 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, + /* 6 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, + /* 7 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, + 0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F, + /* 8 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, + 0x0388, 0x0389, 0x038A, 0x038B, 0x038C, 0x038D, 0x038E, 0x038F, + /* 9 */ 0x0390, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* A */ 0x03C0, 0x03C1, 0x03A2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF, + /* B */ 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* C */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x03CF, + /* D */ 0x03D0, 0x03D1, 0x03D2, 0x03D3, 0x03D4, 0x03D5, 0x03D6, 0x03D7, + 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, + /* E */ 0x03E0, 0x03E1, 0x03E3, 0x03E3, 0x03E5, 0x03E5, 0x03E7, 0x03E7, + 0x03E9, 0x03E9, 0x03EB, 0x03EB, 0x03ED, 0x03ED, 0x03EF, 0x03EF, + /* F */ 0x03F0, 0x03F1, 0x03F2, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7, + 0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, + + // Table 4 (for high byte 0x04) + + /* 0 */ 0x0400, 0x0401, 0x0452, 0x0403, 0x0454, 0x0455, 0x0456, 0x0407, + 0x0458, 0x0459, 0x045A, 0x045B, 0x040C, 0x040D, 0x040E, 0x045F, + /* 1 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0419, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 2 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 3 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 4 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 5 */ 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, + /* 6 */ 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467, + 0x0469, 0x0469, 0x046B, 0x046B, 0x046D, 0x046D, 0x046F, 0x046F, + /* 7 */ 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0476, 0x0477, + 0x0479, 0x0479, 0x047B, 0x047B, 0x047D, 0x047D, 0x047F, 0x047F, + /* 8 */ 0x0481, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, + /* 9 */ 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497, + 0x0499, 0x0499, 0x049B, 0x049B, 0x049D, 0x049D, 0x049F, 0x049F, + /* A */ 0x04A1, 0x04A1, 0x04A3, 0x04A3, 0x04A5, 0x04A5, 0x04A7, 
0x04A7, + 0x04A9, 0x04A9, 0x04AB, 0x04AB, 0x04AD, 0x04AD, 0x04AF, 0x04AF, + /* B */ 0x04B1, 0x04B1, 0x04B3, 0x04B3, 0x04B5, 0x04B5, 0x04B7, 0x04B7, + 0x04B9, 0x04B9, 0x04BB, 0x04BB, 0x04BD, 0x04BD, 0x04BF, 0x04BF, + /* C */ 0x04C0, 0x04C1, 0x04C2, 0x04C4, 0x04C4, 0x04C5, 0x04C6, 0x04C8, + 0x04C8, 0x04C9, 0x04CA, 0x04CC, 0x04CC, 0x04CD, 0x04CE, 0x04CF, + /* D */ 0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x04D4, 0x04D5, 0x04D6, 0x04D7, + 0x04D8, 0x04D9, 0x04DA, 0x04DB, 0x04DC, 0x04DD, 0x04DE, 0x04DF, + /* E */ 0x04E0, 0x04E1, 0x04E2, 0x04E3, 0x04E4, 0x04E5, 0x04E6, 0x04E7, + 0x04E8, 0x04E9, 0x04EA, 0x04EB, 0x04EC, 0x04ED, 0x04EE, 0x04EF, + /* F */ 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6, 0x04F7, + 0x04F8, 0x04F9, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, + + // Table 5 (for high byte 0x05) + + /* 0 */ 0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507, + 0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F, + /* 1 */ 0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517, + 0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F, + /* 2 */ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, + 0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F, + /* 3 */ 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 4 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 5 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557, + 0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F, + /* 6 */ 0x0560, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 7 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 8 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0587, + 0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F, + /* 9 */ 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, + 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F, + /* A */ 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, + 0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF, + /* B */ 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, + 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF, + /* C */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7, + 0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF, + /* D */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, + 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, + /* E */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, + 0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF, + /* F */ 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7, + 0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF, + + // Table 6 (for high byte 0x10) + + /* 0 */ 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007, + 0x1008, 0x1009, 0x100A, 0x100B, 0x100C, 0x100D, 0x100E, 0x100F, + /* 1 */ 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017, + 0x1018, 0x1019, 0x101A, 0x101B, 0x101C, 0x101D, 0x101E, 0x101F, + /* 2 */ 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1026, 0x1027, + 0x1028, 0x1029, 0x102A, 0x102B, 0x102C, 0x102D, 0x102E, 0x102F, + /* 3 */ 0x1030, 0x1031, 0x1032, 0x1033, 
0x1034, 0x1035, 0x1036, 0x1037, + 0x1038, 0x1039, 0x103A, 0x103B, 0x103C, 0x103D, 0x103E, 0x103F, + /* 4 */ 0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 0x1047, + 0x1048, 0x1049, 0x104A, 0x104B, 0x104C, 0x104D, 0x104E, 0x104F, + /* 5 */ 0x1050, 0x1051, 0x1052, 0x1053, 0x1054, 0x1055, 0x1056, 0x1057, + 0x1058, 0x1059, 0x105A, 0x105B, 0x105C, 0x105D, 0x105E, 0x105F, + /* 6 */ 0x1060, 0x1061, 0x1062, 0x1063, 0x1064, 0x1065, 0x1066, 0x1067, + 0x1068, 0x1069, 0x106A, 0x106B, 0x106C, 0x106D, 0x106E, 0x106F, + /* 7 */ 0x1070, 0x1071, 0x1072, 0x1073, 0x1074, 0x1075, 0x1076, 0x1077, + 0x1078, 0x1079, 0x107A, 0x107B, 0x107C, 0x107D, 0x107E, 0x107F, + /* 8 */ 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087, + 0x1088, 0x1089, 0x108A, 0x108B, 0x108C, 0x108D, 0x108E, 0x108F, + /* 9 */ 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097, + 0x1098, 0x1099, 0x109A, 0x109B, 0x109C, 0x109D, 0x109E, 0x109F, + /* A */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, + 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* B */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, + 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* C */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10C6, 0x10C7, + 0x10C8, 0x10C9, 0x10CA, 0x10CB, 0x10CC, 0x10CD, 0x10CE, 0x10CF, + /* D */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, + 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* E */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, + 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* F */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10F6, 0x10F7, + 0x10F8, 0x10F9, 0x10FA, 0x10FB, 0x10FC, 0x10FD, 0x10FE, 0x10FF, + + // Table 7 (for high byte 0x20) + + /* 0 */ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, + 0x2008, 0x2009, 0x200A, 0x200B, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, + 0x2018, 0x2019, 0x201A, 0x201B, 0x201C, 0x201D, 0x201E, 0x201F, + /* 2 */ 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, + 0x2028, 0x2029, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x202F, + /* 3 */ 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, + 0x2038, 0x2039, 0x203A, 0x203B, 0x203C, 0x203D, 0x203E, 0x203F, + /* 4 */ 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x2047, + 0x2048, 0x2049, 0x204A, 0x204B, 0x204C, 0x204D, 0x204E, 0x204F, + /* 5 */ 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, + 0x2058, 0x2059, 0x205A, 0x205B, 0x205C, 0x205D, 0x205E, 0x205F, + /* 6 */ 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, + 0x2068, 0x2069, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077, + 0x2078, 0x2079, 0x207A, 0x207B, 0x207C, 0x207D, 0x207E, 0x207F, + /* 8 */ 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, + 0x2088, 0x2089, 0x208A, 0x208B, 0x208C, 0x208D, 0x208E, 0x208F, + /* 9 */ 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, + 0x2098, 0x2099, 0x209A, 0x209B, 0x209C, 0x209D, 0x209E, 0x209F, + /* A */ 0x20A0, 0x20A1, 0x20A2, 0x20A3, 0x20A4, 0x20A5, 0x20A6, 0x20A7, + 0x20A8, 0x20A9, 0x20AA, 0x20AB, 0x20AC, 0x20AD, 0x20AE, 0x20AF, + /* B */ 0x20B0, 0x20B1, 0x20B2, 0x20B3, 0x20B4, 0x20B5, 0x20B6, 0x20B7, + 0x20B8, 0x20B9, 0x20BA, 0x20BB, 0x20BC, 0x20BD, 0x20BE, 0x20BF, + /* C */ 0x20C0, 0x20C1, 0x20C2, 0x20C3, 0x20C4, 0x20C5, 
0x20C6, 0x20C7, + 0x20C8, 0x20C9, 0x20CA, 0x20CB, 0x20CC, 0x20CD, 0x20CE, 0x20CF, + /* D */ 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, 0x20D6, 0x20D7, + 0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20DD, 0x20DE, 0x20DF, + /* E */ 0x20E0, 0x20E1, 0x20E2, 0x20E3, 0x20E4, 0x20E5, 0x20E6, 0x20E7, + 0x20E8, 0x20E9, 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF, + /* F */ 0x20F0, 0x20F1, 0x20F2, 0x20F3, 0x20F4, 0x20F5, 0x20F6, 0x20F7, + 0x20F8, 0x20F9, 0x20FA, 0x20FB, 0x20FC, 0x20FD, 0x20FE, 0x20FF, + + // Table 8 (for high byte 0x21) + + /* 0 */ 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, + 0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 0x210D, 0x210E, 0x210F, + /* 1 */ 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, + 0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F, + /* 2 */ 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, + 0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F, + /* 3 */ 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, + 0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F, + /* 4 */ 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, + 0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F, + /* 5 */ 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, + 0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F, + /* 6 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, + 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 7 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, + 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 8 */ 0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187, + 0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F, + /* 9 */ 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, + 0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F, + /* A */ 0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7, + 0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF, + /* B */ 0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7, + 0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF, + /* C */ 0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7, + 0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF, + /* D */ 0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7, + 0x21D8, 0x21D9, 0x21DA, 0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF, + /* E */ 0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7, + 0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF, + /* F */ 0x21F0, 0x21F1, 0x21F2, 0x21F3, 0x21F4, 0x21F5, 0x21F6, 0x21F7, + 0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF, + + // Table 9 (for high byte 0xFE) + + /* 0 */ 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07, + 0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, + /* 1 */ 0xFE10, 0xFE11, 0xFE12, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE17, + 0xFE18, 0xFE19, 0xFE1A, 0xFE1B, 0xFE1C, 0xFE1D, 0xFE1E, 0xFE1F, + /* 2 */ 0xFE20, 0xFE21, 0xFE22, 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27, + 0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F, + /* 3 */ 0xFE30, 0xFE31, 0xFE32, 0xFE33, 0xFE34, 0xFE35, 0xFE36, 0xFE37, + 0xFE38, 0xFE39, 0xFE3A, 0xFE3B, 0xFE3C, 0xFE3D, 0xFE3E, 0xFE3F, + /* 4 */ 0xFE40, 0xFE41, 0xFE42, 0xFE43, 0xFE44, 0xFE45, 0xFE46, 0xFE47, + 0xFE48, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C, 0xFE4D, 0xFE4E, 0xFE4F, + /* 5 */ 0xFE50, 0xFE51, 0xFE52, 
0xFE53, 0xFE54, 0xFE55, 0xFE56, 0xFE57, + 0xFE58, 0xFE59, 0xFE5A, 0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE5F, + /* 6 */ 0xFE60, 0xFE61, 0xFE62, 0xFE63, 0xFE64, 0xFE65, 0xFE66, 0xFE67, + 0xFE68, 0xFE69, 0xFE6A, 0xFE6B, 0xFE6C, 0xFE6D, 0xFE6E, 0xFE6F, + /* 7 */ 0xFE70, 0xFE71, 0xFE72, 0xFE73, 0xFE74, 0xFE75, 0xFE76, 0xFE77, + 0xFE78, 0xFE79, 0xFE7A, 0xFE7B, 0xFE7C, 0xFE7D, 0xFE7E, 0xFE7F, + /* 8 */ 0xFE80, 0xFE81, 0xFE82, 0xFE83, 0xFE84, 0xFE85, 0xFE86, 0xFE87, + 0xFE88, 0xFE89, 0xFE8A, 0xFE8B, 0xFE8C, 0xFE8D, 0xFE8E, 0xFE8F, + /* 9 */ 0xFE90, 0xFE91, 0xFE92, 0xFE93, 0xFE94, 0xFE95, 0xFE96, 0xFE97, + 0xFE98, 0xFE99, 0xFE9A, 0xFE9B, 0xFE9C, 0xFE9D, 0xFE9E, 0xFE9F, + /* A */ 0xFEA0, 0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4, 0xFEA5, 0xFEA6, 0xFEA7, + 0xFEA8, 0xFEA9, 0xFEAA, 0xFEAB, 0xFEAC, 0xFEAD, 0xFEAE, 0xFEAF, + /* B */ 0xFEB0, 0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4, 0xFEB5, 0xFEB6, 0xFEB7, + 0xFEB8, 0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC, 0xFEBD, 0xFEBE, 0xFEBF, + /* C */ 0xFEC0, 0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4, 0xFEC5, 0xFEC6, 0xFEC7, + 0xFEC8, 0xFEC9, 0xFECA, 0xFECB, 0xFECC, 0xFECD, 0xFECE, 0xFECF, + /* D */ 0xFED0, 0xFED1, 0xFED2, 0xFED3, 0xFED4, 0xFED5, 0xFED6, 0xFED7, + 0xFED8, 0xFED9, 0xFEDA, 0xFEDB, 0xFEDC, 0xFEDD, 0xFEDE, 0xFEDF, + /* E */ 0xFEE0, 0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4, 0xFEE5, 0xFEE6, 0xFEE7, + 0xFEE8, 0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC, 0xFEED, 0xFEEE, 0xFEEF, + /* F */ 0xFEF0, 0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4, 0xFEF5, 0xFEF6, 0xFEF7, + 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC, 0xFEFD, 0xFEFE, 0x0000, + + // Table 10 (for high byte 0xFF) + + /* 0 */ 0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07, + 0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F, + /* 1 */ 0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17, + 0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F, + /* 2 */ 0xFF20, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, + 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 3 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, + 0xFF58, 0xFF59, 0xFF5A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F, + /* 4 */ 0xFF40, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, + 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 5 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, + 0xFF58, 0xFF59, 0xFF5A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F, + /* 6 */ 0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67, + 0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 0xFF6D, 0xFF6E, 0xFF6F, + /* 7 */ 0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77, + 0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F, + /* 8 */ 0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87, + 0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F, + /* 9 */ 0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97, + 0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F, + /* A */ 0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7, + 0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF, + /* B */ 0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7, + 0xFFB8, 0xFFB9, 0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF, + /* C */ 0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7, + 0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF, + /* D */ 0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7, + 0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF, + /* E */ 0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 
0xFFE5, 0xFFE6, 0xFFE7, + 0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF, + /* F */ 0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7, + 0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF, +}; + +u16 hfsplus_decompose_table[] = { + /* base table */ + 0x0010, 0x04c0, 0x0000, 0x06f0, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x07b0, + /* char table 0x0___ */ + 0x0020, 0x0070, 0x0160, 0x0190, 0x0230, 0x0000, 0x0000, 0x0000, + 0x0000, 0x02d0, 0x0340, 0x0360, 0x03b0, 0x03e0, 0x0400, 0x0430, + /* char table 0x00__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0030, 0x0040, 0x0050, 0x0060, + /* char values 0x00c_ */ + 0x2042, 0x204a, 0x2052, 0x205a, 0x2062, 0x206a, 0x0000, 0x2072, + 0x207a, 0x2082, 0x208a, 0x2092, 0x209a, 0x20a2, 0x20aa, 0x20b2, + /* char values 0x00d_ */ + 0x0000, 0x20ba, 0x20c2, 0x20ca, 0x20d2, 0x20da, 0x20e2, 0x0000, + 0x0000, 0x20ea, 0x20f2, 0x20fa, 0x2102, 0x210a, 0x0000, 0x0000, + /* char values 0x00e_ */ + 0x2112, 0x211a, 0x2122, 0x212a, 0x2132, 0x213a, 0x0000, 0x2142, + 0x214a, 0x2152, 0x215a, 0x2162, 0x216a, 0x2172, 0x217a, 0x2182, + /* char values 0x00f_ */ + 0x0000, 0x218a, 0x2192, 0x219a, 0x21a2, 0x21aa, 0x21b2, 0x0000, + 0x0000, 0x21ba, 0x21c2, 0x21ca, 0x21d2, 0x21da, 0x0000, 0x21e2, + /* char table 0x01__ */ + 0x0080, 0x0090, 0x00a0, 0x00b0, 0x00c0, 0x00d0, 0x00e0, 0x00f0, + 0x0000, 0x0000, 0x0100, 0x0110, 0x0120, 0x0130, 0x0140, 0x0150, + /* char values 0x010_ */ + 0x21ea, 0x21f2, 0x21fa, 0x2202, 0x220a, 0x2212, 0x221a, 0x2222, + 0x222a, 0x2232, 0x223a, 0x2242, 0x224a, 0x2252, 0x225a, 0x2262, + /* char values 0x011_ */ + 0x0000, 0x0000, 0x226a, 0x2272, 0x227a, 0x2282, 0x228a, 0x2292, + 0x229a, 0x22a2, 0x22aa, 0x22b2, 0x22ba, 0x22c2, 0x22ca, 0x22d2, + /* char values 0x012_ */ + 0x22da, 0x22e2, 0x22ea, 0x22f2, 0x22fa, 0x2302, 0x0000, 0x0000, + 0x230a, 0x2312, 0x231a, 0x2322, 0x232a, 0x2332, 0x233a, 0x2342, + /* char values 0x013_ */ + 0x234a, 0x0000, 0x0000, 0x0000, 0x2352, 0x235a, 0x2362, 0x236a, + 0x0000, 0x2372, 0x237a, 0x2382, 0x238a, 0x2392, 0x239a, 0x0000, + /* char values 0x014_ */ + 0x0000, 0x0000, 0x0000, 0x23a2, 0x23aa, 0x23b2, 0x23ba, 0x23c2, + 0x23ca, 0x0000, 0x0000, 0x0000, 0x23d2, 0x23da, 0x23e2, 0x23ea, + /* char values 0x015_ */ + 0x23f2, 0x23fa, 0x0000, 0x0000, 0x2402, 0x240a, 0x2412, 0x241a, + 0x2422, 0x242a, 0x2432, 0x243a, 0x2442, 0x244a, 0x2452, 0x245a, + /* char values 0x016_ */ + 0x2462, 0x246a, 0x2472, 0x247a, 0x2482, 0x248a, 0x0000, 0x0000, + 0x2492, 0x249a, 0x24a2, 0x24aa, 0x24b2, 0x24ba, 0x24c2, 0x24ca, + /* char values 0x017_ */ + 0x24d2, 0x24da, 0x24e2, 0x24ea, 0x24f2, 0x24fa, 0x2502, 0x250a, + 0x2512, 0x251a, 0x2522, 0x252a, 0x2532, 0x253a, 0x2542, 0x0000, + /* char values 0x01a_ */ + 0x254a, 0x2552, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x255a, + /* char values 0x01b_ */ + 0x2562, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x01c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x256a, 0x2572, 0x257a, + /* char values 0x01d_ */ + 0x2582, 0x258a, 0x2592, 0x259a, 0x25a2, 0x25ab, 0x25b7, 0x25c3, + 0x25cf, 0x25db, 0x25e7, 0x25f3, 0x25ff, 0x0000, 0x260b, 0x2617, + /* char values 0x01e_ */ + 0x2623, 0x262f, 0x263a, 0x2642, 0x0000, 0x0000, 0x264a, 0x2652, + 0x265a, 0x2662, 
0x266a, 0x2672, 0x267b, 0x2687, 0x2692, 0x269a, + /* char values 0x01f_ */ + 0x26a2, 0x0000, 0x0000, 0x0000, 0x26aa, 0x26b2, 0x0000, 0x0000, + 0x0000, 0x0000, 0x26bb, 0x26c7, 0x26d2, 0x26da, 0x26e2, 0x26ea, + /* char table 0x02__ */ + 0x0170, 0x0180, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x020_ */ + 0x26f2, 0x26fa, 0x2702, 0x270a, 0x2712, 0x271a, 0x2722, 0x272a, + 0x2732, 0x273a, 0x2742, 0x274a, 0x2752, 0x275a, 0x2762, 0x276a, + /* char values 0x021_ */ + 0x2772, 0x277a, 0x2782, 0x278a, 0x2792, 0x279a, 0x27a2, 0x27aa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x03__ */ + 0x0000, 0x01a0, 0x0000, 0x0000, 0x01b0, 0x0000, 0x0000, 0x01c0, + 0x01d0, 0x01e0, 0x01f0, 0x0200, 0x0210, 0x0220, 0x0000, 0x0000, + /* char values 0x031_ */ + 0x27b2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x034_ */ + 0x27b9, 0x27bd, 0x0000, 0x27c1, 0x27c6, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x037_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x27cd, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d1, 0x0000, + /* char values 0x038_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d6, 0x27de, 0x27e5, + 0x27ea, 0x27f2, 0x27fa, 0x0000, 0x2802, 0x0000, 0x280a, 0x2812, + /* char values 0x039_ */ + 0x281b, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x03a_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2826, 0x282e, 0x2836, 0x283e, 0x2846, 0x284e, + /* char values 0x03b_ */ + 0x2857, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x03c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2862, 0x286a, 0x2872, 0x287a, 0x2882, 0x0000, + /* char values 0x03d_ */ + 0x0000, 0x0000, 0x0000, 0x288a, 0x2892, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x04__ */ + 0x0240, 0x0250, 0x0000, 0x0260, 0x0000, 0x0270, 0x0000, 0x0280, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0290, 0x02a0, 0x02b0, 0x02c0, + /* char values 0x040_ */ + 0x0000, 0x289a, 0x0000, 0x28a2, 0x0000, 0x0000, 0x0000, 0x28aa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x28b2, 0x0000, 0x28ba, 0x0000, + /* char values 0x041_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x28c2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x043_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x28ca, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x045_ */ + 0x0000, 0x28d2, 0x0000, 0x28da, 0x0000, 0x0000, 0x0000, 0x28e2, + 0x0000, 0x0000, 0x0000, 0x0000, 0x28ea, 0x0000, 0x28f2, 0x0000, + /* char values 0x047_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x28fa, 0x2902, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x04c_ */ + 0x0000, 0x290a, 0x2912, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x04d_ */ + 0x291a, 0x2922, 0x292a, 0x2932, 0x2939, 0x293d, 0x2942, 0x294a, + 0x2951, 0x2955, 0x295a, 0x2962, 0x296a, 0x2972, 0x297a, 0x2982, + /* char 
values 0x04e_ */ + 0x2989, 0x298d, 0x2992, 0x299a, 0x29a2, 0x29aa, 0x29b2, 0x29ba, + 0x29c1, 0x29c5, 0x29ca, 0x29d2, 0x0000, 0x0000, 0x29da, 0x29e2, + /* char values 0x04f_ */ + 0x29ea, 0x29f2, 0x29fa, 0x2a02, 0x2a0a, 0x2a12, 0x0000, 0x0000, + 0x2a1a, 0x2a22, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x09__ */ + 0x0000, 0x0000, 0x02e0, 0x02f0, 0x0000, 0x0300, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0310, 0x0320, 0x0330, 0x0000, 0x0000, + /* char values 0x092_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2a2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x093_ */ + 0x0000, 0x2a32, 0x0000, 0x0000, 0x2a3a, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x095_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2a42, 0x2a4a, 0x2a52, 0x2a5a, 0x2a62, 0x2a6a, 0x2a72, 0x2a7a, + /* char values 0x09b_ */ + 0x2a82, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x09c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2a8a, 0x2a92, 0x0000, 0x0000, 0x0000, + /* char values 0x09d_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2a9a, 0x2aa2, 0x0000, 0x2aaa, + /* char table 0x0a__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0350, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0a5_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2ab2, 0x2aba, 0x2ac2, 0x2aca, 0x0000, 0x2ad2, 0x0000, + /* char table 0x0b__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0370, 0x0380, 0x0000, 0x0000, + 0x0000, 0x0390, 0x0000, 0x0000, 0x03a0, 0x0000, 0x0000, 0x0000, + /* char values 0x0b4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2ada, 0x0000, 0x0000, 0x2ae2, 0x2aea, 0x0000, 0x0000, 0x0000, + /* char values 0x0b5_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2af2, 0x2afa, 0x0000, 0x2b02, + /* char values 0x0b9_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x2b0a, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0bc_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2b12, 0x2b1a, 0x2b22, 0x0000, 0x0000, 0x0000, + /* char table 0x0c__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x03c0, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x03d0, 0x0000, 0x0000, 0x0000, + /* char values 0x0c4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2b2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0cc_ */ + 0x2b32, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b3a, + 0x2b42, 0x0000, 0x2b4a, 0x2b53, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x0d__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x03f0, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0d4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2b5e, 0x2b66, 0x2b6e, 0x0000, 0x0000, 0x0000, + /* char table 0x0e__ */ + 0x0000, 0x0000, 0x0000, 0x0410, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0420, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0e3_ */ + 0x0000, 0x0000, 0x0000, 0x2b76, 0x0000, 
0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0eb_ */ + 0x0000, 0x0000, 0x0000, 0x2b7e, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x0f__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0440, 0x0450, 0x0460, 0x0470, + 0x0480, 0x0490, 0x04a0, 0x04b0, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f4_ */ + 0x0000, 0x0000, 0x0000, 0x2b86, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b8e, 0x0000, 0x0000, + /* char values 0x0f5_ */ + 0x0000, 0x0000, 0x2b96, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b9e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2ba6, 0x0000, 0x0000, 0x0000, + /* char values 0x0f6_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2bae, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f7_ */ + 0x0000, 0x0000, 0x0000, 0x2bb6, 0x0000, 0x2bbe, 0x2bc6, 0x2bcf, + 0x2bda, 0x2be3, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f8_ */ + 0x0000, 0x2bee, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f9_ */ + 0x0000, 0x0000, 0x0000, 0x2bf6, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2bfe, 0x0000, 0x0000, + /* char values 0x0fa_ */ + 0x0000, 0x0000, 0x2c06, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c0e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2c16, 0x0000, 0x0000, 0x0000, + /* char values 0x0fb_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2c1e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x1___ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x04d0, 0x05e0, + /* char table 0x1e__ */ + 0x04e0, 0x04f0, 0x0500, 0x0510, 0x0520, 0x0530, 0x0540, 0x0550, + 0x0560, 0x0570, 0x0580, 0x0590, 0x05a0, 0x05b0, 0x05c0, 0x05d0, + /* char values 0x1e0_ */ + 0x2c26, 0x2c2e, 0x2c36, 0x2c3e, 0x2c46, 0x2c4e, 0x2c56, 0x2c5e, + 0x2c67, 0x2c73, 0x2c7e, 0x2c86, 0x2c8e, 0x2c96, 0x2c9e, 0x2ca6, + /* char values 0x1e1_ */ + 0x2cae, 0x2cb6, 0x2cbe, 0x2cc6, 0x2ccf, 0x2cdb, 0x2ce7, 0x2cf3, + 0x2cfe, 0x2d06, 0x2d0e, 0x2d16, 0x2d1f, 0x2d2b, 0x2d36, 0x2d3e, + /* char values 0x1e2_ */ + 0x2d46, 0x2d4e, 0x2d56, 0x2d5e, 0x2d66, 0x2d6e, 0x2d76, 0x2d7e, + 0x2d86, 0x2d8e, 0x2d96, 0x2d9e, 0x2da6, 0x2dae, 0x2db7, 0x2dc3, + /* char values 0x1e3_ */ + 0x2dce, 0x2dd6, 0x2dde, 0x2de6, 0x2dee, 0x2df6, 0x2dfe, 0x2e06, + 0x2e0f, 0x2e1b, 0x2e26, 0x2e2e, 0x2e36, 0x2e3e, 0x2e46, 0x2e4e, + /* char values 0x1e4_ */ + 0x2e56, 0x2e5e, 0x2e66, 0x2e6e, 0x2e76, 0x2e7e, 0x2e86, 0x2e8e, + 0x2e96, 0x2e9e, 0x2ea6, 0x2eae, 0x2eb7, 0x2ec3, 0x2ecf, 0x2edb, + /* char values 0x1e5_ */ + 0x2ee7, 0x2ef3, 0x2eff, 0x2f0b, 0x2f16, 0x2f1e, 0x2f26, 0x2f2e, + 0x2f36, 0x2f3e, 0x2f46, 0x2f4e, 0x2f57, 0x2f63, 0x2f6e, 0x2f76, + /* char values 0x1e6_ */ + 0x2f7e, 0x2f86, 0x2f8e, 0x2f96, 0x2f9f, 0x2fab, 0x2fb7, 0x2fc3, + 0x2fcf, 0x2fdb, 0x2fe6, 0x2fee, 0x2ff6, 0x2ffe, 0x3006, 0x300e, + /* char values 0x1e7_ */ + 0x3016, 0x301e, 0x3026, 0x302e, 0x3036, 0x303e, 0x3046, 0x304e, + 0x3057, 0x3063, 0x306f, 0x307b, 0x3086, 0x308e, 0x3096, 0x309e, + /* char values 0x1e8_ */ + 0x30a6, 0x30ae, 0x30b6, 0x30be, 0x30c6, 0x30ce, 0x30d6, 0x30de, + 0x30e6, 0x30ee, 0x30f6, 0x30fe, 0x3106, 0x310e, 0x3116, 0x311e, + /* char values 0x1e9_ */ + 0x3126, 0x312e, 0x3136, 0x313e, 0x3146, 0x314e, 0x3156, 0x315e, + 0x3166, 0x316e, 0x0000, 0x3176, 
0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x1ea_ */ + 0x317e, 0x3186, 0x318e, 0x3196, 0x319f, 0x31ab, 0x31b7, 0x31c3, + 0x31cf, 0x31db, 0x31e7, 0x31f3, 0x31ff, 0x320b, 0x3217, 0x3223, + /* char values 0x1eb_ */ + 0x322f, 0x323b, 0x3247, 0x3253, 0x325f, 0x326b, 0x3277, 0x3283, + 0x328e, 0x3296, 0x329e, 0x32a6, 0x32ae, 0x32b6, 0x32bf, 0x32cb, + /* char values 0x1ec_ */ + 0x32d7, 0x32e3, 0x32ef, 0x32fb, 0x3307, 0x3313, 0x331f, 0x332b, + 0x3336, 0x333e, 0x3346, 0x334e, 0x3356, 0x335e, 0x3366, 0x336e, + /* char values 0x1ed_ */ + 0x3377, 0x3383, 0x338f, 0x339b, 0x33a7, 0x33b3, 0x33bf, 0x33cb, + 0x33d7, 0x33e3, 0x33ef, 0x33fb, 0x3407, 0x3413, 0x341f, 0x342b, + /* char values 0x1ee_ */ + 0x3437, 0x3443, 0x344f, 0x345b, 0x3466, 0x346e, 0x3476, 0x347e, + 0x3487, 0x3493, 0x349f, 0x34ab, 0x34b7, 0x34c3, 0x34cf, 0x34db, + /* char values 0x1ef_ */ + 0x34e7, 0x34f3, 0x34fe, 0x3506, 0x350e, 0x3516, 0x351e, 0x3526, + 0x352e, 0x3536, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x1f__ */ + 0x05f0, 0x0600, 0x0610, 0x0620, 0x0630, 0x0640, 0x0650, 0x0660, + 0x0670, 0x0680, 0x0690, 0x06a0, 0x06b0, 0x06c0, 0x06d0, 0x06e0, + /* char values 0x1f0_ */ + 0x353e, 0x3546, 0x354f, 0x355b, 0x3567, 0x3573, 0x357f, 0x358b, + 0x3596, 0x359e, 0x35a7, 0x35b3, 0x35bf, 0x35cb, 0x35d7, 0x35e3, + /* char values 0x1f1_ */ + 0x35ee, 0x35f6, 0x35ff, 0x360b, 0x3617, 0x3623, 0x0000, 0x0000, + 0x362e, 0x3636, 0x363f, 0x364b, 0x3657, 0x3663, 0x0000, 0x0000, + /* char values 0x1f2_ */ + 0x366e, 0x3676, 0x367f, 0x368b, 0x3697, 0x36a3, 0x36af, 0x36bb, + 0x36c6, 0x36ce, 0x36d7, 0x36e3, 0x36ef, 0x36fb, 0x3707, 0x3713, + /* char values 0x1f3_ */ + 0x371e, 0x3726, 0x372f, 0x373b, 0x3747, 0x3753, 0x375f, 0x376b, + 0x3776, 0x377e, 0x3787, 0x3793, 0x379f, 0x37ab, 0x37b7, 0x37c3, + /* char values 0x1f4_ */ + 0x37ce, 0x37d6, 0x37df, 0x37eb, 0x37f7, 0x3803, 0x0000, 0x0000, + 0x380e, 0x3816, 0x381f, 0x382b, 0x3837, 0x3843, 0x0000, 0x0000, + /* char values 0x1f5_ */ + 0x384e, 0x3856, 0x385f, 0x386b, 0x3877, 0x3883, 0x388f, 0x389b, + 0x0000, 0x38a6, 0x0000, 0x38af, 0x0000, 0x38bb, 0x0000, 0x38c7, + /* char values 0x1f6_ */ + 0x38d2, 0x38da, 0x38e3, 0x38ef, 0x38fb, 0x3907, 0x3913, 0x391f, + 0x392a, 0x3932, 0x393b, 0x3947, 0x3953, 0x395f, 0x396b, 0x3977, + /* char values 0x1f7_ */ + 0x3982, 0x398a, 0x3992, 0x399a, 0x39a2, 0x39aa, 0x39b2, 0x39ba, + 0x39c2, 0x39ca, 0x39d2, 0x39da, 0x39e2, 0x39ea, 0x0000, 0x0000, + /* char values 0x1f8_ */ + 0x39f3, 0x39ff, 0x3a0c, 0x3a1c, 0x3a2c, 0x3a3c, 0x3a4c, 0x3a5c, + 0x3a6b, 0x3a77, 0x3a84, 0x3a94, 0x3aa4, 0x3ab4, 0x3ac4, 0x3ad4, + /* char values 0x1f9_ */ + 0x3ae3, 0x3aef, 0x3afc, 0x3b0c, 0x3b1c, 0x3b2c, 0x3b3c, 0x3b4c, + 0x3b5b, 0x3b67, 0x3b74, 0x3b84, 0x3b94, 0x3ba4, 0x3bb4, 0x3bc4, + /* char values 0x1fa_ */ + 0x3bd3, 0x3bdf, 0x3bec, 0x3bfc, 0x3c0c, 0x3c1c, 0x3c2c, 0x3c3c, + 0x3c4b, 0x3c57, 0x3c64, 0x3c74, 0x3c84, 0x3c94, 0x3ca4, 0x3cb4, + /* char values 0x1fb_ */ + 0x3cc2, 0x3cca, 0x3cd3, 0x3cde, 0x3ce7, 0x0000, 0x3cf2, 0x3cfb, + 0x3d06, 0x3d0e, 0x3d16, 0x3d1e, 0x3d26, 0x0000, 0x3d2d, 0x0000, + /* char values 0x1fc_ */ + 0x0000, 0x3d32, 0x3d3b, 0x3d46, 0x3d4f, 0x0000, 0x3d5a, 0x3d63, + 0x3d6e, 0x3d76, 0x3d7e, 0x3d86, 0x3d8e, 0x3d96, 0x3d9e, 0x3da6, + /* char values 0x1fd_ */ + 0x3dae, 0x3db6, 0x3dbf, 0x3dcb, 0x0000, 0x0000, 0x3dd6, 0x3ddf, + 0x3dea, 0x3df2, 0x3dfa, 0x3e02, 0x0000, 0x3e0a, 0x3e12, 0x3e1a, + /* char values 0x1fe_ */ + 0x3e22, 0x3e2a, 0x3e33, 0x3e3f, 0x3e4a, 0x3e52, 0x3e5a, 0x3e63, + 0x3e6e, 0x3e76, 0x3e7e, 0x3e86, 0x3e8e, 0x3e96, 0x3e9e, 0x3ea5, + /* char values 0x1ff_ 
*/ + 0x0000, 0x0000, 0x3eab, 0x3eb6, 0x3ebf, 0x0000, 0x3eca, 0x3ed3, + 0x3ede, 0x3ee6, 0x3eee, 0x3ef6, 0x3efe, 0x3f05, 0x0000, 0x0000, + /* char table 0x3___ */ + 0x0700, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x30__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0710, 0x0720, 0x0730, 0x0740, + 0x0000, 0x0750, 0x0760, 0x0770, 0x0780, 0x0790, 0x0000, 0x07a0, + /* char values 0x304_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x3f0a, 0x0000, 0x3f12, 0x0000, + /* char values 0x305_ */ + 0x3f1a, 0x0000, 0x3f22, 0x0000, 0x3f2a, 0x0000, 0x3f32, 0x0000, + 0x3f3a, 0x0000, 0x3f42, 0x0000, 0x3f4a, 0x0000, 0x3f52, 0x0000, + /* char values 0x306_ */ + 0x3f5a, 0x0000, 0x3f62, 0x0000, 0x0000, 0x3f6a, 0x0000, 0x3f72, + 0x0000, 0x3f7a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x307_ */ + 0x3f82, 0x3f8a, 0x0000, 0x3f92, 0x3f9a, 0x0000, 0x3fa2, 0x3faa, + 0x0000, 0x3fb2, 0x3fba, 0x0000, 0x3fc2, 0x3fca, 0x0000, 0x0000, + /* char values 0x309_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x3fd2, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x3fda, 0x0000, + /* char values 0x30a_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x3fe2, 0x0000, 0x3fea, 0x0000, + /* char values 0x30b_ */ + 0x3ff2, 0x0000, 0x3ffa, 0x0000, 0x4002, 0x0000, 0x400a, 0x0000, + 0x4012, 0x0000, 0x401a, 0x0000, 0x4022, 0x0000, 0x402a, 0x0000, + /* char values 0x30c_ */ + 0x4032, 0x0000, 0x403a, 0x0000, 0x0000, 0x4042, 0x0000, 0x404a, + 0x0000, 0x4052, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x30d_ */ + 0x405a, 0x4062, 0x0000, 0x406a, 0x4072, 0x0000, 0x407a, 0x4082, + 0x0000, 0x408a, 0x4092, 0x0000, 0x409a, 0x40a2, 0x0000, 0x0000, + /* char values 0x30f_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x40aa, 0x0000, 0x0000, 0x40b2, + 0x40ba, 0x40c2, 0x40ca, 0x0000, 0x0000, 0x0000, 0x40d2, 0x0000, + /* char table 0xf___ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x07c0, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0xfb__ */ + 0x0000, 0x07d0, 0x07e0, 0x07f0, 0x0800, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0xfb1_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x40da, + /* char values 0xfb2_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x40e2, 0x40ea, 0x40f3, 0x40ff, 0x410a, 0x4112, + /* char values 0xfb3_ */ + 0x411a, 0x4122, 0x412a, 0x4132, 0x413a, 0x4142, 0x414a, 0x0000, + 0x4152, 0x415a, 0x4162, 0x416a, 0x4172, 0x0000, 0x417a, 0x0000, + /* char values 0xfb4_ */ + 0x4182, 0x418a, 0x0000, 0x4192, 0x419a, 0x0000, 0x41a2, 0x41aa, + 0x41b2, 0x41ba, 0x41c2, 0x41ca, 0x41d2, 0x41da, 0x41e2, 0x0000, + /* decomposed characters */ + 0x0041, 0x0300, 0x0041, 0x0301, 0x0041, 0x0302, 0x0041, 0x0303, + 0x0041, 0x0308, 0x0041, 0x030a, 0x0043, 0x0327, 0x0045, 0x0300, + 0x0045, 0x0301, 0x0045, 0x0302, 0x0045, 0x0308, 0x0049, 0x0300, + 0x0049, 0x0301, 0x0049, 0x0302, 0x0049, 0x0308, 0x004e, 0x0303, + 0x004f, 0x0300, 0x004f, 0x0301, 0x004f, 0x0302, 0x004f, 0x0303, + 0x004f, 0x0308, 0x0055, 0x0300, 0x0055, 0x0301, 0x0055, 0x0302, + 0x0055, 0x0308, 0x0059, 0x0301, 0x0061, 0x0300, 0x0061, 0x0301, + 0x0061, 0x0302, 0x0061, 0x0303, 0x0061, 0x0308, 0x0061, 0x030a, + 
0x0063, 0x0327, 0x0065, 0x0300, 0x0065, 0x0301, 0x0065, 0x0302, + 0x0065, 0x0308, 0x0069, 0x0300, 0x0069, 0x0301, 0x0069, 0x0302, + 0x0069, 0x0308, 0x006e, 0x0303, 0x006f, 0x0300, 0x006f, 0x0301, + 0x006f, 0x0302, 0x006f, 0x0303, 0x006f, 0x0308, 0x0075, 0x0300, + 0x0075, 0x0301, 0x0075, 0x0302, 0x0075, 0x0308, 0x0079, 0x0301, + 0x0079, 0x0308, 0x0041, 0x0304, 0x0061, 0x0304, 0x0041, 0x0306, + 0x0061, 0x0306, 0x0041, 0x0328, 0x0061, 0x0328, 0x0043, 0x0301, + 0x0063, 0x0301, 0x0043, 0x0302, 0x0063, 0x0302, 0x0043, 0x0307, + 0x0063, 0x0307, 0x0043, 0x030c, 0x0063, 0x030c, 0x0044, 0x030c, + 0x0064, 0x030c, 0x0045, 0x0304, 0x0065, 0x0304, 0x0045, 0x0306, + 0x0065, 0x0306, 0x0045, 0x0307, 0x0065, 0x0307, 0x0045, 0x0328, + 0x0065, 0x0328, 0x0045, 0x030c, 0x0065, 0x030c, 0x0047, 0x0302, + 0x0067, 0x0302, 0x0047, 0x0306, 0x0067, 0x0306, 0x0047, 0x0307, + 0x0067, 0x0307, 0x0047, 0x0327, 0x0067, 0x0327, 0x0048, 0x0302, + 0x0068, 0x0302, 0x0049, 0x0303, 0x0069, 0x0303, 0x0049, 0x0304, + 0x0069, 0x0304, 0x0049, 0x0306, 0x0069, 0x0306, 0x0049, 0x0328, + 0x0069, 0x0328, 0x0049, 0x0307, 0x004a, 0x0302, 0x006a, 0x0302, + 0x004b, 0x0327, 0x006b, 0x0327, 0x004c, 0x0301, 0x006c, 0x0301, + 0x004c, 0x0327, 0x006c, 0x0327, 0x004c, 0x030c, 0x006c, 0x030c, + 0x004e, 0x0301, 0x006e, 0x0301, 0x004e, 0x0327, 0x006e, 0x0327, + 0x004e, 0x030c, 0x006e, 0x030c, 0x004f, 0x0304, 0x006f, 0x0304, + 0x004f, 0x0306, 0x006f, 0x0306, 0x004f, 0x030b, 0x006f, 0x030b, + 0x0052, 0x0301, 0x0072, 0x0301, 0x0052, 0x0327, 0x0072, 0x0327, + 0x0052, 0x030c, 0x0072, 0x030c, 0x0053, 0x0301, 0x0073, 0x0301, + 0x0053, 0x0302, 0x0073, 0x0302, 0x0053, 0x0327, 0x0073, 0x0327, + 0x0053, 0x030c, 0x0073, 0x030c, 0x0054, 0x0327, 0x0074, 0x0327, + 0x0054, 0x030c, 0x0074, 0x030c, 0x0055, 0x0303, 0x0075, 0x0303, + 0x0055, 0x0304, 0x0075, 0x0304, 0x0055, 0x0306, 0x0075, 0x0306, + 0x0055, 0x030a, 0x0075, 0x030a, 0x0055, 0x030b, 0x0075, 0x030b, + 0x0055, 0x0328, 0x0075, 0x0328, 0x0057, 0x0302, 0x0077, 0x0302, + 0x0059, 0x0302, 0x0079, 0x0302, 0x0059, 0x0308, 0x005a, 0x0301, + 0x007a, 0x0301, 0x005a, 0x0307, 0x007a, 0x0307, 0x005a, 0x030c, + 0x007a, 0x030c, 0x004f, 0x031b, 0x006f, 0x031b, 0x0055, 0x031b, + 0x0075, 0x031b, 0x0041, 0x030c, 0x0061, 0x030c, 0x0049, 0x030c, + 0x0069, 0x030c, 0x004f, 0x030c, 0x006f, 0x030c, 0x0055, 0x030c, + 0x0075, 0x030c, 0x0055, 0x0308, 0x0304, 0x0075, 0x0308, 0x0304, + 0x0055, 0x0308, 0x0301, 0x0075, 0x0308, 0x0301, 0x0055, 0x0308, + 0x030c, 0x0075, 0x0308, 0x030c, 0x0055, 0x0308, 0x0300, 0x0075, + 0x0308, 0x0300, 0x0041, 0x0308, 0x0304, 0x0061, 0x0308, 0x0304, + 0x0041, 0x0307, 0x0304, 0x0061, 0x0307, 0x0304, 0x00c6, 0x0304, + 0x00e6, 0x0304, 0x0047, 0x030c, 0x0067, 0x030c, 0x004b, 0x030c, + 0x006b, 0x030c, 0x004f, 0x0328, 0x006f, 0x0328, 0x004f, 0x0328, + 0x0304, 0x006f, 0x0328, 0x0304, 0x01b7, 0x030c, 0x0292, 0x030c, + 0x006a, 0x030c, 0x0047, 0x0301, 0x0067, 0x0301, 0x0041, 0x030a, + 0x0301, 0x0061, 0x030a, 0x0301, 0x00c6, 0x0301, 0x00e6, 0x0301, + 0x00d8, 0x0301, 0x00f8, 0x0301, 0x0041, 0x030f, 0x0061, 0x030f, + 0x0041, 0x0311, 0x0061, 0x0311, 0x0045, 0x030f, 0x0065, 0x030f, + 0x0045, 0x0311, 0x0065, 0x0311, 0x0049, 0x030f, 0x0069, 0x030f, + 0x0049, 0x0311, 0x0069, 0x0311, 0x004f, 0x030f, 0x006f, 0x030f, + 0x004f, 0x0311, 0x006f, 0x0311, 0x0052, 0x030f, 0x0072, 0x030f, + 0x0052, 0x0311, 0x0072, 0x0311, 0x0055, 0x030f, 0x0075, 0x030f, + 0x0055, 0x0311, 0x0075, 0x0311, 0x0306, 0x0307, 0x0300, 0x0301, + 0x0313, 0x0308, 0x030d, 0x02b9, 0x003b, 0x00a8, 0x030d, 0x0391, + 0x030d, 0x00b7, 0x0395, 0x030d, 0x0397, 0x030d, 0x0399, 
0x030d, + 0x039f, 0x030d, 0x03a5, 0x030d, 0x03a9, 0x030d, 0x03b9, 0x0308, + 0x030d, 0x0399, 0x0308, 0x03a5, 0x0308, 0x03b1, 0x030d, 0x03b5, + 0x030d, 0x03b7, 0x030d, 0x03b9, 0x030d, 0x03c5, 0x0308, 0x030d, + 0x03b9, 0x0308, 0x03c5, 0x0308, 0x03bf, 0x030d, 0x03c5, 0x030d, + 0x03c9, 0x030d, 0x03d2, 0x030d, 0x03d2, 0x0308, 0x0415, 0x0308, + 0x0413, 0x0301, 0x0406, 0x0308, 0x041a, 0x0301, 0x0423, 0x0306, + 0x0418, 0x0306, 0x0438, 0x0306, 0x0435, 0x0308, 0x0433, 0x0301, + 0x0456, 0x0308, 0x043a, 0x0301, 0x0443, 0x0306, 0x0474, 0x030f, + 0x0475, 0x030f, 0x0416, 0x0306, 0x0436, 0x0306, 0x0410, 0x0306, + 0x0430, 0x0306, 0x0410, 0x0308, 0x0430, 0x0308, 0x00c6, 0x00e6, + 0x0415, 0x0306, 0x0435, 0x0306, 0x018f, 0x0259, 0x018f, 0x0308, + 0x0259, 0x0308, 0x0416, 0x0308, 0x0436, 0x0308, 0x0417, 0x0308, + 0x0437, 0x0308, 0x01b7, 0x0292, 0x0418, 0x0304, 0x0438, 0x0304, + 0x0418, 0x0308, 0x0438, 0x0308, 0x041e, 0x0308, 0x043e, 0x0308, + 0x019f, 0x0275, 0x019f, 0x0308, 0x0275, 0x0308, 0x0423, 0x0304, + 0x0443, 0x0304, 0x0423, 0x0308, 0x0443, 0x0308, 0x0423, 0x030b, + 0x0443, 0x030b, 0x0427, 0x0308, 0x0447, 0x0308, 0x042b, 0x0308, + 0x044b, 0x0308, 0x0928, 0x093c, 0x0930, 0x093c, 0x0933, 0x093c, + 0x0915, 0x093c, 0x0916, 0x093c, 0x0917, 0x093c, 0x091c, 0x093c, + 0x0921, 0x093c, 0x0922, 0x093c, 0x092b, 0x093c, 0x092f, 0x093c, + 0x09ac, 0x09bc, 0x09c7, 0x09be, 0x09c7, 0x09d7, 0x09a1, 0x09bc, + 0x09a2, 0x09bc, 0x09af, 0x09bc, 0x0a16, 0x0a3c, 0x0a17, 0x0a3c, + 0x0a1c, 0x0a3c, 0x0a21, 0x0a3c, 0x0a2b, 0x0a3c, 0x0b47, 0x0b56, + 0x0b47, 0x0b3e, 0x0b47, 0x0b57, 0x0b21, 0x0b3c, 0x0b22, 0x0b3c, + 0x0b2f, 0x0b3c, 0x0b92, 0x0bd7, 0x0bc6, 0x0bbe, 0x0bc7, 0x0bbe, + 0x0bc6, 0x0bd7, 0x0c46, 0x0c56, 0x0cbf, 0x0cd5, 0x0cc6, 0x0cd5, + 0x0cc6, 0x0cd6, 0x0cc6, 0x0cc2, 0x0cc6, 0x0cc2, 0x0cd5, 0x0d46, + 0x0d3e, 0x0d47, 0x0d3e, 0x0d46, 0x0d57, 0x0e4d, 0x0e32, 0x0ecd, + 0x0eb2, 0x0f42, 0x0fb7, 0x0f4c, 0x0fb7, 0x0f51, 0x0fb7, 0x0f56, + 0x0fb7, 0x0f5b, 0x0fb7, 0x0f40, 0x0fb5, 0x0f72, 0x0f71, 0x0f74, + 0x0f71, 0x0fb2, 0x0f80, 0x0fb2, 0x0f80, 0x0f71, 0x0fb3, 0x0f80, + 0x0fb3, 0x0f80, 0x0f71, 0x0f80, 0x0f71, 0x0f92, 0x0fb7, 0x0f9c, + 0x0fb7, 0x0fa1, 0x0fb7, 0x0fa6, 0x0fb7, 0x0fab, 0x0fb7, 0x0f90, + 0x0fb5, 0x0041, 0x0325, 0x0061, 0x0325, 0x0042, 0x0307, 0x0062, + 0x0307, 0x0042, 0x0323, 0x0062, 0x0323, 0x0042, 0x0331, 0x0062, + 0x0331, 0x0043, 0x0327, 0x0301, 0x0063, 0x0327, 0x0301, 0x0044, + 0x0307, 0x0064, 0x0307, 0x0044, 0x0323, 0x0064, 0x0323, 0x0044, + 0x0331, 0x0064, 0x0331, 0x0044, 0x0327, 0x0064, 0x0327, 0x0044, + 0x032d, 0x0064, 0x032d, 0x0045, 0x0304, 0x0300, 0x0065, 0x0304, + 0x0300, 0x0045, 0x0304, 0x0301, 0x0065, 0x0304, 0x0301, 0x0045, + 0x032d, 0x0065, 0x032d, 0x0045, 0x0330, 0x0065, 0x0330, 0x0045, + 0x0327, 0x0306, 0x0065, 0x0327, 0x0306, 0x0046, 0x0307, 0x0066, + 0x0307, 0x0047, 0x0304, 0x0067, 0x0304, 0x0048, 0x0307, 0x0068, + 0x0307, 0x0048, 0x0323, 0x0068, 0x0323, 0x0048, 0x0308, 0x0068, + 0x0308, 0x0048, 0x0327, 0x0068, 0x0327, 0x0048, 0x032e, 0x0068, + 0x032e, 0x0049, 0x0330, 0x0069, 0x0330, 0x0049, 0x0308, 0x0301, + 0x0069, 0x0308, 0x0301, 0x004b, 0x0301, 0x006b, 0x0301, 0x004b, + 0x0323, 0x006b, 0x0323, 0x004b, 0x0331, 0x006b, 0x0331, 0x004c, + 0x0323, 0x006c, 0x0323, 0x004c, 0x0323, 0x0304, 0x006c, 0x0323, + 0x0304, 0x004c, 0x0331, 0x006c, 0x0331, 0x004c, 0x032d, 0x006c, + 0x032d, 0x004d, 0x0301, 0x006d, 0x0301, 0x004d, 0x0307, 0x006d, + 0x0307, 0x004d, 0x0323, 0x006d, 0x0323, 0x004e, 0x0307, 0x006e, + 0x0307, 0x004e, 0x0323, 0x006e, 0x0323, 0x004e, 0x0331, 0x006e, + 0x0331, 0x004e, 0x032d, 0x006e, 0x032d, 
0x004f, 0x0303, 0x0301, + 0x006f, 0x0303, 0x0301, 0x004f, 0x0303, 0x0308, 0x006f, 0x0303, + 0x0308, 0x004f, 0x0304, 0x0300, 0x006f, 0x0304, 0x0300, 0x004f, + 0x0304, 0x0301, 0x006f, 0x0304, 0x0301, 0x0050, 0x0301, 0x0070, + 0x0301, 0x0050, 0x0307, 0x0070, 0x0307, 0x0052, 0x0307, 0x0072, + 0x0307, 0x0052, 0x0323, 0x0072, 0x0323, 0x0052, 0x0323, 0x0304, + 0x0072, 0x0323, 0x0304, 0x0052, 0x0331, 0x0072, 0x0331, 0x0053, + 0x0307, 0x0073, 0x0307, 0x0053, 0x0323, 0x0073, 0x0323, 0x0053, + 0x0301, 0x0307, 0x0073, 0x0301, 0x0307, 0x0053, 0x030c, 0x0307, + 0x0073, 0x030c, 0x0307, 0x0053, 0x0323, 0x0307, 0x0073, 0x0323, + 0x0307, 0x0054, 0x0307, 0x0074, 0x0307, 0x0054, 0x0323, 0x0074, + 0x0323, 0x0054, 0x0331, 0x0074, 0x0331, 0x0054, 0x032d, 0x0074, + 0x032d, 0x0055, 0x0324, 0x0075, 0x0324, 0x0055, 0x0330, 0x0075, + 0x0330, 0x0055, 0x032d, 0x0075, 0x032d, 0x0055, 0x0303, 0x0301, + 0x0075, 0x0303, 0x0301, 0x0055, 0x0304, 0x0308, 0x0075, 0x0304, + 0x0308, 0x0056, 0x0303, 0x0076, 0x0303, 0x0056, 0x0323, 0x0076, + 0x0323, 0x0057, 0x0300, 0x0077, 0x0300, 0x0057, 0x0301, 0x0077, + 0x0301, 0x0057, 0x0308, 0x0077, 0x0308, 0x0057, 0x0307, 0x0077, + 0x0307, 0x0057, 0x0323, 0x0077, 0x0323, 0x0058, 0x0307, 0x0078, + 0x0307, 0x0058, 0x0308, 0x0078, 0x0308, 0x0059, 0x0307, 0x0079, + 0x0307, 0x005a, 0x0302, 0x007a, 0x0302, 0x005a, 0x0323, 0x007a, + 0x0323, 0x005a, 0x0331, 0x007a, 0x0331, 0x0068, 0x0331, 0x0074, + 0x0308, 0x0077, 0x030a, 0x0079, 0x030a, 0x017f, 0x0307, 0x0041, + 0x0323, 0x0061, 0x0323, 0x0041, 0x0309, 0x0061, 0x0309, 0x0041, + 0x0302, 0x0301, 0x0061, 0x0302, 0x0301, 0x0041, 0x0302, 0x0300, + 0x0061, 0x0302, 0x0300, 0x0041, 0x0302, 0x0309, 0x0061, 0x0302, + 0x0309, 0x0041, 0x0302, 0x0303, 0x0061, 0x0302, 0x0303, 0x0041, + 0x0323, 0x0302, 0x0061, 0x0323, 0x0302, 0x0041, 0x0306, 0x0301, + 0x0061, 0x0306, 0x0301, 0x0041, 0x0306, 0x0300, 0x0061, 0x0306, + 0x0300, 0x0041, 0x0306, 0x0309, 0x0061, 0x0306, 0x0309, 0x0041, + 0x0306, 0x0303, 0x0061, 0x0306, 0x0303, 0x0041, 0x0323, 0x0306, + 0x0061, 0x0323, 0x0306, 0x0045, 0x0323, 0x0065, 0x0323, 0x0045, + 0x0309, 0x0065, 0x0309, 0x0045, 0x0303, 0x0065, 0x0303, 0x0045, + 0x0302, 0x0301, 0x0065, 0x0302, 0x0301, 0x0045, 0x0302, 0x0300, + 0x0065, 0x0302, 0x0300, 0x0045, 0x0302, 0x0309, 0x0065, 0x0302, + 0x0309, 0x0045, 0x0302, 0x0303, 0x0065, 0x0302, 0x0303, 0x0045, + 0x0323, 0x0302, 0x0065, 0x0323, 0x0302, 0x0049, 0x0309, 0x0069, + 0x0309, 0x0049, 0x0323, 0x0069, 0x0323, 0x004f, 0x0323, 0x006f, + 0x0323, 0x004f, 0x0309, 0x006f, 0x0309, 0x004f, 0x0302, 0x0301, + 0x006f, 0x0302, 0x0301, 0x004f, 0x0302, 0x0300, 0x006f, 0x0302, + 0x0300, 0x004f, 0x0302, 0x0309, 0x006f, 0x0302, 0x0309, 0x004f, + 0x0302, 0x0303, 0x006f, 0x0302, 0x0303, 0x004f, 0x0323, 0x0302, + 0x006f, 0x0323, 0x0302, 0x004f, 0x031b, 0x0301, 0x006f, 0x031b, + 0x0301, 0x004f, 0x031b, 0x0300, 0x006f, 0x031b, 0x0300, 0x004f, + 0x031b, 0x0309, 0x006f, 0x031b, 0x0309, 0x004f, 0x031b, 0x0303, + 0x006f, 0x031b, 0x0303, 0x004f, 0x031b, 0x0323, 0x006f, 0x031b, + 0x0323, 0x0055, 0x0323, 0x0075, 0x0323, 0x0055, 0x0309, 0x0075, + 0x0309, 0x0055, 0x031b, 0x0301, 0x0075, 0x031b, 0x0301, 0x0055, + 0x031b, 0x0300, 0x0075, 0x031b, 0x0300, 0x0055, 0x031b, 0x0309, + 0x0075, 0x031b, 0x0309, 0x0055, 0x031b, 0x0303, 0x0075, 0x031b, + 0x0303, 0x0055, 0x031b, 0x0323, 0x0075, 0x031b, 0x0323, 0x0059, + 0x0300, 0x0079, 0x0300, 0x0059, 0x0323, 0x0079, 0x0323, 0x0059, + 0x0309, 0x0079, 0x0309, 0x0059, 0x0303, 0x0079, 0x0303, 0x03b1, + 0x0313, 0x03b1, 0x0314, 0x03b1, 0x0313, 0x0300, 0x03b1, 0x0314, + 0x0300, 0x03b1, 0x0313, 
0x0301, 0x03b1, 0x0314, 0x0301, 0x03b1, + 0x0313, 0x0342, 0x03b1, 0x0314, 0x0342, 0x0391, 0x0313, 0x0391, + 0x0314, 0x0391, 0x0313, 0x0300, 0x0391, 0x0314, 0x0300, 0x0391, + 0x0313, 0x0301, 0x0391, 0x0314, 0x0301, 0x0391, 0x0313, 0x0342, + 0x0391, 0x0314, 0x0342, 0x03b5, 0x0313, 0x03b5, 0x0314, 0x03b5, + 0x0313, 0x0300, 0x03b5, 0x0314, 0x0300, 0x03b5, 0x0313, 0x0301, + 0x03b5, 0x0314, 0x0301, 0x0395, 0x0313, 0x0395, 0x0314, 0x0395, + 0x0313, 0x0300, 0x0395, 0x0314, 0x0300, 0x0395, 0x0313, 0x0301, + 0x0395, 0x0314, 0x0301, 0x03b7, 0x0313, 0x03b7, 0x0314, 0x03b7, + 0x0313, 0x0300, 0x03b7, 0x0314, 0x0300, 0x03b7, 0x0313, 0x0301, + 0x03b7, 0x0314, 0x0301, 0x03b7, 0x0313, 0x0342, 0x03b7, 0x0314, + 0x0342, 0x0397, 0x0313, 0x0397, 0x0314, 0x0397, 0x0313, 0x0300, + 0x0397, 0x0314, 0x0300, 0x0397, 0x0313, 0x0301, 0x0397, 0x0314, + 0x0301, 0x0397, 0x0313, 0x0342, 0x0397, 0x0314, 0x0342, 0x03b9, + 0x0313, 0x03b9, 0x0314, 0x03b9, 0x0313, 0x0300, 0x03b9, 0x0314, + 0x0300, 0x03b9, 0x0313, 0x0301, 0x03b9, 0x0314, 0x0301, 0x03b9, + 0x0313, 0x0342, 0x03b9, 0x0314, 0x0342, 0x0399, 0x0313, 0x0399, + 0x0314, 0x0399, 0x0313, 0x0300, 0x0399, 0x0314, 0x0300, 0x0399, + 0x0313, 0x0301, 0x0399, 0x0314, 0x0301, 0x0399, 0x0313, 0x0342, + 0x0399, 0x0314, 0x0342, 0x03bf, 0x0313, 0x03bf, 0x0314, 0x03bf, + 0x0313, 0x0300, 0x03bf, 0x0314, 0x0300, 0x03bf, 0x0313, 0x0301, + 0x03bf, 0x0314, 0x0301, 0x039f, 0x0313, 0x039f, 0x0314, 0x039f, + 0x0313, 0x0300, 0x039f, 0x0314, 0x0300, 0x039f, 0x0313, 0x0301, + 0x039f, 0x0314, 0x0301, 0x03c5, 0x0313, 0x03c5, 0x0314, 0x03c5, + 0x0313, 0x0300, 0x03c5, 0x0314, 0x0300, 0x03c5, 0x0313, 0x0301, + 0x03c5, 0x0314, 0x0301, 0x03c5, 0x0313, 0x0342, 0x03c5, 0x0314, + 0x0342, 0x03a5, 0x0314, 0x03a5, 0x0314, 0x0300, 0x03a5, 0x0314, + 0x0301, 0x03a5, 0x0314, 0x0342, 0x03c9, 0x0313, 0x03c9, 0x0314, + 0x03c9, 0x0313, 0x0300, 0x03c9, 0x0314, 0x0300, 0x03c9, 0x0313, + 0x0301, 0x03c9, 0x0314, 0x0301, 0x03c9, 0x0313, 0x0342, 0x03c9, + 0x0314, 0x0342, 0x03a9, 0x0313, 0x03a9, 0x0314, 0x03a9, 0x0313, + 0x0300, 0x03a9, 0x0314, 0x0300, 0x03a9, 0x0313, 0x0301, 0x03a9, + 0x0314, 0x0301, 0x03a9, 0x0313, 0x0342, 0x03a9, 0x0314, 0x0342, + 0x03b1, 0x0300, 0x03b1, 0x0301, 0x03b5, 0x0300, 0x03b5, 0x0301, + 0x03b7, 0x0300, 0x03b7, 0x0301, 0x03b9, 0x0300, 0x03b9, 0x0301, + 0x03bf, 0x0300, 0x03bf, 0x0301, 0x03c5, 0x0300, 0x03c5, 0x0301, + 0x03c9, 0x0300, 0x03c9, 0x0301, 0x03b1, 0x0345, 0x0313, 0x03b1, + 0x0345, 0x0314, 0x03b1, 0x0345, 0x0313, 0x0300, 0x03b1, 0x0345, + 0x0314, 0x0300, 0x03b1, 0x0345, 0x0313, 0x0301, 0x03b1, 0x0345, + 0x0314, 0x0301, 0x03b1, 0x0345, 0x0313, 0x0342, 0x03b1, 0x0345, + 0x0314, 0x0342, 0x0391, 0x0345, 0x0313, 0x0391, 0x0345, 0x0314, + 0x0391, 0x0345, 0x0313, 0x0300, 0x0391, 0x0345, 0x0314, 0x0300, + 0x0391, 0x0345, 0x0313, 0x0301, 0x0391, 0x0345, 0x0314, 0x0301, + 0x0391, 0x0345, 0x0313, 0x0342, 0x0391, 0x0345, 0x0314, 0x0342, + 0x03b7, 0x0345, 0x0313, 0x03b7, 0x0345, 0x0314, 0x03b7, 0x0345, + 0x0313, 0x0300, 0x03b7, 0x0345, 0x0314, 0x0300, 0x03b7, 0x0345, + 0x0313, 0x0301, 0x03b7, 0x0345, 0x0314, 0x0301, 0x03b7, 0x0345, + 0x0313, 0x0342, 0x03b7, 0x0345, 0x0314, 0x0342, 0x0397, 0x0345, + 0x0313, 0x0397, 0x0345, 0x0314, 0x0397, 0x0345, 0x0313, 0x0300, + 0x0397, 0x0345, 0x0314, 0x0300, 0x0397, 0x0345, 0x0313, 0x0301, + 0x0397, 0x0345, 0x0314, 0x0301, 0x0397, 0x0345, 0x0313, 0x0342, + 0x0397, 0x0345, 0x0314, 0x0342, 0x03c9, 0x0345, 0x0313, 0x03c9, + 0x0345, 0x0314, 0x03c9, 0x0345, 0x0313, 0x0300, 0x03c9, 0x0345, + 0x0314, 0x0300, 0x03c9, 0x0345, 0x0313, 0x0301, 0x03c9, 0x0345, + 0x0314, 
0x0301, 0x03c9, 0x0345, 0x0313, 0x0342, 0x03c9, 0x0345, + 0x0314, 0x0342, 0x03a9, 0x0345, 0x0313, 0x03a9, 0x0345, 0x0314, + 0x03a9, 0x0345, 0x0313, 0x0300, 0x03a9, 0x0345, 0x0314, 0x0300, + 0x03a9, 0x0345, 0x0313, 0x0301, 0x03a9, 0x0345, 0x0314, 0x0301, + 0x03a9, 0x0345, 0x0313, 0x0342, 0x03a9, 0x0345, 0x0314, 0x0342, + 0x03b1, 0x0306, 0x03b1, 0x0304, 0x03b1, 0x0345, 0x0300, 0x03b1, + 0x0345, 0x03b1, 0x0345, 0x0301, 0x03b1, 0x0342, 0x03b1, 0x0345, + 0x0342, 0x0391, 0x0306, 0x0391, 0x0304, 0x0391, 0x0300, 0x0391, + 0x0301, 0x0391, 0x0345, 0x03b9, 0x00a8, 0x0342, 0x03b7, 0x0345, + 0x0300, 0x03b7, 0x0345, 0x03b7, 0x0345, 0x0301, 0x03b7, 0x0342, + 0x03b7, 0x0345, 0x0342, 0x0395, 0x0300, 0x0395, 0x0301, 0x0397, + 0x0300, 0x0397, 0x0301, 0x0397, 0x0345, 0x1fbf, 0x0300, 0x1fbf, + 0x0301, 0x1fbf, 0x0342, 0x03b9, 0x0306, 0x03b9, 0x0304, 0x03b9, + 0x0308, 0x0300, 0x03b9, 0x0308, 0x0301, 0x03b9, 0x0342, 0x03b9, + 0x0308, 0x0342, 0x0399, 0x0306, 0x0399, 0x0304, 0x0399, 0x0300, + 0x0399, 0x0301, 0x1ffe, 0x0300, 0x1ffe, 0x0301, 0x1ffe, 0x0342, + 0x03c5, 0x0306, 0x03c5, 0x0304, 0x03c5, 0x0308, 0x0300, 0x03c5, + 0x0308, 0x0301, 0x03c1, 0x0313, 0x03c1, 0x0314, 0x03c5, 0x0342, + 0x03c5, 0x0308, 0x0342, 0x03a5, 0x0306, 0x03a5, 0x0304, 0x03a5, + 0x0300, 0x03a5, 0x0301, 0x03a1, 0x0314, 0x00a8, 0x0300, 0x00a8, + 0x0301, 0x0060, 0x03c9, 0x0345, 0x0300, 0x03c9, 0x0345, 0x03bf, + 0x0345, 0x0301, 0x03c9, 0x0342, 0x03c9, 0x0345, 0x0342, 0x039f, + 0x0300, 0x039f, 0x0301, 0x03a9, 0x0300, 0x03a9, 0x0301, 0x03a9, + 0x0345, 0x00b4, 0x304b, 0x3099, 0x304d, 0x3099, 0x304f, 0x3099, + 0x3051, 0x3099, 0x3053, 0x3099, 0x3055, 0x3099, 0x3057, 0x3099, + 0x3059, 0x3099, 0x305b, 0x3099, 0x305d, 0x3099, 0x305f, 0x3099, + 0x3061, 0x3099, 0x3064, 0x3099, 0x3066, 0x3099, 0x3068, 0x3099, + 0x306f, 0x3099, 0x306f, 0x309a, 0x3072, 0x3099, 0x3072, 0x309a, + 0x3075, 0x3099, 0x3075, 0x309a, 0x3078, 0x3099, 0x3078, 0x309a, + 0x307b, 0x3099, 0x307b, 0x309a, 0x3046, 0x3099, 0x309d, 0x3099, + 0x30ab, 0x3099, 0x30ad, 0x3099, 0x30af, 0x3099, 0x30b1, 0x3099, + 0x30b3, 0x3099, 0x30b5, 0x3099, 0x30b7, 0x3099, 0x30b9, 0x3099, + 0x30bb, 0x3099, 0x30bd, 0x3099, 0x30bf, 0x3099, 0x30c1, 0x3099, + 0x30c4, 0x3099, 0x30c6, 0x3099, 0x30c8, 0x3099, 0x30cf, 0x3099, + 0x30cf, 0x309a, 0x30d2, 0x3099, 0x30d2, 0x309a, 0x30d5, 0x3099, + 0x30d5, 0x309a, 0x30d8, 0x3099, 0x30d8, 0x309a, 0x30db, 0x3099, + 0x30db, 0x309a, 0x30a6, 0x3099, 0x30ef, 0x3099, 0x30f0, 0x3099, + 0x30f1, 0x3099, 0x30f2, 0x3099, 0x30fd, 0x3099, 0x05f2, 0x05b7, + 0x05e9, 0x05c1, 0x05e9, 0x05c2, 0x05e9, 0x05bc, 0x05c1, 0x05e9, + 0x05bc, 0x05c2, 0x05d0, 0x05b7, 0x05d0, 0x05b8, 0x05d0, 0x05bc, + 0x05d1, 0x05bc, 0x05d2, 0x05bc, 0x05d3, 0x05bc, 0x05d4, 0x05bc, + 0x05d5, 0x05bc, 0x05d6, 0x05bc, 0x05d8, 0x05bc, 0x05d9, 0x05bc, + 0x05da, 0x05bc, 0x05db, 0x05bc, 0x05dc, 0x05bc, 0x05de, 0x05bc, + 0x05e0, 0x05bc, 0x05e1, 0x05bc, 0x05e3, 0x05bc, 0x05e4, 0x05bc, + 0x05e6, 0x05bc, 0x05e7, 0x05bc, 0x05e8, 0x05bc, 0x05e9, 0x05bc, + 0x05ea, 0x05bc, 0x05d5, 0x05b9, 0x05d1, 0x05bf, 0x05db, 0x05bf, + 0x05e4, 0x05bf +}; + +u16 hfsplus_compose_table[] = { + /* base */ + 0x0000, 0x0050, 0x0300, 0x00a4, 0x0301, 0x00e4, 0x0302, 0x015c, + 0x0303, 0x0192, 0x0304, 0x01b4, 0x0306, 0x01e6, 0x0307, 0x0220, + 0x0308, 0x0270, 0x0309, 0x02d2, 0x030a, 0x02ec, 0x030b, 0x02fa, + 0x030c, 0x0308, 0x030d, 0x034c, 0x030f, 0x0370, 0x0311, 0x038e, + 0x0313, 0x03a8, 0x0314, 0x03c6, 0x031b, 0x03e8, 0x0323, 0x03f2, + 0x0324, 0x0440, 0x0325, 0x0446, 0x0327, 0x044c, 0x0328, 0x047a, + 0x032d, 0x0490, 0x032e, 0x04aa, 0x0330, 0x04b0, 0x0331, 
0x04be, + 0x0342, 0x04e2, 0x0345, 0x04f4, 0x05b7, 0x0504, 0x05b8, 0x050a, + 0x05b9, 0x050e, 0x05bc, 0x0512, 0x05bf, 0x0540, 0x05c1, 0x0548, + 0x05c2, 0x054c, 0x093c, 0x0550, 0x09bc, 0x0568, 0x09be, 0x0572, + 0x09d7, 0x0576, 0x0a3c, 0x057a, 0x0b3c, 0x0586, 0x0b3e, 0x058e, + 0x0b56, 0x0592, 0x0b57, 0x0596, 0x0bbe, 0x059a, 0x0bd7, 0x05a0, + 0x0c56, 0x05a6, 0x0cc2, 0x05aa, 0x0cd5, 0x05ae, 0x0cd6, 0x05b4, + 0x0d3e, 0x05b8, 0x0d57, 0x05be, 0x0e32, 0x05c2, 0x0eb2, 0x05c6, + 0x0f71, 0x05ca, 0x0f80, 0x05d2, 0x0fb5, 0x05d8, 0x0fb7, 0x05de, + 0x1100, 0x00a2, 0x1101, 0x00a2, 0x1102, 0x00a2, 0x1103, 0x00a2, + 0x1104, 0x00a2, 0x1105, 0x00a2, 0x1106, 0x00a2, 0x1107, 0x00a2, + 0x1108, 0x00a2, 0x1109, 0x00a2, 0x110a, 0x00a2, 0x110b, 0x00a2, + 0x110c, 0x00a2, 0x110d, 0x00a2, 0x110e, 0x00a2, 0x110f, 0x00a2, + 0x1110, 0x00a2, 0x1111, 0x00a2, 0x1112, 0x00a2, 0x3099, 0x05f4, + 0x309a, 0x0656, + /* hangul marker */ + 0xffff, 0x0000, + /* 0x0300 */ + 0x0340, 0x001f, 0x0041, 0x066c, 0x0045, 0x066e, 0x0049, 0x0670, + 0x004f, 0x0672, 0x0055, 0x0674, 0x0057, 0x0676, 0x0059, 0x0678, + 0x0061, 0x067a, 0x0065, 0x067c, 0x0069, 0x067e, 0x006f, 0x0680, + 0x0075, 0x0682, 0x0077, 0x0684, 0x0079, 0x0686, 0x00a8, 0x0688, + 0x0391, 0x068a, 0x0395, 0x068c, 0x0397, 0x068e, 0x0399, 0x0690, + 0x039f, 0x0692, 0x03a5, 0x0694, 0x03a9, 0x0696, 0x03b1, 0x0698, + 0x03b5, 0x069a, 0x03b7, 0x069c, 0x03b9, 0x069e, 0x03bf, 0x06a0, + 0x03c5, 0x06a2, 0x03c9, 0x06a4, 0x1fbf, 0x06a6, 0x1ffe, 0x06a8, + /* 0x0301 */ + 0x0341, 0x003b, 0x0041, 0x06aa, 0x0043, 0x06ac, 0x0045, 0x06ae, + 0x0047, 0x06b0, 0x0049, 0x06b2, 0x004b, 0x06b4, 0x004c, 0x06b6, + 0x004d, 0x06b8, 0x004e, 0x06ba, 0x004f, 0x06bc, 0x0050, 0x06be, + 0x0052, 0x06c0, 0x0053, 0x06c2, 0x0055, 0x06c6, 0x0057, 0x06c8, + 0x0059, 0x06ca, 0x005a, 0x06cc, 0x0061, 0x06ce, 0x0063, 0x06d0, + 0x0065, 0x06d2, 0x0067, 0x06d4, 0x0069, 0x06d6, 0x006b, 0x06d8, + 0x006c, 0x06da, 0x006d, 0x06dc, 0x006e, 0x06de, 0x006f, 0x06e0, + 0x0070, 0x06e2, 0x0072, 0x06e4, 0x0073, 0x06e6, 0x0075, 0x06ea, + 0x0077, 0x06ec, 0x0079, 0x06ee, 0x007a, 0x06f0, 0x00a8, 0x06f2, + 0x00c6, 0x06f4, 0x00d8, 0x06f6, 0x00e6, 0x06f8, 0x00f8, 0x06fa, + 0x0391, 0x06fc, 0x0395, 0x06fe, 0x0397, 0x0700, 0x0399, 0x0702, + 0x039f, 0x0704, 0x03a5, 0x0706, 0x03a9, 0x0708, 0x03b1, 0x070a, + 0x03b5, 0x070c, 0x03b7, 0x070e, 0x03b9, 0x0710, 0x03bf, 0x0712, + 0x03c5, 0x0714, 0x03c9, 0x0716, 0x0413, 0x0718, 0x041a, 0x071a, + 0x0433, 0x071c, 0x043a, 0x071e, 0x1fbf, 0x0720, 0x1ffe, 0x0722, + /* 0x0302 */ + 0x0000, 0x001a, 0x0041, 0x0724, 0x0043, 0x072e, 0x0045, 0x0730, + 0x0047, 0x073a, 0x0048, 0x073c, 0x0049, 0x073e, 0x004a, 0x0740, + 0x004f, 0x0742, 0x0053, 0x074c, 0x0055, 0x074e, 0x0057, 0x0750, + 0x0059, 0x0752, 0x005a, 0x0754, 0x0061, 0x0756, 0x0063, 0x0760, + 0x0065, 0x0762, 0x0067, 0x076c, 0x0068, 0x076e, 0x0069, 0x0770, + 0x006a, 0x0772, 0x006f, 0x0774, 0x0073, 0x077e, 0x0075, 0x0780, + 0x0077, 0x0782, 0x0079, 0x0784, 0x007a, 0x0786, + /* 0x0303 */ + 0x0000, 0x0010, 0x0041, 0x0788, 0x0045, 0x078a, 0x0049, 0x078c, + 0x004e, 0x078e, 0x004f, 0x0790, 0x0055, 0x0796, 0x0056, 0x079a, + 0x0059, 0x079c, 0x0061, 0x079e, 0x0065, 0x07a0, 0x0069, 0x07a2, + 0x006e, 0x07a4, 0x006f, 0x07a6, 0x0075, 0x07ac, 0x0076, 0x07b0, + 0x0079, 0x07b2, + /* 0x0304 */ + 0x0000, 0x0018, 0x0041, 0x07b4, 0x0045, 0x07b6, 0x0047, 0x07bc, + 0x0049, 0x07be, 0x004f, 0x07c0, 0x0055, 0x07c6, 0x0061, 0x07ca, + 0x0065, 0x07cc, 0x0067, 0x07d2, 0x0069, 0x07d4, 0x006f, 0x07d6, + 0x0075, 0x07dc, 0x00c6, 0x07e0, 0x00e6, 0x07e2, 0x0391, 0x07e4, + 0x0399, 0x07e6, 0x03a5, 0x07e8, 0x03b1, 
0x07ea, 0x03b9, 0x07ec, + 0x03c5, 0x07ee, 0x0418, 0x07f0, 0x0423, 0x07f2, 0x0438, 0x07f4, + 0x0443, 0x07f6, + /* 0x0306 */ + 0x0000, 0x001c, 0x0041, 0x07f8, 0x0045, 0x0802, 0x0047, 0x0804, + 0x0049, 0x0806, 0x004f, 0x0808, 0x0055, 0x080a, 0x0061, 0x080c, + 0x0065, 0x0816, 0x0067, 0x0818, 0x0069, 0x081a, 0x006f, 0x081c, + 0x0075, 0x081e, 0x0391, 0x0820, 0x0399, 0x0822, 0x03a5, 0x0824, + 0x03b1, 0x0826, 0x03b9, 0x0828, 0x03c5, 0x082a, 0x0410, 0x082c, + 0x0415, 0x082e, 0x0416, 0x0830, 0x0418, 0x0832, 0x0423, 0x0834, + 0x0430, 0x0836, 0x0435, 0x0838, 0x0436, 0x083a, 0x0438, 0x083c, + 0x0443, 0x083e, + /* 0x0307 */ + 0x0000, 0x0027, 0x0041, 0x0840, 0x0042, 0x0844, 0x0043, 0x0846, + 0x0044, 0x0848, 0x0045, 0x084a, 0x0046, 0x084c, 0x0047, 0x084e, + 0x0048, 0x0850, 0x0049, 0x0852, 0x004d, 0x0854, 0x004e, 0x0856, + 0x0050, 0x0858, 0x0052, 0x085a, 0x0053, 0x085c, 0x0054, 0x085e, + 0x0057, 0x0860, 0x0058, 0x0862, 0x0059, 0x0864, 0x005a, 0x0866, + 0x0061, 0x0868, 0x0062, 0x086c, 0x0063, 0x086e, 0x0064, 0x0870, + 0x0065, 0x0872, 0x0066, 0x0874, 0x0067, 0x0876, 0x0068, 0x0878, + 0x006d, 0x087a, 0x006e, 0x087c, 0x0070, 0x087e, 0x0072, 0x0880, + 0x0073, 0x0882, 0x0074, 0x0884, 0x0077, 0x0886, 0x0078, 0x0888, + 0x0079, 0x088a, 0x007a, 0x088c, 0x017f, 0x088e, 0x0306, 0x0890, + /* 0x0308 */ + 0x0000, 0x0030, 0x0041, 0x0892, 0x0045, 0x0896, 0x0048, 0x0898, + 0x0049, 0x089a, 0x004f, 0x089e, 0x0055, 0x08a0, 0x0057, 0x08aa, + 0x0058, 0x08ac, 0x0059, 0x08ae, 0x0061, 0x08b0, 0x0065, 0x08b4, + 0x0068, 0x08b6, 0x0069, 0x08b8, 0x006f, 0x08bc, 0x0074, 0x08be, + 0x0075, 0x08c0, 0x0077, 0x08ca, 0x0078, 0x08cc, 0x0079, 0x08ce, + 0x018f, 0x08d0, 0x019f, 0x08d2, 0x0259, 0x08d4, 0x0275, 0x08d6, + 0x0399, 0x08d8, 0x03a5, 0x08da, 0x03b9, 0x08dc, 0x03c5, 0x08e6, + 0x03d2, 0x08f0, 0x0406, 0x08f2, 0x0410, 0x08f4, 0x0415, 0x08f6, + 0x0416, 0x08f8, 0x0417, 0x08fa, 0x0418, 0x08fc, 0x041e, 0x08fe, + 0x0423, 0x0900, 0x0427, 0x0902, 0x042b, 0x0904, 0x0430, 0x0906, + 0x0435, 0x0908, 0x0436, 0x090a, 0x0437, 0x090c, 0x0438, 0x090e, + 0x043e, 0x0910, 0x0443, 0x0912, 0x0447, 0x0914, 0x044b, 0x0916, + 0x0456, 0x0918, + /* 0x0309 */ + 0x0000, 0x000c, 0x0041, 0x091a, 0x0045, 0x091c, 0x0049, 0x091e, + 0x004f, 0x0920, 0x0055, 0x0922, 0x0059, 0x0924, 0x0061, 0x0926, + 0x0065, 0x0928, 0x0069, 0x092a, 0x006f, 0x092c, 0x0075, 0x092e, + 0x0079, 0x0930, + /* 0x030a */ + 0x0000, 0x0006, 0x0041, 0x0932, 0x0055, 0x0936, 0x0061, 0x0938, + 0x0075, 0x093c, 0x0077, 0x093e, 0x0079, 0x0940, + /* 0x030b */ + 0x0000, 0x0006, 0x004f, 0x0942, 0x0055, 0x0944, 0x006f, 0x0946, + 0x0075, 0x0948, 0x0423, 0x094a, 0x0443, 0x094c, + /* 0x030c */ + 0x0000, 0x0021, 0x0041, 0x094e, 0x0043, 0x0950, 0x0044, 0x0952, + 0x0045, 0x0954, 0x0047, 0x0956, 0x0049, 0x0958, 0x004b, 0x095a, + 0x004c, 0x095c, 0x004e, 0x095e, 0x004f, 0x0960, 0x0052, 0x0962, + 0x0053, 0x0964, 0x0054, 0x0968, 0x0055, 0x096a, 0x005a, 0x096c, + 0x0061, 0x096e, 0x0063, 0x0970, 0x0064, 0x0972, 0x0065, 0x0974, + 0x0067, 0x0976, 0x0069, 0x0978, 0x006a, 0x097a, 0x006b, 0x097c, + 0x006c, 0x097e, 0x006e, 0x0980, 0x006f, 0x0982, 0x0072, 0x0984, + 0x0073, 0x0986, 0x0074, 0x098a, 0x0075, 0x098c, 0x007a, 0x098e, + 0x01b7, 0x0990, 0x0292, 0x0992, + /* 0x030d */ + 0x0000, 0x0011, 0x00a8, 0x0994, 0x0308, 0x0996, 0x0391, 0x0998, + 0x0395, 0x099a, 0x0397, 0x099c, 0x0399, 0x099e, 0x039f, 0x09a0, + 0x03a5, 0x09a2, 0x03a9, 0x09a4, 0x03b1, 0x09a6, 0x03b5, 0x09a8, + 0x03b7, 0x09aa, 0x03b9, 0x09ac, 0x03bf, 0x09ae, 0x03c5, 0x09b0, + 0x03c9, 0x09b2, 0x03d2, 0x09b4, + /* 0x030f */ + 0x0000, 0x000e, 0x0041, 0x09b6, 0x0045, 0x09b8, 
0x0049, 0x09ba, + 0x004f, 0x09bc, 0x0052, 0x09be, 0x0055, 0x09c0, 0x0061, 0x09c2, + 0x0065, 0x09c4, 0x0069, 0x09c6, 0x006f, 0x09c8, 0x0072, 0x09ca, + 0x0075, 0x09cc, 0x0474, 0x09ce, 0x0475, 0x09d0, + /* 0x0311 */ + 0x0000, 0x000c, 0x0041, 0x09d2, 0x0045, 0x09d4, 0x0049, 0x09d6, + 0x004f, 0x09d8, 0x0052, 0x09da, 0x0055, 0x09dc, 0x0061, 0x09de, + 0x0065, 0x09e0, 0x0069, 0x09e2, 0x006f, 0x09e4, 0x0072, 0x09e6, + 0x0075, 0x09e8, + /* 0x0313 */ + 0x0343, 0x000e, 0x0391, 0x09ea, 0x0395, 0x09f2, 0x0397, 0x09f8, + 0x0399, 0x0a00, 0x039f, 0x0a08, 0x03a9, 0x0a0e, 0x03b1, 0x0a16, + 0x03b5, 0x0a1e, 0x03b7, 0x0a24, 0x03b9, 0x0a2c, 0x03bf, 0x0a34, + 0x03c1, 0x0a3a, 0x03c5, 0x0a3c, 0x03c9, 0x0a44, + /* 0x0314 */ + 0x0000, 0x0010, 0x0391, 0x0a4c, 0x0395, 0x0a54, 0x0397, 0x0a5a, + 0x0399, 0x0a62, 0x039f, 0x0a6a, 0x03a1, 0x0a70, 0x03a5, 0x0a72, + 0x03a9, 0x0a7a, 0x03b1, 0x0a82, 0x03b5, 0x0a8a, 0x03b7, 0x0a90, + 0x03b9, 0x0a98, 0x03bf, 0x0aa0, 0x03c1, 0x0aa6, 0x03c5, 0x0aa8, + 0x03c9, 0x0ab0, + /* 0x031b */ + 0x0000, 0x0004, 0x004f, 0x0ab8, 0x0055, 0x0ac4, 0x006f, 0x0ad0, + 0x0075, 0x0adc, + /* 0x0323 */ + 0x0000, 0x0026, 0x0041, 0x0ae8, 0x0042, 0x0aee, 0x0044, 0x0af0, + 0x0045, 0x0af2, 0x0048, 0x0af6, 0x0049, 0x0af8, 0x004b, 0x0afa, + 0x004c, 0x0afc, 0x004d, 0x0b00, 0x004e, 0x0b02, 0x004f, 0x0b04, + 0x0052, 0x0b08, 0x0053, 0x0b0c, 0x0054, 0x0b10, 0x0055, 0x0b12, + 0x0056, 0x0b14, 0x0057, 0x0b16, 0x0059, 0x0b18, 0x005a, 0x0b1a, + 0x0061, 0x0b1c, 0x0062, 0x0b22, 0x0064, 0x0b24, 0x0065, 0x0b26, + 0x0068, 0x0b2a, 0x0069, 0x0b2c, 0x006b, 0x0b2e, 0x006c, 0x0b30, + 0x006d, 0x0b34, 0x006e, 0x0b36, 0x006f, 0x0b38, 0x0072, 0x0b3c, + 0x0073, 0x0b40, 0x0074, 0x0b44, 0x0075, 0x0b46, 0x0076, 0x0b48, + 0x0077, 0x0b4a, 0x0079, 0x0b4c, 0x007a, 0x0b4e, + /* 0x0324 */ + 0x0000, 0x0002, 0x0055, 0x0b50, 0x0075, 0x0b52, + /* 0x0325 */ + 0x0000, 0x0002, 0x0041, 0x0b54, 0x0061, 0x0b56, + /* 0x0327 */ + 0x0000, 0x0016, 0x0043, 0x0b58, 0x0044, 0x0b5c, 0x0045, 0x0b5e, + 0x0047, 0x0b62, 0x0048, 0x0b64, 0x004b, 0x0b66, 0x004c, 0x0b68, + 0x004e, 0x0b6a, 0x0052, 0x0b6c, 0x0053, 0x0b6e, 0x0054, 0x0b70, + 0x0063, 0x0b72, 0x0064, 0x0b76, 0x0065, 0x0b78, 0x0067, 0x0b7c, + 0x0068, 0x0b7e, 0x006b, 0x0b80, 0x006c, 0x0b82, 0x006e, 0x0b84, + 0x0072, 0x0b86, 0x0073, 0x0b88, 0x0074, 0x0b8a, + /* 0x0328 */ + 0x0000, 0x000a, 0x0041, 0x0b8c, 0x0045, 0x0b8e, 0x0049, 0x0b90, + 0x004f, 0x0b92, 0x0055, 0x0b96, 0x0061, 0x0b98, 0x0065, 0x0b9a, + 0x0069, 0x0b9c, 0x006f, 0x0b9e, 0x0075, 0x0ba2, + /* 0x032d */ + 0x0000, 0x000c, 0x0044, 0x0ba4, 0x0045, 0x0ba6, 0x004c, 0x0ba8, + 0x004e, 0x0baa, 0x0054, 0x0bac, 0x0055, 0x0bae, 0x0064, 0x0bb0, + 0x0065, 0x0bb2, 0x006c, 0x0bb4, 0x006e, 0x0bb6, 0x0074, 0x0bb8, + 0x0075, 0x0bba, + /* 0x032e */ + 0x0000, 0x0002, 0x0048, 0x0bbc, 0x0068, 0x0bbe, + /* 0x0330 */ + 0x0000, 0x0006, 0x0045, 0x0bc0, 0x0049, 0x0bc2, 0x0055, 0x0bc4, + 0x0065, 0x0bc6, 0x0069, 0x0bc8, 0x0075, 0x0bca, + /* 0x0331 */ + 0x0000, 0x0011, 0x0042, 0x0bcc, 0x0044, 0x0bce, 0x004b, 0x0bd0, + 0x004c, 0x0bd2, 0x004e, 0x0bd4, 0x0052, 0x0bd6, 0x0054, 0x0bd8, + 0x005a, 0x0bda, 0x0062, 0x0bdc, 0x0064, 0x0bde, 0x0068, 0x0be0, + 0x006b, 0x0be2, 0x006c, 0x0be4, 0x006e, 0x0be6, 0x0072, 0x0be8, + 0x0074, 0x0bea, 0x007a, 0x0bec, + /* 0x0342 */ + 0x0000, 0x0008, 0x00a8, 0x0bee, 0x03b1, 0x0bf0, 0x03b7, 0x0bf2, + 0x03b9, 0x0bf4, 0x03c5, 0x0bf6, 0x03c9, 0x0bf8, 0x1fbf, 0x0bfa, + 0x1ffe, 0x0bfc, + /* 0x0345 */ + 0x0000, 0x0007, 0x0391, 0x0bfe, 0x0397, 0x0c04, 0x03a9, 0x0c0a, + 0x03b1, 0x0c10, 0x03b7, 0x0c1c, 0x03bf, 0x0c28, 0x03c9, 0x0c2c, + /* 0x05b7 */ + 0x0000, 0x0002, 
0x05d0, 0x0c36, 0x05f2, 0x0c38, + /* 0x05b8 */ + 0x0000, 0x0001, 0x05d0, 0x0c3a, + /* 0x05b9 */ + 0x0000, 0x0001, 0x05d5, 0x0c3c, + /* 0x05bc */ + 0x0000, 0x0016, 0x05d0, 0x0c3e, 0x05d1, 0x0c40, 0x05d2, 0x0c42, + 0x05d3, 0x0c44, 0x05d4, 0x0c46, 0x05d5, 0x0c48, 0x05d6, 0x0c4a, + 0x05d8, 0x0c4c, 0x05d9, 0x0c4e, 0x05da, 0x0c50, 0x05db, 0x0c52, + 0x05dc, 0x0c54, 0x05de, 0x0c56, 0x05e0, 0x0c58, 0x05e1, 0x0c5a, + 0x05e3, 0x0c5c, 0x05e4, 0x0c5e, 0x05e6, 0x0c60, 0x05e7, 0x0c62, + 0x05e8, 0x0c64, 0x05e9, 0x0c66, 0x05ea, 0x0c6c, + /* 0x05bf */ + 0x0000, 0x0003, 0x05d1, 0x0c6e, 0x05db, 0x0c70, 0x05e4, 0x0c72, + /* 0x05c1 */ + 0x0000, 0x0001, 0x05e9, 0x0c74, + /* 0x05c2 */ + 0x0000, 0x0001, 0x05e9, 0x0c76, + /* 0x093c */ + 0x0000, 0x000b, 0x0915, 0x0c78, 0x0916, 0x0c7a, 0x0917, 0x0c7c, + 0x091c, 0x0c7e, 0x0921, 0x0c80, 0x0922, 0x0c82, 0x0928, 0x0c84, + 0x092b, 0x0c86, 0x092f, 0x0c88, 0x0930, 0x0c8a, 0x0933, 0x0c8c, + /* 0x09bc */ + 0x0000, 0x0004, 0x09a1, 0x0c8e, 0x09a2, 0x0c90, 0x09ac, 0x0c92, + 0x09af, 0x0c94, + /* 0x09be */ + 0x0000, 0x0001, 0x09c7, 0x0c96, + /* 0x09d7 */ + 0x0000, 0x0001, 0x09c7, 0x0c98, + /* 0x0a3c */ + 0x0000, 0x0005, 0x0a16, 0x0c9a, 0x0a17, 0x0c9c, 0x0a1c, 0x0c9e, + 0x0a21, 0x0ca0, 0x0a2b, 0x0ca2, + /* 0x0b3c */ + 0x0000, 0x0003, 0x0b21, 0x0ca4, 0x0b22, 0x0ca6, 0x0b2f, 0x0ca8, + /* 0x0b3e */ + 0x0000, 0x0001, 0x0b47, 0x0caa, + /* 0x0b56 */ + 0x0000, 0x0001, 0x0b47, 0x0cac, + /* 0x0b57 */ + 0x0000, 0x0001, 0x0b47, 0x0cae, + /* 0x0bbe */ + 0x0000, 0x0002, 0x0bc6, 0x0cb0, 0x0bc7, 0x0cb2, + /* 0x0bd7 */ + 0x0000, 0x0002, 0x0b92, 0x0cb4, 0x0bc6, 0x0cb6, + /* 0x0c56 */ + 0x0000, 0x0001, 0x0c46, 0x0cb8, + /* 0x0cc2 */ + 0x0000, 0x0001, 0x0cc6, 0x0cba, + /* 0x0cd5 */ + 0x0000, 0x0002, 0x0cbf, 0x0cbe, 0x0cc6, 0x0cc0, + /* 0x0cd6 */ + 0x0000, 0x0001, 0x0cc6, 0x0cc2, + /* 0x0d3e */ + 0x0000, 0x0002, 0x0d46, 0x0cc4, 0x0d47, 0x0cc6, + /* 0x0d57 */ + 0x0000, 0x0001, 0x0d46, 0x0cc8, + /* 0x0e32 */ + 0x0000, 0x0001, 0x0e4d, 0x0cca, + /* 0x0eb2 */ + 0x0000, 0x0001, 0x0ecd, 0x0ccc, + /* 0x0f71 */ + 0x0000, 0x0003, 0x0f72, 0x0cce, 0x0f74, 0x0cd0, 0x0f80, 0x0cd2, + /* 0x0f80 */ + 0x0000, 0x0002, 0x0fb2, 0x0cd4, 0x0fb3, 0x0cd8, + /* 0x0fb5 */ + 0x0000, 0x0002, 0x0f40, 0x0cdc, 0x0f90, 0x0cde, + /* 0x0fb7 */ + 0x0000, 0x000a, 0x0f42, 0x0ce0, 0x0f4c, 0x0ce2, 0x0f51, 0x0ce4, + 0x0f56, 0x0ce6, 0x0f5b, 0x0ce8, 0x0f92, 0x0cea, 0x0f9c, 0x0cec, + 0x0fa1, 0x0cee, 0x0fa6, 0x0cf0, 0x0fab, 0x0cf2, + /* 0x3099 */ + 0x0000, 0x0030, 0x3046, 0x0cf4, 0x304b, 0x0cf6, 0x304d, 0x0cf8, + 0x304f, 0x0cfa, 0x3051, 0x0cfc, 0x3053, 0x0cfe, 0x3055, 0x0d00, + 0x3057, 0x0d02, 0x3059, 0x0d04, 0x305b, 0x0d06, 0x305d, 0x0d08, + 0x305f, 0x0d0a, 0x3061, 0x0d0c, 0x3064, 0x0d0e, 0x3066, 0x0d10, + 0x3068, 0x0d12, 0x306f, 0x0d14, 0x3072, 0x0d16, 0x3075, 0x0d18, + 0x3078, 0x0d1a, 0x307b, 0x0d1c, 0x309d, 0x0d1e, 0x30a6, 0x0d20, + 0x30ab, 0x0d22, 0x30ad, 0x0d24, 0x30af, 0x0d26, 0x30b1, 0x0d28, + 0x30b3, 0x0d2a, 0x30b5, 0x0d2c, 0x30b7, 0x0d2e, 0x30b9, 0x0d30, + 0x30bb, 0x0d32, 0x30bd, 0x0d34, 0x30bf, 0x0d36, 0x30c1, 0x0d38, + 0x30c4, 0x0d3a, 0x30c6, 0x0d3c, 0x30c8, 0x0d3e, 0x30cf, 0x0d40, + 0x30d2, 0x0d42, 0x30d5, 0x0d44, 0x30d8, 0x0d46, 0x30db, 0x0d48, + 0x30ef, 0x0d4a, 0x30f0, 0x0d4c, 0x30f1, 0x0d4e, 0x30f2, 0x0d50, + 0x30fd, 0x0d52, + /* 0x309a */ + 0x0000, 0x000a, 0x306f, 0x0d54, 0x3072, 0x0d56, 0x3075, 0x0d58, + 0x3078, 0x0d5a, 0x307b, 0x0d5c, 0x30cf, 0x0d5e, 0x30d2, 0x0d60, + 0x30d5, 0x0d62, 0x30d8, 0x0d64, 0x30db, 0x0d66, + /* 0x0041 0x0300 */ + 0x00c0, 0x0000, + /* 0x0045 0x0300 */ + 0x00c8, 0x0000, + /* 0x0049 0x0300 */ + 0x00cc, 
0x0000, + /* 0x004f 0x0300 */ + 0x00d2, 0x0000, + /* 0x0055 0x0300 */ + 0x00d9, 0x0000, + /* 0x0057 0x0300 */ + 0x1e80, 0x0000, + /* 0x0059 0x0300 */ + 0x1ef2, 0x0000, + /* 0x0061 0x0300 */ + 0x00e0, 0x0000, + /* 0x0065 0x0300 */ + 0x00e8, 0x0000, + /* 0x0069 0x0300 */ + 0x00ec, 0x0000, + /* 0x006f 0x0300 */ + 0x00f2, 0x0000, + /* 0x0075 0x0300 */ + 0x00f9, 0x0000, + /* 0x0077 0x0300 */ + 0x1e81, 0x0000, + /* 0x0079 0x0300 */ + 0x1ef3, 0x0000, + /* 0x00a8 0x0300 */ + 0x1fed, 0x0000, + /* 0x0391 0x0300 */ + 0x1fba, 0x0000, + /* 0x0395 0x0300 */ + 0x1fc8, 0x0000, + /* 0x0397 0x0300 */ + 0x1fca, 0x0000, + /* 0x0399 0x0300 */ + 0x1fda, 0x0000, + /* 0x039f 0x0300 */ + 0x1ff8, 0x0000, + /* 0x03a5 0x0300 */ + 0x1fea, 0x0000, + /* 0x03a9 0x0300 */ + 0x1ffa, 0x0000, + /* 0x03b1 0x0300 */ + 0x1f70, 0x0000, + /* 0x03b5 0x0300 */ + 0x1f72, 0x0000, + /* 0x03b7 0x0300 */ + 0x1f74, 0x0000, + /* 0x03b9 0x0300 */ + 0x1f76, 0x0000, + /* 0x03bf 0x0300 */ + 0x1f78, 0x0000, + /* 0x03c5 0x0300 */ + 0x1f7a, 0x0000, + /* 0x03c9 0x0300 */ + 0x1f7c, 0x0000, + /* 0x1fbf 0x0300 */ + 0x1fcd, 0x0000, + /* 0x1ffe 0x0300 */ + 0x1fdd, 0x0000, + /* 0x0041 0x0301 */ + 0x00c1, 0x0000, + /* 0x0043 0x0301 */ + 0x0106, 0x0000, + /* 0x0045 0x0301 */ + 0x00c9, 0x0000, + /* 0x0047 0x0301 */ + 0x01f4, 0x0000, + /* 0x0049 0x0301 */ + 0x00cd, 0x0000, + /* 0x004b 0x0301 */ + 0x1e30, 0x0000, + /* 0x004c 0x0301 */ + 0x0139, 0x0000, + /* 0x004d 0x0301 */ + 0x1e3e, 0x0000, + /* 0x004e 0x0301 */ + 0x0143, 0x0000, + /* 0x004f 0x0301 */ + 0x00d3, 0x0000, + /* 0x0050 0x0301 */ + 0x1e54, 0x0000, + /* 0x0052 0x0301 */ + 0x0154, 0x0000, + /* 0x0053 0x0301 */ + 0x015a, 0x0001, 0x0307, 0x0d68, + /* 0x0055 0x0301 */ + 0x00da, 0x0000, + /* 0x0057 0x0301 */ + 0x1e82, 0x0000, + /* 0x0059 0x0301 */ + 0x00dd, 0x0000, + /* 0x005a 0x0301 */ + 0x0179, 0x0000, + /* 0x0061 0x0301 */ + 0x00e1, 0x0000, + /* 0x0063 0x0301 */ + 0x0107, 0x0000, + /* 0x0065 0x0301 */ + 0x00e9, 0x0000, + /* 0x0067 0x0301 */ + 0x01f5, 0x0000, + /* 0x0069 0x0301 */ + 0x00ed, 0x0000, + /* 0x006b 0x0301 */ + 0x1e31, 0x0000, + /* 0x006c 0x0301 */ + 0x013a, 0x0000, + /* 0x006d 0x0301 */ + 0x1e3f, 0x0000, + /* 0x006e 0x0301 */ + 0x0144, 0x0000, + /* 0x006f 0x0301 */ + 0x00f3, 0x0000, + /* 0x0070 0x0301 */ + 0x1e55, 0x0000, + /* 0x0072 0x0301 */ + 0x0155, 0x0000, + /* 0x0073 0x0301 */ + 0x015b, 0x0001, 0x0307, 0x0d6a, + /* 0x0075 0x0301 */ + 0x00fa, 0x0000, + /* 0x0077 0x0301 */ + 0x1e83, 0x0000, + /* 0x0079 0x0301 */ + 0x00fd, 0x0000, + /* 0x007a 0x0301 */ + 0x017a, 0x0000, + /* 0x00a8 0x0301 */ + 0x1fee, 0x0000, + /* 0x00c6 0x0301 */ + 0x01fc, 0x0000, + /* 0x00d8 0x0301 */ + 0x01fe, 0x0000, + /* 0x00e6 0x0301 */ + 0x01fd, 0x0000, + /* 0x00f8 0x0301 */ + 0x01ff, 0x0000, + /* 0x0391 0x0301 */ + 0x1fbb, 0x0000, + /* 0x0395 0x0301 */ + 0x1fc9, 0x0000, + /* 0x0397 0x0301 */ + 0x1fcb, 0x0000, + /* 0x0399 0x0301 */ + 0x1fdb, 0x0000, + /* 0x039f 0x0301 */ + 0x1ff9, 0x0000, + /* 0x03a5 0x0301 */ + 0x1feb, 0x0000, + /* 0x03a9 0x0301 */ + 0x1ffb, 0x0000, + /* 0x03b1 0x0301 */ + 0x1f71, 0x0000, + /* 0x03b5 0x0301 */ + 0x1f73, 0x0000, + /* 0x03b7 0x0301 */ + 0x1f75, 0x0000, + /* 0x03b9 0x0301 */ + 0x1f77, 0x0000, + /* 0x03bf 0x0301 */ + 0x1f79, 0x0000, + /* 0x03c5 0x0301 */ + 0x1f7b, 0x0000, + /* 0x03c9 0x0301 */ + 0x1f7d, 0x0000, + /* 0x0413 0x0301 */ + 0x0403, 0x0000, + /* 0x041a 0x0301 */ + 0x040c, 0x0000, + /* 0x0433 0x0301 */ + 0x0453, 0x0000, + /* 0x043a 0x0301 */ + 0x045c, 0x0000, + /* 0x1fbf 0x0301 */ + 0x1fce, 0x0000, + /* 0x1ffe 0x0301 */ + 0x1fde, 0x0000, + /* 0x0041 0x0302 */ + 0x00c2, 
0x0004, 0x0300, 0x0d6c, 0x0301, 0x0d6e, 0x0303, 0x0d70, + 0x0309, 0x0d72, + /* 0x0043 0x0302 */ + 0x0108, 0x0000, + /* 0x0045 0x0302 */ + 0x00ca, 0x0004, 0x0300, 0x0d74, 0x0301, 0x0d76, 0x0303, 0x0d78, + 0x0309, 0x0d7a, + /* 0x0047 0x0302 */ + 0x011c, 0x0000, + /* 0x0048 0x0302 */ + 0x0124, 0x0000, + /* 0x0049 0x0302 */ + 0x00ce, 0x0000, + /* 0x004a 0x0302 */ + 0x0134, 0x0000, + /* 0x004f 0x0302 */ + 0x00d4, 0x0004, 0x0300, 0x0d7c, 0x0301, 0x0d7e, 0x0303, 0x0d80, + 0x0309, 0x0d82, + /* 0x0053 0x0302 */ + 0x015c, 0x0000, + /* 0x0055 0x0302 */ + 0x00db, 0x0000, + /* 0x0057 0x0302 */ + 0x0174, 0x0000, + /* 0x0059 0x0302 */ + 0x0176, 0x0000, + /* 0x005a 0x0302 */ + 0x1e90, 0x0000, + /* 0x0061 0x0302 */ + 0x00e2, 0x0004, 0x0300, 0x0d84, 0x0301, 0x0d86, 0x0303, 0x0d88, + 0x0309, 0x0d8a, + /* 0x0063 0x0302 */ + 0x0109, 0x0000, + /* 0x0065 0x0302 */ + 0x00ea, 0x0004, 0x0300, 0x0d8c, 0x0301, 0x0d8e, 0x0303, 0x0d90, + 0x0309, 0x0d92, + /* 0x0067 0x0302 */ + 0x011d, 0x0000, + /* 0x0068 0x0302 */ + 0x0125, 0x0000, + /* 0x0069 0x0302 */ + 0x00ee, 0x0000, + /* 0x006a 0x0302 */ + 0x0135, 0x0000, + /* 0x006f 0x0302 */ + 0x00f4, 0x0004, 0x0300, 0x0d94, 0x0301, 0x0d96, 0x0303, 0x0d98, + 0x0309, 0x0d9a, + /* 0x0073 0x0302 */ + 0x015d, 0x0000, + /* 0x0075 0x0302 */ + 0x00fb, 0x0000, + /* 0x0077 0x0302 */ + 0x0175, 0x0000, + /* 0x0079 0x0302 */ + 0x0177, 0x0000, + /* 0x007a 0x0302 */ + 0x1e91, 0x0000, + /* 0x0041 0x0303 */ + 0x00c3, 0x0000, + /* 0x0045 0x0303 */ + 0x1ebc, 0x0000, + /* 0x0049 0x0303 */ + 0x0128, 0x0000, + /* 0x004e 0x0303 */ + 0x00d1, 0x0000, + /* 0x004f 0x0303 */ + 0x00d5, 0x0002, 0x0301, 0x0d9c, 0x0308, 0x0d9e, + /* 0x0055 0x0303 */ + 0x0168, 0x0001, 0x0301, 0x0da0, + /* 0x0056 0x0303 */ + 0x1e7c, 0x0000, + /* 0x0059 0x0303 */ + 0x1ef8, 0x0000, + /* 0x0061 0x0303 */ + 0x00e3, 0x0000, + /* 0x0065 0x0303 */ + 0x1ebd, 0x0000, + /* 0x0069 0x0303 */ + 0x0129, 0x0000, + /* 0x006e 0x0303 */ + 0x00f1, 0x0000, + /* 0x006f 0x0303 */ + 0x00f5, 0x0002, 0x0301, 0x0da2, 0x0308, 0x0da4, + /* 0x0075 0x0303 */ + 0x0169, 0x0001, 0x0301, 0x0da6, + /* 0x0076 0x0303 */ + 0x1e7d, 0x0000, + /* 0x0079 0x0303 */ + 0x1ef9, 0x0000, + /* 0x0041 0x0304 */ + 0x0100, 0x0000, + /* 0x0045 0x0304 */ + 0x0112, 0x0002, 0x0300, 0x0da8, 0x0301, 0x0daa, + /* 0x0047 0x0304 */ + 0x1e20, 0x0000, + /* 0x0049 0x0304 */ + 0x012a, 0x0000, + /* 0x004f 0x0304 */ + 0x014c, 0x0002, 0x0300, 0x0dac, 0x0301, 0x0dae, + /* 0x0055 0x0304 */ + 0x016a, 0x0001, 0x0308, 0x0db0, + /* 0x0061 0x0304 */ + 0x0101, 0x0000, + /* 0x0065 0x0304 */ + 0x0113, 0x0002, 0x0300, 0x0db2, 0x0301, 0x0db4, + /* 0x0067 0x0304 */ + 0x1e21, 0x0000, + /* 0x0069 0x0304 */ + 0x012b, 0x0000, + /* 0x006f 0x0304 */ + 0x014d, 0x0002, 0x0300, 0x0db6, 0x0301, 0x0db8, + /* 0x0075 0x0304 */ + 0x016b, 0x0001, 0x0308, 0x0dba, + /* 0x00c6 0x0304 */ + 0x01e2, 0x0000, + /* 0x00e6 0x0304 */ + 0x01e3, 0x0000, + /* 0x0391 0x0304 */ + 0x1fb9, 0x0000, + /* 0x0399 0x0304 */ + 0x1fd9, 0x0000, + /* 0x03a5 0x0304 */ + 0x1fe9, 0x0000, + /* 0x03b1 0x0304 */ + 0x1fb1, 0x0000, + /* 0x03b9 0x0304 */ + 0x1fd1, 0x0000, + /* 0x03c5 0x0304 */ + 0x1fe1, 0x0000, + /* 0x0418 0x0304 */ + 0x04e2, 0x0000, + /* 0x0423 0x0304 */ + 0x04ee, 0x0000, + /* 0x0438 0x0304 */ + 0x04e3, 0x0000, + /* 0x0443 0x0304 */ + 0x04ef, 0x0000, + /* 0x0041 0x0306 */ + 0x0102, 0x0004, 0x0300, 0x0dbc, 0x0301, 0x0dbe, 0x0303, 0x0dc0, + 0x0309, 0x0dc2, + /* 0x0045 0x0306 */ + 0x0114, 0x0000, + /* 0x0047 0x0306 */ + 0x011e, 0x0000, + /* 0x0049 0x0306 */ + 0x012c, 0x0000, + /* 0x004f 0x0306 */ + 0x014e, 0x0000, + /* 0x0055 0x0306 */ + 
0x016c, 0x0000, + /* 0x0061 0x0306 */ + 0x0103, 0x0004, 0x0300, 0x0dc4, 0x0301, 0x0dc6, 0x0303, 0x0dc8, + 0x0309, 0x0dca, + /* 0x0065 0x0306 */ + 0x0115, 0x0000, + /* 0x0067 0x0306 */ + 0x011f, 0x0000, + /* 0x0069 0x0306 */ + 0x012d, 0x0000, + /* 0x006f 0x0306 */ + 0x014f, 0x0000, + /* 0x0075 0x0306 */ + 0x016d, 0x0000, + /* 0x0391 0x0306 */ + 0x1fb8, 0x0000, + /* 0x0399 0x0306 */ + 0x1fd8, 0x0000, + /* 0x03a5 0x0306 */ + 0x1fe8, 0x0000, + /* 0x03b1 0x0306 */ + 0x1fb0, 0x0000, + /* 0x03b9 0x0306 */ + 0x1fd0, 0x0000, + /* 0x03c5 0x0306 */ + 0x1fe0, 0x0000, + /* 0x0410 0x0306 */ + 0x04d0, 0x0000, + /* 0x0415 0x0306 */ + 0x04d6, 0x0000, + /* 0x0416 0x0306 */ + 0x04c1, 0x0000, + /* 0x0418 0x0306 */ + 0x0419, 0x0000, + /* 0x0423 0x0306 */ + 0x040e, 0x0000, + /* 0x0430 0x0306 */ + 0x04d1, 0x0000, + /* 0x0435 0x0306 */ + 0x04d7, 0x0000, + /* 0x0436 0x0306 */ + 0x04c2, 0x0000, + /* 0x0438 0x0306 */ + 0x0439, 0x0000, + /* 0x0443 0x0306 */ + 0x045e, 0x0000, + /* 0x0041 0x0307 */ + 0x0000, 0x0001, 0x0304, 0x0dcc, + /* 0x0042 0x0307 */ + 0x1e02, 0x0000, + /* 0x0043 0x0307 */ + 0x010a, 0x0000, + /* 0x0044 0x0307 */ + 0x1e0a, 0x0000, + /* 0x0045 0x0307 */ + 0x0116, 0x0000, + /* 0x0046 0x0307 */ + 0x1e1e, 0x0000, + /* 0x0047 0x0307 */ + 0x0120, 0x0000, + /* 0x0048 0x0307 */ + 0x1e22, 0x0000, + /* 0x0049 0x0307 */ + 0x0130, 0x0000, + /* 0x004d 0x0307 */ + 0x1e40, 0x0000, + /* 0x004e 0x0307 */ + 0x1e44, 0x0000, + /* 0x0050 0x0307 */ + 0x1e56, 0x0000, + /* 0x0052 0x0307 */ + 0x1e58, 0x0000, + /* 0x0053 0x0307 */ + 0x1e60, 0x0000, + /* 0x0054 0x0307 */ + 0x1e6a, 0x0000, + /* 0x0057 0x0307 */ + 0x1e86, 0x0000, + /* 0x0058 0x0307 */ + 0x1e8a, 0x0000, + /* 0x0059 0x0307 */ + 0x1e8e, 0x0000, + /* 0x005a 0x0307 */ + 0x017b, 0x0000, + /* 0x0061 0x0307 */ + 0x0000, 0x0001, 0x0304, 0x0dce, + /* 0x0062 0x0307 */ + 0x1e03, 0x0000, + /* 0x0063 0x0307 */ + 0x010b, 0x0000, + /* 0x0064 0x0307 */ + 0x1e0b, 0x0000, + /* 0x0065 0x0307 */ + 0x0117, 0x0000, + /* 0x0066 0x0307 */ + 0x1e1f, 0x0000, + /* 0x0067 0x0307 */ + 0x0121, 0x0000, + /* 0x0068 0x0307 */ + 0x1e23, 0x0000, + /* 0x006d 0x0307 */ + 0x1e41, 0x0000, + /* 0x006e 0x0307 */ + 0x1e45, 0x0000, + /* 0x0070 0x0307 */ + 0x1e57, 0x0000, + /* 0x0072 0x0307 */ + 0x1e59, 0x0000, + /* 0x0073 0x0307 */ + 0x1e61, 0x0000, + /* 0x0074 0x0307 */ + 0x1e6b, 0x0000, + /* 0x0077 0x0307 */ + 0x1e87, 0x0000, + /* 0x0078 0x0307 */ + 0x1e8b, 0x0000, + /* 0x0079 0x0307 */ + 0x1e8f, 0x0000, + /* 0x007a 0x0307 */ + 0x017c, 0x0000, + /* 0x017f 0x0307 */ + 0x1e9b, 0x0000, + /* 0x0306 0x0307 */ + 0x0310, 0x0000, + /* 0x0041 0x0308 */ + 0x00c4, 0x0001, 0x0304, 0x0dd0, + /* 0x0045 0x0308 */ + 0x00cb, 0x0000, + /* 0x0048 0x0308 */ + 0x1e26, 0x0000, + /* 0x0049 0x0308 */ + 0x00cf, 0x0001, 0x0301, 0x0dd2, + /* 0x004f 0x0308 */ + 0x00d6, 0x0000, + /* 0x0055 0x0308 */ + 0x00dc, 0x0004, 0x0300, 0x0dd4, 0x0301, 0x0dd6, 0x0304, 0x0dd8, + 0x030c, 0x0dda, + /* 0x0057 0x0308 */ + 0x1e84, 0x0000, + /* 0x0058 0x0308 */ + 0x1e8c, 0x0000, + /* 0x0059 0x0308 */ + 0x0178, 0x0000, + /* 0x0061 0x0308 */ + 0x00e4, 0x0001, 0x0304, 0x0ddc, + /* 0x0065 0x0308 */ + 0x00eb, 0x0000, + /* 0x0068 0x0308 */ + 0x1e27, 0x0000, + /* 0x0069 0x0308 */ + 0x00ef, 0x0001, 0x0301, 0x0dde, + /* 0x006f 0x0308 */ + 0x00f6, 0x0000, + /* 0x0074 0x0308 */ + 0x1e97, 0x0000, + /* 0x0075 0x0308 */ + 0x00fc, 0x0004, 0x0300, 0x0de0, 0x0301, 0x0de2, 0x0304, 0x0de4, + 0x030c, 0x0de6, + /* 0x0077 0x0308 */ + 0x1e85, 0x0000, + /* 0x0078 0x0308 */ + 0x1e8d, 0x0000, + /* 0x0079 0x0308 */ + 0x00ff, 0x0000, + /* 0x018f 0x0308 */ + 0x04da, 0x0000, + /* 
0x019f 0x0308 */ + 0x04ea, 0x0000, + /* 0x0259 0x0308 */ + 0x04db, 0x0000, + /* 0x0275 0x0308 */ + 0x04eb, 0x0000, + /* 0x0399 0x0308 */ + 0x03aa, 0x0000, + /* 0x03a5 0x0308 */ + 0x03ab, 0x0000, + /* 0x03b9 0x0308 */ + 0x03ca, 0x0004, 0x0300, 0x0de8, 0x0301, 0x0dea, 0x030d, 0x0dec, + 0x0342, 0x0dee, + /* 0x03c5 0x0308 */ + 0x03cb, 0x0004, 0x0300, 0x0df0, 0x0301, 0x0df2, 0x030d, 0x0df4, + 0x0342, 0x0df6, + /* 0x03d2 0x0308 */ + 0x03d4, 0x0000, + /* 0x0406 0x0308 */ + 0x0407, 0x0000, + /* 0x0410 0x0308 */ + 0x04d2, 0x0000, + /* 0x0415 0x0308 */ + 0x0401, 0x0000, + /* 0x0416 0x0308 */ + 0x04dc, 0x0000, + /* 0x0417 0x0308 */ + 0x04de, 0x0000, + /* 0x0418 0x0308 */ + 0x04e4, 0x0000, + /* 0x041e 0x0308 */ + 0x04e6, 0x0000, + /* 0x0423 0x0308 */ + 0x04f0, 0x0000, + /* 0x0427 0x0308 */ + 0x04f4, 0x0000, + /* 0x042b 0x0308 */ + 0x04f8, 0x0000, + /* 0x0430 0x0308 */ + 0x04d3, 0x0000, + /* 0x0435 0x0308 */ + 0x0451, 0x0000, + /* 0x0436 0x0308 */ + 0x04dd, 0x0000, + /* 0x0437 0x0308 */ + 0x04df, 0x0000, + /* 0x0438 0x0308 */ + 0x04e5, 0x0000, + /* 0x043e 0x0308 */ + 0x04e7, 0x0000, + /* 0x0443 0x0308 */ + 0x04f1, 0x0000, + /* 0x0447 0x0308 */ + 0x04f5, 0x0000, + /* 0x044b 0x0308 */ + 0x04f9, 0x0000, + /* 0x0456 0x0308 */ + 0x0457, 0x0000, + /* 0x0041 0x0309 */ + 0x1ea2, 0x0000, + /* 0x0045 0x0309 */ + 0x1eba, 0x0000, + /* 0x0049 0x0309 */ + 0x1ec8, 0x0000, + /* 0x004f 0x0309 */ + 0x1ece, 0x0000, + /* 0x0055 0x0309 */ + 0x1ee6, 0x0000, + /* 0x0059 0x0309 */ + 0x1ef6, 0x0000, + /* 0x0061 0x0309 */ + 0x1ea3, 0x0000, + /* 0x0065 0x0309 */ + 0x1ebb, 0x0000, + /* 0x0069 0x0309 */ + 0x1ec9, 0x0000, + /* 0x006f 0x0309 */ + 0x1ecf, 0x0000, + /* 0x0075 0x0309 */ + 0x1ee7, 0x0000, + /* 0x0079 0x0309 */ + 0x1ef7, 0x0000, + /* 0x0041 0x030a */ + 0x00c5, 0x0001, 0x0301, 0x0df8, + /* 0x0055 0x030a */ + 0x016e, 0x0000, + /* 0x0061 0x030a */ + 0x00e5, 0x0001, 0x0301, 0x0dfa, + /* 0x0075 0x030a */ + 0x016f, 0x0000, + /* 0x0077 0x030a */ + 0x1e98, 0x0000, + /* 0x0079 0x030a */ + 0x1e99, 0x0000, + /* 0x004f 0x030b */ + 0x0150, 0x0000, + /* 0x0055 0x030b */ + 0x0170, 0x0000, + /* 0x006f 0x030b */ + 0x0151, 0x0000, + /* 0x0075 0x030b */ + 0x0171, 0x0000, + /* 0x0423 0x030b */ + 0x04f2, 0x0000, + /* 0x0443 0x030b */ + 0x04f3, 0x0000, + /* 0x0041 0x030c */ + 0x01cd, 0x0000, + /* 0x0043 0x030c */ + 0x010c, 0x0000, + /* 0x0044 0x030c */ + 0x010e, 0x0000, + /* 0x0045 0x030c */ + 0x011a, 0x0000, + /* 0x0047 0x030c */ + 0x01e6, 0x0000, + /* 0x0049 0x030c */ + 0x01cf, 0x0000, + /* 0x004b 0x030c */ + 0x01e8, 0x0000, + /* 0x004c 0x030c */ + 0x013d, 0x0000, + /* 0x004e 0x030c */ + 0x0147, 0x0000, + /* 0x004f 0x030c */ + 0x01d1, 0x0000, + /* 0x0052 0x030c */ + 0x0158, 0x0000, + /* 0x0053 0x030c */ + 0x0160, 0x0001, 0x0307, 0x0dfc, + /* 0x0054 0x030c */ + 0x0164, 0x0000, + /* 0x0055 0x030c */ + 0x01d3, 0x0000, + /* 0x005a 0x030c */ + 0x017d, 0x0000, + /* 0x0061 0x030c */ + 0x01ce, 0x0000, + /* 0x0063 0x030c */ + 0x010d, 0x0000, + /* 0x0064 0x030c */ + 0x010f, 0x0000, + /* 0x0065 0x030c */ + 0x011b, 0x0000, + /* 0x0067 0x030c */ + 0x01e7, 0x0000, + /* 0x0069 0x030c */ + 0x01d0, 0x0000, + /* 0x006a 0x030c */ + 0x01f0, 0x0000, + /* 0x006b 0x030c */ + 0x01e9, 0x0000, + /* 0x006c 0x030c */ + 0x013e, 0x0000, + /* 0x006e 0x030c */ + 0x0148, 0x0000, + /* 0x006f 0x030c */ + 0x01d2, 0x0000, + /* 0x0072 0x030c */ + 0x0159, 0x0000, + /* 0x0073 0x030c */ + 0x0161, 0x0001, 0x0307, 0x0dfe, + /* 0x0074 0x030c */ + 0x0165, 0x0000, + /* 0x0075 0x030c */ + 0x01d4, 0x0000, + /* 0x007a 0x030c */ + 0x017e, 0x0000, + /* 0x01b7 0x030c */ + 0x01ee, 0x0000, + 
/* 0x0292 0x030c */ + 0x01ef, 0x0000, + /* 0x00a8 0x030d */ + 0x0385, 0x0000, + /* 0x0308 0x030d */ + 0x0344, 0x0000, + /* 0x0391 0x030d */ + 0x0386, 0x0000, + /* 0x0395 0x030d */ + 0x0388, 0x0000, + /* 0x0397 0x030d */ + 0x0389, 0x0000, + /* 0x0399 0x030d */ + 0x038a, 0x0000, + /* 0x039f 0x030d */ + 0x038c, 0x0000, + /* 0x03a5 0x030d */ + 0x038e, 0x0000, + /* 0x03a9 0x030d */ + 0x038f, 0x0000, + /* 0x03b1 0x030d */ + 0x03ac, 0x0000, + /* 0x03b5 0x030d */ + 0x03ad, 0x0000, + /* 0x03b7 0x030d */ + 0x03ae, 0x0000, + /* 0x03b9 0x030d */ + 0x03af, 0x0000, + /* 0x03bf 0x030d */ + 0x03cc, 0x0000, + /* 0x03c5 0x030d */ + 0x03cd, 0x0000, + /* 0x03c9 0x030d */ + 0x03ce, 0x0000, + /* 0x03d2 0x030d */ + 0x03d3, 0x0000, + /* 0x0041 0x030f */ + 0x0200, 0x0000, + /* 0x0045 0x030f */ + 0x0204, 0x0000, + /* 0x0049 0x030f */ + 0x0208, 0x0000, + /* 0x004f 0x030f */ + 0x020c, 0x0000, + /* 0x0052 0x030f */ + 0x0210, 0x0000, + /* 0x0055 0x030f */ + 0x0214, 0x0000, + /* 0x0061 0x030f */ + 0x0201, 0x0000, + /* 0x0065 0x030f */ + 0x0205, 0x0000, + /* 0x0069 0x030f */ + 0x0209, 0x0000, + /* 0x006f 0x030f */ + 0x020d, 0x0000, + /* 0x0072 0x030f */ + 0x0211, 0x0000, + /* 0x0075 0x030f */ + 0x0215, 0x0000, + /* 0x0474 0x030f */ + 0x0476, 0x0000, + /* 0x0475 0x030f */ + 0x0477, 0x0000, + /* 0x0041 0x0311 */ + 0x0202, 0x0000, + /* 0x0045 0x0311 */ + 0x0206, 0x0000, + /* 0x0049 0x0311 */ + 0x020a, 0x0000, + /* 0x004f 0x0311 */ + 0x020e, 0x0000, + /* 0x0052 0x0311 */ + 0x0212, 0x0000, + /* 0x0055 0x0311 */ + 0x0216, 0x0000, + /* 0x0061 0x0311 */ + 0x0203, 0x0000, + /* 0x0065 0x0311 */ + 0x0207, 0x0000, + /* 0x0069 0x0311 */ + 0x020b, 0x0000, + /* 0x006f 0x0311 */ + 0x020f, 0x0000, + /* 0x0072 0x0311 */ + 0x0213, 0x0000, + /* 0x0075 0x0311 */ + 0x0217, 0x0000, + /* 0x0391 0x0313 */ + 0x1f08, 0x0003, 0x0300, 0x0e00, 0x0301, 0x0e02, 0x0342, 0x0e04, + /* 0x0395 0x0313 */ + 0x1f18, 0x0002, 0x0300, 0x0e06, 0x0301, 0x0e08, + /* 0x0397 0x0313 */ + 0x1f28, 0x0003, 0x0300, 0x0e0a, 0x0301, 0x0e0c, 0x0342, 0x0e0e, + /* 0x0399 0x0313 */ + 0x1f38, 0x0003, 0x0300, 0x0e10, 0x0301, 0x0e12, 0x0342, 0x0e14, + /* 0x039f 0x0313 */ + 0x1f48, 0x0002, 0x0300, 0x0e16, 0x0301, 0x0e18, + /* 0x03a9 0x0313 */ + 0x1f68, 0x0003, 0x0300, 0x0e1a, 0x0301, 0x0e1c, 0x0342, 0x0e1e, + /* 0x03b1 0x0313 */ + 0x1f00, 0x0003, 0x0300, 0x0e20, 0x0301, 0x0e22, 0x0342, 0x0e24, + /* 0x03b5 0x0313 */ + 0x1f10, 0x0002, 0x0300, 0x0e26, 0x0301, 0x0e28, + /* 0x03b7 0x0313 */ + 0x1f20, 0x0003, 0x0300, 0x0e2a, 0x0301, 0x0e2c, 0x0342, 0x0e2e, + /* 0x03b9 0x0313 */ + 0x1f30, 0x0003, 0x0300, 0x0e30, 0x0301, 0x0e32, 0x0342, 0x0e34, + /* 0x03bf 0x0313 */ + 0x1f40, 0x0002, 0x0300, 0x0e36, 0x0301, 0x0e38, + /* 0x03c1 0x0313 */ + 0x1fe4, 0x0000, + /* 0x03c5 0x0313 */ + 0x1f50, 0x0003, 0x0300, 0x0e3a, 0x0301, 0x0e3c, 0x0342, 0x0e3e, + /* 0x03c9 0x0313 */ + 0x1f60, 0x0003, 0x0300, 0x0e40, 0x0301, 0x0e42, 0x0342, 0x0e44, + /* 0x0391 0x0314 */ + 0x1f09, 0x0003, 0x0300, 0x0e46, 0x0301, 0x0e48, 0x0342, 0x0e4a, + /* 0x0395 0x0314 */ + 0x1f19, 0x0002, 0x0300, 0x0e4c, 0x0301, 0x0e4e, + /* 0x0397 0x0314 */ + 0x1f29, 0x0003, 0x0300, 0x0e50, 0x0301, 0x0e52, 0x0342, 0x0e54, + /* 0x0399 0x0314 */ + 0x1f39, 0x0003, 0x0300, 0x0e56, 0x0301, 0x0e58, 0x0342, 0x0e5a, + /* 0x039f 0x0314 */ + 0x1f49, 0x0002, 0x0300, 0x0e5c, 0x0301, 0x0e5e, + /* 0x03a1 0x0314 */ + 0x1fec, 0x0000, + /* 0x03a5 0x0314 */ + 0x1f59, 0x0003, 0x0300, 0x0e60, 0x0301, 0x0e62, 0x0342, 0x0e64, + /* 0x03a9 0x0314 */ + 0x1f69, 0x0003, 0x0300, 0x0e66, 0x0301, 0x0e68, 0x0342, 0x0e6a, + /* 0x03b1 0x0314 */ + 0x1f01, 0x0003, 0x0300, 
0x0e6c, 0x0301, 0x0e6e, 0x0342, 0x0e70, + /* 0x03b5 0x0314 */ + 0x1f11, 0x0002, 0x0300, 0x0e72, 0x0301, 0x0e74, + /* 0x03b7 0x0314 */ + 0x1f21, 0x0003, 0x0300, 0x0e76, 0x0301, 0x0e78, 0x0342, 0x0e7a, + /* 0x03b9 0x0314 */ + 0x1f31, 0x0003, 0x0300, 0x0e7c, 0x0301, 0x0e7e, 0x0342, 0x0e80, + /* 0x03bf 0x0314 */ + 0x1f41, 0x0002, 0x0300, 0x0e82, 0x0301, 0x0e84, + /* 0x03c1 0x0314 */ + 0x1fe5, 0x0000, + /* 0x03c5 0x0314 */ + 0x1f51, 0x0003, 0x0300, 0x0e86, 0x0301, 0x0e88, 0x0342, 0x0e8a, + /* 0x03c9 0x0314 */ + 0x1f61, 0x0003, 0x0300, 0x0e8c, 0x0301, 0x0e8e, 0x0342, 0x0e90, + /* 0x004f 0x031b */ + 0x01a0, 0x0005, 0x0300, 0x0e92, 0x0301, 0x0e94, 0x0303, 0x0e96, + 0x0309, 0x0e98, 0x0323, 0x0e9a, + /* 0x0055 0x031b */ + 0x01af, 0x0005, 0x0300, 0x0e9c, 0x0301, 0x0e9e, 0x0303, 0x0ea0, + 0x0309, 0x0ea2, 0x0323, 0x0ea4, + /* 0x006f 0x031b */ + 0x01a1, 0x0005, 0x0300, 0x0ea6, 0x0301, 0x0ea8, 0x0303, 0x0eaa, + 0x0309, 0x0eac, 0x0323, 0x0eae, + /* 0x0075 0x031b */ + 0x01b0, 0x0005, 0x0300, 0x0eb0, 0x0301, 0x0eb2, 0x0303, 0x0eb4, + 0x0309, 0x0eb6, 0x0323, 0x0eb8, + /* 0x0041 0x0323 */ + 0x1ea0, 0x0002, 0x0302, 0x0eba, 0x0306, 0x0ebc, + /* 0x0042 0x0323 */ + 0x1e04, 0x0000, + /* 0x0044 0x0323 */ + 0x1e0c, 0x0000, + /* 0x0045 0x0323 */ + 0x1eb8, 0x0001, 0x0302, 0x0ebe, + /* 0x0048 0x0323 */ + 0x1e24, 0x0000, + /* 0x0049 0x0323 */ + 0x1eca, 0x0000, + /* 0x004b 0x0323 */ + 0x1e32, 0x0000, + /* 0x004c 0x0323 */ + 0x1e36, 0x0001, 0x0304, 0x0ec0, + /* 0x004d 0x0323 */ + 0x1e42, 0x0000, + /* 0x004e 0x0323 */ + 0x1e46, 0x0000, + /* 0x004f 0x0323 */ + 0x1ecc, 0x0001, 0x0302, 0x0ec2, + /* 0x0052 0x0323 */ + 0x1e5a, 0x0001, 0x0304, 0x0ec4, + /* 0x0053 0x0323 */ + 0x1e62, 0x0001, 0x0307, 0x0ec6, + /* 0x0054 0x0323 */ + 0x1e6c, 0x0000, + /* 0x0055 0x0323 */ + 0x1ee4, 0x0000, + /* 0x0056 0x0323 */ + 0x1e7e, 0x0000, + /* 0x0057 0x0323 */ + 0x1e88, 0x0000, + /* 0x0059 0x0323 */ + 0x1ef4, 0x0000, + /* 0x005a 0x0323 */ + 0x1e92, 0x0000, + /* 0x0061 0x0323 */ + 0x1ea1, 0x0002, 0x0302, 0x0ec8, 0x0306, 0x0eca, + /* 0x0062 0x0323 */ + 0x1e05, 0x0000, + /* 0x0064 0x0323 */ + 0x1e0d, 0x0000, + /* 0x0065 0x0323 */ + 0x1eb9, 0x0001, 0x0302, 0x0ecc, + /* 0x0068 0x0323 */ + 0x1e25, 0x0000, + /* 0x0069 0x0323 */ + 0x1ecb, 0x0000, + /* 0x006b 0x0323 */ + 0x1e33, 0x0000, + /* 0x006c 0x0323 */ + 0x1e37, 0x0001, 0x0304, 0x0ece, + /* 0x006d 0x0323 */ + 0x1e43, 0x0000, + /* 0x006e 0x0323 */ + 0x1e47, 0x0000, + /* 0x006f 0x0323 */ + 0x1ecd, 0x0001, 0x0302, 0x0ed0, + /* 0x0072 0x0323 */ + 0x1e5b, 0x0001, 0x0304, 0x0ed2, + /* 0x0073 0x0323 */ + 0x1e63, 0x0001, 0x0307, 0x0ed4, + /* 0x0074 0x0323 */ + 0x1e6d, 0x0000, + /* 0x0075 0x0323 */ + 0x1ee5, 0x0000, + /* 0x0076 0x0323 */ + 0x1e7f, 0x0000, + /* 0x0077 0x0323 */ + 0x1e89, 0x0000, + /* 0x0079 0x0323 */ + 0x1ef5, 0x0000, + /* 0x007a 0x0323 */ + 0x1e93, 0x0000, + /* 0x0055 0x0324 */ + 0x1e72, 0x0000, + /* 0x0075 0x0324 */ + 0x1e73, 0x0000, + /* 0x0041 0x0325 */ + 0x1e00, 0x0000, + /* 0x0061 0x0325 */ + 0x1e01, 0x0000, + /* 0x0043 0x0327 */ + 0x00c7, 0x0001, 0x0301, 0x0ed6, + /* 0x0044 0x0327 */ + 0x1e10, 0x0000, + /* 0x0045 0x0327 */ + 0x0000, 0x0001, 0x0306, 0x0ed8, + /* 0x0047 0x0327 */ + 0x0122, 0x0000, + /* 0x0048 0x0327 */ + 0x1e28, 0x0000, + /* 0x004b 0x0327 */ + 0x0136, 0x0000, + /* 0x004c 0x0327 */ + 0x013b, 0x0000, + /* 0x004e 0x0327 */ + 0x0145, 0x0000, + /* 0x0052 0x0327 */ + 0x0156, 0x0000, + /* 0x0053 0x0327 */ + 0x015e, 0x0000, + /* 0x0054 0x0327 */ + 0x0162, 0x0000, + /* 0x0063 0x0327 */ + 0x00e7, 0x0001, 0x0301, 0x0eda, + /* 0x0064 0x0327 */ + 0x1e11, 0x0000, + /* 0x0065 0x0327 
*/ + 0x0000, 0x0001, 0x0306, 0x0edc, + /* 0x0067 0x0327 */ + 0x0123, 0x0000, + /* 0x0068 0x0327 */ + 0x1e29, 0x0000, + /* 0x006b 0x0327 */ + 0x0137, 0x0000, + /* 0x006c 0x0327 */ + 0x013c, 0x0000, + /* 0x006e 0x0327 */ + 0x0146, 0x0000, + /* 0x0072 0x0327 */ + 0x0157, 0x0000, + /* 0x0073 0x0327 */ + 0x015f, 0x0000, + /* 0x0074 0x0327 */ + 0x0163, 0x0000, + /* 0x0041 0x0328 */ + 0x0104, 0x0000, + /* 0x0045 0x0328 */ + 0x0118, 0x0000, + /* 0x0049 0x0328 */ + 0x012e, 0x0000, + /* 0x004f 0x0328 */ + 0x01ea, 0x0001, 0x0304, 0x0ede, + /* 0x0055 0x0328 */ + 0x0172, 0x0000, + /* 0x0061 0x0328 */ + 0x0105, 0x0000, + /* 0x0065 0x0328 */ + 0x0119, 0x0000, + /* 0x0069 0x0328 */ + 0x012f, 0x0000, + /* 0x006f 0x0328 */ + 0x01eb, 0x0001, 0x0304, 0x0ee0, + /* 0x0075 0x0328 */ + 0x0173, 0x0000, + /* 0x0044 0x032d */ + 0x1e12, 0x0000, + /* 0x0045 0x032d */ + 0x1e18, 0x0000, + /* 0x004c 0x032d */ + 0x1e3c, 0x0000, + /* 0x004e 0x032d */ + 0x1e4a, 0x0000, + /* 0x0054 0x032d */ + 0x1e70, 0x0000, + /* 0x0055 0x032d */ + 0x1e76, 0x0000, + /* 0x0064 0x032d */ + 0x1e13, 0x0000, + /* 0x0065 0x032d */ + 0x1e19, 0x0000, + /* 0x006c 0x032d */ + 0x1e3d, 0x0000, + /* 0x006e 0x032d */ + 0x1e4b, 0x0000, + /* 0x0074 0x032d */ + 0x1e71, 0x0000, + /* 0x0075 0x032d */ + 0x1e77, 0x0000, + /* 0x0048 0x032e */ + 0x1e2a, 0x0000, + /* 0x0068 0x032e */ + 0x1e2b, 0x0000, + /* 0x0045 0x0330 */ + 0x1e1a, 0x0000, + /* 0x0049 0x0330 */ + 0x1e2c, 0x0000, + /* 0x0055 0x0330 */ + 0x1e74, 0x0000, + /* 0x0065 0x0330 */ + 0x1e1b, 0x0000, + /* 0x0069 0x0330 */ + 0x1e2d, 0x0000, + /* 0x0075 0x0330 */ + 0x1e75, 0x0000, + /* 0x0042 0x0331 */ + 0x1e06, 0x0000, + /* 0x0044 0x0331 */ + 0x1e0e, 0x0000, + /* 0x004b 0x0331 */ + 0x1e34, 0x0000, + /* 0x004c 0x0331 */ + 0x1e3a, 0x0000, + /* 0x004e 0x0331 */ + 0x1e48, 0x0000, + /* 0x0052 0x0331 */ + 0x1e5e, 0x0000, + /* 0x0054 0x0331 */ + 0x1e6e, 0x0000, + /* 0x005a 0x0331 */ + 0x1e94, 0x0000, + /* 0x0062 0x0331 */ + 0x1e07, 0x0000, + /* 0x0064 0x0331 */ + 0x1e0f, 0x0000, + /* 0x0068 0x0331 */ + 0x1e96, 0x0000, + /* 0x006b 0x0331 */ + 0x1e35, 0x0000, + /* 0x006c 0x0331 */ + 0x1e3b, 0x0000, + /* 0x006e 0x0331 */ + 0x1e49, 0x0000, + /* 0x0072 0x0331 */ + 0x1e5f, 0x0000, + /* 0x0074 0x0331 */ + 0x1e6f, 0x0000, + /* 0x007a 0x0331 */ + 0x1e95, 0x0000, + /* 0x00a8 0x0342 */ + 0x1fc1, 0x0000, + /* 0x03b1 0x0342 */ + 0x1fb6, 0x0000, + /* 0x03b7 0x0342 */ + 0x1fc6, 0x0000, + /* 0x03b9 0x0342 */ + 0x1fd6, 0x0000, + /* 0x03c5 0x0342 */ + 0x1fe6, 0x0000, + /* 0x03c9 0x0342 */ + 0x1ff6, 0x0000, + /* 0x1fbf 0x0342 */ + 0x1fcf, 0x0000, + /* 0x1ffe 0x0342 */ + 0x1fdf, 0x0000, + /* 0x0391 0x0345 */ + 0x1fbc, 0x0002, 0x0313, 0x0ee2, 0x0314, 0x0eea, + /* 0x0397 0x0345 */ + 0x1fcc, 0x0002, 0x0313, 0x0ef2, 0x0314, 0x0efa, + /* 0x03a9 0x0345 */ + 0x1ffc, 0x0002, 0x0313, 0x0f02, 0x0314, 0x0f0a, + /* 0x03b1 0x0345 */ + 0x1fb3, 0x0005, 0x0300, 0x0f12, 0x0301, 0x0f14, 0x0313, 0x0f16, + 0x0314, 0x0f1e, 0x0342, 0x0f26, + /* 0x03b7 0x0345 */ + 0x1fc3, 0x0005, 0x0300, 0x0f28, 0x0301, 0x0f2a, 0x0313, 0x0f2c, + 0x0314, 0x0f34, 0x0342, 0x0f3c, + /* 0x03bf 0x0345 */ + 0x0000, 0x0001, 0x0301, 0x0f3e, + /* 0x03c9 0x0345 */ + 0x1ff3, 0x0004, 0x0300, 0x0f40, 0x0313, 0x0f42, 0x0314, 0x0f4a, + 0x0342, 0x0f52, + /* 0x05d0 0x05b7 */ + 0xfb2e, 0x0000, + /* 0x05f2 0x05b7 */ + 0xfb1f, 0x0000, + /* 0x05d0 0x05b8 */ + 0xfb2f, 0x0000, + /* 0x05d5 0x05b9 */ + 0xfb4b, 0x0000, + /* 0x05d0 0x05bc */ + 0xfb30, 0x0000, + /* 0x05d1 0x05bc */ + 0xfb31, 0x0000, + /* 0x05d2 0x05bc */ + 0xfb32, 0x0000, + /* 0x05d3 0x05bc */ + 0xfb33, 0x0000, + /* 0x05d4 0x05bc */ + 
0xfb34, 0x0000, + /* 0x05d5 0x05bc */ + 0xfb35, 0x0000, + /* 0x05d6 0x05bc */ + 0xfb36, 0x0000, + /* 0x05d8 0x05bc */ + 0xfb38, 0x0000, + /* 0x05d9 0x05bc */ + 0xfb39, 0x0000, + /* 0x05da 0x05bc */ + 0xfb3a, 0x0000, + /* 0x05db 0x05bc */ + 0xfb3b, 0x0000, + /* 0x05dc 0x05bc */ + 0xfb3c, 0x0000, + /* 0x05de 0x05bc */ + 0xfb3e, 0x0000, + /* 0x05e0 0x05bc */ + 0xfb40, 0x0000, + /* 0x05e1 0x05bc */ + 0xfb41, 0x0000, + /* 0x05e3 0x05bc */ + 0xfb43, 0x0000, + /* 0x05e4 0x05bc */ + 0xfb44, 0x0000, + /* 0x05e6 0x05bc */ + 0xfb46, 0x0000, + /* 0x05e7 0x05bc */ + 0xfb47, 0x0000, + /* 0x05e8 0x05bc */ + 0xfb48, 0x0000, + /* 0x05e9 0x05bc */ + 0xfb49, 0x0002, 0x05c1, 0x0f54, 0x05c2, 0x0f56, + /* 0x05ea 0x05bc */ + 0xfb4a, 0x0000, + /* 0x05d1 0x05bf */ + 0xfb4c, 0x0000, + /* 0x05db 0x05bf */ + 0xfb4d, 0x0000, + /* 0x05e4 0x05bf */ + 0xfb4e, 0x0000, + /* 0x05e9 0x05c1 */ + 0xfb2a, 0x0000, + /* 0x05e9 0x05c2 */ + 0xfb2b, 0x0000, + /* 0x0915 0x093c */ + 0x0958, 0x0000, + /* 0x0916 0x093c */ + 0x0959, 0x0000, + /* 0x0917 0x093c */ + 0x095a, 0x0000, + /* 0x091c 0x093c */ + 0x095b, 0x0000, + /* 0x0921 0x093c */ + 0x095c, 0x0000, + /* 0x0922 0x093c */ + 0x095d, 0x0000, + /* 0x0928 0x093c */ + 0x0929, 0x0000, + /* 0x092b 0x093c */ + 0x095e, 0x0000, + /* 0x092f 0x093c */ + 0x095f, 0x0000, + /* 0x0930 0x093c */ + 0x0931, 0x0000, + /* 0x0933 0x093c */ + 0x0934, 0x0000, + /* 0x09a1 0x09bc */ + 0x09dc, 0x0000, + /* 0x09a2 0x09bc */ + 0x09dd, 0x0000, + /* 0x09ac 0x09bc */ + 0x09b0, 0x0000, + /* 0x09af 0x09bc */ + 0x09df, 0x0000, + /* 0x09c7 0x09be */ + 0x09cb, 0x0000, + /* 0x09c7 0x09d7 */ + 0x09cc, 0x0000, + /* 0x0a16 0x0a3c */ + 0x0a59, 0x0000, + /* 0x0a17 0x0a3c */ + 0x0a5a, 0x0000, + /* 0x0a1c 0x0a3c */ + 0x0a5b, 0x0000, + /* 0x0a21 0x0a3c */ + 0x0a5c, 0x0000, + /* 0x0a2b 0x0a3c */ + 0x0a5e, 0x0000, + /* 0x0b21 0x0b3c */ + 0x0b5c, 0x0000, + /* 0x0b22 0x0b3c */ + 0x0b5d, 0x0000, + /* 0x0b2f 0x0b3c */ + 0x0b5f, 0x0000, + /* 0x0b47 0x0b3e */ + 0x0b4b, 0x0000, + /* 0x0b47 0x0b56 */ + 0x0b48, 0x0000, + /* 0x0b47 0x0b57 */ + 0x0b4c, 0x0000, + /* 0x0bc6 0x0bbe */ + 0x0bca, 0x0000, + /* 0x0bc7 0x0bbe */ + 0x0bcb, 0x0000, + /* 0x0b92 0x0bd7 */ + 0x0b94, 0x0000, + /* 0x0bc6 0x0bd7 */ + 0x0bcc, 0x0000, + /* 0x0c46 0x0c56 */ + 0x0c48, 0x0000, + /* 0x0cc6 0x0cc2 */ + 0x0cca, 0x0001, 0x0cd5, 0x0f58, + /* 0x0cbf 0x0cd5 */ + 0x0cc0, 0x0000, + /* 0x0cc6 0x0cd5 */ + 0x0cc7, 0x0000, + /* 0x0cc6 0x0cd6 */ + 0x0cc8, 0x0000, + /* 0x0d46 0x0d3e */ + 0x0d4a, 0x0000, + /* 0x0d47 0x0d3e */ + 0x0d4b, 0x0000, + /* 0x0d46 0x0d57 */ + 0x0d4c, 0x0000, + /* 0x0e4d 0x0e32 */ + 0x0e33, 0x0000, + /* 0x0ecd 0x0eb2 */ + 0x0eb3, 0x0000, + /* 0x0f72 0x0f71 */ + 0x0f73, 0x0000, + /* 0x0f74 0x0f71 */ + 0x0f75, 0x0000, + /* 0x0f80 0x0f71 */ + 0x0f81, 0x0000, + /* 0x0fb2 0x0f80 */ + 0x0f76, 0x0001, 0x0f71, 0x0f5a, + /* 0x0fb3 0x0f80 */ + 0x0f78, 0x0001, 0x0f71, 0x0f5c, + /* 0x0f40 0x0fb5 */ + 0x0f69, 0x0000, + /* 0x0f90 0x0fb5 */ + 0x0fb9, 0x0000, + /* 0x0f42 0x0fb7 */ + 0x0f43, 0x0000, + /* 0x0f4c 0x0fb7 */ + 0x0f4d, 0x0000, + /* 0x0f51 0x0fb7 */ + 0x0f52, 0x0000, + /* 0x0f56 0x0fb7 */ + 0x0f57, 0x0000, + /* 0x0f5b 0x0fb7 */ + 0x0f5c, 0x0000, + /* 0x0f92 0x0fb7 */ + 0x0f93, 0x0000, + /* 0x0f9c 0x0fb7 */ + 0x0f9d, 0x0000, + /* 0x0fa1 0x0fb7 */ + 0x0fa2, 0x0000, + /* 0x0fa6 0x0fb7 */ + 0x0fa7, 0x0000, + /* 0x0fab 0x0fb7 */ + 0x0fac, 0x0000, + /* 0x3046 0x3099 */ + 0x3094, 0x0000, + /* 0x304b 0x3099 */ + 0x304c, 0x0000, + /* 0x304d 0x3099 */ + 0x304e, 0x0000, + /* 0x304f 0x3099 */ + 0x3050, 0x0000, + /* 0x3051 0x3099 */ + 0x3052, 0x0000, + /* 0x3053 0x3099 
*/ + 0x3054, 0x0000, + /* 0x3055 0x3099 */ + 0x3056, 0x0000, + /* 0x3057 0x3099 */ + 0x3058, 0x0000, + /* 0x3059 0x3099 */ + 0x305a, 0x0000, + /* 0x305b 0x3099 */ + 0x305c, 0x0000, + /* 0x305d 0x3099 */ + 0x305e, 0x0000, + /* 0x305f 0x3099 */ + 0x3060, 0x0000, + /* 0x3061 0x3099 */ + 0x3062, 0x0000, + /* 0x3064 0x3099 */ + 0x3065, 0x0000, + /* 0x3066 0x3099 */ + 0x3067, 0x0000, + /* 0x3068 0x3099 */ + 0x3069, 0x0000, + /* 0x306f 0x3099 */ + 0x3070, 0x0000, + /* 0x3072 0x3099 */ + 0x3073, 0x0000, + /* 0x3075 0x3099 */ + 0x3076, 0x0000, + /* 0x3078 0x3099 */ + 0x3079, 0x0000, + /* 0x307b 0x3099 */ + 0x307c, 0x0000, + /* 0x309d 0x3099 */ + 0x309e, 0x0000, + /* 0x30a6 0x3099 */ + 0x30f4, 0x0000, + /* 0x30ab 0x3099 */ + 0x30ac, 0x0000, + /* 0x30ad 0x3099 */ + 0x30ae, 0x0000, + /* 0x30af 0x3099 */ + 0x30b0, 0x0000, + /* 0x30b1 0x3099 */ + 0x30b2, 0x0000, + /* 0x30b3 0x3099 */ + 0x30b4, 0x0000, + /* 0x30b5 0x3099 */ + 0x30b6, 0x0000, + /* 0x30b7 0x3099 */ + 0x30b8, 0x0000, + /* 0x30b9 0x3099 */ + 0x30ba, 0x0000, + /* 0x30bb 0x3099 */ + 0x30bc, 0x0000, + /* 0x30bd 0x3099 */ + 0x30be, 0x0000, + /* 0x30bf 0x3099 */ + 0x30c0, 0x0000, + /* 0x30c1 0x3099 */ + 0x30c2, 0x0000, + /* 0x30c4 0x3099 */ + 0x30c5, 0x0000, + /* 0x30c6 0x3099 */ + 0x30c7, 0x0000, + /* 0x30c8 0x3099 */ + 0x30c9, 0x0000, + /* 0x30cf 0x3099 */ + 0x30d0, 0x0000, + /* 0x30d2 0x3099 */ + 0x30d3, 0x0000, + /* 0x30d5 0x3099 */ + 0x30d6, 0x0000, + /* 0x30d8 0x3099 */ + 0x30d9, 0x0000, + /* 0x30db 0x3099 */ + 0x30dc, 0x0000, + /* 0x30ef 0x3099 */ + 0x30f7, 0x0000, + /* 0x30f0 0x3099 */ + 0x30f8, 0x0000, + /* 0x30f1 0x3099 */ + 0x30f9, 0x0000, + /* 0x30f2 0x3099 */ + 0x30fa, 0x0000, + /* 0x30fd 0x3099 */ + 0x30fe, 0x0000, + /* 0x306f 0x309a */ + 0x3071, 0x0000, + /* 0x3072 0x309a */ + 0x3074, 0x0000, + /* 0x3075 0x309a */ + 0x3077, 0x0000, + /* 0x3078 0x309a */ + 0x307a, 0x0000, + /* 0x307b 0x309a */ + 0x307d, 0x0000, + /* 0x30cf 0x309a */ + 0x30d1, 0x0000, + /* 0x30d2 0x309a */ + 0x30d4, 0x0000, + /* 0x30d5 0x309a */ + 0x30d7, 0x0000, + /* 0x30d8 0x309a */ + 0x30da, 0x0000, + /* 0x30db 0x309a */ + 0x30dd, 0x0000, + /* 0x0307 0x0053 0x0301 */ + 0x1e64, 0x0000, + /* 0x0307 0x0073 0x0301 */ + 0x1e65, 0x0000, + /* 0x0300 0x0041 0x0302 */ + 0x1ea6, 0x0000, + /* 0x0301 0x0041 0x0302 */ + 0x1ea4, 0x0000, + /* 0x0303 0x0041 0x0302 */ + 0x1eaa, 0x0000, + /* 0x0309 0x0041 0x0302 */ + 0x1ea8, 0x0000, + /* 0x0300 0x0045 0x0302 */ + 0x1ec0, 0x0000, + /* 0x0301 0x0045 0x0302 */ + 0x1ebe, 0x0000, + /* 0x0303 0x0045 0x0302 */ + 0x1ec4, 0x0000, + /* 0x0309 0x0045 0x0302 */ + 0x1ec2, 0x0000, + /* 0x0300 0x004f 0x0302 */ + 0x1ed2, 0x0000, + /* 0x0301 0x004f 0x0302 */ + 0x1ed0, 0x0000, + /* 0x0303 0x004f 0x0302 */ + 0x1ed6, 0x0000, + /* 0x0309 0x004f 0x0302 */ + 0x1ed4, 0x0000, + /* 0x0300 0x0061 0x0302 */ + 0x1ea7, 0x0000, + /* 0x0301 0x0061 0x0302 */ + 0x1ea5, 0x0000, + /* 0x0303 0x0061 0x0302 */ + 0x1eab, 0x0000, + /* 0x0309 0x0061 0x0302 */ + 0x1ea9, 0x0000, + /* 0x0300 0x0065 0x0302 */ + 0x1ec1, 0x0000, + /* 0x0301 0x0065 0x0302 */ + 0x1ebf, 0x0000, + /* 0x0303 0x0065 0x0302 */ + 0x1ec5, 0x0000, + /* 0x0309 0x0065 0x0302 */ + 0x1ec3, 0x0000, + /* 0x0300 0x006f 0x0302 */ + 0x1ed3, 0x0000, + /* 0x0301 0x006f 0x0302 */ + 0x1ed1, 0x0000, + /* 0x0303 0x006f 0x0302 */ + 0x1ed7, 0x0000, + /* 0x0309 0x006f 0x0302 */ + 0x1ed5, 0x0000, + /* 0x0301 0x004f 0x0303 */ + 0x1e4c, 0x0000, + /* 0x0308 0x004f 0x0303 */ + 0x1e4e, 0x0000, + /* 0x0301 0x0055 0x0303 */ + 0x1e78, 0x0000, + /* 0x0301 0x006f 0x0303 */ + 0x1e4d, 0x0000, + /* 0x0308 0x006f 0x0303 */ + 0x1e4f, 
0x0000, + /* 0x0301 0x0075 0x0303 */ + 0x1e79, 0x0000, + /* 0x0300 0x0045 0x0304 */ + 0x1e14, 0x0000, + /* 0x0301 0x0045 0x0304 */ + 0x1e16, 0x0000, + /* 0x0300 0x004f 0x0304 */ + 0x1e50, 0x0000, + /* 0x0301 0x004f 0x0304 */ + 0x1e52, 0x0000, + /* 0x0308 0x0055 0x0304 */ + 0x1e7a, 0x0000, + /* 0x0300 0x0065 0x0304 */ + 0x1e15, 0x0000, + /* 0x0301 0x0065 0x0304 */ + 0x1e17, 0x0000, + /* 0x0300 0x006f 0x0304 */ + 0x1e51, 0x0000, + /* 0x0301 0x006f 0x0304 */ + 0x1e53, 0x0000, + /* 0x0308 0x0075 0x0304 */ + 0x1e7b, 0x0000, + /* 0x0300 0x0041 0x0306 */ + 0x1eb0, 0x0000, + /* 0x0301 0x0041 0x0306 */ + 0x1eae, 0x0000, + /* 0x0303 0x0041 0x0306 */ + 0x1eb4, 0x0000, + /* 0x0309 0x0041 0x0306 */ + 0x1eb2, 0x0000, + /* 0x0300 0x0061 0x0306 */ + 0x1eb1, 0x0000, + /* 0x0301 0x0061 0x0306 */ + 0x1eaf, 0x0000, + /* 0x0303 0x0061 0x0306 */ + 0x1eb5, 0x0000, + /* 0x0309 0x0061 0x0306 */ + 0x1eb3, 0x0000, + /* 0x0304 0x0041 0x0307 */ + 0x01e0, 0x0000, + /* 0x0304 0x0061 0x0307 */ + 0x01e1, 0x0000, + /* 0x0304 0x0041 0x0308 */ + 0x01de, 0x0000, + /* 0x0301 0x0049 0x0308 */ + 0x1e2e, 0x0000, + /* 0x0300 0x0055 0x0308 */ + 0x01db, 0x0000, + /* 0x0301 0x0055 0x0308 */ + 0x01d7, 0x0000, + /* 0x0304 0x0055 0x0308 */ + 0x01d5, 0x0000, + /* 0x030c 0x0055 0x0308 */ + 0x01d9, 0x0000, + /* 0x0304 0x0061 0x0308 */ + 0x01df, 0x0000, + /* 0x0301 0x0069 0x0308 */ + 0x1e2f, 0x0000, + /* 0x0300 0x0075 0x0308 */ + 0x01dc, 0x0000, + /* 0x0301 0x0075 0x0308 */ + 0x01d8, 0x0000, + /* 0x0304 0x0075 0x0308 */ + 0x01d6, 0x0000, + /* 0x030c 0x0075 0x0308 */ + 0x01da, 0x0000, + /* 0x0300 0x03b9 0x0308 */ + 0x1fd2, 0x0000, + /* 0x0301 0x03b9 0x0308 */ + 0x1fd3, 0x0000, + /* 0x030d 0x03b9 0x0308 */ + 0x0390, 0x0000, + /* 0x0342 0x03b9 0x0308 */ + 0x1fd7, 0x0000, + /* 0x0300 0x03c5 0x0308 */ + 0x1fe2, 0x0000, + /* 0x0301 0x03c5 0x0308 */ + 0x1fe3, 0x0000, + /* 0x030d 0x03c5 0x0308 */ + 0x03b0, 0x0000, + /* 0x0342 0x03c5 0x0308 */ + 0x1fe7, 0x0000, + /* 0x0301 0x0041 0x030a */ + 0x01fa, 0x0000, + /* 0x0301 0x0061 0x030a */ + 0x01fb, 0x0000, + /* 0x0307 0x0053 0x030c */ + 0x1e66, 0x0000, + /* 0x0307 0x0073 0x030c */ + 0x1e67, 0x0000, + /* 0x0300 0x0391 0x0313 */ + 0x1f0a, 0x0000, + /* 0x0301 0x0391 0x0313 */ + 0x1f0c, 0x0000, + /* 0x0342 0x0391 0x0313 */ + 0x1f0e, 0x0000, + /* 0x0300 0x0395 0x0313 */ + 0x1f1a, 0x0000, + /* 0x0301 0x0395 0x0313 */ + 0x1f1c, 0x0000, + /* 0x0300 0x0397 0x0313 */ + 0x1f2a, 0x0000, + /* 0x0301 0x0397 0x0313 */ + 0x1f2c, 0x0000, + /* 0x0342 0x0397 0x0313 */ + 0x1f2e, 0x0000, + /* 0x0300 0x0399 0x0313 */ + 0x1f3a, 0x0000, + /* 0x0301 0x0399 0x0313 */ + 0x1f3c, 0x0000, + /* 0x0342 0x0399 0x0313 */ + 0x1f3e, 0x0000, + /* 0x0300 0x039f 0x0313 */ + 0x1f4a, 0x0000, + /* 0x0301 0x039f 0x0313 */ + 0x1f4c, 0x0000, + /* 0x0300 0x03a9 0x0313 */ + 0x1f6a, 0x0000, + /* 0x0301 0x03a9 0x0313 */ + 0x1f6c, 0x0000, + /* 0x0342 0x03a9 0x0313 */ + 0x1f6e, 0x0000, + /* 0x0300 0x03b1 0x0313 */ + 0x1f02, 0x0000, + /* 0x0301 0x03b1 0x0313 */ + 0x1f04, 0x0000, + /* 0x0342 0x03b1 0x0313 */ + 0x1f06, 0x0000, + /* 0x0300 0x03b5 0x0313 */ + 0x1f12, 0x0000, + /* 0x0301 0x03b5 0x0313 */ + 0x1f14, 0x0000, + /* 0x0300 0x03b7 0x0313 */ + 0x1f22, 0x0000, + /* 0x0301 0x03b7 0x0313 */ + 0x1f24, 0x0000, + /* 0x0342 0x03b7 0x0313 */ + 0x1f26, 0x0000, + /* 0x0300 0x03b9 0x0313 */ + 0x1f32, 0x0000, + /* 0x0301 0x03b9 0x0313 */ + 0x1f34, 0x0000, + /* 0x0342 0x03b9 0x0313 */ + 0x1f36, 0x0000, + /* 0x0300 0x03bf 0x0313 */ + 0x1f42, 0x0000, + /* 0x0301 0x03bf 0x0313 */ + 0x1f44, 0x0000, + /* 0x0300 0x03c5 0x0313 */ + 0x1f52, 0x0000, + /* 0x0301 0x03c5 
0x0313 */ + 0x1f54, 0x0000, + /* 0x0342 0x03c5 0x0313 */ + 0x1f56, 0x0000, + /* 0x0300 0x03c9 0x0313 */ + 0x1f62, 0x0000, + /* 0x0301 0x03c9 0x0313 */ + 0x1f64, 0x0000, + /* 0x0342 0x03c9 0x0313 */ + 0x1f66, 0x0000, + /* 0x0300 0x0391 0x0314 */ + 0x1f0b, 0x0000, + /* 0x0301 0x0391 0x0314 */ + 0x1f0d, 0x0000, + /* 0x0342 0x0391 0x0314 */ + 0x1f0f, 0x0000, + /* 0x0300 0x0395 0x0314 */ + 0x1f1b, 0x0000, + /* 0x0301 0x0395 0x0314 */ + 0x1f1d, 0x0000, + /* 0x0300 0x0397 0x0314 */ + 0x1f2b, 0x0000, + /* 0x0301 0x0397 0x0314 */ + 0x1f2d, 0x0000, + /* 0x0342 0x0397 0x0314 */ + 0x1f2f, 0x0000, + /* 0x0300 0x0399 0x0314 */ + 0x1f3b, 0x0000, + /* 0x0301 0x0399 0x0314 */ + 0x1f3d, 0x0000, + /* 0x0342 0x0399 0x0314 */ + 0x1f3f, 0x0000, + /* 0x0300 0x039f 0x0314 */ + 0x1f4b, 0x0000, + /* 0x0301 0x039f 0x0314 */ + 0x1f4d, 0x0000, + /* 0x0300 0x03a5 0x0314 */ + 0x1f5b, 0x0000, + /* 0x0301 0x03a5 0x0314 */ + 0x1f5d, 0x0000, + /* 0x0342 0x03a5 0x0314 */ + 0x1f5f, 0x0000, + /* 0x0300 0x03a9 0x0314 */ + 0x1f6b, 0x0000, + /* 0x0301 0x03a9 0x0314 */ + 0x1f6d, 0x0000, + /* 0x0342 0x03a9 0x0314 */ + 0x1f6f, 0x0000, + /* 0x0300 0x03b1 0x0314 */ + 0x1f03, 0x0000, + /* 0x0301 0x03b1 0x0314 */ + 0x1f05, 0x0000, + /* 0x0342 0x03b1 0x0314 */ + 0x1f07, 0x0000, + /* 0x0300 0x03b5 0x0314 */ + 0x1f13, 0x0000, + /* 0x0301 0x03b5 0x0314 */ + 0x1f15, 0x0000, + /* 0x0300 0x03b7 0x0314 */ + 0x1f23, 0x0000, + /* 0x0301 0x03b7 0x0314 */ + 0x1f25, 0x0000, + /* 0x0342 0x03b7 0x0314 */ + 0x1f27, 0x0000, + /* 0x0300 0x03b9 0x0314 */ + 0x1f33, 0x0000, + /* 0x0301 0x03b9 0x0314 */ + 0x1f35, 0x0000, + /* 0x0342 0x03b9 0x0314 */ + 0x1f37, 0x0000, + /* 0x0300 0x03bf 0x0314 */ + 0x1f43, 0x0000, + /* 0x0301 0x03bf 0x0314 */ + 0x1f45, 0x0000, + /* 0x0300 0x03c5 0x0314 */ + 0x1f53, 0x0000, + /* 0x0301 0x03c5 0x0314 */ + 0x1f55, 0x0000, + /* 0x0342 0x03c5 0x0314 */ + 0x1f57, 0x0000, + /* 0x0300 0x03c9 0x0314 */ + 0x1f63, 0x0000, + /* 0x0301 0x03c9 0x0314 */ + 0x1f65, 0x0000, + /* 0x0342 0x03c9 0x0314 */ + 0x1f67, 0x0000, + /* 0x0300 0x004f 0x031b */ + 0x1edc, 0x0000, + /* 0x0301 0x004f 0x031b */ + 0x1eda, 0x0000, + /* 0x0303 0x004f 0x031b */ + 0x1ee0, 0x0000, + /* 0x0309 0x004f 0x031b */ + 0x1ede, 0x0000, + /* 0x0323 0x004f 0x031b */ + 0x1ee2, 0x0000, + /* 0x0300 0x0055 0x031b */ + 0x1eea, 0x0000, + /* 0x0301 0x0055 0x031b */ + 0x1ee8, 0x0000, + /* 0x0303 0x0055 0x031b */ + 0x1eee, 0x0000, + /* 0x0309 0x0055 0x031b */ + 0x1eec, 0x0000, + /* 0x0323 0x0055 0x031b */ + 0x1ef0, 0x0000, + /* 0x0300 0x006f 0x031b */ + 0x1edd, 0x0000, + /* 0x0301 0x006f 0x031b */ + 0x1edb, 0x0000, + /* 0x0303 0x006f 0x031b */ + 0x1ee1, 0x0000, + /* 0x0309 0x006f 0x031b */ + 0x1edf, 0x0000, + /* 0x0323 0x006f 0x031b */ + 0x1ee3, 0x0000, + /* 0x0300 0x0075 0x031b */ + 0x1eeb, 0x0000, + /* 0x0301 0x0075 0x031b */ + 0x1ee9, 0x0000, + /* 0x0303 0x0075 0x031b */ + 0x1eef, 0x0000, + /* 0x0309 0x0075 0x031b */ + 0x1eed, 0x0000, + /* 0x0323 0x0075 0x031b */ + 0x1ef1, 0x0000, + /* 0x0302 0x0041 0x0323 */ + 0x1eac, 0x0000, + /* 0x0306 0x0041 0x0323 */ + 0x1eb6, 0x0000, + /* 0x0302 0x0045 0x0323 */ + 0x1ec6, 0x0000, + /* 0x0304 0x004c 0x0323 */ + 0x1e38, 0x0000, + /* 0x0302 0x004f 0x0323 */ + 0x1ed8, 0x0000, + /* 0x0304 0x0052 0x0323 */ + 0x1e5c, 0x0000, + /* 0x0307 0x0053 0x0323 */ + 0x1e68, 0x0000, + /* 0x0302 0x0061 0x0323 */ + 0x1ead, 0x0000, + /* 0x0306 0x0061 0x0323 */ + 0x1eb7, 0x0000, + /* 0x0302 0x0065 0x0323 */ + 0x1ec7, 0x0000, + /* 0x0304 0x006c 0x0323 */ + 0x1e39, 0x0000, + /* 0x0302 0x006f 0x0323 */ + 0x1ed9, 0x0000, + /* 0x0304 0x0072 0x0323 */ + 0x1e5d, 0x0000, + 
/* 0x0307 0x0073 0x0323 */ + 0x1e69, 0x0000, + /* 0x0301 0x0043 0x0327 */ + 0x1e08, 0x0000, + /* 0x0306 0x0045 0x0327 */ + 0x1e1c, 0x0000, + /* 0x0301 0x0063 0x0327 */ + 0x1e09, 0x0000, + /* 0x0306 0x0065 0x0327 */ + 0x1e1d, 0x0000, + /* 0x0304 0x004f 0x0328 */ + 0x01ec, 0x0000, + /* 0x0304 0x006f 0x0328 */ + 0x01ed, 0x0000, + /* 0x0313 0x0391 0x0345 */ + 0x1f88, 0x0003, 0x0300, 0x0f5e, 0x0301, 0x0f60, 0x0342, 0x0f62, + /* 0x0314 0x0391 0x0345 */ + 0x1f89, 0x0003, 0x0300, 0x0f64, 0x0301, 0x0f66, 0x0342, 0x0f68, + /* 0x0313 0x0397 0x0345 */ + 0x1f98, 0x0003, 0x0300, 0x0f6a, 0x0301, 0x0f6c, 0x0342, 0x0f6e, + /* 0x0314 0x0397 0x0345 */ + 0x1f99, 0x0003, 0x0300, 0x0f70, 0x0301, 0x0f72, 0x0342, 0x0f74, + /* 0x0313 0x03a9 0x0345 */ + 0x1fa8, 0x0003, 0x0300, 0x0f76, 0x0301, 0x0f78, 0x0342, 0x0f7a, + /* 0x0314 0x03a9 0x0345 */ + 0x1fa9, 0x0003, 0x0300, 0x0f7c, 0x0301, 0x0f7e, 0x0342, 0x0f80, + /* 0x0300 0x03b1 0x0345 */ + 0x1fb2, 0x0000, + /* 0x0301 0x03b1 0x0345 */ + 0x1fb4, 0x0000, + /* 0x0313 0x03b1 0x0345 */ + 0x1f80, 0x0003, 0x0300, 0x0f82, 0x0301, 0x0f84, 0x0342, 0x0f86, + /* 0x0314 0x03b1 0x0345 */ + 0x1f81, 0x0003, 0x0300, 0x0f88, 0x0301, 0x0f8a, 0x0342, 0x0f8c, + /* 0x0342 0x03b1 0x0345 */ + 0x1fb7, 0x0000, + /* 0x0300 0x03b7 0x0345 */ + 0x1fc2, 0x0000, + /* 0x0301 0x03b7 0x0345 */ + 0x1fc4, 0x0000, + /* 0x0313 0x03b7 0x0345 */ + 0x1f90, 0x0003, 0x0300, 0x0f8e, 0x0301, 0x0f90, 0x0342, 0x0f92, + /* 0x0314 0x03b7 0x0345 */ + 0x1f91, 0x0003, 0x0300, 0x0f94, 0x0301, 0x0f96, 0x0342, 0x0f98, + /* 0x0342 0x03b7 0x0345 */ + 0x1fc7, 0x0000, + /* 0x0301 0x03bf 0x0345 */ + 0x1ff4, 0x0000, + /* 0x0300 0x03c9 0x0345 */ + 0x1ff2, 0x0000, + /* 0x0313 0x03c9 0x0345 */ + 0x1fa0, 0x0003, 0x0300, 0x0f9a, 0x0301, 0x0f9c, 0x0342, 0x0f9e, + /* 0x0314 0x03c9 0x0345 */ + 0x1fa1, 0x0003, 0x0300, 0x0fa0, 0x0301, 0x0fa2, 0x0342, 0x0fa4, + /* 0x0342 0x03c9 0x0345 */ + 0x1ff7, 0x0000, + /* 0x05c1 0x05e9 0x05bc */ + 0xfb2c, 0x0000, + /* 0x05c2 0x05e9 0x05bc */ + 0xfb2d, 0x0000, + /* 0x0cd5 0x0cc6 0x0cc2 */ + 0x0ccb, 0x0000, + /* 0x0f71 0x0fb2 0x0f80 */ + 0x0f77, 0x0000, + /* 0x0f71 0x0fb3 0x0f80 */ + 0x0f79, 0x0000, + /* 0x0300 0x0313 0x0391 0x0345 */ + 0x1f8a, 0x0000, + /* 0x0301 0x0313 0x0391 0x0345 */ + 0x1f8c, 0x0000, + /* 0x0342 0x0313 0x0391 0x0345 */ + 0x1f8e, 0x0000, + /* 0x0300 0x0314 0x0391 0x0345 */ + 0x1f8b, 0x0000, + /* 0x0301 0x0314 0x0391 0x0345 */ + 0x1f8d, 0x0000, + /* 0x0342 0x0314 0x0391 0x0345 */ + 0x1f8f, 0x0000, + /* 0x0300 0x0313 0x0397 0x0345 */ + 0x1f9a, 0x0000, + /* 0x0301 0x0313 0x0397 0x0345 */ + 0x1f9c, 0x0000, + /* 0x0342 0x0313 0x0397 0x0345 */ + 0x1f9e, 0x0000, + /* 0x0300 0x0314 0x0397 0x0345 */ + 0x1f9b, 0x0000, + /* 0x0301 0x0314 0x0397 0x0345 */ + 0x1f9d, 0x0000, + /* 0x0342 0x0314 0x0397 0x0345 */ + 0x1f9f, 0x0000, + /* 0x0300 0x0313 0x03a9 0x0345 */ + 0x1faa, 0x0000, + /* 0x0301 0x0313 0x03a9 0x0345 */ + 0x1fac, 0x0000, + /* 0x0342 0x0313 0x03a9 0x0345 */ + 0x1fae, 0x0000, + /* 0x0300 0x0314 0x03a9 0x0345 */ + 0x1fab, 0x0000, + /* 0x0301 0x0314 0x03a9 0x0345 */ + 0x1fad, 0x0000, + /* 0x0342 0x0314 0x03a9 0x0345 */ + 0x1faf, 0x0000, + /* 0x0300 0x0313 0x03b1 0x0345 */ + 0x1f82, 0x0000, + /* 0x0301 0x0313 0x03b1 0x0345 */ + 0x1f84, 0x0000, + /* 0x0342 0x0313 0x03b1 0x0345 */ + 0x1f86, 0x0000, + /* 0x0300 0x0314 0x03b1 0x0345 */ + 0x1f83, 0x0000, + /* 0x0301 0x0314 0x03b1 0x0345 */ + 0x1f85, 0x0000, + /* 0x0342 0x0314 0x03b1 0x0345 */ + 0x1f87, 0x0000, + /* 0x0300 0x0313 0x03b7 0x0345 */ + 0x1f92, 0x0000, + /* 0x0301 0x0313 0x03b7 0x0345 */ + 0x1f94, 0x0000, + /* 0x0342 0x0313 0x03b7 
0x0345 */ + 0x1f96, 0x0000, + /* 0x0300 0x0314 0x03b7 0x0345 */ + 0x1f93, 0x0000, + /* 0x0301 0x0314 0x03b7 0x0345 */ + 0x1f95, 0x0000, + /* 0x0342 0x0314 0x03b7 0x0345 */ + 0x1f97, 0x0000, + /* 0x0300 0x0313 0x03c9 0x0345 */ + 0x1fa2, 0x0000, + /* 0x0301 0x0313 0x03c9 0x0345 */ + 0x1fa4, 0x0000, + /* 0x0342 0x0313 0x03c9 0x0345 */ + 0x1fa6, 0x0000, + /* 0x0300 0x0314 0x03c9 0x0345 */ + 0x1fa3, 0x0000, + /* 0x0301 0x0314 0x03c9 0x0345 */ + 0x1fa5, 0x0000, + /* 0x0342 0x0314 0x03c9 0x0345 */ + 0x1fa7, 0x0000, +}; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c new file mode 100644 index 00000000..a3f0bfcc --- /dev/null +++ b/fs/hfsplus/unicode.c @@ -0,0 +1,452 @@ +/* + * linux/fs/hfsplus/unicode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handler routines for unicode strings + */ + +#include +#include +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Fold the case of a unicode char, given the 16 bit value */ +/* Returns folded char, or 0 if ignorable */ +static inline u16 case_fold(u16 c) +{ + u16 tmp; + + tmp = hfsplus_case_fold_table[c >> 8]; + if (tmp) + tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; + else + tmp = c; + return tmp; +} + +/* Compare unicode strings, return values like normal strcmp */ +int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2) +{ + u16 len1, len2, c1, c2; + const hfsplus_unichr *p1, *p2; + + len1 = be16_to_cpu(s1->length); + len2 = be16_to_cpu(s2->length); + p1 = s1->unicode; + p2 = s2->unicode; + + while (1) { + c1 = c2 = 0; + + while (len1 && !c1) { + c1 = case_fold(be16_to_cpu(*p1)); + p1++; + len1--; + } + while (len2 && !c2) { + c2 = case_fold(be16_to_cpu(*p2)); + p2++; + len2--; + } + + if (c1 != c2) + return (c1 < c2) ? -1 : 1; + if (!c1 && !c2) + return 0; + } +} + +/* Compare names as a sequence of 16-bit unsigned integers */ +int hfsplus_strcmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2) +{ + u16 len1, len2, c1, c2; + const hfsplus_unichr *p1, *p2; + int len; + + len1 = be16_to_cpu(s1->length); + len2 = be16_to_cpu(s2->length); + p1 = s1->unicode; + p2 = s2->unicode; + + for (len = min(len1, len2); len > 0; len--) { + c1 = be16_to_cpu(*p1); + c2 = be16_to_cpu(*p2); + if (c1 != c2) + return c1 < c2 ? -1 : 1; + p1++; + p2++; + } + + return len1 < len2 ? -1 : + len1 > len2 ? 
1 : 0; +} + + +#define Hangul_SBase 0xac00 +#define Hangul_LBase 0x1100 +#define Hangul_VBase 0x1161 +#define Hangul_TBase 0x11a7 +#define Hangul_SCount 11172 +#define Hangul_LCount 19 +#define Hangul_VCount 21 +#define Hangul_TCount 28 +#define Hangul_NCount (Hangul_VCount * Hangul_TCount) + + +static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) +{ + int i, s, e; + + s = 1; + e = p[1]; + if (!e || cc < p[s * 2] || cc > p[e * 2]) + return NULL; + do { + i = (s + e) / 2; + if (cc > p[i * 2]) + s = i + 1; + else if (cc < p[i * 2]) + e = i - 1; + else + return hfsplus_compose_table + p[i * 2 + 1]; + } while (s <= e); + return NULL; +} + +int hfsplus_uni2asc(struct super_block *sb, + const struct hfsplus_unistr *ustr, + char *astr, int *len_p) +{ + const hfsplus_unichr *ip; + struct nls_table *nls = HFSPLUS_SB(sb)->nls; + u8 *op; + u16 cc, c0, c1; + u16 *ce1, *ce2; + int i, len, ustrlen, res, compose; + + op = astr; + ip = ustr->unicode; + ustrlen = be16_to_cpu(ustr->length); + len = *len_p; + ce1 = NULL; + compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + + while (ustrlen > 0) { + c0 = be16_to_cpu(*ip++); + ustrlen--; + /* search for single decomposed char */ + if (likely(compose)) + ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0); + if (ce1 && (cc = ce1[0])) { + /* start of a possibly decomposed Hangul char */ + if (cc != 0xffff) + goto done; + if (!ustrlen) + goto same; + c1 = be16_to_cpu(*ip) - Hangul_VBase; + if (c1 < Hangul_VCount) { + /* compose the Hangul char */ + cc = (c0 - Hangul_LBase) * Hangul_VCount; + cc = (cc + c1) * Hangul_TCount; + cc += Hangul_SBase; + ip++; + ustrlen--; + if (!ustrlen) + goto done; + c1 = be16_to_cpu(*ip) - Hangul_TBase; + if (c1 > 0 && c1 < Hangul_TCount) { + cc += c1; + ip++; + ustrlen--; + } + goto done; + } + } + while (1) { + /* main loop for common case of not composed chars */ + if (!ustrlen) + goto same; + c1 = be16_to_cpu(*ip); + if (likely(compose)) + ce1 = hfsplus_compose_lookup( + hfsplus_compose_table, c1); + if (ce1) + break; + switch (c0) { + case 0: + c0 = 0x2400; + break; + case '/': + c0 = ':'; + break; + } + res = nls->uni2char(c0, op, len); + if (res < 0) { + if (res == -ENAMETOOLONG) + goto out; + *op = '?'; + res = 1; + } + op += res; + len -= res; + c0 = c1; + ip++; + ustrlen--; + } + ce2 = hfsplus_compose_lookup(ce1, c0); + if (ce2) { + i = 1; + while (i < ustrlen) { + ce1 = hfsplus_compose_lookup(ce2, + be16_to_cpu(ip[i])); + if (!ce1) + break; + i++; + ce2 = ce1; + } + if ((cc = ce2[0])) { + ip += i; + ustrlen -= i; + goto done; + } + } +same: + switch (c0) { + case 0: + cc = 0x2400; + break; + case '/': + cc = ':'; + break; + default: + cc = c0; + } +done: + res = nls->uni2char(cc, op, len); + if (res < 0) { + if (res == -ENAMETOOLONG) + goto out; + *op = '?'; + res = 1; + } + op += res; + len -= res; + } + res = 0; +out: + *len_p = (char *)op - astr; + return res; +} + +/* + * Convert one or more ASCII characters into a single unicode character. + * Returns the number of ASCII characters corresponding to the unicode char. + */ +static inline int asc2unichar(struct super_block *sb, const char *astr, int len, + wchar_t *uc) +{ + int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); + if (size <= 0) { + *uc = '?'; + size = 1; + } + switch (*uc) { + case 0x2400: + *uc = 0; + break; + case ':': + *uc = '/'; + break; + } + return size; +} + +/* Decomposes a single unicode character. 
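+ *
+ * The lookup walks hfsplus_decompose_table as a four-level trie indexed by
+ * successive 4-bit nibbles of the character, high nibble first.  A zero
+ * entry (or 0xffff at the top level) means there is no decomposition.  In
+ * the final entry the low two bits give the length of the decomposition
+ * (1-3 code points) and the remaining bits, taken as off / 4, give the
+ * index of its first code point in the table.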
*/ +static inline u16 *decompose_unichar(wchar_t uc, int *size) +{ + int off; + + off = hfsplus_decompose_table[(uc >> 12) & 0xf]; + if (off == 0 || off == 0xffff) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + (uc & 0xf)]; + *size = off & 3; + if (*size == 0) + return NULL; + return hfsplus_decompose_table + (off / 4); +} + +int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, + const char *astr, int len) +{ + int size, dsize, decompose; + u16 *dstr, outlen = 0; + wchar_t c; + + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { + size = asc2unichar(sb, astr, len, &c); + + if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (outlen + dsize > HFSPLUS_MAX_STRLEN) + break; + do { + ustr->unicode[outlen++] = cpu_to_be16(*dstr++); + } while (--dsize > 0); + } else + ustr->unicode[outlen++] = cpu_to_be16(c); + + astr += size; + len -= size; + } + ustr->length = cpu_to_be16(outlen); + if (len > 0) + return -ENAMETOOLONG; + return 0; +} + +/* + * Hash a string to an integer as appropriate for the HFS+ filesystem. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. + */ +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) +{ + struct super_block *sb = dentry->d_sb; + const char *astr; + const u16 *dstr; + int casefold, decompose, size, len; + unsigned long hash; + wchar_t c; + u16 c2; + + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + hash = init_name_hash(); + astr = str->name; + len = str->len; + while (len > 0) { + int uninitialized_var(dsize); + size = asc2unichar(sb, astr, len, &c); + astr += size; + len -= size; + + if (decompose && (dstr = decompose_unichar(c, &dsize))) { + do { + c2 = *dstr++; + if (!casefold || (c2 = case_fold(c2))) + hash = partial_name_hash(c2, hash); + } while (--dsize > 0); + } else { + c2 = c; + if (!casefold || (c2 = case_fold(c2))) + hash = partial_name_hash(c2, hash); + } + } + str->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare strings with HFS+ filename ordering. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. 
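+ *
+ * Comparison proceeds one decomposed code point at a time: characters that
+ * case-fold to 0 are ignorable and skipped (when folding is enabled), the
+ * first differing code point decides the ordering, and if one name is a
+ * prefix of the other the shorter name sorts first.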
+ */ +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct super_block *sb = parent->d_sb; + int casefold, decompose, size; + int dsize1, dsize2, len1, len2; + const u16 *dstr1, *dstr2; + const char *astr1, *astr2; + u16 c1, c2; + wchar_t c; + + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + astr1 = str; + len1 = len; + astr2 = name->name; + len2 = name->len; + dsize1 = dsize2 = 0; + dstr1 = dstr2 = NULL; + + while (len1 > 0 && len2 > 0) { + if (!dsize1) { + size = asc2unichar(sb, astr1, len1, &c); + astr1 += size; + len1 -= size; + + if (decompose) + dstr1 = decompose_unichar(c, &dsize1); + if (!decompose || !dstr1) { + c1 = c; + dstr1 = &c1; + dsize1 = 1; + } + } + + if (!dsize2) { + size = asc2unichar(sb, astr2, len2, &c); + astr2 += size; + len2 -= size; + + if (decompose) + dstr2 = decompose_unichar(c, &dsize2); + if (!decompose || !dstr2) { + c2 = c; + dstr2 = &c2; + dsize2 = 1; + } + } + + c1 = *dstr1; + c2 = *dstr2; + if (casefold) { + if (!(c1 = case_fold(c1))) { + dstr1++; + dsize1--; + continue; + } + if (!(c2 = case_fold(c2))) { + dstr2++; + dsize2--; + continue; + } + } + if (c1 < c2) + return -1; + else if (c1 > c2) + return 1; + + dstr1++; + dsize1--; + dstr2++; + dsize2--; + } + + if (len1 < len2) + return -1; + if (len1 > len2) + return 1; + return 0; +} diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c new file mode 100644 index 00000000..7b8112da --- /dev/null +++ b/fs/hfsplus/wrapper.c @@ -0,0 +1,283 @@ +/* + * linux/fs/hfsplus/wrapper.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of HFS wrappers around HFS+ volumes + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +struct hfsplus_wd { + u32 ablk_size; + u16 ablk_start; + u16 embed_start; + u16 embed_count; +}; + +static void hfsplus_end_io_sync(struct bio *bio, int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + complete(bio->bi_private); +} + +/* + * hfsplus_submit_bio - Perfrom block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. + */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct bio *bio; + int ret = 0; + unsigned int io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. 
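+ *
+ * Worked example (illustrative numbers): with io_size = 4096 and a request
+ * for sector 11, start = 11 * 512 = 5632, offset = 5632 & 4095 = 1536, and
+ * sector is rounded down to 8, the start of the containing 4096-byte block,
+ * so the requested data ends up 1536 bytes into @buf.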
+ */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); + + bio = bio_alloc(GFP_NOIO, 1); + bio->bi_sector = sector; + bio->bi_bdev = sb->s_bdev; + bio->bi_end_io = hfsplus_end_io_sync; + bio->bi_private = &wait; + + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } + + submit_bio(rw, bio); + wait_for_completion(&wait); + + if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + +out: + bio_put(bio); + return ret < 0 ? ret : 0; +} + +static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) +{ + u32 extent; + u16 attrib; + __be16 sig; + + sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG); + if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) && + sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) + return 0; + + attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB)); + if (!(attrib & HFSP_WRAP_ATTRIB_SLOCK) || + !(attrib & HFSP_WRAP_ATTRIB_SPARED)) + return 0; + + wd->ablk_size = + be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); + if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) + return 0; + if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) + return 0; + wd->ablk_start = + be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); + + extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); + wd->embed_start = (extent >> 16) & 0xFFFF; + wd->embed_count = extent & 0xFFFF; + + return 1; +} + +static int hfsplus_get_last_session(struct super_block *sb, + sector_t *start, sector_t *size) +{ + struct cdrom_multisession ms_info; + struct cdrom_tocentry te; + int res; + + /* default values */ + *start = 0; + *size = sb->s_bdev->bd_inode->i_size >> 9; + + if (HFSPLUS_SB(sb)->session >= 0) { + te.cdte_track = HFSPLUS_SB(sb)->session; + te.cdte_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, + CDROMREADTOCENTRY, (unsigned long)&te); + if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { + *start = (sector_t)te.cdte_addr.lba << 2; + return 0; + } + printk(KERN_ERR "hfs: invalid session number or type of track\n"); + return -EINVAL; + } + ms_info.addr_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, + (unsigned long)&ms_info); + if (!res && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; + return 0; +} + +/* Find the volume header and fill in some minimum bits in superblock */ +/* Takes in super block, returns true if good data read */ +int hfsplus_read_wrapper(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_wd wd; + sector_t part_start, part_size; + u32 blocksize; + int error = 0; + + error = -EINVAL; + blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); + if (!blocksize) + goto out; + + if (hfsplus_get_last_session(sb, &part_start, &part_size)) + goto out; + if ((u64)part_start + part_size > 0x100000000ULL) { + pr_err("hfs: volumes larger than 2TB are not supported yet\n"); + goto out; + } + + error = -ENOMEM; + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) + goto out; + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) + goto out_free_vhdr; + +reread: + error = hfsplus_submit_bio(sb, 
part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + switch (sbi->s_vhdr->signature) { + case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): + set_bit(HFSPLUS_SB_HFSX, &sbi->flags); + /*FALLTHRU*/ + case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): + break; + case cpu_to_be16(HFSP_WRAP_MAGIC): + if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) + goto out_free_backup_vhdr; + wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; + part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; + part_size = wd.embed_count * wd.ablk_size; + goto reread; + default: + /* + * Check for a partition block. + * + * (should do this only for cdrom/loop though) + */ + if (hfs_part_find(sb, &part_start, &part_size)) + goto out_free_backup_vhdr; + goto reread; + } + + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { + printk(KERN_WARNING + "hfs: invalid secondary volume header\n"); + goto out_free_backup_vhdr; + } + + blocksize = be32_to_cpu(sbi->s_vhdr->blocksize); + + /* + * Block size must be at least as large as a sector and a multiple of 2. + */ + if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) + goto out_free_backup_vhdr; + sbi->alloc_blksz = blocksize; + sbi->alloc_blksz_shift = 0; + while ((blocksize >>= 1) != 0) + sbi->alloc_blksz_shift++; + blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); + + /* + * Align block size to block offset. + */ + while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) + blocksize >>= 1; + + if (sb_set_blocksize(sb, blocksize) != blocksize) { + printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", + blocksize); + goto out_free_backup_vhdr; + } + + sbi->blockoffset = + part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); + sbi->part_start = part_start; + sbi->sect_count = part_size; + sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + return 0; + +out_free_backup_vhdr: + kfree(sbi->s_backup_vhdr_buf); +out_free_vhdr: + kfree(sbi->s_vhdr_buf); +out: + return error; +} diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile new file mode 100644 index 00000000..d5beaffa --- /dev/null +++ b/fs/hostfs/Makefile @@ -0,0 +1,11 @@ +# +# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) +# Licensed under the GPL +# + +hostfs-objs := hostfs_kern.o hostfs_user.o + +obj-y := +obj-$(CONFIG_HOSTFS) += hostfs.o + +include arch/um/scripts/Makefile.rules diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h new file mode 100644 index 00000000..bf15a430 --- /dev/null +++ b/fs/hostfs/hostfs.h @@ -0,0 +1,96 @@ +#ifndef __UM_FS_HOSTFS +#define __UM_FS_HOSTFS + +#include "os.h" + +/* + * These are exactly the same definitions as in fs.h, but the names are + * changed so that this file can be included in both kernel and user files. + */ + +#define HOSTFS_ATTR_MODE 1 +#define HOSTFS_ATTR_UID 2 +#define HOSTFS_ATTR_GID 4 +#define HOSTFS_ATTR_SIZE 8 +#define HOSTFS_ATTR_ATIME 16 +#define HOSTFS_ATTR_MTIME 32 +#define HOSTFS_ATTR_CTIME 64 +#define HOSTFS_ATTR_ATIME_SET 128 +#define HOSTFS_ATTR_MTIME_SET 256 + +/* These two are unused by hostfs. 
*/ +#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ +#define HOSTFS_ATTR_ATTR_FLAG 1024 + +/* + * If you are very careful, you'll notice that these two are missing: + * + * #define ATTR_KILL_SUID 2048 + * #define ATTR_KILL_SGID 4096 + * + * and this is because they were added in 2.5 development. + * Actually, they are not needed by most ->setattr() methods - they are set by + * callers of notify_change() to notify that the setuid/setgid bits must be + * dropped. + * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE + * is on, and remove the appropriate bits from attr->ia_mode (attr is a + * "struct iattr *"). -BlaisorBlade + */ + +struct hostfs_iattr { + unsigned int ia_valid; + mode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; +}; + +struct hostfs_stat { + unsigned long long ino; + unsigned int mode; + unsigned int nlink; + unsigned int uid; + unsigned int gid; + unsigned long long size; + struct timespec atime, mtime, ctime; + unsigned int blksize; + unsigned long long blocks; + unsigned int maj; + unsigned int min; +}; + +extern int stat_file(const char *path, struct hostfs_stat *p, int fd); +extern int access_file(char *path, int r, int w, int x); +extern int open_file(char *path, int r, int w, int append); +extern void *open_dir(char *path, int *err_out); +extern char *read_dir(void *stream, unsigned long long *pos, + unsigned long long *ino_out, int *len_out); +extern void close_file(void *stream); +extern int replace_file(int oldfd, int fd); +extern void close_dir(void *stream); +extern int read_file(int fd, unsigned long long *offset, char *buf, int len); +extern int write_file(int fd, unsigned long long *offset, const char *buf, + int len); +extern int lseek_file(int fd, long long offset, int whence); +extern int fsync_file(int fd, int datasync); +extern int file_create(char *name, int ur, int uw, int ux, int gr, + int gw, int gx, int or, int ow, int ox); +extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd); +extern int make_symlink(const char *from, const char *to); +extern int unlink_file(const char *file); +extern int do_mkdir(const char *file, int mode); +extern int do_rmdir(const char *file); +extern int do_mknod(const char *file, int mode, unsigned int major, + unsigned int minor); +extern int link_file(const char *from, const char *to); +extern int hostfs_do_readlink(char *file, char *buf, int size); +extern int rename_file(char *from, char *to); +extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out); + +#endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c new file mode 100644 index 00000000..2638c834 --- /dev/null +++ b/fs/hostfs/hostfs_kern.c @@ -0,0 +1,1004 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + * + * Ported the filesystem routines to 2.5. 
+ * 2003-02-10 Petr Baudis + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hostfs.h" +#include "init.h" +#include "kern.h" + +struct hostfs_inode_info { + int fd; + fmode_t mode; + struct inode vfs_inode; +}; + +static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) +{ + return list_entry(inode, struct hostfs_inode_info, vfs_inode); +} + +#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) + +static int hostfs_d_delete(const struct dentry *dentry) +{ + return 1; +} + +static const struct dentry_operations hostfs_dentry_ops = { + .d_delete = hostfs_d_delete, +}; + +/* Changed in hostfs_args before the kernel starts running */ +static char *root_ino = ""; +static int append = 0; + +#define HOSTFS_SUPER_MAGIC 0x00c0ffee + +static const struct inode_operations hostfs_iops; +static const struct inode_operations hostfs_dir_iops; +static const struct inode_operations hostfs_link_iops; + +#ifndef MODULE +static int __init hostfs_args(char *options, int *add) +{ + char *ptr; + + ptr = strchr(options, ','); + if (ptr != NULL) + *ptr++ = '\0'; + if (*options != '\0') + root_ino = options; + + options = ptr; + while (options) { + ptr = strchr(options, ','); + if (ptr != NULL) + *ptr++ = '\0'; + if (*options != '\0') { + if (!strcmp(options, "append")) + append = 1; + else printf("hostfs_args - unsupported option - %s\n", + options); + } + options = ptr; + } + return 0; +} + +__uml_setup("hostfs=", hostfs_args, +"hostfs=,,...\n" +" This is used to set hostfs parameters. The root directory argument\n" +" is used to confine all hostfs mounts to within the specified directory\n" +" tree on the host. If this isn't specified, then a user inside UML can\n" +" mount anything on the host that's accessible to the user that's running\n" +" it.\n" +" The only flag currently supported is 'append', which specifies that all\n" +" files opened by hostfs will be opened in append mode.\n\n" +); +#endif + +static char *__dentry_name(struct dentry *dentry, char *name) +{ + char *p = dentry_path_raw(dentry, name, PATH_MAX); + char *root; + size_t len; + + root = dentry->d_sb->s_fs_info; + len = strlen(root); + if (IS_ERR(p)) { + __putname(name); + return NULL; + } + strlcpy(name, root, PATH_MAX); + if (len > p - name) { + __putname(name); + return NULL; + } + if (p > name + len) { + char *s = name + len; + while ((*s++ = *p++) != '\0') + ; + } + return name; +} + +static char *dentry_name(struct dentry *dentry) +{ + char *name = __getname(); + if (!name) + return NULL; + + return __dentry_name(dentry, name); /* will unlock */ +} + +static char *inode_name(struct inode *ino) +{ + struct dentry *dentry; + char *name; + + dentry = d_find_alias(ino); + if (!dentry) + return NULL; + + name = dentry_name(dentry); + + dput(dentry); + + return name; +} + +static char *follow_link(char *link) +{ + int len, n; + char *name, *resolved, *end; + + len = 64; + while (1) { + n = -ENOMEM; + name = kmalloc(len, GFP_KERNEL); + if (name == NULL) + goto out; + + n = hostfs_do_readlink(link, name, len); + if (n < len) + break; + len *= 2; + kfree(name); + } + if (n < 0) + goto out_free; + + if (*name == '/') + return name; + + end = strrchr(link, '/'); + if (end == NULL) + return name; + + *(end + 1) = '\0'; + len = strlen(link) + strlen(name) + 1; + + resolved = kmalloc(len, GFP_KERNEL); + if (resolved == NULL) { + n = -ENOMEM; + goto out_free; + } + + sprintf(resolved, "%s%s", link, name); + kfree(name); + kfree(link); + return resolved; + + 
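+ /*
+  * Error exits: free the partially read link target, if any, and hand the
+  * error back to the caller as an ERR_PTR.
+  */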
out_free: + kfree(name); + out: + return ERR_PTR(n); +} + +static struct inode *hostfs_iget(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + return inode; +} + +int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + /* + * do_statfs uses struct statfs64 internally, but the linux kernel + * struct statfs still has 32-bit versions for most of these fields, + * so we convert them here + */ + int err; + long long f_blocks; + long long f_bfree; + long long f_bavail; + long long f_files; + long long f_ffree; + + err = do_statfs(dentry->d_sb->s_fs_info, + &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, + &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), + &sf->f_namelen); + if (err) + return err; + sf->f_blocks = f_blocks; + sf->f_bfree = f_bfree; + sf->f_bavail = f_bavail; + sf->f_files = f_files; + sf->f_ffree = f_ffree; + sf->f_type = HOSTFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hostfs_alloc_inode(struct super_block *sb) +{ + struct hostfs_inode_info *hi; + + hi = kzalloc(sizeof(*hi), GFP_KERNEL); + if (hi == NULL) + return NULL; + hi->fd = -1; + inode_init_once(&hi->vfs_inode); + return &hi->vfs_inode; +} + +static void hostfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (HOSTFS_I(inode)->fd != -1) { + close_file(&HOSTFS_I(inode)->fd); + HOSTFS_I(inode)->fd = -1; + } +} + +static void hostfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kfree(HOSTFS_I(inode)); +} + +static void hostfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hostfs_i_callback); +} + +static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + const char *root_path = vfs->mnt_sb->s_fs_info; + size_t offset = strlen(root_ino) + 1; + + if (strlen(root_path) > offset) + seq_printf(seq, ",%s", root_path + offset); + + return 0; +} + +static const struct super_operations hostfs_sbops = { + .alloc_inode = hostfs_alloc_inode, + .destroy_inode = hostfs_destroy_inode, + .evict_inode = hostfs_evict_inode, + .statfs = hostfs_statfs, + .show_options = hostfs_show_options, +}; + +int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) +{ + void *dir; + char *name; + unsigned long long next, ino; + int error, len; + + name = dentry_name(file->f_path.dentry); + if (name == NULL) + return -ENOMEM; + dir = open_dir(name, &error); + __putname(name); + if (dir == NULL) + return -error; + next = file->f_pos; + while ((name = read_dir(dir, &next, &ino, &len)) != NULL) { + error = (*filldir)(ent, name, len, file->f_pos, + ino, DT_UNKNOWN); + if (error) break; + file->f_pos = next; + } + close_dir(dir); + return 0; +} + +int hostfs_file_open(struct inode *ino, struct file *file) +{ + static DEFINE_MUTEX(open_mutex); + char *name; + fmode_t mode = 0; + int err; + int r = 0, w = 0, fd; + + mode = file->f_mode & (FMODE_READ | FMODE_WRITE); + if ((mode & HOSTFS_I(ino)->mode) == mode) + return 0; + + mode |= HOSTFS_I(ino)->mode; + +retry: + if (mode & FMODE_READ) + r = 1; + if (mode & FMODE_WRITE) + w = 1; + if (w) + r = 1; + + name = dentry_name(file->f_path.dentry); + if (name == NULL) + return -ENOMEM; + + fd = open_file(name, r, w, append); + __putname(name); + if (fd < 0) + return fd; + + mutex_lock(&open_mutex); + /* somebody else had handled it first? 
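+ * Re-check under open_mutex: if the inode's recorded mode already covers
+ * the requested access we are done; if it has grown beyond what we opened
+ * the host file with, widen the mode and retry; otherwise our new fd is
+ * installed, or spliced over the existing one via replace_file().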
*/ + if ((mode & HOSTFS_I(ino)->mode) == mode) { + mutex_unlock(&open_mutex); + return 0; + } + if ((mode | HOSTFS_I(ino)->mode) != mode) { + mode |= HOSTFS_I(ino)->mode; + mutex_unlock(&open_mutex); + close_file(&fd); + goto retry; + } + if (HOSTFS_I(ino)->fd == -1) { + HOSTFS_I(ino)->fd = fd; + } else { + err = replace_file(fd, HOSTFS_I(ino)->fd); + close_file(&fd); + if (err < 0) { + mutex_unlock(&open_mutex); + return err; + } + } + HOSTFS_I(ino)->mode = mode; + mutex_unlock(&open_mutex); + + return 0; +} + +int hostfs_fsync(struct file *file, int datasync) +{ + return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync); +} + +static const struct file_operations hostfs_file_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .splice_read = generic_file_splice_read, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .write = do_sync_write, + .mmap = generic_file_mmap, + .open = hostfs_file_open, + .release = NULL, + .fsync = hostfs_fsync, +}; + +static const struct file_operations hostfs_dir_fops = { + .llseek = generic_file_llseek, + .readdir = hostfs_readdir, + .read = generic_read_dir, +}; + +int hostfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *buffer; + unsigned long long base; + int count = PAGE_CACHE_SIZE; + int end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int err; + + if (page->index >= end_index) + count = inode->i_size & (PAGE_CACHE_SIZE-1); + + buffer = kmap(page); + base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT; + + err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); + if (err != count) { + ClearPageUptodate(page); + goto out; + } + + if (base > inode->i_size) + inode->i_size = base; + + if (PageError(page)) + ClearPageError(page); + err = 0; + + out: + kunmap(page); + + unlock_page(page); + return err; +} + +int hostfs_readpage(struct file *file, struct page *page) +{ + char *buffer; + long long start; + int err = 0; + + start = (long long) page->index << PAGE_CACHE_SHIFT; + buffer = kmap(page); + err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, + PAGE_CACHE_SIZE); + if (err < 0) + goto out; + + memset(&buffer[err], 0, PAGE_CACHE_SIZE - err); + + flush_dcache_page(page); + SetPageUptodate(page); + if (PageError(page)) ClearPageError(page); + err = 0; + out: + kunmap(page); + unlock_page(page); + return err; +} + +int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + *pagep = grab_cache_page_write_begin(mapping, index, flags); + if (!*pagep) + return -ENOMEM; + return 0; +} + +int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + void *buffer; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int err; + + buffer = kmap(page); + err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); + kunmap(page); + + if (!PageUptodate(page) && err == PAGE_CACHE_SIZE) + SetPageUptodate(page); + + /* + * If err > 0, write_file has added err to pos, so we are comparing + * i_size against the last byte written. 
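+ * (write_file() advances the offset it was passed by the number of bytes
+ * actually written on the host, so on return pos points just past the end
+ * of the data that reached the file.)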
+ */ + if (err > 0 && (pos > inode->i_size)) + inode->i_size = pos; + unlock_page(page); + page_cache_release(page); + + return err; +} + +static const struct address_space_operations hostfs_aops = { + .writepage = hostfs_writepage, + .readpage = hostfs_readpage, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = hostfs_write_begin, + .write_end = hostfs_write_end, +}; + +static int read_name(struct inode *ino, char *name) +{ + dev_t rdev; + struct hostfs_stat st; + int err = stat_file(name, &st, -1); + if (err) + return err; + + /* Reencode maj and min with the kernel encoding.*/ + rdev = MKDEV(st.maj, st.min); + + switch (st.mode & S_IFMT) { + case S_IFLNK: + ino->i_op = &hostfs_link_iops; + break; + case S_IFDIR: + ino->i_op = &hostfs_dir_iops; + ino->i_fop = &hostfs_dir_fops; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + init_special_inode(ino, st.mode & S_IFMT, rdev); + ino->i_op = &hostfs_iops; + break; + + default: + ino->i_op = &hostfs_iops; + ino->i_fop = &hostfs_file_fops; + ino->i_mapping->a_ops = &hostfs_aops; + } + + ino->i_ino = st.ino; + ino->i_mode = st.mode; + ino->i_nlink = st.nlink; + ino->i_uid = st.uid; + ino->i_gid = st.gid; + ino->i_atime = st.atime; + ino->i_mtime = st.mtime; + ino->i_ctime = st.ctime; + ino->i_size = st.size; + ino->i_blocks = st.blocks; + return 0; +} + +int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + char *name; + int error, fd; + + inode = hostfs_iget(dir->i_sb); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto out; + } + + error = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + fd = file_create(name, + mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, + mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, + mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); + if (fd < 0) + error = fd; + else + error = read_name(inode, name); + + __putname(name); + if (error) + goto out_put; + + HOSTFS_I(inode)->fd = fd; + HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; + d_instantiate(dentry, inode); + return 0; + + out_put: + iput(inode); + out: + return error; +} + +struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + char *name; + int err; + + inode = hostfs_iget(ino->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + err = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + err = read_name(inode, name); + + __putname(name); + if (err == -ENOENT) { + iput(inode); + inode = NULL; + } + else if (err) + goto out_put; + + d_add(dentry, inode); + return NULL; + + out_put: + iput(inode); + out: + return ERR_PTR(err); +} + +int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +{ + char *from_name, *to_name; + int err; + + if ((from_name = dentry_name(from)) == NULL) + return -ENOMEM; + to_name = dentry_name(to); + if (to_name == NULL) { + __putname(from_name); + return -ENOMEM; + } + err = link_file(to_name, from_name); + __putname(from_name); + __putname(to_name); + return err; +} + +int hostfs_unlink(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if (append) + return -EPERM; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + + err = unlink_file(file); + __putname(file); + return err; +} + +int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == 
NULL) + return -ENOMEM; + err = make_symlink(file, to); + __putname(file); + return err; +} + +int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = do_mkdir(file, mode); + __putname(file); + return err; +} + +int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = do_rmdir(file); + __putname(file); + return err; +} + +int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode; + char *name; + int err; + + inode = hostfs_iget(dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + err = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + init_special_inode(inode, mode, dev); + err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); + if (!err) + goto out_free; + + err = read_name(inode, name); + __putname(name); + if (err) + goto out_put; + if (err) + goto out_put; + + d_instantiate(dentry, inode); + return 0; + + out_free: + __putname(name); + out_put: + iput(inode); + out: + return err; +} + +int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) +{ + char *from_name, *to_name; + int err; + + if ((from_name = dentry_name(from)) == NULL) + return -ENOMEM; + if ((to_name = dentry_name(to)) == NULL) { + __putname(from_name); + return -ENOMEM; + } + err = rename_file(from_name, to_name); + __putname(from_name); + __putname(to_name); + return err; +} + +int hostfs_permission(struct inode *ino, int desired, unsigned int flags) +{ + char *name; + int r = 0, w = 0, x = 0, err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + if (desired & MAY_READ) r = 1; + if (desired & MAY_WRITE) w = 1; + if (desired & MAY_EXEC) x = 1; + name = inode_name(ino); + if (name == NULL) + return -ENOMEM; + + if (S_ISCHR(ino->i_mode) || S_ISBLK(ino->i_mode) || + S_ISFIFO(ino->i_mode) || S_ISSOCK(ino->i_mode)) + err = 0; + else + err = access_file(name, r, w, x); + __putname(name); + if (!err) + err = generic_permission(ino, desired, flags, NULL); + return err; +} + +int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct hostfs_iattr attrs; + char *name; + int err; + + int fd = HOSTFS_I(inode)->fd; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (append) + attr->ia_valid &= ~ATTR_SIZE; + + attrs.ia_valid = 0; + if (attr->ia_valid & ATTR_MODE) { + attrs.ia_valid |= HOSTFS_ATTR_MODE; + attrs.ia_mode = attr->ia_mode; + } + if (attr->ia_valid & ATTR_UID) { + attrs.ia_valid |= HOSTFS_ATTR_UID; + attrs.ia_uid = attr->ia_uid; + } + if (attr->ia_valid & ATTR_GID) { + attrs.ia_valid |= HOSTFS_ATTR_GID; + attrs.ia_gid = attr->ia_gid; + } + if (attr->ia_valid & ATTR_SIZE) { + attrs.ia_valid |= HOSTFS_ATTR_SIZE; + attrs.ia_size = attr->ia_size; + } + if (attr->ia_valid & ATTR_ATIME) { + attrs.ia_valid |= HOSTFS_ATTR_ATIME; + attrs.ia_atime = attr->ia_atime; + } + if (attr->ia_valid & ATTR_MTIME) { + attrs.ia_valid |= HOSTFS_ATTR_MTIME; + attrs.ia_mtime = attr->ia_mtime; + } + if (attr->ia_valid & ATTR_CTIME) { + attrs.ia_valid |= HOSTFS_ATTR_CTIME; + attrs.ia_ctime = attr->ia_ctime; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; + } + if (attr->ia_valid & ATTR_MTIME_SET) { + attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; + } + name = dentry_name(dentry); 
+ if (name == NULL) + return -ENOMEM; + err = set_attr(name, &attrs, fd); + __putname(name); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (err) + return err; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static const struct inode_operations hostfs_iops = { + .create = hostfs_create, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename = hostfs_rename, + .permission = hostfs_permission, + .setattr = hostfs_setattr, +}; + +static const struct inode_operations hostfs_dir_iops = { + .create = hostfs_create, + .lookup = hostfs_lookup, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename = hostfs_rename, + .permission = hostfs_permission, + .setattr = hostfs_setattr, +}; + +static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *link = __getname(); + if (link) { + char *path = dentry_name(dentry); + int err = -ENOMEM; + if (path) { + err = hostfs_do_readlink(path, link, PATH_MAX); + if (err == PATH_MAX) + err = -E2BIG; + __putname(path); + } + if (err < 0) { + __putname(link); + link = ERR_PTR(err); + } + } else { + link = ERR_PTR(-ENOMEM); + } + + nd_set_link(nd, link); + return NULL; +} + +static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +static const struct inode_operations hostfs_link_iops = { + .readlink = generic_readlink, + .follow_link = hostfs_follow_link, + .put_link = hostfs_put_link, +}; + +static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + char *host_root_path, *req_root = d; + int err; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HOSTFS_SUPER_MAGIC; + sb->s_op = &hostfs_sbops; + sb->s_d_op = &hostfs_dentry_ops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* NULL is printed as by sprintf: avoid that. 
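+ * A NULL mount-data string would otherwise be rendered as a placeholder
+ * token rather than an empty string in the host path assembled by sprintf()
+ * below, so substitute an empty string first.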
*/ + if (req_root == NULL) + req_root = ""; + + err = -ENOMEM; + sb->s_fs_info = host_root_path = + kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + if (host_root_path == NULL) + goto out; + + sprintf(host_root_path, "%s/%s", root_ino, req_root); + + root_inode = new_inode(sb); + if (!root_inode) + goto out; + + err = read_name(root_inode, host_root_path); + if (err) + goto out_put; + + if (S_ISLNK(root_inode->i_mode)) { + char *name = follow_link(host_root_path); + if (IS_ERR(name)) + err = PTR_ERR(name); + else + err = read_name(root_inode, name); + kfree(name); + if (err) + goto out_put; + } + + err = -ENOMEM; + sb->s_root = d_alloc_root(root_inode); + if (sb->s_root == NULL) + goto out_put; + + return 0; + +out_put: + iput(root_inode); +out: + return err; +} + +static struct dentry *hostfs_read_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return mount_nodev(type, flags, data, hostfs_fill_sb_common); +} + +static void hostfs_kill_sb(struct super_block *s) +{ + kill_anon_super(s); + kfree(s->s_fs_info); +} + +static struct file_system_type hostfs_type = { + .owner = THIS_MODULE, + .name = "hostfs", + .mount = hostfs_read_sb, + .kill_sb = hostfs_kill_sb, + .fs_flags = 0, +}; + +static int __init init_hostfs(void) +{ + return register_filesystem(&hostfs_type); +} + +static void __exit exit_hostfs(void) +{ + unregister_filesystem(&hostfs_type); +} + +module_init(init_hostfs) +module_exit(exit_hostfs) +MODULE_LICENSE("GPL"); diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c new file mode 100644 index 00000000..d51a9838 --- /dev/null +++ b/fs/hostfs/hostfs_user.c @@ -0,0 +1,387 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hostfs.h" +#include "os.h" +#include "user.h" +#include + +static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +{ + p->ino = buf->st_ino; + p->mode = buf->st_mode; + p->nlink = buf->st_nlink; + p->uid = buf->st_uid; + p->gid = buf->st_gid; + p->size = buf->st_size; + p->atime.tv_sec = buf->st_atime; + p->atime.tv_nsec = 0; + p->ctime.tv_sec = buf->st_ctime; + p->ctime.tv_nsec = 0; + p->mtime.tv_sec = buf->st_mtime; + p->mtime.tv_nsec = 0; + p->blksize = buf->st_blksize; + p->blocks = buf->st_blocks; + p->maj = os_major(buf->st_rdev); + p->min = os_minor(buf->st_rdev); +} + +int stat_file(const char *path, struct hostfs_stat *p, int fd) +{ + struct stat64 buf; + + if (fd >= 0) { + if (fstat64(fd, &buf) < 0) + return -errno; + } else if (lstat64(path, &buf) < 0) { + return -errno; + } + stat64_to_hostfs(&buf, p); + return 0; +} + +int access_file(char *path, int r, int w, int x) +{ + int mode = 0; + + if (r) + mode = R_OK; + if (w) + mode |= W_OK; + if (x) + mode |= X_OK; + if (access(path, mode) != 0) + return -errno; + else return 0; +} + +int open_file(char *path, int r, int w, int append) +{ + int mode = 0, fd; + + if (r && !w) + mode = O_RDONLY; + else if (!r && w) + mode = O_WRONLY; + else if (r && w) + mode = O_RDWR; + else panic("Impossible mode in open_file"); + + if (append) + mode |= O_APPEND; + fd = open64(path, mode); + if (fd < 0) + return -errno; + else return fd; +} + +void *open_dir(char *path, int *err_out) +{ + DIR *dir; + + dir = opendir(path); + *err_out = errno; + + return dir; +} + +char *read_dir(void *stream, unsigned long long *pos, + unsigned long long *ino_out, int 
*len_out) +{ + DIR *dir = stream; + struct dirent *ent; + + seekdir(dir, *pos); + ent = readdir(dir); + if (ent == NULL) + return NULL; + *len_out = strlen(ent->d_name); + *ino_out = ent->d_ino; + *pos = telldir(dir); + return ent->d_name; +} + +int read_file(int fd, unsigned long long *offset, char *buf, int len) +{ + int n; + + n = pread64(fd, buf, len, *offset); + if (n < 0) + return -errno; + *offset += n; + return n; +} + +int write_file(int fd, unsigned long long *offset, const char *buf, int len) +{ + int n; + + n = pwrite64(fd, buf, len, *offset); + if (n < 0) + return -errno; + *offset += n; + return n; +} + +int lseek_file(int fd, long long offset, int whence) +{ + int ret; + + ret = lseek64(fd, offset, whence); + if (ret < 0) + return -errno; + return 0; +} + +int fsync_file(int fd, int datasync) +{ + int ret; + if (datasync) + ret = fdatasync(fd); + else + ret = fsync(fd); + + if (ret < 0) + return -errno; + return 0; +} + +int replace_file(int oldfd, int fd) +{ + return dup2(oldfd, fd); +} + +void close_file(void *stream) +{ + close(*((int *) stream)); +} + +void close_dir(void *stream) +{ + closedir(stream); +} + +int file_create(char *name, int ur, int uw, int ux, int gr, + int gw, int gx, int or, int ow, int ox) +{ + int mode, fd; + + mode = 0; + mode |= ur ? S_IRUSR : 0; + mode |= uw ? S_IWUSR : 0; + mode |= ux ? S_IXUSR : 0; + mode |= gr ? S_IRGRP : 0; + mode |= gw ? S_IWGRP : 0; + mode |= gx ? S_IXGRP : 0; + mode |= or ? S_IROTH : 0; + mode |= ow ? S_IWOTH : 0; + mode |= ox ? S_IXOTH : 0; + fd = open64(name, O_CREAT | O_RDWR, mode); + if (fd < 0) + return -errno; + return fd; +} + +int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) +{ + struct hostfs_stat st; + struct timeval times[2]; + int err, ma; + + if (attrs->ia_valid & HOSTFS_ATTR_MODE) { + if (fd >= 0) { + if (fchmod(fd, attrs->ia_mode) != 0) + return -errno; + } else if (chmod(file, attrs->ia_mode) != 0) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_UID) { + if (fd >= 0) { + if (fchown(fd, attrs->ia_uid, -1)) + return -errno; + } else if (chown(file, attrs->ia_uid, -1)) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_GID) { + if (fd >= 0) { + if (fchown(fd, -1, attrs->ia_gid)) + return -errno; + } else if (chown(file, -1, attrs->ia_gid)) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_SIZE) { + if (fd >= 0) { + if (ftruncate(fd, attrs->ia_size)) + return -errno; + } else if (truncate(file, attrs->ia_size)) { + return -errno; + } + } + + /* + * Update accessed and/or modified time, in two parts: first set + * times according to the changes to perform, and then call futimes() + * or utimes() to apply them. 
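+ *
+ * times[0] holds the access time and times[1] the modification time; both
+ * are seeded with the file's current values from stat_file() so that
+ * setting only one of them leaves the other unchanged, and tv_nsec is
+ * converted to the microseconds expected by the timeval-based calls.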
+ */ + ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET); + if (attrs->ia_valid & ma) { + err = stat_file(file, &st, fd); + if (err != 0) + return err; + + times[0].tv_sec = st.atime.tv_sec; + times[0].tv_usec = st.atime.tv_nsec / 1000; + times[1].tv_sec = st.mtime.tv_sec; + times[1].tv_usec = st.mtime.tv_nsec / 1000; + + if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { + times[0].tv_sec = attrs->ia_atime.tv_sec; + times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000; + } + if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) { + times[1].tv_sec = attrs->ia_mtime.tv_sec; + times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000; + } + + if (fd >= 0) { + if (futimes(fd, times) != 0) + return -errno; + } else if (utimes(file, times) != 0) { + return -errno; + } + } + + /* Note: ctime is not handled */ + if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) { + err = stat_file(file, &st, fd); + attrs->ia_atime = st.atime; + attrs->ia_mtime = st.mtime; + if (err != 0) + return err; + } + return 0; +} + +int make_symlink(const char *from, const char *to) +{ + int err; + + err = symlink(to, from); + if (err) + return -errno; + return 0; +} + +int unlink_file(const char *file) +{ + int err; + + err = unlink(file); + if (err) + return -errno; + return 0; +} + +int do_mkdir(const char *file, int mode) +{ + int err; + + err = mkdir(file, mode); + if (err) + return -errno; + return 0; +} + +int do_rmdir(const char *file) +{ + int err; + + err = rmdir(file); + if (err) + return -errno; + return 0; +} + +int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor) +{ + int err; + + err = mknod(file, mode, os_makedev(major, minor)); + if (err) + return -errno; + return 0; +} + +int link_file(const char *to, const char *from) +{ + int err; + + err = link(to, from); + if (err) + return -errno; + return 0; +} + +int hostfs_do_readlink(char *file, char *buf, int size) +{ + int n; + + n = readlink(file, buf, size); + if (n < 0) + return -errno; + if (n < size) + buf[n] = '\0'; + return n; +} + +int rename_file(char *from, char *to) +{ + int err; + + err = rename(from, to); + if (err < 0) + return -errno; + return 0; +} + +int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out) +{ + struct statfs64 buf; + int err; + + err = statfs64(root, &buf); + if (err < 0) + return -errno; + + *bsize_out = buf.f_bsize; + *blocks_out = buf.f_blocks; + *bfree_out = buf.f_bfree; + *bavail_out = buf.f_bavail; + *files_out = buf.f_files; + *ffree_out = buf.f_ffree; + memcpy(fsid_out, &buf.f_fsid, + sizeof(buf.f_fsid) > fsid_size ? fsid_size : + sizeof(buf.f_fsid)); + *namelen_out = buf.f_namelen; + + return 0; +} diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig new file mode 100644 index 00000000..56bd15c5 --- /dev/null +++ b/fs/hpfs/Kconfig @@ -0,0 +1,14 @@ +config HPFS_FS + tristate "OS/2 HPFS file system support" + depends on BLOCK + help + OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS + is the file system used for organizing files on OS/2 hard disk + partitions. Say Y if you want to be able to read files from and + write files to an OS/2 HPFS partition on your hard drive. OS/2 + floppies however are in regular MSDOS format, so you don't need this + option in order to be able to read them. Read + . + + To compile this file system support as a module, choose M here: the + module will be called hpfs. If unsure, say N. 
diff --git a/fs/hpfs/Makefile b/fs/hpfs/Makefile new file mode 100644 index 00000000..57b786fb --- /dev/null +++ b/fs/hpfs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux hpfs filesystem routines. +# + +obj-$(CONFIG_HPFS_FS) += hpfs.o + +hpfs-objs := alloc.o anode.o buffer.o dentry.o dir.o dnode.o ea.o file.o \ + inode.o map.o name.o namei.o super.o diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c new file mode 100644 index 00000000..7a5eb2c7 --- /dev/null +++ b/fs/hpfs/alloc.c @@ -0,0 +1,424 @@ +/* + * linux/fs/hpfs/alloc.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * HPFS bitmap operations + */ + +#include "hpfs_fn.h" + +/* + * Check if a sector is allocated in bitmap + * This is really slow. Turned on only if chk==2 + */ + +static int chk_if_allocated(struct super_block *s, secno sec, char *msg) +{ + struct quad_buffer_head qbh; + u32 *bmp; + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; + if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { + hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); + goto fail1; + } + hpfs_brelse4(&qbh); + if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4; + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail; + if ((le32_to_cpu(bmp[ssec >> 5]) >> (ssec & 0x1f)) & 1) { + hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec); + goto fail1; + } + hpfs_brelse4(&qbh); + } + return 0; + fail1: + hpfs_brelse4(&qbh); + fail: + return 1; +} + +/* + * Check if sector(s) have proper number and additionally check if they're + * allocated in bitmap. + */ + +int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg) +{ + if (start + len < start || start < 0x12 || + start + len > hpfs_sb(s)->sb_fs_size) { + hpfs_error(s, "sector(s) '%s' badly placed at %08x", msg, start); + return 1; + } + if (hpfs_sb(s)->sb_chk>=2) { + int i; + for (i = 0; i < len; i++) + if (chk_if_allocated(s, start + i, msg)) return 1; + } + return 0; +} + +static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward) +{ + struct quad_buffer_head qbh; + unsigned *bmp; + unsigned bs = near & ~0x3fff; + unsigned nr = (near & 0x3fff) & ~(n - 1); + /*unsigned mnr;*/ + unsigned i, q; + int a, b; + secno ret = 0; + if (n != 1 && n != 4) { + hpfs_error(s, "Bad allocation size: %d", n); + return 0; + } + if (bs != ~0x3fff) { + if (!(bmp = hpfs_map_bitmap(s, near >> 14, &qbh, "aib"))) goto uls; + } else { + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto uls; + } + if (!tstbits(bmp, nr, n + forward)) { + ret = bs + nr; + goto rt; + } + q = nr + n; b = 0; + while ((a = tstbits(bmp, q, n + forward)) != 0) { + q += a; + if (n != 1) q = ((q-1)&~(n-1))+n; + if (!b) { + if (q>>5 != nr>>5) { + b = 1; + q = nr & 0x1f; + } + } else if (q > nr) break; + } + if (!a) { + ret = bs + q; + goto rt; + } + nr >>= 5; + /*for (i = nr + 1; i != nr; i++, i &= 0x1ff) */ + i = nr; + do { + if (!le32_to_cpu(bmp[i])) goto cont; + if (n + forward >= 0x3f && le32_to_cpu(bmp[i]) != 0xffffffff) goto cont; + q = i<<5; + if (i > 0) { + unsigned k = le32_to_cpu(bmp[i-1]); + while (k & 0x80000000) { + q--; k <<= 1; + } + } + if (n != 1) q = ((q-1)&~(n-1))+n; + while ((a = tstbits(bmp, q, n + forward)) != 0) { + q += a; + if (n != 1) q = ((q-1)&~(n-1))+n; + if (q>>5 > i) break; + } + if (!a) { + ret = bs + q; + goto rt; + } + cont: + i++, i &= 0x1ff; + } 
while (i != nr); + rt: + if (ret) { + if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (le32_to_cpu(bmp[(ret & 0x3fff) >> 5]) | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { + hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret); + ret = 0; + goto b; + } + bmp[(ret & 0x3fff) >> 5] &= cpu_to_le32(~(((1 << n) - 1) << (ret & 0x1f))); + hpfs_mark_4buffers_dirty(&qbh); + } + b: + hpfs_brelse4(&qbh); + uls: + return ret; +} + +/* + * Allocation strategy: 1) search place near the sector specified + * 2) search bitmap where free sectors last found + * 3) search all bitmaps + * 4) search all bitmaps ignoring number of pre-allocated + * sectors + */ + +secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward) +{ + secno sec; + int i; + unsigned n_bmps; + struct hpfs_sb_info *sbi = hpfs_sb(s); + int f_p = 0; + int near_bmp; + if (forward < 0) { + forward = -forward; + f_p = 1; + } + n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14; + if (near && near < sbi->sb_fs_size) { + if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret; + near_bmp = near >> 14; + } else near_bmp = n_bmps / 2; + /* + if (b != -1) { + if ((sec = alloc_in_bmp(s, b<<14, n, f_p ? forward : forward/2))) { + b &= 0x0fffffff; + goto ret; + } + if (b > 0x10000000) if ((sec = alloc_in_bmp(s, (b&0xfffffff)<<14, n, f_p ? forward : 0))) goto ret; + */ + if (!f_p) if (forward > sbi->sb_max_fwd_alloc) forward = sbi->sb_max_fwd_alloc; + less_fwd: + for (i = 0; i < n_bmps; i++) { + if (near_bmp+i < n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp+i; + goto ret; + } + if (!forward) { + if (near_bmp-i-1 >= 0 && ((sec = alloc_in_bmp(s, (near_bmp-i-1) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp-i-1; + goto ret; + } + } else { + if (near_bmp+i >= n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i-n_bmps) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp+i-n_bmps; + goto ret; + } + } + if (i == 1 && sbi->sb_c_bitmap != -1 && ((sec = alloc_in_bmp(s, (sbi->sb_c_bitmap) << 14, n, forward)))) { + goto ret; + } + } + if (!f_p) { + if (forward) { + sbi->sb_max_fwd_alloc = forward * 3 / 4; + forward /= 2; + goto less_fwd; + } + } + sec = 0; + ret: + if (sec && f_p) { + for (i = 0; i < forward; i++) { + if (!hpfs_alloc_if_possible(s, sec + i + 1)) { + hpfs_error(s, "Prealloc doesn't work! 
Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); + sec = 0; + break; + } + } + } + return sec; +} + +static secno alloc_in_dirband(struct super_block *s, secno near) +{ + unsigned nr = near; + secno sec; + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (nr < sbi->sb_dirband_start) + nr = sbi->sb_dirband_start; + if (nr >= sbi->sb_dirband_start + sbi->sb_dirband_size) + nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4; + nr -= sbi->sb_dirband_start; + nr >>= 2; + sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); + if (!sec) return 0; + return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; +} + +/* Alloc sector if it's free */ + +int hpfs_alloc_if_possible(struct super_block *s, secno sec) +{ + struct quad_buffer_head qbh; + u32 *bmp; + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; + if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { + bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + return 1; + } + hpfs_brelse4(&qbh); + end: + return 0; +} + +/* Free sectors in bitmaps */ + +void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) +{ + struct quad_buffer_head qbh; + u32 *bmp; + struct hpfs_sb_info *sbi = hpfs_sb(s); + /*printk("2 - ");*/ + if (!n) return; + if (sec < 0x12) { + hpfs_error(s, "Trying to free reserved sector %08x", sec); + return; + } + sbi->sb_max_fwd_alloc += n > 0xffff ? 0xffff : n; + if (sbi->sb_max_fwd_alloc > 0xffffff) sbi->sb_max_fwd_alloc = 0xffffff; + new_map: + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "free"))) { + return; + } + new_tst: + if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f) & 1)) { + hpfs_error(s, "sector %08x not allocated", sec); + hpfs_brelse4(&qbh); + return; + } + bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); + if (!--n) { + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + return; + } + if (!(++sec & 0x3fff)) { + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + goto new_map; + } + goto new_tst; +} + +/* + * Check if there are at least n free dnodes on the filesystem. + * Called before adding to dnode. If we run out of space while + * splitting dnodes, it would corrupt dnode tree. 
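+ * (The scan below first counts free dnodes in the directory-band bitmap and
+ * then falls back to the ordinary sector bitmaps, where only an aligned run
+ * of four free sectors can hold a dnode.)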
+ */ + +int hpfs_check_free_dnodes(struct super_block *s, int n) +{ + int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; + int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; + int i, j; + u32 *bmp; + struct quad_buffer_head qbh; + if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + for (j = 0; j < 512; j++) { + unsigned k; + if (!le32_to_cpu(bmp[j])) continue; + for (k = le32_to_cpu(bmp[j]); k; k >>= 1) if (k & 1) if (!--n) { + hpfs_brelse4(&qbh); + return 0; + } + } + } + hpfs_brelse4(&qbh); + i = 0; + if (hpfs_sb(s)->sb_c_bitmap != -1) { + bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1"); + goto chk_bmp; + } + chk_next: + if (i == b) i++; + if (i >= n_bmps) return 1; + bmp = hpfs_map_bitmap(s, i, &qbh, "chkdn2"); + chk_bmp: + if (bmp) { + for (j = 0; j < 512; j++) { + u32 k; + if (!le32_to_cpu(bmp[j])) continue; + for (k = 0xf; k; k <<= 4) + if ((le32_to_cpu(bmp[j]) & k) == k) { + if (!--n) { + hpfs_brelse4(&qbh); + return 0; + } + } + } + hpfs_brelse4(&qbh); + } + i++; + goto chk_next; +} + +void hpfs_free_dnode(struct super_block *s, dnode_secno dno) +{ + if (hpfs_sb(s)->sb_chk) if (dno & 3) { + hpfs_error(s, "hpfs_free_dnode: dnode %08x not aligned", dno); + return; + } + if (dno < hpfs_sb(s)->sb_dirband_start || + dno >= hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + hpfs_free_sectors(s, dno, 4); + } else { + struct quad_buffer_head qbh; + u32 *bmp; + unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + return; + } + bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } +} + +struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, + dnode_secno *dno, struct quad_buffer_head *qbh) +{ + struct dnode *d; + if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { + if (!(*dno = alloc_in_dirband(s, near))) + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; + } else { + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) + if (!(*dno = alloc_in_dirband(s, near))) return NULL; + } + if (!(d = hpfs_get_4sectors(s, *dno, qbh))) { + hpfs_free_dnode(s, *dno); + return NULL; + } + memset(d, 0, 2048); + d->magic = cpu_to_le32(DNODE_MAGIC); + d->first_free = cpu_to_le32(52); + d->dirent[0] = 32; + d->dirent[2] = 8; + d->dirent[30] = 1; + d->dirent[31] = 255; + d->self = cpu_to_le32(*dno); + return d; +} + +struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *fno, + struct buffer_head **bh) +{ + struct fnode *f; + if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD))) return NULL; + if (!(f = hpfs_get_sector(s, *fno, bh))) { + hpfs_free_sectors(s, *fno, 1); + return NULL; + } + memset(f, 0, 512); + f->magic = cpu_to_le32(FNODE_MAGIC); + f->ea_offs = cpu_to_le16(0xc4); + f->btree.n_free_nodes = 8; + f->btree.first_free = cpu_to_le16(8); + return f; +} + +struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *ano, + struct buffer_head **bh) +{ + struct anode *a; + if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD))) return NULL; + if (!(a = hpfs_get_sector(s, *ano, bh))) { + hpfs_free_sectors(s, *ano, 1); + return NULL; + } + memset(a, 0, 512); + a->magic = cpu_to_le32(ANODE_MAGIC); + a->self = cpu_to_le32(*ano); + a->btree.n_free_nodes = 40; + a->btree.n_used_nodes = 0; + a->btree.first_free = cpu_to_le16(8); + return a; +} diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c new file mode 100644 index 00000000..08b503e8 --- /dev/null +++ b/fs/hpfs/anode.c @@ -0,0 +1,491 @@ +/* + * 
linux/fs/hpfs/anode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling HPFS anode tree that contains file allocation info + */ + +#include "hpfs_fn.h" + +/* Find a sector in allocation tree */ + +secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, + struct bplus_header *btree, unsigned sec, + struct buffer_head *bh) +{ + anode_secno a = -1; + struct anode *anode; + int i; + int c1, c2 = 0; + go_down: + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; + if (btree->internal) { + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) { + a = le32_to_cpu(btree->u.internal[i].down); + brelse(bh); + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + btree = &anode->btree; + goto go_down; + } + hpfs_error(s, "sector %08x not found in internal anode %08x", sec, a); + brelse(bh); + return -1; + } + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.external[i].file_secno) <= sec && + le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > sec) { + a = le32_to_cpu(btree->u.external[i].disk_secno) + sec - le32_to_cpu(btree->u.external[i].file_secno); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, a, 1, "data")) { + brelse(bh); + return -1; + } + if (inode) { + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + hpfs_inode->i_file_sec = le32_to_cpu(btree->u.external[i].file_secno); + hpfs_inode->i_disk_sec = le32_to_cpu(btree->u.external[i].disk_secno); + hpfs_inode->i_n_secs = le32_to_cpu(btree->u.external[i].length); + } + brelse(bh); + return a; + } + hpfs_error(s, "sector %08x not found in external anode %08x", sec, a); + brelse(bh); + return -1; +} + +/* Add a sector to tree */ + +secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsigned fsecno) +{ + struct bplus_header *btree; + struct anode *anode = NULL, *ranode = NULL; + struct fnode *fnode; + anode_secno a, na = -1, ra, up = -1; + secno se; + struct buffer_head *bh, *bh1, *bh2; + int n; + unsigned fs; + int c1, c2 = 0; + if (fnod) { + if (!(fnode = hpfs_map_fnode(s, node, &bh))) return -1; + btree = &fnode->btree; + } else { + if (!(anode = hpfs_map_anode(s, node, &bh))) return -1; + btree = &anode->btree; + } + a = node; + go_down: + if ((n = btree->n_used_nodes - 1) < -!!fnod) { + hpfs_error(s, "anode %08x has no entries", a); + brelse(bh); + return -1; + } + if (btree->internal) { + a = le32_to_cpu(btree->u.internal[n].down); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + mark_buffer_dirty(bh); + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_add_sector_to_btree #1")) return -1; + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + btree = &anode->btree; + goto go_down; + } + if (n >= 0) { + if (le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length) != fsecno) { + hpfs_error(s, "allocated size %08x, trying to add sector %08x, %cnode %08x", + le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length), fsecno, + fnod?'f':'a', node); + brelse(bh); + return -1; + } + if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) { + btree->u.external[n].length = cpu_to_le32(le32_to_cpu(btree->u.external[n].length) + 1); + mark_buffer_dirty(bh); + brelse(bh); + return se; + } + } else { + if (fsecno) { + hpfs_error(s, "empty file %08x, trying to 
add sector %08x", node, fsecno); + brelse(bh); + return -1; + } + se = !fnod ? node : (node + 16384) & ~16383; + } + if (!(se = hpfs_alloc_sector(s, se, 1, fsecno*ALLOC_M>ALLOC_FWD_MAX ? ALLOC_FWD_MAX : fsecno*ALLOC_Mu.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length); + if (!btree->n_free_nodes) { + up = a != node ? le32_to_cpu(anode->up) : -1; + if (!(anode = hpfs_alloc_anode(s, a, &na, &bh1))) { + brelse(bh); + hpfs_free_sectors(s, se, 1); + return -1; + } + if (a == node && fnod) { + anode->up = cpu_to_le32(node); + anode->btree.fnode_parent = 1; + anode->btree.n_used_nodes = btree->n_used_nodes; + anode->btree.first_free = btree->first_free; + anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes; + memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12); + btree->internal = 1; + btree->n_free_nodes = 11; + btree->n_used_nodes = 1; + btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(-1); + btree->u.internal[0].down = cpu_to_le32(na); + mark_buffer_dirty(bh); + } else if (!(ranode = hpfs_alloc_anode(s, /*a*/0, &ra, &bh2))) { + brelse(bh); + brelse(bh1); + hpfs_free_sectors(s, se, 1); + hpfs_free_sectors(s, na, 1); + return -1; + } + brelse(bh); + bh = bh1; + btree = &anode->btree; + } + btree->n_free_nodes--; n = btree->n_used_nodes++; + btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 12); + btree->u.external[n].disk_secno = cpu_to_le32(se); + btree->u.external[n].file_secno = cpu_to_le32(fs); + btree->u.external[n].length = cpu_to_le32(1); + mark_buffer_dirty(bh); + brelse(bh); + if ((a == node && fnod) || na == -1) return se; + c2 = 0; + while (up != (anode_secno)-1) { + struct anode *new_anode; + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1; + if (up != node || !fnod) { + if (!(anode = hpfs_map_anode(s, up, &bh))) return -1; + btree = &anode->btree; + } else { + if (!(fnode = hpfs_map_fnode(s, up, &bh))) return -1; + btree = &fnode->btree; + } + if (btree->n_free_nodes) { + btree->n_free_nodes--; n = btree->n_used_nodes++; + btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 8); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + btree->u.internal[n].down = cpu_to_le32(na); + btree->u.internal[n-1].file_secno = cpu_to_le32(fs); + mark_buffer_dirty(bh); + brelse(bh); + brelse(bh2); + hpfs_free_sectors(s, ra, 1); + if ((anode = hpfs_map_anode(s, na, &bh))) { + anode->up = cpu_to_le32(up); + anode->btree.fnode_parent = up == node && fnod; + mark_buffer_dirty(bh); + brelse(bh); + } + return se; + } + up = up != node ? le32_to_cpu(anode->up) : -1; + btree->u.internal[btree->n_used_nodes - 1].file_secno = cpu_to_le32(/*fs*/-1); + mark_buffer_dirty(bh); + brelse(bh); + a = na; + if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) { + anode = new_anode; + /*anode->up = cpu_to_le32(up != -1 ? 
up : ra);*/ + anode->btree.internal = 1; + anode->btree.n_used_nodes = 1; + anode->btree.n_free_nodes = 59; + anode->btree.first_free = cpu_to_le16(16); + anode->btree.u.internal[0].down = cpu_to_le32(a); + anode->btree.u.internal[0].file_secno = cpu_to_le32(-1); + mark_buffer_dirty(bh); + brelse(bh); + if ((anode = hpfs_map_anode(s, a, &bh))) { + anode->up = cpu_to_le32(na); + mark_buffer_dirty(bh); + brelse(bh); + } + } else na = a; + } + if ((anode = hpfs_map_anode(s, na, &bh))) { + anode->up = cpu_to_le32(node); + if (fnod) anode->btree.fnode_parent = 1; + mark_buffer_dirty(bh); + brelse(bh); + } + if (!fnod) { + if (!(anode = hpfs_map_anode(s, node, &bh))) { + brelse(bh2); + return -1; + } + btree = &anode->btree; + } else { + if (!(fnode = hpfs_map_fnode(s, node, &bh))) { + brelse(bh2); + return -1; + } + btree = &fnode->btree; + } + ranode->up = cpu_to_le32(node); + memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); + if (fnod) ranode->btree.fnode_parent = 1; + ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes; + if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) { + struct anode *unode; + if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { + unode->up = cpu_to_le32(ra); + unode->btree.fnode_parent = 0; + mark_buffer_dirty(bh1); + brelse(bh1); + } + } + btree->internal = 1; + btree->n_free_nodes = fnod ? 10 : 58; + btree->n_used_nodes = 2; + btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(fs); + btree->u.internal[0].down = cpu_to_le32(ra); + btree->u.internal[1].file_secno = cpu_to_le32(-1); + btree->u.internal[1].down = cpu_to_le32(na); + mark_buffer_dirty(bh); + brelse(bh); + mark_buffer_dirty(bh2); + brelse(bh2); + return se; +} + +/* + * Remove allocation tree. Recursion would look much nicer but + * I want to avoid it because it can cause stack overflow. + */ + +void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) +{ + struct bplus_header *btree1 = btree; + struct anode *anode = NULL; + anode_secno ano = 0, oano; + struct buffer_head *bh; + int level = 0; + int pos = 0; + int i; + int c1, c2 = 0; + int d1, d2; + go_down: + d2 = 0; + while (btree1->internal) { + ano = le32_to_cpu(btree1->u.internal[pos].down); + if (level) brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1")) + return; + if (!(anode = hpfs_map_anode(s, ano, &bh))) return; + btree1 = &anode->btree; + level++; + pos = 0; + } + for (i = 0; i < btree1->n_used_nodes; i++) + hpfs_free_sectors(s, le32_to_cpu(btree1->u.external[i].disk_secno), le32_to_cpu(btree1->u.external[i].length)); + go_up: + if (!level) return; + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return; + hpfs_free_sectors(s, ano, 1); + oano = ano; + ano = le32_to_cpu(anode->up); + if (--level) { + if (!(anode = hpfs_map_anode(s, ano, &bh))) return; + btree1 = &anode->btree; + } else btree1 = btree; + for (i = 0; i < btree1->n_used_nodes; i++) { + if (le32_to_cpu(btree1->u.internal[i].down) == oano) { + if ((pos = i + 1) < btree1->n_used_nodes) + goto go_down; + else + goto go_up; + } + } + hpfs_error(s, + "reference to anode %08x not found in anode %08x " + "(probably bad up pointer)", + oano, level ? ano : -1); + if (level) + brelse(bh); +} + +/* Just a wrapper around hpfs_bplus_lookup .. 
used for reading eas */ + +static secno anode_lookup(struct super_block *s, anode_secno a, unsigned sec) +{ + struct anode *anode; + struct buffer_head *bh; + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + return hpfs_bplus_lookup(s, NULL, &anode->btree, sec, bh); +} + +int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos, + unsigned len, char *buf) +{ + struct buffer_head *bh; + char *data; + secno sec; + unsigned l; + while (len) { + if (ano) { + if ((sec = anode_lookup(s, a, pos >> 9)) == -1) + return -1; + } else sec = a + (pos >> 9); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #1")) return -1; + if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) + return -1; + l = 0x200 - (pos & 0x1ff); if (l > len) l = len; + memcpy(buf, data + (pos & 0x1ff), l); + brelse(bh); + buf += l; pos += l; len -= l; + } + return 0; +} + +int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, + unsigned len, const char *buf) +{ + struct buffer_head *bh; + char *data; + secno sec; + unsigned l; + while (len) { + if (ano) { + if ((sec = anode_lookup(s, a, pos >> 9)) == -1) + return -1; + } else sec = a + (pos >> 9); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #2")) return -1; + if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) + return -1; + l = 0x200 - (pos & 0x1ff); if (l > len) l = len; + memcpy(data + (pos & 0x1ff), buf, l); + mark_buffer_dirty(bh); + brelse(bh); + buf += l; pos += l; len -= l; + } + return 0; +} + +void hpfs_ea_remove(struct super_block *s, secno a, int ano, unsigned len) +{ + struct anode *anode; + struct buffer_head *bh; + if (ano) { + if (!(anode = hpfs_map_anode(s, a, &bh))) return; + hpfs_remove_btree(s, &anode->btree); + brelse(bh); + hpfs_free_sectors(s, a, 1); + } else hpfs_free_sectors(s, a, (len + 511) >> 9); +} + +/* Truncate allocation tree. 
Doesn't join anodes - I hope it doesn't matter */ + +void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) +{ + struct fnode *fnode; + struct anode *anode; + struct buffer_head *bh; + struct bplus_header *btree; + anode_secno node = f; + int i, j, nodes; + int c1, c2 = 0; + if (fno) { + if (!(fnode = hpfs_map_fnode(s, f, &bh))) return; + btree = &fnode->btree; + } else { + if (!(anode = hpfs_map_anode(s, f, &bh))) return; + btree = &anode->btree; + } + if (!secs) { + hpfs_remove_btree(s, btree); + if (fno) { + btree->n_free_nodes = 8; + btree->n_used_nodes = 0; + btree->first_free = cpu_to_le16(8); + btree->internal = 0; + mark_buffer_dirty(bh); + } else hpfs_free_sectors(s, f, 1); + brelse(bh); + return; + } + while (btree->internal) { + nodes = btree->n_used_nodes + btree->n_free_nodes; + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f; + brelse(bh); + hpfs_error(s, "internal btree %08x doesn't end with -1", node); + return; + f: + for (j = i + 1; j < btree->n_used_nodes; j++) + hpfs_ea_remove(s, le32_to_cpu(btree->u.internal[j].down), 1, 0); + btree->n_used_nodes = i + 1; + btree->n_free_nodes = nodes - btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 8 * btree->n_used_nodes); + mark_buffer_dirty(bh); + if (btree->u.internal[i].file_secno == cpu_to_le32(secs)) { + brelse(bh); + return; + } + node = le32_to_cpu(btree->u.internal[i].down); + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree")) + return; + if (!(anode = hpfs_map_anode(s, node, &bh))) return; + btree = &anode->btree; + } + nodes = btree->n_used_nodes + btree->n_free_nodes; + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) >= secs) goto ff; + brelse(bh); + return; + ff: + if (secs <= le32_to_cpu(btree->u.external[i].file_secno)) { + hpfs_error(s, "there is an allocation error in file %08x, sector %08x", f, secs); + if (i) i--; + } + else if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > secs) { + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[i].disk_secno) + secs - + le32_to_cpu(btree->u.external[i].file_secno), le32_to_cpu(btree->u.external[i].length) + - secs + le32_to_cpu(btree->u.external[i].file_secno)); /* I hope gcc optimizes this :-) */ + btree->u.external[i].length = cpu_to_le32(secs - le32_to_cpu(btree->u.external[i].file_secno)); + } + for (j = i + 1; j < btree->n_used_nodes; j++) + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[j].disk_secno), le32_to_cpu(btree->u.external[j].length)); + btree->n_used_nodes = i + 1; + btree->n_free_nodes = nodes - btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 12 * btree->n_used_nodes); + mark_buffer_dirty(bh); + brelse(bh); +} + +/* Remove file or directory and it's eas - note that directory must + be empty when this is called. 
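+ * (The fnode's allocation btree - or, for a directory, its dnode tree - is
+ * torn down first, then the indirect and external EAs are freed, and finally
+ * the fnode sector itself.)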
*/ + +void hpfs_remove_fnode(struct super_block *s, fnode_secno fno) +{ + struct buffer_head *bh; + struct fnode *fnode; + struct extended_attribute *ea; + struct extended_attribute *ea_end; + if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; + if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree); + else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); + ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (ea->indirect) + hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); + hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l)); + brelse(bh); + hpfs_free_sectors(s, fno, 1); +} diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c new file mode 100644 index 00000000..9ecde27d --- /dev/null +++ b/fs/hpfs/buffer.c @@ -0,0 +1,168 @@ +/* + * linux/fs/hpfs/buffer.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * general buffer i/o + */ +#include +#include +#include "hpfs_fn.h" + +/* Map a sector into a buffer and return pointers to it and to the buffer. */ + +void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, + int ahead) +{ + struct buffer_head *bh; + + hpfs_lock_assert(s); + + cond_resched(); + + *bhp = bh = sb_bread(s, secno); + if (bh != NULL) + return bh->b_data; + else { + printk("HPFS: hpfs_map_sector: read error\n"); + return NULL; + } +} + +/* Like hpfs_map_sector but don't read anything */ + +void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp) +{ + struct buffer_head *bh; + /*return hpfs_map_sector(s, secno, bhp, 0);*/ + + hpfs_lock_assert(s); + + cond_resched(); + + if ((*bhp = bh = sb_getblk(s, secno)) != NULL) { + if (!buffer_uptodate(bh)) wait_on_buffer(bh); + set_buffer_uptodate(bh); + return bh->b_data; + } else { + printk("HPFS: hpfs_get_sector: getblk failed\n"); + return NULL; + } +} + +/* Map 4 sectors into a 4buffer and return pointers to it and to the buffer. 
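+ * The four 512-byte sectors are copied into a single 2048-byte buffer so
+ * callers can treat a dnode as one contiguous structure;
+ * hpfs_mark_4buffers_dirty() copies the data back before marking the
+ * underlying buffer heads dirty.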
*/ + +void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, + int ahead) +{ + struct buffer_head *bh; + char *data; + + hpfs_lock_assert(s); + + cond_resched(); + + if (secno & 3) { + printk("HPFS: hpfs_map_4sectors: unaligned read\n"); + return NULL; + } + + qbh->data = data = kmalloc(2048, GFP_NOFS); + if (!data) { + printk("HPFS: hpfs_map_4sectors: out of memory\n"); + goto bail; + } + + qbh->bh[0] = bh = sb_bread(s, secno); + if (!bh) + goto bail0; + memcpy(data, bh->b_data, 512); + + qbh->bh[1] = bh = sb_bread(s, secno + 1); + if (!bh) + goto bail1; + memcpy(data + 512, bh->b_data, 512); + + qbh->bh[2] = bh = sb_bread(s, secno + 2); + if (!bh) + goto bail2; + memcpy(data + 2 * 512, bh->b_data, 512); + + qbh->bh[3] = bh = sb_bread(s, secno + 3); + if (!bh) + goto bail3; + memcpy(data + 3 * 512, bh->b_data, 512); + + return data; + + bail3: + brelse(qbh->bh[2]); + bail2: + brelse(qbh->bh[1]); + bail1: + brelse(qbh->bh[0]); + bail0: + kfree(data); + printk("HPFS: hpfs_map_4sectors: read error\n"); + bail: + return NULL; +} + +/* Don't read sectors */ + +void *hpfs_get_4sectors(struct super_block *s, unsigned secno, + struct quad_buffer_head *qbh) +{ + cond_resched(); + + hpfs_lock_assert(s); + + if (secno & 3) { + printk("HPFS: hpfs_get_4sectors: unaligned read\n"); + return NULL; + } + + /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ + if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { + printk("HPFS: hpfs_get_4sectors: out of memory\n"); + return NULL; + } + if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0; + if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1; + if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2; + if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3; + memcpy(qbh->data, qbh->bh[0]->b_data, 512); + memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512); + memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512); + memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512); + return qbh->data; + + bail3: brelse(qbh->bh[2]); + bail2: brelse(qbh->bh[1]); + bail1: brelse(qbh->bh[0]); + bail0: + return NULL; +} + + +void hpfs_brelse4(struct quad_buffer_head *qbh) +{ + brelse(qbh->bh[3]); + brelse(qbh->bh[2]); + brelse(qbh->bh[1]); + brelse(qbh->bh[0]); + kfree(qbh->data); +} + +void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) +{ + PRINTK(("hpfs_mark_4buffers_dirty\n")); + memcpy(qbh->bh[0]->b_data, qbh->data, 512); + memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); + memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); + memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + mark_buffer_dirty(qbh->bh[0]); + mark_buffer_dirty(qbh->bh[1]); + mark_buffer_dirty(qbh->bh[2]); + mark_buffer_dirty(qbh->bh[3]); +} diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c new file mode 100644 index 00000000..05d4816e --- /dev/null +++ b/fs/hpfs/dentry.c @@ -0,0 +1,64 @@ +/* + * linux/fs/hpfs/dentry.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * dcache operations + */ + +#include "hpfs_fn.h" + +/* + * Note: the dentry argument is the parent dentry. + */ + +static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + unsigned long hash; + int i; + unsigned l = qstr->len; + + if (l == 1) if (qstr->name[0]=='.') goto x; + if (l == 2) if (qstr->name[0]=='.' 
|| qstr->name[1]=='.') goto x; + hpfs_adjust_length(qstr->name, &l); + /*if (hpfs_chk_name(qstr->name,&l))*/ + /*return -ENAMETOOLONG;*/ + /*return -ENOENT;*/ + x: + + hash = init_name_hash(); + for (i = 0; i < l; i++) + hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +static int hpfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + unsigned al = len; + unsigned bl = name->len; + + hpfs_adjust_length(str, &al); + /*hpfs_adjust_length(b->name, &bl);*/ + + /* + * 'str' is the nane of an already existing dentry, so the name + * must be valid. 'name' must be validated first. + */ + + if (hpfs_chk_name(name->name, &bl)) + return 1; + if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) + return 1; + return 0; +} + +const struct dentry_operations hpfs_dentry_operations = { + .d_hash = hpfs_hash_dentry, + .d_compare = hpfs_compare_dentry, +}; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c new file mode 100644 index 00000000..f46ae025 --- /dev/null +++ b/fs/hpfs/dir.c @@ -0,0 +1,322 @@ +/* + * linux/fs/hpfs/dir.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * directory VFS functions + */ + +#include +#include "hpfs_fn.h" + +static int hpfs_dir_release(struct inode *inode, struct file *filp) +{ + hpfs_lock(inode->i_sb); + hpfs_del_pos(inode, &filp->f_pos); + /*hpfs_write_if_changed(inode);*/ + hpfs_unlock(inode->i_sb); + return 0; +} + +/* This is slow, but it's not used often */ + +static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) +{ + loff_t new_off = off + (whence == 1 ? 
filp->f_pos : 0); + loff_t pos; + struct quad_buffer_head qbh; + struct inode *i = filp->f_path.dentry->d_inode; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct super_block *s = i->i_sb; + + hpfs_lock(s); + + /*printk("dir lseek\n");*/ + if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; + mutex_lock(&i->i_mutex); + pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; + while (pos != new_off) { + if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); + else goto fail; + if (pos == 12) goto fail; + } + mutex_unlock(&i->i_mutex); +ok: + hpfs_unlock(s); + return filp->f_pos = new_off; +fail: + mutex_unlock(&i->i_mutex); + /*printk("illegal lseek: %016llx\n", new_off);*/ + hpfs_unlock(s); + return -ESPIPE; +} + +static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + int lc; + long old_pos; + unsigned char *tempname; + int c1, c2 = 0; + int ret = 0; + + hpfs_lock(inode->i_sb); + + if (hpfs_sb(inode->i_sb)->sb_chk) { + if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { + ret = -EFSERROR; + goto out; + } + if (hpfs_chk_sectors(inode->i_sb, hpfs_inode->i_dno, 4, "dir_dnode")) { + ret = -EFSERROR; + goto out; + } + } + if (hpfs_sb(inode->i_sb)->sb_chk >= 2) { + struct buffer_head *bh; + struct fnode *fno; + int e = 0; + if (!(fno = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) { + ret = -EIOERROR; + goto out; + } + if (!fno->dirflag) { + e = 1; + hpfs_error(inode->i_sb, "not a directory, fnode %08lx", + (unsigned long)inode->i_ino); + } + if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { + e = 1; + hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, le32_to_cpu(fno->u.external[0].disk_secno)); + } + brelse(bh); + if (e) { + ret = -EFSERROR; + goto out; + } + } + lc = hpfs_sb(inode->i_sb)->sb_lowercase; + if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ + filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ + goto out; + } + if (filp->f_pos == 13) { + ret = -ENOENT; + goto out; + } + + while (1) { + again: + /* This won't work when cycle is longer than number of dirents + accepted by filldir, but what can I do? 
+ maybe killall -9 ls helps */ + if (hpfs_sb(inode->i_sb)->sb_chk) + if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { + ret = -EFSERROR; + goto out; + } + if (filp->f_pos == 12) + goto out; + if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { + printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); + goto out; + } + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos = 11; + } + if (filp->f_pos == 11) { + if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) + goto out; + filp->f_pos = 1; + } + if (filp->f_pos == 1) { + filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; + hpfs_add_pos(inode, &filp->f_pos); + filp->f_version = inode->i_version; + } + old_pos = filp->f_pos; + if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { + ret = -EIOERROR; + goto out; + } + if (de->first || de->last) { + if (hpfs_sb(inode->i_sb)->sb_chk) { + if (de->first && !de->last && (de->namelen != 2 + || de ->name[0] != 1 || de->name[1] != 1)) + hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); + if (de->last && (de->namelen != 1 || de ->name[0] != 255)) + hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); + } + hpfs_brelse4(&qbh); + goto again; + } + tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); + if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { + filp->f_pos = old_pos; + if (tempname != de->name) kfree(tempname); + hpfs_brelse4(&qbh); + goto out; + } + if (tempname != de->name) kfree(tempname); + hpfs_brelse4(&qbh); + } +out: + hpfs_unlock(inode->i_sb); + return ret; +} + +/* + * lookup. Search the specified directory for the specified name, set + * *result to the corresponding inode. + * + * lookup uses the inode number to tell read_inode whether it is reading + * the inode of a directory or a file -- file ino's are odd, directory + * ino's are even. read_inode avoids i/o for file inodes; everything + * needed is up here in the directory. (And file fnodes are out in + * the boondocks.) + * + * - M.P.: this is over, sometimes we've got to read file's fnode for eas + * inode numbers are just fnode sector numbers; iget lock is used + * to tell read_inode to read fnode or not. + */ + +struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + ino_t ino; + int err; + struct inode *result = NULL; + struct hpfs_inode_info *hpfs_result; + + hpfs_lock(dir->i_sb); + if ((err = hpfs_chk_name(name, &len))) { + if (err == -ENAMETOOLONG) { + hpfs_unlock(dir->i_sb); + return ERR_PTR(-ENAMETOOLONG); + } + goto end_add; + } + + /* + * '.' and '..' will never be passed here. + */ + + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh); + + /* + * This is not really a bailout, just means file not found. + */ + + if (!de) goto end; + + /* + * Get inode number, what we're after. + */ + + ino = le32_to_cpu(de->fnode); + + /* + * Go find or make an inode. 
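+ * (iget_locked() returns the cached inode or a new locked one; a new
+ * inode (I_NEW) is read from the fnode for directories and for files
+ * with EAs, otherwise it is filled in straight from the dirent.)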
+ */ + + result = iget_locked(dir->i_sb, ino); + if (!result) { + hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode"); + goto bail1; + } + if (result->i_state & I_NEW) { + hpfs_init_inode(result); + if (de->directory) + hpfs_read_inode(result); + else if (le32_to_cpu(de->ea_size) && hpfs_sb(dir->i_sb)->sb_eas) + hpfs_read_inode(result); + else { + result->i_mode |= S_IFREG; + result->i_mode &= ~0111; + result->i_op = &hpfs_file_iops; + result->i_fop = &hpfs_file_ops; + result->i_nlink = 1; + } + unlock_new_inode(result); + } + hpfs_result = hpfs_i(result); + if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; + + if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { + hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); + goto bail1; + } + + /* + * Fill in the info from the directory if this is a newly created + * inode. + */ + + if (!result->i_ctime.tv_sec) { + if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) + result->i_ctime.tv_sec = 1; + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); + result->i_atime.tv_nsec = 0; + hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); + if (!hpfs_result->i_ea_mode && de->read_only) + result->i_mode &= ~0222; + if (!de->directory) { + if (result->i_size == -1) { + result->i_size = le32_to_cpu(de->file_size); + result->i_data.a_ops = &hpfs_aops; + hpfs_i(result)->mmu_private = result->i_size; + /* + * i_blocks should count the fnode and any anodes. + * We count 1 for the fnode and don't bother about + * anodes -- the disk heads are on the directory band + * and we want them to stay there. + */ + result->i_blocks = 1 + ((result->i_size + 511) >> 9); + } + } + } + + hpfs_brelse4(&qbh); + + /* + * Made it. + */ + + end: + end_add: + hpfs_unlock(dir->i_sb); + d_add(dentry, result); + return NULL; + + /* + * Didn't. 
+ */ + bail1: + + hpfs_brelse4(&qbh); + + /*bail:*/ + + hpfs_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); +} + +const struct file_operations hpfs_dir_ops = +{ + .llseek = hpfs_dir_lseek, + .read = generic_read_dir, + .readdir = hpfs_readdir, + .release = hpfs_dir_release, + .fsync = hpfs_file_fsync, +}; diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c new file mode 100644 index 00000000..1e0e2ac3 --- /dev/null +++ b/fs/hpfs/dnode.c @@ -0,0 +1,1084 @@ +/* + * linux/fs/hpfs/dnode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling directory dnode tree - adding, deleteing & searching for dirents + */ + +#include "hpfs_fn.h" + +static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + int i = 1; + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; + i++; + } + printk("HPFS: get_pos: not_found\n"); + return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; +} + +void hpfs_add_pos(struct inode *inode, loff_t *pos) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + int i = 0; + loff_t **ppos; + + if (hpfs_inode->i_rddir_off) + for (; hpfs_inode->i_rddir_off[i]; i++) + if (hpfs_inode->i_rddir_off[i] == pos) return; + if (!(i&0x0f)) { + if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { + printk("HPFS: out of memory for position list\n"); + return; + } + if (hpfs_inode->i_rddir_off) { + memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t)); + kfree(hpfs_inode->i_rddir_off); + } + hpfs_inode->i_rddir_off = ppos; + } + hpfs_inode->i_rddir_off[i] = pos; + hpfs_inode->i_rddir_off[i + 1] = NULL; +} + +void hpfs_del_pos(struct inode *inode, loff_t *pos) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + loff_t **i, **j; + + if (!hpfs_inode->i_rddir_off) goto not_f; + for (i = hpfs_inode->i_rddir_off; *i; i++) if (*i == pos) goto fnd; + goto not_f; + fnd: + for (j = i + 1; *j; j++) ; + *i = *(j - 1); + *(j - 1) = NULL; + if (j - 1 == hpfs_inode->i_rddir_off) { + kfree(hpfs_inode->i_rddir_off); + hpfs_inode->i_rddir_off = NULL; + } + return; + not_f: + /*printk("HPFS: warning: position pointer %p->%08x not found\n", pos, (int)*pos);*/ + return; +} + +static void for_all_poss(struct inode *inode, void (*f)(loff_t *, loff_t, loff_t), + loff_t p1, loff_t p2) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + loff_t **i; + + if (!hpfs_inode->i_rddir_off) return; + for (i = hpfs_inode->i_rddir_off; *i; i++) (*f)(*i, p1, p2); + return; +} + +static void hpfs_pos_subst(loff_t *p, loff_t f, loff_t t) +{ + if (*p == f) *p = t; +} + +/*void hpfs_hpfs_pos_substd(loff_t *p, loff_t f, loff_t t) +{ + if ((*p & ~0x3f) == (f & ~0x3f)) *p = (t & ~0x3f) | (*p & 0x3f); +}*/ + +static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c) +{ + if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { + int n = (*p & 0x3f) + c; + if (n > 0x3f) printk("HPFS: hpfs_pos_ins: %08x + %d\n", (int)*p, (int)c >> 8); + else *p = (*p & ~0x3f) | n; + } +} + +static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c) +{ + if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { + int n = (*p & 0x3f) - c; + if (n < 1) printk("HPFS: hpfs_pos_ins: %08x - %d\n", (int)*p, (int)c >> 8); + else *p = (*p & ~0x3f) | n; + } +} + +static struct hpfs_dirent *dnode_pre_last_de(struct dnode *d) +{ + struct hpfs_dirent *de, *de_end, *dee = NULL, *deee = NULL; + de_end = dnode_end_de(d); + for (de = 
dnode_first_de(d); de < de_end; de = de_next_de(de)) { + deee = dee; dee = de; + } + return deee; +} + +static struct hpfs_dirent *dnode_last_de(struct dnode *d) +{ + struct hpfs_dirent *de, *de_end, *dee = NULL; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + dee = de; + } + return dee; +} + +static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno ptr) +{ + struct hpfs_dirent *de; + if (!(de = dnode_last_de(d))) { + hpfs_error(s, "set_last_pointer: empty dnode %08x", le32_to_cpu(d->self)); + return; + } + if (hpfs_sb(s)->sb_chk) { + if (de->down) { + hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x", + le32_to_cpu(d->self), de_down_pointer(de)); + return; + } + if (le16_to_cpu(de->length) != 32) { + hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", le32_to_cpu(d->self)); + return; + } + } + if (ptr) { + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4); + if (le32_to_cpu(d->first_free) > 2048) { + hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4); + return; + } + de->length = cpu_to_le16(36); + de->down = 1; + *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr); + } +} + +/* Add an entry to dnode and don't care if it grows over 2048 bytes */ + +struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, + const unsigned char *name, + unsigned namelen, secno down_ptr) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + unsigned d_size = de_size(namelen, down_ptr); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + int c = hpfs_compare_names(s, name, namelen, de->name, de->namelen, de->last); + if (!c) { + hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, le32_to_cpu(d->self)); + return NULL; + } + if (c < 0) break; + } + memmove((char *)de + d_size, de, (char *)de_end - (char *)de); + memset(de, 0, d_size); + if (down_ptr) { + *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); + de->down = 1; + } + de->length = cpu_to_le16(d_size); + de->not_8x3 = hpfs_is_name_long(name, namelen); + de->namelen = namelen; + memcpy(de->name, name, namelen); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + d_size); + return de; +} + +/* Delete dirent and don't care about its subtree */ + +static void hpfs_delete_de(struct super_block *s, struct dnode *d, + struct hpfs_dirent *de) +{ + if (de->last) { + hpfs_error(s, "attempt to delete last dirent in dnode %08x", le32_to_cpu(d->self)); + return; + } + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - le16_to_cpu(de->length)); + memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de); +} + +static void fix_up_ptrs(struct super_block *s, struct dnode *d) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + dnode_secno dno = le32_to_cpu(d->self); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) + if (de->down) { + struct quad_buffer_head qbh; + struct dnode *dd; + if ((dd = hpfs_map_dnode(s, de_down_pointer(de), &qbh))) { + if (le32_to_cpu(dd->up) != dno || dd->root_dnode) { + dd->up = cpu_to_le32(dno); + dd->root_dnode = 0; + hpfs_mark_4buffers_dirty(&qbh); + } + hpfs_brelse4(&qbh); + } + } +} + +/* Add an entry to dnode and do dnode splitting if required */ + +static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, + const unsigned char *name, unsigned namelen, + 
struct hpfs_dirent *new_de, dnode_secno down_ptr) +{ + struct quad_buffer_head qbh, qbh1, qbh2; + struct dnode *d, *ad, *rd, *nd = NULL; + dnode_secno adno, rdno; + struct hpfs_dirent *de; + struct hpfs_dirent nde; + unsigned char *nname; + int h; + int pos; + struct buffer_head *bh; + struct fnode *fnode; + int c1, c2 = 0; + if (!(nname = kmalloc(256, GFP_NOFS))) { + printk("HPFS: out of memory, can't add to dnode\n"); + return 1; + } + go_up: + if (namelen >= 256) { + hpfs_error(i->i_sb, "hpfs_add_to_dnode: namelen == %d", namelen); + kfree(nd); + kfree(nname); + return 1; + } + if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) { + kfree(nd); + kfree(nname); + return 1; + } + go_up_a: + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_to_dnode")) { + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 1; + } + if (le32_to_cpu(d->first_free) + de_size(namelen, down_ptr) <= 2048) { + loff_t t; + copy_de(de=hpfs_add_de(i->i_sb, d, name, namelen, down_ptr), new_de); + t = get_pos(d, de); + for_all_poss(i, hpfs_pos_ins, t, 1); + for_all_poss(i, hpfs_pos_subst, 4, t); + for_all_poss(i, hpfs_pos_subst, 5, t + 1); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 0; + } + if (!nd) if (!(nd = kmalloc(0x924, GFP_NOFS))) { + /* 0x924 is a max size of dnode after adding a dirent with + max name length. We alloc this only once. There must + not be any error while splitting dnodes, otherwise the + whole directory, not only file we're adding, would + be lost. */ + printk("HPFS: out of memory for dnode splitting\n"); + hpfs_brelse4(&qbh); + kfree(nname); + return 1; + } + memcpy(nd, d, le32_to_cpu(d->first_free)); + copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de); + for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1); + h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10; + if (!(ad = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &adno, &qbh1))) { + hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 1; + } + i->i_size += 2048; + i->i_blocks += 4; + pos = 1; + for (de = dnode_first_de(nd); (char *)de_next_de(de) - (char *)nd < h; de = de_next_de(de)) { + copy_de(hpfs_add_de(i->i_sb, ad, de->name, de->namelen, de->down ? de_down_pointer(de) : 0), de); + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, ((loff_t)adno << 4) | pos); + pos++; + } + copy_de(new_de = &nde, de); + memcpy(nname, de->name, de->namelen); + name = nname; + namelen = de->namelen; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); + down_ptr = adno; + set_last_pointer(i->i_sb, ad, de->down ? 
de_down_pointer(de) : 0); + de = de_next_de(de); + memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); + nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - ((char *)de - (char *)nd - 20)); + memcpy(d, nd, le32_to_cpu(nd->first_free)); + for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); + fix_up_ptrs(i->i_sb, ad); + if (!d->root_dnode) { + ad->up = d->up; + dno = le32_to_cpu(ad->up); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + goto go_up; + } + if (!(rd = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &rdno, &qbh2))) { + hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); + hpfs_brelse4(&qbh); + hpfs_brelse4(&qbh1); + kfree(nd); + kfree(nname); + return 1; + } + i->i_size += 2048; + i->i_blocks += 4; + rd->root_dnode = 1; + rd->up = d->up; + if (!(fnode = hpfs_map_fnode(i->i_sb, le32_to_cpu(d->up), &bh))) { + hpfs_free_dnode(i->i_sb, rdno); + hpfs_brelse4(&qbh); + hpfs_brelse4(&qbh1); + hpfs_brelse4(&qbh2); + kfree(nd); + kfree(nname); + return 1; + } + fnode->u.external[0].disk_secno = cpu_to_le32(rdno); + mark_buffer_dirty(bh); + brelse(bh); + hpfs_i(i)->i_dno = rdno; + d->up = ad->up = cpu_to_le32(rdno); + d->root_dnode = ad->root_dnode = 0; + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + qbh = qbh2; + set_last_pointer(i->i_sb, rd, dno); + dno = rdno; + d = rd; + goto go_up_a; +} + +/* + * Add an entry to directory btree. + * I hate such crazy directory structure. + * It's easy to read but terrible to write. + * I wrote this directory code 4 times. + * I hope, now it's finally bug-free. + */ + +int hpfs_add_dirent(struct inode *i, + const unsigned char *name, unsigned namelen, + struct hpfs_dirent *new_de) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct dnode *d; + struct hpfs_dirent *de, *de_end; + struct quad_buffer_head qbh; + dnode_secno dno; + int c; + int c1, c2 = 0; + dno = hpfs_inode->i_dno; + down: + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_dirent")) return 1; + if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 1; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + if (!(c = hpfs_compare_names(i->i_sb, name, namelen, de->name, de->namelen, de->last))) { + hpfs_brelse4(&qbh); + return -1; + } + if (c < 0) { + if (de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto down; + } + break; + } + } + hpfs_brelse4(&qbh); + if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_ADD)) { + c = 1; + goto ret; + } + i->i_version++; + c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); + ret: + return c; +} + +/* + * Find dirent with higher name in 'from' subtree and move it to 'to' dnode. 
+ * Return the dnode we moved from (to be checked later if it's empty) + */ + +static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) +{ + dnode_secno dno, ddno; + dnode_secno chk_up = to; + struct dnode *dnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de, *nde; + int a; + loff_t t; + int c1, c2 = 0; + dno = from; + while (1) { + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "move_to_top")) + return 0; + if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0; + if (hpfs_sb(i->i_sb)->sb_chk) { + if (le32_to_cpu(dnode->up) != chk_up) { + hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x", + dno, chk_up, le32_to_cpu(dnode->up)); + hpfs_brelse4(&qbh); + return 0; + } + chk_up = dno; + } + if (!(de = dnode_last_de(dnode))) { + hpfs_error(i->i_sb, "move_to_top: dnode %08x has no last de", dno); + hpfs_brelse4(&qbh); + return 0; + } + if (!de->down) break; + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + } + while (!(de = dnode_pre_last_de(dnode))) { + dnode_secno up = le32_to_cpu(dnode->up); + hpfs_brelse4(&qbh); + hpfs_free_dnode(i->i_sb, dno); + i->i_size -= 2048; + i->i_blocks -= 4; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, 5); + if (up == to) return to; + if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return 0; + if (dnode->root_dnode) { + hpfs_error(i->i_sb, "move_to_top: got to root_dnode while moving from %08x to %08x", from, to); + hpfs_brelse4(&qbh); + return 0; + } + de = dnode_last_de(dnode); + if (!de || !de->down) { + hpfs_error(i->i_sb, "move_to_top: dnode %08x doesn't point down to %08x", up, dno); + hpfs_brelse4(&qbh); + return 0; + } + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); + de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); + de->down = 0; + hpfs_mark_4buffers_dirty(&qbh); + dno = up; + } + t = get_pos(dnode, de); + for_all_poss(i, hpfs_pos_subst, t, 4); + for_all_poss(i, hpfs_pos_subst, t + 1, 5); + if (!(nde = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { + hpfs_error(i->i_sb, "out of memory for dirent - directory will be corrupted"); + hpfs_brelse4(&qbh); + return 0; + } + memcpy(nde, de, le16_to_cpu(de->length)); + ddno = de->down ? de_down_pointer(de) : 0; + hpfs_delete_de(i->i_sb, dnode, de); + set_last_pointer(i->i_sb, dnode, ddno); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + a = hpfs_add_to_dnode(i, to, nde->name, nde->namelen, nde, from); + kfree(nde); + if (a) return 0; + return dno; +} + +/* + * Check if a dnode is empty and delete it from the tree + * (chkdsk doesn't like empty dnodes) + */ + +static void delete_empty_dnode(struct inode *i, dnode_secno dno) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct quad_buffer_head qbh; + struct dnode *dnode; + dnode_secno down, up, ndown; + int p; + struct hpfs_dirent *de; + int c1, c2 = 0; + try_it_again: + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "delete_empty_dnode")) return; + if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return; + if (le32_to_cpu(dnode->first_free) > 56) goto end; + if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) { + struct hpfs_dirent *de_end; + int root = dnode->root_dnode; + up = le32_to_cpu(dnode->up); + de = dnode_first_de(dnode); + down = de->down ? 
de_down_pointer(de) : 0; + if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) { + hpfs_error(i->i_sb, "delete_empty_dnode: root dnode %08x is empty", dno); + goto end; + } + hpfs_brelse4(&qbh); + hpfs_free_dnode(i->i_sb, dno); + i->i_size -= 2048; + i->i_blocks -= 4; + if (root) { + struct fnode *fnode; + struct buffer_head *bh; + struct dnode *d1; + struct quad_buffer_head qbh1; + if (hpfs_sb(i->i_sb)->sb_chk) + if (up != i->i_ino) { + hpfs_error(i->i_sb, + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + dno, up, (unsigned long)i->i_ino); + return; + } + if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { + d1->up = cpu_to_le32(up); + d1->root_dnode = 1; + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + if ((fnode = hpfs_map_fnode(i->i_sb, up, &bh))) { + fnode->u.external[0].disk_secno = cpu_to_le32(down); + mark_buffer_dirty(bh); + brelse(bh); + } + hpfs_inode->i_dno = down; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) 12); + return; + } + if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return; + p = 1; + de_end = dnode_end_de(dnode); + for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de), p++) + if (de->down) if (de_down_pointer(de) == dno) goto fnd; + hpfs_error(i->i_sb, "delete_empty_dnode: pointer to dnode %08x not found in dnode %08x", dno, up); + goto end; + fnd: + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); + if (!down) { + de->down = 0; + de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); + memmove(de_next_de(de), (char *)de_next_de(de) + 4, + (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); + } else { + struct dnode *d1; + struct quad_buffer_head qbh1; + *(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4) = down; + if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { + d1->up = cpu_to_le32(up); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + } + } else { + hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, le32_to_cpu(dnode->first_free)); + goto end; + } + + if (!de->last) { + struct hpfs_dirent *de_next = de_next_de(de); + struct hpfs_dirent *de_cp; + struct dnode *d1; + struct quad_buffer_head qbh1; + if (!de_next->down) goto endm; + ndown = de_down_pointer(de_next); + if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { + printk("HPFS: out of memory for dtree balancing\n"); + goto endm; + } + memcpy(de_cp, de, le16_to_cpu(de->length)); + hpfs_delete_de(i->i_sb, dnode, de); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, 4); + for_all_poss(i, hpfs_pos_del, ((loff_t)up << 4) | p, 1); + if (de_cp->down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de_cp), &qbh1))) { + d1->up = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? 
de_down_pointer(de_cp) : 0); + /*printk("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", up, ndown, down, dno);*/ + dno = up; + kfree(de_cp); + goto try_it_again; + } else { + struct hpfs_dirent *de_prev = dnode_pre_last_de(dnode); + struct hpfs_dirent *de_cp; + struct dnode *d1; + struct quad_buffer_head qbh1; + dnode_secno dlp; + if (!de_prev) { + hpfs_error(i->i_sb, "delete_empty_dnode: empty dnode %08x", up); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + dno = up; + goto try_it_again; + } + if (!de_prev->down) goto endm; + ndown = de_down_pointer(de_prev); + if ((d1 = hpfs_map_dnode(i->i_sb, ndown, &qbh1))) { + struct hpfs_dirent *del = dnode_last_de(d1); + dlp = del->down ? de_down_pointer(del) : 0; + if (!dlp && down) { + if (le32_to_cpu(d1->first_free) > 2044) { + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { + printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); + printk("HPFS: warning: terminating balancing operation\n"); + } + hpfs_brelse4(&qbh1); + goto endm; + } + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { + printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); + printk("HPFS: warning: goin'on\n"); + } + del->length = cpu_to_le16(le16_to_cpu(del->length) + 4); + del->down = 1; + d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4); + } + if (dlp && !down) { + del->length = cpu_to_le16(le16_to_cpu(del->length) - 4); + del->down = 0; + d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); + } else if (down) + *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); + } else goto endm; + if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { + printk("HPFS: out of memory for dtree balancing\n"); + hpfs_brelse4(&qbh1); + goto endm; + } + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); + hpfs_delete_de(i->i_sb, dnode, de_prev); + if (!de_prev->down) { + de_prev->length = cpu_to_le16(le16_to_cpu(de_prev->length) + 4); + de_prev->down = 1; + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); + } + *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, ((loff_t)up << 4) | (p - 1)); + if (down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de), &qbh1))) { + d1->up = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, dlp); + dno = up; + kfree(de_cp); + goto try_it_again; + } + endm: + hpfs_mark_4buffers_dirty(&qbh); + end: + hpfs_brelse4(&qbh); +} + + +/* Delete dirent from directory */ + +int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, + struct quad_buffer_head *qbh, int depth) +{ + struct dnode *dnode = qbh->data; + dnode_secno down = 0; + loff_t t; + if (de->first || de->last) { + hpfs_error(i->i_sb, "hpfs_remove_dirent: attempt to delete first or last dirent in dnode %08x", dno); + hpfs_brelse4(qbh); + return 1; + } + if (de->down) down = de_down_pointer(de); + if (depth && (de->down || (de == dnode_first_de(dnode) && de_next_de(de)->last))) { + if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_DEL)) { + hpfs_brelse4(qbh); + return 2; + } + } + i->i_version++; + for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); + hpfs_delete_de(i->i_sb, 
dnode, de); + hpfs_mark_4buffers_dirty(qbh); + hpfs_brelse4(qbh); + if (down) { + dnode_secno a = move_to_top(i, down, dno); + for_all_poss(i, hpfs_pos_subst, 5, t); + if (a) delete_empty_dnode(i, a); + return !a; + } + delete_empty_dnode(i, dno); + return 0; +} + +void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, + int *n_subdirs, int *n_items) +{ + struct dnode *dnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + dnode_secno ptr, odno = 0; + int c1, c2 = 0; + int d1, d2 = 0; + go_down: + if (n_dnodes) (*n_dnodes)++; + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, dno, &c1, &c2, "hpfs_count_dnodes #1")) return; + ptr = 0; + go_up: + if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; + if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && le32_to_cpu(dnode->up) != odno) + hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, le32_to_cpu(dnode->up)); + de = dnode_first_de(dnode); + if (ptr) while(1) { + if (de->down) if (de_down_pointer(de) == ptr) goto process_de; + if (de->last) { + hpfs_brelse4(&qbh); + hpfs_error(s, "hpfs_count_dnodes: pointer to dnode %08x not found in dnode %08x, got here from %08x", + ptr, dno, odno); + return; + } + de = de_next_de(de); + } + next_de: + if (de->down) { + odno = dno; + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto go_down; + } + process_de: + if (!de->first && !de->last && de->directory && n_subdirs) (*n_subdirs)++; + if (!de->first && !de->last && n_items) (*n_items)++; + if ((de = de_next_de(de)) < dnode_end_de(dnode)) goto next_de; + ptr = dno; + dno = le32_to_cpu(dnode->up); + if (dnode->root_dnode) { + hpfs_brelse4(&qbh); + return; + } + hpfs_brelse4(&qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ptr, &d1, &d2, "hpfs_count_dnodes #2")) return; + odno = -1; + goto go_up; +} + +static struct hpfs_dirent *map_nth_dirent(struct super_block *s, dnode_secno dno, int n, + struct quad_buffer_head *qbh, struct dnode **dn) +{ + int i; + struct hpfs_dirent *de, *de_end; + struct dnode *dnode; + dnode = hpfs_map_dnode(s, dno, qbh); + if (!dnode) return NULL; + if (dn) *dn=dnode; + de = dnode_first_de(dnode); + de_end = dnode_end_de(dnode); + for (i = 1; de < de_end; i++, de = de_next_de(de)) { + if (i == n) { + return de; + } + if (de->last) break; + } + hpfs_brelse4(qbh); + hpfs_error(s, "map_nth_dirent: n too high; dnode = %08x, requested %08x", dno, n); + return NULL; +} + +dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno) +{ + struct quad_buffer_head qbh; + dnode_secno d = dno; + dnode_secno up = 0; + struct hpfs_dirent *de; + int c1, c2 = 0; + + again: + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, d, &c1, &c2, "hpfs_de_as_down_as_possible")) + return d; + if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno; + if (hpfs_sb(s)->sb_chk) + if (up && le32_to_cpu(((struct dnode *)qbh.data)->up) != up) + hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, le32_to_cpu(((struct dnode *)qbh.data)->up)); + if (!de->down) { + hpfs_brelse4(&qbh); + return d; + } + up = d; + d = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto again; +} + +struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, + struct quad_buffer_head *qbh) +{ + loff_t pos; + unsigned c; + dnode_secno dno; + struct hpfs_dirent *de, *d; + struct hpfs_dirent *up_de; + struct hpfs_dirent *end_up_de; + struct dnode *dnode; + struct dnode *up_dnode; + struct quad_buffer_head qbh0; + 
+ pos = *posp; + dno = pos >> 6 << 2; + pos &= 077; + if (!(de = map_nth_dirent(inode->i_sb, dno, pos, qbh, &dnode))) + goto bail; + + /* Going to the next dirent */ + if ((d = de_next_de(de)) < dnode_end_de(dnode)) { + if (!(++*posp & 077)) { + hpfs_error(inode->i_sb, + "map_pos_dirent: pos crossed dnode boundary; pos = %08llx", + (unsigned long long)*posp); + goto bail; + } + /* We're going down the tree */ + if (d->down) { + *posp = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, de_down_pointer(d)) << 4) + 1; + } + + return de; + } + + /* Going up */ + if (dnode->root_dnode) goto bail; + + if (!(up_dnode = hpfs_map_dnode(inode->i_sb, le32_to_cpu(dnode->up), &qbh0))) + goto bail; + + end_up_de = dnode_end_de(up_dnode); + c = 0; + for (up_de = dnode_first_de(up_dnode); up_de < end_up_de; + up_de = de_next_de(up_de)) { + if (!(++c & 077)) hpfs_error(inode->i_sb, + "map_pos_dirent: pos crossed dnode boundary; dnode = %08x", le32_to_cpu(dnode->up)); + if (up_de->down && de_down_pointer(up_de) == dno) { + *posp = ((loff_t) le32_to_cpu(dnode->up) << 4) + c; + hpfs_brelse4(&qbh0); + return de; + } + } + + hpfs_error(inode->i_sb, "map_pos_dirent: pointer to dnode %08x not found in parent dnode %08x", + dno, le32_to_cpu(dnode->up)); + hpfs_brelse4(&qbh0); + + bail: + *posp = 12; + return de; +} + +/* Find a dirent in tree */ + +struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, + const unsigned char *name, unsigned len, + dnode_secno *dd, struct quad_buffer_head *qbh) +{ + struct dnode *dnode; + struct hpfs_dirent *de; + struct hpfs_dirent *de_end; + int c1, c2 = 0; + + if (!S_ISDIR(inode->i_mode)) hpfs_error(inode->i_sb, "map_dirent: not a directory\n"); + again: + if (hpfs_sb(inode->i_sb)->sb_chk) + if (hpfs_stop_cycles(inode->i_sb, dno, &c1, &c2, "map_dirent")) return NULL; + if (!(dnode = hpfs_map_dnode(inode->i_sb, dno, qbh))) return NULL; + + de_end = dnode_end_de(dnode); + for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de)) { + int t = hpfs_compare_names(inode->i_sb, name, len, de->name, de->namelen, de->last); + if (!t) { + if (dd) *dd = dno; + return de; + } + if (t < 0) { + if (de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(qbh); + goto again; + } + break; + } + } + hpfs_brelse4(qbh); + return NULL; +} + +/* + * Remove empty directory. In normal cases it is only one dnode with two + * entries, but we must handle also such obscure cases when it's a tree + * of empty dnodes. + */ + +void hpfs_remove_dtree(struct super_block *s, dnode_secno dno) +{ + struct quad_buffer_head qbh; + struct dnode *dnode; + struct hpfs_dirent *de; + dnode_secno d1, d2, rdno = dno; + while (1) { + if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; + de = dnode_first_de(dnode); + if (de->last) { + if (de->down) d1 = de_down_pointer(de); + else goto error; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + dno = d1; + } else break; + } + if (!de->first) goto error; + d1 = de->down ? de_down_pointer(de) : 0; + de = de_next_de(de); + if (!de->last) goto error; + d2 = de->down ? de_down_pointer(de) : 0; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + do { + while (d1) { + if (!(dnode = hpfs_map_dnode(s, dno = d1, &qbh))) return; + de = dnode_first_de(dnode); + if (!de->last) goto error; + d1 = de->down ? 
de_down_pointer(de) : 0; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + } + d1 = d2; + d2 = 0; + } while (d1); + return; + error: + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + hpfs_error(s, "directory %08x is corrupted or not empty", rdno); +} + +/* + * Find dirent for specified fnode. Use truncated 15-char name in fnode as + * a help for searching. + */ + +struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, + struct fnode *f, struct quad_buffer_head *qbh) +{ + unsigned char *name1; + unsigned char *name2; + int name1len, name2len; + struct dnode *d; + dnode_secno dno, downd; + struct fnode *upf; + struct buffer_head *bh; + struct hpfs_dirent *de, *de_end; + int c; + int c1, c2 = 0; + int d1, d2 = 0; + name1 = f->name; + if (!(name2 = kmalloc(256, GFP_NOFS))) { + printk("HPFS: out of memory, can't map dirent\n"); + return NULL; + } + if (f->len <= 15) + memcpy(name2, name1, name1len = name2len = f->len); + else { + memcpy(name2, name1, 15); + memset(name2 + 15, 0xff, 256 - 15); + /*name2[15] = 0xff;*/ + name1len = 15; name2len = 256; + } + if (!(upf = hpfs_map_fnode(s, le32_to_cpu(f->up), &bh))) { + kfree(name2); + return NULL; + } + if (!upf->dirflag) { + brelse(bh); + hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); + kfree(name2); + return NULL; + } + dno = le32_to_cpu(upf->u.external[0].disk_secno); + brelse(bh); + go_down: + downd = 0; + go_up: + if (!(d = hpfs_map_dnode(s, dno, qbh))) { + kfree(name2); + return NULL; + } + de_end = dnode_end_de(d); + de = dnode_first_de(d); + if (downd) { + while (de < de_end) { + if (de->down) if (de_down_pointer(de) == downd) goto f; + de = de_next_de(de); + } + hpfs_error(s, "pointer to dnode %08x not found in dnode %08x", downd, dno); + hpfs_brelse4(qbh); + kfree(name2); + return NULL; + } + next_de: + if (le32_to_cpu(de->fnode) == fno) { + kfree(name2); + return de; + } + c = hpfs_compare_names(s, name1, name1len, de->name, de->namelen, de->last); + if (c < 0 && de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { + kfree(name2); + return NULL; + } + goto go_down; + } + f: + if (le32_to_cpu(de->fnode) == fno) { + kfree(name2); + return de; + } + c = hpfs_compare_names(s, name2, name2len, de->name, de->namelen, de->last); + if (c < 0 && !de->last) goto not_found; + if ((de = de_next_de(de)) < de_end) goto next_de; + if (d->root_dnode) goto not_found; + downd = dno; + dno = le32_to_cpu(d->up); + hpfs_brelse4(qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) { + kfree(name2); + return NULL; + } + goto go_up; + not_found: + hpfs_brelse4(qbh); + hpfs_error(s, "dirent for fnode %08x not found", fno); + kfree(name2); + return NULL; +} diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c new file mode 100644 index 00000000..d8b84d11 --- /dev/null +++ b/fs/hpfs/ea.c @@ -0,0 +1,367 @@ +/* + * linux/fs/hpfs/ea.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling extended attributes + */ + +#include "hpfs_fn.h" + +/* Remove external extended attributes. 
ano specifies whether a is a + direct sector where eas starts or an anode */ + +void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) +{ + unsigned pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + struct extended_attribute *ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; + if (ea->indirect) { + if (ea_valuelen(ea) != 8) { + hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x", + ano ? "anode" : "sectors", a, pos); + return; + } + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4)) + return; + hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + if (!ano) hpfs_free_sectors(s, a, (len+511) >> 9); + else { + struct buffer_head *bh; + struct anode *anode; + if ((anode = hpfs_map_anode(s, a, &bh))) { + hpfs_remove_btree(s, &anode->btree); + brelse(bh); + hpfs_free_sectors(s, a, 1); + } + } +} + +static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size) +{ + char *ret; + if (!(ret = kmalloc(size + 1, GFP_NOFS))) { + printk("HPFS: out of memory for EA\n"); + return NULL; + } + if (hpfs_ea_read(s, a, ano, 0, size, ret)) { + kfree(ret); + return NULL; + } + ret[size] = 0; + return ret; +} + +static void set_indirect_ea(struct super_block *s, int ano, secno a, + const char *data, int size) +{ + hpfs_ea_write(s, a, ano, 0, size, data); +} + +/* Read an extended attribute named 'key' into the provided buffer */ + +int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, + char *buf, int size) +{ + unsigned pos; + int ano, len; + secno a; + char ex[4 + 255 + 1 + 8]; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea->indirect) + goto indirect; + if (ea_valuelen(ea) >= size) + return -EINVAL; + memcpy(buf, ea_data(ea), ea_valuelen(ea)); + buf[ea_valuelen(ea)] = 0; + return 0; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode->ea_anode; + pos = 0; + while (pos < len) { + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return -EIO; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 
8 : 0), ex + 4)) + return -EIO; + if (!strcmp(ea->name, key)) { + if (ea->indirect) + goto indirect; + if (ea_valuelen(ea) >= size) + return -EINVAL; + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), buf)) + return -EIO; + buf[ea_valuelen(ea)] = 0; + return 0; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + return -ENOENT; +indirect: + if (ea_len(ea) >= size) + return -EINVAL; + if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf)) + return -EIO; + buf[ea_len(ea)] = 0; + return 0; +} + +/* Read an extended attribute named 'key' */ +char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *size) +{ + char *ret; + unsigned pos; + int ano, len; + secno a; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea->indirect) + return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { + printk("HPFS: out of memory for EA\n"); + return NULL; + } + memcpy(ret, ea_data(ea), ea_valuelen(ea)); + ret[ea_valuelen(ea)] = 0; + return ret; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode->ea_anode; + pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return NULL; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) + return NULL; + if (!strcmp(ea->name, key)) { + if (ea->indirect) + return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { + printk("HPFS: out of memory for EA\n"); + return NULL; + } + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) { + kfree(ret); + return NULL; + } + ret[ea_valuelen(ea)] = 0; + return ret; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + return NULL; +} + +/* + * Update or create extended attribute 'key' with value 'data'. Note that + * when this ea exists, it MUST have the same size as size of data. + * This driver can't change sizes of eas ('cause I just don't need it). + */ + +void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, + const char *data, int size) +{ + fnode_secno fno = inode->i_ino; + struct super_block *s = inode->i_sb; + unsigned pos; + int ano, len; + secno a; + unsigned char h[4]; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea->indirect) { + if (ea_len(ea) == size) + set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); + } else if (ea_valuelen(ea) == size) { + memcpy(ea_data(ea), data, size); + } + return; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode->ea_anode; + pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 
8 : 0), ex + 4)) + return; + if (!strcmp(ea->name, key)) { + if (ea->indirect) { + if (ea_len(ea) == size) + set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); + } + else { + if (ea_valuelen(ea) == size) + hpfs_ea_write(s, a, ano, pos + 4 + ea->namelen + 1, size, data); + } + return; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + if (!le16_to_cpu(fnode->ea_offs)) { + /*if (le16_to_cpu(fnode->ea_size_s)) { + hpfs_error(s, "fnode %08x: ea_size_s == %03x, ea_offs == 0", + inode->i_ino, le16_to_cpu(fnode->ea_size_s)); + return; + }*/ + fnode->ea_offs = cpu_to_le16(0xc4); + } + if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { + hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", + (unsigned long)inode->i_ino, + le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + return; + } + if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) && + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5 <= 0x200) { + ea = fnode_end_ea(fnode); + *(char *)ea = 0; + ea->namelen = strlen(key); + ea->valuelen_lo = size; + ea->valuelen_hi = size >> 8; + strcpy(ea->name, key); + memcpy(ea_data(ea), data, size); + fnode->ea_size_s = cpu_to_le16(le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5); + goto ret; + } + /* Most the code here is 99.9993422% unused. I hope there are no bugs. + But what .. HPFS.IFS has also bugs in ea management. */ + if (le16_to_cpu(fnode->ea_size_s) && !le32_to_cpu(fnode->ea_size_l)) { + secno n; + struct buffer_head *bh; + char *data; + if (!(n = hpfs_alloc_sector(s, fno, 1, 0))) return; + if (!(data = hpfs_get_sector(s, n, &bh))) { + hpfs_free_sectors(s, n, 1); + return; + } + memcpy(data, fnode_ea(fnode), le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_s = cpu_to_le16(0); + fnode->ea_secno = cpu_to_le32(n); + fnode->ea_anode = cpu_to_le32(0); + mark_buffer_dirty(bh); + brelse(bh); + } + pos = le32_to_cpu(fnode->ea_size_l) + 5 + strlen(key) + size; + len = (le32_to_cpu(fnode->ea_size_l) + 511) >> 9; + if (pos >= 30000) goto bail; + while (((pos + 511) >> 9) > len) { + if (!len) { + secno q = hpfs_alloc_sector(s, fno, 1, 0); + if (!q) goto bail; + fnode->ea_secno = cpu_to_le32(q); + fnode->ea_anode = 0; + len++; + } else if (!fnode->ea_anode) { + if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) { + len++; + } else { + /* Aargh... 
don't know how to create ea anodes :-( */ + /*struct buffer_head *bh; + struct anode *anode; + anode_secno a_s; + if (!(anode = hpfs_alloc_anode(s, fno, &a_s, &bh))) + goto bail; + anode->up = cpu_to_le32(fno); + anode->btree.fnode_parent = 1; + anode->btree.n_free_nodes--; + anode->btree.n_used_nodes++; + anode->btree.first_free = cpu_to_le16(le16_to_cpu(anode->btree.first_free) + 12); + anode->u.external[0].disk_secno = cpu_to_le32(le32_to_cpu(fnode->ea_secno)); + anode->u.external[0].file_secno = cpu_to_le32(0); + anode->u.external[0].length = cpu_to_le32(len); + mark_buffer_dirty(bh); + brelse(bh); + fnode->ea_anode = 1; + fnode->ea_secno = cpu_to_le32(a_s);*/ + secno new_sec; + int i; + if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9)))) + goto bail; + for (i = 0; i < len; i++) { + struct buffer_head *bh1, *bh2; + void *b1, *b2; + if (!(b1 = hpfs_map_sector(s, le32_to_cpu(fnode->ea_secno) + i, &bh1, len - i - 1))) { + hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); + goto bail; + } + if (!(b2 = hpfs_get_sector(s, new_sec + i, &bh2))) { + brelse(bh1); + hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); + goto bail; + } + memcpy(b2, b1, 512); + brelse(bh1); + mark_buffer_dirty(bh2); + brelse(bh2); + } + hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno), len); + fnode->ea_secno = cpu_to_le32(new_sec); + len = (pos + 511) >> 9; + } + } + if (fnode->ea_anode) { + if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno), + 0, len) != -1) { + len++; + } else { + goto bail; + } + } + } + h[0] = 0; + h[1] = strlen(key); + h[2] = size & 0xff; + h[3] = size >> 8; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; + fnode->ea_size_l = cpu_to_le32(pos); + ret: + hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size; + return; + bail: + if (le32_to_cpu(fnode->ea_secno)) + if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); + else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9)); + else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0); +} + diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c new file mode 100644 index 00000000..89c500ee --- /dev/null +++ b/fs/hpfs/file.c @@ -0,0 +1,166 @@ +/* + * linux/fs/hpfs/file.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * file VFS functions + */ + +#include "hpfs_fn.h" + +#define BLOCKS(size) (((size) + 511) >> 9) + +static int hpfs_file_release(struct inode *inode, struct file *file) +{ + hpfs_lock(inode->i_sb); + hpfs_write_if_changed(inode); + hpfs_unlock(inode->i_sb); + return 0; +} + +int hpfs_file_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + return sync_blockdev(inode->i_sb->s_bdev); +} + +/* + * generic_file_read often calls bmap with non-existing sector, + * so we must ignore such errors. 
+ */ + +static secno hpfs_bmap(struct inode *inode, unsigned file_secno) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + unsigned n, disk_secno; + struct fnode *fnode; + struct buffer_head *bh; + if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; + n = file_secno - hpfs_inode->i_file_sec; + if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; + if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; + disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); + if (disk_secno == -1) return 0; + if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; + return disk_secno; +} + +static void hpfs_truncate(struct inode *i) +{ + if (IS_IMMUTABLE(i)) return /*-EPERM*/; + hpfs_lock_assert(i->i_sb); + + hpfs_i(i)->i_n_secs = 0; + i->i_blocks = 1 + ((i->i_size + 511) >> 9); + hpfs_i(i)->mmu_private = i->i_size; + hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); + hpfs_write_inode(i); + hpfs_i(i)->i_n_secs = 0; +} + +static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + int r; + secno s; + hpfs_lock(inode->i_sb); + s = hpfs_bmap(inode, iblock); + if (s) { + map_bh(bh_result, inode->i_sb, s); + goto ret_0; + } + if (!create) goto ret_0; + if (iblock<<9 != hpfs_i(inode)->mmu_private) { + BUG(); + r = -EIO; + goto ret_r; + } + if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) { + hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1); + r = -ENOSPC; + goto ret_r; + } + inode->i_blocks++; + hpfs_i(inode)->mmu_private += 512; + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, s); + ret_0: + r = 0; + ret_r: + hpfs_unlock(inode->i_sb); + return r; +} + +static int hpfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,hpfs_get_block, wbc); +} + +static int hpfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,hpfs_get_block); +} + +static int hpfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hpfs_get_block, + &hpfs_i(mapping->host)->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,hpfs_get_block); +} + +const struct address_space_operations hpfs_aops = { + .readpage = hpfs_readpage, + .writepage = hpfs_writepage, + .write_begin = hpfs_write_begin, + .write_end = generic_write_end, + .bmap = _hpfs_bmap +}; + +static ssize_t hpfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t retval; + + retval = do_sync_write(file, buf, count, ppos); + if (retval > 0) { + hpfs_lock(file->f_path.dentry->d_sb); + hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1; + hpfs_unlock(file->f_path.dentry->d_sb); + } + return retval; +} + +const struct file_operations hpfs_file_ops = +{ + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = hpfs_file_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .release = hpfs_file_release, + .fsync = hpfs_file_fsync, + 
.splice_read = generic_file_splice_read, +}; + +const struct inode_operations hpfs_file_iops = +{ + .truncate = hpfs_truncate, + .setattr = hpfs_setattr, +}; diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h new file mode 100644 index 00000000..8b0650aa --- /dev/null +++ b/fs/hpfs/hpfs.h @@ -0,0 +1,568 @@ +/* + * linux/fs/hpfs/hpfs.h + * + * HPFS structures by Chris Smith, 1993 + * + * a little bit modified by Mikulas Patocka, 1998-1999 + */ + +/* The paper + + Duncan, Roy + Design goals and implementation of the new High Performance File System + Microsoft Systems Journal Sept 1989 v4 n5 p1(13) + + describes what HPFS looked like when it was new, and it is the source + of most of the information given here. The rest is conjecture. + + For definitive information on the Duncan paper, see it, not this file. + For definitive information on HPFS, ask somebody else -- this is guesswork. + There are certain to be many mistakes. */ + +#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +#error unknown endian +#endif + +/* Notation */ + +typedef u32 secno; /* sector number, partition relative */ + +typedef secno dnode_secno; /* sector number of a dnode */ +typedef secno fnode_secno; /* sector number of an fnode */ +typedef secno anode_secno; /* sector number of an anode */ + +typedef u32 time32_t; /* 32-bit time_t type */ + +/* sector 0 */ + +/* The boot block is very like a FAT boot block, except that the + 29h signature byte is 28h instead, and the ID string is "HPFS". */ + +#define BB_MAGIC 0xaa55 + +struct hpfs_boot_block +{ + u8 jmp[3]; + u8 oem_id[8]; + u8 bytes_per_sector[2]; /* 512 */ + u8 sectors_per_cluster; + u8 n_reserved_sectors[2]; + u8 n_fats; + u8 n_rootdir_entries[2]; + u8 n_sectors_s[2]; + u8 media_byte; + u16 sectors_per_fat; + u16 sectors_per_track; + u16 heads_per_cyl; + u32 n_hidden_sectors; + u32 n_sectors_l; /* size of partition */ + u8 drive_number; + u8 mbz; + u8 sig_28h; /* 28h */ + u8 vol_serno[4]; + u8 vol_label[11]; + u8 sig_hpfs[8]; /* "HPFS " */ + u8 pad[448]; + u16 magic; /* aa55 */ +}; + + +/* sector 16 */ + +/* The super block has the pointer to the root directory. */ + +#define SB_MAGIC 0xf995e849 + +struct hpfs_super_block +{ + u32 magic; /* f995 e849 */ + u32 magic1; /* fa53 e9c5, more magic? */ + u8 version; /* version of a filesystem usually 2 */ + u8 funcversion; /* functional version - oldest version + of filesystem that can understand + this disk */ + u16 zero; /* 0 */ + fnode_secno root; /* fnode of root directory */ + secno n_sectors; /* size of filesystem */ + u32 n_badblocks; /* number of bad blocks */ + secno bitmaps; /* pointers to free space bit maps */ + u32 zero1; /* 0 */ + secno badblocks; /* bad block list */ + u32 zero3; /* 0 */ + time32_t last_chkdsk; /* date last checked, 0 if never */ + time32_t last_optimize; /* date last optimized, 0 if never */ + secno n_dir_band; /* number of sectors in dir band */ + secno dir_band_start; /* first sector in dir band */ + secno dir_band_end; /* last sector in dir band */ + secno dir_band_bitmap; /* free space map, 1 dnode per bit */ + u8 volume_name[32]; /* not used */ + secno user_id_table; /* 8 preallocated sectors - user id */ + u32 zero6[103]; /* 0 */ +}; + + +/* sector 17 */ + +/* The spare block has pointers to spare sectors. */ + +#define SP_MAGIC 0xf9911849 + +struct hpfs_spare_block +{ + u32 magic; /* f991 1849 */ + u32 magic1; /* fa52 29c5, more magic? 
*/ + +#ifdef __LITTLE_ENDIAN + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 fast: 1; /* partition was fast formatted */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ +#else + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 fast: 1; /* partition was fast formatted */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 install_dasd_limits: 1; /* HPFS386 flags */ + u8 resynch_dasd_limits: 1; + u8 dasd_limits_operational: 1; + u8 multimedia_active: 1; + u8 dce_acls_active: 1; + u8 dasd_limits_dirty: 1; + u8 flag67: 2; +#else + u8 flag67: 2; + u8 dasd_limits_dirty: 1; + u8 dce_acls_active: 1; + u8 multimedia_active: 1; + u8 dasd_limits_operational: 1; + u8 resynch_dasd_limits: 1; + u8 install_dasd_limits: 1; /* HPFS386 flags */ +#endif + + u8 mm_contlgulty; + u8 unused; + + secno hotfix_map; /* info about remapped bad sectors */ + u32 n_spares_used; /* number of hotfixes */ + u32 n_spares; /* number of spares in hotfix map */ + u32 n_dnode_spares_free; /* spare dnodes unused */ + u32 n_dnode_spares; /* length of spare_dnodes[] list, + follows in this block*/ + secno code_page_dir; /* code page directory block */ + u32 n_code_pages; /* number of code pages */ + u32 super_crc; /* on HPFS386 and LAN Server this is + checksum of superblock, on normal + OS/2 unused */ + u32 spare_crc; /* on HPFS386 checksum of spareblock */ + u32 zero1[15]; /* unused */ + dnode_secno spare_dnodes[100]; /* emergency free dnode list */ + u32 zero2[1]; /* room for more? */ +}; + +/* The bad block list is 4 sectors long. The first word must be zero, + the remaining words give n_badblocks bad block numbers. + I bet you can see it coming... */ + +#define BAD_MAGIC 0 + +/* The hotfix map is 4 sectors long. It looks like + + secno from[n_spares]; + secno to[n_spares]; + + The to[] list is initialized to point to n_spares preallocated empty + sectors. The from[] list contains the sector numbers of bad blocks + which have been remapped to corresponding sectors in the to[] list. + n_spares_used gives the length of the from[] list. */ + + +/* Sectors 18 and 19 are preallocated and unused. + Maybe they're spares for 16 and 17, but simple substitution fails. */ + + +/* The code page info pointed to by the spare block consists of an index + block and blocks containing uppercasing tables. I don't know what + these are for (CHKDSK, maybe?) -- OS/2 does not seem to use them + itself. Linux doesn't use them either. */ + +/* block pointed to by spareblock->code_page_dir */ + +#define CP_DIR_MAGIC 0x494521f7 + +struct code_page_directory +{ + u32 magic; /* 4945 21f7 */ + u32 n_code_pages; /* number of pointers following */ + u32 zero1[2]; + struct { + u16 ix; /* index */ + u16 code_page_number; /* code page number */ + u32 bounds; /* matches corresponding word + in data block */ + secno code_page_data; /* sector number of a code_page_data + containing c.p. array */ + u16 index; /* index in c.p. 
array in that sector*/ + u16 unknown; /* some unknown value; usually 0; + 2 in Japanese version */ + } array[31]; /* unknown length */ +}; + +/* blocks pointed to by code_page_directory */ + +#define CP_DATA_MAGIC 0x894521f7 + +struct code_page_data +{ + u32 magic; /* 8945 21f7 */ + u32 n_used; /* # elements used in c_p_data[] */ + u32 bounds[3]; /* looks a bit like + (beg1,end1), (beg2,end2) + one byte each */ + u16 offs[3]; /* offsets from start of sector + to start of c_p_data[ix] */ + struct { + u16 ix; /* index */ + u16 code_page_number; /* code page number */ + u16 unknown; /* the same as in cp directory */ + u8 map[128]; /* upcase table for chars 80..ff */ + u16 zero2; + } code_page[3]; + u8 incognita[78]; +}; + + +/* Free space bitmaps are 4 sectors long, which is 16384 bits. + 16384 sectors is 8 meg, and each 8 meg band has a 4-sector bitmap. + Bit order in the maps is little-endian. 0 means taken, 1 means free. + + Bit map sectors are marked allocated in the bit maps, and so are sectors + off the end of the partition. + + Band 0 is sectors 0-3fff, its map is in sectors 18-1b. + Band 1 is 4000-7fff, its map is in 7ffc-7fff. + Band 2 is 8000-ffff, its map is in 8000-8003. + The remaining bands have maps in their first (even) or last (odd) 4 sectors + -- if the last, partial, band is odd its map is in its last 4 sectors. + + The bitmap locations are given in a table pointed to by the super block. + No doubt they aren't constrained to be at 18, 7ffc, 8000, ...; that is + just where they usually are. + + The "directory band" is a bunch of sectors preallocated for dnodes. + It has a 4-sector free space bitmap of its own. Each bit in the map + corresponds to one 4-sector dnode, bit 0 of the map corresponding to + the first 4 sectors of the directory band. The entire band is marked + allocated in the main bitmap. The super block gives the locations + of the directory band and its bitmap. ("band" doesn't mean it is + 8 meg long; it isn't.) */ + + +/* dnode: directory. 4 sectors long */ + +/* A directory is a tree of dnodes. The fnode for a directory + contains one pointer, to the root dnode of the tree. The fnode + never moves, the dnodes do the B-tree thing, splitting and merging + as files are added and removed. */ + +#define DNODE_MAGIC 0x77e40aae + +struct dnode { + u32 magic; /* 77e4 0aae */ + u32 first_free; /* offset from start of dnode to + first free dir entry */ +#ifdef __LITTLE_ENDIAN + u8 root_dnode: 1; /* Is it root dnode? */ + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ +#else + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ + u8 root_dnode: 1; /* Is it root dnode? */ +#endif + u8 increment_me2[3]; + secno up; /* (root dnode) directory's fnode + (nonroot) parent dnode */ + dnode_secno self; /* pointer to this dnode */ + u8 dirent[2028]; /* one or more dirents */ +}; + +struct hpfs_dirent { + u16 length; /* offset to next dirent */ + +#ifdef __LITTLE_ENDIAN + u8 first: 1; /* set on phony ^A^A (".") entry */ + u8 has_acl: 1; + u8 down: 1; /* down pointer present (after name) */ + u8 last: 1; /* set on phony \377 entry */ + u8 has_ea: 1; /* entry has EA */ + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_explicit_acl: 1; + u8 has_needea: 1; /* ?? some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ +#else + u8 has_needea: 1; /* ?? 
some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ + u8 has_explicit_acl: 1; + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_ea: 1; /* entry has EA */ + u8 last: 1; /* set on phony \377 entry */ + u8 down: 1; /* down pointer present (after name) */ + u8 has_acl: 1; + u8 first: 1; /* set on phony ^A^A (".") entry */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 read_only: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 system: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 archive: 1; /* dos attrib */ + u8 not_8x3: 1; /* name is not 8.3 */ + u8 flag15: 1; +#else + u8 flag15: 1; + u8 not_8x3: 1; /* name is not 8.3 */ + u8 archive: 1; /* dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 system: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 read_only: 1; /* dos attrib */ +#endif + + fnode_secno fnode; /* fnode giving allocation info */ + time32_t write_date; /* mtime */ + u32 file_size; /* file length, bytes */ + time32_t read_date; /* atime */ + time32_t creation_date; /* ctime */ + u32 ea_size; /* total EA length, bytes */ + u8 no_of_acls; /* number of ACL's (low 3 bits) */ + u8 ix; /* code page index (of filename), see + struct code_page_data */ + u8 namelen, name[1]; /* file name */ + /* dnode_secno down; btree down pointer, if present, + follows name on next word boundary, or maybe it + precedes next dirent, which is on a word boundary. */ +}; + + +/* B+ tree: allocation info in fnodes and anodes */ + +/* dnodes point to fnodes which are responsible for listing the sectors + assigned to the file. This is done with trees of (length,address) + pairs. (Actually triples, of (length, file-address, disk-address) + which can represent holes. Find out if HPFS does that.) + At any rate, fnodes contain a small tree; if subtrees are needed + they occupy essentially a full block in anodes. A leaf-level tree node + has 3-word entries giving sector runs, a non-leaf node has 2-word + entries giving subtree pointers. A flag in the header says which. */ + +struct bplus_leaf_node +{ + u32 file_secno; /* first file sector in extent */ + u32 length; /* length, sectors */ + secno disk_secno; /* first corresponding disk sector */ +}; + +struct bplus_internal_node +{ + u32 file_secno; /* subtree maps sectors < this */ + anode_secno down; /* pointer to subtree */ +}; + +struct bplus_header +{ +#ifdef __LITTLE_ENDIAN + u8 hbff: 1; /* high bit of first free entry offset */ + u8 flag1234: 4; + u8 fnode_parent: 1; /* ? we're pointed to by an fnode, + the data btree or some ea or the + main ea bootage pointer ea_secno */ + /* also can get set in fnodes, which + may be a chkdsk glitch or may mean + this bit is irrelevant in fnodes, + or this interpretation is all wet */ + u8 binary_search: 1; /* suggest binary search (unused) */ + u8 internal: 1; /* 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ +#else + u8 internal: 1; /* 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ + u8 binary_search: 1; /* suggest binary search (unused) */ + u8 fnode_parent: 1; /* ? 
we're pointed to by an fnode, + the data btree or some ea or the + main ea bootage pointer ea_secno */ + /* also can get set in fnodes, which + may be a chkdsk glitch or may mean + this bit is irrelevant in fnodes, + or this interpretation is all wet */ + u8 flag1234: 4; + u8 hbff: 1; /* high bit of first free entry offset */ +#endif + u8 fill[3]; + u8 n_free_nodes; /* free nodes in following array */ + u8 n_used_nodes; /* used nodes in following array */ + u16 first_free; /* offset from start of header to + first free node in array */ + union { + struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving + subtree pointers */ + struct bplus_leaf_node external[0]; /* (external) 3-word entries giving + sector runs */ + } u; +}; + +/* fnode: root of allocation b+ tree, and EA's */ + +/* Every file and every directory has one fnode, pointed to by the directory + entry and pointing to the file's sectors or directory's root dnode. EA's + are also stored here, and there are said to be ACL's somewhere here too. */ + +#define FNODE_MAGIC 0xf7e40aae + +struct fnode +{ + u32 magic; /* f7e4 0aae */ + u32 zero1[2]; /* read history */ + u8 len, name[15]; /* true length, truncated name */ + fnode_secno up; /* pointer to file's directory fnode */ + secno acl_size_l; + secno acl_secno; + u16 acl_size_s; + u8 acl_anode; + u8 zero2; /* history bit count */ + u32 ea_size_l; /* length of disk-resident ea's */ + secno ea_secno; /* first sector of disk-resident ea's*/ + u16 ea_size_s; /* length of fnode-resident ea's */ + +#ifdef __LITTLE_ENDIAN + u8 flag0: 1; + u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ + u8 flag234567: 6; +#else + u8 flag234567: 6; + u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ + u8 flag0: 1; +#endif + +#ifdef __LITTLE_ENDIAN + u8 dirflag: 1; /* 1 -> directory. first & only extent + points to dnode. */ + u8 flag9012345: 7; +#else + u8 flag9012345: 7; + u8 dirflag: 1; /* 1 -> directory. first & only extent + points to dnode. */ +#endif + + struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ + union { + struct bplus_leaf_node external[8]; + struct bplus_internal_node internal[12]; + } u; + + u32 file_size; /* file length, bytes */ + u32 n_needea; /* number of EA's with NEEDEA set */ + u8 user_id[16]; /* unused */ + u16 ea_offs; /* offset from start of fnode + to first fnode-resident ea */ + u8 dasd_limit_treshhold; + u8 dasd_limit_delta; + u32 dasd_limit; + u32 dasd_usage; + u8 ea[316]; /* zero or more EA's, packed together + with no alignment padding. + (Do not use this name, get here + via fnode + ea_offs. I think.) */ +}; + + +/* anode: 99.44% pure allocation tree */ + +#define ANODE_MAGIC 0x37e40aae + +struct anode +{ + u32 magic; /* 37e4 0aae */ + anode_secno self; /* pointer to this anode */ + secno up; /* parent anode or fnode */ + + struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */ + union { + struct bplus_leaf_node external[40]; + struct bplus_internal_node internal[60]; + } u; + + u32 fill[3]; /* unused */ +}; + + +/* extended attributes. + + A file's EA info is stored as a list of (name,value) pairs. It is + usually in the fnode, but (if it's large) it is moved to a single + sector run outside the fnode, or to multiple runs with an anode tree + that points to them. + + The value of a single EA is stored along with the name, or (if large) + it is moved to a single sector run, or multiple runs pointed to by an + anode tree, pointed to by the value field of the (name,value) pair. 
+ + Flags in the EA tell whether the value is immediate, in a single sector + run, or in multiple runs. Flags in the fnode tell whether the EA list + is immediate, in a single run, or in multiple runs. */ + +struct extended_attribute +{ +#ifdef __LITTLE_ENDIAN + u8 indirect: 1; /* 1 -> value gives sector number + where real value starts */ + u8 anode: 1; /* 1 -> sector is an anode + that points to fragmented value */ + u8 flag23456: 5; + u8 needea: 1; /* required ea */ +#else + u8 needea: 1; /* required ea */ + u8 flag23456: 5; + u8 anode: 1; /* 1 -> sector is an anode + that points to fragmented value */ + u8 indirect: 1; /* 1 -> value gives sector number + where real value starts */ +#endif + u8 namelen; /* length of name, bytes */ + u8 valuelen_lo; /* length of value, bytes */ + u8 valuelen_hi; /* length of value, bytes */ + u8 name[0]; + /* + u8 name[namelen]; ascii attrib name + u8 nul; terminating '\0', not counted + u8 value[valuelen]; value, arbitrary + if this.indirect, valuelen is 8 and the value is + u32 length; real length of value, bytes + secno secno; sector address where it starts + if this.anode, the above sector number is the root of an anode tree + which points to the value. + */ +}; + +/* + Local Variables: + comment-column: 40 + End: +*/ diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h new file mode 100644 index 00000000..dd552f86 --- /dev/null +++ b/fs/hpfs/hpfs_fn.h @@ -0,0 +1,360 @@ +/* + * linux/fs/hpfs/hpfs_fn.h + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * function headers + */ + +//#define DBG +//#define DEBUG_LOCKS + +#include +#include +#include +#include +#include + +#include "hpfs.h" + +#define EIOERROR EIO +#define EFSERROR EPERM +#define EMEMERROR ENOMEM + +#define ANODE_ALLOC_FWD 512 +#define FNODE_ALLOC_FWD 0 +#define ALLOC_FWD_MIN 16 +#define ALLOC_FWD_MAX 128 +#define ALLOC_M 1 +#define FNODE_RD_AHEAD 16 +#define ANODE_RD_AHEAD 16 +#define DNODE_RD_AHEAD 4 + +#define FREE_DNODES_ADD 58 +#define FREE_DNODES_DEL 29 + +#define CHKCOND(x,y) if (!(x)) printk y + +#ifdef DBG +#define PRINTK(x) printk x +#else +#undef PRINTK +#define PRINTK(x) +#endif + +struct hpfs_inode_info { + loff_t mmu_private; + ino_t i_parent_dir; /* (directories) gives fnode of parent dir */ + unsigned i_dno; /* (directories) root dnode */ + unsigned i_dpos; /* (directories) temp for readdir */ + unsigned i_dsubdno; /* (directories) temp for readdir */ + unsigned i_file_sec; /* (files) minimalist cache of alloc info */ + unsigned i_disk_sec; /* (files) minimalist cache of alloc info */ + unsigned i_n_secs; /* (files) minimalist cache of alloc info */ + unsigned i_ea_size; /* size of extended attributes */ + unsigned i_ea_mode : 1; /* file's permission is stored in ea */ + unsigned i_ea_uid : 1; /* file's uid is stored in ea */ + unsigned i_ea_gid : 1; /* file's gid is stored in ea */ + unsigned i_dirty : 1; + loff_t **i_rddir_off; + struct inode vfs_inode; +}; + +struct hpfs_sb_info { + struct mutex hpfs_mutex; /* global hpfs lock */ + ino_t sb_root; /* inode number of root dir */ + unsigned sb_fs_size; /* file system size, sectors */ + unsigned sb_bitmaps; /* sector number of bitmap list */ + unsigned sb_dirband_start; /* directory band start sector */ + unsigned sb_dirband_size; /* directory band size, dnodes */ + unsigned sb_dmap; /* sector number of dnode bit map */ + unsigned sb_n_free; /* free blocks for statfs, or -1 */ + unsigned sb_n_free_dnodes; /* free dnodes for statfs, or -1 */ + uid_t sb_uid; /* uid from mount options */ + gid_t sb_gid; 
/* gid from mount options */ + umode_t sb_mode; /* mode from mount options */ + unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */ + unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */ + unsigned sb_chk : 2; /* checks: 0-no, 1-normal, 2-strict */ + unsigned sb_lowercase : 1; /* downcase filenames hackery */ + unsigned sb_was_error : 1; /* there was an error, set dirty flag */ + unsigned sb_chkdsk : 2; /* chkdsk: 0-no, 1-on errs, 2-always */ + unsigned char *sb_cp_table; /* code page tables: */ + /* 128 bytes uppercasing table & */ + /* 128 bytes lowercasing table */ + unsigned *sb_bmp_dir; /* main bitmap directory */ + unsigned sb_c_bitmap; /* current bitmap */ + unsigned sb_max_fwd_alloc; /* max forward allocation */ + int sb_timeshift; +}; + +/* Four 512-byte buffers and the 2k block obtained by concatenating them */ + +struct quad_buffer_head { + struct buffer_head *bh[4]; + void *data; +}; + +/* The b-tree down pointer from a dir entry */ + +static inline dnode_secno de_down_pointer (struct hpfs_dirent *de) +{ + CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n")); + return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4)); +} + +/* The first dir entry in a dnode */ + +static inline struct hpfs_dirent *dnode_first_de (struct dnode *dnode) +{ + return (void *) dnode->dirent; +} + +/* The end+1 of the dir entries */ + +static inline struct hpfs_dirent *dnode_end_de (struct dnode *dnode) +{ + CHKCOND(le32_to_cpu(dnode->first_free)>=0x14 && le32_to_cpu(dnode->first_free)<=0xa00,("HPFS: dnode_end_de: dnode->first_free = %x\n",(unsigned)le32_to_cpu(dnode->first_free))); + return (void *) dnode + le32_to_cpu(dnode->first_free); +} + +/* The dir entry after dir entry de */ + +static inline struct hpfs_dirent *de_next_de (struct hpfs_dirent *de) +{ + CHKCOND(le16_to_cpu(de->length)>=0x20 && le16_to_cpu(de->length)<0x800,("HPFS: de_next_de: de->length = %x\n",(unsigned)le16_to_cpu(de->length))); + return (void *) de + le16_to_cpu(de->length); +} + +static inline struct extended_attribute *fnode_ea(struct fnode *fnode) +{ + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s)); +} + +static inline struct extended_attribute *fnode_end_ea(struct fnode *fnode) +{ + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s)); +} + +static unsigned ea_valuelen(struct extended_attribute *ea) +{ + return ea->valuelen_lo + 256 * ea->valuelen_hi; +} + +static inline struct extended_attribute *next_ea(struct extended_attribute *ea) +{ + return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + ea_valuelen(ea)); +} + +static inline secno ea_sec(struct extended_attribute *ea) +{ + return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen))); +} + +static inline secno ea_len(struct extended_attribute *ea) +{ + return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen))); +} + +static inline char *ea_data(struct extended_attribute *ea) +{ + return (char *)((char *)ea + 5 + ea->namelen); +} + +static inline unsigned de_size(int namelen, secno down_ptr) +{ + return ((0x1f + namelen + 3) & ~3) + (down_ptr ? 
4 : 0); +} + +static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src) +{ + int a; + int n; + if (!dst || !src) return; + a = dst->down; + n = dst->not_8x3; + memcpy((char *)dst + 2, (char *)src + 2, 28); + dst->down = a; + dst->not_8x3 = n; +} + +static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n) +{ + int i; + if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n; + if (!((le32_to_cpu(bmp[(b & 0x3fff) >> 5]) >> (b & 0x1f)) & 1)) return 1; + for (i = 1; i < n; i++) + if (!((le32_to_cpu(bmp[((b+i) & 0x3fff) >> 5]) >> ((b+i) & 0x1f)) & 1)) + return i + 1; + return 0; +} + +/* alloc.c */ + +int hpfs_chk_sectors(struct super_block *, secno, int, char *); +secno hpfs_alloc_sector(struct super_block *, secno, unsigned, int); +int hpfs_alloc_if_possible(struct super_block *, secno); +void hpfs_free_sectors(struct super_block *, secno, unsigned); +int hpfs_check_free_dnodes(struct super_block *, int); +void hpfs_free_dnode(struct super_block *, secno); +struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); +struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); +struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); + +/* anode.c */ + +secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_header *, unsigned, struct buffer_head *); +secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); +void hpfs_remove_btree(struct super_block *, struct bplus_header *); +int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); +int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *); +void hpfs_ea_remove(struct super_block *, secno, int, unsigned); +void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); +void hpfs_remove_fnode(struct super_block *, fnode_secno fno); + +/* buffer.c */ + +void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); +void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); +void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); +void *hpfs_get_4sectors(struct super_block *, unsigned, struct quad_buffer_head *); +void hpfs_brelse4(struct quad_buffer_head *); +void hpfs_mark_4buffers_dirty(struct quad_buffer_head *); + +/* dentry.c */ + +extern const struct dentry_operations hpfs_dentry_operations; + +/* dir.c */ + +struct dentry *hpfs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern const struct file_operations hpfs_dir_ops; + +/* dnode.c */ + +void hpfs_add_pos(struct inode *, loff_t *); +void hpfs_del_pos(struct inode *, loff_t *); +struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, + const unsigned char *, unsigned, secno); +int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned, + struct hpfs_dirent *); +int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); +void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); +dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); +struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); +struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, + const unsigned char *, unsigned, dnode_secno *, + struct quad_buffer_head *); +void hpfs_remove_dtree(struct super_block *, dnode_secno); +struct hpfs_dirent 
*map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); + +/* ea.c */ + +void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); +int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); +char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); +void hpfs_set_ea(struct inode *, struct fnode *, const char *, + const char *, int); + +/* file.c */ + +int hpfs_file_fsync(struct file *, int); +extern const struct file_operations hpfs_file_ops; +extern const struct inode_operations hpfs_file_iops; +extern const struct address_space_operations hpfs_aops; + +/* inode.c */ + +void hpfs_init_inode(struct inode *); +void hpfs_read_inode(struct inode *); +void hpfs_write_inode(struct inode *); +void hpfs_write_inode_nolock(struct inode *); +int hpfs_setattr(struct dentry *, struct iattr *); +void hpfs_write_if_changed(struct inode *); +void hpfs_evict_inode(struct inode *); + +/* map.c */ + +unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); +unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); +unsigned char *hpfs_load_code_page(struct super_block *, secno); +secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); +struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); +struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); +struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); +dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino); + +/* name.c */ + +unsigned char hpfs_upcase(unsigned char *, unsigned char); +int hpfs_chk_name(const unsigned char *, unsigned *); +unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); +int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned, + const unsigned char *, unsigned, int); +int hpfs_is_name_long(const unsigned char *, unsigned); +void hpfs_adjust_length(const unsigned char *, unsigned *); + +/* namei.c */ + +extern const struct inode_operations hpfs_dir_iops; +extern const struct address_space_operations hpfs_symlink_aops; + +static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) +{ + return list_entry(inode, struct hpfs_inode_info, vfs_inode); +} + +static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* super.c */ + +void hpfs_error(struct super_block *, const char *, ...) + __attribute__((format (printf, 2, 3))); +int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); +unsigned hpfs_count_one_bitmap(struct super_block *, secno); + +/* + * local time (HPFS) to GMT (Unix) + */ + +static inline time_t local_to_gmt(struct super_block *s, time32_t t) +{ + extern struct timezone sys_tz; + return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; +} + +static inline time32_t gmt_to_local(struct super_block *s, time_t t) +{ + extern struct timezone sys_tz; + return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; +} + +/* + * Locking: + * + * hpfs_lock() locks the whole filesystem. It must be taken + * on any method called by the VFS. + * + * We don't do any per-file locking anymore, it is hard to + * review and HPFS is not performance-sensitive anyway. 
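+ *
+ * A typical entry point therefore brackets all of its work with the
+ * per-superblock mutex, roughly like this (illustrative sketch only,
+ * not a real method of this driver):
+ *
+ *	static int hpfs_some_operation(struct inode *dir)
+ *	{
+ *		int err;
+ *		hpfs_lock(dir->i_sb);
+ *		err = do_something(dir);
+ *		hpfs_unlock(dir->i_sb);
+ *		return err;
+ *	}
+ *
+ * Helpers that must only run with the lock held can call
+ * hpfs_lock_assert() to check that assumption at run time.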
+ */ +static inline void hpfs_lock(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_lock(&sbi->hpfs_mutex); +} + +static inline void hpfs_unlock(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_unlock(&sbi->hpfs_mutex); +} + +static inline void hpfs_lock_assert(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + WARN_ON(!mutex_is_locked(&sbi->hpfs_mutex)); +} diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c new file mode 100644 index 00000000..338cd836 --- /dev/null +++ b/fs/hpfs/inode.c @@ -0,0 +1,308 @@ +/* + * linux/fs/hpfs/inode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * inode VFS functions + */ + +#include +#include "hpfs_fn.h" + +void hpfs_init_inode(struct inode *i) +{ + struct super_block *sb = i->i_sb; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + + i->i_uid = hpfs_sb(sb)->sb_uid; + i->i_gid = hpfs_sb(sb)->sb_gid; + i->i_mode = hpfs_sb(sb)->sb_mode; + i->i_size = -1; + i->i_blocks = -1; + + hpfs_inode->i_dno = 0; + hpfs_inode->i_n_secs = 0; + hpfs_inode->i_file_sec = 0; + hpfs_inode->i_disk_sec = 0; + hpfs_inode->i_dpos = 0; + hpfs_inode->i_dsubdno = 0; + hpfs_inode->i_ea_mode = 0; + hpfs_inode->i_ea_uid = 0; + hpfs_inode->i_ea_gid = 0; + hpfs_inode->i_ea_size = 0; + + hpfs_inode->i_rddir_off = NULL; + hpfs_inode->i_dirty = 0; + + i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0; + i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; + i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; +} + +void hpfs_read_inode(struct inode *i) +{ + struct buffer_head *bh; + struct fnode *fnode; + struct super_block *sb = i->i_sb; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + void *ea; + int ea_size; + + if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { + /*i->i_mode |= S_IFREG; + i->i_mode &= ~0111; + i->i_op = &hpfs_file_iops; + i->i_fop = &hpfs_file_ops; + i->i_nlink = 0;*/ + make_bad_inode(i); + return; + } + if (hpfs_sb(i->i_sb)->sb_eas) { + if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) { + if (ea_size == 2) { + i->i_uid = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_uid = 1; + } + kfree(ea); + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) { + if (ea_size == 2) { + i->i_gid = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_gid = 1; + } + kfree(ea); + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "SYMLINK", &ea_size))) { + kfree(ea); + i->i_mode = S_IFLNK | 0777; + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &hpfs_symlink_aops; + i->i_nlink = 1; + i->i_size = ea_size; + i->i_blocks = 1; + brelse(bh); + return; + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "MODE", &ea_size))) { + int rdev = 0; + umode_t mode = hpfs_sb(sb)->sb_mode; + if (ea_size == 2) { + mode = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_mode = 1; + } + kfree(ea); + i->i_mode = mode; + if (S_ISBLK(mode) || S_ISCHR(mode)) { + if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) { + if (ea_size == 4) + rdev = le32_to_cpu(*(__le32*)ea); + kfree(ea); + } + } + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { + brelse(bh); + i->i_nlink = 1; + i->i_size = 0; + i->i_blocks = 1; + init_special_inode(i, mode, + new_decode_dev(rdev)); + return; + } + } + } + if (fnode->dirflag) { + int n_dnodes, n_subdirs; + i->i_mode |= S_IFDIR; + i->i_op = &hpfs_dir_iops; + i->i_fop = &hpfs_dir_ops; + hpfs_inode->i_parent_dir = le32_to_cpu(fnode->up); + hpfs_inode->i_dno = le32_to_cpu(fnode->u.external[0].disk_secno); + if (hpfs_sb(sb)->sb_chk >= 2) { + struct buffer_head *bh0; + if 
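+ /* check=strict: map the parent directory's fnode purely to validate
+  * it (hpfs_map_fnode() reports any corruption itself) and release
+  * the buffer straight away.  The directory's size is then
+  * synthesized from its dnode count below: every dnode is four
+  * 512-byte sectors, hence i_blocks = 4 * n_dnodes and
+  * i_size = 2048 * n_dnodes. */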
(hpfs_map_fnode(sb, hpfs_inode->i_parent_dir, &bh0)) brelse(bh0); + } + n_dnodes = 0; n_subdirs = 0; + hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); + i->i_blocks = 4 * n_dnodes; + i->i_size = 2048 * n_dnodes; + i->i_nlink = 2 + n_subdirs; + } else { + i->i_mode |= S_IFREG; + if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; + i->i_op = &hpfs_file_iops; + i->i_fop = &hpfs_file_ops; + i->i_nlink = 1; + i->i_size = le32_to_cpu(fnode->file_size); + i->i_blocks = ((i->i_size + 511) >> 9) + 1; + i->i_data.a_ops = &hpfs_aops; + hpfs_i(i)->mmu_private = i->i_size; + } + brelse(bh); +} + +static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { + Some unknown structures like ACL may be in fnode, + we'd better not overwrite them + hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); + } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { + __le32 ea; + if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { + ea = cpu_to_le32(i->i_uid); + hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2); + hpfs_inode->i_ea_uid = 1; + } + if ((i->i_gid != hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { + ea = cpu_to_le32(i->i_gid); + hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2); + hpfs_inode->i_ea_gid = 1; + } + if (!S_ISLNK(i->i_mode)) + if ((i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0 : 0111)) + | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG)) + && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333)) + | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) { + ea = cpu_to_le32(i->i_mode); + /* sick, but legal */ + hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2); + hpfs_inode->i_ea_mode = 1; + } + if (S_ISBLK(i->i_mode) || S_ISCHR(i->i_mode)) { + ea = cpu_to_le32(new_encode_dev(i->i_rdev)); + hpfs_set_ea(i, fnode, "DEV", (char *)&ea, 4); + } + } +} + +void hpfs_write_inode(struct inode *i) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct inode *parent; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; + if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n"); + kfree(hpfs_inode->i_rddir_off); + hpfs_inode->i_rddir_off = NULL; + } + if (!i->i_nlink) { + return; + } + parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); + if (parent) { + hpfs_inode->i_dirty = 0; + if (parent->i_state & I_NEW) { + hpfs_init_inode(parent); + hpfs_read_inode(parent); + unlock_new_inode(parent); + } + hpfs_write_inode_nolock(i); + iput(parent); + } +} + +void hpfs_write_inode_nolock(struct inode *i) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct buffer_head *bh; + struct fnode *fnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; + if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) return; + if (i->i_ino != hpfs_sb(i->i_sb)->sb_root && i->i_nlink) { + if (!(de = map_fnode_dirent(i->i_sb, i->i_ino, fnode, &qbh))) { + brelse(bh); + return; + } + } else de = NULL; + if (S_ISREG(i->i_mode)) { + fnode->file_size = cpu_to_le32(i->i_size); + if (de) de->file_size = cpu_to_le32(i->i_size); + } else if (S_ISDIR(i->i_mode)) { + fnode->file_size = cpu_to_le32(0); + if (de) de->file_size = cpu_to_le32(0); + } + hpfs_write_inode_ea(i, fnode); + if (de) { + de->write_date = 
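+ /* Dates in the on-disk dirent are kept in local time; gmt_to_local()
+  * undoes the timezone offset and the timeshift= adjustment that
+  * local_to_gmt() applied when the inode was read in. */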
cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->read_only = !(i->i_mode & 0222); + de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } + if (S_ISDIR(i->i_mode)) { + if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->read_only = !(i->i_mode & 0222); + de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); + de->file_size = cpu_to_le32(0); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } else + hpfs_error(i->i_sb, + "directory %08lx doesn't have '.' entry", + (unsigned long)i->i_ino); + } + mark_buffer_dirty(bh); + brelse(bh); +} + +int hpfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error = -EINVAL; + + hpfs_lock(inode->i_sb); + if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) + goto out_unlock; + if ((attr->ia_valid & ATTR_UID) && attr->ia_uid >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_GID) && attr->ia_gid >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) + goto out_unlock; + + error = inode_change_ok(inode, attr); + if (error) + goto out_unlock; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + goto out_unlock; + } + + setattr_copy(inode, attr); + + hpfs_write_inode(inode); + + out_unlock: + hpfs_unlock(inode->i_sb); + return error; +} + +void hpfs_write_if_changed(struct inode *inode) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + + if (hpfs_inode->i_dirty) + hpfs_write_inode(inode); +} + +void hpfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (!inode->i_nlink) { + hpfs_lock(inode->i_sb); + hpfs_remove_fnode(inode->i_sb, inode->i_ino); + hpfs_unlock(inode->i_sb); + } +} diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c new file mode 100644 index 00000000..a7908213 --- /dev/null +++ b/fs/hpfs/map.c @@ -0,0 +1,287 @@ +/* + * linux/fs/hpfs/map.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * mapping structures to memory with some minimal checks + */ + +#include "hpfs_fn.h" + +unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) +{ + return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); +} + +unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, + struct quad_buffer_head *qbh, char *id) +{ + secno sec; + if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { + hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); + return NULL; + } + sec = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); + if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) { + hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); + return NULL; + } + return hpfs_map_4sectors(s, sec, qbh, 4); +} + +/* + * Load first code page into kernel memory, return pointer to 256-byte array, + * first 128 bytes are uppercasing table for chars 128-255, next 128 bytes are + * lowercasing table + */ + +unsigned char 
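+ /* For bytes >= 128, hpfs_upcase() indexes the first half of this
+  * table (cp_table[c - 128]) and the lowercasing helper the second
+  * half (cp_table[c]); only the uppercasing half is stored on disk,
+  * the lowercasing half is reconstructed below by inverting it. */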
*hpfs_load_code_page(struct super_block *s, secno cps) +{ + struct buffer_head *bh; + secno cpds; + unsigned cpi; + unsigned char *ptr; + unsigned char *cp_table; + int i; + struct code_page_data *cpd; + struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); + if (!cp) return NULL; + if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { + printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic)); + brelse(bh); + return NULL; + } + if (!le32_to_cpu(cp->n_code_pages)) { + printk("HPFS: n_code_pages == 0\n"); + brelse(bh); + return NULL; + } + cpds = le32_to_cpu(cp->array[0].code_page_data); + cpi = le16_to_cpu(cp->array[0].index); + brelse(bh); + + if (cpi >= 3) { + printk("HPFS: Code page index out of array\n"); + return NULL; + } + + if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; + if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { + printk("HPFS: Code page index out of sector\n"); + brelse(bh); + return NULL; + } + ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; + if (!(cp_table = kmalloc(256, GFP_KERNEL))) { + printk("HPFS: out of memory for code page table\n"); + brelse(bh); + return NULL; + } + memcpy(cp_table, ptr, 128); + brelse(bh); + + /* Try to build lowercasing table from uppercasing one */ + + for (i=128; i<256; i++) cp_table[i]=i; + for (i=128; i<256; i++) if (cp_table[i-128]!=i && cp_table[i-128]>=128) + cp_table[cp_table[i-128]] = i; + + return cp_table; +} + +secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) +{ + struct buffer_head *bh; + int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; + int i; + secno *b; + if (!(b = kmalloc(n * 512, GFP_KERNEL))) { + printk("HPFS: can't allocate memory for bitmap directory\n"); + return NULL; + } + for (i=0;isb_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) { + return NULL; + } + if ((fnode = hpfs_map_sector(s, ino, bhp, FNODE_RD_AHEAD))) { + if (hpfs_sb(s)->sb_chk) { + struct extended_attribute *ea; + struct extended_attribute *ea_end; + if (le32_to_cpu(fnode->magic) != FNODE_MAGIC) { + hpfs_error(s, "bad magic on fnode %08lx", + (unsigned long)ino); + goto bail; + } + if (!fnode->dirflag) { + if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != + (fnode->btree.internal ? 12 : 8)) { + hpfs_error(s, + "bad number of nodes in fnode %08lx", + (unsigned long)ino); + goto bail; + } + if (le16_to_cpu(fnode->btree.first_free) != + 8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 
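+ /* The b-tree root embedded in an fnode holds, after its 8-byte
+  * header, either 12 internal entries of 8 bytes or 8 leaf extents of
+  * 12 bytes (96 bytes either way), so used + free entries must add up
+  * to 12 or 8 and first_free must point just past the used ones. */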
8 : 12)) { + hpfs_error(s, + "bad first_free pointer in fnode %08lx", + (unsigned long)ino); + goto bail; + } + } + if (le16_to_cpu(fnode->ea_size_s) && (le16_to_cpu(fnode->ea_offs) < 0xc4 || + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200)) { + hpfs_error(s, + "bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x", + (unsigned long)ino, + le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + goto bail; + } + ea = fnode_ea(fnode); + ea_end = fnode_end_ea(fnode); + while (ea != ea_end) { + if (ea > ea_end) { + hpfs_error(s, "bad EA in fnode %08lx", + (unsigned long)ino); + goto bail; + } + ea = next_ea(ea); + } + } + } + return fnode; + bail: + brelse(*bhp); + return NULL; +} + +struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buffer_head **bhp) +{ + struct anode *anode; + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; + if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD))) + if (hpfs_sb(s)->sb_chk) { + if (le32_to_cpu(anode->magic) != ANODE_MAGIC) { + hpfs_error(s, "bad magic on anode %08x", ano); + goto bail; + } + if (le32_to_cpu(anode->self) != ano) { + hpfs_error(s, "self pointer invalid on anode %08x", ano); + goto bail; + } + if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != + (anode->btree.internal ? 60 : 40)) { + hpfs_error(s, "bad number of nodes in anode %08x", ano); + goto bail; + } + if (le16_to_cpu(anode->btree.first_free) != + 8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) { + hpfs_error(s, "bad first_free pointer in anode %08x", ano); + goto bail; + } + } + return anode; + bail: + brelse(*bhp); + return NULL; +} + +/* + * Load dnode to memory and do some checks + */ + +struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, + struct quad_buffer_head *qbh) +{ + struct dnode *dnode; + if (hpfs_sb(s)->sb_chk) { + if (hpfs_chk_sectors(s, secno, 4, "dnode")) return NULL; + if (secno & 3) { + hpfs_error(s, "dnode %08x not byte-aligned", secno); + return NULL; + } + } + if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) + if (hpfs_sb(s)->sb_chk) { + unsigned p, pp = 0; + unsigned char *d = (unsigned char *)dnode; + int b = 0; + if (le32_to_cpu(dnode->magic) != DNODE_MAGIC) { + hpfs_error(s, "bad magic on dnode %08x", secno); + goto bail; + } + if (le32_to_cpu(dnode->self) != secno) + hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, le32_to_cpu(dnode->self)); + /* Check dirents - bad dirents would cause infinite + loops or shooting to memory */ + if (le32_to_cpu(dnode->first_free) > 2048) { + hpfs_error(s, "dnode %08x has first_free == %08x", secno, le32_to_cpu(dnode->first_free)); + goto bail; + } + for (p = 20; p < le32_to_cpu(dnode->first_free); p += d[p] + (d[p+1] << 8)) { + struct hpfs_dirent *de = (struct hpfs_dirent *)((char *)dnode + p); + if (le16_to_cpu(de->length) > 292 || (le16_to_cpu(de->length) < 32) || (le16_to_cpu(de->length) & 3) || p + le16_to_cpu(de->length) > 2048) { + hpfs_error(s, "bad dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + ok: + if (hpfs_sb(s)->sb_chk >= 2) b |= 1 << de->down; + 
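+ /* b collects bit 0 for dirents without a down pointer and bit 1 for
+  * dirents with one; seeing both kinds in a single dnode (b == 3) is
+  * reported below as an unbalanced tree. */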
if (de->down) if (de_down_pointer(de) < 0x10) { + hpfs_error(s, "bad down pointer in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + pp = p; + + } + if (p != le32_to_cpu(dnode->first_free)) { + hpfs_error(s, "size on last dirent does not match first_free; dnode %08x", secno); + goto bail; + } + if (d[pp + 30] != 1 || d[pp + 31] != 255) { + hpfs_error(s, "dnode %08x does not end with \\377 entry", secno); + goto bail; + } + if (b == 3) printk("HPFS: warning: unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", secno); + } + return dnode; + bail: + hpfs_brelse4(qbh); + return NULL; +} + +dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino) +{ + struct buffer_head *bh; + struct fnode *fnode; + dnode_secno dno; + + fnode = hpfs_map_fnode(s, ino, &bh); + if (!fnode) + return 0; + + dno = le32_to_cpu(fnode->u.external[0].disk_secno); + brelse(bh); + return dno; +} diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c new file mode 100644 index 00000000..9acdf338 --- /dev/null +++ b/fs/hpfs/name.c @@ -0,0 +1,112 @@ +/* + * linux/fs/hpfs/name.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * operations with filenames + */ + +#include "hpfs_fn.h" + +static inline int not_allowed_char(unsigned char c) +{ + return c<' ' || c=='"' || c=='*' || c=='/' || c==':' || c=='<' || + c=='>' || c=='?' || c=='\\' || c=='|'; +} + +static inline int no_dos_char(unsigned char c) +{ /* Characters that are allowed in HPFS but not in DOS */ + return c=='+' || c==',' || c==';' || c=='=' || c=='[' || c==']'; +} + +static inline unsigned char upcase(unsigned char *dir, unsigned char a) +{ + if (a<128 || a==255) return a>='a' && a<='z' ? a - 0x20 : a; + if (!dir) return a; + return dir[a-128]; +} + +unsigned char hpfs_upcase(unsigned char *dir, unsigned char a) +{ + return upcase(dir, a); +} + +static inline unsigned char locase(unsigned char *dir, unsigned char a) +{ + if (a<128 || a==255) return a>='A' && a<='Z' ? a + 0x20 : a; + if (!dir) return a; + return dir[a]; +} + +int hpfs_chk_name(const unsigned char *name, unsigned *len) +{ + int i; + if (*len > 254) return -ENAMETOOLONG; + hpfs_adjust_length(name, len); + if (!*len) return -EINVAL; + for (i = 0; i < *len; i++) if (not_allowed_char(name[i])) return -EINVAL; + if (*len == 1) if (name[0] == '.') return -EINVAL; + if (*len == 2) if (name[0] == '.' && name[1] == '.') return -EINVAL; + return 0; +} + +unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from, + unsigned len, int lc, int lng) +{ + unsigned char *to; + int i; + if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { + printk("HPFS: Long name flag mismatch - name "); + for (i=0; isb_cp_table,from[i]); + return to; +} + +int hpfs_compare_names(struct super_block *s, + const unsigned char *n1, unsigned l1, + const unsigned char *n2, unsigned l2, int last) +{ + unsigned l = l1 < l2 ? l1 : l2; + unsigned i; + if (last) return -1; + for (i = 0; i < l; i++) { + unsigned char c1 = upcase(hpfs_sb(s)->sb_cp_table,n1[i]); + unsigned char c2 = upcase(hpfs_sb(s)->sb_cp_table,n2[i]); + if (c1 < c2) return -1; + if (c1 > c2) return 1; + } + if (l1 < l2) return -1; + if (l1 > l2) return 1; + return 0; +} + +int hpfs_is_name_long(const unsigned char *name, unsigned len) +{ + int i,j; + for (i = 0; i < len && name[i] != '.'; i++) + if (no_dos_char(name[i])) return 1; + if (!i || i > 8) return 1; + if (i == len) return 0; + for (j = i + 1; j < len; j++) + if (name[j] == '.' 
|| no_dos_char(name[i])) return 1; + return j - i > 4; +} + +/* OS/2 clears dots and spaces at the end of file name, so we have to */ + +void hpfs_adjust_length(const unsigned char *name, unsigned *len) +{ + if (!*len) return; + if (*len == 1 && name[0] == '.') return; + if (*len == 2 && name[0] == '.' && name[1] == '.') return; + while (*len && (name[*len - 1] == '.' || name[*len - 1] == ' ')) + (*len)--; +} diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c new file mode 100644 index 00000000..acf95dab --- /dev/null +++ b/fs/hpfs/namei.c @@ -0,0 +1,628 @@ +/* + * linux/fs/hpfs/namei.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * adding & removing files & directories + */ +#include +#include "hpfs_fn.h" + +static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh0; + struct buffer_head *bh; + struct hpfs_dirent *de; + struct fnode *fnode; + struct dnode *dnode; + struct inode *result; + fnode_secno fno; + dnode_secno dno; + int r; + struct hpfs_dirent dee; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + dnode = hpfs_alloc_dnode(dir->i_sb, fno, &dno, &qbh0); + if (!dnode) + goto bail1; + memset(&dee, 0, sizeof dee); + dee.directory = 1; + if (!(mode & 0222)) dee.read_only = 1; + /*dee.archive = 0;*/ + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + result = new_inode(dir->i_sb); + if (!result) + goto bail2; + hpfs_init_inode(result); + result->i_ino = fno; + hpfs_i(result)->i_parent_dir = dir->i_ino; + hpfs_i(result)->i_dno = dno; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_mode |= S_IFDIR; + result->i_op = &hpfs_dir_iops; + result->i_fop = &hpfs_dir_ops; + result->i_blocks = 4; + result->i_size = 2048; + result->i_nlink = 2; + if (dee.read_only) + result->i_mode &= ~0222; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail3; + if (r == -1) { + err = -EEXIST; + goto bail3; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 
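+ /* The fnode has room for only the first 15 bytes of the name
+  * (fnode->len still records the full length); the complete name
+  * lives in the directory entry added above. */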
15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + fnode->dirflag = 1; + fnode->btree.n_free_nodes = 7; + fnode->btree.n_used_nodes = 1; + fnode->btree.first_free = cpu_to_le16(0x14); + fnode->u.external[0].disk_secno = cpu_to_le32(dno); + fnode->u.external[0].file_secno = cpu_to_le32(-1); + dnode->root_dnode = 1; + dnode->up = cpu_to_le32(fno); + de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0); + de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + if (!(mode & 0222)) de->read_only = 1; + de->first = de->directory = 1; + /*de->hidden = de->system = 0;*/ + de->fnode = cpu_to_le32(fno); + mark_buffer_dirty(bh); + brelse(bh); + hpfs_mark_4buffers_dirty(&qbh0); + hpfs_brelse4(&qbh0); + inc_nlink(dir); + insert_inode_hash(result); + + if (result->i_uid != current_fsuid() || + result->i_gid != current_fsgid() || + result->i_mode != (mode | S_IFDIR)) { + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_mode = mode | S_IFDIR; + hpfs_write_inode_nolock(result); + } + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; +bail3: + iput(result); +bail2: + hpfs_brelse4(&qbh0); + hpfs_free_dnode(dir->i_sb, dno); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct inode *result = NULL; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + int err; + if ((err = hpfs_chk_name(name, &len))) + return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + if (!(mode & 0222)) dee.read_only = 1; + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + + hpfs_init_inode(result); + result->i_ino = fno; + result->i_mode |= S_IFREG; + result->i_mode &= ~0111; + result->i_op = &hpfs_file_iops; + result->i_fop = &hpfs_file_ops; + result->i_nlink = 1; + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + if (dee.read_only) + result->i_mode &= ~0222; + result->i_blocks = 1; + result->i_size = 0; + result->i_data.a_ops = &hpfs_aops; + hpfs_i(result)->mmu_private = 0; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 
15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + insert_inode_hash(result); + + if (result->i_uid != current_fsuid() || + result->i_gid != current_fsgid() || + result->i_mode != (mode | S_IFREG)) { + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_mode = mode | S_IFREG; + hpfs_write_inode_nolock(result); + } + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; + +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + struct inode *result = NULL; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; + if (!new_valid_dev(rdev)) + return -EINVAL; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + if (!(mode & 0222)) dee.read_only = 1; + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + + hpfs_init_inode(result); + result->i_ino = fno; + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_nlink = 1; + result->i_size = 0; + result->i_blocks = 1; + init_special_inode(result, mode, rdev); + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + mark_buffer_dirty(bh); + + insert_inode_hash(result); + + hpfs_write_inode_nolock(result); + d_instantiate(dentry, result); + brelse(bh); + hpfs_unlock(dir->i_sb); + return 0; +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + struct inode *result; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? 
-EINVAL : err; + hpfs_lock(dir->i_sb); + if (hpfs_sb(dir->i_sb)->sb_eas < 2) { + hpfs_unlock(dir->i_sb); + return -EPERM; + } + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + result->i_ino = fno; + hpfs_init_inode(result); + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_mode = S_IFLNK | 0777; + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_blocks = 1; + result->i_nlink = 1; + result->i_size = strlen(symlink); + result->i_op = &page_symlink_inode_operations; + result->i_data.a_ops = &hpfs_symlink_aops; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink)); + mark_buffer_dirty(bh); + brelse(bh); + + insert_inode_hash(result); + + hpfs_write_inode_nolock(result); + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_unlink(struct inode *dir, struct dentry *dentry) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + struct inode *inode = dentry->d_inode; + dnode_secno dno; + int r; + int rep = 0; + int err; + + hpfs_lock(dir->i_sb); + hpfs_adjust_length(name, &len); +again: + err = -ENOENT; + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); + if (!de) + goto out; + + err = -EPERM; + if (de->first) + goto out1; + + err = -EISDIR; + if (de->directory) + goto out1; + + r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); + switch (r) { + case 1: + hpfs_error(dir->i_sb, "there was error when removing dirent"); + err = -EFSERROR; + break; + case 2: /* no space for deleting, try to truncate file */ + + err = -ENOSPC; + if (rep++) + break; + + dentry_unhash(dentry); + if (!d_unhashed(dentry)) { + hpfs_unlock(dir->i_sb); + return -ENOSPC; + } + if (generic_permission(inode, MAY_WRITE, 0, NULL) || + !S_ISREG(inode->i_mode) || + get_write_access(inode)) { + d_rehash(dentry); + } else { + struct iattr newattrs; + /*printk("HPFS: truncating file before delete.\n");*/ + newattrs.ia_size = 0; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); + put_write_access(inode); + if (!err) + goto again; + } + hpfs_unlock(dir->i_sb); + return -ENOSPC; + default: + drop_nlink(inode); + err = 0; + } + goto out; + +out1: + hpfs_brelse4(&qbh); +out: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + struct inode *inode 
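+ /* Removing a directory: hpfs_count_dnodes() below walks the victim's
+  * dnode tree counting live entries, and anything found means
+  * -ENOTEMPTY; on success the parent loses a link and the victim's
+  * link count is cleared. */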
= dentry->d_inode; + dnode_secno dno; + int n_items = 0; + int err; + int r; + + hpfs_adjust_length(name, &len); + hpfs_lock(dir->i_sb); + err = -ENOENT; + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); + if (!de) + goto out; + + err = -EPERM; + if (de->first) + goto out1; + + err = -ENOTDIR; + if (!de->directory) + goto out1; + + hpfs_count_dnodes(dir->i_sb, hpfs_i(inode)->i_dno, NULL, NULL, &n_items); + err = -ENOTEMPTY; + if (n_items) + goto out1; + + r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); + switch (r) { + case 1: + hpfs_error(dir->i_sb, "there was error when removing dirent"); + err = -EFSERROR; + break; + case 2: + err = -ENOSPC; + break; + default: + drop_nlink(dir); + clear_nlink(inode); + err = 0; + } + goto out; +out1: + hpfs_brelse4(&qbh); +out: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_symlink_readpage(struct file *file, struct page *page) +{ + char *link = kmap(page); + struct inode *i = page->mapping->host; + struct fnode *fnode; + struct buffer_head *bh; + int err; + + err = -EIO; + hpfs_lock(i->i_sb); + if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) + goto fail; + err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); + brelse(bh); + if (err) + goto fail; + hpfs_unlock(i->i_sb); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +fail: + hpfs_unlock(i->i_sb); + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations hpfs_symlink_aops = { + .readpage = hpfs_symlink_readpage +}; + +static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + const unsigned char *old_name = old_dentry->d_name.name; + unsigned old_len = old_dentry->d_name.len; + const unsigned char *new_name = new_dentry->d_name.name; + unsigned new_len = new_dentry->d_name.len; + struct inode *i = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct quad_buffer_head qbh, qbh1; + struct hpfs_dirent *dep, *nde; + struct hpfs_dirent de; + dnode_secno dno; + int r; + struct buffer_head *bh; + struct fnode *fnode; + int err; + + if ((err = hpfs_chk_name(new_name, &new_len))) return err; + err = 0; + hpfs_adjust_length(old_name, &old_len); + + hpfs_lock(i->i_sb); + /* order doesn't matter, due to VFS exclusion */ + + /* Erm? Moving over the empty non-busy directory is perfectly legal */ + if (new_inode && S_ISDIR(new_inode->i_mode)) { + err = -EINVAL; + goto end1; + } + + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { + hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); + err = -ENOENT; + goto end1; + } + copy_de(&de, dep); + de.hidden = new_name[0] == '.'; + + if (new_inode) { + int r; + if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { + if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) { + clear_nlink(new_inode); + copy_de(nde, &de); + memcpy(nde->name, new_name, new_len); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + goto end; + } + hpfs_error(new_dir->i_sb, "hpfs_rename: could not find dirent"); + err = -EFSERROR; + goto end1; + } + err = r == 2 ? -ENOSPC : r == 1 ? -EFSERROR : 0; + goto end1; + } + + if (new_dir == old_dir) hpfs_brelse4(&qbh); + + if ((r = hpfs_add_dirent(new_dir, new_name, new_len, &de))) { + if (r == -1) hpfs_error(new_dir->i_sb, "hpfs_rename: dirent already exists!"); + err = r == 1 ? 
-ENOSPC : -EFSERROR; + if (new_dir != old_dir) hpfs_brelse4(&qbh); + goto end1; + } + + if (new_dir == old_dir) + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { + hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); + err = -ENOENT; + goto end1; + } + + if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 0))) { + hpfs_error(i->i_sb, "hpfs_rename: could not remove dirent"); + err = r == 2 ? -ENOSPC : -EFSERROR; + goto end1; + } + + end: + hpfs_i(i)->i_parent_dir = new_dir->i_ino; + if (S_ISDIR(i->i_mode)) { + inc_nlink(new_dir); + drop_nlink(old_dir); + } + if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) { + fnode->up = cpu_to_le32(new_dir->i_ino); + fnode->len = new_len; + memcpy(fnode->name, new_name, new_len>15?15:new_len); + if (new_len < 15) memset(&fnode->name[new_len], 0, 15 - new_len); + mark_buffer_dirty(bh); + brelse(bh); + } +end1: + hpfs_unlock(i->i_sb); + return err; +} + +const struct inode_operations hpfs_dir_iops = +{ + .create = hpfs_create, + .lookup = hpfs_lookup, + .unlink = hpfs_unlink, + .symlink = hpfs_symlink, + .mkdir = hpfs_mkdir, + .rmdir = hpfs_rmdir, + .mknod = hpfs_mknod, + .rename = hpfs_rename, + .setattr = hpfs_setattr, +}; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c new file mode 100644 index 00000000..98580a3b --- /dev/null +++ b/fs/hpfs/super.c @@ -0,0 +1,712 @@ +/* + * linux/fs/hpfs/super.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * mounting, unmounting, error handling + */ + +#include "hpfs_fn.h" +#include +#include +#include +#include +#include +#include +#include +#include + +/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ + +static void mark_dirty(struct super_block *s, int remount) +{ + if (hpfs_sb(s)->sb_chkdsk && (remount || !(s->s_flags & MS_RDONLY))) { + struct buffer_head *bh; + struct hpfs_spare_block *sb; + if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { + sb->dirty = 1; + sb->old_wrote = 0; + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +} + +/* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there + were errors) */ + +static void unmark_dirty(struct super_block *s) +{ + struct buffer_head *bh; + struct hpfs_spare_block *sb; + if (s->s_flags & MS_RDONLY) return; + sync_blockdev(s->s_bdev); + if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { + sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; + sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } +} + +/* Filesystem error... */ +static char err_buf[1024]; + +void hpfs_error(struct super_block *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + + printk("HPFS: filesystem error: %s", err_buf); + if (!hpfs_sb(s)->sb_was_error) { + if (hpfs_sb(s)->sb_err == 2) { + printk("; crashing the system because you wanted it\n"); + mark_dirty(s, 0); + panic("HPFS panic"); + } else if (hpfs_sb(s)->sb_err == 1) { + if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n"); + else { + printk("; remounting read-only\n"); + mark_dirty(s, 0); + s->s_flags |= MS_RDONLY; + } + } else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n"); + else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... 
but you wanted it so!\n"); + } else printk("\n"); + hpfs_sb(s)->sb_was_error = 1; +} + +/* + * A little trick to detect cycles in many hpfs structures and don't let the + * kernel crash on corrupted filesystem. When first called, set c2 to 0. + * + * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories + * nested each in other, chkdsk locked up happilly. + */ + +int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, + char *msg) +{ + if (*c2 && *c1 == key) { + hpfs_error(s, "cycle detected on key %08x in %s", key, msg); + return 1; + } + (*c2)++; + if (!((*c2 - 1) & *c2)) *c1 = key; + return 0; +} + +static void hpfs_put_super(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + + hpfs_lock(s); + unmark_dirty(s); + hpfs_unlock(s); + + kfree(sbi->sb_cp_table); + kfree(sbi->sb_bmp_dir); + s->s_fs_info = NULL; + kfree(sbi); +} + +unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) +{ + struct quad_buffer_head qbh; + unsigned long *bits; + unsigned count; + + bits = hpfs_map_4sectors(s, secno, &qbh, 4); + if (!bits) + return 0; + count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); + hpfs_brelse4(&qbh); + return count; +} + +static unsigned count_bitmaps(struct super_block *s) +{ + unsigned n, count, n_bands; + n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + count = 0; + for (n = 0; n < n_bands; n++) + count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + return count; +} + +static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct hpfs_sb_info *sbi = hpfs_sb(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + hpfs_lock(s); + + /*if (sbi->sb_n_free == -1) {*/ + sbi->sb_n_free = count_bitmaps(s); + sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); + /*}*/ + buf->f_type = s->s_magic; + buf->f_bsize = 512; + buf->f_blocks = sbi->sb_fs_size; + buf->f_bfree = sbi->sb_n_free; + buf->f_bavail = sbi->sb_n_free; + buf->f_files = sbi->sb_dirband_size / 4; + buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 254; + + hpfs_unlock(s); + + return 0; +} + +static struct kmem_cache * hpfs_inode_cachep; + +static struct inode *hpfs_alloc_inode(struct super_block *sb) +{ + struct hpfs_inode_info *ei; + ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void hpfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); +} + +static void hpfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hpfs_i_callback); +} + +static void init_once(void *foo) +{ + struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", + sizeof(struct hpfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (hpfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(hpfs_inode_cachep); +} + +/* + * A tiny parser for option strings, stolen from dosfs. + * Stolen again from read-only hpfs. + * And updated for table-driven option parsing. 
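+ *
+ * For example, mounting with "uid=1000,gid=1000,umask=022,eas=rw"
+ * leaves sb_uid == sb_gid == 1000, sb_mode == 0777 & ~022 == 0755 and
+ * read/write extended attributes enabled; parse_opts() returns 1 on
+ * success, 2 when "help" was given and 0 on any malformed option.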
+ */ + +enum { + Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, + Opt_check_none, Opt_check_normal, Opt_check_strict, + Opt_err_cont, Opt_err_ro, Opt_err_panic, + Opt_eas_no, Opt_eas_ro, Opt_eas_rw, + Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always, + Opt_timeshift, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_help, "help"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_case_lower, "case=lower"}, + {Opt_case_asis, "case=asis"}, + {Opt_check_none, "check=none"}, + {Opt_check_normal, "check=normal"}, + {Opt_check_strict, "check=strict"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_err_panic, "errors=panic"}, + {Opt_eas_no, "eas=no"}, + {Opt_eas_ro, "eas=ro"}, + {Opt_eas_rw, "eas=rw"}, + {Opt_chkdsk_no, "chkdsk=no"}, + {Opt_chkdsk_errors, "chkdsk=errors"}, + {Opt_chkdsk_always, "chkdsk=always"}, + {Opt_timeshift, "timeshift=%d"}, + {Opt_err, NULL}, +}; + +static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, + int *lowercase, int *eas, int *chk, int *errs, + int *chkdsk, int *timeshift) +{ + char *p; + int option; + + if (!opts) + return 1; + + /*printk("Parsing opts: '%s'\n",opts);*/ + + while ((p = strsep(&opts, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_help: + return 2; + case Opt_uid: + if (match_int(args, &option)) + return 0; + *uid = option; + break; + case Opt_gid: + if (match_int(args, &option)) + return 0; + *gid = option; + break; + case Opt_umask: + if (match_octal(args, &option)) + return 0; + *umask = option; + break; + case Opt_case_lower: + *lowercase = 1; + break; + case Opt_case_asis: + *lowercase = 0; + break; + case Opt_check_none: + *chk = 0; + break; + case Opt_check_normal: + *chk = 1; + break; + case Opt_check_strict: + *chk = 2; + break; + case Opt_err_cont: + *errs = 0; + break; + case Opt_err_ro: + *errs = 1; + break; + case Opt_err_panic: + *errs = 2; + break; + case Opt_eas_no: + *eas = 0; + break; + case Opt_eas_ro: + *eas = 1; + break; + case Opt_eas_rw: + *eas = 2; + break; + case Opt_chkdsk_no: + *chkdsk = 0; + break; + case Opt_chkdsk_errors: + *chkdsk = 1; + break; + case Opt_chkdsk_always: + *chkdsk = 2; + break; + case Opt_timeshift: + { + int m = 1; + char *rhs = args[0].from; + if (!rhs || !*rhs) + return 0; + if (*rhs == '-') m = -1; + if (*rhs == '+' || *rhs == '-') rhs++; + *timeshift = simple_strtoul(rhs, &rhs, 0) * m; + if (*rhs) + return 0; + break; + } + default: + return 0; + } + } + return 1; +} + +static inline void hpfs_help(void) +{ + printk("\n\ +HPFS filesystem options:\n\ + help do not mount and display this text\n\ + uid=xxx set uid of files that don't have uid specified in eas\n\ + gid=xxx set gid of files that don't have gid specified in eas\n\ + umask=xxx set mode of files that don't have mode specified in eas\n\ + case=lower lowercase all files\n\ + case=asis do not lowercase files (default)\n\ + check=none no fs checks - kernel may crash on corrupted filesystem\n\ + check=normal do some checks - it should not crash (default)\n\ + check=strict do extra time-consuming checks, used for debugging\n\ + errors=continue continue on errors\n\ + errors=remount-ro remount read-only if errors found (default)\n\ + errors=panic panic on errors\n\ + chkdsk=no do not mark fs for chkdsking even if there were errors\n\ + chkdsk=errors mark fs dirty if errors found (default)\n\ + chkdsk=always always mark 
fs dirty - used for debugging\n\ + eas=no ignore extended attributes\n\ + eas=ro read but do not write extended attributes\n\ + eas=rw r/w eas => enables chmod, chown, mknod, ln -s (default)\n\ + timeshift=nnn add nnn seconds to file times\n\ +\n"); +} + +static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) +{ + uid_t uid; + gid_t gid; + umode_t umask; + int lowercase, eas, chk, errs, chkdsk, timeshift; + int o; + struct hpfs_sb_info *sbi = hpfs_sb(s); + char *new_opts = kstrdup(data, GFP_KERNEL); + + *flags |= MS_NOATIME; + + hpfs_lock(s); + lock_super(s); + uid = sbi->sb_uid; gid = sbi->sb_gid; + umask = 0777 & ~sbi->sb_mode; + lowercase = sbi->sb_lowercase; + eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; + errs = sbi->sb_err; timeshift = sbi->sb_timeshift; + + if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, + &eas, &chk, &errs, &chkdsk, ×hift))) { + printk("HPFS: bad mount options.\n"); + goto out_err; + } + if (o == 2) { + hpfs_help(); + goto out_err; + } + if (timeshift != sbi->sb_timeshift) { + printk("HPFS: timeshift can't be changed using remount.\n"); + goto out_err; + } + + unmark_dirty(s); + + sbi->sb_uid = uid; sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_lowercase = lowercase; + sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; sbi->sb_timeshift = timeshift; + + if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + + replace_mount_options(s, new_opts); + + unlock_super(s); + hpfs_unlock(s); + return 0; + +out_err: + unlock_super(s); + hpfs_unlock(s); + kfree(new_opts); + return -EINVAL; +} + +/* Super operations */ + +static const struct super_operations hpfs_sops = +{ + .alloc_inode = hpfs_alloc_inode, + .destroy_inode = hpfs_destroy_inode, + .evict_inode = hpfs_evict_inode, + .put_super = hpfs_put_super, + .statfs = hpfs_statfs, + .remount_fs = hpfs_remount_fs, + .show_options = generic_show_options, +}; + +static int hpfs_fill_super(struct super_block *s, void *options, int silent) +{ + struct buffer_head *bh0, *bh1, *bh2; + struct hpfs_boot_block *bootblock; + struct hpfs_super_block *superblock; + struct hpfs_spare_block *spareblock; + struct hpfs_sb_info *sbi; + struct inode *root; + + uid_t uid; + gid_t gid; + umode_t umask; + int lowercase, eas, chk, errs, chkdsk, timeshift; + + dnode_secno root_dno; + struct hpfs_dirent *de = NULL; + struct quad_buffer_head qbh; + + int o; + + save_mount_options(s, options); + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) { + return -ENOMEM; + } + s->s_fs_info = sbi; + + sbi->sb_bmp_dir = NULL; + sbi->sb_cp_table = NULL; + + mutex_init(&sbi->hpfs_mutex); + hpfs_lock(s); + + uid = current_uid(); + gid = current_gid(); + umask = current_umask(); + lowercase = 0; + eas = 2; + chk = 1; + errs = 1; + chkdsk = 1; + timeshift = 0; + + if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, + &eas, &chk, &errs, &chkdsk, ×hift))) { + printk("HPFS: bad mount options.\n"); + goto bail0; + } + if (o==2) { + hpfs_help(); + goto bail0; + } + + /*sbi->sb_mounting = 1;*/ + sb_set_blocksize(s, 512); + sbi->sb_fs_size = -1; + if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; + if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; + if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; + + /* Check magics */ + if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC + ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC + || le32_to_cpu(spareblock->magic) != SP_MAGIC) { + if (!silent) printk("HPFS: Bad magic ... 
probably not HPFS\n"); + goto bail4; + } + + /* Check version */ + if (!(s->s_flags & MS_RDONLY) && + superblock->funcversion != 2 && superblock->funcversion != 3) { + printk("HPFS: Bad version %d,%d. Mount readonly to go around\n", + (int)superblock->version, (int)superblock->funcversion); + printk("HPFS: please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); + goto bail4; + } + + s->s_flags |= MS_NOATIME; + + /* Fill superblock stuff */ + s->s_magic = HPFS_SUPER_MAGIC; + s->s_op = &hpfs_sops; + s->s_d_op = &hpfs_dentry_operations; + + sbi->sb_root = le32_to_cpu(superblock->root); + sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); + sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); + sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); + sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); + sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); + sbi->sb_uid = uid; + sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_n_free = -1; + sbi->sb_n_free_dnodes = -1; + sbi->sb_lowercase = lowercase; + sbi->sb_eas = eas; + sbi->sb_chk = chk; + sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; + sbi->sb_timeshift = timeshift; + sbi->sb_was_error = 0; + sbi->sb_cp_table = NULL; + sbi->sb_c_bitmap = -1; + sbi->sb_max_fwd_alloc = 0xffffff; + + /* Load bitmap directory */ + if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) + goto bail4; + + /* Check for general fs errors*/ + if (spareblock->dirty && !spareblock->old_wrote) { + if (errs == 2) { + printk("HPFS: Improperly stopped, not mounted\n"); + goto bail4; + } + hpfs_error(s, "improperly stopped"); + } + + if (!(s->s_flags & MS_RDONLY)) { + spareblock->dirty = 1; + spareblock->old_wrote = 0; + mark_buffer_dirty(bh2); + } + + if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) { + if (errs >= 2) { + printk("HPFS: Hotfixes not supported here, try chkdsk\n"); + mark_dirty(s, 0); + goto bail4; + } + hpfs_error(s, "hotfixes not supported here, try chkdsk"); + if (errs == 0) printk("HPFS: Proceeding, but your filesystem will be probably corrupted by this driver...\n"); + else printk("HPFS: This driver may read bad files or crash when operating on disk with hotfixes.\n"); + } + if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { + if (errs >= 2) { + printk("HPFS: Spare dnodes used, try chkdsk\n"); + mark_dirty(s, 0); + goto bail4; + } + hpfs_error(s, "warning: spare dnodes used, try chkdsk"); + if (errs == 0) printk("HPFS: Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); + } + if (chk) { + unsigned a; + if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || + le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { + hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", + le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); + goto bail4; + } + a = sbi->sb_dirband_size; + sbi->sb_dirband_size = 0; + if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || + hpfs_chk_sectors(s, 
le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || + hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { + mark_dirty(s, 0); + goto bail4; + } + sbi->sb_dirband_size = a; + } else printk("HPFS: You really don't want any checks? You are crazy...\n"); + + /* Load code page table */ + if (le32_to_cpu(spareblock->n_code_pages)) + if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) + printk("HPFS: Warning: code page support is disabled\n"); + + brelse(bh2); + brelse(bh1); + brelse(bh0); + + root = iget_locked(s, sbi->sb_root); + if (!root) + goto bail0; + hpfs_init_inode(root); + hpfs_read_inode(root); + unlock_new_inode(root); + s->s_root = d_alloc_root(root); + if (!s->s_root) { + iput(root); + goto bail0; + } + + /* + * find the root directory's . pointer & finish filling in the inode + */ + + root_dno = hpfs_fnode_dno(s, sbi->sb_root); + if (root_dno) + de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh); + if (!de) + hpfs_error(s, "unable to find root dir"); + else { + root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date)); + root->i_atime.tv_nsec = 0; + root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); + root->i_mtime.tv_nsec = 0; + root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); + root->i_ctime.tv_nsec = 0; + hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size); + hpfs_i(root)->i_parent_dir = root->i_ino; + if (root->i_size == -1) + root->i_size = 2048; + if (root->i_blocks == -1) + root->i_blocks = 5; + hpfs_brelse4(&qbh); + } + hpfs_unlock(s); + return 0; + +bail4: brelse(bh2); +bail3: brelse(bh1); +bail2: brelse(bh0); +bail1: +bail0: + hpfs_unlock(s); + kfree(sbi->sb_bmp_dir); + kfree(sbi->sb_cp_table); + s->s_fs_info = NULL; + kfree(sbi); + return -EINVAL; +} + +static struct dentry *hpfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); +} + +static struct file_system_type hpfs_fs_type = { + .owner = THIS_MODULE, + .name = "hpfs", + .mount = hpfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_hpfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&hpfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_hpfs_fs(void) +{ + unregister_filesystem(&hpfs_fs_type); + destroy_inodecache(); +} + +module_init(init_hpfs_fs) +module_exit(exit_hpfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile new file mode 100644 index 00000000..3a982bd9 --- /dev/null +++ b/fs/hppfs/Makefile @@ -0,0 +1,6 @@ +# +# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# Licensed under the GPL +# + +obj-$(CONFIG_HPPFS) += hppfs.o diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c new file mode 100644 index 00000000..9d71c95b --- /dev/null +++ b/fs/hppfs/hppfs.c @@ -0,0 +1,773 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "os.h" + +static struct inode *get_inode(struct super_block *, struct dentry *); + +struct hppfs_data { + struct list_head list; + char contents[PAGE_SIZE - sizeof(struct list_head)]; +}; + +struct hppfs_private { + 
struct file *proc_file; + int host_fd; + loff_t len; + struct hppfs_data *contents; +}; + +struct hppfs_inode_info { + struct dentry *proc_dentry; + struct inode vfs_inode; +}; + +static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) +{ + return container_of(inode, struct hppfs_inode_info, vfs_inode); +} + +#define HPPFS_SUPER_MAGIC 0xb00000ee + +static const struct super_operations hppfs_sbops; + +static int is_pid(struct dentry *dentry) +{ + struct super_block *sb; + int i; + + sb = dentry->d_sb; + if (dentry->d_parent != sb->s_root) + return 0; + + for (i = 0; i < dentry->d_name.len; i++) { + if (!isdigit(dentry->d_name.name[i])) + return 0; + } + return 1; +} + +static char *dentry_name(struct dentry *dentry, int extra) +{ + struct dentry *parent; + char *root, *name; + const char *seg_name; + int len, seg_len; + + len = 0; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) + len += strlen("pid") + 1; + else len += parent->d_name.len + 1; + parent = parent->d_parent; + } + + root = "proc"; + len += strlen(root); + name = kmalloc(len + extra + 1, GFP_KERNEL); + if (name == NULL) + return NULL; + + name[len] = '\0'; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) { + seg_name = "pid"; + seg_len = strlen("pid"); + } + else { + seg_name = parent->d_name.name; + seg_len = parent->d_name.len; + } + + len -= seg_len + 1; + name[len] = '/'; + strncpy(&name[len + 1], seg_name, seg_len); + parent = parent->d_parent; + } + strncpy(name, root, strlen(root)); + return name; +} + +static int file_removed(struct dentry *dentry, const char *file) +{ + char *host_file; + int extra, fd; + + extra = 0; + if (file != NULL) + extra += strlen(file) + 1; + + host_file = dentry_name(dentry, extra + strlen("/remove")); + if (host_file == NULL) { + printk(KERN_ERR "file_removed : allocation failed\n"); + return -ENOMEM; + } + + if (file != NULL) { + strcat(host_file, "/"); + strcat(host_file, file); + } + strcat(host_file, "/remove"); + + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + kfree(host_file); + if (fd > 0) { + os_close_file(fd); + return 1; + } + return 0; +} + +static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *proc_dentry, *parent; + struct qstr *name = &dentry->d_name; + struct inode *inode; + int err, deleted; + + deleted = file_removed(dentry, NULL); + if (deleted < 0) + return ERR_PTR(deleted); + else if (deleted) + return ERR_PTR(-ENOENT); + + parent = HPPFS_I(ino)->proc_dentry; + mutex_lock(&parent->d_inode->i_mutex); + proc_dentry = lookup_one_len(name->name, parent, name->len); + mutex_unlock(&parent->d_inode->i_mutex); + + if (IS_ERR(proc_dentry)) + return proc_dentry; + + err = -ENOMEM; + inode = get_inode(ino->i_sb, proc_dentry); + if (!inode) + goto out; + + d_add(dentry, inode); + return NULL; + + out: + return ERR_PTR(err); +} + +static const struct inode_operations hppfs_file_iops = { +}; + +static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, + loff_t *ppos, int is_user) +{ + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t n; + + read = file->f_path.dentry->d_inode->i_fop->read; + + if (!is_user) + set_fs(KERNEL_DS); + + n = (*read)(file, buf, count, &file->f_pos); + + if (!is_user) + set_fs(USER_DS); + + if (ppos) + *ppos = file->f_pos; + return n; +} + +static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) +{ + ssize_t n; + int cur, err; + char *new_buf; + + n = 
-ENOMEM; + new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (new_buf == NULL) { + printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); + goto out; + } + n = 0; + while (count > 0) { + cur = min_t(ssize_t, count, PAGE_SIZE); + err = os_read_file(fd, new_buf, cur); + if (err < 0) { + printk(KERN_ERR "hppfs_read : read failed, " + "errno = %d\n", err); + n = err; + goto out_free; + } else if (err == 0) + break; + + if (copy_to_user(buf, new_buf, err)) { + n = -EFAULT; + goto out_free; + } + n += err; + count -= err; + } + out_free: + kfree(new_buf); + out: + return n; +} + +static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct hppfs_private *hppfs = file->private_data; + struct hppfs_data *data; + loff_t off; + int err; + + if (hppfs->contents != NULL) { + int rem; + + if (*ppos >= hppfs->len) + return 0; + + data = hppfs->contents; + off = *ppos; + while (off >= sizeof(data->contents)) { + data = list_entry(data->list.next, struct hppfs_data, + list); + off -= sizeof(data->contents); + } + + if (off + count > hppfs->len) + count = hppfs->len - off; + rem = copy_to_user(buf, &data->contents[off], count); + *ppos += count - rem; + if (rem > 0) + return -EFAULT; + } else if (hppfs->host_fd != -1) { + err = os_seek_file(hppfs->host_fd, *ppos); + if (err) { + printk(KERN_ERR "hppfs_read : seek failed, " + "errno = %d\n", err); + return err; + } + err = hppfs_read_file(hppfs->host_fd, buf, count); + if (err < 0) { + printk(KERN_ERR "hppfs_read: read failed: %d\n", err); + return err; + } + count = err; + if (count > 0) + *ppos += count; + } + else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); + + return count; +} + +static ssize_t hppfs_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + write = proc_file->f_path.dentry->d_inode->i_fop->write; + return (*write)(proc_file, buf, len, ppos); +} + +static int open_host_sock(char *host_file, int *filter_out) +{ + char *end; + int fd; + + end = &host_file[strlen(host_file)]; + strcpy(end, "/rw"); + *filter_out = 1; + fd = os_connect_socket(host_file); + if (fd > 0) + return fd; + + strcpy(end, "/r"); + *filter_out = 0; + fd = os_connect_socket(host_file); + return fd; +} + +static void free_contents(struct hppfs_data *head) +{ + struct hppfs_data *data; + struct list_head *ele, *next; + + if (head == NULL) + return; + + list_for_each_safe(ele, next, &head->list) { + data = list_entry(ele, struct hppfs_data, list); + kfree(data); + } + kfree(head); +} + +static struct hppfs_data *hppfs_get_data(int fd, int filter, + struct file *proc_file, + struct file *hppfs_file, + loff_t *size_out) +{ + struct hppfs_data *data, *new, *head; + int n, err; + + err = -ENOMEM; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) { + printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); + goto failed; + } + + INIT_LIST_HEAD(&data->list); + + head = data; + *size_out = 0; + + if (filter) { + while ((n = read_proc(proc_file, data->contents, + sizeof(data->contents), NULL, 0)) > 0) + os_write_file(fd, data->contents, n); + err = os_shutdown_socket(fd, 0, 1); + if (err) { + printk(KERN_ERR "hppfs_get_data : failed to shut down " + "socket\n"); + goto failed_free; + } + } + while (1) { + n = os_read_file(fd, data->contents, sizeof(data->contents)); + if (n < 0) { + err = n; + printk(KERN_ERR 
"hppfs_get_data : read failed, " + "errno = %d\n", err); + goto failed_free; + } else if (n == 0) + break; + + *size_out += n; + + if (n < sizeof(data->contents)) + break; + + new = kmalloc(sizeof(*data), GFP_KERNEL); + if (new == 0) { + printk(KERN_ERR "hppfs_get_data : data allocation " + "failed\n"); + err = -ENOMEM; + goto failed_free; + } + + INIT_LIST_HEAD(&new->list); + list_add(&new->list, &data->list); + data = new; + } + return head; + + failed_free: + free_contents(head); + failed: + return ERR_PTR(err); +} + +static struct hppfs_private *hppfs_data(void) +{ + struct hppfs_private *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return data; + + *data = ((struct hppfs_private ) { .host_fd = -1, + .len = -1, + .contents = NULL } ); + return data; +} + +static int file_mode(int fmode) +{ + if (fmode == (FMODE_READ | FMODE_WRITE)) + return O_RDWR; + if (fmode == FMODE_READ) + return O_RDONLY; + if (fmode == FMODE_WRITE) + return O_WRONLY; + return 0; +} + +static int hppfs_open(struct inode *inode, struct file *file) +{ + const struct cred *cred = file->f_cred; + struct hppfs_private *data; + struct vfsmount *proc_mnt; + struct dentry *proc_dentry; + char *host_file; + int err, fd, type, filter; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + host_file = dentry_name(file->f_path.dentry, strlen("/rw")); + if (host_file == NULL) + goto out_free2; + + proc_dentry = HPPFS_I(inode)->proc_dentry; + proc_mnt = inode->i_sb->s_fs_info; + + /* XXX This isn't closed anywhere */ + data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), + file_mode(file->f_mode), cred); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free1; + + type = os_file_type(host_file); + if (type == OS_TYPE_FILE) { + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + if (fd >= 0) + data->host_fd = fd; + else + printk(KERN_ERR "hppfs_open : failed to open '%s', " + "errno = %d\n", host_file, -fd); + + data->contents = NULL; + } else if (type == OS_TYPE_DIR) { + fd = open_host_sock(host_file, &filter); + if (fd > 0) { + data->contents = hppfs_get_data(fd, filter, + data->proc_file, + file, &data->len); + if (!IS_ERR(data->contents)) + data->host_fd = fd; + } else + printk(KERN_ERR "hppfs_open : failed to open a socket " + "in '%s', errno = %d\n", host_file, -fd); + } + kfree(host_file); + + file->private_data = data; + return 0; + + out_free1: + kfree(host_file); + out_free2: + free_contents(data->contents); + kfree(data); + out: + return err; +} + +static int hppfs_dir_open(struct inode *inode, struct file *file) +{ + const struct cred *cred = file->f_cred; + struct hppfs_private *data; + struct vfsmount *proc_mnt; + struct dentry *proc_dentry; + int err; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + proc_dentry = HPPFS_I(inode)->proc_dentry; + proc_mnt = inode->i_sb->s_fs_info; + data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), + file_mode(file->f_mode), cred); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free; + + file->private_data = data; + return 0; + + out_free: + kfree(data); + out: + return err; +} + +static loff_t hppfs_llseek(struct file *file, loff_t off, int where) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + loff_t (*llseek)(struct file *, loff_t, int); + loff_t ret; + + llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; + if (llseek != NULL) { + ret = 
(*llseek)(proc_file, off, where); + if (ret < 0) + return ret; + } + + return default_llseek(file, off, where); +} + +static const struct file_operations hppfs_file_fops = { + .owner = NULL, + .llseek = hppfs_llseek, + .read = hppfs_read, + .write = hppfs_write, + .open = hppfs_open, +}; + +struct hppfs_dirent { + void *vfs_dirent; + filldir_t filldir; + struct dentry *dentry; +}; + +static int hppfs_filldir(void *d, const char *name, int size, + loff_t offset, u64 inode, unsigned int type) +{ + struct hppfs_dirent *dirent = d; + + if (file_removed(dirent->dentry, name)) + return 0; + + return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, + inode, type); +} + +static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + int (*readdir)(struct file *, void *, filldir_t); + struct hppfs_dirent dirent = ((struct hppfs_dirent) + { .vfs_dirent = ent, + .filldir = filldir, + .dentry = file->f_path.dentry + }); + int err; + + readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; + + proc_file->f_pos = file->f_pos; + err = (*readdir)(proc_file, &dirent, hppfs_filldir); + file->f_pos = proc_file->f_pos; + + return err; +} + +static int hppfs_fsync(struct file *file, int datasync) +{ + return 0; +} + +static const struct file_operations hppfs_dir_fops = { + .owner = NULL, + .readdir = hppfs_readdir, + .open = hppfs_dir_open, + .fsync = hppfs_fsync, + .llseek = default_llseek, +}; + +static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + sf->f_blocks = 0; + sf->f_bfree = 0; + sf->f_bavail = 0; + sf->f_files = 0; + sf->f_ffree = 0; + sf->f_type = HPPFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hppfs_alloc_inode(struct super_block *sb) +{ + struct hppfs_inode_info *hi; + + hi = kmalloc(sizeof(*hi), GFP_KERNEL); + if (!hi) + return NULL; + + hi->proc_dentry = NULL; + inode_init_once(&hi->vfs_inode); + return &hi->vfs_inode; +} + +void hppfs_evict_inode(struct inode *ino) +{ + end_writeback(ino); + dput(HPPFS_I(ino)->proc_dentry); + mntput(ino->i_sb->s_fs_info); +} + +static void hppfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kfree(HPPFS_I(inode)); +} + +static void hppfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hppfs_i_callback); +} + +static const struct super_operations hppfs_sbops = { + .alloc_inode = hppfs_alloc_inode, + .destroy_inode = hppfs_destroy_inode, + .evict_inode = hppfs_evict_inode, + .statfs = hppfs_statfs, +}; + +static int hppfs_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, + buflen); +} + +static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); +} + +static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + if (proc_dentry->d_inode->i_op->put_link) + proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); +} + +static const struct inode_operations hppfs_dir_iops = { + .lookup = hppfs_lookup, +}; + +static const struct inode_operations hppfs_link_iops = { + .readlink = 
hppfs_readlink, + .follow_link = hppfs_follow_link, + .put_link = hppfs_put_link, +}; + +static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) +{ + struct inode *proc_ino = dentry->d_inode; + struct inode *inode = new_inode(sb); + + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + if (S_ISDIR(dentry->d_inode->i_mode)) { + inode->i_op = &hppfs_dir_iops; + inode->i_fop = &hppfs_dir_fops; + } else if (S_ISLNK(dentry->d_inode->i_mode)) { + inode->i_op = &hppfs_link_iops; + inode->i_fop = &hppfs_file_fops; + } else { + inode->i_op = &hppfs_file_iops; + inode->i_fop = &hppfs_file_fops; + } + + HPPFS_I(inode)->proc_dentry = dentry; + + inode->i_uid = proc_ino->i_uid; + inode->i_gid = proc_ino->i_gid; + inode->i_atime = proc_ino->i_atime; + inode->i_mtime = proc_ino->i_mtime; + inode->i_ctime = proc_ino->i_ctime; + inode->i_ino = proc_ino->i_ino; + inode->i_mode = proc_ino->i_mode; + inode->i_nlink = proc_ino->i_nlink; + inode->i_size = proc_ino->i_size; + inode->i_blocks = proc_ino->i_blocks; + + return inode; +} + +static int hppfs_fill_super(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + struct vfsmount *proc_mnt; + int err = -ENOENT; + + proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + if (IS_ERR(proc_mnt)) + goto out; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HPPFS_SUPER_MAGIC; + sb->s_op = &hppfs_sbops; + sb->s_fs_info = proc_mnt; + + err = -ENOMEM; + root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root)); + if (!root_inode) + goto out_mntput; + + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_iput; + + return 0; + + out_iput: + iput(root_inode); + out_mntput: + mntput(proc_mnt); + out: + return(err); +} + +static struct dentry *hppfs_read_super(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return mount_nodev(type, flags, data, hppfs_fill_super); +} + +static struct file_system_type hppfs_type = { + .owner = THIS_MODULE, + .name = "hppfs", + .mount = hppfs_read_super, + .kill_sb = kill_anon_super, + .fs_flags = 0, +}; + +static int __init init_hppfs(void) +{ + return register_filesystem(&hppfs_type); +} + +static void __exit exit_hppfs(void) +{ + unregister_filesystem(&hppfs_type); +} + +module_init(init_hppfs) +module_exit(exit_hppfs) +MODULE_LICENSE("GPL"); diff --git a/fs/hugetlbfs/Makefile b/fs/hugetlbfs/Makefile new file mode 100644 index 00000000..6adf870c --- /dev/null +++ b/fs/hugetlbfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux ramfs routines. +# + +obj-$(CONFIG_HUGETLBFS) += hugetlbfs.o + +hugetlbfs-objs := inode.o diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c new file mode 100644 index 00000000..8b0c8753 --- /dev/null +++ b/fs/hugetlbfs/inode.c @@ -0,0 +1,1033 @@ +/* + * hugetlbpage-backed filesystem. Based on ramfs. + * + * William Irwin, 2002 + * + * Copyright (C) 2002 Linus Torvalds. 
+ */ + +#include +#include +#include +#include /* remove ASAP */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const struct super_operations hugetlbfs_ops; +static const struct address_space_operations hugetlbfs_aops; +const struct file_operations hugetlbfs_file_operations; +static const struct inode_operations hugetlbfs_dir_inode_operations; +static const struct inode_operations hugetlbfs_inode_operations; + +static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +int sysctl_hugetlb_shm_group; + +enum { + Opt_size, Opt_nr_inodes, + Opt_mode, Opt_uid, Opt_gid, + Opt_pagesize, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_size, "size=%s"}, + {Opt_nr_inodes, "nr_inodes=%s"}, + {Opt_mode, "mode=%o"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pagesize, "pagesize=%s"}, + {Opt_err, NULL}, +}; + +static void huge_pagevec_release(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); ++i) + put_page(pvec->pages[i]); + + pagevec_reinit(pvec); +} + +static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file->f_path.dentry->d_inode; + loff_t len, vma_len; + int ret; + struct hstate *h = hstate_file(file); + + /* + * vma address alignment (but not the pgoff alignment) has + * already been checked by prepare_hugepage_range. If you add + * any error returns here, do so after setting VM_HUGETLB, so + * is_vm_hugetlb_page tests below unmap_region go the right + * way when do_mmap_pgoff unwinds (may be important on powerpc + * and ia64). + */ + vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_ops = &hugetlb_vm_ops; + + if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) + return -EINVAL; + + vma_len = (loff_t)(vma->vm_end - vma->vm_start); + + mutex_lock(&inode->i_mutex); + file_accessed(file); + + ret = -ENOMEM; + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, + vma->vm_flags)) + goto out; + + ret = 0; + hugetlb_prefault_arch_hook(vma->vm_mm); + if (vma->vm_flags & VM_WRITE && inode->i_size < len) + inode->i_size = len; +out: + mutex_unlock(&inode->i_mutex); + + return ret; +} + +/* + * Called under down_write(mmap_sem). 
+ */ + +#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + start_addr = mm->free_area_cache; + + if (len <= mm->cached_hole_size) + start_addr = TASK_UNMAPPED_BASE; + +full_search: + addr = ALIGN(start_addr, huge_page_size(h)); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, huge_page_size(h)); + } +} +#endif + +static int +hugetlbfs_read_actor(struct page *page, unsigned long offset, + char __user *buf, unsigned long count, + unsigned long size) +{ + char *kaddr; + unsigned long left, copied = 0; + int i, chunksize; + + if (size > count) + size = count; + + /* Find which 4k chunk and offset with in that chunk */ + i = offset >> PAGE_CACHE_SHIFT; + offset = offset & ~PAGE_CACHE_MASK; + + while (size) { + chunksize = PAGE_CACHE_SIZE; + if (offset) + chunksize -= offset; + if (chunksize > size) + chunksize = size; + kaddr = kmap(&page[i]); + left = __copy_to_user(buf, kaddr + offset, chunksize); + kunmap(&page[i]); + if (left) { + copied += (chunksize - left); + break; + } + offset = 0; + size -= chunksize; + buf += chunksize; + copied += chunksize; + i++; + } + return copied ? copied : -EFAULT; +} + +/* + * Support for read() - Find the page attached to f_mapping and copy out the + * data. Its *very* similar to do_generic_mapping_read(), we can't use that + * since it has PAGE_CACHE_SIZE assumptions. + */ +static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, + size_t len, loff_t *ppos) +{ + struct hstate *h = hstate_file(filp); + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + unsigned long index = *ppos >> huge_page_shift(h); + unsigned long offset = *ppos & ~huge_page_mask(h); + unsigned long end_index; + loff_t isize; + ssize_t retval = 0; + + /* validate length */ + if (len == 0) + goto out; + + for (;;) { + struct page *page; + unsigned long nr, ret; + int ra; + + /* nr is the maximum number of bytes to copy from this page */ + nr = huge_page_size(h); + isize = i_size_read(inode); + if (!isize) + goto out; + end_index = (isize - 1) >> huge_page_shift(h); + if (index >= end_index) { + if (index > end_index) + goto out; + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; + if (nr <= offset) + goto out; + } + nr = nr - offset; + + /* Find the page */ + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + /* + * We have a HOLE, zero out the user-buffer for the + * length of the hole or request. + */ + ret = len < nr ? 
len : nr; + if (clear_user(buf, ret)) + ra = -EFAULT; + else + ra = 0; + } else { + unlock_page(page); + + /* + * We have the page, copy it to user space buffer. + */ + ra = hugetlbfs_read_actor(page, offset, buf, len, nr); + ret = ra; + page_cache_release(page); + } + if (ra < 0) { + if (retval == 0) + retval = ra; + goto out; + } + + offset += ret; + retval += ret; + len -= ret; + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); + + /* short read or no more work */ + if ((ret != nr) || (len == 0)) + break; + } +out: + *ppos = ((loff_t)index << huge_page_shift(h)) + offset; + return retval; +} + +static int hugetlbfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + return -EINVAL; +} + +static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + BUG(); + return -EINVAL; +} + +static void truncate_huge_page(struct page *page) +{ + cancel_dirty_page(page, /* No IO accounting for huge pages? */0); + ClearPageUptodate(page); + delete_from_page_cache(page); +} + +static void truncate_hugepages(struct inode *inode, loff_t lstart) +{ + struct hstate *h = hstate_inode(inode); + struct address_space *mapping = &inode->i_data; + const pgoff_t start = lstart >> huge_page_shift(h); + struct pagevec pvec; + pgoff_t next; + int i, freed = 0; + + pagevec_init(&pvec, 0); + next = start; + while (1) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + + for (i = 0; i < pagevec_count(&pvec); ++i) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (page->index > next) + next = page->index; + ++next; + truncate_huge_page(page); + unlock_page(page); + freed++; + } + huge_pagevec_release(&pvec); + } + BUG_ON(!lstart && mapping->nrpages); + hugetlb_unreserve_pages(inode, start, freed); +} + +static void hugetlbfs_evict_inode(struct inode *inode) +{ + truncate_hugepages(inode, 0); + end_writeback(inode); +} + +static inline void +hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + unsigned long v_offset; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the prio_tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. 
+ */ + if (vma->vm_pgoff < pgoff) + v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + __unmap_hugepage_range(vma, + vma->vm_start + v_offset, vma->vm_end, NULL); + } +} + +static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) +{ + pgoff_t pgoff; + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + + BUG_ON(offset & ~huge_page_mask(h)); + pgoff = offset >> PAGE_SHIFT; + + i_size_write(inode, offset); + mutex_lock(&mapping->i_mmap_mutex); + if (!prio_tree_empty(&mapping->i_mmap)) + hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); + mutex_unlock(&mapping->i_mmap_mutex); + truncate_hugepages(inode, offset); + return 0; +} + +static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct hstate *h = hstate_inode(inode); + int error; + unsigned int ia_valid = attr->ia_valid; + + BUG_ON(!inode); + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE) { + error = -EINVAL; + if (attr->ia_size & ~huge_page_mask(h)) + return -EINVAL; + error = hugetlb_vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, + gid_t gid, int mode, dev_t dev) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = uid; + inode->i_gid = gid; + inode->i_mapping->a_ops = &hugetlbfs_aops; + inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + INIT_LIST_HEAD(&inode->i_mapping->private_list); + info = HUGETLBFS_I(inode); + /* + * The policy is initialized here even if we are creating a + * private inode because initialization simply creates an + * an empty rb tree and calls spin_lock_init(), later when we + * call mpol_free_shared_policy() it will just return because + * the rb tree will still be empty. + */ + mpol_shared_policy_init(&info->policy, NULL); + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &hugetlbfs_inode_operations; + inode->i_fop = &hugetlbfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
+ */ +static int hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode; + int error = -ENOSPC; + gid_t gid; + + if (dir->i_mode & S_ISGID) { + gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + gid = current_fsgid(); + } + inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev); + if (inode) { + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + } + return error; +} + +static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int hugetlbfs_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + struct inode *inode; + int error = -ENOSPC; + gid_t gid; + + if (dir->i_mode & S_ISGID) + gid = dir->i_gid; + else + gid = current_fsgid(); + + inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), + gid, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + d_instantiate(dentry, inode); + dget(dentry); + } else + iput(inode); + } + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + return error; +} + +/* + * mark the head page dirty + */ +static int hugetlbfs_set_page_dirty(struct page *page) +{ + struct page *head = compound_head(page); + + SetPageDirty(head); + return 0; +} + +static int hugetlbfs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int rc; + + rc = migrate_huge_page_move_mapping(mapping, newpage, page); + if (rc) + return rc; + migrate_page_copy(newpage, page); + + return 0; +} + +static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(dentry->d_inode); + + buf->f_type = HUGETLBFS_MAGIC; + buf->f_bsize = huge_page_size(h); + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + /* If no limits set, just report 0 for max/free/used + * blocks, like simple_statfs() */ + if (sbinfo->max_blocks >= 0) { + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } + spin_unlock(&sbinfo->stat_lock); + } + buf->f_namelen = NAME_MAX; + return 0; +} + +static void hugetlbfs_put_super(struct super_block *sb) +{ + struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); + + if (sbi) { + sb->s_fs_info = NULL; + kfree(sbi); + } +} + +static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + if (unlikely(!sbinfo->free_inodes)) { + spin_unlock(&sbinfo->stat_lock); + return 0; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + + return 1; +} + +static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + + +static struct kmem_cache *hugetlbfs_inode_cachep; + +static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + struct 
hugetlbfs_inode_info *p; + + if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) + return NULL; + p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); + if (unlikely(!p)) { + hugetlbfs_inc_free_inodes(sbinfo); + return NULL; + } + return &p->vfs_inode; +} + +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + +static void hugetlbfs_destroy_inode(struct inode *inode) +{ + hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); + mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); +} + +static const struct address_space_operations hugetlbfs_aops = { + .write_begin = hugetlbfs_write_begin, + .write_end = hugetlbfs_write_end, + .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, +}; + + +static void init_once(void *foo) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + inode_init_once(&ei->vfs_inode); +} + +const struct file_operations hugetlbfs_file_operations = { + .read = hugetlbfs_read, + .mmap = hugetlbfs_file_mmap, + .fsync = noop_fsync, + .get_unmapped_area = hugetlb_get_unmapped_area, + .llseek = default_llseek, +}; + +static const struct inode_operations hugetlbfs_dir_inode_operations = { + .create = hugetlbfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = hugetlbfs_symlink, + .mkdir = hugetlbfs_mkdir, + .rmdir = simple_rmdir, + .mknod = hugetlbfs_mknod, + .rename = simple_rename, + .setattr = hugetlbfs_setattr, +}; + +static const struct inode_operations hugetlbfs_inode_operations = { + .setattr = hugetlbfs_setattr, +}; + +static const struct super_operations hugetlbfs_ops = { + .alloc_inode = hugetlbfs_alloc_inode, + .destroy_inode = hugetlbfs_destroy_inode, + .evict_inode = hugetlbfs_evict_inode, + .statfs = hugetlbfs_statfs, + .put_super = hugetlbfs_put_super, + .show_options = generic_show_options, +}; + +static int +hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +{ + char *p, *rest; + substring_t args[MAX_OPT_ARGS]; + int option; + unsigned long long size = 0; + enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->uid = option; + break; + + case Opt_gid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->gid = option; + break; + + case Opt_mode: + if (match_octal(&args[0], &option)) + goto bad_val; + pconfig->mode = option & 01777U; + break; + + case Opt_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + size = memparse(args[0].from, &rest); + setsize = SIZE_STD; + if (*rest == '%') + setsize = SIZE_PERCENT; + break; + } + + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + pconfig->nr_inodes = memparse(args[0].from, &rest); + break; + + case Opt_pagesize: { + unsigned long ps; + ps = memparse(args[0].from, &rest); + pconfig->hstate = size_to_hstate(ps); + if (!pconfig->hstate) { + printk(KERN_ERR + "hugetlbfs: Unsupported page size %lu MB\n", + ps >> 20); + return -EINVAL; + } + break; + } + + default: + 
printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", + p); + return -EINVAL; + break; + } + } + + /* Do size after hstate is set up */ + if (setsize > NO_SIZE) { + struct hstate *h = pconfig->hstate; + if (setsize == SIZE_PERCENT) { + size <<= huge_page_shift(h); + size *= h->max_huge_pages; + do_div(size, 100); + } + pconfig->nr_blocks = (size >> huge_page_shift(h)); + } + + return 0; + +bad_val: + printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", + args[0].from, p); + return -EINVAL; +} + +static int +hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode * inode; + struct dentry * root; + int ret; + struct hugetlbfs_config config; + struct hugetlbfs_sb_info *sbinfo; + + save_mount_options(sb, data); + + config.nr_blocks = -1; /* No limit on size by default */ + config.nr_inodes = -1; /* No limit on number of inodes by default */ + config.uid = current_fsuid(); + config.gid = current_fsgid(); + config.mode = 0755; + config.hstate = &default_hstate; + ret = hugetlbfs_parse_options(data, &config); + if (ret) + return ret; + + sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_blocks = config.nr_blocks; + sbinfo->free_blocks = config.nr_blocks; + sbinfo->max_inodes = config.nr_inodes; + sbinfo->free_inodes = config.nr_inodes; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_magic = HUGETLBFS_MAGIC; + sb->s_op = &hugetlbfs_ops; + sb->s_time_gran = 1; + inode = hugetlbfs_get_inode(sb, config.uid, config.gid, + S_IFDIR | config.mode, 0); + if (!inode) + goto out_free; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + goto out_free; + } + sb->s_root = root; + return 0; +out_free: + kfree(sbinfo); + return -ENOMEM; +} + +int hugetlb_get_quota(struct address_space *mapping, long delta) +{ + int ret = 0; + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + + if (sbinfo->free_blocks > -1) { + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks - delta >= 0) + sbinfo->free_blocks -= delta; + else + ret = -ENOMEM; + spin_unlock(&sbinfo->stat_lock); + } + + return ret; +} + +void hugetlb_put_quota(struct address_space *mapping, long delta) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + + if (sbinfo->free_blocks > -1) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks += delta; + spin_unlock(&sbinfo->stat_lock); + } +} + +static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); +} + +static struct file_system_type hugetlbfs_fs_type = { + .name = "hugetlbfs", + .mount = hugetlbfs_mount, + .kill_sb = kill_litter_super, +}; + +static struct vfsmount *hugetlbfs_vfsmount; + +static int can_do_hugetlb_shm(void) +{ + return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); +} + +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, + struct user_struct **user, int creat_flags) +{ + int error = -ENOMEM; + struct file *file; + struct inode *inode; + struct path path; + struct dentry *root; + struct qstr quick_string; + + *user = NULL; + if (!hugetlbfs_vfsmount) + return ERR_PTR(-ENOENT); + + if (creat_flags == HUGETLB_SHMFS_INODE && 
!can_do_hugetlb_shm()) { + *user = current_user(); + if (user_shm_lock(size, *user)) { + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); + } else { + *user = NULL; + return ERR_PTR(-EPERM); + } + } + + root = hugetlbfs_vfsmount->mnt_root; + quick_string.name = name; + quick_string.len = strlen(quick_string.name); + quick_string.hash = 0; + path.dentry = d_alloc(root, &quick_string); + if (!path.dentry) + goto out_shm_unlock; + + path.mnt = mntget(hugetlbfs_vfsmount); + error = -ENOSPC; + inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), + current_fsgid(), S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto out_dentry; + + error = -ENOMEM; + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) + goto out_inode; + + d_instantiate(path.dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; + + error = -ENFILE; + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + &hugetlbfs_file_operations); + if (!file) + goto out_dentry; /* inode is already attached */ + + return file; + +out_inode: + iput(inode); +out_dentry: + path_put(&path); +out_shm_unlock: + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } + return ERR_PTR(error); +} + +static int __init init_hugetlbfs_fs(void) +{ + int error; + struct vfsmount *vfsmount; + + error = bdi_init(&hugetlbfs_backing_dev_info); + if (error) + return error; + + hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", + sizeof(struct hugetlbfs_inode_info), + 0, 0, init_once); + if (hugetlbfs_inode_cachep == NULL) + goto out2; + + error = register_filesystem(&hugetlbfs_fs_type); + if (error) + goto out; + + vfsmount = kern_mount(&hugetlbfs_fs_type); + + if (!IS_ERR(vfsmount)) { + hugetlbfs_vfsmount = vfsmount; + return 0; + } + + error = PTR_ERR(vfsmount); + + out: + if (error) + kmem_cache_destroy(hugetlbfs_inode_cachep); + out2: + bdi_destroy(&hugetlbfs_backing_dev_info); + return error; +} + +static void __exit exit_hugetlbfs_fs(void) +{ + kmem_cache_destroy(hugetlbfs_inode_cachep); + unregister_filesystem(&hugetlbfs_fs_type); + bdi_destroy(&hugetlbfs_backing_dev_info); +} + +module_init(init_hugetlbfs_fs) +module_exit(exit_hugetlbfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/inode.c b/fs/inode.c new file mode 100644 index 00000000..43566d17 --- /dev/null +++ b/fs/inode.c @@ -0,0 +1,1697 @@ +/* + * (C) 1997 Linus Torvalds + * (C) 1999 Andrea Arcangeli (dynamic inode allocation) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for inode_has_buffers */ +#include "internal.h" + +/* + * Inode locking rules: + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * inode_lru_lock protects: + * inode_lru, inode->i_lru + * inode_sb_list_lock protects: + * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash + * + * Lock ordering: + * + * inode_sb_list_lock + * inode->i_lock + * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock + */ + +static unsigned int i_hash_mask __read_mostly; +static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; 
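+/*
+ * Descriptive note (added for clarity, derived from the code below):
+ * inode_hashtable holds every hashed in-core inode.  Buckets are selected by
+ * hash(sb, hashval) further down and entries are chained through
+ * inode->i_hash, all under inode_hash_lock (see __insert_inode_hash() and
+ * remove_inode_hash()).
+ */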
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + +static LIST_HEAD(inode_lru); +static DEFINE_SPINLOCK(inode_lru_lock); + +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); + +/* + * iprune_sem provides exclusion between the icache shrinking and the + * umount path. + * + * We don't actually need it to protect anything in the umount path, + * but only need to cycle through it to make sure any inode that + * prune_icache took off the LRU list has been fully torn down by the + * time we are past evict_inodes. + */ +static DECLARE_RWSEM(iprune_sem); + +/* + * Empty aops. Can be used for the cases where the user does not + * define any of the address_space operations. + */ +const struct address_space_operations empty_aops = { +}; +EXPORT_SYMBOL(empty_aops); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static DEFINE_PER_CPU(unsigned int, nr_inodes); + +static struct kmem_cache *inode_cachep __read_mostly; + +static int get_nr_inodes(void) +{ + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_inodes, i); + return sum < 0 ? 0 : sum; +} + +static inline int get_nr_inodes_unused(void) +{ + return inodes_stat.nr_unused; +} + +int get_nr_dirty_inodes(void) +{ + /* not actually dirty inodes, but a wild approximation */ + int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + return nr_dirty > 0 ? nr_dirty : 0; +} + +/* + * Handle nr_inode sysctl + */ +#ifdef CONFIG_SYSCTL +int proc_nr_inodes(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + +/** + * inode_init_always - perform inode structure intialisation + * @sb: superblock inode belongs to + * @inode: inode to initialise + * + * These are initializations that need to be done on every inode + * allocation as the fields are not initialised by slab allocation. + */ +int inode_init_always(struct super_block *sb, struct inode *inode) +{ + static const struct inode_operations empty_iops; + static const struct file_operations empty_fops; + struct address_space *const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + inode->i_uid = 0; + inode->i_gid = 0; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; +#ifdef CONFIG_QUOTA + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +#endif + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; + inode->dirtied_when = 0; + + if (security_inode_alloc(inode)) + goto out; + spin_lock_init(&inode->i_lock); + lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + + init_rwsem(&inode->i_alloc_sem); + lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + mapping->writeback_index = 0; + + /* + * If the block_device provides a backing_dev_info for client + * inodes then use that. 
Otherwise the inode share the bdev's + * backing_dev_info. + */ + if (sb->s_bdev) { + struct backing_dev_info *bdi; + + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + mapping->backing_dev_info = bdi; + } + inode->i_private = NULL; + inode->i_mapping = mapping; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; +#endif + +#ifdef CONFIG_FSNOTIFY + inode->i_fsnotify_mask = 0; +#endif + + this_cpu_inc(nr_inodes); + + return 0; +out: + return -ENOMEM; +} +EXPORT_SYMBOL(inode_init_always); + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; +} + +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + +void __destroy_inode(struct inode *inode) +{ + BUG_ON(inode_has_buffers(inode)); + security_inode_free(inode); + fsnotify_inode_delete(inode); +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_acl); + if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_default_acl); +#endif + this_cpu_dec(nr_inodes); +} +EXPORT_SYMBOL(__destroy_inode); + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + +static void destroy_inode(struct inode *inode) +{ + BUG_ON(!list_empty(&inode->i_lru)); + __destroy_inode(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + call_rcu(&inode->i_rcu, i_callback); +} + +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + mutex_init(&mapping->i_mmap_mutex); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); +} +EXPORT_SYMBOL(address_space_init_once); + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. + */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_lru); + address_space_init_once(&inode->i_data); + i_size_ordered_init(inode); +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&inode->i_fsnotify_marks); +#endif +} +EXPORT_SYMBOL(inode_init_once); + +static void init_once(void *foo) +{ + struct inode *inode = (struct inode *) foo; + + inode_init_once(inode); +} + +/* + * inode->i_lock must be held + */ +void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + +/* + * get additional reference to inode; caller must already hold one. 
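+ * (Note, from the body below: the atomic_inc_return() check warns if that
+ * assumption is violated, i.e. if the count was not already at least one.)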
+ */ +void ihold(struct inode *inode) +{ + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + +static void inode_lru_list_add(struct inode *inode) +{ + spin_lock(&inode_lru_lock); + if (list_empty(&inode->i_lru)) { + list_add(&inode->i_lru, &inode_lru); + inodes_stat.nr_unused++; + } + spin_unlock(&inode_lru_lock); +} + +static void inode_lru_list_del(struct inode *inode) +{ + spin_lock(&inode_lru_lock); + if (!list_empty(&inode->i_lru)) { + list_del_init(&inode->i_lru); + inodes_stat.nr_unused--; + } + spin_unlock(&inode_lru_lock); +} + +/** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add + */ +void inode_sb_list_add(struct inode *inode) +{ + spin_lock(&inode_sb_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_sb_list_lock); +} +EXPORT_SYMBOL_GPL(inode_sb_list_add); + +static inline void inode_sb_list_del(struct inode *inode) +{ + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); +} + +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); + return tmp & i_hash_mask; +} + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_add_head(&inode->i_hash, b); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL(__insert_inode_hash); + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL(remove_inode_hash); + +void end_writeback(struct inode *inode) +{ + might_sleep(); + /* + * We have to cycle tree_lock here because reclaim can be still in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free mapping under it. + */ + spin_lock_irq(&inode->i_data.tree_lock); + BUG_ON(inode->i_data.nrpages); + spin_unlock_irq(&inode->i_data.tree_lock); + BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); + inode_sync_wait(inode); + /* don't need i_lock here, no concurrent mods to i_state */ + inode->i_state = I_FREEING | I_CLEAR; +} +EXPORT_SYMBOL(end_writeback); + +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. 
This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ +static void evict(struct inode *inode) +{ + const struct super_operations *op = inode->i_sb->s_op; + + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + inode_wb_list_del(inode); + inode_sb_list_del(inode); + + if (op->evict_inode) { + op->evict_inode(inode); + } else { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + } + if (S_ISBLK(inode->i_mode) && inode->i_bdev) + bd_forget(inode); + if (S_ISCHR(inode->i_mode) && inode->i_cdev) + cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); +} + +/* + * dispose_list - dispose of the contents of a local list + * @head: the head of the list to free + * + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct inode *inode; + + inode = list_first_entry(head, struct inode, i_lru); + list_del_init(&inode->i_lru); + + evict(inode); + } +} + +/** + * evict_inodes - evict all evictable inodes for a superblock + * @sb: superblock to operate on + * + * Make sure that no inodes with zero refcount are retained. This is + * called by superblock shutdown after having MS_ACTIVE flag removed, + * so any inode reaching zero refcount during or after that call will + * be immediately evicted. + */ +void evict_inodes(struct super_block *sb) +{ + struct inode *inode, *next; + LIST_HEAD(dispose); + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (atomic_read(&inode->i_count)) + continue; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + } + spin_unlock(&inode_sb_list_lock); + + dispose_list(&dispose); + + /* + * Cycle through iprune_sem to make sure any inode that prune_icache + * moved off the list before we took the lock has been fully torn + * down. + */ + down_write(&iprune_sem); + up_write(&iprune_sem); +} + +/** + * invalidate_inodes - attempt to free all inodes on a superblock + * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes + * + * Attempts to free all inodes for a given superblock. If there were any + * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. 
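+ * Candidate inodes are marked I_FREEING, collected on a private dispose
+ * list, and only evicted once inode_sb_list_lock has been dropped.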
+ */ +int invalidate_inodes(struct super_block *sb, bool kill_dirty) +{ + int busy = 0; + struct inode *inode, *next; + LIST_HEAD(dispose); + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & I_DIRTY && !kill_dirty) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + } + spin_unlock(&inode_sb_list_lock); + + dispose_list(&dispose); + + return busy; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state & ~I_REFERENCED) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a + * temporary list and then are freed outside inode_lru_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. + * + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because + * we are doing lazy LRU updates to minimise lock contention, so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. + */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_scanned; + unsigned long reap = 0; + + down_read(&iprune_sem); + spin_lock(&inode_lru_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_lru)) + break; + + inode = list_entry(inode_lru.prev, struct inode, i_lru); + + /* + * we are inverting the inode_lru_lock/inode->i_lock here, + * so use a trylock. If we fail to get the lock, just move the + * inode to the back of the list so we don't spin on it. + */ + if (!spin_trylock(&inode->i_lock)) { + list_move(&inode->i_lru, &inode_lru); + continue; + } + + /* + * Referenced or dirty inodes are still in use. Give them + * another pass through the LRU as we cannot reclaim them now.
+ */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + inodes_stat.nr_unused--; + continue; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + list_move(&inode->i_lru, &inode_lru); + spin_unlock(&inode->i_lock); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_lru_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_mapping_pages(&inode->i_data, + 0, -1); + iput(inode); + spin_lock(&inode_lru_lock); + + if (inode != list_entry(inode_lru.next, + struct inode, i_lru)) + continue; /* wrong inode or list_empty */ + /* avoid lock inversions with trylock */ + if (!spin_trylock(&inode->i_lock)) + continue; + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); + continue; + } + } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); + + list_move(&inode->i_lru, &freeable); + inodes_stat.nr_unused--; + } + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + spin_unlock(&inode_lru_lock); + + dispose_list(&freeable); + up_read(&iprune_sem); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (!(gfp_mask & __GFP_FS)) + return -1; + prune_icache(nr); + } + return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker icache_shrinker = { + .shrink = shrink_icache_memory, + .seeks = DEFAULT_SEEKS, +}; + +static void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + */ +static struct inode *find_inode(struct super_block *sb, + struct hlist_head *head, + int (*test)(struct inode *, void *), + void *data) +{ + struct hlist_node *node; + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, node, head, i_hash) { + spin_lock(&inode->i_lock); + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); + continue; + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. 
+ */ +static struct inode *find_inode_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, node, head, i_hash) { + spin_lock(&inode->i_lock); + if (inode->i_ino != ino) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * Each cpu owns a range of LAST_INO_BATCH numbers. + * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, + * to renew the exhausted range. + * + * This does not significantly increase overflow rate because every CPU can + * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is + * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the + * 2^32 range, and is a worst-case. Even a 50% wastage would only increase + * overflow rate by 2x, which does not seem too significant. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(unsigned int, last_ino); + +unsigned int get_next_ino(void) +{ + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; + +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; + int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); + + res = next - LAST_INO_BATCH; + } +#endif + + *p = ++res; + put_cpu_var(last_ino); + return res; +} +EXPORT_SYMBOL(get_next_ino); + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. The default gfp_mask + * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. + * If HIGHMEM pages are unsuitable or it is known that pages allocated + * for the page cache are not reclaimable or migratable, + * mapping_set_gfp_mask() must be called with suitable flags on the + * newly created inode's mapping + * + */ +struct inode *new_inode(struct super_block *sb) +{ + struct inode *inode; + + spin_lock_prefetch(&inode_sb_list_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + } + return inode; +} +EXPORT_SYMBOL(new_inode); + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. 
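For orientation (this sketch is not part of the patch), the I_NEW protocol described above is normally driven from a filesystem's own inode-lookup helper: obtain the inode with iget_locked(), fill it in only when I_NEW is set, then publish it with unlock_new_inode(). The helper name and the fill-in step below are hypothetical:

struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;	/* already cached and fully initialised */

	/* ... read the on-disk inode and set i_mode, i_size, i_op, ... */

	unlock_new_inode(inode);	/* clears I_NEW and wakes any waiters */
	return inode;
}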
+ */ +void unlock_new_inode(struct inode *inode) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (S_ISDIR(inode->i_mode)) { + struct file_system_type *type = inode->i_sb->s_type; + + /* Set new key only if filesystem hasn't already changed it */ + if (!lockdep_match_class(&inode->i_mutex, + &type->i_mutex_key)) { + /* + * ensure nobody is actually holding i_mutex + */ + mutex_destroy(&inode->i_mutex); + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, + &type->i_mutex_dir_key); + } + } +#endif + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(unlock_new_inode); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present, return it with an increased reference count. This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + if (inode) { + wait_on_inode(inode); + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + return NULL; +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set.
The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + wait_on_inode(inode); + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} +EXPORT_SYMBOL(iget_locked); + +/* + * search the inode cache for a matching inode number. + * If we find one, then the inode number we are trying to + * allocate is not unique and so we should not use it. + * + * Returns 1 if the inode number is unique, 0 if it is not. + */ +static int test_inode_iunique(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct hlist_node *node; + struct inode *inode; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); + return 0; + } + } + spin_unlock(&inode_hash_lock); + + return 1; +} + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + /* + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ + static DEFINE_SPINLOCK(iunique_lock); + static unsigned int counter; + ino_t res; + + spin_lock(&iunique_lock); + do { + if (counter <= max_reserved) + counter = max_reserved + 1; + res = counter++; + } while (!test_inode_iunique(sb, res)); + spin_unlock(&iunique_lock); + + return res; +} +EXPORT_SYMBOL(iunique); + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + __iget(inode); + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. 
+ */ + inode = NULL; + } + return inode; +} +EXPORT_SYMBOL(igrab); + +/** + * ilookup5_nowait - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache. + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. + * + * Note2: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + return inode; +} +EXPORT_SYMBOL(ilookup5_nowait); + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. + * + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. + * + * Note: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); + + if (inode) + wait_on_inode(inode); + return inode; +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count.
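As a sketch of the @test contract spelled out above (the callback runs under inode_hash_lock and therefore must not sleep or take sleeping locks), a filesystem whose inodes are keyed by something wider than i_ino might pair ilookup5() with a comparison like the one below. myfs_test(), MYFS_I() and the objid field are invented for this example and are not part of the patch:

static int myfs_test(struct inode *inode, void *data)
{
	u64 objid = *(u64 *)data;

	/* purely in-memory comparison: inode_hash_lock is held */
	return MYFS_I(inode)->objid == objid;
}

	inode = ilookup5(sb, (unsigned long)objid, myfs_test, &objid);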
+ */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + + if (inode) + wait_on_inode(inode); + return inode; +} +EXPORT_SYMBOL(ilookup); + +int insert_inode_locked(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + while (1) { + struct hlist_node *node; + struct inode *old = NULL; + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked); + +int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct super_block *sb = inode->i_sb; + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + while (1) { + struct hlist_node *node; + struct inode *old = NULL; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_sb != sb) + continue; + if (!test(old, data)) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked4); + + +int generic_delete_inode(struct inode *inode) +{ + return 1; +} +EXPORT_SYMBOL(generic_delete_inode); + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +int generic_drop_inode(struct inode *inode) +{ + return !inode->i_nlink || inode_unhashed(inode); +} +EXPORT_SYMBOL_GPL(generic_drop_inode); + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop_inode()" function, defaulting to + * the legacy UNIX filesystem behaviour. If it tells + * us to evict inode, do so. Otherwise, retain inode + * in cache if fs is alive, sync and evict if fs is + * shutting down. 
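A filesystem steers this decision through the ->drop_inode hook in its super_operations. For example, a purely in-memory filesystem that never wants unreferenced inodes to linger in the cache could point the hook at generic_delete_inode(); the structure name below is illustrative only:

static const struct super_operations myfs_super_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,	/* evict on the final iput() */
};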
+ */ +static void iput_final(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + const struct super_operations *op = inode->i_sb->s_op; + int drop; + + WARN_ON(inode->i_state & I_NEW); + + if (op && op->drop_inode) + drop = op->drop_inode(inode); + else + drop = generic_drop_inode(inode); + + if (!drop && (sb->s_flags & MS_ACTIVE)) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) + inode_lru_list_add(inode); + spin_unlock(&inode->i_lock); + return; + } + + if (!drop) { + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + + evict(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero, the inode is then freed and may also be destroyed. + * + * Consequently, iput() can sleep. + */ +void iput(struct inode *inode) +{ + if (inode) { + BUG_ON(inode->i_state & I_CLEAR); + + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) + iput_final(inode); + } +} +EXPORT_SYMBOL(iput); + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. + */ +sector_t bmap(struct inode *inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} +EXPORT_SYMBOL(bmap); + +/* + * With relative atime, only update atime if the previous atime is + * earlier than either the ctime or mtime or if at least a day has + * passed since the last atime update. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +/** + * touch_atime - update the access time + * @mnt: mount the inode is accessed on + * @dentry: dentry accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. 
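A hypothetical call site, for illustration only: after a successful read, a filesystem (or the generic read path on its behalf, typically via the file_accessed() wrapper, which also honours O_NOATIME) ends up doing something like:

	touch_atime(file->f_path.mnt, file->f_path.dentry);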
+ */ +void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct timespec now; + + if (inode->i_flags & S_NOATIME) + return; + if (IS_NOATIME(inode)) + return; + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; + + if (mnt->mnt_flags & MNT_NOATIME) + return; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; + + now = current_fs_time(inode->i_sb); + + if (!relatime_need_update(mnt, inode, now)) + return; + + if (timespec_equal(&inode->i_atime, &now)) + return; + + if (mnt_want_write(mnt)) + return; + + inode->i_atime = now; + mark_inode_dirty_sync(inode); + mnt_drop_write(mnt); +} +EXPORT_SYMBOL(touch_atime); + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode + * for writeback. Note that this function is meant exclusively for + * usage in the file write path of filesystems, and filesystems may + * choose to explicitly ignore update via this function with the + * S_NOCMTIME inode flag, e.g. for network filesystem where these + * timestamps are handled by the server. + */ + +void file_update_time(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct timespec now; + enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; + + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return; + + /* Finally allowed to write? Takes lock. */ + if (mnt_want_write_file(file)) + return; + + /* Only change inode inside the lock region */ + if (sync_it & S_VERSION) + inode_inc_iversion(inode); + if (sync_it & S_CTIME) + inode->i_ctime = now; + if (sync_it & S_MTIME) + inode->i_mtime = now; + mark_inode_dirty_sync(inode); + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(file_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +int inode_wait(void *word) +{ + schedule(); + return 0; +} +EXPORT_SYMBOL(inode_wait); + +/* + * If we try to find an inode in the inode hash while it is being + * deleted, we have to wait until the filesystem completes its + * deletion before reporting that it isn't found. This function waits + * until the deletion _might_ have completed. Callers are responsible + * to recheck inode state. + * + * It doesn't matter if I_NEW is not set initially, a call to + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. 
+ */ +static void __wait_on_freeing_inode(struct inode *inode) +{ + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + schedule(); + finish_wait(wq, &wait.wait); + spin_lock(&inode_hash_lock); +} + +static __initdata unsigned long ihash_entries; +static int __init set_ihash_entries(char *str) +{ + if (!str) + return 0; + ihash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("ihash_entries=", set_ihash_entries); + +/* + * Initialize the waitqueues and inode hash table. + */ +void __init inode_init_early(void) +{ + int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + HASH_EARLY, + &i_hash_shift, + &i_hash_mask, + 0); + + for (loop = 0; loop < (1 << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + +void __init inode_init(void) +{ + int loop; + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", + sizeof(struct inode), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + init_once); + register_shrinker(&icache_shrinker); + + /* Hash may have been set up in inode_init_early */ + if (!hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + 0, + &i_hash_shift, + &i_hash_mask, + 0); + + for (loop = 0; loop < (1 << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" + " inode %s:%lu\n", mode, inode->i_sb->s_id, + inode->i_ino); +} +EXPORT_SYMBOL(init_special_inode); + +/** + * inode_init_owner - Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + mode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL(inode_init_owner); + +/** + * inode_owner_or_capable - check current task permissions to inode + * @inode: inode being checked + * + * Return true if current either has CAP_FOWNER to the inode, or + * owns the file. + */ +bool inode_owner_or_capable(const struct inode *inode) +{ + struct user_namespace *ns = inode_userns(inode); + + if (current_user_ns() == ns && current_fsuid() == inode->i_uid) + return true; + if (ns_capable(ns, CAP_FOWNER)) + return true; + return false; +} +EXPORT_SYMBOL(inode_owner_or_capable); diff --git a/fs/internal.h b/fs/internal.h new file mode 100644 index 00000000..b29c46e4 --- /dev/null +++ b/fs/internal.h @@ -0,0 +1,137 @@ +/* fs/ internal definitions + * + * Copyright (C) 2006 Red Hat, Inc. 
All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +struct super_block; +struct file_system_type; +struct linux_binprm; +struct path; + +/* + * block_dev.c + */ +#ifdef CONFIG_BLOCK +extern struct super_block *blockdev_superblock; +extern void __init bdev_cache_init(void); + +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + +extern int __sync_blockdev(struct block_device *bdev, int wait); + +#else +static inline void bdev_cache_init(void) +{ +} + +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return 0; +} + +static inline int __sync_blockdev(struct block_device *bdev, int wait) +{ + return 0; +} +#endif + +/* + * char_dev.c + */ +extern void __init chrdev_init(void); + +/* + * exec.c + */ +extern int check_unsafe_exec(struct linux_binprm *); + +/* + * namespace.c + */ +extern int copy_mount_options(const void __user *, unsigned long *); +extern int copy_mount_string(const void __user *, char **); + +extern unsigned int mnt_get_count(struct vfsmount *mnt); +extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); +extern struct vfsmount *lookup_mnt(struct path *); +extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, + struct vfsmount *); +extern void release_mounts(struct list_head *); +extern void umount_tree(struct vfsmount *, int, struct list_head *); +extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int finish_automount(struct vfsmount *, struct path *); + +extern void mnt_make_longterm(struct vfsmount *); +extern void mnt_make_shortterm(struct vfsmount *); + +extern void __init mnt_init(void); + +DECLARE_BRLOCK(vfsmount_lock); + + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); + +/* + * file_table.c + */ +extern void file_sb_list_add(struct file *f, struct super_block *sb); +extern void file_sb_list_del(struct file *f); +extern void mark_files_ro(struct super_block *); +extern struct file *get_empty_filp(void); + +/* + * super.c + */ +extern int do_remount_sb(struct super_block *, int, void *, int); +extern void __put_super(struct super_block *sb); +extern void put_super(struct super_block *sb); +extern struct dentry *mount_fs(struct file_system_type *, + int, const char *, void *); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); + +/* + * inode.c + */ +extern spinlock_t inode_sb_list_lock; + +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + +extern int get_nr_dirty_inodes(void); +extern void evict_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/ioctl.c b/fs/ioctl.c new file mode 100644 index 
00000000..1d9b9fcb --- /dev/null +++ b/fs/ioctl.c @@ -0,0 +1,623 @@ +/* + * linux/fs/ioctl.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +/** + * vfs_ioctl - call filesystem specific ioctl methods + * @filp: open file to invoke ioctl method on + * @cmd: ioctl command to execute + * @arg: command-specific argument for ioctl + * + * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise + * returns -ENOTTY. + * + * Returns 0 on success, -errno on error. + */ +static long vfs_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int error = -ENOTTY; + + if (!filp->f_op || !filp->f_op->unlocked_ioctl) + goto out; + + error = filp->f_op->unlocked_ioctl(filp, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -EINVAL; + out: + return error; +} + +static int ioctl_fibmap(struct file *filp, int __user *p) +{ + struct address_space *mapping = filp->f_mapping; + int res, block; + + /* do we support this mess? */ + if (!mapping->a_ops->bmap) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + res = get_user(block, p); + if (res) + return res; + res = mapping->a_ops->bmap(mapping, block); + return put_user(res, p); +} + +/** + * fiemap_fill_next_extent - Fiemap helper function + * @fieinfo: Fiemap context passed into ->fiemap + * @logical: Extent logical start offset, in bytes + * @phys: Extent physical start offset, in bytes + * @len: Extent length, in bytes + * @flags: FIEMAP_EXTENT flags that describe this extent + * + * Called from file system ->fiemap callback. Will populate extent + * info as passed in via arguments and copy to user memory. On + * success, extent count on fieinfo is incremented. + * + * Returns 0 on success, -errno on error, 1 if this was the last + * extent that will fit in user array. + */ +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) +int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, + u64 phys, u64 len, u32 flags) +{ + struct fiemap_extent extent; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + if (flags & SET_UNKNOWN_FLAGS) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (flags & SET_NO_UNMOUNTED_IO_FLAGS) + flags |= FIEMAP_EXTENT_ENCODED; + if (flags & SET_NOT_ALIGNED_FLAGS) + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + + memset(&extent, 0, sizeof(extent)); + extent.fe_logical = logical; + extent.fe_physical = phys; + extent.fe_length = len; + extent.fe_flags = flags; + + dest += fieinfo->fi_extents_mapped; + if (copy_to_user(dest, &extent, sizeof(extent))) + return -EFAULT; + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + return (flags & FIEMAP_EXTENT_LAST) ? 
1 : 0; +} +EXPORT_SYMBOL(fiemap_fill_next_extent); + +/** + * fiemap_check_flags - check validity of requested flags for fiemap + * @fieinfo: Fiemap context passed into ->fiemap + * @fs_flags: Set of fiemap flags that the file system understands + * + * Called from file system ->fiemap callback. This will compute the + * intersection of valid fiemap flags and those that the fs supports. That + * value is then compared against the user supplied flags. In case of bad user + * flags, the invalid values will be written into the fieinfo structure, and + * -EBADR is returned, which tells ioctl_fiemap() to return those values to + * userspace. For this reason, a return code of -EBADR should be preserved. + * + * Returns 0 on success, -EBADR on bad flags. + */ +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +{ + u32 incompat_flags; + + incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } + return 0; +} +EXPORT_SYMBOL(fiemap_check_flags); + +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +static int ioctl_fiemap(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (!inode->i_op->fiemap) + return -EOPNOTSUPP; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_BLOCK + +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} + +/** + * __generic_block_fiemap - FIEMAP for block based inodes (no locking) + * @inode: the inode to map + * @fieinfo: the fiemap info struct that will be passed back to userspace + * @start: where to start mapping in the inode + * @len: how much space to map + * @get_block: the fs's get_block function + * + * This does FIEMAP for block based inodes. Basically it will just loop + * through get_block until we hit the number of extents we want to map, or we + * go past the end of the file and hit a hole. 
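In practice a block-based filesystem rarely calls the unlocked variant directly; it usually wires its ->fiemap method to the locked wrapper generic_block_fiemap() (defined further down in this file) and supplies its own get_block routine. The names myfs_fiemap() and myfs_get_block() below are placeholders, not part of the patch:

static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len, myfs_get_block);
}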
+ * + * If it is possible to have data blocks beyond a hole past @inode->i_size, then + * please do not use this function, it will stop at the first unmapped block + * beyond i_size. + * + * If you use this function directly, you need to do your own locking. Use + * generic_block_fiemap if you want the locking done for you. + */ + +int __generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, loff_t start, + loff_t len, get_block_t *get_block) +{ + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); + u64 logical = 0, phys = 0, size = 0; + u32 flags = FIEMAP_EXTENT_MERGED; + bool past_eof = false, whole_file = false; + int ret = 0; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + /* + * Either the i_mutex or other appropriate locking needs to be held + * since we expect isize to not change at all through the duration of + * this call. + */ + if (len >= isize) { + whole_file = true; + len = isize; + } + + /* + * Some filesystems can't deal with being asked to map less than + * blocksize, so make sure our len is at least block length. + */ + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); + + do { + /* + * we set b_size to the total size we want so it will map as + * many contiguous blocks as possible at once + */ + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; + + ret = get_block(inode, start_blk, &map_bh, 0); + if (ret) + break; + + /* HOLE */ + if (!buffer_mapped(&map_bh)) { + start_blk++; + + /* + * We want to handle the case where there is an + * allocated block at the front of the file, and then + * nothing but holes up to the end of the file properly, + * to make sure that extent at the front gets properly + * marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && + blk_to_logical(inode, start_blk) >= isize) + past_eof = 1; + + /* + * First hole after going past the EOF, this is our + * last extent + */ + if (past_eof && size) { + flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; + } + + /* if we have holes up to/past EOF then we're done */ + if (start_blk > last_blk || past_eof || ret) + break; + } else { + /* + * We have gone over the length of what we wanted to + * map, and it wasn't the entire file, so add the extent + * we got last time and exit. + * + * This is for the case where say we want to map all the + * way up to the second to the last block in a file, but + * the last block is a hole, making the second to last + * block FIEMAP_EXTENT_LAST. In this case we want to + * see if there is a hole after the second to last block + * so we can mark it properly. If we found data after + * we exceeded the length we were requesting, then we + * are good to go, just add the extent to the fieinfo + * and break + */ + if (start_blk > last_blk && !whole_file) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + break; + } + + /* + * if size != 0 then we know we already have an extent + * to add, so add it. 
+ */ + if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + if (ret) + break; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = FIEMAP_EXTENT_MERGED; + + start_blk += logical_to_blk(inode, size); + + /* + * If we are past the EOF, then we need to make sure as + * soon as we find a hole that the last extent we found + * is marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && logical + size >= isize) + past_eof = true; + } + cond_resched(); + } while (1); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + return ret; +} +EXPORT_SYMBOL(__generic_block_fiemap); + +/** + * generic_block_fiemap - FIEMAP for block based inodes + * @inode: The inode to map + * @fieinfo: The mapping information + * @start: The initial block to map + * @len: The length of the extent to attempt to map + * @get_block: The block mapping function for the fs + * + * Calls __generic_block_fiemap to map the inode, after taking + * the inode's mutex lock. + */ + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + int ret; + mutex_lock(&inode->i_mutex); + ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); + mutex_unlock(&inode->i_mutex); + return ret; +} +EXPORT_SYMBOL(generic_block_fiemap); + +#endif /* CONFIG_BLOCK */ + +/* + * This provides compatibility with legacy XFS pre-allocation ioctls + * which predate the fallocate syscall. + * + * Only the l_start, l_len and l_whence fields of the 'struct space_resv' + * are used here, the rest are ignored. + */ +int ioctl_preallocate(struct file *filp, void __user *argp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct space_resv sr; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + + switch (sr.l_whence) { + case SEEK_SET: + break; + case SEEK_CUR: + sr.l_start += filp->f_pos; + break; + case SEEK_END: + sr.l_start += i_size_read(inode); + break; + default: + return -EINVAL; + } + + return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); +} + +static int file_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int __user *p = (int __user *)arg; + + switch (cmd) { + case FIBMAP: + return ioctl_fibmap(filp, p); + case FIONREAD: + return put_user(i_size_read(inode) - filp->f_pos, p); + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + return ioctl_preallocate(filp, p); + } + + return vfs_ioctl(filp, cmd, arg); +} + +static int ioctl_fionbio(struct file *filp, int __user *argp) +{ + unsigned int flag; + int on, error; + + error = get_user(on, argp); + if (error) + return error; + flag = O_NONBLOCK; +#ifdef __sparc__ + /* SunOS compatibility item. */ + if (O_NONBLOCK != O_NDELAY) + flag |= O_NDELAY; +#endif + spin_lock(&filp->f_lock); + if (on) + filp->f_flags |= flag; + else + filp->f_flags &= ~flag; + spin_unlock(&filp->f_lock); + return error; +} + +static int ioctl_fioasync(unsigned int fd, struct file *filp, + int __user *argp) +{ + unsigned int flag; + int on, error; + + error = get_user(on, argp); + if (error) + return error; + flag = on ? FASYNC : 0; + + /* Did FASYNC state change ? */ + if ((flag ^ filp->f_flags) & FASYNC) { + if (filp->f_op && filp->f_op->fasync) + /* fasync() adjusts filp->f_flags */ + error = filp->f_op->fasync(fd, filp, on); + else + error = -ENOTTY; + } + return error < 0 ?
error : 0; +} + +static int ioctl_fsfreeze(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support freeze feature, return. */ + if (sb->s_op->freeze_fs == NULL) + return -EOPNOTSUPP; + + /* Freeze */ + return freeze_super(sb); +} + +static int ioctl_fsthaw(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* Thaw */ + return thaw_super(sb); +} + +/* + * When you add any new common ioctls to the switches above and below + * please update compat_sys_ioctl() too. + * + * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. + * It's just a simple helper for sys_ioctl and compat_sys_ioctl. + */ +int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + int error = 0; + int __user *argp = (int __user *)arg; + struct inode *inode = filp->f_path.dentry->d_inode; + + switch (cmd) { + case FIOCLEX: + set_close_on_exec(fd, 1); + break; + + case FIONCLEX: + set_close_on_exec(fd, 0); + break; + + case FIONBIO: + error = ioctl_fionbio(filp, argp); + break; + + case FIOASYNC: + error = ioctl_fioasync(fd, filp, argp); + break; + + case FIOQSIZE: + if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + loff_t res = inode_get_bytes(inode); + error = copy_to_user(argp, &res, sizeof(res)) ? + -EFAULT : 0; + } else + error = -ENOTTY; + break; + + case FIFREEZE: + error = ioctl_fsfreeze(filp); + break; + + case FITHAW: + error = ioctl_fsthaw(filp); + break; + + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + return put_user(inode->i_sb->s_blocksize, argp); + + default: + if (S_ISREG(inode->i_mode)) + error = file_ioctl(filp, cmd, arg); + else + error = vfs_ioctl(filp, cmd, arg); + break; + } + return error; +} + +SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + struct file *filp; + int error = -EBADF; + int fput_needed; + + filp = fget_light(fd, &fput_needed); + if (!filp) + goto out; + + error = security_file_ioctl(filp, cmd, arg); + if (error) + goto out_fput; + + error = do_vfs_ioctl(filp, fd, cmd, arg); + out_fput: + fput_light(filp, fput_needed); + out: + return error; +} diff --git a/fs/ioprio.c b/fs/ioprio.c new file mode 100644 index 00000000..7da2a065 --- /dev/null +++ b/fs/ioprio.c @@ -0,0 +1,250 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimic getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest.
+ * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (tcred->uid != cred->euid && + tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + task_lock(task); + do { + ioc = task->io_context; + /* see wmb() in current_io_context() */ + smp_read_barrier_depends(); + if (ioc) + break; + + ioc = alloc_io_context(GFP_ATOMIC, -1); + if (!ioc) { + err = -ENOMEM; + break; + } + task->io_context = ioc; + } while (1); + + if (!err) { + ioc->ioprio = ioprio; + ioc->ioprio_changed = 1; + } + + task_unlock(task); + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; + struct pid *pgrp; + int ret; + + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* fall through, rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + break; + case IOPRIO_CLASS_NONE: + if (data) + return -EINVAL; + break; + default: + return -EINVAL; + } + + ret = -ESRCH; + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current_user(); + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (__task_cred(p)->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + goto free_uid; + } while_each_thread(g, p); +free_uid: + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} + +static int get_task_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + if (p->io_context) + ret = p->io_context->ioprio; +out: + return ret; +} + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); + unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + + if (aclass == IOPRIO_CLASS_NONE) + aclass = IOPRIO_CLASS_BE; + if (bclass == IOPRIO_CLASS_NONE) + bclass = IOPRIO_CLASS_BE; + + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + +SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + struct pid *pgrp; + int ret = -ESRCH; + int tmpio; + + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if 
(!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = get_task_ioprio(p); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current_user(); + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (__task_cred(p)->uid != user->uid) + continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig new file mode 100644 index 00000000..8ab9878e --- /dev/null +++ b/fs/isofs/Kconfig @@ -0,0 +1,39 @@ +config ISO9660_FS + tristate "ISO 9660 CDROM file system support" + help + This is the standard file system used on CD-ROMs. It was previously + known as "High Sierra File System" and is called "hsfs" on other + Unix systems. The so-called Rock-Ridge extensions which allow for + long Unix filenames and symbolic links are also supported by this + driver. If you have a CD-ROM drive and want to do more with it than + just listen to audio CDs and watch its LEDs, say Y (and read + and the CD-ROM-HOWTO, + available from ), thereby + enlarging your kernel by about 27 KB; otherwise say N. + + To compile this file system support as a module, choose M here: the + module will be called isofs. + +config JOLIET + bool "Microsoft Joliet CDROM extensions" + depends on ISO9660_FS + select NLS + help + Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system + which allows for long filenames in unicode format (unicode is the + new 16 bit character code, successor to ASCII, which encodes the + characters of almost all languages of the world; see + for more information). Say Y here if you + want to be able to read Joliet CD-ROMs under Linux. + +config ZISOFS + bool "Transparent decompression extension" + depends on ISO9660_FS + select ZLIB_INFLATE + help + This is a Linux-specific extension to RockRidge which lets you store + data in compressed form on a CD-ROM and have it transparently + decompressed when the CD-ROM is accessed. See + for the tools + necessary to create such a filesystem. Say Y here if you want to be + able to read such compressed CD-ROMs. diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile new file mode 100644 index 00000000..bf162f09 --- /dev/null +++ b/fs/isofs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux isofs filesystem routines. +# + +obj-$(CONFIG_ISO9660_FS) += isofs.o + +isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o +isofs-objs-$(CONFIG_JOLIET) += joliet.o +isofs-objs-$(CONFIG_ZISOFS) += compress.o +isofs-objs := $(isofs-objs-y) diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c new file mode 100644 index 00000000..0b3fa797 --- /dev/null +++ b/fs/isofs/compress.c @@ -0,0 +1,380 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2001 H. 
Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * linux/fs/isofs/compress.c + * + * Transparent decompression of files on an iso9660 filesystem + */ + +#include +#include + +#include +#include + +#include "isofs.h" +#include "zisofs.h" + +/* This should probably be global. */ +static char zisofs_sink_page[PAGE_CACHE_SIZE]; + +/* + * This contains the zlib memory allocation and the mutex for the + * allocation; this avoids failures at block-decompression time. + */ +static void *zisofs_zlib_workspace; +static DEFINE_MUTEX(zisofs_zlib_lock); + +/* + * Read data of @inode from @block_start to @block_end and uncompress + * to one zisofs block. Store the data in the @pages array with @pcount + * entries. Start storing at offset @poffset of the first page. + */ +static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, + loff_t block_end, int pcount, + struct page **pages, unsigned poffset, + int *errp) +{ + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned int bufshift = ISOFS_BUFFER_BITS(inode); + unsigned int bufmask = bufsize - 1; + int i, block_size = block_end - block_start; + z_stream stream = { .total_out = 0, + .avail_in = 0, + .avail_out = 0, }; + int zerr; + int needblocks = (block_size + (block_start & bufmask) + bufmask) + >> bufshift; + int haveblocks; + blkcnt_t blocknum; + struct buffer_head *bhs[needblocks + 1]; + int curbh, curpage; + + if (block_size > deflateBound(1UL << zisofs_block_shift)) { + *errp = -EIO; + return 0; + } + /* Empty block? */ + if (block_size == 0) { + for ( i = 0 ; i < pcount ; i++ ) { + if (!pages[i]) + continue; + memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE); + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + return ((loff_t)pcount) << PAGE_CACHE_SHIFT; + } + + /* Because zlib is not thread-safe, do all the I/O at the top. */ + blocknum = block_start >> bufshift; + memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); + ll_rw_block(READ, haveblocks, bhs); + + curbh = 0; + curpage = 0; + /* + * First block is special since it may be fractional. We also wait for + * it before grabbing the zlib mutex; odds are that the subsequent + * blocks are going to come in in short order so we don't hold the zlib + * mutex longer than necessary. 
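
For illustration only (not part of the patch), the block-count rounding used in zisofs_uncompress_block() above can be checked with a small standalone userspace program; the byte offsets and the 2048-byte buffer size are hypothetical:

#include <stdio.h>

int main(void)
{
	/* Hypothetical compressed extent: bytes 3000..7499 of the image. */
	unsigned long block_start = 3000, block_end = 7500;
	unsigned int bufshift = 11;                     /* 2048-byte buffers */
	unsigned int bufsize = 1u << bufshift, bufmask = bufsize - 1;
	int block_size = block_end - block_start;

	/* Same rounding as above: the leading fraction of the first buffer
	 * plus the extent length, rounded up to whole device buffers. */
	int needblocks = (block_size + (block_start & bufmask) + bufmask) >> bufshift;

	printf("%d compressed bytes starting at %lu span %d device blocks\n",
	       block_size, block_start, needblocks);    /* prints 3 */
	return 0;
}
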
+ */ + + if (!bhs[0]) + goto b_eio; + + wait_on_buffer(bhs[0]); + if (!buffer_uptodate(bhs[0])) { + *errp = -EIO; + goto b_eio; + } + + stream.workspace = zisofs_zlib_workspace; + mutex_lock(&zisofs_zlib_lock); + + zerr = zlib_inflateInit(&stream); + if (zerr != Z_OK) { + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else + *errp = -EIO; + printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", + zerr); + goto z_eio; + } + + while (curpage < pcount && curbh < haveblocks && + zerr != Z_STREAM_END) { + if (!stream.avail_out) { + if (pages[curpage]) { + stream.next_out = page_address(pages[curpage]) + + poffset; + stream.avail_out = PAGE_CACHE_SIZE - poffset; + poffset = 0; + } else { + stream.next_out = (void *)&zisofs_sink_page; + stream.avail_out = PAGE_CACHE_SIZE; + } + } + if (!stream.avail_in) { + wait_on_buffer(bhs[curbh]); + if (!buffer_uptodate(bhs[curbh])) { + *errp = -EIO; + break; + } + stream.next_in = bhs[curbh]->b_data + + (block_start & bufmask); + stream.avail_in = min_t(unsigned, bufsize - + (block_start & bufmask), + block_size); + block_size -= stream.avail_in; + block_start = 0; + } + + while (stream.avail_out && stream.avail_in) { + zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); + if (zerr == Z_BUF_ERROR && stream.avail_in == 0) + break; + if (zerr == Z_STREAM_END) + break; + if (zerr != Z_OK) { + /* EOF, error, or trying to read beyond end of input */ + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else { + printk(KERN_DEBUG + "zisofs: zisofs_inflate returned" + " %d, inode = %lu," + " page idx = %d, bh idx = %d," + " avail_in = %d," + " avail_out = %d\n", + zerr, inode->i_ino, curpage, + curbh, stream.avail_in, + stream.avail_out); + *errp = -EIO; + } + goto inflate_out; + } + } + + if (!stream.avail_out) { + /* This page completed */ + if (pages[curpage]) { + flush_dcache_page(pages[curpage]); + SetPageUptodate(pages[curpage]); + } + curpage++; + } + if (!stream.avail_in) + curbh++; + } +inflate_out: + zlib_inflateEnd(&stream); + +z_eio: + mutex_unlock(&zisofs_zlib_lock); + +b_eio: + for (i = 0; i < haveblocks; i++) + brelse(bhs[i]); + return stream.total_out; +} + +/* + * Uncompress data so that pages[full_page] is fully uptodate and possibly + * fills in other pages if we have data for them. + */ +static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, + struct page **pages) +{ + loff_t start_off, end_off; + loff_t block_start, block_end; + unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int blockptr; + loff_t poffset = 0; + blkcnt_t cstart_block, cend_block; + struct buffer_head *bh; + unsigned int blkbits = ISOFS_BUFFER_BITS(inode); + unsigned int blksize = 1 << blkbits; + int err; + loff_t ret; + + BUG_ON(!pages[full_page]); + + /* + * We want to read at least 'full_page' page. Because we have to + * uncompress the whole compression block anyway, fill the surrounding + * pages with the data we have anyway... 
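
As a rough userspace analogue of the decompression loop above (using ordinary zlib rather than the kernel's zlib_* wrappers, and a made-up input buffer), the same avail_in/avail_out refill pattern looks like this:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const char text[] = "zisofs stores each block as an ordinary zlib stream";
	unsigned char comp[256], out[256];
	uLongf clen = sizeof(comp);
	z_stream strm;
	int zerr;

	/* Produce some compressed input to feed the inflate loop. */
	if (compress(comp, &clen, (const Bytef *)text, sizeof(text)) != Z_OK)
		return 1;

	memset(&strm, 0, sizeof(strm));
	if (inflateInit(&strm) != Z_OK)
		return 1;

	strm.next_in = comp;
	strm.avail_in = clen;
	strm.next_out = out;
	strm.avail_out = sizeof(out);

	/* Same shape as the kernel loop: keep inflating while progress is
	 * possible and both the input and output windows have room. */
	do {
		zerr = inflate(&strm, Z_SYNC_FLUSH);
	} while (zerr == Z_OK && strm.avail_in && strm.avail_out);

	inflateEnd(&strm);
	printf("%s (%lu bytes out)\n", out, strm.total_out);
	return zerr == Z_STREAM_END ? 0 : 1;
}
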
+ */ + start_off = page_offset(pages[full_page]); + end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size); + + cstart_block = start_off >> zisofs_block_shift; + cend_block = (end_off + (1 << zisofs_block_shift) - 1) + >> zisofs_block_shift; + + WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) != + ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK)); + + /* Find the pointer to this specific chunk */ + /* Note: we're not using isonum_731() here because the data is known aligned */ + /* Note: header_size is in 32-bit words (4 bytes) */ + blockptr = (header_size + cstart_block) << 2; + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + block_start = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + + while (cstart_block < cend_block && pcount > 0) { + /* Load end of the compressed block in the file */ + blockptr += 4; + /* Traversed to next block? */ + if (!(blockptr & (blksize - 1))) { + brelse(bh); + + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + } + block_end = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + if (block_start > block_end) { + brelse(bh); + return -EIO; + } + err = 0; + ret = zisofs_uncompress_block(inode, block_start, block_end, + pcount, pages, poffset, &err); + poffset += ret; + pages += poffset >> PAGE_CACHE_SHIFT; + pcount -= poffset >> PAGE_CACHE_SHIFT; + full_page -= poffset >> PAGE_CACHE_SHIFT; + poffset &= ~PAGE_CACHE_MASK; + + if (err) { + brelse(bh); + /* + * Did we finish reading the page we really wanted + * to read? + */ + if (full_page < 0) + return 0; + return err; + } + + block_start = block_end; + cstart_block++; + } + + if (poffset && *pages) { + memset(page_address(*pages) + poffset, 0, + PAGE_CACHE_SIZE - poffset); + flush_dcache_page(*pages); + SetPageUptodate(*pages); + } + return 0; +} + +/* + * When decompressing, we typically obtain more than one page + * per reference. We inject the additional pages into the page + * cache as a form of readahead. + */ +static int zisofs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + int err; + int i, pcount, full_page; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int zisofs_pages_per_cblock = + PAGE_CACHE_SHIFT <= zisofs_block_shift ? + (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0; + struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + pgoff_t index = page->index, end_index; + + end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* + * If this page is wholly outside i_size we just return zero; + * do_generic_file_read() will handle this for us + */ + if (index >= end_index) { + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + if (PAGE_CACHE_SHIFT <= zisofs_block_shift) { + /* We have already been given one page, this is the one + we must do. 
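
For reference, a standalone sketch of the page-to-compressed-block mapping computed above, assuming 4 KiB page cache pages and the common 32 KiB zisofs block size (both values are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;              /* 4 KiB page cache pages   */
	unsigned zisofs_block_shift = 15;      /* 32 KiB compressed blocks */
	unsigned pages_per_cblock = 1u << (zisofs_block_shift - page_shift);

	unsigned long index = 21;              /* hypothetical page index  */
	unsigned full_page = index & (pages_per_cblock - 1);

	printf("page %lu is sub-page %u of %u within its compressed block\n",
	       index, full_page, pages_per_cblock);   /* 5 of 8 */
	return 0;
}
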
*/ + full_page = index & (zisofs_pages_per_cblock - 1); + pcount = min_t(int, zisofs_pages_per_cblock, + end_index - (index & ~(zisofs_pages_per_cblock - 1))); + index -= full_page; + } else { + full_page = 0; + pcount = 1; + } + pages[full_page] = page; + + for (i = 0; i < pcount; i++, index++) { + if (i != full_page) + pages[i] = grab_cache_page_nowait(mapping, index); + if (pages[i]) { + ClearPageError(pages[i]); + kmap(pages[i]); + } + } + + err = zisofs_fill_pages(inode, full_page, pcount, pages); + + /* Release any residual pages, do not SetPageUptodate */ + for (i = 0; i < pcount; i++) { + if (pages[i]) { + flush_dcache_page(pages[i]); + if (i == full_page && err) + SetPageError(pages[i]); + kunmap(pages[i]); + unlock_page(pages[i]); + if (i != full_page) + page_cache_release(pages[i]); + } + } + + /* At this point, err contains 0 or -EIO depending on the "critical" page */ + return err; +} + +const struct address_space_operations zisofs_aops = { + .readpage = zisofs_readpage, + /* No sync_page operation supported? */ + /* No bmap operation supported */ +}; + +int __init zisofs_init(void) +{ + zisofs_zlib_workspace = vmalloc(zlib_inflate_workspacesize()); + if ( !zisofs_zlib_workspace ) + return -ENOMEM; + + return 0; +} + +void zisofs_cleanup(void) +{ + vfree(zisofs_zlib_workspace); +} diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c new file mode 100644 index 00000000..0542b6ee --- /dev/null +++ b/fs/isofs/dir.c @@ -0,0 +1,288 @@ +/* + * linux/fs/isofs/dir.c + * + * (C) 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * Steve Beynon : Missing last directory entries fixed + * (stephen@askone.demon.co.uk) : 21st June 1996 + * + * isofs directory handling functions + */ +#include +#include "isofs.h" + +int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) +{ + char * old = de->name; + int len = de->name_len[0]; + int i; + + for (i = 0; i < len; i++) { + unsigned char c = old[i]; + if (!c) + break; + + if (c >= 'A' && c <= 'Z') + c |= 0x20; /* lower case */ + + /* Drop trailing '.;1' (ISO 9660:1988 7.5.1 requires period) */ + if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1') + break; + + /* Drop trailing ';1' */ + if (c == ';' && i == len - 2 && old[i + 1] == '1') + break; + + /* Convert remaining ';' to '.' */ + /* Also '/' to '.' (broken Acorn-generated ISO9660 images) */ + if (c == ';' || c == '/') + c = '.'; + + new[i] = c; + } + return i; +} + +/* Acorn extensions written by Matthew Wilcox 1998 */ +int get_acorn_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + int std; + unsigned char *chr; + int retnamlen = isofs_name_translate(de, retname, inode); + + if (retnamlen == 0) + return 0; + std = sizeof(struct iso_directory_record) + de->name_len[0]; + if (std & 1) + std++; + if ((*((unsigned char *) de) - std) != 32) + return retnamlen; + chr = ((unsigned char *) de) + std; + if (strncmp(chr, "ARCHIMEDES", 10)) + return retnamlen; + if ((*retname == '_') && ((chr[19] & 1) == 1)) + *retname = '!'; + if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) + && ((chr[12] & 0xf0) == 0xf0)) { + retname[retnamlen] = ','; + sprintf(retname+retnamlen+1, "%3.3x", + ((chr[12] & 0xf) << 8) | chr[11]); + retnamlen += 4; + } + return retnamlen; +} + +/* + * This should _really_ be cleaned up some day.. 
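
The name translation in isofs_name_translate() above can be restated as a tiny userspace program (illustrative only): an ISO-level name such as "README.TXT;1" comes out as "readme.txt".

#include <stdio.h>
#include <string.h>

static int translate(const char *old, int len, char *out)
{
	int i;

	for (i = 0; i < len; i++) {
		char c = old[i];

		if (!c)
			break;
		if (c >= 'A' && c <= 'Z')
			c |= 0x20;                         /* lower case */
		if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1')
			break;                             /* drop trailing ".;1" */
		if (c == ';' && i == len - 2 && old[i + 1] == '1')
			break;                             /* drop trailing ";1"  */
		if (c == ';' || c == '/')
			c = '.';                           /* remaining ';' and '/' become '.' */
		out[i] = c;
	}
	out[i] = '\0';
	return i;
}

int main(void)
{
	char buf[64];

	translate("README.TXT;1", strlen("README.TXT;1"), buf);
	printf("%s\n", buf);                               /* readme.txt */
	return 0;
}
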
+ */ +static int do_isofs_readdir(struct inode *inode, struct file *filp, + void *dirent, filldir_t filldir, + char *tmpname, struct iso_directory_record *tmpde) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned char bufbits = ISOFS_BUFFER_BITS(inode); + unsigned long block, offset, block_saved, offset_saved; + unsigned long inode_number = 0; /* Quiet GCC */ + struct buffer_head *bh = NULL; + int len; + int map; + int first_de = 1; + char *p = NULL; /* Quiet GCC */ + struct iso_directory_record *de; + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + + offset = filp->f_pos & (bufsize - 1); + block = filp->f_pos >> bufbits; + + while (filp->f_pos < inode->i_size) { + int de_len; + + if (!bh) { + bh = isofs_bread(inode, block); + if (!bh) + return 0; + } + + de = (struct iso_directory_record *) (bh->b_data + offset); + + de_len = *(unsigned char *) de; + + /* + * If the length byte is zero, we should move on to the next + * CDROM sector. If we are at the end of the directory, we + * kick out of the while loop. + */ + + if (de_len == 0) { + brelse(bh); + bh = NULL; + filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = filp->f_pos >> bufbits; + offset = 0; + continue; + } + + block_saved = block; + offset_saved = offset; + offset += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = isofs_bread(inode, block); + if (!bh) + return 0; + memcpy((void *) tmpde + slop, bh->b_data, offset); + } + de = tmpde; + } + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < de->name_len[0] + + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + inode->i_ino); + return -EIO; + } + + if (first_de) { + isofs_normalize_block_and_offset(de, + &block_saved, + &offset_saved); + inode_number = isofs_get_ino(block_saved, + offset_saved, bufbits); + } + + if (de->flags[-sbi->s_high_sierra] & 0x80) { + first_de = 0; + filp->f_pos += de_len; + continue; + } + first_de = 1; + + /* Handle the case of the '.' directory */ + if (de->name_len[0] == 1 && de->name[0] == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + break; + filp->f_pos += de_len; + continue; + } + + len = 0; + + /* Handle the case of the '..' directory */ + if (de->name_len[0] == 1 && de->name[0] == 1) { + inode_number = parent_ino(filp->f_path.dentry); + if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) + break; + filp->f_pos += de_len; + continue; + } + + /* Handle everything else. Do name translation if there + is no Rock Ridge NM field. 
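
A quick numeric sketch of the split-entry handling above (buffer size and record length are hypothetical): a directory record that starts near the end of a block is reassembled from two pieces, the bytes left in the current block plus the remainder at the start of the next one. The kernel computes the same value after advancing offset; this restatement uses the pre-advance offset.

#include <stdio.h>

int main(void)
{
	unsigned bufsize = 2048;               /* ISOFS_BUFFER_SIZE            */
	unsigned offset = 2000, de_len = 80;   /* hypothetical record placement */
	unsigned end = offset + de_len;

	if (end >= bufsize) {
		unsigned slop = bufsize - offset;       /* bytes still in this block */
		unsigned rest = end & (bufsize - 1);    /* bytes in the next block   */
		printf("copy %u bytes now, %u from the next block\n", slop, rest);
	}
	return 0;
}
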
*/ + + /* + * Do not report hidden files if so instructed, or associated + * files unless instructed to do so + */ + if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || + (!sbi->s_showassoc && + (de->flags[-sbi->s_high_sierra] & 4))) { + filp->f_pos += de_len; + continue; + } + + map = 1; + if (sbi->s_rock) { + len = get_rock_ridge_filename(de, tmpname, inode); + if (len != 0) { /* may be -1 */ + p = tmpname; + map = 0; + } + } + if (map) { +#ifdef CONFIG_JOLIET + if (sbi->s_joliet_level) { + len = get_joliet_filename(de, tmpname, inode); + p = tmpname; + } else +#endif + if (sbi->s_mapping == 'a') { + len = get_acorn_filename(de, tmpname, inode); + p = tmpname; + } else + if (sbi->s_mapping == 'n') { + len = isofs_name_translate(de, tmpname, inode); + p = tmpname; + } else { + p = de->name; + len = de->name_len[0]; + } + } + if (len > 0) { + if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) + break; + } + filp->f_pos += de_len; + + continue; + } + if (bh) + brelse(bh); + return 0; +} + +/* + * Handle allocation of temporary space for name translation and + * handling split directory entries.. The real work is done by + * "do_isofs_readdir()". + */ +static int isofs_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + int result; + char *tmpname; + struct iso_directory_record *tmpde; + struct inode *inode = filp->f_path.dentry->d_inode; + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + + tmpname = (char *)__get_free_page(GFP_KERNEL); + if (tmpname == NULL) + return -ENOMEM; + + mutex_lock(&sbi->s_mutex); + tmpde = (struct iso_directory_record *) (tmpname+1024); + + result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); + + free_page((unsigned long) tmpname); + mutex_unlock(&sbi->s_mutex); + return result; +} + +const struct file_operations isofs_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = isofs_readdir, +}; + +/* + * directories can handle most operations... + */ +const struct inode_operations isofs_dir_inode_operations = +{ + .lookup = isofs_lookup, +}; + + diff --git a/fs/isofs/export.c b/fs/isofs/export.c new file mode 100644 index 00000000..dd4687ff --- /dev/null +++ b/fs/isofs/export.c @@ -0,0 +1,196 @@ +/* + * fs/isofs/export.c + * + * (C) 2004 Paul Serice - The new inode scheme requires switching + * from iget() to iget5_locked() which means + * the NFS export operations have to be hand + * coded because the default routines rely on + * iget(). + * + * The following files are helpful: + * + * Documentation/filesystems/nfs/Exporting + * fs/exportfs/expfs.c. + */ + +#include "isofs.h" + +static struct dentry * +isofs_export_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + __u32 generation) +{ + struct inode *inode; + + if (block == 0) + return ERR_PTR(-ESTALE); + inode = isofs_iget(sb, block, offset); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +/* This function is surprisingly simple. The trick is understanding + * that "child" is always a directory. So, to find its parent, you + * simply need to find its ".." entry, normalize its block and offset, + * and return the underlying inode. See the comments for + * isofs_normalize_block_and_offset(). 
*/ +static struct dentry *isofs_export_get_parent(struct dentry *child) +{ + unsigned long parent_block = 0; + unsigned long parent_offset = 0; + struct inode *child_inode = child->d_inode; + struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); + struct iso_directory_record *de = NULL; + struct buffer_head * bh = NULL; + struct dentry *rv = NULL; + + /* "child" must always be a directory. */ + if (!S_ISDIR(child_inode->i_mode)) { + printk(KERN_ERR "isofs: isofs_export_get_parent(): " + "child is not a directory!\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* It is an invariant that the directory offset is zero. If + * it is not zero, it means the directory failed to be + * normalized for some reason. */ + if (e_child_inode->i_iget5_offset != 0) { + printk(KERN_ERR "isofs: isofs_export_get_parent(): " + "child directory not normalized!\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* The child inode has been normalized such that its + * i_iget5_block value points to the "." entry. Fortunately, + * the ".." entry is located in the same block. */ + parent_block = e_child_inode->i_iget5_block; + + /* Get the block in question. */ + bh = sb_bread(child_inode->i_sb, parent_block); + if (bh == NULL) { + rv = ERR_PTR(-EACCES); + goto out; + } + + /* This is the "." entry. */ + de = (struct iso_directory_record*)bh->b_data; + + /* The ".." entry is always the second entry. */ + parent_offset = (unsigned long)isonum_711(de->length); + de = (struct iso_directory_record*)(bh->b_data + parent_offset); + + /* Verify it is in fact the ".." entry. */ + if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { + printk(KERN_ERR "isofs: Unable to find the \"..\" " + "directory for NFS.\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* Normalize */ + isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); + + rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, + parent_offset)); + out: + if (bh) + brelse(bh); + return rv; +} + +static int +isofs_export_encode_fh(struct dentry *dentry, + __u32 *fh32, + int *max_len, + int connectable) +{ + struct inode * inode = dentry->d_inode; + struct iso_inode_info * ei = ISOFS_I(inode); + int len = *max_len; + int type = 1; + __u16 *fh16 = (__u16*)fh32; + + /* + * WARNING: max_len is 5 for NFSv2. Because of this + * limitation, we use the lower 16 bits of fh32[1] to hold the + * offset of the inode and the upper 16 bits of fh32[1] to + * hold the offset of the parent. 
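
To visualize the packing described in the WARNING above, here is a userspace sketch that fills the five 32-bit handle words the same way; the block, offset and generation values are hypothetical, and the 16-bit aliasing shown corresponds to a little-endian host.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t fh32[5] = { 0 };
	uint16_t *fh16 = (uint16_t *)fh32;   /* same aliasing trick as above */
	int i;

	fh32[0] = 1234;    /* i_iget5_block of the inode          */
	fh16[2] = 68;      /* inode offset, low half of fh32[1]   */
	fh16[3] = 0;       /* parent offset, high half of fh32[1] */
	fh32[2] = 1;       /* i_generation                        */
	fh32[3] = 1200;    /* parent's i_iget5_block              */
	fh32[4] = 1;       /* parent's i_generation               */

	for (i = 0; i < 5; i++)
		printf("fh32[%d] = 0x%08x\n", i, (unsigned)fh32[i]);
	return 0;
}
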
+ */ + if (connectable && (len < 5)) { + *max_len = 5; + return 255; + } else if (len < 3) { + *max_len = 3; + return 255; + } + + len = 3; + fh32[0] = ei->i_iget5_block; + fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ + fh32[2] = inode->i_generation; + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + struct iso_inode_info *eparent; + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + eparent = ISOFS_I(parent); + fh32[3] = eparent->i_iget5_block; + fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ + fh32[4] = parent->i_generation; + spin_unlock(&dentry->d_lock); + len = 5; + type = 2; + } + *max_len = len; + return type; +} + +struct isofs_fid { + u32 block; + u16 offset; + u16 parent_offset; + u32 generation; + u32 parent_block; + u32 parent_generation; +}; + +static struct dentry *isofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct isofs_fid *ifid = (struct isofs_fid *)fid; + + if (fh_len < 3 || fh_type > 2) + return NULL; + + return isofs_export_iget(sb, ifid->block, ifid->offset, + ifid->generation); +} + +static struct dentry *isofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct isofs_fid *ifid = (struct isofs_fid *)fid; + + if (fh_type != 2) + return NULL; + + return isofs_export_iget(sb, + fh_len > 2 ? ifid->parent_block : 0, + ifid->parent_offset, + fh_len > 4 ? ifid->parent_generation : 0); +} + +const struct export_operations isofs_export_ops = { + .encode_fh = isofs_export_encode_fh, + .fh_to_dentry = isofs_fh_to_dentry, + .fh_to_parent = isofs_fh_to_parent, + .get_parent = isofs_export_get_parent, +}; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c new file mode 100644 index 00000000..b3cc8586 --- /dev/null +++ b/fs/isofs/inode.c @@ -0,0 +1,1579 @@ +/* + * linux/fs/isofs/inode.c + * + * (C) 1991 Linus Torvalds - minix filesystem + * 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. + * 1994 Eberhard Mönkeberg - multi session handling. + * 1995 Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs. 
+ * 1997 Gordon Chaffee - Joliet CDs + * 1998 Eric Lammerts - ISO 9660 Level 3 + * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB + * 2004 Paul Serice - NFS Export Operations + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "isofs.h" +#include "zisofs.h" + +#define BEQUIET + +static int isofs_hashi(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_dentry_cmpi(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +#ifdef CONFIG_JOLIET +static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_dentry_cmpi_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +#endif + +static void isofs_put_super(struct super_block *sb) +{ + struct isofs_sb_info *sbi = ISOFS_SB(sb); + +#ifdef CONFIG_JOLIET + unload_nls(sbi->s_nls_iocharset); +#endif + + kfree(sbi); + sb->s_fs_info = NULL; + return; +} + +static int isofs_read_inode(struct inode *); +static int isofs_statfs (struct dentry *, struct kstatfs *); + +static struct kmem_cache *isofs_inode_cachep; + +static struct inode *isofs_alloc_inode(struct super_block *sb) +{ + struct iso_inode_info *ei; + ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void isofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); +} + +static void isofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, isofs_i_callback); +} + +static void init_once(void *foo) +{ + struct iso_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", + sizeof(struct iso_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (isofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(isofs_inode_cachep); +} + +static int isofs_remount(struct super_block *sb, int *flags, char *data) +{ + /* we probably want a lot more here */ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations isofs_sops = { + .alloc_inode = isofs_alloc_inode, + .destroy_inode = isofs_destroy_inode, + .put_super = isofs_put_super, + .statfs = isofs_statfs, + .remount_fs = isofs_remount, + .show_options = generic_show_options, +}; + + +static const struct dentry_operations isofs_dentry_ops[] = { + { + .d_hash = isofs_hash, + .d_compare = 
isofs_dentry_cmp, + }, + { + .d_hash = isofs_hashi, + .d_compare = isofs_dentry_cmpi, + }, +#ifdef CONFIG_JOLIET + { + .d_hash = isofs_hash_ms, + .d_compare = isofs_dentry_cmp_ms, + }, + { + .d_hash = isofs_hashi_ms, + .d_compare = isofs_dentry_cmpi_ms, + }, +#endif +}; + +struct iso9660_options{ + unsigned int rock:1; + unsigned int joliet:1; + unsigned int cruft:1; + unsigned int hide:1; + unsigned int showassoc:1; + unsigned int nocompress:1; + unsigned int overriderockperm:1; + unsigned int uid_set:1; + unsigned int gid_set:1; + unsigned int utf8:1; + unsigned char map; + unsigned char check; + unsigned int blocksize; + mode_t fmode; + mode_t dmode; + gid_t gid; + uid_t uid; + char *iocharset; + /* LVE */ + s32 session; + s32 sbsector; +}; + +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) +{ + const char *name; + int len; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + qstr->hash = full_name_hash(name, len); + + return 0; +} + +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) +{ + const char *name; + int len; + char c; + unsigned long hash; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + hash = init_name_hash(); + while (len--) { + c = tolower(*name++); + hash = partial_name_hash(c, hash); + } + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare of two isofs names. + */ +static int isofs_dentry_cmp_common( + unsigned int len, const char *str, + const struct qstr *name, int ms, int ci) +{ + int alen, blen; + + /* A filename cannot end in '.' 
or we treat it like it has none */ + alen = name->len; + blen = len; + if (ms) { + while (alen && name->name[alen-1] == '.') + alen--; + while (blen && str[blen-1] == '.') + blen--; + } + if (alen == blen) { + if (ci) { + if (strnicmp(name->name, str, alen) == 0) + return 0; + } else { + if (strncmp(name->name, str, alen) == 0) + return 0; + } + } + return 1; +} + +static int +isofs_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hash_common(dentry, qstr, 0); +} + +static int +isofs_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hashi_common(dentry, qstr, 0); +} + +static int +isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 0, 0); +} + +static int +isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 0, 1); +} + +#ifdef CONFIG_JOLIET +static int +isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hash_common(dentry, qstr, 1); +} + +static int +isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hashi_common(dentry, qstr, 1); +} + +static int +isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 1, 0); +} + +static int +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 1, 1); +} +#endif + +enum { + Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, + Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, + Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, + Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, +}; + +static const match_table_t tokens = { + {Opt_norock, "norock"}, + {Opt_nojoliet, "nojoliet"}, + {Opt_unhide, "unhide"}, + {Opt_hide, "hide"}, + {Opt_showassoc, "showassoc"}, + {Opt_cruft, "cruft"}, + {Opt_utf8, "utf8"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_map_a, "map=acorn"}, + {Opt_map_a, "map=a"}, + {Opt_map_n, "map=normal"}, + {Opt_map_n, "map=n"}, + {Opt_map_o, "map=off"}, + {Opt_map_o, "map=o"}, + {Opt_session, "session=%u"}, + {Opt_sb, "sbsector=%u"}, + {Opt_check_r, "check=relaxed"}, + {Opt_check_r, "check=r"}, + {Opt_check_s, "check=strict"}, + {Opt_check_s, "check=s"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%u"}, + {Opt_dmode, "dmode=%u"}, + {Opt_overriderockperm, "overriderockperm"}, + {Opt_block, "block=%u"}, + {Opt_ignore, "conv=binary"}, + {Opt_ignore, "conv=b"}, + {Opt_ignore, "conv=text"}, + {Opt_ignore, "conv=t"}, + {Opt_ignore, "conv=mtext"}, + {Opt_ignore, "conv=m"}, + {Opt_ignore, "conv=auto"}, + {Opt_ignore, "conv=a"}, + {Opt_nocompress, "nocompress"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct iso9660_options *popt) +{ + char *p; 
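	/*
	 * For illustration only (not part of the original patch): a mount
	 * command that exercises several of the options recognized above
	 * might look like
	 *
	 *	mount -t iso9660 -o norock,map=normal,check=relaxed,uid=1000 \
	 *		/dev/sr0 /mnt/cdrom
	 *
	 * Note that session numbers above 99 and block sizes other than
	 * 512, 1024 and 2048 are rejected by the checks below.
	 */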
+ int option; + + popt->map = 'n'; + popt->rock = 1; + popt->joliet = 1; + popt->cruft = 0; + popt->hide = 0; + popt->showassoc = 0; + popt->check = 'u'; /* unset */ + popt->nocompress = 0; + popt->blocksize = 1024; + popt->fmode = popt->dmode = ISOFS_INVALID_MODE; + popt->uid_set = 0; + popt->gid_set = 0; + popt->gid = 0; + popt->uid = 0; + popt->iocharset = NULL; + popt->utf8 = 0; + popt->overriderockperm = 0; + popt->session=-1; + popt->sbsector=-1; + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + unsigned n; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_norock: + popt->rock = 0; + break; + case Opt_nojoliet: + popt->joliet = 0; + break; + case Opt_hide: + popt->hide = 1; + break; + case Opt_unhide: + case Opt_showassoc: + popt->showassoc = 1; + break; + case Opt_cruft: + popt->cruft = 1; + break; + case Opt_utf8: + popt->utf8 = 1; + break; +#ifdef CONFIG_JOLIET + case Opt_iocharset: + popt->iocharset = match_strdup(&args[0]); + break; +#endif + case Opt_map_a: + popt->map = 'a'; + break; + case Opt_map_o: + popt->map = 'o'; + break; + case Opt_map_n: + popt->map = 'n'; + break; + case Opt_session: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n > 99) + return 0; + popt->session = n + 1; + break; + case Opt_sb: + if (match_int(&args[0], &option)) + return 0; + popt->sbsector = option; + break; + case Opt_check_r: + popt->check = 'r'; + break; + case Opt_check_s: + popt->check = 's'; + break; + case Opt_ignore: + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + popt->uid = option; + popt->uid_set = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + popt->gid = option; + popt->gid_set = 1; + break; + case Opt_mode: + if (match_int(&args[0], &option)) + return 0; + popt->fmode = option; + break; + case Opt_dmode: + if (match_int(&args[0], &option)) + return 0; + popt->dmode = option; + break; + case Opt_overriderockperm: + popt->overriderockperm = 1; + break; + case Opt_block: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n != 512 && n != 1024 && n != 2048) + return 0; + popt->blocksize = n; + break; + case Opt_nocompress: + popt->nocompress = 1; + break; + default: + return 0; + } + } + return 1; +} + +/* + * look if the driver can tell the multi session redirection value + * + * don't change this if you don't know what you do, please! + * Multisession is legal only with XA disks. + * A non-XA disk with more than one volume descriptor may do it right, but + * usually is written in a nowhere standardized "multi-partition" manner. + * Multisession uses absolute addressing (solely the first frame of the whole + * track is #0), multi-partition uses relative addressing (each first frame of + * each track is #0), and a track is not a session. + * + * A broken CDwriter software or drive firmware does not set new standards, + * at least not if conflicting with the existing ones. 
+ * + * emoenke@gwdg.de + */ +#define WE_OBEY_THE_WRITTEN_STANDARDS 1 + +static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) +{ + struct cdrom_multisession ms_info; + unsigned int vol_desc_start; + struct block_device *bdev = sb->s_bdev; + int i; + + vol_desc_start=0; + ms_info.addr_format=CDROM_LBA; + if(session >= 0 && session <= 99) { + struct cdrom_tocentry Te; + Te.cdte_track=session; + Te.cdte_format=CDROM_LBA; + i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); + if (!i) { + printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", + session, Te.cdte_addr.lba, + Te.cdte_ctrl&CDROM_DATA_TRACK); + if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) + return Te.cdte_addr.lba; + } + + printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); + } + i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); + if (session > 0) + printk(KERN_ERR "ISOFS: Invalid session number\n"); +#if 0 + printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i); + if (i==0) { + printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); + printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); + } +#endif + if (i==0) +#if WE_OBEY_THE_WRITTEN_STANDARDS + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ +#endif + vol_desc_start=ms_info.addr.lba; + return vol_desc_start; +} + +/* + * Check if root directory is empty (has less than 3 files). + * + * Used to detect broken CDs where ISO root directory is empty but Joliet root + * directory is OK. If such CD has Rock Ridge extensions, they will be disabled + * (and Joliet used instead) or else no files would be visible. + */ +static bool rootdir_empty(struct super_block *sb, unsigned long block) +{ + int offset = 0, files = 0, de_len; + struct iso_directory_record *de; + struct buffer_head *bh; + + bh = sb_bread(sb, block); + if (!bh) + return true; + while (files < 3) { + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + if (de_len == 0) + break; + files++; + offset += de_len; + } + brelse(bh); + return files < 3; +} + +/* + * Initialize the superblock and read the root inode. + * + * Note: a check_disk_change() has been done immediately prior + * to this call, so we don't need to check again. + */ +static int isofs_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh = NULL, *pri_bh = NULL; + struct hs_primary_descriptor *h_pri = NULL; + struct iso_primary_descriptor *pri = NULL; + struct iso_supplementary_descriptor *sec = NULL; + struct iso_directory_record *rootp; + struct inode *inode; + struct iso9660_options opt; + struct isofs_sb_info *sbi; + unsigned long first_data_zone; + int joliet_level = 0; + int iso_blknum, block; + int orig_zonesize; + int table, error = -EINVAL; + unsigned int vol_desc_start; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + + if (!parse_options((char *)data, &opt)) + goto out_freesbi; + + /* + * First of all, get the hardware blocksize for this device. + * If we don't know what it is, or the hardware blocksize is + * larger than the blocksize the user specified, then use + * that value. + */ + /* + * What if bugger tells us to go beyond page size? + */ + opt.blocksize = sb_min_blocksize(s, opt.blocksize); + + sbi->s_high_sierra = 0; /* default is iso9660 */ + + vol_desc_start = (opt.sbsector != -1) ? 
+ opt.sbsector : isofs_get_last_session(s,opt.session); + + for (iso_blknum = vol_desc_start+16; + iso_blknum < vol_desc_start+100; iso_blknum++) { + struct hs_volume_descriptor *hdp; + struct iso_volume_descriptor *vdp; + + block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); + if (!(bh = sb_bread(s, block))) + goto out_no_read; + + vdp = (struct iso_volume_descriptor *)bh->b_data; + hdp = (struct hs_volume_descriptor *)bh->b_data; + + /* + * Due to the overlapping physical location of the descriptors, + * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure + * proper identification in this case, we first check for ISO. + */ + if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { + if (isonum_711(vdp->type) == ISO_VD_END) + break; + if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { + if (pri == NULL) { + pri = (struct iso_primary_descriptor *)vdp; + /* Save the buffer in case we need it ... */ + pri_bh = bh; + bh = NULL; + } + } +#ifdef CONFIG_JOLIET + else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { + sec = (struct iso_supplementary_descriptor *)vdp; + if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { + if (opt.joliet) { + if (sec->escape[2] == 0x40) + joliet_level = 1; + else if (sec->escape[2] == 0x43) + joliet_level = 2; + else if (sec->escape[2] == 0x45) + joliet_level = 3; + + printk(KERN_DEBUG "ISO 9660 Extensions: " + "Microsoft Joliet Level %d\n", + joliet_level); + } + goto root_found; + } else { + /* Unknown supplementary volume descriptor */ + sec = NULL; + } + } +#endif + } else { + if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { + if (isonum_711(hdp->type) != ISO_VD_PRIMARY) + goto out_freebh; + + sbi->s_high_sierra = 1; + opt.rock = 0; + h_pri = (struct hs_primary_descriptor *)vdp; + goto root_found; + } + } + + /* Just skip any volume descriptors we don't recognize */ + + brelse(bh); + bh = NULL; + } + /* + * If we fall through, either no volume descriptor was found, + * or else we passed a primary descriptor looking for others. + */ + if (!pri) + goto out_unknown_format; + brelse(bh); + bh = pri_bh; + pri_bh = NULL; + +root_found: + + if (joliet_level && (pri == NULL || !opt.rock)) { + /* This is the case of Joliet with the norock mount flag. + * A disc with both Joliet and Rock Ridge is handled later + */ + pri = (struct iso_primary_descriptor *) sec; + } + + if(sbi->s_high_sierra){ + rootp = (struct iso_directory_record *) h_pri->root_directory_record; + sbi->s_nzones = isonum_733(h_pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); + sbi->s_max_size = isonum_733(h_pri->volume_space_size); + } else { + if (!pri) + goto out_freebh; + rootp = (struct iso_directory_record *) pri->root_directory_record; + sbi->s_nzones = isonum_733(pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(pri->logical_block_size); + sbi->s_max_size = isonum_733(pri->volume_space_size); + } + + sbi->s_ninodes = 0; /* No way to figure this out easily */ + + orig_zonesize = sbi->s_log_zone_size; + /* + * If the zone size is smaller than the hardware sector size, + * this is a fatal error. This would occur if the disc drive + * had sectors that were 2048 bytes, but the filesystem had + * blocks that were 512 bytes (which should only very rarely + * happen.) 
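
As a quick check of the block arithmetic used in the descriptor scan above (illustrative numbers only): with 2048-byte ISO 9660 logical blocks on a superblock opened with 512-byte blocks, logical block 16, the first possible volume descriptor, lands at device block 64.

#include <stdio.h>

int main(void)
{
	unsigned iso_block_bits = 11;     /* ISOFS_BLOCK_BITS: 2048-byte blocks */
	unsigned sb_blocksize_bits = 9;   /* device opened with 512-byte blocks */
	unsigned long iso_blknum = 16;    /* first volume descriptor            */
	unsigned long block = iso_blknum << (iso_block_bits - sb_blocksize_bits);

	printf("ISO block %lu -> device block %lu\n", iso_blknum, block);
	return 0;
}
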
+ */ + if (orig_zonesize < opt.blocksize) + goto out_bad_size; + + /* RDE: convert log zone size to bit shift */ + switch (sbi->s_log_zone_size) { + case 512: sbi->s_log_zone_size = 9; break; + case 1024: sbi->s_log_zone_size = 10; break; + case 2048: sbi->s_log_zone_size = 11; break; + + default: + goto out_bad_zone_size; + } + + s->s_magic = ISOFS_SUPER_MAGIC; + + /* + * With multi-extent files, file size is only limited by the maximum + * size of a file system, which is 8 TB. + */ + s->s_maxbytes = 0x80000000000LL; + + /* + * The CDROM is read-only, has no nodes (devices) on it, and since + * all of the files appear to be owned by root, we really do not want + * to allow suid. (suid or devices will not show up unless we have + * Rock Ridge extensions) + */ + + s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; + + /* Set this for reference. Its not currently used except on write + which we don't have .. */ + + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); + sbi->s_firstdatazone = first_data_zone; +#ifndef BEQUIET + printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", + sbi->s_max_size, 1UL << sbi->s_log_zone_size); + printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); + if(sbi->s_high_sierra) + printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); +#endif + + /* + * If the Joliet level is set, we _may_ decide to use the + * secondary descriptor, but can't be sure until after we + * read the root inode. But before reading the root inode + * we may need to change the device blocksize, and would + * rather release the old buffer first. So, we cache the + * first_data_zone value from the secondary descriptor. + */ + if (joliet_level) { + pri = (struct iso_primary_descriptor *) sec; + rootp = (struct iso_directory_record *) + pri->root_directory_record; + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); + } + + /* + * We're all done using the volume descriptor, and may need + * to change the device blocksize, so release the buffer now. + */ + brelse(pri_bh); + brelse(bh); + + /* + * Force the blocksize to 512 for 512 byte sectors. The file + * read primitives really get it wrong in a bad way if we don't + * do this. + * + * Note - we should never be setting the blocksize to something + * less than the hardware sector size for the device. If we + * do, we would end up having to read larger buffers and split + * out portions to satisfy requests. + * + * Note2- the idea here is that we want to deal with the optimal + * zonesize in the filesystem. If we have it set to something less, + * then we have horrible problems with trying to piece together + * bits of adjacent blocks in order to properly read directory + * entries. By forcing the blocksize in this way, we ensure + * that we will never be required to do this. + */ + sb_set_blocksize(s, orig_zonesize); + + sbi->s_nls_iocharset = NULL; + +#ifdef CONFIG_JOLIET + if (joliet_level && opt.utf8 == 0) { + char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; + sbi->s_nls_iocharset = load_nls(p); + if (! sbi->s_nls_iocharset) { + /* Fail only if explicit charset specified */ + if (opt.iocharset) + goto out_freesbi; + sbi->s_nls_iocharset = load_nls_default(); + } + } +#endif + s->s_op = &isofs_sops; + s->s_export_op = &isofs_export_ops; + sbi->s_mapping = opt.map; + sbi->s_rock = (opt.rock ? 
2 : 0); + sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ + sbi->s_cruft = opt.cruft; + sbi->s_hide = opt.hide; + sbi->s_showassoc = opt.showassoc; + sbi->s_uid = opt.uid; + sbi->s_gid = opt.gid; + sbi->s_uid_set = opt.uid_set; + sbi->s_gid_set = opt.gid_set; + sbi->s_utf8 = opt.utf8; + sbi->s_nocompress = opt.nocompress; + sbi->s_overriderockperm = opt.overriderockperm; + mutex_init(&sbi->s_mutex); + /* + * It would be incredibly stupid to allow people to mark every file + * on the disk as suid, so we merely allow them to set the default + * permissions. + */ + if (opt.fmode != ISOFS_INVALID_MODE) + sbi->s_fmode = opt.fmode & 0777; + else + sbi->s_fmode = ISOFS_INVALID_MODE; + if (opt.dmode != ISOFS_INVALID_MODE) + sbi->s_dmode = opt.dmode & 0777; + else + sbi->s_dmode = ISOFS_INVALID_MODE; + + /* + * Read the root inode, which _may_ result in changing + * the s_rock flag. Once we have the final s_rock value, + * we then decide whether to use the Joliet descriptor. + */ + inode = isofs_iget(s, sbi->s_firstdatazone, 0); + if (IS_ERR(inode)) + goto out_no_root; + + /* + * Fix for broken CDs with Rock Ridge and empty ISO root directory but + * correct Joliet root directory. + */ + if (sbi->s_rock == 1 && joliet_level && + rootdir_empty(s, sbi->s_firstdatazone)) { + printk(KERN_NOTICE + "ISOFS: primary root directory is empty. " + "Disabling Rock Ridge and switching to Joliet."); + sbi->s_rock = 0; + } + + /* + * If this disk has both Rock Ridge and Joliet on it, then we + * want to use Rock Ridge by default. This can be overridden + * by using the norock mount option. There is still one other + * possibility that is not taken into account: a Rock Ridge + * CD with Unicode names. Until someone sees such a beast, it + * will not be supported. + */ + if (sbi->s_rock == 1) { + joliet_level = 0; + } else if (joliet_level) { + sbi->s_rock = 0; + if (sbi->s_firstdatazone != first_data_zone) { + sbi->s_firstdatazone = first_data_zone; + printk(KERN_DEBUG + "ISOFS: changing to secondary root\n"); + iput(inode); + inode = isofs_iget(s, sbi->s_firstdatazone, 0); + if (IS_ERR(inode)) + goto out_no_root; + } + } + + if (opt.check == 'u') { + /* Only Joliet is case insensitive by default */ + if (joliet_level) + opt.check = 'r'; + else + opt.check = 's'; + } + sbi->s_joliet_level = joliet_level; + + /* Make sure the root inode is a directory */ + if (!S_ISDIR(inode->i_mode)) { + printk(KERN_WARNING + "isofs_fill_super: root inode is not a directory. " + "Corrupted media?\n"); + goto out_iput; + } + + table = 0; + if (joliet_level) + table += 2; + if (opt.check == 'r') + table++; + + s->s_d_op = &isofs_dentry_ops[table]; + + /* get the root dentry */ + s->s_root = d_alloc_root(inode); + if (!(s->s_root)) + goto out_no_root; + + kfree(opt.iocharset); + + return 0; + + /* + * Display error messages and free resources. 
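
For reference, the table index computed above selects among the four dentry_operations variants declared earlier; a standalone restatement with a hypothetical mount result:

#include <stdio.h>

int main(void)
{
	int joliet_level = 3;    /* hypothetical: Joliet level 3 disc, no Rock Ridge    */
	char check = 'r';        /* relaxed matching, the default when Joliet is in use */
	int table = 0;

	if (joliet_level)
		table += 2;      /* the *_ms variants, which ignore trailing dots */
	if (check == 'r')
		table++;         /* the case-insensitive hashi/cmpi variants      */

	printf("using isofs_dentry_ops[%d]\n", table);   /* prints 3 */
	return 0;
}
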
+ */ +out_iput: + iput(inode); + goto out_no_inode; +out_no_root: + error = PTR_ERR(inode); + if (error != -ENOMEM) + printk(KERN_WARNING "%s: get root inode failed\n", __func__); +out_no_inode: +#ifdef CONFIG_JOLIET + unload_nls(sbi->s_nls_iocharset); +#endif + goto out_freesbi; +out_no_read: + printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", + __func__, s->s_id, iso_blknum, block); + goto out_freebh; +out_bad_zone_size: + printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", + sbi->s_log_zone_size); + goto out_freebh; +out_bad_size: + printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", + orig_zonesize, opt.blocksize); + goto out_freebh; +out_unknown_format: + if (!silent) + printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); + +out_freebh: + brelse(bh); + brelse(pri_bh); +out_freesbi: + kfree(opt.iocharset); + kfree(sbi); + s->s_fs_info = NULL; + return error; +} + +static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ISOFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (ISOFS_SB(sb)->s_nzones + << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = ISOFS_SB(sb)->s_ninodes; + buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = NAME_MAX; + return 0; +} + +/* + * Get a set of blocks; filling in buffer_heads if already allocated + * or getblk() if they are not. Returns the number of blocks inserted + * (-ve == error.) + */ +int isofs_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head **bh, unsigned long nblocks) +{ + unsigned long b_off = iblock; + unsigned offset, sect_size; + unsigned int firstext; + unsigned long nextblk, nextoff; + int section, rv, error; + struct iso_inode_info *ei = ISOFS_I(inode); + + error = -EIO; + rv = 0; + if (iblock != b_off) { + printk(KERN_DEBUG "%s: block number too large\n", __func__); + goto abort; + } + + + offset = 0; + firstext = ei->i_first_extent; + sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); + nextblk = ei->i_next_section_block; + nextoff = ei->i_next_section_offset; + section = 0; + + while (nblocks) { + /* If we are *way* beyond the end of the file, print a message. + * Access beyond the end of the file up to the next page boundary + * is normal, however because of the way the page cache works. + * In this case, we just return 0 so that we can properly fill + * the page with useless information without generating any + * I/O errors. + */ + if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { + printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + __func__, b_off, + (unsigned long long)inode->i_size); + goto abort; + } + + /* On the last section, nextblk == 0, section size is likely to + * exceed sect_size by a partial block, and access beyond the + * end of the file will reach beyond the section size, too. 
+ */ + while (nextblk && (b_off >= (offset + sect_size))) { + struct inode *ninode; + + offset += sect_size; + ninode = isofs_iget(inode->i_sb, nextblk, nextoff); + if (IS_ERR(ninode)) { + error = PTR_ERR(ninode); + goto abort; + } + firstext = ISOFS_I(ninode)->i_first_extent; + sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); + nextblk = ISOFS_I(ninode)->i_next_section_block; + nextoff = ISOFS_I(ninode)->i_next_section_offset; + iput(ninode); + + if (++section > 100) { + printk(KERN_DEBUG "%s: More than 100 file sections ?!?" + " aborting...\n", __func__); + printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " + "nextblk=%lu nextoff=%lu\n", __func__, + b_off, firstext, (unsigned) sect_size, + nextblk, nextoff); + goto abort; + } + } + + if (*bh) { + map_bh(*bh, inode->i_sb, firstext + b_off - offset); + } else { + *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); + if (!*bh) + goto abort; + } + bh++; /* Next buffer head */ + b_off++; /* Next buffer offset */ + nblocks--; + rv++; + } + + error = 0; +abort: + return rv != 0 ? rv : error; +} + +/* + * Used by the standard interfaces. + */ +static int isofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + if (create) { + printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); + return -EROFS; + } + + ret = isofs_get_blocks(inode, iblock, &bh_result, 1); + return ret < 0 ? ret : 0; +} + +static int isofs_bmap(struct inode *inode, sector_t block) +{ + struct buffer_head dummy; + int error; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + error = isofs_get_block(inode, block, &dummy, 0); + if (!error) + return dummy.b_blocknr; + return 0; +} + +struct buffer_head *isofs_bread(struct inode *inode, sector_t block) +{ + sector_t blknr = isofs_bmap(inode, block); + if (!blknr) + return NULL; + return sb_bread(inode->i_sb, blknr); +} + +static int isofs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,isofs_get_block); +} + +static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,isofs_get_block); +} + +static const struct address_space_operations isofs_aops = { + .readpage = isofs_readpage, + .bmap = _isofs_bmap +}; + +static int isofs_read_level3_size(struct inode *inode) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; + struct buffer_head *bh = NULL; + unsigned long block, offset, block_saved, offset_saved; + int i = 0; + int more_entries = 0; + struct iso_directory_record *tmpde = NULL; + struct iso_inode_info *ei = ISOFS_I(inode); + + inode->i_size = 0; + + /* The first 16 blocks are reserved as the System Area. Thus, + * no inodes can appear in block 0. We use this to flag that + * this is the last section. 
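
The section walk above can be sketched in userspace with two hypothetical sections of a multi-extent file: the loop advances offset section by section until the requested file block falls inside the current section, then maps it to firstext + b_off - offset.

#include <stdio.h>

int main(void)
{
	/* Two hypothetical sections, in device blocks. */
	struct { unsigned long firstext, sect_size; } sec[] = {
		{ 5000, 512 },    /* section 0: file blocks 0..511   */
		{ 9000, 300 },    /* section 1: file blocks 512..811 */
	};
	unsigned long b_off = 600;            /* file block being mapped */
	unsigned long offset = 0;
	int s = 0;

	while (b_off >= offset + sec[s].sect_size) {   /* same walk as above */
		offset += sec[s].sect_size;
		s++;
	}
	printf("file block %lu -> device block %lu (section %d)\n",
	       b_off, sec[s].firstext + b_off - offset, s);   /* 9088, section 1 */
	return 0;
}
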
*/ + ei->i_next_section_block = 0; + ei->i_next_section_offset = 0; + + block = ei->i_iget5_block; + offset = ei->i_iget5_offset; + + do { + struct iso_directory_record *de; + unsigned int de_len; + + if (!bh) { + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + } + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + + if (de_len == 0) { + brelse(bh); + bh = NULL; + ++block; + offset = 0; + continue; + } + + block_saved = block; + offset_saved = offset; + offset += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + if (!tmpde) { + tmpde = kmalloc(256, GFP_KERNEL); + if (!tmpde) + goto out_nomem; + } + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + memcpy((void *)tmpde+slop, bh->b_data, offset); + } + de = tmpde; + } + + inode->i_size += isonum_733(de->size); + if (i == 1) { + ei->i_next_section_block = block_saved; + ei->i_next_section_offset = offset_saved; + } + + more_entries = de->flags[-high_sierra] & 0x80; + + i++; + if (i > 100) + goto out_toomany; + } while (more_entries); +out: + kfree(tmpde); + if (bh) + brelse(bh); + return 0; + +out_nomem: + if (bh) + brelse(bh); + return -ENOMEM; + +out_noread: + printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); + kfree(tmpde); + return -EIO; + +out_toomany: + printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" + "isofs_read_level3_size: inode=%lu\n", + __func__, inode->i_ino); + goto out; +} + +static int isofs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct isofs_sb_info *sbi = ISOFS_SB(sb); + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned long block; + int high_sierra = sbi->s_high_sierra; + struct buffer_head *bh = NULL; + struct iso_directory_record *de; + struct iso_directory_record *tmpde = NULL; + unsigned int de_len; + unsigned long offset; + struct iso_inode_info *ei = ISOFS_I(inode); + int ret = -EIO; + + block = ei->i_iget5_block; + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_badread; + + offset = ei->i_iget5_offset; + + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + + if (offset + de_len > bufsize) { + int frag1 = bufsize - offset; + + tmpde = kmalloc(de_len, GFP_KERNEL); + if (tmpde == NULL) { + printk(KERN_INFO "%s: out of memory\n", __func__); + ret = -ENOMEM; + goto fail; + } + memcpy(tmpde, bh->b_data + offset, frag1); + brelse(bh); + bh = sb_bread(inode->i_sb, ++block); + if (!bh) + goto out_badread; + memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1); + de = tmpde; + } + + inode->i_ino = isofs_get_ino(ei->i_iget5_block, + ei->i_iget5_offset, + ISOFS_BUFFER_BITS(inode)); + + /* Assume it is a normal-format file unless told otherwise */ + ei->i_file_format = isofs_file_normal; + + if (de->flags[-high_sierra] & 2) { + if (sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + else + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_nlink = 1; /* + * Set to 1. We know there are 2, but + * the find utility tries to optimize + * if it is 2, and it screws up. It is + * easier to give 1 which tells find to + * do it the hard way. + */ + } else { + if (sbi->s_fmode != ISOFS_INVALID_MODE) { + inode->i_mode = S_IFREG | sbi->s_fmode; + } else { + /* + * Set default permissions: r-x for all. 
The disc + * could be shared with DOS machines so virtually + * anything could be a valid executable. + */ + inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; + } + inode->i_nlink = 1; + } + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + inode->i_blocks = 0; + + ei->i_format_parm[0] = 0; + ei->i_format_parm[1] = 0; + ei->i_format_parm[2] = 0; + + ei->i_section_size = isonum_733(de->size); + if (de->flags[-high_sierra] & 0x80) { + ret = isofs_read_level3_size(inode); + if (ret < 0) + goto fail; + ret = -EIO; + } else { + ei->i_next_section_block = 0; + ei->i_next_section_offset = 0; + inode->i_size = isonum_733(de->size); + } + + /* + * Some dipshit decided to store some other bit of information + * in the high byte of the file length. Truncate size in case + * this CDROM was mounted with the cruft option. + */ + + if (sbi->s_cruft) + inode->i_size &= 0x00ffffff; + + if (de->interleave[0]) { + printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); + inode->i_size = 0; + } + + /* I have no idea what file_unit_size is used for, so + we will flag it for now */ + if (de->file_unit_size[0] != 0) { + printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", + inode->i_ino); + } + + /* I have no idea what other flag bits are used for, so + we will flag it for now */ +#ifdef DEBUG + if((de->flags[-high_sierra] & ~2)!= 0){ + printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " + "(%ld %x).\n", + inode->i_ino, de->flags[-high_sierra]); + } +#endif + + inode->i_mtime.tv_sec = + inode->i_atime.tv_sec = + inode->i_ctime.tv_sec = iso_date(de->date, high_sierra); + inode->i_mtime.tv_nsec = + inode->i_atime.tv_nsec = + inode->i_ctime.tv_nsec = 0; + + ei->i_first_extent = (isonum_733(de->extent) + + isonum_711(de->ext_attr_length)); + + /* Set the number of blocks for stat() - should be done before RR */ + inode->i_blocks = (inode->i_size + 511) >> 9; + + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. + */ + + if (!high_sierra) { + parse_rock_ridge_inode(de, inode); + /* if we want uid/gid set, override the rock ridge setting */ + if (sbi->s_uid_set) + inode->i_uid = sbi->s_uid; + if (sbi->s_gid_set) + inode->i_gid = sbi->s_gid; + } + /* Now set final access rights if overriding rock ridge setting */ + if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_fmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFREG | sbi->s_fmode; + + /* Install the inode operations vector */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + switch (ei->i_file_format) { +#ifdef CONFIG_ZISOFS + case isofs_file_compressed: + inode->i_data.a_ops = &zisofs_aops; + break; +#endif + default: + inode->i_data.a_ops = &isofs_aops; + break; + } + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &isofs_dir_inode_operations; + inode->i_fop = &isofs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &isofs_symlink_aops; + } else + /* XXX - parse_rock_ridge_inode() had already set i_rdev. 
*/ + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + ret = 0; +out: + kfree(tmpde); + if (bh) + brelse(bh); + return ret; + +out_badread: + printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); +fail: + goto out; +} + +struct isofs_iget5_callback_data { + unsigned long block; + unsigned long offset; +}; + +static int isofs_iget5_test(struct inode *ino, void *data) +{ + struct iso_inode_info *i = ISOFS_I(ino); + struct isofs_iget5_callback_data *d = + (struct isofs_iget5_callback_data*)data; + return (i->i_iget5_block == d->block) + && (i->i_iget5_offset == d->offset); +} + +static int isofs_iget5_set(struct inode *ino, void *data) +{ + struct iso_inode_info *i = ISOFS_I(ino); + struct isofs_iget5_callback_data *d = + (struct isofs_iget5_callback_data*)data; + i->i_iget5_block = d->block; + i->i_iget5_offset = d->offset; + return 0; +} + +/* Store, in the inode's containing structure, the block and block + * offset that point to the underlying meta-data for the inode. The + * code below is otherwise similar to the iget() code in + * include/linux/fs.h */ +struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + unsigned long hashval; + struct inode *inode; + struct isofs_iget5_callback_data data; + long ret; + + if (offset >= 1ul << sb->s_blocksize_bits) + return ERR_PTR(-EINVAL); + + data.block = block; + data.offset = offset; + + hashval = (block << sb->s_blocksize_bits) | offset; + + inode = iget5_locked(sb, hashval, &isofs_iget5_test, + &isofs_iget5_set, &data); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + ret = isofs_read_inode(inode); + if (ret < 0) { + iget_failed(inode); + inode = ERR_PTR(ret); + } else { + unlock_new_inode(inode); + } + } + + return inode; +} + +static struct dentry *isofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); +} + +static struct file_system_type iso9660_fs_type = { + .owner = THIS_MODULE, + .name = "iso9660", + .mount = isofs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_iso9660_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out; +#ifdef CONFIG_ZISOFS + err = zisofs_init(); + if (err) + goto out1; +#endif + err = register_filesystem(&iso9660_fs_type); + if (err) + goto out2; + return 0; +out2: +#ifdef CONFIG_ZISOFS + zisofs_cleanup(); +out1: +#endif + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_iso9660_fs(void) +{ + unregister_filesystem(&iso9660_fs_type); +#ifdef CONFIG_ZISOFS + zisofs_cleanup(); +#endif + destroy_inodecache(); +} + +module_init(init_iso9660_fs) +module_exit(exit_iso9660_fs) +MODULE_LICENSE("GPL"); +/* Actual filesystem name is iso9660, as requested in filesystems.c */ +MODULE_ALIAS("iso9660"); diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h new file mode 100644 index 00000000..2882dc08 --- /dev/null +++ b/fs/isofs/isofs.h @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include + +enum isofs_file_format { + isofs_file_normal = 0, + isofs_file_sparse = 1, + isofs_file_compressed = 2, +}; + +/* + * iso fs inode data in memory + */ +struct iso_inode_info { + unsigned long i_iget5_block; + unsigned long i_iget5_offset; + unsigned int i_first_extent; + unsigned char i_file_format; + unsigned char i_format_parm[3]; + unsigned long i_next_section_block; + unsigned long i_next_section_offset; + off_t i_section_size; + struct inode 
vfs_inode; +}; + +/* + * iso9660 super-block data in memory + */ +struct isofs_sb_info { + unsigned long s_ninodes; + unsigned long s_nzones; + unsigned long s_firstdatazone; + unsigned long s_log_zone_size; + unsigned long s_max_size; + + int s_rock_offset; /* offset of SUSP fields within SU area */ + unsigned char s_joliet_level; + unsigned char s_mapping; + unsigned int s_high_sierra:1; + unsigned int s_rock:2; + unsigned int s_utf8:1; + unsigned int s_cruft:1; /* Broken disks with high byte of length + * containing junk */ + unsigned int s_nocompress:1; + unsigned int s_hide:1; + unsigned int s_showassoc:1; + unsigned int s_overriderockperm:1; + unsigned int s_uid_set:1; + unsigned int s_gid_set:1; + + mode_t s_fmode; + mode_t s_dmode; + gid_t s_gid; + uid_t s_uid; + struct nls_table *s_nls_iocharset; /* Native language support table */ + struct mutex s_mutex; /* replaces BKL, please remove if possible */ +}; + +#define ISOFS_INVALID_MODE ((mode_t) -1) + +static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct iso_inode_info *ISOFS_I(struct inode *inode) +{ + return container_of(inode, struct iso_inode_info, vfs_inode); +} + +static inline int isonum_711(char *p) +{ + return *(u8 *)p; +} +static inline int isonum_712(char *p) +{ + return *(s8 *)p; +} +static inline unsigned int isonum_721(char *p) +{ + return get_unaligned_le16(p); +} +static inline unsigned int isonum_722(char *p) +{ + return get_unaligned_be16(p); +} +static inline unsigned int isonum_723(char *p) +{ + /* Ignore bigendian datum due to broken mastering programs */ + return get_unaligned_le16(p); +} +static inline unsigned int isonum_731(char *p) +{ + return get_unaligned_le32(p); +} +static inline unsigned int isonum_732(char *p) +{ + return get_unaligned_be32(p); +} +static inline unsigned int isonum_733(char *p) +{ + /* Ignore bigendian datum due to broken mastering programs */ + return get_unaligned_le32(p); +} +extern int iso_date(char *, int); + +struct inode; /* To make gcc happy */ + +extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); +extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); +extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); + +int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *); +int get_acorn_filename(struct iso_directory_record *, char *, struct inode *); + +extern struct dentry *isofs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct buffer_head *isofs_bread(struct inode *, sector_t); +extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); + +extern struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset); + +/* Because the inode number is no longer relevant to finding the + * underlying meta-data for an inode, we are free to choose a more + * convenient 32-bit number as the inode number. The inode numbering + * scheme was recommended by Sergey Vlasov and Eric Lammerts. */ +static inline unsigned long isofs_get_ino(unsigned long block, + unsigned long offset, + unsigned long bufbits) +{ + return (block << (bufbits - 5)) | (offset >> 5); +} + +/* Every directory can have many redundant directory entries scattered + * throughout the directory tree. First there is the directory entry + * with the name of the directory stored in the parent directory. + * Then, there is the "." 
directory entry stored in the directory + * itself. Finally, there are possibly many ".." directory entries + * stored in all the subdirectories. + * + * In order for the NFS get_parent() method to work and for the + * general consistency of the dcache, we need to make sure the + * "i_iget5_block" and "i_iget5_offset" all point to exactly one of + * the many redundant entries for each directory. We normalize the + * block and offset by always making them point to the "." directory. + * + * Notice that we do not use the entry for the directory with the name + * that is located in the parent directory. Even though choosing this + * first directory is more natural, it is much easier to find the "." + * entry in the NFS get_parent() method because it is implicitly + * encoded in the "extent + ext_attr_length" fields of _all_ the + * redundant entries for the directory. Thus, it can always be + * reached regardless of which directory entry you have in hand. + * + * This works because the "." entry is simply the first directory + * record when you start reading the file that holds all the directory + * records, and this file starts at "extent + ext_attr_length" blocks. + * Because the "." entry is always the first entry listed in the + * directories file, the normalized "offset" value is always 0. + * + * You should pass the directory entry in "de". On return, "block" + * and "offset" will hold normalized values. Only directories are + * affected making it safe to call even for non-directory file + * types. */ +static inline void +isofs_normalize_block_and_offset(struct iso_directory_record* de, + unsigned long *block, + unsigned long *offset) +{ + /* Only directories are normalized. */ + if (de->flags[0] & 2) { + *offset = 0; + *block = (unsigned long)isonum_733(de->extent) + + (unsigned long)isonum_711(de->ext_attr_length); + } +} + +extern const struct inode_operations isofs_dir_inode_operations; +extern const struct file_operations isofs_dir_operations; +extern const struct address_space_operations isofs_symlink_aops; +extern const struct export_operations isofs_export_ops; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c new file mode 100644 index 00000000..a048de81 --- /dev/null +++ b/fs/isofs/joliet.c @@ -0,0 +1,69 @@ +/* + * linux/fs/isofs/joliet.c + * + * (C) 1996 Gordon Chaffee + * + * Joliet: Microsoft's Unicode extensions to iso9660 + */ + +#include +#include +#include "isofs.h" + +/* + * Convert Unicode 16 to UTF-8 or ASCII. 
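[Editorial aside, not part of the patch: the uni16_to_x8() loop below depends on the mounted NLS table; as a rough standalone illustration of the same idea, this sketch reads big-endian 16-bit units and emits plain ASCII, substituting '?' for anything it cannot represent. It is a simplification under that assumption, not the NLS-driven conversion itself.]

#include <stdio.h>

/* Illustrative only: big-endian UCS-2 to ASCII, '?' for anything else. */
static int uni16be_to_ascii(char *out, const unsigned char *in, int units)
{
	char *op = out;

	while (units--) {
		unsigned int ch = (in[0] << 8) | in[1];	/* one big-endian unit */

		if (ch == 0)
			break;
		*op++ = (ch < 0x80) ? (char)ch : '?';
		in += 2;
	}
	*op = '\0';
	return op - out;
}

int main(void)
{
	/* "README;1" as Joliet stores it: 16-bit big-endian characters. */
	static const unsigned char name[] = {
		0, 'R', 0, 'E', 0, 'A', 0, 'D', 0, 'M', 0, 'E', 0, ';', 0, '1'
	};
	char out[32];

	uni16be_to_ascii(out, name, sizeof(name) / 2);
	printf("%s\n", out);	/* prints README;1 */
	return 0;
}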
+ */ +static int +uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) +{ + __be16 *ip, ch; + unsigned char *op; + + ip = uni; + op = ascii; + + while ((ch = get_unaligned(ip)) && len) { + int llen; + llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE); + if (llen > 0) + op += llen; + else + *op++ = '?'; + ip++; + + len--; + } + *op = 0; + return (op - ascii); +} + +int +get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) +{ + unsigned char utf8; + struct nls_table *nls; + unsigned char len = 0; + + utf8 = ISOFS_SB(inode->i_sb)->s_utf8; + nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; + + if (utf8) { + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); + } else { + len = uni16_to_x8(outname, (__be16 *) de->name, + de->name_len[0] >> 1, nls); + } + if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) + len -= 2; + + /* + * Windows doesn't like periods at the end of a name, + * so neither do we + */ + while (len >= 2 && (outname[len-1] == '.')) + len--; + + return len; +} diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c new file mode 100644 index 00000000..4fb3e807 --- /dev/null +++ b/fs/isofs/namei.c @@ -0,0 +1,196 @@ +/* + * linux/fs/isofs/namei.c + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include +#include "isofs.h" + +/* + * ok, we cannot use strncmp, as the name is not in our data space. + * Thus we'll have to use isofs_match. No big problem. Match also makes + * some sanity tests. + */ +static int +isofs_cmp(struct dentry *dentry, const char *compare, int dlen) +{ + struct qstr qstr; + + if (!compare) + return 1; + + /* check special "." and ".." files */ + if (dlen == 1) { + /* "." */ + if (compare[0] == 0) { + if (!dentry->d_name.len) + return 0; + compare = "."; + } else if (compare[0] == 1) { + compare = ".."; + dlen = 2; + } + } + + qstr.name = compare; + qstr.len = dlen; + return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, + dentry->d_name.len, dentry->d_name.name, &qstr); +} + +/* + * isofs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the inode number of the found entry, or 0 on error. 
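[Editorial aside, not part of the patch: when the scan loop below hits a zero record-length byte it rounds f_pos up to the next 2048-byte directory block and continues there. A couple of hypothetical positions make the masking trick concrete.]

#include <stdio.h>

#define ISO_BLOCK 2048	/* logical block size used by the directory scan */

/* Round a directory position up to the next block boundary. */
static unsigned long next_block(unsigned long f_pos)
{
	return (f_pos + ISO_BLOCK) & ~(unsigned long)(ISO_BLOCK - 1);
}

int main(void)
{
	/* Hypothetical positions where a zero length byte was found. */
	printf("%lu\n", next_block(100));	/* -> 2048 */
	printf("%lu\n", next_block(2047));	/* -> 2048 */
	printf("%lu\n", next_block(2048));	/* -> 4096, always advances */
	return 0;
}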
+ */ +static unsigned long +isofs_find_entry(struct inode *dir, struct dentry *dentry, + unsigned long *block_rv, unsigned long *offset_rv, + char *tmpname, struct iso_directory_record *tmpde) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); + unsigned char bufbits = ISOFS_BUFFER_BITS(dir); + unsigned long block, f_pos, offset, block_saved, offset_saved; + struct buffer_head *bh = NULL; + struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); + + if (!ISOFS_I(dir)->i_first_extent) + return 0; + + f_pos = 0; + offset = 0; + block = 0; + + while (f_pos < dir->i_size) { + struct iso_directory_record *de; + int de_len, match, i, dlen; + char *dpnt; + + if (!bh) { + bh = isofs_bread(dir, block); + if (!bh) + return 0; + } + + de = (struct iso_directory_record *) (bh->b_data + offset); + + de_len = *(unsigned char *) de; + if (!de_len) { + brelse(bh); + bh = NULL; + f_pos = (f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = f_pos >> bufbits; + offset = 0; + continue; + } + + block_saved = bh->b_blocknr; + offset_saved = offset; + offset += de_len; + f_pos += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = isofs_bread(dir, block); + if (!bh) + return 0; + memcpy((void *) tmpde + slop, bh->b_data, offset); + } + de = tmpde; + } + + dlen = de->name_len[0]; + dpnt = de->name; + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < dlen + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + dir->i_ino); + return 0; + } + + if (sbi->s_rock && + ((i = get_rock_ridge_filename(de, tmpname, dir)))) { + dlen = i; /* possibly -1 */ + dpnt = tmpname; +#ifdef CONFIG_JOLIET + } else if (sbi->s_joliet_level) { + dlen = get_joliet_filename(de, tmpname, dir); + dpnt = tmpname; +#endif + } else if (sbi->s_mapping == 'a') { + dlen = get_acorn_filename(de, tmpname, dir); + dpnt = tmpname; + } else if (sbi->s_mapping == 'n') { + dlen = isofs_name_translate(de, tmpname, dir); + dpnt = tmpname; + } + + /* + * Skip hidden or associated files unless hide or showassoc, + * respectively, is set + */ + match = 0; + if (dlen > 0 && + (!sbi->s_hide || + (!(de->flags[-sbi->s_high_sierra] & 1))) && + (sbi->s_showassoc || + (!(de->flags[-sbi->s_high_sierra] & 4)))) { + match = (isofs_cmp(dentry, dpnt, dlen) == 0); + } + if (match) { + isofs_normalize_block_and_offset(de, + &block_saved, + &offset_saved); + *block_rv = block_saved; + *offset_rv = offset_saved; + brelse(bh); + return 1; + } + } + brelse(bh); + return 0; +} + +struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + int found; + unsigned long uninitialized_var(block); + unsigned long uninitialized_var(offset); + struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); + struct inode *inode; + struct page *page; + + page = alloc_page(GFP_USER); + if (!page) + return ERR_PTR(-ENOMEM); + + mutex_lock(&sbi->s_mutex); + found = isofs_find_entry(dir, dentry, + &block, &offset, + page_address(page), + 1024 + page_address(page)); + __free_page(page); + + inode = NULL; + if (found) { + inode = isofs_iget(dir->i_sb, block, offset); + if (IS_ERR(inode)) { + mutex_unlock(&sbi->s_mutex); + return ERR_CAST(inode); + } + } + mutex_unlock(&sbi->s_mutex); + return d_splice_alias(inode, dentry); +} diff --git a/fs/isofs/rock.c 
b/fs/isofs/rock.c new file mode 100644 index 00000000..f9cd04db --- /dev/null +++ b/fs/isofs/rock.c @@ -0,0 +1,778 @@ +/* + * linux/fs/isofs/rock.c + * + * (C) 1992, 1993 Eric Youngdale + * + * Rock Ridge Extensions to iso9660 + */ + +#include +#include + +#include "isofs.h" +#include "rock.h" + +/* + * These functions are designed to read the system areas of a directory record + * and extract relevant information. There are different functions provided + * depending upon what information we need at the time. One function fills + * out an inode structure, a second one extracts a filename, a third one + * returns a symbolic link name, and a fourth one returns the extent number + * for the file. + */ + +#define SIG(A,B) ((A) | ((B) << 8)) /* isonum_721() */ + +struct rock_state { + void *buffer; + unsigned char *chr; + int len; + int cont_size; + int cont_extent; + int cont_offset; + struct inode *inode; +}; + +/* + * This is a way of ensuring that we have something in the system + * use fields that is compatible with Rock Ridge. Return zero on success. + */ + +static int check_sp(struct rock_ridge *rr, struct inode *inode) +{ + if (rr->u.SP.magic[0] != 0xbe) + return -1; + if (rr->u.SP.magic[1] != 0xef) + return -1; + ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip; + return 0; +} + +static void setup_rock_ridge(struct iso_directory_record *de, + struct inode *inode, struct rock_state *rs) +{ + rs->len = sizeof(struct iso_directory_record) + de->name_len[0]; + if (rs->len & 1) + (rs->len)++; + rs->chr = (unsigned char *)de + rs->len; + rs->len = *((unsigned char *)de) - rs->len; + if (rs->len < 0) + rs->len = 0; + + if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) { + rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset; + rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset; + if (rs->len < 0) + rs->len = 0; + } +} + +static void init_rock_state(struct rock_state *rs, struct inode *inode) +{ + memset(rs, 0, sizeof(*rs)); + rs->inode = inode; +} + +/* + * Returns 0 if the caller should continue scanning, 1 if the scan must end + * and -ve on error. + */ +static int rock_continue(struct rock_state *rs) +{ + int ret = 1; + int blocksize = 1 << rs->inode->i_blkbits; + const int min_de_size = offsetof(struct rock_ridge, u); + + kfree(rs->buffer); + rs->buffer = NULL; + + if ((unsigned)rs->cont_offset > blocksize - min_de_size || + (unsigned)rs->cont_size > blocksize || + (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) { + printk(KERN_NOTICE "rock: corrupted directory entry. " + "extent=%d, offset=%d, size=%d\n", + rs->cont_extent, rs->cont_offset, rs->cont_size); + ret = -EIO; + goto out; + } + + if (rs->cont_extent) { + struct buffer_head *bh; + + rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL); + if (!rs->buffer) { + ret = -ENOMEM; + goto out; + } + ret = -EIO; + bh = sb_bread(rs->inode->i_sb, rs->cont_extent); + if (bh) { + memcpy(rs->buffer, bh->b_data + rs->cont_offset, + rs->cont_size); + put_bh(bh); + rs->chr = rs->buffer; + rs->len = rs->cont_size; + rs->cont_extent = 0; + rs->cont_size = 0; + rs->cont_offset = 0; + return 0; + } + printk("Unable to read rock-ridge attributes\n"); + } +out: + kfree(rs->buffer); + rs->buffer = NULL; + return ret; +} + +/* + * We think there's a record of type `sig' at rs->chr. Parse the signature + * and make sure that there's really room for a record of that type. 
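[Editorial aside, not part of the patch: the SIG(A,B) macro above packs a two-character SUSP signature in the same byte order that isonum_721() reads it, so a signature pulled from the system-use area can be compared directly. A tiny sketch with hypothetical record bytes:]

#include <stdio.h>

/* Two-character SUSP signature packed the same way isonum_721() reads it. */
#define SIG(A, B) ((A) | ((B) << 8))

int main(void)
{
	/* A hypothetical system-use area starting with a "PX" record. */
	const unsigned char su[] = { 'P', 'X', 36, 1 /* ... */ };
	int sig = su[0] | (su[1] << 8);		/* little-endian, like isonum_721() */

	if (sig == SIG('P', 'X'))
		printf("PX record, len %d\n", su[2]);
	return 0;
}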
+ */ +static int rock_check_overflow(struct rock_state *rs, int sig) +{ + int len; + + switch (sig) { + case SIG('S', 'P'): + len = sizeof(struct SU_SP_s); + break; + case SIG('C', 'E'): + len = sizeof(struct SU_CE_s); + break; + case SIG('E', 'R'): + len = sizeof(struct SU_ER_s); + break; + case SIG('R', 'R'): + len = sizeof(struct RR_RR_s); + break; + case SIG('P', 'X'): + len = sizeof(struct RR_PX_s); + break; + case SIG('P', 'N'): + len = sizeof(struct RR_PN_s); + break; + case SIG('S', 'L'): + len = sizeof(struct RR_SL_s); + break; + case SIG('N', 'M'): + len = sizeof(struct RR_NM_s); + break; + case SIG('C', 'L'): + len = sizeof(struct RR_CL_s); + break; + case SIG('P', 'L'): + len = sizeof(struct RR_PL_s); + break; + case SIG('T', 'F'): + len = sizeof(struct RR_TF_s); + break; + case SIG('Z', 'F'): + len = sizeof(struct RR_ZF_s); + break; + default: + len = 0; + break; + } + len += offsetof(struct rock_ridge, u); + if (len > rs->len) { + printk(KERN_NOTICE "rock: directory entry would overflow " + "storage\n"); + printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n", + sig, len, rs->len); + return -EIO; + } + return 0; +} + +/* + * return length of name field; 0: not found, -1: to be ignored + */ +int get_rock_ridge_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + struct rock_state rs; + struct rock_ridge *rr; + int sig; + int retnamlen = 0; + int truncate = 0; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + *retname = 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); +repeat: + + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ + if (rs.len < 0) + goto out; /* Something got screwed up here */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_NM) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('N', 'M'): + if (truncate) + break; + if (rr->len < 5) + break; + /* + * If the flags are 2 or 4, this indicates '.' or '..'. + * We don't want to do anything with this, because it + * screws up the code that calls us. We don't really + * care anyways, since we can just use the non-RR + * name. 
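[Editorial aside, not part of the patch: the NM handling further down concatenates name text from successive NM records (bit 0 of the flags meaning "continued") and truncates once the assembled name would reach 254 bytes. A toy version over two hypothetical fragments:]

#include <stdio.h>
#include <string.h>

/* Illustrative only: an NM payload is rr->len - 5 bytes of name text,
 * and bit 0 of its flags means "continued in the next NM record". */
struct nm_frag {
	unsigned char flags;
	const char *text;
};

int main(void)
{
	/* Hypothetical long name split across two NM records. */
	struct nm_frag frags[] = { { 1, "a_rather_long_" }, { 0, "file_name.txt" } };
	char name[255] = "";
	size_t i;

	for (i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
		if (strlen(name) + strlen(frags[i].text) >= 254)
			break;			/* truncate, as the kernel loop does */
		strcat(name, frags[i].text);
		if (!(frags[i].flags & 1))
			break;			/* last fragment of the name */
	}
	printf("%s\n", name);			/* a_rather_long_file_name.txt */
	return 0;
}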
+ */ + if (rr->u.NM.flags & 6) + break; + + if (rr->u.NM.flags & ~1) { + printk("Unsupported NM flag settings (%d)\n", + rr->u.NM.flags); + break; + } + if ((strlen(retname) + rr->len - 5) >= 254) { + truncate = 1; + break; + } + strncat(retname, rr->u.NM.name, rr->len - 5); + retnamlen += rr->len - 5; + break; + case SIG('R', 'E'): + kfree(rs.buffer); + return -1; + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + return retnamlen; /* If 0, this file did not have a NM field */ +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; +} + +static int +parse_rock_ridge_inode_internal(struct iso_directory_record *de, + struct inode *inode, int regard_xa) +{ + int symlink_len = 0; + int cnt, sig; + struct inode *reloc; + struct rock_ridge *rr; + int rootflag; + struct rock_state rs; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); + if (regard_xa) { + rs.chr += 14; + rs.len -= 14; + if (rs.len < 0) + rs.len = 0; + } + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ + if (rs.len < 0) + goto out; /* Something got screwed up here */ + + switch (sig) { +#ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & + (RR_PX | RR_TF | RR_SL | RR_CL)) == 0) + goto out; + break; +#endif + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('E', 'R'): + ISOFS_SB(inode->i_sb)->s_rock = 1; + printk(KERN_DEBUG "ISO 9660 Extensions: "); + { + int p; + for (p = 0; p < rr->u.ER.len_id; p++) + printk("%c", rr->u.ER.data[p]); + } + printk("\n"); + break; + case SIG('P', 'X'): + inode->i_mode = isonum_733(rr->u.PX.mode); + inode->i_nlink = isonum_733(rr->u.PX.n_links); + inode->i_uid = isonum_733(rr->u.PX.uid); + inode->i_gid = isonum_733(rr->u.PX.gid); + break; + case SIG('P', 'N'): + { + int high, low; + high = isonum_733(rr->u.PN.dev_high); + low = isonum_733(rr->u.PN.dev_low); + /* + * The Rock Ridge standard specifies that if + * sizeof(dev_t) <= 4, then the high field is + * unused, and the device number is completely + * stored in the low field. Some writers may + * ignore this subtlety, + * and as a result we test to see if the entire + * device number is + * stored in the low field, and use that. + */ + if ((low & ~0xff) && high == 0) { + inode->i_rdev = + MKDEV(low >> 8, low & 0xff); + } else { + inode->i_rdev = + MKDEV(high, low); + } + } + break; + case SIG('T', 'F'): + /* + * Some RRIP writers incorrectly place ctime in the + * TF_CREATE field. Try to handle this correctly for + * either case. 
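[Editorial aside, not part of the patch: the TF case below walks rr->u.TF.times[] with a running index because the record stores only the timestamps whose flag bits are set, in flag-bit order. The sketch uses placeholder strings for the stamps and the TF_* values defined in rock.h later in this patch.]

#include <stdio.h>

/* TF flag bits, as defined in rock.h below. */
#define TF_CREATE	1
#define TF_MODIFY	2
#define TF_ACCESS	4
#define TF_ATTRIBUTES	8

int main(void)
{
	/* Hypothetical TF record carrying only modify and access stamps:
	 * the stamps array then has exactly two entries, in flag-bit order. */
	unsigned char flags = TF_MODIFY | TF_ACCESS;
	const char *stamps[] = { "stamp0", "stamp1" };
	int cnt = 0;

	if (flags & TF_CREATE)
		printf("create = %s\n", stamps[cnt++]);
	if (flags & TF_MODIFY)
		printf("modify = %s\n", stamps[cnt++]);		/* stamp0 */
	if (flags & TF_ACCESS)
		printf("access = %s\n", stamps[cnt++]);		/* stamp1 */
	if (flags & TF_ATTRIBUTES)
		printf("attrib = %s\n", stamps[cnt++]);
	return 0;
}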
+ */ + /* Rock ridge never appears on a High Sierra disk */ + cnt = 0; + if (rr->u.TF.flags & TF_CREATE) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_MODIFY) { + inode->i_mtime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_mtime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ACCESS) { + inode->i_atime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_atime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ATTRIBUTES) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + break; + case SIG('S', 'L'): + { + int slen; + struct SL_component *slp; + struct SL_component *oldslp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + inode->i_size = symlink_len; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + inode->i_size += + slp->len; + break; + case 2: + inode->i_size += 1; + break; + case 4: + inode->i_size += 2; + break; + case 8: + rootflag = 1; + inode->i_size += 1; + break; + default: + printk("Symlink component flag " + "not implemented\n"); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *) + (((char *)slp) + slp->len + 2); + + if (slen < 2) { + if (((rr->u.SL. + flags & 1) != 0) + && + ((oldslp-> + flags & 1) == 0)) + inode->i_size += + 1; + break; + } + + /* + * If this component record isn't + * continued, then append a '/'. + */ + if (!rootflag + && (oldslp->flags & 1) == 0) + inode->i_size += 1; + } + } + symlink_len = inode->i_size; + break; + case SIG('R', 'E'): + printk(KERN_WARNING "Attempt to read inode for " + "relocated directory\n"); + goto out; + case SIG('C', 'L'): + ISOFS_I(inode)->i_first_extent = + isonum_733(rr->u.CL.location); + reloc = + isofs_iget(inode->i_sb, + ISOFS_I(inode)->i_first_extent, + 0); + if (IS_ERR(reloc)) { + ret = PTR_ERR(reloc); + goto out; + } + inode->i_mode = reloc->i_mode; + inode->i_nlink = reloc->i_nlink; + inode->i_uid = reloc->i_uid; + inode->i_gid = reloc->i_gid; + inode->i_rdev = reloc->i_rdev; + inode->i_size = reloc->i_size; + inode->i_blocks = reloc->i_blocks; + inode->i_atime = reloc->i_atime; + inode->i_ctime = reloc->i_ctime; + inode->i_mtime = reloc->i_mtime; + iput(reloc); + break; +#ifdef CONFIG_ZISOFS + case SIG('Z', 'F'): { + int algo; + + if (ISOFS_SB(inode->i_sb)->s_nocompress) + break; + algo = isonum_721(rr->u.ZF.algorithm); + if (algo == SIG('p', 'z')) { + int block_shift = + isonum_711(&rr->u.ZF.parms[1]); + if (block_shift > 17) { + printk(KERN_WARNING "isofs: " + "Can't handle ZF block " + "size of 2^%d\n", + block_shift); + } else { + /* + * Note: we don't change + * i_blocks here + */ + ISOFS_I(inode)->i_file_format = + isofs_file_compressed; + /* + * Parameters to compression + * algorithm (header size, + * block size) + */ + ISOFS_I(inode)->i_format_parm[0] = + isonum_711(&rr->u.ZF.parms[0]); + ISOFS_I(inode)->i_format_parm[1] = + isonum_711(&rr->u.ZF.parms[1]); + inode->i_size = + isonum_733(rr->u.ZF. 
+ real_size); + } + } else { + printk(KERN_WARNING + "isofs: Unknown ZF compression " + "algorithm: %c%c\n", + rr->u.ZF.algorithm[0], + rr->u.ZF.algorithm[1]); + } + break; + } +#endif + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + ret = 0; +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; +} + +static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) +{ + int slen; + int rootflag; + struct SL_component *oldslp; + struct SL_component *slp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + if (slp->len > plimit - rpnt) + return NULL; + memcpy(rpnt, slp->text, slp->len); + rpnt += slp->len; + break; + case 2: + if (rpnt >= plimit) + return NULL; + *rpnt++ = '.'; + break; + case 4: + if (2 > plimit - rpnt) + return NULL; + *rpnt++ = '.'; + *rpnt++ = '.'; + break; + case 8: + if (rpnt >= plimit) + return NULL; + rootflag = 1; + *rpnt++ = '/'; + break; + default: + printk("Symlink component flag not implemented (%d)\n", + slp->flags); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *)((char *)slp + slp->len + 2); + + if (slen < 2) { + /* + * If there is another SL record, and this component + * record isn't continued, then add a slash. + */ + if ((!rootflag) && (rr->u.SL.flags & 1) && + !(oldslp->flags & 1)) { + if (rpnt >= plimit) + return NULL; + *rpnt++ = '/'; + } + break; + } + + /* + * If this component record isn't continued, then append a '/'. + */ + if (!rootflag && !(oldslp->flags & 1)) { + if (rpnt >= plimit) + return NULL; + *rpnt++ = '/'; + } + } + return rpnt; +} + +int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) +{ + int result = parse_rock_ridge_inode_internal(de, inode, 0); + + /* + * if rockridge flag was reset and we didn't look for attributes + * behind eventual XA attributes, have a look there + */ + if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) + && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { + result = parse_rock_ridge_inode_internal(de, inode, 14); + } + return result; +} + +/* + * readpage() for symlinks: reads symlink contents into the page and either + * makes it uptodate and returns 0 or returns error (-EIO) + */ +static int rock_ridge_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct iso_inode_info *ei = ISOFS_I(inode); + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + char *link = kmap(page); + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + struct buffer_head *bh; + char *rpnt = link; + unsigned char *pnt; + struct iso_directory_record *raw_de; + unsigned long block, offset; + int sig; + struct rock_ridge *rr; + struct rock_state rs; + int ret; + + if (!sbi->s_rock) + goto error; + + init_rock_state(&rs, inode); + block = ei->i_iget5_block; + mutex_lock(&sbi->s_mutex); + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + + offset = ei->i_iget5_offset; + pnt = (unsigned char *)bh->b_data + offset; + + raw_de = (struct iso_directory_record *)pnt; + + /* + * If we go past the end of the buffer, there is some sort of error. + */ + if (offset + *pnt > bufsize) + goto out_bad_span; + + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. 
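[Editorial aside, not part of the patch: get_symlink_chunk() above rebuilds a symlink target from SL components, where flag 8 marks the root, flag bit 0 marks a continued component, and a '/' is inserted between components otherwise. A simplified single-record sketch with a hypothetical target:]

#include <stdio.h>
#include <string.h>

/* Illustrative only: one SL component; bit 0 of flags means "continued",
 * 2 means ".", 4 means "..", 8 means the component is the root ("/"). */
struct sl_comp {
	unsigned char flags;
	const char *text;
};

int main(void)
{
	/* Hypothetical target "/usr/share/doc": a root plus three names. */
	struct sl_comp comps[] = {
		{ 8, "" }, { 0, "usr" }, { 0, "share" }, { 0, "doc" }
	};
	char link[64] = "";
	size_t i, n = sizeof(comps) / sizeof(comps[0]);

	for (i = 0; i < n; i++) {
		if (comps[i].flags & 8)
			strcat(link, "/");		/* root component */
		else
			strcat(link, comps[i].text);
		/* Separator unless the component is a root, is continued,
		 * or is the last one (simplified single-SL-record case). */
		if (!(comps[i].flags & 8) && !(comps[i].flags & 1) && i + 1 < n)
			strcat(link, "/");
	}
	printf("%s\n", link);			/* /usr/share/doc */
	return 0;
}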
+ */ + + setup_rock_ridge(raw_de, inode, &rs); + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto out; + rs.chr += rr->len; + rs.len -= rr->len; + if (rs.len < 0) + goto out; /* corrupted isofs */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_SL) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('S', 'L'): + rpnt = get_symlink_chunk(rpnt, rr, + link + (PAGE_SIZE - 1)); + if (rpnt == NULL) + goto out; + break; + case SIG('C', 'E'): + /* This tells is if there is a continuation record */ + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret < 0) + goto fail; + + if (rpnt == link) + goto fail; + brelse(bh); + *rpnt = '\0'; + mutex_unlock(&sbi->s_mutex); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + + /* error exit from macro */ +out: + kfree(rs.buffer); + goto fail; +out_noread: + printk("unable to read i-node block"); + goto fail; +out_bad_span: + printk("symlink spans iso9660 blocks\n"); +fail: + brelse(bh); + mutex_unlock(&sbi->s_mutex); +error: + SetPageError(page); + kunmap(page); + unlock_page(page); + return -EIO; +} + +const struct address_space_operations isofs_symlink_aops = { + .readpage = rock_ridge_symlink_readpage +}; diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h new file mode 100644 index 00000000..ed09e2b0 --- /dev/null +++ b/fs/isofs/rock.h @@ -0,0 +1,122 @@ +/* + * These structs are used by the system-use-sharing protocol, in which the + * Rock Ridge extensions are embedded. It is quite possible that other + * extensions are present on the disk, and this is fine as long as they + * all use SUSP + */ + +struct SU_SP_s { + unsigned char magic[2]; + unsigned char skip; +} __attribute__ ((packed)); + +struct SU_CE_s { + char extent[8]; + char offset[8]; + char size[8]; +}; + +struct SU_ER_s { + unsigned char len_id; + unsigned char len_des; + unsigned char len_src; + unsigned char ext_ver; + char data[0]; +} __attribute__ ((packed)); + +struct RR_RR_s { + char flags[1]; +} __attribute__ ((packed)); + +struct RR_PX_s { + char mode[8]; + char n_links[8]; + char uid[8]; + char gid[8]; +}; + +struct RR_PN_s { + char dev_high[8]; + char dev_low[8]; +}; + +struct SL_component { + unsigned char flags; + unsigned char len; + char text[0]; +} __attribute__ ((packed)); + +struct RR_SL_s { + unsigned char flags; + struct SL_component link; +} __attribute__ ((packed)); + +struct RR_NM_s { + unsigned char flags; + char name[0]; +} __attribute__ ((packed)); + +struct RR_CL_s { + char location[8]; +}; + +struct RR_PL_s { + char location[8]; +}; + +struct stamp { + char time[7]; +} __attribute__ ((packed)); + +struct RR_TF_s { + char flags; + struct stamp times[0]; /* Variable number of these beasts */ +} __attribute__ ((packed)); + +/* Linux-specific extension for transparent decompression */ +struct RR_ZF_s { + char algorithm[2]; + char parms[2]; + char real_size[8]; +}; + +/* + * These are the bits and their meanings for flags in the TF structure. 
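[Editorial aside, not part of the patch, placed before the flag definitions: the 8-byte char arrays in the structures above (PX mode and link counts, PN device numbers, CE extent/offset/size, CL/PL locations, ZF real_size) hold ISO 9660 "both-byte-order" values, and isonum_733() in isofs.h simply takes the little-endian half. A quick sketch of that decoding on a hypothetical field:]

#include <stdio.h>

/* ISO 9660 7.3.3 "both-byte-order" field: 32-bit LE followed by 32-bit BE.
 * Like isonum_733(), take the little-endian half and ignore the rest. */
static unsigned int both_endian_32(const unsigned char *p)
{
	return p[0] | (p[1] << 8) | (p[2] << 16) | ((unsigned int)p[3] << 24);
}

int main(void)
{
	/* Hypothetical field encoding the value 2000. */
	const unsigned char field[8] = {
		0xd0, 0x07, 0x00, 0x00,		/* little-endian 2000 */
		0x00, 0x00, 0x07, 0xd0,		/* the same value, big-endian */
	};

	printf("%u\n", both_endian_32(field));	/* 2000 */
	return 0;
}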
+ */ +#define TF_CREATE 1 +#define TF_MODIFY 2 +#define TF_ACCESS 4 +#define TF_ATTRIBUTES 8 +#define TF_BACKUP 16 +#define TF_EXPIRATION 32 +#define TF_EFFECTIVE 64 +#define TF_LONG_FORM 128 + +struct rock_ridge { + char signature[2]; + unsigned char len; + unsigned char version; + union { + struct SU_SP_s SP; + struct SU_CE_s CE; + struct SU_ER_s ER; + struct RR_RR_s RR; + struct RR_PX_s PX; + struct RR_PN_s PN; + struct RR_SL_s SL; + struct RR_NM_s NM; + struct RR_CL_s CL; + struct RR_PL_s PL; + struct RR_TF_s TF; + struct RR_ZF_s ZF; + } u; +}; + +#define RR_PX 1 /* POSIX attributes */ +#define RR_PN 2 /* POSIX devices */ +#define RR_SL 4 /* Symbolic link */ +#define RR_NM 8 /* Alternate Name */ +#define RR_CL 16 /* Child link */ +#define RR_PL 32 /* Parent link */ +#define RR_RE 64 /* Relocation directory */ +#define RR_TF 128 /* Timestamps */ diff --git a/fs/isofs/util.c b/fs/isofs/util.c new file mode 100644 index 00000000..01e1ee7a --- /dev/null +++ b/fs/isofs/util.c @@ -0,0 +1,80 @@ +/* + * linux/fs/isofs/util.c + */ + +#include "isofs.h" + +/* + * We have to convert from a MM/DD/YY format to the Unix ctime format. + * We have to take into account leap years and all of that good stuff. + * Unfortunately, the kernel does not have the information on hand to + * take into account daylight savings time, but it shouldn't matter. + * The time stored should be localtime (with or without DST in effect), + * and the timezone offset should hold the offset required to get back + * to GMT. Thus we should always be correct. + */ + +int iso_date(char * p, int flag) +{ + int year, month, day, hour, minute, second, tz; + int crtime, days, i; + + year = p[0] - 70; + month = p[1]; + day = p[2]; + hour = p[3]; + minute = p[4]; + second = p[5]; + if (flag == 0) tz = p[6]; /* High sierra has no time zone */ + else tz = 0; + + if (year < 0) { + crtime = 0; + } else { + int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; + + days = year * 365; + if (year > 2) + days += (year+1) / 4; + for (i = 1; i < month; i++) + days += monlen[i-1]; + if (((year+2) % 4) == 0 && month > 2) + days++; + days += day - 1; + crtime = ((((days * 24) + hour) * 60 + minute) * 60) + + second; + + /* sign extend */ + if (tz & 0x80) + tz |= (-1 << 8); + + /* + * The timezone offset is unreliable on some disks, + * so we make a sanity check. In no case is it ever + * more than 13 hours from GMT, which is 52*15min. + * The time is always stored in localtime with the + * timezone offset being what get added to GMT to + * get to localtime. Thus we need to subtract the offset + * to get to true GMT, which is what we store the time + * as internally. On the local system, the user may set + * their timezone any way they wish, of course, so GMT + * gets converted back to localtime on the receiving + * system. + * + * NOTE: mkisofs in versions prior to mkisofs-1.10 had + * the sign wrong on the timezone offset. This has now + * been corrected there too, but if you are getting screwy + * results this may be the explanation. If enough people + * complain, a user configuration option could be added + * to add the timezone offset in with the wrong sign + * for 'compatibility' with older discs, but I cannot see how + * it will matter that much. + * + * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann) + * for pointing out the sign error. 
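[Editorial aside, not part of the patch: running the arithmetic above on one concrete, hypothetical stamp may help. The sketch mirrors the same day/leap-year computation for 1996-12-25 04:40:36 with a zero timezone offset, so the adjustment that follows in iso_date() would leave the result unchanged.]

#include <stdio.h>

int main(void)
{
	/* Hypothetical 7-byte stamp: 1996-12-25 04:40:36, offset 0. */
	int p[7] = { 96, 12, 25, 4, 40, 36, 0 };
	int monlen[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
	int year = p[0] - 70, days, i, crtime;

	days = year * 365;
	if (year > 2)
		days += (year + 1) / 4;		/* leap days since 1970 */
	for (i = 1; i < p[1]; i++)
		days += monlen[i - 1];
	if (((year + 2) % 4) == 0 && p[1] > 2)
		days++;				/* 1996 is itself a leap year */
	days += p[2] - 1;

	crtime = ((days * 24 + p[3]) * 60 + p[4]) * 60 + p[5];
	printf("%d\n", crtime);			/* 851488836 */
	return 0;
}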
+ */ + if (-52 <= tz && tz <= 52) + crtime -= tz * 15 * 60; + } + return crtime; +} diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h new file mode 100644 index 00000000..27379570 --- /dev/null +++ b/fs/isofs/zisofs.h @@ -0,0 +1,21 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2001 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Prototypes for functions exported from the compressed isofs subsystem + */ + +#ifdef CONFIG_ZISOFS +extern const struct address_space_operations zisofs_aops; +extern int __init zisofs_init(void); +extern void zisofs_cleanup(void); +#endif diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig new file mode 100644 index 00000000..4e28beee --- /dev/null +++ b/fs/jbd/Kconfig @@ -0,0 +1,30 @@ +config JBD + tristate + help + This is a generic journalling layer for block devices. It is + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block + devices such as RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD && DEBUG_FS + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile new file mode 100644 index 00000000..54aca486 --- /dev/null +++ b/fs/jbd/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD) += jbd.o + +jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c new file mode 100644 index 00000000..e4b87bc1 --- /dev/null +++ b/fs/jbd/checkpoint.c @@ -0,0 +1,757 @@ +/* + * linux/fs/jbd/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. 
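[Editorial aside, not part of the patch: the checkpoint lists managed below are circular doubly-linked lists threaded through journal_heads, with the transaction holding a pointer to one element as the list head. This standalone sketch mirrors the shape of the __buffer_unlink_first() helper that follows; struct node stands in for a journal_head.]

#include <assert.h>
#include <stddef.h>

/* Illustrative stand-in for a journal_head on a checkpoint list. */
struct node {
	struct node *next, *prev;
};

/* Unlink n from a circular list whose head pointer is *head. */
static void unlink_node(struct node **head, struct node *n)
{
	n->next->prev = n->prev;
	n->prev->next = n->next;
	if (*head == n) {
		*head = n->next;
		if (*head == n)		/* n was the only element */
			*head = NULL;
	}
}

int main(void)
{
	struct node a, b, *head = &a;

	/* Two-element circular list: a <-> b. */
	a.next = a.prev = &b;
	b.next = b.prev = &a;

	unlink_node(&head, &a);
	assert(head == &b && b.next == &b && b.prev == &b);

	unlink_node(&head, &b);
	assert(head == NULL);
	return 0;
}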
+ */ + +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __journal_remove_checkpoint(jh) + 1; + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "release"); + __brelse(bh); + } else { + jbd_unlock_bh_state(bh); + } + return ret; +} + +/* + * __log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __log_wait_for_space(journal_t *journal) +{ + int nblocks, space_left; + assert_spin_locked(&journal->j_state_lock); + + nblocks = jbd_space_needed(journal); + while (__log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; + spin_unlock(&journal->j_state_lock); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. 
+ */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + nblocks = jbd_space_needed(journal); + space_left = __log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + if (chkpt) { + log_do_checkpoint(journal); + } else if (cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + log_wait_commit(journal, tid); + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space\n", __func__); + WARN_ON(1); + journal_abort(journal, 0); + } + spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +/* + * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. + * The caller must restart a list walk. Wait for someone else to run + * jbd_unlock_bh_state(). + */ +static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) +{ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_lock_bh_state(bh); + jbd_unlock_bh_state(bh); + put_bh(bh); +} + +/* + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * + * Called with j_list_lock held. + */ +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + tid_t this_tid; + int released = 0; + int ret = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return ret; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + + /* + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list + */ + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + } + + return ret; +} + +#define NR_BATCH 64 + +static void +__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +{ + int i; + + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(bhs[i], WRITE); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = bhs[i]; + clear_buffer_jwrite(bh); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Try to flush one buffer from the checkpoint list to disk. 
+ * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. + * + * Called with j_list_lock held and drops it if 1 is returned + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) +{ + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + } else { + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + set_buffer_jwrite(bh); + bhs[*batch_count] = bh; + __buffer_relink_io(jh); + jbd_unlock_bh_state(bh); + (*batch_count)++; + if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); + __flush_batch(journal, bhs, batch_count); + ret = 1; + } + } + return ret; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int log_do_checkpoint(journal_t *journal) +{ + transaction_t *transaction; + tid_t this_tid; + int result; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = cleanup_journal_tail(journal); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. + */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). 
+ */ + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0, err; + + while (!retry && transaction->t_checkpoint_list) { + struct buffer_head *bh; + + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + retry = 1; + break; + } + retry = __process_buffer(journal, jh, bhs,&batch_count); + if (retry < 0 && !result) + result = retry; + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { + spin_unlock(&journal->j_list_lock); + retry = 1; + break; + } + } + + if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } + __flush_batch(journal, bhs, &batch_count); + } + + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one + */ + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; + } +out: + spin_unlock(&journal->j_list_lock); + if (result < 0) + journal_abort(journal, result); + else + result = cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int cleanup_journal_tail(journal_t *journal) +{ + transaction_t * transaction; + tid_t first_tid; + unsigned int blocknr, freed; + + if (is_journal_aborted(journal)) + return 1; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. */ + + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal->j_list_lock); + J_ASSERT(blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) { + spin_unlock(&journal->j_state_lock); + return 1; + } + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? 
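[Editorial aside, not part of the patch: the computation just below relies on unsigned wrap-around when the new tail has passed the end of the log. With hypothetical geometry (usable blocks j_first..j_last-1, old tail 900, new tail 50) it recovers 149 blocks:]

#include <stdio.h>

int main(void)
{
	/* Hypothetical journal geometry; the real fields come from the
	 * journal structure, not from these constants. */
	unsigned int j_first = 1, j_last = 1000, j_tail = 900, blocknr = 50;
	unsigned int freed;

	freed = blocknr - j_tail;		/* wraps as unsigned arithmetic */
	if (blocknr < j_tail)
		freed = freed + j_last - j_first;

	printf("%u\n", freed);			/* 149 blocks recovered */
	return 0;
}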
*/ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + spin_unlock(&journal->j_state_lock); + if (!(journal->j_flags & JFS_ABORT)) + journal_update_superblock(journal, 1); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of buffers reaped (for debug) + */ + +int __journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + int released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; + } while (transaction != last_transaction); +out: + return ret; +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. 
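+ *
+ * A buffer sits on t_checkpoint_list until its writeback has been
+ * submitted and on t_checkpoint_io_list while that I/O is in flight;
+ * the containing transaction can only be dropped once both lists are
+ * empty.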
+ * + * The function returns 1 if it frees the transaction, 0 otherwise. + * + * This function is called with the journal locked. + * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) + */ + +int __journal_remove_checkpoint(struct journal_head *jh) +{ + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + JBUFFER_TRACE(jh, "transaction has no more buffers"); + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) { + JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + goto out; + } + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + + __journal_drop_transaction(journal, transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... */ + wake_up(&journal->j_wait_logspace); + ret = 1; +out: + JBUFFER_TRACE(jh, "exit"); + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. 
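+ *
+ * The J_ASSERT()s below spell the invariant out: every per-transaction
+ * buffer list must already be empty and the transaction must be in
+ * T_FINISHED state before its memory is kfree()d.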
+ */ + +void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_sync_datalist == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_iobuf_list == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(transaction->t_updates == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); + kfree(transaction); +} diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c new file mode 100644 index 00000000..72ffa974 --- /dev/null +++ b/fs/jbd/commit.c @@ -0,0 +1,953 @@ +/* + * linux/fs/jbd/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Default IO end handler for temporary BJ_IO buffer_heads. + */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); +} + +/* + * When an ext3-ordered file is truncated, it is possible that many pages are + * not successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!trylock_page(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +/* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. 
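+ * (BH_Freed will have been set earlier, presumably when the data
+ * buffer was discarded, e.g. by truncate, while the committing
+ * transaction still referenced it; now that the commit is done with
+ * it we can also try to strip the page via release_buffer_page()
+ * above.)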
+ */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + release_buffer_page(bh); + } else + put_bh(bh); +} + +/* + * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is + * held. For ranking reasons we must trylock. If we lose, schedule away and + * return 0. j_list_lock is dropped in this case. + */ +static int inverted_lock(journal_t *journal, struct buffer_head *bh) +{ + if (!jbd_trylock_bh_state(bh)) { + spin_unlock(&journal->j_list_lock); + schedule(); + return 0; + } + return 1; +} + +/* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_write_commit_record(journal_t *journal, + transaction_t *commit_transaction) +{ + struct journal_head *descriptor; + struct buffer_head *bh; + journal_header_t *header; + int ret; + + if (is_journal_aborted(journal)) + return 0; + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return 1; + + bh = jh2bh(descriptor); + + header = (journal_header_t *)(bh->b_data); + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + JBUFFER_TRACE(descriptor, "write commit block"); + set_buffer_dirty(bh); + + if (journal->j_flags & JFS_BARRIER) + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); + else + ret = sync_dirty_buffer(bh); + + put_bh(bh); /* One for getblk() */ + journal_put_journal_head(descriptor); + + return (ret == -EIO); +} + +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) +{ + int i; + + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* We use-up our safety reference in submit_bh() */ + submit_bh(write_op, wbuf[i]); + } +} + +/* + * Submit all the data buffers to disk + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction, + int write_op) +{ + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). 
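+ *
+ * Before blocking in lock_buffer() we first push out everything
+ * already collected in wbuf[] ("write out all data to prevent
+ * deadlocks" below); presumably whoever holds the buffer lock may be
+ * waiting for one of those queued buffers, so submitting them first
+ * keeps us from deadlocking against it.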
+ */ + if (buffer_dirty(bh)) { + if (!trylock_buffer(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? */ + if (!buffer_jbd(bh) || bh2jh(bh) != jh + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + release_data_buffer(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + goto write_out_data; + } + } else if (!locked && buffer_locked(bh)) { + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + put_bh(bh); + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + journal_remove_journal_head(bh); + /* One for our safety reference, other for + * journal_remove_journal_head() */ + put_bh(bh); + release_data_buffer(bh); + } + + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs, write_op); + + return err; +} + +/* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void journal_commit_transaction(journal_t *journal) +{ + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned int blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + struct blk_plug plug; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior journal_flush? 
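+ *
+ * (JFS_FLUSHED presumably marks that the on-disk superblock currently
+ * describes an empty log, e.g. after journal_flush(); rewriting it
+ * here before the commit starts keeps the superblock in step with the
+ * log we are about to refill.)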
*/ + if (journal->j_flags & JFS_FLUSHED) { + jbd_debug(3, "super block updated\n"); + journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT(commit_transaction->t_state == T_RUNNING); + + jbd_debug(1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_LOCKED; + + spin_lock(&commit_transaction->t_handle_lock); + while (commit_transaction->t_updates) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (commit_transaction->t_updates) { + spin_unlock(&commit_transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (commit_transaction->t_outstanding_credits <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A journal_get_undo_access()+journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug (3, "JBD: commit phase 1\n"); + + /* + * Switch to a new revoke table. + */ + journal_switch_revoke_table(journal); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + spin_unlock(&journal->j_state_lock); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. 
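+ * Writing the data blocks out before any of this transaction's
+ * metadata reaches the journal is what gives ordered mode its
+ * guarantee: after a crash, recovered metadata never points at data
+ * blocks whose contents were never written.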
+ */ + blk_start_plug(&plug); + err = journal_submit_data_buffers(journal, commit_transaction, + WRITE_SYNC); + blk_finish_plug(&plug); + + /* + * Wait for all previously submitted IO to complete. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + spin_lock(&journal->j_list_lock); + } + if (unlikely(!buffer_uptodate(bh))) { + if (!trylock_page(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) { + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + release_data_buffer(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); + err = 0; + } + + blk_start_plug(&plug); + + journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); + + /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); + + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); + + descriptor = NULL; + bufs = 0; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. 
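+ *
+ * A descriptor block starts with a journal_header_t and is then
+ * filled with journal_block_tag_t entries, one per following metadata
+ * block, recording the real on-disk block number each journalled copy
+ * belongs to (plus flags such as "escaped" and "same UUID").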
*/ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) { + journal_abort(journal, -EIO); + continue; + } + + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(bh); + set_buffer_dirty(bh); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + err = journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by journal_next_log_block() also. + */ + commit_transaction->t_outstanding_credits--; + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + get_bh(jh2bh(jh)); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_buffer_jwrite(jh2bh(jh)); + /* + * akpm: journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + set_buffer_jwrite(jh2bh(new_jh)); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JFS_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); + +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE_SYNC, bh); + } + cond_resched(); + + /* Force a new descriptor to be generated next + time round the loop. 
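+ *
+ * (A batch is pushed out whenever wbuf[] is full, the descriptor has
+ * no room for another tag, or we have run out of metadata buffers; in
+ * every case the last tag is stamped JFS_FLAG_LAST_TAG before the
+ * buffers are submitted.)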
*/ + descriptor = NULL; + bufs = 0; + } + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* + * akpm: these are BJ_IO, and j_list_lock is not needed. + * See __journal_try_to_free_buffer. + */ +wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_iobuf; + } + if (cond_resched()) + goto wait_for_iobuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + clear_buffer_jwrite(bh); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + journal_unfile_buffer(journal, jh); + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + journal_put_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_buffer_jwrite(bh); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 5\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + journal_unfile_buffer(journal, jh); + journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + journal_abort(journal, err); + + jbd_debug(3, "JBD: commit phase 6\n"); + + /* All metadata is written, now write commit record and do cleanup */ + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_RECORD; + spin_unlock(&journal->j_state_lock); + + if (journal_write_commit_record(journal, commit_transaction)) + err = -EIO; + + if (err) + journal_abort(journal, err); + + /* End of a transaction! 
Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || + jh->b_transaction == journal->j_running_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + */ + if (jh->b_committed_data) { + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + } + } else if (jh->b_frozen_data) { + jbd_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + __journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* A buffer which has been freed while still being + * journaled by a previous transaction may end up still + * being dirty here, but we want to avoid writing back + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another + * use in a different page. */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + JBUFFER_TRACE(jh, "refile for checkpoint writeback"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. 
*/ + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + if (!jh->b_transaction) { + jbd_unlock_bh_state(bh); + /* needs a brelse */ + journal_remove_journal_head(bh); + release_buffer_page(bh); + } else + jbd_unlock_bh_state(bh); + } + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Done with this transaction! */ + + jbd_debug(3, "JBD: commit phase 8\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); + + commit_transaction->t_state = T_FINISHED; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + + spin_unlock(&journal->j_state_lock); + + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __journal_drop_transaction(journal, commit_transaction); + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal->j_list_lock); + + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + wake_up(&journal->j_wait_done_commit); +} diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c new file mode 100644 index 00000000..9f36384e --- /dev/null +++ b/fs/jbd/journal.c @@ -0,0 +1,2081 @@ +/* + * linux/fs/jbd/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. 
+ * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +EXPORT_SYMBOL(journal_start); +EXPORT_SYMBOL(journal_restart); +EXPORT_SYMBOL(journal_extend); +EXPORT_SYMBOL(journal_stop); +EXPORT_SYMBOL(journal_lock_updates); +EXPORT_SYMBOL(journal_unlock_updates); +EXPORT_SYMBOL(journal_get_write_access); +EXPORT_SYMBOL(journal_get_create_access); +EXPORT_SYMBOL(journal_get_undo_access); +EXPORT_SYMBOL(journal_dirty_data); +EXPORT_SYMBOL(journal_dirty_metadata); +EXPORT_SYMBOL(journal_release_buffer); +EXPORT_SYMBOL(journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(journal_flush); +EXPORT_SYMBOL(journal_revoke); + +EXPORT_SYMBOL(journal_init_dev); +EXPORT_SYMBOL(journal_init_inode); +EXPORT_SYMBOL(journal_update_format); +EXPORT_SYMBOL(journal_check_used_features); +EXPORT_SYMBOL(journal_check_available_features); +EXPORT_SYMBOL(journal_set_features); +EXPORT_SYMBOL(journal_create); +EXPORT_SYMBOL(journal_load); +EXPORT_SYMBOL(journal_destroy); +EXPORT_SYMBOL(journal_abort); +EXPORT_SYMBOL(journal_errno); +EXPORT_SYMBOL(journal_ack_err); +EXPORT_SYMBOL(journal_clear_err); +EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); +EXPORT_SYMBOL(journal_start_commit); +EXPORT_SYMBOL(journal_force_commit_nested); +EXPORT_SYMBOL(journal_wipe); +EXPORT_SYMBOL(journal_blocks_per_page); +EXPORT_SYMBOL(journal_invalidatepage); +EXPORT_SYMBOL(journal_try_to_free_buffers); +EXPORT_SYMBOL(journal_force_commit); + +static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +static void __journal_abort_soft (journal_t *journal, int errno); +static const char *journal_dev_name(journal_t *journal, char *buffer); + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + +static int kjournald(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", + journal->j_commit_interval / HZ); + + /* + * And now, wait forever for commit wakeup events. 
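+ *
+ * The loop below sleeps on j_wait_commit and wakes up either because
+ * somebody bumped j_commit_request past j_commit_sequence (an
+ * explicit commit request), because the running transaction's
+ * t_expires timeout has passed, or because the journal is being
+ * unmounted (JFS_UNMOUNT).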
+ */ + spin_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JFS_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal_commit_transaction(journal); + spin_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald\n"); + spin_unlock(&journal->j_state_lock); + refrigerator(); + spin_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JFS_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static int journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald, journal, "kjournald"); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + journal->j_task == NULL); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); +} + +/* + * journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. 
The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + unsigned int blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head *new_jh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + jbd_lock_bh_state(bh_in); +repeat: + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page, KM_USER0); + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JFS_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data, KM_USER0); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page, KM_USER0); + memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + kunmap_atomic(mapped_data, KM_USER0); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. 
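+ * "Escaping" just means zeroing the first 32-bit word of the copy that
+ * goes into the log, so a data block that happens to begin with
+ * JFS_MAGIC_NUMBER cannot be mistaken for a descriptor block; the
+ * JFS_FLAG_ESCAPE tag bit tells recovery to put the magic number back.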
+ */ + if (do_escape) { + mapped_data = kmap_atomic(new_page, KM_USER0); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data, KM_USER0); + } + + set_bh_page(new_bh, new_page, new_offset); + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *jh_out = new_jh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * __log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + * + * Called under j_state_lock + */ + +int __log_space_left(journal_t *journal) +{ + int left = journal->j_free; + + assert_spin_locked(&journal->j_state_lock); + + /* + * Be pessimistic here about the number of those free blocks which + * might be required for log descriptor control blocks. + */ + +#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; +} + +/* + * Called under j_state_lock. Returns true if a transaction commit was started. + */ +int __log_start_commit(journal_t *journal, tid_t target) +{ + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. + */ + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + spin_lock(&journal->j_state_lock); + ret = __log_start_commit(journal, tid); + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * We can only force the running transaction if we don't have an active handle; + * otherwise, we will deadlock. + * + * Returns true if a transaction was started. 
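+ * ("Started" is read loosely: if a commit is already in progress we
+ * just wait for it and still return 1; we return 0 only when there is
+ * nothing we can safely force, either because no transaction exists
+ * or because the running one cannot be committed from under our own
+ * active handle.)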
+ */ +int journal_force_commit_nested(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + return 1; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If ext3_write_super() recently started a commit, then we + * have to wait for completion of that transaction + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + +#ifdef CONFIG_JBD_DEBUG + spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_EMERG + "%s: error: j_commit_request=%d, tid=%d\n", + __func__, journal->j_commit_request, tid); + } + spin_unlock(&journal->j_state_lock); +#endif + spin_lock(&journal->j_state_lock); + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) { + printk(KERN_EMERG "journal commit I/O error\n"); + err = -EIO; + } + return err; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JFS_BARRIER)) + return 0; + spin_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + /* + * Transaction is being committed and we already proceeded to + * writing commit record? 
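+ * If it has reached T_COMMIT_RECORD the flush/FUA that goes with the
+ * commit block may already be on its way, so we can no longer promise
+ * the caller that a barrier is still to come (return 0); otherwise
+ * the commit, and its barrier, are still ahead of us (return 1).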
+ */ + commit_trans = journal->j_committing_transaction; + if (commit_trans && commit_trans->t_tid == tid && + commit_trans->t_state >= T_COMMIT_RECORD) + goto out; + ret = 1; +out: + spin_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(journal_trans_will_send_data_barrier); + +/* + * Log buffer allocation routines: + */ + +int journal_next_log_block(journal_t *journal, unsigned int *retp) +{ + unsigned int blocknr; + + spin_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + spin_unlock(&journal->j_state_lock); + return journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) +{ + int err = 0; + unsigned int ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + char b[BDEVNAME_SIZE]; + + printk(KERN_ALERT "%s: journal block not found " + "at offset %u on %s\n", + __func__, + blocknr, + bdevname(journal->j_dev, b)); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct journal_head *journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned int blocknr; + int err; + + err = journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return journal_add_journal_head(bh); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. 
*/ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + goto fail; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JFS_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + goto fail; + } + return journal; +fail: + return NULL; +} + +/* journal_init_dev and journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ +journal_t * journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + int start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + int n; + + if (!journal) + return NULL; + + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; +} + +/** + * journal_t * journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. 
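+ *
+ * A typical lifecycle, sketched for illustration only (error handling
+ * omitted, and the exact sequence depends on the filesystem):
+ *
+ *	journal = journal_init_inode(journal_inode);
+ *	journal_load(journal);                  replay and reset the log
+ *	handle = journal_start(journal, nblocks);
+ *	journal_get_write_access(handle, bh);
+ *	... modify the buffer ...
+ *	journal_dirty_metadata(handle, bh);
+ *	journal_stop(handle);
+ *	...
+ *	journal_destroy(journal);               on unmount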
+ */ +journal_t * journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + int err; + int n; + unsigned int blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + err = journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; +} + +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned int first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* Add the dynamic fields and write it to disk. */ + journal_update_superblock(journal, 1); + return journal_start_thread(journal); +} + +/** + * int journal_create() - Initialise the new journal file + * @journal: Journal to create. This structure must have been initialised + * + * Given a journal_t structure which tells us which disk blocks we can + * use, create a new journal superblock and initialise all of the + * journal fields from scratch. 
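journal_reset(), a little further down, derives the in-memory log geometry from the superblock's s_first and s_maxlen. A small sketch of those derivations under an assumed geometry; MIN_JOURNAL_BLOCKS here is only a stand-in for JFS_MIN_JOURNAL_BLOCKS and the sizes are made up for illustration.

#include <stdio.h>

#define MIN_JOURNAL_BLOCKS 1024   /* assumed stand-in for JFS_MIN_JOURNAL_BLOCKS */

int main(void)
{
    /* Assumed on-disk values: s_first and s_maxlen as journal_reset()
     * would read them from the superblock. */
    unsigned int first = 1, maxlen = 8192;
    unsigned int last = maxlen;

    if (first + MIN_JOURNAL_BLOCKS > last + 1) {
        printf("journal too short\n");
        return 1;
    }

    /* Same derived fields journal_reset() fills in. */
    unsigned int free_blocks = last - first;          /* usable log space    */
    unsigned int max_txn_buffers = maxlen / 4;        /* per-transaction cap */

    printf("log range [%u, %u), %u free blocks, "
           "%u buffers max per transaction\n",
           first, last, free_blocks, max_txn_buffers);
    return 0;
}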
+ **/ +int journal_create(journal_t *journal) +{ + unsigned int blocknr; + struct buffer_head *bh; + journal_superblock_t *sb; + int i, err; + + if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { + printk (KERN_ERR "Journal length (%d blocks) too short.\n", + journal->j_maxlen); + journal_fail_superblock(journal); + return -EINVAL; + } + + if (journal->j_inode == NULL) { + /* + * We don't know what block to start at! + */ + printk(KERN_EMERG + "%s: creation of journal on external device!\n", + __func__); + BUG(); + } + + /* Zero out the entire journal on disk. We cannot afford to + have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ + jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); + for (i = 0; i < journal->j_maxlen; i++) { + err = journal_bmap(journal, i, &blocknr); + if (err) + return err; + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (unlikely(!bh)) + return -ENOMEM; + lock_buffer(bh); + memset (bh->b_data, 0, journal->j_blocksize); + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + __brelse(bh); + } + + sync_blockdev(journal->j_dev); + jbd_debug(1, "JBD: journal cleared.\n"); + + /* OK, fill in the initial static fields in the new superblock */ + sb = journal->j_superblock; + + sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + + sb->s_blocksize = cpu_to_be32(journal->j_blocksize); + sb->s_maxlen = cpu_to_be32(journal->j_maxlen); + sb->s_first = cpu_to_be32(1); + + journal->j_transaction_sequence = 1; + + journal->j_flags &= ~JFS_ABORT; + journal->j_format_version = 2; + + return journal_reset(journal); +} + +/** + * void journal_update_superblock() - Update journal sb on disk. + * @journal: The journal to update. + * @wait: Set to '0' if you don't want to wait for IO completion. + * + * Update a journal's dynamic superblock fields and write it to disk, + * optionally waiting for the IO to complete. + */ +void journal_update_superblock(journal_t *journal, int wait) +{ + journal_superblock_t *sb = journal->j_superblock; + struct buffer_head *bh = journal->j_sb_buffer; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0) and there are no outstanding transactions + * in the filesystem, then we can safely defer the superblock update + * until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0 && journal->j_tail_sequence == + journal->j_transaction_sequence) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + goto out; + } + + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. 
+ */ + printk(KERN_ERR "JBD: previous I/O error detected " + "for journal superblock update for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + spin_lock(&journal->j_state_lock); + jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, journal->j_errno); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(journal->j_tail); + sb->s_errno = cpu_to_be32(journal->j_errno); + spin_unlock(&journal->j_state_lock); + + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: I/O error detected " + "when updating journal superblock for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else + write_dirty_buffer(bh, WRITE); + +out: + /* If we have just flushed the log (by marking s_start==0), then + * any future commit will have to be careful to update the + * superblock again to re-record the true start of the log. */ + + spin_lock(&journal->j_state_lock); + if (sb->s_start) + journal->j_flags &= ~JFS_FLUSHED; + else + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + goto out; + } + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JFS_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int journal_load() - Read journal from disk. + * @journal: Journal to act on. 
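journal_get_superblock() trusts nothing until the magic, blocksize, length and start block have been checked, all read through be32_to_cpu() because the superblock is big-endian on disk. A hedged userspace sketch of those checks follows; it uses htonl()/ntohl() as stand-ins for cpu_to_be32()/be32_to_cpu(), and the V2 blocktype code is an assumption for the sketch rather than something taken from this file.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>   /* htonl/ntohl as stand-ins for cpu_to_be32/be32_to_cpu */

#define TOY_MAGIC 0xc03b3998u   /* the JBD on-disk magic number */

/* A cut-down view of the superblock fields the validation path looks at. */
struct toy_journal_sb {
    uint32_t h_magic;        /* big-endian on disk */
    uint32_t h_blocktype;    /* V1 or V2 superblock */
    uint32_t s_blocksize;
    uint32_t s_maxlen;
    uint32_t s_first;
};

static int validate_sb(const struct toy_journal_sb *sb,
                       unsigned int expected_blocksize)
{
    if (ntohl(sb->h_magic) != TOY_MAGIC ||
        ntohl(sb->s_blocksize) != expected_blocksize)
        return -1;               /* "no valid journal superblock found" */
    if (ntohl(sb->s_first) == 0 ||
        ntohl(sb->s_first) >= ntohl(sb->s_maxlen))
        return -1;               /* invalid start block of journal */
    return 0;
}

int main(void)
{
    struct toy_journal_sb sb = {
        .h_magic     = htonl(TOY_MAGIC),
        .h_blocktype = htonl(4),         /* assumed code for a V2 superblock */
        .s_blocksize = htonl(4096),
        .s_maxlen    = htonl(8192),
        .s_first     = htonl(1),
    };
    printf("superblock %s\n", validate_sb(&sb, 4096) ? "rejected" : "ok");
    return 0;
}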
+ * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (journal_recover(journal)) + goto recovery_error; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JFS_ABORT; + journal->j_flags |= JFS_LOADED; + return 0; + +recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; +} + +/** + * void journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int journal_destroy(journal_t *journal) +{ + int err = 0; + + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + log_do_checkpoint(journal); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + journal_update_superblock(journal, 1); + } else { + err = -EIO; + } + brelse(journal->j_sb_buffer); + } + + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + journal_destroy_revoke(journal); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + *int journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. 
+ **/ + +int journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && + (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +} + + +/** + * int journal_update_format () - Update on-disk journal structure. + * @journal: Journal to act on. + * + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. 
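The three feature-mask helpers documented above reduce to plain bitmask algebra: "used" means every requested bit is already set in the superblock, "available" means every requested bit is one this implementation knows about, and journal_set_features() ORs the bits in once both checks pass. A minimal sketch of that flow, with an assumed incompat bit standing in for the revoke feature.

#include <stdio.h>

/* Assumed illustrative feature bit; not copied from a header. */
#define TOY_INCOMPAT_REVOKE   0x00000001u
#define KNOWN_INCOMPAT        (TOY_INCOMPAT_REVOKE)

/* journal_check_used_features(): does the journal already use all of these? */
static int uses_all(unsigned int sb_incompat, unsigned int want)
{
    return (sb_incompat & want) == want;
}

/* journal_check_available_features(): can this implementation support them? */
static int supports_all(unsigned int want)
{
    return (want & KNOWN_INCOMPAT) == want;
}

int main(void)
{
    unsigned int sb_incompat = 0;                 /* freshly created journal */
    unsigned int want = TOY_INCOMPAT_REVOKE;

    if (!uses_all(sb_incompat, want) && supports_all(want)) {
        sb_incompat |= want;                      /* journal_set_features()  */
        printf("revoke feature enabled (incompat mask 0x%x)\n", sb_incompat);
    }
    return 0;
}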
+ */ +int journal_update_format (journal_t *journal) +{ + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V2: + return 0; + case JFS_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; +} + +static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) +{ + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = be32_to_cpu(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + return 0; +} + + +/** + * int journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + unsigned int old_tail; + + spin_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + } else { + spin_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + err = log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + spin_lock(&journal->j_state_lock); + old_tail = journal->j_tail; + journal->j_tail = 0; + spin_unlock(&journal->j_state_lock); + journal_update_superblock(journal, 1); + spin_lock(&journal->j_state_lock); + journal->j_tail = old_tail; + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + spin_unlock(&journal->j_state_lock); + return 0; +} + +/** + * int journal_wipe() - Wipe journal contents + * @journal: Journal to act on. 
+ * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int journal_wipe(journal_t *journal, int write) +{ + int err = 0; + + J_ASSERT (!(journal->j_flags & JFS_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = journal_skip_recovery(journal); + if (write) + journal_update_superblock(journal, 1); + + no_recovery: + return err; +} + +/* + * journal_dev_name: format a character string to describe on what + * device this journal is present. + */ + +static const char *journal_dev_name(journal_t *journal, char *buffer) +{ + struct block_device *bdev; + + if (journal->j_inode) + bdev = journal->j_inode->i_sb->s_bdev; + else + bdev = journal->j_dev; + + return bdevname(bdev, buffer); +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal function, which provide abort to te jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +static void __journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + char b[BDEVNAME_SIZE]; + + if (journal->j_flags & JFS_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal_dev_name(journal, b)); + + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JFS_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __journal_abort_hard(journal); + + if (errno) + journal_update_superblock(journal, 1); +} + +/** + * void journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. 
+ * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final journal_stop, which will receive the -EIO error. + * + * Finally, the journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno numbet set with journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int journal_errno(journal_t *journal) +{ + int err; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + err = journal->j_errno; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * int journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ +int journal_clear_err(journal_t *journal) +{ + int err = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * void journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. 
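The error-state helpers form a small state machine: the first abort latches JFS_ABORT and records an errno, journal_errno() reports -EROFS for as long as the abort flag is set, and the clear/ack pair is what lets a filesystem leave read-only mode again. A toy model of that behaviour follows; the struct and function names are invented for the sketch.

#include <stdio.h>
#include <errno.h>

/* Minimal model of the state journal_abort()/journal_errno() manage. */
struct toy_journal_err {
    int aborted;   /* plays the role of JFS_ABORT */
    int errnum;    /* plays the role of j_errno   */
};

static void toy_abort(struct toy_journal_err *j, int errnum)
{
    if (j->aborted)
        return;             /* the first abort wins */
    if (!j->errnum)
        j->errnum = errnum;
    j->aborted = 1;
}

/* Mirrors journal_errno(): an aborted journal always reports -EROFS. */
static int toy_errno(const struct toy_journal_err *j)
{
    return j->aborted ? -EROFS : j->errnum;
}

int main(void)
{
    struct toy_journal_err j = { 0, 0 };

    toy_abort(&j, -EIO);        /* e.g. a filesystem reporting an I/O failure */
    printf("journal_errno() -> %d (EROFS while aborted)\n", toy_errno(&j));
    return 0;
}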
+ */ +void journal_ack_err(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JFS_ACK_ERR; + spin_unlock(&journal->j_state_lock); +} + +int journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * Journal_head storage management + */ +static struct kmem_cache *journal_head_cache; +#ifdef CONFIG_JBD_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int journal_init_journal_head_cache(void) +{ + int retval; + + J_ASSERT(journal_head_cache == NULL); + journal_head_cache = kmem_cache_create("journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + retval = 0; + if (!journal_head_cache) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; +} + +static void journal_destroy_journal_head_cache(void) +{ + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + if (ret == NULL) { + jbd_debug(1, "out of memory for journal_head\n"); + printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __func__); + + while (ret == NULL) { + yield(); + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head may be detached from its buffer_head when the journal_head's + * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. + * Various places in JBD call journal_remove_journal_head() to indicate that the + * journal_head can be dropped if needed. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = journal_add_journal_head(bh); + * ... + * jh->b_transaction = xxx; + * journal_put_journal_head(jh); + * + * Now, the journal_head's b_jcount is zero, but it is safe from being released + * because it has a non-zero b_transaction. + */ + +/* + * Give a buffer_head a journal_head. + * + * Doesn't need the journal lock. + * May sleep. 
+ */ +struct journal_head *journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) { + new_jh = journal_alloc_journal_head(); + memset(new_jh, 0, sizeof(*new_jh)); + } + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + + get_bh(bh); + if (jh->b_jcount == 0) { + if (jh->b_transaction == NULL && + jh->b_next_transaction == NULL && + jh->b_cp_transaction == NULL) { + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing " + "b_frozen_data\n", + __func__); + jbd_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing " + "b_committed_data\n", + __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + __brelse(bh); + journal_free_journal_head(jh); + } else { + BUFFER_TRACE(bh, "journal_head was locked"); + } + } +} + +/* + * journal_remove_journal_head(): if the buffer isn't attached to a transaction + * and has a zero b_jcount then remove and release its journal_head. If we did + * see that the buffer is not used by any transaction we also "logically" + * decrement ->b_count. + * + * We in fact take an additional increment on ->b_count as a convenience, + * because the caller usually wants to do additional things with the bh + * after calling here. + * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some + * time. Once the caller has run __brelse(), the buffer is eligible for + * reaping by try_to_free_buffers(). + */ +void journal_remove_journal_head(struct buffer_head *bh) +{ + jbd_lock_bh_journal_head(bh); + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then try to + * release the journal_head from the buffer_head. 
+ */ +void journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount && !jh->b_transaction) { + __journal_remove_journal_head(bh); + __brelse(bh); + } + jbd_unlock_bh_journal_head(bh); +} + +/* + * debugfs tunables + */ +#ifdef CONFIG_JBD_DEBUG + +u8 journal_enable_debug __read_mostly; +EXPORT_SYMBOL(journal_enable_debug); + +static struct dentry *jbd_debugfs_dir; +static struct dentry *jbd_debug; + +static void __init jbd_create_debugfs_entry(void) +{ + jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); + if (jbd_debugfs_dir) + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, + jbd_debugfs_dir, + &journal_enable_debug); +} + +static void __exit jbd_remove_debugfs_entry(void) +{ + debugfs_remove(jbd_debug); + debugfs_remove(jbd_debugfs_dir); +} + +#else + +static inline void jbd_create_debugfs_entry(void) +{ +} + +static inline void jbd_remove_debugfs_entry(void) +{ +} + +#endif + +struct kmem_cache *jbd_handle_cache; + +static int __init journal_init_handle_cache(void) +{ + jbd_handle_cache = kmem_cache_create("journal_handle", + sizeof(handle_t), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + if (jbd_handle_cache == NULL) { + printk(KERN_EMERG "JBD: failed to create handle cache\n"); + return -ENOMEM; + } + return 0; +} + +static void journal_destroy_handle_cache(void) +{ + if (jbd_handle_cache) + kmem_cache_destroy(jbd_handle_cache); +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_journal_head_cache(); + if (ret == 0) + ret = journal_init_handle_cache(); + return ret; +} + +static void journal_destroy_caches(void) +{ + journal_destroy_revoke_caches(); + journal_destroy_journal_head_cache(); + journal_destroy_handle_cache(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret != 0) + journal_destroy_caches(); + jbd_create_debugfs_entry(); + return ret; +} + +static void __exit journal_exit(void) +{ +#ifdef CONFIG_JBD_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); +#endif + jbd_remove_debugfs_entry(); + journal_destroy_caches(); +} + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c new file mode 100644 index 00000000..5b43e967 --- /dev/null +++ b/fs/jbd/recovery.c @@ -0,0 +1,587 @@ +/* + * linux/fs/jbd/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. 
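journal_recover(), defined further down in this file, drives recovery as three passes over the same log: SCAN to find where the log ends, REVOKE to collect revoke records, and REPLAY to write back every block not covered by a revoke, after which the transaction counter restarts past everything replayed. A skeletal sketch of that driver; toy_one_pass() is a stub with made-up transaction numbers, not the real do_one_pass().

#include <stdio.h>

/* The three passes journal_recover() runs, in order. */
enum toy_pass { PASS_SCAN, PASS_REVOKE, PASS_REPLAY };

struct toy_recovery_info {
    unsigned int start_transaction, end_transaction;
    int nr_replays, nr_revokes, nr_revoke_hits;
};

/* Stub standing in for do_one_pass(); a real pass walks the log blocks. */
static int toy_one_pass(struct toy_recovery_info *info, enum toy_pass pass)
{
    if (pass == PASS_SCAN) {             /* pass 1 finds where the log ends */
        info->start_transaction = 7;     /* assumed values, for illustration */
        info->end_transaction = 10;
    }
    return 0;
}

int main(void)
{
    struct toy_recovery_info info = { 0 };
    int err;

    err = toy_one_pass(&info, PASS_SCAN);       /* find end of log          */
    if (!err)
        err = toy_one_pass(&info, PASS_REVOKE); /* collect revoke records   */
    if (!err)
        err = toy_one_pass(&info, PASS_REPLAY); /* replay unrevoked blocks  */

    /* As in journal_recover(): restart numbering past everything replayed. */
    unsigned int next_seq = info.end_transaction + 1;
    printf("recovered %u..%u, next transaction ID %u\n",
           info.start_transaction, info.end_transaction, next_seq);
    return 0;
}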
+ */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned int blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = journal_bmap(journal, next, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned int blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD: corrupted journal superblock\n"); + return -EIO; + } + + err = journal_bmap(journal, offset, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + + +/* + * Count the number of in-use tags in a journal descriptor block. 
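The readahead helper further down works in fixed 128K windows, clipped to the end of the journal, and submits buffers in batches of MAXBUF. A small sketch of that window arithmetic under an assumed journal geometry.

#include <stdio.h>

#define MAXBUF 8   /* same batch size recovery.c submits per ll_rw_block() call */

int main(void)
{
    /* Assumed geometry, for illustration only. */
    unsigned int blocksize = 4096, maxlen = 20, start = 15;

    /* 128K of readahead, clipped to the end of the journal. */
    unsigned int max = start + (128 * 1024 / blocksize);
    if (max > maxlen)
        max = maxlen;

    unsigned int window = max - start;
    unsigned int batches = (window + MAXBUF - 1) / MAXBUF;

    printf("readahead blocks %u..%u (%u blocks, up to %u submissions)\n",
           start, max - 1, window, batches);
    return 0;
}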
+ */ + +static int count_tags(struct buffer_head *bh, int size) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0; + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += sizeof(journal_block_tag_t); + if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int journal_recover(journal_t *journal) +{ + int err, err2; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(1, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + journal_clear_revoke(journal); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + + return err; +} + +/** + * journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ +int journal_skip_recovery(journal_t *journal) +{ + int err; + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? 
"" : "s"); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned int next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd_debug(3, "JBD: checking block %u\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; otherwise, just skip over the + * blocks it describes. */ + if (pass != PASS_REPLAY) { + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); + brelse(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) + <= journal->j_blocksize) { + unsigned int io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. 
*/ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %u in log\n", + err, io_block); + } else { + unsigned int blocknr; + + J_ASSERT(obh != NULL); + blocknr = be32_to_cpu(tag->t_blocknr); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JFS_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JFS_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += sizeof(journal_block_tag_t); + if (!(flags & JFS_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JFS_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JFS_COMMIT_BLOCK: + /* Found an expected commit block: not much to + * do other than move on to the next sequence + * number. */ + brelse(bh); + next_commit_ID++; + continue; + + case JFS_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) + info->end_transaction = next_commit_ID; + else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; +} + + +/* Scan a revoke record, marking all blocks mentioned as revoked. */ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + journal_revoke_header_t *header; + int offset, max; + + header = (journal_revoke_header_t *) bh->b_data; + offset = sizeof(journal_revoke_header_t); + max = be32_to_cpu(header->r_count); + + while (offset < max) { + unsigned int blocknr; + int err; + + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + offset += 4; + err = journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c new file mode 100644 index 00000000..305a9076 --- /dev/null +++ b/fs/jbd/revoke.c @@ -0,0 +1,706 @@ +/* + * linux/fs/jbd/revoke.c + * + * Written by Stephen C. 
Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. 
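The RevokeValid/Revoked pair described above gives three meaningful states, and journal_cancel_revoke() (later in this file) only gets to skip the revoke-hash lookup in the "valid and clear" case. A userspace model of that decision; the struct is invented here and merely stands in for the two buffer_head state bits.

#include <stdio.h>

/* The two buffer bits behind the tri-state described above. */
struct toy_bh_state {
    int revoke_valid;   /* cached revoke status is trustworthy */
    int revoked;        /* meaningful only when revoke_valid is set */
};

/* Mirrors the decision journal_cancel_revoke() makes: only when the cached
 * state is valid *and* clear can the hash-table lookup be skipped. */
static int needs_full_cancel(struct toy_bh_state *bh)
{
    if (bh->revoke_valid) {
        int was_revoked = bh->revoked;
        bh->revoked = 0;
        return was_revoked;        /* cancel only if a revoke was cached */
    }
    bh->revoke_valid = 1;          /* no cached state: must search the hash */
    bh->revoked = 0;
    return 1;
}

int main(void)
{
    struct toy_bh_state fresh   = { 0, 0 };   /* RevokeValid clear  */
    struct toy_bh_state clean   = { 1, 0 };   /* valid, not revoked */
    struct toy_bh_state revoked = { 1, 1 };   /* valid, revoked     */

    printf("fresh:   %s\n", needs_full_cancel(&fresh)   ? "search hash" : "skip");
    printf("clean:   %s\n", needs_full_cancel(&clean)   ? "search hash" : "skip");
    printf("revoked: %s\n", needs_full_cancel(&revoked) ? "search hash" : "skip");
    return 0;
}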
+ */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif +#include + +static struct kmem_cache *revoke_record_cache; +static struct kmem_cache *revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned int blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); +#endif + +/* Utility functions to maintain the revoke table */ + +/* Borrowed from buffer.c: this is a tried and tested block hash function */ +static inline int hash(journal_t *journal, unsigned int block) +{ + struct jbd_revoke_table_s *table = journal->j_revoke; + int hash_shift = table->hash_shift; + + return ((block << (hash_shift - 6)) ^ + (block >> 13) ^ + (block << (hash_shift - 12))) & (table->hash_size - 1); +} + +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. 
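The hash() routine above mixes the block number with two left shifts and one right shift before masking with hash_size - 1, which is why the table size must be a power of two. The sketch below assumes a 16384-bucket table so that both shift amounts (hash_shift - 6 and hash_shift - 12) stay non-negative; the bucket values printed are purely illustrative.

#include <stdio.h>

/* Same mixing as the revoke-table hash() above, pulled out for illustration.
 * hash_size must be a power of two; hash_shift is log2(hash_size). */
static unsigned int toy_revoke_hash(unsigned int block,
                                    int hash_shift, int hash_size)
{
    return ((block << (hash_shift - 6)) ^
            (block >> 13) ^
            (block << (hash_shift - 12))) & (hash_size - 1);
}

int main(void)
{
    /* Assumed table size: large enough that neither shift goes negative. */
    int hash_size = 16384, hash_shift = 14;

    unsigned int blocks[] = { 1, 2, 4096, 4097, 123456789 };
    for (int i = 0; i < 5; i++)
        printf("block %-10u -> bucket %u\n",
               blocks[i], toy_revoke_hash(blocks[i], hash_shift, hash_size));
    return 0;
}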
*/ + +static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned int blocknr) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void journal_destroy_revoke_caches(void) +{ + if (revoke_record_cache) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + } + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } +} + +int __init journal_init_revoke_caches(void) +{ + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); + + revoke_record_cache = kmem_cache_create("revoke_record", + sizeof(struct jbd_revoke_record_s), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!revoke_record_cache) + goto record_cache_failure; + + revoke_table_cache = kmem_cache_create("revoke_table", + sizeof(struct jbd_revoke_table_s), + 0, SLAB_TEMPORARY, NULL); + if (!revoke_table_cache) + goto table_cache_failure; + + return 0; + +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} + +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) +{ + int shift = 0; + int tmp = hash_size; + struct jbd_revoke_table_s *table; + + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + while((tmp >>= 1UL) != 0UL) + shift++; + + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&table->hash_table[tmp]); + +out: + return table; +} + +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! 
*/ +void journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, journal_revoke() will decrement its b_count + * by one. + */ + +int journal_revoke(handle_t *handle, unsigned int blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. */ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call journal_forget"); + journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from journal_get_write_access). + * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. 
+ * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(revoke_record_cache, record); + did_revoke = 1; + } + } + +#ifdef JBD_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } + return did_revoke; +} + +/* journal_switch_revoke table select j_revoke for next transaction + * we do not want to suspend any processing until all revokes are + * written -bzzz + */ +void journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, int write_op) +{ + struct journal_head *descriptor; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? 
+ journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, + &descriptor, &offset, + record, write_op); + count++; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset, write_op); + jbd_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct journal_head **descriptorp, + int *offsetp, + struct jbd_revoke_record_s *record, + int write_op) +{ + struct journal_head *descriptor; + int offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset, write_op); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return; + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); + header->h_sequence = cpu_to_be32(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(journal_revoke_header_t); + *descriptorp = descriptor; + } + + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += 4; + *offsetp = offset; +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset, int write_op) +{ + journal_revoke_header_t *header; + struct buffer_head *bh = jh2bh(descriptor); + + if (is_journal_aborted(journal)) { + put_bh(bh); + return; + } + + header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = cpu_to_be32(offset); + set_buffer_jwrite(bh); + BUFFER_TRACE(bh, "write"); + set_buffer_dirty(bh); + write_dirty_buffer(bh, write_op); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. 
We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int journal_set_revoke(journal_t *journal, + unsigned int blocknr, + tid_t sequence) +{ + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need to be replayed. + */ + +int journal_test_revoke(journal_t *journal, + unsigned int blocknr, + tid_t sequence) +{ + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } +} diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c new file mode 100644 index 00000000..f7ee81a0 --- /dev/null +++ b/fs/jbd/transaction.c @@ -0,0 +1,2198 @@ +/* + * linux/fs/jbd/transaction.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/hrtimer.h> + +static void __journal_temp_unlink_buffer(struct journal_head *jh); + +/* + * get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition.
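The transaction_t itself is allocated by the caller (start_this_handle) before j_state_lock is taken, so no memory allocation happens in here.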
+ * + * Called under j_state_lock + */ + +static transaction_t * +get_transaction(journal_t *journal, transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + spin_lock_init(&transaction->t_handle_lock); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + + return transaction; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle) +{ + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + transaction_t *new_transaction = NULL; + int ret = 0; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + current->comm, nblocks, + journal->j_max_transaction_buffers); + ret = -ENOSPC; + goto out; + } + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); + if (!new_transaction) { + ret = -ENOMEM; + goto out; + } + } + + jbd_debug(3, "New handle %p going live.\n", handle); + +repeat: + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ + spin_lock(&journal->j_state_lock); +repeat_locked: + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + spin_unlock(&journal->j_state_lock); + ret = -EROFS; + goto out; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + if (!new_transaction) { + spin_unlock(&journal->j_state_lock); + goto alloc_transaction; + } + get_transaction(journal, new_transaction); + new_transaction = NULL; + } + + transaction = journal->j_running_transaction; + + /* + * If the current transaction is locked down for commit, wait for the + * lock to be released. + */ + if (transaction->t_state == T_LOCKED) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_transaction_locked, + &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * If there is not enough space left in the log to write all potential + * buffers requested by this operation, we need to stall pending a log + * checkpoint to free some more log space. 
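That stall is handled further down by __log_wait_for_space(), after which we jump back to repeat_locked and re-check the journal state.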
+ */ + spin_lock(&transaction->t_handle_lock); + needed = transaction->t_outstanding_credits + nblocks; + + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, then start + * to commit it: we can then go back and attach this handle to + * a new transaction. + */ + DEFINE_WAIT(wait); + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + spin_unlock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. Be careful to account for + * those buffers when checkpointing. + */ + + /* + * @@@ AKPM: This seems rather over-defensive. We're giving commit + * a _lot_ of headroom: 1/4 of the journal plus the size of + * the committing transaction. Really, we only need to give it + * committing_transaction->t_outstanding_credits plus "enough" for + * the log control blocks. + * Also, this test is inconsistent with the matching one in + * journal_extend(). + */ + if (__log_space_left(journal) < jbd_space_needed(journal)) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); + spin_unlock(&transaction->t_handle_lock); + __log_wait_for_space(journal); + goto repeat_locked; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. */ + + handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; + transaction->t_handle_count++; + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, transaction->t_outstanding_credits, + __log_space_left(journal)); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); +out: + if (unlikely(new_transaction)) /* It's usually NULL */ + kfree(new_transaction); + return ret; +} + +static struct lock_class_key jbd_handle_key; + +/* Allocate a new handle. This should probably be in a slab... */ +static handle_t *new_handle(int nblocks) +{ + handle_t *handle = jbd_alloc_handle(GFP_NOFS); + if (!handle) + return NULL; + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + + lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); + + return handle; +} + +/** + * handle_t *journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. + * + * This function is visible to journal users (like ext3fs), so is not + * called with the journal already locked. 
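Nested journal_start() calls on the same journal are refcounted: if the task already holds a handle we just bump h_ref and return it.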
+ * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ +handle_t *journal_start(journal_t *journal, int nblocks) +{ + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); + + current->journal_info = handle; + + err = start_this_handle(journal, handle); + if (err < 0) { + jbd_free_handle(handle); + current->journal_info = NULL; + handle = ERR_PTR(err); + } + return handle; +} + +/** + * int journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modifications in advance, but can + * extend its credit if it needs more. + * + * journal_extend tries to give the running handle more buffer credits. + * It does not guarantee that allocation - this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ +int journal_extend(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int result; + int wanted; + + result = -EIO; + if (is_handle_aborted(handle)) + goto out; + + result = 1; + + spin_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (handle->h_transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + spin_lock(&transaction->t_handle_lock); + wanted = transaction->t_outstanding_credits + nblocks; + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + goto unlock; + } + + if (wanted > __log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + goto unlock; + } + + handle->h_buffer_credits += nblocks; + transaction->t_outstanding_credits += nblocks; + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); +unlock: + spin_unlock(&transaction->t_handle_lock); +error_out: + spin_unlock(&journal->j_state_lock); +out: + return result; +} + + +/** + * int journal_restart() - restart a handle. + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem + * operation. + * + * If the journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capable of guaranteeing the requested number of + * credits. + */ + +int journal_restart(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart!
*/ + if (is_handle_aborted(handle)) + return 0; + + /* + * First unlink the handle from its current transaction, and start the + * commit on that. + */ + J_ASSERT(transaction->t_updates > 0); + J_ASSERT(journal_current_handle() == handle); + + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + + if (!transaction->t_updates) + wake_up(&journal->j_wait_updates); + spin_unlock(&transaction->t_handle_lock); + + jbd_debug(2, "restarting handle %p\n", handle); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + + lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle); + return ret; +} + + +/** + * void journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. + * + * This locks out any further updates from being started, and blocks + * until all existing updates have completed, returning only once the + * journal is in a quiescent state with no updates running. + * + * The journal lock should not be held on entry. + */ +void journal_lock_updates(journal_t *journal) +{ + DEFINE_WAIT(wait); + + spin_lock(&journal->j_state_lock); + ++journal->j_barrier_count; + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + + if (!transaction) + break; + + spin_lock(&transaction->t_handle_lock); + if (!transaction->t_updates) { + spin_unlock(&transaction->t_handle_lock); + break; + } + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_updates, &wait); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); + + /* + * We have now established a barrier against other normal updates, but + * we also need to barrier against other journal_lock_updates() calls + * to make sure that we serialise special journal-locked operations + * too. + */ + mutex_lock(&journal->j_barrier); +} + +/** + * void journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with journal_lock_updates(). + * + * Should be called without the journal lock held. + */ +void journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + mutex_unlock(&journal->j_barrier); + spin_lock(&journal->j_state_lock); + --journal->j_barrier_count; + spin_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). 
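This is the common helper behind journal_get_write_access() and journal_get_undo_access(); the latter passes force_copy so a frozen copy is made even if the buffer sits on the committing transaction's Forget list.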
+ * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + + transaction = handle->h_transaction; + journal = transaction->t_journal; + + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + lock_buffer(bh); + jbd_lock_bh_state(bh); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* + * First question: is this buffer already part of the current + * transaction or the existing committing transaction? + */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + warn_dirty_buffer(bh); + } + /* + * In any case we need to clean the dirty flag and we must + * do it under the buffer lock to be sure we don't race + * with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + jbd_unlock_bh_state(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done; + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + goto done; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. 
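In that case (the buffer is on the BJ_Shadow list) we sleep on the BH_Unshadow bit waitqueue below until commit has finished its IO, then retry from the top.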
*/ + + if (jh->b_jlist == BJ_Shadow) { + DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); + + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + /* commit wakes up all shadow buffers after IO */ + for ( ; ; ) { + prepare_to_wait(wqh, &wait.wait, + TASK_UNINTERRUPTIBLE); + if (jh->b_jlist != BJ_Shadow) + break; + schedule(); + } + finish_wait(wqh, &wait.wait); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + jbd_unlock_bh_state(bh); + frozen_buffer = + jbd_alloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_EMERG + "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + goto done; + } + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + + /* + * Finally, if the buffer is not journaled right now, we need to make + * sure it doesn't get written to disk before the caller actually + * commits the new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + } + +done: + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page, KM_USER0); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source, KM_USER0); + } + jbd_unlock_bh_state(bh); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int journal_get_write_access(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh = journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. 
Make sure that the buffer + * completes any outstanding IO before proceeding. */ + rc = do_get_write_access(handle, jh, 0); + journal_put_journal_head(jh); + return rc; +} + + +/* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + +/** + * int journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ +int journal_get_create_access(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* + * The buffer may already belong to this transaction due to pre-zeroing + * in the filesystem's new_block code. It may also be on the previous, + * committing transaction's lists, but it HAS to be in Forget state in + * that case: the transaction must have deleted the buffer for it to be + * reused here. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); + jh->b_transaction = transaction; + + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "set next transaction"); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. 
+ */ + JBUFFER_TRACE(jh, "cancelling revoke"); + journal_cancel_revoke(handle, jh); + journal_put_journal_head(jh); +out: + return err; +} + +/** + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space, we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh = journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_EMERG "%s: No memory for committed data\n", + __func__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd_free(committed_data, bh->b_size); + return err; +} + +/** + * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed + * @handle: transaction + * @bh: bufferhead to mark + * + * Description: + * Mark a buffer as containing dirty data which needs to be flushed before + * we can commit the current transaction. + * + * The buffer is placed on the transaction's data list and is marked as + * belonging to the transaction. + * + * Returns error number or 0 on success. + * + * journal_dirty_data() can be called via page_launder->ext3_writepage + * by kswapd. + */ +int journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + journal_t *journal = handle->h_transaction->t_journal; + int need_brelse = 0; + struct journal_head *jh; + int ret = 0; + + if (is_handle_aborted(handle)) + return ret; + + jh = journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * The buffer could *already* be dirty. Writeout can start + * at any time. 
+ */ + jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); + + /* + * What if the buffer is already part of a running transaction? + * + * There are two cases: + * 1) It is part of the current running transaction. Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. + * + * In case (2), the buffer must not already exist as metadata + * --- that would violate write ordering (a transaction is free + * to write its data at any point, even before the previous + * committing transaction has committed). The caller must + * never, ever allow this to happen: there's nothing we can do + * about it in this layer. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + /* Now that we have bh_state locked, are we really still mapped? */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); + goto no_journal; + } + + if (jh->b_transaction) { + JBUFFER_TRACE(jh, "has transaction"); + if (jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "belongs to older transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* @@@ IS THIS TRUE ? */ + /* + * Not any more. Scenario: someone does a write() + * in data=journal mode. The buffer's transaction has + * moved into commit. Then someone does another + * write() to the file. We do the frozen data copyout + * and set b_next_transaction to point to j_running_t. + * And while we're in that state, someone does a + * writepage() in an attempt to pageout the same area + * of the file via a shared mapping. At present that + * calls journal_dirty_data(), and we get right here. + * It may be too late to journal the data. Simply + * falling through to the next test will suffice: the + * data will be dirty and wil be checkpointed. The + * ordering comments in the next comment block still + * apply. + */ + //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + /* + * If we're journalling data, and this buffer was + * subject to a write(), it could be metadata, forget + * or shadow against the committing transaction. Now, + * someone has dirtied the same darn page via a mapping + * and it is being writepage()'d. + * We *could* just steal the page from commit, with some + * fancy locking there. Instead, we just skip it - + * don't tie the page's buffers to the new transaction + * at all. + * Implication: if we crash before the writepage() data + * is written into the filesystem, recovery will replay + * the write() data. + */ + if (jh->b_jlist != BJ_None && + jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "Not stealing"); + goto no_journal; + } + + /* + * This buffer may be undergoing writeout in commit. We + * can't return from here and let the caller dirty it + * again because that can cause the write-out loop in + * commit to never terminate. + */ + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + need_brelse = 1; + sync_dirty_buffer(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + /* Since we dropped the lock... 
*/ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "buffer got unmapped"); + goto no_journal; + } + /* The buffer may become locked again at any + time if it is redirtied */ + } + + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + + if (jh->b_transaction != NULL) { + JBUFFER_TRACE(jh, "unfile from commit"); + __journal_temp_unlink_buffer(jh); + /* It still points to the committing + * transaction; move it to this one so + * that the refile assert checks are + * happy. */ + jh->b_transaction = handle->h_transaction; + } + /* The buffer will be refiled below */ + + } + /* + * Special case --- the buffer might actually have been + * allocated and then immediately deallocated in the previous, + * committing transaction, so might still be left on that + * transaction's metadata lists. + */ + if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + __journal_temp_unlink_buffer(jh); + jh->b_transaction = handle->h_transaction; + JBUFFER_TRACE(jh, "file as data"); + __journal_file_buffer(jh, handle->h_transaction, + BJ_SyncData); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); + } +no_journal: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + journal_put_journal_head(jh); + return ret; +} + +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * Mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + if (is_handle_aborted(handle)) + goto out; + + jbd_lock_bh_state(bh); + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. 
+ */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_running_transaction); + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + jbd_unlock_bh_state(bh); +out: + JBUFFER_TRACE(jh, "exit"); + return 0; +} + +/* + * journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + +/** + * void journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int journal_forget (handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + + BUFFER_TRACE(bh, "entry"); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto not_jbd; + } + + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. 
+ * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + if (jh->b_cp_transaction) { + __journal_temp_unlink_buffer(jh); + __journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __journal_unfile_buffer(jh); + journal_remove_journal_head(bh); + __brelse(bh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; + } + } + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + jh->b_next_transaction = NULL; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; + } + } + +not_jbd: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +} + +/** + * int journal_stop() - complete a transaction + * @handle: tranaction to complete. + * + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a journal_abort has been executed since the + * transaction began. + */ +int journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int err; + pid_t pid; + + J_ASSERT(journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else { + J_ASSERT(transaction->t_updates > 0); + err = 0; + } + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this transaction. + * Keep doing that while new threads continue to arrive. + * It doesn't cost much - we're about to run a commit and sleep + * on IO anyway. Speeds up many-threaded, many-dir operations + * by 30x or more... + * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is useful for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. 
We achieve this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + if (handle->h_sync) + transaction->t_synchronous_commit = 1; + current->journal_info = NULL; + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + if (!transaction->t_updates) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + transaction->t_outstanding_credits > + journal->j_max_transaction_buffers || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + tid_t tid = transaction->t_tid; + + spin_unlock(&transaction->t_handle_lock); + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + + /* + * Special case: JFS_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + err = log_wait_commit(journal, tid); + } else { + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + } + + lock_map_release(&handle->h_lockdep_map); + + jbd_free_handle(handle); + return err; +} + +/** + * int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * For synchronous operations: force any uncommitted transactions + * to disk. May seem kludgy, but it reuses all the handle batching + * code in a very simple manner. + */ +int journal_force_commit(journal_t *journal) +{ + handle_t *handle; + int ret; + + handle = journal_start(journal, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + } else { + handle->h_sync = 1; + ret = journal_stop(handle); + } + return ret; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. 
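The double-underscore variants expect the caller to hold j_list_lock and the buffer's jbd state lock, as noted on each helper.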
+ * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, + * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller + * is holding onto a copy of one of thee pointers, it could go bad. + * Generally the caller needs to re-read the pointer from the transaction_t. + * + * Called under j_list_lock. The journal may not be locked. + */ +static void __journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +void __journal_unfile_buffer(struct journal_head *jh) +{ + __journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; +} + +void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&journal->j_list_lock); + __journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Called from journal_try_to_free_buffers(). 
+ * + * Called under jbd_lock_bh_state(bh) + */ +static void +__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +{ + struct journal_head *jh; + + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) + goto out; + + if (jh->b_next_transaction != NULL) + goto out; + + spin_lock(&journal->j_list_lock); + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { + if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { + /* A written-back ordered data buffer */ + JBUFFER_TRACE(jh, "release data"); + __journal_unfile_buffer(jh); + journal_remove_journal_head(bh); + __brelse(bh); + } + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + /* written-back checkpointed metadata buffer */ + if (jh->b_jlist == BJ_None) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __journal_remove_checkpoint(jh); + journal_remove_journal_head(bh); + __brelse(bh); + } + } + spin_unlock(&journal->j_list_lock); +out: + return; +} + +/** + * int journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a journal_dirty_data on this + * buffer. So we need to lock against that. journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success + */ +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * journal_remove_journal_head() and journal_put_journal_head(). + */ + jh = journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(page); + +busy: + return ret; +} + +/* + * This buffer is no longer needed. 
If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + __journal_unfile_buffer(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); + __journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + journal_remove_journal_head(bh); + __brelse(bh); + } + return may_free; +} + +/* + * journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + int ret; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. 
--sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + spin_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + jh = journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_no_jh; + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + ret = __dispose_buffer(jh, + journal->j_running_transaction); + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return ret; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + ret = __dispose_buffer(jh, + journal->j_committing_transaction); + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return ret; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + if (jh->b_jlist == BJ_Locked) { + /* + * The buffer is on the committing transaction's locked + * list. We have the buffer locked, so I/O has + * completed. So we can nail the buffer now. + */ + may_free = __dispose_buffer(jh, transaction); + goto zap_buffer; + } + /* + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. 
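+ * (The buffer therefore stays pinned until that commit finishes; and if
+ * a running transaction exists while the buffer is still jbddirty, we
+ * also point b_next_transaction at it so that the post-commit refile
+ * drops the buffer onto that transaction's BJ_Forget list.)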
+ */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + journal_put_journal_head(jh); +zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * void journal_invalidatepage() - invalidate a journal page + * @journal: journal to use for flush + * @page: page to flush + * @offset: length of page to invalidate. + * + * Reap page buffers containing data after offset in page. + */ +void journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + int may_free = 1; + + if (!PageLocked(page)) + BUG(); + if (!page_has_buffers(page)) + return; + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + head = bh = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + may_free &= journal_unmap_buffer(journal, bh); + unlock_buffer(bh); + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!offset) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); + } +} + +/* + * File a buffer on the given transaction list. + */ +void __journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. 
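+ * (Graceful handling here means warning via warn_dirty_buffer() and
+ * then moving the dirty bit into the jbddirty bit (see was_dirty
+ * below), so the update is still tracked by the journal rather than
+ * being silently lost.)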
+ */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __journal_temp_unlink_buffer(jh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under journal->j_list_lock + * + * Called under jbd_lock_bh_state(jh2bh(jh)) + */ +void __journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __journal_unfile_buffer(jh); + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __journal_temp_unlink_buffer(jh); + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * For the unlocked version of this call, also make sure that any + * hanging journal_head is cleaned up if necessary. + * + * __journal_refile_buffer is usually called as part of a single locked + * operation on a buffer_head, in which the caller is probably going to + * be hooking the journal_head onto other lists. In that case it is up + * to the caller to remove the journal_head if necessary. For the + * unlocked journal_refile_buffer call, the caller isn't going to be + * doing anything else to the buffer so we need to do the cleanup + * ourselves to avoid a jh leak. + * + * *** The journal_head may be freed by this call! 
*** + */ +void journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig new file mode 100644 index 00000000..f32f346f --- /dev/null +++ b/fs/jbd2/Kconfig @@ -0,0 +1,33 @@ +config JBD2 + tristate + select CRC32 + help + This is a generic journaling layer for block devices that support + both 32-bit and 64-bit block numbers. It is currently used by + the ext4 and OCFS2 filesystems, but it could also be used to add + journal support to other file systems or block devices such + as RAID or LVM. + + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. + + To compile this device as a module, choose M here. The module will be + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, + you cannot compile this code as a module. + +config JBD2_DEBUG + bool "JBD2 (ext4) debugging support" + depends on JBD2 && DEBUG_FS + help + If you are using the ext4 journaled file system (or + potentially any other filesystem/device using JBD2), this option + allows you to enable debugging output while the system is running, + in order to help track down any problems you are having. + By default, the debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + number between 1 and 5. The higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile new file mode 100644 index 00000000..802a3413 --- /dev/null +++ b/fs/jbd2/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD2) += jbd2.o + +jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c new file mode 100644 index 00000000..2c62c5aa --- /dev/null +++ b/fs/jbd2/checkpoint.c @@ -0,0 +1,798 @@ +/* + * linux/fs/jbd2/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. 
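+ * (The checkpoint list is circular; if removing jh leaves the head
+ * pointing back at jh then jh was the only element and the head is
+ * cleared to NULL.)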
+ */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __jbd2_journal_remove_checkpoint(jh) + 1; + jbd_unlock_bh_state(bh); + BUFFER_TRACE(bh, "release"); + __brelse(bh); + } else { + jbd_unlock_bh_state(bh); + } + return ret; +} + +/* + * __jbd2_log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __jbd2_log_wait_for_space(journal_t *journal) +{ + int nblocks, space_left; + /* assert_spin_locked(&journal->j_state_lock); */ + + nblocks = jbd_space_needed(journal); + while (__jbd2_log_space_left(journal) < nblocks) { + if (journal->j_flags & JBD2_ABORT) + return; + write_unlock(&journal->j_state_lock); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. 
+ */ + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + nblocks = jbd_space_needed(journal); + space_left = __jbd2_log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + if (chkpt) { + jbd2_log_do_checkpoint(journal); + } else if (jbd2_cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + jbd2_log_wait_commit(journal, tid); + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space in %s\n", __func__, + journal->j_devname); + WARN_ON(1); + jbd2_journal_abort(journal, 0); + } + write_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +/* + * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. + * The caller must restart a list walk. Wait for someone else to run + * jbd_unlock_bh_state(). + */ +static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) +{ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_lock_bh_state(bh); + jbd_unlock_bh_state(bh); + put_bh(bh); +} + +/* + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * + * Called with j_list_lock held. + */ +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + tid_t this_tid; + int released = 0; + int ret = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return ret; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + + /* + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list + */ + released = __jbd2_journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + __brelse(bh); + } + + return ret; +} + +static void +__flush_batch(journal_t *journal, int *batch_count) +{ + int i; + + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = journal->j_chkpt_bhs[i]; + clear_buffer_jwrite(bh); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Try to flush one buffer from the checkpoint list to disk. 
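+ * The buffer is either waited on (if it is locked), pushed out via a
+ * commit (if it still belongs to a transaction), dropped from the
+ * checkpoint list (if it is already clean), or queued in
+ * journal->j_chkpt_bhs[] for a batched write-out.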
+ * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. + * + * Called with j_list_lock held and drops it if 1 is returned + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __process_buffer(journal_t *journal, struct journal_head *jh, + int *batch_count, transaction_t *transaction) +{ + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so starting and + * waiting for a commit to finish will cause + * us to wait for a _very_ long time. + */ + printk(KERN_ERR "JBD2: %s: " + "Waiting for Godot: block %llu\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr); + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + get_bh(bh); + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); + } else { + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + set_buffer_jwrite(bh); + journal->j_chkpt_bhs[*batch_count] = bh; + __buffer_relink_io(jh); + jbd_unlock_bh_state(bh); + transaction->t_chp_stats.cs_written++; + (*batch_count)++; + if (*batch_count == JBD2_NR_BATCH) { + spin_unlock(&journal->j_list_lock); + __flush_batch(journal, batch_count); + ret = 1; + } + } + return ret; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int jbd2_log_do_checkpoint(journal_t *journal) +{ + transaction_t *transaction; + tid_t this_tid; + int result; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = jbd2_cleanup_journal_tail(journal); + trace_jbd2_checkpoint(journal, result); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. 
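+ * Dirty checkpoint buffers are collected into journal->j_chkpt_bhs[]
+ * by __process_buffer() and submitted JBD2_NR_BATCH at a time through
+ * __flush_batch(), so the writes go out in large chunks.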
+ */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + if (transaction->t_chp_stats.cs_chp_time == 0) + transaction->t_chp_stats.cs_chp_time = jiffies; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). + */ + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct journal_head *jh; + int retry = 0, err; + + while (!retry && transaction->t_checkpoint_list) { + struct buffer_head *bh; + + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + retry = 1; + break; + } + retry = __process_buffer(journal, jh, &batch_count, + transaction); + if (retry < 0 && !result) + result = retry; + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { + spin_unlock(&journal->j_list_lock); + retry = 1; + break; + } + } + + if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } + __flush_batch(journal, &batch_count); + } + + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one + */ + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; + } +out: + spin_unlock(&journal->j_list_lock); + if (result < 0) + jbd2_journal_abort(journal, result); + else + result = jbd2_cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int jbd2_cleanup_journal_tail(journal_t *journal) +{ + transaction_t * transaction; + tid_t first_tid; + unsigned long blocknr, freed; + + if (is_journal_aborted(journal)) + return 1; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. 
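+ * (Worked example of the wrap-around arithmetic further down, with
+ * made-up numbers: if j_first == 1, j_last == 1024, the old j_tail is
+ * 1000 and the new tail block is 8, then freed = (8 - 1000) + 1024 - 1
+ * = 31 blocks.)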
*/ + + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal->j_list_lock); + J_ASSERT(blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) { + write_unlock(&journal->j_state_lock); + return 1; + } + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? */ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + write_unlock(&journal->j_state_lock); + + /* + * If there is an external journal, we need to make sure that + * any data blocks that were recently written out --- perhaps + * by jbd2_log_do_checkpoint() --- are flushed out before we + * drop the transactions from the external journal. It's + * unlikely this will be necessary, especially with a + * appropriately sized journal, but we need this to guarantee + * correctness. Fortunately jbd2_cleanup_journal_tail() + * doesn't get called all that often. + */ + if ((journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!(journal->j_flags & JBD2_ABORT)) + jbd2_journal_update_superblock(journal, 1); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and + * release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. 
+ * Returns number of buffers reaped (for debug) + */ + +int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + int released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; + } while (transaction != last_transaction); +out: + return ret; +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. + * + * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) + */ + +int __jbd2_journal_remove_checkpoint(struct journal_head *jh) +{ + struct transaction_chp_stats_s *stats; + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + JBUFFER_TRACE(jh, "removing from transaction"); + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + jbd2_journal_put_journal_head(jh); + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of jbd2_journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) + goto out; + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); + + __jbd2_journal_drop_transaction(journal, transaction); + kfree(transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... 
*/ + wake_up(&journal->j_wait_logspace); + ret = 1; +out: + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __jbd2_journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ + +void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_iobuf_list == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); +} diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c new file mode 100644 index 00000000..36c2e800 --- /dev/null +++ b/fs/jbd2/commit.c @@ -0,0 +1,1048 @@ +/* + * linux/fs/jbd2/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Default IO end handler for temporary BJ_IO buffer_heads. 
+ */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); +} + +/* + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under lock_journal(), and possibly under journal_datalist_lock. The + * caller provided us with a ref against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!trylock_page(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +/* + * Done it all: now submit the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) +{ + struct journal_head *descriptor; + struct commit_header *tmp; + struct buffer_head *bh; + int ret; + struct timespec now = current_kernel_time(); + + *cbh = NULL; + + if (is_journal_aborted(journal)) + return 0; + + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) + return 1; + + bh = jh2bh(descriptor); + + tmp = (struct commit_header *)bh->b_data; + tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); + + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); + } + + JBUFFER_TRACE(descriptor, "submit commit block"); + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); + else + ret = submit_bh(WRITE_SYNC, bh); + + *cbh = bh; + return ret; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. 
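+ * The caller submits the commit block with journal_submit_commit_record(),
+ * is free to do other work while that write is in flight, and only then
+ * calls this to wait for the buffer and collect an -EIO, if any.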
+ */ +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + jbd2_journal_put_journal_head(bh2jh(bh)); + + return ret; +} + +/* + * write the filemap data using writepage() address_space_operations. + * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). + */ +static int journal_submit_inode_data_buffers(struct address_space *mapping) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + }; + + ret = generic_writepages(mapping, &wbc); + return ret; +} + +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + spin_unlock(&journal->j_list_lock); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); + err = journal_submit_inode_data_buffers(mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (err) { + /* + * Because AS_EIO is cleared by + * filemap_fdatawait_range(), set it again so + * that user process can get -EIO from fsync(). 
+ */ + set_bit(AS_EIO, + &jinode->i_vfs_inode->i_mapping->flags); + + if (!ret) + ret = err; + } + spin_lock(&journal->j_list_lock); + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + char *addr; + __u32 checksum; + + addr = kmap_atomic(page, KM_USER0); + checksum = crc32_be(crc32_sum, + (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); + kunmap_atomic(addr, KM_USER0); + + return checksum; +} + +static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, + unsigned long long block) +{ + tag->t_blocknr = cpu_to_be32(block & (u32)~0); + if (tag_bytes > JBD2_TAG_SIZE32) + tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); +} + +/* + * jbd2_journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void jbd2_journal_commit_transaction(journal_t *journal) +{ + struct transaction_stats_s stats; + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i, to_free = 0; + int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; + struct blk_plug plug; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior jbd2_journal_flush? 
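+ * (A flush leaves the on-disk superblock describing an empty log, so if
+ * JBD2_FLUSHED is still set we rewrite the superblock here to record a
+ * valid log head before this commit starts adding blocks.)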
*/ + if (journal->j_flags & JBD2_FLUSHED) { + jbd_debug(3, "super block updated\n"); + jbd2_journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT(commit_transaction->t_state == T_RUNNING); + + trace_jbd2_start_commit(journal, commit_transaction); + jbd_debug(1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + write_lock(&journal->j_state_lock); + commit_transaction->t_state = T_LOCKED; + + trace_jbd2_commit_locking(journal, commit_transaction); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_locked = jiffies; + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a jbd2_journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple jbd2_journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + jbd2_journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __jbd2_journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug (3, "JBD: commit phase 1\n"); + + /* + * Switch to a new revoke table. 
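+ * (The journal keeps two revoke hash tables; switching lets the next
+ * running transaction record its revokes in a fresh table while we
+ * write out the table that belongs to this commit.)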
+ */ + jbd2_journal_switch_revoke_table(journal); + + trace_jbd2_commit_flushing(journal, commit_transaction); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + write_unlock(&journal->j_state_lock); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + err = journal_submit_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + + blk_start_plug(&plug); + jbd2_journal_write_revoke_records(journal, commit_transaction, + WRITE_SYNC); + blk_finish_plug(&plug); + + jbd_debug(3, "JBD: commit phase 2\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + write_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + write_unlock(&journal->j_state_lock); + + trace_jbd2_commit_logging(journal, commit_transaction); + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); + stats.run.rs_blocks_logged = 0; + + J_ASSERT(commit_transaction->t_nr_buffers <= + atomic_read(&commit_transaction->t_outstanding_credits)); + + err = 0; + descriptor = NULL; + bufs = 0; + blk_start_plug(&plug); + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + jbd2_buffer_abort_trigger(jh, + jh->b_frozen_data ? + jh->b_frozen_triggers : + jh->b_triggers); + jbd2_journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. 
*/ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) { + jbd2_journal_abort(journal, -EIO); + continue; + } + + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(bh); + set_buffer_dirty(bh); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + jbd2_journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + err = jbd2_journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + jbd2_journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by jbd2_journal_next_log_block() also. + */ + atomic_dec(&commit_transaction->t_outstanding_credits); + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + atomic_inc(&jh2bh(jh)->b_count); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + /* + * akpm: jbd2_journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = jbd2_journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } + set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JBD2_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JBD2_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += tag_bytes; + space_left -= tag_bytes; + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < tag_bytes + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); + +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + /* + * Compute checksum. 
+ */ + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE_SYNC, bh); + } + cond_resched(); + stats.run.rs_blocks_logged += bufs; + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING + "JBD2: Detected IO errors while flushing file data " + "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); + err = 0; + } + + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_need_data_flush && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + + /* Done it all: now write the commit record asynchronously. */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 3\n"); + + /* + * akpm: these are BJ_IO, and j_list_lock is not needed. + * See __journal_try_to_free_buffer. + */ +wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_iobuf; + } + if (cond_resched()) + goto wait_for_iobuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + clear_buffer_jwrite(bh); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + jbd2_journal_unfile_buffer(journal, jh); + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by jbd2_journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + jbd2_journal_put_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_bit(BH_JWrite, &bh->b_state); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* + * Wake up any transactions which were waiting for this IO to + * complete. 
The barrier must be here so that changes by + * jbd2_journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + jbd2_journal_unfile_buffer(journal, jh); + jbd2_journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + jbd2_journal_abort(journal, err); + + jbd_debug(3, "JBD: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + if (cbh) + err = journal_wait_on_commit_record(journal, cbh); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && + journal->j_flags & JBD2_BARRIER) { + blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); + } + + if (err) + jbd2_journal_abort(journal, err); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD: commit phase 6\n"); + + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + int try_to_free = 0; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + * + * We also know that the frozen data has already fired + * its triggers if they exist, so we can clear that too. 
+ */ + if (jh->b_committed_data) { + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + } else if (jh->b_frozen_data) { + jbd2_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; + __jbd2_journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by jbd2_journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* A buffer which has been freed while still being + * journaled by a previous transaction may end up still + * being dirty here, but we want to avoid writing back + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another + * use in a different page. */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* + * The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; + } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Done with this transaction! 
*/ + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); + + commit_transaction->t_start = jiffies; + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction statistics + */ + stats.ts_tid = commit_transaction->t_tid; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); + + commit_transaction->t_state = T_FINISHED; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); + + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + to_free = 1; + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal->j_list_lock); + + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + + trace_jbd2_end_commit(journal, commit_transaction); + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + if (to_free) + kfree(commit_transaction); + + wake_up(&journal->j_wait_done_commit); +} diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c new file mode 100644 index 00000000..40c5fb73 --- /dev/null +++ b/fs/jbd2/journal.c @@ -0,0 +1,2471 @@ +/* + * linux/fs/jbd2/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. 
+ * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include +#include +#include + +EXPORT_SYMBOL(jbd2_journal_extend); +EXPORT_SYMBOL(jbd2_journal_stop); +EXPORT_SYMBOL(jbd2_journal_lock_updates); +EXPORT_SYMBOL(jbd2_journal_unlock_updates); +EXPORT_SYMBOL(jbd2_journal_get_write_access); +EXPORT_SYMBOL(jbd2_journal_get_create_access); +EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_set_triggers); +EXPORT_SYMBOL(jbd2_journal_dirty_metadata); +EXPORT_SYMBOL(jbd2_journal_release_buffer); +EXPORT_SYMBOL(jbd2_journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(jbd2_journal_flush); +EXPORT_SYMBOL(jbd2_journal_revoke); + +EXPORT_SYMBOL(jbd2_journal_init_dev); +EXPORT_SYMBOL(jbd2_journal_init_inode); +EXPORT_SYMBOL(jbd2_journal_update_format); +EXPORT_SYMBOL(jbd2_journal_check_used_features); +EXPORT_SYMBOL(jbd2_journal_check_available_features); +EXPORT_SYMBOL(jbd2_journal_set_features); +EXPORT_SYMBOL(jbd2_journal_load); +EXPORT_SYMBOL(jbd2_journal_destroy); +EXPORT_SYMBOL(jbd2_journal_abort); +EXPORT_SYMBOL(jbd2_journal_errno); +EXPORT_SYMBOL(jbd2_journal_ack_err); +EXPORT_SYMBOL(jbd2_journal_clear_err); +EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); +EXPORT_SYMBOL(jbd2_journal_start_commit); +EXPORT_SYMBOL(jbd2_journal_force_commit_nested); +EXPORT_SYMBOL(jbd2_journal_wipe); +EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_invalidatepage); +EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); +EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); + +static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald2: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. 
+ */ + +static int kjournald2(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + /* + * And now, wait forever for commit wakeup events. + */ + write_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JBD2_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + write_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + jbd2_journal_commit_transaction(journal); + write_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald2\n"); + write_unlock(&journal->j_state_lock); + refrigerator(); + write_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JBD2_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald2 wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + write_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static int jbd2_journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald2, journal, "jbd2/%s", + journal->j_devname); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + write_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, journal->j_task == NULL); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); +} + +/* + * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. 
The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int jbd2_journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + unsigned long long blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head *new_jh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + jbd_lock_bh_state(bh_in); +repeat: + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page, KM_USER0); + /* + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. 
+ */ + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); + + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JBD2_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data, KM_USER0); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd2_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page, KM_USER0); + memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + kunmap_atomic(mapped_data, KM_USER0); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + + /* + * This isn't strictly necessary, as we're using frozen + * data for the escaping, but it keeps consistency with + * b_frozen_data usage. + */ + jh_in->b_frozen_triggers = jh_in->b_triggers; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. + */ + if (do_escape) { + mapped_data = kmap_atomic(new_page, KM_USER0); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data, KM_USER0); + } + + set_bh_page(new_bh, new_page, new_offset); + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *jh_out = new_jh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * __jbd2_log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + * + * Called under j_state_lock + */ + +int __jbd2_log_space_left(journal_t *journal) +{ + int left = journal->j_free; + + /* assert_spin_locked(&journal->j_state_lock); */ + + /* + * Be pessimistic here about the number of those free blocks which + * might be required for log descriptor control blocks. + */ + +#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; +} + +/* + * Called with j_state_lock locked for writing. + * Returns true if a transaction commit was started. + */ +int __jbd2_log_start_commit(journal_t *journal, tid_t target) +{ + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. 
+ */ + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int jbd2_log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + write_lock(&journal->j_state_lock); + ret = __jbd2_log_start_commit(journal, tid); + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * We can only force the running transaction if we don't have an active handle; + * otherwise, we will deadlock. + * + * Returns true if a transaction was started. + */ +int jbd2_journal_force_commit_nested(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + int need_to_start = 0; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + if (!tid_geq(journal->j_commit_request, transaction->t_tid)) + need_to_start = 1; + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + read_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + return 1; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If ext3_write_super() recently started a commit, then we + * have to wait for completion of that transaction + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. 
+ */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int jbd2_log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + + read_lock(&journal->j_state_lock); +#ifdef CONFIG_JBD2_DEBUG + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_EMERG + "%s: error: j_commit_request=%d, tid=%d\n", + __func__, journal->j_commit_request, tid); + } +#endif + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + read_lock(&journal->j_state_lock); + } + read_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) { + printk(KERN_EMERG "journal commit I/O error\n"); + err = -EIO; + } + return err; +} + +/* + * Log buffer allocation routines: + */ + +int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) +{ + unsigned long blocknr; + + write_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + write_unlock(&journal->j_state_lock); + return jbd2_journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, + unsigned long long *retp) +{ + int err = 0; + unsigned long long ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + printk(KERN_ALERT "%s: journal block not found " + "at offset %lu on %s\n", + __func__, blocknr, journal->j_devname); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). 
+ * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned long long blocknr; + int err; + + err = jbd2_journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return jbd2_journal_add_journal_head(bh); +} + +struct jbd2_stats_proc_session { + journal_t *journal; + struct transaction_stats_s *stats; + int start; + int max; +}; + +static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int jbd2_seq_info_show(struct seq_file *seq, void *v) +{ + struct jbd2_stats_proc_session *s = seq->private; + + if (v != SEQ_START_TOKEN) + return 0; + seq_printf(seq, "%lu transaction, each up to %u blocks\n", + s->stats->ts_tid, + s->journal->j_max_transaction_buffers); + if (s->stats->ts_tid == 0) + return 0; + seq_printf(seq, "average: \n %ums waiting for transaction\n", + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums running transaction\n", + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); + seq_printf(seq, " %ums transaction was being locked\n", + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); + seq_printf(seq, " %ums flushing data (in ordered mode)\n", + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); + seq_printf(seq, " %ums logging transaction\n", + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %lluus average transaction commit time\n", + div_u64(s->journal->j_average_commit_time, 1000)); + seq_printf(seq, " %lu handles per transaction\n", + s->stats->run.rs_handle_count / s->stats->ts_tid); + seq_printf(seq, " %lu blocks per transaction\n", + s->stats->run.rs_blocks / s->stats->ts_tid); + seq_printf(seq, " %lu logged blocks per transaction\n", + s->stats->run.rs_blocks_logged / s->stats->ts_tid); + return 0; +} + +static void jbd2_seq_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations jbd2_seq_info_ops = { + .start = jbd2_seq_info_start, + .next = jbd2_seq_info_next, + .stop = jbd2_seq_info_stop, + .show = jbd2_seq_info_show, +}; + +static int jbd2_seq_info_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE(inode)->data; + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s); + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, &journal->j_stats, size); + s->journal = journal; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_info_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = 
seq->private; + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static const struct file_operations jbd2_seq_info_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_info_release, +}; + +static struct proc_dir_entry *proc_jbd2_stats; + +static void jbd2_stats_proc_init(journal_t *journal) +{ + journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); + if (journal->j_proc_entry) { + proc_create_data("info", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_info_fops, journal); + } +} + +static void jbd2_stats_proc_exit(journal_t *journal) +{ + remove_proc_entry("info", journal->j_proc_entry); + remove_proc_entry(journal->j_devname, proc_jbd2_stats); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. */ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + return NULL; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + rwlock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JBD2_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + return NULL; + } + + spin_lock_init(&journal->j_history_lock); + + return journal; +} + +/* jbd2_journal_init_dev and jbd2_journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * jbd2_journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. 
+ * + */ +journal_t * jbd2_journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + char *p; + int n; + + if (!journal) + return NULL; + + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + jbd2_stats_proc_init(journal); + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; +} + +/** + * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * jbd2_journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ +journal_t * jbd2_journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + char *p; + int err; + int n; + unsigned long long blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + p = journal->j_devname + strlen(journal->j_devname); + sprintf(p, "-%lu", journal->j_inode->i_ino); + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + jbd2_stats_proc_init(journal); + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + err = jbd2_journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; +} + +/* + * If the journal init or create aborts, we need to 
mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* Add the dynamic fields and write it to disk. */ + jbd2_journal_update_superblock(journal, 1); + return jbd2_journal_start_thread(journal); +} + +/** + * void jbd2_journal_update_superblock() - Update journal sb on disk. + * @journal: The journal to update. + * @wait: Set to '0' if you don't want to wait for IO completion. + * + * Update a journal's dynamic superblock fields and write it to disk, + * optionally waiting for the IO to complete. + */ +void jbd2_journal_update_superblock(journal_t *journal, int wait) +{ + journal_superblock_t *sb = journal->j_superblock; + struct buffer_head *bh = journal->j_sb_buffer; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0) and there are no outstanding transactions + * in the filesystem, then we can safely defer the superblock update + * until the next commit by setting JBD2_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0 && journal->j_tail_sequence == + journal->j_transaction_sequence) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + goto out; + } + + if (buffer_write_io_error(bh)) { + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. 
+ */ + printk(KERN_ERR "JBD2: previous I/O error detected " + "for journal superblock update for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + read_lock(&journal->j_state_lock); + jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, journal->j_errno); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(journal->j_tail); + sb->s_errno = cpu_to_be32(journal->j_errno); + read_unlock(&journal->j_state_lock); + + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + printk(KERN_ERR "JBD2: I/O error detected " + "when updating journal superblock for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else + write_dirty_buffer(bh, WRITE); + +out: + /* If we have just flushed the log (by marking s_start==0), then + * any future commit will have to be careful to update the + * superblock again to re-record the true start of the log. */ + + write_lock(&journal->j_state_lock); + if (sb->s_start) + journal->j_flags &= ~JBD2_FLUSHED; + else + journal->j_flags |= JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + goto out; + } + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JBD2_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JBD2_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int jbd2_journal_load() - Read journal from disk. + * @journal: Journal to act on. 
+ * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int jbd2_journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (jbd2_journal_recover(journal)) + goto recovery_error; + + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JBD2_ABORT; + journal->j_flags |= JBD2_LOADED; + return 0; + +recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; +} + +/** + * void jbd2_journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int jbd2_journal_destroy(journal_t *journal) +{ + int err = 0; + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + jbd2_journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + jbd2_journal_update_superblock(journal, 1); + } else { + err = -EIO; + } + brelse(journal->j_sb_buffer); + } + + if (journal->j_proc_entry) + jbd2_stats_proc_exit(journal); + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + jbd2_journal_destroy_revoke(journal); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + *int jbd2_journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. 
+ * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. + **/ + +int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + /* Load journal superblock if it is not loaded yet. */ + if (journal->j_format_version == 0 && + journal_get_superblock(journal) != 0) + return 0; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && + (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int jbd2_journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int jbd2_journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +} + +/* + * jbd2_journal_clear_features () - Clear a given journal feature in the + * superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Clear a given journal feature as present on the + * superblock. 
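Editorial note (not part of this patch): the feature helpers documented in this block are normally used together by a client filesystem, which first asks whether a feature is already recorded on disk, then whether the running jbd2 can honour it, and only then sets it. A minimal sketch of that pattern, using only the functions declared here; the wrapper name and error codes are illustrative assumptions.

/* Illustrative sketch: enable 64-bit block numbers in the journal
 * if this jbd2 supports them (assumes <linux/jbd2.h>). */
static int example_enable_64bit(journal_t *journal)
{
	/* Already recorded in the journal superblock?  Nothing to do. */
	if (jbd2_journal_check_used_features(journal, 0, 0,
					     JBD2_FEATURE_INCOMPAT_64BIT))
		return 0;

	/* Refuse if this jbd2 build cannot honour the feature. */
	if (!jbd2_journal_check_available_features(journal, 0, 0,
						   JBD2_FEATURE_INCOMPAT_64BIT))
		return -EOPNOTSUPP;

	/* Record the feature in the in-memory superblock; the next
	 * superblock update writes it out. */
	if (!jbd2_journal_set_features(journal, 0, 0,
				       JBD2_FEATURE_INCOMPAT_64BIT))
		return -EINVAL;
	return 0;
}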
+ */ +void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat &= ~cpu_to_be32(compat); + sb->s_feature_ro_compat &= ~cpu_to_be32(ro); + sb->s_feature_incompat &= ~cpu_to_be32(incompat); +} +EXPORT_SYMBOL(jbd2_journal_clear_features); + +/** + * int jbd2_journal_update_format () - Update on-disk journal structure. + * @journal: Journal to act on. + * + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. + */ +int jbd2_journal_update_format (journal_t *journal) +{ + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JBD2_SUPERBLOCK_V2: + return 0; + case JBD2_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; +} + +static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) +{ + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = be32_to_cpu(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + return 0; +} + + +/** + * int jbd2_journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int jbd2_journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + unsigned long old_tail; + + write_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __jbd2_log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + write_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); + } else { + write_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + jbd2_cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. 
Any future + * commits of data to the journal will restore the current + * s_start value. */ + write_lock(&journal->j_state_lock); + old_tail = journal->j_tail; + journal->j_tail = 0; + write_unlock(&journal->j_state_lock); + jbd2_journal_update_superblock(journal, 1); + write_lock(&journal->j_state_lock); + journal->j_tail = old_tail; + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + write_unlock(&journal->j_state_lock); + return 0; +} + +/** + * int jbd2_journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and jbd2_journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int jbd2_journal_wipe(journal_t *journal, int write) +{ + int err = 0; + + J_ASSERT (!(journal->j_flags & JBD2_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = jbd2_journal_skip_recovery(journal); + if (write) + jbd2_journal_update_superblock(journal, 1); + + no_recovery: + return err; +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal functions, which provide abort to the jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +void __jbd2_journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + + if (journal->j_flags & JBD2_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal->j_devname); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JBD2_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __jbd2_journal_abort_hard(journal); + + if (errno) + jbd2_journal_update_superblock(journal, 1); +} + +/** + * void jbd2_journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The jbd2_journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. 
Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * jbd2_journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final jbd2_journal_stop, which will receive the -EIO error. + * + * Finally, the jbd2_journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void jbd2_journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int jbd2_journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno number set with jbd2_journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int jbd2_journal_errno(journal_t *journal) +{ + int err; + + read_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + err = journal->j_errno; + read_unlock(&journal->j_state_lock); + return err; +} + +/** + * int jbd2_journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +int jbd2_journal_clear_err(journal_t *journal) +{ + int err = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + write_unlock(&journal->j_state_lock); + return err; +} + +/** + * void jbd2_journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +void jbd2_journal_ack_err(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JBD2_ACK_ERR; + write_unlock(&journal->j_state_lock); +} + +int jbd2_journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * helper functions to deal with 32 or 64bit block numbers. + */ +size_t journal_tag_bytes(journal_t *journal) +{ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + return JBD2_TAG_SIZE64; + else + return JBD2_TAG_SIZE32; +} + +/* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. 
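Editorial note (not part of this patch): the error-state helpers above (jbd2_journal_errno, jbd2_journal_clear_err, jbd2_journal_ack_err) are what a filesystem consults when deciding whether it may leave read-only mode after an abort. A hedged sketch of that flow; the wrapper is hypothetical and only uses the calls documented above.

/* Illustrative sketch: inspect and clear the recorded journal error
 * before allowing a read-write remount (assumes <linux/jbd2.h>). */
static int example_handle_journal_error(journal_t *journal)
{
	int err = jbd2_journal_errno(journal);

	if (err == -EROFS)	/* journal aborted during this mount */
		return -EROFS;
	if (err) {
		/* An errno recorded by a previous mount: clear and ack it
		 * so the filesystem may leave read-only mode. */
		jbd2_journal_clear_err(journal);
		jbd2_journal_ack_err(journal);
	}
	return 0;
}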
Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + static DEFINE_MUTEX(jbd2_slab_create_mutex); + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + mutex_lock(&jbd2_slab_create_mutex); + if (jbd2_slab[i]) { + mutex_unlock(&jbd2_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + mutex_unlock(&jbd2_slab_create_mutex); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == NULL); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! 
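Editorial note (not part of this patch): the slab index used by jbd2_journal_create_slab() and get_slab() above is derived from the block size as order_base_2(size) - 10, so 1k maps to jbd2_1k (index 0), 2k to index 1, and so on up to 128k at index 7, while PAGE_SIZE requests bypass the slabs entirely. A small sketch of that mapping; the helper is purely illustrative.

/* Illustrative sketch: which cache a given journal block size would
 * select, mirroring get_slab() above. */
static const char *example_slab_name_for(size_t size)
{
	int i = order_base_2(size) - 10;	/* 1024 -> 0, 2048 -> 1, ... */

	if (size == PAGE_SIZE)			/* whole pages skip the slabs */
		return "page allocator";
	if (i < 0)
		i = 0;
	if (i >= JBD2_MAX_SLABS)		/* create_slab() returns -EINVAL here */
		return "too large";
	return jbd2_slab_names[i];		/* e.g. 16384 -> "jbd2_16k" */
}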
*/ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + +/* + * Journal_head storage management + */ +static struct kmem_cache *jbd2_journal_head_cache; +#ifdef CONFIG_JBD2_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int journal_init_jbd2_journal_head_cache(void) +{ + int retval; + + J_ASSERT(jbd2_journal_head_cache == NULL); + jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + retval = 0; + if (!jbd2_journal_head_cache) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; +} + +static void jbd2_journal_destroy_jbd2_journal_head_cache(void) +{ + if (jbd2_journal_head_cache) { + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; + } +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD2_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + if (!ret) { + jbd_debug(1, "out of memory for journal_head\n"); + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); + while (!ret) { + yield(); + ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD2_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD2_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(jbd2_journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, jbd2_journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * jbd2_journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = jbd2_journal_add_journal_head(bh); + * ... + * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * jbd2_journal_put_journal_head(jh); + */ + +/* + * Give a buffer_head a journal_head. + * + * May sleep. 
+ */ +struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) { + new_jh = journal_alloc_journal_head(); + memset(new_jh, 0, sizeof(*new_jh)); + } + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then + * release the journal_head from the buffer_head. + */ +void jbd2_journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount) { + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); + __brelse(bh); + } else + jbd_unlock_bh_journal_head(bh); +} + +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. 
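Editorial note (not part of this patch): a filesystem that wants jbd2 to track an inode's dirty data typically embeds a struct jbd2_inode in its per-inode information, initialises it once with jbd2_journal_init_jbd_inode(), and calls the release helper that follows when the inode is evicted. A hedged sketch of that pairing; the containing structure and field names are assumptions, not anything defined by this patch.

/* Illustrative sketch: pairing the init and release helpers.
 * "my_inode_info" and its fields are hypothetical. */
struct my_inode_info {
	struct inode		vfs_inode;
	struct jbd2_inode	jinode;
};

static void example_setup_jbd_inode(struct my_inode_info *mi)
{
	jbd2_journal_init_jbd_inode(&mi->jinode, &mi->vfs_inode);
}

static void example_evict_jbd_inode(journal_t *journal, struct my_inode_info *mi)
{
	/* Waits for any running commit that is still writing this inode. */
	jbd2_journal_release_jbd_inode(journal, &mi->jinode);
}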
+ */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + +/* + * debugfs tunables + */ +#ifdef CONFIG_JBD2_DEBUG +u8 jbd2_journal_enable_debug __read_mostly; +EXPORT_SYMBOL(jbd2_journal_enable_debug); + +#define JBD2_DEBUG_NAME "jbd2-debug" + +static struct dentry *jbd2_debugfs_dir; +static struct dentry *jbd2_debug; + +static void __init jbd2_create_debugfs_entry(void) +{ + jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); + if (jbd2_debugfs_dir) + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, + S_IRUGO | S_IWUSR, + jbd2_debugfs_dir, + &jbd2_journal_enable_debug); +} + +static void __exit jbd2_remove_debugfs_entry(void) +{ + debugfs_remove(jbd2_debug); + debugfs_remove(jbd2_debugfs_dir); +} + +#else + +static void __init jbd2_create_debugfs_entry(void) +{ +} + +static void __exit jbd2_remove_debugfs_entry(void) +{ +} + +#endif + +#ifdef CONFIG_PROC_FS + +#define JBD2_STATS_PROC_NAME "fs/jbd2" + +static void __init jbd2_create_jbd_stats_proc_entry(void) +{ + proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); +} + +static void __exit jbd2_remove_jbd_stats_proc_entry(void) +{ + if (proc_jbd2_stats) + remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); +} + +#else + +#define jbd2_create_jbd_stats_proc_entry() do {} while (0) +#define jbd2_remove_jbd_stats_proc_entry() do {} while (0) + +#endif + +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; + +static int __init journal_init_handle_cache(void) +{ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); + if (jbd2_handle_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); + return -ENOMEM; + } + return 0; +} + +static void jbd2_journal_destroy_handle_cache(void) +{ + if (jbd2_handle_cache) + kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + kmem_cache_destroy(jbd2_inode_cache); + +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = jbd2_journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_jbd2_journal_head_cache(); + if (ret == 0) + ret = journal_init_handle_cache(); + return ret; +} + +static void jbd2_journal_destroy_caches(void) +{ + jbd2_journal_destroy_revoke_caches(); + jbd2_journal_destroy_jbd2_journal_head_cache(); + jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret == 0) { + jbd2_create_debugfs_entry(); + jbd2_create_jbd_stats_proc_entry(); + } else { + jbd2_journal_destroy_caches(); + } + return ret; +} + +static void __exit journal_exit(void) +{ 
+#ifdef CONFIG_JBD2_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); +#endif + jbd2_remove_debugfs_entry(); + jbd2_remove_jbd_stats_proc_entry(); + jbd2_journal_destroy_caches(); +} + +/* + * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 + * tracing infrastructure to map a dev_t to a device name. + * + * The caller should use rcu_read_lock() in order to make sure the + * device name stays valid until its done with it. We use + * rcu_read_lock() as well to make sure we're safe in case the caller + * gets sloppy, and because rcu_read_lock() is cheap and can be safely + * nested. + */ +struct devname_cache { + struct rcu_head rcu; + dev_t device; + char devname[BDEVNAME_SIZE]; +}; +#define CACHE_SIZE_BITS 6 +static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; +static DEFINE_SPINLOCK(devname_cache_lock); + +static void free_devcache(struct rcu_head *rcu) +{ + kfree(rcu); +} + +const char *jbd2_dev_to_name(dev_t device) +{ + int i = hash_32(device, CACHE_SIZE_BITS); + char *ret; + struct block_device *bd; + static struct devname_cache *new_dev; + + rcu_read_lock(); + if (devcache[i] && devcache[i]->device == device) { + ret = devcache[i]->devname; + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); + + new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); + if (!new_dev) + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ + bd = bdget(device); + spin_lock(&devname_cache_lock); + if (devcache[i]) { + if (devcache[i]->device == device) { + kfree(new_dev); + bdput(bd); + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; + } + call_rcu(&devcache[i]->rcu, free_devcache); + } + devcache[i] = new_dev; + devcache[i]->device = device; + if (bd) { + bdevname(bd, devcache[i]->devname); + bdput(bd); + } else + __bdevname(device, devcache[i]->devname); + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_dev_to_name); + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c new file mode 100644 index 00000000..1cad8694 --- /dev/null +++ b/fs/jbd2/recovery.c @@ -0,0 +1,738 @@ +/* + * linux/fs/jbd2/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. 
+ */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned long long blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = jbd2_journal_bmap(journal, next, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned long long blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD: corrupted journal superblock\n"); + return -EIO; + } + + err = jbd2_journal_bmap(journal, offset, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + + +/* + * Count the number of in-use tags in a journal descriptor block. 
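Editorial note (not part of this patch): the readahead window used by do_readahead() above is fixed at 128K of journal blocks, submitted to the block layer in batches of MAXBUF (8) buffer_heads; with a 4k journal block size that is a 32-block window issued as four batches. A small worked sketch of that arithmetic, with the block size assumed purely for illustration.

/* Illustrative sketch: the do_readahead() window arithmetic for an
 * assumed 4k journal block size. */
static void example_readahead_window(void)
{
	unsigned int blocksize = 4096;			/* assumption */
	unsigned int window = 128 * 1024 / blocksize;	/* 32 blocks */
	unsigned int batches = DIV_ROUND_UP(window, 8);	/* MAXBUF = 8 -> 4 */

	printk(KERN_DEBUG "readahead: %u blocks in %u batches\n",
	       window, batches);
}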
+ */ + +static int count_tags(journal_t *journal, struct buffer_head *bh) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0, size = journal->j_blocksize; + int tag_bytes = journal_tag_bytes(journal); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + tag_bytes) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += tag_bytes; + if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * jbd2_journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int jbd2_journal_recover(journal_t *journal) +{ + int err, err2; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(1, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + jbd2_journal_clear_revoke(journal); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + + return err; +} + +/** + * jbd2_journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. 
+ */ +int jbd2_journal_skip_recovery(journal_t *journal) +{ + int err; + + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD2_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? "" : "s"); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) +{ + unsigned long long block = be32_to_cpu(tag->t_blocknr); + if (tag_bytes > JBD2_TAG_SIZE32) + block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; + return block; +} + +/* + * calc_chksums calculates the checksums for the blocks described in the + * descriptor block. + */ +static int calc_chksums(journal_t *journal, struct buffer_head *bh, + unsigned long *next_log_block, __u32 *crc32_sum) +{ + int i, num_blks, err; + unsigned long io_block; + struct buffer_head *obh; + + num_blks = count_tags(journal, bh); + /* Calculate checksum of the descriptor block. */ + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); + + for (i = 0; i < num_blks; i++) { + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + printk(KERN_ERR "JBD: IO error %d recovering block " + "%lu in log\n", err, io_block); + return 1; + } else { + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } + put_bh(obh); + } + return 0; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned long next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + int tag_bytes = journal_tag_bytes(journal); + __u32 crc32_sum = ~0; /* Transactional Checksums */ + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. 
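Editorial note (not part of this patch): read_tag_block() above is where the JBD2_FEATURE_INCOMPAT_64BIT feature becomes visible during replay; with 64-bit tags the upper 32 bits of the on-disk block number come from t_blocknr_high. A worked example of that combination; the numeric values are illustrative only.

/* Illustrative sketch: assembling a 64-bit block number the same way
 * read_tag_block() does. */
static void example_tag_block(void)
{
	u32 low  = 0x00000010;			/* tag->t_blocknr      */
	u32 high = 0x00000001;			/* tag->t_blocknr_high */
	unsigned long long block = low | ((u64)high << 32);

	/* block == 0x100000010, i.e. 4294967312 */
	printk(KERN_DEBUG "replaying block %llu\n", block);
}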
*/ + + jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JBD2_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; if journal_checksums enabled, then + * calculate checksums in PASS_SCAN, otherwise, + * just skip over the blocks it describes. */ + if (pass != PASS_REPLAY) { + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM) && + !info->end_transaction) { + if (calc_chksums(journal, bh, + &next_log_block, + &crc32_sum)) { + put_bh(bh); + break; + } + put_bh(bh); + continue; + } + next_log_block += count_tags(journal, bh); + wrap(journal, next_log_block); + put_bh(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data + tag_bytes) + <= journal->j_blocksize) { + unsigned long io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. */ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %ld in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(tag_bytes, + tag); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (jbd2_journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JBD2_COMMIT_BLOCK: + /* How to differentiate between interrupted commit + * and journal corruption ? 
+ * + * {nth transaction} + * Checksum Verification Failed + * | + * ____________________ + * | | + * async_commit sync_commit + * | | + * | GO TO NEXT "Journal Corruption" + * | TRANSACTION + * | + * {(n+1)th transanction} + * | + * _______|______________ + * | | + * Commit block found Commit block not found + * | | + * "Journal Corruption" | + * _____________|_________ + * | | + * nth trans corrupt OR nth trans + * and (n+1)th interrupted interrupted + * before commit block + * could reach the disk. + * (Cannot find the difference in above + * mentioned conditions. Hence assume + * "Interrupted Commit".) + */ + + /* Found an expected commit block: if checksums + * are present verify them in PASS_SCAN; else not + * much to do other than move on to the next sequence + * number. */ + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + int chksum_err, chksum_seen; + struct commit_header *cbh = + (struct commit_header *)bh->b_data; + unsigned found_chksum = + be32_to_cpu(cbh->h_chksum[0]); + + chksum_err = chksum_seen = 0; + + if (info->end_transaction) { + journal->j_failed_commit = + info->end_transaction; + brelse(bh); + break; + } + + if (crc32_sum == found_chksum && + cbh->h_chksum_type == JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) + chksum_seen = 1; + else if (!(cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0 && + !chksum_seen)) + /* + * If fs is mounted using an old kernel and then + * kernel with journal_chksum is used then we + * get a situation where the journal flag has + * checksum flag set but checksums are not + * present i.e chksum = 0, in the individual + * commit blocks. + * Hence to avoid checksum failures, in this + * situation, this extra check is added. + */ + chksum_err = 1; + + if (chksum_err) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + journal->j_failed_commit = + next_commit_ID; + brelse(bh); + break; + } + } + crc32_sum = ~0; + } + brelse(bh); + next_commit_ID++; + continue; + + case JBD2_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) { + if (!info->end_transaction) + info->end_transaction = next_commit_ID; + } else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; +} + + +/* Scan a revoke record, marking all blocks mentioned as revoked. 
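Editorial note (not part of this patch): in the record layout parsed by scan_revoke_records() below, r_count in the revoke header gives the number of bytes used in the block (header included), and each record is a 4-byte or 8-byte big-endian block number depending on the 64-bit feature, so the record count is (r_count - header size) / record length. A worked sketch; the 16-byte header size is an assumption stated in the comment.

/* Illustrative sketch: how many revoked blocks a revoke descriptor
 * holds (revoke header size assumed to be 16 bytes). */
static int example_revoke_count(u32 r_count, int has_64bit_feature)
{
	int header_len = 16;				/* assumption */
	int record_len = has_64bit_feature ? 8 : 4;

	/* e.g. r_count = 116 with 32-bit records -> (116 - 16) / 4 = 25 */
	return (r_count - header_len) / record_len;
}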
*/ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + jbd2_journal_revoke_header_t *header; + int offset, max; + int record_len = 4; + + header = (jbd2_journal_revoke_header_t *) bh->b_data; + offset = sizeof(jbd2_journal_revoke_header_t); + max = be32_to_cpu(header->r_count); + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + record_len = 8; + + while (offset + record_len <= max) { + unsigned long long blocknr; + int err; + + if (record_len == 4) + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + else + blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); + offset += record_len; + err = jbd2_journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c new file mode 100644 index 00000000..69fd9358 --- /dev/null +++ b/fs/jbd2/revoke.c @@ -0,0 +1,714 @@ +/* + * linux/fs/jbd2/revoke.c + * + * Written by Stephen C. Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. 
One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif +#include + +static struct kmem_cache *jbd2_revoke_record_cache; +static struct kmem_cache *jbd2_revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd2_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned long long blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd2_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); +#endif + +/* Utility functions to maintain the revoke table */ + +/* Borrowed from buffer.c: this is a tried and tested block hash function */ +static inline int hash(journal_t *journal, unsigned long long block) +{ + struct jbd2_revoke_table_s *table = journal->j_revoke; + int hash_shift = table->hash_shift; + int hash = (int)block ^ (int)((block >> 31) >> 1); + + return ((hash << (hash_shift - 6)) ^ + (hash >> 13) ^ + (hash << (hash_shift - 12))) & (table->hash_size - 1); +} + +static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. 
*/ + +static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned long long blocknr) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd2_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd2_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void jbd2_journal_destroy_revoke_caches(void) +{ + if (jbd2_revoke_record_cache) { + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + } + if (jbd2_revoke_table_cache) { + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; + } +} + +int __init jbd2_journal_init_revoke_caches(void) +{ + J_ASSERT(!jbd2_revoke_record_cache); + J_ASSERT(!jbd2_revoke_table_cache); + + jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", + sizeof(struct jbd2_revoke_record_s), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!jbd2_revoke_record_cache) + goto record_cache_failure; + + jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", + sizeof(struct jbd2_revoke_table_s), + 0, SLAB_TEMPORARY, NULL); + if (!jbd2_revoke_table_cache) + goto table_cache_failure; + return 0; +table_cache_failure: + jbd2_journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} + +static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +{ + int shift = 0; + int tmp = hash_size; + struct jbd2_revoke_table_s *table; + + table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + while((tmp >>= 1UL) != 0UL) + shift++; + + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(jbd2_revoke_table_cache, table); + table = NULL; + goto out; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&table->hash_table[tmp]); + +out: + return table; +} + +static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kfree(table->hash_table); + kmem_cache_free(jbd2_revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! 
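Editorial note (not part of this patch): jbd2_journal_init_revoke() above insists on a power-of-two hash size (it asserts is_power_of_2()), since the hash function masks bucket indices with hash_size - 1. A minimal usage sketch follows; the 256-bucket value is an assumed, typical-looking choice, not something this patch mandates.

/* Illustrative sketch: setting up and tearing down the revoke tables
 * with an assumed 256-bucket hash. */
static int example_setup_revoke(journal_t *journal)
{
	int err = jbd2_journal_init_revoke(journal, 256);	/* power of two */

	if (err)
		return err;
	/* ... journal runs ... */
	jbd2_journal_destroy_revoke(journal);
	return 0;
}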
*/ +void jbd2_journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * jbd2_journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count + * by one. + */ + +int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD2_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. */ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); + jbd2_journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from jbd2_journal_get_write_access). 
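Editorial note (not part of this patch): as the jbd2_journal_revoke() comment above describes, a client filesystem revokes a freed metadata block inside the same handle that frees it, before clearing the block from its allocation bitmaps, so that stale journal copies cannot be replayed over reused data. A hedged sketch of that pattern; the wrapper name and the omitted bitmap step are hypothetical.

/* Illustrative sketch: revoking a freed metadata block from within the
 * freeing transaction (assumes <linux/jbd2.h>). */
static int example_free_metadata_block(handle_t *handle,
				       struct buffer_head *bh,
				       unsigned long long blocknr)
{
	int err;

	/* Revoke first, so recovery will not replay old log copies
	 * of this block over whatever reuses it later. */
	err = jbd2_journal_revoke(handle, blocknr, bh);
	if (err)
		return err;

	/* ... the caller then clears the block in its bitmaps under the
	 * same handle (filesystem-specific, omitted here). */
	return 0;
}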
+ * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. + * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd2_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(jbd2_revoke_record_cache, record); + did_revoke = 1; + } + } + +#ifdef JBD2_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } + return did_revoke; +} + +/* journal_switch_revoke table select j_revoke for next transaction + * we do not want to suspend any processing until all revokes are + * written -bzzz + */ +void jbd2_journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void jbd2_journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, + int write_op) +{ + struct journal_head *descriptor; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? 
+ journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, + &descriptor, &offset, + record, write_op); + count++; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset, write_op); + jbd_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct journal_head **descriptorp, + int *offsetp, + struct jbd2_revoke_record_s *record, + int write_op) +{ + struct journal_head *descriptor; + int offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + jbd2_journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset, write_op); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) + return; + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); + header->h_sequence = cpu_to_be32(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(jbd2_journal_revoke_header_t); + *descriptorp = descriptor; + } + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { + * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be64(record->blocknr); + offset += 8; + + } else { + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += 4; + } + + *offsetp = offset; +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset, int write_op) +{ + jbd2_journal_revoke_header_t *header; + struct buffer_head *bh = jh2bh(descriptor); + + if (is_journal_aborted(journal)) { + put_bh(bh); + return; + } + + header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = cpu_to_be32(offset); + set_buffer_jwrite(bh); + BUFFER_TRACE(bh, "write"); + set_buffer_dirty(bh); + write_dirty_buffer(bh, write_op); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. 
has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int jbd2_journal_set_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. + */ + +int jbd2_journal_test_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void jbd2_journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } +} diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c new file mode 100644 index 00000000..9baa39ea --- /dev/null +++ b/fs/jbd2/transaction.c @@ -0,0 +1,2227 @@ +/* + * linux/fs/jbd2/transaction.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); + +/* + * jbd2_get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. 
We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. + * + */ + +static transaction_t * +jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); + INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + transaction->t_max_wait = 0; + transaction->t_start = jiffies; + + return transaction; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * Update transaction's maximum wait time, if debugging is enabled. + * + * In order for t_max_wait to be reliable, it must be protected by a + * lock. But doing so will mean that start_this_handle() can not be + * run in parallel on SMP systems, which limits our scalability. So + * unless debugging is enabled, we no longer update t_max_wait, which + * means that maximum wait time reported by the jbd2_run_stats + * tracepoint will always be zero. + */ +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) +{ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { + ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); + if (ts > transaction->t_max_wait) + transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); + } +#endif +} + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) +{ + transaction_t *transaction, *new_transaction = NULL; + tid_t tid; + int needed, need_to_start; + int nblocks = handle->h_buffer_credits; + unsigned long ts = jiffies; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + current->comm, nblocks, + journal->j_max_transaction_buffers); + return -ENOSPC; + } + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); + if (!new_transaction) { + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. 
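+ * We just wait for async writeback congestion to ease and retry the
+ * allocation; -ENOMEM is only returned to callers whose gfp_mask
+ * includes __GFP_FS.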
+ */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; + } + } + + jbd_debug(3, "New handle %p going live.\n", handle); + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ +repeat: + read_lock(&journal->j_state_lock); + BUG_ON(journal->j_flags & JBD2_UNMOUNT); + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { + read_unlock(&journal->j_state_lock); + kfree(new_transaction); + return -EROFS; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + read_unlock(&journal->j_state_lock); + if (!new_transaction) + goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; + } + write_unlock(&journal->j_state_lock); + goto repeat; + } + + transaction = journal->j_running_transaction; + + /* + * If the current transaction is locked down for commit, wait for the + * lock to be released. + */ + if (transaction->t_state == T_LOCKED) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_transaction_locked, + &wait, TASK_UNINTERRUPTIBLE); + read_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * If there is not enough space left in the log to write all potential + * buffers requested by this operation, we need to stall pending a log + * checkpoint to free some more log space. + */ + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); + + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, then start + * to commit it: we can then go back and attach this handle to + * a new transaction. + */ + DEFINE_WAIT(wait); + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. Be careful to account for + * those buffers when checkpointing. + */ + + /* + * @@@ AKPM: This seems rather over-defensive. We're giving commit + * a _lot_ of headroom: 1/4 of the journal plus the size of + * the committing transaction. 
Really, we only need to give it + * committing_transaction->t_outstanding_credits plus "enough" for + * the log control blocks. + * Also, this test is inconsistent with the matching one in + * jbd2_journal_extend(). + */ + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. + */ + update_t_max_wait(transaction, ts); + handle->h_transaction = transaction; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), + __jbd2_log_space_left(journal)); + read_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); + kfree(new_transaction); + return 0; +} + +static struct lock_class_key jbd2_handle_key; + +/* Allocate a new handle. This should probably be in a slab... */ +static handle_t *new_handle(int nblocks) +{ + handle_t *handle = jbd2_alloc_handle(GFP_NOFS); + if (!handle) + return NULL; + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + + lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", + &jbd2_handle_key, 0); + + return handle; +} + +/** + * handle_t *jbd2_journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. + * + * This function is visible to journal users (like ext3fs), so is not + * called with the journal already locked. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +{ + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); + + current->journal_info = handle; + + err = start_this_handle(journal, handle, gfp_mask); + if (err < 0) { + jbd2_free_handle(handle); + current->journal_info = NULL; + handle = ERR_PTR(err); + } + return handle; +} +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + + +/** + * int jbd2_journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modications in advance, but can + * extend its credit if it needs more. + * + * jbd2_journal_extend tries to give the running handle more buffer credits. 
+ * It does not guarantee that allocation - this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ +int jbd2_journal_extend(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int result; + int wanted; + + result = -EIO; + if (is_handle_aborted(handle)) + goto out; + + result = 1; + + read_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (handle->h_transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + spin_lock(&transaction->t_handle_lock); + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + goto unlock; + } + + if (wanted > __jbd2_log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + goto unlock; + } + + handle->h_buffer_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); +unlock: + spin_unlock(&transaction->t_handle_lock); +error_out: + read_unlock(&journal->j_state_lock); +out: + return result; +} + + +/** + * int jbd2_journal_restart() - restart a handle . + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem + * operation. + * + * If the jbd2_journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to jbd2_journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capabable of guaranteeing the requested number of + * credits. + */ +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + tid_t tid; + int need_to_start, ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart! */ + if (is_handle_aborted(handle)) + return 0; + + /* + * First unlink the handle from its current transaction, and start the + * commit on that. 
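+ * We hand back our buffer credits, drop t_updates (waking any waiters
+ * on j_wait_updates), kick a commit of the old transaction if one is
+ * not already requested, and then call start_this_handle() to attach
+ * the handle to a new transaction.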
+ */ + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + J_ASSERT(journal_current_handle() == handle); + + read_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + spin_unlock(&transaction->t_handle_lock); + + jbd_debug(2, "restarting handle %p\n", handle); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + + lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle, gfp_mask); + return ret; +} +EXPORT_SYMBOL(jbd2__journal_restart); + + +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); + +/** + * void jbd2_journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. + * + * This locks out any further updates from being started, and blocks + * until all existing updates have completed, returning only once the + * journal is in a quiescent state with no updates running. + * + * The journal lock should not be held on entry. + */ +void jbd2_journal_lock_updates(journal_t *journal) +{ + DEFINE_WAIT(wait); + + write_lock(&journal->j_state_lock); + ++journal->j_barrier_count; + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + + if (!transaction) + break; + + spin_lock(&transaction->t_handle_lock); + if (!atomic_read(&transaction->t_updates)) { + spin_unlock(&transaction->t_handle_lock); + break; + } + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_updates, &wait); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); + + /* + * We have now established a barrier against other normal updates, but + * we also need to barrier against other jbd2_journal_lock_updates() calls + * to make sure that we serialise special journal-locked operations + * too. + */ + mutex_lock(&journal->j_barrier); +} + +/** + * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with jbd2_journal_lock_updates(). + * + * Should be called without the journal lock held. + */ +void jbd2_journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + mutex_unlock(&journal->j_barrier); + write_lock(&journal->j_state_lock); + --journal->j_barrier_count; + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. 
If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + + transaction = handle->h_transaction; + journal = transaction->t_journal; + + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + lock_buffer(bh); + jbd_lock_bh_state(bh); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* + * First question: is this buffer already part of the current + * transaction or the existing committing transaction? + */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + warn_dirty_buffer(bh); + } + /* + * In any case we need to clean the dirty flag and we must + * do it under the buffer lock to be sure we don't race + * with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + jbd_unlock_bh_state(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done; + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + goto done; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. 
+ * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. */ + + if (jh->b_jlist == BJ_Shadow) { + DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); + + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + /* commit wakes up all shadow buffers after IO */ + for ( ; ; ) { + prepare_to_wait(wqh, &wait.wait, + TASK_UNINTERRUPTIBLE); + if (jh->b_jlist != BJ_Shadow) + break; + schedule(); + } + finish_wait(wqh, &wait.wait); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + jbd_unlock_bh_state(bh); + frozen_buffer = + jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_EMERG + "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + goto done; + } + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + + /* + * Finally, if the buffer is not journaled right now, we need to make + * sure it doesn't get written to disk before the caller actually + * commits the new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + } + +done: + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source, KM_USER0); + + /* + * Now that the frozen data is saved off, we need to store + * any matching triggers. 
+ */ + jh->b_frozen_triggers = jh->b_triggers; + } + jbd_unlock_bh_state(bh); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + jbd2_journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd2_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ + rc = do_get_write_access(handle, jh, 0); + jbd2_journal_put_journal_head(jh); + return rc; +} + + +/* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + +/** + * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ +int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* + * The buffer may already belong to this transaction due to pre-zeroing + * in the filesystem's new_block code. It may also be on the previous, + * committing transaction's lists, but it HAS to be in Forget state in + * that case: the transaction must have deleted the buffer for it to be + * reused here. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + if (jh->b_transaction == NULL) { + /* + * Previous jbd2_journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. 
+ */ + clear_buffer_dirty(jh2bh(jh)); + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "set next transaction"); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. + */ + JBUFFER_TRACE(jh, "cancelling revoke"); + jbd2_journal_cancel_revoke(handle, jh); +out: + jbd2_journal_put_journal_head(jh); + return err; +} + +/** + * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with + * non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space, we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, jbd2_journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_EMERG "%s: No memory for committed data\n", + __func__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. 
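+ * If we get here without having allocated a copy (the unlocked check
+ * above saw b_committed_data set, but it has since gone away), drop
+ * the state lock and retry the allocation; a copy that turns out to
+ * be unneeded is freed at 'out' below.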
*/ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + jbd2_journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd2_free(committed_data, bh->b_size); + return err; +} + +/** + * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * @bh: buffer to trigger on + * @type: struct jbd2_buffer_trigger_type containing the trigger(s). + * + * Set any triggers on this journal_head. This is always safe, because + * triggers for a committing buffer will be saved off, and triggers for + * a running transaction will match the buffer in that transaction. + * + * Call with NULL to clear the triggers. + */ +void jbd2_journal_set_triggers(struct buffer_head *bh, + struct jbd2_buffer_trigger_type *type) +{ + struct journal_head *jh = bh2jh(bh); + + jh->b_triggers = type; +} + +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, + struct jbd2_buffer_trigger_type *triggers) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!triggers || !triggers->t_frozen) + return; + + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); +} + +void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers) +{ + if (!triggers || !triggers->t_abort) + return; + + triggers->t_abort(triggers, jh2bh(jh)); +} + + + +/** + * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + if (is_handle_aborted(handle)) + goto out; + + jbd_lock_bh_state(bh); + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. 
+ */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_running_transaction); + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + jbd_unlock_bh_state(bh); +out: + JBUFFER_TRACE(jh, "exit"); + return 0; +} + +/* + * jbd2_journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + +/** + * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + + BUFFER_TRACE(bh, "entry"); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto not_jbd; + } + + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. 
+ * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + if (jh->b_cp_transaction) { + __jbd2_journal_temp_unlink_buffer(jh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __jbd2_journal_unfile_buffer(jh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; + } + } + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + jh->b_next_transaction = NULL; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; + } + } + +not_jbd: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +} + +/** + * int jbd2_journal_stop() - complete a transaction + * @handle: tranaction to complete. + * + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * jbd2_journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a jbd2_journal_abort has been executed since the + * transaction began. + */ +int jbd2_journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int err, wait_for_commit = 0; + tid_t tid; + pid_t pid; + + J_ASSERT(journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else { + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + err = 0; + } + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... + * + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. 
This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + read_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + read_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); + commit_time = min_t(u64, commit_time, + 1000*journal->j_max_batch_time); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + if (handle->h_sync) + transaction->t_synchronous_commit = 1; + current->journal_info = NULL; + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + jbd2_log_start_commit(journal, transaction->t_tid); + + /* + * Special case: JBD2_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + wait_for_commit = 1; + } + + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start committing on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + + lock_map_release(&handle->h_lockdep_map); + + jbd2_free_handle(handle); + return err; +} + +/** + * int jbd2_journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * For synchronous operations: force any uncommitted transactions + * to disk. May seem kludgy, but it reuses all the handle batching + * code in a very simple manner. 
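+ *
+ * In effect this is:
+ *
+ *	handle = jbd2_journal_start(journal, 1);
+ *	handle->h_sync = 1;
+ *	err = jbd2_journal_stop(handle);
+ *
+ * and the return value is the error from jbd2_journal_stop(), or from
+ * jbd2_journal_start() if the handle could not be obtained.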
+ */ +int jbd2_journal_force_commit(journal_t *journal) +{ + handle_t *handle; + int ret; + + handle = jbd2_journal_start(journal, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + } else { + handle->h_sync = 1; + ret = jbd2_journal_stop(handle); + } + return ret; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. + * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. + * + * Called under j_list_lock. The journal may not be locked. + */ +void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. 
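+ * (jbd2_journal_unfile_buffer() below takes its own bh reference around
+ * the call for exactly this reason.)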
+ */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) +{ + __jbd2_journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; + jbd2_journal_put_journal_head(jh); +} + +void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __jbd2_journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +} + +/* + * Called from jbd2_journal_try_to_free_buffers(). + * + * Called under jbd_lock_bh_state(bh) + */ +static void +__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +{ + struct journal_head *jh; + + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) + goto out; + + if (jh->b_next_transaction != NULL) + goto out; + + spin_lock(&journal->j_list_lock); + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + /* written-back checkpointed metadata buffer */ + if (jh->b_jlist == BJ_None) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __jbd2_journal_remove_checkpoint(jh); + } + } + spin_unlock(&journal->j_list_lock); +out: + return; +} + +/** + * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a jbd2_journal_dirty_data on this + * buffer. So we need to lock against that. jbd2_journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success + */ +int jbd2_journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * jbd2_journal_put_journal_head(). 
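+ * jbd2_journal_grab_journal_head() returns NULL for a buffer that has
+ * no journal_head attached, in which case we simply move on to the
+ * next buffer on the page.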
+ */ + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + jbd2_journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(page); + +busy: + return ret; +} + +/* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __jbd2_journal_unfile_buffer(jh); + } + return may_free; +} + +/* + * jbd2_journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. 
+ */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + int ret; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. --sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + /* OK, we have data buffer in journaled mode */ + write_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_no_jh; + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + ret = __dispose_buffer(jh, + journal->j_running_transaction); + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return ret; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + ret = __dispose_buffer(jh, + journal->j_committing_transaction); + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return ret; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + /* + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. 
+ */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + jbd2_journal_put_journal_head(jh); +zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * void jbd2_journal_invalidatepage() + * @journal: journal to use for flush... + * @page: page to flush + * @offset: length of page to invalidate. + * + * Reap page buffers containing data after offset in page. + * + */ +void jbd2_journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + int may_free = 1; + + if (!PageLocked(page)) + BUG(); + if (!page_has_buffers(page)) + return; + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + head = bh = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + may_free &= journal_unmap_buffer(journal, bh); + unlock_buffer(bh); + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!offset) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); + } +} + +/* + * File a buffer on the given transaction list. + */ +void __jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. 
+ */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns + */ +void __jbd2_journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __jbd2_journal_unfile_buffer(jh); + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. + */ + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. 
+ */ +void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. 
The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). + */ +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, + loff_t new_size) +{ + transaction_t *inode_trans, *commit_trans; + int ret = 0; + + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) + goto out; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ + read_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + read_unlock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig new file mode 100644 index 00000000..6ae169cd --- /dev/null +++ b/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at . + +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. 
+ + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. + +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. + +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size (EXPERIMENTAL)" + help + Tries all compressors and chooses the one which has the smallest + result. 
+ +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/fs/jffs2/LICENCE b/fs/jffs2/LICENCE new file mode 100644 index 00000000..56288590 --- /dev/null +++ b/fs/jffs2/LICENCE @@ -0,0 +1,30 @@ +The files in this directory and elsewhere which refer to this LICENCE +file are part of JFFS2, the Journalling Flash File System v2. + + Copyright © 2001-2007 Red Hat, Inc. and others + +JFFS2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 or (at your option) any later +version. + +JFFS2 is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with JFFS2; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + +As a special exception, if other files instantiate templates or use +macros or inline functions from these files, or you compile these +files and link them with other works to produce a work based on these +files, these files do not by themselves cause the resulting work to be +covered by the GNU General Public License. However the source code for +these files must still be made available in accordance with section (3) +of the GNU General Public License. + +This exception does not invalidate any other reasons why a work based on +this file might be covered by the GNU General Public License. + diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile new file mode 100644 index 00000000..60e5d49c --- /dev/null +++ b/fs/jffs2/Makefile @@ -0,0 +1,21 @@ +# +# Makefile for the Linux Journalling Flash File System v2 (JFFS2) +# +# + +obj-$(CONFIG_JFFS2_FS) += jffs2.o + +jffs2-y := compr.o dir.o file.o ioctl.o nodelist.o malloc.o +jffs2-y += read.o nodemgmt.o readinode.o write.o scan.o gc.o +jffs2-y += symlink.o build.o erase.o background.o fs.o writev.o +jffs2-y += super.o debug.o + +jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER) += wbuf.o +jffs2-$(CONFIG_JFFS2_FS_XATTR) += xattr.o xattr_trusted.o xattr_user.o +jffs2-$(CONFIG_JFFS2_FS_SECURITY) += security.o +jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL) += acl.o +jffs2-$(CONFIG_JFFS2_RUBIN) += compr_rubin.o +jffs2-$(CONFIG_JFFS2_RTIME) += compr_rtime.o +jffs2-$(CONFIG_JFFS2_ZLIB) += compr_zlib.o +jffs2-$(CONFIG_JFFS2_LZO) += compr_lzo.o +jffs2-$(CONFIG_JFFS2_SUMMARY) += summary.o diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking new file mode 100644 index 00000000..3ea36554 --- /dev/null +++ b/fs/jffs2/README.Locking @@ -0,0 +1,172 @@ + + JFFS2 LOCKING DOCUMENTATION + --------------------------- + +At least theoretically, JFFS2 does not require the Big Kernel Lock +(BKL), which was always helpfully obtained for it by Linux 2.4 VFS +code. It has its own locking, as described below. + +This document attempts to describe the existing locking rules for +JFFS2. It is not expected to remain perfectly up to date, but ought to +be fairly close. + + + alloc_sem + --------- + +The alloc_sem is a per-filesystem mutex, used primarily to ensure +contiguous allocation of space on the medium. 
It is automatically +obtained during space allocations (jffs2_reserve_space()) and freed +upon write completion (jffs2_complete_reservation()). Note that +the garbage collector will obtain this right at the beginning of +jffs2_garbage_collect_pass() and release it at the end, thereby +preventing any other write activity on the file system during a +garbage collect pass. + +When writing new nodes, the alloc_sem must be held until the new nodes +have been properly linked into the data structures for the inode to +which they belong. This is for the benefit of NAND flash - adding new +nodes to an inode may obsolete old ones, and by holding the alloc_sem +until this happens we ensure that any data in the write-buffer at the +time this happens are part of the new node, not just something that +was written afterwards. Hence, we can ensure the newly-obsoleted nodes +don't actually get erased until the write-buffer has been flushed to +the medium. + +With the introduction of NAND flash support and the write-buffer, +the alloc_sem is also used to protect the wbuf-related members of the +jffs2_sb_info structure. Atomically reading the wbuf_len member to see +if the wbuf is currently holding any data is permitted, though. + +Ordering constraints: See f->sem. + + + File Mutex f->sem + --------------------- + +This is the JFFS2-internal equivalent of the inode mutex i->i_sem. +It protects the contents of the jffs2_inode_info private inode data, +including the linked list of node fragments (but see the notes below on +erase_completion_lock), etc. + +The reason that the i_sem itself isn't used for this purpose is to +avoid deadlocks with garbage collection -- the VFS will lock the i_sem +before calling a function which may need to allocate space. The +allocation may trigger garbage-collection, which may need to move a +node belonging to the inode which was locked in the first place by the +VFS. If the garbage collection code were to attempt to lock the i_sem +of the inode from which it's garbage-collecting a physical node, this +lead to deadlock, unless we played games with unlocking the i_sem +before calling the space allocation functions. + +Instead of playing such games, we just have an extra internal +mutex, which is obtained by the garbage collection code and also +by the normal file system code _after_ allocation of space. + +Ordering constraints: + + 1. Never attempt to allocate space or lock alloc_sem with + any f->sem held. + 2. Never attempt to lock two file mutexes in one thread. + No ordering rules have been made for doing so. + + + erase_completion_lock spinlock + ------------------------------ + +This is used to serialise access to the eraseblock lists, to the +per-eraseblock lists of physical jffs2_raw_node_ref structures, and +(NB) the per-inode list of physical nodes. The latter is a special +case - see below. + +As the MTD API no longer permits erase-completion callback functions +to be called from bottom-half (timer) context (on the basis that nobody +ever actually implemented such a thing), it's now sufficient to use +a simple spin_lock() rather than spin_lock_bh(). + +Note that the per-inode list of physical nodes (f->nodes) is a special +case. Any changes to _valid_ nodes (i.e. ->flash_offset & 1 == 0) in +the list are protected by the file mutex f->sem. But the erase code +may remove _obsolete_ nodes from the list while holding only the +erase_completion_lock. 
So you can walk the list only while holding the +erase_completion_lock, and can drop the lock temporarily mid-walk as +long as the pointer you're holding is to a _valid_ node, not an +obsolete one. + +The erase_completion_lock is also used to protect the c->gc_task +pointer when the garbage collection thread exits. The code to kill the +GC thread locks it, sends the signal, then unlocks it - while the GC +thread itself locks it, zeroes c->gc_task, then unlocks on the exit path. + + + inocache_lock spinlock + ---------------------- + +This spinlock protects the hashed list (c->inocache_list) of the +in-core jffs2_inode_cache objects (each inode in JFFS2 has the +correspondent jffs2_inode_cache object). So, the inocache_lock +has to be locked while walking the c->inocache_list hash buckets. + +This spinlock also covers allocation of new inode numbers, which is +currently just '++->highest_ino++', but might one day get more complicated +if we need to deal with wrapping after 4 milliard inode numbers are used. + +Note, the f->sem guarantees that the correspondent jffs2_inode_cache +will not be removed. So, it is allowed to access it without locking +the inocache_lock spinlock. + +Ordering constraints: + + If both erase_completion_lock and inocache_lock are needed, the + c->erase_completion has to be acquired first. + + + erase_free_sem + -------------- + +This mutex is only used by the erase code which frees obsolete node +references and the jffs2_garbage_collect_deletion_dirent() function. +The latter function on NAND flash must read _obsolete_ nodes to +determine whether the 'deletion dirent' under consideration can be +discarded or whether it is still required to show that an inode has +been unlinked. Because reading from the flash may sleep, the +erase_completion_lock cannot be held, so an alternative, more +heavyweight lock was required to prevent the erase code from freeing +the jffs2_raw_node_ref structures in question while the garbage +collection code is looking at them. + +Suggestions for alternative solutions to this problem would be welcomed. + + + wbuf_sem + -------- + +This read/write semaphore protects against concurrent access to the +write-behind buffer ('wbuf') used for flash chips where we must write +in blocks. It protects both the contents of the wbuf and the metadata +which indicates which flash region (if any) is currently covered by +the buffer. + +Ordering constraints: + Lock wbuf_sem last, after the alloc_sem or and f->sem. + + + c->xattr_sem + ------------ + +This read/write semaphore protects against concurrent access to the +xattr related objects which include stuff in superblock and ic->xref. +In read-only path, write-semaphore is too much exclusion. It's enough +by read-semaphore. But you must hold write-semaphore when updating, +creating or deleting any xattr related object. + +Once xattr_sem released, there would be no assurance for the existence +of those objects. Thus, a series of processes is often required to retry, +when updating such a object is necessary under holding read semaphore. +For example, do_jffs2_getxattr() holds read-semaphore to scan xref and +xdatum at first. But it retries this process with holding write-semaphore +after release read-semaphore, if it's necessary to load name/value pair +from medium. + +Ordering constraints: + Lock xattr_sem last, after the alloc_sem. 
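
To make the ordering constraints scattered through the sections above easier to see in one place, here is a minimal, hypothetical user-space sketch (it is not part of the patch and not real JFFS2 code): pthread primitives stand in for the kernel mutexes, spinlock and rw-semaphore, and jffs2_write_path() is an invented name. It only demonstrates the nesting order this document describes -- alloc_sem before f->sem, short erase_completion_lock sections, wbuf_sem taken last.

/*
 * Illustrative user-space sketch of the lock ordering documented above.
 * The lock names mirror JFFS2, but the types (pthread primitives) and
 * jffs2_write_path() are hypothetical stand-ins.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t alloc_sem = PTHREAD_MUTEX_INITIALIZER;             /* taken first, per-filesystem */
static pthread_mutex_t f_sem = PTHREAD_MUTEX_INITIALIZER;                 /* per-inode, only after alloc_sem */
static pthread_mutex_t erase_completion_lock = PTHREAD_MUTEX_INITIALIZER; /* a spinlock in the kernel */
static pthread_rwlock_t wbuf_sem = PTHREAD_RWLOCK_INITIALIZER;            /* ordered last */

static void jffs2_write_path(void)
{
	/* 1. Reserve space: alloc_sem is never taken with an f->sem held. */
	pthread_mutex_lock(&alloc_sem);

	/* 2. Lock the one inode being written (never two f->sem in one thread). */
	pthread_mutex_lock(&f_sem);

	/* 3. Short critical section on the eraseblock / raw node ref lists. */
	pthread_mutex_lock(&erase_completion_lock);
	/* ... manipulate eraseblock lists ... */
	pthread_mutex_unlock(&erase_completion_lock);

	/* 4. wbuf_sem comes last, after alloc_sem and f->sem. */
	pthread_rwlock_wrlock(&wbuf_sem);
	/* ... flush the write-behind buffer ... */
	pthread_rwlock_unlock(&wbuf_sem);

	pthread_mutex_unlock(&f_sem);
	pthread_mutex_unlock(&alloc_sem);  /* corresponds to write completion */
}

int main(void)
{
	jffs2_write_path();
	puts("lock ordering exercised without inversion");
	return 0;
}

Note that the deadlock discussed in the f->sem section is avoided precisely because garbage collection takes this internal mutex itself rather than relying on the VFS-held i_sem, so the ordering shown above holds for both the normal write path and the GC thread.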
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO new file mode 100644 index 00000000..ca28964a --- /dev/null +++ b/fs/jffs2/TODO @@ -0,0 +1,37 @@ + + - support asynchronous operation -- add a per-fs 'reserved_space' count, + let each outstanding write reserve the _maximum_ amount of physical + space it could take. Let GC flush the outstanding writes because the + reservations will necessarily be pessimistic. With this we could even + do shared writable mmap, if we can have a fs hook for do_wp_page() to + make the reservation. + - disable compression in commit_write()? + - fine-tune the allocation / GC thresholds + - chattr support - turning on/off and tuning compression per-inode + - checkpointing (do we need this? scan is quite fast) + - make the scan code populate real inodes so read_inode just after + mount doesn't have to read the flash twice for large files. + Make this a per-inode option, changeable with chattr, so you can + decide which inodes should be in-core immediately after mount. + - test, test, test + + - NAND flash support: + - almost done :) + - use bad block check instead of the hardwired byte check + + - Optimisations: + - Split writes so they go to two separate blocks rather than just c->nextblock. + By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE + nodes to a different one, we can separate clean nodes from those which + are likely to become dirty, and end up with blocks which are each far + closer to 100% or 0% clean, hence speeding up later GC progress dramatically. + - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in + the full dirent, we only need to go to the flash in lookup() when we think we've + got a match, and in readdir(). + - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? + - Remove size from jffs2_raw_node_frag. + +dedekind: +1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate. +2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in + case of failure? scan() does not clean everything. Fix. diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c new file mode 100644 index 00000000..828a0e1e --- /dev/null +++ b/fs/jffs2/acl.c @@ -0,0 +1,440 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static size_t jffs2_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct jffs2_acl_header) + + count * sizeof(struct jffs2_acl_entry_short); + } else { + return sizeof(struct jffs2_acl_header) + + 4 * sizeof(struct jffs2_acl_entry_short) + + (count - 4) * sizeof(struct jffs2_acl_entry); + } +} + +static int jffs2_acl_count(size_t size) +{ + size_t s; + + size -= sizeof(struct jffs2_acl_header); + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { + if (size % sizeof(struct jffs2_acl_entry_short)) + return -1; + return size / sizeof(struct jffs2_acl_entry_short); + } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); + if (s % sizeof(struct jffs2_acl_entry)) + return -1; + return s / sizeof(struct jffs2_acl_entry) + 4; + } +} + +static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size) +{ + void *end = value + size; + struct jffs2_acl_header *header = value; + struct jffs2_acl_entry *entry; + struct posix_acl *acl; + uint32_t ver; + int i, count; + + if (!value) + return NULL; + if (size < sizeof(struct jffs2_acl_header)) + return ERR_PTR(-EINVAL); + ver = je32_to_cpu(header->a_version); + if (ver != JFFS2_ACL_VERSION) { + JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver); + return ERR_PTR(-EINVAL); + } + + value += sizeof(struct jffs2_acl_header); + count = jffs2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i=0; i < count; i++) { + entry = value; + if (value + sizeof(struct jffs2_acl_entry_short) > end) + goto fail; + acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm); + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value += sizeof(struct jffs2_acl_entry_short); + acl->a_entries[i].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value += sizeof(struct jffs2_acl_entry); + if (value > end) + goto fail; + acl->a_entries[i].e_id = je32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) +{ + struct jffs2_acl_header *header; + struct jffs2_acl_entry *entry; + void *e; + size_t i; + + *size = jffs2_acl_size(acl->a_count); + header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL); + if (!header) + return ERR_PTR(-ENOMEM); + header->a_version = cpu_to_je32(JFFS2_ACL_VERSION); + e = header + 1; + for (i=0; i < acl->a_count; i++) { + entry = e; + entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag); + entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm); + switch(acl->a_entries[i].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_je32(acl->a_entries[i].e_id); + e += sizeof(struct jffs2_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(struct jffs2_acl_entry_short); + break; + + default: + goto fail; + } + } + return header; + fail: + kfree(header); + return ERR_PTR(-EINVAL); +} + +static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *value = NULL; + int rc, xprefix; + + acl = 
get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + xprefix = JFFS2_XPREFIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xprefix = JFFS2_XPREFIX_ACL_DEFAULT; + break; + default: + BUG(); + } + rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = do_jffs2_getxattr(inode, xprefix, "", value, rc); + } + if (rc > 0) { + acl = jffs2_acl_from_medium(value, rc); + } else if (rc == -ENODATA || rc == -ENOSYS) { + acl = NULL; + } else { + acl = ERR_PTR(rc); + } + if (value) + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + return acl; +} + +static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *acl) +{ + char *value = NULL; + size_t size = 0; + int rc; + + if (acl) { + value = jffs2_acl_to_medium(acl, &size); + if (IS_ERR(value)) + return PTR_ERR(value); + } + rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0); + if (!value && rc == -ENODATA) + rc = 0; + kfree(value); + + return rc; +} + +static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + int rc, xprefix; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + xprefix = JFFS2_XPREFIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + rc = posix_acl_equiv_mode(acl, &mode); + if (rc < 0) + return rc; + if (inode->i_mode != mode) { + struct iattr attr; + + attr.ia_valid = ATTR_MODE | ATTR_CTIME; + attr.ia_mode = mode; + attr.ia_ctime = CURRENT_TIME_SEC; + rc = jffs2_do_setattr(inode, &attr); + if (rc < 0) + return rc; + } + if (rc == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + xprefix = JFFS2_XPREFIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + rc = __jffs2_set_acl(inode, xprefix, acl); + if (!rc) + set_cached_acl(inode, type, acl); + return rc; +} + +int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + int rc; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + rc = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return rc; + } + return -EAGAIN; +} + +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) +{ + struct posix_acl *acl, *clone; + int rc; + + cache_no_acl(inode); + + if (S_ISLNK(*i_mode)) + return 0; /* Symlink always has no-ACL */ + + acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) { + *i_mode &= ~current_umask(); + } else { + if (S_ISDIR(*i_mode)) + set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + return -ENOMEM; + rc = posix_acl_create_masq(clone, (mode_t *)i_mode); + if (rc < 0) { + posix_acl_release(clone); + return rc; + } + if (rc > 0) + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + + posix_acl_release(clone); + } + return 0; +} + +int jffs2_init_acl_post(struct inode *inode) +{ + int rc; + + if (inode->i_default_acl) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl); + if (rc) + return rc; + } + + if (inode->i_acl) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl); + if (rc) + return rc; + } + + return 0; +} + +int jffs2_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int rc; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + rc = posix_acl_chmod_masq(clone, inode->i_mode); + if (!rc) + rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + return rc; +} + +static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (list && retlen <= list_size) + strcpy(list, POSIX_ACL_XATTR_ACCESS); + return retlen; +} + +static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (list && retlen <= list_size) + strcpy(list, POSIX_ACL_XATTR_DEFAULT); + return retlen; +} + +static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct posix_acl *acl; + int rc; + + if (name[0] != '\0') + return -EINVAL; + + acl = jffs2_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + return -ENODATA; + rc = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return rc; +} + +static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct posix_acl *acl; + int rc; + + if (name[0] != '\0') + return -EINVAL; + if (!inode_owner_or_capable(dentry->d_inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + rc = posix_acl_valid(acl); + 
if (rc) + goto out; + } + } else { + acl = NULL; + } + rc = jffs2_set_acl(dentry->d_inode, type, acl); + out: + posix_acl_release(acl); + return rc; +} + +const struct xattr_handler jffs2_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_DEFAULT, + .list = jffs2_acl_access_listxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, +}; + +const struct xattr_handler jffs2_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = jffs2_acl_default_listxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, +}; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h new file mode 100644 index 00000000..3119f592 --- /dev/null +++ b/fs/jffs2/acl.h @@ -0,0 +1,44 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +struct jffs2_acl_entry { + jint16_t e_tag; + jint16_t e_perm; + jint32_t e_id; +}; + +struct jffs2_acl_entry_short { + jint16_t e_tag; + jint16_t e_perm; +}; + +struct jffs2_acl_header { + jint32_t a_version; +}; + +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + +extern int jffs2_check_acl(struct inode *, int, unsigned int); +extern int jffs2_acl_chmod(struct inode *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); +extern int jffs2_init_acl_post(struct inode *); + +extern const struct xattr_handler jffs2_acl_access_xattr_handler; +extern const struct xattr_handler jffs2_acl_default_xattr_handler; + +#else + +#define jffs2_check_acl (NULL) +#define jffs2_acl_chmod(inode) (0) +#define jffs2_init_acl_pre(dir_i,inode,mode) (0) +#define jffs2_init_acl_post(inode) (0) + +#endif /* CONFIG_JFFS2_FS_POSIX_ACL */ diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c new file mode 100644 index 00000000..404111b0 --- /dev/null +++ b/fs/jffs2/background.c @@ -0,0 +1,159 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + + +static int jffs2_garbage_collect_thread(void *); + +void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) +{ + assert_spin_locked(&c->erase_completion_lock); + if (c->gc_task && jffs2_thread_should_wake(c)) + send_sig(SIGHUP, c->gc_task, 1); +} + +/* This must only ever be called when no GC thread is currently running */ +int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) +{ + struct task_struct *tsk; + int ret = 0; + + BUG_ON(c->gc_task); + + init_completion(&c->gc_thread_start); + init_completion(&c->gc_thread_exit); + + tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index); + if (IS_ERR(tsk)) { + printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk)); + complete(&c->gc_thread_exit); + ret = PTR_ERR(tsk); + } else { + /* Wait for it... 
*/ + D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid)); + wait_for_completion(&c->gc_thread_start); + ret = tsk->pid; + } + + return ret; +} + +void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) +{ + int wait = 0; + spin_lock(&c->erase_completion_lock); + if (c->gc_task) { + D1(printk(KERN_DEBUG "jffs2: Killing GC task %d\n", c->gc_task->pid)); + send_sig(SIGKILL, c->gc_task, 1); + wait = 1; + } + spin_unlock(&c->erase_completion_lock); + if (wait) + wait_for_completion(&c->gc_thread_exit); +} + +static int jffs2_garbage_collect_thread(void *_c) +{ + struct jffs2_sb_info *c = _c; + + allow_signal(SIGKILL); + allow_signal(SIGSTOP); + allow_signal(SIGCONT); + + c->gc_task = current; + complete(&c->gc_thread_start); + + set_user_nice(current, 10); + + set_freezable(); + for (;;) { + allow_signal(SIGHUP); + again: + spin_lock(&c->erase_completion_lock); + if (!jffs2_thread_should_wake(c)) { + set_current_state (TASK_INTERRUPTIBLE); + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n")); + schedule(); + } else + spin_unlock(&c->erase_completion_lock); + + + /* Problem - immediately after bootup, the GCD spends a lot + * of time in places like jffs2_kill_fragtree(); so much so + * that userspace processes (like gdm and X) are starved + * despite plenty of cond_resched()s and renicing. Yield() + * doesn't help, either (presumably because userspace and GCD + * are generally competing for a higher latency resource - + * disk). + * This forces the GCD to slow the hell down. Pulling an + * inode in with read_inode() is much preferable to having + * the GC thread get there first. */ + schedule_timeout_interruptible(msecs_to_jiffies(50)); + + if (kthread_should_stop()) { + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n")); + goto die; + } + + /* Put_super will send a SIGKILL and then wait on the sem. + */ + while (signal_pending(current) || freezing(current)) { + siginfo_t info; + unsigned long signr; + + if (try_to_freeze()) + goto again; + + signr = dequeue_signal_lock(current, ¤t->blocked, &info); + + switch(signr) { + case SIGSTOP: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGSTOP received.\n")); + set_current_state(TASK_STOPPED); + schedule(); + break; + + case SIGKILL: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGKILL received.\n")); + goto die; + + case SIGHUP: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGHUP received.\n")); + break; + default: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): signal %ld received\n", signr)); + } + } + /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ + disallow_signal(SIGHUP); + + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): pass\n")); + if (jffs2_garbage_collect_pass(c) == -ENOSPC) { + printk(KERN_NOTICE "No space for garbage collection. Aborting GC thread\n"); + goto die; + } + } + die: + spin_lock(&c->erase_completion_lock); + c->gc_task = NULL; + spin_unlock(&c->erase_completion_lock); + complete_and_exit(&c->gc_thread_exit, 0); +} diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c new file mode 100644 index 00000000..3005ec45 --- /dev/null +++ b/fs/jffs2/build.c @@ -0,0 +1,392 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, + struct jffs2_inode_cache *, struct jffs2_full_dirent **); + +static inline struct jffs2_inode_cache * +first_inode_chain(int *i, struct jffs2_sb_info *c) +{ + for (; *i < c->inocache_hashsize; (*i)++) { + if (c->inocache_list[*i]) + return c->inocache_list[*i]; + } + return NULL; +} + +static inline struct jffs2_inode_cache * +next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) +{ + /* More in this chain? */ + if (ic->next) + return ic->next; + (*i)++; + return first_inode_chain(i, c); +} + +#define for_each_inode(i, c, ic) \ + for (i = 0, ic = first_inode_chain(&i, (c)); \ + ic; \ + ic = next_inode(&i, ic, (c))) + + +static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic) +{ + struct jffs2_full_dirent *fd; + + dbg_fsbuild("building directory inode #%u\n", ic->ino); + + /* For each child, increase nlink */ + for(fd = ic->scan_dents; fd; fd = fd->next) { + struct jffs2_inode_cache *child_ic; + if (!fd->ino) + continue; + + /* we can get high latency here with huge directories */ + + child_ic = jffs2_get_ino_cache(c, fd->ino); + if (!child_ic) { + dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n", + fd->name, fd->ino, ic->ino); + jffs2_mark_node_obsolete(c, fd->raw); + continue; + } + + if (fd->type == DT_DIR) { + if (child_ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", + fd->name, fd->ino, ic->ino); + /* TODO: What do we do about it? */ + } else { + child_ic->pino_nlink = ic->ino; + } + } else + child_ic->pino_nlink++; + + dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); + /* Can't free scan_dents so far. We might need them in pass 2 */ + } +} + +/* Scan plan: + - Scan physical nodes. Build map of inodes/dirents. Allocate inocaches as we go + - Scan directory tree from top down, setting nlink in inocaches + - Scan inocaches for inodes with nlink==0 +*/ +static int jffs2_build_filesystem(struct jffs2_sb_info *c) +{ + int ret; + int i; + struct jffs2_inode_cache *ic; + struct jffs2_full_dirent *fd; + struct jffs2_full_dirent *dead_fds = NULL; + + dbg_fsbuild("build FS data structures\n"); + + /* First, scan the medium and build all the inode caches with + lists of physical nodes */ + + c->flags |= JFFS2_SB_FLAG_SCANNING; + ret = jffs2_scan_medium(c); + c->flags &= ~JFFS2_SB_FLAG_SCANNING; + if (ret) + goto exit; + + dbg_fsbuild("scanned flash completely\n"); + jffs2_dbg_dump_block_lists_nolock(c); + + dbg_fsbuild("pass 1 starting\n"); + c->flags |= JFFS2_SB_FLAG_BUILDING; + /* Now scan the directory tree, increasing nlink according to every dirent found. */ + for_each_inode(i, c, ic) { + if (ic->scan_dents) { + jffs2_build_inode_pass1(c, ic); + cond_resched(); + } + } + + dbg_fsbuild("pass 1 complete\n"); + + /* Next, scan for inodes with nlink == 0 and remove them. If + they were directories, then decrement the nlink of their + children too, and repeat the scan. As that's going to be + a fairly uncommon occurrence, it's not so evil to do it this + way. Recursion bad. 
*/ + dbg_fsbuild("pass 2 starting\n"); + + for_each_inode(i, c, ic) { + if (ic->pino_nlink) + continue; + + jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); + cond_resched(); + } + + dbg_fsbuild("pass 2a starting\n"); + + while (dead_fds) { + fd = dead_fds; + dead_fds = fd->next; + + ic = jffs2_get_ino_cache(c, fd->ino); + + if (ic) + jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); + jffs2_free_full_dirent(fd); + } + + dbg_fsbuild("pass 2a complete\n"); + dbg_fsbuild("freeing temporary data structures\n"); + + /* Finally, we can scan again and free the dirent structs */ + for_each_inode(i, c, ic) { + while(ic->scan_dents) { + fd = ic->scan_dents; + ic->scan_dents = fd->next; + jffs2_free_full_dirent(fd); + } + ic->scan_dents = NULL; + cond_resched(); + } + jffs2_build_xattr_subsystem(c); + c->flags &= ~JFFS2_SB_FLAG_BUILDING; + + dbg_fsbuild("FS build complete\n"); + + /* Rotate the lists by some number to ensure wear levelling */ + jffs2_rotate_lists(c); + + ret = 0; + +exit: + if (ret) { + for_each_inode(i, c, ic) { + while(ic->scan_dents) { + fd = ic->scan_dents; + ic->scan_dents = fd->next; + jffs2_free_full_dirent(fd); + } + } + jffs2_clear_xattr_subsystem(c); + } + + return ret; +} + +static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_full_dirent **dead_fds) +{ + struct jffs2_raw_node_ref *raw; + struct jffs2_full_dirent *fd; + + dbg_fsbuild("removing ino #%u with nlink == zero.\n", ic->ino); + + raw = ic->nodes; + while (raw != (void *)ic) { + struct jffs2_raw_node_ref *next = raw->next_in_ino; + dbg_fsbuild("obsoleting node at 0x%08x\n", ref_offset(raw)); + jffs2_mark_node_obsolete(c, raw); + raw = next; + } + + if (ic->scan_dents) { + int whinged = 0; + dbg_fsbuild("inode #%u was a directory which may have children...\n", ic->ino); + + while(ic->scan_dents) { + struct jffs2_inode_cache *child_ic; + + fd = ic->scan_dents; + ic->scan_dents = fd->next; + + if (!fd->ino) { + /* It's a deletion dirent. Ignore it */ + dbg_fsbuild("child \"%s\" is a deletion dirent, skipping...\n", fd->name); + jffs2_free_full_dirent(fd); + continue; + } + if (!whinged) + whinged = 1; + + dbg_fsbuild("removing child \"%s\", ino #%u\n", fd->name, fd->ino); + + child_ic = jffs2_get_ino_cache(c, fd->ino); + if (!child_ic) { + dbg_fsbuild("cannot remove child \"%s\", ino #%u, because it doesn't exist\n", + fd->name, fd->ino); + jffs2_free_full_dirent(fd); + continue; + } + + /* Reduce nlink of the child. If it's now zero, stick it on the + dead_fds list to be cleaned up later. Else just free the fd */ + + if (fd->type == DT_DIR) + child_ic->pino_nlink = 0; + else + child_ic->pino_nlink--; + + if (!child_ic->pino_nlink) { + dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", + fd->ino, fd->name); + fd->next = *dead_fds; + *dead_fds = fd; + } else { + dbg_fsbuild("inode #%u (\"%s\") has now got nlink %d. Ignoring.\n", + fd->ino, fd->name, child_ic->pino_nlink); + jffs2_free_full_dirent(fd); + } + } + } + + /* + We don't delete the inocache from the hash list and free it yet. + The erase code will do that, when all the nodes are completely gone. + */ +} + +static void jffs2_calc_trigger_levels(struct jffs2_sb_info *c) +{ + uint32_t size; + + /* Deletion should almost _always_ be allowed. We're fairly + buggered once we stop allowing people to delete stuff + because there's not enough free space... */ + c->resv_blocks_deletion = 2; + + /* Be conservative about how much space we need before we allow writes. 
+ On top of that which is required for deletia, require an extra 2% + of the medium to be available, for overhead caused by nodes being + split across blocks, etc. */ + + size = c->flash_size / 50; /* 2% of flash size */ + size += c->nr_blocks * 100; /* And 100 bytes per eraseblock */ + size += c->sector_size - 1; /* ... and round up */ + + c->resv_blocks_write = c->resv_blocks_deletion + (size / c->sector_size); + + /* When do we let the GC thread run in the background */ + + c->resv_blocks_gctrigger = c->resv_blocks_write + 1; + + /* When do we allow garbage collection to merge nodes to make + long-term progress at the expense of short-term space exhaustion? */ + c->resv_blocks_gcmerge = c->resv_blocks_deletion + 1; + + /* When do we allow garbage collection to eat from bad blocks rather + than actually making progress? */ + c->resv_blocks_gcbad = 0;//c->resv_blocks_deletion + 2; + + /* What number of 'very dirty' eraseblocks do we allow before we + trigger the GC thread even if we don't _need_ the space. When we + can't mark nodes obsolete on the medium, the old dirty nodes cause + performance problems because we have to inspect and discard them. */ + c->vdirty_blocks_gctrigger = c->resv_blocks_gctrigger; + if (jffs2_can_mark_obsolete(c)) + c->vdirty_blocks_gctrigger *= 10; + + /* If there's less than this amount of dirty space, don't bother + trying to GC to make more space. It'll be a fruitless task */ + c->nospc_dirty_size = c->sector_size + (c->flash_size / 100); + + dbg_fsbuild("JFFS2 trigger levels (size %d KiB, block size %d KiB, %d blocks)\n", + c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks); + dbg_fsbuild("Blocks required to allow deletion: %d (%d KiB)\n", + c->resv_blocks_deletion, c->resv_blocks_deletion*c->sector_size/1024); + dbg_fsbuild("Blocks required to allow writes: %d (%d KiB)\n", + c->resv_blocks_write, c->resv_blocks_write*c->sector_size/1024); + dbg_fsbuild("Blocks required to quiesce GC thread: %d (%d KiB)\n", + c->resv_blocks_gctrigger, c->resv_blocks_gctrigger*c->sector_size/1024); + dbg_fsbuild("Blocks required to allow GC merges: %d (%d KiB)\n", + c->resv_blocks_gcmerge, c->resv_blocks_gcmerge*c->sector_size/1024); + dbg_fsbuild("Blocks required to GC bad blocks: %d (%d KiB)\n", + c->resv_blocks_gcbad, c->resv_blocks_gcbad*c->sector_size/1024); + dbg_fsbuild("Amount of dirty space required to GC: %d bytes\n", + c->nospc_dirty_size); + dbg_fsbuild("Very dirty blocks before GC triggered: %d\n", + c->vdirty_blocks_gctrigger); +} + +int jffs2_do_mount_fs(struct jffs2_sb_info *c) +{ + int ret; + int i; + int size; + + c->free_size = c->flash_size; + c->nr_blocks = c->flash_size / c->sector_size; + size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + c->blocks = vzalloc(size); + else +#endif + c->blocks = kzalloc(size, GFP_KERNEL); + if (!c->blocks) + return -ENOMEM; + + for (i=0; inr_blocks; i++) { + INIT_LIST_HEAD(&c->blocks[i].list); + c->blocks[i].offset = i * c->sector_size; + c->blocks[i].free_size = c->sector_size; + } + + INIT_LIST_HEAD(&c->clean_list); + INIT_LIST_HEAD(&c->very_dirty_list); + INIT_LIST_HEAD(&c->dirty_list); + INIT_LIST_HEAD(&c->erasable_list); + INIT_LIST_HEAD(&c->erasing_list); + INIT_LIST_HEAD(&c->erase_checking_list); + INIT_LIST_HEAD(&c->erase_pending_list); + INIT_LIST_HEAD(&c->erasable_pending_wbuf_list); + INIT_LIST_HEAD(&c->erase_complete_list); + INIT_LIST_HEAD(&c->free_list); + INIT_LIST_HEAD(&c->bad_list); + INIT_LIST_HEAD(&c->bad_used_list); + c->highest_ino = 1; 
+ c->summary = NULL; + + ret = jffs2_sum_init(c); + if (ret) + goto out_free; + + if (jffs2_build_filesystem(c)) { + dbg_fsbuild("build_fs failed\n"); + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + ret = -EIO; + goto out_free; + } + + jffs2_calc_trigger_levels(c); + + return 0; + + out_free: +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else +#endif + kfree(c->blocks); + + return ret; +} diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c new file mode 100644 index 00000000..de424702 --- /dev/null +++ b/fs/jffs2/compr.c @@ -0,0 +1,360 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * Copyright © 2004 Ferenc Havasi , + * University of Szeged, Hungary + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include "compr.h" + +static DEFINE_SPINLOCK(jffs2_compressor_list_lock); + +/* Available compressors are on this list */ +static LIST_HEAD(jffs2_compressor_list); + +/* Actual compression mode */ +static int jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY; + +/* Statistics for blocks stored without compression */ +static uint32_t none_stat_compr_blocks=0,none_stat_decompr_blocks=0,none_stat_compr_size=0; + + +/* + * Return 1 to use this compression + */ +static int jffs2_is_best_compression(struct jffs2_compressor *this, + struct jffs2_compressor *best, uint32_t size, uint32_t bestsize) +{ + switch (jffs2_compression_mode) { + case JFFS2_COMPR_MODE_SIZE: + if (bestsize > size) + return 1; + return 0; + case JFFS2_COMPR_MODE_FAVOURLZO: + if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > size)) + return 1; + if ((best->compr != JFFS2_COMPR_LZO) && (bestsize > size)) + return 1; + if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > (size * FAVOUR_LZO_PERCENT / 100))) + return 1; + if ((bestsize * FAVOUR_LZO_PERCENT / 100) > size) + return 1; + + return 0; + } + /* Shouldn't happen */ + return 0; +} + +/* jffs2_compress: + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: Lower byte to be stored with data indicating compression type used. + * Zero is used to show that the data could not be compressed - the + * compressed version was actually larger than the original. + * Upper byte will be used later. (soon) + * + * If the cdata buffer isn't large enough to hold all the uncompressed data, + * jffs2_compress should compress as much as will fit, and should set + * *datalen accordingly to show the amount of data which were compressed. 
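To make the in/out-parameter contract described above concrete, here is a minimal user-space sketch of a compressor that honours it: consume as much input as fits in the output budget, then report how much input was consumed (*sourcelen) and how much output was produced (*dstlen), returning 0 on success and -1 when the result would not be smaller. toy_rle_compress and its run-length scheme are purely illustrative and are not one of the real JFFS2 compressors; only the calling convention matches the ->compress() hooks below.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int toy_rle_compress(unsigned char *data_in, unsigned char *cpage_out,
			    uint32_t *sourcelen, uint32_t *dstlen)
{
	uint32_t in = 0, out = 0;

	while (in < *sourcelen && out + 2 <= *dstlen) {
		unsigned char value = data_in[in];
		unsigned char run = 0;

		/* Emit (value, run) pairs, stopping when output is full. */
		while (in < *sourcelen && data_in[in] == value && run < 255) {
			in++;
			run++;
		}
		cpage_out[out++] = value;
		cpage_out[out++] = run;
	}

	if (out >= in)			/* expanded rather than compressed */
		return -1;

	*sourcelen = in;		/* input actually consumed */
	*dstlen = out;			/* size of the compressed output */
	return 0;
}

int main(void)
{
	unsigned char in[64], out[64];
	uint32_t slen = sizeof(in), dlen = sizeof(out);

	memset(in, 0xAA, sizeof(in));
	if (toy_rle_compress(in, out, &slen, &dlen) == 0)
		printf("compressed %u bytes into %u\n", slen, dlen);
	return 0;
}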
+ */ +uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *data_in, unsigned char **cpage_out, + uint32_t *datalen, uint32_t *cdatalen) +{ + int ret = JFFS2_COMPR_NONE; + int compr_ret; + struct jffs2_compressor *this, *best=NULL; + unsigned char *output_buf = NULL, *tmp_buf; + uint32_t orig_slen, orig_dlen; + uint32_t best_slen=0, best_dlen=0; + + switch (jffs2_compression_mode) { + case JFFS2_COMPR_MODE_NONE: + break; + case JFFS2_COMPR_MODE_PRIORITY: + output_buf = kmalloc(*cdatalen,GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + goto out; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only backwards-compatibility and disabled modules */ + if ((!this->compress)||(this->disabled)) + continue; + + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + *datalen = orig_slen; + *cdatalen = orig_dlen; + compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!compr_ret) { + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + break; + case JFFS2_COMPR_MODE_SIZE: + case JFFS2_COMPR_MODE_FAVOURLZO: + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only backwards-compatibility and disabled modules */ + if ((!this->compress)||(this->disabled)) + continue; + /* Allocating memory for output buffer if necessary */ + if ((this->compr_buf_size < orig_slen) && (this->compr_buf)) { + spin_unlock(&jffs2_compressor_list_lock); + kfree(this->compr_buf); + spin_lock(&jffs2_compressor_list_lock); + this->compr_buf_size=0; + this->compr_buf=NULL; + } + if (!this->compr_buf) { + spin_unlock(&jffs2_compressor_list_lock); + tmp_buf = kmalloc(orig_slen, GFP_KERNEL); + spin_lock(&jffs2_compressor_list_lock); + if (!tmp_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. 
(%d bytes)\n", orig_slen); + continue; + } + else { + this->compr_buf = tmp_buf; + this->compr_buf_size = orig_slen; + } + } + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + *datalen = orig_slen; + *cdatalen = orig_dlen; + compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen); + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!compr_ret) { + if (((!best_dlen) || jffs2_is_best_compression(this, best, *cdatalen, best_dlen)) + && (*cdatalen < *datalen)) { + best_dlen = *cdatalen; + best_slen = *datalen; + best = this; + } + } + } + if (best_dlen) { + *cdatalen = best_dlen; + *datalen = best_slen; + output_buf = best->compr_buf; + best->compr_buf = NULL; + best->compr_buf_size = 0; + best->stat_compr_blocks++; + best->stat_compr_orig_size += best_slen; + best->stat_compr_new_size += best_dlen; + ret = best->compr; + } + spin_unlock(&jffs2_compressor_list_lock); + break; + default: + printk(KERN_ERR "JFFS2: unknown compression mode.\n"); + } + out: + if (ret == JFFS2_COMPR_NONE) { + *cpage_out = data_in; + *datalen = *cdatalen; + none_stat_compr_blocks++; + none_stat_compr_size += *datalen; + } + else { + *cpage_out = output_buf; + } + return ret; +} + +int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint16_t comprtype, unsigned char *cdata_in, + unsigned char *data_out, uint32_t cdatalen, uint32_t datalen) +{ + struct jffs2_compressor *this; + int ret; + + /* Older code had a bug where it would write non-zero 'usercompr' + fields. Deal with it. */ + if ((comprtype & 0xff) <= JFFS2_COMPR_ZLIB) + comprtype &= 0xff; + + switch (comprtype & 0xff) { + case JFFS2_COMPR_NONE: + /* This should be special-cased elsewhere, but we might as well deal with it */ + memcpy(data_out, cdata_in, datalen); + none_stat_decompr_blocks++; + break; + case JFFS2_COMPR_ZERO: + memset(data_out, 0, datalen); + break; + default: + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + if (comprtype == this->compr) { + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + ret = this->decompress(cdata_in, data_out, cdatalen, datalen); + spin_lock(&jffs2_compressor_list_lock); + if (ret) { + printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); + } + else { + this->stat_decompr_blocks++; + } + this->usecount--; + spin_unlock(&jffs2_compressor_list_lock); + return ret; + } + } + printk(KERN_WARNING "JFFS2 compression type 0x%02x not available.\n", comprtype); + spin_unlock(&jffs2_compressor_list_lock); + return -EIO; + } + return 0; +} + +int jffs2_register_compressor(struct jffs2_compressor *comp) +{ + struct jffs2_compressor *this; + + if (!comp->name) { + printk(KERN_WARNING "NULL compressor name at registering JFFS2 compressor. 
Failed.\n"); + return -1; + } + comp->compr_buf_size=0; + comp->compr_buf=NULL; + comp->usecount=0; + comp->stat_compr_orig_size=0; + comp->stat_compr_new_size=0; + comp->stat_compr_blocks=0; + comp->stat_decompr_blocks=0; + D1(printk(KERN_DEBUG "Registering JFFS2 compressor \"%s\"\n", comp->name)); + + spin_lock(&jffs2_compressor_list_lock); + + list_for_each_entry(this, &jffs2_compressor_list, list) { + if (this->priority < comp->priority) { + list_add(&comp->list, this->list.prev); + goto out; + } + } + list_add_tail(&comp->list, &jffs2_compressor_list); +out: + D2(list_for_each_entry(this, &jffs2_compressor_list, list) { + printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority); + }) + + spin_unlock(&jffs2_compressor_list_lock); + + return 0; +} + +int jffs2_unregister_compressor(struct jffs2_compressor *comp) +{ + D2(struct jffs2_compressor *this;) + + D1(printk(KERN_DEBUG "Unregistering JFFS2 compressor \"%s\"\n", comp->name)); + + spin_lock(&jffs2_compressor_list_lock); + + if (comp->usecount) { + spin_unlock(&jffs2_compressor_list_lock); + printk(KERN_WARNING "JFFS2: Compressor modul is in use. Unregister failed.\n"); + return -1; + } + list_del(&comp->list); + + D2(list_for_each_entry(this, &jffs2_compressor_list, list) { + printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority); + }) + spin_unlock(&jffs2_compressor_list_lock); + return 0; +} + +void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig) +{ + if (orig != comprbuf) + kfree(comprbuf); +} + +int __init jffs2_compressors_init(void) +{ +/* Registering compressors */ +#ifdef CONFIG_JFFS2_ZLIB + jffs2_zlib_init(); +#endif +#ifdef CONFIG_JFFS2_RTIME + jffs2_rtime_init(); +#endif +#ifdef CONFIG_JFFS2_RUBIN + jffs2_rubinmips_init(); + jffs2_dynrubin_init(); +#endif +#ifdef CONFIG_JFFS2_LZO + jffs2_lzo_init(); +#endif +/* Setting default compression mode */ +#ifdef CONFIG_JFFS2_CMODE_NONE + jffs2_compression_mode = JFFS2_COMPR_MODE_NONE; + D1(printk(KERN_INFO "JFFS2: default compression mode: none\n");) +#else +#ifdef CONFIG_JFFS2_CMODE_SIZE + jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE; + D1(printk(KERN_INFO "JFFS2: default compression mode: size\n");) +#else +#ifdef CONFIG_JFFS2_CMODE_FAVOURLZO + jffs2_compression_mode = JFFS2_COMPR_MODE_FAVOURLZO; + D1(printk(KERN_INFO "JFFS2: default compression mode: favourlzo\n");) +#else + D1(printk(KERN_INFO "JFFS2: default compression mode: priority\n");) +#endif +#endif +#endif + return 0; +} + +int jffs2_compressors_exit(void) +{ +/* Unregistering compressors */ +#ifdef CONFIG_JFFS2_LZO + jffs2_lzo_exit(); +#endif +#ifdef CONFIG_JFFS2_RUBIN + jffs2_dynrubin_exit(); + jffs2_rubinmips_exit(); +#endif +#ifdef CONFIG_JFFS2_RTIME + jffs2_rtime_exit(); +#endif +#ifdef CONFIG_JFFS2_ZLIB + jffs2_zlib_exit(); +#endif + return 0; +} diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h new file mode 100644 index 00000000..13bb7597 --- /dev/null +++ b/fs/jffs2/compr.h @@ -0,0 +1,103 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * University of Szeged, Hungary + * Copyright © 2004-2010 David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#ifndef __JFFS2_COMPR_H__ +#define __JFFS2_COMPR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "jffs2_fs_i.h" +#include "jffs2_fs_sb.h" +#include "nodelist.h" + +#define JFFS2_RUBINMIPS_PRIORITY 10 +#define JFFS2_DYNRUBIN_PRIORITY 20 +#define JFFS2_LZARI_PRIORITY 30 +#define JFFS2_RTIME_PRIORITY 50 +#define JFFS2_ZLIB_PRIORITY 60 +#define JFFS2_LZO_PRIORITY 80 + + +#define JFFS2_RUBINMIPS_DISABLED /* RUBINs will be used only */ +#define JFFS2_DYNRUBIN_DISABLED /* for decompression */ + +#define JFFS2_COMPR_MODE_NONE 0 +#define JFFS2_COMPR_MODE_PRIORITY 1 +#define JFFS2_COMPR_MODE_SIZE 2 +#define JFFS2_COMPR_MODE_FAVOURLZO 3 + +#define FAVOUR_LZO_PERCENT 80 + +struct jffs2_compressor { + struct list_head list; + int priority; /* used by prirority comr. mode */ + char *name; + char compr; /* JFFS2_COMPR_XXX */ + int (*compress)(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *srclen, uint32_t *destlen); + int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, + uint32_t cdatalen, uint32_t datalen); + int usecount; + int disabled; /* if set the compressor won't compress */ + unsigned char *compr_buf; /* used by size compr. mode */ + uint32_t compr_buf_size; /* used by size compr. mode */ + uint32_t stat_compr_orig_size; + uint32_t stat_compr_new_size; + uint32_t stat_compr_blocks; + uint32_t stat_decompr_blocks; +}; + +int jffs2_register_compressor(struct jffs2_compressor *comp); +int jffs2_unregister_compressor(struct jffs2_compressor *comp); + +int jffs2_compressors_init(void); +int jffs2_compressors_exit(void); + +uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *data_in, unsigned char **cpage_out, + uint32_t *datalen, uint32_t *cdatalen); + +int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint16_t comprtype, unsigned char *cdata_in, + unsigned char *data_out, uint32_t cdatalen, uint32_t datalen); + +void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig); + +/* Compressor modules */ +/* These functions will be called by jffs2_compressors_init/exit */ + +#ifdef CONFIG_JFFS2_RUBIN +int jffs2_rubinmips_init(void); +void jffs2_rubinmips_exit(void); +int jffs2_dynrubin_init(void); +void jffs2_dynrubin_exit(void); +#endif +#ifdef CONFIG_JFFS2_RTIME +int jffs2_rtime_init(void); +void jffs2_rtime_exit(void); +#endif +#ifdef CONFIG_JFFS2_ZLIB +int jffs2_zlib_init(void); +void jffs2_zlib_exit(void); +#endif +#ifdef CONFIG_JFFS2_LZO +int jffs2_lzo_init(void); +void jffs2_lzo_exit(void); +#endif + +#endif /* __JFFS2_COMPR_H__ */ diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c new file mode 100644 index 00000000..af186ee6 --- /dev/null +++ b/fs/jffs2/compr_lzo.c @@ -0,0 +1,111 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2007 Nokia Corporation. All rights reserved. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Richard Purdie + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include "compr.h" + +static void *lzo_mem; +static void *lzo_compress_buf; +static DEFINE_MUTEX(deflate_mutex); /* for lzo_mem and lzo_compress_buf */ + +static void free_workspace(void) +{ + vfree(lzo_mem); + vfree(lzo_compress_buf); +} + +static int __init alloc_workspace(void) +{ + lzo_mem = vmalloc(LZO1X_MEM_COMPRESS); + lzo_compress_buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + + if (!lzo_mem || !lzo_compress_buf) { + printk(KERN_WARNING "Failed to allocate lzo deflate workspace\n"); + free_workspace(); + return -ENOMEM; + } + + return 0; +} + +static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + size_t compress_size; + int ret; + + mutex_lock(&deflate_mutex); + ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem); + if (ret != LZO_E_OK) + goto fail; + + if (compress_size > *dstlen) + goto fail; + + memcpy(cpage_out, lzo_compress_buf, compress_size); + mutex_unlock(&deflate_mutex); + + *dstlen = compress_size; + return 0; + + fail: + mutex_unlock(&deflate_mutex); + return -1; +} + +static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + size_t dl = destlen; + int ret; + + ret = lzo1x_decompress_safe(data_in, srclen, cpage_out, &dl); + + if (ret != LZO_E_OK || dl != destlen) + return -1; + + return 0; +} + +static struct jffs2_compressor jffs2_lzo_comp = { + .priority = JFFS2_LZO_PRIORITY, + .name = "lzo", + .compr = JFFS2_COMPR_LZO, + .compress = &jffs2_lzo_compress, + .decompress = &jffs2_lzo_decompress, + .disabled = 0, +}; + +int __init jffs2_lzo_init(void) +{ + int ret; + + ret = alloc_workspace(); + if (ret < 0) + return ret; + + ret = jffs2_register_compressor(&jffs2_lzo_comp); + if (ret) + free_workspace(); + + return ret; +} + +void jffs2_lzo_exit(void) +{ + jffs2_unregister_compressor(&jffs2_lzo_comp); + free_workspace(); +} diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c new file mode 100644 index 00000000..16a50479 --- /dev/null +++ b/fs/jffs2/compr_rtime.c @@ -0,0 +1,130 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + * + * + * Very simple lz77-ish encoder. + * + * Theory of operation: Both encoder and decoder have a list of "last + * occurrences" for every possible source-value; after sending the + * first source-byte, the second byte indicated the "run" length of + * matches + * + * The algorithm is intended to only send "whole bytes", no bit-messing. 
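As a worked illustration of the "last occurrence" scheme described above, the sketch below is a condensed user-space restatement of the encoder (the real in-kernel version follows): each literal byte is emitted, then one byte giving how many following bytes repeat the sequence found at that value's previous occurrence. For the sample string "aaaaabbbaaaaa" it emits the pairs ('a', 4) ('b', 0) ('b', 1) ('a', 4), i.e. 8 output bytes for 13 input bytes.

#include <stdio.h>
#include <string.h>

static int toy_rtime_encode(const unsigned char *in, int len,
			    unsigned char *out)
{
	int positions[256] = { 0 };	/* last occurrence of each byte value */
	int pos = 0, outpos = 0;

	while (pos < len) {
		unsigned char value = in[pos];
		int backpos = positions[value];
		int runlen = 0;

		out[outpos++] = in[pos++];	/* literal byte */
		positions[value] = pos;

		/* Count bytes matching the data after the last occurrence. */
		while (backpos < pos && pos < len &&
		       in[pos] == in[backpos] && runlen < 255) {
			pos++;
			backpos++;
			runlen++;
		}
		out[outpos++] = (unsigned char)runlen;
	}
	return outpos;
}

int main(void)
{
	const unsigned char msg[] = "aaaaabbbaaaaa";
	unsigned char out[64];
	int i, n = toy_rtime_encode(msg, (int)strlen((const char *)msg), out);

	for (i = 0; i < n; i += 2)
		printf("('%c', %d) ", out[i], out[i + 1]);
	printf("\n");	/* ('a', 4) ('b', 0) ('b', 1) ('a', 4) */
	return 0;
}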
+ * + */ + +#include +#include +#include +#include +#include +#include "compr.h" + +/* _compress returns the compressed size, -1 if bigger */ +static int jffs2_rtime_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + short positions[256]; + int outpos = 0; + int pos=0; + + memset(positions,0,sizeof(positions)); + + while (pos < (*sourcelen) && outpos <= (*dstlen)-2) { + int backpos, runlen=0; + unsigned char value; + + value = data_in[pos]; + + cpage_out[outpos++] = data_in[pos++]; + + backpos = positions[value]; + positions[value]=pos; + + while ((backpos < pos) && (pos < (*sourcelen)) && + (data_in[pos]==data_in[backpos++]) && (runlen<255)) { + pos++; + runlen++; + } + cpage_out[outpos++] = runlen; + } + + if (outpos >= pos) { + /* We failed */ + return -1; + } + + /* Tell the caller how much we managed to compress, and how much space it took */ + *sourcelen = pos; + *dstlen = outpos; + return 0; +} + + +static int jffs2_rtime_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + short positions[256]; + int outpos = 0; + int pos=0; + + memset(positions,0,sizeof(positions)); + + while (outpos= outpos) { + while(repeat) { + cpage_out[outpos++] = cpage_out[backoffs++]; + repeat--; + } + } else { + memcpy(&cpage_out[outpos],&cpage_out[backoffs],repeat); + outpos+=repeat; + } + } + } + return 0; +} + +static struct jffs2_compressor jffs2_rtime_comp = { + .priority = JFFS2_RTIME_PRIORITY, + .name = "rtime", + .compr = JFFS2_COMPR_RTIME, + .compress = &jffs2_rtime_compress, + .decompress = &jffs2_rtime_decompress, +#ifdef JFFS2_RTIME_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_rtime_init(void) +{ + return jffs2_register_compressor(&jffs2_rtime_comp); +} + +void jffs2_rtime_exit(void) +{ + jffs2_unregister_compressor(&jffs2_rtime_comp); +} diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c new file mode 100644 index 00000000..9e7cec80 --- /dev/null +++ b/fs/jffs2/compr_rubin.c @@ -0,0 +1,455 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include "compr.h" + + +#define RUBIN_REG_SIZE 16 +#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1)) +#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1) + + +#define BIT_DIVIDER_MIPS 1043 +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; + +struct pushpull { + unsigned char *buf; + unsigned int buflen; + unsigned int ofs; + unsigned int reserve; +}; + +struct rubin_state { + unsigned long p; + unsigned long q; + unsigned long rec_q; + long bit_number; + struct pushpull pp; + int bit_divider; + int bits[8]; +}; + +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) +{ + pp->buf = buf; + pp->buflen = buflen; + pp->ofs = ofs; + pp->reserve = reserve; +} + +static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) +{ + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) + return -ENOSPC; + + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + + pp->ofs++; + + return 0; +} + +static inline int pushedbits(struct pushpull *pp) +{ + return pp->ofs; +} + +static inline int pullbit(struct pushpull *pp) +{ + int bit; + + bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1; + + pp->ofs++; + return bit; +} + +static inline int pulledbits(struct pushpull *pp) +{ + return pp->ofs; +} + + +static void init_rubin(struct rubin_state *rs, int div, int *bits) +{ + int c; + + rs->q = 0; + rs->p = (long) (2 * UPPER_BIT_RUBIN); + rs->bit_number = (long) 0; + rs->bit_divider = div; + + for (c=0; c<8; c++) + rs->bits[c] = bits[c]; +} + + +static int encode(struct rubin_state *rs, long A, long B, int symbol) +{ + + long i0, i1; + int ret; + + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + rs->bit_number++; + + ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); + if (ret) + return ret; + rs->q &= LOWER_BITS_RUBIN; + rs->q <<= 1; + rs->p <<= 1; + } + i0 = A * rs->p / (A + B); + if (i0 <= 0) + i0 = 1; + + if (i0 >= rs->p) + i0 = rs->p - 1; + + i1 = rs->p - i0; + + if (symbol == 0) + rs->p = i0; + else { + rs->p = i1; + rs->q += i0; + } + return 0; +} + + +static void end_rubin(struct rubin_state *rs) +{ + + int i; + + for (i = 0; i < RUBIN_REG_SIZE; i++) { + pushbit(&rs->pp, (UPPER_BIT_RUBIN & rs->q) ? 1 : 0, 1); + rs->q &= LOWER_BITS_RUBIN; + rs->q <<= 1; + } +} + + +static void init_decode(struct rubin_state *rs, int div, int *bits) +{ + init_rubin(rs, div, bits); + + /* behalve lower */ + rs->rec_q = 0; + + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + ; +} + +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) +{ + register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; + unsigned long rec_q; + int c, bits = 0; + + /* + * First, work out how many bits we need from the input stream. + * Note that we have already done the initial check on this + * loop prior to calling this function. + */ + do { + bits++; + q &= lower_bits_rubin; + q <<= 1; + p <<= 1; + } while ((q >= UPPER_BIT_RUBIN) || ((p + q) <= UPPER_BIT_RUBIN)); + + rs->p = p; + rs->q = q; + + rs->bit_number += bits; + + /* + * Now get the bits. We really want this to be "get n bits". 
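The comment above wishes for a "get n bits" primitive rather than pulling one bit per call. Purely as a sketch of what such a helper could look like (pullbits is a hypothetical name, not part of this driver), the snippet below reads n bits at once using the same MSB-first bit addressing as pullbit() and accumulates them the same way the init_decode() loop does.

#include <stdio.h>

struct pushpull {
	unsigned char *buf;
	unsigned int buflen;	/* length in bits */
	unsigned int ofs;	/* current bit offset into buf */
	unsigned int reserve;
};

/* Pull 'n' bits, most significant first, exactly as n calls to pullbit()
 * accumulated with q = q * 2 + bit would. */
static unsigned long pullbits(struct pushpull *pp, int n)
{
	unsigned long val = 0;

	while (n--) {
		int bit = (pp->buf[pp->ofs >> 3] >> (7 - (pp->ofs & 7))) & 1;

		val = (val << 1) | bit;
		pp->ofs++;
	}
	return val;
}

int main(void)
{
	unsigned char data[] = { 0xA5, 0x0F };	/* bits: 1010 0101 0000 1111 */
	struct pushpull pp = { data, sizeof(data) * 8, 0, 0 };
	unsigned long hi = pullbits(&pp, 4);	/* 0xA   */
	unsigned long lo = pullbits(&pp, 12);	/* 0x50F */

	printf("%lx %lx\n", hi, lo);
	return 0;
}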
+ */ + rec_q = rs->rec_q; + do { + c = pullbit(&rs->pp); + rec_q &= lower_bits_rubin; + rec_q <<= 1; + rec_q += c; + } while (--bits); + rs->rec_q = rec_q; +} + +static int decode(struct rubin_state *rs, long A, long B) +{ + unsigned long p = rs->p, q = rs->q; + long i0, threshold; + int symbol; + + if (q >= UPPER_BIT_RUBIN || ((p + q) <= UPPER_BIT_RUBIN)) + __do_decode(rs, p, q); + + i0 = A * rs->p / (A + B); + if (i0 <= 0) + i0 = 1; + + if (i0 >= rs->p) + i0 = rs->p - 1; + + threshold = rs->q + i0; + symbol = rs->rec_q >= threshold; + if (rs->rec_q >= threshold) { + rs->q += i0; + i0 = rs->p - i0; + } + + rs->p = i0; + + return symbol; +} + + + +static int out_byte(struct rubin_state *rs, unsigned char byte) +{ + int i, ret; + struct rubin_state rs_copy; + rs_copy = *rs; + + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); + if (ret) { + /* Failed. Restore old state */ + *rs = rs_copy; + return ret; + } + byte >>= 1 ; + } + return 0; +} + +static int in_byte(struct rubin_state *rs) +{ + int i, result = 0, bit_divider = rs->bit_divider; + + for (i = 0; i < 8; i++) + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; + + return result; +} + + + +static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) + { + int outpos = 0; + int pos=0; + struct rubin_state rs; + + init_pushpull(&rs.pp, cpage_out, *dstlen * 8, 0, 32); + + init_rubin(&rs, bit_divider, bits); + + while (pos < (*sourcelen) && !out_byte(&rs, data_in[pos])) + pos++; + + end_rubin(&rs); + + if (outpos > pos) { + /* We failed */ + return -1; + } + + /* Tell the caller how much we managed to compress, + * and how much space it took */ + + outpos = (pushedbits(&rs.pp)+7)/8; + + if (outpos >= pos) + return -1; /* We didn't actually compress */ + *sourcelen = pos; + *dstlen = outpos; + return 0; +} +#if 0 +/* _compress returns the compressed size, -1 if bigger */ +int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); +} +#endif +static int jffs2_dynrubin_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + int bits[8]; + unsigned char histo[256]; + int i; + int ret; + uint32_t mysrclen, mydstlen; + + mysrclen = *sourcelen; + mydstlen = *dstlen - 8; + + if (*dstlen <= 12) + return -1; + + memset(histo, 0, 256); + for (i=0; i 255) bits[i] = 255; + cpage_out[i] = bits[i]; + } + + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); + if (ret) + return ret; + + /* Add back the 8 bytes we took for the probabilities */ + mydstlen += 8; + + if (mysrclen <= mydstlen) { + /* We compressed */ + return -1; + } + + *sourcelen = mysrclen; + *dstlen = mydstlen; + return 0; +} + +static void rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) +{ + int outpos = 0; + struct rubin_state rs; + + init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); + init_decode(&rs, bit_divider, bits); + + while (outpos < destlen) + page_out[outpos++] = in_byte(&rs); +} + + +static int jffs2_rubinmips_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t sourcelen, uint32_t dstlen) +{ + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, 
dstlen); + return 0; +} + +static int jffs2_dynrubin_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t sourcelen, uint32_t dstlen) +{ + int bits[8]; + int c; + + for (c=0; c<8; c++) + bits[c] = data_in[c]; + + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); + return 0; +} + +static struct jffs2_compressor jffs2_rubinmips_comp = { + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, +#ifdef JFFS2_RUBINMIPS_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_rubinmips_init(void) +{ + return jffs2_register_compressor(&jffs2_rubinmips_comp); +} + +void jffs2_rubinmips_exit(void) +{ + jffs2_unregister_compressor(&jffs2_rubinmips_comp); +} + +static struct jffs2_compressor jffs2_dynrubin_comp = { + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, +#ifdef JFFS2_DYNRUBIN_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_dynrubin_init(void) +{ + return jffs2_register_compressor(&jffs2_dynrubin_comp); +} + +void jffs2_dynrubin_exit(void) +{ + jffs2_unregister_compressor(&jffs2_dynrubin_comp); +} diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c new file mode 100644 index 00000000..5a001020 --- /dev/null +++ b/fs/jffs2/compr_zlib.c @@ -0,0 +1,218 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#if !defined(__KERNEL__) && !defined(__ECOS) +#error "The userspace support got too messy and was removed. Update your mkfs.jffs2" +#endif + +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + + /* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? 
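The "plan" above can be seen in isolation with ordinary user-space zlib: deflate most of the data while holding back a small tail of the output buffer, then release that tail and finish the stream with Z_FINISH. The sketch below is a simplified single-pass version of that idea (the kernel code further down loops until all input is consumed) and uses the stock zlib API (deflateInit/deflate/deflateEnd) rather than the kernel's zlib_ wrappers; compress_with_reserve is an illustrative name.

/* build with: cc sketch.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define STREAM_END_SPACE 12	/* bytes held back for the final Z_FINISH */

static int compress_with_reserve(const unsigned char *in, unsigned int inlen,
				 unsigned char *out, unsigned int outlen)
{
	z_stream strm;
	int ret;

	memset(&strm, 0, sizeof(strm));
	if (deflateInit(&strm, 3) != Z_OK)
		return -1;

	strm.next_in = (unsigned char *)in;
	strm.avail_in = inlen;
	strm.next_out = out;
	strm.avail_out = outlen - STREAM_END_SPACE;	/* hold back the tail */

	ret = deflate(&strm, Z_PARTIAL_FLUSH);
	if (ret == Z_OK) {
		strm.avail_out += STREAM_END_SPACE;	/* now release it */
		strm.avail_in = 0;
		ret = deflate(&strm, Z_FINISH);
	}
	deflateEnd(&strm);

	return ret == Z_STREAM_END ? (int)strm.total_out : -1;
}

int main(void)
{
	unsigned char in[4096], out[4096];
	int n;

	memset(in, 'x', sizeof(in));
	n = compress_with_reserve(in, sizeof(in), out, sizeof(out));
	printf("compressed to %d bytes\n", n);
	return 0;
}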
+ */ +#define STREAM_END_SPACE 12 + +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); +static z_stream inf_strm, def_strm; + +#ifdef __KERNEL__ /* Linux-only */ +#include +#include +#include + +static int __init alloc_workspaces(void) +{ + def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS, + MAX_MEM_LEVEL)); + if (!def_strm.workspace) { + printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)); + return -ENOMEM; + } + D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL))); + inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!inf_strm.workspace) { + printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); + vfree(def_strm.workspace); + return -ENOMEM; + } + D1(printk(KERN_DEBUG "Allocated %d bytes for inflate workspace\n", zlib_inflate_workspacesize())); + return 0; +} + +static void free_workspaces(void) +{ + vfree(def_strm.workspace); + vfree(inf_strm.workspace); +} +#else +#define alloc_workspaces() (0) +#define free_workspaces() do { } while(0) +#endif /* __KERNEL__ */ + +static int jffs2_zlib_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + int ret; + + if (*dstlen <= STREAM_END_SPACE) + return -1; + + mutex_lock(&deflate_mutex); + + if (Z_OK != zlib_deflateInit(&def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + mutex_unlock(&deflate_mutex); + return -1; + } + + def_strm.next_in = data_in; + def_strm.total_in = 0; + + def_strm.next_out = cpage_out; + def_strm.total_out = 0; + + while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { + def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); + def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); + D1(printk(KERN_DEBUG "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in, def_strm.avail_out)); + ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); + D1(printk(KERN_DEBUG "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out)); + if (ret != Z_OK) { + D1(printk(KERN_DEBUG "deflate in loop returned %d\n", ret)); + zlib_deflateEnd(&def_strm); + mutex_unlock(&deflate_mutex); + return -1; + } + } + def_strm.avail_out += STREAM_END_SPACE; + def_strm.avail_in = 0; + ret = zlib_deflate(&def_strm, Z_FINISH); + zlib_deflateEnd(&def_strm); + + if (ret != Z_STREAM_END) { + D1(printk(KERN_DEBUG "final deflate returned %d\n", ret)); + ret = -1; + goto out; + } + + if (def_strm.total_out >= def_strm.total_in) { + D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld; failing\n", + def_strm.total_in, def_strm.total_out)); + ret = -1; + goto out; + } + + D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld\n", + def_strm.total_in, def_strm.total_out)); + + *dstlen = def_strm.total_out; + *sourcelen = def_strm.total_in; + ret = 0; + out: + mutex_unlock(&deflate_mutex); + return ret; +} + +static int jffs2_zlib_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + int ret; + int wbits = MAX_WBITS; + + mutex_lock(&inflate_mutex); + + inf_strm.next_in = data_in; + inf_strm.avail_in = srclen; + inf_strm.total_in = 0; + + inf_strm.next_out = cpage_out; + inf_strm.avail_out = destlen; + 
inf_strm.total_out = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + D2(printk(KERN_DEBUG "inflate skipping adler32\n")); + wbits = -((data_in[0] >> 4) + 8); + inf_strm.next_in += 2; + inf_strm.avail_in -= 2; + } else { + /* Let this remain D1 for now -- it should never happen */ + D1(printk(KERN_DEBUG "inflate not skipping adler32\n")); + } + + + if (Z_OK != zlib_inflateInit2(&inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + mutex_unlock(&inflate_mutex); + return 1; + } + + while((ret = zlib_inflate(&inf_strm, Z_FINISH)) == Z_OK) + ; + if (ret != Z_STREAM_END) { + printk(KERN_NOTICE "inflate returned %d\n", ret); + } + zlib_inflateEnd(&inf_strm); + mutex_unlock(&inflate_mutex); + return 0; +} + +static struct jffs2_compressor jffs2_zlib_comp = { + .priority = JFFS2_ZLIB_PRIORITY, + .name = "zlib", + .compr = JFFS2_COMPR_ZLIB, + .compress = &jffs2_zlib_compress, + .decompress = &jffs2_zlib_decompress, +#ifdef JFFS2_ZLIB_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int __init jffs2_zlib_init(void) +{ + int ret; + + ret = alloc_workspaces(); + if (ret) + return ret; + + ret = jffs2_register_compressor(&jffs2_zlib_comp); + if (ret) + free_workspaces(); + + return ret; +} + +void jffs2_zlib_exit(void) +{ + jffs2_unregister_compressor(&jffs2_zlib_comp); + free_workspaces(); +} diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c new file mode 100644 index 00000000..e0b76c87 --- /dev/null +++ b/fs/jffs2/debug.c @@ -0,0 +1,860 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "debug.h" + +#ifdef JFFS2_DBG_SANITY_CHECKS + +void +__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + if (unlikely(jeb && jeb->used_size + jeb->dirty_size + + jeb->free_size + jeb->wasted_size + + jeb->unchecked_size != c->sector_size)) { + JFFS2_ERROR("eeep, space accounting for block at 0x%08x is screwed.\n", jeb->offset); + JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n", + jeb->free_size, jeb->dirty_size, jeb->used_size, + jeb->wasted_size, jeb->unchecked_size, c->sector_size); + BUG(); + } + + if (unlikely(c->used_size + c->dirty_size + c->free_size + c->erasing_size + c->bad_size + + c->wasted_size + c->unchecked_size != c->flash_size)) { + JFFS2_ERROR("eeep, space accounting superblock info is screwed.\n"); + JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + erasing %#08x + bad %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n", + c->free_size, c->dirty_size, c->used_size, c->erasing_size, c->bad_size, + c->wasted_size, c->unchecked_size, c->flash_size); + BUG(); + } +} + +void +__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +#endif /* JFFS2_DBG_SANITY_CHECKS */ + +#ifdef JFFS2_DBG_PARANOIA_CHECKS +/* + * Check the fragtree. 
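The fragtree checks in this file rely on one core invariant, also flagged by the dump routine further down: fragments must tile the file contiguously, each starting exactly where the previous one ended. The standalone sketch below (toy_frag and frags_contiguous are hypothetical; the real tree is a red-black tree of jffs2_node_frag, not an array) illustrates just that check.

#include <stdint.h>
#include <stdio.h>

struct toy_frag {
	uint32_t ofs;
	uint32_t size;
};

/* Return 1 if the fragments cover the file with no gaps or overlaps. */
static int frags_contiguous(const struct toy_frag *frags, int n)
{
	uint32_t lastofs = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (frags[i].ofs != lastofs)
			return 0;
		lastofs = frags[i].ofs + frags[i].size;
	}
	return 1;
}

int main(void)
{
	struct toy_frag ok[]  = { { 0, 4096 }, { 4096, 1024 } };
	struct toy_frag bad[] = { { 0, 4096 }, { 8192, 1024 } };	/* hole */

	printf("%d %d\n", frags_contiguous(ok, 2), frags_contiguous(bad, 2));
	return 0;
}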
+ */ +void +__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f) +{ + mutex_lock(&f->sem); + __jffs2_dbg_fragtree_paranoia_check_nolock(f); + mutex_unlock(&f->sem); +} + +void +__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *frag; + int bitched = 0; + + for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) { + struct jffs2_full_dnode *fn = frag->node; + + if (!fn || !fn->raw) + continue; + + if (ref_flags(fn->raw) == REF_PRISTINE) { + if (fn->frags > 1) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x had %d frags. Tell dwmw2.\n", + ref_offset(fn->raw), fn->frags); + bitched = 1; + } + + /* A hole node which isn't multi-page should be garbage-collected + and merged anyway, so we just check for the frag size here, + rather than mucking around with actually reading the node + and checking the compression type, which is the real way + to tell a hole node. */ + if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag) + && frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n", + ref_offset(fn->raw)); + bitched = 1; + } + + if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag) + && frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n", + ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size); + bitched = 1; + } + } + } + + if (bitched) { + JFFS2_ERROR("fragtree is corrupted.\n"); + __jffs2_dbg_dump_fragtree_nolock(f); + BUG(); + } +} + +/* + * Check if the flash contains all 0xFF before we start writing. + */ +void +__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, + uint32_t ofs, int len) +{ + size_t retlen; + int ret, i; + unsigned char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return; + + ret = jffs2_flash_read(c, ofs, len, &retlen, buf); + if (ret || (retlen != len)) { + JFFS2_WARNING("read %d bytes failed or short. ret %d, retlen %zd.\n", + len, ret, retlen); + kfree(buf); + return; + } + + ret = 0; + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + ret = 1; + + if (ret) { + JFFS2_ERROR("argh, about to write node to %#08x on flash, but there are data already there. 
The first corrupted byte is at %#08x offset.\n", + ofs, ofs + i); + __jffs2_dbg_dump_buffer(buf, len, ofs); + kfree(buf); + BUG(); + } + + kfree(buf); +} + +void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb; + uint32_t free = 0, dirty = 0, used = 0, wasted = 0, + erasing = 0, bad = 0, unchecked = 0; + int nr_counted = 0; + int dump = 0; + + if (c->gcblock) { + nr_counted++; + free += c->gcblock->free_size; + dirty += c->gcblock->dirty_size; + used += c->gcblock->used_size; + wasted += c->gcblock->wasted_size; + unchecked += c->gcblock->unchecked_size; + } + if (c->nextblock) { + nr_counted++; + free += c->nextblock->free_size; + dirty += c->nextblock->dirty_size; + used += c->nextblock->used_size; + wasted += c->nextblock->wasted_size; + unchecked += c->nextblock->unchecked_size; + } + list_for_each_entry(jeb, &c->clean_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_pending_wbuf_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erase_pending_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->free_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->bad_used_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + + list_for_each_entry(jeb, &c->erasing_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_checking_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_complete_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->bad_list, list) { + nr_counted++; + bad += c->sector_size; + } + +#define check(sz) \ + if (sz != c->sz##_size) { \ + printk(KERN_WARNING #sz "_size mismatch counted 0x%x, c->" #sz "_size 0x%x\n", \ + sz, c->sz##_size); \ + dump = 1; \ + } + check(free); + check(dirty); + check(used); + check(wasted); + check(unchecked); + check(bad); + check(erasing); +#undef check + + if (nr_counted != c->nr_blocks) { + printk(KERN_WARNING "%s counted only 0x%x blocks of 0x%x. 
Where are the others?\n", + __func__, nr_counted, c->nr_blocks); + dump = 1; + } + + if (dump) { + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + } +} + +/* + * Check the space accounting and node_ref list correctness for the JFFS2 erasable block 'jeb'. + */ +void +__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + uint32_t my_used_size = 0; + uint32_t my_unchecked_size = 0; + uint32_t my_dirty_size = 0; + struct jffs2_raw_node_ref *ref2 = jeb->first_node; + + while (ref2) { + uint32_t totlen = ref_totlen(c, jeb, ref2); + + if (ref_offset(ref2) < jeb->offset || + ref_offset(ref2) > jeb->offset + c->sector_size) { + JFFS2_ERROR("node_ref %#08x shouldn't be in block at %#08x.\n", + ref_offset(ref2), jeb->offset); + goto error; + + } + if (ref_flags(ref2) == REF_UNCHECKED) + my_unchecked_size += totlen; + else if (!ref_obsolete(ref2)) + my_used_size += totlen; + else + my_dirty_size += totlen; + + if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) { + JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n", + ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2), + ref_offset(jeb->last_node), jeb->last_node); + goto error; + } + ref2 = ref_next(ref2); + } + + if (my_used_size != jeb->used_size) { + JFFS2_ERROR("Calculated used size %#08x != stored used size %#08x.\n", + my_used_size, jeb->used_size); + goto error; + } + + if (my_unchecked_size != jeb->unchecked_size) { + JFFS2_ERROR("Calculated unchecked size %#08x != stored unchecked size %#08x.\n", + my_unchecked_size, jeb->unchecked_size); + goto error; + } + +#if 0 + /* This should work when we implement ref->__totlen elemination */ + if (my_dirty_size != jeb->dirty_size + jeb->wasted_size) { + JFFS2_ERROR("Calculated dirty+wasted size %#08x != stored dirty + wasted size %#08x\n", + my_dirty_size, jeb->dirty_size + jeb->wasted_size); + goto error; + } + + if (jeb->free_size == 0 + && my_used_size + my_unchecked_size + my_dirty_size != c->sector_size) { + JFFS2_ERROR("The sum of all nodes in block (%#x) != size of block (%#x)\n", + my_used_size + my_unchecked_size + my_dirty_size, + c->sector_size); + goto error; + } +#endif + + if (!(c->flags & (JFFS2_SB_FLAG_BUILDING|JFFS2_SB_FLAG_SCANNING))) + __jffs2_dbg_superblock_counts(c); + + return; + +error: + __jffs2_dbg_dump_node_refs_nolock(c, jeb); + __jffs2_dbg_dump_jeb_nolock(jeb); + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + +} +#endif /* JFFS2_DBG_PARANOIA_CHECKS */ + +#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) +/* + * Dump the node_refs of the 'jeb' JFFS2 eraseblock. 
+ */ +void +__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_node_refs_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *ref; + int i = 0; + + printk(JFFS2_DBG_MSG_PREFIX " Dump node_refs of the eraseblock %#08x\n", jeb->offset); + if (!jeb->first_node) { + printk(JFFS2_DBG_MSG_PREFIX " no nodes in the eraseblock %#08x\n", jeb->offset); + return; + } + + printk(JFFS2_DBG); + for (ref = jeb->first_node; ; ref = ref_next(ref)) { + printk("%#08x", ref_offset(ref)); +#ifdef TEST_TOTLEN + printk("(%x)", ref->__totlen); +#endif + if (ref_next(ref)) + printk("->"); + else + break; + if (++i == 4) { + i = 0; + printk("\n" JFFS2_DBG); + } + } + printk("\n"); +} + +/* + * Dump an eraseblock's space accounting. + */ +void +__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_jeb_nolock(jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb) +{ + if (!jeb) + return; + + printk(JFFS2_DBG_MSG_PREFIX " dump space accounting for the eraseblock at %#08x:\n", + jeb->offset); + + printk(JFFS2_DBG "used_size: %#08x\n", jeb->used_size); + printk(JFFS2_DBG "dirty_size: %#08x\n", jeb->dirty_size); + printk(JFFS2_DBG "wasted_size: %#08x\n", jeb->wasted_size); + printk(JFFS2_DBG "unchecked_size: %#08x\n", jeb->unchecked_size); + printk(JFFS2_DBG "free_size: %#08x\n", jeb->free_size); +} + +void +__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_block_lists_nolock(c); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c) +{ + printk(JFFS2_DBG_MSG_PREFIX " dump JFFS2 blocks lists:\n"); + + printk(JFFS2_DBG "flash_size: %#08x\n", c->flash_size); + printk(JFFS2_DBG "used_size: %#08x\n", c->used_size); + printk(JFFS2_DBG "dirty_size: %#08x\n", c->dirty_size); + printk(JFFS2_DBG "wasted_size: %#08x\n", c->wasted_size); + printk(JFFS2_DBG "unchecked_size: %#08x\n", c->unchecked_size); + printk(JFFS2_DBG "free_size: %#08x\n", c->free_size); + printk(JFFS2_DBG "erasing_size: %#08x\n", c->erasing_size); + printk(JFFS2_DBG "bad_size: %#08x\n", c->bad_size); + printk(JFFS2_DBG "sector_size: %#08x\n", c->sector_size); + printk(JFFS2_DBG "jffs2_reserved_blocks size: %#08x\n", + c->sector_size * c->resv_blocks_write); + + if (c->nextblock) + printk(JFFS2_DBG "nextblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + c->nextblock->offset, c->nextblock->used_size, + c->nextblock->dirty_size, c->nextblock->wasted_size, + c->nextblock->unchecked_size, c->nextblock->free_size); + else + printk(JFFS2_DBG "nextblock: NULL\n"); + + if (c->gcblock) + printk(JFFS2_DBG "gcblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + c->gcblock->offset, c->gcblock->used_size, c->gcblock->dirty_size, + c->gcblock->wasted_size, c->gcblock->unchecked_size, c->gcblock->free_size); + else + printk(JFFS2_DBG "gcblock: NULL\n"); + + if (list_empty(&c->clean_list)) { + printk(JFFS2_DBG "clean_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->clean_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct 
jffs2_eraseblock, list); + numblocks ++; + dirty += jeb->wasted_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "clean_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "Contains %d blocks with total wasted size %u, average wasted size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->very_dirty_list)) { + printk(JFFS2_DBG "very_dirty_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->very_dirty_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + numblocks ++; + dirty += jeb->dirty_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "very_dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "Contains %d blocks with total dirty size %u, average dirty size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->dirty_list)) { + printk(JFFS2_DBG "dirty_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->dirty_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + numblocks ++; + dirty += jeb->dirty_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "contains %d blocks with total dirty size %u, average dirty size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->erasable_list)) { + printk(JFFS2_DBG "erasable_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasable_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasable_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erasing_list)) { + printk(JFFS2_DBG "erasing_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasing_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasing_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + if (list_empty(&c->erase_checking_list)) { + printk(JFFS2_DBG "erase_checking_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_checking_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && 
jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_checking_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erase_pending_list)) { + printk(JFFS2_DBG "erase_pending_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_pending_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_pending_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erasable_pending_wbuf_list)) { + printk(JFFS2_DBG "erasable_pending_wbuf_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasable_pending_wbuf_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasable_pending_wbuf_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->free_list)) { + printk(JFFS2_DBG "free_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->free_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "free_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->bad_list)) { + printk(JFFS2_DBG "bad_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->bad_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "bad_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->bad_used_list)) { + printk(JFFS2_DBG "bad_used_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->bad_used_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "bad_used_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } +} + +void +__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f) +{ + mutex_lock(&f->sem); + jffs2_dbg_dump_fragtree_nolock(f); + mutex_unlock(&f->sem); +} + +void +__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *this = frag_first(&f->fragtree); + uint32_t lastofs = 0; + int buggy = 0; + + printk(JFFS2_DBG_MSG_PREFIX " dump fragtree of ino #%u\n", f->inocache->ino); + 
while(this) { + if (this->node) + printk(JFFS2_DBG "frag %#04x-%#04x: %#08x(%d) on flash (*%p), left (%p), right (%p), parent (%p)\n", + this->ofs, this->ofs+this->size, ref_offset(this->node->raw), + ref_flags(this->node->raw), this, frag_left(this), frag_right(this), + frag_parent(this)); + else + printk(JFFS2_DBG "frag %#04x-%#04x: hole (*%p). left (%p), right (%p), parent (%p)\n", + this->ofs, this->ofs+this->size, this, frag_left(this), + frag_right(this), frag_parent(this)); + if (this->ofs != lastofs) + buggy = 1; + lastofs = this->ofs + this->size; + this = frag_next(this); + } + + if (f->metadata) + printk(JFFS2_DBG "metadata at 0x%08x\n", ref_offset(f->metadata->raw)); + + if (buggy) { + JFFS2_ERROR("frag tree got a hole in it.\n"); + BUG(); + } +} + +#define JFFS2_BUFDUMP_BYTES_PER_LINE 32 +void +__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs) +{ + int skip; + int i; + + printk(JFFS2_DBG_MSG_PREFIX " dump from offset %#08x to offset %#08x (%x bytes).\n", + offs, offs + len, len); + i = skip = offs % JFFS2_BUFDUMP_BYTES_PER_LINE; + offs = offs & ~(JFFS2_BUFDUMP_BYTES_PER_LINE - 1); + + if (skip != 0) + printk(JFFS2_DBG "%#08x: ", offs); + + while (skip--) + printk(" "); + + while (i < len) { + if ((i % JFFS2_BUFDUMP_BYTES_PER_LINE) == 0 && i != len -1) { + if (i != 0) + printk("\n"); + offs += JFFS2_BUFDUMP_BYTES_PER_LINE; + printk(JFFS2_DBG "%0#8x: ", offs); + } + + printk("%02x ", buf[i]); + + i += 1; + } + + printk("\n"); +} + +/* + * Dump a JFFS2 node. + */ +void +__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs) +{ + union jffs2_node_union node; + int len = sizeof(union jffs2_node_union); + size_t retlen; + uint32_t crc; + int ret; + + printk(JFFS2_DBG_MSG_PREFIX " dump node at offset %#08x.\n", ofs); + + ret = jffs2_flash_read(c, ofs, len, &retlen, (unsigned char *)&node); + if (ret || (retlen != len)) { + JFFS2_ERROR("read %d bytes failed or short. 
ret %d, retlen %zd.\n", + len, ret, retlen); + return; + } + + printk(JFFS2_DBG "magic:\t%#04x\n", je16_to_cpu(node.u.magic)); + printk(JFFS2_DBG "nodetype:\t%#04x\n", je16_to_cpu(node.u.nodetype)); + printk(JFFS2_DBG "totlen:\t%#08x\n", je32_to_cpu(node.u.totlen)); + printk(JFFS2_DBG "hdr_crc:\t%#08x\n", je32_to_cpu(node.u.hdr_crc)); + + crc = crc32(0, &node.u, sizeof(node.u) - 4); + if (crc != je32_to_cpu(node.u.hdr_crc)) { + JFFS2_ERROR("wrong common header CRC.\n"); + return; + } + + if (je16_to_cpu(node.u.magic) != JFFS2_MAGIC_BITMASK && + je16_to_cpu(node.u.magic) != JFFS2_OLD_MAGIC_BITMASK) + { + JFFS2_ERROR("wrong node magic: %#04x instead of %#04x.\n", + je16_to_cpu(node.u.magic), JFFS2_MAGIC_BITMASK); + return; + } + + switch(je16_to_cpu(node.u.nodetype)) { + + case JFFS2_NODETYPE_INODE: + + printk(JFFS2_DBG "the node is inode node\n"); + printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.i.ino)); + printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.i.version)); + printk(JFFS2_DBG "mode:\t%#08x\n", node.i.mode.m); + printk(JFFS2_DBG "uid:\t%#04x\n", je16_to_cpu(node.i.uid)); + printk(JFFS2_DBG "gid:\t%#04x\n", je16_to_cpu(node.i.gid)); + printk(JFFS2_DBG "isize:\t%#08x\n", je32_to_cpu(node.i.isize)); + printk(JFFS2_DBG "atime:\t%#08x\n", je32_to_cpu(node.i.atime)); + printk(JFFS2_DBG "mtime:\t%#08x\n", je32_to_cpu(node.i.mtime)); + printk(JFFS2_DBG "ctime:\t%#08x\n", je32_to_cpu(node.i.ctime)); + printk(JFFS2_DBG "offset:\t%#08x\n", je32_to_cpu(node.i.offset)); + printk(JFFS2_DBG "csize:\t%#08x\n", je32_to_cpu(node.i.csize)); + printk(JFFS2_DBG "dsize:\t%#08x\n", je32_to_cpu(node.i.dsize)); + printk(JFFS2_DBG "compr:\t%#02x\n", node.i.compr); + printk(JFFS2_DBG "usercompr:\t%#02x\n", node.i.usercompr); + printk(JFFS2_DBG "flags:\t%#04x\n", je16_to_cpu(node.i.flags)); + printk(JFFS2_DBG "data_crc:\t%#08x\n", je32_to_cpu(node.i.data_crc)); + printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.i.node_crc)); + + crc = crc32(0, &node.i, sizeof(node.i) - 8); + if (crc != je32_to_cpu(node.i.node_crc)) { + JFFS2_ERROR("wrong node header CRC.\n"); + return; + } + break; + + case JFFS2_NODETYPE_DIRENT: + + printk(JFFS2_DBG "the node is dirent node\n"); + printk(JFFS2_DBG "pino:\t%#08x\n", je32_to_cpu(node.d.pino)); + printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.d.version)); + printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.d.ino)); + printk(JFFS2_DBG "mctime:\t%#08x\n", je32_to_cpu(node.d.mctime)); + printk(JFFS2_DBG "nsize:\t%#02x\n", node.d.nsize); + printk(JFFS2_DBG "type:\t%#02x\n", node.d.type); + printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.d.node_crc)); + printk(JFFS2_DBG "name_crc:\t%#08x\n", je32_to_cpu(node.d.name_crc)); + + node.d.name[node.d.nsize] = '\0'; + printk(JFFS2_DBG "name:\t\"%s\"\n", node.d.name); + + crc = crc32(0, &node.d, sizeof(node.d) - 8); + if (crc != je32_to_cpu(node.d.node_crc)) { + JFFS2_ERROR("wrong node header CRC.\n"); + return; + } + break; + + default: + printk(JFFS2_DBG "node type is unknown\n"); + break; + } +} +#endif /* JFFS2_DBG_DUMPS || JFFS2_DBG_PARANOIA_CHECKS */ diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h new file mode 100644 index 00000000..c4f8eef5 --- /dev/null +++ b/fs/jffs2/debug.h @@ -0,0 +1,291 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
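The node dump above validates a node in two steps: check the magic word, then recompute crc32() over the common header minus its last four bytes (the hdr_crc field itself) and compare. The sketch below reproduces that shape in standalone C. It is a model under stated assumptions, not kernel code: crc32_le() here is a plain reflected CRC-32 with the caller-supplied seed and no final inversion (the convention the kernel crc32() calls above use), the struct assumes a little-endian host with no padding, and the 0x2003 cleanmarker node type is taken from the JFFS2 on-media format rather than from this patch.

#include <stdio.h>
#include <stdint.h>

/* Minimal little-endian model of the JFFS2 common node header. */
struct unknown_node {
	uint16_t magic;		/* 0x1985 */
	uint16_t nodetype;
	uint32_t totlen;
	uint32_t hdr_crc;	/* CRC over the first 8 bytes */
};

/* Reflected CRC-32 (poly 0xEDB88320), seed passed in, no final xor. */
static uint32_t crc32_le(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}

int main(void)
{
	struct unknown_node n = {
		.magic = 0x1985,
		.nodetype = 0x2003,	/* cleanmarker (assumed constant) */
		.totlen = 12,
	};

	/* hdr_crc covers everything except the hdr_crc word itself. */
	n.hdr_crc = crc32_le(0, &n, sizeof(n) - 4);

	uint32_t check = crc32_le(0, &n, sizeof(n) - 4);
	printf("magic %#06x nodetype %#06x hdr_crc %#010x -> %s\n",
	       n.magic, n.nodetype, n.hdr_crc,
	       check == n.hdr_crc ? "OK" : "wrong common header CRC");
	return 0;
}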
+ * + */ + +#ifndef _JFFS2_DEBUG_H_ +#define _JFFS2_DEBUG_H_ + +#include + +#ifndef CONFIG_JFFS2_FS_DEBUG +#define CONFIG_JFFS2_FS_DEBUG 0 +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 0 +/* Enable "paranoia" checks and dumps */ +#define JFFS2_DBG_PARANOIA_CHECKS +#define JFFS2_DBG_DUMPS + +/* + * By defining/undefining the below macros one may select debugging messages + * fro specific JFFS2 subsystems. + */ +#define JFFS2_DBG_READINODE_MESSAGES +#define JFFS2_DBG_FRAGTREE_MESSAGES +#define JFFS2_DBG_DENTLIST_MESSAGES +#define JFFS2_DBG_NODEREF_MESSAGES +#define JFFS2_DBG_INOCACHE_MESSAGES +#define JFFS2_DBG_SUMMARY_MESSAGES +#define JFFS2_DBG_FSBUILD_MESSAGES +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 1 +#define JFFS2_DBG_FRAGTREE2_MESSAGES +#define JFFS2_DBG_READINODE2_MESSAGES +#define JFFS2_DBG_MEMALLOC_MESSAGES +#endif + +/* Sanity checks are supposed to be light-weight and enabled by default */ +#define JFFS2_DBG_SANITY_CHECKS + +/* + * Dx() are mainly used for debugging messages, they must go away and be + * superseded by nicer dbg_xxx() macros... + */ +#if CONFIG_JFFS2_FS_DEBUG > 0 +#define D1(x) x +#else +#define D1(x) +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 1 +#define D2(x) x +#else +#define D2(x) +#endif + +/* The prefixes of JFFS2 messages */ +#define JFFS2_DBG_PREFIX "[JFFS2 DBG]" +#define JFFS2_ERR_PREFIX "JFFS2 error:" +#define JFFS2_WARN_PREFIX "JFFS2 warning:" +#define JFFS2_NOTICE_PREFIX "JFFS2 notice:" + +#define JFFS2_ERR KERN_ERR +#define JFFS2_WARN KERN_WARNING +#define JFFS2_NOT KERN_NOTICE +#define JFFS2_DBG KERN_DEBUG + +#define JFFS2_DBG_MSG_PREFIX JFFS2_DBG JFFS2_DBG_PREFIX +#define JFFS2_ERR_MSG_PREFIX JFFS2_ERR JFFS2_ERR_PREFIX +#define JFFS2_WARN_MSG_PREFIX JFFS2_WARN JFFS2_WARN_PREFIX +#define JFFS2_NOTICE_MSG_PREFIX JFFS2_NOT JFFS2_NOTICE_PREFIX + +/* JFFS2 message macros */ +#define JFFS2_ERROR(fmt, ...) \ + do { \ + printk(JFFS2_ERR_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +#define JFFS2_WARNING(fmt, ...) \ + do { \ + printk(JFFS2_WARN_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +#define JFFS2_NOTICE(fmt, ...) \ + do { \ + printk(JFFS2_NOTICE_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +#define JFFS2_DEBUG(fmt, ...) \ + do { \ + printk(JFFS2_DBG_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +/* + * We split our debugging messages on several parts, depending on the JFFS2 + * subsystem the message belongs to. + */ +/* Read inode debugging messages */ +#ifdef JFFS2_DBG_READINODE_MESSAGES +#define dbg_readinode(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode(fmt, ...) +#endif +#ifdef JFFS2_DBG_READINODE2_MESSAGES +#define dbg_readinode2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode2(fmt, ...) +#endif + +/* Fragtree build debugging messages */ +#ifdef JFFS2_DBG_FRAGTREE_MESSAGES +#define dbg_fragtree(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fragtree(fmt, ...) +#endif +#ifdef JFFS2_DBG_FRAGTREE2_MESSAGES +#define dbg_fragtree2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fragtree2(fmt, ...) +#endif + +/* Directory entry list manilulation debugging messages */ +#ifdef JFFS2_DBG_DENTLIST_MESSAGES +#define dbg_dentlist(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_dentlist(fmt, ...) 
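The dbg_* macros above all use one compile-time pattern: if the subsystem's JFFS2_DBG_*_MESSAGES symbol is defined, the macro forwards to JFFS2_DEBUG(); otherwise it expands to nothing, so the call sites disappear from a non-debug build. A minimal standalone sketch of the same pattern, using fprintf instead of printk (all names here are invented for illustration):

#include <stdio.h>

/* Comment this out to compile the dbg_foo() call sites away entirely. */
#define FOO_DEBUG_MESSAGES

#define MY_DEBUG(fmt, ...) \
	do { \
		fprintf(stderr, "[DBG] %s: " fmt, __func__, ##__VA_ARGS__); \
	} while (0)

#ifdef FOO_DEBUG_MESSAGES
#define dbg_foo(fmt, ...) MY_DEBUG(fmt, ##__VA_ARGS__)
#else
#define dbg_foo(fmt, ...)
#endif

int main(void)
{
	dbg_foo("value is %d\n", 42);	/* vanishes if FOO_DEBUG_MESSAGES is undefined */
	return 0;
}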
+#endif + +/* Print the messages about manipulating node_refs */ +#ifdef JFFS2_DBG_NODEREF_MESSAGES +#define dbg_noderef(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_noderef(fmt, ...) +#endif + +/* Manipulations with the list of inodes (JFFS2 inocache) */ +#ifdef JFFS2_DBG_INOCACHE_MESSAGES +#define dbg_inocache(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_inocache(fmt, ...) +#endif + +/* Summary debugging messages */ +#ifdef JFFS2_DBG_SUMMARY_MESSAGES +#define dbg_summary(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_summary(fmt, ...) +#endif + +/* File system build messages */ +#ifdef JFFS2_DBG_FSBUILD_MESSAGES +#define dbg_fsbuild(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fsbuild(fmt, ...) +#endif + +/* Watch the object allocations */ +#ifdef JFFS2_DBG_MEMALLOC_MESSAGES +#define dbg_memalloc(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_memalloc(fmt, ...) +#endif + +/* Watch the XATTR subsystem */ +#ifdef JFFS2_DBG_XATTR_MESSAGES +#define dbg_xattr(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_xattr(fmt, ...) +#endif + +/* "Sanity" checks */ +void +__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); + +/* "Paranoia" checks */ +void +__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f); +void +__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f); +void +__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, + uint32_t ofs, int len); + +/* "Dump" functions */ +void +__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c); +void +__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c); +void +__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f); +void +__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f); +void +__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs); +void +__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs); + +#ifdef JFFS2_DBG_PARANOIA_CHECKS +#define jffs2_dbg_fragtree_paranoia_check(f) \ + __jffs2_dbg_fragtree_paranoia_check(f) +#define jffs2_dbg_fragtree_paranoia_check_nolock(f) \ + __jffs2_dbg_fragtree_paranoia_check_nolock(f) +#define jffs2_dbg_acct_paranoia_check(c, jeb) \ + __jffs2_dbg_acct_paranoia_check(c,jeb) +#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb) \ + __jffs2_dbg_acct_paranoia_check_nolock(c,jeb) +#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len) \ + __jffs2_dbg_prewrite_paranoia_check(c, ofs, len) +#else +#define jffs2_dbg_fragtree_paranoia_check(f) +#define jffs2_dbg_fragtree_paranoia_check_nolock(f) +#define jffs2_dbg_acct_paranoia_check(c, jeb) +#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb) +#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len) +#endif /* !JFFS2_PARANOIA_CHECKS */ + +#ifdef JFFS2_DBG_DUMPS +#define jffs2_dbg_dump_jeb(c, jeb) \ + __jffs2_dbg_dump_jeb(c, jeb); +#define 
jffs2_dbg_dump_jeb_nolock(jeb) \ + __jffs2_dbg_dump_jeb_nolock(jeb); +#define jffs2_dbg_dump_block_lists(c) \ + __jffs2_dbg_dump_block_lists(c) +#define jffs2_dbg_dump_block_lists_nolock(c) \ + __jffs2_dbg_dump_block_lists_nolock(c) +#define jffs2_dbg_dump_fragtree(f) \ + __jffs2_dbg_dump_fragtree(f); +#define jffs2_dbg_dump_fragtree_nolock(f) \ + __jffs2_dbg_dump_fragtree_nolock(f); +#define jffs2_dbg_dump_buffer(buf, len, offs) \ + __jffs2_dbg_dump_buffer(*buf, len, offs); +#define jffs2_dbg_dump_node(c, ofs) \ + __jffs2_dbg_dump_node(c, ofs); +#else +#define jffs2_dbg_dump_jeb(c, jeb) +#define jffs2_dbg_dump_jeb_nolock(jeb) +#define jffs2_dbg_dump_block_lists(c) +#define jffs2_dbg_dump_block_lists_nolock(c) +#define jffs2_dbg_dump_fragtree(f) +#define jffs2_dbg_dump_fragtree_nolock(f) +#define jffs2_dbg_dump_buffer(buf, len, offs) +#define jffs2_dbg_dump_node(c, ofs) +#endif /* !JFFS2_DBG_DUMPS */ + +#ifdef JFFS2_DBG_SANITY_CHECKS +#define jffs2_dbg_acct_sanity_check(c, jeb) \ + __jffs2_dbg_acct_sanity_check(c, jeb) +#define jffs2_dbg_acct_sanity_check_nolock(c, jeb) \ + __jffs2_dbg_acct_sanity_check_nolock(c, jeb) +#else +#define jffs2_dbg_acct_sanity_check(c, jeb) +#define jffs2_dbg_acct_sanity_check_nolock(c, jeb) +#endif /* !JFFS2_DBG_SANITY_CHECKS */ + +#endif /* _JFFS2_DEBUG_H_ */ diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c new file mode 100644 index 00000000..4bca6a2e --- /dev/null +++ b/fs/jffs2/dir.c @@ -0,0 +1,873 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "jffs2_fs_i.h" +#include "jffs2_fs_sb.h" +#include +#include "nodelist.h" + +static int jffs2_readdir (struct file *, void *, filldir_t); + +static int jffs2_create (struct inode *,struct dentry *,int, + struct nameidata *); +static struct dentry *jffs2_lookup (struct inode *,struct dentry *, + struct nameidata *); +static int jffs2_link (struct dentry *,struct inode *,struct dentry *); +static int jffs2_unlink (struct inode *,struct dentry *); +static int jffs2_symlink (struct inode *,struct dentry *,const char *); +static int jffs2_mkdir (struct inode *,struct dentry *,int); +static int jffs2_rmdir (struct inode *,struct dentry *); +static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t); +static int jffs2_rename (struct inode *, struct dentry *, + struct inode *, struct dentry *); + +const struct file_operations jffs2_dir_operations = +{ + .read = generic_read_dir, + .readdir = jffs2_readdir, + .unlocked_ioctl=jffs2_ioctl, + .fsync = jffs2_fsync, + .llseek = generic_file_llseek, +}; + + +const struct inode_operations jffs2_dir_inode_operations = +{ + .create = jffs2_create, + .lookup = jffs2_lookup, + .link = jffs2_link, + .unlink = jffs2_unlink, + .symlink = jffs2_symlink, + .mkdir = jffs2_mkdir, + .rmdir = jffs2_rmdir, + .mknod = jffs2_mknod, + .rename = jffs2_rename, + .check_acl = jffs2_check_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +/***********************************************************************/ + + +/* We keep the dirent list sorted in increasing order of name hash, + and we use the same hash function as the dentries. 
Makes this + nice and simple +*/ +static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, + struct nameidata *nd) +{ + struct jffs2_inode_info *dir_f; + struct jffs2_full_dirent *fd = NULL, *fd_list; + uint32_t ino = 0; + struct inode *inode = NULL; + + D1(printk(KERN_DEBUG "jffs2_lookup()\n")); + + if (target->d_name.len > JFFS2_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + dir_f = JFFS2_INODE_INFO(dir_i); + + mutex_lock(&dir_f->sem); + + /* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */ + for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) { + if (fd_list->nhash == target->d_name.hash && + (!fd || fd_list->version > fd->version) && + strlen(fd_list->name) == target->d_name.len && + !strncmp(fd_list->name, target->d_name.name, target->d_name.len)) { + fd = fd_list; + } + } + if (fd) + ino = fd->ino; + mutex_unlock(&dir_f->sem); + if (ino) { + inode = jffs2_iget(dir_i->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_WARNING "iget() failed for ino #%u\n", ino); + return ERR_CAST(inode); + } + } + + return d_splice_alias(inode, target); +} + +/***********************************************************************/ + + +static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct jffs2_inode_info *f; + struct inode *inode = filp->f_path.dentry->d_inode; + struct jffs2_full_dirent *fd; + unsigned long offset, curofs; + + D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino)); + + f = JFFS2_INODE_INFO(inode); + + offset = filp->f_pos; + + if (offset == 0) { + D1(printk(KERN_DEBUG "Dirent 0: \".\", ino #%lu\n", inode->i_ino)); + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + offset++; + } + if (offset == 1) { + unsigned long pino = parent_ino(filp->f_path.dentry); + D1(printk(KERN_DEBUG "Dirent 1: \"..\", ino #%lu\n", pino)); + if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) + goto out; + offset++; + } + + curofs=1; + mutex_lock(&f->sem); + for (fd = f->dents; fd; fd = fd->next) { + + curofs++; + /* First loop: curofs = 2; offset = 2 */ + if (curofs < offset) { + D2(printk(KERN_DEBUG "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", + fd->name, fd->ino, fd->type, curofs, offset)); + continue; + } + if (!fd->ino) { + D2(printk(KERN_DEBUG "Skipping deletion dirent \"%s\"\n", fd->name)); + offset++; + continue; + } + D2(printk(KERN_DEBUG "Dirent %ld: \"%s\", ino #%u, type %d\n", offset, fd->name, fd->ino, fd->type)); + if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) + break; + offset++; + } + mutex_unlock(&f->sem); + out: + filp->f_pos = offset; + return 0; +} + +/***********************************************************************/ + + +static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct jffs2_raw_inode *ri; + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + int ret; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + D1(printk(KERN_DEBUG "jffs2_create()\n")); + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + D1(printk(KERN_DEBUG "jffs2_new_inode() failed\n")); + jffs2_free_raw_inode(ri); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_file_inode_operations; + inode->i_fop = &jffs2_file_operations; + inode->i_mapping->a_ops = 
&jffs2_file_address_operations; + inode->i_mapping->nrpages = 0; + + f = JFFS2_INODE_INFO(inode); + dir_f = JFFS2_INODE_INFO(dir_i); + + /* jffs2_do_create() will want to lock it, _after_ reserving + space and taking c-alloc_sem. If we keep it locked here, + lockdep gets unhappy (although it's a false positive; + nothing else will be looking at this inode yet so there's + no chance of AB-BA deadlock involving its f->sem). */ + mutex_unlock(&f->sem); + + ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name); + if (ret) + goto fail; + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + + jffs2_free_raw_inode(ri); + + D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", + inode->i_ino, inode->i_mode, inode->i_nlink, + f->inocache->pino_nlink, inode->i_mapping->nrpages)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + jffs2_free_raw_inode(ri); + return ret; +} + +/***********************************************************************/ + + +static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(dentry->d_inode); + int ret; + uint32_t now = get_seconds(); + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, dead_f, now); + if (dead_f->inocache) + dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; + if (!ret) + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + return ret; +} +/***********************************************************************/ + + +static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_inode->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + int ret; + uint8_t type; + uint32_t now; + + /* Don't let people make hard links to bad inodes. */ + if (!f->inocache) + return -EIO; + + if (S_ISDIR(old_dentry->d_inode->i_mode)) + return -EPERM; + + /* XXX: This is ugly */ + type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + if (!type) type = DT_REG; + + now = get_seconds(); + ret = jffs2_do_link(c, dir_f, f->inocache->ino, type, dentry->d_name.name, dentry->d_name.len, now); + + if (!ret) { + mutex_lock(&f->sem); + old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + mutex_unlock(&f->sem); + d_instantiate(dentry, old_dentry->d_inode); + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + ihold(old_dentry->d_inode); + } + return ret; +} + +/***********************************************************************/ + +static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + uint32_t alloclen; + int ret, targetlen = strlen(target); + + /* FIXME: If you care. We'd need to use frags for the target + if it grows much more than this */ + if (targetlen > 254) + return -ENAMETOOLONG; + + ri = jffs2_alloc_raw_inode(); + + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + /* Try to reserve enough space for both node and dirent. 
+ * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, S_IFLNK | S_IRWXUGO, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_symlink_inode_operations; + + f = JFFS2_INODE_INFO(inode); + + inode->i_size = targetlen; + ri->isize = ri->dsize = ri->csize = cpu_to_je32(inode->i_size); + ri->totlen = cpu_to_je32(sizeof(*ri) + inode->i_size); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->compr = JFFS2_COMPR_NONE; + ri->data_crc = cpu_to_je32(crc32(0, target, targetlen)); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + + /* We use f->target field to store the target path. */ + f->target = kmemdup(target, targetlen + 1, GFP_KERNEL); + if (!f->target) { + printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); + + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + rd->type = DT_LNK; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. 
*/ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + + +static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + uint32_t alloclen; + int ret; + + mode |= S_IFDIR; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, + JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_dir_inode_operations; + inode->i_fop = &jffs2_dir_operations; + + f = JFFS2_INODE_INFO(inode); + + /* Directories get nlink 2 at start */ + inode->i_nlink = 2; + /* but ic->pino_nlink is the parent ino# */ + f->inocache->pino_nlink = dir_i->i_ino; + + ri->data_crc = cpu_to_je32(0); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + rd->type = DT_DIR; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. 
Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + inc_nlink(dir_i); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + +static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_full_dirent *fd; + int ret; + uint32_t now = get_seconds(); + + for (fd = f->dents ; fd; fd = fd->next) { + if (fd->ino) + return -ENOTEMPTY; + } + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, f, now); + if (!ret) { + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + clear_nlink(dentry->d_inode); + drop_nlink(dir_i); + } + return ret; +} + +static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, dev_t rdev) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + union jffs2_device_node dev; + int devlen = 0; + uint32_t alloclen; + int ret; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + if (S_ISBLK(mode) || S_ISCHR(mode)) + devlen = jffs2_encode_dev(&dev, rdev); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + inode->i_op = &jffs2_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + + f = JFFS2_INODE_INFO(inode); + + ri->dsize = ri->csize = cpu_to_je32(devlen); + ri->totlen = cpu_to_je32(sizeof(*ri) + devlen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->compr = JFFS2_COMPR_NONE; + ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen)); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + /* No data here. 
Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + + /* XXX: This is ugly. */ + rd->type = (mode & S_IFMT) >> 12; + + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + +static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, + struct inode *new_dir_i, struct dentry *new_dentry) +{ + int ret; + struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb); + struct jffs2_inode_info *victim_f = NULL; + uint8_t type; + uint32_t now; + + /* The VFS will check for us and prevent trying to rename a + * file over a directory and vice versa, but if it's a directory, + * the VFS can't check whether the victim is empty. The filesystem + * needs to do that for itself. + */ + if (new_dentry->d_inode) { + victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); + if (S_ISDIR(new_dentry->d_inode->i_mode)) { + struct jffs2_full_dirent *fd; + + mutex_lock(&victim_f->sem); + for (fd = victim_f->dents; fd; fd = fd->next) { + if (fd->ino) { + mutex_unlock(&victim_f->sem); + return -ENOTEMPTY; + } + } + mutex_unlock(&victim_f->sem); + } + } + + /* XXX: We probably ought to alloc enough space for + both nodes at the same time. Writing the new link, + then getting -ENOSPC, is quite bad :) + */ + + /* Make a hard link */ + + /* XXX: This is ugly */ + type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + if (!type) type = DT_REG; + + now = get_seconds(); + ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i), + old_dentry->d_inode->i_ino, type, + new_dentry->d_name.name, new_dentry->d_name.len, now); + + if (ret) + return ret; + + if (victim_f) { + /* There was a victim. 
Kill it off nicely */ + drop_nlink(new_dentry->d_inode); + /* Don't oops if the victim was a dirent pointing to an + inode which didn't exist. */ + if (victim_f->inocache) { + mutex_lock(&victim_f->sem); + if (S_ISDIR(new_dentry->d_inode->i_mode)) + victim_f->inocache->pino_nlink = 0; + else + victim_f->inocache->pino_nlink--; + mutex_unlock(&victim_f->sem); + } + } + + /* If it was a directory we moved, and there was no victim, + increase i_nlink on its new parent */ + if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f) + inc_nlink(new_dir_i); + + /* Unlink the original */ + ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i), + old_dentry->d_name.name, old_dentry->d_name.len, NULL, now); + + /* We don't touch inode->i_nlink */ + + if (ret) { + /* Oh shit. We really ought to make a single node which can do both atomically */ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + mutex_lock(&f->sem); + inc_nlink(old_dentry->d_inode); + if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode)) + f->inocache->pino_nlink++; + mutex_unlock(&f->sem); + + printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); + /* Might as well let the VFS know */ + d_instantiate(new_dentry, old_dentry->d_inode); + ihold(old_dentry->d_inode); + new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + return ret; + } + + if (S_ISDIR(old_dentry->d_inode->i_mode)) + drop_nlink(old_dir_i); + + new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + + return 0; +} + diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c new file mode 100644 index 00000000..e513f191 --- /dev/null +++ b/fs/jffs2/erase.c @@ -0,0 +1,502 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +struct erase_priv_struct { + struct jffs2_eraseblock *jeb; + struct jffs2_sb_info *c; +}; + +#ifndef __ECOS +static void jffs2_erase_callback(struct erase_info *); +#endif +static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); +static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); + +static void jffs2_erase_block(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + int ret; + uint32_t bad_offset; +#ifdef __ECOS + ret = jffs2_flash_erase(c, jeb); + if (!ret) { + jffs2_erase_succeeded(c, jeb); + return; + } + bad_offset = jeb->offset; +#else /* Linux */ + struct erase_info *instr; + + D1(printk(KERN_DEBUG "jffs2_erase_block(): erase block %#08x (range %#08x-%#08x)\n", + jeb->offset, jeb->offset, jeb->offset + c->sector_size)); + instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); + if (!instr) { + printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. 
Refiling block for later\n"); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + + memset(instr, 0, sizeof(*instr)); + + instr->mtd = c->mtd; + instr->addr = jeb->offset; + instr->len = c->sector_size; + instr->callback = jffs2_erase_callback; + instr->priv = (unsigned long)(&instr[1]); + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; + + ((struct erase_priv_struct *)instr->priv)->jeb = jeb; + ((struct erase_priv_struct *)instr->priv)->c = c; + + ret = c->mtd->erase(c->mtd, instr); + if (!ret) + return; + + bad_offset = instr->fail_addr; + kfree(instr); +#endif /* __ECOS */ + + if (ret == -ENOMEM || ret == -EAGAIN) { + /* Erase failed immediately. Refile it on the list */ + D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + + if (ret == -EROFS) + printk(KERN_WARNING "Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n", jeb->offset); + else + printk(KERN_WARNING "Erase at 0x%08x failed immediately: errno %d\n", jeb->offset, ret); + + jffs2_erase_failed(c, jeb, bad_offset); +} + +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) +{ + struct jffs2_eraseblock *jeb; + int work_done = 0; + + mutex_lock(&c->erase_free_sem); + + spin_lock(&c->erase_completion_lock); + + while (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + + if (!list_empty(&c->erase_complete_list)) { + jeb = list_entry(c->erase_complete_list.next, struct jffs2_eraseblock, list); + list_move(&jeb->list, &c->erase_checking_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + jffs2_mark_erased_block(c, jeb); + + work_done++; + if (!--count) { + D1(printk(KERN_DEBUG "Count reached. 
jffs2_erase_pending_blocks leaving\n")); + goto done; + } + + } else if (!list_empty(&c->erase_pending_list)) { + jeb = list_entry(c->erase_pending_list.next, struct jffs2_eraseblock, list); + D1(printk(KERN_DEBUG "Starting erase of pending block 0x%08x\n", jeb->offset)); + list_del(&jeb->list); + c->erasing_size += c->sector_size; + c->wasted_size -= jeb->wasted_size; + c->free_size -= jeb->free_size; + c->used_size -= jeb->used_size; + c->dirty_size -= jeb->dirty_size; + jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0; + jffs2_free_jeb_node_refs(c, jeb); + list_add(&jeb->list, &c->erasing_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + + jffs2_erase_block(c, jeb); + + } else { + BUG(); + } + + /* Be nice */ + cond_resched(); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + } + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + done: + D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); + return work_done; +} + +static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move_tail(&jeb->list, &c->erase_complete_list); + /* Wake the GC thread to mark them clean */ + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); +} + +static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) +{ + /* For NAND, if the failure did not occur at the device level for a + specific physical page, don't bother updating the bad block table. */ + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { + /* We had a device-level failure to erase. Let's see if we've + failed too many times. */ + if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { + /* We'd like to give this block another try. */ + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + } + + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + c->erasing_size -= c->sector_size; + c->bad_size += c->sector_size; + list_move(&jeb->list, &c->bad_list); + c->nr_erasing_blocks--; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); +} + +#ifndef __ECOS +static void jffs2_erase_callback(struct erase_info *instr) +{ + struct erase_priv_struct *priv = (void *)instr->priv; + + if(instr->state != MTD_ERASE_DONE) { + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); + jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); + } else { + jffs2_erase_succeeded(priv->c, priv->jeb); + } + kfree(instr); +} +#endif /* !__ECOS */ + +/* Hmmm. Maybe we should accept the extra space it takes and make + this a standard doubly-linked list? 
*/ +static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, + struct jffs2_raw_node_ref *ref, struct jffs2_eraseblock *jeb) +{ + struct jffs2_inode_cache *ic = NULL; + struct jffs2_raw_node_ref **prev; + + prev = &ref->next_in_ino; + + /* Walk the inode's list once, removing any nodes from this eraseblock */ + while (1) { + if (!(*prev)->next_in_ino) { + /* We're looking at the jffs2_inode_cache, which is + at the end of the linked list. Stash it and continue + from the beginning of the list */ + ic = (struct jffs2_inode_cache *)(*prev); + prev = &ic->nodes; + continue; + } + + if (SECTOR_ADDR((*prev)->flash_offset) == jeb->offset) { + /* It's in the block we're erasing */ + struct jffs2_raw_node_ref *this; + + this = *prev; + *prev = this->next_in_ino; + this->next_in_ino = NULL; + + if (this == ref) + break; + + continue; + } + /* Not to be deleted. Skip */ + prev = &((*prev)->next_in_ino); + } + + /* PARANOIA */ + if (!ic) { + JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref" + " not found in remove_node_refs()!!\n"); + return; + } + + D1(printk(KERN_DEBUG "Removed nodes in range 0x%08x-0x%08x from ino #%u\n", + jeb->offset, jeb->offset + c->sector_size, ic->ino)); + + D2({ + int i=0; + struct jffs2_raw_node_ref *this; + printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n"); + + this = ic->nodes; + + printk(KERN_DEBUG); + while(this) { + printk(KERN_CONT "0x%08x(%d)->", + ref_offset(this), ref_flags(this)); + if (++i == 5) { + printk(KERN_DEBUG); + i=0; + } + this = this->next_in_ino; + } + printk(KERN_CONT "\n"); + }); + + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) + jffs2_del_ino_cache(c, ic); + } +} + +void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *block, *ref; + D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset)); + + block = ref = jeb->first_node; + + while (ref) { + if (ref->flash_offset == REF_LINK_NODE) { + ref = ref->next_in_ino; + jffs2_free_refblock(block); + block = ref; + continue; + } + if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino) + jffs2_remove_node_refs_from_ino_list(c, ref, jeb); + /* else it was a non-inode node or already removed, so don't bother */ + + ref++; + } + jeb->first_node = jeb->last_node = NULL; +} + +static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset) +{ + void *ebuf; + uint32_t ofs; + size_t retlen; + int ret = -EIO; + + if (c->mtd->point) { + unsigned long *wordebuf; + + ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, + &retlen, &ebuf, NULL); + if (ret) { + D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); + goto do_flash_read; + } + if (retlen < c->sector_size) { + /* Don't muck about if it won't let us point to the whole erase sector */ + D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); + c->mtd->unpoint(c->mtd, jeb->offset, retlen); + goto do_flash_read; + } + wordebuf = ebuf-sizeof(*wordebuf); + retlen /= sizeof(*wordebuf); + do { + if (*++wordebuf != ~0) + break; + } while(--retlen); + c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size); + if (retlen) { + printk(KERN_WARNING "Newly-erased block contained word 
0x%lx at offset 0x%08tx\n", + *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); + return -EIO; + } + return 0; + } + do_flash_read: + ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ebuf) { + printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", jeb->offset); + return -EAGAIN; + } + + D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset)); + + for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) { + uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs); + int i; + + *bad_offset = ofs; + + ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf); + if (ret) { + printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); + ret = -EIO; + goto fail; + } + if (retlen != readlen) { + printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen); + ret = -EIO; + goto fail; + } + for (i=0; i<readlen; i += sizeof(unsigned long)) { + /* It's OK. We know it's properly aligned */ + unsigned long *datum = ebuf + i; + if (*datum + 1) { + *bad_offset += i; + printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", + *datum, *bad_offset); + ret = -EIO; + goto fail; + } + } + ofs += readlen; + cond_resched(); + } + ret = 0; +fail: + kfree(ebuf); + return ret; +} + +static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + size_t retlen; + int ret; + uint32_t uninitialized_var(bad_offset); + + switch (jffs2_block_check_erase(c, jeb, &bad_offset)) { + case -EAGAIN: goto refile; + case -EIO: goto filebad; + } + + /* Write the erase complete marker */ + D1(printk(KERN_DEBUG "Writing erased marker to block at 0x%08x\n", jeb->offset)); + bad_offset = jeb->offset; + + /* Cleanmarker in oob area or no cleanmarker at all ? */ + if (jffs2_cleanmarker_oob(c) || c->cleanmarker_size == 0) { + + if (jffs2_cleanmarker_oob(c)) { + if (jffs2_write_nand_cleanmarker(c, jeb)) + goto filebad; + } + } else { + + struct kvec vecs[1]; + struct jffs2_unknown_node marker = { + .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = cpu_to_je32(c->cleanmarker_size) + }; + + jffs2_prealloc_raw_node_refs(c, jeb, 1); + + marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4)); + + vecs[0].iov_base = (unsigned char *) &marker; + vecs[0].iov_len = sizeof(marker); + ret = jffs2_flash_direct_writev(c, vecs, 1, jeb->offset, &retlen); + + if (ret || retlen != sizeof(marker)) { + if (ret) + printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n", + jeb->offset, ret); + else + printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", + jeb->offset, sizeof(marker), retlen); + + goto filebad; + } + } + /* Everything else got zeroed before the erase */ + jeb->free_size = c->sector_size; + + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + + c->erasing_size -= c->sector_size; + c->free_size += c->sector_size; + + /* Account for cleanmarker now, if it's in-band */ + if (c->cleanmarker_size && !jffs2_cleanmarker_oob(c)) + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL); + + list_move_tail(&jeb->list, &c->free_list); + c->nr_erasing_blocks--; + c->nr_free_blocks++; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); + return; + +filebad: + jffs2_erase_failed(c, jeb, bad_offset); + return; + +refile: + /* Stick it back on the list from whence it came and come back later */ + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + list_move(&jeb->list, &c->erase_complete_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; +} diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c new file mode 100644 index 00000000..1c0a08d7 --- /dev/null +++ b/fs/jffs2/file.c @@ -0,0 +1,321 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc.
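jffs2_block_check_erase() above decides whether an erase actually took by reading the block back and checking that every word is all-ones; the "*datum + 1" test is just a compact way of asking whether a word equals ~0. Below is a userspace model of that scan over an in-memory buffer; the buffer, sizes, and the find_unerased() helper are invented for illustration, and there is no MTD involved.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

/* Return the byte offset of the first not-fully-erased word, or -1 if the
 * whole buffer reads back as 0xFF -- the same check applied above to each
 * freshly erased block. Assumes buf is word-aligned, as the kernel code does. */
static long find_unerased(const unsigned char *buf, size_t len)
{
	const unsigned long *datum = (const unsigned long *)buf;
	size_t nwords = len / sizeof(*datum);

	for (size_t i = 0; i < nwords; i++)
		if (datum[i] + 1)	/* word != ~0UL, i.e. not all 0xFF */
			return (long)(i * sizeof(*datum));
	return -1;
}

int main(void)
{
	unsigned char block[4096];

	memset(block, 0xff, sizeof(block));
	printf("clean block: first bad offset %ld\n", find_unerased(block, sizeof(block)));

	block[100] = 0x42;	/* simulate a word that failed to erase */
	printf("dirty block: first bad offset %ld\n", find_unerased(block, sizeof(block)));
	return 0;
}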
+ * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdata); +static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +static int jffs2_readpage (struct file *filp, struct page *pg); + +int jffs2_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + + /* Trigger GC to flush any pending writes for this inode */ + jffs2_flush_wbuf_gc(c, inode->i_ino); + + return 0; +} + +const struct file_operations jffs2_file_operations = +{ + .llseek = generic_file_llseek, + .open = generic_file_open, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl=jffs2_ioctl, + .mmap = generic_file_readonly_mmap, + .fsync = jffs2_fsync, + .splice_read = generic_file_splice_read, +}; + +/* jffs2_file_inode_operations */ + +const struct inode_operations jffs2_file_inode_operations = +{ + .check_acl = jffs2_check_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +const struct address_space_operations jffs2_file_address_operations = +{ + .readpage = jffs2_readpage, + .write_begin = jffs2_write_begin, + .write_end = jffs2_write_end, +}; + +static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + unsigned char *pg_buf; + int ret; + + D2(printk(KERN_DEBUG "jffs2_do_readpage_nolock(): ino #%lu, page at offset 0x%lx\n", inode->i_ino, pg->index << PAGE_CACHE_SHIFT)); + + BUG_ON(!PageLocked(pg)); + + pg_buf = kmap(pg); + /* FIXME: Can kmap fail? 
*/ + + ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); + + if (ret) { + ClearPageUptodate(pg); + SetPageError(pg); + } else { + SetPageUptodate(pg); + ClearPageError(pg); + } + + flush_dcache_page(pg); + kunmap(pg); + + D2(printk(KERN_DEBUG "readpage finished\n")); + return ret; +} + +int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) +{ + int ret = jffs2_do_readpage_nolock(inode, pg); + unlock_page(pg); + return ret; +} + + +static int jffs2_readpage (struct file *filp, struct page *pg) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + int ret; + + mutex_lock(&f->sem); + ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + mutex_unlock(&f->sem); + return ret; +} + +static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *pg; + struct inode *inode = mapping->host; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + uint32_t pageofs = index << PAGE_CACHE_SHIFT; + int ret = 0; + + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) + return -ENOMEM; + *pagep = pg; + + D1(printk(KERN_DEBUG "jffs2_write_begin()\n")); + + if (pageofs > inode->i_size) { + /* Make new hole frag from old EOF to new page */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; + struct jffs2_full_dnode *fn; + uint32_t alloc_len; + + D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", + (unsigned int)inode->i_size, pageofs)); + + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + goto out_page; + + mutex_lock(&f->sem); + memset(&ri, 0, sizeof(ri)); + + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri)); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(inode->i_mode); + ri.uid = cpu_to_je16(inode->i_uid); + ri.gid = cpu_to_je16(inode->i_gid); + ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); + ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds()); + ri.offset = cpu_to_je32(inode->i_size); + ri.dsize = cpu_to_je32(pageofs - inode->i_size); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(0); + + fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL); + + if (IS_ERR(fn)) { + ret = PTR_ERR(fn); + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); + goto out_page; + } + ret = jffs2_add_full_dnode_to_inode(c, f, fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + if (ret) { + D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n", ret)); + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); + goto out_page; + } + jffs2_complete_reservation(c); + inode->i_size = pageofs; + mutex_unlock(&f->sem); + } + + /* + * Read in the page if it wasn't already present. Cannot optimize away + * the whole page write case until jffs2_write_end can handle the + * case of a short-copy. 
+ */ + if (!PageUptodate(pg)) { + mutex_lock(&f->sem); + ret = jffs2_do_readpage_nolock(inode, pg); + mutex_unlock(&f->sem); + if (ret) + goto out_page; + } + D1(printk(KERN_DEBUG "end write_begin(). pg->flags %lx\n", pg->flags)); + return ret; + +out_page: + unlock_page(pg); + page_cache_release(pg); + return ret; +} + +static int jffs2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdata) +{ + /* Actually commit the write from the page cache page we're looking at. + * For now, we write the full page out each time. It sucks, but it's simple + */ + struct inode *inode = mapping->host; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode *ri; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + unsigned aligned_start = start & ~3; + int ret = 0; + uint32_t writtenlen = 0; + + D1(printk(KERN_DEBUG "jffs2_write_end(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", + inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags)); + + /* We need to avoid deadlock with page_cache_read() in + jffs2_garbage_collect_pass(). So the page must be + up to date to prevent page_cache_read() from trying + to re-lock it. */ + BUG_ON(!PageUptodate(pg)); + + if (end == PAGE_CACHE_SIZE) { + /* When writing out the end of a page, write out the + _whole_ page. This helps to reduce the number of + nodes in files which have many short writes, like + syslog files. */ + aligned_start = 0; + } + + ri = jffs2_alloc_raw_inode(); + + if (!ri) { + D1(printk(KERN_DEBUG "jffs2_write_end(): Allocation of raw inode failed\n")); + unlock_page(pg); + page_cache_release(pg); + return -ENOMEM; + } + + /* Set the fields that the generic jffs2_write_inode_range() code can't find */ + ri->ino = cpu_to_je32(inode->i_ino); + ri->mode = cpu_to_jemode(inode->i_mode); + ri->uid = cpu_to_je16(inode->i_uid); + ri->gid = cpu_to_je16(inode->i_gid); + ri->isize = cpu_to_je32((uint32_t)inode->i_size); + ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds()); + + /* In 2.4, it was already kmapped by generic_file_write(). Doesn't + hurt to do it again. The alternative is ifdefs, which are ugly. */ + kmap(pg); + + ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, + (pg->index << PAGE_CACHE_SHIFT) + aligned_start, + end - aligned_start, &writtenlen); + + kunmap(pg); + + if (ret) { + /* There was an error writing. */ + SetPageError(pg); + } + + /* Adjust writtenlen for the padding we did, so we don't confuse our caller */ + writtenlen -= min(writtenlen, (start - aligned_start)); + + if (writtenlen) { + if (inode->i_size < pos + writtenlen) { + inode->i_size = pos + writtenlen; + inode->i_blocks = (inode->i_size + 511) >> 9; + + inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + } + } + + jffs2_free_raw_inode(ri); + + if (start+writtenlen < end) { + /* generic_file_write has written more to the page cache than we've + actually written to the medium. Mark the page !Uptodate so that + it gets reread */ + D1(printk(KERN_DEBUG "jffs2_write_end(): Not all bytes written. Marking page !uptodate\n")); + SetPageError(pg); + ClearPageUptodate(pg); + } + + D1(printk(KERN_DEBUG "jffs2_write_end() returning %d\n", + writtenlen > 0 ? writtenlen : ret)); + unlock_page(pg); + page_cache_release(pg); + return writtenlen > 0 ? 
writtenlen : ret; +} diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c new file mode 100644 index 00000000..fed08c1d --- /dev/null +++ b/fs/jffs2/fs.c @@ -0,0 +1,748 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_flash_setup(struct jffs2_sb_info *c); + +int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) +{ + struct jffs2_full_dnode *old_metadata, *new_metadata; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode *ri; + union jffs2_device_node dev; + unsigned char *mdata = NULL; + int mdatalen = 0; + unsigned int ivalid; + uint32_t alloclen; + int ret; + int alloc_type = ALLOC_NORMAL; + + D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino)); + + /* Special cases - we don't want more than one data node + for these types on the medium at any time. So setattr + must read the original data associated with the node + (i.e. the device numbers or the target name) and write + it out again with the appropriate data attached */ + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { + /* For these, we don't actually need to read the old node */ + mdatalen = jffs2_encode_dev(&dev, inode->i_rdev); + mdata = (char *)&dev; + D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen)); + } else if (S_ISLNK(inode->i_mode)) { + mutex_lock(&f->sem); + mdatalen = f->metadata->size; + mdata = kmalloc(f->metadata->size, GFP_USER); + if (!mdata) { + mutex_unlock(&f->sem); + return -ENOMEM; + } + ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen); + if (ret) { + mutex_unlock(&f->sem); + kfree(mdata); + return ret; + } + mutex_unlock(&f->sem); + D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen)); + } + + ri = jffs2_alloc_raw_inode(); + if (!ri) { + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + return -ENOMEM; + } + + ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + jffs2_free_raw_inode(ri); + if (S_ISLNK(inode->i_mode & S_IFMT)) + kfree(mdata); + return ret; + } + mutex_lock(&f->sem); + ivalid = iattr->ia_valid; + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(sizeof(*ri) + mdatalen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->ino = cpu_to_je32(inode->i_ino); + ri->version = cpu_to_je32(++f->highest_version); + + ri->uid = cpu_to_je16((ivalid & ATTR_UID)?iattr->ia_uid:inode->i_uid); + ri->gid = cpu_to_je16((ivalid & ATTR_GID)?iattr->ia_gid:inode->i_gid); + + if (ivalid & ATTR_MODE) + ri->mode = cpu_to_jemode(iattr->ia_mode); + else + ri->mode = cpu_to_jemode(inode->i_mode); + + + ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); + ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); + ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); + ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime)); + + ri->offset = cpu_to_je32(0); + ri->csize = ri->dsize = 
cpu_to_je32(mdatalen); + ri->compr = JFFS2_COMPR_NONE; + if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { + /* It's an extension. Make it a hole node */ + ri->compr = JFFS2_COMPR_ZERO; + ri->dsize = cpu_to_je32(iattr->ia_size - inode->i_size); + ri->offset = cpu_to_je32(inode->i_size); + } else if (ivalid & ATTR_SIZE && !iattr->ia_size) { + /* For truncate-to-zero, treat it as deletion because + it'll always be obsoleting all previous nodes */ + alloc_type = ALLOC_DELETION; + } + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + if (mdatalen) + ri->data_crc = cpu_to_je32(crc32(0, mdata, mdatalen)); + else + ri->data_crc = cpu_to_je32(0); + + new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, alloc_type); + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + + if (IS_ERR(new_metadata)) { + jffs2_complete_reservation(c); + jffs2_free_raw_inode(ri); + mutex_unlock(&f->sem); + return PTR_ERR(new_metadata); + } + /* It worked. Update the inode */ + inode->i_atime = ITIME(je32_to_cpu(ri->atime)); + inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode->i_mode = jemode_to_cpu(ri->mode); + inode->i_uid = je16_to_cpu(ri->uid); + inode->i_gid = je16_to_cpu(ri->gid); + + + old_metadata = f->metadata; + + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) + jffs2_truncate_fragtree (c, &f->fragtree, iattr->ia_size); + + if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { + jffs2_add_full_dnode_to_inode(c, f, new_metadata); + inode->i_size = iattr->ia_size; + inode->i_blocks = (inode->i_size + 511) >> 9; + f->metadata = NULL; + } else { + f->metadata = new_metadata; + } + if (old_metadata) { + jffs2_mark_node_obsolete(c, old_metadata->raw); + jffs2_free_full_dnode(old_metadata); + } + jffs2_free_raw_inode(ri); + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + /* We have to do the truncate_setsize() without f->sem held, since + some pages may be locked and waiting for it in readpage(). + We are protected from a simultaneous write() extending i_size + back past iattr->ia_size, because do_truncate() holds the + generic inode semaphore. */ + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { + truncate_setsize(inode, iattr->ia_size); + inode->i_blocks = (inode->i_size + 511) >> 9; + } + + return 0; +} + +int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + int rc; + + rc = inode_change_ok(dentry->d_inode, iattr); + if (rc) + return rc; + + rc = jffs2_do_setattr(dentry->d_inode, iattr); + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = jffs2_acl_chmod(dentry->d_inode); + + return rc; +} + +int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb); + unsigned long avail; + + buf->f_type = JFFS2_SUPER_MAGIC; + buf->f_bsize = 1 << PAGE_SHIFT; + buf->f_blocks = c->flash_size >> PAGE_SHIFT; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; + + spin_lock(&c->erase_completion_lock); + avail = c->dirty_size + c->free_size; + if (avail > c->sector_size * c->resv_blocks_write) + avail -= c->sector_size * c->resv_blocks_write; + else + avail = 0; + spin_unlock(&c->erase_completion_lock); + + buf->f_bavail = buf->f_bfree = avail >> PAGE_SHIFT; + + return 0; +} + + +void jffs2_evict_inode (struct inode *inode) +{ + /* We can forget about this inode for now - drop all + * the nodelists associated with it, etc. 
+ */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + jffs2_do_clear_inode(c, f); +} + +struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) +{ + struct jffs2_inode_info *f; + struct jffs2_sb_info *c; + struct jffs2_raw_inode latest_node; + union jffs2_device_node jdev; + struct inode *inode; + dev_t rdev = 0; + int ret; + + D1(printk(KERN_DEBUG "jffs2_iget(): ino == %lu\n", ino)); + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + f = JFFS2_INODE_INFO(inode); + c = JFFS2_SB_INFO(inode->i_sb); + + jffs2_init_inode_info(f); + mutex_lock(&f->sem); + + ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); + + if (ret) { + mutex_unlock(&f->sem); + iget_failed(inode); + return ERR_PTR(ret); + } + inode->i_mode = jemode_to_cpu(latest_node.mode); + inode->i_uid = je16_to_cpu(latest_node.uid); + inode->i_gid = je16_to_cpu(latest_node.gid); + inode->i_size = je32_to_cpu(latest_node.isize); + inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); + inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); + inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + + inode->i_nlink = f->inocache->pino_nlink; + + inode->i_blocks = (inode->i_size + 511) >> 9; + + switch (inode->i_mode & S_IFMT) { + + case S_IFLNK: + inode->i_op = &jffs2_symlink_inode_operations; + break; + + case S_IFDIR: + { + struct jffs2_full_dirent *fd; + inode->i_nlink = 2; /* parent and '.' */ + + for (fd=f->dents; fd; fd = fd->next) { + if (fd->type == DT_DIR && fd->ino) + inc_nlink(inode); + } + /* Root dir gets i_nlink 3 for some reason */ + if (inode->i_ino == 1) + inc_nlink(inode); + + inode->i_op = &jffs2_dir_inode_operations; + inode->i_fop = &jffs2_dir_operations; + break; + } + case S_IFREG: + inode->i_op = &jffs2_file_inode_operations; + inode->i_fop = &jffs2_file_operations; + inode->i_mapping->a_ops = &jffs2_file_address_operations; + inode->i_mapping->nrpages = 0; + break; + + case S_IFBLK: + case S_IFCHR: + /* Read the device numbers from the media */ + if (f->metadata->size != sizeof(jdev.old_id) && + f->metadata->size != sizeof(jdev.new_id)) { + printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); + goto error_io; + } + D1(printk(KERN_DEBUG "Reading device numbers from flash\n")); + ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size); + if (ret < 0) { + /* Eep */ + printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); + goto error; + } + if (f->metadata->size == sizeof(jdev.old_id)) + rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); + else + rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); + + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &jffs2_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + + default: + printk(KERN_WARNING "jffs2_read_inode(): Bogus imode %o for ino %lu\n", inode->i_mode, (unsigned long)inode->i_ino); + } + + mutex_unlock(&f->sem); + + D1(printk(KERN_DEBUG "jffs2_read_inode() returning\n")); + unlock_new_inode(inode); + return inode; + +error_io: + ret = -EIO; +error: + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + iget_failed(inode); + return ERR_PTR(ret); +} + +void jffs2_dirty_inode(struct inode *inode, int flags) +{ + struct iattr 
iattr; + + if (!(inode->i_state & I_DIRTY_DATASYNC)) { + D2(printk(KERN_DEBUG "jffs2_dirty_inode() not calling setattr() for ino #%lu\n", inode->i_ino)); + return; + } + + D1(printk(KERN_DEBUG "jffs2_dirty_inode() calling setattr() for ino #%lu\n", inode->i_ino)); + + iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME; + iattr.ia_mode = inode->i_mode; + iattr.ia_uid = inode->i_uid; + iattr.ia_gid = inode->i_gid; + iattr.ia_atime = inode->i_atime; + iattr.ia_mtime = inode->i_mtime; + iattr.ia_ctime = inode->i_ctime; + + jffs2_do_setattr(inode, &iattr); +} + +int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + if (c->flags & JFFS2_SB_FLAG_RO && !(sb->s_flags & MS_RDONLY)) + return -EROFS; + + /* We stop if it was running, then restart if it needs to. + This also catches the case where it was stopped and this + is just a remount to restart it. + Flush the writebuffer, if necessary, else we lose it */ + if (!(sb->s_flags & MS_RDONLY)) { + jffs2_stop_garbage_collect_thread(c); + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + } + + if (!(*flags & MS_RDONLY)) + jffs2_start_garbage_collect_thread(c); + + *flags |= MS_NOATIME; + return 0; +} + +/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, + fill in the raw_inode while you're at it. */ +struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) +{ + struct inode *inode; + struct super_block *sb = dir_i->i_sb; + struct jffs2_sb_info *c; + struct jffs2_inode_info *f; + int ret; + + D1(printk(KERN_DEBUG "jffs2_new_inode(): dir_i %ld, mode 0x%x\n", dir_i->i_ino, mode)); + + c = JFFS2_SB_INFO(sb); + + inode = new_inode(sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + f = JFFS2_INODE_INFO(inode); + jffs2_init_inode_info(f); + mutex_lock(&f->sem); + + memset(ri, 0, sizeof(*ri)); + /* Set OS-specific defaults for new inodes */ + ri->uid = cpu_to_je16(current_fsuid()); + + if (dir_i->i_mode & S_ISGID) { + ri->gid = cpu_to_je16(dir_i->i_gid); + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + ri->gid = cpu_to_je16(current_fsgid()); + } + + /* POSIX ACLs have to be processed now, at least partly. + The umask is only applied if there's no default ACL */ + ret = jffs2_init_acl_pre(dir_i, inode, &mode); + if (ret) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); + } + ret = jffs2_do_new_inode (c, f, mode, ri); + if (ret) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); + } + inode->i_nlink = 1; + inode->i_ino = je32_to_cpu(ri->ino); + inode->i_mode = jemode_to_cpu(ri->mode); + inode->i_gid = je16_to_cpu(ri->gid); + inode->i_uid = je16_to_cpu(ri->uid); + inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); + + inode->i_blocks = 0; + inode->i_size = 0; + + if (insert_inode_locked(inode) < 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + + return inode; +} + +static int calculate_inocache_hashsize(uint32_t flash_size) +{ + /* + * Pick an inocache hash size based on the size of the medium. + * Count how many megabytes we're dealing with, apply a hashsize twice + * that size, but rounding down to the usual big powers of 2. And keep + * to sensible bounds.
+ */ + + int size_mb = flash_size / 1024 / 1024; + int hashsize = (size_mb * 2) & ~0x3f; + + if (hashsize < INOCACHE_HASHSIZE_MIN) + return INOCACHE_HASHSIZE_MIN; + if (hashsize > INOCACHE_HASHSIZE_MAX) + return INOCACHE_HASHSIZE_MAX; + + return hashsize; +} + +int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jffs2_sb_info *c; + struct inode *root_i; + int ret; + size_t blocks; + + c = JFFS2_SB_INFO(sb); + +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER + if (c->mtd->type == MTD_NANDFLASH) { + printk(KERN_ERR "jffs2: Cannot operate on NAND flash unless jffs2 NAND support is compiled in.\n"); + return -EINVAL; + } + if (c->mtd->type == MTD_DATAFLASH) { + printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n"); + return -EINVAL; + } +#endif + + c->flash_size = c->mtd->size; + c->sector_size = c->mtd->erasesize; + blocks = c->flash_size / c->sector_size; + + /* + * Size alignment check + */ + if ((c->sector_size * blocks) != c->flash_size) { + c->flash_size = c->sector_size * blocks; + printk(KERN_INFO "jffs2: Flash size not aligned to erasesize, reducing to %dKiB\n", + c->flash_size / 1024); + } + + if (c->flash_size < 5*c->sector_size) { + printk(KERN_ERR "jffs2: Too few erase blocks (%d)\n", c->flash_size / c->sector_size); + return -EINVAL; + } + + c->cleanmarker_size = sizeof(struct jffs2_unknown_node); + + /* NAND (or other bizarre) flash... do setup accordingly */ + ret = jffs2_flash_setup(c); + if (ret) + return ret; + + c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size); + c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); + if (!c->inocache_list) { + ret = -ENOMEM; + goto out_wbuf; + } + + jffs2_init_xattr_subsystem(c); + + if ((ret = jffs2_do_mount_fs(c))) + goto out_inohash; + + D1(printk(KERN_DEBUG "jffs2_do_fill_super(): Getting root inode\n")); + root_i = jffs2_iget(sb, 1); + if (IS_ERR(root_i)) { + D1(printk(KERN_WARNING "get root inode failed\n")); + ret = PTR_ERR(root_i); + goto out_root; + } + + ret = -ENOMEM; + + D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n")); + sb->s_root = d_alloc_root(root_i); + if (!sb->s_root) + goto out_root_i; + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = JFFS2_SUPER_MAGIC; + if (!(sb->s_flags & MS_RDONLY)) + jffs2_start_garbage_collect_thread(c); + return 0; + + out_root_i: + iput(root_i); +out_root: + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else + kfree(c->blocks); + out_inohash: + jffs2_clear_xattr_subsystem(c); + kfree(c->inocache_list); + out_wbuf: + jffs2_flash_cleanup(c); + + return ret; +} + +void jffs2_gc_release_inode(struct jffs2_sb_info *c, + struct jffs2_inode_info *f) +{ + iput(OFNI_EDONI_2SFFJ(f)); +} + +struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, + int inum, int unlinked) +{ + struct inode *inode; + struct jffs2_inode_cache *ic; + + if (unlinked) { + /* The inode has zero nlink but its nodes weren't yet marked + obsolete. This has to be because we're still waiting for + the final (close() and) iput() to happen. + + There's a possibility that the final iput() could have + happened while we were contemplating. In order to ensure + that we don't cause a new read_inode() (which would fail) + for the inode in question, we use ilookup() in this case + instead of iget(). 
+ + The nlink can't _become_ zero at this point because we're + holding the alloc_sem, and jffs2_do_unlink() would also + need that while decrementing nlink on any inode. + */ + inode = ilookup(OFNI_BS_2SFFJ(c), inum); + if (!inode) { + D1(printk(KERN_DEBUG "ilookup() failed for ino #%u; inode is probably deleted.\n", + inum)); + + spin_lock(&c->inocache_lock); + ic = jffs2_get_ino_cache(c, inum); + if (!ic) { + D1(printk(KERN_DEBUG "Inode cache for ino #%u is gone.\n", inum)); + spin_unlock(&c->inocache_lock); + return NULL; + } + if (ic->state != INO_STATE_CHECKEDABSENT) { + /* Wait for progress. Don't just loop */ + D1(printk(KERN_DEBUG "Waiting for ino #%u in state %d\n", + ic->ino, ic->state)); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + } else { + spin_unlock(&c->inocache_lock); + } + + return NULL; + } + } else { + /* Inode has links to it still; they're not going away because + jffs2_do_unlink() would need the alloc_sem and we have it. + Just iget() it, and if read_inode() is necessary that's OK. + */ + inode = jffs2_iget(OFNI_BS_2SFFJ(c), inum); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + if (is_bad_inode(inode)) { + printk(KERN_NOTICE "Eep. read_inode() failed for ino #%u. unlinked %d\n", + inum, unlinked); + /* NB. This will happen again. We need to do something appropriate here. */ + iput(inode); + return ERR_PTR(-EIO); + } + + return JFFS2_INODE_INFO(inode); +} + +unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + unsigned long offset, + unsigned long *priv) +{ + struct inode *inode = OFNI_EDONI_2SFFJ(f); + struct page *pg; + + pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + (void *)jffs2_do_readpage_unlock, inode); + if (IS_ERR(pg)) + return (void *)pg; + + *priv = (unsigned long)pg; + return kmap(pg); +} + +void jffs2_gc_release_page(struct jffs2_sb_info *c, + unsigned char *ptr, + unsigned long *priv) +{ + struct page *pg = (void *)*priv; + + kunmap(pg); + page_cache_release(pg); +} + +static int jffs2_flash_setup(struct jffs2_sb_info *c) { + int ret = 0; + + if (c->mtd->type == MTD_NANDFLASH) { + if (!(c->mtd->flags & MTD_OOB_WRITEABLE)) + printk(KERN_INFO "JFFS2 doesn't use OOB.\n"); + /* NAND flash... do setup accordingly */ + ret = jffs2_nand_flash_setup(c); + if (ret) + return ret; + } + + /* and Dataflash */ + if (jffs2_dataflash(c)) { + ret = jffs2_dataflash_setup(c); + if (ret) + return ret; + } + + /* and Intel "Sibley" flash */ + if (jffs2_nor_wbuf_flash(c)) { + ret = jffs2_nor_wbuf_flash_setup(c); + if (ret) + return ret; + } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + ret = jffs2_ubivol_setup(c); + if (ret) + return ret; + } + + return ret; +} + +void jffs2_flash_cleanup(struct jffs2_sb_info *c) { + + if (c->mtd->type == MTD_NANDFLASH) { + jffs2_nand_flash_cleanup(c); + } + + /* and DataFlash */ + if (jffs2_dataflash(c)) { + jffs2_dataflash_cleanup(c); + } + + /* and Intel "Sibley" flash */ + if (jffs2_nor_wbuf_flash(c)) { + jffs2_nor_wbuf_flash_cleanup(c); + } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + jffs2_ubivol_cleanup(c); + } +} diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c new file mode 100644 index 00000000..4bbd5211 --- /dev/null +++ b/fs/jffs2/gc.c @@ -0,0 +1,1326 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + +static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_raw_node_ref *raw); +static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fd); +static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd); +static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd); +static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end); +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end); +static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f); + +/* Called with erase_completion_lock held */ +static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *ret; + struct list_head *nextlist = NULL; + int n = jiffies % 128; + + /* Pick an eraseblock to garbage collect next. This is where we'll + put the clever wear-levelling algorithms. Eventually. */ + /* We possibly want to favour the dirtier blocks more when the + number of free blocks is low. */ +again: + if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) { + D1(printk(KERN_DEBUG "Picking block from bad_used_list to GC next\n")); + nextlist = &c->bad_used_list; + } else if (n < 50 && !list_empty(&c->erasable_list)) { + /* Note that most of them will have gone directly to be erased. + So don't favour the erasable_list _too_ much. 
*/ + D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next\n")); + nextlist = &c->erasable_list; + } else if (n < 110 && !list_empty(&c->very_dirty_list)) { + /* Most of the time, pick one off the very_dirty list */ + D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next\n")); + nextlist = &c->very_dirty_list; + } else if (n < 126 && !list_empty(&c->dirty_list)) { + D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next\n")); + nextlist = &c->dirty_list; + } else if (!list_empty(&c->clean_list)) { + D1(printk(KERN_DEBUG "Picking block from clean_list to GC next\n")); + nextlist = &c->clean_list; + } else if (!list_empty(&c->dirty_list)) { + D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next (clean_list was empty)\n")); + + nextlist = &c->dirty_list; + } else if (!list_empty(&c->very_dirty_list)) { + D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n")); + nextlist = &c->very_dirty_list; + } else if (!list_empty(&c->erasable_list)) { + D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n")); + + nextlist = &c->erasable_list; + } else if (!list_empty(&c->erasable_pending_wbuf_list)) { + /* There are blocks waiting for the wbuf sync */ + D1(printk(KERN_DEBUG "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n")); + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + goto again; + } else { + /* Eep. All were empty */ + D1(printk(KERN_NOTICE "jffs2: No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n")); + return NULL; + } + + ret = list_entry(nextlist->next, struct jffs2_eraseblock, list); + list_del(&ret->list); + c->gcblock = ret; + ret->gc_node = ret->first_node; + if (!ret->gc_node) { + printk(KERN_WARNING "Eep. ret->gc_node for block at 0x%08x is NULL\n", ret->offset); + BUG(); + } + + /* Have we accidentally picked a clean block with wasted space ? */ + if (ret->wasted_size) { + D1(printk(KERN_DEBUG "Converting wasted_size %08x to dirty_size\n", ret->wasted_size)); + ret->dirty_size += ret->wasted_size; + c->wasted_size -= ret->wasted_size; + c->dirty_size += ret->wasted_size; + ret->wasted_size = 0; + } + + return ret; +} + +/* jffs2_garbage_collect_pass + * Make a single attempt to progress GC. Move one node, and possibly + * start erasing one eraseblock. + */ +int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) +{ + struct jffs2_inode_info *f; + struct jffs2_inode_cache *ic; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t gcblock_dirty; + int ret = 0, inum, nlink; + int xattr = 0; + + if (mutex_lock_interruptible(&c->alloc_sem)) + return -EINTR; + + for (;;) { + spin_lock(&c->erase_completion_lock); + if (!c->unchecked_size) + break; + + /* We can't start doing GC yet. We haven't finished checking + the node CRCs etc. Do it now.
*/ + + /* checked_ino is protected by the alloc_sem */ + if (c->checked_ino > c->highest_ino && xattr) { + printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n", + c->unchecked_size); + jffs2_dbg_dump_block_lists_nolock(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + spin_unlock(&c->erase_completion_lock); + + if (!xattr) + xattr = jffs2_verify_xattr(c); + + spin_lock(&c->inocache_lock); + + ic = jffs2_get_ino_cache(c, c->checked_ino++); + + if (!ic) { + spin_unlock(&c->inocache_lock); + continue; + } + + if (!ic->pino_nlink) { + D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink/pino zero\n", + ic->ino)); + spin_unlock(&c->inocache_lock); + jffs2_xattr_delete_inode(c, ic); + continue; + } + switch(ic->state) { + case INO_STATE_CHECKEDABSENT: + case INO_STATE_PRESENT: + D1(printk(KERN_DEBUG "Skipping ino #%u already checked\n", ic->ino)); + spin_unlock(&c->inocache_lock); + continue; + + case INO_STATE_GC: + case INO_STATE_CHECKING: + printk(KERN_WARNING "Inode #%u is in state %d during CRC check phase!\n", ic->ino, ic->state); + spin_unlock(&c->inocache_lock); + BUG(); + + case INO_STATE_READING: + /* We need to wait for it to finish, lest we move on + and trigger the BUG() above while we haven't yet + finished checking all its nodes */ + D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino)); + /* We need to come back again for the _same_ inode. We've + made no progress in this case, but that should be OK */ + c->checked_ino--; + + mutex_unlock(&c->alloc_sem); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + return 0; + + default: + BUG(); + + case INO_STATE_UNCHECKED: + ; + } + ic->state = INO_STATE_CHECKING; + spin_unlock(&c->inocache_lock); + + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() triggering inode scan of ino#%u\n", ic->ino)); + + ret = jffs2_do_crccheck_inode(c, ic); + if (ret) + printk(KERN_WARNING "Returned error for crccheck of ino #%u. Expect badness...\n", ic->ino); + + jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT); + mutex_unlock(&c->alloc_sem); + return ret; + } + + /* If there are any blocks which need erasing, erase them now */ + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); + if (jffs2_erase_pending_blocks(c, 1)) + return 0; + + D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); + mutex_lock(&c->alloc_sem); + spin_lock(&c->erase_completion_lock); + } + + /* First, work out which block we're garbage-collecting */ + jeb = c->gcblock; + + if (!jeb) + jeb = jffs2_find_gc_block(c); + + if (!jeb) { + /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? 
*/ + if (c->nr_erasing_blocks) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EAGAIN; + } + D1(printk(KERN_NOTICE "jffs2: Couldn't find erase block to garbage collect!\n")); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EIO; + } + + D1(printk(KERN_DEBUG "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n", jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size)); + D1(if (c->nextblock) + printk(KERN_DEBUG "Nextblock at %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size)); + + if (!jeb->used_size) { + mutex_unlock(&c->alloc_sem); + goto eraseit; + } + + raw = jeb->gc_node; + gcblock_dirty = jeb->dirty_size; + + while(ref_obsolete(raw)) { + D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw))); + raw = ref_next(raw); + if (unlikely(!raw)) { + printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n"); + printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size); + jeb->gc_node = raw; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + BUG(); + } + } + jeb->gc_node = raw; + + D1(printk(KERN_DEBUG "Going to garbage collect node at 0x%08x\n", ref_offset(raw))); + + if (!raw->next_in_ino) { + /* Inode-less node. Clean marker, snapshot or something like that */ + spin_unlock(&c->erase_completion_lock); + if (ref_flags(raw) == REF_PRISTINE) { + /* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */ + jffs2_garbage_collect_pristine(c, NULL, raw); + } else { + /* Just mark it obsolete */ + jffs2_mark_node_obsolete(c, raw); + } + mutex_unlock(&c->alloc_sem); + goto eraseit_lock; + } + + ic = jffs2_raw_ref_to_ic(raw); + +#ifdef CONFIG_JFFS2_FS_XATTR + /* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr. + * We can decide whether this node is inode or xattr by ic->class. */ + if (ic->class == RAWNODE_CLASS_XATTR_DATUM + || ic->class == RAWNODE_CLASS_XATTR_REF) { + spin_unlock(&c->erase_completion_lock); + + if (ic->class == RAWNODE_CLASS_XATTR_DATUM) { + ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw); + } else { + ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw); + } + goto test_gcnode; + } +#endif + + /* We need to hold the inocache. Either the erase_completion_lock or + the inocache_lock are sufficient; we trade down since the inocache_lock + causes less contention. */ + spin_lock(&c->inocache_lock); + + spin_unlock(&c->erase_completion_lock); + + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n", jeb->offset, ref_offset(raw), ref_flags(raw), ic->ino)); + + /* Three possibilities: + 1. Inode is already in-core. We must iget it and do proper + updating to its fragtree, etc. + 2. Inode is not in-core, node is REF_PRISTINE. We lock the + inocache to prevent a read_inode(), copy the node intact. + 3. Inode is not in-core, node is not pristine. We must iget() + and take the slow path. + */ + + switch(ic->state) { + case INO_STATE_CHECKEDABSENT: + /* It's been checked, but it's not currently in-core. 
+ We can just copy any pristine nodes, but have + to prevent anyone else from doing read_inode() while + we're at it, so we set the state accordingly */ + if (ref_flags(raw) == REF_PRISTINE) + ic->state = INO_STATE_GC; + else { + D1(printk(KERN_DEBUG "Ino #%u is absent but node not REF_PRISTINE. Reading.\n", + ic->ino)); + } + break; + + case INO_STATE_PRESENT: + /* It's in-core. GC must iget() it. */ + break; + + case INO_STATE_UNCHECKED: + case INO_STATE_CHECKING: + case INO_STATE_GC: + /* Should never happen. We should have finished checking + by the time we actually start doing any GC, and since + we're holding the alloc_sem, no other garbage collection + can happen. + */ + printk(KERN_CRIT "Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n", + ic->ino, ic->state); + mutex_unlock(&c->alloc_sem); + spin_unlock(&c->inocache_lock); + BUG(); + + case INO_STATE_READING: + /* Someone's currently trying to read it. We must wait for + them to finish and then go through the full iget() route + to do the GC. However, sometimes read_inode() needs to get + the alloc_sem() (for marking nodes invalid) so we must + drop the alloc_sem before sleeping. */ + + mutex_unlock(&c->alloc_sem); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() waiting for ino #%u in state %d\n", + ic->ino, ic->state)); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + /* And because we dropped the alloc_sem we must start again from the + beginning. Ponder chance of livelock here -- we're returning success + without actually making any progress. + + Q: What are the chances that the inode is back in INO_STATE_READING + again by the time we next enter this function? And that this happens + enough times to cause a real delay? + + A: Small enough that I don't care :) + */ + return 0; + } + + /* OK. Now if the inode is in state INO_STATE_GC, we are going to copy the + node intact, and we don't have to muck about with the fragtree etc. + because we know it's not in-core. If it _was_ in-core, we go through + all the iget() crap anyway */ + + if (ic->state == INO_STATE_GC) { + spin_unlock(&c->inocache_lock); + + ret = jffs2_garbage_collect_pristine(c, ic, raw); + + spin_lock(&c->inocache_lock); + ic->state = INO_STATE_CHECKEDABSENT; + wake_up(&c->inocache_wq); + + if (ret != -EBADFD) { + spin_unlock(&c->inocache_lock); + goto test_gcnode; + } + + /* Fall through if it wanted us to, with inocache_lock held */ + } + + /* Prevent the fairly unlikely race where the gcblock is + entirely obsoleted by the final close of a file which had + the only valid nodes in the block, followed by erasure, + followed by freeing of the ic because the erased block(s) + held _all_ the nodes of that inode.... never been seen but + it's vaguely possible. */ + + inum = ic->ino; + nlink = ic->pino_nlink; + spin_unlock(&c->inocache_lock); + + f = jffs2_gc_fetch_inode(c, inum, !nlink); + if (IS_ERR(f)) { + ret = PTR_ERR(f); + goto release_sem; + } + if (!f) { + ret = 0; + goto release_sem; + } + + ret = jffs2_garbage_collect_live(c, jeb, raw, f); + + jffs2_gc_release_inode(c, f); + + test_gcnode: + if (jeb->dirty_size == gcblock_dirty && !ref_obsolete(jeb->gc_node)) { + /* Eep. This really should never happen. 
GC is broken */ + printk(KERN_ERR "Error garbage collecting node at %08x!\n", ref_offset(jeb->gc_node)); + ret = -ENOSPC; + } + release_sem: + mutex_unlock(&c->alloc_sem); + + eraseit_lock: + /* If we've finished this block, start it erasing */ + spin_lock(&c->erase_completion_lock); + + eraseit: + if (c->gcblock && !c->gcblock->used_size) { + D1(printk(KERN_DEBUG "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n", c->gcblock->offset)); + /* We're GC'ing an empty block? */ + list_add_tail(&c->gcblock->list, &c->erase_pending_list); + c->gcblock = NULL; + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + spin_unlock(&c->erase_completion_lock); + + return ret; +} + +static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *frag; + struct jffs2_full_dnode *fn = NULL; + struct jffs2_full_dirent *fd; + uint32_t start = 0, end = 0, nrfrags = 0; + int ret = 0; + + mutex_lock(&f->sem); + + /* Now we have the lock for this inode. Check that it's still the one at the head + of the list. */ + + spin_lock(&c->erase_completion_lock); + + if (c->gcblock != jeb) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "GC block is no longer gcblock. Restart\n")); + goto upnout; + } + if (ref_obsolete(raw)) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "node to be GC'd was obsoleted in the meantime.\n")); + /* They'll call again */ + goto upnout; + } + spin_unlock(&c->erase_completion_lock); + + /* OK. Looks safe. And nobody can get us now because we have the semaphore. Move the block */ + if (f->metadata && f->metadata->raw == raw) { + fn = f->metadata; + ret = jffs2_garbage_collect_metadata(c, jeb, f, fn); + goto upnout; + } + + /* FIXME. Read node and do lookup? */ + for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) { + if (frag->node && frag->node->raw == raw) { + fn = frag->node; + end = frag->ofs + frag->size; + if (!nrfrags++) + start = frag->ofs; + if (nrfrags == frag->node->frags) + break; /* We've found them all */ + } + } + if (fn) { + if (ref_flags(raw) == REF_PRISTINE) { + ret = jffs2_garbage_collect_pristine(c, f->inocache, raw); + if (!ret) { + /* Urgh. Return it sensibly. */ + frag->node->raw = f->inocache->nodes; + } + if (ret != -EBADFD) + goto upnout; + } + /* We found a datanode. Do the GC */ + if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) { + /* It crosses a page boundary. Therefore, it must be a hole. */ + ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end); + } else { + /* It could still be a hole. But we GC the page this way anyway */ + ret = jffs2_garbage_collect_dnode(c, jeb, f, fn, start, end); + } + goto upnout; + } + + /* Wasn't a dnode. 
Try dirent */ + for (fd = f->dents; fd; fd=fd->next) { + if (fd->raw == raw) + break; + } + + if (fd && fd->ino) { + ret = jffs2_garbage_collect_dirent(c, jeb, f, fd); + } else if (fd) { + ret = jffs2_garbage_collect_deletion_dirent(c, jeb, f, fd); + } else { + printk(KERN_WARNING "Raw node at 0x%08x wasn't in node lists for ino #%u\n", + ref_offset(raw), f->inocache->ino); + if (ref_obsolete(raw)) { + printk(KERN_WARNING "But it's obsolete so we don't mind too much\n"); + } else { + jffs2_dbg_dump_node(c, ref_offset(raw)); + BUG(); + } + } + upnout: + mutex_unlock(&f->sem); + + return ret; +} + +static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_raw_node_ref *raw) +{ + union jffs2_node_union *node; + size_t retlen; + int ret; + uint32_t phys_ofs, alloclen; + uint32_t crc, rawlen; + int retried = 0; + + D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw))); + + alloclen = rawlen = ref_totlen(c, c->gcblock, raw); + + /* Ask for a small amount of space (or the totlen if smaller) because we + don't want to force wastage of the end of a block if splitting would + work. */ + if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN) + alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN; + + ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen); + /* 'rawlen' is not the exact summary size; it is only an upper estimation */ + + if (ret) + return ret; + + if (alloclen < rawlen) { + /* Doesn't fit untouched. We'll go the old route and split it */ + return -EBADFD; + } + + node = kmalloc(rawlen, GFP_KERNEL); + if (!node) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)node); + if (!ret && retlen != rawlen) + ret = -EIO; + if (ret) + goto out_node; + + crc = crc32(0, node, sizeof(struct jffs2_unknown_node)-4); + if (je32_to_cpu(node->u.hdr_crc) != crc) { + printk(KERN_WARNING "Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc); + goto bail; + } + + switch(je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + crc = crc32(0, node, sizeof(node->i)-8); + if (je32_to_cpu(node->i.node_crc) != crc) { + printk(KERN_WARNING "Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->i.node_crc), crc); + goto bail; + } + + if (je32_to_cpu(node->i.dsize)) { + crc = crc32(0, node->i.data, je32_to_cpu(node->i.csize)); + if (je32_to_cpu(node->i.data_crc) != crc) { + printk(KERN_WARNING "Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->i.data_crc), crc); + goto bail; + } + } + break; + + case JFFS2_NODETYPE_DIRENT: + crc = crc32(0, node, sizeof(node->d)-8); + if (je32_to_cpu(node->d.node_crc) != crc) { + printk(KERN_WARNING "Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->d.node_crc), crc); + goto bail; + } + + if (strnlen(node->d.name, node->d.nsize) != node->d.nsize) { + printk(KERN_WARNING "Name in dirent node at 0x%08x contains zeroes\n", ref_offset(raw)); + goto bail; + } + + if (node->d.nsize) { + crc = crc32(0, node->d.name, node->d.nsize); + if (je32_to_cpu(node->d.name_crc) != crc) { + printk(KERN_WARNING "Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), 
je32_to_cpu(node->d.name_crc), crc); + goto bail; + } + } + break; + default: + /* If it's inode-less, we don't _know_ what it is. Just copy it intact */ + if (ic) { + printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n", + ref_offset(raw), je16_to_cpu(node->u.nodetype)); + goto bail; + } + } + + /* OK, all the CRCs are good; this node can just be copied as-is. */ + retry: + phys_ofs = write_ofs(c); + + ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node); + + if (ret || (retlen != rawlen)) { + printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n", + rawlen, phys_ofs, ret, retlen); + if (retlen) { + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL); + } else { + printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs); + } + if (!retried) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size]; + + retried = 1; + + D1(printk(KERN_DEBUG "Retrying failed write of REF_PRISTINE node.\n")); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen); + /* this is not the exact summary size of it, + it is only an upper estimation */ + + if (!ret) { + D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", phys_ofs)); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + goto retry; + } + D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + } + + if (!ret) + ret = -EIO; + goto out_node; + } + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic); + + jffs2_mark_node_obsolete(c, raw); + D1(printk(KERN_DEBUG "WHEEE! 
GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw))); + + out_node: + kfree(node); + return ret; + bail: + ret = -EBADFD; + goto out_node; +} + +static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn) +{ + struct jffs2_full_dnode *new_fn; + struct jffs2_raw_inode ri; + struct jffs2_node_frag *last_frag; + union jffs2_device_node dev; + char *mdata = NULL; + int mdatalen = 0; + uint32_t alloclen, ilen; + int ret; + + if (S_ISBLK(JFFS2_F_I_MODE(f)) || + S_ISCHR(JFFS2_F_I_MODE(f)) ) { + /* For these, we don't actually need to read the old node */ + mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f)); + mdata = (char *)&dev; + D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen)); + } else if (S_ISLNK(JFFS2_F_I_MODE(f))) { + mdatalen = fn->size; + mdata = kmalloc(fn->size, GFP_KERNEL); + if (!mdata) { + printk(KERN_WARNING "kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n"); + return -ENOMEM; + } + ret = jffs2_read_dnode(c, f, fn, mdata, 0, mdatalen); + if (ret) { + printk(KERN_WARNING "read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n", ret); + kfree(mdata); + return ret; + } + D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of symlink target\n", mdatalen)); + + } + + ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen, + JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n", + sizeof(ri)+ mdatalen, ret); + goto out; + } + + last_frag = frag_last(&f->fragtree); + if (last_frag) + /* Fetch the inode length from the fragtree rather than + * from i_size since i_size may not have been updated yet */ + ilen = last_frag->ofs + last_frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + + memset(&ri, 0, sizeof(ri)); + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri) + mdatalen); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(ilen); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.offset = cpu_to_je32(0); + ri.csize = cpu_to_je32(mdatalen); + ri.dsize = cpu_to_je32(mdatalen); + ri.compr = JFFS2_COMPR_NONE; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen)); + + new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC); + + if (IS_ERR(new_fn)) { + printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + ret = PTR_ERR(new_fn); + goto out; + } + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + f->metadata = new_fn; + out: + if (S_ISLNK(JFFS2_F_I_MODE(f))) + kfree(mdata); + return ret; +} + +static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent *new_fd; + struct jffs2_raw_dirent rd; + uint32_t alloclen; + int ret; + + rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd.nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + 
rd.nsize = strlen(fd->name); + rd.totlen = cpu_to_je32(sizeof(rd) + rd.nsize); + rd.hdr_crc = cpu_to_je32(crc32(0, &rd, sizeof(struct jffs2_unknown_node)-4)); + + rd.pino = cpu_to_je32(f->inocache->ino); + rd.version = cpu_to_je32(++f->highest_version); + rd.ino = cpu_to_je32(fd->ino); + /* If the times on this inode were set by explicit utime() they can be different, + so refrain from splatting them. */ + if (JFFS2_F_I_MTIME(f) == JFFS2_F_I_CTIME(f)) + rd.mctime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + else + rd.mctime = cpu_to_je32(0); + rd.type = fd->type; + rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8)); + rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize)); + + ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen, + JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize)); + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n", + sizeof(rd)+rd.nsize, ret); + return ret; + } + new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC); + + if (IS_ERR(new_fd)) { + printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd)); + return PTR_ERR(new_fd); + } + jffs2_add_fd_to_list(c, new_fd, &f->dents); + return 0; +} + +static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent **fdp = &f->dents; + int found = 0; + + /* On a medium where we can't actually mark nodes obsolete + permanently, such as NAND flash, we need to work out + whether this deletion dirent is still needed to actively + delete a 'real' dirent with the same name that's still + somewhere else on the flash. */ + if (!jffs2_can_mark_obsolete(c)) { + struct jffs2_raw_dirent *rd; + struct jffs2_raw_node_ref *raw; + int ret; + size_t retlen; + int name_len = strlen(fd->name); + uint32_t name_crc = crc32(0, fd->name, name_len); + uint32_t rawlen = ref_totlen(c, jeb, fd->raw); + + rd = kmalloc(rawlen, GFP_KERNEL); + if (!rd) + return -ENOMEM; + + /* Prevent the erase code from nicking the obsolete node refs while + we're looking at them. I really don't like this extra lock but + can't see any alternative. Suggestions on a postcard to... */ + mutex_lock(&c->erase_free_sem); + + for (raw = f->inocache->nodes; raw != (void *)f->inocache; raw = raw->next_in_ino) { + + cond_resched(); + + /* We only care about obsolete ones */ + if (!(ref_obsolete(raw))) + continue; + + /* Any dirent with the same name is going to have the same length... */ + if (ref_totlen(c, NULL, raw) != rawlen) + continue; + + /* Doesn't matter if there's one in the same erase block. We're going to + delete it too at the same time. */ + if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset)) + continue; + + D1(printk(KERN_DEBUG "Check potential deletion dirent at %08x\n", ref_offset(raw))); + + /* This is an obsolete node belonging to the same directory, and it's of the right + length. We need to take a closer look...*/ + ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)rd); + if (ret) { + printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Read error (%d) reading obsolete node at %08x\n", ret, ref_offset(raw)); + /* If we can't read it, we don't need to continue to obsolete it. 
Continue */ + continue; + } + if (retlen != rawlen) { + printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Short read (%zd not %u) reading header from obsolete node at %08x\n", + retlen, rawlen, ref_offset(raw)); + continue; + } + + if (je16_to_cpu(rd->nodetype) != JFFS2_NODETYPE_DIRENT) + continue; + + /* If the name CRC doesn't match, skip */ + if (je32_to_cpu(rd->name_crc) != name_crc) + continue; + + /* If the name length doesn't match, or it's another deletion dirent, skip */ + if (rd->nsize != name_len || !je32_to_cpu(rd->ino)) + continue; + + /* OK, check the actual name now */ + if (memcmp(rd->name, fd->name, name_len)) + continue; + + /* OK. The name really does match. There really is still an older node on + the flash which our deletion dirent obsoletes. So we have to write out + a new deletion dirent to replace it */ + mutex_unlock(&c->erase_free_sem); + + D1(printk(KERN_DEBUG "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n", + ref_offset(fd->raw), fd->name, ref_offset(raw), je32_to_cpu(rd->ino))); + kfree(rd); + + return jffs2_garbage_collect_dirent(c, jeb, f, fd); + } + + mutex_unlock(&c->erase_free_sem); + kfree(rd); + } + + /* FIXME: If we're deleting a dirent which contains the current mtime and ctime, + we should update the metadata node with those times accordingly */ + + /* No need for it any more. Just mark it obsolete and remove it from the list */ + while (*fdp) { + if ((*fdp) == fd) { + found = 1; + *fdp = fd->next; + break; + } + fdp = &(*fdp)->next; + } + if (!found) { + printk(KERN_WARNING "Deletion dirent \"%s\" not found in list for ino #%u\n", fd->name, f->inocache->ino); + } + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); + return 0; +} + +static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end) +{ + struct jffs2_raw_inode ri; + struct jffs2_node_frag *frag; + struct jffs2_full_dnode *new_fn; + uint32_t alloclen, ilen; + int ret; + + D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end)); + + memset(&ri, 0, sizeof(ri)); + + if(fn->frags > 1) { + size_t readlen; + uint32_t crc; + /* It's partially obsoleted by a later write. So we have to + write it out again with the _same_ version as before */ + ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(ri), &readlen, (char *)&ri); + if (readlen != sizeof(ri) || ret) { + printk(KERN_WARNING "Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. 
Data will be lost by writing new hole node\n", ret, readlen); + goto fill; + } + if (je16_to_cpu(ri.nodetype) != JFFS2_NODETYPE_INODE) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n", + ref_offset(fn->raw), + je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE); + return -EIO; + } + if (je32_to_cpu(ri.totlen) != sizeof(ri)) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n", + ref_offset(fn->raw), + je32_to_cpu(ri.totlen), sizeof(ri)); + return -EIO; + } + crc = crc32(0, &ri, sizeof(ri)-8); + if (crc != je32_to_cpu(ri.node_crc)) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n", + ref_offset(fn->raw), + je32_to_cpu(ri.node_crc), crc); + /* FIXME: We could possibly deal with this by writing new holes for each frag */ + printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); + goto fill; + } + if (ri.compr != JFFS2_COMPR_ZERO) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node 0x%08x wasn't a hole node!\n", ref_offset(fn->raw)); + printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); + goto fill; + } + } else { + fill: + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri)); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.offset = cpu_to_je32(start); + ri.dsize = cpu_to_je32(end - start); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + } + + frag = frag_last(&f->fragtree); + if (frag) + /* Fetch the inode length from the fragtree rather than + * from i_size since i_size may not have been updated yet */ + ilen = frag->ofs + frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(ilen); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.data_crc = cpu_to_je32(0); + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + + ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen, + JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n", + sizeof(ri), ret); + return ret; + } + new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC); + + if (IS_ERR(new_fn)) { + printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn)); + return PTR_ERR(new_fn); + } + if (je32_to_cpu(ri.version) == f->highest_version) { + jffs2_add_full_dnode_to_inode(c, f, new_fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + return 0; + } + + /* + * We should only get here in the case where the node we are + * replacing had more than one frag, so we kept the same version + * number as before. (Except in case of error -- see 'goto fill;' + * above.)
+ */ + D1(if(unlikely(fn->frags <= 1)) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n", + fn->frags, je32_to_cpu(ri.version), f->highest_version, + je32_to_cpu(ri.ino)); + }); + + /* This is a partially-overlapped hole node. Mark it REF_NORMAL not REF_PRISTINE */ + mark_ref_normal(new_fn->raw); + + for (frag = jffs2_lookup_node_frag(&f->fragtree, fn->ofs); + frag; frag = frag_next(frag)) { + if (frag->ofs > fn->size + fn->ofs) + break; + if (frag->node == fn) { + frag->node = new_fn; + new_fn->frags++; + fn->frags--; + } + } + if (fn->frags) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Old node still has frags!\n"); + BUG(); + } + if (!new_fn->frags) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: New node has no frags!\n"); + BUG(); + } + + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + + return 0; +} + +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *orig_jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end) +{ + struct jffs2_full_dnode *new_fn; + struct jffs2_raw_inode ri; + uint32_t alloclen, offset, orig_end, orig_start; + int ret = 0; + unsigned char *comprbuf = NULL, *writebuf; + unsigned long pg; + unsigned char *pg_ptr; + + memset(&ri, 0, sizeof(ri)); + + D1(printk(KERN_DEBUG "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end)); + + orig_end = end; + orig_start = start; + + if (c->nr_free_blocks + c->nr_erasing_blocks > c->resv_blocks_gcmerge) { + /* Attempt to do some merging. But only expand to cover logically + adjacent frags if the block containing them is already considered + to be dirty. Otherwise we end up with GC just going round in + circles dirtying the nodes it already wrote out, especially + on NAND where we have small eraseblocks and hence a much higher + chance of nodes having to be split to cross boundaries. */ + + struct jffs2_node_frag *frag; + uint32_t min, max; + + min = start & ~(PAGE_CACHE_SIZE-1); + max = min + PAGE_CACHE_SIZE; + + frag = jffs2_lookup_node_frag(&f->fragtree, start); + + /* BUG_ON(!frag) but that'll happen anyway... */ + + BUG_ON(frag->ofs != start); + + /* First grow down... */ + while((frag = frag_prev(frag)) && frag->ofs >= min) { + + /* If the previous frag doesn't even reach the beginning, there's + excessive fragmentation. Just merge. */ + if (frag->ofs > min) { + D1(printk(KERN_DEBUG "Expanding down to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size)); + start = frag->ofs; + continue; + } + /* OK. This frag holds the first byte of the page. */ + if (!frag->node || !frag->node->raw) { + D1(printk(KERN_DEBUG "First frag in page is hole (0x%x-0x%x). Not expanding down.\n", + frag->ofs, frag->ofs+frag->size)); + break; + } else { + + /* OK, it's a frag which extends to the beginning of the page. Does it live + in a block which is still considered clean? If so, don't obsolete it. + If not, cover it anyway. 
*/ + + struct jffs2_raw_node_ref *raw = frag->node->raw; + struct jffs2_eraseblock *jeb; + + jeb = &c->blocks[raw->flash_offset / c->sector_size]; + + if (jeb == c->gcblock) { + D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, frag->ofs+frag->size, ref_offset(raw))); + start = frag->ofs; + break; + } + if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { + D1(printk(KERN_DEBUG "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + break; + } + + D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + start = frag->ofs; + break; + } + } + + /* ... then up */ + + /* Find last frag which is actually part of the node we're to GC. */ + frag = jffs2_lookup_node_frag(&f->fragtree, end-1); + + while((frag = frag_next(frag)) && frag->ofs+frag->size <= max) { + + /* If the previous frag doesn't even reach the beginning, there's lots + of fragmentation. Just merge. */ + if (frag->ofs+frag->size < max) { + D1(printk(KERN_DEBUG "Expanding up to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size)); + end = frag->ofs + frag->size; + continue; + } + + if (!frag->node || !frag->node->raw) { + D1(printk(KERN_DEBUG "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n", + frag->ofs, frag->ofs+frag->size)); + break; + } else { + + /* OK, it's a frag which extends to the beginning of the page. Does it live + in a block which is still considered clean? If so, don't obsolete it. + If not, cover it anyway. */ + + struct jffs2_raw_node_ref *raw = frag->node->raw; + struct jffs2_eraseblock *jeb; + + jeb = &c->blocks[raw->flash_offset / c->sector_size]; + + if (jeb == c->gcblock) { + D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, frag->ofs+frag->size, ref_offset(raw))); + end = frag->ofs + frag->size; + break; + } + if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { + D1(printk(KERN_DEBUG "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + break; + } + + D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + end = frag->ofs + frag->size; + break; + } + } + D1(printk(KERN_DEBUG "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n", + orig_start, orig_end, start, end)); + + D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size)); + BUG_ON(end < orig_end); + BUG_ON(start > orig_start); + } + + /* First, use readpage() to read the appropriate page into the page cache */ + /* Q: What happens if we actually try to GC the _same_ page for which commit_write() + * triggered garbage collection in the first place? + * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the + * page OK. We'll actually write it out again in commit_write, which is a little + * suboptimal, but at least we're correct. 
+ */ + pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); + + if (IS_ERR(pg_ptr)) { + printk(KERN_WARNING "read_cache_page() returned error: %ld\n", PTR_ERR(pg_ptr)); + return PTR_ERR(pg_ptr); + } + + offset = start; + while(offset < orig_end) { + uint32_t datalen; + uint32_t cdatalen; + uint16_t comprtype = JFFS2_COMPR_NONE; + + ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN, + &alloclen, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n", + sizeof(ri)+ JFFS2_MIN_DATA_LEN, ret); + break; + } + cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset); + datalen = end - offset; + + writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1)); + + comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen); + + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri) + cdatalen); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(JFFS2_F_I_SIZE(f)); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.offset = cpu_to_je32(offset); + ri.csize = cpu_to_je32(cdatalen); + ri.dsize = cpu_to_je32(datalen); + ri.compr = comprtype & 0xff; + ri.usercompr = (comprtype >> 8) & 0xff; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen)); + + new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC); + + jffs2_free_comprbuf(comprbuf, writebuf); + + if (IS_ERR(new_fn)) { + printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + ret = PTR_ERR(new_fn); + break; + } + ret = jffs2_add_full_dnode_to_inode(c, f, new_fn); + offset += datalen; + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + } + + jffs2_gc_release_page(c, pg_ptr, &pg); + return ret; +} diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c new file mode 100644 index 00000000..859a598a --- /dev/null +++ b/fs/jffs2/ioctl.c @@ -0,0 +1,22 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include "nodelist.h" + +long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which + will include compression support etc. */ + return -ENOTTY; +} + diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h new file mode 100644 index 00000000..2e4a8676 --- /dev/null +++ b/fs/jffs2/jffs2_fs_i.h @@ -0,0 +1,56 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_I +#define _JFFS2_FS_I + +#include +#include +#include + +struct jffs2_inode_info { + /* We need an internal mutex similar to inode->i_mutex. 
+ Unfortunately, we can't use the existing one, because + either the GC would deadlock, or we'd have to release it + before letting GC proceed. Or we'd have to put ugliness + into the GC code so it didn't attempt to obtain the i_mutex + for the inode(s) which are already locked */ + struct mutex sem; + + /* The highest (datanode) version number used for this ino */ + uint32_t highest_version; + + /* List of data fragments which make up the file */ + struct rb_root fragtree; + + /* There may be one datanode which isn't referenced by any of the + above fragments, if it contains a metadata update but no actual + data - or if this is a directory inode */ + /* This also holds the _only_ dnode for symlinks/device nodes, + etc. */ + struct jffs2_full_dnode *metadata; + + /* Directory entries */ + struct jffs2_full_dirent *dents; + + /* The target path if this is the inode of a symlink */ + unsigned char *target; + + /* Some stuff we just have to keep in-core at all times, for each inode. */ + struct jffs2_inode_cache *inocache; + + uint16_t flags; + uint8_t usercompr; + struct inode vfs_inode; +}; + +#endif /* _JFFS2_FS_I */ diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h new file mode 100644 index 00000000..0bc6a6c8 --- /dev/null +++ b/fs/jffs2/jffs2_fs_sb.h @@ -0,0 +1,147 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_SB +#define _JFFS2_FS_SB + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define JFFS2_SB_FLAG_RO 1 +#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */ +#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */ + +struct jffs2_inodirty; + +/* A struct for the overall file system control. Pointers to + jffs2_sb_info structs are named `c' in the source code. + Nee jffs_control +*/ +struct jffs2_sb_info { + struct mtd_info *mtd; + + uint32_t highest_ino; + uint32_t checked_ino; + + unsigned int flags; + + struct task_struct *gc_task; /* GC task struct */ + struct completion gc_thread_start; /* GC thread start completion */ + struct completion gc_thread_exit; /* GC thread exit completion port */ + + struct mutex alloc_sem; /* Used to protect all the following + fields, and also to protect against + out-of-order writing of nodes. And GC. */ + uint32_t cleanmarker_size; /* Size of an _inline_ CLEANMARKER + (i.e. zero for OOB CLEANMARKER) */ + + uint32_t flash_size; + uint32_t used_size; + uint32_t dirty_size; + uint32_t wasted_size; + uint32_t free_size; + uint32_t erasing_size; + uint32_t bad_size; + uint32_t sector_size; + uint32_t unchecked_size; + + uint32_t nr_free_blocks; + uint32_t nr_erasing_blocks; + + /* Number of free blocks there must be before we... */ + uint8_t resv_blocks_write; /* ... allow a normal filesystem write */ + uint8_t resv_blocks_deletion; /* ... allow a normal filesystem deletion */ + uint8_t resv_blocks_gctrigger; /* ... wake up the GC thread */ + uint8_t resv_blocks_gcbad; /* ... pick a block from the bad_list to GC */ + uint8_t resv_blocks_gcmerge; /* ... merge pages when garbage collecting */ + /* Number of 'very dirty' blocks before we trigger immediate GC */ + uint8_t vdirty_blocks_gctrigger; + + uint32_t nospc_dirty_size; + + uint32_t nr_blocks; + struct jffs2_eraseblock *blocks; /* The whole array of blocks.
Used for getting blocks + * from the offset (blocks[ofs / sector_size]) */ + struct jffs2_eraseblock *nextblock; /* The block we're currently filling */ + + struct jffs2_eraseblock *gcblock; /* The block we're currently garbage-collecting */ + + struct list_head clean_list; /* Blocks 100% full of clean data */ + struct list_head very_dirty_list; /* Blocks with lots of dirty space */ + struct list_head dirty_list; /* Blocks with some dirty space */ + struct list_head erasable_list; /* Blocks which are completely dirty, and need erasing */ + struct list_head erasable_pending_wbuf_list; /* Blocks which need erasing but only after the current wbuf is flushed */ + struct list_head erasing_list; /* Blocks which are currently erasing */ + struct list_head erase_checking_list; /* Blocks which are being checked and marked */ + struct list_head erase_pending_list; /* Blocks which need erasing now */ + struct list_head erase_complete_list; /* Blocks which are erased and need the clean marker written to them */ + struct list_head free_list; /* Blocks which are free and ready to be used */ + struct list_head bad_list; /* Bad blocks. */ + struct list_head bad_used_list; /* Bad blocks with valid data in. */ + + spinlock_t erase_completion_lock; /* Protect free_list and erasing_list + against erase completion handler */ + wait_queue_head_t erase_wait; /* For waiting for erases to complete */ + + wait_queue_head_t inocache_wq; + int inocache_hashsize; + struct jffs2_inode_cache **inocache_list; + spinlock_t inocache_lock; + + /* Sem to allow jffs2_garbage_collect_deletion_dirent to + drop the erase_completion_lock while it's holding a pointer + to an obsoleted node. I don't like this. Alternatives welcomed. */ + struct mutex erase_free_sem; + + uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */ + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + unsigned char *wbuf_verify; /* read-back buffer for verification */ +#endif +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + unsigned char *wbuf; /* Write-behind buffer for NAND flash */ + uint32_t wbuf_ofs; + uint32_t wbuf_len; + struct jffs2_inodirty *wbuf_inodes; + struct rw_semaphore wbuf_sem; /* Protects the write buffer */ + + unsigned char *oobbuf; + int oobavail; /* How many bytes are available for JFFS2 in OOB */ +#endif + + struct jffs2_summary *summary; /* Summary information */ + +#ifdef CONFIG_JFFS2_FS_XATTR +#define XATTRINDEX_HASHSIZE (57) + uint32_t highest_xid; + uint32_t highest_xseqno; + struct list_head xattrindex[XATTRINDEX_HASHSIZE]; + struct list_head xattr_unchecked; + struct list_head xattr_dead_list; + struct jffs2_xattr_ref *xref_dead_list; + struct jffs2_xattr_ref *xref_temp; + struct rw_semaphore xattr_sem; + uint32_t xdatum_mem_usage; + uint32_t xdatum_mem_threshold; +#endif + /* OS-private pointer for getting back to master superblock info */ + void *os_priv; +}; + +#endif /* _JFFS2_FS_SB */ diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c new file mode 100644 index 00000000..c0828689 --- /dev/null +++ b/fs/jffs2/malloc.c @@ -0,0 +1,318 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include "nodelist.h" + +/* These are initialised to NULL in the kernel startup code. 
+ If you're porting to other operating systems, beware */ +static struct kmem_cache *full_dnode_slab; +static struct kmem_cache *raw_dirent_slab; +static struct kmem_cache *raw_inode_slab; +static struct kmem_cache *tmp_dnode_info_slab; +static struct kmem_cache *raw_node_ref_slab; +static struct kmem_cache *node_frag_slab; +static struct kmem_cache *inode_cache_slab; +#ifdef CONFIG_JFFS2_FS_XATTR +static struct kmem_cache *xattr_datum_cache; +static struct kmem_cache *xattr_ref_cache; +#endif + +int __init jffs2_create_slab_caches(void) +{ + full_dnode_slab = kmem_cache_create("jffs2_full_dnode", + sizeof(struct jffs2_full_dnode), + 0, 0, NULL); + if (!full_dnode_slab) + goto err; + + raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", + sizeof(struct jffs2_raw_dirent), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!raw_dirent_slab) + goto err; + + raw_inode_slab = kmem_cache_create("jffs2_raw_inode", + sizeof(struct jffs2_raw_inode), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!raw_inode_slab) + goto err; + + tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode", + sizeof(struct jffs2_tmp_dnode_info), + 0, 0, NULL); + if (!tmp_dnode_info_slab) + goto err; + + raw_node_ref_slab = kmem_cache_create("jffs2_refblock", + sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1), + 0, 0, NULL); + if (!raw_node_ref_slab) + goto err; + + node_frag_slab = kmem_cache_create("jffs2_node_frag", + sizeof(struct jffs2_node_frag), + 0, 0, NULL); + if (!node_frag_slab) + goto err; + + inode_cache_slab = kmem_cache_create("jffs2_inode_cache", + sizeof(struct jffs2_inode_cache), + 0, 0, NULL); + if (!inode_cache_slab) + goto err; + +#ifdef CONFIG_JFFS2_FS_XATTR + xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum", + sizeof(struct jffs2_xattr_datum), + 0, 0, NULL); + if (!xattr_datum_cache) + goto err; + + xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref", + sizeof(struct jffs2_xattr_ref), + 0, 0, NULL); + if (!xattr_ref_cache) + goto err; +#endif + + return 0; + err: + jffs2_destroy_slab_caches(); + return -ENOMEM; +} + +void jffs2_destroy_slab_caches(void) +{ + if(full_dnode_slab) + kmem_cache_destroy(full_dnode_slab); + if(raw_dirent_slab) + kmem_cache_destroy(raw_dirent_slab); + if(raw_inode_slab) + kmem_cache_destroy(raw_inode_slab); + if(tmp_dnode_info_slab) + kmem_cache_destroy(tmp_dnode_info_slab); + if(raw_node_ref_slab) + kmem_cache_destroy(raw_node_ref_slab); + if(node_frag_slab) + kmem_cache_destroy(node_frag_slab); + if(inode_cache_slab) + kmem_cache_destroy(inode_cache_slab); +#ifdef CONFIG_JFFS2_FS_XATTR + if (xattr_datum_cache) + kmem_cache_destroy(xattr_datum_cache); + if (xattr_ref_cache) + kmem_cache_destroy(xattr_ref_cache); +#endif +} + +struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize) +{ + struct jffs2_full_dirent *ret; + ret = kmalloc(sizeof(struct jffs2_full_dirent) + namesize, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_full_dirent(struct jffs2_full_dirent *x) +{ + dbg_memalloc("%p\n", x); + kfree(x); +} + +struct jffs2_full_dnode *jffs2_alloc_full_dnode(void) +{ + struct jffs2_full_dnode *ret; + ret = kmem_cache_alloc(full_dnode_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_full_dnode(struct jffs2_full_dnode *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(full_dnode_slab, x); +} + +struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void) +{ + struct jffs2_raw_dirent *ret; + ret = kmem_cache_alloc(raw_dirent_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void 
jffs2_free_raw_dirent(struct jffs2_raw_dirent *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_dirent_slab, x); +} + +struct jffs2_raw_inode *jffs2_alloc_raw_inode(void) +{ + struct jffs2_raw_inode *ret; + ret = kmem_cache_alloc(raw_inode_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_raw_inode(struct jffs2_raw_inode *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_inode_slab, x); +} + +struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void) +{ + struct jffs2_tmp_dnode_info *ret; + ret = kmem_cache_alloc(tmp_dnode_info_slab, GFP_KERNEL); + dbg_memalloc("%p\n", + ret); + return ret; +} + +void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(tmp_dnode_info_slab, x); +} + +static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void) +{ + struct jffs2_raw_node_ref *ret; + + ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL); + if (ret) { + int i = 0; + for (i=0; i < REFS_PER_BLOCK; i++) { + ret[i].flash_offset = REF_EMPTY_NODE; + ret[i].next_in_ino = NULL; + } + ret[i].flash_offset = REF_LINK_NODE; + ret[i].next_in_ino = NULL; + } + return ret; +} + +int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int nr) +{ + struct jffs2_raw_node_ref **p, *ref; + int i = nr; + + dbg_memalloc("%d\n", nr); + + p = &jeb->last_node; + ref = *p; + + dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset); + + /* If jeb->last_node is really a valid node then skip over it */ + if (ref && ref->flash_offset != REF_EMPTY_NODE) + ref++; + + while (i) { + if (!ref) { + dbg_memalloc("Allocating new refblock linked from %p\n", p); + ref = *p = jffs2_alloc_refblock(); + if (!ref) + return -ENOMEM; + } + if (ref->flash_offset == REF_LINK_NODE) { + p = &ref->next_in_ino; + ref = *p; + continue; + } + i--; + ref++; + } + jeb->allocated_refs = nr; + + dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n", + nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset, + jeb->last_node->next_in_ino); + + return 0; +} + +void jffs2_free_refblock(struct jffs2_raw_node_ref *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_node_ref_slab, x); +} + +struct jffs2_node_frag *jffs2_alloc_node_frag(void) +{ + struct jffs2_node_frag *ret; + ret = kmem_cache_alloc(node_frag_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_node_frag(struct jffs2_node_frag *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(node_frag_slab, x); +} + +struct jffs2_inode_cache *jffs2_alloc_inode_cache(void) +{ + struct jffs2_inode_cache *ret; + ret = kmem_cache_alloc(inode_cache_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_inode_cache(struct jffs2_inode_cache *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(inode_cache_slab, x); +} + +#ifdef CONFIG_JFFS2_FS_XATTR +struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) +{ + struct jffs2_xattr_datum *xd; + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); + dbg_memalloc("%p\n", xd); + + xd->class = RAWNODE_CLASS_XATTR_DATUM; + xd->node = (void *)xd; + INIT_LIST_HEAD(&xd->xindex); + return xd; +} + +void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) +{ + dbg_memalloc("%p\n", xd); + kmem_cache_free(xattr_datum_cache, xd); +} + +struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) +{ + struct jffs2_xattr_ref *ref; + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); + dbg_memalloc("%p\n", ref); + + ref->class = 
RAWNODE_CLASS_XATTR_REF; + ref->node = (void *)ref; + return ref; +} + +void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref) +{ + dbg_memalloc("%p\n", ref); + kmem_cache_free(xattr_ref_cache, ref); +} +#endif diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c new file mode 100644 index 00000000..5e03233c --- /dev/null +++ b/fs/jffs2/nodelist.c @@ -0,0 +1,771 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, + struct jffs2_node_frag *this); + +void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list) +{ + struct jffs2_full_dirent **prev = list; + + dbg_dentlist("add dirent \"%s\", ino #%u\n", new->name, new->ino); + + while ((*prev) && (*prev)->nhash <= new->nhash) { + if ((*prev)->nhash == new->nhash && !strcmp((*prev)->name, new->name)) { + /* Duplicate. Free one */ + if (new->version < (*prev)->version) { + dbg_dentlist("Eep! Marking new dirent node obsolete, old is \"%s\", ino #%u\n", + (*prev)->name, (*prev)->ino); + jffs2_mark_node_obsolete(c, new->raw); + jffs2_free_full_dirent(new); + } else { + dbg_dentlist("marking old dirent \"%s\", ino #%u obsolete\n", + (*prev)->name, (*prev)->ino); + new->next = (*prev)->next; + /* It may have been a 'placeholder' deletion dirent, + if jffs2_can_mark_obsolete() (see jffs2_do_unlink()) */ + if ((*prev)->raw) + jffs2_mark_node_obsolete(c, ((*prev)->raw)); + jffs2_free_full_dirent(*prev); + *prev = new; + } + return; + } + prev = &((*prev)->next); + } + new->next = *prev; + *prev = new; +} + +uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size) +{ + struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size); + + dbg_fragtree("truncating fragtree to 0x%08x bytes\n", size); + + /* We know frag->ofs <= size. That's what lookup does for us */ + if (frag && frag->ofs != size) { + if (frag->ofs+frag->size > size) { + frag->size = size - frag->ofs; + } + frag = frag_next(frag); + } + while (frag && frag->ofs >= size) { + struct jffs2_node_frag *next = frag_next(frag); + + frag_erase(frag, list); + jffs2_obsolete_node_frag(c, frag); + frag = next; + } + + if (size == 0) + return 0; + + frag = frag_last(list); + + /* Sanity check for truncation to longer than we started with... */ + if (!frag) + return 0; + if (frag->ofs + frag->size < size) + return frag->ofs + frag->size; + + /* If the last fragment starts at the RAM page boundary, it is + * REF_PRISTINE irrespective of its size. */ + if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) { + dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n", + frag->ofs, frag->ofs + frag->size); + frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE; + } + return size; +} + +static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, + struct jffs2_node_frag *this) +{ + if (this->node) { + this->node->frags--; + if (!this->node->frags) { + /* The node has no valid frags left. 
It's totally obsoleted */ + dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) obsolete\n", + ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size); + jffs2_mark_node_obsolete(c, this->node->raw); + jffs2_free_full_dnode(this->node); + } else { + dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) REF_NORMAL. frags is %d\n", + ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size, this->node->frags); + mark_ref_normal(this->node->raw); + } + + } + jffs2_free_node_frag(this); +} + +static void jffs2_fragtree_insert(struct jffs2_node_frag *newfrag, struct jffs2_node_frag *base) +{ + struct rb_node *parent = &base->rb; + struct rb_node **link = &parent; + + dbg_fragtree2("insert frag (0x%04x-0x%04x)\n", newfrag->ofs, newfrag->ofs + newfrag->size); + + while (*link) { + parent = *link; + base = rb_entry(parent, struct jffs2_node_frag, rb); + + if (newfrag->ofs > base->ofs) + link = &base->rb.rb_right; + else if (newfrag->ofs < base->ofs) + link = &base->rb.rb_left; + else { + JFFS2_ERROR("duplicate frag at %08x (%p,%p)\n", newfrag->ofs, newfrag, base); + BUG(); + } + } + + rb_link_node(&newfrag->rb, &base->rb, link); +} + +/* + * Allocates and initializes a new fragment. + */ +static struct jffs2_node_frag * new_fragment(struct jffs2_full_dnode *fn, uint32_t ofs, uint32_t size) +{ + struct jffs2_node_frag *newfrag; + + newfrag = jffs2_alloc_node_frag(); + if (likely(newfrag)) { + newfrag->ofs = ofs; + newfrag->size = size; + newfrag->node = fn; + } else { + JFFS2_ERROR("cannot allocate a jffs2_node_frag object\n"); + } + + return newfrag; +} + +/* + * Called when no overlapping fragment exists. Inserts a hole before the new + * fragment and inserts the new fragment into the fragtree. + */ +static int no_overlapping_node(struct jffs2_sb_info *c, struct rb_root *root, + struct jffs2_node_frag *newfrag, + struct jffs2_node_frag *this, uint32_t lastend) +{ + if (lastend < newfrag->node->ofs) { + /* put a hole in before the new fragment */ + struct jffs2_node_frag *holefrag; + + holefrag = new_fragment(NULL, lastend, newfrag->node->ofs - lastend); + if (unlikely(!holefrag)) { + jffs2_free_node_frag(newfrag); + return -ENOMEM; + } + + if (this) { + /* By definition, the 'this' node has no right-hand child, + because there are no frags with offset greater than it. + So that's where we want to put the hole */ + dbg_fragtree2("add hole frag %#04x-%#04x on the right of the new frag.\n", + holefrag->ofs, holefrag->ofs + holefrag->size); + rb_link_node(&holefrag->rb, &this->rb, &this->rb.rb_right); + } else { + dbg_fragtree2("Add hole frag %#04x-%#04x to the root of the tree.\n", + holefrag->ofs, holefrag->ofs + holefrag->size); + rb_link_node(&holefrag->rb, NULL, &root->rb_node); + } + rb_insert_color(&holefrag->rb, root); + this = holefrag; + } + + if (this) { + /* By definition, the 'this' node has no right-hand child, + because there are no frags with offset greater than it.
+ So that's where we want to put new fragment */ + dbg_fragtree2("add the new node at the right\n"); + rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right); + } else { + dbg_fragtree2("insert the new node at the root of the tree\n"); + rb_link_node(&newfrag->rb, NULL, &root->rb_node); + } + rb_insert_color(&newfrag->rb, root); + + return 0; +} + +/* Doesn't set inode->i_size */ +static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *root, struct jffs2_node_frag *newfrag) +{ + struct jffs2_node_frag *this; + uint32_t lastend; + + /* Skip all the nodes which are completed before this one starts */ + this = jffs2_lookup_node_frag(root, newfrag->node->ofs); + + if (this) { + dbg_fragtree2("lookup gave frag 0x%04x-0x%04x; phys 0x%08x (*%p)\n", + this->ofs, this->ofs+this->size, this->node?(ref_offset(this->node->raw)):0xffffffff, this); + lastend = this->ofs + this->size; + } else { + dbg_fragtree2("lookup gave no frag\n"); + lastend = 0; + } + + /* See if we ran off the end of the fragtree */ + if (lastend <= newfrag->ofs) { + /* We did */ + + /* Check if 'this' node was on the same page as the new node. + If so, both 'this' and the new node get marked REF_NORMAL so + the GC can take a look. + */ + if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) { + if (this->node) + mark_ref_normal(this->node->raw); + mark_ref_normal(newfrag->node->raw); + } + + return no_overlapping_node(c, root, newfrag, this, lastend); + } + + if (this->node) + dbg_fragtree2("dealing with frag %u-%u, phys %#08x(%d).\n", + this->ofs, this->ofs + this->size, + ref_offset(this->node->raw), ref_flags(this->node->raw)); + else + dbg_fragtree2("dealing with hole frag %u-%u.\n", + this->ofs, this->ofs + this->size); + + /* OK. 'this' is pointing at the first frag that newfrag->ofs at least partially obsoletes, + * - i.e. newfrag->ofs < this->ofs+this->size && newfrag->ofs >= this->ofs + */ + if (newfrag->ofs > this->ofs) { + /* This node isn't completely obsoleted. The start of it remains valid */ + + /* Mark the new node and the partially covered node REF_NORMAL -- let + the GC take a look at them */ + mark_ref_normal(newfrag->node->raw); + if (this->node) + mark_ref_normal(this->node->raw); + + if (this->ofs + this->size > newfrag->ofs + newfrag->size) { + /* The new node splits 'this' frag into two */ + struct jffs2_node_frag *newfrag2; + + if (this->node) + dbg_fragtree2("split old frag 0x%04x-0x%04x, phys 0x%08x\n", + this->ofs, this->ofs+this->size, ref_offset(this->node->raw)); + else + dbg_fragtree2("split old hole frag 0x%04x-0x%04x\n", + this->ofs, this->ofs+this->size); + + /* New second frag pointing to this's node */ + newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size, + this->ofs + this->size - newfrag->ofs - newfrag->size); + if (unlikely(!newfrag2)) + return -ENOMEM; + if (this->node) + this->node->frags++; + + /* Adjust size of original 'this' */ + this->size = newfrag->ofs - this->ofs; + + /* Now, we know there's no node with offset + greater than this->ofs but smaller than + newfrag2->ofs or newfrag->ofs, for obvious + reasons. So we can do a tree insert from + 'this' to insert newfrag, and a tree insert + from newfrag to insert newfrag2. 
*/ + jffs2_fragtree_insert(newfrag, this); + rb_insert_color(&newfrag->rb, root); + + jffs2_fragtree_insert(newfrag2, newfrag); + rb_insert_color(&newfrag2->rb, root); + + return 0; + } + /* New node just reduces 'this' frag in size, doesn't split it */ + this->size = newfrag->ofs - this->ofs; + + /* Again, we know it lives down here in the tree */ + jffs2_fragtree_insert(newfrag, this); + rb_insert_color(&newfrag->rb, root); + } else { + /* New frag starts at the same point as 'this' used to. Replace + it in the tree without doing a delete and insertion */ + dbg_fragtree2("inserting newfrag (*%p),%d-%d in before 'this' (*%p),%d-%d\n", + newfrag, newfrag->ofs, newfrag->ofs+newfrag->size, this, this->ofs, this->ofs+this->size); + + rb_replace_node(&this->rb, &newfrag->rb, root); + + if (newfrag->ofs + newfrag->size >= this->ofs+this->size) { + dbg_fragtree2("obsoleting node frag %p (%x-%x)\n", this, this->ofs, this->ofs+this->size); + jffs2_obsolete_node_frag(c, this); + } else { + this->ofs += newfrag->size; + this->size -= newfrag->size; + + jffs2_fragtree_insert(this, newfrag); + rb_insert_color(&this->rb, root); + return 0; + } + } + /* OK, now we have newfrag added in the correct place in the tree, but + frag_next(newfrag) may be a fragment which is overlapped by it + */ + while ((this = frag_next(newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) { + /* 'this' frag is obsoleted completely. */ + dbg_fragtree2("obsoleting node frag %p (%x-%x) and removing from tree\n", + this, this->ofs, this->ofs+this->size); + rb_erase(&this->rb, root); + jffs2_obsolete_node_frag(c, this); + } + /* Now we're pointing at the first frag which isn't totally obsoleted by + the new frag */ + + if (!this || newfrag->ofs + newfrag->size == this->ofs) + return 0; + + /* Still some overlap but we don't need to move it in the tree */ + this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size); + this->ofs = newfrag->ofs + newfrag->size; + + /* And mark them REF_NORMAL so the GC takes a look at them */ + if (this->node) + mark_ref_normal(this->node->raw); + mark_ref_normal(newfrag->node->raw); + + return 0; +} + +/* + * Given an inode, probably with existing tree of fragments, add the new node + * to the fragment tree. + */ +int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn) +{ + int ret; + struct jffs2_node_frag *newfrag; + + if (unlikely(!fn->size)) + return 0; + + newfrag = new_fragment(fn, fn->ofs, fn->size); + if (unlikely(!newfrag)) + return -ENOMEM; + newfrag->node->frags = 1; + + dbg_fragtree("adding node %#04x-%#04x @0x%08x on flash, newfrag *%p\n", + fn->ofs, fn->ofs+fn->size, ref_offset(fn->raw), newfrag); + + ret = jffs2_add_frag_to_fragtree(c, &f->fragtree, newfrag); + if (unlikely(ret)) + return ret; + + /* If we now share a page with other nodes, mark either previous + or next node REF_NORMAL, as appropriate. 
*/ + if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) { + struct jffs2_node_frag *prev = frag_prev(newfrag); + + mark_ref_normal(fn->raw); + /* If we don't start at zero there's _always_ a previous */ + if (prev->node) + mark_ref_normal(prev->node->raw); + } + + if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) { + struct jffs2_node_frag *next = frag_next(newfrag); + + if (next) { + mark_ref_normal(fn->raw); + if (next->node) + mark_ref_normal(next->node->raw); + } + } + jffs2_dbg_fragtree_paranoia_check_nolock(f); + + return 0; +} + +void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state) +{ + spin_lock(&c->inocache_lock); + ic->state = state; + wake_up(&c->inocache_wq); + spin_unlock(&c->inocache_lock); +} + +/* During mount, this needs no locking. During normal operation, its + callers want to do other stuff while still holding the inocache_lock. + Rather than introducing special case get_ino_cache functions or + callbacks, we just let the caller do the locking itself. */ + +struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inode_cache *ret; + + ret = c->inocache_list[ino % c->inocache_hashsize]; + while (ret && ret->ino < ino) { + ret = ret->next; + } + + if (ret && ret->ino != ino) + ret = NULL; + + return ret; +} + +void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new) +{ + struct jffs2_inode_cache **prev; + + spin_lock(&c->inocache_lock); + if (!new->ino) + new->ino = ++c->highest_ino; + + dbg_inocache("add %p (ino #%u)\n", new, new->ino); + + prev = &c->inocache_list[new->ino % c->inocache_hashsize]; + + while ((*prev) && (*prev)->ino < new->ino) { + prev = &(*prev)->next; + } + new->next = *prev; + *prev = new; + + spin_unlock(&c->inocache_lock); +} + +void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) +{ + struct jffs2_inode_cache **prev; + +#ifdef CONFIG_JFFS2_FS_XATTR + BUG_ON(old->xref); +#endif + dbg_inocache("del %p (ino #%u)\n", old, old->ino); + spin_lock(&c->inocache_lock); + + prev = &c->inocache_list[old->ino % c->inocache_hashsize]; + + while ((*prev) && (*prev)->ino < old->ino) { + prev = &(*prev)->next; + } + if ((*prev) == old) { + *prev = old->next; + } + + /* Free it now unless it's in READING or CLEARING state, which + are the transitions upon read_inode() and clear_inode(). The + rest of the time we know nobody else is looking at it, and + if it's held by read_inode() or clear_inode() they'll free it + for themselves. 
*/ + if (old->state != INO_STATE_READING && old->state != INO_STATE_CLEARING) + jffs2_free_inode_cache(old); + + spin_unlock(&c->inocache_lock); +} + +void jffs2_free_ino_caches(struct jffs2_sb_info *c) +{ + int i; + struct jffs2_inode_cache *this, *next; + + for (i=0; i < c->inocache_hashsize; i++) { + this = c->inocache_list[i]; + while (this) { + next = this->next; + jffs2_xattr_free_inode(c, this); + jffs2_free_inode_cache(this); + this = next; + } + c->inocache_list[i] = NULL; + } +} + +void jffs2_free_raw_node_refs(struct jffs2_sb_info *c) +{ + int i; + struct jffs2_raw_node_ref *this, *next; + + for (i=0; i < c->nr_blocks; i++) { + this = c->blocks[i].first_node; + while (this) { + if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE) + next = this[REFS_PER_BLOCK].next_in_ino; + else + next = NULL; + + jffs2_free_refblock(this); + this = next; + } + c->blocks[i].first_node = c->blocks[i].last_node = NULL; + } +} + +struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset) +{ + /* The common case in lookup is that there will be a node + which precisely matches. So we go looking for that first */ + struct rb_node *next; + struct jffs2_node_frag *prev = NULL; + struct jffs2_node_frag *frag = NULL; + + dbg_fragtree2("root %p, offset %d\n", fragtree, offset); + + next = fragtree->rb_node; + + while(next) { + frag = rb_entry(next, struct jffs2_node_frag, rb); + + if (frag->ofs + frag->size <= offset) { + /* Remember the closest smaller match on the way down */ + if (!prev || frag->ofs > prev->ofs) + prev = frag; + next = frag->rb.rb_right; + } else if (frag->ofs > offset) { + next = frag->rb.rb_left; + } else { + return frag; + } + } + + /* Exact match not found. Go back up looking at each parent, + and return the closest smaller one */ + + if (prev) + dbg_fragtree2("no match. Returning frag %#04x-%#04x, closest previous\n", + prev->ofs, prev->ofs+prev->size); + else + dbg_fragtree2("returning NULL, empty fragtree\n"); + + return prev; +} + +/* Pass 'c' argument to indicate that nodes should be marked obsolete as + they're killed. */ +void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) +{ + struct jffs2_node_frag *frag; + struct jffs2_node_frag *parent; + + if (!root->rb_node) + return; + + dbg_fragtree("killing\n"); + + frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb)); + while(frag) { + if (frag->rb.rb_left) { + frag = frag_left(frag); + continue; + } + if (frag->rb.rb_right) { + frag = frag_right(frag); + continue; + } + + if (frag->node && !(--frag->node->frags)) { + /* Not a hole, and it's the final remaining frag + of this node.
Free the node */ + if (c) + jffs2_mark_node_obsolete(c, frag->node->raw); + + jffs2_free_full_dnode(frag->node); + } + parent = frag_parent(frag); + if (parent) { + if (frag_left(parent) == frag) + parent->rb.rb_left = NULL; + else + parent->rb.rb_right = NULL; + } + + jffs2_free_node_frag(frag); + frag = parent; + + cond_resched(); + } +} + +struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + struct jffs2_raw_node_ref *ref; + + BUG_ON(!jeb->allocated_refs); + jeb->allocated_refs--; + + ref = jeb->last_node; + + dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset, + ref->next_in_ino); + + while (ref->flash_offset != REF_EMPTY_NODE) { + if (ref->flash_offset == REF_LINK_NODE) + ref = ref->next_in_ino; + else + ref++; + } + + dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, + ref->flash_offset, ofs, ref->next_in_ino, len); + + ref->flash_offset = ofs; + + if (!jeb->first_node) { + jeb->first_node = ref; + BUG_ON(ref_offset(ref) != jeb->offset); + } else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) { + uint32_t last_len = ref_totlen(c, jeb, jeb->last_node); + + JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n", + ref, ref_offset(ref), ref_offset(ref)+len, + ref_offset(jeb->last_node), + ref_offset(jeb->last_node)+last_len); + BUG(); + } + jeb->last_node = ref; + + if (ic) { + ref->next_in_ino = ic->nodes; + ic->nodes = ref; + } else { + ref->next_in_ino = NULL; + } + + switch(ref_flags(ref)) { + case REF_UNCHECKED: + c->unchecked_size += len; + jeb->unchecked_size += len; + break; + + case REF_NORMAL: + case REF_PRISTINE: + c->used_size += len; + jeb->used_size += len; + break; + + case REF_OBSOLETE: + c->dirty_size += len; + jeb->dirty_size += len; + break; + } + c->free_size -= len; + jeb->free_size -= len; + +#ifdef TEST_TOTLEN + /* Set (and test) __totlen field... for now */ + ref->__totlen = len; + ref_totlen(c, jeb, ref); +#endif + return ref; +} + +/* No locking, no reservation of 'ref'. Do not use on a live file system */ +int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + uint32_t size) +{ + if (!size) + return 0; + if (unlikely(size > jeb->free_size)) { + printk(KERN_CRIT "Dirty space 0x%x larger than free_size 0x%x (wasted 0x%x)\n", + size, jeb->free_size, jeb->wasted_size); + BUG(); + } + /* REF_EMPTY_NODE is !obsolete, so that works OK */ + if (jeb->last_node && ref_obsolete(jeb->last_node)) { +#ifdef TEST_TOTLEN + jeb->last_node->__totlen += size; +#endif + c->dirty_size += size; + c->free_size -= size; + jeb->dirty_size += size; + jeb->free_size -= size; + } else { + uint32_t ofs = jeb->offset + c->sector_size - jeb->free_size; + ofs |= REF_OBSOLETE; + + jffs2_link_node_ref(c, jeb, ofs, size, NULL); + } + + return 0; +} + +/* Calculate totlen from surrounding nodes or eraseblock */ +static inline uint32_t __ref_totlen(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref) +{ + uint32_t ref_end; + struct jffs2_raw_node_ref *next_ref = ref_next(ref); + + if (next_ref) + ref_end = ref_offset(next_ref); + else { + if (!jeb) + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + + /* Last node in block.
Use free_space */ + if (unlikely(ref != jeb->last_node)) { + printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n", + ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0); + BUG(); + } + ref_end = jeb->offset + c->sector_size - jeb->free_size; + } + return ref_end - ref_offset(ref); +} + +uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref) +{ + uint32_t ret; + + ret = __ref_totlen(c, jeb, ref); + +#ifdef TEST_TOTLEN + if (unlikely(ret != ref->__totlen)) { + if (!jeb) + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + + printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n", + ref, ref_offset(ref), ref_offset(ref)+ref->__totlen, + ret, ref->__totlen); + if (ref_next(ref)) { + printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", ref_next(ref), ref_offset(ref_next(ref)), + ref_offset(ref_next(ref))+ref->__totlen); + } else + printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node); + + printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size); + +#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) + __jffs2_dbg_dump_node_refs_nolock(c, jeb); +#endif + + WARN_ON(1); + + ret = ref->__totlen; + } +#endif /* TEST_TOTLEN */ + return ret; +} diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h new file mode 100644 index 00000000..e4619b00 --- /dev/null +++ b/fs/jffs2/nodelist.h @@ -0,0 +1,480 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef __JFFS2_NODELIST_H__ +#define __JFFS2_NODELIST_H__ + +#include +#include +#include +#include "jffs2_fs_sb.h" +#include "jffs2_fs_i.h" +#include "xattr.h" +#include "acl.h" +#include "summary.h" + +#ifdef __ECOS +#include "os-ecos.h" +#else +#include "os-linux.h" +#endif + +#define JFFS2_NATIVE_ENDIAN + +/* Note we handle mode bits conversion from JFFS2 (i.e. Linux) to/from + whatever OS we're actually running on here too. 
*/ + +#if defined(JFFS2_NATIVE_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){x}) +#define cpu_to_je32(x) ((jint32_t){x}) +#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)}) + +#define constant_cpu_to_je16(x) ((jint16_t){x}) +#define constant_cpu_to_je32(x) ((jint32_t){x}) + +#define je16_to_cpu(x) ((x).v16) +#define je32_to_cpu(x) ((x).v32) +#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m)) +#elif defined(JFFS2_BIG_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){cpu_to_be16(x)}) +#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)}) +#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))}) + +#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)}) +#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)}) + +#define je16_to_cpu(x) (be16_to_cpu(x.v16)) +#define je32_to_cpu(x) (be32_to_cpu(x.v32)) +#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m))) +#elif defined(JFFS2_LITTLE_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){cpu_to_le16(x)}) +#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)}) +#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))}) + +#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)}) +#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)}) + +#define je16_to_cpu(x) (le16_to_cpu(x.v16)) +#define je32_to_cpu(x) (le32_to_cpu(x.v32)) +#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m))) +#else +#error wibble +#endif + +/* The minimal node header size */ +#define JFFS2_MIN_NODE_HEADER sizeof(struct jffs2_raw_dirent) + +/* + This is all we need to keep in-core for each raw node during normal + operation. As and when we do read_inode on a particular inode, we can + scan the nodes which are listed for it and build up a proper map of + which nodes are currently valid. JFFSv1 always used to keep that whole + map in core for each inode. +*/ +struct jffs2_raw_node_ref +{ + struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref + for this object. If this _is_ the last, it points to the inode_cache, + xattr_ref or xattr_datum instead. The common part of those structures + has NULL in the first word. See jffs2_raw_ref_to_ic() below */ + uint32_t flash_offset; +#undef TEST_TOTLEN +#ifdef TEST_TOTLEN + uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */ +#endif +}; + +#define REF_LINK_NODE ((int32_t)-1) +#define REF_EMPTY_NODE ((int32_t)-2) + +/* Use blocks of about 256 bytes */ +#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1) + +static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref) +{ + ref++; + + /* Link to another block of refs */ + if (ref->flash_offset == REF_LINK_NODE) { + ref = ref->next_in_ino; + if (!ref) + return ref; + } + + /* End of chain */ + if (ref->flash_offset == REF_EMPTY_NODE) + return NULL; + + return ref; +} + +static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw) +{ + while(raw->next_in_ino) + raw = raw->next_in_ino; + + /* NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and + not actually a jffs2_inode_cache. Check ->class */ + return ((struct jffs2_inode_cache *)raw); +} + + /* flash_offset & 3 always has to be zero, because nodes are + always aligned at 4 bytes. 
So we have a couple of extra bits + to play with, which indicate the node's status; see below: */ +#define REF_UNCHECKED 0 /* We haven't yet checked the CRC or built its inode */ +#define REF_OBSOLETE 1 /* Obsolete, can be completely ignored */ +#define REF_PRISTINE 2 /* Completely clean. GC without looking */ +#define REF_NORMAL 3 /* Possibly overlapped. Read the page and write again on GC */ +#define ref_flags(ref) ((ref)->flash_offset & 3) +#define ref_offset(ref) ((ref)->flash_offset & ~3) +#define ref_obsolete(ref) (((ref)->flash_offset & 3) == REF_OBSOLETE) +#define mark_ref_normal(ref) do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0) + +/* Dirent nodes should be REF_PRISTINE only if they are not a deletion + dirent. Deletion dirents should be REF_NORMAL so that GC gets to + throw them away when appropriate */ +#define dirent_node_state(rd) ( (je32_to_cpu((rd)->ino)?REF_PRISTINE:REF_NORMAL) ) + +/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates + it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get + copied. If you need to do anything different to GC inode-less nodes, then + you need to modify gc.c accordingly. */ + +/* For each inode in the filesystem, we need to keep a record of + nlink, because it would be a PITA to scan the whole directory tree + at read_inode() time to calculate it, and to keep sufficient information + in the raw_node_ref (basically both parent and child inode number for + dirent nodes) would take more space than this does. We also keep + a pointer to the first physical node which is part of this inode, too. +*/ +struct jffs2_inode_cache { + /* First part of structure is shared with other objects which + can terminate the raw node refs' next_in_ino list -- which + currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */ + + struct jffs2_full_dirent *scan_dents; /* Used during scan to hold + temporary lists of dirents, and later must be set to + NULL to mark the end of the raw_node_ref->next_in_ino + chain. */ + struct jffs2_raw_node_ref *nodes; + uint8_t class; /* It's used for identification */ + + /* end of shared structure */ + + uint8_t flags; + uint16_t state; + uint32_t ino; + struct jffs2_inode_cache *next; +#ifdef CONFIG_JFFS2_FS_XATTR + struct jffs2_xattr_ref *xref; +#endif + uint32_t pino_nlink; /* Directories store parent inode + here; other inodes store nlink. + Zero always means that it's + completely unlinked. */ +}; + +/* Inode states for 'state' above. We need the 'GC' state to prevent + someone from doing a read_inode() while we're moving a 'REF_PRISTINE' + node without going through all the iget() nonsense */ +#define INO_STATE_UNCHECKED 0 /* CRC checks not yet done */ +#define INO_STATE_CHECKING 1 /* CRC checks in progress */ +#define INO_STATE_PRESENT 2 /* In core */ +#define INO_STATE_CHECKEDABSENT 3 /* Checked, cleared again */ +#define INO_STATE_GC 4 /* GCing a 'pristine' node */ +#define INO_STATE_READING 5 /* In read_inode() */ +#define INO_STATE_CLEARING 6 /* In clear_inode() */ + +#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */ + +#define RAWNODE_CLASS_INODE_CACHE 0 +#define RAWNODE_CLASS_XATTR_DATUM 1 +#define RAWNODE_CLASS_XATTR_REF 2 + +#define INOCACHE_HASHSIZE_MIN 128 +#define INOCACHE_HASHSIZE_MAX 1024 + +#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) + +/* + Larger representation of a raw node, kept in-core only when the + struct inode for this particular ino is instantiated. 
+*/ + +struct jffs2_full_dnode +{ + struct jffs2_raw_node_ref *raw; + uint32_t ofs; /* The offset to which the data of this node belongs */ + uint32_t size; + uint32_t frags; /* Number of fragments which currently refer + to this node. When this reaches zero, + the node is obsolete. */ +}; + +/* + Even larger representation of a raw node, kept in-core only while + we're actually building up the original map of which nodes go where, + in read_inode() +*/ +struct jffs2_tmp_dnode_info +{ + struct rb_node rb; + struct jffs2_full_dnode *fn; + uint32_t version; + uint32_t data_crc; + uint32_t partial_crc; + uint16_t csize; + uint16_t overlapped; +}; + +/* Temporary data structure used during readinode. */ +struct jffs2_readinode_info +{ + struct rb_root tn_root; + struct jffs2_tmp_dnode_info *mdata_tn; + uint32_t highest_version; + uint32_t latest_mctime; + uint32_t mctime_ver; + struct jffs2_full_dirent *fds; + struct jffs2_raw_node_ref *latest_ref; +}; + +struct jffs2_full_dirent +{ + struct jffs2_raw_node_ref *raw; + struct jffs2_full_dirent *next; + uint32_t version; + uint32_t ino; /* == zero for unlink */ + unsigned int nhash; + unsigned char type; + unsigned char name[0]; +}; + +/* + Fragments - used to build a map of which raw node to obtain + data from for each part of the ino +*/ +struct jffs2_node_frag +{ + struct rb_node rb; + struct jffs2_full_dnode *node; /* NULL for holes */ + uint32_t size; + uint32_t ofs; /* The offset to which this fragment belongs */ +}; + +struct jffs2_eraseblock +{ + struct list_head list; + int bad_count; + uint32_t offset; /* of this block in the MTD */ + + uint32_t unchecked_size; + uint32_t used_size; + uint32_t dirty_size; + uint32_t wasted_size; + uint32_t free_size; /* Note that sector_size - free_size + is the address of the first free space */ + uint32_t allocated_refs; + struct jffs2_raw_node_ref *first_node; + struct jffs2_raw_node_ref *last_node; + + struct jffs2_raw_node_ref *gc_node; /* Next node to be garbage collected */ +}; + +static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c) +{ + return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024); +} + +#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c)) + +#define ALLOC_NORMAL 0 /* Normal allocation */ +#define ALLOC_DELETION 1 /* Deletion node. Best to allow it */ +#define ALLOC_GC 2 /* Space requested for GC. 
Give it or die */ +#define ALLOC_NORETRY 3 /* For jffs2_write_dnode: On failure, return -EAGAIN instead of retrying */ + +/* How much dirty space before it goes on the very_dirty_list */ +#define VERYDIRTY(c, size) ((size) >= ((c)->sector_size / 2)) + +/* check if dirty space is more than 255 Byte */ +#define ISDIRTY(size) ((size) > sizeof (struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN) + +#define PAD(x) (((x)+3)&~3) + +static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) +{ + if (old_valid_dev(rdev)) { + jdev->old_id = cpu_to_je16(old_encode_dev(rdev)); + return sizeof(jdev->old_id); + } else { + jdev->new_id = cpu_to_je32(new_encode_dev(rdev)); + return sizeof(jdev->new_id); + } +} + +static inline struct jffs2_node_frag *frag_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + + if (!node) + return NULL; + + return rb_entry(node, struct jffs2_node_frag, rb); +} + +static inline struct jffs2_node_frag *frag_last(struct rb_root *root) +{ + struct rb_node *node = rb_last(root); + + if (!node) + return NULL; + + return rb_entry(node, struct jffs2_node_frag, rb); +} + +#define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_left(frag) rb_entry((frag)->rb.rb_left, struct jffs2_node_frag, rb) +#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb) +#define frag_erase(frag, list) rb_erase(&frag->rb, list); + +#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb) +#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb) +#define tn_erase(tn, list) rb_erase(&tn->rb, list); +#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb) +#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb) + +/* nodelist.c */ +void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list); +void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state); +struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino); +void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new); +void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old); +void jffs2_free_ino_caches(struct jffs2_sb_info *c); +void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); +struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); +void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); +int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); +uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); +struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic); +extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref); + +/* nodemgmt.c */ +int 
jffs2_thread_should_wake(struct jffs2_sb_info *c); +int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, int prio, uint32_t sumsize); +int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize); +struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic); +void jffs2_complete_reservation(struct jffs2_sb_info *c); +void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw); + +/* write.c */ +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri); + +struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const unsigned char *data, + uint32_t datalen, int alloc_mode); +struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_dirent *rd, const unsigned char *name, + uint32_t namelen, int alloc_mode); +int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, unsigned char *buf, + uint32_t offset, uint32_t writelen, uint32_t *retlen); +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const struct qstr *qstr); +int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, + int namelen, struct jffs2_inode_info *dead_f, uint32_t time); +int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, + uint8_t type, const char *name, int namelen, uint32_t time); + + +/* readinode.c */ +int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t ino, struct jffs2_raw_inode *latest_node); +int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); + +/* malloc.c */ +int jffs2_create_slab_caches(void); +void jffs2_destroy_slab_caches(void); + +struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize); +void jffs2_free_full_dirent(struct jffs2_full_dirent *); +struct jffs2_full_dnode *jffs2_alloc_full_dnode(void); +void jffs2_free_full_dnode(struct jffs2_full_dnode *); +struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void); +void jffs2_free_raw_dirent(struct jffs2_raw_dirent *); +struct jffs2_raw_inode *jffs2_alloc_raw_inode(void); +void jffs2_free_raw_inode(struct jffs2_raw_inode *); +struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void); +void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *); +int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int nr); +void jffs2_free_refblock(struct jffs2_raw_node_ref *); +struct jffs2_node_frag *jffs2_alloc_node_frag(void); +void jffs2_free_node_frag(struct jffs2_node_frag *); +struct jffs2_inode_cache *jffs2_alloc_inode_cache(void); +void jffs2_free_inode_cache(struct jffs2_inode_cache *); +#ifdef CONFIG_JFFS2_FS_XATTR +struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void); +void jffs2_free_xattr_datum(struct jffs2_xattr_datum *); +struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void); +void jffs2_free_xattr_ref(struct jffs2_xattr_ref *); +#endif + +/* gc.c */ +int jffs2_garbage_collect_pass(struct jffs2_sb_info *c); + +/* read.c */ +int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info 
*f, + struct jffs2_full_dnode *fd, unsigned char *buf, + int ofs, int len); +int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *buf, uint32_t offset, uint32_t len); +char *jffs2_getlink(struct jffs2_sb_info *c, struct jffs2_inode_info *f); + +/* scan.c */ +int jffs2_scan_medium(struct jffs2_sb_info *c); +void jffs2_rotate_lists(struct jffs2_sb_info *c); +struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size); + +/* build.c */ +int jffs2_do_mount_fs(struct jffs2_sb_info *c); + +/* erase.c */ +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); +void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER +/* wbuf.c */ +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +#endif + +#include "debug.h" + +#endif /* __JFFS2_NODELIST_H__ */ diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c new file mode 100644 index 00000000..694aa5b0 --- /dev/null +++ b/fs/jffs2/nodemgmt.c @@ -0,0 +1,787 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include /* For cond_resched() */ +#include "nodelist.h" +#include "debug.h" + +/** + * jffs2_reserve_space - request physical space to write nodes to flash + * @c: superblock info + * @minsize: Minimum acceptable size of allocation + * @len: Returned value of allocation length + * @prio: Allocation type - ALLOC_{NORMAL,DELETION} + * + * Requests a block of physical space on the flash. Returns zero for success + * and puts 'len' into the appropriate place, or returns -ENOSPC or other + * error if appropriate. Doesn't return len since that's + * + * If it returns zero, jffs2_reserve_space() also downs the per-filesystem + * allocation semaphore, to prevent more than one allocation from being + * active at any time. The semaphore is later released by jffs2_commit_allocation() + * + * jffs2_reserve_space() may trigger garbage collection in order to make room + * for the requested allocation. + */ + +static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize); + +int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, int prio, uint32_t sumsize) +{ + int ret = -EAGAIN; + int blocksneeded = c->resv_blocks_write; + /* align it */ + minsize = PAD(minsize); + + D1(printk(KERN_DEBUG "jffs2_reserve_space(): Requested 0x%x bytes\n", minsize)); + mutex_lock(&c->alloc_sem); + + D1(printk(KERN_DEBUG "jffs2_reserve_space(): alloc sem got\n")); + + spin_lock(&c->erase_completion_lock); + + /* this needs a little more thought (true :)) */ + while(ret == -EAGAIN) { + while(c->nr_free_blocks + c->nr_erasing_blocks < blocksneeded) { + uint32_t dirty, avail; + + /* calculate real dirty size + * dirty_size contains blocks on erase_pending_list + * those blocks are counted in c->nr_erasing_blocks. 
+ * If one block is actually erased, it is not longer counted as dirty_space + * but it is counted in c->nr_erasing_blocks, so we add it and subtract it + * with c->nr_erasing_blocks * c->sector_size again. + * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks + * This helps us to force gc and pick eventually a clean block to spread the load. + * We add unchecked_size here, as we hopefully will find some space to use. + * This will affect the sum only once, as gc first finishes checking + * of nodes. + */ + dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size; + if (dirty < c->nospc_dirty_size) { + if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { + D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. Allowing...\n")); + break; + } + D1(printk(KERN_DEBUG "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n", + dirty, c->unchecked_size, c->sector_size)); + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + /* Calc possibly available space. Possibly available means that we + * don't know, if unchecked size contains obsoleted nodes, which could give us some + * more usable space. This will affect the sum only once, as gc first finishes checking + * of nodes. + + Return -ENOSPC, if the maximum possibly available space is less or equal than + * blocksneeded * sector_size. + * This blocks endless gc looping on a filesystem, which is nearly full, even if + * the check above passes. + */ + avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size; + if ( (avail / c->sector_size) <= blocksneeded) { + if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { + D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n")); + break; + } + + D1(printk(KERN_DEBUG "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n", + avail, blocksneeded * c->sector_size)); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + mutex_unlock(&c->alloc_sem); + + D1(printk(KERN_DEBUG "Triggering GC pass. 
nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n", + c->nr_free_blocks, c->nr_erasing_blocks, c->free_size, c->dirty_size, c->wasted_size, c->used_size, c->erasing_size, c->bad_size, + c->free_size + c->dirty_size + c->wasted_size + c->used_size + c->erasing_size + c->bad_size, c->flash_size)); + spin_unlock(&c->erase_completion_lock); + + ret = jffs2_garbage_collect_pass(c); + + if (ret == -EAGAIN) { + spin_lock(&c->erase_completion_lock); + if (c->nr_erasing_blocks && + list_empty(&c->erase_pending_list) && + list_empty(&c->erase_complete_list)) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&c->erase_wait, &wait); + D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__)); + spin_unlock(&c->erase_completion_lock); + + schedule(); + } else + spin_unlock(&c->erase_completion_lock); + } else if (ret) + return ret; + + cond_resched(); + + if (signal_pending(current)) + return -EINTR; + + mutex_lock(&c->alloc_sem); + spin_lock(&c->erase_completion_lock); + } + + ret = jffs2_do_reserve_space(c, minsize, len, sumsize); + if (ret) { + D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret)); + } + } + spin_unlock(&c->erase_completion_lock); + if (!ret) + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + if (ret) + mutex_unlock(&c->alloc_sem); + return ret; +} + +int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize) +{ + int ret = -EAGAIN; + minsize = PAD(minsize); + + D1(printk(KERN_DEBUG "jffs2_reserve_space_gc(): Requested 0x%x bytes\n", minsize)); + + spin_lock(&c->erase_completion_lock); + while(ret == -EAGAIN) { + ret = jffs2_do_reserve_space(c, minsize, len, sumsize); + if (ret) { + D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret)); + } + } + spin_unlock(&c->erase_completion_lock); + if (!ret) + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + + return ret; +} + + +/* Classify nextblock (clean, dirty of verydirty) and force to select an other one */ + +static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + + if (c->nextblock == NULL) { + D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n", + jeb->offset)); + return; + } + /* Check, if we have a dirty block now, or if it was dirty already */ + if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) { + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->dirty_size += jeb->wasted_size; + jeb->wasted_size = 0; + if (VERYDIRTY(c, jeb->dirty_size)) { + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + list_add_tail(&jeb->list, &c->very_dirty_list); + } else { + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + list_add_tail(&jeb->list, &c->dirty_list); + } + } else { + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + list_add_tail(&jeb->list, &c->clean_list); + } + c->nextblock = NULL; + +} + +/* Select a new jeb for nextblock */ + +static int 
jffs2_find_nextblock(struct jffs2_sb_info *c) +{ + struct list_head *next; + + /* Take the next block off the 'free' list */ + + if (list_empty(&c->free_list)) { + + if (!c->nr_erasing_blocks && + !list_empty(&c->erasable_list)) { + struct jffs2_eraseblock *ejeb; + + ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); + list_move_tail(&ejeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", + ejeb->offset)); + } + + if (!c->nr_erasing_blocks && + !list_empty(&c->erasable_pending_wbuf_list)) { + D1(printk(KERN_DEBUG "jffs2_find_nextblock: Flushing write buffer\n")); + /* c->nextblock is NULL, no update to c->nextblock allowed */ + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + /* Have another go. It'll be on the erasable_list now */ + return -EAGAIN; + } + + if (!c->nr_erasing_blocks) { + /* Ouch. We're in GC, or we wouldn't have got here. + And there's no space left. At all. */ + printk(KERN_CRIT "Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n", + c->nr_erasing_blocks, c->nr_free_blocks, list_empty(&c->erasable_list)?"yes":"no", + list_empty(&c->erasing_list)?"yes":"no", list_empty(&c->erase_pending_list)?"yes":"no"); + return -ENOSPC; + } + + spin_unlock(&c->erase_completion_lock); + /* Don't wait for it; just erase one right now */ + jffs2_erase_pending_blocks(c, 1); + spin_lock(&c->erase_completion_lock); + + /* An erase may have failed, decreasing the + amount of free space available. So we must + restart from the beginning */ + return -EAGAIN; + } + + next = c->free_list.next; + list_del(next); + c->nextblock = list_entry(next, struct jffs2_eraseblock, list); + c->nr_free_blocks--; + + jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; +#endif + + D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); + + return 0; +} + +/* Called with alloc sem _and_ erase_completion_lock */ +static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize) +{ + struct jffs2_eraseblock *jeb = c->nextblock; + uint32_t reserved_size; /* for summary information at the end of the jeb */ + int ret; + + restart: + reserved_size = 0; + + if (jffs2_sum_active() && (sumsize != JFFS2_SUMMARY_NOSUM_SIZE)) { + /* NOSUM_SIZE means not to generate summary */ + + if (jeb) { + reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE); + dbg_summary("minsize=%d , jeb->free=%d ," + "summary->size=%d , sumsize=%d\n", + minsize, jeb->free_size, + c->summary->sum_size, sumsize); + } + + /* Is there enough space for writing out the current node, or we have to + write out summary information now, close this jeb and select new nextblock? */ + if (jeb && (PAD(minsize) + PAD(c->summary->sum_size + sumsize + + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size)) { + + /* Has summary been disabled for this jeb? 
*/ + if (jffs2_sum_is_disabled(c->summary)) { + sumsize = JFFS2_SUMMARY_NOSUM_SIZE; + goto restart; + } + + /* Writing out the collected summary information */ + dbg_summary("generating summary for 0x%08x.\n", jeb->offset); + ret = jffs2_sum_write_sumnode(c); + + if (ret) + return ret; + + if (jffs2_sum_is_disabled(c->summary)) { + /* jffs2_write_sumnode() couldn't write out the summary information + diabling summary for this jeb and free the collected information + */ + sumsize = JFFS2_SUMMARY_NOSUM_SIZE; + goto restart; + } + + jffs2_close_nextblock(c, jeb); + jeb = NULL; + /* keep always valid value in reserved_size */ + reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE); + } + } else { + if (jeb && minsize > jeb->free_size) { + uint32_t waste; + + /* Skip the end of this block and file it as having some dirty space */ + /* If there's a pending write to it, flush now */ + + if (jffs2_wbuf_dirty(c)) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "jffs2_do_reserve_space: Flushing write buffer\n")); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + jeb = c->nextblock; + goto restart; + } + + spin_unlock(&c->erase_completion_lock); + + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + if (ret) + return ret; + /* Just lock it again and continue. Nothing much can change because + we hold c->alloc_sem anyway. In fact, it's not entirely clear why + we hold c->erase_completion_lock in the majority of this function... + but that's a question for another (more caffeine-rich) day. */ + spin_lock(&c->erase_completion_lock); + + waste = jeb->free_size; + jffs2_link_node_ref(c, jeb, + (jeb->offset + c->sector_size - waste) | REF_OBSOLETE, + waste, NULL); + /* FIXME: that made it count as dirty. Convert to wasted */ + jeb->dirty_size -= waste; + c->dirty_size -= waste; + jeb->wasted_size += waste; + c->wasted_size += waste; + + jffs2_close_nextblock(c, jeb); + jeb = NULL; + } + } + + if (!jeb) { + + ret = jffs2_find_nextblock(c); + if (ret) + return ret; + + jeb = c->nextblock; + + if (jeb->free_size != c->sector_size - c->cleanmarker_size) { + printk(KERN_WARNING "Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n", jeb->offset, jeb->free_size); + goto restart; + } + } + /* OK, jeb (==c->nextblock) is now pointing at a block which definitely has + enough space */ + *len = jeb->free_size - reserved_size; + + if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size && + !jeb->first_node->next_in_ino) { + /* Only node in it beforehand was a CLEANMARKER node (we think). + So mark it obsolete now that there's going to be another node + in the block. This will reduce used_size to zero but We've + already set c->nextblock so that jffs2_mark_node_obsolete() + won't try to refile it to the dirty_list. + */ + spin_unlock(&c->erase_completion_lock); + jffs2_mark_node_obsolete(c, jeb->first_node); + spin_lock(&c->erase_completion_lock); + } + + D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n", + *len, jeb->offset + (c->sector_size - jeb->free_size))); + return 0; +} + +/** + * jffs2_add_physical_node_ref - add a physical node reference to the list + * @c: superblock info + * @new: new node reference to add + * @len: length of this physical node + * + * Should only be used to report nodes for which space has been allocated + * by jffs2_reserve_space. + * + * Must be called with the alloc_sem held. 
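+ *
+ * A rough sketch of the usual calling sequence (variable names such as
+ * 'buf', 'totlen', 'alloclen' and 'sumsize' are illustrative placeholders,
+ * not definitions from this file; retlen checking is omitted). 'sumsize'
+ * is the summary-size hint also accepted by jffs2_reserve_space(), e.g.
+ * JFFS2_SUMMARY_NOSUM_SIZE to disable summary collection:
+ *
+ *	ret = jffs2_reserve_space(c, totlen, &alloclen, ALLOC_NORMAL, sumsize);
+ *	if (ret)
+ *		return ret;
+ *	ofs = write_ofs(c);
+ *	ret = jffs2_flash_write(c, ofs, totlen, &retlen, buf);
+ *	if (!ret)
+ *		ref = jffs2_add_physical_node_ref(c, ofs | REF_NORMAL,
+ *						  PAD(totlen), ic);
+ *	jffs2_complete_reservation(c);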
+ */ + +struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *new; + + jeb = &c->blocks[ofs / c->sector_size]; + + D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", + ofs & ~3, ofs & 3, len)); +#if 1 + /* Allow non-obsolete nodes only to be added at the end of c->nextblock, + if c->nextblock is set. Note that wbuf.c will file obsolete nodes + even after refiling c->nextblock */ + if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE)) + && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) { + printk(KERN_WARNING "argh. node added in wrong place at 0x%08x(%d)\n", ofs & ~3, ofs & 3); + if (c->nextblock) + printk(KERN_WARNING "nextblock 0x%08x", c->nextblock->offset); + else + printk(KERN_WARNING "No nextblock"); + printk(", expected at %08x\n", jeb->offset + (c->sector_size - jeb->free_size)); + return ERR_PTR(-EINVAL); + } +#endif + spin_lock(&c->erase_completion_lock); + + new = jffs2_link_node_ref(c, jeb, ofs, len, ic); + + if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) { + /* If it lives on the dirty_list, jffs2_reserve_space will put it there */ + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + if (jffs2_wbuf_dirty(c)) { + /* Flush the last write in the block if it's outstanding */ + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + } + + list_add_tail(&jeb->list, &c->clean_list); + c->nextblock = NULL; + } + jffs2_dbg_acct_sanity_check_nolock(c,jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + spin_unlock(&c->erase_completion_lock); + + return new; +} + + +void jffs2_complete_reservation(struct jffs2_sb_info *c) +{ + D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); +} + +static inline int on_list(struct list_head *obj, struct list_head *head) +{ + struct list_head *this; + + list_for_each(this, head) { + if (this == obj) { + D1(printk("%p is on list at %p\n", obj, head)); + return 1; + + } + } + return 0; +} + +void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref) +{ + struct jffs2_eraseblock *jeb; + int blocknr; + struct jffs2_unknown_node n; + int ret, addedsize; + size_t retlen; + uint32_t freed_len; + + if(unlikely(!ref)) { + printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n"); + return; + } + if (ref_obsolete(ref)) { + D1(printk(KERN_DEBUG "jffs2_mark_node_obsolete called with already obsolete node at 0x%08x\n", ref_offset(ref))); + return; + } + blocknr = ref->flash_offset / c->sector_size; + if (blocknr >= c->nr_blocks) { + printk(KERN_NOTICE "raw node at 0x%08x is off the end of device!\n", ref->flash_offset); + BUG(); + } + jeb = &c->blocks[blocknr]; + + if (jffs2_can_mark_obsolete(c) && !jffs2_is_readonly(c) && + !(c->flags & (JFFS2_SB_FLAG_SCANNING | JFFS2_SB_FLAG_BUILDING))) { + /* Hm. This may confuse static lock analysis. If any of the above + three conditions is false, we're going to return from this + function without actually obliterating any nodes or freeing + any jffs2_raw_node_refs. 
So we don't need to stop erases from + happening, or protect against people holding an obsolete + jffs2_raw_node_ref without the erase_completion_lock. */ + mutex_lock(&c->erase_free_sem); + } + + spin_lock(&c->erase_completion_lock); + + freed_len = ref_totlen(c, jeb, ref); + + if (ref_flags(ref) == REF_UNCHECKED) { + D1(if (unlikely(jeb->unchecked_size < freed_len)) { + printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n", + freed_len, blocknr, ref->flash_offset, jeb->used_size); + BUG(); + }) + D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len)); + jeb->unchecked_size -= freed_len; + c->unchecked_size -= freed_len; + } else { + D1(if (unlikely(jeb->used_size < freed_len)) { + printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n", + freed_len, blocknr, ref->flash_offset, jeb->used_size); + BUG(); + }) + D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len)); + jeb->used_size -= freed_len; + c->used_size -= freed_len; + } + + // Take care, that wasted size is taken into concern + if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) { + D1(printk("Dirtying\n")); + addedsize = freed_len; + jeb->dirty_size += freed_len; + c->dirty_size += freed_len; + + /* Convert wasted space to dirty, if not a bad block */ + if (jeb->wasted_size) { + if (on_list(&jeb->list, &c->bad_used_list)) { + D1(printk(KERN_DEBUG "Leaving block at %08x on the bad_used_list\n", + jeb->offset)); + addedsize = 0; /* To fool the refiling code later */ + } else { + D1(printk(KERN_DEBUG "Converting %d bytes of wasted space to dirty in block at %08x\n", + jeb->wasted_size, jeb->offset)); + addedsize += jeb->wasted_size; + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + } + } + } else { + D1(printk("Wasting\n")); + addedsize = 0; + jeb->wasted_size += freed_len; + c->wasted_size += freed_len; + } + ref->flash_offset = ref_offset(ref) | REF_OBSOLETE; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + if (c->flags & JFFS2_SB_FLAG_SCANNING) { + /* Flash scanning is in progress. Don't muck about with the block + lists because they're not ready yet, and don't actually + obliterate nodes that look obsolete. If they weren't + marked obsolete on the flash at the time they _became_ + obsolete, there was probably a reason for that. */ + spin_unlock(&c->erase_completion_lock); + /* We didn't lock the erase_free_sem */ + return; + } + + if (jeb == c->nextblock) { + D2(printk(KERN_DEBUG "Not moving nextblock 0x%08x to dirty/erase_pending list\n", jeb->offset)); + } else if (!jeb->used_size && !jeb->unchecked_size) { + if (jeb == c->gcblock) { + D1(printk(KERN_DEBUG "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n", jeb->offset)); + c->gcblock = NULL; + } else { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n", jeb->offset)); + list_del(&jeb->list); + } + if (jffs2_wbuf_dirty(c)) { + D1(printk(KERN_DEBUG "...and adding to erasable_pending_wbuf_list\n")); + list_add_tail(&jeb->list, &c->erasable_pending_wbuf_list); + } else { + if (jiffies & 127) { + /* Most of the time, we just erase it immediately. Otherwise we + spend ages scanning it on mount, etc. 
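+					   (jiffies & 127 is non-zero 127 times
+					   out of 128, so roughly one block in
+					   128 takes the erasable_list path in
+					   the else branch below; the rest go
+					   straight onto the erase_pending_list.)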
*/ + D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); + list_add_tail(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } else { + /* Sometimes, however, we leave it elsewhere so it doesn't get + immediately reused, and we spread the load a bit. */ + D1(printk(KERN_DEBUG "...and adding to erasable_list\n")); + list_add_tail(&jeb->list, &c->erasable_list); + } + } + D1(printk(KERN_DEBUG "Done OK\n")); + } else if (jeb == c->gcblock) { + D2(printk(KERN_DEBUG "Not moving gcblock 0x%08x to dirty_list\n", jeb->offset)); + } else if (ISDIRTY(jeb->dirty_size) && !ISDIRTY(jeb->dirty_size - addedsize)) { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n", jeb->offset)); + list_del(&jeb->list); + D1(printk(KERN_DEBUG "...and adding to dirty_list\n")); + list_add_tail(&jeb->list, &c->dirty_list); + } else if (VERYDIRTY(c, jeb->dirty_size) && + !VERYDIRTY(c, jeb->dirty_size - addedsize)) { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n", jeb->offset)); + list_del(&jeb->list); + D1(printk(KERN_DEBUG "...and adding to very_dirty_list\n")); + list_add_tail(&jeb->list, &c->very_dirty_list); + } else { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + } + + spin_unlock(&c->erase_completion_lock); + + if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c) || + (c->flags & JFFS2_SB_FLAG_BUILDING)) { + /* We didn't lock the erase_free_sem */ + return; + } + + /* The erase_free_sem is locked, and has been since before we marked the node obsolete + and potentially put its eraseblock onto the erase_pending_list. Thus, we know that + the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet + by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */ + + D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref))); + ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); + if (ret) { + printk(KERN_WARNING "Read error reading from obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret); + goto out_erase_sem; + } + if (retlen != sizeof(n)) { + printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen); + goto out_erase_sem; + } + if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) { + printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len); + goto out_erase_sem; + } + if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) { + D1(printk(KERN_DEBUG "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n", ref_offset(ref), je16_to_cpu(n.nodetype))); + goto out_erase_sem; + } + /* XXX FIXME: This is ugly now */ + n.nodetype = cpu_to_je16(je16_to_cpu(n.nodetype) & ~JFFS2_NODE_ACCURATE); + ret = jffs2_flash_write(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); + if (ret) { + printk(KERN_WARNING "Write error in obliterating obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret); + goto out_erase_sem; + } + if (retlen != sizeof(n)) { + printk(KERN_WARNING "Short write in obliterating obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen); + goto out_erase_sem; + } + + /* Nodes which have been marked obsolete no longer need to be + associated with any inode. Remove them from the per-inode list. 
+ + Note we can't do this for NAND at the moment because we need + obsolete dirent nodes to stay on the lists, because of the + horridness in jffs2_garbage_collect_deletion_dirent(). Also + because we delete the inocache, and on NAND we need that to + stay around until all the nodes are actually erased, in order + to stop us from giving the same inode number to another newly + created inode. */ + if (ref->next_in_ino) { + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref **p; + + spin_lock(&c->erase_completion_lock); + + ic = jffs2_raw_ref_to_ic(ref); + for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino)) + ; + + *p = ref->next_in_ino; + ref->next_in_ino = NULL; + + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) + jffs2_del_ino_cache(c, ic); + break; + } + spin_unlock(&c->erase_completion_lock); + } + + out_erase_sem: + mutex_unlock(&c->erase_free_sem); +} + +int jffs2_thread_should_wake(struct jffs2_sb_info *c) +{ + int ret = 0; + uint32_t dirty; + int nr_very_dirty = 0; + struct jffs2_eraseblock *jeb; + + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) + return 1; + + if (c->unchecked_size) { + D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", + c->unchecked_size, c->checked_ino)); + return 1; + } + + /* dirty_size contains blocks on erase_pending_list + * those blocks are counted in c->nr_erasing_blocks. + * If one block is actually erased, it is not longer counted as dirty_space + * but it is counted in c->nr_erasing_blocks, so we add it and subtract it + * with c->nr_erasing_blocks * c->sector_size again. + * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks + * This helps us to force gc and pick eventually a clean block to spread the load. + */ + dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size; + + if (c->nr_free_blocks + c->nr_erasing_blocks < c->resv_blocks_gctrigger && + (dirty > c->nospc_dirty_size)) + ret = 1; + + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_very_dirty++; + if (nr_very_dirty == c->vdirty_blocks_gctrigger) { + ret = 1; + /* In debug mode, actually go through and count them all */ + D1(continue); + break; + } + } + + D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n", + c->nr_free_blocks, c->nr_erasing_blocks, c->dirty_size, nr_very_dirty, ret?"yes":"no")); + + return ret; +} diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h new file mode 100644 index 00000000..dce73c35 --- /dev/null +++ b/fs/jffs2/os-linux.h @@ -0,0 +1,204 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#ifndef __JFFS2_OS_LINUX_H__ +#define __JFFS2_OS_LINUX_H__ + +/* JFFS2 uses Linux mode bits natively -- no need for conversion */ +#define os_to_jffs2_mode(x) (x) +#define jffs2_to_os_mode(x) (x) + +struct kstatfs; +struct kvec; + +#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) +#define JFFS2_SB_INFO(sb) (sb->s_fs_info) +#define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) + + +#define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size) +#define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode) +#define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid) +#define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid) +#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev) + +#define ITIME(sec) ((struct timespec){sec, 0}) +#define I_SEC(tv) ((tv).tv_sec) +#define JFFS2_F_I_CTIME(f) (OFNI_EDONI_2SFFJ(f)->i_ctime.tv_sec) +#define JFFS2_F_I_MTIME(f) (OFNI_EDONI_2SFFJ(f)->i_mtime.tv_sec) +#define JFFS2_F_I_ATIME(f) (OFNI_EDONI_2SFFJ(f)->i_atime.tv_sec) + +#define sleep_on_spinunlock(wq, s) \ + do { \ + DECLARE_WAITQUEUE(__wait, current); \ + add_wait_queue((wq), &__wait); \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + spin_unlock(s); \ + schedule(); \ + remove_wait_queue((wq), &__wait); \ + } while(0) + +static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) +{ + f->highest_version = 0; + f->fragtree = RB_ROOT; + f->metadata = NULL; + f->dents = NULL; + f->target = NULL; + f->flags = 0; + f->usercompr = 0; +} + + +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) + +#define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER + + +#ifdef CONFIG_JFFS2_SUMMARY +#define jffs2_can_mark_obsolete(c) (0) +#else +#define jffs2_can_mark_obsolete(c) (1) +#endif + +#define jffs2_is_writebuffered(c) (0) +#define jffs2_cleanmarker_oob(c) (0) +#define jffs2_write_nand_cleanmarker(c,jeb) (-EIO) + +#define jffs2_flash_write(c, ofs, len, retlen, buf) jffs2_flash_direct_write(c, ofs, len, retlen, buf) +#define jffs2_flash_read(c, ofs, len, retlen, buf) ((c)->mtd->read((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_flush_wbuf_pad(c) ({ do{} while(0); (void)(c), 0; }) +#define jffs2_flush_wbuf_gc(c, i) ({ do{} while(0); (void)(c), (void) i, 0; }) +#define jffs2_write_nand_badblock(c,jeb,bad_offset) (1) +#define jffs2_nand_flash_setup(c) (0) +#define jffs2_nand_flash_cleanup(c) do {} while(0) +#define jffs2_wbuf_dirty(c) (0) +#define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e) +#define jffs2_wbuf_timeout NULL +#define jffs2_wbuf_process NULL +#define jffs2_dataflash(c) (0) +#define jffs2_dataflash_setup(c) (0) +#define jffs2_dataflash_cleanup(c) do {} while (0) +#define jffs2_nor_wbuf_flash(c) (0) +#define jffs2_nor_wbuf_flash_setup(c) (0) +#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) +#define jffs2_ubivol(c) (0) +#define jffs2_ubivol_setup(c) (0) +#define jffs2_ubivol_cleanup(c) do {} while (0) + +#else /* NAND and/or ECC'd NOR support present */ + +#define jffs2_is_writebuffered(c) (c->wbuf != NULL) + +#ifdef CONFIG_JFFS2_SUMMARY +#define jffs2_can_mark_obsolete(c) (0) +#else +#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE)) +#endif + +#define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH && (c->mtd->flags & MTD_OOB_WRITEABLE)) + +#define jffs2_flash_write_oob(c, ofs, len, retlen, buf) ((c)->mtd->write_oob((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_flash_read_oob(c, 
ofs, len, retlen, buf) ((c)->mtd->read_oob((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_wbuf_dirty(c) (!!(c)->wbuf_len) + +/* wbuf.c */ +int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino); +int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf); +int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf); +int jffs2_check_oob_empty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,int mode); +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); +void jffs2_wbuf_timeout(unsigned long data); +void jffs2_wbuf_process(void *data); +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); +int jffs2_nand_flash_setup(struct jffs2_sb_info *c); +void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c); + +#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) +int jffs2_dataflash_setup(struct jffs2_sb_info *c); +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); +#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME) +int jffs2_ubivol_setup(struct jffs2_sb_info *c); +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c); + +#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) +int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); +void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); + +#endif /* WRITEBUFFER */ + +static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c) +{ + OFNI_BS_2SFFJ(c)->s_dirt = 1; +} + +/* background.c */ +int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c); +void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c); +void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c); + +/* dir.c */ +extern const struct file_operations jffs2_dir_operations; +extern const struct inode_operations jffs2_dir_inode_operations; + +/* file.c */ +extern const struct file_operations jffs2_file_operations; +extern const struct inode_operations jffs2_file_inode_operations; +extern const struct address_space_operations jffs2_file_address_operations; +int jffs2_fsync(struct file *, int); +int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); + +/* ioctl.c */ +long jffs2_ioctl(struct file *, unsigned int, unsigned long); + +/* symlink.c */ +extern const struct inode_operations jffs2_symlink_inode_operations; + +/* fs.c */ +int jffs2_setattr (struct dentry *, struct iattr *); +int jffs2_do_setattr (struct inode *, struct iattr *); +struct inode *jffs2_iget(struct super_block *, unsigned long); +void jffs2_evict_inode (struct inode *); +void jffs2_dirty_inode(struct inode *inode, int flags); +struct inode *jffs2_new_inode (struct inode *dir_i, int mode, + struct jffs2_raw_inode *ri); +int jffs2_statfs (struct dentry *, struct kstatfs *); +int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); +void jffs2_gc_release_inode(struct jffs2_sb_info *c, + struct jffs2_inode_info *f); +struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, + int inum, int unlinked); + +unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct 
jffs2_inode_info *f, + unsigned long offset, + unsigned long *priv); +void jffs2_gc_release_page(struct jffs2_sb_info *c, + unsigned char *pg, + unsigned long *priv); +void jffs2_flash_cleanup(struct jffs2_sb_info *c); + + +/* writev.c */ +int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); +int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf); + +#endif /* __JFFS2_OS_LINUX_H__ */ + + diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c new file mode 100644 index 00000000..3f39be1b --- /dev/null +++ b/fs/jffs2/read.c @@ -0,0 +1,216 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + +int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_full_dnode *fd, unsigned char *buf, + int ofs, int len) +{ + struct jffs2_raw_inode *ri; + size_t readlen; + uint32_t crc; + unsigned char *decomprbuf = NULL; + unsigned char *readbuf = NULL; + int ret = 0; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(fd->raw), sizeof(*ri), &readlen, (char *)ri); + if (ret) { + jffs2_free_raw_inode(ri); + printk(KERN_WARNING "Error reading node from 0x%08x: %d\n", ref_offset(fd->raw), ret); + return ret; + } + if (readlen != sizeof(*ri)) { + jffs2_free_raw_inode(ri); + printk(KERN_WARNING "Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n", + ref_offset(fd->raw), sizeof(*ri), readlen); + return -EIO; + } + crc = crc32(0, ri, sizeof(*ri)-8); + + D1(printk(KERN_DEBUG "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n", + ref_offset(fd->raw), je32_to_cpu(ri->node_crc), + crc, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize), + je32_to_cpu(ri->offset), buf)); + if (crc != je32_to_cpu(ri->node_crc)) { + printk(KERN_WARNING "Node CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw)); + ret = -EIO; + goto out_ri; + } + /* There was a bug where we wrote hole nodes out with csize/dsize + swapped. Deal with it */ + if (ri->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(ri->dsize) && + je32_to_cpu(ri->csize)) { + ri->dsize = ri->csize; + ri->csize = cpu_to_je32(0); + } + + D1(if(ofs + len > je32_to_cpu(ri->dsize)) { + printk(KERN_WARNING "jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n", + len, ofs, je32_to_cpu(ri->dsize)); + ret = -EINVAL; + goto out_ri; + }); + + + if (ri->compr == JFFS2_COMPR_ZERO) { + memset(buf, 0, len); + goto out_ri; + } + + /* Cases: + Reading whole node and it's uncompressed - read directly to buffer provided, check CRC. 
+ Reading whole node and it's compressed - read into comprbuf, check CRC and decompress to buffer provided + Reading partial node and it's uncompressed - read into readbuf, check CRC, and copy + Reading partial node and it's compressed - read into readbuf, check checksum, decompress to decomprbuf and copy + */ + if (ri->compr == JFFS2_COMPR_NONE && len == je32_to_cpu(ri->dsize)) { + readbuf = buf; + } else { + readbuf = kmalloc(je32_to_cpu(ri->csize), GFP_KERNEL); + if (!readbuf) { + ret = -ENOMEM; + goto out_ri; + } + } + if (ri->compr != JFFS2_COMPR_NONE) { + if (len < je32_to_cpu(ri->dsize)) { + decomprbuf = kmalloc(je32_to_cpu(ri->dsize), GFP_KERNEL); + if (!decomprbuf) { + ret = -ENOMEM; + goto out_readbuf; + } + } else { + decomprbuf = buf; + } + } else { + decomprbuf = readbuf; + } + + D2(printk(KERN_DEBUG "Read %d bytes to %p\n", je32_to_cpu(ri->csize), + readbuf)); + ret = jffs2_flash_read(c, (ref_offset(fd->raw)) + sizeof(*ri), + je32_to_cpu(ri->csize), &readlen, readbuf); + + if (!ret && readlen != je32_to_cpu(ri->csize)) + ret = -EIO; + if (ret) + goto out_decomprbuf; + + crc = crc32(0, readbuf, je32_to_cpu(ri->csize)); + if (crc != je32_to_cpu(ri->data_crc)) { + printk(KERN_WARNING "Data CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw)); + ret = -EIO; + goto out_decomprbuf; + } + D2(printk(KERN_DEBUG "Data CRC matches calculated CRC %08x\n", crc)); + if (ri->compr != JFFS2_COMPR_NONE) { + D2(printk(KERN_DEBUG "Decompress %d bytes from %p to %d bytes at %p\n", + je32_to_cpu(ri->csize), readbuf, je32_to_cpu(ri->dsize), decomprbuf)); + ret = jffs2_decompress(c, f, ri->compr | (ri->usercompr << 8), readbuf, decomprbuf, je32_to_cpu(ri->csize), je32_to_cpu(ri->dsize)); + if (ret) { + printk(KERN_WARNING "Error: jffs2_decompress returned %d\n", ret); + goto out_decomprbuf; + } + } + + if (len < je32_to_cpu(ri->dsize)) { + memcpy(buf, decomprbuf+ofs, len); + } + out_decomprbuf: + if(decomprbuf != buf && decomprbuf != readbuf) + kfree(decomprbuf); + out_readbuf: + if(readbuf != buf) + kfree(readbuf); + out_ri: + jffs2_free_raw_inode(ri); + + return ret; +} + +int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *buf, uint32_t offset, uint32_t len) +{ + uint32_t end = offset + len; + struct jffs2_node_frag *frag; + int ret; + + D1(printk(KERN_DEBUG "jffs2_read_inode_range: ino #%u, range 0x%08x-0x%08x\n", + f->inocache->ino, offset, offset+len)); + + frag = jffs2_lookup_node_frag(&f->fragtree, offset); + + /* XXX FIXME: Where a single physical node actually shows up in two + frags, we read it twice. Don't do that. */ + /* Now we're pointing at the first frag which overlaps our page + * (or perhaps is before it, if we've been asked to read off the + * end of the file). */ + while(offset < end) { + D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end)); + if (unlikely(!frag || frag->ofs > offset || + frag->ofs + frag->size <= offset)) { + uint32_t holesize = end - offset; + if (frag && frag->ofs > offset) { + D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. 
frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset)); + holesize = min(holesize, frag->ofs - offset); + } + D1(printk(KERN_DEBUG "Filling non-frag hole from %d-%d\n", offset, offset+holesize)); + memset(buf, 0, holesize); + buf += holesize; + offset += holesize; + continue; + } else if (unlikely(!frag->node)) { + uint32_t holeend = min(end, frag->ofs + frag->size); + D1(printk(KERN_DEBUG "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n", offset, holeend, frag->ofs, frag->ofs + frag->size)); + memset(buf, 0, holeend - offset); + buf += holeend - offset; + offset = holeend; + frag = frag_next(frag); + continue; + } else { + uint32_t readlen; + uint32_t fragofs; /* offset within the frag to start reading */ + + fragofs = offset - frag->ofs; + readlen = min(frag->size - fragofs, end - offset); + D1(printk(KERN_DEBUG "Reading %d-%d from node at 0x%08x (%d)\n", + frag->ofs+fragofs, frag->ofs+fragofs+readlen, + ref_offset(frag->node->raw), ref_flags(frag->node->raw))); + ret = jffs2_read_dnode(c, f, frag->node, buf, fragofs + frag->ofs - frag->node->ofs, readlen); + D2(printk(KERN_DEBUG "node read done\n")); + if (ret) { + D1(printk(KERN_DEBUG"jffs2_read_inode_range error %d\n",ret)); + memset(buf, 0, readlen); + return ret; + } + buf += readlen; + offset += readlen; + frag = frag_next(frag); + D2(printk(KERN_DEBUG "node read was OK. Looping\n")); + } + } + return 0; +} + diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c new file mode 100644 index 00000000..2ab1a0d9 --- /dev/null +++ b/fs/jffs2/readinode.c @@ -0,0 +1,1461 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +/* + * Check the data CRC of the node. + * + * Returns: 0 if the data CRC is correct; + * 1 - if incorrect; + * error code if an error occurred. + */ +static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + struct jffs2_raw_node_ref *ref = tn->fn->raw; + int err = 0, pointed = 0; + struct jffs2_eraseblock *jeb; + unsigned char *buffer; + uint32_t crc, ofs, len; + size_t retlen; + + BUG_ON(tn->csize == 0); + + /* Calculate how many bytes were already checked */ + ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode); + len = tn->csize; + + if (jffs2_is_writebuffered(c)) { + int adj = ofs % c->wbuf_pagesize; + if (likely(adj)) + adj = c->wbuf_pagesize - adj; + + if (adj >= tn->csize) { + dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n", + ref_offset(ref), tn->csize, ofs); + goto adj_acc; + } + + ofs += adj; + len -= adj; + } + + dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n", + ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len); + +#ifndef __ECOS + /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), + * adding and jffs2_flash_read_end() interface. 
*/ + if (c->mtd->point) { + err = c->mtd->point(c->mtd, ofs, len, &retlen, + (void **)&buffer, NULL); + if (!err && retlen < len) { + JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); + c->mtd->unpoint(c->mtd, ofs, retlen); + } else if (err) + JFFS2_WARNING("MTD point failed: error code %d.\n", err); + else + pointed = 1; /* succefully pointed to device */ + } +#endif + + if (!pointed) { + buffer = kmalloc(len, GFP_KERNEL); + if (unlikely(!buffer)) + return -ENOMEM; + + /* TODO: this is very frequent pattern, make it a separate + * routine */ + err = jffs2_flash_read(c, ofs, len, &retlen, buffer); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err); + goto free_out; + } + + if (retlen != len) { + JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len); + err = -EIO; + goto free_out; + } + } + + /* Continue calculating CRC */ + crc = crc32(tn->partial_crc, buffer, len); + if(!pointed) + kfree(buffer); +#ifndef __ECOS + else + c->mtd->unpoint(c->mtd, ofs, len); +#endif + + if (crc != tn->data_crc) { + JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", + ref_offset(ref), tn->data_crc, crc); + return 1; + } + +adj_acc: + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + /* If it should be REF_NORMAL, it'll get marked as such when + we build the fragtree, shortly. No need to worry about GC + moving it while it's marked REF_PRISTINE -- GC won't happen + till we've finished checking every inode anyway. */ + ref->flash_offset |= REF_PRISTINE; + /* + * Mark the node as having been checked and fix the + * accounting accordingly. + */ + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); + + return 0; + +free_out: + if(!pointed) + kfree(buffer); +#ifndef __ECOS + else + c->mtd->unpoint(c->mtd, ofs, len); +#endif + return err; +} + +/* + * Helper function for jffs2_add_older_frag_to_fragtree(). + * + * Checks the node if we are in the checking stage. 
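+ *
+ * Returns: 0 if the node needed no check or its data CRC was correct;
+ * a positive value if the CRC was wrong (the node is marked obsolete
+ * before returning); a negative error code on read or memory failure.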
+ */ +static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + int ret; + + BUG_ON(ref_obsolete(tn->fn->raw)); + + /* We only check the data CRC of unchecked nodes */ + if (ref_flags(tn->fn->raw) != REF_UNCHECKED) + return 0; + + dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n", + tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw)); + + ret = check_node_data(c, tn); + if (unlikely(ret < 0)) { + JFFS2_ERROR("check_node_data() returned error: %d.\n", + ret); + } else if (unlikely(ret > 0)) { + dbg_readinode("CRC error, mark it obsolete.\n"); + jffs2_mark_node_obsolete(c, tn->fn->raw); + } + + return ret; +} + +static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset) +{ + struct rb_node *next; + struct jffs2_tmp_dnode_info *tn = NULL; + + dbg_readinode("root %p, offset %d\n", tn_root, offset); + + next = tn_root->rb_node; + + while (next) { + tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb); + + if (tn->fn->ofs < offset) + next = tn->rb.rb_right; + else if (tn->fn->ofs >= offset) + next = tn->rb.rb_left; + else + break; + } + + return tn; +} + + +static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + jffs2_mark_node_obsolete(c, tn->fn->raw); + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); +} +/* + * This function is used when we read an inode. Data nodes arrive in + * arbitrary order -- they may be older or newer than the nodes which + * are already in the tree. Where overlaps occur, the older node can + * be discarded as long as the newer passes the CRC check. We don't + * bother to keep track of holes in this rbtree, and neither do we deal + * with frags -- we can have multiple entries starting at the same + * offset, and the one with the smallest length will come first in the + * ordering. + * + * Returns 0 if the node was handled (including marking it obsolete) + * < 0 an if error occurred + */ +static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, + struct jffs2_readinode_info *rii, + struct jffs2_tmp_dnode_info *tn) +{ + uint32_t fn_end = tn->fn->ofs + tn->fn->size; + struct jffs2_tmp_dnode_info *this, *ptn; + + dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); + + /* If a node has zero dsize, we only have to keep if it if it might be the + node with highest version -- i.e. the one which will end up as f->metadata. + Note that such nodes won't be REF_UNCHECKED since there are no data to + check anyway. */ + if (!tn->fn->size) { + if (rii->mdata_tn) { + if (rii->mdata_tn->version < tn->version) { + /* We had a candidate mdata node already */ + dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version); + jffs2_kill_tn(c, rii->mdata_tn); + } else { + dbg_readinode("kill new mdata with ver %d (older than existing %d\n", + tn->version, rii->mdata_tn->version); + jffs2_kill_tn(c, tn); + return 0; + } + } + rii->mdata_tn = tn; + dbg_readinode("keep new mdata with ver %d\n", tn->version); + return 0; + } + + /* Find the earliest node which _may_ be relevant to this one */ + this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs); + if (this) { + /* If the node is coincident with another at a lower address, + back up until the other node is found. It may be relevant */ + while (this->overlapped) { + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. 
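The comment above jffs2_add_tn_to_tree() notes that several temporary nodes may start at the same offset and that the shortest one sorts first. The following sketch shows that ordering in isolation, using qsort() on a plain array as a stand-in for the red-black tree; struct toy_tn and the sample values are invented.

#include <stdio.h>
#include <stdlib.h>

struct toy_tn { unsigned ofs, size; };

static int tn_cmp(const void *a, const void *b)
{
	const struct toy_tn *x = a, *y = b;

	if (x->ofs != y->ofs)
		return x->ofs < y->ofs ? -1 : 1;
	if (x->size != y->size)
		return x->size < y->size ? -1 : 1;
	return 0;
}

int main(void)
{
	struct toy_tn tns[] = { { 8, 4 }, { 0, 16 }, { 8, 2 }, { 4, 4 } };
	size_t i, n = sizeof(tns) / sizeof(tns[0]);

	qsort(tns, n, sizeof(tns[0]), tn_cmp);
	for (i = 0; i < n; i++)
		printf("ofs %u size %u\n", tns[i].ofs, tns[i].size);
	/* prints 0/16, 4/4, 8/2, 8/4 -- the two nodes at offset 8 come shortest first */
	return 0;
}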
+ */ + this->overlapped = 0; + break; + } + this = ptn; + } + dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); + } + + while (this) { + if (this->fn->ofs > fn_end) + break; + dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n", + this->version, this->fn->ofs, this->fn->size); + + if (this->version == tn->version) { + /* Version number collision means REF_PRISTINE GC. Accept either of them + as long as the CRC is correct. Check the one we have already... */ + if (!check_tn_node(c, this)) { + /* The one we already had was OK. Keep it and throw away the new one */ + dbg_readinode("Like old node. Throw away new\n"); + jffs2_kill_tn(c, tn); + return 0; + } else { + /* Who cares if the new one is good; keep it for now anyway. */ + dbg_readinode("Like new node. Throw away old\n"); + rb_replace_node(&this->rb, &tn->rb, &rii->tn_root); + jffs2_kill_tn(c, this); + /* Same overlapping from in front and behind */ + return 0; + } + } + if (this->version < tn->version && + this->fn->ofs >= tn->fn->ofs && + this->fn->ofs + this->fn->size <= fn_end) { + /* New node entirely overlaps 'this' */ + if (check_tn_node(c, tn)) { + dbg_readinode("new node bad CRC\n"); + jffs2_kill_tn(c, tn); + return 0; + } + /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */ + while (this && this->fn->ofs + this->fn->size <= fn_end) { + struct jffs2_tmp_dnode_info *next = tn_next(this); + if (this->version < tn->version) { + tn_erase(this, &rii->tn_root); + dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + jffs2_kill_tn(c, this); + } + this = next; + } + dbg_readinode("Done killing overlapped nodes\n"); + continue; + } + if (this->version > tn->version && + this->fn->ofs <= tn->fn->ofs && + this->fn->ofs+this->fn->size >= fn_end) { + /* New node entirely overlapped by 'this' */ + if (!check_tn_node(c, this)) { + dbg_readinode("Good CRC on old node. Kill new\n"); + jffs2_kill_tn(c, tn); + return 0; + } + /* ... but 'this' was bad. Replace it... */ + dbg_readinode("Bad CRC on old overlapping node. Kill it\n"); + tn_erase(this, &rii->tn_root); + jffs2_kill_tn(c, this); + break; + } + + this = tn_next(this); + } + + /* We neither completely obsoleted nor were completely + obsoleted by an earlier node. Insert into the tree */ + { + struct rb_node *parent; + struct rb_node **link = &rii->tn_root.rb_node; + struct jffs2_tmp_dnode_info *insert_point = NULL; + + while (*link) { + parent = *link; + insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + if (tn->fn->ofs > insert_point->fn->ofs) + link = &insert_point->rb.rb_right; + else if (tn->fn->ofs < insert_point->fn->ofs || + tn->fn->size < insert_point->fn->size) + link = &insert_point->rb.rb_left; + else + link = &insert_point->rb.rb_right; + } + rb_link_node(&tn->rb, &insert_point->rb, link); + rb_insert_color(&tn->rb, &rii->tn_root); + } + + /* If there's anything behind that overlaps us, note it */ + this = tn_prev(tn); + if (this) { + while (1) { + if (this->fn->ofs + this->fn->size > tn->fn->ofs) { + dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n", + this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + tn->overlapped = 1; + break; + } + if (!this->overlapped) + break; + + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. 
+ */ + this->overlapped = 0; + break; + } + this = ptn; + } + } + + /* If the new node overlaps anything ahead, note it */ + this = tn_next(tn); + while (this && this->fn->ofs < fn_end) { + this->overlapped = 1; + dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + this = tn_next(this); + } + return 0; +} + +/* Trivial function to remove the last node in the tree. Which by definition + has no right-hand -- so can be removed just by making its only child (if + any) take its place under its parent. */ +static void eat_last(struct rb_root *root, struct rb_node *node) +{ + struct rb_node *parent = rb_parent(node); + struct rb_node **link; + + /* LAST! */ + BUG_ON(node->rb_right); + + if (!parent) + link = &root->rb_node; + else if (node == parent->rb_left) + link = &parent->rb_left; + else + link = &parent->rb_right; + + *link = node->rb_left; + /* Colour doesn't matter now. Only the parent pointer. */ + if (node->rb_left) + node->rb_left->rb_parent_color = node->rb_parent_color; +} + +/* We put this in reverse order, so we can just use eat_last */ +static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) +{ + struct rb_node **link = &ver_root->rb_node; + struct rb_node *parent = NULL; + struct jffs2_tmp_dnode_info *this_tn; + + while (*link) { + parent = *link; + this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + + if (tn->version > this_tn->version) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root); + rb_link_node(&tn->rb, parent, link); + rb_insert_color(&tn->rb, ver_root); +} + +/* Build final, normal fragtree from tn tree. It doesn't matter which order + we add nodes to the real fragtree, as long as they don't overlap. And + having thrown away the majority of overlapped nodes as we went, there + really shouldn't be many sets of nodes which do overlap. If we start at + the end, we can use the overlap markers -- we can just eat nodes which + aren't overlapped, and when we encounter nodes which _do_ overlap we + sort them all into a temporary tree in version order before replaying them. */ +static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_readinode_info *rii) +{ + struct jffs2_tmp_dnode_info *pen, *last, *this; + struct rb_root ver_root = RB_ROOT; + uint32_t high_ver = 0; + + if (rii->mdata_tn) { + dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn); + high_ver = rii->mdata_tn->version; + rii->latest_ref = rii->mdata_tn->fn->raw; + } +#ifdef JFFS2_DBG_READINODE_MESSAGES + this = tn_last(&rii->tn_root); + while (this) { + dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size, this->overlapped); + this = tn_prev(this); + } +#endif + pen = tn_last(&rii->tn_root); + while ((last = pen)) { + pen = tn_prev(last); + + eat_last(&rii->tn_root, &last->rb); + ver_insert(&ver_root, last); + + if (unlikely(last->overlapped)) { + if (pen) + continue; + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + last->overlapped = 0; + } + + /* Now we have a bunch of nodes in reverse version + order, in the tree at ver_root. Most of the time, + there'll actually be only one node in the 'tree', + in fact. 
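Why ascending version order matters when the collected overlapping nodes are replayed into the fragtree: if each node's bytes are applied oldest-first, the newer node simply overwrites the older one wherever they overlap. This toy model flattens the idea onto a char array; struct toy_node, the offsets and the fill characters are all invented for the sketch.

#include <stdio.h>
#include <string.h>

struct toy_node {
	unsigned version;
	unsigned ofs;
	unsigned len;
	char fill;
};

int main(void)
{
	char file[16];
	/* already sorted by version, as the ver_root tree hands them out */
	struct toy_node nodes[] = {
		{ .version = 7, .ofs = 0, .len = 12, .fill = 'a' },
		{ .version = 9, .ofs = 4, .len = 4,  .fill = 'b' },
	};
	size_t i;

	memset(file, '.', sizeof(file));
	for (i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++)
		memset(file + nodes[i].ofs, nodes[i].fill, nodes[i].len);

	printf("%.16s\n", file);    /* aaaabbbbaaaa.... */
	return 0;
}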
*/ + this = tn_last(&ver_root); + + while (this) { + struct jffs2_tmp_dnode_info *vers_next; + int ret; + vers_next = tn_prev(this); + eat_last(&ver_root, &this->rb); + if (check_tn_node(c, this)) { + dbg_readinode("node ver %d, 0x%x-0x%x failed CRC\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + jffs2_kill_tn(c, this); + } else { + if (this->version > high_ver) { + /* Note that this is different from the other + highest_version, because this one is only + counting _valid_ nodes which could give the + latest inode metadata */ + high_ver = this->version; + rii->latest_ref = this->fn->raw; + } + dbg_readinode("Add %p (v %d, 0x%x-0x%x, ov %d) to fragtree\n", + this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size, this->overlapped); + + ret = jffs2_add_full_dnode_to_inode(c, f, this->fn); + if (ret) { + /* Free the nodes in vers_root; let the caller + deal with the rest */ + JFFS2_ERROR("Add node to tree failed %d\n", ret); + while (1) { + vers_next = tn_prev(this); + if (check_tn_node(c, this)) + jffs2_mark_node_obsolete(c, this->fn->raw); + jffs2_free_full_dnode(this->fn); + jffs2_free_tmp_dnode_info(this); + this = vers_next; + if (!this) + break; + eat_last(&ver_root, &vers_next->rb); + } + return ret; + } + jffs2_free_tmp_dnode_info(this); + } + this = vers_next; + } + } + return 0; +} + +static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) +{ + struct rb_node *this; + struct jffs2_tmp_dnode_info *tn; + + this = list->rb_node; + + /* Now at bottom of tree */ + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); + + this = rb_parent(this); + if (!this) + break; + + if (this->rb_left == &tn->rb) + this->rb_left = NULL; + else if (this->rb_right == &tn->rb) + this->rb_right = NULL; + else BUG(); + } + } + *list = RB_ROOT; +} + +static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent *next; + + while (fd) { + next = fd->next; + jffs2_free_full_dirent(fd); + fd = next; + } +} + +/* Returns first valid node after 'ref'. May return 'ref' */ +static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_ref *ref) +{ + while (ref && ref->next_in_ino) { + if (!ref_obsolete(ref)) + return ref; + dbg_noderef("node at 0x%08x is obsoleted. Ignoring.\n", ref_offset(ref)); + ref = ref->next_in_ino; + } + return NULL; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an directory entry node is found. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + struct jffs2_raw_dirent *rd, size_t read, + struct jffs2_readinode_info *rii) +{ + struct jffs2_full_dirent *fd; + uint32_t crc; + + /* Obsoleted. This cannot happen, surely? 
dwmw2 20020308 */ + BUG_ON(ref_obsolete(ref)); + + crc = crc32(0, rd, sizeof(*rd) - 8); + if (unlikely(crc != je32_to_cpu(rd->node_crc))) { + JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n", + ref_offset(ref), je32_to_cpu(rd->node_crc), crc); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + /* If we've never checked the CRCs on this node, check them now */ + if (ref_flags(ref) == REF_UNCHECKED) { + struct jffs2_eraseblock *jeb; + int len; + + /* Sanity check */ + if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) { + JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n", + ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + ref->flash_offset = ref_offset(ref) | dirent_node_state(rd); + spin_unlock(&c->erase_completion_lock); + } + + fd = jffs2_alloc_full_dirent(rd->nsize + 1); + if (unlikely(!fd)) + return -ENOMEM; + + fd->raw = ref; + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->type = rd->type; + + if (fd->version > rii->highest_version) + rii->highest_version = fd->version; + + /* Pick out the mctime of the latest dirent */ + if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) { + rii->mctime_ver = fd->version; + rii->latest_mctime = je32_to_cpu(rd->mctime); + } + + /* + * Copy as much of the name as possible from the raw + * dirent we've already read from the flash. + */ + if (read > sizeof(*rd)) + memcpy(&fd->name[0], &rd->name[0], + min_t(uint32_t, rd->nsize, (read - sizeof(*rd)) )); + + /* Do we need to copy any more of the name directly from the flash? */ + if (rd->nsize + sizeof(*rd) > read) { + /* FIXME: point() */ + int err; + int already = read - sizeof(*rd); + + err = jffs2_flash_read(c, (ref_offset(ref)) + read, + rd->nsize - already, &read, &fd->name[already]); + if (unlikely(read != rd->nsize - already) && likely(!err)) + return -EIO; + + if (unlikely(err)) { + JFFS2_ERROR("read remainder of name: error %d\n", err); + jffs2_free_full_dirent(fd); + return -EIO; + } + } + + fd->nhash = full_name_hash(fd->name, rd->nsize); + fd->next = NULL; + fd->name[rd->nsize] = '\0'; + + /* + * Wheee. We now have a complete jffs2_full_dirent structure, with + * the name in it and everything. Link it into the list + */ + jffs2_add_fd_to_list(c, fd, &rii->fds); + + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an inode node is found. + * + * Returns: 0 on success (possibly after marking a bad node obsolete); + * negative error code on failure. + */ +static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + struct jffs2_raw_inode *rd, int rdlen, + struct jffs2_readinode_info *rii) +{ + struct jffs2_tmp_dnode_info *tn; + uint32_t len, csize; + int ret = 0; + uint32_t crc; + + /* Obsoleted. This cannot happen, surely? 
dwmw2 20020308 */ + BUG_ON(ref_obsolete(ref)); + + crc = crc32(0, rd, sizeof(*rd) - 8); + if (unlikely(crc != je32_to_cpu(rd->node_crc))) { + JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n", + ref_offset(ref), je32_to_cpu(rd->node_crc), crc); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + tn = jffs2_alloc_tmp_dnode_info(); + if (!tn) { + JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn)); + return -ENOMEM; + } + + tn->partial_crc = 0; + csize = je32_to_cpu(rd->csize); + + /* If we've never checked the CRCs on this node, check them now */ + if (ref_flags(ref) == REF_UNCHECKED) { + + /* Sanity checks */ + if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) || + unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) { + JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto free_out; + } + + if (jffs2_is_writebuffered(c) && csize != 0) { + /* At this point we are supposed to check the data CRC + * of our unchecked node. But thus far, we do not + * know whether the node is valid or obsolete. To + * figure this out, we need to walk all the nodes of + * the inode and build the inode fragtree. We don't + * want to spend time checking data of nodes which may + * later be found to be obsolete. So we put off the full + * data CRC checking until we have read all the inode + * nodes and have started building the fragtree. + * + * The fragtree is being built starting with nodes + * having the highest version number, so we'll be able + * to detect whether a node is valid (i.e., it is not + * overlapped by a node with higher version) or not. + * And we'll be able to check only those nodes, which + * are not obsolete. + * + * Of course, this optimization only makes sense in case + * of NAND flashes (or other flashes with + * !jffs2_can_mark_obsolete()), since on NOR flashes + * nodes are marked obsolete physically. + * + * Since NAND flashes (or other flashes with + * jffs2_is_writebuffered(c)) are anyway read by + * fractions of c->wbuf_pagesize, and we have just read + * the node header, it is likely that the starting part + * of the node data is also read when we read the + * header. So we don't mind to check the CRC of the + * starting part of the data of the node now, and check + * the second part later (in jffs2_check_node_data()). + * Of course, we will not need to re-read and re-check + * the NAND page which we have just read. This is why we + * read the whole NAND page at jffs2_get_inode_nodes(), + * while we needed only the node header. + */ + unsigned char *buf; + + /* 'buf' will point to the start of data */ + buf = (unsigned char *)rd + sizeof(*rd); + /* len will be the read data length */ + len = min_t(uint32_t, rdlen - sizeof(*rd), csize); + tn->partial_crc = crc32(0, buf, len); + + dbg_readinode("Calculates CRC (%#08x) for %d bytes, csize %d\n", tn->partial_crc, len, csize); + + /* If we actually calculated the whole data CRC + * and it is wrong, drop the node. */ + if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) { + JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", + ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc)); + jffs2_mark_node_obsolete(c, ref); + goto free_out; + } + + } else if (csize == 0) { + /* + * We checked the header CRC. If the node has no data, adjust + * the space accounting now. 
For other nodes this will be done + * later either when the node is marked obsolete or when its + * data is checked. + */ + struct jffs2_eraseblock *jeb; + + dbg_readinode("the node has no data.\n"); + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + ref->flash_offset = ref_offset(ref) | REF_NORMAL; + spin_unlock(&c->erase_completion_lock); + } + } + + tn->fn = jffs2_alloc_full_dnode(); + if (!tn->fn) { + JFFS2_ERROR("alloc fn failed\n"); + ret = -ENOMEM; + goto free_out; + } + + tn->version = je32_to_cpu(rd->version); + tn->fn->ofs = je32_to_cpu(rd->offset); + tn->data_crc = je32_to_cpu(rd->data_crc); + tn->csize = csize; + tn->fn->raw = ref; + tn->overlapped = 0; + + if (tn->version > rii->highest_version) + rii->highest_version = tn->version; + + /* There was a bug where we wrote hole nodes out with + csize/dsize swapped. Deal with it */ + if (rd->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(rd->dsize) && csize) + tn->fn->size = csize; + else // normal case... + tn->fn->size = je32_to_cpu(rd->dsize); + + dbg_readinode2("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", + ref_offset(ref), je32_to_cpu(rd->version), + je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); + + ret = jffs2_add_tn_to_tree(c, rii, tn); + + if (ret) { + jffs2_free_full_dnode(tn->fn); + free_out: + jffs2_free_tmp_dnode_info(tn); + return ret; + } +#ifdef JFFS2_DBG_READINODE2_MESSAGES + dbg_readinode2("After adding ver %d:\n", je32_to_cpu(rd->version)); + tn = tn_first(&rii->tn_root); + while (tn) { + dbg_readinode2("%p: v %d r 0x%x-0x%x ov %d\n", + tn, tn->version, tn->fn->ofs, + tn->fn->ofs+tn->fn->size, tn->overlapped); + tn = tn_next(tn); + } +#endif + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an unknown node is found. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un) +{ + /* We don't mark unknown nodes as REF_UNCHECKED */ + if (ref_flags(ref) == REF_UNCHECKED) { + JFFS2_ERROR("REF_UNCHECKED but unknown node at %#08x\n", + ref_offset(ref)); + JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n", + je16_to_cpu(un->magic), je16_to_cpu(un->nodetype), + je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype)); + + switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) { + + case JFFS2_FEATURE_INCOMPAT: + JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + /* EEP */ + BUG(); + break; + + case JFFS2_FEATURE_ROCOMPAT: + JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO)); + break; + + case JFFS2_FEATURE_RWCOMPAT_COPY: + JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + break; + + case JFFS2_FEATURE_RWCOMPAT_DELETE: + JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). 
+ * The function detects whether more data should be read and reads it if yes. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + int needed_len, int *rdlen, unsigned char *buf) +{ + int err, to_read = needed_len - *rdlen; + size_t retlen; + uint32_t offs; + + if (jffs2_is_writebuffered(c)) { + int rem = to_read % c->wbuf_pagesize; + + if (rem) + to_read += c->wbuf_pagesize - rem; + } + + /* We need to read more data */ + offs = ref_offset(ref) + *rdlen; + + dbg_readinode("read more %d bytes\n", to_read); + + err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, " + "error code: %d.\n", to_read, offs, err); + return err; + } + + if (retlen < to_read) { + JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", + offs, retlen, to_read); + return -EIO; + } + + *rdlen += to_read; + return 0; +} + +/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated + with this ino. Perform a preliminary ordering on data nodes, throwing away + those which are completely obsoleted by newer ones. The naïve approach we + use to take of just returning them _all_ in version order will cause us to + run out of memory in certain degenerate cases. */ +static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_readinode_info *rii) +{ + struct jffs2_raw_node_ref *ref, *valid_ref; + unsigned char *buf = NULL; + union jffs2_node_union *node; + size_t retlen; + int len, err; + + rii->mctime_ver = 0; + + dbg_readinode("ino #%u\n", f->inocache->ino); + + /* FIXME: in case of NOR and available ->point() this + * needs to be fixed. */ + len = sizeof(union jffs2_node_union) + c->wbuf_pagesize; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock(&c->erase_completion_lock); + valid_ref = jffs2_first_valid_node(f->inocache->nodes); + if (!valid_ref && f->inocache->ino != 1) + JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino); + while (valid_ref) { + /* We can hold a pointer to a non-obsolete node without the spinlock, + but _obsolete_ nodes may disappear at any time, if the block + they're in gets erased. So if we mark 'ref' obsolete while we're + not holding the lock, it can go away immediately. For that reason, + we find the next valid node first, before processing 'ref'. + */ + ref = valid_ref; + valid_ref = jffs2_first_valid_node(ref->next_in_ino); + spin_unlock(&c->erase_completion_lock); + + cond_resched(); + + /* + * At this point we don't know the type of the node we're going + * to read, so we do not know the size of its header. In order + * to minimize the amount of flash IO we assume the header is + * of size = JFFS2_MIN_NODE_HEADER. + */ + len = JFFS2_MIN_NODE_HEADER; + if (jffs2_is_writebuffered(c)) { + int end, rem; + + /* + * We are about to read JFFS2_MIN_NODE_HEADER bytes, + * but this flash has some minimal I/O unit. It is + * possible that we'll need to read more soon, so read + * up to the next min. I/O unit, in order not to + * re-read the same min. I/O unit twice. 
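The rounding that read_more() applies, and that jffs2_get_inode_nodes() repeats for the initial header read, can be shown on its own: on write-buffered flash a read is padded out to the next multiple of the minimum I/O unit so that no unit has to be read twice. The helper name pad_to_io_unit() is invented; the only assumption is a non-zero page size.

#include <stdint.h>
#include <stdio.h>

static uint32_t pad_to_io_unit(uint32_t len, uint32_t wbuf_pagesize)
{
	uint32_t rem = len % wbuf_pagesize;

	if (rem)
		len += wbuf_pagesize - rem;
	return len;
}

int main(void)
{
	/* a 70-byte header read on flash with a 512-byte minimum I/O unit
	 * becomes one 512-byte read; exact multiples are left alone */
	printf("%u %u %u\n",
	       pad_to_io_unit(70, 512),     /* 512 */
	       pad_to_io_unit(512, 512),    /* 512 */
	       pad_to_io_unit(513, 512));   /* 1024 */
	return 0;
}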
+ */ + end = ref_offset(ref) + len; + rem = end % c->wbuf_pagesize; + if (rem) + end += c->wbuf_pagesize - rem; + len = end - ref_offset(ref); + } + + dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref)); + + /* FIXME: point() */ + err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err); + goto free_out; + } + + if (retlen < len) { + JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len); + err = -EIO; + goto free_out; + } + + node = (union jffs2_node_union *)buf; + + /* No need to mask in the valid bit; it shouldn't be invalid */ + if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) { + JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n", + ref_offset(ref), je16_to_cpu(node->u.magic), + je16_to_cpu(node->u.nodetype), + je32_to_cpu(node->u.totlen), + je32_to_cpu(node->u.hdr_crc)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto cont; + } + if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) { + /* Not a JFFS2 node, whinge and move on */ + JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n", + je16_to_cpu(node->u.magic), ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto cont; + } + + switch (je16_to_cpu(node->u.nodetype)) { + + case JFFS2_NODETYPE_DIRENT: + + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent) && + len < sizeof(struct jffs2_raw_dirent)) { + err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_direntry(c, ref, &node->d, retlen, rii); + if (unlikely(err)) + goto free_out; + + break; + + case JFFS2_NODETYPE_INODE: + + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode) && + len < sizeof(struct jffs2_raw_inode)) { + err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_dnode(c, ref, &node->i, len, rii); + if (unlikely(err)) + goto free_out; + + break; + + default: + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node) && + len < sizeof(struct jffs2_unknown_node)) { + err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_unknown(c, ref, &node->u); + if (unlikely(err)) + goto free_out; + + } + cont: + spin_lock(&c->erase_completion_lock); + } + + spin_unlock(&c->erase_completion_lock); + kfree(buf); + + f->highest_version = rii->highest_version; + + dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n", + f->inocache->ino, rii->highest_version, rii->latest_mctime, + rii->mctime_ver); + return 0; + + free_out: + jffs2_free_tmp_dnode_info_list(&rii->tn_root); + jffs2_free_full_dirent_list(rii->fds); + rii->fds = NULL; + kfree(buf); + return err; +} + +static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_raw_inode *latest_node) +{ + struct jffs2_readinode_info rii; + uint32_t crc, new_size; + size_t retlen; + int ret; + + dbg_readinode("ino #%u pino/nlink is %d\n", f->inocache->ino, + f->inocache->pino_nlink); + + memset(&rii, 0, sizeof(rii)); + + /* Grab all nodes relevant to this ino */ + ret = jffs2_get_inode_nodes(c, f, &rii); + + if (ret) { + JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret); + if 
(f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + return ret; + } + + ret = jffs2_build_inode_fragtree(c, f, &rii); + if (ret) { + JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n", + f->inocache->ino, ret); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + jffs2_free_tmp_dnode_info_list(&rii.tn_root); + /* FIXME: We could at least crc-check them all */ + if (rii.mdata_tn) { + jffs2_free_full_dnode(rii.mdata_tn->fn); + jffs2_free_tmp_dnode_info(rii.mdata_tn); + rii.mdata_tn = NULL; + } + return ret; + } + + if (rii.mdata_tn) { + if (rii.mdata_tn->fn->raw == rii.latest_ref) { + f->metadata = rii.mdata_tn->fn; + jffs2_free_tmp_dnode_info(rii.mdata_tn); + } else { + jffs2_kill_tn(c, rii.mdata_tn); + } + rii.mdata_tn = NULL; + } + + f->dents = rii.fds; + + jffs2_dbg_fragtree_paranoia_check_nolock(f); + + if (unlikely(!rii.latest_ref)) { + /* No data nodes for this inode. */ + if (f->inocache->ino != 1) { + JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino); + if (!rii.fds) { + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + return -EIO; + } + JFFS2_NOTICE("but it has children so we fake some modes for it\n"); + } + latest_node->mode = cpu_to_jemode(S_IFDIR|S_IRUGO|S_IWUSR|S_IXUGO); + latest_node->version = cpu_to_je32(0); + latest_node->atime = latest_node->ctime = latest_node->mtime = cpu_to_je32(0); + latest_node->isize = cpu_to_je32(0); + latest_node->gid = cpu_to_je16(0); + latest_node->uid = cpu_to_je16(0); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT); + return 0; + } + + ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node); + if (ret || retlen != sizeof(*latest_node)) { + JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", + ret, retlen, sizeof(*latest_node)); + /* FIXME: If this fails, there seems to be a memory leak. Find it. */ + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return ret?ret:-EIO; + } + + crc = crc32(0, latest_node, sizeof(*latest_node)-8); + if (crc != je32_to_cpu(latest_node->node_crc)) { + JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", + f->inocache->ino, ref_offset(rii.latest_ref)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + + switch(jemode_to_cpu(latest_node->mode) & S_IFMT) { + case S_IFDIR: + if (rii.mctime_ver > je32_to_cpu(latest_node->version)) { + /* The times in the latest_node are actually older than + mctime in the latest dirent. Cheat. */ + latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime); + } + break; + + + case S_IFREG: + /* If it was a regular file, truncate it to the latest node's isize */ + new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize)); + if (new_size != je32_to_cpu(latest_node->isize)) { + JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n", + f->inocache->ino, je32_to_cpu(latest_node->isize), new_size); + latest_node->isize = cpu_to_je32(new_size); + } + break; + + case S_IFLNK: + /* Hack to work around broken isize in old symlink code. + Remove this when dwmw2 comes to his senses and stops + symlinks from being an entirely gratuitous special + case. 
*/ + if (!je32_to_cpu(latest_node->isize)) + latest_node->isize = latest_node->dsize; + + if (f->inocache->state != INO_STATE_CHECKING) { + /* Symlink's inode data is the target path. Read it and + * keep in RAM to facilitate quick follow symlink + * operation. */ + f->target = kmalloc(je32_to_cpu(latest_node->csize) + 1, GFP_KERNEL); + if (!f->target) { + JFFS2_ERROR("can't allocate %d bytes of memory for the symlink target path cache\n", je32_to_cpu(latest_node->csize)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -ENOMEM; + } + + ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node), + je32_to_cpu(latest_node->csize), &retlen, (char *)f->target); + + if (ret || retlen != je32_to_cpu(latest_node->csize)) { + if (retlen != je32_to_cpu(latest_node->csize)) + ret = -EIO; + kfree(f->target); + f->target = NULL; + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return ret; + } + + f->target[je32_to_cpu(latest_node->csize)] = '\0'; + dbg_readinode("symlink's target '%s' cached\n", f->target); + } + + /* fall through... */ + + case S_IFBLK: + case S_IFCHR: + /* Certain inode types should have only one data node, and it's + kept as the metadata node */ + if (f->metadata) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + if (!frag_first(&f->fragtree)) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + /* ASSERT: f->fraglist != NULL */ + if (frag_next(frag_first(&f->fragtree))) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + /* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */ + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + /* OK. We're happy */ + f->metadata = frag_first(&f->fragtree)->node; + jffs2_free_node_frag(frag_first(&f->fragtree)); + f->fragtree = RB_ROOT; + break; + } + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT); + + return 0; +} + +/* Scan the list of all nodes present for this ino, build map of versions, etc. */ +int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t ino, struct jffs2_raw_inode *latest_node) +{ + dbg_readinode("read inode #%u\n", ino); + + retry_inocache: + spin_lock(&c->inocache_lock); + f->inocache = jffs2_get_ino_cache(c, ino); + + if (f->inocache) { + /* Check its state. We may need to wait before we can use it */ + switch(f->inocache->state) { + case INO_STATE_UNCHECKED: + case INO_STATE_CHECKEDABSENT: + f->inocache->state = INO_STATE_READING; + break; + + case INO_STATE_CHECKING: + case INO_STATE_GC: + /* If it's in either of these states, we need + to wait for whoever's got it to finish and + put it back. */ + dbg_readinode("waiting for ino #%u in state %d\n", ino, f->inocache->state); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + goto retry_inocache; + + case INO_STATE_READING: + case INO_STATE_PRESENT: + /* Eep. This should never happen. It can + happen if Linux calls read_inode() again + before clear_inode() has finished though. */ + JFFS2_ERROR("Eep. 
Trying to read_inode #%u when it's already in state %d!\n", ino, f->inocache->state); + /* Fail. That's probably better than allowing it to succeed */ + f->inocache = NULL; + break; + + default: + BUG(); + } + } + spin_unlock(&c->inocache_lock); + + if (!f->inocache && ino == 1) { + /* Special case - no root inode on medium */ + f->inocache = jffs2_alloc_inode_cache(); + if (!f->inocache) { + JFFS2_ERROR("cannot allocate inocache for root inode\n"); + return -ENOMEM; + } + dbg_readinode("creating inocache for root inode\n"); + memset(f->inocache, 0, sizeof(struct jffs2_inode_cache)); + f->inocache->ino = f->inocache->pino_nlink = 1; + f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; + f->inocache->state = INO_STATE_READING; + jffs2_add_ino_cache(c, f->inocache); + } + if (!f->inocache) { + JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino); + return -ENOENT; + } + + return jffs2_do_read_inode_internal(c, f, latest_node); +} + +int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + struct jffs2_raw_inode n; + struct jffs2_inode_info *f = kzalloc(sizeof(*f), GFP_KERNEL); + int ret; + + if (!f) + return -ENOMEM; + + mutex_init(&f->sem); + mutex_lock(&f->sem); + f->inocache = ic; + + ret = jffs2_do_read_inode_internal(c, f, &n); + if (!ret) { + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + } + kfree (f); + return ret; +} + +void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) +{ + struct jffs2_full_dirent *fd, *fds; + int deleted; + + jffs2_xattr_delete_inode(c, f->inocache); + mutex_lock(&f->sem); + deleted = f->inocache && !f->inocache->pino_nlink; + + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CLEARING); + + if (f->metadata) { + if (deleted) + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + } + + jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); + + if (f->target) { + kfree(f->target); + f->target = NULL; + } + + fds = f->dents; + while(fds) { + fd = fds; + fds = fd->next; + jffs2_free_full_dirent(fd); + } + + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) { + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + if (f->inocache->nodes == (void *)f->inocache) + jffs2_del_ino_cache(c, f->inocache); + } + + mutex_unlock(&f->sem); +} diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c new file mode 100644 index 00000000..3e021cfb --- /dev/null +++ b/fs/jffs2/scan.c @@ -0,0 +1,1148 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "summary.h" +#include "debug.h" + +#define DEFAULT_EMPTY_SCAN_SIZE 256 + +#define noisy_printk(noise, args...) do { \ + if (*(noise)) { \ + printk(KERN_NOTICE args); \ + (*(noise))--; \ + if (!(*(noise))) { \ + printk(KERN_NOTICE "Further such events for this erase block will not be printed\n"); \ + } \ + } \ +} while(0) + +static uint32_t pseudo_random; + +static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s); + +/* These helper functions _must_ increase ofs and also do the dirty/used space accounting. + * Returning an error will abort the mount - bad checksums etc. 
should just mark the space + * as dirty. + */ +static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s); +static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s); + +static inline int min_free(struct jffs2_sb_info *c) +{ + uint32_t min = 2 * sizeof(struct jffs2_raw_inode); +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize) + return c->wbuf_pagesize; +#endif + return min; + +} + +static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) { + if (sector_size < DEFAULT_EMPTY_SCAN_SIZE) + return sector_size; + else + return DEFAULT_EMPTY_SCAN_SIZE; +} + +static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + int ret; + + if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1))) + return ret; + if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size))) + return ret; + /* Turned wasted size into dirty, since we apparently + think it's recoverable now. */ + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + if (VERYDIRTY(c, jeb->dirty_size)) { + list_add(&jeb->list, &c->very_dirty_list); + } else { + list_add(&jeb->list, &c->dirty_list); + } + return 0; +} + +int jffs2_scan_medium(struct jffs2_sb_info *c) +{ + int i, ret; + uint32_t empty_blocks = 0, bad_blocks = 0; + unsigned char *flashbuf = NULL; + uint32_t buf_size = 0; + struct jffs2_summary *s = NULL; /* summary info collected by the scan process */ +#ifndef __ECOS + size_t pointlen, try_size; + + if (c->mtd->point) { + ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); + if (!ret && pointlen < c->mtd->size) { + /* Don't muck about if it won't let us point to the whole flash */ + D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); + c->mtd->unpoint(c->mtd, 0, pointlen); + flashbuf = NULL; + } + if (ret) + D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); + } +#endif + if (!flashbuf) { + /* For NAND it's quicker to read a whole eraseblock at a time, + apparently */ + if (jffs2_cleanmarker_oob(c)) + try_size = c->sector_size; + if (c->mtd->type == MTD_NANDFLASH) + buf_size = c->sector_size; + else + try_size = PAGE_SIZE; + + D1(printk(KERN_DEBUG "Trying to allocate readbuf of %zu " + "bytes\n", try_size)); + + flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size); + if (!flashbuf) + return -ENOMEM; + + D1(printk(KERN_DEBUG "Allocated readbuf of %zu bytes\n", + try_size)); + + buf_size = (uint32_t)try_size; + } + + if (jffs2_sum_active()) { + s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + if (!s) { + JFFS2_WARNING("Can't allocate memory for summary\n"); + ret = -ENOMEM; + goto out; + } + } + + for (i=0; inr_blocks; i++) { + struct jffs2_eraseblock *jeb = &c->blocks[i]; + + cond_resched(); + + /* reset summary info for next eraseblock scan */ + jffs2_sum_reset_collected(s); + + ret = jffs2_scan_eraseblock(c, jeb, buf_size?flashbuf:(flashbuf+jeb->offset), + buf_size, s); + + if (ret < 0) + goto out; + + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + /* Now decide which list to put it on */ + switch(ret) { + case BLK_STATE_ALLFF: + /* + * Empty block. Since we can't be sure it + * was entirely erased, we just queue it for erase + * again. It will be marked as such when the erase + * is complete. 
Meanwhile we still count it as empty + * for later checks. + */ + empty_blocks++; + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + break; + + case BLK_STATE_CLEANMARKER: + /* Only a CLEANMARKER node is valid */ + if (!jeb->dirty_size) { + /* It's actually free */ + list_add(&jeb->list, &c->free_list); + c->nr_free_blocks++; + } else { + /* Dirt */ + D1(printk(KERN_DEBUG "Adding all-dirty block at 0x%08x to erase_pending_list\n", jeb->offset)); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + } + break; + + case BLK_STATE_CLEAN: + /* Full (or almost full) of clean data. Clean list */ + list_add(&jeb->list, &c->clean_list); + break; + + case BLK_STATE_PARTDIRTY: + /* Some data, but not full. Dirty list. */ + /* We want to remember the block with most free space + and stick it in the 'nextblock' position to start writing to it. */ + if (jeb->free_size > min_free(c) && + (!c->nextblock || c->nextblock->free_size < jeb->free_size)) { + /* Better candidate for the next writes to go to */ + if (c->nextblock) { + ret = file_dirty(c, c->nextblock); + if (ret) + goto out; + /* deleting summary information of the old nextblock */ + jffs2_sum_reset_collected(c->summary); + } + /* update collected summary information for the current nextblock */ + jffs2_sum_move_collected(c, s); + D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset)); + c->nextblock = jeb; + } else { + ret = file_dirty(c, jeb); + if (ret) + goto out; + } + break; + + case BLK_STATE_ALLDIRTY: + /* Nothing valid - not even a clean marker. Needs erasing. */ + /* For now we just put it on the erasing list. We'll start the erases later */ + D1(printk(KERN_NOTICE "JFFS2: Erase block at 0x%08x is not formatted. It will be erased\n", jeb->offset)); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + break; + + case BLK_STATE_BADBLOCK: + D1(printk(KERN_NOTICE "JFFS2: Block at 0x%08x is bad\n", jeb->offset)); + list_add(&jeb->list, &c->bad_list); + c->bad_size += c->sector_size; + c->free_size -= c->sector_size; + bad_blocks++; + break; + default: + printk(KERN_WARNING "jffs2_scan_medium(): unknown block state\n"); + BUG(); + } + } + + /* Nextblock dirty is always seen as wasted, because we cannot recycle it now */ + if (c->nextblock && (c->nextblock->dirty_size)) { + c->nextblock->wasted_size += c->nextblock->dirty_size; + c->wasted_size += c->nextblock->dirty_size; + c->dirty_size -= c->nextblock->dirty_size; + c->nextblock->dirty_size = 0; + } +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { + /* If we're going to start writing into a block which already + contains data, and the end of the data isn't page-aligned, + skip a little and align it. 
*/ + + uint32_t skip = c->nextblock->free_size % c->wbuf_pagesize; + + D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n", + skip)); + jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + jffs2_scan_dirty_space(c, c->nextblock, skip); + } +#endif + if (c->nr_erasing_blocks) { + if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) { + printk(KERN_NOTICE "Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n"); + printk(KERN_NOTICE "empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",empty_blocks,bad_blocks,c->nr_blocks); + ret = -EIO; + goto out; + } + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + } + ret = 0; + out: + if (buf_size) + kfree(flashbuf); +#ifndef __ECOS + else + c->mtd->unpoint(c->mtd, 0, c->mtd->size); +#endif + if (s) + kfree(s); + + return ret; +} + +static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf, + uint32_t ofs, uint32_t len) +{ + int ret; + size_t retlen; + + ret = jffs2_flash_read(c, ofs, len, &retlen, buf); + if (ret) { + D1(printk(KERN_WARNING "mtd->read(0x%x bytes from 0x%x) returned %d\n", len, ofs, ret)); + return ret; + } + if (retlen < len) { + D1(printk(KERN_WARNING "Read at 0x%x gave only 0x%zx bytes\n", ofs, retlen)); + return -EIO; + } + return 0; +} + +int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size + && (!jeb->first_node || !ref_next(jeb->first_node)) ) + return BLK_STATE_CLEANMARKER; + + /* move blocks with max 4 byte dirty space to cleanlist */ + else if (!ISDIRTY(c->sector_size - (jeb->used_size + jeb->unchecked_size))) { + c->dirty_size -= jeb->dirty_size; + c->wasted_size += jeb->dirty_size; + jeb->wasted_size += jeb->dirty_size; + jeb->dirty_size = 0; + return BLK_STATE_CLEAN; + } else if (jeb->used_size || jeb->unchecked_size) + return BLK_STATE_PARTDIRTY; + else + return BLK_STATE_ALLDIRTY; +} + +#ifdef CONFIG_JFFS2_FS_XATTR +static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_xattr *rx, uint32_t ofs, + struct jffs2_summary *s) +{ + struct jffs2_xattr_datum *xd; + uint32_t xid, version, totlen, crc; + int err; + + crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4); + if (crc != je32_to_cpu(rx->node_crc)) { + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rx->node_crc), crc); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) + return err; + return 0; + } + + xid = je32_to_cpu(rx->xid); + version = je32_to_cpu(rx->version); + + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + rx->name_len + 1 + je16_to_cpu(rx->value_len)); + if (totlen != je32_to_cpu(rx->totlen)) { + JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n", + ofs, je32_to_cpu(rx->totlen), totlen); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) + return err; + return 0; + } + + xd = jffs2_setup_xattr_datum(c, xid, version); + if (IS_ERR(xd)) + return PTR_ERR(xd); + + if (xd->version > version) { + struct jffs2_raw_node_ref *raw + = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = version; + xd->xprefix = rx->xprefix; + xd->name_len = rx->name_len; + xd->value_len = 
je16_to_cpu(rx->value_len); + xd->data_crc = je32_to_cpu(rx->data_crc); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd); + } + + if (jffs2_sum_active()) + jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset); + dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n", + ofs, xd->xid, xd->version); + return 0; +} + +static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_xref *rr, uint32_t ofs, + struct jffs2_summary *s) +{ + struct jffs2_xattr_ref *ref; + uint32_t crc; + int err; + + crc = crc32(0, rr, sizeof(*rr) - 4); + if (crc != je32_to_cpu(rr->node_crc)) { + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rr->node_crc), crc); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen))))) + return err; + return 0; + } + + if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) { + JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n", + ofs, je32_to_cpu(rr->totlen), + PAD(sizeof(struct jffs2_raw_xref))); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen)))) + return err; + return 0; + } + + ref = jffs2_alloc_xattr_ref(); + if (!ref) + return -ENOMEM; + + /* BEFORE jffs2_build_xattr_subsystem() called, + * and AFTER xattr_ref is marked as a dead xref, + * ref->xid is used to store 32bit xid, xd is not used + * ref->ino is used to store 32bit inode-number, ic is not used + * Thoes variables are declared as union, thus using those + * are exclusive. In a similar way, ref->next is temporarily + * used to chain all xattr_ref object. It's re-chained to + * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly. + */ + ref->ino = je32_to_cpu(rr->ino); + ref->xid = je32_to_cpu(rr->xid); + ref->xseqno = je32_to_cpu(rr->xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); + ref->next = c->xref_temp; + c->xref_temp = ref; + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref); + + if (jffs2_sum_active()) + jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset); + dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n", + ofs, ref->xid, ref->ino); + return 0; +} +#endif + +/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into + the flash, XIP-style */ +static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { + struct jffs2_unknown_node *node; + struct jffs2_unknown_node crcnode; + uint32_t ofs, prevofs, max_ofs; + uint32_t hdr_crc, buf_ofs, buf_len; + int err; + int noise = 0; + + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + int cleanmarkerfound = 0; +#endif + + ofs = jeb->offset; + prevofs = jeb->offset - 1; + + D1(printk(KERN_DEBUG "jffs2_scan_eraseblock(): Scanning block at 0x%x\n", ofs)); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (c->mtd->type == MTD_NANDFLASH) { + int ret; + + if (c->mtd->block_isbad(c->mtd, jeb->offset)) + return BLK_STATE_BADBLOCK; + + if (jffs2_cleanmarker_oob(c)) { + ret = jffs2_check_nand_cleanmarker(c, jeb); + D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n", ret)); + + /* Even if it's not found, we still scan to see + if the block is empty. We use this information + to decide whether to erase it or not. 
*/ + switch (ret) { + case 0: cleanmarkerfound = 1; break; + case 1: break; + default: return ret; + } + } + } +#endif + + if (jffs2_sum_active()) { + struct jffs2_sum_marker *sm; + void *sumptr = NULL; + uint32_t sumlen; + + if (!buf_size) { + /* XIP case. Just look, point at the summary if it's there */ + sm = (void *)buf + c->sector_size - sizeof(*sm); + if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) { + sumptr = buf + je32_to_cpu(sm->offset); + sumlen = c->sector_size - je32_to_cpu(sm->offset); + } + } else { + /* If NAND flash, read a whole page of it. Else just the end */ + if (c->wbuf_pagesize) + buf_len = c->wbuf_pagesize; + else + buf_len = sizeof(*sm); + + /* Read as much as we want into the _end_ of the preallocated buffer */ + err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, + jeb->offset + c->sector_size - buf_len, + buf_len); + if (err) + return err; + + sm = (void *)buf + buf_size - sizeof(*sm); + if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) { + sumlen = c->sector_size - je32_to_cpu(sm->offset); + sumptr = buf + buf_size - sumlen; + + /* Now, make sure the summary itself is available */ + if (sumlen > buf_size) { + /* Need to kmalloc for this. */ + sumptr = kmalloc(sumlen, GFP_KERNEL); + if (!sumptr) + return -ENOMEM; + memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len); + } + if (buf_len < sumlen) { + /* Need to read more so that the entire summary node is present */ + err = jffs2_fill_scan_buf(c, sumptr, + jeb->offset + c->sector_size - sumlen, + sumlen - buf_len); + if (err) + return err; + } + } + + } + + if (sumptr) { + err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random); + + if (buf_size && sumlen > buf_size) + kfree(sumptr); + /* If it returns with a real error, bail. + If it returns positive, that's a block classification + (i.e. BLK_STATE_xxx) so return that too. + If it returns zero, fall through to full scan. */ + if (err) + return err; + } + } + + buf_ofs = jeb->offset; + + if (!buf_size) { + /* This is the XIP case -- we're reading _directly_ from the flash chip */ + buf_len = c->sector_size; + } else { + buf_len = EMPTY_SCAN_SIZE(c->sector_size); + err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len); + if (err) + return err; + } + + /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ + ofs = 0; + max_ofs = EMPTY_SCAN_SIZE(c->sector_size); + /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */ + while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) + ofs += 4; + + if (ofs == max_ofs) { +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (jffs2_cleanmarker_oob(c)) { + /* scan oob, take care of cleanmarker */ + int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound); + D2(printk(KERN_NOTICE "jffs2_check_oob_empty returned %d\n",ret)); + switch (ret) { + case 0: return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF; + case 1: return BLK_STATE_ALLDIRTY; + default: return ret; + } + } +#endif + D1(printk(KERN_DEBUG "Block at 0x%08x is empty (erased)\n", jeb->offset)); + if (c->cleanmarker_size == 0) + return BLK_STATE_CLEANMARKER; /* don't bother with re-erase */ + else + return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */ + } + if (ofs) { + D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset, + jeb->offset + ofs)); + if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1))) + return err; + if ((err = jffs2_scan_dirty_space(c, jeb, ofs))) + return err; + } + + /* Now ofs is a complete physical flash offset as it always was... 
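The emptiness test above walks the first EMPTY_SCAN_SIZE bytes of a block one 32-bit word at a time, counting on erased flash reading back as all 0xFF. A stand-alone version of that check, with an invented function name and a 256-byte window mirroring DEFAULT_EMPTY_SCAN_SIZE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int looks_erased(const unsigned char *buf, uint32_t scan_size)
{
	uint32_t ofs, word;

	for (ofs = 0; ofs + 4 <= scan_size; ofs += 4) {
		memcpy(&word, buf + ofs, 4);     /* word-at-a-time, as in the scan loop */
		if (word != 0xFFFFFFFFu)
			return 0;
	}
	return 1;
}

int main(void)
{
	unsigned char block[256];

	memset(block, 0xFF, sizeof(block));
	printf("erased: %d\n", looks_erased(block, sizeof(block)));   /* 1 */

	block[100] = 0x00;                   /* one stray written byte */
	printf("erased: %d\n", looks_erased(block, sizeof(block)));   /* 0 */
	return 0;
}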
*/ + ofs += jeb->offset; + + noise = 10; + + dbg_summary("no summary found in jeb 0x%08x. Apply original scan.\n",jeb->offset); + +scan_more: + while(ofs < jeb->offset + c->sector_size) { + + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + /* Make sure there are node refs available for use */ + err = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (err) + return err; + + cond_resched(); + + if (ofs & 3) { + printk(KERN_WARNING "Eep. ofs 0x%08x not word-aligned!\n", ofs); + ofs = PAD(ofs); + continue; + } + if (ofs == prevofs) { + printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + prevofs = ofs; + + if (jeb->offset + c->sector_size < ofs + sizeof(*node)) { + D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node), + jeb->offset, c->sector_size, ofs, sizeof(*node))); + if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs))) + return err; + break; + } + + if (buf_ofs + buf_len < ofs + sizeof(*node)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_unknown_node), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + } + + node = (struct jffs2_unknown_node *)&buf[ofs-buf_ofs]; + + if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) { + uint32_t inbuf_ofs; + uint32_t empty_start, scan_end; + + empty_start = ofs; + ofs += 4; + scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len); + + D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs)); + more_empty: + inbuf_ofs = ofs - buf_ofs; + while (inbuf_ofs < scan_end) { + if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) { + printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n", + empty_start, ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start))) + return err; + goto scan_more; + } + + inbuf_ofs+=4; + ofs += 4; + } + /* Ran off end. */ + D1(printk(KERN_DEBUG "Empty flash to end of buffer at 0x%08x\n", ofs)); + + /* If we're only checking the beginning of a block with a cleanmarker, + bail now */ + if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) && + c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) { + D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size))); + return BLK_STATE_CLEANMARKER; + } + if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */ + scan_end = buf_len; + goto more_empty; + } + + /* See how much more there is to read in this eraseblock... */ + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + if (!buf_len) { + /* No more to read. Break out of main loop without marking + this range of empty space as dirty (because it's not) */ + D1(printk(KERN_DEBUG "Empty flash at %08x runs to end of block. 
Treating as free_space\n", + empty_start)); + break; + } + /* point never reaches here */ + scan_end = buf_len; + D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + goto more_empty; + } + + if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) { + printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) { + D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs)); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) { + printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs); + printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n"); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) != JFFS2_MAGIC_BITMASK) { + /* OK. We're out of possibilities. Whinge and move on */ + noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n", + JFFS2_MAGIC_BITMASK, ofs, + je16_to_cpu(node->magic)); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + /* We seem to have a node of sorts. Check the CRC */ + crcnode.magic = node->magic; + crcnode.nodetype = cpu_to_je16( je16_to_cpu(node->nodetype) | JFFS2_NODE_ACCURATE); + crcnode.totlen = node->totlen; + hdr_crc = crc32(0, &crcnode, sizeof(crcnode)-4); + + if (hdr_crc != je32_to_cpu(node->hdr_crc)) { + noisy_printk(&noise, "jffs2_scan_eraseblock(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n", + ofs, je16_to_cpu(node->magic), + je16_to_cpu(node->nodetype), + je32_to_cpu(node->totlen), + je32_to_cpu(node->hdr_crc), + hdr_crc); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + + if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) { + /* Eep. Node goes over the end of the erase block. */ + printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", + ofs, je32_to_cpu(node->totlen)); + printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n"); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + + if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) { + /* Wheee. This is an obsoleted node */ + D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + continue; + } + + switch(je16_to_cpu(node->nodetype)) { + case JFFS2_NODETYPE_INODE: + if (buf_ofs + buf_len < ofs + sizeof(struct jffs2_raw_inode)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %zd bytes (inode node) left to end of buf. 
Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_raw_inode), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_inode_node(c, jeb, (void *)node, ofs, s); + if (err) return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_NODETYPE_DIRENT: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_dirent_node(c, jeb, (void *)node, ofs, s); + if (err) return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)" + " left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s); + if (err) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + case JFFS2_NODETYPE_XREF: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)" + " left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s); + if (err) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; +#endif /* CONFIG_JFFS2_FS_XATTR */ + + case JFFS2_NODETYPE_CLEANMARKER: + D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs)); + if (je32_to_cpu(node->totlen) != c->cleanmarker_size) { + printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n", + ofs, je32_to_cpu(node->totlen), c->cleanmarker_size); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) + return err; + ofs += PAD(sizeof(struct jffs2_unknown_node)); + } else if (jeb->first_node) { + printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) + return err; + ofs += PAD(sizeof(struct jffs2_unknown_node)); + } else { + jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL); + + ofs += PAD(c->cleanmarker_size); + } + break; + + case JFFS2_NODETYPE_PADDING: + if (jffs2_sum_active()) + jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + default: + switch (je16_to_cpu(node->nodetype) & JFFS2_COMPAT_MASK) { + case JFFS2_FEATURE_ROCOMPAT: + printk(KERN_NOTICE "Read-only compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs); + c->flags |= JFFS2_SB_FLAG_RO; + if 
(!(jffs2_is_readonly(c))) + return -EROFS; + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_FEATURE_INCOMPAT: + printk(KERN_NOTICE "Incompatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs); + return -EINVAL; + + case JFFS2_FEATURE_RWCOMPAT_DELETE: + D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_FEATURE_RWCOMPAT_COPY: { + D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs)); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL); + + /* We can't summarise nodes we don't grok */ + jffs2_sum_disable_collecting(s); + ofs += PAD(je32_to_cpu(node->totlen)); + break; + } + } + } + } + + if (jffs2_sum_active()) { + if (PAD(s->sum_size + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size) { + dbg_summary("There is not enough space for " + "summary information, disabling for this jeb!\n"); + jffs2_sum_disable_collecting(s); + } + } + + D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n", + jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size)); + + /* mark_node_obsolete can add to wasted !! */ + if (jeb->wasted_size) { + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + } + + return jffs2_scan_classify_jeb(c, jeb); +} + +struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inode_cache *ic; + + ic = jffs2_get_ino_cache(c, ino); + if (ic) + return ic; + + if (ino > c->highest_ino) + c->highest_ino = ino; + + ic = jffs2_alloc_inode_cache(); + if (!ic) { + printk(KERN_NOTICE "jffs2_scan_make_inode_cache(): allocation of inode cache failed\n"); + return NULL; + } + memset(ic, 0, sizeof(*ic)); + + ic->ino = ino; + ic->nodes = (void *)ic; + jffs2_add_ino_cache(c, ic); + if (ino == 1) + ic->pino_nlink = 1; + return ic; +} + +static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s) +{ + struct jffs2_inode_cache *ic; + uint32_t crc, ino = je32_to_cpu(ri->ino); + + D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs)); + + /* We do very little here now. Just check the ino# to which we should attribute + this node; we can do all the CRC checking etc. later. There's a tradeoff here -- + we used to scan the flash once only, reading everything we want from it into + memory, then building all our in-core data structures and freeing the extra + information. Now we allow the first part of the mount to complete a lot quicker, + but we have to go _back_ to the flash in order to finish the CRC checking, etc. + Which means that the _full_ amount of time to get to proper write mode with GC + operational may actually be _longer_ than before. Sucks to be me. */ + + /* Check the node CRC in any case. 
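The default case just above classifies node types the code does not understand by the two feature bits at the top of the 16-bit nodetype, which encode what a driver that cannot parse the node is allowed to do with it. A sketch of that classification; the numeric values follow linux/jffs2.h of this era, and the enum and helper are purely illustrative.

#include <stdint.h>
#include <stdio.h>

#define COMPAT_MASK             0xc000u
#define FEATURE_INCOMPAT        0xc000u /* refuse to mount at all */
#define FEATURE_ROCOMPAT        0x8000u /* mount read-only at most */
#define FEATURE_RWCOMPAT_COPY   0x4000u /* keep the node when copying / GCing */
#define FEATURE_RWCOMPAT_DELETE 0x0000u /* safe to treat as dirty space */

enum unknown_node_action { ACT_REFUSE, ACT_RO, ACT_COPY, ACT_DELETE };

static enum unknown_node_action classify(uint16_t nodetype)
{
        switch (nodetype & COMPAT_MASK) {
        case FEATURE_INCOMPAT:      return ACT_REFUSE;
        case FEATURE_ROCOMPAT:      return ACT_RO;
        case FEATURE_RWCOMPAT_COPY: return ACT_COPY;
        default:                    return ACT_DELETE;  /* RWCOMPAT_DELETE */
        }
}

int main(void)
{
        uint16_t unknown = 0x6004;      /* hypothetical nodetype, not a real JFFS2 one */

        printf("action for %#06x: %d\n", unknown, classify(unknown));
        return 0;
}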
*/ + crc = crc32(0, ri, sizeof(*ri)-8); + if (crc != je32_to_cpu(ri->node_crc)) { + printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on " + "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ofs, je32_to_cpu(ri->node_crc), crc); + /* + * We believe totlen because the CRC on the node + * _header_ was OK, just the node itself failed. + */ + return jffs2_scan_dirty_space(c, jeb, + PAD(je32_to_cpu(ri->totlen))); + } + + ic = jffs2_get_ino_cache(c, ino); + if (!ic) { + ic = jffs2_scan_make_ino_cache(c, ino); + if (!ic) + return -ENOMEM; + } + + /* Wheee. It worked */ + jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic); + + D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n", + je32_to_cpu(ri->ino), je32_to_cpu(ri->version), + je32_to_cpu(ri->offset), + je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize))); + + pseudo_random += je32_to_cpu(ri->version); + + if (jffs2_sum_active()) { + jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset); + } + + return 0; +} + +static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s) +{ + struct jffs2_full_dirent *fd; + struct jffs2_inode_cache *ic; + uint32_t checkedlen; + uint32_t crc; + int err; + + D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs)); + + /* We don't get here unless the node is still valid, so we don't have to + mask in the ACCURATE bit any more. */ + crc = crc32(0, rd, sizeof(*rd)-8); + + if (crc != je32_to_cpu(rd->node_crc)) { + printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ofs, je32_to_cpu(rd->node_crc), crc); + /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */ + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) + return err; + return 0; + } + + pseudo_random += je32_to_cpu(rd->version); + + /* Should never happen. Did. (OLPC trac #4184)*/ + checkedlen = strnlen(rd->name, rd->nsize); + if (checkedlen < rd->nsize) { + printk(KERN_ERR "Dirent at %08x has zeroes in name. Truncating to %d chars\n", + ofs, checkedlen); + } + fd = jffs2_alloc_full_dirent(checkedlen+1); + if (!fd) { + return -ENOMEM; + } + memcpy(&fd->name, rd->name, checkedlen); + fd->name[checkedlen] = 0; + + crc = crc32(0, fd->name, rd->nsize); + if (crc != je32_to_cpu(rd->name_crc)) { + printk(KERN_NOTICE "jffs2_scan_dirent_node(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ofs, je32_to_cpu(rd->name_crc), crc); + D1(printk(KERN_NOTICE "Name for which CRC failed is (now) '%s', ino #%d\n", fd->name, je32_to_cpu(rd->ino))); + jffs2_free_full_dirent(fd); + /* FIXME: Why do we believe totlen? */ + /* We believe totlen because the CRC on the node _header_ was OK, just the name failed. 
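The dirent path above guards against names with embedded NULs (the OLPC trac #4184 case): the length read from flash is clamped with strnlen() before the name is copied, terminated and hashed. The same clamp in stand-alone form, with full_name_hash() replaced by a toy hash purely for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static uint32_t toy_name_hash(const char *name, size_t len)
{
        uint32_t h = 5381;

        while (len--)
                h = h * 33 + (uint8_t)*name++;
        return h;
}

static char *copy_dirent_name(const char *raw, size_t nsize, uint32_t *hash)
{
        size_t checkedlen = strnlen(raw, nsize);        /* stop at an embedded NUL */
        char *name = malloc(checkedlen + 1);

        if (!name)
                return NULL;
        memcpy(name, raw, checkedlen);
        name[checkedlen] = '\0';
        *hash = toy_name_hash(name, checkedlen);
        return name;
}

int main(void)
{
        const char raw[8] = { 'f', 'o', 'o', '\0', 'X', 'X', 'X', 'X' };
        uint32_t hash;
        char *name = copy_dirent_name(raw, sizeof(raw), &hash);

        printf("name='%s' hash=%08x\n", name, hash);    /* truncated to "foo" */
        free(name);
        return 0;
}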
*/ + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) + return err; + return 0; + } + ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino)); + if (!ic) { + jffs2_free_full_dirent(fd); + return -ENOMEM; + } + + fd->raw = jffs2_link_node_ref(c, jeb, ofs | dirent_node_state(rd), + PAD(je32_to_cpu(rd->totlen)), ic); + + fd->next = NULL; + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->nhash = full_name_hash(fd->name, checkedlen); + fd->type = rd->type; + jffs2_add_fd_to_list(c, fd, &ic->scan_dents); + + if (jffs2_sum_active()) { + jffs2_sum_add_dirent_mem(s, rd, ofs - jeb->offset); + } + + return 0; +} + +static int count_list(struct list_head *l) +{ + uint32_t count = 0; + struct list_head *tmp; + + list_for_each(tmp, l) { + count++; + } + return count; +} + +/* Note: This breaks if list_empty(head). I don't care. You + might, if you copy this code and use it elsewhere :) */ +static void rotate_list(struct list_head *head, uint32_t count) +{ + struct list_head *n = head->next; + + list_del(head); + while(count--) { + n = n->next; + } + list_add(head, n); +} + +void jffs2_rotate_lists(struct jffs2_sb_info *c) +{ + uint32_t x; + uint32_t rotateby; + + x = count_list(&c->clean_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->clean_list), rotateby); + } + + x = count_list(&c->very_dirty_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->very_dirty_list), rotateby); + } + + x = count_list(&c->dirty_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->dirty_list), rotateby); + } + + x = count_list(&c->erasable_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->erasable_list), rotateby); + } + + if (c->nr_erasing_blocks) { + rotateby = pseudo_random % c->nr_erasing_blocks; + rotate_list((&c->erase_pending_list), rotateby); + } + + if (c->nr_free_blocks) { + rotateby = pseudo_random % c->nr_free_blocks; + rotate_list((&c->free_list), rotateby); + } +} diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c new file mode 100644 index 00000000..cfeb7164 --- /dev/null +++ b/fs/jffs2/security.c @@ -0,0 +1,86 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
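jffs2_rotate_lists() above rotates every block list by pseudo_random modulo its length, so that after each mount allocation and garbage collection do not always start on the same eraseblocks. An array-based stand-in for the circular list_head rotation (the real code re-inserts the list head in place rather than copying elements).

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void rotate(uint32_t *blocks, size_t n, uint32_t rotateby)
{
        uint32_t tmp[16];                       /* toy limit: n must be <= 16 here */

        if (!n || n > 16)
                return;
        rotateby %= n;                          /* same clamp as pseudo_random % x */
        memcpy(tmp, blocks, n * sizeof(*blocks));
        for (size_t i = 0; i < n; i++)
                blocks[i] = tmp[(i + rotateby) % n];
}

int main(void)
{
        uint32_t clean_list[] = { 0, 1, 2, 3, 4, 5 };   /* block numbers */
        uint32_t pseudo_random = 0xC0FFEE;              /* accumulated during the scan */

        rotate(clean_list, 6, pseudo_random);           /* 0xC0FFEE % 6 == 4 */
        for (size_t i = 0; i < 6; i++)
                printf("%u ", clean_list[i]);
        printf("\n");                                   /* prints "4 5 0 1 2 3" */
        return 0;
}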
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +/* ---- Initial Security Label Attachment -------------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int rc; + size_t len; + void *value; + char *name; + + rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (rc) { + if (rc == -EOPNOTSUPP) + return 0; + return rc; + } + rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); + + kfree(name); + kfree(value); + return rc; +} + +/* ---- XATTR Handler for "security.*" ----------------- */ +static int jffs2_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size); +} + +static int jffs2_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size, flags); +} + +static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; + + if (list && retlen <= list_size) { + strcpy(list, XATTR_SECURITY_PREFIX); + strcpy(list + XATTR_SECURITY_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = jffs2_security_listxattr, + .set = jffs2_security_setxattr, + .get = jffs2_security_getxattr +}; diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c new file mode 100644 index 00000000..e537fb0e --- /dev/null +++ b/fs/jffs2/summary.c @@ -0,0 +1,869 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * Zoltan Sogor , + * Patrik Kluba , + * University of Szeged, Hungary + * 2006 KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
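The listxattr handler above follows the usual xattr listing contract: always report how much room the "security.<name>\0" entry needs, and only write it out when the caller's buffer is large enough. The same logic as a stand-alone sketch with a local prefix constant.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define SEC_PREFIX     "security."
#define SEC_PREFIX_LEN (sizeof(SEC_PREFIX) - 1)

static size_t list_security_entry(char *list, size_t list_size, const char *name)
{
        size_t name_len = strlen(name);
        size_t retlen = SEC_PREFIX_LEN + name_len + 1;  /* includes the trailing NUL */

        if (list && retlen <= list_size) {
                memcpy(list, SEC_PREFIX, SEC_PREFIX_LEN);
                memcpy(list + SEC_PREFIX_LEN, name, name_len + 1);
        }
        return retlen;                                  /* reported even if nothing written */
}

int main(void)
{
        char buf[64];
        size_t need = list_security_entry(NULL, 0, "selinux");  /* pure size query */

        printf("need %zu bytes\n", need);
        list_security_entry(buf, sizeof(buf), "selinux");
        printf("entry: %s\n", buf);                     /* "security.selinux" */
        return 0;
}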
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "debug.h" + +int jffs2_sum_init(struct jffs2_sb_info *c) +{ + uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); + + c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + + if (!c->summary) { + JFFS2_WARNING("Can't allocate memory for summary information!\n"); + return -ENOMEM; + } + + c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL); + + if (!c->summary->sum_buf) { + JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n"); + kfree(c->summary); + return -ENOMEM; + } + + dbg_summary("returned successfully\n"); + + return 0; +} + +void jffs2_sum_exit(struct jffs2_sb_info *c) +{ + dbg_summary("called\n"); + + jffs2_sum_disable_collecting(c->summary); + + kfree(c->summary->sum_buf); + c->summary->sum_buf = NULL; + + kfree(c->summary); + c->summary = NULL; +} + +static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item) +{ + if (!s->sum_list_head) + s->sum_list_head = (union jffs2_sum_mem *) item; + if (s->sum_list_tail) + s->sum_list_tail->u.next = (union jffs2_sum_mem *) item; + s->sum_list_tail = (union jffs2_sum_mem *) item; + + switch (je16_to_cpu(item->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + s->sum_size += JFFS2_SUMMARY_INODE_SIZE; + s->sum_num++; + dbg_summary("inode (%u) added to summary\n", + je32_to_cpu(item->i.inode)); + break; + case JFFS2_NODETYPE_DIRENT: + s->sum_size += JFFS2_SUMMARY_DIRENT_SIZE(item->d.nsize); + s->sum_num++; + dbg_summary("dirent (%u) added to summary\n", + je32_to_cpu(item->d.ino)); + break; +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: + s->sum_size += JFFS2_SUMMARY_XATTR_SIZE; + s->sum_num++; + dbg_summary("xattr (xid=%u, version=%u) added to summary\n", + je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version)); + break; + case JFFS2_NODETYPE_XREF: + s->sum_size += JFFS2_SUMMARY_XREF_SIZE; + s->sum_num++; + dbg_summary("xref added to summary\n"); + break; +#endif + default: + JFFS2_WARNING("UNKNOWN node type %u\n", + je16_to_cpu(item->u.nodetype)); + return 1; + } + return 0; +} + + +/* The following 3 functions are called from scan.c to collect summary info for not closed jeb */ + +int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size) +{ + dbg_summary("called with %u\n", size); + s->sum_padded += size; + return 0; +} + +int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, + uint32_t ofs) +{ + struct jffs2_sum_inode_mem *temp = kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + temp->nodetype = ri->nodetype; + temp->inode = ri->ino; + temp->version = ri->version; + temp->offset = cpu_to_je32(ofs); /* relative offset from the beginning of the jeb */ + temp->totlen = ri->totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, + uint32_t ofs) +{ + struct jffs2_sum_dirent_mem *temp = + kmalloc(sizeof(struct jffs2_sum_dirent_mem) + rd->nsize, GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + temp->nodetype = rd->nodetype; + temp->totlen = rd->totlen; + temp->offset = cpu_to_je32(ofs); /* relative from the beginning of the jeb */ + temp->pino = rd->pino; + temp->version = rd->version; + temp->ino = rd->ino; + temp->nsize = rd->nsize; + temp->type = rd->type; + temp->next = NULL; + + memcpy(temp->name, rd->name, rd->nsize); + + return 
jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +#ifdef CONFIG_JFFS2_FS_XATTR +int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs) +{ + struct jffs2_sum_xattr_mem *temp; + + temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + temp->nodetype = rx->nodetype; + temp->xid = rx->xid; + temp->version = rx->version; + temp->offset = cpu_to_je32(ofs); + temp->totlen = rx->totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs) +{ + struct jffs2_sum_xref_mem *temp; + + temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + temp->nodetype = rr->nodetype; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} +#endif +/* Cleanup every collected summary information */ + +static void jffs2_sum_clean_collected(struct jffs2_summary *s) +{ + union jffs2_sum_mem *temp; + + if (!s->sum_list_head) { + dbg_summary("already empty\n"); + } + while (s->sum_list_head) { + temp = s->sum_list_head; + s->sum_list_head = s->sum_list_head->u.next; + kfree(temp); + } + s->sum_list_tail = NULL; + s->sum_padded = 0; + s->sum_num = 0; +} + +void jffs2_sum_reset_collected(struct jffs2_summary *s) +{ + dbg_summary("called\n"); + jffs2_sum_clean_collected(s); + s->sum_size = 0; +} + +void jffs2_sum_disable_collecting(struct jffs2_summary *s) +{ + dbg_summary("called\n"); + jffs2_sum_clean_collected(s); + s->sum_size = JFFS2_SUMMARY_NOSUM_SIZE; +} + +int jffs2_sum_is_disabled(struct jffs2_summary *s) +{ + return (s->sum_size == JFFS2_SUMMARY_NOSUM_SIZE); +} + +/* Move the collected summary information into sb (called from scan.c) */ + +void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s) +{ + dbg_summary("oldsize=0x%x oldnum=%u => newsize=0x%x newnum=%u\n", + c->summary->sum_size, c->summary->sum_num, + s->sum_size, s->sum_num); + + c->summary->sum_size = s->sum_size; + c->summary->sum_num = s->sum_num; + c->summary->sum_padded = s->sum_padded; + c->summary->sum_list_head = s->sum_list_head; + c->summary->sum_list_tail = s->sum_list_tail; + + s->sum_list_head = s->sum_list_tail = NULL; +} + +/* Called from wbuf.c to collect writed node info */ + +int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, uint32_t ofs) +{ + union jffs2_node_union *node; + struct jffs2_eraseblock *jeb; + + if (c->summary->sum_size == JFFS2_SUMMARY_NOSUM_SIZE) { + dbg_summary("Summary is disabled for this jeb! 
Skipping summary info!\n"); + return 0; + } + + node = invecs[0].iov_base; + jeb = &c->blocks[ofs / c->sector_size]; + ofs -= jeb->offset; + + switch (je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_mem *temp = + kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL); + + if (!temp) + goto no_mem; + + temp->nodetype = node->i.nodetype; + temp->inode = node->i.ino; + temp->version = node->i.version; + temp->offset = cpu_to_je32(ofs); + temp->totlen = node->i.totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_mem *temp = + kmalloc(sizeof(struct jffs2_sum_dirent_mem) + node->d.nsize, GFP_KERNEL); + + if (!temp) + goto no_mem; + + temp->nodetype = node->d.nodetype; + temp->totlen = node->d.totlen; + temp->offset = cpu_to_je32(ofs); + temp->pino = node->d.pino; + temp->version = node->d.version; + temp->ino = node->d.ino; + temp->nsize = node->d.nsize; + temp->type = node->d.type; + temp->next = NULL; + + switch (count) { + case 1: + memcpy(temp->name,node->d.name,node->d.nsize); + break; + + case 2: + memcpy(temp->name,invecs[1].iov_base,node->d.nsize); + break; + + default: + BUG(); /* impossible count value */ + break; + } + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_sum_xattr_mem *temp; + temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); + if (!temp) + goto no_mem; + + temp->nodetype = node->x.nodetype; + temp->xid = node->x.xid; + temp->version = node->x.version; + temp->totlen = node->x.totlen; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_sum_xref_mem *temp; + temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); + if (!temp) + goto no_mem; + temp->nodetype = node->r.nodetype; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } +#endif + case JFFS2_NODETYPE_PADDING: + dbg_summary("node PADDING\n"); + c->summary->sum_padded += je32_to_cpu(node->u.totlen); + break; + + case JFFS2_NODETYPE_CLEANMARKER: + dbg_summary("node CLEANMARKER\n"); + break; + + case JFFS2_NODETYPE_SUMMARY: + dbg_summary("node SUMMARY\n"); + break; + + default: + /* If you implement a new node type you should also implement + summary support for it or disable summary. + */ + BUG(); + break; + } + + return 0; + +no_mem: + JFFS2_WARNING("MEMORY ALLOCATION ERROR!"); + return -ENOMEM; +} + +static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + /* If there was a gap, mark it dirty */ + if ((ofs & ~3) > c->sector_size - jeb->free_size) { + /* Ew. 
Summary doesn't actually tell us explicitly about dirty space */ + jffs2_scan_dirty_space(c, jeb, (ofs & ~3) - (c->sector_size - jeb->free_size)); + } + + return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic); +} + +/* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */ + +static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t *pseudo_random) +{ + struct jffs2_inode_cache *ic; + struct jffs2_full_dirent *fd; + void *sp; + int i, ino; + int err; + + sp = summary->sum; + + for (i=0; isum_num); i++) { + dbg_summary("processing summary index %d\n", i); + + cond_resched(); + + /* Make sure there's a spare ref for dirty space */ + err = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (err) + return err; + + switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_flash *spi; + spi = sp; + + ino = je32_to_cpu(spi->inode); + + dbg_summary("Inode at 0x%08x-0x%08x\n", + jeb->offset + je32_to_cpu(spi->offset), + jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen)); + + ic = jffs2_scan_make_ino_cache(c, ino); + if (!ic) { + JFFS2_NOTICE("scan_make_ino_cache failed\n"); + return -ENOMEM; + } + + sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spi->totlen)), ic); + + *pseudo_random += je32_to_cpu(spi->version); + + sp += JFFS2_SUMMARY_INODE_SIZE; + + break; + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_flash *spd; + int checkedlen; + spd = sp; + + dbg_summary("Dirent at 0x%08x-0x%08x\n", + jeb->offset + je32_to_cpu(spd->offset), + jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen)); + + + /* This should never happen, but https://dev.laptop.org/ticket/4184 */ + checkedlen = strnlen(spd->name, spd->nsize); + if (!checkedlen) { + printk(KERN_ERR "Dirent at %08x has zero at start of name. Aborting mount.\n", + jeb->offset + je32_to_cpu(spd->offset)); + return -EIO; + } + if (checkedlen < spd->nsize) { + printk(KERN_ERR "Dirent at %08x has zeroes in name. 
Truncating to %d chars\n", + jeb->offset + je32_to_cpu(spd->offset), checkedlen); + } + + + fd = jffs2_alloc_full_dirent(checkedlen+1); + if (!fd) + return -ENOMEM; + + memcpy(&fd->name, spd->name, checkedlen); + fd->name[checkedlen] = 0; + + ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino)); + if (!ic) { + jffs2_free_full_dirent(fd); + return -ENOMEM; + } + + fd->raw = sum_link_node_ref(c, jeb, je32_to_cpu(spd->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spd->totlen)), ic); + + fd->next = NULL; + fd->version = je32_to_cpu(spd->version); + fd->ino = je32_to_cpu(spd->ino); + fd->nhash = full_name_hash(fd->name, checkedlen); + fd->type = spd->type; + + jffs2_add_fd_to_list(c, fd, &ic->scan_dents); + + *pseudo_random += je32_to_cpu(spd->version); + + sp += JFFS2_SUMMARY_DIRENT_SIZE(spd->nsize); + + break; + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_xattr_datum *xd; + struct jffs2_sum_xattr_flash *spx; + + spx = (struct jffs2_sum_xattr_flash *)sp; + dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", + jeb->offset + je32_to_cpu(spx->offset), + jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen), + je32_to_cpu(spx->xid), je32_to_cpu(spx->version)); + + xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid), + je32_to_cpu(spx->version)); + if (IS_ERR(xd)) + return PTR_ERR(xd); + if (xd->version > je32_to_cpu(spx->version)) { + /* node is not the newest one */ + struct jffs2_raw_node_ref *raw + = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = je32_to_cpu(spx->version); + sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), (void *)xd); + } + *pseudo_random += je32_to_cpu(spx->xid); + sp += JFFS2_SUMMARY_XATTR_SIZE; + + break; + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_xattr_ref *ref; + struct jffs2_sum_xref_flash *spr; + + spr = (struct jffs2_sum_xref_flash *)sp; + dbg_summary("xref at %#08x-%#08x\n", + jeb->offset + je32_to_cpu(spr->offset), + jeb->offset + je32_to_cpu(spr->offset) + + (uint32_t)PAD(sizeof(struct jffs2_raw_xref))); + + ref = jffs2_alloc_xattr_ref(); + if (!ref) { + JFFS2_NOTICE("allocation of xattr_datum failed\n"); + return -ENOMEM; + } + ref->next = c->xref_temp; + c->xref_temp = ref; + + sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED, + PAD(sizeof(struct jffs2_raw_xref)), (void *)ref); + + *pseudo_random += ref->node->flash_offset; + sp += JFFS2_SUMMARY_XREF_SIZE; + + break; + } +#endif + default : { + uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype); + JFFS2_WARNING("Unsupported node type %x found in summary! 
Exiting...\n", nodetype); + if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT) + return -EIO; + + /* For compatible node types, just fall back to the full scan */ + c->wasted_size -= jeb->wasted_size; + c->free_size += c->sector_size - jeb->free_size; + c->used_size -= jeb->used_size; + c->dirty_size -= jeb->dirty_size; + jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0; + jeb->free_size = c->sector_size; + + jffs2_free_jeb_node_refs(c, jeb); + return -ENOTRECOVERABLE; + } + } + } + return 0; +} + +/* Process the summary node - called from jffs2_scan_eraseblock() */ +int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t sumsize, + uint32_t *pseudo_random) +{ + struct jffs2_unknown_node crcnode; + int ret, ofs; + uint32_t crc; + + ofs = c->sector_size - sumsize; + + dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n", + jeb->offset, jeb->offset + ofs, sumsize); + + /* OK, now check for node validity and CRC */ + crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + crcnode.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY); + crcnode.totlen = summary->totlen; + crc = crc32(0, &crcnode, sizeof(crcnode)-4); + + if (je32_to_cpu(summary->hdr_crc) != crc) { + dbg_summary("Summary node header is corrupt (bad CRC or " + "no summary at all)\n"); + goto crc_err; + } + + if (je32_to_cpu(summary->totlen) != sumsize) { + dbg_summary("Summary node is corrupt (wrong erasesize?)\n"); + goto crc_err; + } + + crc = crc32(0, summary, sizeof(struct jffs2_raw_summary)-8); + + if (je32_to_cpu(summary->node_crc) != crc) { + dbg_summary("Summary node is corrupt (bad CRC)\n"); + goto crc_err; + } + + crc = crc32(0, summary->sum, sumsize - sizeof(struct jffs2_raw_summary)); + + if (je32_to_cpu(summary->sum_crc) != crc) { + dbg_summary("Summary node data is corrupt (bad CRC)\n"); + goto crc_err; + } + + if ( je32_to_cpu(summary->cln_mkr) ) { + + dbg_summary("Summary : CLEANMARKER node \n"); + + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + if (ret) + return ret; + + if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) { + dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n", + je32_to_cpu(summary->cln_mkr), c->cleanmarker_size); + if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr))))) + return ret; + } else if (jeb->first_node) { + dbg_summary("CLEANMARKER node not first node in block " + "(0x%08x)\n", jeb->offset); + if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr))))) + return ret; + } else { + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, + je32_to_cpu(summary->cln_mkr), NULL); + } + } + + ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random); + /* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full + scan of this eraseblock. 
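A summary node is only believed after three CRCs pass in sequence: the common 12-byte header CRC (which the driver computes over a re-assembled header with the ACCURATE bit forced; that detail is dropped here), the summary node's own CRC over its fixed fields, and a CRC over the variable-length record area. A compact stand-alone model of that layering; the 12- and 32-byte lengths match the paranoia checks later in this patch, everything else is illustrative.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define HDR_LEN   12u   /* stand-in for sizeof(struct jffs2_unknown_node) */
#define FIXED_LEN 32u   /* stand-in for sizeof(struct jffs2_raw_summary)  */

static uint32_t crc32_sketch(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
        }
        return crc;
}

struct stored_crcs { uint32_t hdr, node, sum; };        /* as read back from flash */

static int summary_trustworthy(const uint8_t *node, size_t sumsize,
                               const struct stored_crcs *c)
{
        if (crc32_sketch(0, node, HDR_LEN - 4) != c->hdr)
                return 0;       /* bad header: corrupt, or no summary at all */
        if (crc32_sketch(0, node, FIXED_LEN - 8) != c->node)
                return 0;       /* fixed part of the summary node is corrupt */
        if (crc32_sketch(0, node + FIXED_LEN, sumsize - FIXED_LEN) != c->sum)
                return 0;       /* record payload is corrupt */
        return 1;
}

int main(void)
{
        uint8_t node[64] = { 0x85, 0x19, 0x06, 0x20 };  /* arbitrary contents */
        struct stored_crcs c = {
                .hdr  = crc32_sketch(0, node, HDR_LEN - 4),
                .node = crc32_sketch(0, node, FIXED_LEN - 8),
                .sum  = crc32_sketch(0, node + FIXED_LEN, sizeof(node) - FIXED_LEN),
        };

        printf("trustworthy: %d\n", summary_trustworthy(node, sizeof(node), &c));
        return 0;
}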
So return zero */ + if (ret == -ENOTRECOVERABLE) + return 0; + if (ret) + return ret; /* real error */ + + /* for PARANOIA_CHECK */ + ret = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (ret) + return ret; + + sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL); + + if (unlikely(jeb->free_size)) { + JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n", + jeb->free_size, jeb->offset); + jeb->wasted_size += jeb->free_size; + c->wasted_size += jeb->free_size; + c->free_size -= jeb->free_size; + jeb->free_size = 0; + } + + return jffs2_scan_classify_jeb(c, jeb); + +crc_err: + JFFS2_WARNING("Summary node crc error, skipping summary information.\n"); + + return 0; +} + +/* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */ + +static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + uint32_t infosize, uint32_t datasize, int padsize) +{ + struct jffs2_raw_summary isum; + union jffs2_sum_mem *temp; + struct jffs2_sum_marker *sm; + struct kvec vecs[2]; + uint32_t sum_ofs; + void *wpage; + int ret; + size_t retlen; + + if (padsize + datasize > MAX_SUMMARY_SIZE) { + /* It won't fit in the buffer. Abort summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n", + datasize, padsize, jeb->offset); + /* Non-fatal */ + return 0; + } + /* Is there enough space for summary? */ + if (padsize < 0) { + /* don't try to write out summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Not enough space for summary, padsize = %d\n", + padsize); + /* Non-fatal */ + return 0; + } + + memset(c->summary->sum_buf, 0xff, datasize); + memset(&isum, 0, sizeof(isum)); + + isum.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + isum.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY); + isum.totlen = cpu_to_je32(infosize); + isum.hdr_crc = cpu_to_je32(crc32(0, &isum, sizeof(struct jffs2_unknown_node) - 4)); + isum.padded = cpu_to_je32(c->summary->sum_padded); + isum.cln_mkr = cpu_to_je32(c->cleanmarker_size); + isum.sum_num = cpu_to_je32(c->summary->sum_num); + wpage = c->summary->sum_buf; + + while (c->summary->sum_num) { + temp = c->summary->sum_list_head; + + switch (je16_to_cpu(temp->u.nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_flash *sino_ptr = wpage; + + sino_ptr->nodetype = temp->i.nodetype; + sino_ptr->inode = temp->i.inode; + sino_ptr->version = temp->i.version; + sino_ptr->offset = temp->i.offset; + sino_ptr->totlen = temp->i.totlen; + + wpage += JFFS2_SUMMARY_INODE_SIZE; + + break; + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage; + + sdrnt_ptr->nodetype = temp->d.nodetype; + sdrnt_ptr->totlen = temp->d.totlen; + sdrnt_ptr->offset = temp->d.offset; + sdrnt_ptr->pino = temp->d.pino; + sdrnt_ptr->version = temp->d.version; + sdrnt_ptr->ino = temp->d.ino; + sdrnt_ptr->nsize = temp->d.nsize; + sdrnt_ptr->type = temp->d.type; + + memcpy(sdrnt_ptr->name, temp->d.name, + temp->d.nsize); + + wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize); + + break; + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_sum_xattr_flash *sxattr_ptr = wpage; + + temp = c->summary->sum_list_head; + sxattr_ptr->nodetype = temp->x.nodetype; + sxattr_ptr->xid = temp->x.xid; + sxattr_ptr->version = temp->x.version; + sxattr_ptr->offset = temp->x.offset; + sxattr_ptr->totlen = temp->x.totlen; + + wpage += JFFS2_SUMMARY_XATTR_SIZE; + break; + } + case 
JFFS2_NODETYPE_XREF: { + struct jffs2_sum_xref_flash *sxref_ptr = wpage; + + temp = c->summary->sum_list_head; + sxref_ptr->nodetype = temp->r.nodetype; + sxref_ptr->offset = temp->r.offset; + + wpage += JFFS2_SUMMARY_XREF_SIZE; + break; + } +#endif + default : { + if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK) + == JFFS2_FEATURE_RWCOMPAT_COPY) { + dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n", + je16_to_cpu(temp->u.nodetype)); + jffs2_sum_disable_collecting(c->summary); + } else { + BUG(); /* unknown node in summary information */ + } + } + } + + c->summary->sum_list_head = temp->u.next; + kfree(temp); + + c->summary->sum_num--; + } + + jffs2_sum_reset_collected(c->summary); + + wpage += padsize; + + sm = wpage; + sm->offset = cpu_to_je32(c->sector_size - jeb->free_size); + sm->magic = cpu_to_je32(JFFS2_SUM_MAGIC); + + isum.sum_crc = cpu_to_je32(crc32(0, c->summary->sum_buf, datasize)); + isum.node_crc = cpu_to_je32(crc32(0, &isum, sizeof(isum) - 8)); + + vecs[0].iov_base = &isum; + vecs[0].iov_len = sizeof(isum); + vecs[1].iov_base = c->summary->sum_buf; + vecs[1].iov_len = datasize; + + sum_ofs = jeb->offset + c->sector_size - jeb->free_size; + + dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n", + sum_ofs); + + ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0); + + if (ret || (retlen != infosize)) { + + JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n", + infosize, sum_ofs, ret, retlen); + + if (retlen) { + /* Waste remaining space */ + spin_lock(&c->erase_completion_lock); + jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL); + spin_unlock(&c->erase_completion_lock); + } + + c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE; + + return 0; + } + + spin_lock(&c->erase_completion_lock); + jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL); + spin_unlock(&c->erase_completion_lock); + + return 0; +} + +/* Write out summary information - called from jffs2_do_reserve_space */ + +int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) +{ + int datasize, infosize, padsize; + struct jffs2_eraseblock *jeb; + int ret = 0; + + dbg_summary("called\n"); + + spin_unlock(&c->erase_completion_lock); + + jeb = c->nextblock; + jffs2_prealloc_raw_node_refs(c, jeb, 1); + + if (!c->summary->sum_num || !c->summary->sum_list_head) { + JFFS2_WARNING("Empty summary info!!!\n"); + BUG(); + } + + datasize = c->summary->sum_size + sizeof(struct jffs2_sum_marker); + infosize = sizeof(struct jffs2_raw_summary) + datasize; + padsize = jeb->free_size - infosize; + infosize += padsize; + datasize += padsize; + + ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); + spin_lock(&c->erase_completion_lock); + return ret; +} diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h new file mode 100644 index 00000000..60207a2a --- /dev/null +++ b/fs/jffs2/summary.h @@ -0,0 +1,213 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * Zoltan Sogor , + * Patrik Kluba , + * University of Szeged, Hungary + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef JFFS2_SUMMARY_H +#define JFFS2_SUMMARY_H + +/* Limit summary size to 64KiB so that we can kmalloc it. If the summary + is larger than that, we have to just ditch it and avoid using summary + for the eraseblock in question... and it probably doesn't hurt us much + anyway. 
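jffs2_sum_write_sumnode() above sizes the summary so that it ends flush with the eraseblock: whatever is still free is handed to the node as padding, and the 8-byte marker lands in the block's last bytes. A worked example of that arithmetic with made-up sizes; the real ones come from the structures declared in summary.h.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t raw_summary_sz = 32;   /* stand-in for sizeof(struct jffs2_raw_summary) */
        uint32_t sum_marker_sz  = 8;    /* stand-in for sizeof(struct jffs2_sum_marker)  */
        uint32_t sum_size       = 420;  /* bytes of collected records for this block     */
        uint32_t free_size      = 2048; /* what is left in the eraseblock                */

        uint32_t datasize = sum_size + sum_marker_sz;           /* records + marker      */
        uint32_t infosize = raw_summary_sz + datasize;          /* whole node on flash   */
        int32_t  padsize  = (int32_t)free_size - (int32_t)infosize;

        if (padsize < 0) {
                printf("summary does not fit, skip it for this block\n");
                return 0;
        }
        infosize += padsize;            /* node is stretched to consume all free space */
        datasize += padsize;

        printf("datasize=%u infosize=%u padsize=%d\n", datasize, infosize, padsize);
        /* infosize == free_size, so the summary ends exactly at the block boundary */
        return 0;
}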
*/ +#define MAX_SUMMARY_SIZE 65536 + +#include +#include + +#define BLK_STATE_ALLFF 0 +#define BLK_STATE_CLEAN 1 +#define BLK_STATE_PARTDIRTY 2 +#define BLK_STATE_CLEANMARKER 3 +#define BLK_STATE_ALLDIRTY 4 +#define BLK_STATE_BADBLOCK 5 + +#define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff +#define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash)) +#define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x)) +#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash)) +#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash)) + +/* Summary structures used on flash */ + +struct jffs2_sum_unknown_flash +{ + jint16_t nodetype; /* node type */ +}; + +struct jffs2_sum_inode_flash +{ + jint16_t nodetype; /* node type */ + jint32_t inode; /* inode number */ + jint32_t version; /* inode version */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* record length */ +} __attribute__((packed)); + +struct jffs2_sum_dirent_flash +{ + jint16_t nodetype; /* == JFFS_NODETYPE_DIRENT */ + jint32_t totlen; /* record length */ + jint32_t offset; /* offset on jeb */ + jint32_t pino; /* parent inode */ + jint32_t version; /* dirent version */ + jint32_t ino; /* == zero for unlink */ + uint8_t nsize; /* dirent name size */ + uint8_t type; /* dirent type */ + uint8_t name[0]; /* dirent name */ +} __attribute__((packed)); + +struct jffs2_sum_xattr_flash +{ + jint16_t nodetype; /* == JFFS2_NODETYPE_XATR */ + jint32_t xid; /* xattr identifier */ + jint32_t version; /* version number */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* node length */ +} __attribute__((packed)); + +struct jffs2_sum_xref_flash +{ + jint16_t nodetype; /* == JFFS2_NODETYPE_XREF */ + jint32_t offset; /* offset on jeb */ +} __attribute__((packed)); + +union jffs2_sum_flash +{ + struct jffs2_sum_unknown_flash u; + struct jffs2_sum_inode_flash i; + struct jffs2_sum_dirent_flash d; + struct jffs2_sum_xattr_flash x; + struct jffs2_sum_xref_flash r; +}; + +/* Summary structures used in the memory */ + +struct jffs2_sum_unknown_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* node type */ +}; + +struct jffs2_sum_inode_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* node type */ + jint32_t inode; /* inode number */ + jint32_t version; /* inode version */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* record length */ +} __attribute__((packed)); + +struct jffs2_sum_dirent_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* == JFFS_NODETYPE_DIRENT */ + jint32_t totlen; /* record length */ + jint32_t offset; /* ofset on jeb */ + jint32_t pino; /* parent inode */ + jint32_t version; /* dirent version */ + jint32_t ino; /* == zero for unlink */ + uint8_t nsize; /* dirent name size */ + uint8_t type; /* dirent type */ + uint8_t name[0]; /* dirent name */ +} __attribute__((packed)); + +struct jffs2_sum_xattr_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; + jint32_t xid; + jint32_t version; + jint32_t offset; + jint32_t totlen; +} __attribute__((packed)); + +struct jffs2_sum_xref_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; + jint32_t offset; +} __attribute__((packed)); + +union jffs2_sum_mem +{ + struct jffs2_sum_unknown_mem u; + struct jffs2_sum_inode_mem i; + struct jffs2_sum_dirent_mem d; + struct jffs2_sum_xattr_mem x; + struct jffs2_sum_xref_mem r; +}; + +/* Summary related information stored in superblock */ + +struct jffs2_summary +{ + uint32_t sum_size; /* collected summary information for nextblock */ + 
uint32_t sum_num; + uint32_t sum_padded; + union jffs2_sum_mem *sum_list_head; + union jffs2_sum_mem *sum_list_tail; + + jint32_t *sum_buf; /* buffer for writing out summary */ +}; + +/* Summary marker is stored at the end of every sumarized erase block */ + +struct jffs2_sum_marker +{ + jint32_t offset; /* offset of the summary node in the jeb */ + jint32_t magic; /* == JFFS2_SUM_MAGIC */ +}; + +#define JFFS2_SUMMARY_FRAME_SIZE (sizeof(struct jffs2_raw_summary) + sizeof(struct jffs2_sum_marker)) + +#ifdef CONFIG_JFFS2_SUMMARY /* SUMMARY SUPPORT ENABLED */ + +#define jffs2_sum_active() (1) +int jffs2_sum_init(struct jffs2_sb_info *c); +void jffs2_sum_exit(struct jffs2_sb_info *c); +void jffs2_sum_disable_collecting(struct jffs2_summary *s); +int jffs2_sum_is_disabled(struct jffs2_summary *s); +void jffs2_sum_reset_collected(struct jffs2_summary *s); +void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s); +int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, uint32_t to); +int jffs2_sum_write_sumnode(struct jffs2_sb_info *c); +int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size); +int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs); +int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs); +int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs); +int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs); +int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t sumlen, + uint32_t *pseudo_random); + +#else /* SUMMARY DISABLED */ + +#define jffs2_sum_active() (0) +#define jffs2_sum_init(a) (0) +#define jffs2_sum_exit(a) +#define jffs2_sum_disable_collecting(a) +#define jffs2_sum_is_disabled(a) (0) +#define jffs2_sum_reset_collected(a) +#define jffs2_sum_add_kvec(a,b,c,d) (0) +#define jffs2_sum_move_collected(a,b) +#define jffs2_sum_write_sumnode(a) (0) +#define jffs2_sum_add_padding_mem(a,b) +#define jffs2_sum_add_inode_mem(a,b,c) +#define jffs2_sum_add_dirent_mem(a,b,c) +#define jffs2_sum_add_xattr_mem(a,b,c) +#define jffs2_sum_add_xref_mem(a,b,c) +#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) + +#endif /* CONFIG_JFFS2_SUMMARY */ + +#endif /* JFFS2_SUMMARY_H */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c new file mode 100644 index 00000000..853b8e30 --- /dev/null +++ b/fs/jffs2/super.c @@ -0,0 +1,316 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
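The disabled-summary half of summary.h above turns jffs2_sum_active() into a constant 0 and every summary call into a no-op macro, so call sites still compile but the optimizer drops them and no stub code needs to be linked. The same pattern in isolation, with FEATURE_SUMMARY standing in for CONFIG_JFFS2_SUMMARY.

#include <stdio.h>

/* #define FEATURE_SUMMARY 1 */

#ifdef FEATURE_SUMMARY
#define feature_active()  (1)
static int feature_record(int x) { printf("recorded %d\n", x); return 0; }
#else
#define feature_active()  (0)
#define feature_record(x) (0)   /* call sites compile, but do nothing */
#endif

int main(void)
{
        if (feature_active())
                feature_record(42);     /* dead code when the feature is disabled */
        printf("done\n");
        return 0;
}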
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compr.h" +#include "nodelist.h" + +static void jffs2_put_super(struct super_block *); + +static struct kmem_cache *jffs2_inode_cachep; + +static struct inode *jffs2_alloc_inode(struct super_block *sb) +{ + struct jffs2_inode_info *f; + + f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); + if (!f) + return NULL; + return &f->vfs_inode; +} + +static void jffs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); +} + +static void jffs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, jffs2_i_callback); +} + +static void jffs2_i_init_once(void *foo) +{ + struct jffs2_inode_info *f = foo; + + mutex_init(&f->sem); + inode_init_once(&f->vfs_inode); +} + +static void jffs2_write_super(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + lock_super(sb); + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + D1(printk(KERN_DEBUG "jffs2_write_super()\n")); + jffs2_flush_wbuf_gc(c, 0); + } + + unlock_super(sb); +} + +static int jffs2_sync_fs(struct super_block *sb, int wait) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + jffs2_write_super(sb); + + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + return 0; +} + +static struct inode *jffs2_nfs_get_inode(struct super_block *sb, uint64_t ino, + uint32_t generation) +{ + /* We don't care about i_generation. We'll destroy the flash + before we start re-using inode numbers anyway. And even + if that wasn't true, we'd have other problems...*/ + return jffs2_iget(sb, ino); +} + +static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_get_parent(struct dentry *child) +{ + struct jffs2_inode_info *f; + uint32_t pino; + + BUG_ON(!S_ISDIR(child->d_inode->i_mode)); + + f = JFFS2_INODE_INFO(child->d_inode); + + pino = f->inocache->pino_nlink; + + JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", + f->inocache->ino, pino); + + return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); +} + +static const struct export_operations jffs2_export_ops = { + .get_parent = jffs2_get_parent, + .fh_to_dentry = jffs2_fh_to_dentry, + .fh_to_parent = jffs2_fh_to_parent, +}; + +static const struct super_operations jffs2_super_operations = +{ + .alloc_inode = jffs2_alloc_inode, + .destroy_inode =jffs2_destroy_inode, + .put_super = jffs2_put_super, + .write_super = jffs2_write_super, + .statfs = jffs2_statfs, + .remount_fs = jffs2_remount_fs, + .evict_inode = jffs2_evict_inode, + .dirty_inode = jffs2_dirty_inode, + .sync_fs = jffs2_sync_fs, +}; + +/* + * fill in the superblock + */ +static int jffs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jffs2_sb_info *c; + int ret; + + D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" + " New superblock for device %d (\"%s\")\n", + sb->s_mtd->index, sb->s_mtd->name)); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->mtd = sb->s_mtd; + 
c->os_priv = sb; + sb->s_fs_info = c; + + /* Initialize JFFS2 superblock locks, the further initialization will + * be done later */ + mutex_init(&c->alloc_sem); + mutex_init(&c->erase_free_sem); + init_waitqueue_head(&c->erase_wait); + init_waitqueue_head(&c->inocache_wq); + spin_lock_init(&c->erase_completion_lock); + spin_lock_init(&c->inocache_lock); + + sb->s_op = &jffs2_super_operations; + sb->s_export_op = &jffs2_export_ops; + sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_xattr = jffs2_xattr_handlers; +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + ret = jffs2_do_fill_super(sb, data, silent); + return ret; +} + +static struct dentry *jffs2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super); +} + +static void jffs2_put_super (struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); + + if (sb->s_dirt) + jffs2_write_super(sb); + + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + + jffs2_sum_exit(c); + + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else + kfree(c->blocks); + jffs2_flash_cleanup(c); + kfree(c->inocache_list); + jffs2_clear_xattr_subsystem(c); + if (c->mtd->sync) + c->mtd->sync(c->mtd); + + D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); +} + +static void jffs2_kill_sb(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + if (!(sb->s_flags & MS_RDONLY)) + jffs2_stop_garbage_collect_thread(c); + kill_mtd_super(sb); + kfree(c); +} + +static struct file_system_type jffs2_fs_type = { + .owner = THIS_MODULE, + .name = "jffs2", + .mount = jffs2_mount, + .kill_sb = jffs2_kill_sb, +}; + +static int __init init_jffs2_fs(void) +{ + int ret; + + /* Paranoia checks for on-medium structures. If we ask GCC + to pack them with __attribute__((packed)) then it _also_ + assumes that they're not aligned -- so it emits crappy + code on some architectures. Ideally we want an attribute + which means just 'no padding', without the alignment + thing. But GCC doesn't have that -- we have to just + hope the structs are the right sizes, instead. */ + BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12); + BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40); + BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68); + BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32); + + printk(KERN_INFO "JFFS2 version 2.2." 
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + " (NAND)" +#endif +#ifdef CONFIG_JFFS2_SUMMARY + " (SUMMARY) " +#endif + " © 2001-2006 Red Hat, Inc.\n"); + + jffs2_inode_cachep = kmem_cache_create("jffs2_i", + sizeof(struct jffs2_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + jffs2_i_init_once); + if (!jffs2_inode_cachep) { + printk(KERN_ERR "JFFS2 error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = jffs2_compressors_init(); + if (ret) { + printk(KERN_ERR "JFFS2 error: Failed to initialise compressors\n"); + goto out; + } + ret = jffs2_create_slab_caches(); + if (ret) { + printk(KERN_ERR "JFFS2 error: Failed to initialise slab caches\n"); + goto out_compressors; + } + ret = register_filesystem(&jffs2_fs_type); + if (ret) { + printk(KERN_ERR "JFFS2 error: Failed to register filesystem\n"); + goto out_slab; + } + return 0; + + out_slab: + jffs2_destroy_slab_caches(); + out_compressors: + jffs2_compressors_exit(); + out: + kmem_cache_destroy(jffs2_inode_cachep); + return ret; +} + +static void __exit exit_jffs2_fs(void) +{ + unregister_filesystem(&jffs2_fs_type); + jffs2_destroy_slab_caches(); + jffs2_compressors_exit(); + kmem_cache_destroy(jffs2_inode_cachep); +} + +module_init(init_jffs2_fs); +module_exit(exit_jffs2_fs); + +MODULE_DESCRIPTION("The Journalling Flash File System, v2"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); // Actually dual-licensed, but it doesn't matter for + // the sake of this tag. It's Free Software. diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c new file mode 100644 index 00000000..b9556260 --- /dev/null +++ b/fs/jffs2/symlink.c @@ -0,0 +1,64 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include "nodelist.h" + +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); + +const struct inode_operations jffs2_symlink_inode_operations = +{ + .readlink = generic_readlink, + .follow_link = jffs2_follow_link, + .check_acl = jffs2_check_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + char *p = (char *)f->target; + + /* + * We don't acquire the f->sem mutex here since the only data we + * use is f->target. + * + * 1. If we are here the inode has already built and f->target has + * to point to the target path. + * 2. Nobody uses f->target (if the inode is symlink's inode). The + * exception is inode freeing function which frees f->target. But + * it can't be called while we are here and before VFS has + * stopped using our f->target string which we provide by means of + * nd_set_link() call. + */ + + if (!p) { + printk(KERN_ERR "jffs2_follow_link(): can't find symlink target\n"); + p = ERR_PTR(-EIO); + } + D1(printk(KERN_DEBUG "jffs2_follow_link(): target path is '%s'\n", (char *) f->target)); + + nd_set_link(nd, p); + + /* + * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe + * since the only way that may cause f->target to be changed is iput() operation. + * But VFS will not use f->target after iput() has been called. 
+ */ + return NULL; +} + diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c new file mode 100644 index 00000000..4515bea0 --- /dev/null +++ b/fs/jffs2/wbuf.c @@ -0,0 +1,1310 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004 Thomas Gleixner + * + * Created by David Woodhouse + * Modified debugged and enhanced by Thomas Gleixner + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "nodelist.h" + +/* For testing write failures */ +#undef BREAKME +#undef BREAKMEHEADER + +#ifdef BREAKME +static unsigned char *brokenbuf; +#endif + +#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) ) +#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) ) + +/* max. erase failures before we mark a block bad */ +#define MAX_ERASE_FAILURES 2 + +struct jffs2_inodirty { + uint32_t ino; + struct jffs2_inodirty *next; +}; + +static struct jffs2_inodirty inodirty_nomem; + +static int jffs2_wbuf_pending_for_ino(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inodirty *this = c->wbuf_inodes; + + /* If a malloc failed, consider _everything_ dirty */ + if (this == &inodirty_nomem) + return 1; + + /* If ino == 0, _any_ non-GC writes mean 'yes' */ + if (this && !ino) + return 1; + + /* Look to see if the inode in question is pending in the wbuf */ + while (this) { + if (this->ino == ino) + return 1; + this = this->next; + } + return 0; +} + +static void jffs2_clear_wbuf_ino_list(struct jffs2_sb_info *c) +{ + struct jffs2_inodirty *this; + + this = c->wbuf_inodes; + + if (this != &inodirty_nomem) { + while (this) { + struct jffs2_inodirty *next = this->next; + kfree(this); + this = next; + } + } + c->wbuf_inodes = NULL; +} + +static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inodirty *new; + + /* Mark the superblock dirty so that kupdated will flush... */ + jffs2_dirty_trigger(c); + + if (jffs2_wbuf_pending_for_ino(c, ino)) + return; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + D1(printk(KERN_DEBUG "No memory to allocate inodirty. Fallback to all considered dirty\n")); + jffs2_clear_wbuf_ino_list(c); + c->wbuf_inodes = &inodirty_nomem; + return; + } + new->ino = ino; + new->next = c->wbuf_inodes; + c->wbuf_inodes = new; + return; +} + +static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) +{ + struct list_head *this, *next; + static int n; + + if (list_empty(&c->erasable_pending_wbuf_list)) + return; + + list_for_each_safe(this, next, &c->erasable_pending_wbuf_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + D1(printk(KERN_DEBUG "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n", jeb->offset)); + list_del(this); + if ((jiffies + (n++)) & 127) { + /* Most of the time, we just erase it immediately. Otherwise we + spend ages scanning it on mount, etc. */ + D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); + list_add_tail(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } else { + /* Sometimes, however, we leave it elsewhere so it doesn't get + immediately reused, and we spread the load a bit. 
*/ + D1(printk(KERN_DEBUG "...and adding to erasable_list\n")); + list_add_tail(&jeb->list, &c->erasable_list); + } + } +} + +#define REFILE_NOTEMPTY 0 +#define REFILE_ANYWAY 1 + +static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty) +{ + D1(printk("About to refile bad block at %08x\n", jeb->offset)); + + /* File the existing block on the bad_used_list.... */ + if (c->nextblock == jeb) + c->nextblock = NULL; + else /* Not sure this should ever happen... need more coffee */ + list_del(&jeb->list); + if (jeb->first_node) { + D1(printk("Refiling block at %08x to bad_used_list\n", jeb->offset)); + list_add(&jeb->list, &c->bad_used_list); + } else { + BUG_ON(allow_empty == REFILE_NOTEMPTY); + /* It has to have had some nodes or we couldn't be here */ + D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + + if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { + uint32_t oldfree = jeb->free_size; + + jffs2_link_node_ref(c, jeb, + (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE, + oldfree, NULL); + /* convert to wasted */ + c->wasted_size += oldfree; + jeb->wasted_size += oldfree; + c->dirty_size -= oldfree; + jeb->dirty_size -= oldfree; + } + + jffs2_dbg_dump_block_lists_nolock(c); + jffs2_dbg_acct_sanity_check_nolock(c,jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); +} + +static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_raw_node_ref *raw, + union jffs2_node_union *node) +{ + struct jffs2_node_frag *frag; + struct jffs2_full_dirent *fd; + + dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n", + node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype)); + + BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 && + je16_to_cpu(node->u.magic) != 0); + + switch (je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + if (f->metadata && f->metadata->raw == raw) { + dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata); + return &f->metadata->raw; + } + frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset)); + BUG_ON(!frag); + /* Find a frag which refers to the full_dnode we want to modify */ + while (!frag->node || frag->node->raw != raw) { + frag = frag_next(frag); + BUG_ON(!frag); + } + dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node); + return &frag->node->raw; + + case JFFS2_NODETYPE_DIRENT: + for (fd = f->dents; fd; fd = fd->next) { + if (fd->raw == raw) { + dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd); + return &fd->raw; + } + } + BUG(); + + default: + dbg_noderef("Don't care about replacing raw for nodetype %x\n", + je16_to_cpu(node->u.nodetype)); + break; + } + return NULL; +} + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY +static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, + uint32_t ofs) +{ + int ret; + size_t retlen; + char *eccstr; + + ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); + if (ret && ret != -EUCLEAN && ret != -EBADMSG) { + printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); + return ret; + } else if (retlen != c->wbuf_pagesize) { + printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x gave short read: %zd not %d.\n", ofs, retlen, c->wbuf_pagesize); + return -EIO; + } + if (!memcmp(buf, c->wbuf_verify, c->wbuf_pagesize)) 
+ return 0; + + if (ret == -EUCLEAN) + eccstr = "corrected"; + else if (ret == -EBADMSG) + eccstr = "correction failed"; + else + eccstr = "OK or unused"; + + printk(KERN_WARNING "Write verify error (ECC %s) at %08x. Wrote:\n", + eccstr, c->wbuf_ofs); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, + c->wbuf, c->wbuf_pagesize, 0); + + printk(KERN_WARNING "Read back:\n"); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, + c->wbuf_verify, c->wbuf_pagesize, 0); + + return -EIO; +} +#else +#define jffs2_verify_write(c,b,o) (0) +#endif + +/* Recover from failure to write wbuf. Recover the nodes up to the + * wbuf, not the one which we were starting to try to write. */ + +static void jffs2_wbuf_recover(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb, *new_jeb; + struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL; + size_t retlen; + int ret; + int nr_refile = 0; + unsigned char *buf; + uint32_t start, end, ofs, len; + + jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; + + spin_lock(&c->erase_completion_lock); + if (c->wbuf_ofs % c->mtd->erasesize) + jffs2_block_refile(c, jeb, REFILE_NOTEMPTY); + else + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + spin_unlock(&c->erase_completion_lock); + + BUG_ON(!ref_obsolete(jeb->last_node)); + + /* Find the first node to be recovered, by skipping over every + node which ends before the wbuf starts, or which is obsolete. */ + for (next = raw = jeb->first_node; next; raw = next) { + next = ref_next(raw); + + if (ref_obsolete(raw) || + (next && ref_offset(next) <= c->wbuf_ofs)) { + dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n", + ref_offset(raw), ref_flags(raw), + (ref_offset(raw) + ref_totlen(c, jeb, raw)), + c->wbuf_ofs); + continue; + } + dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n", + ref_offset(raw), ref_flags(raw), + (ref_offset(raw) + ref_totlen(c, jeb, raw))); + + first_raw = raw; + break; + } + + if (!first_raw) { + /* All nodes were obsolete. Nothing to recover. */ + D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n")); + c->wbuf_len = 0; + return; + } + + start = ref_offset(first_raw); + end = ref_offset(jeb->last_node); + nr_refile = 1; + + /* Count the number of refs which need to be copied */ + while ((raw = ref_next(raw)) != jeb->last_node) + nr_refile++; + + dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n", + start, end, end - start, nr_refile); + + buf = NULL; + if (start < c->wbuf_ofs) { + /* First affected node was already partially written. + * Attempt to reread the old data into our buffer. */ + + buf = kmalloc(end - start, GFP_KERNEL); + if (!buf) { + printk(KERN_CRIT "Malloc failure in wbuf recovery. Data loss ensues.\n"); + + goto read_failed; + } + + /* Do the read... */ + ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); + + /* ECC recovered ? */ + if ((ret == -EUCLEAN || ret == -EBADMSG) && + (retlen == c->wbuf_ofs - start)) + ret = 0; + + if (ret || retlen != c->wbuf_ofs - start) { + printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n"); + + kfree(buf); + buf = NULL; + read_failed: + first_raw = ref_next(first_raw); + nr_refile--; + while (first_raw && ref_obsolete(first_raw)) { + first_raw = ref_next(first_raw); + nr_refile--; + } + + /* If this was the only node to be recovered, give up */ + if (!first_raw) { + c->wbuf_len = 0; + return; + } + + /* It wasn't. 
Go on and try to recover nodes complete in the wbuf */ + start = ref_offset(first_raw); + dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n", + start, end, end - start, nr_refile); + + } else { + /* Read succeeded. Copy the remaining data from the wbuf */ + memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs); + } + } + /* OK... we're to rewrite (end-start) bytes of data from first_raw onwards. + Either 'buf' contains the data, or we find it in the wbuf */ + + /* ... and get an allocation of space from a shiny new block instead */ + ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE); + if (ret) { + printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n"); + kfree(buf); + return; + } + + /* The summary is not recovered, so it must be disabled for this erase block */ + jffs2_sum_disable_collecting(c->summary); + + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile); + if (ret) { + printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); + kfree(buf); + return; + } + + ofs = write_ofs(c); + + if (end-start >= c->wbuf_pagesize) { + /* Need to do another write immediately, but it's possible + that this is just because the wbuf itself is completely + full, and there's nothing earlier read back from the + flash. Hence 'buf' isn't necessarily what we're writing + from. */ + unsigned char *rewrite_buf = buf?:c->wbuf; + uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize); + + D1(printk(KERN_DEBUG "Write 0x%x bytes at 0x%08x in wbuf recover\n", + towrite, ofs)); + +#ifdef BREAKMEHEADER + static int breakme; + if (breakme++ == 20) { + printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); + breakme = 0; + c->mtd->write(c->mtd, ofs, towrite, &retlen, + brokenbuf); + ret = -EIO; + } else +#endif + ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, + rewrite_buf); + + if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { + /* Argh. We tried. Really we did. */ + printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n"); + kfree(buf); + + if (retlen) + jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL); + + return; + } + printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs); + + c->wbuf_len = (end - start) - towrite; + c->wbuf_ofs = ofs + towrite; + memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len); + /* Don't muck about with c->wbuf_inodes. False positives are harmless. */ + } else { + /* OK, now we're left with the dregs in whichever buffer we're using */ + if (buf) { + memcpy(c->wbuf, buf, end-start); + } else { + memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start); + } + c->wbuf_ofs = ofs; + c->wbuf_len = end - start; + } + + /* Now sort out the jffs2_raw_node_refs, moving them from the old to the next block */ + new_jeb = &c->blocks[ofs / c->sector_size]; + + spin_lock(&c->erase_completion_lock); + for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) { + uint32_t rawlen = ref_totlen(c, jeb, raw); + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref *new_ref; + struct jffs2_raw_node_ref **adjust_ref = NULL; + struct jffs2_inode_info *f = NULL; + + D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n", + rawlen, ref_offset(raw), ref_flags(raw), ofs)); + + ic = jffs2_raw_ref_to_ic(raw); + + /* Ick. This XATTR mess should be fixed shortly... 
*/ + if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) { + struct jffs2_xattr_datum *xd = (void *)ic; + BUG_ON(xd->node != raw); + adjust_ref = &xd->node; + raw->next_in_ino = NULL; + ic = NULL; + } else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) { + struct jffs2_xattr_datum *xr = (void *)ic; + BUG_ON(xr->node != raw); + adjust_ref = &xr->node; + raw->next_in_ino = NULL; + ic = NULL; + } else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) { + struct jffs2_raw_node_ref **p = &ic->nodes; + + /* Remove the old node from the per-inode list */ + while (*p && *p != (void *)ic) { + if (*p == raw) { + (*p) = (raw->next_in_ino); + raw->next_in_ino = NULL; + break; + } + p = &((*p)->next_in_ino); + } + + if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) { + /* If it's an in-core inode, then we have to adjust any + full_dirent or full_dnode structure to point to the + new version instead of the old */ + f = jffs2_gc_fetch_inode(c, ic->ino, !ic->pino_nlink); + if (IS_ERR(f)) { + /* Should never happen; it _must_ be present */ + JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n", + ic->ino, PTR_ERR(f)); + BUG(); + } + /* We don't lock f->sem. There's a number of ways we could + end up in here with it already being locked, and nobody's + going to modify it on us anyway because we hold the + alloc_sem. We're only changing one ->raw pointer too, + which we can get away with without upsetting readers. */ + adjust_ref = jffs2_incore_replace_raw(c, f, raw, + (void *)(buf?:c->wbuf) + (ref_offset(raw) - start)); + } else if (unlikely(ic->state != INO_STATE_PRESENT && + ic->state != INO_STATE_CHECKEDABSENT && + ic->state != INO_STATE_GC)) { + JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state); + BUG(); + } + } + + new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic); + + if (adjust_ref) { + BUG_ON(*adjust_ref != raw); + *adjust_ref = new_ref; + } + if (f) + jffs2_gc_release_inode(c, f); + + if (!ref_obsolete(raw)) { + jeb->dirty_size += rawlen; + jeb->used_size -= rawlen; + c->dirty_size += rawlen; + c->used_size -= rawlen; + raw->flash_offset = ref_offset(raw) | REF_OBSOLETE; + BUG_ON(raw->next_in_ino); + } + ofs += rawlen; + } + + kfree(buf); + + /* Fix up the original jeb now it's on the bad_list */ + if (first_raw == jeb->first_node) { + D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); + list_move(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + jffs2_dbg_acct_sanity_check_nolock(c, new_jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb); + + spin_unlock(&c->erase_completion_lock); + + D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len)); + +} + +/* Meaning of pad argument: + 0: Do not pad. Probably pointless - we only ever use this when we can't pad anyway. + 1: Pad, do not adjust nextblock free_size + 2: Pad, adjust nextblock free_size +*/ +#define NOPAD 0 +#define PAD_NOACCOUNT 1 +#define PAD_ACCOUNTING 2 + +static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) +{ + struct jffs2_eraseblock *wbuf_jeb; + int ret; + size_t retlen; + + /* Nothing to do if not write-buffering the flash. In particular, we shouldn't + del_timer() the timer we never initialised. 
*/ + if (!jffs2_is_writebuffered(c)) + return 0; + + if (mutex_trylock(&c->alloc_sem)) { + mutex_unlock(&c->alloc_sem); + printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); + BUG(); + } + + if (!c->wbuf_len) /* already checked c->wbuf above */ + return 0; + + wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; + if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1)) + return -ENOMEM; + + /* claim remaining space on the page + this happens, if we have a change to a new block, + or if fsync forces us to flush the writebuffer. + if we have a switch to next page, we will not have + enough remaining space for this. + */ + if (pad ) { + c->wbuf_len = PAD(c->wbuf_len); + + /* Pad with JFFS2_DIRTY_BITMASK initially. this helps out ECC'd NOR + with 8 byte page size */ + memset(c->wbuf + c->wbuf_len, 0, c->wbuf_pagesize - c->wbuf_len); + + if ( c->wbuf_len + sizeof(struct jffs2_unknown_node) < c->wbuf_pagesize) { + struct jffs2_unknown_node *padnode = (void *)(c->wbuf + c->wbuf_len); + padnode->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + padnode->nodetype = cpu_to_je16(JFFS2_NODETYPE_PADDING); + padnode->totlen = cpu_to_je32(c->wbuf_pagesize - c->wbuf_len); + padnode->hdr_crc = cpu_to_je32(crc32(0, padnode, sizeof(*padnode)-4)); + } + } + /* else jffs2_flash_writev has actually filled in the rest of the + buffer for us, and will deal with the node refs etc. later. */ + +#ifdef BREAKME + static int breakme; + if (breakme++ == 20) { + printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); + breakme = 0; + c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, + brokenbuf); + ret = -EIO; + } else +#endif + + ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); + + if (ret) { + printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); + goto wfail; + } else if (retlen != c->wbuf_pagesize) { + printk(KERN_WARNING "jffs2_flush_wbuf(): Write was short: %zd instead of %d\n", + retlen, c->wbuf_pagesize); + ret = -EIO; + goto wfail; + } else if ((ret = jffs2_verify_write(c, c->wbuf, c->wbuf_ofs))) { + wfail: + jffs2_wbuf_recover(c); + + return ret; + } + + /* Adjust free size of the block if we padded. */ + if (pad) { + uint32_t waste = c->wbuf_pagesize - c->wbuf_len; + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n", + (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset)); + + /* wbuf_pagesize - wbuf_len is the amount of space that's to be + padded. If there is less free space in the block than that, + something screwed up */ + if (wbuf_jeb->free_size < waste) { + printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n", + c->wbuf_ofs, c->wbuf_len, waste); + printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n", + wbuf_jeb->offset, wbuf_jeb->free_size); + BUG(); + } + + spin_lock(&c->erase_completion_lock); + + jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL); + /* FIXME: that made it count as dirty. 
Convert to wasted */ + wbuf_jeb->dirty_size -= waste; + c->dirty_size -= waste; + wbuf_jeb->wasted_size += waste; + c->wasted_size += waste; + } else + spin_lock(&c->erase_completion_lock); + + /* Stick any now-obsoleted blocks on the erase_pending_list */ + jffs2_refile_wbuf_blocks(c); + jffs2_clear_wbuf_ino_list(c); + spin_unlock(&c->erase_completion_lock); + + memset(c->wbuf,0xff,c->wbuf_pagesize); + /* adjust write buffer offset, else we get a non contiguous write bug */ + c->wbuf_ofs += c->wbuf_pagesize; + c->wbuf_len = 0; + return 0; +} + +/* Trigger garbage collection to flush the write-buffer. + If ino arg is zero, do it if _any_ real (i.e. not GC) writes are + outstanding. If ino arg non-zero, do it only if a write for the + given inode is outstanding. */ +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) +{ + uint32_t old_wbuf_ofs; + uint32_t old_wbuf_len; + int ret = 0; + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino)); + + if (!c->wbuf) + return 0; + + mutex_lock(&c->alloc_sem); + if (!jffs2_wbuf_pending_for_ino(c, ino)) { + D1(printk(KERN_DEBUG "Ino #%d not pending in wbuf. Returning\n", ino)); + mutex_unlock(&c->alloc_sem); + return 0; + } + + old_wbuf_ofs = c->wbuf_ofs; + old_wbuf_len = c->wbuf_len; + + if (c->unchecked_size) { + /* GC won't make any progress for a while */ + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() padding. Not finished checking\n")); + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + up_write(&c->wbuf_sem); + } else while (old_wbuf_len && + old_wbuf_ofs == c->wbuf_ofs) { + + mutex_unlock(&c->alloc_sem); + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() calls gc pass\n")); + + ret = jffs2_garbage_collect_pass(c); + if (ret) { + /* GC failed. Flush it with padding instead */ + mutex_lock(&c->alloc_sem); + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + up_write(&c->wbuf_sem); + break; + } + mutex_lock(&c->alloc_sem); + } + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() ends...\n")); + + mutex_unlock(&c->alloc_sem); + return ret; +} + +/* Pad write-buffer to end and write it, wasting space. */ +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c) +{ + int ret; + + if (!c->wbuf) + return 0; + + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + /* retry - maybe wbuf recover left some data in wbuf. 
*/ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + up_write(&c->wbuf_sem); + + return ret; +} + +static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf, + size_t len) +{ + if (len && !c->wbuf_len && (len >= c->wbuf_pagesize)) + return 0; + + if (len > (c->wbuf_pagesize - c->wbuf_len)) + len = c->wbuf_pagesize - c->wbuf_len; + memcpy(c->wbuf + c->wbuf_len, buf, len); + c->wbuf_len += (uint32_t) len; + return len; +} + +int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, loff_t to, size_t *retlen, + uint32_t ino) +{ + struct jffs2_eraseblock *jeb; + size_t wbuf_retlen, donelen = 0; + uint32_t outvec_to = to; + int ret, invec; + + /* If not writebuffered flash, don't bother */ + if (!jffs2_is_writebuffered(c)) + return jffs2_flash_direct_writev(c, invecs, count, to, retlen); + + down_write(&c->wbuf_sem); + + /* If wbuf_ofs is not initialized, set it to target address */ + if (c->wbuf_ofs == 0xFFFFFFFF) { + c->wbuf_ofs = PAGE_DIV(to); + c->wbuf_len = PAGE_MOD(to); + memset(c->wbuf,0xff,c->wbuf_pagesize); + } + + /* + * Sanity checks on target address. It's permitted to write + * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to + * write at the beginning of a new erase block. Anything else, + * and you die. New block starts at xxx000c (0-b = block + * header) + */ + if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) { + /* It's a write to a new block */ + if (c->wbuf_len) { + D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx " + "causes flush of wbuf at 0x%08x\n", + (unsigned long)to, c->wbuf_ofs)); + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + if (ret) + goto outerr; + } + /* set pointer to new block */ + c->wbuf_ofs = PAGE_DIV(to); + c->wbuf_len = PAGE_MOD(to); + } + + if (to != PAD(c->wbuf_ofs + c->wbuf_len)) { + /* We're not writing immediately after the writebuffer. Bad. 
*/ + printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write " + "to %08lx\n", (unsigned long)to); + if (c->wbuf_len) + printk(KERN_CRIT "wbuf was previously %08x-%08x\n", + c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len); + BUG(); + } + + /* adjust alignment offset */ + if (c->wbuf_len != PAGE_MOD(to)) { + c->wbuf_len = PAGE_MOD(to); + /* take care of alignment to next page */ + if (!c->wbuf_len) { + c->wbuf_len = c->wbuf_pagesize; + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + } + + for (invec = 0; invec < count; invec++) { + int vlen = invecs[invec].iov_len; + uint8_t *v = invecs[invec].iov_base; + + wbuf_retlen = jffs2_fill_wbuf(c, v, vlen); + + if (c->wbuf_len == c->wbuf_pagesize) { + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + v += wbuf_retlen; + + if (vlen >= c->wbuf_pagesize) { + ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), + &wbuf_retlen, v); + if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) + goto outfile; + + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + c->wbuf_ofs = outvec_to; + donelen += wbuf_retlen; + v += wbuf_retlen; + } + + wbuf_retlen = jffs2_fill_wbuf(c, v, vlen); + if (c->wbuf_len == c->wbuf_pagesize) { + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + } + + /* + * If there's a remainder in the wbuf and it's a non-GC write, + * remember that the wbuf affects this ino + */ + *retlen = donelen; + + if (jffs2_sum_active()) { + int res = jffs2_sum_add_kvec(c, invecs, count, (uint32_t) to); + if (res) + return res; + } + + if (c->wbuf_len && ino) + jffs2_wbuf_dirties_inode(c, ino); + + ret = 0; + up_write(&c->wbuf_sem); + return ret; + +outfile: + /* + * At this point we have no problem, c->wbuf is empty. However + * refile nextblock to avoid writing again to same address. + */ + + spin_lock(&c->erase_completion_lock); + + jeb = &c->blocks[outvec_to / c->sector_size]; + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + + spin_unlock(&c->erase_completion_lock); + +outerr: + *retlen = 0; + up_write(&c->wbuf_sem); + return ret; +} + +/* + * This is the entry for flash write. + * Check, if we work on NAND FLASH, if so build an kvec and write it via vritev +*/ +int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf) +{ + struct kvec vecs[1]; + + if (!jffs2_is_writebuffered(c)) + return jffs2_flash_direct_write(c, ofs, len, retlen, buf); + + vecs[0].iov_base = (unsigned char *) buf; + vecs[0].iov_len = len; + return jffs2_flash_writev(c, vecs, 1, ofs, retlen, 0); +} + +/* + Handle readback from writebuffer and ECC failure return +*/ +int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf) +{ + loff_t orbf = 0, owbf = 0, lwbf = 0; + int ret; + + if (!jffs2_is_writebuffered(c)) + return c->mtd->read(c->mtd, ofs, len, retlen, buf); + + /* Read flash */ + down_read(&c->wbuf_sem); + ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); + + if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { + if (ret == -EBADMSG) + printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)" + " returned ECC error\n", len, ofs); + /* + * We have the raw data without ECC correction in the buffer, + * maybe we are lucky and all data or parts are correct. We + * check the node. If data are corrupted node check will sort + * it out. 
We keep this block, it will fail on write or erase + * and the we mark it bad. Or should we do that now? But we + * should give him a chance. Maybe we had a system crash or + * power loss before the ecc write or a erase was completed. + * So we return success. :) + */ + ret = 0; + } + + /* if no writebuffer available or write buffer empty, return */ + if (!c->wbuf_pagesize || !c->wbuf_len) + goto exit; + + /* if we read in a different block, return */ + if (SECTOR_ADDR(ofs) != SECTOR_ADDR(c->wbuf_ofs)) + goto exit; + + if (ofs >= c->wbuf_ofs) { + owbf = (ofs - c->wbuf_ofs); /* offset in write buffer */ + if (owbf > c->wbuf_len) /* is read beyond write buffer ? */ + goto exit; + lwbf = c->wbuf_len - owbf; /* number of bytes to copy */ + if (lwbf > len) + lwbf = len; + } else { + orbf = (c->wbuf_ofs - ofs); /* offset in read buffer */ + if (orbf > len) /* is write beyond write buffer ? */ + goto exit; + lwbf = len - orbf; /* number of bytes to copy */ + if (lwbf > c->wbuf_len) + lwbf = c->wbuf_len; + } + if (lwbf > 0) + memcpy(buf+orbf,c->wbuf+owbf,lwbf); + +exit: + up_read(&c->wbuf_sem); + return ret; +} + +#define NR_OOB_SCAN_PAGES 4 + +/* For historical reasons we use only 8 bytes for OOB clean marker */ +#define OOB_CM_SIZE 8 + +static const struct jffs2_unknown_node oob_cleanmarker = +{ + .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = constant_cpu_to_je32(8) +}; + +/* + * Check, if the out of band area is empty. This function knows about the clean + * marker and if it is present in OOB, treats the OOB as empty anyway. + */ +int jffs2_check_oob_empty(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int mode) +{ + int i, ret; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + struct mtd_oob_ops ops; + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; + ops.oobbuf = c->oobbuf; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + for(i = 0; i < ops.ooblen; i++) { + if (mode && i < cmlen) + /* Yeah, we know about the cleanmarker */ + continue; + + if (ops.oobbuf[i] != 0xFF) { + D2(printk(KERN_DEBUG "Found %02x at %x in OOB for " + "%08x\n", ops.oobbuf[i], i, jeb->offset)); + return 1; + } + } + + return 0; +} + +/* + * Check for a valid cleanmarker. 
+ * Returns: 0 if a valid cleanmarker was found + * 1 if no cleanmarker was found + * negative error code if an error occurred + */ +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + struct mtd_oob_ops ops; + int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = cmlen; + ops.oobbuf = c->oobbuf; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + return !!memcmp(&oob_cleanmarker, c->oobbuf, cmlen); +} + +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + int ret; + struct mtd_oob_ops ops; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = cmlen; + ops.oobbuf = (uint8_t *)&oob_cleanmarker; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + return 0; +} + +/* + * On NAND we try to mark this block bad. If the block was erased more + * than MAX_ERASE_FAILURES we mark it finally bad. + * Don't care about failures. This block remains on the erase-pending + * or badblock list as long as nobody manipulates the flash with + * a bootloader or something like that. + */ + +int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) +{ + int ret; + + /* if the count is < max, we try to write the counter to the 2nd page oob area */ + if( ++jeb->bad_count < MAX_ERASE_FAILURES) + return 0; + + if (!c->mtd->block_markbad) + return 1; // What else can we do? 
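/*
 * A small user-space sketch (not from the JFFS2 sources) of the decision
 * implemented by jffs2_check_oob_empty() above: an eraseblock's OOB area
 * counts as empty if every byte is 0xFF, except that the first cmlen bytes
 * may hold the cleanmarker and are skipped when the caller says it knows
 * about it.  Buffer contents and sizes below are illustrative stand-ins.
 */
#include <stdio.h>
#include <string.h>

#define OOB_CM_SIZE_SKETCH 8	/* same 8-byte OOB cleanmarker limit as above */

/*
 * Returns 1 if a non-0xFF byte is found (OOB not empty), 0 otherwise.
 * When skip_cleanmarker is set, the first cmlen bytes are ignored, just as
 * jffs2_check_oob_empty() ignores the cleanmarker it expects to be there.
 */
static int oob_is_dirty(const unsigned char *oob, size_t ooblen,
			size_t cmlen, int skip_cleanmarker)
{
	size_t i;

	for (i = 0; i < ooblen; i++) {
		if (skip_cleanmarker && i < cmlen)
			continue;
		if (oob[i] != 0xFF)
			return 1;
	}
	return 0;
}

int main(void)
{
	unsigned char oob[16];
	size_t cmlen = OOB_CM_SIZE_SKETCH;

	memset(oob, 0xFF, sizeof(oob));
	/* stand-in bytes where a cleanmarker would sit, not the real layout */
	memcpy(oob, "\x85\x19\x03\x20", 4);

	printf("ignoring cleanmarker: %s\n",
	       oob_is_dirty(oob, sizeof(oob), cmlen, 1) ? "dirty" : "empty");
	printf("raw check:            %s\n",
	       oob_is_dirty(oob, sizeof(oob), cmlen, 0) ? "dirty" : "empty");
	return 0;
}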
+ + printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); + ret = c->mtd->block_markbad(c->mtd, bad_offset); + + if (ret) { + D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); + return ret; + } + return 1; +} + +int jffs2_nand_flash_setup(struct jffs2_sb_info *c) +{ + struct nand_ecclayout *oinfo = c->mtd->ecclayout; + + if (!c->mtd->oobsize) + return 0; + + /* Cleanmarker is out-of-band, so inline size zero */ + c->cleanmarker_size = 0; + + if (!oinfo || oinfo->oobavail == 0) { + printk(KERN_ERR "inconsistent device description\n"); + return -EINVAL; + } + + D1(printk(KERN_DEBUG "JFFS2 using OOB on NAND\n")); + + c->oobavail = oinfo->oobavail; + + /* Initialise write buffer */ + init_rwsem(&c->wbuf_sem); + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL); + if (!c->oobbuf) { + kfree(c->wbuf); + return -ENOMEM; + } + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + return 0; +} + +void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c) +{ +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); + kfree(c->oobbuf); +} + +int jffs2_dataflash_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; /* No cleanmarkers needed */ + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + + + c->wbuf_pagesize = c->mtd->erasesize; + + /* Find a suitable c->sector_size + * - Not too much sectors + * - Sectors have to be at least 4 K + some bytes + * - All known dataflashes have erase sizes of 528 or 1056 + * - we take at least 8 eraseblocks and want to have at least 8K size + * - The concatenation should be a power of 2 + */ + + c->sector_size = 8 * c->mtd->erasesize; + + while (c->sector_size < 8192) { + c->sector_size *= 2; + } + + /* It may be necessary to adjust the flash size */ + c->flash_size = c->mtd->size; + + if ((c->flash_size % c->sector_size) != 0) { + c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; + printk(KERN_WARNING "JFFS2 flash size adjusted to %dKiB\n", c->flash_size); + }; + + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + + printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); +} + +int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { + /* Cleanmarker currently occupies whole programming regions, + * either one or 2 for 8Byte STMicro flashes. 
*/ + c->cleanmarker_size = max(16u, c->mtd->writesize); + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->wbuf); + return -ENOMEM; + } +#endif + return 0; +} + +void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); +} + +int jffs2_ubivol_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; + + if (c->mtd->writesize == 1) + /* We do not need write-buffer */ + return 0; + + init_rwsem(&c->wbuf_sem); + + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) { + kfree(c->wbuf); +} diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c new file mode 100644 index 00000000..30d175b6 --- /dev/null +++ b/fs/jffs2/write.c @@ -0,0 +1,707 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + + +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t mode, struct jffs2_raw_inode *ri) +{ + struct jffs2_inode_cache *ic; + + ic = jffs2_alloc_inode_cache(); + if (!ic) { + return -ENOMEM; + } + + memset(ic, 0, sizeof(*ic)); + + f->inocache = ic; + f->inocache->pino_nlink = 1; /* Will be overwritten shortly for directories */ + f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; + f->inocache->state = INO_STATE_PRESENT; + + jffs2_add_ino_cache(c, f->inocache); + D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino)); + ri->ino = cpu_to_je32(f->inocache->ino); + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(PAD(sizeof(*ri))); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + ri->mode = cpu_to_jemode(mode); + + f->highest_version = 1; + ri->version = cpu_to_je32(f->highest_version); + + return 0; +} + +/* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it, + write it to the flash, link it into the existing inode/fragment list */ + +struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const unsigned char *data, + uint32_t datalen, int alloc_mode) + +{ + struct jffs2_full_dnode *fn; + size_t retlen; + uint32_t flash_ofs; + struct kvec vecs[2]; + int ret; + int retried = 0; + unsigned long cnt = 2; + + D1(if(je32_to_cpu(ri->hdr_crc) != crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)) { + printk(KERN_CRIT "Eep. 
CRC not correct in jffs2_write_dnode()\n"); + BUG(); + } + ); + vecs[0].iov_base = ri; + vecs[0].iov_len = sizeof(*ri); + vecs[1].iov_base = (unsigned char *)data; + vecs[1].iov_len = datalen; + + if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) { + printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen); + } + + fn = jffs2_alloc_full_dnode(); + if (!fn) + return ERR_PTR(-ENOMEM); + + /* check number of valid vecs */ + if (!datalen || !data) + cnt = 1; + retry: + flash_ofs = write_ofs(c); + + jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len); + + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) { + BUG_ON(!retried); + D1(printk(KERN_DEBUG "jffs2_write_dnode : dnode_version %d, " + "highest version %d -> updating dnode\n", + je32_to_cpu(ri->version), f->highest_version)); + ri->version = cpu_to_je32(++f->highest_version); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + } + + ret = jffs2_flash_writev(c, vecs, cnt, flash_ofs, &retlen, + (alloc_mode==ALLOC_GC)?0:f->inocache->ino); + + if (ret || (retlen != sizeof(*ri) + datalen)) { + printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", + sizeof(*ri)+datalen, flash_ofs, ret, retlen); + + /* Mark the space as dirtied */ + if (retlen) { + /* Don't change raw->size to match retlen. We may have + written the node header already, and only the data will + seem corrupted, in which case the scan would skip over + any node we write before the original intended end of + this node */ + jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL); + } else { + printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs); + } + if (!retried && alloc_mode != ALLOC_NORETRY) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size]; + + retried = 1; + + D1(printk(KERN_DEBUG "Retrying failed write.\n")); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + if (alloc_mode == ALLOC_GC) { + ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy, + JFFS2_SUMMARY_INODE_SIZE); + } else { + /* Locking pain */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy, + alloc_mode, JFFS2_SUMMARY_INODE_SIZE); + mutex_lock(&f->sem); + } + + if (!ret) { + flash_ofs = write_ofs(c); + D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs)); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + goto retry; + } + D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + } + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dnode(fn); + return ERR_PTR(ret?ret:-EIO); + } + /* Mark the space used */ + /* If node covers at least a whole page, or if it starts at the + beginning of a page and runs to the end of the file, or if + it's a hole node, mark it REF_PRISTINE, else REF_NORMAL. 
+ */ + if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) || + ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) && + (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) == je32_to_cpu(ri->isize)))) { + flash_ofs |= REF_PRISTINE; + } else { + flash_ofs |= REF_NORMAL; + } + fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache); + if (IS_ERR(fn->raw)) { + void *hold_err = fn->raw; + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dnode(fn); + return ERR_CAST(hold_err); + } + fn->ofs = je32_to_cpu(ri->offset); + fn->size = je32_to_cpu(ri->dsize); + fn->frags = 0; + + D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n", + flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize), + je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc), + je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen))); + + if (retried) { + jffs2_dbg_acct_sanity_check(c,NULL); + } + + return fn; +} + +struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_dirent *rd, const unsigned char *name, + uint32_t namelen, int alloc_mode) +{ + struct jffs2_full_dirent *fd; + size_t retlen; + struct kvec vecs[2]; + uint32_t flash_ofs; + int retried = 0; + int ret; + + D1(printk(KERN_DEBUG "jffs2_write_dirent(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n", + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc))); + + D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) { + printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n"); + BUG(); + }); + + if (strnlen(name, namelen) != namelen) { + /* This should never happen, but seems to have done on at least one + occasion: https://dev.laptop.org/ticket/4184 */ + printk(KERN_CRIT "Error in jffs2_write_dirent() -- name contains zero bytes!\n"); + printk(KERN_CRIT "Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n", + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc)); + WARN_ON(1); + return ERR_PTR(-EIO); + } + + vecs[0].iov_base = rd; + vecs[0].iov_len = sizeof(*rd); + vecs[1].iov_base = (unsigned char *)name; + vecs[1].iov_len = namelen; + + fd = jffs2_alloc_full_dirent(namelen+1); + if (!fd) + return ERR_PTR(-ENOMEM); + + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->nhash = full_name_hash(name, namelen); + fd->type = rd->type; + memcpy(fd->name, name, namelen); + fd->name[namelen]=0; + + retry: + flash_ofs = write_ofs(c); + + jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len); + + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) { + BUG_ON(!retried); + D1(printk(KERN_DEBUG "jffs2_write_dirent : dirent_version %d, " + "highest version %d -> updating dirent\n", + je32_to_cpu(rd->version), f->highest_version)); + rd->version = cpu_to_je32(++f->highest_version); + fd->version = je32_to_cpu(rd->version); + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + } + + ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen, + (alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino)); + if (ret || (retlen != sizeof(*rd) + namelen)) { + printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. 
returned %d, retlen %zd\n", + sizeof(*rd)+namelen, flash_ofs, ret, retlen); + /* Mark the space as dirtied */ + if (retlen) { + jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL); + } else { + printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs); + } + if (!retried) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size]; + + retried = 1; + + D1(printk(KERN_DEBUG "Retrying failed write.\n")); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + if (alloc_mode == ALLOC_GC) { + ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy, + JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + } else { + /* Locking pain */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy, + alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + mutex_lock(&f->sem); + } + + if (!ret) { + flash_ofs = write_ofs(c); + D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs)); + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + goto retry; + } + D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + } + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dirent(fd); + return ERR_PTR(ret?ret:-EIO); + } + /* Mark the space used */ + fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | dirent_node_state(rd), + PAD(sizeof(*rd)+namelen), f->inocache); + if (IS_ERR(fd->raw)) { + void *hold_err = fd->raw; + /* Release the full_dirent which is now useless, and return */ + jffs2_free_full_dirent(fd); + return ERR_CAST(hold_err); + } + + if (retried) { + jffs2_dbg_acct_sanity_check(c,NULL); + } + + return fd; +} + +/* The OS-specific code fills in the metadata in the jffs2_raw_inode for us, so that + we don't have to go digging in struct inode or its equivalent. 
It should set: + mode, uid, gid, (starting)isize, atime, ctime, mtime */ +int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, unsigned char *buf, + uint32_t offset, uint32_t writelen, uint32_t *retlen) +{ + int ret = 0; + uint32_t writtenlen = 0; + + D1(printk(KERN_DEBUG "jffs2_write_inode_range(): Ino #%u, ofs 0x%x, len 0x%x\n", + f->inocache->ino, offset, writelen)); + + while(writelen) { + struct jffs2_full_dnode *fn; + unsigned char *comprbuf = NULL; + uint16_t comprtype = JFFS2_COMPR_NONE; + uint32_t alloclen; + uint32_t datalen, cdatalen; + int retried = 0; + + retry: + D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset)); + + ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, + &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret)); + break; + } + mutex_lock(&f->sem); + datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1))); + cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen); + + comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen); + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(sizeof(*ri) + cdatalen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->ino = cpu_to_je32(f->inocache->ino); + ri->version = cpu_to_je32(++f->highest_version); + ri->isize = cpu_to_je32(max(je32_to_cpu(ri->isize), offset + datalen)); + ri->offset = cpu_to_je32(offset); + ri->csize = cpu_to_je32(cdatalen); + ri->dsize = cpu_to_je32(datalen); + ri->compr = comprtype & 0xff; + ri->usercompr = (comprtype >> 8 ) & 0xff; + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen)); + + fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY); + + jffs2_free_comprbuf(comprbuf, buf); + + if (IS_ERR(fn)) { + ret = PTR_ERR(fn); + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + if (!retried) { + /* Write error to be retried */ + retried = 1; + D1(printk(KERN_DEBUG "Retrying node write in jffs2_write_inode_range()\n")); + goto retry; + } + break; + } + ret = jffs2_add_full_dnode_to_inode(c, f, fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + if (ret) { + /* Eep */ + D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n", ret)); + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + break; + } + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + if (!datalen) { + printk(KERN_WARNING "Eep. We didn't actually write any data in jffs2_write_inode_range()\n"); + ret = -EIO; + break; + } + D1(printk(KERN_DEBUG "increasing writtenlen by %d\n", datalen)); + writtenlen += datalen; + offset += datalen; + writelen -= datalen; + buf += datalen; + } + *retlen = writtenlen; + return ret; +} + +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, + const struct qstr *qstr) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + /* Try to reserve enough space for both node and dirent. 
+ * Just the node will do for now, though + */ + ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, + JFFS2_SUMMARY_INODE_SIZE); + D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen)); + if (ret) + return ret; + + mutex_lock(&f->sem); + + ri->data_crc = cpu_to_je32(0); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); + + D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n", + jemode_to_cpu(ri->mode))); + + if (IS_ERR(fn)) { + D1(printk(KERN_DEBUG "jffs2_write_dnode() failed\n")); + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + return PTR_ERR(fn); + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr); + if (ret) + return ret; + ret = jffs2_init_acl_post(&f->vfs_inode); + if (ret) + return ret; + + ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len)); + + if (ret) { + /* Eep. */ + D1(printk(KERN_DEBUG "jffs2_reserve_space() for dirent failed\n")); + return ret; + } + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + return -ENOMEM; + } + + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = ri->ino; + rd->mctime = ri->ctime; + rd->nsize = qstr->len; + rd->type = DT_REG; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len)); + + fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + + return 0; +} + + +int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + const char *name, int namelen, struct jffs2_inode_info *dead_f, + uint32_t time) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + if (!jffs2_can_mark_obsolete(c)) { + /* We can't mark stuff obsolete on the medium. 
We need to write a deletion dirent */ + + rd = jffs2_alloc_raw_dirent(); + if (!rd) + return -ENOMEM; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) { + jffs2_free_raw_dirent(rd); + return ret; + } + + mutex_lock(&dir_f->sem); + + /* Build a deletion node */ + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(0); + rd->mctime = cpu_to_je32(time); + rd->nsize = namelen; + rd->type = DT_UNKNOWN; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* File it. This will mark the old one obsolete. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + mutex_unlock(&dir_f->sem); + } else { + uint32_t nhash = full_name_hash(name, namelen); + + fd = dir_f->dents; + /* We don't actually want to reserve any space, but we do + want to be holding the alloc_sem when we write to flash */ + mutex_lock(&c->alloc_sem); + mutex_lock(&dir_f->sem); + + for (fd = dir_f->dents; fd; fd = fd->next) { + if (fd->nhash == nhash && + !memcmp(fd->name, name, namelen) && + !fd->name[namelen]) { + + D1(printk(KERN_DEBUG "Marking old dirent node (ino #%u) @%08x obsolete\n", + fd->ino, ref_offset(fd->raw))); + jffs2_mark_node_obsolete(c, fd->raw); + /* We don't want to remove it from the list immediately, + because that screws up getdents()/seek() semantics even + more than they're screwed already. Turn it into a + node-less deletion dirent instead -- a placeholder */ + fd->raw = NULL; + fd->ino = 0; + break; + } + } + mutex_unlock(&dir_f->sem); + } + + /* dead_f is NULL if this was a rename not a real unlink */ + /* Also catch the !f->inocache case, where there was a dirent + pointing to an inode which didn't exist. 
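(Editor's aside -- not part of the patch: both branches above rely on the same on-flash rule: for a given directory and name, the dirent node with the highest version wins, and a winning dirent whose ino is 0 means the name has been deleted. A hedged sketch of that resolution rule; example_resolve_name is a hypothetical helper, not a function in this patch:

    /* Illustrative only. */
    static uint32_t example_resolve_name(struct jffs2_full_dirent *list,
                                         const char *name)
    {
            struct jffs2_full_dirent *fd, *best = NULL;

            for (fd = list; fd; fd = fd->next)
                    if (!strcmp((char *)fd->name, name) &&
                        (!best || fd->version > best->version))
                            best = fd;

            /* ino == 0: the newest node for this name is a deletion marker */
            return best ? best->ino : 0;
    }
)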
*/ + if (dead_f && dead_f->inocache) { + + mutex_lock(&dead_f->sem); + + if (S_ISDIR(OFNI_EDONI_2SFFJ(dead_f)->i_mode)) { + while (dead_f->dents) { + /* There can be only deleted ones */ + fd = dead_f->dents; + + dead_f->dents = fd->next; + + if (fd->ino) { + printk(KERN_WARNING "Deleting inode #%u with active dentry \"%s\"->ino #%u\n", + dead_f->inocache->ino, fd->name, fd->ino); + } else { + D1(printk(KERN_DEBUG "Removing deletion dirent for \"%s\" from dir ino #%u\n", + fd->name, dead_f->inocache->ino)); + } + if (fd->raw) + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); + } + dead_f->inocache->pino_nlink = 0; + } else + dead_f->inocache->pino_nlink--; + /* NB: Caller must set inode nlink if appropriate */ + mutex_unlock(&dead_f->sem); + } + + jffs2_complete_reservation(c); + + return 0; +} + + +int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) + return -ENOMEM; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) { + jffs2_free_raw_dirent(rd); + return ret; + } + + mutex_lock(&dir_f->sem); + + /* Build a deletion node */ + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(ino); + rd->mctime = cpu_to_je32(time); + rd->nsize = namelen; + + rd->type = type; + + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* File it. This will mark the old one obsolete. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + + return 0; +} diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c new file mode 100644 index 00000000..b9276b11 --- /dev/null +++ b/fs/jffs2/writev.c @@ -0,0 +1,79 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include "nodelist.h" + +/* This ought to be in core MTD code. All registered MTD devices + without writev should have this put in place. 
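(Editor's note: in the mtd_fake_writev() body just below, a run of characters was lost in extraction -- the loop header and the mtd->write() call were fused into "for (i=0; iwrite(...)". A minimal reconstruction of the intended per-vector emulation, hedged against the fragments that did survive:

    /* Reconstruction sketch -- write each kvec element with the driver's
     * single-buffer write method and accumulate the completed length. */
    for (i = 0; i < count; i++) {
            ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen,
                             vecs[i].iov_base);
            totlen += thislen;
            if (ret || thislen != vecs[i].iov_len)
                    break;
            to += vecs[i].iov_len;
    }
)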
Bug the MTD + maintainer */ +static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + unsigned long i; + size_t totlen = 0, thislen; + int ret = 0; + + for (i=0; iwrite(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); + totlen += thislen; + if (ret || thislen != vecs[i].iov_len) + break; + to += vecs[i].iov_len; + } + if (retlen) + *retlen = totlen; + return ret; +} + +int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + if (!jffs2_is_writebuffered(c)) { + if (jffs2_sum_active()) { + int res; + res = jffs2_sum_add_kvec(c, vecs, count, (uint32_t) to); + if (res) { + return res; + } + } + } + + if (c->mtd->writev) + return c->mtd->writev(c->mtd, vecs, count, to, retlen); + else { + return mtd_fake_writev(c->mtd, vecs, count, to, retlen); + } +} + +int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf) +{ + int ret; + ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); + + if (jffs2_sum_active()) { + struct kvec vecs[1]; + int res; + + vecs[0].iov_base = (unsigned char *) buf; + vecs[0].iov_len = len; + + res = jffs2_sum_add_kvec(c, vecs, 1, (uint32_t) ofs); + if (res) { + return res; + } + } + return ret; +} diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c new file mode 100644 index 00000000..3e93cdd1 --- /dev/null +++ b/fs/jffs2/xattr.c @@ -0,0 +1,1330 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +/* -------- xdatum related functions ---------------- + * xattr_datum_hashkey(xprefix, xname, xvalue, xsize) + * is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is + * the index of the xattr name/value pair cache (c->xattrindex). + * is_xattr_datum_unchecked(c, xd) + * returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not + * unchecked, it returns 0. + * unload_xattr_datum(c, xd) + * is used to release xattr name/value pair and detach from c->xattrindex. + * reclaim_xattr_datum(c) + * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when + * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold + * is hard coded as 32KiB. + * do_verify_xattr_datum(c, xd) + * is used to load the xdatum informations without name/value pair from the medium. + * It's necessary once, because those informations are not collected during mounting + * process when EBS is enabled. + * 0 will be returned, if success. An negative return value means recoverable error, and + * positive return value means unrecoverable error. Thus, caller must remove this xdatum + * and xref when it returned positive value. + * do_load_xattr_datum(c, xd) + * is used to load name/value pair from the medium. + * The meanings of return value is same as do_verify_xattr_datum(). + * load_xattr_datum(c, xd) + * is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum(). + * If xd need to call do_verify_xattr_datum() at first, it's called before calling + * do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum(). 
+ * save_xattr_datum(c, xd) + * is used to write xdatum to medium. xd->version will be incremented. + * create_xattr_datum(c, xprefix, xname, xvalue, xsize) + * is used to create new xdatum and write to medium. + * unrefer_xattr_datum(c, xd) + * is used to delete a xdatum. When nobody refers this xdatum, JFFS2_XFLAGS_DEAD + * is set on xd->flags and chained xattr_dead_list or release it immediately. + * In the first case, the garbage collector release it later. + * -------------------------------------------------- */ +static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize) +{ + int name_len = strlen(xname); + + return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize); +} + +static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + struct jffs2_raw_node_ref *raw; + int rc = 0; + + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) == REF_UNCHECKED) { + rc = 1; + break; + } + } + spin_unlock(&c->erase_completion_lock); + return rc; +} + +static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + D1(dbg_xattr("%s: xid=%u, version=%u\n", __func__, xd->xid, xd->version)); + if (xd->xname) { + c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len); + kfree(xd->xname); + } + + list_del_init(&xd->xindex); + xd->hashkey = 0; + xd->xname = NULL; + xd->xvalue = NULL; +} + +static void reclaim_xattr_datum(struct jffs2_sb_info *c) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd, *_xd; + uint32_t target, before; + static int index = 0; + int count; + + if (c->xdatum_mem_threshold > c->xdatum_mem_usage) + return; + + before = c->xdatum_mem_usage; + target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */ + for (count = 0; count < XATTRINDEX_HASHSIZE; count++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) { + if (xd->flags & JFFS2_XFLAGS_HOT) { + xd->flags &= ~JFFS2_XFLAGS_HOT; + } else if (!(xd->flags & JFFS2_XFLAGS_BIND)) { + unload_xattr_datum(c, xd); + } + if (c->xdatum_mem_usage <= target) + goto out; + } + index = (index+1) % XATTRINDEX_HASHSIZE; + } + out: + JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n", + before, c->xdatum_mem_usage, before - c->xdatum_mem_usage); +} + +static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + struct jffs2_raw_xattr rx; + size_t readlen; + uint32_t crc, offset, totlen; + int rc; + + spin_lock(&c->erase_completion_lock); + offset = ref_offset(xd->node); + if (ref_flags(xd->node) == REF_PRISTINE) + goto complete; + spin_unlock(&c->erase_completion_lock); + + rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx); + if (rc || readlen != sizeof(rx)) { + JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n", + rc, sizeof(rx), readlen, offset); + return rc ? 
rc : -EIO; + } + crc = crc32(0, &rx, sizeof(rx) - 4); + if (crc != je32_to_cpu(rx.node_crc)) { + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rx.hdr_crc), crc); + xd->flags |= JFFS2_XFLAGS_INVALID; + return -EIO; + } + totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); + if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK + || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR + || je32_to_cpu(rx.totlen) != totlen + || je32_to_cpu(rx.xid) != xd->xid + || je32_to_cpu(rx.version) != xd->version) { + JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, " + "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n", + offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK, + je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR, + je32_to_cpu(rx.totlen), totlen, + je32_to_cpu(rx.xid), xd->xid, + je32_to_cpu(rx.version), xd->version); + xd->flags |= JFFS2_XFLAGS_INVALID; + return -EIO; + } + xd->xprefix = rx.xprefix; + xd->name_len = rx.name_len; + xd->value_len = je16_to_cpu(rx.value_len); + xd->data_crc = je32_to_cpu(rx.data_crc); + + spin_lock(&c->erase_completion_lock); + complete: + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL); + } + spin_unlock(&c->erase_completion_lock); + + /* unchecked xdatum is chained with c->xattr_unchecked */ + list_del_init(&xd->xindex); + + dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n", + xd->xid, xd->version); + + return 0; +} + +static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + char *data; + size_t readlen; + uint32_t crc, length; + int i, ret, retry = 0; + + BUG_ON(ref_flags(xd->node) != REF_PRISTINE); + BUG_ON(!list_empty(&xd->xindex)); + retry: + length = xd->name_len + 1 + xd->value_len; + data = kmalloc(length, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr), + length, &readlen, data); + + if (ret || length!=readlen) { + JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%zu, at %#08x\n", + ret, length, readlen, ref_offset(xd->node)); + kfree(data); + return ret ? 
ret : -EIO; + } + + data[xd->name_len] = '\0'; + crc = crc32(0, data, length); + if (crc != xd->data_crc) { + JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XREF)" + " at %#08x, read: 0x%08x calculated: 0x%08x\n", + ref_offset(xd->node), xd->data_crc, crc); + kfree(data); + xd->flags |= JFFS2_XFLAGS_INVALID; + return -EIO; + } + + xd->flags |= JFFS2_XFLAGS_HOT; + xd->xname = data; + xd->xvalue = data + xd->name_len+1; + + c->xdatum_mem_usage += length; + + xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len); + i = xd->hashkey % XATTRINDEX_HASHSIZE; + list_add(&xd->xindex, &c->xattrindex[i]); + if (!retry) { + retry = 1; + reclaim_xattr_datum(c); + if (!xd->xname) + goto retry; + } + + dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n", + xd->xid, xd->xprefix, xd->xname); + + return 0; +} + +static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem); + * rc < 0 : recoverable error, try again + * rc = 0 : success + * rc > 0 : Unrecoverable error, this node should be deleted. + */ + int rc = 0; + + BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD); + if (xd->xname) + return 0; + if (xd->flags & JFFS2_XFLAGS_INVALID) + return -EIO; + if (unlikely(is_xattr_datum_unchecked(c, xd))) + rc = do_verify_xattr_datum(c, xd); + if (!rc) + rc = do_load_xattr_datum(c, xd); + return rc; +} + +static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_raw_xattr rx; + struct kvec vecs[2]; + size_t length; + int rc, totlen; + uint32_t phys_ofs = write_ofs(c); + + BUG_ON(!xd->xname); + BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)); + + vecs[0].iov_base = ℞ + vecs[0].iov_len = sizeof(rx); + vecs[1].iov_base = xd->xname; + vecs[1].iov_len = xd->name_len + 1 + xd->value_len; + totlen = vecs[0].iov_len + vecs[1].iov_len; + + /* Setup raw-xattr */ + memset(&rx, 0, sizeof(rx)); + rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR); + rx.totlen = cpu_to_je32(PAD(totlen)); + rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4)); + + rx.xid = cpu_to_je32(xd->xid); + rx.version = cpu_to_je32(++xd->version); + rx.xprefix = xd->xprefix; + rx.name_len = xd->name_len; + rx.value_len = cpu_to_je16(xd->value_len); + rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len)); + rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4)); + + rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0); + if (rc || totlen != length) { + JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n", + rc, totlen, length, phys_ofs); + rc = rc ? 
rc : -EIO; + if (length) + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL); + + return rc; + } + /* success */ + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd); + + dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n", + xd->xid, xd->version, xd->xprefix, xd->xname); + + return 0; +} + +static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, + int xprefix, const char *xname, + const char *xvalue, int xsize) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + uint32_t hashkey, name_len; + char *data; + int i, rc; + + /* Search xattr_datum has same xname/xvalue by index */ + hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize); + i = hashkey % XATTRINDEX_HASHSIZE; + list_for_each_entry(xd, &c->xattrindex[i], xindex) { + if (xd->hashkey==hashkey + && xd->xprefix==xprefix + && xd->value_len==xsize + && !strcmp(xd->xname, xname) + && !memcmp(xd->xvalue, xvalue, xsize)) { + atomic_inc(&xd->refcnt); + return xd; + } + } + + /* Not found, Create NEW XATTR-Cache */ + name_len = strlen(xname); + + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + + data = kmalloc(name_len + 1 + xsize, GFP_KERNEL); + if (!data) { + jffs2_free_xattr_datum(xd); + return ERR_PTR(-ENOMEM); + } + strcpy(data, xname); + memcpy(data + name_len + 1, xvalue, xsize); + + atomic_set(&xd->refcnt, 1); + xd->xid = ++c->highest_xid; + xd->flags |= JFFS2_XFLAGS_HOT; + xd->xprefix = xprefix; + + xd->hashkey = hashkey; + xd->xname = data; + xd->xvalue = data + name_len + 1; + xd->name_len = name_len; + xd->value_len = xsize; + xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len); + + rc = save_xattr_datum(c, xd); + if (rc) { + kfree(xd->xname); + jffs2_free_xattr_datum(xd); + return ERR_PTR(rc); + } + + /* Insert Hash Index */ + i = hashkey % XATTRINDEX_HASHSIZE; + list_add(&xd->xindex, &c->xattrindex[i]); + + c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len); + reclaim_xattr_datum(c); + + return xd; +} + +static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + if (atomic_dec_and_lock(&xd->refcnt, &c->erase_completion_lock)) { + unload_xattr_datum(c, xd); + xd->flags |= JFFS2_XFLAGS_DEAD; + if (xd->node == (void *)xd) { + BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID)); + jffs2_free_xattr_datum(xd); + } else { + list_add(&xd->xindex, &c->xattr_dead_list); + } + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", + xd->xid, xd->version); + } +} + +/* -------- xref related functions ------------------ + * verify_xattr_ref(c, ref) + * is used to load xref information from medium. Because summary data does not + * contain xid/ino, it's necessary to verify once while mounting process. + * save_xattr_ref(c, ref) + * is used to write xref to medium. If delete marker is marked, it write + * a delete marker of xref into medium. + * create_xattr_ref(c, ic, xd) + * is used to create a new xref and write to medium. + * delete_xattr_ref(c, ref) + * is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER, + * and allows GC to reclaim those physical nodes. + * jffs2_xattr_delete_inode(c, ic) + * is called to remove xrefs related to obsolete inode when inode is unlinked. + * jffs2_xattr_free_inode(c, ic) + * is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_inode(c, ic) + * is used to confirm inode does not have duplicate xattr name/value pair. + * -------------------------------------------------- */ +static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + struct jffs2_raw_xref rr; + size_t readlen; + uint32_t crc, offset, totlen; + int rc; + + spin_lock(&c->erase_completion_lock); + if (ref_flags(ref->node) != REF_UNCHECKED) + goto complete; + offset = ref_offset(ref->node); + spin_unlock(&c->erase_completion_lock); + + rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr); + if (rc || sizeof(rr) != readlen) { + JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n", + rc, sizeof(rr), readlen, offset); + return rc ? rc : -EIO; + } + /* obsolete node */ + crc = crc32(0, &rr, sizeof(rr) - 4); + if (crc != je32_to_cpu(rr.node_crc)) { + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rr.node_crc), crc); + return -EIO; + } + if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK + || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF + || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) { + JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, " + "nodetype=%#04x/%#04x, totlen=%u/%zu\n", + offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, + je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, + je32_to_cpu(rr.totlen), PAD(sizeof(rr))); + return -EIO; + } + ref->ino = je32_to_cpu(rr.ino); + ref->xid = je32_to_cpu(rr.xid); + ref->xseqno = je32_to_cpu(rr.xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); + + spin_lock(&c->erase_completion_lock); + complete: + for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL); + } + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n", + ref->ino, ref->xid, ref_offset(ref->node)); + return 0; +} + +static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_raw_xref rr; + size_t length; + uint32_t xseqno, phys_ofs = write_ofs(c); + int ret; + + rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF); + rr.totlen = cpu_to_je32(PAD(sizeof(rr))); + rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4)); + + xseqno = (c->highest_xseqno += 2); + if (is_xattr_ref_dead(ref)) { + xseqno |= XREF_DELETE_MARKER; + rr.ino = cpu_to_je32(ref->ino); + rr.xid = cpu_to_je32(ref->xid); + } else { + rr.ino = cpu_to_je32(ref->ic->ino); + rr.xid = cpu_to_je32(ref->xd->xid); + } + rr.xseqno = cpu_to_je32(xseqno); + rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4)); + + ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr); + if (ret || sizeof(rr) != length) { + JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n", + ret, sizeof(rr), length, phys_ofs); + ret = ret ? 
ret : -EIO; + if (length) + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL); + + return ret; + } + /* success */ + ref->xseqno = xseqno; + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref); + + dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid); + + return 0; +} + +static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, + struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_ref *ref; + int ret; + + ref = jffs2_alloc_xattr_ref(); + if (!ref) + return ERR_PTR(-ENOMEM); + ref->ic = ic; + ref->xd = xd; + + ret = save_xattr_ref(c, ref); + if (ret) { + jffs2_free_xattr_ref(ref); + return ERR_PTR(ret); + } + + /* Chain to inode */ + ref->next = ic->xref; + ic->xref = ref; + + return ref; /* success */ +} + +static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + + xd = ref->xd; + ref->xseqno |= XREF_DELETE_MARKER; + ref->ino = ref->ic->ino; + ref->xid = ref->xd->xid; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n", + ref->ino, ref->xid, ref->xseqno); + + unrefer_xattr_datum(c, xd); +} + +void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* It's called from jffs2_evict_inode() on inode removing. + When an inode with XATTR is removed, those XATTRs must be removed. */ + struct jffs2_xattr_ref *ref, *_ref; + + if (!ic || ic->pino_nlink > 0) + return; + + down_write(&c->xattr_sem); + for (ref = ic->xref; ref; ref = _ref) { + _ref = ref->next; + delete_xattr_ref(c, ref); + } + ic->xref = NULL; + up_write(&c->xattr_sem); +} + +void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* It's called from jffs2_free_ino_caches() until unmounting FS. */ + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, *_ref; + + down_write(&c->xattr_sem); + for (ref = ic->xref; ref; ref = _ref) { + _ref = ref->next; + xd = ref->xd; + if (atomic_dec_and_test(&xd->refcnt)) { + unload_xattr_datum(c, xd); + jffs2_free_xattr_datum(xd); + } + jffs2_free_xattr_ref(ref); + } + ic->xref = NULL; + up_write(&c->xattr_sem); +} + +static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* success of check_xattr_ref_inode() means that inode (ic) dose not have + * duplicate name/value pairs. If duplicate name/value pair would be found, + * one will be removed. 
+ */ + struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp; + int rc = 0; + + if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED)) + return 0; + down_write(&c->xattr_sem); + retry: + rc = 0; + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + if (!ref->xd->xname) { + rc = load_xattr_datum(c, ref->xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) { + if (!cmp->xd->xname) { + ref->xd->flags |= JFFS2_XFLAGS_BIND; + rc = load_xattr_datum(c, cmp->xd); + ref->xd->flags &= ~JFFS2_XFLAGS_BIND; + if (unlikely(rc > 0)) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + if (ref->xd->xprefix == cmp->xd->xprefix + && !strcmp(ref->xd->xname, cmp->xd->xname)) { + if (ref->xseqno > cmp->xseqno) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + } else { + *pref = ref->next; + delete_xattr_ref(c, ref); + } + goto retry; + } + } + } + ic->flags |= INO_FLAGS_XATTR_CHECKED; + out: + up_write(&c->xattr_sem); + + return rc; +} + +/* -------- xattr subsystem functions --------------- + * jffs2_init_xattr_subsystem(c) + * is used to initialize semaphore and list_head, and some variables. + * jffs2_find_xattr_datum(c, xid) + * is used to lookup xdatum while scanning process. + * jffs2_clear_xattr_subsystem(c) + * is used to release any xattr related objects. + * jffs2_build_xattr_subsystem(c) + * is used to associate xdatum and xref while super block building process. + * jffs2_setup_xattr_datum(c, xid, version) + * is used to insert xdatum while scanning process. + * -------------------------------------------------- */ +void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c) +{ + int i; + + for (i=0; i < XATTRINDEX_HASHSIZE; i++) + INIT_LIST_HEAD(&c->xattrindex[i]); + INIT_LIST_HEAD(&c->xattr_unchecked); + INIT_LIST_HEAD(&c->xattr_dead_list); + c->xref_dead_list = NULL; + c->xref_temp = NULL; + + init_rwsem(&c->xattr_sem); + c->highest_xid = 0; + c->highest_xseqno = 0; + c->xdatum_mem_usage = 0; + c->xdatum_mem_threshold = 32 * 1024; /* Default 32KB */ +} + +static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid) +{ + struct jffs2_xattr_datum *xd; + int i = xid % XATTRINDEX_HASHSIZE; + + /* It's only used in scanning/building process. 
*/ + BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING))); + + list_for_each_entry(xd, &c->xattrindex[i], xindex) { + if (xd->xid==xid) + return xd; + } + return NULL; +} + +void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_xattr_ref *ref, *_ref; + int i; + + for (ref=c->xref_temp; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } + + for (ref=c->xref_dead_list; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } + + for (i=0; i < XATTRINDEX_HASHSIZE; i++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + list_del(&xd->xindex); + if (xd->xname) + kfree(xd->xname); + jffs2_free_xattr_datum(xd); + } + } + + list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } + list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } +} + +#define XREF_TMPHASH_SIZE (128) +void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_ref *ref, *_ref; + struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref *raw; + int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0; + int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0; + + BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); + + /* Phase.1 : Merge same xref */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) + xref_tmphash[i] = NULL; + for (ref=c->xref_temp; ref; ref=_ref) { + struct jffs2_xattr_ref *tmp; + + _ref = ref->next; + if (ref_flags(ref->node) != REF_PRISTINE) { + if (verify_xattr_ref(c, ref)) { + BUG_ON(ref->node->next_in_ino != (void *)ref); + ref->node->next_in_ino = NULL; + jffs2_mark_node_obsolete(c, ref->node); + jffs2_free_xattr_ref(ref); + continue; + } + } + + i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE; + for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) { + if (tmp->ino == ref->ino && tmp->xid == ref->xid) + break; + } + if (tmp) { + raw = ref->node; + if (ref->xseqno > tmp->xseqno) { + tmp->xseqno = ref->xseqno; + raw->next_in_ino = tmp->node; + tmp->node = raw; + } else { + raw->next_in_ino = tmp->node->next_in_ino; + tmp->node->next_in_ino = raw; + } + jffs2_free_xattr_ref(ref); + continue; + } else { + ref->next = xref_tmphash[i]; + xref_tmphash[i] = ref; + } + } + c->xref_temp = NULL; + + /* Phase.2 : Bind xref with inode_cache and xattr_datum */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) { + for (ref=xref_tmphash[i]; ref; ref=_ref) { + xref_count++; + _ref = ref->next; + if (is_xattr_ref_dead(ref)) { + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_dead_count++; + continue; + } + /* At this point, ref->xid and ref->ino contain XID and inode number. + ref->xd and ref->ic are not valid yet. 
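(Editor's aside -- not part of the patch: the raw numbers and the bound pointers can share fields because struct jffs2_xattr_ref, in xattr.h further below, keeps {ic, ino} and {xd, xid} in unions; the scan-time values read from flash occupy the same storage that this binding phase overwrites with the inode-cache and xdatum pointers.)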
*/ + xd = jffs2_find_xattr_datum(c, ref->xid); + ic = jffs2_get_ino_cache(c, ref->ino); + if (!xd || !ic || !ic->pino_nlink) { + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n", + ref->ino, ref->xid, ref->xseqno); + ref->xseqno |= XREF_DELETE_MARKER; + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_orphan_count++; + continue; + } + ref->xd = xd; + ref->ic = ic; + atomic_inc(&xd->refcnt); + ref->next = ic->xref; + ic->xref = ref; + } + } + + /* Phase.3 : Link unchecked xdatum to xattr_unchecked list */ + for (i=0; i < XATTRINDEX_HASHSIZE; i++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + xdatum_count++; + list_del_init(&xd->xindex); + if (!atomic_read(&xd->refcnt)) { + dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n", + xd->xid, xd->version); + xd->flags |= JFFS2_XFLAGS_DEAD; + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_orphan_count++; + continue; + } + if (is_xattr_datum_unchecked(c, xd)) { + dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n", + xd->xid, xd->version); + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_unchecked_count++; + } + } + } + /* build complete */ + JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum" + " (%u unchecked, %u orphan) and " + "%u of xref (%u dead, %u orphan) found.\n", + xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, + xref_count, xref_dead_count, xref_orphan_count); +} + +struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, + uint32_t xid, uint32_t version) +{ + struct jffs2_xattr_datum *xd; + + xd = jffs2_find_xattr_datum(c, xid); + if (!xd) { + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + xd->xid = xid; + xd->version = version; + if (xd->xid > c->highest_xid) + c->highest_xid = xd->xid; + list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]); + } + return xd; +} + +/* -------- xattr subsystem functions --------------- + * xprefix_to_handler(xprefix) + * is used to translate xprefix into xattr_handler. + * jffs2_listxattr(dentry, buffer, size) + * is an implementation of listxattr handler on jffs2. + * do_jffs2_getxattr(inode, xprefix, xname, buffer, size) + * is an implementation of getxattr handler on jffs2. + * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) + * is an implementation of setxattr handler on jffs2. 
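(Editor's aside -- not part of the patch: a concrete call path may make the split clearer; "user.foo" and "bar" are made-up example names. Generic VFS code resolves the handler by its "user." prefix, passes the handler the suffix, and the handler forwards to the helpers above:

    /* Illustrative only. */
    static int example_user_xattr_roundtrip(struct dentry *dentry)
    {
            char buf[16];
            int rc;

            /* what jffs2_user_setxattr() ends up doing for "user.foo" */
            rc = do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
                                   "foo", "bar", 3, 0);
            if (rc)
                    return rc;

            /* and the matching read path used by jffs2_user_getxattr() */
            return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
                                     "foo", buf, sizeof(buf));
    }
)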
+ * -------------------------------------------------- */ +const struct xattr_handler *jffs2_xattr_handlers[] = { + &jffs2_user_xattr_handler, +#ifdef CONFIG_JFFS2_FS_SECURITY + &jffs2_security_xattr_handler, +#endif +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + &jffs2_acl_access_xattr_handler, + &jffs2_acl_default_xattr_handler, +#endif + &jffs2_trusted_xattr_handler, + NULL +}; + +static const struct xattr_handler *xprefix_to_handler(int xprefix) { + const struct xattr_handler *ret; + + switch (xprefix) { + case JFFS2_XPREFIX_USER: + ret = &jffs2_user_xattr_handler; + break; +#ifdef CONFIG_JFFS2_FS_SECURITY + case JFFS2_XPREFIX_SECURITY: + ret = &jffs2_security_xattr_handler; + break; +#endif +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + case JFFS2_XPREFIX_ACL_ACCESS: + ret = &jffs2_acl_access_xattr_handler; + break; + case JFFS2_XPREFIX_ACL_DEFAULT: + ret = &jffs2_acl_default_xattr_handler; + break; +#endif + case JFFS2_XPREFIX_TRUSTED: + ret = &jffs2_trusted_xattr_handler; + break; + default: + ret = NULL; + break; + } + return ret; +} + +ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_ref *ref, **pref; + struct jffs2_xattr_datum *xd; + const struct xattr_handler *xhandle; + ssize_t len, rc; + int retry = 0; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + down_read(&c->xattr_sem); + retry: + len = 0; + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + BUG_ON(ref->ic != ic); + xd = ref->xd; + if (!xd->xname) { + /* xdatum is unchached */ + if (!retry) { + retry = 1; + up_read(&c->xattr_sem); + down_write(&c->xattr_sem); + goto retry; + } else { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + } + xhandle = xprefix_to_handler(xd->xprefix); + if (!xhandle) + continue; + if (buffer) { + rc = xhandle->list(dentry, buffer+len, size-len, + xd->xname, xd->name_len, xd->flags); + } else { + rc = xhandle->list(dentry, NULL, 0, xd->xname, + xd->name_len, xd->flags); + } + if (rc < 0) + goto out; + len += rc; + } + rc = len; + out: + if (!retry) { + up_read(&c->xattr_sem); + } else { + up_write(&c->xattr_sem); + } + return rc; +} + +int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, + char *buffer, size_t size) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, **pref; + int rc, retry = 0; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + down_read(&c->xattr_sem); + retry: + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + BUG_ON(ref->ic!=ic); + + xd = ref->xd; + if (xd->xprefix != xprefix) + continue; + if (!xd->xname) { + /* xdatum is unchached */ + if (!retry) { + retry = 1; + up_read(&c->xattr_sem); + down_write(&c->xattr_sem); + goto retry; + } else { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) { + goto out; + } + } + } + if (!strcmp(xname, xd->xname)) { + rc = xd->value_len; + if (buffer) { + if (size < rc) { + rc = -ERANGE; + } else { + 
memcpy(buffer, xd->xvalue, rc); + } + } + goto out; + } + } + rc = -ENODATA; + out: + if (!retry) { + up_read(&c->xattr_sem); + } else { + up_write(&c->xattr_sem); + } + return rc; +} + +int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, + const char *buffer, size_t size, int flags) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, *newref, **pref; + uint32_t length, request; + int rc; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size); + rc = jffs2_reserve_space(c, request, &length, + ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); + return rc; + } + + /* Find existing xattr */ + down_write(&c->xattr_sem); + retry: + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + xd = ref->xd; + if (xd->xprefix != xprefix) + continue; + if (!xd->xname) { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + if (!strcmp(xd->xname, xname)) { + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto out; + } + if (!buffer) { + ref->ino = ic->ino; + ref->xid = xd->xid; + ref->xseqno |= XREF_DELETE_MARKER; + rc = save_xattr_ref(c, ref); + if (!rc) { + *pref = ref->next; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + unrefer_xattr_datum(c, xd); + } else { + ref->ic = ic; + ref->xd = xd; + ref->xseqno &= ~XREF_DELETE_MARKER; + } + goto out; + } + goto found; + } + } + /* not found */ + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto out; + } + if (!buffer) { + rc = -ENODATA; + goto out; + } + found: + xd = create_xattr_datum(c, xprefix, xname, buffer, size); + if (IS_ERR(xd)) { + rc = PTR_ERR(xd); + goto out; + } + up_write(&c->xattr_sem); + jffs2_complete_reservation(c); + + /* create xattr_ref */ + request = PAD(sizeof(struct jffs2_raw_xref)); + rc = jffs2_reserve_space(c, request, &length, + ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE); + down_write(&c->xattr_sem); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); + unrefer_xattr_datum(c, xd); + up_write(&c->xattr_sem); + return rc; + } + if (ref) + *pref = ref->next; + newref = create_xattr_ref(c, ic, xd); + if (IS_ERR(newref)) { + if (ref) { + ref->next = ic->xref; + ic->xref = ref; + } + rc = PTR_ERR(newref); + unrefer_xattr_datum(c, xd); + } else if (ref) { + delete_xattr_ref(c, ref); + } + out: + up_write(&c->xattr_sem); + jffs2_complete_reservation(c); + return rc; +} + +/* -------- garbage collector functions ------------- + * jffs2_garbage_collect_xattr_datum(c, xd, raw) + * is used to move xdatum into new node. + * jffs2_garbage_collect_xattr_ref(c, ref, raw) + * is used to move xref into new node. + * jffs2_verify_xattr(c) + * is used to call do_verify_xattr_datum() before garbage collecting. + * jffs2_release_xattr_datum(c, xd) + * is used to release an in-memory object of xdatum. + * jffs2_release_xattr_ref(c, ref) + * is used to release an in-memory object of xref. 
+ * -------------------------------------------------- */ +int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw) +{ + uint32_t totlen, length, old_ofs; + int rc = 0; + + down_write(&c->xattr_sem); + if (xd->node != raw) + goto out; + if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)) + goto out; + + rc = load_xattr_datum(c, xd); + if (unlikely(rc)) { + rc = (rc > 0) ? 0 : rc; + goto out; + } + old_ofs = ref_offset(xd->node); + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + xd->name_len + 1 + xd->value_len); + rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen); + goto out; + } + rc = save_xattr_datum(c, xd); + if (!rc) + dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n", + xd->xid, xd->version, old_ofs, ref_offset(xd->node)); + out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); + up_write(&c->xattr_sem); + return rc; +} + +int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw) +{ + uint32_t totlen, length, old_ofs; + int rc = 0; + + down_write(&c->xattr_sem); + BUG_ON(!ref->node); + + if (ref->node != raw) + goto out; + if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref)) + goto out; + + old_ofs = ref_offset(ref->node); + totlen = ref_totlen(c, c->gcblock, ref->node); + + rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE); + if (rc) { + JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", + __func__, rc, totlen); + rc = rc ? rc : -EBADFD; + goto out; + } + rc = save_xattr_ref(c, ref); + if (!rc) + dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n", + ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node)); + out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); + up_write(&c->xattr_sem); + return rc; +} + +int jffs2_verify_xattr(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t totlen; + int rc; + + down_write(&c->xattr_sem); + list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { + rc = do_verify_xattr_datum(c, xd); + if (rc < 0) + continue; + list_del_init(&xd->xindex); + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) != REF_UNCHECKED) + continue; + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + raw->flash_offset = ref_offset(raw) + | ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL); + } + if (xd->flags & JFFS2_XFLAGS_DEAD) + list_add(&xd->xindex, &c->xattr_dead_list); + spin_unlock(&c->erase_completion_lock); + } + up_write(&c->xattr_sem); + return list_empty(&c->xattr_unchecked) ? 
1 : 0; +} + +void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + if (atomic_read(&xd->refcnt) || xd->node != (void *)xd) + return; + + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); +} + +void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + struct jffs2_xattr_ref *tmp, **ptmp; + + if (ref->node != (void *)ref) + return; + + for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) { + if (ref == tmp) { + *ptmp = tmp->next; + break; + } + } + jffs2_free_xattr_ref(ref); +} diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h new file mode 100644 index 00000000..7be4beb3 --- /dev/null +++ b/fs/jffs2/xattr.h @@ -0,0 +1,131 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_XATTR_H_ +#define _JFFS2_FS_XATTR_H_ + +#include +#include + +#define JFFS2_XFLAGS_HOT (0x01) /* This datum is HOT */ +#define JFFS2_XFLAGS_BIND (0x02) /* This datum is not reclaimed */ +#define JFFS2_XFLAGS_DEAD (0x40) /* This datum is already dead */ +#define JFFS2_XFLAGS_INVALID (0x80) /* This datum contains crc error */ + +struct jffs2_xattr_datum +{ + void *always_null; + struct jffs2_raw_node_ref *node; + uint8_t class; + uint8_t flags; + uint16_t xprefix; /* see JFFS2_XATTR_PREFIX_* */ + + struct list_head xindex; /* chained from c->xattrindex[n] */ + atomic_t refcnt; /* # of xattr_ref refers this */ + uint32_t xid; + uint32_t version; + + uint32_t data_crc; + uint32_t hashkey; + char *xname; /* XATTR name without prefix */ + uint32_t name_len; /* length of xname */ + char *xvalue; /* XATTR value */ + uint32_t value_len; /* length of xvalue */ +}; + +struct jffs2_inode_cache; +struct jffs2_xattr_ref +{ + void *always_null; + struct jffs2_raw_node_ref *node; + uint8_t class; + uint8_t flags; /* Currently unused */ + u16 unused; + + uint32_t xseqno; + union { + struct jffs2_inode_cache *ic; /* reference to jffs2_inode_cache */ + uint32_t ino; /* only used in scanning/building */ + }; + union { + struct jffs2_xattr_datum *xd; /* reference to jffs2_xattr_datum */ + uint32_t xid; /* only used in sccanning/building */ + }; + struct jffs2_xattr_ref *next; /* chained from ic->xref_list */ +}; + +#define XREF_DELETE_MARKER (0x00000001) +static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) +{ + return ((ref->xseqno & XREF_DELETE_MARKER) != 0); +} + +#ifdef CONFIG_JFFS2_FS_XATTR + +extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); +extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); +extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c); + +extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, + uint32_t xid, uint32_t version); + +extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); + +extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw); +extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw); +extern int jffs2_verify_xattr(struct 
jffs2_sb_info *c); +extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd); +extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref); + +extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, + char *buffer, size_t size); +extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, + const char *buffer, size_t size, int flags); + +extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler jffs2_user_xattr_handler; +extern const struct xattr_handler jffs2_trusted_xattr_handler; + +extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); +#define jffs2_getxattr generic_getxattr +#define jffs2_setxattr generic_setxattr +#define jffs2_removexattr generic_removexattr + +#else + +#define jffs2_init_xattr_subsystem(c) +#define jffs2_build_xattr_subsystem(c) +#define jffs2_clear_xattr_subsystem(c) + +#define jffs2_xattr_delete_inode(c, ic) +#define jffs2_xattr_free_inode(c, ic) +#define jffs2_verify_xattr(c) (1) + +#define jffs2_xattr_handlers NULL +#define jffs2_listxattr NULL +#define jffs2_getxattr NULL +#define jffs2_setxattr NULL +#define jffs2_removexattr NULL + +#endif /* CONFIG_JFFS2_FS_XATTR */ + +#ifdef CONFIG_JFFS2_FS_SECURITY +extern int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); +extern const struct xattr_handler jffs2_security_xattr_handler; +#else +#define jffs2_init_security(inode,dir,qstr) (0) +#endif /* CONFIG_JFFS2_FS_SECURITY */ + +#endif /* _JFFS2_FS_XATTR_H_ */ diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c new file mode 100644 index 00000000..1c868194 --- /dev/null +++ b/fs/jffs2/xattr_trusted.c @@ -0,0 +1,55 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size); +} + +static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size, flags); +} + +static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; + + if (list && retlen<=list_size) { + strcpy(list, XATTR_TRUSTED_PREFIX); + strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = jffs2_trusted_listxattr, + .set = jffs2_trusted_setxattr, + .get = jffs2_trusted_getxattr +}; diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c new file mode 100644 index 00000000..916b5c96 --- /dev/null +++ b/fs/jffs2/xattr_user.c @@ -0,0 +1,55 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
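(Editor's aside -- not part of the patch: the list handlers here follow the usual two-pass protocol -- calling jffs2_listxattr() with a NULL buffer returns the required size, and a second call fills the buffer with concatenated, NUL-terminated "prefix.name" strings. A minimal caller sketch, assuming kmalloc()/kfree() from the slab allocator:

    /* Illustrative only. */
    static ssize_t example_list_xattr_names(struct dentry *dentry, char **names)
    {
            ssize_t len = jffs2_listxattr(dentry, NULL, 0);   /* size query */
            char *buf;

            if (len <= 0)
                    return len;
            buf = kmalloc(len, GFP_KERNEL);
            if (!buf)
                    return -ENOMEM;
            len = jffs2_listxattr(dentry, buf, len);          /* fill pass */
            if (len < 0) {
                    kfree(buf);
                    return len;
            }
            *names = buf;   /* concatenated, NUL-terminated attribute names */
            return len;
    }
)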
+ * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size); +} + +static int jffs2_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size, flags); +} + +static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; + + if (list && retlen <= list_size) { + strcpy(list, XATTR_USER_PREFIX); + strcpy(list + XATTR_USER_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .list = jffs2_user_listxattr, + .set = jffs2_user_setxattr, + .get = jffs2_user_getxattr +}; diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig new file mode 100644 index 00000000..57cef199 --- /dev/null +++ b/fs/jfs/Kconfig @@ -0,0 +1,50 @@ +config JFS_FS + tristate "JFS filesystem support" + select NLS + select CRC32 + help + This is a port of IBM's Journaled Filesystem . More information is + available in the file . + + If you do not intend to use the JFS filesystem, say N. + +config JFS_POSIX_ACL + bool "JFS POSIX Access Control Lists" + depends on JFS_FS + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config JFS_SECURITY + bool "JFS Security Labels" + depends on JFS_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jfs filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFS_DEBUG + bool "JFS debugging" + depends on JFS_FS + help + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + +config JFS_STATISTICS + bool "JFS statistics" + depends on JFS_FS + help + Enabling this option will cause statistics from the JFS file system + to be made available to the user in the /proc/fs/jfs/ directory. diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile new file mode 100644 index 00000000..a58fa72d --- /dev/null +++ b/fs/jfs/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the Linux JFS filesystem routines. 
+# + +obj-$(CONFIG_JFS_FS) += jfs.o + +jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ + jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ + jfs_unicode.o jfs_dtree.o jfs_inode.o \ + jfs_extent.o symlink.o jfs_metapage.o \ + jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ + resize.o xattr.o ioctl.o + +jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o + +ccflags-y := -D_JFS_4K diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c new file mode 100644 index 00000000..e5de9422 --- /dev/null +++ b/fs/jfs/acl.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) International Business Machines Corp., 2002-2004 + * Copyright (C) Andreas Gruenbacher, 2001 + * Copyright (C) Linus Torvalds, 1991, 1992 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +static struct posix_acl *jfs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *ea_name; + int size; + char *value = NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch(type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __jfs_getxattr(inode, ea_name, NULL, 0); + + if (size > 0) { + value = kmalloc(size, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = __jfs_getxattr(inode, ea_name, value, size); + } + + if (size < 0) { + if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + } else { + acl = posix_acl_from_xattr(value, size); + } + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + return acl; +} + +static int jfs_set_acl(tid_t tid, struct inode *inode, int type, + struct posix_acl *acl) +{ + char *ea_name; + int rc; + int size = 0; + char *value = NULL; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch(type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + rc = posix_acl_to_xattr(acl, value, size); + if (rc < 0) + goto out; + } + rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); +out: + kfree(value); + + if (!rc) + set_cached_acl(inode, type, acl); + + return rc; +} + +int jfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + struct posix_acl *clone; + mode_t mode; + int rc = 0; + + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + if (S_ISDIR(inode->i_mode)) { + rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); + if (rc) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) { + rc = -ENOMEM; + goto cleanup; + } + mode = inode->i_mode; + rc = posix_acl_create_masq(clone, &mode); + if (rc >= 0) { + inode->i_mode = mode; + if (rc > 0) + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, + clone); + } + posix_acl_release(clone); +cleanup: + posix_acl_release(acl); + } else + inode->i_mode &= ~current_umask(); + + JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | + inode->i_mode; + + return rc; +} + +int jfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int rc; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + rc = posix_acl_chmod_masq(clone, inode->i_mode); + if (!rc) { + tid_t tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, clone); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + } + + posix_acl_release(clone); + return rc; +} diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h new file mode 100644 index 00000000..fa92f7f1 --- /dev/null +++ b/fs/jfs/endian24.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_ENDIAN24 +#define _H_ENDIAN24 + +/* + * endian24.h: + * + * Endian conversion for 24-byte data + * + */ +#define __swab24(x) \ +({ \ + __u32 __x = (x); \ + ((__u32)( \ + ((__x & (__u32)0x000000ffUL) << 16) | \ + (__x & (__u32)0x0000ff00UL) | \ + ((__x & (__u32)0x00ff0000UL) >> 16) )); \ +}) + +#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) + #define __cpu_to_le24(x) ((__u32)(x)) + #define __le24_to_cpu(x) ((__u32)(x)) +#else + #define __cpu_to_le24(x) __swab24(x) + #define __le24_to_cpu(x) __swab24(x) +#endif + +#ifdef __KERNEL__ + #define cpu_to_le24 __cpu_to_le24 + #define le24_to_cpu __le24_to_cpu +#endif + +#endif /* !_H_ENDIAN24 */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c new file mode 100644 index 00000000..2f3f531f --- /dev/null +++ b/fs/jfs/file.c @@ -0,0 +1,154 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_dmap.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +int jfs_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int rc = 0; + + if (!(inode->i_state & I_DIRTY) || + (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); + return rc; + } + + rc |= jfs_commit_inode(inode, 1); + + return rc ? -EIO : 0; +} + +static int jfs_open(struct inode *inode, struct file *file) +{ + int rc; + + if ((rc = dquot_file_open(inode, file))) + return rc; + + /* + * We attempt to allow only one "active" file open per aggregate + * group. Otherwise, appending to files in parallel can cause + * fragmentation within the files. + * + * If the file is empty, it was probably just created and going + * to be written to. If it has a size, we'll hold off until the + * file is actually grown. 
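+ * (The code below records this inode's allocation group in active_ag and
+ * bumps db_active[] for that group; jfs_release() drops the count again.)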
+ */ + if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && + (inode->i_size == 0)) { + struct jfs_inode_info *ji = JFS_IP(inode); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + } + spin_unlock_irq(&ji->ag_lock); + } + + return 0; +} +static int jfs_release(struct inode *inode, struct file *file) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + + return 0; +} + +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + rc = vmtruncate(inode, iattr->ia_size); + if (rc) + return rc; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) + rc = jfs_acl_chmod(inode); + return rc; +} + +const struct inode_operations jfs_file_inode_operations = { + .truncate = jfs_truncate, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .check_acl = jfs_check_acl, +#endif +}; + +const struct file_operations jfs_file_operations = { + .open = jfs_open, + .llseek = generic_file_llseek, + .write = do_sync_write, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .fsync = jfs_fsync, + .release = jfs_release, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif +}; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c new file mode 100644 index 00000000..10965590 --- /dev/null +++ b/fs/jfs/inode.c @@ -0,0 +1,414 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_extent.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + + +struct inode *jfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ret = diRead(inode); + if (ret < 0) { + iget_failed(inode); + return ERR_PTR(ret); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &jfs_file_inode_operations; + inode->i_fop = &jfs_file_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &jfs_dir_inode_operations; + inode->i_fop = &jfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_size >= IDATASIZE) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else { + inode->i_op = &jfs_fast_symlink_inode_operations; + /* + * The inline data should be null-terminated, but + * don't let on-disk corruption crash the kernel + */ + JFS_IP(inode)->i_inline[inode->i_size] = '\0'; + } + } else { + inode->i_op = &jfs_file_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } + unlock_new_inode(inode); + return inode; +} + +/* + * Workhorse of both fsync & write_inode + */ +int jfs_commit_inode(struct inode *inode, int wait) +{ + int rc = 0; + tid_t tid; + static int noisy = 5; + + jfs_info("In jfs_commit_inode, inode = 0x%p", inode); + + /* + * Don't commit if inode has been committed since last being + * marked dirty, or if it has been deleted. + */ + if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode)) + return 0; + + if (isReadOnly(inode)) { + /* kernel allows writes to devices on read-only + * partitions and may think inode is dirty + */ + if (!special_file(inode->i_mode) && noisy) { + jfs_err("jfs_commit_inode(0x%p) called on " + "read-only volume", inode); + jfs_err("Is remount racy?"); + noisy--; + } + return 0; + } + + tid = txBegin(inode->i_sb, COMMIT_INODE); + mutex_lock(&JFS_IP(inode)->commit_mutex); + + /* + * Retest inode state after taking commit_mutex + */ + if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode)) + rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0); + + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int wait = wbc->sync_mode == WB_SYNC_ALL; + + if (test_cflag(COMMIT_Nolink, inode)) + return 0; + /* + * If COMMIT_DIRTY is not set, the inode isn't really dirty. + * It has been committed since the last change, but was still + * on the dirty inode list. 
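+ * The jfs_flush_journal() call below only needs to push those earlier
+ * commits out to disk.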
+ */ + if (!test_cflag(COMMIT_Dirty, inode)) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); + return 0; + } + + if (jfs_commit_inode(inode, wait)) { + jfs_err("jfs_write_inode: jfs_commit_inode failed!"); + return -EIO; + } else + return 0; +} + +void jfs_evict_inode(struct inode *inode) +{ + jfs_info("In jfs_evict_inode, inode = 0x%p", inode); + + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + + if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + truncate_inode_pages(&inode->i_data, 0); + + if (test_cflag(COMMIT_Freewmap, inode)) + jfs_free_zero_link(inode); + + diFree(inode); + + /* + * Free the inode from the quota allocation. + */ + dquot_initialize(inode); + dquot_free_inode(inode); + } + } else { + truncate_inode_pages(&inode->i_data, 0); + } + end_writeback(inode); + dquot_drop(inode); +} + +void jfs_dirty_inode(struct inode *inode, int flags) +{ + static int noisy = 5; + + if (isReadOnly(inode)) { + if (!special_file(inode->i_mode) && noisy) { + /* kernel allows writes to devices on read-only + * partitions and may try to mark inode dirty + */ + jfs_err("jfs_dirty_inode called on read-only volume"); + jfs_err("Is remount racy?"); + noisy--; + } + return; + } + + set_cflag(COMMIT_Dirty, inode); +} + +int jfs_get_block(struct inode *ip, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + s64 lblock64 = lblock; + int rc = 0; + xad_t xad; + s64 xaddr; + int xflag; + s32 xlen = bh_result->b_size >> ip->i_blkbits; + + /* + * Take appropriate lock on inode + */ + if (create) + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + else + IREAD_LOCK(ip, RDWRLOCK_NORMAL); + + if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && + (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && + xaddr) { + if (xflag & XAD_NOTRECORDED) { + if (!create) + /* + * Allocated but not recorded, read treats + * this as a hole + */ + goto unlock; +#ifdef _JFS_4K + XADoffset(&xad, lblock64); + XADlength(&xad, xlen); + XADaddress(&xad, xaddr); +#else /* _JFS_4K */ + /* + * As long as block size = 4K, this isn't a problem. + * We should mark the whole page not ABNR, but how + * will we know to mark the other blocks BH_New? 
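+ * The non-4K case is simply not implemented, hence the BUG() below.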
+ */ + BUG(); +#endif /* _JFS_4K */ + rc = extRecord(ip, &xad); + if (rc) + goto unlock; + set_buffer_new(bh_result); + } + + map_bh(bh_result, ip->i_sb, xaddr); + bh_result->b_size = xlen << ip->i_blkbits; + goto unlock; + } + if (!create) + goto unlock; + + /* + * Allocate a new block + */ +#ifdef _JFS_4K + if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) + goto unlock; + rc = extAlloc(ip, xlen, lblock64, &xad, false); + if (rc) + goto unlock; + + set_buffer_new(bh_result); + map_bh(bh_result, ip->i_sb, addressXAD(&xad)); + bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; + +#else /* _JFS_4K */ + /* + * We need to do whatever it takes to keep all but the last buffers + * in 4K pages - see jfs_write.c + */ + BUG(); +#endif /* _JFS_4K */ + + unlock: + /* + * Release lock on inode + */ + if (create) + IWRITE_UNLOCK(ip); + else + IREAD_UNLOCK(ip); + return rc; +} + +static int jfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, jfs_get_block, wbc); +} + +static int jfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, jfs_get_block); +} + +static int jfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, jfs_get_block); +} + +static int jfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); +} + +static int jfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + jfs_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t jfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, jfs_get_block); +} + +static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, jfs_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +const struct address_space_operations jfs_aops = { + .readpage = jfs_readpage, + .readpages = jfs_readpages, + .writepage = jfs_writepage, + .writepages = jfs_writepages, + .write_begin = jfs_write_begin, + .write_end = nobh_write_end, + .bmap = jfs_bmap, + .direct_IO = jfs_direct_IO, +}; + +/* + * Guts of jfs_truncate. Called with locks already held. Can be called + * with directory for truncating directory index table. + */ +void jfs_truncate_nolock(struct inode *ip, loff_t length) +{ + loff_t newsize; + tid_t tid; + + ASSERT(length >= 0); + + if (test_cflag(COMMIT_Nolink, ip)) { + xtTruncate(0, ip, length, COMMIT_WMAP); + return; + } + + do { + tid = txBegin(ip->i_sb, 0); + + /* + * The commit_mutex cannot be taken before txBegin. 
+ * txBegin may block and there is a chance the inode + * could be marked dirty and need to be committed + * before txBegin unblocks + */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + newsize = xtTruncate(tid, ip, length, + COMMIT_TRUNCATE | COMMIT_PWMAP); + if (newsize < 0) { + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + break; + } + + ip->i_mtime = ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(ip); + + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } while (newsize > length); /* Truncate isn't always atomic */ +} + +void jfs_truncate(struct inode *ip) +{ + jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); + + nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + jfs_truncate_nolock(ip, ip->i_size); + IWRITE_UNLOCK(ip); +} diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c new file mode 100644 index 00000000..6f98a186 --- /dev/null +++ b/fs/jfs/ioctl.c @@ -0,0 +1,148 @@ +/* + * linux/fs/jfs/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_dinode.h" +#include "jfs_inode.h" + + +static struct { + long jfs_flag; + long ext2_flag; +} jfs_map[] = { + {JFS_NOATIME_FL, FS_NOATIME_FL}, + {JFS_DIRSYNC_FL, FS_DIRSYNC_FL}, + {JFS_SYNC_FL, FS_SYNC_FL}, + {JFS_SECRM_FL, FS_SECRM_FL}, + {JFS_UNRM_FL, FS_UNRM_FL}, + {JFS_APPEND_FL, FS_APPEND_FL}, + {JFS_IMMUTABLE_FL, FS_IMMUTABLE_FL}, + {0, 0}, +}; + +static long jfs_map_ext2(unsigned long flags, int from) +{ + int index=0; + long mapped=0; + + while (jfs_map[index].jfs_flag) { + if (from) { + if (jfs_map[index].ext2_flag & flags) + mapped |= jfs_map[index].jfs_flag; + } else { + if (jfs_map[index].jfs_flag & flags) + mapped |= jfs_map[index].ext2_flag; + } + index++; + } + return mapped; +} + + +long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct jfs_inode_info *jfs_inode = JFS_IP(inode); + unsigned int flags; + + switch (cmd) { + case JFS_IOC_GETFLAGS: + jfs_get_inode_flags(jfs_inode); + flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; + flags = jfs_map_ext2(flags, 0); + return put_user(flags, (int __user *) arg); + case JFS_IOC_SETFLAGS: { + unsigned int oldflags; + int err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto setflags_out; + } + + flags = jfs_map_ext2(flags, 1); + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + + /* Lock against other parallel changes of flags */ + mutex_lock(&inode->i_mutex); + + jfs_get_inode_flags(jfs_inode); + oldflags = jfs_inode->mode2; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. 
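+ * (CAP_LINUX_IMMUTABLE, which is checked just below before either
+ * flag may be set or cleared.)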
+ */ + if ((oldflags & JFS_IMMUTABLE_FL) || + ((flags ^ oldflags) & + (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + err = -EPERM; + goto setflags_out; + } + } + + flags = flags & JFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; + + jfs_set_inode_flags(inode); + mutex_unlock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* While these ioctl numbers defined with 'long' and have different + * numbers than the 64bit ABI, + * the actual implementation only deals with ints and is compatible. + */ + switch (cmd) { + case JFS_IOC_GETFLAGS32: + cmd = JFS_IOC_GETFLAGS; + break; + case JFS_IOC_SETFLAGS32: + cmd = JFS_IOC_SETFLAGS; + break; + } + return jfs_ioctl(filp, cmd, arg); +} +#endif diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h new file mode 100644 index 00000000..f9285c49 --- /dev/null +++ b/fs/jfs/jfs_acl.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) International Business Machines Corp., 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_ACL +#define _H_JFS_ACL + +#ifdef CONFIG_JFS_POSIX_ACL + +int jfs_check_acl(struct inode *, int, unsigned int flags); +int jfs_init_acl(tid_t, struct inode *, struct inode *); +int jfs_acl_chmod(struct inode *inode); + +#else + +static inline int jfs_init_acl(tid_t tid, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int jfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +#endif +#endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h new file mode 100644 index 00000000..79c61805 --- /dev/null +++ b/fs/jfs/jfs_btree.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_BTREE +#define _H_JFS_BTREE + +/* + * jfs_btree.h: B+-tree + * + * JFS B+-tree (dtree and xtree) common definitions + */ + +/* + * basic btree page - btpage + * +struct btpage { + s64 next; right sibling bn + s64 prev; left sibling bn + + u8 flag; + u8 rsrvd[7]; type specific + s64 self; self address + + u8 entry[4064]; +}; */ + +/* btpaget_t flag */ +#define BT_TYPE 0x07 /* B+-tree index */ +#define BT_ROOT 0x01 /* root page */ +#define BT_LEAF 0x02 /* leaf page */ +#define BT_INTERNAL 0x04 /* internal page */ +#define BT_RIGHTMOST 0x10 /* rightmost page */ +#define BT_LEFTMOST 0x20 /* leftmost page */ +#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */ + +/* btorder (in inode) */ +#define BT_RANDOM 0x0000 +#define BT_SEQUENTIAL 0x0001 +#define BT_LOOKUP 0x0010 +#define BT_INSERT 0x0020 +#define BT_DELETE 0x0040 + +/* + * btree page buffer cache access + */ +#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0) + +/* get page from buffer page */ +#define BT_PAGE(IP, MP, TYPE, ROOT)\ + (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data) + +/* get the page buffer and the page for specified block address */ +#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\ +{\ + if ((BN) == 0)\ + {\ + MP = (struct metapage *)&JFS_IP(IP)->bxflag;\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + RC = 0;\ + }\ + else\ + {\ + MP = read_metapage((IP), BN, SIZE, 1);\ + if (MP) {\ + RC = 0;\ + P = (MP)->data;\ + } else {\ + P = NULL;\ + jfs_err("bread failed!");\ + RC = -EIO;\ + }\ + }\ +} + +#define BT_MARK_DIRTY(MP, IP)\ +{\ + if (BT_IS_ROOT(MP))\ + mark_inode_dirty(IP);\ + else\ + mark_metapage_dirty(MP);\ +} + +/* put the page buffer */ +#define BT_PUTPAGE(MP)\ +{\ + if (! BT_IS_ROOT(MP)) \ + release_metapage(MP); \ +} + + +/* + * btree traversal stack + * + * record the path traversed during the search; + * top frame record the leaf page/entry selected. + */ +struct btframe { /* stack frame */ + s64 bn; /* 8: */ + s16 index; /* 2: */ + s16 lastindex; /* 2: unused */ + struct metapage *mp; /* 4/8: */ +}; /* (16/24) */ + +struct btstack { + struct btframe *top; + int nsplit; + struct btframe stack[MAXTREEHEIGHT]; +}; + +#define BT_CLR(btstack)\ + (btstack)->top = (btstack)->stack + +#define BT_STACK_FULL(btstack)\ + ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1])) + +#define BT_PUSH(BTSTACK, BN, INDEX)\ +{\ + assert(!BT_STACK_FULL(BTSTACK));\ + (BTSTACK)->top->bn = BN;\ + (BTSTACK)->top->index = INDEX;\ + ++(BTSTACK)->top;\ +} + +#define BT_POP(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top ) + +#define BT_STACK(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top ) + +static inline void BT_STACK_DUMP(struct btstack *btstack) +{ + int i; + printk("btstack dump:\n"); + for (i = 0; i < MAXTREEHEIGHT; i++) + printk(KERN_ERR "bn = %Lx, index = %d\n", + (long long)btstack->stack[i].bn, + btstack->stack[i].index); +} + +/* retrieve search results */ +#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\ +{\ + BN = (LEAF)->bn;\ + MP = (LEAF)->mp;\ + if (BN)\ + P = (TYPE *)MP->data;\ + else\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + INDEX = (LEAF)->index;\ +} + +/* put the page buffer of search */ +#define BT_PUTSEARCH(BTSTACK)\ +{\ + if (! 
BT_IS_ROOT((BTSTACK)->top->mp))\ + release_metapage((BTSTACK)->top->mp);\ +} +#endif /* _H_JFS_BTREE */ diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c new file mode 100644 index 00000000..dd824d9b --- /dev/null +++ b/fs/jfs/jfs_debug.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_debug.h" + +#ifdef PROC_FS_JFS /* see jfs_debug.h */ + +static struct proc_dir_entry *base; +#ifdef CONFIG_JFS_DEBUG +static int jfs_loglevel_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", jfsloglevel); + return 0; +} + +static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_loglevel_proc_show, NULL); +} + +static ssize_t jfs_loglevel_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + + if (get_user(c, buffer)) + return -EFAULT; + + /* yes, I know this is an ASCIIism. 
--hch */ + if (c < '0' || c > '9') + return -EINVAL; + jfsloglevel = c - '0'; + return count; +} + +static const struct file_operations jfs_loglevel_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_loglevel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = jfs_loglevel_proc_write, +}; +#endif + +static struct { + const char *name; + const struct file_operations *proc_fops; +} Entries[] = { +#ifdef CONFIG_JFS_STATISTICS + { "lmstats", &jfs_lmstats_proc_fops, }, + { "txstats", &jfs_txstats_proc_fops, }, + { "xtstat", &jfs_xtstat_proc_fops, }, + { "mpstat", &jfs_mpstat_proc_fops, }, +#endif +#ifdef CONFIG_JFS_DEBUG + { "TxAnchor", &jfs_txanchor_proc_fops, }, + { "loglevel", &jfs_loglevel_proc_fops } +#endif +}; +#define NPROCENT ARRAY_SIZE(Entries) + +void jfs_proc_init(void) +{ + int i; + + if (!(base = proc_mkdir("fs/jfs", NULL))) + return; + + for (i = 0; i < NPROCENT; i++) + proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); +} + +void jfs_proc_clean(void) +{ + int i; + + if (base) { + for (i = 0; i < NPROCENT; i++) + remove_proc_entry(Entries[i].name, base); + remove_proc_entry("fs/jfs", NULL); + } +} + +#endif /* PROC_FS_JFS */ diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h new file mode 100644 index 00000000..eafd1300 --- /dev/null +++ b/fs/jfs/jfs_debug.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DEBUG +#define _H_JFS_DEBUG + +/* + * jfs_debug.h + * + * global debug message, data structure/macro definitions + * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS; + */ + +/* + * Create /proc/fs/jfs if procfs is enabled andeither + * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined + */ +#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS)) +#define PROC_FS_JFS +extern void jfs_proc_init(void); +extern void jfs_proc_clean(void); +#endif + +/* + * assert with traditional printf/panic + */ +#define assert(p) do { \ + if (!(p)) { \ + printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ + __FILE__, __LINE__, #p); \ + BUG(); \ + } \ +} while (0) + +/* + * debug ON + * -------- + */ +#ifdef CONFIG_JFS_DEBUG +#define ASSERT(p) assert(p) + +/* printk verbosity */ +#define JFS_LOGLEVEL_ERR 1 +#define JFS_LOGLEVEL_WARN 2 +#define JFS_LOGLEVEL_DEBUG 3 +#define JFS_LOGLEVEL_INFO 4 + +extern int jfsloglevel; + +extern const struct file_operations jfs_txanchor_proc_fops; + +/* information message: e.g., configuration, major event */ +#define jfs_info(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_INFO) \ + printk(KERN_INFO fmt "\n", ## arg); \ +} while (0) + +/* debug message: ad hoc */ +#define jfs_debug(fmt, arg...) 
do { \ + if (jfsloglevel >= JFS_LOGLEVEL_DEBUG) \ + printk(KERN_DEBUG fmt "\n", ## arg); \ +} while (0) + +/* warn message: */ +#define jfs_warn(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_WARN) \ + printk(KERN_WARNING fmt "\n", ## arg); \ +} while (0) + +/* error event message: e.g., i/o error */ +#define jfs_err(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_ERR) \ + printk(KERN_ERR fmt "\n", ## arg); \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* CONFIG_JFS_DEBUG */ +#define ASSERT(p) do {} while (0) +#define jfs_info(fmt, arg...) do {} while (0) +#define jfs_debug(fmt, arg...) do {} while (0) +#define jfs_warn(fmt, arg...) do {} while (0) +#define jfs_err(fmt, arg...) do {} while (0) +#endif /* CONFIG_JFS_DEBUG */ + +/* + * statistics + * ---------- + */ +#ifdef CONFIG_JFS_STATISTICS +extern const struct file_operations jfs_lmstats_proc_fops; +extern const struct file_operations jfs_txstats_proc_fops; +extern const struct file_operations jfs_mpstat_proc_fops; +extern const struct file_operations jfs_xtstat_proc_fops; + +#define INCREMENT(x) ((x)++) +#define DECREMENT(x) ((x)--) +#define HIGHWATERMARK(x,y) ((x) = max((x), (y))) +#else +#define INCREMENT(x) +#define DECREMENT(x) +#define HIGHWATERMARK(x,y) +#endif /* CONFIG_JFS_STATISTICS */ + +#endif /* _H_JFS_DEBUG */ diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h new file mode 100644 index 00000000..395c4c0d --- /dev/null +++ b/fs/jfs/jfs_dinode.h @@ -0,0 +1,176 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DINODE +#define _H_JFS_DINODE + +/* + * jfs_dinode.h: on-disk inode manager + */ + +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ + + +/* + * on-disk inode : 512 bytes + * + * note: align 64-bit fields on 8-byte boundary. + */ +struct dinode { + /* + * I. 
base area (128 bytes) + * ------------------------ + * + * define generic/POSIX attributes + */ + __le32 di_inostamp; /* 4: stamp to show inode belongs to fileset */ + __le32 di_fileset; /* 4: fileset number */ + __le32 di_number; /* 4: inode number, aka file serial number */ + __le32 di_gen; /* 4: inode generation number */ + + pxd_t di_ixpxd; /* 8: inode extent descriptor */ + + __le64 di_size; /* 8: size */ + __le64 di_nblocks; /* 8: number of blocks allocated */ + + __le32 di_nlink; /* 4: number of links to the object */ + + __le32 di_uid; /* 4: user id of owner */ + __le32 di_gid; /* 4: group id of owner */ + + __le32 di_mode; /* 4: attribute, format and permission */ + + struct timestruc_t di_atime; /* 8: time last data accessed */ + struct timestruc_t di_ctime; /* 8: time last status changed */ + struct timestruc_t di_mtime; /* 8: time last data modified */ + struct timestruc_t di_otime; /* 8: time created */ + + dxd_t di_acl; /* 16: acl descriptor */ + + dxd_t di_ea; /* 16: ea descriptor */ + + __le32 di_next_index; /* 4: Next available dir_table index */ + + __le32 di_acltype; /* 4: Type of ACL */ + + /* + * Extension Areas. + * + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. + */ + union { + struct { + /* + * This table contains the information needed to + * find a directory entry from a 32-bit index. + * If the index is small enough, the table is inline, + * otherwise, an x-tree root overlays this table + */ + struct dir_table_slot _table[12]; /* 96: inline */ + + dtroot_t _dtroot; /* 288: dtree root */ + } _dir; /* (384) */ +#define di_dirtable u._dir._table +#define di_dtroot u._dir._dtroot +#define di_parent di_dtroot.header.idotdot +#define di_DASD di_dtroot.header.DASD + + struct { + union { + u8 _data[96]; /* 96: unused */ + struct { + void *_imap; /* 4: unused */ + __le32 _gengen; /* 4: generator */ + } _imap; + } _u1; /* 96: */ +#define di_gengen u._file._u1._imap._gengen + + union { + xtpage_t _xtroot; + struct { + u8 unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + union { + __le32 _rdev; /* 4: */ + u8 _fastsymlink[128]; + } _u; + u8 _inlineea[128]; + } _special; + } _u2; + } _file; +#define di_xtroot u._file._u2._xtroot +#define di_dxd u._file._u2._special._dxd +#define di_btroot di_xtroot +#define di_inlinedata u._file._u2._special._u +#define di_rdev u._file._u2._special._u._rdev +#define di_fastsymlink u._file._u2._special._u._fastsymlink +#define di_inlineea u._file._u2._special._inlineea + } u; +}; + +/* extended mode bits (on-disk inode di_mode) */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ +#define ISWAPFILE 0x00800000 /* file open for pager swap space */ + +/* more extended mode bits: attributes for OS/2 */ +#define IREADONLY 0x02000000 /* no write access to file */ +#define IHIDDEN 0x04000000 /* hidden file */ +#define ISYSTEM 0x08000000 /* system file */ + +#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */ +#define IARCHIVE 0x40000000 /* file archive bit */ +#define INEWNAME 0x80000000 /* non-8.3 
filename format */ + +#define IRASH 0x4E000000 /* mask for changeable attributes */ +#define ATTRSHIFT 25 /* bits to shift to move attribute + specification to mode position */ + +/* extended attributes for Linux */ + +#define JFS_NOATIME_FL 0x00080000 /* do not update atime */ + +#define JFS_DIRSYNC_FL 0x00100000 /* dirsync behaviour */ +#define JFS_SYNC_FL 0x00200000 /* Synchronous updates */ +#define JFS_SECRM_FL 0x00400000 /* Secure deletion */ +#define JFS_UNRM_FL 0x00800000 /* allow for undelete */ + +#define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ +#define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ + +#define JFS_FL_USER_VISIBLE 0x03F80000 +#define JFS_FL_USER_MODIFIABLE 0x03F80000 +#define JFS_FL_INHERIT 0x03C80000 + +/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ +#define JFS_IOC_GETFLAGS _IOR('f', 1, long) +#define JFS_IOC_SETFLAGS _IOW('f', 2, long) + +#define JFS_IOC_GETFLAGS32 _IOR('f', 1, int) +#define JFS_IOC_SETFLAGS32 _IOW('f', 2, int) + +#endif /*_H_JFS_DINODE */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c new file mode 100644 index 00000000..4496872c --- /dev/null +++ b/fs/jfs/jfs_dmap.c @@ -0,0 +1,3992 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_lock.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * SERIALIZATION of the Block Allocation Map. + * + * the working state of the block allocation map is accessed in + * two directions: + * + * 1) allocation and free requests that start at the dmap + * level and move up through the dmap control pages (i.e. + * the vast majority of requests). + * + * 2) allocation requests that start at dmap control page + * level and work down towards the dmaps. + * + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers and each requests holds onto its buffers + * as it works it way up from a single dmap to the required level + * of dmap control page. + * requests that start at the top are serialized against each other + * and request that start from the bottom by the multiple read/single + * write inode lock of the bmap inode. requests starting at the top + * take this lock in write mode while request starting at the bottom + * take the lock in read mode. a single top-down request may proceed + * exclusively while multiple bottoms-up requests may proceed + * simultaneously (under the protection of busy buffers). 
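+ *
+ * in short: requests that start at the dmaps take the bmap inode lock in
+ * read mode, requests that start at the dmap control pages take it in write
+ * mode; see the IREAD_LOCK/IWRITE_LOCK calls in dbAlloc() and dbFree().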
+ * + * in addition to information found in dmaps and dmap control pages, + * the working state of the block allocation map also includes read/ + * write information maintained in the bmap descriptor (i.e. total + * free block count, allocation group level free block counts). + * a single exclusive lock (BMAP_LOCK) is used to guard this information + * in the face of multiple-bottoms up requests. + * (lock ordering: IREAD_LOCK, BMAP_LOCK); + * + * accesses to the persistent state of the block allocation map (limited + * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. + */ + +#define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock) +#define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock) +#define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock) + +/* + * forward references + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); +static int dbBackSplit(dmtree_t * tp, int leafno); +static int dbJoin(dmtree_t * tp, int leafno, int newval); +static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, + int level); +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks, + int l2nb, s64 * results); +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, + int l2nb, + s64 * results); +static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, + s64 * results); +static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, + s64 * results); +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); +static int dbFindBits(u32 word, int l2nb); +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbMaxBud(u8 * cp); +s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +static int blkstol2(s64 nb); + +static int cntlz(u32 value); +static int cnttz(u32 word); + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); +static int dbInitDmapTree(struct dmap * dp); +static int dbInitTree(struct dmaptree * dtp); +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); +static int dbGetL2AGSize(s64 nblocks); + +/* + * buddy table + * + * table used for determining buddy sizes within characters of + * dmap bitmap words. the characters themselves serve as indexes + * into the table, with the table elements yielding the maximum + * binary buddy of free bits within the character. 
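+ *
+ * e.g. budtab[0x00] = 3 (all eight bits free: the largest buddy covers 2^3
+ * blocks), budtab[0x0F] = 2 (an aligned run of four free bits: 2^2), and
+ * budtab[0xFF] = -1 (no free bits at all); a clear bit marks a free block.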
+ */ +static const s8 budtab[256] = { + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 +}; + + +/* + * NAME: dbMount() + * + * FUNCTION: initializate the block allocation map. + * + * memory is allocated for the in-core bmap descriptor and + * the in-core descriptor is initialized from disk. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error + */ +int dbMount(struct inode *ipbmap) +{ + struct bmap *bmp; + struct dbmap_disk *dbmp_le; + struct metapage *mp; + int i; + + /* + * allocate/initialize the in-memory bmap descriptor + */ + /* allocate memory for the in-memory bmap descriptor */ + bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); + if (bmp == NULL) + return -ENOMEM; + + /* read the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(bmp); + return -EIO; + } + + /* copy the on-disk bmap descriptor to its in-memory version. */ + dbmp_le = (struct dbmap_disk *) mp->data; + bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); + bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); + bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); + bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); + bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); + bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); + bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + for (i = 0; i < MAXAG; i++) + bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); + bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); + bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; + + /* release the buffer. */ + release_metapage(mp); + + /* bind the bmap inode and the bmap descriptor to each other. */ + bmp->db_ipbmap = ipbmap; + JFS_SBI(ipbmap->i_sb)->bmap = bmp; + + memset(bmp->db_active, 0, sizeof(bmp->db_active)); + + /* + * allocate/initialize the bmap lock + */ + BMAP_LOCK_INIT(bmp); + + return (0); +} + + +/* + * NAME: dbUnmount() + * + * FUNCTION: terminate the block allocation map in preparation for + * file system unmount. + * + * the in-core bmap descriptor is written to disk and + * the memory for this descriptor is freed. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. 
+ * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbUnmount(struct inode *ipbmap, int mounterror) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + if (!(mounterror || isReadOnly(ipbmap))) + dbSync(ipbmap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipbmap->i_mapping, 0); + + /* free the memory for the in-memory bmap. */ + kfree(bmp); + + return (0); +} + +/* + * dbSync() + */ +int dbSync(struct inode *ipbmap) +{ + struct dbmap_disk *dbmp_le; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *mp; + int i; + + /* + * write bmap global control page + */ + /* get the buffer for the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("dbSync: read_metapage failed!"); + return -EIO; + } + /* copy the in-memory version of the bmap to the on-disk version */ + dbmp_le = (struct dbmap_disk *) mp->data; + dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); + dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); + dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); + dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); + dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); + dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); + dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); + dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); + dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight); + dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); + dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); + dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); + for (i = 0; i < MAXAG; i++) + dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); + dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); + dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; + + /* write the buffer */ + write_metapage(mp); + + /* + * write out dirty pages of bmap + */ + filemap_write_and_wait(ipbmap->i_mapping); + + diWriteSpecial(ipbmap, 0); + + return (0); +} + + +/* + * NAME: dbFree() + * + * FUNCTION: free the specified block range from the working block + * allocation map. + * + * the blocks will be free from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbFree(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be freed better be within the mapsize. */ + if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { + IREAD_UNLOCK(ipbmap); + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ip->i_sb, + "dbFree: block to be freed is outside the map"); + return -EIO; + } + + /* + * free the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be freed from + * this dmap. 
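+ * (either all of the remaining blocks, or only those up to the
+ * end of this dmap, whichever is smaller.)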
+ */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* free the blocks. */ + if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + jfs_error(ip->i_sb, "dbFree: error in block map\n"); + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +/* + * NAME: dbUpdatePMap() + * + * FUNCTION: update the allocation state (free or allocate) of the + * specified block range in the persistent block allocation map. + * + * the blocks will be updated in the persistent map one + * dmap at a time. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int +dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk) +{ + int nblks, dbitno, wbitno, rbits; + int word, nbits, nwords; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + s64 lblkno, rem, lastlblkno; + u32 mask; + struct dmap *dp; + struct metapage *mp; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + /* the blocks better be within the mapsize. */ + if (blkno + nblocks > bmp->db_mapsize) { + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ipbmap->i_sb, + "dbUpdatePMap: blocks are outside the map"); + return -EIO; + } + + /* compute delta of transaction lsn from log syncpt */ + lsn = tblk->lsn; + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + logdiff(difft, lsn, log); + + /* + * update the block state a dmap at a time. + */ + mp = NULL; + lastlblkno = 0; + for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + if (lblkno != lastlblkno) { + if (mp) { + write_metapage(mp); + } + + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, + 0); + if (mp == NULL) + return -EIO; + metapage_wait_for_io(mp); + } + dp = (struct dmap *) mp->data; + + /* determine the bit number and word within the dmap of + * the starting block. also determine how many blocks + * are to be updated within this dmap. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + nblks = min(rem, (s64)BPERDMAP - dbitno); + + /* update the bits of the dmap words. the first and last + * words may only have a subset of their bits updated. if + * this is the case, we'll work against that word (i.e. + * partial first and/or last) only in a single pass. a + * single pass will also be used to update all words that + * are to have all their bits updated. + */ + for (rbits = nblks; rbits > 0; + rbits -= nbits, dbitno += nbits) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nbits = min(rbits, DBWORD - wbitno); + + /* check if only part of the word is to be updated. */ + if (nbits < DBWORD) { + /* update (free or allocate) the bits + * in this word. + */ + mask = + (ONES << (DBWORD - nbits) >> wbitno); + if (free) + dp->pmap[word] &= + cpu_to_le32(~mask); + else + dp->pmap[word] |= + cpu_to_le32(mask); + + word += 1; + } else { + /* one or more words are to have all + * their bits updated. 
determine how + * many words and how many bits. + */ + nwords = rbits >> L2DBWORD; + nbits = nwords << L2DBWORD; + + /* update (free or allocate) the bits + * in these words. + */ + if (free) + memset(&dp->pmap[word], 0, + nwords * 4); + else + memset(&dp->pmap[word], (int) ONES, + nwords * 4); + + word += nwords; + } + } + + /* + * update dmap lsn + */ + if (lblkno == lastlblkno) + continue; + + lastlblkno = lblkno; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + + /* move bp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + + /* inherit younger/larger clsn */ + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + + /* insert bp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + } + + /* write the last buffer. */ + if (mp) { + write_metapage(mp); + } + + return (0); +} + + +/* + * NAME: dbNextAG() + * + * FUNCTION: find the preferred allocation group for new allocations. + * + * Within the allocation groups, we maintain a preferred + * allocation group which consists of a group with at least + * average free space. It is the preferred group that we target + * new inode allocation towards. The tie-in between inode + * allocation and block allocation occurs as we allocate the + * first (data) block of an inode and specify the inode (block) + * as the allocation hint for this block. + * + * We try to avoid having more than one open file growing in + * an allocation group, as this will lead to fragmentation. + * This differs from the old OS/2 method of trying to keep + * empty ags around for large allocations. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * the preferred allocation group number. + */ +int dbNextAG(struct inode *ipbmap) +{ + s64 avgfree; + int agpref; + s64 hwm = 0; + int i; + int next_best = -1; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + BMAP_LOCK(bmp); + + /* determine the average number of free blocks within the ags. */ + avgfree = (u32)bmp->db_nfree / bmp->db_numag; + + /* + * if the current preferred ag does not have an active allocator + * and has at least average freespace, return it + */ + agpref = bmp->db_agpref; + if ((atomic_read(&bmp->db_active[agpref]) == 0) && + (bmp->db_agfree[agpref] >= avgfree)) + goto unlock; + + /* From the last preferred ag, find the next one with at least + * average free space. + */ + for (i = 0 ; i < bmp->db_numag; i++, agpref++) { + if (agpref == bmp->db_numag) + agpref = 0; + + if (atomic_read(&bmp->db_active[agpref])) + /* open file is currently growing in this ag */ + continue; + if (bmp->db_agfree[agpref] >= avgfree) { + /* Return this one */ + bmp->db_agpref = agpref; + goto unlock; + } else if (bmp->db_agfree[agpref] > hwm) { + /* Less than avg. freespace, but best so far */ + hwm = bmp->db_agfree[agpref]; + next_best = agpref; + } + } + + /* + * If no inactive ag was found with average freespace, use the + * next best + */ + if (next_best != -1) + bmp->db_agpref = next_best; + /* else leave db_agpref unchanged */ +unlock: + BMAP_UNLOCK(bmp); + + /* return the preferred group. 
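+ * (db_agpref is left as it was when no suitable candidate was
+ * found above.)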
+ */ + return (bmp->db_agpref); +} + +/* + * NAME: dbAlloc() + * + * FUNCTION: attempt to allocate a specified number of contiguous free + * blocks from the working allocation block map. + * + * the block allocation policy uses hints and a multi-step + * approach. + * + * for allocation requests smaller than the number of blocks + * per dmap, we first try to allocate the new blocks + * immediately following the hint. if these blocks are not + * available, we try to allocate blocks near the hint. if + * no blocks near the hint are available, we next try to + * allocate within the same dmap as contains the hint. + * + * if no blocks are available in the dmap or the allocation + * request is larger than the dmap size, we try to allocate + * within the same allocation group as contains the hint. if + * this does not succeed, we finally try to allocate anywhere + * within the aggregate. + * + * we also try to allocate anywhere within the aggregate for + * for allocation requests larger than the allocation group + * size or requests that specify no hint value. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number + * of the newly allocated contiguous range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) +{ + int rc, agno; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp; + struct metapage *mp; + s64 lblkno, blkno; + struct dmap *dp; + int l2nb; + s64 mapSize; + int writers; + + /* assert that nblocks is valid */ + assert(nblocks > 0); + + /* get the log2 number of blocks to be allocated. + * if the number of blocks is not a log2 multiple, + * it will be rounded up to the next log2 multiple. + */ + l2nb = BLKSTOL2(nblocks); + + bmp = JFS_SBI(ip->i_sb)->bmap; + + mapSize = bmp->db_mapsize; + + /* the hint should be within the map */ + if (hint >= mapSize) { + jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); + return -EIO; + } + + /* if the number of blocks to be allocated is greater than the + * allocation group size, try to allocate anywhere. + */ + if (l2nb > bmp->db_agl2size) { + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + goto write_unlock; + } + + /* + * If no hint, let dbNextAG recommend an allocation group + */ + if (hint == 0) + goto pref_ag; + + /* we would like to allocate close to the hint. adjust the + * hint to the block following the hint since the allocators + * will start looking for free space starting at this point. + */ + blkno = hint + 1; + + if (blkno >= bmp->db_mapsize) + goto pref_ag; + + agno = blkno >> bmp->db_agl2size; + + /* check if blkno crosses over into a new allocation group. + * if so, check if we should allow allocations within this + * allocation group. + */ + if ((blkno & (bmp->db_agsize - 1)) == 0) + /* check if the AG is currently being written to. + * if so, call dbNextAG() to find a non-busy + * AG with sufficient free space. + */ + if (atomic_read(&bmp->db_active[agno])) + goto pref_ag; + + /* check if the allocation request size can be satisfied from a + * single dmap. if so, try to allocate from the dmap containing + * the hint using a tiered strategy. + */ + if (nblocks <= BPERDMAP) { + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* get the buffer for the dmap containing the hint. 
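+		 * (BLKTODMAP converts the hint's block number to the logical
+		 * block number of the dmap page that describes it)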
+ */ + rc = -EIO; + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + goto read_unlock; + + dp = (struct dmap *) mp->data; + + /* first, try to satisfy the allocation request with the + * blocks beginning at the hint. + */ + if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) + != -ENOSPC) { + if (rc == 0) { + *results = blkno; + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + writers = atomic_read(&bmp->db_active[agno]); + if ((writers > 1) || + ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { + /* + * Someone else is writing in this allocation + * group. To avoid fragmenting, try another ag + */ + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + goto pref_ag; + } + + /* next, try to satisfy the allocation request with blocks + * near the hint. + */ + if ((rc = + dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + /* try to satisfy the allocation request with blocks within + * the same dmap as the hint. + */ + if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + } + + /* try to satisfy the allocation request with blocks within + * the same allocation group as the hint. + */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) + goto write_unlock; + + IWRITE_UNLOCK(ipbmap); + + + pref_ag: + /* + * Let dbNextAG recommend a preferred allocation group + */ + agno = dbNextAG(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* Try to allocate within this allocation group. if that fails, try to + * allocate anywhere in the map. + */ + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + write_unlock: + IWRITE_UNLOCK(ipbmap); + + return (rc); + + read_unlock: + IREAD_UNLOCK(ipbmap); + + return (rc); +} + +#ifdef _NOTYET +/* + * NAME: dbAllocExact() + * + * FUNCTION: try to allocate the requested extent; + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) +{ + int rc; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct dmap *dp; + s64 lblkno; + struct metapage *mp; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* + * validate extent request: + * + * note: defragfs policy: + * max 64 blocks will be moved. + * allocation request size must be satisfied from a single dmap. 
+ */ + if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + return -EINVAL; + } + + if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { + /* the free space is no longer available */ + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* read in the dmap covering the extent */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* try to allocate the requested extent */ + rc = dbAllocNext(bmp, dp, blkno, nblocks); + + IREAD_UNLOCK(ipbmap); + + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); +} +#endif /* _NOTYET */ + +/* + * NAME: dbReAlloc() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. if these + * blocks are not available, this routine will attempt to + * allocate a new set of contiguous blocks large enough + * to cover the existing allocation plus the additional + * number of blocks required. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number + * of the existing allocation if the existing allocation + * was extended in place or to a newly allocated contiguous + * range if the existing allocation could not be extended + * in place. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int +dbReAlloc(struct inode *ip, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) +{ + int rc; + + /* try to extend the allocation in place. + */ + if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { + *results = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* could not extend the allocation in place, so allocate a + * new set of blocks for the entire request (i.e. try to get + * a range of contiguous blocks large enough to cover the + * existing allocation plus the additional blocks.) + */ + return (dbAlloc + (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); +} + + +/* + * NAME: dbExtend() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 lblkno, lastblkno, extblkno; + uint rel_block; + struct metapage *mp; + struct dmap *dp; + int rc; + struct inode *ipbmap = sbi->ipbmap; + struct bmap *bmp; + + /* + * We don't want a non-aligned extent to cross a page boundary + */ + if (((rel_block = blkno & (sbi->nbperpage - 1))) && + (rel_block + nblocks + addnblocks > sbi->nbperpage)) + return -ENOSPC; + + /* get the last block of the current allocation */ + lastblkno = blkno + nblocks - 1; + + /* determine the block number of the block following + * the existing allocation. + */ + extblkno = lastblkno + 1; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* better be within the file system */ + bmp = sbi->bmap; + if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + jfs_error(ip->i_sb, + "dbExtend: the block is outside the filesystem"); + return -EIO; + } + + /* we'll attempt to extend the current allocation in place by + * allocating the additional blocks as the blocks immediately + * following the current allocation. we only try to extend the + * current allocation in place if the number of additional blocks + * can fit into a dmap, the last block of the current allocation + * is not the last block of the file system, and the start of the + * inplace extension is not on an allocation group boundary. + */ + if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || + (extblkno & (bmp->db_agsize - 1)) == 0) { + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* get the buffer for the dmap containing the first block + * of the extension. + */ + lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks immediately following the + * current allocation. + */ + rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); + + IREAD_UNLOCK(ipbmap); + + /* were we successful ? */ + if (rc == 0) + write_metapage(mp); + else + /* we were not successful */ + release_metapage(mp); + + + return (rc); +} + + +/* + * NAME: dbAllocNext() + * + * FUNCTION: attempt to allocate the blocks of the specified block + * range within a dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw; + int l2size; + s8 *leaf; + u32 mask; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocNext: Corrupt dmap page"); + return -EIO; + } + + /* pick up a pointer to the leaves of the dmap tree. + */ + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* check if the specified block range is contained within + * this dmap. 
+ */ + if (dbitno + nblocks > BPERDMAP) + return -ENOSPC; + + /* check if the starting leaf indicates that anything + * is free. + */ + if (leaf[word] == NOFREE) + return -ENOSPC; + + /* check the dmaps words corresponding to block range to see + * if the block range is free. not all bits of the first and + * last words may be contained within the block range. if this + * is the case, we'll work against those words (i.e. partial first + * and/or last) on an individual basis (a single pass) and examine + * the actual bits to determine if they are free. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the leaves of the dmap + * tree will be examined to determine if the blocks are free. a + * single leaf may describe the free space of multiple dmap + * words, so we may visit only a subset of the actual leaves + * corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of the word is to be examined. + */ + if (nb < DBWORD) { + /* check if the bits are free. + */ + mask = (ONES << (DBWORD - nb) >> wbitno); + if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) + return -ENOSPC; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and how many bits. + */ + nwords = rembits >> L2DBWORD; + nb = nwords << L2DBWORD; + + /* now examine the appropriate leaves to determine + * if the blocks are free. + */ + while (nwords > 0) { + /* does the leaf describe any free space ? + */ + if (leaf[word] < BUDMIN) + return -ENOSPC; + + /* determine the l2 number of bits provided + * by this leaf. + */ + l2size = + min((int)leaf[word], NLSTOL2BSZ(nwords)); + + /* determine how many words were handled. + */ + nw = BUDSIZE(l2size, BUDMIN); + + nwords -= nw; + word += nw; + } + } + } + + /* allocate the blocks. + */ + return (dbAllocDmap(bmp, dp, blkno, nblocks)); +} + + +/* + * NAME: dbAllocNear() + * + * FUNCTION: attempt to allocate a number of contiguous free blocks near + * a specified block (hint) within a dmap. + * + * starting with the dmap leaf that covers the hint, we'll + * check the next four contiguous leaves for sufficient free + * space. if sufficient free space is found, we'll allocate + * the desired free space. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocNear(struct bmap * bmp, + struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) +{ + int word, lword, rc; + s8 *leaf; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocNear: Corrupt dmap page"); + return -EIO; + } + + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the word within the dmap that holds the hint + * (i.e. blkno). 
also, determine the last word in the dmap + * that we'll include in our examination. + */ + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + lword = min(word + 4, LPERDMAP); + + /* examine the leaves for sufficient free space. + */ + for (; word < lword; word++) { + /* does the leaf describe sufficient free space ? + */ + if (leaf[word] < l2nb) + continue; + + /* determine the block number within the file system + * of the first block described by this dmap word. + */ + blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); + + /* if not all bits of the dmap word are free, get the + * starting bit number within the dmap word of the required + * string of free bits and adjust the block number with the + * value. + */ + if (leaf[word] < BUDMIN) + blkno += + dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); + + /* allocate the blocks. + */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); + } + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAG() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks within the specified allocation group. + * + * unless the allocation group size is equal to the number + * of blocks per dmap, the dmap control pages will be used to + * find the required free space, if available. we start the + * search at the highest dmap control page level which + * distinctly describes the allocation group's free space + * (i.e. the highest level at which the allocation group's + * free space is not mixed in with that of any other group). + * in addition, we start the search within this level at a + * height of the dmapctl dmtree at which the nodes distinctly + * describe the allocation group's free space. at this height, + * the allocation group's free space may be represented by 1 + * or two sub-trees, depending on the allocation group size. + * we search the top nodes of these subtrees left to right for + * sufficient free space. if sufficient free space is found, + * the subtree is searched to find the leftmost leaf that + * has free space. once we have made it to the leaf, we + * move the search to the next lower level dmap control page + * corresponding to this leaf. we continue down the dmap control + * pages until we find the dmap that contains or starts the + * sufficient free space and we allocate at this dmap. + * + * if the allocation group size is equal to the dmap size, + * we'll start at the dmap corresponding to the allocation + * group and attempt the allocation at this level. + * + * the dmap control page search is also not performed if the + * allocation group is completely free and we go to the first + * dmap of the allocation group to do the allocation. this is + * done because the allocation group may be part (not the first + * part) of a larger binary buddy system, causing the dmap + * control pages to indicate no free space (NOFREE) within + * the allocation group. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * agno - allocation group number. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * note: IWRITE_LOCK(ipmap) held on entry/exit; + */ +static int +dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) +{ + struct metapage *mp; + struct dmapctl *dcp; + int rc, ti, i, k, m, n, agperlev; + s64 blkno, lblkno; + int budmin; + + /* allocation request should not be for more than the + * allocation group size. + */ + if (l2nb > bmp->db_agl2size) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: allocation request is larger than the " + "allocation group size"); + return -EIO; + } + + /* determine the starting block number of the allocation + * group. + */ + blkno = (s64) agno << bmp->db_agl2size; + + /* check if the allocation group size is the minimum allocation + * group size or if the allocation group is completely free. if + * the allocation group size is the minimum size of BPERDMAP (i.e. + * 1 dmap), there is no need to search the dmap control page (below) + * that fully describes the allocation group since the allocation + * group is already fully described by a dmap. in this case, we + * just call dbAllocCtl() to search the dmap tree and allocate the + * required space if available. + * + * if the allocation group is completely free, dbAllocCtl() is + * also called to allocate the required space. this is done for + * two reasons. first, it makes no sense searching the dmap control + * pages for free space when we know that free space exists. second, + * the dmap control pages may indicate that the allocation group + * has no free space if the allocation group is part (not the first + * part) of a larger binary buddy system. + */ + if (bmp->db_agsize == BPERDMAP + || bmp->db_agfree[agno] == bmp->db_agsize) { + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if ((rc == -ENOSPC) && + (bmp->db_agfree[agno] == bmp->db_agsize)) { + printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: dbAllocCtl failed in free AG"); + } + return (rc); + } + + /* the buffer for the dmap control page that fully describes the + * allocation group. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: Corrupt dmapctl page"); + release_metapage(mp); + return -EIO; + } + + /* search the subtree(s) of the dmap control page that describes + * the allocation group, looking for sufficient free space. to begin, + * determine how many allocation groups are represented in a dmap + * control page at the control page level (i.e. L0, L1, L2) that + * fully describes an allocation group. next, determine the starting + * tree index of this allocation group within the control page. + */ + agperlev = + (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; + ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + + /* dmap control page trees fan-out by 4 and a single allocation + * group may be described by 1 or 2 subtrees within the ag level + * dmap control page, depending upon the ag size. examine the ag's + * subtrees for sufficient free space, starting with the leftmost + * subtree. + */ + for (i = 0; i < bmp->db_agwidth; i++, ti++) { + /* is there sufficient free space ? 
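+		 * (each stree node records the maximum l2 free string described
+		 * by its subtree, so it can be compared directly against l2nb)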
+ */ + if (l2nb > dcp->stree[ti]) + continue; + + /* sufficient free space found in a subtree. now search down + * the subtree to find the leftmost leaf that describes this + * free space. + */ + for (k = bmp->db_agheight; k > 0; k--) { + for (n = 0, m = (ti << 2) + 1; n < 4; n++) { + if (l2nb <= dcp->stree[m + n]) { + ti = m + n; + break; + } + } + if (n == 4) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: failed descending stree"); + release_metapage(mp); + return -EIO; + } + } + + /* determine the block number within the file system + * that corresponds to this leaf. + */ + if (bmp->db_aglevel == 2) + blkno = 0; + else if (bmp->db_aglevel == 1) + blkno &= ~(MAXL1SIZE - 1); + else /* bmp->db_aglevel == 0 */ + blkno &= ~(MAXL0SIZE - 1); + + blkno += + ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; + + /* release the buffer in preparation for going down + * the next level of dmap control pages. + */ + release_metapage(mp); + + /* check if we need to continue to search down the lower + * level dmap control pages. we need to if the number of + * blocks required is less than maximum number of blocks + * described at the next lower level. + */ + if (l2nb < budmin) { + + /* search the lower level dmap control pages to get + * the starting block number of the dmap that + * contains or starts off the free space. + */ + if ((rc = + dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, + &blkno))) { + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: control page " + "inconsistent"); + return -EIO; + } + return (rc); + } + } + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: unable to allocate blocks"); + rc = -EIO; + } + return (rc); + } + + /* no space in the allocation group. release the buffer and + * return -ENOSPC. + */ + release_metapage(mp); + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAny() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks anywhere in the file system. + * + * dbAllocAny() attempts to find the sufficient free space by + * searching down the dmap control pages, starting with the + * highest level (i.e. L0, L1, L2) control page. if free space + * large enough to satisfy the desired free space is found, the + * desired free space is allocated. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) +{ + int rc; + s64 blkno = 0; + + /* starting with the top level dmap control page, search + * down the dmap control levels for sufficient free space. + * if free space is found, dbFindCtl() returns the starting + * block number of the dmap that contains or starts off the + * range of free space. + */ + if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) + return (rc); + + /* allocate the blocks. 
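+	 * (failure with -ENOSPC is unexpected here, since dbFindCtl() just
+	 * reported sufficient free space, and is treated as an error below)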
+ */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAny: unable to allocate blocks"); + return -EIO; + } + return (rc); +} + + +/* + * NAME: dbFindCtl() + * + * FUNCTION: starting at a specified dmap control page level and block + * number, search down the dmap control levels for a range of + * contiguous free blocks large enough to satisfy an allocation + * request for the specified number of free blocks. + * + * if sufficient contiguous free blocks are found, this routine + * returns the starting block number within a dmap page that + * contains or starts a range of contiqious free blocks that + * is sufficient in size. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. + * on successful return, the first block within a dmap page + * that contains or starts a range of contiguous free blocks. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) +{ + int rc, leafidx, lev; + s64 b, lblkno; + struct dmapctl *dcp; + int budmin; + struct metapage *mp; + + /* starting at the specified dmap control page level and block + * number, search down the dmap control levels for the starting + * block number of a dmap page that contains or starts off + * sufficient free blocks. + */ + for (lev = level, b = *blkno; lev >= 0; lev--) { + /* get the buffer of the dmap control page for the block + * number and level (i.e. L0, L1, L2). + */ + lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbFindCtl: Corrupt dmapctl page"); + release_metapage(mp); + return -EIO; + } + + /* search the tree within the dmap control page for + * sufficient free space. if sufficient free space is found, + * dbFindLeaf() returns the index of the leaf at which + * free space was found. + */ + rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); + + /* release the buffer. + */ + release_metapage(mp); + + /* space found ? + */ + if (rc) { + if (lev != level) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbFindCtl: dmap inconsistent"); + return -EIO; + } + return -ENOSPC; + } + + /* adjust the block number to reflect the location within + * the dmap control page (i.e. the leaf) at which free + * space was found. + */ + b += (((s64) leafidx) << budmin); + + /* we stop the search at this dmap control page level if + * the number of blocks required is greater than or equal + * to the maximum number of blocks described at the next + * (lower) level. + */ + if (l2nb >= budmin) + break; + } + + *blkno = b; + return (0); +} + + +/* + * NAME: dbAllocCtl() + * + * FUNCTION: attempt to allocate a specified number of contiguous + * blocks starting within a specific dmap. + * + * this routine is called by higher level routines that search + * the dmap control pages above the actual dmaps for contiguous + * free space. 
the result of successful searches by these + * routines are the starting block numbers within dmaps, with + * the dmaps themselves containing the desired contiguous free + * space or starting a contiguous free space of desired size + * that is made up of the blocks of one or more dmaps. these + * calls should not fail due to insufficent resources. + * + * this routine is called in some cases where it is not known + * whether it will fail due to insufficient resources. more + * specifically, this occurs when allocating from an allocation + * group whose size is equal to the number of blocks per dmap. + * in this case, the dmap control pages are not examined prior + * to calling this routine (to save pathlength) and the call + * might fail. + * + * for a request size that fits within a dmap, this routine relies + * upon the dmap's dmtree to find the requested contiguous free + * space. for request sizes that are larger than a dmap, the + * requested free space will start at the first block of the + * first dmap (i.e. blkno). + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation + * from. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) +{ + int rc, nb; + s64 b, lblkno, n; + struct metapage *mp; + struct dmap *dp; + + /* check if the allocation request is confined to a single dmap. + */ + if (l2nb <= L2BPERDMAP) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks. + */ + rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); + } + + /* allocation request involving multiple dmaps. it must start on + * a dmap boundary. + */ + assert((blkno & (BPERDMAP - 1)) == 0); + + /* allocate the blocks dmap by dmap. + */ + for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + rc = -EIO; + goto backout; + } + dp = (struct dmap *) mp->data; + + /* the dmap better be all free. + */ + if (dp->tree.stree[ROOT] != L2BPERDMAP) { + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: the dmap is not all free"); + rc = -EIO; + goto backout; + } + + /* determine how many blocks to allocate from this dmap. + */ + nb = min(n, (s64)BPERDMAP); + + /* allocate the blocks from the dmap. + */ + if ((rc = dbAllocDmap(bmp, dp, b, nb))) { + release_metapage(mp); + goto backout; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + /* set the results (starting block number) and return. + */ + *results = blkno; + return (0); + + /* something failed in handling an allocation request involving + * multiple dmaps. we'll try to clean up by backing out any + * allocation that has already happened for this request. 
if + * we fail in backing out the allocation, we'll mark the file + * system to indicate that blocks have been leaked. + */ + backout: + + /* try to backout the allocations dmap by dmap. + */ + for (n = nblocks - n, b = blkno; n > 0; + n -= BPERDMAP, b += BPERDMAP) { + /* get the buffer for this dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: I/O Error: Block Leakage."); + continue; + } + dp = (struct dmap *) mp->data; + + /* free the blocks is this dmap. + */ + if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: Block Leakage."); + continue; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + return (rc); +} + + +/* + * NAME: dbAllocDmapLev() + * + * FUNCTION: attempt to allocate a specified number of contiguous blocks + * from a specified dmap. + * + * this routine checks if the contiguous blocks are available. + * if so, nblocks of blocks are allocated; otherwise, ENOSPC is + * returned. + * + * PARAMETERS: + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or + * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; + */ +static int +dbAllocDmapLev(struct bmap * bmp, + struct dmap * dp, int nblocks, int l2nb, s64 * results) +{ + s64 blkno; + int leafidx, rc; + + /* can't be more than a dmaps worth of blocks */ + assert(l2nb <= L2BPERDMAP); + + /* search the tree within the dmap page for sufficient + * free space. if sufficient free space is found, dbFindLeaf() + * returns the index of the leaf at which free space was found. + */ + if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) + return -ENOSPC; + + /* determine the block number within the file system corresponding + * to the leaf at which free space was found. + */ + blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); + + /* if not all bits of the dmap word are free, get the starting + * bit number within the dmap word of the required string of free + * bits and adjust the block number with this value. + */ + if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) + blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); + + /* allocate the blocks */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); +} + + +/* + * NAME: dbAllocDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine allocates the specified blocks from the dmap + * through a call to dbAllocBits(). if the allocation of the + * block range causes the maximum string of free blocks within + * the dmap to change (i.e. 
the value of the root of the dmap's + * dmtree), this routine will cause this change to be reflected + * up through the appropriate levels of the dmap control pages + * by a call to dbAdjCtl() for the L0 dmap control page that + * covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* allocate the specified (blocks) bits */ + dbAllocBits(bmp, dp, blkno, nblocks); + + /* if the root has not changed, done. */ + if (dp->tree.stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbFreeDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine frees the specified blocks from the dmap through + * a call to dbFreeBits(). if the deallocation of the block range + * causes the maximum string of free blocks within the dmap to + * change (i.e. the value of the root of the dmap's dmtree), this + * routine will cause this change to be reflected up through the + * appropriate levels of the dmap control pages by a call to + * dbAdjCtl() for the L0 dmap control page that covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc = 0, word; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* free the specified (blocks) bits */ + rc = dbFreeBits(bmp, dp, blkno, nblocks); + + /* if error or the root has not changed, done. */ + if (rc || (dp->tree.stree[ROOT] == oldroot)) + return (rc); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the deallocation. + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + + /* as part of backing out the deallocation, we will have + * to back split the dmap tree if the deallocation caused + * the freed blocks to become part of a larger binary buddy + * system. + */ + if (dp->tree.stree[word] == NOFREE) + dbBackSplit((dmtree_t *) & dp->tree, word); + + dbAllocBits(bmp, dp, blkno, nblocks); + } + + return (rc); +} + + +/* + * NAME: dbAllocBits() + * + * FUNCTION: allocate a specified block range from a dmap. 
+ * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits allocated. it also causes the + * dmap's dmtree, as a whole, to reflect the allocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int size; + s8 *leaf; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = dp->tree.stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + /* update the leaf for this dmap word. in addition + * to setting the leaf value to the binary buddy max + * of the updated dmap word, dbSplit() will split + * the binary system of the leaves if need be. + */ + dbSplit(tp, word, BUDMIN, + dbMaxBud((u8 *) & dp->wmap[word])); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the allocated words. 
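+			 * (a single leaf may describe several dmap words, so each
+			 * pass below handles the BUDSIZE(size, BUDMIN) words
+			 * covered by one leaf)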
+ */ + for (; nwords > 0; nwords -= nw) { + if (leaf[word] < BUDMIN) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocBits: leaf page " + "corrupt"); + break; + } + + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being allocated and the l2 number + * of bits currently described by this leaf. + */ + size = min((int)leaf[word], NLSTOL2BSZ(nwords)); + + /* update the leaf to reflect the allocation. + * in addition to setting the leaf value to + * NOFREE, dbSplit() will split the binary + * system of the leaves to reflect the current + * allocation (size). + */ + dbSplit(tp, word, size, NOFREE); + + /* get the number of dmap words handled */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the maximum allocation group number if this allocation + * group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); +} + + +/* + * NAME: dbFreeBits() + * + * FUNCTION: free a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits freed. it also causes the dmap's + * dmtree, as a whole, to reflect the deallocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. + * + * RETURN VALUES: 0 for success + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int rc = 0; + int size; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap. + */ + assert(dbitno + nblocks <= BPERDMAP); + + /* free the bits of the dmaps words corresponding to the block range. + * not all bits of the first and last words may be contained within + * the block range. if this is the case, we'll work against those + * words (i.e. partial first and/or last) on an individual basis + * (a single pass), freeing the bits of interest by hand and updating + * the leaf corresponding to the dmap word. a single pass will be used + * for all dmap words fully contained within the specified range. + * within this pass, the bits of all fully contained dmap words will + * be marked as free in a single shot and the leaves will be updated. a + * single leaf may describe the free space of multiple dmap words, + * so we may update only a subset of the actual leaves corresponding + * to the dmap words of the block range. + * + * dbJoin() is used to update leaf values and will join the binary + * buddy system of the leaves if the new leaf values indicate this + * should be done. 
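+	 * (dbJoin() is the deallocation counterpart of the dbSplit() calls
+	 * made by dbAllocBits() on the allocation path)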
+ */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be freed. + */ + if (nb < DBWORD) { + /* free (zero) the appropriate bits within this + * dmap word. + */ + dp->wmap[word] &= + cpu_to_le32(~(ONES << (DBWORD - nb) + >> wbitno)); + + /* update the leaf for this dmap word. + */ + rc = dbJoin(tp, word, + dbMaxBud((u8 *) & dp->wmap[word])); + if (rc) + return rc; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and free (zero) the bits of these words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], 0, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the freed words. + */ + for (; nwords > 0; nwords -= nw) { + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being freed and the l2 (max) number + * of bits that can be described by this leaf. + */ + size = + min(LITOL2BSZ + (word, L2LPERDMAP, BUDMIN), + NLSTOL2BSZ(nwords)); + + /* update the leaf. + */ + rc = dbJoin(tp, word, size); + if (rc) + return rc; + + /* get the number of dmap words handled. + */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap. + */ + le32_add_cpu(&dp->nfree, nblocks); + + BMAP_LOCK(bmp); + + /* update the free count for the allocation group and + * map. + */ + agno = blkno >> bmp->db_agl2size; + bmp->db_nfree += nblocks; + bmp->db_agfree[agno] += nblocks; + + /* check if this allocation group is not completely free and + * if it is currently the maximum (rightmost) allocation group. + * if so, establish the new maximum allocation group number by + * searching left for the first allocation group with allocation. + */ + if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || + (agno == bmp->db_numag - 1 && + bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { + while (bmp->db_maxag > 0) { + bmp->db_maxag -= 1; + if (bmp->db_agfree[bmp->db_maxag] != + bmp->db_agsize) + break; + } + + /* re-establish the allocation group preference if the + * current preference is right of the maximum allocation + * group. + */ + if (bmp->db_agpref > bmp->db_maxag) + bmp->db_agpref = bmp->db_maxag; + } + + BMAP_UNLOCK(bmp); + + return 0; +} + + +/* + * NAME: dbAdjCtl() + * + * FUNCTION: adjust a dmap control page at a specified level to reflect + * the change in a lower level dmap or dmap control page's + * maximum string of free blocks (i.e. a change in the root + * of the lower level object's dmtree) due to the allocation + * or deallocation of a range of blocks with a single dmap. + * + * on entry, this routine is provided with the new value of + * the lower level dmap or dmap control page root and the + * starting block number of the block range whose allocation + * or deallocation resulted in the root change. this range + * is respresented by a single leaf of the current dmapctl + * and the leaf will be updated with this value, possibly + * causing a binary buddy system within the leaves to be + * split or joined. the update may also cause the dmapctl's + * dmtree to be updated. 
+ * + * if the adjustment of the dmap control page, itself, causes its + * root to change, this change will be bubbled up to the next dmap + * control level by a recursive call to this routine, specifying + * the new root value and the next dmap control page level to + * be adjusted. + * PARAMETERS: + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is + * the allocation or deallocation of this block range that + * requires the dmap control page to be adjusted. + * newval - the new value of the lower level dmap or dmap control + * page root. + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to + * be adjusted. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) +{ + struct metapage *mp; + s8 oldroot; + int oldval; + s64 lblkno; + struct dmapctl *dcp; + int rc, leafno, ti; + + /* get the buffer for the dmap control page for the specified + * block number and control page level. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAdjCtl: Corrupt dmapctl page"); + release_metapage(mp); + return -EIO; + } + + /* determine the leaf number corresponding to the block and + * the index within the dmap control tree. + */ + leafno = BLKTOCTLLEAF(blkno, dcp->budmin); + ti = leafno + le32_to_cpu(dcp->leafidx); + + /* save the current leaf value and the current root level (i.e. + * maximum l2 free string described by this dmapctl). + */ + oldval = dcp->stree[ti]; + oldroot = dcp->stree[ROOT]; + + /* check if this is a control page update for an allocation. + * if so, update the leaf to reflect the new leaf value using + * dbSplit(); otherwise (deallocation), use dbJoin() to update + * the leaf with the new value. in addition to updating the + * leaf, dbSplit() will also split the binary buddy system of + * the leaves, if required, and bubble new values within the + * dmapctl tree, if required. similarly, dbJoin() will join + * the binary buddy system of leaves and bubble new values up + * the dmapctl tree as required by the new leaf value. + */ + if (alloc) { + /* check if we are in the middle of a binary buddy + * system. this happens when we are performing the + * first allocation out of an allocation group that + * is part (not the first part) of a larger binary + * buddy system. if we are in the middle, back split + * the system prior to calling dbSplit() which assumes + * that it is at the front of a binary buddy system. + */ + if (oldval == NOFREE) { + rc = dbBackSplit((dmtree_t *) dcp, leafno); + if (rc) + return rc; + oldval = dcp->stree[ti]; + } + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + } else { + rc = dbJoin((dmtree_t *) dcp, leafno, newval); + if (rc) + return rc; + } + + /* check if the root of the current dmap control page changed due + * to the update and if the current dmap control page is not at + * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. 
+ * root changed and this is not the top level), call this routine + * again (recursion) for the next higher level of the mapping to + * reflect the change in root for the current dmap control page. + */ + if (dcp->stree[ROOT] != oldroot) { + /* are we below the top level of the map. if so, + * bubble the root up to the next higher level. + */ + if (level < bmp->db_maxlevel) { + /* bubble up the new root of this dmap control page to + * the next level. + */ + if ((rc = + dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, + level + 1))) { + /* something went wrong in bubbling up the new + * root value, so backout the changes to the + * current dmap control page. + */ + if (alloc) { + dbJoin((dmtree_t *) dcp, leafno, + oldval); + } else { + /* the dbJoin() above might have + * caused a larger binary buddy system + * to form and we may now be in the + * middle of it. if this is the case, + * back split the buddies. + */ + if (dcp->stree[ti] == NOFREE) + dbBackSplit((dmtree_t *) + dcp, leafno); + dbSplit((dmtree_t *) dcp, leafno, + dcp->budmin, oldval); + } + + /* release the buffer and return the error. + */ + release_metapage(mp); + return (rc); + } + } else { + /* we're at the top level of the map. update + * the bmap control page to reflect the size + * of the maximum free buddy system. + */ + assert(level == bmp->db_maxlevel); + if (bmp->db_maxfreebud != oldroot) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAdjCtl: the maximum free buddy is " + "not the old root"); + } + bmp->db_maxfreebud = dcp->stree[ROOT]; + } + } + + /* write the buffer. + */ + write_metapage(mp); + + return (0); +} + + +/* + * NAME: dbSplit() + * + * FUNCTION: update the leaf of a dmtree with a new value, splitting + * the leaf from the binary buddy system of the dmtree's + * leaves, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf + * must be split to, specified as the log2 number of blocks. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +{ + int budsz; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* check if the leaf needs to be split. + */ + if (leaf[leafno] > tp->dmt_budmin) { + /* the split occurs by cutting the buddy system in half + * at the specified leaf until we reach the specified + * size. pick up the starting split size (current size + * - 1 in l2) and the corresponding buddy size. + */ + cursz = leaf[leafno] - 1; + budsz = BUDSIZE(cursz, tp->dmt_budmin); + + /* split until we reach the specified size. + */ + while (cursz >= splitsz) { + /* update the buddy's leaf with its new value. + */ + dbAdjTree(tp, leafno ^ budsz, cursz); + + /* on to the next size and buddy. + */ + cursz -= 1; + budsz >>= 1; + } + } + + /* adjust the dmap tree to reflect the specified leaf's new + * value. + */ + dbAdjTree(tp, leafno, newval); +} + + +/* + * NAME: dbBackSplit() + * + * FUNCTION: back split the binary buddy system of dmtree leaves + * that hold a specified leaf until the specified leaf + * starts its own binary buddy system. + * + * the allocators typically perform allocations at the start + * of binary buddy systems and dbSplit() is used to accomplish + * any required splits. 
in some cases, however, allocation + * may occur in the middle of a binary system and requires a + * back split, with the split proceeding out from the middle of + * the system (less efficient) rather than the start of the + * system (more efficient). the cases in which a back split + * is required are rare and are limited to the first allocation + * within an allocation group which is a part (not first part) + * of a larger binary buddy system and a few exception cases + * in which a previous join operation must be backed out. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbBackSplit(dmtree_t * tp, int leafno) +{ + int budsz, bud, w, bsz, size; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* leaf should be part (not first part) of a binary + * buddy system. + */ + assert(leaf[leafno] == NOFREE); + + /* the back split is accomplished by iteratively finding the leaf + * that starts the buddy system that contains the specified leaf and + * splitting that system in two. this iteration continues until + * the specified leaf becomes the start of a buddy system. + * + * determine maximum possible l2 size for the specified leaf. + */ + size = + LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), + tp->dmt_budmin); + + /* determine the number of leaves covered by this size. this + * is the buddy size that we will start with as we search for + * the buddy system that contains the specified leaf. + */ + budsz = BUDSIZE(size, tp->dmt_budmin); + + /* back split. + */ + while (leaf[leafno] == NOFREE) { + /* find the leftmost buddy leaf. + */ + for (w = leafno, bsz = budsz;; bsz <<= 1, + w = (w < bud) ? w : bud) { + if (bsz >= le32_to_cpu(tp->dmt_nleafs)) { + jfs_err("JFS: block map error in dbBackSplit"); + return -EIO; + } + + /* determine the buddy. + */ + bud = w ^ bsz; + + /* check if this buddy is the start of the system. + */ + if (leaf[bud] != NOFREE) { + /* split the leaf at the start of the + * system in two. + */ + cursz = leaf[bud] - 1; + dbSplit(tp, bud, cursz, cursz); + break; + } + } + } + + if (leaf[leafno] != size) { + jfs_err("JFS: wrong leaf value in dbBackSplit"); + return -EIO; + } + return 0; +} + + +/* + * NAME: dbJoin() + * + * FUNCTION: update the leaf of a dmtree with a new value, joining + * the leaf with other leaves of the dmtree into a multi-leaf + * binary buddy system, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static int dbJoin(dmtree_t * tp, int leafno, int newval) +{ + int budsz, buddy; + s8 *leaf; + + /* can the new leaf value require a join with other leaves ? + */ + if (newval >= tp->dmt_budmin) { + /* pickup a pointer to the leaves of the tree. + */ + leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* try to join the specified leaf into a large binary + * buddy system. the join proceeds by attempting to join + * the specified leafno with its buddy (leaf) at new value. + * if the join occurs, we attempt to join the left leaf + * of the joined buddies with its buddy at new value + 1. 
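[Editor's aside] The join direction is the mirror image of the split: the buddy at each size is again `leafno ^ budsz`, only the left buddy survives, and its value grows by one per round. A minimal sketch with invented names and toy sizes; the corruption check that the real dbJoin() performs when the buddy value is unexpectedly small is reduced to a plain break here.

#include <stdio.h>

#define BUDMIN  5
#define NOFREE  (-1)
#define NLEAFS  4
#define BUDSIZE(s, m)  (1 << ((s) - (m)))

static void join(signed char *leaf, int leafno, int newval)
{
        int budsz, buddy;

        if (newval >= BUDMIN) {
                budsz = BUDSIZE(newval, BUDMIN);
                while (budsz < NLEAFS) {
                        buddy = leafno ^ budsz;
                        if (newval != leaf[buddy])      /* buddy not free at  */
                                break;                  /* the same size      */
                        if (leafno < buddy) {
                                leaf[buddy] = NOFREE;   /* left claims all    */
                        } else {
                                leaf[leafno] = NOFREE;
                                leafno = buddy;
                        }
                        newval += 1;
                        budsz <<= 1;
                }
        }
        leaf[leafno] = newval;
}

int main(void)
{
        /* state left behind by the split example: word 0 allocated */
        signed char leaf[NLEAFS] = { NOFREE, BUDMIN, BUDMIN + 1, NOFREE };

        join(leaf, 0, BUDMIN);          /* free the first word again */
        for (int i = 0; i < NLEAFS; i++)
                printf("leaf[%d] = %d\n", i, leaf[i]);
        /* prints 7 -1 -1 -1: all four words form one free buddy system */
        return 0;
}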
+ * we continue to join until we find a buddy that cannot be + * joined (does not have a value equal to the size of the + * last join) or until all leaves have been joined into a + * single system. + * + * get the buddy size (number of words covered) of + * the new value. + */ + budsz = BUDSIZE(newval, tp->dmt_budmin); + + /* try to join. + */ + while (budsz < le32_to_cpu(tp->dmt_nleafs)) { + /* get the buddy leaf. + */ + buddy = leafno ^ budsz; + + /* if the leaf's new value is greater than its + * buddy's value, we join no more. + */ + if (newval > leaf[buddy]) + break; + + /* It shouldn't be less */ + if (newval < leaf[buddy]) + return -EIO; + + /* check which (leafno or buddy) is the left buddy. + * the left buddy gets to claim the blocks resulting + * from the join while the right gets to claim none. + * the left buddy is also eligible to participate in + * a join at the next higher level while the right + * is not. + * + */ + if (leafno < buddy) { + /* leafno is the left buddy. + */ + dbAdjTree(tp, buddy, NOFREE); + } else { + /* buddy is the left buddy and becomes + * leafno. + */ + dbAdjTree(tp, leafno, NOFREE); + leafno = buddy; + } + + /* on to try the next join. + */ + newval += 1; + budsz <<= 1; + } + } + + /* update the leaf value. + */ + dbAdjTree(tp, leafno, newval); + + return 0; +} + + +/* + * NAME: dbAdjTree() + * + * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * the dmtree, as required, to reflect the new leaf value. + * the combination of any buddies must already be done before + * this is called. + * + * PARAMETERS: + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +{ + int lp, pp, k; + int max; + + /* pick up the index of the leaf for this leafno. + */ + lp = leafno + le32_to_cpu(tp->dmt_leafidx); + + /* is the current value the same as the old value ? if so, + * there is nothing to do. + */ + if (tp->dmt_stree[lp] == newval) + return; + + /* set the new value. + */ + tp->dmt_stree[lp] = newval; + + /* bubble the new value up the tree as required. + */ + for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { + /* get the index of the first leaf of the 4 leaf + * group containing the specified leaf (leafno). + */ + lp = ((lp - 1) & ~0x03) + 1; + + /* get the index of the parent of this 4 leaf group. + */ + pp = (lp - 1) >> 2; + + /* determine the maximum of the 4 leaves. + */ + max = TREEMAX(&tp->dmt_stree[lp]); + + /* if the maximum of the 4 is the same as the + * parent's value, we're done. + */ + if (tp->dmt_stree[pp] == max) + break; + + /* parent gets new value. + */ + tp->dmt_stree[pp] = max; + + /* parent becomes leaf for next go-round. + */ + lp = pp; + } +} + + +/* + * NAME: dbFindLeaf() + * + * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * the index of a leaf describing the free blocks if + * sufficient free blocks are found. + * + * the search starts at the top of the dmtree_t tree and + * proceeds down the tree to the leftmost leaf with sufficient + * free space. + * + * PARAMETERS: + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. + * leafidx - return pointer to be set to the index of the leaf + * describing at least l2nb free blocks if sufficient + * free blocks are found. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient free blocks. 
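[Editor's aside] A small self-contained sketch of a dmap-style 4-ary summary tree; build(), find_leaf() and the 16-leaf geometry are invented for the example. Every internal node holds the maximum of its four children, which is what lets dbFindLeaf() descend to the leftmost adequate leaf and what dbAdjTree() maintains when a leaf changes.

#include <stdio.h>

#define HEIGHT  2
#define NLEAFS  16
#define LEAFIDX 5                       /* 1 root + 4 internal nodes */

static signed char stree[LEAFIDX + NLEAFS];

static void build(void)
{
        /* bubble maxima upward: node i's children are 4*i+1 .. 4*i+4 */
        for (int i = LEAFIDX - 1; i >= 0; i--) {
                signed char max = stree[(i << 2) + 1];
                for (int j = 2; j <= 4; j++)
                        if (stree[(i << 2) + j] > max)
                                max = stree[(i << 2) + j];
                stree[i] = max;
        }
}

static int find_leaf(int l2nb)
{
        int k, ti, x = 0, n = 0;

        if (l2nb > stree[0])                    /* check the root first */
                return -1;
        for (k = HEIGHT, ti = 1; k > 0; k--, ti = ((ti + n) << 2) + 1)
                for (x = ti, n = 0; n < 4; n++)
                        if (l2nb <= stree[x + n])
                                break;          /* leftmost adequate child */
        return x + n - LEAFIDX;                 /* zero-origin leaf number */
}

int main(void)
{
        /* leaf 6 offers 2^3 contiguous free blocks, leaf 13 offers 2^4 */
        stree[LEAFIDX + 6] = 3;
        stree[LEAFIDX + 13] = 4;
        build();
        printf("l2nb=3 -> leaf %d\n", find_leaf(3));    /* leaf 6  */
        printf("l2nb=4 -> leaf %d\n", find_leaf(4));    /* leaf 13 */
        printf("l2nb=5 -> %d (no space)\n", find_leaf(5));
        return 0;
}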
+ */ +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) +{ + int ti, n = 0, k, x = 0; + + /* first check the root of the tree to see if there is + * sufficient free space. + */ + if (l2nb > tp->dmt_stree[ROOT]) + return -ENOSPC; + + /* sufficient free space available. now search down the tree + * starting at the next level for the leftmost leaf that + * describes sufficient free space. + */ + for (k = le32_to_cpu(tp->dmt_height), ti = 1; + k > 0; k--, ti = ((ti + n) << 2) + 1) { + /* search the four nodes at this level, starting from + * the left. + */ + for (x = ti, n = 0; n < 4; n++) { + /* sufficient free space found. move to the next + * level (or quit if this is the last level). + */ + if (l2nb <= tp->dmt_stree[x + n]) + break; + } + + /* better have found something since the higher + * levels of the tree said it was here. + */ + assert(n < 4); + } + + /* set the return to the leftmost leaf describing sufficient + * free space. + */ + *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); + + return (0); +} + + +/* + * NAME: dbFindBits() + * + * FUNCTION: find a specified number of binary buddy free bits within a + * dmap bitmap word value. + * + * this routine searches the bitmap value for (1 << l2nb) free + * bits at (1 << l2nb) alignments within the value. + * + * PARAMETERS: + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. + * + * RETURN VALUES: + * starting bit number of free bits. + */ +static int dbFindBits(u32 word, int l2nb) +{ + int bitno, nb; + u32 mask; + + /* get the number of bits. + */ + nb = 1 << l2nb; + assert(nb <= DBWORD); + + /* complement the word so we can use a mask (i.e. 0s represent + * free bits) and compute the mask. + */ + word = ~word; + mask = ONES << (DBWORD - nb); + + /* scan the word for nb free bits at nb alignments. + */ + for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { + if ((mask & word) == mask) + break; + } + + ASSERT(bitno < 32); + + /* return the bit number. + */ + return (bitno); +} + + +/* + * NAME: dbMaxBud(u8 *cp) + * + * FUNCTION: determine the largest binary buddy string of free + * bits within 32-bits of the map. + * + * PARAMETERS: + * cp - pointer to the 32-bit value. + * + * RETURN VALUES: + * largest binary buddy of free bits within a dmap word. + */ +static int dbMaxBud(u8 * cp) +{ + signed char tmp1, tmp2; + + /* check if the wmap word is all free. if so, the + * free buddy size is BUDMIN. + */ + if (*((uint *) cp) == 0) + return (BUDMIN); + + /* check if the wmap word is half free. if so, the + * free buddy size is BUDMIN-1. + */ + if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) + return (BUDMIN - 1); + + /* not all free or half free. determine the free buddy + * size thru table lookup using quarters of the wmap word. + */ + tmp1 = max(budtab[cp[2]], budtab[cp[3]]); + tmp2 = max(budtab[cp[0]], budtab[cp[1]]); + return (max(tmp1, tmp2)); +} + + +/* + * NAME: cnttz(uint word) + * + * FUNCTION: determine the number of trailing zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of trailing zeros + */ +static int cnttz(u32 word) +{ + int n; + + for (n = 0; n < 32; n++, word >>= 1) { + if (word & 0x01) + break; + } + + return (n); +} + + +/* + * NAME: cntlz(u32 value) + * + * FUNCTION: determine the number of leading zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. 
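[Editor's aside] The word-level scan performed by dbFindBits() above can be reproduced in a few lines of userspace C; find_bits() and the sample word are invented. Bit 0 is the most significant bit of the word, a set bit means "allocated", and the routine looks for 2^l2nb clear bits at a 2^l2nb-aligned position.

#include <stdio.h>
#include <stdint.h>

#define DBWORD 32
#define ONES   0xffffffffu

static int find_bits(uint32_t word, int l2nb)
{
        int bitno, nb = 1 << l2nb;
        uint32_t mask;

        word = ~word;                           /* now 1 = free           */
        mask = ONES << (DBWORD - nb);           /* nb bits at the top     */
        for (bitno = 0; mask != 0; bitno += nb, mask >>= nb)
                if ((mask & word) == mask)      /* all nb bits free here? */
                        break;
        return bitno;                           /* 32 would mean "none";
                                                 * the caller guarantees
                                                 * a fit in the real code */
}

int main(void)
{
        /* blocks 0-11 allocated, the rest of the word free */
        uint32_t word = ONES << (DBWORD - 12);

        printf("l2nb=2 -> bit %d\n", find_bits(word, 2));       /* 12 */
        printf("l2nb=3 -> bit %d\n", find_bits(word, 3));       /* 16 */
        return 0;
}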
+ * + * RETURN VALUES: + * count of leading zeros + */ +static int cntlz(u32 value) +{ + int n; + + for (n = 0; n < 32; n++, value <<= 1) { + if (value & HIGHORDER) + break; + } + return (n); +} + + +/* + * NAME: blkstol2(s64 nb) + * + * FUNCTION: convert a block count to its log2 value. if the block + * count is not a l2 multiple, it is rounded up to the next + * larger l2 multiple. + * + * PARAMETERS: + * nb - number of blocks + * + * RETURN VALUES: + * log2 number of blocks + */ +static int blkstol2(s64 nb) +{ + int l2nb; + s64 mask; /* meant to be signed */ + + mask = (s64) 1 << (64 - 1); + + /* count the leading bits. + */ + for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { + /* leading bit found. + */ + if (nb & mask) { + /* determine the l2 value. + */ + l2nb = (64 - 1) - l2nb; + + /* check if we need to round up. + */ + if (~mask & nb) + l2nb++; + + return (l2nb); + } + } + assert(0); + return 0; /* fix compiler warning */ +} + + +/* + * NAME: dbAllocBottomUp() + * + * FUNCTION: alloc the specified block range from the working block + * allocation map. + * + * the blocks will be alloc from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be allocated better be within the mapsize. */ + ASSERT(nblocks <= bmp->db_mapsize - blkno); + + /* + * allocate the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be allocated from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* allocate the blocks. */ + if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int rc; + int dbitno, word, rembits, nb, nwords, wbitno, agno; + s8 oldroot, *leaf; + struct dmaptree *tp = (struct dmaptree *) & dp->tree; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = tp->stree[ROOT]; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = tp->stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. 
partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + word++; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits */ + nb = nwords << L2DBWORD; + word += nwords; + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + /* reconstruct summary tree */ + dbInitDmapTree(dp); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the highest active allocation group number + * if this allocation group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); + + /* if the root has not changed, done. */ + if (tp->stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbExtendFS() + * + * FUNCTION: extend bmap from blkno for nblocks; + * dbExtendFS() updates bmap ready for dbAllocBottomUp(); + * + * L2 + * | + * L1---------------------------------L1 + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm + * + * <---old---><----------------------------extend-----------------------> + */ +int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); + int nbperpage = sbi->nbperpage; + int i, i0 = true, j, j0 = true, k, n; + s64 newsize; + s64 p; + struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL; + struct dmapctl *l2dcp, *l1dcp, *l0dcp; + struct dmap *dp; + s8 *l0leaf, *l1leaf, *l2leaf; + struct bmap *bmp = sbi->bmap; + int agno, l2agsize, oldl2agsize; + s64 ag_rem; + + newsize = blkno + nblocks; + + jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld", + (long long) blkno, (long long) nblocks, (long long) newsize); + + /* + * initialize bmap control page. 
+ * + * all the data in bmap control page should exclude + * the mkfs hidden dmap page. + */ + + /* update mapsize */ + bmp->db_mapsize = newsize; + bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); + + /* compute new AG size */ + l2agsize = dbGetL2AGSize(newsize); + oldl2agsize = bmp->db_agl2size; + + bmp->db_agl2size = l2agsize; + bmp->db_agsize = 1 << l2agsize; + + /* compute new number of AG */ + agno = bmp->db_numag; + bmp->db_numag = newsize >> l2agsize; + bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; + + /* + * reconfigure db_agfree[] + * from old AG configuration to new AG configuration; + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + if (l2agsize == oldl2agsize) + goto extend; + k = 1 << (l2agsize - oldl2agsize); + ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ + for (i = 0, n = 0; i < agno; n++) { + bmp->db_agfree[n] = 0; /* init collection point */ + + /* coalesce contiguous k AGs; */ + for (j = 0; j < k && i < agno; j++, i++) { + /* merge AGi to AGn */ + bmp->db_agfree[n] += bmp->db_agfree[i]; + } + } + bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ + + for (; n < MAXAG; n++) + bmp->db_agfree[n] = 0; + + /* + * update highest active ag number + */ + + bmp->db_maxag = bmp->db_maxag / k; + + /* + * extend bmap + * + * update bit maps and corresponding level control pages; + * global control page db_nfree, db_agfree[agno], db_maxfreebud; + */ + extend: + /* get L2 page */ + p = BMAPBLKNO + nbperpage; /* L2 page */ + l2mp = read_metapage(ipbmap, p, PSIZE, 0); + if (!l2mp) { + jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); + return -EIO; + } + l2dcp = (struct dmapctl *) l2mp->data; + + /* compute start L1 */ + k = blkno >> L2MAXL1SIZE; + l2leaf = l2dcp->stree + CTLLEAFIND + k; + p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ + + /* + * extend each L1 in L2 + */ + for (; k < LPERCTL; k++, p += nbperpage) { + /* get L1 page */ + if (j0) { + /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ + l1mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; + l1leaf = l1dcp->stree + CTLLEAFIND + j; + p = BLKTOL0(blkno, sbi->l2nbperpage); + j0 = false; + } else { + /* assign/init L1 page */ + l1mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = 0; + l1leaf = l1dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st L0 of L1.k */ + } + + /* + * extend each L0 in L1 + */ + for (; j < LPERCTL; j++) { + /* get L0 page */ + if (i0) { + /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ + + l0mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = (blkno & (MAXL0SIZE - 1)) >> + L2BPERDMAP; + l0leaf = l0dcp->stree + CTLLEAFIND + i; + p = BLKTODMAP(blkno, + sbi->l2nbperpage); + i0 = false; + } else { + /* assign/init L0 page */ + l0mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = 0; + l0leaf = l0dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st dmap of L0.j */ + } + + /* + * extend each dmap in L0 + */ + for (; i < LPERCTL; i++) { + /* + * reconstruct the dmap page, and + * initialize corresponding parent 
L0 leaf + */ + if ((n = blkno & (BPERDMAP - 1))) { + /* read in dmap page: */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + n = min(nblocks, (s64)BPERDMAP - n); + } else { + /* assign/init dmap page */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + + n = min(nblocks, (s64)BPERDMAP); + } + + dp = (struct dmap *) mp->data; + *l0leaf = dbInitDmap(dp, blkno, n); + + bmp->db_nfree += n; + agno = le64_to_cpu(dp->start) >> l2agsize; + bmp->db_agfree[agno] += n; + + write_metapage(mp); + + l0leaf++; + p += nbperpage; + + blkno += n; + nblocks -= n; + if (nblocks == 0) + break; + } /* for each dmap in a L0 */ + + /* + * build current L0 page from its leaves, and + * initialize corresponding parent L1 leaf + */ + *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); + write_metapage(l0mp); + l0mp = NULL; + + if (nblocks) + l1leaf++; /* continue for next L0 */ + else { + /* more than 1 L0 ? */ + if (j > 0) + break; /* build L1 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l1leaf; + release_metapage(l1mp); + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L0 in a L1 */ + + /* + * build current L1 page from its leaves, and + * initialize corresponding parent L2 leaf + */ + *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); + write_metapage(l1mp); + l1mp = NULL; + + if (nblocks) + l2leaf++; /* continue for next L1 */ + else { + /* more than 1 L1 ? */ + if (k > 0) + break; /* build L2 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l2leaf; + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L1 in a L2 */ + + jfs_error(ipbmap->i_sb, + "dbExtendFS: function has not returned as expected"); +errout: + if (l0mp) + release_metapage(l0mp); + if (l1mp) + release_metapage(l1mp); + release_metapage(l2mp); + return -EIO; + + /* + * finalize bmap control page + */ +finalize: + + return 0; +} + + +/* + * dbFinalizeBmap() + */ +void dbFinalizeBmap(struct inode *ipbmap) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + int actags, inactags, l2nl; + s64 ag_rem, actfree, inactfree, avgfree; + int i, n; + + /* + * finalize bmap control page + */ +//finalize: + /* + * compute db_agpref: preferred ag to allocate from + * (the leftmost ag with average free space in it); + */ +//agpref: + /* get the number of active ags and inacitve ags */ + actags = bmp->db_maxag + 1; + inactags = bmp->db_numag - actags; + ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ + + /* determine how many blocks are in the inactive allocation + * groups. in doing this, we must account for the fact that + * the rightmost group might be a partial group (i.e. file + * system size is not a multiple of the group size). + */ + inactfree = (inactags && ag_rem) ? + ((inactags - 1) << bmp->db_agl2size) + ag_rem + : inactags << bmp->db_agl2size; + + /* determine how many free blocks are in the active + * allocation groups plus the average number of free blocks + * within the active ags. + */ + actfree = bmp->db_nfree - inactfree; + avgfree = (u32) actfree / (u32) actags; + + /* if the preferred allocation group has not average free space. + * re-establish the preferred group as the leftmost + * group with average free space. 
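[Editor's aside] The preferred-allocation-group heuristic just described amounts to "pick the leftmost active group with at least average free space". A toy sketch with made-up counts, not JFS code:

#include <stdio.h>

int main(void)
{
        long long agfree[] = { 100, 4000, 900, 2500 };  /* active AGs */
        int actags = 4, agpref, i;
        long long actfree = 0, avgfree;

        for (i = 0; i < actags; i++)
                actfree += agfree[i];
        avgfree = actfree / actags;                     /* 1875 here */

        for (agpref = 0; agpref < actags; agpref++)
                if (agfree[agpref] >= avgfree)
                        break;                          /* AG 1 is chosen */

        printf("average free %lld, preferred AG %d\n", avgfree, agpref);
        return 0;
}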
+ */ + if (bmp->db_agfree[bmp->db_agpref] < avgfree) { + for (bmp->db_agpref = 0; bmp->db_agpref < actags; + bmp->db_agpref++) { + if (bmp->db_agfree[bmp->db_agpref] >= avgfree) + break; + } + if (bmp->db_agpref >= bmp->db_numag) { + jfs_error(ipbmap->i_sb, + "cannot find ag with average freespace"); + } + } + + /* + * compute db_aglevel, db_agheight, db_width, db_agstart: + * an ag is covered in aglevel dmapctl summary tree, + * at agheight level height (from leaf) with agwidth number of nodes + * each, which starts at agstart index node of the smmary tree node + * array; + */ + bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); + l2nl = + bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); + bmp->db_agheight = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1)); + for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0; + i--) { + bmp->db_agstart += n; + n <<= 2; + } + +} + + +/* + * NAME: dbInitDmap()/ujfs_idmap_page() + * + * FUNCTION: initialize working/persistent bitmap of the dmap page + * for the specified number of blocks: + * + * at entry, the bitmaps had been initialized as free (ZEROS); + * The number of blocks will only account for the actually + * existing blocks. Blocks which don't actually exist in + * the aggregate will be marked as allocated (ONES); + * + * PARAMETERS: + * dp - pointer to page of map + * nblocks - number of blocks this page + * + * RETURNS: NONE + */ +static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) +{ + int blkno, w, b, r, nw, nb, i; + + /* starting block number within the dmap */ + blkno = Blkno & (BPERDMAP - 1); + + if (blkno == 0) { + dp->nblocks = dp->nfree = cpu_to_le32(nblocks); + dp->start = cpu_to_le64(Blkno); + + if (nblocks == BPERDMAP) { + memset(&dp->wmap[0], 0, LPERDMAP * 4); + memset(&dp->pmap[0], 0, LPERDMAP * 4); + goto initTree; + } + } else { + le32_add_cpu(&dp->nblocks, nblocks); + le32_add_cpu(&dp->nfree, nblocks); + } + + /* word number containing start block number */ + w = blkno >> L2DBWORD; + + /* + * free the bits corresponding to the block range (ZEROS): + * note: not all bits of the first and last words may be contained + * within the block range. + */ + for (r = nblocks; r > 0; r -= nb, blkno += nb) { + /* number of bits preceding range to be freed in the word */ + b = blkno & (DBWORD - 1); + /* number of bits to free in the word */ + nb = min(r, DBWORD - b); + + /* is partial word to be freed ? */ + if (nb < DBWORD) { + /* free (set to 0) from the bitmap word */ + dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + + /* skip the word freed */ + w++; + } else { + /* free (set to 0) contiguous bitmap words */ + nw = r >> L2DBWORD; + memset(&dp->wmap[w], 0, nw * 4); + memset(&dp->pmap[w], 0, nw * 4); + + /* skip the words freed */ + nb = nw << L2DBWORD; + w += nw; + } + } + + /* + * mark bits following the range to be freed (non-existing + * blocks) as allocated (ONES) + */ + + if (blkno == BPERDMAP) + goto initTree; + + /* the first word beyond the end of existing blocks */ + w = blkno >> L2DBWORD; + + /* does nblocks fall on a 32-bit boundary ? 
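[Editor's aside] The partial-word case relies on the mask idiom used throughout this file: `ONES << (DBWORD - nb) >> b` selects nb bits starting b bits below the most significant bit. A tiny sketch; the values of b and nb are arbitrary:

#include <stdio.h>
#include <stdint.h>

#define DBWORD 32
#define ONES   0xffffffffu

int main(void)
{
        int b = 4, nb = 8;              /* 8 blocks starting at block 4 */
        uint32_t set   = ONES << (DBWORD - nb) >> b;
        uint32_t clear = ~set;

        printf("allocate: wmap |= 0x%08x\n", set);      /* 0x0ff00000 */
        printf("free:     wmap &= 0x%08x\n", clear);    /* 0xf00fffff */
        return 0;
}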
*/ + b = blkno & (DBWORD - 1); + if (b) { + /* mark a partial word allocated */ + dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); + w++; + } + + /* set the rest of the words in the page to allocated (ONES) */ + for (i = w; i < LPERDMAP; i++) + dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); + + /* + * init tree + */ + initTree: + return (dbInitDmapTree(dp)); +} + + +/* + * NAME: dbInitDmapTree()/ujfs_complete_dmap() + * + * FUNCTION: initialize summary tree of the specified dmap: + * + * at entry, bitmap of the dmap has been initialized; + * + * PARAMETERS: + * dp - dmap to complete + * blkno - starting block number for this dmap + * treemax - will be filled in with max free for this dmap + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitDmapTree(struct dmap * dp) +{ + struct dmaptree *tp; + s8 *cp; + int i; + + /* init fixed info of tree */ + tp = &dp->tree; + tp->nleafs = cpu_to_le32(LPERDMAP); + tp->l2nleafs = cpu_to_le32(L2LPERDMAP); + tp->leafidx = cpu_to_le32(LEAFIND); + tp->height = cpu_to_le32(4); + tp->budmin = BUDMIN; + + /* init each leaf from corresponding wmap word: + * note: leaf is set to NOFREE(-1) if all blocks of corresponding + * bitmap word are allocated. + */ + cp = tp->stree + le32_to_cpu(tp->leafidx); + for (i = 0; i < LPERDMAP; i++) + *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree(tp)); +} + + +/* + * NAME: dbInitTree()/ujfs_adjtree() + * + * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. + * + * at entry, the leaves of the tree has been initialized + * from corresponding bitmap word or root of summary tree + * of the child control page; + * configure binary buddy system at the leaf level, then + * bubble up the values of the leaf nodes up the tree. + * + * PARAMETERS: + * cp - Pointer to the root of the tree + * l2leaves- Number of leaf nodes as a power of 2 + * l2min - Number of blocks that can be covered by a leaf + * as a power of 2 + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitTree(struct dmaptree * dtp) +{ + int l2max, l2free, bsize, nextb, i; + int child, parent, nparent; + s8 *tp, *cp, *cp1; + + tp = dtp->stree; + + /* Determine the maximum free string possible for the leaves */ + l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; + + /* + * configure the leaf levevl into binary buddy system + * + * Try to combine buddies starting with a buddy size of 1 + * (i.e. two leaves). At a buddy size of 1 two buddy leaves + * can be combined if both buddies have a maximum free of l2min; + * the combination will result in the left-most buddy leaf having + * a maximum free of l2min+1. + * After processing all buddies for a given size, process buddies + * at the next higher buddy size (i.e. current size * 2) and + * the next maximum free (current free + 1). + * This continues until the maximum possible buddy combination + * yields maximum free. 
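[Editor's aside] A compact userspace rendering of that coalescing pass; the 8-leaf array and constants are toy values, not JFS code. Starting from eight fully free map words, the rounds of pairwise merging leave a single leaf describing all 2^8 blocks:

#include <stdio.h>

#define BUDMIN    5
#define NLEAFS    8
#define L2NLEAFS  3

int main(void)
{
        signed char leaf[NLEAFS];
        int l2free, bsize, nextb, i;
        int l2max = L2NLEAFS + BUDMIN;

        for (i = 0; i < NLEAFS; i++)            /* every map word free */
                leaf[i] = BUDMIN;

        for (l2free = BUDMIN, bsize = 1; l2free < l2max;
             l2free++, bsize = nextb) {
                nextb = bsize << 1;
                for (i = 0; i < NLEAFS; i += nextb)
                        if (leaf[i] == l2free && leaf[i + bsize] == l2free) {
                                leaf[i] = l2free + 1;   /* left takes right */
                                leaf[i + bsize] = -1;   /* right gives left */
                        }
        }

        for (i = 0; i < NLEAFS; i++)
                printf("%d ", leaf[i]);
        printf("\n");                   /* 8 -1 -1 -1 -1 -1 -1 -1 */
        return 0;
}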
+ */ + for (l2free = dtp->budmin, bsize = 1; l2free < l2max; + l2free++, bsize = nextb) { + /* get next buddy size == current buddy pair size */ + nextb = bsize << 1; + + /* scan each adjacent buddy pair at current buddy size */ + for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); + i < le32_to_cpu(dtp->nleafs); + i += nextb, cp += nextb) { + /* coalesce if both adjacent buddies are max free */ + if (*cp == l2free && *(cp + bsize) == l2free) { + *cp = l2free + 1; /* left take right */ + *(cp + bsize) = -1; /* right give left */ + } + } + } + + /* + * bubble summary information of leaves up the tree. + * + * Starting at the leaf node level, the four nodes described by + * the higher level parent node are compared for a maximum free and + * this maximum becomes the value of the parent node. + * when all lower level nodes are processed in this fashion then + * move up to the next level (parent becomes a lower level node) and + * continue the process for that level. + */ + for (child = le32_to_cpu(dtp->leafidx), + nparent = le32_to_cpu(dtp->nleafs) >> 2; + nparent > 0; nparent >>= 2, child = parent) { + /* get index of 1st node of parent level */ + parent = (child - 1) >> 2; + + /* set the value of the parent node as the maximum + * of the four nodes of the current level. + */ + for (i = 0, cp = tp + child, cp1 = tp + parent; + i < nparent; i++, cp += 4, cp1++) + *cp1 = TREEMAX(cp); + } + + return (*tp); +} + + +/* + * dbInitDmapCtl() + * + * function: initialize dmapctl page + */ +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) +{ /* start leaf index not covered by range */ + s8 *cp; + + dcp->nleafs = cpu_to_le32(LPERCTL); + dcp->l2nleafs = cpu_to_le32(L2LPERCTL); + dcp->leafidx = cpu_to_le32(CTLLEAFIND); + dcp->height = cpu_to_le32(5); + dcp->budmin = L2BPERDMAP + L2LPERCTL * level; + + /* + * initialize the leaves of current level that were not covered + * by the specified input block range (i.e. the leaves have no + * low level dmapctl or dmap). + */ + cp = &dcp->stree[CTLLEAFIND + i]; + for (; i < LPERCTL; i++) + *cp++ = NOFREE; + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree((struct dmaptree *) dcp)); +} + + +/* + * NAME: dbGetL2AGSize()/ujfs_getagl2size() + * + * FUNCTION: Determine log2(allocation group size) from aggregate size + * + * PARAMETERS: + * nblocks - Number of blocks in aggregate + * + * RETURNS: log2(allocation group size) in aggregate blocks + */ +static int dbGetL2AGSize(s64 nblocks) +{ + s64 sz; + s64 m; + int l2sz; + + if (nblocks < BPERDMAP * MAXAG) + return (L2BPERDMAP); + + /* round up aggregate size to power of 2 */ + m = ((u64) 1 << (64 - 1)); + for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { + if (m & nblocks) + break; + } + + sz = (s64) 1 << l2sz; + if (sz < nblocks) + l2sz += 1; + + /* agsize = roundupSize/max_number_of_ag */ + return (l2sz - L2MAXAG); +} + + +/* + * NAME: dbMapFileSizeToMapSize() + * + * FUNCTION: compute number of blocks the block allocation map file + * can cover from the map file size; + * + * RETURNS: Number of blocks which can be covered by this block map file; + */ + +/* + * maximum number of map pages at each level including control pages + */ +#define MAXL0PAGES (1 + LPERCTL) +#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) +#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) + +/* + * convert number of map pages to the zero origin top dmapctl level + */ +#define BMAPPGTOLEV(npages) \ + (((npages) <= 3 + MAXL0PAGES) ? 0 : \ + ((npages) <= 2 + MAXL1PAGES) ? 
1 : 2) + +s64 dbMapFileSizeToMapSize(struct inode * ipbmap) +{ + struct super_block *sb = ipbmap->i_sb; + s64 nblocks; + s64 npages, ndmaps; + int level, i; + int complete, factor; + + nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; + npages = nblocks >> JFS_SBI(sb)->l2nbperpage; + level = BMAPPGTOLEV(npages); + + /* At each level, accumulate the number of dmap pages covered by + * the number of full child levels below it; + * repeat for the last incomplete child level. + */ + ndmaps = 0; + npages--; /* skip the first global control page */ + /* skip higher level control pages above top level covered by map */ + npages -= (2 - level); + npages--; /* skip top level's control page */ + for (i = level; i >= 0; i--) { + factor = + (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); + complete = (u32) npages / factor; + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : + ((i == 1) ? LPERCTL : 1)); + + /* pages in last/incomplete child */ + npages = (u32) npages % factor; + /* skip incomplete child's level control page */ + npages--; + } + + /* convert the number of dmaps into the number of blocks + * which can be covered by the dmaps; + */ + nblocks = ndmaps << L2BPERDMAP; + + return (nblocks); +} diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h new file mode 100644 index 00000000..6dcb906c --- /dev/null +++ b/fs/jfs/jfs_dmap.h @@ -0,0 +1,314 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DMAP +#define _H_JFS_DMAP + +#include "jfs_txnmgr.h" + +#define BMAPVERSION 1 /* version number */ +#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */ +#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */ +#define LPERDMAP 256 /* num leaves per dmap tree */ +#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ +#define DBWORD 32 /* # of blks covered by a map word */ +#define L2DBWORD 5 /* l2 # of blks covered by a mword */ +#define BUDMIN L2DBWORD /* max free string in a map word */ +#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ +#define L2BPERDMAP 13 /* l2 num of blks per dmap */ +#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ +#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */ +#define LPERCTL 1024 /* num of leaves per dmapctl tree */ +#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */ +#define ROOT 0 /* index of the root of a tree */ +#define NOFREE ((s8) -1) /* no blocks free */ +#define MAXAG 128 /* max number of allocation groups */ +#define L2MAXAG 7 /* l2 max num of AG */ +#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */ +#define BMAPBLKNO 0 /* lblkno of bmap within the map */ + +/* + * maximum l2 number of disk blocks at the various dmapctl levels. 
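[Editor's aside] For orientation, the level sizes defined just below follow directly from these constants: every dmapctl has 2^10 leaves, so each level multiplies the coverage of the one beneath it by 1024. A quick arithmetic check:

#include <stdio.h>

#define L2BPERDMAP 13
#define L2LPERCTL  10

int main(void)
{
        long long dmap = 1LL << L2BPERDMAP;                     /* 8 Ki blocks */
        long long l0   = 1LL << (L2BPERDMAP + 1 * L2LPERCTL);   /* 8 Mi blocks */
        long long l1   = 1LL << (L2BPERDMAP + 2 * L2LPERCTL);   /* 8 Gi blocks */
        long long l2   = 1LL << (L2BPERDMAP + 3 * L2LPERCTL);   /* 8 Ti blocks */

        printf("dmap %lld, L0 %lld, L1 %lld, L2 %lld blocks\n",
               dmap, l0, l1, l2);
        return 0;
}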
+ */ +#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL) +#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL) +#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL) + +/* + * maximum number of disk blocks at the various dmapctl levels. + */ +#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE) +#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE) +#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE) + +#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ + +/* + * determine the maximum free string for four (lower level) nodes + * of the tree. + */ +static inline signed char TREEMAX(signed char *cp) +{ + signed char tmp1, tmp2; + + tmp1 = max(*(cp+2), *(cp+3)); + tmp2 = max(*(cp), *(cp+1)); + + return max(tmp1, tmp2); +} + +/* + * convert disk block number to the logical block number of the dmap + * describing the disk block. s is the log2(number of logical blocks per page) + * + * The calculation figures out how many logical pages are in front of the dmap. + * - the number of dmaps preceding it + * - the number of L0 pages preceding its L0 page + * - the number of L1 pages preceding its L1 page + * - 3 is added to account for the L2, L1, and L0 page for this dmap + * - 1 is added to account for the control page of the map. + */ +#define BLKTODMAP(b,s) \ + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 0 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L0. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding its L1 page + * - 2 is added to account for the L2, and L1 page for this L0 + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL0(b,s) \ + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 1 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L1. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding it + * - 1 is added to account for the L2 page + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL1(b,s) \ + (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the dmapctl + * at the specified level which describes the disk block. + */ +#define BLKTOCTL(b,s,l) \ + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + +/* + * convert aggregate map size to the zero origin dmapctl level of the + * top dmapctl. + */ +#define BMAPSZTOLEV(size) \ + (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2) + +/* convert disk block number to allocation group number. + */ +#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size)) + +/* convert allocation group number to starting disk block + * number. + */ +#define AGTOBLK(a,ip) \ + ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size)) + +/* + * dmap summary tree + * + * dmaptree must be consistent with dmapctl. 
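[Editor's aside] A small sketch of the BLKTODMAP() arithmetic above; the choice of s = 2 (four filesystem blocks per 4K metadata page) is hypothetical. The logical address of a dmap page is just the number of bmap-file pages that precede it, scaled to filesystem blocks:

#include <stdio.h>

#define BLKTODMAP(b, s) \
        ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))

int main(void)
{
        int s = 2;                      /* 4 fs blocks per 4K page */
        long long blocks[] = { 0, 8192, 3 * 8192 };
        int i;

        for (i = 0; i < 3; i++)
                printf("block %lld -> dmap page at logical block %lld\n",
                       blocks[i], (long long) BLKTODMAP(blocks[i], s));
        /* 0 -> 16, 8192 -> 20, 24576 -> 28: consecutive dmaps sit one
         * metadata page (4 blocks) apart */
        return 0;
}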
+ */ +struct dmaptree { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ + s8 budmin; /* 1: min l2 tree leaf value to combine */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ + +/* + * dmap page per 8K blocks bitmap + */ +struct dmap { + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ + +/* + * disk map control page per level. + * + * dmapctl must be consistent with dmaptree. + */ +struct dmapctl { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ + +/* + * common definition for dmaptree within dmap and dmapctl + */ +typedef union dmtree { + struct dmaptree t1; + struct dmapctl t2; +} dmtree_t; + +/* macros for accessing fields within dmtree */ +#define dmt_nleafs t1.nleafs +#define dmt_l2nleafs t1.l2nleafs +#define dmt_leafidx t1.leafidx +#define dmt_height t1.height +#define dmt_budmin t1.budmin +#define dmt_stree t1.stree + +/* + * on-disk aggregate disk allocation map descriptor. + */ +struct dbmap_disk { + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheight; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ + +struct dbmap { + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheight; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ +/* + * in-memory aggregate disk 
allocation map descriptor. + */ +struct bmap { + struct dbmap db_bmap; /* on-disk aggregate map descriptor */ + struct inode *db_ipbmap; /* ptr to aggregate map incore inode */ + struct mutex db_bmaplock; /* aggregate map lock */ + atomic_t db_active[MAXAG]; /* count of active, open files in AG */ + u32 *db_DBmap; +}; + +/* macros for accessing fields within in-memory aggregate map descriptor */ +#define db_mapsize db_bmap.dn_mapsize +#define db_nfree db_bmap.dn_nfree +#define db_agfree db_bmap.dn_agfree +#define db_agsize db_bmap.dn_agsize +#define db_agl2size db_bmap.dn_agl2size +#define db_agwidth db_bmap.dn_agwidth +#define db_agheight db_bmap.dn_agheight +#define db_agstart db_bmap.dn_agstart +#define db_numag db_bmap.dn_numag +#define db_maxlevel db_bmap.dn_maxlevel +#define db_aglevel db_bmap.dn_aglevel +#define db_agpref db_bmap.dn_agpref +#define db_maxag db_bmap.dn_maxag +#define db_maxfreebud db_bmap.dn_maxfreebud +#define db_l2nbperpage db_bmap.dn_l2nbperpage + +/* + * macros for various conversions needed by the allocators. + * blkstol2(), cntlz(), and cnttz() are operating system dependent functions. + */ +/* convert number of blocks to log2 number of blocks, rounding up to + * the next log2 value if blocks is not a l2 multiple. + */ +#define BLKSTOL2(d) (blkstol2(d)) + +/* convert number of leafs to log2 leaf value */ +#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN) + +/* convert leaf index to log2 leaf value */ +#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b)) + +/* convert a block number to a dmap control leaf index */ +#define BLKTOCTLLEAF(b,m) \ + (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m)) + +/* convert log2 leaf value to buddy size */ +#define BUDSIZE(s,m) (1 << ((s) - (m))) + +/* + * external references. + */ +extern int dbMount(struct inode *ipbmap); + +extern int dbUnmount(struct inode *ipbmap, int mounterror); + +extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks); + +extern int dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk); + +extern int dbNextAG(struct inode *ipbmap); + +extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results); + +extern int dbReAlloc(struct inode *ipbmap, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results); + +extern int dbSync(struct inode *ipbmap); +extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); +extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); +extern void dbFinalizeBmap(struct inode *ipbmap); +extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +#endif /* _H_JFS_DMAP */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c new file mode 100644 index 00000000..9197a1b0 --- /dev/null +++ b/fs/jfs/jfs_dtree.c @@ -0,0 +1,4567 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_dtree.c: directory B+-tree manager + * + * B+-tree with variable length key directory: + * + * each directory page is structured as an array of 32-byte + * directory entry slots initialized as a freelist + * to avoid search/compaction of free space at insertion. + * when an entry is inserted, a number of slots are allocated + * from the freelist as required to store variable length data + * of the entry; when the entry is deleted, slots of the entry + * are returned to freelist. + * + * leaf entry stores full name as key and file serial number + * (aka inode number) as data. + * internal/router entry stores sufffix compressed name + * as key and simple extent descriptor as data. + * + * each directory page maintains a sorted entry index table + * which stores the start slot index of sorted entries + * to allow binary search on the table. + * + * directory starts as a root/leaf page in on-disk inode + * inline data area. + * when it becomes full, it starts a leaf of a external extent + * of length of 1 block. each time the first leaf becomes full, + * it is extended rather than split (its size is doubled), + * until its length becoms 4 KBytes, from then the extent is split + * with new 4 Kbyte extent when it becomes full + * to reduce external fragmentation of small directories. + * + * blah, blah, blah, for linear scan of directory in pieces by + * readdir(). + * + * + * case-insensitive directory file system + * + * names are stored in case-sensitive way in leaf entry. + * but stored, searched and compared in case-insensitive (uppercase) order + * (i.e., both search key and entry key are folded for search/compare): + * (note that case-sensitive order is BROKEN in storage, e.g., + * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad + * + * entries which folds to the same key makes up a equivalent class + * whose members are stored as contiguous cluster (may cross page boundary) + * but whose order is arbitrary and acts as duplicate, e.g., + * abc, Abc, aBc, abC) + * + * once match is found at leaf, requires scan forward/backward + * either for, in case-insensitive search, duplicate + * or for, in case-sensitive search, for exact match + * + * router entry must be created/stored in case-insensitive way + * in internal entry: + * (right most key of left page and left most key of right page + * are folded, and its suffix compression is propagated as router + * key in parent) + * (e.g., if split occurs and , trather than + * should be made the router key for the split) + * + * case-insensitive search: + * + * fold search key; + * + * case-insensitive search of B-tree: + * for internal entry, router key is already folded; + * for leaf entry, fold the entry key before comparison. + * + * if (leaf entry case-insensitive match found) + * if (next entry satisfies case-insensitive match) + * return EDUPLICATE; + * if (prev entry satisfies case-insensitive match) + * return EDUPLICATE; + * return match; + * else + * return no match; + * + * serialization: + * target directory inode lock is being held on entry/exit + * of all main directory service routines. 
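[Editor's aside] The duplicate test sketched earlier in this comment can be illustrated with plain strings; ci_lookup() and the sample entries are invented, and strcasecmp() stands in for the UniChar case folding. Because entries are ordered by their folded key, all spellings that fold alike are neighbours, so one extra comparison to each side is enough to flag an ambiguous match:

#include <stdio.h>
#include <string.h>
#include <strings.h>

static const char *entries[] = { "Alpha", "beta", "Beta", "gamma" };
#define NENTRIES 4

static int ci_lookup(const char *key)
{
        for (int i = 0; i < NENTRIES; i++) {
                if (strcasecmp(key, entries[i]) != 0)
                        continue;
                /* found a case-insensitive match; ambiguous if either
                 * neighbour folds to the same key as well */
                if (i + 1 < NENTRIES && strcasecmp(key, entries[i + 1]) == 0)
                        return -1;      /* duplicate */
                if (i > 0 && strcasecmp(key, entries[i - 1]) == 0)
                        return -1;      /* duplicate */
                return i;
        }
        return -2;                      /* no match */
}

int main(void)
{
        printf("ALPHA -> %d\n", ci_lookup("ALPHA"));    /*  0            */
        printf("BETA  -> %d\n", ci_lookup("BETA"));     /* -1, duplicate */
        printf("delta -> %d\n", ci_lookup("delta"));    /* -2, not found */
        return 0;
}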
+ * + * log based recovery: + */ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* dtree split parameter */ +struct dtsplit { + struct metapage *mp; + s16 index; + s16 nslot; + struct component_name *key; + ddata_t *data; + struct pxdlist *pxdlist; +}; + +#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) + +/* get page buffer for specified block address */ +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ + if (!(RC))\ + {\ + if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ + {\ + BT_PUTPAGE(MP);\ + jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ + MP = NULL;\ + RC = -EIO;\ + }\ + }\ +} + +/* for consistency */ +#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) + +/* + * forward references + */ +static int dtSplitUp(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); + +static int dtExtendPage(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitRoot(tid_t tid, struct inode *ip, + struct dtsplit * split, struct metapage ** rmpp); + +static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + dtpage_t * fp, struct btstack * btstack); + +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); + +static int dtReadFirst(struct inode *ip, struct btstack * btstack); + +static int dtReadNext(struct inode *ip, + loff_t * offset, struct btstack * btstack); + +static int dtCompare(struct component_name * key, dtpage_t * p, int si); + +static int ciCompare(struct component_name * key, dtpage_t * p, int si, + int flag); + +static void dtGetKey(dtpage_t * p, int i, struct component_name * key, + int flag); + +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag); + +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock **); + +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index); + +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); + +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); + +static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); + +#define ciToUpper(c) UniStrupr((c)->name) + +/* + * read_index_page() + * + * Reads a page of a directory's index table. + * Having metadata mapped into the directory inode's address space + * presents a multitude of problems. 
We avoid this by mapping to + * the absolute address space outside of the *_metapage routines + */ +static struct metapage *read_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return read_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * get_index_page() + * + * Same as get_index_page(), but get's a new page without reading + */ +static struct metapage *get_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return get_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * find_index() + * + * Returns dtree page containing directory table entry for specified + * index and pointer to its entry. + * + * mp must be released by caller. + */ +static struct dir_table_slot *find_index(struct inode *ip, u32 index, + struct metapage ** mp, s64 *lblock) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + s64 blkno; + s64 offset; + int page_offset; + struct dir_table_slot *slot; + static int maxWarnings = 10; + + if (index < 2) { + if (maxWarnings) { + jfs_warn("find_entry called with index = %d", index); + maxWarnings--; + } + return NULL; + } + + if (index >= jfs_ip->next_index) { + jfs_warn("find_entry called with index >= next_index"); + return NULL; + } + + if (jfs_dirtable_inline(ip)) { + /* + * Inline directory table + */ + *mp = NULL; + slot = &jfs_ip->i_dirtable[index - 2]; + } else { + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << + JFS_SBI(ip->i_sb)->l2nbperpage; + + if (*mp && (*lblock != blkno)) { + release_metapage(*mp); + *mp = NULL; + } + if (!(*mp)) { + *lblock = blkno; + *mp = read_index_page(ip, blkno); + } + if (!(*mp)) { + jfs_err("free_index: error reading directory table"); + return NULL; + } + + slot = + (struct dir_table_slot *) ((char *) (*mp)->data + + page_offset); + } + return slot; +} + +static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, + u32 index) +{ + struct tlock *tlck; + struct linelock *llck; + struct lv *lv; + + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) tlck->lock; + + if (llck->index >= llck->maxcnt) + llck = txLinelock(llck); + lv = &llck->lv[llck->index]; + + /* + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. + */ + lv->offset = ((index - 2) & 511) >> 1; + lv->length = 1; + llck->index++; +} + +/* + * add_index() + * + * Adds an entry to the directory index table. This is used to provide + * each directory entry with a persistent index in which to resume + * directory traversals + */ +static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) +{ + struct super_block *sb = ip->i_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + u64 blkno; + struct dir_table_slot *dirtab_slot; + u32 index; + struct linelock *llck; + struct lv *lv; + struct metapage *mp; + s64 offset; + uint page_offset; + struct tlock *tlck; + s64 xaddr; + + ASSERT(DO_INDEX(ip)); + + if (jfs_ip->next_index < 2) { + jfs_warn("add_index: next_index = %d. Resetting!", + jfs_ip->next_index); + jfs_ip->next_index = 2; + } + + index = jfs_ip->next_index++; + + if (index <= MAX_INLINE_DIRTABLE_ENTRY) { + /* + * i_size reflects size of index table, or 8 bytes per entry. 
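[Editor's aside] The addressing used by find_index() and add_index() is easy to check by hand; l2nbperpage = 2 is a hypothetical mount geometry. Indexes 0 and 1 are reserved, each slot is 8 bytes, and 512 slots fit in a 4K index page:

#include <stdio.h>

#define PSIZE    4096
#define L2PSIZE  12
#define SLOTSIZE 8                      /* bytes per dir_table_slot */

int main(void)
{
        unsigned int index[] = { 2, 513, 514 };
        int l2nbperpage = 2;            /* 4 fs blocks per 4K page */
        int i;

        for (i = 0; i < 3; i++) {
                long long offset = (long long) (index[i] - 2) * SLOTSIZE;
                int page_offset = (int) (offset & (PSIZE - 1));
                long long blkno = ((offset + 1) >> L2PSIZE) << l2nbperpage;

                printf("index %u -> index-table block %lld, byte %d\n",
                       index[i], blkno, page_offset);
        }
        /* index 2 and 513 land in the first index page; index 514 starts
         * the second one, 4 fs blocks further into the table */
        return 0;
}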
+ */ + ip->i_size = (loff_t) (index - 1) << 3; + + /* + * dir table fits inline within inode + */ + dirtab_slot = &jfs_ip->i_dirtable[index-2]; + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + set_cflag(COMMIT_Dirtable, ip); + + return index; + } + if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + struct dir_table_slot temp_table[12]; + + /* + * It's time to move the inline table to an external + * page and begin to build the xtree + */ + if (dquot_alloc_block(ip, sbi->nbperpage)) + goto clean_up; + if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + + /* + * Save the table, we're going to overwrite it with the + * xtree root + */ + memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); + + /* + * Initialize empty x-tree + */ + xtInitRoot(tid, ip); + + /* + * Add the first block to the xtree + */ + if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { + /* This really shouldn't fail */ + jfs_warn("add_index: xtInsert failed!"); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + dbFree(ip, xaddr, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + ip->i_size = PSIZE; + + mp = get_index_page(ip, 0); + if (!mp) { + jfs_err("add_index: get_metapage failed!"); + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + goto clean_up; + } + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) & tlck->lock; + ASSERT(llck->index == 0); + lv = &llck->lv[0]; + + lv->offset = 0; + lv->length = 6; /* tlckDATA slot size is 16 bytes */ + llck->index++; + + memcpy(mp->data, temp_table, sizeof(temp_table)); + + mark_metapage_dirty(mp); + release_metapage(mp); + + /* + * Logging is now directed by xtree tlocks + */ + clear_cflag(COMMIT_Dirtable, ip); + } + + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; + if (page_offset == 0) { + /* + * This will be the beginning of a new page + */ + xaddr = 0; + if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { + jfs_warn("add_index: xtInsert failed!"); + goto clean_up; + } + ip->i_size += PSIZE; + + if ((mp = get_index_page(ip, blkno))) + memset(mp->data, 0, PSIZE); /* Just looks better */ + else + xtTruncate(tid, ip, offset, COMMIT_PWMAP); + } else + mp = read_index_page(ip, blkno); + + if (!mp) { + jfs_err("add_index: get/read_metapage failed!"); + goto clean_up; + } + + lock_index(tid, ip, mp, index); + + dirtab_slot = + (struct dir_table_slot *) ((char *) mp->data + page_offset); + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + mark_metapage_dirty(mp); + release_metapage(mp); + + return index; + + clean_up: + + jfs_ip->next_index--; + + return 0; +} + +/* + * free_index() + * + * Marks an entry to the directory index table as free. 
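add_index() above migrates the in-line table into an external page the first time it overflows, and rolls next_index back at clean_up if any allocation step fails. The sketch below replays only that shape in ordinary userspace C, with malloc/realloc standing in for dbAlloc/xtInsert; the struct names and INLINE_SLOTS value are invented for the illustration.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define INLINE_SLOTS 8		/* stand-in for the in-line table capacity */

struct slot { uint32_t addr; uint8_t flag; };

struct dirtab {
	uint32_t next_index;	/* next index to hand out (starts at 2) */
	struct slot inline_tab[INLINE_SLOTS];
	struct slot *ext;	/* external table once the inline one is full */
	size_t ext_cap;
};

/*
 * Hand out the next persistent index, migrating the inline table to an
 * external buffer the first time it overflows.  Mirrors add_index()'s
 * structure: take the index optimistically, and on any allocation
 * failure roll the counter back and report failure (0).
 */
static uint32_t dirtab_add(struct dirtab *t, struct slot s)
{
	if (t->next_index < 2)		/* same sanity reset as add_index() */
		t->next_index = 2;

	uint32_t index = t->next_index++;
	size_t pos = index - 2;		/* indices 0 and 1 are never stored */

	if (!t->ext && pos < INLINE_SLOTS) {
		t->inline_tab[pos] = s;
		return index;
	}

	if (!t->ext) {
		/* first overflow: move the inline table out, like the
		 * temp_table copy + xtInsert step in add_index() */
		t->ext_cap = INLINE_SLOTS * 2;
		t->ext = malloc(t->ext_cap * sizeof(*t->ext));
		if (!t->ext)
			goto clean_up;
		memcpy(t->ext, t->inline_tab, sizeof(t->inline_tab));
	} else if (pos >= t->ext_cap) {
		struct slot *bigger =
			realloc(t->ext, t->ext_cap * 2 * sizeof(*t->ext));
		if (!bigger)
			goto clean_up;
		t->ext = bigger;
		t->ext_cap *= 2;
	}

	t->ext[pos] = s;
	return index;

clean_up:
	t->next_index--;	/* undo the optimistic allocation */
	return 0;
}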
+ */ +static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) +{ + struct dir_table_slot *dirtab_slot; + s64 lblock; + struct metapage *mp = NULL; + + dirtab_slot = find_index(ip, index, &mp, &lblock); + + if (!dirtab_slot) + return; + + dirtab_slot->flag = DIR_INDEX_FREE; + dirtab_slot->slot = dirtab_slot->addr1 = 0; + dirtab_slot->addr2 = cpu_to_le32(next); + + if (mp) { + lock_index(tid, ip, mp, index); + mark_metapage_dirty(mp); + release_metapage(mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * modify_index() + * + * Changes an entry in the directory index table + */ +static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, + int slot, struct metapage ** mp, s64 *lblock) +{ + struct dir_table_slot *dirtab_slot; + + dirtab_slot = find_index(ip, index, mp, lblock); + + if (!dirtab_slot) + return; + + DTSaddress(dirtab_slot, bn); + dirtab_slot->slot = slot; + + if (*mp) { + lock_index(tid, ip, *mp, index); + mark_metapage_dirty(*mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * read_index() + * + * reads a directory table slot + */ +static int read_index(struct inode *ip, u32 index, + struct dir_table_slot * dirtab_slot) +{ + s64 lblock; + struct metapage *mp = NULL; + struct dir_table_slot *slot; + + slot = find_index(ip, index, &mp, &lblock); + if (!slot) { + return -EIO; + } + + memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); + + if (mp) + release_metapage(mp); + + return 0; +} + +/* + * dtSearch() + * + * function: + * Search for the entry with specified key + * + * parameter: + * + * return: 0 - search result on stack, leaf page pinned; + * errno - I/O error + */ +int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, + struct btstack * btstack, int flag) +{ + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + int base, index, lim; + struct btframe *btsp; + pxd_t *pxd; + int psize = 288; /* initial in-line directory */ + ino_t inumber; + struct component_name ciKey; + struct super_block *sb = ip->i_sb; + + ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS); + if (!ciKey.name) { + rc = -ENOMEM; + goto dtSearch_Exit2; + } + + + /* uppercase search key for c-i directory */ + UniStrcpy(ciKey.name, key->name); + ciKey.namlen = key->namlen; + + /* only uppercase if case-insensitive support is on */ + if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { + ciToUpper(&ciKey); + } + BT_CLR(btstack); /* reset stack */ + + /* init level count for max pages to split */ + btstack->nsplit = 1; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + goto dtSearch_Exit1; + + /* get sorted entry table of the page */ + stbl = DT_GETSTBL(p); + + /* + * binary search with search key K on the current page. 
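dtSearch()'s per-page loop, which follows, is the classic binary search that on a miss leaves base at the smallest entry greater than the key, which is exactly the insertion point dtInsert() needs. The same loop over a plain sorted string array, as a self-contained sketch:

#include <string.h>

/*
 * Probe the middle of the remaining range; on a hit report the index,
 * on a miss leave 'base' pointing at the smallest entry greater than
 * the key, i.e. where a new entry would be inserted.
 */
static int bsearch_or_insertion_point(const char *const *tbl, int nent,
				      const char *key, int *where)
{
	int base = 0, index = 0, lim, cmp;

	for (lim = nent; lim; lim >>= 1) {
		index = base + (lim >> 1);
		cmp = strcmp(key, tbl[index]);
		if (cmp == 0) {
			*where = index;		/* search hit */
			return 1;
		}
		if (cmp > 0) {
			base = index + 1;	/* key sorts after tbl[index] */
			--lim;
		}
	}
	*where = base;				/* search miss: insertion point */
	return 0;
}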
+ */ + for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { + index = base + (lim >> 1); + + if (p->header.flag & BT_LEAF) { + /* uppercase leaf name to compare */ + cmp = + ciCompare(&ciKey, p, stbl[index], + JFS_SBI(sb)->mntflag); + } else { + /* router key is in uppercase */ + + cmp = dtCompare(&ciKey, p, stbl[index]); + + + } + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + inumber = le32_to_cpu( + ((struct ldtentry *) & p->slot[stbl[index]])->inumber); + + /* + * search for JFS_LOOKUP + */ + if (flag == JFS_LOOKUP) { + *data = inumber; + rc = 0; + goto out; + } + + /* + * search for JFS_CREATE + */ + if (flag == JFS_CREATE) { + *data = inumber; + rc = -EEXIST; + goto out; + } + + /* + * search for JFS_REMOVE or JFS_RENAME + */ + if ((flag == JFS_REMOVE || + flag == JFS_RENAME) && + *data != inumber) { + rc = -ESTALE; + goto out; + } + + /* + * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME + */ + /* save search result */ + *data = inumber; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto getChild; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or (maxindex + 1) index. + */ + /* + * search miss - leaf page + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + /* + * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME + */ + if (flag == JFS_LOOKUP || flag == JFS_REMOVE || + flag == JFS_RENAME) { + rc = -ENOENT; + goto out; + } + + /* + * search for JFS_CREATE|JFS_FINDDIR: + * + * save search result + */ + *data = 0; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* + * search miss - internal page + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + getChild: + /* update max. number of pages to split */ + if (BT_STACK_FULL(btstack)) { + /* Something's corrupted, mark filesystem dirty so + * chkdsk will fix it. 
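The btstack that dtSearch() fills here is a bounded array of (block number, entry index) frames, one per internal level, with overflow treated as corruption. A minimal stand-alone version of the BT_CLR/BT_PUSH/BT_POP/BT_STACK_FULL idiom, with an illustrative depth rather than the kernel's constant:

#include <stdbool.h>
#include <stdint.h>

#define BTSTACK_DEPTH 8		/* illustrative; deep enough for any sane dtree */

struct btframe_s { int64_t bn; int index; };

struct btstack_s {
	int top;
	struct btframe_s frame[BTSTACK_DEPTH];
};

/* Reset the stack before a descent, as BT_CLR() does. */
static void bt_clr(struct btstack_s *s) { s->top = 0; }

static bool bt_stack_full(const struct btstack_s *s)
{
	return s->top == BTSTACK_DEPTH;
}

/* Remember the parent (page, entry) visited on the way down ... */
static bool bt_push(struct btstack_s *s, int64_t bn, int index)
{
	if (bt_stack_full(s))
		return false;	/* dtSearch treats this as corruption */
	s->frame[s->top].bn = bn;
	s->frame[s->top].index = index;
	s->top++;
	return true;
}

/* ... and replay them bottom-up when a split or delete propagates. */
static struct btframe_s *bt_pop(struct btstack_s *s)
{
	return s->top ? &s->frame[--s->top] : NULL;
}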
+ */ + jfs_error(sb, "stack overrun in dtSearch!"); + BT_STACK_DUMP(btstack); + rc = -EIO; + goto out; + } + btstack->nsplit++; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + pxd = (pxd_t *) & p->slot[stbl[index]]; + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + out: + DT_PUTPAGE(mp); + + dtSearch_Exit1: + + kfree(ciKey.name); + + dtSearch_Exit2: + + return rc; +} + + +/* + * dtInsert() + * + * function: insert an entry to directory tree + * + * parameter: + * + * return: 0 - success; + * errno - failure; + */ +int dtInsert(tid_t tid, struct inode *ip, + struct component_name * name, ino_t * fsn, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + dtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index; + struct dtsplit split; /* split information */ + ddata_t data; + struct dt_lock *dtlck; + int n; + struct tlock *tlck; + struct lv *lv; + + /* + * retrieve search result + * + * dtSearch() returns (leaf page pinned, index at which to insert). + * n.b. dtSearch() may return index of (maxindex + 1) of + * the full page. + */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* + * insert entry for new key + */ + if (DO_INDEX(ip)) { + if (JFS_IP(ip)->next_index == DIREND) { + DT_PUTPAGE(mp); + return -EMLINK; + } + n = NDTLEAF(name->namlen); + data.leaf.tid = tid; + data.leaf.ip = ip; + } else { + n = NDTLEAF_LEGACY(name->namlen); + data.leaf.ip = NULL; /* signifies legacy directory format */ + } + data.leaf.ino = *fsn; + + /* + * leaf page does not have enough room for new entry: + * + * extend/split the leaf page; + * + * dtSplitUp() will insert the entry and unpin the leaf page. 
+ */ + if (n > p->header.freecnt) { + split.mp = mp; + split.index = index; + split.nslot = n; + split.key = name; + split.data = &data; + rc = dtSplitUp(tid, ip, &split, btstack); + return rc; + } + + /* + * leaf page does have enough room for new entry: + * + * insert the new data entry into the leaf page; + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + dtInsertEntry(p, index, name, &data, &dtlck); + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + n = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + n; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + + +/* + * dtSplitUp() + * + * function: propagate insertion bottom up; + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * leaf page unpinned; + */ +static int dtSplitUp(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int rc = 0; + struct metapage *smp; + dtpage_t *sp; /* split page */ + struct metapage *rmp; + dtpage_t *rp; /* new right page split from sp */ + pxd_t rpxd; /* new right page extent descriptor */ + struct metapage *lmp; + dtpage_t *lp; /* left child page */ + int skip; /* index of entry of insertion */ + struct btframe *parent; /* parent page entry on traverse stack */ + s64 xaddr, nxaddr; + int xlen, xsize; + struct pxdlist pxdlist; + pxd_t *pxd; + struct component_name key = { 0, NULL }; + ddata_t *data = split->data; + int n; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int quota_allocation = 0; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS); + if (!key.name) { + DT_PUTPAGE(smp); + rc = -ENOMEM; + goto dtSplitUp_Exit; + } + + /* + * split leaf page + * + * The split routines insert the new entry, and + * acquire txLock as appropriate. 
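Both dtInsert() above and the split routines below linelock only the slots of the sorted-entry table (stbl) that the insert actually shifted. A small helper showing that arithmetic, assuming the 32-byte slot size behind L2DTSLOTSIZE == 5 and a stand-in for struct lv:

/*
 * The stbl packs 32 one-byte entries per 32-byte slot.  Inserting at
 * 'index' shifts every stbl entry from 'index' to the end, so only the
 * slots spanning [index, nextindex - 1] need to be logged.  Same
 * arithmetic as the "linelock stbl" blocks in dtInsert()/dtSplitPage().
 */
#define L2DTSLOTSIZE 5

struct lock_range { int offset; int length; };	/* in 32-byte slots */

static struct lock_range
stbl_lock_range(int stblindex, int nextindex_after_insert, int index)
{
	int first = index >> L2DTSLOTSIZE;
	int last  = (nextindex_after_insert - 1) >> L2DTSLOTSIZE;
	struct lock_range r = {
		.offset = stblindex + first,
		.length = last - first + 1,
	};
	return r;
}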
+ */ + /* + * split root leaf page: + */ + if (sp->header.flag & BT_ROOT) { + /* + * allocate a single extent child page + */ + xlen = 1; + n = sbi->bsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ + if (n <= split->nslot) + xlen++; + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { + DT_PUTPAGE(smp); + goto freeKeyName; + } + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + split->pxdlist = &pxdlist; + rc = dtSplitRoot(tid, ip, split, &rmp); + + if (rc) + dbFree(ip, xaddr, xlen); + else + DT_PUTPAGE(rmp); + + DT_PUTPAGE(smp); + + if (!DO_INDEX(ip)) + ip->i_size = xlen << sbi->l2bsize; + + goto freeKeyName; + } + + /* + * extend first leaf page + * + * extend the 1st extent if less than buffer page size + * (dtExtendPage() reurns leaf page unpinned) + */ + pxd = &sp->header.self; + xlen = lengthPXD(pxd); + xsize = xlen << sbi->l2bsize; + if (xsize < PSIZE) { + xaddr = addressPXD(pxd); + n = xsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + if ((n + sp->header.freecnt) <= split->nslot) + n = xlen + (xlen << 1); + else + n = xlen; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, n); + if (rc) + goto extendOut; + quota_allocation += n; + + if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, + (s64) n, &nxaddr))) + goto extendOut; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, nxaddr) + PXDlength(pxd, xlen + n); + split->pxdlist = &pxdlist; + if ((rc = dtExtendPage(tid, ip, split, btstack))) { + nxaddr = addressPXD(pxd); + if (xaddr != nxaddr) { + /* free relocated extent */ + xlen = lengthPXD(pxd); + dbFree(ip, nxaddr, (s64) xlen); + } else { + /* free extended delta */ + xlen = lengthPXD(pxd) - n; + xaddr = addressPXD(pxd) + xlen; + dbFree(ip, xaddr, (s64) n); + } + } else if (!DO_INDEX(ip)) + ip->i_size = lengthPXD(pxd) << sbi->l2bsize; + + + extendOut: + DT_PUTPAGE(smp); + goto freeKeyName; + } + + /* + * split leaf page into and a new right page . + * + * return pinned and its extent descriptor + */ + /* + * allocate new directory page extent and + * new index page(s) to cover page split(s) + * + * allocation hint: ? + */ + n = btstack->nsplit; + pxdlist.maxnpxd = pxdlist.npxd = 0; + xlen = sbi->nbperpage; + for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + pxdlist.maxnpxd++; + continue; + } + + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + split->pxdlist = &pxdlist; + if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + if (!DO_INDEX(ip)) + ip->i_size += PSIZE; + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. 
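The propagation just described is a loop over the saved parent frames: each parent either absorbs the new router entry or must split in turn. A control-flow sketch with a toy page model (capacity in abstract units), not the real dtSplitUp() code:

#include <stddef.h>

/* Toy page: only track how full it is; a router entry costs entry_cost. */
struct page_model { int used; int cap; };

struct frame { struct page_model *page; };

/*
 * After the leaf split, each popped parent either absorbs the new
 * router entry (loop exits) or must itself be split, pushing the
 * problem one level further up.  Returns how many levels split.
 */
static int propagate_split(struct frame *stack, int depth, int entry_cost)
{
	int splits = 1;			/* the leaf already split */

	while (depth-- > 0) {
		struct page_model *parent = stack[depth].page;

		if (parent->used + entry_cost <= parent->cap) {
			parent->used += entry_cost;	/* router entry fits */
			break;				/* stop propagating */
		}
		/* parent is full: "split" it and keep walking up */
		parent->used = parent->cap / 2 + entry_cost;
		splits++;
	}
	return splits;
}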
+ * + * There are a maximum of 4 pages pinned at any time: + * two children, left parent and right parent (when the parent splits). + * keep the child pages pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages (, ) pinned */ + lmp = smp; + lp = sp; + + /* + * insert router entry in parent for new right child page + */ + /* get the parent page */ + DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + goto splitOut; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * compute the key for the router entry + * + * key suffix compression: + * for internal pages that have leaf pages as children, + * retain only what's needed to distinguish between + * the new entry and the entry on the page to its left. + * If the keys compare equal, retain the entire key. + * + * note that compression is performed only at computing + * router key at the lowest internal level. + * further compression of the key between pairs of higher + * level internal pages loses too much information and + * the search may fail. + * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} + * results in two adjacent parent entries (a)(xx). + * if split occurs between these two entries, and + * if compression is applied, the router key of parent entry + * of right page (x) will divert search for x into right + * subtree and miss x in the left subtree.) + * + * the entire key must be retained for the next-to-leftmost + * internal key at any level of the tree, or search may fail + * (e.g., ?) + */ + switch (rp->header.flag & BT_TYPE) { + case BT_LEAF: + /* + * compute the length of prefix for suffix compression + * between last entry of left page and first entry + * of right page + */ + if ((sp->header.flag & BT_ROOT && skip > 1) || + sp->header.prev != 0 || skip > 1) { + /* compute uppercase router prefix key */ + rc = ciGetLeafPrefixKey(lp, + lp->header.nextindex-1, + rp, 0, &key, + sbi->mntflag); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + DT_PUTPAGE(smp); + goto splitOut; + } + } else { + /* next to leftmost entry of + lowest internal level */ + + /* compute uppercase router key */ + dtGetKey(rp, 0, &key, sbi->mntflag); + key.name[key.namlen] = 0; + + if ((sbi->mntflag & JFS_OS2) == JFS_OS2) + ciToUpper(&key); + } + + n = NDTINTERNAL(key.namlen); + break; + + case BT_INTERNAL: + dtGetKey(rp, 0, &key, sbi->mntflag); + n = NDTINTERNAL(key.namlen); + break; + + default: + jfs_err("dtSplitUp(): UFO!"); + break; + } + + /* unpin left child page */ + DT_PUTPAGE(lmp); + + /* + * compute the data for the router entry + */ + data->xd = rpxd; /* child page xd */ + + /* + * parent page is full - split the parent page + */ + if (n > sp->header.freecnt) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->nslot = n; + split->key = &key; + /* split->data = data; */ + + /* unpin right child page */ + DT_PUTPAGE(rmp); + + /* The split routines insert the new entry, + * acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? 
+ dtSplitRoot(tid, ip, split, &rmp) : + dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); + if (rc) { + DT_PUTPAGE(smp); + goto splitOut; + } + + /* smp and rmp are pinned */ + } + /* + * parent page is not full - insert router entry in parent page + */ + else { + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the parent page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root parent page */ + if (!(sp->header.flag & BT_ROOT)) { + lv++; + n = skip >> L2DTSLOTSIZE; + lv->offset = sp->header.stblindex + n; + lv->length = + ((sp->header.nextindex - + 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + dtInsertEntry(sp, skip, &key, data, &dtlck); + + /* exit propagate up */ + break; + } + } + + /* unpin current split and its right page */ + DT_PUTPAGE(smp); + DT_PUTPAGE(rmp); + + /* + * free remaining extents allocated for split + */ + splitOut: + n = pxdlist.npxd; + pxd = &pxdlist.pxd[n]; + for (; n < pxdlist.maxnpxd; n++, pxd++) + dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); + + freeKeyName: + kfree(key.name); + + /* Rollback quota allocation */ + if (rc && quota_allocation) + dquot_free_block(ip, quota_allocation); + + dtSplitUp_Exit: + + return rc; +} + + +/* + * dtSplitPage() + * + * function: Split a non-root page of a btree. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return split and new page pinned; + */ +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) +{ + int rc = 0; + struct metapage *smp; + dtpage_t *sp; + struct metapage *rmp; + dtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + dtpage_t *p; + s64 nextbn; + struct pxdlist *pxdlist; + pxd_t *pxd; + int skip, nextindex, half, left, nxt, off, si; + struct ldtentry *ldtentry; + struct idtentry *idtentry; + u8 *stbl; + struct dtslot *f; + int fsi, stblsize; + int n; + struct dt_lock *sdtlck, *rdtlck; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *slv, *rlv, *lv; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* + * allocate the new right page for the split + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. 
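dtSplitUp() pre-allocates one extent per level that might split (btstack->nsplit) and, at splitOut, hands back whatever the splits did not consume. The same reserve-then-release pattern as a self-contained sketch, with a toy allocator in place of dbAlloc()/dbFree() and a small fixed list standing in for the pxdlist:

#include <stdint.h>

#define MAX_PREALLOC 8		/* stand-in for the pxdlist capacity */

struct extent { int64_t addr; int len; };

struct extent_list {
	int maxnpxd;		/* how many were pre-allocated */
	int npxd;		/* how many the splits actually consumed */
	struct extent pxd[MAX_PREALLOC];
};

/* Toy allocator state standing in for the block map. */
static int64_t next_free_block = 1000;

static int blk_alloc(int len, int64_t *addr)
{
	*addr = next_free_block;
	next_free_block += len;
	return 0;
}

static void blk_free(int64_t addr, int len)
{
	(void)addr; (void)len;		/* nothing to do in the toy model */
}

/* Reserve one extent per level that might split, as dtSplitUp() does. */
static int prealloc_extents(struct extent_list *list, int nsplit, int len)
{
	list->maxnpxd = list->npxd = 0;
	for (int i = 0; i < nsplit && i < MAX_PREALLOC; i++) {
		if (blk_alloc(len, &list->pxd[i].addr))
			return -1;	/* caller frees what was obtained */
		list->pxd[i].len = len;
		list->maxnpxd++;
	}
	return 0;
}

/* Give back the extents no split consumed, like the splitOut loop. */
static void release_unused_extents(struct extent_list *list)
{
	for (int i = list->npxd; i < list->maxnpxd; i++)
		blk_free(list->pxd[i].addr, list->pxd[i].len);
}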
*/ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + rdtlck = (struct dt_lock *) & tlck->lock; + + rp = (dtpage_t *) rmp->data; + *rpp = rp; + rp->header.self = *pxd; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the split page + * + * action: + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + sdtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of split page */ + ASSERT(sdtlck->index == 0); + slv = & sdtlck->lv[0]; + slv->offset = 0; + slv->length = 1; + sdtlck->index++; + + /* + * initialize/update sibling pointers between sp and rp + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + /* + * initialize new right page + */ + rp->header.flag = sp->header.flag; + + /* compute sorted entry table at start of extent data area */ + rp->header.nextindex = 0; + rp->header.stblindex = 1; + + n = PSIZE >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ + + /* init freelist */ + fsi = rp->header.stblindex + stblsize; + rp->header.freelist = fsi; + rp->header.freecnt = rp->header.maxslot - fsi; + + /* + * sequential append at tail: append without split + * + * If splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. Adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * If we're wrong it's no big deal, we'll just do the split the right + * way next time. + * (It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, + * but it's not. Be my guest.) + */ + if (nextbn == 0 && split->index == sp->header.nextindex) { + /* linelock header + stbl (first slot) of new page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 2; + rdtlck->index++; + + /* + * initialize freelist of new right page + */ + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* insert entry at the first entry of the new right page */ + dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); + + goto out; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update prev pointer of previous right sibling page; + */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + discard_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of previous right sibling page */ + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(rbn); + + DT_PUTPAGE(mp); + } + + /* + * split the data between the split and right pages. 
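The sequential-append test that dtSplitPage() applies below is worth calling out: splitting the rightmost leaf at its end means the directory is being filled in order, so an empty right page beats a 50/50 split. A sketch of just that predicate, with the fields passed in explicitly:

#include <stdbool.h>
#include <stdint.h>

/*
 * True when the page being split is the rightmost leaf on its level
 * (no next sibling) and the new entry would land after its last
 * entry.  In that case dtSplitPage() puts only the new entry on the
 * fresh right page, keeping sequentially built directories nearly full.
 */
static bool split_is_sequential_append(int64_t next_sibling_bn,
				       int insert_index, int nextindex)
{
	return next_sibling_bn == 0 && insert_index == nextindex;
}

When the predicate does not hold, dtSplitPage() instead walks the sorted-entry table to find a roughly half-full split point, as the code that follows shows.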
+ */ + skip = split->index; + half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ + left = 0; + + /* + * compute fill factor for split pages + * + * traces the next entry to move to rp + * traces the next entry to stay in sp + */ + stbl = (u8 *) & sp->slot[sp->header.stblindex]; + nextindex = sp->header.nextindex; + for (nxt = off = 0; nxt < nextindex; ++off) { + if (off == skip) + /* check for fill factor with new entry size */ + n = split->nslot; + else { + si = stbl[nxt]; + switch (sp->header.flag & BT_TYPE) { + case BT_LEAF: + ldtentry = (struct ldtentry *) & sp->slot[si]; + if (DO_INDEX(ip)) + n = NDTLEAF(ldtentry->namlen); + else + n = NDTLEAF_LEGACY(ldtentry-> + namlen); + break; + + case BT_INTERNAL: + idtentry = (struct idtentry *) & sp->slot[si]; + n = NDTINTERNAL(idtentry->namlen); + break; + + default: + break; + } + + ++nxt; /* advance to next entry to move in sp */ + } + + left += n; + if (left >= half) + break; + } + + /* poins to the 1st entry to move */ + + /* + * move entries to right page + * + * dtMoveEntry() initializes rp and reserves entry for insertion + * + * split page moved out entries are linelocked; + * new/right page moved in entries are linelocked; + */ + /* linelock header + stbl of new right page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 5; + rdtlck->index++; + + dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); + + sp->header.nextindex = nxt; + + /* + * finalize freelist of new right page + */ + fsi = rp->header.freelist; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + + /* + * the skipped index was on the left page, + */ + if (skip <= off) { + /* insert the new entry in the split page */ + dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); + + /* linelock stbl of split page */ + if (sdtlck->index >= sdtlck->maxcnt) + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[sdtlck->index]; + n = skip >> L2DTSLOTSIZE; + slv->offset = sp->header.stblindex + n; + slv->length = + ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + sdtlck->index++; + } + /* + * the skipped index was on the right page, + */ + else { + /* adjust the skip index to reflect the new position */ + skip -= nxt; + + /* insert the new entry in the right page */ + dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); + } + + out: + *rmpp = rmp; + *rpxdp = *pxd; + + return rc; +} + + +/* + * dtExtendPage() + * + * function: extend 1st/only directory leaf page + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return extended page pinned; + */ +static int dtExtendPage(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct super_block *sb = ip->i_sb; + int rc; + struct metapage *smp, *pmp, *mp; + dtpage_t *sp, *pp; + struct pxdlist *pxdlist; + pxd_t *pxd, *tpxd; + int xlen, xsize; + int newstblindex, newstblsize; + int oldstblindex, oldstblsize; + int fsi, last; + struct dtslot *f; + struct btframe *parent; + int n; + struct dt_lock *dtlck; + s64 xaddr, txaddr; + struct tlock *tlck; + struct pxd_lock *pxdlock; + 
struct lv *lv; + uint type; + struct ldtentry *ldtentry; + u8 *stbl; + + /* get page to extend */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* get parent/root page */ + parent = BT_POP(btstack); + DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); + if (rc) + return (rc); + + /* + * extend the extent + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + + xaddr = addressPXD(pxd); + tpxd = &sp->header.self; + txaddr = addressPXD(tpxd); + /* in-place extension */ + if (xaddr == txaddr) { + type = tlckEXTEND; + } + /* relocation */ + else { + type = tlckNEW; + + /* save moved extent descriptor for later free */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = sp->header.self; + pxdlock->index = 1; + + /* + * Update directory index table to reflect new page address + */ + if (DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(sp); + for (n = 0; n < sp->header.nextindex; n++) { + ldtentry = + (struct ldtentry *) & sp->slot[stbl[n]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + xaddr, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + /* + * extend the page + */ + sp->header.self = *pxd; + + jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the extended/leaf page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | type); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[0]; + + /* update buffer extent descriptor of extended page */ + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + + /* + * copy old stbl to new stbl at start of extended area + */ + oldstblindex = sp->header.stblindex; + oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; + newstblindex = sp->header.maxslot; + n = xsize >> L2DTSLOTSIZE; + newstblsize = (n + 31) >> L2DTSLOTSIZE; + memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], + sp->header.nextindex); + + /* + * in-line extension: linelock old area of extended page + */ + if (type == tlckEXTEND) { + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + lv++; + + /* linelock new stbl of extended page */ + lv->offset = newstblindex; + lv->length = newstblsize; + } + /* + * relocation: linelock whole relocated area + */ + else { + lv->offset = 0; + lv->length = sp->header.maxslot + newstblsize; + } + + dtlck->index++; + + sp->header.maxslot = n; + sp->header.stblindex = newstblindex; + /* sp->header.nextindex remains the same */ + + /* + * add old stbl region at head of freelist + */ + fsi = oldstblindex; + f = &sp->slot[fsi]; + last = sp->header.freelist; + for (n = 0; n < oldstblsize; n++, fsi++, f++) { + f->next = last; + last = fsi; + } + sp->header.freelist = last; + sp->header.freecnt += oldstblsize; + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = newstblindex + newstblsize; + f = &sp->slot[fsi]; + for (fsi++; fsi < sp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + sp->header.freelist = n; + else { + do { + f = &sp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + sp->header.freecnt += sp->header.maxslot - n; + + /* + * insert the new entry + */ + dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); + + 
BT_MARK_DIRTY(pmp, ip); + /* + * linelock any freeslots residing in old extent + */ + if (type == tlckEXTEND) { + n = sp->header.maxslot >> 2; + if (sp->header.freelist < n) + dtLinelockFreelist(sp, n, &dtlck); + } + + /* + * update parent entry on the parent/root page + */ + /* + * acquire a transaction lock on the parent/root page + */ + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* linelock parent entry - 1st slot */ + lv->offset = 1; + lv->length = 1; + dtlck->index++; + + /* update the parent pxd for page extension */ + tpxd = (pxd_t *) & pp->slot[1]; + *tpxd = *pxd; + + DT_PUTPAGE(pmp); + return 0; +} + + +/* + * dtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return new page pinned; + */ +static int dtSplitRoot(tid_t tid, + struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) +{ + struct super_block *sb = ip->i_sb; + struct metapage *smp; + dtroot_t *sp; + struct metapage *rmp; + dtpage_t *rp; + s64 rbn; + int xlen; + int xsize; + struct dtslot *f; + s8 *stbl; + int fsi, stblsize, n; + struct idtentry *s; + pxd_t *ppxd; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int rc; + + /* get split root page */ + smp = split->mp; + sp = &JFS_IP(ip)->i_dtroot; + + /* + * allocate/initialize a single (right) child page + * + * N.B. at first split, a one (or two) block to fit new entry + * is allocated; at subsequent split, a full page is allocated; + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + rmp = get_metapage(ip, rbn, xsize, 1); + if (!rmp) + return -EIO; + + rp = rmp->data; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + + rp->header.flag = + (sp->header.flag & BT_LEAF) ? 
BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * move in-line root page into new right page extent + */ + /* linelock header + copied entries + new stbl (1st slot) in new page */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 10; /* 1 + 8 + 1 */ + dtlck->index++; + + n = xsize >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; + + /* copy old stbl to new stbl at start of extended area */ + rp->header.stblindex = DTROOTMAXSLOT; + stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; + memcpy(stbl, sp->header.stbl, sp->header.nextindex); + rp->header.nextindex = sp->header.nextindex; + + /* copy old data area to start of new data area */ + memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = DTROOTMAXSLOT + stblsize; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + rp->header.freelist = n; + else { + rp->header.freelist = fsi; + + do { + f = &rp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + struct metapage *mp = NULL; + struct ldtentry *ldtentry; + + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); + + /* + * reset parent/root page + * + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the root page (in-memory inode) + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + /* update page header of root */ + if (sp->header.flag & BT_LEAF) { + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + } + + /* init the first entry */ + s = (struct idtentry *) & sp->slot[DTENTRYSTART]; + ppxd = (pxd_t *) s; + *ppxd = *pxd; + s->next = -1; + s->namlen = 0; + + stbl = sp->header.stbl; + stbl[0] = DTENTRYSTART; + sp->header.nextindex = 1; + + /* init freelist */ + fsi = DTENTRYSTART + 1; + f = &sp->slot[fsi]; + + /* init free region of remaining area */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + sp->header.freelist = DTENTRYSTART + 1; + sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); + + *rmpp = rmp; + + return 0; +} + + +/* + * dtDelete() + * + * function: delete the entry(s) referenced by a key. 
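dtSplitRoot() above never moves the root: it copies the root's slots into a freshly allocated child and rebuilds the root as an internal page with a single router entry. The same shape on a toy in-memory node, with invented capacities:

#include <stdint.h>
#include <string.h>

#define ROOT_CAP  8		/* toy sizes; the real root is far smaller */
#define PAGE_CAP 64		/* than an external page                   */

struct toy_page {
	int      is_leaf;
	int      nkeys;
	uint32_t keys[PAGE_CAP];
};

struct toy_root {
	int      is_leaf;
	int      nkeys;
	uint32_t keys[ROOT_CAP];
	struct toy_page *child;	/* only child right after the split */
};

/*
 * The root lives inside the inode and cannot move, so its contents are
 * copied wholesale into a fresh child page and the root becomes an
 * internal node with one router entry pointing at that child.
 */
static void split_root(struct toy_root *root, struct toy_page *child)
{
	child->is_leaf = root->is_leaf;
	child->nkeys = root->nkeys;
	memcpy(child->keys, root->keys, root->nkeys * sizeof(root->keys[0]));

	root->is_leaf = 0;		/* root becomes internal */
	root->nkeys = 1;		/* single entry: the new child */
	root->keys[0] = 0;		/* leftmost key is never compared */
	root->child = child;
}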
+ * + * parameter: + * + * return: + */ +int dtDelete(tid_t tid, + struct inode *ip, struct component_name * key, ino_t * ino, int flag) +{ + int rc = 0; + s64 bn; + struct metapage *mp, *imp; + dtpage_t *p; + int index; + struct btstack btstack; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int i; + struct ldtentry *ldtentry; + u8 *stbl; + u32 table_index, next_index; + struct metapage *nmp; + dtpage_t *np; + + /* + * search for the entry to delete: + * + * dtSearch() returns (leaf page pinned, index at which to delete). + */ + if ((rc = dtSearch(ip, key, ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* + * We need to find put the index of the next entry into the + * directory index table in order to resume a readdir from this + * entry. + */ + if (DO_INDEX(ip)) { + stbl = DT_GETSTBL(p); + ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; + table_index = le32_to_cpu(ldtentry->index); + if (index == (p->header.nextindex - 1)) { + /* + * Last entry in this leaf page + */ + if ((p->header.flag & BT_ROOT) + || (p->header.next == 0)) + next_index = -1; + else { + /* Read next leaf page */ + DT_GETPAGE(ip, le64_to_cpu(p->header.next), + nmp, PSIZE, np, rc); + if (rc) + next_index = -1; + else { + stbl = DT_GETSTBL(np); + ldtentry = + (struct ldtentry *) & np-> + slot[stbl[0]]; + next_index = + le32_to_cpu(ldtentry->index); + DT_PUTPAGE(nmp); + } + } + } else { + ldtentry = + (struct ldtentry *) & p->slot[stbl[index + 1]]; + next_index = le32_to_cpu(ldtentry->index); + } + free_index(tid, ip, table_index, next_index); + } + /* + * the leaf page becomes empty, delete the page + */ + if (p->header.nextindex == 1) { + /* delete empty page */ + rc = dtDeleteUp(tid, ip, mp, p, &btstack); + } + /* + * the leaf page has other entries remaining: + * + * delete the entry from the leaf page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* + * Do not assume that dtlck->index will be zero. During a + * rename within a directory, this transaction may have + * modified this page already when adding the new entry. 
+ */ + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the leaf entry */ + dtDeleteEntry(p, index, &dtlck); + + /* + * Update directory index table for entries moved in stbl + */ + if (DO_INDEX(ip) && index < p->header.nextindex) { + s64 lblock; + + imp = NULL; + stbl = DT_GETSTBL(p); + for (i = index; i < p->header.nextindex; i++) { + ldtentry = + (struct ldtentry *) & p->slot[stbl[i]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + bn, i, &imp, &lblock); + } + if (imp) + release_metapage(imp); + } + + DT_PUTPAGE(mp); + } + + return rc; +} + + +/* + * dtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int dtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + dtpage_t *p; + int index, nextindex; + int xlen; + struct btframe *parent; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + struct pxd_lock *pxdlock; + int i; + + /* + * keep the root leaf page which has become empty + */ + if (BT_IS_ROOT(fmp)) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(fmp); + + return 0; + } + + /* + * free the non-root leaf page + */ + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor, and + * the buffer page is freed; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = fp->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, fp))) { + BT_PUTPAGE(fmp); + return rc; + } + + xlen = lengthPXD(&fp->header.self); + + /* Free quota allocation. */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the directory tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* pin the parent page */ + DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * free the extent of the child page deleted + */ + index = parent->index; + + /* + * delete the entry for the child page from parent + */ + nextindex = p->header.nextindex; + + /* + * the parent has the single entry being deleted: + * + * free the parent page which has become empty. 
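dtDeleteUp()'s walk over the saved parent frames mirrors dtSplitUp()'s, in reverse: free each parent that held only the deleted child, re-initialize the root if the emptiness reaches it, and stop at the first parent that keeps other entries. A control-flow sketch with a toy frame type:

#include <stdbool.h>

struct del_frame { int nentries; bool is_root; };

/*
 * After an empty leaf is freed, walk the parent frames saved during
 * the search.  A parent whose only entry was the freed child is freed
 * as well (or re-initialized if it is the root); the walk stops at the
 * first parent that keeps other entries, where just the router entry
 * is removed.  Returns how many pages were freed.
 */
static int propagate_delete(struct del_frame *stack, int depth)
{
	int freed = 1;			/* the empty leaf itself */

	while (depth-- > 0) {
		struct del_frame *parent = &stack[depth];

		if (parent->nentries == 1) {
			if (parent->is_root) {
				parent->nentries = 0;	/* dtInitRoot() */
				return freed;
			}
			freed++;			/* free this page too */
			continue;
		}
		parent->nentries--;	/* drop the router entry and stop */
		break;
	}
	return freed;
}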
+ */ + if (nextindex == 1) { + /* + * keep the root internal page which has become empty + */ + if (p->header.flag & BT_ROOT) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(mp); + + return 0; + } + /* + * free the parent page + */ + else { + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + */ + tlck = + txMaplock(tid, ip, + tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = p->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, p))) { + DT_PUTPAGE(mp); + return rc; + } + + xlen = lengthPXD(&p->header.self); + + /* Free quota allocation */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + + /* + * the parent has other entries remaining: + * + * delete the router entry from the parent page. + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the page + * + * action: router entry deletion + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the router entry */ + dtDeleteEntry(p, index, &dtlck); + + /* reset key of new leftmost entry of level (for consistency) */ + if (index == 0 && + ((p->header.flag & BT_ROOT) || p->header.prev == 0)) + dtTruncateEntry(p, 0, &dtlck); + + /* unpin the parent page */ + DT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + if (!DO_INDEX(ip)) + ip->i_size -= PSIZE; + + return 0; +} + +#ifdef _NOTYET +/* + * NAME: dtRelocate() + * + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. + */ +int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, + s64 nxaddr) +{ + int rc = 0; + struct metapage *mp, *pmp, *lmp, *rmp; + dtpage_t *p, *pp, *rp = 0, *lp= 0; + s64 bn; + int index; + struct btstack btstack; + pxd_t *pxd; + s64 oxaddr, nextbn, prevbn; + int xlen, xsize; + struct tlock *tlck; + struct dt_lock *dtlck; + struct pxd_lock *pxdlock; + s8 *stbl; + struct lv *lv; + + oxaddr = addressPXD(opxd); + xlen = lengthPXD(opxd); + + jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", + (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, + xlen); + + /* + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; + */ + rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); + if (rc) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("dtRelocate: parent router entry validated."); + + /* + * 2. 
relocate the target dtpage + */ + /* read in the target page from src extent */ + DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + /* release the pinned parent page */ + DT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + if (rmp) + DT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling dtpages if any; + */ + if (lmp) { + tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + lp->header.next = cpu_to_le64(nxaddr); + DT_PUTPAGE(lmp); + } + + if (rmp) { + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + rp->header.prev = cpu_to_le64(nxaddr); + DT_PUTPAGE(rmp); + } + + /* + * update the target dtpage to be relocated + * + * write LOG_REDOPAGE of LOG_NEW type for dst page + * for the whole target page (logredo() will apply + * after image and update bmap for allocation of the + * dst extent), and update bmap for allocation of + * the dst extent; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* update the self address in the dtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* the dst page is the same as the src page, i.e., + * linelock for afterimage of the whole page; + */ + lv->offset = 0; + lv->length = p->header.maxslot; + dtlck->index++; + + /* update the buffer extent descriptor of the dtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the relocated page */ + DT_PUTPAGE(mp); + jfs_info("dtRelocate: target dtpage relocated."); + + /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec + * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec + * will also force a bmap update ). + */ + + /* + * 3. acquire maplock for the source extent to be freed; + */ + /* for dtpage relocation, write a LOG_NOREDOPAGE record + * for the source dtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * dtpage), and upadte bmap for free of the source dtpage; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. 
update the parent router entry for relocation; + * + * acquire tlck for the parent entry covering the target dtpage; + * write LOG_REDOPAGE to apply after image only; + */ + jfs_info("dtRelocate: update parent router entry."); + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* update the PXD with the new address */ + stbl = DT_GETSTBL(pp); + pxd = (pxd_t *) & pp->slot[stbl[index]]; + PXDaddress(pxd, nxaddr); + lv->offset = stbl[index]; + lv->length = 1; + dtlck->index++; + + /* unpin the parent dtpage */ + DT_PUTPAGE(pmp); + + return rc; +} + +/* + * NAME: dtSearchNode() + * + * FUNCTION: Search for an dtpage containing a specified address + * This function is mainly used by defragfs utility. + * + * NOTE: Search result on stack, the found page is pinned at exit. + * The result page must be an internal dtpage. + * lmxaddr give the address of the left most page of the + * dtree level, in which the required dtpage resides. + */ +static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, + struct btstack * btstack) +{ + int rc = 0; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int psize = 288; /* initial in-line directory */ + s8 *stbl; + int i; + pxd_t *pxd; + struct btframe *btsp; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend tree to the level with specified leftmost page + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* does the xaddr of leftmost page of the levevl + * matches levevl search key ? + */ + if (p->header.flag & BT_ROOT) { + if (lmxaddr == 0) + break; + } else if (addressPXD(&p->header.self) == lmxaddr) + break; + + /* + * descend down to leftmost child page + */ + if (p->header.flag & BT_LEAF) { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + pxd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + /* + * search each page at the current levevl + */ + loop: + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + pxd = (pxd_t *) & p->slot[stbl[i]]; + + /* found the specified router entry */ + if (addressPXD(pxd) == addressPXD(kpxd) && + lengthPXD(pxd) == lengthPXD(kpxd)) { + btsp = btstack->top; + btsp->bn = bn; + btsp->index = i; + btsp->mp = mp; + + return 0; + } + } + + /* get the right sibling page if any */ + if (p->header.next) + bn = le64_to_cpu(p->header.next); + else { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* unpin current page */ + DT_PUTPAGE(mp); + + /* get the right sibling page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + goto loop; +} +#endif /* _NOTYET */ + +/* + * dtRelink() + * + * function: + * link around a freed page. 
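dtRelink(), defined next, stitches a freed page's neighbours together so the sibling chain on its level stays intact. Reduced to in-memory pointers instead of block numbers and transaction locks, the operation is the familiar doubly linked list unlink:

#include <stddef.h>

struct sib_page {
	struct sib_page *next;	/* right sibling on the same level */
	struct sib_page *prev;	/* left sibling on the same level */
};

/*
 * Unhook a page from its level's sibling chain so readers walking the
 * level never see the freed page, as dtRelink() does with the on-disk
 * next/prev block numbers.
 */
static void relink_around(struct sib_page *p)
{
	if (p->next)
		p->next->prev = p->prev;
	if (p->prev)
		p->prev->next = p->next;
	p->next = p->prev = NULL;
}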
+ * + * parameter: + * fp: page to be freed + * + * return: + */ +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) +{ + int rc; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + * + * action: update prev pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(prevbn); + DT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the prev page + * + * action: update next pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.next = cpu_to_le64(nextbn); + DT_PUTPAGE(mp); + } + + return 0; +} + + +/* + * dtInitRoot() + * + * initialize directory root (inline in inode) + */ +void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + dtroot_t *p; + int fsi; + struct dtslot *f; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + u16 xflag_save; + + /* + * If this was previously an non-empty directory, we need to remove + * the old directory table. + */ + if (DO_INDEX(ip)) { + if (!jfs_dirtable_inline(ip)) { + struct tblock *tblk = tid_to_tblock(tid); + /* + * We're playing games with the tid's xflag. If + * we're removing a regular file, the file's xtree + * is committed with COMMIT_PMAP, but we always + * commit the directories xtree with COMMIT_PWMAP. + */ + xflag_save = tblk->xflag; + tblk->xflag = 0; + /* + * xtTruncate isn't guaranteed to fully truncate + * the xtree. The caller needs to check i_size + * after committing the transaction to see if + * additional truncation is needed. The + * COMMIT_Stale flag tells caller that we + * initiated the truncation. 
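dtInitRoot() below, like dtSplitPage(), dtSplitRoot() and dtExtendPage() earlier, rebuilds the per-page slot freelist as a simple intrusive chain: every free slot points at the next and the last holds -1. A toy version carrying only the link field, with an invented page size:

#include <stdint.h>

#define MAXSLOT 16		/* toy page size in slots */

struct toy_slot { int8_t next; };	/* only the freelist link matters here */

/*
 * Chain every slot from 'first' to the end of the page to its
 * successor and terminate the chain with -1, so allocation is just
 * "pop the head and decrement freecnt".
 */
static int8_t init_freelist(struct toy_slot *slot, int first, int maxslot)
{
	for (int fsi = first; fsi < maxslot - 1; fsi++)
		slot[fsi].next = (int8_t)(fsi + 1);
	slot[maxslot - 1].next = -1;
	return (int8_t)first;		/* new freelist head */
}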
+ */ + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + set_cflag(COMMIT_Stale, ip); + + tblk->xflag = xflag_save; + } else + ip->i_size = 1; + + jfs_ip->next_index = 2; + } else + ip->i_size = IDATASIZE; + + /* + * acquire a transaction lock on the root + * + * action: directory initialization; + */ + tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, + tlckDTREE | tlckENTRY | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + p = &jfs_ip->i_dtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + + p->header.nextindex = 0; + + /* init freelist */ + fsi = 1; + f = &p->slot[fsi]; + + /* init data area of root */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + p->header.freelist = 1; + p->header.freecnt = 8; + + /* init '..' entry */ + p->header.idotdot = cpu_to_le32(idotdot); + + return; +} + +/* + * add_missing_indices() + * + * function: Fix dtree page in which one or more entries has an invalid index. + * fsck.jfs should really fix this, but it currently does not. + * Called from jfs_readdir when bad index is detected. + */ +static void add_missing_indices(struct inode *inode, s64 bn) +{ + struct ldtentry *d; + struct dt_lock *dtlck; + int i; + uint index; + struct lv *lv; + struct metapage *mp; + dtpage_t *p; + int rc; + s8 *stbl; + tid_t tid; + struct tlock *tlck; + + tid = txBegin(inode->i_sb, 0); + + DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); + + if (rc) { + printk(KERN_ERR "DT_GETPAGE failed!\n"); + goto end; + } + BT_MARK_DIRTY(mp, inode); + + ASSERT(p->header.flag & BT_LEAF); + + tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); + if (BT_IS_ROOT(mp)) + tlck->type |= tlckBTROOT; + + dtlck = (struct dt_lock *) &tlck->lock; + + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + d = (struct ldtentry *) &p->slot[stbl[i]]; + index = le32_to_cpu(d->index); + if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { + d->index = cpu_to_le32(add_index(tid, inode, bn, i)); + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = &dtlck->lv[dtlck->index]; + lv->offset = stbl[i]; + lv->length = 1; + dtlck->index++; + } + } + + DT_PUTPAGE(mp); + (void) txCommit(tid, 1, &inode, 0); +end: + txEnd(tid); +} + +/* + * Buffer to hold directory entry info while traversing a dtree page + * before being fed to the filldir function + */ +struct jfs_dirent { + loff_t position; + int ino; + u16 name_len; + char name[0]; +}; + +/* + * function to determine next variable-sized jfs_dirent in buffer + */ +static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) +{ + return (struct jfs_dirent *) + ((char *)dirent + + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + + sizeof (loff_t) - 1) & + ~(sizeof (loff_t) - 1))); +} + +/* + * jfs_readdir() + * + * function: read directory entries sequentially + * from the specified entry offset + * + * parameter: + * + * return: offset = (pn, index) of start entry + * of next jfs_readdir()/dtRead() + */ +int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *ip = filp->f_path.dentry->d_inode; + struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; + int rc = 0; + loff_t dtpos; /* legacy OS/2 style position */ + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) &dtpos; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; 
+ s8 *stbl; + struct btstack btstack; + int i, next; + struct ldtentry *d; + struct dtslot *t; + int d_namleft, len, outlen; + unsigned long dirent_buf; + char *name_ptr; + u32 dir_index; + int do_index = 0; + uint loop_count = 0; + struct jfs_dirent *jfs_dirent; + int jfs_dirents; + int overflow, fix_page, page_fixed = 0; + static int unique_pos = 2; /* If we can't fix broken index */ + + if (filp->f_pos == DIREND) + return 0; + + if (DO_INDEX(ip)) { + /* + * persistent index is stored in directory entries. + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory + */ + do_index = 1; + + dir_index = (u32) filp->f_pos; + + if (dir_index > 1) { + struct dir_table_slot dirtab_slot; + + if (dtEmpty(ip) || + (dir_index >= JFS_IP(ip)->next_index)) { + /* Stale position. Directory has shrunk */ + filp->f_pos = DIREND; + return 0; + } + repeat: + rc = read_index(ip, dir_index, &dirtab_slot); + if (rc) { + filp->f_pos = DIREND; + return rc; + } + if (dirtab_slot.flag == DIR_INDEX_FREE) { + if (loop_count++ > JFS_IP(ip)->next_index) { + jfs_err("jfs_readdir detected " + "infinite loop!"); + filp->f_pos = DIREND; + return 0; + } + dir_index = le32_to_cpu(dirtab_slot.addr2); + if (dir_index == -1) { + filp->f_pos = DIREND; + return 0; + } + goto repeat; + } + bn = addressDTS(&dirtab_slot); + index = dirtab_slot.slot; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + filp->f_pos = DIREND; + return 0; + } + if (p->header.flag & BT_INTERNAL) { + jfs_err("jfs_readdir: bad index table"); + DT_PUTPAGE(mp); + filp->f_pos = -1; + return 0; + } + } else { + if (dir_index == 0) { + /* + * self "." + */ + filp->f_pos = 0; + if (filldir(dirent, ".", 1, 0, ip->i_ino, + DT_DIR)) + return 0; + } + /* + * parent ".." + */ + filp->f_pos = 1; + if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) + return 0; + + /* + * Find first entry of left-most leaf + */ + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + + if ((rc = dtReadFirst(ip, &btstack))) + return rc; + + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + } + } else { + /* + * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 + * + * pn = index = 0: First entry "." + * pn = 0; index = 1: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries + */ + dtpos = filp->f_pos; + if (dtpos == 0) { + /* build "." entry */ + + if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, + DT_DIR)) + return 0; + dtoffset->index = 1; + filp->f_pos = dtpos; + } + + if (dtoffset->pn == 0) { + if (dtoffset->index == 1) { + /* build ".." entry */ + + if (filldir(dirent, "..", 2, filp->f_pos, + PARENT(ip), DT_DIR)) + return 0; + } else { + jfs_err("jfs_readdir called with " + "invalid offset!"); + } + dtoffset->pn = 1; + dtoffset->index = 0; + filp->f_pos = dtpos; + } + + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + + if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { + jfs_err("jfs_readdir: unexpected rc = %d " + "from dtReadNext", rc); + filp->f_pos = DIREND; + return 0; + } + /* get start leaf page and index */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* offset beyond directory eof ? 
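+		 * (dtReadNext() returns bn = -1 when the saved (pn:index) +		 * position lies past the last directory entry)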
*/ + if (bn < 0) { + filp->f_pos = DIREND; + return 0; + } + } + + dirent_buf = __get_free_page(GFP_KERNEL); + if (dirent_buf == 0) { + DT_PUTPAGE(mp); + jfs_warn("jfs_readdir: __get_free_page failed!"); + filp->f_pos = DIREND; + return -ENOMEM; + } + + while (1) { + jfs_dirent = (struct jfs_dirent *) dirent_buf; + jfs_dirents = 0; + overflow = fix_page = 0; + + stbl = DT_GETSTBL(p); + + for (i = index; i < p->header.nextindex; i++) { + d = (struct ldtentry *) & p->slot[stbl[i]]; + + if (((long) jfs_dirent + d->namlen + 1) > + (dirent_buf + PAGE_SIZE)) { + /* DBCS codepages could overrun dirent_buf */ + index = i; + overflow = 1; + break; + } + + d_namleft = d->namlen; + name_ptr = jfs_dirent->name; + jfs_dirent->ino = le32_to_cpu(d->inumber); + + if (do_index) { + len = min(d_namleft, DTLHDRDATALEN); + jfs_dirent->position = le32_to_cpu(d->index); + /* + * d->index should always be valid, but it + * isn't. fsck.jfs doesn't create the + * directory index for the lost+found + * directory. Rather than let it go, + * we can try to fix it. + */ + if ((jfs_dirent->position < 2) || + (jfs_dirent->position >= + JFS_IP(ip)->next_index)) { + if (!page_fixed && !isReadOnly(ip)) { + fix_page = 1; + /* + * setting overflow and setting + * index to i will cause the + * same page to be processed + * again starting here + */ + overflow = 1; + index = i; + break; + } + jfs_dirent->position = unique_pos++; + } + } else { + jfs_dirent->position = dtpos; + len = min(d_namleft, DTLHDRDATALEN_LEGACY); + } + + /* copy the name of head/only segment */ + outlen = jfs_strfromUCS_le(name_ptr, d->name, len, + codepage); + jfs_dirent->name_len = outlen; + + /* copy name in the additional segment(s) */ + next = d->next; + while (next >= 0) { + t = (struct dtslot *) & p->slot[next]; + name_ptr += outlen; + d_namleft -= len; + /* Sanity Check */ + if (d_namleft == 0) { + jfs_error(ip->i_sb, + "JFS:Dtree error: ino = " + "%ld, bn=%Ld, index = %d", + (long)ip->i_ino, + (long long)bn, + i); + goto skip_one; + } + len = min(d_namleft, DTSLOTDATALEN); + outlen = jfs_strfromUCS_le(name_ptr, t->name, + len, codepage); + jfs_dirent->name_len += outlen; + + next = t->next; + } + + jfs_dirents++; + jfs_dirent = next_jfs_dirent(jfs_dirent); +skip_one: + if (!do_index) + dtoffset->index++; + } + + if (!overflow) { + /* Point to next leaf page */ + if (p->header.flag & BT_ROOT) + bn = 0; + else { + bn = le64_to_cpu(p->header.next); + index = 0; + /* update offset (pn:index) for new page */ + if (!do_index) { + dtoffset->pn++; + dtoffset->index = 0; + } + } + page_fixed = 0; + } + + /* unpin previous leaf page */ + DT_PUTPAGE(mp); + + jfs_dirent = (struct jfs_dirent *) dirent_buf; + while (jfs_dirents--) { + filp->f_pos = jfs_dirent->position; + if (filldir(dirent, jfs_dirent->name, + jfs_dirent->name_len, filp->f_pos, + jfs_dirent->ino, DT_UNKNOWN)) + goto out; + jfs_dirent = next_jfs_dirent(jfs_dirent); + } + + if (fix_page) { + add_missing_indices(ip, bn); + page_fixed = 1; + } + + if (!overflow && (bn == 0)) { + filp->f_pos = DIREND; + break; + } + + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + free_page(dirent_buf); + return rc; + } + } + + out: + free_page(dirent_buf); + + return rc; +} + + +/* + * dtReadFirst() + * + * function: get the leftmost page of the directory + */ +static int dtReadFirst(struct inode *ip, struct btstack * btstack) +{ + int rc = 0; + s64 bn; + int psize = 288; /* initial in-line directory */ + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + struct btframe *btsp; + pxd_t *xd; + + 
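+	/* + 	 * psize starts at the 288-byte in-line root and is recomputed + 	 * from each child's pxd length on the way down the leftmost path. + 	 */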
BT_CLR(btstack); /* reset stack */ + + /* + * descend leftmost path of the tree + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* + * leftmost leaf page + */ + if (p->header.flag & BT_LEAF) { + /* return leftmost entry */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = 0; + btsp->mp = mp; + + return 0; + } + + /* + * descend down to leftmost child page + */ + if (BT_STACK_FULL(btstack)) { + DT_PUTPAGE(mp); + jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); + BT_STACK_DUMP(btstack); + return -EIO; + } + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, 0); + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(xd); + psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } +} + + +/* + * dtReadNext() + * + * function: get the page of the specified offset (pn:index) + * + * return: if (offset > eof), bn = -1; + * + * note: if index > nextindex of the target leaf page, + * start with 1st entry of next leaf page; + */ +static int dtReadNext(struct inode *ip, loff_t * offset, + struct btstack * btstack) +{ + int rc = 0; + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) offset; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + int pn; + s8 *stbl; + struct btframe *btsp, *parent; + pxd_t *xd; + + /* + * get leftmost leaf page pinned + */ + if ((rc = dtReadFirst(ip, btstack))) + return rc; + + /* get leaf page */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* get the start offset (pn:index) */ + pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ + index = dtoffset->index; + + /* start at leftmost page ? */ + if (pn == 0) { + /* offset beyond eof ? */ + if (index < p->header.nextindex) + goto out; + + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = index = 0; + goto a; + } + + /* start at non-leftmost page: scan parent pages for large pn */ + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start after next leaf page ? */ + if (pn > 1) + goto b; + + /* get leaf page pn = 1 */ + a: + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + goto c; + + /* + * scan last internal page level to get target leaf page + */ + b: + /* unpin leftmost leaf page */ + DT_PUTPAGE(mp); + + /* get left most parent page */ + btsp = btstack->top; + parent = btsp - 1; + bn = parent->bn; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* scan parent pages at last internal page level */ + while (pn >= p->header.nextindex) { + pn -= p->header.nextindex; + + /* get next parent page address */ + bn = le64_to_cpu(p->header.next); + + /* unpin current parent page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? 
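+		 * (a next field of 0 means there is no right sibling page + 		 * at this level)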
*/ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next parent page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* update parent page stack frame */ + parent->bn = bn; + } + + /* get leaf page address */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[pn]]; + bn = addressPXD(xd); + + /* unpin parent page */ + DT_PUTPAGE(mp); + + /* + * get target leaf page + */ + c: + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * leaf page has been completed: + * start with 1st entry of next leaf page + */ + if (index >= p->header.nextindex) { + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next leaf page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = 0; + } + + out: + /* return target leaf page pinned */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = dtoffset->index; + btsp->mp = mp; + + return 0; +} + + +/* + * dtCompare() + * + * function: compare search key with an internal entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int dtCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si) +{ /* entry slot index */ + wchar_t *kname; + __le16 *name; + int klen, namlen, len, rc; + struct idtentry *ih; + struct dtslot *t; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + + /* compare with head/only segment */ + len = min(klen, len); + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + kname += len; + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + kname += len; + si = t->next; + } + + return (klen - namlen); +} + + + + +/* + * ciCompare() + * + * function: compare search key with an (leaf/internal) entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int ciCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si, /* entry slot index */ + int flag) +{ + wchar_t *kname, x; + __le16 *name; + int klen, namlen, len, rc; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int i; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. 
+ * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + /* + * leaf page entry + */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + name = lh->name; + namlen = lh->namlen; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } + /* + * internal page entry + */ + else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + } + + /* compare with head/only segment */ + len = min(klen, len); + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + si = t->next; + } + + return (klen - namlen); +} + + +/* + * ciGetLeafPrefixKey() + * + * function: compute prefix of suffix compression + * from two adjacent leaf entries + * across page boundary + * + * return: non-zero on error + * + */ +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag) +{ + int klen, namlen; + wchar_t *pl, *pr, *kname; + struct component_name lkey; + struct component_name rkey; + + lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_KERNEL); + if (lkey.name == NULL) + return -ENOMEM; + + rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_KERNEL); + if (rkey.name == NULL) { + kfree(lkey.name); + return -ENOMEM; + } + + /* get left and right key */ + dtGetKey(lp, li, &lkey, flag); + lkey.name[lkey.namlen] = 0; + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&lkey); + + dtGetKey(rp, ri, &rkey, flag); + rkey.name[rkey.namlen] = 0; + + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&rkey); + + /* compute prefix */ + klen = 0; + kname = key->name; + namlen = min(lkey.namlen, rkey.namlen); + for (pl = lkey.name, pr = rkey.name; + namlen; pl++, pr++, namlen--, klen++, kname++) { + *kname = *pr; + if (*pl != *pr) { + key->namlen = klen + 1; + goto free_names; + } + } + + /* l->namlen <= r->namlen since l <= r */ + if (lkey.namlen < rkey.namlen) { + *kname = *pr; + key->namlen = klen + 1; + } else /* l->namelen == r->namelen */ + key->namlen = klen; + +free_names: + kfree(lkey.name); + kfree(rkey.name); + return 0; +} + + + +/* + * dtGetKey() + * + * function: get key of the entry + */ +static void dtGetKey(dtpage_t * p, int i, /* entry index */ + 
struct component_name * key, int flag) +{ + int si; + s8 *stbl; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int namlen, len; + wchar_t *kname; + __le16 *name; + + /* get entry */ + stbl = DT_GETSTBL(p); + si = stbl[i]; + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + namlen = lh->namlen; + name = lh->name; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + namlen = ih->namlen; + name = ih->name; + len = min(namlen, DTIHDRDATALEN); + } + + key->namlen = namlen; + kname = key->name; + + /* + * move head/only segment + */ + UniStrncpy_from_le(kname, name, len); + + /* + * move additional segment(s) + */ + while (si >= 0) { + /* get next segment */ + t = &p->slot[si]; + kname += len; + namlen -= len; + len = min(namlen, DTSLOTDATALEN); + UniStrncpy_from_le(kname, t->name, len); + + si = t->next; + } +} + + +/* + * dtInsertEntry() + * + * function: allocate free slot(s) and + * write a leaf/internal entry + * + * return: entry slot index + */ +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock ** dtlock) +{ + struct dtslot *h, *t; + struct ldtentry *lh = NULL; + struct idtentry *ih = NULL; + int hsi, fsi, klen, len, nextindex; + wchar_t *kname; + __le16 *name; + s8 *stbl; + pxd_t *xd; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + s64 bn = 0; + struct metapage *mp = NULL; + + klen = key->namlen; + kname = key->name; + + /* allocate a free slot */ + hsi = fsi = p->header.freelist; + h = &p->slot[fsi]; + p->header.freelist = h->next; + --p->header.freecnt; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + + lv = & dtlck->lv[dtlck->index]; + lv->offset = hsi; + + /* write head/only segment */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) h; + lh->next = h->next; + lh->inumber = cpu_to_le32(data->leaf.ino); + lh->namlen = klen; + name = lh->name; + if (data->leaf.ip) { + len = min(klen, DTLHDRDATALEN); + if (!(p->header.flag & BT_ROOT)) + bn = addressPXD(&p->header.self); + lh->index = cpu_to_le32(add_index(data->leaf.tid, + data->leaf.ip, + bn, index)); + } else + len = min(klen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) h; + ih->next = h->next; + xd = (pxd_t *) ih; + *xd = data->xd; + ih->namlen = klen; + name = ih->name; + len = min(klen, DTIHDRDATALEN); + } + + UniStrncpy_to_le(name, kname, len); + + n = 1; + xsi = hsi; + + /* write additional segment(s) */ + t = h; + klen -= len; + while (klen) { + /* get free slot */ + fsi = p->header.freelist; + t = &p->slot[fsi]; + p->header.freelist = t->next; + --p->header.freecnt; + + /* is next slot contiguous ? 
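+		 * contiguous slots extend the current linelock vector; a gap + 		 * closes it and opens a new vector for the next run of slots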
*/ + if (fsi != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = fsi; + n = 0; + } + + kname += len; + len = min(klen, DTSLOTDATALEN); + UniStrncpy_to_le(t->name, kname, len); + + n++; + xsi = fsi; + klen -= len; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* terminate last/only segment */ + if (h == t) { + /* single segment entry */ + if (p->header.flag & BT_LEAF) + lh->next = -1; + else + ih->next = -1; + } else + /* multi-segment entry */ + t->next = -1; + + /* if insert into middle, shift right succeeding entries in stbl */ + stbl = DT_GETSTBL(p); + nextindex = p->header.nextindex; + if (index < nextindex) { + memmove(stbl + index + 1, stbl + index, nextindex - index); + + if ((p->header.flag & BT_LEAF) && data->leaf.ip) { + s64 lblock; + + /* + * Need to update slot number for entries that moved + * in the stbl + */ + mp = NULL; + for (n = index + 1; n <= nextindex; n++) { + lh = (struct ldtentry *) & (p->slot[stbl[n]]); + modify_index(data->leaf.tid, data->leaf.ip, + le32_to_cpu(lh->index), bn, n, + &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + stbl[index] = hsi; + + /* advance next available entry index of stbl */ + ++p->header.nextindex; +} + + +/* + * dtMoveEntry() + * + * function: move entries from split/left page to new/right page + * + * nextindex of dst page and freelist/freecnt of both pages + * are updated. + */ +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index) +{ + int ssi, next; /* src slot index */ + int di; /* dst entry index */ + int dsi; /* dst slot index */ + s8 *sstbl, *dstbl; /* sorted entry table */ + int snamlen, len; + struct ldtentry *slh, *dlh = NULL; + struct idtentry *sih, *dih = NULL; + struct dtslot *h, *s, *d; + struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; + struct lv *slv, *dlv; + int xssi, ns, nd; + int sfsi; + + sstbl = (s8 *) & sp->slot[sp->header.stblindex]; + dstbl = (s8 *) & dp->slot[dp->header.stblindex]; + + dsi = dp->header.freelist; /* first (whole page) free slot */ + sfsi = sp->header.freelist; + + /* linelock destination entry slot */ + dlv = & ddtlck->lv[ddtlck->index]; + dlv->offset = dsi; + + /* linelock source entry slot */ + slv = & sdtlck->lv[sdtlck->index]; + slv->offset = sstbl[si]; + xssi = slv->offset - 1; + + /* + * move entries + */ + ns = nd = 0; + for (di = 0; si < sp->header.nextindex; si++, di++) { + ssi = sstbl[si]; + dstbl[di] = dsi; + + /* is next slot contiguous ? 
*/ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* + * move head/only segment of an entry + */ + /* get dst slot */ + h = d = &dp->slot[dsi]; + + /* get src slot and move */ + s = &sp->slot[ssi]; + if (sp->header.flag & BT_LEAF) { + /* get source entry */ + slh = (struct ldtentry *) s; + dlh = (struct ldtentry *) h; + snamlen = slh->namlen; + + if (do_index) { + len = min(snamlen, DTLHDRDATALEN); + dlh->index = slh->index; /* little-endian */ + } else + len = min(snamlen, DTLHDRDATALEN_LEGACY); + + memcpy(dlh, slh, 6 + len * 2); + + next = slh->next; + + /* update dst head/only segment next field */ + dsi++; + dlh->next = dsi; + } else { + sih = (struct idtentry *) s; + snamlen = sih->namlen; + + len = min(snamlen, DTIHDRDATALEN); + dih = (struct idtentry *) h; + memcpy(dih, sih, 10 + len * 2); + next = sih->next; + + dsi++; + dih->next = dsi; + } + + /* free src head/only segment */ + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + ns++; + nd++; + xssi = ssi; + + /* + * move additional segment(s) of the entry + */ + snamlen -= len; + while ((ssi = next) >= 0) { + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = + (struct dt_lock *) + txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* get next source segment */ + s = &sp->slot[ssi]; + + /* get next destination free slot */ + d++; + + len = min(snamlen, DTSLOTDATALEN); + UniStrncpy_le(d->name, s->name, len); + + ns++; + nd++; + xssi = ssi; + + dsi++; + d->next = dsi; + + /* free source segment */ + next = s->next; + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + snamlen -= len; + } /* end while */ + + /* terminate dst last/only segment */ + if (h == d) { + /* single segment entry */ + if (dp->header.flag & BT_LEAF) + dlh->next = -1; + else + dih->next = -1; + } else + /* multi-segment entry */ + d->next = -1; + } /* end for */ + + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + *sdtlock = sdtlck; + + dlv->length = nd; + ddtlck->index++; + *ddtlock = ddtlck; + + /* update source header */ + sp->header.freelist = sfsi; + sp->header.freecnt += nd; + + /* update destination header */ + dp->header.nextindex = di; + + dp->header.freelist = dsi; + dp->header.freecnt -= nd; +} + + +/* + * dtDeleteEntry() + * + * function: free a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + fsi = stbl[fi]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + /* get the head/only segment */ + t = &p->slot[fsi]; + if (p->header.flag & 
BT_LEAF) + si = ((struct ldtentry *) t)->next; + else + si = ((struct idtentry *) t)->next; + t->next = si; + t->cnt = 1; + + n = freecnt = 1; + xsi = fsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; + + /* if delete from middle, + * shift left the succedding entries in the stbl + */ + si = p->header.nextindex; + if (fi < si - 1) + memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); + + p->header.nextindex--; +} + + +/* + * dtTruncateEntry() + * + * function: truncate a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) +{ + int tsi; /* truncate entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int fsi, xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + tsi = stbl[ti]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = tsi; + + /* get the head/only segment */ + t = &p->slot[tsi]; + ASSERT(p->header.flag & BT_INTERNAL); + ((struct idtentry *) t)->namlen = 0; + si = ((struct idtentry *) t)->next; + ((struct idtentry *) t)->next = -1; + + n = 1; + freecnt = 0; + fsi = si; + xsi = tsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + if (freecnt == 0) + return; + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; +} + + +/* + * dtLinelockFreelist() + */ +static void dtLinelockFreelist(dtpage_t * p, /* directory page */ + int m, /* max slot index */ + struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + struct dtslot *t; + int si; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + fsi = p->header.freelist; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + n = 1; + xsi = fsi; + + t = &p->slot[fsi]; + si = t->next; + + /* find the last/only segment */ + while (si < m && si >= 0) { + /* is next slot contiguous ? 
*/ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + + t = &p->slot[si]; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; +} + + +/* + * NAME: dtModify + * + * FUNCTION: Modify the inode number part of a directory entry + * + * PARAMETERS: + * tid - Transaction id + * ip - Inode of parent directory + * key - Name of entry to be modified + * orig_ino - Original inode number expected in entry + * new_ino - New inode number to put into entry + * flag - JFS_RENAME + * + * RETURNS: + * -ESTALE - If entry found does not match orig_ino passed in + * -ENOENT - If no entry can be found to match key + * 0 - If successfully modified entry + */ +int dtModify(tid_t tid, struct inode *ip, + struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) +{ + int rc; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + struct btstack btstack; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + s8 *stbl; + int entry_si; /* entry slot index */ + struct ldtentry *entry; + + /* + * search for the entry to modify: + * + * dtSearch() returns (leaf page pinned, index at which to modify). + */ + if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page of named entry + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* get slot index of the entry */ + stbl = DT_GETSTBL(p); + entry_si = stbl[index]; + + /* linelock entry */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = entry_si; + lv->length = 1; + dtlck->index++; + + /* get the head/only segment */ + entry = (struct ldtentry *) & p->slot[entry_si]; + + /* substitute the inode number of the entry */ + entry->inumber = cpu_to_le32(new_ino); + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h new file mode 100644 index 00000000..2545bb31 --- /dev/null +++ b/fs/jfs/jfs_dtree.h @@ -0,0 +1,269 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DTREE +#define _H_JFS_DTREE + +/* + * jfs_dtree.h: directory B+-tree manager + */ + +#include "jfs_btree.h" + +typedef union { + struct { + tid_t tid; + struct inode *ip; + u32 ino; + } leaf; + pxd_t xd; +} ddata_t; + + +/* + * entry segment/slot + * + * an entry consists of type dependent head/only segment/slot and + * additional segments/slots linked vi next field; + * N.B. last/only segment of entry is terminated by next = -1; + */ +/* + * directory page slot + */ +struct dtslot { + s8 next; /* 1: */ + s8 cnt; /* 1: */ + __le16 name[15]; /* 30: */ +}; /* (32) */ + + +#define DATASLOTSIZE 16 +#define L2DATASLOTSIZE 4 +#define DTSLOTSIZE 32 +#define L2DTSLOTSIZE 5 +#define DTSLOTHDRSIZE 2 +#define DTSLOTDATASIZE 30 +#define DTSLOTDATALEN 15 + +/* + * internal node entry head/only segment + */ +struct idtentry { + pxd_t xd; /* 8: child extent descriptor */ + + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ +}; /* (32) */ + +#define DTIHDRSIZE 10 +#define DTIHDRDATALEN 11 + +/* compute number of slots for entry */ +#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15)) + + +/* + * leaf node entry head/only segment + * + * For legacy filesystems, name contains 13 wchars -- no index field + */ +struct ldtentry { + __le32 inumber; /* 4: 4-byte aligned */ + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ + __le32 index; /* 4: index into dir_table */ +}; /* (32) */ + +#define DTLHDRSIZE 6 +#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */ +#define DTLHDRDATALEN 11 + +/* + * dir_table used for directory traversal during readdir + */ + +/* + * Keep persistent index for directory entries + */ +#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX) + +/* + * Maximum entry in inline directory table + */ +#define MAX_INLINE_DIRTABLE_ENTRY 13 + +struct dir_table_slot { + u8 rsrvd; /* 1: */ + u8 flag; /* 1: 0 if free */ + u8 slot; /* 1: slot within leaf page of entry */ + u8 addr1; /* 1: upper 8 bits of leaf page address */ + __le32 addr2; /* 4: lower 32 bits of leaf page address -OR- + index of next entry when this entry was deleted */ +}; /* (8) */ + +/* + * flag values + */ +#define DIR_INDEX_VALID 1 +#define DIR_INDEX_FREE 0 + +#define DTSaddress(dir_table_slot, address64)\ +{\ + (dir_table_slot)->addr1 = ((u64)address64) >> 32;\ + (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +#define addressDTS(dts)\ + ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) + +/* compute number of slots for entry */ +#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15)) +#define NDTLEAF NDTINTERNAL + + +/* + * directory root page (in-line in on-disk inode): + * + * cf. dtpage_t below. 
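+ * + * the root provides DTROOTMAXSLOT (9) 32-byte slots; slot[0] is + * overlaid by the header, leaving 8 slots for entries and their + * name segments.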
+ */ +typedef union { + struct { + struct dasd DASD; /* 16: DASD limit/usage info */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next free entry in stbl */ + s8 freecnt; /* 1: free count */ + s8 freelist; /* 1: freelist header */ + + __le32 idotdot; /* 4: parent inode number */ + + s8 stbl[8]; /* 8: sorted entry index table */ + } header; /* (32) */ + + struct dtslot slot[9]; +} dtroot_t; + +#define PARENT(IP) \ + (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot)) + +#define DTROOTMAXSLOT 9 + +#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0) + + +/* + * directory regular page: + * + * entry slot array of 32 byte slot + * + * sorted entry slot index table (stbl): + * contiguous slots at slot specified by stblindex, + * 1-byte per entry + * 512 byte block: 16 entry tbl (1 slot) + * 1024 byte block: 32 entry tbl (1 slot) + * 2048 byte block: 64 entry tbl (2 slot) + * 4096 byte block: 128 entry tbl (4 slot) + * + * data area: + * 512 byte block: 16 - 2 = 14 slot + * 1024 byte block: 32 - 2 = 30 slot + * 2048 byte block: 64 - 3 = 61 slot + * 4096 byte block: 128 - 5 = 123 slot + * + * N.B. index is 0-based; index fields refer to slot index + * except nextindex which refers to entry index in stbl; + * end of entry stot list or freelist is marked with -1. + */ +typedef union { + struct { + __le64 next; /* 8: next sibling */ + __le64 prev; /* 8: previous sibling */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next entry index in stbl */ + s8 freecnt; /* 1: */ + s8 freelist; /* 1: slot index of head of freelist */ + + u8 maxslot; /* 1: number of slots in page slot[] */ + u8 stblindex; /* 1: slot index of start of stbl */ + u8 rsrvd[2]; /* 2: */ + + pxd_t self; /* 8: self pxd */ + } header; /* (32) */ + + struct dtslot slot[128]; +} dtpage_t; + +#define DTPAGEMAXSLOT 128 + +#define DT8THPGNODEBYTES 512 +#define DT8THPGNODETSLOTS 1 +#define DT8THPGNODESLOTS 16 + +#define DTQTRPGNODEBYTES 1024 +#define DTQTRPGNODETSLOTS 1 +#define DTQTRPGNODESLOTS 32 + +#define DTHALFPGNODEBYTES 2048 +#define DTHALFPGNODETSLOTS 2 +#define DTHALFPGNODESLOTS 64 + +#define DTFULLPGNODEBYTES 4096 +#define DTFULLPGNODETSLOTS 4 +#define DTFULLPGNODESLOTS 128 + +#define DTENTRYSTART 1 + +/* get sorted entry table of the page */ +#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\ + ((dtroot_t *)(p))->header.stbl : \ + (s8 *)&(p)->slot[(p)->header.stblindex] ) + +/* + * Flags for dtSearch + */ +#define JFS_CREATE 1 +#define JFS_LOOKUP 2 +#define JFS_REMOVE 3 +#define JFS_RENAME 4 + +/* + * Maximum file offset for directories. 
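+ * jfs_readdir() stores DIREND in filp->f_pos to mark end of directory.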
+ */ +#define DIREND INT_MAX + +/* + * external declarations + */ +extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot); + +extern int dtSearch(struct inode *ip, struct component_name * key, + ino_t * data, struct btstack * btstack, int flag); + +extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * ino, struct btstack * btstack); + +extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * data, int flag); + +extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * orig_ino, ino_t new_ino, int flag); + +extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); +#endif /* !_H_JFS_DTREE */ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c new file mode 100644 index 00000000..e5fe8506 --- /dev/null +++ b/fs/jfs/jfs_extent.c @@ -0,0 +1,651 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_extent.h" +#include "jfs_debug.h" + +/* + * forward references + */ +static int extBalloc(struct inode *, s64, s64 *, s64 *); +#ifdef _NOTYET +static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); +#endif +static s64 extRoundDown(s64 nb); + +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) +#define DPL1(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x ",(a)); \ + else \ + printk("(a): %x ",(a) << 32); \ +} +#define DPL(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x\n",(a)); \ + else \ + printk("(a): %x\n",(a) << 32); \ +} + +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) + + +/* + * NAME: extAlloc() + * + * FUNCTION: allocate an extent for a specified page range within a + * file. + * + * PARAMETERS: + * ip - the inode of the file. + * xlen - requested extent length. + * pno - the starting page number with the file. + * xp - pointer to an xad. on entry, xad describes an + * extent that is used as an allocation hint if the + * xaddr of the xad is non-zero. on successful exit, + * the xad describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. 
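+ * + * serialization: allocation runs under the per-inode commit_mutex to + *	avoid racing with jfs_commit_inode().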
+ */ +int +extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nxlen, nxaddr, xoff, hint, xaddr = 0; + int rc; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + /* Avoid race with jfs_commit_inode() */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* validate extent length */ + if (xlen > MAXXLEN) + xlen = MAXXLEN; + + /* get the page's starting extent offset */ + xoff = pno << sbi->l2nbperpage; + + /* check if an allocation hint was provided */ + if ((hint = addressXAD(xp))) { + /* get the size of the extent described by the hint */ + nxlen = lengthXAD(xp); + + /* check if the hint is for the portion of the file + * immediately previous to the current allocation + * request and if hint extent has the same abnr + * value as the current request. if so, we can + * extend the hint extent to include the current + * extent if we can allocate the blocks immediately + * following the hint extent. + */ + if (offsetXAD(xp) + nxlen == xoff && + abnr == ((xp->flag & XAD_NOTRECORDED) ? true : false)) + xaddr = hint + nxlen; + + /* adjust the hint to the last block of the extent */ + hint += (nxlen - 1); + } + + /* allocate the disk blocks for the extent. initially, extBalloc() + * will try to allocate disk blocks for the requested size (xlen). + * if this fails (xlen contiguous free blocks not available), it'll + * try to allocate a smaller number of blocks (producing a smaller + * extent), with this smaller number of blocks consisting of the + * requested number of blocks rounded down to the next smaller + * power of 2 number (i.e. 16 -> 8). it'll continue to round down + * and retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + */ + nxlen = xlen; + if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) { + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + /* determine the value of the extent flag */ + xflag = abnr ? XAD_NOTRECORDED : 0; + + /* if we can extend the hint extent to cover the current request, + * extend it. otherwise, insert a new extent to + * cover the current request. + */ + if (xaddr && xaddr == nxaddr) + rc = xtExtend(0, ip, xoff, (int) nxlen, 0); + else + rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); + + /* if the extend or insert failed, + * free the newly allocated blocks and return the error. + */ + if (rc) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* set the results of the extent allocation */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + /* + * COMMIT_SyncList flags an anonymous tlock on page that is on + * sync list. + * We need to commit the inode to get the page written disk. + */ + if (test_and_clear_cflag(COMMIT_Synclist,ip)) + jfs_commit_inode(ip, 0); + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extRealloc() + * + * FUNCTION: extend the allocation of a file extent containing a + * partial back last page. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf for the partial backed last page. + * xlen - request size of the resulting extent. 
+ * xp - pointer to an xad. on successful exit, the xad + * describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) +{ + struct super_block *sb = ip->i_sb; + s64 xaddr, xlen, nxaddr, delta, xoff; + s64 ntail, nextend, ninsert; + int rc, nbperpage = JFS_SBI(sb)->nbperpage; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + /* validate extent length */ + if (nxlen > MAXXLEN) + nxlen = MAXXLEN; + + /* get the extend (partial) page's disk block address and + * number of blocks. + */ + xaddr = addressXAD(xp); + xlen = lengthXAD(xp); + xoff = offsetXAD(xp); + + /* if the extend page is abnr and if the request is for + * the extent to be allocated and recorded, + * make the page allocated and recorded. + */ + if ((xp->flag & XAD_NOTRECORDED) && !abnr) { + xp->flag = 0; + if ((rc = xtUpdate(0, ip, xp))) + goto exit; + } + + /* try to allocated the request number of blocks for the + * extent. dbRealloc() first tries to satisfy the request + * by extending the allocation in place. otherwise, it will + * try to allocate a new set of blocks large enough for the + * request. in satisfying a request, dbReAlloc() may allocate + * less than what was request but will always allocate enough + * space as to satisfy the extend page. + */ + if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr))) + goto exit; + + /* Allocat blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + delta = nxlen - xlen; + + /* check if the extend page is not abnr but the request is abnr + * and the allocated disk space is for more than one page. if this + * is the case, there is a miss match of abnr between the extend page + * and the one or more pages following the extend page. as a result, + * two extents will have to be manipulated. the first will be that + * of the extent of the extend page and will be manipulated thru + * an xtExtend() or an xtTailgate(), depending upon whether the + * disk allocation occurred as an inplace extension. the second + * extent will be manipulated (created) through an xtInsert() and + * will be for the pages following the extend page. + */ + if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) { + ntail = nbperpage; + nextend = ntail - xlen; + ninsert = nxlen - nbperpage; + + xflag = XAD_NOTRECORDED; + } else { + ntail = nxlen; + nextend = delta; + ninsert = 0; + + xflag = xp->flag; + } + + /* if we were able to extend the disk allocation in place, + * extend the extent. otherwise, move the extent to a + * new disk location. + */ + if (xaddr == nxaddr) { + /* extend the extent */ + if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { + dbFree(ip, xaddr + xlen, delta); + dquot_free_block(ip, nxlen); + goto exit; + } + } else { + /* + * move the extent to a new location: + * + * xtTailgate() accounts for relocated tail extent; + */ + if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + goto exit; + } + } + + + /* check if we need to also insert a new extent */ + if (ninsert) { + /* perform the insert. 
if it fails, free the blocks + * to be inserted and make it appear that we only did + * the xtExtend() or xtTailgate() above. + */ + xaddr = nxaddr + ntail; + if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, + &xaddr, 0)) { + dbFree(ip, xaddr, (s64) ninsert); + delta = nextend; + nxlen = ntail; + xflag = 0; + } + } + + /* set the return results */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); +exit: + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); +} +#endif /* _NOTYET */ + + +/* + * NAME: extHint() + * + * FUNCTION: produce an extent allocation hint for a file offset. + * + * PARAMETERS: + * ip - the inode of the file. + * offset - file offset for which the hint is needed. + * xp - pointer to the xad that is to be filled in with + * the hint. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int extHint(struct inode *ip, s64 offset, xad_t * xp) +{ + struct super_block *sb = ip->i_sb; + int nbperpage = JFS_SBI(sb)->nbperpage; + s64 prev; + int rc = 0; + s64 xaddr; + int xlen; + int xflag; + + /* init the hint as "no hint provided" */ + XADaddress(xp, 0); + + /* determine the starting extent offset of the page previous + * to the page containing the offset. + */ + prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; + + /* if the offset is in the first page of the file, no hint provided. + */ + if (prev < 0) + goto out; + + rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0); + + if ((rc == 0) && xlen) { + if (xlen != nbperpage) { + jfs_error(ip->i_sb, "extHint: corrupt xtree"); + rc = -EIO; + } + XADaddress(xp, xaddr); + XADlength(xp, xlen); + XADoffset(xp, prev); + /* + * only preserve the abnr flag within the xad flags + * of the returned hint. + */ + xp->flag = xflag & XAD_NOTRECORDED; + } else + rc = 0; + +out: + return (rc); +} + + +/* + * NAME: extRecord() + * + * FUNCTION: change a page with a file from not recorded to recorded. + * + * PARAMETERS: + * ip - inode of the file. + * cp - cbuf of the file page. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRecord(struct inode *ip, xad_t * xp) +{ + int rc; + + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* update the extent */ + rc = xtUpdate(0, ip, xp); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; +} + + +#ifdef _NOTYET +/* + * NAME: extFill() + * + * FUNCTION: allocate disk space for a file page that represents + * a file hole. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf of the file page represent the hole. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extFill(struct inode *ip, xad_t * xp) +{ + int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; + s64 blkno = offsetXAD(xp) >> ip->i_blkbits; + +// assert(ISSPARSE(ip)); + + /* initialize the extent allocation hint */ + XADaddress(xp, 0); + + /* allocate an extent to fill the hole */ + if ((rc = extAlloc(ip, nbperpage, blkno, xp, false))) + return (rc); + + assert(lengthPXD(xp) == nbperpage); + + return (0); +} +#endif /* _NOTYET */ + + +/* + * NAME: extBalloc() + * + * FUNCTION: allocate disk blocks to form an extent. + * + * initially, we will try to allocate disk blocks for the + * requested size (nblocks). 
if this fails (nblocks + * contiguous free blocks not available), we'll try to allocate + * a smaller number of blocks (producing a smaller extent), with + * this smaller number of blocks consisting of the requested + * number of blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). we'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * hint - disk block number to be used as an allocation hint. + * *nblocks - pointer to an s64 value. on entry, this value specifies + * the desired number of block to be allocated. on successful + * exit, this value is set to the number of blocks actually + * allocated. + * blkno - pointer to a block address that is filled in on successful + * return with the starting block number of the newly + * allocated block range. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nb, nblks, daddr, max; + int rc, nbperpage = sbi->nbperpage; + struct bmap *bmp = sbi->bmap; + int ag; + + /* get the number of blocks to initially attempt to allocate. + * we'll first try the number of blocks requested unless this + * number is greater than the maximum number of contiguous free + * blocks in the map. in that case, we'll start off with the + * maximum free. + */ + max = (s64) 1 << bmp->db_maxfreebud; + if (*nblocks >= max && *nblocks > nbperpage) + nb = nblks = (max > nbperpage) ? max : nbperpage; + else + nb = nblks = *nblocks; + + /* try to allocate blocks */ + while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) { + /* if something other than an out of space error, + * stop and return this error. + */ + if (rc != -ENOSPC) + return (rc); + + /* decrease the allocation request size */ + nb = min(nblks, extRoundDown(nb)); + + /* give up if we cannot cover a page */ + if (nb < nbperpage) + return (rc); + } + + *nblocks = nb; + *blkno = daddr; + + if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) { + ag = BLKTOAG(daddr, sbi); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } else if (ji->active_ag != ag) { + atomic_dec(&bmp->db_active[ji->active_ag]); + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } + spin_unlock_irq(&ji->ag_lock); + } + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extBrealloc() + * + * FUNCTION: attempt to extend an extent's allocation. + * + * Initially, we will try to extend the extent's allocation + * in place. If this fails, we'll try to move the extent + * to a new set of blocks. If moving the extent, we initially + * will try to allocate disk blocks for the requested size + * (newnblks). if this fails (new contiguous free blocks not + * available), we'll try to allocate a smaller number of + * blocks (producing a smaller extent), with this smaller + * number of blocks consisting of the requested number of + * blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). We'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * blkno - starting block number of the extents current allocation. 
+ * nblks - number of blocks within the extents current allocation. + * newnblks - pointer to a s64 value. on entry, this value is the + * the new desired extent size (number of blocks). on + * successful exit, this value is set to the extent's actual + * new size (new number of blocks). + * newblkno - the starting block number of the extents new allocation. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBrealloc(struct inode *ip, + s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) +{ + int rc; + + /* try to extend in place */ + if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { + *newblkno = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* in place extension not possible. + * try to move the extent to a new set of blocks. + */ + return (extBalloc(ip, blkno, newnblks, newblkno)); +} +#endif /* _NOTYET */ + + +/* + * NAME: extRoundDown() + * + * FUNCTION: round down a specified number of blocks to the next + * smallest power of 2 number. + * + * PARAMETERS: + * nb - the inode of the file. + * + * RETURN VALUES: + * next smallest power of 2 number. + */ +static s64 extRoundDown(s64 nb) +{ + int i; + u64 m, k; + + for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) { + if (m & nb) + break; + } + + i = 63 - i; + k = (u64) 1 << i; + k = ((k - 1) & nb) ? k : k >> 1; + + return (k); +} diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h new file mode 100644 index 00000000..b567e12c --- /dev/null +++ b/fs/jfs/jfs_extent.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_EXTENT +#define _H_JFS_EXTENT + +/* get block allocation allocation hint as location of disk inode */ +#define INOHINT(ip) \ + (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) + +extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); +extern int extFill(struct inode *, xad_t *); +extern int extHint(struct inode *, s64, xad_t *); +extern int extRealloc(struct inode *, s64, xad_t *, bool); +extern int extRecord(struct inode *, xad_t *); + +#endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h new file mode 100644 index 00000000..b3f5463f --- /dev/null +++ b/fs/jfs/jfs_filsys.h @@ -0,0 +1,282 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_FILSYS +#define _H_JFS_FILSYS + +/* + * jfs_filsys.h + * + * file system (implementation-dependent) constants + * + * refer to for system wide implementation-dependent constants + */ + +/* + * file system option (superblock flag) + */ + +/* directory option */ +#define JFS_UNICODE 0x00000001 /* unicode name */ + +/* mount time flags for error handling */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ + +/* Quota support */ +#define JFS_USRQUOTA 0x00000010 +#define JFS_GRPQUOTA 0x00000020 + +/* mount time flag to disable journaling to disk */ +#define JFS_NOINTEGRITY 0x00000040 + +/* commit option */ +#define JFS_COMMIT 0x00000f00 /* commit option mask */ +#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ +#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ +#define JFS_TMPFS 0x00000400 /* temporary file system - + * do not log/commit: + * Never implemented + */ + +/* log logical volume option */ +#define JFS_INLINELOG 0x00000800 /* inline log within file system */ +#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */ + +/* Secondary aggregate inode table */ +#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */ + +/* sparse regular file support */ +#define JFS_SPARSE 0x00020000 /* sparse regular file */ + +/* DASD Limits F226941 */ +#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ +#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ + +/* big endian flag */ +#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ + +/* Directory index */ +#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ + +/* platform options */ +#define JFS_LINUX 0x10000000 /* Linux support */ +#define JFS_DFS 0x20000000 /* DCE DFS LFS support */ +/* Never implemented */ + +#define JFS_OS2 0x40000000 /* OS/2 support */ +/* case-insensitive name/directory support */ + +#define JFS_AIX 0x80000000 /* AIX support */ + +/* + * buffer cache configuration + */ +/* page size */ +#ifdef PSIZE +#undef PSIZE +#endif +#define PSIZE 4096 /* page size (in byte) */ +#define L2PSIZE 12 /* log2(PSIZE) */ +#define POFFSET 4095 /* offset within page */ + +/* buffer page size */ +#define BPSIZE PSIZE + +/* + * fs fundamental size + * + * PSIZE >= file system block size >= PBSIZE >= DISIZE + */ +#define PBSIZE 512 /* physical block size (in byte) */ +#define L2PBSIZE 9 /* log2(PBSIZE) */ + +#define DISIZE 512 /* on-disk inode size (in byte) */ +#define L2DISIZE 9 /* log2(DISIZE) */ + +#define IDATASIZE 256 /* inode inline data size */ +#define IXATTRSIZE 128 /* inode inline extended attribute size */ + +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 + +#define IAG_SIZE 4096 +#define IAG_EXTENT_SIZE 4096 +#define INOSPERIAG 4096 /* number of disk inodes per iag */ +#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ +#define INOSPEREXT 32 /* number of disk inode per extent */ +#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */ +#define 
IXSIZE (DISIZE * INOSPEREXT) /* inode extent size */ +#define INOSPERPAGE 8 /* number of disk inodes per 4K page */ +#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */ + +#define IAGFREELIST_LWM 64 + +#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */ +#define NUM_INODE_PER_EXTENT INOSPEREXT +#define NUM_INODE_PER_IAG INOSPERIAG + +#define MINBLOCKSIZE 512 +#define MAXBLOCKSIZE 4096 +#define MAXFILESIZE ((s64)1 << 52) + +#define JFS_LINK_MAX 0xffffffff + +/* Minimum number of bytes supported for a JFS partition */ +#define MINJFS (0x1000000) +#define MINJFSTEXT "16" + +/* + * file system block size -> physical block size + */ +#define LBOFFSET(x) ((x) & (PBSIZE - 1)) +#define LBNUMBER(x) ((x) >> L2PBSIZE) +#define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE)) +#define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE)) +/* size in byte -> last page number */ +#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) ) +/* size in byte -> last file system block number */ +#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) ) + +/* + * fixed physical block address (physical block size = 512 byte) + * + * NOTE: since we can't guarantee a physical block size of 512 bytes the use of + * these macros should be removed and the byte offset macros used instead. + */ +#define SUPER1_B 64 /* primary superblock */ +#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */ +#define AITBL_B (AIMAP_B + 16) /* + * 1st extent of aggregate inode table + */ +#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn */ +#define BMAP_B (SUPER2_B + 8) /* block allocation map */ + +/* + * SIZE_OF_SUPER defines the total amount of space reserved on disk for the + * superblock. This is not the same as the superblock structure, since all of + * this space is not currently being used. + */ +#define SIZE_OF_SUPER PSIZE + +/* + * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table + */ +#define SIZE_OF_AG_TABLE PSIZE + +/* + * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of + * the inode allocation map (to hold iag) + */ +#define SIZE_OF_MAP_PAGE PSIZE + +/* + * fixed byte offset address + */ +#define SUPER1_OFF 0x8000 /* primary superblock */ +#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER) + /* + * Control page of aggregate inode map + * followed by 1st extent of map + */ +#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) + /* + * 1st extent of aggregate inode table + */ +#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) + /* + * secondary superblock + */ +#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER) + /* + * block allocation map + */ + +/* + * The following macro is used to indicate the number of reserved disk blocks at + * the front of an aggregate, in terms of physical blocks. This value is + * currently defined to be 32K. This turns out to be the same as the primary + * superblock's address, since it directly follows the reserved blocks. + */ +#define AGGR_RSVD_BLOCKS SUPER1_B + +/* + * The following macro is used to indicate the number of reserved bytes at the + * front of an aggregate. This value is currently defined to be 32K. This + * turns out to be the same as the primary superblock's byte offset, since it + * directly follows the reserved blocks. + */ +#define AGGR_RSVD_BYTES SUPER1_OFF + +/* + * The following macro defines the byte offset for the first inode extent in + * the aggregate inode table. This allows us to find the self inode to find the + * rest of the table. Currently this value is 44K. 
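+ * Worked through from the macros above: SUPER1_OFF = 0x8000 (32K),
+ * AIMAP_OFF = 32K + 4K = 36K, AITBL_OFF = 36K + (4K << 1) = 44K,
+ * SUPER2_OFF = 44K + 16K (INODE_EXTENT_SIZE = 512 * 32 bytes) = 60K
+ * and BMAP_OFF = 60K + 4K = 64K.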
+ */ +#define AGGR_INODE_TABLE_START AITBL_OFF + +/* + * fixed reserved inode number + */ +/* aggregate inode */ +#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */ +#define AGGREGATE_I 1 /* aggregate inode map inode */ +#define BMAP_I 2 /* aggregate block allocation map inode */ +#define LOG_I 3 /* aggregate inline log inode */ +#define BADBLOCK_I 4 /* aggregate bad block inode */ +#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait: + * fileset inode map inode + */ + +/* per fileset inode */ +#define FILESET_RSVD_I 0 /* fileset inode (reserved) */ +#define FILESET_EXT_I 1 /* fileset inode extension */ +#define ROOT_I 2 /* fileset root inode */ +#define ACL_I 3 /* fileset ACL inode */ + +#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file + * or directory or link... + */ +#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes + * an inode. (To fsck this is also the first + * inode in part 2 of the agg inode table.) + */ + +/* + * directory configuration + */ +#define JFS_NAME_MAX 255 +#define JFS_PATH_MAX BPSIZE + + +/* + * file system state (superblock state) + */ +#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ +#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ +#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean + * when mounted or + * commit failure occurred while being mounted: + * fsck() must be run to repair + */ +#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: + * fsck() must be run to repair + */ +#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ + +#endif /* _H_JFS_FILSYS */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c new file mode 100644 index 00000000..b78b2f97 --- /dev/null +++ b/fs/jfs/jfs_imap.c @@ -0,0 +1,3187 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_imap.c: inode allocation map manager + * + * Serialization: + * Each AG has a simple lock which is used to control the serialization of + * the AG level lists. This lock should be taken first whenever an AG + * level list will be modified or accessed. + * + * Each IAG is locked by obtaining the buffer for the IAG page. + * + * There is also a inode lock for the inode map inode. A read lock needs to + * be taken whenever an IAG is read from the map or the global level + * information is read. A write lock needs to be taken whenever the global + * level information is modified or an atomic operation needs to be used. + * + * If more than one IAG is read at one time, the read lock may not + * be given up until all of the IAG's are read. Otherwise, a deadlock + * may occur when trying to obtain the read lock while another thread + * holding the read lock is waiting on the IAG already being held. 
+ *
+ * The control page of the inode map is read into memory by diMount().
+ * Thereafter it should only be modified in memory and then it will be
+ * written out when the filesystem is unmounted by diUnmount().
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * imap locks
+ */
+/* iag free list lock */
+#define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock)
+#define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock)
+#define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock)
+
+/* per ag iag list locks */
+#define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index]))
+#define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno])
+#define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno])
+
+/*
+ * forward references
+ */
+static int diAllocAG(struct inomap *, int, bool, struct inode *);
+static int diAllocAny(struct inomap *, int, bool, struct inode *);
+static int diAllocBit(struct inomap *, struct iag *, int);
+static int diAllocExt(struct inomap *, int, struct inode *);
+static int diAllocIno(struct inomap *, int, struct inode *);
+static int diFindFree(u32, int);
+static int diNewExt(struct inomap *, struct iag *, int);
+static int diNewIAG(struct inomap *, int *, int, struct metapage **);
+static void duplicateIXtree(struct super_block *, s64, int, s64 *);
+
+static int diIAGRead(struct inomap * imap, int, struct metapage **);
+static int copy_from_dinode(struct dinode *, struct inode *);
+static void copy_to_dinode(struct dinode *, struct inode *);
+
+/*
+ * NAME: diMount()
+ *
+ * FUNCTION: initialize the incore inode map control structures for
+ * a fileset or aggregate init time.
+ *
+ * the inode map's control structure (dinomap) is
+ * brought in from disk and placed in virtual memory.
+ *
+ * PARAMETERS:
+ * ipimap - pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOMEM - insufficient free virtual memory.
+ * -EIO - i/o error.
+ */
+int diMount(struct inode *ipimap)
+{
+ struct inomap *imap;
+ struct metapage *mp;
+ int index;
+ struct dinomap_disk *dinom_le;
+
+ /*
+ * allocate/initialize the in-memory inode map control structure
+ */
+ /* allocate the in-memory inode map control structure. */
+ imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+ if (imap == NULL) {
+ jfs_err("diMount: kmalloc returned NULL!");
+ return -ENOMEM;
+ }
+
+ /* read the on-disk inode map control structure. */
+
+ mp = read_metapage(ipimap,
+ IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+ PSIZE, 0);
+ if (mp == NULL) {
+ kfree(imap);
+ return -EIO;
+ }
+
+ /* copy the on-disk version to the in-memory version.
*/ + dinom_le = (struct dinomap_disk *) mp->data; + imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); + imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); + atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); + atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); + imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); + imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + imap->im_agctl[index].inofree = + le32_to_cpu(dinom_le->in_agctl[index].inofree); + imap->im_agctl[index].extfree = + le32_to_cpu(dinom_le->in_agctl[index].extfree); + imap->im_agctl[index].numinos = + le32_to_cpu(dinom_le->in_agctl[index].numinos); + imap->im_agctl[index].numfree = + le32_to_cpu(dinom_le->in_agctl[index].numfree); + } + + /* release the buffer. */ + release_metapage(mp); + + /* + * allocate/initialize inode allocation map locks + */ + /* allocate and init iag free list lock */ + IAGFREE_LOCK_INIT(imap); + + /* allocate and init ag list locks */ + for (index = 0; index < MAXAG; index++) { + AG_LOCK_INIT(imap, index); + } + + /* bind the inode map inode and inode map control structure + * to each other. + */ + imap->im_ipimap = ipimap; + JFS_IP(ipimap)->i_imap = imap; + + return (0); +} + + +/* + * NAME: diUnmount() + * + * FUNCTION: write to disk the incore inode map control structures for + * a fileset or aggregate at unmount time. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. + */ +int diUnmount(struct inode *ipimap, int mounterror) +{ + struct inomap *imap = JFS_IP(ipimap)->i_imap; + + /* + * update the on-disk inode map control structure + */ + + if (!(mounterror || isReadOnly(ipimap))) + diSync(ipimap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipimap->i_mapping, 0); + + /* + * free in-memory control structure + */ + kfree(imap); + + return (0); +} + + +/* + * diSync() + */ +int diSync(struct inode *ipimap) +{ + struct dinomap_disk *dinom_le; + struct inomap *imp = JFS_IP(ipimap)->i_imap; + struct metapage *mp; + int index; + + /* + * write imap global conrol page + */ + /* read the on-disk inode map control structure */ + mp = get_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("diSync: get_metapage failed!"); + return -EIO; + } + + /* copy the in-memory version to the on-disk version */ + dinom_le = (struct dinomap_disk *) mp->data; + dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); + dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); + dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); + dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); + dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); + dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + dinom_le->in_agctl[index].inofree = + cpu_to_le32(imp->im_agctl[index].inofree); + dinom_le->in_agctl[index].extfree = + cpu_to_le32(imp->im_agctl[index].extfree); + dinom_le->in_agctl[index].numinos = + cpu_to_le32(imp->im_agctl[index].numinos); + dinom_le->in_agctl[index].numfree = + cpu_to_le32(imp->im_agctl[index].numfree); + } + + /* write out the control structure */ + write_metapage(mp); + + /* + * write out dirty pages of imap + */ + filemap_write_and_wait(ipimap->i_mapping); + + diWriteSpecial(ipimap, 0); + + return 
(0); +} + + +/* + * NAME: diRead() + * + * FUNCTION: initialize an incore inode from disk. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * this routine handles incore inode initialization for + * both "special" and "regular" inodes. special inodes + * are those required early in the mount process and + * require special handling since much of the file system + * is not yet initialized. these "special" inodes are + * identified by a NULL inode map inode pointer and are + * actually initialized by a call to diReadSpecial(). + * + * for regular inodes, the iag describing the disk inode + * is read from disk to determine the inode extent address + * for the disk inode. with the inode extent address in + * hand, the page of the extent that contains the disk + * inode is read and the disk inode is copied to the + * incore inode. + * + * PARAMETERS: + * ip - pointer to incore inode to be initialized from disk. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory + * + */ +int diRead(struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int iagno, ino, extno, rc; + struct inode *ipimap; + struct dinode *dp; + struct iag *iagp; + struct metapage *mp; + s64 blkno, agstart; + struct inomap *imap; + int block_offset; + int inodes_left; + unsigned long pageno; + int rel_inode; + + jfs_info("diRead: ino = %ld", ip->i_ino); + + ipimap = sbi->ipimap; + JFS_IP(ip)->ipimap = ipimap; + + /* determine the iag number for this inode (number) */ + iagno = INOTOIAG(ip->i_ino); + + /* read the iag */ + imap = JFS_IP(ipimap)->i_imap; + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) { + jfs_err("diRead: diIAGRead returned %d", rc); + return (rc); + } + + iagp = (struct iag *) mp->data; + + /* determine inode extent that holds the disk inode */ + ino = ip->i_ino & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + + if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + release_metapage(mp); + return -ESTALE; + } + + /* get disk block number of the page within the inode extent + * that holds the disk inode. 
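+ * for example, with the 512-byte disk inodes and 4K pages defined in
+ * jfs_filsys.h (INOSPERPAGE = 8), inode 19 of an extent normally lives
+ * in the extent's third page at slot 19 & 7 = 3; the block_offset
+ * adjustment below only matters when an OS/2-created extent is not
+ * page aligned.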
+ */ + blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); + + /* get the ag for the iag */ + agstart = le64_to_cpu(iagp->agstart); + + release_metapage(mp); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + + /* read the page of disk inode */ + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) { + jfs_err("diRead: read_metapage failed"); + return -EIO; + } + + /* locate the disk inode requested */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + if (ip->i_ino != le32_to_cpu(dp->di_number)) { + jfs_error(ip->i_sb, "diRead: i_ino != di_number"); + rc = -EIO; + } else if (le32_to_cpu(dp->di_nlink) == 0) + rc = -ESTALE; + else + /* copy the disk inode to the in-memory inode */ + rc = copy_from_dinode(dp, ip); + + release_metapage(mp); + + /* set the ag for the inode */ + JFS_IP(ip)->agstart = agstart; + JFS_IP(ip)->active_ag = -1; + + return (rc); +} + + +/* + * NAME: diReadSpecial() + * + * FUNCTION: initialize a 'special' inode from disk. + * + * this routines handles aggregate level inodes. The + * inode cache cannot differentiate between the + * aggregate inodes and the filesystem inodes, so we + * handle these here. We don't actually use the aggregate + * inode map, since these inodes are at a fixed location + * and in some cases the aggregate inode map isn't initialized + * yet. + * + * PARAMETERS: + * sb - filesystem superblock + * inum - aggregate inode number + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: + * new inode - success + * NULL - i/o error. 
+ */ +struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + uint address; + struct dinode *dp; + struct inode *ip; + struct metapage *mp; + + ip = new_inode(sb); + if (ip == NULL) { + jfs_err("diReadSpecial: new_inode returned NULL!"); + return ip; + } + + if (secondary) { + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + JFS_IP(ip)->ipimap = sbi->ipaimap2; + } else { + address = AITBL_OFF >> L2PSIZE; + JFS_IP(ip)->ipimap = sbi->ipaimap; + } + + ASSERT(inum < INOSPEREXT); + + ip->i_ino = inum; + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + ip->i_nlink = 1; /* Don't want iput() deleting it */ + iput(ip); + return (NULL); + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + if ((copy_from_dinode(dp, ip)) != 0) { + /* handle bad return by returning NULL for ip */ + ip->i_nlink = 1; /* Don't want iput() deleting it */ + iput(ip); + /* release the page */ + release_metapage(mp); + return (NULL); + + } + + ip->i_mapping->a_ops = &jfs_metapage_aops; + mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS); + + /* Allocations to metadata inodes should not affect quotas */ + ip->i_flags |= S_NOQUOTA; + + if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { + sbi->gengen = le32_to_cpu(dp->di_gengen); + sbi->inostamp = le32_to_cpu(dp->di_inostamp); + } + + /* release the page */ + release_metapage(mp); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want special inodes in the fileset inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. 
+ */ + hlist_add_fake(&ip->i_hash); + + return (ip); +} + +/* + * NAME: diWriteSpecial() + * + * FUNCTION: Write the special inode to disk + * + * PARAMETERS: + * ip - special inode + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: none + */ + +void diWriteSpecial(struct inode *ip, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + uint address; + struct dinode *dp; + ino_t inum = ip->i_ino; + struct metapage *mp; + + if (secondary) + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + else + address = AITBL_OFF >> L2PSIZE; + + ASSERT(inum < INOSPEREXT); + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + jfs_err("diWriteSpecial: failed to read aggregate inode " + "extent!"); + return; + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + copy_to_dinode(dp, ip); + memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); + + if (inum == FILESYSTEM_I) + dp->di_gengen = cpu_to_le32(sbi->gengen); + + /* write the page */ + write_metapage(mp); +} + +/* + * NAME: diFreeSpecial() + * + * FUNCTION: Free allocated space for special inode + */ +void diFreeSpecial(struct inode *ip) +{ + if (ip == NULL) { + jfs_err("diFreeSpecial called with NULL ip!"); + return; + } + filemap_write_and_wait(ip->i_mapping); + truncate_inode_pages(ip->i_mapping, 0); + iput(ip); +} + + + +/* + * NAME: diWrite() + * + * FUNCTION: write the on-disk inode portion of the in-memory inode + * to its corresponding on-disk inode. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * the inode contains the inode extent address for the disk + * inode. with the inode extent address in hand, the + * page of the extent that contains the disk inode is + * read and the disk inode portion of the incore inode + * is copied to the disk inode. + * + * PARAMETERS: + * tid - transacation id + * ip - pointer to incore inode to be written to the inode extent. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. 
+ */ +int diWrite(tid_t tid, struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + s32 ino; + struct dinode *dp; + s64 blkno; + int block_offset; + int inodes_left; + struct metapage *mp; + unsigned long pageno; + int rel_inode; + int dioffset; + struct inode *ipimap; + uint type; + lid_t lid; + struct tlock *ditlck, *tlck; + struct linelock *dilinelock, *ilinelock; + struct lv *lv; + int n; + + ipimap = jfs_ip->ipimap; + + ino = ip->i_ino & (INOSPERIAG - 1); + + if (!addressPXD(&(jfs_ip->ixpxd)) || + (lengthPXD(&(jfs_ip->ixpxd)) != + JFS_IP(ipimap)->i_imap->im_nbperiext)) { + jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); + return -EIO; + } + + /* + * read the page of disk inode containing the specified inode: + */ + /* compute the block address of the page */ + blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + /* read the page of disk inode */ + retry: + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) + return -EIO; + + /* get the pointer to the disk inode */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; + + /* + * acquire transaction lock on the on-disk inode; + * N.B. tlock is acquired on ipimap not ip; + */ + if ((ditlck = + txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) + goto retry; + dilinelock = (struct linelock *) & ditlck->lock; + + /* + * copy btree root from in-memory inode to on-disk inode + * + * (tlock is taken from inline B+-tree root in in-memory + * inode when the B+-tree root is updated, which is pointed + * by jfs_ip->blid as well as being on tx tlock list) + * + * further processing of btree root is based on the copy + * in in-memory inode, where txLog() will log from, and, + * for xtree root, txUpdateMap() will update map and reset + * XAD_NEW bit; + */ + + if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { + /* + * This is the special xtree inside the directory for storing + * the directory table + */ + xtpage_t *p, *xp; + xad_t *xad; + + jfs_ip->xtlid = 0; + tlck = lid_to_tlock(lid); + assert(tlck->type & tlckXTREE); + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = (xtpage_t *) &dp->di_dirtable; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + + if ((lid = jfs_ip->blid) == 0) + goto inlineData; + jfs_ip->blid = 0; + + tlck = lid_to_tlock(lid); + type = tlck->type; + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * regular file: 16 byte (XAD slot) granularity + */ + if (type & tlckXTREE) { + xtpage_t *p, *xp; + xad_t 
*xad; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = &dp->di_xtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + /* + * directory: 32 byte (directory entry slot) granularity + */ + else if (type & tlckDTREE) { + dtpage_t *p, *xp; + + /* + * copy dtree root from inode to dinode: + */ + p = (dtpage_t *) &jfs_ip->i_dtroot; + xp = (dtpage_t *) & dp->di_dtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], + lv->length << L2DTSLOTSIZE); + } + } else { + jfs_err("diWrite: UFO tlock"); + } + + inlineData: + /* + * copy inline symlink from in-memory inode to on-disk inode + */ + if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; + lv->length = 2; + memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); + dilinelock->index++; + } + /* + * copy inline data from in-memory inode to on-disk inode: + * 128 byte slot granularity + */ + if (test_cflag(COMMIT_Inlineea, ip)) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; + lv->length = 1; + memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); + dilinelock->index++; + + clear_cflag(COMMIT_Inlineea, ip); + } + + /* + * lock/copy inode base: 128 byte slot granularity + */ + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = dioffset >> L2INODESLOTSIZE; + copy_to_dinode(dp, ip); + if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { + lv->length = 2; + memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); + } else + lv->length = 1; + dilinelock->index++; + + /* release the buffer holding the updated on-disk inode. + * the buffer will be later written by commit processing. + */ + write_metapage(mp); + + return (rc); +} + + +/* + * NAME: diFree(ip) + * + * FUNCTION: free a specified inode from the inode working map + * for a fileset or aggregate. + * + * if the inode to be freed represents the first (only) + * free inode within the iag, the iag will be placed on + * the ag free inode list. + * + * freeing the inode will cause the inode extent to be + * freed if the inode is the only allocated inode within + * the extent. in this case all the disk resource backing + * up the inode extent will be freed. in addition, the iag + * will be placed on the ag extent free list if the extent + * is the first free extent in the iag. if freeing the + * extent also means that no free inodes will exist for + * the iag, the iag will also be removed from the ag free + * inode list. + * + * the iag describing the inode will be freed if the extent + * is to be freed and it is the only backed extent within + * the iag. in this case, the iag will be removed from the + * ag free extent list and ag free inode list and placed on + * the inode map's free iag list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. 
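+ * in this routine that can mean holding up to five metapages at
+ * once: the iag being freed from (mp below) and the iags at the head
+ * of, or adjacent to it on, the ag extent free list and ag inode free
+ * list (amp, bmp, cmp and dmp), none of which is written back until
+ * every list and map update has been made.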
+ *
+ * PARAMETERS:
+ * ip - inode to be freed.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error.
+ */
+int diFree(struct inode *ip)
+{
+ int rc;
+ ino_t inum = ip->i_ino;
+ struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
+ struct metapage *mp, *amp, *bmp, *cmp, *dmp;
+ int iagno, ino, extno, bitno, sword, agno;
+ int back, fwd;
+ u32 bitmap, mask;
+ struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
+ struct inomap *imap = JFS_IP(ipimap)->i_imap;
+ pxd_t freepxd;
+ tid_t tid;
+ struct inode *iplist[3];
+ struct tlock *tlck;
+ struct pxd_lock *pxdlock;
+
+ /*
+ * This is just to suppress compiler warnings. The same logic that
+ * references these variables is used to initialize them.
+ */
+ aiagp = biagp = ciagp = diagp = NULL;
+
+ /* get the iag number containing the inode.
+ */
+ iagno = INOTOIAG(inum);
+
+ /* make sure that the iag is contained within
+ * the map.
+ */
+ if (iagno >= imap->im_nextiag) {
+ print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
+ imap, 32, 0);
+ jfs_error(ip->i_sb,
+ "diFree: inum = %d, iagno = %d, nextiag = %d",
+ (uint) inum, iagno, imap->im_nextiag);
+ return -EIO;
+ }
+
+ /* get the allocation group for this ino.
+ */
+ agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb));
+
+ /* Lock the AG specific inode map information
+ */
+ AG_LOCK(imap, agno);
+
+ /* Obtain read lock in imap inode. Don't release it until we have
+ * read all of the IAG's that we are going to.
+ */
+ IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+
+ /* read the iag.
+ */
+ if ((rc = diIAGRead(imap, iagno, &mp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ return (rc);
+ }
+ iagp = (struct iag *) mp->data;
+
+ /* get the inode number and extent number of the inode within
+ * the iag and the inode number within the extent.
+ */
+ ino = inum & (INOSPERIAG - 1);
+ extno = ino >> L2INOSPEREXT;
+ bitno = ino & (INOSPEREXT - 1);
+ mask = HIGHORDER >> bitno;
+
+ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+ jfs_error(ip->i_sb,
+ "diFree: wmap shows inode already free");
+ }
+
+ if (!addressPXD(&iagp->inoext[extno])) {
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ jfs_error(ip->i_sb, "diFree: invalid inoext");
+ return -EIO;
+ }
+
+ /* compute the bitmap for the extent reflecting the freed inode.
+ */
+ bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
+
+ if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ jfs_error(ip->i_sb, "diFree: numfree > numinos");
+ return -EIO;
+ }
+ /*
+ * inode extent still has some inodes or below low water mark:
+ * keep the inode extent;
+ */
+ if (bitmap ||
+ imap->im_agctl[agno].numfree < 96 ||
+ (imap->im_agctl[agno].numfree < 288 &&
+ (((imap->im_agctl[agno].numfree * 100) /
+ imap->im_agctl[agno].numinos) <= 25))) {
+ /* if the iag currently has no free inodes (i.e.,
+ * the inode being freed is the first free inode of iag),
+ * insert the iag at head of the inode free list for the ag.
+ */
+ if (iagp->nfreeinos == 0) {
+ /* check if there are any iags on the ag inode
+ * free list. if so, read the first one so that
+ * we can link the current iag onto the list at
+ * the head.
+ */
+ if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
+ /* read the iag that currently is the head
+ * of the list.
+ */
+ if ((rc = diIAGRead(imap, fwd, &amp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ release_metapage(mp);
+ return (rc);
+ }
+ aiagp = (struct iag *) amp->data;
+
+ /* make current head point back to the iag.
+ */
+ aiagp->inofreeback = cpu_to_le32(iagno);
+
+ write_metapage(amp);
+ }
+
+ /* iag points forward to current head and iag
+ * becomes the new head of the list.
+ */
+ iagp->inofreefwd =
+ cpu_to_le32(imap->im_agctl[agno].inofree);
+ iagp->inofreeback = cpu_to_le32(-1);
+ imap->im_agctl[agno].inofree = iagno;
+ }
+ IREAD_UNLOCK(ipimap);
+
+ /* update the free inode summary map for the extent if
+ * freeing the inode means the extent will now have free
+ * inodes (i.e., the inode being freed is the first free
+ * inode of extent),
+ */
+ if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+ sword = extno >> L2EXTSPERSUM;
+ bitno = extno & (EXTSPERSUM - 1);
+ iagp->inosmap[sword] &=
+ cpu_to_le32(~(HIGHORDER >> bitno));
+ }
+
+ /* update the bitmap.
+ */
+ iagp->wmap[extno] = cpu_to_le32(bitmap);
+
+ /* update the free inode counts at the iag, ag and
+ * map level.
+ */
+ le32_add_cpu(&iagp->nfreeinos, 1);
+ imap->im_agctl[agno].numfree += 1;
+ atomic_inc(&imap->im_numfree);
+
+ /* release the AG inode map lock
+ */
+ AG_UNLOCK(imap, agno);
+
+ /* write the iag */
+ write_metapage(mp);
+
+ return (0);
+ }
+
+
+ /*
+ * inode extent has become free and above low water mark:
+ * free the inode extent;
+ */
+
+ /*
+ * prepare to update iag list(s) (careful update step 1)
+ */
+ amp = bmp = cmp = dmp = NULL;
+ fwd = back = -1;
+
+ /* check if the iag currently has no free extents. if so,
+ * it will be placed on the head of the ag extent free list.
+ */
+ if (iagp->nfreeexts == 0) {
+ /* check if the ag extent free list has any iags.
+ * if so, read the iag at the head of the list now.
+ * this (head) iag will be updated later to reflect
+ * the addition of the current iag at the head of
+ * the list.
+ */
+ if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (struct iag *) amp->data;
+ }
+ } else {
+ /* iag has free extents. check if the addition of a free
+ * extent will cause all extents to be free within this
+ * iag. if so, the iag will be removed from the ag extent
+ * free list and placed on the inode map's free iag list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+ /* in preparation for removing the iag from the
+ * ag extent free list, read the iags preceding
+ * and following the iag on the ag extent free
+ * list.
+ */
+ if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (struct iag *) amp->data;
+ }
+
+ if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+ if ((rc = diIAGRead(imap, back, &bmp)))
+ goto error_out;
+ biagp = (struct iag *) bmp->data;
+ }
+ }
+ }
+
+ /* remove the iag from the ag inode free list if freeing
+ * this extent causes the iag to have no free inodes.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+ int inofreeback = le32_to_cpu(iagp->inofreeback);
+ int inofreefwd = le32_to_cpu(iagp->inofreefwd);
+
+ /* in preparation for removing the iag from the
+ * ag inode free list, read the iags preceding
+ * and following the iag on the ag inode free
+ * list. before reading these iags, we must make
+ * sure that we already don't have them in hand
+ * from up above, since re-reading an iag (buffer)
+ * we are currently holding would cause a deadlock.
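+ * that is why inofreefwd and inofreeback are first compared against
+ * the iags already read above (fwd and back) and, when they match,
+ * the buffers already in hand (amp, bmp) are reused rather than
+ * calling diIAGRead() again.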
+ */ + if (inofreefwd >= 0) { + + if (inofreefwd == fwd) + ciagp = (struct iag *) amp->data; + else if (inofreefwd == back) + ciagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreefwd, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + assert(ciagp != NULL); + } + + if (inofreeback >= 0) { + if (inofreeback == fwd) + diagp = (struct iag *) amp->data; + else if (inofreeback == back) + diagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreeback, &dmp))) + goto error_out; + diagp = (struct iag *) dmp->data; + } + assert(diagp != NULL); + } + } + + IREAD_UNLOCK(ipimap); + + /* + * invalidate any page of the inode extent freed from buffer cache; + */ + freepxd = iagp->inoext[extno]; + invalidate_pxd_metapages(ip, freepxd); + + /* + * update iag list(s) (careful update step 2) + */ + /* add the iag to the ag extent free list if this is the + * first free extent for the iag. + */ + if (iagp->nfreeexts == 0) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = + cpu_to_le32(imap->im_agctl[agno].extfree); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } else { + /* remove the iag from the ag extent list if all extents + * are now free and place it on the inode map iag free list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent causes the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) + ciagp->inofreeback = iagp->inofreeback; + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) + diagp->inofreefwd = iagp->inofreefwd; + else + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the inode extent address and working map + * to reflect the free extent. + * the permanent map should have been updated already + * for the inode being freed. + */ + if (iagp->pmap[extno] != 0) { + jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); + } + iagp->wmap[extno] = 0; + PXDlength(&iagp->inoext[extno], 0); + PXDaddress(&iagp->inoext[extno], 0); + + /* update the free extent and free inode summary maps + * to reflect the freed extent. + * the inode summary map is marked to indicate no inodes + * available for the freed extent. + */ + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + mask = HIGHORDER >> bitno; + iagp->inosmap[sword] |= cpu_to_le32(mask); + iagp->extsmap[sword] &= cpu_to_le32(~mask); + + /* update the number of free inodes and number of free extents + * for the iag. + */ + le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, 1); + + /* update the number of free inodes and backed inodes + * at the ag and inode map level. 
+ */ + imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); + imap->im_agctl[agno].numinos -= INOSPEREXT; + atomic_sub(INOSPEREXT - 1, &imap->im_numfree); + atomic_sub(INOSPEREXT, &imap->im_numinos); + + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + if (dmp) + write_metapage(dmp); + + /* + * start transaction to update block allocation map + * for the inode extent freed; + * + * N.B. AG_LOCK is released and iag will be released below, and + * other thread may allocate inode from/reusing the ixad freed + * BUT with new/different backing inode extent from the extent + * to be freed by the transaction; + */ + tid = txBegin(ipimap->i_sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* acquire tlock of the iag page of the freed ixad + * to force the page NOHOMEOK (even though no data is + * logged from the iag page) until NOREDOPAGE|FREEXTENT log + * for the free of the extent is committed; + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor; + */ + tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = freepxd; + pxdlock->index = 1; + + write_metapage(mp); + + iplist[0] = ipimap; + + /* + * logredo needs the IAG number and IAG extent index in order + * to ensure that the IMap is consistent. The least disruptive + * way to pass these values through to the transaction manager + * is in the iplist array. + * + * It's not pretty, but it works. + */ + iplist[1] = (struct inode *) (size_t)iagno; + iplist[2] = (struct inode *) (size_t)extno; + + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* unlock the AG inode map information */ + AG_UNLOCK(imap, agno); + + return (0); + + error_out: + IREAD_UNLOCK(ipimap); + + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + if (dmp) + release_metapage(dmp); + + AG_UNLOCK(imap, agno); + + release_metapage(mp); + + return (rc); +} + +/* + * There are several places in the diAlloc* routines where we initialize + * the inode. + */ +static inline void +diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + ip->i_ino = (iagno << L2INOSPERIAG) + ino; + jfs_ip->ixpxd = iagp->inoext[extno]; + jfs_ip->agstart = le64_to_cpu(iagp->agstart); + jfs_ip->active_ag = -1; +} + + +/* + * NAME: diAlloc(pip,dir,ip) + * + * FUNCTION: allocate a disk inode from the inode working map + * for a fileset or aggregate. + * + * PARAMETERS: + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +int diAlloc(struct inode *pip, bool dir, struct inode *ip) +{ + int rc, ino, iagno, addext, extno, bitno, sword; + int nwords, rem, i, agno; + u32 mask, inosmap, extsmap; + struct inode *ipimap; + struct metapage *mp; + ino_t inum; + struct iag *iagp; + struct inomap *imap; + + /* get the pointers to the inode map inode and the + * corresponding imap control structure. 
+ */ + ipimap = JFS_SBI(pip->i_sb)->ipimap; + imap = JFS_IP(ipimap)->i_imap; + JFS_IP(ip)->ipimap = ipimap; + JFS_IP(ip)->fileset = FILESYSTEM_I; + + /* for a directory, the allocation policy is to start + * at the ag level using the preferred ag. + */ + if (dir) { + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + /* for files, the policy starts off by trying to allocate from + * the same iag containing the parent disk inode: + * try to allocate the new disk inode close to the parent disk + * inode, using parent disk inode number + 1 as the allocation + * hint. (we use a left-to-right policy to attempt to avoid + * moving backward on the disk.) compute the hint within the + * file system and the iag. + */ + + /* get the ag number of this iag */ + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); + + if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { + /* + * There is an open file actively growing. We want to + * allocate new inodes from a different ag to avoid + * fragmentation problems. + */ + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + inum = pip->i_ino + 1; + ino = inum & (INOSPERIAG - 1); + + /* back off the hint if it is outside of the iag */ + if (ino == 0) + inum = pip->i_ino; + + /* lock the AG inode map information */ + AG_LOCK(imap, agno); + + /* Get read lock on imap inode */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* get the iag number and read the iag */ + iagno = INOTOIAG(inum); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* determine if new inode extent is allowed to be added to the iag. + * new inode extent can be added to the iag if the ag + * has less than 32 free disk inodes and the iag has free extents. + */ + addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); + + /* + * try to allocate from the IAG + */ + /* check if the inode may be allocated from the iag + * (i.e. the inode has free inodes or new extent can be added). + */ + if (iagp->nfreeinos || addext) { + /* determine the extent number of the hint. + */ + extno = ino >> L2INOSPEREXT; + + /* check if the extent containing the hint has backed + * inodes. if so, try to allocate within this extent. + */ + if (addressPXD(&iagp->inoext[extno])) { + bitno = ino & (INOSPEREXT - 1); + if ((bitno = + diFindFree(le32_to_cpu(iagp->wmap[extno]), + bitno)) + < INOSPEREXT) { + ino = (extno << L2INOSPEREXT) + bitno; + + /* a free inode (bit) was found within this + * extent, so allocate it. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) { + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + } + + if (!addext) + extno = + (extno == + EXTSPERIAG - 1) ? 0 : extno + 1; + } + + /* + * no free inodes within the extent containing the hint. + * + * try to allocate from the backed extents following + * hint or, if appropriate (i.e. addext is true), allocate + * an extent of free inodes at or following the extent + * containing the hint. + * + * the free inode and free extent summary maps are used + * here, so determine the starting summary map position + * and the number of words we'll have to examine. 
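+ * (for instance, assuming the usual 32 extents per summary word, a
+ * hint in extent 37 starts the scan at summary word 1 with the bits
+ * for extents 32..36 masked off on the first pass and revisited only
+ * if the scan wraps around).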
again, + * the approach is to allocate following the hint, so we + * might have to initially ignore prior bits of the summary + * map that represent extents prior to the extent containing + * the hint and later revisit these bits. + */ + bitno = extno & (EXTSPERSUM - 1); + nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; + sword = extno >> L2EXTSPERSUM; + + /* mask any prior bits for the starting words of the + * summary map. + */ + mask = ONES << (EXTSPERSUM - bitno); + inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; + extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; + + /* scan the free inode and free extent summary maps for + * free resources. + */ + for (i = 0; i < nwords; i++) { + /* check if this word of the free inode summary + * map describes an extent with free inodes. + */ + if (~inosmap) { + /* an extent with free inodes has been + * found. determine the extent number + * and the inode number within the extent. + */ + rem = diFindFree(inosmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), + 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(ipimap); + release_metapage(mp); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, + "diAlloc: can't find free bit " + "in wmap"); + return -EIO; + } + + /* determine the inode number within the + * iag and allocate the inode from the + * map. + */ + ino = (extno << L2INOSPEREXT) + rem; + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) + assert(rc == -EIO); + else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + + } + + /* check if we may allocate an extent of free + * inodes and whether this word of the free + * extents summary map describes a free extent. + */ + if (addext && ~extsmap) { + /* a free extent has been found. determine + * the extent number. + */ + rem = diFindFree(extsmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + + /* allocate an extent of free inodes. + */ + if ((rc = diNewExt(imap, iagp, extno))) { + /* if there is no disk space for a + * new extent, try to allocate the + * disk inode from somewhere else. + */ + if (rc == -ENOSPC) + break; + + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, + extno << L2INOSPEREXT, + extno, iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + /* free the imap inode & the AG lock & return. + */ + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + + /* move on to the next set of summary map words. + */ + sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; + inosmap = le32_to_cpu(iagp->inosmap[sword]); + extsmap = le32_to_cpu(iagp->extsmap[sword]); + } + } + /* unlock imap inode */ + IREAD_UNLOCK(ipimap); + + /* nothing doing in this iag, so release it. */ + release_metapage(mp); + + tryag: + /* + * try to allocate anywhere within the same AG as the parent inode. + */ + rc = diAllocAG(imap, agno, dir, ip); + + AG_UNLOCK(imap, agno); + + if (rc != -ENOSPC) + return (rc); + + /* + * try to allocate in any AG. + */ + return (diAllocAny(imap, agno, dir, ip)); +} + + +/* + * NAME: diAllocAG(imap,agno,dir,ip) + * + * FUNCTION: allocate a disk inode from the allocation group. 
+ * + * this routine first determines if a new extent of free + * inodes should be added for the allocation group, with + * the current request satisfied from this extent. if this + * is the case, an attempt will be made to do just that. if + * this attempt fails or it has been determined that a new + * extent should not be added, an attempt is made to satisfy + * the request by allocating an existing (backed) free inode + * from the allocation group. + * + * PRE CONDITION: Already have the AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int rc, addext, numfree, numinos; + + /* get the number of free and the number of backed disk + * inodes currently within the ag. + */ + numfree = imap->im_agctl[agno].numfree; + numinos = imap->im_agctl[agno].numinos; + + if (numfree > numinos) { + jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); + return -EIO; + } + + /* determine if we should allocate a new extent of free inodes + * within the ag: for directory inodes, add a new extent + * if there are a small number of free inodes or number of free + * inodes is a small percentage of the number of backed inodes. + */ + if (dir) + addext = (numfree < 64 || + (numfree < 256 + && ((numfree * 100) / numinos) <= 20)); + else + addext = (numfree == 0); + + /* + * try to allocate a new extent of free inodes. + */ + if (addext) { + /* if free space is not available for this new extent, try + * below to allocate a free and existing (already backed) + * inode from the ag. + */ + if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) + return (rc); + } + + /* + * try to allocate an existing free inode from the ag. + */ + return (diAllocIno(imap, agno, ip)); +} + + +/* + * NAME: diAllocAny(imap,agno,dir,iap) + * + * FUNCTION: allocate a disk inode from any other allocation group. + * + * this routine is called when an allocation attempt within + * the primary allocation group has failed. if attempts to + * allocate an inode from any allocation group other than the + * specified primary group. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int ag, rc; + int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; + + + /* try to allocate from the ags following agno up to + * the maximum ag number. + */ + for (ag = agno + 1; ag <= maxag; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* try to allocate from the ags in front of agno. 
+ */ + for (ag = 0; ag < agno; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* no free disk inodes. + */ + return -ENOSPC; +} + + +/* + * NAME: diAllocIno(imap,agno,ip) + * + * FUNCTION: allocate a disk inode from the allocation group's free + * inode list, returning an error if this free list is + * empty (i.e. no iags on the list). + * + * allocation occurs from the first iag on the list using + * the iag's free inode summary map to find the leftmost + * free inode in the iag. + * + * PRE CONDITION: Already have AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) +{ + int iagno, ino, rc, rem, extno, sword; + struct metapage *mp; + struct iag *iagp; + + /* check if there are iags on the ag's free inode list. + */ + if ((iagno = imap->im_agctl[agno].inofree) < 0) + return -ENOSPC; + + /* obtain read lock on imap inode */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + + /* read the iag at the head of the list. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* better be free inodes in this iag if it is on the + * list. + */ + if (!iagp->nfreeinos) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, + "diAllocIno: nfreeinos = 0, but iag on freelist"); + return -EIO; + } + + /* scan the free inode summary map to find an extent + * with free inodes. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, + "diAllocIno: free inode not found in summary map"); + return -EIO; + } + + if (~iagp->inosmap[sword]) + break; + } + + /* found a extent with free inodes. determine + * the extent number. + */ + rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); + if (rem >= EXTSPERSUM) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "diAllocIno: no free extent found"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* find the first free inode in the extent. + */ + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "diAllocIno: free inode not found"); + return -EIO; + } + + /* compute the inode number within the iag. + */ + ino = (extno << L2INOSPEREXT) + rem; + + /* allocate the inode. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + release_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, ino, extno, iagp); + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocExt(imap,agno,ip) + * + * FUNCTION: add a new extent of free inodes to an iag, allocating + * an inode from this extent to satisfy the current allocation + * request. + * + * this routine first tries to find an existing iag with free + * extents through the ag free extent list. 
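The two-level lookup in diAllocIno() above goes summary word, then extent, then inode: a zero bit in inosmap picks the extent, and a zero bit in that extent's wmap word picks the inode. The standalone sketch below models only that index arithmetic; it assumes EXTSPERSUM = 32 and L2EXTSPERSUM = 5 from jfs_imap.h and the usual 32 inodes per extent (L2INOSPEREXT = 5), and it models diFindFree() as a leftmost-zero scan.

/* Standalone model of diAllocIno()'s summary-map arithmetic (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define L2EXTSPERSUM  5      /* 32 extents per summary word   */
#define L2INOSPEREXT  5      /* assumed: 32 inodes per extent */
#define HIGHORDER     0x80000000u

/* leftmost zero bit, in the spirit of diFindFree() */
static int find_free(uint32_t word, int start)
{
	int bitno;
	for (word <<= start, bitno = start; bitno < 32; bitno++, word <<= 1)
		if ((word & HIGHORDER) == 0)
			break;
	return bitno;
}

int main(void)
{
	/* hypothetical maps: summary word 2 has its extent bit 2 clear,
	 * and that extent's working map has its inode bit 6 clear */
	int sword = 2;
	uint32_t inosmap_word = 0xdfffffffu;
	uint32_t wmap_word    = 0xfdffffffu;

	int rem    = find_free(inosmap_word, 0);
	int extno  = (sword << L2EXTSPERSUM) + rem;
	int inobit = find_free(wmap_word, 0);
	int ino    = (extno << L2INOSPEREXT) + inobit;

	assert(rem == 2 && extno == 66);
	assert(inobit == 6 && ino == 66 * 32 + 6);
	printf("extent %d, inode %d within the iag\n", extno, ino);
	return 0;
}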
if list is not + * empty, the head of the list will be selected as the home + * of the new extent of free inodes. otherwise (the list is + * empty), a new iag will be allocated for the ag to contain + * the extent. + * + * once an iag has been selected, the free extent summary map + * is used to locate a free extent within the iag and diNewExt() + * is called to initialize the extent, with initialization + * including the allocation of the first inode of the extent + * for the purpose of satisfying this request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) +{ + int rem, iagno, sword, extno, rc; + struct metapage *mp; + struct iag *iagp; + + /* check if the ag has any iags with free extents. if not, + * allocate a new iag for the ag. + */ + if ((iagno = imap->im_agctl[agno].extfree) < 0) { + /* If successful, diNewIAG will obtain the read lock on the + * imap inode. + */ + if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { + return (rc); + } + iagp = (struct iag *) mp->data; + + /* set the ag number if this a brand new iag + */ + iagp->agstart = + cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); + } else { + /* read the iag. + */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocExt: error reading iag"); + return rc; + } + iagp = (struct iag *) mp->data; + } + + /* using the free extent summary map, find a free extent. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, + "diAllocExt: free ext summary map not found"); + return -EIO; + } + if (~iagp->extsmap[sword]) + break; + } + + /* determine the extent number of the free extent. + */ + rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); + if (rem >= EXTSPERSUM) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocExt: free extent not found"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* initialize the new extent. + */ + rc = diNewExt(imap, iagp, extno); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + /* something bad happened. if a new iag was allocated, + * place it back on the inode map's iag free list, and + * clear the ag number information. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + write_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); + + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocBit(imap,iagp,ino) + * + * FUNCTION: allocate a backed inode from an iag. + * + * this routine performs the mechanics of allocating a + * specified inode from a backed extent. + * + * if the inode to be allocated represents the last free + * inode within the iag, the iag will be removed from the + * ag free inode list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. 
under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) +{ + int extno, bitno, agno, sword, rc; + struct metapage *amp = NULL, *bmp = NULL; + struct iag *aiagp = NULL, *biagp = NULL; + u32 mask; + + /* check if this is the last free inode within the iag. + * if so, it will have to be removed from the ag free + * inode list, so get the iags preceding and following + * it on the list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { + if ((rc = + diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), + &amp))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { + if ((rc = + diIAGRead(imap, + le32_to_cpu(iagp->inofreeback), + &bmp))) { + if (amp) + release_metapage(amp); + return (rc); + } + biagp = (struct iag *) bmp->data; + } + } + + /* get the ag number, extent number, inode number within + * the extent. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + + /* compute the mask for setting the map. + */ + mask = HIGHORDER >> bitno; + + /* the inode should be free and backed. + */ + if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || + ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + + jfs_error(imap->im_ipimap->i_sb, + "diAllocBit: iag inconsistent"); + return -EIO; + } + + /* mark the inode as allocated in the working map. + */ + iagp->wmap[extno] |= cpu_to_le32(mask); + + /* check if all inodes within the extent are now + * allocated. if so, update the free inode summary + * map to reflect this. + */ + if (iagp->wmap[extno] == cpu_to_le32(ONES)) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); + } + + /* if this was the last free inode in the iag, remove the + * iag from the ag free inode list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if (amp) { + aiagp->inofreeback = iagp->inofreeback; + write_metapage(amp); + } + + if (bmp) { + biagp->inofreefwd = iagp->inofreefwd; + write_metapage(bmp); + } else { + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + } + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the free inode count at the iag, ag, inode + * map levels. + */ + le32_add_cpu(&iagp->nfreeinos, -1); + imap->im_agctl[agno].numfree -= 1; + atomic_dec(&imap->im_numfree); + + return (0); +} + + +/* + * NAME: diNewExt(imap,iagp,extno) + * + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. + * + * disk resources are allocated for the new extent of inodes + * and the inodes themselves are initialized to reflect their + * existence within the extent (i.e.
their inode numbers and + * inode extent addresses are set) and their initial state + * (mode and link count are set to zero). + * + * if the iag is new, it is not yet on an ag extent free list + * but will now be placed on this list. + * + * if the allocation of the new extent causes the iag to + * have no free extent, the iag will be removed from the + * ag extent free list. + * + * if the iag has no free backed inodes, it will be placed + * on the ag free inode list, since the addition of the new + * extent will now cause it to have free inodes. + * + * a careful update approach is used to provide consistency + * (i.e. list consistency) in the face of updates to multiple + * buffers. under this approach, all required buffers are + * obtained before making any updates and are held until all + * updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) +{ + int agno, iagno, fwd, back, freei = 0, sword, rc; + struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; + struct metapage *amp, *bmp, *cmp, *dmp; + struct inode *ipimap; + s64 blkno, hint; + int i, j; + u32 mask; + ino_t ino; + struct dinode *dp; + struct jfs_sb_info *sbi; + + /* better have free extents. + */ + if (!iagp->nfreeexts) { + jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); + return -EIO; + } + + /* get the inode map inode. + */ + ipimap = imap->im_ipimap; + sbi = JFS_SBI(ipimap->i_sb); + + amp = bmp = cmp = NULL; + + /* get the ag and iag numbers for this iag. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + iagno = le32_to_cpu(iagp->iagnum); + + /* check if this is the last free extent within the + * iag. if so, the iag must be removed from the ag + * free extent list, so get the iags preceding and + * following the iag on this list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &amp))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (struct iag *) bmp->data; + } + } else { + /* the iag has free extents. if all extents are free + * (as is the case for a newly allocated iag), the iag + * must be added to the ag free extent list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. + */ + fwd = back = -1; + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &amp))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + } + } + + /* check if the iag has no free inodes. if so, the iag + * will have to be added to the ag free inode list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. in doing this, we must + * check if we already have the iag at the head of + * the list in hand.
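Earlier, diAllocBit() commits a single inode to the working map with a HIGHORDER >> bitno mask, and once a wmap word reaches all ones the matching inosmap bit is set so later scans skip that extent. The sketch below is a minimal standalone model of that bit bookkeeping, assuming 32 inodes per extent (L2INOSPEREXT = 5) and the EXTSPERSUM = 32 summary-map geometry from jfs_imap.h.

/* Standalone model of diAllocBit()'s map updates (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define INOSPEREXT   32            /* assumed inodes per extent */
#define L2INOSPEREXT 5
#define EXTSPERSUM   32
#define L2EXTSPERSUM 5
#define HIGHORDER    0x80000000u
#define ONES         0xffffffffu

int main(void)
{
	uint32_t wmap[128] = { 0 };    /* working map, one word per extent */
	uint32_t inosmap[4] = { 0 };   /* summary map, one bit per extent  */
	int ino = 97;                  /* hypothetical inode within the iag */

	int extno = ino >> L2INOSPEREXT;        /* 97 / 32 = 3 */
	int bitno = ino & (INOSPEREXT - 1);     /* 97 % 32 = 1 */
	uint32_t mask = HIGHORDER >> bitno;

	wmap[extno] = ONES & ~mask;    /* pretend every other inode is taken */
	wmap[extno] |= mask;           /* allocate inode 97 in the working map */

	/* extent now full: flag it in the free inode summary map */
	if (wmap[extno] == ONES) {
		int sword = extno >> L2EXTSPERSUM;
		int sbit = extno & (EXTSPERSUM - 1);
		inosmap[sword] |= HIGHORDER >> sbit;
	}

	assert(wmap[3] == ONES);
	assert(inosmap[0] == (HIGHORDER >> 3));
	printf("extent %d full, summary bit set\n", extno);
	return 0;
}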
+ */ + if (iagp->nfreeinos == 0) { + freei = imap->im_agctl[agno].inofree; + + if (freei >= 0) { + if (freei == fwd) { + ciagp = aiagp; + } else if (freei == back) { + ciagp = biagp; + } else { + if ((rc = diIAGRead(imap, freei, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + if (ciagp == NULL) { + jfs_error(imap->im_ipimap->i_sb, + "diNewExt: ciagp == NULL"); + rc = -EIO; + goto error_out; + } + } + } + + /* allocate disk space for the inode extent. + */ + if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) + hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; + else + hint = addressPXD(&iagp->inoext[extno - 1]) + + lengthPXD(&iagp->inoext[extno - 1]) - 1; + + if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) + goto error_out; + + /* compute the inode number of the first inode within the + * extent. + */ + ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); + + /* initialize the inodes within the newly allocated extent a + * page at a time. + */ + for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { + /* get a buffer for this page of disk inodes. + */ + dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); + if (dmp == NULL) { + rc = -EIO; + goto error_out; + } + dp = (struct dinode *) dmp->data; + + /* initialize the inode number, mode, link count and + * inode extent address. + */ + for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { + dp->di_inostamp = cpu_to_le32(sbi->inostamp); + dp->di_number = cpu_to_le32(ino); + dp->di_fileset = cpu_to_le32(FILESYSTEM_I); + dp->di_mode = 0; + dp->di_nlink = 0; + PXDaddress(&(dp->di_ixpxd), blkno); + PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); + } + write_metapage(dmp); + } + + /* if this is the last free extent within the iag, remove the + * iag from the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + } else { + /* if the iag has all free extents (newly allocated iag), + * add the iag to the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = cpu_to_le32(fwd); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } + } + + /* if the iag has no free inodes, add the iag to the + * ag free inode list. + */ + if (iagp->nfreeinos == 0) { + if (freei >= 0) + ciagp->inofreeback = cpu_to_le32(iagno); + + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = cpu_to_le32(-1); + imap->im_agctl[agno].inofree = iagno; + } + + /* initialize the extent descriptor of the extent. */ + PXDlength(&iagp->inoext[extno], imap->im_nbperiext); + PXDaddress(&iagp->inoext[extno], blkno); + + /* initialize the working and persistent map of the extent. + * the working map will be initialized such that + * it indicates the first inode of the extent is allocated. + */ + iagp->wmap[extno] = cpu_to_le32(HIGHORDER); + iagp->pmap[extno] = 0; + + /* update the free inode and free extent summary maps + * for the extent to indicate the extent has free inodes + * and no longer represents a free extent. 
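The iag free lists manipulated above (inofreefwd/inofreeback and extfreefwd/extfreeback) are doubly linked lists threaded through iag numbers, with -1 as the nil value and the list head kept in im_agctl[agno]. Below is a small in-memory sketch of the unlink pattern used when an iag loses its last free extent or inode; field names are simplified and there is no disk I/O.

/* Toy model of the iag free lists (indices plus a -1 sentinel), not kernel code. */
#include <stdio.h>
#include <assert.h>

struct toy_iag {
	int freefwd;   /* next iag number on the list, -1 if none */
	int freeback;  /* previous iag number, -1 if none */
};

static struct toy_iag iags[8];
static int list_head = -1;            /* plays the role of im_agctl[agno] */

static void push_head(int iagno)
{
	iags[iagno].freefwd = list_head;
	iags[iagno].freeback = -1;
	if (list_head >= 0)
		iags[list_head].freeback = iagno;
	list_head = iagno;
}

static void unlink_iag(int iagno)
{
	int fwd = iags[iagno].freefwd, back = iags[iagno].freeback;
	if (fwd >= 0)
		iags[fwd].freeback = back;
	if (back >= 0)
		iags[back].freefwd = fwd;
	else
		list_head = fwd;        /* it was the head kept in the AG control */
	iags[iagno].freefwd = iags[iagno].freeback = -1;
}

int main(void)
{
	push_head(5);
	push_head(2);           /* list: 2 -> 5 */
	unlink_iag(2);          /* iag 2 used its last free extent */
	assert(list_head == 5 && iags[5].freeback == -1);
	printf("head is now iag %d\n", list_head);
	return 0;
}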
+ */ + sword = extno >> L2EXTSPERSUM; + mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); + iagp->extsmap[sword] |= cpu_to_le32(mask); + iagp->inosmap[sword] &= cpu_to_le32(~mask); + + /* update the free inode and free extent counts for the + * iag. + */ + le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, -1); + + /* update the free and backed inode counts for the ag. + */ + imap->im_agctl[agno].numfree += (INOSPEREXT - 1); + imap->im_agctl[agno].numinos += INOSPEREXT; + + /* update the free and backed inode counts for the inode map. + */ + atomic_add(INOSPEREXT - 1, &imap->im_numfree); + atomic_add(INOSPEREXT, &imap->im_numinos); + + /* write the iags. + */ + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + + return (0); + + error_out: + + /* release the iags. + */ + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + + return (rc); +} + + +/* + * NAME: diNewIAG(imap,iagnop,agno) + * + * FUNCTION: allocate a new iag for an allocation group. + * + * first tries to allocate the iag from the inode map + * iagfree list: + * if the list has free iags, the head of the list is removed + * and returned to satisfy the request. + * if the inode map's iag free list is empty, the inode map + * is extended to hold a new iag. this new iag is initialized + * and returned to satisfy the request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the + * newly allocated iag upon successful return. + * agno - allocation group number. + * bpp - Buffer pointer to be filled in with new IAG's buffer + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + * + * serialization: + * AG lock held on entry/exit; + * write lock on the map is held inside; + * read lock on the map is held on successful completion; + * + * note: new iag transaction: + * . synchronously write iag; + * . write log of xtree and inode of imap; + * . commit; + * . synchronous write of xtree (right to left, bottom to top); + * . at start of logredo(): init in-memory imap with one additional iag page; + * . at end of logredo(): re-read imap inode to determine + * new imap size; + */ +static int +diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) +{ + int rc; + int iagno, i, xlen; + struct inode *ipimap; + struct super_block *sb; + struct jfs_sb_info *sbi; + struct metapage *mp; + struct iag *iagp; + s64 xaddr = 0; + s64 blkno; + tid_t tid; + struct inode *iplist[1]; + + /* pick up pointers to the inode map and mount inodes */ + ipimap = imap->im_ipimap; + sb = ipimap->i_sb; + sbi = JFS_SBI(sb); + + /* acquire the free iag lock */ + IAGFREE_LOCK(imap); + + /* if there are any iags on the inode map free iag list, + * allocate the iag from the head of the list. + */ + if (imap->im_freeiag >= 0) { + /* pick up the iag number at the head of the list */ + iagno = imap->im_freeiag; + + /* determine the logical block number of the iag */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + } else { + /* no free iags. the inode map will have to be extented + * to include a new iag. 
+ */ + + /* acquire inode map lock */ + IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); + + if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { + IWRITE_UNLOCK(ipimap); + IAGFREE_UNLOCK(imap); + jfs_error(imap->im_ipimap->i_sb, + "diNewIAG: ipimap->i_size is wrong"); + return -EIO; + } + + + /* get the next available iag number */ + iagno = imap->im_nextiag; + + /* make sure that we have not exceeded the maximum inode + * number limit. + */ + if (iagno > (MAXIAGS - 1)) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -ENOSPC; + goto out; + } + + /* + * synchronously append new iag page. + */ + /* determine the logical address of iag page to append */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + + /* Allocate extent for new iag page */ + xlen = sbi->nbperpage; + if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* + * start transaction of update of the inode map + * addressing structure pointing to the new iag page; + */ + tid = txBegin(sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* update the inode map addressing structure to point to it */ + if ((rc = + xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* update the inode map's inode to reflect the extension */ + ipimap->i_size += PSIZE; + inode_add_bytes(ipimap, PSIZE); + + /* assign a buffer for the page */ + mp = get_metapage(ipimap, blkno, PSIZE, 0); + if (!mp) { + /* + * This is very unlikely since we just created the + * extent, but let's try to handle it correctly + */ + xtTruncate(tid, ipimap, ipimap->i_size - PSIZE, + COMMIT_PWMAP); + + txAbort(tid, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* init the iag */ + memset(iagp, 0, sizeof(struct iag)); + iagp->iagnum = cpu_to_le32(iagno); + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + iagp->iagfree = cpu_to_le32(-1); + iagp->nfreeinos = 0; + iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); + + /* initialize the free inode summary map (free extent + * summary map initialization handled by bzero). + */ + for (i = 0; i < SMAPSZ; i++) + iagp->inosmap[i] = cpu_to_le32(ONES); + + /* + * Write and sync the metapage + */ + flush_metapage(mp); + + /* + * txCommit(COMMIT_FORCE) will synchronously write address + * index pages and inode after commit in careful update order + * of address index pages (right to left, bottom up); + */ + iplist[0] = ipimap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + duplicateIXtree(sb, blkno, xlen, &xaddr); + + /* update the next available iag number */ + imap->im_nextiag += 1; + + /* Add the iag to the iag free list so we don't lose the iag + * if a failure happens now. + */ + imap->im_freeiag = iagno; + + /* Until we have logredo working, we want the imap inode & + * control page to be up to date. 
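When diNewIAG() has to grow the map (the extension path above), the new iag lands at a fixed logical position: page iagno + 1 of the imap file, since page 0 holds the dinomap control page, and the imap inode grows by exactly one PSIZE page. The sketch below only illustrates that geometry; it assumes the usual PSIZE = 4096 metadata page and, purely for illustration, 1 KiB filesystem blocks (l2nbperpage = 2). IAGTOLBLK mirrors the macro in jfs_imap.h further down.

/* Sketch of the imap geometry used by diNewIAG() (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define PSIZE 4096                            /* metadata page size */
/* iag page iagno lives one page past the control page */
#define IAGTOLBLK(iagno, l2nbperpg) (((iagno) + 1) << (l2nbperpg))

int main(void)
{
	int l2nbperpage = 2;          /* assumed: 1 KiB blocks, 4 per page */
	int iagno = 7;                /* next iag to be appended */

	int64_t blkno = IAGTOLBLK(iagno, l2nbperpage);
	int64_t offset = (int64_t)(iagno + 1) * PSIZE;  /* byte offset in imap */

	/* appending the page grows the imap inode by exactly one page */
	int64_t old_isize = offset;
	int64_t new_isize = old_isize + PSIZE;

	assert(blkno == (iagno + 1) * 4);
	assert(new_isize >> 12 == iagno + 2);  /* i_size >> L2PSIZE == nextiag + 1 */
	printf("iag %d -> logical block %lld, new imap size %lld\n",
	       iagno, (long long) blkno, (long long) new_isize);
	return 0;
}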
+ */ + diSync(ipimap); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + } + + /* obtain read lock on map */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* read the iag */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* remove the iag from the iag free list */ + imap->im_freeiag = le32_to_cpu(iagp->iagfree); + iagp->iagfree = cpu_to_le32(-1); + + /* set the return iag number and buffer pointer */ + *iagnop = iagno; + *mpp = mp; + + out: + /* release the iag free lock */ + IAGFREE_UNLOCK(imap); + + return (rc); +} + +/* + * NAME: diIAGRead() + * + * FUNCTION: get the buffer for the specified iag within a fileset + * or aggregate inode map. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful + * exit. + * + * SERIALIZATION: + * must have read lock on imap inode + * (When called by diExtendFS, the filesystem is quiesced, therefore + * the read lock is unnecessary.) + * + * RETURN VALUES: + * 0 - success. + * -EIO - i/o error. + */ +static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) +{ + struct inode *ipimap = imap->im_ipimap; + s64 blkno; + + /* compute the logical block number of the iag. */ + blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); + + /* read the iag. */ + *mpp = read_metapage(ipimap, blkno, PSIZE, 0); + if (*mpp == NULL) { + return -EIO; + } + + return (0); +} + +/* + * NAME: diFindFree() + * + * FUNCTION: find the first free bit in a word starting at + * the specified bit position. + * + * PARAMETERS: + * word - word to be examined. + * start - starting bit position. + * + * RETURN VALUES: + * bit position of first free bit in the word or 32 if + * no free bits were found. + */ +static int diFindFree(u32 word, int start) +{ + int bitno; + assert(start < 32); + /* scan the word for the first free bit. */ + for (word <<= start, bitno = start; bitno < 32; + bitno++, word <<= 1) { + if ((word & HIGHORDER) == 0) + break; + } + return (bitno); +} + +/* + * NAME: diUpdatePMap() + * + * FUNCTION: Update the persistent map in an IAG for the allocation or + * freeing of the specified inode. + * + * PRE CONDITIONS: Working map has already been updated for allocate. + * + * PARAMETERS: + * ipimap - Incore inode map inode + * inum - Number of inode to mark in permanent map + * is_free - If 'true' indicates inode should be marked freed, otherwise + * indicates inode should be marked allocated. + * + * RETURN VALUES: + * 0 for success + */ +int +diUpdatePMap(struct inode *ipimap, + unsigned long inum, bool is_free, struct tblock * tblk) +{ + int rc; + struct iag *iagp; + struct metapage *mp; + int iagno, ino, extno, bitno; + struct inomap *imap; + u32 mask; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + imap = JFS_IP(ipimap)->i_imap; + /* get the iag number containing the inode */ + iagno = INOTOIAG(inum); + /* make sure that the iag is contained within the map */ + if (iagno >= imap->im_nextiag) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: the iag is outside the map"); + return -EIO; + } + /* read the iag */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) + return (rc); + metapage_wait_for_io(mp); + iagp = (struct iag *) mp->data; + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. 
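diUpdatePMap(), whose body continues below, enforces the two-map protocol: an allocation sets the working-map bit first (in diAllocBit) and the persistent-map bit only at commit time, while a free clears the persistent bit at commit and leaves the working bit until the last reference is released. The following is a tiny standalone model of those states and sanity checks, using plain uint32_t words and an assumed HIGHORDER of 0x80000000; the kernel reports violations with jfs_error rather than asserts.

/* Toy model of JFS's working (wmap) vs persistent (pmap) inode bits. */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define HIGHORDER 0x80000000u

static uint32_t wmap, pmap;              /* one extent's worth of inodes */

static void alloc_working(int bitno)     /* diAllocBit stage */
{
	wmap |= HIGHORDER >> bitno;
}

static void commit_alloc(int bitno)      /* diUpdatePMap(..., is_free == false) */
{
	uint32_t mask = HIGHORDER >> bitno;
	assert(wmap & mask);             /* must already be in the working map */
	assert(!(pmap & mask));          /* must still be free persistently */
	pmap |= mask;
}

static void commit_free(int bitno)       /* diUpdatePMap(..., is_free == true) */
{
	uint32_t mask = HIGHORDER >> bitno;
	assert(wmap & mask);             /* still allocated in the working map */
	assert(pmap & mask);             /* and allocated persistently */
	pmap &= ~mask;                   /* wmap is cleared later, at release */
}

int main(void)
{
	alloc_working(9);
	commit_alloc(9);
	commit_free(9);
	assert((pmap & (HIGHORDER >> 9)) == 0 && (wmap & (HIGHORDER >> 9)));
	printf("pmap cleared, wmap still set until last release\n");
	return 0;
}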
+ */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + /* + * mark the inode free in persistent map: + */ + if (is_free) { + /* The inode should have been allocated both in working + * map and in persistent map; + * the inode will be freed from working map at the release + * of last reference release; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: inode %ld not marked as " + "allocated in wmap!", inum); + } + if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: inode %ld not marked as " + "allocated in pmap!", inum); + } + /* update the bitmap for the extent of the freed inode */ + iagp->pmap[extno] &= cpu_to_le32(~mask); + } + /* + * mark the inode allocated in persistent map: + */ + else { + /* The inode should be already allocated in the working map + * and should be free in persistent map; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "diUpdatePMap: the inode is not allocated in " + "the working map"); + return -EIO; + } + if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "diUpdatePMap: the inode is not free in the " + "persistent map"); + return -EIO; + } + /* update the bitmap for the extent of the allocated inode */ + iagp->pmap[extno] |= cpu_to_le32(mask); + } + /* + * update iag lsn + */ + lsn = tblk->lsn; + log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(difft, lsn, log); + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + /* move mp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + /* inherit younger/larger clsn */ + assert(mp->clsn); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + /* insert mp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + write_metapage(mp); + return (0); +} + +/* + * diExtendFS() + * + * function: update imap for extendfs(); + * + * note: AG size has been increased s.t. each k old contiguous AGs are + * coalesced into a new AG; + */ +int diExtendFS(struct inode *ipimap, struct inode *ipbmap) +{ + int rc, rcx = 0; + struct inomap *imap = JFS_IP(ipimap)->i_imap; + struct iag *iagp = NULL, *hiagp = NULL; + struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *bp, *hbp; + int i, n, head; + int numinos, xnuminos = 0, xnumfree = 0; + s64 agstart; + + jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", + imap->im_nextiag, atomic_read(&imap->im_numinos), + atomic_read(&imap->im_numfree)); + + /* + * reconstruct imap + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + + /* init per AG control information im_agctl[] */ + for (i = 0; i < MAXAG; i++) { + imap->im_agctl[i].inofree = -1; + imap->im_agctl[i].extfree = -1; + imap->im_agctl[i].numinos = 0; /* number of backed inodes */ + imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ + } + + /* + * process each iag page of the map. 
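The rebuild loop that follows recomputes each iag's allocation group as n = agstart >> db_agl2size, using the log2 of the new, larger AG size, so that k contiguous old AGs collapse into one new AG and agstart is re-aligned to the new AG boundary. A small numeric sketch, with block counts chosen only for illustration:

/* Sketch of the AG renumbering done by diExtendFS() (not kernel code). */
#include <stdio.h>
#include <assert.h>

int main(void)
{
	int old_l2agsize = 13;               /* old AG size: 8192 blocks */
	int x = 2;                           /* AG size grew by 2**x     */
	int new_l2agsize = old_l2agsize + x;

	/* an iag whose inode extents start in old AG 11 */
	long long agstart = 11LL << old_l2agsize;

	int old_ag = (int)(agstart >> old_l2agsize);   /* 11 */
	int new_ag = (int)(agstart >> new_l2agsize);   /* 11 >> 2 = 2 */

	assert(old_ag == 11 && new_ag == (11 >> x));
	/* agstart is then re-aligned to the start of the new AG */
	long long new_agstart = (long long) new_ag << new_l2agsize;
	printf("old AG %d -> new AG %d, agstart %lld\n",
	       old_ag, new_ag, new_agstart);
	return 0;
}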
+ * + * rebuild AG Free Inode List, AG Free Inode Extent List; + */ + for (i = 0; i < imap->im_nextiag; i++) { + if ((rc = diIAGRead(imap, i, &bp))) { + rcx = rc; + continue; + } + iagp = (struct iag *) bp->data; + if (le32_to_cpu(iagp->iagnum) != i) { + release_metapage(bp); + jfs_error(ipimap->i_sb, + "diExtendFs: unexpected value of iagnum"); + return -EIO; + } + + /* leave free iag in the free iag list */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + release_metapage(bp); + continue; + } + + agstart = le64_to_cpu(iagp->agstart); + n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); + + /* compute backed inodes */ + numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) + << L2INOSPEREXT; + if (numinos > 0) { + /* merge AG backed inodes */ + imap->im_agctl[n].numinos += numinos; + xnuminos += numinos; + } + + /* if any backed free inodes, insert at AG free inode list */ + if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { + if ((head = imap->im_agctl[n].inofree) == -1) { + iagp->inofreefwd = cpu_to_le32(-1); + iagp->inofreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->inofreeback = iagp->iagnum; + iagp->inofreefwd = cpu_to_le32(head); + iagp->inofreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].inofree = + le32_to_cpu(iagp->iagnum); + + /* merge AG backed free inodes */ + imap->im_agctl[n].numfree += + le32_to_cpu(iagp->nfreeinos); + xnumfree += le32_to_cpu(iagp->nfreeinos); + } + + /* if any free extents, insert at AG free extent list */ + if (le32_to_cpu(iagp->nfreeexts) > 0) { + if ((head = imap->im_agctl[n].extfree) == -1) { + iagp->extfreefwd = cpu_to_le32(-1); + iagp->extfreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->extfreeback = iagp->iagnum; + iagp->extfreefwd = cpu_to_le32(head); + iagp->extfreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].extfree = + le32_to_cpu(iagp->iagnum); + } + + nextiag: + write_metapage(bp); + } + + if (xnuminos != atomic_read(&imap->im_numinos) || + xnumfree != atomic_read(&imap->im_numfree)) { + jfs_error(ipimap->i_sb, + "diExtendFs: numinos or numfree incorrect"); + return -EIO; + } + + return rcx; +} + + +/* + * duplicateIXtree() + * + * serialization: IWRITE_LOCK held on entry/exit + * + * note: shadow page with regular inode (rel.2); + */ +static void duplicateIXtree(struct super_block *sb, s64 blkno, + int xlen, s64 *xaddr) +{ + struct jfs_superblock *j_sb; + struct buffer_head *bh; + struct inode *ip; + tid_t tid; + + /* if AIT2 ipmap2 is bad, do not try to update it */ + if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ + return; + ip = diReadSpecial(sb, FILESYSTEM_I, 1); + if (ip == NULL) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + if (readSuper(sb, &bh)) + return; + j_sb = (struct jfs_superblock *)bh->b_data; + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + return; + } + + /* start transaction */ + tid = txBegin(sb, COMMIT_FORCE); + /* update the inode map addressing structure to point to it */ + if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + txAbort(tid, 1); + goto cleanup; + + } + /* update the inode map's inode to reflect the extension */ + ip->i_size += PSIZE; + inode_add_bytes(ip, PSIZE); + txCommit(tid, 1, &ip, 
COMMIT_FORCE); + cleanup: + txEnd(tid); + diFreeSpecial(ip); +} + +/* + * NAME: copy_from_dinode() + * + * FUNCTION: Copies inode info from disk inode to in-memory inode + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + */ +static int copy_from_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); + jfs_set_inode_flags(ip); + + ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; + if (sbi->umask != -1) { + ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask); + /* For directories, add x permission if r is allowed by umask */ + if (S_ISDIR(ip->i_mode)) { + if (ip->i_mode & 0400) + ip->i_mode |= 0100; + if (ip->i_mode & 0040) + ip->i_mode |= 0010; + if (ip->i_mode & 0004) + ip->i_mode |= 0001; + } + } + ip->i_nlink = le32_to_cpu(dip->di_nlink); + + jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); + if (sbi->uid == -1) + ip->i_uid = jfs_ip->saved_uid; + else { + ip->i_uid = sbi->uid; + } + + jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); + if (sbi->gid == -1) + ip->i_gid = jfs_ip->saved_gid; + else { + ip->i_gid = sbi->gid; + } + + ip->i_size = le64_to_cpu(dip->di_size); + ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); + ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); + ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); + ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); + ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); + ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); + ip->i_generation = le32_to_cpu(dip->di_gen); + + jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ + jfs_ip->acl = dip->di_acl; /* as are dxd's */ + jfs_ip->ea = dip->di_ea; + jfs_ip->next_index = le32_to_cpu(dip->di_next_index); + jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); + jfs_ip->acltype = le32_to_cpu(dip->di_acltype); + + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { + jfs_ip->dev = le32_to_cpu(dip->di_rdev); + ip->i_rdev = new_decode_dev(jfs_ip->dev); + } + + if (S_ISDIR(ip->i_mode)) { + memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); + } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { + memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); + } else + memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); + + /* Zero the in-memory-only stuff */ + jfs_ip->cflag = 0; + jfs_ip->btindex = 0; + jfs_ip->btorder = 0; + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + jfs_ip->atlhead = 0; + jfs_ip->atltail = 0; + jfs_ip->xtlid = 0; + return (0); +} + +/* + * NAME: copy_to_dinode() + * + * FUNCTION: Copies inode info from in-memory inode to disk inode + */ +static void copy_to_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + dip->di_fileset = cpu_to_le32(jfs_ip->fileset); + dip->di_inostamp = cpu_to_le32(sbi->inostamp); + dip->di_number = cpu_to_le32(ip->i_ino); + dip->di_gen = cpu_to_le32(ip->i_generation); + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); + if (sbi->uid == -1) + dip->di_uid = cpu_to_le32(ip->i_uid); + else + dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); + if (sbi->gid == -1) + dip->di_gid = cpu_to_le32(ip->i_gid); + else + dip->di_gid = 
cpu_to_le32(jfs_ip->saved_gid); + jfs_get_inode_flags(jfs_ip); + /* + * mode2 is only needed for storing the higher order bits. + * Trust i_mode for the lower order ones + */ + if (sbi->umask == -1) + dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | + ip->i_mode); + else /* Leave the original permissions alone */ + dip->di_mode = cpu_to_le32(jfs_ip->mode2); + + dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); + dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); + dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); + dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); + dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); + dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); + dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ + dip->di_acl = jfs_ip->acl; /* as are dxd's */ + dip->di_ea = jfs_ip->ea; + dip->di_next_index = cpu_to_le32(jfs_ip->next_index); + dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); + dip->di_otime.tv_nsec = 0; + dip->di_acltype = cpu_to_le32(jfs_ip->acltype); + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) + dip->di_rdev = cpu_to_le32(jfs_ip->dev); +} diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h new file mode 100644 index 00000000..610a0e9d --- /dev/null +++ b/fs/jfs/jfs_imap.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_IMAP +#define _H_JFS_IMAP + +#include "jfs_txnmgr.h" + +/* + * jfs_imap.h: disk inode manager + */ + +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERSUM 32 /* number of extents per summary map entry */ +#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ +#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ + +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ + +/* convert inode number to iag number */ +#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) + +/* convert iag number to logical block number of the iag page */ +#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg)) + +/* get the starting block number of the 4K page of an inode extent + * that contains ino. + */ +#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ + ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg))) + +/* + * inode allocation map: + * + * inode allocation map consists of + * . the inode map control page and + * . inode allocation group pages (per 4096 inodes) + * which are addressed by standard JFS xtree. 
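The struct iag defined just below is sized to exactly one 4 KiB metadata page: a 72-byte header, 1976 bytes of padding to the 2048-byte mark, then the 512-byte working and persistent allocation maps and the 1024-byte inode extent array. A quick standalone check of that arithmetic, assuming 4-byte __le32 words and 8-byte pxd_t descriptors as the field comments state:

/* Arithmetic check of the struct iag layout (values from jfs_imap.h). */
#include <stdio.h>
#include <assert.h>

#define EXTSPERIAG 128
#define SMAPSZ     4

int main(void)
{
	int header = 8                 /* agstart                      */
		   + 6 * 4             /* iagnum .. iagfree            */
		   + SMAPSZ * 4 * 2    /* inosmap + extsmap            */
		   + 2 * 4;            /* nfreeinos + nfreeexts        */
	int pad  = 1976;
	int wmap = EXTSPERIAG * 4;     /* working allocation map       */
	int pmap = EXTSPERIAG * 4;     /* persistent allocation map    */
	int ext  = EXTSPERIAG * 8;     /* pxd_t inode extent addresses */

	assert(header == 72);
	assert(header + pad == 2048);
	assert(header + pad + wmap + pmap + ext == 4096);
	printf("struct iag spans %d bytes\n", header + pad + wmap + pmap + ext);
	return 0;
}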
+ */ +/* + * inode allocation group page (per 4096 inodes of an AG) + */ +struct iag { + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ + + /* summary map: 1 bit per inode extent */ + __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. + */ + __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ + /* (72) */ + u8 pad[1976]; /* 1976: pad to 2048 bytes */ + /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ + pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ +}; /* (4096) */ + +/* + * per AG control information (in inode map control page) + */ +struct iagctl_disk { + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ +}; /* (16) */ + +struct iagctl { + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ +}; + +/* + * per fileset/aggregate inode map control page + */ +struct dinomap_disk { + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_numfree; /* 4: num of free backed inodes */ + __le32 in_nbperiext; /* 4: num of blocks per inode extent */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ + struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ +}; /* (4096) */ + +struct dinomap { + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ + int in_nbperiext; /* num of blocks per inode extent */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ + struct iagctl in_agctl[MAXAG]; /* AG control information */ +}; + +/* + * In-core inode map control page + */ +struct inomap { + struct dinomap im_imap; /* 4096: inode allocation control */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + u32 *im_DBGdimap; + atomic_t im_numinos; /* num of backed inodes */ + atomic_t im_numfree; /* num of free backed inodes */ +}; + +#define im_freeiag im_imap.in_freeiag +#define im_nextiag im_imap.in_nextiag +#define im_agctl 
im_imap.in_agctl +#define im_nbperiext im_imap.in_nbperiext +#define im_l2nbperiext im_imap.in_l2nbperiext + +/* for standalone testdriver + */ +#define im_diskblock im_imap.in_diskblock +#define im_maxag im_imap.in_maxag + +extern int diFree(struct inode *); +extern int diAlloc(struct inode *, bool, struct inode *); +extern int diSync(struct inode *); +/* external references */ +extern int diUpdatePMap(struct inode *ipimap, unsigned long inum, + bool is_free, struct tblock * tblk); +extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap); +extern int diMount(struct inode *); +extern int diUnmount(struct inode *, int); +extern int diRead(struct inode *); +extern struct inode *diReadSpecial(struct super_block *, ino_t, int); +extern void diWriteSpecial(struct inode *, int); +extern void diFreeSpecial(struct inode *); +extern int diWrite(tid_t tid, struct inode *); +#endif /* _H_JFS_IMAP */ diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h new file mode 100644 index 00000000..584a4a1a --- /dev/null +++ b/fs/jfs/jfs_incore.h @@ -0,0 +1,224 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INCORE +#define _H_JFS_INCORE + +#include +#include +#include +#include +#include "jfs_types.h" +#include "jfs_xtree.h" +#include "jfs_dtree.h" + +/* + * JFS magic number + */ +#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ + +/* + * JFS-private inode information + */ +struct jfs_inode_info { + int fileset; /* fileset number (always 16)*/ + uint mode2; /* jfs-specific mode */ + uint saved_uid; /* saved for uid mount option */ + uint saved_gid; /* saved for gid mount option */ + pxd_t ixpxd; /* inode extent descriptor */ + dxd_t acl; /* dxd describing acl */ + dxd_t ea; /* dxd describing ea */ + time_t otime; /* time created */ + uint next_index; /* next available directory entry index */ + int acltype; /* Type of ACL */ + short btorder; /* access order */ + short btindex; /* btpage entry index*/ + struct inode *ipimap; /* inode map */ + unsigned long cflag; /* commit flags */ + u64 agstart; /* agstart of the containing IAG */ + u16 bxflag; /* xflag of pseudo buffer? */ + unchar pad; + signed char active_ag; /* ag currently allocating from */ + lid_t blid; /* lid of pseudo buffer? */ + lid_t atlhead; /* anonymous tlock list head */ + lid_t atltail; /* anonymous tlock list tail */ + spinlock_t ag_lock; /* protects active_ag */ + struct list_head anon_inode_list; /* inodes having anonymous txns */ + /* + * rdwrlock serializes xtree between reads & writes and synchronizes + * changes to special inodes. It's use would be redundant on + * directories since the i_mutex taken in the VFS is sufficient. 
+ */ + struct rw_semaphore rdwrlock; + /* + * commit_mutex serializes transaction processing on an inode. + * It must be taken after beginning a transaction (txBegin), since + * dirty inodes may be committed while a new transaction on the + * inode is blocked in txBegin or TxBeginAnon + */ + struct mutex commit_mutex; + /* xattr_sem allows us to access the xattrs without taking i_mutex */ + struct rw_semaphore xattr_sem; + lid_t xtlid; /* lid of xtree lock on directory */ + union { + struct { + xtpage_t _xtroot; /* 288: xtree root */ + struct inomap *_imap; /* 4: inode map header */ + } file; + struct { + struct dir_table_slot _table[12]; /* 96: dir index */ + dtroot_t _dtroot; /* 288: dtree root */ + } dir; + struct { + unchar _unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + unchar _inline[128]; /* 128: inline symlink */ + /* _inline_ea may overlay the last part of + * file._xtroot if maxentry = XTROOTINITSLOT + */ + unchar _inline_ea[128]; /* 128: inline extended attr */ + } link; + } u; + u32 dev; /* will die when we get wide dev_t */ + struct inode vfs_inode; +}; +#define i_xtroot u.file._xtroot +#define i_imap u.file._imap +#define i_dirtable u.dir._table +#define i_dtroot u.dir._dtroot +#define i_inline u.link._inline +#define i_inline_ea u.link._inline_ea + +#define IREAD_LOCK(ip, subclass) \ + down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip, subclass) \ + down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) + +/* + * cflag + */ +enum cflags { + COMMIT_Nolink, /* inode committed with zero link count */ + COMMIT_Inlineea, /* commit inode inline EA */ + COMMIT_Freewmap, /* free WMAP at iClose() */ + COMMIT_Dirty, /* Inode is really dirty */ + COMMIT_Dirtable, /* commit changes to di_dirtable */ + COMMIT_Stale, /* data extent is no longer valid */ + COMMIT_Synclist, /* metadata pages on group commit synclist */ +}; + +/* + * commit_mutex nesting subclasses: + */ +enum commit_mutex_class +{ + COMMIT_MUTEX_PARENT, + COMMIT_MUTEX_CHILD, + COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ + COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ +}; + +/* + * rdwrlock subclasses: + * The dmap inode may be locked while a normal inode or the imap inode are + * locked. + */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + +#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) +#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_and_clear_cflag(flag, ip) \ + test_and_clear_bit(flag, &(JFS_IP(ip)->cflag)) +/* + * JFS-private superblock information. 
+ */ +struct jfs_sb_info { + struct super_block *sb; /* Point back to vfs super block */ + unsigned long mntflag; /* aggregate attributes */ + struct inode *ipbmap; /* block map inode */ + struct inode *ipaimap; /* aggregate inode map inode */ + struct inode *ipaimap2; /* secondary aimap inode */ + struct inode *ipimap; /* aggregate inode map inode */ + struct jfs_log *log; /* log */ + struct list_head log_list; /* volumes associated with a journal */ + short bsize; /* logical block size */ + short l2bsize; /* log2 logical block size */ + short nbperpage; /* blocks per page */ + short l2nbperpage; /* log2 blocks per page */ + short l2niperblk; /* log2 inodes per page */ + dev_t logdev; /* external log device */ + uint aggregate; /* volume identifier in log record */ + pxd_t logpxd; /* pxd describing log */ + pxd_t fsckpxd; /* pxd describing fsck wkspc */ + pxd_t ait2; /* pxd describing AIT copy */ + char uuid[16]; /* 128-bit uuid for volume */ + char loguuid[16]; /* 128-bit uuid for log */ + /* + * commit_state is used for synchronization of the jfs_commit + * threads. It is protected by LAZY_LOCK(). + */ + int commit_state; /* commit state */ + /* Formerly in ipimap */ + uint gengen; /* inode generation generator*/ + uint inostamp; /* shows inode belongs to fileset*/ + + /* Formerly in ipbmap */ + struct bmap *bmap; /* incore bmap descriptor */ + struct nls_table *nls_tab; /* current codepage */ + struct inode *direct_inode; /* metadata inode */ + uint state; /* mount/recovery state */ + unsigned long flag; /* mount time flags */ + uint p_state; /* state prior to going no integrity */ + uint uid; /* uid to override on-disk uid */ + uint gid; /* gid to override on-disk gid */ + uint umask; /* umask to override on-disk umask */ +}; + +/* jfs_sb_info commit_state */ +#define IN_LAZYCOMMIT 1 + +static inline struct jfs_inode_info *JFS_IP(struct inode *inode) +{ + return list_entry(inode, struct jfs_inode_info, vfs_inode); +} + +static inline int jfs_dirtable_inline(struct inode *inode) +{ + return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)); +} + +static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int isReadOnly(struct inode *inode) +{ + if (JFS_SBI(inode->i_sb)->log) + return 0; + return 1; +} +#endif /* _H_JFS_INCORE */ diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c new file mode 100644 index 00000000..2686531e --- /dev/null +++ b/fs/jfs/jfs_inode.c @@ -0,0 +1,166 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_dinode.h" +#include "jfs_debug.h" + + +void jfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = JFS_IP(inode)->mode2; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | + S_NOATIME | S_DIRSYNC | S_SYNC); + + if (flags & JFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & JFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & JFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & JFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + if (flags & JFS_SYNC_FL) + inode->i_flags |= S_SYNC; +} + +void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) +{ + unsigned int flags = jfs_ip->vfs_inode.i_flags; + + jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | + JFS_DIRSYNC_FL | JFS_SYNC_FL); + if (flags & S_IMMUTABLE) + jfs_ip->mode2 |= JFS_IMMUTABLE_FL; + if (flags & S_APPEND) + jfs_ip->mode2 |= JFS_APPEND_FL; + if (flags & S_NOATIME) + jfs_ip->mode2 |= JFS_NOATIME_FL; + if (flags & S_DIRSYNC) + jfs_ip->mode2 |= JFS_DIRSYNC_FL; + if (flags & S_SYNC) + jfs_ip->mode2 |= JFS_SYNC_FL; +} + +/* + * NAME: ialloc() + * + * FUNCTION: Allocate a new inode + * + */ +struct inode *ialloc(struct inode *parent, umode_t mode) +{ + struct super_block *sb = parent->i_sb; + struct inode *inode; + struct jfs_inode_info *jfs_inode; + int rc; + + inode = new_inode(sb); + if (!inode) { + jfs_warn("ialloc: new_inode returned NULL!"); + rc = -ENOMEM; + goto fail; + } + + jfs_inode = JFS_IP(inode); + + rc = diAlloc(parent, S_ISDIR(mode), inode); + if (rc) { + jfs_warn("ialloc: diAlloc returned %d!", rc); + if (rc == -EIO) + make_bad_inode(inode); + goto fail_put; + } + + if (insert_inode_locked(inode) < 0) { + rc = -EINVAL; + goto fail_unlock; + } + + inode_init_owner(inode, parent, mode); + /* + * New inodes need to save sane values on disk when + * uid & gid mount options are used + */ + jfs_inode->saved_uid = inode->i_uid; + jfs_inode->saved_gid = inode->i_gid; + + /* + * Allocate inode to quota. 
+ */ + dquot_initialize(inode); + rc = dquot_alloc_inode(inode); + if (rc) + goto fail_drop; + + /* inherit flags from parent */ + jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; + + if (S_ISDIR(mode)) { + jfs_inode->mode2 |= IDIRECTORY; + jfs_inode->mode2 &= ~JFS_DIRSYNC_FL; + } + else { + jfs_inode->mode2 |= INLINEEA | ISPARSE; + if (S_ISLNK(mode)) + jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); + } + jfs_inode->mode2 |= inode->i_mode; + + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + jfs_inode->otime = inode->i_ctime.tv_sec; + inode->i_generation = JFS_SBI(sb)->gengen++; + + jfs_inode->cflag = 0; + + /* Zero remaining fields */ + memset(&jfs_inode->acl, 0, sizeof(dxd_t)); + memset(&jfs_inode->ea, 0, sizeof(dxd_t)); + jfs_inode->next_index = 0; + jfs_inode->acltype = 0; + jfs_inode->btorder = 0; + jfs_inode->btindex = 0; + jfs_inode->bxflag = 0; + jfs_inode->blid = 0; + jfs_inode->atlhead = 0; + jfs_inode->atltail = 0; + jfs_inode->xtlid = 0; + jfs_set_inode_flags(inode); + + jfs_info("ialloc returns inode = 0x%p\n", inode); + + return inode; + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; +fail_unlock: + inode->i_nlink = 0; + unlock_new_inode(inode); +fail_put: + iput(inode); +fail: + return ERR_PTR(rc); +} diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h new file mode 100644 index 00000000..ec2fb8b9 --- /dev/null +++ b/fs/jfs/jfs_inode.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INODE +#define _H_JFS_INODE + +struct fid; + +extern struct inode *ialloc(struct inode *, umode_t); +extern int jfs_fsync(struct file *, int); +extern long jfs_ioctl(struct file *, unsigned int, unsigned long); +extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); +extern struct inode *jfs_iget(struct super_block *, unsigned long); +extern int jfs_commit_inode(struct inode *, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); +extern void jfs_evict_inode(struct inode *); +extern void jfs_dirty_inode(struct inode *, int); +extern void jfs_truncate(struct inode *); +extern void jfs_truncate_nolock(struct inode *, loff_t); +extern void jfs_free_zero_link(struct inode *); +extern struct dentry *jfs_get_parent(struct dentry *dentry); +extern void jfs_get_inode_flags(struct jfs_inode_info *); +extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern void jfs_set_inode_flags(struct inode *); +extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); + +extern const struct address_space_operations jfs_aops; +extern const struct inode_operations jfs_dir_inode_operations; +extern const struct file_operations jfs_dir_operations; +extern const struct inode_operations jfs_file_inode_operations; +extern const struct file_operations jfs_file_operations; +extern const struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_fast_symlink_inode_operations; +extern const struct dentry_operations jfs_ci_dentry_operations; +#endif /* _H_JFS_INODE */ diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h new file mode 100644 index 00000000..ecf04882 --- /dev/null +++ b/fs/jfs/jfs_lock.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOCK +#define _H_JFS_LOCK + +#include +#include +#include + +/* + * jfs_lock.h + */ + +/* + * Conditional sleep where condition is protected by spinlock + * + * lock_cmd and unlock_cmd take and release the spinlock + */ +#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \ +do { \ + DECLARE_WAITQUEUE(__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE);\ + if (cond) \ + break; \ + unlock_cmd; \ + io_schedule(); \ + lock_cmd; \ + } \ + __set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#endif /* _H_JFS_LOCK */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c new file mode 100644 index 00000000..583636f7 --- /dev/null +++ b/fs/jfs/jfs_logmgr.c @@ -0,0 +1,2529 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_logmgr.c: log manager + * + * for related information, see transaction manager (jfs_txnmgr.c), and + * recovery manager (jfs_logredo.c). + * + * note: for detail, RTFS. + * + * log buffer manager: + * special purpose buffer manager supporting log i/o requirements. + * per log serial pageout of logpage + * queuing i/o requests and redrive i/o at iodone + * maintain current logpage buffer + * no caching since append only + * appropriate jfs buffer cache buffers as needed + * + * group commit: + * transactions which wrote COMMIT records in the same in-memory + * log page during the pageout of previous/current log page(s) are + * committed together by the pageout of the page. + * + * TBD lazy commit: + * transactions are committed asynchronously when the log page + * containing it COMMIT is paged out when it becomes full; + * + * serialization: + * . a per log lock serialize log write. + * . a per log lock serialize group commit. + * . a per log lock serialize log open/close; + * + * TBD log integrity: + * careful-write (ping-pong) of last logpage to recover from crash + * in overwrite. + * detection of split (out-of-order) write of physical sectors + * of last logpage via timestamp at end of each sector + * with its mirror data array at trailer). + * + * alternatives: + * lsn - 64-bit monotonically increasing integer vs + * 32-bit lspn and page eor. 
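+ *
+ * as implemented here, an lsn encodes the log page number and the
+ * byte offset (eor) within that page; see lmWriteRecord():
+ * lsn = (log->page << L2LOGPSIZE) + dstoffset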
+ */ + +#include +#include +#include +#include +#include +#include /* for sync_blockdev() */ +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + + +/* + * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread) + */ +static struct lbuf *log_redrive_list; +static DEFINE_SPINLOCK(log_redrive_lock); + + +/* + * log read/write serialization (per log) + */ +#define LOG_LOCK_INIT(log) mutex_init(&(log)->loglock) +#define LOG_LOCK(log) mutex_lock(&((log)->loglock)) +#define LOG_UNLOCK(log) mutex_unlock(&((log)->loglock)) + + +/* + * log group commit serialization (per log) + */ + +#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) +#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) +#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) +#define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) + +/* + * log sync serialization (per log) + */ +#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/4) +/* +#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/2) +*/ + + +/* + * log buffer cache synchronization + */ +static DEFINE_SPINLOCK(jfsLCacheLock); + +#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) +#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) + +/* + * See __SLEEP_COND in jfs_locks.h + */ +#define LCACHE_SLEEP_COND(wq, cond, flags) \ +do { \ + if (cond) \ + break; \ + __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ +} while (0) + +#define LCACHE_WAKEUP(event) wake_up(event) + + +/* + * lbuf buffer cache (lCache) control + */ +/* log buffer manager pageout control (cumulative, inclusive) */ +#define lbmREAD 0x0001 +#define lbmWRITE 0x0002 /* enqueue at tail of write queue; + * init pageout if at head of queue; + */ +#define lbmRELEASE 0x0004 /* remove from write queue + * at completion of pageout; + * do not free/recycle it yet: + * caller will free it; + */ +#define lbmSYNC 0x0008 /* do not return to freelist + * when removed from write queue; + */ +#define lbmFREE 0x0010 /* return to freelist + * at completion of pageout; + * the buffer may be recycled; + */ +#define lbmDONE 0x0020 +#define lbmERROR 0x0040 +#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing + * of log page + */ +#define lbmDIRECT 0x0100 + +/* + * Global list of active external journals + */ +static LIST_HEAD(jfs_external_logs); +static struct jfs_log *dummy_log = NULL; +static DEFINE_MUTEX(jfs_log_mutex); + +/* + * forward references + */ +static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, + struct lrd * lrd, struct tlock * tlck); + +static int lmNextPage(struct jfs_log * log); +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate); + +static int open_inline_log(struct super_block *sb); +static int open_dummy_log(struct super_block *sb); +static int lbmLogInit(struct jfs_log * log); +static void lbmLogShutdown(struct jfs_log * log); +static struct lbuf *lbmAllocate(struct jfs_log * log, int); +static void lbmFree(struct lbuf * bp); +static void lbmfree(struct lbuf * bp); +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, 
int flag); +static int lbmIOWait(struct lbuf * bp, int flag); +static bio_end_io_t lbmIODone; +static void lbmStartIO(struct lbuf * bp); +static void lmGCwrite(struct jfs_log * log, int cant_block); +static int lmLogSync(struct jfs_log * log, int hard_sync); + + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct lmStat { + uint commit; /* # of commit */ + uint pagedone; /* # of page written */ + uint submitted; /* # of pages submitted */ + uint full_page; /* # of full pages submitted */ + uint partial_page; /* # of partial pages submitted */ +} lmStat; +#endif + +static void write_special_inodes(struct jfs_log *log, + int (*writer)(struct address_space *)) +{ + struct jfs_sb_info *sbi; + + list_for_each_entry(sbi, &log->sb_list, log_list) { + writer(sbi->ipbmap->i_mapping); + writer(sbi->ipimap->i_mapping); + writer(sbi->direct_inode->i_mapping); + } +} + +/* + * NAME: lmLog() + * + * FUNCTION: write a log record; + * + * PARAMETER: + * + * RETURN: lsn - offset to the next log record to write (end-of-log); + * -1 - error; + * + * note: todo: log error handler + */ +int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn; + int diffp, difft; + struct metapage *mp = NULL; + unsigned long flags; + + jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", + log, tblk, lrd, tlck); + + LOG_LOCK(log); + + /* log by (out-of-transaction) JFS ? */ + if (tblk == NULL) + goto writeRecord; + + /* log from page ? */ + if (tlck == NULL || + tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) + goto writeRecord; + + /* + * initialize/update page/transaction recovery lsn + */ + lsn = log->lsn; + + LOGSYNC_LOCK(log, flags); + + /* + * initialize page lsn if first log write of the page + */ + if (mp->lsn == 0) { + mp->log = log; + mp->lsn = lsn; + log->count++; + + /* insert page at tail of logsynclist */ + list_add_tail(&mp->synclist, &log->synclist); + } + + /* + * initialize/update lsn of tblock of the page + * + * transaction inherits oldest lsn of pages associated + * with allocation/deallocation of resources (their + * log records are used to reconstruct allocation map + * at recovery time: inode for inode allocation map, + * B+-tree index of extent descriptors for block + * allocation map); + * allocation map pages inherit transaction lsn at + * commit time to allow forwarding log syncpt past log + * records associated with allocation/deallocation of + * resources only after persistent map of these map pages + * have been updated and propagated to home. 
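+ *
+ * in short: tblk->lsn tracks the oldest (smallest) lsn among the
+ * pages logged by the transaction, so the log sync point (see
+ * lmLogSync()) cannot advance past records recovery still needs.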
+ */ + /* + * initialize transaction lsn: + */ + if (tblk->lsn == 0) { + /* inherit lsn of its first page logged */ + tblk->lsn = mp->lsn; + log->count++; + + /* insert tblock after the page on logsynclist */ + list_add(&tblk->synclist, &mp->synclist); + } + /* + * update transaction lsn: + */ + else { + /* inherit oldest/smallest lsn of page */ + logdiff(diffp, mp->lsn, log); + logdiff(difft, tblk->lsn, log); + if (diffp < difft) { + /* update tblock lsn with page lsn */ + tblk->lsn = mp->lsn; + + /* move tblock after page on logsynclist */ + list_move(&tblk->synclist, &mp->synclist); + } + } + + LOGSYNC_UNLOCK(log, flags); + + /* + * write the log record + */ + writeRecord: + lsn = lmWriteRecord(log, tblk, lrd, tlck); + + /* + * forward log syncpt if log reached next syncpt trigger + */ + logdiff(diffp, lsn, log); + if (diffp >= log->nextsync) + lsn = lmLogSync(log, 0); + + /* update end-of-log lsn */ + log->lsn = lsn; + + LOG_UNLOCK(log); + + /* return end-of-log address */ + return lsn; +} + +/* + * NAME: lmWriteRecord() + * + * FUNCTION: move the log record to current log page + * + * PARAMETER: cd - commit descriptor + * + * RETURN: end-of-log address + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int +lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn = 0; /* end-of-log address */ + struct lbuf *bp; /* dst log page buffer */ + struct logpage *lp; /* dst log page */ + caddr_t dst; /* destination address in log page */ + int dstoffset; /* end-of-log offset in log page */ + int freespace; /* free space in log page */ + caddr_t p; /* src meta-data page */ + caddr_t src; + int srclen; + int nbytes; /* number of bytes to move */ + int i; + int len; + struct linelock *linelock; + struct lv *lv; + struct lvd *lvd; + int l2linesize; + + len = 0; + + /* retrieve destination log page to write */ + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = log->eor; + + /* any log data to write ? */ + if (tlck == NULL) + goto moveLrd; + + /* + * move log record data + */ + /* retrieve source meta-data page to log */ + if (tlck->flag & tlckPAGELOCK) { + p = (caddr_t) (tlck->mp->data); + linelock = (struct linelock *) & tlck->lock; + } + /* retrieve source in-memory inode to log */ + else if (tlck->flag & tlckINODELOCK) { + if (tlck->type & tlckDTREE) + p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; + else + p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; + linelock = (struct linelock *) & tlck->lock; + } +#ifdef _JFS_WIP + else if (tlck->flag & tlckINLINELOCK) { + + inlinelock = (struct inlinelock *) & tlck; + p = (caddr_t) & inlinelock->pxd; + linelock = (struct linelock *) & tlck; + } +#endif /* _JFS_WIP */ + else { + jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); + return 0; /* Probably should trap */ + } + l2linesize = linelock->l2linesize; + + moveData: + ASSERT(linelock->index <= linelock->maxcnt); + + lv = linelock->lv; + for (i = 0; i < linelock->index; i++, lv++) { + if (lv->length == 0) + continue; + + /* is page full ? 
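+ * (usable space in a log page ends at LOGPSIZE - LOGPTLRSIZE; once
+ * dstoffset reaches it, the record continues on the page allocated
+ * by lmNextPage())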
*/ + if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { + /* page become full: move on to next page */ + lmNextPage(log); + + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + } + + /* + * move log vector data + */ + src = (u8 *) p + (lv->offset << l2linesize); + srclen = lv->length << l2linesize; + len += srclen; + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + dstoffset += nbytes; + + /* is page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + break; + + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + + srclen -= nbytes; + src += nbytes; + } + + /* + * move log vector descriptor + */ + len += 4; + lvd = (struct lvd *) ((caddr_t) lp + dstoffset); + lvd->offset = cpu_to_le16(lv->offset); + lvd->length = cpu_to_le16(lv->length); + dstoffset += 4; + jfs_info("lmWriteRecord: lv offset:%d length:%d", + lv->offset, lv->length); + } + + if ((i = linelock->next)) { + linelock = (struct linelock *) lid_to_tlock(i); + goto moveData; + } + + /* + * move log record descriptor + */ + moveLrd: + lrd->length = cpu_to_le16(len); + + src = (caddr_t) lrd; + srclen = LOGRDSIZE; + + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + + dstoffset += nbytes; + srclen -= nbytes; + + /* are there more to move than freespace of page ? */ + if (srclen) + goto pageFull; + + /* + * end of log record descriptor + */ + + /* update last log record eor */ + log->eor = dstoffset; + bp->l_eor = dstoffset; + lsn = (log->page << L2LOGPSIZE) + dstoffset; + + if (lrd->type & cpu_to_le16(LOG_COMMIT)) { + tblk->clsn = lsn; + jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, + bp->l_eor); + + INCREMENT(lmStat.commit); /* # of commit */ + + /* + * enqueue tblock for group commit: + * + * enqueue tblock of non-trivial/synchronous COMMIT + * at tail of group commit queue + * (trivial/asynchronous COMMITs are ignored by + * group commit.) + */ + LOGGC_LOCK(log); + + /* init tblock gc state */ + tblk->flag = tblkGC_QUEUE; + tblk->bp = log->bp; + tblk->pn = log->page; + tblk->eor = log->eor; + + /* enqueue transaction to commit queue */ + list_add_tail(&tblk->cqueue, &log->cqueue); + + LOGGC_UNLOCK(log); + } + + jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", + le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); + + /* page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + return lsn; + + pageFull: + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + src += nbytes; + } + + return lsn; +} + + +/* + * NAME: lmNextPage() + * + * FUNCTION: write current page and allocate next page. 
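+ *
+ * if the full page is bound to an outstanding commit (tail tblk of
+ * the commit queue) it is handed to group commit; otherwise it is
+ * written out with lbmWRITE | lbmRELEASE | lbmFREE. data pages wrap
+ * around to page 2 (page 0 is reserved, page 1 is the superblock).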
+ * + * PARAMETER: log + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmNextPage(struct jfs_log * log) +{ + struct logpage *lp; + int lspn; /* log sequence page number */ + int pn; /* current page number */ + struct lbuf *bp; + struct lbuf *nextbp; + struct tblock *tblk; + + /* get current log page number and log sequence page number */ + pn = log->page; + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lspn = le32_to_cpu(lp->h.page); + + LOGGC_LOCK(log); + + /* + * write or queue the full page at the tail of write queue + */ + /* get the tail tblk on commit queue */ + if (list_empty(&log->cqueue)) + tblk = NULL; + else + tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); + + /* every tblk who has COMMIT record on the current page, + * and has not been committed, must be on commit queue + * since tblk is queued at commit queueu at the time + * of writing its COMMIT record on the page before + * page becomes full (even though the tblk thread + * who wrote COMMIT record may have been suspended + * currently); + */ + + /* is page bound with outstanding tail tblk ? */ + if (tblk && tblk->pn == pn) { + /* mark tblk for end-of-page */ + tblk->flag |= tblkGC_EOP; + + if (log->cflag & logGC_PAGEOUT) { + /* if page is not already on write queue, + * just enqueue (no lbmWRITE to prevent redrive) + * buffer to wqueue to ensure correct serial order + * of the pages since log pages will be added + * continuously + */ + if (bp->l_wqnext == NULL) + lbmWrite(log, bp, 0, 0); + } else { + /* + * No current GC leader, initiate group commit + */ + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + /* page is not bound with outstanding tblk: + * init write or mark it to be redriven (lbmWRITE) + */ + else { + /* finalize the page */ + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); + } + LOGGC_UNLOCK(log); + + /* + * allocate/initialize next page + */ + /* if log wraps, the first data page of log is 2 + * (0 never used, 1 is superblock). + */ + log->page = (pn == log->size - 1) ? 2 : pn + 1; + log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ + + /* allocate/initialize next log page buffer */ + nextbp = lbmAllocate(log, log->page); + nextbp->l_eor = log->eor; + log->bp = nextbp; + + /* initialize next log page */ + lp = (struct logpage *) nextbp->l_ldata; + lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + return 0; +} + + +/* + * NAME: lmGroupCommit() + * + * FUNCTION: group commit + * initiate pageout of the pages with COMMIT in the order of + * page number - redrive pageout of the page at the head of + * pageout queue until full page has been written. + * + * RETURN: + * + * NOTE: + * LOGGC_LOCK serializes log group commit queue, and + * transaction blocks on the commit queue. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) +{ + int rc = 0; + + LOGGC_LOCK(log); + + /* group committed already ? 
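+ * (tblkGC_COMMITTED is set by lmPostGC() once the page carrying this
+ * transaction's COMMIT record has been written to disk)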
*/ + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); + + if (tblk->xflag & COMMIT_LAZY) + tblk->flag |= tblkGC_LAZY; + + if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && + (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) + || jfs_tlocks_low)) { + /* + * No pageout in progress + * + * start group commit as its group leader. + */ + log->cflag |= logGC_PAGEOUT; + + lmGCwrite(log, 0); + } + + if (tblk->xflag & COMMIT_LAZY) { + /* + * Lazy transactions can leave now + */ + LOGGC_UNLOCK(log); + return 0; + } + + /* lmGCwrite gives up LOGGC_LOCK, check again */ + + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + + /* upcount transaction waiting for completion + */ + log->gcrtc++; + tblk->flag |= tblkGC_READY; + + __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), + LOGGC_LOCK(log), LOGGC_UNLOCK(log)); + + /* removed from commit queue */ + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; +} + +/* + * NAME: lmGCwrite() + * + * FUNCTION: group commit write + * initiate write of log page, building a group of all transactions + * with commit records on that page. + * + * RETURN: None + * + * NOTE: + * LOGGC_LOCK must be held by caller. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +static void lmGCwrite(struct jfs_log * log, int cant_write) +{ + struct lbuf *bp; + struct logpage *lp; + int gcpn; /* group commit page number */ + struct tblock *tblk; + struct tblock *xtblk = NULL; + + /* + * build the commit group of a log page + * + * scan commit queue and make a commit group of all + * transactions with COMMIT records on the same log page. + */ + /* get the head tblk on the commit queue */ + gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; + + list_for_each_entry(tblk, &log->cqueue, cqueue) { + if (tblk->pn != gcpn) + break; + + xtblk = tblk; + + /* state transition: (QUEUE, READY) -> COMMIT */ + tblk->flag |= tblkGC_COMMIT; + } + tblk = xtblk; /* last tblk of the page */ + + /* + * pageout to commit transactions on the log page. + */ + bp = (struct lbuf *) tblk->bp; + lp = (struct logpage *) bp->l_ldata; + /* is page already full ? */ + if (tblk->flag & tblkGC_EOP) { + /* mark page to free at end of group commit of the page */ + tblk->flag &= ~tblkGC_EOP; + tblk->flag |= tblkGC_FREE; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, + cant_write); + INCREMENT(lmStat.full_page); + } + /* page is not yet full */ + else { + bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); + INCREMENT(lmStat.partial_page); + } +} + +/* + * NAME: lmPostGC() + * + * FUNCTION: group commit post-processing + * Processes transactions after their commit records have been written + * to disk, redriving log I/O if necessary. + * + * RETURN: None + * + * NOTE: + * This routine is called a interrupt time by lbmIODone + */ +static void lmPostGC(struct lbuf * bp) +{ + unsigned long flags; + struct jfs_log *log = bp->l_log; + struct logpage *lp; + struct tblock *tblk, *temp; + + //LOGGC_LOCK(log); + spin_lock_irqsave(&log->gclock, flags); + /* + * current pageout of group commit completed. 
+ * + * remove/wakeup transactions from commit queue who were + * group committed with the current log page + */ + list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { + if (!(tblk->flag & tblkGC_COMMIT)) + break; + /* if transaction was marked GC_COMMIT then + * it has been shipped in the current pageout + * and made it to disk - it is committed. + */ + + if (bp->l_flag & lbmERROR) + tblk->flag |= tblkGC_ERROR; + + /* remove it from the commit queue */ + list_del(&tblk->cqueue); + tblk->flag &= ~tblkGC_QUEUE; + + if (tblk == log->flush_tblk) { + /* we can stop flushing the log now */ + clear_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, + tblk->flag); + + if (!(tblk->xflag & COMMIT_FORCE)) + /* + * Hand tblk over to lazy commit thread + */ + txLazyUnlock(tblk); + else { + /* state transition: COMMIT -> COMMITTED */ + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + LOGGC_WAKEUP(tblk); + } + + /* was page full before pageout ? + * (and this is the last tblk bound with the page) + */ + if (tblk->flag & tblkGC_FREE) + lbmFree(bp); + /* did page become full after pageout ? + * (and this is the last tblk bound with the page) + */ + else if (tblk->flag & tblkGC_EOP) { + /* finalize the page */ + lp = (struct logpage *) bp->l_ldata; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + jfs_info("lmPostGC: calling lbmWrite"); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, + 1); + } + + } + + /* are there any transactions who have entered lnGroupCommit() + * (whose COMMITs are after that of the last log page written. + * They are waiting for new group commit (above at (SLEEP 1)) + * or lazy transactions are on a full (queued) log page, + * select the latest ready transaction as new group leader and + * wake her up to lead her group. + */ + if ((!list_empty(&log->cqueue)) && + ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || + test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) + /* + * Call lmGCwrite with new group leader + */ + lmGCwrite(log, 1); + + /* no transaction are ready yet (transactions are only just + * queued (GC_QUEUE) and not entered for group commit yet). + * the first transaction entering group commit + * will elect herself as new group leader. + */ + else + log->cflag &= ~logGC_PAGEOUT; + + //LOGGC_UNLOCK(log); + spin_unlock_irqrestore(&log->gclock, flags); + return; +} + +/* + * NAME: lmLogSync() + * + * FUNCTION: write log SYNCPT record for specified log + * if new sync address is available + * (normally the case if sync() is executed by back-ground + * process). + * calculate new value of i_nextsync which determines when + * this code is called again. 
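+ * (the next trigger is written + min(free/2, LOGSYNC_DELTA(logsize));
+ * if that grant comes to less than two log pages, the sync point is
+ * simply reset to the current lsn: see the log wrap handling below)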
+ * + * PARAMETERS: log - log structure + * hard_sync - 1 to force all metadata to be written + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmLogSync(struct jfs_log * log, int hard_sync) +{ + int logsize; + int written; /* written since last syncpt */ + int free; /* free space left available */ + int delta; /* additional delta to write normally */ + int more; /* additional write granted */ + struct lrd lrd; + int lsn; + struct logsyncblk *lp; + unsigned long flags; + + /* push dirty metapages out to disk */ + if (hard_sync) + write_special_inodes(log, filemap_fdatawrite); + else + write_special_inodes(log, filemap_flush); + + /* + * forward syncpt + */ + /* if last sync is same as last syncpt, + * invoke sync point forward processing to update sync. + */ + + if (log->sync == log->syncpt) { + LOGSYNC_LOCK(log, flags); + if (list_empty(&log->synclist)) + log->sync = log->lsn; + else { + lp = list_entry(log->synclist.next, + struct logsyncblk, synclist); + log->sync = lp->lsn; + } + LOGSYNC_UNLOCK(log, flags); + + } + + /* if sync is different from last syncpt, + * write a SYNCPT record with syncpt = sync. + * reset syncpt = sync + */ + if (log->sync != log->syncpt) { + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = cpu_to_le32(log->sync); + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + + log->syncpt = log->sync; + } else + lsn = log->lsn; + + /* + * setup next syncpt trigger (SWAG) + */ + logsize = log->logsize; + + logdiff(written, lsn, log); + free = logsize - written; + delta = LOGSYNC_DELTA(logsize); + more = min(free / 2, delta); + if (more < 2 * LOGPSIZE) { + jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); + /* + * log wrapping + * + * option 1 - panic ? No.! + * option 2 - shutdown file systems + * associated with log ? + * option 3 - extend log ? + * option 4 - second chance + * + * mark log wrapped, and continue. + * when all active transactions are completed, + * mark log valid for recovery. + * if crashed during invalid state, log state + * implies invalid log, forcing fsck(). + */ + /* mark log state log wrap in log superblock */ + /* log->state = LOGWRAP; */ + + /* reset sync point computation */ + log->syncpt = log->sync = lsn; + log->nextsync = delta; + } else + /* next syncpt trigger = written + more */ + log->nextsync = written + more; + + /* if number of bytes written from last sync point is more + * than 1/4 of the log size, stop new transactions from + * starting until all current transactions are completed + * by setting syncbarrier flag. + */ + if (!test_bit(log_SYNCBARRIER, &log->flag) && + (written > LOGSYNC_BARRIER(logsize)) && log->active) { + set_bit(log_SYNCBARRIER, &log->flag); + jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, + log->syncpt); + /* + * We may have to initiate group commit + */ + jfs_flush_journal(log, 0); + } + + return lsn; +} + +/* + * NAME: jfs_syncpt + * + * FUNCTION: write log SYNCPT record for specified log + * + * PARAMETERS: log - log structure + * hard_sync - set to 1 to force metadata to be written + */ +void jfs_syncpt(struct jfs_log *log, int hard_sync) +{ LOG_LOCK(log); + lmLogSync(log, hard_sync); + LOG_UNLOCK(log); +} + +/* + * NAME: lmLogOpen() + * + * FUNCTION: open the log on first open; + * insert filesystem in the active list of the log. 
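+ *
+ * an external journal may be shared by several file systems: the
+ * jfs_external_logs list is searched for an already-open log on the
+ * same device (uuids must match); only if none is found is a new
+ * jfs_log allocated and initialized via lmLogInit().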
+ * + * PARAMETER: ipmnt - file system mount inode + * iplog - log inode (out) + * + * RETURN: + * + * serialization: + */ +int lmLogOpen(struct super_block *sb) +{ + int rc; + struct block_device *bdev; + struct jfs_log *log; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sbi->flag & JFS_NOINTEGRITY) + return open_dummy_log(sb); + + if (sbi->mntflag & JFS_INLINELOG) + return open_inline_log(sb); + + mutex_lock(&jfs_log_mutex); + list_for_each_entry(log, &jfs_external_logs, journal_list) { + if (log->bdev->bd_dev == sbi->logdev) { + if (memcmp(log->uuid, sbi->loguuid, + sizeof(log->uuid))) { + jfs_warn("wrong uuid on JFS journal\n"); + mutex_unlock(&jfs_log_mutex); + return -EINVAL; + } + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) { + mutex_unlock(&jfs_log_mutex); + return rc; + } + goto journal_found; + } + } + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + /* + * external log as separate logical volume + * + * file systems to log may have n-to-1 relationship; + */ + + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); + if (IS_ERR(bdev)) { + rc = PTR_ERR(bdev); + goto free; + } + + log->bdev = bdev; + memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); + + /* + * initialize log: + */ + if ((rc = lmLogInit(log))) + goto close; + + list_add(&log->journal_list, &jfs_external_logs); + + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) + goto shutdown; + +journal_found: + LOG_LOCK(log); + list_add(&sbi->log_list, &log->sb_list); + sbi->log = log; + LOG_UNLOCK(log); + + mutex_unlock(&jfs_log_mutex); + return 0; + + /* + * unwind on error + */ + shutdown: /* unwind lbmLogInit() */ + list_del(&log->journal_list); + lbmLogShutdown(log); + + close: /* close external log device */ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + free: /* free log descriptor */ + mutex_unlock(&jfs_log_mutex); + kfree(log); + + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; +} + +static int open_inline_log(struct super_block *sb) +{ + struct jfs_log *log; + int rc; + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) + return -ENOMEM; + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + set_bit(log_INLINELOG, &log->flag); + log->bdev = sb->s_bdev; + log->base = addressPXD(&JFS_SBI(sb)->logpxd); + log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> + (L2LOGPSIZE - sb->s_blocksize_bits); + log->l2bsize = sb->s_blocksize_bits; + ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); + + /* + * initialize log. 
+ */ + if ((rc = lmLogInit(log))) { + kfree(log); + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; + } + + list_add(&JFS_SBI(sb)->log_list, &log->sb_list); + JFS_SBI(sb)->log = log; + + return rc; +} + +static int open_dummy_log(struct super_block *sb) +{ + int rc; + + mutex_lock(&jfs_log_mutex); + if (!dummy_log) { + dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL); + if (!dummy_log) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&dummy_log->sb_list); + init_waitqueue_head(&dummy_log->syncwait); + dummy_log->no_integrity = 1; + /* Make up some stuff */ + dummy_log->base = 0; + dummy_log->size = 1024; + rc = lmLogInit(dummy_log); + if (rc) { + kfree(dummy_log); + dummy_log = NULL; + mutex_unlock(&jfs_log_mutex); + return rc; + } + } + + LOG_LOCK(dummy_log); + list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); + JFS_SBI(sb)->log = dummy_log; + LOG_UNLOCK(dummy_log); + mutex_unlock(&jfs_log_mutex); + + return 0; +} + +/* + * NAME: lmLogInit() + * + * FUNCTION: log initialization at first log open. + * + * logredo() (or logformat()) should have been run previously. + * initialize the log from log superblock. + * set the log state in the superblock to LOGMOUNT and + * write SYNCPT log record. + * + * PARAMETER: log - log structure + * + * RETURN: 0 - if ok + * -EINVAL - bad log magic number or superblock dirty + * error returned from logwait() + * + * serialization: single first open thread + */ +int lmLogInit(struct jfs_log * log) +{ + int rc = 0; + struct lrd lrd; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + int lsn = 0; + + jfs_info("lmLogInit: log:0x%p", log); + + /* initialize the group commit serialization lock */ + LOGGC_LOCK_INIT(log); + + /* allocate/initialize the log write serialization lock */ + LOG_LOCK_INIT(log); + + LOGSYNC_LOCK_INIT(log); + + INIT_LIST_HEAD(&log->synclist); + + INIT_LIST_HEAD(&log->cqueue); + log->flush_tblk = NULL; + + log->count = 0; + + /* + * initialize log i/o + */ + if ((rc = lbmLogInit(log))) + return rc; + + if (!test_bit(log_INLINELOG, &log->flag)) + log->l2bsize = L2LOGPSIZE; + + /* check for disabled journaling to disk */ + if (log->no_integrity) { + /* + * Journal pages will still be filled. When the time comes + * to actually do the I/O, the write is not done, and the + * endio routine is called directly. + */ + bp = lbmAllocate(log , 0); + log->bp = bp; + bp->l_pn = bp->l_eor = 0; + } else { + /* + * validate log superblock + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto errout10; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + + if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { + jfs_warn("*** Log Format Error ! ***"); + rc = -EINVAL; + goto errout20; + } + + /* logredo() should have been run successfully. */ + if (logsuper->state != cpu_to_le32(LOGREDONE)) { + jfs_warn("*** Log Is Dirty ! 
***"); + rc = -EINVAL; + goto errout20; + } + + /* initialize log from log superblock */ + if (test_bit(log_INLINELOG,&log->flag)) { + if (log->size != le32_to_cpu(logsuper->size)) { + rc = -EINVAL; + goto errout20; + } + jfs_info("lmLogInit: inline log:0x%p base:0x%Lx " + "size:0x%x", log, + (unsigned long long) log->base, log->size); + } else { + if (memcmp(logsuper->uuid, log->uuid, 16)) { + jfs_warn("wrong uuid on JFS log device"); + goto errout20; + } + log->size = le32_to_cpu(logsuper->size); + log->l2bsize = le32_to_cpu(logsuper->l2bsize); + jfs_info("lmLogInit: external log:0x%p base:0x%Lx " + "size:0x%x", log, + (unsigned long long) log->base, log->size); + } + + log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; + log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); + + /* + * initialize for log append write mode + */ + /* establish current/end-of-log page/buffer */ + if ((rc = lbmRead(log, log->page, &bp))) + goto errout20; + + lp = (struct logpage *) bp->l_ldata; + + jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", + le32_to_cpu(logsuper->end), log->page, log->eor, + le16_to_cpu(lp->h.eor)); + + log->bp = bp; + bp->l_pn = log->page; + bp->l_eor = log->eor; + + /* if current page is full, move on to next page */ + if (log->eor >= LOGPSIZE - LOGPTLRSIZE) + lmNextPage(log); + + /* + * initialize log syncpoint + */ + /* + * write the first SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !); + * remove current page from lbm write queue at end of pageout + * (to write log superblock update), but do not release to + * freelist; + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + bp->l_ceor = bp->l_eor; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); + if ((rc = lbmIOWait(bp, 0))) + goto errout30; + + /* + * update/write superblock + */ + logsuper->state = cpu_to_le32(LOGMOUNT); + log->serial = le32_to_cpu(logsuper->serial) + 1; + logsuper->serial = cpu_to_le32(log->serial); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + if ((rc = lbmIOWait(bpsuper, lbmFREE))) + goto errout30; + } + + /* initialize logsync parameters */ + log->logsize = (log->size - 2) << L2LOGPSIZE; + log->lsn = lsn; + log->syncpt = lsn; + log->sync = log->syncpt; + log->nextsync = LOGSYNC_DELTA(log->logsize); + + jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", + log->lsn, log->syncpt, log->sync); + + /* + * initialize for lazy/group commit + */ + log->clsn = lsn; + + return 0; + + /* + * unwind on error + */ + errout30: /* release log page */ + log->wqueue = NULL; + bp->l_wqnext = NULL; + lbmFree(bp); + + errout20: /* release log superblock */ + lbmFree(bpsuper); + + errout10: /* unwind lbmLogInit() */ + lbmLogShutdown(log); + + jfs_warn("lmLogInit: exit(%d)", rc); + return rc; +} + + +/* + * NAME: lmLogClose() + * + * FUNCTION: remove file system from active list of log + * and close it on last close. 
+ * + * PARAMETER: sb - superblock + * + * RETURN: errors from subroutines + * + * serialization: + */ +int lmLogClose(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + struct block_device *bdev; + int rc = 0; + + jfs_info("lmLogClose: log:0x%p", log); + + mutex_lock(&jfs_log_mutex); + LOG_LOCK(log); + list_del(&sbi->log_list); + LOG_UNLOCK(log); + sbi->log = NULL; + + /* + * We need to make sure all of the "written" metapages + * actually make it to disk + */ + sync_blockdev(sb->s_bdev); + + if (test_bit(log_INLINELOG, &log->flag)) { + /* + * in-line log in host file system + */ + rc = lmLogShutdown(log); + kfree(log); + goto out; + } + + if (!log->no_integrity) + lmLogFileSystem(log, sbi, 0); + + if (!list_empty(&log->sb_list)) + goto out; + + /* + * TODO: ensure that the dummy_log is in a state to allow + * lbmLogShutdown to deallocate all the buffers and call + * kfree against dummy_log. For now, leave dummy_log & its + * buffers in memory, and resuse if another no-integrity mount + * is requested. + */ + if (log->no_integrity) + goto out; + + /* + * external log as separate logical volume + */ + list_del(&log->journal_list); + bdev = log->bdev; + rc = lmLogShutdown(log); + + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + kfree(log); + + out: + mutex_unlock(&jfs_log_mutex); + jfs_info("lmLogClose: exit(%d)", rc); + return rc; +} + + +/* + * NAME: jfs_flush_journal() + * + * FUNCTION: initiate write of any outstanding transactions to the journal + * and optionally wait until they are all written to disk + * + * wait == 0 flush until latest txn is committed, don't wait + * wait == 1 flush until latest txn is committed, wait + * wait > 1 flush until all txn's are complete, wait + */ +void jfs_flush_journal(struct jfs_log *log, int wait) +{ + int i; + struct tblock *target = NULL; + + /* jfs_write_inode may call us during read-only mount */ + if (!log) + return; + + jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); + + LOGGC_LOCK(log); + + if (!list_empty(&log->cqueue)) { + /* + * This ensures that we will keep writing to the journal as long + * as there are unwritten commit records + */ + target = list_entry(log->cqueue.prev, struct tblock, cqueue); + + if (test_bit(log_FLUSH, &log->flag)) { + /* + * We're already flushing. + * if flush_tblk is NULL, we are flushing everything, + * so leave it that way. 
Otherwise, update it to the + * latest transaction + */ + if (log->flush_tblk) + log->flush_tblk = target; + } else { + /* Only flush until latest transaction is committed */ + log->flush_tblk = target; + set_bit(log_FLUSH, &log->flag); + + /* + * Initiate I/O on outstanding transactions + */ + if (!(log->cflag & logGC_PAGEOUT)) { + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + } + if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { + /* Flush until all activity complete */ + set_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + if (wait && target && !(target->flag & tblkGC_COMMITTED)) { + DECLARE_WAITQUEUE(__wait, current); + + add_wait_queue(&target->gcwait, &__wait); + set_current_state(TASK_UNINTERRUPTIBLE); + LOGGC_UNLOCK(log); + schedule(); + __set_current_state(TASK_RUNNING); + LOGGC_LOCK(log); + remove_wait_queue(&target->gcwait, &__wait); + } + LOGGC_UNLOCK(log); + + if (wait < 2) + return; + + write_special_inodes(log, filemap_fdatawrite); + + /* + * If there was recent activity, we may need to wait + * for the lazycommit thread to catch up + */ + if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { + for (i = 0; i < 200; i++) { /* Too much? */ + msleep(250); + write_special_inodes(log, filemap_fdatawrite); + if (list_empty(&log->cqueue) && + list_empty(&log->synclist)) + break; + } + } + assert(list_empty(&log->cqueue)); + +#ifdef CONFIG_JFS_DEBUG + if (!list_empty(&log->synclist)) { + struct logsyncblk *lp; + + printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); + list_for_each_entry(lp, &log->synclist, synclist) { + if (lp->xflag & COMMIT_PAGE) { + struct metapage *mp = (struct metapage *)lp; + print_hex_dump(KERN_ERR, "metapage: ", + DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(struct metapage), 0); + print_hex_dump(KERN_ERR, "page: ", + DUMP_PREFIX_ADDRESS, 16, + sizeof(long), mp->page, + sizeof(struct page), 0); + } else + print_hex_dump(KERN_ERR, "tblock:", + DUMP_PREFIX_ADDRESS, 16, 4, + lp, sizeof(struct tblock), 0); + } + } +#else + WARN_ON(!list_empty(&log->synclist)); +#endif + clear_bit(log_FLUSH, &log->flag); +} + +/* + * NAME: lmLogShutdown() + * + * FUNCTION: log shutdown at last LogClose(). + * + * write log syncpt record. + * update super block to set redone flag to 0. + * + * PARAMETER: log - log inode + * + * RETURN: 0 - success + * + * serialization: single last close thread + */ +int lmLogShutdown(struct jfs_log * log) +{ + int rc; + struct lrd lrd; + int lsn; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + + jfs_info("lmLogShutdown: log:0x%p", log); + + jfs_flush_journal(log, 2); + + /* + * write the last SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !) + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); + lbmIOWait(log->bp, lbmFREE); + log->bp = NULL; + + /* + * synchronous update log superblock + * mark log state as shutdown cleanly + * (i.e., Log does not need to be replayed). 
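+ *
+ * the superblock state is set to LOGREDONE and 'end' is pointed at
+ * the final SYNCPT record, so the next mount does not need logredo.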
+ */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto out; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->end = cpu_to_le32(lsn); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", + lsn, log->page, log->eor); + + out: + /* + * shutdown per log i/o + */ + lbmLogShutdown(log); + + if (rc) { + jfs_warn("lmLogShutdown: exit(%d)", rc); + } + return rc; +} + + +/* + * NAME: lmLogFileSystem() + * + * FUNCTION: insert ( = true)/remove ( = false) + * file system into/from log active file system list. + * + * PARAMETE: log - pointer to logs inode. + * fsdev - kdev_t of filesystem. + * serial - pointer to returned log serial number + * activate - insert/remove device from active list. + * + * RETURN: 0 - success + * errors returned by vms_iowait(). + */ +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate) +{ + int rc = 0; + int i; + struct logsuper *logsuper; + struct lbuf *bpsuper; + char *uuid = sbi->uuid; + + /* + * insert/remove file system device to log active file system list. + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + return rc; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + if (activate) { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) { + memcpy(logsuper->active[i].uuid, uuid, 16); + sbi->aggregate = i; + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Too many file systems sharing journal!"); + lbmFree(bpsuper); + return -EMFILE; /* Is there a better rc? */ + } + } else { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, uuid, 16)) { + memcpy(logsuper->active[i].uuid, NULL_UUID, 16); + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Somebody stomped on the journal!"); + lbmFree(bpsuper); + return -EIO; + } + + } + + /* + * synchronous write log superblock: + * + * write sidestream bypassing write queue: + * at file system mount, log super block is updated for + * activation of the file system before any log record + * (MOUNT record) of the file system, and at file system + * unmount, all meta data for the file system has been + * flushed before log super block is updated for deactivation + * of the file system. + */ + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + return rc; +} + +/* + * log buffer manager (lbm) + * ------------------------ + * + * special purpose buffer manager supporting log i/o requirements. + * + * per log write queue: + * log pageout occurs in serial order by fifo write queue and + * restricting to a single i/o in pregress at any one time. + * a circular singly-linked list + * (log->wrqueue points to the tail, and buffers are linked via + * bp->wrqueue field), and + * maintains log page in pageout ot waiting for pageout in serial pageout. + */ + +/* + * lbmLogInit() + * + * initialize per log I/O setup at lmLogInit() + */ +static int lbmLogInit(struct jfs_log * log) +{ /* log inode */ + int i; + struct lbuf *lbuf; + + jfs_info("lbmLogInit: log:0x%p", log); + + /* initialize current buffer cursor */ + log->bp = NULL; + + /* initialize log device write queue */ + log->wqueue = NULL; + + /* + * Each log has its own buffer pages allocated to it. These are + * not managed by the page cache. This ensures that a transaction + * writing to the log does not block trying to allocate a page from + * the page cache (for the log). 
This would be bad, since page + * allocation waits on the kswapd thread that may be committing inodes + * which would cause log activity. Was that clear? I'm trying to + * avoid deadlock here. + */ + init_waitqueue_head(&log->free_wait); + + log->lbuf_free = NULL; + + for (i = 0; i < LOGPAGES;) { + char *buffer; + uint offset; + struct page *page; + + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (buffer == NULL) + goto error; + page = virt_to_page(buffer); + for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { + lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); + if (lbuf == NULL) { + if (offset == 0) + free_page((unsigned long) buffer); + goto error; + } + if (offset) /* we already have one reference */ + get_page(page); + lbuf->l_offset = offset; + lbuf->l_ldata = buffer + offset; + lbuf->l_page = page; + lbuf->l_log = log; + init_waitqueue_head(&lbuf->l_ioevent); + + lbuf->l_freelist = log->lbuf_free; + log->lbuf_free = lbuf; + i++; + } + } + + return (0); + + error: + lbmLogShutdown(log); + return -ENOMEM; +} + + +/* + * lbmLogShutdown() + * + * finalize per log I/O setup at lmLogShutdown() + */ +static void lbmLogShutdown(struct jfs_log * log) +{ + struct lbuf *lbuf; + + jfs_info("lbmLogShutdown: log:0x%p", log); + + lbuf = log->lbuf_free; + while (lbuf) { + struct lbuf *next = lbuf->l_freelist; + __free_page(lbuf->l_page); + kfree(lbuf); + lbuf = next; + } +} + + +/* + * lbmAllocate() + * + * allocate an empty log buffer + */ +static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) +{ + struct lbuf *bp; + unsigned long flags; + + /* + * recycle from log buffer freelist if any + */ + LCACHE_LOCK(flags); + LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); + log->lbuf_free = bp->l_freelist; + LCACHE_UNLOCK(flags); + + bp->l_flag = 0; + + bp->l_wqnext = NULL; + bp->l_freelist = NULL; + + bp->l_pn = pn; + bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); + bp->l_ceor = 0; + + return bp; +} + + +/* + * lbmFree() + * + * release a log buffer to freelist + */ +static void lbmFree(struct lbuf * bp) +{ + unsigned long flags; + + LCACHE_LOCK(flags); + + lbmfree(bp); + + LCACHE_UNLOCK(flags); +} + +static void lbmfree(struct lbuf * bp) +{ + struct jfs_log *log = bp->l_log; + + assert(bp->l_wqnext == NULL); + + /* + * return the buffer to head of freelist + */ + bp->l_freelist = log->lbuf_free; + log->lbuf_free = bp; + + wake_up(&log->free_wait); + return; +} + + +/* + * NAME: lbmRedrive + * + * FUNCTION: add a log buffer to the log redrive list + * + * PARAMETER: + * bp - log buffer + * + * NOTES: + * Takes log_redrive_lock. 
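+ *
+ * the buffer is pushed onto the singly-linked log_redrive_list and
+ * jfsIOthread is woken; the actual lbmStartIO() is issued from
+ * process context in jfsIOWait(), since it cannot be done at
+ * interrupt time (see lbmIODone()).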
+ */ +static inline void lbmRedrive(struct lbuf *bp) +{ + unsigned long flags; + + spin_lock_irqsave(&log_redrive_lock, flags); + bp->l_redrive_next = log_redrive_list; + log_redrive_list = bp; + spin_unlock_irqrestore(&log_redrive_lock, flags); + + wake_up_process(jfsIOthread); +} + + +/* + * lbmRead() + */ +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) +{ + struct bio *bio; + struct lbuf *bp; + + /* + * allocate a log buffer + */ + *bpp = bp = lbmAllocate(log, pn); + jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); + + bp->l_flag |= lbmREAD; + + bio = bio_alloc(GFP_NOFS, 1); + + bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_bdev = log->bdev; + bio->bi_io_vec[0].bv_page = bp->l_page; + bio->bi_io_vec[0].bv_len = LOGPSIZE; + bio->bi_io_vec[0].bv_offset = bp->l_offset; + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = LOGPSIZE; + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + submit_bio(READ_SYNC, bio); + + wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); + + return 0; +} + + +/* + * lbmWrite() + * + * buffer at head of pageout queue stays after completion of + * partial-page pageout and redriven by explicit initiation of + * pageout by caller until full-page pageout is completed and + * released. + * + * device driver i/o done redrives pageout of new buffer at + * head of pageout queue when current buffer at head of pageout + * queue is released at the completion of its full-page pageout. + * + * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). + * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() + */ +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, + int cant_block) +{ + struct lbuf *tail; + unsigned long flags; + + jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + LCACHE_LOCK(flags); /* disable+lock */ + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag; + + /* + * insert bp at tail of write queue associated with log + * + * (request is either for bp already/currently at head of queue + * or new bp to be inserted at tail) + */ + tail = log->wqueue; + + /* is buffer not already on write queue ? */ + if (bp->l_wqnext == NULL) { + /* insert at tail of wqueue */ + if (tail == NULL) { + log->wqueue = bp; + bp->l_wqnext = bp; + } else { + log->wqueue = bp; + bp->l_wqnext = tail->l_wqnext; + tail->l_wqnext = bp; + } + + tail = bp; + } + + /* is buffer at head of wqueue and for write ? 
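+ * (only the buffer at the head of the circular write queue is
+ * submitted here; later buffers stay queued until lbmIODone()
+ * redrives the next one when the head completes)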
*/ + if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + return; + } + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + if (cant_block) + lbmRedrive(bp); + else if (flag & lbmSYNC) + lbmStartIO(bp); + else { + LOGGC_UNLOCK(log); + lbmStartIO(bp); + LOGGC_LOCK(log); + } +} + + +/* + * lbmDirectWrite() + * + * initiate pageout bypassing write queue for sidestream + * (e.g., log superblock) write; + */ +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) +{ + jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", + bp, flag, bp->l_pn); + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag | lbmDIRECT; + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + /* + * initiate pageout of the page + */ + lbmStartIO(bp); +} + + +/* + * NAME: lbmStartIO() + * + * FUNCTION: Interface to DD strategy routine + * + * RETURN: none + * + * serialization: LCACHE_LOCK() is NOT held during log i/o; + */ +static void lbmStartIO(struct lbuf * bp) +{ + struct bio *bio; + struct jfs_log *log = bp->l_log; + + jfs_info("lbmStartIO\n"); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_bdev = log->bdev; + bio->bi_io_vec[0].bv_page = bp->l_page; + bio->bi_io_vec[0].bv_len = LOGPSIZE; + bio->bi_io_vec[0].bv_offset = bp->l_offset; + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = LOGPSIZE; + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + + /* check if journaling to disk has been disabled */ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(WRITE_SYNC, bio); + INCREMENT(lmStat.submitted); + } +} + + +/* + * lbmIOWait() + */ +static int lbmIOWait(struct lbuf * bp, int flag) +{ + unsigned long flags; + int rc = 0; + + jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); + + rc = (bp->l_flag & lbmERROR) ? -EIO : 0; + + if (flag & lbmFREE) + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + return rc; +} + +/* + * lbmIODone() + * + * executed at INTIODONE level + */ +static void lbmIODone(struct bio *bio, int error) +{ + struct lbuf *bp = bio->bi_private; + struct lbuf *nextbp, *tail; + struct jfs_log *log; + unsigned long flags; + + /* + * get back jfs buffer bound to the i/o buffer + */ + jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + bp->l_flag |= lbmDONE; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + bp->l_flag |= lbmERROR; + + jfs_err("lbmIODone: I/O error in JFS log"); + } + + bio_put(bio); + + /* + * pagein completion + */ + if (bp->l_flag & lbmREAD) { + bp->l_flag &= ~lbmREAD; + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + + return; + } + + /* + * pageout completion + * + * the bp at the head of write queue has completed pageout. + * + * if single-commit/full-page pageout, remove the current buffer + * from head of pageout queue, and redrive pageout with + * the new buffer at head of pageout queue; + * otherwise, the partial-page pageout buffer stays at + * the head of pageout queue to be redriven for pageout + * by lmGroupCommit() until full-page pageout is completed. 
+ */ + bp->l_flag &= ~lbmWRITE; + INCREMENT(lmStat.pagedone); + + /* update committed lsn */ + log = bp->l_log; + log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; + + if (bp->l_flag & lbmDIRECT) { + LCACHE_WAKEUP(&bp->l_ioevent); + LCACHE_UNLOCK(flags); + return; + } + + tail = log->wqueue; + + /* single element queue */ + if (bp == tail) { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + log->wqueue = NULL; + bp->l_wqnext = NULL; + } + } + /* multi element queue */ + else { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + nextbp = tail->l_wqnext = bp->l_wqnext; + bp->l_wqnext = NULL; + + /* + * redrive pageout of next page at head of write queue: + * redrive next page without any bound tblk + * (i.e., page w/o any COMMIT records), or + * first page of new group commit which has been + * queued after current page (subsequent pageout + * is performed synchronously, except page without + * any COMMITs) by lmGroupCommit() as indicated + * by lbmWRITE flag; + */ + if (nextbp->l_flag & lbmWRITE) { + /* + * We can't do the I/O at interrupt time. + * The jfsIO thread can do it + */ + lbmRedrive(nextbp); + } + } + } + + /* + * synchronous pageout: + * + * buffer has not necessarily been removed from write queue + * (e.g., synchronous write of partial-page with COMMIT): + * leave buffer for i/o initiator to dispose + */ + if (bp->l_flag & lbmSYNC) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + } + + /* + * Group Commit pageout: + */ + else if (bp->l_flag & lbmGC) { + LCACHE_UNLOCK(flags); + lmPostGC(bp); + } + + /* + * asynchronous pageout: + * + * buffer must have been removed from write queue: + * insert buffer at head of freelist where it can be recycled + */ + else { + assert(bp->l_flag & lbmRELEASE); + assert(bp->l_flag & lbmFREE); + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + } +} + +int jfsIOWait(void *arg) +{ + struct lbuf *bp; + + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { + log_redrive_list = bp->l_redrive_next; + bp->l_redrive_next = NULL; + spin_unlock_irq(&log_redrive_lock); + lbmStartIO(bp); + spin_lock_irq(&log_redrive_lock); + } + + if (freezing(current)) { + spin_unlock_irq(&log_redrive_lock); + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&log_redrive_lock); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + + jfs_info("jfsIOWait being killed!"); + return 0; +} + +/* + * NAME: lmLogFormat()/jfs_logform() + * + * FUNCTION: format file system log + * + * PARAMETERS: + * log - volume log + * logAddress - start address of log space in FS block + * logSize - length of log space in FS block; + * + * RETURN: 0 - success + * -EIO - i/o error + * + * XXX: We're synchronously writing one page at a time. This needs to + * be improved by writing multiple pages at once. 
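lmLogFormat() below lays the log out as page 0 reserved, page 1 the log superblock, and pages 2..npages-1 as data pages whose log sequence page numbers (lpsn) simulate a log that has already wrapped: the first data page gets the highest lpsn, so that logredo's binary search for the log end still works on a freshly formatted log. A small standalone sketch of that numbering, assuming only what the comments and code below state:

#include <stdio.h>

/*
 * For a freshly formatted log of 'npages' pages (page 0 reserved,
 * page 1 the log superblock), return the lpsn that would be written
 * into data page 'pn'.  With N = npages - 2 data pages, page 2 gets
 * N - 1 and pages 3..npages-1 get 0..N-2.
 */
static int format_lpsn(int npages, int pn)
{
	if (pn < 2 || pn >= npages)
		return -1;		/* not a data page */
	if (pn == 2)
		return npages - 3;	/* N - 1: simulated wrap point */
	return pn - 3;			/* 0, 1, ..., N - 2 */
}

int main(void)
{
	int npages = 8;			/* tiny example log, N = 6 data pages */
	int pn;

	for (pn = 2; pn < npages; pn++)
		printf("page %d -> lpsn %d\n", pn, format_lpsn(npages, pn));
	/* prints lpsn 5, 0, 1, 2, 3, 4 for pages 2..7 */
	return 0;
}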
+ */ +int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) +{ + int rc = -EIO; + struct jfs_sb_info *sbi; + struct logsuper *logsuper; + struct logpage *lp; + int lspn; /* log sequence page number */ + struct lrd *lrd_ptr; + int npages = 0; + struct lbuf *bp; + + jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", + (long long)logAddress, logSize); + + sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); + + /* allocate a log buffer */ + bp = lbmAllocate(log, 1); + + npages = logSize >> sbi->l2nbperpage; + + /* + * log space: + * + * page 0 - reserved; + * page 1 - log superblock; + * page 2 - log data page: A SYNC log record is written + * into this page at logform time; + * pages 3-N - log data page: set to empty log data pages; + */ + /* + * init log superblock: log page 1 + */ + logsuper = (struct logsuper *) bp->l_ldata; + + logsuper->magic = cpu_to_le32(LOGMAGIC); + logsuper->version = cpu_to_le32(LOGVERSION); + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ + logsuper->size = cpu_to_le32(npages); + logsuper->bsize = cpu_to_le32(sbi->bsize); + logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); + logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); + + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + bp->l_blkno = logAddress + sbi->nbperpage; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * init pages 2 to npages-1 as log data pages: + * + * log page sequence number (lpsn) initialization: + * + * pn: 0 1 2 3 n-1 + * +-----+-----+=====+=====+===.....===+=====+ + * lspn: N-1 0 1 N-2 + * <--- N page circular file ----> + * + * the N (= npages-2) data pages of the log is maintained as + * a circular file for the log records; + * lpsn grows by 1 monotonically as each log page is written + * to the circular file of the log; + * and setLogpage() will not reset the page number even if + * the eor is equal to LOGPHDRSIZE. In order for binary search + * still work in find log end process, we have to simulate the + * log wrap situation at the log format time. + * The 1st log page written will have the highest lpsn. Then + * the succeeding log pages will have ascending order of + * the lspn starting from 0, ... 
(N-2) + */ + lp = (struct logpage *) bp->l_ldata; + /* + * initialize 1st log page to be written: lpsn = N - 1, + * write a SYNCPT log record is written to this page + */ + lp->h.page = lp->t.page = cpu_to_le32(npages - 3); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); + + lrd_ptr = (struct lrd *) &lp->data; + lrd_ptr->logtid = 0; + lrd_ptr->backchain = 0; + lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); + lrd_ptr->length = 0; + lrd_ptr->log.syncpt.sync = 0; + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + */ + for (lspn = 0; lspn < npages - 3; lspn++) { + lp->h.page = lp->t.page = cpu_to_le32(lspn); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + } + + rc = 0; +exit: + /* + * finalize log + */ + /* release the buffer */ + lbmFree(bp); + + return rc; +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_lmstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Logmgr stats\n" + "================\n" + "commits = %d\n" + "writes submitted = %d\n" + "writes completed = %d\n" + "full pages submitted = %d\n" + "partial pages submitted = %d\n", + lmStat.commit, + lmStat.submitted, + lmStat.pagedone, + lmStat.full_page, + lmStat.partial_page); + return 0; +} + +static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_lmstats_proc_show, NULL); +} + +const struct file_operations jfs_lmstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_lmstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h new file mode 100644 index 00000000..e38c2159 --- /dev/null +++ b/fs/jfs/jfs_logmgr.h @@ -0,0 +1,513 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOGMGR +#define _H_JFS_LOGMGR + +#include "jfs_filsys.h" +#include "jfs_lock.h" + +/* + * log manager configuration parameters + */ + +/* log page size */ +#define LOGPSIZE 4096 +#define L2LOGPSIZE 12 + +#define LOGPAGES 16 /* Log pages per mounted file system */ + +/* + * log logical volume + * + * a log is used to make the commit operation on journalled + * files within the same logical volume group atomic. + * a log is implemented with a logical volume. + * there is one log per logical volume group. + * + * block 0 of the log logical volume is not used (ipl etc). 
+ * block 1 contains a log "superblock" and is used by logFormat(), + * lmLogInit(), lmLogShutdown(), and logRedo() to record status + * of the log but is not otherwise used during normal processing. + * blocks 2 - (N-1) are used to contain log records. + * + * when a volume group is varied-on-line, logRedo() must have + * been executed before the file systems (logical volumes) in + * the volume group can be mounted. + */ +/* + * log superblock (block 1 of logical volume) + */ +#define LOGSUPER_B 1 +#define LOGSTART_B 2 + +#define LOGMAGIC 0x87654321 +#define LOGVERSION 1 + +#define MAX_ACTIVE 128 /* Max active file systems sharing log */ + +struct logsuper { + __le32 magic; /* 4: log lv identifier */ + __le32 version; /* 4: version number */ + __le32 serial; /* 4: log open/mount counter */ + __le32 size; /* 4: size in number of LOGPSIZE blocks */ + __le32 bsize; /* 4: logical block size in byte */ + __le32 l2bsize; /* 4: log2 of bsize */ + + __le32 flag; /* 4: option */ + __le32 state; /* 4: state - see below */ + + __le32 end; /* 4: addr of last log record set by logredo */ + char uuid[16]; /* 16: 128-bit journal uuid */ + char label[16]; /* 16: journal label */ + struct { + char uuid[16]; + } active[MAX_ACTIVE]; /* 2048: active file systems list */ +}; + +#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + +/* log flag: commit option (see jfs_filsys.h) */ + +/* log state */ +#define LOGMOUNT 0 /* log mounted by lmLogInit() */ +#define LOGREDONE 1 /* log shutdown by lmLogShutdown(). + * log redo completed by logredo(). + */ +#define LOGWRAP 2 /* log wrapped */ +#define LOGREADERR 3 /* log read error detected in logredo() */ + + +/* + * log logical page + * + * (this comment should be rewritten !) + * the header and trailer structures (h,t) will normally have + * the same page and eor value. + * An exception to this occurs when a complete page write is not + * accomplished on a power failure. Since the hardware may "split write" + * sectors in the page, any out of order sequence may occur during powerfail + * and needs to be recognized during log replay. The xor value is + * an "exclusive or" of all log words in the page up to eor. This + * 32 bit eor is stored with the top 16 bits in the header and the + * bottom 16 bits in the trailer. logredo can easily recognize pages + * that were not completed by reconstructing this eor and checking + * the log page. + * + * Previous versions of the operating system did not allow split + * writes and detected partially written records in logredo by + * ordering the updates to the header, trailer, and the move of data + * into the logdata area. The order: (1) data is moved (2) header + * is updated (3) trailer is updated. In logredo, when the header + * differed from the trailer, the header and trailer were reconciled + * as follows: if h.page != t.page they were set to the smaller of + * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) + * h.eor != t.eor they were set to the smaller of their two values. 
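The comment above describes both the current detection of incomplete ("split") page writes and the older rule that reconciled a mismatching header and trailer directly. A standalone sketch of that older rule, with illustrative field names (logredo itself is not reproduced here):

#include <stdint.h>
#include <stdio.h>

#define LOGPHDRSIZE 8	/* log page header size, as defined below */

struct logpage_ends {
	uint32_t h_page, t_page;	/* header/trailer page numbers */
	uint16_t h_eor, t_eor;		/* header/trailer end-of-record */
};

/*
 * Older reconciliation rule, as described in the comment above: if the
 * page numbers disagree the page is treated as empty; if only the eor
 * values disagree, the smaller one wins.
 */
static void reconcile(struct logpage_ends *p)
{
	if (p->h_page != p->t_page) {
		uint32_t pn = p->h_page < p->t_page ? p->h_page : p->t_page;

		p->h_page = p->t_page = pn;
		p->h_eor = p->t_eor = LOGPHDRSIZE;	/* empty page */
	} else if (p->h_eor != p->t_eor) {
		uint16_t eor = p->h_eor < p->t_eor ? p->h_eor : p->t_eor;

		p->h_eor = p->t_eor = eor;
	}
}

int main(void)
{
	struct logpage_ends torn = { 42, 41, 200, 96 };

	reconcile(&torn);
	printf("page %u eor %u\n", (unsigned)torn.h_page,
	       (unsigned)torn.h_eor);		/* page 41 eor 8 */
	return 0;
}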
+ */ +struct logpage { + struct { /* header */ + __le32 page; /* 4: log sequence page number */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: end-of-log offset of lasrt record write */ + } h; + + __le32 data[LOGPSIZE / 4 - 4]; /* log record area */ + + struct { /* trailer */ + __le32 page; /* 4: normally the same as h.page */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: normally the same as h.eor */ + } t; +}; + +#define LOGPHDRSIZE 8 /* log page header size */ +#define LOGPTLRSIZE 8 /* log page trailer size */ + + +/* + * log record + * + * (this comment should be rewritten !) + * jfs uses only "after" log records (only a single writer is allowed + * in a page, pages are written to temporary paging space if + * if they must be written to disk before commit, and i/o is + * scheduled for modified pages to their home location after + * the log records containing the after values and the commit + * record is written to the log on disk, undo discards the copy + * in main-memory.) + * + * a log record consists of a data area of variable length followed by + * a descriptor of fixed size LOGRDSIZE bytes. + * the data area is rounded up to an integral number of 4-bytes and + * must be no longer than LOGPSIZE. + * the descriptor is of size of multiple of 4-bytes and aligned on a + * 4-byte boundary. + * records are packed one after the other in the data area of log pages. + * (sometimes a DUMMY record is inserted so that at least one record ends + * on every page or the longest record is placed on at most two pages). + * the field eor in page header/trailer points to the byte following + * the last record on a page. + */ + +/* log record types */ +#define LOG_COMMIT 0x8000 +#define LOG_SYNCPT 0x4000 +#define LOG_MOUNT 0x2000 +#define LOG_REDOPAGE 0x0800 +#define LOG_NOREDOPAGE 0x0080 +#define LOG_NOREDOINOEXT 0x0040 +#define LOG_UPDATEMAP 0x0008 +#define LOG_NOREDOFILE 0x0001 + +/* REDOPAGE/NOREDOPAGE log record data type */ +#define LOG_INODE 0x0001 +#define LOG_XTREE 0x0002 +#define LOG_DTREE 0x0004 +#define LOG_BTROOT 0x0010 +#define LOG_EA 0x0020 +#define LOG_ACL 0x0040 +#define LOG_DATA 0x0080 +#define LOG_NEW 0x0100 +#define LOG_EXTEND 0x0200 +#define LOG_RELOCATE 0x0400 +#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ + +/* UPDATEMAP log record descriptor type */ +#define LOG_ALLOCXADLIST 0x0080 +#define LOG_ALLOCPXDLIST 0x0040 +#define LOG_ALLOCXAD 0x0020 +#define LOG_ALLOCPXD 0x0010 +#define LOG_FREEXADLIST 0x0008 +#define LOG_FREEPXDLIST 0x0004 +#define LOG_FREEXAD 0x0002 +#define LOG_FREEPXD 0x0001 + + +struct lrd { + /* + * type independent area + */ + __le32 logtid; /* 4: log transaction identifier */ + __le32 backchain; /* 4: ptr to prev record of same transaction */ + __le16 type; /* 2: record type */ + __le16 length; /* 2: length of data in record (in byte) */ + __le32 aggregate; /* 4: file system lv/aggregate */ + /* (16) */ + + /* + * type dependent area (20) + */ + union { + + /* + * COMMIT: commit + * + * transaction commit: no type-dependent information; + */ + + /* + * REDOPAGE: after-image + * + * apply after-image; + * + * N.B. 
REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: REDOPAGE record type */ + __le16 l2linesize; /* 2: log2 of line size */ + pxd_t pxd; /* 8: on-disk page pxd */ + } redopage; /* (20) */ + + /* + * NOREDOPAGE: the page is freed + * + * do not apply after-image records which precede this record + * in the log with the same page block number to this page. + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: NOREDOPAGE record type */ + __le16 rsrvd; /* 2: reserved */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredopage; /* (20) */ + + /* + * UPDATEMAP: update block allocation map + * + * either in-line PXD, + * or out-of-line XADLIST; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: UPDATEMAP record type */ + __le16 nxd; /* 2: number of extents */ + pxd_t pxd; /* 8: pxd */ + } updatemap; /* (20) */ + + /* + * NOREDOINOEXT: the inode extent is freed + * + * do not apply after-image records which precede this + * record in the log with the any of the 4 page block + * numbers in this inode extent. + * + * NOTE: The fileset and pxd fields MUST remain in + * the same fields in the REDOPAGE record format. + * + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 iagnum; /* 4: IAG number */ + __le32 inoext_idx; /* 4: inode extent index */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredoinoext; /* (20) */ + + /* + * SYNCPT: log sync point + * + * replay log up to syncpt address specified; + */ + struct { + __le32 sync; /* 4: syncpt address (0 = here) */ + } syncpt; + + /* + * MOUNT: file system mount + * + * file system mount: no type-dependent information; + */ + + /* + * ? FREEXTENT: free specified extent(s) + * + * free specified extent(s) from block allocation map + * N.B.: nextents should be length of data/sizeof(xad_t) + */ + struct { + __le32 type; /* 4: FREEXTENT record type */ + __le32 nextent; /* 4: number of extents */ + + /* data: PXD or XAD list */ + } freextent; + + /* + * ? NOREDOFILE: this file is freed + * + * do not apply records which precede this record in the log + * with the same inode number. + * + * NOREDOFILE must be the first to be written at commit + * (last to be read in logredo()) - it prevents + * replay of preceding updates of all preceding generations + * of the inumber esp. the on-disk inode itself. + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + } noredofile; + + /* + * ? NEWPAGE: + * + * metadata type dependent + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le32 type; /* 4: NEWPAGE record type */ + pxd_t pxd; /* 8: on-disk page pxd */ + } newpage; + + /* + * ? DUMMY: filler + * + * no type-dependent information + */ + } log; +}; /* (36) */ + +#define LOGRDSIZE (sizeof(struct lrd)) + +/* + * line vector descriptor + */ +struct lvd { + __le16 offset; + __le16 length; +}; + + +/* + * log logical volume + */ +struct jfs_log { + + struct list_head sb_list;/* This is used to sync metadata + * before writing syncpt. 
+ */ + struct list_head journal_list; /* Global list */ + struct block_device *bdev; /* 4: log lv pointer */ + int serial; /* 4: log mount serial number */ + + s64 base; /* @8: log extent address (inline log ) */ + int size; /* 4: log size in log page (in page) */ + int l2bsize; /* 4: log2 of bsize */ + + unsigned long flag; /* 4: flag */ + + struct lbuf *lbuf_free; /* 4: free lbufs */ + wait_queue_head_t free_wait; /* 4: */ + + /* log write */ + int logtid; /* 4: log tid */ + int page; /* 4: page number of eol page */ + int eor; /* 4: eor of last record in eol page */ + struct lbuf *bp; /* 4: current log page buffer */ + + struct mutex loglock; /* 4: log write serialization lock */ + + /* syncpt */ + int nextsync; /* 4: bytes to write before next syncpt */ + int active; /* 4: */ + wait_queue_head_t syncwait; /* 4: */ + + /* commit */ + uint cflag; /* 4: */ + struct list_head cqueue; /* FIFO commit queue */ + struct tblock *flush_tblk; /* tblk we're waiting on for flush */ + int gcrtc; /* 4: GC_READY transaction count */ + struct tblock *gclrt; /* 4: latest GC_READY transaction */ + spinlock_t gclock; /* 4: group commit lock */ + int logsize; /* 4: log data area size in byte */ + int lsn; /* 4: end-of-log */ + int clsn; /* 4: clsn */ + int syncpt; /* 4: addr of last syncpt record */ + int sync; /* 4: addr from last logsync() */ + struct list_head synclist; /* 8: logsynclist anchor */ + spinlock_t synclock; /* 4: synclist lock */ + struct lbuf *wqueue; /* 4: log pageout queue */ + int count; /* 4: count */ + char uuid[16]; /* 16: 128-bit uuid of log device */ + + int no_integrity; /* 3: flag to disable journaling to disk */ +}; + +/* + * Log flag + */ +#define log_INLINELOG 1 +#define log_SYNCBARRIER 2 +#define log_QUIESCE 3 +#define log_FLUSH 4 + +/* + * group commit flag + */ +/* jfs_log */ +#define logGC_PAGEOUT 0x00000001 + +/* tblock/lbuf */ +#define tblkGC_QUEUE 0x0001 +#define tblkGC_READY 0x0002 +#define tblkGC_COMMIT 0x0004 +#define tblkGC_COMMITTED 0x0008 +#define tblkGC_EOP 0x0010 +#define tblkGC_FREE 0x0020 +#define tblkGC_LEADER 0x0040 +#define tblkGC_ERROR 0x0080 +#define tblkGC_LAZY 0x0100 // D230860 +#define tblkGC_UNLOCKED 0x0200 // D230860 + +/* + * log cache buffer header + */ +struct lbuf { + struct jfs_log *l_log; /* 4: log associated with buffer */ + + /* + * data buffer base area + */ + uint l_flag; /* 4: pageout control flags */ + + struct lbuf *l_wqnext; /* 4: write queue link */ + struct lbuf *l_freelist; /* 4: freelistlink */ + + int l_pn; /* 4: log page number */ + int l_eor; /* 4: log record eor */ + int l_ceor; /* 4: committed log record eor */ + + s64 l_blkno; /* 8: log page block number */ + caddr_t l_ldata; /* 4: data page */ + struct page *l_page; /* The page itself */ + uint l_offset; /* Offset of l_ldata within the page */ + + wait_queue_head_t l_ioevent; /* 4: i/o done event */ +}; + +/* Reuse l_freelist for redrive list */ +#define l_redrive_next l_freelist + +/* + * logsynclist block + * + * common logsyncblk prefix for jbuf_t and tblock + */ +struct logsyncblk { + u16 xflag; /* flags */ + u16 flag; /* only meaninful in tblock */ + lid_t lid; /* lock id */ + s32 lsn; /* log sequence number */ + struct list_head synclist; /* log sync list link */ +}; + +/* + * logsynclist serialization (per log) + */ + +#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock) +#define LOGSYNC_LOCK(log, flags) spin_lock_irqsave(&(log)->synclock, flags) +#define LOGSYNC_UNLOCK(log, flags) \ + spin_unlock_irqrestore(&(log)->synclock, flags) + +/* compute the 
difference in bytes of lsn from sync point */ +#define logdiff(diff, lsn, log)\ +{\ + diff = (lsn) - (log)->syncpt;\ + if (diff < 0)\ + diff += (log)->logsize;\ +} + +extern int lmLogOpen(struct super_block *sb); +extern int lmLogClose(struct super_block *sb); +extern int lmLogShutdown(struct jfs_log * log); +extern int lmLogInit(struct jfs_log * log); +extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); +extern int lmGroupCommit(struct jfs_log *, struct tblock *); +extern int jfsIOWait(void *); +extern void jfs_flush_journal(struct jfs_log * log, int wait); +extern void jfs_syncpt(struct jfs_log *log, int hard_sync); + +#endif /* _H_JFS_LOGMGR */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c new file mode 100644 index 00000000..6740d34c --- /dev/null +++ b/fs/jfs/jfs_metapage.c @@ -0,0 +1,843 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint pagealloc; /* # of page allocations */ + uint pagefree; /* # of page frees */ + uint lockwait; /* # of sleeping lock_metapage() calls */ +} mpStat; +#endif + +#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) +#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag) + +static inline void unlock_metapage(struct metapage *mp) +{ + clear_bit_unlock(META_locked, &mp->flag); + wake_up(&mp->wait); +} + +static inline void __lock_metapage(struct metapage *mp) +{ + DECLARE_WAITQUEUE(wait, current); + INCREMENT(mpStat.lockwait); + add_wait_queue_exclusive(&mp->wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (metapage_locked(mp)) { + unlock_page(mp->page); + io_schedule(); + lock_page(mp->page); + } + } while (trylock_metapage(mp)); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&mp->wait, &wait); +} + +/* + * Must have mp->page locked + */ +static inline void lock_metapage(struct metapage *mp) +{ + if (trylock_metapage(mp)) + __lock_metapage(mp); +} + +#define METAPOOL_MIN_PAGES 32 +static struct kmem_cache *metapage_cache; +static mempool_t *metapage_mempool; + +#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE) + +#if MPS_PER_PAGE > 1 + +struct meta_anchor { + int mp_count; + atomic_t io_count; + struct metapage *mp[MPS_PER_PAGE]; +}; +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) + +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + if (!PagePrivate(page)) + return NULL; + return mp_anchor(page)->mp[offset >> L2PSIZE]; 
+} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a; + int index; + int l2mp_blocks; /* log2 blocks per metapage */ + + if (PagePrivate(page)) + a = mp_anchor(page); + else { + a = kzalloc(sizeof(struct meta_anchor), GFP_NOFS); + if (!a) + return -ENOMEM; + set_page_private(page, (unsigned long)a); + SetPagePrivate(page); + kmap(page); + } + + if (mp) { + l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + a->mp_count++; + a->mp[index] = mp; + } + + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a = mp_anchor(page); + int l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + int index; + + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + + BUG_ON(a->mp[index] != mp); + + a->mp[index] = NULL; + if (--a->mp_count == 0) { + kfree(a); + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); + } +} + +static inline void inc_io(struct page *page) +{ + atomic_inc(&mp_anchor(page)->io_count); +} + +static inline void dec_io(struct page *page, void (*handler) (struct page *)) +{ + if (atomic_dec_and_test(&mp_anchor(page)->io_count)) + handler(page); +} + +#else +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; +} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + if (mp) { + set_page_private(page, (unsigned long)mp); + SetPagePrivate(page); + kmap(page); + } + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); +} + +#define inc_io(page) do {} while(0) +#define dec_io(page, handler) handler(page) + +#endif + +static void init_once(void *foo) +{ + struct metapage *mp = (struct metapage *)foo; + + mp->lid = 0; + mp->lsn = 0; + mp->flag = 0; + mp->data = NULL; + mp->clsn = 0; + mp->log = NULL; + set_bit(META_free, &mp->flag); + init_waitqueue_head(&mp->wait); +} + +static inline struct metapage *alloc_metapage(gfp_t gfp_mask) +{ + return mempool_alloc(metapage_mempool, gfp_mask); +} + +static inline void free_metapage(struct metapage *mp) +{ + mp->flag = 0; + set_bit(META_free, &mp->flag); + + mempool_free(mp, metapage_mempool); +} + +int __init metapage_init(void) +{ + /* + * Allocate the metapage structures + */ + metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), + 0, 0, init_once); + if (metapage_cache == NULL) + return -ENOMEM; + + metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES, + metapage_cache); + + if (metapage_mempool == NULL) { + kmem_cache_destroy(metapage_cache); + return -ENOMEM; + } + + return 0; +} + +void metapage_exit(void) +{ + mempool_destroy(metapage_mempool); + kmem_cache_destroy(metapage_cache); +} + +static inline void drop_metapage(struct page *page, struct metapage *mp) +{ + if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag) || + test_bit(META_io, &mp->flag)) + return; + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); +} + +/* + * Metapage address space operations + */ + +static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, + int *len) +{ + int rc = 0; + int xflag; + s64 xaddr; + sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_blkbits; + + if (lblock >= file_blocks) + return 0; + if 
(lblock + *len > file_blocks) + *len = file_blocks - lblock; + + if (inode->i_ino) { + rc = xtLookup(inode, (s64)lblock, *len, &xflag, &xaddr, len, 0); + if ((rc == 0) && *len) + lblock = (sector_t)xaddr; + else + lblock = 0; + } /* else no mapping */ + + return lblock; +} + +static void last_read_complete(struct page *page) +{ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); +} + +static void metapage_read_end_io(struct bio *bio, int err) +{ + struct page *page = bio->bi_private; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "metapage_read_end_io: I/O error\n"); + SetPageError(page); + } + + dec_io(page, last_read_complete); + bio_put(bio); +} + +static void remove_from_logsync(struct metapage *mp) +{ + struct jfs_log *log = mp->log; + unsigned long flags; +/* + * This can race. Recheck that log hasn't been set to null, and after + * acquiring logsync lock, recheck lsn + */ + if (!log) + return; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn) { + mp->log = NULL; + mp->lsn = 0; + mp->clsn = 0; + log->count--; + list_del(&mp->synclist); + } + LOGSYNC_UNLOCK(log, flags); +} + +static void last_write_complete(struct page *page) +{ + struct metapage *mp; + unsigned int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (mp && test_bit(META_io, &mp->flag)) { + if (mp->lsn) + remove_from_logsync(mp); + clear_bit(META_io, &mp->flag); + } + /* + * I'd like to call drop_metapage here, but I don't think it's + * safe unless I have the page locked + */ + } + end_page_writeback(page); +} + +static void metapage_write_end_io(struct bio *bio, int err) +{ + struct page *page = bio->bi_private; + + BUG_ON(!PagePrivate(page)); + + if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "metapage_write_end_io: I/O error\n"); + SetPageError(page); + } + dec_io(page, last_write_complete); + bio_put(bio); +} + +static int metapage_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio = NULL; + int block_offset; /* block offset of mp within page */ + struct inode *inode = page->mapping->host; + int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; + int len; + int xlen; + struct metapage *mp; + int redirty = 0; + sector_t lblock; + int nr_underway = 0; + sector_t pblock; + sector_t next_block = 0; + sector_t page_start; + unsigned long bio_bytes = 0; + unsigned long bio_offset = 0; + int offset; + int bad_blocks = 0; + + page_start = (sector_t)page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp || !test_bit(META_dirty, &mp->flag)) + continue; + + if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) { + redirty = 1; + /* + * Make sure this page isn't blocked indefinitely. 
+ * If the journal isn't undergoing I/O, push it + */ + if (mp->log && !(mp->log->cflag & logGC_PAGEOUT)) + jfs_flush_journal(mp->log, 0); + continue; + } + + clear_bit(META_dirty, &mp->flag); + set_bit(META_io, &mp->flag); + block_offset = offset >> inode->i_blkbits; + lblock = page_start + block_offset; + if (bio) { + if (xlen && lblock == next_block) { + /* Contiguous, in memory & on disk */ + len = min(xlen, blocks_per_mp); + xlen -= len; + bio_bytes += len << inode->i_blkbits; + continue; + } + /* Not contiguous */ + if (bio_add_page(bio, page, bio_bytes, bio_offset) < + bio_bytes) + goto add_failed; + /* + * Increment counter before submitting i/o to keep + * count from hitting zero before we're through + */ + inc_io(page); + if (!bio->bi_size) + goto dump_bio; + submit_bio(WRITE, bio); + nr_underway++; + bio = NULL; + } else + inc_io(page); + xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; + pblock = metapage_get_blocks(inode, lblock, &xlen); + if (!pblock) { + printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); + /* + * We already called inc_io(), but can't cancel it + * with dec_io() until we're done with the page + */ + bad_blocks++; + continue; + } + len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_write_end_io; + bio->bi_private = page; + + /* Don't call bio_add_page yet, we may add to this vec */ + bio_offset = offset; + bio_bytes = len << inode->i_blkbits; + + xlen -= len; + next_block = lblock + len; + } + if (bio) { + if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) + goto add_failed; + if (!bio->bi_size) + goto dump_bio; + + submit_bio(WRITE, bio); + nr_underway++; + } + if (redirty) + redirty_page_for_writepage(wbc, page); + + unlock_page(page); + + if (bad_blocks) + goto err_out; + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +add_failed: + /* We should never reach here, since we're only adding one vec */ + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + goto skip; +dump_bio: + print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, + 4, bio, sizeof(*bio), 0); +skip: + bio_put(bio); + unlock_page(page); + dec_io(page, last_write_complete); +err_out: + while (bad_blocks--) + dec_io(page, last_write_complete); + return -EIO; +} + +static int metapage_readpage(struct file *fp, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct bio *bio = NULL; + int block_offset; + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + sector_t page_start; /* address of page in fs blocks */ + sector_t pblock; + int xlen; + unsigned int len; + int offset; + + BUG_ON(!PageLocked(page)); + page_start = (sector_t)page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + block_offset = 0; + while (block_offset < blocks_per_page) { + xlen = blocks_per_page - block_offset; + pblock = metapage_get_blocks(inode, page_start + block_offset, + &xlen); + if (pblock) { + if (!PagePrivate(page)) + insert_metapage(page, NULL); + inc_io(page); + if (bio) + submit_bio(READ, bio); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_read_end_io; + bio->bi_private = page; + len = xlen << inode->i_blkbits; + offset = block_offset << inode->i_blkbits; + if (bio_add_page(bio, page, len, offset) < len) + goto add_failed; + block_offset += xlen; + } else 
+ block_offset++; + } + if (bio) + submit_bio(READ, bio); + else + unlock_page(page); + + return 0; + +add_failed: + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + bio_put(bio); + dec_io(page, last_read_complete); + return -EIO; +} + +static int metapage_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct metapage *mp; + int ret = 1; + int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp) + continue; + + jfs_info("metapage_releasepage: mp = 0x%p", mp); + if (mp->count || mp->nohomeok || + test_bit(META_dirty, &mp->flag)) { + jfs_info("count = %ld, nohomeok = %d", mp->count, + mp->nohomeok); + ret = 0; + continue; + } + if (mp->lsn) + remove_from_logsync(mp); + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); + } + return ret; +} + +static void metapage_invalidatepage(struct page *page, unsigned long offset) +{ + BUG_ON(offset); + + BUG_ON(PageWriteback(page)); + + metapage_releasepage(page, 0); +} + +const struct address_space_operations jfs_metapage_aops = { + .readpage = metapage_readpage, + .writepage = metapage_writepage, + .releasepage = metapage_releasepage, + .invalidatepage = metapage_invalidatepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, + unsigned int size, int absolute, + unsigned long new) +{ + int l2BlocksPerPage; + int l2bsize; + struct address_space *mapping; + struct metapage *mp = NULL; + struct page *page; + unsigned long page_index; + unsigned long page_offset; + + jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", + inode->i_ino, lblock, absolute); + + l2bsize = inode->i_blkbits; + l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; + page_index = lblock >> l2BlocksPerPage; + page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize; + if ((page_offset + size) > PAGE_CACHE_SIZE) { + jfs_err("MetaData crosses page boundary!!"); + jfs_err("lblock = %lx, size = %d", lblock, size); + dump_stack(); + return NULL; + } + if (absolute) + mapping = JFS_SBI(inode->i_sb)->direct_inode->i_mapping; + else { + /* + * If an nfs client tries to read an inode that is larger + * than any existing inodes, we may try to read past the + * end of the inode map + */ + if ((lblock << inode->i_blkbits) >= inode->i_size) + return NULL; + mapping = inode->i_mapping; + } + + if (new && (PSIZE == PAGE_CACHE_SIZE)) { + page = grab_cache_page(mapping, page_index); + if (!page) { + jfs_err("grab_cache_page failed!"); + return NULL; + } + SetPageUptodate(page); + } else { + page = read_mapping_page(mapping, page_index, NULL); + if (IS_ERR(page) || !PageUptodate(page)) { + jfs_err("read_mapping_page failed!"); + return NULL; + } + lock_page(page); + } + + mp = page_to_mp(page, page_offset); + if (mp) { + if (mp->logical_size != size) { + jfs_error(inode->i_sb, + "__get_metapage: mp->logical_size != size"); + jfs_err("logical_size = %d, size = %d", + mp->logical_size, size); + dump_stack(); + goto unlock; + } + mp->count++; + lock_metapage(mp); + if (test_bit(META_discard, &mp->flag)) { + if (!new) { + jfs_error(inode->i_sb, + "__get_metapage: using a " + "discarded metapage"); + discard_metapage(mp); + goto unlock; + } + clear_bit(META_discard, &mp->flag); + } + } else { + INCREMENT(mpStat.pagealloc); + mp = alloc_metapage(GFP_NOFS); + mp->page = page; + mp->flag = 0; + mp->xflag = COMMIT_PAGE; + mp->count = 1; + mp->nohomeok = 0; + mp->logical_size = size; + mp->data = 
page_address(page) + page_offset; + mp->index = lblock; + if (unlikely(insert_metapage(page, mp))) { + free_metapage(mp); + goto unlock; + } + lock_metapage(mp); + } + + if (new) { + jfs_info("zeroing mp = 0x%p", mp); + memset(mp->data, 0, PSIZE); + } + + unlock_page(page); + jfs_info("__get_metapage: returning = 0x%p data = 0x%p", mp, mp->data); + return mp; + +unlock: + unlock_page(page); + return NULL; +} + +void grab_metapage(struct metapage * mp) +{ + jfs_info("grab_metapage: mp = 0x%p", mp); + page_cache_get(mp->page); + lock_page(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); +} + +void force_metapage(struct metapage *mp) +{ + struct page *page = mp->page; + jfs_info("force_metapage: mp = 0x%p", mp); + set_bit(META_forcewrite, &mp->flag); + clear_bit(META_sync, &mp->flag); + page_cache_get(page); + lock_page(page); + set_page_dirty(page); + write_one_page(page, 1); + clear_bit(META_forcewrite, &mp->flag); + page_cache_release(page); +} + +void hold_metapage(struct metapage *mp) +{ + lock_page(mp->page); +} + +void put_metapage(struct metapage *mp) +{ + if (mp->count || mp->nohomeok) { + /* Someone else will release this */ + unlock_page(mp->page); + return; + } + page_cache_get(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); + release_metapage(mp); +} + +void release_metapage(struct metapage * mp) +{ + struct page *page = mp->page; + jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag); + + BUG_ON(!page); + + lock_page(page); + unlock_metapage(mp); + + assert(mp->count); + if (--mp->count || mp->nohomeok) { + unlock_page(page); + page_cache_release(page); + return; + } + + if (test_bit(META_dirty, &mp->flag)) { + set_page_dirty(page); + if (test_bit(META_sync, &mp->flag)) { + clear_bit(META_sync, &mp->flag); + write_one_page(page, 1); + lock_page(page); /* write_one_page unlocks the page */ + } + } else if (mp->lsn) /* discard_metapage doesn't remove it */ + remove_from_logsync(mp); + + /* Try to keep metapages from using up too much memory */ + drop_metapage(page, mp); + + unlock_page(page); + page_cache_release(page); +} + +void __invalidate_metapages(struct inode *ip, s64 addr, int len) +{ + sector_t lblock; + int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits; + int BlocksPerPage = 1 << l2BlocksPerPage; + /* All callers are interested in block device's mapping */ + struct address_space *mapping = + JFS_SBI(ip->i_sb)->direct_inode->i_mapping; + struct metapage *mp; + struct page *page; + unsigned int offset; + + /* + * Mark metapages to discard. They will eventually be + * released, but should not be written. 
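The loop that follows rounds addr down to a page boundary and visits every page overlapping [addr, addr + len), discarding only the metapages whose index falls inside the range. A worked standalone example of that index arithmetic, assuming 4K pages and 1K filesystem blocks purely for illustration:

#include <stdio.h>

int main(void)
{
	int l2BlocksPerPage = 2;		/* 4 fs blocks per 4K page */
	long long BlocksPerPage = 1 << l2BlocksPerPage;
	long long addr = 5, len = 6;		/* invalidate blocks 5..10 */
	long long lblock;

	for (lblock = addr & ~(BlocksPerPage - 1);
	     lblock < addr + len; lblock += BlocksPerPage)
		printf("page index %lld covers blocks %lld..%lld\n",
		       lblock >> l2BlocksPerPage, lblock,
		       lblock + BlocksPerPage - 1);
	/*
	 * page index 1 covers blocks 4..7
	 * page index 2 covers blocks 8..11
	 * only metapages whose index lies in [5, 11) are discarded
	 */
	return 0;
}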
+ */ + for (lblock = addr & ~(BlocksPerPage - 1); lblock < addr + len; + lblock += BlocksPerPage) { + page = find_lock_page(mapping, lblock >> l2BlocksPerPage); + if (!page) + continue; + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (!mp) + continue; + if (mp->index < addr) + continue; + if (mp->index >= addr + len) + break; + + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + if (mp->lsn) + remove_from_logsync(mp); + } + unlock_page(page); + page_cache_release(page); + } +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_mpstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Metapage statistics\n" + "=======================\n" + "page allocations = %d\n" + "page frees = %d\n" + "lock waits = %d\n", + mpStat.pagealloc, + mpStat.pagefree, + mpStat.lockwait); + return 0; +} + +static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_mpstat_proc_show, NULL); +} + +const struct file_operations jfs_mpstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_mpstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h new file mode 100644 index 00000000..a78beda8 --- /dev/null +++ b/fs/jfs/jfs_metapage.h @@ -0,0 +1,155 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_METAPAGE +#define _H_JFS_METAPAGE + +#include + +struct metapage { + /* Common logsyncblk prefix (see jfs_logmgr.h) */ + u16 xflag; + u16 unused; + lid_t lid; + int lsn; + struct list_head synclist; + /* End of logsyncblk prefix */ + + unsigned long flag; /* See Below */ + unsigned long count; /* Reference count */ + void *data; /* Data pointer */ + sector_t index; /* block address of page */ + wait_queue_head_t wait; + + /* implementation */ + struct page *page; + unsigned int logical_size; + + /* Journal management */ + int clsn; + int nohomeok; + struct jfs_log *log; +}; + +/* metapage flag */ +#define META_locked 0 +#define META_free 1 +#define META_dirty 2 +#define META_sync 3 +#define META_discard 4 +#define META_forcewrite 5 +#define META_io 6 + +#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) + +/* function prototypes */ +extern int metapage_init(void); +extern void metapage_exit(void); +extern struct metapage *__get_metapage(struct inode *inode, + unsigned long lblock, unsigned int size, + int absolute, unsigned long new); + +#define read_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, false) + +#define get_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, true) + +extern void release_metapage(struct metapage *); +extern void grab_metapage(struct metapage *); +extern void force_metapage(struct metapage *); + +/* + * hold_metapage and put_metapage are used in conjunction. The page lock + * is not dropped between the two, so no other threads can get or release + * the metapage + */ +extern void hold_metapage(struct metapage *); +extern void put_metapage(struct metapage *); + +static inline void write_metapage(struct metapage *mp) +{ + set_bit(META_dirty, &mp->flag); + release_metapage(mp); +} + +static inline void flush_metapage(struct metapage *mp) +{ + set_bit(META_sync, &mp->flag); + write_metapage(mp); +} + +static inline void discard_metapage(struct metapage *mp) +{ + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + release_metapage(mp); +} + +static inline void metapage_nohomeok(struct metapage *mp) +{ + struct page *page = mp->page; + lock_page(page); + if (!mp->nohomeok++) { + mark_metapage_dirty(mp); + page_cache_get(page); + wait_on_page_writeback(page); + } + unlock_page(page); +} + +/* + * This serializes access to mp->lsn when metapages are added to logsynclist + * without setting nohomeok. i.e. updating imap & dmap + */ +static inline void metapage_wait_for_io(struct metapage *mp) +{ + if (test_bit(META_io, &mp->flag)) + wait_on_page_writeback(mp->page); +} + +/* + * This is called when already holding the metapage + */ +static inline void _metapage_homeok(struct metapage *mp) +{ + if (!--mp->nohomeok) + page_cache_release(mp->page); +} + +static inline void metapage_homeok(struct metapage *mp) +{ + hold_metapage(mp); + _metapage_homeok(mp); + put_metapage(mp); +} + +extern const struct address_space_operations jfs_metapage_aops; + +/* + * This routines invalidate all pages for an extent. 
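Looking back at the metapage_nohomeok()/_metapage_homeok() pair defined just above: the first pin takes an extra page-cache reference and only the last release drops it. A minimal standalone model of that pairing; the counter and reference fields are illustrative stand-ins, not the kernel structures:

#include <assert.h>
#include <stdio.h>

struct mp_model {
	int nohomeok;		/* pin count */
	int page_refs;		/* stands in for the page cache refcount */
};

static void model_nohomeok(struct mp_model *mp)
{
	if (!mp->nohomeok++)
		mp->page_refs++;	/* page_cache_get() on the 0 -> 1 pin */
}

static void model_homeok(struct mp_model *mp)
{
	assert(mp->nohomeok > 0);
	if (!--mp->nohomeok)
		mp->page_refs--;	/* page_cache_release() on the last release */
}

int main(void)
{
	struct mp_model mp = { 0, 1 };	/* one baseline page reference */

	model_nohomeok(&mp);		/* refs: 2 */
	model_nohomeok(&mp);		/* refs: still 2 */
	model_homeok(&mp);		/* refs: still 2 */
	model_homeok(&mp);		/* refs: back to 1 */
	printf("nohomeok=%d page_refs=%d\n", mp.nohomeok, mp.page_refs);
	return 0;
}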
+ */ +extern void __invalidate_metapages(struct inode *, s64, int); +#define invalidate_pxd_metapages(ip, pxd) \ + __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd))) +#define invalidate_dxd_metapages(ip, dxd) \ + __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd))) +#define invalidate_xad_metapages(ip, xad) \ + __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad))) + +#endif /* _H_JFS_METAPAGE */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c new file mode 100644 index 00000000..9895595f --- /dev/null +++ b/fs/jfs/jfs_mount.c @@ -0,0 +1,507 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Module: jfs_mount.c + * + * note: file system in transition to aggregate/fileset: + * + * file system mount is interpreted as the mount of aggregate, + * if not already mounted, and mount of the single/only fileset in + * the aggregate; + * + * a file system/aggregate is represented by an internal inode + * (aka mount inode) initialized with aggregate superblock; + * each vfs represents a fileset, and points to its "fileset inode + * allocation map inode" (aka fileset inode): + * (an aggregate itself is structured recursively as a filset: + * an internal vfs is constructed and points to its "fileset inode + * allocation map inode" (aka aggregate inode) where each inode + * represents a fileset inode) so that inode number is mapped to + * on-disk inode in uniform way at both aggregate and fileset level; + * + * each vnode/inode of a fileset is linked to its vfs (to facilitate + * per fileset inode operations, e.g., unmount of a fileset, etc.); + * each inode points to the mount inode (to facilitate access to + * per aggregate information, e.g., block size, etc.) as well as + * its file set inode. + * + * aggregate + * ipmnt + * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; + * fileset vfs -> vp(1) <-> ... 
<-> vp(n) <->vproot; + */ + +#include +#include + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + + +/* + * forward references + */ +static int chkSuper(struct super_block *); +static int logMOUNT(struct super_block *sb); + +/* + * NAME: jfs_mount(sb) + * + * FUNCTION: vfs_mount() + * + * PARAMETER: sb - super block + * + * RETURN: -EBUSY - device already mounted or open for write + * -EBUSY - cvrdvp already mounted; + * -EBUSY - mount table full + * -ENOTDIR- cvrdvp not directory on a device mount + * -ENXIO - device open failure + */ +int jfs_mount(struct super_block *sb) +{ + int rc = 0; /* Return code */ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipaimap = NULL; + struct inode *ipaimap2 = NULL; + struct inode *ipimap = NULL; + struct inode *ipbmap = NULL; + + /* + * read/validate superblock + * (initialize mount inode from the superblock) + */ + if ((rc = chkSuper(sb))) { + goto errout20; + } + + ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); + if (ipaimap == NULL) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto errout20; + } + sbi->ipaimap = ipaimap; + + jfs_info("jfs_mount: ipaimap:0x%p", ipaimap); + + /* + * initialize aggregate inode allocation map + */ + if ((rc = diMount(ipaimap))) { + jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); + goto errout21; + } + + /* + * open aggregate block allocation map + */ + ipbmap = diReadSpecial(sb, BMAP_I, 0); + if (ipbmap == NULL) { + rc = -EIO; + goto errout22; + } + + jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); + + sbi->ipbmap = ipbmap; + + /* + * initialize aggregate block allocation map + */ + if ((rc = dbMount(ipbmap))) { + jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); + goto errout22; + } + + /* + * open the secondary aggregate inode allocation map + * + * This is a duplicate of the aggregate inode allocation map. + * + * hand craft a vfs in the same fashion as we did to read ipaimap. + * By adding INOSPEREXT (32) to the inode number, we are telling + * diReadSpecial that we are reading from the secondary aggregate + * inode table. This also creates a unique entry in the inode hash + * table. 
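The errout labels that jfs_mount() uses further down follow the usual unwind pattern: each acquired resource gets a label, and a failure jumps to the label that releases everything acquired so far, in reverse order. A standalone sketch of that pattern with illustrative resources (the real function releases inode maps, not heap buffers):

#include <stdio.h>
#include <stdlib.h>

static int step_fails(int which, int fail_at)
{
	return which == fail_at;	/* simulate a failure at one step */
}

static int mount_like(int fail_at)
{
	int rc = 0;
	char *a = NULL, *b = NULL;

	a = malloc(16);			/* resource 1 (think: ipaimap) */
	if (!a || step_fails(1, fail_at)) {
		rc = -1;
		goto errout1;
	}

	b = malloc(16);			/* resource 2 (think: ipbmap) */
	if (!b || step_fails(2, fail_at)) {
		rc = -2;
		goto errout2;
	}

	printf("mounted\n");
	free(b);
	free(a);
	return 0;

errout2:
	free(b);			/* undo resource 2 */
errout1:
	free(a);			/* undo resource 1 */
	printf("unwound, rc=%d\n", rc);
	return rc;
}

int main(void)
{
	mount_like(0);	/* success path */
	mount_like(2);	/* fail at step 2: releases b, then a */
	return 0;
}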
+ */ + if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { + ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); + if (!ipaimap2) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto errout35; + } + sbi->ipaimap2 = ipaimap2; + + jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2); + + /* + * initialize secondary aggregate inode allocation map + */ + if ((rc = diMount(ipaimap2))) { + jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", + rc); + goto errout35; + } + } else + /* Secondary aggregate inode table is not valid */ + sbi->ipaimap2 = NULL; + + /* + * mount (the only/single) fileset + */ + /* + * open fileset inode allocation map (aka fileset inode) + */ + ipimap = diReadSpecial(sb, FILESYSTEM_I, 0); + if (ipimap == NULL) { + jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); + /* open fileset secondary inode allocation map */ + rc = -EIO; + goto errout40; + } + jfs_info("jfs_mount: ipimap:0x%p", ipimap); + + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + + /* initialize fileset inode allocation map */ + if ((rc = diMount(ipimap))) { + jfs_err("jfs_mount: diMount failed w/rc = %d", rc); + goto errout41; + } + + goto out; + + /* + * unwind on error + */ + errout41: /* close fileset inode allocation map inode */ + diFreeSpecial(ipimap); + + errout40: /* fileset closed */ + + /* close secondary aggregate inode allocation map */ + if (ipaimap2) { + diUnmount(ipaimap2, 1); + diFreeSpecial(ipaimap2); + } + + errout35: + + /* close aggregate block allocation map */ + dbUnmount(ipbmap, 1); + diFreeSpecial(ipbmap); + + errout22: /* close aggregate inode allocation map */ + + diUnmount(ipaimap, 1); + + errout21: /* close aggregate inodes */ + diFreeSpecial(ipaimap); + errout20: /* aggregate closed */ + + out: + + if (rc) + jfs_err("Mount JFS Failure: %d", rc); + + return rc; +} + +/* + * NAME: jfs_mount_rw(sb, remount) + * + * FUNCTION: Completes read-write mount, or remounts read-only volume + * as read-write + */ +int jfs_mount_rw(struct super_block *sb, int remount) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + /* + * If we are re-mounting a previously read-only volume, we want to + * re-read the inode and block maps, since fsck.jfs may have updated + * them. + */ + if (remount) { + if (chkSuper(sb) || (sbi->state != FM_CLEAN)) + return -EINVAL; + + truncate_inode_pages(sbi->ipimap->i_mapping, 0); + truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + diUnmount(sbi->ipimap, 1); + if ((rc = diMount(sbi->ipimap))) { + jfs_err("jfs_mount_rw: diMount failed!"); + return rc; + } + + dbUnmount(sbi->ipbmap, 1); + if ((rc = dbMount(sbi->ipbmap))) { + jfs_err("jfs_mount_rw: dbMount failed!"); + return rc; + } + } + + /* + * open/initialize log + */ + if ((rc = lmLogOpen(sb))) + return rc; + + /* + * update file system superblock; + */ + if ((rc = updateSuper(sb, FM_MOUNT))) { + jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc); + lmLogClose(sb); + return rc; + } + + /* + * write MOUNT log record of the file system + */ + logMOUNT(sb); + + return rc; +} + +/* + * chkSuper() + * + * validate the superblock of the file system to be mounted and + * get the file system parameters. 
+ * + * returns + * 0 with fragsize set if check successful + * error code if not successful + */ +static int chkSuper(struct super_block *sb) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_superblock *j_sb; + struct buffer_head *bh; + int AIM_bytesize, AIT_bytesize; + int expected_AIM_bytesize, expected_AIT_bytesize; + s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr; + s64 byte_addr_diff0, byte_addr_diff1; + s32 bsize; + + if ((rc = readSuper(sb, &bh))) + return rc; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* + * validate superblock + */ + /* validate fs signature */ + if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) || + le32_to_cpu(j_sb->s_version) > JFS_VERSION) { + rc = -EINVAL; + goto out; + } + + bsize = le32_to_cpu(j_sb->s_bsize); +#ifdef _JFS_4K + if (bsize != PSIZE) { + jfs_err("Currently only 4K block size supported!"); + rc = -EINVAL; + goto out; + } +#endif /* _JFS_4K */ + + jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", + le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), + (unsigned long long) le64_to_cpu(j_sb->s_size)); + + /* validate the descriptors for Secondary AIM and AIT */ + if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != + cpu_to_le32(JFS_BAD_SAIT)) { + expected_AIM_bytesize = 2 * PSIZE; + AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + expected_AIT_bytesize = 4 * PSIZE; + AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; + AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; + AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; + fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; + if ((AIM_bytesize != expected_AIM_bytesize) || + (AIT_bytesize != expected_AIT_bytesize) || + (byte_addr_diff0 != AIM_bytesize) || + (byte_addr_diff1 <= AIT_bytesize)) + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + } + + if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) != + cpu_to_le32(JFS_GROUPCOMMIT)) + j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT); + + /* validate fs state */ + if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && + !(sb->s_flags & MS_RDONLY)) { + jfs_err("jfs_mount: Mount Failure: File System Dirty."); + rc = -EINVAL; + goto out; + } + + sbi->state = le32_to_cpu(j_sb->s_state); + sbi->mntflag = le32_to_cpu(j_sb->s_flag); + + /* + * JFS always does I/O by 4K pages. Don't tell the buffer cache + * that we use anything else (leave s_blocksize alone). + */ + sbi->bsize = bsize; + sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + + /* + * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer + * cache. + */ + sbi->nbperpage = PSIZE >> sbi->l2bsize; + sbi->l2nbperpage = L2PSIZE - sbi->l2bsize; + sbi->l2niperblk = sbi->l2bsize - L2DISIZE; + if (sbi->mntflag & JFS_INLINELOG) + sbi->logpxd = j_sb->s_logpxd; + else { + sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev)); + memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid)); + memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid)); + } + sbi->fsckpxd = j_sb->s_fsckpxd; + sbi->ait2 = j_sb->s_ait2; + + out: + brelse(bh); + return rc; +} + + +/* + * updateSuper() + * + * update synchronously superblock if it is mounted read-write. 
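updateSuper() below special-cases the nointegrity mount flag: a dirty request is only remembered in p_state, a mount writes FM_DIRTY to disk (so a crash while running without a journal is still detectable), and the clean transition writes back whatever was stashed. A minimal standalone model of just that branch; the enum values and the harness are illustrative:

#include <stdio.h>

enum { FM_CLEAN, FM_MOUNT, FM_DIRTY };	/* illustrative values */

struct sb_model {
	int nointegrity;	/* JFS_NOINTEGRITY mount option */
	int state;		/* state currently recorded on disk */
	int p_state;		/* remembered state (nointegrity only) */
};

/*
 * Returns the state that would be written to the superblock, or -1 if
 * updateSuper() would return without touching the disk.  A sketch of
 * the branch logic below, not the kernel code itself.
 */
static int state_to_write(struct sb_model *sb, int requested)
{
	if (sb->nointegrity) {
		if (requested == FM_DIRTY) {
			sb->p_state = requested;
			return -1;		/* only remembered */
		}
		if (requested == FM_MOUNT) {
			sb->p_state = sb->state;
			return FM_DIRTY;	/* disk marked dirty while mounted */
		}
		if (requested == FM_CLEAN)
			return sb->p_state;	/* write back the stashed state */
		return -1;			/* bad state */
	}
	if (sb->state == FM_DIRTY)
		return -1;	/* on-disk state already FM_DIRTY: leave it alone */
	return requested;
}

int main(void)
{
	struct sb_model sb = { 1, FM_CLEAN, FM_CLEAN };

	printf("%d\n", state_to_write(&sb, FM_MOUNT));	/* 2: FM_DIRTY written */
	printf("%d\n", state_to_write(&sb, FM_DIRTY));	/* -1: nothing written */
	printf("%d\n", state_to_write(&sb, FM_CLEAN));	/* 2: remembered dirty state written */
	return 0;
}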
+ */ +int updateSuper(struct super_block *sb, uint state) +{ + struct jfs_superblock *j_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct buffer_head *bh; + int rc; + + if (sbi->flag & JFS_NOINTEGRITY) { + if (state == FM_DIRTY) { + sbi->p_state = state; + return 0; + } else if (state == FM_MOUNT) { + sbi->p_state = sbi->state; + state = FM_DIRTY; + } else if (state == FM_CLEAN) { + state = sbi->p_state; + } else + jfs_err("updateSuper: bad state"); + } else if (sbi->state == FM_DIRTY) + return 0; + + if ((rc = readSuper(sb, &bh))) + return rc; + + j_sb = (struct jfs_superblock *)bh->b_data; + + j_sb->s_state = cpu_to_le32(state); + sbi->state = state; + + if (state == FM_MOUNT) { + /* record log's dev_t and mount serial number */ + j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logserial = cpu_to_le32(sbi->log->serial); + } else if (state == FM_CLEAN) { + /* + * If this volume is shared with OS/2, OS/2 will need to + * recalculate DASD usage, since we don't deal with it. + */ + if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED)) + j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME); + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} + + +/* + * readSuper() + * + * read superblock by raw sector address + */ +int readSuper(struct super_block *sb, struct buffer_head **bpp) +{ + /* read in primary superblock */ + *bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + /* read in secondary/replicated superblock */ + *bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + return -EIO; +} + + +/* + * logMOUNT() + * + * function: write a MOUNT log record for file system. + * + * MOUNT record keeps logredo() from processing log records + * for this file system past this point in log. + * it is harmless if mount fails. + * + * note: MOUNT record is at aggregate level, not at fileset level, + * since log records of previous mounts of a fileset + * (e.g., AFTER record of extent allocation) have to be processed + * to update block allocation map at aggregate level. + */ +static int logMOUNT(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + struct lrd lrd; + + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_MOUNT); + lrd.length = 0; + lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev)); + lmLog(log, NULL, &lrd, NULL); + + return 0; +} diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h new file mode 100644 index 00000000..884fc21a --- /dev/null +++ b/fs/jfs/jfs_superblock.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_SUPERBLOCK +#define _H_JFS_SUPERBLOCK + +/* + * make the magic number something a human could read + */ +#define JFS_MAGIC "JFS1" /* Magic word */ + +#define JFS_VERSION 2 /* Version number: Version 2 */ + +#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ + +/* + * aggregate superblock + * + * The name superblock is too close to super_block, so the name has been + * changed to jfs_superblock. The utilities are still using the old name. + */ +struct jfs_superblock { + char s_magic[4]; /* 4: magic number */ + __le32 s_version; /* 4: version number */ + + __le64 s_size; /* 8: aggregate size in hardware/LVM blocks; + * VFS: number of blocks + */ + __le32 s_bsize; /* 4: aggregate block size in bytes; + * VFS: fragment size + */ + __le16 s_l2bsize; /* 2: log2 of s_bsize */ + __le16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */ + __le32 s_pbsize; /* 4: hardware/LVM block size in bytes */ + __le16 s_l2pbsize; /* 2: log2 of s_pbsize */ + __le16 pad; /* 2: padding necessary for alignment */ + + __le32 s_agsize; /* 4: allocation group size in aggr. blocks */ + + __le32 s_flag; /* 4: aggregate attributes: + * see jfs_filsys.h + */ + __le32 s_state; /* 4: mount/unmount/recovery state: + * see jfs_filsys.h + */ + __le32 s_compress; /* 4: > 0 if data compression */ + + pxd_t s_ait2; /* 8: first extent of secondary + * aggregate inode table + */ + + pxd_t s_aim2; /* 8: first extent of secondary + * aggregate inode map + */ + __le32 s_logdev; /* 4: device address of log */ + __le32 s_logserial; /* 4: log serial number at aggregate mount */ + pxd_t s_logpxd; /* 8: inline log extent */ + + pxd_t s_fsckpxd; /* 8: inline fsck work space extent */ + + struct timestruc_t s_time; /* 8: time last updated */ + + __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for + * the fsck service log. + * N.B. These blocks are divided among the + * versions kept. This is not a per + * version size. + * N.B. These blocks are included in the + * length field of s_fsckpxd. + */ + s8 s_fscklog; /* 1: which fsck service log is most recent + * 0 => no service log data yet + * 1 => the first one + * 2 => the 2nd one + */ + char s_fpack[11]; /* 11: file system volume name + * N.B. 
This must be 11 bytes to + * conform with the OS/2 BootSector + * requirements + * Only used when s_version is 1 + */ + + /* extendfs() parameter under s_state & FM_EXTENDFS */ + __le64 s_xsize; /* 8: extendfs s_size */ + pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ + pxd_t s_xlogpxd; /* 8: extendfs logpxd */ + /* - 128 byte boundary - */ + + char s_uuid[16]; /* 16: 128-bit uuid for volume */ + char s_label[16]; /* 16: volume label */ + char s_loguuid[16]; /* 16: 128-bit uuid for log device */ + +}; + +extern int readSuper(struct super_block *, struct buffer_head **); +extern int updateSuper(struct super_block *, uint); +extern void jfs_error(struct super_block *, const char *, ...); +extern int jfs_mount(struct super_block *); +extern int jfs_mount_rw(struct super_block *, int); +extern int jfs_umount(struct super_block *); +extern int jfs_umount_rw(struct super_block *); +extern int jfs_extendfs(struct super_block *, s64, int); + +extern struct task_struct *jfsIOthread; +extern struct task_struct *jfsSyncThread; + +#endif /*_H_JFS_SUPERBLOCK */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c new file mode 100644 index 00000000..f6cc0c09 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.c @@ -0,0 +1,3101 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_txnmgr.c: transaction manager + * + * notes: + * transaction starts with txBegin() and ends with txCommit() + * or txAbort(). + * + * tlock is acquired at the time of update; + * (obviate scan at commit time for xtree and dtree) + * tlock and mp points to each other; + * (no hashlist for mp -> tlock). + * + * special cases: + * tlock on in-memory inode: + * in-place tlock in the in-memory inode itself; + * converted to page lock by iWrite() at commit time. + * + * tlock during write()/mmap() under anonymous transaction (tid = 0): + * transferred (?) to transaction at commit time. 
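+ *
+ * typical use of the interfaces below, shown only as a simplified
+ * sketch (no single caller follows exactly this shape):
+ *
+ *	tid = txBegin(ip->i_sb, 0);
+ *	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+ *	... modify the metapage under the tlock ...
+ *	rc = txCommit(tid, 1, &ip, 0);
+ *	txEnd(tid);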
+ *
+ *	use the page itself to update allocation maps
+ *	(obviate intermediate replication of allocation/deallocation data)
+ *	hold on to mp+lock thru update of maps
+ */
+
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kthread.h>
+#include <linux/seq_file.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dinode.h"
+#include "jfs_imap.h"
+#include "jfs_dmap.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * transaction management structures
+ */
+static struct {
+	int freetid;		/* index of a free tid structure */
+	int freelock;		/* index of the first free lock word */
+	wait_queue_head_t freewait;	/* eventlist of free tblock */
+	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
+	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
+	int tlocksInUse;	/* Number of tlocks in use */
+	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
+/*	struct tblock *sync_queue; * Transactions waiting for data sync */
+	struct list_head unlock_queue;	/* Txns waiting to be released */
+	struct list_head anon_list;	/* inodes having anonymous txns */
+	struct list_head anon_list2;	/* inodes having anonymous txns
+					   that couldn't be sync'ed */
+} TxAnchor;
+
+int jfs_tlocks_low;		/* Indicates low number of available tlocks */
+
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+	uint txBegin;
+	uint txBegin_barrier;
+	uint txBegin_lockslow;
+	uint txBegin_freetid;
+	uint txBeginAnon;
+	uint txBeginAnon_barrier;
+	uint txBeginAnon_lockslow;
+	uint txLockAlloc;
+	uint txLockAlloc_freelock;
+} TxStat;
+#endif
+
+static int nTxBlock = -1;	/* number of transaction blocks */
+module_param(nTxBlock, int, 0);
+MODULE_PARM_DESC(nTxBlock,
+		 "Number of transaction blocks (max:65536)");
+
+static int nTxLock = -1;	/* number of transaction locks */
+module_param(nTxLock, int, 0);
+MODULE_PARM_DESC(nTxLock,
+		 "Number of transaction locks (max:65536)");
+
+struct tblock *TxBlock;		/* transaction block table */
+static int TxLockLWM;		/* Low water mark for number of txLocks used */
+static int TxLockHWM;		/* High water mark for number of txLocks used */
+static int TxLockVHWM;		/* Very High water mark */
+struct tlock *TxLock;		/* transaction lock table */
+
+/*
+ * transaction management lock
+ */
+static DEFINE_SPINLOCK(jfsTxnLock);
+
+#define TXN_LOCK()		spin_lock(&jfsTxnLock)
+#define TXN_UNLOCK()		spin_unlock(&jfsTxnLock)
+
+#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
+#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
+#define LAZY_UNLOCK(flags)	spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
+
+static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
+static int jfs_commit_thread_waking;
+
+/*
+ * Retry logic exists outside these macros to protect from spurious wakeups.
+ */ +static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(event, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + TXN_UNLOCK(); + io_schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(event, &wait); +} + +#define TXN_SLEEP(event)\ +{\ + TXN_SLEEP_DROP_LOCK(event);\ + TXN_LOCK();\ +} + +#define TXN_WAKEUP(event) wake_up_all(event) + +/* + * statistics + */ +static struct { + tid_t maxtid; /* 4: biggest tid ever used */ + lid_t maxlid; /* 4: biggest lid ever used */ + int ntid; /* 4: # of transactions performed */ + int nlid; /* 4: # of tlocks acquired */ + int waitlock; /* 4: # of tlock wait */ +} stattx; + +/* + * forward references + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd); +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk); +static void txForce(struct tblock * tblk); +static int txLog(struct jfs_log * log, struct tblock * tblk, + struct commit * cd); +static void txUpdateMap(struct tblock * tblk); +static void txRelease(struct tblock * tblk); +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void LogSyncRelease(struct metapage * mp); + +/* + * transaction block/lock management + * --------------------------------- + */ + +/* + * Get a transaction lock from the free list. If the number in use is + * greater than the high water mark, wake up the sync daemon. This should + * free some anonymous transaction locks. (TXN_LOCK must be held.) 
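+ *
+ * The free list is threaded through the TxLock[] table itself: each
+ * free entry's ->next holds the index (lid) of the next free entry and
+ * lid 0 terminates the list, so allocation is a simple pop of
+ * TxAnchor.freelock, and txLockFree() pushes the lid back on the head.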
+ */ +static lid_t txLockAlloc(void) +{ + lid_t lid; + + INCREMENT(TxStat.txLockAlloc); + if (!TxAnchor.freelock) { + INCREMENT(TxStat.txLockAlloc_freelock); + } + + while (!(lid = TxAnchor.freelock)) + TXN_SLEEP(&TxAnchor.freelockwait); + TxAnchor.freelock = TxLock[lid].next; + HIGHWATERMARK(stattx.maxlid, lid); + if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { + jfs_info("txLockAlloc tlocks low"); + jfs_tlocks_low = 1; + wake_up_process(jfsSyncThread); + } + + return lid; +} + +static void txLockFree(lid_t lid) +{ + TxLock[lid].tid = 0; + TxLock[lid].next = TxAnchor.freelock; + TxAnchor.freelock = lid; + TxAnchor.tlocksInUse--; + if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { + jfs_info("txLockFree jfs_tlocks_low no more"); + jfs_tlocks_low = 0; + TXN_WAKEUP(&TxAnchor.lowlockwait); + } + TXN_WAKEUP(&TxAnchor.freelockwait); +} + +/* + * NAME: txInit() + * + * FUNCTION: initialize transaction management structures + * + * RETURN: + * + * serialization: single thread at jfs_init() + */ +int txInit(void) +{ + int k, size; + struct sysinfo si; + + /* Set defaults for nTxLock and nTxBlock if unset */ + + if (nTxLock == -1) { + if (nTxBlock == -1) { + /* Base default on memory size */ + si_meminfo(&si); + if (si.totalram > (256 * 1024)) /* 1 GB */ + nTxLock = 64 * 1024; + else + nTxLock = si.totalram >> 2; + } else if (nTxBlock > (8 * 1024)) + nTxLock = 64 * 1024; + else + nTxLock = nTxBlock << 3; + } + if (nTxBlock == -1) + nTxBlock = nTxLock >> 3; + + /* Verify tunable parameters */ + if (nTxBlock < 16) + nTxBlock = 16; /* No one should set it this low */ + if (nTxBlock > 65536) + nTxBlock = 65536; + if (nTxLock < 256) + nTxLock = 256; /* No one should set it this low */ + if (nTxLock > 65536) + nTxLock = 65536; + + printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", + nTxBlock, nTxLock); + /* + * initialize transaction block (tblock) table + * + * transaction id (tid) = tblock index + * tid = 0 is reserved. + */ + TxLockLWM = (nTxLock * 4) / 10; + TxLockHWM = (nTxLock * 7) / 10; + TxLockVHWM = (nTxLock * 8) / 10; + + size = sizeof(struct tblock) * nTxBlock; + TxBlock = vmalloc(size); + if (TxBlock == NULL) + return -ENOMEM; + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + } + TxBlock[k].next = 0; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + + TxAnchor.freetid = 1; + init_waitqueue_head(&TxAnchor.freewait); + + stattx.maxtid = 1; /* statistics */ + + /* + * initialize transaction lock (tlock) table + * + * transaction lock id = tlock index + * tlock id = 0 is reserved. + */ + size = sizeof(struct tlock) * nTxLock; + TxLock = vmalloc(size); + if (TxLock == NULL) { + vfree(TxBlock); + return -ENOMEM; + } + + /* initialize tlock table */ + for (k = 1; k < nTxLock - 1; k++) + TxLock[k].next = k + 1; + TxLock[k].next = 0; + init_waitqueue_head(&TxAnchor.freelockwait); + init_waitqueue_head(&TxAnchor.lowlockwait); + + TxAnchor.freelock = 1; + TxAnchor.tlocksInUse = 0; + INIT_LIST_HEAD(&TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + + LAZY_LOCK_INIT(); + INIT_LIST_HEAD(&TxAnchor.unlock_queue); + + stattx.maxlid = 1; /* statistics */ + + return 0; +} + +/* + * NAME: txExit() + * + * FUNCTION: clean up when module is unloaded + */ +void txExit(void) +{ + vfree(TxLock); + TxLock = NULL; + vfree(TxBlock); + TxBlock = NULL; +} + +/* + * NAME: txBegin() + * + * FUNCTION: start a transaction. 
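+ *	(a tid is simply an index into the TxBlock[] table; tid 0 is
+ *	reserved and doubles as the anonymous-transaction id)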
+ * + * PARAMETER: sb - superblock + * flag - force for nested tx; + * + * RETURN: tid - transaction id + * + * note: flag force allows to start tx for nested tx + * to prevent deadlock on logsync barrier; + */ +tid_t txBegin(struct super_block *sb, int flag) +{ + tid_t t; + struct tblock *tblk; + struct jfs_log *log; + + jfs_info("txBegin: flag = 0x%x", flag); + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + + INCREMENT(TxStat.txBegin); + + retry: + if (!(flag & COMMIT_FORCE)) { + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBegin_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + } + if (flag == 0) { + /* + * Don't begin transaction if we're getting starved for tlocks + * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately + * free tlocks) + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBegin_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + } + + /* + * allocate transaction id/block + */ + if ((t = TxAnchor.freetid) == 0) { + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + tblk = tid_to_tblock(t); + + if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { + /* Don't let a non-forced transaction take the last tblk */ + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + TxAnchor.freetid = tblk->next; + + /* + * initialize transaction + */ + + /* + * We can't zero the whole thing or we screw up another thread being + * awakened after sleeping on tblk->waitor + * + * memset(tblk, 0, sizeof(struct tblock)); + */ + tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; + + tblk->sb = sb; + ++log->logtid; + tblk->logtid = log->logtid; + + ++log->active; + + HIGHWATERMARK(stattx.maxtid, t); /* statistics */ + INCREMENT(stattx.ntid); /* statistics */ + + TXN_UNLOCK(); + + jfs_info("txBegin: returning tid = %d", t); + + return t; +} + +/* + * NAME: txBeginAnon() + * + * FUNCTION: start an anonymous transaction. + * Blocks if logsync or available tlocks are low to prevent + * anonymous tlocks from depleting supply. + * + * PARAMETER: sb - superblock + * + * RETURN: none + */ +void txBeginAnon(struct super_block *sb) +{ + struct jfs_log *log; + + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + INCREMENT(TxStat.txBeginAnon); + + retry: + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBeginAnon_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + + /* + * Don't begin transaction if we're getting starved for tlocks + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBeginAnon_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + TXN_UNLOCK(); +} + +/* + * txEnd() + * + * function: free specified transaction block. + * + * logsync barrier processing: + * + * serialization: + */ +void txEnd(tid_t tid) +{ + struct tblock *tblk = tid_to_tblock(tid); + struct jfs_log *log; + + jfs_info("txEnd: tid = %d", tid); + TXN_LOCK(); + + /* + * wakeup transactions waiting on the page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + log = JFS_SBI(tblk->sb)->log; + + /* + * Lazy commit thread can't free this guy until we mark it UNLOCKED, + * otherwise, we would be left with a transaction that may have been + * reused. 
+ * + * Lazy commit thread will turn off tblkGC_LAZY before calling this + * routine. + */ + if (tblk->flag & tblkGC_LAZY) { + jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); + TXN_UNLOCK(); + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + tblk->flag |= tblkGC_UNLOCKED; + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + return; + } + + jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); + + assert(tblk->next == 0); + + /* + * insert tblock back on freelist + */ + tblk->next = TxAnchor.freetid; + TxAnchor.freetid = tid; + + /* + * mark the tblock not active + */ + if (--log->active == 0) { + clear_bit(log_FLUSH, &log->flag); + + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag)) { + TXN_UNLOCK(); + + /* write dirty metadata & forward log syncpt */ + jfs_syncpt(log, 1); + + jfs_info("log barrier off: 0x%x", log->lsn); + + /* enable new transactions start */ + clear_bit(log_SYNCBARRIER, &log->flag); + + /* wakeup all waitors for logsync barrier */ + TXN_WAKEUP(&log->syncwait); + + goto wakeup; + } + } + + TXN_UNLOCK(); +wakeup: + /* + * wakeup all waitors for a free tblock + */ + TXN_WAKEUP(&TxAnchor.freewait); +} + +/* + * txLock() + * + * function: acquire a transaction lock on the specified + * + * parameter: + * + * return: transaction lock id + * + * serialization: + */ +struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, + int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int dir_xtree = 0; + lid_t lid; + tid_t xtid; + struct tlock *tlck; + struct xtlock *xtlck; + struct linelock *linelock; + xtpage_t *p; + struct tblock *tblk; + + TXN_LOCK(); + + if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && + !(mp->xflag & COMMIT_PAGE)) { + /* + * Directory inode is special. It can have both an xtree tlock + * and a dtree tlock associated with it. + */ + dir_xtree = 1; + lid = jfs_ip->xtlid; + } else + lid = mp->lid; + + /* is page not locked by a transaction ? */ + if (lid == 0) + goto allocateLock; + + jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); + + /* is page locked by the requester transaction ? */ + tlck = lid_to_tlock(lid); + if ((xtid = tlck->tid) == tid) { + TXN_UNLOCK(); + goto grantLock; + } + + /* + * is page locked by anonymous transaction/lock ? + * + * (page update without transaction (i.e., file write) is + * locked under anonymous transaction tid = 0: + * anonymous tlocks maintained on anonymous tlock list of + * the inode of the page and available to all anonymous + * transactions until txCommit() time at which point + * they are transferred to the transaction tlock list of + * the committing transaction of the inode) + */ + if (xtid == 0) { + tlck->tid = tid; + TXN_UNLOCK(); + tblk = tid_to_tblock(tid); + /* + * The order of the tlocks in the transaction is important + * (during truncate, child xtree pages must be freed before + * parent's tlocks change the working map). + * Take tlock off anonymous list and add to tail of + * transaction list + * + * Note: We really need to get rid of the tid & lid and + * use list_head's. This code is getting UGLY! + */ + if (jfs_ip->atlhead == lid) { + if (jfs_ip->atltail == lid) { + /* only anonymous txn. 
+ * Remove from anon_list + */ + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + jfs_ip->atlhead = tlck->next; + } else { + lid_t last; + for (last = jfs_ip->atlhead; + lid_to_tlock(last)->next != lid; + last = lid_to_tlock(last)->next) { + assert(last); + } + lid_to_tlock(last)->next = tlck->next; + if (jfs_ip->atltail == lid) + jfs_ip->atltail = last; + } + + /* insert the tlock at tail of transaction tlock list */ + + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + + goto grantLock; + } + + goto waitLock; + + /* + * allocate a tlock + */ + allocateLock: + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + TXN_UNLOCK(); + + /* mark tlock for meta-data page */ + if (mp->xflag & COMMIT_PAGE) { + + tlck->flag = tlckPAGELOCK; + + /* mark the page dirty and nohomeok */ + metapage_nohomeok(mp); + + jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", + mp, mp->nohomeok, tid, tlck); + + /* if anonymous transaction, and buffer is on the group + * commit synclist, mark inode to show this. This will + * prevent the buffer from being marked nohomeok for too + * long a time. + */ + if ((tid == 0) && mp->lsn) + set_cflag(COMMIT_Synclist, ip); + } + /* mark tlock for in-memory inode */ + else + tlck->flag = tlckINODELOCK; + + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + + tlck->type = 0; + + /* bind the tlock and the page */ + tlck->ip = ip; + tlck->mp = mp; + if (dir_xtree) + jfs_ip->xtlid = lid; + else + mp->lid = lid; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + TXN_LOCK(); + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + TXN_UNLOCK(); + } + } + + /* initialize type dependent area for linelock */ + linelock = (struct linelock *) & tlck->lock; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKSHORT; + linelock->index = 0; + + switch (type & tlckTYPE) { + case tlckDTREE: + linelock->l2linesize = L2DTSLOTSIZE; + break; + + case tlckXTREE: + linelock->l2linesize = L2XTSLOTSIZE; + + xtlck = (struct xtlock *) linelock; + xtlck->header.offset = 0; + xtlck->header.length = 2; + + if (type & tlckNEW) { + xtlck->lwm.offset = XTENTRYSTART; + } else { + if (mp->xflag & COMMIT_PAGE) + p = (xtpage_t *) mp->data; + else + p = &jfs_ip->i_xtroot; + xtlck->lwm.offset = + le16_to_cpu(p->header.nextindex); + } + xtlck->lwm.length = 0; /* ! 
*/ + xtlck->twm.offset = 0; + xtlck->hwm.offset = 0; + + xtlck->index = 2; + break; + + case tlckINODE: + linelock->l2linesize = L2INODESLOTSIZE; + break; + + case tlckDATA: + linelock->l2linesize = L2DATASLOTSIZE; + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + + /* + * update tlock vector + */ + grantLock: + tlck->type |= type; + + return tlck; + + /* + * page is being locked by another transaction: + */ + waitLock: + /* Only locks on ipimap or ipaimap should reach here */ + /* assert(jfs_ip->fileset == AGGREGATE_I); */ + if (jfs_ip->fileset != AGGREGATE_I) { + printk(KERN_ERR "txLock: trying to lock locked page!"); + print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, + ip, sizeof(*ip), 0); + print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(*mp), 0); + print_hex_dump(KERN_ERR, "Locker's tblock: ", + DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), + sizeof(struct tblock), 0); + print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, + tlck, sizeof(*tlck), 0); + BUG(); + } + INCREMENT(stattx.waitlock); /* statistics */ + TXN_UNLOCK(); + release_metapage(mp); + TXN_LOCK(); + xtid = tlck->tid; /* reacquire after dropping TXN_LOCK */ + + jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", + tid, xtid, lid); + + /* Recheck everything since dropping TXN_LOCK */ + if (xtid && (tlck->mp == mp) && (mp->lid == lid)) + TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); + else + TXN_UNLOCK(); + jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); + + return NULL; +} + +/* + * NAME: txRelease() + * + * FUNCTION: Release buffers associated with transaction locks, but don't + * mark homeok yet. The allows other transactions to modify + * buffers, but won't let them go to disk until commit record + * actually gets written. + * + * PARAMETER: + * tblk - + * + * RETURN: Errors from subroutines. + */ +static void txRelease(struct tblock * tblk) +{ + struct metapage *mp; + lid_t lid; + struct tlock *tlck; + + TXN_LOCK(); + + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + mp->lid = 0; + } + } + + /* + * wakeup transactions waiting on a page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + TXN_UNLOCK(); +} + +/* + * NAME: txUnlock() + * + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. 
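+ *
+ * Note on the clsn handling below: a page touched by several committed
+ * transactions keeps the larger (more recent) commit lsn, as measured
+ * by logdiff() against the current log, so that per the log-sync rules
+ * the page is not written home before the log record of its latest
+ * committer has been forced.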
+ */ +static void txUnlock(struct tblock * tblk) +{ + struct tlock *tlck; + struct linelock *linelock; + lid_t lid, next, llid, k; + struct metapage *mp; + struct jfs_log *log; + int difft, diffp; + unsigned long flags; + + jfs_info("txUnlock: tblk = 0x%p", tblk); + log = JFS_SBI(tblk->sb)->log; + + /* + * mark page under tlock homeok (its log has been written): + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); + + /* unbind page from tlock */ + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + /* hold buffer + */ + hold_metapage(mp); + + assert(mp->nohomeok > 0); + _metapage_homeok(mp); + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log, flags); + if (mp->clsn) { + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log, flags); + + assert(!(tlck->flag & tlckFREEPAGE)); + + put_metapage(mp); + } + + /* insert tlock, and linelock(s) of the tlock if any, + * at head of freelist + */ + TXN_LOCK(); + + llid = ((struct linelock *) & tlck->lock)->next; + while (llid) { + linelock = (struct linelock *) lid_to_tlock(llid); + k = linelock->next; + txLockFree(llid); + llid = k; + } + txLockFree(lid); + + TXN_UNLOCK(); + } + tblk->next = tblk->last = 0; + + /* + * remove tblock from logsynclist + * (allocation map pages inherited lsn of tblk and + * has been inserted in logsync list at txUpdateMap()) + */ + if (tblk->lsn) { + LOGSYNC_LOCK(log, flags); + log->count--; + list_del(&tblk->synclist); + LOGSYNC_UNLOCK(log, flags); + } +} + +/* + * txMaplock() + * + * function: allocate a transaction lock for freed page/entry; + * for freed page, maplock is used as xtlock/dtlock type; + */ +struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + lid_t lid; + struct tblock *tblk; + struct tlock *tlck; + struct maplock *maplock; + + TXN_LOCK(); + + /* + * allocate a tlock + */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + /* bind the tlock and the object */ + tlck->flag = tlckINODELOCK; + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + tlck->ip = ip; + tlck->mp = NULL; + + tlck->type = type; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + } + } + + TXN_UNLOCK(); + + /* initialize type dependent area for maplock */ + maplock = (struct maplock *) & tlck->lock; + maplock->next = 0; + maplock->maxcnt = 0; + maplock->index = 0; + + return tlck; +} + +/* + * txLinelock() + * + * function: allocate a transaction lock for log vector list + */ +struct linelock *txLinelock(struct linelock * tlock) +{ + lid_t lid; + struct tlock *tlck; + struct linelock *linelock; + + TXN_LOCK(); + + /* allocate a TxLock structure */ + lid = txLockAlloc(); + tlck = 
lid_to_tlock(lid); + + TXN_UNLOCK(); + + /* initialize linelock */ + linelock = (struct linelock *) tlck; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKLONG; + linelock->index = 0; + if (tlck->flag & tlckDIRECTORY) + linelock->flag |= tlckDIRECTORY; + + /* append linelock after tlock */ + linelock->next = tlock->next; + tlock->next = lid; + + return linelock; +} + +/* + * transaction commit management + * ----------------------------- + */ + +/* + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). + * + * PARAMETER: + * + * RETURN: + * + * serialization: + * on entry the inode lock on each segment is assumed + * to be held. + * + * i/o error: + */ +int txCommit(tid_t tid, /* transaction identifier */ + int nip, /* number of inodes to commit */ + struct inode **iplist, /* list of inode to commit */ + int flag) +{ + int rc = 0; + struct commit cd; + struct jfs_log *log; + struct tblock *tblk; + struct lrd *lrd; + int lsn; + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int k, n; + ino_t top; + struct super_block *sb; + + jfs_info("txCommit, tid = %d, flag = %d", tid, flag); + /* is read-only file system ? */ + if (isReadOnly(iplist[0])) { + rc = -EROFS; + goto TheEnd; + } + + sb = cd.sb = iplist[0]->i_sb; + cd.tid = tid; + + if (tid == 0) + tid = txBegin(sb, 0); + tblk = tid_to_tblock(tid); + + /* + * initialize commit structure + */ + log = JFS_SBI(sb)->log; + cd.log = log; + + /* initialize log record descriptor in commit */ + lrd = &cd.lrd; + lrd->logtid = cpu_to_le32(tblk->logtid); + lrd->backchain = 0; + + tblk->xflag |= flag; + + if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) + tblk->xflag |= COMMIT_LAZY; + /* + * prepare non-journaled objects for commit + * + * flush data pages of non-journaled file + * to prevent the file getting non-initialized disk blocks + * in case of crash. + * (new blocks - ) + */ + cd.iplist = iplist; + cd.nip = nip; + + /* + * acquire transaction lock on (on-disk) inodes + * + * update on-disk inode from in-memory inode + * acquiring transaction locks for AFTER records + * on the on-disk inode of file object + * + * sort the inodes array by inode number in descending order + * to prevent deadlock when acquiring transaction lock + * of on-disk inodes on multiple on-disk inode pages by + * multiple concurrent transactions + */ + for (k = 0; k < cd.nip; k++) { + top = (cd.iplist[k])->i_ino; + for (n = k + 1; n < cd.nip; n++) { + ip = cd.iplist[n]; + if (ip->i_ino > top) { + top = ip->i_ino; + cd.iplist[n] = cd.iplist[k]; + cd.iplist[k] = ip; + } + } + + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * BUGBUG - This code has temporarily been removed. 
The + * intent is to ensure that any file data is written before + * the metadata is committed to the journal. This prevents + * uninitialized data from appearing in a file after the + * journal has been replayed. (The uninitialized data + * could be sensitive data removed by another user.) + * + * The problem now is that we are holding the IWRITELOCK + * on the inode, and calling filemap_fdatawrite on an + * unmapped page will cause a deadlock in jfs_get_block. + * + * The long term solution is to pare down the use of + * IWRITELOCK. We are currently holding it too long. + * We could also be smarter about which data pages need + * to be written before the transaction is committed and + * when we don't need to worry about it at all. + * + * if ((!S_ISDIR(ip->i_mode)) + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); + */ + + /* + * Mark inode as not dirty. It will still be on the dirty + * inode list, but we'll know not to commit it again unless + * it gets marked dirty again + */ + clear_cflag(COMMIT_Dirty, ip); + + /* inherit anonymous tlock(s) of inode */ + if (jfs_ip->atlhead) { + lid_to_tlock(jfs_ip->atltail)->next = tblk->next; + tblk->next = jfs_ip->atlhead; + if (!tblk->last) + tblk->last = jfs_ip->atltail; + jfs_ip->atlhead = jfs_ip->atltail = 0; + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + + /* + * acquire transaction lock on on-disk inode page + * (become first tlock of the tblk's tlock list) + */ + if (((rc = diWrite(tid, ip)))) + goto out; + } + + /* + * write log records from transaction locks + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if ((rc = txLog(log, tblk, &cd))) + goto TheEnd; + + /* + * Ensure that inode isn't reused before + * lazy commit thread finishes processing + */ + if (tblk->xflag & COMMIT_DELETE) { + ihold(tblk->u.ip); + /* + * Avoid a rare deadlock + * + * If the inode is locked, we may be blocked in + * jfs_commit_inode. If so, we don't want the + * lazy_commit thread doing the last iput() on the inode + * since that may block on the locked inode. Instead, + * commit the transaction synchronously, so the last iput + * will be done by the calling thread (or later) + */ + /* + * I believe this code is no longer needed. Splitting I_LOCK + * into two bits, I_NEW and I_SYNC should prevent this + * deadlock as well. But since I don't have a JFS testload + * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. + * Joern + */ + if (tblk->u.ip->i_state & I_SYNC) + tblk->xflag &= ~COMMIT_LAZY; + } + + ASSERT((!(tblk->xflag & COMMIT_DELETE)) || + ((tblk->u.ip->i_nlink == 0) && + !test_cflag(COMMIT_Nolink, tblk->u.ip))); + + /* + * write COMMIT log record + */ + lrd->type = cpu_to_le16(LOG_COMMIT); + lrd->length = 0; + lsn = lmLog(log, tblk, lrd, NULL); + + lmGroupCommit(log, tblk); + + /* + * - transaction is now committed - + */ + + /* + * force pages in careful update + * (imap addressing structure update) + */ + if (flag & COMMIT_FORCE) + txForce(tblk); + + /* + * update allocation map. + * + * update inode allocation map and inode: + * free pager lock on memory object of inode if any. + * update block allocation map. + * + * txUpdateMap() resets XAD_NEW in XAD. 
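+ *
+ * (for a lazily committed transaction, i.e. COMMIT_LAZY set and
+ * COMMIT_FORCE clear, the map update and the final txUnlock() are not
+ * done here but are left to the lazy/group commit machinery)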
+ */ + if (tblk->xflag & COMMIT_FORCE) + txUpdateMap(tblk); + + /* + * free transaction locks and pageout/free pages + */ + txRelease(tblk); + + if ((tblk->flag & tblkGC_LAZY) == 0) + txUnlock(tblk); + + + /* + * reset in-memory object state + */ + for (k = 0; k < cd.nip; k++) { + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * reset in-memory inode state + */ + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + } + + out: + if (rc != 0) + txAbort(tid, 1); + + TheEnd: + jfs_info("txCommit: tid = %d, returning %d", tid, rc); + return rc; +} + +/* + * NAME: txLog() + * + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. + * + * PARAMETERS: + * + * RETURN : + */ +static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) +{ + int rc = 0; + struct inode *ip; + lid_t lid; + struct tlock *tlck; + struct lrd *lrd = &cd->lrd; + + /* + * write log record(s) for each tlock of transaction, + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + tlck->flag |= tlckLOG; + + /* initialize lrd common */ + ip = tlck->ip; + lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); + lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); + lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); + + /* write log record of page from the tlock */ + switch (tlck->type & tlckTYPE) { + case tlckXTREE: + xtLog(log, tblk, lrd, tlck); + break; + + case tlckDTREE: + dtLog(log, tblk, lrd, tlck); + break; + + case tlckINODE: + diLog(log, tblk, lrd, tlck, cd); + break; + + case tlckMAP: + mapLog(log, tblk, lrd, tlck); + break; + + case tlckDATA: + dataLog(log, tblk, lrd, tlck); + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + } + + return rc; +} + +/* + * diLog() + * + * function: log inode tlock and format maplock to update bmap; + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd) +{ + int rc = 0; + struct metapage *mp; + pxd_t *pxd; + struct pxd_lock *pxdlock; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_INODE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* + * inode after image + */ + if (tlck->type & tlckENTRY) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else if (tlck->type & tlckFREE) { + /* + * free inode extent + * + * (pages of the freed inode extent have been invalidated and + * a maplock for free of the extent has been formatted at + * txLock() time); + * + * the tlock had been acquired on the inode allocation map page + * (iag) that specifies the freed extent, even though the map + * page is not itself logged, to prevent pageout of the map + * page before the log; + */ + + /* log LOG_NOREDOINOEXT of the freed inode extent for + * logredo() to start NoRedoPage filters, and to update + * imap and bmap for free of the extent; + */ + lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); + /* + * For the LOG_NOREDOINOEXT record, we need + * to pass the IAG number and inode extent + * index (within that IAG) from which the + * the extent being released. 
These have been + * passed to us in the iplist[1] and iplist[2]. + */ + lrd->log.noredoinoext.iagnum = + cpu_to_le32((u32) (size_t) cd->iplist[1]); + lrd->log.noredoinoext.inoext_idx = + cpu_to_le32((u32) (size_t) cd->iplist[2]); + + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else + jfs_err("diLog: UFO type tlck:0x%p", tlck); +#ifdef _JFS_WIP + /* + * alloc/free external EA extent + * + * a maplock for txUpdateMap() to update bPWMAP for alloc/free + * of the extent has been formatted at txLock() time; + */ + else { + assert(tlck->type & tlckEA); + + /* log LOG_UPDATEMAP for logredo() to update bmap for + * alloc of new (and free of old) external EA extent; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +#endif /* _JFS_WIP */ + + return rc; +} + +/* + * dataLog() + * + * function: log data tlock + */ +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DATA); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + + if (jfs_dirtable_inline(tlck->ip)) { + /* + * The table has been truncated, we've must have deleted + * the last entry, so don't bother logging this + */ + mp->lid = 0; + grab_metapage(mp); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + return 0; + } + + PXDaddress(pxd, mp->index); + PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); + + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return 0; +} + +/* + * dtLog() + * + * function: log dtree tlock and format maplock to update bmap; + */ +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + struct pxd_lock *pxdlock; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + + /* + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; + */ + if (tlck->type & (tlckNEW | tlckEXTEND)) { + /* log after-image of the new page for logredo(): + * mark log (LOG_NEW) for logredo() to initialize + * freelist and update bmap for alloc of the new page; + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + if (tlck->type & tlckEXTEND) + 
lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); + else + lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP for + * alloc of the new page; + */ + if (tlck->type & tlckBTROOT) + return; + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckALLOCPXD; + pxdlock->pxd = *pxd; + + pxdlock->index = 1; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * entry insertion/deletion, + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckENTRY | tlckRELINK)) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * page deletion: page has been invalidated + * page relocation: source extent + * + * a maplock for free of the page has been formatted + * at txLock() time); + */ + if (tlck->type & (tlckFREE | tlckRELOCATE)) { + /* log LOG_NOREDOPAGE of the deleted page for logredo() + * to start NoRedoPage filter and to update bmap for free + * of the deletd page + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + } + return; +} + +/* + * xtLog() + * + * function: log xtree tlock and format maplock to update bmap; + */ +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct inode *ip; + struct metapage *mp; + xtpage_t *p; + struct xtlock *xtlck; + struct maplock *maplock; + struct xdlistlock *xadlock; + struct pxd_lock *pxdlock; + pxd_t *page_pxd; + int next, lwm, hwm; + + ip = tlck->ip; + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); + + page_pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) { + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + p = &JFS_IP(ip)->i_xtroot; + if (S_ISDIR(ip->i_mode)) + lrd->log.redopage.type |= + cpu_to_le16(LOG_DIR_XTREE); + } else + p = (xtpage_t *) mp->data; + next = le16_to_cpu(p->header.nextindex); + + xtlck = (struct xtlock *) & tlck->lock; + + maplock = (struct maplock *) & tlck->lock; + xadlock = (struct xdlistlock *) maplock; + + /* + * entry insertion/extension; + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { + /* log after-image for logredo(): + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. 
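+	 *
+	 * (XAD[lwm:next) is the half-open range of slots recorded in the
+	 * linelock: lwm is the lowest slot modified by this transaction
+	 * and next is the page's nextindex after the update)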
+ */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + + if (lwm == next) + goto out; + if (lwm > next) { + jfs_err("xtLog: lwm > next\n"); + goto out; + } + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckALLOCPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, addressXAD(&p->xad[lwm + i])); + PXDlength(pxd, lengthXAD(&p->xad[lwm + i])); + p->xad[lwm + i].flag &= + ~(XAD_NEW | XAD_EXTENDED); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckALLOCXADLIST; + xadlock->xdlist = &p->xad[lwm]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d " + "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); + + maplock->index = 1; + + out: + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return; + } + + /* + * page deletion: file deletion/truncation (ref. xtTruncate()) + * + * (page will be invalidated after log is written and bmap + * is updated from the page); + */ + if (tlck->type & tlckFREE) { + /* LOG_NOREDOPAGE log for NoRedoPage filter: + * if page free from file delete, NoRedoFile filter from + * inode image of zero link count will subsume NoRedoPage + * filters for each page; + * if page free from file truncattion, write NoRedoPage + * filter; + * + * upadte of block allocation map for the page itself: + * if page free from deletion and truncation, LOG_UPDATEMAP + * log for the page itself is generated from processing + * its parent page xad entries; + */ + /* if page free from file truncation, log LOG_NOREDOPAGE + * of the deleted page for logredo() to start NoRedoPage + * filter for the page; + */ + if (tblk->xflag & COMMIT_TRUNCATE) { + /* write NOREDOPAGE for the page */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb-> + s_blocksize_bits); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + if (tlck->type & tlckBTROOT) { + /* Empty xtree must be logged */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + } + + /* init LOG_UPDATEMAP of the freed extents + * XAD[XTENTRYSTART:hwm) from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - XTENTRYSTART + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = XTENTRYSTART; + xtlck->header.length = hwm - XTENTRYSTART + 1; + xtlck->index = 1; + lrd->backchain = 
cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[XTENTRYSTART:hwm) from the + * deleted page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->count = hwm - XTENTRYSTART + 1; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckFREEPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, + addressXAD(&p->xad[XTENTRYSTART + i])); + PXDlength(pxd, + lengthXAD(&p->xad[XTENTRYSTART + i])); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckFREEXADLIST; + xadlock->xdlist = &p->xad[XTENTRYSTART]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", + tlck->ip, mp, xadlock->count); + + maplock->index = 1; + + /* mark page as invalid */ + if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) + && !(tlck->type & tlckBTROOT)) + tlck->flag |= tlckFREEPAGE; + /* + else (tblk->xflag & COMMIT_PMAP) + ? release the page; + */ + return; + } + + /* + * page/entry truncation: file truncation (ref. xtTruncate()) + * + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation + * header ? + */ + if (tlck->type & tlckTRUNCATE) { + /* This odd declaration suppresses a bogus gcc warning */ + pxd_t pxd = pxd; /* truncated extent of xad */ + int twm; + + /* + * For truncation the entire linelock may be used, so it would + * be difficult to store xad list in linelock itself. + * Therefore, we'll just force transaction to be committed + * synchronously, so that xtree pages won't be changed before + * txUpdateMap runs. + */ + tblk->xflag &= ~COMMIT_LAZY; + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + hwm = xtlck->hwm.offset; + twm = xtlck->twm.offset; + + /* + * write log records + */ + /* log after-image for logredo(): + * + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. 
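+		 *
+		 * (in the truncation diagram above: XAD[lwm:next) covers
+		 * entries that were (re)allocated or extended,
+		 * XAD[twm == next - 1] is the entry whose tail extent was
+		 * cut back, and XAD[next:hwm] are the entries freed outright)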
+ */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* init LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated delta extent of the truncated + * entry XAD[next - 1]: + * (xtlck->pxdlock = truncated delta extent); + */ + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + /* assert(pxdlock->type & tlckTRUNCATE); */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + pxd = pxdlock->pxd; /* save to format maplock */ + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* init LOG_UPDATEMAP of the freed extents + * XAD[next:hwm] from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - next + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = next; + xtlck->header.length = hwm - next + 1; + xtlck->index = 1; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + + /* + * format maplock(s) for txUpdateMap() to update bmap + */ + maplock->index = 0; + + /* + * allocate entries XAD[lwm:next): + */ + if (lwm < next) { + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. 
+ */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + xadlock->xdlist = &p->xad[lwm]; + + jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d " + "lwm:%d next:%d", + tlck->ip, mp, xadlock->count, lwm, next); + maplock->index++; + xadlock++; + } + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* format a maplock for txUpdateMap() to update bmap + * to free truncated delta extent of the truncated + * entry XAD[next - 1]; + * (xtlck->pxdlock = truncated delta extent); + */ + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) xadlock; + pxdlock->flag = mlckFREEPXD; + pxdlock->count = 1; + pxdlock->pxd = pxd; + + jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d " + "hwm:%d", ip, mp, pxdlock->count, hwm); + maplock->index++; + xadlock++; + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[next:hwm] from the deleted + * page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckFREEXADLIST; + xadlock->count = hwm - next + 1; + xadlock->xdlist = &p->xad[next]; + + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d " + "next:%d hwm:%d", + tlck->ip, mp, xadlock->count, next, hwm); + maplock->index++; + } + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } + return; +} + +/* + * mapLog() + * + * function: log from maplock of freed data extents; + */ +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct pxd_lock *pxdlock; + int i, nlock; + pxd_t *pxd; + + /* + * page relocation: free the source page extent + * + * a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time saving the src + * relocated page address; + */ + if (tlck->type & tlckRELOCATE) { + /* log LOG_NOREDOPAGE of the old relocated page + * for logredo() to start NoRedoPage filter; + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxd = &lrd->log.redopage.pxd; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* (N.B.
currently, logredo() does NOT update bmap + * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); + * if page free from relocation, LOG_UPDATEMAP log is + * specifically generated now for logredo() + * to update bmap for free of src relocated page; + * (new flag LOG_RELOCATE may be introduced which will + * inform logredo() to start NORedoPage filter and also + * update block allocation map at the same time, thus + * avoiding an extra log write); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + return; + } + /* + + * Otherwise it's not a relocate request + * + */ + else { + /* log LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated/relocated delta extent of the data; + * e.g.: external EA extent, relocated/truncated extent + * from xtTailgate(); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", + (ulong) addressPXD(&pxdlock->pxd), + lengthPXD(&pxdlock->pxd)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +} + +/* + * txEA() + * + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; + */ +void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) +{ + struct tlock *tlck = NULL; + struct pxd_lock *maplock = NULL, *pxdlock = NULL; + + /* + * format maplock for alloc of new EA extent + */ + if (newea) { + /* Since the newea could be a completely zeroed entry we need to + * check for the two flags which indicate we should actually + * commit new EA data + */ + if (newea->flag & DXD_EXTENT) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + pxdlock->flag = mlckALLOCPXD; + PXDaddress(&pxdlock->pxd, addressDXD(newea)); + PXDlength(&pxdlock->pxd, lengthDXD(newea)); + pxdlock++; + maplock->index = 1; + } else if (newea->flag & DXD_INLINE) { + tlck = NULL; + + set_cflag(COMMIT_Inlineea, ip); + } + } + + /* + * format maplock for free of old EA extent + */ + if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { + if (tlck == NULL) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + maplock->index = 0; + } + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressDXD(oldea)); + PXDlength(&pxdlock->pxd, lengthDXD(oldea)); + maplock->index++; + } +} + +/* + * txForce() + * + * function: synchronously write pages locked by transaction + * after txLog() but before txUpdateMap(); + */ +static void txForce(struct tblock * tblk) +{ + struct tlock *tlck; + lid_t lid, next; + struct metapage *mp; + + /* + * reverse the order of transaction tlocks in + * careful update order of address index pages + * (right to left, bottom up) + */ + tlck = lid_to_tlock(tblk->next); + lid = tlck->next; + 
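	/*
	 * (Illustrative note: the walk below is an in-place reversal of
	 *  the singly linked lid list by head insertion.  With purely
	 *  hypothetical ids, tblk->next -> A -> B -> C becomes
	 *  tblk->next -> C -> B -> A, so the pages are then forced out
	 *  in the reverse of the order in which they were locked.)
	 */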
tlck->next = 0; + while (lid) { + tlck = lid_to_tlock(lid); + next = tlck->next; + tlck->next = tblk->next; + tblk->next = lid; + lid = next; + } + + /* + * synchronously write the page, and + * hold the page for txUpdateMap(); + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + if (tlck->flag & tlckWRITEPAGE) { + tlck->flag &= ~tlckWRITEPAGE; + + /* do not release page to freelist */ + force_metapage(mp); +#if 0 + /* + * The "right" thing to do here is to + * synchronously write the metadata. + * With the current implementation this + * is hard since write_metapage requires + * us to kunmap & remap the page. If we + * have tlocks pointing into the metadata + * pages, we don't want to do this. I think + * we can get by with synchronously writing + * the pages when they are released. + */ + assert(mp->nohomeok); + set_bit(META_dirty, &mp->flag); + set_bit(META_sync, &mp->flag); +#endif + } + } + } +} + +/* + * txUpdateMap() + * + * function: update persistent allocation map (and working map + * if appropriate); + * + * parameter: + */ +static void txUpdateMap(struct tblock * tblk) +{ + struct inode *ip; + struct inode *ipimap; + lid_t lid; + struct tlock *tlck; + struct maplock *maplock; + struct pxd_lock pxdlock; + int maptype; + int k, nlock; + struct metapage *mp = NULL; + + ipimap = JFS_SBI(tblk->sb)->ipimap; + + maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; + + + /* + * update block allocation map + * + * update allocation state in pmap (and wmap) and + * update lsn of the pmap page; + */ + /* + * scan each tlock/page of transaction for block allocation/free: + * + * for each tlock/page of transaction, update map. + * ? are there tlock for pmap and pwmap at the same time ? + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + if ((tlck->flag & tlckUPDATEMAP) == 0) + continue; + + if (tlck->flag & tlckFREEPAGE) { + /* + * Another thread may attempt to reuse freed space + * immediately, so we want to get rid of the metapage + * before anyone else has a chance to get it. + * Lock metapage, update maps, then invalidate + * the metapage. + */ + mp = tlck->mp; + ASSERT(mp->xflag & COMMIT_PAGE); + grab_metapage(mp); + } + + /* + * extent list: + * . in-line PXD list: + * . out-of-line XAD list: + */ + maplock = (struct maplock *) & tlck->lock; + nlock = maplock->index; + + for (k = 0; k < nlock; k++, maplock++) { + /* + * allocate blocks in persistent map: + * + * blocks have been allocated from wmap at alloc time; + */ + if (maplock->flag & mlckALLOC) { + txAllocPMap(ipimap, maplock, tblk); + } + /* + * free blocks in persistent and working map: + * blocks will be freed in pmap and then in wmap; + * + * ? 
tblock specifies the PMAP/PWMAP based upon + * transaction + * + * free blocks in persistent map: + * blocks will be freed from wmap at last reference + * release of the object for regular files; + * + * Alway free blocks from both persistent & working + * maps for directories + */ + else { /* (maplock->flag & mlckFREE) */ + + if (tlck->flag & tlckDIRECTORY) + txFreeMap(ipimap, maplock, + tblk, COMMIT_PWMAP); + else + txFreeMap(ipimap, maplock, + tblk, maptype); + } + } + if (tlck->flag & tlckFREEPAGE) { + if (!(tblk->flag & tblkGC_LAZY)) { + /* This is equivalent to txRelease */ + ASSERT(mp->lid == lid); + tlck->mp->lid = 0; + } + assert(mp->nohomeok == 1); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + } + } + /* + * update inode allocation map + * + * update allocation state in pmap and + * update lsn of the pmap page; + * update in-memory inode flag/state + * + * unlock mapper/write lock + */ + if (tblk->xflag & COMMIT_CREATE) { + diUpdatePMap(ipimap, tblk->ino, false, tblk); + /* update persistent block allocation map + * for the allocation of inode extent; + */ + pxdlock.flag = mlckALLOCPXD; + pxdlock.pxd = tblk->u.ixpxd; + pxdlock.index = 1; + txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); + } else if (tblk->xflag & COMMIT_DELETE) { + ip = tblk->u.ip; + diUpdatePMap(ipimap, ip->i_ino, true, tblk); + iput(ip); + } +} + +/* + * txAllocPMap() + * + * function: allocate from persistent map; + * + * parameter: + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; + */ +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + /* + * allocate from persistent map; + */ + if (maplock->flag & mlckALLOCXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, false, xaddr, + (s64) xlen, tblk); + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckALLOCPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, + tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } +} + +/* + * txFreeMap() + * + * function: free from persistent and/or working map; + * + * todo: optimization + */ +void txFreeMap(struct inode *ip, + struct maplock * maplock, struct tblock * tblk, int maptype) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct 
pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", + tblk, maplock, maptype); + + /* + * free from persistent map; + */ + if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (!(xad->flag & XAD_NEW)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx " + "xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, + tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } + + /* + * free from working map; + */ + if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbFree(ip, xaddr, (s64) xlen); + xad->flag = 0; + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckFREEPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } +} + +/* + * txFreelock() + * + * function: remove tlock from inode anonymous locklist + */ +void txFreelock(struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct tlock *xtlck, *tlck; + lid_t xlid = 0, lid; + + if (!jfs_ip->atlhead) + return; + + TXN_LOCK(); + xtlck = (struct tlock *) &jfs_ip->atlhead; + + while ((lid = xtlck->next) != 0) { + tlck = lid_to_tlock(lid); + if (tlck->flag & tlckFREELOCK) { + xtlck->next = tlck->next; + txLockFree(lid); + } else { + xtlck = tlck; + xlid = lid; + } + } + + if (jfs_ip->atlhead) + jfs_ip->atltail = xlid; + else { + jfs_ip->atltail = 0; + /* + * If inode was on anon_list, remove it + */ + list_del_init(&jfs_ip->anon_inode_list); + } + TXN_UNLOCK(); +} + +/* + * txAbort() + * + * function: abort tx before commit; + * + * frees line-locks and segment locks for all + * segments in comdata structure. + * Optionally sets state of file-system to FM_DIRTY in super-block. + * log age of page-frames in memory for which caller has + * are reset to 0 (to avoid logwarap). 
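 *
 * (Roughly: for any metapage that still carries a log sequence
 * number from an earlier commit, the abort path below calls
 * LogSyncRelease() so the page no longer holds up the log sync
 * point; otherwise an aborted transaction could pin the log tail
 * and eventually cause log wrap.)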
+ */ +void txAbort(tid_t tid, int dirty) +{ + lid_t lid, next; + struct metapage *mp; + struct tblock *tblk = tid_to_tblock(tid); + struct tlock *tlck; + + /* + * free tlocks of the transaction + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + mp = tlck->mp; + JFS_IP(tlck->ip)->xtlid = 0; + + if (mp) { + mp->lid = 0; + + /* + * reset lsn of page to avoid logwarap: + * + * (page may have been previously committed by another + * transaction(s) but has not been paged, i.e., + * it may be on logsync list even though it has not + * been logged for the current tx.) + */ + if (mp->xflag & COMMIT_PAGE && mp->lsn) + LogSyncRelease(mp); + } + /* insert tlock at head of freelist */ + TXN_LOCK(); + txLockFree(lid); + TXN_UNLOCK(); + } + + /* caller will free the transaction block */ + + tblk->next = tblk->last = 0; + + /* + * mark filesystem dirty + */ + if (dirty) + jfs_error(tblk->sb, "txAbort"); + + return; +} + +/* + * txLazyCommit(void) + * + * All transactions except those changing ipimap (COMMIT_FORCE) are + * processed by this routine. This insures that the inode and block + * allocation maps are updated in order. For synchronous transactions, + * let the user thread finish processing after txUpdateMap() is called. + */ +static void txLazyCommit(struct tblock * tblk) +{ + struct jfs_log *log; + + while (((tblk->flag & tblkGC_READY) == 0) && + ((tblk->flag & tblkGC_UNLOCKED) == 0)) { + /* We must have gotten ahead of the user thread + */ + jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); + yield(); + } + + jfs_info("txLazyCommit: processing tblk 0x%p", tblk); + + txUpdateMap(tblk); + + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP + + /* + * Can't release log->gclock until we've tested tblk->flag + */ + if (tblk->flag & tblkGC_LAZY) { + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + txUnlock(tblk); + tblk->flag &= ~tblkGC_LAZY; + txEnd(tblk - TxBlock); /* Convert back to tid */ + } else + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + + jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); +} + +/* + * jfs_lazycommit(void) + * + * To be run as a kernel daemon. If lbmIODone is called in an interrupt + * context, or where blocking is not wanted, this routine will process + * committed transactions from the unlock queue. + */ +int jfs_lazycommit(void *arg) +{ + int WorkDone; + struct tblock *tblk; + unsigned long flags; + struct jfs_sb_info *sbi; + + do { + LAZY_LOCK(flags); + jfs_commit_thread_waking = 0; /* OK to wake another thread */ + while (!list_empty(&TxAnchor.unlock_queue)) { + WorkDone = 0; + list_for_each_entry(tblk, &TxAnchor.unlock_queue, + cqueue) { + + sbi = JFS_SBI(tblk->sb); + /* + * For each volume, the transactions must be + * handled in order. If another commit thread + * is handling a tblk for this superblock, + * skip it + */ + if (sbi->commit_state & IN_LAZYCOMMIT) + continue; + + sbi->commit_state |= IN_LAZYCOMMIT; + WorkDone = 1; + + /* + * Remove transaction from queue + */ + list_del(&tblk->cqueue); + + LAZY_UNLOCK(flags); + txLazyCommit(tblk); + LAZY_LOCK(flags); + + sbi->commit_state &= ~IN_LAZYCOMMIT; + /* + * Don't continue in the for loop. (We can't + * anyway, it's unsafe!) We want to go back to + * the beginning of the list. 
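 *
 * (Why it is unsafe: list_del() above removed this tblk from
 * unlock_queue, and the LAZY lock was dropped around
 * txLazyCommit(), so other threads may have reshaped the queue;
 * the list_for_each_entry() cursor can no longer be trusted, and
 * the scan must restart from the head of the list.)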
+ */ + break; + } + + /* If there was nothing to do, don't continue */ + if (!WorkDone) + break; + } + /* In case a wakeup came while all threads were active */ + jfs_commit_thread_waking = 0; + + if (freezing(current)) { + LAZY_UNLOCK(flags); + refrigerator(); + } else { + DECLARE_WAITQUEUE(wq, current); + + add_wait_queue(&jfs_commit_thread_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + LAZY_UNLOCK(flags); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&jfs_commit_thread_wait, &wq); + } + } while (!kthread_should_stop()); + + if (!list_empty(&TxAnchor.unlock_queue)) + jfs_err("jfs_lazycommit being killed w/pending transactions!"); + else + jfs_info("jfs_lazycommit being killed\n"); + return 0; +} + +void txLazyUnlock(struct tblock * tblk) +{ + unsigned long flags; + + LAZY_LOCK(flags); + + list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); + /* + * Don't wake up a commit thread if there is already one servicing + * this superblock, or if the last one we woke up hasn't started yet. + */ + if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && + !jfs_commit_thread_waking) { + jfs_commit_thread_waking = 1; + wake_up(&jfs_commit_thread_wait); + } + LAZY_UNLOCK(flags); +} + +static void LogSyncRelease(struct metapage * mp) +{ + struct jfs_log *log = mp->log; + + assert(mp->nohomeok); + assert(log); + metapage_homeok(mp); +} + +/* + * txQuiesce + * + * Block all new transactions and push anonymous transactions to + * completion + * + * This does almost the same thing as jfs_sync below. We don't + * worry about deadlocking when jfs_tlocks_low is set, since we would + * expect jfs_sync to get us out of that jam. + */ +void txQuiesce(struct super_block *sb) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + struct jfs_log *log = JFS_SBI(sb)->log; + tid_t tid; + + set_bit(log_QUIESCE, &log->flag); + + TXN_LOCK(); +restart: + while (!list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); + mutex_lock(&jfs_ip->commit_mutex); + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } + + /* + * If jfs_sync is running in parallel, there could be some inodes + * on anon_list2. Let's check. + */ + if (!list_empty(&TxAnchor.anon_list2)) { + list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + goto restart; + } + TXN_UNLOCK(); + + /* + * We may need to kick off the group commit + */ + jfs_flush_journal(log, 0); +} + +/* + * txResume() + * + * Allows transactions to start again following txQuiesce + */ +void txResume(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + clear_bit(log_QUIESCE, &log->flag); + TXN_WAKEUP(&log->syncwait); +} + +/* + * jfs_sync(void) + * + * To be run as a kernel daemon. This is awakened when tlocks run low. + * We write any inodes that have anonymous tlocks so they will become + * available. 
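 *
 * (Sketch of the loop below: while jfs_tlocks_low, take the first
 * inode off anon_list; if igrab() fails the inode is being freed
 * and is simply unlinked; if the commit mutex can be taken without
 * blocking, the inode is committed, which releases its anonymous
 * tlocks; otherwise it is parked on anon_list2 and spliced back
 * once the scan finishes.)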
+ */ +int jfs_sync(void *arg) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int rc; + tid_t tid; + + do { + /* + * write each inode on the anonymous inode list + */ + TXN_LOCK(); + while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + if (! igrab(ip)) { + /* + * Inode is being freed + */ + list_del_init(&jfs_ip->anon_inode_list); + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE); + rc = txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + + iput(ip); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } else { + /* We can't get the commit mutex. It may + * be held by a thread waiting for tlock's + * so let's not block here. Save it to + * put back on the anon_list. + */ + + /* Take off anon_list */ + list_del(&jfs_ip->anon_inode_list); + + /* Put on anon_list2 */ + list_add(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list2); + + TXN_UNLOCK(); + iput(ip); + TXN_LOCK(); + } + } + /* Add anon_list2 back to anon_list */ + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); + + if (freezing(current)) { + TXN_UNLOCK(); + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + TXN_UNLOCK(); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + + jfs_info("jfs_sync being killed"); + return 0; +} + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) +static int jfs_txanchor_proc_show(struct seq_file *m, void *v) +{ + char *freewait; + char *freelockwait; + char *lowlockwait; + + freewait = + waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; + freelockwait = + waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; + lowlockwait = + waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; + + seq_printf(m, + "JFS TxAnchor\n" + "============\n" + "freetid = %d\n" + "freewait = %s\n" + "freelock = %d\n" + "freelockwait = %s\n" + "lowlockwait = %s\n" + "tlocksInUse = %d\n" + "jfs_tlocks_low = %d\n" + "unlock_queue is %sempty\n", + TxAnchor.freetid, + freewait, + TxAnchor.freelock, + freelockwait, + lowlockwait, + TxAnchor.tlocksInUse, + jfs_tlocks_low, + list_empty(&TxAnchor.unlock_queue) ? 
"" : "not "); + return 0; +} + +static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txanchor_proc_show, NULL); +} + +const struct file_operations jfs_txanchor_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txanchor_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) +static int jfs_txstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS TxStats\n" + "===========\n" + "calls to txBegin = %d\n" + "txBegin blocked by sync barrier = %d\n" + "txBegin blocked by tlocks low = %d\n" + "txBegin blocked by no free tid = %d\n" + "calls to txBeginAnon = %d\n" + "txBeginAnon blocked by sync barrier = %d\n" + "txBeginAnon blocked by tlocks low = %d\n" + "calls to txLockAlloc = %d\n" + "tLockAlloc blocked by no free lock = %d\n", + TxStat.txBegin, + TxStat.txBegin_barrier, + TxStat.txBegin_lockslow, + TxStat.txBegin_freetid, + TxStat.txBeginAnon, + TxStat.txBeginAnon_barrier, + TxStat.txBeginAnon_lockslow, + TxStat.txLockAlloc, + TxStat.txLockAlloc_freelock); + return 0; +} + +static int jfs_txstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txstats_proc_show, NULL); +} + +const struct file_operations jfs_txstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h new file mode 100644 index 00000000..ab728893 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.h @@ -0,0 +1,311 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TXNMGR +#define _H_JFS_TXNMGR + +#include "jfs_logmgr.h" + +/* + * Hide implementation of TxBlock and TxLock + */ +#define tid_to_tblock(tid) (&TxBlock[tid]) + +#define lid_to_tlock(lid) (&TxLock[lid]) + +/* + * transaction block + */ +struct tblock { + /* + * tblock and jbuf_t common area: struct logsyncblk + * + * the following 5 fields are the same as struct logsyncblk + * which is common to tblock and jbuf to form logsynclist + */ + u16 xflag; /* tx commit type */ + u16 flag; /* tx commit state */ + lid_t dummy; /* Must keep structures common */ + s32 lsn; /* recovery lsn */ + struct list_head synclist; /* logsynclist link */ + + /* lock management */ + struct super_block *sb; /* super block */ + lid_t next; /* index of first tlock of tid */ + lid_t last; /* index of last tlock of tid */ + wait_queue_head_t waitor; /* tids waiting on this tid */ + + /* log management */ + u32 logtid; /* log transaction id */ + + /* commit management */ + struct list_head cqueue; /* commit queue list */ + s32 clsn; /* commit lsn */ + struct lbuf *bp; + s32 pn; /* commit record log page number */ + s32 eor; /* commit record eor */ + wait_queue_head_t gcwait; /* group commit event list: + * ready transactions wait on this + * event for group commit completion. + */ + union { + struct inode *ip; /* inode being deleted */ + pxd_t ixpxd; /* pxd of inode extent for created inode */ + } u; + u32 ino; /* inode number being created */ +}; + +extern struct tblock *TxBlock; /* transaction block table */ + +/* commit flags: tblk->xflag */ +#define COMMIT_SYNC 0x0001 /* synchronous commit */ +#define COMMIT_FORCE 0x0002 /* force pageout at end of commit */ +#define COMMIT_FLUSH 0x0004 /* init flush at end of commit */ +#define COMMIT_MAP 0x00f0 +#define COMMIT_PMAP 0x0010 /* update pmap */ +#define COMMIT_WMAP 0x0020 /* update wmap */ +#define COMMIT_PWMAP 0x0040 /* update pwmap */ +#define COMMIT_FREE 0x0f00 +#define COMMIT_DELETE 0x0100 /* inode delete */ +#define COMMIT_TRUNCATE 0x0200 /* file truncation */ +#define COMMIT_CREATE 0x0400 /* inode create */ +#define COMMIT_LAZY 0x0800 /* lazy commit */ +#define COMMIT_PAGE 0x1000 /* Identifies element as metapage */ +#define COMMIT_INODE 0x2000 /* Identifies element as inode */ + +/* group commit flags tblk->flag: see jfs_logmgr.h */ + +/* + * transaction lock + */ +struct tlock { + lid_t next; /* 2: index next lockword on tid locklist + * next lockword on freelist + */ + tid_t tid; /* 2: transaction id holding lock */ + + u16 flag; /* 2: lock control */ + u16 type; /* 2: log type */ + + struct metapage *mp; /* 4/8: object page buffer locked */ + struct inode *ip; /* 4/8: object */ + /* (16) */ + + s16 lock[24]; /* 48: overlay area */ +}; /* (64) */ + +extern struct tlock *TxLock; /* transaction lock table */ + +/* + * tlock flag + */ +/* txLock state */ +#define tlckPAGELOCK 0x8000 +#define tlckINODELOCK 0x4000 +#define tlckLINELOCK 0x2000 +#define tlckINLINELOCK 0x1000 +/* lmLog state */ +#define tlckLOG 0x0800 +/* updateMap state */ +#define tlckUPDATEMAP 0x0080 +#define tlckDIRECTORY 0x0040 +/* freeLock state */ +#define tlckFREELOCK 0x0008 +#define tlckWRITEPAGE 0x0004 +#define tlckFREEPAGE 0x0002 + +/* + * tlock type + */ +#define tlckTYPE 0xfe00 +#define tlckINODE 0x8000 +#define tlckXTREE 0x4000 +#define tlckDTREE 0x2000 
+#define tlckMAP 0x1000 +#define tlckEA 0x0800 +#define tlckACL 0x0400 +#define tlckDATA 0x0200 +#define tlckBTROOT 0x0100 + +#define tlckOPERATION 0x00ff +#define tlckGROW 0x0001 /* file grow */ +#define tlckREMOVE 0x0002 /* file delete */ +#define tlckTRUNCATE 0x0004 /* file truncate */ +#define tlckRELOCATE 0x0008 /* file/directory relocate */ +#define tlckENTRY 0x0001 /* directory insert/delete */ +#define tlckEXTEND 0x0002 /* directory extend in-line */ +#define tlckSPLIT 0x0010 /* splited page */ +#define tlckNEW 0x0020 /* new page from split */ +#define tlckFREE 0x0040 /* free page */ +#define tlckRELINK 0x0080 /* update sibling pointer */ + +/* + * linelock for lmLog() + * + * note: linelock and its variations are overlaid + * at tlock.lock: watch for alignment; + */ +struct lv { + u8 offset; /* 1: */ + u8 length; /* 1: */ +}; /* (2) */ + +#define TLOCKSHORT 20 +#define TLOCKLONG 28 + +struct linelock { + lid_t next; /* 2: next linelock */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv lv[20]; /* 40: */ +}; /* (48) */ + +#define dt_lock linelock + +struct xtlock { + lid_t next; /* 2: */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv header; /* 2: */ + struct lv lwm; /* 2: low water mark */ + struct lv hwm; /* 2: high water mark */ + struct lv twm; /* 2: */ + /* (16) */ + + s32 pxdlock[8]; /* 32: */ +}; /* (48) */ + + +/* + * maplock for txUpdateMap() + * + * note: maplock and its variations are overlaid + * at tlock.lock/linelock: watch for alignment; + * N.B. next field may be set by linelock, and should not + * be modified by maplock; + * N.B. 
index of the first pxdlock specifies index of next + * free maplock (i.e., number of maplock) in the tlock; + */ +struct maplock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: next free maplock index */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + pxd_t pxd; /* 8: */ +}; /* (16): */ + +/* maplock flag */ +#define mlckALLOC 0x00f0 +#define mlckALLOCXADLIST 0x0080 +#define mlckALLOCPXDLIST 0x0040 +#define mlckALLOCXAD 0x0020 +#define mlckALLOCPXD 0x0010 +#define mlckFREE 0x000f +#define mlckFREEXADLIST 0x0008 +#define mlckFREEPXDLIST 0x0004 +#define mlckFREEXAD 0x0002 +#define mlckFREEPXD 0x0001 + +#define pxd_lock maplock + +struct xdlistlock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + /* + * We need xdlist to be 64 bits (8 bytes), regardless of + * whether void * is 32 or 64 bits + */ + union { + void *_xdlist; /* pxd/xad list */ + s64 pad; /* 8: Force 64-bit xdlist size */ + } union64; +}; /* (16): */ + +#define xdlist union64._xdlist + +/* + * commit + * + * parameter to the commit manager routines + */ +struct commit { + tid_t tid; /* tid = index of tblock */ + int flag; /* flags */ + struct jfs_log *log; /* log */ + struct super_block *sb; /* superblock */ + + int nip; /* number of entries in iplist */ + struct inode **iplist; /* list of pointers to inodes */ + + /* log record descriptor on 64-bit boundary */ + struct lrd lrd; /* : log record descriptor */ +}; + +/* + * external declarations + */ +extern int jfs_tlocks_low; + +extern int txInit(void); +extern void txExit(void); +extern struct tlock *txLock(tid_t, struct inode *, struct metapage *, int); +extern struct tlock *txMaplock(tid_t, struct inode *, int); +extern int txCommit(tid_t, int, struct inode **, int); +extern tid_t txBegin(struct super_block *, int); +extern void txBeginAnon(struct super_block *); +extern void txEnd(tid_t); +extern void txAbort(tid_t, int); +extern struct linelock *txLinelock(struct linelock *); +extern void txFreeMap(struct inode *, struct maplock *, struct tblock *, int); +extern void txEA(tid_t, struct inode *, dxd_t *, dxd_t *); +extern void txFreelock(struct inode *); +extern int lmLog(struct jfs_log *, struct tblock *, struct lrd *, + struct tlock *); +extern void txQuiesce(struct super_block *); +extern void txResume(struct super_block *); +extern void txLazyUnlock(struct tblock *); +extern int jfs_lazycommit(void *); +extern int jfs_sync(void *); +#endif /* _H_JFS_TXNMGR */ diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h new file mode 100644 index 00000000..43ea3713 --- /dev/null +++ b/fs/jfs/jfs_types.h @@ -0,0 +1,159 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TYPES +#define _H_JFS_TYPES + +/* + * jfs_types.h: + * + * basic type/utility definitions + * + * note: this header file must be the 1st include file + * of JFS include list in all JFS .c file. + */ + +#include +#include + +#include "endian24.h" + +/* + * transaction and lock id's + * + * Don't change these without carefully considering the impact on the + * size and alignment of all of the linelock variants + */ +typedef u16 tid_t; +typedef u16 lid_t; + +/* + * Almost identical to Linux's timespec, but not quite + */ +struct timestruc_t { + __le32 tv_sec; + __le32 tv_nsec; +}; + +/* + * handy + */ + +#define LEFTMOSTONE 0x80000000 +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ + +/* + * physical xd (pxd) + */ +typedef struct { + unsigned len:24; + unsigned addr1:8; + __le32 addr2; +} pxd_t; + +/* xd_t field construction */ + +#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) +#define PXDaddress(pxd, address64)\ +{\ + (pxd)->addr1 = ((s64)address64) >> 32;\ + (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +/* xd_t field extraction */ +#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) +#define addressPXD(pxd)\ + ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) + +#define MAXTREEHEIGHT 8 +/* pxd list */ +struct pxdlist { + s16 maxnpxd; + s16 npxd; + pxd_t pxd[MAXTREEHEIGHT]; +}; + + +/* + * data extent descriptor (dxd) + */ +typedef struct { + unsigned flag:8; /* 1: flags */ + unsigned rsrvd:24; + __le32 size; /* 4: size in byte */ + unsigned len:24; /* 3: length in unit of fsblksize */ + unsigned addr1:8; /* 1: address in unit of fsblksize */ + __le32 addr2; /* 4: address in unit of fsblksize */ +} dxd_t; /* - 16 - */ + +/* dxd_t flags */ +#define DXD_INDEX 0x80 /* B+-tree index */ +#define DXD_INLINE 0x40 /* in-line data extent */ +#define DXD_EXTENT 0x20 /* out-of-line single extent */ +#define DXD_FILE 0x10 /* out-of-line file (inode) */ +#define DXD_CORRUPT 0x08 /* Inconsistency detected */ + +/* dxd_t field construction + * Conveniently, the PXD macros work for DXD + */ +#define DXDlength PXDlength +#define DXDaddress PXDaddress +#define lengthDXD lengthPXD +#define addressDXD addressPXD +#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) +#define sizeDXD(dxd) le32_to_cpu((dxd)->size) + +/* + * directory entry argument + */ +struct component_name { + int namlen; + wchar_t *name; +}; + + +/* + * DASD limit information - stored in directory inode + */ +struct dasd { + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ + u8 rsrvd1; + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 rsrvd2[3]; + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ +}; + +#define DASDLIMIT(dasdp) \ + (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo)) +#define setDASDLIMIT(dasdp, limit)\ +{\ + (dasdp)->limit_hi = ((u64)limit) >> 32;\ + (dasdp)->limit_lo = __cpu_to_le32(limit);\ +} +#define DASDUSED(dasdp) \ + (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo)) +#define setDASDUSED(dasdp, used)\ +{\ + (dasdp)->used_hi = ((u64)used) >> 32;\ + (dasdp)->used_lo = 
__cpu_to_le32(used);\ +} + +#endif /* !_H_JFS_TYPES */ diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c new file mode 100644 index 00000000..adcf92d3 --- /dev/null +++ b/fs/jfs/jfs_umount.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_umount.c + * + * note: file system in transition to aggregate/fileset: + * (ref. jfs_mount.c) + * + * file system unmount is interpreted as mount of the single/only + * fileset in the aggregate and, if unmount of the last fileset, + * as unmount of the aggerate; + */ + +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_umount(vfsp, flags, crp) + * + * FUNCTION: vfs_umount() + * + * PARAMETERS: vfsp - virtual file system pointer + * flags - unmount for shutdown + * crp - credential + * + * RETURN : EBUSY - device has open files + */ +int jfs_umount(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipimap = sbi->ipimap; + struct inode *ipaimap = sbi->ipaimap; + struct inode *ipaimap2 = sbi->ipaimap2; + struct jfs_log *log; + int rc = 0; + + jfs_info("UnMount JFS: sb:0x%p", sb); + + /* + * update superblock and close log + * + * if mounted read-write and log based recovery was enabled + */ + if ((log = sbi->log)) + /* + * Wait for outstanding transactions to be written to log: + */ + jfs_flush_journal(log, 1); + + /* + * close fileset inode allocation map (aka fileset inode) + */ + diUnmount(ipimap, 0); + + diFreeSpecial(ipimap); + sbi->ipimap = NULL; + + /* + * close secondary aggregate inode allocation map + */ + ipaimap2 = sbi->ipaimap2; + if (ipaimap2) { + diUnmount(ipaimap2, 0); + diFreeSpecial(ipaimap2); + sbi->ipaimap2 = NULL; + } + + /* + * close aggregate inode allocation map + */ + ipaimap = sbi->ipaimap; + diUnmount(ipaimap, 0); + diFreeSpecial(ipaimap); + sbi->ipaimap = NULL; + + /* + * close aggregate block allocation map + */ + dbUnmount(ipbmap, 0); + + diFreeSpecial(ipbmap); + sbi->ipimap = NULL; + + /* + * Make sure all metadata makes it to disk before we mark + * the superblock as clean + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + /* + * ensure all file system file pages are propagated to their + * home blocks on disk (and their in-memory buffer pages are + * invalidated) BEFORE updating file system superblock state + * (to signify file system is unmounted cleanly, and thus in + * consistent state) and log superblock active file system + * list (to signify skip logredo()). 
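 *
 * (The ordering is the point: if FM_CLEAN were written, or the
 * volume dropped from the log's active file system list, before
 * the filemap_write_and_wait() above completed, a crash in that
 * window could leave stale metadata on disk while logredo() is
 * skipped on the next mount.)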
+ */ + if (log) { /* log = NULL if read-only mount */ + updateSuper(sb, FM_CLEAN); + + /* + * close log: + * + * remove file system from log active file system list. + */ + rc = lmLogClose(sb); + } + jfs_info("UnMount JFS Complete: rc = %d", rc); + return rc; +} + + +int jfs_umount_rw(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + + if (!log) + return 0; + + /* + * close log: + * + * remove file system from log active file system list. + */ + jfs_flush_journal(log, 1); + + /* + * Make sure all metadata makes it to disk + */ + dbSync(sbi->ipbmap); + diSync(sbi->ipimap); + + /* + * Note that we have to do this even if sync_blockdev() will + * do exactly the same a few instructions later: We can't + * mark the superblock clean before everything is flushed to + * disk. + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + updateSuper(sb, FM_CLEAN); + + return lmLogClose(sb); +} diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c new file mode 100644 index 00000000..c7de6f5b --- /dev/null +++ b/fs/jfs/jfs_unicode.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_strfromUCS() + * + * FUNCTION: Convert little-endian unicode string to character string + * + */ +int jfs_strfromUCS_le(char *to, const __le16 * from, + int len, struct nls_table *codepage) +{ + int i; + int outlen = 0; + static int warn_again = 5; /* Only warn up to 5 times total */ + int warn = !!warn_again; /* once per string */ + + if (codepage) { + for (i = 0; (i < len) && from[i]; i++) { + int charlen; + charlen = + codepage->uni2char(le16_to_cpu(from[i]), + &to[outlen], + NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + to[outlen++] = '?'; + } + } else { + for (i = 0; (i < len) && from[i]; i++) { + if (unlikely(le16_to_cpu(from[i]) & 0xff00)) { + to[i] = '?'; + if (unlikely(warn)) { + warn--; + warn_again--; + printk(KERN_ERR + "non-latin1 character 0x%x found in JFS file name\n", + le16_to_cpu(from[i])); + printk(KERN_ERR + "mount with iocharset=utf8 to access\n"); + } + + } + else + to[i] = (char) (le16_to_cpu(from[i])); + } + outlen = i; + } + to[outlen] = 0; + return outlen; +} + +/* + * NAME: jfs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, + struct nls_table *codepage) +{ + int charlen; + int i; + + if (codepage) { + for (i = 0; len && *from; i++, from += charlen, len -= charlen) + { + charlen = codepage->char2uni(from, len, &to[i]); + if (charlen < 1) { + jfs_err("jfs_strtoUCS: char2uni returned %d.", + charlen); + 
jfs_err("charset = %s, char = 0x%x", + codepage->charset, *from); + return charlen; + } + } + } else { + for (i = 0; (i < len) && from[i]; i++) + to[i] = (wchar_t) from[i]; + } + + to[i] = 0; + return i; +} + +/* + * NAME: get_UCSname() + * + * FUNCTION: Allocate and translate to unicode string + * + */ +int get_UCSname(struct component_name * uniName, struct dentry *dentry) +{ + struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab; + int length = dentry->d_name.len; + + if (length > JFS_NAME_MAX) + return -ENAMETOOLONG; + + uniName->name = + kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); + + if (uniName->name == NULL) + return -ENOMEM; + + uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, + length, nls_tab); + + if (uniName->namlen < 0) { + kfree(uniName->name); + return uniName->namlen; + } + + return 0; +} diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h new file mode 100644 index 00000000..8f0f02cb --- /dev/null +++ b/fs/jfs/jfs_unicode.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_UNICODE +#define _H_JFS_UNICODE + +#include +#include +#include "jfs_types.h" + +typedef struct { + wchar_t start; + wchar_t end; + signed char *table; +} UNICASERANGE; + +extern signed char UniUpperTable[512]; +extern UNICASERANGE UniUpperRange[]; +extern int get_UCSname(struct component_name *, struct dentry *); +extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); + +#define free_UCSname(COMP) kfree((COMP)->name) + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)); + return anchor; +} + + + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy_to_le: Copy length limited string with pad to little-endian + */ +static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = cpu_to_le16(*ucs2++); + + n++; + 
while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_from_le: Copy length limited string with pad from little-endian + */ +static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(wchar_t uc) +{ + UNICASERANGE *rp; + + if (uc < sizeof(UniUpperTable)) { /* Latin characters */ + return uc + UniUpperTable[uc]; /* Use base tables */ + } else { + rp = UniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t *UniStrupr(wchar_t * upin) +{ + wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif /* !_H_JFS_UNICODE */ diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c new file mode 100644 index 00000000..cfe50666 --- /dev/null +++ b/fs/jfs/jfs_uniupr.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "jfs_unicode.h" + +/* + * Latin upper case + */ +signed char UniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ + -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ + -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ + 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - 
Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, +}; + +/* + * Upper Case Range + */ +UNICASERANGE UniUpperRange[] = { + { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, + { 0x0430, 0x045f, UniCaseRangeU0430 }, + { 0x0490, 0x04cc, UniCaseRangeU0490 }, + { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, + { 0xff40, 0xff5a, UniCaseRangeUff40 }, + { 0 } +}; diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h new file mode 100644 index 00000000..e9e100fd --- /dev/null +++ b/fs/jfs/jfs_xattr.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef H_JFS_XATTR +#define H_JFS_XATTR + +/* + * jfs_ea_list describe the on-disk format of the extended attributes. + * I know the null-terminator is redundant since namelen is stored, but + * I am maintaining compatibility with OS/2 where possible. + */ +struct jfs_ea { + u8 flag; /* Unused? */ + u8 namelen; /* Length of name */ + __le16 valuelen; /* Length of value */ + char name[0]; /* Attribute name (includes null-terminator) */ +}; /* Value immediately follows name */ + +struct jfs_ea_list { + __le32 size; /* overall size */ + struct jfs_ea ea[0]; /* Variable length list */ +}; + +/* Macros for defining maxiumum number of bytes supported for EAs */ +#define MAXEASIZE 65535 +#define MAXEALISTSIZE MAXEASIZE + +/* + * some macros for dealing with variable length EA lists. + */ +#define EA_SIZE(ea) \ + (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \ + le16_to_cpu((ea)->valuelen)) +#define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea)))) +#define FIRST_EA(ealist) ((ealist)->ea) +#define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size) +#define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, + size_t, int); +extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, + int); +extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); +extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); +extern int jfs_removexattr(struct dentry *, const char *); + +#ifdef CONFIG_JFS_SECURITY +extern int jfs_init_security(tid_t, struct inode *, struct inode *, + const struct qstr *); +#else +static inline int jfs_init_security(tid_t tid, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif + +#endif /* H_JFS_XATTR */ diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c new file mode 100644 index 00000000..6c50871e --- /dev/null +++ b/fs/jfs/jfs_xtree.c @@ -0,0 +1,3905 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * jfs_xtree.c: extent allocation descriptor B+-tree manager + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_dinode.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * xtree local flag + */ +#define XT_INSERT 0x00000001 + +/* + * xtree key/entry comparison: extent offset + * + * return: + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent + */ +#define XT_CMP(CMP, K, X, OFFSET64)\ +{\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ +} + +/* write a xad entry */ +#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ +{\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ +} + +#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) + +/* get page buffer for specified block address */ +/* ToDo: Replace this ugly macro with a function */ +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ + if (!(RC))\ + {\ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ + (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ + (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ + {\ + jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ + BT_PUTPAGE(MP);\ + MP = NULL;\ + RC = -EIO;\ + }\ + }\ +} + +/* for consistency */ +#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) +/* xtree entry parameter descriptor */ +struct xtsplit { + struct metapage *mp; + s16 index; + u8 flag; + s64 off; + s64 addr; + int len; + struct pxdlist *pxdlist; +}; + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint search; + uint fastSearch; + uint split; +} xtStat; +#endif + + +/* + * forward references + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp, + struct btstack * btstack, int flag); + +static int xtSplitUp(tid_t tid, + struct inode *ip, + struct xtsplit * split, struct btstack * btstack); + +static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, + struct metapage ** rmpp, s64 * rbnp); + +static int xtSplitRoot(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp); + +#ifdef _STILL_TO_PORT +static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + xtpage_t * fp, struct btstack * btstack); + +static int xtSearchNode(struct inode *ip, + xad_t * xad, + int *cmpp, struct btstack * btstack, int flag); + +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); +#endif /* _STILL_TO_PORT */ + +/* + * xtLookup() + * + * function: map a single page into a physical extent; + */ +int xtLookup(struct inode *ip, s64 lstart, + s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index; + xad_t *xad; + s64 next, size, xoff, xend; + int xlen; + s64 xaddr; + + *paddr = 0; + *plen = llen; + + if (!no_check) { + /* is lookup offset beyond eof ? 
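+	 * (size below is i_size rounded up to file-system blocks;
+	 * offsets at or past it are not searched and the caller gets
+	 * rc 0 with *paddr still 0, i.e. a hole)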
*/ + size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + if (lstart >= size) + return 0; + } + + /* + * search for the xad entry covering the logical extent + */ +//search: + if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) { + jfs_err("xtLookup: xtSearch returned %d", rc); + return rc; + } + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad found covering start of logical extent ? + * lstart is a page start address, + * i.e., lstart cannot start in a hole; + */ + if (cmp) { + if (next) + *plen = min(next - lstart, llen); + goto out; + } + + /* + * lxd covered by xad + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xoff + xlen; + xaddr = addressXAD(xad); + + /* initialize new pxd */ + *pflag = xad->flag; + *paddr = xaddr + (lstart - xoff); + /* a page must be fully covered by an xad */ + *plen = min(xend - lstart, llen); + + out: + XT_PUTPAGE(mp); + + return rc; +} + +/* + * xtSearch() + * + * function: search for the xad entry covering specified offset. + * + * parameters: + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, + int *cmpp, struct btstack * btstack, int flag) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* page buffer */ + xtpage_t *p; /* page */ + xad_t *xad; + int base, index, lim, btindex; + struct btframe *btsp; + int nsplit = 0; /* number of pages to split */ + s64 t64; + s64 next = 0; + + INCREMENT(xtStat.search); + + BT_CLR(btstack); + + btstack->nsplit = 0; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. 
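+ *
+ * for illustration: with leaf entries at offsets {0, 8, 24}, each
+ * 8 blocks long, a search for xoff 10 hits the entry at offset 8
+ * (cmp 0); a search for xoff 20 misses (nonzero cmp) and reports
+ * next = 24, the offset of the following entry, so the caller can
+ * size the hole.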
+ */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* try sequential access heuristics with the previous + * access entry in target leaf page: + * once search narrowed down into the target leaf, + * key must either match an entry in the leaf or + * key entry does not exist in the tree; + */ +//fastSearch: + if ((jfs_ip->btorder & BT_SEQUENTIAL) && + (p->header.flag & BT_LEAF) && + (index = jfs_ip->btindex) < + le16_to_cpu(p->header.nextindex)) { + xad = &p->xad[index]; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* stop sequential access heuristics */ + goto binarySearch; + } else { /* (t64 + lengthXAD(xad)) <= xoff */ + + /* try next sequential entry */ + index++; + if (index < + le16_to_cpu(p->header.nextindex)) { + xad++; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* miss: key falls between + * previous and this entry + */ + *cmpp = 1; + next = t64; + goto out; + } + + /* (xoff >= t64 + lengthXAD(xad)); + * matching entry may be further out: + * stop heuristic search + */ + /* stop sequential access heuristics */ + goto binarySearch; + } + + /* (index == p->header.nextindex); + * miss: key entry does not exist in + * the target leaf/tree + */ + *cmpp = 1; + goto out; + } + + /* + * if hit, return index of the entry found, and + * if miss, where new entry with search key is + * to be inserted; + */ + out: + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == /* little-endian */ + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* update sequential access heuristics */ + jfs_ip->btindex = index; + + if (nextp) + *nextp = next; + + INCREMENT(xtStat.fastSearch); + return 0; + } + + /* well, ... full search now */ + binarySearch: + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (index == btindex || + index == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = index; + + return 0; + } + /* search hit - internal page: + * descend/search its child page + */ + if (index < le16_to_cpu(p->header.nextindex)-1) + next = offsetXAD(&p->xad[index + 1]); + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. 
+ */ + if (base < le16_to_cpu(p->header.nextindex)) + next = offsetXAD(&p->xad[base]); + /* + * search miss - leaf page: + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (base == btindex || base == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = base; + + if (nextp) + *nextp = next; + + return 0; + } + + /* + * search miss - non-leaf page: + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* update number of pages to split */ + if (p->header.nextindex == p->header.maxentry) + nsplit++; + else + nsplit = 0; + + /* push (bn, index) of the parent page/entry */ + if (BT_STACK_FULL(btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtSearch!"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + +/* + * xtInsert() + * + * function: + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - + * + * return: + */ +int xtInsert(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, + int flag) +{ + int rc = 0; + s64 xaddr, hint; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + s64 next; + struct tlock *tlck; + struct xtlock *xtlck; + + jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. 
+ */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* This test must follow XT_GETSEARCH since mp must be valid if + * we branch to out: */ + if ((cmp == 0) || (next && (xlen > next - xoff))) { + rc = -EEXIST; + goto out; + } + + /* + * allocate data extent requested + * + * allocation hint: last xad + */ + if ((xaddr = *xaddrp) == 0) { + if (index > XTENTRYSTART) { + xad = &p->xad[index - 1]; + hint = addressXAD(xad) + lengthXAD(xad) - 1; + } else + hint = 0; + if ((rc = dquot_alloc_block(ip, xlen))) + goto out; + if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { + dquot_free_block(ip, xlen); + goto out; + } + } + + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex == le16_to_cpu(p->header.maxentry)) { + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + if (*xaddrp == 0) { + dbFree(ip, xaddr, (s64) xlen); + dquot_free_block(ip, xlen); + } + return rc; + } + + *xaddrp = xaddr; + return 0; + } + + /* + * insert the new entry into the leaf page + */ + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + + /* if insert into middle, shift right remaining entries. */ + if (index < nextindex) + memmove(&p->xad[index + 1], &p->xad[index], + (nextindex - index) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? 
min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + *xaddrp = xaddr; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSplitUp() + * + * function: + * split full pages as propagating insertion up the tree + * + * parameter: + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() + * + * return: + */ +static int +xtSplitUp(tid_t tid, + struct inode *ip, struct xtsplit * split, struct btstack * btstack) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; /* split page */ + struct metapage *rmp; + s64 rbn; /* new right page block number */ + struct metapage *rcmp; + xtpage_t *rcp; /* right child page */ + s64 rcbn; /* right child page block number */ + int skip; /* index of entry of insertion */ + int nextindex; /* next available entry index of p */ + struct btframe *parent; /* parent page entry on traverse stack */ + xad_t *xad; + s64 xaddr; + int xlen; + int nsplit; /* number of pages split */ + struct pxdlist pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *xtlck; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + /* is inode xtree root extension/inline EA area free ? */ + if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && + (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && + (JFS_IP(ip)->mode2 & INLINEEA)) { + sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); + JFS_IP(ip)->mode2 &= ~INLINEEA; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + + /* if insert into middle, shift right remaining entries. */ + skip = split->index; + nextindex = le16_to_cpu(sp->header.nextindex); + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* advance next available entry index */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + return 0; + } + + /* + * allocate new index blocks to cover index page split(s) + * + * allocation hint: ? + */ + if (split->pxdlist == NULL) { + nsplit = btstack->nsplit; + split->pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + xlen = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++) { + if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) + == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + XT_PUTPAGE(smp); + return rc; + } + } + + /* + * Split leaf page into and a new right page . + * + * The split routines insert the new entry into the leaf page, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? 
+ xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + + XT_PUTPAGE(smp); + + if (rc) + return -EIO; + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 3 pages pinned at any time: + * right child, left parent and right parent (when the parent splits) + * to keep the child page pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages pinned */ + rcmp = rmp; + rcbn = rbn; + rcp = XT_PAGE(ip, rcmp); + + /* + * insert router entry in parent for new right child page + */ + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + XT_PUTPAGE(rcmp); + return rc; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * split or shift right remaining entries of the parent page + */ + nextindex = le16_to_cpu(sp->header.nextindex); + /* + * parent page is full - split the parent page + */ + if (nextindex == le16_to_cpu(sp->header.maxentry)) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->flag = XAD_NEW; + split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); + split->len = JFS_SBI(ip->i_sb)->nbperpage; + split->addr = rcbn; + + /* unpin previous right child page */ + XT_PUTPAGE(rcmp); + + /* The split routines insert the new entry, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) { + XT_PUTPAGE(smp); + return rc; + } + + XT_PUTPAGE(smp); + /* keep new child page pinned */ + } + /* + * parent page is not full - insert in parent page + */ + else { + /* + * insert router entry in parent for the right child + * page from the first entry of the right child page: + */ + /* + * acquire a transaction lock on the parent page; + * + * action: router xad insertion; + */ + BT_MARK_DIRTY(smp, ip); + + /* + * if insert into middle, shift right remaining entries + */ + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - + skip) << L2XTSLOTSIZE); + + /* insert the router entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, XAD_NEW, + offsetXAD(&rcp->xad[XTENTRYSTART]), + JFS_SBI(ip->i_sb)->nbperpage, rcbn); + + /* advance next available entry index. */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, + tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? 
+ min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin parent page */ + XT_PUTPAGE(smp); + + /* exit propagate up */ + break; + } + } + + /* unpin current right page */ + XT_PUTPAGE(rmp); + + return 0; +} + + +/* + * xtSplitPage() + * + * function: + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitPage(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + xtpage_t *p; + s64 nextbn; + int skip, maxentry, middle, righthalf, n; + xad_t *xad; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *sxtlck = NULL, *rxtlck = NULL; + int quota_allocation = 0; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + INCREMENT(xtStat.split); + + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) + goto clean_up; + + quota_allocation += lengthPXD(pxd); + + /* + * allocate the new right page for the split + */ + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) { + rc = -EIO; + goto clean_up; + } + + jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * action: new page; + */ + + rp = (xtpage_t *) rmp->data; + rp->header.self = *pxd; + rp->header.flag = sp->header.flag & BT_TYPE; + rp->header.maxentry = sp->header.maxentry; /* little-endian */ + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + BT_MARK_DIRTY(smp, ip); + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + /* + * acquire a transaction lock on the new right page; + */ + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + rxtlck = (struct xtlock *) & tlck->lock; + rxtlck->lwm.offset = XTENTRYSTART; + /* + * acquire a transaction lock on the split page + */ + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + sxtlck = (struct xtlock *) & tlck->lock; + } + + /* + * initialize/update sibling pointers of and + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + skip = split->index; + + /* + * sequential append at tail (after last entry of last page) + * + * if splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * if we're wrong it's no big deal - we will do the split the right + * way next time. + * (it may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Be my guest.) 
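+ *
+ * for illustration: appending to a full rightmost leaf
+ *	[ e0 ... e(max-1) ]
+ * leaves that page untouched and links a new right sibling holding
+ * only the appended entry
+ *	[ e0 ... e(max-1) ] -> [ new ]
+ * so a strictly sequential writer ends up with completely full
+ * leaves instead of the half-empty pages a 50/50 split would leave.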
+ */ + if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { + /* + * acquire a transaction lock on the new/right page; + * + * action: xad insertion; + */ + /* insert entry at the first entry of the new right page */ + xad = &rp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = 1; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return 0; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update previous pointer of old next/right page of + */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(rmp); + goto clean_up; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page; + * + * action:sibling pointer update; + */ + if (!test_cflag(COMMIT_Nolink, ip)) + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + p->header.prev = cpu_to_le64(rbn); + + /* sibling page may have been updated previously, or + * it may be updated later; + */ + + XT_PUTPAGE(mp); + } + + /* + * split the data between the split and new/right pages + */ + maxentry = le16_to_cpu(sp->header.maxentry); + middle = maxentry >> 1; + righthalf = maxentry - middle; + + /* + * skip index in old split/left page - insert into left page: + */ + if (skip <= middle) { + /* move right half of split page to the new right page */ + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + righthalf << L2XTSLOTSIZE); + + /* shift right tail of left half to make room for new entry */ + if (skip < middle) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (middle - skip) << L2XTSLOTSIZE); + + /* insert new entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle + 1); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(skip, (int)sxtlck->lwm.offset) : skip; + } + + rp->header.nextindex = + cpu_to_le16(XTENTRYSTART + righthalf); + } + /* + * skip index in new right page - insert into right page: + */ + else { + /* move left head of right half to right page */ + n = skip - middle; + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + n << L2XTSLOTSIZE); + + /* insert new entry */ + n += XTENTRYSTART; + xad = &rp->xad[n]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* move right tail of right half to right page */ + if (skip < maxentry) + memmove(&rp->xad[n + 1], &sp->xad[skip], + (maxentry - skip) << L2XTSLOTSIZE); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(middle, (int)sxtlck->lwm.offset) : middle; + } + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + + righthalf + 1); + } + + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - + sxtlck->lwm.offset; + + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return rc; + + clean_up: + + /* Rollback quota allocation. 
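+	 * (return the blocks charged to the inode's quota earlier in
+	 * this function, since the split is being abandoned)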
*/ + if (quota_allocation) + dquot_free_block(ip, quota_allocation); + + return (rc); +} + + +/* + * xtSplitRoot() + * + * function: + * split the full root page into original/root/split page and new + * right page + * i.e., root remains fixed in tree anchor (inode) and the root is + * copied to a single new right child page since root page << + * non-root page, and the split root page contains a single entry + * for the new right child page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitRoot(tid_t tid, + struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) +{ + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; + s64 rbn; + int skip, nextindex; + xad_t *xad; + pxd_t *pxd; + struct pxdlist *pxdlist; + struct tlock *tlck; + struct xtlock *xtlck; + int rc; + + sp = &JFS_IP(ip)->i_xtroot; + + INCREMENT(xtStat.split); + + /* + * allocate a single (right) child page + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); + + /* + * acquire a transaction lock on the new right page; + * + * action: new page; + */ + BT_MARK_DIRTY(rmp, ip); + + rp = (xtpage_t *) rmp->data; + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * copy the in-line root page into new right page extent + */ + nextindex = le16_to_cpu(sp->header.maxentry); + memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], + (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); + + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + skip = split->index; + /* if insert into middle, shift right remaining entries */ + if (skip != nextindex) + memmove(&rp->xad[skip + 1], &rp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + xad = &rp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); + + /* update page header */ + rp->header.nextindex = cpu_to_le16(nextindex + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + /* + * reset the root + * + * init root with the single entry for the new right page + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. 
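+ *
+ * after this step the in-inode root holds the single router entry
+ *	root: [ offset 0 -> rbn ]
+ * while the new right child page holds all entries copied from the
+ * old root plus the entry being inserted.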
+ */ + /* + * acquire a transaction lock on the root page (in-memory inode); + * + * action: root split; + */ + BT_MARK_DIRTY(split->mp, ip); + + xad = &sp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); + + /* update page header of root */ + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + + sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = 1; + } + + *rmpp = rmp; + + jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); + return 0; +} + + +/* + * xtExtend() + * + * function: extend in-place; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: alloc whole extended extent; + */ +int xtExtend(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* delta extent offset */ + s32 xlen, /* delta extent length */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, len; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + s64 xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + + jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* there must exist extent to be extended */ + if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); + return -EIO; + } + + /* extension must be contiguous */ + xad = &p->xad[index]; + if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); + return -EIO; + } + + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* extend will overflow extent ? */ + xlen = lengthXAD(xad) + xlen; + if ((len = xlen - MAXXLEN) <= 0) + goto extendOld; + + /* + * extent overflow: insert entry for new extent + */ +//insertNew: + xoff = offsetXAD(xad) + MAXXLEN; + xaddr = addressXAD(xad) + MAXXLEN; + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. 
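+ *
+ * (this insert path is only reached when the extended length would
+ * exceed MAXXLEN: the existing xad is capped at MAXXLEN at
+ * extendOld below, and the overflow portion becomes a new XAD_NEW
+ * entry starting MAXXLEN blocks into the extent)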
+ */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = len; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old entry */ + xad = &p->xad[index]; + xlen = MAXXLEN; + + /* + * extend old extent + */ + extendOld: + XADlength(xad, xlen); + if (!(xad->flag & XAD_NEW)) + xad->flag |= XAD_EXTENDED; + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + +#ifdef _NOTYET +/* + * xtTailgate() + * + * function: split existing 'tail' extent + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; + * + * note: existing extent may or may not have been committed. 
+ * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: free old split tail extent, alloc new extent; + */ +int xtTailgate(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* split/new extent offset */ + s32 xlen, /* new extent length */ + s64 xaddr, /* new extent address */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, llen, rlen; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + struct tlock *tlck; + struct xtlock *xtlck = 0; + struct tlock *mtlck; + struct maplock *pxdlock; + +/* +printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* there must exist extent to be tailgated */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); + return -EIO; + } + + /* entry found must be last entry */ + nextindex = le16_to_cpu(p->header.nextindex); + if (index != nextindex - 1) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTailgate: the entry found is not the last entry"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* completely replace extent ? */ + xad = &p->xad[index]; +/* +printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); +*/ + if ((llen = xoff - offsetXAD(xad)) == 0) + goto updateOld; + + /* + * partially replace extent: insert entry for new extent + */ +//insertNew: + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. 
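+ *
+ * (as in xtExtend(): if xtSplitUp() ends up splitting the leaf
+ * root, the original entries now live on a new child page, so the
+ * code below re-reads the page and descends to that child before
+ * touching the old xad again)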
+ */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old XAD */ + xad = &p->xad[index]; + + /* + * truncate/relocate old extent at split offset + */ + updateOld: + /* update dmap for old/committed/truncated extent */ + rlen = lengthXAD(xad) - llen; + if (!(xad->flag & XAD_NEW)) { + /* free from PWMAP at commit */ + if (!test_cflag(COMMIT_Nolink, ip)) { + mtlck = txMaplock(tid, ip, tlckMAP); + pxdlock = (struct maplock *) & mtlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); + PXDlength(&pxdlock->pxd, rlen); + pxdlock->index = 1; + } + } else + /* free from WMAP */ + dbFree(ip, addressXAD(xad) + llen, (s64) rlen); + + if (llen) + /* truncate */ + XADlength(xad, llen); + else + /* replace */ + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? 
+ min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#endif /* _NOTYET */ + +/* + * xtUpdate() + * + * function: update XAD; + * + * update extent for allocated_but_not_recorded or + * compressed extent; + * + * parameter: + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; + */ +int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) +{ /* new XAD */ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index0, index, newindex, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad, *lxad, *rxad; + int xflag; + s64 nxoff, xoff; + int nxlen, xlen, lxlen, rxlen; + s64 nxaddr, xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + int newpage = 0; + + /* there must exist extent to be tailgated */ + nxoff = offsetXAD(nxad); + nxlen = lengthXAD(nxad); + nxaddr = addressXAD(nxad); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + xad = &p->xad[index0]; + xflag = xad->flag; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* nXAD must be completely contained within XAD */ + if ((xoff > nxoff) || + (nxoff + nxlen > xoff + xlen)) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: nXAD in not completely contained within XAD"); + return -EIO; + } + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + +#ifdef _JFS_WIP_NOCOALESCE + if (xoff < nxoff) + goto updateRight; + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto out; + } else /* (nxlen < xlen) */ + goto updateLeft; +#endif /* _JFS_WIP_NOCOALESCE */ + +/* #ifdef _JFS_WIP_COALESCE */ + if (xoff < nxoff) + goto coalesceRight; + + /* + * coalesce with left XAD + */ +//coalesceLeft: /* (xoff == nxoff) */ + /* is XAD first entry of page ? */ + if (index == XTENTRYSTART) + goto replace; + + /* is nXAD logically and physically contiguous with lXAD ? */ + lxad = &p->xad[index - 1]; + lxlen = lengthXAD(lxad); + if (!(lxad->flag & XAD_NOTRECORDED) && + (nxoff == offsetXAD(lxad) + lxlen) && + (nxaddr == addressXAD(lxad) + lxlen) && + (lxlen + nxlen < MAXXLEN)) { + /* extend right lXAD */ + index0 = index - 1; + XADlength(lxad, lxlen + nxlen); + + /* If we just merged two extents together, need to make sure the + * right extent gets logged. If the left one is marked XAD_NEW, + * then we know it will be logged. 
Otherwise, mark as + * XAD_EXTENDED + */ + if (!(lxad->flag & XAD_NEW)) + lxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) { + /* truncate XAD */ + XADoffset(xad, xoff + nxlen); + XADlength(xad, xlen - nxlen); + XADaddress(xad, xaddr + nxlen); + goto out; + } else { /* (xlen == nxlen) */ + + /* remove XAD */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xoff = nxoff = offsetXAD(lxad); + xlen = nxlen = lxlen + nxlen; + xaddr = nxaddr = addressXAD(lxad); + goto coalesceRight; + } + } + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto coalesceRight; + } else /* (nxlen < xlen) */ + goto updateLeft; + + /* + * coalesce with right XAD + */ + coalesceRight: /* (xoff <= nxoff) */ + /* is XAD last entry of page ? */ + if (newindex == nextindex) { + if (xoff == nxoff) + goto out; + goto updateRight; + } + + /* is nXAD logically and physically contiguous with rXAD ? */ + rxad = &p->xad[index + 1]; + rxlen = lengthXAD(rxad); + if (!(rxad->flag & XAD_NOTRECORDED) && + (nxoff + nxlen == offsetXAD(rxad)) && + (nxaddr + nxlen == addressXAD(rxad)) && + (rxlen + nxlen < MAXXLEN)) { + /* extend left rXAD */ + XADoffset(rxad, nxoff); + XADlength(rxad, rxlen + nxlen); + XADaddress(rxad, nxaddr); + + /* If we just merged two extents together, need to make sure + * the left extent gets logged. If the right one is marked + * XAD_NEW, then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(rxad->flag & XAD_NEW)) + rxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) + /* truncate XAD */ + XADlength(xad, xlen - nxlen); + else { /* (xlen == nxlen) */ + + /* remove XAD */ + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + } + + goto out; + } else if (xoff == nxoff) + goto out; + + if (xoff >= nxoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); + return -EIO; + } +/* #endif _JFS_WIP_COALESCE */ + + /* + * split XAD into (lXAD, nXAD): + * + * |---nXAD---> + * --|----------XAD----------|-- + * |-lXAD-| + */ + updateRight: /* (xoff < nxoff) */ + /* truncate old XAD as lXAD:not_recorded */ + xad = &p->xad[index]; + XADlength(xad, nxoff - xoff); + + /* insert nXAD:recorded */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag & ~XAD_NOTRECORDED; + split.off = nxoff; + split.len = nxlen; + split.addr = nxaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, 
ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } else { + /* is nXAD on new page ? */ + if (newindex > + (le16_to_cpu(p->header.maxentry) >> 1)) { + newindex = + newindex - + le16_to_cpu(p->header.nextindex) + + XTENTRYSTART; + newpage = 1; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* + * does nXAD force 3-way split ? + * + * |---nXAD--->| + * --|----------XAD-------------|-- + * |-lXAD-| |-rXAD -| + */ + if (nxoff + nxlen == xoff + xlen) + goto out; + + /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ + if (newpage) { + /* close out old page */ + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + bn = le64_to_cpu(p->header.next); + XT_PUTPAGE(mp); + + /* get new right page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + index0 = index = newindex; + } else + index++; + + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xlen = xlen - (nxoff - xoff); + xoff = nxoff; + xaddr = nxaddr; + + /* recompute split pages */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + XT_PUTPAGE(mp); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); + return -EIO; + } + + if (index0 != index) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: unexpected value of index"); + return -EIO; + } + } + + /* + * split XAD into (nXAD, rXAD) + * + * ---nXAD---| + * --|----------XAD----------|-- + * |-rXAD-| + */ + updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ + /* update old XAD with nXAD:recorded */ + xad = &p->xad[index]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* insert rXAD:not_recorded */ + xoff = xoff + nxlen; + xlen = xlen - nxlen; + xaddr = xaddr + nxlen; + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate.updateLeft.split p:0x%p\n", p); +*/ + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + 
BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + out: + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtAppend() + * + * function: grow in append mode from contiguous region specified ; + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - + * + * return: + */ +int xtAppend(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, + s32 * xlenp, /* (in/out) */ + s64 * xaddrp, /* (in/out) */ + int flag) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn, xaddr; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + struct tlock *tlck; + struct xtlock *xtlck; + int nsplit, nblocks, xlen; + struct pxdlist pxdlist; + pxd_t *pxd; + s64 next; + + xaddr = *xaddrp; + xlen = *xlenp; + jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", + (ulong) xoff, maxblocks, xlen, (ulong) xaddr); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp == 0) { + rc = -EEXIST; + goto out; + } + + if (next) + xlen = min(xlen, (int)(next - xoff)); +//insert: + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. 
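+ *
+ * (unlike xtInsert(), the split path below does not ask dbAlloc()
+ * to find space: both the new index pages and the data extent are
+ * carved out of the caller-supplied contiguous region starting at
+ * xaddr and marked allocated with dbAllocBottomUp())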
+ */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex < le16_to_cpu(p->header.maxentry)) + goto insertLeaf; + + /* + * allocate new index blocks to cover index page split(s) + */ + nsplit = btstack.nsplit; + split.pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + nblocks = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, nblocks); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + goto out; + } + + xlen = min(xlen, maxblocks); + + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + dbFree(ip, *xaddrp, (s64) * xlenp); + + return rc; + } + + *xaddrp = xaddr; + *xlenp = xlen; + return 0; + + /* + * insert the new entry into the leaf page + */ + insertLeaf: + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + + *xaddrp = xaddr; + *xlenp = xlen; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#ifdef _STILL_TO_PORT + +/* - TBD for defragmentaion/reorganization - + * + * xtDelete() + * + * function: + * delete the entry with the specified key. + * + * N.B.: whole extent of the entry is assumed to be deleted. + * + * parameter: + * + * return: + * ENOENT: if the entry is not found. + * + * exception: + */ +int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * find the matching entry; xtSearch() pins the page + */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + if (cmp) { + /* unpin the leaf page */ + XT_PUTPAGE(mp); + return -ENOENT; + } + + /* + * delete the entry from the leaf page + */ + nextindex = le16_to_cpu(p->header.nextindex); + le16_add_cpu(&p->header.nextindex, -1); + + /* + * if the leaf page bocome empty, free the page + */ + if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) + return (xtDeleteUp(tid, ip, mp, p, &btstack)); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? 
min(index, xtlck->lwm.offset) : index; + + /* if delete from middle, shift left/compact the remaining entries */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) * sizeof(xad_t)); + + XT_PUTPAGE(mp); + + return 0; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int +xtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + s64 xaddr; + int xlen; + struct btframe *parent; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * keep root leaf page which has become empty + */ + if (fp->header.flag & BT_ROOT) { + /* keep the root page */ + fp->header.flag &= ~BT_INTERNAL; + fp->header.flag |= BT_LEAF; + fp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + return 0; + } + + /* + * free non-root leaf page + */ + if ((rc = xtRelink(tid, ip, fp))) { + XT_PUTPAGE(fmp); + return rc; + } + + xaddr = addressPXD(&fp->header.self); + xlen = lengthPXD(&fp->header.self); + /* free the page extent */ + dbFree(ip, xaddr, (s64) xlen); + + /* free the buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the index tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* delete the entry for the freed child page from parent. + */ + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * the parent has the single entry being deleted: + * free the parent page which has become empty. + */ + if (nextindex == 1) { + if (p->header.flag & BT_ROOT) { + /* keep the root page */ + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = + cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(mp); */ + + break; + } else { + /* free the parent page */ + if ((rc = xtRelink(tid, ip, p))) + return rc; + + xaddr = addressPXD(&p->header.self); + /* free the page extent */ + dbFree(ip, xaddr, + (s64) JFS_SBI(ip->i_sb)->nbperpage); + + /* unpin/free the buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + /* + * the parent has other entries remaining: + * delete the router entry from the parent page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + xtlck->lwm. 
+ offset) : index; + + /* if delete from middle, + * shift left/compact the remaining entries in the page + */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + le16_add_cpu(&p->header.nextindex, -1); + jfs_info("xtDeleteUp(entry): 0x%lx[%d]", + (ulong) parent->bn, index); + } + + /* unpin the parent page */ + XT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: xtRelocate() + * + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. + * + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. + */ +int +xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ + s64 nxaddr, /* new xaddr */ + int xtype) +{ /* extent type: XTPAGE or DATAEXT */ + int rc = 0; + struct tblock *tblk; + struct tlock *tlck; + struct xtlock *xtlck; + struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ + xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ + xad_t *xad; + pxd_t *pxd; + s64 xoff, xsize; + int xlen; + s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; + cbuf_t *cp; + s64 offset, nbytes, nbrd, pno; + int nb, npages, nblks; + s64 bn; + int cmp; + int index; + struct pxd_lock *pxdlock; + struct btstack btstack; /* traverse stack */ + + xtype = xtype & EXTENT_TYPE; + + xoff = offsetXAD(oxad); + oxaddr = addressXAD(oxad); + xlen = lengthXAD(oxad); + + /* validate extent offset */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + if (offset >= ip->i_size) + return -ESTALE; /* stale extent */ + + jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", + xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); + + /* + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; + */ + if (xtype == DATAEXT) { + /* search in leaf entry */ + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* validate for exact match with a single entry */ + xad = &pp->xad[index]; + if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + } else { /* (xtype == XTPAGE) */ + + /* search in internal entry */ + rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* xtSearchNode() validated for exact match with a single entry + */ + xad = &pp->xad[index]; + } + jfs_info("xtRelocate: parent xad entry validated."); + + /* + * 2. relocate the extent + */ + if (xtype == DATAEXT) { + /* if the extent is allocated-but-not-recorded + * there is no real data to be moved in this extent, + */ + if (xad->flag & XAD_NOTRECORDED) + goto out; + else + /* release xtpage for cmRead()/xtLookup() */ + XT_PUTPAGE(pmp); + + /* + * cmRelocate() + * + * copy target data pages to be relocated; + * + * data extent must start at page boundary and + * multiple of page size (except the last data extent); + * read in each page of the source data extent into cbuf, + * update the cbuf extent descriptor of the page to be + * homeward bound to new dst data extent + * copy the data from the old extent to new extent. 
+ * copy is essential for compressed files to avoid problems + * that can arise if there was a change in compression + * algorithms. + * it is a good strategy because it may disrupt cache + * policy to keep the pages in memory afterwards. + */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + assert((offset & CM_OFFSET) == 0); + nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; + pno = offset >> CM_L2BSIZE; + npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; +/* + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; +*/ + sxaddr = oxaddr; + dxaddr = nxaddr; + + /* process the request one cache buffer at a time */ + for (nbrd = 0; nbrd < nbytes; nbrd += nb, + offset += nb, pno++, npages--) { + /* compute page size */ + nb = min(nbytes - nbrd, CM_BSIZE); + + /* get the cache buffer of the page */ + if (rc = cmRead(ip, offset, npages, &cp)) + break; + + assert(addressPXD(&cp->cm_pxd) == sxaddr); + assert(!cp->cm_modified); + + /* bind buffer with the new extent address */ + nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; + cmSetXD(ip, cp, pno, dxaddr, nblks); + + /* release the cbuf, mark it as modified */ + cmPut(cp, true); + + dxaddr += nblks; + sxaddr += nblks; + } + + /* get back parent page */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("xtRelocate: target data extent relocated."); + } else { /* (xtype == XTPAGE) */ + + /* + * read in the target xtpage from the source extent; + */ + XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + if (rmp) + XT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling xtpages if any; + */ + if (lmp) { + BT_MARK_DIRTY(lmp, ip); + tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + lp->header.next = cpu_to_le64(nxaddr); + XT_PUTPAGE(lmp); + } + + if (rmp) { + BT_MARK_DIRTY(rmp, ip); + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + rp->header.prev = cpu_to_le64(nxaddr); + XT_PUTPAGE(rmp); + } + + /* + * update the target xtpage to be relocated + * + * update the self address of the target page + * and write to destination extent; + * redo image covers the whole xtpage since it is new page + * to the destination extent; + * update of bmap for the free of source extent + * of the target xtpage itself: + * update of bmap for the allocation of destination extent + * of the target xtpage itself: + * update of bmap for the extents covered by xad entries in + * the target xtpage is not necessary since they are not + * updated; + * if not committed before this relocation, + * target page may contain XAD_NEW entries which must + * be scanned for bmap update (logredo() always + * scan xtpage REDOPAGE image for bmap update); + * if committed before this relocation (tlckRELOCATE), + * scan may be skipped by commit() and logredo(); + */ + BT_MARK_DIRTY(mp, ip); + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock 
*) & tlck->lock; + + /* update the self address in the xtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* linelock for the after image of the whole page */ + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + + /* update the buffer extent descriptor of target xtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + bmSetXD(mp, nxaddr, xsize); + + /* unpin the target page to new homeward bound */ + XT_PUTPAGE(mp); + jfs_info("xtRelocate: target xtpage relocated."); + } + + /* + * 3. acquire maplock for the source extent to be freed; + * + * acquire a maplock saving the src relocated extent address; + * to free of the extent at commit time; + */ + out: + /* if DATAEXT relocation, write a LOG_UPDATEMAP record for + * free PXD of the source data extent (logredo() will update + * bmap for free of source data extent), and update bmap for + * free of the source data extent; + */ + if (xtype == DATAEXT) + tlck = txMaplock(tid, ip, tlckMAP); + /* if XTPAGE relocation, write a LOG_NOREDOPAGE record + * for the source xtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * xtpage), and update bmap for free of the source xtpage; + * N.B. We use tlckMAP instead of tlkcXTREE because there + * is no buffer associated with this lock since the buffer + * has been redirected to the target location. + */ + else /* (xtype == XTPAGE) */ + tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); + + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent xad entry for relocation; + * + * acquire tlck for the parent entry with XAD_NEW as entry + * update which will write LOG_REDOPAGE and update bmap for + * allocation of XAD_NEW destination extent; + */ + jfs_info("xtRelocate: update parent xad entry."); + BT_MARK_DIRTY(pmp, ip); + tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the XAD with the new destination extent; */ + xad = &pp->xad[index]; + xad->flag |= XAD_NEW; + XADaddress(xad, nxaddr); + + xtlck->lwm.offset = min(index, xtlck->lwm.offset); + xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - + xtlck->lwm.offset; + + /* unpin the parent xtpage */ + XT_PUTPAGE(pmp); + + return rc; +} + + +/* + * xtSearchNode() + * + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. + * + * parameters: + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. 
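+ * note: only internal (router) entries are examined; descending to a
+ * leaf page means the target entry was not found and -ESTALE is
+ * returned.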
+ */ +static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ + int *cmpp, struct btstack * btstack, int flag) +{ + int rc = 0; + s64 xoff, xaddr; + int xlen; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* page */ + int base, index, lim; + struct btframe *btsp; + s64 t64; + + BT_CLR(btstack); + + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + if (p->header.flag & BT_LEAF) { + XT_PUTPAGE(mp); + return -ESTALE; + } + + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + * + * verify for exact match; + */ + if (xaddr == addressXAD(&p->xad[index]) && + xoff == offsetXAD(&p->xad[index])) { + *cmpp = cmp; + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + return 0; + } + + /* descend/search its child page */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss - non-leaf page: + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + + +/* + * xtRelink() + * + * function: + * link around a freed page. 
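+ * (the prev pointer of the next sibling and the next pointer of the
+ * previous sibling are updated to bypass the page being freed)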
+ * + * Parameter: + * int tid, + * struct inode *ip, + * xtpage_t *p) + * + * returns: + */ +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) +{ + int rc = 0; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update prev pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.prev = cpu_to_le64(prevbn); + + XT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update next pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.next = le64_to_cpu(nextbn); + + XT_PUTPAGE(mp); + } + + return 0; +} +#endif /* _STILL_TO_PORT */ + + +/* + * xtInitRoot() + * + * initialize file root (inline in inode) + */ +void xtInitRoot(tid_t tid, struct inode *ip) +{ + xtpage_t *p; + + /* + * acquire a transaction lock on the root + * + * action: + */ + txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, + tlckXTREE | tlckNEW); + p = &JFS_IP(ip)->i_xtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + if (S_ISDIR(ip->i_mode)) + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); + else { + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); + ip->i_size = 0; + } + + + return; +} + + +/* + * We can run into a deadlock truncating a file with a large number of + * xtree pages (large fragmented file). A robust fix would entail a + * reservation system where we would reserve a number of metadata pages + * and tlocks which we would be guaranteed without a deadlock. Without + * this, a partial fix is to limit number of metadata pages we will lock + * in a single transaction. Currently we will truncate the file so that + * no more than 50 leaf pages will be locked. The caller of xtTruncate + * will be responsible for ensuring that the current transaction gets + * committed, and that subsequent transactions are created to truncate + * the file further if needed. + */ +#define MAX_TRUNCATE_LEAVES 50 + +/* + * xtTruncate() + * + * function: + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. + * + * parameter: + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * + * return: + * + * note: + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; + * 2. truncate index table of directory when last entry removed + * map update via tlock at commit time; + * PMAP: + * Call xtTruncate_pmap instead + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. 
truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; + * + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; + * + * pages may already have been tlocked by anonymous transactions + * during file growth (i.e., write) before truncation; + * + * except last truncated entry, deleted entries remains as is + * in the page (nextindex is updated) for other use + * (e.g., log/update allocation map): this avoid copying the page + * info but delay free of pages; + * + */ +s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) +{ + int rc = 0; + s64 teof; + struct metapage *mp; + xtpage_t *p; + s64 bn; + int index, nextindex; + xad_t *xad; + s64 xoff, xaddr; + int xlen, len, freexlen; + struct btstack btstack; + struct btframe *parent; + struct tblock *tblk = NULL; + struct tlock *tlck = NULL; + struct xtlock *xtlck = NULL; + struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + s64 nfreed; + int freed, log; + int locked_leaves = 0; + + /* save object truncation type */ + if (tid) { + tblk = tid_to_tblock(tid); + tblk->xflag |= flag; + } + + nfreed = 0; + + flag &= COMMIT_MAP; + assert(flag != COMMIT_PMAP); + + if (flag == COMMIT_PWMAP) + log = 1; + else { + log = 0; + xadlock.flag = mlckFREEXADLIST; + xadlock.index = 1; + } + + /* + * if the newsize is not an integral number of pages, + * the file between newsize and next page boundary will + * be cleared. + * if truncating into a file hole, it will cause + * a full block to be allocated for the logical block. + */ + + /* + * release page blocks of truncated region + * + * free the data blocks from the leaf index blocks. + * delete the parent index entries corresponding to + * the freed child data/index blocks. + * free the index blocks themselves which aren't needed + * in new sized file. + * + * index blocks are updated only if the blocks are to be + * retained in the new sized file. + * if type is PMAP, the data and index pages are NOT + * freed, and the data and index blocks are NOT freed + * from working map. + * (this will allow continued access of data/index of + * temporary file (zerolink count file truncated to zero-length)). + */ + teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + + /* Since this is the rightmost page at this level, and we may have + * already freed a page that was formerly to the right, let's make + * sure that the next pointer is zero. + */ + if (p->header.next) { + if (log) + /* + * Make sure this change to the header is logged. + * If we really truncate this leaf, the flag + * will be changed to tlckTRUNCATE + */ + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + BT_MARK_DIRTY(mp, ip); + p->header.next = 0; + } + + if (p->header.flag & BT_INTERNAL) + goto getChild; + + /* + * leaf page + */ + freed = 0; + + /* does region covered by leaf page precede Teof ? 
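+ * (if the last extent on this leaf ends at or below teof, nothing on
+ * this page needs freeing and the scan resumes at the parent)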
*/ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + if (teof >= xoff + xlen) { + XT_PUTPAGE(mp); + goto getParent; + } + + /* (re)acquire tlock of the leaf page */ + if (log) { + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + XT_PUTPAGE(mp); + newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + goto getParent; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + } + BT_MARK_DIRTY(mp, ip); + + /* + * scan backward leaf page entries + */ + for (; index >= XTENTRYSTART; index--) { + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * The "data" for a directory is indexed by the block + * device's address space. This metadata must be invalidated + * here + */ + if (S_ISDIR(ip->i_mode) && (teof == 0)) + invalidate_xad_metapages(ip, *xad); + /* + * entry beyond eof: continue scan of current page + * xad + * ---|---=======-------> + * eof + */ + if (teof < xoff) { + nfreed += xlen; + continue; + } + + /* + * (xoff <= teof): last entry to be deleted from page; + * If other entries remain in page: keep and update the page. + */ + + /* + * eof == entry_start: delete the entry + * xad + * -------|=======-------> + * eof + * + */ + if (teof == xoff) { + nfreed += xlen; + + if (index == XTENTRYSTART) + break; + + nextindex = index; + } + /* + * eof within the entry: truncate the entry. + * xad + * -------===|===-------> + * eof + */ + else if (teof < xoff + xlen) { + /* update truncated entry */ + len = teof - xoff; + freexlen = xlen - len; + XADlength(xad, len); + + /* save pxd of truncated extent in tlck */ + xaddr += len; + if (log) { /* COMMIT_PWMAP */ + xtlck->lwm.offset = (xtlck->lwm.offset) ? 
+ min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = index + 1 - + xtlck->lwm.offset; + xtlck->twm.offset = index; + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + } + /* free truncated extent */ + else { /* COMMIT_WMAP */ + + pxdlock = (struct pxd_lock *) & xadlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + + /* reset map lock */ + xadlock.flag = mlckFREEXADLIST; + } + + /* current entry is new last entry; */ + nextindex = index + 1; + + nfreed += freexlen; + } + /* + * eof beyond the entry: + * xad + * -------=======---|---> + * eof + */ + else { /* (xoff + xlen < teof) */ + + nextindex = index + 1; + } + + if (nextindex < le16_to_cpu(p->header.nextindex)) { + if (!log) { /* COMMIT_WAMP */ + xadlock.xdlist = &p->xad[nextindex]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + nextindex; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + p->header.nextindex = cpu_to_le16(nextindex); + } + + XT_PUTPAGE(mp); + + /* assert(freed == 0); */ + goto getParent; + } /* end scan of leaf page entries */ + + freed = 1; + + /* + * leaf page become empty: free the page if type != PMAP + */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free data extents covered by leaf [XTENTRYSTART:hwm); + * invalidate leaf if COMMIT_PWMAP; + * if (TRUNCATE), will write LOG_NOREDOPAGE; + */ + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WAMP */ + + /* free data extents covered by leaf */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); + } + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; + + /* invalidate empty leaf page */ + discard_metapage(mp); + } + } + + /* + * the leaf page become empty: delete the parent entry + * for the leaf page if the parent page is to be kept + * in the new sized file. + */ + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * child page was not empty: + */ + if (freed == 0) { + /* has any entry deleted from parent ? */ + if (index < le16_to_cpu(p->header.nextindex) - 1) { + /* (re)acquire tlock on the parent page */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckTRUNCATE: + * free child extents covered by parent [); + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + if (!(tlck->type & tlckTRUNCATE)) { + xtlck->hwm.offset = + le16_to_cpu(p->header. 
+ nextindex) - 1; + tlck->type = + tlckXTREE | tlckTRUNCATE; + } + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[index + 1]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + index - 1; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + p->header.nextindex = cpu_to_le16(index + 1); + } + XT_PUTPAGE(mp); + goto getParent; + } + + /* + * child page was empty: + */ + nfreed += lengthXAD(&p->xad[index]); + + /* + * During working map update, child page's tlock must be handled + * before parent's. This is because the parent's tlock will cause + * the child's disk space to be marked available in the wmap, so + * it's important that the child page be released by that time. + * + * ToDo: tlocks should be on doubly-linked list, so we can + * quickly remove it and add it to the end. + */ + + /* + * Move parent page's tlock to the end of the tid's tlock list + */ + if (log && mp->lid && (tblk->last != mp->lid) && + lid_to_tlock(mp->lid)->tid) { + lid_t lid = mp->lid; + struct tlock *prev; + + tlck = lid_to_tlock(lid); + + if (tblk->next == lid) + tblk->next = tlck->next; + else { + for (prev = lid_to_tlock(tblk->next); + prev->next != lid; + prev = lid_to_tlock(prev->next)) { + assert(prev->next); + } + prev->next = tlck->next; + } + lid_to_tlock(tblk->last)->next = lid; + tlck->next = 0; + tblk->last = lid; + } + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { + /* + * Shrink root down to allow inline + * EA (otherwise fsck complains) + */ + p->header.maxentry = + cpu_to_le16(XTROOTINITSLOT); + JFS_IP(ip)->mode2 |= INLINEEA; + } + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= + tlckFREELOCK; + + /* invalidate parent page */ + discard_metapage(mp); + } + + /* parent has become empty and freed: + * go back up to its parent page + */ + /* freed = 1; */ + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else { + /* try truncate region covered by preceding entry + * (process backward) + */ + index--; + + /* go back down to the child page corresponding + * to the entry + */ + goto getChild; + } + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = 
&p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + /* + * update file resource stat + */ + /* set size + */ + if (S_ISDIR(ip->i_mode) && !newsize) + ip->i_size = 1; /* fsck hates zero-length directories */ + else + ip->i_size = newsize; + + /* update quota allocation to reflect freed blocks */ + dquot_free_block(ip, nfreed); + + /* + * free tlock of invalidated pages + */ + if (flag == COMMIT_WMAP) + txFreelock(ip); + + return newsize; +} + + +/* + * xtTruncate_pmap() + * + * function: + * Perform truncate to zero length for deleted file, leaving the + * the xtree and working map untouched. This allows the file to + * be accessed via open file handles, while the delete of the file + * is committed to disk. + * + * parameter: + * tid_t tid, + * struct inode *ip, + * s64 committed_size) + * + * return: new committed size + * + * note: + * + * To avoid deadlock by holding too many transaction locks, the + * truncation may be broken up into multiple transactions. + * The committed_size keeps track of part of the file has been + * freed from the pmaps. + */ +s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) +{ + s64 bn; + struct btstack btstack; + int cmp; + int index; + int locked_leaves = 0; + struct metapage *mp; + xtpage_t *p; + struct btframe *parent; + int rc; + struct tblock *tblk; + struct tlock *tlck = NULL; + xad_t *xad; + int xlen; + s64 xoff; + struct xtlock *xtlck = NULL; + + /* save object truncation type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* clear stack */ + BT_CLR(&btstack); + + if (committed_size) { + xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTruncate_pmap: did not find extent"); + return -EIO; + } + } else { + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + } + + /* + * leaf page + */ + + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + XT_PUTPAGE(mp); + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckFREE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = index; + + + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + 
xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + if (p->header.flag & BT_ROOT) { + + goto out; + } else { + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else + index--; + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + + return 0; +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_xtstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Xtree statistics\n" + "====================\n" + "searches = %d\n" + "fast searches = %d\n" + "splits = %d\n", + xtStat.search, + xtStat.fastSearch, + xtStat.split); + return 0; +} + +static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_xtstat_proc_show, NULL); +} + +const struct file_operations jfs_xtstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_xtstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h new file mode 100644 index 00000000..08c0c749 --- /dev/null +++ b/fs/jfs/jfs_xtree.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_XTREE +#define _H_JFS_XTREE + +/* + * jfs_xtree.h: extent allocation descriptor B+-tree manager + */ + +#include "jfs_btree.h" + + +/* + * extent allocation descriptor (xad) + */ +typedef struct xad { + unsigned flag:8; /* 1: flag */ + unsigned rsvrd:16; /* 2: reserved */ + unsigned off1:8; /* 1: offset in unit of fsblksize */ + __le32 off2; /* 4: offset in unit of fsblksize */ + unsigned len:24; /* 3: length in unit of fsblksize */ + unsigned addr1:8; /* 1: address in unit of fsblksize */ + __le32 addr2; /* 4: address in unit of fsblksize */ +} xad_t; /* (16) */ + +#define MAXXLEN ((1 << 24) - 1) + +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 + +/* xad_t field construction */ +#define XADoffset(xad, offset64)\ +{\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ +} +#define XADaddress(xad, address64)\ +{\ + (xad)->addr1 = ((u64)address64) >> 32;\ + (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} +#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) + +/* xad_t field extraction */ +#define offsetXAD(xad)\ + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) +#define addressXAD(xad)\ + ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) +#define lengthXAD(xad) __le24_to_cpu((xad)->len) + +/* xad list */ +struct xadlist { + s16 maxnxad; + s16 nxad; + xad_t *xad; +}; + +/* xad_t flags */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ +#define XAD_COW 0x10 /* copy-on-write */ + + +/* possible values for maxentry */ +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 + +/* + * xtree page: + */ +typedef union { + struct xtheader { + __le64 next; /* 8: */ + __le64 prev; /* 8: */ + + u8 flag; /* 1: */ + u8 rsrvd1; /* 1: */ + __le16 nextindex; /* 2: next index = number of entries */ + __le16 maxentry; /* 2: max number of entries */ + __le16 rsrvd2; /* 2: */ + + pxd_t self; /* 8: self */ + } header; /* (32) */ + + xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ +} xtpage_t; + +/* + * external declaration + */ +extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, + int *pflag, s64 * paddr, int *plen, int flag); +extern void xtInitRoot(tid_t tid, struct inode *ip); +extern int xtInsert(tid_t tid, struct inode *ip, + int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); +extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +#ifdef _NOTYET +extern int xtTailgate(tid_t tid, struct inode *ip, + s64 xoff, int xlen, s64 xaddr, int flag); +#endif +extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); +extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); +extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); +extern int xtRelocate(tid_t tid, struct inode *ip, + xad_t * oxad, s64 nxaddr, int xtype); +extern int xtAppend(tid_t tid, + struct inode *ip, int xflag, s64 xoff, int maxblocks, + int *xlenp, s64 * xaddrp, int flag); +#endif /* !_H_JFS_XTREE */ diff --git 
a/fs/jfs/namei.c b/fs/jfs/namei.c new file mode 100644 index 00000000..eaaf2b51 --- /dev/null +++ b/fs/jfs/namei.c @@ -0,0 +1,1639 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_inode.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +/* + * forward references + */ +const struct dentry_operations jfs_ci_dentry_operations; + +static s64 commitZeroLink(tid_t, struct inode *); + +/* + * NAME: free_ea_wmap(inode) + * + * FUNCTION: free uncommitted extended attributes from working map + * + */ +static inline void free_ea_wmap(struct inode *inode) +{ + dxd_t *ea = &JFS_IP(inode)->ea; + + if (ea->flag & DXD_EXTENT) { + /* free EA pages from cache */ + invalidate_dxd_metapages(inode, *ea); + dbFree(inode, addressDXD(ea), lengthDXD(ea)); + } + ea->flag = 0; +} + +/* + * NAME: jfs_create(dip, dentry, mode) + * + * FUNCTION: create a regular file in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of new file + * mode - create mode (rwxrwxrwx). + * nd- nd struct + * + * RETURN: Errors from subroutines + * + */ +static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + + dquot_initialize(dip); + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. 
+ */ + ip = ialloc(dip, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_create: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child XAD tree root in-line in inode + */ + xtInitRoot(tid, ip); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_create: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + ip->i_fop = &jfs_file_operations; + ip->i_mapping->a_ops = &jfs_aops; + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out2: + free_UCSname(&dname); + + out1: + + jfs_info("jfs_create: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_mkdir(dip, dentry, mode) + * + * FUNCTION: create a child directory in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of child directory + * mode - create mode (rwxrwxrwx). + * + * RETURN: Errors from subroutines + * + * note: + * EACCESS: user needs search+write permission on the parent directory + */ +static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + + dquot_initialize(dip); + + /* link count overflow on parent directory ? */ + if (dip->i_nlink == JFS_LINK_MAX) { + rc = -EMLINK; + goto out1; + } + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. 
+ */ + ip = ialloc(dip, S_IFDIR | mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_mkdir: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child directory in-line in inode + */ + dtInitRoot(tid, ip, dip->i_ino); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_mkdir: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + ip->i_nlink = 2; /* for '.' */ + ip->i_op = &jfs_dir_inode_operations; + ip->i_fop = &jfs_dir_operations; + + mark_inode_dirty(ip); + + /* update parent directory inode */ + inc_nlink(dip); /* for '..' from child directory */ + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out2: + free_UCSname(&dname); + + + out1: + + jfs_info("jfs_mkdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_rmdir(dip, dentry) + * + * FUNCTION: remove a link to child directory + * + * PARAMETER: dip - parent inode + * dentry - child directory dentry + * + * RETURN: -EINVAL - if name is . or .. + * -EINVAL - if . or .. exist but are invalid. + * errors from subroutines + * + * note: + * if other threads have the directory open when the last link + * is removed, the "." and ".." entries, if present, are removed before + * rmdir() returns and no new entries may be created in the directory, + * but the directory is not removed until the last reference to + * the directory is released (cf.unlink() of regular file). + */ +static int jfs_rmdir(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = dentry->d_inode; + ino_t ino; + struct component_name dname; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); + + /* Init inode for quota operations. 
*/ + dquot_initialize(dip); + dquot_initialize(ip); + + /* directory must be empty to be removed */ + if (!dtEmpty(ip)) { + rc = -ENOTEMPTY; + goto out; + } + + if ((rc = get_UCSname(&dname, dentry))) { + goto out; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + + /* + * delete the entry of target directory from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_rmdir: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + goto out2; + } + + /* update parent directory's link count corresponding + * to ".." entry of the target directory deleted + */ + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + inode_dec_link_count(dip); + + /* + * OS/2 could have created EA and/or ACL + */ + /* free EA from both persistent and working map */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + /* free EA pages */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + } + JFS_IP(ip)->ea.flag = 0; + + /* free ACL from both persistent and working map */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + /* free ACL pages */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + } + JFS_IP(ip)->acl.flag = 0; + + /* mark the target directory as deleted */ + clear_nlink(ip); + mark_inode_dirty(ip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out2: + free_UCSname(&dname); + + out: + jfs_info("jfs_rmdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_unlink(dip, dentry) + * + * FUNCTION: remove a link to object named by + * from parent directory + * + * PARAMETER: dip - inode of parent directory + * dentry - dentry of object to be removed + * + * RETURN: errors from subroutines + * + * note: + * temporary file: if one or more processes have the file open + * when the last link is removed, the link will be removed before + * unlink() returns, but the removal of the file contents will be + * postponed until all references to the files are closed. + * + * JFS does NOT support unlink() on directories. + * + */ +static int jfs_unlink(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = dentry->d_inode; + ino_t ino; + struct component_name dname; /* object name */ + struct inode *iplist[2]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); + + /* Init inode for quota operations. 
*/ + dquot_initialize(dip); + dquot_initialize(ip); + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + /* + * delete the entry of target file from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_unlink: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + goto out1; + } + + ASSERT(ip->i_nlink); + + ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + /* update target's inode */ + inode_dec_link_count(ip); + + /* + * commit zero link count object + */ + if (ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + rc = new_size; + goto out1; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + /* + * If xtTruncate was incomplete, commit synchronously to avoid + * timing complications + */ + rc = txCommit(tid, 2, &iplist[0], commit_flag); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(dip->i_sb, 0); + mutex_lock(&JFS_IP(ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + } else + rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } + + if (ip->i_nlink == 0) + set_cflag(COMMIT_Nolink, ip); + + IWRITE_UNLOCK(ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out1: + free_UCSname(&dname); + out: + jfs_info("jfs_unlink: rc:%d", rc); + return rc; +} + +/* + * NAME: commitZeroLink() + * + * FUNCTION: for non-directory, called by jfs_remove(), + * truncate a regular file, directory or symbolic + * link to zero length. return 0 if type is not + * one of these. + * + * if the file is currently associated with a VM segment + * only permanent disk and inode map resources are freed, + * and neither the inode nor indirect blocks are modified + * so that the resources can be later freed in the work + * map by ctrunc1. + * if there is no VM segment on entry, the resources are + * freed in both work and permanent map. + * (? for temporary file - memory object is cached even + * after no reference: + * reference count > 0 - ) + * + * PARAMETERS: cd - pointer to commit data structure. + * current inode is the one to truncate. 
+ * + * RETURN: Errors from subroutines + */ +static s64 commitZeroLink(tid_t tid, struct inode *ip) +{ + int filetype; + struct tblock *tblk; + + jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip); + + filetype = ip->i_mode & S_IFMT; + switch (filetype) { + case S_IFREG: + break; + case S_IFLNK: + /* fast symbolic link */ + if (ip->i_size < IDATASIZE) { + ip->i_size = 0; + return 0; + } + break; + default: + assert(filetype != S_IFDIR); + return 0; + } + + set_cflag(COMMIT_Freewmap, ip); + + /* mark transaction of block map update type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache if COMMIT_PWMAP, + * free xtree/data blocks from persistent block map, and + * free xtree/data blocks from working block map if COMMIT_PWMAP; + */ + if (ip->i_size) + return xtTruncate_pmap(tid, ip, 0); + + return 0; +} + + +/* + * NAME: jfs_free_zero_link() + * + * FUNCTION: for non-directory, called by iClose(), + * free resources of a file from cache and WORKING map + * for a file previously committed with zero link count + * while associated with a pager object, + * + * PARAMETER: ip - pointer to inode of file. + */ +void jfs_free_zero_link(struct inode *ip) +{ + int type; + + jfs_info("jfs_free_zero_link: ip = 0x%p", ip); + + /* return if not reg or symbolic link or if size is + * already ok. + */ + type = ip->i_mode & S_IFMT; + + switch (type) { + case S_IFREG: + break; + case S_IFLNK: + /* if its contained in inode nothing to do */ + if (ip->i_size < IDATASIZE) + return; + break; + default: + return; + } + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->ea); + int xlen = lengthDXD(&JFS_IP(ip)->ea); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + /* free EA pages from cache */ + invalidate_dxd_metapages(ip, JFS_IP(ip)->ea); + + /* free EA extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->acl); + int xlen = lengthDXD(&JFS_IP(ip)->acl); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + invalidate_dxd_metapages(ip, JFS_IP(ip)->acl); + + /* free ACL extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache, and + * free xtree/data blocks from working block map; + */ + if (ip->i_size) + xtTruncate(0, ip, 0, COMMIT_WMAP); +} + +/* + * NAME: jfs_link(vp, dvp, name, crp) + * + * FUNCTION: create a link to by the name = + * in the parent directory + * + * PARAMETER: vp - target object + * dvp - parent directory of new link + 
* name - name of new link to target object + * crp - credential + * + * RETURN: Errors from subroutines + * + * note: + * JFS does NOT support link() on directories (to prevent circular + * path in the directory hierarchy); + * EPERM: the target object is a directory, and either the caller + * does not have appropriate privileges or the implementation prohibits + * using link() on directories [XPG4.2]. + * + * JFS does NOT support links between file systems: + * EXDEV: target object and new link are on different file systems and + * implementation does not support links between file systems [XPG4.2]. + */ +static int jfs_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + int rc; + tid_t tid; + struct inode *ip = old_dentry->d_inode; + ino_t ino; + struct component_name dname; + struct btstack btstack; + struct inode *iplist[2]; + + jfs_info("jfs_link: %s %s", old_dentry->d_name.name, + dentry->d_name.name); + + if (ip->i_nlink == JFS_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + + tid = txBegin(ip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + /* + * scan parent directory for entry/freespace + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) + goto free_dname; + + /* + * create entry for new link in parent directory + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) + goto free_dname; + + /* update object inode */ + inc_nlink(ip); /* for new link */ + ip->i_ctime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + mark_inode_dirty(dir); + ihold(ip); + + iplist[0] = ip; + iplist[1] = dir; + rc = txCommit(tid, 2, &iplist[0], 0); + + if (rc) { + ip->i_nlink--; /* never instantiated */ + iput(ip); + } else + d_instantiate(dentry, ip); + + free_dname: + free_UCSname(&dname); + + out: + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + + jfs_info("jfs_link: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_symlink(dip, dentry, name) + * + * FUNCTION: creates a symbolic link to by name + * in directory + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link + * + * RETURN: errors from subroutines + * + * note: + * ENAMETOOLONG: pathname resolution of a symbolic link produced + * an intermediate result whose length exceeds PATH_MAX [XPG4.2] +*/ + +static int jfs_symlink(struct inode *dip, struct dentry *dentry, + const char *name) +{ + int rc; + tid_t tid; + ino_t ino = 0; + struct component_name dname; + int ssize; /* source pathname size */ + struct btstack btstack; + struct inode *ip = dentry->d_inode; + unchar *i_fastsymlink; + s64 xlen = 0; + int bmask = 0, xsize; + s64 extent = 0, xaddr; + struct metapage *mp; + struct super_block *sb; + struct tblock *tblk; + + struct inode *iplist[2]; + + jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + + dquot_initialize(dip); + + ssize = strlen(name) + 1; + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * allocate on-disk/in-memory inode for symbolic link: + * (iAlloc() returns new, locked inode) + */ + ip = ialloc(dip, S_IFLNK | 0777); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } 
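+	/*
+	 * From here the new symlink is built under a single transaction:
+	 * take the commit mutexes on parent and child, initialize
+	 * security attributes, write the target path (inline in the
+	 * on-disk inode for short paths, in an allocated extent
+	 * otherwise), insert the directory entry, and commit both
+	 * inodes together.  On failure the new inode is discarded.
+	 */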
+ + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) + goto out3; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + /* fix symlink access permission + * (dir_create() ANDs in the u.u_cmask, + * but symlinks really need to be 777 access) + */ + ip->i_mode |= 0777; + + /* + * write symbolic link target path name + */ + xtInitRoot(tid, ip); + + /* + * write source path name inline in on-disk inode (fast symbolic link) + */ + + if (ssize <= IDATASIZE) { + ip->i_op = &jfs_fast_symlink_inode_operations; + + i_fastsymlink = JFS_IP(ip)->i_inline; + memcpy(i_fastsymlink, name, ssize); + ip->i_size = ssize - 1; + + /* + * if symlink is > 128 bytes, we don't have the space to + * store inline extended attributes + */ + if (ssize > sizeof (JFS_IP(ip)->i_inline)) + JFS_IP(ip)->mode2 &= ~INLINEEA; + + jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + ssize, name); + } + /* + * write source path name in a single extent + */ + else { + jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); + + ip->i_op = &jfs_symlink_inode_operations; + ip->i_mapping->a_ops = &jfs_aops; + + /* + * even though the data of symlink object (source + * path name) is treated as non-journaled user data, + * it is read/written thru buffer cache for performance. + */ + sb = ip->i_sb; + bmask = JFS_SBI(sb)->bsize - 1; + xsize = (ssize + bmask) & ~bmask; + xaddr = 0; + xlen = xsize >> JFS_SBI(sb)->l2bsize; + if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { + txAbort(tid, 0); + goto out3; + } + extent = xaddr; + ip->i_size = ssize - 1; + while (ssize) { + /* This is kind of silly since PATH_MAX == 4K */ + int copy_size = min(ssize, PSIZE); + + mp = get_metapage(ip, xaddr, PSIZE, 1); + + if (mp == NULL) { + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + rc = -EIO; + txAbort(tid, 0); + goto out3; + } + memcpy(mp->data, name, copy_size); + flush_metapage(mp); + ssize -= copy_size; + name += copy_size; + xaddr += JFS_SBI(sb)->nbperpage; + } + } + + /* + * create entry for symbolic link in parent directory + */ + rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE); + if (rc == 0) { + ino = ip->i_ino; + rc = dtInsert(tid, dip, &dname, &ino, &btstack); + } + if (rc) { + if (xlen) + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + txAbort(tid, 0); + /* discard new inode */ + goto out3; + } + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + /* + * commit update of parent directory and link object + */ + + iplist[0] = dip; + iplist[1] = ip; + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out2: + free_UCSname(&dname); + + out1: + jfs_info("jfs_symlink: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_rename + * + * FUNCTION: rename a file or directory + */ +static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btstack btstack; + ino_t ino; + struct component_name new_dname; + struct inode *new_ip; + struct component_name old_dname; + struct inode *old_ip; + int rc; + tid_t tid; + struct 
tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + int ipcount; + struct inode *iplist[4]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + + jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, + new_dentry->d_name.name); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_ip = old_dentry->d_inode; + new_ip = new_dentry->d_inode; + + if ((rc = get_UCSname(&old_dname, old_dentry))) + goto out1; + + if ((rc = get_UCSname(&new_dname, new_dentry))) + goto out2; + + /* + * Make sure source inode number is what we think it is + */ + rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); + if (rc || (ino != old_ip->i_ino)) { + rc = -ENOENT; + goto out3; + } + + /* + * Make sure dest inode number (if any) is what we think it is + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); + if (!rc) { + if ((!new_ip) || (ino != new_ip->i_ino)) { + rc = -ESTALE; + goto out3; + } + } else if (rc != -ENOENT) + goto out3; + else if (new_ip) { + /* no entry exists, but one was expected */ + rc = -ESTALE; + goto out3; + } + + if (S_ISDIR(old_ip->i_mode)) { + if (new_ip) { + if (!dtEmpty(new_ip)) { + rc = -ENOTEMPTY; + goto out3; + } + } else if ((new_dir != old_dir) && + (new_dir->i_nlink == JFS_LINK_MAX)) { + rc = -EMLINK; + goto out3; + } + } else if (new_ip) { + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); + /* Init inode for quota operations. */ + dquot_initialize(new_ip); + } + + /* + * The real work starts here + */ + tid = txBegin(new_dir->i_sb, 0); + + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. 
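+	 * The lock classes passed to mutex_lock_nested() below
+	 * (COMMIT_MUTEX_PARENT, _CHILD, _SECOND_PARENT and _VICTIM)
+	 * encode that ordering for lockdep.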
+ */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); + if (old_dir != new_dir) + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); + + if (new_ip) { + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); + /* + * Change existing directory entry to new inode number + */ + ino = new_ip->i_ino; + rc = dtModify(tid, new_dir, &new_dname, &ino, + old_ip->i_ino, JFS_RENAME); + if (rc) + goto out4; + drop_nlink(new_ip); + if (S_ISDIR(new_ip->i_mode)) { + drop_nlink(new_ip); + if (new_ip->i_nlink) { + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + if (!S_ISDIR(old_ip->i_mode) && new_ip) + IWRITE_UNLOCK(new_ip); + jfs_error(new_ip->i_sb, + "jfs_rename: new_ip->i_nlink != 0"); + return -EIO; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else if (new_ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, new_ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, new_ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + goto out4; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else { + new_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(new_ip); + } + } else { + /* + * Add new directory entry + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, + JFS_CREATE); + if (rc) { + jfs_err("jfs_rename didn't expect dtSearch to fail " + "w/rc = %d", rc); + goto out4; + } + + ino = old_ip->i_ino; + rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); + if (rc) { + if (rc == -EIO) + jfs_err("jfs_rename: dtInsert returned -EIO"); + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) + inc_nlink(new_dir); + } + /* + * Remove old directory entry + */ + + ino = old_ip->i_ino; + rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); + if (rc) { + jfs_err("jfs_rename did not expect dtDelete to return rc = %d", + rc); + txAbort(tid, 1); /* Marks Filesystem dirty */ + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) { + drop_nlink(old_dir); + if (old_dir != new_dir) { + /* + * Change inode number of parent for moved directory + */ + + JFS_IP(old_ip)->i_dtroot.header.idotdot = + cpu_to_le32(new_dir->i_ino); + + /* Linelock header of dtree */ + tlck = txLock(tid, old_ip, + (struct metapage *) &JFS_IP(old_ip)->bxflag, + tlckDTREE | tlckBTROOT | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + } + } + + /* + * Update ctime on changed/moved inodes & mark dirty + */ + old_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_ip); + + new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); + mark_inode_dirty(new_dir); + + /* Build list of inodes modified by this transaction */ + ipcount = 0; + iplist[ipcount++] = old_ip; + if (new_ip) + iplist[ipcount++] = new_ip; + iplist[ipcount++] = old_dir; + + if (old_dir != new_dir) { + iplist[ipcount++] = new_dir; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + mark_inode_dirty(old_dir); + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. 
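+	 * The truncation begun by commitZeroLink() is finished further
+	 * below, after the commit mutexes are released, in additional
+	 * synchronous transactions driven by the returned new_size.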
+ */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + rc = txCommit(tid, ipcount, iplist, commit_flag); + + out4: + txEnd(tid); + if (new_ip) + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(new_ip->i_sb, 0); + mutex_lock(&JFS_IP(new_ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, new_ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); + rc = new_size; + } else + rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + } + if (new_ip && (new_ip->i_nlink == 0)) + set_cflag(COMMIT_Nolink, new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, old_dir)) { + if (old_dir->i_size > 1) + jfs_truncate_nolock(old_dir, 0); + + clear_cflag(COMMIT_Stale, old_dir); + } + + jfs_info("jfs_rename: returning %d", rc); + return rc; +} + + +/* + * NAME: jfs_mknod + * + * FUNCTION: Create a special file (device) + */ +static int jfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct jfs_inode_info *jfs_ip; + struct btstack btstack; + struct component_name dname; + ino_t ino; + struct inode *ip; + struct inode *iplist[2]; + int rc; + tid_t tid; + struct tblock *tblk; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + jfs_info("jfs_mknod: %s", dentry->d_name.name); + + dquot_initialize(dir); + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + ip = ialloc(dir, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out1; + } + jfs_ip = JFS_IP(ip); + + tid = txBegin(dir->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dir); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dir, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) { + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) { + txAbort(tid, 0); + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + jfs_ip->dev = new_encode_dev(rdev); + init_special_inode(ip, ip->i_mode, rdev); + + mark_inode_dirty(ip); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dir); + + iplist[0] = dir; + iplist[1] = ip; + rc = txCommit(tid, 2, iplist, 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out1: + free_UCSname(&dname); + + out: + jfs_info("jfs_mknod: returning %d", rc); + return rc; +} + +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) +{ + struct btstack btstack; + ino_t inum; + struct inode *ip; + struct component_name key; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; 
+ int rc; + + jfs_info("jfs_lookup: name = %s", name); + + if ((name[0] == '.') && (len == 1)) + inum = dip->i_ino; + else if (strcmp(name, "..") == 0) + inum = PARENT(dip); + else { + if ((rc = get_UCSname(&key, dentry))) + return ERR_PTR(rc); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + free_UCSname(&key); + if (rc == -ENOENT) { + d_add(dentry, NULL); + return NULL; + } else if (rc) { + jfs_err("jfs_lookup: dtSearch returned %d", rc); + return ERR_PTR(rc); + } + } + + ip = jfs_iget(dip->i_sb, inum); + if (IS_ERR(ip)) { + jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum); + return ERR_CAST(ip); + } + + return d_splice_alias(ip, dentry); +} + +static struct inode *jfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = jfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_get_parent(struct dentry *dentry) +{ + unsigned long parent_ino; + + parent_ino = + le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); + + return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino)); +} + +const struct inode_operations jfs_dir_inode_operations = { + .create = jfs_create, + .lookup = jfs_lookup, + .link = jfs_link, + .unlink = jfs_unlink, + .symlink = jfs_symlink, + .mkdir = jfs_mkdir, + .rmdir = jfs_rmdir, + .mknod = jfs_mknod, + .rename = jfs_rename, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .check_acl = jfs_check_acl, +#endif +}; + +const struct file_operations jfs_dir_operations = { + .read = generic_read_dir, + .readdir = jfs_readdir, + .fsync = jfs_fsync, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif + .llseek = generic_file_llseek, +}; + +static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, + struct qstr *this) +{ + unsigned long hash; + int i; + + hash = init_name_hash(); + for (i=0; i < this->len; i++) + hash = partial_name_hash(tolower(this->name[i]), hash); + this->hash = end_name_hash(hash); + + return 0; +} + +static int jfs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + int i, result = 1; + + if (len != name->len) + goto out; + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) + goto out; + } + result = 0; +out: + return result; +} + +static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. 
So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (dentry->d_inode) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + return 1; +} + +const struct dentry_operations jfs_ci_dentry_operations = +{ + .d_hash = jfs_ci_hash, + .d_compare = jfs_ci_compare, + .d_revalidate = jfs_ci_revalidate, +}; diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c new file mode 100644 index 00000000..8d0c1c7c --- /dev/null +++ b/fs/jfs/resize.c @@ -0,0 +1,543 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) + +/* convert block number to bmap file page number */ +#define BLKTODMAPN(b)\ + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + +/* + * jfs_extendfs() + * + * function: extend file system; + * + * |-------------------------------|----------|----------| + * file system space fsck inline log + * workspace space + * + * input: + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) + * + * new configuration: + * 1. set new LogSize as specified or default from new LVSize; + * 2. compute new FSCKSize from new LVSize; + * 3. 
set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where + * assert(new FSSize >= old FSSize), + * i.e., file system must not be shrunk; + */ +int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipbmap2; + struct inode *ipimap = sbi->ipimap; + struct jfs_log *log = sbi->log; + struct bmap *bmp = sbi->bmap; + s64 newLogAddress, newFSCKAddress; + int newFSCKSize; + s64 newMapSize = 0, mapSize; + s64 XAddress, XSize, nblocks, xoff, xaddr, t64; + s64 oldLVSize; + s64 newFSSize; + s64 VolumeSize; + int newNpages = 0, nPages, newPage, xlen, t32; + int tid; + int log_formatted = 0; + struct inode *iplist[1]; + struct jfs_superblock *j_sb, *j_sb2; + s64 old_agsize; + int agsizechanged = 0; + struct buffer_head *bh, *bh2; + + /* If the volume hasn't grown, get out now */ + + if (sbi->mntflag & JFS_INLINELOG) + oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd); + else + oldLVSize = addressPXD(&sbi->fsckpxd) + + lengthPXD(&sbi->fsckpxd); + + if (oldLVSize >= newLVSize) { + printk(KERN_WARNING + "jfs_extendfs: volume hasn't grown, returning\n"); + goto out; + } + + VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + if (VolumeSize) { + if (newLVSize > VolumeSize) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + } else { + /* check the device */ + bh = sb_bread(sb, newLVSize - 1); + if (!bh) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + bforget(bh); + } + + /* Can't extend write-protected drive */ + + if (isReadOnly(ipbmap)) { + printk(KERN_WARNING "jfs_extendfs: read-only file system\n"); + rc = -EROFS; + goto out; + } + + /* + * reconfigure LV spaces + * --------------------- + * + * validate new size, or, if not specified, determine new size + */ + + /* + * reconfigure inline log space: + */ + if ((sbi->mntflag & JFS_INLINELOG)) { + if (newLogSize == 0) { + /* + * no size specified: default to 1/256 of aggregate + * size; rounded up to a megabyte boundary; + */ + newLogSize = newLVSize >> 8; + t32 = (1 << (20 - sbi->l2bsize)) - 1; + newLogSize = (newLogSize + t32) & ~t32; + newLogSize = + min(newLogSize, MEGABYTE32 >> sbi->l2bsize); + } else { + /* + * convert the newLogSize to fs blocks. + * + * Since this is given in megabytes, it will always be + * an even number of pages. 
+ */ + newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize; + } + + } else + newLogSize = 0; + + newLogAddress = newLVSize - newLogSize; + + /* + * reconfigure fsck work space: + * + * configure it to the end of the logical volume regardless of + * whether file system extends to the end of the aggregate; + * Need enough 4k pages to cover: + * - 1 bit per block in aggregate rounded up to BPERDMAP boundary + * - 1 extra page to handle control page and intermediate level pages + * - 50 extra pages for the chkdsk service log + */ + t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) + << L2BPERDMAP; + t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50; + newFSCKSize = t32 << sbi->l2nbperpage; + newFSCKAddress = newLogAddress - newFSCKSize; + + /* + * compute new file system space; + */ + newFSSize = newLVSize - newLogSize - newFSCKSize; + + /* file system cannot be shrunk */ + if (newFSSize < bmp->db_mapsize) { + rc = -EINVAL; + goto out; + } + + /* + * If we're expanding enough that the inline log does not overlap + * the old one, we can format the new log before we quiesce the + * filesystem. + */ + if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) { + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto out; + log_formatted = 1; + } + /* + * quiesce file system + * + * (prepare to move the inline log and to prevent map update) + * + * block any new transactions and wait for completion of + * all wip transactions and flush modified pages s.t. + * on-disk file system is in consistent state and + * log is not required for recovery. + */ + txQuiesce(sb); + + /* Reset size of direct inode */ + sbi->direct_inode->i_size = sb->s_bdev->bd_inode->i_size; + + if (sbi->mntflag & JFS_INLINELOG) { + /* + * deactivate old inline log + */ + lmLogShutdown(log); + + /* + * mark on-disk super block for fs in transition; + * + * update on-disk superblock for the new space configuration + * of inline log space and fsck work space descriptors: + * N.B. 
FS descriptor is NOT updated; + * + * crash recovery: + * logredo(): if FM_EXTENDFS, return to fsck() for cleanup; + * fsck(): if FM_EXTENDFS, reformat inline log and fsck + * workspace from superblock inline log descriptor and fsck + * workspace descriptor; + */ + + /* read in superblock */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() in progress */ + j_sb->s_state |= cpu_to_le32(FM_EXTENDFS); + j_sb->s_xsize = cpu_to_le64(newFSSize); + PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress); + PXDlength(&j_sb->s_xfsckpxd, newFSCKSize); + PXDaddress(&j_sb->s_xlogpxd, newLogAddress); + PXDlength(&j_sb->s_xlogpxd, newLogSize); + + /* synchronously update superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + /* + * format new inline log synchronously; + * + * crash recovery: if log move in progress, + * reformat log and exit success; + */ + if (!log_formatted) + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto error_out; + + /* + * activate new log + */ + log->base = newLogAddress; + log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits); + if ((rc = lmLogInit(log))) + goto error_out; + } + + /* + * extend block allocation map + * --------------------------- + * + * extendfs() for new extension, retry after crash recovery; + * + * note: both logredo() and fsck() rebuild map from + * the bitmap and configuration parameter from superblock + * (disregarding all other control information in the map); + * + * superblock: + * s_size: aggregate size in physical blocks; + */ + /* + * compute the new block allocation map configuration + * + * map dinode: + * di_size: map file size in byte; + * di_nblocks: number of blocks allocated for map file; + * di_mapsize: number of blocks in aggregate (covered by map); + * map control page: + * db_mapsize: number of blocks in aggregate (covered by map); + */ + newMapSize = newFSSize; + /* number of data pages of new bmap file: + * roundup new size to full dmap page boundary and + * add 1 extra dmap page for next extendfs() + */ + t64 = (newMapSize - 1) + BPERDMAP; + newNpages = BLKTODMAPN(t64) + 1; + + /* + * extend map from current map (WITHOUT growing mapfile) + * + * map new extension with unmapped part of the last partial + * dmap page, if applicable, and extra page(s) allocated + * at end of bmap by mkfs() or previous extendfs(); + */ + extendBmap: + /* compute number of blocks requested to extend */ + mapSize = bmp->db_mapsize; + XAddress = mapSize; /* eXtension Address */ + XSize = newMapSize - mapSize; /* eXtension Size */ + old_agsize = bmp->db_agsize; /* We need to know if this changes */ + + /* compute number of blocks that can be extended by current mapfile */ + t64 = dbMapFileSizeToMapSize(ipbmap); + if (mapSize > t64) { + printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n", + (long long) mapSize, (long long) t64); + rc = -EIO; + goto error_out; + } + nblocks = min(t64 - mapSize, XSize); + + /* + * update map pages for new extension: + * + * update/init dmap and bubble up the control hierarchy + * incrementally fold up dmaps into upper levels; + * update bmap control page; + */ + if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) + goto error_out; + + agsizechanged |= (bmp->db_agsize != old_agsize); + + /* + * the map now has extended to cover additional nblocks: + * dn_mapsize = oldMapsize + nblocks; + */ + /* ipbmap->i_mapsize += nblocks; */ + XSize -= nblocks; + + /* + * grow map file to cover remaining extension + * 
and/or one extra dmap page for next extendfs(); + * + * allocate new map pages and its backing blocks, and + * update map file xtree + */ + /* compute number of data pages of current bmap file */ + nPages = ipbmap->i_size >> L2PSIZE; + + /* need to grow map file ? */ + if (nPages == newNpages) + goto finalizeBmap; + + /* + * grow bmap file for the new map pages required: + * + * allocate growth at the start of newly extended region; + * bmap file only grows sequentially, i.e., both data pages + * and possibly xtree index pages may grow in append mode, + * s.t. logredo() can reconstruct pre-extension state + * by washing away bmap file of pages outside s_size boundary; + */ + /* + * journal map file growth as if a regular file growth: + * (note: bmap is created with di_mode = IFJOURNAL|IFREG); + * + * journaling of bmap file growth is not required since + * logredo() do/can not use log records of bmap file growth + * but it provides careful write semantics, pmap update, etc.; + */ + /* synchronous write of data pages: bmap data pages are + * cached in meta-data cache, and not written out + * by txCommit(); + */ + filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); + diWriteSpecial(ipbmap, 0); + + newPage = nPages; /* first new page number */ + xoff = newPage << sbi->l2nbperpage; + xlen = (newNpages - nPages) << sbi->l2nbperpage; + xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1); + xaddr = XAddress; + + tid = txBegin(sb, COMMIT_FORCE); + + if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) { + txEnd(tid); + goto error_out; + } + /* update bmap file size */ + ipbmap->i_size += xlen << sbi->l2bsize; + inode_add_bytes(ipbmap, xlen << sbi->l2bsize); + + iplist[0] = ipbmap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + + if (rc) + goto error_out; + + /* + * map file has been grown now to cover extension to further out; + * di_size = new map file size; + * + * if huge extension, the previous extension based on previous + * map file size may not have been sufficient to cover whole extension + * (it could have been used up for new map pages), + * but the newly grown map file now covers lot bigger new free space + * available for further extension of map; + */ + /* any more blocks to extend ? */ + if (XSize) + goto extendBmap; + + finalizeBmap: + /* finalize bmap */ + dbFinalizeBmap(ipbmap); + + /* + * update inode allocation map + * --------------------------- + * + * move iag lists from old to new iag; + * agstart field is not updated for logredo() to reconstruct + * iag lists if system crash occurs. + * (computation of ag number from agstart based on agsize + * will correctly identify the new ag); + */ + /* if new AG size the same as old AG size, done! 
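+	 * Otherwise the inode allocation map is brought in line with the
+	 * new AG geometry by diExtendFS() and synced to disk.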
*/ + if (agsizechanged) { + if ((rc = diExtendFS(ipimap, ipbmap))) + goto error_out; + + /* finalize imap */ + if ((rc = diSync(ipimap))) + goto error_out; + } + + /* + * finalize + * -------- + * + * extension is committed when on-disk super block is + * updated with new descriptors: logredo will recover + * crash before it to pre-extension state; + */ + + /* sync log to skip log replay of bmap file growth transaction; */ + /* lmLogSync(log, 1); */ + + /* + * synchronous write bmap global control page; + * for crash before completion of write + * logredo() will recover to pre-extendfs state; + * for crash after completion of write, + * logredo() will recover post-extendfs state; + */ + if ((rc = dbSync(ipbmap))) + goto error_out; + + /* + * copy primary bmap inode to secondary bmap inode + */ + + ipbmap2 = diReadSpecial(sb, BMAP_I, 1); + if (ipbmap2 == NULL) { + printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n"); + goto error_out; + } + memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288); + ipbmap2->i_size = ipbmap->i_size; + ipbmap2->i_blocks = ipbmap->i_blocks; + + diWriteSpecial(ipbmap2, 1); + diFreeSpecial(ipbmap2); + + /* + * update superblock + */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() completion */ + j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS); + j_sb->s_size = cpu_to_le64(bmp->db_mapsize << + le16_to_cpu(j_sb->s_l2bfactor)); + j_sb->s_agsize = cpu_to_le32(bmp->db_agsize); + + /* update inline log space descriptor */ + if (sbi->mntflag & JFS_INLINELOG) { + PXDaddress(&(j_sb->s_logpxd), newLogAddress); + PXDlength(&(j_sb->s_logpxd), newLogSize); + } + + /* record log's mount serial number */ + j_sb->s_logserial = cpu_to_le32(log->serial); + + /* update fsck work space descriptor */ + PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress); + PXDlength(&(j_sb->s_fsckpxd), newFSCKSize); + j_sb->s_fscklog = 1; + /* sb->s_fsckloglen remains the same */ + + /* Update secondary superblock */ + bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (bh2) { + j_sb2 = (struct jfs_superblock *)bh2->b_data; + memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock)); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh2); + brelse(bh2); + } + + /* write primary superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + goto resume; + + error_out: + jfs_error(sb, "jfs_extendfs"); + + resume: + /* + * resume file system transactions + */ + txResume(sb); + + out: + return rc; +} diff --git a/fs/jfs/super.c b/fs/jfs/super.c new file mode 100644 index 00000000..06c8a67c --- /dev/null +++ b/fs/jfs/super.c @@ -0,0 +1,901 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_inode.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); +MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache * jfs_inode_cachep; + +static const struct super_operations jfs_super_operations; +static const struct export_operations jfs_export_operations; +static struct file_system_type jfs_fs_type; + +#define MAX_COMMIT_THREADS 64 +static int commit_threads = 0; +module_param(commit_threads, int, 0); +MODULE_PARM_DESC(commit_threads, "Number of commit threads"); + +static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; +struct task_struct *jfsIOthread; +struct task_struct *jfsSyncThread; + +#ifdef CONFIG_JFS_DEBUG +int jfsloglevel = JFS_LOGLEVEL_WARN; +module_param(jfsloglevel, int, 0644); +MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); +#endif + +static void jfs_handle_error(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sb->s_flags & MS_RDONLY) + return; + + updateSuper(sb, FM_DIRTY); + + if (sbi->flag & JFS_ERR_PANIC) + panic("JFS (device %s): panic forced after error\n", + sb->s_id); + else if (sbi->flag & JFS_ERR_REMOUNT_RO) { + jfs_err("ERROR: (device %s): remounting filesystem " + "as read-only\n", + sb->s_id); + sb->s_flags |= MS_RDONLY; + } + + /* nothing is done for continue beyond marking the superblock dirty */ +} + +void jfs_error(struct super_block *sb, const char * function, ...) 
+{ + static char error_buf[256]; + va_list args; + + va_start(args, function); + vsnprintf(error_buf, sizeof(error_buf), function, args); + va_end(args); + + printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf); + + jfs_handle_error(sb); +} + +static struct inode *jfs_alloc_inode(struct super_block *sb) +{ + struct jfs_inode_info *jfs_inode; + + jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + if (!jfs_inode) + return NULL; + return &jfs_inode->vfs_inode; +} + +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jfs_inode_cachep, ji); +} + +static void jfs_destroy_inode(struct inode *inode) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + BUG_ON(!list_empty(&ji->anon_inode_list)); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + call_rcu(&inode->i_rcu, jfs_i_callback); +} + +static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); + s64 maxinodes; + struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; + + jfs_info("In jfs_statfs"); + buf->f_type = JFS_SUPER_MAGIC; + buf->f_bsize = sbi->bsize; + buf->f_blocks = sbi->bmap->db_mapsize; + buf->f_bfree = sbi->bmap->db_nfree; + buf->f_bavail = sbi->bmap->db_nfree; + /* + * If we really return the number of allocated & free inodes, some + * applications will fail because they won't see enough free inodes. + * We'll try to calculate some guess as to how may inodes we can + * really allocate + * + * buf->f_files = atomic_read(&imap->im_numinos); + * buf->f_ffree = atomic_read(&imap->im_numfree); + */ + maxinodes = min((s64) atomic_read(&imap->im_numinos) + + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) + << L2INOSPEREXT), (s64) 0xffffffffLL); + buf->f_files = maxinodes; + buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - + atomic_read(&imap->im_numfree)); + buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2, + sizeof(sbi->uuid)/2); + + buf->f_namelen = JFS_NAME_MAX; + return 0; +} + +static void jfs_put_super(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + jfs_info("In jfs_put_super"); + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + rc = jfs_umount(sb); + if (rc) + jfs_err("jfs_umount failed with return code %d", rc); + + unload_nls(sbi->nls_tab); + + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + iput(sbi->direct_inode); + + kfree(sbi); +} + +enum { + Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, + Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask +}; + +static const match_table_t tokens = { + {Opt_integrity, "integrity"}, + {Opt_nointegrity, "nointegrity"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_resize, "resize=%u"}, + {Opt_resize_nosize, "resize"}, + {Opt_errors, "errors=%s"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, + int 
*flag) +{ + void *nls_map = (void *)-1; /* -1: no change; NULL: none */ + char *p; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + *newLVSize = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_integrity: + *flag &= ~JFS_NOINTEGRITY; + break; + case Opt_nointegrity: + *flag |= JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + if (!strcmp(args[0].from, "none")) + nls_map = NULL; + else { + nls_map = load_nls(args[0].from); + if (!nls_map) { + printk(KERN_ERR + "JFS: charset not found\n"); + goto cleanup; + } + } + break; + case Opt_resize: + { + char *resize = args[0].from; + *newLVSize = simple_strtoull(resize, &resize, 0); + break; + } + case Opt_resize_nosize: + { + *newLVSize = sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits; + if (*newLVSize == 0) + printk(KERN_ERR + "JFS: Cannot determine volume size\n"); + break; + } + case Opt_errors: + { + char *errors = args[0].from; + if (!errors || !*errors) + goto cleanup; + if (!strcmp(errors, "continue")) { + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_CONTINUE; + } else if (!strcmp(errors, "remount-ro")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_REMOUNT_RO; + } else if (!strcmp(errors, "panic")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag |= JFS_ERR_PANIC; + } else { + printk(KERN_ERR + "JFS: %s is an invalid error handler\n", + errors); + goto cleanup; + } + break; + } + +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + *flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + *flag |= JFS_GRPQUOTA; + break; +#else + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + printk(KERN_ERR + "JFS: quota operations not supported\n"); + break; +#endif + case Opt_uid: + { + char *uid = args[0].from; + sbi->uid = simple_strtoul(uid, &uid, 0); + break; + } + case Opt_gid: + { + char *gid = args[0].from; + sbi->gid = simple_strtoul(gid, &gid, 0); + break; + } + case Opt_umask: + { + char *umask = args[0].from; + sbi->umask = simple_strtoul(umask, &umask, 8); + if (sbi->umask & ~0777) { + printk(KERN_ERR + "JFS: Invalid value of umask\n"); + goto cleanup; + } + break; + } + default: + printk("jfs: Unrecognized mount option \"%s\" " + " or missing value\n", p); + goto cleanup; + } + } + + if (nls_map != (void *) -1) { + /* Discard old (if remount) */ + unload_nls(sbi->nls_tab); + sbi->nls_tab = nls_map; + } + return 1; + +cleanup: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + return 0; +} + +static int jfs_remount(struct super_block *sb, int *flags, char *data) +{ + s64 newLVSize = 0; + int rc = 0; + int flag = JFS_SBI(sb)->flag; + int ret; + + if (!parse_options(data, sb, &newLVSize, &flag)) { + return -EINVAL; + } + + if (newLVSize) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR + "JFS: resize requires volume to be mounted read-write\n"); + return -EROFS; + } + rc = jfs_extendfs(sb, newLVSize, 0); + if (rc) + return rc; + } + + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + /* + * Invalidate any previously read metadata. 
fsck may have + * changed the on-disk data since we mounted r/o + */ + truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + + /* mark the fs r/w for quota activity */ + sb->s_flags &= ~MS_RDONLY; + + dquot_resume(sb, -1); + return ret; + } + if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { + rc = dquot_suspend(sb, -1); + if (rc < 0) { + return rc; + } + rc = jfs_umount_rw(sb); + JFS_SBI(sb)->flag = flag; + return rc; + } + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if (!(sb->s_flags & MS_RDONLY)) { + rc = jfs_umount_rw(sb); + if (rc) + return rc; + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + return ret; + } + JFS_SBI(sb)->flag = flag; + + return 0; +} + +static int jfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jfs_sb_info *sbi; + struct inode *inode; + int rc; + s64 newLVSize = 0; + int flag, ret = -EINVAL; + + jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); + + if (!new_valid_dev(sb->s_bdev->bd_dev)) + return -EOVERFLOW; + + sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + sbi->sb = sb; + sbi->uid = sbi->gid = sbi->umask = -1; + + /* initialize the mount flag and determine the default error handler */ + flag = JFS_ERR_REMOUNT_RO; + + if (!parse_options((char *) data, sb, &newLVSize, &flag)) + goto out_kfree; + sbi->flag = flag; + +#ifdef CONFIG_JFS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + + if (newLVSize) { + printk(KERN_ERR "resize option for remount only\n"); + goto out_kfree; + } + + /* + * Initialize blocksize to 4K. + */ + sb_set_blocksize(sb, PSIZE); + + /* + * Set method vectors. + */ + sb->s_op = &jfs_super_operations; + sb->s_export_op = &jfs_export_operations; +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif + + /* + * Initialize direct-mapping inode/address-space + */ + inode = new_inode(sb); + if (inode == NULL) { + ret = -ENOMEM; + goto out_unload; + } + inode->i_ino = 0; + inode->i_nlink = 1; + inode->i_size = sb->s_bdev->bd_inode->i_size; + inode->i_mapping->a_ops = &jfs_metapage_aops; + insert_inode_hash(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + sbi->direct_inode = inode; + + rc = jfs_mount(sb); + if (rc) { + if (!silent) { + jfs_err("jfs_mount failed w/return code = %d", rc); + } + goto out_mount_failed; + } + if (sb->s_flags & MS_RDONLY) + sbi->log = NULL; + else { + rc = jfs_mount_rw(sb, 0); + if (rc) { + if (!silent) { + jfs_err("jfs_mount_rw failed, return code = %d", + rc); + } + goto out_no_rw; + } + } + + sb->s_magic = JFS_SUPER_MAGIC; + + if (sbi->mntflag & JFS_OS2) + sb->s_d_op = &jfs_ci_dentry_operations; + + inode = jfs_iget(sb, ROOT_I); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out_no_rw; + } + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) + goto out_no_root; + + /* logical blocks are represented by 40 bits in pxd_t, etc. */ + sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; +#if BITS_PER_LONG == 32 + /* + * Page cache is indexed by long. 
+ * I would use MAX_LFS_FILESIZE, but it's only half as big + */ + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); +#endif + sb->s_time_gran = 1; + return 0; + +out_no_root: + jfs_err("jfs_read_super: get root dentry failed"); + iput(inode); + +out_no_rw: + rc = jfs_umount(sb); + if (rc) { + jfs_err("jfs_umount failed with return code %d", rc); + } +out_mount_failed: + filemap_write_and_wait(sbi->direct_inode->i_mapping); + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + make_bad_inode(sbi->direct_inode); + iput(sbi->direct_inode); + sbi->direct_inode = NULL; +out_unload: + if (sbi->nls_tab) + unload_nls(sbi->nls_tab); +out_kfree: + kfree(sbi); + return ret; +} + +static int jfs_freeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + + if (!(sb->s_flags & MS_RDONLY)) { + txQuiesce(sb); + lmLogShutdown(log); + updateSuper(sb, FM_CLEAN); + } + return 0; +} + +static int jfs_unfreeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + int rc = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + updateSuper(sb, FM_MOUNT); + if ((rc = lmLogInit(log))) + jfs_err("jfs_unlock failed with return code %d", rc); + else + txResume(sb); + } + return 0; +} + +static struct dentry *jfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); +} + +static int jfs_sync_fs(struct super_block *sb, int wait) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + /* log == NULL indicates read-only mount */ + if (log) { + jfs_flush_journal(log, wait); + jfs_syncpt(log, 0); + } + + return 0; +} + +static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb); + + if (sbi->uid != -1) + seq_printf(seq, ",uid=%d", sbi->uid); + if (sbi->gid != -1) + seq_printf(seq, ",gid=%d", sbi->gid); + if (sbi->umask != -1) + seq_printf(seq, ",umask=%03o", sbi->umask); + if (sbi->flag & JFS_NOINTEGRITY) + seq_puts(seq, ",nointegrity"); + if (sbi->nls_tab) + seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); + if (sbi->flag & JFS_ERR_CONTINUE) + seq_printf(seq, ",errors=continue"); + if (sbi->flag & JFS_ERR_PANIC) + seq_printf(seq, ",errors=panic"); + +#ifdef CONFIG_QUOTA + if (sbi->flag & JFS_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->flag & JFS_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + + return 0; +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? 
+ sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 0); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t jfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock(&inode->i_mutex); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 1); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static const struct super_operations jfs_super_operations = { + .alloc_inode = jfs_alloc_inode, + .destroy_inode = jfs_destroy_inode, + .dirty_inode = jfs_dirty_inode, + .write_inode = jfs_write_inode, + .evict_inode = jfs_evict_inode, + .put_super = jfs_put_super, + .sync_fs = jfs_sync_fs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, + .statfs = jfs_statfs, + .remount_fs = jfs_remount, + .show_options = jfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = jfs_quota_read, + .quota_write = jfs_quota_write, +#endif +}; + +static const struct export_operations jfs_export_operations = { + .fh_to_dentry = jfs_fh_to_dentry, + .fh_to_parent = jfs_fh_to_parent, + .get_parent = jfs_get_parent, +}; + +static struct file_system_type jfs_fs_type = { + .owner = THIS_MODULE, + .name = "jfs", + .mount = jfs_do_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void init_once(void *foo) +{ + struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; + + memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); + INIT_LIST_HEAD(&jfs_ip->anon_inode_list); + init_rwsem(&jfs_ip->rdwrlock); + mutex_init(&jfs_ip->commit_mutex); + init_rwsem(&jfs_ip->xattr_sem); + spin_lock_init(&jfs_ip->ag_lock); + jfs_ip->active_ag = -1; + inode_init_once(&jfs_ip->vfs_inode); +} + +static int __init init_jfs_fs(void) +{ + int i; + int rc; + + jfs_inode_cachep = + kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (jfs_inode_cachep == NULL) + return -ENOMEM; + + /* + * Metapage initialization + */ + rc = metapage_init(); + if (rc) { + jfs_err("metapage_init failed w/rc = 
%d", rc); + goto free_slab; + } + + /* + * Transaction Manager initialization + */ + rc = txInit(); + if (rc) { + jfs_err("txInit failed w/rc = %d", rc); + goto free_metapage; + } + + /* + * I/O completion thread (endio) + */ + jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); + if (IS_ERR(jfsIOthread)) { + rc = PTR_ERR(jfsIOthread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto end_txmngr; + } + + if (commit_threads < 1) + commit_threads = num_online_cpus(); + if (commit_threads > MAX_COMMIT_THREADS) + commit_threads = MAX_COMMIT_THREADS; + + for (i = 0; i < commit_threads; i++) { + jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); + if (IS_ERR(jfsCommitThread[i])) { + rc = PTR_ERR(jfsCommitThread[i]); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + commit_threads = i; + goto kill_committask; + } + } + + jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); + if (IS_ERR(jfsSyncThread)) { + rc = PTR_ERR(jfsSyncThread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto kill_committask; + } + +#ifdef PROC_FS_JFS + jfs_proc_init(); +#endif + + return register_filesystem(&jfs_fs_type); + +kill_committask: + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsIOthread); +end_txmngr: + txExit(); +free_metapage: + metapage_exit(); +free_slab: + kmem_cache_destroy(jfs_inode_cachep); + return rc; +} + +static void __exit exit_jfs_fs(void) +{ + int i; + + jfs_info("exit_jfs_fs called"); + + txExit(); + metapage_exit(); + + kthread_stop(jfsIOthread); + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsSyncThread); +#ifdef PROC_FS_JFS + jfs_proc_clean(); +#endif + unregister_filesystem(&jfs_fs_type); + kmem_cache_destroy(jfs_inode_cachep); +} + +module_init(init_jfs_fs) +module_exit(exit_jfs_fs) diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c new file mode 100644 index 00000000..205b946d --- /dev/null +++ b/fs/jfs/symlink.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_xattr.h" + +static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = JFS_IP(dentry->d_inode)->i_inline; + nd_set_link(nd, s); + return NULL; +} + +const struct inode_operations jfs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = jfs_follow_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + +const struct inode_operations jfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c new file mode 100644 index 00000000..24838f1e --- /dev/null +++ b/fs/jfs/xattr.c @@ -0,0 +1,1128 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Copyright (C) Christoph Hellwig, 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" +#include "jfs_dinode.h" +#include "jfs_extent.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +/* + * jfs_xattr.c: extended attribute service + * + * Overall design -- + * + * Format: + * + * Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit + * value) and a variable (0 or more) number of extended attribute + * entries. Each extended attribute entry (jfs_ea) is a double + * where is constructed from a null-terminated ascii string + * (1 ... 255 bytes in the name) and is arbitrary 8 bit data + * (1 ... 65535 bytes). The in-memory format is + * + * 0 1 2 4 4 + namelen + 1 + * +-------+--------+--------+----------------+-------------------+ + * | Flags | Name | Value | Name String \0 | Data . . . . | + * | | Length | Length | | | + * +-------+--------+--------+----------------+-------------------+ + * + * A jfs_ea_list then is structured as + * + * 0 4 4 + EA_SIZE(ea1) + * +------------+-------------------+--------------------+----- + * | Overall EA | First FEA Element | Second FEA Element | ..... + * | List Size | | | + * +------------+-------------------+--------------------+----- + * + * On-disk: + * + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. 
An EA list may be in-lined in the inode if there is + * sufficient room available. + */ + +struct ea_buffer { + int flag; /* Indicates what storage xattr points to */ + int max_size; /* largest xattr that fits in current buffer */ + dxd_t new_ea; /* dxd to replace ea when modifying xattr */ + struct metapage *mp; /* metapage containing ea list */ + struct jfs_ea_list *xattr; /* buffer containing ea list */ +}; + +/* + * ea_buffer.flag values + */ +#define EA_INLINE 0x0001 +#define EA_EXTENT 0x0002 +#define EA_NEW 0x0004 +#define EA_MALLOC 0x0008 + + +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +/* + * These three routines are used to recognize on-disk extended attributes + * that are in a recognized namespace. If the attribute is not recognized, + * "os2." is prepended to the name + */ +static int is_os2_xattr(struct jfs_ea *ea) +{ + return !is_known_namespace(ea->name); +} + +static inline int name_size(struct jfs_ea *ea) +{ + if (is_os2_xattr(ea)) + return ea->namelen + XATTR_OS2_PREFIX_LEN; + else + return ea->namelen; +} + +static inline int copy_name(char *buffer, struct jfs_ea *ea) +{ + int len = ea->namelen; + + if (is_os2_xattr(ea)) { + memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN); + buffer += XATTR_OS2_PREFIX_LEN; + len += XATTR_OS2_PREFIX_LEN; + } + memcpy(buffer, ea->name, ea->namelen); + buffer[ea->namelen] = 0; + + return len; +} + +/* Forward references */ +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf); + +/* + * NAME: ea_write_inline + * + * FUNCTION: Attempt to write an EA inline if area is available + * + * PRE CONDITIONS: + * Already verified that the specified EA is small enough to fit inline + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in with necessary EA information + * if we successfully copy the EA inline + * + * NOTES: + * Checks if the inode's inline area is available. If so, copies EA inline + * and sets fields appropriately. Otherwise, returns failure, EA will + * have to be put into an extent. + * + * RETURNS: 0 for successful copy to inline area; -1 if area not available + */ +static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist, + int size, dxd_t * ea) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + + /* + * Make sure we have an EA -- the NULL EA list is valid, but you + * can't copy it! + */ + if (ealist && size > sizeof (struct jfs_ea_list)) { + assert(size <= sizeof (ji->i_inline_ea)); + + /* + * See if the space is available or if it is already being + * used for an inline EA. 
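The layout in the diagram above can be walked with plain pointer arithmetic: each entry occupies its fixed header plus the name (with its terminating NUL) plus the value, and the next entry starts immediately after. A minimal sketch of that traversal, using a hypothetical ea_record struct that mirrors the on-disk record (the real jfs_ea / jfs_ea_list definitions live in jfs_xattr.h and are not part of this file):

/* Field widths follow the diagram above; illustrative only. */
struct ea_record {
	u8	flag;
	u8	namelen;
	__le16	valuelen;
	char	name[0];	/* namelen bytes + NUL, then the value bytes */
};

static inline int ea_record_size(const struct ea_record *ea)
{
	return sizeof(struct ea_record) + ea->namelen + 1 +
		le16_to_cpu(ea->valuelen);
}

/* 'size' is the 32-bit total stored at the start of the list (converted
 * to CPU order); it counts itself, so the first record sits at offset 4.
 */
static void ea_list_walk(void *list, u32 size)
{
	char *end = (char *)list + size;
	struct ea_record *ea = (struct ea_record *)((char *)list + 4);

	while ((char *)ea < end) {
		/* ea->name and its value are valid here */
		ea = (struct ea_record *)((char *)ea + ea_record_size(ea));
	}
}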
+ */ + if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE)) + return -EPERM; + + DXDsize(ea, size); + DXDlength(ea, 0); + DXDaddress(ea, 0); + memcpy(ji->i_inline_ea, ealist, size); + ea->flag = DXD_INLINE; + ji->mode2 &= ~INLINEEA; + } else { + ea->flag = 0; + DXDsize(ea, 0); + DXDlength(ea, 0); + DXDaddress(ea, 0); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + + return 0; +} + +/* + * NAME: ea_write + * + * FUNCTION: Write an EA for an inode + * + * PRE CONDITIONS: EA has been verified + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in appropriately with where the + * EA was copied + * + * NOTES: Will write EA inline if able to, otherwise allocates blocks for an + * extent and synchronously writes it to those blocks. + * + * RETURNS: 0 for success; Anything else indicates failure + */ +static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, + dxd_t * ea) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + int rc = 0, i; + char *cp; + s32 nbytes, nb; + s32 bytes_to_write; + struct metapage *mp; + + /* + * Quick check to see if this is an in-linable EA. Short EAs + * and empty EAs are all in-linable, provided the space exists. + */ + if (!ealist || size <= sizeof (ji->i_inline_ea)) { + if (!ea_write_inline(ip, ealist, size, ea)) + return 0; + } + + /* figure out how many blocks we need */ + nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; + + /* Allocate new blocks to quota. */ + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; + + rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); + if (rc) { + /*Rollback quota allocation. */ + dquot_free_block(ip, nblocks); + return rc; + } + + /* + * Now have nblocks worth of storage to stuff into the FEALIST. + * loop over the FEALIST copying data into the buffer one page at + * a time. + */ + cp = (char *) ealist; + nbytes = size; + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_write = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) { + rc = -EIO; + goto failed; + } + + memcpy(mp->data, cp, nb); + + /* + * We really need a way to propagate errors for + * forced writes like this one. --hch + * + * (__write_metapage => release_metapage => flush_metapage) + */ +#ifdef _JFS_FIXME + if ((rc = flush_metapage(mp))) { + /* + * the write failed -- this means that the buffer + * is still assigned and the blocks are not being + * used. this seems like the best error recovery + * we can get ... + */ + goto failed; + } +#else + flush_metapage(mp); +#endif + + cp += PSIZE; + nbytes -= nb; + } + + ea->flag = DXD_EXTENT; + DXDsize(ea, le32_to_cpu(ealist->size)); + DXDlength(ea, nblocks); + DXDaddress(ea, blkno); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + + return 0; + + failed: + /* Rollback quota allocation. 
*/ + dquot_free_block(ip, nblocks); + + dbFree(ip, blkno, nblocks); + return rc; +} + +/* + * NAME: ea_read_inline + * + * FUNCTION: Read an inlined EA into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * RETURNS: 0 + */ +static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + int ea_size = sizeDXD(&ji->ea); + + if (ea_size == 0) { + ealist->size = 0; + return 0; + } + + /* Sanity Check */ + if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea))) + return -EIO; + if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size) + != ea_size) + return -EIO; + + memcpy(ealist, ji->i_inline_ea, ea_size); + return 0; +} + +/* + * NAME: ea_read + * + * FUNCTION: copy EA data into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * NOTES: If EA is inline calls ea_read_inline() to copy EA. + * + * RETURNS: 0 for success; other indicates failure + */ +static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + char *cp = (char *) ealist; + int i; + int nbytes, nb; + s32 bytes_to_read; + struct metapage *mp; + + /* quick check for in-line EA */ + if (ji->ea.flag & DXD_INLINE) + return ea_read_inline(ip, ealist); + + nbytes = sizeDXD(&ji->ea); + if (!nbytes) { + jfs_error(sb, "ea_read: nbytes is 0"); + return -EIO; + } + + /* + * Figure out how many blocks were allocated when this EA list was + * originally written to disk. + */ + nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage; + blkno = addressDXD(&ji->ea) << sbi->l2nbperpage; + + /* + * I have found the disk blocks which were originally used to store + * the FEALIST. now i loop over each contiguous block copying the + * data into the buffer. + */ + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_read = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1))) + return -EIO; + + memcpy(cp, mp->data, nb); + release_metapage(mp); + + cp += PSIZE; + nbytes -= nb; + } + + return 0; +} + +/* + * NAME: ea_get + * + * FUNCTION: Returns buffer containing existing extended attributes. + * The size of the buffer will be the larger of the existing + * attributes size, or min_size. 
+ * + * The buffer, which may be inlined in the inode or in the + * page cache must be release by calling ea_release or ea_put + * + * PARAMETERS: + * inode - Inode pointer + * ea_buf - Structure to be populated with ealist and its metadata + * min_size- minimum size of buffer to be returned + * + * RETURNS: 0 for success; Other indicates failure + */ +static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + struct super_block *sb = inode->i_sb; + int size; + int ea_size = sizeDXD(&ji->ea); + int blocks_needed, current_blocks; + s64 blkno; + int rc; + int quota_allocation = 0; + + /* When fsck.jfs clears a bad ea, it doesn't clear the size */ + if (ji->ea.flag == 0) + ea_size = 0; + + if (ea_size == 0) { + if (min_size == 0) { + ea_buf->flag = 0; + ea_buf->max_size = 0; + ea_buf->xattr = NULL; + return 0; + } + if ((min_size <= sizeof (ji->i_inline_ea)) && + (ji->mode2 & INLINEEA)) { + ea_buf->flag = EA_INLINE | EA_NEW; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + DXDlength(&ea_buf->new_ea, 0); + DXDaddress(&ea_buf->new_ea, 0); + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, min_size); + return 0; + } + current_blocks = 0; + } else if (ji->ea.flag & DXD_INLINE) { + if (min_size <= sizeof (ji->i_inline_ea)) { + ea_buf->flag = EA_INLINE; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + goto size_check; + } + current_blocks = 0; + } else { + if (!(ji->ea.flag & DXD_EXTENT)) { + jfs_error(sb, "ea_get: invalid ea.flag)"); + return -EIO; + } + current_blocks = (ea_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + } + size = max(min_size, ea_size); + + if (size > PSIZE) { + /* + * To keep the rest of the code simple. Allocate a + * contiguous buffer to work with + */ + ea_buf->xattr = kmalloc(size, GFP_KERNEL); + if (ea_buf->xattr == NULL) + return -ENOMEM; + + ea_buf->flag = EA_MALLOC; + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + if (ea_size == 0) + return 0; + + if ((rc = ea_read(inode, ea_buf->xattr))) { + kfree(ea_buf->xattr); + ea_buf->xattr = NULL; + return rc; + } + goto size_check; + } + blocks_needed = (min_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + if (blocks_needed > current_blocks) { + /* Allocate new blocks to quota. 
*/ + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) + return -EDQUOT; + + quota_allocation = blocks_needed; + + rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed, + &blkno); + if (rc) + goto clean_up; + + DXDlength(&ea_buf->new_ea, blocks_needed); + DXDaddress(&ea_buf->new_ea, blkno); + ea_buf->new_ea.flag = DXD_EXTENT; + DXDsize(&ea_buf->new_ea, min_size); + + ea_buf->flag = EA_EXTENT | EA_NEW; + + ea_buf->mp = get_metapage(inode, blkno, + blocks_needed << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + dbFree(inode, blkno, (s64) blocks_needed); + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (min_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + if (ea_size == 0) + return 0; + if ((rc = ea_read(inode, ea_buf->xattr))) { + discard_metapage(ea_buf->mp); + dbFree(inode, blkno, (s64) blocks_needed); + goto clean_up; + } + goto size_check; + } + ea_buf->flag = EA_EXTENT; + ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea), + lengthDXD(&ji->ea) << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (ea_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + size_check: + if (EALIST_SIZE(ea_buf->xattr) != ea_size) { + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, ea_size, 1); + ea_release(inode, ea_buf); + rc = -EIO; + goto clean_up; + } + + return ea_size; + + clean_up: + /* Rollback quota allocation */ + if (quota_allocation) + dquot_free_block(inode, quota_allocation); + + return (rc); +} + +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf) +{ + if (ea_buf->flag & EA_MALLOC) + kfree(ea_buf->xattr); + else if (ea_buf->flag & EA_EXTENT) { + assert(ea_buf->mp); + release_metapage(ea_buf->mp); + + if (ea_buf->flag & EA_NEW) + dbFree(inode, addressDXD(&ea_buf->new_ea), + lengthDXD(&ea_buf->new_ea)); + } +} + +static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, + int new_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + unsigned long old_blocks, new_blocks; + int rc = 0; + + if (new_size == 0) { + ea_release(inode, ea_buf); + ea_buf = NULL; + } else if (ea_buf->flag & EA_INLINE) { + assert(new_size <= sizeof (ji->i_inline_ea)); + ji->mode2 &= ~INLINEEA; + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, new_size); + DXDaddress(&ea_buf->new_ea, 0); + DXDlength(&ea_buf->new_ea, 0); + } else if (ea_buf->flag & EA_MALLOC) { + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + kfree(ea_buf->xattr); + } else if (ea_buf->flag & EA_NEW) { + /* We have already allocated a new dxd */ + flush_metapage(ea_buf->mp); + } else { + /* ->xattr must point to original ea's metapage */ + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + discard_metapage(ea_buf->mp); + } + if (rc) + return rc; + + old_blocks = new_blocks = 0; + + if (ji->ea.flag & DXD_EXTENT) { + invalidate_dxd_metapages(inode, ji->ea); + old_blocks = lengthDXD(&ji->ea); + } + + if (ea_buf) { + txEA(tid, inode, &ji->ea, &ea_buf->new_ea); + if (ea_buf->new_ea.flag & DXD_EXTENT) { + new_blocks = lengthDXD(&ea_buf->new_ea); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + ji->ea = ea_buf->new_ea; + } else { + txEA(tid, inode, &ji->ea, NULL); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + ji->ea.flag = 0; + ji->ea.size = 0; + } + + /* If old blocks exist, they must be removed 
from quota allocation. */ + if (old_blocks) + dquot_free_block(inode, old_blocks); + + inode->i_ctime = CURRENT_TIME; + + return 0; +} + +/* + * can_set_system_xattr + * + * This code is specific to the system.* namespace. It contains policy + * which doesn't belong in the main xattr codepath. + */ +static int can_set_system_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ +#ifdef CONFIG_JFS_POSIX_ACL + struct posix_acl *acl; + int rc; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* + * POSIX_ACL_XATTR_ACCESS is tied to i_mode + */ + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { + acl = posix_acl_from_xattr(value, value_len); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + printk(KERN_ERR "posix_acl_from_xattr returned %d\n", + rc); + return rc; + } + if (acl) { + mode_t mode = inode->i_mode; + rc = posix_acl_equiv_mode(acl, &mode); + posix_acl_release(acl); + if (rc < 0) { + printk(KERN_ERR + "posix_acl_equiv_mode returned %d\n", + rc); + return rc; + } + inode->i_mode = mode; + mark_inode_dirty(inode); + } + /* + * We're changing the ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_ACCESS); + + return 0; + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { + acl = posix_acl_from_xattr(value, value_len); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + printk(KERN_ERR "posix_acl_from_xattr returned %d\n", + rc); + return rc; + } + posix_acl_release(acl); + + /* + * We're changing the default ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + return 0; + } +#endif /* CONFIG_JFS_POSIX_ACL */ + return -EOPNOTSUPP; +} + +/* + * Most of the permission checking is done by xattr_permission in the vfs. + * The local file system is responsible for handling the system.* namespace. + * We also need to verify that this is a namespace that we recognize. + */ +static int can_set_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return can_set_system_xattr(inode, name, value, value_len); + + if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "os2." + */ + if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) + return -EOPNOTSUPP; + return 0; + } + + /* + * Don't allow setting an attribute in an unknown namespace. 
+ */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; +} + +int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, + const void *value, size_t value_len, int flags) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL; + struct ea_buffer ea_buf; + int old_ea_size = 0; + int xattr_size; + int new_size; + int namelen = strlen(name); + char *os2name = NULL; + int found = 0; + int rc; + int length; + + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, + GFP_KERNEL); + if (!os2name) + return -ENOMEM; + strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); + name = os2name; + namelen -= XATTR_OS2_PREFIX_LEN; + } + + down_write(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + + again: + ealist = (struct jfs_ea_list *) ea_buf.xattr; + new_size = sizeof (struct jfs_ea_list); + + if (xattr_size) { + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); + ea = NEXT_EA(ea)) { + if ((namelen == ea->namelen) && + (memcmp(name, ea->name, namelen) == 0)) { + found = 1; + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto release; + } + old_ea = ea; + old_ea_size = EA_SIZE(ea); + next_ea = NEXT_EA(ea); + } else + new_size += EA_SIZE(ea); + } + } + + if (!found) { + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto release; + } + if (value == NULL) { + rc = 0; + goto release; + } + } + if (value) + new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len; + + if (new_size > ea_buf.max_size) { + /* + * We need to allocate more space for merged ea list. + * We should only have loop to again: once. 
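From user space, the namespace policy above is visible through the standard xattr syscalls: user.*, trusted.* and security.* names pass straight through, while names carrying the "os2." prefix are stored without the prefix and are rejected if the remainder would alias a known namespace. A hedged example using the glibc wrappers (the path is invented):

#include <sys/xattr.h>
#include <stdio.h>

int main(void)
{
	const char *path = "/mnt/jfs/file";	/* hypothetical JFS mount */

	/* Stored as an ordinary user-namespace attribute. */
	if (setxattr(path, "user.comment", "hello", 5, 0) != 0)
		perror("setxattr user.comment");

	/* Stored on disk without the "os2." prefix (legacy OS/2 EA). */
	if (setxattr(path, "os2.LONGNAME", "file.txt", 8, 0) != 0)
		perror("setxattr os2.LONGNAME");

	/* Rejected with EOPNOTSUPP: would alias the user.* namespace. */
	if (setxattr(path, "os2.user.comment", "x", 1, 0) != 0)
		perror("setxattr os2.user.comment");

	return 0;
}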
+ */ + ea_release(inode, &ea_buf); + xattr_size = ea_get(inode, &ea_buf, new_size); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + goto again; + } + + /* Remove old ea of the same name */ + if (found) { + /* number of bytes following target EA */ + length = (char *) END_EALIST(ealist) - (char *) next_ea; + if (length > 0) + memmove(old_ea, next_ea, length); + xattr_size -= old_ea_size; + } + + /* Add new entry to the end */ + if (value) { + if (xattr_size == 0) + /* Completely new ea list */ + xattr_size = sizeof (struct jfs_ea_list); + + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); + ea->flag = 0; + ea->namelen = namelen; + ea->valuelen = (cpu_to_le16(value_len)); + memcpy(ea->name, name, namelen); + ea->name[namelen] = 0; + if (value_len) + memcpy(&ea->name[namelen + 1], value, value_len); + xattr_size += EA_SIZE(ea); + } + + /* DEBUG - If we did this right, these number match */ + if (xattr_size != new_size) { + printk(KERN_ERR + "jfs_xsetattr: xattr_size = %d, new_size = %d\n", + xattr_size, new_size); + + rc = -EINVAL; + goto release; + } + + /* + * If we're left with an empty list, there's no ea + */ + if (new_size == sizeof (struct jfs_ea_list)) + new_size = 0; + + ealist->size = cpu_to_le32(new_size); + + rc = ea_put(tid, inode, &ea_buf, new_size); + + goto out; + release: + ea_release(inode, &ea_buf); + out: + up_write(&JFS_IP(inode)->xattr_sem); + + kfree(os2name); + + return rc; +} + +int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t value_len, int flags) +{ + struct inode *inode = dentry->d_inode; + struct jfs_inode_info *ji = JFS_IP(inode); + int rc; + tid_t tid; + + if ((rc = can_set_xattr(inode, name, value, value_len))) + return rc; + + if (value == NULL) { /* empty EA, do not remove */ + value = ""; + value_len = 0; + } + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len, + flags); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, + size_t buf_size) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + int xattr_size; + ssize_t size; + int namelen = strlen(name); + char *value; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto not_found; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* Find the named attribute */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) + if ((namelen == ea->namelen) && + memcmp(name, ea->name, namelen) == 0) { + /* Found it */ + size = le16_to_cpu(ea->valuelen); + if (!data) + goto release; + else if (size > buf_size) { + size = -ERANGE; + goto release; + } + value = ((char *) &ea->name) + ea->namelen + 1; + memcpy(data, value, size); + goto release; + } + not_found: + size = -ENODATA; + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + + return size; +} + +ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, + size_t buf_size) +{ + int err; + + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + /* + * skip past "os2." prefix + */ + name += XATTR_OS2_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "os2." 
+ */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + + err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); + + return err; +} + +/* + * No special permissions are needed to list attributes except for trusted.* + */ +static inline int can_list(struct jfs_ea *ea) +{ + return (strncmp(ea->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN)); +} + +ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) +{ + struct inode *inode = dentry->d_inode; + char *buffer; + ssize_t size = 0; + int xattr_size; + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto release; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* compute required size of list */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) + size += name_size(ea) + 1; + } + + if (!data) + goto release; + + if (size > buf_size) { + size = -ERANGE; + goto release; + } + + /* Copy attribute names to buffer */ + buffer = data; + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) { + int namelen = copy_name(buffer, ea); + buffer += namelen + 1; + } + } + + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + return size; +} + +int jfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct jfs_inode_info *ji = JFS_IP(inode); + int rc; + tid_t tid; + + if ((rc = can_set_xattr(inode, name, NULL, 0))) + return rc; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +#ifdef CONFIG_JFS_SECURITY +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int rc; + size_t len; + void *value; + char *suffix; + char *name; + + rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); + if (rc) { + if (rc == -EOPNOTSUPP) + return 0; + return rc; + } + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), + GFP_NOFS); + if (!name) { + rc = -ENOMEM; + goto kmalloc_failed; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + + rc = __jfs_setxattr(tid, inode, name, value, len, 0); + + kfree(name); +kmalloc_failed: + kfree(suffix); + kfree(value); + + return rc; +} +#endif diff --git a/fs/libfs.c b/fs/libfs.c new file mode 100644 index 00000000..275ca474 --- /dev/null +++ b/fs/libfs.c @@ -0,0 +1,997 @@ +/* + * fs/libfs.c + * Library for filesystems writers. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); + return 0; +} + +int simple_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. + */ +static int simple_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +/* + * Lookup the data. This is trivial - if the dentry didn't already + * exist, we know it is negative. Set d_op to delete negative dentries. + */ +struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + static const struct dentry_operations simple_dentry_operations = { + .d_delete = simple_delete_dentry, + }; + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, NULL); + return NULL; +} + +int dcache_dir_open(struct inode *inode, struct file *file) +{ + static struct qstr cursor_name = {.len = 1, .name = "."}; + + file->private_data = d_alloc(file->f_path.dentry, &cursor_name); + + return file->private_data ? 0 : -ENOMEM; +} + +int dcache_dir_close(struct inode *inode, struct file *file) +{ + dput(file->private_data); + return 0; +} + +loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) +{ + struct dentry *dentry = file->f_path.dentry; + mutex_lock(&dentry->d_inode->i_mutex); + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&dentry->d_inode->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct list_head *p; + struct dentry *cursor = file->private_data; + loff_t n = file->f_pos - 2; + + spin_lock(&dentry->d_lock); + /* d_lock not required for cursor */ + list_del(&cursor->d_u.d_child); + p = dentry->d_subdirs.next; + while (n && p != &dentry->d_subdirs) { + struct dentry *next; + next = list_entry(p, struct dentry, d_u.d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(next)) + n--; + spin_unlock(&next->d_lock); + p = p->next; + } + list_add_tail(&cursor->d_u.d_child, p); + spin_unlock(&dentry->d_lock); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + return offset; +} + +/* Relationship between i_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +/* + * Directory is locked and all positive dentries in it are safe, since + * for ramfs-type trees they can't go away without unlink() or rmdir(), + * both impossible due to the lock on directory. 
+ */ + +int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct dentry *cursor = filp->private_data; + struct list_head *p, *q = &cursor->d_u.d_child; + ino_t ino; + int i = filp->f_pos; + + switch (i) { + case 0: + ino = dentry->d_inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + default: + spin_lock(&dentry->d_lock); + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { + struct dentry *next; + next = list_entry(p, struct dentry, d_u.d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); + continue; + } + + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); + if (filldir(dirent, next->d_name.name, + next->d_name.len, filp->f_pos, + next->d_inode->i_ino, + dt_type(next->d_inode)) < 0) + return 0; + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + /* next is still alive */ + list_move(q, p); + spin_unlock(&next->d_lock); + p = q; + filp->f_pos++; + } + spin_unlock(&dentry->d_lock); + } + return 0; +} + +ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) +{ + return -EISDIR; +} + +const struct file_operations simple_dir_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .llseek = dcache_dir_lseek, + .read = generic_read_dir, + .readdir = dcache_readdir, + .fsync = noop_fsync, +}; + +const struct inode_operations simple_dir_inode_operations = { + .lookup = simple_lookup, +}; + +static const struct super_operations simple_super_operations = { + .statfs = simple_statfs, +}; + +/* + * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that + * will never be mountable) + */ +struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, + const struct dentry_operations *dops, unsigned long magic) +{ + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct dentry *dentry; + struct inode *root; + struct qstr d_name = {.name = name, .len = strlen(name)}; + + if (IS_ERR(s)) + return ERR_CAST(s); + + s->s_flags = MS_NOUSER; + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = magic; + s->s_op = ops ? ops : &simple_super_operations; + s->s_time_gran = 1; + root = new_inode(s); + if (!root) + goto Enomem; + /* + * since this is the first inode, make it number 1. New inodes created + * after this must take care not to collide with it (by passing + * max_reserved of 1 to iunique). 
+ */ + root->i_ino = 1; + root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; + root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; + dentry = d_alloc(NULL, &d_name); + if (!dentry) { + iput(root); + goto Enomem; + } + dentry->d_sb = s; + dentry->d_parent = dentry; + d_instantiate(dentry, root); + s->s_root = dentry; + s->s_d_op = dops; + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); + +Enomem: + deactivate_locked_super(s); + return ERR_PTR(-ENOMEM); +} + +int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inc_nlink(inode); + ihold(inode); + dget(dentry); + d_instantiate(dentry, inode); + return 0; +} + +int simple_empty(struct dentry *dentry) +{ + struct dentry *child; + int ret = 0; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + goto out; + } + spin_unlock(&child->d_lock); + } + ret = 1; +out: + spin_unlock(&dentry->d_lock); + return ret; +} + +int simple_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + drop_nlink(inode); + dput(dentry); + return 0; +} + +int simple_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + drop_nlink(dentry->d_inode); + simple_unlink(dir, dentry); + drop_nlink(dir); + return 0; +} + +int simple_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (new_dentry->d_inode) { + simple_unlink(new_dir, new_dentry); + if (they_are_dirs) + drop_nlink(old_dir); + } else if (they_are_dirs) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } + + old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = + new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; + + return 0; +} + +/** + * simple_setattr - setattr for simple filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr is a simple ->setattr implementation without a proper + * implementation of size changes. + * + * It can either be used for in-memory filesystems or special files + * on simple regular filesystems. Anything that needs to change on-disk + * or wire state on size changes needs its own setattr method. 
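A pseudo filesystem built on the helper above needs very little glue: its .mount callback forwards to mount_pseudo() with a name and a magic number, and kill_anon_super() tears the superblock down. A minimal sketch under assumed names (examplefs and its magic value are invented for illustration):

#define EXAMPLEFS_MAGIC	0x4578614d	/* arbitrary, for illustration */

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return mount_pseudo(fs_type, "examplefs:", NULL, NULL,
			    EXAMPLEFS_MAGIC);
}

static struct file_system_type examplefs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};

Kernel-internal users would then obtain a mount with kern_mount(&examplefs_type) and create inodes against it; nothing is ever reachable through mount(2) because mount_pseudo() marks the superblock MS_NOUSER.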
+ */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + WARN_ON_ONCE(inode->i_op->truncate); + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) + truncate_setsize(inode, iattr->ia_size); + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + return 0; +} +EXPORT_SYMBOL(simple_setattr); + +int simple_readpage(struct file *file, struct page *page) +{ + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; +} + +int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *page; + pgoff_t index; + + index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); + } + return 0; +} + +/** + * simple_write_end - .write_end helper for non-block-device FSes + * @available: See .write_end of address_space_operations + * @file: " + * @mapping: " + * @pos: " + * @len: " + * @copied: " + * @page: " + * @fsdata: " + * + * simple_write_end does the minimum needed for updating a page after writing is + * done. It has the same API signature as the .write_end of + * address_space_operations vector. So it can just be set onto .write_end for + * FSes that don't need any other processing. i_mutex is assumed to be held. + * Block based filesystems should use generic_write_end(). + * NOTE: Even though i_size might get updated by this function, mark_inode_dirty + * is not called, so a filesystem that actually does store data in .write_inode + * should extend on what's done here with a call to mark_inode_dirty() in the + * case that i_size has changed. + */ +int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + +/* + * the inodes created here are not hashed. If you use iunique to generate + * unique inode values later for this filesystem, then you must take care + * to pass it an appropriate max_reserved value to avoid collisions. 
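Taken together, simple_readpage, simple_write_begin and simple_write_end are enough to back a page-cache-only file: a RAM-backed filesystem can point its address_space_operations straight at them, roughly as sketched below (the struct name is illustrative, not taken from this patch):

static const struct address_space_operations examplefs_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
};

The table would be installed on inode->i_mapping->a_ops when the inode is created, so reads see zero-filled pages and writes stay in the page cache.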
+ */ +int simple_fill_super(struct super_block *s, unsigned long magic, + struct tree_descr *files) +{ + struct inode *inode; + struct dentry *root; + struct dentry *dentry; + int i; + + s->s_blocksize = PAGE_CACHE_SIZE; + s->s_blocksize_bits = PAGE_CACHE_SHIFT; + s->s_magic = magic; + s->s_op = &simple_super_operations; + s->s_time_gran = 1; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + /* + * because the root inode is 1, the files array must not contain an + * entry at index 1 + */ + inode->i_ino = 1; + inode->i_mode = S_IFDIR | 0755; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + for (i = 0; !files->name || files->name[0]; i++, files++) { + if (!files->name) + continue; + + /* warn if it tries to conflict with the root inode */ + if (unlikely(i == 1)) + printk(KERN_WARNING "%s: %s passed in a files array" + "with an index of 1!\n", __func__, + s->s_type->name); + + dentry = d_alloc_name(root, files->name); + if (!dentry) + goto out; + inode = new_inode(s); + if (!inode) + goto out; + inode->i_mode = S_IFREG | files->mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_fop = files->ops; + inode->i_ino = i; + d_add(dentry, inode); + } + s->s_root = root; + return 0; +out: + d_genocide(root); + dput(root); + return -ENOMEM; +} + +static DEFINE_SPINLOCK(pin_fs_lock); + +int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) +{ + struct vfsmount *mnt = NULL; + spin_lock(&pin_fs_lock); + if (unlikely(!*mount)) { + spin_unlock(&pin_fs_lock); + mnt = vfs_kern_mount(type, 0, type->name, NULL); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + spin_lock(&pin_fs_lock); + if (!*mount) + *mount = mnt; + } + mntget(*mount); + ++*count; + spin_unlock(&pin_fs_lock); + mntput(mnt); + return 0; +} + +void simple_release_fs(struct vfsmount **mount, int *count) +{ + struct vfsmount *mnt; + spin_lock(&pin_fs_lock); + mnt = *mount; + if (!--*count) + *mount = NULL; + spin_unlock(&pin_fs_lock); + mntput(mnt); +} + +/** + * simple_read_from_buffer - copy data from the buffer to user space + * @to: the user space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The simple_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the user space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. 
+ **/ +ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + size_t ret; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + ret = copy_to_user(to, from + pos, count); + if (ret == count) + return -EFAULT; + count -= ret; + *ppos = pos + count; + return count; +} + +/** + * simple_write_to_buffer - copy data from user space to the buffer + * @to: the buffer to write to + * @available: the size of the buffer + * @ppos: the current position in the buffer + * @from: the user space buffer to read from + * @count: the maximum number of bytes to read + * + * The simple_write_to_buffer() function reads up to @count bytes from the user + * space address starting at @from into the buffer @to at offset @ppos. + * + * On success, the number of bytes written is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, + const void __user *from, size_t count) +{ + loff_t pos = *ppos; + size_t res; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + res = copy_from_user(to + pos, from, count); + if (res == count) + return -EFAULT; + count -= res; + *ppos = pos + count; + return count; +} + +/** + * memory_read_from_buffer - copy data from the buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the kernel space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} + +/* + * Transaction based IO. + * The file expects a single write which triggers the transaction, and then + * possibly a read which collects the result - which is stored in a + * file-local buffer. + */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. 
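These buffer helpers do all of the offset and short-copy bookkeeping, so a trivial read-only attribute file (debugfs- or procfs-style) reduces to a one-line ->read handler. A minimal sketch using the helper defined above:

static const char example_msg[] = "hello from examplefs\n";

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	return simple_read_from_buffer(buf, count, ppos,
				       example_msg, sizeof(example_msg) - 1);
}

static const struct file_operations example_fops = {
	.read	= example_read,
	.llseek	= default_llseek,
};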
+ */ + smp_mb(); + ar->size = n; +} + +char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) +{ + struct simple_transaction_argresp *ar; + static DEFINE_SPINLOCK(simple_transaction_lock); + + if (size > SIMPLE_TRANSACTION_LIMIT - 1) + return ERR_PTR(-EFBIG); + + ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); + if (!ar) + return ERR_PTR(-ENOMEM); + + spin_lock(&simple_transaction_lock); + + /* only one write allowed per open */ + if (file->private_data) { + spin_unlock(&simple_transaction_lock); + free_page((unsigned long)ar); + return ERR_PTR(-EBUSY); + } + + file->private_data = ar; + + spin_unlock(&simple_transaction_lock); + + if (copy_from_user(ar->data, buf, size)) + return ERR_PTR(-EFAULT); + + return ar->data; +} + +ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ + struct simple_transaction_argresp *ar = file->private_data; + + if (!ar) + return 0; + return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); +} + +int simple_transaction_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +/* Simple attribute files */ + +struct simple_attr { + int (*get)(void *, u64 *); + int (*set)(void *, u64); + char get_buf[24]; /* enough to store a u64 and "\n\0" */ + char set_buf[24]; + void *data; + const char *fmt; /* format for read operation */ + struct mutex mutex; /* protects access to these buffers */ +}; + +/* simple_attr_open is called by an actual attribute open file operation + * to set the attribute specific access operations. */ +int simple_attr_open(struct inode *inode, struct file *file, + int (*get)(void *, u64 *), int (*set)(void *, u64), + const char *fmt) +{ + struct simple_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->get = get; + attr->set = set; + attr->data = inode->i_private; + attr->fmt = fmt; + mutex_init(&attr->mutex); + + file->private_data = attr; + + return nonseekable_open(inode, file); +} + +int simple_attr_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +/* read from the buffer that is filled with the get function */ +ssize_t simple_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + size_t size; + ssize_t ret; + + attr = file->private_data; + + if (!attr->get) + return -EACCES; + + ret = mutex_lock_interruptible(&attr->mutex); + if (ret) + return ret; + + if (*ppos) { /* continued read */ + size = strlen(attr->get_buf); + } else { /* first read */ + u64 val; + ret = attr->get(attr->data, &val); + if (ret) + goto out; + + size = scnprintf(attr->get_buf, sizeof(attr->get_buf), + attr->fmt, (unsigned long long)val); + } + + ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); +out: + mutex_unlock(&attr->mutex); + return ret; +} + +/* interpret the buffer as a number to call the set function with */ +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + u64 val; + size_t size; + ssize_t ret; + + attr = file->private_data; + if (!attr->set) + return -EACCES; + + ret = mutex_lock_interruptible(&attr->mutex); + if (ret) + return ret; + + ret = -EFAULT; + size = min(sizeof(attr->set_buf) - 1, len); + if (copy_from_user(attr->set_buf, buf, size)) + goto out; + + attr->set_buf[size] = '\0'; + val = simple_strtoll(attr->set_buf, NULL, 0); + ret = 
attr->set(attr->data, val); + if (ret == 0) + ret = len; /* on success, claim we got the whole input */ +out: + mutex_unlock(&attr->mutex); + return ret; +} + +/** + * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation + * @sb: filesystem to do the file handle conversion on + * @fid: file handle to convert + * @fh_len: length of the file handle in bytes + * @fh_type: type of file handle + * @get_inode: filesystem callback to retrieve inode + * + * This function decodes @fid as long as it has one of the well-known + * Linux filehandle types and calls @get_inode on it to retrieve the + * inode for the object specified in the file handle. + */ +struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type, struct inode *(*get_inode) + (struct super_block *sb, u64 ino, u32 gen)) +{ + struct inode *inode = NULL; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + inode = get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + } + + return d_obtain_alias(inode); +} +EXPORT_SYMBOL_GPL(generic_fh_to_dentry); + +/** + * generic_fh_to_dentry - generic helper for the fh_to_parent export operation + * @sb: filesystem to do the file handle conversion on + * @fid: file handle to convert + * @fh_len: length of the file handle in bytes + * @fh_type: type of file handle + * @get_inode: filesystem callback to retrieve inode + * + * This function decodes @fid as long as it has one of the well-known + * Linux filehandle types and calls @get_inode on it to retrieve the + * inode for the _parent_ object specified in the file handle if it + * is specified in the file handle, or NULL otherwise. + */ +struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type, struct inode *(*get_inode) + (struct super_block *sb, u64 ino, u32 gen)) +{ + struct inode *inode = NULL; + + if (fh_len <= 2) + return NULL; + + switch (fh_type) { + case FILEID_INO32_GEN_PARENT: + inode = get_inode(sb, fid->i32.parent_ino, + (fh_len > 3 ? fid->i32.parent_gen : 0)); + break; + } + + return d_obtain_alias(inode); +} +EXPORT_SYMBOL_GPL(generic_fh_to_parent); + +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * @file: file to synchronize + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int generic_file_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (ret == 0) + ret = err; + return ret; +} +EXPORT_SYMBOL(generic_file_fsync); + +/** + * generic_check_addressable - Check addressability of file system + * @blocksize_bits: log of file system block size + * @num_blocks: number of blocks in file system + * + * Determine whether a file system with @num_blocks blocks (and a + * block size of 2**@blocksize_bits) is addressable by the sector_t + * and page cache of the system. Return 0 if so and -EFBIG otherwise. 
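Filesystems whose file handles carry a 32-bit inode number and generation (as the jfs_export_operations earlier in this patch do) can delegate handle decoding to these generic helpers and supply only the inode lookup. A hedged sketch of the wiring; the examplefs_* names, including the iget helper, are invented:

static struct inode *examplefs_nfs_get_inode(struct super_block *sb,
					     u64 ino, u32 generation)
{
	struct inode *inode = examplefs_iget(sb, ino);	/* hypothetical iget helper */

	if (IS_ERR(inode))
		return inode;
	if (generation && inode->i_generation != generation) {
		/* stale handle: the inode number was reused */
		iput(inode);
		return ERR_PTR(-ESTALE);
	}
	return inode;
}

static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
					     struct fid *fid,
					     int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    examplefs_nfs_get_inode);
}

static const struct export_operations examplefs_export_ops = {
	.fh_to_dentry	= examplefs_fh_to_dentry,
	/* .fh_to_parent could be wired to generic_fh_to_parent the same way */
};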
+ */ +int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) +{ + u64 last_fs_block = num_blocks - 1; + u64 last_fs_page = + last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); + + if (unlikely(num_blocks == 0)) + return 0; + + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + return -EINVAL; + + if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { + return -EFBIG; + } + return 0; +} +EXPORT_SYMBOL(generic_check_addressable); + +/* + * No-op implementation of ->fsync for in-memory filesystems. + */ +int noop_fsync(struct file *file, int datasync) +{ + return 0; +} + +EXPORT_SYMBOL(dcache_dir_close); +EXPORT_SYMBOL(dcache_dir_lseek); +EXPORT_SYMBOL(dcache_dir_open); +EXPORT_SYMBOL(dcache_readdir); +EXPORT_SYMBOL(generic_read_dir); +EXPORT_SYMBOL(mount_pseudo); +EXPORT_SYMBOL(simple_write_begin); +EXPORT_SYMBOL(simple_write_end); +EXPORT_SYMBOL(simple_dir_inode_operations); +EXPORT_SYMBOL(simple_dir_operations); +EXPORT_SYMBOL(simple_empty); +EXPORT_SYMBOL(simple_fill_super); +EXPORT_SYMBOL(simple_getattr); +EXPORT_SYMBOL(simple_link); +EXPORT_SYMBOL(simple_lookup); +EXPORT_SYMBOL(simple_pin_fs); +EXPORT_SYMBOL(simple_readpage); +EXPORT_SYMBOL(simple_release_fs); +EXPORT_SYMBOL(simple_rename); +EXPORT_SYMBOL(simple_rmdir); +EXPORT_SYMBOL(simple_statfs); +EXPORT_SYMBOL(noop_fsync); +EXPORT_SYMBOL(simple_unlink); +EXPORT_SYMBOL(simple_read_from_buffer); +EXPORT_SYMBOL(simple_write_to_buffer); +EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); +EXPORT_SYMBOL(simple_transaction_get); +EXPORT_SYMBOL(simple_transaction_read); +EXPORT_SYMBOL(simple_transaction_release); +EXPORT_SYMBOL_GPL(simple_attr_open); +EXPORT_SYMBOL_GPL(simple_attr_release); +EXPORT_SYMBOL_GPL(simple_attr_read); +EXPORT_SYMBOL_GPL(simple_attr_write); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile new file mode 100644 index 00000000..ca58d643 --- /dev/null +++ b/fs/lockd/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the linux lock manager stuff +# + +obj-$(CONFIG_LOCKD) += lockd.o + +lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o +lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c new file mode 100644 index 00000000..046bb77c --- /dev/null +++ b/fs/lockd/clnt4xdr.c @@ -0,0 +1,605 @@ +/* + * linux/fs/lockd/clnt4xdr.c + * + * XDR functions to encode/decode NLM version 4 RPC arguments and results. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) +# error "NLM host name cannot be larger than NLM's maximum string length!" 
+#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM4_void_sz (0) +#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2)) +#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz) +#define NLM4_holder_sz (6+NLM4_owner_sz) + +#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz) +#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz) +#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz) +#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz) + +#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz) +#define NLM4_res_sz (NLM4_cookie_sz+1) +#define NLM4_norep_sz (0) + + +static s64 loff_t_to_s64(loff_t offset) +{ + s64 res; + + if (offset >= NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset <= -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm4_compute_offsets(const struct nlm_lock *lock, + u64 *l_offset, u64 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM4_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM4_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv4 basic data types + * + * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 + * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter + * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? 
xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size > NFS3_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, fh->size); +} + +/* + * enum nlm4_stats { + * NLM4_GRANTED = 0, + * NLM4_DENIED = 1, + * NLM4_DENIED_NOLOCKS = 2, + * NLM4_BLOCKED = 3, + * NLM4_DENIED_GRACE_PERIOD = 4, + * NLM4_DEADLCK = 5, + * NLM4_ROFS = 6, + * NLM4_STALE_FH = 7, + * NLM4_FBIG = 8, + * NLM4_FAILED = 9 + * }; + * + * struct nlm4_stat { + * nlm4_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
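+ * In practice this means the helpers below treat the status as a raw
+ * __be32: encode_nlm4_stat() stores it untouched, and
+ * decode_nlm4_stat() byte-swaps only transiently in order to
+ * range-check the value against the enum above.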
+ */ +static void encode_nlm4_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_FAILED); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(ntohl(*p) > ntohl(nlm4_failed))) + goto out_bad_xdr; + *stat = *p; + return 0; +out_bad_xdr: + dprintk("%s: server returned invalid nlm4_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm4_holder { + * bool exclusive; + * int32 svid; + * netobj oh; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u64 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + +static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u64 l_offset, l_len; + u32 exclusive; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 8 + 8); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + p = xdr_decode_hyper(p, &l_offset); + xdr_decode_hyper(p, &l_len); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm4_lock { + * string caller_name; + * netobj fh; + * netobj oh; + * int32 svid; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u64 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 8 + 8); + *p++ = cpu_to_be32(lock->svid); + + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + + +/* + * NLMv4 XDR encode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
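+ *
+ * Each encoder below simply writes out the fields of the corresponding
+ * argument structure in declaration order, using the basic data type
+ * encoders defined earlier in this file.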
+ */ + +/* + * struct nlm4_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm4_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_unlockargs { + * netobj cookie; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static void nlm4_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); +} + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static void nlm4_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); + if (result->status == nlm_lck_denied) + encode_nlm4_holder(xdr, result); +} + + +/* + * NLMv4 XDR decode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
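+ *
+ * Being client-side only, this file decodes result types exclusively:
+ * nlm4_xdr_dec_testres() and nlm4_xdr_dec_res(), plus the NULL
+ * "norep" decoder used by the _MSG and _RES procedures in the table
+ * at the end of the file.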
+ */ + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static int decode_nlm4_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm4_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm4_holder(xdr, result); +out: + return error; +} + +static int nlm4_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static int nlm4_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm4_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \ + .p_arglen = NLM4_##argtype##_sz, \ + .p_replen = NLM4_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm4_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +struct rpc_version nlm_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nlm4_procedures), + .procs = nlm4_procedures, +}; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c new file mode 100644 index 00000000..8d4ea835 --- /dev/null +++ b/fs/lockd/clntlock.c @@ -0,0 +1,279 @@ +/* + * linux/fs/lockd/clntlock.c + * + * Lock handling for the client side NLM implementation + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +/* + * Local function prototypes + */ +static int reclaimer(void *ptr); + +/* + * The following functions handle blocking and granting from the + * client perspective. + */ + +/* + * This is the representation of a blocked client lock. + */ +struct nlm_wait { + struct list_head b_list; /* linked list */ + wait_queue_head_t b_wait; /* where to wait on */ + struct nlm_host * b_host; + struct file_lock * b_lock; /* local file lock */ + unsigned short b_reclaim; /* got to reclaim lock */ + __be32 b_status; /* grant callback status */ +}; + +static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); + +/** + * nlmclnt_init - Set up per-NFS mount point lockd data structures + * @nlm_init: pointer to arguments structure + * + * Returns pointer to an appropriate nlm_host struct, + * or an ERR_PTR value. 
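+ *
+ * Illustrative (not prescriptive) call sequence for a caller that has
+ * filled in a struct nlmclnt_initdata named nlm_init:
+ *
+ *	host = nlmclnt_init(&nlm_init);
+ *	if (IS_ERR(host))
+ *		return PTR_ERR(host);
+ *	...
+ *	nlmclnt_done(host);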
+ */ +struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) +{ + struct nlm_host *host; + u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; + int status; + + status = lockd_up(); + if (status < 0) + return ERR_PTR(status); + + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, + nlm_init->protocol, nlm_version, + nlm_init->hostname, nlm_init->noresvport); + if (host == NULL) { + lockd_down(); + return ERR_PTR(-ENOLCK); + } + + return host; +} +EXPORT_SYMBOL_GPL(nlmclnt_init); + +/** + * nlmclnt_done - Release resources allocated by nlmclnt_init() + * @host: nlm_host structure reserved by nlmclnt_init() + * + */ +void nlmclnt_done(struct nlm_host *host) +{ + nlmclnt_release_host(host); + lockd_down(); +} +EXPORT_SYMBOL_GPL(nlmclnt_done); + +/* + * Queue up a lock for blocking so that the GRANTED request can see it + */ +struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) +{ + struct nlm_wait *block; + + block = kmalloc(sizeof(*block), GFP_KERNEL); + if (block != NULL) { + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = nlm_lck_blocked; + + spin_lock(&nlm_blocked_lock); + list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); + } + return block; +} + +void nlmclnt_finish_block(struct nlm_wait *block) +{ + if (block == NULL) + return; + spin_lock(&nlm_blocked_lock); + list_del(&block->b_list); + spin_unlock(&nlm_blocked_lock); + kfree(block); +} + +/* + * Block on a lock + */ +int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) +{ + long ret; + + /* A borken server might ask us to block even if we didn't + * request it. Just say no! + */ + if (block == NULL) + return -EAGAIN; + + /* Go to sleep waiting for GRANT callback. Some servers seem + * to lose callbacks, however, so we're going to poll from + * time to time just to make sure. + * + * For now, the retry frequency is pretty high; normally + * a 1 minute timeout would do. See the comment before + * nlmclnt_lock for an explanation. + */ + ret = wait_event_interruptible_timeout(block->b_wait, + block->b_status != nlm_lck_blocked, + timeout); + if (ret < 0) + return -ERESTARTSYS; + req->a_res.status = block->b_status; + return 0; +} + +/* + * The server lockd has called us back to tell us the lock was granted + */ +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) +{ + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; + struct nlm_wait *block; + __be32 res = nlm_lck_denied; + + /* + * Look up blocked request based on arguments. + * Warning: must not use cookie to match it! + */ + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + struct file_lock *fl_blocked = block->b_lock; + + if (fl_blocked->fl_start != fl->fl_start) + continue; + if (fl_blocked->fl_end != fl->fl_end) + continue; + /* + * Careful! The NLM server will return the 32-bit "pid" that + * we put on the wire: in this case the lockowner "pid". + */ + if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) + continue; + if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. 
Set the return status + * and wake up the caller + */ + block->b_status = nlm_granted; + wake_up(&block->b_wait); + res = nlm_granted; + } + spin_unlock(&nlm_blocked_lock); + return res; +} + +/* + * The following procedures deal with the recovery of locks after a + * server crash. + */ + +/* + * Reclaim all locks on server host. We do this by spawning a separate + * reclaimer thread. + */ +void +nlmclnt_recovery(struct nlm_host *host) +{ + struct task_struct *task; + + if (!host->h_reclaiming++) { + nlm_get_host(host); + task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name); + if (IS_ERR(task)) + printk(KERN_ERR "lockd: unable to spawn reclaimer " + "thread. Locks for %s won't be reclaimed! " + "(%ld)\n", host->h_name, PTR_ERR(task)); + } +} + +static int +reclaimer(void *ptr) +{ + struct nlm_host *host = (struct nlm_host *) ptr; + struct nlm_wait *block; + struct file_lock *fl, *next; + u32 nsmstate; + + allow_signal(SIGKILL); + + down_write(&host->h_rwsem); + lockd_up(); /* note: this cannot fail as lockd is already running */ + + dprintk("lockd: reclaiming locks for host %s\n", host->h_name); + +restart: + nsmstate = host->h_nsmstate; + + /* Force a portmap getport - the peer's lockd will + * most likely end up on a different port. + */ + host->h_nextrebind = jiffies; + nlm_rebind_host(host); + + /* First, reclaim all locks that have been granted. */ + list_splice_init(&host->h_granted, &host->h_reclaim); + list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { + list_del_init(&fl->fl_u.nfs_fl.list); + + /* + * sending this thread a SIGKILL will result in any unreclaimed + * locks being removed from the h_granted list. This means that + * the kernel will not attempt to reclaim them again if a new + * reclaimer thread is spawned for this host. + */ + if (signalled()) + continue; + if (nlmclnt_reclaim(host, fl) != 0) + continue; + list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); + if (host->h_nsmstate != nsmstate) { + /* Argh! The server rebooted again! 
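+ * A changed h_nsmstate means another reboot occurred while we
+ * were still reclaiming, so the whole pass is restarted.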
*/ + goto restart; + } + } + + host->h_reclaiming = 0; + up_write(&host->h_rwsem); + dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); + + /* Now, wake up all processes that sleep on a blocked lock */ + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (block->b_host == host) { + block->b_status = nlm_lck_denied_grace_period; + wake_up(&block->b_wait); + } + } + spin_unlock(&nlm_blocked_lock); + + /* Release host handle after use */ + nlmclnt_release_host(host); + lockd_down(); + return 0; +} diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c new file mode 100644 index 00000000..e374050a --- /dev/null +++ b/fs/lockd/clntproc.c @@ -0,0 +1,848 @@ +/* + * linux/fs/lockd/clntproc.c + * + * RPC procedures for the client side NLM implementation + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT +#define NLMCLNT_GRACE_WAIT (5*HZ) +#define NLMCLNT_POLL_TIMEOUT (30*HZ) +#define NLMCLNT_MAX_RETRIES 3 + +static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); +static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); +static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); +static int nlm_stat_to_errno(__be32 stat); +static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); + +static const struct rpc_call_ops nlmclnt_unlock_ops; +static const struct rpc_call_ops nlmclnt_cancel_ops; + +/* + * Cookie counter for NLM requests + */ +static atomic_t nlm_cookie = ATOMIC_INIT(0x1234); + +void nlmclnt_next_cookie(struct nlm_cookie *c) +{ + u32 cookie = atomic_inc_return(&nlm_cookie); + + memcpy(c->data, &cookie, 4); + c->len=4; +} + +static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) +{ + atomic_inc(&lockowner->count); + return lockowner; +} + +static void nlm_put_lockowner(struct nlm_lockowner *lockowner) +{ + if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) + return; + list_del(&lockowner->list); + spin_unlock(&lockowner->host->h_lock); + nlmclnt_release_host(lockowner->host); + kfree(lockowner); +} + +static inline int nlm_pidbusy(struct nlm_host *host, uint32_t pid) +{ + struct nlm_lockowner *lockowner; + list_for_each_entry(lockowner, &host->h_lockowners, list) { + if (lockowner->pid == pid) + return -EBUSY; + } + return 0; +} + +static inline uint32_t __nlm_alloc_pid(struct nlm_host *host) +{ + uint32_t res; + do { + res = host->h_pidcount++; + } while (nlm_pidbusy(host, res) < 0); + return res; +} + +static struct nlm_lockowner *__nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner) +{ + struct nlm_lockowner *lockowner; + list_for_each_entry(lockowner, &host->h_lockowners, list) { + if (lockowner->owner != owner) + continue; + return nlm_get_lockowner(lockowner); + } + return NULL; +} + +static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner) +{ + struct nlm_lockowner *res, *new = NULL; + + spin_lock(&host->h_lock); + res = __nlm_find_lockowner(host, owner); + if (res == NULL) { + spin_unlock(&host->h_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + spin_lock(&host->h_lock); + res = __nlm_find_lockowner(host, owner); + if (res == NULL && new != NULL) { + res = new; + atomic_set(&new->count, 1); + new->owner = owner; + new->pid = __nlm_alloc_pid(host); + new->host = nlm_get_host(host); + 
list_add(&new->list, &host->h_lockowners); + new = NULL; + } + } + spin_unlock(&host->h_lock); + kfree(new); + return res; +} + +/* + * Initialize arguments for TEST/LOCK/UNLOCK/CANCEL calls + */ +static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) +{ + struct nlm_args *argp = &req->a_args; + struct nlm_lock *lock = &argp->lock; + + nlmclnt_next_cookie(&argp->cookie); + memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh)); + lock->caller = utsname()->nodename; + lock->oh.data = req->a_owner; + lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", + (unsigned int)fl->fl_u.nfs_fl.owner->pid, + utsname()->nodename); + lock->svid = fl->fl_u.nfs_fl.owner->pid; + lock->fl.fl_start = fl->fl_start; + lock->fl.fl_end = fl->fl_end; + lock->fl.fl_type = fl->fl_type; +} + +static void nlmclnt_release_lockargs(struct nlm_rqst *req) +{ + BUG_ON(req->a_args.lock.fl.fl_ops != NULL); +} + +/** + * nlmclnt_proc - Perform a single client-side lock request + * @host: address of a valid nlm_host context representing the NLM server + * @cmd: fcntl-style file lock operation to perform + * @fl: address of arguments for the lock operation + * + */ +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +{ + struct nlm_rqst *call; + int status; + + nlm_get_host(host); + call = nlm_alloc_call(host); + if (call == NULL) + return -ENOMEM; + + nlmclnt_locks_init_private(fl, host); + /* Set up the argument struct */ + nlmclnt_setlockargs(call, fl); + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { + if (fl->fl_type != F_UNLCK) { + call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; + status = nlmclnt_lock(call, fl); + } else + status = nlmclnt_unlock(call, fl); + } else if (IS_GETLK(cmd)) + status = nlmclnt_test(call, fl); + else + status = -EINVAL; + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + + dprintk("lockd: clnt proc returns %d\n", status); + return status; +} +EXPORT_SYMBOL_GPL(nlmclnt_proc); + +/* + * Allocate an NLM RPC call struct + * + * Note: the caller must hold a reference to host. In case of failure, + * this reference will be released. 
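+ *
+ * Allocation failures are retried every five seconds until either the
+ * allocation succeeds or the calling process is signalled; only in the
+ * signalled case is the host reference dropped and NULL returned.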
+ */ +struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) +{ + struct nlm_rqst *call; + + for(;;) { + call = kzalloc(sizeof(*call), GFP_KERNEL); + if (call != NULL) { + atomic_set(&call->a_count, 1); + locks_init_lock(&call->a_args.lock.fl); + locks_init_lock(&call->a_res.lock.fl); + call->a_host = host; + return call; + } + if (signalled()) + break; + printk("nlm_alloc_call: failed, waiting for memory\n"); + schedule_timeout_interruptible(5*HZ); + } + nlmclnt_release_host(host); + return NULL; +} + +void nlmclnt_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmclnt_release_host(call->a_host); + nlmclnt_release_lockargs(call); + kfree(call); +} + +static void nlmclnt_rpc_release(void *data) +{ + nlmclnt_release_call(data); +} + +static int nlm_wait_on_grace(wait_queue_head_t *queue) +{ + DEFINE_WAIT(wait); + int status = -EINTR; + + prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE); + if (!signalled ()) { + schedule_timeout(NLMCLNT_GRACE_WAIT); + try_to_freeze(); + if (!signalled ()) + status = 0; + } + finish_wait(queue, &wait); + return status; +} + +/* + * Generic NLM call + */ +static int +nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) +{ + struct nlm_host *host = req->a_host; + struct rpc_clnt *clnt; + struct nlm_args *argp = &req->a_args; + struct nlm_res *resp = &req->a_res; + struct rpc_message msg = { + .rpc_argp = argp, + .rpc_resp = resp, + .rpc_cred = cred, + }; + int status; + + dprintk("lockd: call procedure %d on %s\n", + (int)proc, host->h_name); + + do { + if (host->h_reclaiming && !argp->reclaim) + goto in_grace_period; + + /* If we have no RPC client yet, create one. */ + if ((clnt = nlm_bind_host(host)) == NULL) + return -ENOLCK; + msg.rpc_proc = &clnt->cl_procinfo[proc]; + + /* Perform the RPC call. If an error occurs, try again */ + if ((status = rpc_call_sync(clnt, &msg, 0)) < 0) { + dprintk("lockd: rpc_call returned error %d\n", -status); + switch (status) { + case -EPROTONOSUPPORT: + status = -EINVAL; + break; + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ENOTCONN: + nlm_rebind_host(host); + status = -EAGAIN; + break; + case -ERESTARTSYS: + return signalled () ? -EINTR : status; + default: + break; + } + break; + } else + if (resp->status == nlm_lck_denied_grace_period) { + dprintk("lockd: server in grace period\n"); + if (argp->reclaim) { + printk(KERN_WARNING + "lockd: spurious grace period reject?!\n"); + return -ENOLCK; + } + } else { + if (!argp->reclaim) { + /* We appear to be out of the grace period */ + wake_up_all(&host->h_gracewait); + } + dprintk("lockd: server returns status %d\n", resp->status); + return 0; /* Okay, call complete */ + } + +in_grace_period: + /* + * The server has rebooted and appears to be in the grace + * period during which locks are only allowed to be + * reclaimed. + * We can only back off and try again later. + */ + status = nlm_wait_on_grace(&host->h_gracewait); + } while (status == 0); + + return status; +} + +/* + * Generic NLM call, async version. + */ +static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct nlm_host *host = req->a_host; + struct rpc_clnt *clnt; + struct rpc_task_setup task_setup_data = { + .rpc_message = msg, + .callback_ops = tk_ops, + .callback_data = req, + .flags = RPC_TASK_ASYNC, + }; + + dprintk("lockd: call procedure %d on %s (async)\n", + (int)proc, host->h_name); + + /* If we have no RPC client yet, create one. 
*/ + clnt = nlm_bind_host(host); + if (clnt == NULL) + goto out_err; + msg->rpc_proc = &clnt->cl_procinfo[proc]; + task_setup_data.rpc_client = clnt; + + /* bootstrap and kick off the async RPC call */ + return rpc_run_task(&task_setup_data); +out_err: + tk_ops->rpc_release(req); + return ERR_PTR(-ENOLCK); +} + +static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct rpc_task *task; + + task = __nlm_async_call(req, proc, msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +/* + * NLM asynchronous call. + */ +int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + }; + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_res, + }; + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +/* + * NLM client asynchronous call. + * + * Note that although the calls are asynchronous, and are therefore + * guaranteed to complete, we still always attempt to wait for + * completion in order to be able to correctly track the lock + * state. + */ +static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + .rpc_cred = cred, + }; + struct rpc_task *task; + int err; + + task = __nlm_async_call(req, proc, &msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + err = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return err; +} + +/* + * TEST for the presence of a conflicting lock + */ +static int +nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) +{ + int status; + + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); + if (status < 0) + goto out; + + switch (req->a_res.status) { + case nlm_granted: + fl->fl_type = F_UNLCK; + break; + case nlm_lck_denied: + /* + * Report the conflicting lock back to the application. 
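+ * The conflicting holder is remote, so its pid has no local
+ * meaning; fl_pid is cleared accordingly.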
+ */ + fl->fl_start = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_end; + fl->fl_type = req->a_res.lock.fl.fl_type; + fl->fl_pid = 0; + break; + default: + status = nlm_stat_to_errno(req->a_res.status); + } +out: + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); + new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; + new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); + list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); +} + +static void nlmclnt_locks_release_private(struct file_lock *fl) +{ + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); + list_del(&fl->fl_u.nfs_fl.list); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); + nlm_put_lockowner(fl->fl_u.nfs_fl.owner); +} + +static const struct file_lock_operations nlmclnt_lock_ops = { + .fl_copy_lock = nlmclnt_locks_copy_lock, + .fl_release_private = nlmclnt_locks_release_private, +}; + +static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) +{ + BUG_ON(fl->fl_ops != NULL); + fl->fl_u.nfs_fl.state = 0; + fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); + INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); + fl->fl_ops = &nlmclnt_lock_ops; +} + +static int do_vfs_lock(struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(fl->fl_file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(fl->fl_file, fl); + break; + default: + BUG(); + } + return res; +} + +/* + * LOCK: Try to create a lock + * + * Programmer Harassment Alert + * + * When given a blocking lock request in a sync RPC call, the HPUX lockd + * will faithfully return LCK_BLOCKED but never cares to notify us when + * the lock could be granted. This way, our local process could hang + * around forever waiting for the callback. + * + * Solution A: Implement busy-waiting + * Solution B: Use the async version of the call (NLM_LOCK_{MSG,RES}) + * + * For now I am implementing solution A, because I hate the idea of + * re-implementing lockd for a third time in two months. The async + * calls shouldn't be too hard to do, however. + * + * This is one of the lovely things about standards in the NFS area: + * they're so soft and squishy you can't really blame HP for doing this. + */ +static int +nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) +{ + struct rpc_cred *cred = nfs_file_cred(fl->fl_file); + struct nlm_host *host = req->a_host; + struct nlm_res *resp = &req->a_res; + struct nlm_wait *block = NULL; + unsigned char fl_flags = fl->fl_flags; + unsigned char fl_type; + int status = -ENOLCK; + + if (nsm_monitor(host) < 0) + goto out; + req->a_args.state = nsm_local_state; + + fl->fl_flags |= FL_ACCESS; + status = do_vfs_lock(fl); + fl->fl_flags = fl_flags; + if (status < 0) + goto out; + + block = nlmclnt_prepare_block(host, fl); +again: + /* + * Initialise resp->status to a valid non-zero value, + * since 0 == nlm_lck_granted + */ + resp->status = nlm_lck_blocked; + for(;;) { + /* Reboot protection */ + fl->fl_u.nfs_fl.state = host->h_state; + status = nlmclnt_call(cred, req, NLMPROC_LOCK); + if (status < 0) + break; + /* Did a reclaimer thread notify us of a server reboot? 
*/ + if (resp->status == nlm_lck_denied_grace_period) + continue; + if (resp->status != nlm_lck_blocked) + break; + /* Wait on an NLM blocking lock */ + status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + if (status < 0) + break; + if (resp->status != nlm_lck_blocked) + break; + } + + /* if we were interrupted while blocking, then cancel the lock request + * and exit + */ + if (resp->status == nlm_lck_blocked) { + if (!req->a_args.block) + goto out_unlock; + if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) + goto out_unblock; + } + + if (resp->status == nlm_granted) { + down_read(&host->h_rwsem); + /* Check whether or not the server has rebooted */ + if (fl->fl_u.nfs_fl.state != host->h_state) { + up_read(&host->h_rwsem); + goto again; + } + /* Ensure the resulting lock will get added to granted list */ + fl->fl_flags |= FL_SLEEP; + if (do_vfs_lock(fl) < 0) + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + status = 0; + } + if (status < 0) + goto out_unlock; + /* + * EAGAIN doesn't make sense for sleeping locks, and in some + * cases NLM_LCK_DENIED is returned for a permanent error. So + * turn it into an ENOLCK. + */ + if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + status = -ENOLCK; + else + status = nlm_stat_to_errno(resp->status); +out_unblock: + nlmclnt_finish_block(block); +out: + nlmclnt_release_call(req); + return status; +out_unlock: + /* Fatal error: ensure that we remove the lock altogether */ + dprintk("lockd: lock attempt ended in fatal error.\n" + " Attempting to unlock.\n"); + nlmclnt_finish_block(block); + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + down_read(&host->h_rwsem); + do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_type = fl_type; + fl->fl_flags = fl_flags; + nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + return status; +} + +/* + * RECLAIM: Try to reclaim a lock + */ +int +nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) +{ + struct nlm_rqst reqst, *req; + int status; + + req = &reqst; + memset(req, 0, sizeof(*req)); + locks_init_lock(&req->a_args.lock.fl); + locks_init_lock(&req->a_res.lock.fl); + req->a_host = host; + req->a_flags = 0; + + /* Set up the argument struct */ + nlmclnt_setlockargs(req, fl); + req->a_args.reclaim = 1; + + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + if (status >= 0 && req->a_res.status == nlm_granted) + return 0; + + printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " + "(errno %d, status %d)\n", fl->fl_pid, + status, ntohl(req->a_res.status)); + + /* + * FIXME: This is a serious failure. We can + * + * a. Ignore the problem + * b. Send the owning process some signal (Linux doesn't have + * SIGLOST, though...) + * c. Retry the operation + * + * Until someone comes up with a simple implementation + * for b or c, I'll choose option a. + */ + + return -ENOLCK; +} + +/* + * UNLOCK: remove an existing lock + */ +static int +nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) +{ + struct nlm_host *host = req->a_host; + struct nlm_res *resp = &req->a_res; + int status; + unsigned char fl_flags = fl->fl_flags; + + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. 
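+ *
+ * FL_EXISTS is set below so that the local unlock reports -ENOENT when
+ * no matching lock is found; in that case the UNLOCK RPC to the server
+ * is skipped entirely.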
+ */ + fl->fl_flags |= FL_EXISTS; + down_read(&host->h_rwsem); + status = do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + if (status == -ENOENT) { + status = 0; + goto out; + } + + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + if (status < 0) + goto out; + + if (resp->status == nlm_granted) + goto out; + + if (resp->status != nlm_lck_denied_nolocks) + printk("lockd: unexpected unlock status: %d\n", resp->status); + /* What to do now? I'm out of my depth... */ + status = -ENOLCK; +out: + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); + + if (RPC_ASSASSINATED(task)) + goto die; + + if (task->tk_status < 0) { + dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status); + switch (task->tk_status) { + case -EACCES: + case -EIO: + goto die; + default: + goto retry_rebind; + } + } + if (status == NLM_LCK_DENIED_GRACE_PERIOD) { + rpc_delay(task, NLMCLNT_GRACE_WAIT); + goto retry_unlock; + } + if (status != NLM_LCK_GRANTED) + printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); +die: + return; + retry_rebind: + nlm_rebind_host(req->a_host); + retry_unlock: + rpc_restart_call(task); +} + +static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_done = nlmclnt_unlock_callback, + .rpc_release = nlmclnt_rpc_release, +}; + +/* + * Cancel a blocked lock request. + * We always use an async RPC call for this in order not to hang a + * process that has been Ctrl-C'ed. + */ +static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) +{ + struct nlm_rqst *req; + int status; + + dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" + " Attempting to cancel lock.\n"); + + req = nlm_alloc_call(nlm_get_host(host)); + if (!req) + return -ENOMEM; + req->a_flags = RPC_TASK_ASYNC; + + nlmclnt_setlockargs(req, fl); + req->a_args.block = block; + + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); + if (status == 0 && req->a_res.status == nlm_lck_denied) + status = -ENOLCK; + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); + + if (RPC_ASSASSINATED(task)) + goto die; + + if (task->tk_status < 0) { + dprintk("lockd: CANCEL call error %d, retrying.\n", + task->tk_status); + goto retry_cancel; + } + + dprintk("lockd: cancel status %u (task %u)\n", + status, task->tk_pid); + + switch (status) { + case NLM_LCK_GRANTED: + case NLM_LCK_DENIED_GRACE_PERIOD: + case NLM_LCK_DENIED: + /* Everything's good */ + break; + case NLM_LCK_DENIED_NOLOCKS: + dprintk("lockd: CANCEL failed (server has no locks)\n"); + goto retry_cancel; + default: + printk(KERN_NOTICE "lockd: weird return %d for CANCEL call\n", + status); + } + +die: + return; + +retry_cancel: + /* Don't ever retry more than 3 times */ + if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) + goto die; + nlm_rebind_host(req->a_host); + rpc_restart_call(task); + rpc_delay(task, 30 * HZ); +} + +static const struct rpc_call_ops nlmclnt_cancel_ops = { + .rpc_call_done = nlmclnt_cancel_callback, + .rpc_release = nlmclnt_rpc_release, +}; + +/* + * Convert an NLM status code to a generic kernel errno + */ +static int 
+nlm_stat_to_errno(__be32 status) +{ + switch(ntohl(status)) { + case NLM_LCK_GRANTED: + return 0; + case NLM_LCK_DENIED: + return -EAGAIN; + case NLM_LCK_DENIED_NOLOCKS: + case NLM_LCK_DENIED_GRACE_PERIOD: + return -ENOLCK; + case NLM_LCK_BLOCKED: + printk(KERN_NOTICE "lockd: unexpected status NLM_BLOCKED\n"); + return -ENOLCK; +#ifdef CONFIG_LOCKD_V4 + case NLM_DEADLCK: + return -EDEADLK; + case NLM_ROFS: + return -EROFS; + case NLM_STALE_FH: + return -ESTALE; + case NLM_FBIG: + return -EOVERFLOW; + case NLM_FAILED: + return -ENOLCK; +#endif + } + printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); + return -ENOLCK; +} diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c new file mode 100644 index 00000000..36057ced --- /dev/null +++ b/fs/lockd/clntxdr.c @@ -0,0 +1,627 @@ +/* + * linux/fs/lockd/clntxdr.c + * + * XDR functions to encode/decode NLM version 3 RPC arguments and results. + * NLM version 3 is backwards compatible with NLM versions 1 and 2. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2)) +#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz) +#define NLM_holder_sz (4+NLM_owner_sz) + +#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz) +#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz) +#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz) +#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz) + +#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz) +#define NLM_res_sz (NLM_cookie_sz+1) +#define NLM_norep_sz (0) + + +static s32 loff_t_to_s32(loff_t offset) +{ + s32 res; + + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm_compute_offsets(const struct nlm_lock *lock, + u32 *l_offset, u32 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv3 basic data types + * + * Basic NLMv3 data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? 
xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size != NFS2_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); +} + +/* + * enum nlm_stats { + * LCK_GRANTED = 0, + * LCK_DENIED = 1, + * LCK_DENIED_NOLOCKS = 2, + * LCK_BLOCKED = 3, + * LCK_DENIED_GRACE_PERIOD = 4 + * }; + * + * + * struct nlm_stat { + * nlm_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
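+ * Note that NLM versions 1-3 define only these five status values; the
+ * extended codes (NLM4_DEADLCK through NLM4_FAILED) exist only in the
+ * v4 protocol handled by clnt4xdr.c.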
+ */ + +static void encode_nlm_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm_stat(struct xdr_stream *xdr, + __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period))) + goto out_enum; + *stat = *p; + return 0; +out_enum: + dprintk("%s: server returned invalid nlm_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm_holder { + * bool exclusive; + * int uppid; + * netobj oh; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u32 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + +static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u32 exclusive, l_offset, l_len; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + l_offset = be32_to_cpup(p++); + l_len = be32_to_cpup(p); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm_lock { + * string caller_name; + * netobj fh; + * netobj oh; + * int uppid; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u32 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(lock->svid); + + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + + +/* + * NLMv3 XDR encode functions + * + * NLMv3 argument types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
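+ *
+ * The structure of this file mirrors clnt4xdr.c; an important
+ * difference is that file offsets and lengths are 32-bit quantities
+ * here, clamped by loff_t_to_s32() above, rather than the 64-bit
+ * values used by NLMv4.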
+ */ + +/* + * struct nlm_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_unlockargs { + * netobj cookie; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static void nlm_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); +} + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static void encode_nlm_testrply(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + if (result->status == nlm_lck_denied) + encode_nlm_holder(xdr, result); +} + +static void nlm_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); + encode_nlm_testrply(xdr, result); +} + + +/* + * NLMv3 XDR decode functions + * + * NLMv3 result types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
+ */ + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static int decode_nlm_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm_holder(xdr, result); +out: + return error; +} + +static int nlm_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static int nlm_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \ + .p_arglen = NLM_##argtype##_sz, \ + .p_replen = NLM_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +static struct rpc_version nlm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version nlm_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version *nlm_versions[] = { + [1] = &nlm_version1, + [3] = &nlm_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlm_version4, +#endif +}; + +static struct rpc_stat nlm_rpc_stats; + +struct rpc_program nlm_program = { + .name = "lockd", + .number = NLM_PROGRAM, + .nrvers = ARRAY_SIZE(nlm_versions), + .version = nlm_versions, + .stats = &nlm_rpc_stats, +}; diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 00000000..183cc1f0 --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include +#include + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. 
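+ *
+ * An illustrative (not prescriptive) usage pattern for a lock manager
+ * with a struct lock_manager named lm:
+ *
+ *	locks_start_grace(&lm);
+ *	... answer only reclaim requests while locks_in_grace() ...
+ *	locks_end_grace(&lm);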
+ */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c new file mode 100644 index 00000000..b7c99bfb --- /dev/null +++ b/fs/lockd/host.c @@ -0,0 +1,649 @@ +/* + * linux/fs/lockd/host.c + * + * Management for NLM peer hosts. The nlm_host struct is shared + * between client and server implementation. The only reason to + * do so is to reduce code bloat. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_HOSTCACHE +#define NLM_HOST_NRHASH 32 +#define NLM_HOST_REBIND (60 * HZ) +#define NLM_HOST_EXPIRE (300 * HZ) +#define NLM_HOST_COLLECT (120 * HZ) + +static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; + +#define for_each_host(host, pos, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry((host), (pos), (chain), h_hash) + +#define for_each_host_safe(host, pos, next, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry_safe((host), (pos), (next), \ + (chain), h_hash) + +static unsigned long next_gc; +static unsigned long nrhosts; +static DEFINE_MUTEX(nlm_host_mutex); + +static void nlm_gc_hosts(void); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const int noresvport; /* use non-priv port */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + 
__nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +/* + * Allocate and initialize an nlm_host. Common to both client and server. + */ +static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, + struct nsm_handle *nsm) +{ + struct nlm_host *host = NULL; + unsigned long now = jiffies; + + if (nsm != NULL) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_get_handle(ni->sap, ni->salen, + ni->hostname, ni->hostname_len); + if (unlikely(nsm == NULL)) { + dprintk("lockd: %s failed; no nsm handle\n", + __func__); + goto out; + } + } + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (unlikely(host == NULL)) { + dprintk("lockd: %s failed; no memory\n", __func__); + nsm_release(nsm); + goto out; + } + + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + rpc_set_port(nlm_addr(host), 0); + host->h_srcaddrlen = 0; + + host->h_rpcclnt = NULL; + host->h_name = nsm->sm_name; + host->h_version = ni->version; + host->h_proto = ni->protocol; + host->h_reclaiming = 0; + host->h_server = ni->server; + host->h_noresvport = ni->noresvport; + host->h_inuse = 0; + init_waitqueue_head(&host->h_gracewait); + init_rwsem(&host->h_rwsem); + host->h_state = 0; + host->h_nsmstate = 0; + host->h_pidcount = 0; + atomic_set(&host->h_count, 1); + mutex_init(&host->h_mutex); + host->h_nextrebind = now + NLM_HOST_REBIND; + host->h_expires = now + NLM_HOST_EXPIRE; + INIT_LIST_HEAD(&host->h_lockowners); + spin_lock_init(&host->h_lock); + INIT_LIST_HEAD(&host->h_granted); + INIT_LIST_HEAD(&host->h_reclaim); + host->h_nsmhandle = nsm; + host->h_addrbuf = nsm->sm_addrbuf; + +out: + return host; +} + +/* + * Destroy an nlm_host and free associated resources + * + * Caller must hold nlm_host_mutex. + */ +static void nlm_destroy_host_locked(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + dprintk("lockd: destroy host %s\n", host->h_name); + + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(atomic_read(&host->h_count)); + + hlist_del_init(&host->h_hash); + + nsm_unmonitor(host); + nsm_release(host->h_nsmhandle); + + clnt = host->h_rpcclnt; + if (clnt != NULL) + rpc_shutdown_client(clnt); + kfree(host); + + nrhosts--; +} + +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * @noresvport: 1 if non-privileged port should be used + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. 
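+ *
+ * Minimal caller sketch (the address and server name are placeholders,
+ * error handling is elided):
+ *
+ *	host = nlmclnt_lookup_host(sap, salen, IPPROTO_TCP, 4,
+ *				   "server.example.com", 0);
+ *	if (host != NULL) {
+ *		   ... issue NLM calls against host ...
+ *		nlmclnt_release_host(host);
+ *	}
+ *
+ * The hash chain is picked by nlm_hash_address(), and dropping the last
+ * reference via nlmclnt_release_host() destroys the host immediately.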
+ */ +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, + const char *hostname, + int noresvport) +{ + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .noresvport = noresvport, + }; + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : ""), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + mutex_lock(&nlm_host_mutex); + + chain = &nlm_client_hosts[nlm_hash_address(sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != protocol) + continue; + if (host->h_version != version) + continue; + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmclnt_release_host - release client nlm_host + * @host: nlm_host to release + * + */ +void nlmclnt_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release client host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(host->h_server); + + if (atomic_dec_and_test(&host->h_count)) { + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(!list_empty(&host->h_granted)); + BUG_ON(!list_empty(&host->h_reclaim)); + + mutex_lock(&nlm_host_mutex); + nlm_destroy_host_locked(host); + mutex_unlock(&nlm_host_mutex); + } +} + +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. 
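+ *
+ * Worked example (address illustrative): if an AF_INET request arrived
+ * on the local interface 192.0.2.5, the code below does
+ *
+ *	sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;	(192.0.2.5)
+ *	src_sap = (struct sockaddr *)&sin;			(port left 0)
+ *
+ * so that later callbacks such as NLMPROC_GRANTED are sent from the same
+ * local address the client originally talked to.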
+ */ +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host = NULL; + struct nsm_handle *nsm = NULL; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + }; + struct sockaddr *src_sap; + size_t src_len = rqstp->rq_addrlen; + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + }; + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + mutex_lock(&nlm_host_mutex); + + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + src_sap = (struct sockaddr *)&sin6; + break; + default: + dprintk("lockd: %s failed; unrecognized address family\n", + __func__); + goto out; + } + + if (time_after_eq(jiffies, next_gc)) + nlm_gc_hosts(); + + chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != ni.protocol) + continue; + if (host->h_version != ni.version) + continue; + if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap)) + continue; + + /* Move to head of hash chain. */ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + memcpy(nlm_srcaddr(host), src_sap, src_len); + host->h_srcaddrlen = src_len; + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmsvc_release_host - release server nlm_host + * @host: nlm_host to release + * + * Host is destroyed later in nlm_gc_host(). 
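+ *
+ * Typical pairing in a server-side procedure (sketch; svc4proc.c later
+ * in this patch follows this pattern):
+ *
+ *	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
+ *	   ... handle the request ...
+ *	nlmsvc_release_host(host);
+ *
+ * Unlike nlmclnt_release_host(), the release side only drops the
+ * reference; the cached host lives on until nlm_gc_hosts() finds it
+ * unreferenced, unused and expired.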
+ */ +void nlmsvc_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release server host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(!host->h_server); + atomic_dec(&host->h_count); +} + +/* + * Create the NLM RPC client for an NLM peer + */ +struct rpc_clnt * +nlm_bind_host(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + dprintk("lockd: nlm_bind_host %s (%s)\n", + host->h_name, host->h_addrbuf); + + /* Lock host handle */ + mutex_lock(&host->h_mutex); + + /* If we've already created an RPC client, check whether + * RPC rebind is required + */ + if ((clnt = host->h_rpcclnt) != NULL) { + if (time_after_eq(jiffies, host->h_nextrebind)) { + rpc_force_rebind(clnt); + host->h_nextrebind = jiffies + NLM_HOST_REBIND; + dprintk("lockd: next rebind in %lu jiffies\n", + host->h_nextrebind - jiffies); + } + } else { + unsigned long increment = nlmsvc_timeout; + struct rpc_timeout timeparms = { + .to_initval = increment, + .to_increment = increment, + .to_maxval = increment * 6UL, + .to_retries = 5U, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = host->h_proto, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .timeout = &timeparms, + .servername = host->h_name, + .program = &nlm_program, + .version = host->h_version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_AUTOBIND), + }; + + /* + * lockd retries server side blocks automatically so we want + * those to be soft RPC calls. Client side calls need to be + * hard RPC tasks. + */ + if (!host->h_server) + args.flags |= RPC_CLNT_CREATE_HARDRTRY; + if (host->h_noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); + + clnt = rpc_create(&args); + if (!IS_ERR(clnt)) + host->h_rpcclnt = clnt; + else { + printk("lockd: couldn't create RPC handle for %s\n", host->h_name); + clnt = NULL; + } + } + + mutex_unlock(&host->h_mutex); + return clnt; +} + +/* + * Force a portmap lookup of the remote lockd port + */ +void +nlm_rebind_host(struct nlm_host *host) +{ + dprintk("lockd: rebind host %s\n", host->h_name); + if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { + rpc_force_rebind(host->h_rpcclnt); + host->h_nextrebind = jiffies + NLM_HOST_REBIND; + } +} + +/* + * Increment NLM host count + */ +struct nlm_host * nlm_get_host(struct nlm_host *host) +{ + if (host) { + dprintk("lockd: get host %s\n", host->h_name); + atomic_inc(&host->h_count); + host->h_expires = jiffies + NLM_HOST_EXPIRE; + } + return host; +} + +static struct nlm_host *next_host_state(struct hlist_head *cache, + struct nsm_handle *nsm, + const struct nlm_reboot *info) +{ + struct nlm_host *host; + struct hlist_head *chain; + struct hlist_node *pos; + + mutex_lock(&nlm_host_mutex); + for_each_host(host, pos, chain, cache) { + if (host->h_nsmhandle == nsm + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; + host->h_state++; + + nlm_get_host(host); + mutex_unlock(&nlm_host_mutex); + return host; + } + } + + mutex_unlock(&nlm_host_mutex); + return NULL; +} + +/** + * nlm_host_rebooted - Release all resources held by rebooted host + * @info: pointer to decoded results of NLM_SM_NOTIFY call + * + * We were notified that the specified host has rebooted. Release + * all resources held by that peer. 
+ */ +void nlm_host_rebooted(const struct nlm_reboot *info) +{ + struct nsm_handle *nsm; + struct nlm_host *host; + + nsm = nsm_reboot_lookup(info); + if (unlikely(nsm == NULL)) + return; + + /* Mark all hosts tied to this NSM state as having rebooted. + * We run the loop repeatedly, because we drop the host table + * lock for this. + * To avoid processing a host several times, we match the nsmstate. + */ + while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) { + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + } + while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { + nlmclnt_recovery(host); + nlmclnt_release_host(host); + } + + nsm_release(nsm); +} + +/* + * Shut down the hosts module. + * Note that this routine is called only at server shutdown time. + */ +void +nlm_shutdown_hosts(void) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + + dprintk("lockd: shutting down host module\n"); + mutex_lock(&nlm_host_mutex); + + /* First, make all hosts eligible for gc */ + dprintk("lockd: nuking all hosts...\n"); + for_each_host(host, pos, chain, nlm_server_hosts) { + host->h_expires = jiffies - 1; + if (host->h_rpcclnt) { + rpc_shutdown_client(host->h_rpcclnt); + host->h_rpcclnt = NULL; + } + } + + /* Then, perform a garbage collection pass */ + nlm_gc_hosts(); + mutex_unlock(&nlm_host_mutex); + + /* complain if any hosts are left */ + if (nrhosts != 0) { + printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); + dprintk("lockd: %lu hosts left:\n", nrhosts); + for_each_host(host, pos, chain, nlm_server_hosts) { + dprintk(" %s (cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); + } + } +} + +/* + * Garbage collect any unused NLM hosts. + * This GC combines reference counting for async operations with + * mark & sweep for resources held by remote clients. + */ +static void +nlm_gc_hosts(void) +{ + struct hlist_head *chain; + struct hlist_node *pos, *next; + struct nlm_host *host; + + dprintk("lockd: host garbage collection\n"); + for_each_host(host, pos, chain, nlm_server_hosts) + host->h_inuse = 0; + + /* Mark all hosts that hold locks, blocks or shares */ + nlmsvc_mark_resources(); + + for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + if (atomic_read(&host->h_count) || host->h_inuse + || time_before(jiffies, host->h_expires)) { + dprintk("nlm_gc_hosts skipping %s " + "(cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); + continue; + } + nlm_destroy_host_locked(host); + } + + next_gc = jiffies + NLM_HOST_COLLECT; +} diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c new file mode 100644 index 00000000..23d7451b --- /dev/null +++ b/fs/lockd/mon.c @@ -0,0 +1,555 @@ +/* + * linux/fs/lockd/mon.c + * + * The kernel statd client. 
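+ *
+ * Rough picture of the exchange this file drives (the statd side is
+ * user space and outside this patch):
+ *
+ *  1. nsm_monitor() sends NSMPROC_MON to the local rpc.statd, recording
+ *     the peer's mon_name, our NLM callback id and a "priv" cookie.
+ *  2. When that peer reboots, its statd notifies ours, and ours calls
+ *     back into lockd (the SM_NOTIFY callback) carrying the cookie;
+ *     nsm_reboot_lookup() below maps it back to the right nsm_handle.
+ *  3. nsm_unmonitor() sends NSMPROC_UNMON once the peer is forgotten.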
+ * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_MONITOR +#define NSM_PROGRAM 100024 +#define NSM_VERSION 1 + +enum { + NSMPROC_NULL, + NSMPROC_STAT, + NSMPROC_MON, + NSMPROC_UNMON, + NSMPROC_UNMON_ALL, + NSMPROC_SIMU_CRASH, + NSMPROC_NOTIFY, +}; + +struct nsm_args { + struct nsm_private *priv; + u32 prog; /* RPC callback info */ + u32 vers; + u32 proc; + + char *mon_name; +}; + +struct nsm_res { + u32 status; + u32 state; +}; + +static struct rpc_program nsm_program; +static LIST_HEAD(nsm_handles); +static DEFINE_SPINLOCK(nsm_lock); + +/* + * Local NSM state + */ +u32 __read_mostly nsm_local_state; +int __read_mostly nsm_use_hostnames; + +static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) +{ + return (struct sockaddr *)&nsm->sm_addr; +} + +static struct rpc_clnt *nsm_create(void) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = XPRT_TRANSPORT_UDP, + .address = (struct sockaddr *)&sin, + .addrsize = sizeof(sin), + .servername = "rpc.statd", + .program = &nsm_program, + .version = NSM_VERSION, + .authflavor = RPC_AUTH_NULL, + .flags = RPC_CLNT_CREATE_NOPING, + }; + + return rpc_create(&args); +} + +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) +{ + struct rpc_clnt *clnt; + int status; + struct nsm_args args = { + .priv = &nsm->sm_priv, + .prog = NLM_PROGRAM, + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = res, + }; + + clnt = nsm_create(); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d\n", status); + goto out; + } + + memset(res, 0, sizeof(*res)); + + msg.rpc_proc = &clnt->cl_procinfo[proc]; + status = rpc_call_sync(clnt, &msg, 0); + if (status < 0) + dprintk("lockd: NSM upcall RPC failed, status=%d\n", + status); + else + status = 0; + rpc_shutdown_client(clnt); + out: + return status; +} + +/** + * nsm_monitor - Notify a peer in case we reboot + * @host: pointer to nlm_host of peer to notify + * + * If this peer is not already monitored, this function sends an + * upcall to the local rpc.statd to record the name/address of + * the peer to notify in case we reboot. + * + * Returns zero if the peer is monitored by the local rpc.statd; + * otherwise a negative errno value is returned. + */ +int nsm_monitor(const struct nlm_host *host) +{ + struct nsm_handle *nsm = host->h_nsmhandle; + struct nsm_res res; + int status; + + dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); + + if (nsm->sm_monitored) + return 0; + + /* + * Choose whether to record the caller_name or IP address of + * this peer in the local rpc.statd's database. + */ + nsm->sm_mon_name = nsm_use_hostnames ? 
nsm->sm_name : nsm->sm_addrbuf; + + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); + if (unlikely(res.status != 0)) + status = -EIO; + if (unlikely(status < 0)) { + printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); + return status; + } + + nsm->sm_monitored = 1; + if (unlikely(nsm_local_state != res.state)) { + nsm_local_state = res.state; + dprintk("lockd: NSM state changed to %d\n", nsm_local_state); + } + return 0; +} + +/** + * nsm_unmonitor - Unregister peer notification + * @host: pointer to nlm_host of peer to stop monitoring + * + * If this peer is monitored, this function sends an upcall to + * tell the local rpc.statd not to send this peer a notification + * when we reboot. + */ +void nsm_unmonitor(const struct nlm_host *host) +{ + struct nsm_handle *nsm = host->h_nsmhandle; + struct nsm_res res; + int status; + + if (atomic_read(&nsm->sm_count) == 1 + && nsm->sm_monitored && !nsm->sm_sticky) { + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); + + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); + if (res.status != 0) + status = -EIO; + if (status < 0) + printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", + nsm->sm_name); + else + nsm->sm_monitored = 0; + } +} + +static struct nsm_handle *nsm_lookup_hostname(const char *hostname, + const size_t len) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (strlen(nsm->sm_name) == len && + memcmp(nsm->sm_name, hostname, len) == 0) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (rpc_cmp_addr(nsm_addr(nsm), sap)) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (memcmp(nsm->sm_priv.data, priv->data, + sizeof(priv->data)) == 0) + return nsm; + return NULL; +} + +/* + * Construct a unique cookie to match this nsm_handle to this monitored + * host. It is passed to the local rpc.statd via NSMPROC_MON, and + * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these + * requests. + * + * The NSM protocol requires that these cookies be unique while the + * system is running. We prefer a stronger requirement of making them + * unique across reboots. If user space bugs cause a stale cookie to + * be sent to the kernel, it could cause the wrong host to lose its + * lock state if cookies were not unique across reboots. + * + * The cookies are exposed only to local user space via loopback. They + * do not appear on the physical network. If we want greater security + * for some reason, nsm_init_private() could perform a one-way hash to + * obscure the contents of the cookie. 
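+ *
+ * Round trip of the cookie, as implemented in this patch (sketch):
+ *
+ *	nsm_get_handle()  -> nsm_init_private()	   cookie generated
+ *	nsm_monitor()     -> encode_priv()	   sent in NSMPROC_MON
+ *	   ... peer reboots, rpc.statd calls back into lockd ...
+ *	nlm_host_rebooted() -> nsm_reboot_lookup() -> nsm_lookup_priv()
+ *						   cookie matched again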
+ */ +static void nsm_init_private(struct nsm_handle *nsm) +{ + u64 *p = (u64 *)&nsm->sm_priv.data; + struct timespec ts; + s64 ns; + + ktime_get_ts(&ts); + ns = timespec_to_ns(&ts); + put_unaligned(ns, p); + put_unaligned((unsigned long)nsm, p + 1); +} + +static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *new; + + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); + if (unlikely(new == NULL)) + return NULL; + + atomic_set(&new->sm_count, 1); + new->sm_name = (char *)(new + 1); + memcpy(nsm_addr(new), sap, salen); + new->sm_addrlen = salen; + nsm_init_private(new); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); + memcpy(new->sm_name, hostname, hostname_len); + new->sm_name[hostname_len] = '\0'; + + return new; +} + +/** + * nsm_get_handle - Find or create a cached nsm_handle + * @sap: pointer to socket address of handle to find + * @salen: length of socket address + * @hostname: pointer to C string containing hostname to find + * @hostname_len: length of C string + * + * Behavior is modulated by the global nsm_use_hostnames variable. + * + * Returns a cached nsm_handle after bumping its ref count, or + * returns a fresh nsm_handle if a handle that matches @sap and/or + * @hostname cannot be found in the handle cache. Returns NULL if + * an error occurs. + */ +struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, + const size_t salen, const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *cached, *new = NULL; + + if (hostname && memchr(hostname, '/', hostname_len) != NULL) { + if (printk_ratelimit()) { + printk(KERN_WARNING "Invalid hostname \"%.*s\" " + "in NFS lock request\n", + (int)hostname_len, hostname); + } + return NULL; + } + +retry: + spin_lock(&nsm_lock); + + if (nsm_use_hostnames && hostname != NULL) + cached = nsm_lookup_hostname(hostname, hostname_len); + else + cached = nsm_lookup_addr(sap); + + if (cached != NULL) { + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + kfree(new); + dprintk("lockd: found nsm_handle for %s (%s), " + "cnt %d\n", cached->sm_name, + cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; + } + + if (new != NULL) { + list_add(&new->sm_link, &nsm_handles); + spin_unlock(&nsm_lock); + dprintk("lockd: created nsm_handle for %s (%s)\n", + new->sm_name, new->sm_addrbuf); + return new; + } + + spin_unlock(&nsm_lock); + + new = nsm_create_handle(sap, salen, hostname, hostname_len); + if (unlikely(new == NULL)) + return NULL; + goto retry; +} + +/** + * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @info: pointer to NLMPROC_SM_NOTIFY arguments + * + * Returns a matching nsm_handle if found in the nsm cache. The returned + * nsm_handle's reference count is bumped. Otherwise returns NULL if some + * error occurred. 
+ */ +struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +{ + struct nsm_handle *cached; + + spin_lock(&nsm_lock); + + cached = nsm_lookup_priv(&info->priv); + if (unlikely(cached == NULL)) { + spin_unlock(&nsm_lock); + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + info->len, info->mon); + return cached; + } + + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + + dprintk("lockd: host %s (%s) rebooted, cnt %d\n", + cached->sm_name, cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; +} + +/** + * nsm_release - Release an NSM handle + * @nsm: pointer to handle to be released + * + */ +void nsm_release(struct nsm_handle *nsm) +{ + if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + list_del(&nsm->sm_link); + spin_unlock(&nsm_lock); + dprintk("lockd: destroyed nsm_handle for %s (%s)\n", + nsm->sm_name, nsm->sm_addrbuf); + kfree(nsm); + } +} + +/* + * XDR functions for NSM. + * + * See http://www.opengroup.org/ for details on the Network + * Status Monitor wire protocol. + */ + +static void encode_nsm_string(struct xdr_stream *xdr, const char *string) +{ + const u32 len = strlen(string); + __be32 *p; + + BUG_ON(len > SM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, string, len); +} + +/* + * "mon_name" specifies the host to be monitored. + */ +static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + encode_nsm_string(xdr, argp->mon_name); +} + +/* + * The "my_id" argument specifies the hostname and RPC procedure + * to be called when the status manager receives notification + * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" + * has changed. + */ +static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + __be32 *p; + + encode_nsm_string(xdr, utsname()->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); + *p = cpu_to_be32(argp->proc); +} + +/* + * The "mon_id" argument specifies the non-private arguments + * of an NSMPROC_MON or NSMPROC_UNMON call. + */ +static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + encode_mon_name(xdr, argp); + encode_my_id(xdr, argp); +} + +/* + * The "priv" argument may contain private information required + * by the NSMPROC_MON call. This information will be supplied in the + * NLMPROC_SM_NOTIFY call. 
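+ *
+ * For reference, the complete NSMPROC_MON argument as built by
+ * nsm_xdr_enc_mon() below is roughly:
+ *
+ *	struct mon {
+ *		string mon_name<SM_MAXSTRLEN>;		peer to monitor
+ *		string my_name<SM_MAXSTRLEN>;		utsname()->nodename
+ *		int my_prog, my_vers, my_proc;		NLM callback id
+ *		opaque priv[SM_PRIV_SIZE];		this cookie
+ *	};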
+ */ +static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, SM_PRIV_SIZE); + xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); +} + +static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) +{ + encode_mon_id(xdr, argp); + encode_priv(xdr, argp); +} + +static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) +{ + encode_mon_id(xdr, argp); +} + +static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + return -EIO; + resp->status = be32_to_cpup(p++); + resp->state = be32_to_cpup(p); + + dprintk("lockd: %s status %d state %d\n", + __func__, resp->status, resp->state); + return 0; +} + +static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + resp->state = be32_to_cpup(p); + + dprintk("lockd: %s state %d\n", __func__, resp->state); + return 0; +} + +#define SM_my_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_my_id_sz (SM_my_name_sz+3) +#define SM_mon_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_mon_id_sz (SM_mon_name_sz+SM_my_id_sz) +#define SM_priv_sz (XDR_QUADLEN(SM_PRIV_SIZE)) +#define SM_mon_sz (SM_mon_id_sz+SM_priv_sz) +#define SM_monres_sz 2 +#define SM_unmonres_sz 1 + +static struct rpc_procinfo nsm_procedures[] = { +[NSMPROC_MON] = { + .p_proc = NSMPROC_MON, + .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res, + .p_arglen = SM_mon_sz, + .p_replen = SM_monres_sz, + .p_statidx = NSMPROC_MON, + .p_name = "MONITOR", + }, +[NSMPROC_UNMON] = { + .p_proc = NSMPROC_UNMON, + .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat, + .p_arglen = SM_mon_id_sz, + .p_replen = SM_unmonres_sz, + .p_statidx = NSMPROC_UNMON, + .p_name = "UNMONITOR", + }, +}; + +static struct rpc_version nsm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nsm_procedures), + .procs = nsm_procedures +}; + +static struct rpc_version * nsm_version[] = { + [1] = &nsm_version1, +}; + +static struct rpc_stat nsm_stats; + +static struct rpc_program nsm_program = { + .name = "statd", + .number = NSM_PROGRAM, + .nrvers = ARRAY_SIZE(nsm_version), + .version = nsm_version, + .stats = &nsm_stats +}; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c new file mode 100644 index 00000000..1743064c --- /dev/null +++ b/fs/lockd/svc.c @@ -0,0 +1,568 @@ +/* + * linux/fs/lockd/svc.c + * + * This is the central lockd service. + * + * FIXME: Separate the lockd NFS server functionality from the lockd NFS + * client functionality. Oh why didn't Sun create two separate + * services in the first place? 
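+ *
+ * Usage elsewhere in the tree (sketch): both the NFS client and nfsd
+ * call lockd_up() when they first need locking and lockd_down() when
+ * they are done, so the kernel thread exists only while it has users.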
+ * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVC +#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) +#define ALLOWED_SIGS (sigmask(SIGKILL)) + +static struct svc_program nlmsvc_program; + +struct nlmsvc_binding * nlmsvc_ops; +EXPORT_SYMBOL_GPL(nlmsvc_ops); + +static DEFINE_MUTEX(nlmsvc_mutex); +static unsigned int nlmsvc_users; +static struct task_struct *nlmsvc_task; +static struct svc_rqst *nlmsvc_rqst; +unsigned long nlmsvc_timeout; + +/* + * These can be set at insmod time (useful for NFS as root filesystem), + * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 + */ +static unsigned long nlm_grace_period; +static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; +static int nlm_udpport, nlm_tcpport; + +/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ +static unsigned int nlm_max_connections = 1024; + +/* + * Constants needed for the sysctl interface. + */ +static const unsigned long nlm_grace_period_min = 0; +static const unsigned long nlm_grace_period_max = 240; +static const unsigned long nlm_timeout_min = 3; +static const unsigned long nlm_timeout_max = 20; +static const int nlm_port_min = 0, nlm_port_max = 65535; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header * nlm_sysctl_table; +#endif + +static unsigned long get_lockd_grace_period(void) +{ + /* Note: nlm_timeout should always be nonzero */ + if (nlm_grace_period) + return roundup(nlm_grace_period, nlm_timeout) * HZ; + else + return nlm_timeout * 5 * HZ; +} + +static struct lock_manager lockd_manager = { +}; + +static void grace_ender(struct work_struct *not_used) +{ + locks_end_grace(&lockd_manager); +} + +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) +{ + unsigned long grace_period = get_lockd_grace_period(); + + locks_start_grace(&lockd_manager); + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); +} + +static void restart_grace(void) +{ + if (nlmsvc_ops) { + cancel_delayed_work_sync(&grace_period_end); + locks_end_grace(&lockd_manager); + nlmsvc_invalidate_all(); + set_grace_period(); + } +} + +/* + * This is the lockd kernel thread + */ +static int +lockd(void *vrqstp) +{ + int err = 0, preverr = 0; + struct svc_rqst *rqstp = vrqstp; + + /* try_to_freeze() is called from svc_recv() */ + set_freezable(); + + /* Allow SIGKILL to tell lockd to drop all of its locks */ + allow_signal(SIGKILL); + + dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); + + if (!nlm_timeout) + nlm_timeout = LOCKD_DFLT_TIMEO; + nlmsvc_timeout = nlm_timeout * HZ; + + set_grace_period(); + + /* + * The main request loop. We don't terminate until the last + * NFS mount or NFS daemon has gone away. + */ + while (!kthread_should_stop()) { + long timeout = MAX_SCHEDULE_TIMEOUT; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + /* update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nlm_max_connections; + + if (signalled()) { + flush_signals(current); + restart_grace(); + continue; + } + + timeout = nlmsvc_retry_blocked(); + + /* + * Find a socket with data available and call its + * recvfrom routine. 
+ */ + err = svc_recv(rqstp, timeout); + if (err == -EAGAIN || err == -EINTR) { + preverr = err; + continue; + } + if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_interruptible(HZ); + continue; + } + preverr = err; + + dprintk("lockd: request from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + + svc_process(rqstp); + } + flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); + locks_end_grace(&lockd_manager); + if (nlmsvc_ops) + nlmsvc_invalidate_all(); + nlm_shutdown_hosts(); + return 0; +} + +static int create_lockd_listener(struct svc_serv *serv, const char *name, + const int family, const unsigned short port) +{ + struct svc_xprt *xprt; + + xprt = svc_find_xprt(serv, name, family, 0); + if (xprt == NULL) + return svc_create_xprt(serv, name, &init_net, family, port, + SVC_SOCK_DEFAULTS); + svc_xprt_put(xprt); + return 0; +} + +static int create_lockd_family(struct svc_serv *serv, const int family) +{ + int err; + + err = create_lockd_listener(serv, "udp", family, nlm_udpport); + if (err < 0) + return err; + + return create_lockd_listener(serv, "tcp", family, nlm_tcpport); +} + +/* + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. + */ +static int make_socks(struct svc_serv *serv) +{ + static int warned; + int err; + + err = create_lockd_family(serv, PF_INET); + if (err < 0) + goto out_err; + + err = create_lockd_family(serv, PF_INET6); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_err; + + warned = 0; + return 0; + +out_err: + if (warned++ == 0) + printk(KERN_WARNING + "lockd_up: makesock failed, error=%d\n", err); + return err; +} + +/* + * Bring up the lockd process if it's not already up. + */ +int lockd_up(void) +{ + struct svc_serv *serv; + int error = 0; + + mutex_lock(&nlmsvc_mutex); + /* + * Check whether we're already up and running. + */ + if (nlmsvc_rqst) + goto out; + + /* + * Sanity check: if there's no pid, + * we should be the first user ... + */ + if (nlmsvc_users) + printk(KERN_WARNING + "lockd_up: no pid, %d users??\n", nlmsvc_users); + + error = -ENOMEM; + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + if (!serv) { + printk(KERN_WARNING "lockd_up: create service failed\n"); + goto out; + } + + error = make_socks(serv); + if (error < 0) + goto destroy_and_out; + + /* + * Create the kernel thread and wait for it to start. + */ + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + nlmsvc_rqst = NULL; + printk(KERN_WARNING + "lockd_up: svc_rqst allocation failed, error=%d\n", + error); + goto destroy_and_out; + } + + svc_sock_update_bufs(serv); + serv->sv_maxconn = nlm_max_connections; + + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + if (IS_ERR(nlmsvc_task)) { + error = PTR_ERR(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; + printk(KERN_WARNING + "lockd_up: kthread_run failed, error=%d\n", error); + goto destroy_and_out; + } + + /* + * Note: svc_serv structures have an initial use count of 1, + * so we exit through here on both success and failure. 
+ */ +destroy_and_out: + svc_destroy(serv); +out: + if (!error) + nlmsvc_users++; + mutex_unlock(&nlmsvc_mutex); + return error; +} +EXPORT_SYMBOL_GPL(lockd_up); + +/* + * Decrement the user count and bring down lockd if we're the last. + */ +void +lockd_down(void) +{ + mutex_lock(&nlmsvc_mutex); + if (nlmsvc_users) { + if (--nlmsvc_users) + goto out; + } else { + printk(KERN_ERR "lockd_down: no users! task=%p\n", + nlmsvc_task); + BUG(); + } + + if (!nlmsvc_task) { + printk(KERN_ERR "lockd_down: no lockd running.\n"); + BUG(); + } + kthread_stop(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; +out: + mutex_unlock(&nlmsvc_mutex); +} +EXPORT_SYMBOL_GPL(lockd_down); + +#ifdef CONFIG_SYSCTL + +/* + * Sysctl parameters (same as module parameters, different interface). + */ + +static ctl_table nlm_sysctls[] = { + { + .procname = "nlm_grace_period", + .data = &nlm_grace_period, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (unsigned long *) &nlm_grace_period_min, + .extra2 = (unsigned long *) &nlm_grace_period_max, + }, + { + .procname = "nlm_timeout", + .data = &nlm_timeout, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (unsigned long *) &nlm_timeout_min, + .extra2 = (unsigned long *) &nlm_timeout_max, + }, + { + .procname = "nlm_udpport", + .data = &nlm_udpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *) &nlm_port_min, + .extra2 = (int *) &nlm_port_max, + }, + { + .procname = "nlm_tcpport", + .data = &nlm_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *) &nlm_port_min, + .extra2 = (int *) &nlm_port_max, + }, + { + .procname = "nsm_use_hostnames", + .data = &nsm_use_hostnames, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nsm_local_state", + .data = &nsm_local_state, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static ctl_table nlm_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nlm_sysctls, + }, + { } +}; + +static ctl_table nlm_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nlm_sysctl_dir, + }, + { } +}; + +#endif /* CONFIG_SYSCTL */ + +/* + * Module (and sysfs) parameters. 
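+ *
+ * The param_set_min_max() macro below generates a range-checked setter
+ * for each parameter.  For example, param_set_min_max(port, int,
+ * simple_strtol, 0, 65535) expands to roughly:
+ *
+ *	static int param_set_port(const char *val, struct kernel_param *kp)
+ *	{
+ *		char *endp;
+ *		int num = simple_strtol(val, &endp, 0);
+ *		if (endp == val || *endp || num < 0 || num > 65535)
+ *			return -EINVAL;
+ *		*((int *) kp->arg) = num;
+ *		return 0;
+ *	}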
+ */ + +#define param_set_min_max(name, type, which_strtol, min, max) \ +static int param_set_##name(const char *val, struct kernel_param *kp) \ +{ \ + char *endp; \ + __typeof__(type) num = which_strtol(val, &endp, 0); \ + if (endp == val || *endp || num < (min) || num > (max)) \ + return -EINVAL; \ + *((type *) kp->arg) = num; \ + return 0; \ +} + +static inline int is_callback(u32 proc) +{ + return proc == NLMPROC_GRANTED + || proc == NLMPROC_GRANTED_MSG + || proc == NLMPROC_TEST_RES + || proc == NLMPROC_LOCK_RES + || proc == NLMPROC_CANCEL_RES + || proc == NLMPROC_UNLOCK_RES + || proc == NLMPROC_NSM_NOTIFY; +} + + +static int lockd_authenticate(struct svc_rqst *rqstp) +{ + rqstp->rq_client = NULL; + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + if (rqstp->rq_proc == 0) + return SVC_OK; + if (is_callback(rqstp->rq_proc)) { + /* Leave it to individual procedures to + * call nlmsvc_lookup_host(rqstp) + */ + return SVC_OK; + } + return svc_set_client(rqstp); + } + return SVC_DENIED; +} + + +param_set_min_max(port, int, simple_strtol, 0, 65535) +param_set_min_max(grace_period, unsigned long, simple_strtoul, + nlm_grace_period_min, nlm_grace_period_max) +param_set_min_max(timeout, unsigned long, simple_strtoul, + nlm_timeout_min, nlm_timeout_max) + +MODULE_AUTHOR("Olaf Kirch "); +MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); +MODULE_LICENSE("GPL"); + +module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, + &nlm_grace_period, 0644); +module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, + &nlm_timeout, 0644); +module_param_call(nlm_udpport, param_set_port, param_get_int, + &nlm_udpport, 0644); +module_param_call(nlm_tcpport, param_set_port, param_get_int, + &nlm_tcpport, 0644); +module_param(nsm_use_hostnames, bool, 0644); +module_param(nlm_max_connections, uint, 0644); + +/* + * Initialising and terminating the module. + */ + +static int __init init_nlm(void) +{ +#ifdef CONFIG_SYSCTL + nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); + return nlm_sysctl_table ? 
0 : -ENOMEM; +#else + return 0; +#endif +} + +static void __exit exit_nlm(void) +{ + /* FIXME: delete all NLM clients */ + nlm_shutdown_hosts(); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nlm_sysctl_table); +#endif +} + +module_init(init_nlm); +module_exit(exit_nlm); + +/* + * Define NLM program and procedures + */ +static struct svc_version nlmsvc_version1 = { + .vs_vers = 1, + .vs_nproc = 17, + .vs_proc = nlmsvc_procedures, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +static struct svc_version nlmsvc_version3 = { + .vs_vers = 3, + .vs_nproc = 24, + .vs_proc = nlmsvc_procedures, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +#ifdef CONFIG_LOCKD_V4 +static struct svc_version nlmsvc_version4 = { + .vs_vers = 4, + .vs_nproc = 24, + .vs_proc = nlmsvc_procedures4, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +#endif +static struct svc_version * nlmsvc_version[] = { + [1] = &nlmsvc_version1, + [3] = &nlmsvc_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlmsvc_version4, +#endif +}; + +static struct svc_stat nlmsvc_stats; + +#define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) +static struct svc_program nlmsvc_program = { + .pg_prog = NLM_PROGRAM, /* program number */ + .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ + .pg_vers = nlmsvc_version, /* version table */ + .pg_name = "lockd", /* service name */ + .pg_class = "nfsd", /* share authentication with nfsd */ + .pg_stats = &nlmsvc_stats, /* stats table */ + .pg_authenticate = &lockd_authenticate /* export authentication */ +}; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c new file mode 100644 index 00000000..9a41fdc1 --- /dev/null +++ b/fs/lockd/svc4proc.c @@ -0,0 +1,503 @@ +/* + * linux/fs/lockd/svc4proc.c + * + * Lockd server procedures. We don't implement the NLM_*_RES + * procedures because we don't use the async procedures. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +/* + * Obtain client and file from arguments + */ +static __be32 +nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_host **hostp, struct nlm_file **filp) +{ + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm_lock *lock = &argp->lock; + __be32 error = 0; + + /* nfsd callbacks must have been installed for this procedure */ + if (!nlmsvc_ops) + return nlm_lck_denied_nolocks; + + /* Obtain host handle */ + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) + goto no_locks; + *hostp = host; + + /* Obtain file pointer. Not used by FREE_ALL call. 
*/ + if (filp != NULL) { + if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + goto no_locks; + *filp = file; + + /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_file = file->f_file; + lock->fl.fl_owner = (fl_owner_t) host; + lock->fl.fl_lmops = &nlmsvc_lock_operations; + } + + return 0; + +no_locks: + nlmsvc_release_host(host); + if (error) + return error; + return nlm_lck_denied_nolocks; +} + +/* + * NULL: Test for presence of service + */ +static __be32 +nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + dprintk("lockd: NULL called\n"); + return rpc_success; +} + +/* + * TEST: Check for conflicting lock + */ +static __be32 +nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: TEST4 called\n"); + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now check for conflicting locks */ + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: LOCK called\n"); + + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + +#if 0 + /* If supplied state doesn't match current state, we assume it's + * an old request that time-warped somehow. Any error return would + * do in this case because it's irrelevant anyway. + * + * NB: We don't retrieve the remote host's state yet. + */ + if (host->h_nsmstate && host->h_nsmstate != argp->state) { + resp->status = nlm_lck_denied_nolocks; + } else +#endif + + /* Now try to lock the file */ + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: CANCEL called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Try to cancel request. 
*/ + resp->status = nlmsvc_cancel_blocked(file, &argp->lock); + + dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNLOCK: release a lock + */ +static __be32 +nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNLOCK called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to remove the lock */ + resp->status = nlmsvc_unlock(file, &argp->lock); + + dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * GRANTED: A server calls us to tell that a process' lock request + * was granted + */ +static __be32 +nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + resp->cookie = argp->cookie; + + dprintk("lockd: GRANTED called\n"); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); + return rpc_success; +} + +/* + * This is the generic lockd callback for async RPC calls + */ +static void nlm4svc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %5u callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +static void nlm4svc_callback_release(void *data) +{ + nlmsvc_release_call(data); +} + +static const struct rpc_call_ops nlm4svc_callback_ops = { + .rpc_call_done = nlm4svc_callback_exit, + .rpc_release = nlm4svc_callback_release, +}; + +/* + * `Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. 
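+ *
+ * Flow for one of them (sketch): a TEST_MSG request carries no result
+ * in its RPC reply; the handler runs the ordinary TEST logic and then
+ * sends the result back in a separate, asynchronous NLM call:
+ *
+ *	nlm4svc_proc_test_msg()
+ *	  -> nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp,
+ *			      nlm4svc_proc_test)
+ *	       -> nlm_async_reply(call, NLMPROC_TEST_RES, ...)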
+ */ +static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) +{ + struct nlm_host *host; + struct nlm_rqst *call; + __be32 stat; + + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlmsvc_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlm4svc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; +} + +static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: TEST_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test); +} + +static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: LOCK_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock); +} + +static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: CANCEL_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel); +} + +static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: UNLOCK_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock); +} + +static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: GRANTED_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlm4svc_proc_granted); +} + +/* + * SHARE: create a DOS share or alter existing share. + */ +static __be32 +nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: SHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace() && !argp->reclaim) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to create the share */ + resp->status = nlmsvc_share_file(host, file, argp); + + dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNSHARE: Release a DOS share. + */ +static __be32 +nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNSHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + + /* Now try to lock the file */ + resp->status = nlmsvc_unshare_file(host, file, argp); + + dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * NM_LOCK: Create an unmonitored lock + */ +static __be32 +nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + dprintk("lockd: NM_LOCK called\n"); + + argp->monitor = 0; /* just clean the monitor flag */ + return nlm4svc_proc_lock(rqstp, argp, resp); +} + +/* + * FREE_ALL: Release all locks and shares held by client + */ +static __be32 +nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + struct nlm_host *host; + + /* Obtain client */ + if (nlm4svc_retrieve_args(rqstp, argp, &host, NULL)) + return rpc_success; + + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + return rpc_success; +} + +/* + * SM_NOTIFY: private callback from statd (not part of official NLM proto) + */ +static __be32 +nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, + void *resp) +{ + dprintk("lockd: SM_NOTIFY called\n"); + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + nlm_host_rebooted(argp); + return rpc_success; +} + +/* + * client sent a GRANTED_RES, let's remove the associated block + */ +static __be32 +nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(&argp->cookie, argp->status); + return rpc_success; +} + + +/* + * NLM Server procedures. 
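+ *
+ * Each PROC() entry below expands to one svc_procedure.  For example,
+ * PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg) becomes,
+ * roughly:
+ *
+ *	{ .pc_func       = (svc_procfunc) nlm4svc_proc_test,
+ *	  .pc_decode     = (kxdrproc_t) nlm4svc_decode_testargs,
+ *	  .pc_encode     = (kxdrproc_t) nlm4svc_encode_testres,
+ *	  .pc_release    = NULL,
+ *	  .pc_argsize    = sizeof(struct nlm_args),
+ *	  .pc_ressize    = sizeof(struct nlm_res),
+ *	  .pc_xdrressize = Ck+St+2+No+Rg }	maximum reply size in words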
+ */ + +#define nlm4svc_encode_norep nlm4svc_encode_void +#define nlm4svc_decode_norep nlm4svc_decode_void +#define nlm4svc_decode_testres nlm4svc_decode_void +#define nlm4svc_decode_lockres nlm4svc_decode_void +#define nlm4svc_decode_unlockres nlm4svc_decode_void +#define nlm4svc_decode_cancelres nlm4svc_decode_void +#define nlm4svc_decode_grantedres nlm4svc_decode_void + +#define nlm4svc_proc_none nlm4svc_proc_null +#define nlm4svc_proc_test_res nlm4svc_proc_null +#define nlm4svc_proc_lock_res nlm4svc_proc_null +#define nlm4svc_proc_cancel_res nlm4svc_proc_null +#define nlm4svc_proc_unlock_res nlm4svc_proc_null + +struct nlm_void { int dummy; }; + +#define PROC(name, xargt, xrest, argt, rest, respsize) \ + { .pc_func = (svc_procfunc) nlm4svc_proc_##name, \ + .pc_decode = (kxdrproc_t) nlm4svc_decode_##xargt, \ + .pc_encode = (kxdrproc_t) nlm4svc_encode_##xrest, \ + .pc_release = NULL, \ + .pc_argsize = sizeof(struct nlm_##argt), \ + .pc_ressize = sizeof(struct nlm_##rest), \ + .pc_xdrressize = respsize, \ + } +#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ +#define No (1+1024/4) /* netobj */ +#define St 1 /* status */ +#define Rg 4 /* range (offset + length) */ +struct svc_procedure nlmsvc_procedures4[] = { + PROC(null, void, void, void, void, 1), + PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), + PROC(lock, lockargs, res, args, res, Ck+St), + PROC(cancel, cancargs, res, args, res, Ck+St), + PROC(unlock, unlockargs, res, args, res, Ck+St), + PROC(granted, testargs, res, args, res, Ck+St), + PROC(test_msg, testargs, norep, args, void, 1), + PROC(lock_msg, lockargs, norep, args, void, 1), + PROC(cancel_msg, cancargs, norep, args, void, 1), + PROC(unlock_msg, unlockargs, norep, args, void, 1), + PROC(granted_msg, testargs, norep, args, void, 1), + PROC(test_res, testres, norep, res, void, 1), + PROC(lock_res, lockres, norep, res, void, 1), + PROC(cancel_res, cancelres, norep, res, void, 1), + PROC(unlock_res, unlockres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void, 1), + PROC(none, void, void, void, void, 0), + PROC(none, void, void, void, void, 0), + PROC(none, void, void, void, void, 0), + PROC(share, shareargs, shareres, args, res, Ck+St+1), + PROC(unshare, shareargs, shareres, args, res, Ck+St+1), + PROC(nm_lock, lockargs, res, args, res, Ck+St), + PROC(free_all, notify, void, args, void, 1), + +}; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c new file mode 100644 index 00000000..6e31695d --- /dev/null +++ b/fs/lockd/svclock.c @@ -0,0 +1,966 @@ +/* + * linux/fs/lockd/svclock.c + * + * Handling of server-side locks, mostly of the blocked variety. + * This is the ugliest part of lockd because we tread on very thin ice. + * GRANT and CANCEL calls may get stuck, meet in mid-flight, etc. + * IMNSHO introducing the grant callback into the NLM protocol was one + * of the worst ideas Sun ever had. Except maybe for the idea of doing + * NFS file locking at all. + * + * I'm trying hard to avoid race conditions by protecting most accesses + * to a file's list of blocked locks through a semaphore. The global + * list of blocked locks is not protected in this fashion however. + * Therefore, some functions (such as the RPC callback for the async grant + * call) move blocked locks towards the head of the list *while some other + * process might be traversing it*. 
This should not be a problem in + * practice, because this will only cause functions traversing the list + * to visit some blocks twice. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVCLOCK + +#ifdef CONFIG_LOCKD_V4 +#define nlm_deadlock nlm4_deadlock +#else +#define nlm_deadlock nlm_lck_denied +#endif + +static void nlmsvc_release_block(struct nlm_block *block); +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); +static void nlmsvc_remove_block(struct nlm_block *block); + +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); +static void nlmsvc_freegrantargs(struct nlm_rqst *call); +static const struct rpc_call_ops nlmsvc_grant_ops; +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); + +/* + * The list of blocked locks to retry + */ +static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); + +/* + * Insert a blocked lock into the global list + */ +static void +nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when) +{ + struct nlm_block *b; + struct list_head *pos; + + dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); + if (list_empty(&block->b_list)) { + kref_get(&block->b_count); + } else { + list_del_init(&block->b_list); + } + + pos = &nlm_blocked; + if (when != NLM_NEVER) { + if ((when += jiffies) == NLM_NEVER) + when ++; + list_for_each(pos, &nlm_blocked) { + b = list_entry(pos, struct nlm_block, b_list); + if (time_after(b->b_when,when) || b->b_when == NLM_NEVER) + break; + } + /* On normal exit from the loop, pos == &nlm_blocked, + * so we will be adding to the end of the list - good + */ + } + + list_add_tail(&block->b_list, pos); + block->b_when = when; +} + +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) +{ + spin_lock(&nlm_blocked_lock); + nlmsvc_insert_block_locked(block, when); + spin_unlock(&nlm_blocked_lock); +} + +/* + * Remove a block from the global list + */ +static inline void +nlmsvc_remove_block(struct nlm_block *block) +{ + if (!list_empty(&block->b_list)) { + spin_lock(&nlm_blocked_lock); + list_del_init(&block->b_list); + spin_unlock(&nlm_blocked_lock); + nlmsvc_release_block(block); + } +} + +/* + * Find a block for a given lock + */ +static struct nlm_block * +nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) +{ + struct nlm_block *block; + struct file_lock *fl; + + dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", + file, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, lock->fl.fl_type); + list_for_each_entry(block, &nlm_blocked, b_list) { + fl = &block->b_call->a_args.lock.fl; + dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", + block->b_file, fl->fl_pid, + (long long)fl->fl_start, + (long long)fl->fl_end, fl->fl_type, + nlmdbg_cookie2a(&block->b_call->a_args.cookie)); + if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { + kref_get(&block->b_count); + return block; + } + } + + return NULL; +} + +static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) +{ + if (a->len != b->len) + return 0; + if (memcmp(a->data, b->data, a->len)) + return 0; + return 1; +} + +/* + * Find a block with a given NLM cookie. 
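+ *
+ * This is how an asynchronous GRANTED_RES from a client finds its way
+ * back to the right block (sketch):
+ *
+ *	nlm4svc_proc_granted_res()
+ *	  -> nlmsvc_grant_reply(&argp->cookie, argp->status)
+ *	       -> nlmsvc_find_block(cookie)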
+ */ +static inline struct nlm_block * +nlmsvc_find_block(struct nlm_cookie *cookie) +{ + struct nlm_block *block; + + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) + goto found; + } + + return NULL; + +found: + dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); + kref_get(&block->b_count); + return block; +} + +/* + * Create a block and initialize it. + * + * Note: we explicitly set the cookie of the grant reply to that of + * the blocked lock request. The spec explicitly mentions that the client + * should _not_ rely on the callback containing the same cookie as the + * request, but (as I found out later) that's because some implementations + * do just this. Never mind the standards comittees, they support our + * logging industries. + * + * 10 years later: I hope we can safely ignore these old and broken + * clients by now. Let's fix this so we can uniquely identify an incoming + * GRANTED_RES message by cookie, without having to rely on the client's IP + * address. --okir + */ +static struct nlm_block * +nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, + struct nlm_file *file, struct nlm_lock *lock, + struct nlm_cookie *cookie) +{ + struct nlm_block *block; + struct nlm_rqst *call = NULL; + + nlm_get_host(host); + call = nlm_alloc_call(host); + if (call == NULL) + return NULL; + + /* Allocate memory for block, and initialize arguments */ + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (block == NULL) + goto failed; + kref_init(&block->b_count); + INIT_LIST_HEAD(&block->b_list); + INIT_LIST_HEAD(&block->b_flist); + + if (!nlmsvc_setgrantargs(call, lock)) + goto failed_free; + + /* Set notifier function for VFS, and init args */ + call->a_args.lock.fl.fl_flags |= FL_SLEEP; + call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; + nlmclnt_next_cookie(&call->a_args.cookie); + + dprintk("lockd: created block %p...\n", block); + + /* Create and initialize the block */ + block->b_daemon = rqstp->rq_server; + block->b_host = host; + block->b_file = file; + block->b_fl = NULL; + file->f_count++; + + /* Add to file's list of blocks */ + list_add(&block->b_flist, &file->f_blocks); + + /* Set up RPC arguments for callback */ + block->b_call = call; + call->a_flags = RPC_TASK_ASYNC; + call->a_block = block; + + return block; + +failed_free: + kfree(block); +failed: + nlmsvc_release_call(call); + return NULL; +} + +/* + * Delete a block. + * It is the caller's responsibility to check whether the file + * can be closed hereafter. 
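+ * Unlinking cancels the pending VFS wait via posix_unblock_lock()
+ * and takes the block off the global nlm_blocked list; the block
+ * itself is freed only when its refcount drops to zero in
+ * nlmsvc_release_block().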
+ */ +static int nlmsvc_unlink_block(struct nlm_block *block) +{ + int status; + dprintk("lockd: unlinking block %p...\n", block); + + /* Remove block from list */ + status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl); + nlmsvc_remove_block(block); + return status; +} + +static void nlmsvc_free_block(struct kref *kref) +{ + struct nlm_block *block = container_of(kref, struct nlm_block, b_count); + struct nlm_file *file = block->b_file; + + dprintk("lockd: freeing block %p...\n", block); + + /* Remove block from file's list of blocks */ + mutex_lock(&file->f_mutex); + list_del_init(&block->b_flist); + mutex_unlock(&file->f_mutex); + + nlmsvc_freegrantargs(block->b_call); + nlmsvc_release_call(block->b_call); + nlm_release_file(block->b_file); + kfree(block->b_fl); + kfree(block); +} + +static void nlmsvc_release_block(struct nlm_block *block) +{ + if (block != NULL) + kref_put(&block->b_count, nlmsvc_free_block); +} + +/* + * Loop over all blocks and delete blocks held by + * a matching host. + */ +void nlmsvc_traverse_blocks(struct nlm_host *host, + struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct nlm_block *block, *next; + +restart: + mutex_lock(&file->f_mutex); + list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { + if (!match(block->b_host, host)) + continue; + /* Do not destroy blocks that are not on + * the global retry list - why? */ + if (list_empty(&block->b_list)) + continue; + kref_get(&block->b_count); + mutex_unlock(&file->f_mutex); + nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + goto restart; + } + mutex_unlock(&file->f_mutex); +} + +/* + * Initialize arguments for GRANTED call. The nlm_rqst structure + * has been cleared already. + */ +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) +{ + locks_copy_lock(&call->a_args.lock.fl, &lock->fl); + memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); + call->a_args.lock.caller = utsname()->nodename; + call->a_args.lock.oh.len = lock->oh.len; + + /* set default data area */ + call->a_args.lock.oh.data = call->a_owner; + call->a_args.lock.svid = lock->fl.fl_pid; + + if (lock->oh.len > NLMCLNT_OHSIZE) { + void *data = kmalloc(lock->oh.len, GFP_KERNEL); + if (!data) + return 0; + call->a_args.lock.oh.data = (u8 *) data; + } + + memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len); + return 1; +} + +static void nlmsvc_freegrantargs(struct nlm_rqst *call) +{ + if (call->a_args.lock.oh.data != call->a_owner) + kfree(call->a_args.lock.oh.data); + + locks_release_private(&call->a_args.lock.fl); +} + +/* + * Deferred lock request handling for non-blocking lock + */ +static __be32 +nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) +{ + __be32 status = nlm_lck_denied_nolocks; + + block->b_flags |= B_QUEUED; + + nlmsvc_insert_block(block, NLM_TIMEOUT); + + block->b_cache_req = &rqstp->rq_chandle; + if (rqstp->rq_chandle.defer) { + block->b_deferred_req = + rqstp->rq_chandle.defer(block->b_cache_req); + if (block->b_deferred_req != NULL) + status = nlm_drop_reply; + } + dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n", + block, block->b_flags, ntohl(status)); + + return status; +} + +/* + * Attempt to establish a lock, and if it can't be granted, block it + * if required. 
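+ * Returns nlm_granted on success, nlm_lck_blocked once the request
+ * has been queued as a blocking lock, nlm_drop_reply when the reply
+ * is deferred, or a denial status (nlm_lck_denied, nlm_deadlock,
+ * nlm_lck_denied_nolocks, nlm_lck_denied_grace_period) otherwise.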
+ */ +__be32 +nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, int wait, + struct nlm_cookie *cookie, int reclaim) +{ + struct nlm_block *block = NULL; + int error; + __be32 ret; + + dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_type, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, + wait); + + /* Lock file against concurrent access */ + mutex_lock(&file->f_mutex); + /* Get existing block (in case client is busy-waiting) + * or create new block + */ + block = nlmsvc_lookup_block(file, lock); + if (block == NULL) { + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); + ret = nlm_lck_denied_nolocks; + if (block == NULL) + goto out; + lock = &block->b_call->a_args.lock; + } else + lock->fl.fl_flags &= ~FL_SLEEP; + + if (block->b_flags & B_QUEUED) { + dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n", + block, block->b_flags); + if (block->b_granted) { + nlmsvc_unlink_block(block); + ret = nlm_granted; + goto out; + } + if (block->b_flags & B_TIMED_OUT) { + nlmsvc_unlink_block(block); + ret = nlm_lck_denied; + goto out; + } + ret = nlm_drop_reply; + goto out; + } + + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + + if (!wait) + lock->fl.fl_flags &= ~FL_SLEEP; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + lock->fl.fl_flags &= ~FL_SLEEP; + + dprintk("lockd: vfs_lock_file returned %d\n", error); + switch (error) { + case 0: + ret = nlm_granted; + goto out; + case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; + ret = nlm_lck_denied; + goto out; + case FILE_LOCK_DEFERRED: + if (wait) + break; + /* Filesystem lock operation is in progress + Add it to the queue waiting for callback */ + ret = nlmsvc_defer_lock_rqst(rqstp, block); + goto out; + case -EDEADLK: + ret = nlm_deadlock; + goto out; + default: /* includes ENOLCK */ + ret = nlm_lck_denied_nolocks; + goto out; + } + + ret = nlm_lck_blocked; + + /* Append to list of blocked */ + nlmsvc_insert_block(block, NLM_NEVER); +out: + mutex_unlock(&file->f_mutex); + nlmsvc_release_block(block); + dprintk("lockd: nlmsvc_lock returned %u\n", ret); + return ret; +} + +/* + * Test for presence of a conflicting lock. 
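+ * On conflict the conflicting lock's range, type and svid are copied
+ * into @conflock and nlm_lck_denied is returned; otherwise the result
+ * is nlm_granted, or nlm_drop_reply if the filesystem deferred the
+ * test.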
+ */ +__be32 +nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock, struct nlm_cookie *cookie) +{ + struct nlm_block *block = NULL; + int error; + __be32 ret; + + dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_type, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + /* Get existing block (in case client is busy-waiting) */ + block = nlmsvc_lookup_block(file, lock); + + if (block == NULL) { + struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + + if (conf == NULL) + return nlm_granted; + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); + if (block == NULL) { + kfree(conf); + return nlm_granted; + } + block->b_fl = conf; + } + if (block->b_flags & B_QUEUED) { + dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n", + block, block->b_flags, block->b_fl); + if (block->b_flags & B_TIMED_OUT) { + nlmsvc_unlink_block(block); + ret = nlm_lck_denied; + goto out; + } + if (block->b_flags & B_GOT_CALLBACK) { + nlmsvc_unlink_block(block); + if (block->b_fl != NULL + && block->b_fl->fl_type != F_UNLCK) { + lock->fl = *block->b_fl; + goto conf_lock; + } else { + ret = nlm_granted; + goto out; + } + } + ret = nlm_drop_reply; + goto out; + } + + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + error = vfs_test_lock(file->f_file, &lock->fl); + if (error == FILE_LOCK_DEFERRED) { + ret = nlmsvc_defer_lock_rqst(rqstp, block); + goto out; + } + if (error) { + ret = nlm_lck_denied_nolocks; + goto out; + } + if (lock->fl.fl_type == F_UNLCK) { + ret = nlm_granted; + goto out; + } + +conf_lock: + dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", + lock->fl.fl_type, (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + conflock->caller = "somehost"; /* FIXME */ + conflock->len = strlen(conflock->caller); + conflock->oh.len = 0; /* don't return OH info */ + conflock->svid = lock->fl.fl_pid; + conflock->fl.fl_type = lock->fl.fl_type; + conflock->fl.fl_start = lock->fl.fl_start; + conflock->fl.fl_end = lock->fl.fl_end; + ret = nlm_lck_denied; +out: + if (block) + nlmsvc_release_block(block); + return ret; +} + +/* + * Remove a lock. + * This implies a CANCEL call: We send a GRANT_MSG, the client replies + * with a GRANT_RES call which gets lost, and calls UNLOCK immediately + * afterwards. In this case the block will still be there, and hence + * must be removed. + */ +__be32 +nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) +{ + int error; + + dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + /* First, cancel any lock that might be there */ + nlmsvc_cancel_blocked(file, lock); + + lock->fl.fl_type = F_UNLCK; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + + return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; +} + +/* + * Cancel a previously blocked request. + * + * A cancel request always overrides any grant that may currently + * be in progress. + * The calling procedure must check whether the file can be closed. 
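+ * The cancel is also passed down to the filesystem through
+ * vfs_cancel_lock() before the block is unlinked, so that a lock
+ * request deferred by the filesystem is withdrawn as well.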
+ */ +__be32 +nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) +{ + struct nlm_block *block; + int status = 0; + + dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + + mutex_lock(&file->f_mutex); + block = nlmsvc_lookup_block(file, lock); + mutex_unlock(&file->f_mutex); + if (block != NULL) { + vfs_cancel_lock(block->b_file->f_file, + &block->b_call->a_args.lock.fl); + status = nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + } + return status ? nlm_lck_denied : nlm_granted; +} + +/* + * This is a callback from the filesystem for VFS file lock requests. + * It will be used if fl_grant is defined and the filesystem can not + * respond to the request immediately. + * For GETLK request it will copy the reply to the nlm_block. + * For SETLK or SETLKW request it will get the local posix lock. + * In all cases it will move the block to the head of nlm_blocked q where + * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the + * deferred rpc for GETLK and SETLK. + */ +static void +nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf, + int result) +{ + block->b_flags |= B_GOT_CALLBACK; + if (result == 0) + block->b_granted = 1; + else + block->b_flags |= B_TIMED_OUT; + if (conf) { + if (block->b_fl) + __locks_copy_lock(block->b_fl, conf); + } +} + +static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, + int result) +{ + struct nlm_block *block; + int rc = -ENOENT; + + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", + block, block->b_flags); + if (block->b_flags & B_QUEUED) { + if (block->b_flags & B_TIMED_OUT) { + rc = -ENOLCK; + break; + } + nlmsvc_update_deferred_block(block, conf, result); + } else if (result == 0) + block->b_granted = 1; + + nlmsvc_insert_block_locked(block, 0); + svc_wake_up(block->b_daemon); + rc = 0; + break; + } + } + spin_unlock(&nlm_blocked_lock); + if (rc == -ENOENT) + printk(KERN_WARNING "lockd: grant for unknown block\n"); + return rc; +} + +/* + * Unblock a blocked lock request. This is a callback invoked from the + * VFS layer when a lock on which we blocked is removed. + * + * This function doesn't grant the blocked lock instantly, but rather moves + * the block to the head of nlm_blocked where it can be picked up by lockd. 
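+ * "Moving to the head" is done by re-inserting the block with a
+ * timeout of 0 and calling svc_wake_up(), so nlmsvc_retry_blocked()
+ * handles it on the next pass of the lockd main loop.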
+ */ +static void +nlmsvc_notify_blocked(struct file_lock *fl) +{ + struct nlm_block *block; + + dprintk("lockd: VFS unblock notification for block %p\n", fl); + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { + nlmsvc_insert_block_locked(block, 0); + spin_unlock(&nlm_blocked_lock); + svc_wake_up(block->b_daemon); + return; + } + } + spin_unlock(&nlm_blocked_lock); + printk(KERN_WARNING "lockd: notification for unknown block!\n"); +} + +static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) +{ + return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; +} + +const struct lock_manager_operations nlmsvc_lock_operations = { + .fl_compare_owner = nlmsvc_same_owner, + .fl_notify = nlmsvc_notify_blocked, + .fl_grant = nlmsvc_grant_deferred, +}; + +/* + * Try to claim a lock that was previously blocked. + * + * Note that we use both the RPC_GRANTED_MSG call _and_ an async + * RPC thread when notifying the client. This seems like overkill... + * Here's why: + * - we don't want to use a synchronous RPC thread, otherwise + * we might find ourselves hanging on a dead portmapper. + * - Some lockd implementations (e.g. HP) don't react to + * RPC_GRANTED calls; they seem to insist on RPC_GRANTED_MSG calls. + */ +static void +nlmsvc_grant_blocked(struct nlm_block *block) +{ + struct nlm_file *file = block->b_file; + struct nlm_lock *lock = &block->b_call->a_args.lock; + int error; + + dprintk("lockd: grant blocked lock %p\n", block); + + kref_get(&block->b_count); + + /* Unlink block request from list */ + nlmsvc_unlink_block(block); + + /* If b_granted is true this means we've been here before. + * Just retry the grant callback, possibly refreshing the RPC + * binding */ + if (block->b_granted) { + nlm_rebind_host(block->b_host); + goto callback; + } + + /* Try the lock operation again */ + lock->fl.fl_flags |= FL_SLEEP; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + lock->fl.fl_flags &= ~FL_SLEEP; + + switch (error) { + case 0: + break; + case FILE_LOCK_DEFERRED: + dprintk("lockd: lock still blocked error %d\n", error); + nlmsvc_insert_block(block, NLM_NEVER); + nlmsvc_release_block(block); + return; + default: + printk(KERN_WARNING "lockd: unexpected error %d in %s!\n", + -error, __func__); + nlmsvc_insert_block(block, 10 * HZ); + nlmsvc_release_block(block); + return; + } + +callback: + /* Lock was granted by VFS. */ + dprintk("lockd: GRANTing blocked lock.\n"); + block->b_granted = 1; + + /* keep block on the list, but don't reattempt until the RPC + * completes or the submission fails + */ + nlmsvc_insert_block(block, NLM_NEVER); + + /* Call the client -- use a soft RPC task since nlmsvc_retry_blocked + * will queue up a new one if this one times out + */ + error = nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, + &nlmsvc_grant_ops); + + /* RPC submission failed, wait a bit and retry */ + if (error < 0) + nlmsvc_insert_block(block, 10 * HZ); +} + +/* + * This is the callback from the RPC layer when the NLM_GRANTED_MSG + * RPC call has succeeded or timed out. + * Like all RPC callbacks, it is invoked by the rpciod process, so it + * better not sleep. Therefore, we put the blocked lock on the nlm_blocked + * chain once more in order to have it removed by lockd itself (which can + * then sleep on the file semaphore without disrupting e.g. the nfs client). 
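+ * The requeue interval depends on the RPC outcome: after an RPC
+ * error the block is retried in 10 seconds, after a successful send
+ * we allow the client up to 60 seconds to answer with GRANTED_RES
+ * before the grant is retransmitted.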
+ */ +static void nlmsvc_grant_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *call = data; + struct nlm_block *block = call->a_block; + unsigned long timeout; + + dprintk("lockd: GRANT_MSG RPC callback\n"); + + spin_lock(&nlm_blocked_lock); + /* if the block is not on a list at this point then it has + * been invalidated. Don't try to requeue it. + * + * FIXME: it's possible that the block is removed from the list + * after this check but before the nlmsvc_insert_block. In that + * case it will be added back. Perhaps we need better locking + * for nlm_blocked? + */ + if (list_empty(&block->b_list)) + goto out; + + /* Technically, we should down the file semaphore here. Since we + * move the block towards the head of the queue only, no harm + * can be done, though. */ + if (task->tk_status < 0) { + /* RPC error: Re-insert for retransmission */ + timeout = 10 * HZ; + } else { + /* Call was successful, now wait for client callback */ + timeout = 60 * HZ; + } + nlmsvc_insert_block_locked(block, timeout); + svc_wake_up(block->b_daemon); +out: + spin_unlock(&nlm_blocked_lock); +} + +/* + * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an + * .rpc_release rpc_call_op + */ +static void nlmsvc_grant_release(void *data) +{ + struct nlm_rqst *call = data; + nlmsvc_release_block(call->a_block); +} + +static const struct rpc_call_ops nlmsvc_grant_ops = { + .rpc_call_done = nlmsvc_grant_callback, + .rpc_release = nlmsvc_grant_release, +}; + +/* + * We received a GRANT_RES callback. Try to find the corresponding + * block. + */ +void +nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) +{ + struct nlm_block *block; + + dprintk("grant_reply: looking for cookie %x, s=%d \n", + *(unsigned int *)(cookie->data), status); + if (!(block = nlmsvc_find_block(cookie))) + return; + + if (block) { + if (status == nlm_lck_denied_grace_period) { + /* Try again in a couple of seconds */ + nlmsvc_insert_block(block, 10 * HZ); + } else { + /* Lock is now held by client, or has been rejected. + * In both cases, the block should be removed. */ + nlmsvc_unlink_block(block); + } + } + nlmsvc_release_block(block); +} + +/* Helper function to handle retry of a deferred block. + * If it is a blocking lock, call grant_blocked. + * For a non-blocking lock or test lock, revisit the request. + */ +static void +retry_deferred_block(struct nlm_block *block) +{ + if (!(block->b_flags & B_GOT_CALLBACK)) + block->b_flags |= B_TIMED_OUT; + nlmsvc_insert_block(block, NLM_TIMEOUT); + dprintk("revisit block %p flags %d\n", block, block->b_flags); + if (block->b_deferred_req) { + block->b_deferred_req->revisit(block->b_deferred_req, 0); + block->b_deferred_req = NULL; + } +} + +/* + * Retry all blocked locks that have been notified. This is where lockd + * picks up locks that can be granted, or grant notifications that must + * be retransmitted. 
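+ * Returns the number of jiffies until the next block becomes due,
+ * or MAX_SCHEDULE_TIMEOUT if nothing is pending, so the caller knows
+ * how long it may sleep.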
+ */ +unsigned long +nlmsvc_retry_blocked(void) +{ + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct nlm_block *block; + + while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { + block = list_entry(nlm_blocked.next, struct nlm_block, b_list); + + if (block->b_when == NLM_NEVER) + break; + if (time_after(block->b_when, jiffies)) { + timeout = block->b_when - jiffies; + break; + } + + dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", + block, block->b_when); + if (block->b_flags & B_QUEUED) { + dprintk("nlmsvc_retry_blocked delete block (%p, granted=%d, flags=%d)\n", + block, block->b_granted, block->b_flags); + retry_deferred_block(block); + } else + nlmsvc_grant_blocked(block); + } + + return timeout; +} + +#ifdef RPC_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We can get away with a static buffer because we're only + * called with BKL held. + */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c new file mode 100644 index 00000000..d27aab11 --- /dev/null +++ b/fs/lockd/svcproc.c @@ -0,0 +1,544 @@ +/* + * linux/fs/lockd/svcproc.c + * + * Lockd server procedures. We don't implement the NLM_*_RES + * procedures because we don't use the async procedures. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +#ifdef CONFIG_LOCKD_V4 +static __be32 +cast_to_nlm(__be32 status, u32 vers) +{ + /* Note: status is assumed to be in network byte order !!! */ + if (vers != 4){ + switch (status) { + case nlm_granted: + case nlm_lck_denied: + case nlm_lck_denied_nolocks: + case nlm_lck_blocked: + case nlm_lck_denied_grace_period: + case nlm_drop_reply: + break; + case nlm4_deadlock: + status = nlm_lck_denied; + break; + default: + status = nlm_lck_denied_nolocks; + } + } + + return (status); +} +#define cast_status(status) (cast_to_nlm(status, rqstp->rq_vers)) +#else +#define cast_status(status) (status) +#endif + +/* + * Obtain client and file from arguments + */ +static __be32 +nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_host **hostp, struct nlm_file **filp) +{ + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm_lock *lock = &argp->lock; + __be32 error = 0; + + /* nfsd callbacks must have been installed for this procedure */ + if (!nlmsvc_ops) + return nlm_lck_denied_nolocks; + + /* Obtain host handle */ + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) + goto no_locks; + *hostp = host; + + /* Obtain file pointer. Not used by FREE_ALL call. 
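+ * When a file is looked up we also fill in fl_file, fl_owner and
+ * fl_lmops of the client's file_lock, so the VFS lock code can call
+ * back into lockd (nlmsvc_lock_operations) for blocking locks.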
*/ + if (filp != NULL) { + if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + goto no_locks; + *filp = file; + + /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_file = file->f_file; + lock->fl.fl_owner = (fl_owner_t) host; + lock->fl.fl_lmops = &nlmsvc_lock_operations; + } + + return 0; + +no_locks: + nlmsvc_release_host(host); + if (error) + return error; + return nlm_lck_denied_nolocks; +} + +/* + * NULL: Test for presence of service + */ +static __be32 +nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + dprintk("lockd: NULL called\n"); + return rpc_success; +} + +/* + * TEST: Check for conflicting lock + */ +static __be32 +nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: TEST called\n"); + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now check for conflicting locks */ + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: TEST status %d vers %d\n", + ntohl(resp->status), rqstp->rq_vers); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: LOCK called\n"); + + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + +#if 0 + /* If supplied state doesn't match current state, we assume it's + * an old request that time-warped somehow. Any error return would + * do in this case because it's irrelevant anyway. + * + * NB: We don't retrieve the remote host's state yet. + */ + if (host->h_nsmstate && host->h_nsmstate != argp->state) { + resp->status = nlm_lck_denied_nolocks; + } else +#endif + + /* Now try to lock the file */ + resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim)); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: CANCEL called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Try to cancel request. 
*/ + resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); + + dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNLOCK: release a lock + */ +static __be32 +nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNLOCK called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to remove the lock */ + resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); + + dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * GRANTED: A server calls us to tell that a process' lock request + * was granted + */ +static __be32 +nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + resp->cookie = argp->cookie; + + dprintk("lockd: GRANTED called\n"); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); + return rpc_success; +} + +/* + * This is the generic lockd callback for async RPC calls + */ +static void nlmsvc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %5u callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +void nlmsvc_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmsvc_release_host(call->a_host); + kfree(call); +} + +static void nlmsvc_callback_release(void *data) +{ + nlmsvc_release_call(data); +} + +static const struct rpc_call_ops nlmsvc_callback_ops = { + .rpc_call_done = nlmsvc_callback_exit, + .rpc_release = nlmsvc_callback_release, +}; + +/* + * `Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. 
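+ * The helper below looks up the host, allocates an nlm_rqst, runs
+ * the real procedure into call->a_res, and then transmits that
+ * result as an asynchronous *_RES call via nlm_async_reply().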
+ */ +static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) +{ + struct nlm_host *host; + struct nlm_rqst *call; + __be32 stat; + + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlmsvc_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlmsvc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; +} + +static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: TEST_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test); +} + +static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: LOCK_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock); +} + +static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: CANCEL_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel); +} + +static __be32 +nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: UNLOCK_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock); +} + +static __be32 +nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: GRANTED_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlmsvc_proc_granted); +} + +/* + * SHARE: create a DOS share or alter existing share. + */ +static __be32 +nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: SHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace() && !argp->reclaim) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to create the share */ + resp->status = cast_status(nlmsvc_share_file(host, file, argp)); + + dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNSHARE: Release a DOS share. + */ +static __be32 +nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNSHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + + /* Now try to unshare the file */ + resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); + + dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * NM_LOCK: Create an unmonitored lock + */ +static __be32 +nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + dprintk("lockd: NM_LOCK called\n"); + + argp->monitor = 0; /* just clean the monitor flag */ + return nlmsvc_proc_lock(rqstp, argp, resp); +} + +/* + * FREE_ALL: Release all locks and shares held by client + */ +static __be32 +nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + struct nlm_host *host; + + /* Obtain client */ + if (nlmsvc_retrieve_args(rqstp, argp, &host, NULL)) + return rpc_success; + + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + return rpc_success; +} + +/* + * SM_NOTIFY: private callback from statd (not part of official NLM proto) + */ +static __be32 +nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, + void *resp) +{ + dprintk("lockd: SM_NOTIFY called\n"); + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + nlm_host_rebooted(argp); + return rpc_success; +} + +/* + * client sent a GRANTED_RES, let's remove the associated block + */ +static __be32 +nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(&argp->cookie, argp->status); + return rpc_success; +} + +/* + * NLM Server procedures. 
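+ *
+ * The table below maps each NLM procedure number to its handler and
+ * XDR routines.  The Ck/St/No/Rg constants estimate the reply size
+ * in 32-bit XDR words (pc_xdrressize) so the RPC layer can reserve
+ * enough buffer space for the response.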
+ */ + +#define nlmsvc_encode_norep nlmsvc_encode_void +#define nlmsvc_decode_norep nlmsvc_decode_void +#define nlmsvc_decode_testres nlmsvc_decode_void +#define nlmsvc_decode_lockres nlmsvc_decode_void +#define nlmsvc_decode_unlockres nlmsvc_decode_void +#define nlmsvc_decode_cancelres nlmsvc_decode_void +#define nlmsvc_decode_grantedres nlmsvc_decode_void + +#define nlmsvc_proc_none nlmsvc_proc_null +#define nlmsvc_proc_test_res nlmsvc_proc_null +#define nlmsvc_proc_lock_res nlmsvc_proc_null +#define nlmsvc_proc_cancel_res nlmsvc_proc_null +#define nlmsvc_proc_unlock_res nlmsvc_proc_null + +struct nlm_void { int dummy; }; + +#define PROC(name, xargt, xrest, argt, rest, respsize) \ + { .pc_func = (svc_procfunc) nlmsvc_proc_##name, \ + .pc_decode = (kxdrproc_t) nlmsvc_decode_##xargt, \ + .pc_encode = (kxdrproc_t) nlmsvc_encode_##xrest, \ + .pc_release = NULL, \ + .pc_argsize = sizeof(struct nlm_##argt), \ + .pc_ressize = sizeof(struct nlm_##rest), \ + .pc_xdrressize = respsize, \ + } + +#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ +#define St 1 /* status */ +#define No (1+1024/4) /* Net Obj */ +#define Rg 2 /* range - offset + size */ + +struct svc_procedure nlmsvc_procedures[] = { + PROC(null, void, void, void, void, 1), + PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), + PROC(lock, lockargs, res, args, res, Ck+St), + PROC(cancel, cancargs, res, args, res, Ck+St), + PROC(unlock, unlockargs, res, args, res, Ck+St), + PROC(granted, testargs, res, args, res, Ck+St), + PROC(test_msg, testargs, norep, args, void, 1), + PROC(lock_msg, lockargs, norep, args, void, 1), + PROC(cancel_msg, cancargs, norep, args, void, 1), + PROC(unlock_msg, unlockargs, norep, args, void, 1), + PROC(granted_msg, testargs, norep, args, void, 1), + PROC(test_res, testres, norep, res, void, 1), + PROC(lock_res, lockres, norep, res, void, 1), + PROC(cancel_res, cancelres, norep, res, void, 1), + PROC(unlock_res, unlockres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void, 1), + PROC(none, void, void, void, void, 1), + PROC(none, void, void, void, void, 1), + PROC(none, void, void, void, void, 1), + PROC(share, shareargs, shareres, args, res, Ck+St+1), + PROC(unshare, shareargs, shareres, args, res, Ck+St+1), + PROC(nm_lock, lockargs, res, args, res, Ck+St), + PROC(free_all, notify, void, args, void, 0), + +}; diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c new file mode 100644 index 00000000..b0ae0700 --- /dev/null +++ b/fs/lockd/svcshare.c @@ -0,0 +1,106 @@ +/* + * linux/fs/lockd/svcshare.c + * + * Management of DOS shares. 
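+ * Shares created by NLM SHARE requests are kept on a simple singly
+ * linked list hanging off each nlm_file; a new share is refused when
+ * its access mode collides with an existing share's deny mode (and
+ * vice versa).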
+ * + * Copyright (C) 1996 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +static inline int +nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) +{ + return share->s_owner.len == oh->len + && !memcmp(share->s_owner.data, oh->data, oh->len); +} + +__be32 +nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, + struct nlm_args *argp) +{ + struct nlm_share *share; + struct xdr_netobj *oh = &argp->lock.oh; + u8 *ohdata; + + for (share = file->f_shares; share; share = share->s_next) { + if (share->s_host == host && nlm_cmp_owner(share, oh)) + goto update; + if ((argp->fsm_access & share->s_mode) + || (argp->fsm_mode & share->s_access )) + return nlm_lck_denied; + } + + share = kmalloc(sizeof(*share) + oh->len, + GFP_KERNEL); + if (share == NULL) + return nlm_lck_denied_nolocks; + + /* Copy owner handle */ + ohdata = (u8 *) (share + 1); + memcpy(ohdata, oh->data, oh->len); + + share->s_file = file; + share->s_host = host; + share->s_owner.data = ohdata; + share->s_owner.len = oh->len; + share->s_next = file->f_shares; + file->f_shares = share; + +update: + share->s_access = argp->fsm_access; + share->s_mode = argp->fsm_mode; + return nlm_granted; +} + +/* + * Delete a share. + */ +__be32 +nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, + struct nlm_args *argp) +{ + struct nlm_share *share, **shpp; + struct xdr_netobj *oh = &argp->lock.oh; + + for (shpp = &file->f_shares; (share = *shpp) != NULL; + shpp = &share->s_next) { + if (share->s_host == host && nlm_cmp_owner(share, oh)) { + *shpp = share->s_next; + kfree(share); + return nlm_granted; + } + } + + /* X/Open spec says return success even if there was no + * corresponding share. */ + return nlm_granted; +} + +/* + * Traverse all shares for a given file, and delete + * those owned by the given (type of) host + */ +void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct nlm_share *share, **shpp; + + shpp = &file->f_shares; + while ((share = *shpp) != NULL) { + if (match(share->s_host, host)) { + *shpp = share->s_next; + kfree(share); + continue; + } + shpp = &share->s_next; + } +} diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c new file mode 100644 index 00000000..1ca0679c --- /dev/null +++ b/fs/lockd/svcsubs.c @@ -0,0 +1,446 @@ +/* + * linux/fs/lockd/svcsubs.c + * + * Various support routines for the NLM server. 
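+ * The central data structure here is a small hash table of nlm_file
+ * entries keyed by NFS file handle; lookups, traversal and teardown
+ * all run under nlm_file_mutex.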
+ * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVCSUBS + + +/* + * Global file hash table + */ +#define FILE_HASH_BITS 7 +#define FILE_NRHASH (1<data; + + /* print the first 32 bytes of the fh */ + dprintk("lockd: %s (%08x %08x %08x %08x %08x %08x %08x %08x)\n", + msg, fhp[0], fhp[1], fhp[2], fhp[3], + fhp[4], fhp[5], fhp[6], fhp[7]); +} + +static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) +{ + struct inode *inode = file->f_file->f_path.dentry->d_inode; + + dprintk("lockd: %s %s/%ld\n", + msg, inode->i_sb->s_id, inode->i_ino); +} +#else +static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) +{ + return; +} + +static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) +{ + return; +} +#endif + +static inline unsigned int file_hash(struct nfs_fh *f) +{ + unsigned int tmp=0; + int i; + for (i=0; idata[i]; + return tmp & (FILE_NRHASH - 1); +} + +/* + * Lookup file info. If it doesn't exist, create a file info struct + * and open a (VFS) file for the given inode. + * + * FIXME: + * Note that we open the file O_RDONLY even when creating write locks. + * This is not quite right, but for now, we assume the client performs + * the proper R/W checking. + */ +__be32 +nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, + struct nfs_fh *f) +{ + struct hlist_node *pos; + struct nlm_file *file; + unsigned int hash; + __be32 nfserr; + + nlm_debug_print_fh("nlm_lookup_file", f); + + hash = file_hash(f); + + /* Lock file table */ + mutex_lock(&nlm_file_mutex); + + hlist_for_each_entry(file, pos, &nlm_files[hash], f_list) + if (!nfs_compare_fh(&file->f_handle, f)) + goto found; + + nlm_debug_print_fh("creating file for", f); + + nfserr = nlm_lck_denied_nolocks; + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) + goto out_unlock; + + memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + mutex_init(&file->f_mutex); + INIT_HLIST_NODE(&file->f_list); + INIT_LIST_HEAD(&file->f_blocks); + + /* Open the file. Note that this must not sleep for too long, else + * we would lock up lockd:-) So no NFS re-exports, folks. + * + * We have to make sure we have the right credential to open + * the file. + */ + if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { + dprintk("lockd: open failed (error %d)\n", nfserr); + goto out_free; + } + + hlist_add_head(&file->f_list, &nlm_files[hash]); + +found: + dprintk("lockd: found file %p (count %d)\n", file, file->f_count); + *result = file; + file->f_count++; + nfserr = 0; + +out_unlock: + mutex_unlock(&nlm_file_mutex); + return nfserr; + +out_free: + kfree(file); + goto out_unlock; +} + +/* + * Delete a file after having released all locks, blocks and shares + */ +static inline void +nlm_delete_file(struct nlm_file *file) +{ + nlm_debug_print_file("closing file", file); + if (!hlist_unhashed(&file->f_list)) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } else { + printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); + } +} + +/* + * Loop over all locks on the given file and perform the specified + * action. 
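+ * Whenever the match function selects a lock's owner, that lock is
+ * removed with an F_UNLCK covering the whole file and the scan is
+ * restarted from the beginning, because dropping lock_flocks()
+ * invalidates the iteration over i_flock.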
+ */ +static int +nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + struct nlm_host *lockhost; + +again: + file->f_locks = 0; + lock_flocks(); /* protects i_flock list */ + for (fl = inode->i_flock; fl; fl = fl->fl_next) { + if (fl->fl_lmops != &nlmsvc_lock_operations) + continue; + + /* update current lock count */ + file->f_locks++; + + lockhost = (struct nlm_host *) fl->fl_owner; + if (match(lockhost, host)) { + struct file_lock lock = *fl; + + unlock_flocks(); + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { + printk("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + goto again; + } + } + unlock_flocks(); + + return 0; +} + +static int +nlmsvc_always_match(void *dummy1, struct nlm_host *dummy2) +{ + return 1; +} + +/* + * Inspect a single file + */ +static inline int +nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match) +{ + nlmsvc_traverse_blocks(host, file, match); + nlmsvc_traverse_shares(host, file, match); + return nlm_traverse_locks(host, file, match); +} + +/* + * Quick check whether there are still any locks, blocks or + * shares on a given file. + */ +static inline int +nlm_file_inuse(struct nlm_file *file) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + + if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) + return 1; + + lock_flocks(); + for (fl = inode->i_flock; fl; fl = fl->fl_next) { + if (fl->fl_lmops == &nlmsvc_lock_operations) { + unlock_flocks(); + return 1; + } + } + unlock_flocks(); + file->f_locks = 0; + return 0; +} + +/* + * Loop over all files in the file table. + */ +static int +nlm_traverse_files(void *data, nlm_host_match_fn_t match, + int (*is_failover_file)(void *data, struct nlm_file *file)) +{ + struct hlist_node *pos, *next; + struct nlm_file *file; + int i, ret = 0; + + mutex_lock(&nlm_file_mutex); + for (i = 0; i < FILE_NRHASH; i++) { + hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { + if (is_failover_file && !is_failover_file(data, file)) + continue; + file->f_count++; + mutex_unlock(&nlm_file_mutex); + + /* Traverse locks, blocks and shares of this file + * and update file->f_locks count */ + if (nlm_inspect_file(data, file, match)) + ret = 1; + + mutex_lock(&nlm_file_mutex); + file->f_count--; + /* No more references to this file. Let go of it. */ + if (list_empty(&file->f_blocks) && !file->f_locks + && !file->f_shares && !file->f_count) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } + } + } + mutex_unlock(&nlm_file_mutex); + return ret; +} + +/* + * Release file. If there are no more remote locks on this file, + * close it and free the handle. + * + * Note that we can't do proper reference counting without major + * contortions because the code in fs/locks.c creates, deletes and + * splits locks without notification. Our only way is to walk the + * entire lock list each time we remove a lock. 
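+ * That walk is done by nlm_file_inuse() above, which reports the
+ * file as busy if any reference, pending block, share or lockd-owned
+ * posix lock remains.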
+ */ +void +nlm_release_file(struct nlm_file *file) +{ + dprintk("lockd: nlm_release_file(%p, ct = %d)\n", + file, file->f_count); + + /* Lock file table */ + mutex_lock(&nlm_file_mutex); + + /* If there are no more locks etc, delete the file */ + if (--file->f_count == 0 && !nlm_file_inuse(file)) + nlm_delete_file(file); + + mutex_unlock(&nlm_file_mutex); +} + +/* + * Helpers function for resource traversal + * + * nlmsvc_mark_host: + * used by the garbage collector; simply sets h_inuse. + * Always returns 0. + * + * nlmsvc_same_host: + * returns 1 iff the two hosts match. Used to release + * all resources bound to a specific host. + * + * nlmsvc_is_client: + * returns 1 iff the host is a client. + * Used by nlmsvc_invalidate_all + */ +static int +nlmsvc_mark_host(void *data, struct nlm_host *dummy) +{ + struct nlm_host *host = data; + + host->h_inuse = 1; + return 0; +} + +static int +nlmsvc_same_host(void *data, struct nlm_host *other) +{ + struct nlm_host *host = data; + + return host == other; +} + +static int +nlmsvc_is_client(void *data, struct nlm_host *dummy) +{ + struct nlm_host *host = data; + + if (host->h_server) { + /* we are destroying locks even though the client + * hasn't asked us too, so don't unmonitor the + * client + */ + if (host->h_nsmhandle) + host->h_nsmhandle->sm_sticky = 1; + return 1; + } else + return 0; +} + +/* + * Mark all hosts that still hold resources + */ +void +nlmsvc_mark_resources(void) +{ + dprintk("lockd: nlmsvc_mark_resources\n"); + nlm_traverse_files(NULL, nlmsvc_mark_host, NULL); +} + +/* + * Release all resources held by the given client + */ +void +nlmsvc_free_host_resources(struct nlm_host *host) +{ + dprintk("lockd: nlmsvc_free_host_resources\n"); + + if (nlm_traverse_files(host, nlmsvc_same_host, NULL)) { + printk(KERN_WARNING + "lockd: couldn't remove all locks held by %s\n", + host->h_name); + BUG(); + } +} + +/** + * nlmsvc_invalidate_all - remove all locks held for clients + * + * Release all locks held by NFS clients. + * + */ +void +nlmsvc_invalidate_all(void) +{ + /* + * Previously, the code would call + * nlmsvc_free_host_resources for each client in + * turn, which is about as inefficient as it gets. + * Now we just do it once in nlm_traverse_files. + */ + nlm_traverse_files(NULL, nlmsvc_is_client, NULL); +} + +static int +nlmsvc_match_sb(void *datap, struct nlm_file *file) +{ + struct super_block *sb = datap; + + return sb == file->f_file->f_path.mnt->mnt_sb; +} + +/** + * nlmsvc_unlock_all_by_sb - release locks held on this file system + * @sb: super block + * + * Release all locks held by clients accessing this file system. + */ +int +nlmsvc_unlock_all_by_sb(struct super_block *sb) +{ + int ret; + + ret = nlm_traverse_files(sb, nlmsvc_always_match, nlmsvc_match_sb); + return ret ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); + +static int +nlmsvc_match_ip(void *datap, struct nlm_host *host) +{ + return rpc_cmp_addr(nlm_srcaddr(host), datap); +} + +/** + * nlmsvc_unlock_all_by_ip - release local locks by IP address + * @server_addr: server's IP address as seen by clients + * + * Release all locks held by clients accessing this host + * via the passed in IP address. + */ +int +nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr) +{ + int ret; + + ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL); + return ret ? 
-EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c new file mode 100644 index 00000000..964666c6 --- /dev/null +++ b/fs/lockd/xdr.c @@ -0,0 +1,343 @@ +/* + * linux/fs/lockd/xdr.c + * + * XDR support for lockd and the lock client. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + + +static inline loff_t +s32_to_loff_t(__s32 offset) +{ + return (loff_t)offset; +} + +static inline __s32 +loff_t_to_s32(loff_t offset) +{ + __s32 res; + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +/* + * XDR functions for basic NLM types + */ +static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c) +{ + unsigned int len; + + len = ntohl(*p++); + + if(len==0) + { + c->len=4; + memset(c->data, 0, 4); /* hockeypux brain damage */ + } + else if(len<=NLM_MAXCOOKIELEN) + { + c->len=len; + memcpy(c->data, p, len); + p+=XDR_QUADLEN(len); + } + else + { + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); + return NULL; + } + return p; +} + +static inline __be32 * +nlm_encode_cookie(__be32 *p, struct nlm_cookie *c) +{ + *p++ = htonl(c->len); + memcpy(p, c->data, c->len); + p+=XDR_QUADLEN(c->len); + return p; +} + +static __be32 * +nlm_decode_fh(__be32 *p, struct nfs_fh *f) +{ + unsigned int len; + + if ((len = ntohl(*p++)) != NFS2_FHSIZE) { + dprintk("lockd: bad fhandle size %d (should be %d)\n", + len, NFS2_FHSIZE); + return NULL; + } + f->size = NFS2_FHSIZE; + memset(f->data, 0, sizeof(f->data)); + memcpy(f->data, p, NFS2_FHSIZE); + return p + XDR_QUADLEN(NFS2_FHSIZE); +} + +static inline __be32 * +nlm_encode_fh(__be32 *p, struct nfs_fh *f) +{ + *p++ = htonl(NFS2_FHSIZE); + memcpy(p, f->data, NFS2_FHSIZE); + return p + XDR_QUADLEN(NFS2_FHSIZE); +} + +/* + * Encode and decode owner handle + */ +static inline __be32 * +nlm_decode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_decode_netobj(p, oh); +} + +static inline __be32 * +nlm_encode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_encode_netobj(p, oh); +} + +static __be32 * +nlm_decode_lock(__be32 *p, struct nlm_lock *lock) +{ + struct file_lock *fl = &lock->fl; + s32 start, len, end; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, + NLM_MAXSTRLEN)) + || !(p = nlm_decode_fh(p, &lock->fh)) + || !(p = nlm_decode_oh(p, &lock->oh))) + return NULL; + lock->svid = ntohl(*p++); + + locks_init_lock(fl); + fl->fl_owner = current->files; + fl->fl_pid = (pid_t)lock->svid; + fl->fl_flags = FL_POSIX; + fl->fl_type = F_RDLCK; /* as good as anything else */ + start = ntohl(*p++); + len = ntohl(*p++); + end = start + len - 1; + + fl->fl_start = s32_to_loff_t(start); + + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s32_to_loff_t(end); + return p; +} + +/* + * Encode result of a TEST/TEST_MSG call + */ +static __be32 * +nlm_encode_testres(__be32 *p, struct nlm_res *resp) +{ + s32 start, len; + + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return NULL; + *p++ = resp->status; + + if (resp->status == nlm_lck_denied) { + struct file_lock *fl = &resp->lock.fl; + + *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; + *p++ = htonl(resp->lock.svid); + + /* Encode owner handle. 
*/ + if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) + return NULL; + + start = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + + *p++ = htonl(start); + *p++ = htonl(len); + } + + return p; +} + + +/* + * First, the server side XDR functions + */ +int +nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_testres(p, resp))) + return 0; + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + argp->reclaim = ntohl(*p++); + argp->state = ntohl(*p++); + argp->monitor = 1; /* monitor client by default */ + + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + if (!(p = nlm_decode_cookie(p, &argp->cookie)) + || !(p = nlm_decode_lock(p, &argp->lock))) + return 0; + argp->lock.fl.fl_type = F_UNLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; + + if (!(p = nlm_decode_cookie(p, &argp->cookie)) + || !(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm_decode_fh(p, &lock->fh)) + || !(p = nlm_decode_oh(p, &lock->oh))) + return 0; + argp->fsm_mode = ntohl(*p++); + argp->fsm_access = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + *p++ = xdr_zero; /* sequence argument */ + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) +{ + if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) + return 0; + 
argp->state = ntohl(*p++); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_decode_cookie(p, &resp->cookie))) + return 0; + resp->status = *p++; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c new file mode 100644 index 00000000..dfa4789c --- /dev/null +++ b/fs/lockd/xdr4.c @@ -0,0 +1,334 @@ +/* + * linux/fs/lockd/xdr4.c + * + * XDR support for lockd and the lock client. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * Copyright (C) 1999, Trond Myklebust + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +static inline loff_t +s64_to_loff_t(__s64 offset) +{ + return (loff_t)offset; +} + + +static inline s64 +loff_t_to_s64(loff_t offset) +{ + s64 res; + if (offset > NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset < -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +/* + * XDR functions for basic NLM types + */ +static __be32 * +nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c) +{ + unsigned int len; + + len = ntohl(*p++); + + if(len==0) + { + c->len=4; + memset(c->data, 0, 4); /* hockeypux brain damage */ + } + else if(len<=NLM_MAXCOOKIELEN) + { + c->len=len; + memcpy(c->data, p, len); + p+=XDR_QUADLEN(len); + } + else + { + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); + return NULL; + } + return p; +} + +static __be32 * +nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c) +{ + *p++ = htonl(c->len); + memcpy(p, c->data, c->len); + p+=XDR_QUADLEN(c->len); + return p; +} + +static __be32 * +nlm4_decode_fh(__be32 *p, struct nfs_fh *f) +{ + memset(f->data, 0, sizeof(f->data)); + f->size = ntohl(*p++); + if (f->size > NFS_MAXFHSIZE) { + dprintk("lockd: bad fhandle size %d (should be <=%d)\n", + f->size, NFS_MAXFHSIZE); + return NULL; + } + memcpy(f->data, p, f->size); + return p + XDR_QUADLEN(f->size); +} + +/* + * Encode and decode owner handle + */ +static __be32 * +nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_decode_netobj(p, oh); +} + +static __be32 * +nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) +{ + struct file_lock *fl = &lock->fl; + __u64 len, start; + __s64 end; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm4_decode_fh(p, &lock->fh)) + || !(p = nlm4_decode_oh(p, &lock->oh))) + return NULL; + lock->svid = ntohl(*p++); + + locks_init_lock(fl); + fl->fl_owner = current->files; + fl->fl_pid = (pid_t)lock->svid; + fl->fl_flags = FL_POSIX; + fl->fl_type = F_RDLCK; /* as good as anything else */ + p = xdr_decode_hyper(p, &start); + p = xdr_decode_hyper(p, &len); + end = start + len - 1; + + fl->fl_start = s64_to_loff_t(start); + + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s64_to_loff_t(end); + return p; +} + +/* + * Encode result of a TEST/TEST_MSG call + */ +static __be32 * +nlm4_encode_testres(__be32 *p, struct nlm_res *resp) +{ + s64 start, len; + + dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp); + if (!(p = 
nlm4_encode_cookie(p, &resp->cookie))) + return NULL; + *p++ = resp->status; + + if (resp->status == nlm_lck_denied) { + struct file_lock *fl = &resp->lock.fl; + + *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; + *p++ = htonl(resp->lock.svid); + + /* Encode owner handle. */ + if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) + return NULL; + + start = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + + p = xdr_encode_hyper(p, start); + p = xdr_encode_hyper(p, len); + dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n", + resp->status, (int)resp->lock.svid, fl->fl_type, + (long long)fl->fl_start, (long long)fl->fl_end); + } + + dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp); + return p; +} + + +/* + * First, the server side XDR functions + */ +int +nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_testres(p, resp))) + return 0; + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + argp->reclaim = ntohl(*p++); + argp->state = ntohl(*p++); + argp->monitor = 1; /* monitor client by default */ + + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + if (!(p = nlm4_decode_cookie(p, &argp->cookie)) + || !(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + argp->lock.fl.fl_type = F_UNLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie)) + || !(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm4_decode_fh(p, &lock->fh)) + || !(p = nlm4_decode_oh(p, &lock->oh))) + return 0; + argp->fsm_mode = ntohl(*p++); + argp->fsm_access = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + *p++ = xdr_zero; /* sequence argument */ + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + 
return 0; + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) +{ + if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_decode_cookie(p, &resp->cookie))) + return 0; + resp->status = *p++; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} diff --git a/fs/locks.c b/fs/locks.c new file mode 100644 index 00000000..b286539d --- /dev/null +++ b/fs/locks.c @@ -0,0 +1,2341 @@ +/* + * linux/fs/locks.c + * + * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. + * Doug Evans (dje@spiff.uucp), August 07, 1992 + * + * Deadlock detection added. + * FIXME: one thing isn't handled yet: + * - mandatory locks (requires lots of changes elsewhere) + * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994. + * + * Miscellaneous edits, and a total rewrite of posix_lock_file() code. + * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 + * + * Converted file_lock_table to a linked list from an array, which eliminates + * the limits on how many active file locks are open. + * Chad Page (pageone@netcom.com), November 27, 1994 + * + * Removed dependency on file descriptors. dup()'ed file descriptors now + * get the same locks as the original file descriptors, and a close() on + * any file descriptor removes ALL the locks on the file for the current + * process. Since locks still depend on the process id, locks are inherited + * after an exec() but not after a fork(). This agrees with POSIX, and both + * BSD and SVR4 practice. + * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995 + * + * Scrapped free list which is redundant now that we allocate locks + * dynamically with kmalloc()/kfree(). + * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995 + * + * Implemented two lock personalities - FL_FLOCK and FL_POSIX. + * + * FL_POSIX locks are created with calls to fcntl() and lockf() through the + * fcntl() system call. They have the semantics described above. + * + * FL_FLOCK locks are created with calls to flock(), through the flock() + * system call, which is new. Old C libraries implement flock() via fcntl() + * and will continue to use the old, broken implementation. + * + * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated + * with a file pointer (filp). As a result they can be shared by a parent + * process and its children after a fork(). They are removed when the last + * file descriptor referring to the file pointer is closed (unless explicitly + * unlocked). + * + * FL_FLOCK locks never deadlock, an existing lock is always removed before + * upgrading from shared to exclusive (or vice versa). 
When this happens + * any processes blocked by the current lock are woken up and allowed to + * run before the new lock is applied. + * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 + * + * Removed some race conditions in flock_lock_file(), marked other possible + * races. Just grep for FIXME to see them. + * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. + * + * Addressed Dmitry's concerns. Deadlock checking no longer recursive. + * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep + * once we've checked for blocking and deadlocking. + * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996. + * + * Initial implementation of mandatory locks. SunOS turned out to be + * a rotten model, so I implemented the "obvious" semantics. + * See 'Documentation/mandatory.txt' for details. + * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. + * + * Don't allow mandatory locks on mmap()'ed files. Added simple functions to + * check if a file has mandatory locks, used by mmap(), open() and creat() to + * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference + * Manual, Section 2. + * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996. + * + * Tidied up block list handling. Added '/proc/locks' interface. + * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996. + * + * Fixed deadlock condition for pathological code that mixes calls to + * flock() and fcntl(). + * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996. + * + * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use + * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to + * guarantee sensible behaviour in the case where file system modules might + * be compiled with different options than the kernel itself. + * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. + * + * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel + * (Thomas.Meckel@mni.fh-giessen.de) for spotting this. + * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. + * + * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK + * locks. Changed process synchronisation to avoid dereferencing locks that + * have already been freed. + * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996. + * + * Made the block list a circular list to minimise searching in the list. + * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996. + * + * Made mandatory locking a mount option. Default is not to allow mandatory + * locking. + * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996. + * + * Some adaptations for NFS support. + * Olaf Kirch (okir@monad.swb.de), Dec 1996, + * + * Fixed /proc/locks interface so that we can't overrun the buffer we are handed. + * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997. + * + * Use slab allocator instead of kmalloc/kfree. + * Use generic list implementation from . + * Sped up posix_locks_deadlock by only considering blocked locks. + * Matthew Wilcox , March, 2000. + * + * Leases and LOCK_MAND + * Matthew Wilcox , June, 2000. + * Stephen Rothwell , June, 2000. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) +#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) +#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) + +int leases_enable = 1; +int lease_break_time = 45; + +#define for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) + +static LIST_HEAD(file_lock_list); +static LIST_HEAD(blocked_list); +static DEFINE_SPINLOCK(file_lock_lock); + +/* + * Protects the two list heads above, plus the inode->i_flock list + */ +void lock_flocks(void) +{ + spin_lock(&file_lock_lock); +} +EXPORT_SYMBOL_GPL(lock_flocks); + +void unlock_flocks(void) +{ + spin_unlock(&file_lock_lock); +} +EXPORT_SYMBOL_GPL(unlock_flocks); + +static struct kmem_cache *filelock_cache __read_mostly; + +static void locks_init_lock_always(struct file_lock *fl) +{ + fl->fl_next = NULL; + fl->fl_fasync = NULL; + fl->fl_owner = NULL; + fl->fl_pid = 0; + fl->fl_nspid = NULL; + fl->fl_file = NULL; + fl->fl_flags = 0; + fl->fl_type = 0; + fl->fl_start = fl->fl_end = 0; +} + +/* Allocate an empty lock structure. */ +struct file_lock *locks_alloc_lock(void) +{ + struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_always(fl); + + return fl; +} +EXPORT_SYMBOL_GPL(locks_alloc_lock); + +void locks_release_private(struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_release_private) + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + } + if (fl->fl_lmops) { + if (fl->fl_lmops->fl_release_private) + fl->fl_lmops->fl_release_private(fl); + fl->fl_lmops = NULL; + } + +} +EXPORT_SYMBOL_GPL(locks_release_private); + +/* Free a lock which is not in use. */ +void locks_free_lock(struct file_lock *fl) +{ + BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_link)); + + locks_release_private(fl); + kmem_cache_free(filelock_cache, fl); +} +EXPORT_SYMBOL(locks_free_lock); + +void locks_init_lock(struct file_lock *fl) +{ + INIT_LIST_HEAD(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_block); + init_waitqueue_head(&fl->fl_wait); + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + locks_init_lock_always(fl); +} + +EXPORT_SYMBOL(locks_init_lock); + +/* + * Initialises the fields of the file lock which are invariant for + * free file_locks. + */ +static void init_once(void *foo) +{ + struct file_lock *lock = (struct file_lock *) foo; + + locks_init_lock(lock); +} + +static void locks_copy_private(struct file_lock *new, struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_copy_lock) + fl->fl_ops->fl_copy_lock(new, fl); + new->fl_ops = fl->fl_ops; + } + if (fl->fl_lmops) + new->fl_lmops = fl->fl_lmops; +} + +/* + * Initialize a new lock from an existing file_lock structure. 
+ */ +void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) +{ + new->fl_owner = fl->fl_owner; + new->fl_pid = fl->fl_pid; + new->fl_file = NULL; + new->fl_flags = fl->fl_flags; + new->fl_type = fl->fl_type; + new->fl_start = fl->fl_start; + new->fl_end = fl->fl_end; + new->fl_ops = NULL; + new->fl_lmops = NULL; +} +EXPORT_SYMBOL(__locks_copy_lock); + +void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + locks_release_private(new); + + __locks_copy_lock(new, fl); + new->fl_file = fl->fl_file; + new->fl_ops = fl->fl_ops; + new->fl_lmops = fl->fl_lmops; + + locks_copy_private(new, fl); +} + +EXPORT_SYMBOL(locks_copy_lock); + +static inline int flock_translate_cmd(int cmd) { + if (cmd & LOCK_MAND) + return cmd & (LOCK_MAND | LOCK_RW); + switch (cmd) { + case LOCK_SH: + return F_RDLCK; + case LOCK_EX: + return F_WRLCK; + case LOCK_UN: + return F_UNLCK; + } + return -EINVAL; +} + +/* Fill in a file_lock structure with an appropriate FLOCK lock. */ +static int flock_make_lock(struct file *filp, struct file_lock **lock, + unsigned int cmd) +{ + struct file_lock *fl; + int type = flock_translate_cmd(cmd); + if (type < 0) + return type; + + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; + + fl->fl_file = filp; + fl->fl_pid = current->tgid; + fl->fl_flags = FL_FLOCK; + fl->fl_type = type; + fl->fl_end = OFFSET_MAX; + + *lock = fl; + return 0; +} + +static int assign_type(struct file_lock *fl, int type) +{ + switch (type) { + case F_RDLCK: + case F_WRLCK: + case F_UNLCK: + fl->fl_type = type; + break; + default: + return -EINVAL; + } + return 0; +} + +/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX + * style lock. + */ +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock *l) +{ + off_t start, end; + + switch (l->l_whence) { + case SEEK_SET: + start = 0; + break; + case SEEK_CUR: + start = filp->f_pos; + break; + case SEEK_END: + start = i_size_read(filp->f_path.dentry->d_inode); + break; + default: + return -EINVAL; + } + + /* POSIX-1996 leaves the case l->l_len < 0 undefined; + POSIX-2001 defines it. 
*/ + start += l->l_start; + if (start < 0) + return -EINVAL; + fl->fl_end = OFFSET_MAX; + if (l->l_len > 0) { + end = start + l->l_len - 1; + fl->fl_end = end; + } else if (l->l_len < 0) { + end = start - 1; + fl->fl_end = end; + start += l->l_len; + if (start < 0) + return -EINVAL; + } + fl->fl_start = start; /* we record the absolute position */ + if (fl->fl_end < fl->fl_start) + return -EOVERFLOW; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + + return assign_type(fl, l->l_type); +} + +#if BITS_PER_LONG == 32 +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l) +{ + loff_t start; + + switch (l->l_whence) { + case SEEK_SET: + start = 0; + break; + case SEEK_CUR: + start = filp->f_pos; + break; + case SEEK_END: + start = i_size_read(filp->f_path.dentry->d_inode); + break; + default: + return -EINVAL; + } + + start += l->l_start; + if (start < 0) + return -EINVAL; + fl->fl_end = OFFSET_MAX; + if (l->l_len > 0) { + fl->fl_end = start + l->l_len - 1; + } else if (l->l_len < 0) { + fl->fl_end = start - 1; + start += l->l_len; + if (start < 0) + return -EINVAL; + } + fl->fl_start = start; /* we record the absolute position */ + if (fl->fl_end < fl->fl_start) + return -EOVERFLOW; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + + return assign_type(fl, l->l_type); +} +#endif + +/* default lease lock manager operations */ +static void lease_break_callback(struct file_lock *fl) +{ + kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); +} + +static void lease_release_private_callback(struct file_lock *fl) +{ + if (!fl->fl_file) + return; + + f_delown(fl->fl_file); + fl->fl_file->f_owner.signum = 0; +} + +static const struct lock_manager_operations lease_manager_ops = { + .fl_break = lease_break_callback, + .fl_release_private = lease_release_private_callback, + .fl_change = lease_modify, +}; + +/* + * Initialize a lease, use the default lock manager operations + */ +static int lease_init(struct file *filp, int type, struct file_lock *fl) + { + if (assign_type(fl, type) != 0) + return -EINVAL; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + + fl->fl_file = filp; + fl->fl_flags = FL_LEASE; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_ops = NULL; + fl->fl_lmops = &lease_manager_ops; + return 0; +} + +/* Allocate a file_lock initialised to this type of lease */ +static struct file_lock *lease_alloc(struct file *filp, int type) +{ + struct file_lock *fl = locks_alloc_lock(); + int error = -ENOMEM; + + if (fl == NULL) + return ERR_PTR(error); + + error = lease_init(filp, type, fl); + if (error) { + locks_free_lock(fl); + return ERR_PTR(error); + } + return fl; +} + +/* Check if two locks overlap each other. + */ +static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) +{ + return ((fl1->fl_end >= fl2->fl_start) && + (fl2->fl_end >= fl1->fl_start)); +} + +/* + * Check whether two locks have the same owner. + */ +static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +{ + if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) + return fl2->fl_lmops == fl1->fl_lmops && + fl1->fl_lmops->fl_compare_owner(fl1, fl2); + return fl1->fl_owner == fl2->fl_owner; +} + +/* Remove waiter from blocker's block list. + * When blocker ends up pointing to itself then the list is empty. 
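flock_to_posix_lock()/flock64_to_posix_lock() above are the kernel side of the userspace struct flock convention: l_whence and l_start pick the origin, l_len > 0 extends forward, l_len == 0 means "to end of file", and a negative l_len (POSIX.1-2001) covers the bytes just before l_start. A small userspace sketch of the three cases; the file name is hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("example.dat", O_RDWR | O_CREAT, 0644);   /* hypothetical */
        struct flock fl = {
                .l_type   = F_WRLCK,
                .l_whence = SEEK_SET,
                .l_start  = 100,
                .l_len    = 50,         /* bytes 100..149 -> fl_start=100, fl_end=149 */
        };

        if (fd < 0)
                return 1;
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");

        fl.l_start = 200;
        fl.l_len   = 0;                 /* "to end of file": fl_end becomes OFFSET_MAX */
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");

        fl.l_start = 500;
        fl.l_len   = -100;              /* bytes 400..499, i.e. the 100 bytes before l_start */
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");

        close(fd);
        return 0;
}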
+ */ +static void __locks_delete_block(struct file_lock *waiter) +{ + list_del_init(&waiter->fl_block); + list_del_init(&waiter->fl_link); + waiter->fl_next = NULL; +} + +/* + */ +static void locks_delete_block(struct file_lock *waiter) +{ + lock_flocks(); + __locks_delete_block(waiter); + unlock_flocks(); +} + +/* Insert waiter into blocker's block list. + * We use a circular list so that processes can be easily woken up in + * the order they blocked. The documentation doesn't require this but + * it seems like the reasonable thing to do. + */ +static void locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + BUG_ON(!list_empty(&waiter->fl_block)); + list_add_tail(&waiter->fl_block, &blocker->fl_block); + waiter->fl_next = blocker; + if (IS_POSIX(blocker)) + list_add(&waiter->fl_link, &blocked_list); +} + +/* Wake up processes blocked waiting for blocker. + * If told to wait then schedule the processes until the block list + * is empty, otherwise empty the block list ourselves. + */ +static void locks_wake_up_blocks(struct file_lock *blocker) +{ + while (!list_empty(&blocker->fl_block)) { + struct file_lock *waiter; + + waiter = list_first_entry(&blocker->fl_block, + struct file_lock, fl_block); + __locks_delete_block(waiter); + if (waiter->fl_lmops && waiter->fl_lmops->fl_notify) + waiter->fl_lmops->fl_notify(waiter); + else + wake_up(&waiter->fl_wait); + } +} + +/* Insert file lock fl into an inode's lock list at the position indicated + * by pos. At the same time add the lock to the global file lock list. + */ +static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) +{ + list_add(&fl->fl_link, &file_lock_list); + + fl->fl_nspid = get_pid(task_tgid(current)); + + /* insert into file's list */ + fl->fl_next = *pos; + *pos = fl; +} + +/* + * Delete a lock and then free it. + * Wake up processes that are blocked waiting for this lock, + * notify the FS that the lock has been cleared and + * finally free the lock. + */ +static void locks_delete_lock(struct file_lock **thisfl_p) +{ + struct file_lock *fl = *thisfl_p; + + *thisfl_p = fl->fl_next; + fl->fl_next = NULL; + list_del_init(&fl->fl_link); + + fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + if (fl->fl_fasync != NULL) { + printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); + fl->fl_fasync = NULL; + } + + if (fl->fl_nspid) { + put_pid(fl->fl_nspid); + fl->fl_nspid = NULL; + } + + locks_wake_up_blocks(fl); + locks_free_lock(fl); +} + +/* Determine if lock sys_fl blocks lock caller_fl. Common functionality + * checks for shared/exclusive status of overlapping locks. + */ +static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + if (sys_fl->fl_type == F_WRLCK) + return 1; + if (caller_fl->fl_type == F_WRLCK) + return 1; + return 0; +} + +/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific + * checking before calling the locks_conflict(). + */ +static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + /* POSIX locks owned by the same process do not conflict with + * each other. + */ + if (!IS_POSIX(sys_fl) || posix_same_owner(caller_fl, sys_fl)) + return (0); + + /* Check whether they overlap */ + if (!locks_overlap(caller_fl, sys_fl)) + return 0; + + return (locks_conflict(caller_fl, sys_fl)); +} + +/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific + * checking before calling the locks_conflict(). 
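posix_locks_conflict()/locks_conflict() above reduce to three rules: locks with the same owner never conflict, non-overlapping ranges never conflict, and otherwise there is a conflict unless both locks are read locks. A simplified userspace model of that predicate; the struct and function names are hypothetical, not the kernel's.

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>

struct plock { int owner; short type; int64_t start, end; };

static bool overlap(const struct plock *a, const struct plock *b)
{
        return a->end >= b->start && b->end >= a->start;
}

static bool conflicts(const struct plock *a, const struct plock *b)
{
        if (a->owner == b->owner)       /* same owner never conflicts */
                return false;
        if (!overlap(a, b))             /* disjoint ranges never conflict */
                return false;
        return a->type == F_WRLCK || b->type == F_WRLCK;
}

int main(void)
{
        struct plock r1 = { 1, F_RDLCK,  0,  99 };
        struct plock r2 = { 2, F_RDLCK, 50, 149 };
        struct plock w1 = { 3, F_WRLCK, 90,  99 };

        assert(!conflicts(&r1, &r2));   /* overlapping read locks coexist */
        assert(conflicts(&r1, &w1));    /* write vs read over 90..99 conflicts */
        assert(conflicts(&r2, &w1));
        return 0;
}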
+ */ +static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + /* FLOCK locks referring to the same filp do not conflict with + * each other. + */ + if (!IS_FLOCK(sys_fl) || (caller_fl->fl_file == sys_fl->fl_file)) + return (0); + if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) + return 0; + + return (locks_conflict(caller_fl, sys_fl)); +} + +void +posix_test_lock(struct file *filp, struct file_lock *fl) +{ + struct file_lock *cfl; + + lock_flocks(); + for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { + if (!IS_POSIX(cfl)) + continue; + if (posix_locks_conflict(fl, cfl)) + break; + } + if (cfl) { + __locks_copy_lock(fl, cfl); + if (cfl->fl_nspid) + fl->fl_pid = pid_vnr(cfl->fl_nspid); + } else + fl->fl_type = F_UNLCK; + unlock_flocks(); + return; +} +EXPORT_SYMBOL(posix_test_lock); + +/* + * Deadlock detection: + * + * We attempt to detect deadlocks that are due purely to posix file + * locks. + * + * We assume that a task can be waiting for at most one lock at a time. + * So for any acquired lock, the process holding that lock may be + * waiting on at most one other lock. That lock in turns may be held by + * someone waiting for at most one other lock. Given a requested lock + * caller_fl which is about to wait for a conflicting lock block_fl, we + * follow this chain of waiters to ensure we are not about to create a + * cycle. + * + * Since we do this before we ever put a process to sleep on a lock, we + * are ensured that there is never a cycle; that is what guarantees that + * the while() loop in posix_locks_deadlock() eventually completes. + * + * Note: the above assumption may not be true when handling lock + * requests from a broken NFS client. It may also fail in the presence + * of tasks (such as posix threads) sharing the same open file table. + * + * To handle those cases, we just bail out after a few iterations. + */ + +#define MAX_DEADLK_ITERATIONS 10 + +/* Find a lock that the owner of the given block_fl is blocking on. */ +static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) +{ + struct file_lock *fl; + + list_for_each_entry(fl, &blocked_list, fl_link) { + if (posix_same_owner(fl, block_fl)) + return fl->fl_next; + } + return NULL; +} + +static int posix_locks_deadlock(struct file_lock *caller_fl, + struct file_lock *block_fl) +{ + int i = 0; + + while ((block_fl = what_owner_is_waiting_for(block_fl))) { + if (i++ > MAX_DEADLK_ITERATIONS) + return 0; + if (posix_same_owner(caller_fl, block_fl)) + return 1; + } + return 0; +} + +/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks + * after any leases, but before any posix locks. + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. 
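The deadlock walk described above is what turns a classic ABBA locking pattern into an EDEADLK error for the second waiter. A rough two-process demonstration; the file name is hypothetical and the sleep()-based synchronisation is deliberately crude.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static int lock_byte(int fd, off_t off, int cmd)
{
        struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
                            .l_start = off, .l_len = 1 };
        return fcntl(fd, cmd, &fl);
}

int main(void)
{
        int fd = open("deadlock.dat", O_RDWR | O_CREAT, 0644);  /* hypothetical */

        if (fd < 0)
                return 1;
        if (fork() == 0) {              /* child: byte 1 first, then byte 0 */
                lock_byte(fd, 1, F_SETLK);
                sleep(1);               /* crude: let both sides grab their first byte */
                if (lock_byte(fd, 0, F_SETLKW) == -1)
                        printf("child: %s\n", strerror(errno));
                _exit(0);               /* exiting drops the child's locks */
        }
        lock_byte(fd, 0, F_SETLK);      /* parent: byte 0 first, then byte 1 */
        sleep(1);
        if (lock_byte(fd, 1, F_SETLKW) == -1)
                printf("parent: %s\n", strerror(errno));  /* one side sees EDEADLK */
        close(fd);                      /* drop our locks so the other side can finish */
        wait(NULL);
        return 0;
}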
+ */ +static int flock_lock_file(struct file *filp, struct file_lock *request) +{ + struct file_lock *new_fl = NULL; + struct file_lock **before; + struct inode * inode = filp->f_path.dentry->d_inode; + int error = 0; + int found = 0; + + if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { + new_fl = locks_alloc_lock(); + if (!new_fl) + return -ENOMEM; + } + + lock_flocks(); + if (request->fl_flags & FL_ACCESS) + goto find_conflict; + + for_each_lock(inode, before) { + struct file_lock *fl = *before; + if (IS_POSIX(fl)) + break; + if (IS_LEASE(fl)) + continue; + if (filp != fl->fl_file) + continue; + if (request->fl_type == fl->fl_type) + goto out; + found = 1; + locks_delete_lock(before); + break; + } + + if (request->fl_type == F_UNLCK) { + if ((request->fl_flags & FL_EXISTS) && !found) + error = -ENOENT; + goto out; + } + + /* + * If a higher-priority process was blocked on the old file lock, + * give it the opportunity to lock the file. + */ + if (found) { + unlock_flocks(); + cond_resched(); + lock_flocks(); + } + +find_conflict: + for_each_lock(inode, before) { + struct file_lock *fl = *before; + if (IS_POSIX(fl)) + break; + if (IS_LEASE(fl)) + continue; + if (!flock_locks_conflict(request, fl)) + continue; + error = -EAGAIN; + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); + goto out; + } + if (request->fl_flags & FL_ACCESS) + goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; + error = 0; + +out: + unlock_flocks(); + if (new_fl) + locks_free_lock(new_fl); + return error; +} + +static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) +{ + struct file_lock *fl; + struct file_lock *new_fl = NULL; + struct file_lock *new_fl2 = NULL; + struct file_lock *left = NULL; + struct file_lock *right = NULL; + struct file_lock **before; + int error, added = 0; + + /* + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. + * + * In some cases we can be sure, that no new locks will be needed + */ + if (!(request->fl_flags & FL_ACCESS) && + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { + new_fl = locks_alloc_lock(); + new_fl2 = locks_alloc_lock(); + } + + lock_flocks(); + if (request->fl_type != F_UNLCK) { + for_each_lock(inode, before) { + fl = *before; + if (!IS_POSIX(fl)) + continue; + if (!posix_locks_conflict(request, fl)) + continue; + if (conflock) + __locks_copy_lock(conflock, fl); + error = -EAGAIN; + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = -EDEADLK; + if (posix_locks_deadlock(request, fl)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); + goto out; + } + } + + /* If we're just looking for a conflict, we're done. */ + error = 0; + if (request->fl_flags & FL_ACCESS) + goto out; + + /* + * Find the first old lock with the same owner as the new lock. + */ + + before = &inode->i_flock; + + /* First skip locks owned by other processes. */ + while ((fl = *before) && (!IS_POSIX(fl) || + !posix_same_owner(request, fl))) { + before = &fl->fl_next; + } + + /* Process locks with this owner. */ + while ((fl = *before) && posix_same_owner(request, fl)) { + /* Detect adjacent or overlapping regions (if same lock type) + */ + if (request->fl_type == fl->fl_type) { + /* In all comparisons of start vs end, use + * "start - 1" rather than "end + 1". 
If end + * is OFFSET_MAX, end + 1 will become negative. + */ + if (fl->fl_end < request->fl_start - 1) + goto next_lock; + /* If the next lock in the list has entirely bigger + * addresses than the new one, insert the lock here. + */ + if (fl->fl_start - 1 > request->fl_end) + break; + + /* If we come here, the new and old lock are of the + * same type and adjacent or overlapping. Make one + * lock yielding from the lower start address of both + * locks to the higher end address. + */ + if (fl->fl_start > request->fl_start) + fl->fl_start = request->fl_start; + else + request->fl_start = fl->fl_start; + if (fl->fl_end < request->fl_end) + fl->fl_end = request->fl_end; + else + request->fl_end = fl->fl_end; + if (added) { + locks_delete_lock(before); + continue; + } + request = fl; + added = 1; + } + else { + /* Processing for different lock types is a bit + * more complex. + */ + if (fl->fl_end < request->fl_start) + goto next_lock; + if (fl->fl_start > request->fl_end) + break; + if (request->fl_type == F_UNLCK) + added = 1; + if (fl->fl_start < request->fl_start) + left = fl; + /* If the next lock in the list has a higher end + * address than the new one, insert the new one here. + */ + if (fl->fl_end > request->fl_end) { + right = fl; + break; + } + if (fl->fl_start >= request->fl_start) { + /* The new lock completely replaces an old + * one (This may happen several times). + */ + if (added) { + locks_delete_lock(before); + continue; + } + /* Replace the old lock with the new one. + * Wake up anybody waiting for the old one, + * as the change in lock type might satisfy + * their needs. + */ + locks_wake_up_blocks(fl); + fl->fl_start = request->fl_start; + fl->fl_end = request->fl_end; + fl->fl_type = request->fl_type; + locks_release_private(fl); + locks_copy_private(fl, request); + request = fl; + added = 1; + } + } + /* Go on to next lock. + */ + next_lock: + before = &fl->fl_next; + } + + /* + * The above code only modifies existing locks in case of + * merging or replacing. If new lock(s) need to be inserted + * all modifications are done bellow this, so it's safe yet to + * bail out. + */ + error = -ENOLCK; /* "no luck" */ + if (right && left == right && !new_fl2) + goto out; + + error = 0; + if (!added) { + if (request->fl_type == F_UNLCK) { + if (request->fl_flags & FL_EXISTS) + error = -ENOENT; + goto out; + } + + if (!new_fl) { + error = -ENOLCK; + goto out; + } + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; + } + if (right) { + if (left == right) { + /* The new lock breaks the old one in two pieces, + * so we have to use the second new lock. + */ + left = new_fl2; + new_fl2 = NULL; + locks_copy_lock(left, right); + locks_insert_lock(before, left); + } + right->fl_start = request->fl_end + 1; + locks_wake_up_blocks(right); + } + if (left) { + left->fl_end = request->fl_start - 1; + locks_wake_up_blocks(left); + } + out: + unlock_flocks(); + /* + * Free any unused locks. + */ + if (new_fl) + locks_free_lock(new_fl); + if (new_fl2) + locks_free_lock(new_fl2); + return error; +} + +/** + * posix_lock_file - Apply a POSIX-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * @conflock: Place to return a copy of the conflicting lock, if found. + * + * Add a POSIX style lock to a file. + * We merge adjacent & overlapping locks whenever possible. 
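The left/right handling above is why __posix_lock_file() pre-allocates a second file_lock (new_fl2): unlocking the middle of an existing lock has to split it in two. A userspace sketch that triggers the split; the file name is hypothetical.

#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

static int set_lock(int fd, short type, off_t start, off_t len)
{
        struct flock fl = { .l_type = type, .l_whence = SEEK_SET,
                            .l_start = start, .l_len = len };
        return fcntl(fd, F_SETLK, &fl);
}

int main(void)
{
        int fd = open("split.dat", O_RDWR | O_CREAT, 0644);     /* hypothetical */

        if (fd < 0)
                return 1;
        set_lock(fd, F_WRLCK, 0, 100);  /* one lock covering bytes 0..99   */
        set_lock(fd, F_UNLCK, 40, 20);  /* unlock the middle, bytes 40..59 */

        /* The kernel now keeps two separate POSIX locks for this process,
         * 0..39 and 60..99, the "left"/"right" split that needs the second
         * pre-allocated file_lock above.  They appear as two lines for this
         * inode in /proc/locks. */
        pause();                        /* keep the process (and its locks) around */
        return 0;
}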
+ * POSIX locks are sorted by owner task, then by starting address + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. + */ +int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock); +} +EXPORT_SYMBOL(posix_lock_file); + +/** + * posix_lock_file_wait - Apply a POSIX-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * + * Add a POSIX style lock to a file. + * We merge adjacent & overlapping locks whenever possible. + * POSIX locks are sorted by owner task, then by starting address + */ +int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + int error; + might_sleep (); + for (;;) { + error = posix_lock_file(filp, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + return error; +} +EXPORT_SYMBOL(posix_lock_file_wait); + +/** + * locks_mandatory_locked - Check for an active lock + * @inode: the file to check + * + * Searches the inode's list of locks to find any POSIX locks which conflict. + * This function is called from locks_verify_locked() only. + */ +int locks_mandatory_locked(struct inode *inode) +{ + fl_owner_t owner = current->files; + struct file_lock *fl; + + /* + * Search the lock list for this inode for any POSIX locks. + */ + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!IS_POSIX(fl)) + continue; + if (fl->fl_owner != owner) + break; + } + unlock_flocks(); + return fl ? -EAGAIN : 0; +} + +/** + * locks_mandatory_area - Check for a conflicting lock + * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ + * for shared + * @inode: the file to check + * @filp: how the file was opened (if it was) + * @offset: start of area to check + * @count: length of area to check + * + * Searches the inode's list of locks to find any POSIX locks which conflict. + * This function is called from rw_verify_area() and + * locks_verify_truncate(). + */ +int locks_mandatory_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + struct file_lock fl; + int error; + + locks_init_lock(&fl); + fl.fl_owner = current->files; + fl.fl_pid = current->tgid; + fl.fl_file = filp; + fl.fl_flags = FL_POSIX | FL_ACCESS; + if (filp && !(filp->f_flags & O_NONBLOCK)) + fl.fl_flags |= FL_SLEEP; + fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; + fl.fl_start = offset; + fl.fl_end = offset + count - 1; + + for (;;) { + error = __posix_lock_file(inode, &fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); + if (!error) { + /* + * If we've been sleeping someone might have + * changed the permissions behind our back. 
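locks_mandatory_area() above only comes into play for files marked for mandatory locking: the filesystem must be mounted with the "mand" option and the file must have the set-group-ID bit set with group execute cleared. A sketch of flipping those mode bits from userspace; the path is hypothetical and the mount option is assumed to be set elsewhere.

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        int fd = open("/mnt/mand/data", O_RDWR);        /* hypothetical path */

        if (fd < 0 || fstat(fd, &st) < 0)
                return 1;

        /* set-group-ID on, group execute off: the mandatory-locking marker */
        if (fchmod(fd, (st.st_mode & ~(mode_t)S_IXGRP) | S_ISGID) < 0)
                return 1;

        /* From now on, read()/write() calls by other processes that cross a
         * conflicting POSIX lock are checked via locks_mandatory_area() and
         * fail or block instead of ignoring the lock. */
        close(fd);
        return 0;
}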
+ */ + if (__mandatory_lock(inode)) + continue; + } + + locks_delete_block(&fl); + break; + } + + return error; +} + +EXPORT_SYMBOL(locks_mandatory_area); + +/* We already had a lease on this file; just change its type */ +int lease_modify(struct file_lock **before, int arg) +{ + struct file_lock *fl = *before; + int error = assign_type(fl, arg); + + if (error) + return error; + locks_wake_up_blocks(fl); + if (arg == F_UNLCK) + locks_delete_lock(before); + return 0; +} + +EXPORT_SYMBOL(lease_modify); + +static void time_out_leases(struct inode *inode) +{ + struct file_lock **before; + struct file_lock *fl; + + before = &inode->i_flock; + while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { + if ((fl->fl_break_time == 0) + || time_before(jiffies, fl->fl_break_time)) { + before = &fl->fl_next; + continue; + } + lease_modify(before, fl->fl_type & ~F_INPROGRESS); + if (fl == *before) /* lease_modify may have freed fl */ + before = &fl->fl_next; + } +} + +/** + * __break_lease - revoke all outstanding leases on file + * @inode: the inode of the file to return + * @mode: the open mode (read or write) + * + * break_lease (inlined for speed) has checked there already is at least + * some kind of lock (maybe a lease) on this file. Leases are broken on + * a call to open() or truncate(). This function can sleep unless you + * specified %O_NONBLOCK to your open(). + */ +int __break_lease(struct inode *inode, unsigned int mode) +{ + int error = 0, future; + struct file_lock *new_fl, *flock; + struct file_lock *fl; + unsigned long break_time; + int i_have_this_lease = 0; + int want_write = (mode & O_ACCMODE) != O_RDONLY; + + new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + + lock_flocks(); + + time_out_leases(inode); + + flock = inode->i_flock; + if ((flock == NULL) || !IS_LEASE(flock)) + goto out; + + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) + if (fl->fl_owner == current->files) + i_have_this_lease = 1; + + if (want_write) { + /* If we want write access, we have to revoke any lease. */ + future = F_UNLCK | F_INPROGRESS; + } else if (flock->fl_type & F_INPROGRESS) { + /* If the lease is already being broken, we just leave it */ + future = flock->fl_type; + } else if (flock->fl_type & F_WRLCK) { + /* Downgrade the exclusive lease to a read-only lease. */ + future = F_RDLCK | F_INPROGRESS; + } else { + /* the existing lease was read-only, so we can read too. 
*/ + goto out; + } + + if (IS_ERR(new_fl) && !i_have_this_lease + && ((mode & O_NONBLOCK) == 0)) { + error = PTR_ERR(new_fl); + goto out; + } + + break_time = 0; + if (lease_break_time > 0) { + break_time = jiffies + lease_break_time * HZ; + if (break_time == 0) + break_time++; /* so that 0 means no break time */ + } + + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { + if (fl->fl_type != future) { + fl->fl_type = future; + fl->fl_break_time = break_time; + /* lease must have lmops break callback */ + fl->fl_lmops->fl_break(fl); + } + } + + if (i_have_this_lease || (mode & O_NONBLOCK)) { + error = -EWOULDBLOCK; + goto out; + } + +restart: + break_time = flock->fl_break_time; + if (break_time != 0) { + break_time -= jiffies; + if (break_time == 0) + break_time++; + } + locks_insert_block(flock, new_fl); + unlock_flocks(); + error = wait_event_interruptible_timeout(new_fl->fl_wait, + !new_fl->fl_next, break_time); + lock_flocks(); + __locks_delete_block(new_fl); + if (error >= 0) { + if (error == 0) + time_out_leases(inode); + /* Wait for the next lease that has not been broken yet */ + for (flock = inode->i_flock; flock && IS_LEASE(flock); + flock = flock->fl_next) { + if (flock->fl_type & F_INPROGRESS) + goto restart; + } + error = 0; + } + +out: + unlock_flocks(); + if (!IS_ERR(new_fl)) + locks_free_lock(new_fl); + return error; +} + +EXPORT_SYMBOL(__break_lease); + +/** + * lease_get_mtime - get the last modified time of an inode + * @inode: the inode + * @time: pointer to a timespec which will contain the last modified time + * + * This is to force NFS clients to flush their caches for files with + * exclusive leases. The justification is that if someone has an + * exclusive lease, then they could be modifying it. + */ +void lease_get_mtime(struct inode *inode, struct timespec *time) +{ + struct file_lock *flock = inode->i_flock; + if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK)) + *time = current_fs_time(inode->i_sb); + else + *time = inode->i_mtime; +} + +EXPORT_SYMBOL(lease_get_mtime); + +/** + * fcntl_getlease - Enquire what lease is currently active + * @filp: the file + * + * The value returned by this function will be one of + * (if no lease break is pending): + * + * %F_RDLCK to indicate a shared lease is held. + * + * %F_WRLCK to indicate an exclusive lease is held. + * + * %F_UNLCK to indicate no lease is held. + * + * (if a lease break is pending): + * + * %F_RDLCK to indicate an exclusive lease needs to be + * changed to a shared lease (or removed). + * + * %F_UNLCK to indicate the lease needs to be removed. + * + * XXX: sfr & willy disagree over whether F_INPROGRESS + * should be returned to userspace. + */ +int fcntl_getlease(struct file *filp) +{ + struct file_lock *fl; + int type = F_UNLCK; + + lock_flocks(); + time_out_leases(filp->f_path.dentry->d_inode); + for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); + fl = fl->fl_next) { + if (fl->fl_file == filp) { + type = fl->fl_type & ~F_INPROGRESS; + break; + } + } + unlock_flocks(); + return type; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->fl_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. 
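fcntl_getlease() above backs the userspace F_GETLEASE query; because it masks off F_INPROGRESS, a pending break is reported as the type the lease is being downgraded to, as the comment block before it describes. A minimal query; the file name is hypothetical, and F_GETLEASE needs _GNU_SOURCE with glibc.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("leased.dat", O_RDONLY);  /* hypothetical */

        if (fd < 0)
                return 1;
        switch (fcntl(fd, F_GETLEASE)) {
        case F_RDLCK: puts("shared lease held (or downgrade pending)"); break;
        case F_WRLCK: puts("exclusive lease held"); break;
        case F_UNLCK: puts("no lease (or removal pending)"); break;
        default:      perror("F_GETLEASE");
        }
        close(fd);
        return 0;
}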
+ */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct file_lock *fl, **before, **my_before = NULL, *lease; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error, rdlease_count = 0, wrlease_count = 0; + + lease = *flp; + + error = -EACCES; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + goto out; + error = -EINVAL; + if (!S_ISREG(inode->i_mode)) + goto out; + error = security_file_lock(filp, arg); + if (error) + goto out; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->fl_break); + + if (arg != F_UNLCK) { + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + goto out; + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) + goto out; + } + + /* + * At this point, we know that if there is an exclusive + * lease on this file, then we hold it on this filp + * (otherwise our open of this file would have blocked). + * And if we are trying to acquire an exclusive lease, + * then the file is not open by anyone (including us) + * except for this filp. + */ + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file == filp) + my_before = before; + else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) + /* + * Someone is in the process of opening this + * file for writing so we may not take an + * exclusive lease on it. + */ + wrlease_count++; + else + rdlease_count++; + } + + error = -EAGAIN; + if ((arg == F_RDLCK && (wrlease_count > 0)) || + (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) + goto out; + + if (my_before != NULL) { + error = lease->fl_lmops->fl_change(my_before, arg); + if (!error) + *flp = *my_before; + goto out; + } + + if (arg == F_UNLCK) + goto out; + + error = -EINVAL; + if (!leases_enable) + goto out; + + locks_insert_lock(before, lease); + return 0; + +out: + return error; +} +EXPORT_SYMBOL(generic_setlease); + +static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) +{ + if (filp->f_op && filp->f_op->setlease) + return filp->f_op->setlease(filp, arg, lease); + else + return generic_setlease(filp, arg, lease); +} + +/** + * vfs_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @lease: file_lock to use + * + * Call this to establish a lease on the file. + * The (*lease)->fl_lmops->fl_break operation must be set; if not, + * break_lease will oops! + * + * This will call the filesystem's setlease file method, if + * defined. Note that there is no getlease method; instead, the + * filesystem setlease method should call back to setlease() to + * add a lease to the inode's lease list, where fcntl_getlease() can + * find it. Since fcntl_getlease() only reports whether the current + * task holds a lease, a cluster filesystem need only do this for + * leases held by processes on this node. + * + * There is also no break_lease method; filesystems that + * handle their own leases should break leases themselves from the + * filesystem's open, create, and (on truncate) setattr methods. + * + * Warning: the only current setlease methods exist only to disable + * leases in certain cases. More vfs changes may be required to + * allow a full filesystem lease implementation. 
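The setlease path above is driven from userspace with fcntl(F_SETLEASE); the holder is notified with SIGIO (or whatever signal F_SETSIG selects) when someone opens or truncates the file, and then has lease_break_time (45 seconds by default here) to give the lease up. A sketch of a read-lease holder; the file name is hypothetical.

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void lease_broken(int sig)
{
        (void)sig;      /* flush or re-validate cached state here */
}

int main(void)
{
        int fd = open("cache.dat", O_RDONLY);   /* hypothetical */

        if (fd < 0)
                return 1;
        signal(SIGIO, lease_broken);            /* default lease-break signal */

        /* Take a read lease (needs to own the file or CAP_LEASE). */
        if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1) {
                perror("F_SETLEASE");
                return 1;
        }

        pause();                                /* wait for a lease break */
        fcntl(fd, F_SETLEASE, F_UNLCK);         /* release before the break times out */
        close(fd);
        return 0;
}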
+ */ + +int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) +{ + int error; + + lock_flocks(); + error = __vfs_setlease(filp, arg, lease); + unlock_flocks(); + + return error; +} +EXPORT_SYMBOL_GPL(vfs_setlease); + +static int do_fcntl_delete_lease(struct file *filp) +{ + struct file_lock fl, *flp = &fl; + + lease_init(filp, F_UNLCK, flp); + + return vfs_setlease(filp, F_UNLCK, &flp); +} + +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +{ + struct file_lock *fl, *ret; + struct fasync_struct *new; + int error; + + fl = lease_alloc(filp, arg); + if (IS_ERR(fl)) + return PTR_ERR(fl); + + new = fasync_alloc(); + if (!new) { + locks_free_lock(fl); + return -ENOMEM; + } + ret = fl; + lock_flocks(); + error = __vfs_setlease(filp, arg, &ret); + if (error) { + unlock_flocks(); + locks_free_lock(fl); + goto out_free_fasync; + } + if (ret != fl) + locks_free_lock(fl); + + /* + * fasync_insert_entry() returns the old entry if any. + * If there was no old entry, then it used 'new' and + * inserted it into the fasync list. Clear new so that + * we don't release it here. + */ + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) + new = NULL; + + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + unlock_flocks(); + +out_free_fasync: + if (new) + fasync_free(new); + return error; +} + +/** + * fcntl_setlease - sets a lease on an open file + * @fd: open file descriptor + * @filp: file pointer + * @arg: type of lease to obtain + * + * Call this fcntl to establish a lease on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. + */ +int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + if (arg == F_UNLCK) + return do_fcntl_delete_lease(filp); + return do_fcntl_add_lease(fd, filp, arg); +} + +/** + * flock_lock_file_wait - Apply a FLOCK-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * + * Add a FLOCK style lock to a file. + */ +int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + int error; + might_sleep(); + for (;;) { + error = flock_lock_file(filp, fl); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + return error; +} + +EXPORT_SYMBOL(flock_lock_file_wait); + +/** + * sys_flock: - flock() system call. + * @fd: the file descriptor to lock. + * @cmd: the type of lock to apply. + * + * Apply a %FL_FLOCK style lock to an open file descriptor. + * The @cmd can be one of + * + * %LOCK_SH -- a shared lock. + * + * %LOCK_EX -- an exclusive lock. + * + * %LOCK_UN -- remove an existing lock. + * + * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * + * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other + * processes read and write access respectively. 
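A minimal userspace counterpart of the flock() call documented above, using LOCK_NB to avoid sleeping; the lock-file name is hypothetical.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
        int fd = open("spool.lock", O_RDWR | O_CREAT, 0644);    /* hypothetical */

        if (fd < 0)
                return 1;

        /* Try for an exclusive FL_FLOCK lock without sleeping. */
        if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
                if (errno == EWOULDBLOCK)
                        fprintf(stderr, "someone else holds the lock\n");
                return 1;
        }

        /* ... critical section ... */

        flock(fd, LOCK_UN);     /* or simply close(fd): the last close drops it */
        close(fd);
        return 0;
}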
+ */ +SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) +{ + struct file *filp; + struct file_lock *lock; + int can_sleep, unlock; + int error; + + error = -EBADF; + filp = fget(fd); + if (!filp) + goto out; + + can_sleep = !(cmd & LOCK_NB); + cmd &= ~LOCK_NB; + unlock = (cmd == LOCK_UN); + + if (!unlock && !(cmd & LOCK_MAND) && + !(filp->f_mode & (FMODE_READ|FMODE_WRITE))) + goto out_putf; + + error = flock_make_lock(filp, &lock, cmd); + if (error) + goto out_putf; + if (can_sleep) + lock->fl_flags |= FL_SLEEP; + + error = security_file_lock(filp, lock->fl_type); + if (error) + goto out_free; + + if (filp->f_op && filp->f_op->flock) + error = filp->f_op->flock(filp, + (can_sleep) ? F_SETLKW : F_SETLK, + lock); + else + error = flock_lock_file_wait(filp, lock); + + out_free: + locks_free_lock(lock); + + out_putf: + fput(filp); + out: + return error; +} + +/** + * vfs_test_lock - test file byte range lock + * @filp: The file to test lock for + * @fl: The lock to test; also used to hold result + * + * Returns -ERRNO on failure. Indicates presence of conflicting lock by + * setting conf->fl_type to something other than F_UNLCK. + */ +int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + if (filp->f_op && filp->f_op->lock) + return filp->f_op->lock(filp, F_GETLK, fl); + posix_test_lock(filp, fl); + return 0; +} +EXPORT_SYMBOL_GPL(vfs_test_lock); + +static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) +{ + flock->l_pid = fl->fl_pid; +#if BITS_PER_LONG == 32 + /* + * Make sure we can represent the posix lock via + * legacy 32bit flock. + */ + if (fl->fl_start > OFFT_OFFSET_MAX) + return -EOVERFLOW; + if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX) + return -EOVERFLOW; +#endif + flock->l_start = fl->fl_start; + flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock->l_whence = 0; + flock->l_type = fl->fl_type; + return 0; +} + +#if BITS_PER_LONG == 32 +static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) +{ + flock->l_pid = fl->fl_pid; + flock->l_start = fl->fl_start; + flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock->l_whence = 0; + flock->l_type = fl->fl_type; +} +#endif + +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). + */ +int fcntl_getlk(struct file *filp, struct flock __user *l) +{ + struct file_lock file_lock; + struct flock flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = flock_to_posix_lock(filp, &file_lock, &flock); + if (error) + goto out; + + error = vfs_test_lock(filp, &file_lock); + if (error) + goto out; + + flock.l_type = file_lock.fl_type; + if (file_lock.fl_type != F_UNLCK) { + error = posix_lock_to_flock(&flock, &file_lock); + if (error) + goto out; + } + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; +out: + return error; +} + +/** + * vfs_lock_file - file byte range lock + * @filp: The file to apply the lock to + * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.) + * @fl: The lock to be applied + * @conf: Place to return a copy of the conflicting lock, if found. + * + * A caller that doesn't care about the conflicting lock may pass NULL + * as the final argument. 
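fcntl_getlk()/posix_lock_to_flock() above implement the F_GETLK probe: the struct flock passed in describes the lock the caller would like to take, and it comes back either with l_type set to F_UNLCK or overwritten by the first conflicting lock. A small userspace sketch; the file name is hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("shared.db", O_RDWR);     /* hypothetical */
        struct flock fl = {
                .l_type   = F_WRLCK,            /* "could I write-lock ...   */
                .l_whence = SEEK_SET,
                .l_start  = 0,
                .l_len    = 0,                  /*  ... the whole file?"     */
        };

        if (fd < 0 || fcntl(fd, F_GETLK, &fl) == -1)
                return 1;

        if (fl.l_type == F_UNLCK)
                printf("no conflicting lock\n");
        else                                    /* first conflicting lock found */
                printf("pid %ld holds a %s lock at %ld (len %ld)\n",
                       (long)fl.l_pid,
                       fl.l_type == F_WRLCK ? "write" : "read",
                       (long)fl.l_start, (long)fl.l_len);
        close(fd);
        return 0;
}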
+ * + * If the filesystem defines a private ->lock() method, then @conf will + * be left unchanged; so a caller that cares should initialize it to + * some acceptable default. + * + * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX + * locks, the ->lock() interface may return asynchronously, before the lock has + * been granted or denied by the underlying filesystem, if (and only if) + * fl_grant is set. Callers expecting ->lock() to return asynchronously + * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) + * the request is for a blocking lock. When ->lock() does return asynchronously, + * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock + * request completes. + * If the request is for non-blocking lock the file system should return + * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine + * with the result. If the request timed out the callback routine will return a + * nonzero return code and the file system should release the lock. The file + * system is also responsible to keep a corresponding posix lock when it + * grants a lock so the VFS can find out which locks are locally held and do + * the correct lock cleanup when required. + * The underlying filesystem must not drop the kernel lock or call + * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED + * return code. + */ +int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) +{ + if (filp->f_op && filp->f_op->lock) + return filp->f_op->lock(filp, cmd, fl); + else + return posix_lock_file(filp, fl, conf); +} +EXPORT_SYMBOL_GPL(vfs_lock_file); + +static int do_lock_file_wait(struct file *filp, unsigned int cmd, + struct file_lock *fl) +{ + int error; + + error = security_file_lock(filp, fl->fl_type); + if (error) + return error; + + for (;;) { + error = vfs_lock_file(filp, cmd, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + + return error; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) +{ + struct file_lock *file_lock = locks_alloc_lock(); + struct flock flock; + struct inode *inode; + struct file *f; + int error; + + if (file_lock == NULL) + return -ENOLCK; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + inode = filp->f_path.dentry->d_inode; + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { + error = -EAGAIN; + goto out; + } + +again: + error = flock_to_posix_lock(filp, file_lock, &flock); + if (error) + goto out; + if (cmd == F_SETLKW) { + file_lock->fl_flags |= FL_SLEEP; + } + + error = -EBADF; + switch (flock.l_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + goto out; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + goto out; + break; + case F_UNLCK: + break; + default: + error = -EINVAL; + goto out; + } + + error = do_lock_file_wait(filp, cmd, file_lock); + + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. 
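From userspace the fcntl_setlk() path above is reached through F_SETLK (fail immediately on conflict) or F_SETLKW (sleep in do_lock_file_wait() until the lock is granted, a signal arrives, or a deadlock is detected). A brief sketch; the file name is hypothetical.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("journal.dat", O_RDWR | O_CREAT, 0644);   /* hypothetical */
        struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
                            .l_start = 0, .l_len = 0 };

        if (fd < 0)
                return 1;

        /* Non-blocking attempt: a conflict is reported immediately. */
        if (fcntl(fd, F_SETLK, &fl) == -1 &&
            (errno == EAGAIN || errno == EACCES))
                fprintf(stderr, "busy, waiting instead...\n");

        /* Blocking attempt: sleeps until granted; may fail with EINTR
         * (signal) or EDEADLK (cycle detected by the code above). */
        if (fcntl(fd, F_SETLKW, &fl) == -1)
                perror("F_SETLKW");

        close(fd);
        return 0;
}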
+ */ + /* + * we need that spin_lock here - it prevents reordering between + * update of inode->i_flock and check for it done in close(). + * rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; + } + +out: + locks_free_lock(file_lock); + return error; +} + +#if BITS_PER_LONG == 32 +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). + */ +int fcntl_getlk64(struct file *filp, struct flock64 __user *l) +{ + struct file_lock file_lock; + struct flock64 flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = flock64_to_posix_lock(filp, &file_lock, &flock); + if (error) + goto out; + + error = vfs_test_lock(filp, &file_lock); + if (error) + goto out; + + flock.l_type = file_lock.fl_type; + if (file_lock.fl_type != F_UNLCK) + posix_lock_to_flock64(&flock, &file_lock); + + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; + +out: + return error; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) +{ + struct file_lock *file_lock = locks_alloc_lock(); + struct flock64 flock; + struct inode *inode; + struct file *f; + int error; + + if (file_lock == NULL) + return -ENOLCK; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + inode = filp->f_path.dentry->d_inode; + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { + error = -EAGAIN; + goto out; + } + +again: + error = flock64_to_posix_lock(filp, file_lock, &flock); + if (error) + goto out; + if (cmd == F_SETLKW64) { + file_lock->fl_flags |= FL_SLEEP; + } + + error = -EBADF; + switch (flock.l_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + goto out; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + goto out; + break; + case F_UNLCK: + break; + default: + error = -EINVAL; + goto out; + } + + error = do_lock_file_wait(filp, cmd, file_lock); + + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; + } + +out: + locks_free_lock(file_lock); + return error; +} +#endif /* BITS_PER_LONG == 32 */ + +/* + * This function is called when the file is being removed + * from the task's fd array. POSIX locks belonging to this task + * are deleted at this time. + */ +void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + struct file_lock lock; + + /* + * If there are no locks held on this file, we don't need to call + * posix_lock_file(). Another process could be setting a lock on this + * file at the same time, but we wouldn't remove that lock anyway. 
+ */ + if (!filp->f_path.dentry->d_inode->i_flock) + return; + + lock.fl_type = F_UNLCK; + lock.fl_flags = FL_POSIX | FL_CLOSE; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; + lock.fl_pid = current->tgid; + lock.fl_file = filp; + lock.fl_ops = NULL; + lock.fl_lmops = NULL; + + vfs_lock_file(filp, F_SETLK, &lock, NULL); + + if (lock.fl_ops && lock.fl_ops->fl_release_private) + lock.fl_ops->fl_release_private(&lock); +} + +EXPORT_SYMBOL(locks_remove_posix); + +/* + * This function is called on the last close of an open file. + */ +void locks_remove_flock(struct file *filp) +{ + struct inode * inode = filp->f_path.dentry->d_inode; + struct file_lock *fl; + struct file_lock **before; + + if (!inode->i_flock) + return; + + if (filp->f_op && filp->f_op->flock) { + struct file_lock fl = { + .fl_pid = current->tgid, + .fl_file = filp, + .fl_flags = FL_FLOCK, + .fl_type = F_UNLCK, + .fl_end = OFFSET_MAX, + }; + filp->f_op->flock(filp, F_SETLKW, &fl); + if (fl.fl_ops && fl.fl_ops->fl_release_private) + fl.fl_ops->fl_release_private(&fl); + } + + lock_flocks(); + before = &inode->i_flock; + + while ((fl = *before) != NULL) { + if (fl->fl_file == filp) { + if (IS_FLOCK(fl)) { + locks_delete_lock(before); + continue; + } + if (IS_LEASE(fl)) { + lease_modify(before, F_UNLCK); + continue; + } + /* What? */ + BUG(); + } + before = &fl->fl_next; + } + unlock_flocks(); +} + +/** + * posix_unblock_lock - stop waiting for a file lock + * @filp: how the file was opened + * @waiter: the lock which was waiting + * + * lockd needs to block waiting for locks. + */ +int +posix_unblock_lock(struct file *filp, struct file_lock *waiter) +{ + int status = 0; + + lock_flocks(); + if (waiter->fl_next) + __locks_delete_block(waiter); + else + status = -ENOENT; + unlock_flocks(); + return status; +} + +EXPORT_SYMBOL(posix_unblock_lock); + +/** + * vfs_cancel_lock - file byte range unblock lock + * @filp: The file to apply the unblock to + * @fl: The lock to be unblocked + * + * Used by lock managers to cancel blocked requests + */ +int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + if (filp->f_op && filp->f_op->lock) + return filp->f_op->lock(filp, F_CANCELLK, fl); + return 0; +} + +EXPORT_SYMBOL_GPL(vfs_cancel_lock); + +#ifdef CONFIG_PROC_FS +#include +#include + +static void lock_get_status(struct seq_file *f, struct file_lock *fl, + loff_t id, char *pfx) +{ + struct inode *inode = NULL; + unsigned int fl_pid; + + if (fl->fl_nspid) + fl_pid = pid_vnr(fl->fl_nspid); + else + fl_pid = fl->fl_pid; + + if (fl->fl_file != NULL) + inode = fl->fl_file->f_path.dentry->d_inode; + + seq_printf(f, "%lld:%s ", id, pfx); + if (IS_POSIX(fl)) { + seq_printf(f, "%6s %s ", + (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", + (inode == NULL) ? "*NOINODE*" : + mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); + } else if (IS_FLOCK(fl)) { + if (fl->fl_type & LOCK_MAND) { + seq_printf(f, "FLOCK MSNFS "); + } else { + seq_printf(f, "FLOCK ADVISORY "); + } + } else if (IS_LEASE(fl)) { + seq_printf(f, "LEASE "); + if (fl->fl_type & F_INPROGRESS) + seq_printf(f, "BREAKING "); + else if (fl->fl_file) + seq_printf(f, "ACTIVE "); + else + seq_printf(f, "BREAKER "); + } else { + seq_printf(f, "UNKNOWN UNKNOWN "); + } + if (fl->fl_type & LOCK_MAND) { + seq_printf(f, "%s ", + (fl->fl_type & LOCK_READ) + ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " + : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); + } else { + seq_printf(f, "%s ", + (fl->fl_type & F_INPROGRESS) + ? (fl->fl_type & F_UNLCK) ? 
"UNLCK" : "READ " + : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); + } + if (inode) { +#ifdef WE_CAN_BREAK_LSLK_NOW + seq_printf(f, "%d %s:%ld ", fl_pid, + inode->i_sb->s_id, inode->i_ino); +#else + /* userspace relies on this representation of dev_t ;-( */ + seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino); +#endif + } else { + seq_printf(f, "%d :0 ", fl_pid); + } + if (IS_POSIX(fl)) { + if (fl->fl_end == OFFSET_MAX) + seq_printf(f, "%Ld EOF\n", fl->fl_start); + else + seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); + } else { + seq_printf(f, "0 EOF\n"); + } +} + +static int locks_show(struct seq_file *f, void *v) +{ + struct file_lock *fl, *bfl; + + fl = list_entry(v, struct file_lock, fl_link); + + lock_get_status(f, fl, *((loff_t *)f->private), ""); + + list_for_each_entry(bfl, &fl->fl_block, fl_block) + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); + + return 0; +} + +static void *locks_start(struct seq_file *f, loff_t *pos) +{ + loff_t *p = f->private; + + lock_flocks(); + *p = (*pos + 1); + return seq_list_start(&file_lock_list, *pos); +} + +static void *locks_next(struct seq_file *f, void *v, loff_t *pos) +{ + loff_t *p = f->private; + ++*p; + return seq_list_next(v, &file_lock_list, pos); +} + +static void locks_stop(struct seq_file *f, void *v) +{ + unlock_flocks(); +} + +static const struct seq_operations locks_seq_operations = { + .start = locks_start, + .next = locks_next, + .stop = locks_stop, + .show = locks_show, +}; + +static int locks_open(struct inode *inode, struct file *filp) +{ + return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); +} + +static const struct file_operations proc_locks_operations = { + .open = locks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init proc_locks_init(void) +{ + proc_create("locks", 0, NULL, &proc_locks_operations); + return 0; +} +module_init(proc_locks_init); +#endif + +/** + * lock_may_read - checks that the region is free of locks + * @inode: the inode that is being read + * @start: the first byte to read + * @len: the number of bytes to read + * + * Emulates Windows locking requirements. Whole-file + * mandatory locks (share modes) can prohibit a read and + * byte-range POSIX locks can prohibit a read if they overlap. + * + * N.B. this function is only ever called + * from knfsd and ownership of locks is never checked. + */ +int lock_may_read(struct inode *inode, loff_t start, unsigned long len) +{ + struct file_lock *fl; + int result = 1; + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (IS_POSIX(fl)) { + if (fl->fl_type == F_RDLCK) + continue; + if ((fl->fl_end < start) || (fl->fl_start > (start + len))) + continue; + } else if (IS_FLOCK(fl)) { + if (!(fl->fl_type & LOCK_MAND)) + continue; + if (fl->fl_type & LOCK_READ) + continue; + } else + continue; + result = 0; + break; + } + unlock_flocks(); + return result; +} + +EXPORT_SYMBOL(lock_may_read); + +/** + * lock_may_write - checks that the region is free of locks + * @inode: the inode that is being written + * @start: the first byte to write + * @len: the number of bytes to write + * + * Emulates Windows locking requirements. Whole-file + * mandatory locks (share modes) can prohibit a write and + * byte-range POSIX locks can prohibit a write if they overlap. + * + * N.B. this function is only ever called + * from knfsd and ownership of locks is never checked. 
+ */ +int lock_may_write(struct inode *inode, loff_t start, unsigned long len) +{ + struct file_lock *fl; + int result = 1; + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (IS_POSIX(fl)) { + if ((fl->fl_end < start) || (fl->fl_start > (start + len))) + continue; + } else if (IS_FLOCK(fl)) { + if (!(fl->fl_type & LOCK_MAND)) + continue; + if (fl->fl_type & LOCK_WRITE) + continue; + } else + continue; + result = 0; + break; + } + unlock_flocks(); + return result; +} + +EXPORT_SYMBOL(lock_may_write); + +static int __init filelock_init(void) +{ + filelock_cache = kmem_cache_create("file_lock_cache", + sizeof(struct file_lock), 0, SLAB_PANIC, + init_once); + return 0; +} + +core_initcall(filelock_init); diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 00000000..daf9a9b3 --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 00000000..48200277 --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 00000000..961f02b8 --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return 
ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 00000000..df0de27c --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,342 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + page_cache_release(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + 
i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + + +static void erase_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct bio *bio; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + bio->bi_io_vec[i].bv_page = super->s_erase_page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len, + int ensure_write) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + if (ensure_write) { + /* + * Object store doesn't care whether erases happen or not. + * But for the journal they are required. Otherwise a scan + * can find an old commit entry and assume it is the current + * one, travelling back in time. 
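+ *
+ * (Worked example, not in the original comment: if segment S held
+ * journal commit N and is reused without a physical erase, a crash
+ * after commit N+1 was written elsewhere could let the mount-time
+ * scan rediscover the stale commit N in S and replay old state.)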
+ */ + do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); + } + + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. */ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct logfs_super *s) +{ + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int bdev_can_write_buf(struct super_block *sb, u64 ofs) +{ + return 0; +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .can_write_buf = bdev_can_write_buf, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, + const char *devname) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + return logfs_get_sb_mtd(p, mtdnr); + } + + p->s_bdev = bdev; + p->s_mtd = NULL; + p->s_devops = &bd_devops; + return 0; +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 00000000..339e17e9 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,278 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. 
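+ * For now a short read is simply reported back to the caller as -EIO
+ * (see the check below) rather than retried.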
*/ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an exercise in futility! + */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN || err == -EBADMSG) { + /* -EBADMSG happens regularly on power failures */ + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> 
PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct logfs_super *s) +{ + put_mtd_device(s->s_mtd); +} + +static int mtd_can_write_buf(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + void *buf; + int err; + + buf = kmalloc(super->s_writesize, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = mtd_read(sb, ofs, super->s_writesize, buf); + if (err) + goto out; + if (memchr_inv(buf, 0xff, super->s_writesize)) + err = -EIO; + kfree(buf); +out: + return err; +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .can_write_buf = mtd_can_write_buf, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) +{ + struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); + if (IS_ERR(mtd)) + return PTR_ERR(mtd); + + s->s_bdev = NULL; + s->s_mtd = mtd; + s->s_devops = &mtd_devops; + return 0; +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 00000000..1afae26c --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,825 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in separate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. 
allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a users point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a users point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode an a dentry. If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a users point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. 
If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + u32 i0_blocks = I0_BLOCKS; + u32 i1_blocks = I1_BLOCKS; + u32 i2_blocks = I2_BLOCKS; + u32 i3_blocks = I3_BLOCKS; + + switch (round) { + case 0: + return hash % i0_blocks; + case 1: + return i0_blocks + hash % (i1_blocks - i0_blocks); + case 2: + return i1_blocks + hash % (i2_blocks - i1_blocks); + case 3: + return i2_blocks + hash % (i3_blocks - i2_blocks); + case 4 ... 19: + return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) { + kfree(ta); + return -ENOENT; + } + if (IS_ERR(page)) { + kfree(ta); + return PTR_ERR(page); + } + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete 
inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has it's own dir_walk code. I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap(page); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap(page); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = 
find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) { + inode->i_nlink--; + iput(inode); + return -ENOMEM; + } + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. 
+ */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ihold(inode); + inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. 
write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. 
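+ *
+ * (Summary derived from the code below, not in the original comment:
+ * a non-zero s_victim_ino names an inode left over from an interrupted
+ * create/unlink/rename and is deleted here; non-zero s_rename_dir and
+ * s_rename_pos name a stale dentry from an interrupted rename, which is
+ * deleted as well. The journal fields are cleared once cleanup succeeds.)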
*/ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .unlocked_ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, + .llseek = default_llseek, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 00000000..c2ad7028 --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,275 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? 
*/ + + if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + mark_inode_dirty_sync(inode); + } + + SetPageUptodate(page); + if (!PageDirty(page)) { + if (!get_page_reserve(inode, page)) + __set_page_dirty_nobuffers(page); + else + ret = logfs_write_buf(inode, page, WF_LOCK); + } +out: + unlock_page(page); + page_cache_release(page); + return ret ? ret : copied; +} + +int logfs_readpage(struct file *file, struct page *page) +{ + int ret; + + ret = logfs_readpage_nolock(page); + unlock_page(page); + return ret; +} + +/* Clear the page's dirty flag in the radix tree. */ +/* TODO: mucking with PageWriteback is silly. Add a generic function to clear + * the dirty bit from the radix tree for filesystems that don't have to wait + * for page writeback to finish (i.e. any compressing filesystem). + */ +static void clear_radix_tree_dirty(struct page *page) +{ + BUG_ON(PagePrivate(page) || page->private); + set_page_writeback(page); + end_page_writeback(page); +} + +static int __logfs_writepage(struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = logfs_write_buf(inode, page, WF_LOCK); + if (err) + set_page_dirty(page); + else + clear_radix_tree_dirty(page); + unlock_page(page); + return err; +} + +static int logfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + u64 bix; + level_t level; + + log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, + page); + + logfs_unpack_index(page->index, &bix, &level); + + /* Indirect blocks are never truncated */ + if (level != 0) + return __logfs_writepage(page); + + /* + * TODO: everything below is a near-verbatim copy of nobh_writepage(). + * The relevant bits should be factored out after logfs is merged. + */ + + /* Is the page fully inside i_size? */ + if (bix < end_index) + return __logfs_writepage(page); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (bix > end_index || offset == 0) { + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
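+ *
+ * (Worked example, assuming PAGE_CACHE_SIZE == 4096: for i_size == 10000,
+ * end_index == 2 and offset == 1808, so the page at index 2 straddles
+ * i_size and bytes 1808..4095 are zeroed below before it is written out.)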
+ */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct logfs_block *block = logfs_block(page); + + if (block->reserved_bytes) { + struct super_block *sb = page->mapping->host->i_sb; + struct logfs_super *super = logfs_super(sb); + + super->s_dirty_pages -= block->reserved_bytes; + block->ops->free_block(sb, block); + BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR)); + } else + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, int datasync) +{ + struct super_block *sb = file->f_mapping->host->i_sb; + + logfs_write_anchor(sb); + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + err = logfs_truncate(inode, attr->ia_size); + if (err) + return err; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = logfs_fsync, + .unlocked_ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, + .read = do_sync_read, + .write = do_sync_write, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c new file mode 100644 index 00000000..caa44192 --- /dev/null +++ b/fs/logfs/gc.c @@ -0,0 +1,732 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. 
+ * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. 
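+ *
+ * (Note derived from the code below: *ec and *gc_level are filled in from
+ * the segment entry as well, and the special value RESERVED is returned
+ * for bad or reserved segments.)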
+ */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the 
best segment for garbage collection. Main criterion is + * the segment requiring the least effort to clean. Secondary + * criterion is to GC on the lowest level available. + * + * So we search the least effort segment on the lowest level first, + * then move up and pick another segment iff is requires significantly + * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. + */ +static struct gc_candidate *get_candidate(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i, max_dist; + struct gc_candidate *cand = NULL, *this; + + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); + + for (i = max_dist; i >= 0; i--) { + this = first_in_list(&super->s_low_list[i]); + if (!this) + continue; + if (!cand) + cand = this; + if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) + cand = this; + } + return cand; +} + +static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + gc_level_t gc_level; + u32 cleaned, valid, segno, ec; + u8 dist; + + if (!cand) { + log_gc("GC attempted, but no candidate found\n"); + return 0; + } + + segno = cand->segno; + dist = cand->dist; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + free_candidate(sb, cand); + log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", + segno, (u64)segno << super->s_segshift, + dist, no_free_segments(sb), valid, + super->s_free_bytes); + cleaned = logfs_gc_segment(sb, segno); + log_gc("GC segment #%02x complete - now %x valid\n", segno, + valid - cleaned); + BUG_ON(cleaned != valid); + return 1; +} + +static int logfs_gc_once(struct super_block *sb) +{ + struct gc_candidate *cand; + + cand = get_candidate(sb); + if (cand) + remove_from_list(cand); + return __logfs_gc_once(sb, cand); +} + +/* returns 1 if a wrap occurs, 0 otherwise */ +static int logfs_scan_some(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno; + int i, ret = 0; + + segno = super->s_sweeper; + for (i = SCAN_RATIO; i > 0; i--) { + segno++; + if (segno >= super->s_no_segs) { + segno = 0; + ret = 1; + /* Break out of the loop. We want to read a single + * block from the segment size on next invocation if + * SCAN_RATIO is set to match block size + */ + break; + } + + scan_segment(sb, segno); + } + super->s_sweeper = segno; + return ret; +} + +/* + * In principle, this function should loop forever, looking for GC candidates + * and moving data. LogFS is designed in such a way that this loop is + * guaranteed to terminate. + * + * Limiting the loop to some iterations serves purely to catch cases when + * these guarantees have failed. An actual endless loop is an obvious bug + * and should be reported as such. + */ +static void __logfs_gc_pass(struct super_block *sb, int target) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int round, progress, last_progress = 0; + + /* + * Doing too many changes to the segfile at once would result + * in a large number of aliases. Write the journal before + * things get out of hand. 
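+ * Concretely: with MAX_OBJ_ALIASES at 2600, the check below forces a
+ * journal write once 2600 segments contain not-yet-committed shadowed
+ * blocks.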
+ */ + if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) + logfs_write_anchor(sb); + + if (no_free_segments(sb) >= target && + super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + + log_gc("__logfs_gc_pass(%x)\n", target); + for (round = 0; round < SCAN_ROUNDS; ) { + if (no_free_segments(sb) >= target) + goto write_alias; + + /* Sync in-memory state with on-medium state in case they + * diverged */ + logfs_write_anchor(sb); + round += logfs_scan_some(sb); + if (no_free_segments(sb) >= target) + goto write_alias; + progress = logfs_gc_once(sb); + if (progress) + last_progress = round; + else if (round - last_progress > 2) + break; + continue; + + /* + * The goto logic is nasty, I just don't know a better way to + * code it. GC is supposed to ensure two things: + * 1. Enough free segments are available. + * 2. The number of aliases is bounded. + * When 1. is achieved, we take a look at 2. and write back + * some alias-containing blocks, if necessary. However, after + * each such write we need to go back to 1., as writes can + * consume free segments. + */ +write_alias: + if (super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + if (list_empty(&super->s_object_alias)) { + /* All aliases are still in btree */ + return; + } + log_gc("Write back one alias\n"); + block = list_entry(super->s_object_alias.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + /* + * To round off the nasty goto logic, we reset round here. It + * is a safety-net for GC not making any progress and limited + * to something reasonably small. If incremented it for every + * single alias, the loop could terminate rather quickly. + */ + round = 0; + } + LOGFS_BUG(sb); +} + +static int wl_ratelimit(struct super_block *sb, u64 *next_event) +{ + struct logfs_super *super = logfs_super(sb); + + if (*next_event < super->s_gec) { + *next_event = super->s_gec + WL_RATELIMIT; + return 0; + } + return 1; +} + +static void logfs_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *wl_cand, *free_cand; + + if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) + return; + + wl_cand = first_in_list(&super->s_ec_list); + if (!wl_cand) + return; + free_cand = first_in_list(&super->s_free_list); + if (!free_cand) + return; + + if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { + remove_from_list(wl_cand); + __logfs_gc_once(sb, wl_cand); + } +} + +/* + * The journal needs wear leveling as well. But moving the journal is an + * expensive operation so we try to avoid it as much as possible. And if we + * have to do it, we move the whole journal, not individual segments. + * + * Ratelimiting is not strictly necessary here, it mainly serves to avoid the + * calculations. First we check whether moving the journal would be a + * significant improvement. That means that a) the current journal segments + * have more wear than the future journal segments and b) the current journal + * segments have more wear than normal ostore segments. + * Rationale for b) is that we don't have to move the journal if it is aging + * less than the ostore, even if the reserve segments age even less (they are + * excluded from wear leveling, after all). + * Next we check that the superblocks have less wear than the journal. Since + * moving the journal requires writing the superblocks, we have to protect the + * superblocks even more than the journal. + * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. 
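+ * (With WL_DELTA at 397 that means the journal is only relocated once
+ * min_journal_ec exceeds max_reserve_ec by more than 2 * 397 = 794.)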
Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(sb); + __logfs_gc_pass(sb, super->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + gc_level_t gc_level; + u32 cleaned, valid, ec; + u32 segno = area->a_segno; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + + if (!area->a_is_open) + return 0; + + if (super->s_devops->can_write_buf(sb, ofs) == 0) + return 0; + + printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs); + /* + * The device cannot write back the write buffer. Most likely the + * wbuf was already written out and the system crashed at some point + * before the journal commit happened. In that case we wouldn't have + * to do anything. But if the crash happened before the wbuf was + * written out correctly, we must GC this segment. So assume the + * worst and always do the GC run. 
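+ * Should that run then clean a different number of bytes than the segment
+ * entry recorded as valid, check_area() returns -EIO and
+ * logfs_check_areas() passes the error up.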
+ */ + area->a_is_open = 0; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + cleaned = logfs_gc_segment(sb, segno); + if (cleaned != valid) + return -EIO; + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); + return 0; +} + +static void logfs_cleanup_list(struct super_block *sb, + struct candidate_list *list) +{ + struct gc_candidate *cand; + + while (list->count) { + cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, + rb_node); + remove_from_list(cand); + free_candidate(sb, cand); + } + BUG_ON(list->rb_tree.rb_node); +} + +void logfs_cleanup_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + if (!super->s_free_list.count) + return; + + /* + * FIXME: The btree may still contain a single empty node. So we + * call the grim visitor to clean up that mess. Btree code should + * do it for us, really. + */ + btree_grim_visitor32(&super->s_cand_tree, 0, NULL); + logfs_cleanup_list(sb, &super->s_free_list); + logfs_cleanup_list(sb, &super->s_reserve_list); + for_each_area(i) + logfs_cleanup_list(sb, &super->s_low_list[i]); + logfs_cleanup_list(sb, &super->s_ec_list); +} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c new file mode 100644 index 00000000..edfea7a3 --- /dev/null +++ b/fs/logfs/inode.c @@ -0,0 +1,405 @@ +/* + * fs/logfs/inode.c - inode handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +/* + * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes + * on the medium. It therefore also lacks a method to store the previous + * generation number for deleted inodes. Instead a single generation number + * is stored which will be used for new inodes. Being just a 32bit counter, + * this can obvious wrap relatively quickly. So we only reuse inodes if we + * know that a fair number of inodes can be created before we have to increment + * the generation again - effectively adding some bits to the counter. + * But being too aggressive here means we keep a very large and very sparse + * inode file, wasting space on indirect blocks. + * So what is a good value? Beats me. 64k seems moderately bad on both + * fronts, so let's use that for now... + * + * NFS sucks, as everyone already knows. + */ +#define INOS_PER_WRAP (0x10000) + +/* + * Logfs' requirement to read inodes for garbage collection makes life a bit + * harder. GC may have to read inodes that are in I_FREEING state, when they + * are being written out - and waiting for GC to make progress, naturally. + * + * So we cannot just call iget() or some variant of it, but first have to check + * wether the inode in question might be in I_FREEING state. 
Therefore we + * maintain our own per-sb list of "almost deleted" inodes and check against + * that list first. Normally this should be at most 1-2 entries long. + * + * Also, inodes have logfs-specific reference counting on top of what the vfs + * does. When .destroy_inode is called, normally the reference count will drop + * to zero and the inode gets deleted. But if GC accessed the inode, its + * refcount will remain nonzero and final deletion will have to wait. + * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
+ * this allows logfs_iput to do the right thing later + */ +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_inode *li; + + if (ino == LOGFS_INO_MASTER) + return super->s_master_inode; + if (ino == LOGFS_INO_SEGFILE) + return super->s_segfile_inode; + + spin_lock(&logfs_inode_lock); + list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) + if (li->vfs_inode.i_ino == ino) { + li->li_refcount++; + spin_unlock(&logfs_inode_lock); + *is_cached = 1; + return &li->vfs_inode; + } + spin_unlock(&logfs_inode_lock); + + *is_cached = 0; + return __logfs_iget(sb, ino); +} + +static void logfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); +} + +static void __logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(li->li_block); + list_del(&li->li_freeing_list); + call_rcu(&inode->i_rcu, logfs_i_callback); +} + +static void logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(list_empty(&li->li_freeing_list)); + spin_lock(&logfs_inode_lock); + li->li_refcount--; + if (li->li_refcount == 0) + __logfs_destroy_inode(inode); + spin_unlock(&logfs_inode_lock); +} + +void logfs_safe_iput(struct inode *inode, int is_cached) +{ + if (inode->i_ino == LOGFS_INO_MASTER) + return; + if (inode->i_ino == LOGFS_INO_SEGFILE) + return; + + if (is_cached) { + logfs_destroy_inode(inode); + return; + } + + iput(inode); +} + +static void logfs_init_inode(struct super_block *sb, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + li->li_flags = 0; + li->li_height = 0; + li->li_used_bytes = 0; + li->li_block = NULL; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_nlink = 1; + li->li_refcount = 1; + INIT_LIST_HEAD(&li->li_freeing_list); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return; +} + +static struct inode *logfs_alloc_inode(struct super_block *sb) +{ + struct logfs_inode *li; + + li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); + if (!li) + return NULL; + logfs_init_inode(sb, &li->vfs_inode); + return &li->vfs_inode; +} + +/* + * In logfs inodes are written to an inode file. The inode file, like any + * other file, is managed with a inode. The inode file's inode, aka master + * inode, requires special handling in several respects. First, it cannot be + * written to the inode file, so it is stored in the journal instead. + * + * Secondly, this inode cannot be written back and destroyed before all other + * inodes have been written. The ordering is important. Linux' VFS is happily + * unaware of the ordering constraint and would ordinarily destroy the master + * inode at umount time while other inodes are still in use and dirty. Not + * good. + * + * So logfs makes sure the master inode is not written until all other inodes + * have been destroyed. Sadly, this method has another side-effect. The VFS + * will notice one remaining inode and print a frightening warning message. + * Worse, it is impossible to judge whether such a warning was caused by the + * master inode or any other inodes have leaked as well. + * + * Our attempt of solving this is with logfs_new_meta_inode() below. 
Its + * purpose is to create a new inode that will not trigger the warning if such + * an inode is still in use. An ugly hack, no doubt. Suggections for + * improvement are welcome. + * + * AV: that's what ->put_super() is for... + */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_data.a_ops = &logfs_reg_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_NOFS); + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + iput(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +/* called with inode->i_lock held */ +static int logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + return generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode_init_owner(inode, dir, mode); + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + logfs_write_anchor(sb); + return 0; +} + +static void logfs_put_super(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + /* kill the meta-inodes */ + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .destroy_inode = logfs_destroy_inode, + .evict_inode = logfs_evict_inode, + .drop_inode = logfs_drop_inode, + .put_super = logfs_put_super, + .write_inode 
= logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new file mode 100644 index 00000000..9da29706 --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,895 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. + */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct 
logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + return logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + return logfs_buf_recover(area, ofs, NULL, 0); +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. 
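+ * The fallback below therefore re-checks the CRC the old, short way: a
+ * match merely triggers WARN_ON_ONCE(), while an entry failing both
+ * variants is still rejected with -EIO.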
+ */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + err = read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len = 0; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in 
case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + union { + struct logfs_segment_header sh; + unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; + } u; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 1); + if (err) + return err; + + memset(&u, 0, sizeof(u)); + u.sh.pad = 0; + u.sh.type = SEG_JOURNAL; + u.sh.level = 0; + u.sh.segno = cpu_to_be32(area->a_segno); + u.sh.ec = cpu_to_be32(area->a_erase_count); + u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); + + /* This causes a bug in segment.c. Not yet. */ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(u); + logfs_buf_write(area, ofs, &u, sizeof(u)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'E'; + jh->h_pad[2] = 'A'; + jh->h_pad[3] = 'D'; + jh->h_pad[4] = 'R'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + 
logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + btree_grim_visitor32(&tree->segment_map, 0, NULL); + tree->no_shadowed_segments = 0; + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. + */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return 
a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + BUG_ON(len > sb->s_blocksize); + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, + int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, 
super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. + */ +void logfs_write_anchor(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) + return; + super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. 
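+ * The resulting order below is:
+ *   s_devops->sync()                    flush all entries written so far
+ *   logfs_write_je(logfs_write_commit)  append the JE_COMMIT array
+ *   logfs_sync_area(area)               flush the commit record itself
+ *   s_devops->sync()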
+ */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct btree_head32 *head = &super->s_reserved_segments; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + btree_remove32(head, super->s_journal_seg[i]); + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + BUG_ON(err); /* mempool should prevent this */ + err = logfs_erase_segment(sb, segno, 1); + BUG_ON(err); /* FIXME: remount-ro would be nicer */ + } + /* Manually move journal_area */ + freeseg(sb, area->a_segno); + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(sb); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h new file mode 100644 index 00000000..57afd4a6 --- /dev/null +++ b/fs/logfs/logfs.h @@ -0,0 +1,734 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Private header for logfs. 
+ */ +#ifndef FS_LOGFS_LOGFS_H +#define FS_LOGFS_LOGFS_H + +#undef __CHECK_ENDIAN__ +#define __CHECK_ENDIAN__ + +#include +#include +#include +#include +#include +#include +#include +#include "logfs_abi.h" + +#define LOGFS_DEBUG_SUPER (0x0001) +#define LOGFS_DEBUG_SEGMENT (0x0002) +#define LOGFS_DEBUG_JOURNAL (0x0004) +#define LOGFS_DEBUG_DIR (0x0008) +#define LOGFS_DEBUG_FILE (0x0010) +#define LOGFS_DEBUG_INODE (0x0020) +#define LOGFS_DEBUG_READWRITE (0x0040) +#define LOGFS_DEBUG_GC (0x0080) +#define LOGFS_DEBUG_GC_NOISY (0x0100) +#define LOGFS_DEBUG_ALIASES (0x0200) +#define LOGFS_DEBUG_BLOCKMOVE (0x0400) +#define LOGFS_DEBUG_ALL (0xffffffff) + +#define LOGFS_DEBUG (0x01) +/* + * To enable specific log messages, simply define LOGFS_DEBUG to match any + * or all of the above. + */ +#ifndef LOGFS_DEBUG +#define LOGFS_DEBUG (0) +#endif + +#define log_cond(cond, fmt, arg...) do { \ + if (cond) \ + printk(KERN_DEBUG fmt, ##arg); \ +} while (0) + +#define log_super(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) +#define log_segment(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) +#define log_journal(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) +#define log_dir(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) +#define log_file(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) +#define log_inode(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) +#define log_readwrite(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) +#define log_gc(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) +#define log_gc_noisy(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) +#define log_aliases(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) +#define log_blockmove(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) + +#define PG_pre_locked PG_owner_priv_1 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) + +/* FIXME: This should really be somewhere in the 64bit area. 
*/ +#define LOGFS_LINK_MAX (1<<30) + +/* Read-only filesystem */ +#define LOGFS_SB_FLAG_RO 0x0001 +#define LOGFS_SB_FLAG_DIRTY 0x0002 +#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 +#define LOGFS_SB_FLAG_SHUTDOWN 0x0008 + +/* Write Control Flags */ +#define WF_LOCK 0x01 /* take write lock */ +#define WF_WRITE 0x02 /* write block */ +#define WF_DELETE 0x04 /* delete old block */ + +typedef u8 __bitwise level_t; +typedef u8 __bitwise gc_level_t; + +#define LEVEL(level) ((__force level_t)(level)) +#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level)) + +#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \ + (__force level_t)((__force u8)(level) - 1) ) + +/** + * struct logfs_area - area management information + * + * @a_sb: the superblock this area belongs to + * @a_is_open: 1 if the area is currently open, else 0 + * @a_segno: segment number of area + * @a_written_bytes: number of bytes already written back + * @a_used_bytes: number of used bytes + * @a_ops: area operations (either journal or ostore) + * @a_erase_count: erase count + * @a_level: GC level + */ +struct logfs_area { /* a segment open for writing */ + struct super_block *a_sb; + int a_is_open; + u32 a_segno; + u32 a_written_bytes; + u32 a_used_bytes; + const struct logfs_area_ops *a_ops; + u32 a_erase_count; + gc_level_t a_level; +}; + +/** + * struct logfs_area_ops - area operations + * + * @get_free_segment: fill area->ofs with the offset of a free segment + * @get_erase_count: fill area->erase_count (needs area->ofs) + * @erase_segment: erase and setup segment + */ +struct logfs_area_ops { + void (*get_free_segment)(struct logfs_area *area); + void (*get_erase_count)(struct logfs_area *area); + int (*erase_segment)(struct logfs_area *area); +}; + +struct logfs_super; /* forward */ +/** + * struct logfs_device_ops - device access operations + * + * @readpage: read one page (mm page) + * @writeseg: write one segment. may be a partial segment + * @erase: erase one segment + * @read: read from the device + * @erase: erase part of the device + * @can_write_buf: decide whether wbuf can be written to ofs + */ +struct logfs_device_ops { + struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); + struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); + int (*write_sb)(struct super_block *sb, struct page *page); + int (*readpage)(void *_sb, struct page *page); + void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write); + int (*can_write_buf)(struct super_block *sb, u64 ofs); + void (*sync)(struct super_block *sb); + void (*put_device)(struct logfs_super *s); +}; + +/** + * struct candidate_list - list of similar candidates + */ +struct candidate_list { + struct rb_root rb_tree; + int count; + int maxcount; + int sort_by_ec; +}; + +/** + * struct gc_candidate - "candidate" segment to be garbage collected next + * + * @list: list (either free of low) + * @segno: segment number + * @valid: number of valid bytes + * @erase_count: erase count of segment + * @dist: distance from tree root + * + * Candidates can be on two lists. The free list contains electees rather + * than candidates - segments that no longer contain any valid data. The + * low list contains candidates to be picked for GC. It should be kept + * short. It is not required to always pick a perfect candidate. In the + * worst case GC will have to move more data than absolutely necessary. 
+ */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @level: block level + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + * @segment_map: bitfield of segments containing shadows + * @no_shadowed_segment: number of segments containing shadows + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; + struct btree_head32 segment_map; + int no_shadowed_segments; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages. But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. 
+ */ +#define BLOCK_INDIRECT 1 /* Indirect block */ +#define BLOCK_INODE 2 /* Inode */ +struct logfs_block_ops; +struct logfs_block { + struct list_head alias_list; + struct list_head item_list; + struct super_block *sb; + u64 ino; + u64 bix; + level_t level; + struct page *page; + struct inode *inode; + struct logfs_transaction *ta; + unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; + struct logfs_block_ops *ops; + int full; + int partial; + int reserved_bytes; +}; + +typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +struct logfs_block_ops { + void (*write_block)(struct logfs_block *block); + void (*free_block)(struct super_block *sb, struct logfs_block*block); + int (*write_alias)(struct super_block *sb, + struct logfs_block *block, + write_alias_t *write_one_alias); +}; + +#define MAX_JOURNAL_ENTRIES 256 + +struct logfs_super { + struct mtd_info *s_mtd; /* underlying device */ + struct block_device *s_bdev; /* underlying device */ + const struct logfs_device_ops *s_devops;/* device access */ + struct inode *s_master_inode; /* inode file */ + struct inode *s_segfile_inode; /* segment file */ + struct inode *s_mapping_inode; /* device mapping */ + atomic_t s_pending_writes; /* outstanting bios */ + long s_flags; + mempool_t *s_btree_pool; /* for btree nodes */ + mempool_t *s_alias_pool; /* aliases in segment.c */ + u64 s_feature_incompat; + u64 s_feature_ro_compat; + u64 s_feature_compat; + u64 s_feature_flags; + u64 s_sb_ofs[2]; + struct page *s_erase_page; /* for dev_bdev.c */ + /* alias.c fields */ + struct btree_head32 s_segment_alias; /* remapped segments */ + int s_no_object_aliases; + struct list_head s_object_alias; /* remapped objects */ + struct btree_head128 s_object_alias_tree; /* remapped objects */ + struct mutex s_object_alias_mutex; + /* dir.c fields */ + struct mutex s_dirop_mutex; /* for creat/unlink/rename */ + u64 s_victim_ino; /* used for atomic dir-ops */ + u64 s_rename_dir; /* source directory ino */ + u64 s_rename_pos; /* position of source dd */ + /* gc.c fields */ + long s_segsize; /* size of a segment */ + int s_segshift; /* log2 of segment size */ + long s_segmask; /* 1 << s_segshift - 1 */ + long s_no_segs; /* segments on device */ + long s_no_journal_segs; /* segments used for journal */ + long s_no_blocks; /* blocks per segment */ + long s_writesize; /* minimum write size */ + int s_writeshift; /* log2 of write size */ + u64 s_size; /* filesystem size */ + struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ + u64 s_gec; /* global erase count */ + u64 s_wl_gec_ostore; /* time of last wl event */ + u64 s_wl_gec_journal; /* time of last wl event */ + u64 s_sweeper; /* current sweeper pos */ + u8 s_ifile_levels; /* max level of ifile */ + u8 s_iblock_levels; /* max level of regular files */ + u8 s_data_levels; /* # of segments to leaf block*/ + u8 s_total_levels; /* sum of above three */ + struct btree_head32 s_cand_tree; /* all candidates */ + struct candidate_list s_free_list; /* 100% free segments */ + struct candidate_list s_reserve_list; /* Bad segment reserve */ + struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ + struct candidate_list s_ec_list; /* wear level candidates */ + struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. 
*/ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[MAX_JOURNAL_ENTRIES]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + struct list_head s_writeback_list; /* writeback pages */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. 
+ */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname); +#else +static inline int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); +#else +static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int logfs_fsync(struct file *file, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, long flags); +void logfs_evict_inode(struct inode *inode); + +/* journal.c */ +void logfs_write_anchor(struct super_block *sb); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); 
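+/*
+ * Editor's note (illustrative sketch, not part of the original patch): the
+ * journal entry points declared above are driven from the mount, commit and
+ * unmount paths.  The sketch below shows the expected ordering, inferred from
+ * journal.c earlier in this patch; example_journal_lifecycle() and the
+ * journal_needs_wl condition are hypothetical names used only for
+ * illustration.
+ */
+#if 0	/* sketch only, never compiled */
+static int example_journal_lifecycle(struct super_block *sb)
+{
+	int err;
+
+	err = logfs_init_journal(sb);	/* mount: read journal, set area ops */
+	if (err)
+		return err;
+	logfs_write_anchor(sb);		/* commit: write anchor and commit JE */
+	if (journal_needs_wl)		/* assumed condition, see wl pass */
+		do_logfs_journal_wl_pass(sb);
+	logfs_cleanup_journal(sb);	/* unmount: free journal buffers */
+	return 0;
+}
+#endif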
+void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); +void freeseg(struct super_block *sb, u32 segno); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline int logfs_buf_write(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + return __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, 
size_t len) +{ + return __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +void *memchr_inv(const void *s, int c, size_t n); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has separate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +static inline void logfs_mempool_destroy(mempool_t *pool) +{ + if (pool) + mempool_destroy(pool); +} + +#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h new file mode 100644 index 00000000..ae960519 --- /dev/null +++ b/fs/logfs/logfs_abi.h @@ -0,0 +1,629 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Public header for logfs. 
+ */ +#ifndef FS_LOGFS_LOGFS_ABI_H +#define FS_LOGFS_LOGFS_ABI_H + +/* For out-of-kernel compiles */ +#ifndef BUILD_BUG_ON +#define BUILD_BUG_ON(condition) /**/ +#endif + +#define SIZE_CHECK(type, size) \ +static inline void check_##type(void) \ +{ \ + BUILD_BUG_ON(sizeof(struct type) != (size)); \ +} + +/* + * Throughout the logfs code, we're constantly dealing with blocks at + * various positions or offsets. To remove confusion, we stricly + * distinguish between a "position" - the logical position within a + * file and an "offset" - the physical location within the device. + * + * Any usage of the term offset for a logical location or position for + * a physical one is a bug and should get fixed. + */ + +/* + * Block are allocated in one of several segments depending on their + * level. The following levels are used: + * 0 - regular data block + * 1 - i1 indirect blocks + * 2 - i2 indirect blocks + * 3 - i3 indirect blocks + * 4 - i4 indirect blocks + * 5 - i5 indirect blocks + * 6 - ifile data blocks + * 7 - ifile i1 indirect blocks + * 8 - ifile i2 indirect blocks + * 9 - ifile i3 indirect blocks + * 10 - ifile i4 indirect blocks + * 11 - ifile i5 indirect blocks + * Potential levels to be used in the future: + * 12 - gc recycled blocks, long-lived data + * 13 - replacement blocks, short-lived data + * + * Levels 1-11 are necessary for robust gc operations and help separate + * short-lived metadata from longer-lived file data. In the future, + * file data should get separated into several segments based on simple + * heuristics. Old data recycled during gc operation is expected to be + * long-lived. New data is of uncertain life expectancy. New data + * used to replace older blocks in existing files is expected to be + * short-lived. + */ + + +/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ +#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull +#define LOGFS_MAGIC_U32 0xc97e8168u + +/* + * Various blocksize related macros. Blocksize is currently fixed at 4KiB. + * Sooner or later that should become configurable and the macros replaced + * by something superblock-dependent. Pointers in indirect blocks are and + * will remain 64bit. + * + * LOGFS_BLOCKSIZE - self-explaining + * LOGFS_BLOCK_FACTOR - number of pointers per indirect block + * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts + */ +#define LOGFS_BLOCKSIZE (4096ull) +#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) +#define LOGFS_BLOCK_BITS (9) + +/* + * Number of blocks at various levels of indirection. There are 16 direct + * block pointers plus a single indirect pointer. + */ +#define I0_BLOCKS (16) +#define I1_BLOCKS LOGFS_BLOCK_FACTOR +#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) +#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) +#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) +#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) + +#define INDIRECT_INDEX I0_BLOCKS +#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) + +/* + * Sizes at which files require another level of indirection. Files smaller + * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, + * similar like ext2 fast symlinks. + * + * Data at a position smaller than LOGFS_I0_SIZE is accessed through the + * direct pointers, else through the 1x indirect pointer and so forth. 
+ */ +#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) +#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) + +/* + * Each indirect block pointer must have this flag set, if all block pointers + * behind it are set, i.e. there is no hole hidden in the shadow of this + * indirect block pointer. + */ +#define LOGFS_FULLY_POPULATED (1ULL << 63) +#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) + +/* + * LogFS needs to separate data into levels. Each level is defined as the + * maximal possible distance from the master inode (inode of the inode file). + * Data blocks reside on level 0, 1x indirect block on level 1, etc. + * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. + * This effort is necessary to guarantee garbage collection to always make + * progress. + * + * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, + * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is + * the maximal number of levels for one file. + * LOGFS_NO_AREAS is twice that, as the inode file and regular files are + * effectively stacked on top of each other. + */ +#define LOGFS_MAX_INDIRECT (5) +#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) +#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) + +/* Maximum size of filenames */ +#define LOGFS_MAX_NAMELEN (255) + +/* Number of segments in the primary journal. */ +#define LOGFS_JOURNAL_SEGS (16) + +/* Maximum number of free/erased/etc. segments in journal entries */ +#define MAX_CACHED_SEGS (64) + + +/* + * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, + * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including + * its header, + * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for + * its segment header and the padded space at the end when no further objects + * fit. 
+ */ +#define LOGFS_OBJECT_HEADERSIZE (0x1c) +#define LOGFS_SEGMENT_HEADERSIZE (0x18) +#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) +#define LOGFS_SEGMENT_RESERVE \ + (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) + +/* + * Segment types: + * SEG_SUPER - Data or indirect block + * SEG_JOURNAL - Inode + * SEG_OSTORE - Dentry + */ +enum { + SEG_SUPER = 0x01, + SEG_JOURNAL = 0x02, + SEG_OSTORE = 0x03, +}; + +/** + * struct logfs_segment_header - per-segment header in the ostore + * + * @crc: crc32 of header (there is no data) + * @pad: unused, must be 0 + * @type: segment type, see above + * @level: GC level for all objects in this segment + * @segno: segment number + * @ec: erase count for this segment + * @gec: global erase count at time of writing + */ +struct logfs_segment_header { + __be32 crc; + __be16 pad; + __u8 type; + __u8 level; + __be32 segno; + __be32 ec; + __be64 gec; +}; + +SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); + +#define LOGFS_FEATURES_INCOMPAT (0ull) +#define LOGFS_FEATURES_RO_COMPAT (0ull) +#define LOGFS_FEATURES_COMPAT (0ull) + +/** + * struct logfs_disk_super - on-medium superblock + * + * @ds_magic: magic number, must equal LOGFS_MAGIC + * @ds_crc: crc32 of structure starting with the next field + * @ds_ifile_levels: maximum number of levels for ifile + * @ds_iblock_levels: maximum number of levels for regular files + * @ds_data_levels: number of separate levels for data + * @pad0: reserved, must be 0 + * @ds_feature_incompat: incompatible filesystem features + * @ds_feature_ro_compat: read-only compatible filesystem features + * @ds_feature_compat: compatible filesystem features + * @ds_flags: flags + * @ds_segment_shift: log2 of segment size + * @ds_block_shift: log2 of block size + * @ds_write_shift: log2 of write size + * @pad1: reserved, must be 0 + * @ds_journal_seg: segments used by primary journal + * @ds_root_reserve: bytes reserved for the superuser + * @ds_speed_reserve: bytes reserved to speed up GC + * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks + * @pad2: reserved, must be 0 + * @pad3: reserved, must be 0 + * + * Contains only read-only fields. Read-write fields like the amount of used + * space is tracked in the dynamic superblock, which is stored in the journal. 
+ */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. 
+ * + * LOGFS_IF_DIRTY Inode must be written back + * LOGFS_IF_ZOMBIE Inode has been deleted + * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode + */ +#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ +#define LOGFS_IF_DIRTY 0x20000000 +#define LOGFS_IF_ZOMBIE 0x40000000 +#define LOGFS_IF_STILLBORN 0x80000000 + +/* Flags available to chattr */ +#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) +#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) +/* Flags inherited from parent directory on file/directory creation */ +#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) + +/** + * struct logfs_disk_inode - on-medium inode + * + * @di_mode: file mode + * @di_pad: reserved, must be 0 + * @di_flags: inode flags, see above + * @di_uid: user id + * @di_gid: group id + * @di_ctime: change time + * @di_mtime: modify time + * @di_refcount: reference count (aka nlink or link count) + * @di_generation: inode generation, for nfs + * @di_used_bytes: number of bytes used + * @di_size: file size + * @di_data: data pointers + */ +struct logfs_disk_inode { + __be16 di_mode; + __u8 di_height; + __u8 di_pad; + __be32 di_flags; + __be32 di_uid; + __be32 di_gid; + + __be64 di_ctime; + __be64 di_mtime; + + __be64 di_atime; + __be32 di_refcount; + __be32 di_generation; + + __be64 di_used_bytes; + __be64 di_size; + + __be64 di_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_disk_inode, 200); + +#define INODE_POINTER_OFS \ + (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) +#define INODE_USED_OFS \ + (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) +#define INODE_SIZE_OFS \ + (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) +#define INODE_HEIGHT_OFS (0) + +/** + * struct logfs_disk_dentry - on-medium dentry structure + * + * @ino: inode number + * @namelen: length of file name + * @type: file type, identical to bits 12..15 of mode + * @name: file name + */ +/* FIXME: add 6 bytes of padding to remove the __packed */ +struct logfs_disk_dentry { + __be64 ino; + __be16 namelen; + __u8 type; + __u8 name[LOGFS_MAX_NAMELEN]; +} __attribute__((packed)); + +SIZE_CHECK(logfs_disk_dentry, 266); + +#define RESERVED 0xffffffff +#define BADSEG 0xffffffff +/** + * struct logfs_segment_entry - segment file entry + * + * @ec_level: erase count and level + * @valid: number of valid bytes + * + * Segment file contains one entry for every segment. ec_level contains the + * erasecount in the upper 28 bits and the level in the lower 4 bits. An + * ec_level of BADSEG (-1) identifies bad segments. valid contains the number + * of valid bytes or RESERVED (-1 again) if the segment is used for either the + * superblock or the journal, or when the segment is bad. + */ +struct logfs_segment_entry { + __be32 ec_level; + __be32 valid; +}; + +SIZE_CHECK(logfs_segment_entry, 8); + +/** + * struct logfs_journal_header - header for journal entries (JEs) + * + * @h_crc: crc32 of journal entry + * @h_len: length of compressed journal entry, + * not including header + * @h_datalen: length of uncompressed data + * @h_type: JE type + * @h_compr: compression type + * @h_pad: reserved + */ +struct logfs_journal_header { + __be32 h_crc; + __be16 h_len; + __be16 h_datalen; + __be16 h_type; + __u8 h_compr; + __u8 h_pad[5]; +}; + +SIZE_CHECK(logfs_journal_header, 16); + +/* + * Life expectency of data. 
+ * VIM_DEFAULT - default vim + * VIM_SEGFILE - for segment file only - very short-living + * VIM_GC - GC'd data - likely long-living + */ +enum logfs_vim { + VIM_DEFAULT = 0, + VIM_SEGFILE = 1, +}; + +/** + * struct logfs_je_area - wbuf header + * + * @segno: segment number of area + * @used_bytes: number of bytes already used + * @gc_level: GC level + * @vim: life expectancy of data + * + * "Areas" are segments currently being used for writing. There is at least + * one area per GC level. Several may be used to separate long-living from + * short-living data. If an area with unknown vim is encountered, it can + * simply be closed. + * The write buffer immediately follow this header. + */ +struct logfs_je_area { + __be32 segno; + __be32 used_bytes; + __u8 gc_level; + __u8 vim; +} __attribute__((packed)); + +SIZE_CHECK(logfs_je_area, 10); + +#define MAX_JOURNAL_HEADER \ + (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) + +/** + * struct logfs_je_dynsb - dynamic superblock + * + * @ds_gec: global erase count + * @ds_sweeper: current position of GC "sweeper" + * @ds_rename_dir: source directory ino (see dir.c documentation) + * @ds_rename_pos: position of source dd (see dir.c documentation) + * @ds_victim_ino: victims of incomplete dir operation (see dir.c) + * @ds_victim_ino: parent inode of victim (see dir.c) + * @ds_used_bytes: number of used bytes + */ +struct logfs_je_dynsb { + __be64 ds_gec; + __be64 ds_sweeper; + + __be64 ds_rename_dir; + __be64 ds_rename_pos; + + __be64 ds_victim_ino; + __be64 ds_victim_parent; /* XXX */ + + __be64 ds_used_bytes; + __be32 ds_generation; + __be32 pad; +}; + +SIZE_CHECK(logfs_je_dynsb, 64); + +/** + * struct logfs_je_anchor - anchor of filesystem tree, aka master inode + * + * @da_size: size of inode file + * @da_last_ino: last created inode + * @da_used_bytes: number of bytes used + * @da_data: data pointers + */ +struct logfs_je_anchor { + __be64 da_size; + __be64 da_last_ino; + + __be64 da_used_bytes; + u8 da_height; + u8 pad[7]; + + __be64 da_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_je_anchor, 168); + +/** + * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) + * + * @so_segment: segments used for 2nd journal + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_spillout { + __be64 so_segment[0]; +}; + +SIZE_CHECK(logfs_je_spillout, 0); + +/** + * struct logfs_je_journal_ec - erase counts for all journal segments + * + * @ec: erase count + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_journal_ec { + __be32 ec[0]; +}; + +SIZE_CHECK(logfs_je_journal_ec, 0); + +/** + * struct logfs_je_free_segments - list of free segmetns with erase count + */ +struct logfs_je_free_segments { + __be32 segno; + __be32 ec; +}; + +SIZE_CHECK(logfs_je_free_segments, 8); + +/** + * struct logfs_seg_alias - list of segment aliases + */ +struct logfs_seg_alias { + __be32 old_segno; + __be32 new_segno; +}; + +SIZE_CHECK(logfs_seg_alias, 8); + +/** + * struct logfs_obj_alias - list of object aliases + */ +struct logfs_obj_alias { + __be64 ino; + __be64 bix; + __be64 val; + u8 level; + u8 pad[5]; + __be16 child_no; +}; + +SIZE_CHECK(logfs_obj_alias, 32); + +/** + * Compression types. + * + * COMPR_NONE - uncompressed + * COMPR_ZLIB - compressed with zlib + */ +enum { + COMPR_NONE = 0, + COMPR_ZLIB = 1, +}; + +/* + * Journal entries come in groups of 16. 
First group contains unique + * entries, next groups contain one entry per level + * + * JE_FIRST - smallest possible journal entry number + * + * JEG_BASE - base group, containing unique entries + * JE_COMMIT - commit entry, validates all previous entries + * JE_DYNSB - dynamic superblock, anything that ought to be in the + * superblock but cannot because it is read-write data + * JE_ANCHOR - anchor aka master inode aka inode file's inode + * JE_ERASECOUNT erasecounts for all journal segments + * JE_SPILLOUT - unused + * JE_SEG_ALIAS - aliases segments + * JE_AREA - area description + * + * JE_LAST - largest possible journal entry number + */ +enum { + JE_FIRST = 0x01, + + JEG_BASE = 0x00, + JE_COMMIT = 0x02, + JE_DYNSB = 0x03, + JE_ANCHOR = 0x04, + JE_ERASECOUNT = 0x05, + JE_SPILLOUT = 0x06, + JE_OBJ_ALIAS = 0x0d, + JE_AREA = 0x0e, + + JE_LAST = 0x0e, +}; + +#endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c new file mode 100644 index 00000000..d8d09380 --- /dev/null +++ b/fs/logfs/readwrite.c @@ -0,0 +1,2277 @@ +/* + * fs/logfs/readwrite.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * + * Actually contains five sets of very similar functions: + * read read blocks from a file + * seek_hole find next hole + * seek_data find next data block + * valid check whether a block still belongs to a file + * write write blocks to a file + * delete delete a block (for directories and ifile) + * rewrite move existing blocks of a file to a new location (gc helper) + * truncate truncate a file + */ +#include "logfs.h" +#include +#include + +static u64 adjust_bix(u64 bix, level_t level) +{ + switch (level) { + case 0: + return bix; + case LEVEL(1): + return max_t(u64, bix, I0_BLOCKS); + case LEVEL(2): + return max_t(u64, bix, I1_BLOCKS); + case LEVEL(3): + return max_t(u64, bix, I2_BLOCKS); + case LEVEL(4): + return max_t(u64, bix, I3_BLOCKS); + case LEVEL(5): + return max_t(u64, bix, I4_BLOCKS); + default: + WARN_ON(1); + return bix; + } +} + +static inline u64 maxbix(u8 height) +{ + return 1ULL << (LOGFS_BLOCK_BITS * height); +} + +/** + * The inode address space is cut in two halves. Lower half belongs to data + * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is + * set, the actual block index (bix) and level can be derived from the page + * index. + * + * The lowest three bits of the block index are set to 0 after packing and + * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored + * anyway this is harmless. 
+ */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. + */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + inode->i_uid = be32_to_cpu(di->di_uid); + inode->i_gid = be32_to_cpu(di->di_gid); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + inode->i_nlink = be32_to_cpu(di->di_refcount); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(inode->i_uid); + di->di_gid = cpu_to_be32(inode->i_gid); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + 
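+/*
+ * Editor's note (worked example, not part of the original patch): with the
+ * 4KiB blocksize (LOGFS_BLOCK_BITS == 9), a level-2 indirect block covering
+ * data block index 0x40000 packs and unpacks as shown below.  The lowest
+ * level*9 bits of bix are dropped by packing, which is harmless as explained
+ * in the comment above logfs_pack_index().  example_pack_unpack() is a
+ * hypothetical name used only for illustration.
+ */
+#if 0	/* sketch only, never compiled */
+static void example_pack_unpack(void)
+{
+	u64 bix = 0x40000;		/* 262144th data block */
+	level_t level = LEVEL(2);
+	pgoff_t index = logfs_pack_index(bix, level);
+	u64 bix2;
+	level_t level2;
+
+	/* index == INDIRECT_BIT | (2 << LEVEL_SHIFT) | (0x40000 >> 18) */
+	logfs_unpack_index(index, &bix2, &level2);
+	/* bix2 == 0x40000, level2 == LEVEL(2) */
+}
+#endif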
+static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. 
+ */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteles. + */ + BUG_ON(ret); +} + +static void inode_write_block(struct logfs_block *block) +{ + struct inode *inode; + int ret; + + inode = block->inode; + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + ret = __logfs_write_inode(inode, 0); + /* see indirect_write_block comment */ + BUG_ON(ret); + } +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static __be64 inode_val0(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 val; + + /* + * Explicit shifting generates good code, but must match the format + * of the structure. Add some paranoia just in case. + */ + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); + + val = (u64)inode->i_mode << 48 | + (u64)li->li_height << 40 | + (u64)li->li_flags; + return cpu_to_be64(val); +} + +static int inode_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + struct inode *inode = block->inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned long pos; + u64 ino , bix; + __be64 val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) + return 0; + + switch (pos) { + case INODE_HEIGHT_OFS: + val = inode_val0(inode); + break; + case INODE_USED_OFS: + val = cpu_to_be64(li->li_used_bytes); + break; + case INODE_SIZE_OFS: + val = cpu_to_be64(i_size_read(inode)); + break; + case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: + val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); + break; + default: + BUG(); + } + + ino = LOGFS_INO_MASTER; + bix = inode->i_ino; + level = LEVEL(0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +static int indirect_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + unsigned long pos; + struct page *page = block->page; + u64 ino , bix; + __be64 *child, val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + return 0; + + ino = page->mapping->host->i_ino; + logfs_unpack_index(page->index, &bix, &level); + child = kmap_atomic(page, KM_USER0); + val = child[pos]; + kunmap_atomic(child, KM_USER0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +int logfs_write_obj_aliases_pagecache(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int err; + + list_for_each_entry(block, &super->s_object_alias, alias_list) { + err = block->ops->write_alias(sb, block, write_alias_journal); + if (err) + return err; + } + return 0; +} + +void __free_block(struct super_block *sb, struct logfs_block *block) +{ + BUG_ON(!list_empty(&block->item_list)); + list_del(&block->alias_list); + mempool_free(block, logfs_super(sb)->s_block_pool); +} + +static void inode_free_block(struct super_block *sb, struct logfs_block *block) +{ + struct inode *inode = block->inode; + + logfs_inode(inode)->li_block = NULL; + __free_block(sb, block); +} + +static void indirect_free_block(struct super_block *sb, + struct logfs_block *block) +{ + ClearPagePrivate(block->page); + block->page->private = 0; + __free_block(sb, block); +} + + +static struct logfs_block_ops inode_block_ops = { + .write_block = inode_write_block, + .free_block = inode_free_block, + .write_alias = inode_write_alias, +}; + +struct logfs_block_ops indirect_block_ops = { + .write_block = indirect_write_block, + .free_block = indirect_free_block, + 
.write_alias = indirect_write_alias, +}; + +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + + block = mempool_alloc(super->s_block_pool, GFP_NOFS); + memset(block, 0, sizeof(*block)); + INIT_LIST_HEAD(&block->alias_list); + INIT_LIST_HEAD(&block->item_list); + block->sb = sb; + block->ino = ino; + block->bix = bix; + block->level = level; + return block; +} + +static void alloc_inode_block(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block; + + if (li->li_block) + return; + + block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); + block->inode = inode; + li->li_block = block; + block->ops = &inode_block_ops; +} + +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty) +{ + u64 ptr; + int i, start; + + block->partial = 0; + block->full = 0; + start = 0; + if (page->index < first_indirect_block()) { + /* Counters are pointless on level 0 */ + return; + } + if (page->index == first_indirect_block()) { + /* Skip unused pointers */ + start = I0_BLOCKS; + block->full = I0_BLOCKS; + } + if (!page_is_empty) { + for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { + ptr = be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page, KM_USER0); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array, KM_USER0); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page, KM_USER0); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array, KM_USER0); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t 
level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. 
If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else if (bix >= maxbix(li->li_height)) + return bix; + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. + */ + WARN_ON_ONCE(bix == maxbix(li->li_height)); + } + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + + if (bix < maxbix(li->li_height)) { + if (!li->li_data[INDIRECT_INDEX]) + bix = maxbix(li->li_height); + else + return seek_holedata_loop(inode, bix, 1); + } + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. + */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) +{ + return pure_ofs(li->li_data[bix]) == ofs; +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, + u64 ofs, u64 bofs) +{ + struct logfs_inode *li = logfs_inode(inode); + level_t level; + int ret; + struct page *page; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); + logfs_put_write_page(page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + + if (!bofs) + return 0; + + if (bix >= maxbix(li->li_height)) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, bix, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (bix < I0_BLOCKS) + return logfs_is_valid_direct(li, bix, ofs); + return logfs_is_valid_loop(inode, bix, ofs); +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb - superblock + * @ofs - block physical offset + * @ino - block inode number + * @bix - block index + * @level - block level + * + * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will + * become invalid once the journal is written. 
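+ *
+ * For illustration: an object that was rewritten since the last journal
+ * commit still has its old offset recorded in the shadow tree, so a
+ * lookup for that offset returns 2 and its space must not be reused
+ * yet; once the journal has been committed the shadow is dropped and
+ * the same lookup returns 0.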
+ */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. */ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_safe_iget(sb, ino, &cookie); + if (IS_ERR(inode)) + goto invalid; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_safe_iput(inode, cookie); + if (ret) + return ret; + +invalid: + /* Block is nominally invalid, but may still sit in the shadow tree, + * waiting for a journal commit. + */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + int ret; + + if (block && block->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) && + !list_empty(&super->s_writeback_list)) { + block = list_entry(super->s_writeback_list.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + } + if (!ret) { + alloc_data_block(inode, page); + block = logfs_block(page); + block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + list_move_tail(&block->alias_list, &super->s_writeback_list); + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. 
*/ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. + */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +struct write_control { + u64 ofs; + long flags; +}; + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, + level_t level, u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); + memset(shadow, 0, sizeof(*shadow)); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->gc_level = expand_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_shadow_pool); +} + +static void mark_segment(struct shadow_tree *tree, u32 segno) +{ + int err; + + if (!btree_lookup32(&tree->segment_map, segno)) { + err = btree_insert32(&tree->segment_map, segno, (void *)1, + GFP_NOFS); + BUG_ON(err); + tree->no_shadowed_segments++; + } +} + +/** + * fill_shadow_tree - Propagate shadow tree changes due to a write + * @inode: Inode owning the page + * @page: Struct page that was written + * @shadow: Shadow for the current write + * + * Writes in logfs can result in two semi-valid objects. The old object + * is still valid as long as it can be reached by following pointers on + * the medium. Only when writes propagate all the way up to the journal + * has the new object safely replaced the old one. 
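+ *
+ * As a rough example: when a data block is rewritten, the new object
+ * already exists at its new segment offset while the old object is
+ * still reachable through the not-yet-rewritten indirect block above
+ * it, so both are semi-valid until the pointer change has propagated
+ * up to the journal.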
+ * + * To handle this problem, a struct logfs_shadow is used to represent + * every single write. It is attached to the indirect block, which is + * marked dirty. When the indirect block is written, its shadows are + * handed up to the next indirect block (or inode). Ultimately they + * will reach the master inode and be freed upon journal commit. + * + * This function handles a single step in the propagation. It adds the + * shadow for the current write to the tree, along with any shadows in + * the page's tree, in case it was an indirect block. If a page is + * written, the inode parameter is left NULL; if an inode is written, + * the page parameter is left NULL. + */ +static void fill_shadow_tree(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + struct shadow_tree *tree = &super->s_shadow_tree; + + if (PagePrivate(page)) { + if (block->alias_map) + super->s_no_object_aliases -= bitmap_weight( + block->alias_map, LOGFS_BLOCK_FACTOR); + logfs_handle_transaction(inode, block->ta); + block->ops->free_block(inode->i_sb, block); + } + if (shadow) { + if (shadow->old_ofs) + btree_insert64(&tree->old, shadow->old_ofs, shadow, + GFP_NOFS); + else + btree_insert64(&tree->new, shadow->new_ofs, shadow, + GFP_NOFS); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; + mark_segment(tree, shadow->old_ofs >> super->s_segshift); + mark_segment(tree, shadow->new_ofs >> super->s_segshift); + } +} + +static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, + long child_no) +{ + struct logfs_super *super = logfs_super(sb); + + if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { + /* Aliases in the master inode are pointless. */ + return; + } + + if (!test_bit(child_no, block->alias_map)) { + set_bit(child_no, block->alias_map); + super->s_no_object_aliases++; + } + list_move_tail(&block->alias_list, &super->s_object_alias); +} + +/* + * Object aliases can and often do change the size and occupied space of a + * file. So not only do we have to change the pointers, we also have to + * change inode->i_size and li->li_used_bytes. This is done by setting + * another two object aliases for the inode itself. 
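+ * (INODE_SIZE_OFS and INODE_USED_OFS, as used by set_iused() below,
+ * are exactly those two aliases.)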
+ */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! 
is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
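+ /* Each pass adds one level of indirection: the fresh top-level
+ * block's slot 0 points at the old tree root, li_height grows by one
+ * and block indices up to the new maxbix(li->li_height) become
+ * addressable. */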
+ } + return 0; +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + pgoff_t index = page->index; + u64 bix; + level_t level; + int err; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + logfs_unpack_index(index, &bix, &level); + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + super->s_dirty_pages -= logfs_block(page)->reserved_bytes; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + bix = adjust_bix(bix, level); + err = grow_inode(inode, bix, level); + if (err) + return err; + return logfs_write_rec(inode, page, bix, level, flags); +} + +int logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = __logfs_write_buf(inode, page, flags); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + long flags = WF_DELETE; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + return logfs_write_rec(inode, page, page->index, 0, flags); +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_read_page(page); + + return ret; +} + +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + if (!err && shrink_level(gc_level) == 0) { + /* Rewrite cannot mark the inode dirty but has to + * write it immediately. + * Q: Can't we just create an alias for the inode + * instead? And if not, why not? + */ + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + err = __logfs_write_inode(inode, flags); + } + } + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page? 
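+ * Only when pageofs < size < pageofs + PAGE_SIZE does the block need
+ * to be read, zeroed from the new end of file onwards and written out
+ * again; in all other cases there is nothing to do here.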
*/ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, 
child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +/* + * Truncate, by changing the segment file, can consume a fair amount + * of resources. So back off from time to time and do some GC. + * 8 or 2048 blocks should be well within safety limits even if + * every single block resided in a different segment. + */ +#define TRUNCATE_STEP (8 * 1024 * 1024) +int logfs_truncate(struct inode *inode, u64 target) +{ + struct super_block *sb = inode->i_sb; + u64 size = i_size_read(inode); + int err = 0; + + size = ALIGN(size, TRUNCATE_STEP); + while (size > target) { + if (size > TRUNCATE_STEP) + size -= TRUNCATE_STEP; + else + size = 0; + if (size < target) + size = target; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + } + + if (!err) + err = vmtruncate(inode, target); + + /* I don't trust error recovery yet. 
*/ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? 
*/ + err = logfs_write_buf(master_inode, page, 0); + if (err) + move_page_to_inode(inode, page); + + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = 
__logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. + */ +void logfs_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + if (!inode->i_nlink) { + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + } + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + /* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @shadow_tree: shadow below this inode + * + * FIXME: All callers of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. 
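+ *
+ * The object number is mapped to a byte position as
+ * pos = bix << inode->i_sb->s_blocksize_bits, i.e. object number n
+ * occupies exactly one block at byte offset n * blocksize of its
+ * parent inode.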
+ */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + INIT_LIST_HEAD(&super->s_writeback_list); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + logfs_mempool_destroy(super->s_block_pool); + logfs_mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 00000000..9d518735 --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,932 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
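+ *
+ * On the medium each object is stored as a struct logfs_object_header
+ * followed by its (possibly compressed) payload; see
+ * __logfs_segment_write() below.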
+ */ +#include "logfs.h" +#include <linux/slab.h> + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize, ensure_erase); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + unlock_page(page); + } + return page; +} + +int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && !use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + if (IS_ERR(page)) + return PTR_ERR(page); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memcpy(page_address(page) + offset, buf, copylen); + SetPagePrivate(page); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); + return 0; +} + +static void pad_partial_page(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len % PAGE_SIZE) { + page = get_mapping_page(sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + SetPagePrivate(page); + page_cache_release(page); + } +} + +static void pad_full_pages(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + u32 len = super->s_segsize - area->a_used_bytes; + pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT; + pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT; + struct page *page; + + while (no_indizes) { + page = get_mapping_page(sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); + SetPagePrivate(page); + page_cache_release(page); + index++; + no_indizes--; + } +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + * Also make sure we allocate (and memset) all pages for final writeout. 
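+ * pad_partial_page() takes care of the tail of the current page, while
+ * pad_full_pages() (final writeout only) fills the remaining whole
+ * pages of the segment with 0xff.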
+ */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + pad_partial_page(area); + if (final) + pad_full_pages(area); +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions. + */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case 
OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int __logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. 
useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block 
index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_object_header h; + u16 len; + int err; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct 
super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 0); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? 
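(Probably only with a kmap() around every page_address() user such as
+	   wbuf_read(); lowmem pages are kept for now.  The GFP_NOFS set here
+	   is the part that matters: allocations against this mapping must
+	   never recurse back into the filesystem under memory pressure.)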
*/ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + logfs_mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); +} diff --git a/fs/logfs/super.c b/fs/logfs/super.c new file mode 100644 index 00000000..ce03a182 --- /dev/null +++ b/fs/logfs/super.c @@ -0,0 +1,671 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a segment. This function will + * write as much debug information as it can gather into the spare space. 
+ */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; +#ifdef CONFIG_BLOCK + if (sb->s_bdev) + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; +#endif +#ifdef CONFIG_MTD + if (sb->s_mtd) + sb->s_bdi = sb->s_mtd->backing_dev_info; +#endif + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block 
*sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return logfs_write_sb(sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? 
*/ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) { + iput(rootdir); + goto fail; + } + + /* at that point we know that ->put_super() will be called */ + super->s_erase_page = alloc_pages(GFP_KERNEL, 0); + if (!super->s_erase_page) + return -ENOMEM; + memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) { + __free_page(super->s_erase_page); + return err; + } + + log_super("LogFS: Finished mounting\n"); + return 0; + +fail: + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(last)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EINVAL; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + 
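	/*
+	 * Illustrative numbers only: with ds_segment_shift == 17 (128KB
+	 * segments), ds_block_shift == 12 (4KB blocks) and an 8GB device,
+	 * the derived counts come out as s_no_segs = 2^33 >> 17 = 65536
+	 * segments and s_no_blocks = 2^17 >> 12 = 32 blocks per segment.
+	 */ +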
super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb, int read_only) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + btree_init_mempool32(&super->s_shadow_tree.segment_map, + super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) + return -EIO; + if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && + !read_only) + return -EIO; + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(sb); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + if (super->s_erase_page) + __free_page(super->s_erase_page); + super->s_devops->put_device(super); + logfs_mempool_destroy(super->s_btree_pool); + logfs_mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +static struct dentry *logfs_get_sb_device(struct logfs_super *super, + struct file_system_type *type, int flags) +{ + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) { + super->s_devops->put_device(super); + kfree(super); + return ERR_CAST(sb); + } + + if (sb->s_root) { + /* Device is already in use */ + super->s_devops->put_device(super); + kfree(super); + return dget(sb->s_root); + } + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. 
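+	 *
+	 * In numbers (assuming 4KB pages): a 32bit page index covers
+	 * 2^32 * 2^12 = 2^44 bytes = 16TB of page cache, and the data
+	 * limit set below is (1ull << 43) - 1, i.e. just short of 8TB.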
+ */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + return dget(sb->s_root); + +err1: + /* no ->s_root, no ->put_super() */ + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); + deactivate_locked_super(sb); + return ERR_PTR(err); +} + +static struct dentry *logfs_mount(struct file_system_type *type, int flags, + const char *devname, void *data) +{ + ulong mtdnr; + struct logfs_super *super; + int err; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return ERR_PTR(-ENOMEM); + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + if (!devname) + err = logfs_get_sb_bdev(super, type, devname); + else if (strncmp(devname, "mtd", 3)) + err = logfs_get_sb_bdev(super, type, devname); + else { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + err = -EINVAL; + else + err = logfs_get_sb_mtd(super, mtdnr); + } + + if (err) { + kfree(super); + return ERR_PTR(err); + } + + return logfs_get_sb_device(super, type, flags); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .mount = logfs_mount, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel "); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/fs/mbcache.c b/fs/mbcache.c new file mode 100644 index 00000000..8c32ef3b --- /dev/null +++ b/fs/mbcache.c @@ -0,0 +1,620 @@ +/* + * linux/fs/mbcache.c + * (C) 2001-2002 Andreas Gruenbacher, + */ + +/* + * Filesystem Meta Information Block Cache (mbcache) + * + * The mbcache caches blocks of block devices that need to be located + * by their device/block number, as well as by other criteria (such + * as the block's contents). + * + * There can only be one cache entry in a cache per device and block number. + * Additional indexes need not be unique in this sense. The number of + * additional indexes (=other criteria) can be hardwired at compile time + * or specified at cache create time. + * + * Each cache entry is of fixed size. An entry may be `valid' or `invalid' + * in the cache. A valid entry is in the main hash tables of the cache, + * and may also be in the lru list. An invalid entry is not in any hashes + * or lists. + * + * A valid cache entry is only in the lru list if no handles refer to it. + * Invalid cache entries will be freed when the last handle to the cache + * entry is released. Entries that cannot be freed immediately are put + * back on the lru list. 
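+ *
+ * A rough usage sketch (variable names purely illustrative); -EBUSY
+ * from the insert only means that a concurrent insert won the race:
+ *
+ *	ce = mb_cache_entry_alloc(cache, GFP_NOFS);
+ *	if (ce) {
+ *		err = mb_cache_entry_insert(ce, bdev, block, key);
+ *		mb_cache_entry_release(ce);
+ *	}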
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +#ifdef MB_CACHE_DEBUG +# define mb_debug(f...) do { \ + printk(KERN_DEBUG f); \ + printk("\n"); \ + } while (0) +#define mb_assert(c) do { if (!(c)) \ + printk(KERN_ERR "assertion " #c " failed\n"); \ + } while(0) +#else +# define mb_debug(f...) do { } while(0) +# define mb_assert(c) do { } while(0) +#endif +#define mb_error(f...) do { \ + printk(KERN_ERR f); \ + printk("\n"); \ + } while(0) + +#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) + +static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); + +MODULE_AUTHOR("Andreas Gruenbacher "); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(mb_cache_create); +EXPORT_SYMBOL(mb_cache_shrink); +EXPORT_SYMBOL(mb_cache_destroy); +EXPORT_SYMBOL(mb_cache_entry_alloc); +EXPORT_SYMBOL(mb_cache_entry_insert); +EXPORT_SYMBOL(mb_cache_entry_release); +EXPORT_SYMBOL(mb_cache_entry_free); +EXPORT_SYMBOL(mb_cache_entry_get); +#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +EXPORT_SYMBOL(mb_cache_entry_find_first); +EXPORT_SYMBOL(mb_cache_entry_find_next); +#endif + +/* + * Global data: list of all mbcache's, lru list, and a spinlock for + * accessing cache data structures on SMP machines. The lru list is + * global across all mbcaches. + */ + +static LIST_HEAD(mb_cache_list); +static LIST_HEAD(mb_cache_lru_list); +static DEFINE_SPINLOCK(mb_cache_spinlock); + +/* + * What the mbcache registers as to get shrunk dynamically. + */ + +static int mb_cache_shrink_fn(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker mb_cache_shrinker = { + .shrink = mb_cache_shrink_fn, + .seeks = DEFAULT_SEEKS, +}; + +static inline int +__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +{ + return !list_empty(&ce->e_block_list); +} + + +static void +__mb_cache_entry_unhash(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_hashed(ce)) { + list_del_init(&ce->e_block_list); + list_del(&ce->e_index.o_list); + } +} + + +static void +__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) +{ + struct mb_cache *cache = ce->e_cache; + + mb_assert(!(ce->e_used || ce->e_queued)); + kmem_cache_free(cache->c_entry_cache, ce); + atomic_dec(&cache->c_entry_count); +} + + +static void +__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) + __releases(mb_cache_spinlock) +{ + /* Wake up all processes queuing for this cache entry. */ + if (ce->e_queued) + wake_up_all(&mb_cache_queue); + if (ce->e_used >= MB_CACHE_WRITER) + ce->e_used -= MB_CACHE_WRITER; + ce->e_used--; + if (!(ce->e_used || ce->e_queued)) { + if (!__mb_cache_entry_is_hashed(ce)) + goto forget; + mb_assert(list_empty(&ce->e_lru_list)); + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + } + spin_unlock(&mb_cache_spinlock); + return; +forget: + spin_unlock(&mb_cache_spinlock); + __mb_cache_entry_forget(ce, GFP_KERNEL); +} + + +/* + * mb_cache_shrink_fn() memory pressure callback + * + * This function is called by the kernel memory management when memory + * gets low. + * + * @shrink: (ignored) + * @sc: shrink_control passed from reclaim + * + * Returns the number of objects which are present in the cache. 
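+ * For example, with 250 cached entries and the default
+ * sysctl_vfs_cache_pressure of 100 this reports (250 / 100) * 100 = 200
+ * objects; the integer division makes the figure approximate, which is
+ * good enough for reclaim.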
+ */ +static int +mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(free_list); + struct mb_cache *cache; + struct mb_cache_entry *entry, *tmp; + int count = 0; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + mb_debug("trying to free %d entries", nr_to_scan); + spin_lock(&mb_cache_spinlock); + while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { + struct mb_cache_entry *ce = + list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_move_tail(&ce->e_lru_list, &free_list); + __mb_cache_entry_unhash(ce); + } + list_for_each_entry(cache, &mb_cache_list, c_cache_list) { + mb_debug("cache %s (%d)", cache->c_name, + atomic_read(&cache->c_entry_count)); + count += atomic_read(&cache->c_entry_count); + } + spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(entry, gfp_mask); + } + return (count / 100) * sysctl_vfs_cache_pressure; +} + + +/* + * mb_cache_create() create a new cache + * + * All entries in one cache are equal size. Cache entries may be from + * multiple devices. If this is the first mbcache created, registers + * the cache with kernel memory management. Returns NULL if no more + * memory was available. + * + * @name: name of the cache (informal) + * @bucket_bits: log2(number of hash buckets) + */ +struct mb_cache * +mb_cache_create(const char *name, int bucket_bits) +{ + int n, bucket_count = 1 << bucket_bits; + struct mb_cache *cache = NULL; + + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); + if (!cache) + return NULL; + cache->c_name = name; + atomic_set(&cache->c_entry_count, 0); + cache->c_bucket_bits = bucket_bits; + cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), + GFP_KERNEL); + if (!cache->c_block_hash) + goto fail; + for (n=0; nc_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), + GFP_KERNEL); + if (!cache->c_index_hash) + goto fail; + for (n=0; nc_index_hash[n]); + cache->c_entry_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!cache->c_entry_cache) + goto fail2; + + /* + * Set an upper limit on the number of cache entries so that the hash + * chains won't grow too long. + */ + cache->c_max_entries = bucket_count << 4; + + spin_lock(&mb_cache_spinlock); + list_add(&cache->c_cache_list, &mb_cache_list); + spin_unlock(&mb_cache_spinlock); + return cache; + +fail2: + kfree(cache->c_index_hash); + +fail: + kfree(cache->c_block_hash); + kfree(cache); + return NULL; +} + + +/* + * mb_cache_shrink() + * + * Removes all cache entries of a device from the cache. All cache entries + * currently in use cannot be freed, and thus remain in the cache. All others + * are freed. 
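+ * Typically used when a block device is about to go away (for instance
+ * from a filesystem's unmount path), so that stale entries do not
+ * linger for a device that no longer exists.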
+ * + * @bdev: which device's cache entries to shrink + */ +void +mb_cache_shrink(struct block_device *bdev) +{ + LIST_HEAD(free_list); + struct list_head *l, *ltmp; + + spin_lock(&mb_cache_spinlock); + list_for_each_safe(l, ltmp, &mb_cache_lru_list) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_bdev == bdev) { + list_move_tail(&ce->e_lru_list, &free_list); + __mb_cache_entry_unhash(ce); + } + } + spin_unlock(&mb_cache_spinlock); + list_for_each_safe(l, ltmp, &free_list) { + __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, + e_lru_list), GFP_KERNEL); + } +} + + +/* + * mb_cache_destroy() + * + * Shrinks the cache to its minimum possible size (hopefully 0 entries), + * and then destroys it. If this was the last mbcache, un-registers the + * mbcache from kernel memory management. + */ +void +mb_cache_destroy(struct mb_cache *cache) +{ + LIST_HEAD(free_list); + struct list_head *l, *ltmp; + + spin_lock(&mb_cache_spinlock); + list_for_each_safe(l, ltmp, &mb_cache_lru_list) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_move_tail(&ce->e_lru_list, &free_list); + __mb_cache_entry_unhash(ce); + } + } + list_del(&cache->c_cache_list); + spin_unlock(&mb_cache_spinlock); + + list_for_each_safe(l, ltmp, &free_list) { + __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, + e_lru_list), GFP_KERNEL); + } + + if (atomic_read(&cache->c_entry_count) > 0) { + mb_error("cache %s: %d orphaned entries", + cache->c_name, + atomic_read(&cache->c_entry_count)); + } + + kmem_cache_destroy(cache->c_entry_cache); + + kfree(cache->c_index_hash); + kfree(cache->c_block_hash); + kfree(cache); +} + +/* + * mb_cache_entry_alloc() + * + * Allocates a new cache entry. The new entry will not be valid initially, + * and thus cannot be looked up yet. It should be filled with data, and + * then inserted into the cache using mb_cache_entry_insert(). Returns NULL + * if no more memory was available. + */ +struct mb_cache_entry * +mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) +{ + struct mb_cache_entry *ce = NULL; + + if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + spin_lock(&mb_cache_spinlock); + if (!list_empty(&mb_cache_lru_list)) { + ce = list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + __mb_cache_entry_unhash(ce); + } + spin_unlock(&mb_cache_spinlock); + } + if (!ce) { + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_LIST_HEAD(&ce->e_block_list); + ce->e_cache = cache; + ce->e_queued = 0; + } + ce->e_used = 1 + MB_CACHE_WRITER; + return ce; +} + + +/* + * mb_cache_entry_insert() + * + * Inserts an entry that was allocated using mb_cache_entry_alloc() into + * the cache. After this, the cache entry can be looked up, but is not yet + * in the lru list as the caller still holds a handle to it. Returns 0 on + * success, or -EBUSY if a cache entry for that device + inode exists + * already (this may happen after a failed lookup, but when another process + * has inserted the same cache entry in the meantime). 
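+ * On -EBUSY the caller still holds its handle to the (never hashed)
+ * entry; dropping that handle with mb_cache_entry_release() frees the
+ * entry again, so no separate cleanup is needed.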
+ * + * @bdev: device the cache entry belongs to + * @block: block number + * @key: lookup key + */ +int +mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, + sector_t block, unsigned int key) +{ + struct mb_cache *cache = ce->e_cache; + unsigned int bucket; + struct list_head *l; + int error = -EBUSY; + + bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), + cache->c_bucket_bits); + spin_lock(&mb_cache_spinlock); + list_for_each_prev(l, &cache->c_block_hash[bucket]) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_block_list); + if (ce->e_bdev == bdev && ce->e_block == block) + goto out; + } + __mb_cache_entry_unhash(ce); + ce->e_bdev = bdev; + ce->e_block = block; + list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + ce->e_index.o_key = key; + bucket = hash_long(key, cache->c_bucket_bits); + list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); + error = 0; +out: + spin_unlock(&mb_cache_spinlock); + return error; +} + + +/* + * mb_cache_entry_release() + * + * Release a handle to a cache entry. When the last handle to a cache entry + * is released it is either freed (if it is invalid) or otherwise inserted + * in to the lru list. + */ +void +mb_cache_entry_release(struct mb_cache_entry *ce) +{ + spin_lock(&mb_cache_spinlock); + __mb_cache_entry_release_unlock(ce); +} + + +/* + * mb_cache_entry_free() + * + * This is equivalent to the sequence mb_cache_entry_takeout() -- + * mb_cache_entry_release(). + */ +void +mb_cache_entry_free(struct mb_cache_entry *ce) +{ + spin_lock(&mb_cache_spinlock); + mb_assert(list_empty(&ce->e_lru_list)); + __mb_cache_entry_unhash(ce); + __mb_cache_entry_release_unlock(ce); +} + + +/* + * mb_cache_entry_get() + * + * Get a cache entry by device / block number. (There can only be one entry + * in the cache per device and block.) Returns NULL if no such cache entry + * exists. The returned cache entry is locked for exclusive access ("single + * writer"). 
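+ * The lookup waits (uninterruptibly) until every other holder has
+ * dropped the entry, and the returned handle counts as the single
+ * writer, so it must be dropped again with mb_cache_entry_release()
+ * or mb_cache_entry_free() before anyone else can obtain the entry.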
+ */ +struct mb_cache_entry * +mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, + sector_t block) +{ + unsigned int bucket; + struct list_head *l; + struct mb_cache_entry *ce; + + bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), + cache->c_bucket_bits); + spin_lock(&mb_cache_spinlock); + list_for_each(l, &cache->c_block_hash[bucket]) { + ce = list_entry(l, struct mb_cache_entry, e_block_list); + if (ce->e_bdev == bdev && ce->e_block == block) { + DEFINE_WAIT(wait); + + if (!list_empty(&ce->e_lru_list)) + list_del_init(&ce->e_lru_list); + + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&mb_cache_spinlock); + schedule(); + spin_lock(&mb_cache_spinlock); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + ce->e_used += 1 + MB_CACHE_WRITER; + + if (!__mb_cache_entry_is_hashed(ce)) { + __mb_cache_entry_release_unlock(ce); + return NULL; + } + goto cleanup; + } + } + ce = NULL; + +cleanup: + spin_unlock(&mb_cache_spinlock); + return ce; +} + +#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) + +static struct mb_cache_entry * +__mb_cache_entry_find(struct list_head *l, struct list_head *head, + struct block_device *bdev, unsigned int key) +{ + while (l != head) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_index.o_list); + if (ce->e_bdev == bdev && ce->e_index.o_key == key) { + DEFINE_WAIT(wait); + + if (!list_empty(&ce->e_lru_list)) + list_del_init(&ce->e_lru_list); + + /* Incrementing before holding the lock gives readers + priority over writers. */ + ce->e_used++; + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&mb_cache_spinlock); + schedule(); + spin_lock(&mb_cache_spinlock); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + + if (!__mb_cache_entry_is_hashed(ce)) { + __mb_cache_entry_release_unlock(ce); + spin_lock(&mb_cache_spinlock); + return ERR_PTR(-EAGAIN); + } + return ce; + } + l = l->next; + } + return NULL; +} + + +/* + * mb_cache_entry_find_first() + * + * Find the first cache entry on a given device with a certain key in + * an additional index. Additional matches can be found with + * mb_cache_entry_find_next(). Returns NULL if no match was found. The + * returned cache entry is locked for shared access ("multiple readers"). + * + * @cache: the cache to search + * @bdev: the device the cache entry should belong to + * @key: the key in the index + */ +struct mb_cache_entry * +mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, + unsigned int key) +{ + unsigned int bucket = hash_long(key, cache->c_bucket_bits); + struct list_head *l; + struct mb_cache_entry *ce; + + spin_lock(&mb_cache_spinlock); + l = cache->c_index_hash[bucket].next; + ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); + spin_unlock(&mb_cache_spinlock); + return ce; +} + + +/* + * mb_cache_entry_find_next() + * + * Find the next cache entry on a given device with a certain key in an + * additional index. Returns NULL if no match could be found. The previous + * entry is atomatically released, so that mb_cache_entry_find_next() can + * be called like this: + * + * entry = mb_cache_entry_find_first(); + * while (entry) { + * ... 
+ * entry = mb_cache_entry_find_next(entry, ...); + * } + * + * @prev: The previous match + * @bdev: the device the cache entry should belong to + * @key: the key in the index + */ +struct mb_cache_entry * +mb_cache_entry_find_next(struct mb_cache_entry *prev, + struct block_device *bdev, unsigned int key) +{ + struct mb_cache *cache = prev->e_cache; + unsigned int bucket = hash_long(key, cache->c_bucket_bits); + struct list_head *l; + struct mb_cache_entry *ce; + + spin_lock(&mb_cache_spinlock); + l = prev->e_index.o_list.next; + ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); + __mb_cache_entry_release_unlock(prev); + return ce; +} + +#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ + +static int __init init_mbcache(void) +{ + register_shrinker(&mb_cache_shrinker); + return 0; +} + +static void __exit exit_mbcache(void) +{ + unregister_shrinker(&mb_cache_shrinker); +} + +module_init(init_mbcache) +module_exit(exit_mbcache) + diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig new file mode 100644 index 00000000..6624684d --- /dev/null +++ b/fs/minix/Kconfig @@ -0,0 +1,25 @@ +config MINIX_FS + tristate "Minix file system support" + depends on BLOCK + help + Minix is a simple operating system used in many classes about OS's. + The minix file system (method to organize files on a hard disk + partition or a floppy disk) was the original file system for Linux, + but has been superseded by the second extended file system ext2fs. + You don't want to use the minix file system on your hard disk + because of certain built-in restrictions, but it is sometimes found + on older Linux floppy disks. This option will enlarge your kernel + by about 28 KB. If unsure, say N. + + To compile this file system support as a module, choose M here: the + module will be called minix. Note that the file system of your root + partition (the one containing the directory /) cannot be compiled as + a module. + +config MINIX_FS_NATIVE_ENDIAN + def_bool MINIX_FS + depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + +config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED + def_bool MINIX_FS + depends on M68K && MMU diff --git a/fs/minix/Makefile b/fs/minix/Makefile new file mode 100644 index 00000000..3063015a --- /dev/null +++ b/fs/minix/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux minix filesystem routines. 
+# + +obj-$(CONFIG_MINIX_FS) += minix.o + +minix-objs := bitmap.o itree_v1.o itree_v2.o namei.o inode.o file.o dir.o diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c new file mode 100644 index 00000000..3f32bcb0 --- /dev/null +++ b/fs/minix/bitmap.c @@ -0,0 +1,279 @@ +/* + * linux/fs/minix/bitmap.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Modified for 680x0 by Hamish Macdonald + * Fixed for 680x0 by Andreas Schwab + */ + +/* bitmap.c contains the code that handles the inode and block bitmaps */ + +#include "minix.h" +#include +#include +#include + +static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; + +static DEFINE_SPINLOCK(bitmap_lock); + +static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) +{ + unsigned i, j, sum = 0; + struct buffer_head *bh; + + for (i=0; ib_size; j++) + sum += nibblemap[bh->b_data[j] & 0xf] + + nibblemap[(bh->b_data[j]>>4) & 0xf]; + } + + if (numblocks==0 || !(bh=map[numblocks-1])) + return(0); + i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; + for (j=0; jb_data[j] & 0xf] + + nibblemap[(bh->b_data[j]>>4) & 0xf]; + } + + i = numbits%16; + if (i!=0) { + i = *(__u16 *)(&bh->b_data[j]) | ~((1<>4) & 0xf]; + sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; + } + return(sum); +} + +void minix_free_block(struct inode *inode, unsigned long block) +{ + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long bit, zone; + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("Trying to free block not in datazone\n"); + return; + } + zone = block - sbi->s_firstdatazone + 1; + bit = zone & ((1<>= k; + if (zone >= sbi->s_zmap_blocks) { + printk("minix_free_block: nonexistent bitmap buffer\n"); + return; + } + bh = sbi->s_zmap[zone]; + spin_lock(&bitmap_lock); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_block (%s:%lu): bit already cleared\n", + sb->s_id, block); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + return; +} + +int minix_new_block(struct inode * inode) +{ + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + int bits_per_zone = 8 * inode->i_sb->s_blocksize; + int i; + + for (i = 0; i < sbi->s_zmap_blocks; i++) { + struct buffer_head *bh = sbi->s_zmap[i]; + int j; + + spin_lock(&bitmap_lock); + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) { + minix_set_bit(j, bh->b_data); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + j += i * bits_per_zone + sbi->s_firstdatazone-1; + if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) + break; + return j; + } + spin_unlock(&bitmap_lock); + } + return 0; +} + +unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) +{ + return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, + sbi->s_nzones - sbi->s_firstdatazone + 1) + << sbi->s_log_zone_size); +} + +struct minix_inode * +minix_V1_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) +{ + int block; + struct minix_sb_info *sbi = minix_sb(sb); + struct minix_inode *p; + + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %ld is out of range\n", + sb->s_id, (long)ino); + return NULL; + } + ino--; + block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + + ino / MINIX_INODES_PER_BLOCK; + *bh = sb_bread(sb, block); + if (!*bh) { + printk("Unable to read inode block\n"); + return NULL; + } + p = (void *)(*bh)->b_data; + return p + ino % 
MINIX_INODES_PER_BLOCK; +} + +struct minix2_inode * +minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) +{ + int block; + struct minix_sb_info *sbi = minix_sb(sb); + struct minix2_inode *p; + int minix2_inodes_per_block = sb->s_blocksize / sizeof(struct minix2_inode); + + *bh = NULL; + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %ld is out of range\n", + sb->s_id, (long)ino); + return NULL; + } + ino--; + block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + + ino / minix2_inodes_per_block; + *bh = sb_bread(sb, block); + if (!*bh) { + printk("Unable to read inode block\n"); + return NULL; + } + p = (void *)(*bh)->b_data; + return p + ino % minix2_inodes_per_block; +} + +/* Clear the link count and mode of a deleted inode on disk. */ + +static void minix_clear_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (INODE_VERSION(inode) == MINIX_V1) { + struct minix_inode *raw_inode; + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (raw_inode) { + raw_inode->i_nlinks = 0; + raw_inode->i_mode = 0; + } + } else { + struct minix2_inode *raw_inode; + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (raw_inode) { + raw_inode->i_nlinks = 0; + raw_inode->i_mode = 0; + } + } + if (bh) { + mark_buffer_dirty(bh); + brelse (bh); + } +} + +void minix_free_inode(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long ino, bit; + + ino = inode->i_ino; + if (ino < 1 || ino > sbi->s_ninodes) { + printk("minix_free_inode: inode 0 or nonexistent inode\n"); + return; + } + bit = ino & ((1<>= k; + if (ino >= sbi->s_imap_blocks) { + printk("minix_free_inode: nonexistent imap in superblock\n"); + return; + } + + minix_clear_inode(inode); /* clear on-disk copy */ + + bh = sbi->s_imap[ino]; + spin_lock(&bitmap_lock); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_inode: bit %lu already cleared\n", bit); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); +} + +struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) +{ + struct super_block *sb = dir->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct inode *inode = new_inode(sb); + struct buffer_head * bh; + int bits_per_zone = 8 * sb->s_blocksize; + unsigned long j; + int i; + + if (!inode) { + *error = -ENOMEM; + return NULL; + } + j = bits_per_zone; + bh = NULL; + *error = -ENOSPC; + spin_lock(&bitmap_lock); + for (i = 0; i < sbi->s_imap_blocks; i++) { + bh = sbi->s_imap[i]; + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) + break; + } + if (!bh || j >= bits_per_zone) { + spin_unlock(&bitmap_lock); + iput(inode); + return NULL; + } + if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ + spin_unlock(&bitmap_lock); + printk("minix_new_inode: bit already set\n"); + iput(inode); + return NULL; + } + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + j += i * bits_per_zone; + if (!j || j > sbi->s_ninodes) { + iput(inode); + return NULL; + } + inode_init_owner(inode, dir, mode); + inode->i_ino = j; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + *error = 0; + return inode; +} + +unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) +{ + 
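	/*
+	 * count_free() above sums zero bits a nibble at a time through the
+	 * nibblemap[] table - e.g. nibblemap[0x5] == 2 since 0101 has two
+	 * clear bits - and the "+ 1" passed below covers the reserved bit
+	 * for inode 0 at the start of the inode map.
+	 */ +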
return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); +} diff --git a/fs/minix/dir.c b/fs/minix/dir.c new file mode 100644 index 00000000..085a9262 --- /dev/null +++ b/fs/minix/dir.c @@ -0,0 +1,477 @@ +/* + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * minix directory handling functions + * + * Updated to filesystem version 3 by Daniel Aragones + */ + +#include "minix.h" +#include +#include +#include + +typedef struct minix_dir_entry minix_dirent; +typedef struct minix3_dir_entry minix3_dirent; + +static int minix_readdir(struct file *, void *, filldir_t); + +const struct file_operations minix_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = minix_readdir, + .fsync = generic_file_fsync, +}; + +static inline void dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +minix_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = PAGE_CACHE_SIZE; + + if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT)) + last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1); + return last_byte; +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static struct page * dir_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) +{ + return (void*)((char*)de + sbi->s_dirsize); +} + +static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned long pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + struct minix_sb_info *sbi = minix_sb(sb); + unsigned chunk_size = sbi->s_dirsize; + char *name; + __u32 inumber; + + pos = (pos + chunk_size-1) & ~(chunk_size-1); + if (pos >= inode->i_size) + goto done; + + for ( ; n < npages; n++, offset = 0) { + char *p, *kaddr, *limit; + struct page *page = dir_get_page(inode, n); + + if (IS_ERR(page)) + continue; + kaddr = (char *)page_address(page); + p = kaddr+offset; + limit = kaddr + minix_last_byte(inode, n) - chunk_size; + for ( ; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + if (inumber) { + int over; + + unsigned l = strnlen(name, sbi->s_namelen); + offset = p - kaddr; + over = filldir(dirent, name, l, + (n << PAGE_CACHE_SHIFT) | offset, + inumber, DT_UNKNOWN); + if (over) { + dir_put_page(page); + goto done; + } + } + } + dir_put_page(page); 
+ } + +done: + filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; + return 0; +} + +static inline int namecompare(int len, int maxlen, + const char * name, const char * buffer) +{ + if (len < maxlen && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +/* + * minix_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + */ +minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) +{ + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct inode * dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct minix_sb_info * sbi = minix_sb(sb); + unsigned long n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + char *p; + + char *namx; + __u32 inumber; + *res_page = NULL; + + for (n = 0; n < npages; n++) { + char *kaddr, *limit; + + page = dir_get_page(dir, n); + if (IS_ERR(page)) + continue; + + kaddr = (char*)page_address(page); + limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + namx = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + namx = de->name; + inumber = de->inode; + } + if (!inumber) + continue; + if (namecompare(namelen, sbi->s_namelen, name, namx)) + goto found; + } + dir_put_page(page); + } + return NULL; + +found: + *res_page = page; + return (minix_dirent *)p; +} + +int minix_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct super_block * sb = dir->i_sb; + struct minix_sb_info * sbi = minix_sb(sb); + struct page *page = NULL; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr, *p; + minix_dirent *de; + minix3_dirent *de3; + loff_t pos; + int err; + char *namx = NULL; + __u32 inumber; + + /* + * We take care of directory expansion in the same loop + * This code plays outside i_size, so it locks the page + * to protect that region. 
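+	 * Note that the loop below runs to n == npages inclusive: the page
+	 * just past the current end of the directory is where a new entry
+	 * lands once every existing slot is in use.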
+ */ + for (n = 0; n <= npages; n++) { + char *limit, *dir_end; + + page = dir_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = (char*)page_address(page); + dir_end = kaddr + minix_last_byte(dir, n); + limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + de = (minix_dirent *)p; + de3 = (minix3_dirent *)p; + if (sbi->s_version == MINIX_V3) { + namx = de3->name; + inumber = de3->inode; + } else { + namx = de->name; + inumber = de->inode; + } + if (p == dir_end) { + /* We hit i_size */ + if (sbi->s_version == MINIX_V3) + de3->inode = 0; + else + de->inode = 0; + goto got_it; + } + if (!inumber) + goto got_it; + err = -EEXIST; + if (namecompare(namelen, sbi->s_namelen, name, namx)) + goto out_unlock; + } + unlock_page(page); + dir_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + p - (char *)page_address(page); + err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + if (err) + goto out_unlock; + memcpy (namx, name, namelen); + if (sbi->s_version == MINIX_V3) { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4); + de3->inode = inode->i_ino; + } else { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); + de->inode = inode->i_ino; + } + err = dir_commit_chunk(page, pos, sbi->s_dirsize); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +out_put: + dir_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int minix_delete_entry(struct minix_dir_entry *de, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr = page_address(page); + loff_t pos = page_offset(page) + (char*)de - kaddr; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + unsigned len = sbi->s_dirsize; + int err; + + lock_page(page); + err = minix_prepare_chunk(page, pos, len); + if (err == 0) { + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = 0; + else + de->inode = 0; + err = dir_commit_chunk(page, pos, len); + } else { + unlock_page(page); + } + dir_put_page(page); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return err; +} + +int minix_make_empty(struct inode *inode, struct inode *dir) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *kaddr; + int err; + + if (!page) + return -ENOMEM; + err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); + if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)kaddr; + + de3->inode = inode->i_ino; + strcpy(de3->name, "."); + de3 = minix_next_entry(de3, sbi); + de3->inode = dir->i_ino; + strcpy(de3->name, ".."); + } else { + minix_dirent *de = (minix_dirent *)kaddr; + + de->inode = inode->i_ino; + strcpy(de->name, "."); + de = minix_next_entry(de, sbi); + de->inode = dir->i_ino; + strcpy(de->name, ".."); + } + kunmap_atomic(kaddr, KM_USER0); + + err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int minix_empty_dir(struct inode * inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *name; + __u32 inumber; + + for (i = 0; i < npages; i++) { 
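+		/*
+		 * An in-use slot is only tolerated if it is "." pointing back
+		 * at this inode, or a ".." entry (matched by name alone);
+		 * anything else means the directory is not empty.
+		 */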
+ char *p, *kaddr, *limit; + + page = dir_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = (char *)page_address(page); + limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + + if (inumber != 0) { + /* check for . and .. */ + if (name[0] != '.') + goto not_empty; + if (!name[1]) { + if (inumber != inode->i_ino) + goto not_empty; + } else if (name[1] != '.') + goto not_empty; + else if (name[2]) + goto not_empty; + } + } + dir_put_page(page); + } + return 1; + +not_empty: + dir_put_page(page); + return 0; +} + +/* Releases the page */ +void minix_set_link(struct minix_dir_entry *de, struct page *page, + struct inode *inode) +{ + struct inode *dir = page->mapping->host; + struct minix_sb_info *sbi = minix_sb(dir->i_sb); + loff_t pos = page_offset(page) + + (char *)de-(char*)page_address(page); + int err; + + lock_page(page); + + err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + if (err == 0) { + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = inode->i_ino; + else + de->inode = inode->i_ino; + err = dir_commit_chunk(page, pos, sbi->s_dirsize); + } else { + unlock_page(page); + } + dir_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + +struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = dir_get_page(dir, 0); + struct minix_sb_info *sbi = minix_sb(dir->i_sb); + struct minix_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = minix_next_entry(page_address(page), sbi); + *p = page; + } + return de; +} + +ino_t minix_inode_by_name(struct dentry *dentry) +{ + struct page *page; + struct minix_dir_entry *de = minix_find_entry(dentry, &page); + ino_t res = 0; + + if (de) { + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + + if (sbi->s_version == MINIX_V3) + res = ((minix3_dirent *) de)->inode; + else + res = de->inode; + dir_put_page(page); + } + return res; +} diff --git a/fs/minix/file.c b/fs/minix/file.c new file mode 100644 index 00000000..4493ce69 --- /dev/null +++ b/fs/minix/file.c @@ -0,0 +1,51 @@ +/* + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * minix regular file handling primitives + */ + +#include "minix.h" + +/* + * We have mostly NULLs here: the current defaults are OK for + * the minix filesystem. 
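[Editorial note on the record format the directory code above walks: the scan advances in fixed steps of sbi->s_dirsize bytes, and minix_add_link() clears the name tail with s_dirsize - namelen - 2 for V1/V2 but - 4 for V3, i.e. the inode number is 16 bits in the old formats and 32 bits in V3. A sketch of the implied on-disk records; the struct names here are illustrative, not the kernel's own definitions:]

struct sketch_dirent_v12 {              /* s_dirsize = 16 or 32 bytes */
        unsigned short inode;           /* 0 marks a free slot */
        char name[14];                  /* or 30; NUL-padded to the record end */
};

struct sketch_dirent_v3 {               /* s_dirsize = 64 bytes */
        unsigned int inode;
        char name[60];                  /* s_namelen = 60, NUL-padded */
};

[minix_make_empty() above simply drops two such records, "." and "..", at the start of the first page of a new directory.]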
+ */ +const struct file_operations minix_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int minix_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations minix_file_inode_operations = { + .truncate = minix_truncate, + .setattr = minix_setattr, + .getattr = minix_getattr, +}; diff --git a/fs/minix/inode.c b/fs/minix/inode.c new file mode 100644 index 00000000..adcdc0a4 --- /dev/null +++ b/fs/minix/inode.c @@ -0,0 +1,661 @@ +/* + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Copyright (C) 1996 Gertjan van Wingerde + * Minix V2 fs support. + * + * Modified for 680x0 by Andreas Schwab + * Updated to filesystem version 3 by Daniel Aragones + */ + +#include +#include "minix.h" +#include +#include +#include +#include +#include +#include + +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); +static int minix_remount (struct super_block * sb, int * flags, char * data); + +static void minix_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + if (!inode->i_nlink) { + inode->i_size = 0; + minix_truncate(inode); + } + invalidate_inode_buffers(inode); + end_writeback(inode); + if (!inode->i_nlink) + minix_free_inode(inode); +} + +static void minix_put_super(struct super_block *sb) +{ + int i; + struct minix_sb_info *sbi = minix_sb(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + sbi->s_ms->s_state = sbi->s_mount_state; + mark_buffer_dirty(sbi->s_sbh); + } + for (i = 0; i < sbi->s_imap_blocks; i++) + brelse(sbi->s_imap[i]); + for (i = 0; i < sbi->s_zmap_blocks; i++) + brelse(sbi->s_zmap[i]); + brelse (sbi->s_sbh); + kfree(sbi->s_imap); + sb->s_fs_info = NULL; + kfree(sbi); +} + +static struct kmem_cache * minix_inode_cachep; + +static struct inode *minix_alloc_inode(struct super_block *sb) +{ + struct minix_inode_info *ei; + ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void minix_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(minix_inode_cachep, minix_i(inode)); +} + +static void minix_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, minix_i_callback); +} + +static void init_once(void *foo) +{ + struct minix_inode_info *ei = (struct minix_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + minix_inode_cachep = kmem_cache_create("minix_inode_cache", + sizeof(struct minix_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (minix_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + 
kmem_cache_destroy(minix_inode_cachep); +} + +static const struct super_operations minix_sops = { + .alloc_inode = minix_alloc_inode, + .destroy_inode = minix_destroy_inode, + .write_inode = minix_write_inode, + .evict_inode = minix_evict_inode, + .put_super = minix_put_super, + .statfs = minix_statfs, + .remount_fs = minix_remount, +}; + +static int minix_remount (struct super_block * sb, int * flags, char * data) +{ + struct minix_sb_info * sbi = minix_sb(sb); + struct minix_super_block * ms; + + ms = sbi->s_ms; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (*flags & MS_RDONLY) { + if (ms->s_state & MINIX_VALID_FS || + !(sbi->s_mount_state & MINIX_VALID_FS)) + return 0; + /* Mounting a rw partition read-only. */ + if (sbi->s_version != MINIX_V3) + ms->s_state = sbi->s_mount_state; + mark_buffer_dirty(sbi->s_sbh); + } else { + /* Mount a partition which is read-only, read-write. */ + if (sbi->s_version != MINIX_V3) { + sbi->s_mount_state = ms->s_state; + ms->s_state &= ~MINIX_VALID_FS; + } else { + sbi->s_mount_state = MINIX_VALID_FS; + } + mark_buffer_dirty(sbi->s_sbh); + + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs warning: remounting unchecked fs, " + "running fsck is recommended\n"); + else if ((sbi->s_mount_state & MINIX_ERROR_FS)) + printk("MINIX-fs warning: remounting fs with errors, " + "running fsck is recommended\n"); + } + return 0; +} + +static int minix_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh; + struct buffer_head **map; + struct minix_super_block *ms; + struct minix3_super_block *m3s = NULL; + unsigned long i, block; + struct inode *root_inode; + struct minix_sb_info *sbi; + int ret = -EINVAL; + + sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + + BUILD_BUG_ON(32 != sizeof (struct minix_inode)); + BUILD_BUG_ON(64 != sizeof(struct minix2_inode)); + + if (!sb_set_blocksize(s, BLOCK_SIZE)) + goto out_bad_hblock; + + if (!(bh = sb_bread(s, 1))) + goto out_bad_sb; + + ms = (struct minix_super_block *) bh->b_data; + sbi->s_ms = ms; + sbi->s_sbh = bh; + sbi->s_mount_state = ms->s_state; + sbi->s_ninodes = ms->s_ninodes; + sbi->s_nzones = ms->s_nzones; + sbi->s_imap_blocks = ms->s_imap_blocks; + sbi->s_zmap_blocks = ms->s_zmap_blocks; + sbi->s_firstdatazone = ms->s_firstdatazone; + sbi->s_log_zone_size = ms->s_log_zone_size; + sbi->s_max_size = ms->s_max_size; + s->s_magic = ms->s_magic; + if (s->s_magic == MINIX_SUPER_MAGIC) { + sbi->s_version = MINIX_V1; + sbi->s_dirsize = 16; + sbi->s_namelen = 14; + sbi->s_link_max = MINIX_LINK_MAX; + } else if (s->s_magic == MINIX_SUPER_MAGIC2) { + sbi->s_version = MINIX_V1; + sbi->s_dirsize = 32; + sbi->s_namelen = 30; + sbi->s_link_max = MINIX_LINK_MAX; + } else if (s->s_magic == MINIX2_SUPER_MAGIC) { + sbi->s_version = MINIX_V2; + sbi->s_nzones = ms->s_zones; + sbi->s_dirsize = 16; + sbi->s_namelen = 14; + sbi->s_link_max = MINIX2_LINK_MAX; + } else if (s->s_magic == MINIX2_SUPER_MAGIC2) { + sbi->s_version = MINIX_V2; + sbi->s_nzones = ms->s_zones; + sbi->s_dirsize = 32; + sbi->s_namelen = 30; + sbi->s_link_max = MINIX2_LINK_MAX; + } else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) { + m3s = (struct minix3_super_block *) bh->b_data; + s->s_magic = m3s->s_magic; + sbi->s_imap_blocks = m3s->s_imap_blocks; + sbi->s_zmap_blocks = m3s->s_zmap_blocks; + sbi->s_firstdatazone = m3s->s_firstdatazone; + sbi->s_log_zone_size = m3s->s_log_zone_size; + sbi->s_max_size = m3s->s_max_size; + 
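[The magic-number dispatch above turns the superblock read from 1 KB block 1 (byte offset 1024) into per-version parameters. A standalone userspace probe doing the same classification can be handy for checking an image before mounting; the magic values and the byte offset 16 of s_magic come from <linux/minix_fs.h> and the classic V1/V2 superblock layout, so treat them as assumptions of this sketch, while the offset 24 for the V3 magic matches the test above. Little-endian host assumed.]

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MINIX_SUPER_MAGIC    0x137F     /* V1, 14-char names */
#define MINIX_SUPER_MAGIC2   0x138F     /* V1, 30-char names */
#define MINIX2_SUPER_MAGIC   0x2468     /* V2, 14-char names */
#define MINIX2_SUPER_MAGIC2  0x2478     /* V2, 30-char names */
#define MINIX3_SUPER_MAGIC   0x4d5a     /* V3 */

/* 'sb' is the 1024-byte block at byte offset 1024 of the device,
 * i.e. what sb_bread(s, 1) returns in the code above. */
static const char *minix_flavour(const unsigned char *sb)
{
        uint16_t magic, magic3;

        memcpy(&magic, sb + 16, sizeof(magic));   /* s_magic, V1/V2 layout */
        memcpy(&magic3, sb + 24, sizeof(magic3)); /* where fill_super checks V3 */

        switch (magic) {
        case MINIX_SUPER_MAGIC:   return "V1, 14-char names";
        case MINIX_SUPER_MAGIC2:  return "V1, 30-char names";
        case MINIX2_SUPER_MAGIC:  return "V2, 14-char names";
        case MINIX2_SUPER_MAGIC2: return "V2, 30-char names";
        }
        return magic3 == MINIX3_SUPER_MAGIC ? "V3" : "not a Minix superblock";
}

int main(void)
{
        unsigned char sb[1024] = { 0 };

        sb[24] = 0x5a;          /* fake a V3 magic for the demo */
        sb[25] = 0x4d;
        puts(minix_flavour(sb));
        return 0;
}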
sbi->s_ninodes = m3s->s_ninodes; + sbi->s_nzones = m3s->s_zones; + sbi->s_dirsize = 64; + sbi->s_namelen = 60; + sbi->s_version = MINIX_V3; + sbi->s_link_max = MINIX2_LINK_MAX; + sbi->s_mount_state = MINIX_VALID_FS; + sb_set_blocksize(s, m3s->s_blocksize); + } else + goto out_no_fs; + + /* + * Allocate the buffer map to keep the superblock small. + */ + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + goto out_illegal_sb; + i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); + map = kzalloc(i, GFP_KERNEL); + if (!map) + goto out_no_map; + sbi->s_imap = &map[0]; + sbi->s_zmap = &map[sbi->s_imap_blocks]; + + block=2; + for (i=0 ; i < sbi->s_imap_blocks ; i++) { + if (!(sbi->s_imap[i]=sb_bread(s, block))) + goto out_no_bitmap; + block++; + } + for (i=0 ; i < sbi->s_zmap_blocks ; i++) { + if (!(sbi->s_zmap[i]=sb_bread(s, block))) + goto out_no_bitmap; + block++; + } + + minix_set_bit(0,sbi->s_imap[0]->b_data); + minix_set_bit(0,sbi->s_zmap[0]->b_data); + + /* set up enough so that it can read an inode */ + s->s_op = &minix_sops; + root_inode = minix_iget(s, MINIX_ROOT_INO); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_no_root; + } + + ret = -ENOMEM; + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_iput; + + if (!(s->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + ms->s_state &= ~MINIX_VALID_FS; + mark_buffer_dirty(bh); + } + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs: mounting unchecked file system, " + "running fsck is recommended\n"); + else if (sbi->s_mount_state & MINIX_ERROR_FS) + printk("MINIX-fs: mounting file system with errors, " + "running fsck is recommended\n"); + return 0; + +out_iput: + iput(root_inode); + goto out_freemap; + +out_no_root: + if (!silent) + printk("MINIX-fs: get root inode failed\n"); + goto out_freemap; + +out_no_bitmap: + printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); +out_freemap: + for (i = 0; i < sbi->s_imap_blocks; i++) + brelse(sbi->s_imap[i]); + for (i = 0; i < sbi->s_zmap_blocks; i++) + brelse(sbi->s_zmap[i]); + kfree(sbi->s_imap); + goto out_release; + +out_no_map: + ret = -ENOMEM; + if (!silent) + printk("MINIX-fs: can't allocate map\n"); + goto out_release; + +out_illegal_sb: + if (!silent) + printk("MINIX-fs: bad superblock\n"); + goto out_release; + +out_no_fs: + if (!silent) + printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 " + "on device %s.\n", s->s_id); +out_release: + brelse(bh); + goto out; + +out_bad_hblock: + printk("MINIX-fs: blocksize too small for device\n"); + goto out; + +out_bad_sb: + printk("MINIX-fs: unable to read superblock\n"); +out: + s->s_fs_info = NULL; + kfree(sbi); + return ret; +} + +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct minix_sb_info *sbi = minix_sb(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; + buf->f_bfree = minix_count_free_blocks(sbi); + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_ninodes; + buf->f_ffree = minix_count_free_inodes(sbi); + buf->f_namelen = sbi->s_namelen; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int minix_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (INODE_VERSION(inode) == MINIX_V1) + return 
V1_minix_get_block(inode, block, bh_result, create); + else + return V2_minix_get_block(inode, block, bh_result, create); +} + +static int minix_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, minix_get_block, wbc); +} + +static int minix_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,minix_get_block); +} + +int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, minix_get_block); +} + +static int minix_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + minix_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t minix_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,minix_get_block); +} + +static const struct address_space_operations minix_aops = { + .readpage = minix_readpage, + .writepage = minix_writepage, + .write_begin = minix_write_begin, + .write_end = generic_write_end, + .bmap = minix_bmap +}; + +static const struct inode_operations minix_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = minix_getattr, +}; + +void minix_set_inode(struct inode *inode, dev_t rdev) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &minix_file_inode_operations; + inode->i_fop = &minix_file_operations; + inode->i_mapping->a_ops = &minix_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &minix_dir_inode_operations; + inode->i_fop = &minix_dir_operations; + inode->i_mapping->a_ops = &minix_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &minix_symlink_inode_operations; + inode->i_mapping->a_ops = &minix_aops; + } else + init_special_inode(inode, inode->i_mode, rdev); +} + +/* + * The minix V1 function to read an inode. + */ +static struct inode *V1_minix_iget(struct inode *inode) +{ + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + } + inode->i_mode = raw_inode->i_mode; + inode->i_uid = (uid_t)raw_inode->i_uid; + inode->i_gid = (gid_t)raw_inode->i_gid; + inode->i_nlink = raw_inode->i_nlinks; + inode->i_size = raw_inode->i_size; + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = 0; + for (i = 0; i < 9; i++) + minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; + minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +/* + * The minix V2 function to read an inode. 
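[For reference while reading the two iget routines: the raw inode sizes asserted by the BUILD_BUG_ON()s in minix_fill_super() (32 and 64 bytes) correspond to the layouts below. Field order is taken from <linux/minix_fs.h>, so read it as an assumption of this note; what the code above shows directly is that V1 keeps one timestamp and nine zone pointers, while V2 (and, per the version checks, V3 as well) keeps three timestamps and ten.]

#include <stdint.h>

struct raw_minix_inode_v1 {     /* 32 bytes */
        uint16_t i_mode;
        uint16_t i_uid;
        uint32_t i_size;
        uint32_t i_time;        /* single timestamp, copied to a/c/mtime above */
        uint8_t  i_gid;
        uint8_t  i_nlinks;
        uint16_t i_zone[9];     /* 7 direct + single + double indirect */
};

struct raw_minix_inode_v2 {     /* 64 bytes, shared by V2 and V3 */
        uint16_t i_mode;
        uint16_t i_nlinks;
        uint16_t i_uid;
        uint16_t i_gid;
        uint32_t i_size;
        uint32_t i_atime;
        uint32_t i_mtime;
        uint32_t i_ctime;
        uint32_t i_zone[10];    /* 7 direct + single + double + triple */
};

[The 9 versus 10 zone slots line up with DEPTH = 3 in itree_v1.c and DEPTH = 4 in itree_v2.c later in this patch.]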
+ */ +static struct inode *V2_minix_iget(struct inode *inode) +{ + struct buffer_head * bh; + struct minix2_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + } + inode->i_mode = raw_inode->i_mode; + inode->i_uid = (uid_t)raw_inode->i_uid; + inode->i_gid = (gid_t)raw_inode->i_gid; + inode->i_nlink = raw_inode->i_nlinks; + inode->i_size = raw_inode->i_size; + inode->i_mtime.tv_sec = raw_inode->i_mtime; + inode->i_atime.tv_sec = raw_inode->i_atime; + inode->i_ctime.tv_sec = raw_inode->i_ctime; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = 0; + for (i = 0; i < 10; i++) + minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; + minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +/* + * The global function to read an inode. + */ +struct inode *minix_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if (INODE_VERSION(inode) == MINIX_V1) + return V1_minix_iget(inode); + else + return V2_minix_iget(inode); +} + +/* + * The minix V1 function to synchronize an inode. + */ +static struct buffer_head * V1_minix_update_inode(struct inode * inode) +{ + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) + return NULL; + raw_inode->i_mode = inode->i_mode; + raw_inode->i_uid = fs_high2lowuid(inode->i_uid); + raw_inode->i_gid = fs_high2lowgid(inode->i_gid); + raw_inode->i_nlinks = inode->i_nlink; + raw_inode->i_size = inode->i_size; + raw_inode->i_time = inode->i_mtime.tv_sec; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); + else for (i = 0; i < 9; i++) + raw_inode->i_zone[i] = minix_inode->u.i1_data[i]; + mark_buffer_dirty(bh); + return bh; +} + +/* + * The minix V2 function to synchronize an inode. 
+ */ +static struct buffer_head * V2_minix_update_inode(struct inode * inode) +{ + struct buffer_head * bh; + struct minix2_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) + return NULL; + raw_inode->i_mode = inode->i_mode; + raw_inode->i_uid = fs_high2lowuid(inode->i_uid); + raw_inode->i_gid = fs_high2lowgid(inode->i_gid); + raw_inode->i_nlinks = inode->i_nlink; + raw_inode->i_size = inode->i_size; + raw_inode->i_mtime = inode->i_mtime.tv_sec; + raw_inode->i_atime = inode->i_atime.tv_sec; + raw_inode->i_ctime = inode->i_ctime.tv_sec; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); + else for (i = 0; i < 10; i++) + raw_inode->i_zone[i] = minix_inode->u.i2_data[i]; + mark_buffer_dirty(bh); + return bh; +} + +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err = 0; + struct buffer_head *bh; + + if (INODE_VERSION(inode) == MINIX_V1) + bh = V1_minix_update_inode(inode); + else + bh = V2_minix_update_inode(inode); + if (!bh) + return -EIO; + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk("IO error syncing minix inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -EIO; + } + } + brelse (bh); + return err; +} + +int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + generic_fillattr(dentry->d_inode, stat); + if (INODE_VERSION(dentry->d_inode) == MINIX_V1) + stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); + else + stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); + stat->blksize = sb->s_blocksize; + return 0; +} + +/* + * The function that is called for file truncation. 
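[minix_getattr() above derives stat->blocks from V1_minix_blocks()/V2_minix_blocks(), which wrap nblocks() in itree_common.c further down: the count is data blocks plus the indirect blocks needed to reach them, scaled to 512-byte sectors. A standalone restatement of the V1 arithmetic, assuming 1 KB blocks, 7 direct zones and 512 16-bit pointers per indirect block (the itree_v1.c parameters):]

#include <stdio.h>

static unsigned v1_nblocks(unsigned long long size)
{
        unsigned blocks = (size + 1023) / 1024;         /* data blocks */
        unsigned res = blocks, direct = 7, depth = 3, ptrs = 512;

        while (--depth && blocks > direct) {
                blocks -= direct;
                blocks = (blocks + ptrs - 1) / ptrs;    /* indirect blocks added */
                res += blocks;
                direct = 1;
        }
        return res;
}

int main(void)
{
        /* A 100000-byte file needs 98 data blocks plus 1 single-indirect
         * block: 99 in total, which minix_getattr() reports as
         * 99 * (1024 / 512) = 198 sectors. */
        printf("%u\n", v1_nblocks(100000));
        return 0;
}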
+ */ +void minix_truncate(struct inode * inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + return; + if (INODE_VERSION(inode) == MINIX_V1) + V1_minix_truncate(inode); + else + V2_minix_truncate(inode); +} + +static struct dentry *minix_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super); +} + +static struct file_system_type minix_fs_type = { + .owner = THIS_MODULE, + .name = "minix", + .mount = minix_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_minix_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&minix_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_minix_fs(void) +{ + unregister_filesystem(&minix_fs_type); + destroy_inodecache(); +} + +module_init(init_minix_fs) +module_exit(exit_minix_fs) +MODULE_LICENSE("GPL"); + diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c new file mode 100644 index 00000000..a731cabf --- /dev/null +++ b/fs/minix/itree_common.c @@ -0,0 +1,364 @@ +/* Generic part */ + +typedef struct { + block_t *p; + block_t key; + struct buffer_head *bh; +} Indirect; + +static DEFINE_RWLOCK(pointers_lock); + +static inline void add_chain(Indirect *p, struct buffer_head *bh, block_t *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +static inline block_t *block_end(struct buffer_head *bh) +{ + return (block_t *)((char*)bh->b_data + bh->b_size); +} + +static inline Indirect *get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[DEPTH], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, i_data(inode) + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, block_to_cpu(p->key)); + if (!bh) + goto failure; + read_lock(&pointers_lock); + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (block_t *)bh->b_data + *++offsets); + read_unlock(&pointers_lock); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + read_unlock(&pointers_lock); + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +static int alloc_branch(struct inode *inode, + int num, + int *offsets, + Indirect *branch) +{ + int n = 0; + int i; + int parent = minix_new_block(inode); + + branch[0].key = cpu_to_block(parent); + if (parent) for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = minix_new_block(inode); + if (!nr) + break; + branch[n].key = cpu_to_block(nr); + bh = sb_getblk(inode->i_sb, parent); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + branch[n].bh = bh; + branch[n].p = (block_t*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + parent = nr; + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + minix_free_block(inode, block_to_cpu(branch[i].key)); + return -ENOSPC; +} + +static inline int 
splice_branch(struct inode *inode, + Indirect chain[DEPTH], + Indirect *where, + int num) +{ + int i; + + write_lock(&pointers_lock); + + /* Verify that place we are splicing to is still there and vacant */ + if (!verify_chain(chain, where-1) || *where->p) + goto changed; + + *where->p = where->key; + + write_unlock(&pointers_lock); + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) + mark_buffer_dirty_inode(where->bh, inode); + + mark_inode_dirty(inode); + return 0; + +changed: + write_unlock(&pointers_lock); + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + minix_free_block(inode, block_to_cpu(where[i].key)); + return -EAGAIN; +} + +static inline int get_block(struct inode * inode, sector_t block, + struct buffer_head *bh, int create) +{ + int err = -EIO; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + int left; + int depth = block_to_path(inode, block, offsets); + + if (depth == 0) + goto out; + +reread: + partial = get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + map_bh(bh, inode->i_sb, block_to_cpu(chain[depth-1].key)); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. 
+ */ + if (err == -EAGAIN) + goto changed; + + left = (chain + depth) - partial; + err = alloc_branch(inode, left, offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (splice_branch(inode, chain, partial, left) < 0) + goto changed; + + set_buffer_new(bh); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static inline int all_zeroes(block_t *p, block_t *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +static Indirect *find_shared(struct inode *inode, + int depth, + int offsets[DEPTH], + Indirect chain[DEPTH], + block_t *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = get_branch(inode, k, offsets, chain, &err); + + write_lock(&pointers_lock); + if (!partial) + partial = chain + k-1; + if (!partial->key && *partial->p) { + write_unlock(&pointers_lock); + goto no_top; + } + for (p=partial;p>chain && all_zeroes((block_t*)p->bh->b_data,p->p);p--) + ; + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&pointers_lock); + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +static inline void free_data(struct inode *inode, block_t *p, block_t *q) +{ + unsigned long nr; + + for ( ; p < q ; p++) { + nr = block_to_cpu(*p); + if (nr) { + *p = 0; + minix_free_block(inode, nr); + } + } +} + +static void free_branches(struct inode *inode, block_t *p, block_t *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + for ( ; p < q ; p++) { + nr = block_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = sb_bread(inode->i_sb, nr); + if (!bh) + continue; + free_branches(inode, (block_t*)bh->b_data, + block_end(bh), depth); + bforget(bh); + minix_free_block(inode, nr); + mark_inode_dirty(inode); + } + } else + free_data(inode, p, q); +} + +static inline void truncate (struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + block_t *idata = i_data(inode); + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + block_t nr = 0; + int n; + int first_whole; + long iblock; + + iblock = (inode->i_size + sb->s_blocksize -1) >> sb->s_blocksize_bits; + block_truncate_page(inode->i_mapping, inode->i_size, get_block); + + n = block_to_path(inode, iblock, offsets); + if (!n) + return; + + if (n == 1) { + free_data(inode, idata+offsets[0], idata + DIRECT); + first_whole = 0; + goto do_indirects; + } + + first_whole = offsets[0] + 1 - DIRECT; + partial = find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + free_branches(inode, partial->p + 1, block_end(partial->bh), + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + while (first_whole < DEPTH-1) { + nr = idata[DIRECT+first_whole]; + if (nr) { + idata[DIRECT+first_whole] = 0; + mark_inode_dirty(inode); + free_branches(inode, &nr, &nr+1, first_whole+1); + } + first_whole++; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static inline unsigned nblocks(loff_t size, struct super_block *sb) +{ + int k = sb->s_blocksize_bits - 10; + 
unsigned blocks, res, direct = DIRECT, i = DEPTH; + blocks = (size + sb->s_blocksize - 1) >> (BLOCK_SIZE_BITS + k); + res = blocks; + while (--i && blocks > direct) { + blocks -= direct; + blocks += sb->s_blocksize/sizeof(block_t) - 1; + blocks /= sb->s_blocksize/sizeof(block_t); + res += blocks; + direct = 1; + } + return res; +} diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c new file mode 100644 index 00000000..282e15ad --- /dev/null +++ b/fs/minix/itree_v1.c @@ -0,0 +1,67 @@ +#include +#include +#include "minix.h" + +enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ + +typedef u16 block_t; /* 16 bit, host order */ + +static inline unsigned long block_to_cpu(block_t n) +{ + return n; +} + +static inline block_t cpu_to_block(unsigned long n) +{ + return n; +} + +static inline block_t *i_data(struct inode *inode) +{ + return (block_t *)minix_i(inode)->u.i1_data; +} + +static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) +{ + int n = 0; + char b[BDEVNAME_SIZE]; + + if (block < 0) { + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", + block, bdevname(inode->i_sb->s_bdev, b)); + } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { + if (printk_ratelimit()) + printk("MINIX-fs: block_to_path: " + "block %ld too big on dev %s\n", + block, bdevname(inode->i_sb->s_bdev, b)); + } else if (block < 7) { + offsets[n++] = block; + } else if ((block -= 7) < 512) { + offsets[n++] = 7; + offsets[n++] = block; + } else { + block -= 512; + offsets[n++] = 8; + offsets[n++] = block>>9; + offsets[n++] = block & 511; + } + return n; +} + +#include "itree_common.c" + +int V1_minix_get_block(struct inode * inode, long block, + struct buffer_head *bh_result, int create) +{ + return get_block(inode, block, bh_result, create); +} + +void V1_minix_truncate(struct inode * inode) +{ + truncate(inode); +} + +unsigned V1_minix_blocks(loff_t size, struct super_block *sb) +{ + return nblocks(size, sb); +} diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c new file mode 100644 index 00000000..13487ad1 --- /dev/null +++ b/fs/minix/itree_v2.c @@ -0,0 +1,75 @@ +#include +#include "minix.h" + +enum {DIRECT = 7, DEPTH = 4}; /* Have triple indirect */ + +typedef u32 block_t; /* 32 bit, host order */ + +static inline unsigned long block_to_cpu(block_t n) +{ + return n; +} + +static inline block_t cpu_to_block(unsigned long n) +{ + return n; +} + +static inline block_t *i_data(struct inode *inode) +{ + return (block_t *)minix_i(inode)->u.i2_data; +} + +#define DIRCOUNT 7 +#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2)) + +static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) +{ + int n = 0; + char b[BDEVNAME_SIZE]; + struct super_block *sb = inode->i_sb; + + if (block < 0) { + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", + block, bdevname(sb->s_bdev, b)); + } else if (block >= (minix_sb(inode->i_sb)->s_max_size/sb->s_blocksize)) { + if (printk_ratelimit()) + printk("MINIX-fs: block_to_path: " + "block %ld too big on dev %s\n", + block, bdevname(sb->s_bdev, b)); + } else if (block < DIRCOUNT) { + offsets[n++] = block; + } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT; + offsets[n++] = block; + } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT + 1; + offsets[n++] = block / INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); + } else { + block -= INDIRCOUNT(sb) * INDIRCOUNT(sb); + offsets[n++] = DIRCOUNT + 2; + offsets[n++] = 
(block / INDIRCOUNT(sb)) / INDIRCOUNT(sb); + offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); + } + return n; +} + +#include "itree_common.c" + +int V2_minix_get_block(struct inode * inode, long block, + struct buffer_head *bh_result, int create) +{ + return get_block(inode, block, bh_result, create); +} + +void V2_minix_truncate(struct inode * inode) +{ + truncate(inode); +} + +unsigned V2_minix_blocks(loff_t size, struct super_block *sb) +{ + return nblocks(size, sb); +} diff --git a/fs/minix/minix.h b/fs/minix/minix.h new file mode 100644 index 00000000..341e2122 --- /dev/null +++ b/fs/minix/minix.h @@ -0,0 +1,165 @@ +#ifndef FS_MINIX_H +#define FS_MINIX_H + +#include +#include +#include + +#define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version +#define MINIX_V1 0x0001 /* original minix fs */ +#define MINIX_V2 0x0002 /* minix V2 fs */ +#define MINIX_V3 0x0003 /* minix V3 fs */ + +/* + * minix fs inode data in memory + */ +struct minix_inode_info { + union { + __u16 i1_data[16]; + __u32 i2_data[16]; + } u; + struct inode vfs_inode; +}; + +/* + * minix super-block data in memory + */ +struct minix_sb_info { + unsigned long s_ninodes; + unsigned long s_nzones; + unsigned long s_imap_blocks; + unsigned long s_zmap_blocks; + unsigned long s_firstdatazone; + unsigned long s_log_zone_size; + unsigned long s_max_size; + int s_dirsize; + int s_namelen; + int s_link_max; + struct buffer_head ** s_imap; + struct buffer_head ** s_zmap; + struct buffer_head * s_sbh; + struct minix_super_block * s_ms; + unsigned short s_mount_state; + unsigned short s_version; +}; + +extern struct inode *minix_iget(struct super_block *, unsigned long); +extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); +extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); +extern struct inode * minix_new_inode(const struct inode *, int, int *); +extern void minix_free_inode(struct inode * inode); +extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); +extern int minix_new_block(struct inode * inode); +extern void minix_free_block(struct inode *inode, unsigned long block); +extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); +extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +extern void V1_minix_truncate(struct inode *); +extern void V2_minix_truncate(struct inode *); +extern void minix_truncate(struct inode *); +extern void minix_set_inode(struct inode *, dev_t); +extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); +extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); +extern unsigned V1_minix_blocks(loff_t, struct super_block *); +extern unsigned V2_minix_blocks(loff_t, struct super_block *); + +extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); +extern int minix_add_link(struct dentry*, struct inode*); +extern int minix_delete_entry(struct minix_dir_entry*, struct page*); +extern int minix_make_empty(struct inode*, struct inode*); +extern int minix_empty_dir(struct inode*); +extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); +extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); +extern ino_t minix_inode_by_name(struct dentry*); + +extern const struct inode_operations minix_file_inode_operations; 
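[Looking back at block_to_path() in itree_v2.c above: with a 1 KB block size, INDIRCOUNT(sb) is 1024 / 4 = 256, so a file block is reached through at most 7 direct slots, then a 256-entry single indirect, double indirect and triple indirect tree, capped by the superblock's s_max_size. A standalone sketch of the same mapping under that 1 KB assumption:]

#include <stdio.h>

#define DIRCOUNT        7
#define INDIR           256     /* 1 KB blocks, 32-bit zone pointers */

static int v2_block_to_path(long block, int offsets[4])
{
        int n = 0;

        if (block < DIRCOUNT) {
                offsets[n++] = block;
        } else if ((block -= DIRCOUNT) < INDIR) {
                offsets[n++] = DIRCOUNT;
                offsets[n++] = block;
        } else if ((block -= INDIR) < INDIR * INDIR) {
                offsets[n++] = DIRCOUNT + 1;
                offsets[n++] = block / INDIR;
                offsets[n++] = block % INDIR;
        } else {
                block -= INDIR * INDIR;
                offsets[n++] = DIRCOUNT + 2;
                offsets[n++] = (block / INDIR) / INDIR;
                offsets[n++] = (block / INDIR) % INDIR;
                offsets[n++] = block % INDIR;
        }
        return n;
}

int main(void)
{
        long blocks[] = { 3, 200, 70000 };
        int offsets[4], i, j, n;

        for (i = 0; i < 3; i++) {
                n = v2_block_to_path(blocks[i], offsets);
                printf("file block %ld ->", blocks[i]);
                for (j = 0; j < n; j++)
                        printf(" %d", offsets[j]);
                printf("\n");
        }
        return 0;
}

[The first offset indexes i_zone[] (slots 0-6 direct, 7 single, 8 double, 9 triple indirect); the remaining offsets index successive indirect blocks, which is exactly how get_branch() consumes the array.]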
+extern const struct inode_operations minix_dir_inode_operations; +extern const struct file_operations minix_file_operations; +extern const struct file_operations minix_dir_operations; + +static inline struct minix_sb_info *minix_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct minix_inode_info *minix_i(struct inode *inode) +{ + return list_entry(inode, struct minix_inode_info, vfs_inode); +} + +#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ + defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) + +#error Minix file system byte order broken + +#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) + +/* + * big-endian 32 or 64 bit indexed bitmaps on big-endian system or + * little-endian bitmaps on little-endian system + */ + +#define minix_test_and_set_bit(nr, addr) \ + __test_and_set_bit((nr), (unsigned long *)(addr)) +#define minix_set_bit(nr, addr) \ + __set_bit((nr), (unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr, addr) \ + __test_and_clear_bit((nr), (unsigned long *)(addr)) +#define minix_test_bit(nr, addr) \ + test_bit((nr), (unsigned long *)(addr)) +#define minix_find_first_zero_bit(addr, size) \ + find_first_zero_bit((unsigned long *)(addr), (size)) + +#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) + +/* + * big-endian 16bit indexed bitmaps + */ + +static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) +{ + const unsigned short *p = vaddr, *addr = vaddr; + unsigned short num; + + if (!size) + return 0; + + size = (size >> 4) + ((size & 15) > 0); + while (*p++ == 0xffff) { + if (--size == 0) + return (p - addr) << 4; + } + + num = *--p; + return ((p - addr) << 4) + ffz(num); +} + +#define minix_test_and_set_bit(nr, addr) \ + __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr)) +#define minix_set_bit(nr, addr) \ + __set_bit((nr) ^ 16, (unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr, addr) \ + __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr)) + +static inline int minix_test_bit(int nr, const void *vaddr) +{ + const unsigned short *p = vaddr; + return (p[nr >> 4] & (1U << (nr & 15))) != 0; +} + +#else + +/* + * little-endian bitmaps + */ + +#define minix_test_and_set_bit __test_and_set_bit_le +#define minix_set_bit __set_bit_le +#define minix_test_and_clear_bit __test_and_clear_bit_le +#define minix_test_bit test_bit_le +#define minix_find_first_zero_bit find_first_zero_bit_le + +#endif + +#endif /* FS_MINIX_H */ diff --git a/fs/minix/namei.c b/fs/minix/namei.c new file mode 100644 index 00000000..6e6777f1 --- /dev/null +++ b/fs/minix/namei.c @@ -0,0 +1,269 @@ +/* + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include "minix.h" + +static int add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = minix_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) + return ERR_PTR(-ENAMETOOLONG); + + ino = minix_inode_by_name(dentry); + if (ino) { + inode = minix_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + int error; + struct inode *inode; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + 
inode = minix_new_inode(dir, mode, &error); + + if (inode) { + minix_set_inode(inode, rdev); + mark_inode_dirty(inode); + error = add_nondir(dentry, inode); + } + return error; +} + +static int minix_create(struct inode * dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return minix_mknod(dir, dentry, mode, 0); +} + +static int minix_symlink(struct inode * dir, struct dentry *dentry, + const char * symname) +{ + int err = -ENAMETOOLONG; + int i = strlen(symname)+1; + struct inode * inode; + + if (i > dir->i_sb->s_blocksize) + goto out; + + inode = minix_new_inode(dir, S_IFLNK | 0777, &err); + if (!inode) + goto out; + + minix_set_inode(inode, 0); + err = page_symlink(inode, symname, i); + if (err) + goto out_fail; + + err = add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int minix_link(struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= minix_sb(inode->i_sb)->s_link_max) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + return add_nondir(dentry, inode); +} + +static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= minix_sb(dir->i_sb)->s_link_max) + goto out; + + inode_inc_link_count(dir); + + inode = minix_new_inode(dir, S_IFDIR | mode, &err); + if (!inode) + goto out_dir; + + minix_set_inode(inode, 0); + + inode_inc_link_count(inode); + + err = minix_make_empty(inode, dir); + if (err) + goto out_fail; + + err = minix_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int minix_unlink(struct inode * dir, struct dentry *dentry) +{ + int err = -ENOENT; + struct inode * inode = dentry->d_inode; + struct page * page; + struct minix_dir_entry * de; + + de = minix_find_entry(dentry, &page); + if (!de) + goto end_unlink; + + err = minix_delete_entry(de, page); + if (err) + goto end_unlink; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); +end_unlink: + return err; +} + +static int minix_rmdir(struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (minix_empty_dir(inode)) { + err = minix_unlink(dir, dentry); + if (!err) { + inode_dec_link_count(dir); + inode_dec_link_count(inode); + } + } + return err; +} + +static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir, struct dentry *new_dentry) +{ + struct minix_sb_info * info = minix_sb(old_dir->i_sb); + struct inode * old_inode = old_dentry->d_inode; + struct inode * new_inode = new_dentry->d_inode; + struct page * dir_page = NULL; + struct minix_dir_entry * dir_de = NULL; + struct page * old_page; + struct minix_dir_entry * old_de; + int err = -ENOENT; + + old_de = minix_find_entry(old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = minix_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page * new_page; + struct minix_dir_entry * new_de; + + err = -ENOTEMPTY; + if (dir_de && !minix_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = minix_find_entry(new_dentry, 
&new_page); + if (!new_de) + goto out_dir; + minix_set_link(new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= info->s_link_max) + goto out_dir; + } + err = minix_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + minix_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + minix_set_link(dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations minix_dir_inode_operations = { + .create = minix_create, + .lookup = minix_lookup, + .link = minix_link, + .unlink = minix_unlink, + .symlink = minix_symlink, + .mkdir = minix_mkdir, + .rmdir = minix_rmdir, + .mknod = minix_mknod, + .rename = minix_rename, + .getattr = minix_getattr, +}; diff --git a/fs/mpage.c b/fs/mpage.c new file mode 100644 index 00000000..fdfae9fa --- /dev/null +++ b/fs/mpage.c @@ -0,0 +1,718 @@ +/* + * fs/mpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * Contains functions related to preparing and submitting BIOs which contain + * multiple pagecache pages. + * + * 15May2002 Andrew Morton + * Initial version + * 27Jun2002 axboe@suse.de + * use bio_add_page() to build bio's just the right size + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. 
+ */ +static void mpage_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); + } + } while (bvec >= bio->bi_io_vec); + bio_put(bio); +} + +static struct bio *mpage_bio_submit(int rw, struct bio *bio) +{ + bio->bi_end_io = mpage_end_io; + submit_bio(rw, bio); + return NULL; +} + +static struct bio * +mpage_alloc(struct block_device *bdev, + sector_t first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +/* + * support function for mpage_readpages. The fs supplied get_block might + * return an up to date buffer. This is used to map that buffer into + * the page, which allows readpage to avoid triggering a duplicate call + * to get_block. + * + * The idea is to avoid adding buffers to pages that don't already have + * them. So when the buffer is up to date and the page size == block size, + * this marks the page up to date instead of adding new buffers. + */ +static void +map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bh, *head; + int block = 0; + + if (!page_has_buffers(page)) { + /* + * don't make any buffers if there is only one buffer on + * the page and the page just needs to be set up to date + */ + if (inode->i_blkbits == PAGE_CACHE_SHIFT && + buffer_uptodate(bh)) { + SetPageUptodate(page); + return; + } + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + } + head = page_buffers(page); + page_bh = head; + do { + if (block == page_block) { + page_bh->b_state = bh->b_state; + page_bh->b_bdev = bh->b_bdev; + page_bh->b_blocknr = bh->b_blocknr; + break; + } + page_bh = page_bh->b_this_page; + block++; + } while (page_bh != head); +} + +/* + * This is the worker routine which does all the work of mapping the disk + * blocks and constructs largest possible bios, submits them for IO if the + * blocks are not contiguous on the disk. + * + * We pass a buffer_head back and forth and use its buffer_mapped() flag to + * represent the validity of its disk mapping and to decide when to do the next + * get_block() call. 
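[For orientation: a filesystem uses this machinery by handing its get_block to the mpage_readpage()/mpage_readpages() helpers exported further down. A hedged wiring sketch for a hypothetical "myfs"; myfs_get_block is assumed to be that filesystem's get_block_t and is not defined in this patch:]

static int myfs_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, myfs_get_block);
}

static int myfs_readpages(struct file *file, struct address_space *mapping,
                          struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
        .readpage       = myfs_readpage,
        .readpages      = myfs_readpages,
        /* write-side hooks omitted in this sketch */
};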
+ */ +static struct bio * +do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, + sector_t *last_block_in_bio, struct buffer_head *map_bh, + unsigned long *first_logical_block, get_block_t get_block) +{ + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + unsigned first_hole = blocks_per_page; + struct block_device *bdev = NULL; + int length; + int fully_mapped = 1; + unsigned nblocks; + unsigned relative_block; + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the result from the previous get_blocks call first. + */ + nblocks = map_bh->b_size >> blkbits; + if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && + block_in_file < (*first_logical_block + nblocks)) { + unsigned map_offset = block_in_file - *first_logical_block; + unsigned last = nblocks - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + clear_buffer_mapped(map_bh); + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map_bh->b_blocknr + map_offset + + relative_block; + page_block++; + block_in_file++; + } + bdev = map_bh->b_bdev; + } + + /* + * Then do more get_blocks calls until we are done with this page. + */ + map_bh->b_page = page; + while (page_block < blocks_per_page) { + map_bh->b_state = 0; + map_bh->b_size = 0; + + if (block_in_file < last_block) { + map_bh->b_size = (last_block-block_in_file) << blkbits; + if (get_block(inode, block_in_file, map_bh, 0)) + goto confused; + *first_logical_block = block_in_file; + } + + if (!buffer_mapped(map_bh)) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + + /* some filesystems will copy data into the page during + * the get_block call, in which case we don't want to + * read it again. map_buffer_to_page copies the data + * we just collected from get_block into the page's buffers + * so readpage doesn't have to repeat the get_block call + */ + if (buffer_uptodate(map_bh)) { + map_buffer_to_page(page, map_bh, page_block); + goto confused; + } + + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? 
*/ + if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) + goto confused; + nblocks = map_bh->b_size >> blkbits; + for (relative_block = 0; ; relative_block++) { + if (relative_block == nblocks) { + clear_buffer_mapped(map_bh); + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map_bh->b_blocknr+relative_block; + page_block++; + block_in_file++; + } + bdev = map_bh->b_bdev; + } + + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this BIO off first? + */ + if (bio && (*last_block_in_bio != blocks[0] - 1)) + bio = mpage_bio_submit(READ, bio); + +alloc_new: + if (bio == NULL) { + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + min_t(int, nr_pages, bio_get_nr_vecs(bdev)), + GFP_KERNEL); + if (bio == NULL) + goto confused; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) { + bio = mpage_bio_submit(READ, bio); + goto alloc_new; + } + + relative_block = block_in_file - *first_logical_block; + nblocks = map_bh->b_size >> blkbits; + if ((buffer_boundary(map_bh) && relative_block == nblocks) || + (first_hole != blocks_per_page)) + bio = mpage_bio_submit(READ, bio); + else + *last_block_in_bio = blocks[blocks_per_page - 1]; +out: + return bio; + +confused: + if (bio) + bio = mpage_bio_submit(READ, bio); + if (!PageUptodate(page)) + block_read_full_page(page, get_block); + else + unlock_page(page); + goto out; +} + +/** + * mpage_readpages - populate an address space with some pages & start reads against them + * @mapping: the address_space + * @pages: The address of a list_head which contains the target pages. These + * pages have their ->index populated and are otherwise uninitialised. + * The page at @pages->prev has the lowest file offset, and reads should be + * issued in @pages->prev to @pages->next order. + * @nr_pages: The number of pages at *@pages + * @get_block: The filesystem's block mapper function. + * + * This function walks the pages and the blocks within each page, building and + * emitting large BIOs. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * + * BH_Boundary explanation: + * + * There is a problem. The mpage read code assembles several pages, gets all + * their disk mappings, and then submits them all. That's fine, but obtaining + * the disk mappings may require I/O. Reads of indirect blocks, for example. + * + * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be + * submitted in the following order: + * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 + * + * because the indirect block has to be read to get the mappings of blocks + * 13,14,15,16. Obviously, this impacts performance. 
+ * + * So what we do it to allow the filesystem's get_block() function to set + * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block + * after this one will require I/O against a block which is probably close to + * this one. So you should push what I/O you have currently accumulated. + * + * This all causes the disk requests to be issued in the correct order. + */ +int +mpage_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages, get_block_t get_block) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + struct buffer_head map_bh; + unsigned long first_logical_block = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + map_bh.b_state = 0; + map_bh.b_size = 0; + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { + bio = do_mpage_readpage(bio, page, + nr_pages - page_idx, + &last_block_in_bio, &map_bh, + &first_logical_block, + get_block); + } + page_cache_release(page); + } + BUG_ON(!list_empty(pages)); + if (bio) + mpage_bio_submit(READ, bio); + blk_finish_plug(&plug); + return 0; +} +EXPORT_SYMBOL(mpage_readpages); + +/* + * This isn't called much at all + */ +int mpage_readpage(struct page *page, get_block_t get_block) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct buffer_head map_bh; + unsigned long first_logical_block = 0; + + map_bh.b_state = 0; + map_bh.b_size = 0; + bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, + &map_bh, &first_logical_block, get_block); + if (bio) + mpage_bio_submit(READ, bio); + return 0; +} +EXPORT_SYMBOL(mpage_readpage); + +/* + * Writing is not so simple. + * + * If the page has buffers then they will be used for obtaining the disk + * mapping. We only support pages which are fully mapped-and-dirty, with a + * special case for pages which are unmapped at the end: end-of-file. + * + * If the page has no buffers (preferred) then the page is mapped here. + * + * If all blocks are found to be contiguous then the page can go into the + * BIO. Otherwise fall back to the mapping's writepage(). + * + * FIXME: This code wants an estimate of how many pages are still to be + * written, so it can intelligently allocate a suitably-sized BIO. For now, + * just allocate full-size (16-page) BIOs. 
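[Tying the BH_Boundary discussion above to a get_block: the filesystem marks the buffer for the last block before a metadata read, so the BIO accumulated so far gets submitted. A toy sketch only, for a made-up layout in which one indirect block follows every 256 data blocks; set_buffer_boundary() is the usual buffer-flag helper and is an assumption of this sketch rather than something defined in this patch:]

#define TOY_DATA_START          1024    /* first data block of the toy layout */
#define TOY_PTRS_PER_BLOCK      256

static int toy_get_block(struct inode *inode, sector_t block,
                         struct buffer_head *bh_result, int create)
{
        map_bh(bh_result, inode->i_sb, TOY_DATA_START + block);
        if ((block + 1) % TOY_PTRS_PER_BLOCK == 0)
                /* the next mapping would need metadata I/O: push what we have */
                set_buffer_boundary(bh_result);
        return 0;
}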
+ */ + +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct mpage_data *mpd = data; + struct bio *bio = mpd->bio; + struct address_space *mapping = page->mapping; + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + unsigned long end_index; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + sector_t last_block; + sector_t block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + unsigned first_unmapped = blocks_per_page; + struct block_device *bdev = NULL; + int boundary = 0; + sector_t boundary_block = 0; + struct block_device *boundary_bdev = NULL; + int length; + struct buffer_head map_bh; + loff_t i_size = i_size_read(inode); + int ret = 0; + + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + /* If they're all mapped and dirty, do it */ + page_block = 0; + do { + BUG_ON(buffer_locked(bh)); + if (!buffer_mapped(bh)) { + /* + * unmapped dirty buffers are created by + * __set_page_dirty_buffers -> mmapped data + */ + if (buffer_dirty(bh)) + goto confused; + if (first_unmapped == blocks_per_page) + first_unmapped = page_block; + continue; + } + + if (first_unmapped != blocks_per_page) + goto confused; /* hole -> non-hole */ + + if (!buffer_dirty(bh) || !buffer_uptodate(bh)) + goto confused; + if (page_block) { + if (bh->b_blocknr != blocks[page_block-1] + 1) + goto confused; + } + blocks[page_block++] = bh->b_blocknr; + boundary = buffer_boundary(bh); + if (boundary) { + boundary_block = bh->b_blocknr; + boundary_bdev = bh->b_bdev; + } + bdev = bh->b_bdev; + } while ((bh = bh->b_this_page) != head); + + if (first_unmapped) + goto page_is_mapped; + + /* + * Page has buffers, but they are all unmapped. The page was + * created by pagein or read over a hole which was handled by + * block_read_full_page(). If this address_space is also + * using mpage_readpages then this can rarely happen. + */ + goto confused; + } + + /* + * The page has no buffers: map it to disk + */ + BUG_ON(!PageUptodate(page)); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = (i_size - 1) >> blkbits; + map_bh.b_page = page; + for (page_block = 0; page_block < blocks_per_page; ) { + + map_bh.b_state = 0; + map_bh.b_size = 1 << blkbits; + if (mpd->get_block(inode, block_in_file, &map_bh, 1)) + goto confused; + if (buffer_new(&map_bh)) + unmap_underlying_metadata(map_bh.b_bdev, + map_bh.b_blocknr); + if (buffer_boundary(&map_bh)) { + boundary_block = map_bh.b_blocknr; + boundary_bdev = map_bh.b_bdev; + } + if (page_block) { + if (map_bh.b_blocknr != blocks[page_block-1] + 1) + goto confused; + } + blocks[page_block++] = map_bh.b_blocknr; + boundary = buffer_boundary(&map_bh); + bdev = map_bh.b_bdev; + if (block_in_file == last_block) + break; + block_in_file++; + } + BUG_ON(page_block == 0); + + first_unmapped = page_block; + +page_is_mapped: + end_index = i_size >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining memory + * is zeroed when mapped, and writes to that region are not + * written out to the file." 
+ */ + unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); + + if (page->index > end_index || !offset) + goto confused; + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + } + + /* + * This page will go to BIO. Do we need to send this BIO off first? + */ + if (bio && mpd->last_block_in_bio != blocks[0] - 1) + bio = mpage_bio_submit(WRITE, bio); + +alloc_new: + if (bio == NULL) { + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); + if (bio == NULL) + goto confused; + } + + /* + * Must try to add the page before marking the buffer clean or + * the confused fail path above (OOM) will be very confused when + * it finds all bh marked clean (i.e. it will not write anything) + */ + length = first_unmapped << blkbits; + if (bio_add_page(bio, page, length, 0) < length) { + bio = mpage_bio_submit(WRITE, bio); + goto alloc_new; + } + + /* + * OK, we have our BIO, so we can now mark the buffers clean. Make + * sure to only clean buffers which we know we'll be writing. + */ + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + unsigned buffer_counter = 0; + + do { + if (buffer_counter++ == first_unmapped) + break; + clear_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* + * we cannot drop the bh if the page is not uptodate + * or a concurrent readpage would fail to serialize with the bh + * and it would read from disk before we reach the platter. + */ + if (buffer_heads_over_limit && PageUptodate(page)) + try_to_free_buffers(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + if (boundary || (first_unmapped != blocks_per_page)) { + bio = mpage_bio_submit(WRITE, bio); + if (boundary_block) { + write_boundary_block(boundary_bdev, + boundary_block, 1 << blkbits); + } + } else { + mpd->last_block_in_bio = blocks[blocks_per_page - 1]; + } + goto out; + +confused: + if (bio) + bio = mpage_bio_submit(WRITE, bio); + + if (mpd->use_writepage) { + ret = mapping->a_ops->writepage(page, wbc); + } else { + ret = -EAGAIN; + goto out; + } + /* + * The caller has a ref on the inode, so *mapping is stable + */ + mapping_set_error(mapping, ret); +out: + mpd->bio = bio; + return ret; +} + +/** + * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * If this is NULL then use a_ops->writepage. Otherwise, go + * direct-to-BIO. + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
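Continuing the hypothetical "myfs" sketch from the read side, the corresponding write-side wiring might look like the following; again, only the mpage_writepage()/mpage_writepages() signatures (defined just below) come from this file.

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
        return mpage_writepage(page, myfs_get_block, wbc);
}

static int myfs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        /* A non-NULL get_block goes direct-to-BIO; passing NULL would make
         * mpage_writepages() fall back to generic_writepages(). */
        return mpage_writepages(mapping, wbc, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
        .readpage       = myfs_readpage,
        .readpages      = myfs_readpages,
        .writepage      = myfs_writepage,
        .writepages     = myfs_writepages,
};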
+ */ +int +mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + + if (!get_block) + ret = generic_writepages(mapping, wbc); + else { + struct mpage_data mpd = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = get_block, + .use_writepage = 1, + }; + + ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); + if (mpd.bio) + mpage_bio_submit(WRITE, mpd.bio); + } + blk_finish_plug(&plug); + return ret; +} +EXPORT_SYMBOL(mpage_writepages); + +int mpage_writepage(struct page *page, get_block_t get_block, + struct writeback_control *wbc) +{ + struct mpage_data mpd = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = get_block, + .use_writepage = 0, + }; + int ret = __mpage_writepage(page, wbc, &mpd); + if (mpd.bio) + mpage_bio_submit(WRITE, mpd.bio); + return ret; +} +EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/namei.c b/fs/namei.c new file mode 100644 index 00000000..16bda6cd --- /dev/null +++ b/fs/namei.c @@ -0,0 +1,3402 @@ +/* + * linux/fs/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Some corrections by tytso. + */ + +/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname + * lookup logic. + */ +/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* [Feb-1997 T. Schoebel-Theuer] + * Fundamental changes in the pathname lookup mechanisms (namei) + * were necessary because of omirr. The reason is that omirr needs + * to know the _real_ pathname, not the user-supplied one, in case + * of symlinks (and also when transname replacements occur). + * + * The new code replaces the old recursive symlink resolution with + * an iterative one (in case of non-nested symlink chains). It does + * this with calls to _follow_link(). + * As a side effect, dir_namei(), _namei() and follow_link() are now + * replaced with a single function lookup_dentry() that can handle all + * the special cases of the former code. + * + * With the new dcache, the pathname is stored at each inode, at least as + * long as the refcount of the inode is positive. As a side effect, the + * size of the dcache depends on the inode cache and thus is dynamic. + * + * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink + * resolution to correspond with current state of the code. + * + * Note that the symlink resolution is not *completely* iterative. + * There is still a significant amount of tail- and mid- recursion in + * the algorithm. Also, note that _readlink() is not used in + * lookup_dentry(): lookup_dentry() on the result of _readlink() + * may return different results than _follow_link(). Many virtual + * filesystems (including /proc) exhibit this behavior. + */ + +/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: + * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL + * and the name already exists in form of a symlink, try to create the new + * name indicated by the symlink. The old code always complained that the + * name already exists, due to not following the symlink even if its target + * is nonexistent. The new semantics affects also mknod() and link() when + * the name is a symlink pointing to a non-existent name. 
+ * + * I don't know which semantics is the right one, since I have no access + * to standards. But I found by trial that HP-UX 9.0 has the full "new" + * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the + * "old" one. Personally, I think the new semantics is much more logical. + * Note that "ln old new" where "new" is a symlink pointing to a non-existing + * file does succeed in both HP-UX and SunOs, but not in Solaris + * and in the old Linux semantics. + */ + +/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink + * semantics. See the comments in "open_namei" and "do_link" below. + * + * [10-Sep-98 Alan Modra] Another symlink change. + */ + +/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: + * inside the path - always follow. + * in the last component in creation/removal/renaming - never follow. + * if LOOKUP_FOLLOW passed - follow. + * if the pathname has trailing slashes - follow. + * otherwise - don't follow. + * (applied in that order). + * + * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT + * restored for 2.4. This is the last surviving part of old 4.2BSD bug. + * During the 2.4 we need to fix the userland stuff depending on it - + * hopefully we will be able to get rid of that wart in 2.5. So far only + * XEmacs seems to be relying on it... + */ +/* + * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) + * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives + * any extra contention... + */ + +/* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. + * + * POSIX.1 2.4: an empty pathname is invalid (ENOENT). + * PATH_MAX includes the nul terminator --RR. 
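To make the last-component symlink rules above concrete, here is a small user-space illustration in plain C, meant to be run in an empty scratch directory; the file names are made up and only standard POSIX calls are used.

/* Trailing-symlink behaviour: creation with O_EXCL and removal never follow,
 * plain O_CREAT does. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd;

        if (symlink("no-such-file", "dangling") < 0)
                return 1;

        /* O_CREAT|O_EXCL never follows the trailing symlink: the name itself
         * already exists, so this fails with EEXIST. */
        fd = open("dangling", O_CREAT | O_EXCL | O_WRONLY, 0644);
        printf("O_CREAT|O_EXCL: %s\n", fd < 0 ? strerror(errno) : "ok");

        /* Plain O_CREAT follows the link and creates its target instead. */
        fd = open("dangling", O_CREAT | O_WRONLY, 0644);
        printf("O_CREAT: %s\n", fd < 0 ? strerror(errno) : "created target");
        if (fd >= 0)
                close(fd);

        /* Removal never follows: this deletes the symlink, not the target. */
        unlink("dangling");
        printf("target survives: %d\n", access("no-such-file", F_OK) == 0);
        unlink("no-such-file");
        return 0;
}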
+ */ +static int do_getname(const char __user *filename, char *page) +{ + int retval; + unsigned long len = PATH_MAX; + + if (!segment_eq(get_fs(), KERNEL_DS)) { + if ((unsigned long) filename >= TASK_SIZE) + return -EFAULT; + if (TASK_SIZE - (unsigned long) filename < PATH_MAX) + len = TASK_SIZE - (unsigned long) filename; + } + + retval = strncpy_from_user(page, filename, len); + if (retval > 0) { + if (retval < len) + return 0; + return -ENAMETOOLONG; + } else if (!retval) + retval = -ENOENT; + return retval; +} + +static char *getname_flags(const char __user *filename, int flags, int *empty) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + int retval = do_getname(filename, tmp); + + result = tmp; + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } + } + } + audit_getname(result); + return result; +} + +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0, 0); +} + +#ifdef CONFIG_AUDITSYSCALL +void putname(const char *name) +{ + if (unlikely(!audit_dummy_context())) + audit_putname(name); + else + __putname(name); +} +EXPORT_SYMBOL(putname); +#endif + +/* + * This does basic POSIX ACL permission checking + */ +static int acl_permission_check(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) +{ + unsigned int mode = inode->i_mode; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + + if (current_user_ns() != inode_userns(inode)) + goto other_perms; + + if (current_fsuid() == inode->i_uid) + mode >>= 6; + else { + if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { + int error = check_acl(inode, mask, flags); + if (error != -EAGAIN) + return error; + } + + if (in_group_p(inode->i_gid)) + mode >>= 3; + } + +other_perms: + /* + * If the DACs are ok we don't need any capability check. + */ + if ((mask & ~mode) == 0) + return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * @flags: IPERM_FLAG_ flags. + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + * + * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk + * request cannot be satisfied (eg. requires blocking or too much complexity). + * It would then be called again in ref-walk mode. + */ +int generic_permission(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, flags, check_acl); + if (ret != -EACCES) + return ret; + + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. + */ + if (!(mask & MAY_EXEC) || execute_ok(inode)) + if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. 
+ */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +/** + * inode_permission - check for access rights to a given inode + * @inode: inode to check permission on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on an inode. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + */ +int inode_permission(struct inode *inode, int mask) +{ + int retval; + + if (mask & MAY_WRITE) { + umode_t mode = inode->i_mode; + + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + if (inode->i_op->permission) + retval = inode->i_op->permission(inode, mask, 0); + else + retval = generic_permission(inode, mask, 0, + inode->i_op->check_acl); + + if (retval) + return retval; + + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + + return security_inode_permission(inode, mask); +} + +/** + * file_permission - check for additional access rights to a given file + * @file: file to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on an already opened + * file. + * + * Note: + * Do not use this function in new code. All access checks should + * be done using inode_permission(). + */ +int file_permission(struct file *file, int mask) +{ + return inode_permission(file->f_path.dentry->d_inode, mask); +} + +/* + * get_write_access() gets write permission for a file. + * put_write_access() releases this write permission. + * This is used for regular files. + * We cannot support write (and maybe mmap read-write shared) accesses and + * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode + * can have the following values: + * 0: no writers, no VM_DENYWRITE mappings + * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist + * > 0: (i_writecount) users are writing to the file. + * + * Normally we operate on that counter with atomic_{inc,dec} and it's safe + * except for the cases where we don't hold i_writecount yet. Then we need to + * use {get,deny}_write_access() - these functions check the sign and refuse + * to do the change if sign is wrong. Exclusion between them is provided by + * the inode->i_lock spinlock. + */ + +int get_write_access(struct inode * inode) +{ + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) < 0) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_inc(&inode->i_writecount); + spin_unlock(&inode->i_lock); + + return 0; +} + +int deny_write_access(struct file * file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 0) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_dec(&inode->i_writecount); + spin_unlock(&inode->i_lock); + + return 0; +} + +/** + * path_get - get a reference to a path + * @path: path to get the reference to + * + * Given a path increment the reference count to the dentry and the vfsmount. 
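The i_writecount exclusion described above is what user space observes as ETXTBSY. A minimal illustration, assuming /proc is mounted so that /proc/self/exe names the binary currently being executed:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* The running executable holds i_writecount negative (via
         * deny_write_access()), so opening it for writing must fail. */
        int fd = open("/proc/self/exe", O_WRONLY);

        if (fd < 0)
                printf("open for write: %s\n", strerror(errno)); /* ETXTBSY */
        else
                close(fd);
        return 0;
}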
+ */ +void path_get(struct path *path) +{ + mntget(path->mnt); + dget(path->dentry); +} +EXPORT_SYMBOL(path_get); + +/** + * path_put - put a reference to a path + * @path: path to put the reference to + * + * Given a path decrement the reference count to the dentry and the vfsmount. + */ +void path_put(struct path *path) +{ + dput(path->dentry); + mntput(path->mnt); +} +EXPORT_SYMBOL(path_put); + +/* + * Path walking has 2 modes, rcu-walk and ref-walk (see + * Documentation/filesystems/path-lookup.txt). In situations when we can't + * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab + * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * mode. Refcounts are grabbed at the last known good point before rcu-walk + * got stuck, so ref-walk may continue from there. If this is not successful + * (eg. a seqcount has changed), then failure is returned and it's up to caller + * to restart the path walk from the beginning in ref-walk mode. + */ + +/** + * unlazy_walk - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * @dentry: child of nd->path.dentry or NULL + * Returns: 0 on success, -ECHILD on failure + * + * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry + * for ref-walk mode. @dentry must be a path found by a do_lookup call on + * @nd or NULL. Must be called from rcu-walk context. + */ +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +{ + struct fs_struct *fs = current->fs; + struct dentry *parent = nd->path.dentry; + int want_root = 0; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&parent->d_lock); + if (!dentry) { + if (!__d_rcu_to_refcount(parent, nd->seq)) + goto err_parent; + BUG_ON(nd->inode != parent->d_inode); + } else { + if (dentry->d_parent != parent) + goto err_parent; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_child; + /* + * If the sequence check on the child dentry passed, then + * the child has not been removed from its parent. This + * means the parent dentry must be valid and able to take + * a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); + } + spin_unlock(&parent->d_lock); + if (want_root) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; + +err_child: + spin_unlock(&dentry->d_lock); +err_parent: + spin_unlock(&parent->d_lock); +err_root: + if (want_root) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/** + * release_open_intent - free up open intent resources + * @nd: pointer to nameidata + */ +void release_open_intent(struct nameidata *nd) +{ + struct file *file = nd->intent.open.file; + + if (file && !IS_ERR(file)) { + if (file->f_path.dentry == NULL) + put_filp(file); + else + fput(file); + } +} + +static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + return dentry->d_op->d_revalidate(dentry, nd); +} + +static struct dentry * +do_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + /* + * The dentry failed validation. 
+ * If d_revalidate returned 0 attempt to invalidate + * the dentry otherwise d_revalidate is asking us + * to return a fail status. + */ + if (status < 0) { + dput(dentry); + dentry = ERR_PTR(status); + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + +/** + * complete_walk - successful completion of path walk + * @nd: pointer nameidata + * + * If we had been in RCU mode, drop out of it and legitimize nd->path. + * Revalidate the final result, unless we'd already done that during + * the path walk or the filesystem doesn't ask for it. Return 0 on + * success, -error on failure. In case of failure caller does not + * need to drop nd->path. + */ +static int complete_walk(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + int status; + + if (nd->flags & LOOKUP_RCU) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + mntget(nd->path.mnt); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) + return 0; + + /* Note: we do not d_invalidate() */ + status = d_revalidate(dentry, nd); + if (status > 0) + return 0; + + if (!status) + status = -ESTALE; + + path_put(&nd->path); + return status; +} + +/* + * Short-cut version of permission(), for calling on directories + * during pathname resolution. Combines parts of permission() + * and generic_permission(), and tests ONLY for MAY_EXEC permission. + * + * If appropriate, check DAC only. If not appropriate, or + * short-cut DAC fails, then call ->permission() to do more + * complete permission check. 
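The MAY_EXEC (search) check that exec_permission() below applies to every directory crossed during a walk is easy to observe from user space. A sketch with made-up paths; it must run as an unprivileged user, since CAP_DAC_READ_SEARCH would otherwise mask the failure:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        int fd;

        mkdir("outer", 0700);
        mkdir("outer/inner", 0755);
        fd = creat("outer/inner/file", 0644);
        if (fd >= 0)
                close(fd);

        chmod("outer", 0600);           /* readable, but no search bit */
        if (stat("outer/inner/file", &st) < 0)
                printf("lookup: %s\n", strerror(errno));        /* EACCES */

        chmod("outer", 0700);           /* restore so cleanup works */
        unlink("outer/inner/file");
        rmdir("outer/inner");
        rmdir("outer");
        return 0;
}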
+ */ +static inline int exec_permission(struct inode *inode, unsigned int flags) +{ + int ret; + struct user_namespace *ns = inode_userns(inode); + + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC, flags); + } else { + ret = acl_permission_check(inode, MAY_EXEC, flags, + inode->i_op->check_acl); + } + if (likely(!ret)) + goto ok; + if (ret == -ECHILD) + return ret; + + if (ns_capable(ns, CAP_DAC_OVERRIDE) || + ns_capable(ns, CAP_DAC_READ_SEARCH)) + goto ok; + + return ret; +ok: + return security_inode_exec_permission(inode, flags); +} + +static __always_inline void set_root(struct nameidata *nd) +{ + if (!nd->root.mnt) + get_fs_root(current->fs, &nd->root); +} + +static int link_path_walk(const char *, struct nameidata *); + +static __always_inline void set_root_rcu(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } +} + +static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) +{ + int ret; + + if (IS_ERR(link)) + goto fail; + + if (*link == '/') { + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; + } + nd->inode = nd->path.dentry->d_inode; + + ret = link_path_walk(link, nd); + return ret; +fail: + path_put(&nd->path); + return PTR_ERR(link); +} + +static void path_put_conditional(struct path *path, struct nameidata *nd) +{ + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); +} + +static inline void path_to_nameidata(const struct path *path, + struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path->mnt) + mntput(nd->path.mnt); + } + nd->path.mnt = path->mnt; + nd->path.dentry = path->dentry; +} + +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + +static __always_inline int +follow_link(struct path *link, struct nameidata *nd, void **p) +{ + int error; + struct dentry *dentry = link->dentry; + + BUG_ON(nd->flags & LOOKUP_RCU); + + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + + touch_atime(link->mnt, dentry); + nd_set_link(nd, NULL); + + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } + + nd->last_type = LAST_BIND; + *p = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(*p); + if (!IS_ERR(*p)) { + char *s = nd_get_link(nd); + error = 0; + if (s) + error = __vfs_follow_link(nd, s); + else if (nd->last_type == LAST_BIND) { + nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ + path_put(&nd->path); + error = -ELOOP; + } + } + } + return error; +} + +static int follow_up_rcu(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + parent = path->mnt->mnt_parent; + if (parent == path->mnt) + return 0; + 
mountpoint = path->mnt->mnt_mountpoint; + path->dentry = mountpoint; + path->mnt = parent; + return 1; +} + +int follow_up(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + br_read_lock(vfsmount_lock); + parent = path->mnt->mnt_parent; + if (parent == path->mnt) { + br_read_unlock(vfsmount_lock); + return 0; + } + mntget(parent); + mountpoint = dget(path->mnt->mnt_mountpoint); + br_read_unlock(vfsmount_lock); + dput(path->dentry); + path->dentry = mountpoint; + mntput(path->mnt); + path->mnt = parent; + return 1; +} + +/* + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. + */ +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) +{ + struct vfsmount *mnt; + int err; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT + * and this is the terminal part of the path. + */ + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + return -EISDIR; /* we actually want to stop here */ + + /* We don't want to mount if someone's just doing a stat - + * unless they're stat'ing a directory and appended a '/' to + * the name. + * + * We do, however, want to mount if someone wants to open or + * create a file of any type under the mountpoint, wants to + * traverse through the mountpoint or wants to open the + * mounted directory. Also, autofs may mark negative dentries + * as being automount points. These will need the attentions + * of the daemon to instantiate them before they can be used. + */ + if (!(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. + */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); + } + + if (!mnt) /* mount collision */ + return 0; + + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } + err = finish_automount(mnt, path); + + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + return 0; + case 0: + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + return 0; + default: + return err; + } + +} + +/* + * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. 
+ * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) +{ + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ + unsigned managed; + bool need_mntput = false; + int ret = 0; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, false); + if (ret < 0) + break; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + break; + continue; + } + + /* We didn't change the current path point */ + break; + } + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret < 0 ? ret : need_mntput; +} + +int follow_down_one(struct path *path) +{ + struct vfsmount *mounted; + + mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + return 1; + } + return 0; +} + +static inline bool managed_dentry_might_block(struct dentry *dentry) +{ + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && + dentry->d_op->d_manage(dentry, true) < 0); +} + +/* + * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if + * we meet a managed dentry that would need blocking. + */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode) +{ + for (;;) { + struct vfsmount *mounted; + /* + * Don't forget we might have a non-mountpoint managed dentry + * that wants to block transit. + */ + if (unlikely(managed_dentry_might_block(path->dentry))) + return false; + + if (!d_mountpoint(path->dentry)) + break; + + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + break; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->flags |= LOOKUP_JUMPED; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + /* + * Update the inode too. We don't need to re-check the + * dentry sequence number here after this d_inode read, + * because a mount-point is always pinned. 
+ */ + *inode = path->dentry->d_inode; + } + return true; +} + +static void follow_mount_rcu(struct nameidata *nd) +{ + while (d_mountpoint(nd->path.dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); + if (!mounted) + break; + nd->path.mnt = mounted; + nd->path.dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } +} + +static int follow_dotdot_rcu(struct nameidata *nd) +{ + set_root_rcu(nd); + + while (1) { + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; + unsigned seq; + + seq = read_seqcount_begin(&parent->d_seq); + if (read_seqcount_retry(&old->d_seq, nd->seq)) + goto failed; + nd->path.dentry = parent; + nd->seq = seq; + break; + } + if (!follow_up_rcu(&nd->path)) + break; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } + follow_mount_rcu(nd); + nd->inode = nd->path.dentry->d_inode; + return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage( + path->dentry, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + +static void follow_dotdot(struct nameidata *nd) +{ + set_root(nd); + + while(1) { + struct dentry *old = nd->path.dentry; + + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + /* rare case of legitimate dget_parent()... */ + nd->path.dentry = dget_parent(nd->path.dentry); + dput(old); + break; + } + if (!follow_up(&nd->path)) + break; + } + follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; +} + +/* + * Allocate a dentry with name and parent, and perform a parent + * directory ->lookup on it. 
Returns the new dentry, or ERR_PTR + * on error. parent->d_inode->i_mutex must be held. d_lookup must + * have verified that no child exists while under i_mutex. + */ +static struct dentry *d_alloc_and_lookup(struct dentry *parent, + struct qstr *name, struct nameidata *nd) +{ + struct inode *inode = parent->d_inode; + struct dentry *dentry; + struct dentry *old; + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(inode))) + return ERR_PTR(-ENOENT); + + dentry = d_alloc(parent, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); + + old = inode->i_op->lookup(inode, dentry, nd); + if (unlikely(old)) { + dput(dentry); + dentry = old; + } + return dentry; +} + +/* + * It's more convoluted than I'd like it to be, but... it's still fairly + * small and for now I'd prefer to have fast path as straight as possible. + * It _is_ time-critical. + */ +static int do_lookup(struct nameidata *nd, struct qstr *name, + struct path *path, struct inode **inode) +{ + struct vfsmount *mnt = nd->path.mnt; + struct dentry *dentry, *parent = nd->path.dentry; + int need_reval = 1; + int status = 1; + int err; + + /* + * Rename seqlock is not required here because in the off chance + * of a false negative due to a concurrent rename, we're going to + * do the non-racy lookup, below. + */ + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + *inode = nd->inode; + dentry = __d_lookup_rcu(parent, name, &seq, inode); + if (!dentry) + goto unlazy; + + /* Memory barrier in read_seqcount_begin of child is enough */ + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + return -ECHILD; + nd->seq = seq; + + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } + } + path->mnt = mnt; + path->dentry = dentry; + if (unlikely(!__follow_mount_rcu(nd, path, inode))) + goto unlazy; + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + goto unlazy; + return 0; +unlazy: + if (unlazy_walk(nd, dentry)) + return -ECHILD; + } else { + dentry = __d_lookup(parent, name); + } + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); + } + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + if (err) + nd->flags |= LOOKUP_JUMPED; + *inode = path->dentry->d_inode; + return 0; +} + +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (unlazy_walk(nd, NULL)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} + +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return 
-ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; + /* + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. + */ + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, path->dentry))) { + terminate_walk(nd); + return -ECHILD; + } + } + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} + +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; +} + +/* + * Name resolution. + * This is the basic name resolution function, turning a pathname into + * the final dentry. We expect 'base' to be positive and a directory. + * + * Returns 0 and nd will have valid dentry and mnt on success. + * Returns error and drops reference to input namei data on failure. + */ +static int link_path_walk(const char *name, struct nameidata *nd) +{ + struct path next; + int err; + unsigned int lookup_flags = nd->flags; + + while (*name=='/') + name++; + if (!*name) + return 0; + + /* At this point we know we have a real path component. 
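The limit of 40 consecutive symlink follows enforced above can be seen from user space. A sketch, assuming a throw-away working directory since it leaves the chain behind:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char name[32], target[32];
        int i, fd;

        if (creat("link0", 0644) < 0)   /* regular file at the chain's end */
                return 1;
        for (i = 1; i <= 41; i++) {
                snprintf(name, sizeof(name), "link%d", i);
                snprintf(target, sizeof(target), "link%d", i - 1);
                if (symlink(target, name) < 0)
                        return 1;
        }
        fd = open("link40", O_RDONLY);  /* 40 consecutive follows: resolves */
        printf("link40: %s\n", fd < 0 ? strerror(errno) : "ok");
        fd = open("link41", O_RDONLY);  /* 41 follows: ELOOP */
        printf("link41: %s\n", fd < 0 ? strerror(errno) : "ok");
        return 0;
}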
*/ + for(;;) { + unsigned long hash; + struct qstr this; + unsigned int c; + int type; + + nd->flags |= LOOKUP_CONTINUE; + + err = may_lookup(nd); + if (err) + break; + + this.name = name; + c = *(const unsigned char *)name; + + hash = init_name_hash(); + do { + name++; + hash = partial_name_hash(c, hash); + c = *(const unsigned char *)name; + } while (c && (c != '/')); + this.len = name - (const char *) this.name; + this.hash = end_name_hash(hash); + + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } + + /* remove trailing slashes? */ + if (!c) + goto last_component; + while (*++name == '/'); + if (!*name) + goto last_component; + + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; + + if (err) { + err = nested_symlink(&next, nd); + if (err) + return err; + } + err = -ENOTDIR; + if (!nd->inode->i_op->lookup) + break; + continue; + /* here ends the main loop */ + +last_component: + /* Clear LOOKUP_CONTINUE iff it was previously unset */ + nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; + nd->last = this; + nd->last_type = type; + return 0; + } + terminate_walk(nd); + return err; +} + +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) +{ + int retval = 0; + int fput_needed; + struct file *file; + + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags | LOOKUP_JUMPED; + nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + + nd->root.mnt = NULL; + + if (*name=='/') { + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; + } else if (dfd == AT_FDCWD) { + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } + } else { + struct dentry *dentry; + + file = fget_raw_light(dfd, &fput_needed); + retval = -EBADF; + if (!file) + goto out_fail; + + dentry = file->f_path.dentry; + + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; + + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } + + nd->path = file->f_path; + if (flags & LOOKUP_RCU) { + if (fput_needed) + *fp = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } + } + + nd->inode = 
nd->path.dentry->d_inode; + return 0; + +fput_fail: + fput_light(file, fput_needed); +out_fail: + return retval; +} + +static inline int lookup_last(struct nameidata *nd, struct path *path) +{ + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); +} + +/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +static int path_lookupat(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + struct file *base = NULL; + struct path path; + int err; + + /* + * Path walking is largely split up into 2 different synchronisation + * schemes, rcu-walk and ref-walk (explained in + * Documentation/filesystems/path-lookup.txt). These share much of the + * path walk code, but some things particularly setup, cleanup, and + * following mounts are sufficiently divergent that functions are + * duplicated. Typically there is a function foo(), and its RCU + * analogue, foo_rcu(). + * + * -ECHILD is the error number of choice (just to avoid clashes) that + * is returned if some aspect of an rcu-walk fails. Such an error must + * be handled by restarting a traditional ref-walk (which will always + * be able to complete). + */ + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); + + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } + } + + if (!err) + err = complete_walk(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + err = -ENOTDIR; + } + } + + if (base) + fput(base); + + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return err; +} + +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + + if (likely(!retval)) { + if (unlikely(!audit_dummy_context())) { + if (nd->path.dentry && nd->inode) + audit_inode(name, nd->path.dentry); + } + } + return retval; +} + +int kern_path_parent(const char *name, struct nameidata *nd) +{ + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); +} + +int kern_path(const char *name, unsigned int flags, struct path *path) +{ + struct nameidata nd; + int res = do_path_lookup(AT_FDCWD, name, flags, &nd); + if (!res) + *path = nd.path; + return res; +} + +/** + * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair + * @dentry: pointer to dentry of the base directory + * @mnt: pointer to vfs mount of the base directory + * @name: pointer to file name + * @flags: lookup flags + * @nd: pointer to nameidata + */ +int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, + const char *name, unsigned int flags, + struct nameidata *nd) +{ + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, 
name, flags | LOOKUP_ROOT, nd); +} + +static struct dentry *__lookup_hash(struct qstr *name, + struct dentry *base, struct nameidata *nd) +{ + struct inode *inode = base->d_inode; + struct dentry *dentry; + int err; + + err = exec_permission(inode, 0); + if (err) + return ERR_PTR(err); + + /* + * Don't bother with __d_lookup: callers are for creat as + * well as unlink, so a lot of the time it would cost + * a double lookup. + */ + dentry = d_lookup(base, name); + + if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) + dentry = do_revalidate(dentry, nd); + + if (!dentry) + dentry = d_alloc_and_lookup(base, name, nd); + + return dentry; +} + +/* + * Restricted form of lookup. Doesn't follow links, single-component only, + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +static struct dentry *lookup_hash(struct nameidata *nd) +{ + return __lookup_hash(&nd->last, nd->path.dentry, nd); +} + +/** + * lookup_one_len - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. Also note that by using this function the + * nameidata argument is passed to the filesystem methods and a filesystem + * using this helper needs to be prepared for that. + */ +struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +{ + struct qstr this; + unsigned long hash; + unsigned int c; + + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } + this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } + + return __lookup_hash(&this, base, NULL); +} + +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) +{ + struct nameidata nd; + char *tmp = getname_flags(name, flags, empty); + int err = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + + BUG_ON(flags & LOOKUP_PARENT); + + err = do_path_lookup(dfd, tmp, flags, &nd); + putname(tmp); + if (!err) + *path = nd.path; + } + return err; +} + +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, 0); +} + +static int user_path_parent(int dfd, const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. 
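A minimal sketch of the calling convention that the lookup_one_len() kerneldoc above describes, as filesystem code might use it to find a well-known child of a directory it controls; the myfs_find_child() name is hypothetical.

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/string.h>

/* Returns a referenced, positive dentry or ERR_PTR(). */
static struct dentry *myfs_find_child(struct dentry *dir, const char *name)
{
        struct dentry *child;

        /* lookup_one_len() requires the parent's i_mutex to be held. */
        mutex_lock(&dir->d_inode->i_mutex);
        child = lookup_one_len(name, dir, strlen(name));
        mutex_unlock(&dir->d_inode->i_mutex);

        if (IS_ERR(child))
                return child;
        if (!child->d_inode) {          /* negative dentry: no such entry */
                dput(child);
                return ERR_PTR(-ENOENT);
        }
        return child;                   /* caller must dput() when done */
}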
+ */
+static inline int check_sticky(struct inode *dir, struct inode *inode)
+{
+        uid_t fsuid = current_fsuid();
+
+        if (!(dir->i_mode & S_ISVTX))
+                return 0;
+        if (current_user_ns() != inode_userns(inode))
+                goto other_userns;
+        if (inode->i_uid == fsuid)
+                return 0;
+        if (dir->i_uid == fsuid)
+                return 0;
+
+other_userns:
+        return !ns_capable(inode_userns(inode), CAP_FOWNER);
+}
+
+/*
+ * Check whether we can remove a link victim from directory dir, check
+ * whether the type of victim is right.
+ * 1. We can't do it if dir is read-only (done in permission())
+ * 2. We should have write and exec permissions on dir
+ * 3. We can't remove anything from append-only dir
+ * 4. We can't do anything with immutable dir (done in permission())
+ * 5. If the sticky bit on dir is set we should either
+ *     a. be owner of dir, or
+ *     b. be owner of victim, or
+ *     c. have CAP_FOWNER capability
+ * 6. If the victim is append-only or immutable we can't do anything with
+ *    links pointing to it.
+ * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ */
+static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+{
+        int error;
+
+        if (!victim->d_inode)
+                return -ENOENT;
+
+        BUG_ON(victim->d_parent->d_inode != dir);
+        audit_inode_child(victim, dir);
+
+        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+        if (error)
+                return error;
+        if (IS_APPEND(dir))
+                return -EPERM;
+        if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
+            IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+                return -EPERM;
+        if (isdir) {
+                if (!S_ISDIR(victim->d_inode->i_mode))
+                        return -ENOTDIR;
+                if (IS_ROOT(victim))
+                        return -EBUSY;
+        } else if (S_ISDIR(victim->d_inode->i_mode))
+                return -EISDIR;
+        if (IS_DEADDIR(dir))
+                return -ENOENT;
+        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+                return -EBUSY;
+        return 0;
+}
+
+/* Check whether we can create an object with dentry child in directory
+ * dir.
+ * 1. We can't do it if child already exists (open has special treatment for
+ *    this case, but since we are inlined it's OK)
+ * 2. We can't do it if dir is read-only (done in permission())
+ * 3. We should have write and exec permissions on dir
+ * 4. We can't do it if dir is immutable (done in permission())
+ */
+static inline int may_create(struct inode *dir, struct dentry *child)
+{
+        if (child->d_inode)
+                return -EEXIST;
+        if (IS_DEADDIR(dir))
+                return -ENOENT;
+        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * p1 and p2 should be directories on the same fs.
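Rules 7 and 8 of the removal checks above map directly to what user space sees; a small illustration with made-up names:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd;

        mkdir("adir", 0755);
        fd = creat("afile", 0644);
        if (fd >= 0)
                close(fd);

        if (unlink("adir") < 0)         /* victim is a directory */
                printf("unlink(adir): %s\n", strerror(errno));  /* EISDIR */
        if (rmdir("afile") < 0)         /* victim is not a directory */
                printf("rmdir(afile): %s\n", strerror(errno));  /* ENOTDIR */

        rmdir("adir");
        unlink("afile");
        return 0;
}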
+ */ +struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + if (p1 == p2) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + return NULL; + } + + mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + + p = d_ancestor(p2, p1); + if (p) { + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + return p; + } + + p = d_ancestor(p1, p2); + if (p) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + return p; + } + + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + return NULL; +} + +void unlock_rename(struct dentry *p1, struct dentry *p2) +{ + mutex_unlock(&p1->d_inode->i_mutex); + if (p1 != p2) { + mutex_unlock(&p2->d_inode->i_mutex); + mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + } +} + +int vfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ + mode &= S_IALLUGO; + mode |= S_IFREG; + error = security_inode_create(dir, dentry, mode); + if (error) + return error; + error = dir->i_op->create(dir, dentry, mode, nd); + if (!error) + fsnotify_create(dir, dentry); + return error; +} + +static int may_open(struct path *path, int acc_mode, int flag) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + int error; + + /* O_PATH? */ + if (!acc_mode) + return 0; + + if (!inode) + return -ENOENT; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + return -ELOOP; + case S_IFDIR: + if (acc_mode & MAY_WRITE) + return -EISDIR; + break; + case S_IFBLK: + case S_IFCHR: + if (path->mnt->mnt_flags & MNT_NODEV) + return -EACCES; + /*FALLTHRU*/ + case S_IFIFO: + case S_IFSOCK: + flag &= ~O_TRUNC; + break; + } + + error = inode_permission(inode, acc_mode); + if (error) + return error; + + /* + * An append-only file must be opened in append mode for writing. + */ + if (IS_APPEND(inode)) { + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) + return -EPERM; + if (flag & O_TRUNC) + return -EPERM; + } + + /* O_NOATIME can only be set by the owner or superuser */ + if (flag & O_NOATIME && !inode_owner_or_capable(inode)) + return -EPERM; + + /* + * Ensure there are no outstanding leases on the file. + */ + return break_lease(inode, flag); +} + +static int handle_truncate(struct file *filp) +{ + struct path *path = &filp->f_path; + struct inode *inode = path->dentry->d_inode; + int error = get_write_access(inode); + if (error) + return error; + /* + * Refuse to truncate files with mandatory locks held on them. + */ + error = locks_verify_locked(inode); + if (!error) + error = security_path_truncate(path); + if (!error) { + error = do_truncate(path->dentry, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, + filp); + } + put_write_access(inode); + return error; +} + +/* + * Note that while the flag value (low two bits) for sys_open means: + * 00 - read-only + * 01 - write-only + * 10 - read-write + * 11 - special + * it is changed into + * 00 - no permissions needed + * 01 - read-permission + * 10 - write-permission + * 11 - read-write + * for the internal routines (ie open_namei()/follow_link() etc) + * This is more logical, and also allows the 00 "no perm needed" + * to be used for symlinks (where the permissions are checked + * later). 
+ * +*/ +static inline int open_to_namei_flags(int flag) +{ + if ((flag+1) & O_ACCMODE) + flag++; + return flag; +} + +/* + * Handle the last step of open() + */ +static struct file *do_last(struct nameidata *nd, struct path *path, + const struct open_flags *op, const char *pathname) +{ + struct dentry *dir = nd->path.dentry; + struct dentry *dentry; + int open_flag = op->open_flag; + int will_truncate = open_flag & O_TRUNC; + int want_write = 0; + int acc_mode = op->acc_mode; + struct file *filp; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; + + switch (nd->last_type) { + case LAST_DOTDOT: + case LAST_DOT: + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); + /* fallthrough */ + case LAST_ROOT: + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + audit_inode(pathname, nd->path.dentry); + if (open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; + case LAST_BIND: + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + audit_inode(pathname, dir); + goto ok; + } + + if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; + /* we _can_ be in RCU mode here */ + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) + return ERR_PTR(error); + if (error) /* symlink */ + return NULL; + /* sayonara */ + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + + audit_inode(pathname, dir); + error = -EISDIR; + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) + goto exit; + + mutex_lock(&dir->d_inode->i_mutex); + + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); + goto exit; + } + + path->dentry = dentry; + path->mnt = nd->path.mnt; + + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); + /* + * This write is needed to ensure that a + * rw->ro transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in nameidata_to_filp(). + */ + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit_mutex_unlock; + want_write = 1; + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = 0; + acc_mode = MAY_OPEN; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; + goto common; + } + + /* + * It already exists. 
+ */ + mutex_unlock(&dir->d_inode->i_mutex); + audit_inode(pathname, path->dentry); + + error = -EEXIST; + if (open_flag & O_EXCL) + goto exit_dput; + + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; + + if (error) + nd->flags |= LOOKUP_JUMPED; + + error = -ENOENT; + if (!path->dentry->d_inode) + goto exit_dput; + + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + + path_to_nameidata(path, nd); + nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + error = -EISDIR; + if (S_ISDIR(nd->inode->i_mode)) + goto exit; +ok: + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + want_write = 1; + } +common: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto exit; + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } +out: + if (want_write) + mnt_drop_write(nd->path.mnt); + path_put(&nd->path); + return filp; + +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +exit_dput: + path_put_conditional(path, nd); +exit: + filp = ERR_PTR(error); + goto out; +} + +static struct file *path_openat(int dfd, const char *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) +{ + struct file *base = NULL; + struct file *filp; + struct path path; + int error; + + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); + + filp->f_flags = op->open_flag; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; + + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); + if (unlikely(error)) + goto out_filp; + + current->total_link_count = 0; + error = link_path_walk(pathname, nd); + if (unlikely(error)) + goto out_filp; + + filp = do_last(nd, &path, op, pathname); + while (unlikely(!filp)) { /* trailing symlink */ + struct path link = path; + void *cookie; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + filp = ERR_PTR(-ELOOP); + break; + } + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + filp = ERR_PTR(error); + else + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); + } +out: + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); + if (base) + fput(base); + release_open_intent(nd); + return filp; + +out_filp: + filp = ERR_PTR(error); + goto out; +} + +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *file; + + 
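/* + * Like do_filp_open() above, this tries an RCU-walk first (LOOKUP_RCU), + * falls back to ref-walk on -ECHILD, and retries with LOOKUP_REVAL if the + * cached dentries turn out to be stale (-ESTALE). The walk is pinned to + * the caller-supplied root via LOOKUP_ROOT. + */ + 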
nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; +} + +/** + * lookup_create - lookup a dentry, creating it if it doesn't exist + * @nd: nameidata info + * @is_dir: directory flag + * + * Simple function to lookup and return a dentry and create it + * if it doesn't exist. Is SMP-safe. + * + * Returns with nd->path.dentry->d_inode->i_mutex locked. + */ +struct dentry *lookup_create(struct nameidata *nd, int is_dir) +{ + struct dentry *dentry = ERR_PTR(-EEXIST); + + mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ + if (nd->last_type != LAST_NORM) + goto fail; + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL; + nd->intent.open.flags = O_EXCL; + + /* + * Do the final lookup. + */ + dentry = lookup_hash(nd); + if (IS_ERR(dentry)) + goto fail; + + if (dentry->d_inode) + goto eexist; + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. + */ + if (unlikely(!is_dir && nd->last.name[nd->last.len])) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } + return dentry; +eexist: + dput(dentry); + dentry = ERR_PTR(-EEXIST); +fail: + return dentry; +} +EXPORT_SYMBOL_GPL(lookup_create); + +int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && + !ns_capable(inode_userns(dir), CAP_MKNOD)) + return -EPERM; + + if (!dir->i_op->mknod) + return -EPERM; + + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; + + error = security_inode_mknod(dir, dentry, mode, dev); + if (error) + return error; + + error = dir->i_op->mknod(dir, dentry, mode, dev); + if (!error) + fsnotify_create(dir, dentry); + return error; +} + +static int may_mknod(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + case 0: /* zero mode translates to S_IFREG */ + return 0; + case S_IFDIR: + return -EPERM; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, + unsigned, dev) +{ + int error; + char *tmp; + struct dentry *dentry; + struct nameidata nd; + + if (S_ISDIR(mode)) + return -EPERM; + + error = user_path_parent(dfd, filename, &nd, &tmp); + if (error) + return error; + + dentry = lookup_create(&nd, 0); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_unlock; + } + if (!IS_POSIXACL(nd.path.dentry->d_inode)) + mode &= ~current_umask(); + error = may_mknod(mode); + if (error) + goto out_dput; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_mknod(&nd.path, dentry, mode, dev); + if (error) + goto out_drop_write; + switch (mode & S_IFMT) { + case 0: case S_IFREG: + error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); + break; + case S_IFCHR: case S_IFBLK: + error = 
vfs_mknod(nd.path.dentry->d_inode,dentry,mode, + new_decode_dev(dev)); + break; + case S_IFIFO: case S_IFSOCK: + error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); + break; + } +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + putname(tmp); + + return error; +} + +SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev) +{ + return sys_mknodat(AT_FDCWD, filename, mode, dev); +} + +int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->mkdir) + return -EPERM; + + mode &= (S_IRWXUGO|S_ISVTX); + error = security_inode_mkdir(dir, dentry, mode); + if (error) + return error; + + error = dir->i_op->mkdir(dir, dentry, mode); + if (!error) + fsnotify_mkdir(dir, dentry); + return error; +} + +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) +{ + int error = 0; + char * tmp; + struct dentry *dentry; + struct nameidata nd; + + error = user_path_parent(dfd, pathname, &nd, &tmp); + if (error) + goto out_err; + + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + if (!IS_POSIXACL(nd.path.dentry->d_inode)) + mode &= ~current_umask(); + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_mkdir(&nd.path, dentry, mode); + if (error) + goto out_drop_write; + error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + putname(tmp); +out_err: + return error; +} + +SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) +{ + return sys_mkdirat(AT_FDCWD, pathname, mode); +} + +/* + * The dentry_unhash() helper will try to drop the dentry early: we + * should have a usage count of 2 if we're the only user of this + * dentry, and if that is true (possibly after pruning the dcache), + * then we drop the dentry now. + * + * A low-level filesystem can, if it chooses, legally + * do a + * + * if (!d_unhashed(dentry)) + * return -EBUSY; + * + * if it cannot handle the case of removing a directory + * that is still in use by something else.
+ */ +void dentry_unhash(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + spin_lock(&dentry->d_lock); + if (dentry->d_count == 1) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +int vfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error = may_delete(dir, dentry, 1); + + if (error) + return error; + + if (!dir->i_op->rmdir) + return -EPERM; + + dget(dentry); + mutex_lock(&dentry->d_inode->i_mutex); + + error = -EBUSY; + if (d_mountpoint(dentry)) + goto out; + + error = security_inode_rmdir(dir, dentry); + if (error) + goto out; + + shrink_dcache_parent(dentry); + error = dir->i_op->rmdir(dir, dentry); + if (error) + goto out; + + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + +out: + mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); + if (!error) + d_delete(dentry); + return error; +} + +static long do_rmdir(int dfd, const char __user *pathname) +{ + int error = 0; + char * name; + struct dentry *dentry; + struct nameidata nd; + + error = user_path_parent(dfd, pathname, &nd, &name); + if (error) + return error; + + switch(nd.last_type) { + case LAST_DOTDOT: + error = -ENOTEMPTY; + goto exit1; + case LAST_DOT: + error = -EINVAL; + goto exit1; + case LAST_ROOT: + error = -EBUSY; + goto exit1; + } + + nd.flags &= ~LOOKUP_PARENT; + + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto exit2; + if (!dentry->d_inode) { + error = -ENOENT; + goto exit3; + } + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit3; + error = security_path_rmdir(&nd.path, dentry); + if (error) + goto exit4; + error = vfs_rmdir(nd.path.dentry->d_inode, dentry); +exit4: + mnt_drop_write(nd.path.mnt); +exit3: + dput(dentry); +exit2: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +exit1: + path_put(&nd.path); + putname(name); + return error; +} + +SYSCALL_DEFINE1(rmdir, const char __user *, pathname) +{ + return do_rmdir(AT_FDCWD, pathname); +} + +int vfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error = may_delete(dir, dentry, 0); + + if (error) + return error; + + if (!dir->i_op->unlink) + return -EPERM; + + mutex_lock(&dentry->d_inode->i_mutex); + if (d_mountpoint(dentry)) + error = -EBUSY; + else { + error = security_inode_unlink(dir, dentry); + if (!error) { + error = dir->i_op->unlink(dir, dentry); + if (!error) + dont_mount(dentry); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + + /* We don't d_delete() NFS sillyrenamed files--they still exist. */ + if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + fsnotify_link_count(dentry->d_inode); + d_delete(dentry); + } + + return error; +} + +/* + * Make sure that the actual truncation of the file will occur outside its + * directory's i_mutex. Truncate can take a long time if there is a lot of + * writeout happening, and we don't want to prevent access to the directory + * while waiting on the I/O. + */ +static long do_unlinkat(int dfd, const char __user *pathname) +{ + int error; + char *name; + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; + + error = user_path_parent(dfd, pathname, &nd, &name); + if (error) + return error; + + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; + + nd.flags &= ~LOOKUP_PARENT; + + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? 
Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; + inode = dentry->d_inode; + if (!inode) + goto slashes; + ihold(inode); + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit2; + error = security_path_unlink(&nd.path, dentry); + if (error) + goto exit3; + error = vfs_unlink(nd.path.dentry->d_inode, dentry); +exit3: + mnt_drop_write(nd.path.mnt); + exit2: + dput(dentry); + } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (inode) + iput(inode); /* truncate the inode here */ +exit1: + path_put(&nd.path); + putname(name); + return error; + +slashes: + error = !dentry->d_inode ? -ENOENT : + S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + goto exit2; +} + +SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) +{ + if ((flag & ~AT_REMOVEDIR) != 0) + return -EINVAL; + + if (flag & AT_REMOVEDIR) + return do_rmdir(dfd, pathname); + + return do_unlinkat(dfd, pathname); +} + +SYSCALL_DEFINE1(unlink, const char __user *, pathname) +{ + return do_unlinkat(AT_FDCWD, pathname); +} + +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->symlink) + return -EPERM; + + error = security_inode_symlink(dir, dentry, oldname); + if (error) + return error; + + error = dir->i_op->symlink(dir, dentry, oldname); + if (!error) + fsnotify_create(dir, dentry); + return error; +} + +SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + int error; + char *from; + char *to; + struct dentry *dentry; + struct nameidata nd; + + from = getname(oldname); + if (IS_ERR(from)) + return PTR_ERR(from); + + error = user_path_parent(newdfd, newname, &nd, &to); + if (error) + goto out_putname; + + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_symlink(&nd.path, dentry, from); + if (error) + goto out_drop_write; + error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + putname(to); +out_putname: + putname(from); + return error; +} + +SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) +{ + return sys_symlinkat(oldname, AT_FDCWD, newname); +} + +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A link to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + if (!dir->i_op->link) + return -EPERM; + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + error = security_inode_link(old_dentry, dir, new_dentry); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_link(dir, inode, new_dentry); + return error; +} + +/* + * Hardlinks are often used in delicate situations. 
We avoid + * security-related surprises by not following symlinks on the + * newname. --KAB + * + * We don't follow them on the oldname either to be compatible + * with linux 2.0, and to avoid hard-linking to directories + * and other special files. --ADM + */ +SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, int, flags) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int how = 0; + int error; + char *to; + + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH. + * This ensures that not everyone will be able to create + * a hard link using the passed file descriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; + + error = user_path_at(olddfd, oldname, how, &old_path); + if (error) + return error; + + error = user_path_parent(newdfd, newname, &nd, &to); + if (error) + goto out; + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto out_unlock; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_link(old_path.dentry, &nd.path, new_dentry); + if (error) + goto out_drop_write; + error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} + +SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) +{ + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} + +/* + * The worst of all namespace operations - renaming directory. "Perverted" + * doesn't even start to describe it. Somebody in UCB had a heck of a trip... + * Problems: + * a) we can get into loop creation. Check is done in is_subdir(). + * b) race potential - two innocent renames can create a loop together. + * That's where 4.4 screws up. Current fix: serialization on + * sb->s_vfs_rename_mutex. We might be more accurate, but that's another + * story. + * c) we have to lock _three_ objects - parents and victim (if it exists). + * And that - after we got ->i_mutex on parents (until then we don't know + * whether the target exists). Solution: try to be smart with locking + * order for inodes. We rely on the fact that tree topology may change + * only under ->s_vfs_rename_mutex _and_ that parent of the object we + * move will be locked. Thus we can rank directories by the tree + * (ancestors first) and rank all non-directories after them. + * That works since everybody except rename does "lock parent, lookup, + * lock child" and rename is under ->s_vfs_rename_mutex. + * HOWEVER, it relies on the assumption that any object with ->lookup() + * has no more than 1 dentry. If "hybrid" objects will ever appear, + * we'd better make sure that there's no link(2) for them. + * d) conversion from fhandle to dentry may come in the wrong moment - when + * we are removing the target. Solution: we will have to grab ->i_mutex + * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on + * ->i_mutex on parents, which works but leads to some truly excessive + * locking].
+ */ +static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error = 0; + struct inode *target = new_dentry->d_inode; + + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + error = inode_permission(old_dentry->d_inode, MAY_WRITE); + if (error) + return error; + } + + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + + dget(new_dentry); + if (target) + mutex_lock(&target->i_mutex); + + error = -EBUSY; + if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) + goto out; + + if (target) + shrink_dcache_parent(new_dentry); + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + + if (target) { + target->i_flags |= S_DEAD; + dont_mount(new_dentry); + } +out: + if (target) + mutex_unlock(&target->i_mutex); + dput(new_dentry); + if (!error) + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry,new_dentry); + return error; +} + +static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *target = new_dentry->d_inode; + int error; + + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + + dget(new_dentry); + if (target) + mutex_lock(&target->i_mutex); + + error = -EBUSY; + if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + goto out; + + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + + if (target) + dont_mount(new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); +out: + if (target) + mutex_unlock(&target->i_mutex); + dput(new_dentry); + return error; +} + +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error; + int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + const unsigned char *old_name; + + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + + error = may_delete(old_dir, old_dentry, is_dir); + if (error) + return error; + + if (!new_dentry->d_inode) + error = may_create(new_dir, new_dentry); + else + error = may_delete(new_dir, new_dentry, is_dir); + if (error) + return error; + + if (!old_dir->i_op->rename) + return -EPERM; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + + if (is_dir) + error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); + else + error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); + if (!error) + fsnotify_move(old_dir, new_dir, old_name, is_dir, + new_dentry->d_inode, old_dentry); + fsnotify_oldname_free(old_name); + + return error; +} + +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + struct dentry *old_dir, *new_dir; + struct dentry *old_dentry, *new_dentry; + struct dentry *trap; + struct nameidata oldnd, newnd; + char *from; + char *to; + int error; + + error = user_path_parent(olddfd, oldname, &oldnd, &from); + if (error) + goto exit; + + error = user_path_parent(newdfd, newname, &newnd, &to); + if (error) + goto exit1; + + error = -EXDEV; + if (oldnd.path.mnt != newnd.path.mnt) + goto exit2; + + old_dir = oldnd.path.dentry; + error = -EBUSY; + if (oldnd.last_type != LAST_NORM) + goto exit2; + + new_dir = 
newnd.path.dentry; + if (newnd.last_type != LAST_NORM) + goto exit2; + + oldnd.flags &= ~LOOKUP_PARENT; + newnd.flags &= ~LOOKUP_PARENT; + newnd.flags |= LOOKUP_RENAME_TARGET; + + trap = lock_rename(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; + /* source must exist */ + error = -ENOENT; + if (!old_dentry->d_inode) + goto exit4; + /* unless the source is a directory trailing slashes give -ENOTDIR */ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + error = -ENOTDIR; + if (oldnd.last.name[oldnd.last.len]) + goto exit4; + if (newnd.last.name[newnd.last.len]) + goto exit4; + } + /* source should not be ancestor of target */ + error = -EINVAL; + if (old_dentry == trap) + goto exit4; + new_dentry = lookup_hash(&newnd); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + /* target should not be an ancestor of source */ + error = -ENOTEMPTY; + if (new_dentry == trap) + goto exit5; + + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit5; + error = security_path_rename(&oldnd.path, old_dentry, + &newnd.path, new_dentry); + if (error) + goto exit6; + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); +exit6: + mnt_drop_write(oldnd.path.mnt); +exit5: + dput(new_dentry); +exit4: + dput(old_dentry); +exit3: + unlock_rename(new_dir, old_dir); +exit2: + path_put(&newnd.path); + putname(to); +exit1: + path_put(&oldnd.path); + putname(from); +exit: + return error; +} + +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +{ + return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); +} + +int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +{ + int len; + + len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; +out: + return len; +} + +/* + * A helper for ->readlink(). This should be used *ONLY* for symlinks that + * have ->follow_link() touching nd only in nd_set_link(). Using (or not + * using) it for any given inode is up to filesystem. 
+ */ +int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct nameidata nd; + void *cookie; + int res; + + nd.depth = 0; + cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); + if (IS_ERR(cookie)) + return PTR_ERR(cookie); + + res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + return res; +} + +int vfs_follow_link(struct nameidata *nd, const char *link) +{ + return __vfs_follow_link(nd, link); +} + +/* get the link contents into pagecache */ +static char *page_getlink(struct dentry * dentry, struct page **ppage) +{ + char *kaddr; + struct page *page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return (char*)page; + *ppage = page; + kaddr = kmap(page); + nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + return kaddr; +} + +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct page *page = NULL; + char *s = page_getlink(dentry, &page); + int res = vfs_readlink(dentry,buffer,buflen,s); + if (page) { + kunmap(page); + page_cache_release(page); + } + return res; +} + +void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + nd_set_link(nd, page_getlink(dentry, &page)); + return page; +} + +void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + struct page *page = cookie; + + if (page) { + kunmap(page); + page_cache_release(page); + } +} + +/* + * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS + */ +int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + int err; + char *kaddr; + unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + if (nofs) + flags |= AOP_FLAG_NOFS; + +retry: + err = pagecache_write_begin(NULL, mapping, 0, len-1, + flags, &page, &fsdata); + if (err) + goto fail; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, len-1); + kunmap_atomic(kaddr, KM_USER0); + + err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, + page, fsdata); + if (err < 0) + goto fail; + if (err < len-1) + goto retry; + + mark_inode_dirty(inode); + return 0; +fail: + return err; +} + +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); +} + +const struct inode_operations page_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; + +EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); +EXPORT_SYMBOL(follow_down); +EXPORT_SYMBOL(follow_up); +EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ +EXPORT_SYMBOL(getname); +EXPORT_SYMBOL(lock_rename); +EXPORT_SYMBOL(lookup_one_len); +EXPORT_SYMBOL(page_follow_link_light); +EXPORT_SYMBOL(page_put_link); +EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); +EXPORT_SYMBOL(page_symlink); +EXPORT_SYMBOL(page_symlink_inode_operations); +EXPORT_SYMBOL(kern_path_parent); +EXPORT_SYMBOL(kern_path); +EXPORT_SYMBOL(vfs_path_lookup); +EXPORT_SYMBOL(inode_permission); +EXPORT_SYMBOL(file_permission); +EXPORT_SYMBOL(unlock_rename); +EXPORT_SYMBOL(vfs_create); +EXPORT_SYMBOL(vfs_follow_link); +EXPORT_SYMBOL(vfs_link); +EXPORT_SYMBOL(vfs_mkdir); 
+EXPORT_SYMBOL(vfs_mknod); +EXPORT_SYMBOL(generic_permission); +EXPORT_SYMBOL(vfs_readlink); +EXPORT_SYMBOL(vfs_rename); +EXPORT_SYMBOL(vfs_rmdir); +EXPORT_SYMBOL(vfs_symlink); +EXPORT_SYMBOL(vfs_unlink); +EXPORT_SYMBOL(dentry_unhash); +EXPORT_SYMBOL(generic_readlink); diff --git a/fs/namespace.c b/fs/namespace.c new file mode 100644 index 00000000..b3d8f51c --- /dev/null +++ b/fs/namespace.c @@ -0,0 +1,2730 @@ +/* + * linux/fs/namespace.c + * + * (C) Copyright Al Viro 2000, 2001 + * Released under GPL v2. + * + * Based on code from fs/super.c, copyright Linus Torvalds and others. + * Heavily rewritten. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pnode.h" +#include "internal.h" + +#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) +#define HASH_SIZE (1UL << HASH_SHIFT) + +static int event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); +static int mnt_id_start = 0; +static int mnt_group_start = 1; + +static struct list_head *mount_hashtable __read_mostly; +static struct kmem_cache *mnt_cache __read_mostly; +static struct rw_semaphore namespace_sem; + +/* /sys/fs */ +struct kobject *fs_kobj; +EXPORT_SYMBOL_GPL(fs_kobj); + +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. + */ +DEFINE_BRLOCK(vfsmount_lock); + +static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); + tmp += ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> HASH_SHIFT); + return tmp & (HASH_SIZE - 1); +} + +#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) + +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialize with freeing. 
+ */ +static int mnt_alloc_id(struct vfsmount *mnt) +{ + int res; + +retry: + ida_pre_get(&mnt_id_ida, GFP_KERNEL); + spin_lock(&mnt_id_lock); + res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); + if (!res) + mnt_id_start = mnt->mnt_id + 1; + spin_unlock(&mnt_id_lock); + if (res == -EAGAIN) + goto retry; + + return res; +} + +static void mnt_free_id(struct vfsmount *mnt) +{ + int id = mnt->mnt_id; + spin_lock(&mnt_id_lock); + ida_remove(&mnt_id_ida, id); + if (mnt_id_start > id) + mnt_id_start = id; + spin_unlock(&mnt_id_lock); +} + +/* + * Allocate a new peer group ID + * + * mnt_group_ida is protected by namespace_sem + */ +static int mnt_alloc_group_id(struct vfsmount *mnt) +{ + int res; + + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) + return -ENOMEM; + + res = ida_get_new_above(&mnt_group_ida, + mnt_group_start, + &mnt->mnt_group_id); + if (!res) + mnt_group_start = mnt->mnt_group_id + 1; + + return res; +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct vfsmount *mnt) +{ + int id = mnt->mnt_group_id; + ida_remove(&mnt_group_ida, id); + if (mnt_group_start > id) + mnt_group_start = id; + mnt->mnt_group_id = 0; +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +static inline void mnt_set_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_write(mnt->mnt_pcp->mnt_count, n); +#else + mnt->mnt_count = n; +#endif +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_inc_count(struct vfsmount *mnt) +{ + mnt_add_count(mnt, 1); +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_dec_count(struct vfsmount *mnt) +{ + mnt_add_count(mnt, -1); +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif +} + +static struct vfsmount *alloc_vfsmnt(const char *name) +{ + struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + if (mnt) { + int err; + + err = mnt_alloc_id(mnt); + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; + } + +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + + INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_expire); + INIT_LIST_HEAD(&mnt->mnt_share); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + INIT_LIST_HEAD(&mnt->mnt_slave); +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); +#endif + } + return mnt; + +#ifdef CONFIG_SMP +out_free_devname: + kfree(mnt->mnt_devname); +#endif +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). 
+ * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly outside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This cannot and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +static inline void mnt_inc_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_inc(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers++; +#endif +} + +static inline void mnt_dec_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_dec(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers--; +#endif +} + +static unsigned int mnt_get_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; + } + + return count; +#else + return mnt->mnt_writers; +#endif +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/** + * mnt_want_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This tells the low-level filesystem that a write is + * about to be performed to it, and makes sure that + * writes are allowed before returning success. When + * the write operation is finished, mnt_drop_write() + * must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *mnt) +{ + int ret = 0; + + preempt_disable(); + mnt_inc_writers(mnt); + /* + * The store to mnt_inc_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (mnt->mnt_flags & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); + if (__mnt_is_readonly(mnt)) { + mnt_dec_writers(mnt); + ret = -EROFS; + goto out; + } +out: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference.
+ */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + mnt_inc_writers(mnt); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file whose mount is to be written to + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + return mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} +EXPORT_SYMBOL_GPL(mnt_want_write_file); + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + preempt_disable(); + mnt_dec_writers(mnt); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +static int mnt_make_readonly(struct vfsmount *mnt) +{ + int ret = 0; + + br_write_lock(vfsmount_lock); + mnt->mnt_flags |= MNT_WRITE_HOLD; + /* + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. + */ + smp_mb(); + + /* + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. + */ + if (mnt_get_writers(mnt) > 0) + ret = -EBUSY; + else + mnt->mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. + */ + smp_wmb(); + mnt->mnt_flags &= ~MNT_WRITE_HOLD; + br_write_unlock(vfsmount_lock); + return ret; +} + +static void __mnt_unmake_readonly(struct vfsmount *mnt) +{ + br_write_lock(vfsmount_lock); + mnt->mnt_flags &= ~MNT_READONLY; + br_write_unlock(vfsmount_lock); +} + +static void free_vfsmnt(struct vfsmount *mnt) +{ + kfree(mnt->mnt_devname); + mnt_free_id(mnt); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_pcp); +#endif + kmem_cache_free(mnt_cache, mnt); +} + +/* + * find the first or last mount at @dentry on vfsmount @mnt depending on + * @dir. If @dir is set return the first mount else return the last mount. + * vfsmount_lock must be held for read or write. + */ +struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, + int dir) +{ + struct list_head *head = mount_hashtable + hash(mnt, dentry); + struct list_head *tmp = head; + struct vfsmount *p, *found = NULL; + + for (;;) { + tmp = dir ?
tmp->next : tmp->prev; + p = NULL; + if (tmp == head) + break; + p = list_entry(tmp, struct vfsmount, mnt_hash); + if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { + found = p; + break; + } + } + return found; +} + +/* + * lookup_mnt increments the ref count before returning + * the vfsmount struct. + */ +struct vfsmount *lookup_mnt(struct path *path) +{ + struct vfsmount *child_mnt; + + br_read_lock(vfsmount_lock); + if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) + mntget(child_mnt); + br_read_unlock(vfsmount_lock); + return child_mnt; +} + +static inline int check_mnt(struct vfsmount *mnt) +{ + return mnt->mnt_ns == current->nsproxy->mnt_ns; +} + +/* + * vfsmount lock must be held for write + */ +static void touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns) { + ns->event = ++event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * vfsmount lock must be held for write + */ +static void __touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns && ns->event != event) { + ns->event = event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * Clear dentry's mounted state if it has no remaining mounts. + * vfsmount_lock must be held for write. + */ +static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned u; + + for (u = 0; u < HASH_SIZE; u++) { + struct vfsmount *p; + + list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { + if (p->mnt_mountpoint == dentry) + return; + } + } + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); +} + +/* + * vfsmount lock must be held for write + */ +static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +{ + old_path->dentry = mnt->mnt_mountpoint; + old_path->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_hash); + dentry_reset_mounted(old_path->mnt, old_path->dentry); +} + +/* + * vfsmount lock must be held for write + */ +void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, + struct vfsmount *child_mnt) +{ + child_mnt->mnt_parent = mntget(mnt); + child_mnt->mnt_mountpoint = dget(dentry); + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); +} + +/* + * vfsmount lock must be held for write + */ +static void attach_mnt(struct vfsmount *mnt, struct path *path) +{ + mnt_set_mountpoint(path->mnt, path->dentry, mnt); + list_add_tail(&mnt->mnt_hash, mount_hashtable + + hash(path->mnt, path->dentry)); + list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); +} + +static inline void __mnt_make_longterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_inc(&mnt->mnt_longterm); +#endif +} + +/* needs vfsmount lock for write */ +static inline void __mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_dec(&mnt->mnt_longterm); +#endif +} + +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct vfsmount *mnt) +{ + struct vfsmount *parent = mnt->mnt_parent; + struct vfsmount *m; + LIST_HEAD(head); + struct mnt_namespace *n = parent->mnt_ns; + + BUG_ON(parent == mnt); + + list_add_tail(&head, &mnt->mnt_list); + list_for_each_entry(m, &head, mnt_list) { + m->mnt_ns = n; + __mnt_make_longterm(m); + } + + list_splice(&head, n->list.prev); + + list_add_tail(&mnt->mnt_hash, mount_hashtable + + hash(parent, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + touch_mnt_namespace(n); +} + +static struct vfsmount 
*next_mnt(struct vfsmount *p, struct vfsmount *root) +{ + struct list_head *next = p->mnt_mounts.next; + if (next == &p->mnt_mounts) { + while (1) { + if (p == root) + return NULL; + next = p->mnt_child.next; + if (next != &p->mnt_parent->mnt_mounts) + break; + p = p->mnt_parent; + } + } + return list_entry(next, struct vfsmount, mnt_child); +} + +static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +{ + struct list_head *prev = p->mnt_mounts.prev; + while (prev != &p->mnt_mounts) { + p = list_entry(prev, struct vfsmount, mnt_child); + prev = p->mnt_mounts.prev; + } + return p; +} + +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct vfsmount *mnt; + struct dentry *root; + + if (!type) + return ERR_PTR(-ENODEV); + + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flags & MS_KERNMOUNT) + mnt->mnt_flags = MNT_INTERNAL; + + root = mount_fs(type, flags, name, data); + if (IS_ERR(root)) { + free_vfsmnt(mnt); + return ERR_CAST(root); + } + + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + return mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, + int flag) +{ + struct super_block *sb = old->mnt_sb; + struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); + + if (mnt) { + if (flag & (CL_SLAVE | CL_PRIVATE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + int err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; + atomic_inc(&sb->s_active); + mnt->mnt_sb = sb; + mnt->mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + + if (flag & CL_SLAVE) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); + } + } + return mnt; + + out_free: + free_vfsmnt(mnt); + return NULL; +} + +static inline void mntfree(struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. 
+ */ + WARN_ON(mnt_get_writers(mnt)); + fsnotify_vfsmount_delete(mnt); + dput(mnt->mnt_root); + free_vfsmnt(mnt); + deactivate_super(sb); +} + +static void mntput_no_expire(struct vfsmount *mnt) +{ +put_again: +#ifdef CONFIG_SMP + br_read_lock(vfsmount_lock); + if (likely(atomic_read(&mnt->mnt_longterm))) { + mnt_dec_count(mnt); + br_read_unlock(vfsmount_lock); + return; + } + br_read_unlock(vfsmount_lock); + + br_write_lock(vfsmount_lock); + mnt_dec_count(mnt); + if (mnt_get_count(mnt)) { + br_write_unlock(vfsmount_lock); + return; + } +#else + mnt_dec_count(mnt); + if (likely(mnt_get_count(mnt))) + return; + br_write_lock(vfsmount_lock); +#endif + if (unlikely(mnt->mnt_pinned)) { + mnt_add_count(mnt, mnt->mnt_pinned + 1); + mnt->mnt_pinned = 0; + br_write_unlock(vfsmount_lock); + acct_auto_close_mnt(mnt); + goto put_again; + } + br_write_unlock(vfsmount_lock); + mntfree(mnt); +} + +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + if (unlikely(mnt->mnt_expiry_mark)) + mnt->mnt_expiry_mark = 0; + mntput_no_expire(mnt); + } +} +EXPORT_SYMBOL(mntput); + +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_inc_count(mnt); + return mnt; +} +EXPORT_SYMBOL(mntget); + +void mnt_pin(struct vfsmount *mnt) +{ + br_write_lock(vfsmount_lock); + mnt->mnt_pinned++; + br_write_unlock(vfsmount_lock); +} +EXPORT_SYMBOL(mnt_pin); + +void mnt_unpin(struct vfsmount *mnt) +{ + br_write_lock(vfsmount_lock); + if (mnt->mnt_pinned) { + mnt_inc_count(mnt); + mnt->mnt_pinned--; + } + br_write_unlock(vfsmount_lock); +} +EXPORT_SYMBOL(mnt_unpin); + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\"); +} + +/* + * Simple .show_options callback for filesystems which don't want to + * implement more complex mount option showing. + * + * See also save_mount_options(). + */ +int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + const char *options; + + rcu_read_lock(); + options = rcu_dereference(mnt->mnt_sb->s_options); + + if (options != NULL && options[0]) { + seq_putc(m, ','); + mangle(m, options); + } + rcu_read_unlock(); + + return 0; +} +EXPORT_SYMBOL(generic_show_options); + +/* + * If filesystem uses generic_show_options(), this function should be + * called from the fill_super() callback. + * + * The .remount_fs callback usually needs to be handled in a special + * way, to make sure, that previous options are not overwritten if the + * remount fails. + * + * Also note, that if the filesystem's .remount_fs function doesn't + * reset all options to their default value, but changes only newly + * given options, then the displayed options will not reflect reality + * any more. 
+ */ +void save_mount_options(struct super_block *sb, char *options) +{ + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); +} +EXPORT_SYMBOL(save_mount_options); + +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + +#ifdef CONFIG_PROC_FS +/* iterator */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_mounts *p = m->private; + + down_read(&namespace_sem); + return seq_list_start(&p->ns->list, *pos); +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_mounts *p = m->private; + + return seq_list_next(v, &p->ns->list, pos); +} + +static void m_stop(struct seq_file *m, void *v) +{ + up_read(&namespace_sem); +} + +int mnt_had_events(struct proc_mounts *p) +{ + struct mnt_namespace *ns = p->ns; + int res = 0; + + br_read_lock(vfsmount_lock); + if (p->event != ns->event) { + p->event = ns->event; + res = 1; + } + br_read_unlock(vfsmount_lock); + + return res; +} + +struct proc_fs_info { + int flag; + const char *str; +}; + +static int show_sb_opts(struct seq_file *m, struct super_block *sb) +{ + static const struct proc_fs_info fs_info[] = { + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + return security_sb_show_options(m, sb); +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { + { MNT_NOSUID, ",nosuid" }, + { MNT_NODEV, ",nodev" }, + { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + int err = 0; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + + if (mnt->mnt_sb->s_op->show_devname) { + err = mnt->mnt_sb->s_op->show_devname(m, mnt); + if (err) + goto out; + } else { + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + } + seq_putc(m, ' '); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + show_type(m, mnt->mnt_sb); + seq_puts(m, __mnt_is_readonly(mnt) ? 
" ro" : " rw"); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; + show_mnt_opts(m, mnt); + if (mnt->mnt_sb->s_op->show_options) + err = mnt->mnt_sb->s_op->show_options(m, mnt); + seq_puts(m, " 0 0\n"); +out: + return err; +} + +const struct seq_operations mounts_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_vfsmnt +}; + +static int show_mountinfo(struct seq_file *m, void *v) +{ + struct proc_mounts *p = m->private; + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct path root = p->root; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) + err = sb->s_op->show_path(m, mnt); + else + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(mnt)) + seq_printf(m, " shared:%i", mnt->mnt_group_id); + if (IS_MNT_SLAVE(mnt)) { + int master = mnt->mnt_master->mnt_group_id; + int dom = get_dominating_id(mnt, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(mnt)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + if (sb->s_op->show_devname) + err = sb->s_op->show_devname(m, mnt); + else + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + if (err) + goto out; + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt); + seq_putc(m, '\n'); +out: + return err; +} + +const struct seq_operations mountinfo_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_mountinfo, +}; + +static int show_vfsstat(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err = 0; + + /* device */ + if (mnt->mnt_sb->s_op->show_devname) { + seq_puts(m, "device "); + err = mnt->mnt_sb->s_op->show_devname(m, mnt); + } else { + if (mnt->mnt_devname) { + seq_puts(m, "device "); + mangle(m, mnt->mnt_devname); + } else + seq_puts(m, "no device"); + } + + /* mount point */ + seq_puts(m, " mounted on "); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, mnt->mnt_sb); + + /* optional statistics */ + if (mnt->mnt_sb->s_op->show_stats) { + seq_putc(m, ' '); + if (!err) + err = mnt->mnt_sb->s_op->show_stats(m, mnt); + } + + seq_putc(m, '\n'); + return err; +} + +const struct seq_operations mountstats_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_vfsstat, +}; +#endif /* CONFIG_PROC_FS */ + +/** + * may_umount_tree - check if a mount tree is busy + * @mnt: root of mount tree + * + * This is called to check if a tree of mounts has any + * open files, pwds, chroots or sub mounts that are + * busy. 
+ */ +int may_umount_tree(struct vfsmount *mnt) +{ + int actual_refs = 0; + int minimum_refs = 0; + struct vfsmount *p; + + /* write lock needed for mnt_get_count */ + br_write_lock(vfsmount_lock); + for (p = mnt; p; p = next_mnt(p, mnt)) { + actual_refs += mnt_get_count(p); + minimum_refs += 2; + } + br_write_unlock(vfsmount_lock); + + if (actual_refs > minimum_refs) + return 0; + + return 1; +} + +EXPORT_SYMBOL(may_umount_tree); + +/** + * may_umount - check if a mount point is busy + * @mnt: root of mount + * + * This is called to check if a mount point has any + * open files, pwds, chroots or sub mounts. If the + * mount has sub mounts this will return busy + * regardless of whether the sub mounts are busy. + * + * Doesn't take quota and stuff into account. IOW, in some cases it will + * give false negatives. The main reason why it's here is that we need + * a non-destructive way to look for easily umountable filesystems. + */ +int may_umount(struct vfsmount *mnt) +{ + int ret = 1; + down_read(&namespace_sem); + br_write_lock(vfsmount_lock); + if (propagate_mount_busy(mnt, 2)) + ret = 0; + br_write_unlock(vfsmount_lock); + up_read(&namespace_sem); + return ret; +} + +EXPORT_SYMBOL(may_umount); + +void release_mounts(struct list_head *head) +{ + struct vfsmount *mnt; + while (!list_empty(head)) { + mnt = list_first_entry(head, struct vfsmount, mnt_hash); + list_del_init(&mnt->mnt_hash); + if (mnt->mnt_parent != mnt) { + struct dentry *dentry; + struct vfsmount *m; + + br_write_lock(vfsmount_lock); + dentry = mnt->mnt_mountpoint; + m = mnt->mnt_parent; + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + m->mnt_ghosts--; + br_write_unlock(vfsmount_lock); + dput(dentry); + mntput(m); + } + mntput(mnt); + } +} + +/* + * vfsmount lock must be held for write + * namespace_sem must be held for write + */ +void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +{ + LIST_HEAD(tmp_list); + struct vfsmount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, &tmp_list); + + if (propagate) + propagate_umount(&tmp_list); + + list_for_each_entry(p, &tmp_list, mnt_hash) { + list_del_init(&p->mnt_expire); + list_del_init(&p->mnt_list); + __touch_mnt_namespace(p->mnt_ns); + if (p->mnt_ns) + __mnt_make_shortterm(p); + p->mnt_ns = NULL; + list_del_init(&p->mnt_child); + if (p->mnt_parent != p) { + p->mnt_parent->mnt_ghosts++; + dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint); + } + change_mnt_propagation(p, MS_PRIVATE); + } + list_splice(&tmp_list, kill); +} + +static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); + +static int do_umount(struct vfsmount *mnt, int flags) +{ + struct super_block *sb = mnt->mnt_sb; + int retval; + LIST_HEAD(umount_list); + + retval = security_sb_umount(mnt, flags); + if (retval) + return retval; + + /* + * Allow userspace to request a mountpoint be expired rather than + * unmounting unconditionally. Unmount only happens if: + * (1) the mark is already set (the mark is cleared by mntput()) + * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] + */ + if (flags & MNT_EXPIRE) { + if (mnt == current->fs->root.mnt || + flags & (MNT_FORCE | MNT_DETACH)) + return -EINVAL; + + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. 
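+ *
+ * Editor's note: illustrative sketch, not part of this patch; the path is
+ * hypothetical. Userspace drives this expiry protocol with two
+ * umount2() calls:
+ *
+ *	#include <sys/mount.h>
+ *	#include <errno.h>
+ *
+ *	if (umount2("/mnt/auto", MNT_EXPIRE) == -1 && errno == EAGAIN) {
+ *		// the first call only set the expiry mark; a second call
+ *		// unmounts the mount if it has stayed unused in between
+ *		umount2("/mnt/auto", MNT_EXPIRE);
+ *	}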
+ */ + br_write_lock(vfsmount_lock); + if (mnt_get_count(mnt) != 2) { + br_write_unlock(vfsmount_lock); + return -EBUSY; + } + br_write_unlock(vfsmount_lock); + + if (!xchg(&mnt->mnt_expiry_mark, 1)) + return -EAGAIN; + } + + /* + * If we may have to abort operations to get out of this + * mount, and they will themselves hold resources we must + * allow the fs to do things. In the Unix tradition of + * 'Gee thats tricky lets do it in userspace' the umount_begin + * might fail to complete on the first run through as other tasks + * must return, and the like. Thats for the mount program to worry + * about for the moment. + */ + + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + sb->s_op->umount_begin(sb); + } + + /* + * No sense to grab the lock for this test, but test itself looks + * somewhat bogus. Suggestions for better replacement? + * Ho-hum... In principle, we might treat that as umount + switch + * to rootfs. GC would eventually take care of the old vfsmount. + * Actually it makes sense, especially if rootfs would contain a + * /reboot - static binary that would close all descriptors and + * call reboot(9). Then init(8) could umount root and exec /reboot. + */ + if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + /* + * Special case for "unmounting" root ... + * we just try to remount it readonly. + */ + down_write(&sb->s_umount); + if (!(sb->s_flags & MS_RDONLY)) + retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); + up_write(&sb->s_umount); + return retval; + } + + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + event++; + + if (!(flags & MNT_DETACH)) + shrink_submounts(mnt, &umount_list); + + retval = -EBUSY; + if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, 1, &umount_list); + retval = 0; + } + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); + return retval; +} + +/* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. + * + * We now support a flag for forced unmount like the other 'big iron' + * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD + */ + +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +{ + struct path path; + int retval; + int lookup_flags = 0; + + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); + if (retval) + goto out; + retval = -EINVAL; + if (path.dentry != path.mnt->mnt_root) + goto dput_and_out; + if (!check_mnt(path.mnt)) + goto dput_and_out; + + retval = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto dput_and_out; + + retval = do_umount(path.mnt, flags); +dput_and_out: + /* we mustn't call path_put() as that would clear mnt_expiry_mark */ + dput(path.dentry); + mntput_no_expire(path.mnt); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLDUMOUNT + +/* + * The 2.0 compatible umount. No flags. 
+ */ +SYSCALL_DEFINE1(oldumount, char __user *, name) +{ + return sys_umount(name, 0); +} + +#endif + +static int mount_is_safe(struct path *path) +{ + if (capable(CAP_SYS_ADMIN)) + return 0; + return -EPERM; +#ifdef notyet + if (S_ISLNK(path->dentry->d_inode->i_mode)) + return -EPERM; + if (path->dentry->d_inode->i_mode & S_ISVTX) { + if (current_uid() != path->dentry->d_inode->i_uid) + return -EPERM; + } + if (inode_permission(path->dentry->d_inode, MAY_WRITE)) + return -EPERM; + return 0; +#endif +} + +struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, + int flag) +{ + struct vfsmount *res, *p, *q, *r, *s; + struct path path; + + if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) + return NULL; + + res = q = clone_mnt(mnt, dentry, flag); + if (!q) + goto Enomem; + q->mnt_mountpoint = mnt->mnt_mountpoint; + + p = mnt; + list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(r->mnt_mountpoint, dentry)) + continue; + + for (s = r; s; s = next_mnt(s, r)) { + if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + while (p != s->mnt_parent) { + p = p->mnt_parent; + q = q->mnt_parent; + } + p = s; + path.mnt = q; + path.dentry = p->mnt_mountpoint; + q = clone_mnt(p, p->mnt_root, flag); + if (!q) + goto Enomem; + br_write_lock(vfsmount_lock); + list_add_tail(&q->mnt_list, &res->mnt_list); + attach_mnt(q, &path); + br_write_unlock(vfsmount_lock); + } + } + return res; +Enomem: + if (res) { + LIST_HEAD(umount_list); + br_write_lock(vfsmount_lock); + umount_tree(res, 0, &umount_list); + br_write_unlock(vfsmount_lock); + release_mounts(&umount_list); + } + return NULL; +} + +struct vfsmount *collect_mounts(struct path *path) +{ + struct vfsmount *tree; + down_write(&namespace_sem); + tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); + up_write(&namespace_sem); + return tree; +} + +void drop_collected_mounts(struct vfsmount *mnt) +{ + LIST_HEAD(umount_list); + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + umount_tree(mnt, 0, &umount_list); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); +} + +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct vfsmount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &root->mnt_list, mnt_list) { + res = f(mnt, arg); + if (res) + return res; + } + return 0; +} + +static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +{ + struct vfsmount *p; + + for (p = mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct vfsmount *mnt, bool recurse) +{ + struct vfsmount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; +} + +/* + * @source_mnt : mount tree to be attached + * @nd : place the mount tree @source_mnt is attached + * @parent_nd : if non-null, detach the source_mnt from its parent and + * store the parent mount and mountpoint dentry. + * (done when source_mnt is moved) + * + * NOTE: in the table below explains the semantics when a source mount + * of a given type is attached to a destination mount of a given type. 
+ * --------------------------------------------------------------------------- + * | BIND MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (++) | shared (+) | shared(+++)| invalid | + * | | | | | | + * |non-shared| shared (+) | private | slave (*) | invalid | + * *************************************************************************** + * A bind operation clones the source mount and mounts the clone on the + * destination mount. + * + * (++) the cloned mount is propagated to all the mounts in the propagation + * tree of the destination mount and the cloned mount is added to + * the peer group of the source mount. + * (+) the cloned mount is created under the destination mount and is marked + * as shared. The cloned mount is added to the peer group of the source + * mount. + * (+++) the mount is propagated to all the mounts in the propagation tree + * of the destination mount and the cloned mount is made slave + * of the same master as that of the source mount. The cloned mount + * is marked as 'shared and slave'. + * (*) the cloned mount is made a slave of the same master as that of the + * source mount. + * + * --------------------------------------------------------------------------- + * | MOVE MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (+) | shared (+) | shared(+++) | invalid | + * | | | | | | + * |non-shared| shared (+*) | private | slave (*) | unbindable | + * *************************************************************************** + * + * (+) the mount is moved to the destination. And is then propagated to + * all the mounts in the propagation tree of the destination mount. + * (+*) the mount is moved to the destination. + * (+++) the mount is moved to the destination and is then propagated to + * all the mounts belonging to the destination mount's propagation tree. + * the mount is marked as 'shared and slave'. + * (*) the mount continues to be a slave at the new location. + * + * if the source mount is a tree, the operations explained above is + * applied to each mount in the tree. + * Must be called without spinlocks held, since this function can sleep + * in allocations. 
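+ *
+ * Editor's note: illustrative sketch, not part of this patch; the paths
+ * are hypothetical and each call is an independent example, not a
+ * sequence. The source/destination types in the tables above are what
+ * userspace establishes through the propagation flags of mount(2):
+ *
+ *	#include <sys/mount.h>
+ *
+ *	mount(NULL, "/dst", NULL, MS_SHARED, NULL);      // destination: shared
+ *	mount(NULL, "/dst", NULL, MS_PRIVATE, NULL);     // destination: private
+ *	mount(NULL, "/src", NULL, MS_SLAVE, NULL);       // source: slave
+ *	mount(NULL, "/src", NULL, MS_UNBINDABLE, NULL);  // source: unbindable
+ *	mount("/src", "/dst", NULL, MS_BIND, NULL);      // BIND MOUNT OPERATION
+ *	mount("/src", "/dst", NULL, MS_MOVE, NULL);      // MOVE MOUNT OPERATION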
+ */ +static int attach_recursive_mnt(struct vfsmount *source_mnt, + struct path *path, struct path *parent_path) +{ + LIST_HEAD(tree_list); + struct vfsmount *dest_mnt = path->mnt; + struct dentry *dest_dentry = path->dentry; + struct vfsmount *child, *p; + int err; + + if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + } + err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + if (err) + goto out_cleanup_ids; + + br_write_lock(vfsmount_lock); + + if (IS_MNT_SHARED(dest_mnt)) { + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + set_mnt_shared(p); + } + if (parent_path) { + detach_mnt(source_mnt, parent_path); + attach_mnt(source_mnt, path); + touch_mnt_namespace(parent_path->mnt->mnt_ns); + } else { + mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); + commit_tree(source_mnt); + } + + list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { + list_del_init(&child->mnt_hash); + commit_tree(child); + } + br_write_unlock(vfsmount_lock); + + return 0; + + out_cleanup_ids: + if (IS_MNT_SHARED(dest_mnt)) + cleanup_group_ids(source_mnt, NULL); + out: + return err; +} + +static int lock_mount(struct path *path) +{ + struct vfsmount *mnt; +retry: + mutex_lock(&path->dentry->d_inode->i_mutex); + if (unlikely(cant_mount(path->dentry))) { + mutex_unlock(&path->dentry->d_inode->i_mutex); + return -ENOENT; + } + down_write(&namespace_sem); + mnt = lookup_mnt(path); + if (likely(!mnt)) + return 0; + up_write(&namespace_sem); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct path *path) +{ + up_write(&namespace_sem); + mutex_unlock(&path->dentry->d_inode->i_mutex); +} + +static int graft_tree(struct vfsmount *mnt, struct path *path) +{ + if (mnt->mnt_sb->s_flags & MS_NOUSER) + return -EINVAL; + + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(mnt->mnt_root->d_inode->i_mode)) + return -ENOTDIR; + + if (d_unlinked(path->dentry)) + return -ENOENT; + + return attach_recursive_mnt(mnt, path, NULL); +} + +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~(MS_REC | MS_SILENT); + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + +/* + * recursively change the type of the mountpoint. + */ +static int do_change_type(struct path *path, int flag) +{ + struct vfsmount *m, *mnt = path->mnt; + int recurse = flag & MS_REC; + int type; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + + down_write(&namespace_sem); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + + br_write_lock(vfsmount_lock); + for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) + change_mnt_propagation(m, type); + br_write_unlock(vfsmount_lock); + + out_unlock: + up_write(&namespace_sem); + return err; +} + +/* + * do loopback mount. 
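+ *
+ * Editor's note: illustrative sketch, not part of this patch; paths are
+ * hypothetical. This is the kernel side of a bind mount, i.e. of the
+ * userspace requests
+ *
+ *	mount("/olddir", "/newdir", NULL, MS_BIND, NULL);           // one mount
+ *	mount("/olddir", "/newdir", NULL, MS_BIND | MS_REC, NULL);  // whole tree
+ *
+ * where MS_REC selects the recursive copy_tree() case handled below.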
+ */ +static int do_loopback(struct path *path, char *old_name, + int recurse) +{ + LIST_HEAD(umount_list); + struct path old_path; + struct vfsmount *mnt = NULL; + int err = mount_is_safe(path); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (err) + return err; + + err = lock_mount(path); + if (err) + goto out; + + err = -EINVAL; + if (IS_MNT_UNBINDABLE(old_path.mnt)) + goto out2; + + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + goto out2; + + err = -ENOMEM; + if (recurse) + mnt = copy_tree(old_path.mnt, old_path.dentry, 0); + else + mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); + + if (!mnt) + goto out2; + + err = graft_tree(mnt, path); + if (err) { + br_write_lock(vfsmount_lock); + umount_tree(mnt, 0, &umount_list); + br_write_unlock(vfsmount_lock); + } +out2: + unlock_mount(path); + release_mounts(&umount_list); +out: + path_put(&old_path); + return err; +} + +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (readonly_request) + error = mnt_make_readonly(mnt); + else + __mnt_unmake_readonly(mnt); + return error; +} + +/* + * change filesystem flags. dir should be a physical root of filesystem. + * If you've mounted a non-root directory somewhere and want to do remount + * on it - tough luck. + */ +static int do_remount(struct path *path, int flags, int mnt_flags, + void *data) +{ + int err; + struct super_block *sb = path->mnt->mnt_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!check_mnt(path->mnt)) + return -EINVAL; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + err = security_sb_remount(sb, data); + if (err) + return err; + + down_write(&sb->s_umount); + if (flags & MS_BIND) + err = change_mount_flags(path->mnt, flags); + else + err = do_remount_sb(sb, flags, data, 0); + if (!err) { + br_write_lock(vfsmount_lock); + mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; + path->mnt->mnt_flags = mnt_flags; + br_write_unlock(vfsmount_lock); + } + up_write(&sb->s_umount); + if (!err) { + br_write_lock(vfsmount_lock); + touch_mnt_namespace(path->mnt->mnt_ns); + br_write_unlock(vfsmount_lock); + } + return err; +} + +static inline int tree_contains_unbindable(struct vfsmount *mnt) +{ + struct vfsmount *p; + for (p = mnt; p; p = next_mnt(p, mnt)) { + if (IS_MNT_UNBINDABLE(p)) + return 1; + } + return 0; +} + +static int do_move_mount(struct path *path, char *old_name) +{ + struct path old_path, parent_path; + struct vfsmount *p; + int err = 0; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + err = lock_mount(path); + if (err < 0) + goto out; + + err = -EINVAL; + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + goto out1; + + if (d_unlinked(path->dentry)) + goto out1; + + err = -EINVAL; + if (old_path.dentry != old_path.mnt->mnt_root) + goto out1; + + if (old_path.mnt == old_path.mnt->mnt_parent) + goto out1; + + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(old_path.dentry->d_inode->i_mode)) + goto out1; + /* + * Don't move a mount residing in a shared parent. 
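+ *
+ * Editor's note: illustrative sketch, not part of this patch; paths are
+ * hypothetical. The request being handled here is
+ *
+ *	mount("/old/mountpoint", "/new/mountpoint", NULL, MS_MOVE, NULL);
+ *
+ * Roughly speaking, moving out of a shared parent is refused because
+ * copies of this mount may already have propagated to the parent's
+ * peers and could not be moved along with it.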
+ */ + if (old_path.mnt->mnt_parent && + IS_MNT_SHARED(old_path.mnt->mnt_parent)) + goto out1; + /* + * Don't move a mount tree containing unbindable mounts to a destination + * mount which is shared. + */ + if (IS_MNT_SHARED(path->mnt) && + tree_contains_unbindable(old_path.mnt)) + goto out1; + err = -ELOOP; + for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) + if (p == old_path.mnt) + goto out1; + + err = attach_recursive_mnt(old_path.mnt, path, &parent_path); + if (err) + goto out1; + + /* if the mount is moved, it should no longer be expire + * automatically */ + list_del_init(&old_path.mnt->mnt_expire); +out1: + unlock_mount(path); +out: + if (!err) + path_put(&parent_path); + path_put(&old_path); + return err; +} + +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) +{ + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ""; + + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt->mnt_sb->s_subtype) + goto err; + return mnt; + + err: + mntput(mnt); + return ERR_PTR(err); +} + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + put_filesystem(type); + return mnt; +} +EXPORT_SYMBOL_GPL(do_kern_mount); + +/* + * add a mount into a namespace's mount tree + */ +static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) +{ + int err; + + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); + + err = lock_mount(path); + if (err) + return err; + + err = -EINVAL; + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) + goto unlock; + + /* Refuse the same filesystem on the same mount point */ + err = -EBUSY; + if (path->mnt->mnt_sb == newmnt->mnt_sb && + path->mnt->mnt_root == path->dentry) + goto unlock; + + err = -EINVAL; + if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + goto unlock; + + newmnt->mnt_flags = mnt_flags; + err = graft_tree(newmnt, path); + +unlock: + unlock_mount(path); + return err; +} + +/* + * create a new mount for userspace and request it to be added into the + * namespace's tree + */ +static int do_new_mount(struct path *path, char *type, int flags, + int mnt_flags, char *name, void *data) +{ + struct vfsmount *mnt; + int err; + + if (!type) + return -EINVAL; + + /* we need capabilities... 
*/ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + err = do_add_mount(mnt, path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(m) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&m->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&m->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } + mntput(m); + mntput(m); + return err; +} + +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); + +/* + * process a list of expirable mountpoints with the intent of discarding any + * mountpoints that aren't in use and haven't been touched since last we came + * here + */ +void mark_mounts_for_expiry(struct list_head *mounts) +{ + struct vfsmount *mnt, *next; + LIST_HEAD(graveyard); + LIST_HEAD(umounts); + + if (list_empty(mounts)) + return; + + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + /* extract from the expiration list every vfsmount that matches the + * following criteria: + * - only referenced by its parent vfsmount + * - still marked for expiry (marked on the last call here; marks are + * cleared by mntput()) + */ + list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!xchg(&mnt->mnt_expiry_mark, 1) || + propagate_mount_busy(mnt, 1)) + continue; + list_move(&mnt->mnt_expire, &graveyard); + } + while (!list_empty(&graveyard)) { + mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); + touch_mnt_namespace(mnt->mnt_ns); + umount_tree(mnt, 1, &umounts); + } + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + + release_mounts(&umounts); +} + +EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); + +/* + * Ripoff of 'select_parent()' + * + * search the list of submounts for a given mountpoint, and move any + * shrinkable submounts to the 'graveyard' list. + */ +static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +{ + struct vfsmount *this_parent = parent; + struct list_head *next; + int found = 0; + +repeat: + next = this_parent->mnt_mounts.next; +resume: + while (next != &this_parent->mnt_mounts) { + struct list_head *tmp = next; + struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + + next = tmp->next; + if (!(mnt->mnt_flags & MNT_SHRINKABLE)) + continue; + /* + * Descend a level if the d_mounts list is non-empty. + */ + if (!list_empty(&mnt->mnt_mounts)) { + this_parent = mnt; + goto repeat; + } + + if (!propagate_mount_busy(mnt, 1)) { + list_move_tail(&mnt->mnt_expire, graveyard); + found++; + } + } + /* + * All done at this level ... 
ascend and resume the search + */ + if (this_parent != parent) { + next = this_parent->mnt_child.next; + this_parent = this_parent->mnt_parent; + goto resume; + } + return found; +} + +/* + * process a list of expirable mountpoints with the intent of discarding any + * submounts of a specific parent mountpoint + * + * vfsmount_lock must be held for write + */ +static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) +{ + LIST_HEAD(graveyard); + struct vfsmount *m; + + /* extract submounts of 'mountpoint' from the expiration list */ + while (select_submounts(mnt, &graveyard)) { + while (!list_empty(&graveyard)) { + m = list_first_entry(&graveyard, struct vfsmount, + mnt_expire); + touch_mnt_namespace(m->mnt_ns); + umount_tree(m, 1, umounts); + } + } +} + +/* + * Some copy_from_user() implementations do not return the exact number of + * bytes remaining to copy on a fault. But copy_mount_options() requires that. + * Note that this function differs from copy_from_user() in that it will oops + * on bad values of `to', rather than returning a short copy. + */ +static long exact_copy_from_user(void *to, const void __user * from, + unsigned long n) +{ + char *t = to; + const char __user *f = from; + char c; + + if (!access_ok(VERIFY_READ, from, n)) + return n; + + while (n) { + if (__get_user(c, f)) { + memset(t, 0, n); + break; + } + *t++ = c; + f++; + n--; + } + return n; +} + +int copy_mount_options(const void __user * data, unsigned long *where) +{ + int i; + unsigned long page; + unsigned long size; + + *where = 0; + if (!data) + return 0; + + if (!(page = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + /* We only care that *some* data at the address the user + * gave us is valid. Just in case, we'll zero + * the remainder of the page. + */ + /* copy_from_user cannot cross TASK_SIZE ! */ + size = TASK_SIZE - (unsigned long)data; + if (size > PAGE_SIZE) + size = PAGE_SIZE; + + i = size - exact_copy_from_user((void *)page, data, size); + if (!i) { + free_page(page); + return -EFAULT; + } + if (i != PAGE_SIZE) + memset((char *)page + i, 0, PAGE_SIZE - i); + *where = page; + return 0; +} + +int copy_mount_string(const void __user *data, char **where) +{ + char *tmp; + + if (!data) { + *where = NULL; + return 0; + } + + tmp = strndup_user(data, PAGE_SIZE); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + *where = tmp; + return 0; +} + +/* + * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to + * be given to the mount() call (ie: read-only, no-dev, no-suid etc). + * + * data is a (void *) that can point to any structure up to + * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent + * information (or be NULL). + * + * Pre-0.97 versions of mount() didn't have a flags word. + * When the flags word was introduced its top half was required + * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. + * Therefore, if this magic number is present, it carries no information + * and must be discarded. + */ +long do_mount(char *dev_name, char *dir_name, char *type_page, + unsigned long flags, void *data_page) +{ + struct path path; + int retval = 0; + int mnt_flags = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + /* Basic sanity checks */ + + if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) + return -EINVAL; + + if (data_page) + ((char *)data_page)[PAGE_SIZE - 1] = 0; + + /* ... 
and get the mountpoint */ + retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + if (retval) + return retval; + + retval = security_sb_mount(dev_name, &path, + type_page, flags, data_page); + if (retval) + goto dput_out; + + /* Default to relatime unless overriden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + + /* Separate the per-mountpoint flags */ + if (flags & MS_NOSUID) + mnt_flags |= MNT_NOSUID; + if (flags & MS_NODEV) + mnt_flags |= MNT_NODEV; + if (flags & MS_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (flags & MS_NOATIME) + mnt_flags |= MNT_NOATIME; + if (flags & MS_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); + + if (flags & MS_REMOUNT) + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, + data_page); + else if (flags & MS_BIND) + retval = do_loopback(&path, dev_name, flags & MS_REC); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + retval = do_change_type(&path, flags); + else if (flags & MS_MOVE) + retval = do_move_mount(&path, dev_name); + else + retval = do_new_mount(&path, type_page, flags, mnt_flags, + dev_name, data_page); +dput_out: + path_put(&path); + return retval; +} + +static struct mnt_namespace *alloc_mnt_ns(void) +{ + struct mnt_namespace *new_ns; + + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + atomic_set(&new_ns->count, 1); + new_ns->root = NULL; + INIT_LIST_HEAD(&new_ns->list); + init_waitqueue_head(&new_ns->poll); + new_ns->event = 0; + return new_ns; +} + +void mnt_make_longterm(struct vfsmount *mnt) +{ + __mnt_make_longterm(mnt); +} + +void mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) + return; + br_write_lock(vfsmount_lock); + atomic_dec(&mnt->mnt_longterm); + br_write_unlock(vfsmount_lock); +#endif +} + +/* + * Allocate a new namespace structure and populate it with contents + * copied from the namespace of the passed in task structure. + */ +static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, + struct fs_struct *fs) +{ + struct mnt_namespace *new_ns; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *p, *q; + + new_ns = alloc_mnt_ns(); + if (IS_ERR(new_ns)) + return new_ns; + + down_write(&namespace_sem); + /* First pass: copy the tree topology */ + new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, + CL_COPY_ALL | CL_EXPIRE); + if (!new_ns->root) { + up_write(&namespace_sem); + kfree(new_ns); + return ERR_PTR(-ENOMEM); + } + br_write_lock(vfsmount_lock); + list_add_tail(&new_ns->list, &new_ns->root->mnt_list); + br_write_unlock(vfsmount_lock); + + /* + * Second pass: switch the tsk->fs->* elements and mark new vfsmounts + * as belonging to new namespace. We have already acquired a private + * fs_struct, so tsk->fs->lock is not needed. 
+ */ + p = mnt_ns->root; + q = new_ns->root; + while (p) { + q->mnt_ns = new_ns; + __mnt_make_longterm(q); + if (fs) { + if (p == fs->root.mnt) { + fs->root.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); + rootmnt = p; + } + if (p == fs->pwd.mnt) { + fs->pwd.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); + pwdmnt = p; + } + } + p = next_mnt(p, mnt_ns->root); + q = next_mnt(q, new_ns->root); + } + up_write(&namespace_sem); + + if (rootmnt) + mntput(rootmnt); + if (pwdmnt) + mntput(pwdmnt); + + return new_ns; +} + +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct fs_struct *new_fs) +{ + struct mnt_namespace *new_ns; + + BUG_ON(!ns); + get_mnt_ns(ns); + + if (!(flags & CLONE_NEWNS)) + return ns; + + new_ns = dup_mnt_ns(ns, new_fs); + + put_mnt_ns(ns); + return new_ns; +} + +/** + * create_mnt_ns - creates a private namespace and adds a root filesystem + * @mnt: pointer to the new root filesystem mountpoint + */ +struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +{ + struct mnt_namespace *new_ns; + + new_ns = alloc_mnt_ns(); + if (!IS_ERR(new_ns)) { + mnt->mnt_ns = new_ns; + __mnt_make_longterm(mnt); + new_ns->root = mnt; + list_add(&new_ns->list, &new_ns->root->mnt_list); + } + return new_ns; +} +EXPORT_SYMBOL(create_mnt_ns); + +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + int ret; + char *kernel_type; + char *kernel_dir; + char *kernel_dev; + unsigned long data_page; + + ret = copy_mount_string(type, &kernel_type); + if (ret < 0) + goto out_type; + + kernel_dir = getname(dir_name); + if (IS_ERR(kernel_dir)) { + ret = PTR_ERR(kernel_dir); + goto out_dir; + } + + ret = copy_mount_string(dev_name, &kernel_dev); + if (ret < 0) + goto out_dev; + + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; + + ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, + (void *) data_page); + + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + putname(kernel_dir); +out_dir: + kfree(kernel_type); +out_type: + return ret; +} + +/* + * pivot_root Semantics: + * Moves the root file system of the current process to the directory put_old, + * makes new_root as the new root file system of the current process, and sets + * root/cwd of all processes which had them on the current root to new_root. + * + * Restrictions: + * The new_root and put_old must be directories, and must not be on the + * same file system as the current process root. The put_old must be + * underneath new_root, i.e. adding a non-zero number of /.. to the string + * pointed to by put_old must yield the same directory as new_root. No other + * file system may be mounted on put_old. After all, new_root is a mountpoint. + * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * in this situation. + * + * Notes: + * - we don't move root/cwd if they are not at the root (reason: if something + * cared enough to change them, it's probably wrong to force them elsewhere) + * - it's okay to pick a root that isn't the root of a file system, e.g. + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root + * first. 
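+ *
+ * Editor's note: illustrative sketch, not part of this patch; the paths
+ * are hypothetical, ./oldroot is assumed to exist, and error handling is
+ * omitted. A minimal caller that satisfies the restrictions above:
+ *
+ *	#include <unistd.h>
+ *	#include <sys/mount.h>
+ *	#include <sys/syscall.h>
+ *
+ *	// /mnt/newroot must itself be a mount point, e.g. after
+ *	//   mount("/mnt/newroot", "/mnt/newroot", NULL, MS_BIND, NULL);
+ *	chdir("/mnt/newroot");
+ *	syscall(SYS_pivot_root, ".", "./oldroot");  // put_old underneath new_root
+ *	chroot(".");
+ *	umount2("/oldroot", MNT_DETACH);
+ *
+ * (glibc provides no pivot_root() wrapper, hence the raw syscall()).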
+ */ +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) +{ + struct vfsmount *tmp; + struct path new, old, parent_path, root_parent, root; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = user_path_dir(new_root, &new); + if (error) + goto out0; + + error = user_path_dir(put_old, &old); + if (error) + goto out1; + + error = security_sb_pivotroot(&old, &new); + if (error) + goto out2; + + get_fs_root(current->fs, &root); + error = lock_mount(&old); + if (error) + goto out3; + + error = -EINVAL; + if (IS_MNT_SHARED(old.mnt) || + IS_MNT_SHARED(new.mnt->mnt_parent) || + IS_MNT_SHARED(root.mnt->mnt_parent)) + goto out4; + if (!check_mnt(root.mnt) || !check_mnt(new.mnt)) + goto out4; + error = -ENOENT; + if (d_unlinked(new.dentry)) + goto out4; + if (d_unlinked(old.dentry)) + goto out4; + error = -EBUSY; + if (new.mnt == root.mnt || + old.mnt == root.mnt) + goto out4; /* loop, on the same file system */ + error = -EINVAL; + if (root.mnt->mnt_root != root.dentry) + goto out4; /* not a mountpoint */ + if (root.mnt->mnt_parent == root.mnt) + goto out4; /* not attached */ + if (new.mnt->mnt_root != new.dentry) + goto out4; /* not a mountpoint */ + if (new.mnt->mnt_parent == new.mnt) + goto out4; /* not attached */ + /* make sure we can reach put_old from new_root */ + tmp = old.mnt; + if (tmp != new.mnt) { + for (;;) { + if (tmp->mnt_parent == tmp) + goto out4; /* already mounted on put_old */ + if (tmp->mnt_parent == new.mnt) + break; + tmp = tmp->mnt_parent; + } + if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) + goto out4; + } else if (!is_subdir(old.dentry, new.dentry)) + goto out4; + br_write_lock(vfsmount_lock); + detach_mnt(new.mnt, &parent_path); + detach_mnt(root.mnt, &root_parent); + /* mount old root on put_old */ + attach_mnt(root.mnt, &old); + /* mount new_root on / */ + attach_mnt(new.mnt, &root_parent); + touch_mnt_namespace(current->nsproxy->mnt_ns); + br_write_unlock(vfsmount_lock); + chroot_fs_refs(&root, &new); + error = 0; +out4: + unlock_mount(&old); + if (!error) { + path_put(&root_parent); + path_put(&parent_path); + } +out3: + path_put(&root); +out2: + path_put(&old); +out1: + path_put(&new); +out0: + return error; +} + +static void __init init_mount_tree(void) +{ + struct vfsmount *mnt; + struct mnt_namespace *ns; + struct path root; + + mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + panic("Can't allocate initial namespace"); + + init_task.nsproxy->mnt_ns = ns; + get_mnt_ns(ns); + + root.mnt = ns->root; + root.dentry = ns->root->mnt_root; + + set_fs_pwd(current->fs, &root); + set_fs_root(current->fs, &root); +} + +void __init mnt_init(void) +{ + unsigned u; + int err; + + init_rwsem(&namespace_sem); + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + + if (!mount_hashtable) + panic("Failed to allocate mount hash table\n"); + + printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); + + for (u = 0; u < HASH_SIZE; u++) + INIT_LIST_HEAD(&mount_hashtable[u]); + + br_lock_init(vfsmount_lock); + + err = sysfs_init(); + if (err) + printk(KERN_WARNING "%s: sysfs_init error: %d\n", + __func__, err); + fs_kobj = kobject_create_and_add("fs", NULL); + if (!fs_kobj) + printk(KERN_WARNING "%s: kobj create error\n", __func__); + init_rootfs(); + init_mount_tree(); +} + 
+void put_mnt_ns(struct mnt_namespace *ns) +{ + LIST_HEAD(umount_list); + + if (!atomic_dec_and_test(&ns->count)) + return; + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + umount_tree(ns->root, 0, &umount_list); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); + kfree(ns); +} +EXPORT_SYMBOL(put_mnt_ns); + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); +} +EXPORT_SYMBOL_GPL(kern_mount_data); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(mnt); +} diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig new file mode 100644 index 00000000..c931cf22 --- /dev/null +++ b/fs/ncpfs/Kconfig @@ -0,0 +1,108 @@ +# +# NCP Filesystem configuration +# +config NCP_FS + tristate "NCP file system support (to mount NetWare volumes)" + depends on IPX!=n || INET + help + NCP (NetWare Core Protocol) is a protocol that runs over IPX and is + used by Novell NetWare clients to talk to file servers. It is to + IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you + to mount NetWare file server volumes and to access them just like + any other Unix directory. For details, please read the file + in the kernel source and + the IPX-HOWTO from . + + You do not have to say Y here if you want your Linux box to act as a + file *server* for Novell NetWare clients. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . + + To compile this as a module, choose M here: the module will be called + ncpfs. Say N unless you are connected to a Novell network. + +config NCPFS_PACKET_SIGNING + bool "Packet signatures" + depends on NCP_FS + help + NCP allows packets to be signed for stronger security. If you want + security, say Y. Normal users can leave it off. To be able to use + packet signing you must use ncpfs > 2.0.12. + +config NCPFS_IOCTL_LOCKING + bool "Proprietary file locking" + depends on NCP_FS + help + Allows locking of records on remote volumes. Say N unless you have + special applications which are able to utilize this locking scheme. + +config NCPFS_STRONG + bool "Clear remove/delete inhibit when needed" + depends on NCP_FS + help + Allows manipulation of files flagged as Delete or Rename Inhibit. + To use this feature you must mount volumes with the ncpmount + parameter "-s" (ncpfs-2.0.12 and newer). Say Y unless you are not + mounting volumes with -f 444. + +config NCPFS_NFS_NS + bool "Use NFS namespace if available" + depends on NCP_FS + help + Allows you to utilize NFS namespace on NetWare servers. It brings + you case sensitive filenames. Say Y. You can disable it at + mount-time with the `-N nfs' parameter of ncpmount. + +config NCPFS_OS2_NS + bool "Use LONG (OS/2) namespace if available" + depends on NCP_FS + help + Allows you to utilize OS2/LONG namespace on NetWare servers. + Filenames in this namespace are limited to 255 characters, they are + case insensitive, and case in names is preserved. Say Y. You can + disable it at mount time with the -N os2 parameter of ncpmount. + +config NCPFS_SMALLDOS + bool "Lowercase DOS filenames" + depends on NCP_FS + ---help--- + If you say Y here, every filename on a NetWare server volume using + the OS2/LONG namespace and created under DOS or on a volume using + DOS namespace will be converted to lowercase characters. + Saying N here will give you these filenames in uppercase. 
+ + This is only a cosmetic option since the OS2/LONG namespace is case + insensitive. The only major reason for this option is backward + compatibility when moving from DOS to OS2/LONG namespace support. + Long filenames (created by Win95) will not be affected. + + This option does not solve the problem that filenames appear + differently under Linux and under Windows, since Windows does an + additional conversions on the client side. You can achieve similar + effects by saying Y to "Allow using of Native Language Support" + below. + +config NCPFS_NLS + bool "Use Native Language Support" + depends on NCP_FS + select NLS + help + Allows you to use codepages and I/O charsets for file name + translation between the server file system and input/output. This + may be useful, if you want to access the server with other operating + systems, e.g. Windows 95. See also NLS for more Information. + + To select codepages and I/O charsets use ncpfs-2.2.0.13 or newer. + +config NCPFS_EXTRAS + bool "Enable symbolic links and execute flags" + depends on NCP_FS + help + This enables the use of symbolic links and an execute permission + bit on NCPFS. The file server need not have long name space or NFS + name space loaded for these to work. + + To use the new attributes, it is recommended to use the flags + '-f 600 -d 755' on the ncpmount command line. + diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile new file mode 100644 index 00000000..c66af563 --- /dev/null +++ b/fs/ncpfs/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the linux ncp filesystem routines. +# + +obj-$(CONFIG_NCP_FS) += ncpfs.o + +ncpfs-y := dir.o file.o inode.o ioctl.o mmap.o ncplib_kernel.o sock.o \ + ncpsign_kernel.o getopt.o + +ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o +ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o + +# If you want debugging output, please uncomment the following line +# ccflags-y := -DDEBUG_NCP=1 + +CFLAGS_ncplib_kernel.o := -finline-functions diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c new file mode 100644 index 00000000..9c51f621 --- /dev/null +++ b/fs/ncpfs/dir.c @@ -0,0 +1,1280 @@ +/* + * dir.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. 
Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * Modified 1999 Wolfram Pienkoss for directory caching + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +static void ncp_read_volume_list(struct file *, void *, filldir_t, + struct ncp_cache_control *); +static void ncp_do_readdir(struct file *, void *, filldir_t, + struct ncp_cache_control *); + +static int ncp_readdir(struct file *, void *, filldir_t); + +static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *); +static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); +static int ncp_unlink(struct inode *, struct dentry *); +static int ncp_mkdir(struct inode *, struct dentry *, int); +static int ncp_rmdir(struct inode *, struct dentry *); +static int ncp_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int ncp_mknod(struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev); +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern int ncp_symlink(struct inode *, struct dentry *, const char *); +#else +#define ncp_symlink NULL +#endif + +const struct file_operations ncp_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ncp_readdir, + .unlocked_ioctl = ncp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ncp_compat_ioctl, +#endif +}; + +const struct inode_operations ncp_dir_inode_operations = +{ + .create = ncp_create, + .lookup = ncp_lookup, + .unlink = ncp_unlink, + .symlink = ncp_symlink, + .mkdir = ncp_mkdir, + .rmdir = ncp_rmdir, + .mknod = ncp_mknod, + .rename = ncp_rename, + .setattr = ncp_notify_change, +}; + +/* + * Dentry operations routines + */ +static int ncp_lookup_validate(struct dentry *, struct nameidata *); +static int ncp_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); +static int ncp_delete_dentry(const struct dentry *); + +const struct dentry_operations ncp_dentry_operations = +{ + .d_revalidate = ncp_lookup_validate, + .d_hash = ncp_hash_dentry, + .d_compare = ncp_compare_dentry, + .d_delete = ncp_delete_dentry, +}; + +#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) + +static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) +{ +#ifdef CONFIG_NCPFS_SMALLDOS + int ns = ncp_namespace(i); + + if ((ns == NW_NS_DOS) +#ifdef CONFIG_NCPFS_OS2_NS + || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS)) +#endif /* CONFIG_NCPFS_OS2_NS */ + ) + return 0; +#endif /* CONFIG_NCPFS_SMALLDOS */ + return 1; +} + +#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) + +static inline int ncp_case_sensitive(const struct inode *i) +{ +#ifdef CONFIG_NCPFS_NFS_NS + return ncp_namespace(i) == NW_NS_NFS; +#else + return 0; +#endif /* CONFIG_NCPFS_NFS_NS */ +} + +/* + * Note: leave the hash unchanged if the directory + * is case-sensitive. 
+ */ +static int +ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) +{ + if (!ncp_case_sensitive(inode)) { + struct super_block *sb = dentry->d_sb; + struct nls_table *t; + unsigned long hash; + int i; + + t = NCP_IO_TABLE(sb); + hash = init_name_hash(); + for (i=0; ilen ; i++) + hash = partial_name_hash(ncp_tolower(t, this->name[i]), + hash); + this->hash = end_name_hash(hash); + } + return 0; +} + +static int +ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + if (len != name->len) + return 1; + + if (ncp_case_sensitive(pinode)) + return strncmp(str, name->name, len); + + return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + * Closing files can be safely postponed until iput() - it's done there anyway. + */ +static int +ncp_delete_dentry(const struct dentry * dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode) { + if (is_bad_inode(inode)) + return 1; + } else + { + /* N.B. Unhash negative dentries? */ + } + return 0; +} + +static inline int +ncp_single_volume(struct ncp_server *server) +{ + return (server->m.mounted_vol[0] != '\0'); +} + +static inline int ncp_is_server_root(struct inode *inode) +{ + return (!ncp_single_volume(NCP_SERVER(inode)) && + inode == inode->i_sb->s_root->d_inode); +} + + +/* + * This is the callback when the dcache has a lookup hit. + */ + + +#ifdef CONFIG_NCPFS_STRONG +/* try to delete a readonly file (NW R bit set) */ + +static int +ncp_force_unlink(struct inode *dir, struct dentry* dentry) +{ + int res=0x9c,res2; + struct nw_modify_dos_info info; + __le32 old_nwattr; + struct inode *inode; + + memset(&info, 0, sizeof(info)); + + /* remove the Read-Only flag on the NW server */ + inode = dentry->d_inode; + + old_nwattr = NCP_FINFO(inode)->nwattr; + info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); + if (res2) + goto leave_me; + + /* now try again the delete operation */ + res = ncp_del_file_or_subdir2(NCP_SERVER(dir), dentry); + + if (res) /* delete failed, set R bit again */ + { + info.attributes = old_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); + if (res2) + goto leave_me; + } +leave_me: + return(res); +} +#endif /* CONFIG_NCPFS_STRONG */ + +#ifdef CONFIG_NCPFS_STRONG +static int +ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_name, + struct inode *new_dir, struct dentry* new_dentry, char *_new_name) +{ + struct nw_modify_dos_info info; + int res=0x90,res2; + struct inode *old_inode = old_dentry->d_inode; + __le32 old_nwattr = NCP_FINFO(old_inode)->nwattr; + __le32 new_nwattr = 0; /* shut compiler warning */ + int old_nwattr_changed = 0; + int new_nwattr_changed = 0; + + memset(&info, 0, sizeof(info)); + + /* remove the Read-Only flag on the NW server */ + + info.attributes = old_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); + if (!res2) + old_nwattr_changed = 1; + if (new_dentry && new_dentry->d_inode) { + new_nwattr = NCP_FINFO(new_dentry->d_inode)->nwattr; + info.attributes 
= new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); + if (!res2) + new_nwattr_changed = 1; + } + /* now try again the rename operation */ + /* but only if something really happened */ + if (new_nwattr_changed || old_nwattr_changed) { + res = ncp_ren_or_mov_file_or_subdir(NCP_SERVER(old_dir), + old_dir, _old_name, + new_dir, _new_name); + } + if (res) + goto leave_me; + /* file was successfully renamed, so: + do not set attributes on old file - it no longer exists + copy attributes from old file to new */ + new_nwattr_changed = old_nwattr_changed; + new_nwattr = old_nwattr; + old_nwattr_changed = 0; + +leave_me:; + if (old_nwattr_changed) { + info.attributes = old_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); + /* ignore errors */ + } + if (new_nwattr_changed) { + info.attributes = new_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); + /* ignore errors */ + } + return(res); +} +#endif /* CONFIG_NCPFS_STRONG */ + + +static int +ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) +{ + struct ncp_server *server; + struct dentry *parent; + struct inode *dir; + struct ncp_entry_info finfo; + int res, val = 0, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + if (dentry == dentry->d_sb->s_root) + return 1; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + dir = parent->d_inode; + + if (!dentry->d_inode) + goto finished; + + server = NCP_SERVER(dir); + + /* + * Inspired by smbfs: + * The default validation is based on dentry age: + * We set the max age at mount time. (But each + * successful server lookup renews the timestamp.) + */ + val = NCP_TEST_AGE(server, dentry); + if (val) + goto finished; + + DDPRINTK("ncp_lookup_validate: %s/%s not valid, age=%ld, server lookup\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + NCP_GET_AGE(dentry)); + + len = sizeof(__name); + if (ncp_is_server_root(dir)) { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, 1); + if (!res) { + res = ncp_lookup_volume(server, __name, &(finfo.i)); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + } + } else { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (!res) + res = ncp_obtain_info(server, dir, __name, &(finfo.i)); + } + finfo.volume = finfo.i.volNumber; + DDPRINTK("ncp_lookup_validate: looked for %s/%s, res=%d\n", + dentry->d_parent->d_name.name, __name, res); + /* + * If we didn't find it, or if it has a different dirEntNum to + * what we remember, it's not valid any more. 
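+ * On a successful lookup the cached inode is refreshed from the reply + * (ncp_update_inode2); the dentry age is only reset (ncp_new_dentry) when + * the dirEntNum still matches.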
+ */ + if (!res) { + struct inode *inode = dentry->d_inode; + + mutex_lock(&inode->i_mutex); + if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { + ncp_new_dentry(dentry); + val=1; + } else + DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); + + ncp_update_inode2(inode, &finfo); + mutex_unlock(&inode->i_mutex); + } + +finished: + DDPRINTK("ncp_lookup_validate: result=%d\n", val); + dput(parent); + return val; +} + +static struct dentry * +ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) +{ + struct dentry *dent = dentry; + struct list_head *next; + + if (d_validate(dent, parent)) { + if (dent->d_name.len <= NCP_MAXPATHLEN && + (unsigned long)dent->d_fsdata == fpos) { + if (!dent->d_inode) { + dput(dent); + dent = NULL; + } + return dent; + } + dput(dent); + } + + /* If a pointer is invalid, we search the dentry. */ + spin_lock(&parent->d_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dent = list_entry(next, struct dentry, d_u.d_child); + if ((unsigned long)dent->d_fsdata == fpos) { + if (dent->d_inode) + dget(dent); + else + dent = NULL; + spin_unlock(&parent->d_lock); + goto out; + } + next = next->next; + } + spin_unlock(&parent->d_lock); + return NULL; + +out: + return dent; +} + +static time_t ncp_obtain_mtime(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct ncp_server *server = NCP_SERVER(inode); + struct nw_info_struct i; + + if (!ncp_conn_valid(server) || ncp_is_server_root(inode)) + return 0; + + if (ncp_obtain_info(server, inode, NULL, &i)) + return 0; + + return ncp_date_dos2unix(i.modifyTime, i.modifyDate); +} + +static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + struct ncp_server *server = NCP_SERVER(inode); + union ncp_dir_cache *cache = NULL; + struct ncp_cache_control ctl; + int result, mtime_valid = 0; + time_t mtime = 0; + + ctl.page = NULL; + ctl.cache = NULL; + + DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (int) filp->f_pos); + + result = -EIO; + /* Do not generate '.' and '..' when server is dead. 
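+ In that case readdir fails with -EIO without emitting any entries.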
*/ + if (!ncp_conn_valid(server)) + goto out; + + result = 0; + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + goto out; + filp->f_pos = 1; + } + if (filp->f_pos == 1) { + if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR)) + goto out; + filp->f_pos = 2; + } + + page = grab_cache_page(&inode->i_data, 0); + if (!page) + goto read_really; + + ctl.cache = cache = kmap(page); + ctl.head = cache->head; + + if (!PageUptodate(page) || !ctl.head.eof) + goto init_cache; + + if (filp->f_pos == 2) { + if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) + goto init_cache; + + mtime = ncp_obtain_mtime(dentry); + mtime_valid = 1; + if ((!mtime) || (mtime != ctl.head.mtime)) + goto init_cache; + } + + if (filp->f_pos > ctl.head.end) + goto finished; + + ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); + ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; + ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; + + for (;;) { + if (ctl.ofs != 0) { + ctl.page = find_lock_page(&inode->i_data, ctl.ofs); + if (!ctl.page) + goto invalid_cache; + ctl.cache = kmap(ctl.page); + if (!PageUptodate(ctl.page)) + goto invalid_cache; + } + while (ctl.idx < NCP_DIRCACHE_SIZE) { + struct dentry *dent; + int res; + + dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], + dentry, filp->f_pos); + if (!dent) + goto invalid_cache; + res = filldir(dirent, dent->d_name.name, + dent->d_name.len, filp->f_pos, + dent->d_inode->i_ino, DT_UNKNOWN); + dput(dent); + if (res) + goto finished; + filp->f_pos += 1; + ctl.idx += 1; + if (filp->f_pos > ctl.head.end) + goto finished; + } + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.idx = 0; + ctl.ofs += 1; + } +invalid_cache: + if (ctl.page) { + kunmap(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.cache = cache; +init_cache: + ncp_invalidate_dircache_entries(dentry); + if (!mtime_valid) { + mtime = ncp_obtain_mtime(dentry); + mtime_valid = 1; + } + ctl.head.mtime = mtime; + ctl.head.time = jiffies; + ctl.head.eof = 0; + ctl.fpos = 2; + ctl.ofs = 0; + ctl.idx = NCP_DIRCACHE_START; + ctl.filled = 0; + ctl.valid = 1; +read_really: + if (ncp_is_server_root(inode)) { + ncp_read_volume_list(filp, dirent, filldir, &ctl); + } else { + ncp_do_readdir(filp, dirent, filldir, &ctl); + } + ctl.head.end = ctl.fpos - 1; + ctl.head.eof = ctl.valid; +finished: + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + if (page) { + cache->head = ctl.head; + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +out: + return result; +} + +static int +ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, + int inval_childs) +{ + struct dentry *newdent, *dentry = filp->f_path.dentry; + struct inode *dir = dentry->d_inode; + struct ncp_cache_control ctl = *ctrl; + struct qstr qname; + int valid = 0; + int hashed = 0; + ino_t ino = 0; + __u8 __name[NCP_MAXPATHLEN + 1]; + + qname.len = sizeof(__name); + if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len, + entry->i.entryName, entry->i.nameLen, + !ncp_preserve_entry_case(dir, entry->i.NSCreator))) + return 1; /* I'm not sure */ + + qname.name = __name; + qname.hash = full_name_hash(qname.name, qname.len); + + if (dentry->d_op && dentry->d_op->d_hash) + if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) 
!= 0) + goto end_advance; + + newdent = d_lookup(dentry, &qname); + + if (!newdent) { + newdent = d_alloc(dentry, &qname); + if (!newdent) + goto end_advance; + } else { + hashed = 1; + + /* If case sensitivity changed for this volume, all entries below this one + should be thrown away. This entry itself is not affected, as its case + sensitivity is controlled by its own parent. */ + if (inval_childs) + shrink_dcache_parent(newdent); + + /* + * NetWare's OS2 namespace is case preserving yet case + * insensitive. So we update dentry's name as received from + * server. Parent dir's i_mutex is locked because we're in + * readdir. + */ + dentry_update_name_case(newdent, &qname); + } + + if (!newdent->d_inode) { + struct inode *inode; + + entry->opened = 0; + entry->ino = iunique(dir->i_sb, 2); + inode = ncp_iget(dir->i_sb, entry); + if (inode) { + d_instantiate(newdent, inode); + if (!hashed) + d_rehash(newdent); + } + } else { + struct inode *inode = newdent->d_inode; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ncp_update_inode2(inode, entry); + mutex_unlock(&inode->i_mutex); + } + + if (newdent->d_inode) { + ino = newdent->d_inode->i_ino; + newdent->d_fsdata = (void *) ctl.fpos; + ncp_new_dentry(newdent); + } + + if (ctl.idx >= NCP_DIRCACHE_SIZE) { + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + ctl.cache = NULL; + ctl.idx -= NCP_DIRCACHE_SIZE; + ctl.ofs += 1; + ctl.page = grab_cache_page(&dir->i_data, ctl.ofs); + if (ctl.page) + ctl.cache = kmap(ctl.page); + } + if (ctl.cache) { + ctl.cache->dentry[ctl.idx] = newdent; + valid = 1; + } + dput(newdent); +end_advance: + if (!valid) + ctl.valid = 0; + if (!ctl.filled && (ctl.fpos == filp->f_pos)) { + if (!ino) + ino = find_inode_number(dentry, &qname); + if (!ino) + ino = iunique(dir->i_sb, 2); + ctl.filled = filldir(dirent, qname.name, qname.len, + filp->f_pos, ino, DT_UNKNOWN); + if (!ctl.filled) + filp->f_pos += 1; + } + ctl.fpos += 1; + ctl.idx += 1; + *ctrl = ctl; + return (ctl.valid || !ctl.filled); +} + +static void +ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, + struct ncp_cache_control *ctl) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct ncp_server *server = NCP_SERVER(inode); + struct ncp_volume_info info; + struct ncp_entry_info entry; + int i; + + DPRINTK("ncp_read_volume_list: pos=%ld\n", + (unsigned long) filp->f_pos); + + for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { + int inval_dentry; + + if (ncp_get_volume_info_with_number(server, i, &info) != 0) + return; + if (!strlen(info.volume_name)) + continue; + + DPRINTK("ncp_read_volume_list: found vol: %s\n", + info.volume_name); + + if (ncp_lookup_volume(server, info.volume_name, + &entry.i)) { + DPRINTK("ncpfs: could not lookup vol %s\n", + info.volume_name); + continue; + } + inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); + entry.volume = entry.i.volNumber; + if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) + return; + } +} + +static void +ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, + struct ncp_cache_control *ctl) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *dir = dentry->d_inode; + struct ncp_server *server = NCP_SERVER(dir); + struct nw_search_sequence seq; + struct ncp_entry_info entry; + int err; + void* buf; + int more; + size_t bufsize; + + DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", + 
dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) filp->f_pos); + PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", + dentry->d_name.name, NCP_FINFO(dir)->volNumber, + NCP_FINFO(dir)->dirEntNum); + + err = ncp_initialize_search(server, dir, &seq); + if (err) { + DPRINTK("ncp_do_readdir: init failed, err=%d\n", err); + return; + } + /* We MUST NOT use server->buffer_size handshaked with server if we are + using UDP, as for UDP server uses max. buffer size determined by + MTU, and for TCP server uses hardwired value 65KB (== 66560 bytes). + So we use 128KB, just to be sure, as there is no way how to know + this value in advance. */ + bufsize = 131072; + buf = vmalloc(bufsize); + if (!buf) + return; + do { + int cnt; + char* rpl; + size_t rpls; + + err = ncp_search_for_fileset(server, &seq, &more, &cnt, buf, bufsize, &rpl, &rpls); + if (err) /* Error */ + break; + if (!cnt) /* prevent endless loop */ + break; + while (cnt--) { + size_t onerpl; + + if (rpls < offsetof(struct nw_info_struct, entryName)) + break; /* short packet */ + ncp_extract_file_info(rpl, &entry.i); + onerpl = offsetof(struct nw_info_struct, entryName) + entry.i.nameLen; + if (rpls < onerpl) + break; /* short packet */ + (void)ncp_obtain_nfs_info(server, &entry.i); + rpl += onerpl; + rpls -= onerpl; + entry.volume = entry.i.volNumber; + if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) + break; + } + } while (more); + vfree(buf); + return; +} + +int ncp_conn_logged_in(struct super_block *sb) +{ + struct ncp_server* server = NCP_SBP(sb); + int result; + + if (ncp_single_volume(server)) { + int len; + struct dentry* dent; + __u32 volNumber; + __le32 dirEntNum; + __le32 DosDirNum; + __u8 __name[NCP_MAXPATHLEN + 1]; + + len = sizeof(__name); + result = ncp_io2vol(server, __name, &len, server->m.mounted_vol, + strlen(server->m.mounted_vol), 1); + if (result) + goto out; + result = -ENOENT; + if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { + PPRINTK("ncp_conn_logged_in: %s not found\n", + server->m.mounted_vol); + goto out; + } + dent = sb->s_root; + if (dent) { + struct inode* ino = dent->d_inode; + if (ino) { + ncp_update_known_namespace(server, volNumber, NULL); + NCP_FINFO(ino)->volNumber = volNumber; + NCP_FINFO(ino)->dirEntNum = dirEntNum; + NCP_FINFO(ino)->DosDirNum = DosDirNum; + result = 0; + } else { + DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); + } + } else { + DPRINTK("ncpfs: sb->s_root == NULL!\n"); + } + } else + result = 0; + +out: + return result; +} + +static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct ncp_server *server = NCP_SERVER(dir); + struct inode *inode = NULL; + struct ncp_entry_info finfo; + int error, res, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + error = -EIO; + if (!ncp_conn_valid(server)) + goto finished; + + PPRINTK("ncp_lookup: server lookup for %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + len = sizeof(__name); + if (ncp_is_server_root(dir)) { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, 1); + if (!res) + res = ncp_lookup_volume(server, __name, &(finfo.i)); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + } else { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (!res) + res = ncp_obtain_info(server, dir, __name, &(finfo.i)); + } + PPRINTK("ncp_lookup: looked for %s/%s, res=%d\n", + 
dentry->d_parent->d_name.name, __name, res); + /* + * If we didn't find an entry, make a negative dentry. + */ + if (res) + goto add_entry; + + /* + * Create an inode for the entry. + */ + finfo.opened = 0; + finfo.ino = iunique(dir->i_sb, 2); + finfo.volume = finfo.i.volNumber; + error = -EACCES; + inode = ncp_iget(dir->i_sb, &finfo); + + if (inode) { + ncp_new_dentry(dentry); +add_entry: + d_add(dentry, inode); + error = 0; + } + +finished: + PPRINTK("ncp_lookup: result=%d\n", error); + return ERR_PTR(error); +} + +/* + * This code is common to create, mkdir, and mknod. + */ +static int ncp_instantiate(struct inode *dir, struct dentry *dentry, + struct ncp_entry_info *finfo) +{ + struct inode *inode; + int error = -EINVAL; + + finfo->ino = iunique(dir->i_sb, 2); + inode = ncp_iget(dir->i_sb, finfo); + if (!inode) + goto out_close; + d_instantiate(dentry,inode); + error = 0; +out: + return error; + +out_close: + PPRINTK("ncp_instantiate: %s/%s failed, closing file\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + ncp_close_file(NCP_SERVER(dir), finfo->file_handle); + goto out; +} + +int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev, __le32 attributes) +{ + struct ncp_server *server = NCP_SERVER(dir); + struct ncp_entry_info finfo; + int error, result, len; + int opmode; + __u8 __name[NCP_MAXPATHLEN + 1]; + + PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n", + dentry->d_parent->d_name.name, dentry->d_name.name, mode); + + ncp_age_dentry(server, dentry); + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + error = -EACCES; + + if (S_ISREG(mode) && + (server->m.flags & NCP_MOUNT_EXTRAS) && + (mode & S_IXUGO)) + attributes |= aSYSTEM | aSHARED; + + result = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, + attributes, AR_READ | AR_WRITE, &finfo); + opmode = O_RDWR; + if (result) { + result = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, + attributes, AR_WRITE, &finfo); + if (result) { + if (result == 0x87) + error = -ENAMETOOLONG; + else if (result < 0) + error = result; + DPRINTK("ncp_create: %s/%s failed\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + goto out; + } + opmode = O_WRONLY; + } + finfo.access = opmode; + if (ncp_is_nfs_extras(server, finfo.volume)) { + finfo.i.nfs.mode = mode; + finfo.i.nfs.rdev = new_encode_dev(rdev); + if (ncp_modify_nfs_info(server, finfo.volume, + finfo.i.dirEntNum, + mode, new_encode_dev(rdev)) != 0) + goto out; + } + + error = ncp_instantiate(dir, dentry, &finfo); +out: + return error; +} + +static int ncp_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return ncp_create_new(dir, dentry, mode, 0, 0); +} + +static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct ncp_entry_info finfo; + struct ncp_server *server = NCP_SERVER(dir); + int error, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + DPRINTK("ncp_mkdir: making %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + ncp_age_dentry(server, dentry); + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + error = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE, aDIR, + cpu_to_le16(0xffff), + &finfo); + if (error == 0) { + 
if (ncp_is_nfs_extras(server, finfo.volume)) { + mode |= S_IFDIR; + finfo.i.nfs.mode = mode; + if (ncp_modify_nfs_info(server, + finfo.volume, + finfo.i.dirEntNum, + mode, 0) != 0) + goto out; + } + error = ncp_instantiate(dir, dentry, &finfo); + } else if (error > 0) { + error = -EACCES; + } +out: + return error; +} + +static int ncp_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ncp_server *server = NCP_SERVER(dir); + int error, result, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + DPRINTK("ncp_rmdir: removing %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* + * fail with EBUSY if there are still references to this + * directory. + */ + dentry_unhash(dentry); + error = -EBUSY; + if (!d_unhashed(dentry)) + goto out; + + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + result = ncp_del_file_or_subdir(server, dir, __name); + switch (result) { + case 0x00: + error = 0; + break; + case 0x85: /* unauthorized to delete file */ + case 0x8A: /* unauthorized to delete file */ + error = -EACCES; + break; + case 0x8F: + case 0x90: /* read only */ + error = -EPERM; + break; + case 0x9F: /* in use by another client */ + error = -EBUSY; + break; + case 0xA0: /* directory not empty */ + error = -ENOTEMPTY; + break; + case 0xFF: /* someone deleted file */ + error = -ENOENT; + break; + default: + error = result < 0 ? result : -EACCES; + break; + } +out: + return error; +} + +static int ncp_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct ncp_server *server; + int error; + + server = NCP_SERVER(dir); + DPRINTK("ncp_unlink: unlinking %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* + * Check whether to close the file ... + */ + if (inode) { + PPRINTK("ncp_unlink: closing file\n"); + ncp_make_closed(inode); + } + + error = ncp_del_file_or_subdir2(server, dentry); +#ifdef CONFIG_NCPFS_STRONG + /* 9C is Invalid path.. It should be 8F, 90 - read only, but + it is not :-( */ + if ((error == 0x9C || error == 0x90) && server->m.flags & NCP_MOUNT_STRONG) { /* R/O */ + error = ncp_force_unlink(dir, dentry); + } +#endif + switch (error) { + case 0x00: + DPRINTK("ncp: removed %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + break; + case 0x85: + case 0x8A: + error = -EACCES; + break; + case 0x8D: /* some files in use */ + case 0x8E: /* all files in use */ + error = -EBUSY; + break; + case 0x8F: /* some read only */ + case 0x90: /* all read only */ + case 0x9C: /* !!! returned when in-use or read-only by NW4 */ + error = -EPERM; + break; + case 0xFF: + error = -ENOENT; + break; + default: + error = error < 0 ? error : -EACCES; + break; + } + return error; +} + +static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ncp_server *server = NCP_SERVER(old_dir); + int error; + int old_len, new_len; + __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; + + DPRINTK("ncp_rename: %s/%s to %s/%s\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + new_dentry->d_parent->d_name.name, new_dentry->d_name.name); + + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) { + /* + * fail with EBUSY if there are still references to this + * directory. 
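+ * If the target is still hashed after dentry_unhash() it is still in use, + * and the rename fails with -EBUSY before anything is sent to the server.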
+ */ + dentry_unhash(new_dentry); + error = -EBUSY; + if (!d_unhashed(new_dentry)) + goto out; + } + + ncp_age_dentry(server, old_dentry); + ncp_age_dentry(server, new_dentry); + + old_len = sizeof(__old_name); + error = ncp_io2vol(server, __old_name, &old_len, + old_dentry->d_name.name, old_dentry->d_name.len, + !ncp_preserve_case(old_dir)); + if (error) + goto out; + + new_len = sizeof(__new_name); + error = ncp_io2vol(server, __new_name, &new_len, + new_dentry->d_name.name, new_dentry->d_name.len, + !ncp_preserve_case(new_dir)); + if (error) + goto out; + + error = ncp_ren_or_mov_file_or_subdir(server, old_dir, __old_name, + new_dir, __new_name); +#ifdef CONFIG_NCPFS_STRONG + if ((error == 0x90 || error == 0x8B || error == -EACCES) && + server->m.flags & NCP_MOUNT_STRONG) { /* RO */ + error = ncp_force_rename(old_dir, old_dentry, __old_name, + new_dir, new_dentry, __new_name); + } +#endif + switch (error) { + case 0x00: + DPRINTK("ncp renamed %s -> %s.\n", + old_dentry->d_name.name,new_dentry->d_name.name); + break; + case 0x9E: + error = -ENAMETOOLONG; + break; + case 0xFF: + error = -ENOENT; + break; + default: + error = error < 0 ? error : -EACCES; + break; + } +out: + return error; +} + +static int ncp_mknod(struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + if (!new_valid_dev(rdev)) + return -EINVAL; + if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { + DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%o\n", mode); + return ncp_create_new(dir, dentry, mode, rdev, 0); + } + return -EPERM; /* Strange, but true */ +} + +/* The following routines are taken directly from msdos-fs */ + +/* Linear day numbers of the respective 1sts in non-leap years. */ + +static int day_n[] = +{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; +/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ + + +extern struct timezone sys_tz; + +static int utc2local(int time) +{ + return time - sys_tz.tz_minuteswest * 60; +} + +static int local2utc(int time) +{ + return time + sys_tz.tz_minuteswest * 60; +} + +/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ +int +ncp_date_dos2unix(__le16 t, __le16 d) +{ + unsigned short time = le16_to_cpu(t), date = le16_to_cpu(d); + int month, year, secs; + + /* first subtract and mask after that... Otherwise, if + date == 0, bad things happen */ + month = ((date >> 5) - 1) & 15; + year = date >> 9; + secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + + 86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + + year * 365 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653); + /* days since 1.1.70 plus 80's leap day */ + return local2utc(secs); +} + + +/* Convert linear UNIX date to a MS-DOS time/date pair. */ +void +ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date) +{ + int day, year, nl_day, month; + + unix_date = utc2local(unix_date); + *time = cpu_to_le16( + (unix_date % 60) / 2 + (((unix_date / 60) % 60) << 5) + + (((unix_date / 3600) % 24) << 11)); + day = unix_date / 86400 - 3652; + year = day / 365; + if ((year + 3) / 4 + 365 * year > day) + year--; + day -= (year + 3) / 4 + 365 * year; + if (day == 59 && !(year & 3)) { + nl_day = day; + month = 2; + } else { + nl_day = (year & 3) || day <= 59 ? 
day : day - 1; + for (month = 1; month < 12; month++) + if (day_n[month] > nl_day) + break; + } + *date = cpu_to_le16(nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9)); +} diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c new file mode 100644 index 00000000..0ed65e0c --- /dev/null +++ b/fs/ncpfs/file.c @@ -0,0 +1,297 @@ +/* + * file.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +static int ncp_fsync(struct file *file, int datasync) +{ + return 0; +} + +/* + * Open a file with the specified read/write mode. + */ +int ncp_make_open(struct inode *inode, int right) +{ + int error; + int access; + + error = -EINVAL; + if (!inode) { + printk(KERN_ERR "ncp_make_open: got NULL inode\n"); + goto out; + } + + DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n", + atomic_read(&NCP_FINFO(inode)->opened), + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum); + error = -EACCES; + mutex_lock(&NCP_FINFO(inode)->open_mutex); + if (!atomic_read(&NCP_FINFO(inode)->opened)) { + struct ncp_entry_info finfo; + int result; + + /* tries max. rights */ + finfo.access = O_RDWR; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_READ | AR_WRITE, &finfo); + if (!result) + goto update; + /* RDWR did not succeeded, try readonly or writeonly as requested */ + switch (right) { + case O_RDONLY: + finfo.access = O_RDONLY; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_READ, &finfo); + break; + case O_WRONLY: + finfo.access = O_WRONLY; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_WRITE, &finfo); + break; + } + if (result) { + PPRINTK("ncp_make_open: failed, result=%d\n", result); + goto out_unlock; + } + /* + * Update the inode information. + */ + update: + ncp_update_inode(inode, &finfo); + atomic_set(&NCP_FINFO(inode)->opened, 1); + } + + access = NCP_FINFO(inode)->access; + PPRINTK("ncp_make_open: file open, access=%x\n", access); + if (access == right || access == O_RDWR) { + atomic_inc(&NCP_FINFO(inode)->opened); + error = 0; + } + +out_unlock: + mutex_unlock(&NCP_FINFO(inode)->open_mutex); +out: + return error; +} + +static ssize_t +ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + size_t already_read = 0; + off_t pos; + size_t bufsize; + int error; + void* freepage; + size_t freelen; + + DPRINTK("ncp_file_read: enter %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + pos = *ppos; + + if ((ssize_t) count < 0) { + return -EINVAL; + } + if (!count) + return 0; + if (pos > inode->i_sb->s_maxbytes) + return 0; + if (pos + count > inode->i_sb->s_maxbytes) { + count = inode->i_sb->s_maxbytes - pos; + } + + error = ncp_make_open(inode, O_RDONLY); + if (error) { + DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error); + return error; + } + + bufsize = NCP_SERVER(inode)->buffer_size; + + error = -EIO; + freelen = ncp_read_bounce_size(bufsize); + freepage = vmalloc(freelen); + if (!freepage) + goto outrel; + error = 0; + /* First read in as much as possible for each bufsize. 
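+ Each pass reads at most one server buffer (bufsize) worth of data through the bounce page into the user buffer; a short read ends the loop early.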
*/ + while (already_read < count) { + int read_this_time; + size_t to_read = min_t(unsigned int, + bufsize - (pos % bufsize), + count - already_read); + + error = ncp_read_bounce(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_read, buf, &read_this_time, + freepage, freelen); + if (error) { + error = -EIO; /* NW errno -> Linux errno */ + break; + } + pos += read_this_time; + buf += read_this_time; + already_read += read_this_time; + + if (read_this_time != to_read) { + break; + } + } + vfree(freepage); + + *ppos = pos; + + file_accessed(file); + + DPRINTK("ncp_file_read: exit %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +outrel: + ncp_inode_close(inode); + return already_read ? already_read : error; +} + +static ssize_t +ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + size_t already_written = 0; + off_t pos; + size_t bufsize; + int errno; + void* bouncebuffer; + + DPRINTK("ncp_file_write: enter %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + if ((ssize_t) count < 0) + return -EINVAL; + pos = *ppos; + if (file->f_flags & O_APPEND) { + pos = i_size_read(inode); + } + + if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + return -EFBIG; + } + if (count > MAX_NON_LFS - (u32)pos) { + count = MAX_NON_LFS - (u32)pos; + } + } + if (pos >= inode->i_sb->s_maxbytes) { + if (count || pos > inode->i_sb->s_maxbytes) { + return -EFBIG; + } + } + if (pos + count > inode->i_sb->s_maxbytes) { + count = inode->i_sb->s_maxbytes - pos; + } + + if (!count) + return 0; + errno = ncp_make_open(inode, O_WRONLY); + if (errno) { + DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno); + return errno; + } + bufsize = NCP_SERVER(inode)->buffer_size; + + already_written = 0; + + bouncebuffer = vmalloc(bufsize); + if (!bouncebuffer) { + errno = -EIO; /* -ENOMEM */ + goto outrel; + } + while (already_written < count) { + int written_this_time; + size_t to_write = min_t(unsigned int, + bufsize - (pos % bufsize), + count - already_written); + + if (copy_from_user(bouncebuffer, buf, to_write)) { + errno = -EFAULT; + break; + } + if (ncp_write_kernel(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_write, bouncebuffer, &written_this_time) != 0) { + errno = -EIO; + break; + } + pos += written_this_time; + buf += written_this_time; + already_written += written_this_time; + + if (written_this_time != to_write) { + break; + } + } + vfree(bouncebuffer); + + file_update_time(file); + + *ppos = pos; + + if (pos > i_size_read(inode)) { + mutex_lock(&inode->i_mutex); + if (pos > i_size_read(inode)) + i_size_write(inode, pos); + mutex_unlock(&inode->i_mutex); + } + DPRINTK("ncp_file_write: exit %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +outrel: + ncp_inode_close(inode); + return already_written ? 
already_written : errno; +} + +static int ncp_release(struct inode *inode, struct file *file) { + if (ncp_make_closed(inode)) { + DPRINTK("ncp_release: failed to close\n"); + } + return 0; +} + +const struct file_operations ncp_file_operations = +{ + .llseek = generic_file_llseek, + .read = ncp_file_read, + .write = ncp_file_write, + .unlocked_ioctl = ncp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ncp_compat_ioctl, +#endif + .mmap = ncp_mmap, + .release = ncp_release, + .fsync = ncp_fsync, +}; + +const struct inode_operations ncp_file_inode_operations = +{ + .setattr = ncp_notify_change, +}; diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c new file mode 100644 index 00000000..0af3349d --- /dev/null +++ b/fs/ncpfs/getopt.c @@ -0,0 +1,74 @@ +/* + * getopt.c + */ + +#include +#include + +#include + +#include "getopt.h" + +/** + * ncp_getopt - option parser + * @caller: name of the caller, for error messages + * @options: the options string + * @opts: an array of &struct option entries controlling parser operations + * @optopt: output; will contain the current option + * @optarg: output; will contain the value (if one exists) + * @value: output; may be NULL; will be overwritten with the integer value + * of the current argument. + * + * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). + * Returns opts->val if a matching entry in the 'opts' array is found, + * 0 when no more tokens are found, -1 if an error is encountered. + */ +int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, + char **optopt, char **optarg, unsigned long *value) +{ + char *token; + char *val; + + do { + if ((token = strsep(options, ",")) == NULL) + return 0; + } while (*token == '\0'); + if (optopt) + *optopt = token; + + if ((val = strchr (token, '=')) != NULL) { + *val++ = 0; + } + *optarg = val; + for (; opts->name; opts++) { + if (!strcmp(opts->name, token)) { + if (!val) { + if (opts->has_arg & OPT_NOPARAM) { + return opts->val; + } + printk(KERN_INFO "%s: the %s option requires an argument\n", + caller, token); + return -EINVAL; + } + if (opts->has_arg & OPT_INT) { + char* v; + + *value = simple_strtoul(val, &v, 0); + if (!*v) { + return opts->val; + } + printk(KERN_INFO "%s: invalid numeric value in %s=%s\n", + caller, token, val); + return -EDOM; + } + if (opts->has_arg & OPT_STRING) { + return opts->val; + } + printk(KERN_INFO "%s: unexpected argument %s to the %s option\n", + caller, val, token); + return -EINVAL; + } + } + printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token); + return -EOPNOTSUPP; +} diff --git a/fs/ncpfs/getopt.h b/fs/ncpfs/getopt.h new file mode 100644 index 00000000..cccc007d --- /dev/null +++ b/fs/ncpfs/getopt.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_GETOPT_H +#define _LINUX_GETOPT_H + +#define OPT_NOPARAM 1 +#define OPT_INT 2 +#define OPT_STRING 4 +struct ncp_option { + const char *name; + unsigned int has_arg; + int val; +}; + +extern int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, + char **optopt, char **optarg, unsigned long *value); + +#endif /* _LINUX_GETOPT_H */ diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c new file mode 100644 index 00000000..202f3705 --- /dev/null +++ b/fs/ncpfs/inode.c @@ -0,0 +1,1072 @@ +/* + * inode.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. 
Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998 Wolfram Pienkoss for NLS + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" +#include "getopt.h" + +#define NCP_DEFAULT_FILE_MODE 0600 +#define NCP_DEFAULT_DIR_MODE 0700 +#define NCP_DEFAULT_TIME_OUT 10 +#define NCP_DEFAULT_RETRY_COUNT 20 + +static void ncp_evict_inode(struct inode *); +static void ncp_put_super(struct super_block *); +static int ncp_statfs(struct dentry *, struct kstatfs *); +static int ncp_show_options(struct seq_file *, struct vfsmount *); + +static struct kmem_cache * ncp_inode_cachep; + +static struct inode *ncp_alloc_inode(struct super_block *sb) +{ + struct ncp_inode_info *ei; + ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void ncp_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); +} + +static void ncp_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ncp_i_callback); +} + +static void init_once(void *foo) +{ + struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; + + mutex_init(&ei->open_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", + sizeof(struct ncp_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ncp_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ncp_inode_cachep); +} + +static int ncp_remount(struct super_block *sb, int *flags, char* data) +{ + *flags |= MS_NODIRATIME; + return 0; +} + +static const struct super_operations ncp_sops = +{ + .alloc_inode = ncp_alloc_inode, + .destroy_inode = ncp_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = ncp_evict_inode, + .put_super = ncp_put_super, + .statfs = ncp_statfs, + .remount_fs = ncp_remount, + .show_options = ncp_show_options, +}; + +/* + * Fill in the ncpfs-specific information in the inode. + */ +static void ncp_update_dirent(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + NCP_FINFO(inode)->DosDirNum = nwinfo->i.DosDirNum; + NCP_FINFO(inode)->dirEntNum = nwinfo->i.dirEntNum; + NCP_FINFO(inode)->volNumber = nwinfo->volume; +} + +void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + ncp_update_dirent(inode, nwinfo); + NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; + NCP_FINFO(inode)->access = nwinfo->access; + memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, + sizeof(nwinfo->file_handle)); + DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n", + nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum); +} + +static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) +{ + /* NFS namespace mode overrides others if it's set. */ + DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n", + nwi->entryName, nwi->nfs.mode); + if (nwi->nfs.mode) { + /* XXX Security? 
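+ The server-supplied mode replaces the file_mode/dir_mode defaults applied by ncp_update_attrs().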
*/ + inode->i_mode = nwi->nfs.mode; + } + + inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT; + + inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate); + inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate); + inode->i_atime.tv_sec = ncp_date_dos2unix(0, nwi->lastAccessDate); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; +} + +static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + struct nw_info_struct *nwi = &nwinfo->i; + struct ncp_server *server = NCP_SERVER(inode); + + if (nwi->attributes & aDIR) { + inode->i_mode = server->m.dir_mode; + /* for directories dataStreamSize seems to be some + Object ID ??? */ + i_size_write(inode, NCP_BLOCK_SIZE); + } else { + u32 size; + + inode->i_mode = server->m.file_mode; + size = le32_to_cpu(nwi->dataStreamSize); + i_size_write(inode, size); +#ifdef CONFIG_NCPFS_EXTRAS + if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) + && (nwi->attributes & aSHARED)) { + switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { + case aHIDDEN: + if (server->m.flags & NCP_MOUNT_SYMLINKS) { + if (/* (size >= NCP_MIN_SYMLINK_SIZE) + && */ (size <= NCP_MAX_SYMLINK_SIZE)) { + inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; + NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; + break; + } + } + /* FALLTHROUGH */ + case 0: + if (server->m.flags & NCP_MOUNT_EXTRAS) + inode->i_mode |= S_IRUGO; + break; + case aSYSTEM: + if (server->m.flags & NCP_MOUNT_EXTRAS) + inode->i_mode |= (inode->i_mode >> 2) & S_IXUGO; + break; + /* case aSYSTEM|aHIDDEN: */ + default: + /* reserved combination */ + break; + } + } +#endif + } + if (nwi->attributes & aRONLY) inode->i_mode &= ~S_IWUGO; +} + +void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo) +{ + NCP_FINFO(inode)->flags = 0; + if (!atomic_read(&NCP_FINFO(inode)->opened)) { + NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; + ncp_update_attrs(inode, nwinfo); + } + + ncp_update_dates(inode, &nwinfo->i); + ncp_update_dirent(inode, nwinfo); +} + +/* + * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes. + */ +static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + struct ncp_server *server = NCP_SERVER(inode); + + NCP_FINFO(inode)->flags = 0; + + ncp_update_attrs(inode, nwinfo); + + DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); + + inode->i_nlink = 1; + inode->i_uid = server->m.uid; + inode->i_gid = server->m.gid; + + ncp_update_dates(inode, &nwinfo->i); + ncp_update_inode(inode, nwinfo); +} + +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +static const struct inode_operations ncp_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ncp_notify_change, +}; +#endif + +/* + * Get a new inode. 
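+ * Allocates a fresh VFS inode, fills it from the ncp_entry_info and picks + * i_op/i_fop by file type (regular, directory, special node or symlink); + * anything else is marked as a bad inode.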
+ */ +struct inode * +ncp_iget(struct super_block *sb, struct ncp_entry_info *info) +{ + struct inode *inode; + + if (info == NULL) { + printk(KERN_ERR "ncp_iget: info is NULL\n"); + return NULL; + } + + inode = new_inode(sb); + if (inode) { + atomic_set(&NCP_FINFO(inode)->opened, info->opened); + + inode->i_mapping->backing_dev_info = sb->s_bdi; + inode->i_ino = info->ino; + ncp_set_attr(inode, info); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ncp_file_inode_operations; + inode->i_fop = &ncp_file_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ncp_dir_inode_operations; + inode->i_fop = &ncp_dir_operations; +#ifdef CONFIG_NCPFS_NFS_NS + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, + new_decode_dev(info->i.nfs.rdev)); +#endif +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ncp_symlink_inode_operations; + inode->i_data.a_ops = &ncp_symlink_aops; +#endif + } else { + make_bad_inode(inode); + } + insert_inode_hash(inode); + } else + printk(KERN_ERR "ncp_iget: iget failed!\n"); + return inode; +} + +static void +ncp_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + if (S_ISDIR(inode->i_mode)) { + DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); + } + + if (ncp_make_closed(inode) != 0) { + /* We can't do anything but complain. */ + printk(KERN_ERR "ncp_evict_inode: could not close\n"); + } +} + +static void ncp_stop_tasks(struct ncp_server *server) { + struct sock* sk = server->ncp_sock->sk; + + lock_sock(sk); + sk->sk_error_report = server->error_report; + sk->sk_data_ready = server->data_ready; + sk->sk_write_space = server->write_space; + release_sock(sk); + del_timer_sync(&server->timeout_tm); + + flush_work_sync(&server->rcv.tq); + if (sk->sk_socket->type == SOCK_STREAM) + flush_work_sync(&server->tx.tq); + else + flush_work_sync(&server->timeout_tq); +} + +static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct ncp_server *server = NCP_SBP(mnt->mnt_sb); + unsigned int tmp; + + if (server->m.uid != 0) + seq_printf(seq, ",uid=%u", server->m.uid); + if (server->m.gid != 0) + seq_printf(seq, ",gid=%u", server->m.gid); + if (server->m.mounted_uid != 0) + seq_printf(seq, ",owner=%u", server->m.mounted_uid); + tmp = server->m.file_mode & S_IALLUGO; + if (tmp != NCP_DEFAULT_FILE_MODE) + seq_printf(seq, ",mode=0%o", tmp); + tmp = server->m.dir_mode & S_IALLUGO; + if (tmp != NCP_DEFAULT_DIR_MODE) + seq_printf(seq, ",dirmode=0%o", tmp); + if (server->m.time_out != NCP_DEFAULT_TIME_OUT * HZ / 100) { + tmp = server->m.time_out * 100 / HZ; + seq_printf(seq, ",timeout=%u", tmp); + } + if (server->m.retry_count != NCP_DEFAULT_RETRY_COUNT) + seq_printf(seq, ",retry=%u", server->m.retry_count); + if (server->m.flags != 0) + seq_printf(seq, ",flags=%lu", server->m.flags); + if (server->m.wdog_pid != NULL) + seq_printf(seq, ",wdogpid=%u", pid_vnr(server->m.wdog_pid)); + + return 0; +} + +static const struct ncp_option ncp_opts[] = { + { "uid", OPT_INT, 'u' }, + { "gid", OPT_INT, 'g' }, + { "owner", OPT_INT, 'o' }, + { "mode", OPT_INT, 'm' }, + { "dirmode", OPT_INT, 'd' }, + { "timeout", OPT_INT, 't' }, + { "retry", OPT_INT, 'r' }, + { "flags", OPT_INT, 'f' }, + { "wdogpid", OPT_INT, 'w' }, + { "ncpfd", OPT_INT, 'n' }, + { "infofd", OPT_INT, 'i' }, /* v5 */ + { "version", OPT_INT, 'v' }, + { NULL, 0, 
0 } }; + +static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) { + int optval; + char *optarg; + unsigned long optint; + int version = 0; + int ret; + + data->flags = 0; + data->int_flags = 0; + data->mounted_uid = 0; + data->wdog_pid = NULL; + data->ncp_fd = ~0; + data->time_out = NCP_DEFAULT_TIME_OUT; + data->retry_count = NCP_DEFAULT_RETRY_COUNT; + data->uid = 0; + data->gid = 0; + data->file_mode = NCP_DEFAULT_FILE_MODE; + data->dir_mode = NCP_DEFAULT_DIR_MODE; + data->info_fd = -1; + data->mounted_vol[0] = 0; + + while ((optval = ncp_getopt("ncpfs", &options, ncp_opts, NULL, &optarg, &optint)) != 0) { + ret = optval; + if (ret < 0) + goto err; + switch (optval) { + case 'u': + data->uid = optint; + break; + case 'g': + data->gid = optint; + break; + case 'o': + data->mounted_uid = optint; + break; + case 'm': + data->file_mode = optint; + break; + case 'd': + data->dir_mode = optint; + break; + case 't': + data->time_out = optint; + break; + case 'r': + data->retry_count = optint; + break; + case 'f': + data->flags = optint; + break; + case 'w': + data->wdog_pid = find_get_pid(optint); + break; + case 'n': + data->ncp_fd = optint; + break; + case 'i': + data->info_fd = optint; + break; + case 'v': + ret = -ECHRNG; + if (optint < NCP_MOUNT_VERSION_V4) + goto err; + if (optint > NCP_MOUNT_VERSION_V5) + goto err; + version = optint; + break; + + } + } + return 0; +err: + put_pid(data->wdog_pid); + data->wdog_pid = NULL; + return ret; +} + +static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) +{ + struct ncp_mount_data_kernel data; + struct ncp_server *server; + struct file *ncp_filp; + struct inode *root_inode; + struct inode *sock_inode; + struct socket *sock; + int error; + int default_bufsize; +#ifdef CONFIG_NCPFS_PACKET_SIGNING + int options; +#endif + struct ncp_entry_info finfo; + + memset(&data, 0, sizeof(data)); + server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); + if (!server) + return -ENOMEM; + sb->s_fs_info = server; + + error = -EFAULT; + if (raw_data == NULL) + goto out; + switch (*(int*)raw_data) { + case NCP_MOUNT_VERSION: + { + struct ncp_mount_data* md = (struct ncp_mount_data*)raw_data; + + data.flags = md->flags; + data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; + data.mounted_uid = md->mounted_uid; + data.wdog_pid = find_get_pid(md->wdog_pid); + data.ncp_fd = md->ncp_fd; + data.time_out = md->time_out; + data.retry_count = md->retry_count; + data.uid = md->uid; + data.gid = md->gid; + data.file_mode = md->file_mode; + data.dir_mode = md->dir_mode; + data.info_fd = -1; + memcpy(data.mounted_vol, md->mounted_vol, + NCP_VOLNAME_LEN+1); + } + break; + case NCP_MOUNT_VERSION_V4: + { + struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; + + data.flags = md->flags; + data.mounted_uid = md->mounted_uid; + data.wdog_pid = find_get_pid(md->wdog_pid); + data.ncp_fd = md->ncp_fd; + data.time_out = md->time_out; + data.retry_count = md->retry_count; + data.uid = md->uid; + data.gid = md->gid; + data.file_mode = md->file_mode; + data.dir_mode = md->dir_mode; + data.info_fd = -1; + } + break; + default: + error = -ECHRNG; + if (memcmp(raw_data, "vers", 4) == 0) { + error = ncp_parse_options(&data, raw_data); + } + if (error) + goto out; + break; + } + error = -EBADF; + ncp_filp = fget(data.ncp_fd); + if (!ncp_filp) + goto out; + error = -ENOTSOCK; + sock_inode = ncp_filp->f_path.dentry->d_inode; + if (!S_ISSOCK(sock_inode->i_mode)) + goto out_fput; + sock = SOCKET_I(sock_inode); + if (!sock) + goto out_fput; 
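+ /* Starting point for the buffer-size negotiation below: 0xF000 bytes on stream (TCP) sockets, 1024 bytes on datagram sockets. */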
+ + if (sock->type == SOCK_STREAM) + default_bufsize = 0xF000; + else + default_bufsize = 1024; + + sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_maxbytes = 0xFFFFFFFFU; + sb->s_blocksize = 1024; /* Eh... Is this correct? */ + sb->s_blocksize_bits = 10; + sb->s_magic = NCP_SUPER_MAGIC; + sb->s_op = &ncp_sops; + sb->s_d_op = &ncp_dentry_operations; + sb->s_bdi = &server->bdi; + + server = NCP_SBP(sb); + memset(server, 0, sizeof(*server)); + + error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + if (error) + goto out_bdi; + + server->ncp_filp = ncp_filp; + server->ncp_sock = sock; + + if (data.info_fd != -1) { + struct socket *info_sock; + + error = -EBADF; + server->info_filp = fget(data.info_fd); + if (!server->info_filp) + goto out_fput; + error = -ENOTSOCK; + sock_inode = server->info_filp->f_path.dentry->d_inode; + if (!S_ISSOCK(sock_inode->i_mode)) + goto out_fput2; + info_sock = SOCKET_I(sock_inode); + if (!info_sock) + goto out_fput2; + error = -EBADFD; + if (info_sock->type != SOCK_STREAM) + goto out_fput2; + server->info_sock = info_sock; + } + +/* server->lock = 0; */ + mutex_init(&server->mutex); + server->packet = NULL; +/* server->buffer_size = 0; */ +/* server->conn_status = 0; */ +/* server->root_dentry = NULL; */ +/* server->root_setuped = 0; */ + mutex_init(&server->root_setup_lock); +#ifdef CONFIG_NCPFS_PACKET_SIGNING +/* server->sign_wanted = 0; */ +/* server->sign_active = 0; */ +#endif + init_rwsem(&server->auth_rwsem); + server->auth.auth_type = NCP_AUTH_NONE; +/* server->auth.object_name_len = 0; */ +/* server->auth.object_name = NULL; */ +/* server->auth.object_type = 0; */ +/* server->priv.len = 0; */ +/* server->priv.data = NULL; */ + + server->m = data; + /* Although anything producing this is buggy, it happens + now because of PATH_MAX changes.. 
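+ The timeout option is given in 1/100ths of a second; values below 1 are clamped to 10 before the conversion to jiffies below.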
*/ + if (server->m.time_out < 1) { + server->m.time_out = 10; + printk(KERN_INFO "You need to recompile your ncpfs utils..\n"); + } + server->m.time_out = server->m.time_out * HZ / 100; + server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; + server->m.dir_mode = (server->m.dir_mode & S_IRWXUGO) | S_IFDIR; + +#ifdef CONFIG_NCPFS_NLS + /* load the default NLS charsets */ + server->nls_vol = load_nls_default(); + server->nls_io = load_nls_default(); +#endif /* CONFIG_NCPFS_NLS */ + + atomic_set(&server->dentry_ttl, 0); /* no caching */ + + INIT_LIST_HEAD(&server->tx.requests); + mutex_init(&server->rcv.creq_mutex); + server->tx.creq = NULL; + server->rcv.creq = NULL; + + init_timer(&server->timeout_tm); +#undef NCP_PACKET_SIZE +#define NCP_PACKET_SIZE 131072 + error = -ENOMEM; + server->packet_size = NCP_PACKET_SIZE; + server->packet = vmalloc(NCP_PACKET_SIZE); + if (server->packet == NULL) + goto out_nls; + server->txbuf = vmalloc(NCP_PACKET_SIZE); + if (server->txbuf == NULL) + goto out_packet; + server->rxbuf = vmalloc(NCP_PACKET_SIZE); + if (server->rxbuf == NULL) + goto out_txbuf; + + lock_sock(sock->sk); + server->data_ready = sock->sk->sk_data_ready; + server->write_space = sock->sk->sk_write_space; + server->error_report = sock->sk->sk_error_report; + sock->sk->sk_user_data = server; + sock->sk->sk_data_ready = ncp_tcp_data_ready; + sock->sk->sk_error_report = ncp_tcp_error_report; + if (sock->type == SOCK_STREAM) { + server->rcv.ptr = (unsigned char*)&server->rcv.buf; + server->rcv.len = 10; + server->rcv.state = 0; + INIT_WORK(&server->rcv.tq, ncp_tcp_rcv_proc); + INIT_WORK(&server->tx.tq, ncp_tcp_tx_proc); + sock->sk->sk_write_space = ncp_tcp_write_space; + } else { + INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); + INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); + server->timeout_tm.data = (unsigned long)server; + server->timeout_tm.function = ncpdgram_timeout_call; + } + release_sock(sock->sk); + + ncp_lock_server(server); + error = ncp_connect(server); + ncp_unlock_server(server); + if (error < 0) + goto out_rxbuf; + DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb)); + + error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (ncp_negotiate_size_and_options(server, default_bufsize, + NCP_DEFAULT_OPTIONS, &(server->buffer_size), &options) == 0) + { + if (options != NCP_DEFAULT_OPTIONS) + { + if (ncp_negotiate_size_and_options(server, + default_bufsize, + options & 2, + &(server->buffer_size), &options) != 0) + + { + goto out_disconnect; + } + } + ncp_lock_server(server); + if (options & 2) + server->sign_wanted = 1; + ncp_unlock_server(server); + } + else +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + if (ncp_negotiate_buffersize(server, default_bufsize, + &(server->buffer_size)) != 0) + goto out_disconnect; + DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size); + + memset(&finfo, 0, sizeof(finfo)); + finfo.i.attributes = aDIR; + finfo.i.dataStreamSize = 0; /* ignored */ + finfo.i.dirEntNum = 0; + finfo.i.DosDirNum = 0; +#ifdef CONFIG_NCPFS_SMALLDOS + finfo.i.NSCreator = NW_NS_DOS; +#endif + finfo.volume = NCP_NUMBER_OF_VOLUMES; + /* set dates of mountpoint to Jan 1, 1986; 00:00 */ + finfo.i.creationTime = finfo.i.modifyTime + = cpu_to_le16(0x0000); + finfo.i.creationDate = finfo.i.modifyDate + = finfo.i.lastAccessDate + = cpu_to_le16(0x0C21); + finfo.i.nameLen = 0; + finfo.i.entryName[0] = '\0'; + + finfo.opened = 0; + finfo.ino = 2; /* tradition */ + + server->name_space[finfo.volume] = NW_NS_DOS; + + error 
= -ENOMEM; + root_inode = ncp_iget(sb, &finfo); + if (!root_inode) + goto out_disconnect; + DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_no_root; + return 0; + +out_no_root: + iput(root_inode); +out_disconnect: + ncp_lock_server(server); + ncp_disconnect(server); + ncp_unlock_server(server); +out_rxbuf: + ncp_stop_tasks(server); + vfree(server->rxbuf); +out_txbuf: + vfree(server->txbuf); +out_packet: + vfree(server->packet); +out_nls: +#ifdef CONFIG_NCPFS_NLS + unload_nls(server->nls_io); + unload_nls(server->nls_vol); +#endif + mutex_destroy(&server->rcv.creq_mutex); + mutex_destroy(&server->root_setup_lock); + mutex_destroy(&server->mutex); +out_fput2: + if (server->info_filp) + fput(server->info_filp); +out_fput: + bdi_destroy(&server->bdi); +out_bdi: + /* 23/12/1998 Marcin Dalecki : + * + * The previously used put_filp(ncp_filp); was bogus, since + * it doesn't perform proper unlocking. + */ + fput(ncp_filp); +out: + put_pid(data.wdog_pid); + sb->s_fs_info = NULL; + kfree(server); + return error; +} + +static void ncp_put_super(struct super_block *sb) +{ + struct ncp_server *server = NCP_SBP(sb); + + ncp_lock_server(server); + ncp_disconnect(server); + ncp_unlock_server(server); + + ncp_stop_tasks(server); + +#ifdef CONFIG_NCPFS_NLS + /* unload the NLS charsets */ + unload_nls(server->nls_vol); + unload_nls(server->nls_io); +#endif /* CONFIG_NCPFS_NLS */ + mutex_destroy(&server->rcv.creq_mutex); + mutex_destroy(&server->root_setup_lock); + mutex_destroy(&server->mutex); + + if (server->info_filp) + fput(server->info_filp); + fput(server->ncp_filp); + kill_pid(server->m.wdog_pid, SIGTERM, 1); + put_pid(server->m.wdog_pid); + + bdi_destroy(&server->bdi); + kfree(server->priv.data); + kfree(server->auth.object_name); + vfree(server->rxbuf); + vfree(server->txbuf); + vfree(server->packet); + sb->s_fs_info = NULL; + kfree(server); +} + +static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry* d; + struct inode* i; + struct ncp_inode_info* ni; + struct ncp_server* s; + struct ncp_volume_info vi; + struct super_block *sb = dentry->d_sb; + int err; + __u8 dh; + + d = sb->s_root; + if (!d) { + goto dflt; + } + i = d->d_inode; + if (!i) { + goto dflt; + } + ni = NCP_FINFO(i); + if (!ni) { + goto dflt; + } + s = NCP_SBP(sb); + if (!s) { + goto dflt; + } + if (!s->m.mounted_vol[0]) { + goto dflt; + } + + err = ncp_dirhandle_alloc(s, ni->volNumber, ni->DosDirNum, &dh); + if (err) { + goto dflt; + } + err = ncp_get_directory_info(s, dh, &vi); + ncp_dirhandle_free(s, dh); + if (err) { + goto dflt; + } + buf->f_type = NCP_SUPER_MAGIC; + buf->f_bsize = vi.sectors_per_block * 512; + buf->f_blocks = vi.total_blocks; + buf->f_bfree = vi.free_blocks; + buf->f_bavail = vi.free_blocks; + buf->f_files = vi.total_dir_entries; + buf->f_ffree = vi.available_dir_entries; + buf->f_namelen = 12; + return 0; + + /* We cannot say how much disk space is left on a mounted + NetWare Server, because free space is distributed over + volumes, and the current user might have disk quotas. So + free space is not that simple to determine. Our decision + here is to err conservatively. 
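+ The fallback below therefore reports zero blocks; only the magic, block size and name length remain meaningful.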
*/ + +dflt:; + buf->f_type = NCP_SUPER_MAGIC; + buf->f_bsize = NCP_BLOCK_SIZE; + buf->f_blocks = 0; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_namelen = 12; + return 0; +} + +int ncp_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int result = 0; + __le32 info_mask; + struct nw_modify_dos_info info; + struct ncp_server *server; + + result = -EIO; + + server = NCP_SERVER(inode); + if (!server) /* How this could happen? */ + goto out; + + /* ageing the dentry to force validation */ + ncp_age_dentry(server, dentry); + + result = inode_change_ok(inode, attr); + if (result < 0) + goto out; + + result = -EPERM; + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != server->m.uid))) + goto out; + + if (((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != server->m.gid))) + goto out; + + if (((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & + ~(S_IFREG | S_IFDIR | S_IRWXUGO)))) + goto out; + + info_mask = 0; + memset(&info, 0, sizeof(info)); + +#if 1 + if ((attr->ia_valid & ATTR_MODE) != 0) + { + umode_t newmode = attr->ia_mode; + + info_mask |= DM_ATTRIBUTES; + + if (S_ISDIR(inode->i_mode)) { + newmode &= server->m.dir_mode; + } else { +#ifdef CONFIG_NCPFS_EXTRAS + if (server->m.flags & NCP_MOUNT_EXTRAS) { + /* any non-default execute bit set */ + if (newmode & ~server->m.file_mode & S_IXUGO) + info.attributes |= aSHARED | aSYSTEM; + /* read for group/world and not in default file_mode */ + else if (newmode & ~server->m.file_mode & S_IRUGO) + info.attributes |= aSHARED; + } else +#endif + newmode &= server->m.file_mode; + } + if (newmode & S_IWUGO) + info.attributes &= ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + else + info.attributes |= (aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + +#ifdef CONFIG_NCPFS_NFS_NS + if (ncp_is_nfs_extras(server, NCP_FINFO(inode)->volNumber)) { + result = ncp_modify_nfs_info(server, + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, + attr->ia_mode, 0); + if (result != 0) + goto out; + info.attributes &= ~(aSHARED | aSYSTEM); + { + /* mark partial success */ + struct iattr tmpattr; + + tmpattr.ia_valid = ATTR_MODE; + tmpattr.ia_mode = attr->ia_mode; + + setattr_copy(inode, &tmpattr); + mark_inode_dirty(inode); + } + } +#endif + } +#endif + + /* Do SIZE before attributes, otherwise mtime together with size does not work... 
+ */ + if ((attr->ia_valid & ATTR_SIZE) != 0) { + int written; + + DPRINTK("ncpfs: trying to change size to %ld\n", + attr->ia_size); + + if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { + result = -EACCES; + goto out; + } + ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, + attr->ia_size, 0, "", &written); + + /* According to ndir, the changes only take effect after + closing the file */ + ncp_inode_close(inode); + result = ncp_make_closed(inode); + if (result) + goto out; + + if (attr->ia_size != i_size_read(inode)) { + result = vmtruncate(inode, attr->ia_size); + if (result) + goto out; + mark_inode_dirty(inode); + } + } + if ((attr->ia_valid & ATTR_CTIME) != 0) { + info_mask |= (DM_CREATE_TIME | DM_CREATE_DATE); + ncp_date_unix2dos(attr->ia_ctime.tv_sec, + &info.creationTime, &info.creationDate); + } + if ((attr->ia_valid & ATTR_MTIME) != 0) { + info_mask |= (DM_MODIFY_TIME | DM_MODIFY_DATE); + ncp_date_unix2dos(attr->ia_mtime.tv_sec, + &info.modifyTime, &info.modifyDate); + } + if ((attr->ia_valid & ATTR_ATIME) != 0) { + __le16 dummy; + info_mask |= (DM_LAST_ACCESS_DATE); + ncp_date_unix2dos(attr->ia_atime.tv_sec, + &dummy, &info.lastAccessDate); + } + if (info_mask != 0) { + result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), + inode, info_mask, &info); + if (result != 0) { + if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { + /* NetWare seems not to allow this. I + do not know why. So, just tell the + user everything went fine. This is + a terrible hack, but I do not know + how to do this correctly. */ + result = 0; + } else + goto out; + } +#ifdef CONFIG_NCPFS_STRONG + if ((!result) && (info_mask & DM_ATTRIBUTES)) + NCP_FINFO(inode)->nwattr = info.attributes; +#endif + } + if (result) + goto out; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + +out: + if (result > 0) + result = -EACCES; + return result; +} + +static struct dentry *ncp_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, ncp_fill_super); +} + +static struct file_system_type ncp_fs_type = { + .owner = THIS_MODULE, + .name = "ncpfs", + .mount = ncp_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; + +static int __init init_ncp_fs(void) +{ + int err; + DPRINTK("ncpfs: init_ncp_fs called\n"); + + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ncp_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_ncp_fs(void) +{ + DPRINTK("ncpfs: exit_ncp_fs called\n"); + unregister_filesystem(&ncp_fs_type); + destroy_inodecache(); +} + +module_init(init_ncp_fs) +module_exit(exit_ncp_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c new file mode 100644 index 00000000..790e92a9 --- /dev/null +++ b/fs/ncpfs/ioctl.c @@ -0,0 +1,918 @@ +/* + * ioctl.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" + +/* maximum limit for ncp_objectname_ioctl */ +#define NCP_OBJECT_NAME_MAX_LEN 4096 +/* maximum limit for ncp_privatedata_ioctl */ +#define NCP_PRIVATE_DATA_MAX_LEN 8192 +/* maximum negotiable packet size */ +#define NCP_PACKET_SIZE_INTERNAL 65536 + +static int 
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode, + struct ncp_fs_info __user *arg) +{ + struct ncp_fs_info info; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (info.version != NCP_GET_FS_INFO_VERSION) { + DPRINTK("info.version invalid: %d\n", info.version); + return -EINVAL; + } + /* TODO: info.addr = server->m.serv_addr; */ + SET_UID(info.mounted_uid, server->m.mounted_uid); + info.connection = server->connection; + info.buffer_size = server->buffer_size; + info.volume_number = NCP_FINFO(inode)->volNumber; + info.directory_id = NCP_FINFO(inode)->DosDirNum; + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int +ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, + struct ncp_fs_info_v2 __user * arg) +{ + struct ncp_fs_info_v2 info2; + + if (copy_from_user(&info2, arg, sizeof(info2))) + return -EFAULT; + + if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { + DPRINTK("info.version invalid: %d\n", info2.version); + return -EINVAL; + } + info2.mounted_uid = server->m.mounted_uid; + info2.connection = server->connection; + info2.buffer_size = server->buffer_size; + info2.volume_number = NCP_FINFO(inode)->volNumber; + info2.directory_id = NCP_FINFO(inode)->DosDirNum; + info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; + + if (copy_to_user(arg, &info2, sizeof(info2))) + return -EFAULT; + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_ncp_objectname_ioctl +{ + s32 auth_type; + u32 object_name_len; + compat_caddr_t object_name; /* a userspace data, in most cases user name */ +}; + +struct compat_ncp_fs_info_v2 { + s32 version; + u32 mounted_uid; + u32 connection; + u32 buffer_size; + + u32 volume_number; + u32 directory_id; + + u32 dummy1; + u32 dummy2; + u32 dummy3; +}; + +struct compat_ncp_ioctl_request { + u32 function; + u32 size; + compat_caddr_t data; +}; + +struct compat_ncp_privatedata_ioctl +{ + u32 len; + compat_caddr_t data; /* ~1000 for NDS */ +}; + +#define NCP_IOC_GET_FS_INFO_V2_32 _IOWR('n', 4, struct compat_ncp_fs_info_v2) +#define NCP_IOC_NCPREQUEST_32 _IOR('n', 1, struct compat_ncp_ioctl_request) +#define NCP_IOC_GETOBJECTNAME_32 _IOWR('n', 9, struct compat_ncp_objectname_ioctl) +#define NCP_IOC_SETOBJECTNAME_32 _IOR('n', 9, struct compat_ncp_objectname_ioctl) +#define NCP_IOC_GETPRIVATEDATA_32 _IOWR('n', 10, struct compat_ncp_privatedata_ioctl) +#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) + +static int +ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, + struct compat_ncp_fs_info_v2 __user * arg) +{ + struct compat_ncp_fs_info_v2 info2; + + if (copy_from_user(&info2, arg, sizeof(info2))) + return -EFAULT; + + if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { + DPRINTK("info.version invalid: %d\n", info2.version); + return -EINVAL; + } + info2.mounted_uid = server->m.mounted_uid; + info2.connection = server->connection; + info2.buffer_size = server->buffer_size; + info2.volume_number = NCP_FINFO(inode)->volNumber; + info2.directory_id = NCP_FINFO(inode)->DosDirNum; + info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; + + if (copy_to_user(arg, &info2, sizeof(info2))) + return -EFAULT; + return 0; +} +#endif + +#define NCP_IOC_GETMOUNTUID16 _IOW('n', 2, u16) +#define NCP_IOC_GETMOUNTUID32 _IOW('n', 2, u32) +#define NCP_IOC_GETMOUNTUID64 _IOW('n', 2, u64) + +#ifdef CONFIG_NCPFS_NLS +/* Here we are select the iocharset and the codepage for NLS. + * Thanks Petr Vandrovec for idea and many hints. 
+ */ +static int +ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) +{ + struct ncp_nls_ioctl user; + struct nls_table *codepage; + struct nls_table *iocharset; + struct nls_table *oldset_io; + struct nls_table *oldset_cp; + int utf8; + int err; + + if (copy_from_user(&user, arg, sizeof(user))) + return -EFAULT; + + codepage = NULL; + user.codepage[NCP_IOCSNAME_LEN] = 0; + if (!user.codepage[0] || !strcmp(user.codepage, "default")) + codepage = load_nls_default(); + else { + codepage = load_nls(user.codepage); + if (!codepage) { + return -EBADRQC; + } + } + + iocharset = NULL; + user.iocharset[NCP_IOCSNAME_LEN] = 0; + if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { + iocharset = load_nls_default(); + utf8 = 0; + } else if (!strcmp(user.iocharset, "utf8")) { + iocharset = load_nls_default(); + utf8 = 1; + } else { + iocharset = load_nls(user.iocharset); + if (!iocharset) { + unload_nls(codepage); + return -EBADRQC; + } + utf8 = 0; + } + + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) { + oldset_cp = codepage; + oldset_io = iocharset; + err = -EBUSY; + } else { + if (utf8) + NCP_SET_FLAG(server, NCP_FLAG_UTF8); + else + NCP_CLR_FLAG(server, NCP_FLAG_UTF8); + oldset_cp = server->nls_vol; + server->nls_vol = codepage; + oldset_io = server->nls_io; + server->nls_io = iocharset; + err = 0; + } + mutex_unlock(&server->root_setup_lock); + unload_nls(oldset_cp); + unload_nls(oldset_io); + + return err; +} + +static int +ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) +{ + struct ncp_nls_ioctl user; + int len; + + memset(&user, 0, sizeof(user)); + mutex_lock(&server->root_setup_lock); + if (server->nls_vol && server->nls_vol->charset) { + len = strlen(server->nls_vol->charset); + if (len > NCP_IOCSNAME_LEN) + len = NCP_IOCSNAME_LEN; + strncpy(user.codepage, server->nls_vol->charset, len); + user.codepage[len] = 0; + } + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) + strcpy(user.iocharset, "utf8"); + else if (server->nls_io && server->nls_io->charset) { + len = strlen(server->nls_io->charset); + if (len > NCP_IOCSNAME_LEN) + len = NCP_IOCSNAME_LEN; + strncpy(user.iocharset, server->nls_io->charset, len); + user.iocharset[len] = 0; + } + mutex_unlock(&server->root_setup_lock); + + if (copy_to_user(arg, &user, sizeof(user))) + return -EFAULT; + return 0; +} +#endif /* CONFIG_NCPFS_NLS */ + +static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ncp_server *server = NCP_SERVER(inode); + int result; + struct ncp_ioctl_request request; + char* bouncebuffer; + void __user *argp = (void __user *)arg; + + switch (cmd) { +#ifdef CONFIG_COMPAT + case NCP_IOC_NCPREQUEST_32: +#endif + case NCP_IOC_NCPREQUEST: +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_NCPREQUEST_32) { + struct compat_ncp_ioctl_request request32; + if (copy_from_user(&request32, argp, sizeof(request32))) + return -EFAULT; + request.function = request32.function; + request.size = request32.size; + request.data = compat_ptr(request32.data); + } else +#endif + if (copy_from_user(&request, argp, sizeof(request))) + return -EFAULT; + + if ((request.function > 255) + || (request.size > + NCP_PACKET_SIZE - sizeof(struct ncp_request_header))) { + return -EINVAL; + } + bouncebuffer = vmalloc(NCP_PACKET_SIZE_INTERNAL); + if (!bouncebuffer) + return -ENOMEM; + if (copy_from_user(bouncebuffer, request.data, request.size)) { + vfree(bouncebuffer); + return -EFAULT; + } + ncp_lock_server(server); + + /* FIXME: We hack around in the 
server's structures + here to be able to use ncp_request */ + + server->has_subfunction = 0; + server->current_size = request.size; + memcpy(server->packet, bouncebuffer, request.size); + + result = ncp_request2(server, request.function, + bouncebuffer, NCP_PACKET_SIZE_INTERNAL); + if (result < 0) + result = -EIO; + else + result = server->reply_size; + ncp_unlock_server(server); + DPRINTK("ncp_ioctl: copy %d bytes\n", + result); + if (result >= 0) + if (copy_to_user(request.data, bouncebuffer, result)) + result = -EFAULT; + vfree(bouncebuffer); + return result; + + case NCP_IOC_CONN_LOGGED_IN: + + if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) + return -EINVAL; + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) + result = -EBUSY; + else { + result = ncp_conn_logged_in(inode->i_sb); + if (result == 0) + server->root_setuped = 1; + } + mutex_unlock(&server->root_setup_lock); + return result; + + case NCP_IOC_GET_FS_INFO: + return ncp_get_fs_info(server, inode, argp); + + case NCP_IOC_GET_FS_INFO_V2: + return ncp_get_fs_info_v2(server, inode, argp); + +#ifdef CONFIG_COMPAT + case NCP_IOC_GET_FS_INFO_V2_32: + return ncp_get_compat_fs_info_v2(server, inode, argp); +#endif + /* we have too many combinations of CONFIG_COMPAT, + * CONFIG_64BIT and CONFIG_UID16, so just handle + * any of the possible ioctls */ + case NCP_IOC_GETMOUNTUID16: + { + u16 uid; + + SET_UID(uid, server->m.mounted_uid); + if (put_user(uid, (u16 __user *)argp)) + return -EFAULT; + return 0; + } + case NCP_IOC_GETMOUNTUID32: + if (put_user(server->m.mounted_uid, + (u32 __user *)argp)) + return -EFAULT; + return 0; + case NCP_IOC_GETMOUNTUID64: + if (put_user(server->m.mounted_uid, + (u64 __user *)argp)) + return -EFAULT; + return 0; + + case NCP_IOC_GETROOT: + { + struct ncp_setroot_ioctl sr; + + result = -EACCES; + mutex_lock(&server->root_setup_lock); + if (server->m.mounted_vol[0]) { + struct dentry* dentry = inode->i_sb->s_root; + + if (dentry) { + struct inode* s_inode = dentry->d_inode; + + if (s_inode) { + sr.volNumber = NCP_FINFO(s_inode)->volNumber; + sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; + sr.namespace = server->name_space[sr.volNumber]; + result = 0; + } else + DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + } else + DPRINTK("ncpfs: s_root==NULL\n"); + } else { + sr.volNumber = -1; + sr.namespace = 0; + sr.dirEntNum = 0; + result = 0; + } + mutex_unlock(&server->root_setup_lock); + if (!result && copy_to_user(argp, &sr, sizeof(sr))) + result = -EFAULT; + return result; + } + + case NCP_IOC_SETROOT: + { + struct ncp_setroot_ioctl sr; + __u32 vnum; + __le32 de; + __le32 dosde; + struct dentry* dentry; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) + result = -EBUSY; + else { + if (sr.volNumber < 0) { + server->m.mounted_vol[0] = 0; + vnum = NCP_NUMBER_OF_VOLUMES; + de = 0; + dosde = 0; + result = 0; + } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { + result = -EINVAL; + } else if (ncp_mount_subdir(server, sr.volNumber, + sr.namespace, sr.dirEntNum, + &vnum, &de, &dosde)) { + result = -ENOENT; + } else + result = 0; + + if (result == 0) { + dentry = inode->i_sb->s_root; + if (dentry) { + struct inode* s_inode = dentry->d_inode; + + if (s_inode) { + NCP_FINFO(s_inode)->volNumber = vnum; + NCP_FINFO(s_inode)->dirEntNum = de; + NCP_FINFO(s_inode)->DosDirNum = dosde; + server->root_setuped = 1; + } else { + DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + result = -EIO; + } + } else { + DPRINTK("ncpfs: 
s_root==NULL\n"); + result = -EIO; + } + } + result = 0; + } + mutex_unlock(&server->root_setup_lock); + + return result; + } + +#ifdef CONFIG_NCPFS_PACKET_SIGNING + case NCP_IOC_SIGN_INIT: + { + struct ncp_sign_init sign; + + if (argp) + if (copy_from_user(&sign, argp, sizeof(sign))) + return -EFAULT; + ncp_lock_server(server); + mutex_lock(&server->rcv.creq_mutex); + if (argp) { + if (server->sign_wanted) { + memcpy(server->sign_root,sign.sign_root,8); + memcpy(server->sign_last,sign.sign_last,16); + server->sign_active = 1; + } + /* ignore when signatures not wanted */ + } else { + server->sign_active = 0; + } + mutex_unlock(&server->rcv.creq_mutex); + ncp_unlock_server(server); + return 0; + } + + case NCP_IOC_SIGN_WANTED: + { + int state; + + ncp_lock_server(server); + state = server->sign_wanted; + ncp_unlock_server(server); + if (put_user(state, (int __user *)argp)) + return -EFAULT; + return 0; + } + + case NCP_IOC_SET_SIGN_WANTED: + { + int newstate; + + /* get only low 8 bits... */ + if (get_user(newstate, (unsigned char __user *)argp)) + return -EFAULT; + result = 0; + ncp_lock_server(server); + if (server->sign_active) { + /* cannot turn signatures OFF when active */ + if (!newstate) + result = -EINVAL; + } else { + server->sign_wanted = newstate != 0; + } + ncp_unlock_server(server); + return result; + } + +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING + case NCP_IOC_LOCKUNLOCK: + { + struct ncp_lock_ioctl rqdata; + + if (copy_from_user(&rqdata, argp, sizeof(rqdata))) + return -EFAULT; + if (rqdata.origin != 0) + return -EINVAL; + /* check for cmd */ + switch (rqdata.cmd) { + case NCP_LOCK_EX: + case NCP_LOCK_SH: + if (rqdata.timeout == 0) + rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; + else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT) + rqdata.timeout = NCP_LOCK_MAX_TIMEOUT; + break; + case NCP_LOCK_LOG: + rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; /* has no effect */ + case NCP_LOCK_CLEAR: + break; + default: + return -EINVAL; + } + /* locking needs both read and write access */ + if ((result = ncp_make_open(inode, O_RDWR)) != 0) + { + return result; + } + result = -EISDIR; + if (!S_ISREG(inode->i_mode)) + goto outrel; + if (rqdata.cmd == NCP_LOCK_CLEAR) + { + result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + rqdata.offset, + rqdata.length); + if (result > 0) result = 0; /* no such lock */ + } + else + { + int lockcmd; + + switch (rqdata.cmd) + { + case NCP_LOCK_EX: lockcmd=1; break; + case NCP_LOCK_SH: lockcmd=3; break; + default: lockcmd=0; break; + } + result = ncp_LogPhysicalRecord(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + lockcmd, + rqdata.offset, + rqdata.length, + rqdata.timeout); + if (result > 0) result = -EAGAIN; + } +outrel: + ncp_inode_close(inode); + return result; + } +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +#ifdef CONFIG_COMPAT + case NCP_IOC_GETOBJECTNAME_32: + { + struct compat_ncp_objectname_ioctl user; + size_t outl; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + down_read(&server->auth_rwsem); + user.auth_type = server->auth.auth_type; + outl = user.object_name_len; + user.object_name_len = server->auth.object_name_len; + if (outl > user.object_name_len) + outl = user.object_name_len; + result = 0; + if (outl) { + if (copy_to_user(compat_ptr(user.object_name), + server->auth.object_name, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (!result && copy_to_user(argp, &user, sizeof(user))) + result = -EFAULT; + return result; + } 
+#endif + + case NCP_IOC_GETOBJECTNAME: + { + struct ncp_objectname_ioctl user; + size_t outl; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + down_read(&server->auth_rwsem); + user.auth_type = server->auth.auth_type; + outl = user.object_name_len; + user.object_name_len = server->auth.object_name_len; + if (outl > user.object_name_len) + outl = user.object_name_len; + result = 0; + if (outl) { + if (copy_to_user(user.object_name, + server->auth.object_name, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (!result && copy_to_user(argp, &user, sizeof(user))) + result = -EFAULT; + return result; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_SETOBJECTNAME_32: +#endif + case NCP_IOC_SETOBJECTNAME: + { + struct ncp_objectname_ioctl user; + void* newname; + void* oldname; + size_t oldnamelen; + void* oldprivate; + size_t oldprivatelen; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_SETOBJECTNAME_32) { + struct compat_ncp_objectname_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.auth_type = user32.auth_type; + user.object_name_len = user32.object_name_len; + user.object_name = compat_ptr(user32.object_name); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) + return -ENOMEM; + if (user.object_name_len) { + newname = memdup_user(user.object_name, + user.object_name_len); + if (IS_ERR(newname)) + return PTR_ERR(newname); + } else { + newname = NULL; + } + down_write(&server->auth_rwsem); + oldname = server->auth.object_name; + oldnamelen = server->auth.object_name_len; + oldprivate = server->priv.data; + oldprivatelen = server->priv.len; + server->auth.auth_type = user.auth_type; + server->auth.object_name_len = user.object_name_len; + server->auth.object_name = newname; + server->priv.len = 0; + server->priv.data = NULL; + up_write(&server->auth_rwsem); + kfree(oldprivate); + kfree(oldname); + return 0; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_GETPRIVATEDATA_32: +#endif + case NCP_IOC_GETPRIVATEDATA: + { + struct ncp_privatedata_ioctl user; + size_t outl; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_GETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.len = user32.len; + user.data = compat_ptr(user32.data); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + down_read(&server->auth_rwsem); + outl = user.len; + user.len = server->priv.len; + if (outl > user.len) outl = user.len; + result = 0; + if (outl) { + if (copy_to_user(user.data, + server->priv.data, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (result) + return result; +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_GETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + user32.len = user.len; + user32.data = (unsigned long) user.data; + if (copy_to_user(argp, &user32, sizeof(user32))) + return -EFAULT; + } else +#endif + if (copy_to_user(argp, &user, sizeof(user))) + return -EFAULT; + + return 0; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_SETPRIVATEDATA_32: +#endif + case NCP_IOC_SETPRIVATEDATA: + { + struct ncp_privatedata_ioctl user; + void* new; + void* old; + size_t oldlen; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_SETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.len = user32.len; + user.data = 
compat_ptr(user32.data); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + if (user.len > NCP_PRIVATE_DATA_MAX_LEN) + return -ENOMEM; + if (user.len) { + new = memdup_user(user.data, user.len); + if (IS_ERR(new)) + return PTR_ERR(new); + } else { + new = NULL; + } + down_write(&server->auth_rwsem); + old = server->priv.data; + oldlen = server->priv.len; + server->priv.len = user.len; + server->priv.data = new; + up_write(&server->auth_rwsem); + kfree(old); + return 0; + } + +#ifdef CONFIG_NCPFS_NLS + case NCP_IOC_SETCHARSETS: + return ncp_set_charsets(server, argp); + + case NCP_IOC_GETCHARSETS: + return ncp_get_charsets(server, argp); + +#endif /* CONFIG_NCPFS_NLS */ + + case NCP_IOC_SETDENTRYTTL: + { + u_int32_t user; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + /* 20 secs at most... */ + if (user > 20000) + return -EINVAL; + user = (user * HZ) / 1000; + atomic_set(&server->dentry_ttl, user); + return 0; + } + + case NCP_IOC_GETDENTRYTTL: + { + u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ; + if (copy_to_user(argp, &user, sizeof(user))) + return -EFAULT; + return 0; + } + + } + return -EINVAL; +} + +long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ncp_server *server = NCP_SERVER(inode); + uid_t uid = current_uid(); + int need_drop_write = 0; + long ret; + + switch (cmd) { + case NCP_IOC_SETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_SETROOT: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EACCES; + goto out; + } + break; + } + if (server->m.mounted_uid != uid) { + switch (cmd) { + /* + * Only mount owner can issue these ioctls. Information + * necessary to authenticate to other NDS servers are + * stored here. + */ + case NCP_IOC_GETOBJECTNAME: + case NCP_IOC_SETOBJECTNAME: + case NCP_IOC_GETPRIVATEDATA: + case NCP_IOC_SETPRIVATEDATA: +#ifdef CONFIG_COMPAT + case NCP_IOC_GETOBJECTNAME_32: + case NCP_IOC_SETOBJECTNAME_32: + case NCP_IOC_GETPRIVATEDATA_32: + case NCP_IOC_SETPRIVATEDATA_32: +#endif + ret = -EACCES; + goto out; + /* + * These require write access on the inode if user id + * does not match. Note that they do not write to the + * file... But old code did mnt_want_write, so I keep + * it as is. Of course not for mountpoint owner, as + * that breaks read-only mounts altogether as ncpmount + * needs working NCP_IOC_NCPREQUEST and + * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl, + * signinit, setsignwanted) should be probably restricted + * to owner only, or even more to CAP_SYS_ADMIN). + */ + case NCP_IOC_GET_FS_INFO: + case NCP_IOC_GET_FS_INFO_V2: + case NCP_IOC_NCPREQUEST: + case NCP_IOC_SETDENTRYTTL: + case NCP_IOC_SIGN_INIT: + case NCP_IOC_LOCKUNLOCK: + case NCP_IOC_SET_SIGN_WANTED: +#ifdef CONFIG_COMPAT + case NCP_IOC_GET_FS_INFO_V2_32: + case NCP_IOC_NCPREQUEST_32: +#endif + ret = mnt_want_write_file(filp); + if (ret) + goto out; + need_drop_write = 1; + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto outDropWrite; + break; + /* + * Read access required. + */ + case NCP_IOC_GETMOUNTUID16: + case NCP_IOC_GETMOUNTUID32: + case NCP_IOC_GETMOUNTUID64: + case NCP_IOC_GETROOT: + case NCP_IOC_SIGN_WANTED: + ret = inode_permission(inode, MAY_READ); + if (ret) + goto out; + break; + /* + * Anybody can read these. + */ + case NCP_IOC_GETCHARSETS: + case NCP_IOC_GETDENTRYTTL: + default: + /* Three codes below are protected by CAP_SYS_ADMIN above. 
*/ + case NCP_IOC_SETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_SETROOT: + break; + } + } + ret = __ncp_ioctl(inode, cmd, arg); +outDropWrite: + if (need_drop_write) + mnt_drop_write(filp->f_path.mnt); +out: + return ret; +} + +#ifdef CONFIG_COMPAT +long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret; + + arg = (unsigned long) compat_ptr(arg); + ret = ncp_ioctl(file, cmd, arg); + return ret; +} +#endif diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c new file mode 100644 index 00000000..e5d71b27 --- /dev/null +++ b/fs/ncpfs/mmap.c @@ -0,0 +1,128 @@ +/* + * mmap.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ncp_fs.h" + +/* + * Fill in the supplied page for mmap + * XXX: how are we excluding truncate/invalidate here? Maybe need to lock + * page? + */ +static int ncp_file_mmap_fault(struct vm_area_struct *area, + struct vm_fault *vmf) +{ + struct file *file = area->vm_file; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + char *pg_addr; + unsigned int already_read; + unsigned int count; + int bufsize; + int pos; /* XXX: loff_t ? */ + + /* + * ncpfs has nothing against high pages as long + * as recvmsg and memset works on it + */ + vmf->page = alloc_page(GFP_HIGHUSER); + if (!vmf->page) + return VM_FAULT_OOM; + pg_addr = kmap(vmf->page); + pos = vmf->pgoff << PAGE_SHIFT; + + count = PAGE_SIZE; + /* what we can read in one go */ + bufsize = NCP_SERVER(inode)->buffer_size; + + already_read = 0; + if (ncp_make_open(inode, O_RDONLY) >= 0) { + while (already_read < count) { + int read_this_time; + int to_read; + + to_read = bufsize - (pos % bufsize); + + to_read = min_t(unsigned int, to_read, count - already_read); + + if (ncp_read_kernel(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_read, + pg_addr + already_read, + &read_this_time) != 0) { + read_this_time = 0; + } + pos += read_this_time; + already_read += read_this_time; + + if (read_this_time < to_read) { + break; + } + } + ncp_inode_close(inode); + + } + + if (already_read < PAGE_SIZE) + memset(pg_addr + already_read, 0, PAGE_SIZE - already_read); + flush_dcache_page(vmf->page); + kunmap(vmf->page); + + /* + * If I understand ncp_read_kernel() properly, the above always + * fetches from the network, here the analogue of disk. + * -- wli + */ + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); + return VM_FAULT_MAJOR; +} + +static const struct vm_operations_struct ncp_file_mmap = +{ + .fault = ncp_file_mmap_fault, +}; + + +/* This is used for a general mmap of a ncp file */ +int ncp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + DPRINTK("ncp_mmap: called\n"); + + if (!ncp_conn_valid(NCP_SERVER(inode))) + return -EIO; + + /* only PAGE_COW or read-only supported now */ + if (vma->vm_flags & VM_SHARED) + return -EINVAL; + /* we do not support files bigger than 4GB... We eventually + supports just 4GB... 
*/ + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + > (1U << (32 - PAGE_SHIFT))) + return -EFBIG; + + vma->vm_ops = &ncp_file_mmap; + file_accessed(file); + return 0; +} diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h new file mode 100644 index 00000000..31831afe --- /dev/null +++ b/fs/ncpfs/ncp_fs.h @@ -0,0 +1,98 @@ +#include +#include "ncp_fs_i.h" +#include "ncp_fs_sb.h" + +/* define because it is easy to change PRINTK to {*}PRINTK */ +#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) + +#undef NCPFS_PARANOIA +#ifdef NCPFS_PARANOIA +#define PPRINTK(format, args...) PRINTK(format , ## args) +#else +#define PPRINTK(format, args...) +#endif + +#ifndef DEBUG_NCP +#define DEBUG_NCP 0 +#endif +#if DEBUG_NCP > 0 +#define DPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DPRINTK(format, args...) +#endif +#if DEBUG_NCP > 1 +#define DDPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DDPRINTK(format, args...) +#endif + +#define NCP_MAX_RPC_TIMEOUT (6*HZ) + + +struct ncp_entry_info { + struct nw_info_struct i; + ino_t ino; + int opened; + int access; + unsigned int volume; + __u8 file_handle[6]; +}; + +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) +{ + return container_of(inode, struct ncp_inode_info, vfs_inode); +} + +/* linux/fs/ncpfs/inode.c */ +int ncp_notify_change(struct dentry *, struct iattr *); +struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); +void ncp_update_inode(struct inode *, struct ncp_entry_info *); +void ncp_update_inode2(struct inode *, struct ncp_entry_info *); + +/* linux/fs/ncpfs/dir.c */ +extern const struct inode_operations ncp_dir_inode_operations; +extern const struct file_operations ncp_dir_operations; +extern const struct dentry_operations ncp_dentry_operations; +int ncp_conn_logged_in(struct super_block *); +int ncp_date_dos2unix(__le16 time, __le16 date); +void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); + +/* linux/fs/ncpfs/ioctl.c */ +long ncp_ioctl(struct file *, unsigned int, unsigned long); +long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* linux/fs/ncpfs/sock.c */ +int ncp_request2(struct ncp_server *server, int function, + void* reply, int max_reply_size); +static inline int ncp_request(struct ncp_server *server, int function) { + return ncp_request2(server, function, server->packet, server->packet_size); +} +int ncp_connect(struct ncp_server *server); +int ncp_disconnect(struct ncp_server *server); +void ncp_lock_server(struct ncp_server *server); +void ncp_unlock_server(struct ncp_server *server); + +/* linux/fs/ncpfs/symlink.c */ +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern const struct address_space_operations ncp_symlink_aops; +int ncp_symlink(struct inode*, struct dentry*, const char*); +#endif + +/* linux/fs/ncpfs/file.c */ +extern const struct inode_operations ncp_file_inode_operations; +extern const struct file_operations ncp_file_operations; +int ncp_make_open(struct inode *, int); + +/* linux/fs/ncpfs/mmap.c */ +int ncp_mmap(struct file *, struct vm_area_struct *); + +/* linux/fs/ncpfs/ncplib_kernel.c */ +int ncp_make_closed(struct inode *); + +#include "ncplib_kernel.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h new file mode 100644 index 00000000..4b0bec47 --- /dev/null +++ b/fs/ncpfs/ncp_fs_i.h 
@@ -0,0 +1,29 @@ +/* + * ncp_fs_i.h + * + * Copyright (C) 1995 Volker Lendecke + * + */ + +#ifndef _LINUX_NCP_FS_I +#define _LINUX_NCP_FS_I + +/* + * This is the ncpfs part of the inode structure. This must contain + * all the information we need to work with an inode after creation. + */ +struct ncp_inode_info { + __le32 dirEntNum; + __le32 DosDirNum; + __u8 volNumber; + __le32 nwattr; + struct mutex open_mutex; + atomic_t opened; + int access; + int flags; +#define NCPI_KLUDGE_SYMLINK 0x0001 + __u8 file_handle[6]; + struct inode vfs_inode; +}; + +#endif /* _LINUX_NCP_FS_I */ diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h new file mode 100644 index 00000000..4af803f1 --- /dev/null +++ b/fs/ncpfs/ncp_fs_sb.h @@ -0,0 +1,176 @@ +/* + * ncp_fs_sb.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * + */ + +#ifndef _NCP_FS_SB +#define _NCP_FS_SB + +#include +#include +#include +#include +#include +#include + +#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ + +struct sock; + +struct ncp_mount_data_kernel { + unsigned long flags; /* NCP_MOUNT_* flags */ + unsigned int int_flags; /* internal flags */ +#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 + __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ + struct pid *wdog_pid; /* Who cares for our watchdog packets? */ + unsigned int ncp_fd; /* The socket to the ncp port */ + unsigned int time_out; /* How long should I wait after + sending a NCP request? */ + unsigned int retry_count; /* And how often should I retry? */ + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_mode_t file_mode; + __kernel_mode_t dir_mode; + int info_fd; +}; + +struct ncp_server { + + struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of + interest for us later, so we store + it completely. */ + + __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; + + struct file *ncp_filp; /* File pointer to ncp socket */ + struct socket *ncp_sock;/* ncp socket */ + struct file *info_filp; + struct socket *info_sock; + + u8 sequence; + u8 task; + u16 connection; /* Remote connection number */ + + u8 completion; /* Status message from server */ + u8 conn_status; /* Bit 4 = 1 ==> Server going down, no + requests allowed anymore. + Bit 0 = 1 ==> Server is down. */ + + int buffer_size; /* Negotiated bufsize */ + + int reply_size; /* Size of last reply */ + + int packet_size; + unsigned char *packet; /* Here we prepare requests and + receive replies */ + unsigned char *txbuf; /* Storage for current request */ + unsigned char *rxbuf; /* Storage for reply to current request */ + + int lock; /* To prevent mismatch in protocols. */ + struct mutex mutex; + + int current_size; /* for packet preparation */ + int has_subfunction; + int ncp_reply_size; + + int root_setuped; + struct mutex root_setup_lock; + + /* info for packet signing */ + int sign_wanted; /* 1=Server needs signed packets */ + int sign_active; /* 0=don't do signing, 1=do */ + char sign_root[8]; /* generated from password and encr. 
key */ + char sign_last[16]; + + /* Authentication info: NDS or BINDERY, username */ + struct { + int auth_type; + size_t object_name_len; + void* object_name; + int object_type; + } auth; + /* Password info */ + struct { + size_t len; + void* data; + } priv; + struct rw_semaphore auth_rwsem; + + /* nls info: codepage for volume and charset for I/O */ + struct nls_table *nls_vol; + struct nls_table *nls_io; + + /* maximum age in jiffies */ + atomic_t dentry_ttl; + + /* miscellaneous */ + unsigned int flags; + + spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ + + void (*data_ready)(struct sock* sk, int len); + void (*error_report)(struct sock* sk); + void (*write_space)(struct sock* sk); /* STREAM mode only */ + struct { + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ + struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ + struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ + + unsigned int state; /* STREAM only: receiver state */ + struct { + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; + } buf; /* STREAM only: temporary buffer */ + unsigned char* ptr; /* STREAM only: pointer to data */ + size_t len; /* STREAM only: length of data to receive */ + } rcv; + struct { + struct list_head requests; /* STREAM only: queued requests */ + struct work_struct tq; /* STREAM only: transmitter ready */ + struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ + } tx; + struct timer_list timeout_tm; /* DGRAM only: timeout timer */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + int timeout_last; /* DGRAM only: current timeout length */ + int timeout_retries; /* DGRAM only: retries left */ + struct { + size_t len; + __u8 data[128]; + } unexpected_packet; + struct backing_dev_info bdi; +}; + +extern void ncp_tcp_rcv_proc(struct work_struct *work); +extern void ncp_tcp_tx_proc(struct work_struct *work); +extern void ncpdgram_rcv_proc(struct work_struct *work); +extern void ncpdgram_timeout_proc(struct work_struct *work); +extern void ncpdgram_timeout_call(unsigned long server); +extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_write_space(struct sock* sk); +extern void ncp_tcp_error_report(struct sock* sk); + +#define NCP_FLAG_UTF8 1 + +#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) +#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) +#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) + +static inline int ncp_conn_valid(struct ncp_server *server) +{ + return ((server->conn_status & 0x11) == 0); +} + +static inline void ncp_invalidate_conn(struct ncp_server *server) +{ + server->conn_status |= 0x01; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c new file mode 100644 index 00000000..981a9561 --- /dev/null +++ b/fs/ncpfs/ncplib_kernel.c @@ -0,0 +1,1323 @@ +/* + * ncplib_kernel.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. 
Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1999 Wolfram Pienkoss for NLS + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + + +#include "ncp_fs.h" + +static inline void assert_server_locked(struct ncp_server *server) +{ + if (server->lock == 0) { + DPRINTK("ncpfs: server not locked!\n"); + } +} + +static void ncp_add_byte(struct ncp_server *server, __u8 x) +{ + assert_server_locked(server); + *(__u8 *) (&(server->packet[server->current_size])) = x; + server->current_size += 1; + return; +} + +static void ncp_add_word(struct ncp_server *server, __le16 x) +{ + assert_server_locked(server); + put_unaligned(x, (__le16 *) (&(server->packet[server->current_size]))); + server->current_size += 2; + return; +} + +static void ncp_add_be16(struct ncp_server *server, __u16 x) +{ + assert_server_locked(server); + put_unaligned(cpu_to_be16(x), (__be16 *) (&(server->packet[server->current_size]))); + server->current_size += 2; +} + +static void ncp_add_dword(struct ncp_server *server, __le32 x) +{ + assert_server_locked(server); + put_unaligned(x, (__le32 *) (&(server->packet[server->current_size]))); + server->current_size += 4; + return; +} + +static void ncp_add_be32(struct ncp_server *server, __u32 x) +{ + assert_server_locked(server); + put_unaligned(cpu_to_be32(x), (__be32 *)(&(server->packet[server->current_size]))); + server->current_size += 4; +} + +static inline void ncp_add_dword_lh(struct ncp_server *server, __u32 x) { + ncp_add_dword(server, cpu_to_le32(x)); +} + +static void ncp_add_mem(struct ncp_server *server, const void *source, int size) +{ + assert_server_locked(server); + memcpy(&(server->packet[server->current_size]), source, size); + server->current_size += size; + return; +} + +static void ncp_add_pstring(struct ncp_server *server, const char *s) +{ + int len = strlen(s); + assert_server_locked(server); + if (len > 255) { + DPRINTK("ncpfs: string too long: %s\n", s); + len = 255; + } + ncp_add_byte(server, len); + ncp_add_mem(server, s, len); + return; +} + +static inline void ncp_init_request(struct ncp_server *server) +{ + ncp_lock_server(server); + + server->current_size = sizeof(struct ncp_request_header); + server->has_subfunction = 0; +} + +static inline void ncp_init_request_s(struct ncp_server *server, int subfunction) +{ + ncp_lock_server(server); + + server->current_size = sizeof(struct ncp_request_header) + 2; + ncp_add_byte(server, subfunction); + + server->has_subfunction = 1; +} + +static inline char * +ncp_reply_data(struct ncp_server *server, int offset) +{ + return &(server->packet[sizeof(struct ncp_reply_header) + offset]); +} + +static inline u8 BVAL(const void *data) +{ + return *(const u8 *)data; +} + +static u8 ncp_reply_byte(struct ncp_server *server, int offset) +{ + return *(const u8 *)ncp_reply_data(server, offset); +} + +static inline u16 WVAL_LH(const void *data) +{ + return get_unaligned_le16(data); +} + +static u16 +ncp_reply_le16(struct ncp_server *server, int offset) +{ + return get_unaligned_le16(ncp_reply_data(server, offset)); +} + +static u16 +ncp_reply_be16(struct ncp_server *server, int offset) +{ + return get_unaligned_be16(ncp_reply_data(server, offset)); +} + +static inline u32 DVAL_LH(const void *data) +{ + return get_unaligned_le32(data); +} + +static __le32 +ncp_reply_dword(struct ncp_server *server, int offset) +{ + return get_unaligned((__le32 *)ncp_reply_data(server, offset)); +} + +static inline __u32 ncp_reply_dword_lh(struct ncp_server* 
server, int offset) { + return le32_to_cpu(ncp_reply_dword(server, offset)); +} + +int +ncp_negotiate_buffersize(struct ncp_server *server, int size, int *target) +{ + int result; + + ncp_init_request(server); + ncp_add_be16(server, size); + + if ((result = ncp_request(server, 33)) != 0) { + ncp_unlock_server(server); + return result; + } + *target = min_t(unsigned int, ncp_reply_be16(server, 0), size); + + ncp_unlock_server(server); + return 0; +} + + +/* options: + * bit 0 ipx checksum + * bit 1 packet signing + */ +int +ncp_negotiate_size_and_options(struct ncp_server *server, + int size, int options, int *ret_size, int *ret_options) { + int result; + + /* there is minimum */ + if (size < NCP_BLOCK_SIZE) size = NCP_BLOCK_SIZE; + + ncp_init_request(server); + ncp_add_be16(server, size); + ncp_add_byte(server, options); + + if ((result = ncp_request(server, 0x61)) != 0) + { + ncp_unlock_server(server); + return result; + } + + /* NCP over UDP returns 0 (!!!) */ + result = ncp_reply_be16(server, 0); + if (result >= NCP_BLOCK_SIZE) + size = min(result, size); + *ret_size = size; + *ret_options = ncp_reply_byte(server, 4); + + ncp_unlock_server(server); + return 0; +} + +int ncp_get_volume_info_with_number(struct ncp_server* server, + int n, struct ncp_volume_info* target) { + int result; + int len; + + ncp_init_request_s(server, 44); + ncp_add_byte(server, n); + + if ((result = ncp_request(server, 22)) != 0) { + goto out; + } + target->total_blocks = ncp_reply_dword_lh(server, 0); + target->free_blocks = ncp_reply_dword_lh(server, 4); + target->purgeable_blocks = ncp_reply_dword_lh(server, 8); + target->not_yet_purgeable_blocks = ncp_reply_dword_lh(server, 12); + target->total_dir_entries = ncp_reply_dword_lh(server, 16); + target->available_dir_entries = ncp_reply_dword_lh(server, 20); + target->sectors_per_block = ncp_reply_byte(server, 28); + + memset(&(target->volume_name), 0, sizeof(target->volume_name)); + + result = -EIO; + len = ncp_reply_byte(server, 29); + if (len > NCP_VOLNAME_LEN) { + DPRINTK("ncpfs: volume name too long: %d\n", len); + goto out; + } + memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); + result = 0; +out: + ncp_unlock_server(server); + return result; +} + +int ncp_get_directory_info(struct ncp_server* server, __u8 n, + struct ncp_volume_info* target) { + int result; + int len; + + ncp_init_request_s(server, 45); + ncp_add_byte(server, n); + + if ((result = ncp_request(server, 22)) != 0) { + goto out; + } + target->total_blocks = ncp_reply_dword_lh(server, 0); + target->free_blocks = ncp_reply_dword_lh(server, 4); + target->purgeable_blocks = 0; + target->not_yet_purgeable_blocks = 0; + target->total_dir_entries = ncp_reply_dword_lh(server, 8); + target->available_dir_entries = ncp_reply_dword_lh(server, 12); + target->sectors_per_block = ncp_reply_byte(server, 20); + + memset(&(target->volume_name), 0, sizeof(target->volume_name)); + + result = -EIO; + len = ncp_reply_byte(server, 21); + if (len > NCP_VOLNAME_LEN) { + DPRINTK("ncpfs: volume name too long: %d\n", len); + goto out; + } + memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); + result = 0; +out: + ncp_unlock_server(server); + return result; +} + +int +ncp_close_file(struct ncp_server *server, const char *file_id) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + + result = ncp_request(server, 66); + ncp_unlock_server(server); + return result; +} + +int +ncp_make_closed(struct inode *inode) +{ + int err; + + err = 
0; + mutex_lock(&NCP_FINFO(inode)->open_mutex); + if (atomic_read(&NCP_FINFO(inode)->opened) == 1) { + atomic_set(&NCP_FINFO(inode)->opened, 0); + err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); + + if (!err) + PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n", + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, err); + } + mutex_unlock(&NCP_FINFO(inode)->open_mutex); + return err; +} + +static void ncp_add_handle_path(struct ncp_server *server, __u8 vol_num, + __le32 dir_base, int have_dir_base, + const char *path) +{ + ncp_add_byte(server, vol_num); + ncp_add_dword(server, dir_base); + if (have_dir_base != 0) { + ncp_add_byte(server, 1); /* dir_base */ + } else { + ncp_add_byte(server, 0xff); /* no handle */ + } + if (path != NULL) { + ncp_add_byte(server, 1); /* 1 component */ + ncp_add_pstring(server, path); + } else { + ncp_add_byte(server, 0); + } +} + +int ncp_dirhandle_alloc(struct ncp_server* server, __u8 volnum, __le32 dirent, + __u8* dirhandle) { + int result; + + ncp_init_request(server); + ncp_add_byte(server, 12); /* subfunction */ + ncp_add_byte(server, NW_NS_DOS); + ncp_add_byte(server, 0); + ncp_add_word(server, 0); + ncp_add_handle_path(server, volnum, dirent, 1, NULL); + if ((result = ncp_request(server, 87)) == 0) { + *dirhandle = ncp_reply_byte(server, 0); + } + ncp_unlock_server(server); + return result; +} + +int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) { + int result; + + ncp_init_request_s(server, 20); + ncp_add_byte(server, dirhandle); + result = ncp_request(server, 22); + ncp_unlock_server(server); + return result; +} + +void ncp_extract_file_info(const void *structure, struct nw_info_struct *target) +{ + const __u8 *name_len; + const int info_struct_size = offsetof(struct nw_info_struct, nameLen); + + memcpy(target, structure, info_struct_size); + name_len = structure + info_struct_size; + target->nameLen = *name_len; + memcpy(target->entryName, name_len + 1, *name_len); + target->entryName[*name_len] = '\0'; + target->volNumber = le32_to_cpu(target->volNumber); + return; +} + +#ifdef CONFIG_NCPFS_NFS_NS +static inline void ncp_extract_nfs_info(const unsigned char *structure, + struct nw_nfs_info *target) +{ + target->mode = DVAL_LH(structure); + target->rdev = DVAL_LH(structure + 8); +} +#endif + +int ncp_obtain_nfs_info(struct ncp_server *server, + struct nw_info_struct *target) + +{ + int result = 0; +#ifdef CONFIG_NCPFS_NFS_NS + __u32 volnum = target->volNumber; + + if (ncp_is_nfs_extras(server, volnum)) { + ncp_init_request(server); + ncp_add_byte(server, 19); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, NW_NS_NFS); + ncp_add_byte(server, 0); + ncp_add_byte(server, volnum); + ncp_add_dword(server, target->dirEntNum); + /* We must retrieve both nlinks and rdev, otherwise some server versions + report zeroes instead of valid data */ + ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); + + if ((result = ncp_request(server, 87)) == 0) { + ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); + DPRINTK(KERN_DEBUG + "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n", + target->entryName, target->nfs.mode, + target->nfs.rdev); + } else { + target->nfs.mode = 0; + target->nfs.rdev = 0; + } + ncp_unlock_server(server); + + } else +#endif + { + target->nfs.mode = 0; + target->nfs.rdev = 0; + } + return result; +} + +/* + * Returns information for a (one-component) name relative to + * the specified directory. 
+ */ +int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path, + struct nw_info_struct *target) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + if (target == NULL) { + printk(KERN_ERR "ncp_obtain_info: invalid call\n"); + return -EINVAL; + } + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, server->name_space[volnum]); /* N.B. twice ?? */ + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_ALL); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + if ((result = ncp_request(server, 87)) != 0) + goto out; + ncp_extract_file_info(ncp_reply_data(server, 0), target); + ncp_unlock_server(server); + + result = ncp_obtain_nfs_info(server, target); + return result; + +out: + ncp_unlock_server(server); + return result; +} + +#ifdef CONFIG_NCPFS_NFS_NS +static int +ncp_obtain_DOS_dir_base(struct ncp_server *server, + __u8 ns, __u8 volnum, __le32 dirent, + const char *path, /* At most 1 component */ + __le32 *DOS_dir_base) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, ns); + ncp_add_byte(server, ns); + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_DIRECTORY); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + if ((result = ncp_request(server, 87)) == 0) + { + if (DOS_dir_base) *DOS_dir_base=ncp_reply_dword(server, 0x34); + } + ncp_unlock_server(server); + return result; +} +#endif /* CONFIG_NCPFS_NFS_NS */ + +static inline int +ncp_get_known_namespace(struct ncp_server *server, __u8 volume) +{ +#if defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) + int result; + __u8 *namespace; + __u16 no_namespaces; + + ncp_init_request(server); + ncp_add_byte(server, 24); /* Subfunction: Get Name Spaces Loaded */ + ncp_add_word(server, 0); + ncp_add_byte(server, volume); + + if ((result = ncp_request(server, 87)) != 0) { + ncp_unlock_server(server); + return NW_NS_DOS; /* not result ?? 
*/ + } + + result = NW_NS_DOS; + no_namespaces = ncp_reply_le16(server, 0); + namespace = ncp_reply_data(server, 2); + + while (no_namespaces > 0) { + DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume); + +#ifdef CONFIG_NCPFS_NFS_NS + if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) + { + result = NW_NS_NFS; + break; + } +#endif /* CONFIG_NCPFS_NFS_NS */ +#ifdef CONFIG_NCPFS_OS2_NS + if ((*namespace == NW_NS_OS2) && !(server->m.flags&NCP_MOUNT_NO_OS2)) + { + result = NW_NS_OS2; + } +#endif /* CONFIG_NCPFS_OS2_NS */ + namespace += 1; + no_namespaces -= 1; + } + ncp_unlock_server(server); + return result; +#else /* neither OS2 nor NFS - only DOS */ + return NW_NS_DOS; +#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ +} + +int +ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) +{ + int ns = ncp_get_known_namespace(server, volume); + + if (ret_ns) + *ret_ns = ns; + + DPRINTK("lookup_vol: namespace[%d] = %d\n", + volume, server->name_space[volume]); + + if (server->name_space[volume] == ns) + return 0; + server->name_space[volume] = ns; + return 1; +} + +static int +ncp_ObtainSpecificDirBase(struct ncp_server *server, + __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, + const char *path, /* At most 1 component */ + __le32 *dirEntNum, __le32 *DosDirNum) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, nsSrc); + ncp_add_byte(server, nsDst); + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_ALL); + ncp_add_handle_path(server, vol_num, dir_base, 1, path); + + if ((result = ncp_request(server, 87)) != 0) + { + ncp_unlock_server(server); + return result; + } + + if (dirEntNum) + *dirEntNum = ncp_reply_dword(server, 0x30); + if (DosDirNum) + *DosDirNum = ncp_reply_dword(server, 0x34); + ncp_unlock_server(server); + return 0; +} + +int +ncp_mount_subdir(struct ncp_server *server, + __u8 volNumber, __u8 srcNS, __le32 dirEntNum, + __u32* volume, __le32* newDirEnt, __le32* newDosEnt) +{ + int dstNS; + int result; + + ncp_update_known_namespace(server, volNumber, &dstNS); + if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, + dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) + { + return result; + } + *volume = volNumber; + server->m.mounted_vol[1] = 0; + server->m.mounted_vol[0] = 'X'; + return 0; +} + +int +ncp_get_volume_root(struct ncp_server *server, + const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent) +{ + int result; + + DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); + + ncp_init_request(server); + ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ + ncp_add_byte(server, 0); /* DOS namespace */ + ncp_add_byte(server, 0); /* reserved */ + ncp_add_byte(server, 0); /* reserved */ + ncp_add_byte(server, 0); /* reserved */ + + ncp_add_byte(server, 0); /* faked volume number */ + ncp_add_dword(server, 0); /* faked dir_base */ + ncp_add_byte(server, 0xff); /* Don't have a dir_base */ + ncp_add_byte(server, 1); /* 1 path component */ + ncp_add_pstring(server, volname); + + if ((result = ncp_request(server, 87)) != 0) { + ncp_unlock_server(server); + return result; + } + *dirent = *dosdirent = ncp_reply_dword(server, 4); + *volume = ncp_reply_byte(server, 8); + ncp_unlock_server(server); + return 0; +} + +int +ncp_lookup_volume(struct ncp_server *server, + const char *volname, struct nw_info_struct *target) +{ + int result; + + memset(target, 0, 
sizeof(*target)); + result = ncp_get_volume_root(server, volname, + &target->volNumber, &target->dirEntNum, &target->DosDirNum); + if (result) { + return result; + } + ncp_update_known_namespace(server, target->volNumber, NULL); + target->nameLen = strlen(volname); + memcpy(target->entryName, volname, target->nameLen+1); + target->attributes = aDIR; + /* set dates to Jan 1, 1986 00:00 */ + target->creationTime = target->modifyTime = cpu_to_le16(0x0000); + target->creationDate = target->modifyDate = target->lastAccessDate = cpu_to_le16(0x0C21); + target->nfs.mode = 0; + return 0; +} + +int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *server, + struct inode *dir, + const char *path, + __le32 info_mask, + const struct nw_modify_dos_info *info) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 7); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_word(server, cpu_to_le16(0x8006)); /* search attribs: all */ + + ncp_add_dword(server, info_mask); + ncp_add_mem(server, info, sizeof(*info)); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + result = ncp_request(server, 87); + ncp_unlock_server(server); + return result; +} + +int ncp_modify_file_or_subdir_dos_info(struct ncp_server *server, + struct inode *dir, + __le32 info_mask, + const struct nw_modify_dos_info *info) +{ + return ncp_modify_file_or_subdir_dos_info_path(server, dir, NULL, + info_mask, info); +} + +#ifdef CONFIG_NCPFS_NFS_NS +int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent, + __u32 mode, __u32 rdev) + +{ + int result = 0; + + ncp_init_request(server); + if (server->name_space[volnum] == NW_NS_NFS) { + ncp_add_byte(server, 25); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, NW_NS_NFS); + ncp_add_byte(server, volnum); + ncp_add_dword(server, dirent); + /* we must always operate on both nlinks and rdev, otherwise + rdev is not set */ + ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); + ncp_add_dword_lh(server, mode); + ncp_add_dword_lh(server, 1); /* nlinks */ + ncp_add_dword_lh(server, rdev); + result = ncp_request(server, 87); + } + ncp_unlock_server(server); + return result; +} +#endif + + +static int +ncp_DeleteNSEntry(struct ncp_server *server, + __u8 have_dir_base, __u8 volnum, __le32 dirent, + const char* name, __u8 ns, __le16 attr) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 8); /* subfunction */ + ncp_add_byte(server, ns); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_word(server, attr); /* search attribs: all */ + ncp_add_handle_path(server, volnum, dirent, have_dir_base, name); + + result = ncp_request(server, 87); + ncp_unlock_server(server); + return result; +} + +int +ncp_del_file_or_subdir2(struct ncp_server *server, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + __u8 volnum; + __le32 dirent; + + if (!inode) { + return 0xFF; /* Any error */ + } + volnum = NCP_FINFO(inode)->volNumber; + dirent = NCP_FINFO(inode)->DosDirNum; + return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); +} + +int +ncp_del_file_or_subdir(struct ncp_server *server, + struct inode *dir, const char *name) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int name_space; + + name_space = 
server->name_space[volnum]; +#ifdef CONFIG_NCPFS_NFS_NS + if (name_space == NW_NS_NFS) + { + int result; + + result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent); + if (result) return result; + name = NULL; + name_space = NW_NS_DOS; + } +#endif /* CONFIG_NCPFS_NFS_NS */ + return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006)); +} + +static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) +{ + __le16 *dest = (__le16 *) ret; + dest[1] = cpu_to_le16(v0); + dest[2] = cpu_to_le16(v1); + dest[0] = cpu_to_le16(v0 + 1); + return; +} + +/* If both dir and name are NULL, then in target there's already a + looked-up entry that wants to be opened. */ +int ncp_open_create_file_or_subdir(struct ncp_server *server, + struct inode *dir, const char *name, + int open_create_mode, + __le32 create_attributes, + __le16 desired_acc_rights, + struct ncp_entry_info *target) +{ + __le16 search_attribs = cpu_to_le16(0x0006); + __u8 volnum; + __le32 dirent; + int result; + + volnum = NCP_FINFO(dir)->volNumber; + dirent = NCP_FINFO(dir)->dirEntNum; + + if ((create_attributes & aDIR) != 0) { + search_attribs |= cpu_to_le16(0x8000); + } + ncp_init_request(server); + ncp_add_byte(server, 1); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, open_create_mode); + ncp_add_word(server, search_attribs); + ncp_add_dword(server, RIM_ALL); + ncp_add_dword(server, create_attributes); + /* The desired acc rights seem to be the inherited rights mask + for directories */ + ncp_add_word(server, desired_acc_rights); + ncp_add_handle_path(server, volnum, dirent, 1, name); + + if ((result = ncp_request(server, 87)) != 0) + goto out; + if (!(create_attributes & aDIR)) + target->opened = 1; + + /* in target there's a new finfo to fill */ + ncp_extract_file_info(ncp_reply_data(server, 6), &(target->i)); + target->volume = target->i.volNumber; + ConvertToNWfromDWORD(ncp_reply_le16(server, 0), + ncp_reply_le16(server, 2), + target->file_handle); + + ncp_unlock_server(server); + + (void)ncp_obtain_nfs_info(server, &(target->i)); + return 0; + +out: + ncp_unlock_server(server); + return result; +} + +int +ncp_initialize_search(struct ncp_server *server, struct inode *dir, + struct nw_search_sequence *target) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 2); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_handle_path(server, volnum, dirent, 1, NULL); + + result = ncp_request(server, 87); + if (result) + goto out; + memcpy(target, ncp_reply_data(server, 0), sizeof(*target)); + +out: + ncp_unlock_server(server); + return result; +} + +int ncp_search_for_fileset(struct ncp_server *server, + struct nw_search_sequence *seq, + int* more, + int* cnt, + char* buffer, + size_t bufsize, + char** rbuf, + size_t* rsize) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 20); + ncp_add_byte(server, server->name_space[seq->volNumber]); + ncp_add_byte(server, 0); /* datastream */ + ncp_add_word(server, cpu_to_le16(0x8006)); + ncp_add_dword(server, RIM_ALL); + ncp_add_word(server, cpu_to_le16(32767)); /* max returned items */ + ncp_add_mem(server, seq, 9); +#ifdef CONFIG_NCPFS_NFS_NS + if (server->name_space[seq->volNumber] == NW_NS_NFS) { + ncp_add_byte(server, 0); /* 0 byte pattern */ + } else +#endif + { + ncp_add_byte(server, 2); 
/* 2 byte pattern */ + ncp_add_byte(server, 0xff); /* following is a wildcard */ + ncp_add_byte(server, '*'); + } + result = ncp_request2(server, 87, buffer, bufsize); + if (result) { + ncp_unlock_server(server); + return result; + } + if (server->ncp_reply_size < 12) { + ncp_unlock_server(server); + return 0xFF; + } + *rsize = server->ncp_reply_size - 12; + ncp_unlock_server(server); + buffer = buffer + sizeof(struct ncp_reply_header); + *rbuf = buffer + 12; + *cnt = WVAL_LH(buffer + 10); + *more = BVAL(buffer + 9); + memcpy(seq, buffer, 9); + return 0; +} + +static int +ncp_RenameNSEntry(struct ncp_server *server, + struct inode *old_dir, const char *old_name, __le16 old_type, + struct inode *new_dir, const char *new_name) +{ + int result = -EINVAL; + + if ((old_dir == NULL) || (old_name == NULL) || + (new_dir == NULL) || (new_name == NULL)) + goto out; + + ncp_init_request(server); + ncp_add_byte(server, 4); /* subfunction */ + ncp_add_byte(server, server->name_space[NCP_FINFO(old_dir)->volNumber]); + ncp_add_byte(server, 1); /* rename flag */ + ncp_add_word(server, old_type); /* search attributes */ + + /* source Handle Path */ + ncp_add_byte(server, NCP_FINFO(old_dir)->volNumber); + ncp_add_dword(server, NCP_FINFO(old_dir)->dirEntNum); + ncp_add_byte(server, 1); + ncp_add_byte(server, 1); /* 1 source component */ + + /* dest Handle Path */ + ncp_add_byte(server, NCP_FINFO(new_dir)->volNumber); + ncp_add_dword(server, NCP_FINFO(new_dir)->dirEntNum); + ncp_add_byte(server, 1); + ncp_add_byte(server, 1); /* 1 destination component */ + + /* source path string */ + ncp_add_pstring(server, old_name); + /* dest path string */ + ncp_add_pstring(server, new_name); + + result = ncp_request(server, 87); + ncp_unlock_server(server); +out: + return result; +} + +int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, + struct inode *old_dir, const char *old_name, + struct inode *new_dir, const char *new_name) +{ + int result; + __le16 old_type = cpu_to_le16(0x06); + +/* If somebody can do it atomic, call me... vandrove@vc.cvut.cz */ + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + if (result == 0xFF) /* File Not Found, try directory */ + { + old_type = cpu_to_le16(0x16); + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + } + if (result != 0x92) return result; /* All except NO_FILES_RENAMED */ + result = ncp_del_file_or_subdir(server, new_dir, new_name); + if (result != 0) return -EACCES; + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + return result; +} + + +/* We have to transfer to/from user space */ +int +ncp_read_kernel(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_read, char *target, int *bytes_read) +{ + const char *source; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_read); + + if ((result = ncp_request(server, 72)) != 0) { + goto out; + } + *bytes_read = ncp_reply_be16(server, 0); + source = ncp_reply_data(server, 2 + (offset & 1)); + + memcpy(target, source, *bytes_read); +out: + ncp_unlock_server(server); + return result; +} + +/* There is a problem... egrep and some other silly tools do: + x = mmap(NULL, MAP_PRIVATE, PROT_READ|PROT_WRITE, , 32768); + read(, x, 32768); + Now copying read result by copy_to_user causes pagefault. This pagefault + could not be handled because of server was locked due to read. 
So we have + to use temporary buffer. So ncp_unlock_server must be done before + copy_to_user (and for write, copy_from_user must be done before + ncp_init_request... same applies for send raw packet ioctl). Because of + file is normally read in bigger chunks, caller provides kmalloced + (vmalloced) chunk of memory with size >= to_read... + */ +int +ncp_read_bounce(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_read, char __user *target, int *bytes_read, + void* bounce, __u32 bufsize) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_read); + result = ncp_request2(server, 72, bounce, bufsize); + ncp_unlock_server(server); + if (!result) { + int len = get_unaligned_be16((char *)bounce + + sizeof(struct ncp_reply_header)); + result = -EIO; + if (len <= to_read) { + char* source; + + source = (char*)bounce + + sizeof(struct ncp_reply_header) + 2 + + (offset & 1); + *bytes_read = len; + result = 0; + if (copy_to_user(target, source, len)) + result = -EFAULT; + } + } + return result; +} + +int +ncp_write_kernel(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_write, + const char *source, int *bytes_written) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_write); + ncp_add_mem(server, source, to_write); + + if ((result = ncp_request(server, 73)) == 0) + *bytes_written = to_write; + ncp_unlock_server(server); + return result; +} + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING +int +ncp_LogPhysicalRecord(struct ncp_server *server, const char *file_id, + __u8 locktype, __u32 offset, __u32 length, __u16 timeout) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, locktype); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be32(server, length); + ncp_add_be16(server, timeout); + + if ((result = ncp_request(server, 0x1A)) != 0) + { + ncp_unlock_server(server); + return result; + } + ncp_unlock_server(server); + return 0; +} + +int +ncp_ClearPhysicalRecord(struct ncp_server *server, const char *file_id, + __u32 offset, __u32 length) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); /* who knows... lanalyzer says that */ + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be32(server, length); + + if ((result = ncp_request(server, 0x1E)) != 0) + { + ncp_unlock_server(server); + return result; + } + ncp_unlock_server(server); + return 0; +} +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +#ifdef CONFIG_NCPFS_NLS +/* This are the NLS conversion routines with inspirations and code parts + * from the vfat file system and hints from Petr Vandrovec. 
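+ * ncp__io2vol() converts a name from the local io character set
+ * (server->nls_io) to the volume character set (server->nls_vol), and
+ * ncp__vol2io() converts the other way; cc requests case conversion
+ * (upper-case towards the volume, lower-case towards io).  When not in
+ * UTF-8 mode, a code point that the io table cannot represent is carried
+ * in the io-side name as NCP_ESC (':') followed by four upper-case hex
+ * digits -- U+00E9 would show up as ":00E9", for example -- and
+ * ncp__io2vol() turns such an escape back into the original code point.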
+ */ + +int +ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, + const unsigned char *iname, unsigned int ilen, int cc) +{ + struct nls_table *in = server->nls_io; + struct nls_table *out = server->nls_vol; + unsigned char *vname_start; + unsigned char *vname_end; + const unsigned char *iname_end; + + iname_end = iname + ilen; + vname_start = vname; + vname_end = vname + *vlen - 1; + + while (iname < iname_end) { + int chl; + wchar_t ec; + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { + int k; + unicode_t u; + + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) + return -EINVAL; + iname += k; + ec = u; + } else { + if (*iname == NCP_ESC) { + int k; + + if (iname_end - iname < 5) + goto nospec; + + ec = 0; + for (k = 1; k < 5; k++) { + unsigned char nc; + + nc = iname[k] - '0'; + if (nc >= 10) { + nc -= 'A' - '0' - 10; + if ((nc < 10) || (nc > 15)) { + goto nospec; + } + } + ec = (ec << 4) | nc; + } + iname += 5; + } else { +nospec:; + if ( (chl = in->char2uni(iname, iname_end - iname, &ec)) < 0) + return chl; + iname += chl; + } + } + + /* unitoupper should be here! */ + + chl = out->uni2char(ec, vname, vname_end - vname); + if (chl < 0) + return chl; + + /* this is wrong... */ + if (cc) { + int chi; + + for (chi = 0; chi < chl; chi++){ + vname[chi] = ncp_toupper(out, vname[chi]); + } + } + vname += chl; + } + + *vname = 0; + *vlen = vname - vname_start; + return 0; +} + +int +ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, + const unsigned char *vname, unsigned int vlen, int cc) +{ + struct nls_table *in = server->nls_vol; + struct nls_table *out = server->nls_io; + const unsigned char *vname_end; + unsigned char *iname_start; + unsigned char *iname_end; + unsigned char *vname_cc; + int err; + + vname_cc = NULL; + + if (cc) { + int i; + + /* this is wrong! */ + vname_cc = kmalloc(vlen, GFP_KERNEL); + if (!vname_cc) + return -ENOMEM; + for (i = 0; i < vlen; i++) + vname_cc[i] = ncp_tolower(in, vname[i]); + vname = vname_cc; + } + + iname_start = iname; + iname_end = iname + *ilen - 1; + vname_end = vname + vlen; + + while (vname < vname_end) { + wchar_t ec; + int chl; + + if ( (chl = in->char2uni(vname, vname_end - vname, &ec)) < 0) { + err = chl; + goto quit; + } + vname += chl; + + /* unitolower should be here! 
*/ + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { + int k; + + k = utf32_to_utf8(ec, iname, iname_end - iname); + if (k < 0) { + err = -ENAMETOOLONG; + goto quit; + } + iname += k; + } else { + if ( (chl = out->uni2char(ec, iname, iname_end - iname)) >= 0) { + iname += chl; + } else { + int k; + + if (iname_end - iname < 5) { + err = -ENAMETOOLONG; + goto quit; + } + *iname = NCP_ESC; + for (k = 4; k > 0; k--) { + unsigned char v; + + v = (ec & 0xF) + '0'; + if (v > '9') { + v += 'A' - '9' - 1; + } + iname[k] = v; + ec >>= 4; + } + iname += 5; + } + } + } + + *iname = 0; + *ilen = iname - iname_start; + err = 0; +quit:; + if (cc) + kfree(vname_cc); + return err; +} + +#else + +int +ncp__io2vol(unsigned char *vname, unsigned int *vlen, + const unsigned char *iname, unsigned int ilen, int cc) +{ + int i; + + if (*vlen <= ilen) + return -ENAMETOOLONG; + + if (cc) + for (i = 0; i < ilen; i++) { + *vname = toupper(*iname); + vname++; + iname++; + } + else { + memmove(vname, iname, ilen); + vname += ilen; + } + + *vlen = ilen; + *vname = 0; + return 0; +} + +int +ncp__vol2io(unsigned char *iname, unsigned int *ilen, + const unsigned char *vname, unsigned int vlen, int cc) +{ + int i; + + if (*ilen <= vlen) + return -ENAMETOOLONG; + + if (cc) + for (i = 0; i < vlen; i++) { + *iname = tolower(*vname); + iname++; + vname++; + } + else { + memmove(iname, vname, vlen); + iname += vlen; + } + + *ilen = vlen; + *iname = 0; + return 0; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h new file mode 100644 index 00000000..09881e6a --- /dev/null +++ b/fs/ncpfs/ncplib_kernel.h @@ -0,0 +1,254 @@ +/* + * ncplib_kernel.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * Modified 1999 Wolfram Pienkoss for directory caching + * + */ + +#ifndef _NCPLIB_H +#define _NCPLIB_H + + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_NCPFS_NLS +#include +#else +#include +#endif /* CONFIG_NCPFS_NLS */ + +#define NCP_MIN_SYMLINK_SIZE 8 +#define NCP_MAX_SYMLINK_SIZE 512 + +#define NCP_BLOCK_SHIFT 9 +#define NCP_BLOCK_SIZE (1 << (NCP_BLOCK_SHIFT)) + +int ncp_negotiate_buffersize(struct ncp_server *, int, int *); +int ncp_negotiate_size_and_options(struct ncp_server *server, int size, + int options, int *ret_size, int *ret_options); + +int ncp_get_volume_info_with_number(struct ncp_server* server, int n, + struct ncp_volume_info *target); + +int ncp_get_directory_info(struct ncp_server* server, __u8 dirhandle, + struct ncp_volume_info* target); + +int ncp_close_file(struct ncp_server *, const char *); +static inline int ncp_read_bounce_size(__u32 size) { + return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8; +}; +int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16, + char __user *, int *, void* bounce, __u32 bouncelen); +int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16, + char *, int *); +int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16, + const char *, int *); + +static inline void ncp_inode_close(struct inode *inode) { + atomic_dec(&NCP_FINFO(inode)->opened); +} + +void ncp_extract_file_info(const void* src, struct nw_info_struct* target); +int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *, + struct nw_info_struct *target); +int 
ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); +int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns); +int ncp_get_volume_root(struct ncp_server *server, const char *volname, + __u32 *volume, __le32 *dirent, __le32 *dosdirent); +int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); +int ncp_modify_file_or_subdir_dos_info(struct ncp_server *, struct inode *, + __le32, const struct nw_modify_dos_info *info); +int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *, struct inode *, + const char* path, __le32, const struct nw_modify_dos_info *info); +int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent, + __u32 mode, __u32 rdev); + +int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); +int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *); +int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *, + int, __le32, __le16, struct ncp_entry_info *); + +int ncp_initialize_search(struct ncp_server *, struct inode *, + struct nw_search_sequence *target); +int ncp_search_for_fileset(struct ncp_server *server, + struct nw_search_sequence *seq, + int* more, int* cnt, + char* buffer, size_t bufsize, + char** rbuf, size_t* rsize); + +int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, + struct inode *, const char *, struct inode *, const char *); + + +int +ncp_LogPhysicalRecord(struct ncp_server *server, + const char *file_id, __u8 locktype, + __u32 offset, __u32 length, __u16 timeout); + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING +int +ncp_ClearPhysicalRecord(struct ncp_server *server, + const char *file_id, + __u32 offset, __u32 length); +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +int +ncp_mount_subdir(struct ncp_server *, __u8, __u8, __le32, + __u32* volume, __le32* dirent, __le32* dosdirent); +int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirhandle); +int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle); + +int ncp_create_new(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev, __le32 attributes); + +static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) { +#ifdef CONFIG_NCPFS_NFS_NS + return (server->m.flags & NCP_MOUNT_NFS_EXTRAS) && + (server->name_space[volnum] == NW_NS_NFS); +#else + return 0; +#endif +} + +#ifdef CONFIG_NCPFS_NLS + +int ncp__io2vol(struct ncp_server *, unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); +int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); + +#define NCP_ESC ':' +#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) +#define ncp_tolower(t, c) nls_tolower(t, c) +#define ncp_toupper(t, c) nls_toupper(t, c) +#define ncp_strnicmp(t, s1, s2, len) \ + nls_strnicmp(t, s1, s2, len) +#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(S,m,i,n,k,U) +#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(S,m,i,n,k,U) + +#else + +int ncp__io2vol(unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); +int ncp__vol2io(unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); + +#define NCP_IO_TABLE(sb) NULL +#define ncp_tolower(t, c) tolower(c) +#define ncp_toupper(t, c) toupper(c) +#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) +#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) + + +static inline int ncp_strnicmp(const struct nls_table *t, + const unsigned char *s1, const unsigned char *s2, int len) 
+{ + while (len--) { + if (tolower(*s1++) != tolower(*s2++)) + return 1; + } + + return 0; +} + +#endif /* CONFIG_NCPFS_NLS */ + +#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) +#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl) +#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) + +static inline void +ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) +{ + dentry->d_time = jiffies - NCP_MAX_AGE(server); +} + +static inline void +ncp_new_dentry(struct dentry* dentry) +{ + dentry->d_time = jiffies; +} + +static inline void +ncp_renew_dentries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct list_head *next; + struct dentry *dentry; + + spin_lock(&parent->d_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dentry = list_entry(next, struct dentry, d_u.d_child); + + if (dentry->d_fsdata == NULL) + ncp_age_dentry(server, dentry); + else + ncp_new_dentry(dentry); + + next = next->next; + } + spin_unlock(&parent->d_lock); +} + +static inline void +ncp_invalidate_dircache_entries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct list_head *next; + struct dentry *dentry; + + spin_lock(&parent->d_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dentry = list_entry(next, struct dentry, d_u.d_child); + dentry->d_fsdata = NULL; + ncp_age_dentry(server, dentry); + next = next->next; + } + spin_unlock(&parent->d_lock); +} + +struct ncp_cache_head { + time_t mtime; + unsigned long time; /* cache age */ + unsigned long end; /* last valid fpos in cache */ + int eof; +}; + +#define NCP_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *))) +union ncp_dir_cache { + struct ncp_cache_head head; + struct dentry *dentry[NCP_DIRCACHE_SIZE]; +}; + +#define NCP_FIRSTCACHE_SIZE ((int)((NCP_DIRCACHE_SIZE * \ + sizeof(struct dentry *) - sizeof(struct ncp_cache_head)) / \ + sizeof(struct dentry *))) + +#define NCP_DIRCACHE_START (NCP_DIRCACHE_SIZE - NCP_FIRSTCACHE_SIZE) + +struct ncp_cache_control { + struct ncp_cache_head head; + struct page *page; + union ncp_dir_cache *cache; + unsigned long fpos, ofs; + int filled, valid, idx; +}; + +#endif /* _NCPLIB_H */ diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c new file mode 100644 index 00000000..08907599 --- /dev/null +++ b/fs/ncpfs/ncpsign_kernel.c @@ -0,0 +1,127 @@ +/* + * ncpsign_kernel.c + * + * Arne de Bruijn (arne@knoware.nl), 1997 + * + */ + + +#ifdef CONFIG_NCPFS_PACKET_SIGNING + +#include +#include +#include +#include "ncp_fs.h" +#include "ncpsign_kernel.h" + +/* i386: 32-bit, little endian, handles mis-alignment */ +#ifdef __i386__ +#define GET_LE32(p) (*(const int *)(p)) +#define PUT_LE32(p,v) { *(int *)(p)=v; } +#else +/* from include/ncplib.h */ +#define BVAL(buf,pos) (((const __u8 *)(buf))[pos]) +#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) +#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val)) + +static inline __u16 +WVAL_LH(const __u8 * buf, int pos) +{ + return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; +} +static inline __u32 +DVAL_LH(const __u8 * buf, int pos) +{ + return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; +} +static inline void +WSET_LH(__u8 * buf, int pos, __u16 val) +{ + BSET(buf, pos, val & 0xff); + BSET(buf, pos + 1, val >> 8); +} +static inline void +DSET_LH(__u8 * buf, int pos, __u32 val) +{ + WSET_LH(buf, pos, val & 0xffff); + WSET_LH(buf, pos + 2, val >> 16); +} + +#define GET_LE32(p) 
DVAL_LH(p,0) +#define PUT_LE32(p,v) DSET_LH(p,0,v) +#endif + +static void nwsign(char *r_data1, char *r_data2, char *outdata) { + int i; + unsigned int w0,w1,w2,w3; + static int rbit[4]={0, 2, 1, 3}; +#ifdef __i386__ + unsigned int *data2=(unsigned int *)r_data2; +#else + unsigned int data2[16]; + for (i=0;i<16;i++) + data2[i]=GET_LE32(r_data2+(i<<2)); +#endif + w0=GET_LE32(r_data1); + w1=GET_LE32(r_data1+4); + w2=GET_LE32(r_data1+8); + w3=GET_LE32(r_data1+12); + for (i=0;i<16;i+=4) { + w0=rol32(w0 + ((w1 & w2) | ((~w1) & w3)) + data2[i+0],3); + w3=rol32(w3 + ((w0 & w1) | ((~w0) & w2)) + data2[i+1],7); + w2=rol32(w2 + ((w3 & w0) | ((~w3) & w1)) + data2[i+2],11); + w1=rol32(w1 + ((w2 & w3) | ((~w2) & w0)) + data2[i+3],19); + } + for (i=0;i<4;i++) { + w0=rol32(w0 + (((w2 | w3) & w1) | (w2 & w3)) + 0x5a827999 + data2[i+0],3); + w3=rol32(w3 + (((w1 | w2) & w0) | (w1 & w2)) + 0x5a827999 + data2[i+4],5); + w2=rol32(w2 + (((w0 | w1) & w3) | (w0 & w1)) + 0x5a827999 + data2[i+8],9); + w1=rol32(w1 + (((w3 | w0) & w2) | (w3 & w0)) + 0x5a827999 + data2[i+12],13); + } + for (i=0;i<4;i++) { + w0=rol32(w0 + ((w1 ^ w2) ^ w3) + 0x6ed9eba1 + data2[rbit[i]+0],3); + w3=rol32(w3 + ((w0 ^ w1) ^ w2) + 0x6ed9eba1 + data2[rbit[i]+8],9); + w2=rol32(w2 + ((w3 ^ w0) ^ w1) + 0x6ed9eba1 + data2[rbit[i]+4],11); + w1=rol32(w1 + ((w2 ^ w3) ^ w0) + 0x6ed9eba1 + data2[rbit[i]+12],15); + } + PUT_LE32(outdata,(w0+GET_LE32(r_data1)) & 0xffffffff); + PUT_LE32(outdata+4,(w1+GET_LE32(r_data1+4)) & 0xffffffff); + PUT_LE32(outdata+8,(w2+GET_LE32(r_data1+8)) & 0xffffffff); + PUT_LE32(outdata+12,(w3+GET_LE32(r_data1+12)) & 0xffffffff); +} + +/* Make a signature for the current packet and add it at the end of the */ +/* packet. */ +void __sign_packet(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, void *sign_buff) { + unsigned char data[64]; + + memcpy(data, server->sign_root, 8); + *(__u32*)(data + 8) = totalsize; + if (size < 52) { + memcpy(data + 12, packet, size); + memset(data + 12 + size, 0, 52 - size); + } else { + memcpy(data + 12, packet, 52); + } + nwsign(server->sign_last, data, server->sign_last); + memcpy(sign_buff, server->sign_last, 8); +} + +int sign_verify_reply(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, const void *sign_buff) { + unsigned char data[64]; + unsigned char hash[16]; + + memcpy(data, server->sign_root, 8); + *(__u32*)(data + 8) = totalsize; + if (size < 52) { + memcpy(data + 12, packet, size); + memset(data + 12 + size, 0, 52 - size); + } else { + memcpy(data + 12, packet, 52); + } + nwsign(server->sign_last, data, hash); + return memcmp(sign_buff, hash, 8); +} + +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h new file mode 100644 index 00000000..d9a1438b --- /dev/null +++ b/fs/ncpfs/ncpsign_kernel.h @@ -0,0 +1,26 @@ +/* + * ncpsign_kernel.h + * + * Arne de Bruijn (arne@knoware.nl), 1997 + * + */ + +#ifndef _NCPSIGN_KERNEL_H +#define _NCPSIGN_KERNEL_H + +#ifdef CONFIG_NCPFS_PACKET_SIGNING +void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); +int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); +#endif + +static inline size_t sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff) { +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active) { + __sign_packet(server, data, size, totalsize, sign_buff); + 
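+		/* __sign_packet() has stored an 8-byte signature in sign_buff */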
return 8; + } +#endif + return 0; +} + +#endif diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c new file mode 100644 index 00000000..3a158722 --- /dev/null +++ b/fs/ncpfs/sock.c @@ -0,0 +1,880 @@ +/* + * linux/fs/ncpfs/sock.c + * + * Copyright (C) 1992, 1993 Rick Sladkey + * + * Modified 1995, 1996 by Volker Lendecke to be usable for ncp + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +#include "ncpsign_kernel.h" + +static int _recv(struct socket *sock, void *buf, int size, unsigned flags) +{ + struct msghdr msg = {NULL, }; + struct kvec iov = {buf, size}; + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +static inline int do_send(struct socket *sock, struct kvec *vec, int count, + int len, unsigned flags) +{ + struct msghdr msg = { .msg_flags = flags }; + return kernel_sendmsg(sock, &msg, vec, count, len); +} + +static int _send(struct socket *sock, const void *buff, int len) +{ + struct kvec vec; + vec.iov_base = (void *) buff; + vec.iov_len = len; + return do_send(sock, &vec, 1, len, 0); +} + +struct ncp_request_reply { + struct list_head req; + wait_queue_head_t wq; + atomic_t refs; + unsigned char* reply_buf; + size_t datalen; + int result; + enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status; + struct kvec* tx_ciov; + size_t tx_totallen; + size_t tx_iovlen; + struct kvec tx_iov[3]; + u_int16_t tx_type; + u_int32_t sign[6]; +}; + +static inline struct ncp_request_reply* ncp_alloc_req(void) +{ + struct ncp_request_reply *req; + + req = kmalloc(sizeof(struct ncp_request_reply), GFP_KERNEL); + if (!req) + return NULL; + + init_waitqueue_head(&req->wq); + atomic_set(&req->refs, (1)); + req->status = RQ_IDLE; + + return req; +} + +static void ncp_req_get(struct ncp_request_reply *req) +{ + atomic_inc(&req->refs); +} + +static void ncp_req_put(struct ncp_request_reply *req) +{ + if (atomic_dec_and_test(&req->refs)) + kfree(req); +} + +void ncp_tcp_data_ready(struct sock *sk, int len) +{ + struct ncp_server *server = sk->sk_user_data; + + server->data_ready(sk, len); + schedule_work(&server->rcv.tq); +} + +void ncp_tcp_error_report(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + server->error_report(sk); + schedule_work(&server->rcv.tq); +} + +void ncp_tcp_write_space(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + /* We do not need any locking: we first set tx.creq, and then we do sendmsg, + not vice versa... 
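+	   A wake-up when nothing is queued is harmless: ncp_tcp_tx_proc() takes
+	   rcv.creq_mutex and __ncptcp_try_send() simply returns if tx.creq is NULL.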
*/ + server->write_space(sk); + if (server->tx.creq) + schedule_work(&server->tx.tq); +} + +void ncpdgram_timeout_call(unsigned long v) +{ + struct ncp_server *server = (void*)v; + + schedule_work(&server->timeout_tq); +} + +static inline void ncp_finish_request(struct ncp_server *server, struct ncp_request_reply *req, int result) +{ + req->result = result; + if (req->status != RQ_ABANDONED) + memcpy(req->reply_buf, server->rxbuf, req->datalen); + req->status = RQ_DONE; + wake_up_all(&req->wq); + ncp_req_put(req); +} + +static void __abort_ncp_connection(struct ncp_server *server) +{ + struct ncp_request_reply *req; + + ncp_invalidate_conn(server); + del_timer(&server->timeout_tm); + while (!list_empty(&server->tx.requests)) { + req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); + + list_del_init(&req->req); + ncp_finish_request(server, req, -EIO); + } + req = server->rcv.creq; + if (req) { + server->rcv.creq = NULL; + ncp_finish_request(server, req, -EIO); + server->rcv.ptr = NULL; + server->rcv.state = 0; + } + req = server->tx.creq; + if (req) { + server->tx.creq = NULL; + ncp_finish_request(server, req, -EIO); + } +} + +static inline int get_conn_number(struct ncp_reply_header *rp) +{ + return rp->conn_low | (rp->conn_high << 8); +} + +static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) +{ + /* If req is done, we got signal, but we also received answer... */ + switch (req->status) { + case RQ_IDLE: + case RQ_DONE: + break; + case RQ_QUEUED: + list_del_init(&req->req); + ncp_finish_request(server, req, err); + break; + case RQ_INPROGRESS: + req->status = RQ_ABANDONED; + break; + case RQ_ABANDONED: + break; + } +} + +static inline void ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) +{ + mutex_lock(&server->rcv.creq_mutex); + __ncp_abort_request(server, req, err); + mutex_unlock(&server->rcv.creq_mutex); +} + +static inline void __ncptcp_abort(struct ncp_server *server) +{ + __abort_ncp_connection(server); +} + +static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req) +{ + struct kvec vec[3]; + /* sock_sendmsg updates iov pointers for us :-( */ + memcpy(vec, req->tx_ciov, req->tx_iovlen * sizeof(vec[0])); + return do_send(sock, vec, req->tx_iovlen, + req->tx_totallen, MSG_DONTWAIT); +} + +static void __ncptcp_try_send(struct ncp_server *server) +{ + struct ncp_request_reply *rq; + struct kvec *iov; + struct kvec iovc[3]; + int result; + + rq = server->tx.creq; + if (!rq) + return; + + /* sock_sendmsg updates iov pointers for us :-( */ + memcpy(iovc, rq->tx_ciov, rq->tx_iovlen * sizeof(iov[0])); + result = do_send(server->ncp_sock, iovc, rq->tx_iovlen, + rq->tx_totallen, MSG_NOSIGNAL | MSG_DONTWAIT); + + if (result == -EAGAIN) + return; + + if (result < 0) { + printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result); + __ncp_abort_request(server, rq, result); + return; + } + if (result >= rq->tx_totallen) { + server->rcv.creq = rq; + server->tx.creq = NULL; + return; + } + rq->tx_totallen -= result; + iov = rq->tx_ciov; + while (iov->iov_len <= result) { + result -= iov->iov_len; + iov++; + rq->tx_iovlen--; + } + iov->iov_base += result; + iov->iov_len -= result; + rq->tx_ciov = iov; +} + +static inline void ncp_init_header(struct ncp_server *server, struct ncp_request_reply *req, struct ncp_request_header *h) +{ + req->status = RQ_INPROGRESS; + h->conn_low = server->connection; + h->conn_high = server->connection >> 8; + h->sequence = ++server->sequence; 
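+	/* the reply is later matched back to this request by sequence and
+	   connection number, see ncpdgram_rcv_proc() and __ncptcp_rcv_proc() */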
+} + +static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + size_t signlen; + struct ncp_request_header* h; + + req->tx_ciov = req->tx_iov + 1; + + h = req->tx_iov[1].iov_base; + ncp_init_header(server, req, h); + signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, + cpu_to_le32(req->tx_totallen), req->sign); + if (signlen) { + req->tx_ciov[1].iov_base = req->sign; + req->tx_ciov[1].iov_len = signlen; + req->tx_iovlen += 1; + req->tx_totallen += signlen; + } + server->rcv.creq = req; + server->timeout_last = server->m.time_out; + server->timeout_retries = server->m.retry_count; + ncpdgram_send(server->ncp_sock, req); + mod_timer(&server->timeout_tm, jiffies + server->m.time_out); +} + +#define NCP_TCP_XMIT_MAGIC (0x446D6454) +#define NCP_TCP_XMIT_VERSION (1) +#define NCP_TCP_RCVD_MAGIC (0x744E6350) + +static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + size_t signlen; + struct ncp_request_header* h; + + req->tx_ciov = req->tx_iov; + h = req->tx_iov[1].iov_base; + ncp_init_header(server, req, h); + signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, + cpu_to_be32(req->tx_totallen + 24), req->sign + 4) + 16; + + req->sign[0] = htonl(NCP_TCP_XMIT_MAGIC); + req->sign[1] = htonl(req->tx_totallen + signlen); + req->sign[2] = htonl(NCP_TCP_XMIT_VERSION); + req->sign[3] = htonl(req->datalen + 8); + req->tx_iov[0].iov_base = req->sign; + req->tx_iov[0].iov_len = signlen; + req->tx_iovlen += 1; + req->tx_totallen += signlen; + + server->tx.creq = req; + __ncptcp_try_send(server); +} + +static inline void __ncp_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + /* we copy the data so that we do not depend on the caller + staying alive */ + memcpy(server->txbuf, req->tx_iov[1].iov_base, req->tx_iov[1].iov_len); + req->tx_iov[1].iov_base = server->txbuf; + + if (server->ncp_sock->type == SOCK_STREAM) + ncptcp_start_request(server, req); + else + ncpdgram_start_request(server, req); +} + +static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + mutex_lock(&server->rcv.creq_mutex); + if (!ncp_conn_valid(server)) { + mutex_unlock(&server->rcv.creq_mutex); + printk(KERN_ERR "ncpfs: tcp: Server died\n"); + return -EIO; + } + ncp_req_get(req); + if (server->tx.creq || server->rcv.creq) { + req->status = RQ_QUEUED; + list_add_tail(&req->req, &server->tx.requests); + mutex_unlock(&server->rcv.creq_mutex); + return 0; + } + __ncp_start_request(server, req); + mutex_unlock(&server->rcv.creq_mutex); + return 0; +} + +static void __ncp_next_request(struct ncp_server *server) +{ + struct ncp_request_reply *req; + + server->rcv.creq = NULL; + if (list_empty(&server->tx.requests)) { + return; + } + req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); + list_del_init(&req->req); + __ncp_start_request(server, req); +} + +static void info_server(struct ncp_server *server, unsigned int id, const void * data, size_t len) +{ + if (server->info_sock) { + struct kvec iov[2]; + __be32 hdr[2]; + + hdr[0] = cpu_to_be32(len + 8); + hdr[1] = cpu_to_be32(id); + + iov[0].iov_base = hdr; + iov[0].iov_len = 8; + iov[1].iov_base = (void *) data; + iov[1].iov_len = len; + + do_send(server->info_sock, iov, 2, len + 8, MSG_NOSIGNAL); + } +} + +void 
ncpdgram_rcv_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, rcv.tq); + struct socket* sock; + + sock = server->ncp_sock; + + while (1) { + struct ncp_reply_header reply; + int result; + + result = _recv(sock, &reply, sizeof(reply), MSG_PEEK | MSG_DONTWAIT); + if (result < 0) { + break; + } + if (result >= sizeof(reply)) { + struct ncp_request_reply *req; + + if (reply.type == NCP_WATCHDOG) { + unsigned char buf[10]; + + if (server->connection != get_conn_number(&reply)) { + goto drop; + } + result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); + if (result < 0) { + DPRINTK("recv failed with %d\n", result); + continue; + } + if (result < 10) { + DPRINTK("too short (%u) watchdog packet\n", result); + continue; + } + if (buf[9] != '?') { + DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]); + continue; + } + buf[9] = 'Y'; + _send(sock, buf, sizeof(buf)); + continue; + } + if (reply.type != NCP_POSITIVE_ACK && reply.type != NCP_REPLY) { + result = _recv(sock, server->unexpected_packet.data, sizeof(server->unexpected_packet.data), MSG_DONTWAIT); + if (result < 0) { + continue; + } + info_server(server, 0, server->unexpected_packet.data, result); + continue; + } + mutex_lock(&server->rcv.creq_mutex); + req = server->rcv.creq; + if (req && (req->tx_type == NCP_ALLOC_SLOT_REQUEST || (server->sequence == reply.sequence && + server->connection == get_conn_number(&reply)))) { + if (reply.type == NCP_POSITIVE_ACK) { + server->timeout_retries = server->m.retry_count; + server->timeout_last = NCP_MAX_RPC_TIMEOUT; + mod_timer(&server->timeout_tm, jiffies + NCP_MAX_RPC_TIMEOUT); + } else if (reply.type == NCP_REPLY) { + result = _recv(sock, server->rxbuf, req->datalen, MSG_DONTWAIT); +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (result >= 0 && server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { + if (result < 8 + 8) { + result = -EIO; + } else { + unsigned int hdrl; + + result -= 8; + hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; + if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { + printk(KERN_INFO "ncpfs: Signature violation\n"); + result = -EIO; + } + } + } +#endif + del_timer(&server->timeout_tm); + server->rcv.creq = NULL; + ncp_finish_request(server, req, result); + __ncp_next_request(server); + mutex_unlock(&server->rcv.creq_mutex); + continue; + } + } + mutex_unlock(&server->rcv.creq_mutex); + } +drop:; + _recv(sock, &reply, sizeof(reply), MSG_DONTWAIT); + } +} + +static void __ncpdgram_timeout_proc(struct ncp_server *server) +{ + /* If timer is pending, we are processing another request... 
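+	   Otherwise the pending request is simply retransmitted and the timeout
+	   is doubled, capped at NCP_MAX_RPC_TIMEOUT; on a soft mount
+	   (NCP_MOUNT_SOFT) the request is aborted with -ETIMEDOUT once
+	   timeout_retries is exhausted.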
*/ + if (!timer_pending(&server->timeout_tm)) { + struct ncp_request_reply* req; + + req = server->rcv.creq; + if (req) { + int timeout; + + if (server->m.flags & NCP_MOUNT_SOFT) { + if (server->timeout_retries-- == 0) { + __ncp_abort_request(server, req, -ETIMEDOUT); + return; + } + } + /* Ignore errors */ + ncpdgram_send(server->ncp_sock, req); + timeout = server->timeout_last << 1; + if (timeout > NCP_MAX_RPC_TIMEOUT) { + timeout = NCP_MAX_RPC_TIMEOUT; + } + server->timeout_last = timeout; + mod_timer(&server->timeout_tm, jiffies + timeout); + } + } +} + +void ncpdgram_timeout_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, timeout_tq); + mutex_lock(&server->rcv.creq_mutex); + __ncpdgram_timeout_proc(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) +{ + int result; + + if (buffer) { + result = _recv(server->ncp_sock, buffer, len, MSG_DONTWAIT); + } else { + static unsigned char dummy[1024]; + + if (len > sizeof(dummy)) { + len = sizeof(dummy); + } + result = _recv(server->ncp_sock, dummy, len, MSG_DONTWAIT); + } + if (result < 0) { + return result; + } + if (result > len) { + printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len); + return -EIO; + } + return result; +} + +static int __ncptcp_rcv_proc(struct ncp_server *server) +{ + /* We have to check the result, so store the complete header */ + while (1) { + int result; + struct ncp_request_reply *req; + int datalen; + int type; + + while (server->rcv.len) { + result = do_tcp_rcv(server, server->rcv.ptr, server->rcv.len); + if (result == -EAGAIN) { + return 0; + } + if (result <= 0) { + req = server->rcv.creq; + if (req) { + __ncp_abort_request(server, req, -EIO); + } else { + __ncptcp_abort(server); + } + if (result < 0) { + printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result); + } else { + DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n"); + } + return -EIO; + } + if (server->rcv.ptr) { + server->rcv.ptr += result; + } + server->rcv.len -= result; + } + switch (server->rcv.state) { + case 0: + if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); + __ncptcp_abort(server); + return -EIO; + } + datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; + if (datalen < 10) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + __ncptcp_abort(server); + return -EIO; + } +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active) { + if (datalen < 18) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + __ncptcp_abort(server); + return -EIO; + } + server->rcv.buf.len = datalen - 8; + server->rcv.ptr = (unsigned char*)&server->rcv.buf.p1; + server->rcv.len = 8; + server->rcv.state = 4; + break; + } +#endif + type = ntohs(server->rcv.buf.type); +#ifdef CONFIG_NCPFS_PACKET_SIGNING +cont:; +#endif + if (type != NCP_REPLY) { + if (datalen - 8 <= sizeof(server->unexpected_packet.data)) { + *(__u16*)(server->unexpected_packet.data) = htons(type); + server->unexpected_packet.len = datalen - 8; + + server->rcv.state = 5; + server->rcv.ptr = server->unexpected_packet.data + 2; + server->rcv.len = datalen - 10; + break; + } + DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type); +skipdata2:; + server->rcv.state = 2; +skipdata:; + server->rcv.ptr = NULL; + server->rcv.len = datalen - 10; + break; + } + req = server->rcv.creq; + if (!req) { + DPRINTK(KERN_ERR "ncpfs: 
Reply without appropriate request\n"); + goto skipdata2; + } + if (datalen > req->datalen + 8) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + server->rcv.state = 3; + goto skipdata; + } + req->datalen = datalen - 8; + ((struct ncp_reply_header*)server->rxbuf)->type = NCP_REPLY; + server->rcv.ptr = server->rxbuf + 2; + server->rcv.len = datalen - 10; + server->rcv.state = 1; + break; +#ifdef CONFIG_NCPFS_PACKET_SIGNING + case 4: + datalen = server->rcv.buf.len; + type = ntohs(server->rcv.buf.type2); + goto cont; +#endif + case 1: + req = server->rcv.creq; + if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { + if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { + printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { + printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + } +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { + if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { + printk(KERN_ERR "ncpfs: tcp: Signature violation\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + } +#endif + ncp_finish_request(server, req, req->datalen); + nextreq:; + __ncp_next_request(server); + case 2: + next:; + server->rcv.ptr = (unsigned char*)&server->rcv.buf; + server->rcv.len = 10; + server->rcv.state = 0; + break; + case 3: + ncp_finish_request(server, server->rcv.creq, -EIO); + goto nextreq; + case 5: + info_server(server, 0, server->unexpected_packet.data, server->unexpected_packet.len); + goto next; + } + } +} + +void ncp_tcp_rcv_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, rcv.tq); + + mutex_lock(&server->rcv.creq_mutex); + __ncptcp_rcv_proc(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +void ncp_tcp_tx_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, tx.tq); + + mutex_lock(&server->rcv.creq_mutex); + __ncptcp_try_send(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +static int do_ncp_rpc_call(struct ncp_server *server, int size, + unsigned char* reply_buf, int max_reply_size) +{ + int result; + struct ncp_request_reply *req; + + req = ncp_alloc_req(); + if (!req) + return -ENOMEM; + + req->reply_buf = reply_buf; + req->datalen = max_reply_size; + req->tx_iov[1].iov_base = server->packet; + req->tx_iov[1].iov_len = size; + req->tx_iovlen = 1; + req->tx_totallen = size; + req->tx_type = *(u_int16_t*)server->packet; + + result = ncp_add_request(server, req); + if (result < 0) + goto out; + + if (wait_event_interruptible(req->wq, req->status == RQ_DONE)) { + ncp_abort_request(server, req, -EINTR); + result = -EINTR; + goto out; + } + + result = req->result; + +out: + ncp_req_put(req); + + return result; +} + +/* + * We need the server to be locked here, so check! 
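+ *
+ * The calling convention used all over ncplib_kernel.c is roughly
+ * (a sketch only; ncp_init_request() is expected to take the lock that
+ * is checked below):
+ *
+ *	ncp_init_request(server);
+ *	ncp_add_byte(server, ...);		build the request in server->packet
+ *	result = ncp_request(server, function);
+ *	... ncp_reply_byte()/ncp_reply_dword() to pick the reply apart ...
+ *	ncp_unlock_server(server);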
+ */ + +static int ncp_do_request(struct ncp_server *server, int size, + void* reply, int max_reply_size) +{ + int result; + + if (server->lock == 0) { + printk(KERN_ERR "ncpfs: Server not locked!\n"); + return -EIO; + } + if (!ncp_conn_valid(server)) { + return -EIO; + } + { + sigset_t old_set; + unsigned long mask, flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old_set = current->blocked; + if (current->flags & PF_EXITING) + mask = 0; + else + mask = sigmask(SIGKILL); + if (server->m.flags & NCP_MOUNT_INTR) { + /* FIXME: This doesn't seem right at all. So, like, + we can't handle SIGINT and get whatever to stop? + What if we've blocked it ourselves? What about + alarms? Why, in fact, are we mucking with the + sigmask at all? -- r~ */ + if (current->sighand->action[SIGINT - 1].sa.sa_handler == SIG_DFL) + mask |= sigmask(SIGINT); + if (current->sighand->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) + mask |= sigmask(SIGQUIT); + } + siginitsetinv(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + result = do_ncp_rpc_call(server, size, reply, max_reply_size); + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old_set; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + + DDPRINTK("do_ncp_rpc_call returned %d\n", result); + + return result; +} + +/* ncp_do_request assures that at least a complete reply header is + * received. It assumes that server->current_size contains the ncp + * request size + */ +int ncp_request2(struct ncp_server *server, int function, + void* rpl, int size) +{ + struct ncp_request_header *h; + struct ncp_reply_header* reply = rpl; + int result; + + h = (struct ncp_request_header *) (server->packet); + if (server->has_subfunction != 0) { + *(__u16 *) & (h->data[0]) = htons(server->current_size - sizeof(*h) - 2); + } + h->type = NCP_REQUEST; + /* + * The server shouldn't know or care what task is making a + * request, so we always use the same task number. 
+ */ + h->task = 2; /* (current->pid) & 0xff; */ + h->function = function; + + result = ncp_do_request(server, server->current_size, reply, size); + if (result < 0) { + DPRINTK("ncp_request_error: %d\n", result); + goto out; + } + server->completion = reply->completion_code; + server->conn_status = reply->connection_state; + server->reply_size = result; + server->ncp_reply_size = result - sizeof(struct ncp_reply_header); + + result = reply->completion_code; + + if (result != 0) + PPRINTK("ncp_request: completion code=%x\n", result); +out: + return result; +} + +int ncp_connect(struct ncp_server *server) +{ + struct ncp_request_header *h; + int result; + + server->connection = 0xFFFF; + server->sequence = 255; + + h = (struct ncp_request_header *) (server->packet); + h->type = NCP_ALLOC_SLOT_REQUEST; + h->task = 2; /* see above */ + h->function = 0; + + result = ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); + if (result < 0) + goto out; + server->connection = h->conn_low + (h->conn_high * 256); + result = 0; +out: + return result; +} + +int ncp_disconnect(struct ncp_server *server) +{ + struct ncp_request_header *h; + + h = (struct ncp_request_header *) (server->packet); + h->type = NCP_DEALLOC_SLOT_REQUEST; + h->task = 2; /* see above */ + h->function = 0; + + return ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); +} + +void ncp_lock_server(struct ncp_server *server) +{ + mutex_lock(&server->mutex); + if (server->lock) + printk(KERN_WARNING "ncp_lock_server: was locked!\n"); + server->lock = 1; +} + +void ncp_unlock_server(struct ncp_server *server) +{ + if (!server->lock) { + printk(KERN_WARNING "ncp_unlock_server: was not locked!\n"); + return; + } + server->lock = 0; + mutex_unlock(&server->mutex); +} diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c new file mode 100644 index 00000000..661f861d --- /dev/null +++ b/fs/ncpfs/symlink.c @@ -0,0 +1,181 @@ +/* + * linux/fs/ncpfs/symlink.c + * + * Code for allowing symbolic links on NCPFS (i.e. NetWare) + * Symbolic links are not supported on native NetWare, so we use an + * infrequently-used flag (Sh) and store a two-word magic header in + * the file to make sure we don't accidentally use a non-link file + * as a link. + * + * When using the NFS namespace, we set the mode to indicate a symlink and + * don't bother with the magic numbers. + * + * from linux/fs/ext2/symlink.c + * + * Copyright (C) 1998-99, Frank A. Vorstenbosch + * + * ncpfs symlink handling code + * NLS support (c) 1999 Petr Vandrovec + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + +#include + +#include +#include +#include +#include +#include +#include +#include "ncp_fs.h" + +/* these magic numbers must appear in the symlink file -- this makes it a bit + more resilient against the magic attributes being set on random files. 
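+
+   A kludge-mode link file (see the create path below, where hdr is 8) thus
+   begins with the eight bytes
+
+	73 79 6d 6c 6e 6b 2d 3e		"symlnk->", i.e. NCP_SYMLINK_MAGIC0 and
+					NCP_SYMLINK_MAGIC1 in little-endian byte order,
+
+   followed by the link target converted to the volume character set.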
*/ + +#define NCP_SYMLINK_MAGIC0 cpu_to_le32(0x6c6d7973) /* "symlnk->" */ +#define NCP_SYMLINK_MAGIC1 cpu_to_le32(0x3e2d6b6e) + +/* ----- read a symbolic link ------------------------------------------ */ + +static int ncp_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error, length, len; + char *link, *rawlink; + char *buf = kmap(page); + + error = -ENOMEM; + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + if (!rawlink) + goto fail; + + if (ncp_make_open(inode,O_RDONLY)) + goto failEIO; + + error=ncp_read_kernel(NCP_SERVER(inode),NCP_FINFO(inode)->file_handle, + 0,NCP_MAX_SYMLINK_SIZE,rawlink,&length); + + ncp_inode_close(inode); + /* Close file handle if no other users... */ + ncp_make_closed(inode); + if (error) + goto failEIO; + + if (NCP_FINFO(inode)->flags & NCPI_KLUDGE_SYMLINK) { + if (lengthvolNumber)) + kludge = 0; + else +#ifdef CONFIG_NCPFS_EXTRAS + if (NCP_SERVER(dir)->m.flags & NCP_MOUNT_SYMLINKS) + kludge = 1; + else +#endif + /* EPERM is returned by VFS if symlink procedure does not exist */ + return -EPERM; + + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + if (!rawlink) + return -ENOMEM; + + if (kludge) { + mode = 0; + attr = aSHARED | aHIDDEN; + ((__le32 *)rawlink)[0]=NCP_SYMLINK_MAGIC0; + ((__le32 *)rawlink)[1]=NCP_SYMLINK_MAGIC1; + hdr = 8; + } else { + mode = S_IFLNK | S_IRWXUGO; + attr = 0; + hdr = 0; + } + + length = strlen(symname); + /* map to/from server charset, do not touch upper/lower case as + symlink can point out of ncp filesystem */ + outlen = NCP_MAX_SYMLINK_SIZE - hdr; + err = ncp_io2vol(NCP_SERVER(dir), rawlink + hdr, &outlen, symname, length, 0); + if (err) + goto failfree; + + outlen += hdr; + + err = -EIO; + if (ncp_create_new(dir,dentry,mode,0,attr)) { + goto failfree; + } + + inode=dentry->d_inode; + + if (ncp_make_open(inode, O_WRONLY)) + goto failfree; + + if (ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, + 0, outlen, rawlink, &i) || i!=outlen) { + goto fail; + } + + ncp_inode_close(inode); + ncp_make_closed(inode); + kfree(rawlink); + return 0; +fail:; + ncp_inode_close(inode); + ncp_make_closed(inode); +failfree:; + kfree(rawlink); + return err; +} + +/* ----- EOF ----- */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig new file mode 100644 index 00000000..81515545 --- /dev/null +++ b/fs/nfs/Kconfig @@ -0,0 +1,144 @@ +config NFS_FS + tristate "NFS client support" + depends on INET && FILE_LOCKING + select LOCKD + select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL + help + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. + + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. + + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. + + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. 
You cannot compile this file system as a + module in this case. + + If unsure, say N. + +config NFS_V3 + bool "NFS client support for NFS version 3" + depends on NFS_FS + help + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3_ACL + bool "NFS client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. + + If unsure, say N. + +config NFS_V4 + bool "NFS client support for NFS version 4" + depends on NFS_FS + select SUNRPC_GSS + help + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. + + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say Y. + +config NFS_V4_1 + bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select PNFS_FILE_LAYOUT + help + This option enables support for minor version 1 of the NFSv4 protocol + (RFC 5661) in the kernel's NFS client. + + If unsure, say N. + +config PNFS_FILE_LAYOUT + tristate + +config PNFS_OBJLAYOUT + tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD + help + Say M here if you want your pNFS client to support the Objects Layout Driver. + Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and + upper level driver (SCSI_OSD_ULD). + + If unsure, say N. + +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + . + + Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support" + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager + +config NFS_USE_LEGACY_DNS + bool "Use the legacy NFS DNS resolver" + depends on NFS_V4 + help + The kernel now provides a method for translating a host name into an + IP address. Select Y here if you would rather use your own DNS + resolver script. + + If unsure, say N + +config NFS_USE_KERNEL_DNS + bool + depends on NFS_V4 && !NFS_USE_LEGACY_DNS + select DNS_RESOLVER + select KEYS + default y + +config NFS_USE_NEW_IDMAPPER + bool "Use the new idmapper upcall routine" + depends on NFS_V4 && KEYS + help + Say Y here if you want NFS to use the new idmapper upcall functions. + You will need /sbin/request-key (usually provided by the keyutils + package). 
For details, read + . + + If you are unsure, say N. diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile new file mode 100644 index 00000000..6a34f7dd --- /dev/null +++ b/fs/nfs/Makefile @@ -0,0 +1,25 @@ +# +# Makefile for the Linux nfs filesystem routines. +# + +obj-$(CONFIG_NFS_FS) += nfs.o + +nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ + direct.o pagelist.o proc.o read.o symlink.o unlink.o \ + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o +nfs-$(CONFIG_ROOT_NFS) += nfsroot.o +nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o +nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o +nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o + +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 00000000..84690319 --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,141 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 
0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct nameidata nd; + struct vfsmount *mnt; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(nd.path.dentry, + cd->name, 0600, cd); + path_put(&nd.path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 00000000..76f856e2 --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust + */ + +#include +#include +#include + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c new file mode 100644 index 00000000..e3d29426 --- /dev/null +++ b/fs/nfs/callback.c @@ -0,0 +1,403 @@ +/* + * linux/fs/nfs/callback.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback handling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +struct nfs_callback_data { + unsigned int users; + struct svc_serv *serv; + struct svc_rqst *rqst; + struct task_struct *task; +}; + +static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; +static DEFINE_MUTEX(nfs_callback_mutex); +static struct svc_program nfs4_callback_program; + +unsigned int nfs_callback_set_tcpport; +unsigned short 
nfs_callback_tcpport; +unsigned short nfs_callback_tcpport6; +#define NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); + +/* + * This is the NFSv4 callback kernel thread. + */ +static int +nfs4_callback_svc(void *vrqstp) +{ + int err, preverr = 0; + struct svc_rqst *rqstp = vrqstp; + + set_freezable(); + + while (!kthread_should_stop()) { + /* + * Listen for a request on the socket + */ + err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); + if (err == -EAGAIN || err == -EINTR) { + preverr = err; + continue; + } + if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + preverr = err; + svc_process(rqstp); + } + return 0; +} + +/* + * Prepare to bring up the NFSv4 callback service + */ +struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv) +{ + int ret; + + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport, PF_INET); + + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport6, PF_INET6); + } else if (ret == -EAFNOSUPPORT) + ret = 0; + else + goto out_err; + + return svc_prepare_thread(serv, &serv->sv_pools[0]); + +out_err: + if (ret == 0) + ret = -ENOMEM; + return ERR_PTR(ret); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * The callback service for NFSv4.1 callbacks + */ +static int +nfs41_callback_svc(void *vrqstp) +{ + struct svc_rqst *rqstp = vrqstp; + struct svc_serv *serv = rqstp->rq_server; + struct rpc_rqst *req; + int error; + DEFINE_WAIT(wq); + + set_freezable(); + + while (!kthread_should_stop()) { + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + spin_lock_bh(&serv->sv_cb_lock); + if (!list_empty(&serv->sv_cb_list)) { + req = list_first_entry(&serv->sv_cb_list, + struct rpc_rqst, rq_bc_list); + list_del(&req->rq_bc_list); + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("Invoking bc_svc_process()\n"); + error = bc_svc_process(serv, req, rqstp); + dprintk("bc_svc_process() returned w/ error code= %d\n", + error); + } else { + spin_unlock_bh(&serv->sv_cb_lock); + schedule(); + } + finish_wait(&serv->sv_cb_waitq, &wq); + } + return 0; +} + +/* + * Bring up the NFSv4.1 callback service + */ +struct svc_rqst * +nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +{ + struct svc_rqst *rqstp; + int ret; + + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. 
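 * (Editor's note, illustrative, not part of the original patch: unlike
 * NFSv4.0, which listens on a separate client-side socket for callbacks
 * (see nfs4_callback_up() above), an NFSv4.1 server sends CB_COMPOUND
 * requests back over the TCP connection the client already established.
 * That is why the transport class below is "tcp-bc" and the port is 0.)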
+ * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) { + rqstp = ERR_PTR(ret); + goto out; + } + + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; + + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } +out: + dprintk("--> %s return %ld\n", __func__, + IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); + return rqstp; +} + +static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, + struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + if (minorversion) { + *rqstpp = nfs41_callback_up(serv, xprt); + *callback_svc = nfs41_callback_svc; + } + return minorversion; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct nfs_callback_data *cb_info) +{ + if (minorversion) + xprt->bc_serv = cb_info->serv; +} +#else +static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, + struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + return 0; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct nfs_callback_data *cb_info) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ + struct svc_serv *serv = NULL; + struct svc_rqst *rqstp; + int (*callback_svc)(void *vrqstp); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + char svc_name[12]; + int ret = 0; + int minorversion_setup; + + mutex_lock(&nfs_callback_mutex); + if (cb_info->users++ || cb_info->task != NULL) { + nfs_callback_bc_serv(minorversion, xprt, cb_info); + goto out; + } + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + if (!serv) { + ret = -ENOMEM; + goto out_err; + } + + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, + serv, xprt, &rqstp, &callback_svc); + if (!minorversion_setup) { + /* v4.0 callback setup */ + rqstp = nfs4_callback_up(serv); + callback_svc = nfs4_callback_svc; + } + + if (IS_ERR(rqstp)) { + ret = PTR_ERR(rqstp); + goto out_err; + } + + svc_sock_update_bufs(serv); + + sprintf(svc_name, "nfsv4.%u-svc", minorversion); + cb_info->serv = serv; + cb_info->rqst = rqstp; + cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + if (IS_ERR(cb_info->task)) { + ret = PTR_ERR(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->rqst = NULL; + cb_info->task = NULL; + goto out_err; + } +out: + /* + * svc_create creates the svc_serv with sv_nrthreads == 1, and then + * svc_prepare_thread increments that. So we need to call svc_destroy + * on both success and failure so that the refcount is 1 when the + * thread exits. + */ + if (serv) + svc_destroy(serv); + mutex_unlock(&nfs_callback_mutex); + return ret; +out_err: + dprintk("NFS: Couldn't create callback socket or server thread; " + "err = %d\n", ret); + cb_info->users--; + goto out; +} + +/* + * Kill the callback thread if it's no longer being used. 
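 * (Editor's note, illustrative, not part of the original patch: this is
 * the teardown half of nfs_callback_up().  cb_info->users is a simple
 * reference count protected by nfs_callback_mutex; the svc thread and
 * svc_serv are only destroyed once the last user for this minor version
 * has dropped its reference.)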
+ */ +void nfs_callback_down(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + + mutex_lock(&nfs_callback_mutex); + cb_info->users--; + if (cb_info->users == 0 && cb_info->task != NULL) { + kthread_stop(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->serv = NULL; + cb_info->rqst = NULL; + cb_info->task = NULL; + } + mutex_unlock(&nfs_callback_mutex); +} + +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) +{ + struct rpc_clnt *r = clp->cl_rpcclient; + char *p = svc_gss_principal(rqstp); + + if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) + return 1; + + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return 0; + /* + * It might just be a normal user principal, in which case + * userspace won't bother to tell us the name at all. + */ + if (p == NULL) + return 0; + + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ + + if (memcmp(p, "nfs@", 4) != 0) + return 0; + p += 4; + if (strcmp(p, r->cl_server) != 0) + return 0; + return 1; +} + +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. + * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */ +static int nfs_callback_authenticate(struct svc_rqst *rqstp) +{ + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + if (rqstp->rq_proc != CB_NULL) + return SVC_DROP; + break; + case RPC_AUTH_GSS: + /* No RPC_AUTH_GSS support yet in NFSv4.1 */ + if (svc_is_backchannel(rqstp)) + return SVC_DROP; + } + return SVC_OK; +} + +/* + * Define NFS4 callback program + */ +static struct svc_version *nfs4_callback_version[] = { + [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, +}; + +static struct svc_stat nfs4_callback_stats; + +static struct svc_program nfs4_callback_program = { + .pg_prog = NFS4_CALLBACK, /* RPC service number */ + .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ + .pg_vers = nfs4_callback_version, /* version table */ + .pg_name = "NFSv4 callback", /* service name */ + .pg_class = "nfs", /* authentication class */ + .pg_stats = &nfs4_callback_stats, + .pg_authenticate = nfs_callback_authenticate, +}; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h new file mode 100644 index 00000000..07df5f1d --- /dev/null +++ b/fs/nfs/callback.h @@ -0,0 +1,213 @@ +/* + * linux/fs/nfs/callback.h + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback definitions + */ +#ifndef __LINUX_FS_NFS_CALLBACK_H +#define __LINUX_FS_NFS_CALLBACK_H +#include + +#define NFS4_CALLBACK 0x40000000 +#define NFS4_CALLBACK_XDRSIZE 2048 +#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) + +enum nfs4_callback_procnum { + CB_NULL = 0, + CB_COMPOUND = 1, +}; + +enum nfs4_callback_opnum { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, +/* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044, +}; + +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + int slotid; +}; + +struct cb_compound_hdr_arg { + unsigned int taglen; + const char *tag; + unsigned int 
minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ + unsigned nops; +}; + +struct cb_compound_hdr_res { + __be32 *status; + unsigned int taglen; + const char *tag; + __be32 *nops; +}; + +struct cb_getattrargs { + struct sockaddr *addr; + struct nfs_fh fh; + uint32_t bitmap[2]; +}; + +struct cb_getattrres { + __be32 status; + uint32_t bitmap[2]; + uint64_t size; + uint64_t change_attr; + struct timespec ctime; + struct timespec mtime; +}; + +struct cb_recallargs { + struct sockaddr *addr; + struct nfs_fh fh; + nfs4_stateid stateid; + uint32_t truncate; +}; + +#if defined(CONFIG_NFS_V4_1) + +struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +}; + +struct referring_call_list { + struct nfs4_sessionid rcl_sessionid; + uint32_t rcl_nrefcalls; + struct referring_call *rcl_refcalls; +}; + +struct cb_sequenceargs { + struct sockaddr *csa_addr; + struct nfs4_sessionid csa_sessionid; + uint32_t csa_sequenceid; + uint32_t csa_slotid; + uint32_t csa_highestslotid; + uint32_t csa_cachethis; + uint32_t csa_nrclists; + struct referring_call_list *csa_rclists; +}; + +struct cb_sequenceres { + __be32 csr_status; + struct nfs4_sessionid csr_sessionid; + uint32_t csr_sequenceid; + uint32_t csr_slotid; + uint32_t csr_highestslotid; + uint32_t csr_target_highestslotid; +}; + +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); + +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_max_slots; +}; +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; + +extern unsigned nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); + +extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); + +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct nfs4_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +struct cb_devicenotifyargs { + int ndevs; + struct cb_devicenotifyitem *devs; +}; + +extern __be32 nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps); + +#endif /* CONFIG_NFS_V4_1 */ +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 
nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); +#ifdef CONFIG_NFS_V4 +extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); +extern void nfs_callback_down(int minorversion); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4 */ +/* + * nfs41: Callbacks are expected to not cause substantial latency, + * so we limit their concurrency to 1 by setting up the maximum number + * of slots for the backchannel. + */ +#define NFS41_BC_MIN_CALLBACKS 1 +#define NFS41_BC_MAX_CALLBACKS 1 + +extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_tcpport; +extern unsigned short nfs_callback_tcpport6; + +#endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c new file mode 100644 index 00000000..b5c826e1 --- /dev/null +++ b/fs/nfs/callback_proc.c @@ -0,0 +1,561 @@ +/* + * linux/fs/nfs/callback_proc.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback procedures + */ +#include +#include +#include +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "pnfs.h" + +#ifdef NFS_DEBUG +#define NFSDBG_FACILITY NFSDBG_CALLBACK +#endif + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) +{ + struct nfs_delegation *delegation; + struct nfs_inode *nfsi; + struct inode *inode; + + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + + res->bitmap[0] = res->bitmap[1] = 0; + res->status = htonl(NFS4ERR_BADHANDLE); + + dprintk("NFS: GETATTR callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + nfsi = NFS_I(inode); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) + goto out_iput; + res->size = i_size_read(inode); + res->change_attr = delegation->change_attr; + if (nfsi->npages != 0) + res->change_attr++; + res->ctime = inode->i_ctime; + res->mtime = inode->i_mtime; + res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & + args->bitmap[0]; + res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & + args->bitmap[1]; + res->status = 0; +out_iput: + rcu_read_unlock(); + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); + return res->status; +} + +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct inode *inode; + __be32 res; + + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ + goto out; + + dprintk("NFS: RECALL callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + if (res != 0) + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); + return res; +} + +int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) + return 0; + return 1; +} + +#if defined(CONFIG_NFS_V4_1) + +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + bool found = false; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + spin_unlock(&clp->cl_lock); + if (!found) + return NFS4ERR_NOMATCHING_LAYOUT; + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + mark_matching_lsegs_invalid(lo, &free_me_list, + &args->cbl_range)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + struct pnfs_layout_hdr *tmp; + LIST_HEAD(recall_list); + LIST_HEAD(free_me_list); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if ((args->cbl_recall_type == RETURN_FSID) && + memcmp(&NFS_SERVER(lo->plh_inode)->fsid, + &args->cbl_fsid, sizeof(struct nfs_fsid))) + continue; + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } + spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, + &recall_list, plh_bulk_recall) { + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) + rv = NFS4ERR_DELAY; + list_del_init(&lo->plh_bulk_recall); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + } + return rv; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res = NFS4ERR_DELAY; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) + goto out; + if (args->cbl_recall_type 
== RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); +out: + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? */ + do_callback_layoutrecall(clp, &args); +} + +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps) +{ + int i; + __be32 res = 0; + struct nfs_client *clp = cps->clp; + struct nfs_server *server = NULL; + + dprintk("%s: -->\n", __func__); + + if (!clp) { + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + goto out; + } + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + if (!server || + server->pnfs_curr_ld->id != dev->cbd_layout_type) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (server->pnfs_curr_ld && + server->pnfs_curr_ld->id == dev->cbd_layout_type) { + rcu_read_unlock(); + goto found; + } + rcu_read_unlock(); + dprintk("%s: layout type %u not found\n", + __func__, dev->cbd_layout_type); + continue; + } + + found: + if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) + dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " + "deleting instead\n", __func__); + nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + } + +out: + kfree(args->devs); + dprintk("%s: exit with status = %u\n", + __func__, be32_to_cpu(res)); + return res; +} + +int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL) + return 0; + + if (stateid->stateid.seqid != 0) + return 0; + if (memcmp(&delegation->stateid.stateid.other, + &stateid->stateid.other, + NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +} + +/* + * Validate the sequenceID sent by the server. + * Return success if the sequenceID is one more than what we last saw on + * this slot, accounting for wraparound. Increments the slot's sequence. + * + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now + * since we only currently implement idempotent callbacks anyway. + * + * We have a single slot backchannel at this time, so we don't bother + * checking the used_slots bit array on the table. The lower layer guarantees + * a single outstanding callback request at a time. + */ +static __be32 +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) +{ + struct nfs4_slot *slot; + + dprintk("%s enter. 
slotid %d seqid %d\n", + __func__, args->csa_slotid, args->csa_sequenceid); + + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) + return htonl(NFS4ERR_BADSLOT); + + slot = tbl->slots + args->csa_slotid; + dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); + + /* Normal */ + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { + slot->seq_nr++; + goto out_ok; + } + + /* Replay */ + if (args->csa_sequenceid == slot->seq_nr) { + dprintk("%s seqid %d is a replay\n", + __func__, args->csa_sequenceid); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + } + + /* Wraparound */ + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { + slot->seq_nr = 1; + goto out_ok; + } + + /* Misordered request */ + return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); +} + +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. + */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *tbl; + struct nfs_client *clp; + int i; + __be32 status = htonl(NFS4ERR_BADSESSION); + + clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); + if (clp == NULL) + goto out; + + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* state manager is resetting the session */ + if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. 
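 * (Editor's note, illustrative, not part of the original patch:
 * NFS4ERR_DELAY asks the server to retry the callback later, which is the
 * right answer while the state manager is merely draining the session;
 * if the session is actually being reset, NFS4ERR_BADSESSION tells the
 * server to stop using it altogether.)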
+ */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); + goto out; + } + + status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + + cps->slotid = args->csa_slotid; + + /* + * Check for pending referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + +out: + cps->clp = clp; /* put in nfs4_callback_compound */ + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else + res->csr_status = status; + + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; +} + +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) +{ + __be32 status; + fmode_t flags = 0; + + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + + status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); + if (flags) + nfs_expire_all_delegation_types(cps->clp, flags); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +/* Reduce the fore channel's max_slots to the target value */ +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *fc_tbl; + __be32 status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_max_slots); + + fc_tbl = &cps->clp->cl_session->fc_slot_table; + + status = htonl(NFS4ERR_BAD_HIGH_SLOT); + if (args->crsa_target_max_slots > fc_tbl->max_slots || + args->crsa_target_max_slots < 1) + goto out; + + status = htonl(NFS4_OK); + if (args->crsa_target_max_slots == fc_tbl->max_slots) + goto out; + + fc_tbl->target_max_slots = args->crsa_target_max_slots; + nfs41_handle_recall_slot(cps->clp); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c new file mode 100644 index 00000000..918ad647 
--- /dev/null +++ b/fs/nfs/callback_xdr.c @@ -0,0 +1,989 @@ +/* + * linux/fs/nfs/callback_xdr.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback encode/decode procedures + */ +#include +#include +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" + +#define CB_OP_TAGLEN_MAXSZ (512) +#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) +#define CB_OP_GETATTR_BITMAP_MAXSZ (4) +#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + CB_OP_GETATTR_BITMAP_MAXSZ + \ + 2 + 2 + 3 + 3) +#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + +#if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#endif /* CONFIG_NFS_V4_1 */ + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); +typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); +typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); + + +struct callback_op { + callback_process_op_t process_op; + callback_decode_arg_t decode_args; + callback_encode_res_t encode_res; + long res_maxsize; +}; + +static struct callback_op callback_ops[]; + +static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return htonl(NFS4_OK); +} + +static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, nbytes); + if (unlikely(p == NULL)) + printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); + return p; +} + +static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *len = ntohl(*p); + + if (*len != 0) { + p = read_buf(xdr, *len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *str = (const char *)p; + } else + *str = NULL; + + return 0; +} + +static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + fh->size = ntohl(*p); + if (fh->size > NFS4_FHSIZE) + return htonl(NFS4ERR_BADHANDLE); + p = read_buf(xdr, fh->size); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(&fh->data[0], p, fh->size); + memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); + return 0; +} + +static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + __be32 *p; + unsigned int attrlen; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + attrlen = ntohl(*p); + p = read_buf(xdr, attrlen << 2); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + if (likely(attrlen > 0)) + bitmap[0] = ntohl(*p++); + if (attrlen > 1) + bitmap[1] = ntohl(*p); + return 0; +} + +static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = 
read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(stateid->data, p, 16); + return 0; +} + +static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) +{ + __be32 *p; + __be32 status; + + status = decode_string(xdr, &hdr->taglen, &hdr->tag); + if (unlikely(status != 0)) + return status; + /* We do not like overly long tags! */ + if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { + printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", + __func__, hdr->taglen); + return htonl(NFS4ERR_RESOURCE); + } + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + hdr->minorversion = ntohl(*p++); + /* Check minor version is zero or one. */ + if (hdr->minorversion <= 1) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ + } else { + printk(KERN_WARNING "%s: NFSv4 server callback with " + "illegal minor version %u!\n", + __func__, hdr->minorversion); + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } + hdr->nops = ntohl(*p); + dprintk("%s: minorversion %d nops %d\n", __func__, + hdr->minorversion, hdr->nops); + return 0; +} + +static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) +{ + __be32 *p; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *op = ntohl(*p); + return 0; +} + +static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->fh); + if (unlikely(status != 0)) + goto out; + args->addr = svc_addr(rqstp); + status = decode_bitmap(xdr, args->bitmap); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) +{ + __be32 *p; + __be32 status; + + args->addr = svc_addr(rqstp); + status = decode_stateid(xdr, &args->stateid); + if (unlikely(status != 0)) + goto out; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_RESOURCE); + goto out; + } + args->truncate = ntohl(*p); + status = decode_fh(xdr, &args->fh); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Depite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unusable and ignored with the other types. 
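 * (Editor's note, illustrative, not part of the original patch: the
 * decoded argument order on the wire is layout type, iomode, the
 * layoutchanged flag and the recall type, followed by a recall-type
 * specific body: RETURN_FILE carries a filehandle, an offset/length
 * range and a stateid, RETURN_FSID carries an fsid, and RETURN_ALL
 * carries nothing further.)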
+ */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + /* Num of device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + + args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + if (!args->devs) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_DEVICEID4_SIZE + 8)) || + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +err: + kfree(args->devs); + goto out; +} + +static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + 
memcpy(sid->data, p, len); + return 0; +} + +static __be32 decode_rc_list(struct xdr_stream *xdr, + struct referring_call_list *rc_list) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &rc_list->rcl_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + rc_list->rcl_nrefcalls = ntohl(*p++); + if (rc_list->rcl_nrefcalls) { + p = read_buf(xdr, + rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * + sizeof(*rc_list->rcl_refcalls), + GFP_KERNEL); + if (unlikely(rc_list->rcl_refcalls == NULL)) + goto out; + for (i = 0; i < rc_list->rcl_nrefcalls; i++) { + rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++); + rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++); + } + } + status = 0; + +out: + return status; +} + +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_sequenceargs *args) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &args->csa_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, 5 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + args->csa_addr = svc_addr(rqstp); + args->csa_sequenceid = ntohl(*p++); + args->csa_slotid = ntohl(*p++); + args->csa_highestslotid = ntohl(*p++); + args->csa_cachethis = ntohl(*p++); + args->csa_nrclists = ntohl(*p++); + args->csa_rclists = NULL; + if (args->csa_nrclists) { + args->csa_rclists = kmalloc(args->csa_nrclists * + sizeof(*args->csa_rclists), + GFP_KERNEL); + if (unlikely(args->csa_rclists == NULL)) + goto out; + + for (i = 0; i < args->csa_nrclists; i++) { + status = decode_rc_list(xdr, &args->csa_rclists[i]); + if (status) + goto out_free; + } + } + status = 0; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " + "highestslotid %u cachethis %d nrclists %u\n", + __func__, + ((u32 *)&args->csa_sessionid)[0], + ((u32 *)&args->csa_sessionid)[1], + ((u32 *)&args->csa_sessionid)[2], + ((u32 *)&args->csa_sessionid)[3], + args->csa_sequenceid, args->csa_slotid, + args->csa_highestslotid, args->csa_cachethis, + args->csa_nrclists); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; + +out_free: + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + goto out; +} + +static __be32 decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + __be32 *p; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_type_mask = ntohl(*p); + + return 0; +} + +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_max_slots = ntohl(*p++); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + xdr_encode_opaque(p, str, len); + return 0; 
+} + +#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) +#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +{ + __be32 bm[2]; + __be32 *p; + + bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); + bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); + if (bm[1] != 0) { + p = xdr_reserve_space(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(2); + *p++ = bm[0]; + *p++ = bm[1]; + } else if (bm[0] != 0) { + p = xdr_reserve_space(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(1); + *p++ = bm[0]; + } else { + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(0); + } + *savep = p; + return 0; +} + +static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, change); + return 0; +} + +static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_SIZE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, size); + return 0; +} + +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, time->tv_sec); + *p = htonl(time->tv_nsec); + return 0; +} + +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) +{ + __be32 status; + + hdr->status = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->status == NULL)) + return htonl(NFS4ERR_RESOURCE); + status = encode_string(xdr, hdr->taglen, hdr->tag); + if (unlikely(status != 0)) + return status; + hdr->nops = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->nops == NULL)) + return htonl(NFS4ERR_RESOURCE); + return 0; +} + +static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *p++ = htonl(op); + *p = res; + return 0; +} + +static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) +{ + __be32 *savep = NULL; + __be32 status = res->status; + + if (unlikely(status != 0)) + goto out; + status = encode_attr_bitmap(xdr, res->bitmap, &savep); + if (unlikely(status != 0)) + goto out; + status = encode_attr_change(xdr, res->bitmap, res->change_attr); + if (unlikely(status != 0)) + goto out; + status = encode_attr_size(xdr, res->bitmap, res->size); + if (unlikely(status != 0)) + goto out; + status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_mtime(xdr, 
res->bitmap, &res->mtime); + *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 encode_sessionid(struct xdr_stream *xdr, + const struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = xdr_reserve_space(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(p, sid, len); + return 0; +} + +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + const struct cb_sequenceres *res) +{ + __be32 *p; + unsigned status = res->csr_status; + + if (unlikely(status != 0)) + goto out; + + encode_sessionid(xdr, &res->csr_sessionid); + + p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + *p++ = htonl(res->csr_sequenceid); + *p++ = htonl(res->csr_slotid); + *p++ = htonl(res->csr_highestslotid); + *p++ = htonl(res->csr_target_highestslotid); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + if (op_nr == OP_CB_SEQUENCE) { + if (nop != 0) + return htonl(NFS4ERR_SEQUENCE_POS); + } else { + if (nop == 0) + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + } + + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: + case OP_CB_WANTS_CANCELLED: + case OP_CB_NOTIFY_LOCK: + return htonl(NFS4ERR_NOTSUPP); + + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. + * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid = -1; + nfs4_check_drain_bc_complete(session); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); +} + +#else /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) +{ + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + *op = &callback_ops[op_nr]; + break; + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static __be32 process_op(uint32_t minorversion, int nop, + struct svc_rqst *rqstp, + struct xdr_stream *xdr_in, void *argp, + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) +{ + struct callback_op *op = &callback_ops[0]; + unsigned int op_nr; + __be32 status; + long maxlen; + __be32 res; + + dprintk("%s: start\n", __func__); + status = decode_op_hdr(xdr_in, &op_nr); + if (unlikely(status)) + return status; + + dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", + __func__, minorversion, nop, op_nr); + + status = minorversion ? 
preprocess_nfs41_op(nop, op_nr, &op) : + preprocess_nfs4_op(op_nr, &op); + if (status == htonl(NFS4ERR_OP_ILLEGAL)) + op_nr = OP_CB_ILLEGAL; + if (status) + goto encode_hdr; + + if (cps->drc_status) { + status = cps->drc_status; + goto encode_hdr; + } + + maxlen = xdr_out->end - xdr_out->p; + if (maxlen > 0 && maxlen < PAGE_SIZE) { + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) + status = op->process_op(argp, resp, cps); + } else + status = htonl(NFS4ERR_RESOURCE); + +encode_hdr: + res = encode_op_hdr(xdr_out, op_nr, status); + if (unlikely(res)) + return res; + if (op->encode_res != NULL && status == 0) + status = op->encode_res(rqstp, xdr_out, resp); + dprintk("%s: done, status = %d\n", __func__, ntohl(status)); + return status; +} + +/* + * Decode, process and encode a COMPOUND + */ +static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) +{ + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; + struct xdr_stream xdr_in, xdr_out; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + .slotid = -1, + }; + unsigned int nops = 0; + + dprintk("%s: start\n", __func__); + + xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + + p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + + status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + if (status == __constant_htonl(NFS4ERR_RESOURCE)) + return rpc_garbage_args; + + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + return rpc_drop_reply; + } + + hdr_res.taglen = hdr_arg.taglen; + hdr_res.tag = hdr_arg.tag; + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + return rpc_system_err; + + while (status == 0 && nops != hdr_arg.nops) { + status = process_op(hdr_arg.minorversion, nops, rqstp, + &xdr_in, argp, &xdr_out, resp, &cps); + nops++; + } + + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + + *hdr_res.status = status; + *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(&cps); + nfs_put_client(cps.clp); + dprintk("%s: done, status = %u\n", __func__, ntohl(status)); + return rpc_success; +} + +/* + * Define NFS4 callback COMPOUND ops. 
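+ * Each entry supplies the op's argument decoder, its processing routine, an
+ * optional result encoder, and the maximum size of its encoded reply.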
+ */ +static struct callback_op callback_ops[] = { + [0] = { + .res_maxsize = CB_OP_HDR_RES_MAXSZ, + }, + [OP_CB_GETATTR] = { + .process_op = (callback_process_op_t)nfs4_callback_getattr, + .decode_args = (callback_decode_arg_t)decode_getattr_args, + .encode_res = (callback_encode_res_t)encode_getattr_res, + .res_maxsize = CB_OP_GETATTR_RES_MAXSZ, + }, + [OP_CB_RECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_recall, + .decode_args = (callback_decode_arg_t)decode_recall_args, + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, +#if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, + .encode_res = (callback_encode_res_t)encode_cb_sequence_res, + .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, + }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/* + * Define NFS4 callback procedures + */ +static struct svc_procedure nfs4_callback_procedures1[] = { + [CB_NULL] = { + .pc_func = nfs4_callback_null, + .pc_decode = (kxdrproc_t)nfs4_decode_void, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_xdrressize = 1, + }, + [CB_COMPOUND] = { + .pc_func = nfs4_callback_compound, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_argsize = 256, + .pc_ressize = 256, + .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + } +}; + +struct svc_version nfs4_callback_version1 = { + .vs_vers = 1, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; + +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, +}; diff --git a/fs/nfs/client.c b/fs/nfs/client.c new file mode 100644 index 00000000..b3dc2b88 --- /dev/null +++ b/fs/nfs/client.c @@ -0,0 +1,2004 @@ +/* client.c: NFS client sharing and management code + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +static DEFINE_SPINLOCK(nfs_client_lock); +static LIST_HEAD(nfs_client_list); +static LIST_HEAD(nfs_volume_list); +static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +#ifdef CONFIG_NFS_V4 +static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nfs_client_lock); + ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} +#endif /* CONFIG_NFS_V4 */ + +/* + * Turn off NFSv4 uid/gid mapping when using AUTH_SYS + */ +static int nfs4_disable_idmapping = 0; + +/* + * RPC cruft for NFS + */ +static struct rpc_version *nfs_version[5] = { + [2] = &nfs_version2, +#ifdef CONFIG_NFS_V3 + [3] = &nfs_version3, +#endif +#ifdef CONFIG_NFS_V4 + [4] = &nfs_version4, +#endif +}; + +struct rpc_program nfs_program = { + .name = "nfs", + .number = NFS_PROGRAM, + .nrvers = ARRAY_SIZE(nfs_version), + .version = nfs_version, + .stats = &nfs_rpcstat, + .pipe_dir_name = "/nfs", +}; + +struct rpc_stat nfs_rpcstat = { + .program = &nfs_program +}; + + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static struct rpc_version * nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; +#endif /* CONFIG_NFS_V3_ACL */ + +struct nfs_client_initdata { + const char *hostname; + const struct sockaddr *addr; + size_t addrlen; + const struct nfs_rpc_ops *rpc_ops; + int proto; + u32 minorversion; +}; + +/* + * Allocate a shared client record + * + * Since these are allocated/deallocated very rarely, we don't + * bother putting them in a slab cache... 
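+ * (a plain kzalloc()/kfree() pair is used instead).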
+ */ +static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +{ + struct nfs_client *clp; + struct rpc_cred *cred; + int err = -ENOMEM; + + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) + goto error_0; + + clp->rpc_ops = cl_init->rpc_ops; + + atomic_set(&clp->cl_count, 1); + clp->cl_cons_state = NFS_CS_INITING; + + memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); + clp->cl_addrlen = cl_init->addrlen; + + if (cl_init->hostname) { + err = -ENOMEM; + clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); + if (!clp->cl_hostname) + goto error_cleanup; + } + + INIT_LIST_HEAD(&clp->cl_superblocks); + clp->cl_rpcclient = ERR_PTR(-EINVAL); + + clp->cl_proto = cl_init->proto; + +#ifdef CONFIG_NFS_V4 + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error_cleanup; + + spin_lock_init(&clp->cl_lock); + INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; +#endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +#if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); +#endif + nfs_fscache_get_client_cookie(clp); + + return clp; + +error_cleanup: + kfree(clp); +error_0: + return ERR_PTR(err); +} + +#ifdef CONFIG_NFS_V4 +#ifdef CONFIG_NFS_V4_1 +static void nfs4_shutdown_session(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + nfs4_destroy_session(clp->cl_session); +} +#else /* CONFIG_NFS_V4_1 */ +static void nfs4_shutdown_session(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_mvops->minor_version); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + nfs4_shutdown_session(clp); + nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); +} + +/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ +void nfs_cleanup_cb_ident_idr(void) +{ + idr_destroy(&cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + if (clp->cl_cb_ident) + idr_remove(&cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + +#else +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +} + +void nfs_cleanup_cb_ident_idr(void) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server) +{ +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Destroy a shared client record + */ +static void nfs_free_client(struct nfs_client *clp) +{ + dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); + + nfs4_shutdown_client(clp); + + nfs_fscache_release_client_cookie(clp); + + /* -EIO all pending I/O */ + if (!IS_ERR(clp->cl_rpcclient)) + rpc_shutdown_client(clp->cl_rpcclient); + + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + + nfs4_deviceid_purge_client(clp); + + 
kfree(clp->cl_hostname); + kfree(clp); + + dprintk("<-- nfs_free_client()\n"); +} + +/* + * Release a reference to a shared client record + */ +void nfs_put_client(struct nfs_client *clp) +{ + if (!clp) + return; + + dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + + if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { + list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); + spin_unlock(&nfs_client_lock); + + BUG_ON(!list_empty(&clp->cl_superblocks)); + + nfs_free_client(clp); + } +} +EXPORT_SYMBOL_GPL(nfs_put_client); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + sin1->sin6_scope_id != sin2->sin6_scope_id) + return 0; + + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); +} +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + return 0; +} +#endif + +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} + +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, excluding the port number. 
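+ * Addresses of different families never match; otherwise the comparison is
+ * handed off to the family-specific helpers above.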
+ */ +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6(sa1, sa2); + } + return 0; +} + +/* Common match routine for v4.0 and v4.1 callback services */ +bool +nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, + u32 minorversion) +{ + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; + + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; + + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; + + return true; +} + +/* + * Find an nfs_client on the list that matches the initialisation data + * that is supplied. + */ +static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) +{ + struct nfs_client *clp; + const struct sockaddr *sap = data->addr; + + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + /* Don't match clients that failed to initialise properly */ + if (clp->cl_cons_state < 0) + continue; + + /* Different NFS versions cannot share the same nfs_client */ + if (clp->rpc_ops != data->rpc_ops) + continue; + + if (clp->cl_proto != data->proto) + continue; + /* Match nfsv4 minorversion */ + if (clp->cl_minorversion != data->minorversion) + continue; + /* Match the full socket address */ + if (!nfs_sockaddr_cmp(sap, clap)) + continue; + + atomic_inc(&clp->cl_count); + return clp; + } + return NULL; +} + +/* + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist + */ +static struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) +{ + struct nfs_client *clp, *new = NULL; + int error; + + dprintk("--> nfs_get_client(%s,v%u)\n", + cl_init->hostname ?: "", cl_init->rpc_ops->version); + + /* see if the client already exists */ + do { + spin_lock(&nfs_client_lock); + + clp = nfs_match_client(cl_init); + if (clp) + goto found_client; + if (new) + goto install_client; + + spin_unlock(&nfs_client_lock); + + new = nfs_alloc_client(cl_init); + } while (!IS_ERR(new)); + + dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new)); + return new; + + /* install a new client and return with it unready */ +install_client: + clp = new; + list_add(&clp->cl_share_link, &nfs_client_list); + spin_unlock(&nfs_client_lock); + + error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, + authflavour, noresvport); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(error); + } + 
dprintk("--> nfs_get_client() = %p [new]\n", clp); + return clp; + + /* found an existing client + * - make sure it's ready before returning + */ +found_client: + spin_unlock(&nfs_client_lock); + + if (new) + nfs_free_client(new); + + error = wait_event_killable(nfs_client_active_wq, + clp->cl_cons_state < NFS_CS_INITING); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(-ERESTARTSYS); + } + + if (clp->cl_cons_state < NFS_CS_READY) { + error = clp->cl_cons_state; + nfs_put_client(clp); + return ERR_PTR(error); + } + + BUG_ON(clp->cl_cons_state != NFS_CS_READY); + + dprintk("--> nfs_get_client() = %p [share]\n", clp); + return clp; +} + +/* + * Mark a server as ready or failed + */ +void nfs_mark_client_ready(struct nfs_client *clp, int state) +{ + clp->cl_cons_state = state; + wake_up_all(&nfs_client_active_wq); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. + */ +int nfs4_check_client_ready(struct nfs_client *clp) +{ + if (!nfs4_has_session(clp)) + return 0; + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + return 0; +} + +/* + * Initialise the timeout values for a connection + */ +static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, + unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + + switch (proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; + if (to->to_initval == 0) + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) + to->to_maxval = NFS_MAX_TCP_TIMEOUT; + if (to->to_maxval < to->to_initval) + to->to_maxval = to->to_initval; + to->to_exponential = 0; + break; + case XPRT_TRANSPORT_UDP: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; + if (!to->to_initval) + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + default: + BUG(); + } +} + +/* + * Create an RPC client handle + */ +static int nfs_create_rpc_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + rpc_authflavor_t flavor, + int discrtry, int noresvport) +{ + struct rpc_clnt *clnt = NULL; + struct rpc_create_args args = { + .net = &init_net, + .protocol = clp->cl_proto, + .address = (struct sockaddr *)&clp->cl_addr, + .addrsize = clp->cl_addrlen, + .timeout = timeparms, + .servername = clp->cl_hostname, + .program = &nfs_program, + .version = clp->rpc_ops->version, + .authflavor = flavor, + }; + + if (discrtry) + args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + if (!IS_ERR(clp->cl_rpcclient)) + return 0; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. 
Error = %ld\n", + __func__, PTR_ERR(clnt)); + return PTR_ERR(clnt); + } + + clp->cl_rpcclient = clnt; + return 0; +} + +/* + * Version 2 or 3 client destruction + */ +static void nfs_destroy_server(struct nfs_server *server) +{ + if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || + !(server->flags & NFS_MOUNT_LOCAL_FCNTL)) + nlmclnt_done(server->nlm_host); +} + +/* + * Version 2 or 3 lockd setup + */ +static int nfs_start_lockd(struct nfs_server *server) +{ + struct nlm_host *host; + struct nfs_client *clp = server->nfs_client; + struct nlmclnt_initdata nlm_init = { + .hostname = clp->cl_hostname, + .address = (struct sockaddr *)&clp->cl_addr, + .addrlen = clp->cl_addrlen, + .nfs_version = clp->rpc_ops->version, + .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? + 1 : 0, + }; + + if (nlm_init.nfs_version > 3) + return 0; + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) + return 0; + + switch (clp->cl_proto) { + default: + nlm_init.protocol = IPPROTO_TCP; + break; + case XPRT_TRANSPORT_UDP: + nlm_init.protocol = IPPROTO_UDP; + } + + host = nlmclnt_init(&nlm_init); + if (IS_ERR(host)) + return PTR_ERR(host); + + server->nlm_host = host; + server->destroy = nfs_destroy_server; + return 0; +} + +/* + * Initialise an NFSv3 ACL client connection + */ +#ifdef CONFIG_NFS_V3_ACL +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->nfs_client->rpc_ops->version != 3) + goto out_noacl; + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +/* + * Create a general RPC client + */ +static int nfs_init_server_rpcclient(struct nfs_server *server, + const struct rpc_timeout *timeo, + rpc_authflavor_t pseudoflavour) +{ + struct nfs_client *clp = server->nfs_client; + + server->client = rpc_clone_client(clp->cl_rpcclient); + if (IS_ERR(server->client)) { + dprintk("%s: couldn't create rpc_client!\n", __func__); + return PTR_ERR(server->client); + } + + memcpy(&server->client->cl_timeout_default, + timeo, + sizeof(server->client->cl_timeout_default)); + server->client->cl_timeout = &server->client->cl_timeout_default; + + if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { + struct rpc_auth *auth; + + auth = rpcauth_create(pseudoflavour, server->client); + if (IS_ERR(auth)) { + dprintk("%s: couldn't create credcache!\n", __func__); + return PTR_ERR(auth); + } + } + server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + + return 0; +} + +/* + * Initialise an NFS2 or NFS3 client + */ +int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is already initialised */ + dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* + * Create a client RPC handle for doing FSSTAT with UNIX auth only + * - RFC 2623, sec 2.3.2 + */ + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, + 0, noresvport); + if (error < 0) + goto error; + 
nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs_init_client() = xerror %d\n", error); + return error; +} + +/* + * Create a version 2 or 3 client + */ +static int nfs_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct nfs_client_initdata cl_init = { + .hostname = data->nfs_server.hostname, + .addr = (const struct sockaddr *)&data->nfs_server.address, + .addrlen = data->nfs_server.addrlen, + .rpc_ops = &nfs_v2_clientops, + .proto = data->nfs_server.protocol, + }; + struct rpc_timeout timeparms; + struct nfs_client *clp; + int error; + + dprintk("--> nfs_init_server()\n"); + +#ifdef CONFIG_NFS_V3 + if (data->version == 3) + cl_init.rpc_ops = &nfs_v3_clientops; +#endif + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, + data->flags & NFS_MOUNT_NORESVPORT); + if (IS_ERR(clp)) { + dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + return PTR_ERR(clp); + } + + server->nfs_client = clp; + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + /* Start lockd here, before we might error out */ + error = nfs_start_lockd(server); + if (error < 0) + goto error; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + if (error < 0) + goto error; + + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + + server->namelen = data->namlen; + /* Create a client RPC handle for the NFSv3 ACL management interface */ + nfs_init_server_aclclient(server); + dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); + return 0; + +error: + server->nfs_client = NULL; + nfs_put_client(clp); + dprintk("<-- nfs_init_server() = xerror %d\n", error); + return error; +} + +/* + * Load up the server record from information gained in an fsinfo record + */ +static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +{ + unsigned long max_rpc_payload; + + /* Work out a lot of parameters */ + if (server->rsize == 0) + server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + if (server->wsize == 0) + server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + + if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) + server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) + server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + + max_rpc_payload = 
nfs_block_size(rpc_max_payload(server->client), NULL); + if (server->rsize > max_rpc_payload) + server->rsize = max_rpc_payload; + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + server->backing_dev_info.name = "nfs"; + server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + set_pnfs_layoutdriver(server, fsinfo->layouttype); + + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + if (server->flags & NFS_MOUNT_NOAC) { + server->acregmin = server->acregmax = 0; + server->acdirmin = server->acdirmax = 0; + } + + server->maxfilesize = fsinfo->maxfilesize; + + server->time_delta = fsinfo->time_delta; + + /* We're airborne Set socket buffersize */ + rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); +} + +/* + * Probe filesystem information, including the FSID on v2/v3 + */ +static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +{ + struct nfs_fsinfo fsinfo; + struct nfs_client *clp = server->nfs_client; + int error; + + dprintk("--> nfs_probe_fsinfo()\n"); + + if (clp->rpc_ops->set_capabilities != NULL) { + error = clp->rpc_ops->set_capabilities(server, mntfh); + if (error < 0) + goto out_error; + } + + fsinfo.fattr = fattr; + fsinfo.layouttype = 0; + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; + + nfs_server_set_fsinfo(server, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { + struct nfs_pathconf pathinfo; + + pathinfo.fattr = fattr; + nfs_fattr_init(fattr); + + if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) + server->namelen = pathinfo.max_namelen; + } + + dprintk("<-- nfs_probe_fsinfo() = 0\n"); + return 0; + +out_error: + dprintk("nfs_probe_fsinfo: error = %d\n", -error); + return error; +} + +/* + * Copy useful information when duplicating a server record + */ +static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +{ + target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; + target->acregmin = source->acregmin; + target->acregmax = source->acregmax; + target->acdirmin = source->acdirmin; + target->acdirmax = source->acdirmax; + target->caps = source->caps; + target->options = source->options; +} + +static void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + spin_unlock(&nfs_client_lock); + +} + +static void nfs_server_remove_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_del_rcu(&server->client_link); + if (clp && list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + 
list_del(&server->master_link); + spin_unlock(&nfs_client_lock); + + synchronize_rcu(); +} + +/* + * Allocate and initialise a server record + */ +static struct nfs_server *nfs_alloc_server(void) +{ + struct nfs_server *server; + + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + return NULL; + + server->client = server->client_acl = ERR_PTR(-EINVAL); + + /* Zero out the NFS state stuff */ + INIT_LIST_HEAD(&server->client_link); + INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); + + atomic_set(&server->active, 0); + + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + kfree(server); + return NULL; + } + + if (bdi_init(&server->backing_dev_info)) { + nfs_free_iostats(server->io_stats); + kfree(server); + return NULL; + } + + pnfs_init_server(server); + + return server; +} + +/* + * Free up a server record + */ +void nfs_free_server(struct nfs_server *server) +{ + dprintk("--> nfs_free_server()\n"); + + nfs_server_remove_lists(server); + unset_pnfs_layoutdriver(server); + + if (server->destroy != NULL) + server->destroy(server); + + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); + if (!IS_ERR(server->client)) + rpc_shutdown_client(server->client); + + nfs_put_client(server->nfs_client); + + nfs_free_iostats(server->io_stats); + bdi_destroy(&server->backing_dev_info); + kfree(server); + nfs_release_automount_timer(); + dprintk("<-- nfs_free_server()\n"); +} + +/* + * Create a version 2 or 3 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + + /* Get a client representation */ + error = nfs_init_server(server, data); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + + if (!(fattr->valid & NFS_ATTR_FATTR)) { + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + nfs_free_fattr(fattr); + return server; + +error: + nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); +} + +#ifdef CONFIG_NFS_V4 +/* + * NFSv4.0 callback thread helper + * + * Find a client by IP address, protocol version, and minorversion + * + * Called from the pg_authenticate method. The callback identifier + * is not used as it has not been decoded. 
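+ * Instead, the nfs_client is matched on the callback's source address alone
+ * (minor version 0 clients only).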
+ * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_no_ident(const struct sockaddr *addr) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 0) == false) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(int cb_ident) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = idr_find(&cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ + int error; + + if (clp->rpc_ops->version == 4) { + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel( + clp->cl_rpcclient->cl_xprt, + NFS41_BC_MIN_CALLBACKS); + if (error < 0) + return error; + } + + error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + } + return 0; +} + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + /* + * The create session reply races with the server back + * channel probe. 
Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; + } +#endif /* CONFIG_NFS_V4_1 */ + + return nfs4_init_callback(clp); +} + +/* + * Initialise an NFS4 client record + */ +int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + error = nfs_create_rpc_client(clp, timeparms, authflavour, + 1, noresvport); + if (error < 0) + goto error; + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return error; +} + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion) +{ + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = addr, + .addrlen = addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = proto, + .minorversion = minorversion, + }; + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, + server->flags & NFS_MOUNT_NORESVPORT); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + }; + struct rpc_timeout ds_timeout = { + .to_initval = 15 * HZ, + .to_maxval = 15 * HZ, + .to_retries = 1, + .to_exponential = 1, + }; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. 
Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. + */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server); + if (error < 0) + goto out; + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; +out: + nfs_free_fattr(fattr); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct rpc_timeout timeparms; + int error; + + dprintk("--> nfs4_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + server->options = data->options; + + /* Get a client record */ + error = nfs4_set_client(server, + data->nfs_server.hostname, + (const struct sockaddr *)&data->nfs_server.address, + data->nfs_server.addrlen, + data->client_address, + data->auth_flavors[0], + data->nfs_server.protocol, + &timeparms, + data->minorversion); + if (error < 0) + goto error; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. 
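+ * This behaviour is gated by the nfs4_disable_idmapping module parameter.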
+ */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + +error: + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* set up the general RPC client */ + error = nfs4_init_server(server, data); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *mntfh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; + + /* Get a client representation. 
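+ * The parent server's transport protocol and timeout values are reused.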
+ * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Clone an NFS2, NFS3 or NFS4 server record + */ +struct nfs_server *nfs_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", + (unsigned long long) fattr->fsid.major, + (unsigned long long) fattr->fsid.minor); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr_fsinfo = nfs_alloc_fattr(); + if (fattr_fsinfo == NULL) + goto out_free_server; + + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + atomic_inc(&server->nfs_client->cl_count); + nfs_server_copy_userdata(server, source); + + server->fsid = fattr->fsid; + + error = nfs_init_server_rpcclient(server, + source->client->cl_timeout, + source->client->cl_auth->au_flavor); + if (error < 0) + goto out_free_server; + if (!IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ + error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + dprintk("Cloned FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + error = nfs_start_lockd(server); + if (error < 0) + goto out_free_server; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + + nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + +out_free_server: + nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_fs_nfs; + +static int nfs_server_list_open(struct inode *inode, struct file *file); +static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_server_list_stop(struct seq_file *p, void *v); +static int nfs_server_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_server_list_ops = { + .start = nfs_server_list_start, + .next = nfs_server_list_next, + .stop = nfs_server_list_stop, + .show = nfs_server_list_show, +}; + +static const struct file_operations nfs_server_list_fops = { + .open = nfs_server_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int nfs_volume_list_open(struct inode *inode, struct file *file); +static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); +static void 
*nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_volume_list_stop(struct seq_file *p, void *v); +static int nfs_volume_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_volume_list_ops = { + .start = nfs_volume_list_start, + .next = nfs_volume_list_next, + .stop = nfs_volume_list_stop, + .show = nfs_volume_list_show, +}; + +static const struct file_operations nfs_volume_list_fops = { + .open = nfs_volume_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +/* + * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which + * we're dealing + */ +static int nfs_server_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_server_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the server list and return the first item + */ +static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + return seq_list_start_head(&nfs_client_list, *_pos); +} + +/* + * move to next server + */ +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &nfs_client_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_server_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_server_list_show(struct seq_file *m, void *v) +{ + struct nfs_client *clp; + + /* display header on line 1 */ + if (v == &nfs_client_list) { + seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); + return 0; + } + + /* display one transport per line on subsequent lines */ + clp = list_entry(v, struct nfs_client, cl_share_link); + + seq_printf(m, "v%u %s %s %3d %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + atomic_read(&clp->cl_count), + clp->cl_hostname); + + return 0; +} + +/* + * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes + */ +static int nfs_volume_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_volume_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the volume list and return the first item + */ +static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + return seq_list_start_head(&nfs_volume_list, *_pos); +} + +/* + * move to next volume + */ +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &nfs_volume_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_volume_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_volume_list_show(struct seq_file *m, void *v) +{ + struct nfs_server *server; + struct nfs_client *clp; + char dev[8], fsid[17]; + + /* display header on line 1 */ + if (v == &nfs_volume_list) { + seq_puts(m, "NV SERVER 
PORT DEV FSID FSC\n"); + return 0; + } + /* display one transport per line on subsequent lines */ + server = list_entry(v, struct nfs_server, master_link); + clp = server->nfs_client; + + snprintf(dev, 8, "%u:%u", + MAJOR(server->s_dev), MINOR(server->s_dev)); + + snprintf(fsid, 17, "%llx:%llx", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + dev, + fsid, + nfs_server_fscache_state(server)); + + return 0; +} + +/* + * initialise the /proc/fs/nfsfs/ directory + */ +int __init nfs_fs_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); + if (!proc_fs_nfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_create("servers", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_server_list_fops); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_create("volumes", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_volume_list_fops); + if (!p) + goto error_2; + return 0; + +error_2: + remove_proc_entry("servers", proc_fs_nfs); +error_1: + remove_proc_entry("fs/nfsfs", NULL); +error_0: + return -ENOMEM; +} + +/* + * clean up the /proc/fs/nfsfs/ directory + */ +void nfs_fs_proc_exit(void) +{ + remove_proc_entry("volumes", proc_fs_nfs); + remove_proc_entry("servers", proc_fs_nfs); + remove_proc_entry("fs/nfsfs", NULL); +} + +#endif /* CONFIG_PROC_FS */ + +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c new file mode 100644 index 00000000..ecabbd8f --- /dev/null +++ b/fs/nfs/delegation.c @@ -0,0 +1,716 @@ +/* + * linux/fs/nfs/delegation.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFS file delegation management + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +static void nfs_free_delegation(struct nfs_delegation *delegation) +{ + if (delegation->cred) { + put_rpccred(delegation->cred); + delegation->cred = NULL; + } + kfree_rcu(delegation, rcu); +} + +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); +} + +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. 
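+ * A successful check also marks the delegation as recently referenced.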
+ */ +int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + struct nfs_delegation *delegation; + int ret = 0; + + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags) { + nfs_mark_delegation_referenced(delegation); + ret = 1; + } + rcu_read_unlock(); + return ret; +} + +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct file_lock *fl; + int status = 0; + + if (inode->i_flock == NULL) + goto out; + + /* Protect inode->i_flock using the file locks lock */ + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) + continue; + if (nfs_file_open_context(fl->fl_file) != ctx) + continue; + unlock_flocks(); + status = nfs4_lock_delegation_recall(state, fl); + if (status < 0) + goto out; + lock_flocks(); + } + unlock_flocks(); +out: + return status; +} + +static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + int err; + +again: + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); + err = nfs4_open_delegation_recall(ctx, state, stateid); + if (err >= 0) + err = nfs_delegation_claim_locks(ctx, state); + put_nfs_open_context(ctx); + if (err != 0) + return err; + goto again; + } + spin_unlock(&inode->i_lock); + return 0; +} + +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * + */ +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) +{ + struct nfs_delegation *delegation; + struct rpc_cred *oldcred = NULL; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; + delegation->cred = get_rpccred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags); + NFS_I(inode)->delegation_state = delegation->type; + spin_unlock(&delegation->lock); + put_rpccred(oldcred); + rcu_read_unlock(); + } else { + /* We appear to have raced with a delegation return. 
+ */ + spin_unlock(&delegation->lock); + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); + } else { + rcu_read_unlock(); + } +} + +static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + int res = 0; + + res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); + nfs_free_delegation(delegation); + return res; +} + +static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation) +{ + struct inode *inode = NULL; + + spin_lock(&delegation->lock); + if (delegation->inode != NULL) + inode = igrab(delegation->inode); + spin_unlock(&delegation->lock); + return inode; +} + +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_delegation *delegation = + rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&server->nfs_client->cl_lock)); + + if (delegation == NULL) + goto nomatch; + + spin_lock(&delegation->lock); + list_del_rcu(&delegation->super_list); + delegation->inode = NULL; + nfsi->delegation_state = 0; + rcu_assign_pointer(nfsi->delegation, NULL); + spin_unlock(&delegation->lock); + return delegation; +nomatch: + return NULL; +} + +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, server); + spin_unlock(&clp->cl_lock); + return delegation; +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. + */ +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation, *old_delegation; + struct nfs_delegation *freeme = NULL; + int status = 0; + + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; + delegation->cred = get_rpccred(cred); + delegation->inode = inode; + delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + spin_lock_init(&delegation->lock); + + spin_lock(&clp->cl_lock); + old_delegation = rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + if (old_delegation != NULL) { + if (memcmp(&delegation->stateid, &old_delegation->stateid, + sizeof(old_delegation->stateid)) == 0 && + delegation->type == old_delegation->type) { + goto out; + } + /* + * Deal with broken servers that hand out two + * delegations for the same file.
+ */ + dfprintk(FILE, "%s: server %s handed out " + "a duplicate delegation!\n", + __func__, clp->cl_hostname); + if (delegation->type <= old_delegation->type) { + freeme = delegation; + delegation = NULL; + goto out; + } + freeme = nfs_detach_delegation_locked(nfsi, server); + } + list_add_rcu(&delegation->super_list, &server->delegations); + nfsi->delegation_state = delegation->type; + rcu_assign_pointer(nfsi->delegation, delegation); + delegation = NULL; + + /* Ensure we revalidate the attributes and page cache! */ + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); + +out: + spin_unlock(&clp->cl_lock); + if (delegation != NULL) + nfs_free_delegation(delegation); + if (freeme != NULL) + nfs_do_return_delegation(inode, freeme, 0); + return status; +} + +/* + * Basic procedure for returning a delegation to the server + */ +static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int err; + + /* + * Guard against new delegated open/lock/unlock calls and against + * state recovery + */ + down_write(&nfsi->rwsem); + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + up_write(&nfsi->rwsem); + if (err) + goto out; + + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Returns zero on success, or a negative errno value. + */ +int nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + int err = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, + &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, + delegation, 0); + } + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; + } + } + rcu_read_unlock(); + return 0; +} + +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). + */ +void nfs_inode_return_delegation_noreclaim(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + + if (rcu_access_pointer(nfsi->delegation) != NULL) { + delegation = nfs_detach_delegation(nfsi, server); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); + } +} + +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * Returns zero on success, or a negative errno value. 
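/*
 * Illustrative only, not part of this patch: a hypothetical caller showing the
 * intended use of the helper below.  Before an operation whose result must come
 * from the server rather than from locally delegated state, the client is
 * expected to hand the delegation back first.
 */
static int demo_return_delegation_before_server_op(struct inode *inode)
{
	int err;

	/* flushes dirty pages and sends DELEGRETURN synchronously */
	err = nfs_inode_return_delegation(inode);
	if (err)
		return err;

	/* ... proceed with the RPC that conflicts with holding a delegation ... */
	return 0;
}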
+ */ +int nfs_inode_return_delegation(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int err = 0; + + if (rcu_access_pointer(nfsi->delegation) != NULL) { + delegation = nfs_detach_delegation(nfsi, server); + if (delegation != NULL) { + nfs_wb_all(inode); + err = __nfs_inode_return_delegation(inode, delegation, 1); + } + } + return err; +} + +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * + */ +void nfs_super_return_all_delegations(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + if (clp == NULL) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + } + rcu_read_unlock(); + + if (nfs_client_return_marked_delegations(clp) != 0) + nfs4_schedule_state_manager(clp); +} + +static void nfs_mark_return_all_delegation_types(struct nfs_server *server, + fmode_t flags) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) + continue; + if (delegation->type & flags) + nfs_mark_return_delegation(server, delegation); + } +} + +static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_all_delegation_types(server, flags); + rcu_read_unlock(); +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +void nfs_remove_bad_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } +} + +/** + * nfs_expire_all_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +{ + nfs_client_mark_return_all_delegation_types(clp, flags); + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + +/** + * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + */ +void nfs_handle_cb_pathdown(struct nfs_client *clp) +{ + if (clp == NULL) + return; + nfs_client_mark_return_all_delegations(clp); +} + +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + 
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) + continue; + nfs_mark_return_delegation(server, delegation); + } +} + +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ +void nfs_expire_unreferenced_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information from CB_RECALL arguments + * + * Returns zero on success, or a negative errno value. + */ +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } + nfs_mark_return_delegation(server, delegation); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); + return 0; +} + +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + struct nfs_delegation *delegation; + struct inode *res = NULL; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL && + nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { + res = igrab(delegation->inode); + } + spin_unlock(&delegation->lock); + if (res != NULL) + break; + } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. 
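/*
 * Illustrative sketch, not part of this patch: roughly how a CB_RECALL handler
 * is expected to combine nfs_delegation_find_inode() (below) with
 * nfs_async_inode_return_delegation().  The function name, argument list and
 * error mapping here are hypothetical and heavily simplified.
 */
static __be32 demo_handle_cb_recall(struct nfs_client *clp,
				    const struct nfs_fh *fh,
				    const nfs4_stateid *stateid)
{
	struct inode *inode;
	__be32 res;

	inode = nfs_delegation_find_inode(clp, fh);	/* holds a reference via igrab() */
	if (inode == NULL)
		return htonl(NFS4ERR_BADHANDLE);

	if (nfs_async_inode_return_delegation(inode, stateid) == 0)
		res = 0;				/* NFS4_OK: return has been queued */
	else
		res = htonl(NFS4ERR_BAD_STATEID);	/* stateid did not match our delegation */

	iput(inode);					/* drop the reference taken above */
	return res;
}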
+ */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } + rcu_read_unlock(); + return res; +} + +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * + */ +void nfs_delegation_mark_reclaim(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); + rcu_read_unlock(); +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } + } + rcu_read_unlock(); +} + +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * + * Returns one and fills in "dst->data" * if inode had a delegation, + * otherwise zero is returned. 
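/*
 * Illustrative only, not part of this patch: a hypothetical caller of the
 * helper below.  I/O paths prefer the delegation stateid when one is held and
 * fall back to a stateid they already own (e.g. the open stateid) otherwise.
 */
static void demo_select_stateid(nfs4_stateid *dst, struct inode *inode,
				const nfs4_stateid *open_stateid)
{
	if (nfs4_copy_delegation_stateid(dst, inode))
		return;					/* delegation stateid copied into dst */
	memcpy(dst->data, open_stateid->data, sizeof(dst->data));
}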
+ */ +int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int ret = 0; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { + memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); + ret = 1; + } + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h new file mode 100644 index 00000000..691a7960 --- /dev/null +++ b/fs/nfs/delegation.h @@ -0,0 +1,80 @@ +/* + * linux/fs/nfs/delegation.h + * + * Copyright (c) Trond Myklebust + * + * Definitions pertaining to NFS delegated files + */ +#ifndef FS_NFS_DELEGATION_H +#define FS_NFS_DELEGATION_H + +#if defined(CONFIG_NFS_V4) +/* + * NFSv4 delegation + */ +struct nfs_delegation { + struct list_head super_list; + struct rpc_cred *cred; + struct inode *inode; + nfs4_stateid stateid; + fmode_t type; + loff_t maxsize; + __u64 change_attr; + unsigned long flags; + spinlock_t lock; + struct rcu_head rcu; +}; + +enum { + NFS_DELEGATION_NEED_RECLAIM = 0, + NFS_DELEGATION_RETURN, + NFS_DELEGATION_REFERENCED, +}; + +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_return_delegation(struct inode *inode); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_inode_return_delegation_noreclaim(struct inode *inode); + +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +void nfs_super_return_all_delegations(struct super_block *sb); +void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unreferenced_delegations(struct nfs_client *clp); +void nfs_handle_cb_pathdown(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); + +void nfs_delegation_mark_reclaim(struct nfs_client *clp); +void nfs_delegation_reap_unclaimed(struct nfs_client *clp); + +/* NFSv4 delegation-related procedures */ +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); +int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); +int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); + +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); +int nfs_have_delegation(struct inode *inode, fmode_t flags); + +#else +static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static inline int nfs_inode_return_delegation(struct inode *inode) +{ + return 0; +} +#endif + +static inline int nfs_have_delegated_attributes(struct inode *inode) +{ + return nfs_have_delegation(inode, FMODE_READ) && + !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); +} + +#endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c new file mode 100644 index 00000000..462a0060 --- /dev/null +++ b/fs/nfs/dir.c @@ -0,0 +1,2350 @@ +/* + * linux/fs/nfs/dir.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs directory handling functions + * + * 10 Apr 1996 Added silly rename for unlink 
--okir + * 28 Sep 1996 Improved directory cache --okir + * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de + * Re-implemented silly rename for unlink, newly implemented + * silly rename for nfs_rename() following the suggestions + * of Olaf Kirch (okir) found in this file. + * Following Linus comments on my original hack, this version + * depends only on the dcache stuff and doesn't touch the inode + * layer (iput() and friends). + * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" + +/* #define NFS_DEBUG_VERBOSE 1 */ + +static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); +static int nfs_readdir(struct file *, void *, filldir_t); +static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); +static int nfs_mkdir(struct inode *, struct dentry *, int); +static int nfs_rmdir(struct inode *, struct dentry *); +static int nfs_unlink(struct inode *, struct dentry *); +static int nfs_symlink(struct inode *, struct dentry *, const char *); +static int nfs_link(struct dentry *, struct inode *, struct dentry *); +static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); +static int nfs_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int nfs_fsync_dir(struct file *, int); +static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static void nfs_readdir_clear_array(struct page*); + +const struct file_operations nfs_dir_operations = { + .llseek = nfs_llseek_dir, + .read = generic_read_dir, + .readdir = nfs_readdir, + .open = nfs_opendir, + .release = nfs_closedir, + .fsync = nfs_fsync_dir, +}; + +const struct inode_operations nfs_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, +}; + +#ifdef CONFIG_NFS_V3 +const struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_V3 */ + +#ifdef CONFIG_NFS_V4 + +static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); +const struct inode_operations nfs4_dir_inode_operations = { + .create = nfs_open_create, + .lookup = nfs_atomic_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + 
.getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +}; + +#endif /* CONFIG_NFS_V4 */ + +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) +{ + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->attr_gencount = NFS_I(dir)->attr_gencount; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + return ctx; + } + return ERR_PTR(-ENOMEM); +} + +static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +{ + put_rpccred(ctx->cred); + kfree(ctx); +} + +/* + * Open file + */ +static int +nfs_opendir(struct inode *inode, struct file *filp) +{ + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; + + dfprintk(FILE, "NFS: open dir(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; + if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { + /* This is a mountpoint, so d_revalidate will never + * have been called, so we need to refresh the + * inode (for close-open consistency) ourselves. + */ + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } +out: + put_rpccred(cred); + return res; +} + +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->private_data); + return 0; +} + +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; + unsigned char d_type; +}; + +struct nfs_cache_array { + unsigned int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); +typedef struct { + struct file *file; + struct page *page; + unsigned long page_index; + u64 *dir_cookie; + u64 last_cookie; + loff_t current_index; + decode_dirent_t decode; + + unsigned long timestamp; + unsigned long gencount; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; +} nfs_readdir_descriptor_t; + +/* + * The caller is responsible for calling nfs_readdir_release_array(page) + */ +static +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) +{ + void *ptr; + if (page == NULL) + return ERR_PTR(-EIO); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; +} + +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +void nfs_readdir_clear_array(struct page *page) +{ + struct nfs_cache_array *array; + int i; + + array = kmap_atomic(page, KM_USER0); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + kunmap_atomic(array, KM_USER0); +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + if (string->name == NULL) + return -ENOMEM; + /* + * Avoid a kmemleak false positive. 
The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); + string->hash = full_name_hash(name, len); + return 0; +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + + if (IS_ERR(array)) + return PTR_ERR(array); + + cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; + array->last_cookie = entry->cookie; + array->size++; + if (entry->eof != 0) + array->eof_index = array->size; +out: + nfs_readdir_release_array(page); + return ret; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->file->f_pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index >= 0) + goto out_eof; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + loff_t new_pos; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); + struct nfs_open_dir_context *ctx = desc->file->private_data; + + new_pos = desc->current_index + i; + if (ctx->attr_gencount != nfsi->attr_gencount + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->file->f_pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop." + "Please contact your server vendor. 
" + "Offending cookie: %llu\n", + desc->file->f_dentry->d_parent->d_name.name, + desc->file->f_dentry->d_name.name, + *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = -1; + } + desc->file->f_pos = new_pos; + desc->cache_entry_index = i; + return 0; + } + } + if (array->eof_index >= 0) { + status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; + } +out: + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->page_index++; + } + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) +{ + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; + unsigned long timestamp, gencount; + int error; + + again: + timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + NFS_SERVER(inode)->dtsize, desc->plus); + if (error < 0) { + /* We requested READDIRPLUS, but the server doesn't grok it */ + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + desc->plus = 0; + goto again; + } + goto error; + } + desc->timestamp = timestamp; + desc->gencount = gencount; +error: + return error; +} + +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) +{ + int error; + + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; + return 0; +} + +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) +{ + if (dentry->d_inode == NULL) + goto different; + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) + goto different; + return 1; +different: + return 0; +} + +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +{ + struct qstr filename = { + .len = entry->len, + .name = entry->name, + }; + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct inode *inode; + + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + + dentry = d_lookup(parent, &filename); + if (dentry != NULL) { + if (nfs_same_file(dentry, entry)) { + nfs_refresh_inode(dentry->d_inode, entry->fattr); + goto out; + } else { + d_drop(dentry); + dput(dentry); + } + } + + dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + if (IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + else if (alias) { + 
nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); +} + +/* Perform conversion from xdr to cache array */ +static +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + struct page **xdr_pages, struct page *page, unsigned int buflen) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct nfs_cache_array *array; + unsigned int count = 0; + int status; + + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) { + if (status == -EAGAIN) + status = 0; + break; + } + + count++; + + if (desc->plus != 0) + nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; + } while (!entry->eof); + + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { + array = nfs_readdir_get_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); + } + + put_page(scratch); + return status; +} + +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + nfs_readdir_free_pagearray(pages, npages); +} + +/* + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page + */ +static +int nfs_readdir_large_page(struct page **pages, unsigned int npages) +{ + unsigned int i; + + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } + return 0; + +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return -ENOMEM; +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = -ENOMEM; + unsigned int array_size = ARRAY_SIZE(pages); + + entry.prev_cookie = 0; + entry.cookie = desc->last_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; + + array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) + goto out_release_array; + do { + unsigned int pglen; + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + + if (status < 0) + break; + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); + + nfs_readdir_free_large_page(pages_ptr, pages, array_size); +out_release_array: + nfs_readdir_release_array(page); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); + return status; +} + +/* + * Now we 
cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler + */ +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +{ + struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; + + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + SetPageUptodate(page); + + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { + /* Should never happen */ + nfs_zap_mapping(inode, inode->i_mapping); + } + unlock_page(page); + return 0; + error: + unlock_page(page); + return ret; +} + +static +void cache_page_release(nfs_readdir_descriptor_t *desc) +{ + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); + page_cache_release(desc->page); + desc->page = NULL; +} + +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +{ + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); +} + +/* + * Returns 0 if desc->dir_cookie was found on page desc->page_index + */ +static +int find_cache_page(nfs_readdir_descriptor_t *desc) +{ + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); + + res = nfs_readdir_search_array(desc); + if (res != 0) + cache_page_release(desc); + return res; +} + +/* Search for desc->dir_cookie from the beginning of the page cache */ +static inline +int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +{ + int res; + + if (desc->page_index == 0) { + desc->current_index = 0; + desc->last_cookie = 0; + } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); + return res; +} + +/* + * Once we've found the start of the dirent within a page: fill 'er up... + */ +static +int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, + filldir_t filldir) +{ + struct file *file = desc->file; + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } + + for (i = desc->cache_entry_index; i < array->size; i++) { + struct nfs_cache_array_entry *ent; + + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { + desc->eof = 1; + break; + } + file->f_pos++; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; + } + if (array->eof_index >= 0) + desc->eof = 1; + + nfs_readdir_release_array(desc->page); +out: + cache_page_release(desc); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", + (unsigned long long)*desc->dir_cookie, res); + return res; +} + +/* + * If we cannot find a cookie in our cache, we suspect that this is + * because it points to a deleted file, so we ask the server to return + * whatever it thinks is the next entry. We then feed this to filldir. + * If all goes well, we should then be able to find our way round the + * cache on the next call to readdir_search_pagecache(); + * + * NOTE: we cannot add the anonymous page to the pagecache because + * the data it contains might not be page aligned. 
Besides, + * we should already have a complete representation of the + * directory in the page cache by the time we get here. + */ +static inline +int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, + filldir_t filldir) +{ + struct page *page = NULL; + int status; + struct inode *inode = desc->file->f_path.dentry->d_inode; + struct nfs_open_dir_context *ctx = desc->file->private_data; + + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + (unsigned long long)*desc->dir_cookie); + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + status = -ENOMEM; + goto out; + } + + desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; + desc->page = page; + ctx->duped = 0; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + + status = nfs_do_filldir(desc, dirent, filldir); + + out: + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", + __func__, status); + return status; + out_release: + cache_page_release(desc); + goto out; +} + +/* The file offset position represents the dirent entry number. A + last cookie cache takes care of the common case of reading the + whole directory. + */ +static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = filp->private_data; + int res; + + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (long long)filp->f_pos); + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + + /* + * filp->f_pos points to the dirent entry number. + * *desc->dir_cookie has the cookie for the next entry. We have + * to either find the entry with the appropriate number or + * revalidate the cookie. 
+ */ + memset(desc, 0, sizeof(*desc)); + + desc->file = filp; + desc->dir_cookie = &dir_ctx->dir_cookie; + desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + nfs_block_sillyrename(dentry); + res = nfs_revalidate_mapping(inode, filp->f_mapping); + if (res < 0) + goto out; + + do { + res = readdir_search_pagecache(desc); + + if (res == -EBADCOOKIE) { + res = 0; + /* This means either end of directory */ + if (*desc->dir_cookie && desc->eof == 0) { + /* Or that the server has 'lost' a cookie */ + res = uncached_readdir(desc, dirent, filldir); + if (res == 0) + continue; + } + break; + } + if (res == -ETOOSMALL && desc->plus) { + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + nfs_zap_caches(inode); + desc->page_index = 0; + desc->plus = 0; + desc->eof = 0; + continue; + } + if (res < 0) + break; + + res = nfs_do_filldir(desc, dirent, filldir); + if (res < 0) + break; + } while (!desc->eof); +out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; + dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; +} + +static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct nfs_open_dir_context *dir_ctx = filp->private_data; + + dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + offset, origin); + + mutex_lock(&inode->i_mutex); + switch (origin) { + case 1: + offset += filp->f_pos; + case 0: + if (offset >= 0) + break; + default: + offset = -EINVAL; + goto out; + } + if (offset != filp->f_pos) { + filp->f_pos = offset; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +/* + * All directory operations under NFS are synchronous, so fsync() + * is a dummy operation. + */ +static int nfs_fsync_dir(struct file *filp, int datasync) +{ + struct dentry *dentry = filp->f_path.dentry; + + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + + nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); + return 0; +} + +/** + * nfs_force_lookup_revalidate - Mark the directory as having changed + * @dir - pointer to directory inode + * + * This forces the revalidation code in nfs_lookup_revalidate() to do a + * full lookup on all child dentries of 'dir' whenever a change occurs + * on the server that might have invalidated our dcache. + * + * The caller should be holding dir->i_lock + */ +void nfs_force_lookup_revalidate(struct inode *dir) +{ + NFS_I(dir)->cache_change_attribute++; +} + +/* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. 
+ */ +static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +{ + if (IS_ROOT(dentry)) + return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + /* Revalidate nfsi->cache_change_attribute before we declare a match */ + if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + return 1; +} + +/* + * Return the intent data that applies to this particular path component + * + * Note that the current set of intents only apply to the very last + * component of the path. + * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. + */ +static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, + unsigned int mask) +{ + if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) + return 0; + return nd->flags & mask; +} + +/* + * Use intent information to check whether or not we're going to do + * an O_EXCL create using this path component. + */ +static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) +{ + if (NFS_PROTO(dir)->version == 2) + return 0; + return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL); +} + +/* + * Inode and filehandle revalidation for lookups. + * + * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, + * or if the intent information indicates that we're about to open this + * particular file and the "nocto" mount flag is not set. + * + */ +static inline +int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) +{ + struct nfs_server *server = NFS_SERVER(inode); + + if (IS_AUTOMOUNT(inode)) + return 0; + if (nd != NULL) { + /* VFS wants an on-the-wire revalidation */ + if (nd->flags & LOOKUP_REVAL) + goto out_force; + /* This is an open(2) */ + if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && + !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || + S_ISDIR(inode->i_mode))) + goto out_force; + return 0; + } + return nfs_revalidate_inode(server, inode); +out_force: + return __nfs_revalidate_inode(server, inode); +} + +/* + * We judge how long we want to trust negative + * dentries by looking at the parent inode mtime. + * + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. + */ +static inline +int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + /* Don't revalidate a negative dentry if we're creating a new file */ + if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) + return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; + return !nfs_check_verifier(dir, dentry); +} + +/* + * This is called every time the dcache has a lookup hit, + * and we should check whether we can really trust that + * lookup. + * + * NOTE! The hit can be a negative hit too, don't assume + * we have an inode! + * + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. 
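/*
 * Illustrative sketch, not part of this patch, of the ->d_revalidate() contract
 * that nfs_lookup_revalidate() below implements: return 1 to keep the cached
 * dentry, 0 to make the VFS drop it and repeat the lookup, and -ECHILD to punt
 * out of RCU-walk mode.  This body only shows the verifier comparison; the real
 * routine additionally revalidates the inode and filehandle.
 */
static int demo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	struct dentry *parent;
	struct inode *dir;
	int valid;

	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;			/* cannot take references or sleep here */

	parent = dget_parent(dentry);
	dir = parent->d_inode;

	/* trust the dentry only while the parent's change attribute still matches
	 * the verifier stored in dentry->d_time by nfs_set_verifier() */
	valid = nfs_verify_change_attribute(dir, dentry->d_time);

	dput(parent);
	return valid ? 1 : 0;
}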
+ */ +static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *dir; + struct inode *inode; + struct dentry *parent; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + int error; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + dir = parent->d_inode; + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); + inode = dentry->d_inode; + + if (!inode) { + if (nfs_neg_need_reval(dir, dentry, nd)) + goto out_bad; + goto out_valid; + } + + if (is_bad_inode(inode)) { + dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + goto out_bad; + } + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_set_verifier; + + /* Force a full look up iff the parent directory has changed */ + if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, nd)) + goto out_zap_parent; + goto out_valid; + } + + if (NFS_STALE(inode)) + goto out_bad; + + error = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out_error; + + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_bad; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; + if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); +out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 1; +out_zap_parent: + nfs_zap_caches(dir); + out_bad: + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ + nfs_zap_caches(inode); + /* If we have submounts, don't unhash ! */ + if (have_submounts(dentry)) + goto out_valid; + if (dentry->d_flags & DCACHE_DISCONNECTED) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; +out_error: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name, error); + return error; +} + +/* + * This is called from dput() when d_count is going to 0. + */ +static int nfs_dentry_delete(const struct dentry *dentry) +{ + dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + dentry->d_flags); + + /* Unhash any dentry with a stale inode */ + if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) + return 1; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + /* Unhash it, so that ->d_iput() would be called */ + return 1; + } + if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + /* Unhash it, so that ancestors of killed async unlink + * files will be cleaned up during umount */ + return 1; + } + return 0; + +} + +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Called when the dentry loses inode. + * We use it to clean up silly-renamed files. 
+ */ +static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + /* drop any readdir cache as it could easily be old */ + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + drop_nlink(inode); + nfs_complete_unlink(dentry, inode); + } + iput(inode); +} + +static void nfs_d_release(struct dentry *dentry) +{ + /* free cached devname value, if it survived that far */ + if (unlikely(dentry->d_fsdata)) { + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + WARN_ON(1); + else + kfree(dentry->d_fsdata); + } +} + +const struct dentry_operations nfs_dentry_operations = { + .d_revalidate = nfs_lookup_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; + +static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + int error; + + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); + + res = ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + + /* + * If we're doing an exclusive create, optimize away the lookup + * but don't hash the dentry. + */ + if (nfs_is_exclusive_create(dir, nd)) { + d_instantiate(dentry, NULL); + res = NULL; + goto out; + } + + res = ERR_PTR(-ENOMEM); + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out; + + parent = dentry->d_parent; + /* Protect against concurrent sillydeletes */ + nfs_block_sillyrename(parent); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + res = ERR_CAST(inode); + if (IS_ERR(res)) + goto out_unblock_sillyrename; + +no_entry: + res = d_materialise_unique(dentry, inode); + if (res != NULL) { + if (IS_ERR(res)) + goto out_unblock_sillyrename; + dentry = res; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + return res; +} + +#ifdef CONFIG_NFS_V4 +static int nfs_open_revalidate(struct dentry *, struct nameidata *); + +const struct dentry_operations nfs4_dentry_operations = { + .d_revalidate = nfs_open_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; + +/* + * Use intent information to determine whether we need to substitute + * the NFSv4-style stateful OPEN for the LOOKUP call + */ +static int is_atomic_open(struct nameidata *nd) +{ + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) + return 0; + /* NFS does not (yet) have a stateful open for directories */ + if (nd->flags & LOOKUP_DIRECTORY) + return 0; + /* Are we trying to write to a read only partition? 
*/ + if (__mnt_is_readonly(nd->path.mnt) && + (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + return 0; + return 1; +} + +static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) +{ + struct path path = { + .mnt = nd->path.mnt, + .dentry = dentry, + }; + struct nfs_open_context *ctx; + struct rpc_cred *cred; + fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); + ctx = alloc_nfs_open_context(&path, cred, fmode); + put_rpccred(cred); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + return ctx; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +{ + struct file *filp; + int ret = 0; + + /* If the open_intent is for execute, we have an extra check to make */ + if (ctx->mode & FMODE_EXEC) { + ret = nfs_may_open(ctx->path.dentry->d_inode, + ctx->cred, + nd->intent.open.flags); + if (ret < 0) + goto out; + } + filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); + if (IS_ERR(filp)) + ret = PTR_ERR(filp); + else + nfs_file_set_open_context(filp, ctx); +out: + put_nfs_open_context(ctx); + return ret; +} + +static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct nfs_open_context *ctx; + struct iattr attr; + struct dentry *res = NULL; + struct inode *inode; + int open_flags; + int err; + + dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + /* Check that we are indeed trying to open this file */ + if (!is_atomic_open(nd)) + goto no_open; + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { + res = ERR_PTR(-ENAMETOOLONG); + goto out; + } + + /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash + * the dentry. 
*/ + if (nd->flags & LOOKUP_EXCL) { + d_instantiate(dentry, NULL); + goto out; + } + + ctx = nameidata_to_nfs_open_context(dentry, nd); + res = ERR_CAST(ctx); + if (IS_ERR(ctx)) + goto out; + + open_flags = nd->intent.open.flags; + if (nd->flags & LOOKUP_CREATE) { + attr.ia_mode = nd->intent.open.create_mode; + attr.ia_valid = ATTR_MODE; + attr.ia_mode &= ~current_umask(); + } else { + open_flags &= ~(O_EXCL | O_CREAT); + attr.ia_valid = 0; + } + + /* Open the file on the server */ + nfs_block_sillyrename(dentry->d_parent); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + if (IS_ERR(inode)) { + nfs_unblock_sillyrename(dentry->d_parent); + put_nfs_open_context(ctx); + switch (PTR_ERR(inode)) { + /* Make a negative dentry */ + case -ENOENT: + d_add(dentry, NULL); + res = NULL; + goto out; + /* This turned out not to be a regular file */ + case -EISDIR: + case -ENOTDIR: + goto no_open; + case -ELOOP: + if (!(nd->intent.open.flags & O_NOFOLLOW)) + goto no_open; + /* case -EINVAL: */ + default: + res = ERR_CAST(inode); + goto out; + } + } + res = d_add_unique(dentry, inode); + nfs_unblock_sillyrename(dentry->d_parent); + if (res != NULL) { + dput(ctx->path.dentry); + ctx->path.dentry = dget(res); + dentry = res; + } + err = nfs_intent_set_file(nd, ctx); + if (err < 0) { + if (res != NULL) + dput(res); + return ERR_PTR(err); + } +out: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return res; +no_open: + return nfs_lookup(dir, dentry, nd); +} + +static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = NULL; + struct inode *inode; + struct inode *dir; + struct nfs_open_context *ctx; + int openflags, ret = 0; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!is_atomic_open(nd) || d_mountpoint(dentry)) + goto no_open; + + parent = dget_parent(dentry); + dir = parent->d_inode; + + /* We can't create new files in nfs_open_revalidate(), so we + * optimize away revalidation of negative dentries. + */ + if (inode == NULL) { + if (!nfs_neg_need_reval(dir, dentry, nd)) + ret = 1; + goto out; + } + + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) + goto no_open_dput; + openflags = nd->intent.open.flags; + /* We cannot do exclusive creation on a positive dentry */ + if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + goto no_open_dput; + /* We can't create new files, or truncate existing ones here */ + openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); + + ctx = nameidata_to_nfs_open_context(dentry, nd); + ret = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + /* + * Note: we're not holding inode->i_mutex and so may be racing with + * operations that change the directory. We therefore save the + * change attribute *before* we do the RPC call. 
+ */ + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + switch (ret) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + ret = nfs_intent_set_file(nd, ctx); + if (ret >= 0) + ret = 1; +out: + dput(parent); + return ret; +out_drop: + d_drop(dentry); + ret = 0; +out_put_ctx: + put_nfs_open_context(ctx); + goto out; + +no_open_dput: + dput(parent); +no_open: + return nfs_lookup_revalidate(dentry, nd); +} + +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct nfs_open_context *ctx = NULL; + struct iattr attr; + int error; + int open_flags = 0; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if ((nd->flags & LOOKUP_CREATE) != 0) { + open_flags = nd->intent.open.flags; + + ctx = nameidata_to_nfs_open_context(dentry, nd); + error = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out_err_drop; + } + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); + if (error != 0) + goto out_put_ctx; + if (ctx != NULL) { + error = nfs_intent_set_file(nd, ctx); + if (error < 0) + goto out_err; + } + return 0; +out_put_ctx: + if (ctx != NULL) + put_nfs_open_context(ctx); +out_err_drop: + d_drop(dentry); +out_err: + return error; +} + +#endif /* CONFIG_NFSV4 */ + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct inode *inode; + int error = -EACCES; + + d_drop(dentry); + + /* We may have been initialized further down */ + if (dentry->d_inode) + goto out; + if (fhandle->size == 0) { + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_error; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (!(fattr->valid & NFS_ATTR_FATTR)) { + struct nfs_server *server = NFS_SB(dentry->d_sb); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); + if (error < 0) + goto out_error; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + error = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_error; + d_add(dentry, inode); +out: + dput(parent); + return 0; +out_error: + nfs_mark_for_revalidate(dir); + dput(parent); + return error; +} + +/* + * Following a failed create operation, we drop the dentry rather + * than retain a negative dentry. This avoids a problem in the event + * that the operation succeeded on the server, but an error in the + * reply path made it appear to have failed. 
+ */ +static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct iattr attr; + int error; + int open_flags = 0; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +static int +nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct iattr attr; + int status; + + dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); + if (status != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return status; +} + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct iattr attr; + int error; + + dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_valid = ATTR_MODE; + attr.ia_mode = mode | S_IFDIR; + + error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} + +static void nfs_dentry_handle_enoent(struct dentry *dentry) +{ + if (dentry->d_inode != NULL && !d_unhashed(dentry)) + d_delete(dentry); +} + +static int nfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + /* Ensure the VFS deletes this inode */ + if (error == 0 && dentry->d_inode != NULL) + clear_nlink(dentry->d_inode); + else if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); + + return error; +} + +/* + * Remove a file after making sure there are no pending writes, + * and after checking that the file has only one user. + * + * We invalidate the attribute cache and free the inode prior to the operation + * to avoid possible races if the server reuses the inode. + */ +static int nfs_safe_remove(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = dentry->d_inode; + int error = -EBUSY; + + dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* If the dentry was sillyrenamed, we simply call d_delete() */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + error = 0; + goto out; + } + + if (inode != NULL) { + nfs_inode_return_delegation(inode); + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + /* The VFS may want to delete this inode */ + if (error == 0) + nfs_drop_nlink(inode); + nfs_mark_for_revalidate(inode); + } else + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); +out: + return error; +} + +/* We do silly rename. In case sillyrename() returns -EBUSY, the inode + * belongs to an active ".nfs..." file and we return -EBUSY. + * + * If sillyrename() returns 0, we do nothing, otherwise we unlink. 
+ */ +static int nfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + int need_rehash = 0; + + dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name); + + spin_lock(&dentry->d_lock); + if (dentry->d_count > 1) { + spin_unlock(&dentry->d_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(dentry->d_inode, 0); + error = nfs_sillyrename(dir, dentry); + return error; + } + if (!d_unhashed(dentry)) { + __d_drop(dentry); + need_rehash = 1; + } + spin_unlock(&dentry->d_lock); + error = nfs_safe_remove(dentry); + if (!error || error == -ENOENT) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + } else if (need_rehash) + d_rehash(dentry); + return error; +} + +/* + * To create a symbolic link, most file systems instantiate a new inode, + * add a page to it containing the path, then write it out to the disk + * using prepare_write/commit_write. + * + * Unfortunately the NFS client can't create the in-core inode first + * because it needs a file handle to create an in-core inode (see + * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the + * symlink request has completed on the server. + * + * So instead we allocate a raw page, copy the symname into it, then do + * the SYMLINK request with the page as the buffer. If it succeeds, we + * now have a new file handle and can instantiate an in-core NFS inode + * and move the raw page into its mapping. + */ +static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct pagevec lru_pvec; + struct page *page; + char *kaddr; + struct iattr attr; + unsigned int pathlen = strlen(symname); + int error; + + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name, symname); + + if (pathlen > PAGE_SIZE) + return -ENAMETOOLONG; + + attr.ia_mode = S_IFLNK | S_IRWXUGO; + attr.ia_valid = ATTR_MODE; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, pathlen); + if (pathlen < PAGE_SIZE) + memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); + kunmap_atomic(kaddr, KM_USER0); + + error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + if (error != 0) { + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", + dir->i_sb->s_id, dir->i_ino, + dentry->d_name.name, symname, error); + d_drop(dentry); + __free_page(page); + return error; + } + + /* + * No big deal if we can't add this page to the page cache here. + * READLINK will get the missing page from the server if needed. 
+ */ + pagevec_init(&lru_pvec, 0); + if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + GFP_KERNEL)) { + pagevec_add(&lru_pvec, page); + pagevec_lru_add_file(&lru_pvec); + SetPageUptodate(page); + unlock_page(page); + } else + __free_page(page); + + return 0; +} + +static int +nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + dentry->d_parent->d_name.name, dentry->d_name.name); + + nfs_inode_return_delegation(inode); + + d_drop(dentry); + error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + if (error == 0) { + ihold(inode); + d_add(dentry, inode); + } + return error; +} + +/* + * RENAME + * FIXME: Some nfsds, like the Linux user space nfsd, may generate a + * different file handle for the same inode after a rename (e.g. when + * moving to a different directory). A fail-safe method to do so would + * be to look up old_dir/old_name, create a link to new_dir/new_name and + * rename the old file using the sillyrename stuff. This way, the original + * file in old_dir will go away when the last process iput()s the inode. + * + * FIXED. + * + * It actually works quite well. One needs to have the possibility for + * at least one ".nfs..." file in each directory the file ever gets + * moved or linked to which happens automagically with the new + * implementation that only depends on the dcache stuff instead of + * using the inode layer + * + * Unfortunately, things are a little more complicated than indicated + * above. For a cross-directory move, we want to make sure we can get + * rid of the old inode after the operation. This means there must be + * no pending writes (if it's a file), and the use count must be 1. + * If these conditions are met, we can drop the dentries before doing + * the rename. + */ +static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct dentry *dentry = NULL, *rehash = NULL; + int error = -EBUSY; + + dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + new_dentry->d_parent->d_name.name, new_dentry->d_name.name, + new_dentry->d_count); + + /* + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. + */ + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } + + if (new_dentry->d_count > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; + + /* silly-rename the existing target ... 
*/ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; + + new_dentry = dentry; + rehash = NULL; + new_inode = NULL; + } + } + + nfs_inode_return_delegation(old_inode); + if (new_inode != NULL) + nfs_inode_return_delegation(new_inode); + + error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + nfs_mark_for_revalidate(old_inode); +out: + if (rehash) + d_rehash(rehash); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + + /* new dentry created? */ + if (dentry) + dput(dentry); + return error; +} + +static DEFINE_SPINLOCK(nfs_access_lru_lock); +static LIST_HEAD(nfs_access_lru_list); +static atomic_long_t nfs_access_nr_entries; + +static void nfs_access_free_entry(struct nfs_access_entry *entry) +{ + put_rpccred(entry->cred); + kfree(entry); + smp_mb__before_atomic_dec(); + atomic_long_dec(&nfs_access_nr_entries); + smp_mb__after_atomic_dec(); +} + +static void nfs_access_free_list(struct list_head *head) +{ + struct nfs_access_entry *cache; + + while (!list_empty(head)) { + cache = list_entry(head->next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } +} + +int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(head); + struct nfs_inode *nfsi, *next; + struct nfs_access_entry *cache; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return (nr_to_scan == 0) ? 0 : -1; + + spin_lock(&nfs_access_lru_lock); + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { + struct inode *inode; + + if (nr_to_scan-- == 0) + break; + inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; + cache = list_entry(nfsi->access_cache_entry_lru.next, + struct nfs_access_entry, lru); + list_move(&cache->lru, &head); + rb_erase(&cache->rb_node, &nfsi->access_cache); + if (!list_empty(&nfsi->access_cache_entry_lru)) + list_move_tail(&nfsi->access_cache_inode_lru, + &nfs_access_lru_list); + else { +remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + smp_mb__after_clear_bit(); + } + spin_unlock(&inode->i_lock); + } + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; +} + +static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) +{ + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); + list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +} + +void nfs_access_zap_cache(struct inode *inode) +{ + LIST_HEAD(head); + + if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) + return; + /* Remove from global LRU init */ + spin_lock(&nfs_access_lru_lock); + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); + + spin_lock(&inode->i_lock); + __nfs_access_zap_cache(NFS_I(inode), 
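/*
 * Illustrative sketch, not part of the patch: nfs_access_zap_cache() and the
 * shrinker above both unhook cache entries onto a private list while the
 * locks are held and only free them after the locks are dropped.  The
 * standalone userspace fragment below shows that same "collect under the
 * lock, free outside it" pattern with a pthread mutex; struct entry,
 * cache_head and zap_cache() are stand-ins invented for the example.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	/* payload would live here */
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache_head;

static void zap_cache(void)
{
	struct entry *victims, *e;

	/* unhook everything while the lock is held ... */
	pthread_mutex_lock(&cache_lock);
	victims = cache_head;
	cache_head = NULL;
	pthread_mutex_unlock(&cache_lock);

	/* ... and do the potentially slow freeing without it */
	while ((e = victims) != NULL) {
		victims = e->next;
		free(e);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct entry *e = calloc(1, sizeof(*e));
		if (!e)
			break;
		e->next = cache_head;
		cache_head = e;
	}
	zap_cache();
	return 0;
}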
&head); + spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); +} + +static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +{ + struct rb_node *n = NFS_I(inode)->access_cache.rb_node; + struct nfs_access_entry *entry; + + while (n != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + + if (cred < entry->cred) + n = n->rb_left; + else if (cred > entry->cred) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache; + int err = -ENOENT; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + if (cache == NULL) + goto out; + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + goto out_stale; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); + err = 0; +out: + spin_unlock(&inode->i_lock); + return err; +out_stale: + rb_erase(&cache->rb_node, &nfsi->access_cache); + list_del(&cache->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(cache); + return -ENOENT; +out_zap: + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + return -ENOENT; +} + +static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node **p = &root_node->rb_node; + struct rb_node *parent = NULL; + struct nfs_access_entry *entry; + + spin_lock(&inode->i_lock); + while (*p != NULL) { + parent = *p; + entry = rb_entry(parent, struct nfs_access_entry, rb_node); + + if (set->cred < entry->cred) + p = &parent->rb_left; + else if (set->cred > entry->cred) + p = &parent->rb_right; + else + goto found; + } + rb_link_node(&set->rb_node, parent, p); + rb_insert_color(&set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + spin_unlock(&inode->i_lock); + return; +found: + rb_replace_node(parent, &set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + list_del(&entry->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(entry); +} + +static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); + if (cache == NULL) + return; + RB_CLEAR_NODE(&cache->rb_node); + cache->jiffies = set->jiffies; + cache->cred = get_rpccred(set->cred); + cache->mask = set->mask; + + nfs_access_add_rbtree(inode, cache); + + /* Update accounting */ + smp_mb__before_atomic_inc(); + atomic_long_inc(&nfs_access_nr_entries); + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ + if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, + &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } +} + +static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) +{ + struct nfs_access_entry cache; + int status; + + status = nfs_access_get_cached(inode, cred, &cache); + if (status == 0) + goto out; 
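/*
 * Illustrative sketch, not part of the patch: nfs_do_access() here asks the
 * server for every right at once and then answers later permission checks
 * from the cached mask.  The userspace fragment below shows only that bit
 * test; the MAY_* values mirror the kernel's and nfs_check_cached_access()
 * is a name invented for the example.
 */
#include <errno.h>
#include <stdio.h>

#define MAY_EXEC	0x1
#define MAY_WRITE	0x2
#define MAY_READ	0x4

/* granted: mask the server returned; requested: what the caller wants now */
static int nfs_check_cached_access(int granted, int requested)
{
	/* every requested bit must already be present in the granted mask */
	if ((requested & ~granted & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		return 0;
	return -EACCES;
}

int main(void)
{
	int granted = MAY_READ | MAY_EXEC;	/* e.g. the server said "r-x" */

	printf("read:  %d\n", nfs_check_cached_access(granted, MAY_READ));
	printf("write: %d\n", nfs_check_cached_access(granted, MAY_WRITE));
	return 0;
}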
+ + /* Be clever: ask server to check for all possible rights */ + cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.cred = cred; + cache.jiffies = jiffies; + status = NFS_PROTO(inode)->access(inode, &cache); + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + return status; + } + nfs_access_add_cache(inode, &cache); +out: + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int nfs_open_permission_mask(int openflags) +{ + int mask = 0; + + if (openflags & FMODE_READ) + mask |= MAY_READ; + if (openflags & FMODE_WRITE) + mask |= MAY_WRITE; + if (openflags & FMODE_EXEC) + mask |= MAY_EXEC; + return mask; +} + +int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) +{ + return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); +} + +int nfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct rpc_cred *cred; + int res = 0; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + nfs_inc_stats(inode, NFSIOS_VFSACCESS); + + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + goto out; + /* Is this sys_access() ? */ + if (mask & (MAY_ACCESS | MAY_CHDIR)) + goto force_lookup; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + goto out; + case S_IFREG: + /* NFSv4 has atomic_open... */ + if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) + && (mask & MAY_OPEN) + && !(mask & MAY_EXEC)) + goto out; + break; + case S_IFDIR: + /* + * Optimize away all write operations, since the server + * will check permissions when we perform the op. + */ + if ((mask & MAY_WRITE) && !(mask & MAY_READ)) + goto out; + } + +force_lookup: + if (!NFS_PROTO(inode)->access) + goto out_notsup; + + cred = rpc_lookup_cred(); + if (!IS_ERR(cred)) { + res = nfs_do_access(inode, cred, mask); + put_rpccred(cred); + } else + res = PTR_ERR(cred); +out: + if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) + res = -EACCES; + + dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + inode->i_sb->s_id, inode->i_ino, mask, res); + return res; +out_notsup: + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (res == 0) + res = generic_permission(inode, mask, flags, NULL); + goto out; +} + +/* + * Local variables: + * version-control: t + * kept-new-versions: 5 + * End: + */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c new file mode 100644 index 00000000..8eea2536 --- /dev/null +++ b/fs/nfs/direct.c @@ -0,0 +1,1039 @@ +/* + * linux/fs/nfs/direct.c + * + * Copyright (C) 2003 by Chuck Lever + * + * High-performance uncached I/O for the Linux NFS client + * + * There are important applications whose performance or correctness + * depends on uncached access to file data. Database clusters + * (multiple copies of the same instance running on separate hosts) + * implement their own cache coherency protocol that subsumes file + * system cache protocols. Applications that process datasets + * considerably larger than the client's memory do not always benefit + * from a local cache. A streaming video server, for instance, has no + * need to cache the contents of a file. + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. 
All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with + * help from Andrew Morton. + * + * 18 Dec 2001 Initial implementation for 2.4 --cel + * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy + * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel + * 15 Sep 2004 Parallel async reads --cel + * 04 May 2005 support O_DIRECT with aio --cel + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "iostat.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static struct kmem_cache *nfs_direct_cachep; + +/* + * This represents a set of asynchronous requests that we're waiting on + */ +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + ssize_t count, /* bytes actually processed */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct list_head rewrite_list; /* saved nfs_write_data structs */ + struct nfs_write_data * commit_data; /* special write_data for commits */ + int flags; +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + struct nfs_writeverf verf; /* unstable write verifier */ +}; + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static const struct rpc_call_ops nfs_write_direct_ops; + +static inline void get_dreq(struct nfs_direct_req *dreq) +{ + atomic_inc(&dreq->io_count); +} + +static inline int put_dreq(struct nfs_direct_req *dreq) +{ + return atomic_dec_and_test(&dreq->io_count); +} + +/** + * nfs_direct_IO - NFS address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of this routine in the address space ops vector means + * the NFS client supports direct I/O. However, we shunt off direct + * read and write requests before the VFS gets them, so this method + * should never be called. 
+ */ +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +{ + dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long) pos, nr_segs); + + return -EINVAL; +} + +static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) +{ + unsigned int npages; + unsigned int i; + + if (count == 0) + return; + pages += (pgbase >> PAGE_SHIFT); + npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + struct page *page = pages[i]; + if (!PageCompound(page)) + set_page_dirty(page); + } +} + +static void nfs_direct_release_pages(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + page_cache_release(pages[i]); +} + +static inline struct nfs_direct_req *nfs_direct_req_alloc(void) +{ + struct nfs_direct_req *dreq; + + dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); + if (!dreq) + return NULL; + + kref_init(&dreq->kref); + kref_get(&dreq->kref); + init_completion(&dreq->completion); + INIT_LIST_HEAD(&dreq->rewrite_list); + dreq->iocb = NULL; + dreq->ctx = NULL; + dreq->l_ctx = NULL; + spin_lock_init(&dreq->lock); + atomic_set(&dreq->io_count, 0); + dreq->count = 0; + dreq->error = 0; + dreq->flags = 0; + + return dreq; +} + +static void nfs_direct_req_free(struct kref *kref) +{ + struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); + if (dreq->ctx != NULL) + put_nfs_open_context(dreq->ctx); + kmem_cache_free(nfs_direct_cachep, dreq); +} + +static void nfs_direct_req_release(struct nfs_direct_req *dreq) +{ + kref_put(&dreq->kref, nfs_direct_req_free); +} + +/* + * Collects and returns the final error value/byte-count. + */ +static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) +{ + ssize_t result = -EIOCBQUEUED; + + /* Async requests don't wait here */ + if (dreq->iocb) + goto out; + + result = wait_for_completion_killable(&dreq->completion); + + if (!result) + result = dreq->error; + if (!result) + result = dreq->count; + +out: + return (ssize_t) result; +} + +/* + * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust + * the iocb is still valid here if this is a synchronous request. + */ +static void nfs_direct_complete(struct nfs_direct_req *dreq) +{ + if (dreq->iocb) { + long res = (long) dreq->error; + if (!res) + res = (long) dreq->count; + aio_complete(dreq->iocb, res, 0); + } + complete_all(&dreq->completion); + + nfs_direct_req_release(dreq); +} + +/* + * We must hold a reference to all the pages in this direct read request + * until the RPCs complete. This could be long *after* we are woken up in + * nfs_direct_wait (for instance, if someone hits ^C on a slow server). 
+ */ +static void nfs_direct_read_result(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + nfs_readpage_result(task, data); +} + +static void nfs_direct_read_release(void *calldata) +{ + + struct nfs_read_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + spin_lock(&dreq->lock); + if (unlikely(status < 0)) { + dreq->error = status; + spin_unlock(&dreq->lock); + } else { + dreq->count += data->res.count; + spin_unlock(&dreq->lock); + nfs_direct_dirty_pages(data->pagevec, + data->args.pgbase, + data->res.count); + } + nfs_direct_release_pages(data->pagevec, data->npages); + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + nfs_readdata_free(data); +} + +static const struct rpc_call_ops nfs_read_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_read_result, + .rpc_release = nfs_direct_read_release, +}; + +/* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. If nfs_readdata_alloc() or get_user_pages() fails, + * bail and stop sending more reads. Read length accounting is + * handled automatically by nfs_direct_read_result(). Otherwise, if + * no requests have been sent, just return an error. + */ +static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_read_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + unsigned int pgbase; + int result; + ssize_t started = 0; + + do { + struct nfs_read_data *data; + size_t bytes; + + pgbase = user_addr & ~PAGE_MASK; + bytes = min(rsize,count); + + result = -ENOMEM; + data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); + if (unlikely(!data)) + break; + + down_read(&current->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 1, 0, data->pagevec, NULL); + up_read(&current->mm->mmap_sem); + if (result < 0) { + nfs_readdata_free(data); + break; + } + if ((unsigned)result < data->npages) { + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_readdata_free(data); + break; + } + bytes -= pgbase; + data->npages = result; + } + + get_dreq(dreq); + + data->req = (struct nfs_page *) dreq; + data->inode = inode; + data->cred = msg.rpc_cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = data->pagevec; + data->args.count = bytes; + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; + nfs_fattr_init(&data->fattr); + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + NFS_PROTO(inode)->read_setup(data, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + break; + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct read call " + "(req %s/%Ld, 
%zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + started += bytes; + user_addr += bytes; + pos += bytes; + /* FIXME: Remove this unnecessary math from final patch */ + pgbase += bytes; + pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + + count -= bytes; + } while (count != 0); + + if (started) + return started; + return result < 0 ? (ssize_t) result : -EFAULT; +} + +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_read_schedule_segment(dreq, vec, pos); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + return 0; +} + +static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t result = -ENOMEM; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct nfs_direct_req *dreq; + + dreq = nfs_direct_req_alloc(); + if (dreq == NULL) + goto out; + + dreq->inode = inode; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); + if (!result) + result = nfs_direct_wait(dreq); +out_release: + nfs_direct_req_release(dreq); +out: + return result; +} + +static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) +{ + while (!list_empty(&dreq->rewrite_list)) { + struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_direct_release_pages(data->pagevec, data->npages); + nfs_writedata_free(data); + } +} + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) +{ + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + dreq->count = 0; + get_dreq(dreq); + + list_for_each(p, &dreq->rewrite_list) { + data = list_entry(p, struct nfs_write_data, pages); + + get_dreq(dreq); + + /* Use stable writes */ + data->args.stable = NFS_FILE_SYNC; + + /* + * Reset data->res. + */ + nfs_fattr_init(&data->fattr); + data->res.count = data->args.count; + memset(&data->verf, 0, sizeof(data->verf)); + + /* + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, &msg); + + /* + * We're called via an RPC callback, so BKL is already held. + */ + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task(task); + + dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, inode); +} + +static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + /* Call the NFS version-specific code */ + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_direct_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + if (status < 0) { + dprintk("NFS: %5u commit failed with error %d.\n", + data->task.tk_pid, status); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); + nfs_direct_write_complete(dreq, data->inode); + nfs_commit_free(data); +} + +static const struct rpc_call_ops nfs_commit_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_commit_result, + .rpc_release = nfs_direct_commit_release, +}; + +static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) +{ + struct nfs_write_data *data = dreq->commit_data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = dreq->ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(dreq->inode), + .rpc_message = &msg, + .callback_ops = &nfs_commit_direct_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + data->inode = dreq->inode; + data->cred = msg.rpc_cred; + + data->args.fh = NFS_FH(data->inode); + data->args.offset = 0; + data->args.count = 0; + data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; + data->res.count = 0; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ + dreq->commit_data = NULL; + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task(task); +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + int flags = dreq->flags; + + dreq->flags = 0; + switch (flags) { + case NFS_ODIRECT_DO_COMMIT: + nfs_direct_commit_schedule(dreq); + break; + case NFS_ODIRECT_RESCHED_WRITES: + nfs_direct_write_reschedule(dreq); + break; + default: + if (dreq->commit_data != NULL) + nfs_commit_free(dreq->commit_data); + nfs_direct_free_writedata(dreq); + nfs_zap_mapping(inode, inode->i_mapping); + nfs_direct_complete(dreq); + } +} + +static void 
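/*
 * Illustrative sketch, not part of the patch: nfs_direct_commit_release()
 * above decides between "data is safe" and "reschedule the writes" by
 * comparing the verifier saved from the UNSTABLE write replies with the one
 * returned by COMMIT.  The fragment below isolates that decision; the
 * write_verifier type, check_commit() and the enum are simplified stand-ins
 * made up for the example.
 */
#include <string.h>

struct write_verifier {
	unsigned char data[8];	/* opaque cookie; changes when the server reboots */
};

enum commit_result { COMMIT_DONE, COMMIT_RESCHED_WRITES };

static enum commit_result check_commit(const struct write_verifier *saved,
					const struct write_verifier *from_commit,
					int commit_status)
{
	/* a failed COMMIT or a changed verifier means the data may be lost */
	if (commit_status < 0 ||
	    memcmp(saved->data, from_commit->data, sizeof(saved->data)) != 0)
		return COMMIT_RESCHED_WRITES;
	return COMMIT_DONE;
}

int main(void)
{
	struct write_verifier a = { { 1, 2, 3, 4, 5, 6, 7, 8 } };
	struct write_verifier b = a;

	return check_commit(&a, &b, 0) == COMMIT_DONE ? 0 : 1;
}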
nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = nfs_commitdata_alloc(); + if (dreq->commit_data != NULL) + dreq->commit_data->req = (struct nfs_page *) dreq; +} +#else +static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = NULL; +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + nfs_direct_free_writedata(dreq); + nfs_zap_mapping(inode, inode->i_mapping); + nfs_direct_complete(dreq); +} +#endif + +static void nfs_direct_write_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_writeback_done(task, data); +} + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +static void nfs_direct_write_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + spin_lock(&dreq->lock); + + if (unlikely(status < 0)) { + /* An error has occurred, so we should not commit */ + dreq->flags = 0; + dreq->error = status; + } + if (unlikely(dreq->error != 0)) + goto out_unlock; + + dreq->count += data->res.count; + + if (data->res.verf->committed != NFS_FILE_SYNC) { + switch (dreq->flags) { + case 0: + memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); + dreq->flags = NFS_ODIRECT_DO_COMMIT; + break; + case NFS_ODIRECT_DO_COMMIT: + if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { + dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + } + } +out_unlock: + spin_unlock(&dreq->lock); + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, data->inode); +} + +static const struct rpc_call_ops nfs_write_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_write_result, + .rpc_release = nfs_direct_write_release, +}; + +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. 
+ */ +static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos, int sync) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + size_t wsize = NFS_SERVER(inode)->wsize; + unsigned int pgbase; + int result; + ssize_t started = 0; + + do { + struct nfs_write_data *data; + size_t bytes; + + pgbase = user_addr & ~PAGE_MASK; + bytes = min(wsize,count); + + result = -ENOMEM; + data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); + if (unlikely(!data)) + break; + + down_read(&current->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 0, 0, data->pagevec, NULL); + up_read(&current->mm->mmap_sem); + if (result < 0) { + nfs_writedata_free(data); + break; + } + if ((unsigned)result < data->npages) { + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_writedata_free(data); + break; + } + bytes -= pgbase; + data->npages = result; + } + + get_dreq(dreq); + + list_move_tail(&data->pages, &dreq->rewrite_list); + + data->req = (struct nfs_page *) dreq; + data->inode = inode; + data->cred = msg.rpc_cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = data->pagevec; + data->args.count = bytes; + data->args.stable = sync; + data->res.fattr = &data->fattr; + data->res.count = bytes; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + break; + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct write call " + "(req %s/%Ld, %zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + started += bytes; + user_addr += bytes; + pos += bytes; + + /* FIXME: Remove this useless math from the final patch */ + pgbase += bytes; + pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + + count -= bytes; + } while (count != 0); + + if (started) + return started; + return result < 0 ? (ssize_t) result : -EFAULT; +} + +static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos, int sync) +{ + ssize_t result = 0; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_write_schedule_segment(dreq, vec, + pos, sync); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? 
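/*
 * Illustrative sketch, not part of the patch: nfs_direct_write_schedule_segment()
 * above carves each iovec into wsize'd chunks and sizes the page array that
 * get_user_pages() must pin, where the offset into the first page counts
 * against the total.  The standalone program below reproduces only that
 * arithmetic; PAGE_SIZE, page_array_len() and the sample numbers are
 * assumptions made for the example.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* pages needed for a chunk starting 'pgbase' bytes into a page, 'bytes' long */
static unsigned long page_array_len(unsigned long pgbase, unsigned long bytes)
{
	return (pgbase + bytes + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	unsigned long user_addr = 0x7f0000001234UL;	/* arbitrary, unaligned */
	unsigned long count = 100000;			/* bytes left in this iovec */
	unsigned long wsize = 32768;			/* server's preferred write size */

	while (count != 0) {
		unsigned long pgbase = user_addr & ~PAGE_MASK;
		unsigned long bytes = count < wsize ? count : wsize;

		printf("chunk: %6lu bytes, pgbase %4lu, %lu pages pinned\n",
		       bytes, pgbase, page_array_len(pgbase, bytes));
		user_addr += bytes;
		count -= bytes;
	}
	return 0;
}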
result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); + return 0; +} + +static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + size_t count) +{ + ssize_t result = -ENOMEM; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct nfs_direct_req *dreq; + size_t wsize = NFS_SERVER(inode)->wsize; + int sync = NFS_UNSTABLE; + + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out; + nfs_alloc_commit_data(dreq); + + if (dreq->commit_data == NULL || count <= wsize) + sync = NFS_FILE_SYNC; + + dreq->inode = inode; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); + if (!result) + result = nfs_direct_wait(dreq); +out_release: + nfs_direct_req_release(dreq); +out: + return result; +} + +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers into which to read data + * @nr_segs: size of iov vector + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t retval = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + count, (long long) pos); + + retval = 0; + if (!count) + goto out; + + retval = nfs_sync_mapping(mapping); + if (retval) + goto out; + + task_io_account_read(count); + + retval = nfs_direct_read(iocb, iov, nr_segs, pos); + if (retval > 0) + iocb->ki_pos = pos + retval; + +out: + return retval; +} + +/** + * nfs_file_direct_write - file direct write operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers from which to write data + * @nr_segs: size of iov vector + * @pos: byte offset in file where writing starts + * + * We use this function for direct writes instead of calling + * generic_file_aio_write() in order to avoid taking the inode + * semaphore and updating the i_size. The NFS server will set + * the new i_size and this client must read the updated size + * back into its cache. We let the server do generic write + * parameter checking and report problems. + * + * We eliminate local atime updates, see direct read above. 
+ * + * We avoid unnecessary page cache invalidations for normal cached + * readers of this file. + * + * Note that O_APPEND is not supported for NFS direct writes, as there + * is no atomic O_APPEND write facility in the NFS protocol. + */ +ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t retval = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); + + dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + count, (long long) pos); + + retval = generic_write_checks(file, &pos, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + retval = nfs_sync_mapping(mapping); + if (retval) + goto out; + + task_io_account_write(count); + + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + + if (retval > 0) + iocb->ki_pos = pos + retval; + +out: + return retval; +} + +/** + * nfs_init_directcache - create a slab cache for nfs_direct_req structures + * + */ +int __init nfs_init_directcache(void) +{ + nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", + sizeof(struct nfs_direct_req), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + if (nfs_direct_cachep == NULL) + return -ENOMEM; + + return 0; +} + +/** + * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures + * + */ +void nfs_destroy_directcache(void) +{ + kmem_cache_destroy(nfs_direct_cachep); +} diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 00000000..a6e711ad --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,372 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust + * + * Resolves DNS hostnames into valid ip addresses + */ + +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include +#include + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + char *ip_addr = NULL; + int ip_len; + + ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); + if (ip_len > 0) + ret = rpc_pton(ip_addr, ip_len, sa, salen); + else + ret = -ESRCH; + kfree(ip_addr); + return ret; +} + +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, 
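/*
 * Illustrative sketch, not part of the patch: nfs_dns_resolve_name() above
 * turns a hostname into a ready-to-use sockaddr, returning the address
 * length on success and -ESRCH when the name does not resolve.  The
 * userspace helper below mirrors that contract with getaddrinfo() instead
 * of the kernel DNS resolver or the sunrpc cache upcall; resolve_name() is
 * a name invented for the example.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>

/* returns the address length on success, -1 on any failure */
static ssize_t resolve_name(const char *name, struct sockaddr *sa, size_t salen)
{
	struct addrinfo hints = { .ai_family = AF_UNSPEC };
	struct addrinfo *res;
	ssize_t ret = -1;

	if (getaddrinfo(name, NULL, &hints, &res) != 0)
		return -1;			/* analogous to -ESRCH */
	if ((size_t)res->ai_addrlen <= salen) {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
		ret = res->ai_addrlen;		/* caller's buffer was big enough */
	}
	freeaddrinfo(res);
	return ret;
}

int main(void)
{
	struct sockaddr_storage ss;

	return resolve_name("localhost", (struct sockaddr *)&ss, sizeof(ss)) > 0 ? 0 : 1;
}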
GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + nfs_dns_ent_update(cnew, ckey); + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = item->h.expiry_time - seconds_since_boot(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, " "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + seconds_since_boot(); + + ret = -ENOMEM; + item = 
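/*
 * Illustrative sketch, not part of the patch: nfs_dns_parse() here consumes
 * one "<ip address> <hostname> <expiry>" line that the userspace daemon
 * writes into the cache channel, and nfs_dns_show() prints the same layout
 * back out.  The toy parser below shows that word layout with sscanf(); it
 * ignores the %-escaping that qword_get()/qword_add() handle, and the buffer
 * sizes simply follow NFS_DNS_HOSTNAME_MAXLEN.
 */
#include <stdio.h>

int main(void)
{
	const char *line = "192.0.2.7 server.example.net 1700000000\n";
	char ip[48], host[129];
	unsigned long expiry;

	if (sscanf(line, "%47s %128s %lu", ip, host, &expiry) == 3)
		printf("host %s -> %s (valid until %lu)\n", host, ip, expiry);
	return 0;
}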
nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < seconds_since_boot() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + +#endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 00000000..199bb554 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,26 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + + +#ifdef CONFIG_NFS_USE_KERNEL_DNS +static inline int nfs_dns_resolver_init(void) +{ + return 0; +} + +static inline void nfs_dns_resolver_destroy(void) +{} +#else +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +#endif + +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c new file mode 100644 index 
00000000..dd2f1307 --- /dev/null +++ b/fs/nfs/file.c @@ -0,0 +1,921 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Changes Copyright (C) 1994 by Florian La Roche + * - Do not copy data too often around in the kernel. + * - In nfs_file_read the return value of kmalloc wasn't checked. + * - Put in a better version of read look-ahead buffering. Original idea + * and implementation by Wai S Kok elekokws@ee.nus.sg. + * + * Expire cache on write to a file by Wai S Kok (Oct 1994). + * + * Total rewrite of read side for new NFS buffer cache.. Linus. + * + * nfs regular file handling functions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_FILE + +static int nfs_file_open(struct inode *, struct file *); +static int nfs_file_release(struct inode *, struct file *); +static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); +static int nfs_file_mmap(struct file *, struct vm_area_struct *); +static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags); +static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t count, unsigned int flags); +static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static int nfs_file_flush(struct file *, fl_owner_t id); +static int nfs_file_fsync(struct file *, int datasync); +static int nfs_check_flags(int flags); +static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); +static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); +static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); + +static const struct vm_operations_struct nfs_file_vm_ops; + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + +const struct inode_operations nfs_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +#ifdef CONFIG_NFS_V3 +const struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_v3 */ + +/* Hack for future NFS swap support */ +#ifndef IS_SWAPFILE +# define IS_SWAPFILE(inode) (0) +#endif + +static int nfs_check_flags(int flags) +{ + if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) + return -EINVAL; + + return 0; +} + +/* + * Open file + */ +static int +nfs_file_open(struct inode *inode, struct file *filp) +{ + int res; + + dprintk("NFS: open file(%s/%s)\n", + 
filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + res = nfs_check_flags(filp->f_flags); + if (res) + return res; + + res = nfs_open(inode, filp); + return res; +} + +static int +nfs_file_release(struct inode *inode, struct file *filp) +{ + struct dentry *dentry = filp->f_path.dentry; + + dprintk("NFS: release(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); + return nfs_release(inode, filp); +} + +/** + * nfs_revalidate_size - Revalidate the file size + * @inode - pointer to inode struct + * @file - pointer to struct file + * + * Revalidates the file length. This is basically a wrapper around + * nfs_revalidate_inode() that takes into account the fact that we may + * have cached writes (in which case we don't care about the server's + * idea of what the file length is), or O_DIRECT (in which case we + * shouldn't trust the cache). + */ +static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfs_have_delegated_attributes(inode)) + goto out_noreval; + + if (filp->f_flags & O_DIRECT) + goto force_reval; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + goto force_reval; + if (nfs_attribute_timeout(inode)) + goto force_reval; +out_noreval: + return 0; +force_reval: + return __nfs_revalidate_inode(server, inode); +} + +static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +{ + loff_t loff; + + dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + offset, origin); + + /* origin == SEEK_END => we must revalidate the cached file length */ + if (origin == SEEK_END) { + struct inode *inode = filp->f_mapping->host; + + int retval = nfs_revalidate_file_size(inode, filp); + if (retval < 0) + return (loff_t)retval; + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); + return loff; +} + +/* + * Flush all dirty pages, and check for write errors. 
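+ * Only files opened for writing have anything to do here: the flush is + * implemented with vfs_fsync(), so any write or commit error is reported + * back to the caller of close().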
+ */ +static int +nfs_file_flush(struct file *file, fl_owner_t id) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dprintk("NFS: flush(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} + +static ssize_t +nfs_file_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry * dentry = iocb->ki_filp->f_path.dentry; + struct inode * inode = dentry->d_inode; + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_read(iocb, iov, nr_segs, pos); + + dprintk("NFS: read(%s/%s, %lu@%lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long) pos); + + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + if (!result) { + result = generic_file_aio_read(iocb, iov, nr_segs, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } + return result; +} + +static ssize_t +nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + ssize_t res; + + dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long long) *ppos); + + res = nfs_revalidate_mapping(inode, filp->f_mapping); + if (!res) { + res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } + return res; +} + +static int +nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + int status; + + dprintk("NFS: mmap(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); + if (!status) { + vma->vm_ops = &nfs_file_vm_ops; + status = nfs_revalidate_mapping(inode, file->f_mapping); + } + return status; +} + +/* + * Flush any dirty pages for this process, and check for write errors. + * The return status from this call provides a reliable indication of + * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occurred, and hence cause it to + * fall back to doing a synchronous write. 
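+ * + * For a full fsync (datasync == 0) any outstanding pNFS layout is also + * committed, so that pNFS metadata updates reach the server as well.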
+ */ +static int +nfs_file_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = dentry->d_inode; + int have_error, status; + int ret = 0; + + + dprintk("NFS: fsync file(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret && status < 0) + ret = status; + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); + return ret; +} + +/* + * Decide whether a read/modify/write cycle may be more efficient + * than a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* + * This does the "real" work of the write. We must allocate and lock the + * page to be sent back to the generic routine, which then copies the + * data from user space. + * + * If the writer ends up delaying the write, the writer needs to + * increment the page use counts until he is done with the page. 
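+ * + * As an illustrative example (numbers assumed): for a 4096-byte page that + * already holds valid data, a 100-byte write at page offset 200 from a file + * that is also open for reading leaves most of the page untouched, so + * nfs_want_read_modify_write() asks for the page to be read in first; a + * write that covers every valid byte of the page skips the read.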
+ */ +static int nfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + +start: + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = nfs_flush_incompatible(file, page); + if (ret) { + unlock_page(page); + page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; + } + return ret; +} + +static int nfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + + status = nfs_updatepage(file, page, offset, copied); + + unlock_page(page); + page_cache_release(page); + + if (status < 0) + return status; + return copied; +} + +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ +static void nfs_invalidate_page(struct page *page, unsigned long offset) +{ + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + + if (offset != 0) + return; + /* Cancel any unstarted writes on this page */ + nfs_wb_page_cancel(page->mapping->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); +} + +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ +static int nfs_release_page(struct page *page, gfp_t gfp) +{ + struct address_space *mapping = page->mapping; + + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + + /* Only do I/O if gfp is a superset of GFP_KERNEL */ + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } + /* If PagePrivate() is set, then the 
page is not freeable */ + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); +} + +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ +static int nfs_launder_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + nfs_fscache_wait_on_page_write(nfsi, page); + return nfs_wb_page(inode, page); +} + +const struct address_space_operations nfs_file_aops = { + .readpage = nfs_readpage, + .readpages = nfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = nfs_writepage, + .writepages = nfs_writepages, + .write_begin = nfs_write_begin, + .write_end = nfs_write_end, + .invalidatepage = nfs_invalidate_page, + .releasepage = nfs_release_page, + .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, + .launder_page = nfs_launder_page, + .error_remove_page = generic_error_remove_page, +}; + +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct dentry *dentry = filp->f_path.dentry; + unsigned pagelen; + int ret = VM_FAULT_NOPAGE; + struct address_space *mapping; + + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) + goto out_unlock; + + pagelen = nfs_page_length(page); + if (pagelen == 0) + goto out_unlock; + + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; + + ret = VM_FAULT_SIGBUS; +out_unlock: + unlock_page(page); +out: + return ret; +} + +static const struct vm_operations_struct nfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nfs_vm_page_mkwrite, +}; + +static int nfs_need_sync_write(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx; + + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) + return 1; + ctx = nfs_file_open_context(filp); + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) + return 1; + return 0; +} + +static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry * dentry = iocb->ki_filp->f_path.dentry; + struct inode * inode = dentry->d_inode; + unsigned long written = 0; + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_write(iocb, iov, nr_segs, pos); + + dprintk("NFS: write(%s/%s, %lu@%Ld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (long long) pos); + + result = -EBUSY; + if (IS_SWAPFILE(inode)) + goto out_swapfile; + /* + * O_APPEND implies that we must 
revalidate the file length. + */ + if (iocb->ki_filp->f_flags & O_APPEND) { + result = nfs_revalidate_file_size(inode, iocb->ki_filp); + if (result) + goto out; + } + + result = count; + if (!count) + goto out; + + result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (result > 0) + written = result; + + /* Return error values for O_DSYNC and IS_SYNC() */ + if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { + int err = vfs_fsync(iocb->ki_filp, 0); + if (err < 0) + result = err; + } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); +out: + return result; + +out_swapfile: + printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); + goto out; +} + +static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t count, unsigned int flags) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + unsigned long written = 0; + ssize_t ret; + + dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long long) *ppos); + + /* + * The combination of splice and an O_APPEND destination is disallowed. + */ + + ret = generic_file_splice_write(pipe, filp, ppos, count, flags); + if (ret > 0) + written = ret; + + if (ret >= 0 && nfs_need_sync_write(filp, inode)) { + int err = vfs_fsync(filp, 0); + if (err < 0) + ret = err; + } + if (ret > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + return ret; +} + +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status = 0; + unsigned int saved_type = fl->fl_type; + + /* Try local locking first */ + posix_test_lock(filp, fl); + if (fl->fl_type != F_UNLCK) { + /* found a conflict */ + goto out; + } + fl->fl_type = saved_type; + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_noconflict; + + if (is_local) + goto out_noconflict; + + status = NFS_PROTO(inode)->lock(filp, cmd, fl); +out: + return status; +out_noconflict: + fl->fl_type = F_UNLCK; + goto out; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. + */ + nfs_sync_mapping(filp->f_mapping); + + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + return status; +} + +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. 
+ */ + status = nfs_sync_mapping(filp->f_mapping); + if (status != 0) + goto out; + + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + if (status < 0) + goto out; + + /* + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * + * This makes locking act as a cache coherency point. + */ + nfs_sync_mapping(filp->f_mapping); + if (!nfs_have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } +out: + return status; +} + +/* + * Lock a (portion of) a file + */ +static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int ret = -ENOLCK; + int is_local = 0; + + dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + fl->fl_type, fl->fl_flags, + (long long)fl->fl_start, (long long)fl->fl_end); + + nfs_inc_stats(inode, NFSIOS_VFSLOCK); + + /* No mandatory locks over NFS */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } + + if (IS_GETLK(cmd)) + ret = do_getlk(filp, cmd, fl, is_local); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl, is_local); + else + ret = do_setlk(filp, cmd, fl, is_local); +out_err: + return ret; +} + +/* + * Lock a (portion of) a file + */ +static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int is_local = 0; + + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + fl->fl_type, fl->fl_flags); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + + /* We're simulating flock() locks using posix locks on the server */ + fl->fl_owner = (fl_owner_t)filp; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + + if (fl->fl_type == F_UNLCK) + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); +} + +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ +static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) +{ + dprintk("NFS: setlease(%s/%s, arg=%ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, arg); + return -EINVAL; +} + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! 
inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c new file mode 100644 index 00000000..5b100648 --- /dev/null +++ b/fs/nfs/fscache-index.c @@ -0,0 +1,337 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can then have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + /* zero the whole key first so that the version and family fields + * set below are not wiped out again */ + memset(key, 0, len); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). 
It is keyed by the NFS protocol and + * server address parameters. + */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of data stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries no state data + * - The 
netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. 
+ */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 00000000..419119c3 --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,535 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + * + * The default uniquifier is just an empty string, but it may be overridden + * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent + * superblock across an automount point of some nature. 
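+ * + * For example (illustrative): mounting the same export twice with different + * 'fsc=' uniquifiers gives each superblock its own key and cache cookie, + * whereas two mounts whose keys (including the uniquifier) compare equal hit + * the non-unique case below and the second mount has its caching disabled.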
+ */ +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, + struct nfs_clone_mount *mntdata) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + int diff, ulen; + + if (uniq) { + ulen = strlen(uniq); + } else if (mntdata) { + struct nfs_server *mnt_s = NFS_SB(mntdata->sb); + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + } + } + + if (!uniq) { + uniq = ""; + ulen = 1; + } + + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. 
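+ * A cookie is only acquired if the inode was marked cacheable when it was + * initialised and the superblock was mounted with caching enabled + * (NFS_OPTION_FSCACHE).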
+ */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. + */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to uncache any pages attached to this inode that + * fscache knows about before turning off the cache. + */ + fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. 
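+ * + * The old cookie is retired (relinquished with the retire flag set) so that + * the data cached under it is discarded, and a fresh cookie is acquired for + * the new version of the file.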
+ */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). + */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. 
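+ * - On success the page is marked up to date and unlocked; on error the read + * falls back to nfs_readpage_async() to fetch the page from the server, and + * the page is only unlocked (for the VM to retry) if that fallback cannot + * be started.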
+ */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + unsigned npages = *nr_pages; + int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); + + ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + mapping, pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, sync); + + ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(NFS_I(inode)->fscache, page); + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 
1); + } else { + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h new file mode 100644 index 00000000..b9c572d0 --- /dev/null +++ b/fs/nfs/fscache.h @@ -0,0 +1,222 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include +#include +#include +#include + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, + const char *, + struct nfs_clone_mount *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct 
page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. + */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + const char *uniq, + struct nfs_clone_mount *mntdata) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c new file mode 100644 index 00000000..dcb61548 --- /dev/null +++ b/fs/nfs/getroot.c @@ -0,0 +1,267 @@ +/* getroot.c: get the root dentry for an NFS mount + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Set the superblock root dentry. + * Note that this function frees the inode in case of error. + */ +static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) +{ + /* The mntroot acts as the dummy root dentry for this superblock */ + if (sb->s_root == NULL) { + sb->s_root = d_alloc_root(inode); + if (sb->s_root == NULL) { + iput(inode); + return -ENOMEM; + } + ihold(inode); + /* + * Ensure that this dentry is invisible to d_find_alias(). + * Otherwise, it may be spliced into the tree by + * d_materialise_unique if a parent directory from the same + * filesystem gets mounted at a later time. + * This again causes shrink_dcache_for_umount_subtree() to + * Oops, since the test for IS_ROOT() will fail. + */ + spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&sb->s_root->d_lock); + list_del_init(&sb->s_root->d_alias); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&sb->s_root->d_inode->i_lock); + } + return 0; +} + +/* + * get an NFS2/NFS3 root dentry from the root filehandle + */ +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the actual root for this mount */ + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + if (name) + kfree(name); + nfs_free_fattr(fsinfo.fattr); + return ret; +} + +#ifdef CONFIG_NFS_V4 + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) +{ + struct nfs_fsinfo fsinfo; + int ret = -ENOMEM; + + dprintk("--> nfs4_get_rootfh()\n"); + + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; + + /* Start by getting the 
root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; + } + + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); + ret = -ENOTDIR; + goto out; + } + + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); + ret = -EREMOTE; + goto out; + } + + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; +} + +/* + * get an NFS4 root dentry from the root filehandle + */ +struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fattr *fattr = NULL; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + dprintk("--> nfs4_get_root()\n"); + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the info about the server and filesystem */ + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs_get_root: getcaps error = %d\n", + -error); + kfree(name); + return ERR_PTR(error); + } + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + /* get the actual root for this mount */ + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + inode = nfs_fhget(sb, mntfh, fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + if (name) + kfree(name); + nfs_free_fattr(fattr); + dprintk("<-- nfs4_get_root()\n"); + return ret; +} + +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c new file mode 100644 index 00000000..79664a10 --- /dev/null +++ b/fs/nfs/idmap.c @@ -0,0 +1,779 @@ +/* + * fs/nfs/idmap.c + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (strict_strtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFS_UINT_MAXLEN 11 + +const struct cred *id_resolver_cache; + +struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +int nfs_idmap_init(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void nfs_idmap_quit(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. 
+ */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!*desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, void *data, size_t data_size) +{ + const struct cred *saved_cred; + struct key *rkey; + char *desc; + struct user_key_payload *payload; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + goto out; + + saved_cred = override_creds(id_resolver_cache); + rkey = request_key(&key_type_id_resolver, desc, ""); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.data); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, + const char *type, __u32 *id) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = strict_strtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + return nfs_idmap_lookup_id(name, namelen, "uid", uid); +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) +{ + if (nfs_map_string_to_numeric(name, namelen, gid)) + return 0; + return nfs_idmap_lookup_id(name, namelen, "gid", gid); +} + +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) +{ + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(gid, buf, buflen); + return ret; +} + +#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include "nfs4_fs.h" + +#define IDMAP_HASH_SZ 128 + +/* Default cache timeout is 10 minutes 
*/ +unsigned int nfs_idmap_cache_timeout = 600 * HZ; + +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); + +struct idmap_hashent { + unsigned long ih_expires; + __u32 ih_id; + size_t ih_namelen; + char ih_name[IDMAP_NAMESZ]; +}; + +struct idmap_hashtable { + __u8 h_type; + struct idmap_hashent h_entries[IDMAP_HASH_SZ]; +}; + +struct idmap { + struct dentry *idmap_dentry; + wait_queue_head_t idmap_wq; + struct idmap_msg idmap_im; + struct mutex idmap_lock; /* Serializes upcalls */ + struct mutex idmap_im_lock; /* Protects the hashtable */ + struct idmap_hashtable idmap_user_hash; + struct idmap_hashtable idmap_group_hash; +}; + +static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); +static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); +static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); + +static unsigned int fnvhash32(const void *, size_t); + +static const struct rpc_pipe_ops idmap_upcall_ops = { + .upcall = idmap_pipe_upcall, + .downcall = idmap_pipe_downcall, + .destroy_msg = idmap_pipe_destroy_msg, +}; + +int +nfs_idmap_new(struct nfs_client *clp) +{ + struct idmap *idmap; + int error; + + BUG_ON(clp->cl_idmap != NULL); + + idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); + if (idmap == NULL) + return -ENOMEM; + + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); + if (IS_ERR(idmap->idmap_dentry)) { + error = PTR_ERR(idmap->idmap_dentry); + kfree(idmap); + return error; + } + + mutex_init(&idmap->idmap_lock); + mutex_init(&idmap->idmap_im_lock); + init_waitqueue_head(&idmap->idmap_wq); + idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; + idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; + + clp->cl_idmap = idmap; + return 0; +} + +void +nfs_idmap_delete(struct nfs_client *clp) +{ + struct idmap *idmap = clp->cl_idmap; + + if (!idmap) + return; + rpc_unlink(idmap->idmap_dentry); + clp->cl_idmap = NULL; + kfree(idmap); +} + +/* + * Helper routines for manipulating the hashtable + */ +static inline struct idmap_hashent * +idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) +{ + return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; +} + +static struct idmap_hashent * +idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) +{ + struct idmap_hashent *he = idmap_name_hash(h, name, len); + + if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; + return he; +} + +static inline struct idmap_hashent * +idmap_id_hash(struct idmap_hashtable* h, __u32 id) +{ + return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; +} + +static struct idmap_hashent * +idmap_lookup_id(struct idmap_hashtable *h, __u32 id) +{ + struct idmap_hashent *he = idmap_id_hash(h, id); + if (he->ih_id != id || he->ih_namelen == 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; + return he; +} + +/* + * Routines for allocating new entries in the hashtable. + * For now, we just have 1 entry per bucket, so it's all + * pretty trivial. 
+ */ +static inline struct idmap_hashent * +idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) +{ + return idmap_name_hash(h, name, len); +} + +static inline struct idmap_hashent * +idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +{ + return idmap_id_hash(h, id); +} + +static void +idmap_update_entry(struct idmap_hashent *he, const char *name, + size_t namelen, __u32 id) +{ + he->ih_id = id; + memcpy(he->ih_name, name, namelen); + he->ih_name[namelen] = '\0'; + he->ih_namelen = namelen; + he->ih_expires = jiffies + nfs_idmap_cache_timeout; +} + +/* + * Name -> ID + */ +static int +nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, + const char *name, size_t namelen, __u32 *id) +{ + struct rpc_pipe_msg msg; + struct idmap_msg *im; + struct idmap_hashent *he; + DECLARE_WAITQUEUE(wq, current); + int ret = -EIO; + + im = &idmap->idmap_im; + + /* + * String sanity checks + * Note that the userland daemon expects NUL terminated strings + */ + for (;;) { + if (namelen == 0) + return -EINVAL; + if (name[namelen-1] != '\0') + break; + namelen--; + } + if (namelen >= IDMAP_NAMESZ) + return -EINVAL; + + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); + + he = idmap_lookup_name(h, name, namelen); + if (he != NULL) { + *id = he->ih_id; + ret = 0; + goto out; + } + + memset(im, 0, sizeof(*im)); + memcpy(im->im_name, name, namelen); + + im->im_type = h->h_type; + im->im_conv = IDMAP_CONV_NAMETOID; + + memset(&msg, 0, sizeof(msg)); + msg.data = im; + msg.len = sizeof(*im); + + add_wait_queue(&idmap->idmap_wq, &wq); + if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { + remove_wait_queue(&idmap->idmap_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&idmap->idmap_im_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&idmap->idmap_wq, &wq); + mutex_lock(&idmap->idmap_im_lock); + + if (im->im_status & IDMAP_STATUS_SUCCESS) { + *id = im->im_id; + ret = 0; + } + + out: + memset(im, 0, sizeof(*im)); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); + return ret; +} + +/* + * ID -> Name + */ +static int +nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, + __u32 id, char *name) +{ + struct rpc_pipe_msg msg; + struct idmap_msg *im; + struct idmap_hashent *he; + DECLARE_WAITQUEUE(wq, current); + int ret = -EIO; + unsigned int len; + + im = &idmap->idmap_im; + + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); + + he = idmap_lookup_id(h, id); + if (he) { + memcpy(name, he->ih_name, he->ih_namelen); + ret = he->ih_namelen; + goto out; + } + + memset(im, 0, sizeof(*im)); + im->im_type = h->h_type; + im->im_conv = IDMAP_CONV_IDTONAME; + im->im_id = id; + + memset(&msg, 0, sizeof(msg)); + msg.data = im; + msg.len = sizeof(*im); + + add_wait_queue(&idmap->idmap_wq, &wq); + + if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { + remove_wait_queue(&idmap->idmap_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&idmap->idmap_im_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&idmap->idmap_wq, &wq); + mutex_lock(&idmap->idmap_im_lock); + + if (im->im_status & IDMAP_STATUS_SUCCESS) { + if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) + goto out; + memcpy(name, im->im_name, len); + ret = len; + } + + out: + memset(im, 0, sizeof(*im)); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); + return ret; +} + +/* RPC pipefs upcall/downcall 
routines */ +static ssize_t +idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + size_t mlen = min(msg->len, buflen); + unsigned long left; + + left = copy_to_user(dst, data, mlen); + if (left == mlen) { + msg->errno = -EFAULT; + return -EFAULT; + } + + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} + +static ssize_t +idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); + struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_msg im_in, *im = &idmap->idmap_im; + struct idmap_hashtable *h; + struct idmap_hashent *he = NULL; + size_t namelen_in; + int ret; + + if (mlen != sizeof(im_in)) + return -ENOSPC; + + if (copy_from_user(&im_in, src, mlen) != 0) + return -EFAULT; + + mutex_lock(&idmap->idmap_im_lock); + + ret = mlen; + im->im_status = im_in.im_status; + /* If we got an error, terminate now, and wake up pending upcalls */ + if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { + wake_up(&idmap->idmap_wq); + goto out; + } + + /* Sanity checking of strings */ + ret = -EINVAL; + namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) + goto out; + + switch (im_in.im_type) { + case IDMAP_TYPE_USER: + h = &idmap->idmap_user_hash; + break; + case IDMAP_TYPE_GROUP: + h = &idmap->idmap_group_hash; + break; + default: + goto out; + } + + switch (im_in.im_conv) { + case IDMAP_CONV_IDTONAME: + /* Did we match the current upcall? */ + if (im->im_conv == IDMAP_CONV_IDTONAME + && im->im_type == im_in.im_type + && im->im_id == im_in.im_id) { + /* Yes: copy string, including the terminating '\0' */ + memcpy(im->im_name, im_in.im_name, namelen_in); + im->im_name[namelen_in] = '\0'; + wake_up(&idmap->idmap_wq); + } + he = idmap_alloc_id(h, im_in.im_id); + break; + case IDMAP_CONV_NAMETOID: + /* Did we match the current upcall? 
*/ + if (im->im_conv == IDMAP_CONV_NAMETOID + && im->im_type == im_in.im_type + && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in + && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { + im->im_id = im_in.im_id; + wake_up(&idmap->idmap_wq); + } + he = idmap_alloc_name(h, im_in.im_name, namelen_in); + break; + default: + goto out; + } + + /* If the entry is valid, also copy it to the cache */ + if (he != NULL) + idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); + ret = mlen; +out: + mutex_unlock(&idmap->idmap_im_lock); + return ret; +} + +static void +idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct idmap_msg *im = msg->data; + struct idmap *idmap = container_of(im, struct idmap, idmap_im); + + if (msg->errno >= 0) + return; + mutex_lock(&idmap->idmap_im_lock); + im->im_status = IDMAP_STATUS_LOOKUPFAIL; + wake_up(&idmap->idmap_wq); + mutex_unlock(&idmap->idmap_im_lock); +} + +/* + * Fowler/Noll/Vo hash + * http://www.isthe.com/chongo/tech/comp/fnv/ + */ + +#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ +#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */ + +static unsigned int fnvhash32(const void *buf, size_t buflen) +{ + const unsigned char *p, *end = (const unsigned char *)buf + buflen; + unsigned int hash = FNV_1_32; + + for (p = buf; p < end; p++) { + hash *= FNV_P_32; + hash ^= (unsigned int)*p; + } + + return hash; +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); +} + +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} + +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c new file mode 100644 index 00000000..c48f9f6a --- /dev/null +++ b/fs/nfs/inode.c @@ -0,0 +1,1656 @@ +/* + * linux/fs/nfs/inode.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs inode and superblock handling functions + * + * Modularised by Alan Cox , while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. 
+ * J.S.Peatfield@damtp.cam.ac.uk + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "dns_resolve.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 + +/* Default is to see 64-bit inode numbers */ +static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; + +static void nfs_invalidate_inode(struct inode *); +static int nfs_update_inode(struct inode *, struct nfs_fattr *); + +static struct kmem_cache * nfs_inode_cachep; + +static inline unsigned long +nfs_fattr_to_ino_t(struct nfs_fattr *fattr) +{ + return nfs_fileid_to_ino_t(fattr->fileid); +} + +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +/** + * nfs_compat_user_ino64 - returns the user-visible inode number + * @fileid: 64-bit fileid + * + * This function returns a 32-bit inode number if the boot parameter + * nfs.enable_ino64 is zero. + */ +u64 nfs_compat_user_ino64(u64 fileid) +{ +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif + + if (enable_ino64) + return fileid; + ino = fileid; + if (sizeof(ino) < sizeof(fileid)) + ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; + return ino; +} + +static void nfs_clear_inode(struct inode *inode) +{ + /* + * The following should never happen... 
+ */ + BUG_ON(nfs_have_writebacks(inode)); + BUG_ON(!list_empty(&NFS_I(inode)->open_files)); + nfs_zap_acl_cache(inode); + nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); +} + +void nfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + nfs_clear_inode(inode); +} + +/** + * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + */ +int nfs_sync_mapping(struct address_space *mapping) +{ + int ret = 0; + + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } + return ret; +} + +/* + * Invalidate the local caches + */ +static void nfs_zap_caches_locked(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int mode = inode->i_mode; + + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = jiffies; + + memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; +} + +void nfs_zap_caches(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_zap_caches_locked(inode); + spin_unlock(&inode->i_lock); +} + +void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) +{ + if (mapping->nrpages != 0) { + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + } +} + +void nfs_zap_acl_cache(struct inode *inode) +{ + void (*clear_acl_cache)(struct inode *); + + clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; + if (clear_acl_cache != NULL) + clear_acl_cache(inode); + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); +} + +void nfs_invalidate_atime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&inode->i_lock); +} + +/* + * Invalidate, but do not unhash, the inode. + * NB: must be called with inode->i_lock held! + */ +static void nfs_invalidate_inode(struct inode *inode) +{ + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_zap_caches_locked(inode); +} + +struct nfs_find_desc { + struct nfs_fh *fh; + struct nfs_fattr *fattr; +}; + +/* + * In NFSv3 we can have 64bit inode numbers. In order to support + * this, and re-exported directories (also seen in NFSv2) + * we are forced to allow 2 different inodes to have the same + * i_ino. + */ +static int +nfs_find_actor(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fh *fh = desc->fh; + struct nfs_fattr *fattr = desc->fattr; + + if (NFS_FILEID(inode) != fattr->fileid) + return 0; + if (nfs_compare_fh(NFS_FH(inode), fh)) + return 0; + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + return 1; +} + +static int +nfs_init_locked(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fattr *fattr = desc->fattr; + + set_nfs_fileid(inode, fattr->fileid); + nfs_copy_fh(NFS_FH(inode), desc->fh); + return 0; +} + +/* + * This is our front-end to iget that looks up inodes by file handle + * instead of inode number. 
+ */ +struct inode * +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + struct nfs_find_desc desc = { + .fh = fh, + .fattr = fattr + }; + struct inode *inode = ERR_PTR(-ENOENT); + unsigned long hash; + + nfs_attr_check_mountpoint(sb, fattr); + + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) + goto out_no_inode; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) + goto out_no_inode; + + hash = nfs_fattr_to_ino_t(fattr); + + inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + goto out_no_inode; + } + + if (inode->i_state & I_NEW) { + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; + + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ + inode->i_ino = hash; + + /* We can't support update_atime(), since the server will reset it */ + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; + inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + /* Deal with crossing mountpoints */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + inode->i_op = &nfs_referral_inode_operations; + else + inode->i_op = &nfs_mountpoint_inode_operations; + inode->i_fop = NULL; + inode->i_flags |= S_AUTOMOUNT; + } + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + nfsi->change_attr = 0; + inode->i_size = 0; + inode->i_nlink = 0; + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + + nfsi->read_cache_jiffies = fattr->time_start; + nfsi->attr_gencount = fattr->gencount; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + nfsi->change_attr = fattr->change_attr; + else if 
(nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + inode->i_nlink = fattr->nlink; + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->access_cache = RB_ROOT; + + nfs_fscache_init_inode_cookie(inode); + + unlock_new_inode(inode); + } else + nfs_refresh_inode(inode, fattr); + dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + atomic_read(&inode->i_count)); + +out: + return inode; + +out_no_inode: + dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); + goto out; +} + +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) + +int +nfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct nfs_fattr *fattr; + int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + attr->ia_valid &= ~ATTR_SIZE; + } + + /* Optimization: if the end result is no change, don't RPC */ + attr->ia_valid &= NFS_VALID_ATTRS; + if ((attr->ia_valid & ~ATTR_FILE) == 0) + return 0; + + /* Write all dirty data */ + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); + error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + return error; +} + +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. 
+ */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + loff_t oldsize; + int err; + + err = inode_newsize_ok(inode, offset); + if (err) + goto out; + + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +out: + return err; +} + +/** + * nfs_setattr_update_inode - Update inode metadata after a setattr call. + * @inode: pointer to struct inode + * @attr: pointer to struct iattr + * + * Note: we do this in the *proc.c in order to ensure that + * it works for things like exclusive creates too. + */ +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +{ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); + if ((attr->ia_valid & ATTR_MODE) != 0) { + int mode = attr->ia_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + } + if ((attr->ia_valid & ATTR_UID) != 0) + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); + } + if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); + nfs_vmtruncate(inode, attr->ia_size); + } +} + +int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + int err; + + /* Flush out writes to the server in order to update c/mtime. */ + if (S_ISREG(inode->i_mode)) { + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto out; + } + + /* + * We may force a getattr if the user cares about atime. + * + * Note that we only have to check the vfsmount flags here: + * - NFS always sets S_NOATIME by so checking it would give a + * bogus result + * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * no point in checking those. 
+ */ + if ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + need_atime = 0; + + if (need_atime) + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + } +out: + return err; +} + +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->path.dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->path.dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + +struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) +{ + struct nfs_open_context *ctx; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->path = *path; + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); + } + return ctx; +} + +struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) +{ + if (ctx != NULL) + atomic_inc(&ctx->lock_context.count); + return ctx; +} + +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode = ctx->path.dentry->d_inode; + + if (!list_empty(&ctx->list)) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + 
spin_unlock(&inode->i_lock); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + return; + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); + if (ctx->cred != NULL) + put_rpccred(ctx->cred); + path_put(&ctx->path); + kfree(ctx); +} + +void put_nfs_open_context(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 0); +} + +/* + * Ensure that mmap has a recent RPC credential for use when writing out + * shared pages + */ +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + filp->private_data = get_nfs_open_context(ctx); + spin_lock(&inode->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&inode->i_lock); +} + +/* + * Given an inode, search for an open context with the desired characteristics + */ +struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *pos, *ctx = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &nfsi->open_files, list) { + if (cred != NULL && pos->cred != cred) + continue; + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; + } + spin_unlock(&inode->i_lock); + return ctx; +} + +static void nfs_file_clear_open_context(struct file *filp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + + if (ctx) { + filp->private_data = NULL; + spin_lock(&inode->i_lock); + list_move_tail(&ctx->list, &NFS_I(inode)->open_files); + spin_unlock(&inode->i_lock); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + } +} + +/* + * These allocate and release file read/write context information. + */ +int nfs_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + struct rpc_cred *cred; + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); + put_rpccred(cred); + if (ctx == NULL) + return -ENOMEM; + nfs_file_set_open_context(filp, ctx); + put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +int nfs_release(struct inode *inode, struct file *filp) +{ + nfs_file_clear_open_context(filp); + return 0; +} + +/* + * This function is called whenever some part of NFS notices that + * the cached attributes have to be refreshed. 
+ */ +int +__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + int status = -ESTALE; + struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + + if (is_bad_inode(inode)) + goto out; + if (NFS_STALE(inode)) + goto out; + + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + goto out; + } + + status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + goto out; + } + + if (nfsi->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + + dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + + out: + nfs_free_fattr(fattr); + return status; +} + +int nfs_attribute_timeout(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +static int nfs_attribute_cache_expired(struct inode *inode) +{ + if (nfs_have_delegated_attributes(inode)) + return 0; + return nfs_attribute_timeout(inode); +} + +/** + * nfs_revalidate_inode - Revalidate the inode attributes + * @server - pointer to nfs_server struct + * @inode - pointer to inode struct + * + * Updates inode attribute information by retrieving the data from the server. + */ +int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) + && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? 
-ESTALE : 0; + return __nfs_revalidate_inode(server, inode); +} + +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (mapping->nrpages != 0) { + int ret = invalidate_inode_pages2(mapping); + if (ret < 0) + return ret; + } + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + spin_unlock(&inode->i_lock); + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + return 0; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_cache_expired(inode) + || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + ret = nfs_invalidate_mapping(inode, mapping); +out: + return ret; +} + +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long ret = 0; + + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && nfsi->change_attr == fattr->pre_change_attr) { + nfsi->change_attr = fattr->change_attr; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; + } + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; + } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + return ret; +} + +/** + * nfs_check_inode_attributes - verify consistency of the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Verifies the attribute cache. If we have just changed the attributes, + * so that fattr carries weak cache consistency data, then it may + * also update the ctime/mtime/change_attribute. + */ +static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_size, new_isize; + unsigned long invalid = 0; + + + /* Has the inode gone and changed behind our back? 
*/ + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + return -EIO; + + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + nfsi->change_attr != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } + + /* Have any file permissions changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + + /* Has the link count changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + invalid |= NFS_INO_INVALID_ATIME; + + if (invalid != 0) + nfsi->cache_validity |= invalid; + + nfsi->read_cache_jiffies = fattr->time_start; + return 0; +} + +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static atomic_long_t nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + return atomic_long_read(&nfs_attr_generation_counter); +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + return atomic_long_inc_return(&nfs_attr_generation_counter); +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +struct nfs_fattr *nfs_alloc_fattr(void) +{ + struct nfs_fattr *fattr; + + fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + if (fattr != NULL) + nfs_fattr_init(fattr); + return fattr; +} + +struct nfs_fh *nfs_alloc_fhandle(void) +{ + struct nfs_fh *fh; + + fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + if (fh != NULL) + fh->size = 0; + return fh; +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. 
+ * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + +/** + * nfs_refresh_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Check that an RPC call that returned attributes has not overlapped with + * other recent updates of the inode metadata, then decide whether it is + * safe to do a full update of the inode attributes, or whether just to + * call nfs_check_inode_attributes. + */ +int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + spin_lock(&inode->i_lock); + status = nfs_refresh_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + + return status; +} + +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + +/** + * nfs_post_op_update_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. + * + * NB: if the server didn't return any post op attributes, this + * function will force the retrieval of attributes before the next + * NFS request. Thus it should be used only for operations that + * are expected to change one or more attributes, to avoid + * unnecessary NFS requests and trips through nfs_update_inode(). + */ +int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. 
+ */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); + goto out_noforce; + } + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { + memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { + memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { + fattr->pre_size = i_size_read(inode); + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + } +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} + +/* + * Many nfs protocol calls return the new file attributes after + * an operation. Here we update the inode to reflect the state + * of the server's inode. + * + * This is a bit tricky because we have to make sure all dirty pages + * have been sent off to the server before calling invalidate_inode_pages. + * To make sure no other process adds more write requests while we try + * our best to flush them, we make them sleep during the attribute refresh. + * + * A very similar scenario holds for the dir cache. + */ +static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_isize, new_isize; + unsigned long invalid = 0; + unsigned long now = jiffies; + unsigned long save_cache_validity; + + dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, + atomic_read(&inode->i_count), fattr->valid); + + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + goto out_fileid; + + /* + * Make sure the inode's type hasn't changed. + */ + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + goto out_changed; + + server = NFS_SERVER(inode); + /* Update the fsid? */ + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && + !nfs_fsid_equal(&server->fsid, &fattr->fsid) && + !IS_AUTOMOUNT(inode)) + server->fsid = fattr->fsid; + + /* + * Update the read time so we don't revalidate too often. 
+ */ + nfsi->read_cache_jiffies = fattr->time_start; + + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); + + /* Do atomic weak cache consistency updates */ + invalid |= nfs_wcc_update_inode(inode, fattr); + + /* More cache consistency checks */ + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + nfsi->change_attr = fattr->change_attr; + } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + /* NFSv2/v3: Check if the mtime agrees */ + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { + dprintk("NFS: mtime change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + /* If ctime has changed we should definitely clear access+acl caches */ + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + /* and probably clear data for a directory too as utimes can cause + * havoc with our cache. + */ + if (S_ISDIR(inode->i_mode)) { + invalid |= NFS_INO_INVALID_DATA; + nfs_force_lookup_revalidate(inode); + } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + /* Check if our cached file size is stale */ + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || + new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld " + "(%Ld to %Ld)\n", + inode->i_sb->s_id, + inode->i_ino, + (long long)cur_isize, + (long long)new_isize); + } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + inode->i_nlink = fattr->nlink; + } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + + /* Update attrtimeo value if we're out of the unstable period */ + if (invalid & NFS_INO_INVALID_ATTR) { + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + } else { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + } + } + invalid &= ~NFS_INO_INVALID_ATTR; + /* Don't invalidate the data if we were to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + invalid &= ~NFS_INO_INVALID_DATA; + if (!nfs_have_delegation(inode, FMODE_READ) || + (save_cache_validity & NFS_INO_REVAL_FORCED)) + nfsi->cache_validity |= invalid; + + return 0; + out_changed: + /* + * Big trouble! 
The inode has become a different object. + */ + printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", + __func__, inode->i_ino, inode->i_mode, fattr->mode); + out_err: + /* + * No need to worry about unhashing the dentry, as the + * lookup validation will know that the inode is bad. + * (But we fall through to invalidate the caches.) + */ + nfs_invalidate_inode(inode); + return -ESTALE; + + out_fileid: + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, + (long long)nfsi->fileid, (long long)fattr->fileid); + goto out_err; +} + + +#ifdef CONFIG_NFS_V4 + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +void nfs4_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); +} +#endif + +struct inode *nfs_alloc_inode(struct super_block *sb) +{ + struct nfs_inode *nfsi; + nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + if (!nfsi) + return NULL; + nfsi->flags = 0UL; + nfsi->cache_validity = 0UL; +#ifdef CONFIG_NFS_V3_ACL + nfsi->acl_access = ERR_PTR(-EAGAIN); + nfsi->acl_default = ERR_PTR(-EAGAIN); +#endif +#ifdef CONFIG_NFS_V4 + nfsi->nfs4_acl = NULL; +#endif /* CONFIG_NFS_V4 */ + return &nfsi->vfs_inode; +} + +static void nfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); +} + +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + +static inline void nfs4_init_once(struct nfs_inode *nfsi) +{ +#ifdef CONFIG_NFS_V4 + INIT_LIST_HEAD(&nfsi->open_states); + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; + atomic_set(&nfsi->commits_outstanding, 0); +#endif +} + +static void init_once(void *foo) +{ + struct nfs_inode *nfsi = (struct nfs_inode *) foo; + + inode_init_once(&nfsi->vfs_inode); + INIT_LIST_HEAD(&nfsi->open_files); + INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); + INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); + INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + nfsi->npages = 0; + nfsi->ncommit = 0; + atomic_set(&nfsi->silly_count, 1); + INIT_HLIST_HEAD(&nfsi->silly_list); + init_waitqueue_head(&nfsi->waitqueue); + nfs4_init_once(nfsi); +} + +static int __init nfs_init_inodecache(void) +{ + nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", + sizeof(struct nfs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (nfs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void nfs_destroy_inodecache(void) +{ + kmem_cache_destroy(nfs_inode_cachep); +} + +struct workqueue_struct *nfsiod_workqueue; + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct 
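/*
 * A minimal, hypothetical sketch of the slab-cache lifecycle used by
 * nfs_init_inodecache()/nfs_alloc_inode() above (the "example_" names are
 * made up for illustration; only the kmem_cache_* calls are real API):
 *
 *	cache = kmem_cache_create("example_cache", sizeof(struct example_obj),
 *				  0, SLAB_RECLAIM_ACCOUNT, example_init_once);
 *	obj = kmem_cache_alloc(cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(cache, obj);
 *	kmem_cache_destroy(cache);
 *
 * The constructor (init_once() here) runs when a slab object is first set
 * up, not on every allocation, which is why per-allocation state such as
 * flags, cache_validity and the ACL pointers is reset in nfs_alloc_inode()
 * rather than in init_once().
 */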
workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + +/* + * Initialize NFS + */ +static int __init init_nfs_fs(void) +{ + int err; + + err = nfs_idmap_init(); + if (err < 0) + goto out9; + + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + + err = nfs_fscache_register(); + if (err < 0) + goto out7; + + err = nfsiod_start(); + if (err) + goto out6; + + err = nfs_fs_proc_init(); + if (err) + goto out5; + + err = nfs_init_nfspagecache(); + if (err) + goto out4; + + err = nfs_init_inodecache(); + if (err) + goto out3; + + err = nfs_init_readpagecache(); + if (err) + goto out2; + + err = nfs_init_writepagecache(); + if (err) + goto out1; + + err = nfs_init_directcache(); + if (err) + goto out0; + +#ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); +#endif + if ((err = register_nfs_fs()) != 0) + goto out; + return 0; +out: +#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); +#endif + nfs_destroy_directcache(); +out0: + nfs_destroy_writepagecache(); +out1: + nfs_destroy_readpagecache(); +out2: + nfs_destroy_inodecache(); +out3: + nfs_destroy_nfspagecache(); +out4: + nfs_fs_proc_exit(); +out5: + nfsiod_stop(); +out6: + nfs_fscache_unregister(); +out7: + nfs_dns_resolver_destroy(); +out8: + nfs_idmap_quit(); +out9: + return err; +} + +static void __exit exit_nfs_fs(void) +{ + nfs_destroy_directcache(); + nfs_destroy_writepagecache(); + nfs_destroy_readpagecache(); + nfs_destroy_inodecache(); + nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); + nfs_idmap_quit(); +#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); +#endif + nfs_cleanup_cb_ident_idr(); + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +} + +/* Not quite true; I just maintain it */ +MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); +module_param(enable_ino64, bool, 0644); + +module_init(init_nfs_fs) +module_exit(exit_nfs_fs) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h new file mode 100644 index 00000000..2a55347a --- /dev/null +++ b/fs/nfs/internal.h @@ -0,0 +1,456 @@ +/* + * NFS internal definitions + */ + +#include "nfs4_fs.h" +#include +#include + +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + +struct nfs_string; + +/* Maximum number of readahead requests + * FIXME: this should really be a sysctl so that users may tune it to suit + * their needs. People that do NFS over a slow network, might for + * instance want to reduce it to something closer to 1 for improved + * interactive response. + */ +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) + +/* + * Determine if sessions are in use. 
+ */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (clp->cl_session) + return 1; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + +struct nfs_clone_mount { + const struct super_block *sb; + const struct dentry *dentry; + struct nfs_fh *fh; + struct nfs_fattr *fattr; + char *hostname; + char *mnt_path; + struct sockaddr *addr; + size_t addrlen; + rpc_authflavor_t authflavor; +}; + +/* + * Note: RFC 1813 doesn't limit the number of auth flavors that + * a server can return, so make something up. + */ +#define NFS_MAX_SECFLAVORS (12) + +/* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + +/* + * In-kernel mount arguments + */ +struct nfs_parsed_mount_data { + int flags; + int rsize, wsize; + int timeo, retrans; + int acregmin, acregmax, + acdirmin, acdirmax; + int namlen; + unsigned int options; + unsigned int bsize; + unsigned int auth_flavor_len; + rpc_authflavor_t auth_flavors[1]; + char *client_address; + unsigned int version; + unsigned int minorversion; + char *fscache_uniq; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + u32 version; + int port; + unsigned short protocol; + } mount_server; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + char *export_path; + int port; + unsigned short protocol; + } nfs_server; + + struct security_mnt_opts lsm_opts; +}; + +/* mount_clnt.c */ +struct nfs_mount_request { + struct sockaddr *sap; + size_t salen; + char *hostname; + char *dirpath; + u32 version; + unsigned short protocol; + struct nfs_fh *fh; + int noresvport; + unsigned int *auth_flav_len; + rpc_authflavor_t *auth_flavs; +}; + +extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); + +/* client.c */ +extern struct rpc_program nfs_program; + +extern void nfs_cleanup_cb_ident_idr(void); +extern void nfs_put_client(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); +extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); +extern struct nfs_server *nfs_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, + struct nfs_fh *); +extern void nfs_free_server(struct nfs_server *server); +extern struct nfs_server 
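/*
 * Worked example for nfs_attr_use_mounted_on_fileid() above: the fileid
 * substitution happens only when the server supplied a mounted-on fileid
 * AND the object was flagged as a mountpoint or an NFSv4 referral
 * (illustrative truth table, not new behaviour):
 *
 *	MOUNTED_ON_FILEID	MOUNTPOINT or V4_REFERRAL	result
 *	not set			(any)				0, fattr->fileid kept
 *	set			neither				0, fattr->fileid kept
 *	set			at least one			1, fileid = mounted_on_fileid
 */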
*nfs_clone_server(struct nfs_server *, + struct nfs_fh *, + struct nfs_fattr *); +extern void nfs_mark_client_ready(struct nfs_client *clp, int state); +extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto); +#ifdef CONFIG_PROC_FS +extern int __init nfs_fs_proc_init(void); +extern void nfs_fs_proc_exit(void); +#else +static inline int nfs_fs_proc_init(void) +{ + return 0; +} +static inline void nfs_fs_proc_exit(void) +{ +} +#endif + +/* nfs4namespace.c */ +#ifdef CONFIG_NFS_V4 +extern struct vfsmount *nfs_do_refmount(struct dentry *dentry); +#else +static inline +struct vfsmount *nfs_do_refmount(struct dentry *dentry) +{ + return ERR_PTR(-ENOENT); +} +#endif + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +/* pagelist.c */ +extern int __init nfs_init_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); +extern int __init nfs_init_readpagecache(void); +extern void nfs_destroy_readpagecache(void); +extern int __init nfs_init_writepagecache(void); +extern void nfs_destroy_writepagecache(void); + +extern int __init nfs_init_directcache(void); +extern void nfs_destroy_directcache(void); + +/* nfs2xdr.c */ +extern int nfs_stat_to_errno(enum nfs_stat); +extern struct rpc_procinfo nfs_procedures[]; +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs3xdr.c */ +extern struct rpc_procinfo nfs3_procedures[]; +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs4xdr.c */ +#ifdef CONFIG_NFS_V4 +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); +#endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +#endif + +/* nfs4proc.c */ +#ifdef CONFIG_NFS_V4 +extern struct rpc_procinfo nfs4_procedures[]; +void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *); +#endif + +extern int nfs4_init_ds_session(struct nfs_client *clp); + +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern int nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport); + +/* dir.c */ +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc); + +/* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; +extern struct inode *nfs_alloc_inode(struct super_block *sb); +extern void nfs_destroy_inode(struct inode *); +extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern void nfs_evict_inode(struct inode *); +#ifdef CONFIG_NFS_V4 +extern void nfs4_evict_inode(struct inode *); +#endif +void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); + +/* super.c */ +extern struct file_system_type nfs_xdev_fs_type; +#ifdef CONFIG_NFS_V4 +extern struct file_system_type nfs4_xdev_fs_type; +extern struct file_system_type nfs4_referral_fs_type; +#endif + +extern struct rpc_stat nfs_rpcstat; + +extern int __init register_nfs_fs(void); +extern void __exit unregister_nfs_fs(void); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); + +/* namespace.c */ +extern char *nfs_path(char **p, struct dentry *dentry, + char *buffer, ssize_t buflen); +extern struct vfsmount 
*nfs_d_automount(struct path *path); + +/* getroot.c */ +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, + const char *); +#ifdef CONFIG_NFS_V4 +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, + const char *); + +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); +#endif + +/* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); +extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + +/* write.c */ +extern void nfs_commit_free(struct nfs_write_data *p); +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg); +void nfs_commit_clear_lock(struct nfs_inode *nfsi); +void nfs_commitdata_release(void *data); +void nfs_commit_release_pages(struct nfs_write_data *data); + +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +#else +#define nfs_migrate_page NULL +#endif + +/* nfs4proc.c */ +extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); +extern int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport); +extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); +extern int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); +extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + +/* + * Determine the device name as a string + */ +static inline char *nfs_devname(struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + char *dummy; + return nfs_path(&dummy, dentry, buffer, buflen); +} + +/* + * Determine the actual block size (and log2 thereof) + */ +static inline +unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) +{ + /* make sure blocksize is a power of two */ + if ((bsize & (bsize - 1)) || nrbitsp) { + unsigned char nrbits; + + for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + ; + bsize = 1 << nrbits; + if (nrbitsp) + *nrbitsp = nrbits; + } + + return bsize; +} + +/* + * Calculate the number of 512byte blocks used. + */ +static inline blkcnt_t nfs_calc_block_size(u64 tsize) +{ + blkcnt_t used = (tsize + 511) >> 9; + return (used > ULONG_MAX) ? 
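/*
 * Worked example for the size helpers above, which account space in
 * 512-byte units (the numbers are illustrative, not taken from the code):
 * tsize = 1000 bytes gives (1000 + 511) >> 9 = 2 blocks, and tsize = 0
 * gives 0 blocks. nfs_block_bits() rounds a non-power-of-two blocksize
 * down, so a server-advertised bsize of 12000 becomes 8192 (nrbits = 13).
 */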
ULONG_MAX : used; +} + +/* + * Compute and set NFS server blocksize + */ +static inline +unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) +{ + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; + + return nfs_block_bits(bsize, nrbitsp); +} + +/* + * Determine the maximum file size for a superblock + */ +static inline +void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) +{ + sb->s_maxbytes = (loff_t)maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) + sb->s_maxbytes = MAX_LFS_FILESIZE; +} + +/* + * Determine the number of bytes of data the page contains + */ +static inline +unsigned int nfs_page_length(struct page *page) +{ + loff_t i_size = i_size_read(page->mapping->host); + + if (i_size > 0) { + pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (page->index < end_index) + return PAGE_CACHE_SIZE; + if (page->index == end_index) + return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; + } + return 0; +} + +/* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* + * Determine the number of pages in an array of length 'len' and + * with a base offset of 'base' + */ +static inline +unsigned int nfs_page_array_len(unsigned int base, size_t len) +{ + return ((unsigned long)len + (unsigned long)base + + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +/* + * Helper for restarting RPC calls in the possible presence of NFSv4.1 + * sessions. + */ +static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return rpc_restart_call_prepare(task); + return rpc_restart_call(task); +} diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h new file mode 100644 index 00000000..c5832487 --- /dev/null +++ b/fs/nfs/iostat.h @@ -0,0 +1,71 @@ +/* + * linux/fs/nfs/iostat.h + * + * Declarations for NFS client per-mount statistics + * + * Copyright (C) 2005, 2006 Chuck Lever + * + */ + +#ifndef _NFS_IOSTAT +#define _NFS_IOSTAT + +#include +#include +#include + +struct nfs_iostats { + unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif + unsigned long events[__NFSIOS_COUNTSMAX]; +} ____cacheline_aligned; + +static inline void nfs_inc_server_stats(const struct nfs_server *server, + enum nfs_stat_eventcounters stat) +{ + this_cpu_inc(server->io_stats->events[stat]); +} + +static inline void nfs_inc_stats(const struct inode *inode, + enum nfs_stat_eventcounters stat) +{ + nfs_inc_server_stats(NFS_SERVER(inode), stat); +} + +static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, + long addend) +{ + this_cpu_add(server->io_stats->bytes[stat], addend); +} + +static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, + long addend) +{ + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); +} + +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + long addend) +{ + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); +} +#endif + +static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) +{ + return alloc_percpu(struct nfs_iostats); +} + +static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) +{ + if (stats != NULL) + free_percpu(stats); +} + +#endif /* 
_NFS_IOSTAT */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c new file mode 100644 index 00000000..d4c2d6b7 --- /dev/null +++ b/fs/nfs/mount_clnt.c @@ -0,0 +1,518 @@ +/* + * In-kernel MOUNT protocol client + * + * Copyright (C) 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef RPC_DEBUG +# define NFSDBG_FACILITY NFSDBG_MOUNT +#endif + +/* + * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4 + */ +#define MNTPATHLEN (1024) + +/* + * XDR data type sizes + */ +#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) +#define MNT_status_sz (1) +#define MNT_fhs_status_sz (1) +#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE)) +#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) + +/* + * XDR argument and result sizes + */ +#define MNT_enc_dirpath_sz encode_dirpath_sz +#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ + MNT_authflav3_sz) + +/* + * Defined by RFC 1094, section A.5 + */ +enum { + MOUNTPROC_NULL = 0, + MOUNTPROC_MNT = 1, + MOUNTPROC_DUMP = 2, + MOUNTPROC_UMNT = 3, + MOUNTPROC_UMNTALL = 4, + MOUNTPROC_EXPORT = 5, +}; + +/* + * Defined by RFC 1813, section 5.2 + */ +enum { + MOUNTPROC3_NULL = 0, + MOUNTPROC3_MNT = 1, + MOUNTPROC3_DUMP = 2, + MOUNTPROC3_UMNT = 3, + MOUNTPROC3_UMNTALL = 4, + MOUNTPROC3_EXPORT = 5, +}; + +static struct rpc_program mnt_program; + +/* + * Defined by OpenGroup XNFS Version 3W, chapter 8 + */ +enum mountstat { + MNT_OK = 0, + MNT_EPERM = 1, + MNT_ENOENT = 2, + MNT_EACCES = 13, + MNT_EINVAL = 22, +}; + +static struct { + u32 status; + int errno; +} mnt_errtbl[] = { + { .status = MNT_OK, .errno = 0, }, + { .status = MNT_EPERM, .errno = -EPERM, }, + { .status = MNT_ENOENT, .errno = -ENOENT, }, + { .status = MNT_EACCES, .errno = -EACCES, }, + { .status = MNT_EINVAL, .errno = -EINVAL, }, +}; + +/* + * Defined by RFC 1813, section 5.1.5 + */ +enum mountstat3 { + MNT3_OK = 0, /* no error */ + MNT3ERR_PERM = 1, /* Not owner */ + MNT3ERR_NOENT = 2, /* No such file or directory */ + MNT3ERR_IO = 5, /* I/O error */ + MNT3ERR_ACCES = 13, /* Permission denied */ + MNT3ERR_NOTDIR = 20, /* Not a directory */ + MNT3ERR_INVAL = 22, /* Invalid argument */ + MNT3ERR_NAMETOOLONG = 63, /* Filename too long */ + MNT3ERR_NOTSUPP = 10004, /* Operation not supported */ + MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */ +}; + +static struct { + u32 status; + int errno; +} mnt3_errtbl[] = { + { .status = MNT3_OK, .errno = 0, }, + { .status = MNT3ERR_PERM, .errno = -EPERM, }, + { .status = MNT3ERR_NOENT, .errno = -ENOENT, }, + { .status = MNT3ERR_IO, .errno = -EIO, }, + { .status = MNT3ERR_ACCES, .errno = -EACCES, }, + { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, }, + { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, + { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, + { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, + { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, +}; + +struct mountres { + int errno; + struct nfs_fh *fh; + unsigned int *auth_count; + rpc_authflavor_t *auth_flavors; +}; + +struct mnt_fhstatus { + u32 status; + struct nfs_fh *fh; +}; + +/** + * nfs_mount - Obtain an NFS file handle for the given host and path + * @info: pointer to mount request arguments + * + * Uses default timeout parameters specified by underlying transport. 
+ */ +int nfs_mount(struct nfs_mount_request *info) +{ + struct mountres result = { + .fh = info->fh, + .auth_count = info->auth_flav_len, + .auth_flavors = info->auth_flavs, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = info->protocol, + .address = info->sap, + .addrsize = info->salen, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + }; + struct rpc_clnt *mnt_clnt; + int status; + + dprintk("NFS: sending MNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), + info->dirpath); + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + mnt_clnt = rpc_create(&args); + if (IS_ERR(mnt_clnt)) + goto out_clnt_err; + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; + else + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; + + status = rpc_call_sync(mnt_clnt, &msg, 0); + rpc_shutdown_client(mnt_clnt); + + if (status < 0) + goto out_call_err; + if (result.errno != 0) + goto out_mnt_err; + + dprintk("NFS: MNT request succeeded\n"); + status = 0; + +out: + return status; + +out_clnt_err: + status = PTR_ERR(mnt_clnt); + dprintk("NFS: failed to create MNT RPC client, status=%d\n", status); + goto out; + +out_call_err: + dprintk("NFS: MNT request failed, status=%d\n", status); + goto out; + +out_mnt_err: + dprintk("NFS: MNT server returned result %d\n", result.errno); + status = result.errno; + goto out; +} + +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? 
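/*
 * A hypothetical sketch of how a caller might fill in struct
 * nfs_mount_request before calling nfs_mount() above. The "data", "mntfh",
 * "server_authlist" and "server_authlist_len" variables are made up for
 * illustration; the field names come from the definitions in internal.h:
 *
 *	struct nfs_mount_request request = {
 *		.sap		= (struct sockaddr *)&data->mount_server.address,
 *		.salen		= data->mount_server.addrlen,
 *		.dirpath	= data->nfs_server.export_path,
 *		.hostname	= data->mount_server.hostname,
 *		.protocol	= data->mount_server.protocol,
 *		.version	= data->mount_server.version,
 *		.fh		= mntfh,
 *		.auth_flav_len	= &server_authlist_len,
 *		.auth_flavs	= server_authlist,
 *	};
 *	status = nfs_mount(&request);
 *
 * On success the server's file handle is returned through request.fh and,
 * for MNT version 3, the advertised auth flavors through request.auth_flavs.
 */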
info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + +/* + * XDR encode/decode functions for MOUNT + */ + +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +{ + const u32 pathname_len = strlen(pathname); + __be32 *p; + + BUG_ON(pathname_len > MNTPATHLEN); + p = xdr_reserve_space(xdr, 4 + pathname_len); + xdr_encode_opaque(p, pathname, pathname_len); +} + +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) +{ + encode_mntdirpath(xdr, dirpath); +} + +/* + * RFC 1094: "A non-zero status indicates some sort of error. In this + * case, the status is a UNIX error number." This can be problematic + * if the server and client use different errno values for the same + * error. + * + * However, the OpenGroup XNFS spec provides a simple mapping that is + * independent of local errno values on the server and the client. + */ +static int decode_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { + if (mnt_errtbl[i].status == status) { + res->errno = mnt_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +} + +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + return decode_fhandle(xdr, res); +} + +static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { + if (mnt3_errtbl[i].status == status) { + res->errno = mnt3_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT3 status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + u32 size; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + size = be32_to_cpup(p); + if (size > NFS3_FHSIZE || size == 0) + return -EIO; + + p = xdr_inline_decode(xdr, size); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = size; + memcpy(fh->data, p, size); + return 0; +} + +static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) +{ + rpc_authflavor_t *flavors = res->auth_flavors; + unsigned int *count = res->auth_count; + u32 entries, i; + __be32 *p; + + if (*count == 0) 
+ return 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + entries = be32_to_cpup(p); + dprintk("NFS: received %u auth flavors\n", entries); + if (entries > NFS_MAX_SECFLAVORS) + entries = NFS_MAX_SECFLAVORS; + + p = xdr_inline_decode(xdr, 4 * entries); + if (unlikely(p == NULL)) + return -EIO; + + if (entries > *count) + entries = *count; + + for (i = 0; i < entries; i++) { + flavors[i] = be32_to_cpup(p++); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); + } + *count = i; + + return 0; +} + +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_fhs_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + status = decode_fhandle3(xdr, res); + if (unlikely(status != 0)) { + res->errno = -EBADHANDLE; + return 0; + } + return decode_auth_flavors(xdr, res); +} + +static struct rpc_procinfo mnt_procedures[] = { + [MOUNTPROC_MNT] = { + .p_proc = MOUNTPROC_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres_sz, + .p_statidx = MOUNTPROC_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, +}; + +static struct rpc_procinfo mnt3_procedures[] = { + [MOUNTPROC3_MNT] = { + .p_proc = MOUNTPROC3_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres3_sz, + .p_statidx = MOUNTPROC3_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, +}; + + +static struct rpc_version mnt_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(mnt_procedures), + .procs = mnt_procedures, +}; + +static struct rpc_version mnt_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(mnt3_procedures), + .procs = mnt3_procedures, +}; + +static struct rpc_version *mnt_version[] = { + NULL, + &mnt_version1, + NULL, + &mnt_version3, +}; + +static struct rpc_stat mnt_stats; + +static struct rpc_program mnt_program = { + .name = "mount", + .number = NFS_MNT_PROGRAM, + .nrvers = ARRAY_SIZE(mnt_version), + .version = mnt_version, + .stats = &mnt_stats, +}; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c new file mode 100644 index 00000000..1f063bac --- /dev/null +++ b/fs/nfs/namespace.c @@ -0,0 +1,371 @@ +/* + * linux/fs/nfs/namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * - Modified by David Howells + * + * NFS namespace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static void nfs_expire_automounts(struct work_struct *work); + +static LIST_HEAD(nfs_automount_list); +static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); +int nfs_mountpoint_expiry_timeout = 500 * HZ; + +static struct vfsmount *nfs_do_submount(struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor); + +/* + * nfs_path - reconstruct the path given an arbitrary dentry + * @base - used to return pointer to the end of devname part of path + * @dentry - pointer to dentry + * @buffer 
- result buffer + * @buflen - length of buffer + * + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. + * + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. + */ +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *end; + int namelen; + unsigned seq; + const char *base; + +rename_retry: + end = buffer+buflen; + *--end = '\0'; + buflen--; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + while (1) { + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) + break; + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong_unlock; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + spin_unlock(&dentry->d_lock); + dentry = dentry->d_parent; + } + if (read_seqretry(&rename_lock, seq)) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + if (*end != '/') { + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + *--end = '/'; + } + *p = end; + base = dentry->d_fsdata; + if (!base) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + WARN_ON(1); + return end; + } + namelen = strlen(base); + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + buflen -= namelen; + if (buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + end -= namelen; + memcpy(end, base, namelen); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + return end; +Elong_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +#ifdef CONFIG_NFS_V4 +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + +static int nfs_negotiate_security(const struct dentry *parent, + const struct dentry *dentry, + rpc_authflavor_t *flavor) +{ + struct page *page; + struct nfs4_secinfo_flavors *flavors; + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); + int ret = -EPERM; + + secinfo = NFS_PROTO(parent->d_inode)->secinfo; + if (secinfo != NULL) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + flavors = page_address(page); + ret = secinfo(parent->d_inode, &dentry->d_name, flavors); + *flavor = nfs_find_best_sec(flavors); + put_page(page); + } + +out: + return ret; +} + +static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, + struct dentry *dentry, struct path *path, + struct nfs_fh *fh, struct nfs_fattr *fattr, + rpc_authflavor_t *flavor) +{ + struct rpc_clnt *clone; + struct rpc_auth *auth; + int err; + + err = 
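/*
 * Note on nfs_negotiate_security()/nfs_find_best_sec() above: despite the
 * name, the selection is first-match in the server's own SECINFO ordering,
 * not a ranking by strength. Hypothetical replies, for illustration:
 *
 *	server returns { AUTH_UNIX, krb5 }	-> AUTH_UNIX is chosen
 *	server returns { krb5, AUTH_UNIX }	-> krb5 is chosen (if the GSS
 *						   mechanism is available)
 *	server returns { }			-> falls back to AUTH_UNIX
 */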
nfs_negotiate_security(parent, path->dentry, flavor); + if (err < 0) + goto out; + clone = rpc_clone_client(server->client); + auth = rpcauth_create(*flavor, clone); + if (!auth) { + err = -EIO; + goto out_shutdown; + } + err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, + &path->dentry->d_name, + fh, fattr); +out_shutdown: + rpc_shutdown_client(clone); +out: + return err; +} +#else /* CONFIG_NFS_V4 */ +static inline int nfs_lookup_with_sec(struct nfs_server *server, + struct dentry *parent, struct dentry *dentry, + struct path *path, struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t *flavor) +{ + return -EPERM; +} +#endif /* CONFIG_NFS_V4 */ + +/* + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint + * + * When we encounter a mountpoint on the server, we want to set up + * a mountpoint on the client too, to prevent inode numbers from + * colliding, and to allow "df" to work properly. + * On NFSv4, we also want to allow for the fact that different + * filesystems may be migrated to different servers in a failover + * situation, and that different filesystems may want to use + * different security flavours. + */ +struct vfsmount *nfs_d_automount(struct path *path) +{ + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct dentry *parent; + struct nfs_fh *fh = NULL; + struct nfs_fattr *fattr = NULL; + int err; + rpc_authflavor_t flavor = RPC_AUTH_UNIX; + + dprintk("--> nfs_d_automount()\n"); + + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; + + mnt = ERR_PTR(-ENOMEM); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fh == NULL || fattr == NULL) + goto out; + + dprintk("%s: enter\n", __func__); + + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); + err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, + &path->dentry->d_name, + fh, fattr); + if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL) + err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor); + dput(parent); + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } + + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(path->dentry); + else + mnt = nfs_do_submount(path->dentry, fh, fattr, flavor); + if (IS_ERR(mnt)) + goto out; + + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out_nofree: + dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + return mnt; +} + +const struct inode_operations nfs_mountpoint_inode_operations = { + .getattr = nfs_getattr, +}; + +const struct inode_operations nfs_referral_inode_operations = { +}; + +static void nfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &nfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +} + +void nfs_release_automount_timer(void) +{ + if (list_empty(&nfs_automount_list)) + cancel_delayed_work(&nfs_automount_task); +} + +/* + * Clone a mountpoint of the appropriate type + */ +static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, + const char *devname, + struct nfs_clone_mount *mountdata) +{ +#ifdef CONFIG_NFS_V4 + struct vfsmount *mnt = ERR_PTR(-EINVAL); 
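/*
 * Putting the pieces above together, crossing a server-side mountpoint
 * roughly follows this path (a summary of nfs_d_automount() above, not new
 * code):
 *
 *	nfs_d_automount(path)
 *	    -> look the covered dentry up again to fetch fh/fattr,
 *	       retrying via nfs_lookup_with_sec() on -EPERM when the
 *	       server supports SECINFO
 *	    -> NFS_ATTR_FATTR_V4_REFERRAL set?	nfs_do_refmount()
 *	       otherwise			nfs_do_submount()
 *	    -> arm expiry with mnt_set_expiry() and
 *	       schedule_delayed_work(&nfs_automount_task, ...)
 *
 * so unused submounts are eventually torn down by nfs_expire_automounts().
 */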
+ switch (server->nfs_client->rpc_ops->version) { + case 2: + case 3: + mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + break; + case 4: + mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); + } + return mnt; +#else + return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); +#endif +} + +/** + * nfs_do_submount - set up mountpoint when crossing a filesystem boundary + * @dentry - parent directory + * @fh - filehandle for new root dentry + * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount + * + */ +static struct vfsmount *nfs_do_submount(struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor) +{ + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .fh = fh, + .fattr = fattr, + .authflavor = authflavor, + }; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + char *page = (char *) __get_free_page(GFP_USER); + char *devname; + + dprintk("--> nfs_do_submount()\n"); + + dprintk("%s: submounting on %s/%s\n", __func__, + dentry->d_parent->d_name.name, + dentry->d_name.name); + if (page == NULL) + goto out; + devname = nfs_devname(dentry, page, PAGE_SIZE); + mnt = (struct vfsmount *)devname; + if (IS_ERR(devname)) + goto free_page; + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); +free_page: + free_page((unsigned long)page); +out: + dprintk("%s: done\n", __func__); + + dprintk("<-- nfs_do_submount() = %p\n", mnt); + return mnt; +} diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c new file mode 100644 index 00000000..792cb13a --- /dev/null +++ b/fs/nfs/nfs2xdr.c @@ -0,0 +1,1157 @@ +/* + * linux/fs/nfs/nfs2xdr.c + * + * XDR functions to encode/decode NFS RPC arguments and results. + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * Copyright (C) 1996 Olaf Kirch + * 04 Aug 1998 Ion Badulescu + * FIFO's need special handling in NFSv2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. 
*/ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS_fhandle_sz (8) +#define NFS_sattr_sz (8) +#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) +#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2)) +#define NFS_fattr_sz (17) +#define NFS_info_sz (5) +#define NFS_entry_sz (NFS_filename_sz+3) + +#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz) +#define NFS_readlinkargs_sz (NFS_fhandle_sz) +#define NFS_readargs_sz (NFS_fhandle_sz+3) +#define NFS_writeargs_sz (NFS_fhandle_sz+4) +#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) +#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) +#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) +#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz) +#define NFS_readdirargs_sz (NFS_fhandle_sz+2) + +#define NFS_attrstat_sz (1+NFS_fattr_sz) +#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) +#define NFS_readlinkres_sz (2) +#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_writeres_sz (NFS_attrstat_sz) +#define NFS_stat_sz (1) +#define NFS_readdirres_sz (1) +#define NFS_statfsres_sz (1+NFS_info_sz) + + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, count); + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. 
*/ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) +{ + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; + return p; +} + +/* + * 2.3.3. fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size != NFS2_FHSIZE); + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); +} + +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; + */ +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); + return p; +} + +/* + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. + */ +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); + return p; +} + +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; +} + +/* + * 2.3.5. 
fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * + */ +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + u32 rdev, type; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); + fattr->rdev = new_decode_dev(rdev); + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { + fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; + fattr->rdev = 0; + } + + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.6. sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; + */ + +#define NFS2_SATTR_NOT_SET (0xffffffff) + +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + return p; +} + +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); + + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(attr->ia_uid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(attr->ia_gid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) + p = xdr_encode_time(p, &attr->ia_atime); + else if (attr->ia_valid & ATTR_ATIME) + p = xdr_encode_current_server_time(p, &attr->ia_atime); + else + p = xdr_time_not_set(p); + if (attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); +} + +/* + * 2.3.7. 
filename + * + * typedef string filename; + */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.8. path + * + * typedef string path; + */ +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXPATHLEN); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) + goto out_size; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(length > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, length); + xdr_terminate_string(xdr->buf, length); + return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.9. attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; + */ +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; + */ +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); +} + +/* + * 2.3.11. 
diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; + */ +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_fhandle(xdr, fh); +} + +/* + * 2.2.3. sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} + +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); +} + +/* + * 2.2.7. readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; + */ +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); +} + +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 2.2.9. writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; + */ +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); +} + +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 2.2.10. 
createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 2.2.12. renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); +} + +/* + * 2.2.13. linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) +{ + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) +{ + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); +} + +/* + * 2.2.17. readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; + */ +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} + +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); +} + +/* + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result); +} + +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); +} + +/* + * 2.2.6. 
readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.7. readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr); +} + +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->ino = be32_to_cpup(p); + + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. + */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->cookie = be32_to_cpup(p); + + entry->d_type = DT_UNKNOWN; + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. 
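[Editor's note] nfs2_decode_dirent() is driven by getdents(2) against pages already filled by a READDIR reply: each call consumes one value-follows flag, a fileid, a name and a cookie, returning -EAGAIN at the end of the entry list and -EBADCOOKIE with entry->eof set at end-of-directory. The userspace sketch below walks the same wire layout; the buffer construction and the name-length guard are simplifications assumed for the demo.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* decode one NFSv2 entry from a big-endian word stream; returns NULL when
 * the entry list ends (the following word is the readdirok eof flag) */
static const uint32_t *decode_one(const uint32_t *p, uint32_t *eof)
{
	char name[64];
	uint32_t fileid, len, cookie;

	if (ntohl(*p++) == 0) {            /* value_follows == FALSE */
		*eof = ntohl(*p);
		return NULL;
	}
	fileid = ntohl(*p++);
	len = ntohl(*p++);                 /* filename length in bytes */
	if (len >= sizeof(name))
		return NULL;               /* sketch only: give up on long names */
	memcpy(name, p, len);
	name[len] = '\0';
	p += (len + 3) >> 2;               /* names are padded to a 4-byte word */
	cookie = ntohl(*p++);
	printf("fileid %u  name \"%s\"  cookie %u\n", fileid, name, cookie);
	return p;
}

int main(void)
{
	/* one entry ("a", fileid 2, cookie 7), then end-of-list with eof = 1 */
	const uint32_t buf[] = {
		htonl(1), htonl(2), htonl(1), htonl(0x61000000),
		htonl(7), htonl(0), htonl(1),
	};
	const uint32_t *p = buf;
	uint32_t eof = 0;

	while (p != NULL)
		p = decode_one(p, &eof);
	printf("eof = %u\n", eof);
	return 0;
}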
+ */ +static int decode_readdirok(struct xdr_stream *xdr) +{ + u32 recvd, pglen; + size_t hdrlen; + + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; +} + +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.18. statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; + */ +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. 
+ */ +int nfs_stat_to_errno(enum nfs_stat status) +{ + int i; + + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; +} + +#define PROC(proc, argtype, restype, timer) \ +[NFSPROC_##proc] = { \ + .p_proc = NFSPROC_##proc, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ + .p_arglen = NFS_##argtype##_sz, \ + .p_replen = NFS_##restype##_sz, \ + .p_timer = timer, \ + .p_statidx = NFSPROC_##proc, \ + .p_name = #proc, \ + } +struct rpc_procinfo nfs_procedures[] = { + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), +}; + +struct rpc_version nfs_version2 = { + .number = 2, + .nrprocs = ARRAY_SIZE(nfs_procedures), + .procs = nfs_procedures +}; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c new file mode 100644 index 00000000..27434277 --- /dev/null +++ b/fs/nfs/nfs3acl.c @@ -0,0 +1,444 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int pos=0, len=0; + +# define output(s) do { \ + if (pos + sizeof(s) <= size) { \ + memcpy(buffer + pos, s, sizeof(s)); \ + pos += sizeof(s); \ + } \ + len += sizeof(s); \ + } while(0) + + acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_access"); + posix_acl_release(acl); + } + + if (S_ISDIR(inode->i_mode)) { + acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_default"); + posix_acl_release(acl); + } + } + +# undef output + + if (!buffer || len <= size) + return len; + return -ERANGE; +} + +ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error = 0; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + acl = nfs3_proc_getacl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + if (type == ACL_TYPE_ACCESS && acl->a_count == 0) + error = -ENODATA; + else + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + } else + error = -ENODATA; + + return error; +} + +int nfs3_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + 
+ acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + error = nfs3_proc_setacl(inode, type, acl); + posix_acl_release(acl); + + return error; +} + +int nfs3_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int type; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + return nfs3_proc_setacl(inode, type, NULL); +} + +static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) +{ + if (!IS_ERR(nfsi->acl_access)) { + posix_acl_release(nfsi->acl_access); + nfsi->acl_access = ERR_PTR(-EAGAIN); + } + if (!IS_ERR(nfsi->acl_default)) { + posix_acl_release(nfsi->acl_default); + nfsi->acl_default = ERR_PTR(-EAGAIN); + } +} + +void nfs3_forget_cached_acls(struct inode *inode) +{ + dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, + inode->i_ino); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct posix_acl *acl = ERR_PTR(-EINVAL); + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = nfsi->acl_access; + break; + + case ACL_TYPE_DEFAULT: + acl = nfsi->acl_default; + break; + + default: + goto out; + } + if (IS_ERR(acl)) + acl = ERR_PTR(-EAGAIN); + else + acl = posix_acl_dup(acl); +out: + spin_unlock(&inode->i_lock); + dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, + inode->i_ino, type, acl); + return acl; +} + +static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, + inode->i_ino, acl, dfacl); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + if (!IS_ERR(acl)) + nfsi->acl_access = posix_acl_dup(acl); + if (!IS_ERR(dfacl)) + nfsi->acl_default = posix_acl_dup(dfacl); + spin_unlock(&inode->i_lock); +} + +struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), + /* The xdr layer may allocate pages here. */ + .pages = pages, + }; + struct nfs3_getaclres res = { + 0 + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct posix_acl *acl; + int status, count; + + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + return ERR_PTR(-EOPNOTSUPP); + + status = nfs_revalidate_inode(server, inode); + if (status < 0) + return ERR_PTR(status); + acl = nfs3_get_cached_acl(inode, type); + if (acl != ERR_PTR(-EAGAIN)) + return acl; + acl = NULL; + + /* + * Only get the access acl when explicitly requested: We don't + * need it for access decisions, and only some applications use + * it. Applications which request the access acl first are not + * penalized from this optimization. 
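[Editor's note] nfs3_get_cached_acl() and nfs3_cache_acls() above rely on a small idiom: an error-valued pointer (ERR_PTR(-EAGAIN)) stored in the per-inode slot means "nothing cached, ask the server", while a real pointer is duplicated and handed back under the inode lock. A userspace sketch of that sentinel pattern, with stand-ins for the kernel's ERR_PTR()/IS_ERR() helpers and reference counting reduced to a bare integer, is shown below.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct acl { int refcount; };

/* userspace stand-ins for the kernel's ERR_PTR()/IS_ERR() helpers */
#define ERR_PTR(err)  ((struct acl *)(long)(err))
#define IS_ERR(ptr)   ((unsigned long)(ptr) >= (unsigned long)-4095L)

static struct acl *cached_access;      /* one slot of the per-inode cache */

static void forget_cached(void)
{
	if (cached_access && !IS_ERR(cached_access))
		free(cached_access);
	cached_access = ERR_PTR(-EAGAIN); /* sentinel: must ask the server */
}

static struct acl *get_cached(void)
{
	if (IS_ERR(cached_access))
		return ERR_PTR(-EAGAIN);   /* nothing usable is cached */
	cached_access->refcount++;         /* hand out an extra reference */
	return cached_access;
}

int main(void)
{
	forget_cached();
	printf("after forget: cached? %s\n", IS_ERR(get_cached()) ? "no" : "yes");

	cached_access = calloc(1, sizeof(*cached_access)); /* reply cached */
	cached_access->refcount = 1;
	printf("after fill:   cached? %s\n", IS_ERR(get_cached()) ? "no" : "yes");

	forget_cached();
	return 0;
}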
+ */ + if (type == ACL_TYPE_ACCESS) + args.mask |= NFS_ACLCNT|NFS_ACL; + if (S_ISDIR(inode->i_mode)) + args.mask |= NFS_DFACLCNT|NFS_DFACL; + if (args.mask == 0) + return NULL; + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return ERR_PTR(-ENOMEM); + + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + + /* pages may have been allocated at the xdr layer. */ + for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) + __free_page(args.pages[count]); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL extension not supported; disabling\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + default: + goto getout; + } + if ((args.mask & res.mask) != args.mask) { + status = -EIO; + goto getout; + } + + if (res.acl_access != NULL) { + if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + posix_acl_release(res.acl_access); + res.acl_access = NULL; + } + } + nfs3_cache_acls(inode, + (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), + (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); + + switch(type) { + case ACL_TYPE_ACCESS: + acl = res.acl_access; + res.acl_access = NULL; + break; + + case ACL_TYPE_DEFAULT: + acl = res.acl_default; + res.acl_default = NULL; + } + +getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); + nfs_free_fattr(res.fattr); + + if (status != 0) { + posix_acl_release(acl); + acl = ERR_PTR(status); + } + return acl; +} + +static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, + .mask = NFS_ACL, + .acl_access = acl, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &fattr, + }; + int status; + + status = -EOPNOTSUPP; + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + goto out; + + /* We are doing this here because XDR marshalling does not + * return any results, it BUGs. 
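[Editor's note] nfs3_proc_getacl() asks only for what it needs — the access ACL counts and entries when requested, plus the default ACL for directories — and treats a reply whose mask does not cover the request as a protocol error. The sketch below reproduces just that mask arithmetic; the bit values are placeholders rather than the nfsacl.h constants.

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

/* placeholder mask bits for the demo */
#define EX_NFS_ACL      0x0001
#define EX_NFS_ACLCNT   0x0002
#define EX_NFS_DFACL    0x0004
#define EX_NFS_DFACLCNT 0x0008

static int check_getacl_reply(bool want_access, bool is_dir, unsigned res_mask)
{
	unsigned args_mask = 0;

	if (want_access)
		args_mask |= EX_NFS_ACLCNT | EX_NFS_ACL;
	if (is_dir)
		args_mask |= EX_NFS_DFACLCNT | EX_NFS_DFACL;

	/* the server must answer at least what was requested */
	if ((args_mask & res_mask) != args_mask)
		return -EIO;
	return 0;
}

int main(void)
{
	/* directory: access ACL requested, server answered only that part */
	printf("%d\n", check_getacl_reply(true, true, EX_NFS_ACLCNT | EX_NFS_ACL));
	/* same request, complete answer */
	printf("%d\n", check_getacl_reply(true, true, 0x000f));
	return 0;
}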
*/ + status = -ENOSPC; + if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (S_ISDIR(inode->i_mode)) { + args.mask |= NFS_DFACL; + args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); + } + + dprintk("NFS call setacl\n"); + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out_freepages; + + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + dprintk("NFS reply setacl: %d\n", status); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, fattr); + nfs3_cache_acls(inode, acl, dfacl); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL SETACL RPC not supported" + "(will not retry)\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + } + nfs_free_fattr(fattr); +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } +out: + return status; +} + +int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *alloc = NULL, *dfacl = NULL; + int status; + + if (S_ISDIR(inode->i_mode)) { + switch(type) { + case ACL_TYPE_ACCESS: + alloc = dfacl = nfs3_proc_getacl(inode, + ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = nfs3_proc_getacl(inode, + ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; + + default: + return -EINVAL; + } + } else if (type != ACL_TYPE_ACCESS) + return -EINVAL; + + if (acl == NULL) { + alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(alloc)) + goto fail; + } + status = nfs3_proc_setacls(inode, acl, dfacl); + posix_acl_release(alloc); + return status; + +fail: + return PTR_ERR(alloc); +} + +int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, + mode_t mode) +{ + struct posix_acl *dfacl, *acl; + int error = 0; + + dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(dfacl)) { + error = PTR_ERR(dfacl); + return (error == -EOPNOTSUPP) ? 0 : error; + } + if (!dfacl) + return 0; + acl = posix_acl_clone(dfacl, GFP_KERNEL); + error = -ENOMEM; + if (!acl) + goto out_release_dfacl; + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out_release_acl; + error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? + dfacl : NULL); +out_release_acl: + posix_acl_release(acl); +out_release_dfacl: + posix_acl_release(dfacl); + return error; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c new file mode 100644 index 00000000..771741f1 --- /dev/null +++ b/fs/nfs/nfs3proc.c @@ -0,0 +1,890 @@ +/* + * linux/fs/nfs/nfs3proc.c + * + * Client-side NFSv3 procedures stubs. 
+ * + * Copyright (C) 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iostat.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ +static int +nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EJUKEBOX && res != -EKEYEXPIRED) + break; + schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags) + +static int +nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) +{ + if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) + return 0; + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +static int +do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("%s: call fsinfo\n", __func__); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply fsinfo: %d\n", __func__, status); + if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_resp = info->fattr; + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + } + return status; +} + +/* + * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb + */ +static int +nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_get_root(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +/* + * One function for each procedure in the NFS protocol. 
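[Editor's note] nfs3_rpc_wrapper() plus the local rpc_call_sync macro above is a small trick: by redefining the name after the wrapper is written, every synchronous call later in nfs3proc.c transparently gains retry-on-"jukebox" behaviour. A userspace sketch of the same pattern, with a stub transport and a placeholder error code, follows.

#include <stdio.h>

#define EX_EJUKEBOX 10008   /* placeholder for the "try again later" error */

static int calls;

static int rpc_call_sync(const char *proc)     /* stand-in transport */
{
	calls++;
	printf("%s attempt %d\n", proc, calls);
	return calls < 3 ? -EX_EJUKEBOX : 0;   /* busy twice, then succeed */
}

static int nfs3_rpc_wrapper(const char *proc)
{
	int res;

	do {
		res = rpc_call_sync(proc);
		if (res != -EX_EJUKEBOX)
			break;
		/* the real code sleeps NFS_JUKEBOX_RETRY_TIME here and also
		 * gives up early when a fatal signal is pending */
	} while (1);
	return res;
}

/* from here on, plain rpc_call_sync() calls pick up the retry behaviour */
#define rpc_call_sync(proc) nfs3_rpc_wrapper(proc)

int main(void)
{
	printf("status %d after %d attempts\n", rpc_call_sync("GETATTR"), calls);
	return 0;
}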
+ */ +static int +nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct nfs3_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + return -ENOMEM; + + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; + struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status = -ENOMEM; + + dprintk("NFS call access\n"); + + if (mode & MAY_READ) + arg.access |= NFS3_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) + entry->mask |= MAY_READ; + if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } + nfs_free_fattr(res.fattr); +out: + dprintk("NFS reply access: %d\n", status); + return status; +} + +static int 
nfs3_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs_fattr *fattr; + struct nfs3_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, + }; + int status = -ENOMEM; + + dprintk("NFS call readlink\n"); + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + msg.rpc_resp = fattr; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + dprintk("NFS reply readlink: %d\n", status); + return status; +} + +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + +/* + * Create a regular file. + */ +static int +nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nfs_open_context *ctx) +{ + struct nfs3_createdata *data; + mode_t mode = sattr->ia_mode; + int status = -ENOMEM; + + dprintk("NFS call create %s\n", dentry->d_name.name); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; + if (flags & O_EXCL) { + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = jiffies; + data->arg.create.verifier[1] = current->pid; + } + + sattr->ia_mode &= ~current_umask(); + + for (;;) { + status = nfs3_do_create(dir, dentry, data); + + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { + case NFS3_CREATE_EXCLUSIVE: + data->arg.create.createmode = NFS3_CREATE_GUARDED; + break; + + case NFS3_CREATE_GUARDED: + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; + break; + + case NFS3_CREATE_UNCHECKED: + goto out; + } + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); + } + + if (status != 0) + goto out; + + /* When we created the file with exclusive semantics, make + * sure we set the attributes afterwards. 
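[Editor's note] When the server answers -ENOTSUPP, nfs3_proc_create() above retries with a progressively weaker createmode — exclusive, then guarded, then unchecked — and finally gives up. The downgrade ladder in isolation looks like the sketch below; the names and the pretend failure are local to the example.

#include <stdio.h>

enum ex_createmode { EX_EXCLUSIVE, EX_GUARDED, EX_UNCHECKED, EX_GIVE_UP };

static enum ex_createmode downgrade(enum ex_createmode mode)
{
	switch (mode) {
	case EX_EXCLUSIVE: return EX_GUARDED;   /* no exclusive-create support */
	case EX_GUARDED:   return EX_UNCHECKED; /* no guarded-create support   */
	default:           return EX_GIVE_UP;   /* nothing weaker left to try  */
	}
}

int main(void)
{
	enum ex_createmode mode = EX_EXCLUSIVE;

	while (mode != EX_GIVE_UP) {
		printf("trying createmode %d\n", mode);
		/* pretend every attempt fails with -ENOTSUPP */
		mode = downgrade(mode);
	}
	return 0;
}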
*/ + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { + dprintk("NFS call setattr (post-create)\n"); + + if (!(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + if (!(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; + + /* Note: we could use a guarded setattr here, but I'm + * not sure this buys us anything (and I'd have + * to revamp the NFSv3 XDR code) */ + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); + dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out; + } + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: + nfs3_free_createdata(data); + dprintk("NFS reply create: %d\n", status); + return status; +} + +static int +nfs3_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_removeargs arg = { + .fh = NFS_FH(dir), + .name.len = name->len, + .name.name = name->name, + }; + struct nfs_removeres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call remove %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply remove: %d\n", status); + return status; +} + +static void +nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; +} + +static int +nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + struct nfs_removeres *res; + if (nfs3_async_handle_jukebox(task, dir)) + return 0; + res = task->tk_msg.rpc_resp; + nfs_post_op_update_inode(dir, res->dir_attr); + return 1; +} + +static void +nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; +} + +static int +nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res; + + if (nfs3_async_handle_jukebox(task, old_dir)) + return 0; + res = task->tk_msg.rpc_resp; + + nfs_post_op_update_inode(old_dir, res->old_fattr); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + +static int +nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, + }; + struct nfs_renameres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); + + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); + nfs_post_op_update_inode(old_dir, res.old_fattr); + nfs_post_op_update_inode(new_dir, res.new_fattr); +out: + nfs_free_fattr(res.old_fattr); + nfs_free_fattr(res.new_fattr); + dprintk("NFS reply rename: %d\n", status); + return status; +} + +static int +nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs3_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen 
= name->len + }; + struct nfs3_linkres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call link %s\n", name->name); + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); + dprintk("NFS reply link: %d\n", status); + return status; +} + +static int +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) +{ + struct nfs3_createdata *data; + int status = -ENOMEM; + + if (len > NFS3_MAXPATHLEN) + return -ENAMETOOLONG; + + dprintk("NFS call symlink %s\n", dentry->d_name.name); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); +out: + dprintk("NFS reply symlink: %d\n", status); + return status; +} + +static int +nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) +{ + struct nfs3_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); + + sattr->ia_mode &= ~current_umask(); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out; + + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: + nfs3_free_createdata(data); + dprintk("NFS reply mkdir: %d\n", status); + return status; +} + +static int +nfs3_proc_rmdir(struct inode *dir, struct qstr *name) +{ + struct nfs_fattr *dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, + }; + int status = -ENOMEM; + + dprintk("NFS call rmdir %s\n", name->name); + dir_attr = nfs_alloc_fattr(); + if (dir_attr == NULL) + goto out; + + msg.rpc_resp = dir_attr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_post_op_update_inode(dir, dir_attr); + nfs_free_fattr(dir_attr); +out: + dprintk("NFS reply rmdir: %d\n", status); + return status; +} + +/* + * The READDIR implementation is somewhat hackish - we pass the user buffer + * to the encode function, which installs it in the receive iovec. + * The decode function itself doesn't perform any decoding, it just makes + * sure the reply is syntactically correct. + * + * Also note that this implementation handles both plain readdir and + * readdirplus. 
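[Editor's note] nfs3_proc_readdir(), which follows, keeps one cookie verifier per directory (NFS_COOKIEVERF(dir)): it is sent with each request, and the reply decode writes the server's verifier back into the same storage, so the next getdents() continuation echoes whatever the server last returned. A stub-server sketch of that hand-off (the state layout and verifier values are invented for the demo):

#include <stdio.h>
#include <stdint.h>

/* per-directory state the client keeps between READDIR calls */
struct dir_state {
	uint64_t cookie;
	uint32_t verf[2];
};

/* stub server: advances the cookie and stamps a verifier that the client
 * must echo back verbatim with its next request */
static void server_readdir(struct dir_state *dir)
{
	dir->verf[0] = 0xfeedface;
	dir->verf[1] = (uint32_t)dir->cookie;
	dir->cookie += 1;
}

int main(void)
{
	struct dir_state dir = { 0, { 0, 0 } };

	for (int i = 0; i < 3; i++) {
		printf("call %d: cookie %llu, verf %08x:%08x\n", i,
		       (unsigned long long)dir.cookie, dir.verf[0], dir.verf[1]);
		server_readdir(&dir);  /* reply updates cookie and verifier */
	}
	return 0;
}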
+ */ +static int +nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = dentry->d_inode; + __be32 *verf = NFS_COOKIEVERF(dir); + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, + .verf = {verf[0], verf[1]}, + .plus = plus, + .count = count, + .pages = pages + }; + struct nfs3_readdirres res = { + .verf = verf, + .plus = plus + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred + }; + int status = -ENOMEM; + + if (plus) + msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + + dprintk("NFS call readdir%s %d\n", + plus? "plus" : "", (unsigned int) cookie); + + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); + nfs_refresh_inode(dir, res.dir_attr); + + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply readdir%s: %d\n", + plus? "plus" : "", status); + return status; +} + +static int +nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) +{ + struct nfs3_createdata *data; + mode_t mode = sattr->ia_mode; + int status = -ENOMEM; + + dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, + MAJOR(rdev), MINOR(rdev)); + + sattr->ia_mode &= ~current_umask(); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; + goto out; + } + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: + nfs3_free_createdata(data); + dprintk("NFS reply mknod: %d\n", status); + return status; +} + +static int +nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *stat) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSSTAT], + .rpc_argp = fhandle, + .rpc_resp = stat, + }; + int status; + + dprintk("NFS call fsstat\n"); + nfs_fattr_init(stat->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply fsstat: %d\n", status); + return status; +} + +static int +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("NFS call fsinfo\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("NFS reply fsinfo: %d\n", status); + return status; +} + +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = 
do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +static int +nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_PATHCONF], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("NFS call pathconf\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply pathconf: %d\n", status); + return status; +} + +static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + + nfs_invalidate_atime(data->inode); + nfs_refresh_inode(data->inode, &data->fattr); + return 0; +} + +static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; +} + +static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + if (task->tk_status >= 0) + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + return 0; +} + +static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; +} + +static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + nfs_refresh_inode(data->inode, data->res.fattr); + return 0; +} + +static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; +} + +static int +nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); +} + +const struct nfs_rpc_ops nfs_v3_clientops = { + .version = 3, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, + .lookup = nfs3_proc_lookup, + .access = nfs3_proc_access, + .readlink = nfs3_proc_readlink, + .create = nfs3_proc_create, + .remove = nfs3_proc_remove, + .unlink_setup = nfs3_proc_unlink_setup, + .unlink_done = nfs3_proc_unlink_done, + .rename = nfs3_proc_rename, + .rename_setup = nfs3_proc_rename_setup, + .rename_done = nfs3_proc_rename_done, + .link = nfs3_proc_link, + .symlink = nfs3_proc_symlink, + .mkdir = nfs3_proc_mkdir, + .rmdir = nfs3_proc_rmdir, + .readdir = nfs3_proc_readdir, + .mknod = nfs3_proc_mknod, + .statfs = nfs3_proc_statfs, + .fsinfo = nfs3_proc_fsinfo, + .pathconf = nfs3_proc_pathconf, + .decode_dirent = nfs3_decode_dirent, + .read_setup = nfs3_proc_read_setup, + .read_done = nfs3_read_done, + .write_setup = nfs3_proc_write_setup, + .write_done = nfs3_write_done, + .commit_setup = nfs3_proc_commit_setup, + .commit_done = nfs3_commit_done, + .lock = nfs3_proc_lock, + .clear_acl_cache = nfs3_forget_cached_acls, + .close_context = nfs_close_context, + .init_client = nfs_init_client, +}; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c new file mode 100644 index 00000000..183c6b12 --- /dev/null +++ b/fs/nfs/nfs3xdr.c @@ -0,0 +1,2498 @@ +/* + * linux/fs/nfs/nfs3xdr.c + * + * XDR functions to encode/decode NFSv3 RPC arguments and results. 
+ * + * Copyright (C) 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS3_fhandle_sz (1+16) +#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_sattr_sz (15) +#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) +#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) +#define NFS3_fattr_sz (21) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) +#define NFS3_wcc_attr_sz (6) +#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) +#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) +#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz (NFS3_fh_sz) +#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_accessargs_sz (NFS3_fh_sz+1) +#define NFS3_readlinkargs_sz (NFS3_fh_sz) +#define NFS3_readargs_sz (NFS3_fh_sz+3) +#define NFS3_writeargs_sz (NFS3_fh_sz+5) +#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) +#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) +#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) +#define NFS3_commitargs_sz (NFS3_fh_sz+3) + +#define NFS3_getattrres_sz (1+NFS3_fattr_sz) +#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz (NFS3_setattrres_sz) +#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) +#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) +#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) +#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) +#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) +#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) +#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) + +#define ACL3_getaclargs_sz (NFS3_fh_sz+1) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) + +/* + * Map file type to S_IFMT bits + */ +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, +}; + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. 
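+ * The expected reply length (RPC reply header, credential slack, and the
+ * fixed portion of the result) is computed in 32-bit words and shifted
+ * left by two to give xdr_inline_pages() the byte offset at which the
+ * page data begins in the receive buffer.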
+ */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_uint32(struct xdr_stream *xdr, u32 value) +{ + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +static int decode_uint32(struct xdr_stream *xdr, u32 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ + return xdr_decode_hyper(p, fileid); +} + +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); +} + +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; + +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) +{ + BUG_ON(length > NFS3_MAXPATHLEN); + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_nfspath3(struct xdr_stream *xdr) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, 
count); + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count %u > recvd %u\n", count, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * cookie3 + * + * typedef uint64 cookie3 + */ +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) +{ + return xdr_encode_hyper(p, cookie); +} + +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) +{ + return decode_uint64(xdr, cookie); +} + +/* + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_WRITEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * size3 + * + * typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... 
+ * } + */ +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ + BUG_ON(type > NF3FIFO); + encode_uint32(xdr, type); +} + +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode) +{ + u32 type; + + type = be32_to_cpup(p++); + if (type > NF3FIFO) + type = NF3NON; + *mode = nfs_type2fmt[type]; + return p; +} + +/* + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ + unsigned int major, minor; + + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); + *rdev = MKDEV(major, minor); + if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) + *rdev = 0; + return p; +} + +/* + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data; + * }; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} + +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); + return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); +} + +/* + * nfstime3 + * + * struct nfstime3 { + * uint32 seconds; + * uint32 nseconds; + * }; + */ +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(timep->tv_nsec); + return p; +} + +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++); + return p; +} + +/* + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; + * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + 
* void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. + * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. + */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); + + if (attr->ia_valid & ATTR_MODE) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_UID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_uid); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_GID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_gid); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_SIZE) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64)attr->ia_size); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_ATIME_SET) { + *p++ = xdr_two; + p = xdr_encode_nfstime3(p, &attr->ia_atime); + } else if (attr->ia_valid & ATTR_ATIME) { + *p++ = xdr_one; + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_MTIME_SET) { + *p++ = xdr_two; + xdr_encode_nfstime3(p, &attr->ia_mtime); + } else if (attr->ia_valid & ATTR_MTIME) { + *p = xdr_one; + } else + *p = xdr_zero; +} + +/* + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + umode_t fmode; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + p = xdr_decode_ftype3(p, &fmode); + + fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + + p = xdr_decode_size3(p, &fattr->size); + p = xdr_decode_size3(p, &fattr->du.nfs3.used); + p = xdr_decode_specdata3(p, &fattr->rdev); + + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; + + p = xdr_decode_fileid3(p, &fattr->fileid); + p = xdr_decode_nfstime3(p, &fattr->atime); + p = xdr_decode_nfstime3(p, &fattr->mtime); + xdr_decode_nfstime3(p, &fattr->ctime); + + fattr->valid |= NFS_ATTR_FATTR_V3; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; 
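+ *
+ * These are the object's size and timestamps as they were immediately
+ * before the operation; the client compares them against its cached
+ * attributes to decide whether cached data is still valid (weak cache
+ * consistency).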
+ */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; + + p = xdr_decode_size3(p, &fattr->pre_size); + p = xdr_decode_nfstime3(p, &fattr->pre_mtime); + xdr_decode_nfstime3(p, &fattr->pre_ctime); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; +} + +/* + * post_op_fh3 + * + * union post_op_fh3 switch (bool handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); +} + + +/* + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". 
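+ *
+ * Each encoder below is wired into the nfs3_procedures[] table at the
+ * end of this file through the PROC() macro, which casts it to a
+ * kxdreproc_t.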
+ */ + +/* + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; + */ +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_nfs_fh3(xdr, fh); +} + +/* + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; + */ +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; + + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_nfstime3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; + } +} + +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); +} + +/* + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; + */ +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); +} + +/* + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * nfs_fh3 object; + * uint32 access; + * }; + */ +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} + +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_access3args(xdr, args); +} + +/* + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; + */ +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); +} + +/* + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_read3args(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS3_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; + */ +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->count); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + 
encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; + */ +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); + } +} + +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); +} + +/* + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; + */ +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); +} + +/* + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + * diropargs3 where; + * symlinkdata3 symlink; + * }; + */ +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} + +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); +} + +/* + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; + */ +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); + } +} + +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); +} + +/* + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; + */ +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; 
+ * }; + */ +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); +} + +/* + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; + */ +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) +{ + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; + */ +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdir3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); +} + +/* + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + + /* + * readdirplus: need dircount + buffer size. 
+ * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); + + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdirplus3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); +} + +/* + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_commit3args(xdr, args); +} + +#ifdef CONFIG_NFS_V3_ACL + +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); +} + +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) +{ + unsigned int base; + int error; + + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); + + base = req->rq_slen; + if (args->npages != 0) + xdr_write_pages(xdr, args->pages, 0, args->len); + else + xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + + error = nfsacl_encode(xdr->buf, base, args->inode, + (args->mask & NFS_ACL) ? + args->acl_access : NULL, 1, 0); + BUG_ON(error < 0); + error = nfsacl_encode(xdr->buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? + args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); +} + +#endif /* CONFIG_NFS_V3_ACL */ + +/* + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". 
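+ *
+ * Each decoder first reads the nfsstat3 discriminator and then decodes
+ * the matching union arm; the failure arms still consume any post_op_attr
+ * or wcc_data present, so cached attributes are refreshed before the
+ * status is mapped to a local errno via nfs_stat_to_errno().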
+ */ + +/* + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; + */ +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) { + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_accessres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(xdr, &result->access); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; + */ 
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; + */ +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_readres *result) +{ + u32 eof, count, ocount, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + +out: + xdr_read_pages(xdr, count); + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u != ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; + */ +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE); + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) 
+ goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; + */ +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + int error; + + error = decode_post_op_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. + * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. */ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} + +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_removeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_renameres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch 
(nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs3_linkres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return error; + + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return error; + + entry->d_type = DT_UNKNOWN; + + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return error; + if (entry->fattr->valid & NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return error; + } + } else + zero_nfs_fh3(entry->fh); + } + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +out_truncated: + dprintk("NFS: directory entry contains invalid file handle\n"); + *entry = old; + return -EAGAIN; +} + +/* + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory 
contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. + */ +static int decode_dirlist3(struct xdr_stream *xdr) +{ + u32 recvd, pglen; + size_t hdrlen; + + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; +} + +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? */ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} + +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; + */ +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); + /* ignore invarsec */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; + */ +static int 
decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_nfstime3(p, &result->time_delta); + + /* ignore properties */ + result->lease_time = 0; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; + */ +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); + /* ignore remaining fields */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * COMMIT3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(xdr, result->verf->verifier); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +#ifdef CONFIG_NFS_V3_ACL + +static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + struct posix_acl **acl; + unsigned int *aclcnt; + size_t hdrlen; + int error; + + error = 
decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if (unlikely(error <= 0)) + return error; + error = 0; +out: + return error; +} + +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +#endif /* CONFIG_NFS_V3_ACL */ + +#define PROC(proc, argtype, restype, timer) \ +[NFS3PROC_##proc] = { \ + .p_proc = NFS3PROC_##proc, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_arglen = NFS3_##argtype##args_sz, \ + .p_replen = NFS3_##restype##res_sz, \ + .p_timer = timer, \ + .p_statidx = NFS3PROC_##proc, \ + .p_name = #proc, \ + } + +struct rpc_procinfo nfs3_procedures[] = { + PROC(GETATTR, getattr, getattr, 1), + PROC(SETATTR, setattr, setattr, 0), + PROC(LOOKUP, lookup, lookup, 2), + PROC(ACCESS, access, access, 1), + PROC(READLINK, readlink, readlink, 3), + PROC(READ, read, read, 3), + PROC(WRITE, write, write, 4), + PROC(CREATE, create, create, 0), + PROC(MKDIR, mkdir, create, 0), + PROC(SYMLINK, symlink, create, 0), + PROC(MKNOD, mknod, create, 0), + PROC(REMOVE, remove, remove, 0), + PROC(RMDIR, lookup, setattr, 0), + PROC(RENAME, rename, rename, 0), + PROC(LINK, link, link, 0), + PROC(READDIR, readdir, readdir, 3), + PROC(READDIRPLUS, readdirplus, readdir, 3), + PROC(FSSTAT, getattr, fsstat, 0), + PROC(FSINFO, getattr, fsinfo, 0), + PROC(PATHCONF, getattr, pathconf, 0), + PROC(COMMIT, commit, commit, 5), +}; + +struct rpc_version nfs_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nfs3_procedures), + .procs = nfs3_procedures +}; + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_procinfo nfs3_acl_procedures[] = { + [ACLPROC3_GETACL] = { + .p_proc = ACLPROC3_GETACL, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, + .p_arglen = ACL3_getaclargs_sz, + .p_replen = ACL3_getaclres_sz, + .p_timer = 1, + .p_name = "GETACL", + }, + [ACLPROC3_SETACL] = { + .p_proc = ACLPROC3_SETACL, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, + .p_arglen = 
ACL3_setaclargs_sz, + .p_replen = ACL3_setaclres_sz, + .p_timer = 0, + .p_name = "SETACL", + }, +}; + +struct rpc_version nfsacl_version3 = { + .number = 3, + .nrprocs = sizeof(nfs3_acl_procedures)/ + sizeof(nfs3_acl_procedures[0]), + .procs = nfs3_acl_procedures, +}; +#endif /* CONFIG_NFS_V3_ACL */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h new file mode 100644 index 00000000..e1c1365b --- /dev/null +++ b/fs/nfs/nfs4_fs.h @@ -0,0 +1,383 @@ +/* + * linux/fs/nfs/nfs4_fs.h + * + * Copyright (C) 2005 Trond Myklebust + * + * NFSv4-specific filesystem definitions and declarations + */ + +#ifndef __LINUX_FS_NFS_NFS4_FS_H +#define __LINUX_FS_NFS_NFS4_FS_H + +#ifdef CONFIG_NFS_V4 + +struct idmap; + +/* + * In a seqid-mutating op, this macro controls which error return + * values trigger incrementation of the seqid. + * + * from rfc 3010: + * The client MUST monotonically increment the sequence number for the + * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE + * operations. This is true even in the event that the previous + * operation that used the sequence number received an error. The only + * exception to this rule is if the previous operation received one of + * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, + * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, + * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. + * + */ +#define seqid_mutating_err(err) \ +(((err) != NFSERR_STALE_CLIENTID) && \ + ((err) != NFSERR_STALE_STATEID) && \ + ((err) != NFSERR_BAD_STATEID) && \ + ((err) != NFSERR_BAD_SEQID) && \ + ((err) != NFSERR_BAD_XDR) && \ + ((err) != NFSERR_RESOURCE) && \ + ((err) != NFSERR_NOFILEHANDLE)) + +enum nfs4_client_state { + NFS4CLNT_MANAGER_RUNNING = 0, + NFS4CLNT_CHECK_LEASE, + NFS4CLNT_LEASE_EXPIRED, + NFS4CLNT_RECLAIM_REBOOT, + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_LAYOUTRECALL, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_RECALL_SLOT, + NFS4CLNT_LEASE_CONFIRM, +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +struct nfs4_minor_version_ops { + u32 minor_version; + + int (*call_sync)(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + int (*validate_stateid)(struct nfs_delegation *, + const nfs4_stateid *); + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops *state_renewal_ops; +}; + +/* + * struct rpc_sequence ensures that RPC calls are sent in the exact + * order that they appear on the list. + */ +struct rpc_sequence { + struct rpc_wait_queue wait; /* RPC call delay queue */ + spinlock_t lock; /* Protects the list */ + struct list_head list; /* Defines sequence of RPC calls */ +}; + +#define NFS_SEQID_CONFIRMED 1 +struct nfs_seqid_counter { + struct rpc_sequence *sequence; + int flags; + u32 counter; +}; + +struct nfs_seqid { + struct nfs_seqid_counter *sequence; + struct list_head list; +}; + +static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) +{ + if (seqid_mutating_err(-status)) + seqid->flags |= NFS_SEQID_CONFIRMED; +} + +struct nfs_unique_id { + struct rb_node rb_node; + __u64 id; +}; + +/* + * NFS4 state_owners and lock_owners are simply labels for ordered + * sequences of RPC calls. Their sole purpose is to provide once-only + * semantics by allowing the server to identify replayed requests. 
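+ * To that end each owner embeds its own nfs_seqid_counter and
+ * rpc_sequence, so seqid-mutating operations issued on its behalf are
+ * strictly ordered.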
+ */ +struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; + struct nfs_server *so_server; + struct rb_node so_server_node; + + struct rpc_cred *so_cred; /* Associated cred */ + + spinlock_t so_lock; + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; +}; + +enum { + NFS_OWNER_RECLAIM_REBOOT, + NFS_OWNER_RECLAIM_NOGRACE +}; + +#define NFS_LOCK_NEW 0 +#define NFS_LOCK_RECLAIM 1 +#define NFS_LOCK_EXPIRED 2 + +/* + * struct nfs4_state maintains the client-side state for a given + * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). + * + * OPEN: + * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server, + * we need to know how many files are open for reading or writing on a + * given inode. This information too is stored here. + * + * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) + */ + +struct nfs4_lock_owner { + unsigned int lo_type; +#define NFS4_ANY_LOCK_TYPE (0U) +#define NFS4_FLOCK_LOCK_TYPE (1U << 0) +#define NFS4_POSIX_LOCK_TYPE (1U << 1) + union { + fl_owner_t posix_owner; + pid_t flock_owner; + } lo_u; +}; + +struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +#define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; + struct rpc_sequence ls_sequence; + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; + struct nfs4_lock_owner ls_owner; +}; + +/* bits for nfs4_state->flags */ +enum { + LK_STATE_IN_USE, + NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ + NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ + NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ + NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ + NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ +}; + +struct nfs4_state { + struct list_head open_states; /* List of states for the same state_owner */ + struct list_head inode_states; /* List of states for the same inode */ + struct list_head lock_states; /* List of subservient lock stateids */ + + struct nfs4_state_owner *owner; /* Pointer to the open owner */ + struct inode *inode; /* Pointer to the inode */ + + unsigned long flags; /* Do we hold any locks? 
*/ + spinlock_t state_lock; /* Protects the lock_states list */ + + seqlock_t seqlock; /* Protects the stateid/open_stateid */ + nfs4_stateid stateid; /* Current stateid: may be delegation */ + nfs4_stateid open_stateid; /* OPEN stateid */ + + /* The following 3 fields are protected by owner->so_lock */ + unsigned int n_rdonly; /* Number of read-only references */ + unsigned int n_wronly; /* Number of write-only references */ + unsigned int n_rdwr; /* Number of read/write references */ + fmode_t state; /* State on the server (R,W, or RW) */ + atomic_t count; +}; + + +struct nfs4_exception { + long timeout; + int retry; + struct nfs4_state *state; + struct inode *inode; +}; + +struct nfs4_state_recovery_ops { + int owner_flag_bit; + int state_flag_bit; + int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); + int (*recover_lock)(struct nfs4_state *, struct file_lock *); + int (*establish_clid)(struct nfs_client *, struct rpc_cred *); + struct rpc_cred * (*get_clid_cred)(struct nfs_client *); + int (*reclaim_complete)(struct nfs_client *); +}; + +struct nfs4_state_maintenance_ops { + int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); + struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); + int (*renew_lease)(struct nfs_client *, struct rpc_cred *); +}; + +extern const struct dentry_operations nfs4_dentry_operations; +extern const struct inode_operations nfs4_dir_inode_operations; + +/* nfs4proc.c */ +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); +extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); +extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); +extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); +extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; + +#if defined(CONFIG_NFS_V4_1) +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern int nfs4_proc_create_session(struct nfs_client *); +extern int nfs4_proc_destroy_session(struct nfs4_session *); +extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + bool 
sync); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} +#else /* CONFIG_NFS_v4_1 */ +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) +{ + return 0; +} + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + +extern const u32 nfs4_fattr_bitmap[2]; +extern const u32 nfs4_statfs_bitmap[2]; +extern const u32 nfs4_pathconf_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fs_locations_bitmap[2]; + +/* nfs4renewd.c */ +extern void nfs4_schedule_state_renewal(struct nfs_client *); +extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); +extern void nfs4_kill_renewd(struct nfs_client *); +extern void nfs4_renew_state(struct work_struct *); + +/* nfs4state.c */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +#if defined(CONFIG_NFS_V4_1) +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); +struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +extern void nfs4_schedule_session_recovery(struct nfs4_session *); +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); +extern void nfs4_put_open_state(struct nfs4_state *); +extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); +extern void nfs4_schedule_state_manager(struct nfs_client *); +extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); +extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + +extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); +extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void 
nfs_release_seqid(struct nfs_seqid *seqid); +extern void nfs_free_seqid(struct nfs_seqid *seqid); + +extern const nfs4_stateid zero_stateid; + +/* nfs4xdr.c */ +extern struct rpc_procinfo nfs4_procedures[]; + +struct nfs4_mount_data; + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +#else + +#define nfs4_close_state(a, b, c) do { } while (0) +#define nfs4_close_sync(a, b, c) do { } while (0) + +#endif /* CONFIG_NFS_V4 */ +#endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c new file mode 100644 index 00000000..75af8121 --- /dev/null +++ b/fs/nfs/nfs4filelayout.c @@ -0,0 +1,914 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. 
+ */ + +#include +#include + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand "); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) +{ + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 tmp; + + offset -= flseg->pattern_offset; + tmp = offset; + do_div(tmp, stripe_width); + + return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); + } + + BUG(); +} + +/* For data server errors we don't recover from */ +static void +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + int *reset) +{ + if (task->tk_status >= 0) + return 0; + + *reset = 0; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + default: + dprintk("%s DS error. Retry through MDS %d\n", __func__, + task->tk_status); + *reset = 1; + break; + } + task->tk_status = 0; + return -EAGAIN; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_read_data *data) +{ + struct nfs_client *clp = data->ds_clp; + int reset = 0; + + dprintk("%s DS read\n", __func__); + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_read(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. 
+ */ +static void +filelayout_set_layoutcommit(struct nfs_write_data *wdata) +{ + if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + wdata->res.verf->committed == NFS_FILE_SYNC) + return; + + pnfs_set_layoutcommit(wdata); + dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, + (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->read_done_cb = filelayout_read_done_cb; + + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, &rdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + /* Note this may cause RPC to be resent */ + rdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->mds_ops->rpc_release(data); +} + +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + struct nfs_client *clp; + + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_write(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } else + clp = data->ds_clp; + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + filelayout_set_layoutcommit(data); + return 0; +} + +/* Fake up some data that will cause nfs_commit_release to retry the writes. 
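+ * The verifier copied from the first request is then deliberately bumped in
+ * its first byte, so the verifier comparison in the commit completion path
+ * can never succeed and every page on the list is queued for resending.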
*/ +static void prepare_to_resend_writes(struct nfs_write_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(data->verf.verifier, first->wb_verf.verifier, + sizeof(first->wb_verf.verifier)); + data->verf.verifier[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + prepare_to_resend_writes(data); + filelayout_set_lo_fail(data->lseg); + } else + nfs_restart_rpc(task, data->ds_clp); + return -EAGAIN; + } + + return 0; +} + +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + wdata->mds_ops->rpc_release(data); +} + +static void filelayout_commit_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + nfs_commit_release_pages(wdata); + if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) + nfs_commit_clear_lock(NFS_I(wdata->inode)); + nfs_commitdata_release(wdata); +} + +struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_release = filelayout_read_release, +}; + +struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_write_release, +}; + +struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_commit_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_read_data *data) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, data->inode->i_ino, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + /* Either layout fh index faulty, or ds connect failed */ + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s USE DS:ip %x %hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* No multipath support. 
Use first DS */ + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, + &filelayout_read_call_ops); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_write_data *data, int sync) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + data->inode->i_ino, sync, (size_t) data->args.count, offset, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + data->write_done_cb = filelayout_write_done_cb; + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + /* + * Get the file offset on the dserver. Set the write offset to + * this offset and save the original offset. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + + /* Perform an asynchronous write */ + status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, + &filelayout_write_call_ops, sync); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. + * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. 
Use MDS i/o\n", + __func__); + goto out; + } + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld too large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Invalid stripe unit (%u)\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, + NFS_SERVER(lo->plh_inode)->nfs_client, id); + if (d == NULL) { + dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); + if (dsaddr == NULL) + goto out; + } else + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index < 0 || + fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %d\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { + dprintk("%s Stripe unit (%u) not aligned with rsize %u " + "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, + nfss->wsize); + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + nfs4_fl_put_deviceid(dsaddr); + goto out; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + int i; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 
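+ * A zero num_fh simply means every data-server I/O reuses the MDS open
+ * file handle (see nfs4_fl_select_ds_fh() and select_ds_fh_from_commit()).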
+ * Futher checking is done in filelayout_check_layout */ + if (fl->num_fh < 0 || fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) + goto out_err; + + if (fl->num_fh > 0) { + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; + fl->fh_array[i]->size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + printk(KERN_ERR "Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + goto out_err_free; + } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + __free_page(scratch); + return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + nfs4_fl_put_deviceid(fl->dsaddr); + kfree(fl->commit_buckets); + _filelayout_free_lseg(fl); +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), gfp_flags); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { + _filelayout_free_lseg(fl); + return NULL; + } + + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) { + int i; + int size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags); + if (!fl->commit_buckets) { + filelayout_free_lseg(&fl->generic_hdr); + return NULL; + } + fl->number_of_buckets = size; + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&fl->commit_buckets[i]); + } + return &fl->generic_hdr; +} + +/* + * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() + * + * return true : coalesce page + * return false : don't coalesce page + */ +bool +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + u64 p_stripe, r_stripe; + u32 stripe_unit; + + if (!pnfs_generic_pg_test(pgio, prev, req) || + !nfs_generic_pg_test(pgio, prev, req)) + return false; + + if (!pgio->pg_lseg) + return 1; + p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; + r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + return (p_stripe == r_stripe); +} + +static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) +{ + return !FILELAYOUT_LSEG(lseg)->commit_through_mds; +} + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +{ + struct pnfs_layout_segment *lseg = req->wb_commit_lseg; + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + struct list_head *list; + + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, + (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + i = select_bucket_index(fl, j); + list = &fl->commit_buckets[i]; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg */ + get_lseg(lseg); + } + return list; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + u32 idx; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + prepare_to_resend_writes(data); + data->mds_ops->rpc_release(data); + return -EAGAIN; + } + dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); + data->write_done_cb = filelayout_commit_done_cb; + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, + &filelayout_commit_call_ops, how); +} + +/* + * This is only useful while we are using whole file layouts. 
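+ * With whole-file layouts there is at most one IOMODE_RW segment per inode
+ * (the same assumption noted in filelayout_alloc_lseg()), so returning the
+ * first RW segment found on plh_segs is sufficient.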
+ */ +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = get_lseg(lseg); + spin_unlock(&inode->i_lock); + return rv; +} + +static int alloc_ds_commits(struct inode *inode, struct list_head *list) +{ + struct pnfs_layout_segment *lseg; + struct nfs4_filelayout_segment *fl; + struct nfs_write_data *data; + int i, j; + + /* Won't need this when non-whole file layout segments are supported + * instead we will use a pnfs_layout_hdr structure */ + lseg = find_only_write_lseg(inode); + if (!lseg) + return 0; + fl = FILELAYOUT_LSEG(lseg); + for (i = 0; i < fl->number_of_buckets; i++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->ds_commit_index = i; + data->lseg = lseg; + list_add(&data->pages, list); + } + put_lseg(lseg); + return 0; + +out_bad: + for (j = i; j < fl->number_of_buckets; j++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + nfs_retry_commit(&fl->commit_buckets[i], lseg); + put_lseg(lseg); /* associated with emptying bucket */ + } + put_lseg(lseg); + /* Caller will clean up entries put on list */ + return -ENOMEM; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how) +{ + struct nfs_write_data *data, *tmp; + LIST_HEAD(list); + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->lseg = NULL; + list_add(&data->pages, &list); + } + + if (alloc_ds_commits(inode, &list)) + goto out_bad; + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + atomic_inc(&NFS_I(inode)->commits_outstanding); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL); + nfs_initiate_commit(data, NFS_CLIENT(inode), + data->mds_ops, how); + } else { + nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); + filelayout_initiate_commit(data, how); + } + } + return 0; + out_bad: + list_for_each_entry_safe(data, tmp, &list, pages) { + nfs_retry_commit(&data->pages, data->lseg); + list_del_init(&data->pages); + nfs_commit_free(data); + } + nfs_retry_commit(mds_pages, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + +static void +filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .pg_test = filelayout_pg_test, + .mark_pnfs_commit = filelayout_mark_pnfs_commit, + .choose_commit_list = filelayout_choose_commit_list, + .commit_pagelist = filelayout_commit_pagelist, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, + .free_deviceid_node = filelayout_free_deveiceid_node, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + 
pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h new file mode 100644 index 00000000..cebe01e3 --- /dev/null +++ b/fs/nfs/nfs4filelayout.h @@ -0,0 +1,105 @@ +/* + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "pnfs.h" + +/* + * Field testing shows we need to support up to 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. 
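+ * As a concrete example, a maximal 4096-entry stripe_indices array then
+ * occupies 4KB of kernel memory rather than the 16KB needed to keep the
+ * on-the-wire u32 representation.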
+ */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + u32 ds_ip_addr; + u32 ds_port; + struct nfs_client *ds_clp; + atomic_t ds_count; +}; + +/* nfs4_file_layout_dsaddr flags */ +#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 + +struct nfs4_file_layout_dsaddr { + struct nfs4_deviceid_node id_node; + unsigned long flags; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; + struct list_head *commit_buckets; /* Sort commits to ds */ + int number_of_buckets; +}; + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + +extern void print_ds(struct nfs4_pnfs_ds *ds); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c new file mode 100644 index 00000000..3b7bf137 --- /dev/null +++ b/fs/nfs/nfs4filelayoutdev.c @@ -0,0 +1,636 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. 
+ */ + +#include +#include + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk("%s NULL device\n", __func__); + return; + } + printk(" ip_addr %x port %hu\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +/* nfs4_ds_cache_lock is held */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *ds; + + dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", + ntohl(ip_addr), ntohs(port)); + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + if (ds->ds_ip_addr == ip_addr && + ds->ds_port == port) { + return ds; + } + } + return NULL; +} + +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only support IPv4 + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_client *clp; + struct sockaddr_in sin; + int status = 0; + + dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ds->ds_ip_addr; + sin.sin_port = ds->ds_port; + + clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, + sizeof(sin), IPPROTO_TCP); + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) { + if (!is_ds_client(clp)) { + status = -ENODEV; + goto out_put; + } + ds->ds_clp = clp; + dprintk("%s [existing] ip=%x, port=%hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + goto out; + } + + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to + * be equal to the MDS lease. Renewal is scheduled in create_session. 
+ */ + spin_lock(&mds_srv->nfs_client->cl_lock); + clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; + spin_unlock(&mds_srv->nfs_client->cl_lock); + clp->cl_last_renewal = jiffies; + + /* New nfs_client */ + status = nfs4_init_ds_session(clp); + if (status) + goto out_put; + + ds->ds_clp = clp; + dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), + ntohs(ds->ds_port)); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + kfree(ds); +} + +void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + nfs4_print_deviceid(&dsaddr->id_node.deviceid); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds; + + ds = kzalloc(sizeof(*tmp_ds), gfp_flags); + if (!ds) + goto out; + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(ip_addr, port); + if (tmp_ds == NULL) { + ds->ds_ip_addr = ip_addr; + ds->ds_port = port; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server ip 0x%x\n", __func__, + ds->ds_ip_addr); + } else { + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_ip_addr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} + +/* + * Currently only support ipv4, and one multi-path address. 
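+ * The r_addr string is the usual universal-address form h1.h2.h3.h4.p1.p2;
+ * for example (illustrative value only) "10.1.2.3.8.1" decodes to the IPv4
+ * address 10.1.2.3 with port (8 << 8) | 1 = 2049.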
+ */ +static struct nfs4_pnfs_ds * +decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *ds = NULL; + char *buf; + const char *ipend, *pstr; + u32 ip_addr, port; + int nlen, rlen, i; + int tmp[2]; + __be32 *p; + + /* r_netid */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + nlen = be32_to_cpup(p++); + + p = xdr_inline_decode(streamp, nlen); + if (unlikely(!p)) + goto out_err; + + /* Check that netid is "tcp" */ + if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { + dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + goto out_err; + } + + /* r_addr */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(streamp, rlen); + if (unlikely(!p)) + goto out_err; + + /* ipv6 length plus port is legal */ + if (rlen > INET6_ADDRSTRLEN + 8) { + dprintk("%s: Invalid address, length %d\n", __func__, + rlen); + goto out_err; + } + buf = kmalloc(rlen + 1, gfp_flags); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_err; + } + buf[rlen] = '\0'; + memcpy(buf, p, rlen); + + /* replace the port dots with dashes for the in4_pton() delimiter*/ + for (i = 0; i < 2; i++) { + char *res = strrchr(buf, '.'); + if (!res) { + dprintk("%s: Failed finding expected dots in port\n", + __func__); + goto out_free; + } + *res = '-'; + } + + /* Currently only support ipv4 address */ + if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { + dprintk("%s: Only ipv4 addresses supported\n", __func__); + goto out_free; + } + + /* port */ + pstr = ipend; + sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); + dprintk("%s: Decoded address and port %s\n", __func__, buf); +out_free: + kfree(buf); +out_err: + return ds; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +{ + int i; + u32 cnt, num; + u8 *indexp; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* Get the stripe count (number of stripe index) */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "%s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; + } + + /* Check the multipath list count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = be32_to_cpup(p); + dprintk("%s ds_num %u\n", 
__func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "%s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err_free_stripe_indices; + } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + gfp_flags); + if (!dsaddr) + goto out_err_free_stripe_indices; + + dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; + dsaddr->ds_num = num; + nfs4_init_deviceid_node(&dsaddr->id_node, + NFS_SERVER(ino)->pnfs_curr_ld, + NFS_SERVER(ino)->nfs_client, + &pdev->dev_id); + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + + mp_count = be32_to_cpup(p); /* multipath count */ + if (mp_count > 1) { + printk(KERN_WARNING + "%s: Multipath count %d not supported, " + "skipping all greater than 1\n", __func__, + mp_count); + } + for (j = 0; j < mp_count; j++) { + if (j == 0) { + dsaddr->ds_list[i] = decode_and_add_ds(&stream, + ino, gfp_flags); + if (dsaddr->ds_list[i] == NULL) + goto out_err_free_deviceid; + } else { + u32 len; + /* skip extra multipath */ + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + } + } + } + + __free_page(scratch); + return dsaddr; + +out_err_free_deviceid: + nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; +out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. + */ +static struct nfs4_file_layout_dsaddr * +decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *n, *new; + + new = decode_device(inode, dev, gfp_flags); + if (!new) { + printk(KERN_WARNING "%s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = nfs4_insert_deviceid_node(&new->id_node); + n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + if (n != new) { + nfs4_fl_free_deviceid(new); + return n; + } + + return new; +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. 
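+ * Callers release the returned dsaddr with nfs4_fl_put_deviceid(); in this
+ * driver that happens on filelayout_check_layout()'s error path or when the
+ * layout segment is freed in filelayout_free_lseg().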
+ */ +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free; + } + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = PAGE_SIZE * max_pages; + pdev->mincount = 0; + + rc = nfs4_proc_getdeviceinfo(server, pdev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + dsaddr = decode_and_add_device(inode, pdev, gfp_flags); +out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + nfs4_put_deviceid_node(&dsaddr->id_node); +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +static void +filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, + int err, u32 ds_addr) +{ + u32 *p = (u32 *)&dsaddr->id_node.deviceid; + + printk(KERN_ERR "NFS: data server %x connection error %d." 
+ " Deviceid [%x%x%x%x] marked out of use.\n", + ds_addr, err, p[0], p[1], p[2], p[3]); + + spin_lock(&nfs4_ds_cache_lock); + dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; + spin_unlock(&nfs4_ds_cache_lock); +} + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + + if (ds == NULL) { + printk(KERN_ERR "%s: No data server for offset index %d\n", + __func__, ds_idx); + return NULL; + } + + if (!ds->ds_clp) { + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int err; + + if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) { + /* Already tried to connect, don't try again */ + dprintk("%s Deviceid marked out of use\n", __func__); + return NULL; + } + err = nfs4_ds_connect(s, ds); + if (err) { + filelayout_mark_devid_negative(dsaddr, err, + ntohl(ds->ds_ip_addr)); + return NULL; + } + } + return ds; +} diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c new file mode 100644 index 00000000..bb80c49b --- /dev/null +++ b/fs/nfs/nfs4namespace.c @@ -0,0 +1,265 @@ +/* + * linux/fs/nfs/nfs4namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * - Modified by David Howells + * + * NFSv4 namespace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "nfs4_fs.h" +#include "dns_resolve.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +/* + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer + */ +static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, + char *buffer, ssize_t buflen) +{ + char *end = buffer + buflen; + int n; + + *--end = '\0'; + buflen--; + + n = pathname->ncomponents; + while (--n >= 0) { + const struct nfs4_string *component = &pathname->components[n]; + buflen -= component->len + 1; + if (buflen < 0) + goto Elong; + end -= component->len; + memcpy(end, component->data, component->len); + *--end = '/'; + } + return end; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +/* + * Determine the mount path as a string + */ +static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *limit; + char *path = nfs_path(&limit, dentry, buffer, buflen); + if (!IS_ERR(path)) { + char *colon = strchr(path, ':'); + if (colon && colon < limit) + path = colon + 1; + } + return path; +} + +/* + * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we + * believe to be the server path to this dentry + */ +static int nfs4_validate_fspath(struct dentry *dentry, + const struct nfs4_fs_locations *locations, + char *page, char *page2) +{ + const char *path, *fs_path; + + path = nfs4_path(dentry, page, PAGE_SIZE); + if (IS_ERR(path)) + return PTR_ERR(path); + + fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); + if (IS_ERR(fs_path)) + return PTR_ERR(fs_path); + + if (strncmp(path, fs_path, strlen(fs_path)) != 0) { + dprintk("%s: path %s does not begin with fsroot %s\n", + __func__, path, fs_path); + return -ENOENT; + } + + return 0; +} + +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + + ret = rpc_pton(string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + +static struct vfsmount *try_location(struct nfs_clone_mount 
*mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return ERR_CAST(mnt_path); + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + + mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); + if (mountdata->addr == NULL) + return ERR_PTR(-ENOMEM); + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + mountdata->addr, addr_bufsize); + if (mountdata->addrlen == 0) + continue; + + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; + } + kfree(mountdata->addr); + return mnt; +} + +/** + * nfs_follow_referral - set up mountpoint when hitting a referral on moved error + * @dentry - parent directory + * @locations - array of NFSv4 server location information + * + */ +static struct vfsmount *nfs_follow_referral(struct dentry *dentry, + const struct nfs4_fs_locations *locations) +{ + struct vfsmount *mnt = ERR_PTR(-ENOENT); + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor, + }; + char *page = NULL, *page2 = NULL; + int loc, error; + + if (locations == NULL || locations->nlocations <= 0) + goto out; + + dprintk("%s: referral at %s/%s\n", __func__, + dentry->d_parent->d_name.name, dentry->d_name.name); + + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + /* Ensure fs path is a prefix of current dentry path */ + error = nfs4_validate_fspath(dentry, locations, page, page2); + if (error < 0) { + mnt = ERR_PTR(error); + goto out; + } + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; + } + +out: + free_page((unsigned long) page); + free_page((unsigned long) page2); + dprintk("%s: done\n", __func__); + return mnt; +} + +/* + * nfs_do_refmount - handle crossing a referral on server + * @dentry - dentry of referral + * + */ +struct vfsmount *nfs_do_refmount(struct dentry *dentry) +{ + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct dentry *parent; + struct nfs4_fs_locations *fs_locations = NULL; + struct page *page; + int err; + + /* BUG_ON(IS_ROOT(dentry)); */ + dprintk("%s: enter\n", __func__); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + + fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (fs_locations == NULL) + goto out_free; + + /* Get locations */ + mnt = ERR_PTR(-ENOENT); + + parent = dget_parent(dentry); + dprintk("%s: getting locations for %s/%s\n", + __func__, 
parent->d_name.name, dentry->d_name.name); + + err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page); + dput(parent); + if (err != 0 || + fs_locations->nlocations <= 0 || + fs_locations->fs_path.ncomponents <= 0) + goto out_free; + + mnt = nfs_follow_referral(dentry, fs_locations); +out_free: + __free_page(page); + kfree(fs_locations); +out: + dprintk("%s: done\n", __func__); + return mnt; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c new file mode 100644 index 00000000..b7a7e5fe --- /dev/null +++ b/fs/nfs/nfs4proc.c @@ -0,0 +1,6114 @@ +/* + * fs/nfs/nfs4proc.c + * + * Client-side procedure declarations for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "callback.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +#define NFS4_POLL_RETRY_MIN (HZ/10) +#define NFS4_POLL_RETRY_MAX (15*HZ) + +#define NFS4_MAX_LOOP_ON_RECOVER (10) + +struct nfs4_opendata; +static int _nfs4_proc_open(struct nfs4_opendata *data); +static int _nfs4_recover_proc_open(struct nfs4_opendata *data); +static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state); + +/* Prevent leaks of NFSv4 errors into userland */ +static int nfs4_map_errors(int err) +{ + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + return -EPERM; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; + case -NFS4ERR_SHARE_DENIED: + return -EACCES; + default: + dprintk("%s could not handle NFSv4 error %d\n", + __func__, -err); + break; + } + return -EIO; +} + +/* + * This is our standard bitmap for GETATTR requests. + */ +const u32 nfs4_fattr_bitmap[2] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY +}; + +const u32 nfs4_statfs_bitmap[2] = { + FATTR4_WORD0_FILES_AVAIL + | FATTR4_WORD0_FILES_FREE + | FATTR4_WORD0_FILES_TOTAL, + FATTR4_WORD1_SPACE_AVAIL + | FATTR4_WORD1_SPACE_FREE + | FATTR4_WORD1_SPACE_TOTAL +}; + +const u32 nfs4_pathconf_bitmap[2] = { + FATTR4_WORD0_MAXLINK + | FATTR4_WORD0_MAXNAME, + 0 +}; + +const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, + FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES +}; + +const u32 nfs4_fs_locations_bitmap[2] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID + | FATTR4_WORD0_FS_LOCATIONS, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID +}; + +static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, + struct nfs4_readdir_arg *readdir) +{ + __be32 *start, *p; + + BUG_ON(readdir->count < 80); + if (cookie > 2) { + readdir->cookie = cookie; + memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); + return; + } + + readdir->cookie = 0; + memset(&readdir->verifier, 0, sizeof(readdir->verifier)); + if (cookie == 2) + return; + + 
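+ /* Size sketch: each fabricated entry below is 10 XDR words (next, 2-word cookie, name length, name, bitmap length, bitmap word, attr buffer length, 2-word fileid), i.e. 40 bytes; pgbase therefore ends up at 40 bytes when only ".." is emitted, or 80 bytes when "." is included as well. */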
/* + * NFSv4 servers do not return entries for '.' and '..' + * Therefore, we fake these entries here. We let '.' + * have cookie 0 and '..' have cookie 1. Note that + * when talking to the server, we always send cookie 0 + * instead of 1 or 2. + */ + start = p = kmap_atomic(*readdir->pages, KM_USER0); + + if (cookie == 0) { + *p++ = xdr_one; /* next */ + *p++ = xdr_zero; /* cookie, first word */ + *p++ = xdr_one; /* cookie, second word */ + *p++ = xdr_one; /* entry len */ + memcpy(p, ".\0\0\0", 4); /* entry */ + p++; + *p++ = xdr_one; /* bitmap length */ + *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ + *p++ = htonl(8); /* attribute buffer length */ + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); + } + + *p++ = xdr_one; /* next */ + *p++ = xdr_zero; /* cookie, first word */ + *p++ = xdr_two; /* cookie, second word */ + *p++ = xdr_two; /* entry len */ + memcpy(p, "..\0\0", 4); /* entry */ + p++; + *p++ = xdr_one; /* bitmap length */ + *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ + *p++ = htonl(8); /* attribute buffer length */ + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); + + readdir->pgbase = (char *)p - (char *)start; + readdir->count -= readdir->pgbase; + kunmap_atomic(start, KM_USER0); +} + +static int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + return res; +} + +static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +{ + int res = 0; + + might_sleep(); + + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + schedule_timeout_killable(*timeout); + if (fatal_signal_pending(current)) + res = -ERESTARTSYS; + *timeout <<= 1; + return res; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. 
+ */ +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = exception->state; + struct inode *inode = exception->inode; + int ret = errorcode; + + exception->retry = 0; + switch(errorcode) { + case 0: + return 0; + case -NFS4ERR_OPENMODE: + if (nfs_have_delegation(inode, FMODE_READ)) { + nfs_inode_return_delegation(inode); + exception->retry = 1; + return 0; + } + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) + nfs4_schedule_stateid_recovery(server, state); + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR: %d Reset session\n", __func__, + errorcode); + nfs4_schedule_session_recovery(clp->cl_session); + exception->retry = 1; + break; +#endif /* defined(CONFIG_NFS_V4_1) */ + case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + ret = nfs4_delay(server->client, &exception->timeout); + if (ret != 0) + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } + } + /* We failed to handle the error */ + return nfs4_map_errors(ret); +wait_on_recovery: + ret = nfs4_wait_clnt_recover(clp); + if (ret == 0) + exception->retry = 1; + return ret; +} + + +static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) +{ + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal,timestamp)) + clp->cl_last_renewal = timestamp; + spin_unlock(&clp->cl_lock); +} + +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +{ + do_renew_lease(server->nfs_client, timestamp); +} + +#if defined(CONFIG_NFS_V4_1) + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to -1. 
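+ * For example (illustrative values): if used_slots is 0b1011 (slots 0, 1 and 3 in use) and slot 3 is freed, find_last_bit() locates bit 1, so highest_used_slotid drops from 3 to 1.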
+ * + * Must be called while holding tbl->slot_tbl_lock + */ +static void +nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) +{ + int free_slotid = free_slot - tbl->slots; + int slotid = free_slotid; + + BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + slotid = find_last_bit(tbl->used_slots, tbl->max_slots); + if (slotid < tbl->max_slots) + tbl->highest_used_slotid = slotid; + else + tbl->highest_used_slotid = -1; + } + dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, + free_slotid, tbl->highest_used_slotid); +} + +/* + * Signal state manager thread if session fore channel is drained + */ +static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) +{ + struct rpc_task *task; + + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + return; + } + + if (ses->fc_slot_table.highest_used_slotid != -1) + return; + + dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); + complete(&ses->fc_slot_table.complete); +} + +/* + * Signal state manager thread if session back channel is drained + */ +void nfs4_check_drain_bc_complete(struct nfs4_session *ses) +{ + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || + ses->bc_slot_table.highest_used_slotid != -1) + return; + dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); + complete(&ses->bc_slot_table.complete); +} + +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + struct nfs4_slot_table *tbl; + + tbl = &res->sr_session->fc_slot_table; + if (!res->sr_slot) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ + dprintk("%s: No slot\n", __func__); + return; + } + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slot); + nfs4_check_drain_fc_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slot = NULL; +} + +static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + unsigned long timestamp; + struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. The server + * may or may not have processed the sequence operation.. + * Proceed as if the server received and processed the sequence + * operation. + */ + if (res->sr_status == 1) + res->sr_status = NFS_OK; + + /* don't increment the sequence number if the task wasn't sent */ + if (!RPC_WAS_SENT(task)) + goto out; + + /* Check the SEQUENCE operation status */ + switch (res->sr_status) { + case 0: + /* Update the slot's sequence and clientid lease timer */ + ++res->sr_slot->seq_nr; + timestamp = res->sr_renewal_time; + clp = res->sr_session->clp; + do_renew_lease(clp, timestamp); + /* Check sequence flags */ + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); + break; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%td seq=%d: Operation in progress\n", + __func__, + res->sr_slot - res->sr_session->fc_slot_table.slots, + res->sr_slot->seq_nr); + goto out_retry; + default: + /* Just update the slot sequence no. */ + ++res->sr_slot->seq_nr; + } +out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); + nfs41_sequence_free_slot(res); + return 1; +out_retry: + if (!rpc_restart_call(task)) + goto out; + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return 0; +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_session == NULL) + return 1; + return nfs41_sequence_done(task, res); +} + +/* + * nfs4_find_slot - efficiently look for a free slot + * + * nfs4_find_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise. + * + * Note: must be called with under the slot_tbl_lock. + */ +static u8 +nfs4_find_slot(struct nfs4_slot_table *tbl) +{ + int slotid; + u8 ret_id = NFS4_MAX_SLOT_TABLE; + BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE); + + dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slots); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); + if (slotid >= tbl->max_slots) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid) + tbl->highest_used_slotid = slotid; + ret_id = slotid; +out: + dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); + return ret_id; +} + +int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) +{ + struct nfs4_slot *slot; + struct nfs4_slot_table *tbl; + u8 slotid; + + dprintk("--> %s\n", __func__); + /* slot already allocated? */ + if (res->sr_slot != NULL) + return 0; + + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. + * Schedule the reset thread + */ + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s Schedule Session Reset\n", __func__); + return -EAGAIN; + } + + if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s enforce FIFO order\n", __func__); + return -EAGAIN; + } + + slotid = nfs4_find_slot(tbl); + if (slotid == NFS4_MAX_SLOT_TABLE) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("<-- %s: no free slots\n", __func__); + return -EAGAIN; + } + spin_unlock(&tbl->slot_tbl_lock); + + rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); + slot = tbl->slots + slotid; + args->sa_session = session; + args->sa_slotid = slotid; + args->sa_cache_this = cache_reply; + + dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + + res->sr_session = session; + res->sr_slot = slot; + res->sr_renewal_time = jiffies; + res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+ */ + res->sr_status = 1; + return 0; +} +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); + +int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) +{ + struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; + goto out; + } + + dprintk("--> %s clp %p session %p sr_slot %td\n", + __func__, session->clp, session, res->sr_slot ? + res->sr_slot - session->fc_slot_table.slots : -1); + + ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +struct nfs41_call_sync_data { + const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +}; + +static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs41_call_sync_data *data = calldata; + + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + if (nfs4_setup_sequence(data->seq_server, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +} + +static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs41_call_sync_prepare(task, calldata); +} + +static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs41_call_sync_data *data = calldata; + + nfs41_sequence_done(task, data->seq_res); +} + +struct rpc_call_ops nfs41_call_sync_ops = { + .rpc_call_prepare = nfs41_call_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + +struct rpc_call_ops nfs41_call_priv_sync_ops = { + .rpc_call_prepare = nfs41_call_priv_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + int privileged) +{ + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { + .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data + }; + + res->sr_slot = NULL; + if (privileged) + task_setup.callback_ops = &nfs41_call_priv_sync_ops; + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else { + ret = task->tk_status; + rpc_put_task(task); + } + return ret; +} + +int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); +} + +#else +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return 1; +} +#endif /* CONFIG_NFS_V4_1 */ + +int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + args->sa_session = res->sr_session = NULL; + return rpc_call_sync(clnt, msg, 0); +} + +static inline +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args 
*args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, + args, res, cache_reply); +} + +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + spin_lock(&dir->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; + if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + nfs_force_lookup_revalidate(dir); + nfsi->change_attr = cinfo->after; + spin_unlock(&dir->i_lock); +} + +struct nfs4_opendata { + struct kref kref; + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; + struct nfs_fattr f_attr; + struct nfs_fattr dir_attr; + struct path path; + struct dentry *dir; + struct nfs4_state_owner *owner; + struct nfs4_state *state; + struct iattr attrs; + unsigned long timestamp; + unsigned int rpc_done : 1; + int rpc_status; + int cancelled; +}; + + +static void nfs4_init_opendata_res(struct nfs4_opendata *p) +{ + p->o_res.f_attr = &p->f_attr; + p->o_res.dir_attr = &p->dir_attr; + p->o_res.seqid = p->o_arg.seqid; + p->c_res.seqid = p->c_arg.seqid; + p->o_res.server = p->o_arg.server; + nfs_fattr_init(&p->f_attr); + nfs_fattr_init(&p->dir_attr); +} + +static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, + const struct iattr *attrs, + gfp_t gfp_mask) +{ + struct dentry *parent = dget_parent(path->dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + + p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + goto err; + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); + if (p->o_arg.seqid == NULL) + goto err_free; + path_get(path); + p->path = *path; + p->dir = parent; + p->owner = sp; + atomic_inc(&sp->so_count); + p->o_arg.fh = NFS_FH(dir); + p->o_arg.open_flags = flags; + p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.clientid = server->nfs_client->cl_clientid; + p->o_arg.id = sp->so_owner_id.id; + p->o_arg.name = &p->path.dentry->d_name; + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + if (flags & O_CREAT) { + u32 *s; + + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; + p->c_arg.seqid = p->o_arg.seqid; + nfs4_init_opendata_res(p); + kref_init(&p->kref); + return p; +err_free: + kfree(p); +err: + dput(parent); + return NULL; +} + +static void nfs4_opendata_free(struct kref *kref) +{ + struct nfs4_opendata *p = container_of(kref, + struct nfs4_opendata, kref); + + nfs_free_seqid(p->o_arg.seqid); + if (p->state != NULL) + nfs4_put_open_state(p->state); + nfs4_put_state_owner(p->owner); + dput(p->dir); + path_put(&p->path); + kfree(p); +} + +static void nfs4_opendata_put(struct nfs4_opendata *p) +{ + if (p != NULL) + kref_put(&p->kref, nfs4_opendata_free); +} + +static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) +{ + int ret; + + ret = rpc_wait_for_completion_task(task); + return ret; +} + +static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +{ + int ret = 0; + + if (open_mode & O_EXCL) + goto out; + switch (mode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + ret |= 
test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 + && state->n_rdonly != 0; + break; + case FMODE_WRITE: + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 + && state->n_wronly != 0; + break; + case FMODE_READ|FMODE_WRITE: + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 + && state->n_rdwr != 0; + } +out: + return ret; +} + +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) +{ + if ((delegation->type & fmode) != fmode) + return 0; + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) + return 0; + nfs_mark_delegation_referenced(delegation); + return 1; +} + +static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) +{ + switch (fmode) { + case FMODE_WRITE: + state->n_wronly++; + break; + case FMODE_READ: + state->n_rdonly++; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr++; + } + nfs4_state_set_mode_locked(state, state->state | fmode); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); + memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); + } +} + +static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_set_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); +} + +static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { + memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) + nfs_set_open_stateid_locked(state, open_stateid, fmode); + write_sequnlock(&state->seqlock); + spin_lock(&state->owner->so_lock); + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); +} + +static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *deleg_cur; + int ret = 0; + + fmode &= (FMODE_READ|FMODE_WRITE); + + rcu_read_lock(); + deleg_cur = rcu_dereference(nfsi->delegation); + if (deleg_cur == NULL) + goto no_delegation; + + spin_lock(&deleg_cur->lock); + if (nfsi->delegation != deleg_cur || + (deleg_cur->type & fmode) != fmode) + goto no_delegation_unlock; + + if (delegation == NULL) + delegation = &deleg_cur->stateid; + else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + ret = 1; +no_delegation_unlock: + spin_unlock(&deleg_cur->lock); +no_delegation: + rcu_read_unlock(); + + if (!ret && open_stateid != NULL) { + __update_open_stateid(state, open_stateid, NULL, fmode); + ret = 1; + } + + return ret; +} + + +static void nfs4_return_incompatible_delegation(struct inode 
*inode, fmode_t fmode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL || (delegation->type & fmode) == fmode) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + nfs_inode_return_delegation(inode); +} + +static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) +{ + struct nfs4_state *state = opendata->state; + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *delegation; + int open_mode = opendata->o_arg.open_flags & O_EXCL; + fmode_t fmode = opendata->o_arg.fmode; + nfs4_stateid stateid; + int ret = -EAGAIN; + + for (;;) { + if (can_open_cached(state, fmode, open_mode)) { + spin_lock(&state->owner->so_lock); + if (can_open_cached(state, fmode, open_mode)) { + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); + goto out_return_state; + } + spin_unlock(&state->owner->so_lock); + } + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || + !can_open_delegated(delegation, fmode)) { + rcu_read_unlock(); + break; + } + /* Save the delegation */ + memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + ret = -EAGAIN; + + /* Try to update the stateid using the delegation */ + if (update_open_stateid(state, NULL, &stateid, fmode)) + goto out_return_state; + } +out: + return ERR_PTR(ret); +out_return_state: + atomic_inc(&state->count); + return state; +} + +static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode; + struct nfs4_state *state = NULL; + struct nfs_delegation *delegation; + int ret; + + if (!data->rpc_done) { + state = nfs4_try_open_cached(data); + goto out; + } + + ret = -EAGAIN; + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + goto err; + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + ret = PTR_ERR(inode); + if (IS_ERR(inode)) + goto err; + ret = -ENOMEM; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto err_put_inode; + if (data->o_res.delegation_type != 0) { + int delegation_flags = 0; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); + if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + nfs_inode_set_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + } + + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + iput(inode); +out: + return state; +err_put_inode: + iput(inode); +err: + return ERR_PTR(ret); +} + +static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_open_context *ctx; + + spin_lock(&state->inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + get_nfs_open_context(ctx); + spin_unlock(&state->inode->i_lock); + return ctx; + } + spin_unlock(&state->inode->i_lock); + return ERR_PTR(-ENOENT); +} + +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs4_opendata *opendata; + + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); + if (opendata == NULL) + return ERR_PTR(-ENOMEM); + opendata->state =
state; + atomic_inc(&state->count); + return opendata; +} + +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) +{ + struct nfs4_state *newstate; + int ret; + + opendata->o_arg.open_flags = 0; + opendata->o_arg.fmode = fmode; + memset(&opendata->o_res, 0, sizeof(opendata->o_res)); + memset(&opendata->c_res, 0, sizeof(opendata->c_res)); + nfs4_init_opendata_res(opendata); + ret = _nfs4_recover_proc_open(opendata); + if (ret != 0) + return ret; + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (IS_ERR(newstate)) + return PTR_ERR(newstate); + nfs4_close_state(&opendata->path, newstate, fmode); + *res = newstate; + return 0; +} + +static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) +{ + struct nfs4_state *newstate; + int ret; + + /* memory barrier prior to reading state->n_* */ + clear_bit(NFS_DELEGATED_STATE, &state->flags); + smp_rmb(); + if (state->n_rdwr != 0) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + if (state->n_wronly != 0) { + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + if (state->n_rdonly != 0) { + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + /* + * We may have performed cached opens for all three recoveries. + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && + memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); + write_sequnlock(&state->seqlock); + } + return 0; +} + +/* + * OPEN_RECLAIM: + * reclaim state on the server after a reboot. 
+ */ +static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_delegation *delegation; + struct nfs4_opendata *opendata; + fmode_t delegation_type = 0; + int status; + + opendata = nfs4_open_recoverdata_alloc(ctx, state); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; + opendata->o_arg.fh = NFS_FH(state->inode); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) + delegation_type = delegation->type; + rcu_read_unlock(); + opendata->o_arg.u.delegation_type = delegation_type; + status = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return status; +} + +static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_do_open_reclaim(ctx, state); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_reclaim(ctx, state); + put_nfs_open_context(ctx); + return ret; +} + +static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs4_opendata *opendata; + int ret; + + opendata = nfs4_open_recoverdata_alloc(ctx, state); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; + memcpy(opendata->o_arg.u.delegation.data, stateid->data, + sizeof(opendata->o_arg.u.delegation.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +} + +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs4_exception exception = { }; + struct nfs_server *server = NFS_SERVER(state->inode); + int err; + do { + err = _nfs4_open_delegation_recall(ctx, state, stateid); + switch (err) { + case 0: + case -ENOENT: + case -ESTALE: + goto out; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(server->nfs_client->cl_session); + goto out; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; + case -ERESTARTSYS: + /* + * The show must go on: exit, but mark the + * stateid as needing recovery. + */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. 
+ */ + case -ENOMEM: + err = 0; + goto out; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); +out: + return err; +} + +static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + if (data->rpc_status == 0) { + memcpy(data->o_res.stateid.data, data->c_res.stateid.data, + sizeof(data->o_res.stateid.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; + } +} + +static void nfs4_open_confirm_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (!data->rpc_done) + goto out_free; + state = nfs4_opendata_to_nfs4_state(data); + if (!IS_ERR(state)) + nfs4_close_state(&data->path, state, data->o_arg.fmode); +out_free: + nfs4_opendata_put(data); +} + +static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_done = nfs4_open_confirm_done, + .rpc_release = nfs4_open_confirm_release, +}; + +/* + * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata + */ +static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) +{ + struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], + .rpc_argp = &data->c_arg, + .rpc_resp = &data->c_res, + .rpc_cred = data->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_open_confirm_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status; + + kref_get(&data->kref); + data->rpc_done = 0; + data->rpc_status = 0; + data->timestamp = jiffies; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + return status; +} + +static void nfs4_open_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state_owner *sp = data->owner; + + if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) + return; + /* + * Check if we still need to send an OPEN call, or if we can use + * a delegation instead. + */ + if (data->state != NULL) { + struct nfs_delegation *delegation; + + if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + goto out_no_action; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + if (delegation != NULL && + test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { + rcu_read_unlock(); + goto out_no_action; + } + rcu_read_unlock(); + } + /* Update sequence id. 
*/ + data->o_arg.id = sp->so_owner_id.id; + data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->o_arg.server, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; + rpc_call_start(task); + return; +out_no_action: + task->tk_action = NULL; + +} + +static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_open_prepare(task, calldata); +} + +static void nfs4_open_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + return; + + if (task->tk_status == 0) { + switch (data->o_res.f_attr->mode & S_IFMT) { + case S_IFREG: + break; + case S_IFLNK: + data->rpc_status = -ELOOP; + break; + case S_IFDIR: + data->rpc_status = -EISDIR; + break; + default: + data->rpc_status = -ENOTDIR; + } + renew_lease(data->o_res.server, data->timestamp); + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) + nfs_confirm_seqid(&data->owner->so_seqid, 0); + } + data->rpc_done = 1; +} + +static void nfs4_open_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) + goto out_free; + /* In case we need an open_confirm, no cleanup! */ + if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) + goto out_free; + state = nfs4_opendata_to_nfs4_state(data); + if (!IS_ERR(state)) + nfs4_close_state(&data->path, state, data->o_arg.fmode); +out_free: + nfs4_opendata_put(data); +} + +static const struct rpc_call_ops nfs4_open_ops = { + .rpc_call_prepare = nfs4_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static const struct rpc_call_ops nfs4_recover_open_ops = { + .rpc_call_prepare = nfs4_recover_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN], + .rpc_argp = o_arg, + .rpc_resp = o_res, + .rpc_cred = data->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_open_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status; + + kref_get(&data->kref); + data->rpc_done = 0; + data->rpc_status = 0; + data->cancelled = 0; + if (isrecover) + task_setup_data.callback_ops = &nfs4_recover_open_ops; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + + return status; +} + +static int _nfs4_recover_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = 
data->dir->d_inode; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 1); + if (status != 0 || !data->rpc_done) + return status; + + nfs_refresh_inode(dir, o_res->dir_attr); + + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + + return status; +} + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 0); + if (status != 0 || !data->rpc_done) + return status; + + if (o_arg->open_flags & O_CREAT) { + update_changeattr(dir, &o_res->cinfo); + nfs_post_op_update_inode(dir, o_res->dir_attr); + } else + nfs_refresh_inode(dir, o_res->dir_attr); + if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) + server->caps &= ~NFS_CAP_POSIX_LOCK; + if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); + return 0; +} + +static int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret = -EIO; + } + return ret; +} + +static int nfs4_recover_expired_lease(struct nfs_server *server) +{ + return nfs4_client_recover_expired_lease(server->nfs_client); +} + +/* + * OPEN_EXPIRED: + * reclaim state on the server after a network partition. + * Assumes caller holds the appropriate lock + */ +static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs4_opendata *opendata; + int ret; + + opendata = nfs4_open_recoverdata_alloc(ctx, state); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + ret = nfs4_open_recover(opendata, state); + if (ret == -ESTALE) + d_drop(ctx->path.dentry); + nfs4_opendata_put(opendata); + return ret; +} + +static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_open_expired(ctx, state); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } + } while (exception.retry); +out: + return err; +} + +static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_expired(ctx, state); + put_nfs_open_context(ctx); + return ret; +} + +/* + * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* + * fields corresponding to attributes that were used to store the verifier. 
+ * Make sure we clobber those fields in the later setattr call + */ +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +{ + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + !(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + !(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; +} + +/* + * Returns a referenced nfs4_state + */ +static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +{ + struct nfs4_state_owner *sp; + struct nfs4_state *state = NULL; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *opendata; + int status; + + /* Protect against reboot recovery conflicts */ + status = -ENOMEM; + if (!(sp = nfs4_get_state_owner(server, cred))) { + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } + status = nfs4_recover_expired_lease(server); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); + status = -ENOMEM; + opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); + if (opendata == NULL) + goto err_put_state_owner; + + if (path->dentry->d_inode != NULL) + opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); + + status = _nfs4_proc_open(opendata); + if (status != 0) + goto err_opendata_put; + + state = nfs4_opendata_to_nfs4_state(opendata); + status = PTR_ERR(state); + if (IS_ERR(state)) + goto err_opendata_put; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + if (opendata->o_arg.open_flags & O_EXCL) { + nfs4_exclusive_attrset(opendata, sattr); + + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state); + if (status == 0) + nfs_setattr_update_inode(state->inode, sattr); + nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + } + nfs_revalidate_inode(server, state->inode); + nfs4_opendata_put(opendata); + nfs4_put_state_owner(sp); + *res = state; + return 0; +err_opendata_put: + nfs4_opendata_put(opendata); +err_put_state_owner: + nfs4_put_state_owner(sp); +out_err: + *res = NULL; + return status; +} + + +static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) +{ + struct nfs4_exception exception = { }; + struct nfs4_state *res; + int status; + + do { + status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res); + if (status == 0) + break; + /* NOTE: BAD_SEQID means the server and client disagree about the + * book-keeping w.r.t. state-changing operations + * (OPEN/CLOSE/LOCK/LOCKU...) + * It is actually a sign of a bug on the client or on the server. + * + * If we receive a BAD_SEQID error in the particular case of + * doing an OPEN, we assume that nfs_increment_open_seqid() will + * have unhashed the old state_owner for us, and that we can + * therefore safely retry using a new one. We should still warn + * the user though... 
+ */ + if (status == -NFS4ERR_BAD_SEQID) { + printk(KERN_WARNING "NFS: v4 server %s " + " returned a bad sequence-id error!\n", + NFS_SERVER(dir)->nfs_client->cl_hostname); + exception.retry = 1; + continue; + } + /* + * BAD_STATEID on OPEN means that the server cancelled our + * state before it received the OPEN_CONFIRM. + * Recover by retrying the request as per the discussion + * on Page 181 of RFC3530. + */ + if (status == -NFS4ERR_BAD_STATEID) { + exception.retry = 1; + continue; + } + if (status == -EAGAIN) { + /* We must have found a delegation */ + exception.retry = 1; + continue; + } + res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + status, &exception)); + } while (exception.retry); + return res; +} + +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long timestamp = jiffies; + int status; + + nfs_fattr_init(fattr); + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { + nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (status == 0 && state != NULL) + renew_lease(server, timestamp); + return status; +} + +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { + .state = state, + .inode = inode, + }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_do_setattr(inode, cred, fattr, sattr, state), + &exception); + } while (exception.retry); + return err; +} + +struct nfs4_closedata { + struct path path; + struct inode *inode; + struct nfs4_state *state; + struct nfs_closeargs arg; + struct nfs_closeres res; + struct nfs_fattr fattr; + unsigned long timestamp; + bool roc; + u32 roc_barrier; +}; + +static void nfs4_free_closedata(void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state_owner *sp = calldata->state->owner; + + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); + nfs4_put_open_state(calldata->state); + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_state_owner(sp); + path_put(&calldata->path); + kfree(calldata); +} + +static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, + fmode_t fmode) +{ + spin_lock(&state->owner->so_lock); + if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + if (!(fmode & FMODE_WRITE)) + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_unlock(&state->owner->so_lock); +} + +static void nfs4_close_done(struct rpc_task *task, void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; + /* hmm. 
we are done with the inode, and in the process of freeing + * the state_owner. we keep this around to process errors + */ + switch (task->tk_status) { + case 0: + if (calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); + nfs_set_open_stateid(state, &calldata->res.stateid, 0); + renew_lease(server, calldata->timestamp); + nfs4_close_clear_stateid_flags(state, + calldata->arg.fmode); + break; + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + if (calldata->arg.fmode == 0) + break; + default: + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); + nfs_refresh_inode(calldata->inode, calldata->res.fattr); +} + +static void nfs4_close_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state *state = calldata->state; + int call_close = 0; + + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) + return; + + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + calldata->arg.fmode = FMODE_READ|FMODE_WRITE; + spin_lock(&state->owner->so_lock); + /* Calculate the change in open mode */ + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) { + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_READ; + } + if (state->n_wronly == 0) { + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_WRITE; + } + } + spin_unlock(&state->owner->so_lock); + + if (!call_close) { + /* Note: exit _without_ calling nfs4_close_done */ + task->tk_action = NULL; + return; + } + + if (calldata->arg.fmode == 0) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { + rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, + task, NULL); + return; + } + } + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; + rpc_call_start(task); +} + +static const struct rpc_call_ops nfs4_close_ops = { + .rpc_call_prepare = nfs4_close_prepare, + .rpc_call_done = nfs4_close_done, + .rpc_release = nfs4_free_closedata, +}; + +/* + * It is possible for data to be read/written from a mem-mapped file + * after the sys_close call (which hits the vfs layer as a flush). + * This means that we can't safely call nfsv4 close on a file until + * the inode is cleared. This in turn means that we are not good + * NFSv4 citizens - we do not indicate to the server to update the file's + * share state even when we are done with one of the three share + * stateid's in the inode. + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_closedata *calldata; + struct nfs4_state_owner *sp = state->owner; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], + .rpc_cred = state->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_close_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + calldata = kzalloc(sizeof(*calldata), gfp_mask); + if (calldata == NULL) + goto out; + calldata->inode = state->inode; + calldata->state = state; + calldata->arg.fh = NFS_FH(state->inode); + calldata->arg.stateid = &state->open_stateid; + /* Serialization for the sequence id */ + calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (calldata->arg.seqid == NULL) + goto out_free_calldata; + calldata->arg.fmode = 0; + calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; + calldata->res.server = server; + calldata->roc = roc; + path_get(path); + calldata->path = *path; + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = 0; + if (wait) + status = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return status; +out_free_calldata: + kfree(calldata); +out: + if (roc) + pnfs_roc_release(state->inode); + nfs4_put_open_state(state); + nfs4_put_state_owner(sp); + return status; +} + +static struct inode * +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) +{ + struct nfs4_state *state; + + /* Protect against concurrent sillydeletes */ + state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); + if (IS_ERR(state)) + return ERR_CAST(state); + ctx->state = state; + return igrab(state->inode); +} + +static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +} + +static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +{ + struct nfs4_server_caps_arg args = { + .fhandle = fhandle, + }; + struct nfs4_server_caps_res res = {}; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (status == 0) { + memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME); + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) + server->caps |= NFS_CAP_ACLS; + if (res.has_links != 0) + server->caps |= NFS_CAP_HARDLINKS; + if (res.has_symlinks != 0) + server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if 
(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; + + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->acl_bitmask = res.acl_bitmask; + } + + return status; +} + +int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_server_capabilities(server, fhandle), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs4_lookup_root_arg args = { + .bitmask = nfs4_fattr_bitmap, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = info->fattr, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP_ROOT], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(info->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_lookup_root(server, fhandle, info); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + break; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); + return err; +} + +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(flavor, server->client); + if (!auth) { + ret = -EIO; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); +out: + return ret; +} + +static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int i, len, status = 0; + rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; + + len = gss_mech_list_pseudoflavors(&flav_array[0]); + flav_array[len] = RPC_AUTH_NULL; + len += 1; + + for (i = 0; i < len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + /* + * -EACCES could mean that the user doesn't have correct permissions + * to access the mount. It could also mean that we tried to mount + * with a gss auth flavor, but rpc.gssd isn't running. Either way, + * existing mount programs don't handle -EACCES very well so it should + * be mapped to -EPERM instead.
+ */ + if (status == -EACCES) + status = -EPERM; + return status; +} + +/* + * get the file handle for the "/" directory on the server + */ +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status = nfs4_lookup_root(server, fhandle, info); + if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) + /* + * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM + * by nfs4_map_errors() as this function exits. + */ + status = nfs4_find_root_sec(server, fhandle, info); + if (status == 0) + status = nfs4_server_capabilities(server, fhandle); + if (status == 0) + status = nfs4_do_fsinfo(server, fhandle, info); + return nfs4_map_errors(status); +} + +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); +/* + * Get locations and (maybe) other attributes of a referral. + * Note that we'll actually follow the referral later when + * we detect fsid mismatch in inode revalidation + */ +static int nfs4_get_referral(struct inode *dir, const struct qstr *name, + struct nfs_fattr *fattr, struct nfs_fh *fhandle) +{ + int status = -ENOMEM; + struct page *page = NULL; + struct nfs4_fs_locations *locations = NULL; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (locations == NULL) + goto out; + + status = nfs4_proc_fs_locations(dir, name, locations, page); + if (status != 0) + goto out; + /* Make sure server returned a different fsid for the referral */ + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); + status = -EIO; + goto out; + } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + + /* replace the lookup nfs_fattr with the locations nfs_fattr */ + memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + memset(fhandle, 0, sizeof(struct nfs_fh)); +out: + if (page) + __free_page(page); + kfree(locations); + return status; +} + +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs4_getattr_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = fattr, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getattr(server, fhandle, fattr), + &exception); + } while (exception.retry); + return err; +} + +/* + * The file is not closed if it is opened due to a request to change + * the size of the file. The open call will not be needed once the + * VFS layer lookup-intents are implemented. + * + * Close is called when the inode is destroyed. + * If we haven't opened the file for O_WRONLY, we + * need to in the size_change case to obtain a stateid. + * + * Got race? + * Because OPEN is always done by name in nfsv4, it is + * possible that we opened a different file by the same + * name.
We can recognize this race condition, but we + * can't do anything about it besides returning an error. + * + * This will be fixed with VFS changes (lookup-intent). + */ +static int +nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct rpc_cred *cred = NULL; + struct nfs4_state *state = NULL; + int status; + + if (pnfs_ld_layoutret_on_setattr(inode)) + pnfs_return_layout(inode); + + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; + + ctx = nfs_file_open_context(sattr->ia_file); + if (ctx) { + cred = ctx->cred; + state = ctx->state; + } + } + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + return status; +} + +static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, + const struct nfs_fh *dirfh, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + int status; + struct nfs4_lookup_arg args = { + .bitmask = server->attr_bitmask, + .dir_fh = dirfh, + .name = name, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = fattr, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fattr); + + dprintk("NFS call lookupfh %s\n", name->name); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply lookupfh: %d\n", status); + return status; +} + +static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); + /* FIXME: !!!! 
*/ + if (err == -NFS4ERR_MOVED) { + err = -EREMOTE; + break; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + int status; + + dprintk("NFS call lookup %s\n", name->name); + status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); + if (status == -NFS4ERR_MOVED) + status = nfs4_get_referral(dir, name, fattr, fhandle); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(struct nfs_fh)); + fattr->fsid.major = 1; + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), + &exception); + if (err == -EPERM) + nfs_fixup_secinfo_attributes(fattr, fhandle); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + .bitmask = server->attr_bitmask, + }; + struct nfs4_accessres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status; + + /* + * Determine which access bits we want to ask for... + */ + if (mode & MAY_READ) + args.access |= NFS4_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (!status) { + entry->mask = 0; + if (res.access & NFS4_ACCESS_READ) + entry->mask |= MAY_READ; + if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + nfs_refresh_inode(inode, res.fattr); + } + nfs_free_fattr(res.fattr); + return status; +} + +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_access(inode, entry), + &exception); + } while (exception.retry); + return err; +} + +/* + * TODO: For the time being, we don't try to get any attributes + * along with any of the zero-copy operations READ, READDIR, + * READLINK, WRITE. 
+ * + * In the case of the first three, we want to put the GETATTR + * after the read-type operation -- this is because it is hard + * to predict the length of a GETATTR response in v4, and thus + * align the READ data correctly. This means that the GETATTR + * may end up partially falling into the page cache, and we should + * shift it into the 'tail' of the xdr_buf before processing. + * To do this efficiently, we need to know the total length + * of data received, which doesn't seem to be available outside + * of the RPC layer. + * + * In the case of WRITE, we also want to put the GETATTR after + * the operation -- in this case because we want to make sure + * we get the post-operation mtime and size. This means that + * we can't use xdr_encode_pages() as written: we need a variant + * of it which would leave room in the 'tail' iovec. + * + * Both of these changes to the XDR layer would in fact be quite + * minor, but I decided to leave them for a subsequent patch. + */ +static int _nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs4_readlink args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page, + }; + struct nfs4_readlink_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_readlink(inode, page, pgbase, pglen), + &exception); + } while (exception.retry); + return err; +} + +/* + * Got race? + * We will need to arrange for the VFS layer to provide an atomic open. + * Until then, this create/open method is prone to inefficiency and race + * conditions due to the lookup, create, and open VFS calls from sys_open() + * placed on the wire. + * + * Given the above sorry state of affairs, I'm simply sending an OPEN. + * The file will be opened again in the subsequent VFS open call + * (nfs4_proc_file_open). + * + * The open for read will just hang around to be used by any process that + * opens the file O_RDONLY. This will all be resolved with the VFS changes. 
+ */ + +static int +nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nfs_open_context *ctx) +{ + struct path my_path = { + .dentry = dentry, + }; + struct path *path = &my_path; + struct nfs4_state *state; + struct rpc_cred *cred = NULL; + fmode_t fmode = 0; + int status = 0; + + if (ctx != NULL) { + cred = ctx->cred; + path = &ctx->path; + fmode = ctx->mode; + } + sattr->ia_mode &= ~current_umask(); + state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); + d_drop(dentry); + if (IS_ERR(state)) { + status = PTR_ERR(state); + goto out; + } + d_add(dentry, igrab(state->inode)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (ctx != NULL) + ctx->state = state; + else + nfs4_close_sync(path, state, fmode); +out: + return status; +} + +static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_removeargs args = { + .fh = NFS_FH(dir), + .name.len = name->len, + .name.name = name->name, + .bitmask = server->attr_bitmask, + }; + struct nfs_removeres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + if (status == 0) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + } + nfs_free_fattr(res.dir_attr); +out: + return status; +} + +static int nfs4_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_remove(dir, name), + &exception); + } while (exception.retry); + return err; +} + +static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_removeargs *args = msg->rpc_argp; + struct nfs_removeres *res = msg->rpc_resp; + + args->bitmask = server->cache_consistency_bitmask; + res->server = server; + res->seq_res.sr_slot = NULL; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; +} + +static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + struct nfs_removeres *res = task->tk_msg.rpc_resp; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); + return 1; +} + +static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_renameargs *arg = msg->rpc_argp; + struct nfs_renameres *res = msg->rpc_resp; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; + arg->bitmask = server->attr_bitmask; + res->server = server; +} + +static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res = task->tk_msg.rpc_resp; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); + nfs_post_op_update_inode(old_dir, res->old_fattr); + update_changeattr(new_dir, &res->new_cinfo); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + +static int 
_nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs_server *server = NFS_SERVER(old_dir); + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .new_dir = NFS_FH(new_dir), + .old_name = old_name, + .new_name = new_name, + .bitmask = server->attr_bitmask, + }; + struct nfs_renameres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) + goto out; + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (!status) { + update_changeattr(old_dir, &res.old_cinfo); + nfs_post_op_update_inode(old_dir, res.old_fattr); + update_changeattr(new_dir, &res.new_cinfo); + nfs_post_op_update_inode(new_dir, res.new_fattr); + } +out: + nfs_free_fattr(res.new_fattr); + nfs_free_fattr(res.old_fattr); + return status; +} + +static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(old_dir), + _nfs4_proc_rename(old_dir, old_name, + new_dir, new_name), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_link_arg arg = { + .fh = NFS_FH(inode), + .dir_fh = NFS_FH(dir), + .name = name, + .bitmask = server->attr_bitmask, + }; + struct nfs4_link_res res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (!status) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); + } +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); + return status; +} + +static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_link(inode, dir, name), + &exception); + } while (exception.retry); + return err; +} + +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_fattr; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = server->attr_bitmask; + data->res.server = server; + data->res.fh = &data->fh; + 
data->res.fattr = &data->fattr; + data->res.dir_fattr = &data->dir_fattr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_fattr); + } + return data; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + nfs_post_op_update_inode(dir, data->res.dir_fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + kfree(data); +} + +static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) +{ + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; + + if (len > NFS4_MAXPATHLEN) + goto out; + + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_symlink(dir, dentry, page, + len, sattr), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) +{ + struct nfs4_createdata *data; + int status = -ENOMEM; + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) +{ + struct nfs4_exception exception = { }; + int err; + + sattr->ia_mode &= ~current_umask(); + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_mkdir(dir, dentry, sattr), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = dentry->d_inode; + struct nfs4_readdir_arg args = { + .fh = NFS_FH(dir), + .pages = pages, + .pgbase = 0, + .count = count, + .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .plus = plus, + }; + struct nfs4_readdir_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__, + dentry->d_parent->d_name.name, + dentry->d_name.name, + (unsigned long long)cookie); + nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); + res.pgbase = args.pgbase; + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + if (status >= 0) { + memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } + + nfs_invalidate_atime(dir); + + dprintk("%s: returns %d\n", __func__, status); + return status; +} + +static int 
nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), + _nfs4_proc_readdir(dentry, cred, cookie, + pages, count, plus), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, dev_t rdev) +{ + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; + + BUG_ON(!(sattr->ia_valid & ATTR_MODE)); + BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; + + if (S_ISFIFO(mode)) + data->arg.ftype = NF4FIFO; + else if (S_ISBLK(mode)) { + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } + else if (S_ISCHR(mode)) { + data->arg.ftype = NF4CHR; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, dev_t rdev) +{ + struct nfs4_exception exception = { }; + int err; + + sattr->ia_mode &= ~current_umask(); + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_mknod(dir, dentry, sattr, rdev), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *fsstat) +{ + struct nfs4_statfs_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_statfs_res res = { + .fsstat = fsstat, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fsstat->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_proc_statfs(server, fhandle, fsstat), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *fsinfo) +{ + struct nfs4_fsinfo_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_fsinfo_res res = { + .fsinfo = fsinfo, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_do_fsinfo(server, fhandle, fsinfo), + &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) +{ + nfs_fattr_init(fsinfo->fattr); + return nfs4_do_fsinfo(server, fhandle, fsinfo); +} + +static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct 
nfs_pathconf *pathconf) +{ + struct nfs4_pathconf_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_pathconf_res res = { + .pathconf = pathconf, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + /* None of the pathconf attributes are mandatory to implement */ + if ((args.bitmask[0] & nfs4_pathconf_bitmap[0]) == 0) { + memset(pathconf, 0, sizeof(*pathconf)); + return 0; + } + + nfs_fattr_init(pathconf->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *pathconf) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_pathconf(server, fhandle, pathconf), + &exception); + } while (exception.retry); + return err; +} + +void __nfs4_read_done_cb(struct nfs_read_data *data) +{ + nfs_invalidate_atime(data->inode); +} + +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return -EAGAIN; + } + + __nfs4_read_done_cb(data); + if (task->tk_status > 0) + renew_lease(server, data->timestamp); + return 0; +} + +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + return data->read_done_cb ? data->read_done_cb(task, data) : + nfs4_read_done_cb(task, data); +} + +static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +{ + data->timestamp = jiffies; + data->read_done_cb = nfs4_read_done_cb; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; +} + +/* Reset the nfs_read_data to send the read to the MDS. */ +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + /* offsets will differ in the dense stripe case */ + data->args.offset = data->mds_offset; + data->ds_clp = NULL; + data->args.fh = NFS_FH(data->inode); + data->read_done_cb = nfs4_read_done_cb; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_read); + +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +{ + struct inode *inode = data->inode; + + if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } + if (task->tk_status >= 0) { + renew_lease(NFS_SERVER(inode), data->timestamp); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + } + return 0; +} + +static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb ? data->write_done_cb(task, data) : + nfs4_write_done_cb(task, data); +} + +/* Reset the nfs_write_data to send the write to the MDS.
*/ +void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + data->ds_clp = NULL; + data->write_done_cb = nfs4_write_done_cb; + data->args.fh = NFS_FH(data->inode); + data->args.bitmask = data->res.server->cache_consistency_bitmask; + data->args.offset = data->mds_offset; + data->res.fattr = &data->fattr; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_write); + +static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_write_done_cb; + data->res.server = server; + data->timestamp = jiffies; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; +} + +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) +{ + struct inode *inode = data->inode; + + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } + nfs_refresh_inode(inode, data->res.fattr); + return 0; +} + +static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + +static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_commit_done_cb; + data->res.server = server; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; +} + +struct nfs4_renewdata { + struct nfs_client *client; + unsigned long timestamp; +}; + +/* + * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special + * standalone procedure for queueing an asynchronous RENEW. + */ +static void nfs4_renew_release(void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(data); +} + +static void nfs4_renew_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + unsigned long timestamp = data->timestamp; + + if (task->tk_status < 0) { + /* Unless we're shutting down, schedule state recovery! 
*/ + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) + nfs4_schedule_lease_recovery(clp); + return; + } + do_renew_lease(clp, timestamp); +} + +static const struct rpc_call_ops nfs4_renew_ops = { + .rpc_call_done = nfs4_renew_done, + .rpc_release = nfs4_renew_release, +}; + +int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = cred, + }; + struct nfs4_renewdata *data; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + data->client = clp; + data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, + &nfs4_renew_ops, data); +} + +int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + if (status < 0) + return status; + do_renew_lease(clp, now); + return 0; +} + +static inline int nfs4_server_supports_acls(struct nfs_server *server) +{ + return (server->caps & NFS_CAP_ACLS) + && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) + && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); +} + +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on + * the stack. + */ +#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) + +static void buf_to_pages(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + const void *p = buf; + + *pgbase = offset_in_page(buf); + p -= *pgbase; + while (p < buf + buflen) { + *(pages++) = virt_to_page(p); + p += PAGE_CACHE_SIZE; + } +} + +static int buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; + + do { + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; +} + +struct nfs4_cached_acl { + int cached; + size_t len; + char data[0]; +}; + +static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + kfree(nfsi->nfs4_acl); + nfsi->nfs4_acl = acl; + spin_unlock(&inode->i_lock); +} + +static void nfs4_zap_acl_attr(struct inode *inode) +{ + nfs4_set_cached_acl(inode, NULL); +} + +static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_cached_acl *acl; + int ret = -ENOENT; + + spin_lock(&inode->i_lock); + acl = nfsi->nfs4_acl; + if (acl == NULL) + goto out; + if (buf == NULL) /* user is just asking for length */ + goto out_len; + if (acl->cached == 0) + goto out; + ret = -ERANGE; /* see getxattr(2) man page */ + if (acl->len > buflen) + goto out; + memcpy(buf, acl->data, acl->len); +out_len: + ret = acl->len; +out: + spin_unlock(&inode->i_lock); + return ret; +} + +static void nfs4_write_cached_acl(struct inode *inode, const char *buf, size_t 
acl_len) +{ + struct nfs4_cached_acl *acl; + + if (buf && acl_len <= PAGE_SIZE) { + acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 1; + memcpy(acl->data, buf, acl_len); + } else { + acl = kmalloc(sizeof(*acl), GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 0; + } + acl->len = acl_len; +out: + nfs4_set_cached_acl(inode, acl); +} + +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_getaclargs args = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct nfs_getaclres res = { + .acl_len = buflen, + }; + void *resp_buf; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct page *localpage = NULL; + int ret; + + if (buflen < PAGE_SIZE) { + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + localpage = alloc_page(GFP_KERNEL); + resp_buf = page_address(localpage); + if (localpage == NULL) + return -ENOMEM; + args.acl_pages[0] = localpage; + args.acl_pgbase = 0; + args.acl_len = PAGE_SIZE; + } else { + resp_buf = buf; + buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + } + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + if (ret) + goto out_free; + if (res.acl_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, res.acl_len); + else + nfs4_write_cached_acl(inode, resp_buf, res.acl_len); + if (buf) { + ret = -ERANGE; + if (res.acl_len > buflen) + goto out_free; + if (localpage) + memcpy(buf, resp_buf, res.acl_len); + } + ret = res.acl_len; +out_free: + if (localpage) + __free_page(localpage); + return ret; +} + +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); + if (ret >= 0) + break; + ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); + } while (exception.retry); + return ret; +} + +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + ret = nfs_revalidate_inode(server, inode); + if (ret < 0) + return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + ret = nfs4_read_cached_acl(inode, buf, buflen); + if (ret != -ENOENT) + return ret; + return nfs4_get_acl_uncached(inode, buf, buflen); +} + +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_setaclargs arg = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct nfs_setaclres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret, i; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; + nfs_inode_return_delegation(inode); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + 
put_page(pages[i-1]); + + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + return ret; +} + +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + __nfs4_proc_set_acl(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) + nfs4_schedule_stateid_recovery(server, state); + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session\n", __func__, + task->tk_status); + nfs4_schedule_session_recovery(clp->cl_session); + task->tk_status = 0; + return -EAGAIN; +#endif /* CONFIG_NFS_V4_1 */ + case -NFS4ERR_DELAY: + nfs_inc_server_stats(server, NFSIOS_DELAY); + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + task->tk_status = 0; + return -EAGAIN; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + task->tk_status = 0; + return -EAGAIN; + } + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; +wait_on_recovery: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + task->tk_status = 0; + return -EAGAIN; +} + +int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +{ + nfs4_verifier sc_verifier; + struct nfs4_setclientid setclientid = { + .sc_verifier = &sc_verifier, + .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, + .rpc_resp = res, + .rpc_cred = cred, + }; + __be32 *p; + int loop = 0; + int status; + + p = (__be32*)sc_verifier.data; + *p++ = htonl((u32)clp->cl_boot_time.tv_sec); + *p = htonl((u32)clp->cl_boot_time.tv_nsec); + + for(;;) { + setclientid.sc_name_len = scnprintf(setclientid.sc_name, + sizeof(setclientid.sc_name), "%s/%s %s %s %u", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO), + clp->cl_rpcclient->cl_auth->au_ops->au_name, + clp->cl_id_uniquifier); + setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, + sizeof(setclientid.sc_netid), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_NETID)); + 
setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, + sizeof(setclientid.sc_uaddr), "%s.%u.%u", + clp->cl_ipaddr, port >> 8, port & 255); + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status != -NFS4ERR_CLID_INUSE) + break; + if (loop != 0) { + ++clp->cl_id_uniquifier; + break; + } + ++loop; + ssleep(clp->cl_lease_time / HZ + 1); + } + return status; +} + +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, + struct nfs4_setclientid_res *arg, + struct rpc_cred *cred) +{ + struct nfs_fsinfo fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], + .rpc_argp = arg, + .rpc_resp = &fsinfo, + .rpc_cred = cred, + }; + unsigned long now; + int status; + + now = jiffies; + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status == 0) { + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + } + return status; +} + +struct nfs4_delegreturndata { + struct nfs4_delegreturnargs args; + struct nfs4_delegreturnres res; + struct nfs_fh fh; + nfs4_stateid stateid; + unsigned long timestamp; + struct nfs_fattr fattr; + int rpc_status; +}; + +static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: + renew_lease(data->res.server, data->timestamp); + break; + default: + if (nfs4_async_handle_error(task, data->res.server, NULL) == + -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } + } + data->rpc_status = task->tk_status; +} + +static void nfs4_delegreturn_release(void *calldata) +{ + kfree(calldata); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_delegreturndata *d_data; + + d_data = (struct nfs4_delegreturndata *)data; + + if (nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs4_delegreturn_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs4_delegreturn_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs4_delegreturn_done, + .rpc_release = nfs4_delegreturn_release, +}; + +static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +{ + struct nfs4_delegreturndata *data; + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_delegreturn_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + data = kzalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; + data->args.stateid = &data->stateid; + data->args.bitmask = server->attr_bitmask; + nfs_copy_fh(&data->fh, NFS_FH(inode)); + memcpy(&data->stateid, stateid, sizeof(data->stateid)); + data->res.fattr = &data->fattr; + data->res.server = server; + nfs_fattr_init(data->res.fattr); + data->timestamp = jiffies; + data->rpc_status = 0; + + task_setup_data.callback_data = data; + msg.rpc_argp = 
&data->args; + msg.rpc_resp = &data->res; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (!issync) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = data->rpc_status; + if (status != 0) + goto out; + nfs_refresh_inode(inode, &data->fattr); +out: + rpc_put_task(task); + return status; +} + +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + switch (err) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: + return 0; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +/* + * sleep, with exponential backoff, and retry the LOCK operation. + */ +static unsigned long +nfs4_set_lock_task_retry(unsigned long timeout) +{ + schedule_timeout_killable(timeout); + timeout <<= 1; + if (timeout > NFS4_LOCK_MAXTIMEOUT) + return NFS4_LOCK_MAXTIMEOUT; + return timeout; +} + +static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_lockt_args arg = { + .fh = NFS_FH(inode), + .fl = request, + }; + struct nfs_lockt_res res = { + .denied = request, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = state->owner->so_cred, + }; + struct nfs4_lock_state *lsp; + int status; + + arg.lock_owner.clientid = clp->cl_clientid; + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + lsp = request->fl_u.nfs4_fl.owner; + arg.lock_owner.id = lsp->ls_id.id; + arg.lock_owner.s_dev = server->s_dev; + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + switch (status) { + case 0: + request->fl_type = F_UNLCK; + break; + case -NFS4ERR_DENIED: + status = 0; + } + request->fl_ops->fl_release_private(request); +out: + return status; +} + +static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(NFS_SERVER(state->inode), + _nfs4_proc_getlk(state, cmd, request), + &exception); + } while (exception.retry); + return err; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +struct nfs4_unlockdata { + struct nfs_locku_args arg; + struct nfs_locku_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + const struct nfs_server *server; + unsigned long timestamp; +}; + +static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + + p = kzalloc(sizeof(*p), GFP_NOFS); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); + 
p->arg.fl = &p->fl; + p->arg.seqid = seqid; + p->res.seqid = seqid; + p->arg.stateid = &lsp->ls_stateid; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + /* Ensure we don't close file until we're done freeing locks! */ + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + p->server = NFS_SERVER(inode); + return p; +} + +static void nfs4_locku_release_calldata(void *data) +{ + struct nfs4_unlockdata *calldata = data; + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_lock_state(calldata->lsp); + put_nfs_open_context(calldata->ctx); + kfree(calldata); +} + +static void nfs4_locku_done(struct rpc_task *task, void *data) +{ + struct nfs4_unlockdata *calldata = data; + + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; + switch (task->tk_status) { + case 0: + memcpy(calldata->lsp->ls_stateid.data, + calldata->res.stateid.data, + sizeof(calldata->lsp->ls_stateid.data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + break; + default: + if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +} + +static void nfs4_locku_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_unlockdata *calldata = data; + + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) + return; + if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { + /* Note: exit _without_ running nfs4_locku_done */ + task->tk_action = NULL; + return; + } + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(calldata->server, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static const struct rpc_call_ops nfs4_locku_ops = { + .rpc_call_prepare = nfs4_locku_prepare, + .rpc_call_done = nfs4_locku_done, + .rpc_release = nfs4_locku_release_calldata, +}; + +static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(lsp->ls_state->inode), + .rpc_message = &msg, + .callback_ops = &nfs4_locku_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + /* Ensure this is an unlock - when canceling a lock, the + * canceled lock is passed in, and it won't be an unlock. + */ + fl->fl_type = F_UNLCK; + + data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); + if (data == NULL) { + nfs_free_seqid(seqid); + return ERR_PTR(-ENOMEM); + } + + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + return rpc_run_task(&task_setup_data); +} + +static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_seqid *seqid; + struct nfs4_lock_state *lsp; + struct rpc_task *task; + int status = 0; + unsigned char fl_flags = request->fl_flags; + + status = nfs4_set_lock_state(state, request); + /* Unlock _before_ we do the RPC call */ + request->fl_flags |= FL_EXISTS; + down_read(&nfsi->rwsem); + if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + up_read(&nfsi->rwsem); + goto out; + } + up_read(&nfsi->rwsem); + if (status != 0) + goto out; + /* Is this a delegated lock? 
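If so, the lock was only ever recorded locally under the delegation, so there is no lock state on the server to release.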
*/ + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + goto out; + lsp = request->fl_u.nfs4_fl.owner; + seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + status = -ENOMEM; + if (seqid == NULL) + goto out; + task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + status = PTR_ERR(task); + if (IS_ERR(task)) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + rpc_put_task(task); +out: + request->fl_flags = fl_flags; + return status; +} + +struct nfs4_lockdata { + struct nfs_lock_args arg; + struct nfs_lock_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + unsigned long timestamp; + int rpc_status; + int cancelled; + struct nfs_server *server; +}; + +static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, + gfp_t gfp_mask) +{ + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; + struct nfs_server *server = NFS_SERVER(inode); + + p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); + if (p->arg.open_seqid == NULL) + goto out_free; + p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (p->arg.lock_seqid == NULL) + goto out_free_seqid; + p->arg.lock_stateid = &lsp->ls_stateid; + p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; + p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.s_dev = server->s_dev; + p->res.lock_seqid = p->arg.lock_seqid; + p->lsp = lsp; + p->server = server; + atomic_inc(&lsp->ls_count); + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + return p; +out_free_seqid: + nfs_free_seqid(p->arg.open_seqid); +out_free: + kfree(p); + return NULL; +} + +static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + struct nfs4_state *state = data->lsp->ls_state; + + dprintk("%s: begin!\n", __func__); + if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) + return; + /* Do we need to do an open_to_lock_owner? 
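That is, is this the first LOCK request for this lock owner? If so, the request must also carry the open stateid and an open seqid.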
*/ + if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) + return; + data->arg.open_stateid = &state->stateid; + data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); +} + +static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_lock_prepare(task, calldata); +} + +static void nfs4_lock_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + data->rpc_status = task->tk_status; + if (data->arg.new_lock_owner != 0) { + if (data->rpc_status == 0) + nfs_confirm_seqid(&data->lsp->ls_seqid, 0); + else + goto out; + } + if (data->rpc_status == 0) { + memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, + sizeof(data->lsp->ls_stateid.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +out: + dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); +} + +static void nfs4_lock_release(void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __func__); + nfs_free_seqid(data->arg.open_seqid); + if (data->cancelled != 0) { + struct rpc_task *task; + task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, + data->arg.lock_seqid); + if (!IS_ERR(task)) + rpc_put_task_async(task); + dprintk("%s: cancelling lock!\n", __func__); + } else + nfs_free_seqid(data->arg.lock_seqid); + nfs4_put_lock_state(data->lsp); + put_nfs_open_context(data->ctx); + kfree(data); + dprintk("%s: done!\n", __func__); +} + +static const struct rpc_call_ops nfs4_lock_ops = { + .rpc_call_prepare = nfs4_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static const struct rpc_call_ops nfs4_recover_lock_ops = { + .rpc_call_prepare = nfs4_recover_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_schedule_stateid_recovery(server, lsp->ls_state); + break; + case -NFS4ERR_STALE_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); + }; +} + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) +{ + struct nfs4_lockdata *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], + .rpc_cred = state->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(state->inode), + .rpc_message = &msg, + .callback_ops = &nfs4_lock_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int ret; + + dprintk("%s: begin!\n", __func__); + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), + 
fl->fl_u.nfs4_fl.owner, + recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) + data->arg.block = 1; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + task_setup_data.callback_ops = &nfs4_recover_lock_ops; + } + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = nfs4_wait_for_completion_rpc_task(task); + if (ret == 0) { + ret = data->rpc_status; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); + } else + data->cancelled = 1; + rpc_put_task(task); + dprintk("%s: done, ret = %d!\n", __func__, ret); + return ret; +} + +static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { + .inode = state->inode, + }; + int err; + + do { + /* Cache the lock if possible... */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { + .inode = state->inode, + }; + int err; + + err = nfs4_set_lock_state(state, request); + if (err != 0) + return err; + do { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } + } while (exception.retry); +out: + return err; +} + +static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + unsigned char fl_flags = request->fl_flags; + int status = -ENOLCK; + + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; + /* Is this a delegated open? */ + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + request->fl_flags |= FL_ACCESS; + status = do_vfs_lock(request->fl_file, request); + if (status < 0) + goto out; + down_read(&nfsi->rwsem); + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + /* Yes: cache locks! */ + /* ...but avoid races with delegation recall... */ + request->fl_flags = fl_flags & ~FL_SLEEP; + status = do_vfs_lock(request->fl_file, request); + goto out_unlock; + } + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); + if (status != 0) + goto out_unlock; + /* Note: we always want to sleep here! 
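The server has already granted the lock at this point, so the matching local VFS lock must be recorded even if we have to wait for it.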
*/ + request->fl_flags = fl_flags | FL_SLEEP; + if (do_vfs_lock(request->fl_file, request) < 0) + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); +out_unlock: + up_read(&nfsi->rwsem); +out: + request->fl_flags = fl_flags; + return status; +} + +static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs4_exception exception = { + .state = state, + .inode = state->inode, + }; + int err; + + do { + err = _nfs4_proc_setlk(state, cmd, request); + if (err == -NFS4ERR_DENIED) + err = -EAGAIN; + err = nfs4_handle_exception(NFS_SERVER(state->inode), + err, &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) +{ + struct nfs_open_context *ctx; + struct nfs4_state *state; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + int status; + + /* verify open state */ + ctx = nfs_file_open_context(filp); + state = ctx->state; + + if (request->fl_start < 0 || request->fl_end < 0) + return -EINVAL; + + if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } + + if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) + return -EINVAL; + + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + + if (state == NULL) + return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + + do { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + timeout = nfs4_set_lock_task_retry(timeout); + status = -ERESTARTSYS; + if (signalled()) + break; + } while(status < 0); + return status; +} + +int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + err = nfs4_set_lock_state(state, fl); + if (err != 0) + goto out; + do { + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + switch (err) { + default: + printk(KERN_ERR "%s: unhandled error %d.\n", + __func__, err); + case 0: + case -ESTALE: + goto out; + case -NFS4ERR_EXPIRED: + nfs4_schedule_stateid_recovery(server, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(server->nfs_client->cl_session); + goto out; + case -ERESTARTSYS: + /* + * The show must go on: exit, but mark the + * stateid as needing recovery. + */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs4_schedule_stateid_recovery(server, state); + err = 0; + goto out; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. 
+ */ + err = 0; + goto out; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + err = 0; + goto out; + case -NFS4ERR_DELAY: + break; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); +out: + return err; +} + +static void nfs4_release_lockowner_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_release = nfs4_release_lockowner_release, +}; + +void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_release_lockowner_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + args = kmalloc(sizeof(*args), GFP_NOFS); + if (!args) + return; + args->lock_owner.clientid = server->nfs_client->cl_clientid; + args->lock_owner.id = lsp->ls_id.id; + args->lock_owner.s_dev = server->s_dev; + msg.rpc_argp = args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +} + +#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); +} + +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); +} + +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = sizeof(XATTR_NAME_NFSV4_ACL); + + if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) + return 0; + + if (list && len <= list_len) + memcpy(list, XATTR_NAME_NFSV4_ACL, len); + return len; +} + +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) +{ + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + return; + + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page) +{ + struct nfs_server *server = NFS_SERVER(dir); + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + .page = page, + .bitmask = bitmask, + }; + struct nfs4_fs_locations_res res = { + .fs_locations = fs_locations, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + + nfs_fattr_init(&fs_locations->fattr); + fs_locations->server = server; + fs_locations->nlocations = 0; + 
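+ /* Note (added annotation): the caller-supplied page is used as the receive buffer for the locations data returned by the server. */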
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("%s: returned status = %d\n", __func__, status); + return status; +} + +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + dprintk("NFS call secinfo %s\n", name->name); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_secinfo(dir, name, flavors), + &exception); + } while (exception.retry); + return err; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. + */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + +/* + * nfs4_proc_exchange_id() + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. 
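+ * Note that EXCHANGE_ID itself is not sent over a session, so it can be issued while the session is being re-established.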
+ */ +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, + }; + struct nfs41_exchange_id_res res = { + .client = clp, + }; + int status; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + __be32 *p; + + dprintk("--> %s\n", __func__); + BUG_ON(clp == NULL); + + p = (u32 *)verifier.data; + *p++ = htonl((u32)clp->cl_boot_time.tv_sec); + *p = htonl((u32)clp->cl_boot_time.tv_nsec); + args.verifier = &verifier; + + args.id_len = scnprintf(args.id, sizeof(args.id), + "%s/%s.%s/%u", + clp->cl_ipaddr, + init_utsname()->nodename, + init_utsname()->domainname, + clp->cl_rpcclient->cl_auth->au_flavor); + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (!status) + status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + dprintk("<-- %s status= %d\n", __func__, status); + return status; +} + +struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; + struct nfs4_get_lease_time_res *res; + struct nfs_client *clp; +}; + +static void nfs4_get_lease_time_prepare(struct rpc_task *task, + void *calldata) +{ + int ret; + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + /* just setup sequence, do not trigger session recovery + since we're invoked within one */ + ret = nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, 0, task); + + BUG_ON(ret == -EAGAIN); + rpc_call_start(task); + dprintk("<-- %s\n", __func__); +} + +/* + * Called from nfs4_state_manager thread for session setup, so don't recover + * from sequence operation or clientid errors. 
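+ * Only NFS4ERR_DELAY, NFS4ERR_GRACE and uncached replay retries are handled here; all other errors are passed back to the caller.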
+ */ +static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); + rpc_delay(task, NFS4_POLL_RETRY_MIN); + task->tk_status = 0; + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + nfs_restart_rpc(task, data->clp); + return; + } + dprintk("<-- %s\n", __func__); +} + +struct rpc_call_ops nfs4_get_lease_time_ops = { + .rpc_call_prepare = nfs4_get_lease_time_prepare, + .rpc_call_done = nfs4_get_lease_time_done, +}; + +int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) +{ + struct rpc_task *task; + struct nfs4_get_lease_time_args args; + struct nfs4_get_lease_time_res res = { + .lr_fsinfo = fsinfo, + }; + struct nfs4_get_lease_time_data data = { + .args = &args, + .res = &res, + .clp = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_get_lease_time_ops, + .callback_data = &data, + .flags = RPC_TASK_TIMEOUT, + }; + int status; + + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup); + + if (IS_ERR(task)) + status = PTR_ERR(task); + else { + status = task->tk_status; + rpc_put_task(task); + } + dprintk("<-- %s return %d\n", __func__, status); + + return status; +} + +/* + * Reset a slot table + */ +static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + int ivalue) +{ + struct nfs4_slot *new = NULL; + int i; + int ret = 0; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); + + /* Does the newly negotiated max_reqs match the existing slot table? 
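If not, allocate a replacement slot array outside the lock and swap it in below.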
*/ + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), + GFP_NOFS); + if (!new) + goto out; + ret = 0; + kfree(tbl->slots); + } + spin_lock(&tbl->slot_tbl_lock); + if (new) { + tbl->slots = new; + tbl->max_slots = max_reqs; + } + for (i = 0; i < tbl->max_slots; ++i) + tbl->slots[i].seq_nr = ivalue; + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* + * Reset the forechannel and backchannel slot tables + */ +static int nfs4_reset_slot_tables(struct nfs4_session *session) +{ + int status; + + status = nfs4_reset_slot_table(&session->fc_slot_table, + session->fc_attrs.max_reqs, 1); + if (status) + return status; + + status = nfs4_reset_slot_table(&session->bc_slot_table, + session->bc_attrs.max_reqs, 0); + return status; +} + +/* Destroy the slot table */ +static void nfs4_destroy_slot_tables(struct nfs4_session *session) +{ + if (session->fc_slot_table.slots != NULL) { + kfree(session->fc_slot_table.slots); + session->fc_slot_table.slots = NULL; + } + if (session->bc_slot_table.slots != NULL) { + kfree(session->bc_slot_table.slots); + session->bc_slot_table.slots = NULL; + } + return; +} + +/* + * Initialize slot table + */ +static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, + int max_slots, int ivalue) +{ + struct nfs4_slot *slot; + int ret = -ENOMEM; + + BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); + + dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); + + slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); + if (!slot) + goto out; + ret = 0; + + spin_lock(&tbl->slot_tbl_lock); + tbl->max_slots = max_slots; + tbl->slots = slot; + tbl->highest_used_slotid = -1; /* no slot is currently used */ + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* + * Initialize the forechannel and backchannel tables + */ +static int nfs4_init_slot_tables(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl; + int status = 0; + + tbl = &session->fc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->fc_attrs.max_reqs, 1); + if (status) + return status; + } + + tbl = &session->bc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->bc_attrs.max_reqs, 0); + if (status) + nfs4_destroy_slot_tables(session); + } + + return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + tbl = &session->fc_slot_table; + tbl->highest_used_slotid = -1; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); + + tbl = &session->bc_slot_table; + tbl->highest_used_slotid = -1; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); + + session->session_state = 1<<NFS4_SESSION_INITING; + + session->clp = clp; + return session; +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + nfs4_proc_destroy_session(session); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, session->clp->cl_rpcclient->cl_xprt); + 
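+ /* Note (added annotation): tear down the preallocated backchannel on the transport, then free both slot tables and the session itself. */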
xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt, + NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_slot_tables(session); + kfree(session); +} + +/* + * Initialize the values to be used by the client in CREATE_SESSION + * If nfs4_init_session set the fore channel request and response sizes, + * use them. + * + * Set the back channel max_resp_sz_cached to zero to force the client to + * always set csa_cachethis to FALSE because the current implementation + * of the back channel DRC only supports caching the CB_SEQUENCE operation. + */ +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +{ + struct nfs4_session *session = args->client->cl_session; + unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, + mxresp_sz = session->fc_attrs.max_resp_sz; + + if (mxrqst_sz == 0) + mxrqst_sz = NFS_MAX_FILE_IO_SIZE; + if (mxresp_sz == 0) + mxresp_sz = NFS_MAX_FILE_IO_SIZE; + /* Fore channel attributes */ + args->fc_attrs.max_rqst_sz = mxrqst_sz; + args->fc_attrs.max_resp_sz = mxresp_sz; + args->fc_attrs.max_ops = NFS4_MAX_OPS; + args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; + + dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_ops=%u max_reqs=%u\n", + __func__, + args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, + args->fc_attrs.max_ops, args->fc_attrs.max_reqs); + + /* Back channel attributes */ + args->bc_attrs.max_rqst_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz_cached = 0; + args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; + args->bc_attrs.max_reqs = 1; + + dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + __func__, + args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz, + args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops, + args->bc_attrs.max_reqs); +} + +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. 
+ * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} + +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &session->bc_attrs; + + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops == 0) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} + +static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, + struct nfs4_session *session) +{ + int ret; + + ret = nfs4_verify_fore_channel_attrs(args, session); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, session); +} + +static int _nfs4_proc_create_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + struct nfs41_create_session_args args = { + .client = clp, + .cb_program = NFS4_CALLBACK, + }; + struct nfs41_create_session_res res = { + .client = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + nfs4_init_channel_attrs(&args); + args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + + if (!status) + /* Verify the session's negotiated channel_attrs values */ + status = nfs4_verify_channel_attrs(&args, session); + if (!status) { + /* Increment the clientid slot sequence id */ + clp->cl_seqid++; + } + + return status; +} + +/* + * Issues a CREATE_SESSION operation to the server. + * It is the responsibility of the caller to verify the session is + * expired before calling this routine. + */ +int nfs4_proc_create_session(struct nfs_client *clp) +{ + int status; + unsigned *ptr; + struct nfs4_session *session = clp->cl_session; + + dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); + + status = _nfs4_proc_create_session(clp); + if (status) + goto out; + + /* Init and reset the fore channel */ + status = nfs4_init_slot_tables(session); + dprintk("slot table initialization returned %d\n", status); + if (status) + goto out; + status = nfs4_reset_slot_tables(session); + dprintk("slot table reset returned %d\n", status); + if (status) + goto out; + + ptr = (unsigned *)&session->sess_id.data[0]; + dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, + clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); +out: + dprintk("<-- %s\n", __func__); + return status; +} + +/* + * Issue the over-the-wire RPC DESTROY_SESSION. + * The caller must serialize access to this routine. + */ +int nfs4_proc_destroy_session(struct nfs4_session *session) +{ + int status = 0; + struct rpc_message msg; + + dprintk("--> nfs4_proc_destroy_session\n"); + + /* session is still being setup */ + if (session->clp->cl_cons_state != NFS_CS_READY) + return status; + + msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION]; + msg.rpc_argp = session; + msg.rpc_resp = NULL; + msg.rpc_cred = NULL; + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + + if (status) + printk(KERN_WARNING + "Got error %d from the server on DESTROY_SESSION. 
" + "Session has been destroyed regardless...\n", status); + + dprintk("<-- nfs4_proc_destroy_session\n"); + return status; +} + +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; + unsigned int rsize, wsize; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + + session = clp->cl_session; + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; + wsize = server->wsize; + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + + ret = nfs4_recover_expired_lease(server); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +} + +int nfs4_init_ds_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + ret = nfs4_client_recover_expired_lease(clp); + if (!ret) + /* Test for the DS role */ + if (!is_ds_client(clp)) + ret = -ENODEV; + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; + +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + +/* + * Renew the cl_session lease. + */ +struct nfs4_sequence_data { + struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +}; + +static void nfs41_sequence_release(void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs41_sequence_call_done(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); +out: + dprintk("<-- %s\n", __func__); +} + +static void nfs41_sequence_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + + if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); +} + +static const struct rpc_call_ops nfs41_sequence_ops = { + .rpc_call_done = nfs41_sequence_call_done, + .rpc_call_prepare = nfs41_sequence_prepare, + .rpc_release = nfs41_sequence_release, +}; + +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + 
.callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + }; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return ERR_PTR(-EIO); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { + nfs_put_client(clp); + return ERR_PTR(-ENOMEM); + } + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; + + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret = 0; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + rpc_put_task_async(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + ret = task->tk_status; + } + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +struct nfs4_reclaim_complete_data { + struct nfs_client *clp; + struct nfs41_reclaim_complete_args arg; + struct nfs41_reclaim_complete_res res; +}; + +static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + if (nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); +} + +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? */ + break; + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, res)) + return; + + if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_free_reclaim_complete_data(void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { + .rpc_call_prepare = nfs4_reclaim_complete_prepare, + .rpc_call_done = nfs4_reclaim_complete_done, + .rpc_release = nfs4_free_reclaim_complete_data, +}; + +/* + * Issue a global reclaim complete. 
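+ * RECLAIM_COMPLETE tells the server that reboot recovery is finished for the whole client (one_fs is set to zero below).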
+ */ +static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +{ + struct nfs4_reclaim_complete_data *calldata; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_reclaim_complete_call_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; + calldata->arg.one_fs = 0; + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out; + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + rpc_put_task(task); + return 0; +out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + /* Note the is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. + */ + if (nfs4_setup_sequence(server, &lgp->args.seq_args, + &lgp->res.seq_res, 0, task)) + return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + return; + } + rpc_call_start(task); +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->res.layoutp = &lgp->args.layout; + lgp->res.seq_res.sr_slot = NULL; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = 
nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + if (status == 0) + status = pnfs_layout_process(lgp); + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static void +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, + &lrp->res.seq_res, 0, task)) + return; + rpc_call_start(task); +} + +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct nfs_server *server; + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lrp->res.seq_res)) + return; + + server = NFS_SERVER(lrp->args.inode); + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, lrp->clp); + return; + } + spin_lock(&lo->plh_inode->i_lock); + if (task->tk_status == 0) { + if (lrp->res.lrs_present) { + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + BUG_ON(!list_empty(&lo->plh_segs)); + } + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutreturn_release(void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + put_layout_hdr(NFS_I(lrp->args.inode)->layout); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { + .rpc_call_prepare = nfs4_layoutreturn_prepare, + .rpc_call_done = nfs4_layoutreturn_done, + .rpc_release = nfs4_layoutreturn_release, +}; + +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], + .rpc_argp = &lrp->args, + .rpc_resp = &lrp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = lrp->clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutreturn_call_ops, + .callback_data = lrp, + }; + int status; + + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + dprintk("<-- %s status=%d\n", __func__, status); + rpc_put_task(task); + return status; +} + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if 
(nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case NFS4ERR_BADLAYOUT: /* no layout */ + case NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + } + + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return; + } + + if (task->tk_status == 0) + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct pnfs_layout_segment *lseg, *tmp; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, + &lseg->pls_flags)) + put_lseg(lseg); + } + put_rpccred(data->cred); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. 
sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync == false) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = task->tk_status; +out: + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs4_init_clientid, + .get_clid_cred = nfs4_get_setclientid_cred, +}; + +#if defined(CONFIG_NFS_V4_1) +struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs41_init_clientid, + .get_clid_cred = nfs4_get_exchange_id_cred, + .reclaim_complete = nfs41_proc_reclaim_complete, +}; +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs4_open_expired, + .recover_lock = nfs4_lock_expired, + .establish_clid = nfs4_init_clientid, + .get_clid_cred = nfs4_get_setclientid_cred, +}; + +#if defined(CONFIG_NFS_V4_1) +struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs4_open_expired, + .recover_lock = nfs4_lock_expired, + .establish_clid = nfs41_init_clientid, + .get_clid_cred = nfs4_get_exchange_id_cred, +}; +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { + .sched_state_renewal = nfs4_proc_async_renew, + .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked, + .renew_lease = nfs4_proc_renew, +}; + +#if defined(CONFIG_NFS_V4_1) +struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { + .sched_state_renewal = nfs41_proc_async_sequence, + .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked, + .renew_lease = nfs4_proc_sequence, +}; +#endif + +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .call_sync = _nfs4_call_sync, + .validate_stateid = nfs4_validate_delegation_stateid, + .reboot_recovery_ops = &nfs40_reboot_recovery_ops, + .nograce_recovery_ops = &nfs40_nograce_recovery_ops, + .state_renewal_ops = &nfs40_state_renewal_ops, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .call_sync = _nfs4_call_sync_session, + .validate_stateid = nfs41_validate_delegation_stateid, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, +}; +#endif + +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, +#if defined(CONFIG_NFS_V4_1) + [1] = &nfs_v4_1_minor_ops, +#endif +}; + +static const struct inode_operations nfs4_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = 
generic_listxattr, + .removexattr = generic_removexattr, +}; + +const struct nfs_rpc_ops nfs_v4_clientops = { + .version = 4, /* protocol version */ + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, + .lookupfh = nfs4_proc_lookupfh, + .lookup = nfs4_proc_lookup, + .access = nfs4_proc_access, + .readlink = nfs4_proc_readlink, + .create = nfs4_proc_create, + .remove = nfs4_proc_remove, + .unlink_setup = nfs4_proc_unlink_setup, + .unlink_done = nfs4_proc_unlink_done, + .rename = nfs4_proc_rename, + .rename_setup = nfs4_proc_rename_setup, + .rename_done = nfs4_proc_rename_done, + .link = nfs4_proc_link, + .symlink = nfs4_proc_symlink, + .mkdir = nfs4_proc_mkdir, + .rmdir = nfs4_proc_remove, + .readdir = nfs4_proc_readdir, + .mknod = nfs4_proc_mknod, + .statfs = nfs4_proc_statfs, + .fsinfo = nfs4_proc_fsinfo, + .pathconf = nfs4_proc_pathconf, + .set_capabilities = nfs4_server_capabilities, + .decode_dirent = nfs4_decode_dirent, + .read_setup = nfs4_proc_read_setup, + .read_done = nfs4_read_done, + .write_setup = nfs4_proc_write_setup, + .write_done = nfs4_write_done, + .commit_setup = nfs4_proc_commit_setup, + .commit_done = nfs4_commit_done, + .lock = nfs4_proc_lock, + .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, + .open_context = nfs4_atomic_open, + .init_client = nfs4_init_client, + .secinfo = nfs4_proc_secinfo, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { + .prefix = XATTR_NAME_NFSV4_ACL, + .list = nfs4_xattr_list_nfs4_acl, + .get = nfs4_xattr_get_nfs4_acl, + .set = nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { + &nfs4_xattr_nfs4_acl_handler, + NULL +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c new file mode 100644 index 00000000..df8e7f3c --- /dev/null +++ b/fs/nfs/nfs4renewd.c @@ -0,0 +1,130 @@ +/* + * fs/nfs/nfs4renewd.c + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Implementation of the NFSv4 "renew daemon", which wakes up periodically to + * send a RENEW, to keep state alive on the server. The daemon is implemented + * as an rpc_task, not a real kernel thread, so it always runs in rpciod's + * context. There is one renewd per nfs_server. + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include "nfs4_fs.h" +#include "delegation.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +void +nfs4_renew_state(struct work_struct *work) +{ + const struct nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + + ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) + goto out; + + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; + last = clp->cl_last_renewal; + now = jiffies; + /* Are we close to a lease timeout? */ + if (time_after(now, last + lease/3)) { + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + if (!nfs_delegations_present(clp)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + goto out; + } + nfs_expire_all_delegations(clp); + } else { + /* Queue an asynchronous RENEW. */ + ops->sched_state_renewal(clp, cred); + put_rpccred(cred); + goto out_exp; + } + } else { + dprintk("%s: failed to call renewd. Reason: lease not expired \n", + __func__); + spin_unlock(&clp->cl_lock); + } + nfs4_schedule_state_renewal(clp); +out_exp: + nfs_expire_unreferenced_delegations(clp); +out: + dprintk("%s: done\n", __func__); +} + +void +nfs4_schedule_state_renewal(struct nfs_client *clp) +{ + long timeout; + + spin_lock(&clp->cl_lock); + timeout = (2 * clp->cl_lease_time) / 3 + (long)clp->cl_last_renewal + - (long)jiffies; + if (timeout < 5 * HZ) + timeout = 5 * HZ; + dprintk("%s: requeueing work. Lease period = %ld\n", + __func__, (timeout + HZ - 1) / HZ); + cancel_delayed_work(&clp->cl_renewd); + schedule_delayed_work(&clp->cl_renewd, timeout); + set_bit(NFS_CS_RENEWD, &clp->cl_res_state); + spin_unlock(&clp->cl_lock); +} + +void +nfs4_kill_renewd(struct nfs_client *clp) +{ + cancel_delayed_work_sync(&clp->cl_renewd); +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c new file mode 100644 index 00000000..c6e2769f --- /dev/null +++ b/fs/nfs/nfs4state.c @@ -0,0 +1,1766 @@ +/* + * fs/nfs/nfs4state.c + * + * Client-side XDR for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
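[Editor's note] nfs4_renew_state() above only issues a renewal once more than a third of the lease has elapsed, and nfs4_schedule_state_renewal() re-arms the delayed work so the next wakeup lands two thirds of the way through the lease, never sooner than a 5 second floor. A standalone sketch of that arithmetic in plain seconds rather than jiffies (helper names are made up; the kernel additionally uses time_after() to cope with jiffies wraparound):

#include <stdio.h>

/* Mirror of the renewd re-arm rule, in seconds rather than jiffies. */
static long next_renew_delay(long lease, long last_renewal, long now)
{
	long timeout = (2 * lease) / 3 + last_renewal - now;

	if (timeout < 5)	/* 5 second floor, as in the kernel code */
		timeout = 5;
	return timeout;
}

/* Mirror of the "are we close to a lease timeout?" test in nfs4_renew_state(). */
static int renew_due(long lease, long last_renewal, long now)
{
	return now > last_renewal + lease / 3;
}

int main(void)
{
	long lease = 90, last = 1000;

	printf("t=1020: due=%d, next wakeup in %lds\n",
	       renew_due(lease, last, 1020), next_renew_delay(lease, last, 1020));
	printf("t=1040: due=%d, next wakeup in %lds\n",
	       renew_due(lease, last, 1040), next_renew_delay(lease, last, 1040));
	return 0;
}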
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Implementation of the NFSv4 state model. For the time being, + * this is minimal, but will be made much more complex in a + * subsequent patch. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "pnfs.h" + +#define OPENOWNER_POOL_SIZE 8 + +const nfs4_stateid zero_stateid; + +static LIST_HEAD(nfs4_clientid_list); + +int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + unsigned short port; + int status; + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + port = nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_setclientid_confirm(clp, &clid, cred); + if (status != 0) + goto out; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_renewal(clp); +out: + return status; +} + +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + + if (clp->cl_machine_cred != NULL) + cred = get_rpccred(clp->cl_machine_cred); + return cred; +} + +static void nfs4_clear_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = NULL; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) +{ + struct rpc_cred *cred = NULL; + struct nfs4_state_owner *sp; + struct rb_node *pos; + + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + if (list_empty(&sp->so_states)) + continue; + cred = get_rpccred(sp->so_cred); + break; + } + return cred; +} + +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with 
reference count bumped, or NULL. + * Caller must hold clp->cl_lock. + */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + return cred; +} + +#if defined(CONFIG_NFS_V4_1) + +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } + + return status; +} + +/* + * Back channel returns NFS4ERR_DELAY for new requests when + * NFS4_SESSION_DRAINING is set so there is no work to be done when draining + * is ended. + */ +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int max_slots; + + if (ses == NULL) + return; + if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { + struct rpc_task *task; + + task = rpc_wake_up_next(&ses->fc_slot_table. + slot_tbl_waitq); + if (!task) + break; + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + } + spin_unlock(&ses->fc_slot_table.slot_tbl_lock); + } +} + +static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) +{ + spin_lock(&tbl->slot_tbl_lock); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(tbl->complete); + spin_unlock(&tbl->slot_tbl_lock); + return wait_for_completion_interruptible(&tbl->complete); + } + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); + /* back channel */ + ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); +} + +int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + nfs4_begin_drain_session(clp); + status = nfs4_proc_exchange_id(clp, cred); + if (status != 0) + goto out; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_create_session(clp); + if (status != 0) + goto out; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs41_setup_state_renewal(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); +out: + return status; +} + +struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); + return cred; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static struct rpc_cred * +nfs4_get_setclientid_cred_server(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_cred *cred = NULL; + struct nfs4_state_owner *sp; + struct rb_node *pos; + + spin_lock(&clp->cl_lock); + pos = rb_first(&server->state_owners); + 
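[Editor's note] Both nfs4_init_clientid() and nfs41_init_clientid() above share the same two-step shape: one call that proposes a client ID (SETCLIENTID or EXCHANGE_ID) and one that confirms it (SETCLIENTID_CONFIRM or CREATE_SESSION), with the NFS4CLNT_LEASE_CONFIRM bit letting a restarted recovery skip straight to the confirmation step. A simplified sketch of that control flow (the RPC stubs below are placeholders, not real kernel calls):

#include <stdio.h>

#define LEASE_CONFIRM_BIT 0x1

/* Stand-ins for the two RPCs; a real client talks to the server here. */
static int propose_clientid(void) { puts("SETCLIENTID / EXCHANGE_ID"); return 0; }
static int confirm_clientid(void) { puts("SETCLIENTID_CONFIRM / CREATE_SESSION"); return 0; }

static int establish_lease(unsigned long *state)
{
	int status;

	if (*state & LEASE_CONFIRM_BIT)
		goto do_confirm;	/* proposal already made, only confirm */

	status = propose_clientid();
	if (status != 0)
		return status;
	*state |= LEASE_CONFIRM_BIT;
do_confirm:
	status = confirm_clientid();
	if (status != 0)
		return status;	/* bit stays set: the next attempt resumes here */
	*state &= ~LEASE_CONFIRM_BIT;
	return 0;
}

int main(void)
{
	unsigned long state = 0;
	return establish_lease(&state);
}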
if (pos != NULL) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + cred = get_rpccred(sp->so_cred); + } + spin_unlock(&clp->cl_lock); + return cred; +} + +/** + * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +{ + struct nfs_server *server; + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred != NULL) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_setclientid_cred_server(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + +out: + return cred; +} + +static void nfs_alloc_unique_id_locked(struct rb_root *root, + struct nfs_unique_id *new, + __u64 minval, int maxbits) +{ + struct rb_node **p, *parent; + struct nfs_unique_id *pos; + __u64 mask = ~0ULL; + + if (maxbits < 64) + mask = (1ULL << maxbits) - 1ULL; + + /* Ensure distribution is more or less flat */ + get_random_bytes(&new->id, sizeof(new->id)); + new->id &= mask; + if (new->id < minval) + new->id += minval; +retry: + p = &root->rb_node; + parent = NULL; + + while (*p != NULL) { + parent = *p; + pos = rb_entry(parent, struct nfs_unique_id, rb_node); + + if (new->id < pos->id) + p = &(*p)->rb_left; + else if (new->id > pos->id) + p = &(*p)->rb_right; + else + goto id_exists; + } + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, root); + return; +id_exists: + for (;;) { + new->id++; + if (new->id < minval || (new->id & mask) != new->id) { + new->id = minval; + break; + } + parent = rb_next(parent); + if (parent == NULL) + break; + pos = rb_entry(parent, struct nfs_unique_id, rb_node); + if (new->id < pos->id) + break; + } + goto retry; +} + +static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) +{ + rb_erase(&id->rb_node, root); +} + +static struct nfs4_state_owner * +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) +{ + struct rb_node **p = &server->state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp, *res = NULL; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + + if (server < sp->so_server) { + p = &parent->rb_left; + continue; + } + if (server > sp->so_server) { + p = &parent->rb_right; + continue; + } + if (cred < sp->so_cred) + p = &parent->rb_left; + else if (cred > sp->so_cred) + p = &parent->rb_right; + else { + atomic_inc(&sp->so_count); + res = sp; + break; + } + } + return res; +} + +static struct nfs4_state_owner * +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) +{ + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + + if (new->so_cred < sp->so_cred) + p = &parent->rb_left; + else if (new->so_cred > sp->so_cred) + p = &parent->rb_right; + else { + atomic_inc(&sp->so_count); + return sp; + } + } + nfs_alloc_unique_id_locked(&server->openowner_id, + &new->so_owner_id, 1, 64); + rb_link_node(&new->so_server_node, parent, p); + rb_insert_color(&new->so_server_node, &server->state_owners); + return new; +} + +static void +nfs4_remove_state_owner_locked(struct nfs4_state_owner 
*sp) +{ + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); +} + +/* + * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to + * create a new state_owner. + * + */ +static struct nfs4_state_owner * +nfs4_alloc_state_owner(void) +{ + struct nfs4_state_owner *sp; + + sp = kzalloc(sizeof(*sp),GFP_NOFS); + if (!sp) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); + INIT_LIST_HEAD(&sp->so_sequence.list); + atomic_set(&sp->so_count, 1); + return sp; +} + +static void +nfs4_drop_state_owner(struct nfs4_state_owner *sp) +{ + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_server_node, &server->state_owners); + RB_CLEAR_NODE(&sp->so_server_node); + spin_unlock(&clp->cl_lock); + } +} + +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. + */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *new; + + spin_lock(&clp->cl_lock); + sp = nfs4_find_state_owner_locked(server, cred); + spin_unlock(&clp->cl_lock); + if (sp != NULL) + return sp; + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); + if (sp == new) + get_rpccred(cred); + else { + rpc_destroy_wait_queue(&new->so_sequence.wait); + kfree(new); + } + return sp; +} + +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + */ +void nfs4_put_state_owner(struct nfs4_state_owner *sp) +{ + struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) + return; + nfs4_remove_state_owner_locked(sp); + spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&sp->so_sequence.wait); + put_rpccred(cred); + kfree(sp); +} + +static struct nfs4_state * +nfs4_alloc_open_state(void) +{ + struct nfs4_state *state; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return NULL; + atomic_set(&state->count, 1); + INIT_LIST_HEAD(&state->lock_states); + spin_lock_init(&state->state_lock); + seqlock_init(&state->seqlock); + return state; +} + +void +nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode) +{ + if (state->state == fmode) + return; + /* NB! List reordering - see the reclaim code for why. 
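[Editor's note] nfs_alloc_unique_id_locked() above seeds each open/lock owner id with random bytes and then, on a collision, probes forward until it finds a free value, wrapping back to the minimum when it runs past the mask. A compact userspace sketch of the same probing idea, using a flat array instead of an rbtree (helper names are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t used[] = { 3, 4, 5, 9 };	/* ids already handed out */
static int nused = 4;

static int id_in_use(uint64_t id)
{
	for (int i = 0; i < nused; i++)
		if (used[i] == id)
			return 1;
	return 0;
}

/* Pick a random id >= minval within maskbits bits, probing forward on collisions. */
static uint64_t alloc_unique_id(uint64_t minval, int maskbits)
{
	uint64_t mask = maskbits < 64 ? (1ULL << maskbits) - 1 : ~0ULL;
	uint64_t id = (uint64_t)rand() & mask;

	if (id < minval)
		id += minval;
	while (id_in_use(id)) {
		id++;
		if (id < minval || (id & mask) != id)
			id = minval;	/* wrap around, like the rbtree version */
	}
	return id;
}

int main(void)
{
	srand(42);
	printf("allocated id %llu\n", (unsigned long long)alloc_unique_id(1, 4));
	return 0;
}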
*/ + if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { + if (fmode & FMODE_WRITE) + list_move(&state->open_states, &state->owner->so_states); + else + list_move_tail(&state->open_states, &state->owner->so_states); + } + state->state = fmode; +} + +static struct nfs4_state * +__nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_state *state; + + list_for_each_entry(state, &nfsi->open_states, inode_states) { + if (state->owner != owner) + continue; + if (atomic_inc_not_zero(&state->count)) + return state; + } + return NULL; +} + +static void +nfs4_free_open_state(struct nfs4_state *state) +{ + kfree(state); +} + +struct nfs4_state * +nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) +{ + struct nfs4_state *state, *new; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + state = __nfs4_find_state_byowner(inode, owner); + spin_unlock(&inode->i_lock); + if (state) + goto out; + new = nfs4_alloc_open_state(); + spin_lock(&owner->so_lock); + spin_lock(&inode->i_lock); + state = __nfs4_find_state_byowner(inode, owner); + if (state == NULL && new != NULL) { + state = new; + state->owner = owner; + atomic_inc(&owner->so_count); + list_add(&state->inode_states, &nfsi->open_states); + ihold(inode); + state->inode = inode; + spin_unlock(&inode->i_lock); + /* Note: The reclaim code dictates that we add stateless + * and read-only stateids to the end of the list */ + list_add_tail(&state->open_states, &owner->so_states); + spin_unlock(&owner->so_lock); + } else { + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); + if (new) + nfs4_free_open_state(new); + } +out: + return state; +} + +void nfs4_put_open_state(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs4_state_owner *owner = state->owner; + + if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + return; + spin_lock(&inode->i_lock); + list_del(&state->inode_states); + list_del(&state->open_states); + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); + iput(inode); + nfs4_free_open_state(state); + nfs4_put_state_owner(owner); +} + +/* + * Close the current file. 
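[Editor's note] nfs4_get_open_state() above uses the usual optimistic get-or-create dance: look up under the lock, allocate outside it (allocation may sleep), then re-check under the lock before either installing the new object or discarding it because another opener won the race. The skeleton of that pattern as a standalone sketch (the list and locking are simplified stand-ins, not the kernel data structures):

#include <pthread.h>
#include <stdlib.h>

struct obj { int key; int refcount; struct obj *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table;	/* toy list standing in for nfsi->open_states */

static struct obj *find_locked(int key)
{
	for (struct obj *o = table; o; o = o->next)
		if (o->key == key) {
			o->refcount++;
			return o;
		}
	return NULL;
}

static struct obj *get_or_create(int key)
{
	struct obj *o, *new;

	pthread_mutex_lock(&lock);
	o = find_locked(key);
	pthread_mutex_unlock(&lock);
	if (o)
		return o;

	new = calloc(1, sizeof(*new));	/* may sleep; done outside the lock */
	if (!new)
		return NULL;
	new->key = key;
	new->refcount = 1;

	pthread_mutex_lock(&lock);
	o = find_locked(key);		/* re-check: did we lose the race? */
	if (!o) {
		new->next = table;
		table = new;
		o = new;
		new = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(new);			/* free(NULL) is a no-op */
	return o;
}

int main(void) { return get_or_create(7) ? 0 : 1; }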
+ */ +static void __nfs4_close(struct path *path, struct nfs4_state *state, + fmode_t fmode, gfp_t gfp_mask, int wait) +{ + struct nfs4_state_owner *owner = state->owner; + int call_close = 0; + fmode_t newstate; + + atomic_inc(&owner->so_count); + /* Protect against nfs4_find_state() */ + spin_lock(&owner->so_lock); + switch (fmode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + state->n_rdonly--; + break; + case FMODE_WRITE: + state->n_wronly--; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr--; + } + newstate = FMODE_READ|FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) { + newstate &= ~FMODE_READ; + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (state->n_wronly == 0) { + newstate &= ~FMODE_WRITE; + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (newstate == 0) + clear_bit(NFS_DELEGATED_STATE, &state->flags); + } + nfs4_state_set_mode_locked(state, newstate); + spin_unlock(&owner->so_lock); + + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else { + bool roc = pnfs_roc(state->inode); + + nfs4_do_close(path, state, gfp_mask, wait, roc); + } +} + +void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +{ + __nfs4_close(path, state, fmode, GFP_NOFS, 0); +} + +void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) +{ + __nfs4_close(path, state, fmode, GFP_KERNEL, 1); +} + +/* + * Search the state->lock_states for an existing lock_owner + * that is compatible with current->files + */ +static struct nfs4_lock_state * +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) +{ + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; + switch (pos->ls_owner.lo_type) { + case NFS4_POSIX_LOCK_TYPE: + if (pos->ls_owner.lo_u.posix_owner != fl_owner) + continue; + break; + case NFS4_FLOCK_LOCK_TYPE: + if (pos->ls_owner.lo_u.flock_owner != fl_pid) + continue; + } + atomic_inc(&pos->ls_count); + return pos; + } + return NULL; +} + +/* + * Return a compatible lock_state. If no initialized lock_state structure + * exists, return an uninitialized one. 
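[Editor's note] __nfs4_close() above derives both the remaining open mode and whether a CLOSE or OPEN_DOWNGRADE must go on the wire purely from the n_rdonly/n_wronly/n_rdwr counters and the state flags. The mode calculation, pulled out into a small standalone sketch (field and flag names are simplified):

#include <stdio.h>

#define F_READ  0x1
#define F_WRITE 0x2

struct open_counts { int n_rdonly, n_wronly, n_rdwr; };

/* Return the mode still held after dropping one opener of mode 'fmode'. */
static int remaining_mode(struct open_counts *c, int fmode)
{
	int newstate = F_READ | F_WRITE;

	switch (fmode & (F_READ | F_WRITE)) {
	case F_READ:		c->n_rdonly--; break;
	case F_WRITE:		c->n_wronly--; break;
	case F_READ | F_WRITE:	c->n_rdwr--;   break;
	}
	/* Only once no read/write openers remain can either bit be dropped. */
	if (c->n_rdwr == 0) {
		if (c->n_rdonly == 0)
			newstate &= ~F_READ;
		if (c->n_wronly == 0)
			newstate &= ~F_WRITE;
	}
	return newstate;
}

int main(void)
{
	/* one read/write opener and one read-only opener; close the r/w one */
	struct open_counts c = { .n_rdonly = 1, .n_wronly = 0, .n_rdwr = 1 };
	int mode = remaining_mode(&c, F_READ | F_WRITE);

	printf("still open for: %s%s\n",
	       mode & F_READ ? "read " : "", mode & F_WRITE ? "write" : "");
	return 0;
}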
+ * + */ +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) +{ + struct nfs4_lock_state *lsp; + struct nfs_server *server = state->owner->so_server; + struct nfs_client *clp = server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) + return NULL; + rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); + spin_lock_init(&lsp->ls_sequence.lock); + INIT_LIST_HEAD(&lsp->ls_sequence.list); + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; + lsp->ls_owner.lo_type = type; + switch (lsp->ls_owner.lo_type) { + case NFS4_FLOCK_LOCK_TYPE: + lsp->ls_owner.lo_u.flock_owner = fl_pid; + break; + case NFS4_POSIX_LOCK_TYPE: + lsp->ls_owner.lo_u.posix_owner = fl_owner; + break; + default: + kfree(lsp); + return NULL; + } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); + INIT_LIST_HEAD(&lsp->ls_locks); + return lsp; +} + +static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); + spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&lsp->ls_sequence.wait); + kfree(lsp); +} + +/* + * Return a compatible lock_state. If no initialized lock_state structure + * exists, return an uninitialized one. + * + */ +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) +{ + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { + list_add(&new->ls_locks, &state->lock_states); + set_bit(LK_STATE_IN_USE, &state->flags); + lsp = new; + new = NULL; + break; + } + spin_unlock(&state->state_lock); + new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } + spin_unlock(&state->state_lock); + if (new != NULL) + nfs4_free_lock_state(new); + return lsp; +} + +/* + * Release reference to lock_state, and free it if we see that + * it is no longer in use + */ +void nfs4_put_lock_state(struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state; + + if (lsp == NULL) + return; + state = lsp->ls_state; + if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + return; + list_del(&lsp->ls_locks); + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) + nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); +} + +static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; + + dst->fl_u.nfs4_fl.owner = lsp; + atomic_inc(&lsp->ls_count); +} + +static void nfs4_fl_release_lock(struct file_lock *fl) +{ + nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); +} + +static const struct file_lock_operations nfs4_fl_lock_ops = { + .fl_copy_lock = nfs4_fl_copy_lock, + .fl_release_private = nfs4_fl_release_lock, +}; + +int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) +{ + struct nfs4_lock_state *lsp; + + if (fl->fl_ops != NULL) + return 0; + if (fl->fl_flags & FL_POSIX) + lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); + 
else if (fl->fl_flags & FL_FLOCK) + lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); + else + return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; + fl->fl_ops = &nfs4_fl_lock_ops; + return 0; +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) +{ + struct nfs4_lock_state *lsp; + int seq; + + do { + seq = read_seqbegin(&state->seqlock); + memcpy(dst, &state->stateid, sizeof(*dst)); + } while (read_seqretry(&state->seqlock, seq)); + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) + return; + + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); + nfs4_put_lock_state(lsp); +} + +struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) +{ + struct nfs_seqid *new; + + new = kmalloc(sizeof(*new), gfp_mask); + if (new != NULL) { + new->sequence = counter; + INIT_LIST_HEAD(&new->list); + } + return new; +} + +void nfs_release_seqid(struct nfs_seqid *seqid) +{ + if (!list_empty(&seqid->list)) { + struct rpc_sequence *sequence = seqid->sequence->sequence; + + spin_lock(&sequence->lock); + list_del_init(&seqid->list); + spin_unlock(&sequence->lock); + rpc_wake_up(&sequence->wait); + } +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + nfs_release_seqid(seqid); + kfree(seqid); +} + +/* + * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) +{ + BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); + switch (status) { + case 0: + break; + case -NFS4ERR_BAD_SEQID: + if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) + return; + printk(KERN_WARNING "NFS: v4 server returned a bad" + " sequence-id error on an" + " unconfirmed sequence %p!\n", + seqid->sequence); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_BADXDR: + case -NFS4ERR_RESOURCE: + case -NFS4ERR_NOFILEHANDLE: + /* Non-seqid mutating errors */ + return; + }; + /* + * Note: no locking needed as we are guaranteed to be first + * on the sequence list + */ + seqid->sequence->counter++; +} + +void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) +{ + struct nfs4_state_owner *sp = container_of(seqid->sequence, + struct nfs4_state_owner, so_seqid); + struct nfs_server *server = sp->so_server; + + if (status == -NFS4ERR_BAD_SEQID) + nfs4_drop_state_owner(sp); + if (!nfs4_has_session(server->nfs_client)) + nfs_increment_seqid(status, seqid); +} + +/* + * Increment the seqid if the LOCK/LOCKU succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) +{ + nfs_increment_seqid(status, seqid); +} + +int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) +{ + struct rpc_sequence *sequence = seqid->sequence->sequence; + int status = 0; + + spin_lock(&sequence->lock); + if (list_empty(&seqid->list)) + list_add_tail(&seqid->list, &sequence->list); + if (list_first_entry(&sequence->list, struct 
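[Editor's note] nfs_increment_seqid() above only bumps the owner's sequence id when the OPEN/CLOSE-class operation actually reached the server and mutated state there; errors such as NFS4ERR_STALE_CLIENTID or NFS4ERR_BADXDR mean the request was never executed, so the seqid must not advance. A tiny classifier sketch of that rule (the numeric values are the standard NFSv4 error codes; this is an illustration, not kernel code):

#include <stdio.h>

#define NFS4ERR_RESOURCE	10018
#define NFS4ERR_NOFILEHANDLE	10020
#define NFS4ERR_STALE_CLIENTID	10022
#define NFS4ERR_STALE_STATEID	10023
#define NFS4ERR_BAD_STATEID	10025
#define NFS4ERR_BADXDR		10036

/* Does this status leave the server-side seqid untouched? */
static int seqid_non_mutating(int status)
{
	switch (status) {
	case -NFS4ERR_STALE_CLIENTID:
	case -NFS4ERR_STALE_STATEID:
	case -NFS4ERR_BAD_STATEID:
	case -NFS4ERR_BADXDR:
	case -NFS4ERR_RESOURCE:
	case -NFS4ERR_NOFILEHANDLE:
		return 1;	/* request never executed: do not bump */
	default:
		return 0;	/* success and most other failures advance it */
	}
}

int main(void)
{
	printf("STALE_CLIENTID mutates seqid: %s\n",
	       seqid_non_mutating(-NFS4ERR_STALE_CLIENTID) ? "no" : "yes");
	printf("0 (success) mutates seqid: %s\n",
	       seqid_non_mutating(0) ? "no" : "yes");
	return 0;
}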
nfs_seqid, list) == seqid) + goto unlock; + rpc_sleep_on(&sequence->wait, task, NULL); + status = -EAGAIN; +unlock: + spin_unlock(&sequence->lock); + return status; +} + +static int nfs4_run_state_manager(void *); + +static void nfs4_clear_state_manager_bit(struct nfs_client *clp) +{ + smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + smp_mb__after_clear_bit(); + wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); + rpc_wake_up(&clp->cl_rpcwaitq); +} + +/* + * Schedule the nfs_client asynchronous state management routine + */ +void nfs4_schedule_state_manager(struct nfs_client *clp) +{ + struct task_struct *task; + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + if (!IS_ERR(task)) + return; + nfs4_clear_state_manager_bit(clp); + nfs_put_client(clp); + module_put(THIS_MODULE); +} + +/* + * Schedule a lease recovery attempt + */ +void nfs4_schedule_lease_recovery(struct nfs_client *clp) +{ + if (!clp) + return; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +{ + + set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + /* Don't recover state that expired before the reboot */ + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + return 0; + } + set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + return 1; +} + +static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +{ + set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); + return 1; +} + +void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + nfs4_state_mark_reclaim_nograce(clp, state); + nfs4_schedule_state_manager(clp); +} + +void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + bool found = false; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + found = true; + } + spin_unlock(&inode->i_lock); + if (found) + nfs4_schedule_state_manager(clp); +} + + +static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct file_lock *fl; + int status = 0; + + if (inode->i_flock == NULL) + return 0; + + /* Guard against delegation returns and new lock/unlock calls */ + down_write(&nfsi->rwsem); + /* Protect inode->i_flock using the BKL */ + lock_flocks(); + for (fl = inode->i_flock; 
fl != NULL; fl = fl->fl_next) { + if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) + continue; + if (nfs_file_open_context(fl->fl_file)->state != state) + continue; + unlock_flocks(); + status = ops->recover_lock(state, fl); + switch (status) { + case 0: + break; + case -ESTALE: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out; + default: + printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", + __func__, status); + case -ENOMEM: + case -NFS4ERR_DENIED: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + status = 0; + } + lock_flocks(); + } + unlock_flocks(); +out: + up_write(&nfsi->rwsem); + return status; +} + +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state *state; + struct nfs4_lock_state *lock; + int status = 0; + + /* Note: we rely on the sp->so_states list being ordered + * so that we always reclaim open(O_RDWR) and/or open(O_WRITE) + * states first. + * This is needed to ensure that the server won't give us any + * read delegations that we have to return if, say, we are + * recovering after a network partition or a reboot from a + * server that doesn't support a grace period. + */ +restart: + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) + continue; + if (state->state == 0) + continue; + atomic_inc(&state->count); + spin_unlock(&sp->so_lock); + status = ops->recover_open(sp, state); + if (status >= 0) { + status = nfs4_reclaim_locks(state, ops); + if (status >= 0) { + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) + printk("%s: Lock reclaim failed!\n", + __func__); + } + nfs4_put_open_state(state); + goto restart; + } + } + switch (status) { + default: + printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", + __func__, status); + case -ENOENT: + case -ENOMEM: + case -ESTALE: + /* + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ + memset(state->stateid.data, 0, + sizeof(state->stateid.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. 
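[Editor's note] nfs4_reclaim_open_state() above cannot hold sp->so_lock across the recovery RPCs, so it pins each state with a reference, drops the lock, does the blocking work, and then restarts the list walk from scratch because the list may have changed in the meantime. The shape of that pattern in isolation (a sketch only; the pin counter stands in for the kernel refcount helpers):

#include <pthread.h>
#include <stdio.h>

struct item { int needs_work; int pinned; struct item *next; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void blocking_work(struct item *it) { it->needs_work = 0; /* e.g. an RPC */ }

static void reclaim_all(struct item *head)
{
restart:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = head; it; it = it->next) {
		if (!it->needs_work)
			continue;
		it->pinned++;			/* pin so it cannot vanish */
		pthread_mutex_unlock(&list_lock);

		blocking_work(it);		/* may sleep: lock is dropped */

		pthread_mutex_lock(&list_lock);
		it->pinned--;
		pthread_mutex_unlock(&list_lock);
		goto restart;			/* list may have changed */
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct item b = { 1, 0, NULL }, a = { 1, 0, &b };

	reclaim_all(&a);
	printf("remaining work: %d %d\n", a.needs_work, b.needs_work);
	return 0;
}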
+ */ + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out_err; + } + nfs4_put_open_state(state); + goto restart; + } + spin_unlock(&sp->so_lock); + return 0; +out_err: + nfs4_put_open_state(state); + return status; +} + +static void nfs4_clear_open_state(struct nfs4_state *state) +{ + struct nfs4_lock_state *lock; + + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + lock->ls_seqid.flags = 0; + lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + } +} + +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp; + struct rb_node *pos; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + sp->so_seqid.flags = 0; + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (mark_reclaim(clp, state)) + nfs4_clear_open_state(state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); +} + +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) +{ + /* Mark all delegations for reclaim */ + nfs_delegation_mark_reclaim(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); +} + +static void nfs4_reclaim_complete(struct nfs_client *clp, + const struct nfs4_state_recovery_ops *ops) +{ + /* Notify the server we're done reclaiming our state */ + if (ops->reclaim_complete) + (void)ops->reclaim_complete(clp); +} + +static void nfs4_clear_reclaim_server(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp; + struct rb_node *pos; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + 
nfs4_clear_reclaim_server(server); + rcu_read_unlock(); + + nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); +} + +static void nfs_delegation_clear_all(struct nfs_client *clp) +{ + nfs_delegation_mark_reclaim(clp); + nfs_delegation_reap_unclaimed(clp); +} + +static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) +{ + nfs_delegation_clear_all(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); +} + +static void nfs4_warn_keyexpired(const char *s) +{ + printk_ratelimited(KERN_WARNING "Error: state manager" + " encountered RPCSEC_GSS session" + " expired against NFSv4 server %s.\n", + s); +} + +static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) +{ + switch (error) { + case -NFS4ERR_CB_PATH_DOWN: + nfs_handle_cb_pathdown(clp); + return 0; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + return 0; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_LEASE_MOVED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + return 0; + case -EKEYEXPIRED: + /* Nothing we can do */ + nfs4_warn_keyexpired(clp->cl_hostname); + return 0; + } + return error; +} + +static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state_owner *sp; + struct nfs_server *server; + struct rb_node *pos; + int status = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + return nfs4_recovery_handle_error(clp, status); + } + + nfs4_put_state_owner(sp); + goto restart; + } + spin_unlock(&clp->cl_lock); + } + rcu_read_unlock(); + return status; +} + +static int nfs4_check_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? 
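[Editor's note] nfs4_recovery_handle_error() above folds the many NFSv4 error codes into a handful of recovery actions by setting bits in clp->cl_state, which the state manager thread then acts on. A reduced sketch of that classification (the action names are invented; the error constants carry the standard NFSv4 values and mirror a few of the cases above):

#include <stdio.h>

#define NFS4ERR_EXPIRED		10011
#define NFS4ERR_STALE_CLIENTID	10022
#define NFS4ERR_BADSESSION	10052

enum recovery_action { RECOVER_NOTHING, RECOVER_REBOOT, RECOVER_NOGRACE, RESET_SESSION };

static enum recovery_action classify(int error)
{
	switch (error) {
	case -NFS4ERR_STALE_CLIENTID:
		return RECOVER_REBOOT;	/* server forgot us: reclaim under grace */
	case -NFS4ERR_EXPIRED:
		return RECOVER_NOGRACE;	/* lease ran out: re-open without grace */
	case -NFS4ERR_BADSESSION:
		return RESET_SESSION;	/* v4.1 session is gone: create a new one */
	default:
		return RECOVER_NOTHING;
	}
}

int main(void)
{
	printf("EXPIRED -> action %d\n", classify(-NFS4ERR_EXPIRED));
	return 0;
}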
*/ + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + return 0; + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + cred = nfs4_get_setclientid_cred(clp); + if (cred == NULL) + goto out; + } + status = ops->renew_lease(clp, cred); + put_rpccred(cred); +out: + return nfs4_recovery_handle_error(clp, status); +} + +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); + if (cred != NULL) { + status = ops->establish_clid(clp, cred); + put_rpccred(cred); + /* Handle case where the user hasn't set up machine creds */ + if (status == -EACCES && cred == clp->cl_machine_cred) { + nfs4_clear_machine_cred(clp); + status = -EAGAIN; + } + if (status == -NFS4ERR_MINOR_VERS_MISMATCH) + status = -EPROTONOSUPPORT; + } + return status; +} + +#ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ + struct nfs_client *clp = session->clp; + + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_lease_recovery(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); + +void nfs41_handle_recall_slot(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +static void nfs4_reset_all_state(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + clp->cl_boot_time = CURRENT_TIME; + nfs4_state_start_reclaim_nograce(clp); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_server_reboot(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + nfs4_state_start_reclaim_reboot(clp); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_state_revoked(struct nfs_client *clp) +{ + /* Temporary */ + nfs4_reset_all_state(clp); +} + +static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) +{ + /* This will need to handle layouts too */ + nfs_expire_all_delegations(clp); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + nfs_expire_all_delegations(clp); + if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); +} + +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +{ + if (!flags) + return; + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + nfs41_handle_server_reboot(clp); + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + SEQ4_STATUS_ADMIN_STATE_REVOKED | + SEQ4_STATUS_LEASE_MOVED)) + nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + nfs41_handle_recallable_state_revoked(clp); + if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_BACKCHANNEL_FAULT | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + nfs41_handle_cb_path_down(clp); +} + +static int nfs4_reset_session(struct nfs_client *clp) +{ + int status; + + nfs4_begin_drain_session(clp); + status = nfs4_proc_destroy_session(clp->cl_session); + if (status && status != -NFS4ERR_BADSESSION && + status != -NFS4ERR_DEADSESSION) { + status = nfs4_recovery_handle_error(clp, status); + goto out; + } + + memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); + status = nfs4_proc_create_session(clp); + if (status) { + status = nfs4_recovery_handle_error(clp, status); + goto out; + } + 
clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + + /* Let the state manager reestablish state */ + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + nfs41_setup_state_renewal(clp); +out: + return status; +} + +static int nfs4_recall_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table; + struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs; + struct nfs4_slot *new, *old; + int i; + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), + GFP_NOFS); + if (!new) + return -ENOMEM; + + spin_lock(&fc_tbl->slot_tbl_lock); + for (i = 0; i < fc_tbl->target_max_slots; i++) + new[i].seq_nr = fc_tbl->slots[i].seq_nr; + old = fc_tbl->slots; + fc_tbl->slots = new; + fc_tbl->max_slots = fc_tbl->target_max_slots; + fc_tbl->target_max_slots = 0; + fc_attrs->max_reqs = fc_tbl->max_slots; + spin_unlock(&fc_tbl->slot_tbl_lock); + + kfree(old); + nfs4_end_drain_session(clp); + return 0; +} + +#else /* CONFIG_NFS_V4_1 */ +static int nfs4_reset_session(struct nfs_client *clp) { return 0; } +static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } +static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } +#endif /* CONFIG_NFS_V4_1 */ + +/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors + * on EXCHANGE_ID for v4.1 + */ +static void nfs4_set_lease_expired(struct nfs_client *clp, int status) +{ + switch (status) { + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_STALE_CLIENTID: + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + break; + + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + default: + return; + } + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +} + +static void nfs4_state_manager(struct nfs_client *clp) +{ + int status = 0; + + /* Ensure exclusive access to NFSv4 state */ + do { + if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + /* We're going to have to re-establish a clientid */ + status = nfs4_reclaim_lease(clp); + if (status) { + nfs4_set_lease_expired(clp, status); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, + &clp->cl_state)) + continue; + if (clp->cl_cons_state == + NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, status); + goto out_error; + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + status = nfs4_check_lease(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) + goto out_error; + } + + /* Initialize or reset the session */ + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) + && nfs4_has_session(clp)) { + status = nfs4_reset_session(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* First recover reboot state... 
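[Editor's note] nfs4_recall_slot() above shrinks the forechannel slot table to the server-requested target by allocating a new array, carrying each slot's sequence number over, and swapping the arrays under the table lock. The same copy-and-swap in a standalone sketch (plain malloc/free instead of the kernel allocators; like the recall case it assumes the target is no larger than the current table):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot { unsigned int seq_nr; };

struct slot_table {
	struct slot *slots;
	unsigned int max_slots;
	unsigned int target_max_slots;
};

static int resize_slot_table(struct slot_table *tbl)
{
	struct slot *new, *old;

	new = calloc(tbl->target_max_slots, sizeof(*new));
	if (!new)
		return -1;

	/* carry each surviving slot's sequence number over to the new array */
	for (unsigned int i = 0; i < tbl->target_max_slots; i++)
		new[i].seq_nr = tbl->slots[i].seq_nr;

	old = tbl->slots;
	tbl->slots = new;
	tbl->max_slots = tbl->target_max_slots;
	tbl->target_max_slots = 0;
	free(old);
	return 0;
}

int main(void)
{
	struct slot init[4] = { {5}, {8}, {2}, {7} };
	struct slot_table tbl = { malloc(sizeof(init)), 4, 2 };

	memcpy(tbl.slots, init, sizeof(init));
	if (resize_slot_table(&tbl))
		return 1;
	printf("now %u slots, seq_nr[1] = %u\n", tbl.max_slots, tbl.slots[1].seq_nr);
	free(tbl.slots);
	return 0;
}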
*/ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, + clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; + nfs4_state_end_reclaim_reboot(clp); + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* Now recover expired state... */ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, + clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + nfs4_end_drain_session(clp); + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + continue; + } + /* Recall session slots */ + if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) + && nfs4_has_session(clp)) { + status = nfs4_recall_slot(clp); + if (status < 0) + goto out_error; + continue; + } + + + nfs4_clear_state_manager_bit(clp); + /* Did we race with an attempt to give us more work? */ + if (clp->cl_state == 0) + break; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + break; + } while (atomic_read(&clp->cl_count) > 1); + return; +out_error: + printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" + " with error %d\n", clp->cl_hostname, -status); + nfs4_end_drain_session(clp); + nfs4_clear_state_manager_bit(clp); +} + +static int nfs4_run_state_manager(void *ptr) +{ + struct nfs_client *clp = ptr; + + allow_signal(SIGKILL); + nfs4_state_manager(clp); + nfs_put_client(clp); + module_put_and_exit(0); + return 0; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c new file mode 100644 index 00000000..fc97fd53 --- /dev/null +++ b/fs/nfs/nfs4xdr.c @@ -0,0 +1,6679 @@ +/* + * fs/nfs/nfs4xdr.c + * + * Client-side XDR for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
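[Editor's note] The nfs4_state_manager() loop above works through the cl_state bits in a strict order — re-establish the lease, check it, reset the session, reclaim reboot state, then no-grace state — and jumps back to the top whenever an earlier condition reappears. A compressed sketch of that priority loop (bit names abbreviated, handlers replaced by prints):

#include <stdio.h>

enum { LEASE_EXPIRED, CHECK_LEASE, SESSION_RESET, RECLAIM_REBOOT, RECLAIM_NOGRACE, NSTEPS };

static const char *step_name[NSTEPS] = {
	"reclaim lease", "check lease", "reset session",
	"reboot reclaim", "nograce reclaim"
};

static int test_and_clear(unsigned long *state, int bit)
{
	int was_set = (*state >> bit) & 1;

	*state &= ~(1UL << bit);
	return was_set;
}

static void state_manager(unsigned long state)
{
	/*
	 * The kernel loop re-evaluates from the top whenever a handler sets
	 * an earlier bit again; this sketch only shows the priority order
	 * of a single pass.
	 */
	for (int bit = 0; bit < NSTEPS; bit++)
		if (test_and_clear(&state, bit))
			printf("-> %s\n", step_name[bit]);
}

int main(void)
{
	state_manager((1UL << RECLAIM_NOGRACE) | (1UL << LEASE_EXPIRED));
	return 0;
}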
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +static int nfs4_stat_to_errno(int); + +/* NFSv4 COMPOUND tags are only wanted for debugging purposes */ +#ifdef DEBUG +#define NFS4_MAXTAGLEN 20 +#else +#define NFS4_MAXTAGLEN 0 +#endif + +/* lock,open owner id: + * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) + */ +#define open_owner_id_maxsz (1 + 1 + 4) +#define lock_owner_id_maxsz (1 + 1 + 4) +#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define op_encode_hdr_maxsz (1) +#define op_decode_hdr_maxsz (2) +#define encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define decode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define encode_putfh_maxsz (op_encode_hdr_maxsz + 1 + \ + (NFS4_FHSIZE >> 2)) +#define decode_putfh_maxsz (op_decode_hdr_maxsz) +#define encode_putrootfh_maxsz (op_encode_hdr_maxsz) +#define decode_putrootfh_maxsz (op_decode_hdr_maxsz) +#define encode_getfh_maxsz (op_encode_hdr_maxsz) +#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +#define nfs4_fattr_bitmap_maxsz 4 +#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) +#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +/* This is based on getfattr, which uses the most attributes: */ +#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ + 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz)) +#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ + nfs4_fattr_value_maxsz) +#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) +#define encode_attrs_maxsz (nfs4_fattr_bitmap_maxsz + \ + 1 + 2 + 1 + \ + nfs4_owner_maxsz + \ + nfs4_group_maxsz + \ + 4 + 4) +#define encode_savefh_maxsz (op_encode_hdr_maxsz) +#define decode_savefh_maxsz (op_decode_hdr_maxsz) +#define encode_restorefh_maxsz (op_encode_hdr_maxsz) +#define decode_restorefh_maxsz (op_decode_hdr_maxsz) +#define encode_fsinfo_maxsz (encode_getattr_maxsz) +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) +#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) +#define decode_renew_maxsz (op_decode_hdr_maxsz) +#define encode_setclientid_maxsz \ + (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ + XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + 1 /* sc_prog */ + \ + 
XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ + 1) /* sc_cb_ident */ +#define decode_setclientid_maxsz \ + (op_decode_hdr_maxsz + \ + 2 + \ + 1024) /* large value for CLID_INUSE */ +#define encode_setclientid_confirm_maxsz \ + (op_encode_hdr_maxsz + \ + 3 + (NFS4_VERIFIER_SIZE >> 2)) +#define decode_setclientid_confirm_maxsz \ + (op_decode_hdr_maxsz) +#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_lookup_maxsz (op_decode_hdr_maxsz) +#define encode_share_access_maxsz \ + (2) +#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz) +#define encode_opentype_maxsz (1 + encode_createmode_maxsz) +#define encode_claim_null_maxsz (1 + nfs4_name_maxsz) +#define encode_open_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_share_access_maxsz + 2 + \ + open_owner_id_maxsz + \ + encode_opentype_maxsz + \ + encode_claim_null_maxsz) +#define decode_ace_maxsz (3 + nfs4_owner_maxsz) +#define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \ + decode_ace_maxsz) +#define decode_change_info_maxsz (5) +#define decode_open_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz + \ + decode_change_info_maxsz + 1 + \ + nfs4_fattr_bitmap_maxsz + \ + decode_delegation_maxsz) +#define encode_open_confirm_maxsz \ + (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 1) +#define decode_open_confirm_maxsz \ + (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_open_downgrade_maxsz \ + (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 1 + \ + encode_share_access_maxsz) +#define decode_open_downgrade_maxsz \ + (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_close_maxsz (op_encode_hdr_maxsz + \ + 1 + encode_stateid_maxsz) +#define decode_close_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_setattr_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + encode_attrs_maxsz) +#define decode_setattr_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz) +#define encode_read_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define encode_readdir_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_verifier_maxsz + 5) +#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ + decode_verifier_maxsz) +#define encode_readlink_maxsz (op_encode_hdr_maxsz) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define encode_write_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 4) +#define decode_write_maxsz (op_decode_hdr_maxsz + \ + 2 + decode_verifier_maxsz) +#define encode_commit_maxsz (op_encode_hdr_maxsz + 3) +#define decode_commit_maxsz (op_decode_hdr_maxsz + \ + decode_verifier_maxsz) +#define encode_remove_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) +#define decode_remove_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) +#define encode_rename_maxsz (op_encode_hdr_maxsz + \ + 2 * nfs4_name_maxsz) +#define decode_rename_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + decode_change_info_maxsz) +#define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) +#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) +#define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) +#define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) +#define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +#define encode_lockt_maxsz (op_encode_hdr_maxsz 
+ 5 + \ + encode_lockowner_maxsz) +#define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ + encode_stateid_maxsz + \ + 4) +#define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) +#define encode_access_maxsz (op_encode_hdr_maxsz + 1) +#define decode_access_maxsz (op_decode_hdr_maxsz + 2) +#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_name_maxsz + \ + 1 + \ + nfs4_fattr_maxsz) +#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) +#define encode_create_maxsz (op_encode_hdr_maxsz + \ + 1 + 2 + nfs4_name_maxsz + \ + encode_attrs_maxsz) +#define decode_create_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + nfs4_fattr_bitmap_maxsz) +#define encode_statfs_maxsz (encode_getattr_maxsz) +#define decode_statfs_maxsz (decode_getattr_maxsz) +#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) +#define decode_delegreturn_maxsz (op_decode_hdr_maxsz) +#define encode_getacl_maxsz (encode_getattr_maxsz) +#define decode_getacl_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 1) +#define encode_setacl_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define decode_setacl_maxsz (decode_setattr_maxsz) +#define encode_fs_locations_maxsz \ + (encode_getattr_maxsz) +#define decode_fs_locations_maxsz \ + (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) + +#if defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MACHINE_NAME_LEN (64) + +#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ + encode_verifier_maxsz + \ + 1 /* co_ownerid.len */ + \ + XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + 1 /* flags */ + \ + 1 /* spa_how */ + \ + 0 /* SP4_NONE (for now) */ + \ + 1 /* zero implemetation id array */) +#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ + 2 /* eir_clientid */ + \ + 1 /* eir_sequenceid */ + \ + 1 /* eir_flags */ + \ + 1 /* spr_how */ + \ + 0 /* SP4_NONE (for now) */ + \ + 2 /* eir_server_owner.so_minor_id */ + \ + /* eir_server_owner.so_major_id<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + /* eir_server_scope<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + 1 /* eir_server_impl_id array length */ + \ + 0 /* ignored eir_server_impl_id contents */) +#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) +#define decode_channel_attrs_maxsz (6 + \ + 1 /* ca_rdma_ird.len */ + \ + 1 /* ca_rdma_ird */) +#define encode_create_session_maxsz (op_encode_hdr_maxsz + \ + 2 /* csa_clientid */ + \ + 1 /* csa_sequence */ + \ + 1 /* csa_flags */ + \ + encode_channel_attrs_maxsz + \ + encode_channel_attrs_maxsz + \ + 1 /* csa_cb_program */ + \ + 1 /* csa_sec_parms.len (1) */ + \ + 1 /* cb_secflavor (AUTH_SYS) */ + \ + 1 /* stamp */ + \ + 1 /* machinename.len */ + \ + XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \ + 1 /* uid */ + \ + 1 /* gid */ + \ + 1 /* gids.len (0) */) +#define decode_create_session_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* csr_sequence */ + \ + 1 /* csr_flags */ + \ + decode_channel_attrs_maxsz + \ + decode_channel_attrs_maxsz) +#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4) +#define decode_destroy_session_maxsz (op_decode_hdr_maxsz) +#define encode_sequence_maxsz 
(op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) +#define decode_sequence_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) +#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) +#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 1 /* FIXME: opaque lrf_body always empty at the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ + 1 + decode_stateid_maxsz) +#else /* CONFIG_NFS_V4_1 */ +#define encode_sequence_maxsz 0 +#define decode_sequence_maxsz 0 +#endif /* CONFIG_NFS_V4_1 */ + +#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_read_maxsz) +#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_read_maxsz) +#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_readlink_maxsz) +#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_readlink_maxsz) +#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_readdir_maxsz) +#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_readdir_maxsz) +#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_write_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_write_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_commit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_commit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_open_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + 
decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_open_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_confirm_sz \ + (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_open_confirm_maxsz) +#define NFS4_dec_open_confirm_sz \ + (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_open_confirm_maxsz) +#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_downgrade_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_downgrade_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_downgrade_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_downgrade_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_close_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_close_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setattr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setattr_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_renew_sz (compound_decode_hdr_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_setclientid_sz (compound_encode_hdr_maxsz + \ + encode_setclientid_maxsz) +#define NFS4_dec_setclientid_sz (compound_decode_hdr_maxsz + \ + decode_setclientid_maxsz) +#define NFS4_enc_setclientid_confirm_sz \ + (compound_encode_hdr_maxsz + \ + encode_setclientid_confirm_maxsz + \ + encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_setclientid_confirm_sz \ + (compound_decode_hdr_maxsz + \ + decode_setclientid_confirm_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lock_maxsz) +#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lock_maxsz) +#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lockt_maxsz) +#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lockt_maxsz) +#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_locku_maxsz) +#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + 
(compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) +#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_access_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_access_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lookup_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lookup_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_remove_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_remove_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_rename_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_rename_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_link_maxsz + \ + decode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_link_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_symlink_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_symlink_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_create_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_create_sz 
(compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_create_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_statfs_maxsz) +#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_statfs_maxsz) +#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_delegreturn_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_delegreturn_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getacl_maxsz) +#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getacl_maxsz) +#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setacl_maxsz) +#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setacl_maxsz) +#define NFS4_enc_fs_locations_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lookup_maxsz + \ + encode_fs_locations_maxsz) +#define NFS4_dec_fs_locations_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lookup_maxsz + \ + decode_fs_locations_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) +#if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_exchange_id_sz \ + (compound_encode_hdr_maxsz + \ + encode_exchange_id_maxsz) +#define NFS4_dec_exchange_id_sz \ + (compound_decode_hdr_maxsz + \ + decode_exchange_id_maxsz) +#define NFS4_enc_create_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_create_session_maxsz) +#define NFS4_dec_create_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_create_session_maxsz) +#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \ + encode_destroy_session_maxsz) +#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \ + decode_destroy_session_maxsz) +#define NFS4_enc_sequence_sz \ + (compound_decode_hdr_maxsz + \ + encode_sequence_maxsz) +#define NFS4_dec_sequence_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz) +#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_get_lease_time_sz 
(compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_reclaim_complete_maxsz) +#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz) + +const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + + encode_getattr_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz) * + XDR_UNIT); +#endif /* CONFIG_NFS_V4_1 */ + +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + [NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, +}; + +struct compound_hdr { + int32_t status; + uint32_t nops; + __be32 * nops_p; + uint32_t taglen; + char * tag; + uint32_t replen; /* expected reply words */ + u32 minorversion; +}; + +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} + +static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + BUG_ON(p == NULL); + xdr_encode_opaque(p, str, len); +} + +static void encode_compound_hdr(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct compound_hdr *hdr) +{ + __be32 *p; + struct rpc_auth *auth = req->rq_cred->cr_auth; + + /* initialize running count of expected bytes in reply. + * NOTE: the replied tag SHOULD be the same is the one sent, + * but this is not required as a MUST for the server to do so. 
*/ + hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + + dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); + BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + *p++ = cpu_to_be32(hdr->minorversion); + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); +} + +static void encode_nops(struct compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_OPS); + *hdr->nops_p = htonl(hdr->nops); +} + +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + BUG_ON(p == NULL); + xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); +} + +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +{ + char owner_name[IDMAP_NAMESZ]; + char owner_group[IDMAP_NAMESZ]; + int owner_namelen = 0; + int owner_grouplen = 0; + __be32 *p; + __be32 *q; + int len; + uint32_t bmval0 = 0; + uint32_t bmval1 = 0; + + /* + * We reserve enough space to write the entire attribute buffer at once. + * In the worst-case, this would be + * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 36 bytes, plus any contribution from variable-length fields + * such as owner/group. + */ + len = 16; + + /* Sigh */ + if (iap->ia_valid & ATTR_SIZE) + len += 8; + if (iap->ia_valid & ATTR_MODE) + len += 4; + if (iap->ia_valid & ATTR_UID) { + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); + if (owner_namelen < 0) { + dprintk("nfs: couldn't resolve uid %d to string\n", + iap->ia_uid); + /* XXX */ + strcpy(owner_name, "nobody"); + owner_namelen = sizeof("nobody") - 1; + /* goto out; */ + } + len += 4 + (XDR_QUADLEN(owner_namelen) << 2); + } + if (iap->ia_valid & ATTR_GID) { + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); + if (owner_grouplen < 0) { + dprintk("nfs: couldn't resolve gid %d to string\n", + iap->ia_gid); + strcpy(owner_group, "nobody"); + owner_grouplen = sizeof("nobody") - 1; + /* goto out; */ + } + len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); + } + if (iap->ia_valid & ATTR_ATIME_SET) + len += 16; + else if (iap->ia_valid & ATTR_ATIME) + len += 4; + if (iap->ia_valid & ATTR_MTIME_SET) + len += 16; + else if (iap->ia_valid & ATTR_MTIME) + len += 4; + p = reserve_space(xdr, len); + + /* + * We write the bitmap length now, but leave the bitmap and the attribute + * buffer length to be backfilled at the end of this routine. 
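+ * (q below marks the three reserved words, i.e. the two bitmap words and the attribute length, which are backfilled once the values are known.)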
+ */ + *p++ = cpu_to_be32(2); + q = p; + p += 3; + + if (iap->ia_valid & ATTR_SIZE) { + bmval0 |= FATTR4_WORD0_SIZE; + p = xdr_encode_hyper(p, iap->ia_size); + } + if (iap->ia_valid & ATTR_MODE) { + bmval1 |= FATTR4_WORD1_MODE; + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + } + if (iap->ia_valid & ATTR_UID) { + bmval1 |= FATTR4_WORD1_OWNER; + p = xdr_encode_opaque(p, owner_name, owner_namelen); + } + if (iap->ia_valid & ATTR_GID) { + bmval1 |= FATTR4_WORD1_OWNER_GROUP; + p = xdr_encode_opaque(p, owner_group, owner_grouplen); + } + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_atime.tv_sec); + *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + } + else if (iap->ia_valid & ATTR_ATIME) { + bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + } + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + } + else if (iap->ia_valid & ATTR_MTIME) { + bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + } + + /* + * Now we backfill the bitmap and the attribute buffer length. + */ + if (len != ((char *)p - (char *)q) + 4) { + printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", + len, ((char *)p - (char *)q) + 4); + BUG(); + } + len = (char *)p - (char *)q - 12; + *q++ = htonl(bmval0); + *q++ = htonl(bmval1); + *q = htonl(len); + +/* out: */ +} + +static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_ACCESS); + *p = cpu_to_be32(access); + hdr->nops++; + hdr->replen += decode_access_maxsz; +} + +static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; +} + +static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_COMMIT); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); + hdr->nops++; + hdr->replen += decode_commit_maxsz; +} + +static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_CREATE); + *p = cpu_to_be32(create->ftype); + + switch (create->ftype) { + case NF4LNK: + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); + break; + + case NF4BLK: case NF4CHR: + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); + break; + + default: + break; + } + + encode_string(xdr, create->name->len, create->name->name); + hdr->nops++; + hdr->replen += decode_create_maxsz; + + encode_attrs(xdr, create->attrs, create->server); +} + +static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 
12); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + +static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + +static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], hdr); +} + +static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); +} + +static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], + bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); +} + +static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETFH); + hdr->nops++; + hdr->replen += decode_getfh_maxsz; +} + +static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_LINK); + xdr_encode_opaque(p, name->name, name->len); + hdr->nops++; + hdr->replen += decode_link_maxsz; +} + +static inline int nfs4_lock_type(struct file_lock *fl, int block) +{ + if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) + return block ? NFS4_READW_LT : NFS4_READ_LT; + return block ? 
NFS4_WRITEW_LT : NFS4_WRITE_LT; +} + +static inline uint64_t nfs4_lock_length(struct file_lock *fl) +{ + if (fl->fl_end == OFFSET_MAX) + return ~(uint64_t)0; + return fl->fl_end - fl->fl_start + 1; +} + +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 32); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(20); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); + xdr_encode_hyper(p, lowner->id); +} + +/* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 + */ +static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 32); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; + hdr->replen += decode_lock_maxsz; +} + +static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + hdr->replen += decode_lockt_maxsz; +} + +static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; +} + +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_lockowner(xdr, lowner); + hdr->nops++; + hdr->replen += decode_release_lockowner_maxsz; +} + +static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_LOOKUP); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_lookup_maxsz; +} + +static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); + break; + case 
FMODE_WRITE: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); + break; + case FMODE_READ|FMODE_WRITE: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); + break; + default: + *p++ = cpu_to_be32(0); + } + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ +} + +static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + /* + * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, + * owner 4 = 32 + */ + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_OPEN); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + p = reserve_space(xdr, 32); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(20); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); + xdr_encode_hyper(p, arg->id); +} + +static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + struct nfs_client *clp; + + p = reserve_space(xdr, 4); + switch(arg->open_flags & O_EXCL) { + case 0: + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); + encode_attrs(xdr, arg->u.attrs, arg->server); + break; + default: + clp = arg->server->nfs_client; + if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); + } else { + struct iattr dummy; + + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->server); + } + } else { + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + } + } +} + +static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + switch (arg->open_flags & O_CREAT) { + case 0: + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); + break; + default: + BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); + *p = cpu_to_be32(NFS4_OPEN_CREATE); + encode_createmode(xdr, arg); + } +} + +static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + switch (delegation_type) { + case 0: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); + break; + case FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); + break; + case FMODE_WRITE|FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + break; + default: + BUG(); + } +} + +static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); + encode_string(xdr, name->len, name->name); +} + +static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); + encode_delegation_type(xdr, type); +} + +static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); +} + +static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) +{ + encode_openhdr(xdr, arg); + encode_opentype(xdr, arg); + switch (arg->claim) { + case NFS4_OPEN_CLAIM_NULL: + encode_claim_null(xdr, arg->name); + break; + case 
NFS4_OPEN_CLAIM_PREVIOUS: + encode_claim_previous(xdr, arg->u.delegation_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); + break; + default: + BUG(); + } + hdr->nops++; + hdr->replen += decode_open_maxsz; +} + +static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +} + +static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; + hdr->replen += decode_open_downgrade_maxsz; +} + +static void +encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) +{ + int len = fh->size; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_PUTFH); + xdr_encode_opaque(p, fh->data, len); + hdr->nops++; + hdr->replen += decode_putfh_maxsz; +} + +static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_PUTROOTFH); + hdr->nops++; + hdr->replen += decode_putrootfh_maxsz; +} + +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { + nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + if (zero_seqid) + stateid.stateid.seqid = 0; + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); + } else + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); +} + +static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); + hdr->nops++; + hdr->replen += decode_read_maxsz; +} + +static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + uint32_t attrs[2] = { + FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD1_MOUNTED_ON_FILEID, + }; + uint32_t dircount = readdir->count >> 1; + __be32 *p; + + if (readdir->plus) { + attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| + FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| + FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| + FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + dircount >>= 1; + } + /* Use mounted_on_fileid only if the server supports it */ + if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) + attrs[0] |= FATTR4_WORD0_FILEID; + + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); + *p++ = 
cpu_to_be32(OP_READDIR); + p = xdr_encode_hyper(p, readdir->cookie); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(dircount); + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); + + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + hdr->nops++; + hdr->replen += decode_readdir_maxsz; + dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", + __func__, + (unsigned long long)readdir->cookie, + ((u32 *)readdir->verifier.data)[0], + ((u32 *)readdir->verifier.data)[1], + attrs[0] & readdir->bitmask[0], + attrs[1] & readdir->bitmask[1]); +} + +static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READLINK); + hdr->nops++; + hdr->replen += decode_readlink_maxsz; +} + +static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_REMOVE); + xdr_encode_opaque(p, name->name, name->len); + hdr->nops++; + hdr->replen += decode_remove_maxsz; +} + +static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); + hdr->nops++; + hdr->replen += decode_rename_maxsz; +} + +static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_RENEW); + xdr_encode_hyper(p, client_stateid->cl_clientid); + hdr->nops++; + hdr->replen += decode_renew_maxsz; +} + +static void +encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RESTOREFH); + hdr->nops++; + hdr->replen += decode_restorefh_maxsz; +} + +static void +encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); + BUG_ON(arg->acl_len % 4); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); + xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); + hdr->nops++; + hdr->replen += decode_setacl_maxsz; +} + +static void +encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_SAVEFH); + hdr->nops++; + hdr->replen += decode_savefh_maxsz; +} + +static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +} + +static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4 + 
NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + + encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); + encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); + encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); + hdr->nops++; + hdr->replen += decode_setclientid_maxsz; +} + +static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); + p = xdr_encode_hyper(p, arg->clientid); + xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); + hdr->nops++; + hdr->replen += decode_setclientid_confirm_maxsz; +} + +static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); + hdr->nops++; + hdr->replen += decode_write_maxsz; +} + +static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; +} + +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_SECINFO); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_secinfo_maxsz; +} + +#if defined(CONFIG_NFS_V4_1) +/* NFSv4.1 operations */ +static void encode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); + + encode_string(xdr, args->id_len, args->id); + + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p = cpu_to_be32(0); /* zero length implementation id array */ + hdr->nops++; + hdr->replen += decode_exchange_id_maxsz; +} + +static void encode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; + uint32_t len; + struct nfs_client *clp = args->client; + u32 max_resp_sz_cached; + + /* + * Assumes OPEN is the biggest non-idempotent compound. + * 2 is the verifier. 
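+ * The maxsz values are counted in XDR words, so the sum is multiplied by XDR_UNIT to get the cached reply size in bytes.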
+ */ + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); + + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); + p = xdr_encode_hyper(p, clp->cl_clientid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ + + /* Fore Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + /* Back Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ + + /* authsys_parms rfc1831 */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ + hdr->nops++; + hdr->replen += decode_create_session_maxsz; +} + +static void encode_destroy_session(struct xdr_stream *xdr, + struct nfs4_session *session, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + hdr->nops++; + hdr->replen += decode_destroy_session_maxsz; +} + +static void encode_reclaim_complete(struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); + *p++ = cpu_to_be32(args->one_fs); + hdr->nops++; + hdr->replen += decode_reclaim_complete_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + +static void encode_sequence(struct xdr_stream *xdr, + const struct nfs4_sequence_args *args, + struct compound_hdr *hdr) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session = args->sa_session; + struct nfs4_slot_table *tp; + struct nfs4_slot *slot; + __be32 *p; + + if (!session) + return; + + tp = &session->fc_slot_table; + + WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); + slot = tp->slots + args->sa_slotid; + + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); + *p++ = cpu_to_be32(OP_SEQUENCE); + + /* + * Sessionid + seqid + slotid + max slotid + cache_this + */ + dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d " + "max_slotid=%d cache_this=%d\n", + __func__, + ((u32 *)session->sess_id.data)[0], + ((u32 *)session->sess_id.data)[1], + ((u32 *)session->sess_id.data)[2], + ((u32 *)session->sess_id.data)[3], + slot->seq_nr, args->sa_slotid, + tp->highest_used_slotid, args->sa_cache_this); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, 
NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); + hdr->nops++; + hdr->replen += decode_sequence_maxsz; +#endif /* CONFIG_NFS_V4_1 */ +} + +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(OP_GETDEVICEINFO); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(0); /* bitmap length 0 */ + hdr->nops++; + hdr->replen += decode_getdeviceinfo_maxsz; +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTGET); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; +} + +static int +encode_layoutcommit(struct xdr_stream *xdr, + struct inode *inode, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ + *p++ = cpu_to_be32(0); /* reclaim */ + p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( + NFS_I(inode)->layout, xdr, args); + else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); /* no layout-type payload */ + } + + hdr->nops++; + hdr->replen += decode_layoutcommit_maxsz; + return 0; +} + +static void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_LAYOUTRETURN); + *p++ = cpu_to_be32(0); /* reclaim. 
always 0 for now */ + *p++ = cpu_to_be32(args->layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p = cpu_to_be32(RETURN_FILE); + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, 0); + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); + spin_lock(&args->inode->i_lock); + xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); + spin_unlock(&args->inode->i_lock); + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( + NFS_I(args->inode)->layout, xdr, args); + } else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); + } + hdr->nops++; + hdr->replen += decode_layoutreturn_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * END OF "GENERIC" ENCODE ROUTINES. + */ + +static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) +{ +#if defined(CONFIG_NFS_V4_1) + if (args->sa_session) + return args->sa_session->clp->cl_mvops->minor_version; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +/* + * Encode an ACCESS request + */ +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LOOKUP request + */ +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LOOKUP_ROOT request + */ +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode REMOVE request + */ +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode RENAME request + */ +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + 
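+ /* switch back to the saved source-directory filehandle so its post-operation attributes can be fetched as well */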
encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LINK request + */ +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode CREATE request + */ +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode SYMLINK request + */ +static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) +{ + nfs4_xdr_enc_create(req, xdr, args); +} + +/* + * Encode GETATTR request + */ +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a CLOSE request + */ +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN request + */ +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_openargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_open(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN_CONFIRM request + */ +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_open_confirmargs *args) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_confirm(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN request with no attributes. 
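+ * (unlike nfs4_xdr_enc_open above, no SAVEFH, GETFH or RESTOREFH operations are included in this compound)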
+ */ +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_openargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN_DOWNGRADE request + */ +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_downgrade(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCK request + */ +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lock_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lock(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCKT request + */ +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lockt_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lockt(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCKU request + */ +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_locku_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_locku(xdr, args, &hdr); + encode_nops(&hdr); +} + +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_release_lockowner_args *args) +{ + struct compound_hdr hdr = { + .minorversion = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_release_lockowner(xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a READLINK request + */ +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readlink *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readlink(xdr, args, req, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, + args->pgbase, args->pglen); + encode_nops(&hdr); +} + +/* + * Encode a READDIR request + */ +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readdir_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readdir(xdr, args, req, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, 
args->pages, + args->pgbase, args->count); + dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", + __func__, hdr.replen << 2, args->pages, + args->pgbase, args->count); + encode_nops(&hdr); +} + +/* + * Encode a READ request + */ +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->pages, args->pgbase, args->count); + req->rq_rcv_buf.flags |= XDRBUF_READ; + encode_nops(&hdr); +} + +/* + * Encode an SETATTR request + */ +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setattrargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setattr(xdr, args, args->server, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a GETACL request + */ +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_getaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; + encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, + args->acl_pages, args->acl_pgbase, args->acl_len); + encode_nops(&hdr); +} + +/* + * Encode a WRITE request + */ +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_write(xdr, args, &hdr); + req->rq_snd_buf.flags |= XDRBUF_WRITE; + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * a COMMIT request + */ +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_commit(xdr, args, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * FSINFO request + */ +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_fsinfo(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * a PATHCONF request + */ +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_pathconf_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + &hdr); + encode_nops(&hdr); +} + +/* + * a STATFS request + */ +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_statfs_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_nops(&hdr); +} + +/* + * GETATTR_BITMAP request + */ +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + FATTR4_WORD0_LINK_SUPPORT| + FATTR4_WORD0_SYMLINK_SUPPORT| + FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_nops(&hdr); +} + +/* + * a RENEW request + */ +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_renew(xdr, clp, &hdr); + encode_nops(&hdr); +} + +/* + * a SETCLIENTID request + */ +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid *sc) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid(xdr, sc, &hdr); + encode_nops(&hdr); +} + +/* + * a SETCLIENTID_CONFIRM request + */ +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *arg) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * DELEGRETURN request + */ +static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_delegreturnargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FS_LOCATIONS request + */ +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; /* get the attribute into args->page */ + encode_fs_locations(xdr, args->bitmask, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, + 0, 
PAGE_SIZE); + encode_nops(&hdr); +} + +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * EXCHANGE_ID request + */ +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_exchange_id(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a CREATE_SESSION request + */ +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_create_session_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_create_session(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a DESTROY_SESSION request + */ +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_session *session) +{ + struct compound_hdr hdr = { + .minorversion = session->clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_session(xdr, session, &hdr); + encode_nops(&hdr); +} + +/* + * a SEQUENCE request + */ +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_sequence_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a GET_LEASE_TIME request + */ +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; + const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * a RECLAIM_COMPLETE request + */ +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args) + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_reclaim_complete(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETDEVICEINFO request + */ +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(xdr, args, &hdr); + + /* set up reply kvec. 
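The reply's opaque device address data is received directly into args->pdev->pages.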
Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTGET request + */ +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutget_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTCOMMIT request + */ +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct nfs4_layoutcommit_data *data = + container_of(args, struct nfs4_layoutcommit_data, args); + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, data->args.inode, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LAYOUTRETURN request + */ +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutreturn(xdr, args, &hdr); + encode_nops(&hdr); +} +#endif /* CONFIG_NFS_V4_1 */ + +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. 
" + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; + *string = (char *)p; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; + hdr->tag = (char *)p; + p += XDR_QUADLEN(hdr->taglen); + hdr->nops = be32_to_cpup(p); + if (unlikely(hdr->nops < 1)) + return nfs4_stat_to_errno(hdr->status); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + __be32 *p; + uint32_t opnum; + int32_t nfserr; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (opnum != expected) { + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + return -EIO; + } + nfserr = be32_to_cpup(p); + if (nfserr != NFS_OK) + return nfs4_stat_to_errno(nfserr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* Dummy routine */ +static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) +{ + __be32 *p; + unsigned int strlen; + char *str; + + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + uint32_t bmlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + + bitmap[0] = bitmap[1] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); + if (bmlen > 1) + bitmap[1] = be32_to_cpup(p); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); + *savep = xdr->p; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) +{ + if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else + bitmask[0] = bitmask[1] = 0; + dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); + return 0; +} + +static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) +{ + __be32 *p; + int ret = 0; + + *type = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); + if (*type < NF4REG 
|| *type > NF4NAMEDATTR) { + dprintk("%s: bad type %d\n", __func__, *type); + return -EIO; + } + bitmap[0] &= ~FATTR4_WORD0_TYPE; + ret = NFS_ATTR_FATTR_TYPE; + } + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) +{ + __be32 *p; + int ret = 0; + + *change = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); + bitmap[0] &= ~FATTR4_WORD0_CHANGE; + ret = NFS_ATTR_FATTR_CHANGE; + } + dprintk("%s: change attribute=%Lu\n", __func__, + (unsigned long long)*change); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) +{ + __be32 *p; + int ret = 0; + + *size = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); + bitmap[0] &= ~FATTR4_WORD0_SIZE; + ret = NFS_ATTR_FATTR_SIZE; + } + dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; + } + dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; + } + dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? 
"false" : "true"); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) +{ + __be32 *p; + int ret = 0; + + fsid->major = 0; + fsid->minor = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); + bitmap[0] &= ~FATTR4_WORD0_FSID; + ret = NFS_ATTR_FATTR_FSID; + } + dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, + (unsigned long long)fsid->major, + (unsigned long long)fsid->minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 60; + if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; + } + dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) +{ + __be32 *p; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + *res = -be32_to_cpup(p); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) +{ + __be32 *p; + int len; + + if (fh != NULL) + memset(fh, 0, sizeof(*fh)); + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (fh != NULL) { + memcpy(fh->data, p, len); + fh->size = len; + } + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; + } + dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + __be32 *p; + int ret = 0; + + *fileid = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); + bitmap[0] &= ~FATTR4_WORD0_FILEID; + ret = 
NFS_ATTR_FATTR_FILEID; + } + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + __be32 *p; + int ret = 0; + + *fileid = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); + bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; + } + dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; + } + dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; + } + dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) +{ + u32 n; + __be32 *p; + int status = 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); + if (n == 0) + goto root_path; + dprintk("path "); + path->ncomponents = 0; + while (path->ncomponents < n) { + struct nfs4_string *component = &path->components[path->ncomponents]; + status = decode_opaque_inline(xdr, &component->len, &component->data); + if (unlikely(status != 0)) + goto out_eio; + if (path->ncomponents != n) + dprintk("/"); + dprintk("%s", component->data); + if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) + path->ncomponents++; + else { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + } +out: + dprintk("\n"); + return status; +root_path: +/* a root pathname is sent as a zero component4 */ + path->ncomponents = 1; + path->components[0].len=0; + path->components[0].data=NULL; + dprintk("path /\n"); + goto 
out; +out_eio: + dprintk(" status %d", status); + status = -EIO; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) +{ + int n; + __be32 *p; + int status = -EIO; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) + goto out; + status = 0; + if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) + goto out; + dprintk("%s: fsroot ", __func__); + status = decode_pathname(xdr, &res->fs_path); + if (unlikely(status != 0)) + goto out; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); + if (n <= 0) + goto out_eio; + res->nlocations = 0; + while (res->nlocations < n) { + u32 m; + struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); + + loc->nservers = 0; + dprintk("%s: servers ", __func__); + while (loc->nservers < m) { + struct nfs4_string *server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); + if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) + loc->nservers++; + else { + unsigned int i; + dprintk("%s: using first %u of %u servers " + "returned for location %u\n", + __func__, + NFS4_FS_LOCATION_MAXSERVERS, + m, res->nlocations); + for (i = loc->nservers; i < m; i++) { + unsigned int len; + char *data; + status = decode_opaque_inline(xdr, &len, &data); + if (unlikely(status != 0)) + goto out_eio; + } + } + } + status = decode_pathname(xdr, &loc->rootpath); + if (unlikely(status != 0)) + goto out_eio; + if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) + res->nlocations++; + } + if (res->nlocations != 0) + status = NFS_ATTR_FATTR_V4_REFERRAL; +out: + dprintk("%s: fs_locations done, error = %d\n", __func__, status); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); +out_eio: + status = -EIO; + goto out; +} + +static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; + } + dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) +{ + __be32 *p; + int status = 0; + + *maxlink = 1; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_MAXLINK; + } + dprintk("%s: maxlink=%u\n", __func__, *maxlink); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) +{ + __be32 *p; + int status = 0; + + *maxname = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto 
out_overflow; + *maxname = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_MAXNAME; + } + dprintk("%s: maxname=%u\n", __func__, *maxname); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + int status = 0; + + *res = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXREAD - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { + uint64_t maxread; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); + if (maxread > 0x7FFFFFFF) + maxread = 0x7FFFFFFF; + *res = (uint32_t)maxread; + bitmap[0] &= ~FATTR4_WORD0_MAXREAD; + } + dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + int status = 0; + + *res = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXWRITE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { + uint64_t maxwrite; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); + if (maxwrite > 0x7FFFFFFF) + maxwrite = 0x7FFFFFFF; + *res = (uint32_t)maxwrite; + bitmap[0] &= ~FATTR4_WORD0_MAXWRITE; + } + dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) +{ + uint32_t tmp; + __be32 *p; + int ret = 0; + + *mode = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); + *mode = tmp & ~S_IFMT; + bitmap[1] &= ~FATTR4_WORD1_MODE; + ret = NFS_ATTR_FATTR_MODE; + } + dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) +{ + __be32 *p; + int ret = 0; + + *nlink = 1; + if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); + bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; + ret = NFS_ATTR_FATTR_NLINK; + } + dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, uint32_t *uid, int may_sleep) +{ + uint32_t len; + __be32 *p; + int ret = 0; + + *uid = -2; + if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) + ret = NFS_ATTR_FATTR_OWNER; + else + dprintk("%s: nfs_map_name_to_uid failed!\n", + __func__); + } else + dprintk("%s: name too long (%u)!\n", + __func__, len); + bitmap[1] 
&= ~FATTR4_WORD1_OWNER; + } + dprintk("%s: uid=%d\n", __func__, (int)*uid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, uint32_t *gid, int may_sleep) +{ + uint32_t len; + __be32 *p; + int ret = 0; + + *gid = -2; + if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) + ret = NFS_ATTR_FATTR_GROUP; + else + dprintk("%s: nfs_map_group_to_gid failed!\n", + __func__); + } else + dprintk("%s: name too long (%u)!\n", + __func__, len); + bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + } + dprintk("%s: gid=%d\n", __func__, (int)*gid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) +{ + uint32_t major = 0, minor = 0; + __be32 *p; + int ret = 0; + + *rdev = MKDEV(0,0); + if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { + dev_t tmp; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); + tmp = MKDEV(major, minor); + if (MAJOR(tmp) == major && MINOR(tmp) == minor) + *rdev = tmp; + bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; + ret = NFS_ATTR_FATTR_RDEV; + } + dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; + } + dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; + } + dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; + } + dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); + return 
status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) +{ + __be32 *p; + int ret = 0; + + *used = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); + bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; + ret = NFS_ATTR_FATTR_SPACE_USED; + } + dprintk("%s: space used=%Lu\n", __func__, + (unsigned long long)*used); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) +{ + __be32 *p; + uint64_t sec; + uint32_t nsec; + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); + time->tv_sec = (time_t)sec; + time->tv_nsec = (long)nsec; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_ACCESS - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_ATIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; + } + dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_METADATA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_CTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; + } + dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, + struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) { + status = decode_attr_time(xdr, time); + bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; + } + dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, + (long)time->tv_nsec); + return status; +} + +static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_MODIFY - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_MTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; + } + dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen) +{ + unsigned int attrwords = XDR_QUADLEN(attrlen); + unsigned int nwords = xdr->p - savep; + + if (unlikely(attrwords != nwords)) { + dprintk("%s: server returned incorrect attribute length: " + "%u %c %u\n", + __func__, + attrwords << 2, + (attrwords < nwords) ? 
'<' : '>', + nwords << 2); + return -EIO; + } + return 0; +} + +static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) +{ + __be32 *p; + uint32_t supp, acc; + int status; + + status = decode_op_hdr(xdr, OP_ACCESS); + if (status) + return status; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); + access->supported = supp; + access->access = acc; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_CLOSE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); +} + +static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COMMIT); + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; +} + +static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + __be32 *p; + uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_CREATE); + if (status) + return status; + if ((status = decode_change_info(xdr, cinfo))) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) + goto xdr_error; + if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) + goto xdr_error; + if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) + goto xdr_error; + if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2] = {0}; + int status; + + if ((status = 
decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + if ((status = decode_attr_files_avail(xdr, bitmap, &fsstat->afiles)) != 0) + goto xdr_error; + if ((status = decode_attr_files_free(xdr, bitmap, &fsstat->ffiles)) != 0) + goto xdr_error; + if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0) + goto xdr_error; + if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0) + goto xdr_error; + if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0) + goto xdr_error; + if ((status = decode_attr_space_total(xdr, bitmap, &fsstat->tbytes)) != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + if ((status = decode_attr_maxlink(xdr, bitmap, &pathconf->max_link)) != 0) + goto xdr_error; + if ((status = decode_attr_maxname(xdr, bitmap, &pathconf->max_namelen)) != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_fattr *fattr, struct nfs_fh *fh, + const struct nfs_server *server, int may_sleep) +{ + int status; + umode_t fmode = 0; + uint32_t type; + int32_t err; + + status = decode_attr_type(xdr, bitmap, &type); + if (status < 0) + goto xdr_error; + fattr->mode = 0; + if (status != 0) { + fattr->mode |= nfs_type2fmt[type]; + fattr->valid |= status; + } + + status = decode_attr_change(xdr, bitmap, &fattr->change_attr); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_size(xdr, bitmap, &fattr->size); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + err = 0; + status = decode_attr_error(xdr, bitmap, &err); + if (status < 0) + goto xdr_error; + if (err == -NFS4ERR_WRONGSEC) + nfs_fixup_secinfo_attributes(fattr, fh); + + status = decode_attr_filehandle(xdr, bitmap, fh); + if (status < 0) + goto xdr_error; + + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + struct nfs4_fs_locations, + fattr)); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mode(xdr, bitmap, &fmode); + if (status < 0) + goto xdr_error; + if (status != 0) { + fattr->mode |= fmode; + fattr->valid |= status; + } + + status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); + if (status < 0) + goto xdr_error; + 
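/* Each decode_attr_* helper above and below returns an NFS_ATTR_FATTR_* flag (or 0) on success, which is OR'ed into fattr->valid. */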
fattr->valid |= status; + + status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_access(xdr, bitmap, &fattr->atime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) +{ + __be32 *savep; + uint32_t attrlen, + bitmap[2] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) + goto xdr_error; + + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + if (status < 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) +{ + return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); +} + +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + uint32_t *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " + "per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. 
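+ * A layouttype of zero means no pNFS layout driver is available for this export.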
+ */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} + +static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2]; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ + + if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) + goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) + goto xdr_error; + fsinfo->rtpref = fsinfo->dtpref = fsinfo->rtmax; + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + uint32_t len; + int status; + + /* Zero handle first to allow comparisons */ + memset(fh, 0, sizeof(*fh)); + + status = decode_op_hdr(xdr, OP_GETFH); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + fh->size = len; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_LINK); + if (status) + return status; + return decode_change_info(xdr, cinfo); +} + +/* + * We create the owner, so we know a proper owner.id length is 4. 
+ */ +static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) +{ + uint64_t offset, length, clientid; + __be32 *p; + uint32_t namelen, type; + + p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); /* 4 byte read */ + if (fl != NULL) { /* manipulate file lock */ + fl->fl_start = (loff_t)offset; + fl->fl_end = fl->fl_start + (loff_t)length - 1; + if (length == ~(uint64_t)0) + fl->fl_end = OFFSET_MAX; + fl->fl_type = F_WRLCK; + if (type & 1) + fl->fl_type = F_RDLCK; + fl->fl_pid = 0; + } + p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ + namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ + p = xdr_inline_decode(xdr, namelen); /* variable size field */ + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_LOCK); + if (status == -EIO) + goto out; + if (status == 0) { + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; + } else if (status == -NFS4ERR_DENIED) + status = decode_lock_denied(xdr, NULL); + if (res->open_seqid != NULL) + nfs_increment_open_seqid(status, res->open_seqid); + nfs_increment_lock_seqid(status, res->lock_seqid); +out: + return status; +} + +static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) +{ + int status; + status = decode_op_hdr(xdr, OP_LOCKT); + if (status == -NFS4ERR_DENIED) + return decode_lock_denied(xdr, res->denied); + return status; +} + +static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_LOCKU); + if (status != -EIO) + nfs_increment_lock_seqid(status, res->seqid); + if (status == 0) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + +static int decode_lookup(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LOOKUP); +} + +/* This is too sick! 
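The space-limit union either carries the file size directly (limit type 1) or a block count and block size (type 2) that are multiplied together below.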
*/ +static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) +{ + __be32 *p; + uint32_t limit_type, nblocks, blocksize; + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); + switch (limit_type) { + case 1: + xdr_decode_hyper(p, maxsize); + break; + case 2: + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); + *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t delegation_type; + int status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); + if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { + res->delegation_type = 0; + return 0; + } + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); + + switch (delegation_type) { + case NFS4_OPEN_DELEGATE_READ: + res->delegation_type = FMODE_READ; + break; + case NFS4_OPEN_DELEGATE_WRITE: + res->delegation_type = FMODE_WRITE|FMODE_READ; + if (decode_space_limit(xdr, &res->maxsize) < 0) + return -EIO; + } + return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t savewords, bmlen, i; + int status; + + status = decode_op_hdr(xdr, OP_OPEN); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + return status; + + decode_change_info(xdr, &res->cinfo); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); + if (bmlen > 10) + goto xdr_error; + + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; + savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); + for (i = 0; i < savewords; ++i) + res->attrset[i] = be32_to_cpup(p++); + for (; i < NFS4_BITMAP_SIZE; i++) + res->attrset[i] = 0; + + return decode_delegation(xdr, res); +xdr_error: + dprintk("%s: Bitmap too large! 
Length = %u\n", __func__, bmlen); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_putfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTFH); +} + +static int decode_putrootfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTROOTFH); +} + +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +{ + struct kvec *iov = req->rq_rcv_buf.head; + __be32 *p; + uint32_t count, eof, recvd, hdrlen; + int status; + + status = decode_op_hdr(xdr, OP_READ); + if (status) + return status; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (count > recvd) { + dprintk("NFS: server cheating in read reply: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + } + xdr_read_pages(xdr, count); + res->eof = eof; + res->count = count; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) +{ + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + size_t hdrlen; + u32 recvd, pglen = rcvbuf->page_len; + int status; + + status = decode_op_hdr(xdr, OP_READDIR); + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) + return status; + dprintk("%s: verifier = %08x:%08x\n", + __func__, + ((u32 *)readdir->verifier.data)[0], + ((u32 *)readdir->verifier.data)[1]); + + + hdrlen = (char *) xdr->p - (char *) iov->iov_base; + recvd = rcvbuf->len - hdrlen; + if (pglen > recvd) + pglen = recvd; + xdr_read_pages(xdr, pglen); + + + return pglen; +} + +static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) +{ + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + size_t hdrlen; + u32 len, recvd; + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_READLINK); + if (status) + return status; + + /* Convert length of symlink */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len >= rcvbuf->page_len || len <= 0) { + dprintk("nfs: server returned giant symlink!\n"); + return -ENAMETOOLONG; + } + hdrlen = (char *) xdr->p - (char *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (recvd < len) { + dprintk("NFS: server cheating in readlink reply: " + "count %u > recvd %u\n", len, recvd); + return -EIO; + } + xdr_read_pages(xdr, len); + /* + * The XDR encode routine has set things up so that + * the link text will be copied directly into the + * buffer. We just have to do overflow-checking, + * and and null-terminate the text (the VFS expects + * null-termination). 
+ */ + xdr_terminate_string(rcvbuf, len); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_REMOVE); + if (status) + goto out; + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_rename(struct xdr_stream *xdr, struct nfs4_change_info *old_cinfo, + struct nfs4_change_info *new_cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_RENAME); + if (status) + goto out; + if ((status = decode_change_info(xdr, old_cinfo))) + goto out; + status = decode_change_info(xdr, new_cinfo); +out: + return status; +} + +static int decode_renew(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RENEW); +} + +static int +decode_restorefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RESTOREFH); +} + +static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, + size_t *acl_len) +{ + __be32 *savep; + uint32_t attrlen, + bitmap[2] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + + *acl_len = 0; + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto out; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto out; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto out; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { + size_t hdrlen; + u32 recvd; + + /* We ignore &savep and don't do consistency checks on + * the attr length. Let userspace figure it out.... */ + hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (attrlen > recvd) { + dprintk("NFS: server cheating in getattr" + " acl reply: attrlen %u > recvd %u\n", + attrlen, recvd); + return -EINVAL; + } + xdr_read_pages(xdr, attrlen); + *acl_len = attrlen; + } else + status = -EOPNOTSUPP; + +out: + return status; +} + +static int +decode_savefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SAVEFH); +} + +static int decode_setattr(struct xdr_stream *xdr) +{ + __be32 *p; + uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_SETATTR); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) +{ + __be32 *p; + uint32_t opnum; + int32_t nfserr; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (opnum != OP_SETCLIENTID) { + dprintk("nfs: decode_setclientid: Server returned operation" + " %d\n", opnum); + return -EIO; + } + nfserr = be32_to_cpup(p); + if (nfserr == NFS_OK) { + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->clientid); + memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); + } else if (nfserr == NFSERR_CLID_INUSE) { + uint32_t len; + + /* skip netid string */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + + /* skip uaddr string */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if 
(unlikely(!p)) + goto out_overflow; + return -NFSERR_CLID_INUSE; + } else + return nfs4_stat_to_errno(nfserr); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_setclientid_confirm(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); +} + +static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_WRITE); + if (status) + return status; + + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + memcpy(res->verf->verifier, p, 8); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegreturn(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_DELEGRETURN); +} + +static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.sec_oid4.len = be32_to_cpup(p); + if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.qop4 = be32_to_cpup(p++); + flavor->gss.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo_flavor *sec_flavor; + int status; + __be32 *p; + int i, num_flavors; + + status = decode_op_hdr(xdr, OP_SECINFO); + if (status) + goto out; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + + res->flavors->num_flavors = 0; + num_flavors = be32_to_cpup(p); + + for (i = 0; i < num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + status = decode_secinfo_gss(xdr, sec_flavor); + if (status) + goto out; + } + res->flavors->num_flavors++; + } + +out: + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +#if defined(CONFIG_NFS_V4_1) +static int decode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_res *res) +{ + __be32 *p; + uint32_t dummy; + char *dummy_str; + int status; + struct nfs_client *clp = res->client; + + status = decode_op_hdr(xdr, OP_EXCHANGE_ID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &clp->cl_clientid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); + + /* We ask for SP4_NONE */ + dummy = be32_to_cpup(p); + if (dummy != SP4_NONE) + return -EIO; + + /* Throw away minor_id */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + + /* Throw away Major id */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + + /* Throw away server_scope */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if 
(unlikely(status)) + return status; + + /* Throw away Implementation id array */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_chan_attrs(struct xdr_stream *xdr, + struct nfs4_channel_attrs *attrs) +{ + __be32 *p; + u32 nr_attrs, val; + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); + if (unlikely(nr_attrs > 1)) { + printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", + __func__, nr_attrs); + return -EINVAL; + } + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); +} + +static int decode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + __be32 *p; + int status; + struct nfs_client *clp = res->client; + struct nfs4_session *session = clp->cl_session; + + status = decode_op_hdr(xdr, OP_CREATE_SESSION); + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) + return status; + + /* seqid, flags */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); + + /* Channel attributes */ + status = decode_chan_attrs(xdr, &session->fc_attrs); + if (!status) + status = decode_chan_attrs(xdr, &session->bc_attrs); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_SESSION); +} + +static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); +} +#endif /* CONFIG_NFS_V4_1 */ + +static int decode_sequence(struct xdr_stream *xdr, + struct nfs4_sequence_res *res, + struct rpc_rqst *rqstp) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_sessionid id; + u32 dummy; + int status; + __be32 *p; + + if (!res->sr_session) + return 0; + + status = decode_op_hdr(xdr, OP_SEQUENCE); + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) + goto out_err; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. 
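+ * Each mismatch below is therefore treated as a fatal -EREMOTEIO for
+ * this reply rather than an excuse to use a slot or sequence number
+ * the client never issued.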
+ */ + status = -EREMOTEIO; + + if (memcmp(id.data, res->sr_session->sess_id.data, + NFS4_MAX_SESSIONID_LEN)) { + dprintk("%s Invalid session id\n", __func__); + goto out_err; + } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + + /* seqid */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot->seq_nr) { + dprintk("%s Invalid sequence number\n", __func__); + goto out_err; + } + /* slot id */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { + dprintk("%s Invalid slot id\n", __func__); + goto out_err; + } + /* highest slot id - currently not processed */ + dummy = be32_to_cpup(p++); + /* target highest slot id - currently not processed */ + dummy = be32_to_cpup(p++); + /* result flags */ + res->sr_status_flags = be32_to_cpup(p); + status = 0; +out_err: + res->sr_status = status; + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; +#else /* CONFIG_NFS_V4_1 */ + return 0; +#endif /* CONFIG_NFS_V4_1 */ +} + +#if defined(CONFIG_NFS_V4_1) + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + + /* Parse notification bitmap, verifying that it is zero. 
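+ * The client does not implement layout notifications, so a reply in
+ * which the server sets any notification bit is rejected with -EIO.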
*/ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + uint32_t i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < len; i++, p++) { + if (be32_to_cpup(p)) { + dprintk("%s: notifications not supported\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + u32 hdrlen, recvd; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p++); + p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + res->type = be32_to_cpup(p++); + res->layoutp->len = be32_to_cpup(p); + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layoutp->len); + + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } + + xdr_read_pages(xdr, res->layoutp->len); + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTRETURN); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->lrs_present = be32_to_cpup(p); + if (res->lrs_present) + status = decode_stateid(xdr, &res->stateid); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * END OF "GENERIC" DECODE ROUTINES. 
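+ *
+ * Each nfs4_xdr_dec_* routine below decodes one COMPOUND reply in the
+ * same shape: the compound header, then SEQUENCE (a no-op when there
+ * is no v4.1 session), then typically PUTFH or PUTROOTFH, then the
+ * operations of interest, often ending with a trailing GETATTR whose
+ * failure is deliberately ignored.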
+ */ + +/* + * Decode OPEN_DOWNGRADE response + */ +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_downgrade(xdr, res); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode ACCESS response + */ +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_accessres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_access(xdr, res); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LOOKUP response + */ +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_lookup_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lookup(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server + ,!RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LOOKUP_ROOT response + */ +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_lookup_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status == 0) + status = decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode REMOVE response + */ +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_removeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_remove(xdr, &res->cinfo); + if (status) + goto out; + decode_getfattr(xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode RENAME response + */ +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_renameres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_rename(xdr, &res->old_cinfo, 
&res->new_cinfo); + if (status) + goto out; + /* Current FH is target directory */ + if (decode_getfattr(xdr, res->new_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->old_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LINK response + */ +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_link_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_link(xdr, &res->cinfo); + if (status) + goto out; + /* + * Note order: OP_LINK leaves the directory as the current + * filehandle. + */ + if (decode_getfattr(xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode CREATE response + */ +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_create(xdr, &res->dir_cinfo); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->dir_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode SYMLINK response + */ +static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) +{ + return nfs4_xdr_dec_create(rqstp, xdr, res); +} + +/* + * Decode GETATTR response + */ +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_getattr_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Encode an SETACL request + */ +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setacl(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Decode SETACL response + */ +static int +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_setaclres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, 
&hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); +out: + return status; +} + +/* + * Decode GETACL response + */ +static int +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_getaclres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getacl(xdr, rqstp, &res->acl_len); + +out: + return status; +} + +/* + * Decode CLOSE response + */ +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_close(xdr, res); + if (status != 0) + goto out; + /* + * Note: Server may do delete on close for this file + * in which case the getattr call will fail with + * an ESTALE error. Shouldn't be a problem, + * though, since fattr->valid will remain unset. + */ + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode OPEN response + */ +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + if (decode_getfh(xdr, &res->fh) != 0) + goto out; + if (decode_getfattr(xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + if (decode_restorefh(xdr) != 0) + goto out; + decode_getfattr(xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode OPEN_CONFIRM response + */ +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_open_confirmres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_confirm(xdr, res); +out: + return status; +} + +/* + * Decode OPEN response + */ +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode SETATTR response + */ +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_setattrres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; 
+ status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LOCK response + */ +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lock_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lock(xdr, res); +out: + return status; +} + +/* + * Decode LOCKT response + */ +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lockt_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lockt(xdr, res); +out: + return status; +} + +/* + * Decode LOCKU response + */ +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_locku_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_locku(xdr, res); +out: + return status; +} + +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *dummy) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_release_lockowner(xdr); + return status; +} + +/* + * Decode READLINK response + */ +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_readlink_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readlink(xdr, rqstp); +out: + return status; +} + +/* + * Decode READDIR response + */ +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_readdir_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readdir(xdr, rqstp, res); +out: + return status; +} + +/* + * Decode Read response + */ +static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_readres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_read(xdr, rqstp, res); + if (!status) + status = res->count; +out: + return status; +} + +/* + * Decode WRITE response + */ +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) +{ + struct 
compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_write(xdr, res); + if (status) + goto out; + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); + if (!status) + status = res->count; +out: + return status; +} + +/* + * Decode COMMIT response + */ +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_commit(xdr, res); + if (status) + goto out; + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode FSINFO response + */ +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->fsinfo); + return status; +} + +/* + * Decode PATHCONF response + */ +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_pathconf_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_pathconf(xdr, res->pathconf); + return status; +} + +/* + * Decode STATFS response + */ +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_statfs_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_statfs(xdr, res->fsstat); + return status; +} + +/* + * Decode GETATTR_BITMAP response + */ +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_server_caps(xdr, res); +out: + return status; +} + +/* + * Decode RENEW response + */ +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *__unused) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_renew(xdr); + return status; +} + +/* + * Decode SETCLIENTID response + */ +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_setclientid(xdr, res); + return status; +} + +/* + * Decode SETCLIENTID_CONFIRM response + */ +static int nfs4_xdr_dec_setclientid_confirm(struct 
rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_setclientid_confirm(xdr); + if (!status) + status = decode_putrootfh(xdr); + if (!status) + status = decode_fsinfo(xdr, fsinfo); + return status; +} + +/* + * Decode DELEGRETURN response + */ +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_delegreturnres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_delegreturn(xdr); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode FS_LOCATIONS response + */ +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr(xdr, &res->fs_locations->fattr, + res->fs_locations->server, + !RPC_IS_ASYNC(req->rq_task)); +out: + return status; +} + +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); + if (status) + goto out; +out: + return status; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Decode EXCHANGE_ID response + */ +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_exchange_id(xdr, res); + return status; +} + +/* + * Decode CREATE_SESSION response + */ +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_create_session(xdr, res); + return status; +} + +/* + * Decode DESTROY_SESSION response + */ +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_destroy_session(xdr, res); + return status; +} + +/* + * Decode SEQUENCE response + */ +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_sequence_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, res, rqstp); + return status; +} + +/* + * Decode GET_LEASE_TIME response + */ +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + 
struct nfs4_get_lease_time_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); + if (!status) + status = decode_putrootfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->lr_fsinfo); + return status; +} + +/* + * Decode RECLAIM_COMPLETE response + */ +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (!status) + status = decode_reclaim_complete(xdr, (void *)NULL); + return status; +} + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutget_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutget(xdr, rqstp, res); +out: + return status; +} + +/* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutreturn(xdr, res); +out: + return status; +} + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. 
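+ *
+ * Each cached entry begins with a "value follows" word; when that word
+ * is zero the following word is the EOF flag. A present entry then
+ * carries the cookie (8 bytes), the name length and name, and the
+ * attribute bitmap and attributes, which is the order decoded below.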
+ */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + uint32_t bitmap[2] = {0}; + uint32_t len; + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + entry->prev_cookie = entry->cookie; + p = xdr_decode_hyper(p, &entry->cookie); + entry->len = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, entry->len); + if (unlikely(!p)) + goto out_overflow; + entry->name = (const char *) p; + + /* + * In case the server doesn't return an inode number, + * we fake one here. (We don't use inode number 0, + * since glibc seems to choke on it...) + */ + entry->ino = 1; + entry->fattr->valid = 0; + + if (decode_attr_bitmap(xdr, bitmap) < 0) + goto out_overflow; + + if (decode_attr_length(xdr, &len, &p) < 0) + goto out_overflow; + + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + entry->server, 1) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + entry->ino = entry->fattr->mounted_on_fileid; + else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; + + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * Convert an NFS error code to a local one. + * This one is used by NFSv4. + */ +static int +nfs4_stat_to_errno(int stat) +{ + int i; + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == stat) + return nfs_errtbl[i].errno; + } + if (stat <= 10000 || stat > 10100) { + /* The server is looney tunes. */ + return -EREMOTEIO; + } + /* If we cannot translate the error, the recovery routines should + * handle it. + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes.
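+ * For example, NFS4ERR_DELAY (10008) has no entry in nfs_errtbl above,
+ * so it is passed through as -10008 for the caller's recovery logic
+ * to act on.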
+ */ + return -stat; +} + +#define PROC(proc, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_COMPOUND, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ + .p_arglen = NFS4_##argtype##_sz, \ + .p_replen = NFS4_##restype##_sz, \ + .p_statidx = NFSPROC4_CLNT_##proc, \ + .p_name = #proc, \ +} + +struct rpc_procinfo nfs4_procedures[] = { + PROC(READ, enc_read, dec_read), + PROC(WRITE, enc_write, dec_write), + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), + PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), + PROC(FSINFO, enc_fsinfo, dec_fsinfo), + PROC(RENEW, enc_renew, dec_renew), + PROC(SETCLIENTID, enc_setclientid, dec_setclientid), + PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), + PROC(LOCK, enc_lock, dec_lock), + PROC(LOCKT, enc_lockt, dec_lockt), + PROC(LOCKU, enc_locku, dec_locku), + PROC(ACCESS, enc_access, dec_access), + PROC(GETATTR, enc_getattr, dec_getattr), + PROC(LOOKUP, enc_lookup, dec_lookup), + PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), + PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), +#if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), + PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), +#endif /* CONFIG_NFS_V4_1 */ +}; + +struct rpc_version nfs_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nfs4_procedures), + .procs = nfs4_procedures +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c new file mode 100644 index 00000000..c4744e1d --- /dev/null +++ b/fs/nfs/nfsroot.c @@ -0,0 +1,309 @@ +/* + * Copyright (C) 1995, 1996 Gero Kuhlmann + * + * Allow an NFS filesystem to be mounted as root. The way this works is: + * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. + * (2) Construct the device string and the options string using DHCP + * option 17 and/or kernel command line options. + * (3) When mount_root() sets up the root file system, pass these strings + * to the NFS client's regular mount interface via sys_mount(). 
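+ *
+ * For example (addresses purely illustrative), booting with
+ *     nfsroot=10.0.0.1:/export/client,tcp,vers=3
+ * makes step (2) build the device string "10.0.0.1:/export/client" and
+ * append "tcp,vers=3" to the default mount options, which step (3)
+ * then hands to sys_mount().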
+ * + * + * Changes: + * + * Alan Cox : Removed get_address name clash with FPU. + * Alan Cox : Reformatted a bit. + * Gero Kuhlmann : Code cleanup + * Michael Rausch : Fixed recognition of an incoming RARP answer. + * Martin Mares : (2.0) Auto-configuration via BOOTP supported. + * Martin Mares : Manual selection of interface & BOOTP/RARP. + * Martin Mares : Using network routes instead of host routes, + * allowing the default configuration to be used + * for normal operation of the host. + * Martin Mares : Randomized timer with exponential backoff + * installed to minimize network congestion. + * Martin Mares : Code cleanup. + * Martin Mares : (2.1) BOOTP and RARP made configuration options. + * Martin Mares : Server hostname generation fixed. + * Gerd Knorr : Fixed wired inode handling + * Martin Mares : (2.2) "0.0.0.0" addresses from command line ignored. + * Martin Mares : RARP replies not tested for server address. + * Gero Kuhlmann : (2.3) Some bug fixes and code cleanup again (please + * send me your new patches _before_ bothering + * Linus so that I don' always have to cleanup + * _afterwards_ - thanks) + * Gero Kuhlmann : Last changes of Martin Mares undone. + * Gero Kuhlmann : RARP replies are tested for specified server + * again. However, it's now possible to have + * different RARP and NFS servers. + * Gero Kuhlmann : "0.0.0.0" addresses from command line are + * now mapped to INADDR_NONE. + * Gero Kuhlmann : Fixed a bug which prevented BOOTP path name + * from being used (thanks to Leo Spiekman) + * Andy Walker : Allow to specify the NFS server in nfs_root + * without giving a path name + * Swen Thümmler : Allow to specify the NFS options in nfs_root + * without giving a path name. Fix BOOTP request + * for domainname (domainname is NIS domain, not + * DNS domain!). Skip dummy devices for BOOTP. + * Jacek Zapala : Fixed a bug which prevented server-ip address + * from nfsroot parameter from being used. + * Olaf Kirch : Adapted to new NFS code. + * Jakub Jelinek : Free used code segment. + * Marko Kohtala : Fixed some bugs. + * Martin Mares : Debug message cleanup + * Martin Mares : Changed to use the new generic IP layer autoconfig + * code. BOOTP and RARP moved there. + * Martin Mares : Default path now contains host name instead of + * host IP address (but host name defaults to IP + * address anyway). + * Martin Mares : Use root_server_addr appropriately during setup. + * Martin Mares : Rewrote parameter parsing, now hopefully giving + * correct overriding. + * Trond Myklebust : Add in preliminary support for NFSv3 and TCP. + * Fix bug in root_nfs_addr(). nfs_data.namlen + * is NOT for the length of the hostname. + * Hua Qin : Support for mounting root file system via + * NFS over TCP. + * Fabian Frederick: Option parser rebuilt (using parser lib) + * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_ROOT + +/* Default path we try to mount. "%s" gets replaced by our IP address */ +#define NFS_ROOT "/tftpboot/%s" + +/* Default NFSROOT mount options. 
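+ * These are extended by whatever arrives via DHCP option 17 or the
+ * nfsroot= command line, and the mandatory "nolock,addr=" options are
+ * appended last so that they take precedence.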
*/ +#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" + +/* Parameters passed from the kernel command line */ +static char nfs_root_parms[256] __initdata = ""; + +/* Text-based mount options passed to super.c */ +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; + +/* Address of NFS server */ +static __be32 servaddr __initdata = htonl(INADDR_NONE); + +/* Name of directory to mount */ +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; + +/* server:export path string passed to super.c */ +static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; + +#ifdef RPC_DEBUG +/* + * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. + */ +static int __init nfs_root_debug(char *__unused) +{ + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; + return 1; +} + +__setup("nfsrootdebug", nfs_root_debug); +#endif + +/* + * Parse NFS server and directory information passed on the kernel + * command line. + * + * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] + * + * If there is a "%s" token in the <root-dir> string, it is replaced + * by the ASCII-representation of the client's IP address. + */ +static int __init nfs_root_setup(char *line) +{ + ROOT_DEV = Root_NFS; + + if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { + strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + } else { + size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; + if (n >= sizeof(nfs_root_parms)) + line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0'; + sprintf(nfs_root_parms, NFS_ROOT, line); + } + + /* + * Extract the IP address of the NFS server containing our + * root file system, if one was specified. + * + * Note: root_nfs_parse_addr() removes the server-ip from + * nfs_root_parms, if it exists. + */ + root_server_addr = root_nfs_parse_addr(nfs_root_parms); + + return 1; +} + +__setup("nfsroot=", nfs_root_setup); + +static int __init root_nfs_copy(char *dest, const char *src, + const size_t destlen) +{ + if (strlcpy(dest, src, destlen) > destlen) + return -1; + return 0; +} + +static int __init root_nfs_cat(char *dest, const char *src, + const size_t destlen) +{ + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + + if (strlcat(dest, src, destlen) > destlen) + return -1; + return 0; +} + +/* + * Parse out root export path and mount options from + * passed-in string @incoming. + * + * Copy the export path into @exppath. + */ +static int __init root_nfs_parse_options(char *incoming, char *exppath, + const size_t exppathlen) +{ + char *p; + + /* + * Set the NFS remote path + */ + p = strsep(&incoming, ","); + if (*p != '\0' && strcmp(p, "default") != 0) + if (root_nfs_copy(exppath, p, exppathlen)) + return -1; + + /* + * @incoming now points to the rest of the string; if it + * contains something, append it to our root options buffer + */ + if (incoming != NULL && *incoming != '\0') + if (root_nfs_cat(nfs_root_options, incoming, + sizeof(nfs_root_options))) + return -1; + return 0; +} + +/* + * Decode the export directory path name and NFS options from + * the kernel command line. This has to be done late in order to + * use a dynamically acquired client IP address for the remote + * root directory path. + * + * Returns zero if successful; otherwise -1 is returned.
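+ *
+ * As an illustration (values hypothetical): with a server address of
+ * 10.0.0.1, no nfsroot= path on the command line and a client nodename
+ * of "10.0.0.7", the default NFS_ROOT template ends up producing
+ *     nfs_root_device  = "10.0.0.1:/tftpboot/10.0.0.7"
+ *     nfs_root_options = "vers=2,udp,rsize=4096,wsize=4096,nolock,addr=10.0.0.1"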
+ */ +static int __init root_nfs_data(char *cmdline) +{ + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + int len, retval = -1; + char *tmp = NULL; + const size_t tmplen = sizeof(nfs_export_path); + + tmp = kzalloc(tmplen, GFP_KERNEL); + if (tmp == NULL) + goto out_nomem; + strcpy(tmp, NFS_ROOT); + + if (root_server_path[0] != '\0') { + dprintk("Root-NFS: DHCPv4 option 17: %s\n", + root_server_path); + if (root_nfs_parse_options(root_server_path, tmp, tmplen)) + goto out_optionstoolong; + } + + if (cmdline[0] != '\0') { + dprintk("Root-NFS: nfsroot=%s\n", cmdline); + if (root_nfs_parse_options(cmdline, tmp, tmplen)) + goto out_optionstoolong; + } + + /* + * Append mandatory options for nfsroot so they override + * what has come before + */ + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", + &servaddr); + if (root_nfs_cat(nfs_root_options, mand_options, + sizeof(nfs_root_options))) + goto out_optionstoolong; + + /* + * Set up nfs_root_device. For NFS mounts, this looks like + * + * server:/path + * + * At this point, utsname()->nodename contains our local + * IP address or hostname, set by ipconfig. If "%s" exists + * in tmp, substitute the nodename, then shovel the whole + * mess into nfs_root_device. + */ + len = snprintf(nfs_export_path, sizeof(nfs_export_path), + tmp, utsname()->nodename); + if (len > (int)sizeof(nfs_export_path)) + goto out_devnametoolong; + len = snprintf(nfs_root_device, sizeof(nfs_root_device), + "%pI4:%s", &servaddr, nfs_export_path); + if (len > (int)sizeof(nfs_root_device)) + goto out_devnametoolong; + + retval = 0; + +out: + kfree(tmp); + return retval; +out_nomem: + printk(KERN_ERR "Root-NFS: could not allocate memory\n"); + goto out; +out_optionstoolong: + printk(KERN_ERR "Root-NFS: mount options string too long\n"); + goto out; +out_devnametoolong: + printk(KERN_ERR "Root-NFS: root device name too long.\n"); + goto out; +} + +/** + * nfs_root_data - Return prepared 'data' for NFSROOT mount + * @root_device: OUT: address of string containing NFSROOT device + * @root_data: OUT: address of string containing NFSROOT mount options + * + * Returns zero and sets @root_device and @root_data if successful, + * otherwise -1 is returned. + */ +int __init nfs_root_data(char **root_device, char **root_data) +{ + servaddr = root_server_addr; + if (servaddr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-NFS: no NFS server address\n"); + return -1; + } + + if (root_nfs_data(nfs_root_parms) < 0) + return -1; + + *root_device = nfs_root_device; + *root_data = nfs_root_options; + return 0; +} diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild new file mode 100644 index 00000000..ed30ea07 --- /dev/null +++ b/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 00000000..75fe694d --- /dev/null +++ b/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,1056 @@ +/* + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define _LLU(x) ((unsigned long long)x) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; + +struct objio_dev_ent { + struct nfs4_deviceid_node id_node; + struct osd_dev *od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + + dprintk("%s: free od=%p\n", __func__, de->od); + osduld_put_device(de->od); + kfree(de); +} + +static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de; + + d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); + if (!d) + return NULL; + + de = container_of(d, struct objio_dev_ent, id_node); + return de; +} + +static struct objio_dev_ent * +_dev_list_add(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id, struct osd_dev *od, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); + struct objio_dev_ent *n; + + if (!de) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + return NULL; + } + + dprintk("%s: Adding od=%p\n", __func__, od); + nfs4_init_deviceid_node(&de->id_node, + nfss->pnfs_curr_ld, + nfss->nfs_client, + d_id); + de->od = od; + + d = nfs4_insert_deviceid_node(&de->id_node); + n = container_of(d, struct objio_dev_ent, id_node); + if (n != de) { + dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + objio_free_deviceid_node(&de->id_node); + de = n; + } + + return de; +} + +struct caps_buffers { + u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; + u8 creds[OSD_CAP_LEN]; +}; + +struct objio_segment { + struct pnfs_layout_segment lseg; + + struct pnfs_osd_object_cred *comps; + + unsigned mirrors_p1; + unsigned stripe_unit; + unsigned group_width; /* Data stripe_units without integrity comps */ + u64 group_depth; + unsigned group_count; + + unsigned max_io_size; + 
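+	/* Window of the device table covered by this layout segment:
+	 * ods[] holds num_comps entries, the first of which corresponds
+	 * to component index comps_index. */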
+ unsigned comps_index; + unsigned num_comps; + /* variable length */ + struct objio_dev_ent *ods[]; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, struct objio_segment, lseg); +} + +struct objio_state; +typedef ssize_t (*objio_done_fn)(struct objio_state *ios); + +struct objio_state { + /* Generic layer */ + struct objlayout_io_state ol_state; + + struct objio_segment *layout; + + struct kref kref; + objio_done_fn done; + void *private; + + unsigned long length; + unsigned numdevs; /* Actually used devs in this IO */ + /* A per-device variable array of size numdevs */ + struct _objio_per_comp { + struct bio *bio; + struct osd_request *or; + unsigned long length; + u64 offset; + unsigned dev; + } per_dev[]; +}; + +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned comp, + gfp_t gfp_flags) +{ + struct pnfs_osd_deviceaddr *deviceaddr; + struct nfs4_deviceid *d_id; + struct objio_dev_ent *ode; + struct osd_dev *od; + struct osd_dev_info odi; + int err; + + d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; + + ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); + if (ode) + return ode; + + err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); + if (unlikely(err)) { + dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", + __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); + return ERR_PTR(err); + } + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); + odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + goto out; + } + + ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, + gfp_flags); + +out: + dprintk("%s: return=%d\n", __func__, err); + objlayout_put_deviceinfo(deviceaddr); + return err ? ERR_PTR(err) : ode; +} + +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, + gfp_t gfp_flags) +{ + unsigned i; + int err; + + /* lookup all devices */ + for (i = 0; i < objio_seg->num_comps; i++) { + struct objio_dev_ent *ode; + + ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); + if (unlikely(IS_ERR(ode))) { + err = PTR_ERR(ode); + goto out; + } + objio_seg->ods[i] = ode; + } + err = 0; + +out: + dprintk("%s: return=%d\n", __func__, err); + return err; +} + +static int _verify_data_map(struct pnfs_osd_layout *layout) +{ + struct pnfs_osd_data_map *data_map = &layout->olo_map; + u64 stripe_length; + u32 group_width; + +/* FIXME: Only raid0 for now. 
if not go through MDS */ + if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { + printk(KERN_ERR "Only RAID_0 for now\n"); + return -ENOTSUPP; + } + if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { + printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", + data_map->odm_num_comps, data_map->odm_mirror_cnt); + return -EINVAL; + } + + if (data_map->odm_group_width) + group_width = data_map->odm_group_width; + else + group_width = data_map->odm_num_comps / + (data_map->odm_mirror_cnt + 1); + + stripe_length = (u64)data_map->odm_stripe_unit * group_width; + if (stripe_length >= (1ULL << 32)) { + printk(KERN_ERR "Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -ENOTSUPP; + } + + if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { + printk(KERN_ERR "Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(data_map->odm_stripe_unit), PAGE_SIZE); + return -ENOTSUPP; + } + + return 0; +} + +static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, + struct pnfs_osd_object_cred *src_comp, + struct caps_buffers *caps_p) +{ + WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); + WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + + *cur_comp = *src_comp; + + memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, + sizeof(caps_p->caps_key)); + cur_comp->oc_cap_key.cred = caps_p->caps_key; + + memcpy(caps_p->creds, src_comp->oc_cap.cred, + sizeof(caps_p->creds)); + cur_comp->oc_cap.cred = caps_p->creds; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg; + struct pnfs_osd_xdr_decode_layout_iter iter; + struct pnfs_osd_layout layout; + struct pnfs_osd_object_cred *cur_comp, src_comp; + struct caps_buffers *caps_p; + int err; + + err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); + if (unlikely(err)) + return err; + + err = _verify_data_map(&layout); + if (unlikely(err)) + return err; + + objio_seg = kzalloc(sizeof(*objio_seg) + + sizeof(objio_seg->ods[0]) * layout.olo_num_comps + + sizeof(*objio_seg->comps) * layout.olo_num_comps + + sizeof(struct caps_buffers) * layout.olo_num_comps, + gfp_flags); + if (!objio_seg) + return -ENOMEM; + + objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); + cur_comp = objio_seg->comps; + caps_p = (void *)(cur_comp + layout.olo_num_comps); + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) + copy_single_comp(cur_comp++, &src_comp, caps_p++); + if (unlikely(err)) + goto err; + + objio_seg->num_comps = layout.olo_num_comps; + objio_seg->comps_index = layout.olo_comps_index; + err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); + if (err) + goto err; + + objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; + if (layout.olo_map.odm_group_width) { + objio_seg->group_width = layout.olo_map.odm_group_width; + objio_seg->group_depth = layout.olo_map.odm_group_depth; + objio_seg->group_count = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1 / + objio_seg->group_width; + } else { + objio_seg->group_width = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1; + objio_seg->group_depth = -1; + objio_seg->group_count = 1; + } + + /* Cache this calculation it will hit for every page */ + objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - + objio_seg->stripe_unit) * + 
objio_seg->group_width; + + *outp = &objio_seg->lseg; + return 0; + +err: + kfree(objio_seg); + dprintk("%s: Error: return %d\n", __func__, err); + *outp = NULL; + return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ + int i; + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + + for (i = 0; i < objio_seg->num_comps; i++) { + if (!objio_seg->ods[i]) + break; + nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + } + kfree(objio_seg); +} + +int objio_alloc_io_state(struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + struct objio_state *ios; + const unsigned first_size = sizeof(*ios) + + objio_seg->num_comps * sizeof(ios->per_dev[0]); + const unsigned sec_size = objio_seg->num_comps * + sizeof(ios->ol_state.ioerrs[0]); + + ios = kzalloc(first_size + sec_size, gfp_flags); + if (unlikely(!ios)) + return -ENOMEM; + + ios->layout = objio_seg; + ios->ol_state.ioerrs = ((void *)ios) + first_size; + ios->ol_state.num_comps = objio_seg->num_comps; + + *outp = &ios->ol_state; + return 0; +} + +void objio_free_io_state(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + + kfree(ios); +} + +enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ + switch (oep) { + case OSD_ERR_PRI_NO_ERROR: + return (enum pnfs_osd_errno)0; + + case OSD_ERR_PRI_CLEAR_PAGES: + BUG_ON(1); + return 0; + + case OSD_ERR_PRI_RESOURCE: + return PNFS_OSD_ERR_RESOURCE; + case OSD_ERR_PRI_BAD_CRED: + return PNFS_OSD_ERR_BAD_CRED; + case OSD_ERR_PRI_NO_ACCESS: + return PNFS_OSD_ERR_NO_ACCESS; + case OSD_ERR_PRI_UNREACHABLE: + return PNFS_OSD_ERR_UNREACHABLE; + case OSD_ERR_PRI_NOT_FOUND: + return PNFS_OSD_ERR_NOT_FOUND; + case OSD_ERR_PRI_NO_SPACE: + return PNFS_OSD_ERR_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case OSD_ERR_PRI_EIO: + return PNFS_OSD_ERR_EIO; + } +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +static int _io_check(struct objio_state *ios, bool is_write) +{ + enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; + int lin_ret = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct osd_request *or = ios->per_dev[i].or; + int ret; + + if (!or) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + BUG_ON(is_write); + _clear_bio(ios->per_dev[i].bio); + dprintk("%s: start read offset passed end of file " + "offset=0x%llx, length=0x%lx\n", __func__, + _LLU(ios->per_dev[i].offset), + ios->per_dev[i].length); + + continue; /* we recovered */ + } + objlayout_io_set_result(&ios->ol_state, i, + &ios->layout->comps[i].oc_object_id, + osd_pri_2_pnfs_err(osi.osd_err_pri), + ios->per_dev[i].offset, + ios->per_dev[i].length, + is_write); + + if (osi.osd_err_pri >= oep) { + oep = osi.osd_err_pri; + lin_ret = ret; + } + } + + return lin_ret; +} + +/* + * Common IO state helpers. 
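+ *
+ * _io_free() releases the per-device osd_requests and bios built up for a
+ * single I/O, and _io_od() maps an absolute component index back to the
+ * osd_dev that was looked up when the layout segment was allocated.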
+ */ +static void _io_free(struct objio_state *ios) +{ + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct _objio_per_comp *per_dev = &ios->per_dev[i]; + + if (per_dev->or) { + osd_end_request(per_dev->or); + per_dev->or = NULL; + } + + if (per_dev->bio) { + bio_put(per_dev->bio); + per_dev->bio = NULL; + } + } +} + +struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) +{ + unsigned min_dev = ios->layout->comps_index; + unsigned max_dev = min_dev + ios->layout->num_comps; + + BUG_ON(dev < min_dev || max_dev <= dev); + return ios->layout->ods[dev - min_dev]->od; +} + +struct _striping_info { + u64 obj_offset; + u64 group_length; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, + struct _striping_info *si) +{ + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodU = file_offset - M * S; + u32 G = div64_u64(LmodU, T); + u64 H = LmodU - G * T; + + u32 N = div_u64(H, U); + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; + + si->group_length = T - H; +} + +static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, + unsigned pgbase, struct _objio_per_comp *per_dev, int len, + gfp_t gfp_flags) +{ + unsigned pg = *cur_pg; + int cur_len = len; + struct request_queue *q = + osd_request_queue(_io_od(ios, per_dev->dev)); + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + ios->layout->group_width; + + if (BIO_MAX_PAGES_KMALLOC < bio_size) + bio_size = BIO_MAX_PAGES_KMALLOC; + + per_dev->bio = bio_kmalloc(gfp_flags, bio_size); + if (unlikely(!per_dev->bio)) { + dprintk("Faild to allocate BIO size=%u\n", bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + BUG_ON(ios->ol_state.nr_pages <= pg); + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, + ios->ol_state.pages[pg], pglen, pgbase); + if (unlikely(pglen != added_len)) + return -ENOMEM; + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + per_dev->length += len; + *cur_pg = pg; + return 0; +} + +static int _prepare_one_group(struct objio_state *ios, u64 length, + struct _striping_info *si, unsigned *last_pg, + gfp_t gfp_flags) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = *last_pg; + int ret = 0; + + while (length) { + struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && + (page_off != ios->ol_state.pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + + if (max_comp < dev - first_dev) + max_comp = dev - first_dev; + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len, gfp_flags); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + ios->length += cur_len; + } +out: + ios->numdevs = max_comp + mirrors_p1; + *last_pg = cur_pg; + return ret; +} + +static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) +{ + u64 length = ios->ol_state.count; + u64 offset = ios->ol_state.offset; + struct _striping_info si; + unsigned last_pg = 0; + int ret = 0; + + while (length) { + _calc_stripe_info(ios, offset, &si); + + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); + if (unlikely(ret)) + goto out; + + offset += si.group_length; + length -= si.group_length; + } + +out: + if (!ios->length) + return ret; + + return 0; +} + +static ssize_t _sync_done(struct objio_state *ios) +{ + struct completion *waiting = ios->private; + + complete(waiting); + return 0; +} + +static void _last_io(struct kref *kref) +{ + struct objio_state *ios = container_of(kref, struct objio_state, kref); + + ios->done(ios); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct objio_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static ssize_t _io_exec(struct objio_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + ssize_t status = 0; /* sync status */ + unsigned i; + objio_done_fn saved_done_fn = ios->done; + bool sync = ios->ol_state.sync; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + + if (!or) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + + if (sync) { + wait_for_completion(&wait); + status = saved_done_fn(ios); + } + + return status; +} + +/* + * read + */ +static ssize_t _read_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, false); + + _io_free(ios); + + if (likely(!ret)) + status = ios->length; + else + status = ret; + + objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct osd_request *or = NULL; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + unsigned dev = per_dev->dev; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[cur_comp]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + int ret; + + or = osd_start_request(_io_od(ios, dev), 
GFP_KERNEL); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + +err: + return ret; +} + +static ssize_t _read_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _read_mirrors(ios, i); + if (unlikely(ret)) + goto err; + } + + ios->done = _read_done; + return _io_exec(ios); /* In sync mode exec returns the io status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + ret = _io_rw_pagelist(ios, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + return _read_exec(ios); +} + +/* + * write + */ +static ssize_t _write_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, true); + + _io_free(ios); + + if (likely(!ret)) { + /* FIXME: should be based on the OSD's persistence model + * See OSD2r05 Section 4.13 Data persistence model */ + ios->ol_state.committed = NFS_FILE_SYNC; + status = ios->length; + } else { + status = ret; + } + + objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret; + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct osd_request *or = NULL; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[cur_comp]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + struct bio *bio; + + or = osd_start_request(_io_od(ios, dev), GFP_NOFS); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_NOFS, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + dprintk("Faild to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto err; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->bio = bio; + per_dev->dev = dev; + per_dev->length = master_dev->length; + per_dev->offset = master_dev->offset; + } else { + bio = master_dev->bio; + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + } + +err: + return ret; +} + +static ssize_t _write_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _write_mirrors(ios, i); + if 
(unlikely(ret)) + goto err; + } + + ios->done = _write_done; + return _io_exec(ios); /* In sync mode exec returns the io->status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + /* TODO: ios->stable = stable; */ + ret = _io_rw_pagelist(ios, GFP_NOFS); + if (unlikely(ret)) + return ret; + + return _write_exec(ios); +} + +static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + if (!pnfs_generic_pg_test(pgio, prev, req)) + return false; + + if (pgio->pg_lseg == NULL) + return true; + + return pgio->pg_count + req->wb_bytes <= + OBJIO_LSEG(pgio->pg_lseg)->max_io_size; +} + +static struct pnfs_layoutdriver_type objlayout_type = { + .id = LAYOUT_OSD2_OBJECTS, + .name = "LAYOUT_OSD2_OBJECTS", + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, + + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, + + .alloc_lseg = objlayout_alloc_lseg, + .free_lseg = objlayout_free_lseg, + + .read_pagelist = objlayout_read_pagelist, + .write_pagelist = objlayout_write_pagelist, + .pg_test = objio_pg_test, + + .free_deviceid_node = objio_free_deviceid_node, + + .encode_layoutcommit = objlayout_encode_layoutcommit, + .encode_layoutreturn = objlayout_encode_layoutreturn, +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy "); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ + int ret = pnfs_register_layoutdriver(&objlayout_type); + + if (ret) + printk(KERN_INFO + "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", + __func__, ret); + else + printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", + __func__); + return ret; +} + +static void __exit +objlayout_exit(void) +{ + pnfs_unregister_layoutdriver(&objlayout_type); + printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", + __func__); +} + +module_init(objlayout_init); +module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c new file mode 100644 index 00000000..fefa1224 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,716 @@ +/* + * pNFS Objects layout driver high level definitions + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + if (objlay) { + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + } + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + WARN_ON(!list_empty(&objlay->err_list)); + kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + int status = -ENOMEM; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + + scratch = alloc_page(gfp_flags); + if (!scratch) + goto err_nofree; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); + if (unlikely(status)) { + dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, + status); + goto err; + } + + __free_page(scratch); + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + +err: + __free_page(scratch); +err_nofree: + dprintk("%s: Err Return=>%d\n", __func__, status); + return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objio_free_lseg(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? 
end - 1 : NFS4_MAX_UINT64; +} + +static struct objlayout_io_state * +objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct page **pages, + unsigned pgbase, + loff_t offset, + size_t count, + struct pnfs_layout_segment *lseg, + void *rpcdata, + gfp_t gfp_flags) +{ + struct objlayout_io_state *state; + u64 lseg_end_offset; + + dprintk("%s: allocating io_state\n", __func__); + if (objio_alloc_io_state(lseg, &state, gfp_flags)) + return NULL; + + BUG_ON(offset < lseg->pls_range.offset); + lseg_end_offset = end_offset(lseg->pls_range.offset, + lseg->pls_range.length); + BUG_ON(offset >= lseg_end_offset); + if (offset + count > lseg_end_offset) { + count = lseg->pls_range.length - + (offset - lseg->pls_range.offset); + dprintk("%s: truncated count %Zd\n", __func__, count); + } + + if (pgbase > PAGE_SIZE) { + pages += pgbase >> PAGE_SHIFT; + pgbase &= ~PAGE_MASK; + } + + INIT_LIST_HEAD(&state->err_list); + state->lseg = lseg; + state->rpcdata = rpcdata; + state->pages = pages; + state->pgbase = pgbase; + state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + state->offset = offset; + state->count = count; + state->sync = 0; + + return state; +} + +static void +objlayout_free_io_state(struct objlayout_io_state *state) +{ + dprintk("%s: freeing io_state\n", __func__); + if (unlikely(!state)) + return; + + objio_free_io_state(state); +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_state *state) +{ + dprintk("%s: state %p status\n", __func__, state); + + if (likely(state->status >= 0)) { + objlayout_free_io_state(state); + } else { + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; + list_add(&objlay->err_list, &state->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. + */ +void +objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, + struct pnfs_osd_objid *pooid, int osd_error, + u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + + BUG_ON(index >= state->num_comps); + if (osd_error) { + ioerr->oer_component = *pooid; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). 
+ * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + + pnfs_ld_read_done(rdata); +} + +void +objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +{ + int eof = state->eof; + struct nfs_read_data *rdata; + + state->status = status; + dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); + rdata = state->rpcdata; + rdata->task.tk_status = status; + if (likely(status >= 0)) { + rdata->res.count = status; + rdata->res.eof = eof; + } else { + rdata->pnfs_error = status; + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_read_done(rdata); + else { + INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); + schedule_work(&rdata->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. + */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_read_data *rdata) +{ + loff_t offset = rdata->args.offset; + size_t count = rdata->args.count; + struct objlayout_io_state *state; + ssize_t status = 0; + loff_t eof; + + dprintk("%s: Begin inode %p offset %llu count %d\n", + __func__, rdata->inode, offset, (int)count); + + eof = i_size_read(rdata->inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + status = 0; + rdata->res.count = 0; + rdata->res.eof = 1; + goto out; + } + count = eof - offset; + } + + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->args.pages, rdata->args.pgbase, + offset, count, + rdata->lseg, rdata, + GFP_KERNEL); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->eof = state->offset + state->count >= eof; + + status = objio_read_pagelist(state); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + rdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + + pnfs_ld_write_done(wdata); +} + +void +objlayout_write_done(struct objlayout_io_state *state, ssize_t status, + bool sync) +{ + struct nfs_write_data *wdata; + + dprintk("%s: Begin\n", __func__); + wdata = state->rpcdata; + state->status = status; + wdata->task.tk_status = status; + if (likely(status >= 0)) { + wdata->res.count = status; + wdata->verf.committed = state->committed; + dprintk("%s: Return status %d committed %d\n", + __func__, wdata->task.tk_status, + wdata->verf.committed); + } else { + wdata->pnfs_error = status; + dprintk("%s: Return status %d\n", + __func__, wdata->task.tk_status); + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_write_done(wdata); + else { + INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); + schedule_work(&wdata->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. 
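+ *
+ * Whether we wait for completion here is decided by FLUSH_SYNC in @how;
+ * FLUSH_STABLE is passed down to the io engine as the "stable" argument
+ * of objio_write_pagelist().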
+ */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_write_data *wdata, + int how) +{ + struct objlayout_io_state *state; + ssize_t status; + + dprintk("%s: Begin inode %p offset %llu count %u\n", + __func__, wdata->inode, wdata->args.offset, wdata->args.count); + + state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->args.pages, + wdata->args.pgbase, + wdata->args.offset, + wdata->args.count, + wdata->lseg, wdata, + GFP_NOFS); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->sync = how & FLUSH_SYNC; + + status = objio_write_pagelist(state, how & FLUSH_STABLE); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + wdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ + struct 
objlayout_io_state *state, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } + + pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_state *state, *tmp; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + __be32 *last_xdr = NULL, *p; + unsigned i; + int res = 0; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + p = pnfs_osd_xdr_ioerr_reserve_space(xdr); + if (unlikely(!p)) { + res = -E2BIG; + break; /* accumulated_error */ + } + + last_xdr = p; + pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + } + + /* TODO: use xdr_write_pages */ + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(!last_xdr); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. + */ + encode_accumulated_error(objlay, last_xdr); + goto loop_done; + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + + +/* + * Get Device Info API for io engines + */ +struct objlayout_deviceinfo { + struct page *page; + struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. 
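+ * The decoded strings still point into the page holding the GETDEVICEINFO
+ * reply, so that page (and the wrapping objlayout_deviceinfo) is only
+ * released by objlayout_put_deviceinfo().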
+ */ +int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags) +{ + struct objlayout_deviceinfo *odi; + struct pnfs_device pd; + struct super_block *sb; + struct page *page, **pages; + u32 *p; + int err; + + page = alloc_page(gfp_flags); + if (!page) + return -ENOMEM; + + pages = &page; + pd.pages = pages; + + memcpy(&pd.dev_id, d_id, sizeof(*d_id)); + pd.layout_type = LAYOUT_OSD2_OBJECTS; + pd.pages = &page; + pd.pgbase = 0; + pd.pglen = PAGE_SIZE; + pd.mincount = 0; + + sb = pnfslay->plh_inode->i_sb; + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); + dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); + if (err) + goto err_out; + + p = page_address(page); + odi = kzalloc(sizeof(*odi), gfp_flags); + if (!odi) { + err = -ENOMEM; + goto err_out; + } + pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); + odi->page = page; + *deviceaddr = &odi->da; + return 0; + +err_out: + __free_page(page); + return err; +} + +void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) +{ + struct objlayout_deviceinfo *odi = container_of(deviceaddr, + struct objlayout_deviceinfo, + da); + + __free_page(odi->page); + kfree(odi); +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h new file mode 100644 index 00000000..a8244c8e --- /dev/null +++ b/fs/nfs/objlayout/objlayout.h @@ -0,0 +1,187 @@ +/* + * Data types and function declerations for interfacing with the + * pNFS standard object layout driver. + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include +#include +#include "../pnfs.h" + +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; + + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_state { + struct pnfs_layout_segment *lseg; + + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + + void *rpcdata; + int status; /* res */ + int eof; /* res */ + int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. + */ + struct pnfs_osd_ioerr *ioerrs; +}; + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +extern int objio_alloc_io_state( + struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags); +extern void objio_free_io_state(struct objlayout_io_state *state); + +extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); +extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, + bool stable); + +/* + * callback API + */ +extern void objlayout_io_set_result(struct objlayout_io_state *state, + unsigned index, struct pnfs_osd_objid *pooid, + int osd_error, u64 offset, u64 length, bool is_write); + +static inline void +objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +{ + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + /* If one of the I/Os errored out and the delta_space_used was + * invalid we render the complete report as invalid. Protocol mandate + * the DSU be accurate or not reported. 
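+ * OBJ_DSU_INVALID is set by objlayout_iodone() when an I/O fails and is
+ * only reset to OBJ_DSU_INIT when the next layoutcommit is encoded.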
+ */ + spin_lock(&objlay->lock); + if (objlay->delta_space_valid != OBJ_DSU_INVALID) { + objlay->delta_space_valid = OBJ_DSU_VALID; + objlay->delta_space_used += space_used; + } + spin_unlock(&objlay->lock); +} + +extern void objlayout_read_done(struct objlayout_io_state *state, + ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_state *state, + ssize_t status, bool sync); + +extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags); +extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); + +/* + * exported generic objects function vectors + */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + +extern struct pnfs_layout_segment *objlayout_alloc_lseg( + struct pnfs_layout_hdr *, + struct nfs4_layoutget_res *, + gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +extern enum pnfs_try_status objlayout_read_pagelist( + struct nfs_read_data *); + +extern enum pnfs_try_status objlayout_write_pagelist( + struct nfs_write_data *, + int how); + +extern void objlayout_encode_layoutcommit( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutcommit_args *); + +extern void objlayout_encode_layoutreturn( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutreturn_args *); + +#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 00000000..b3918f7a --- /dev/null +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,415 @@ +/* + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ + p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, + sizeof(objid->oid_device_id.data)); + + p = xdr_decode_hyper(p, &objid->oid_partition_id); + p = xdr_decode_hyper(p, &objid->oid_object_id); + return p; +} +/* + * struct pnfs_osd_opaque_cred { + * u32 cred_len; + * void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 1); + + if (!p) + return -EINVAL; + + opaque_cred->cred_len = be32_to_cpu(*p++); + + p = xdr_inline_decode(xdr, opaque_cred->cred_len); + if (!p) + return -EINVAL; + + opaque_cred->cred = p; + return 0; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); + int ret; + + if (!p) + return -EIO; + + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p); + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); + if (unlikely(ret)) + return ret; + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); + return ret; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ + return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ + data_map->odm_num_comps = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); + data_map->odm_group_width = be32_to_cpup(p++); + data_map->odm_group_depth = be32_to_cpup(p++); + data_map->odm_mirror_cnt = be32_to_cpup(p++); + data_map->odm_raid_algorithm = be32_to_cpup(p++); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, + 
struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ + __be32 *p; + + memset(iter, 0, sizeof(*iter)); + + p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); + if (unlikely(!p)) + return -EINVAL; + + p = _osd_xdr_decode_data_map(p, &layout->olo_map); + layout->olo_comps_index = be32_to_cpup(p++); + layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + + iter->total_comps = layout->olo_num_comps; + return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, + int *err) +{ + BUG_ON(iter->decoded_comps > iter->total_comps); + if (iter->decoded_comps == iter->total_comps) + return false; + + *err = _osd_xdr_decode_object_cred(comp, xdr); + if (unlikely(*err)) { + dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " + "total_comps=%d\n", __func__, *err, + iter->decoded_comps, iter->total_comps); + return false; /* stop the loop */ + } + dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + + iter->decoded_comps++; + return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + * variable strings fields are left inside the rpc buffer and are only + * pointed to by the pnfs_osd_deviceaddr members. So the read buffer + * should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + * unsigned int len; + * char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ + str->len = be32_to_cpup(p++); + str->data = (char *)p; + + p += XDR_QUADLEN(str->len); + return p; +} + +/* + * struct pnfs_osd_targetid { + * u32 oti_type; + * struct nfs4_string oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ + u32 oti_type; + + oti_type = be32_to_cpup(p++); + targetid->oti_type = oti_type; + + switch (oti_type) { + case OBJ_TARGET_SCSI_NAME: + case OBJ_TARGET_SCSI_DEVICE_ID: + p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); + } + + return p; +} + +/* + * struct pnfs_osd_net_addr { + * struct nfs4_string r_netid; + * struct nfs4_string r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ + p = __read_u8_opaque(p, &netaddr->r_netid); + p = __read_u8_opaque(p, &netaddr->r_addr); + + return p; +} + +/* + * struct pnfs_osd_targetaddr { + * u32 ota_available; + * struct pnfs_osd_net_addr ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ + u32 ota_available; + + ota_available = be32_to_cpup(p++); + targetaddr->ota_available = ota_available; + + if (ota_available) + p = __read_net_addr(p, &targetaddr->ota_netaddr); + + + return p; +} + +/* + * struct pnfs_osd_deviceaddr { + * struct pnfs_osd_targetid oda_targetid; + * struct pnfs_osd_targetaddr oda_targetaddr; + * u8 oda_lun[8]; + * struct nfs4_string oda_systemid; + * struct pnfs_osd_object_cred oda_root_obj_cred; + * struct nfs4_string 
oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + opaque_cred->cred_len = be32_to_cpu(*p++); + opaque_cred->cred = p; + return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p++); + + p = __read_opaque_cred(p, &comp->oc_cap_key); + p = __read_opaque_cred(p, &comp->oc_cap); + return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ + p = __read_targetid(p, &deviceaddr->oda_targetid); + + p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + + p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, + sizeof(deviceaddr->oda_lun)); + + p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + + p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + + p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + + /* libosd likes this terminated in dbg. It's last, so no problems */ + deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; xdr size 4 + 8 + 4 + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return p; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ + p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32 + 24); + if (unlikely(!p)) + dprintk("%s: out of xdr space\n", __func__); + + return p; +} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c new file mode 100644 index 00000000..00985571 --- /dev/null +++ b/fs/nfs/pagelist.c @@ -0,0 +1,453 @@ +/* + * linux/fs/nfs/pagelist.c + * + * A set of helper functions for managing NFS read and write requests. + * The main purpose of these routines is to provide support for the + * coalescing of several requests into a single RPC call. 
+ * + * Copyright 2000, 2001 (c) Trond Myklebust + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "pnfs.h" + +static struct kmem_cache *nfs_page_cachep; + +static inline struct nfs_page * +nfs_page_alloc(void) +{ + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + if (p) + INIT_LIST_HEAD(&p->wb_list); + return p; +} + +static inline void +nfs_page_free(struct nfs_page *p) +{ + kmem_cache_free(nfs_page_cachep, p); +} + +/** + * nfs_create_request - Create an NFS read/write request. + * @file: file descriptor to use + * @inode: inode to which the request is attached + * @page: page to write + * @offset: starting offset within the page for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page * +nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + /* try to allocate the request struct */ + req = nfs_page_alloc(); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + /* get lock context early so we can deal with alloc failures */ + req->wb_lock_context = nfs_get_lock_context(ctx); + if (req->wb_lock_context == NULL) { + nfs_page_free(req); + return ERR_PTR(-ENOMEM); + } + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. This will be adjusted in + * update_nfs_request below if the region is not locked. */ + req->wb_page = page; + atomic_set(&req->wb_complete, 0); + req->wb_index = page->index; + page_cache_get(page); + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(page->mapping->host != inode); + req->wb_offset = offset; + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); + kref_init(&req->wb_kref); + return req; +} + +/** + * nfs_unlock_request - Unlock request and wake up sleepers. + * @req: + */ +void nfs_unlock_request(struct nfs_page *req) +{ + if (!NFS_WBACK_BUSY(req)) { + printk(KERN_ERR "NFS: Invalid unlock attempted\n"); + BUG(); + } + smp_mb__before_clear_bit(); + clear_bit(PG_BUSY, &req->wb_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&req->wb_flags, PG_BUSY); + nfs_release_request(req); +} + +/** + * nfs_set_page_tag_locked - Tag a request as locked + * @req: + */ +int nfs_set_page_tag_locked(struct nfs_page *req) +{ + if (!nfs_lock_request_dontget(req)) + return 0; + if (test_bit(PG_MAPPED, &req->wb_flags)) + radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + return 1; +} + +/** + * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers + */ +void nfs_clear_page_tag_locked(struct nfs_page *req) +{ + if (test_bit(PG_MAPPED, &req->wb_flags)) { + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + nfs_unlock_request(req); + spin_unlock(&inode->i_lock); + } else + nfs_unlock_request(req); +} + +/* + * nfs_clear_request - Free up all resources allocated to the request + * @req: + * + * Release page and open context resources associated with a read/write + * request after it has completed. 
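+ * Called from nfs_free_request() once the last kref on the request is
+ * dropped; each resource pointer is cleared after it is released.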
+ */ +static void nfs_clear_request(struct nfs_page *req) +{ + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } + if (l_ctx != NULL) { + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } +} + + +/** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release + * + * Note: Should never be called with the spinlock held! + */ +static void nfs_free_request(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + + /* Release struct file and open context */ + nfs_clear_request(req); + nfs_page_free(req); +} + +void nfs_release_request(struct nfs_page *req) +{ + kref_put(&req->wb_kref, nfs_free_request); +} + +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + +/** + * nfs_wait_on_request - Wait for a request to complete. + * @req: request to wait upon. + * + * Interruptible by fatal signals only. + * The user is responsible for holding a count on the request. + */ +int +nfs_wait_on_request(struct nfs_page *req) +{ + return wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +{ + /* + * FIXME: ideally we should be able to coalesce all requests + * that are not block boundary aligned, but currently this + * is problematic for the case of bsize < PAGE_CACHE_SIZE, + * since nfs_flush_multi and nfs_pagein_multi assume you + * can have only one struct nfs_page. + */ + if (desc->pg_bsize < PAGE_SIZE) + return 0; + + return desc->pg_count + req->wb_bytes <= desc->pg_bsize; +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); + +/** + * nfs_pageio_init - initialise a page io descriptor + * @desc: pointer to descriptor + * @inode: pointer to inode + * @doio: pointer to io function + * @bsize: io block size + * @io_flags: extra parameters for the io function + */ +void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct nfs_pageio_descriptor *), + size_t bsize, + int io_flags) +{ + INIT_LIST_HEAD(&desc->pg_list); + desc->pg_bytes_written = 0; + desc->pg_count = 0; + desc->pg_bsize = bsize; + desc->pg_base = 0; + desc->pg_moreio = 0; + desc->pg_inode = inode; + desc->pg_doio = doio; + desc->pg_ioflags = io_flags; + desc->pg_error = 0; + desc->pg_lseg = NULL; + desc->pg_test = nfs_generic_pg_test; + pnfs_pageio_init(desc, inode); +} + +/** + * nfs_can_coalesce_requests - test two requests for compatibility + * @prev: pointer to nfs_page + * @req: pointer to nfs_page + * + * The nfs_page structures 'prev' and 'req' are compared to ensure that the + * page data area they describe is contiguous, and that their RPC + * credentials, NFSv4 open state, and lockowners are the same. + * + * Return 'true' if this is the case, else return 'false'. 
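+ * For example, two requests covering whole pages at consecutive page
+ * indices from the same open context can be coalesced; a request from
+ * a different lock owner, or one that does not start at offset 0
+ * within its page, cannot.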
+ */ +static bool nfs_can_coalesce_requests(struct nfs_page *prev, + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) +{ + if (req->wb_context->cred != prev->wb_context->cred) + return false; + if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return false; + if (req->wb_context->state != prev->wb_context->state) + return false; + if (req->wb_index != (prev->wb_index + 1)) + return false; + if (req->wb_pgbase != 0) + return false; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return false; + return pgio->pg_test(pgio, prev, req); +} + +/** + * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list. + * @desc: destination io descriptor + * @req: request + * + * Returns true if the request 'req' was successfully coalesced into the + * existing list of pages 'desc'. + */ +static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + if (desc->pg_count != 0) { + struct nfs_page *prev; + + prev = nfs_list_entry(desc->pg_list.prev); + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else { + desc->pg_base = req->wb_pgbase; + } + nfs_list_remove_request(req); + nfs_list_add_request(req, &desc->pg_list); + desc->pg_count += req->wb_bytes; + return 1; +} + +/* + * Helper for nfs_pageio_add_request and nfs_pageio_complete + */ +static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) +{ + if (!list_empty(&desc->pg_list)) { + int error = desc->pg_doio(desc); + if (error < 0) + desc->pg_error = error; + else + desc->pg_bytes_written += desc->pg_count; + } + if (list_empty(&desc->pg_list)) { + desc->pg_count = 0; + desc->pg_base = 0; + } +} + +/** + * nfs_pageio_add_request - Attempt to coalesce a request into a page list. + * @desc: destination io descriptor + * @req: request + * + * Returns true if the request 'req' was successfully coalesced into the + * existing list of pages 'desc'. + */ +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + while (!nfs_pageio_do_add_request(desc, req)) { + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + desc->pg_moreio = 0; + } + return 1; +} + +/** + * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor + * @desc: pointer to io descriptor + */ +void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) +{ + nfs_pageio_doio(desc); +} + +/** + * nfs_pageio_cond_complete - Conditional I/O completion + * @desc: pointer to io descriptor + * @index: page index + * + * It is important to ensure that processes don't try to take locks + * on non-contiguous ranges of pages as that might deadlock. This + * function should be called before attempting to wait on a locked + * nfs_page. It will complete the I/O if the page index 'index' + * is not contiguous with the existing list of pages in 'desc'. + */ +void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) +{ + if (!list_empty(&desc->pg_list)) { + struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); + if (index != prev->wb_index + 1) + nfs_pageio_doio(desc); + } +} + +#define NFS_SCAN_MAXENTRIES 16 +/** + * nfs_scan_list - Scan a list for matching requests + * @nfsi: NFS inode + * @dst: Destination list + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space + * starting at index idx_start, is scanned. + * The requests are *not* checked to ensure that they form a contiguous set. + * You must be holding the inode's i_lock when calling this function + */ +int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, + unsigned int npages, int tag) +{ + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; + pgoff_t idx_end; + int found, i; + int res; + struct list_head *list; + + res = 0; + if (npages == 0) + idx_end = ~0; + else + idx_end = idx_start + npages - 1; + + for (;;) { + found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, + (void **)&pgvec[0], idx_start, + NFS_SCAN_MAXENTRIES, tag); + if (found <= 0) + break; + for (i = 0; i < found; i++) { + req = pgvec[i]; + if (req->wb_index > idx_end) + goto out; + idx_start = req->wb_index + 1; + if (nfs_set_page_tag_locked(req)) { + kref_get(&req->wb_kref); + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + list = pnfs_choose_commit_list(req, dst); + nfs_list_add_request(req, list); + res++; + if (res == INT_MAX) + goto out; + } + } + /* for latency reduction */ + cond_resched_lock(&nfsi->vfs_inode.i_lock); + } +out: + return res; +} + +int __init nfs_init_nfspagecache(void) +{ + nfs_page_cachep = kmem_cache_create("nfs_page", + sizeof(struct nfs_page), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_page_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_nfspagecache(void) +{ + kmem_cache_destroy(nfs_page_cachep); +} + diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c new file mode 100644 index 00000000..99518872 --- /dev/null +++ b/fs/nfs/pnfs.c @@ -0,0 +1,1319 @@ +/* + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include "internal.h" +#include "pnfs.h" +#include "iostat.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. 
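+ *
+ * Per-inode layout state (the pnfs_layout_hdr and its segments) is
+ * protected by the owning inode's i_lock; the clp->cl_layouts list is
+ * protected by the client's cl_lock.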
+ */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + spin_unlock(&pnfs_spinlock); + return local; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (nfss->pnfs_curr_ld) + module_put(nfss->pnfs_curr_ld->owner); + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, + id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + if (!try_module_get(ld_type->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; + + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "%s id 0 is reserved\n", __func__); + return status; + } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "%s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +/* + * pNFS client layout cache + */ + +/* Need to hold i_lock if caller does not already hold reference */ +void +get_layout_hdr(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_refcount); +} + +static struct pnfs_layout_hdr * 
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) : + kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); +} + +static void +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + put_rpccred(lo->plh_lc_cred); + return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); +} + +static void +destroy_layout_hdr(struct pnfs_layout_hdr *lo) +{ + dprintk("%s: freeing layout cache %p\n", __func__, lo); + BUG_ON(!list_empty(&lo->plh_layouts)); + NFS_I(lo->plh_inode)->layout = NULL; + pnfs_free_layout_hdr(lo); +} + +static void +put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + if (atomic_dec_and_test(&lo->plh_refcount)) + destroy_layout_hdr(lo); +} + +void +put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + destroy_layout_hdr(lo); + spin_unlock(&inode->i_lock); + } +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->pls_list); + INIT_LIST_HEAD(&lseg->pls_lc_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); + lseg->pls_layout = lo; +} + +static void free_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *ino = lseg->pls_layout->plh_inode; + + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + /* Matched by get_layout_hdr in pnfs_insert_layout */ + put_layout_hdr(NFS_I(ino)->layout); +} + +static void +put_lseg_common(struct pnfs_layout_segment *lseg) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + + WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + /* Matched by initial refcount set in alloc_init_layout_hdr */ + put_layout_hdr_locked(lseg->pls_layout); + } + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +void +put_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *inode; + + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + inode = lseg->pls_layout->plh_inode; + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + LIST_HEAD(free_me); + + put_lseg_common(lseg); + list_add(&lseg->pls_list, &free_me); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); + } +} +EXPORT_SYMBOL_GPL(put_lseg); + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1 : NFS4_MAX_UINT64; +} + +/* + * is l2 fully contained in l1? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_contained(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (start1 <= start2) && (end1 >= end2); +} + +/* + * is l1 and l2 intersecting? 
+ * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_intersecting(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (end1 == NFS4_MAX_UINT64 || end1 > start2) && + (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + +static bool +should_free_lseg(struct pnfs_layout_range *lseg_range, + struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + lo_seg_intersecting(lseg_range, recall_range); +} + +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + int rv = 0; + + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. + */ + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + put_lseg_common(lseg); + list_add(&lseg->pls_list, tmp_list); + rv = 1; + } + } + return rv; +} + +/* Returns count of number of matching invalid lsegs remaining in list + * after call. + */ +int +mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range) +{ + struct pnfs_layout_segment *lseg, *next; + int invalid = 0, removed = 0; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) { + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) + put_layout_hdr_locked(lo); + return 0; + } + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (!recall_range || + should_free_lseg(&lseg->pls_range, recall_range)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); + } + dprintk("%s:Return %i\n", __func__, invalid - removed); + return invalid - removed; +} + +/* note free_me must contain lsegs from a single layout_hdr */ +void +pnfs_free_lseg_list(struct list_head *free_me) +{ + struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_hdr *lo; + + if (list_empty(free_me)) + return; + + lo = list_first_entry(free_me, struct pnfs_layout_segment, + pls_list)->pls_layout; + + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { + struct nfs_client *clp; + + clp = NFS_SERVER(lo->plh_inode)->nfs_client; + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + free_lseg(lseg); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + } + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. 
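+ * The layout headers are spliced off clp->cl_layouts under cl_lock and
+ * then torn down one inode at a time via pnfs_destroy_layout().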
+ */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&clp->cl_lock); + list_splice_init(&clp->cl_layouts, &tmp_list); + spin_unlock(&clp->cl_lock); + + while (!list_empty(&tmp_list)) { + lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, + plh_layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + list_del_init(&lo->plh_layouts); + pnfs_destroy_layout(NFS_I(lo->plh_inode)); + } +} + +/* update lo->plh_stateid with new if is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) { + memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); + if (update_barrier) { + u32 new_barrier = be32_to_cpu(new->stateid.seqid); + + if ((int)(new_barrier - lo->plh_barrier)) + lo->plh_barrier = new_barrier; + } else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. It needs to be + * within 2**31 to count as "behind", so if it + * gets too near that limit, give us a litle leeway + * and bring it to within 2**30. + * NOTE - and yes, this is all unsigned arithmetic. + */ + if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) + lo->plh_barrier = newseq - (1 << 30); + } + } +} + +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + int lget) +{ + if ((stateid) && + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) + return true; + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); +} + +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +{ + int status = 0; + + dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (pnfs_layoutgets_blocked(lo, NULL, 1)) { + status = -EAGAIN; + } else if (list_empty(&lo->plh_segs)) { + int seq; + + do { + seq = read_seqbegin(&open_state->seqlock); + memcpy(dst->data, open_state->stateid.data, + sizeof(open_state->stateid.data)); + } while (read_seqretry(&open_state->seqlock, seq)); + } else + memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); + spin_unlock(&lo->plh_inode->i_lock); + dprintk("<-- %s\n", __func__); + return status; +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. 
+* arg->offset: 0 +* arg->length: all ones +*/ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + struct pnfs_layout_range *range, + gfp_t gfp_flags) +{ + struct inode *ino = lo->plh_inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg = NULL; + struct page **pages = NULL; + int i; + u32 max_resp_sz, max_pages; + + dprintk("--> %s\n", __func__); + + BUG_ON(ctx == NULL); + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return NULL; + + /* allocate pages for xdr post processing */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); + if (!pages) + goto out_err_free; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_err_free; + } + + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->args.layout.pages = pages; + lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->lsegpp = &lseg; + lgp->gfp_flags = gfp_flags; + + /* Synchronously retrieve layout information from server and + * store in lseg. + */ + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + } + + /* free xdr pages */ + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + + return lseg; + +out_err_free: + /* free any allocated xdr pages, lgp as it's not used */ + if (pages) { + for (i = 0; i < max_pages; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); + } + kfree(lgp); + return NULL; +} + +/* Initiates a LAYOUTRETURN(FILE) */ +int +_pnfs_return_layout(struct inode *ino) +{ + struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); + LIST_HEAD(tmp_list); + struct nfs4_layoutreturn *lrp; + nfs4_stateid stateid; + int status = 0; + + dprintk("--> %s\n", __func__); + + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + dprintk("%s: no layout to return\n", __func__); + return status; + } + stateid = nfsi->layout->plh_stateid; + /* Reference matched in nfs4_layoutreturn_release */ + get_layout_hdr(lo); + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + lo->plh_block_lgets++; + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + + WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); + + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); + set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + put_layout_hdr(lo); + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->clp = NFS_SERVER(ino)->nfs_client; + + status = nfs4_proc_layoutreturn(lrp); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + +bool pnfs_roc(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + LIST_HEAD(tmp_list); + bool found = false; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, 
&lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_nolayout; + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_nolayout; + lo->plh_block_lgets++; + get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + return true; + +out_nolayout: + spin_unlock(&ino->i_lock); + return false; +} + +void pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if ((int)(barrier - lo->plh_barrier) > 0) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_segment *lseg; + bool found = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + found = true; + break; + } + if (!found) { + struct pnfs_layout_hdr *lo = nfsi->layout; + u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); + } + spin_unlock(&ino->i_lock); + return found; +} + +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. 
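+ * The resulting order is: ascending offset, then longer segments before
+ * shorter ones, then RW before READ; pnfs_insert_layout() walks the list
+ * and places a new segment in front of the first entry it does not sort
+ * after.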
+ */ +static s64 +cmp_layout(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + s64 d; + + /* high offset > low offset */ + d = l1->offset - l2->offset; + if (d) + return d; + + /* short length > long length */ + d = l2->length - l1->length; + if (d) + return d; + + /* read > read/write */ + return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); +} + +static void +pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lp, &lo->plh_segs, pls_list) { + if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) + continue; + list_add_tail(&lseg->pls_list, &lp->pls_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length, + lp, lp->pls_range.iomode, lp->pls_range.offset, + lp->pls_range.length); + goto out; + } + list_add_tail(&lseg->pls_list, &lo->plh_segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); +out: + get_layout_hdr(lo); + + dprintk("%s:Return\n", __func__); +} + +static struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct pnfs_layout_hdr *lo; + + lo = pnfs_alloc_layout_hdr(ino, gfp_flags); + if (!lo) + return NULL; + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_bulk_recall); + lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + assert_spin_locked(&ino->i_lock); + if (nfsi->layout) { + if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) + return NULL; + else + return nfsi->layout; + } + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino, ctx, gfp_flags); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) /* Won the race? 
*/ + nfsi->layout = new; + else + pnfs_free_layout_hdr(new); + return nfsi->layout; +} + +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static int +is_matching_lseg(struct pnfs_layout_range *ls_range, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_range range1; + + if ((range->iomode == IOMODE_RW && + ls_range->iomode != IOMODE_RW) || + !lo_seg_intersecting(ls_range, range)) + return 0; + + /* range1 covers only the first byte in the range */ + range1 = *range; + range1.length = 1; + return lo_seg_contained(ls_range, &range1); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_find_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + is_matching_lseg(&lseg->pls_range, range)) { + ret = get_lseg(lseg); + break; + } + if (lseg->pls_range.offset > range->offset) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + return ret; +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. + */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags) +{ + struct pnfs_layout_range arg = { + .iomode = iomode, + .offset = pos, + .length = count, + }; + unsigned pg_offset; + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + bool first = false; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + return NULL; + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); + if (lo == NULL) { + dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + goto out_unlock; + } + + /* Do we even need to bother with this? */ + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s matches recall, use MDS\n", __func__); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + goto out_unlock; + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + + if (pnfs_layoutgets_blocked(lo, NULL, 0)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + + get_layout_hdr(lo); + if (list_empty(&lo->plh_segs)) + first = true; + spin_unlock(&ino->i_lock); + if (first) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. 
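+ * If the subsequent LAYOUTGET fails, the header is removed from the
+ * list again further below.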
+ */ + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->plh_layouts)); + list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + + pg_offset = arg.offset & ~PAGE_CACHE_MASK; + if (pg_offset) { + arg.offset -= pg_offset; + arg.length += pg_offset; + } + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); + + lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + if (!lseg && first) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } + atomic_dec(&lo->plh_outstanding); + put_layout_hdr(lo); +out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out; +} + +int +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->plh_inode; + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } + + if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } + init_lseg(lo, lseg); + lseg->pls_range = res->range; + *lgp->lsegpp = get_lseg(lseg); + pnfs_insert_layout(lo, lseg); + + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid, false); + spin_unlock(&ino->i_lock); +out: + return status; + +out_forget_reply: + spin_unlock(&ino->i_lock); + lseg->pls_layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + goto out; +} + +bool +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + enum pnfs_iomode access_type; + gfp_t gfp_flags; + + /* We assume that pg_ioflags == 0 iff we're reading a page */ + if (pgio->pg_ioflags == 0) { + access_type = IOMODE_READ; + gfp_flags = GFP_KERNEL; + } else { + access_type = IOMODE_RW; + gfp_flags = GFP_NOFS; + } + + if (pgio->pg_lseg == NULL) { + if (pgio->pg_count != prev->wb_bytes) + return true; + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + req_offset(prev), + pgio->pg_count, + access_type, + gfp_flags); + if (pgio->pg_lseg == NULL) + return true; + } + + /* + * Test if a nfs_page is fully contained in the pnfs_layout_range. + * Note that this test makes several assumptions: + * - that the previous nfs_page in the struct nfs_pageio_descriptor + * is known to lie within the range. + * - that the nfs_page being tested is known to be contiguous with the + * previous nfs_page. + * - Layout ranges are page aligned, so we only have to test the + * start offset of the request. 
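+ * For example (with purely illustrative numbers): given a layout range
+ * covering [0, 1MB), a request starting at offset 512K passes this
+ * test, while one starting at offset 1MB does not.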
+ * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? + * + */ + return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); + +/* + * Called by non rpc-based layout drivers + */ +int +pnfs_ld_write_done(struct nfs_write_data *data) +{ + int status; + + if (!data->pnfs_error) { + pnfs_set_layoutcommit(data); + data->mds_ops->rpc_call_done(&data->task, data); + data->mds_ops->rpc_release(data); + return 0; + } + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + /* Don't lo_commit on error, Server will needs to + * preform a file recovery. + */ + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags); + pnfs_return_layout(data->inode); + } + + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, + data->pnfs_error); + status = nfs_initiate_write(data, NFS_CLIENT(data->inode), + data->mds_ops, NFS_FILE_SYNC); + return status ? : -EAGAIN; +} +EXPORT_SYMBOL_GPL(pnfs_ld_write_done); + +enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *wdata, + const struct rpc_call_ops *call_ops, int how) +{ + struct inode *inode = wdata->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + wdata->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + + trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(wdata->lseg); + wdata->lseg = NULL; + } else + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* + * Called by non rpc-based layout drivers + */ +int +pnfs_ld_read_done(struct nfs_read_data *data) +{ + int status; + + if (!data->pnfs_error) { + __nfs4_read_done_cb(data); + data->mds_ops->rpc_call_done(&data->task, data); + data->mds_ops->rpc_release(data); + return 0; + } + + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) + pnfs_return_layout(data->inode); + + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, + data->pnfs_error); + status = nfs_initiate_read(data, NFS_CLIENT(data->inode), + data->mds_ops); + return status ? : -EAGAIN; +} +EXPORT_SYMBOL_GPL(pnfs_ld_read_done); + +/* + * Call the appropriate parallel I/O subsystem read function. + */ +enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *rdata, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = rdata->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; + + rdata->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(rdata->lseg); + rdata->lseg = NULL; + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_READ); + } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* + * There can be multiple RW segments. 
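+ * Collect every RW segment marked NFS_LSEG_LAYOUTCOMMIT onto @listp;
+ * the references taken in pnfs_set_layoutcommit() are dropped again in
+ * nfs4_layoutcommit_release().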
+ */ +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW && + test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + list_add(&lseg->pls_lc_list, listp); + } +} + +void +pnfs_set_layoutcommit(struct nfs_write_data *wdata) +{ + struct nfs_inode *nfsi = NFS_I(wdata->inode); + loff_t end_pos = wdata->mds_offset + wdata->res.count; + bool mark_as_dirty = false; + + spin_lock(&nfsi->vfs_inode.i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, wdata->inode->i_ino); + } + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + } + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + spin_unlock(&nfsi->vfs_inode.i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, wdata->lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(wdata->inode); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. + */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t end_pos; + int status = 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return 0; + + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + mark_inode_dirty_sync(inode); + status = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&data->lseg_list); + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + spin_unlock(&inode->i_lock); + kfree(data); + goto out; + } + + pnfs_list_write_lseg(inode, &data->lseg_list); + + end_pos = nfsi->layout->plh_lwb; + nfsi->layout->plh_lwb = 0; + + memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, + sizeof(nfsi->layout->plh_stateid.data)); + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->cred = get_rpccred(nfsi->layout->plh_lc_cred); + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + status = nfs4_proc_layoutcommit(data, sync); +out: + dprintk("<-- %s status %d\n", __func__, status); + return status; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h new file mode 100644 index 00000000..bb8b3247 --- /dev/null +++ b/fs/nfs/pnfs.h @@ -0,0 +1,427 @@ +/* + * pNFS client data structures. 
+ * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +#include +#include + +enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ + NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ +}; + +struct pnfs_layout_segment { + struct list_head pls_list; + struct list_head pls_lc_list; + struct pnfs_layout_range pls_range; + atomic_t pls_refcount; + unsigned long pls_flags; + struct pnfs_layout_hdr *pls_layout; +}; + +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ +}; + +enum layoutdriver_policy_flags { + /* Should the pNFS client commit and return the layout upon a setattr */ + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, +}; + +struct nfs4_deviceid_node; + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + unsigned flags; + + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); + void (*free_layout_hdr) (struct pnfs_layout_hdr *); + + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + void (*free_lseg) (struct pnfs_layout_segment *lseg); + + /* test for nfs page cache coalescing */ + bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + + /* Returns true if layoutdriver wants to divert this request to + * driver's commit routine. 
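+ * If it does, choose_commit_list() and commit_pagelist() below are
+ * then used to queue and submit those requests instead of the common
+ * MDS commit list.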
+ */ + bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); + struct list_head * (*choose_commit_list) (struct nfs_page *req); + int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + + void (*free_deviceid_node) (struct nfs4_deviceid_node *); + + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args); + + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args); +}; + +struct pnfs_layout_hdr { + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_flags; + loff_t plh_lwb; /* last write byte for layoutcommit */ + struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ + struct inode *plh_inode; +}; + +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + struct page **pages; + unsigned int pgbase; + unsigned int pglen; +}; + +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +/* nfs4proc.c */ +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); + +/* pnfs.c */ +void get_layout_hdr(struct pnfs_layout_hdr *lo); +void put_lseg(struct pnfs_layout_segment *lseg); +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + gfp_t gfp_flags); +void set_pnfs_layoutdriver(struct nfs_server *, u32 id); +void unset_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, + const struct rpc_call_ops *, int); +enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, + const struct rpc_call_ops *); +bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +void put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); +int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 
*barrier); +void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int _pnfs_return_layout(struct inode *); +int pnfs_ld_write_done(struct nfs_write_data *); +int pnfs_ld_read_done(struct nfs_read_data *); + +/* pnfs_dev.c */ +struct nfs4_deviceid_node { + struct hlist_node node; + struct hlist_node tmpnode; + const struct pnfs_layoutdriver_type *ld; + const struct nfs_client *nfs_client; + struct nfs4_deviceid deviceid; + atomic_t ref; +}; + +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, + const struct pnfs_layoutdriver_type *, + const struct nfs_client *, + const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); +bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_deviceid_purge_client(const struct nfs_client *); + +static inline int lo_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic_inc(); + } + return lseg; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + if (lseg) { + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; + if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { + set_bit(PG_PNFS_COMMIT, &req->wb_flags); + req->wb_commit_lseg = get_lseg(lseg); + } + } +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags)) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + struct list_head *rv; + + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { + struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; + + set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); + rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); + /* matched by ref taken when PG_PNFS_COMMIT is set */ + put_lseg(req->wb_commit_lseg); + } else + rv = mds; + return rv; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) + put_lseg(req->wb_commit_lseg); +} + +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_SETATTR; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ 
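+ /* Issue a LAYOUTRETURN only if a layout driver is active and the inode actually holds a layout */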
+ struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss) && nfsi->layout) + return _pnfs_return_layout(ino); + + return 0; +} + +static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld) + pgio->pg_test = ld->pg_test; +} + +#else /* CONFIG_NFS_V4_1 */ + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + gfp_t gfp_flags) +{ + return NULL; +} + +static inline enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + return false; +} + +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + return false; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ +} + +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + return mds; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + return 0; +} + +static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c new file mode 100644 index 00000000..f0f8e1e2 --- /dev/null +++ b/fs/nfs/pnfs_dev.c @@ -0,0 +1,277 @@ +/* + * Device operations for the pnfs client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. 
If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* + * Device ID RCU cache. A device ID is unique per server and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfs4_deviceid_lock); + +void +nfs4_print_deviceid(const struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} +EXPORT_SYMBOL_GPL(nfs4_print_deviceid); + +static inline u32 +nfs4_deviceid_hash(const struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +static struct nfs4_deviceid_node * +_lookup_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + if (d->ld == ld && d->nfs_client == clp && + !memcmp(&d->deviceid, id, sizeof(*id))) { + if (atomic_read(&d->ref)) + return d; + else + continue; + } + return NULL; +} + +/* + * Lookup a deviceid in cache and get a reference count on it if found + * + * @clp nfs_client associated with deviceid + * @id deviceid to look up + */ +struct nfs4_deviceid_node * +_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, hash); + if (d && !atomic_inc_not_zero(&d->ref)) + d = NULL; + rcu_read_unlock(); + return d; +} + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); +} +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); + +/* + * Unhash and put deviceid + * + * @clp nfs_client associated with deviceid + * @id the deviceid to unhash + * + * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. 
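+ * If a node is returned, the caller must free it via the layout driver's ->free_deviceid_node().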
+ */ +struct nfs4_deviceid_node * +nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + rcu_read_unlock(); + if (!d) { + spin_unlock(&nfs4_deviceid_lock); + return NULL; + } + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + + /* balance the initial ref set in pnfs_insert_deviceid */ + if (atomic_dec_and_test(&d->ref)) + return d; + + return NULL; +} +EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); + +/* + * Delete a deviceid from cache + * + * @clp struct nfs_client qualifying the deviceid + * @id deviceid to delete + */ +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + d = nfs4_unhash_put_deviceid(ld, clp, id); + if (!d) + return; + d->ld->free_deviceid_node(d); +} +EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, + const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *nfs_client, + const struct nfs4_deviceid *id) +{ + INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); + d->ld = ld; + d->nfs_client = nfs_client; + d->deviceid = *id; + atomic_set(&d->ref, 1); +} +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); + +/* + * Uniquely initialize and insert a deviceid node into cache + * + * @new new deviceid node + * Note that the caller must set up the following members: + * new->ld + * new->nfs_client + * new->deviceid + * + * @ret the inserted node, if none found, otherwise, the found entry. + */ +struct nfs4_deviceid_node * +nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) +{ + struct nfs4_deviceid_node *d; + long hash; + + spin_lock(&nfs4_deviceid_lock); + hash = nfs4_deviceid_hash(&new->deviceid); + d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + return d; + } + + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + atomic_inc(&new->ref); + + return new; +} +EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); + +/* + * Dereference a deviceid node and delete it when its reference count drops + * to zero. 
+ * + * @d deviceid node to put + * + * @ret true iff the node was deleted + */ +bool +nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) +{ + if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) + return false; + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + d->ld->free_deviceid_node(d); + return true; +} +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); + +static void +_deviceid_purge_client(const struct nfs_client *clp, long hash) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + HLIST_HEAD(tmp); + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + if (d->nfs_client == clp && atomic_read(&d->ref)) { + hlist_del_init_rcu(&d->node); + hlist_add_head(&d->tmpnode, &tmp); + } + rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); + + if (hlist_empty(&tmp)) + return; + + synchronize_rcu(); + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); + if (atomic_dec_and_test(&d->ref)) + d->ld->free_deviceid_node(d); + } +} + +void +nfs4_deviceid_purge_client(const struct nfs_client *clp) +{ + long h; + + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; + for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) + _deviceid_purge_client(clp, h); +} diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c new file mode 100644 index 00000000..f48125da --- /dev/null +++ b/fs/nfs/proc.c @@ -0,0 +1,746 @@ +/* + * linux/fs/nfs/proc.c + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * + * OS-independent nfs remote procedure call functions + * + * Tuned by Alan Cox for >3K buffers + * so at last we can have decent(ish) throughput off a + * Sun server. + * + * Coding optimized and cleaned up by Florian La Roche. + * Note: Error returns are optimized for NFS_OK, which isn't translated via + * nfs_stat_to_errno(), but happens to be already the right return code. + * + * Also, the code currently doesn't check the size of the packet, when + * it decodes the packet. + * + * Feel free to fix it and mail me the diffs if it worries you. + * + * Completely rewritten to support the new RPC call interface; + * rewrote and moved the entire XDR stuff to xdr.c + * --Olaf Kirch June 1996 + * + * The code below initializes all auto variables explicitly, otherwise + * it will fail to work as a module (gcc generates a memset call for an + * incomplete struct). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* + * wrapper to handle the -EKEYEXPIRED error message. This should generally + * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't + * support the NFSERR_JUKEBOX error code, but we handle this situation in the + * same way that we handle that error with NFSv3. 
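+ * The call is retried, with a delay of NFS_JUKEBOX_RETRY_TIME between attempts, until it stops failing with -EKEYEXPIRED or a fatal signal is pending.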
+ */ +static int +nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EKEYEXPIRED) + break; + schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) + +static int +nfs_async_handle_expired_key(struct rpc_task *task) +{ + if (task->tk_status != -EKEYEXPIRED) + return 0; + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +/* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +static int +nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs_fattr *fattr = info->fattr; + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("%s: call getattr\n", __func__); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + if (status) + return status; + dprintk("%s: call statfs\n", __func__); + msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; + msg.rpc_resp = &fsinfo; + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply statfs: %d\n", __func__, status); + if (status) + return status; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; + return 0; +} + +/* + * One function for each procedure in the NFS protocol. 
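+ * Each helper below fills in a struct rpc_message and issues it with rpc_call_sync(), which is redefined above to retry on -EKEYEXPIRED.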
+ */ +static int +nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct nfs_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + /* Mask out the non-modebit related stuff from attr->ia_mode */ + sattr->ia_mode &= S_IALLUGO; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs_diropok res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READLINK], + .rpc_argp = &args, + }; + int status; + + dprintk("NFS call readlink\n"); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + dprintk("NFS reply readlink: %d\n", status); + return status; +} + +struct nfs_createdata { + struct nfs_createargs arg; + struct nfs_diropok res; + struct nfs_fh fhandle; + struct nfs_fattr fattr; +}; + +static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, + struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data != NULL) { + data->arg.fh = NFS_FH(dir); + data->arg.name = dentry->d_name.name; + data->arg.len = dentry->d_name.len; + data->arg.sattr = sattr; + nfs_fattr_init(&data->fattr); + data->fhandle.size = 0; + data->res.fh = &data->fhandle; + data->res.fattr = &data->fattr; + } + return data; +}; + +static void nfs_free_createdata(const struct nfs_createdata *data) +{ + kfree(data); +} + +static int +nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nfs_open_context *ctx) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + }; + int status = -ENOMEM; + + dprintk("NFS call create %s\n", dentry->d_name.name); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + 
msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: + dprintk("NFS reply create: %d\n", status); + return status; +} + +/* + * In NFSv2, mknod is grafted onto the create call. + */ +static int +nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + }; + umode_t mode; + int status = -ENOMEM; + + dprintk("NFS call mknod %s\n", dentry->d_name.name); + + mode = sattr->ia_mode; + if (S_ISFIFO(mode)) { + sattr->ia_mode = (mode & ~S_IFMT) | S_IFCHR; + sattr->ia_valid &= ~ATTR_SIZE; + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { + sattr->ia_valid |= ATTR_SIZE; + sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ + } + + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + if (status == -EINVAL && S_ISFIFO(mode)) { + sattr->ia_mode = mode; + nfs_fattr_init(data->res.fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: + dprintk("NFS reply mknod: %d\n", status); + return status; +} + +static int +nfs_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_removeargs arg = { + .fh = NFS_FH(dir), + .name.len = name->len, + .name.name = name->name, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call remove %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + dprintk("NFS reply remove: %d\n", status); + return status; +} + +static void +nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; +} + +static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + if (nfs_async_handle_expired_key(task)) + return 0; + nfs_mark_for_revalidate(dir); + return 1; +} + +static void +nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; +} + +static int +nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + if (nfs_async_handle_expired_key(task)) + return 0; + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + return 1; +} + +static int +nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RENAME], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + dprintk("NFS reply rename: %d\n", status); + return status; +} + +static int +nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs_linkargs arg = { + .fromfh = NFS_FH(inode), + 
.tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LINK], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call link %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_mark_for_revalidate(inode); + nfs_mark_for_revalidate(dir); + dprintk("NFS reply link: %d\n", status); + return status; +} + +static int +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) +{ + struct nfs_fh *fh; + struct nfs_fattr *fattr; + struct nfs_symlinkargs arg = { + .fromfh = NFS_FH(dir), + .fromname = dentry->d_name.name, + .fromlen = dentry->d_name.len, + .pages = &page, + .pathlen = len, + .sattr = sattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; + int status = -ENAMETOOLONG; + + dprintk("NFS call symlink %s\n", dentry->d_name.name); + + if (len > NFS2_MAXPATHLEN) + goto out; + + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) + goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + /* + * V2 SYMLINK requests don't return any attributes. Setting the + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); + +out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out: + dprintk("NFS reply symlink: %d\n", status); + return status; +} + +static int +nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], + }; + int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: + dprintk("NFS reply mkdir: %d\n", status); + return status; +} + +static int +nfs_proc_rmdir(struct inode *dir, struct qstr *name) +{ + struct nfs_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RMDIR], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call rmdir %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + dprintk("NFS reply rmdir: %d\n", status); + return status; +} + +/* + * The READDIR implementation is somewhat hackish - we pass a temporary + * buffer to the encode function, which installs it in the receive + * the receive iovec. The decode function just parses the reply to make + * sure it is syntactically correct; the entries itself are decoded + * from nfs_readdir by calling the decode_entry function directly. 
+ */ +static int +nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = dentry->d_inode; + struct nfs_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, + .count = count, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READDIR], + .rpc_argp = &arg, + .rpc_cred = cred, + }; + int status; + + dprintk("NFS call readdir %d\n", (unsigned int)cookie); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); + + dprintk("NFS reply readdir: %d\n", status); + return status; +} + +static int +nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *stat) +{ + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; + int status; + + dprintk("NFS call statfs\n"); + nfs_fattr_init(stat->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply statfs: %d\n", status); + if (status) + goto out; + stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize; + stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize; + stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize; + stat->tfiles = 0; + stat->ffiles = 0; + stat->afiles = 0; +out: + return status; +} + +static int +nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; + int status; + + dprintk("NFS call fsinfo\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply fsinfo: %d\n", status); + if (status) + goto out; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; +out: + return status; +} + +static int +nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + info->max_link = 0; + info->max_namelen = NFS2_MAXNAMLEN; + return 0; +} + +static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + + nfs_invalidate_atime(data->inode); + if (task->tk_status >= 0) { + nfs_refresh_inode(data->inode, data->res.fattr); + /* Emulate the eof flag, which isn't normally needed in NFSv2 + * as it is guaranteed to always return the file attributes + */ + if (data->args.offset + data->args.count >= data->res.fattr->size) + data->res.eof = 1; + } + return 0; +} + +static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; +} + +static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + + if (task->tk_status >= 0) + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + return 0; +} + +static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ + data->args.stable = NFS_FILE_SYNC; + msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; +} + +static void +nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message 
*msg) +{ + BUG(); +} + +static int +nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); +} + +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} + +const struct nfs_rpc_ops nfs_v2_clientops = { + .version = 2, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, + .lookup = nfs_proc_lookup, + .access = NULL, /* access */ + .readlink = nfs_proc_readlink, + .create = nfs_proc_create, + .remove = nfs_proc_remove, + .unlink_setup = nfs_proc_unlink_setup, + .unlink_done = nfs_proc_unlink_done, + .rename = nfs_proc_rename, + .rename_setup = nfs_proc_rename_setup, + .rename_done = nfs_proc_rename_done, + .link = nfs_proc_link, + .symlink = nfs_proc_symlink, + .mkdir = nfs_proc_mkdir, + .rmdir = nfs_proc_rmdir, + .readdir = nfs_proc_readdir, + .mknod = nfs_proc_mknod, + .statfs = nfs_proc_statfs, + .fsinfo = nfs_proc_fsinfo, + .pathconf = nfs_proc_pathconf, + .decode_dirent = nfs2_decode_dirent, + .read_setup = nfs_proc_read_setup, + .read_done = nfs_read_done, + .write_setup = nfs_proc_write_setup, + .write_done = nfs_write_done, + .commit_setup = nfs_proc_commit_setup, + .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, + .init_client = nfs_init_client, +}; diff --git a/fs/nfs/read.c b/fs/nfs/read.c new file mode 100644 index 00000000..20a7f952 --- /dev/null +++ b/fs/nfs/read.c @@ -0,0 +1,704 @@ +/* + * linux/fs/nfs/read.c + * + * Block I/O for NFS + * + * Partial copy of Linus' read cache modifications to fs/nfs/file.c + * modified for async RPC by okir@monad.swb.de + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "pnfs.h" + +#include "nfs4_fs.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); +static const struct rpc_call_ops nfs_read_partial_ops; +static const struct rpc_call_ops nfs_read_full_ops; + +static struct kmem_cache *nfs_rdata_cachep; +static mempool_t *nfs_rdata_mempool; + +#define MIN_POOL_READ (32) + +struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) +{ + struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) { + mempool_free(p, nfs_rdata_mempool); + p = NULL; + } + } + } + return p; +} + +void 
nfs_readdata_free(struct nfs_read_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_rdata_mempool); +} + +static void nfs_readdata_release(struct nfs_read_data *rdata) +{ + put_lseg(rdata->lseg); + put_nfs_open_context(rdata->args.context); + nfs_readdata_free(rdata); +} + +static +int nfs_return_empty_page(struct page *page) +{ + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + unlock_page(page); + return 0; +} + +static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) +{ + unsigned int remainder = data->args.count - data->res.count; + unsigned int base = data->args.pgbase + data->res.count; + unsigned int pglen; + struct page **pages; + + if (data->res.eof == 0 || remainder == 0) + return; + /* + * Note: "remainder" can never be negative, since we check for + * this in the XDR code. + */ + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + pglen = PAGE_CACHE_SIZE - base; + for (;;) { + if (remainder <= pglen) { + zero_user(*pages, base, remainder); + break; + } + zero_user(*pages, base, pglen); + pages++; + remainder -= pglen; + pglen = PAGE_CACHE_SIZE; + base = 0; + } +} + +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) +{ + struct nfs_page *new; + unsigned int len; + struct nfs_pageio_descriptor pgio; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + new = nfs_create_request(ctx, inode, page, 0, len); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); + } + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + + nfs_pageio_init(&pgio, inode, NULL, 0, 0); + nfs_list_add_request(new, &pgio.pg_list); + pgio.pg_count = len; + + if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) + nfs_pagein_multi(&pgio); + else + nfs_pagein_one(&pgio); + return 0; +} + +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *d_inode = req->wb_context->path.dentry->d_inode; + + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + + unlock_page(req->wb_page); + + dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + nfs_release_request(req); +} + +int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | swap_flags, + }; + + /* Set up the initial task struct. 
*/ + NFS_PROTO(inode)->read_setup(data, &msg); + + dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + "offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_read); + +/* + * Set up the NFS read request struct + */ +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + + data->req = req; + data->inode = inode; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + + if (data->lseg && + (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) + return 0; + + return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); +} + +static void +nfs_async_read_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + SetPageError(req->wb_page); + nfs_readpage_release(req); + } +} + +/* + * Generate multiple requests to fill a single page. + * + * We optimize to reduce the number of read operations on the wire. If we + * detect that we're reading a page, or an area of a page, that is past the + * end of file, we do not generate NFS read operations but just clear the + * parts of the page that would have come back zero from the server anyway. + * + * We rely on the cached value of i_size to make this determination; another + * client can fill pages on the server past our cached end-of-file, but we + * won't see the new data until our attribute cache is updated. This is more + * or less conventional NFS client behavior. 
+ */ +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); + struct page *page = req->wb_page; + struct nfs_read_data *data; + size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; + unsigned int offset; + int requests = 0; + int ret = 0; + struct pnfs_layout_segment *lseg; + LIST_HEAD(list); + + nfs_list_remove_request(req); + + nbytes = desc->pg_count; + do { + size_t len = min(nbytes,rsize); + + data = nfs_readdata_alloc(1); + if (!data) + goto out_bad; + list_add(&data->pages, &list); + requests++; + nbytes -= len; + } while(nbytes != 0); + atomic_set(&req->wb_complete, requests); + + BUG_ON(desc->pg_lseg != NULL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_READ, GFP_KERNEL); + ClearPageError(page); + offset = 0; + nbytes = desc->pg_count; + do { + int ret2; + + data = list_entry(list.next, struct nfs_read_data, pages); + list_del_init(&data->pages); + + data->pagevec[0] = page; + + if (nbytes < rsize) + rsize = nbytes; + ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, + rsize, offset, lseg); + if (ret == 0) + ret = ret2; + offset += rsize; + nbytes -= rsize; + } while (nbytes != 0); + put_lseg(lseg); + desc->pg_lseg = NULL; + + return ret; + +out_bad: + while (!list_empty(&list)) { + data = list_entry(list.next, struct nfs_read_data, pages); + list_del(&data->pages); + nfs_readdata_free(data); + } + SetPageError(page); + nfs_readpage_release(req); + return -ENOMEM; +} + +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_read_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + int ret = -ENOMEM; + + data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + nfs_async_read_error(head); + goto out; + } + + pages = data->pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + ClearPageError(req->wb_page); + *pages++ = req->wb_page; + } + req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_READ, GFP_KERNEL); + + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, + 0, lseg); +out: + put_lseg(lseg); + desc->pg_lseg = NULL; + return ret; +} + +/* + * This is the callback from RPC telling us whether a reply was + * received or some error occurred (timeout or socket shutdown). + */ +int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +{ + int status; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, + task->tk_status); + + status = NFS_PROTO(data->inode)->read_done(task, data); + if (status != 0) + return status; + + nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count); + + if (task->tk_status == -ESTALE) { + set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags); + nfs_mark_for_revalidate(data->inode); + } + return 0; +} + +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +{ + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; + + if (resp->eof || resp->count == argp->count) + return; + + /* This is a short read! */ + nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); + /* Has the server at least made some progress? 
*/ + if (resp->count == 0) + return; + + /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); +} + +/* + * Handle a read reply that fills part of a page. + */ +static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + if (nfs_readpage_result(task, data) != 0) + return; + if (task->tk_status < 0) + return; + + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_partial(void *calldata) +{ + struct nfs_read_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) + SetPageError(page); + + if (atomic_dec_and_test(&req->wb_complete)) { + if (!PageError(page)) + SetPageUptodate(page); + nfs_readpage_release(req); + } + nfs_readdata_release(calldata); +} + +#if defined(CONFIG_NFS_V4_1) +void nfs_read_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_read_partial_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_readpage_result_partial, + .rpc_release = nfs_readpage_release_partial, +}; + +static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) +{ + unsigned int count = data->res.count; + unsigned int base = data->args.pgbase; + struct page **pages; + + if (data->res.eof) + count = data->args.count; + if (unlikely(count == 0)) + return; + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + count += base; + for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) + SetPageUptodate(*pages); + if (count == 0) + return; + /* Was this a short read? */ + if (data->res.eof || data->res.count == data->args.count) + SetPageUptodate(*pages); +} + +/* + * This is the callback from RPC telling us whether a reply was + * received or some error occurred (timeout or socket shutdown). + */ +static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + if (nfs_readpage_result(task, data) != 0) + return; + if (task->tk_status < 0) + return; + /* + * Note: nfs_readpage_retry may change the values of + * data->args. In the multi-page case, we therefore need + * to ensure that we call nfs_readpage_set_pages_uptodate() + * first. + */ + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_set_pages_uptodate(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_full(void *calldata) +{ + struct nfs_read_data *data = calldata; + + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + + nfs_list_remove_request(req); + nfs_readpage_release(req); + } + nfs_readdata_release(calldata); +} + +static const struct rpc_call_ops nfs_read_full_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_readpage_result_full, + .rpc_release = nfs_readpage_release_full, +}; + +/* + * Read a page over NFS. 
+ * We read the page synchronously in the following case: + * - The error flag is set for this page. This happens only when a + * previous async read operation failed. + */ +int nfs_readpage(struct file *file, struct page *page) +{ + struct nfs_open_context *ctx; + struct inode *inode = page->mapping->host; + int error; + + dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", + page, PAGE_CACHE_SIZE, page->index); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); + + /* + * Try to flush any pending writes to the file.. + * + * NOTE! Because we own the page lock, there cannot + * be any new pending writes generated at this point + * for this page (other pages can be written to). + */ + error = nfs_wb_page(inode, page); + if (error) + goto out_unlock; + if (PageUptodate(page)) + goto out_unlock; + + error = -ESTALE; + if (NFS_STALE(inode)) + goto out_unlock; + + if (file == NULL) { + error = -EBADF; + ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (ctx == NULL) + goto out_unlock; + } else + ctx = get_nfs_open_context(nfs_file_open_context(file)); + + if (!IS_SYNC(inode)) { + error = nfs_readpage_from_fscache(ctx, inode, page); + if (error == 0) + goto out; + } + + error = nfs_readpage_async(ctx, inode, page); + +out: + put_nfs_open_context(ctx); + return error; +out_unlock: + unlock_page(page); + return error; +} + +struct nfs_readdesc { + struct nfs_pageio_descriptor *pgio; + struct nfs_open_context *ctx; +}; + +static int +readpage_async_filler(void *data, struct page *page) +{ + struct nfs_readdesc *desc = (struct nfs_readdesc *)data; + struct inode *inode = page->mapping->host; + struct nfs_page *new; + unsigned int len; + int error; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + + new = nfs_create_request(desc->ctx, inode, page, 0, len); + if (IS_ERR(new)) + goto out_error; + + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (!nfs_pageio_add_request(desc->pgio, new)) { + error = desc->pgio->pg_error; + goto out_unlock; + } + return 0; +out_error: + error = PTR_ERR(new); + SetPageError(page); +out_unlock: + unlock_page(page); + return error; +} + +int nfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct nfs_pageio_descriptor pgio; + struct nfs_readdesc desc = { + .pgio = &pgio, + }; + struct inode *inode = mapping->host; + struct nfs_server *server = NFS_SERVER(inode); + size_t rsize = server->rsize; + unsigned long npages; + int ret = -ESTALE; + + dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + nr_pages); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + + if (NFS_STALE(inode)) + goto out; + + if (filp == NULL) { + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) + return -EBADF; + } else + desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + + /* attempt to read as many of the pages as possible from the cache + * - this returns -ENOBUFS immediately if the cookie is negative + */ + ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, + pages, &nr_pages); + if (ret == 0) + goto read_complete; /* all pages were read */ + + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else + nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); + + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); + npages = 
(pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +read_complete: + put_nfs_open_context(desc.ctx); +out: + return ret; +} + +int __init nfs_init_readpagecache(void) +{ + nfs_rdata_cachep = kmem_cache_create("nfs_read_data", + sizeof(struct nfs_read_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_rdata_cachep == NULL) + return -ENOMEM; + + nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, + nfs_rdata_cachep); + if (nfs_rdata_mempool == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_readpagecache(void) +{ + mempool_destroy(nfs_rdata_mempool); + kmem_cache_destroy(nfs_rdata_cachep); +} diff --git a/fs/nfs/super.c b/fs/nfs/super.c new file mode 100644 index 00000000..8e7b61d5 --- /dev/null +++ b/fs/nfs/super.c @@ -0,0 +1,3099 @@ +/* + * linux/fs/nfs/super.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs superblock handling functions + * + * Modularised by Alan Cox , while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. + * J.S.Peatfield@damtp.cam.ac.uk + * + * Split from inode.c by David Howells + * + * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a + * particular server are held in the same superblock + * - NFS superblocks can have several effective roots to the dentry tree + * - directory type roots are spliced into the tree when a path from one root reaches the root + * of another (see nfs_lookup()) + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#ifdef CONFIG_NFS_V3 +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + +enum { + /* Mount options that take no arguments */ + Opt_soft, Opt_hard, + Opt_posix, Opt_noposix, + Opt_cto, Opt_nocto, + Opt_ac, Opt_noac, + Opt_lock, Opt_nolock, + Opt_v2, Opt_v3, Opt_v4, + Opt_udp, Opt_tcp, Opt_rdma, + Opt_acl, Opt_noacl, + Opt_rdirplus, Opt_nordirplus, + Opt_sharecache, Opt_nosharecache, + Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, + + /* Mount options that take integer arguments */ + Opt_port, + Opt_rsize, Opt_wsize, Opt_bsize, + Opt_timeo, Opt_retrans, + Opt_acregmin, Opt_acregmax, + Opt_acdirmin, Opt_acdirmax, + Opt_actimeo, + Opt_namelen, + Opt_mountport, + Opt_mountvers, + Opt_nfsvers, + Opt_minorversion, + + /* Mount options that take string arguments */ + Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, + Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, + Opt_fscache_uniq, + Opt_local_lock, + + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, + + Opt_err +}; + +static const match_table_t nfs_mount_option_tokens = { + { Opt_userspace, "bg" }, + { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + + { Opt_sloppy, "sloppy" }, + + { Opt_soft, "soft" }, + { Opt_hard, "hard" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, + { Opt_posix, "posix" }, + { Opt_noposix, "noposix" }, + { Opt_cto, "cto" }, + { 
Opt_nocto, "nocto" }, + { Opt_ac, "ac" }, + { Opt_noac, "noac" }, + { Opt_lock, "lock" }, + { Opt_nolock, "nolock" }, + { Opt_v2, "v2" }, + { Opt_v3, "v3" }, + { Opt_v4, "v4" }, + { Opt_udp, "udp" }, + { Opt_tcp, "tcp" }, + { Opt_rdma, "rdma" }, + { Opt_acl, "acl" }, + { Opt_noacl, "noacl" }, + { Opt_rdirplus, "rdirplus" }, + { Opt_nordirplus, "nordirplus" }, + { Opt_sharecache, "sharecache" }, + { Opt_nosharecache, "nosharecache" }, + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, + { Opt_nofscache, "nofsc" }, + + { Opt_port, "port=%s" }, + { Opt_rsize, "rsize=%s" }, + { Opt_wsize, "wsize=%s" }, + { Opt_bsize, "bsize=%s" }, + { Opt_timeo, "timeo=%s" }, + { Opt_retrans, "retrans=%s" }, + { Opt_acregmin, "acregmin=%s" }, + { Opt_acregmax, "acregmax=%s" }, + { Opt_acdirmin, "acdirmin=%s" }, + { Opt_acdirmax, "acdirmax=%s" }, + { Opt_actimeo, "actimeo=%s" }, + { Opt_namelen, "namlen=%s" }, + { Opt_mountport, "mountport=%s" }, + { Opt_mountvers, "mountvers=%s" }, + { Opt_nfsvers, "nfsvers=%s" }, + { Opt_nfsvers, "vers=%s" }, + { Opt_minorversion, "minorversion=%s" }, + + { Opt_sec, "sec=%s" }, + { Opt_proto, "proto=%s" }, + { Opt_mountproto, "mountproto=%s" }, + { Opt_addr, "addr=%s" }, + { Opt_clientaddr, "clientaddr=%s" }, + { Opt_mounthost, "mounthost=%s" }, + { Opt_mountaddr, "mountaddr=%s" }, + + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, + + { Opt_err, NULL } +}; + +enum { + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + + Opt_xprt_err +}; + +static const match_table_t nfs_xprt_protocol_tokens = { + { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, + { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, + { Opt_xprt_rdma, "rdma" }, + + { Opt_xprt_err, NULL } +}; + +enum { + Opt_sec_none, Opt_sec_sys, + Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, + Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, + Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, + + Opt_sec_err +}; + +static const match_table_t nfs_secflavor_tokens = { + { Opt_sec_none, "none" }, + { Opt_sec_none, "null" }, + { Opt_sec_sys, "sys" }, + + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + + { Opt_sec_lkey, "lkey" }, + { Opt_sec_lkeyi, "lkeyi" }, + { Opt_sec_lkeyp, "lkeyp" }, + + { Opt_sec_spkm, "spkm3" }, + { Opt_sec_spkmi, "spkm3i" }, + { Opt_sec_spkmp, "spkm3p" }, + + { Opt_sec_err, NULL } +}; + +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + +enum { + Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, + Opt_local_lock_none, + + Opt_local_lock_err +}; + +static match_table_t nfs_local_lock_tokens = { + { Opt_local_lock_all, "all" }, + { Opt_local_lock_flock, "flock" }, + { Opt_local_lock_posix, "posix" }, + { Opt_local_lock_none, "none" }, + + { Opt_local_lock_err, NULL } +}; + + +static void nfs_umount_begin(struct super_block *); +static int nfs_statfs(struct dentry *, struct kstatfs *); +static int nfs_show_options(struct seq_file *, struct vfsmount *); +static int nfs_show_devname(struct seq_file *, struct vfsmount *); +static int nfs_show_path(struct seq_file *, struct vfsmount *); +static int nfs_show_stats(struct seq_file *, struct 
vfsmount *); +static struct dentry *nfs_fs_mount(struct file_system_type *, + int, const char *, void *); +static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static void nfs_put_super(struct super_block *); +static void nfs_kill_super(struct super_block *); +static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); + +static struct file_system_type nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .mount = nfs_fs_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs_xdev_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .mount = nfs_xdev_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs_write_inode, + .put_super = nfs_put_super, + .statfs = nfs_statfs, + .evict_inode = nfs_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; + +#ifdef CONFIG_NFS_V4 +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); +static struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data); +static struct dentry *nfs4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static void nfs4_kill_super(struct super_block *sb); + +static struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_xdev_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_xdev_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_referral_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_referral_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = 
nfs_write_inode, + .put_super = nfs_put_super, + .statfs = nfs_statfs, + .evict_inode = nfs4_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; +#endif + +static struct shrinker acl_shrinker = { + .shrink = nfs_access_cache_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +/* + * Register the NFS filesystems + */ +int __init register_nfs_fs(void) +{ + int ret; + + ret = register_filesystem(&nfs_fs_type); + if (ret < 0) + goto error_0; + + ret = nfs_register_sysctl(); + if (ret < 0) + goto error_1; +#ifdef CONFIG_NFS_V4 + ret = register_filesystem(&nfs4_fs_type); + if (ret < 0) + goto error_2; +#endif + register_shrinker(&acl_shrinker); + return 0; + +#ifdef CONFIG_NFS_V4 +error_2: + nfs_unregister_sysctl(); +#endif +error_1: + unregister_filesystem(&nfs_fs_type); +error_0: + return ret; +} + +/* + * Unregister the NFS filesystems + */ +void __exit unregister_nfs_fs(void) +{ + unregister_shrinker(&acl_shrinker); +#ifdef CONFIG_NFS_V4 + unregister_filesystem(&nfs4_fs_type); +#endif + nfs_unregister_sysctl(); + unregister_filesystem(&nfs_fs_type); +} + +void nfs_sb_active(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void nfs_sb_deactive(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + +/* + * Deliver file system statistics to userspace + */ +static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct nfs_server *server = NFS_SB(dentry->d_sb); + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *fh = NFS_FH(dentry->d_inode); + struct nfs_fsstat res; + int error = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out_err; + + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } + nfs_free_fattr(res.fattr); + if (error < 0) + goto out_err; + + buf->f_type = NFS_SUPER_MAGIC; + + /* + * Current versions of glibc do not correctly handle the + * case where f_frsize != f_bsize. Eventually we want to + * report the value of wtmult in this field. + */ + buf->f_frsize = dentry->d_sb->s_blocksize; + + /* + * On most *nix systems, f_blocks, f_bfree, and f_bavail + * are reported in units of f_frsize. Linux hasn't had + * an f_frsize field in its statfs struct until recently, + * thus historically Linux's sys_statfs reports these + * fields in units of f_bsize. 
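+ *
+ * The conversion just below rounds the byte totals returned by FSSTAT up
+ * to whole f_bsize units: blocks = (bytes + blockres) >> blockbits. With
+ * a 4096-byte block size, blockbits is 12 and blockres is 4095, so for
+ * example res.tbytes = 10000 becomes (10000 + 4095) >> 12 = 3 blocks.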
+ */ + buf->f_bsize = dentry->d_sb->s_blocksize; + blockbits = dentry->d_sb->s_blocksize_bits; + blockres = (1 << blockbits) - 1; + buf->f_blocks = (res.tbytes + blockres) >> blockbits; + buf->f_bfree = (res.fbytes + blockres) >> blockbits; + buf->f_bavail = (res.abytes + blockres) >> blockbits; + + buf->f_files = res.tfiles; + buf->f_ffree = res.afiles; + + buf->f_namelen = server->namelen; + + return 0; + + out_err: + dprintk("%s: statfs error = %d\n", __func__, -error); + return error; +} + +/* + * Map the security flavour number to a name + */ +static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) +{ + static const struct { + rpc_authflavor_t flavour; + const char *str; + } sec_flavours[] = { + { RPC_AUTH_NULL, "null" }, + { RPC_AUTH_UNIX, "sys" }, + { RPC_AUTH_GSS_KRB5, "krb5" }, + { RPC_AUTH_GSS_KRB5I, "krb5i" }, + { RPC_AUTH_GSS_KRB5P, "krb5p" }, + { RPC_AUTH_GSS_LKEY, "lkey" }, + { RPC_AUTH_GSS_LKEYI, "lkeyi" }, + { RPC_AUTH_GSS_LKEYP, "lkeyp" }, + { RPC_AUTH_GSS_SPKM, "spkm" }, + { RPC_AUTH_GSS_SPKMI, "spkmi" }, + { RPC_AUTH_GSS_SPKMP, "spkmp" }, + { UINT_MAX, "unknown" } + }; + int i; + + for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) { + if (sec_flavours[i].flavour == flavour) + break; + } + return sec_flavours[i].str; +} + +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + +static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr); + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); + break; + } + default: + if (showdefaults) + seq_printf(m, ",mountaddr=unspecified"); + } + + if (nfss->mountd_version || showdefaults) + seq_printf(m, ",mountvers=%u", nfss->mountd_version); + if ((nfss->mountd_port && + nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || + showdefaults) + seq_printf(m, ",mountport=%u", nfss->mountd_port); + + nfs_show_mountd_netid(m, nfss, showdefaults); +} + +#ifdef CONFIG_NFS_V4 +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct nfs_client *clp = nfss->nfs_client; + + seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); + seq_printf(m, ",minorversion=%u", clp->cl_minorversion); +} +#else +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ +} +#endif + +/* + * Describe the mount options in force on this server representation + */ +static void 
nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + static const struct proc_nfs_info { + int flag; + const char *str; + const char *nostr; + } nfs_info[] = { + { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_POSIX, ",posix", "" }, + { NFS_MOUNT_NOCTO, ",nocto", "" }, + { NFS_MOUNT_NOAC, ",noac", "" }, + { NFS_MOUNT_NONLM, ",nolock", "" }, + { NFS_MOUNT_NOACL, ",noacl", "" }, + { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, + { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { 0, NULL, NULL } + }; + const struct proc_nfs_info *nfs_infop; + struct nfs_client *clp = nfss->nfs_client; + u32 version = clp->rpc_ops->version; + int local_flock, local_fcntl; + + seq_printf(m, ",vers=%u", version); + seq_printf(m, ",rsize=%u", nfss->rsize); + seq_printf(m, ",wsize=%u", nfss->wsize); + if (nfss->bsize != 0) + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) + seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) + seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) + seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) + seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); + for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { + if (nfss->flags & nfs_infop->flag) + seq_puts(m, nfs_infop->str); + else + seq_puts(m, nfs_infop->nostr); + } + seq_printf(m, ",proto=%s", + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); + if (version == 4) { + if (nfss->port != NFS_PORT) + seq_printf(m, ",port=%u", nfss->port); + } else + if (nfss->port) + seq_printf(m, ",port=%u", nfss->port); + + seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); + seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); + seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + + if (version != 4) + nfs_show_mountd_options(m, nfss, showdefaults); + else + nfs_show_nfsv4_options(m, nfss, showdefaults); + + if (nfss->options & NFS_OPTION_FSCACHE) + seq_printf(m, ",fsc"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } + + local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; + local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; + + if (!local_flock && !local_fcntl) + seq_printf(m, ",local_lock=none"); + else if (local_flock && local_fcntl) + seq_printf(m, ",local_lock=all"); + else if (local_flock) + seq_printf(m, ",local_lock=flock"); + else + seq_printf(m, ",local_lock=posix"); +} + +/* + * Describe the mount options on this VFS mountpoint + */ +static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + + nfs_show_mount_options(m, nfss, 0); + + seq_printf(m, ",addr=%s", + rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient, + RPC_DISPLAY_ADDR)); + + return 0; +} +#ifdef CONFIG_NFS_V4_1 +void show_sessions(struct seq_file *m, struct nfs_server *server) +{ + if (nfs4_has_session(server->nfs_client)) + seq_printf(m, ",sessions"); +} +#else +void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif + +#ifdef CONFIG_NFS_V4_1 +void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ + 
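+	/*
+	 * Emitted in the "nfsv4:" section of the per-mount statistics
+	 * (nfs_show_stats): either ",pnfs=<layout driver name>" when a
+	 * pNFS layout driver is attached to this server, or
+	 * ",pnfs=not configured" otherwise.
+	 */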
seq_printf(m, ",pnfs="); + if (server->pnfs_curr_ld) + seq_printf(m, "%s", server->pnfs_curr_ld->name); + else + seq_printf(m, "not configured"); +} +#else /* CONFIG_NFS_V4_1 */ +void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif /* CONFIG_NFS_V4_1 */ + +static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) +{ + char *page = (char *) __get_free_page(GFP_KERNEL); + char *devname, *dummy; + int err = 0; + if (!page) + return -ENOMEM; + devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE); + if (IS_ERR(devname)) + err = PTR_ERR(devname); + else + seq_escape(m, devname, " \t\n\\"); + free_page((unsigned long)page); + return err; +} + +static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) +{ + seq_puts(m, "/"); + return 0; +} + +/* + * Present statistical information for this VFS mountpoint + */ +static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +{ + int i, cpu; + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct rpc_auth *auth = nfss->client->cl_auth; + struct nfs_iostats totals = { }; + + seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); + + /* + * Display all mount option settings + */ + seq_printf(m, "\n\topts:\t"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + nfs_show_mount_options(m, nfss, 1); + + seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + + seq_printf(m, "\n\tcaps:\t"); + seq_printf(m, "caps=0x%x", nfss->caps); + seq_printf(m, ",wtmult=%u", nfss->wtmult); + seq_printf(m, ",dtsize=%u", nfss->dtsize); + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + +#ifdef CONFIG_NFS_V4 + if (nfss->nfs_client->rpc_ops->version == 4) { + seq_printf(m, "\n\tnfsv4:\t"); + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + show_sessions(m, nfss); + show_pnfs(m, nfss); + } +#endif + + /* + * Display security flavor in effect for this mount + */ + seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor); + if (auth->au_flavor) + seq_printf(m, ",pseudoflavor=%u", auth->au_flavor); + + /* + * Display superblock I/O counters + */ + for_each_possible_cpu(cpu) { + struct nfs_iostats *stats; + + preempt_disable(); + stats = per_cpu_ptr(nfss->io_stats, cpu); + + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + totals.events[i] += stats->events[i]; + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif + + preempt_enable(); + } + + seq_printf(m, "\n\tevents:\t"); + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + seq_printf(m, "%lu ", totals.events[i]); + seq_printf(m, "\n\tbytes:\t"); + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif + seq_printf(m, "\n"); + + rpc_print_iostats(m, nfss->client); + + return 0; +} + +/* + * Begin unmount by attempting to remove all automounted mountpoints we added + * in response to xdev traversals and referrals + */ +static void 
nfs_umount_begin(struct super_block *sb) +{ + struct nfs_server *server; + struct rpc_clnt *rpc; + + server = NFS_SB(sb); + /* -EIO all pending I/O */ + rpc = server->client_acl; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); + rpc = server->client; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); +} + +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) +{ + struct nfs_parsed_mount_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data) { + data->acregmin = NFS_DEF_ACREGMIN; + data->acregmax = NFS_DEF_ACREGMAX; + data->acdirmin = NFS_DEF_ACDIRMIN; + data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; + data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; + data->auth_flavors[0] = RPC_AUTH_UNIX; + data->auth_flavor_len = 1; + data->version = version; + data->minorversion = 0; + security_init_mnt_opts(&data->lsm_opts); + } + return data; +} + +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + +/* + * Sanity-check a server address provided by the mount command. + * + * Address family must be initialized, and address must not be + * the ANY address for that family. + */ +static int nfs_verify_server_address(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sa = (struct sockaddr_in *)addr; + return sa->sin_addr.s_addr != htonl(INADDR_ANY); + } + case AF_INET6: { + struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; + return !ipv6_addr_any(sa); + } + } + + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); + return 0; +} + +/* + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. + */ +static void nfs_set_port(struct sockaddr *sap, int *port, + const unsigned short default_port) +{ + if (*port == NFS_UNSPEC_PORT) + *port = default_port; + + rpc_set_port(sap, *port); +} + +/* + * Sanity check the NFS transport protocol. + * + */ +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); + + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) + return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * Parse the value of the 'sec=' option. 
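+ *
+ * Each value maps to exactly one RPC pseudoflavor, e.g. sec=krb5i selects
+ * RPC_AUTH_GSS_KRB5I. A value matching none of nfs_secflavor_tokens makes
+ * this helper return 0, which fails the whole mount option parse; the
+ * "sloppy" option does not rescue an unrecognized sec= value.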
+ */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + switch (match_token(value, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + mnt->auth_flavors[0] = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + mnt->auth_flavors[0] = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + break; + default: + return 0; + } + + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + return 1; +} + +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = strict_strtoul(string, 10, option); + kfree(string); + + return rc; +} + +/* + * Error-check and convert a string of mount options from user space into + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). 
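+ *
+ * "Skipped" means that an option whose name is not recognized only sets
+ * invalid_option; the parse then fails at the end unless "sloppy" was
+ * given. A recognized option with a malformed value (out_invalid_value)
+ * always fails. Note that nfs_get_option_str() above returns !option,
+ * and since 'option' is the address of the caller's pointer it is never
+ * NULL, so an allocation failure in match_strdup() is reported as
+ * success; this looks like it was meant to be !*option.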
+ */ +static int nfs_parse_mount_options(char *raw, + struct nfs_parsed_mount_data *mnt) +{ + char *p, *string, *secdata; + int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; + + if (!raw) { + dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); + return 1; + } + dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw); + + secdata = alloc_secdata(); + if (!secdata) + goto out_nomem; + + rc = security_sb_copy_data(raw, secdata); + if (rc) + goto out_security_failure; + + rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts); + if (rc) + goto out_security_failure; + + free_secdata(secdata); + + while ((p = strsep(&raw, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + unsigned long option; + int token; + + if (!*p) + continue; + + dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", p); + + token = match_token(p, nfs_mount_option_tokens, args); + switch (token) { + + /* + * boolean options: foo/nofoo + */ + case Opt_soft: + mnt->flags |= NFS_MOUNT_SOFT; + break; + case Opt_hard: + mnt->flags &= ~NFS_MOUNT_SOFT; + break; + case Opt_posix: + mnt->flags |= NFS_MOUNT_POSIX; + break; + case Opt_noposix: + mnt->flags &= ~NFS_MOUNT_POSIX; + break; + case Opt_cto: + mnt->flags &= ~NFS_MOUNT_NOCTO; + break; + case Opt_nocto: + mnt->flags |= NFS_MOUNT_NOCTO; + break; + case Opt_ac: + mnt->flags &= ~NFS_MOUNT_NOAC; + break; + case Opt_noac: + mnt->flags |= NFS_MOUNT_NOAC; + break; + case Opt_lock: + mnt->flags &= ~NFS_MOUNT_NONLM; + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_nolock: + mnt->flags |= NFS_MOUNT_NONLM; + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_v2: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; + break; + case Opt_v3: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case Opt_v4: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; + case Opt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_rdma: + mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(p); + break; + case Opt_acl: + mnt->flags &= ~NFS_MOUNT_NOACL; + break; + case Opt_noacl: + mnt->flags |= NFS_MOUNT_NOACL; + break; + case Opt_rdirplus: + mnt->flags &= ~NFS_MOUNT_NORDIRPLUS; + break; + case Opt_nordirplus: + mnt->flags |= NFS_MOUNT_NORDIRPLUS; + break; + case Opt_sharecache: + mnt->flags &= ~NFS_MOUNT_UNSHARED; + break; + case Opt_nosharecache: + mnt->flags |= NFS_MOUNT_UNSHARED; + break; + case Opt_resvport: + mnt->flags &= ~NFS_MOUNT_NORESVPORT; + break; + case Opt_noresvport: + mnt->flags |= NFS_MOUNT_NORESVPORT; + break; + case Opt_fscache: + mnt->options |= NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_nofscache: + mnt->options &= ~NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + + /* + * options that take numeric values + */ + case Opt_port: + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->nfs_server.port = option; + break; + case Opt_rsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->rsize = option; + break; + case Opt_wsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->wsize = option; + break; + case 
Opt_bsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->bsize = option; + break; + case Opt_timeo: + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->timeo = option; + break; + case Opt_retrans: + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->retrans = option; + break; + case Opt_acregmin: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = option; + break; + case Opt_acregmax: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmax = option; + break; + case Opt_acdirmin: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmin = option; + break; + case Opt_acdirmax: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmax = option; + break; + case Opt_actimeo: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; + break; + case Opt_namelen: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->namlen = option; + break; + case Opt_mountport: + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->mount_server.port = option; + break; + case Opt_mountvers: + if (nfs_get_option_ul(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) + goto out_invalid_value; + mnt->mount_server.version = option; + break; + case Opt_nfsvers: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + switch (option) { + case NFS2_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; + break; + case NFS3_VERSION: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case NFS4_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; + default: + goto out_invalid_value; + } + break; + case Opt_minorversion: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; + break; + + /* + * options that take text values + */ + case Opt_sec: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = nfs_parse_security_flavors(string, mnt); + kfree(string); + if (!rc) { + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); + return 0; + } + break; + case Opt_proto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + + protofamily = AF_INET; + switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; + case Opt_xprt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; + case Opt_xprt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: + /* vector side protocols to TCP */ + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(string); + break; + default: + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + kfree(string); + return 0; + } + kfree(string); + break; + case Opt_mountproto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + kfree(string); + + mountfamily = AF_INET; + switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; + case Opt_xprt_udp: + 
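+			/*
+			 * Deliberate fall-through: the *6 cases above only
+			 * record the IPv6 address family and then share the
+			 * transport assignment with their IPv4 counterparts,
+			 * the same pattern as in the Opt_proto handling.
+			 */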
mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; + case Opt_xprt_tcp: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: /* not used for side protocols */ + default: + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + return 0; + } + break; + case Opt_addr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->nfs_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); + kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; + break; + case Opt_clientaddr: + if (nfs_get_option_str(args, &mnt->client_address)) + goto out_nomem; + break; + case Opt_mounthost: + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) + goto out_nomem; + break; + case Opt_mountaddr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->mount_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); + kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; + break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + return 0; + }; + break; + case Opt_fscache_uniq: + if (nfs_get_option_str(args, &mnt->fscache_uniq)) + goto out_nomem; + mnt->options |= NFS_OPTION_FSCACHE; + break; + case Opt_local_lock: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, nfs_local_lock_tokens, + args); + kfree(string); + switch (token) { + case Opt_local_lock_all: + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "local_lock argument\n"); + return 0; + }; + break; + + /* + * Special options + */ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; + case Opt_userspace: + case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); + break; + + default: + invalid_option = 1; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); + } + } + + if (!sloppy && invalid_option) + return 0; + + /* + * verify that any proto=/mountproto= options match the address + * familiies in the addr=/mountaddr= options. 
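+ *
+ * For example, proto=tcp6 combined with an addr= that holds an IPv4
+ * server address fails with "server address does not match proto=
+ * option". mountproto= is checked against mountaddr= when one was
+ * given, otherwise against addr=.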
+ */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } + + return 1; + +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; +out_invalid_value: + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); + return 0; +out_nomem: + printk(KERN_INFO "NFS: not enough memory to parse option\n"); + return 0; +out_security_failure: + free_secdata(secdata); + printk(KERN_INFO "NFS: security options invalid: %d\n", rc); + return 0; +} + +/* + * Match the requested auth flavors with the list returned by + * the server. Returns zero and sets the mount's authentication + * flavor on success; returns -EACCES if server does not support + * the requested flavor. + */ +static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) +{ + unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + + /* + * Certain releases of Linux's mountd return an empty + * flavor list. To prevent behavioral regression with + * these servers (ie. rejecting mounts that used to + * succeed), revert to pre-2.6.32 behavior (no checking) + * if the returned flavor list is empty. + */ + if (server_authlist_len == 0) + return 0; + + /* + * We avoid sophisticated negotiating here, as there are + * plenty of cases where we can get it wrong, providing + * either too little or too much security. + * + * RFC 2623, section 2.7 suggests we SHOULD prefer the + * flavor listed first. However, some servers list + * AUTH_NULL first. Our caller plants AUTH_SYS, the + * preferred default, in args->auth_flavors[0] if user + * didn't specify sec= mount option. + */ + for (i = 0; i < args->auth_flavor_len; i++) + for (j = 0; j < server_authlist_len; j++) + if (args->auth_flavors[i] == request->auth_flavs[j]) { + dfprintk(MOUNT, "NFS: using auth flavor %d\n", + request->auth_flavs[j]); + args->auth_flavors[0] = request->auth_flavs[j]; + return 0; + } + + dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); + nfs_umount(request); + return -EACCES; +} + +/* + * Use the remote server's MOUNT service to request the NFS file handle + * corresponding to the provided path. 
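+ *
+ * In outline: pick the MOUNT protocol version from the NFS version when
+ * mountvers= was not given (v2 -> MNTv1, otherwise MNTv3), fall back to
+ * the NFS server's address when no mountaddr= was supplied, issue the
+ * MNT call, and for MNTv3 check the returned auth flavor list against
+ * the flavor requested on the command line.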
+ */ +static int nfs_try_mount(struct nfs_parsed_mount_data *args, + struct nfs_fh *root_fh) +{ + rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; + unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); + struct nfs_mount_request request = { + .sap = (struct sockaddr *) + &args->mount_server.address, + .dirpath = args->nfs_server.export_path, + .protocol = args->mount_server.protocol, + .fh = root_fh, + .noresvport = args->flags & NFS_MOUNT_NORESVPORT, + .auth_flav_len = &server_authlist_len, + .auth_flavs = server_authlist, + }; + int status; + + if (args->mount_server.version == 0) { + switch (args->version) { + default: + args->mount_server.version = NFS_MNT3_VERSION; + break; + case 2: + args->mount_server.version = NFS_MNT_VERSION; + } + } + request.version = args->mount_server.version; + + if (args->mount_server.hostname) + request.hostname = args->mount_server.hostname; + else + request.hostname = args->nfs_server.hostname; + + /* + * Construct the mount server's address. + */ + if (args->mount_server.address.ss_family == AF_UNSPEC) { + memcpy(request.sap, &args->nfs_server.address, + args->nfs_server.addrlen); + args->mount_server.addrlen = args->nfs_server.addrlen; + } + request.salen = args->mount_server.addrlen; + nfs_set_port(request.sap, &args->mount_server.port, 0); + + /* + * Now ask the mount server to map our export path + * to a file handle. + */ + status = nfs_mount(&request); + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } + + /* + * MNTv1 (NFSv2) does not support auth flavor negotiation. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + return 0; + return nfs_walk_authlist(args, &request); +} + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *end; + + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') + goto out_bad_devname; + + len = end - dev_name; + end++; + } else { + char *comma; + + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; + + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } + + if (len > maxnamlen) + goto out_hostname; + + /* N.B. 
caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + len = strlen(++end); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Validate the NFS2/NFS3 mount data + * - fills in the mount root filehandle + * + * For option strings, user space handles the following behaviors: + * + * + DNS: mapping server host name to IP address ("addr=" option) + * + * + failure mode: how to behave if a mount request can't be handled + * immediately ("fg/bg" option) + * + * + retry: how often to retry a mount request ("retry=" option) + * + * + breaking back: trying proto=udp after proto=tcp, v2 after v3, + * mountproto=tcp after mountproto=udp, and so on + */ +static int nfs_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + if (data == NULL) + goto out_no_data; + + switch (data->version) { + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) + goto out_no_v3; + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) + goto out_no_sec; + case 5: + memset(data->context, 0, sizeof(data->context)); + case 6: + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; + mntfh->size = data->root.size; + args->version = 3; + } else { + mntfh->size = NFS2_FHSIZE; + args->version = 2; + } + + + memcpy(mntfh->data, data->root.data, mntfh->size); + if (mntfh->size < sizeof(mntfh->data)) + memset(mntfh->data + mntfh->size, 0, + sizeof(mntfh->data) - mntfh->size); + + /* + * Translate to nfs_parsed_mount_data, which nfs_fill_super + * can deal with. + */ + args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= NFS_MOUNT_LEGACY_INTERFACE; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + + memcpy(sap, &data->addr, sizeof(data->addr)); + args->nfs_server.addrlen = sizeof(data->addr); + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (!(data->flags & NFS_MOUNT_TCP)) + args->nfs_server.protocol = XPRT_TRANSPORT_UDP; + /* N.B. 
caller will free nfs_server.hostname in all cases */ + args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); + args->namlen = data->namlen; + args->bsize = data->bsize; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->auth_flavors[0] = data->pseudoflavor; + if (!args->nfs_server.hostname) + goto out_nomem; + + if (!(data->flags & NFS_MOUNT_NONLM)) + args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + args->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + /* + * The legacy version 6 binary mount data from userspace has a + * field used only to transport selinux information into the + * the kernel. To continue to support that functionality we + * have a touch of selinux knowledge here in the NFS code. The + * userspace code converted context=blah to just blah so we are + * converting back to the full string selinux understands. + */ + if (data->context[0]){ +#ifdef CONFIG_SECURITY_SELINUX + int rc; + char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL); + if (!opts_str) + return -ENOMEM; + strcpy(opts_str, "context="); + data->context[NFS_MAX_CONTEXT_LEN] = '\0'; + strcat(opts_str, &data->context[0]); + rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts); + kfree(opts_str); + if (rc) + return rc; +#else + return -EINVAL; +#endif + } + + break; + default: { + int status; + + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (args->version == 4) +#ifdef CONFIG_NFS_V4 + return nfs4_validate_text_mount_data(options, + args, dev_name); +#else + goto out_v4_not_compiled; +#endif + + nfs_set_port(sap, &args->nfs_server.port, 0); + + nfs_set_mount_transport_protocol(args); + + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + PAGE_SIZE, + &args->nfs_server.export_path, + NFS_MAXPATHLEN); + if (!status) + status = nfs_try_mount(args, mntfh); + + kfree(args->nfs_server.export_path); + args->nfs_server.export_path = NULL; + + if (status) + return status; + + break; + } + } + +#ifndef CONFIG_NFS_V3 + if (args->version == 3) + goto out_v3_not_compiled; +#endif /* !CONFIG_NFS_V3 */ + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_no_v3: + dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n", + data->version); + return -EINVAL; + +out_no_sec: + dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); + return -EINVAL; + +#ifndef CONFIG_NFS_V3 +out_v3_not_compiled: + dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V3 */ + +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); + return -ENOMEM; + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; + +out_invalid_fh: + dfprintk(MOUNT, "NFS: invalid root filehandle\n"); + return -EINVAL; +} + +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if (data->flags != nfss->flags || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->retrans != nfss->client->cl_timeout->to_retries || + data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->acregmin != nfss->acregmin / HZ || + 
data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) + return -EINVAL; + + return 0; +} + +static int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. + */ + if ((nfsvers == 4 && (!options4 || options4->version == 1)) || + (nfsvers <= 3 && (!options || (options->version >= 1 && + options->version <= 6)))) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = nfs_parse_mount_options((char *)options, data); + if (error < 0) + goto out; + + /* + * noac is a special case. It implies -o sync, but that's not + * necessarily reflected in the mtab options. do_remount_sb + * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * remount options, so we have to explicitly reset it. 
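+ *
+ * In other words, "mount -o remount,noac" must end up with
+ * MS_SYNCHRONOUS set on the superblock even though "sync" never
+ * appeared in the remount string, so it is forced back on below.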
+ */ + if (data->flags & NFS_MOUNT_NOAC) + *flags |= MS_SYNCHRONOUS; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} + +/* + * Initialise the common bits of the superblock + */ +static inline void nfs_initialise_sb(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_magic = NFS_SUPER_MAGIC; + + /* We probably want something more informative here */ + snprintf(sb->s_id, sizeof(sb->s_id), + "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + + if (sb->s_blocksize == 0) + sb->s_blocksize = nfs_block_bits(server->wsize, + &sb->s_blocksize_bits); + + if (server->flags & NFS_MOUNT_NOAC) + sb->s_flags |= MS_SYNCHRONOUS; + + sb->s_bdi = &server->backing_dev_info; + + nfs_super_set_maxbytes(sb, server->maxfilesize); +} + +/* + * Finish setting up an NFS2/3 superblock + */ +static void nfs_fill_super(struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_blocksize_bits = 0; + sb->s_blocksize = 0; + if (data->bsize) + sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); + + if (server->nfs_client->rpc_ops->version == 3) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_time_gran = 1; + } + + sb->s_op = &nfs_sops; + nfs_initialise_sb(sb); +} + +/* + * Finish setting up a cloned NFS2/3 superblock + */ +static void nfs_clone_super(struct super_block *sb, + const struct super_block *old_sb) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; + + if (server->nfs_client->rpc_ops->version == 3) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. 
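+	 * Setting MS_POSIXACL below is what makes the VFS skip applying
+	 * the umask at create time; the NFS code applies it itself when
+	 * necessary instead.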
+ */ + sb->s_flags |= MS_POSIXACL; + sb->s_time_gran = 1; + } + + sb->s_op = old_sb->s_op; + nfs_initialise_sb(sb); +} + +static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) +{ + const struct nfs_server *a = s->s_fs_info; + const struct rpc_clnt *clnt_a = a->client; + const struct rpc_clnt *clnt_b = b->client; + + if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK)) + goto Ebusy; + if (a->nfs_client != b->nfs_client) + goto Ebusy; + if (a->flags != b->flags) + goto Ebusy; + if (a->wsize != b->wsize) + goto Ebusy; + if (a->rsize != b->rsize) + goto Ebusy; + if (a->acregmin != b->acregmin) + goto Ebusy; + if (a->acregmax != b->acregmax) + goto Ebusy; + if (a->acdirmin != b->acdirmin) + goto Ebusy; + if (a->acdirmax != b->acdirmax) + goto Ebusy; + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + goto Ebusy; + return 1; +Ebusy: + return 0; +} + +struct nfs_sb_mountdata { + struct nfs_server *server; + int mntflags; +}; + +static int nfs_set_super(struct super_block *s, void *data) +{ + struct nfs_sb_mountdata *sb_mntdata = data; + struct nfs_server *server = sb_mntdata->server; + int ret; + + s->s_flags = sb_mntdata->mntflags; + s->s_fs_info = server; + s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + ret = set_anon_super(s, server); + if (ret == 0) + server->s_dev = s->s_dev; + return ret; +} + +static int nfs_compare_super_address(struct nfs_server *server1, + struct nfs_server *server2) +{ + struct sockaddr *sap1, *sap2; + + sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; + sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; + + if (sap1->sa_family != sap2->sa_family) + return 0; + + switch (sap1->sa_family) { + case AF_INET: { + struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1; + struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2; + if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr) + return 0; + if (sin1->sin_port != sin2->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1; + struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2; + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return 0; + if (sin1->sin6_port != sin2->sin6_port) + return 0; + break; + } + default: + return 0; + } + + return 1; +} + +static int nfs_compare_super(struct super_block *sb, void *data) +{ + struct nfs_sb_mountdata *sb_mntdata = data; + struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); + int mntflags = sb_mntdata->mntflags; + + if (!nfs_compare_super_address(old, server)) + return 0; + /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ + if (old->flags & NFS_MOUNT_UNSHARED) + return 0; + if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) + return 0; + return nfs_compare_mount_options(sb, server, mntflags); +} + +static int nfs_bdi_register(struct nfs_server *server) +{ + return bdi_register_dev(&server->backing_dev_info, server->s_dev); +} + +static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_server *server = NULL; + struct super_block *s; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error; + + data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); + mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + 
goto out; + + /* Validate the mount data */ + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); + if (error < 0) { + mntroot = ERR_PTR(error); + goto out; + } + +#ifdef CONFIG_NFS_V4 + if (data->version == 4) { + mntroot = nfs4_try_mount(flags, dev_name, data); + goto out; + } +#endif /* CONFIG_NFS_V4 */ + + /* Get a volume representation */ + server = nfs_create_server(data, mntfh); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + sb_mntdata.server = server; + + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + mntroot = ERR_CAST(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) { + mntroot = ERR_PTR(error); + goto error_splat_bdi; + } + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs_fill_super(s, data); + nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL); + } + + mntroot = nfs_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) + goto error_splat_super; + + error = security_sb_set_mnt_opts(s, &data->lsm_opts); + if (error) + goto error_splat_root; + + s->s_flags |= MS_ACTIVE; + +out: + nfs_free_parsed_mount_data(data); + nfs_free_fhandle(mntfh); + return mntroot; + +out_err_nosb: + nfs_free_server(server); + goto out; + +error_splat_root: + dput(mntroot); + mntroot = ERR_PTR(error); +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + goto out; +} + +/* + * Ensure that we unregister the bdi before kill_anon_super + * releases the device name + */ +static void nfs_put_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + bdi_unregister(&server->backing_dev_info); +} + +/* + * Destroy an NFS2/3 superblock + */ +static void nfs_kill_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + kill_anon_super(s); + nfs_fscache_release_super_cookie(s); + nfs_free_server(server); +} + +/* + * Clone an NFS2/3 server record on xdev traversal (FSID-change) + */ +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error; + + dprintk("--> nfs_xdev_mount()\n"); + + /* create a new volume representation */ + server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + sb_mntdata.server = server; + + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs_clone_super(s, data->sb); + nfs_fscache_get_super_cookie(s, NULL, data); + 
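+		/*
+		 * If sget() instead returned an already-active superblock
+		 * (s->s_root set), this clone and cache-cookie setup is
+		 * skipped and the freshly created server record was freed
+		 * further up.
+		 */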
} + + mntroot = nfs_get_root(s, data->fh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } + + s->s_flags |= MS_ACTIVE; + + /* clone any lsm security options from the parent to the new sb */ + security_sb_clone_mnt_opts(data->sb, s); + + dprintk("<-- nfs_xdev_mount() = 0\n"); + return mntroot; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); + +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); +} + +#ifdef CONFIG_NFS_V4 + +/* + * Finish setting up a cloned NFS4 superblock + */ +static void nfs4_clone_super(struct super_block *sb, + const struct super_block *old_sb) +{ + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; + sb->s_time_gran = 1; + sb->s_op = old_sb->s_op; + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = old_sb->s_xattr; + nfs_initialise_sb(sb); +} + +/* + * Set up an NFS4 superblock + */ +static void nfs4_fill_super(struct super_block *sb) +{ + sb->s_time_gran = 1; + sb->s_op = &nfs4_sops; + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = nfs4_xattr_handlers; + nfs_initialise_sb(sb); +} + +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) +{ + args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| + NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); +} + +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); + + nfs_validate_transport_protocol(args); + + nfs4_validate_mount_flags(args); + + if (args->version != 4) { + dfprintk(MOUNT, + "NFS4: Illegal mount version\n"); + return -EINVAL; + } + + if (args->auth_flavor_len > 1) { + dfprintk(MOUNT, + "NFS4: Too many RPC auth flavours specified\n"); + return -EINVAL; + } + + if (args->client_address == NULL) { + dfprintk(MOUNT, + "NFS4: mount program didn't pass callback address\n"); + return -EINVAL; + } + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); +} + +/* + * Validate NFSv4 mount options + */ +static int nfs4_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; + char *c; + + if (data == NULL) + goto out_no_data; + + switch (data->version) { + case 1: + if (data->host_addrlen > sizeof(args->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + args->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if 
(data->auth_flavourlen) { + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&args->auth_flavors[0], + data->auth_flavours, + sizeof(args->auth_flavors[0]))) + return -EFAULT; + } + + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + args->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + args->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + args->client_address = c; + + /* + * Translate to nfs_parsed_mount_data, which nfs4_fill_super + * can deal with. + */ + + args->flags = data->flags & NFS4_MOUNT_FLAGMASK; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); + + break; + default: + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + return -EINVAL; + + return nfs4_validate_text_mount_data(options, args, dev_name); + } + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_inval_auth: + dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n", + data->auth_flavourlen); + return -EINVAL; + +out_no_address: + dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); + return -EINVAL; +} + +/* + * Get the superblock for the NFS4 root partition + */ +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_parsed_mount_data *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct nfs_fh *mntfh; + struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error = -ENOMEM; + + mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + goto out; + + /* Get a volume representation */ + server = nfs4_create_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out; + } + sb_mntdata.server = server; + + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_free; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs4_fill_super(s); + nfs_fscache_get_super_cookie( + s, data ? 
data->fscache_uniq : NULL, NULL); + } + + mntroot = nfs4_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + + error = security_sb_set_mnt_opts(s, &data->lsm_opts); + if (error) + goto error_splat_root; + + s->s_flags |= MS_ACTIVE; + + nfs_free_fhandle(mntfh); + return mntroot; + +out: + nfs_free_fhandle(mntfh); + return ERR_PTR(error); + +out_free: + nfs_free_server(server); + goto out; + +error_splat_root: + dput(mntroot); +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + goto out; +} + +static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, + int flags, void *data, const char *hostname) +{ + struct vfsmount *root_mnt; + char *root_devname; + size_t len; + + len = strlen(hostname) + 5; + root_devname = kmalloc(len, GFP_KERNEL); + if (root_devname == NULL) + return ERR_PTR(-ENOMEM); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); + root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); + kfree(root_devname); + return root_mnt; +} + +struct nfs_referral_count { + struct list_head list; + const struct task_struct *task; + unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ + struct nfs_referral_count *p; + + list_for_each_entry(p, &nfs_referral_count_list, list) { + if (p->task == current) + return p; + } + return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ + struct nfs_referral_count *p, *new; + int ret = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + new->task = current; + new->referral_count = 1; + + ret = 0; + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + if (p != NULL) { + if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) + ret = -ELOOP; + else + p->referral_count++; + } else { + list_add(&new->list, &nfs_referral_count_list); + new = NULL; + } + spin_unlock(&nfs_referral_count_list_lock); + kfree(new); +out: + return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ + struct nfs_referral_count *p; + + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + p->referral_count--; + if (p->referral_count == 0) + list_del(&p->list); + else + p = NULL; + spin_unlock(&nfs_referral_count_list_lock); + kfree(p); +} + +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path) +{ + struct nameidata *nd = NULL; + struct mnt_namespace *ns_private; + struct super_block *s; + struct dentry *dentry; + int ret; + + nd = kmalloc(sizeof(*nd), GFP_KERNEL); + if (nd == NULL) + return ERR_PTR(-ENOMEM); + + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + + ret = nfs_referral_loop_protect(); + if (ret != 0) + goto out_put_mnt_ns; + + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, + export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, nd); + + nfs_referral_loop_unprotect(); + put_mnt_ns(ns_private); + + if (ret != 0) + goto out_err; + + s = nd->path.mnt->mnt_sb; + atomic_inc(&s->s_active); + dentry = dget(nd->path.dentry); + + path_put(&nd->path); + kfree(nd); + down_write(&s->s_umount); + 
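/* return with s_umount held: the VFS drops it once ->mount() completes */ +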
return dentry; +out_put_mnt_ns: + put_mnt_ns(ns_private); +out_mntput: + mntput(root_mnt); +out_err: + kfree(nd); + return ERR_PTR(ret); +} + +static struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data) +{ + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + res = ERR_CAST(root_mnt); + if (!IS_ERR(root_mnt)) + res = nfs_follow_remote_path(root_mnt, export_path); + + dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? " [error]" : ""); + return res; +} + +/* + * Get the superblock for an NFS4 mountpoint + */ +static struct dentry *nfs4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_parsed_mount_data *data; + int error = -ENOMEM; + struct dentry *res = ERR_PTR(-ENOMEM); + + data = nfs_alloc_parsed_mount_data(4); + if (data == NULL) + goto out; + + /* Validate the mount data */ + error = nfs4_validate_mount_data(raw_data, data, dev_name); + if (error < 0) { + res = ERR_PTR(error); + goto out; + } + + res = nfs4_try_mount(flags, dev_name, data); + if (IS_ERR(res)) + error = PTR_ERR(res); + +out: + nfs_free_parsed_mount_data(data); + dprintk("<-- nfs4_mount() = %d%s\n", error, + error != 0 ? " [error]" : ""); + return res; +} + +static void nfs4_kill_super(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + dprintk("--> %s\n", __func__); + nfs_super_return_all_delegations(sb); + kill_anon_super(sb); + nfs_fscache_release_super_cookie(sb); + nfs_free_server(server); + dprintk("<-- %s\n", __func__); +} + +/* + * Clone an NFS4 server record on xdev traversal (FSID-change) + */ +static struct dentry * +nfs4_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error; + + dprintk("--> nfs4_xdev_mount()\n"); + + /* create a new volume representation */ + server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + sb_mntdata.server = server; + + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs4_clone_super(s, data->sb); + nfs_fscache_get_super_cookie(s, NULL, data); + } + + mntroot = nfs4_get_root(s, data->fh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } + + s->s_flags |= MS_ACTIVE; 
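+ /* clone any lsm security options from the parent to the new sb */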
+ + security_sb_clone_mnt_opts(data->sb, s); + + dprintk("<-- nfs4_xdev_mount() = 0\n"); + return mntroot; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); + +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); +} + +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + struct nfs_fh *mntfh; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error = -ENOMEM; + + dprintk("--> nfs4_referral_get_sb()\n"); + + mntfh = nfs_alloc_fhandle(); + if (mntfh == NULL) + goto out_err_nofh; + + /* create a new volume representation */ + server = nfs4_create_referral_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + sb_mntdata.server = server; + + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, NULL, data); + } + + mntroot = nfs4_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } + + s->s_flags |= MS_ACTIVE; + + security_sb_clone_mnt_opts(data->sb, s); + + nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); + return mntroot; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + nfs_free_fhandle(mntfh); +out_err_nofh: + dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); + return ERR_PTR(error); + +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 server record on referral traversal + */ +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dprintk("--> nfs4_referral_mount()\n"); + + export_path = data->mnt_path; + data->mnt_path = "/"; + + root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, + flags, data, data->hostname); + data->mnt_path = export_path; + + res = ERR_CAST(root_mnt); + if (!IS_ERR(root_mnt)) + res = nfs_follow_remote_path(root_mnt, export_path); + dprintk("<-- nfs4_referral_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? 
" [error]" : ""); + return res; +} + +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c new file mode 100644 index 00000000..05c9e02f --- /dev/null +++ b/fs/nfs/symlink.c @@ -0,0 +1,78 @@ +/* + * linux/fs/nfs/symlink.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * nfs symlink handling code + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Symlink caching in the page cache is even more simplistic + * and straight-forward than readdir caching. + */ + +static int nfs_symlink_filler(struct inode *inode, struct page *page) +{ + int error; + + error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE); + if (error < 0) + goto error; + SetPageUptodate(page); + unlock_page(page); + return 0; + +error: + SetPageError(page); + unlock_page(page); + return -EIO; +} + +static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct page *page; + void *err; + + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + if (err) + goto read_failed; + page = read_cache_page(&inode->i_data, 0, + (filler_t *)nfs_symlink_filler, inode); + if (IS_ERR(page)) { + err = page; + goto read_failed; + } + nd_set_link(nd, kmap(page)); + return page; + +read_failed: + nd_set_link(nd, err); + return NULL; +} + +/* + * symlinks can't do much... + */ +const struct inode_operations nfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = nfs_follow_link, + .put_link = page_put_link, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c new file mode 100644 index 00000000..978aaeb8 --- /dev/null +++ b/fs/nfs/sysctl.c @@ -0,0 +1,92 @@ +/* + * linux/fs/nfs/sysctl.c + * + * Sysctl interface to NFS parameters + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "callback.h" + +#ifdef CONFIG_NFS_V4 +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +#endif +static struct ctl_table_header *nfs_callback_sysctl_table; + +static ctl_table nfs_cb_sysctls[] = { +#ifdef CONFIG_NFS_V4 + { + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, +#ifndef CONFIG_NFS_USE_NEW_IDMAPPER + { + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ +#endif + { + .procname = "nfs_mountpoint_timeout", + .data = &nfs_mountpoint_expiry_timeout, + .maxlen = sizeof(nfs_mountpoint_expiry_timeout), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nfs_congestion_kb", + .data = &nfs_congestion_kb, + .maxlen = sizeof(nfs_congestion_kb), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static ctl_table nfs_cb_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nfs_cb_sysctls, + }, + { } +}; + +static ctl_table nfs_cb_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nfs_cb_sysctl_dir, + }, + { } +}; + +int nfs_register_sysctl(void) +{ + nfs_callback_sysctl_table = 
register_sysctl_table(nfs_cb_sysctl_root); + if (nfs_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs_callback_sysctl_table); + nfs_callback_sysctl_table = NULL; +} diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c new file mode 100644 index 00000000..8d6864c2 --- /dev/null +++ b/fs/nfs/unlink.c @@ -0,0 +1,580 @@ +/* + * linux/fs/nfs/unlink.c + * + * nfs sillydelete handling + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "nfs4_fs.h" +#include "iostat.h" +#include "delegation.h" + +struct nfs_unlinkdata { + struct hlist_node list; + struct nfs_removeargs args; + struct nfs_removeres res; + struct inode *dir; + struct rpc_cred *cred; + struct nfs_fattr dir_attr; +}; + +/** + * nfs_free_unlinkdata - release data from a sillydelete operation. + * @data: pointer to unlink structure. + */ +static void +nfs_free_unlinkdata(struct nfs_unlinkdata *data) +{ + iput(data->dir); + put_rpccred(data->cred); + kfree(data->args.name.name); + kfree(data); +} + +#define NAME_ALLOC_LEN(len) ((len+16) & ~15) +/** + * nfs_copy_dname - copy dentry name to data structure + * @dentry: pointer to dentry + * @data: nfs_unlinkdata + */ +static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + char *str; + int len = dentry->d_name.len; + + str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL); + if (!str) + return -ENOMEM; + data->args.name.len = len; + data->args.name.name = str; + return 0; +} + +static void nfs_free_dname(struct nfs_unlinkdata *data) +{ + kfree(data->args.name.name); + data->args.name.name = NULL; + data->args.name.len = 0; +} + +static void nfs_dec_sillycount(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + if (atomic_dec_return(&nfsi->silly_count) == 1) + wake_up(&nfsi->waitqueue); +} + +/** + * nfs_async_unlink_done - Sillydelete post-processing + * @task: rpc_task of the sillydelete + * + * Do the directory attribute update. + */ +static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct inode *dir = data->dir; + + if (!NFS_PROTO(dir)->unlink_done(task, dir)) + nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); +} + +/** + * nfs_async_unlink_release - Release the sillydelete data. + * @task: rpc_task of the sillydelete + * + * We need to call nfs_put_unlinkdata as a 'tk_release' task since the + * rpc_task would be freed too. 
+ */ +static void nfs_async_unlink_release(void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct super_block *sb = data->dir->i_sb; + + nfs_dec_sillycount(data->dir); + nfs_free_unlinkdata(data); + nfs_sb_deactive(sb); +} + +#if defined(CONFIG_NFS_V4_1) +void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_unlink_ops = { + .rpc_call_done = nfs_async_unlink_done, + .rpc_release = nfs_async_unlink_release, +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_unlink_prepare, +#endif /* CONFIG_NFS_V4_1 */ +}; + +static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) +{ + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_unlink_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + struct dentry *alias; + + alias = d_lookup(parent, &data->args.name); + if (alias != NULL) { + int ret = 0; + void *devname_garbage = NULL; + + /* + * Hey, we raced with lookup... See if we need to transfer + * the sillyrename information to the aliased dentry. + */ + nfs_free_dname(data); + spin_lock(&alias->d_lock); + if (alias->d_inode != NULL && + !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { + devname_garbage = alias->d_fsdata; + alias->d_fsdata = data; + alias->d_flags |= DCACHE_NFSFS_RENAMED; + ret = 1; + } + spin_unlock(&alias->d_lock); + nfs_dec_sillycount(dir); + dput(alias); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. 
+ */ + if (devname_garbage) + kfree(devname_garbage); + return ret; + } + data->dir = igrab(dir); + if (!data->dir) { + nfs_dec_sillycount(dir); + return 0; + } + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); + nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + + task_setup_data.rpc_client = NFS_CLIENT(dir); + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task_async(task); + return 1; +} + +static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + struct dentry *parent; + struct inode *dir; + int ret = 0; + + + parent = dget_parent(dentry); + if (parent == NULL) + goto out_free; + dir = parent->d_inode; + if (nfs_copy_dname(dentry, data) != 0) + goto out_dput; + /* Non-exclusive lock protects against concurrent lookup() calls */ + spin_lock(&dir->i_lock); + if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { + /* Deferred delete */ + hlist_add_head(&data->list, &NFS_I(dir)->silly_list); + spin_unlock(&dir->i_lock); + ret = 1; + goto out_dput; + } + spin_unlock(&dir->i_lock); + ret = nfs_do_call_unlink(parent, dir, data); +out_dput: + dput(parent); +out_free: + return ret; +} + +void nfs_block_sillyrename(struct dentry *dentry) +{ + struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + + wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); +} + +void nfs_unblock_sillyrename(struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_unlinkdata *data; + + atomic_inc(&nfsi->silly_count); + spin_lock(&dir->i_lock); + while (!hlist_empty(&nfsi->silly_list)) { + if (!atomic_inc_not_zero(&nfsi->silly_count)) + break; + data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list); + hlist_del(&data->list); + spin_unlock(&dir->i_lock); + if (nfs_do_call_unlink(dentry, dir, data) == 0) + nfs_free_unlinkdata(data); + spin_lock(&dir->i_lock); + } + spin_unlock(&dir->i_lock); +} + +/** + * nfs_async_unlink - asynchronous unlinking of a file + * @dir: parent directory of dentry + * @dentry: dentry to unlink + */ +static int +nfs_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nfs_unlinkdata *data; + int status = -ENOMEM; + void *devname_garbage = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + goto out; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + status = PTR_ERR(data->cred); + goto out_free; + } + data->res.dir_attr = &data->dir_attr; + + status = -EBUSY; + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out_unlock; + dentry->d_flags |= DCACHE_NFSFS_RENAMED; + devname_garbage = dentry->d_fsdata; + dentry->d_fsdata = data; + spin_unlock(&dentry->d_lock); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + if (devname_garbage) + kfree(devname_garbage); + return 0; +out_unlock: + spin_unlock(&dentry->d_lock); + put_rpccred(data->cred); +out_free: + kfree(data); +out: + return status; +} + +/** + * nfs_complete_unlink - Initialize completion of the sillydelete + * @dentry: dentry to delete + * @inode: inode + * + * Since we're most likely to be called by dentry_iput(), we + * only use the dentry to find the sillydelete. We then copy the name + * into the qstr. 
+ */ +void +nfs_complete_unlink(struct dentry *dentry, struct inode *inode) +{ + struct nfs_unlinkdata *data = NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + data = dentry->d_fsdata; + dentry->d_fsdata = NULL; + } + spin_unlock(&dentry->d_lock); + + if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) + nfs_free_unlinkdata(data); +} + +/* Cancel a queued async unlink. Called when a sillyrename run fails. */ +static void +nfs_cancel_async_unlink(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + struct nfs_unlinkdata *data = dentry->d_fsdata; + + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + nfs_free_unlinkdata(data); + return; + } + spin_unlock(&dentry->d_lock); +} + +struct nfs_renamedata { + struct nfs_renameargs args; + struct nfs_renameres res; + struct rpc_cred *cred; + struct inode *old_dir; + struct dentry *old_dentry; + struct nfs_fattr old_fattr; + struct inode *new_dir; + struct dentry *new_dentry; + struct nfs_fattr new_fattr; +}; + +/** + * nfs_async_rename_done - Sillyrename post-processing + * @task: rpc_task of the sillyrename + * @calldata: nfs_renamedata for the sillyrename + * + * Do the directory attribute updates and the d_move + */ +static void nfs_async_rename_done(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct inode *old_dir = data->old_dir; + struct inode *new_dir = data->new_dir; + + if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { + nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + return; + } + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(data->old_dentry); + return; + } + + nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); + d_move(data->old_dentry, data->new_dentry); +} + +/** + * nfs_async_rename_release - Release the sillyrename data. 
+ * @calldata: the struct nfs_renamedata to be released + */ +static void nfs_async_rename_release(void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct super_block *sb = data->old_dir->i_sb; + + if (data->old_dentry->d_inode) + nfs_mark_for_revalidate(data->old_dentry->d_inode); + + dput(data->old_dentry); + dput(data->new_dentry); + iput(data->old_dir); + iput(data->new_dir); + nfs_sb_deactive(sb); + put_rpccred(data->cred); + kfree(data); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs_rename_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->old_dir); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_rename_ops = { + .rpc_call_done = nfs_async_rename_done, + .rpc_release = nfs_async_rename_release, +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_rename_prepare, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/** + * nfs_async_rename - perform an asynchronous rename operation + * @old_dir: directory that currently holds the dentry to be renamed + * @new_dir: target directory for the rename + * @old_dentry: original dentry to be renamed + * @new_dentry: dentry to which the old_dentry should be renamed + * + * It's expected that valid references to the dentries and inodes are held + */ +static struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct nfs_renamedata *data; + struct rpc_message msg = { }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_rename_ops, + .workqueue = nfsiod_workqueue, + .rpc_client = NFS_CLIENT(old_dir), + .flags = RPC_TASK_ASYNC, + }; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return ERR_PTR(-ENOMEM); + task_setup_data.callback_data = data; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + struct rpc_task *task = ERR_CAST(data->cred); + kfree(data); + return task; + } + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + /* set up nfs_renamedata */ + data->old_dir = old_dir; + ihold(old_dir); + data->new_dir = new_dir; + ihold(new_dir); + data->old_dentry = dget(old_dentry); + data->new_dentry = dget(new_dentry); + nfs_fattr_init(&data->old_fattr); + nfs_fattr_init(&data->new_fattr); + + /* set up nfs_renameargs */ + data->args.old_dir = NFS_FH(old_dir); + data->args.old_name = &old_dentry->d_name; + data->args.new_dir = NFS_FH(new_dir); + data->args.new_name = &new_dentry->d_name; + + /* set up nfs_renameres */ + data->res.old_fattr = &data->old_fattr; + data->res.new_fattr = &data->new_fattr; + + nfs_sb_active(old_dir->i_sb); + + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + + return rpc_run_task(&task_setup_data); +} + +/** + * nfs_sillyrename - Perform a silly-rename of a dentry + * @dir: inode of directory that contains dentry + * @dentry: dentry to be sillyrenamed + * + * NFSv2/3 is stateless and the server doesn't know when the client is + * holding a file open. To prevent application problems when a file is + * unlinked while it's still open, the client performs a "silly-rename". + * That is, it renames the file to a hidden file in the same directory, + * and only performs the unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. 
+ */ +int +nfs_sillyrename(struct inode *dir, struct dentry *dentry) +{ + static unsigned int sillycounter; + const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; + const int countersize = sizeof(sillycounter)*2; + const int slen = sizeof(".nfs")+fileidsize+countersize-1; + char silly[slen+1]; + struct dentry *sdentry; + struct rpc_task *task; + int error = -EIO; + + dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + dentry->d_count); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + + /* + * We don't allow a dentry to be silly-renamed twice. + */ + error = -EBUSY; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out; + + sprintf(silly, ".nfs%*.*Lx", + fileidsize, fileidsize, + (unsigned long long)NFS_FILEID(dentry->d_inode)); + + /* Return delegation in anticipation of the rename */ + nfs_inode_return_delegation(dentry->d_inode); + + sdentry = NULL; + do { + char *suffix = silly + slen - countersize; + + dput(sdentry); + sillycounter++; + sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); + + dfprintk(VFS, "NFS: trying to rename %s to %s\n", + dentry->d_name.name, silly); + + sdentry = lookup_one_len(silly, dentry->d_parent, slen); + /* + * N.B. Better to return EBUSY here ... it could be + * dangerous to delete the file while it's in use. + */ + if (IS_ERR(sdentry)) + goto out; + } while (sdentry->d_inode != NULL); /* need negative lookup */ + + /* queue unlink first. Can't do this from rpc_release as it + * has to allocate memory + */ + error = nfs_async_unlink(dir, dentry); + if (error) + goto out_dput; + + /* run the rename task, undo unlink if it fails */ + task = nfs_async_rename(dir, dir, dentry, sdentry); + if (IS_ERR(task)) { + error = -EBUSY; + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* wait for the RPC task to complete, unless a SIGKILL intervenes */ + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); +out_dput: + dput(sdentry); +out: + return error; +} diff --git a/fs/nfs/write.c b/fs/nfs/write.c new file mode 100644 index 00000000..f2f80c00 --- /dev/null +++ b/fs/nfs/write.c @@ -0,0 +1,1732 @@ +/* + * linux/fs/nfs/write.c + * + * Write file data over NFS. 
+ * + * Copyright (C) 1996, 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "nfs4_fs.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +#define MIN_POOL_WRITE (32) +#define MIN_POOL_COMMIT (4) + +/* + * Local function declarations + */ +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, + struct inode *inode, int ioflags); +static void nfs_redirty_request(struct nfs_page *req); +static const struct rpc_call_ops nfs_write_partial_ops; +static const struct rpc_call_ops nfs_write_full_ops; +static const struct rpc_call_ops nfs_commit_ops; + +static struct kmem_cache *nfs_wdata_cachep; +static mempool_t *nfs_wdata_mempool; +static mempool_t *nfs_commit_mempool; + +struct nfs_write_data *nfs_commitdata_alloc(void) +{ + struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + } + return p; +} +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); + +void nfs_commit_free(struct nfs_write_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); +} +EXPORT_SYMBOL_GPL(nfs_commit_free); + +struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +{ + struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); + if (!p->pagevec) { + mempool_free(p, nfs_wdata_mempool); + p = NULL; + } + } + } + return p; +} + +void nfs_writedata_free(struct nfs_write_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_wdata_mempool); +} + +static void nfs_writedata_release(struct nfs_write_data *wdata) +{ + put_lseg(wdata->lseg); + put_nfs_open_context(wdata->args.context); + nfs_writedata_free(wdata); +} + +static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +{ + ctx->error = error; + smp_wmb(); + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); +} + +static struct nfs_page *nfs_page_find_request_locked(struct page *page) +{ + struct nfs_page *req = NULL; + + if (PagePrivate(page)) { + req = (struct nfs_page *)page_private(page); + if (req != NULL) + kref_get(&req->wb_kref); + } + return req; +} + +static struct nfs_page *nfs_page_find_request(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req = NULL; + + spin_lock(&inode->i_lock); + req = nfs_page_find_request_locked(page); + spin_unlock(&inode->i_lock); + return req; +} + +/* Adjust the file length if we're writing beyond the end */ +static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) +{ + struct inode *inode = page->mapping->host; + loff_t end, i_size; + pgoff_t end_index; + + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (i_size > 0 && page->index < end_index) + goto out; + end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); + if (i_size >= end) + goto out; + i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); +} + +/* A 
writeback failed: mark the page as bad, and invalidate the page cache */ +static void nfs_set_pageerror(struct page *page) +{ + SetPageError(page); + nfs_zap_mapping(page->mapping->host, page->mapping); +} + +/* We can set the PG_uptodate flag if we see that a write request + * covers the full page. + */ +static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +{ + if (PageUptodate(page)) + return; + if (base != 0) + return; + if (count != nfs_page_length(page)) + return; + SetPageUptodate(page); +} + +static int wb_priority(struct writeback_control *wbc) +{ + if (wbc->for_reclaim) + return FLUSH_HIGHPRI | FLUSH_STABLE; + if (wbc->for_kupdate || wbc->for_background) + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; +} + +/* + * NFS congestion control + */ + +int nfs_congestion_kb; + +#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) +#define NFS_CONGESTION_OFF_THRESH \ + (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) + +static int nfs_set_page_writeback(struct page *page) +{ + int ret = test_set_page_writeback(page); + + if (!ret) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); + + page_cache_get(page); + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } + } + return ret; +} + +static void nfs_end_page_writeback(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); + + end_page_writeback(page); + page_cache_release(page); + if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); +} + +static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int ret; + + spin_lock(&inode->i_lock); + for (;;) { + req = nfs_page_find_request_locked(page); + if (req == NULL) + break; + if (nfs_set_page_tag_locked(req)) + break; + /* Note: If we hold the page lock, as is the case in nfs_writepage, + * then the call to nfs_set_page_tag_locked() will always + * succeed provided that someone hasn't already marked the + * request as dirty (in which case we don't care). + */ + spin_unlock(&inode->i_lock); + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + if (ret != 0) + return ERR_PTR(ret); + spin_lock(&inode->i_lock); + } + spin_unlock(&inode->i_lock); + return req; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). 
+ */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page, bool nonblock) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_find_and_lock_request(page, nonblock); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = nfs_set_page_writeback(page); + BUG_ON(ret != 0); + BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + + if (!nfs_pageio_add_request(pgio, req)) { + nfs_redirty_request(req); + ret = pgio->pg_error; + } +out: + return ret; +} + +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +{ + struct inode *inode = page->mapping->host; + int ret; + + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); + + nfs_pageio_cond_complete(pgio, page->index); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; +} + +/* + * Write an mmapped page to the server. + */ +static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + struct nfs_pageio_descriptor pgio; + int err; + + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); + err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_complete(&pgio); + if (err < 0) + return err; + if (pgio.pg_error < 0) + return pgio.pg_error; + return 0; +} + +int nfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + ret = nfs_writepage_locked(page, wbc); + unlock_page(page); + return ret; +} + +static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +{ + int ret; + + ret = nfs_do_writepage(page, wbc, data); + unlock_page(page); + return ret; +} + +int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; + struct nfs_pageio_descriptor pgio; + int err; + + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); + nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + + if (err < 0) + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; + return 0; +out_err: + return err; +} + +/* + * Insert a write request into an inode + */ +static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int error; + + error = radix_tree_preload(GFP_NOFS); + if (error != 0) + goto out; + + /* Lock the request! 
*/ + nfs_lock_request_dontget(req); + + spin_lock(&inode->i_lock); + error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); + BUG_ON(error); + if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) + nfsi->change_attr++; + set_bit(PG_MAPPED, &req->wb_flags); + SetPagePrivate(req->wb_page); + set_page_private(req->wb_page, (unsigned long)req); + nfsi->npages++; + kref_get(&req->wb_kref); + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, + NFS_PAGE_TAG_LOCKED); + spin_unlock(&inode->i_lock); + radix_tree_preload_end(); +out: + return error; +} + +/* + * Remove a write request from an inode + */ +static void nfs_inode_remove_request(struct nfs_page *req) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + BUG_ON (!NFS_WBACK_BUSY(req)); + + spin_lock(&inode->i_lock); + set_page_private(req->wb_page, 0); + ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); + radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); + nfsi->npages--; + spin_unlock(&inode->i_lock); + nfs_release_request(req); +} + +static void +nfs_mark_request_dirty(struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +/* + * Add a request to the inode's commit list. + */ +static void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + set_bit(PG_CLEAN, &(req)->wb_flags); + radix_tree_tag_set(&nfsi->nfs_page_tree, + req->wb_index, + NFS_PAGE_TAG_COMMIT); + nfsi->ncommit++; + spin_unlock(&inode->i_lock); + pnfs_mark_request_commit(req, lseg); + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +} + +static int +nfs_clear_request_commit(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + return 1; + } + return 0; +} + +static inline +int nfs_write_need_commit(struct nfs_write_data *data) +{ + if (data->verf.committed == NFS_DATA_SYNC) + return data->lseg == NULL; + else + return data->verf.committed != NFS_FILE_SYNC; +} + +static inline +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) +{ + if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { + nfs_mark_request_commit(req, data->lseg); + return 1; + } + if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { + nfs_mark_request_dirty(req); + return 1; + } + return 0; +} +#else +static inline void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +nfs_clear_request_commit(struct nfs_page *req) +{ + return 0; +} + +static inline +int nfs_write_need_commit(struct nfs_write_data *data) +{ + return 0; +} + +static inline +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) +{ + return 0; +} +#endif + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int +nfs_need_commit(struct nfs_inode *nfsi) +{ + return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +} + +/* + * nfs_scan_commit - Scan an inode for commit requests + * @inode: NFS inode to scan + * @dst: destination list + * 
@idx_start: lower bound of page->index to scan. + * @npages: idx_start + npages sets the upper bound to scan. + * + * Moves requests from the inode's 'commit' request list. + * The requests are *not* checked to ensure that they form a contiguous set. + */ +static int +nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret; + + if (!nfs_need_commit(nfsi)) + return 0; + + spin_lock(&inode->i_lock); + ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + if (ret > 0) + nfsi->ncommit -= ret; + spin_unlock(&inode->i_lock); + + if (nfs_need_commit(NFS_I(inode))) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + + return ret; +} +#else +static inline int nfs_need_commit(struct nfs_inode *nfsi) +{ + return 0; +} + +static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +{ + return 0; +} +#endif + +/* + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. + * + * If the attempt fails, then the existing request is flushed out + * to disk. + */ +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) +{ + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; + + end = offset + bytes; + spin_lock(&inode->i_lock); + + for (;;) { + req = nfs_page_find_request_locked(page); + if (req == NULL) + goto out_unlock; + + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend + || end < req->wb_offset) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) + break; + + /* The request is locked, so wait and then retry */ + spin_unlock(&inode->i_lock); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); + } + + if (nfs_clear_request_commit(req) && + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { + NFS_I(inode)->ncommit--; + pnfs_clear_request_commit(req); + } + + /* Okay, the request matches. Update the region */ + if (offset < req->wb_offset) { + req->wb_offset = offset; + req->wb_pgbase = offset; + } + if (end > rqend) + req->wb_bytes = end - req->wb_offset; + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + spin_unlock(&inode->i_lock); + return req; +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} + +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. 
+ */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, inode, page, offset, bytes); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); + if (error != 0) { + nfs_release_request(req); + req = ERR_PTR(error); + } +out: + return req; +} + +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; +} + +int nfs_flush_incompatible(struct file *file, struct page *page) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; + int do_flush, status; + /* + * Look for a request corresponding to this page. If there + * is one, and it belongs to another file, we flush it out + * before we try to copy anything into the page. Do this + * due to the lack of an ACCESS-type call in NFSv2. + * Also do the same if we find a request from an existing + * dropped page. + */ + do { + req = nfs_page_find_request(page); + if (req == NULL) + return 0; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || + req->wb_lock_context->pid != current->tgid; + nfs_release_request(req); + if (!do_flush) + return 0; + status = nfs_wb_page(page->mapping->host, page); + } while (status == 0); + return status; +} + +/* + * If the page cache is marked as unsafe or invalid, then we can't rely on + * the PageUptodate() flag. In this case, we will need to turn off + * write optimisations that depend on the page contents being correct. + */ +static int nfs_write_pageuptodate(struct page *page, struct inode *inode) +{ + return PageUptodate(page) && + !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); +} + +/* + * Update and possibly write a cached page of an NFS file. + * + * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ +int nfs_updatepage(struct file *file, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; + int status = 0; + + nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + + dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, count, + (long long)(page_offset(page) + offset)); + + /* If we're not using byte range locks, and we know the page + * is up to date, it may be more efficient to extend the write + * to cover the entire page in order to avoid fragmentation + * inefficiencies. 
+ */ + if (nfs_write_pageuptodate(page, inode) && + inode->i_flock == NULL && + !(file->f_flags & O_DSYNC)) { + count = max(count + offset, nfs_page_length(page)); + offset = 0; + } + + status = nfs_writepage_setup(ctx, page, offset, count); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); + + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + status, (long long)i_size_read(inode)); + return status; +} + +static void nfs_writepage_release(struct nfs_page *req, + struct nfs_write_data *data) +{ + struct page *page = req->wb_page; + + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) + nfs_inode_remove_request(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +} + +static int flush_task_priority(int how) +{ + switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { + case FLUSH_HIGHPRI: + return RPC_PRIORITY_HIGH; + case FLUSH_LOWPRI: + return RPC_PRIORITY_LOW; + } + return RPC_PRIORITY_NORMAL; +} + +int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) +{ + struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + int ret = 0; + + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_write); + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. 
*/ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + data->args.stable = NFS_DATA_SYNC; + if (!nfs_need_commit(NFS_I(inode))) + data->args.stable = NFS_FILE_SYNC; + } + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + if (data->lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return 0; + + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); +} + +/* If a nfs_flush_* function fails, it should remove reqs from @head and + * call this on each, which will prepare them to be retried on next + * writeback using standard nfs. + */ +static void nfs_redirty_request(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +} + +/* + * Generate multiple small requests to write out a single + * contiguous dirty area on one page. + */ +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); + struct page *page = req->wb_page; + struct nfs_write_data *data; + size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; + unsigned int offset; + int requests = 0; + int ret = 0; + struct pnfs_layout_segment *lseg; + LIST_HEAD(list); + + nfs_list_remove_request(req); + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + desc->pg_count > wsize)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + + nbytes = desc->pg_count; + do { + size_t len = min(nbytes, wsize); + + data = nfs_writedata_alloc(1); + if (!data) + goto out_bad; + list_add(&data->pages, &list); + requests++; + nbytes -= len; + } while (nbytes != 0); + atomic_set(&req->wb_complete, requests); + + BUG_ON(desc->pg_lseg); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_RW, GFP_NOFS); + ClearPageError(page); + offset = 0; + nbytes = desc->pg_count; + do { + int ret2; + + data = list_entry(list.next, struct nfs_write_data, pages); + list_del_init(&data->pages); + + data->pagevec[0] = page; + + if (nbytes < wsize) + wsize = nbytes; + ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, + wsize, offset, lseg, desc->pg_ioflags); + if (ret == 0) + ret = ret2; + offset += wsize; + nbytes -= wsize; + } while (nbytes != 0); + + put_lseg(lseg); + desc->pg_lseg = NULL; + return ret; + +out_bad: + while (!list_empty(&list)) { + data = list_entry(list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_writedata_free(data); + } + nfs_redirty_request(req); + return -ENOMEM; +} + +/* + * Create an RPC task for the given write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. 
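+ *
+ * Unlike nfs_flush_multi(), which splits a single page into several
+ * wsize-sized WRITE requests, this path gathers every request on
+ * desc->pg_list into one RPC covering desc->pg_count bytes.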
+ */ +static int nfs_flush_one(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_write_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + int ret; + + data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } + ret = -ENOMEM; + goto out; + } + pages = data->pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + ClearPageError(req->wb_page); + *pages++ = req->wb_page; + } + req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_RW, GFP_NOFS); + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); +out: + put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + desc->pg_lseg = NULL; + return ret; +} + +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags) +{ + size_t wsize = NFS_SERVER(inode)->wsize; + + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else + nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +} + +/* + * Handle a write reply that flushed part of a page. + */ +static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + dprintk("NFS: %5u write(%s/%lld %d@%lld)", + task->tk_pid, + data->req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long) + NFS_FILEID(data->req->wb_context->path.dentry->d_inode), + data->req->wb_bytes, (long long)req_offset(data->req)); + + nfs_writeback_done(task, data); +} + +static void nfs_writeback_release_partial(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) { + nfs_set_pageerror(page); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); + goto out; + } + + if (nfs_write_need_commit(data)) { + struct inode *inode = page->mapping->host; + + spin_lock(&inode->i_lock); + if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { + /* Do nothing we need to resend the writes */ + } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { + memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); + dprintk(" defer commit\n"); + } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { + set_bit(PG_NEED_RESCHED, &req->wb_flags); + clear_bit(PG_NEED_COMMIT, &req->wb_flags); + dprintk(" server reboot detected\n"); + } + spin_unlock(&inode->i_lock); + } else + dprintk(" OK\n"); + +out: + if (atomic_dec_and_test(&req->wb_complete)) + nfs_writepage_release(req, data); + nfs_writedata_release(calldata); +} + +#if defined(CONFIG_NFS_V4_1) +void nfs_write_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + 
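+ /* The sequence slot is set up (or not needed); let the task proceed. */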
rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_write_partial_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_writeback_done_partial, + .rpc_release = nfs_writeback_release_partial, +}; + +/* + * Handle a write reply that flushes a whole page. + * + * FIXME: There is an inherent race with invalidate_inode_pages and + * writebacks since the page->count is kept > 1 for as long + * as the page has a write request pending. + */ +static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_writeback_done(task, data); +} + +static void nfs_writeback_release_full(void *calldata) +{ + struct nfs_write_data *data = calldata; + int status = data->task.tk_status; + + /* Update attributes as result of writeback. */ + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + struct page *page = req->wb_page; + + nfs_list_remove_request(req); + + dprintk("NFS: %5u write (%s/%lld %d@%lld)", + data->task.tk_pid, + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + + if (status < 0) { + nfs_set_pageerror(page); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); + goto remove_request; + } + + if (nfs_write_need_commit(data)) { + memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); + nfs_mark_request_commit(req, data->lseg); + dprintk(" marked for commit\n"); + goto next; + } + dprintk(" OK\n"); +remove_request: + nfs_inode_remove_request(req); + next: + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); + } + nfs_writedata_release(calldata); +} + +static const struct rpc_call_ops nfs_write_full_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_writeback_done_full, + .rpc_release = nfs_writeback_release_full, +}; + + +/* + * This function is called when the WRITE call is complete. + */ +void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +{ + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); + int status; + + dprintk("NFS: %5u nfs_writeback_done (status %d)\n", + task->tk_pid, task->tk_status); + + /* + * ->write_done will attempt to use post-op attributes to detect + * conflicting writes by other clients. A strict interpretation + * of close-to-open would allow us to continue caching even if + * another writer had changed the file, but some applications + * depend on tighter cache coherency when writing. + */ + status = NFS_PROTO(data->inode)->write_done(task, data); + if (status != 0) + return; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. 
+ */ + static unsigned long complain; + + /* Note this will print the MDS for a DS write */ + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + server->nfs_client->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } + } +#endif + /* Is this a short write? */ + if (task->tk_status >= 0 && resp->count < argp->count) { + static unsigned long complain; + + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ + if (resp->count != 0) { + /* Was this an NFSv2 write or an NFSv3 stable write? */ + if (resp->verf->committed != NFS_UNSTABLE) { + /* Resend from where the server left off */ + data->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + } else { + /* Resend as a stable write in order to avoid + * headaches in the case of a server crash. + */ + argp->stable = NFS_FILE_SYNC; + } + nfs_restart_rpc(task, server->nfs_client); + return; + } + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote zero bytes, expected %u.\n", + argp->count); + complain = jiffies + 300 * HZ; + } + /* Can't do anything about it except throw an error. */ + task->tk_status = -EIO; + } + return; +} + + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + int ret; + + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; +} + +void nfs_commit_clear_lock(struct nfs_inode *nfsi) +{ + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); +} +EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); + +void nfs_commitdata_release(void *data) +{ + struct nfs_write_data *wdata = data; + + put_lseg(wdata->lseg); + put_nfs_open_context(wdata->args.context); + nfs_commit_free(wdata); +} +EXPORT_SYMBOL_GPL(nfs_commitdata_release); + +int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) +{ + struct rpc_task *task; + int priority = flush_task_priority(how); + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + /* Set up the initial task struct. */ + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->path.dentry->d_inode; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. 
*/ + + list_splice_init(head, &data->pages); + + data->inode = inode; + data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ + data->mds_ops = &nfs_commit_ops; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ + data->args.offset = 0; + data->args.count = 0; + data->args.context = get_nfs_open_context(first->wb_context); + data->res.count = 0; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); + +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *req; + + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req, lseg); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + nfs_clear_page_tag_locked(req); + } +} +EXPORT_SYMBOL_GPL(nfs_retry_commit); + +/* + * Commit dirty pages + */ +static int +nfs_commit_list(struct inode *inode, struct list_head *head, int how) +{ + struct nfs_write_data *data; + + data = nfs_commitdata_alloc(); + + if (!data) + goto out_bad; + + /* Set up the argument struct */ + nfs_init_commit(data, head, NULL); + return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); + out_bad: + nfs_retry_commit(head, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + +/* + * COMMIT call returned + */ +static void nfs_commit_done(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + dprintk("NFS: %5u nfs_commit_done (status %d)\n", + task->tk_pid, task->tk_status); + + /* Call the NFS version-specific code */ + NFS_PROTO(data->inode)->commit_done(task, data); +} + +void nfs_commit_release_pages(struct nfs_write_data *data) +{ + struct nfs_page *req; + int status = data->task.tk_status; + + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + nfs_clear_request_commit(req); + + dprintk("NFS: commit (%s/%lld %d@%lld)", + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); + goto next; + } + + /* Okay, COMMIT succeeded, apparently. Check the verifier + * returned by the server against all stored verfs. */ + if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { + /* We have a match */ + nfs_inode_remove_request(req); + dprintk(" OK\n"); + goto next; + } + /* We have a mismatch. 
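+		 * The verifier returned by the COMMIT differs from the one saved
+		 * when the page was written, so the server has probably rebooted
+		 * and the unstable data may have been lost.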
Write the page again */ + dprintk(" mismatch\n"); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +} +EXPORT_SYMBOL_GPL(nfs_commit_release_pages); + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_commit_release_pages(data); + nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commitdata_release(calldata); +} + +static const struct rpc_call_ops nfs_commit_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_commit_done, + .rpc_release = nfs_commit_release, +}; + +int nfs_commit_inode(struct inode *inode, int how) +{ + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res; + + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) + goto out_mark_dirty; + res = nfs_scan_commit(inode, &head, 0, 0); + if (res) { + int error; + + error = pnfs_commit_list(inode, &head, how); + if (error == PNFS_NOT_ATTEMPTED) + error = nfs_commit_list(inode, &head, how); + if (error < 0) + return error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_bit(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; + } else + nfs_commit_clear_lock(NFS_I(inode)); + return res; + /* Note: If we exit without ensuring that the commit is complete, + * we must mark the inode as dirty. Otherwise, future calls to + * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure + * that the data is on the disk. + */ +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return res; +} + +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int flags = FLUSH_SYNC; + int ret = 0; + + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + + if (wbc->sync_mode == WB_SYNC_NONE) { + /* Don't commit yet if this is a non-blocking flush and there + * are a lot of outstanding writes for this mapping. + */ + if (nfsi->ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; + + /* don't wait for the COMMIT response */ + flags = 0; + } + + ret = nfs_commit_inode(inode, flags); + if (ret >= 0) { + if (wbc->sync_mode == WB_SYNC_NONE) { + if (ret < wbc->nr_to_write) + wbc->nr_to_write -= ret; + else + wbc->nr_to_write = 0; + } + return 0; + } +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; +} +#else +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + return 0; +} +#endif + +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + ret = nfs_commit_unstable_pages(inode, wbc); + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status; + bool sync = true; + + if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || + wbc->for_background) + sync = false; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; +} + +/* + * flush the inode to disk. 
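+ *
+ * Write back every dirty page of the inode and wait for the writes (and
+ * any required commits) to complete: WB_SYNC_ALL over the whole range.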
+ */ +int nfs_wb_all(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return sync_inode(inode, &wbc); +} + +int nfs_wb_page_cancel(struct inode *inode, struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + BUG_ON(!PageLocked(page)); + for (;;) { + wait_on_page_writeback(page); + req = nfs_page_find_request(page); + if (req == NULL) + break; + if (nfs_lock_request_dontget(req)) { + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_request(req); + break; + } + ret = nfs_wait_on_request(req); + nfs_release_request(req); + if (ret < 0) + break; + } + return ret; +} + +/* + * Write back all requests on one page - we do this before reading it. + */ +int nfs_wb_page(struct inode *inode, struct page *page) +{ + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + int ret; + + for (;;) { + wait_on_page_writeback(page); + if (clear_page_dirty_for_io(page)) { + ret = nfs_writepage_locked(page, &wbc); + if (ret < 0) + goto out_error; + continue; + } + if (!PagePrivate(page)) + break; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out_error; + } + return 0; +out_error: + return ret; +} + +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page) +{ + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; + + nfs_fscache_release_page(page, GFP_KERNEL); + + return migrate_page(mapping, newpage, page); +} +#endif + +int __init nfs_init_writepagecache(void) +{ + nfs_wdata_cachep = kmem_cache_create("nfs_write_data", + sizeof(struct nfs_write_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_wdata_cachep == NULL) + return -ENOMEM; + + nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, + nfs_wdata_cachep); + if (nfs_wdata_mempool == NULL) + return -ENOMEM; + + nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, + nfs_wdata_cachep); + if (nfs_commit_mempool == NULL) + return -ENOMEM; + + /* + * NFS congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (nfs_congestion_kb > 256*1024) + nfs_congestion_kb = 256*1024; + + return 0; +} + +void nfs_destroy_writepagecache(void) +{ + mempool_destroy(nfs_commit_mempool); + mempool_destroy(nfs_wdata_mempool); + kmem_cache_destroy(nfs_wdata_cachep); +} + diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile new file mode 100644 index 00000000..f689ed82 --- /dev/null +++ b/fs/nfs_common/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for Linux filesystem routines that are shared by client and server. 
+# + +obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o + +nfs_acl-objs := nfsacl.o diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c new file mode 100644 index 00000000..6940439b --- /dev/null +++ b/fs/nfs_common/nfsacl.c @@ -0,0 +1,286 @@ +/* + * fs/nfs_common/nfsacl.c + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +/* + * The Solaris nfsacl protocol represents some ACLs slightly differently + * than POSIX 1003.1e draft 17 does (and we do): + * + * - Minimal ACLs always have an ACL_MASK entry, so they have + * four instead of three entries. + * - The ACL_MASK entry in such minimal ACLs always has the same + * permissions as the ACL_GROUP_OBJ entry. (In extended ACLs + * the ACL_MASK and ACL_GROUP_OBJ entries may differ.) + * - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ + * entries contain the identifiers of the owner and owning group. + * (In POSIX ACLs we always set them to ACL_UNDEFINED_ID). + * - ACL entries in the kernel are kept sorted in ascending order + * of (e_tag, e_id). Solaris ACLs are unsorted. + */ + +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL_GPL(nfsacl_encode); +EXPORT_SYMBOL_GPL(nfsacl_decode); + +struct nfsacl_encode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; + int typeflag; + uid_t uid; + gid_t gid; +}; + +struct nfsacl_simple_acl { + struct posix_acl acl; + struct posix_acl_entry ace[4]; +}; + +static int +xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_encode_desc *nfsacl_desc = + (struct nfsacl_encode_desc *) desc; + __be32 *p = elem; + + struct posix_acl_entry *entry = + &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + + *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); + switch(entry->e_tag) { + case ACL_USER_OBJ: + *p++ = htonl(nfsacl_desc->uid); + break; + case ACL_GROUP_OBJ: + *p++ = htonl(nfsacl_desc->gid); + break; + case ACL_USER: + case ACL_GROUP: + *p++ = htonl(entry->e_id); + break; + default: /* Solaris depends on that! */ + *p++ = 0; + break; + } + *p++ = htonl(entry->e_perm & S_IRWXO); + return 0; +} + +/** + * nfsacl_encode - Encode an NFSv3 ACL + * + * @buf: destination xdr_buf to contain XDR encoded ACL + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @inode: inode of file whose ACL this is + * @acl: posix_acl to encode + * @encode_entries: whether to encode ACEs as well + * @typeflag: ACL type: NFS_ACL_DEFAULT or zero + * + * Returns size of encoded ACL in bytes or a negative errno value. + */ +int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, + struct posix_acl *acl, int encode_entries, int typeflag) +{ + int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; + struct nfsacl_encode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .array_len = encode_entries ? entries : 0, + .xcode = xdr_nfsace_encode, + }, + .acl = acl, + .typeflag = typeflag, + .uid = inode->i_uid, + .gid = inode->i_gid, + }; + struct nfsacl_simple_acl aclbuf; + int err; + + if (entries > NFS_ACL_MAX_ENTRIES || + xdr_encode_word(buf, base, entries)) + return -EINVAL; + if (encode_entries && acl && acl->a_count == 3) { + struct posix_acl *acl2 = &aclbuf.acl; + + /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is + * invoked in contexts where a memory allocation failure is + * fatal. Fortunately this fake ACL is small enough to + * construct on the stack. 
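+	 *
+	 * The three-entry POSIX ACL (USER_OBJ, GROUP_OBJ, OTHER) becomes
+	 * the four-entry Solaris minimal form by inserting an ACL_MASK
+	 * entry that duplicates ACL_GROUP_OBJ, as described at the top of
+	 * this file.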
*/ + posix_acl_init(acl2, 4); + + /* Insert entries in canonical order: other orders seem + to confuse Solaris VxFS. */ + acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ + acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */ + acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */ + acl2->a_entries[2].e_tag = ACL_MASK; + acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */ + nfsacl_desc.acl = acl2; + } + err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); + if (!err) + err = 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; + return err; +} + +struct nfsacl_decode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; +}; + +static int +xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_decode_desc *nfsacl_desc = + (struct nfsacl_decode_desc *) desc; + __be32 *p = elem; + struct posix_acl_entry *entry; + + if (!nfsacl_desc->acl) { + if (desc->array_len > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL); + if (!nfsacl_desc->acl) + return -ENOMEM; + nfsacl_desc->count = 0; + } + + entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; + entry->e_id = ntohl(*p++); + entry->e_perm = ntohl(*p++); + + switch(entry->e_tag) { + case ACL_USER_OBJ: + case ACL_USER: + case ACL_GROUP_OBJ: + case ACL_GROUP: + case ACL_OTHER: + if (entry->e_perm & ~S_IRWXO) + return -EINVAL; + break; + case ACL_MASK: + /* Solaris sometimes sets additional bits in the mask */ + entry->e_perm &= S_IRWXO; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +cmp_acl_entry(const void *x, const void *y) +{ + const struct posix_acl_entry *a = x, *b = y; + + if (a->e_tag != b->e_tag) + return a->e_tag - b->e_tag; + else if (a->e_id > b->e_id) + return 1; + else if (a->e_id < b->e_id) + return -1; + else + return 0; +} + +/* + * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL. + */ +static int +posix_acl_from_nfsacl(struct posix_acl *acl) +{ + struct posix_acl_entry *pa, *pe, + *group_obj = NULL, *mask = NULL; + + if (!acl) + return 0; + + sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry), + cmp_acl_entry, NULL); + + /* Clear undefined identifier fields and find the ACL_GROUP_OBJ + and ACL_MASK entries. */ + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_id = ACL_UNDEFINED_ID; + break; + case ACL_GROUP_OBJ: + pa->e_id = ACL_UNDEFINED_ID; + group_obj = pa; + break; + case ACL_MASK: + mask = pa; + /* fall through */ + case ACL_OTHER: + pa->e_id = ACL_UNDEFINED_ID; + break; + } + } + if (acl->a_count == 4 && group_obj && mask && + mask->e_perm == group_obj->e_perm) { + /* remove bogus ACL_MASK entry */ + memmove(mask, mask+1, (3 - (mask - acl->a_entries)) * + sizeof(struct posix_acl_entry)); + acl->a_count = 3; + } + return 0; +} + +/** + * nfsacl_decode - Decode an NFSv3 ACL + * + * @buf: xdr_buf containing XDR'd ACL data to decode + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @aclcnt: count of ACEs in decoded posix_acl + * @pacl: buffer in which to place decoded posix_acl + * + * Returns the length of the decoded ACL in bytes, or a negative errno value. + */ +int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .xcode = pacl ? 
xdr_nfsace_decode : NULL, + }, + }; + u32 entries; + int err; + + if (xdr_decode_word(buf, base, &entries) || + entries > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc.desc.array_maxlen = entries; + err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc); + if (err) + return err; + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return -EINVAL; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; +} diff --git a/fs/nfsctl.c b/fs/nfsctl.c new file mode 100644 index 00000000..124e8fcb --- /dev/null +++ b/fs/nfsctl.c @@ -0,0 +1,100 @@ +/* + * fs/nfsctl.c + * + * This should eventually move to userland. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * open a file on nfsd fs + */ + +static struct file *do_open(char *name, int flags) +{ + struct vfsmount *mnt; + struct file *file; + + mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); + if (IS_ERR(mnt)) + return (struct file *)mnt; + + file = file_open_root(mnt->mnt_root, mnt, name, flags); + + mntput(mnt); /* drop do_kern_mount reference */ + return file; +} + +static struct { + char *name; int wsize; int rsize; +} map[] = { + [NFSCTL_SVC] = { + .name = ".svc", + .wsize = sizeof(struct nfsctl_svc) + }, + [NFSCTL_ADDCLIENT] = { + .name = ".add", + .wsize = sizeof(struct nfsctl_client) + }, + [NFSCTL_DELCLIENT] = { + .name = ".del", + .wsize = sizeof(struct nfsctl_client) + }, + [NFSCTL_EXPORT] = { + .name = ".export", + .wsize = sizeof(struct nfsctl_export) + }, + [NFSCTL_UNEXPORT] = { + .name = ".unexport", + .wsize = sizeof(struct nfsctl_export) + }, + [NFSCTL_GETFD] = { + .name = ".getfd", + .wsize = sizeof(struct nfsctl_fdparm), + .rsize = NFS_FHSIZE + }, + [NFSCTL_GETFS] = { + .name = ".getfs", + .wsize = sizeof(struct nfsctl_fsparm), + .rsize = sizeof(struct knfsd_fh) + }, +}; + +SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg, + void __user *, res) +{ + struct file *file; + void __user *p = &arg->u; + int version; + int err; + + if (copy_from_user(&version, &arg->ca_version, sizeof(int))) + return -EFAULT; + + if (version != NFSCTL_VERSION) + return -EINVAL; + + if (cmd < 0 || cmd >= ARRAY_SIZE(map) || !map[cmd].name) + return -EINVAL; + + file = do_open(map[cmd].name, map[cmd].rsize ? O_RDWR : O_WRONLY); + if (IS_ERR(file)) + return PTR_ERR(file); + err = file->f_op->write(file, p, map[cmd].wsize, &file->f_pos); + if (err >= 0 && map[cmd].rsize) + err = file->f_op->read(file, res, map[cmd].rsize, &file->f_pos); + if (err >= 0) + err = 0; + fput(file); + return err; +} diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig new file mode 100644 index 00000000..fbb2a5ef --- /dev/null +++ b/fs/nfsd/Kconfig @@ -0,0 +1,94 @@ +config NFSD + tristate "NFS server support" + depends on INET + depends on FILE_LOCKING + select LOCKD + select SUNRPC + select EXPORTFS + select NFS_ACL_SUPPORT if NFSD_V2_ACL + help + Choose Y here if you want to allow other computers to access + files residing on this system using Sun's Network File System + protocol. To compile the NFS server support as a module, + choose M here: the module will be called nfsd. + + You may choose to use a user-space NFS server instead, in which + case you can choose N here. 
+ + To export local file systems using NFS, you also need to install + user space programs which can be found in the Linux nfs-utils + package, available from http://linux-nfs.org/. More detail about + the Linux NFS server implementation is available via the + exports(5) man page. + + Below you can choose which versions of the NFS protocol are + available to clients mounting the NFS server on this system. + Support for NFS version 2 (RFC 1094) is always available when + CONFIG_NFSD is selected. + + If unsure, say N. + +config NFSD_DEPRECATED + bool "Include support for deprecated syscall interface to NFSD" + depends on NFSD + default y + help + The syscall interface to nfsd was obsoleted in 2.6.0 by a new + filesystem-based interface. The old interface is due for removal + in 2.6.40. If you wish to remove the interface before then, + say N. + + If unsure, say Y. + +config NFSD_V2_ACL + bool + depends on NFSD + +config NFSD_V3 + bool "NFS server support for NFS version 3" + depends on NFSD + help + This option enables support in your system's NFS server for + version 3 of the NFS protocol (RFC 1813). + + If unsure, say Y. + +config NFSD_V3_ACL + bool "NFS server support for the NFSv3 ACL protocol extension" + depends on NFSD_V3 + select NFSD_V2_ACL + help + Solaris NFS servers support an auxiliary NFSv3 ACL protocol that + never became an official part of the NFS version 3 protocol. + This protocol extension allows applications on NFS clients to + manipulate POSIX Access Control Lists on files residing on NFS + servers. NFS servers enforce POSIX ACLs on local files whether + this protocol is available or not. + + This option enables support in your system's NFS server for the + NFSv3 ACL protocol extension allowing NFS clients to manipulate + POSIX ACLs on files exported by your system's NFS server. NFS + clients which support the Solaris NFSv3 ACL protocol can then + access and modify ACLs on your NFS server. + + To store ACLs on your NFS server, you also need to enable ACL- + related CONFIG options for your local file systems of choice. + + If unsure, say N. + +config NFSD_V4 + bool "NFS server support for NFS version 4 (EXPERIMENTAL)" + depends on NFSD && PROC_FS && EXPERIMENTAL + select NFSD_V3 + select FS_POSIX_ACL + select SUNRPC_GSS + select CRYPTO + help + This option enables support in your system's NFS server for + version 4 of the NFS protocol (RFC 3530). + + To export files using NFSv4, you need to install additional user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile new file mode 100644 index 00000000..9b118ee2 --- /dev/null +++ b/fs/nfsd/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the Linux nfs server +# + +obj-$(CONFIG_NFSD) += nfsd.o + +nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o +nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o +nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 00000000..34e5c40a --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,59 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved.
+ * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +#include + +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_get_whotype(char *, u32); +int nfs4_acl_write_who(int who, char *p); +int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, + uid_t who, u32 mask); + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, + struct posix_acl *, unsigned int flags); +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, + struct posix_acl **, unsigned int flags); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c new file mode 100644 index 00000000..79717a40 --- /dev/null +++ b/fs/nfsd/auth.c @@ -0,0 +1,95 @@ +/* Copyright (C) 1995, 1996 Olaf Kirch */ + +#include +#include "nfsd.h" +#include "auth.h" + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_flavor) + return f->flags; + } + return exp->ex_flags; + +} + +int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct group_info *rqgi; + struct group_info *gi; + struct cred *new; + int i; + int flags = nfsexp_flags(rqstp, exp); + int ret; + + validate_process_creds(); + + /* discard any old override before preparing the new set */ + revert_creds(get_cred(current->real_cred)); + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; + + rqgi = rqstp->rq_cred.cr_group_info; + + if (flags & NFSEXP_ALLSQUASH) { + new->fsuid = exp->ex_anon_uid; + new->fsgid = exp->ex_anon_gid; + gi = groups_alloc(0); + if (!gi) + goto oom; + } else if (flags & NFSEXP_ROOTSQUASH) { + if (!new->fsuid) + new->fsuid = 
exp->ex_anon_uid; + if (!new->fsgid) + new->fsgid = exp->ex_anon_gid; + + gi = groups_alloc(rqgi->ngroups); + if (!gi) + goto oom; + + for (i = 0; i < rqgi->ngroups; i++) { + if (!GROUP_AT(rqgi, i)) + GROUP_AT(gi, i) = exp->ex_anon_gid; + else + GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + } + } else { + gi = get_group_info(rqgi); + } + + if (new->fsuid == (uid_t) -1) + new->fsuid = exp->ex_anon_uid; + if (new->fsgid == (gid_t) -1) + new->fsgid = exp->ex_anon_gid; + + ret = set_groups(new, gi); + put_group_info(gi); + if (ret < 0) + goto error; + + if (new->fsuid) + new->cap_effective = cap_drop_nfsd_set(new->cap_effective); + else + new->cap_effective = cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + validate_process_creds(); + put_cred(override_creds(new)); + put_cred(new); + validate_process_creds(); + return 0; + +oom: + ret = -ENOMEM; +error: + abort_creds(new); + return ret; +} + diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h new file mode 100644 index 00000000..78b3c0e9 --- /dev/null +++ b/fs/nfsd/auth.h @@ -0,0 +1,22 @@ +/* + * nfsd-specific authentication stuff. + * uid/gid mapping not yet implemented. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef LINUX_NFSD_AUTH_H +#define LINUX_NFSD_AUTH_H + +#define nfsd_luid(rq, uid) ((u32)(uid)) +#define nfsd_lgid(rq, gid) ((u32)(gid)) +#define nfsd_ruid(rq, uid) ((u32)(uid)) +#define nfsd_rgid(rq, gid) ((u32)(gid)) + +/* + * Set the current process's fsuid/fsgid etc to those of the NFS + * client user + */ +int nfsd_setuser(struct svc_rqst *, struct svc_export *); + +#endif /* LINUX_NFSD_AUTH_H */ diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h new file mode 100644 index 00000000..d892be61 --- /dev/null +++ b/fs/nfsd/cache.h @@ -0,0 +1,83 @@ +/* + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include + +/* + * Representation of a reply cache entry. + */ +struct svc_cacherep { + struct hlist_node c_hash; + struct list_head c_lru; + + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + struct sockaddr_in c_addr; + __be32 c_xid; + u32 c_prot; + u32 c_proc; + u32 c_vers; + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT, + RC_INTR +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* + * If requests are retransmitted within this interval, they're dropped. 
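+ * RC_DELAY below is expressed in jiffies; HZ/5 corresponds to 200ms
+ * regardless of the kernel's HZ setting.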
+ */ +#define RC_DELAY (HZ/5) + +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); +int nfsd_cache_lookup(struct svc_rqst *, int); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); + +#ifdef CONFIG_NFSD_V4 +void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); +#else /* CONFIG_NFSD_V4 */ +static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ +} +#endif /* CONFIG_NFSD_V4 */ + +#endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c new file mode 100644 index 00000000..4b470f60 --- /dev/null +++ b/fs/nfsd/export.c @@ -0,0 +1,1693 @@ +/* + * NFS exporting and validation. + * + * We maintain a list of clients, each of which has a list of + * exports. To export an fs to a given client, you first have + * to create the client entry with NFSCTL_ADDCLIENT, which + * creates a client control block and adds it to the hash + * table. Then, you call NFSCTL_EXPORT for each fs. + * + * + * Copyright (C) 1995, 1996 Olaf Kirch, + */ + +#include +#include +#include +#include + +#include +#include + +#include "nfsd.h" +#include "nfsfh.h" + +#define NFSDDBG_FACILITY NFSDDBG_EXPORT + +typedef struct auth_domain svc_client; +typedef struct svc_export svc_export; + +/* + * We have two caches. + * One maps client+vfsmnt+dentry to export options - the export map + * The other maps client+filehandle-fragment to export options. - the expkey map + * + * The export options are actually stored in the first map, and the + * second map contains a reference to the entry in the first map. + */ + +#define EXPKEY_HASHBITS 8 +#define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) +#define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) +static struct cache_head *expkey_table[EXPKEY_HASHMAX]; + +static void expkey_put(struct kref *ref) +{ + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); + + if (test_bit(CACHE_VALID, &key->h.flags) && + !test_bit(CACHE_NEGATIVE, &key->h.flags)) + path_put(&key->ek_path); + auth_domain_put(key->ek_client); + kfree(key); +} + +static void expkey_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client fsidtype \xfsid */ + struct svc_expkey *ek = container_of(h, struct svc_expkey, h); + char type[5]; + + qword_add(bpp, blen, ek->ek_client->name); + snprintf(type, 5, "%d", ek->ek_fsidtype); + qword_add(bpp, blen, type); + qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); + (*bpp)[-1] = '\n'; +} + +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, expkey_request); +} + +static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); +static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); +static struct cache_detail svc_expkey_cache; + +static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client fsidtype fsid [path] */ + char *buf; + int len; + struct auth_domain *dom = NULL; + int err; + int fsidtype; + char *ep; + struct svc_expkey key; + struct svc_expkey *ek = NULL; + + if (mlen < 1 || mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; + if (!buf) + goto out; + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + dprintk("found domain %s\n", buf); + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + fsidtype = simple_strtoul(buf, 
&ep, 10); + if (*ep) + goto out; + dprintk("found fsidtype %d\n", fsidtype); + if (key_len(fsidtype)==0) /* invalid type */ + goto out; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + dprintk("found fsid length %d\n", len); + if (len != key_len(fsidtype)) + goto out; + + /* OK, we seem to have a valid key */ + key.h.flags = 0; + key.h.expiry_time = get_expiry(&mesg); + if (key.h.expiry_time == 0) + goto out; + + key.ek_client = dom; + key.ek_fsidtype = fsidtype; + memcpy(key.ek_fsid, buf, len); + + ek = svc_expkey_lookup(&key); + err = -ENOMEM; + if (!ek) + goto out; + + /* now we want a pathname, or empty meaning NEGATIVE */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len < 0) + goto out; + dprintk("Path seems to be <%s>\n", buf); + err = 0; + if (len == 0) { + set_bit(CACHE_NEGATIVE, &key.h.flags); + ek = svc_expkey_update(&key, ek); + if (!ek) + err = -ENOMEM; + } else { + err = kern_path(buf, 0, &key.ek_path); + if (err) + goto out; + + dprintk("Found the path %s\n", buf); + + ek = svc_expkey_update(&key, ek); + if (!ek) + err = -ENOMEM; + path_put(&key.ek_path); + } + cache_flush(); + out: + if (ek) + cache_put(&ek->h, &svc_expkey_cache); + if (dom) + auth_domain_put(dom); + kfree(buf); + return err; +} + +static int expkey_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_expkey *ek ; + int i; + + if (h ==NULL) { + seq_puts(m, "#domain fsidtype fsid [path]\n"); + return 0; + } + ek = container_of(h, struct svc_expkey, h); + seq_printf(m, "%s %d 0x", ek->ek_client->name, + ek->ek_fsidtype); + for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) + seq_printf(m, "%08x", ek->ek_fsid[i]); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + seq_printf(m, " "); + seq_path(m, &ek->ek_path, "\\ \t\n"); + } + seq_printf(m, "\n"); + return 0; +} + +static inline int expkey_match (struct cache_head *a, struct cache_head *b) +{ + struct svc_expkey *orig = container_of(a, struct svc_expkey, h); + struct svc_expkey *new = container_of(b, struct svc_expkey, h); + + if (orig->ek_fsidtype != new->ek_fsidtype || + orig->ek_client != new->ek_client || + memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) + return 0; + return 1; +} + +static inline void expkey_init(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + kref_get(&item->ek_client->ref); + new->ek_client = item->ek_client; + new->ek_fsidtype = item->ek_fsidtype; + + memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); +} + +static inline void expkey_update(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + new->ek_path = item->ek_path; + path_get(&item->ek_path); +} + +static struct cache_head *expkey_alloc(void) +{ + struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static struct cache_detail svc_expkey_cache = { + .owner = THIS_MODULE, + .hash_size = EXPKEY_HASHMAX, + .hash_table = expkey_table, + .name = "nfsd.fh", + .cache_put = expkey_put, + .cache_upcall = expkey_upcall, + .cache_parse = expkey_parse, + .cache_show = expkey_show, + .match = expkey_match, + .init = expkey_init, + .update = expkey_update, + .alloc = expkey_alloc, +}; + +static int 
+svc_expkey_hash(struct svc_expkey *item) +{ + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); + + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; + return hash; +} + +static struct svc_expkey * +svc_expkey_lookup(struct svc_expkey *item) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, + hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + +static struct svc_expkey * +svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + + +#define EXPORT_HASHBITS 8 +#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) + +static struct cache_head *export_table[EXPORT_HASHMAX]; + +static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) +{ + int i; + + for (i = 0; i < fsloc->locations_count; i++) { + kfree(fsloc->locations[i].path); + kfree(fsloc->locations[i].hosts); + } + kfree(fsloc->locations); +} + +static void svc_export_put(struct kref *ref) +{ + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + path_put(&exp->ex_path); + auth_domain_put(exp->ex_client); + kfree(exp->ex_pathname); + nfsd4_fslocs_free(&exp->ex_fslocs); + kfree(exp); +} + +static void svc_export_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client path */ + struct svc_export *exp = container_of(h, struct svc_export, h); + char *pth; + + qword_add(bpp, blen, exp->ex_client->name); + pth = d_path(&exp->ex_path, *bpp, *blen); + if (IS_ERR(pth)) { + /* is this correct? */ + (*bpp)[0] = '\n'; + return; + } + qword_add(bpp, blen, pth); + (*bpp)[-1] = '\n'; +} + +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); +} + +static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); +static struct svc_export *svc_export_lookup(struct svc_export *); + +static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +{ + + /* + * We currently export only dirs, regular files, and (for v4 + * pseudoroot) symlinks. + */ + if (!S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode) && + !S_ISREG(inode->i_mode)) + return -ENOTDIR; + + /* + * Mountd should never pass down a writeable V4ROOT export, but, + * just to make sure: + */ + if (*flags & NFSEXP_V4ROOT) + *flags |= NFSEXP_READONLY; + + /* There are two requirements on a filesystem to be exportable. + * 1: We must be able to identify the filesystem from a number. + * either a device number (so FS_REQUIRES_DEV needed) + * or an FSID number (so NFSEXP_FSID or ->uuid is needed). + * 2: We must be able to find an inode from a filehandle. + * This means that s_export_op must be set. 
+ */ + if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && + !(*flags & NFSEXP_FSID) && + uuid == NULL) { + dprintk("exp_export: export of non-dev fs without fsid\n"); + return -EINVAL; + } + + if (!inode->i_sb->s_export_op || + !inode->i_sb->s_export_op->fh_to_dentry) { + dprintk("exp_export: export of invalid fs type.\n"); + return -EINVAL; + } + + return 0; + +} + +#ifdef CONFIG_NFSD_V4 + +static int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) +{ + int len; + int migrated, i, err; + + /* listsize */ + err = get_int(mesg, &fsloc->locations_count); + if (err) + return err; + if (fsloc->locations_count > MAX_FS_LOCATIONS) + return -EINVAL; + if (fsloc->locations_count == 0) + return 0; + + fsloc->locations = kzalloc(fsloc->locations_count + * sizeof(struct nfsd4_fs_location), GFP_KERNEL); + if (!fsloc->locations) + return -ENOMEM; + for (i=0; i < fsloc->locations_count; i++) { + /* colon separated host list */ + err = -EINVAL; + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].hosts) + goto out_free_all; + err = -EINVAL; + /* slash separated path component list */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].path) + goto out_free_all; + } + /* migrated */ + err = get_int(mesg, &migrated); + if (err) + goto out_free_all; + err = -EINVAL; + if (migrated < 0 || migrated > 1) + goto out_free_all; + fsloc->migrated = migrated; + return 0; +out_free_all: + nfsd4_fslocs_free(fsloc); + return err; +} + +static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) +{ + int listsize, err; + struct exp_flavor_info *f; + + err = get_int(mesg, &listsize); + if (err) + return err; + if (listsize < 0 || listsize > MAX_SECINFO_LIST) + return -EINVAL; + + for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { + err = get_int(mesg, &f->pseudoflavor); + if (err) + return err; + /* + * XXX: It would be nice to also check whether this + * pseudoflavor is supported, so we can discover the + * problem at export time instead of when a client fails + * to authenticate. 
+ */ + err = get_int(mesg, &f->flags); + if (err) + return err; + /* Only some flags are allowed to differ between flavors: */ + if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) + return -EINVAL; + } + exp->ex_nflavors = listsize; + return 0; +} + +#else /* CONFIG_NFSD_V4 */ +static inline int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} +static inline int +secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } +#endif + +static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client path expiry [flags anonuid anongid fsid] */ + char *buf; + int len; + int err; + struct auth_domain *dom = NULL; + struct svc_export exp = {}, *expp; + int an_int; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* client */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + + /* path */ + err = -EINVAL; + if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out1; + + err = kern_path(buf, 0, &exp.ex_path); + if (err) + goto out1; + + exp.ex_client = dom; + + err = -ENOMEM; + exp.ex_pathname = kstrdup(buf, GFP_KERNEL); + if (!exp.ex_pathname) + goto out2; + + /* expiry */ + err = -EINVAL; + exp.h.expiry_time = get_expiry(&mesg); + if (exp.h.expiry_time == 0) + goto out3; + + /* flags */ + err = get_int(&mesg, &an_int); + if (err == -ENOENT) { + err = 0; + set_bit(CACHE_NEGATIVE, &exp.h.flags); + } else { + if (err || an_int < 0) + goto out3; + exp.ex_flags= an_int; + + /* anon uid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_uid= an_int; + + /* anon gid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_gid= an_int; + + /* fsid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_fsid = an_int; + + while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + if (strcmp(buf, "fsloc") == 0) + err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); + else if (strcmp(buf, "uuid") == 0) { + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len != 16) + err = -EINVAL; + else { + exp.ex_uuid = + kmemdup(buf, 16, GFP_KERNEL); + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else + /* quietly ignore unknown words and anything + * following. Newer user-space can try to set + * new values, then see what the result was. 
+ */ + break; + if (err) + goto out4; + } + + err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, + exp.ex_uuid); + if (err) + goto out4; + } + + expp = svc_export_lookup(&exp); + if (expp) + expp = svc_export_update(&exp, expp); + else + err = -ENOMEM; + cache_flush(); + if (expp == NULL) + err = -ENOMEM; + else + exp_put(expp); +out4: + nfsd4_fslocs_free(&exp.ex_fslocs); + kfree(exp.ex_uuid); +out3: + kfree(exp.ex_pathname); +out2: + path_put(&exp.ex_path); +out1: + auth_domain_put(dom); +out: + kfree(buf); + return err; +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); +static void show_secinfo(struct seq_file *m, struct svc_export *exp); + +static int svc_export_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_export *exp ; + + if (h ==NULL) { + seq_puts(m, "#path domain(flags)\n"); + return 0; + } + exp = container_of(h, struct svc_export, h); + seq_path(m, &exp->ex_path, " \t\n\\"); + seq_putc(m, '\t'); + seq_escape(m, exp->ex_client->name, " \t\n\\"); + seq_putc(m, '('); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + exp_flags(m, exp->ex_flags, exp->ex_fsid, + exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); + if (exp->ex_uuid) { + int i; + seq_puts(m, ",uuid="); + for (i=0; i<16; i++) { + if ((i&3) == 0 && i) + seq_putc(m, ':'); + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); + return 0; +} +static int svc_export_match(struct cache_head *a, struct cache_head *b) +{ + struct svc_export *orig = container_of(a, struct svc_export, h); + struct svc_export *new = container_of(b, struct svc_export, h); + return orig->ex_client == new->ex_client && + orig->ex_path.dentry == new->ex_path.dentry && + orig->ex_path.mnt == new->ex_path.mnt; +} + +static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + + kref_get(&item->ex_client->ref); + new->ex_client = item->ex_client; + new->ex_path.dentry = dget(item->ex_path.dentry); + new->ex_path.mnt = mntget(item->ex_path.mnt); + new->ex_pathname = NULL; + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; +} + +static void export_update(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + int i; + + new->ex_flags = item->ex_flags; + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; + item->ex_pathname = NULL; + new->ex_fslocs.locations = item->ex_fslocs.locations; + item->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; + item->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = item->ex_fslocs.migrated; + item->ex_fslocs.migrated = 0; + new->ex_nflavors = item->ex_nflavors; + for (i = 0; i < MAX_SECINFO_LIST; i++) { + new->ex_flavors[i] = item->ex_flavors[i]; + } +} + +static struct cache_head *svc_export_alloc(void) +{ + struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +struct cache_detail 
svc_export_cache = { + .owner = THIS_MODULE, + .hash_size = EXPORT_HASHMAX, + .hash_table = export_table, + .name = "nfsd.export", + .cache_put = svc_export_put, + .cache_upcall = svc_export_upcall, + .cache_parse = svc_export_parse, + .cache_show = svc_export_show, + .match = svc_export_match, + .init = svc_export_init, + .update = export_update, + .alloc = svc_export_alloc, +}; + +static int +svc_export_hash(struct svc_export *exp) +{ + int hash; + + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); + return hash; +} + +static struct svc_export * +svc_export_lookup(struct svc_export *exp) +{ + struct cache_head *ch; + int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, + hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + +static struct svc_export * +svc_export_update(struct svc_export *new, struct svc_export *old) +{ + struct cache_head *ch; + int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(&svc_export_cache, &new->h, + &old->h, + hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + + +static struct svc_expkey * +exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) +{ + struct svc_expkey key, *ek; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ek_client = clp; + key.ek_fsidtype = fsid_type; + memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); + + ek = svc_expkey_lookup(&key); + if (ek == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(&svc_expkey_cache, &ek->h, reqp); + if (err) + return ERR_PTR(err); + return ek; +} + +#ifdef CONFIG_NFSD_DEPRECATED +static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, + struct svc_export *exp) +{ + struct svc_expkey key, *ek; + + key.ek_client = clp; + key.ek_fsidtype = fsid_type; + memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); + key.ek_path = exp->ex_path; + key.h.expiry_time = NEVER; + key.h.flags = 0; + + ek = svc_expkey_lookup(&key); + if (ek) + ek = svc_expkey_update(&key,ek); + if (ek) { + cache_put(&ek->h, &svc_expkey_cache); + return 0; + } + return -ENOMEM; +} + +/* + * Find the client's export entry matching xdev/xino. + */ +static inline struct svc_expkey * +exp_get_key(svc_client *clp, dev_t dev, ino_t ino) +{ + u32 fsidv[3]; + + if (old_valid_dev(dev)) { + mk_fsid(FSID_DEV, fsidv, dev, ino, 0, NULL); + return exp_find_key(clp, FSID_DEV, fsidv, NULL); + } + mk_fsid(FSID_ENCODE_DEV, fsidv, dev, ino, 0, NULL); + return exp_find_key(clp, FSID_ENCODE_DEV, fsidv, NULL); +} + +/* + * Find the client's export entry matching fsid + */ +static inline struct svc_expkey * +exp_get_fsid_key(svc_client *clp, int fsid) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, fsid, NULL); + + return exp_find_key(clp, FSID_NUM, fsidv, NULL); +} +#endif + +static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, + struct cache_req *reqp) +{ + struct svc_export *exp, key; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ex_client = clp; + key.ex_path = *path; + + exp = svc_export_lookup(&key); + if (exp == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(&svc_export_cache, &exp->h, reqp); + if (err) + return ERR_PTR(err); + return exp; +} + +/* + * Find the export entry for a given dentry. 
+ */ +static struct svc_export *exp_parent(svc_client *clp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + svc_export *exp = exp_get_by_name(clp, path, NULL); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = exp_get_by_name(clp, path, NULL); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + +#ifdef CONFIG_NFSD_DEPRECATED +/* + * Hashtable locking. Write locks are placed only by user processes + * wanting to modify export information. + * Write locking only done in this file. Read locking + * needed externally. + */ + +static DECLARE_RWSEM(hash_sem); + +void +exp_readlock(void) +{ + down_read(&hash_sem); +} + +static inline void +exp_writelock(void) +{ + down_write(&hash_sem); +} + +void +exp_readunlock(void) +{ + up_read(&hash_sem); +} + +static inline void +exp_writeunlock(void) +{ + up_write(&hash_sem); +} +#else + +/* hash_sem not needed once deprecated interface is removed */ +void exp_readlock(void) {} +static inline void exp_writelock(void){} +void exp_readunlock(void) {} +static inline void exp_writeunlock(void){} + +#endif + +#ifdef CONFIG_NFSD_DEPRECATED +static void exp_do_unexport(svc_export *unexp); +static int exp_verify_string(char *cp, int max); + +static void exp_fsid_unhash(struct svc_export *exp) +{ + struct svc_expkey *ek; + + if ((exp->ex_flags & NFSEXP_FSID) == 0) + return; + + ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); + if (!IS_ERR(ek)) { + sunrpc_invalidate(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); + } +} + +static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) +{ + u32 fsid[2]; + + if ((exp->ex_flags & NFSEXP_FSID) == 0) + return 0; + + mk_fsid(FSID_NUM, fsid, 0, 0, exp->ex_fsid, NULL); + return exp_set_key(clp, FSID_NUM, fsid, exp); +} + +static int exp_hash(struct auth_domain *clp, struct svc_export *exp) +{ + u32 fsid[2]; + struct inode *inode = exp->ex_path.dentry->d_inode; + dev_t dev = inode->i_sb->s_dev; + + if (old_valid_dev(dev)) { + mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL); + return exp_set_key(clp, FSID_DEV, fsid, exp); + } + mk_fsid(FSID_ENCODE_DEV, fsid, dev, inode->i_ino, 0, NULL); + return exp_set_key(clp, FSID_ENCODE_DEV, fsid, exp); +} + +static void exp_unhash(struct svc_export *exp) +{ + struct svc_expkey *ek; + struct inode *inode = exp->ex_path.dentry->d_inode; + + ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); + if (!IS_ERR(ek)) { + sunrpc_invalidate(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); + } +} + +/* + * Export a file system. 
+ */ +int +exp_export(struct nfsctl_export *nxp) +{ + svc_client *clp; + struct svc_export *exp = NULL; + struct svc_export new; + struct svc_expkey *fsid_key = NULL; + struct path path; + int err; + + /* Consistency check */ + err = -EINVAL; + if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || + !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) + goto out; + + dprintk("exp_export called for %s:%s (%x/%ld fl %x).\n", + nxp->ex_client, nxp->ex_path, + (unsigned)nxp->ex_dev, (long)nxp->ex_ino, + nxp->ex_flags); + + /* Try to lock the export table for update */ + exp_writelock(); + + /* Look up client info */ + if (!(clp = auth_domain_find(nxp->ex_client))) + goto out_unlock; + + + /* Look up the dentry */ + err = kern_path(nxp->ex_path, 0, &path); + if (err) + goto out_put_clp; + err = -EINVAL; + + exp = exp_get_by_name(clp, &path, NULL); + + memset(&new, 0, sizeof(new)); + + /* must make sure there won't be an ex_fsid clash */ + if ((nxp->ex_flags & NFSEXP_FSID) && + (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) && + fsid_key->ek_path.mnt && + (fsid_key->ek_path.mnt != path.mnt || + fsid_key->ek_path.dentry != path.dentry)) + goto finish; + + if (!IS_ERR(exp)) { + /* just a flags/id/fsid update */ + + exp_fsid_unhash(exp); + exp->ex_flags = nxp->ex_flags; + exp->ex_anon_uid = nxp->ex_anon_uid; + exp->ex_anon_gid = nxp->ex_anon_gid; + exp->ex_fsid = nxp->ex_dev; + + err = exp_fsid_hash(clp, exp); + goto finish; + } + + err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL); + if (err) goto finish; + + err = -ENOMEM; + + dprintk("nfsd: creating export entry %p for client %p\n", exp, clp); + + new.h.expiry_time = NEVER; + new.h.flags = 0; + new.ex_pathname = kstrdup(nxp->ex_path, GFP_KERNEL); + if (!new.ex_pathname) + goto finish; + new.ex_client = clp; + new.ex_path = path; + new.ex_flags = nxp->ex_flags; + new.ex_anon_uid = nxp->ex_anon_uid; + new.ex_anon_gid = nxp->ex_anon_gid; + new.ex_fsid = nxp->ex_dev; + + exp = svc_export_lookup(&new); + if (exp) + exp = svc_export_update(&new, exp); + + if (!exp) + goto finish; + + if (exp_hash(clp, exp) || + exp_fsid_hash(clp, exp)) { + /* failed to create at least one index */ + exp_do_unexport(exp); + cache_flush(); + } else + err = 0; +finish: + kfree(new.ex_pathname); + if (!IS_ERR_OR_NULL(exp)) + exp_put(exp); + if (!IS_ERR_OR_NULL(fsid_key)) + cache_put(&fsid_key->h, &svc_expkey_cache); + path_put(&path); +out_put_clp: + auth_domain_put(clp); +out_unlock: + exp_writeunlock(); +out: + return err; +} + +/* + * Unexport a file system. The export entry has already + * been removed from the client's list of exported fs's. + */ +static void +exp_do_unexport(svc_export *unexp) +{ + sunrpc_invalidate(&unexp->h, &svc_export_cache); + exp_unhash(unexp); + exp_fsid_unhash(unexp); +} + + +/* + * unexport syscall. 
+ */ +int +exp_unexport(struct nfsctl_export *nxp) +{ + struct auth_domain *dom; + svc_export *exp; + struct path path; + int err; + + /* Consistency check */ + if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || + !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) + return -EINVAL; + + exp_writelock(); + + err = -EINVAL; + dom = auth_domain_find(nxp->ex_client); + if (!dom) { + dprintk("nfsd: unexport couldn't find %s\n", nxp->ex_client); + goto out_unlock; + } + + err = kern_path(nxp->ex_path, 0, &path); + if (err) + goto out_domain; + + err = -EINVAL; + exp = exp_get_by_name(dom, &path, NULL); + path_put(&path); + if (IS_ERR(exp)) + goto out_domain; + + exp_do_unexport(exp); + exp_put(exp); + err = 0; + +out_domain: + auth_domain_put(dom); + cache_flush(); +out_unlock: + exp_writeunlock(); + return err; +} +#endif /* CONFIG_NFSD_DEPRECATED */ + +/* + * Obtain the root fh on behalf of a client. + * This could be done in user space, but I feel that it adds some safety + * since its harder to fool a kernel module than a user space program. + */ +int +exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) +{ + struct svc_export *exp; + struct path path; + struct inode *inode; + struct svc_fh fh; + int err; + + err = -EPERM; + /* NB: we probably ought to check that it's NUL-terminated */ + if (kern_path(name, 0, &path)) { + printk("nfsd: exp_rootfh path not found %s", name); + return err; + } + inode = path.dentry->d_inode; + + dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", + name, path.dentry, clp->name, + inode->i_sb->s_id, inode->i_ino); + exp = exp_parent(clp, &path); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto out; + } + + /* + * fh must be initialized before calling fh_compose + */ + fh_init(&fh, maxsize); + if (fh_compose(&fh, exp, path.dentry, NULL)) + err = -EINVAL; + else + err = 0; + memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); + fh_put(&fh); + exp_put(exp); +out: + path_put(&path); + return err; +} + +static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type, + u32 *fsidv, struct cache_req *reqp) +{ + struct svc_export *exp; + struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); + if (IS_ERR(ek)) + return ERR_CAST(ek); + + exp = exp_get_by_name(clp, &ek->ek_path, reqp); + cache_put(&ek->h, &svc_expkey_cache); + + if (IS_ERR(exp)) + return ERR_CAST(exp); + return exp; +} + +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + /* legacy gss-only clients are always OK: */ + if (exp->ex_client == rqstp->rq_gssclient) + return 0; + /* ip-address based client; check sec= export option: */ + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_flavor) + return 0; + } + /* defaults in absence of sec= options: */ + if (exp->ex_nflavors == 0) { + if (rqstp->rq_flavor == RPC_AUTH_NULL || + rqstp->rq_flavor == RPC_AUTH_UNIX) + return 0; + } + return nfserr_wrongsec; +} + +/* + * Uses rq_client and rq_gssclient to find an export; uses rq_client (an + * auth_unix client) if it's available and has secinfo information; + * otherwise, will try to use rq_gssclient. + * + * Called from functions that handle requests; functions that do work on + * behalf of mountd are passed a single client name to use, and should + * use exp_get_by_name() or exp_find(). 
+ */ +struct svc_export * +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_find(rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_find(rqstp->rq_gssclient, fsid_type, fsidv, + &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = rqst_exp_get_by_name(rqstp, path); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + +static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); + + return rqst_exp_find(rqstp, FSID_NUM, fsidv); +} + +/* + * Called when we need the filehandle for the root of the pseudofs, + * for a given NFSv4 client. 
The root is defined to be the + * export point with fsid==0 + */ +__be32 +exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct svc_export *exp; + __be32 rv; + + exp = find_fsidzero_export(rqstp); + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); + exp_put(exp); + return rv; +} + +/* Iterator */ + +static void *e_start(struct seq_file *m, loff_t *pos) + __acquires(svc_export_cache.hash_lock) +{ + loff_t n = *pos; + unsigned hash, export; + struct cache_head *ch; + + exp_readlock(); + read_lock(&svc_export_cache.hash_lock); + if (!n--) + return SEQ_START_TOKEN; + hash = n >> 32; + export = n & ((1LL<<32) - 1); + + + for (ch=export_table[hash]; ch; ch=ch->next) + if (!export--) + return ch; + n &= ~((1LL<<32) - 1); + do { + hash++; + n += 1LL<<32; + } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL); + if (hash >= EXPORT_HASHMAX) + return NULL; + *pos = n+1; + return export_table[hash]; +} + +static void *e_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cache_head *ch = p; + int hash = (*pos >> 32); + + if (p == SEQ_START_TOKEN) + hash = 0; + else if (ch->next == NULL) { + hash++; + *pos += 1LL<<32; + } else { + ++*pos; + return ch->next; + } + *pos &= ~((1LL<<32) - 1); + while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) { + hash++; + *pos += 1LL<<32; + } + if (hash >= EXPORT_HASHMAX) + return NULL; + ++*pos; + return export_table[hash]; +} + +static void e_stop(struct seq_file *m, void *p) + __releases(svc_export_cache.hash_lock) +{ + read_unlock(&svc_export_cache.hash_lock); + exp_readunlock(); +} + +static struct flags { + int flag; + char *name[2]; +} expflags[] = { + { NFSEXP_READONLY, {"ro", "rw"}}, + { NFSEXP_INSECURE_PORT, {"insecure", ""}}, + { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, + { NFSEXP_ALLSQUASH, {"all_squash", ""}}, + { NFSEXP_ASYNC, {"async", "sync"}}, + { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOHIDE, {"nohide", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, + { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, + { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, + { 0, {"", ""}} +}; + +static void show_expflags(struct seq_file *m, int flags, int mask) +{ + struct flags *flg; + int state, first = 0; + + for (flg = expflags; flg->flag; flg++) { + if (flg->flag & ~mask) + continue; + state = (flg->flag & flags) ? 
0 : 1; + if (*flg->name[state]) + seq_printf(m, "%s%s", first++?",":"", flg->name[state]); + } +} + +static void show_secinfo_flags(struct seq_file *m, int flags) +{ + seq_printf(m, ","); + show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); +} + +static bool secinfo_flags_equal(int f, int g) +{ + f &= NFSEXP_SECINFO_FLAGS; + g &= NFSEXP_SECINFO_FLAGS; + return f == g; +} + +static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) +{ + int flags; + + flags = (*fp)->flags; + seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); + (*fp)++; + while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { + seq_printf(m, ":%d", (*fp)->pseudoflavor); + (*fp)++; + } + return flags; +} + +static void show_secinfo(struct seq_file *m, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + int flags; + + if (exp->ex_nflavors == 0) + return; + f = exp->ex_flavors; + flags = show_secinfo_run(m, &f, end); + if (!secinfo_flags_equal(flags, exp->ex_flags)) + show_secinfo_flags(m, flags); + while (f != end) { + flags = show_secinfo_run(m, &f, end); + show_secinfo_flags(m, flags); + } +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) +{ + show_expflags(m, flag, NFSEXP_ALLFLAGS); + if (flag & NFSEXP_FSID) + seq_printf(m, ",fsid=%d", fsid); + if (anonu != (uid_t)-2 && anonu != (0x10000-2)) + seq_printf(m, ",anonuid=%u", anonu); + if (anong != (gid_t)-2 && anong != (0x10000-2)) + seq_printf(m, ",anongid=%u", anong); + if (fsloc && fsloc->locations_count > 0) { + char *loctype = (fsloc->migrated) ? "refer" : "replicas"; + int i; + + seq_printf(m, ",%s=", loctype); + seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); + for (i = 1; i < fsloc->locations_count; i++) { + seq_putc(m, ';'); + seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); + } + } +} + +static int e_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct svc_export *exp = container_of(cp, struct svc_export, h); + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# Version 1.1\n"); + seq_puts(m, "# Path Client(Flags) # IPs\n"); + return 0; + } + + cache_get(&exp->h); + if (cache_check(&svc_export_cache, &exp->h, NULL)) + return 0; + cache_put(&exp->h, &svc_export_cache); + return svc_export_show(m, &svc_export_cache, cp); +} + +const struct seq_operations nfs_exports_op = { + .start = e_start, + .next = e_next, + .stop = e_stop, + .show = e_show, +}; + +#ifdef CONFIG_NFSD_DEPRECATED +/* + * Add or modify a client. + * Change requests may involve the list of host addresses. The list of + * exports and possibly existing uid maps are left untouched. + */ +int +exp_addclient(struct nfsctl_client *ncp) +{ + struct auth_domain *dom; + int i, err; + struct in6_addr addr6; + + /* First, consistency check. */ + err = -EINVAL; + if (! exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) + goto out; + if (ncp->cl_naddr > NFSCLNT_ADDRMAX) + goto out; + + /* Lock the hashtable */ + exp_writelock(); + + dom = unix_domain_find(ncp->cl_ident); + + err = -ENOMEM; + if (!dom) + goto out_unlock; + + /* Insert client into hashtable. 
*/ + for (i = 0; i < ncp->cl_naddr; i++) { + ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); + auth_unix_add_addr(&init_net, &addr6, dom); + } + auth_unix_forget_old(dom); + auth_domain_put(dom); + + err = 0; + +out_unlock: + exp_writeunlock(); +out: + return err; +} + +/* + * Delete a client given an identifier. + */ +int +exp_delclient(struct nfsctl_client *ncp) +{ + int err; + struct auth_domain *dom; + + err = -EINVAL; + if (!exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) + goto out; + + /* Lock the hashtable */ + exp_writelock(); + + dom = auth_domain_find(ncp->cl_ident); + /* just make sure that no addresses work + * and that it will expire soon + */ + if (dom) { + err = auth_unix_forget_old(dom); + auth_domain_put(dom); + } + + exp_writeunlock(); +out: + return err; +} + +/* + * Verify that string is non-empty and does not exceed max length. + */ +static int +exp_verify_string(char *cp, int max) +{ + int i; + + for (i = 0; i < max; i++) + if (!cp[i]) + return i; + cp[i] = 0; + printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); + return 0; +} +#endif /* CONFIG_NFSD_DEPRECATED */ + +/* + * Initialize the exports module. + */ +int +nfsd_export_init(void) +{ + int rv; + dprintk("nfsd: initializing export module.\n"); + + rv = cache_register(&svc_export_cache); + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); + if (rv) + cache_unregister(&svc_export_cache); + return rv; + +} + +/* + * Flush exports table - called when last nfsd thread is killed + */ +void +nfsd_export_flush(void) +{ + exp_writelock(); + cache_purge(&svc_expkey_cache); + cache_purge(&svc_export_cache); + exp_writeunlock(); +} + +/* + * Shutdown the exports module. + */ +void +nfsd_export_shutdown(void) +{ + + dprintk("nfsd: shutting down export module.\n"); + + exp_writelock(); + + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); + + exp_writeunlock(); + dprintk("nfsd: export shutdown complete.\n"); +} diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 00000000..2f3be132 --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,62 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+
+/* XXX from linux/nfs_idmap.h */
+#define IDMAP_NAMESZ 128
+
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(void);
+void nfsd_idmap_shutdown(void);
+#else
+static inline int nfsd_idmap_init(void)
+{
+	return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
+#endif
+
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
+int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+
+#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
new file mode 100644
index 00000000..7c831a27
--- /dev/null
+++ b/fs/nfsd/lockd.c
@@ -0,0 +1,79 @@
+/*
+ * This file contains all the stubs needed when communicating with lockd.
+ * This level of indirection is necessary so we can run nfsd+lockd without
+ * requiring the nfs client to be compiled in/loaded, and vice versa.
+ *
+ * Copyright (C) 1996, Olaf Kirch
+ */
+
+#include <linux/file.h>
+#include <linux/lockd/bind.h>
+#include "nfsd.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_LOCKD
+
+#ifdef CONFIG_LOCKD_V4
+#define nlm_stale_fh	nlm4_stale_fh
+#define nlm_failed	nlm4_failed
+#else
+#define nlm_stale_fh	nlm_lck_denied_nolocks
+#define nlm_failed	nlm_lck_denied_nolocks
+#endif
+/*
+ * Note: we hold the dentry use count while the file is open.
+ */
+static __be32
+nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
+{
+	__be32 nfserr;
+	struct svc_fh fh;
+
+	/* must initialize before using! but maxsize doesn't matter */
+	fh_init(&fh,0);
+	fh.fh_handle.fh_size = f->size;
+	memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+	fh.fh_export = NULL;
+
+	exp_readlock();
+	nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
+	fh_put(&fh);
+	exp_readunlock();
+	/* We return nlm error codes as nlm doesn't know
+	 * about nfsd, but nfsd does know about nlm..
+	 */
+	switch (nfserr) {
+	case nfs_ok:
+		return 0;
+	case nfserr_dropit:
+		return nlm_drop_reply;
+	case nfserr_stale:
+		return nlm_stale_fh;
+	default:
+		return nlm_failed;
+	}
+}
+
+static void
+nlm_fclose(struct file *filp)
+{
+	fput(filp);
+}
+
+static struct nlmsvc_binding nfsd_nlm_ops = {
+	.fopen		= nlm_fopen,		/* open file for locking */
+	.fclose		= nlm_fclose,		/* close file */
+};
+
+void
+nfsd_lockd_init(void)
+{
+	dprintk("nfsd: initializing lockd\n");
+	nlmsvc_ops = &nfsd_nlm_ops;
+}
+
+void
+nfsd_lockd_shutdown(void)
+{
+	nlmsvc_ops = NULL;
+}
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
new file mode 100644
index 00000000..6aa5590c
--- /dev/null
+++ b/fs/nfsd/nfs2acl.c
@@ -0,0 +1,356 @@
+/*
+ * Process version 2 NFSACL requests.
+ * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include +#include +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static __be32 +nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + svc_fh *fh; + struct posix_acl *acl; + __be32 nfserr = 0; + + dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + + struct inode *inode = fh->fh_dentry->d_inode; + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + + acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd_attrstat *resp) +{ + svc_fh *fh; + __be32 nfserr = 0; + + dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_ACCESS, argp->acl_access) ); + } + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + } + + /* argp->acl_{access,default} may have been allocated in + nfssvc_decode_setaclargs. 
*/ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + return nfserr; +} + +/* + * Check file attributes + */ +static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, + struct nfsd_fhandle *argp, struct nfsd_attrstat *resp) +{ + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); +} + +/* + * Check file access + */ +static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + return nfserr; +} + +/* + * XDR decode functions + */ +static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclargs *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_setaclargs *argp) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->mask = ntohl(*p++); + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL); + return (n > 0); +} + +static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_fhandle *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessargs *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ + +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +int +nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETACL */ +static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + struct inode *inode; + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + /* + * Since this is version 2, the check for nfserr in + * nfsd_dispatch actually ensures the following cannot happen. + * However, it seems fragile to depend on that. + */ + if (dentry == NULL || dentry->d_inode == NULL) + return 0; + inode = dentry->d_inode; + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); + while (w > 0) { + if (!rqstp->rq_respages[rqstp->rq_resused++]) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + return 1; +} + +static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + fh_put(&resp->fh); + return 1; +} + +static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + fh_put(&resp->fh); + return 1; +} + +#define nfsaclsvc_decode_voidargs NULL +#define nfsaclsvc_release_void NULL +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_attrstatres nfsd_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsacld_proc_##name, \ + (kxdrproc_t) nfsaclsvc_decode_##argt##args, \ + (kxdrproc_t) nfsaclsvc_encode_##rest##res, \ + (kxdrproc_t) nfsaclsvc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures2[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +}; + +struct svc_version nfsd_acl_version2 = { + .vs_vers = 2, + .vs_nproc = 5, + .vs_proc = nfsd_acl_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 0, +}; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c new file mode 100644 index 00000000..a596e9d9 --- /dev/null +++ b/fs/nfsd/nfs3acl.c @@ -0,0 +1,267 @@ +/* + * Process version 3 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include +#include +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. 
+ */ +static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + svc_fh *fh; + struct posix_acl *acl; + __be32 nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + + struct inode *inode = fh->fh_dentry->d_inode; + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + + acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd3_attrstat *resp) +{ + svc_fh *fh; + __be32 nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_ACCESS, argp->acl_access) ); + } + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + } + + /* argp->acl_{access,default} may have been allocated in + nfs3svc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * XDR decode functions + */ +static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclargs *args) +{ + if (!(p = nfs3svc_decode_fh(p, &args->fh))) + return 0; + args->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_setaclargs *args) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + if (!(p = nfs3svc_decode_fh(p, &args->fh))) + return 0; + args->mask = ntohl(*p++); + if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (args->mask & NFS_ACL) ? + &args->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (args->mask & NFS_DFACL) ? 
+ &args->acl_default : NULL); + return (n > 0); +} + +/* + * XDR encode functions + */ + +/* GETACL */ +static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0 && dentry && dentry->d_inode) { + struct inode *inode = dentry->d_inode; + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + while (w > 0) { + if (!rqstp->rq_respages[rqstp->rq_resused++]) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + } else + if (!xdr_ressize_check(rqstp, p)) + return 0; + + return 1; +} + +/* SETACL */ +static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +#define nfs3svc_decode_voidargs NULL +#define nfs3svc_release_void NULL +#define nfsd3_setaclres nfsd3_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + (kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures3[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +}; + +struct svc_version nfsd_acl_version3 = { + .vs_vers = 3, + .vs_nproc = 3, + .vs_proc = nfsd_acl_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 0, +}; + diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c new file mode 100644 index 00000000..9095f3c2 --- /dev/null +++ b/fs/nfsd/nfs3proc.c @@ -0,0 +1,896 @@ +/* + * Process version 3 NFS requests. + * + * Copyright (C) 1996, 1997, 1998 Olaf Kirch + */ + +#include +#include +#include + +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +static int nfs3_ftypes[] = { + 0, /* NF3NON */ + S_IFREG, /* NF3REG */ + S_IFDIR, /* NF3DIR */ + S_IFBLK, /* NF3BLK */ + S_IFCHR, /* NF3CHR */ + S_IFLNK, /* NF3LNK */ + S_IFSOCK, /* NF3SOCK */ + S_IFIFO, /* NF3FIFO */ +}; + +/* + * NULL call. 
+ */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get a file's attributes + */ +static __be32 +nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, + struct nfsd3_attrstat *resp) +{ + int err; + __be32 nfserr; + + dprintk("nfsd: GETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (nfserr) + RETURN_STATUS(nfserr); + + err = vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, &resp->stat); + nfserr = nfserrno(err); + + RETURN_STATUS(nfserr); +} + +/* + * Set a file's attributes + */ +static __be32 +nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: SETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, + argp->check_guard, argp->guardtime); + RETURN_STATUS(nfserr); +} + +/* + * Look up a path name component + */ +static __be32 +nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LOOKUP(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + nfserr = nfsd_lookup(rqstp, &resp->dirfh, + argp->name, + argp->len, + &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Check file access + */ +static __be32 +nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: ACCESS(3) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + RETURN_STATUS(nfserr); +} + +/* + * Read a symlink. + */ +static __be32 +nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp, + struct nfsd3_readlinkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. */ + fh_copy(&resp->fh, &argp->fh); + resp->len = NFS3_MAXPATHLEN; + nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a file. + */ +static __be32 +nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, + struct nfsd3_readres *resp) +{ + __be32 nfserr; + u32 max_blocksize = svc_max_payload(rqstp); + + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", + SVCFH_fmt(&argp->fh), + (unsigned long) argp->count, + (unsigned long long) argp->offset); + + /* Obtain buffer pointer for payload. 
+ * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) + * + 1 (xdr opaque byte count) = 26 + */ + + resp->count = argp->count; + if (max_blocksize < resp->count) + resp->count = max_blocksize; + + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_read(rqstp, &resp->fh, + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count); + if (nfserr == 0) { + struct inode *inode = resp->fh.fh_dentry->d_inode; + + resp->eof = (argp->offset + resp->count) >= inode->i_size; + } + + RETURN_STATUS(nfserr); +} + +/* + * Write data to a file + */ +static __be32 +nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, + struct nfsd3_writeres *resp) +{ + __be32 nfserr; + unsigned long cnt = argp->len; + + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", + SVCFH_fmt(&argp->fh), + argp->len, + (unsigned long long) argp->offset, + argp->stable? " stable" : ""); + + fh_copy(&resp->fh, &argp->fh); + resp->committed = argp->stable; + nfserr = nfsd_write(rqstp, &resp->fh, NULL, + argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, + &resp->committed); + resp->count = cnt; + RETURN_STATUS(nfserr); +} + +/* + * With NFSv3, CREATE processing is a lot easier than with NFSv2. + * At least in theory; we'll see how it fares in practice when the + * first reports about SunOS compatibility problems start to pour in... + */ +static __be32 +nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, + struct nfsd3_diropres *resp) +{ + svc_fh *dirfhp, *newfhp = NULL; + struct iattr *attr; + __be32 nfserr; + + dprintk("nfsd: CREATE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + dirfhp = fh_copy(&resp->dirfh, &argp->fh); + newfhp = fh_init(&resp->fh, NFS3_FHSIZE); + attr = &argp->attrs; + + /* Get the directory inode */ + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); + if (nfserr) + RETURN_STATUS(nfserr); + + /* Unfudge the mode bits */ + attr->ia_mode &= ~S_IFMT; + if (!(attr->ia_valid & ATTR_MODE)) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = S_IFREG; + } else { + attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; + } + + /* Now create the file and set attributes */ + nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, newfhp, + argp->createmode, argp->verf, NULL, NULL); + + RETURN_STATUS(nfserr); +} + +/* + * Make directory. This operation is not idempotent. + */ +static __be32 +nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: MKDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_unlock(&resp->dirfh); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), + argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_copy(&resp->dirfh, &argp->ffh); + fh_init(&resp->fh, NFS3_FHSIZE); + nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, + argp->tname, argp->tlen, + &resp->fh, &argp->attrs); + RETURN_STATUS(nfserr); +} + +/* + * Make socket/fifo/device. 
+ */ +static __be32 +nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + int type; + dev_t rdev = 0; + + dprintk("nfsd: MKNOD(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + if (argp->ftype == 0 || argp->ftype >= NF3BAD) + RETURN_STATUS(nfserr_inval); + if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { + rdev = MKDEV(argp->major, argp->minor); + if (MAJOR(rdev) != argp->major || + MINOR(rdev) != argp->minor) + RETURN_STATUS(nfserr_inval); + } else + if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) + RETURN_STATUS(nfserr_inval); + + type = nfs3_ftypes[argp->ftype]; + nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, type, rdev, &resp->fh); + fh_unlock(&resp->dirfh); + RETURN_STATUS(nfserr); +} + +/* + * Remove file/fifo/socket etc. + */ +static __be32 +nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: REMOVE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + /* Unlink. -S_IFDIR means file must not be a directory */ + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Remove a directory + */ +static __be32 +nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RMDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp, + struct nfsd3_renameres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RENAME(3) %s %.*s ->\n", + SVCFH_fmt(&argp->ffh), + argp->flen, + argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->ffh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, + &resp->tfh, argp->tname, argp->tlen); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp, + struct nfsd3_linkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LINK(3) %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->fh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, + &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a directory. 
+ */
+static __be32
+nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
+		   struct nfsd3_readdirres *resp)
+{
+	__be32 nfserr;
+	int count;
+
+	dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, (u32) argp->cookie);
+
+	/* Make sure we've room for the NULL ptr & eof flag, and shrink to
+	 * client read size */
+	count = (argp->count >> 2) - 2;
+
+	/* Read directory and encode entries on the fly */
+	fh_copy(&resp->fh, &argp->fh);
+
+	resp->buflen = count;
+	resp->common.err = nfs_ok;
+	resp->buffer = argp->buffer;
+	resp->rqstp = rqstp;
+	nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie,
+			      &resp->common, nfs3svc_encode_entry);
+	memcpy(resp->verf, argp->verf, 8);
+	resp->count = resp->buffer - argp->buffer;
+	if (resp->offset)
+		xdr_encode_hyper(resp->offset, argp->cookie);
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Read a portion of a directory, including file handles and attrs.
+ * For now, we choose to ignore the dircount parameter.
+ */
+static __be32
+nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
+		       struct nfsd3_readdirres *resp)
+{
+	__be32 nfserr;
+	int count = 0;
+	loff_t offset;
+	int i;
+	caddr_t page_addr = NULL;
+
+	dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, (u32) argp->cookie);
+
+	/* Convert byte count to number of words (i.e. >> 2),
+	 * and reserve room for the NULL ptr & eof flag (-2 words) */
+	resp->count = (argp->count >> 2) - 2;
+
+	/* Read directory and encode entries on the fly */
+	fh_copy(&resp->fh, &argp->fh);
+
+	resp->common.err = nfs_ok;
+	resp->buffer = argp->buffer;
+	resp->buflen = resp->count;
+	resp->rqstp = rqstp;
+	offset = argp->cookie;
+	nfserr = nfsd_readdir(rqstp, &resp->fh,
+			      &offset,
+			      &resp->common,
+			      nfs3svc_encode_entry_plus);
+	memcpy(resp->verf, argp->verf, 8);
+	for (i = 1; i < rqstp->rq_resused; i++) {
+		page_addr = page_address(rqstp->rq_respages[i]);
+
+		if (((caddr_t)resp->buffer >= page_addr) &&
+		    ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+			count += (caddr_t)resp->buffer - page_addr;
+			break;
+		}
+		count += PAGE_SIZE;
+	}
+	resp->count = count >> 2;
+	if (resp->offset) {
+		if (unlikely(resp->offset1)) {
+			/* we ended up with offset on a page boundary */
+			*resp->offset = htonl(offset >> 32);
+			*resp->offset1 = htonl(offset & 0xffffffff);
+			resp->offset1 = NULL;
+		} else {
+			xdr_encode_hyper(resp->offset, offset);
+		}
+	}
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get file system stats
+ */
+static __be32
+nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
+		  struct nfsd3_fsstatres *resp)
+{
+	__be32 nfserr;
+
+	dprintk("nfsd: FSSTAT(3) %s\n",
+		SVCFH_fmt(&argp->fh));
+
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
+	fh_put(&argp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get file system info
+ */
+static __be32
+nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
+		  struct nfsd3_fsinfores *resp)
+{
+	__be32 nfserr;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
+	dprintk("nfsd: FSINFO(3) %s\n",
+		SVCFH_fmt(&argp->fh));
+
+	resp->f_rtmax = max_blocksize;
+	resp->f_rtpref = max_blocksize;
+	resp->f_rtmult = PAGE_SIZE;
+	resp->f_wtmax = max_blocksize;
+	resp->f_wtpref = max_blocksize;
+	resp->f_wtmult = PAGE_SIZE;
+	resp->f_dtpref = PAGE_SIZE;
+	resp->f_maxfilesize = ~(u32) 0;
+	resp->f_properties = NFS3_FSF_DEFAULT;
+
+	nfserr = fh_verify(rqstp, &argp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+
+	/* Check special features of the file
system. May request + * different read/write sizes for file systems known to have + * problems with large blocks */ + if (nfserr == 0) { + struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + + /* Note that we don't care for remote fs's here */ + if (sb->s_magic == MSDOS_SUPER_MAGIC) { + resp->f_properties = NFS3_FSF_BILLYBOY; + } + resp->f_maxfilesize = sb->s_maxbytes; + } + + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Get pathconf info for the specified file + */ +static __be32 +nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd3_pathconfres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: PATHCONF(3) %s\n", + SVCFH_fmt(&argp->fh)); + + /* Set default pathconf */ + resp->p_link_max = 255; /* at least */ + resp->p_name_max = 255; /* at least */ + resp->p_no_trunc = 0; + resp->p_chown_restricted = 1; + resp->p_case_insensitive = 0; + resp->p_case_preserving = 1; + + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + + if (nfserr == 0) { + struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + + /* Note that we don't care for remote fs's here */ + switch (sb->s_magic) { + case EXT2_SUPER_MAGIC: + resp->p_link_max = EXT2_LINK_MAX; + resp->p_name_max = EXT2_NAME_LEN; + break; + case MSDOS_SUPER_MAGIC: + resp->p_case_insensitive = 1; + resp->p_case_preserving = 0; + break; + } + } + + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + + +/* + * Commit a file (range) to stable storage. + */ +static __be32 +nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp, + struct nfsd3_commitres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", + SVCFH_fmt(&argp->fh), + argp->count, + (unsigned long long) argp->offset); + + if (argp->offset > NFS_OFFSET_MAX) + RETURN_STATUS(nfserr_inval); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count); + + RETURN_STATUS(nfserr); +} + + +/* + * NFSv3 Server procedures. + * Only the results of non-idempotent operations are cached. 
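+ *
+ * Non-idempotent procedures (SETATTR, WRITE, CREATE and friends) are
+ * marked RC_REPLBUFF below so a retransmitted request can be answered
+ * from the duplicate reply cache; idempotent ones use RC_NOCACHE.
+ * The pc_xdrressize values are reply-size estimates in XDR words built
+ * from the ST/FH/AT/pAT/WC constants; for LOOKUP, for instance, the
+ * estimate works out to ST+FH+pAT+pAT = 1+17+22+22 = 62 words
+ * (illustrative arithmetic only).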
+ */ +#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle +#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat +#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat +#define nfsd3_mkdirargs nfsd3_createargs +#define nfsd3_readdirplusargs nfsd3_readdirargs +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_fhandleres nfsd3_attrstat +#define nfsd3_attrstatres nfsd3_attrstat +#define nfsd3_wccstatres nfsd3_attrstat +#define nfsd3_createres nfsd3_diropres +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + (kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define FH 17 /* filehandle with length */ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define WC (7+pAT) /* WCC attributes */ + +static struct svc_procedure nfsd_procedures3[22] = { + [NFS3PROC_NULL] = { + .pc_func = (svc_procfunc) nfsd3_proc_null, + .pc_encode = (kxdrproc_t) nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFS3PROC_GETATTR] = { + .pc_func = (svc_procfunc) nfsd3_proc_getattr, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_attrstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFS3PROC_SETATTR] = { + .pc_func = (svc_procfunc) nfsd3_proc_setattr, + .pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_LOOKUP] = { + .pc_func = (svc_procfunc) nfsd3_proc_lookup, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_diropres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+pAT+pAT, + }, + [NFS3PROC_ACCESS] = { + .pc_func = (svc_procfunc) nfsd3_proc_access, + .pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_accessres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1, + }, + [NFS3PROC_READLINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_readlink, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_ressize = sizeof(struct nfsd3_readlinkres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + }, + [NFS3PROC_READ] = { + .pc_func = (svc_procfunc) nfsd3_proc_read, + .pc_decode = (kxdrproc_t) 
nfs3svc_decode_readargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_ressize = sizeof(struct nfsd3_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + }, + [NFS3PROC_WRITE] = { + .pc_func = (svc_procfunc) nfsd3_proc_write, + .pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_writeres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_ressize = sizeof(struct nfsd3_writeres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+4, + }, + [NFS3PROC_CREATE] = { + .pc_func = (svc_procfunc) nfsd3_proc_create, + .pc_decode = (kxdrproc_t) nfs3svc_decode_createargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_mkdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_SYMLINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_symlink, + .pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKNOD] = { + .pc_func = (svc_procfunc) nfsd3_proc_mknod, + .pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_REMOVE] = { + .pc_func = (svc_procfunc) nfsd3_proc_remove, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RMDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_rmdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RENAME] = { + .pc_func = (svc_procfunc) nfsd3_proc_rename, + .pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_renameres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_ressize = sizeof(struct nfsd3_renameres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+WC, + }, + [NFS3PROC_LINK] = { + .pc_func = (svc_procfunc) 
nfsd3_proc_link, + .pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_linkres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_ressize = sizeof(struct nfsd3_linkres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+pAT+WC, + }, + [NFS3PROC_READDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_readdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_READDIRPLUS] = { + .pc_func = (svc_procfunc) nfsd3_proc_readdirplus, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_FSSTAT] = { + .pc_func = (svc_procfunc) nfsd3_proc_fsstat, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+2*6+1, + }, + [NFS3PROC_FSINFO] = { + .pc_func = (svc_procfunc) nfsd3_proc_fsinfo, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsinfores), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+12, + }, + [NFS3PROC_PATHCONF] = { + .pc_func = (svc_procfunc) nfsd3_proc_pathconf, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_pathconfres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+6, + }, + [NFS3PROC_COMMIT] = { + .pc_func = (svc_procfunc) nfsd3_proc_commit, + .pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_commitres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_ressize = sizeof(struct nfsd3_commitres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+WC+2, + }, +}; + +struct svc_version nfsd_version3 = { + .vs_vers = 3, + .vs_nproc = 22, + .vs_proc = nfsd_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c new file mode 100644 index 00000000..43f46cd9 --- /dev/null +++ b/fs/nfsd/nfs3xdr.c @@ -0,0 +1,1112 @@ +/* + * XDR support for nfsd/protocol version 3. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + * + * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! 
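+ *
+ * (Nanosecond values run up to 999,999,999 and so need a full 32-bit
+ * byte-swap; the 16 bits handled by htons() cannot hold them.)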
+ */ + +#include +#include "xdr3.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs3_ftypes[] = { + NF3NON, NF3FIFO, NF3CHR, NF3BAD, + NF3DIR, NF3BAD, NF3BLK, NF3BAD, + NF3REG, NF3BAD, NF3LNK, NF3BAD, + NF3SOCK, NF3BAD, NF3LNK, NF3BAD, +}; + +/* + * XDR functions for basic NFS types + */ +static __be32 * +encode_time3(__be32 *p, struct timespec *time) +{ + *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + return p; +} + +static __be32 * +decode_time3(__be32 *p, struct timespec *time) +{ + time->tv_sec = ntohl(*p++); + time->tv_nsec = ntohl(*p++); + return p; +} + +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size; + fh_init(fhp, NFS3_FHSIZE); + size = ntohl(*p++); + if (size > NFS3_FHSIZE) + return NULL; + + memcpy(&fhp->fh_handle.fh_base, p, size); + fhp->fh_handle.fh_size = size; + return p + XDR_QUADLEN(size); +} + +/* Helper function for NFSv3 ACL code */ +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size = fhp->fh_handle.fh_size; + *p++ = htonl(size); + if (size) p[XDR_QUADLEN(size)-1]=0; + memcpy(p, &fhp->fh_handle.fh_base, size); + return p + XDR_QUADLEN(size); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr3(__be32 *p, struct iattr *iap) +{ + u32 tmp; + + iap->ia_valid = 0; + + if (*p++) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = ntohl(*p++); + } + if (*p++) { + iap->ia_valid |= ATTR_UID; + iap->ia_uid = ntohl(*p++); + } + if (*p++) { + iap->ia_valid |= ATTR_GID; + iap->ia_gid = ntohl(*p++); + } + if (*p++) { + u64 newsize; + + iap->ia_valid |= ATTR_SIZE; + p = xdr_decode_hyper(p, &newsize); + if (newsize <= NFS_OFFSET_MAX) + iap->ia_size = newsize; + else + iap->ia_size = NFS_OFFSET_MAX; + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_ATIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = ntohl(*p++); + iap->ia_atime.tv_nsec = ntohl(*p++); + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_MTIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = ntohl(*p++); + iap->ia_mtime.tv_nsec = ntohl(*p++); + } + return p; +} + +static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +{ + u64 f; + switch(fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + p = xdr_encode_hyper(p, (u64)huge_encode_dev + (fhp->fh_dentry->d_inode->i_sb->s_dev)); + break; + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u64*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; + p = xdr_encode_hyper(p, f); + break; + } + return p; +} + +static __be32 * +encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) 
stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { + p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); + } else { + p = xdr_encode_hyper(p, (u64) stat->size); + } + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + *p++ = htonl((u32) MAJOR(stat->rdev)); + *p++ = htonl((u32) MINOR(stat->rdev)); + p = encode_fsid(p, fhp); + p = xdr_encode_hyper(p, stat->ino); + p = encode_time3(p, &stat->atime); + p = encode_time3(p, &stat->mtime); + p = encode_time3(p, &stat->ctime); + + return p; +} + +static __be32 * +encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + /* Attributes to follow */ + *p++ = xdr_one; + return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); +} + +/* + * Encode post-operation attributes. + * The inode may be NULL if the call failed because of a stale file + * handle. In this case, no attributes are returned. + */ +static __be32 * +encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + if (dentry && dentry->d_inode) { + int err; + struct kstat stat; + + err = vfs_getattr(fhp->fh_export->ex_path.mnt, dentry, &stat); + if (!err) { + *p++ = xdr_one; /* attributes follow */ + lease_get_mtime(dentry->d_inode, &stat.mtime); + return encode_fattr3(rqstp, p, fhp, &stat); + } + } + *p++ = xdr_zero; + return p; +} + +/* Helper for NFSv3 ACLs */ +__be32 * +nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Enocde weak cache consistency data + */ +static __be32 * +encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + + if (dentry && dentry->d_inode && fhp->fh_post_saved) { + if (fhp->fh_pre_saved) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); + p = encode_time3(p, &fhp->fh_pre_mtime); + p = encode_time3(p, &fhp->fh_pre_ctime); + } else { + *p++ = xdr_zero; + } + return encode_saved_post_attr(rqstp, p, fhp); + } + /* no pre- or post-attrs */ + *p++ = xdr_zero; + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Fill in the post_op attr for the wcc data + */ +void fill_post_wcc(struct svc_fh *fhp) +{ + int err; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, + &fhp->fh_post_attr); + fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; + if (err) { + fhp->fh_post_saved = 0; + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + } else + fhp->fh_post_saved = 1; +} + +/* + * XDR decode functions + */ +int +nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_sattrargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = decode_sattr3(p, &args->attrs); + + if ((args->check_guard = ntohl(*p++)) != 0) { + struct timespec time; + p = decode_time3(p, &time); + args->guardtime = time.tv_sec; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropargs *args) +{ + if (!(p = decode_fh(p, 
&args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readargs *args) +{ + unsigned int len; + int v,pn; + u32 max_blocksize = svc_max_payload(rqstp); + + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + len = args->count = ntohl(*p++); + + if (len > max_blocksize) + len = max_blocksize; + + /* set up the kvec */ + v=0; + while (len > 0) { + pn = rqstp->rq_resused++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_writeargs *args) +{ + unsigned int len, v, hdr, dlen; + u32 max_blocksize = svc_max_payload(rqstp); + + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + args->count = ntohl(*p++); + args->stable = ntohl(*p++); + len = args->len = ntohl(*p++); + /* + * The count must equal the amount of data passed. + */ + if (args->count != args->len) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; + dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + - hdr; + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. 
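+ *
+ * As a small worked example: a 10-byte payload occupies
+ * XDR_QUADLEN(10) = 3 quad-words, i.e. 12 bytes on the wire, so a
+ * dlen of 12 or more (extra is possible with GSS padding) passes the
+ * check below, while anything under 12 is rejected.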
+ */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + if (args->count > max_blocksize) { + args->count = max_blocksize; + len = args->len = max_blocksize; + } + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + v = 0; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; + v++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; + } + rqstp->rq_vec[v].iov_len = len; + args->vlen = v + 1; + return 1; +} + +int +nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_createargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + switch (args->createmode = ntohl(*p++)) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + p = decode_sattr3(p, &args->attrs); + break; + case NFS3_CREATE_EXCLUSIVE: + args->verf = p; + p += 2; + break; + default: + return 0; + } + + return xdr_argsize_check(rqstp, p); +} +int +nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_createargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) || + !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr3(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_symlinkargs *args) +{ + unsigned int len, avail; + char *old, *new; + struct kvec *vec; + + if (!(p = decode_fh(p, &args->ffh)) || + !(p = decode_filename(p, &args->fname, &args->flen)) + ) + return 0; + p = decode_sattr3(p, &args->attrs); + + /* now decode the pathname, which might be larger than the first page. + * As we have to check for nul's anyway, we copy it into a new page + * This page appears in the rq_res.pages list, but as pages_len is always + * 0, it won't get in the way + */ + len = ntohl(*p++); + if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) + return 0; + args->tname = new = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->tlen = len; + /* first copy and check from the first page */ + old = (char*)p; + vec = &rqstp->rq_arg.head[0]; + avail = vec->iov_len - (old - (char*)vec->iov_base); + while (len && avail && *old) { + *new++ = *old++; + len--; + avail--; + } + /* now copy next page if there is one */ + if (len && !avail && rqstp->rq_arg.page_len) { + avail = rqstp->rq_arg.page_len; + if (avail > PAGE_SIZE) + avail = PAGE_SIZE; + old = page_address(rqstp->rq_arg.pages[0]); + } + while (len && avail && *old) { + *new++ = *old++; + len--; + avail--; + } + *new = '\0'; + if (len) + return 0; + + return 1; +} + +int +nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_mknodargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + args->ftype = ntohl(*p++); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR + || args->ftype == NF3SOCK || args->ftype == NF3FIFO) + p = decode_sattr3(p, &args->attrs); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR) { + args->major = ntohl(*p++); + args->minor = ntohl(*p++); + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_renameargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return 
xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readlinkargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->buffer = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_linkargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ~0; + args->count = ntohl(*p++); + + if (args->count > PAGE_SIZE) + args->count = PAGE_SIZE; + + args->buffer = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirargs *args) +{ + int len, pn; + u32 max_blocksize = svc_max_payload(rqstp); + + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ntohl(*p++); + args->count = ntohl(*p++); + + len = (args->count > max_blocksize) ? max_blocksize : + args->count; + args->count = len; + + while (len > 0) { + pn = rqstp->rq_resused++; + if (!args->buffer) + args->buffer = page_address(rqstp->rq_respages[pn]); + len -= PAGE_SIZE; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_commitargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->offset); + args->count = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +/* + * There must be an encoding function for void results so svc_process + * will work properly. 
+ */ +int +nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETATTR */ +int +nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + if (resp->status == 0) { + lease_get_mtime(resp->fh.fh_dentry->d_inode, + &resp->stat.mtime); + p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + } + return xdr_ressize_check(rqstp, p); +} + +/* SETATTR, REMOVE, RMDIR */ +int +nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* LOOKUP */ +int +nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropres *resp) +{ + if (resp->status == 0) { + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_post_op_attr(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +int +nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* READLINK */ +int +nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readlinkres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* READ */ +int +nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->eof); + *p++ = htonl(resp->count); /* xdr opaque count */ + xdr_ressize_check(rqstp, p); + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* WRITE */ +int +nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_writeres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->committed); + *p++ = htonl(nfssvc_boot.tv_sec); + *p++ = htonl(nfssvc_boot.tv_usec); + } + return xdr_ressize_check(rqstp, p); +} + +/* CREATE, MKDIR, SYMLINK, MKNOD */ +int +nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropres *resp) +{ + if (resp->status == 0) { + *p++ = xdr_one; + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_wcc_data(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* RENAME */ +int +nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_renameres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->ffh); + p = encode_wcc_data(rqstp, p, &resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* LINK */ +int +nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_linkres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + p = encode_wcc_data(rqstp, p, 
&resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* READDIR */ +int +nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + + if (resp->status == 0) { + /* stupid readdir cookie */ + memcpy(p, resp->verf, 8); p += 2; + xdr_ressize_check(rqstp, p); + if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) + return 1; /*No room for trailer */ + rqstp->rq_res.page_len = (resp->count) << 2; + + /* add the 'tail' to the end of the 'head' page - page 0. */ + rqstp->rq_res.tail[0].iov_base = p; + *p++ = 0; /* no more entries */ + *p++ = htonl(resp->common.err == nfserr_eof); + rqstp->rq_res.tail[0].iov_len = 2<<2; + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +static __be32 * +encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, + int namlen, u64 ino) +{ + *p++ = xdr_one; /* mark entry present */ + p = xdr_encode_hyper(p, ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + + cd->offset = p; /* remember pointer */ + p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ + + return p; +} + +static __be32 +compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, + const char *name, int namlen) +{ + struct svc_export *exp; + struct dentry *dparent, *dchild; + __be32 rv = nfserr_noent; + + dparent = cd->fh.fh_dentry; + exp = cd->fh.fh_export; + + if (isdotent(name, namlen)) { + if (namlen == 2) { + dchild = dget_parent(dparent); + /* filesystem root - cannot return filehandle for ".." */ + if (dchild == dparent) + goto out; + } else + dchild = dget(dparent); + } else + dchild = lookup_one_len(name, dparent, namlen); + if (IS_ERR(dchild)) + return rv; + if (d_mountpoint(dchild)) + goto out; + if (!dchild->d_inode) + goto out; + rv = fh_compose(fhp, exp, dchild, &cd->fh); +out: + dput(dchild); + return rv; +} + +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +{ + struct svc_fh fh; + __be32 err; + + fh_init(&fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, &fh, name, namlen); + if (err) { + *p++ = 0; + *p++ = 0; + goto out; + } + p = encode_post_op_attr(cd->rqstp, p, &fh); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, &fh); +out: + fh_put(&fh); + return p; +} + +/* + * Encode a directory entry. This one works for both normal readdir + * and readdirplus. + * The normal readdir reply requires 2 (fileid) + 1 (stringlen) + * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. + * + * The readdirplus baggage is 1+21 words for post_op_attr, plus the + * file handle. 
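+ *
+ * As a rough worked example (assuming the usual 64-byte NFS3_FHSIZE):
+ * NFS3_ENTRY_BAGGAGE below is 2+1+2+1 = 6 words, while
+ * NFS3_ENTRYPLUS_BAGGAGE is 1+21+1+(64>>2) = 39 words, so a
+ * readdirplus entry can cost roughly 156 bytes before the name itself.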
+ */ + +#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) +#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) +static int +encode_entry(struct readdir_cd *ccd, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type, int plus) +{ + struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, + common); + __be32 *p = cd->buffer; + caddr_t curr_page_addr = NULL; + int pn; /* current page number */ + int slen; /* string (name) length */ + int elen; /* estimated entry length in words */ + int num_entry_words = 0; /* actual number of words */ + + if (cd->offset) { + u64 offset64 = offset; + + if (unlikely(cd->offset1)) { + /* we ended up with offset on a page boundary */ + *cd->offset = htonl(offset64 >> 32); + *cd->offset1 = htonl(offset64 & 0xffffffff); + cd->offset1 = NULL; + } else { + xdr_encode_hyper(cd->offset, offset64); + } + } + + /* + dprintk("encode_entry(%.*s @%ld%s)\n", + namlen, name, (long) offset, plus? " plus" : ""); + */ + + /* truncate filename if too long */ + if (namlen > NFS3_MAXNAMLEN) + namlen = NFS3_MAXNAMLEN; + + slen = XDR_QUADLEN(namlen); + elen = slen + NFS3_ENTRY_BAGGAGE + + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + + if (cd->buflen < elen) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + /* determine which page in rq_respages[] we are currently filling */ + for (pn=1; pn < cd->rqstp->rq_resused; pn++) { + curr_page_addr = page_address(cd->rqstp->rq_respages[pn]); + + if (((caddr_t)cd->buffer >= curr_page_addr) && + ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) + break; + } + + if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { + /* encode entry in current page */ + + p = encode_entry_baggage(cd, p, name, namlen, ino); + + if (plus) + p = encode_entryplus_baggage(cd, p, name, namlen); + num_entry_words = p - cd->buffer; + } else if (cd->rqstp->rq_respages[pn+1] != NULL) { + /* temporarily encode entry into next page, then move back to + * current and next page in rq_respages[] */ + __be32 *p1, *tmp; + int len1, len2; + + /* grab next page for temporary storage of entry */ + p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]); + + p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + + if (plus) + p1 = encode_entryplus_baggage(cd, p1, name, namlen); + + /* determine entry word length and lengths to go in pages */ + num_entry_words = p1 - tmp; + len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; + if ((num_entry_words << 2) < len1) { + /* the actual number of words in the entry is less + * than elen and can still fit in the current page + */ + memmove(p, tmp, num_entry_words << 2); + p += num_entry_words; + + /* update offset */ + cd->offset = cd->buffer + (cd->offset - tmp); + } else { + unsigned int offset_r = (cd->offset - tmp) << 2; + + /* update pointer to offset location. 
+ * This is a 64bit quantity, so we need to + * deal with 3 cases: + * - entirely in first page + * - entirely in second page + * - 4 bytes in each page + */ + if (offset_r + 8 <= len1) { + cd->offset = p + (cd->offset - tmp); + } else if (offset_r >= len1) { + cd->offset -= len1 >> 2; + } else { + /* sitting on the fence */ + BUG_ON(offset_r != len1 - 4); + cd->offset = p + (cd->offset - tmp); + cd->offset1 = tmp; + } + + len2 = (num_entry_words << 2) - len1; + + /* move from temp page to current and next pages */ + memmove(p, tmp, len1); + memmove(tmp, (caddr_t)tmp+len1, len2); + + p = tmp + (len2 >> 2); + } + } + else { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + cd->buflen -= num_entry_words; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; + +} + +int +nfs3svc_encode_entry(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 0); +} + +int +nfs3svc_encode_entry_plus(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 1); +} + +/* FSSTAT */ +int +nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fsstatres *resp) +{ + struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p++ = htonl(resp->invarsec); /* mean unchanged time */ + } + return xdr_ressize_check(rqstp, p); +} + +/* FSINFO */ +int +nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fsinfores *resp) +{ + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->f_rtmax); + *p++ = htonl(resp->f_rtpref); + *p++ = htonl(resp->f_rtmult); + *p++ = htonl(resp->f_wtmax); + *p++ = htonl(resp->f_wtpref); + *p++ = htonl(resp->f_wtmult); + *p++ = htonl(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + *p++ = xdr_one; + *p++ = xdr_zero; + *p++ = htonl(resp->f_properties); + } + + return xdr_ressize_check(rqstp, p); +} + +/* PATHCONF */ +int +nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_pathconfres *resp) +{ + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->p_link_max); + *p++ = htonl(resp->p_name_max); + *p++ = htonl(resp->p_no_trunc); + *p++ = htonl(resp->p_chown_restricted); + *p++ = htonl(resp->p_case_insensitive); + *p++ = htonl(resp->p_case_preserving); + } + + return xdr_ressize_check(rqstp, p); +} + +/* COMMIT */ +int +nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_commitres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + /* Write verifier */ + if (resp->status == 0) { + *p++ = htonl(nfssvc_boot.tv_sec); + *p++ = htonl(nfssvc_boot.tv_usec); + } + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +int +nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + fh_put(&resp->fh); + return 1; +} + +int +nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fhandle_pair *resp) +{ + 
fh_put(&resp->fh1); + fh_put(&resp->fh2); + return 1; +} diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c new file mode 100644 index 00000000..ad88f1c0 --- /dev/null +++ b/fs/nfsd/nfs4acl.c @@ -0,0 +1,838 @@ +/* + * Common NFSv4 ACL handling code. + * + * Copyright (c) 2002, 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * Jeff Sedlak + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include "acl.h" + + +/* mode bit translations: */ +#define NFS4_READ_MODE (NFS4_ACE_READ_DATA) +#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_APPEND_DATA) +#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE +#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) +#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) + +/* We don't support these bits; insist they be neither allowed nor denied */ +#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \ + | NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS) + +/* flags used to simulate posix default ACLs */ +#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ + | NFS4_ACE_DIRECTORY_INHERIT_ACE) + +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \ + | NFS4_ACE_INHERIT_ONLY_ACE \ + | NFS4_ACE_IDENTIFIER_GROUP) + +#define MASK_EQUAL(mask1, mask2) \ + ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) + +static u32 +mask_from_posix(unsigned short perm, unsigned int flags) +{ + int mask = NFS4_ANYONE_MODE; + + if (flags & NFS4_ACL_OWNER) + mask |= NFS4_OWNER_MODE; + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +static u32 +deny_mask_from_posix(unsigned short perm, u32 flags) +{ + u32 mask = 0; + + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +/* XXX: modify functions to return NFS errors; they're only ever + * used by nfs code, after all.... */ + +/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the + * side of being more restrictive, so the mode bit mapping below is + * pessimistic. An optimistic version would be needed to handle DENY's, + * but we espect to coalesce all ALLOWs and DENYs before mapping to mode + * bits. 
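+ *
+ * For instance, in the mapping below an ACE mask must contain the
+ * whole NFS4_WRITE_MODE pair (WRITE_DATA and APPEND_DATA) before
+ * ACL_WRITE is granted; a mask carrying only NFS4_ACE_WRITE_DATA maps
+ * to no write bit at all, erring on the restrictive side as described
+ * above.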
*/ + +static void +low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) +{ + u32 write_mode = NFS4_WRITE_MODE; + + if (flags & NFS4_ACL_DIR) + write_mode |= NFS4_ACE_DELETE_CHILD; + *mode = 0; + if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE) + *mode |= ACL_READ; + if ((perm & write_mode) == write_mode) + *mode |= ACL_WRITE; + if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE) + *mode |= ACL_EXECUTE; +} + +struct ace_container { + struct nfs4_ace *ace; + struct list_head ace_l; +}; + +static short ace2type(struct nfs4_ace *); +static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, + unsigned int); + +struct nfs4_acl * +nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, + unsigned int flags) +{ + struct nfs4_acl *acl; + int size = 0; + + if (pacl) { + if (posix_acl_valid(pacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*pacl->a_count; + } + if (dpacl) { + if (posix_acl_valid(dpacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*dpacl->a_count; + } + + /* Allocate for worst case: one (deny, allow) pair each: */ + acl = nfs4_acl_new(size); + if (acl == NULL) + return ERR_PTR(-ENOMEM); + + if (pacl) + _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + + if (dpacl) + _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); + + return acl; +} + +struct posix_acl_summary { + unsigned short owner; + unsigned short users; + unsigned short group; + unsigned short groups; + unsigned short other; + unsigned short mask; +}; + +static void +summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) +{ + struct posix_acl_entry *pa, *pe; + + /* + * Only pas.users and pas.groups need initialization; previous + * posix_acl_valid() calls ensure that the other fields will be + * initialized in the following loop. But, just to placate gcc: + */ + memset(pas, 0, sizeof(*pas)); + pas->mask = 07; + + pe = acl->a_entries + acl->a_count; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pas->owner = pa->e_perm; + break; + case ACL_GROUP_OBJ: + pas->group = pa->e_perm; + break; + case ACL_USER: + pas->users |= pa->e_perm; + break; + case ACL_GROUP: + pas->groups |= pa->e_perm; + break; + case ACL_OTHER: + pas->other = pa->e_perm; + break; + case ACL_MASK: + pas->mask = pa->e_perm; + break; + } + } + /* We'll only care about effective permissions: */ + pas->users &= pas->mask; + pas->group &= pas->mask; + pas->groups &= pas->mask; +} + +/* We assume the acl has been verified with posix_acl_valid. */ +static void +_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, + unsigned int flags) +{ + struct posix_acl_entry *pa, *group_owner_entry; + struct nfs4_ace *ace; + struct posix_acl_summary pas; + unsigned short deny; + int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? 
+ NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0); + + BUG_ON(pacl->a_count < 3); + summarize_posix_acl(pacl, &pas); + + pa = pacl->a_entries; + ace = acl->aces + acl->naces; + + /* We could deny everything not granted by the owner: */ + deny = ~pas.owner; + /* + * but it is equivalent (and simpler) to deny only what is not + * granted by later entries: + */ + deny &= pas.users | pas.group | pas.groups | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + } + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_USER) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.groups | pas.group | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + pa++; + } + + /* In the case of groups, we apply allow ACEs first, then deny ACEs, + * since a user can be in more than one group. */ + + /* allow ACEs */ + + group_owner_entry = pa; + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pas.group, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_GROUP) { + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + pa++; + } + + /* deny ACEs */ + + pa = group_owner_entry; + + deny = ~pas.group & pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + } + pa++; + + while (pa->e_tag == ACL_GROUP) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } + pa++; + } + + if (pa->e_tag == ACL_MASK) + pa++; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags); + ace->whotype = NFS4_ACL_WHO_EVERYONE; + acl->naces++; +} + +static void +sort_pacl_range(struct posix_acl *pacl, int start, int end) { + int sorted = 0, i; + struct posix_acl_entry tmp; + + /* We just do a bubble sort; easy to do in place, and we're not + * expecting acl's to be long enough to justify anything more. 
*/ + while (!sorted) { + sorted = 1; + for (i = start; i < end; i++) { + if (pacl->a_entries[i].e_id + > pacl->a_entries[i+1].e_id) { + sorted = 0; + tmp = pacl->a_entries[i]; + pacl->a_entries[i] = pacl->a_entries[i+1]; + pacl->a_entries[i+1] = tmp; + } + } + } +} + +static void +sort_pacl(struct posix_acl *pacl) +{ + /* posix_acl_valid requires that users and groups be in order + * by uid/gid. */ + int i, j; + + if (pacl->a_count <= 4) + return; /* no users or groups */ + i = 1; + while (pacl->a_entries[i].e_tag == ACL_USER) + i++; + sort_pacl_range(pacl, 1, i-1); + + BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); + j = ++i; + while (pacl->a_entries[j].e_tag == ACL_GROUP) + j++; + sort_pacl_range(pacl, i, j-1); + return; +} + +/* + * While processing the NFSv4 ACE, this maintains bitmasks representing + * which permission bits have been allowed and which denied to a given + * entity: */ +struct posix_ace_state { + u32 allow; + u32 deny; +}; + +struct posix_user_ace_state { + uid_t uid; + struct posix_ace_state perms; +}; + +struct posix_ace_state_array { + int n; + struct posix_user_ace_state aces[]; +}; + +/* + * While processing the NFSv4 ACE, this maintains the partial permissions + * calculated so far: */ + +struct posix_acl_state { + int empty; + struct posix_ace_state owner; + struct posix_ace_state group; + struct posix_ace_state other; + struct posix_ace_state everyone; + struct posix_ace_state mask; /* Deny unused in this case */ + struct posix_ace_state_array *users; + struct posix_ace_state_array *groups; +}; + +static int +init_state(struct posix_acl_state *state, int cnt) +{ + int alloc; + + memset(state, 0, sizeof(struct posix_acl_state)); + state->empty = 1; + /* + * In the worst case, each individual acl could be for a distinct + * named user or group, but we don't no which, so we allocate + * enough space for either: + */ + alloc = sizeof(struct posix_ace_state_array) + + cnt*sizeof(struct posix_user_ace_state); + state->users = kzalloc(alloc, GFP_KERNEL); + if (!state->users) + return -ENOMEM; + state->groups = kzalloc(alloc, GFP_KERNEL); + if (!state->groups) { + kfree(state->users); + return -ENOMEM; + } + return 0; +} + +static void +free_state(struct posix_acl_state *state) { + kfree(state->users); + kfree(state->groups); +} + +static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate) +{ + state->mask.allow |= astate->allow; +} + +/* + * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS, + * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate + * to traditional read/write/execute permissions. + * + * It's problematic to reject acls that use certain mode bits, because it + * places the burden on users to learn the rules about which bits one + * particular server sets, without giving the user a lot of help--we return an + * error that could mean any number of different things. To make matters + * worse, the problematic bits might be introduced by some application that's + * automatically mapping from some other acl model. + * + * So wherever possible we accept anything, possibly erring on the side of + * denying more permissions than necessary. 
+ * + * However we do reject *explicit* DENY's of a few bits representing + * permissions we could never deny: + */ + +static inline int check_deny(u32 mask, int isowner) +{ + if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL)) + return -EINVAL; + if (!isowner) + return 0; + if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)) + return -EINVAL; + return 0; +} + +static struct posix_acl * +posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) +{ + struct posix_acl_entry *pace; + struct posix_acl *pacl; + int nace; + int i, error = 0; + + /* + * ACLs with no ACEs are treated differently in the inheritable + * and effective cases: when there are no inheritable ACEs, we + * set a zero-length default posix acl: + */ + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { + pacl = posix_acl_alloc(0, GFP_KERNEL); + return pacl ? pacl : ERR_PTR(-ENOMEM); + } + /* + * When there are no effective ACEs, the following will end + * up setting a 3-element effective posix ACL with all + * permissions zero. + */ + nace = 4 + state->users->n + state->groups->n; + pacl = posix_acl_alloc(nace, GFP_KERNEL); + if (!pacl) + return ERR_PTR(-ENOMEM); + + pace = pacl->a_entries; + pace->e_tag = ACL_USER_OBJ; + error = check_deny(state->owner.deny, 1); + if (error) + goto out_err; + low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + for (i=0; i < state->users->n; i++) { + pace++; + pace->e_tag = ACL_USER; + error = check_deny(state->users->aces[i].perms.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->users->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_id = state->users->aces[i].uid; + add_to_mask(state, &state->users->aces[i].perms); + } + + pace++; + pace->e_tag = ACL_GROUP_OBJ; + error = check_deny(state->group.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + add_to_mask(state, &state->group); + + for (i=0; i < state->groups->n; i++) { + pace++; + pace->e_tag = ACL_GROUP; + error = check_deny(state->groups->aces[i].perms.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->groups->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_id = state->groups->aces[i].uid; + add_to_mask(state, &state->groups->aces[i].perms); + } + + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + pace++; + pace->e_tag = ACL_OTHER; + error = check_deny(state->other.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + return pacl; +out_err: + posix_acl_release(pacl); + return ERR_PTR(error); +} + +static inline void allow_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Allow all bits in the mask not already denied: */ + astate->allow |= mask & ~astate->deny; +} + +static inline void deny_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Deny all bits in the mask not already allowed: */ + astate->deny |= mask & ~astate->allow; +} + +static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid) +{ + int i; + + for (i = 0; i < a->n; i++) + if (a->aces[i].uid == uid) + return i; + /* Not found: */ + a->n++; + a->aces[i].uid = uid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + +static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) 
+{ + int i; + + for (i=0; i < a->n; i++) + deny_bits(&a->aces[i].perms, mask); +} + +static void allow_bits_array(struct posix_ace_state_array *a, u32 mask) +{ + int i; + + for (i=0; i < a->n; i++) + allow_bits(&a->aces[i].perms, mask); +} + +static void process_one_v4_ace(struct posix_acl_state *state, + struct nfs4_ace *ace) +{ + u32 mask = ace->access_mask; + int i; + + state->empty = 0; + + switch (ace2type(ace)) { + case ACL_USER_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + } else { + deny_bits(&state->owner, mask); + } + break; + case ACL_USER: + i = find_uid(state, state->users, ace->who); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->users->aces[i].perms, mask); + } else { + deny_bits(&state->users->aces[i].perms, mask); + mask = state->users->aces[i].perms.deny; + deny_bits(&state->owner, mask); + } + break; + case ACL_GROUP_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->group, mask); + } else { + deny_bits(&state->group, mask); + mask = state->group.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_GROUP: + i = find_uid(state, state->groups, ace->who); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->groups->aces[i].perms, mask); + } else { + deny_bits(&state->groups->aces[i].perms, mask); + mask = state->groups->aces[i].perms.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_OTHER: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + allow_bits(&state->group, mask); + allow_bits(&state->other, mask); + allow_bits(&state->everyone, mask); + allow_bits_array(state->users, mask); + allow_bits_array(state->groups, mask); + } else { + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->other, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + } +} + +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, + struct posix_acl **dpacl, unsigned int flags) +{ + struct posix_acl_state effective_acl_state, default_acl_state; + struct nfs4_ace *ace; + int ret; + + ret = init_state(&effective_acl_state, acl->naces); + if (ret) + return ret; + ret = init_state(&default_acl_state, acl->naces); + if (ret) + goto out_estate; + ret = -EINVAL; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && + ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) + goto out_dstate; + if (ace->flag & ~NFS4_SUPPORTED_FLAGS) + goto out_dstate; + if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) { + process_one_v4_ace(&effective_acl_state, ace); + continue; + } + if (!(flags & NFS4_ACL_DIR)) + goto out_dstate; + /* + * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT + * is set, we're effectively turning on the other. That's OK, + * according to rfc 3530. 
+ */ + process_one_v4_ace(&default_acl_state, ace); + + if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) + process_one_v4_ace(&effective_acl_state, ace); + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); + if (IS_ERR(*pacl)) { + ret = PTR_ERR(*pacl); + *pacl = NULL; + goto out_dstate; + } + *dpacl = posix_state_to_acl(&default_acl_state, + flags | NFS4_ACL_TYPE_DEFAULT); + if (IS_ERR(*dpacl)) { + ret = PTR_ERR(*dpacl); + *dpacl = NULL; + posix_acl_release(*pacl); + *pacl = NULL; + goto out_dstate; + } + sort_pacl(*pacl); + sort_pacl(*dpacl); + ret = 0; +out_dstate: + free_state(&default_acl_state); +out_estate: + free_state(&effective_acl_state); + return ret; +} + +static short +ace2type(struct nfs4_ace *ace) +{ + switch (ace->whotype) { + case NFS4_ACL_WHO_NAMED: + return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ? + ACL_GROUP : ACL_USER); + case NFS4_ACL_WHO_OWNER: + return ACL_USER_OBJ; + case NFS4_ACL_WHO_GROUP: + return ACL_GROUP_OBJ; + case NFS4_ACL_WHO_EVERYONE: + return ACL_OTHER; + } + BUG(); + return -1; +} + +EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); +EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); + +struct nfs4_acl * +nfs4_acl_new(int n) +{ + struct nfs4_acl *acl; + + acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL); + if (acl == NULL) + return NULL; + acl->naces = 0; + return acl; +} + +static struct { + char *string; + int stringlen; + int type; +} s2t_map[] = { + { + .string = "OWNER@", + .stringlen = sizeof("OWNER@") - 1, + .type = NFS4_ACL_WHO_OWNER, + }, + { + .string = "GROUP@", + .stringlen = sizeof("GROUP@") - 1, + .type = NFS4_ACL_WHO_GROUP, + }, + { + .string = "EVERYONE@", + .stringlen = sizeof("EVERYONE@") - 1, + .type = NFS4_ACL_WHO_EVERYONE, + }, +}; + +int +nfs4_acl_get_whotype(char *p, u32 len) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].stringlen == len && + 0 == memcmp(s2t_map[i].string, p, len)) + return s2t_map[i].type; + } + return NFS4_ACL_WHO_NAMED; +} + +int +nfs4_acl_write_who(int who, char *p) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].type == who) { + memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); + return s2t_map[i].stringlen; + } + } + BUG(); + return -1; +} + +EXPORT_SYMBOL(nfs4_acl_new); +EXPORT_SYMBOL(nfs4_acl_get_whotype); +EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c new file mode 100644 index 00000000..02eb4edf --- /dev/null +++ b/fs/nfsd/nfs4callback.c @@ -0,0 +1,1026 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include "nfsd.h" +#include "state.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define NFSPROC4_CB_NULL 0 +#define NFSPROC4_CB_COMPOUND 1 + +/* Index of predefined Linux callback client operations */ + +enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, +}; + +#define NFS4_MAXTAGLEN 20 + +#define NFS4_enc_cb_null_sz 0 +#define NFS4_dec_cb_null_sz 0 +#define cb_compound_enc_hdr_sz 4 +#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + +#define op_enc_sz 1 +#define op_dec_sz 2 +#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) +#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_stateid_sz + \ + enc_nfs4_fh_sz) + +#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) + +struct nfs4_cb_compound_hdr { + /* args */ + u32 ident; /* minorversion 0 only */ + u32 nops; + __be32 *nops_p; + u32 minorversion; + /* res */ + int status; +}; + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +static __be32 *xdr_encode_empty_array(__be32 *p) +{ + *p++ = xdr_zero; + return p; +} + +/* + * Encode/decode NFSv4 CB basic data types + * + * Basic NFSv4 callback data types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section + * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version + * 1 Protocol" + */ + +/* + * nfs_cb_opnum4 + * + * enum nfs_cb_opnum4 { + * OP_CB_GETATTR = 3, + * ... 
+ * }; + */ +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044 +}; + +static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(op); +} + +/* + * nfs_fh4 + * + * typedef opaque nfs_fh4; + */ +static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) +{ + u32 length = fh->fh_size; + __be32 *p; + + BUG_ON(length > NFS4_FHSIZE); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, &fh->fh_base, length); +} + +/* + * stateid4 + * + * struct stateid4 { + * uint32_t seqid; + * opaque other[12]; + * }; + */ +static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(sid->si_generation); + xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); +} + +/* + * sessionid4 + * + * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; + */ +static void encode_sessionid4(struct xdr_stream *xdr, + const struct nfsd4_session *session) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); +} + +/* + * nfsstat4 + */ +static const struct { + int stat; + int errno; +} nfs_cb_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -EIO }, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * If we cannot translate the error, the recovery routines should + * handle it. + * + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. 
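/*
 * A user-space sketch of the XDR sizing convention behind the *_sz macros
 * and encode_nfs_fh4()/encode_stateid4() above: a variable-length opaque
 * costs one 32-bit length word plus its data rounded up to a 4-byte
 * boundary, while a fixed-length opaque (such as stateid4's "other" field)
 * carries no length word.  This is plain RFC 4506 XDR and does not use the
 * kernel's xdr_stream API.
 */
#include <assert.h>
#include <stddef.h>

static size_t xdr_opaque_bytes(size_t len)
{
        return 4 + ((len + 3) & ~(size_t)3);    /* length word + padded data */
}

int main(void)
{
        assert(xdr_opaque_bytes(0)   == 4);     /* empty array: just the count */
        assert(xdr_opaque_bytes(5)   == 12);    /* 5 data bytes pad out to 8   */
        assert(xdr_opaque_bytes(128) == 132);   /* a maximum-size file handle  */
        return 0;
}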
+ */ +static int nfs_cb_stat_to_errno(int status) +{ + int i; + + for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { + if (nfs_cb_errtbl[i].stat == status) + return nfs_cb_errtbl[i].errno; + } + + dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); + return -status; +} + +static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, + enum nfsstat4 *status) +{ + __be32 *p; + u32 op; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + op = be32_to_cpup(p++); + if (unlikely(op != expected)) + goto out_unexpected; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_unexpected: + dprintk("NFSD: Callback server returned operation %d but " + "we issued a request for %d\n", op, expected); + return -EIO; +} + +/* + * CB_COMPOUND4args + * + * struct CB_COMPOUND4args { + * utf8str_cs tag; + * uint32_t minorversion; + * uint32_t callback_ident; + * nfs_cb_argop4 argarray<>; + * }; +*/ +static void encode_cb_compound4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 * p; + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + p = xdr_encode_empty_array(p); /* empty tag */ + *p++ = cpu_to_be32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->ident); + + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); /* argarray element count */ +} + +/* + * Update argarray element count + */ +static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); + *hdr->nops_p = cpu_to_be32(hdr->nops); +} + +/* + * CB_COMPOUND4res + * + * struct CB_COMPOUND4res { + * nfsstat4 status; + * utf8str_cs tag; + * nfs_cb_resop4 resarray<>; + * }; + */ +static int decode_cb_compound4res(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + /* Ignore the tag */ + length = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, length + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->nops = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * CB_RECALL4args + * + * struct CB_RECALL4args { + * stateid4 stateid; + * bool truncate; + * nfs_fh4 fh; + * }; + */ +static void encode_cb_recall4args(struct xdr_stream *xdr, + const struct nfs4_delegation *dp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); + encode_stateid4(xdr, &dp->dl_stateid); + + p = xdr_reserve_space(xdr, 4); + *p++ = xdr_zero; /* truncate */ + + encode_nfs_fh4(xdr, &dp->dl_fh); + + hdr->nops++; +} + +/* + * CB_SEQUENCE4args + * + * struct CB_SEQUENCE4args { + * sessionid4 csa_sessionid; + * sequenceid4 csa_sequenceid; + * slotid4 csa_slotid; + * slotid4 csa_highest_slotid; + * bool csa_cachethis; + * referring_call_list4 csa_referring_call_lists<>; + * }; + */ +static void encode_cb_sequence4args(struct xdr_stream *xdr, + const struct nfsd4_callback *cb, + struct nfs4_cb_compound_hdr *hdr) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + __be32 *p; + + if (hdr->minorversion == 0) + return; + + encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); + encode_sessionid4(xdr, session); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ + *p++ = xdr_zero; /* csa_slotid */ + *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = xdr_zero; /* csa_cachethis */ + 
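/*
 * Sketch of the two-step op-count encoding used by
 * encode_cb_compound4args() and encode_cb_nops() above: the compound
 * header leaves a slot for the argarray count and remembers where it is,
 * each encoded operation bumps a counter, and the count is patched in once
 * encoding is finished.  Host-order stores into a flat array stand in for
 * the kernel's xdr_reserve_space() and cpu_to_be32() here.
 */
#include <assert.h>
#include <stdint.h>

struct cb_enc_sketch {
        uint32_t buf[16];
        unsigned int next;      /* next free word                         */
        unsigned int nops;      /* operations encoded so far              */
        unsigned int nops_slot; /* index of the count word to patch later */
};

static void sketch_enc_header(struct cb_enc_sketch *e)
{
        e->nops_slot = e->next++;               /* reserve the count slot */
}

static void sketch_enc_op(struct cb_enc_sketch *e, uint32_t opnum)
{
        e->buf[e->next++] = opnum;
        e->nops++;
}

static void sketch_enc_finish(struct cb_enc_sketch *e)
{
        e->buf[e->nops_slot] = e->nops;         /* backpatch the count */
}

int main(void)
{
        struct cb_enc_sketch e = { .next = 0 };

        sketch_enc_header(&e);
        sketch_enc_op(&e, 11);                  /* OP_CB_SEQUENCE */
        sketch_enc_op(&e, 4);                   /* OP_CB_RECALL   */
        sketch_enc_finish(&e);
        assert(e.buf[0] == 2);
        return 0;
}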
xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + hdr->nops++; +} + +/* + * CB_SEQUENCE4resok + * + * struct CB_SEQUENCE4resok { + * sessionid4 csr_sessionid; + * sequenceid4 csr_sequenceid; + * slotid4 csr_slotid; + * slotid4 csr_highest_slotid; + * slotid4 csr_target_highest_slotid; + * }; + * + * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { + * case NFS4_OK: + * CB_SEQUENCE4resok csr_resok4; + * default: + * void; + * }; + * + * Our current back channel implmentation supports a single backchannel + * with a single slot. + */ +static int decode_cb_sequence4resok(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfs4_sessionid id; + int status; + __be32 *p; + u32 dummy; + + status = -ESERVERFAULT; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); + if (memcmp(id.data, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) { + dprintk("NFS: %s Invalid session id\n", __func__); + goto out; + } + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + + dummy = be32_to_cpup(p++); + if (dummy != session->se_cb_seq_nr) { + dprintk("NFS: %s Invalid sequence number\n", __func__); + goto out; + } + + dummy = be32_to_cpup(p++); + if (dummy != 0) { + dprintk("NFS: %s Invalid slotid\n", __func__); + goto out; + } + + /* + * FIXME: process highest slotid and target highest slotid + */ + status = 0; +out: + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_cb_sequence4res(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + enum nfsstat4 nfserr; + int status; + + if (cb->cb_minorversion == 0) + return 0; + + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + goto out_default; + status = decode_cb_sequence4resok(xdr, cb); +out: + return status; +out_default: + return nfs_cb_stat_to_errno(nfserr); +} + +/* + * NFSv4.0 and NFSv4.1 XDR encode functions + * + * NFSv4.0 callback argument types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +/* + * NB: Without this zero space reservation, callbacks over krb5p fail + */ +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + xdr_reserve_space(xdr, 0); +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_delegation *args = cb->cb_op; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, args, &hdr); + encode_cb_nops(&hdr); +} + + +/* + * NFSv4.0 and NFSv4.1 XDR decode functions + * + * NFSv4.0 callback result types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". 
+ */ + +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + return 0; +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + enum nfsstat4 nfserr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + goto out; + + if (cb != NULL) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status)) + goto out; + } + + status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + status = nfs_cb_stat_to_errno(nfserr); +out: + return status; +} + +/* + * RPC procedure tables + */ +#define PROC(proc, call, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_CB_##call, \ + .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \ + .p_arglen = NFS4_enc_##argtype##_sz, \ + .p_replen = NFS4_dec_##restype##_sz, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ +} + +static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), +}; + +static struct rpc_version nfs_cb_version4 = { +/* + * Note on the callback rpc program version number: despite language in rfc + * 5661 section 18.36.3 requiring servers to use 4 in this field, the + * official xdr descriptions for both 4.0 and 4.1 specify version 1, and + * in practice that appears to be what implementations use. The section + * 18.36.3 language is expected to be fixed in an erratum. + */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures +}; + +static struct rpc_version *nfs_cb_version[] = { + &nfs_cb_version4, +}; + +static struct rpc_program cb_program; + +static struct rpc_stat cb_stats = { + .program = &cb_program +}; + +#define NFS4_CALLBACK 0x40000000 +static struct rpc_program cb_program = { + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, + .pipe_dir_name = "/nfsd4_cb", +}; + +static int max_cb_time(void) +{ + return max(nfsd4_lease/10, (time_t)1) * HZ; +} + + +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) +{ + struct rpc_timeout timeparms = { + .to_initval = max_cb_time(), + .to_retries = 0, + }; + struct rpc_create_args args = { + .net = &init_net, + .address = (struct sockaddr *) &conn->cb_addr, + .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, + .timeout = &timeparms, + .program = &cb_program, + .version = 0, + .authflavor = clp->cl_flavor, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + }; + struct rpc_clnt *client; + + if (clp->cl_minorversion == 0) { + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + args.client_name = clp->cl_principal; + args.prognumber = conn->cb_prog, + args.protocol = XPRT_TRANSPORT_TCP; + clp->cl_cb_ident = conn->cb_ident; + } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_session = ses; + args.bc_xprt = conn->cb_xprt; + args.prognumber = clp->cl_cb_session->se_cb_prog; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } + /* Create RPC client */ + client = rpc_create(&args); + if (IS_ERR(client)) { + dprintk("NFSD: 
couldn't create callback client: %ld\n", + PTR_ERR(client)); + return PTR_ERR(client); + } + clp->cl_cb_client = client; + return 0; + +} + +static void warn_no_callback_path(struct nfs4_client *clp, int reason) +{ + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, reason); +} + +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_DOWN; + warn_no_callback_path(clp, reason); +} + +static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + if (task->tk_status) + nfsd4_mark_cb_down(clp, task->tk_status); + else + clp->cl_cb_state = NFSD4_CB_UP; +} + +static const struct rpc_call_ops nfsd4_cb_probe_ops = { + /* XXX: release method to ensure we set the cb channel down if + * necessary on early failure? */ + .rpc_call_done = nfsd4_cb_probe_done, +}; + +static struct rpc_cred *callback_cred; + +int set_callback_cred(void) +{ + if (callback_cred) + return 0; + callback_cred = rpc_lookup_machine_cred(); + if (!callback_cred) + return -ENOMEM; + return 0; +} + +static struct workqueue_struct *callback_wq; + +static void run_nfsd4_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); +} + +static void do_probe_callback(struct nfs4_client *clp) +{ + struct nfsd4_callback *cb = &clp->cl_cb_null; + + cb->cb_op = NULL; + cb->cb_clp = clp; + + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; + cb->cb_msg.rpc_argp = NULL; + cb->cb_msg.rpc_resp = NULL; + cb->cb_msg.rpc_cred = callback_cred; + + cb->cb_ops = &nfsd4_cb_probe_ops; + + run_nfsd4_cb(cb); +} + +/* + * Poke the callback thread to process any updates to the callback + * parameters, and send a null probe. + */ +void nfsd4_probe_callback(struct nfs4_client *clp) +{ + /* XXX: atomicity? Also, should we be using cl_cb_flags? */ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + do_probe_callback(clp); +} + +void nfsd4_probe_callback_sync(struct nfs4_client *clp) +{ + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} + +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + spin_lock(&clp->cl_lock); + memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); + spin_unlock(&clp->cl_lock); +} + +/* + * There's currently a single callback channel slot. + * If the slot is available, then mark it busy. Otherwise, set the + * thread for sleeping on the callback RPC wait queue. + */ +static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) +{ + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); + dprintk("%s slot is busy\n", __func__); + return false; + } + return true; +} + +/* + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
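/*
 * Sketch of the one-slot callback slot table taken in nfsd41_cb_get_slot()
 * above and released again in nfsd4_cb_done(): a single busy bit is
 * claimed with an atomic test-and-set before CB_SEQUENCE is sent, and
 * cleared (waking the next waiter) once the reply has been handled.  The
 * C11 atomic flag below is a stand-in for the kernel's test_and_set_bit()
 * plus the RPC wait queue, so the wake-up side is only described in the
 * comment, not implemented.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag cb_slot_busy_sketch = ATOMIC_FLAG_INIT;

static bool sketch_get_slot(void)
{
        /* false means: go to sleep and retry once the slot is released */
        return !atomic_flag_test_and_set(&cb_slot_busy_sketch);
}

static void sketch_put_slot(void)
{
        atomic_flag_clear(&cb_slot_busy_sketch); /* ...then wake the next waiter */
}

int main(void)
{
        if (sketch_get_slot()) {
                /* ...encode and send CB_SEQUENCE + CB_RECALL here... */
                sketch_put_slot();
        }
        return 0;
}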
+ */ +static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + struct nfs4_client *clp = dp->dl_client; + u32 minorversion = clp->cl_minorversion; + + cb->cb_minorversion = minorversion; + if (minorversion) { + if (!nfsd41_cb_get_slot(clp, task)) + return; + } + spin_lock(&clp->cl_lock); + if (list_empty(&cb->cb_per_client)) { + /* This is the first call, not a restart */ + cb->cb_done = false; + list_add(&cb->cb_per_client, &clp->cl_callbacks); + } + spin_unlock(&clp->cl_lock); + rpc_call_start(task); +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + struct nfs4_client *clp = dp->dl_client; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + + if (clp->cl_minorversion) { + /* No need for lock, access serialized in nfsd4_cb_prepare */ + ++clp->cl_cb_session->se_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); + + /* We're done looking into the sequence information */ + task->tk_msg.rpc_resp = NULL; + } +} + + +static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + + nfsd4_cb_done(task, calldata); + + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave + * it to nfsd4_process_cb_update to restart the call if + * necessary. */ + return; + } + + if (cb->cb_done) + return; + switch (task->tk_status) { + case 0: + cb->cb_done = true; + return; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall + * before open reply granting delegation */ + break; + default: + /* Network partition? 
*/ + nfsd4_mark_cb_down(clp, task->tk_status); + } + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; + } + nfsd4_mark_cb_down(clp, task->tk_status); + cb->cb_done = true; +} + +static void nfsd4_cb_recall_release(void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + + if (cb->cb_done) { + spin_lock(&clp->cl_lock); + list_del(&cb->cb_per_client); + spin_unlock(&clp->cl_lock); + nfs4_put_delegation(dp); + } +} + +static const struct rpc_call_ops nfsd4_cb_recall_ops = { + .rpc_call_prepare = nfsd4_cb_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, +}; + +int nfsd4_create_callback_queue(void) +{ + callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + if (!callback_wq) + return -ENOMEM; + return 0; +} + +void nfsd4_destroy_callback_queue(void) +{ + destroy_workqueue(callback_wq); +} + +/* must be called under the state lock */ +void nfsd4_shutdown_callback(struct nfs4_client *clp) +{ + set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags); + /* + * Note this won't actually result in a null callback; + * instead, nfsd4_do_callback_rpc() will detect the killed + * client, destroy the rpc client, and stop: + */ + do_probe_callback(clp); + flush_workqueue(callback_wq); +} + +static void nfsd4_release_cb(struct nfsd4_callback *cb) +{ + if (cb->cb_ops->rpc_release) + cb->cb_ops->rpc_release(cb); +} + +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +static void nfsd4_process_cb_update(struct nfsd4_callback *cb) +{ + struct nfs4_cb_conn conn; + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; + int err; + + /* + * This is either an update, or the client dying; in either case, + * kill the old client: + */ + if (clp->cl_cb_client) { + rpc_shutdown_client(clp->cl_cb_client); + clp->cl_cb_client = NULL; + } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } + if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) + return; + spin_lock(&clp->cl_lock); + /* + * Only serialized callback code is allowed to clear these + * flags; main nfsd code can only set them: + */ + BUG_ON(!clp->cl_cb_flags); + clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } + spin_unlock(&clp->cl_lock); + + err = setup_callback_client(clp, &conn, ses); + if (err) { + warn_no_callback_path(clp, err); + return; + } + /* Yay, the callback channel's back! 
Restart any callbacks: */ + list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) + run_nfsd4_cb(cb); +} + +void nfsd4_do_callback_rpc(struct work_struct *w) +{ + struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); + struct nfs4_client *clp = cb->cb_clp; + struct rpc_clnt *clnt; + + if (clp->cl_cb_flags) + nfsd4_process_cb_update(cb); + + clnt = clp->cl_cb_client; + if (!clnt) { + /* Callback channel broken, or client killed; give up: */ + nfsd4_release_cb(cb); + return; + } + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + cb->cb_ops, cb); +} + +void nfsd4_cb_recall(struct nfs4_delegation *dp) +{ + struct nfsd4_callback *cb = &dp->dl_recall; + struct nfs4_client *clp = dp->dl_client; + + dp->dl_retries = 1; + cb->cb_op = dp; + cb->cb_clp = clp; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; + cb->cb_msg.rpc_argp = cb; + cb->cb_msg.rpc_resp = cb; + cb->cb_msg.rpc_cred = callback_cred; + + cb->cb_ops = &nfsd4_cb_recall_ops; + dp->dl_retries = 1; + + INIT_LIST_HEAD(&cb->cb_per_client); + cb->cb_done = true; + + run_nfsd4_cb(&dp->dl_recall); +} diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c new file mode 100644 index 00000000..55780a22 --- /dev/null +++ b/fs/nfsd/nfs4idmap.c @@ -0,0 +1,587 @@ +/* + * Mapping of UID/GIDs to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include "idmap.h" +#include "nfsd.h" + +/* + * Cache entry + */ + +/* + * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on + * that. 
+ */ + +#define IDMAP_TYPE_USER 0 +#define IDMAP_TYPE_GROUP 1 + +struct ent { + struct cache_head h; + int type; /* User / Group */ + uid_t id; + char name[IDMAP_NAMESZ]; + char authname[IDMAP_NAMESZ]; +}; + +/* Common entry handling */ + +#define ENT_HASHBITS 8 +#define ENT_HASHMAX (1 << ENT_HASHBITS) + +static void +ent_init(struct cache_head *cnew, struct cache_head *citm) +{ + struct ent *new = container_of(cnew, struct ent, h); + struct ent *itm = container_of(citm, struct ent, h); + + new->id = itm->id; + new->type = itm->type; + + strlcpy(new->name, itm->name, sizeof(new->name)); + strlcpy(new->authname, itm->authname, sizeof(new->name)); +} + +static void +ent_put(struct kref *ref) +{ + struct ent *map = container_of(ref, struct ent, h.ref); + kfree(map); +} + +static struct cache_head * +ent_alloc(void) +{ + struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); + if (e) + return &e->h; + else + return NULL; +} + +/* + * ID -> Name cache + */ + +static struct cache_head *idtoname_table[ENT_HASHMAX]; + +static uint32_t +idtoname_hash(struct ent *ent) +{ + uint32_t hash; + + hash = hash_str(ent->authname, ENT_HASHBITS); + hash = hash_long(hash ^ ent->id, ENT_HASHBITS); + + /* Flip LSB for user/group */ + if (ent->type == IDMAP_TYPE_GROUP) + hash ^= 1; + + return hash; +} + +static void +idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + char idstr[11]; + + qword_add(bpp, blen, ent->authname); + snprintf(idstr, sizeof(idstr), "%u", ent->id); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); + qword_add(bpp, blen, idstr); + + (*bpp)[-1] = '\n'; +} + +static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); +} + +static int +idtoname_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->id == b->id && a->type == b->type && + strcmp(a->authname, b->authname) == 0); +} + +static int +idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type id [name]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %u", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->id); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %s", ent->name); + seq_printf(m, "\n"); + return 0; +} + +static void +warn_no_idmapd(struct cache_detail *detail, int has_died) +{ + printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", + has_died ? 
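/*
 * Sketch of the request line that idtoname_request() above writes to the
 * "nfs4.idtoname" cache channel for user-space idmapd: the authentication
 * domain, "user" or "group", and the numeric id, space-separated and
 * newline-terminated.  The real qword_add() also escapes embedded spaces
 * and non-printable bytes; this simplified version does not.
 */
#include <stdio.h>

static int sketch_idtoname_request(char *buf, size_t len,
                                   const char *authname, int is_group,
                                   unsigned int id)
{
        return snprintf(buf, len, "%s %s %u\n",
                        authname, is_group ? "group" : "user", id);
}

int main(void)
{
        char line[128];

        sketch_idtoname_request(line, sizeof(line), "example.org", 0, 1000);
        fputs(line, stdout);            /* "example.org user 1000" */
        return 0;
}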
"died" : "not been started"); +} + + +static int idtoname_parse(struct cache_detail *, char *, int); +static struct ent *idtoname_lookup(struct ent *); +static struct ent *idtoname_update(struct ent *, struct ent *); + +static struct cache_detail idtoname_cache = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .hash_table = idtoname_table, + .name = "nfs4.idtoname", + .cache_put = ent_put, + .cache_upcall = idtoname_upcall, + .cache_parse = idtoname_parse, + .cache_show = idtoname_show, + .warn_no_listener = warn_no_idmapd, + .match = idtoname_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +idtoname_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1, *bp; + int len; + int error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? + IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* ID */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.id = simple_strtoul(buf1, &bp, 10); + if (bp == buf1) + goto out; + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + error = -ENOMEM; + res = idtoname_lookup(&ent); + if (!res) + goto out; + + /* Name */ + error = -EINVAL; + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len < 0) + goto out; + if (len == 0) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + else if (len >= IDMAP_NAMESZ) + goto out; + else + memcpy(ent.name, buf1, sizeof(ent.name)); + error = -ENOMEM; + res = idtoname_update(&ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, &idtoname_cache); + + error = 0; +out: + kfree(buf1); + + return error; +} + + +static struct ent * +idtoname_lookup(struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(&idtoname_cache, + &item->h, + idtoname_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +idtoname_update(struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(&idtoname_cache, + &new->h, &old->h, + idtoname_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + + +/* + * Name -> ID cache + */ + +static struct cache_head *nametoid_table[ENT_HASHMAX]; + +static inline int +nametoid_hash(struct ent *ent) +{ + return hash_str(ent->name, ENT_HASHBITS); +} + +static void +nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + + qword_add(bpp, blen, ent->authname); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? 
"group" : "user"); + qword_add(bpp, blen, ent->name); + + (*bpp)[-1] = '\n'; +} + +static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); +} + +static int +nametoid_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->type == b->type && strcmp(a->name, b->name) == 0 && + strcmp(a->authname, b->authname) == 0); +} + +static int +nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type name [id]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %s", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->name); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %u", ent->id); + seq_printf(m, "\n"); + return 0; +} + +static struct ent *nametoid_lookup(struct ent *); +static struct ent *nametoid_update(struct ent *, struct ent *); +static int nametoid_parse(struct cache_detail *, char *, int); + +static struct cache_detail nametoid_cache = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .hash_table = nametoid_table, + .name = "nfs4.nametoid", + .cache_put = ent_put, + .cache_upcall = nametoid_upcall, + .cache_parse = nametoid_parse, + .cache_show = nametoid_show, + .warn_no_listener = warn_no_idmapd, + .match = nametoid_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +nametoid_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1; + int error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? 
+ IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* Name */ + error = qword_get(&buf, buf1, PAGE_SIZE); + if (error <= 0 || error >= IDMAP_NAMESZ) + goto out; + memcpy(ent.name, buf1, sizeof(ent.name)); + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + /* ID */ + error = get_int(&buf, &ent.id); + if (error == -EINVAL) + goto out; + if (error == -ENOENT) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + + error = -ENOMEM; + res = nametoid_lookup(&ent); + if (res == NULL) + goto out; + res = nametoid_update(&ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, &nametoid_cache); + error = 0; +out: + kfree(buf1); + + return (error); +} + + +static struct ent * +nametoid_lookup(struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(&nametoid_cache, + &item->h, + nametoid_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +nametoid_update(struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(&nametoid_cache, + &new->h, &old->h, + nametoid_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +/* + * Exported API + */ + +int +nfsd_idmap_init(void) +{ + int rv; + + rv = cache_register(&idtoname_cache); + if (rv) + return rv; + rv = cache_register(&nametoid_cache); + if (rv) + cache_unregister(&idtoname_cache); + return rv; +} + +void +nfsd_idmap_shutdown(void) +{ + cache_unregister(&idtoname_cache); + cache_unregister(&nametoid_cache); +} + +static int +idmap_lookup(struct svc_rqst *rqstp, + struct ent *(*lookup_fn)(struct ent *), struct ent *key, + struct cache_detail *detail, struct ent **item) +{ + int ret; + + *item = lookup_fn(key); + if (!*item) + return -ENOMEM; + retry: + ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); + + if (ret == -ETIMEDOUT) { + struct ent *prev_item = *item; + *item = lookup_fn(key); + if (*item != prev_item) + goto retry; + cache_put(&(*item)->h, detail); + } + return ret; +} + +static char * +rqst_authname(struct svc_rqst *rqstp) +{ + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? 
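/*
 * Sketch of the retry loop in idmap_lookup() above: when cache_check()
 * reports -ETIMEDOUT, a newer entry for the same key may have been
 * installed in the meantime, so the key is looked up again and the check
 * is repeated only if a different entry comes back.  The cache_* helpers
 * and the entry type are hypothetical stand-ins for the sunrpc cache API.
 */
#include <errno.h>
#include <stddef.h>

struct entry_sketch;                                       /* stand-in for struct ent */
extern struct entry_sketch *cache_find(const void *key);   /* hypothetical */
extern int cache_validate(struct entry_sketch *e);         /* hypothetical */
extern void cache_release(struct entry_sketch *e);         /* hypothetical */

static int sketch_lookup(const void *key, struct entry_sketch **item)
{
        int ret;

        *item = cache_find(key);
        if (!*item)
                return -ENOMEM;
retry:
        ret = cache_validate(*item);
        if (ret == -ETIMEDOUT) {
                struct entry_sketch *prev = *item;

                *item = cache_find(key);        /* a newer entry may exist now */
                if (*item != prev)
                        goto retry;             /* it does: validate that one  */
                cache_release(*item);           /* it does not: give up        */
        }
        return ret;
}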
rqstp->rq_gssclient : rqstp->rq_client; + return clp->name; +} + +static __be32 +idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, + uid_t *id) +{ + struct ent *item, key = { + .type = type, + }; + int ret; + + if (namelen + 1 > sizeof(key.name)) + return nfserr_badowner; + memcpy(key.name, name, namelen); + key.name[namelen] = '\0'; + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); + if (ret == -ENOENT) + return nfserr_badowner; + if (ret) + return nfserrno(ret); + *id = item->id; + cache_put(&item->h, &nametoid_cache); + return 0; +} + +static int +idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +{ + struct ent *item, key = { + .id = id, + .type = type, + }; + int ret; + + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item); + if (ret == -ENOENT) + return sprintf(name, "%u", id); + if (ret) + return ret; + ret = strlen(item->name); + BUG_ON(ret > IDMAP_NAMESZ); + memcpy(name, item->name, ret); + cache_put(&item->h, &idtoname_cache); + return ret; +} + +__be32 +nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, + __u32 *id) +{ + return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); +} + +__be32 +nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, + __u32 *id) +{ + return idmap_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); +} + +int +nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +{ + return idmap_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); +} + +int +nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +{ + return idmap_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c new file mode 100644 index 00000000..d06a02c1 --- /dev/null +++ b/fs/nfsd/nfs4proc.c @@ -0,0 +1,1472 @@ +/* + * Server-side procedures for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include + +#include "cache.h" +#include "xdr4.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static u32 nfsd_attrmask[] = { + NFSD_WRITEABLE_ATTRS_WORD0, + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 +}; + +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + +static __be32 +check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + u32 *bmval, u32 *writable) +{ + struct dentry *dentry = cstate->current_fh.fh_dentry; + + /* + * Check about attributes are supported by the NFSv4 server or not. + * According to spec, unsupported attributes return ERR_ATTRNOTSUPP. + */ + if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) || + (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) || + (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) + return nfserr_attrnotsupp; + + /* + * Check FATTR4_WORD0_ACL can be supported + * in current environment or not. + */ + if (bmval[0] & FATTR4_WORD0_ACL) { + if (!IS_POSIXACL(dentry->d_inode)) + return nfserr_attrnotsupp; + } + + /* + * According to spec, read-only attributes return ERR_INVAL. + */ + if (writable) { + if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || + (bmval[2] & ~writable[2])) + return nfserr_inval; + } + + return nfs_ok; +} + +static __be32 +nfsd4_check_open_attributes(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + __be32 status = nfs_ok; + + if (open->op_create == NFS4_OPEN_CREATE) { + if (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd_attrmask); + else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd41_ex_attrmask); + } + + return status; +} + +static int +is_create_with_attrs(struct nfsd4_open *open) +{ + return open->op_create == NFS4_OPEN_CREATE + && (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED + || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); +} + +/* + * if error occurs when setting the acl, just clear the acl bit + * in the returned attr bitmap. 
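/*
 * A compact sketch of the bitmap test in check_attr_support() above: any
 * requested attribute word containing bits outside the supported mask is
 * rejected outright, and, when a writable mask is supplied, bits outside
 * it are rejected as invalid.  The masks and return codes are illustrative
 * values, not the real nfsd_suppattrs or NFSD_WRITEABLE_ATTRS constants.
 */
#include <assert.h>
#include <stdint.h>

enum { SK_OK = 0, SK_ATTRNOTSUPP = 1, SK_INVAL = 2 };

static int sketch_check_attrs(const uint32_t bm[3], const uint32_t supp[3],
                              const uint32_t writable[3] /* may be NULL */)
{
        for (int i = 0; i < 3; i++)
                if (bm[i] & ~supp[i])
                        return SK_ATTRNOTSUPP;
        if (writable)
                for (int i = 0; i < 3; i++)
                        if (bm[i] & ~writable[i])
                                return SK_INVAL;
        return SK_OK;
}

int main(void)
{
        uint32_t supp[3]     = { 0x00ff, 0x00ff, 0 };
        uint32_t writable[3] = { 0x000f, 0x00ff, 0 };
        uint32_t ask[3]      = { 0x0030, 0x0000, 0 };

        assert(sketch_check_attrs(ask, supp, writable) == SK_INVAL);
        return 0;
}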
+ */ +static void +do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl, u32 *bmval) +{ + __be32 status; + + status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); + if (status) + /* + * We should probably fail the whole open at this point, + * but we've already created the file, so it's too late; + * So this seems the least of evils: + */ + bmval[0] &= ~FATTR4_WORD0_ACL; +} + +static inline void +fh_dup2(struct svc_fh *dst, struct svc_fh *src) +{ + fh_put(dst); + dget(src->fh_dentry); + if (src->fh_export) + cache_get(&src->fh_export->h); + *dst = *src; +} + +static __be32 +do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) +{ + __be32 status; + + if (open->op_truncate && + !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + + accmode |= NFSD_MAY_READ_IF_EXEC; + + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) + accmode |= NFSD_MAY_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); + if (open->op_share_deny & NFS4_SHARE_DENY_READ) + accmode |= NFSD_MAY_WRITE; + + status = fh_verify(rqstp, current_fh, S_IFREG, accmode); + + return status; +} + +static __be32 +do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + struct svc_fh resfh; + __be32 status; + int created = 0; + + fh_init(&resfh, NFS4_FHSIZE); + open->op_truncate = 0; + + if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + + /* + * Note: create modes (UNCHECKED,GUARDED...) are the same + * in NFSv4 as in v3 except EXCLUSIVE4_1. 
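/*
 * Sketch of the verifier handling behind the EXCLUSIVE create modes
 * discussed above: the client supplies an 8-byte create verifier, and the
 * bitmask logic just below reports that it was stored in the new file's
 * time attributes so a retransmitted create can be recognized.  Splitting
 * the verifier across the timestamps as two 32-bit words (and which word
 * lands in which field) is an assumption about do_nfsd_create()'s
 * internals; the types here are simplified stand-ins.
 */
#include <stdint.h>
#include <string.h>

struct sketch_times { int64_t mtime_sec, atime_sec; };

static void sketch_store_verifier(struct sketch_times *t,
                                  const unsigned char verf[8])
{
        uint32_t v[2];

        memcpy(v, verf, sizeof(v));
        t->mtime_sec = v[0];            /* first verifier word  */
        t->atime_sec = v[1];            /* second verifier word */
}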
+ */ + status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, + open->op_fname.len, &open->op_iattr, + &resfh, open->op_createmode, + (u32 *)open->op_verf.data, + &open->op_truncate, &created); + + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: + */ + if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY); + } else { + status = nfsd_lookup(rqstp, current_fh, + open->op_fname.data, open->op_fname.len, &resfh); + fh_unlock(current_fh); + } + if (status) + goto out; + + if (is_create_with_attrs(open) && open->op_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval); + + set_change_info(&open->op_cinfo, current_fh); + fh_dup2(current_fh, &resfh); + + /* set reply cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); + if (!created) + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_NOP); + +out: + fh_put(&resfh); + return status; +} + +static __be32 +do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + __be32 status; + + /* Only reclaims from previously confirmed clients are valid */ + if ((status = nfs4_check_open_reclaim(&open->op_clientid))) + return status; + + /* We don't know the target directory, and therefore can not + * set the change info + */ + + memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); + + /* set replay cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + ¤t_fh->fh_handle); + + open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && + (open->op_iattr.ia_size == 0); + + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_OWNER_OVERRIDE); + + return status; +} + +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} + +static __be32 +nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + __be32 status; + struct nfsd4_compoundres *resp; + + dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + (int)open->op_fname.len, open->op_fname.data, + open->op_stateowner); + + /* This check required by spec. */ + if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) + return nfserr_inval; + + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + + nfs4_lock_state(); + + /* check seqid for replay. set nfs4_owner */ + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open); + if (status == nfserr_replay_me) { + struct nfs4_replay *rp = &open->op_stateowner->so_replay; + fh_put(&cstate->current_fh); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + dprintk("nfsd4_open: replay failed" + " restoring previous filehandle\n"); + else + status = nfserr_replay_me; + } + if (status) + goto out; + + status = nfsd4_check_open_attributes(rqstp, cstate, open); + if (status) + goto out; + + /* Openowner is now set, so sequence id will get bumped. 
Now we need + * these checks before we do any creates: */ + status = nfserr_grace; + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + /* + * (1) set CURRENT_FH to the file being opened, + * creating it if necessary, (2) set open->op_cinfo, + * (3) set open->op_truncate if the file is to be + * truncated after opening, (4) do permission checking. + */ + status = do_open_lookup(rqstp, &cstate->current_fh, + open); + if (status) + goto out; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + open->op_stateowner->so_confirmed = 1; + /* + * The CURRENT_FH is already set to the file being + * opened. (1) set open->op_cinfo, (2) set + * open->op_truncate if the file is to be truncated + * after opening, (3) do permission checking. + */ + status = do_open_fhandle(rqstp, &cstate->current_fh, + open); + if (status) + goto out; + break; + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + open->op_stateowner->so_confirmed = 1; + dprintk("NFSD: unsupported OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_notsupp; + goto out; + default: + dprintk("NFSD: Invalid OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_inval; + goto out; + } + /* + * nfsd4_process_open2() does the actual opening of the file. If + * successful, it (1) truncates the file if open->op_truncate was + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. + */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); +out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); + cstate->replay_owner = open->op_stateowner; + } + nfs4_unlock_state(); + return status; +} + +/* + * filehandle-manipulating ops. 
+ */ +static __be32 +nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct svc_fh **getfh) +{ + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + + *getfh = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_putfh *putfh) +{ + fh_put(&cstate->current_fh); + cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; + memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, + putfh->pf_fhlen); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); +} + +static __be32 +nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + __be32 status; + + fh_put(&cstate->current_fh); + status = exp_pseudoroot(rqstp, &cstate->current_fh); + return status; +} + +static __be32 +nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + if (!cstate->save_fh.fh_dentry) + return nfserr_restorefh; + + fh_dup2(&cstate->current_fh, &cstate->save_fh); + return nfs_ok; +} + +static __be32 +nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + + fh_dup2(&cstate->save_fh, &cstate->current_fh); + return nfs_ok; +} + +/* + * misc nfsv4 ops + */ +static __be32 +nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_access *access) +{ + if (access->ac_req_access & ~NFS3_ACCESS_FULL) + return nfserr_inval; + + access->ac_resp_access = access->ac_req_access; + return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access, + &access->ac_supported); +} + +static __be32 +nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) +{ + __be32 status; + + u32 *p = (u32 *)commit->co_verf.data; + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; + + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) + status = nfserr_inval; + return status; +} + +static __be32 +nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_create *create) +{ + struct svc_fh resfh; + __be32 status; + dev_t rdev; + + fh_init(&resfh, NFS4_FHSIZE); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, + NFSD_MAY_CREATE); + if (status == nfserr_symlink) + status = nfserr_notdir; + if (status) + return status; + + status = check_attr_support(rqstp, cstate, create->cr_bmval, + nfsd_attrmask); + if (status) + return status; + + switch (create->cr_type) { + case NF4LNK: + /* ugh! we have to null-terminate the linktext, or + * vfs_symlink() will choke. it is always safe to + * null-terminate by brute force, since at worst we + * will overwrite the first byte of the create namelen + * in the XDR buffer, which has already been extracted + * during XDR decode. 
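/*
 * Sketch of the write/commit verifier convention used by nfsd4_commit()
 * above: the 8-byte verifier is simply the server boot time (seconds and
 * microseconds), so a client that sees the verifier change between WRITE
 * and COMMIT knows the server rebooted and unstable data may have been
 * lost.  A plain struct timeval stands in for nfssvc_boot here.
 */
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

static void sketch_fill_verifier(unsigned char verf[8],
                                 const struct timeval *boot)
{
        uint32_t words[2] = { (uint32_t)boot->tv_sec, (uint32_t)boot->tv_usec };

        memcpy(verf, words, sizeof(words));
}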
+ */ + create->cr_linkname[create->cr_linklen] = 0; + + status = nfsd_symlink(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + create->cr_linkname, create->cr_linklen, + &resfh, &create->cr_iattr); + break; + + case NF4BLK: + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + return nfserr_inval; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFBLK, rdev, &resfh); + break; + + case NF4CHR: + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + return nfserr_inval; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr,S_IFCHR, rdev, &resfh); + break; + + case NF4SOCK: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFSOCK, 0, &resfh); + break; + + case NF4FIFO: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFIFO, 0, &resfh); + break; + + case NF4DIR: + create->cr_iattr.ia_valid &= ~ATTR_SIZE; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFDIR, 0, &resfh); + break; + + default: + status = nfserr_badtype; + } + + if (status) + goto out; + + if (create->cr_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, + create->cr_bmval); + + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); +out: + fh_put(&resfh); + return status; +} + +static __be32 +nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_getattr *getattr) +{ + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + + getattr->ga_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_link *link) +{ + __be32 status = nfserr_nofilehandle; + + if (!cstate->save_fh.fh_dentry) + return status; + status = nfsd_link(rqstp, &cstate->current_fh, + link->li_name, link->li_namelen, &cstate->save_fh); + if (!status) + set_change_info(&link->li_cinfo, &cstate->current_fh); + return status; +} + +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) +{ + struct svc_fh tmp_fh; + __be32 ret; + + fh_init(&tmp_fh, NFS4_FHSIZE); + ret = exp_pseudoroot(rqstp, &tmp_fh); + if (ret) + return ret; + if (tmp_fh.fh_dentry == fh->fh_dentry) { + fh_put(&tmp_fh); + return nfserr_noent; + } + fh_put(&tmp_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); +} + +static __be32 +nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lookup *lookup) +{ + return nfsd_lookup(rqstp, &cstate->current_fh, + lookup->lo_name, lookup->lo_len, + &cstate->current_fh); +} + +static __be32 
+nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_read *read) +{ + __be32 status; + + /* no need to check permission - this will be done in nfsd_read() */ + + read->rd_filp = NULL; + if (read->rd_offset >= OFFSET_MAX) + return nfserr_inval; + + nfs4_lock_state(); + /* check stateid */ + if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { + dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); + goto out; + } + if (read->rd_filp) + get_file(read->rd_filp); + status = nfs_ok; +out: + nfs4_unlock_state(); + read->rd_rqstp = rqstp; + read->rd_fhp = &cstate->current_fh; + return status; +} + +static __be32 +nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readdir *readdir) +{ + u64 cookie = readdir->rd_cookie; + static const nfs4_verifier zeroverf; + + /* no need to check permission - this will be done in nfsd_readdir() */ + + if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + + if ((cookie == 1) || (cookie == 2) || + (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) + return nfserr_bad_cookie; + + readdir->rd_rqstp = rqstp; + readdir->rd_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readlink *readlink) +{ + readlink->rl_rqstp = rqstp; + readlink->rl_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_remove *remove) +{ + __be32 status; + + if (locks_in_grace()) + return nfserr_grace; + status = nfsd_unlink(rqstp, &cstate->current_fh, 0, + remove->rm_name, remove->rm_namelen); + if (status == nfserr_symlink) + return nfserr_notdir; + if (!status) { + fh_unlock(&cstate->current_fh); + set_change_info(&remove->rm_cinfo, &cstate->current_fh); + } + return status; +} + +static __be32 +nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_rename *rename) +{ + __be32 status = nfserr_nofilehandle; + + if (!cstate->save_fh.fh_dentry) + return status; + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags + & NFSEXP_NOSUBTREECHECK)) + return nfserr_grace; + status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, + rename->rn_snamelen, &cstate->current_fh, + rename->rn_tname, rename->rn_tnamelen); + + /* the underlying filesystem returns different error's than required + * by NFSv4. both save_fh and current_fh have been verified.. 
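+ * (isdir becomes exist; notdir becomes exist when both filehandles are
+ * directories; symlink becomes notdir.)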
*/ + if (status == nfserr_isdir) + status = nfserr_exist; + else if ((status == nfserr_notdir) && + (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && + S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) + status = nfserr_exist; + else if (status == nfserr_symlink) + status = nfserr_notdir; + + if (!status) { + set_change_info(&rename->rn_sinfo, &cstate->current_fh); + set_change_info(&rename->rn_tinfo, &cstate->save_fh); + } + return status; +} + +static __be32 +nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo *secinfo) +{ + struct svc_fh resfh; + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + fh_init(&resfh, NFS4_FHSIZE); + err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, + secinfo->si_name, secinfo->si_namelen, + &exp, &dentry); + if (err) + return err; + if (dentry->d_inode == NULL) { + exp_put(exp); + err = nfserr_noent; + } else + secinfo->si_exp = exp; + dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); + return err; +} + +static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + exp_get(cstate->current_fh.fh_export); + sin->sin_exp = cstate->current_fh.fh_export; + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static __be32 +nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setattr *setattr) +{ + __be32 status = nfs_ok; + int err; + + if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { + nfs4_lock_state(); + status = nfs4_preprocess_stateid_op(cstate, + &setattr->sa_stateid, WR_STATE, NULL); + nfs4_unlock_state(); + if (status) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + return status; + } + } + err = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + if (err) + return nfserrno(err); + status = nfs_ok; + + status = check_attr_support(rqstp, cstate, setattr->sa_bmval, + nfsd_attrmask); + if (status) + goto out; + + if (setattr->sa_acl != NULL) + status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, + setattr->sa_acl); + if (status) + goto out; + status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, + 0, (time_t)0); +out: + mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt); + return status; +} + +static __be32 +nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_write *write) +{ + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; + u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + + /* no need to check permission - this will be done in nfsd_write() */ + + if (write->wr_offset >= OFFSET_MAX) + return nfserr_inval; + + nfs4_lock_state(); + status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); + if (filp) + get_file(filp); + nfs4_unlock_state(); + + if (status) { + dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + return status; + } + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; + p = (u32 *)write->wr_verifier.data; + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; + + status = nfsd_write(rqstp, 
&cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); + if (filp) + fput(filp); + + write->wr_bytes_written = cnt; + + if (status == nfserr_symlink) + status = nfserr_inval; + return status; +} + +/* This routine never returns NFS_OK! If there are no other errors, it + * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the + * attributes matched. VERIFY is implemented by mapping NFSERR_SAME + * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. + */ +static __be32 +_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 *buf, *p; + int count; + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL); + if (status) + return status; + + if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) + || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) + return nfserr_inval; + if (verify->ve_attrlen & 3) + return nfserr_inval; + + /* count in words: + * bitmap_len(1) + bitmap(2) + attr_len(1) = 4 + */ + count = 4 + (verify->ve_attrlen >> 2); + buf = kmalloc(count << 2, GFP_KERNEL); + if (!buf) + return nfserr_jukebox; + + status = nfsd4_encode_fattr(&cstate->current_fh, + cstate->current_fh.fh_export, + cstate->current_fh.fh_dentry, buf, + &count, verify->ve_bmval, + rqstp, 0); + + /* this means that nfsd4_encode_fattr() ran out of space */ + if (status == nfserr_resource && count == 0) + status = nfserr_not_same; + if (status) + goto out_kfree; + + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); + status = nfserr_not_same; + if (ntohl(*p++) != verify->ve_attrlen) + goto out_kfree; + if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen)) + status = nfserr_same; + +out_kfree: + kfree(buf); + return status; +} + +static __be32 +nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_not_same ? nfs_ok : status; +} + +static __be32 +nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_same ? nfs_ok : status; +} + +/* + * NULL call. + */ +static __be32 +nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + +typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, + void *); +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; +}; + +static struct nfsd4_operation nfsd4_ops[]; + +static const char *nfsd4_op_name(unsigned opnum); + +/* + * Enforce NFSv4.1 COMPOUND ordering rules: + * + * Also note, enforced elsewhere: + * - SEQUENCE other than as first op results in + * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 
+ * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) + * - DESTROY_SESSION must be the final operation in a compound, if + * sessionid's in SEQUENCE and DESTROY_SESSION are the same. + * (Enforced in nfsd4_destroy_session().) + */ +static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) +{ + struct nfsd4_op *op = &args->ops[0]; + + /* These ordering requirements don't apply to NFSv4.0: */ + if (args->minorversion == 0) + return nfs_ok; + /* This is weird, but OK, not our problem: */ + if (args->opcnt == 0) + return nfs_ok; + if (op->status == nfserr_op_illegal) + return nfs_ok; + if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) + return nfserr_op_not_in_session; + if (op->opnum == OP_SEQUENCE) + return nfs_ok; + if (args->opcnt != 1) + return nfserr_not_only_op; + return nfs_ok; +} + +static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +{ + return &nfsd4_ops[op->opnum]; +} + +static bool need_wrongsec_check(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *next = &argp->ops[resp->opcnt]; + struct nfsd4_operation *thisd; + struct nfsd4_operation *nextd; + + thisd = OPDESC(this); + /* + * Most ops check wronsec on our own; only the putfh-like ops + * have special rules. + */ + if (!(thisd->op_flags & OP_IS_PUTFH_LIKE)) + return false; + /* + * rfc 5661 2.6.3.1.1.6: don't bother erroring out a + * put-filehandle operation if we're not going to use the + * result: + */ + if (argp->opcnt == resp->opcnt) + return false; + + nextd = OPDESC(next); + /* + * Rest of 2.6.3.1.1: certain operations will return WRONGSEC + * errors themselves as necessary; others should check for them + * now: + */ + return !(nextd->op_flags & OP_HANDLES_WRONGSEC); +} + +/* + * COMPOUND call. + */ +static __be32 +nfsd4_proc_compound(struct svc_rqst *rqstp, + struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_operation *opdesc; + struct nfsd4_compound_state *cstate = &resp->cstate; + int slack_bytes; + __be32 status; + + resp->xbuf = &rqstp->rq_res; + resp->p = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; + resp->tagp = resp->p; + /* reserve space for: taglen, tag, and opcnt */ + resp->p += 2 + XDR_QUADLEN(args->taglen); + resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE; + resp->taglen = args->taglen; + resp->tag = args->tag; + resp->opcnt = 0; + resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; + resp->cstate.session = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* + * Don't use the deferral mechanism for NFSv4; compounds make it + * too hard to avoid non-idempotency problems. + */ + rqstp->rq_usedeferral = 0; + + /* + * According to RFC3010, this takes precedence over all other errors. 
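+ * (RFC 3010 was the original NFSv4 specification, since obsoleted by
+ * RFC 3530; either way, the minorversion check below happens before any
+ * op in the compound is processed.)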
+ */ + status = nfserr_minor_vers_mismatch; + if (args->minorversion > nfsd_supported_minorversion) + goto out; + + status = nfs41_check_op_ordering(args); + if (status) { + op = &args->ops[0]; + op->status = status; + goto encode_op; + } + + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + + dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", + resp->opcnt, args->opcnt, op->opnum, + nfsd4_op_name(op->opnum)); + /* + * The XDR decode routines may have pre-set op->status; + * for example, if there is a miscellaneous XDR error + * it will be set to nfserr_bad_xdr. + */ + if (op->status) + goto encode_op; + + /* We must be able to encode a successful response to + * this operation, with enough room left over to encode a + * failed response to the next operation. If we don't + * have enough room, fail with ERR_RESOURCE. + */ + slack_bytes = (char *)resp->end - (char *)resp->p; + if (slack_bytes < COMPOUND_SLACK_SPACE + + COMPOUND_ERR_SLACK_SPACE) { + BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE); + op->status = nfserr_resource; + goto encode_op; + } + + opdesc = OPDESC(op); + + if (!cstate->current_fh.fh_dentry) { + if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { + op->status = nfserr_nofilehandle; + goto encode_op; + } + } else if (cstate->current_fh.fh_export->ex_fslocs.migrated && + !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { + op->status = nfserr_moved; + goto encode_op; + } + + if (opdesc->op_func) + op->status = opdesc->op_func(rqstp, cstate, &op->u); + else + BUG_ON(op->status == nfs_ok); + + if (!op->status && need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + +encode_op: + /* Only from SEQUENCE */ + if (resp->cstate.status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } + if (op->status == nfserr_replay_me) { + op->replay = &cstate->replay_owner->so_replay; + nfsd4_encode_replay(resp, op); + status = op->status = op->replay->rp_status; + } else { + nfsd4_encode_operation(resp, op); + status = op->status; + } + + dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", + args->ops, args->opcnt, resp->opcnt, op->opnum, + be32_to_cpu(status)); + + if (cstate->replay_owner) { + nfs4_put_stateowner(cstate->replay_owner); + cstate->replay_owner = NULL; + } + /* XXX Ugh, we need to get rid of this kind of special case: */ + if (op->opnum == OP_READ && op->u.read.rd_filp) + fput(op->u.read.rd_filp); + + nfsd4_increment_op_stats(op->opnum); + } + + resp->cstate.status = status; + fh_put(&resp->cstate.current_fh); + fh_put(&resp->cstate.save_fh); + BUG_ON(resp->cstate.replay_owner); +out: + nfsd4_release_compoundargs(args); + /* Reset deferral mechanism for RPC deferrals */ + rqstp->rq_usedeferral = 1; + dprintk("nfsv4 compound returned %d\n", ntohl(status)); + return status; +} + +static struct nfsd4_operation nfsd4_ops[] = { + [OP_ACCESS] = { + .op_func = (nfsd4op_func)nfsd4_access, + .op_name = "OP_ACCESS", + }, + [OP_CLOSE] = { + .op_func = (nfsd4op_func)nfsd4_close, + .op_name = "OP_CLOSE", + }, + [OP_COMMIT] = { + .op_func = (nfsd4op_func)nfsd4_commit, + .op_name = "OP_COMMIT", + }, + [OP_CREATE] = { + .op_func = (nfsd4op_func)nfsd4_create, + .op_name = "OP_CREATE", + }, + [OP_DELEGRETURN] = { + .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_name = "OP_DELEGRETURN", + }, + [OP_GETATTR] = { + .op_func = (nfsd4op_func)nfsd4_getattr, + .op_flags = ALLOWED_ON_ABSENT_FS, + .op_name = "OP_GETATTR", + }, + [OP_GETFH] = { + .op_func = 
(nfsd4op_func)nfsd4_getfh, + .op_name = "OP_GETFH", + }, + [OP_LINK] = { + .op_func = (nfsd4op_func)nfsd4_link, + .op_name = "OP_LINK", + }, + [OP_LOCK] = { + .op_func = (nfsd4op_func)nfsd4_lock, + .op_name = "OP_LOCK", + }, + [OP_LOCKT] = { + .op_func = (nfsd4op_func)nfsd4_lockt, + .op_name = "OP_LOCKT", + }, + [OP_LOCKU] = { + .op_func = (nfsd4op_func)nfsd4_locku, + .op_name = "OP_LOCKU", + }, + [OP_LOOKUP] = { + .op_func = (nfsd4op_func)nfsd4_lookup, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_LOOKUP", + }, + [OP_LOOKUPP] = { + .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_LOOKUPP", + }, + [OP_NVERIFY] = { + .op_func = (nfsd4op_func)nfsd4_nverify, + .op_name = "OP_NVERIFY", + }, + [OP_OPEN] = { + .op_func = (nfsd4op_func)nfsd4_open, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_OPEN", + }, + [OP_OPEN_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_name = "OP_OPEN_CONFIRM", + }, + [OP_OPEN_DOWNGRADE] = { + .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_name = "OP_OPEN_DOWNGRADE", + }, + [OP_PUTFH] = { + .op_func = (nfsd4op_func)nfsd4_putfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_PUTFH", + }, + [OP_PUTPUBFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_PUTPUBFH", + }, + [OP_PUTROOTFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_PUTROOTFH", + }, + [OP_READ] = { + .op_func = (nfsd4op_func)nfsd4_read, + .op_name = "OP_READ", + }, + [OP_READDIR] = { + .op_func = (nfsd4op_func)nfsd4_readdir, + .op_name = "OP_READDIR", + }, + [OP_READLINK] = { + .op_func = (nfsd4op_func)nfsd4_readlink, + .op_name = "OP_READLINK", + }, + [OP_REMOVE] = { + .op_func = (nfsd4op_func)nfsd4_remove, + .op_name = "OP_REMOVE", + }, + [OP_RENAME] = { + .op_name = "OP_RENAME", + .op_func = (nfsd4op_func)nfsd4_rename, + }, + [OP_RENEW] = { + .op_func = (nfsd4op_func)nfsd4_renew, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RENEW", + }, + [OP_RESTOREFH] = { + .op_func = (nfsd4op_func)nfsd4_restorefh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_RESTOREFH", + }, + [OP_SAVEFH] = { + .op_func = (nfsd4op_func)nfsd4_savefh, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SAVEFH", + }, + [OP_SECINFO] = { + .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO", + }, + [OP_SETATTR] = { + .op_func = (nfsd4op_func)nfsd4_setattr, + .op_name = "OP_SETATTR", + }, + [OP_SETCLIENTID] = { + .op_func = (nfsd4op_func)nfsd4_setclientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID", + }, + [OP_SETCLIENTID_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID_CONFIRM", + }, + [OP_VERIFY] = { + .op_func = (nfsd4op_func)nfsd4_verify, + .op_name = "OP_VERIFY", + }, + [OP_WRITE] = { + .op_func = (nfsd4op_func)nfsd4_write, + .op_name = "OP_WRITE", + }, + [OP_RELEASE_LOCKOWNER] = { + .op_func = (nfsd4op_func)nfsd4_release_lockowner, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RELEASE_LOCKOWNER", + }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = 
ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_EXCHANGE_ID", + }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_BIND_CONN_TO_SESSION", + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_CREATE_SESSION", + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_DESTROY_SESSION", + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, + [OP_RECLAIM_COMPLETE] = { + .op_func = (nfsd4op_func)nfsd4_reclaim_complete, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO_NO_NAME", + }, +}; + +static const char *nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + +#define nfsd4_voidres nfsd4_voidargs +struct nfsd4_voidargs { int dummy; }; + +/* + * TODO: At the present time, the NFSv4 server does not do XID caching + * of requests. Implementing XID caching would not be a serious problem, + * although it would require a mild change in interfaces since one + * doesn't know whether an NFSv4 request is idempotent until after the + * XDR decode. However, XID caching totally confuses pynfs (Peter + * Astrand's regression testsuite for NFSv4 servers), which reuses + * XID's liberally, so I've left it unimplemented until pynfs generates + * better XID's. + */ +static struct svc_procedure nfsd_procedures4[2] = { + [NFSPROC4_NULL] = { + .pc_func = (svc_procfunc) nfsd4_proc_null, + .pc_encode = (kxdrproc_t) nfs4svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd4_voidargs), + .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 1, + }, + [NFSPROC4_COMPOUND] = { + .pc_func = (svc_procfunc) nfsd4_proc_compound, + .pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs, + .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres, + .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_ressize = sizeof(struct nfsd4_compoundres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = NFSD_BUFSIZE/4, + }, +}; + +struct svc_version nfsd_version4 = { + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c new file mode 100644 index 00000000..be268148 --- /dev/null +++ b/fs/nfsd/nfs4recover.c @@ -0,0 +1,402 @@ +/* +* Copyright (c) 2004 The Regents of the University of Michigan. +* All rights reserved. +* +* Andy Adamson +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. 
Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "state.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Globals */ +static struct file *rec_file; + +static int +nfs4_save_creds(const struct cred **original_creds) +{ + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = 0; + new->fsgid = 0; + *original_creds = override_creds(new); + put_cred(new); + return 0; +} + +static void +nfs4_reset_creds(const struct cred *original) +{ + revert_creds(original); +} + +static void +md5_to_hex(char *out, char *md5) +{ + int i; + + for (i=0; i<16; i++) { + unsigned char c = md5[i]; + + *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + } + *out = '\0'; +} + +__be32 +nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) +{ + struct xdr_netobj cksum; + struct hash_desc desc; + struct scatterlist sg; + __be32 status = nfserr_jukebox; + + dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", + clname->len, clname->data); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + goto out_no_tfm; + cksum.len = crypto_hash_digestsize(desc.tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + goto out; + + sg_init_one(&sg, clname->data, clname->len); + + if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data)) + goto out; + + md5_to_hex(dname, cksum.data); + + status = nfs_ok; +out: + kfree(cksum.data); + crypto_free_hash(desc.tfm); +out_no_tfm: + return status; +} + +int +nfsd4_create_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + char *dname = clp->cl_recdir; + struct dentry *dir, *dentry; + int status; + + dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); + + if (!rec_file || clp->cl_firststate) + return 0; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; + + dir = rec_file->f_path.dentry; + /* lock the parent */ + mutex_lock(&dir->d_inode->i_mutex); + + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + status = -EEXIST; + if (dentry->d_inode) { + dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + goto out_put; + } + status = mnt_want_write(rec_file->f_path.mnt); + if (status) + goto out_put; + status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_file->f_path.mnt); +out_put: + dput(dentry); +out_unlock: + mutex_unlock(&dir->d_inode->i_mutex); + if (status == 
0) { + clp->cl_firststate = 1; + vfs_fsync(rec_file, 0); + } + nfs4_reset_creds(original_cred); + dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); + return status; +} + +typedef int (recdir_func)(struct dentry *, struct dentry *); + +struct name_list { + char name[HEXDIR_LEN]; + struct list_head list; +}; + +static int +nfsd4_build_namelist(void *arg, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct list_head *names = arg; + struct name_list *entry; + + if (namlen != HEXDIR_LEN - 1) + return 0; + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, names); + return 0; +} + +static int +nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) +{ + const struct cred *original_cred; + struct file *filp; + LIST_HEAD(names); + struct name_list *entry; + struct dentry *dentry; + int status; + + if (!rec_file) + return 0; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; + + filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY, + current_cred()); + status = PTR_ERR(filp); + if (IS_ERR(filp)) + goto out; + status = vfs_readdir(filp, nfsd4_build_namelist, &names); + fput(filp); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + break; + } + status = f(dir, dentry); + dput(dentry); + if (status) + break; + list_del(&entry->list); + kfree(entry); + } + mutex_unlock(&dir->d_inode->i_mutex); +out: + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + list_del(&entry->list); + kfree(entry); + } + nfs4_reset_creds(original_cred); + return status; +} + +static int +nfsd4_unlink_clid_dir(char *name, int namlen) +{ + struct dentry *dir, *dentry; + int status; + + dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %.*s\n", namlen, name); + + dir = rec_file->f_path.dentry; + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + status = -ENOENT; + if (!dentry->d_inode) + goto out; + status = vfs_rmdir(dir->d_inode, dentry); +out: + dput(dentry); +out_unlock: + mutex_unlock(&dir->d_inode->i_mutex); + return status; +} + +void +nfsd4_remove_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + int status; + + if (!rec_file || !clp->cl_firststate) + return; + + status = mnt_want_write(rec_file->f_path.mnt); + if (status) + goto out; + clp->cl_firststate = 0; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + goto out; + + status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); + nfs4_reset_creds(original_cred); + if (status == 0) + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); +out: + if (status) + printk("NFSD: Failed to remove expired client state directory" + " %.*s\n", HEXDIR_LEN, clp->cl_recdir); + return; +} + +static int +purge_old(struct dentry *parent, struct dentry *child) +{ + int status; + + if (nfs4_has_reclaimed_state(child->d_name.name, false)) + return 0; + + status = vfs_rmdir(parent->d_inode, child); + if (status) + printk("failed to remove client recovery directory %s\n", + child->d_name.name); + /* Keep trying, success or failure: */ + return 0; +} + +void +nfsd4_recdir_purge_old(void) { + int status; + + if (!rec_file) + return; + status = mnt_want_write(rec_file->f_path.mnt); + if (status) + goto out; + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old); + if (status == 0) + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); +out: + if (status) + printk("nfsd4: failed to purge old clients from recovery" + " directory %s\n", rec_file->f_path.dentry->d_name.name); +} + +static int +load_recdir(struct dentry *parent, struct dentry *child) +{ + if (child->d_name.len != HEXDIR_LEN - 1) { + printk("nfsd4: illegal name %s in recovery directory\n", + child->d_name.name); + /* Keep trying; maybe the others are OK: */ + return 0; + } + nfs4_client_to_reclaim(child->d_name.name); + return 0; +} + +int +nfsd4_recdir_load(void) { + int status; + + if (!rec_file) + return 0; + + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir); + if (status) + printk("nfsd4: failed loading clients from recovery" + " directory %s\n", rec_file->f_path.dentry->d_name.name); + return status; +} + +/* + * Hold reference to the recovery directory. + */ + +void +nfsd4_init_recdir(char *rec_dirname) +{ + const struct cred *original_cred; + int status; + + printk("NFSD: Using %s as the NFSv4 state recovery directory\n", + rec_dirname); + + BUG_ON(rec_file); + + status = nfs4_save_creds(&original_cred); + if (status < 0) { + printk("NFSD: Unable to change credentials to find recovery" + " directory: error %d\n", + status); + return; + } + + rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(rec_file)) { + printk("NFSD: unable to find recovery directory %s\n", + rec_dirname); + rec_file = NULL; + } + + nfs4_reset_creds(original_cred); +} + +void +nfsd4_shutdown_recdir(void) +{ + if (!rec_file) + return; + fput(rec_file); + rec_file = NULL; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c new file mode 100644 index 00000000..92f7eb7c --- /dev/null +++ b/fs/nfsd/nfs4state.c @@ -0,0 +1,4485 @@ +/* +* Copyright (c) 2001 The Regents of the University of Michigan. 
+* All rights reserved. +* +* Kendrick Smith +* Andy Adamson +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "xdr4.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Globals */ +time_t nfsd4_lease = 90; /* default lease time */ +time_t nfsd4_grace = 90; +static time_t boot_time; +static u32 current_ownerid = 1; +static u32 current_fileid = 1; +static u32 current_delegid = 1; +static stateid_t zerostateid; /* bits all 0 */ +static stateid_t onestateid; /* bits all 1 */ +static u64 current_sessionid = 1; + +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + +/* forward declarations */ +static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; +static void nfs4_set_recdir(char *recdir); + +/* Locking: */ + +/* Currently used for almost all code touching nfsv4 state: */ +static DEFINE_MUTEX(client_mutex); + +/* + * Currently used for the del_recall_lru and file hash table. 
In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(recall_lock); + +static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *file_slab = NULL; +static struct kmem_cache *stateid_slab = NULL; +static struct kmem_cache *deleg_slab = NULL; + +void +nfs4_lock_state(void) +{ + mutex_lock(&client_mutex); +} + +void +nfs4_unlock_state(void) +{ + mutex_unlock(&client_mutex); +} + +static inline u32 +opaque_hashval(const void *ptr, int nbytes) +{ + unsigned char *cptr = (unsigned char *) ptr; + + u32 x = 0; + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x; +} + +static struct list_head del_recall_lru; + +static inline void +put_nfs4_file(struct nfs4_file *fi) +{ + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + list_del(&fi->fi_hash); + spin_unlock(&recall_lock); + iput(fi->fi_inode); + kmem_cache_free(file_slab, fi); + } +} + +static inline void +get_nfs4_file(struct nfs4_file *fi) +{ + atomic_inc(&fi->fi_ref); +} + +static int num_delegations; +unsigned int max_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) + +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + +static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); + atomic_inc(&fp->fi_access[oflag]); +} + +static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_get_access(fp, O_RDONLY); + __nfs4_file_get_access(fp, O_WRONLY); + } else + __nfs4_file_get_access(fp, oflag); +} + +static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) +{ + if (fp->fi_fds[oflag]) { + fput(fp->fi_fds[oflag]); + fp->fi_fds[oflag] = NULL; + } +} + +static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (atomic_dec_and_test(&fp->fi_access[oflag])) { + nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); + } +} + +static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_put_access(fp, O_RDONLY); + __nfs4_file_put_access(fp, O_WRONLY); + } else + __nfs4_file_put_access(fp, oflag); +} + +static struct nfs4_delegation * +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +{ + 
struct nfs4_delegation *dp; + struct nfs4_file *fp = stp->st_file; + + dprintk("NFSD alloc_init_deleg\n"); + /* + * Major work on the lease subsystem (for example, to support + * calbacks on stat) will be required before we can support + * write delegations properly. + */ + if (type != NFS4_OPEN_DELEGATE_READ) + return NULL; + if (fp->fi_had_conflict) + return NULL; + if (num_delegations > max_delegations) + return NULL; + dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + if (dp == NULL) + return dp; + num_delegations++; + INIT_LIST_HEAD(&dp->dl_perfile); + INIT_LIST_HEAD(&dp->dl_perclnt); + INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_client = clp; + get_nfs4_file(fp); + dp->dl_file = fp; + dp->dl_type = type; + dp->dl_stateid.si_boot = boot_time; + dp->dl_stateid.si_stateownerid = current_delegid++; + dp->dl_stateid.si_fileid = 0; + dp->dl_stateid.si_generation = 0; + fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); + dp->dl_time = 0; + atomic_set(&dp->dl_count, 1); + INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); + return dp; +} + +void +nfs4_put_delegation(struct nfs4_delegation *dp) +{ + if (atomic_dec_and_test(&dp->dl_count)) { + dprintk("NFSD: freeing dp %p\n",dp); + put_nfs4_file(dp->dl_file); + kmem_cache_free(deleg_slab, dp); + num_delegations--; + } +} + +static void nfs4_put_deleg_lease(struct nfs4_file *fp) +{ + if (atomic_dec_and_test(&fp->fi_delegees)) { + vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); + fp->fi_lease = NULL; + fput(fp->fi_deleg_file); + fp->fi_deleg_file = NULL; + } +} + +/* Called under the state lock. */ +static void +unhash_delegation(struct nfs4_delegation *dp) +{ + list_del_init(&dp->dl_perclnt); + spin_lock(&recall_lock); + list_del_init(&dp->dl_perfile); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&recall_lock); + nfs4_put_deleg_lease(dp->dl_file); + nfs4_put_delegation(dp); +} + +/* + * SETCLIENTID state + */ + +/* client_lock protects the client lru list and session hash table */ +static DEFINE_SPINLOCK(client_lock); + +/* Hash tables for nfs4_clientid state */ +#define CLIENT_HASH_BITS 4 +#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) + +#define clientid_hashval(id) \ + ((id) & CLIENT_HASH_MASK) +#define clientstr_hashval(name) \ + (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +/* + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * + * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed + * setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. + * + * client_lru holds client queue ordered by nfs4_client.cl_time + * for lease renewal. + * + * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time + * for last close replay. + */ +static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; +static int reclaim_str_hashtbl_size = 0; +static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head client_lru; +static struct list_head close_lru; + +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. 
This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ +static void +set_access(unsigned int *access, unsigned long bmap) { + int i; + + *access = 0; + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + *access |= i; + } +} + +static void +set_deny(unsigned int *deny, unsigned long bmap) { + int i; + + *deny = 0; + for (i = 0; i < 4; i++) { + if (test_bit(i, &bmap)) + *deny |= i ; + } +} + +static int +test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { + unsigned int access, deny; + + set_access(&access, stp->st_access_bmap); + set_deny(&deny, stp->st_deny_bmap); + if ((access & open->op_share_deny) || (deny & open->op_share_access)) + return 0; + return 1; +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + BUG(); +} + +static void unhash_generic_stateid(struct nfs4_stateid *stp) +{ + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + int i; + + if (stp->st_access_bmap) { + for (i = 1; i < 4; i++) { + if (test_bit(i, &stp->st_access_bmap)) + nfs4_file_put_access(stp->st_file, + nfs4_access_to_omode(i)); + } + } + put_nfs4_file(stp->st_file); + kmem_cache_free(stateid_slab, stp); +} + +static void release_lock_stateid(struct nfs4_stateid *stp) +{ + struct file *file; + + unhash_generic_stateid(stp); + file = find_any_file(stp->st_file); + if (file) + locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + free_generic_stateid(stp); +} + +static void unhash_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_lock_stateid(stp); + } +} + +static void release_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner(sop); + nfs4_put_stateowner(sop); +} + +static void +release_stateid_lockowners(struct nfs4_stateid *open_stp) +{ + struct nfs4_stateowner *lock_sop; + + while (!list_empty(&open_stp->st_lockowners)) { + lock_sop = list_entry(open_stp->st_lockowners.next, + struct nfs4_stateowner, so_perstateid); + /* list_del(&open_stp->st_lockowners); */ + BUG_ON(lock_sop->so_is_open_owner); + release_lockowner(lock_sop); + } +} + +static void release_open_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); + free_generic_stateid(stp); +} + +static void unhash_openowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perclient); + list_del(&sop->so_perstateid); /* XXX: necessary? 
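+ * (probably not: so_perstateid only ever gets linked for lock owners,
+ * so for an open owner this should effectively be a no-op)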
*/ + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_open_stateid(stp); + } +} + +static void release_openowner(struct nfs4_stateowner *sop) +{ + unhash_openowner(sop); + list_del(&sop->so_close_lru); + nfs4_put_stateowner(sop); +} + +#define SESSION_HASH_SIZE 512 +static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} + +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * The protocol defines ca_maxresponssize_cached to include the size of + * the rpc header, but all we need to cache is the data starting after + * the end of the initial SEQUENCE operation--the rest we regenerate + * each time. Therefore we can advertise a ca_maxresponssize_cached + * value that is the number of bytes in our cache plus a few additional + * bytes. In order to stay on the safe side, and not promise more than + * we can cache, those additional bytes must be the minimum possible: 24 + * bytes of rpc header (xid through accept state, with AUTH_NULL + * verifier), 12 for the compound header (with zero-length tag), and 44 + * for the SEQUENCE op response: + */ +#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) + +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) + kfree(ses->se_slots[i]); +} + +/* + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +{ + return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; +} + +static int nfsd4_sanitize_slot_size(u32 size) +{ + size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */ + size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE); + + return size; +} + +/* + * XXX: If we run out of reserved DRC memory we could (up to a point) + * re-negotiate active sessions and reduce their slot usage to make + * rooom for new connections. For now we just fail the create session. 
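+ * (nfsd4_get_drc_mem() below clamps the requested slot count to whatever
+ * still fits in the global reservation; alloc_init_session() gives up
+ * only if not even one slot fits.)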
+ */ +static int nfsd4_get_drc_mem(int slotsize, u32 num) +{ + int avail; + + num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); + + spin_lock(&nfsd_drc_lock); + avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, + nfsd_drc_max_mem - nfsd_drc_mem_used); + num = min_t(int, num, avail / slotsize); + nfsd_drc_mem_used += num * slotsize; + spin_unlock(&nfsd_drc_lock); + + return num; +} + +static void nfsd4_put_drc_mem(int slotsize, int num) +{ + spin_lock(&nfsd_drc_lock); + nfsd_drc_mem_used -= slotsize * num; + spin_unlock(&nfsd_drc_lock); +} + +static struct nfsd4_session *alloc_session(int slotsize, int numslots) +{ + struct nfsd4_session *new; + int mem, i; + + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + mem = numslots * sizeof(struct nfsd4_slot *); + + new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + if (!new) + return NULL; + /* allocate each struct nfsd4_slot and data cache in one piece */ + for (i = 0; i < numslots; i++) { + mem = sizeof(struct nfsd4_slot) + slotsize; + new->se_slots[i] = kzalloc(mem, GFP_KERNEL); + if (!new->se_slots[i]) + goto out_free; + } + return new; +out_free: + while (i--) + kfree(new->se_slots[i]); + kfree(new); + return NULL; +} + +static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize) +{ + u32 maxrpc = nfsd_serv->sv_max_mesg; + + new->maxreqs = numslots; + new->maxresp_cached = min_t(u32, req->maxresp_cached, + slotsize + NFSD_MIN_HDR_SEQ_SZ); + new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); + new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); + new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); +} + +static void free_conn(struct nfsd4_conn *c) +{ + svc_xprt_put(c->cn_xprt); + kfree(c); +} + +static void nfsd4_conn_lost(struct svc_xpt_user *u) +{ + struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); + struct nfs4_client *clp = c->cn_session->se_client; + + spin_lock(&clp->cl_lock); + if (!list_empty(&c->cn_persession)) { + list_del(&c->cn_persession); + free_conn(c); + } + spin_unlock(&clp->cl_lock); + nfsd4_probe_callback(clp); +} + +static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) +{ + struct nfsd4_conn *conn; + + conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); + if (!conn) + return NULL; + svc_xprt_get(rqstp->rq_xprt); + conn->cn_xprt = rqstp->rq_xprt; + conn->cn_flags = flags; + INIT_LIST_HEAD(&conn->cn_xpt_user.list); + return conn; +} + +static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + conn->cn_session = ses; + list_add(&conn->cn_persession, &ses->se_conns); +} + +static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + + spin_lock(&clp->cl_lock); + __nfsd4_hash_conn(conn, ses); + spin_unlock(&clp->cl_lock); +} + +static int nfsd4_register_conn(struct nfsd4_conn *conn) +{ + conn->cn_xpt_user.callback = nfsd4_conn_lost; + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); +} + +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) +{ + struct nfsd4_conn *conn; + int ret; + + conn = alloc_conn(rqstp, dir); + if (!conn) + return nfserr_jukebox; + nfsd4_hash_conn(conn, ses); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); + return nfs_ok; +} + +static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct 
nfsd4_session *ses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (ses->se_flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + + return nfsd4_new_conn(rqstp, ses, dir); +} + +/* must be called under client_lock */ +static void nfsd4_del_conns(struct nfsd4_session *s) +{ + struct nfs4_client *clp = s->se_client; + struct nfsd4_conn *c; + + spin_lock(&clp->cl_lock); + while (!list_empty(&s->se_conns)) { + c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); + list_del_init(&c->cn_persession); + spin_unlock(&clp->cl_lock); + + unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); + free_conn(c); + + spin_lock(&clp->cl_lock); + } + spin_unlock(&clp->cl_lock); +} + +void free_session(struct kref *kref) +{ + struct nfsd4_session *ses; + int mem; + + ses = container_of(kref, struct nfsd4_session, se_ref); + nfsd4_del_conns(ses); + spin_lock(&nfsd_drc_lock); + mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel); + nfsd_drc_mem_used -= mem; + spin_unlock(&nfsd_drc_lock); + free_session_slots(ses); + kfree(ses); +} + +static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) +{ + struct nfsd4_session *new; + struct nfsd4_channel_attrs *fchan = &cses->fore_channel; + int numslots, slotsize; + int status; + int idx; + + /* + * Note decreasing slot size below client's request may + * make it difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. + */ + slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); + numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); + if (numslots < 1) + return NULL; + + new = alloc_session(slotsize, numslots); + if (!new) { + nfsd4_put_drc_mem(slotsize, fchan->maxreqs); + return NULL; + } + init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); + + new->se_client = clp; + gen_sessionid(new); + + INIT_LIST_HEAD(&new->se_conns); + + new->se_cb_seq_nr = 1; + new->se_flags = cses->flags; + new->se_cb_prog = cses->callback_prog; + kref_init(&new->se_ref); + idx = hash_sessionid(&new->se_sessionid); + spin_lock(&client_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); + spin_unlock(&client_lock); + + status = nfsd4_new_conn_from_crses(rqstp, new); + /* whoops: benny points out, status is ignored! (err, or bogus) */ + if (status) { + free_session(&new->se_ref); + return NULL; + } + if (cses->flags & SESSION4_BACK_CHAN) { + struct sockaddr *sa = svc_addr(rqstp); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. 
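+	 * (the v4.1 backchannel runs over one of the session's own
+	 * connections instead.)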
Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ + rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); + clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + } + nfsd4_probe_callback(clp); + return new; +} + +/* caller must hold client_lock */ +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_session *elem; + int idx; + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + /* Search in the appropriate list */ + list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +/* caller must hold client_lock */ +static void +unhash_session(struct nfsd4_session *ses) +{ + list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); + list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); +} + +/* must be called under the client_lock */ +static inline void +renew_client_locked(struct nfs4_client *clp) +{ + if (is_client_expired(clp)) { + dprintk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + + /* + * Move client to the end of the LRU list. + */ + dprintk("renewing client (clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + list_move_tail(&clp->cl_lru, &client_lru); + clp->cl_time = get_seconds(); +} + +static inline void +renew_client(struct nfs4_client *clp) +{ + spin_lock(&client_lock); + renew_client_locked(clp); + spin_unlock(&client_lock); +} + +/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ +static int +STALE_CLIENTID(clientid_t *clid) +{ + if (clid->cl_boot == boot_time) + return 0; + dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", + clid->cl_boot, clid->cl_id, boot_time); + return 1; +} + +/* + * XXX Should we use a slab cache ? + * This type of memory management is somewhat inefficient, but we use it + * anyway since SETCLIENTID is not a common operation.
+ */ +static struct nfs4_client *alloc_client(struct xdr_netobj name) +{ + struct nfs4_client *clp; + + clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); + if (clp == NULL) + return NULL; + clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); + if (clp->cl_name.data == NULL) { + kfree(clp); + return NULL; + } + memcpy(clp->cl_name.data, name.data, name.len); + clp->cl_name.len = name.len; + return clp; +} + +static inline void +free_client(struct nfs4_client *clp) +{ + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + list_del(&ses->se_perclnt); + nfsd4_put_session(ses); + } + if (clp->cl_cred.cr_group_info) + put_group_info(clp->cl_cred.cr_group_info); + kfree(clp->cl_principal); + kfree(clp->cl_name.data); + kfree(clp); +} + +void +release_session_client(struct nfsd4_session *session) +{ + struct nfs4_client *clp = session->se_client; + + if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) + return; + if (is_client_expired(clp)) { + free_client(clp); + session->se_client = NULL; + } else + renew_client_locked(clp); + spin_unlock(&client_lock); +} + +/* must be called under the client_lock */ +static inline void +unhash_client_locked(struct nfs4_client *clp) +{ + struct nfsd4_session *ses; + + mark_client_expired(clp); + list_del(&clp->cl_lru); + spin_lock(&clp->cl_lock); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); +} + +static void +expire_client(struct nfs4_client *clp) +{ + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { + dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); + list_del_init(&dp->dl_perclnt); + list_move(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&recall_lock); + while (!list_empty(&reaplist)) { + dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } + while (!list_empty(&clp->cl_openowners)) { + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } + nfsd4_shutdown_callback(clp); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + list_del(&clp->cl_idhash); + list_del(&clp->cl_strhash); + spin_lock(&client_lock); + unhash_client_locked(clp); + if (atomic_read(&clp->cl_refcount) == 0) + free_client(clp); + spin_unlock(&client_lock); +} + +static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +{ + memcpy(target->cl_verifier.data, source->data, + sizeof(target->cl_verifier.data)); +} + +static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) +{ + target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; + target->cl_clientid.cl_id = source->cl_clientid.cl_id; +} + +static void copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + target->cr_uid = source->cr_uid; + target->cr_gid = source->cr_gid; + target->cr_group_info = source->cr_group_info; + get_group_info(target->cr_group_info); +} + +static int same_name(const char *n1, const char *n2) +{ + return 0 == memcmp(n1, n2, HEXDIR_LEN); +} + +static int +same_verf(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); +} + +static int +same_clid(clientid_t *cl1, clientid_t *cl2) +{ + return (cl1->cl_boot == 
cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); +} + +/* XXX what about NGROUP */ +static int +same_creds(struct svc_cred *cr1, struct svc_cred *cr2) +{ + return cr1->cr_uid == cr2->cr_uid; +} + +static void gen_clid(struct nfs4_client *clp) +{ + static u32 current_clientid = 1; + + clp->cl_clientid.cl_boot = boot_time; + clp->cl_clientid.cl_id = current_clientid++; +} + +static void gen_confirm(struct nfs4_client *clp) +{ + static u32 i; + u32 *p; + + p = (u32 *)clp->cl_confirm.data; + *p++ = get_seconds(); + *p++ = i++; +} + +static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, + struct svc_rqst *rqstp, nfs4_verifier *verf) +{ + struct nfs4_client *clp; + struct sockaddr *sa = svc_addr(rqstp); + char *princ; + + clp = alloc_client(name); + if (clp == NULL) + return NULL; + + INIT_LIST_HEAD(&clp->cl_sessions); + + princ = svc_gss_principal(rqstp); + if (princ) { + clp->cl_principal = kstrdup(princ, GFP_KERNEL); + if (clp->cl_principal == NULL) { + free_client(clp); + return NULL; + } + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + spin_lock_init(&clp->cl_lock); + INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); + clp->cl_time = get_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); + rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); + clp->cl_flavor = rqstp->rq_flavor; + copy_cred(&clp->cl_cred, &rqstp->rq_cred); + gen_confirm(clp); + clp->cl_cb_session = NULL; + return clp; +} + +static int check_name(struct xdr_netobj name) +{ + if (name.len == 0) + return 0; + if (name.len > NFS4_OPAQUE_LIMIT) { + dprintk("NFSD: check_name: name too long(%d)!\n", name.len); + return 0; + } + return 1; +} + +static void +add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) +{ + unsigned int idhashval; + + list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); + renew_client(clp); +} + +static void +move_to_confirmed(struct nfs4_client *clp) +{ + unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); + unsigned int strhashval; + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + strhashval = clientstr_hashval(clp->cl_recdir); + list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + renew_client(clp); +} + +static struct nfs4_client * +find_confirmed_client(clientid_t *clid) +{ + struct nfs4_client *clp; + unsigned int idhashval = clientid_hashval(clid->cl_id); + + list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { + if (same_clid(&clp->cl_clientid, clid)) + return clp; + } + return NULL; +} + +static struct nfs4_client * +find_unconfirmed_client(clientid_t *clid) +{ + struct nfs4_client *clp; + unsigned int idhashval = clientid_hashval(clid->cl_id); + + list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { + if (same_clid(&clp->cl_clientid, clid)) + return clp; + } + return NULL; +} + +static bool clp_used_exchangeid(struct nfs4_client *clp) +{ + return clp->cl_exchange_flags != 0; +} + +static struct nfs4_client * 
+find_confirmed_client_by_str(const char *dname, unsigned int hashval) +{ + struct nfs4_client *clp; + + list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { + if (same_name(clp->cl_recdir, dname)) + return clp; + } + return NULL; +} + +static struct nfs4_client * +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +{ + struct nfs4_client *clp; + + list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { + if (same_name(clp->cl_recdir, dname)) + return clp; + } + return NULL; +} + +static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) +{ + switch (family) { + case AF_INET: + ((struct sockaddr_in *)sa)->sin_family = AF_INET; + ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; + return; + case AF_INET6: + ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; + return; + } +} + +static void +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) +{ + struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); + unsigned short expected_family; + + /* Currently, we only support tcp and tcp6 for the callback channel */ + if (se->se_callback_netid_len == 3 && + !memcmp(se->se_callback_netid_val, "tcp", 3)) + expected_family = AF_INET; + else if (se->se_callback_netid_len == 4 && + !memcmp(se->se_callback_netid_val, "tcp6", 4)) + expected_family = AF_INET6; + else + goto out_err; + + conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *)&conn->cb_addr, + sizeof(conn->cb_addr)); + + if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) + goto out_err; + + if (conn->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; + + conn->cb_prog = se->se_callback_prog; + conn->cb_ident = se->se_callback_ident; + rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + return; +out_err: + conn->cb_addr.ss_family = AF_UNSPEC; + conn->cb_addrlen = 0; + dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " + "will not receive delegations\n", + clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + + return; +} + +/* + * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. + */ +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct nfsd4_slot *slot = resp->cstate.slot; + unsigned int base; + + dprintk("--> %s slot %p\n", __func__, slot); + + slot->sl_opcnt = resp->opcnt; + slot->sl_status = resp->cstate.status; + + if (nfsd4_not_cached(resp)) { + slot->sl_datalen = 0; + return; + } + slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; + base = (char *)resp->cstate.datap - + (char *)resp->xbuf->head[0].iov_base; + if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, + slot->sl_datalen)) + WARN("%s: sessions DRC could not cache compound\n", __func__); + return; +} + +/* + * Encode the replay sequence operation from the slot values. + * If cachethis is FALSE encode the uncached rep error on the next + * operation which sets resp->p and increments resp->opcnt for + * nfs4svc_encode_compoundres. 
+ * + */ +static __be32 +nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_slot *slot = resp->cstate.slot; + + dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cachethis); + + /* Encode the replayed sequence operation */ + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /* Return nfserr_retry_uncached_rep in next operation. */ + if (args->opcnt > 1 && slot->sl_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * The sequence operation is not cached because we can use the slot and + * session values. + */ +__be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) +{ + struct nfsd4_slot *slot = resp->cstate.slot; + __be32 status; + + dprintk("--> %s slot %p\n", __func__, slot); + + /* Either returns 0 or nfserr_retry_uncached */ + status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); + if (status == nfserr_retry_uncached_rep) + return status; + + /* The sequence operation has been encoded, cstate->datap set. */ + memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); + + resp->opcnt = slot->sl_opcnt; + resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); + status = slot->sl_status; + + return status; +} + +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ + /* pNFS is not supported */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. */ + clid->flags = new->cl_exchange_flags; +} + +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_exchange_id *exid) +{ + struct nfs4_client *unconf, *conf, *new; + int status; + unsigned int strhashval; + char dname[HEXDIR_LEN]; + char addr_str[INET6_ADDRSTRLEN]; + nfs4_verifier verf = exid->verifier; + struct sockaddr *sa = svc_addr(rqstp); + + rpc_ntop(sa, addr_str, sizeof(addr_str)); + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + "ip_addr=%s flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + addr_str, exid->flags, exid->spa_how); + + if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + return nfserr_inval; + + /* Currently only support SP4_NONE */ + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_SSV: + return nfserr_serverfault; + default: + BUG(); /* checked by xdr code */ + case SP4_MACH_CRED: + return nfserr_serverfault; /* no excuse :-/ */ + } + + status = nfs4_make_rec_clidname(dname, &exid->clname); + + if (status) + goto error; + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + status = nfs_ok; + + conf = find_confirmed_client_by_str(dname, strhashval); + if (conf) { + if (!clp_used_exchangeid(conf)) { + status = nfserr_clid_inuse; /* XXX: ? 
*/ + goto out; + } + if (!same_verf(&verf, &conf->cl_verifier)) { + /* 18.35.4 case 8 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_not_same; + goto out; + } + /* Client reboot: destroy old state */ + expire_client(conf); + goto out_new; + } + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + /* 18.35.4 case 9 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_perm; + goto out; + } + expire_client(conf); + goto out_new; + } + /* + * Set bit when the owner id and verifier map to an already + * confirmed client id (18.35.3). + */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + + /* + * Falling into 18.35.4 case 2, possible router replay. + * Leave confirmed record intact and return same result. + */ + copy_verf(conf, &verf); + new = conf; + goto out_copy; + } + + /* 18.35.4 case 7 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_noent; + goto out; + } + + unconf = find_unconfirmed_client_by_str(dname, strhashval); + if (unconf) { + /* + * Possible retry or client restart. Per 18.35.4 case 4, + * a new unconfirmed record should be generated regardless + * of whether any properties have changed. + */ + expire_client(unconf); + } + +out_new: + /* Normal case */ + new = create_client(exid->clname, dname, rqstp, &verf); + if (new == NULL) { + status = nfserr_jukebox; + goto out; + } + + gen_clid(new); + add_to_unconfirmed(new, strhashval); +out_copy: + exid->clientid.cl_boot = new->cl_clientid.cl_boot; + exid->clientid.cl_id = new->cl_clientid.cl_id; + + exid->seqid = 1; + nfsd4_set_ex_flags(new, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); + status = nfs_ok; + +out: + nfs4_unlock_state(); +error: + dprintk("nfsd4_exchange_id returns %d\n", ntohl(status)); + return status; +} + +static int +check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) +{ + dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, + slot_seqid); + + /* The slot is in use, and no response has been sent. */ + if (slot_inuse) { + if (seqid == slot_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Normal */ + if (likely(seqid == slot_seqid + 1)) + return nfs_ok; + /* Replay */ + if (seqid == slot_seqid) + return nfserr_replay_cache; + /* Wraparound */ + if (seqid == 1 && (slot_seqid + 1) == 0) + return nfs_ok; + /* Misordered replay or misordered new request */ + return nfserr_seq_misordered; +} + +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. 
+ */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, int nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_create_session *cr_ses) +{ + struct sockaddr *sa = svc_addr(rqstp); + struct nfs4_client *conf, *unconf; + struct nfsd4_session *new; + struct nfsd4_clid_slot *cs_slot = NULL; + bool confirm_me = false; + int status = 0; + + if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) + return nfserr_inval; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); + + if (conf) { + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status == nfserr_replay_cache) { + dprintk("Got a create_session replay! seqid= %d\n", + cs_slot->sl_seqid); + /* Return the cached reply status */ + status = nfsd4_replay_create_session(cr_ses, cs_slot); + goto out; + } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { + status = nfserr_seq_misordered; + dprintk("Sequence misordered!\n"); + dprintk("Expected seqid= %d but got seqid= %d\n", + cs_slot->sl_seqid, cr_ses->seqid); + goto out; + } + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out; + } + + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + /* an unconfirmed replay returns misordered */ + status = nfserr_seq_misordered; + goto out; + } + + confirm_me = true; + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + + /* + * XXX: we should probably set this at creation time, and check + * for consistent minorversion use throughout: + */ + conf->cl_minorversion = 1; + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + + status = nfserr_jukebox; + new = alloc_init_session(rqstp, conf, cr_ses); + if (!new) + goto out; + status = nfs_ok; + memcpy(cr_ses->sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + memcpy(&cr_ses->fore_channel, &new->se_fchannel, + sizeof(struct nfsd4_channel_attrs)); + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + /* cache solo and embedded create sessions under the state lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); + if (confirm_me) + move_to_confirmed(conf); +out: + nfs4_unlock_state(); + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + return argp->opcnt == resp->opcnt; +} + +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + }; + return nfserr_inval; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) 
+{ + __be32 status; + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&client_lock); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + /* Sorta weird: we only need the refcnt'ing because new_conn acquires + * client_lock itself: */ + if (cstate->session) { + nfsd4_get_session(cstate->session); + atomic_inc(&cstate->session->se_client->cl_refcount); + } + spin_unlock(&client_lock); + if (!cstate->session) + return nfserr_badsession; + + status = nfsd4_map_bcts_dir(&bcts->dir); + if (!status) + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return status; +} + +static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) +{ + if (!session) + return 0; + return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_destroy_session *sessionid) +{ + struct nfsd4_session *ses; + u32 status = nfserr_badsession; + + /* Notes: + * - The confirmed nfs4_client->cl_sessionid holds destroyed sessionid + * - Should we return nfserr_back_chan_busy if waiting for + * callbacks on to-be-destroyed session? + * - Do we need to clear any callback info from previous session? + */ + + if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { + if (!nfsd4_last_compound_op(r)) + return nfserr_not_only_op; + } + dump_sessionid(__func__, &sessionid->sessionid); + spin_lock(&client_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { + spin_unlock(&client_lock); + goto out; + } + + unhash_session(ses); + spin_unlock(&client_lock); + + nfs4_lock_state(); + nfsd4_probe_callback_sync(ses->se_client); + nfs4_unlock_state(); + + nfsd4_del_conns(ses); + + nfsd4_put_session(ses); + status = nfs_ok; +out: + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) +{ + struct nfsd4_conn *c; + + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_xprt == xpt) { + return c; + } + } + return NULL; +} + +static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_conn *c; + int ret; + + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(new->cn_xprt, ses); + if (c) { + spin_unlock(&clp->cl_lock); + free_conn(new); + return; + } + __nfsd4_hash_conn(new, ses); + spin_unlock(&clp->cl_lock); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); + return; +} + +static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + return args->opcnt > session->se_fchannel.maxops; +} + +__be32 +nfsd4_sequence(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_sequence *seq) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_session *session; + struct nfsd4_slot *slot; + struct nfsd4_conn *conn; + int status; + + if (resp->opcnt != 1) + return nfserr_sequence_pos; + + /* + * Will be either used or freed by nfsd4_sequence_check_conn + * below.
+ */ + conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); + if (!conn) + return nfserr_jukebox; + + spin_lock(&client_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) + goto out; + + status = nfserr_too_many_ops; + if (nfsd4_session_too_many_ops(rqstp, session)) + goto out; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fchannel.maxreqs) + goto out; + + slot = session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + /* We do not negotiate the number of slots yet, so set the + * maxslots to the session maxreqs which is used to encode + * sr_highest_slotid and the sr_target_slot id to maxslots */ + seq->maxslots = session->se_fchannel.maxreqs; + + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse); + if (status == nfserr_replay_cache) { + cstate->slot = slot; + cstate->session = session; + /* Return the cached reply status and set cstate->status + * for nfsd4_proc_compound processing */ + status = nfsd4_replay_cache_entry(resp, seq); + cstate->status = nfserr_replay_cache; + goto out; + } + if (status) + goto out; + + nfsd4_sequence_check_conn(conn, session); + conn = NULL; + + /* Success! bump slot seqid */ + slot->sl_inuse = true; + slot->sl_seqid = seq->seqid; + slot->sl_cachethis = seq->cachethis; + + cstate->slot = slot; + cstate->session = session; + +out: + /* Hold a session reference until done processing the compound. */ + if (cstate->session) { + struct nfs4_client *clp = session->se_client; + + nfsd4_get_session(cstate->session); + atomic_inc(&clp->cl_refcount); + if (clp->cl_cb_state == NFSD4_CB_DOWN) + seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + } + kfree(conn); + spin_unlock(&client_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; +} + +__be32 +nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) +{ + int status = 0; + + if (rc->rca_one_fs) { + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + /* + * We don't take advantage of the rca_one_fs case. + * That's OK, it's optional, we can safely ignore it. + */ + return nfs_ok; + } + + nfs4_lock_state(); + status = nfserr_complete_already; + if (cstate->session->se_client->cl_firststate) + goto out; + + status = nfserr_stale_clientid; + if (is_client_expired(cstate->session->se_client)) + /* + * The following error isn't really legal. + * But we only get here if the client just explicitly + * destroyed the client. Surely it no longer cares what + * error it gets back on an operation for the dead + * client. + */ + goto out; + + status = nfs_ok; + nfsd4_create_clid_dir(cstate->session->se_client); +out: + nfs4_unlock_state(); + return status; +} + +__be32 +nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) +{ + struct xdr_netobj clname = { + .len = setclid->se_namelen, + .data = setclid->se_name, + }; + nfs4_verifier clverifier = setclid->se_verf; + unsigned int strhashval; + struct nfs4_client *conf, *unconf, *new; + __be32 status; + char dname[HEXDIR_LEN]; + + if (!check_name(clname)) + return nfserr_inval; + + status = nfs4_make_rec_clidname(dname, &clname); + if (status) + return status; + + /* + * XXX The Duplicate Request Cache (DRC) has been checked (??) + * We get here on a DRC miss. 
+ */ + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + conf = find_confirmed_client_by_str(dname, strhashval); + if (conf) { + /* RFC 3530 14.2.33 CASE 0: */ + status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, + sizeof(addr_str)); + dprintk("NFSD: setclientid: string in use by client " + "at %s\n", addr_str); + goto out; + } + } + /* + * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION") + * has a description of SETCLIENTID request processing consisting + * of 5 bullet points, labeled as CASE0 - CASE4 below. + */ + unconf = find_unconfirmed_client_by_str(dname, strhashval); + status = nfserr_jukebox; + if (!conf) { + /* + * RFC 3530 14.2.33 CASE 4: + * placed first, because it is the normal case + */ + if (unconf) + expire_client(unconf); + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + gen_clid(new); + } else if (same_verf(&conf->cl_verifier, &clverifier)) { + /* + * RFC 3530 14.2.33 CASE 1: + * probable callback update + */ + if (unconf) { + /* Note this is removing unconfirmed {*x***}, + * which is stronger than RFC recommended {vxc**}. + * This has the advantage that there is at most + * one {*x***} in either list at any time. + */ + expire_client(unconf); + } + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + copy_clid(new, conf); + } else if (!unconf) { + /* + * RFC 3530 14.2.33 CASE 2: + * probable client reboot; state will be removed if + * confirmed. + */ + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + gen_clid(new); + } else { + /* + * RFC 3530 14.2.33 CASE 3: + * probable client reboot; state will be removed if + * confirmed. + */ + expire_client(unconf); + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + gen_clid(new); + } + /* + * XXX: we should probably set this at creation time, and check + * for consistent minorversion use throughout: + */ + new->cl_minorversion = 0; + gen_callback(new, setclid, rqstp); + add_to_unconfirmed(new, strhashval); + setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; + setclid->se_clientid.cl_id = new->cl_clientid.cl_id; + memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); + status = nfs_ok; +out: + nfs4_unlock_state(); + return status; +} + + +/* + * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has + * a description of SETCLIENTID_CONFIRM request processing consisting of 4 + * bullets, labeled as CASE1 - CASE4 below. + */ +__be32 +nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid_confirm *setclientid_confirm) +{ + struct sockaddr *sa = svc_addr(rqstp); + struct nfs4_client *conf, *unconf; + nfs4_verifier confirm = setclientid_confirm->sc_confirm; + clientid_t * clid = &setclientid_confirm->sc_clientid; + __be32 status; + + if (STALE_CLIENTID(clid)) + return nfserr_stale_clientid; + /* + * XXX The Duplicate Request Cache (DRC) has been checked (??) + * We get here on a DRC miss. 
+ */ + + nfs4_lock_state(); + + conf = find_confirmed_client(clid); + unconf = find_unconfirmed_client(clid); + + status = nfserr_clid_inuse; + if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa)) + goto out; + if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa)) + goto out; + + /* + * section 14.2.34 of RFC 3530 has a description of + * SETCLIENTID_CONFIRM request processing consisting + * of 4 bullet points, labeled as CASE1 - CASE4 below. + */ + if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { + /* + * RFC 3530 14.2.34 CASE 1: + * callback update + */ + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) + status = nfserr_clid_inuse; + else { + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + nfsd4_probe_callback(conf); + expire_client(unconf); + status = nfs_ok; + + } + } else if (conf && !unconf) { + /* + * RFC 3530 14.2.34 CASE 2: + * probable retransmitted request; play it safe and + * do nothing. + */ + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) + status = nfserr_clid_inuse; + else + status = nfs_ok; + } else if (!conf && unconf + && same_verf(&unconf->cl_confirm, &confirm)) { + /* + * RFC 3530 14.2.34 CASE 3: + * Normal case; new or rebooted client: + */ + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + status = nfserr_clid_inuse; + } else { + unsigned int hash = + clientstr_hashval(unconf->cl_recdir); + conf = find_confirmed_client_by_str(unconf->cl_recdir, + hash); + if (conf) { + nfsd4_remove_clid_dir(conf); + expire_client(conf); + } + move_to_confirmed(unconf); + conf = unconf; + nfsd4_probe_callback(conf); + status = nfs_ok; + } + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) + && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, + &confirm)))) { + /* + * RFC 3530 14.2.34 CASE 4: + * Client probably hasn't noticed that we rebooted yet. 
+ */ + status = nfserr_stale_clientid; + } else { + /* check that we have hit one of the cases...*/ + status = nfserr_clid_inuse; + } +out: + nfs4_unlock_state(); + return status; +} + +/* OPEN Share state helper functions */ +static inline struct nfs4_file * +alloc_init_file(struct inode *ino) +{ + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); + + fp = kmem_cache_alloc(file_slab, GFP_KERNEL); + if (fp) { + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); + return fp; + } + return NULL; +} + +static void +nfsd4_free_slab(struct kmem_cache **slab) +{ + if (*slab == NULL) + return; + kmem_cache_destroy(*slab); + *slab = NULL; +} + +void +nfsd4_free_slabs(void) +{ + nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); +} + +static int +nfsd4_init_slabs(void) +{ + stateowner_slab = kmem_cache_create("nfsd4_stateowners", + sizeof(struct nfs4_stateowner), 0, 0, NULL); + if (stateowner_slab == NULL) + goto out_nomem; + file_slab = kmem_cache_create("nfsd4_files", + sizeof(struct nfs4_file), 0, 0, NULL); + if (file_slab == NULL) + goto out_nomem; + stateid_slab = kmem_cache_create("nfsd4_stateids", + sizeof(struct nfs4_stateid), 0, 0, NULL); + if (stateid_slab == NULL) + goto out_nomem; + deleg_slab = kmem_cache_create("nfsd4_delegations", + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; + return 0; +out_nomem: + nfsd4_free_slabs(); + dprintk("nfsd4: out of memory while initializing nfsv4\n"); + return -ENOMEM; +} + +void +nfs4_free_stateowner(struct kref *kref) +{ + struct nfs4_stateowner *sop = + container_of(kref, struct nfs4_stateowner, so_ref); + kfree(sop->so_owner.data); + kmem_cache_free(stateowner_slab, sop); +} + +static inline struct nfs4_stateowner * +alloc_stateowner(struct xdr_netobj *owner) +{ + struct nfs4_stateowner *sop; + + if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { + if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { + memcpy(sop->so_owner.data, owner->data, owner->len); + sop->so_owner.len = owner->len; + kref_init(&sop->so_ref); + return sop; + } + kmem_cache_free(stateowner_slab, sop); + } + return NULL; +} + +static struct nfs4_stateowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_stateowner *sop; + struct nfs4_replay *rp; + unsigned int idhashval; + + if (!(sop = alloc_stateowner(&open->op_owner))) + return NULL; + idhashval = ownerid_hashval(current_ownerid); + INIT_LIST_HEAD(&sop->so_idhash); + INIT_LIST_HEAD(&sop->so_strhash); + INIT_LIST_HEAD(&sop->so_perclient); + INIT_LIST_HEAD(&sop->so_stateids); + INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ + INIT_LIST_HEAD(&sop->so_close_lru); + sop->so_time = 0; + list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&sop->so_perclient, &clp->cl_openowners); + sop->so_is_open_owner = 1; + sop->so_id = current_ownerid++; + sop->so_client = clp; + sop->so_seqid = open->op_seqid; + sop->so_confirmed = 0; + rp = 
&sop->so_replay; + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; + return sop; +} + +static inline void +init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { + struct nfs4_stateowner *sop = open->op_stateowner; + unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + + INIT_LIST_HEAD(&stp->st_hash); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; + stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; + stp->st_access_bmap = 0; + stp->st_deny_bmap = 0; + __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, + &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); + stp->st_openstp = NULL; +} + +static void +move_to_close_lru(struct nfs4_stateowner *sop) +{ + dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + + list_move_tail(&sop->so_close_lru, &close_lru); + sop->so_time = get_seconds(); +} + +static int +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, + clientid_t *clid) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && + (sop->so_client->cl_clientid.cl_id == clid->cl_id); +} + +static struct nfs4_stateowner * +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) +{ + struct nfs4_stateowner *so = NULL; + + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; + } + return NULL; +} + +/* search file_hashtbl[] for file */ +static struct nfs4_file * +find_file(struct inode *ino) +{ + unsigned int hashval = file_hashval(ino); + struct nfs4_file *fp; + + spin_lock(&recall_lock); + list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + if (fp->fi_inode == ino) { + get_nfs4_file(fp); + spin_unlock(&recall_lock); + return fp; + } + } + spin_unlock(&recall_lock); + return NULL; +} + +static inline int access_valid(u32 x, u32 minorversion) +{ + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) + return 0; + if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) + return 0; + x &= ~NFS4_SHARE_ACCESS_MASK; + if (minorversion && x) { + if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) + return 0; + if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) + return 0; + x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); + } + if (x) + return 0; + return 1; +} + +static inline int deny_valid(u32 x) +{ + /* Note: unlike access bits, deny bits may be zero. 
*/ + return x <= NFS4_SHARE_DENY_BOTH; +} + +/* + * Called to check deny when READ with all zero stateid or + * WRITE with all zero or all one stateid + */ +static __be32 +nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) +{ + struct inode *ino = current_fh->fh_dentry->d_inode; + struct nfs4_file *fp; + struct nfs4_stateid *stp; + __be32 ret; + + dprintk("NFSD: nfs4_share_conflict\n"); + + fp = find_file(ino); + if (!fp) + return nfs_ok; + ret = nfserr_locked; + /* Search for conflicting share reservations */ + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { + if (test_bit(deny_type, &stp->st_deny_bmap) || + test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap)) + goto out; + } + ret = nfs_ok; +out: + put_nfs4_file(fp); + return ret; +} + +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +{ + /* We're assuming the state code never drops its reference + * without first removing the lease. Since we're in this lease + * callback (and since the lease code is serialized by the kernel + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference: */ + atomic_inc(&dp->dl_count); + + list_add_tail(&dp->dl_recall_lru, &del_recall_lru); + + /* only place dl_time is set. protected by lock_flocks*/ + dp->dl_time = get_seconds(); + + nfsd4_cb_recall(dp); +} + +/* Called from break_lease() with lock_flocks() held. */ +static void nfsd_break_deleg_cb(struct file_lock *fl) +{ + struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; + struct nfs4_delegation *dp; + + BUG_ON(!fp); + /* We assume break_lease is only called once per lease: */ + BUG_ON(fp->fi_had_conflict); + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a delegation isn't returned + * in time: + */ + fl->fl_break_time = 0; + + spin_lock(&recall_lock); + fp->fi_had_conflict = true; + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); + spin_unlock(&recall_lock); +} + +static +int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) +{ + if (arg & F_UNLCK) + return lease_modify(onlist, arg); + else + return -EAGAIN; +} + +static const struct lock_manager_operations nfsd_lease_mng_ops = { + .fl_break = nfsd_break_deleg_cb, + .fl_change = nfsd_change_deleg_cb, +}; + + +__be32 +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + clientid_t *clientid = &open->op_clientid; + struct nfs4_client *clp = NULL; + unsigned int strhashval; + struct nfs4_stateowner *sop = NULL; + + if (!check_name(open->op_owner)) + return nfserr_inval; + + if (STALE_CLIENTID(&open->op_clientid)) + return nfserr_stale_clientid; + + strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); + sop = find_openstateowner_str(strhashval, open); + open->op_stateowner = sop; + if (!sop) { + /* Make sure the client's lease hasn't expired. */ + clp = find_confirmed_client(clientid); + if (clp == NULL) + return nfserr_expired; + goto renew; + } + /* When sessions are used, skip open sequenceid processing */ + if (nfsd4_has_session(cstate)) + goto renew; + if (!sop->so_confirmed) { + /* Replace unconfirmed owners without checking for replay. */ + clp = sop->so_client; + release_openowner(sop); + open->op_stateowner = NULL; + goto renew; + } + if (open->op_seqid == sop->so_seqid - 1) { + if (sop->so_replay.rp_buflen) + return nfserr_replay_me; + /* The original OPEN failed so spectacularly + * that we don't even have replay data saved! 
+ * Therefore, we have no choice but to continue + * processing this OPEN; presumably, we'll + * fail again for the same reason. + */ + dprintk("nfsd4_process_open1: replay with no replay cache\n"); + goto renew; + } + if (open->op_seqid != sop->so_seqid) + return nfserr_bad_seqid; +renew: + if (open->op_stateowner == NULL) { + sop = alloc_init_open_stateowner(strhashval, clp, open); + if (sop == NULL) + return nfserr_jukebox; + open->op_stateowner = sop; + } + list_del_init(&sop->so_close_lru); + renew_client(sop->so_client); + return nfs_ok; +} + +static inline __be32 +nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) +{ + if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + return nfserr_openmode; + else + return nfs_ok; +} + +static struct nfs4_delegation * +find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +{ + struct nfs4_delegation *dp; + + spin_lock(&recall_lock); + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + spin_unlock(&recall_lock); + return dp; + } + spin_unlock(&recall_lock); + return NULL; +} + +static int share_access_to_flags(u32 share_access) +{ + share_access &= ~NFS4_SHARE_WANT_MASK; + + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; +} + +static __be32 +nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, + struct nfs4_delegation **dp) +{ + int flags; + __be32 status = nfserr_bad_stateid; + + *dp = find_delegation_file(fp, &open->op_delegate_stateid); + if (*dp == NULL) + goto out; + flags = share_access_to_flags(open->op_share_access); + status = nfs4_check_delegmode(*dp, flags); + if (status) + *dp = NULL; +out: + if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + return nfs_ok; + if (status) + return status; + open->op_stateowner->so_confirmed = 1; + return nfs_ok; +} + +static __be32 +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +{ + struct nfs4_stateid *local; + __be32 status = nfserr_share_denied; + struct nfs4_stateowner *sop = open->op_stateowner; + + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { + /* ignore lock owners */ + if (local->st_stateowner->so_is_open_owner == 0) + continue; + /* remember if we have seen this open owner */ + if (local->st_stateowner == sop) + *stpp = local; + /* check for conflicting share reservations */ + if (!test_share(local, open)) + goto out; + } + status = 0; +out: + return status; +} + +static inline struct nfs4_stateid * +nfs4_alloc_stateid(void) +{ + return kmem_cache_alloc(stateid_slab, GFP_KERNEL); +} + +static inline int nfs4_access_to_access(u32 nfs4_access) +{ + int flags = 0; + + if (nfs4_access & NFS4_SHARE_ACCESS_READ) + flags |= NFSD_MAY_READ; + if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) + flags |= NFSD_MAY_WRITE; + return flags; +} + +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfsd4_open *open) +{ + __be32 status; + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + + /* CLAIM_DELEGATE_CUR is used in response to a broken lease; + * allowing it to break the lease and return EAGAIN leaves the + * client unable to make progress in returning the delegation */ + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + access |= NFSD_MAY_NOT_BREAK_LEASE; + + if (!fp->fi_fds[oflag]) { + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, + &fp->fi_fds[oflag]); + if (status) + 
return status; + } + nfs4_file_get_access(fp, oflag); + + return nfs_ok; +} + +static __be32 +nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, + struct nfs4_file *fp, struct svc_fh *cur_fh, + struct nfsd4_open *open) +{ + struct nfs4_stateid *stp; + __be32 status; + + stp = nfs4_alloc_stateid(); + if (stp == NULL) + return nfserr_jukebox; + + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); + if (status) { + kmem_cache_free(stateid_slab, stp); + return status; + } + *stpp = stp; + return 0; +} + +static inline __be32 +nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, + struct nfsd4_open *open) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = 0, + }; + if (!open->op_truncate) + return 0; + if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); +} + +static __be32 +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +{ + u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + bool new_access; + __be32 status; + + new_access = !test_bit(op_share_access, &stp->st_access_bmap); + if (new_access) { + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); + if (status) + return status; + } + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status) { + if (new_access) { + int oflag = nfs4_access_to_omode(op_share_access); + nfs4_file_put_access(fp, oflag); + } + return status; + } + /* remember the open */ + __set_bit(op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); + + return nfs_ok; +} + + +static void +nfs4_set_claim_prev(struct nfsd4_open *open) +{ + open->op_stateowner->so_confirmed = 1; + open->op_stateowner->so_client->cl_firststate = 1; +} + +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + return NULL; + locks_init_lock(fl); + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_LEASE; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)(dp->dl_file); + fl->fl_pid = current->tgid; + return fl; +} + +static int nfs4_setlease(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + struct file_lock *fl; + int status; + + fl = nfs4_alloc_init_lease(dp, flag); + if (!fl) + return -ENOMEM; + fl->fl_file = find_readable_file(fp); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); + if (status) { + list_del_init(&dp->dl_perclnt); + locks_free_lock(fl); + return -ENOMEM; + } + fp->fi_lease = fl; + fp->fi_deleg_file = fl->fl_file; + get_file(fp->fi_deleg_file); + atomic_set(&fp->fi_delegees, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + return 0; +} + +static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + + if (!fp->fi_lease) + return nfs4_setlease(dp, flag); + spin_lock(&recall_lock); + if (fp->fi_had_conflict) { + spin_unlock(&recall_lock); + return -EAGAIN; + } + atomic_inc(&fp->fi_delegees); + list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_unlock(&recall_lock); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + return 0; +} + +/* + * Attempt to hand out a delegation. + */ +static void +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +{ + struct nfs4_delegation *dp; + struct nfs4_stateowner *sop = stp->st_stateowner; + int cb_up; + int status, flag = 0; + + cb_up = nfsd4_cb_channel_good(sop->so_client); + flag = NFS4_OPEN_DELEGATE_NONE; + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!cb_up) + open->op_recall = 1; + flag = open->op_delegate_type; + if (flag == NFS4_OPEN_DELEGATE_NONE) + goto out; + break; + case NFS4_OPEN_CLAIM_NULL: + /* Let's not give out any delegations till everyone's + * had the chance to reclaim theirs.... */ + if (locks_in_grace()) + goto out; + if (!cb_up || !sop->so_confirmed) + goto out; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + flag = NFS4_OPEN_DELEGATE_WRITE; + else + flag = NFS4_OPEN_DELEGATE_READ; + break; + default: + goto out; + } + + dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + if (dp == NULL) + goto out_no_deleg; + status = nfs4_set_delegation(dp, flag); + if (status) + goto out_free; + + memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + + dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", + STATEID_VAL(&dp->dl_stateid)); +out: + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS + && flag == NFS4_OPEN_DELEGATE_NONE + && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); + open->op_delegate_type = flag; + return; +out_free: + nfs4_put_delegation(dp); +out_no_deleg: + flag = NFS4_OPEN_DELEGATE_NONE; + goto out; +} + +/* + * called with nfs4_lock_state() held. 
+ */ +__be32 +nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_file *fp = NULL; + struct inode *ino = current_fh->fh_dentry->d_inode; + struct nfs4_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + __be32 status; + + status = nfserr_inval; + if (!access_valid(open->op_share_access, resp->cstate.minorversion) + || !deny_valid(open->op_share_deny)) + goto out; + /* + * Lookup file; if found, lookup stateid and check open request, + * and check for delegations in the process of being recalled. + * If not found, create the nfs4_file struct + */ + fp = find_file(ino); + if (fp) { + if ((status = nfs4_check_open(fp, open, &stp))) + goto out; + status = nfs4_check_deleg(fp, open, &dp); + if (status) + goto out; + } else { + status = nfserr_bad_stateid; + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_jukebox; + fp = alloc_init_file(ino); + if (fp == NULL) + goto out; + } + + /* + * OPEN the file, or upgrade an existing OPEN. + * If truncate fails, the OPEN fails. + */ + if (stp) { + /* Stateid was found, this is an OPEN upgrade */ + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); + if (status) + goto out; + update_stateid(&stp->st_stateid); + } else { + status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + if (status) + goto out; + init_stateid(stp, fp, open); + status = nfsd4_truncate(rqstp, current_fh, open); + if (status) { + release_open_stateid(stp); + goto out; + } + if (nfsd4_has_session(&resp->cstate)) + update_stateid(&stp->st_stateid); + } + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + + if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; + + /* + * Attempt to hand out a delegation. No error return, because the + * OPEN succeeds even if we fail. + */ + nfs4_open_delegation(current_fh, open, stp); + + status = nfs_ok; + + dprintk("%s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&stp->st_stateid)); +out: + if (fp) + put_nfs4_file(fp); + if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + nfs4_set_claim_prev(open); + /* + * To finish the open response, we just need to set the rflags. + */ + open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; + if (!open->op_stateowner->so_confirmed && + !nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + + return status; +} + +__be32 +nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + clientid_t *clid) +{ + struct nfs4_client *clp; + __be32 status; + + nfs4_lock_state(); + dprintk("process_renew(%08x/%08x): starting\n", + clid->cl_boot, clid->cl_id); + status = nfserr_stale_clientid; + if (STALE_CLIENTID(clid)) + goto out; + clp = find_confirmed_client(clid); + status = nfserr_expired; + if (clp == NULL) { + /* We assume the client took too long to RENEW. 
*/ + dprintk("nfsd4_renew: clientid not found!\n"); + goto out; + } + renew_client(clp); + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) + && clp->cl_cb_state != NFSD4_CB_UP) + goto out; + status = nfs_ok; +out: + nfs4_unlock_state(); + return status; +} + +static struct lock_manager nfsd4_manager = { +}; + +static void +nfsd4_end_grace(void) +{ + dprintk("NFSD: end of grace period\n"); + nfsd4_recdir_purge_old(); + locks_end_grace(&nfsd4_manager); + /* + * Now that every NFSv4 client has had the chance to recover and + * to see the (possibly new, possibly shorter) lease time, we + * can safely set the next grace time to the current lease time: + */ + nfsd4_grace = nfsd4_lease; +} + +static time_t +nfs4_laundromat(void) +{ + struct nfs4_client *clp; + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head *pos, *next, reaplist; + time_t cutoff = get_seconds() - nfsd4_lease; + time_t t, clientid_val = nfsd4_lease; + time_t u, test_val = nfsd4_lease; + + nfs4_lock_state(); + + dprintk("NFSD: laundromat service - starting\n"); + if (locks_in_grace()) + nfsd4_end_grace(); + INIT_LIST_HEAD(&reaplist); + spin_lock(&client_lock); + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { + t = clp->cl_time - cutoff; + if (clientid_val > t) + clientid_val = t; + break; + } + if (atomic_read(&clp->cl_refcount)) { + dprintk("NFSD: client in use (clientid %08x)\n", + clp->cl_clientid.cl_id); + continue; + } + unhash_client_locked(clp); + list_add(&clp->cl_lru, &reaplist); + } + spin_unlock(&client_lock); + list_for_each_safe(pos, next, &reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); + nfsd4_remove_clid_dir(clp); + expire_client(clp); + } + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { + u = dp->dl_time - cutoff; + if (test_val > u) + test_val = u; + break; + } + list_move(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&recall_lock); + list_for_each_safe(pos, next, &reaplist) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } + test_val = nfsd4_lease; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { + u = sop->so_time - cutoff; + if (test_val > u) + test_val = u; + break; + } + dprintk("NFSD: purging unused open stateowner (so_id %d)\n", + sop->so_id); + release_openowner(sop); + } + if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) + clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; + nfs4_unlock_state(); + return clientid_val; +} + +static struct workqueue_struct *laundry_wq; +static void laundromat_main(struct work_struct *); +static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main); + +static void +laundromat_main(struct work_struct *not_used) +{ + time_t t; + + t = nfs4_laundromat(); + dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); + queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); +} + +static struct nfs4_stateowner * +search_close_lru(u32 st_id, int flags) +{ + struct nfs4_stateowner *local = NULL; + + if (flags & CLOSE_STATE) { + 
list_for_each_entry(local, &close_lru, so_close_lru) { + if (local->so_id == st_id) + return local; + } + } + return NULL; +} + +static inline int +nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) +{ + return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; +} + +static int +STALE_STATEID(stateid_t *stateid) +{ + if (stateid->si_boot == boot_time) + return 0; + dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); + return 1; +} + +static inline int +access_permit_read(unsigned long access_bmap) +{ + return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || + test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) || + test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap); +} + +static inline int +access_permit_write(unsigned long access_bmap) +{ + return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || + test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); +} + +static +__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +{ + __be32 status = nfserr_openmode; + + /* For lock stateid's, we test the parent open, not the lock: */ + if (stp->st_openstp) + stp = stp->st_openstp; + if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) + goto out; + if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) + goto out; + status = nfs_ok; +out: + return status; +} + +static inline __be32 +check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) +{ + if (ONE_STATEID(stateid) && (flags & RD_STATE)) + return nfs_ok; + else if (locks_in_grace()) { + /* Answer in remaining cases depends on existence of + * conflicting state; so we must wait out the grace period. */ + return nfserr_grace; + } else if (flags & WR_STATE) + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_WRITE); + else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */ + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_READ); +} + +/* + * Allow READ/WRITE during grace period on recovered state only for files + * that are not able to provide mandatory locking. + */ +static inline int +grace_disallows_io(struct inode *inode) +{ + return locks_in_grace() && mandatory_lock(inode); +} + +static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +{ + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. + */ + if ((flags & HAS_SESSION) && in->si_generation == 0) + goto out; + + /* If the client sends us a stateid from the future, it's buggy: */ + if (in->si_generation > ref->si_generation) + return nfserr_bad_stateid; + /* + * The following, however, can happen. For example, if the + * client sends an open and some IO at the same time, the open + * may bump si_generation while the IO is still in flight. + * Thanks to hard links and renames, the client never knows what + * file an open will affect. So it could avoid that situation + * only by serializing all opens and IO from the same open + * owner. 
To recover from the old_stateid error, the client + * will just have to retry the IO: + */ + if (in->si_generation < ref->si_generation) + return nfserr_old_stateid; +out: + return nfs_ok; +} + +static int is_delegation_stateid(stateid_t *stateid) +{ + return stateid->si_fileid == 0; +} + +/* +* Checks for stateid operations +*/ +__be32 +nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filpp) +{ + struct nfs4_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + struct svc_fh *current_fh = &cstate->current_fh; + struct inode *ino = current_fh->fh_dentry->d_inode; + __be32 status; + + if (filpp) + *filpp = NULL; + + if (grace_disallows_io(ino)) + return nfserr_grace; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return check_special_stateids(current_fh, stateid, flags); + + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + + /* + * We assume that any stateid that has the current boot time, + * but that we can't find, is expired: + */ + status = nfserr_expired; + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) + goto out; + status = nfs4_check_delegmode(dp, flags); + if (status) + goto out; + renew_client(dp->dl_client); + if (filpp) { + *filpp = dp->dl_file->fi_deleg_file; + BUG_ON(!*filpp); + } + } else { /* open or lock stateid */ + stp = find_stateid(stateid, flags); + if (!stp) + goto out; + status = nfserr_bad_stateid; + if (nfs4_check_fh(current_fh, stp)) + goto out; + if (!stp->st_stateowner->so_confirmed) + goto out; + status = check_stateid_generation(stateid, &stp->st_stateid, + flags); + if (status) + goto out; + status = nfs4_check_openmode(stp, flags); + if (status) + goto out; + renew_client(stp->st_stateowner->so_client); + if (filpp) { + if (flags & RD_STATE) + *filpp = find_readable_file(stp->st_file); + else + *filpp = find_writeable_file(stp->st_file); + } + } + status = nfs_ok; +out: + return status; +} + +static inline int +setlkflg (int type) +{ + return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? + RD_STATE : WR_STATE; +} + +/* + * Checks for sequence id mutating operations. + */ +static __be32 +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, int flags, + struct nfs4_stateowner **sopp, + struct nfs4_stateid **stpp, struct nfsd4_lock *lock) +{ + struct nfs4_stateid *stp; + struct nfs4_stateowner *sop; + struct svc_fh *current_fh = &cstate->current_fh; + __be32 status; + + dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, + seqid, STATEID_VAL(stateid)); + + *stpp = NULL; + *sopp = NULL; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { + dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); + return nfserr_bad_stateid; + } + + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + + /* + * We return BAD_STATEID if filehandle doesn't match stateid, + * the confirmed flag is incorrecly set, or the generation + * number is incorrect. 
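+ * If no stateid can be found at all we also check the close LRU, + * since the request may simply be a replayed CLOSE (see + * search_close_lru() above).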
+ */ + stp = find_stateid(stateid, flags); + if (stp == NULL) { + /* + * Also, we should make sure this isn't just the result of + * a replayed close: + */ + sop = search_close_lru(stateid->si_stateownerid, flags); + /* It's not stale; let's assume it's expired: */ + if (sop == NULL) + return nfserr_expired; + *sopp = sop; + goto check_replay; + } + + *stpp = stp; + *sopp = sop = stp->st_stateowner; + + if (lock) { + clientid_t *lockclid = &lock->v.new.clientid; + struct nfs4_client *clp = sop->so_client; + int lkflg = 0; + __be32 status; + + lkflg = setlkflg(lock->lk_type); + + if (lock->lk_is_new) { + if (!sop->so_is_open_owner) + return nfserr_bad_stateid; + if (!(flags & HAS_SESSION) && + !same_clid(&clp->cl_clientid, lockclid)) + return nfserr_bad_stateid; + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; + } else { + /* stp is the lock stateid */ + status = nfs4_check_openmode(stp->st_openstp, lkflg); + if (status) + return status; + } + } + + if (nfs4_check_fh(current_fh, stp)) { + dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); + return nfserr_bad_stateid; + } + + /* + * We now validate the seqid and stateid generation numbers. + * For the moment, we ignore the possibility of + * generation number wraparound. + */ + if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) + goto check_replay; + + if (sop->so_confirmed && flags & CONFIRM) { + dprintk("NFSD: preprocess_seqid_op: expected" + " unconfirmed stateowner!\n"); + return nfserr_bad_stateid; + } + if (!sop->so_confirmed && !(flags & CONFIRM)) { + dprintk("NFSD: preprocess_seqid_op: stateowner not" + " confirmed yet!\n"); + return nfserr_bad_stateid; + } + status = check_stateid_generation(stateid, &stp->st_stateid, flags); + if (status) + return status; + renew_client(sop->so_client); + return nfs_ok; + +check_replay: + if (seqid == sop->so_seqid - 1) { + dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); + /* indicate replay to calling function */ + return nfserr_replay_me; + } + dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", + sop->so_seqid, seqid); + *sopp = NULL; + return nfserr_bad_seqid; +} + +__be32 +nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open_confirm *oc) +{ + __be32 status; + struct nfs4_stateowner *sop; + struct nfs4_stateid *stp; + + dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); + if (status) + return status; + + nfs4_lock_state(); + + if ((status = nfs4_preprocess_seqid_op(cstate, + oc->oc_seqid, &oc->oc_req_stateid, + CONFIRM | OPEN_STATE, + &oc->oc_stateowner, &stp, NULL))) + goto out; + + sop = oc->oc_stateowner; + sop->so_confirmed = 1; + update_stateid(&stp->st_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + + nfsd4_create_clid_dir(sop->so_client); +out: + if (oc->oc_stateowner) { + nfs4_get_stateowner(oc->oc_stateowner); + cstate->replay_owner = oc->oc_stateowner; + } + nfs4_unlock_state(); + return status; +} + +static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) +{ + int i; + + for (i = 1; i < 4; i++) { + if (test_bit(i, &stp->st_access_bmap) + && ((i & to_access) != i)) { + 
nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); + } + } +} + +static void +reset_union_bmap_deny(unsigned long deny, unsigned long *bmap) +{ + int i; + for (i = 0; i < 4; i++) { + if ((i & deny) != i) + __clear_bit(i, bmap); + } +} + +__be32 +nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_open_downgrade *od) +{ + __be32 status; + struct nfs4_stateid *stp; + + dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); + + if (!access_valid(od->od_share_access, cstate->minorversion) + || !deny_valid(od->od_share_deny)) + return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; + + nfs4_lock_state(); + if ((status = nfs4_preprocess_seqid_op(cstate, + od->od_seqid, + &od->od_stateid, + OPEN_STATE, + &od->od_stateowner, &stp, NULL))) + goto out; + + status = nfserr_inval; + if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { + dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", + stp->st_access_bmap, od->od_share_access); + goto out; + } + if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) { + dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", + stp->st_deny_bmap, od->od_share_deny); + goto out; + } + nfs4_file_downgrade(stp, od->od_share_access); + + reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); + + update_stateid(&stp->st_stateid); + memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + status = nfs_ok; +out: + if (od->od_stateowner) { + nfs4_get_stateowner(od->od_stateowner); + cstate->replay_owner = od->od_stateowner; + } + nfs4_unlock_state(); + return status; +} + +/* + * nfs4_unlock_state() called after encode + */ +__be32 +nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_close *close) +{ + __be32 status; + struct nfs4_stateid *stp; + + dprintk("NFSD: nfsd4_close on file %.*s\n", + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); + + nfs4_lock_state(); + /* check close_lru for replay */ + if ((status = nfs4_preprocess_seqid_op(cstate, + close->cl_seqid, + &close->cl_stateid, + OPEN_STATE | CLOSE_STATE, + &close->cl_stateowner, &stp, NULL))) + goto out; + status = nfs_ok; + update_stateid(&stp->st_stateid); + memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + + /* release_stateid() calls nfsd_close() if needed */ + release_open_stateid(stp); + + /* place unused nfs4_stateowners on so_close_lru list to be + * released by the laundromat service after the lease period + * to enable us to handle CLOSE replay + */ + if (list_empty(&close->cl_stateowner->so_stateids)) + move_to_close_lru(close->cl_stateowner); +out: + if (close->cl_stateowner) { + nfs4_get_stateowner(close->cl_stateowner); + cstate->replay_owner = close->cl_stateowner; + } + nfs4_unlock_state(); + return status; +} + +__be32 +nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_delegreturn *dr) +{ + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct inode *inode; + __be32 status; + int flags = 0; + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + return status; + inode = cstate->current_fh.fh_dentry->d_inode; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); + status = 
nfserr_bad_stateid; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + goto out; + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) + goto out; + status = nfserr_expired; + dp = find_delegation_stateid(inode, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; + renew_client(dp->dl_client); + + unhash_delegation(dp); +out: + nfs4_unlock_state(); + + return status; +} + + +/* + * Lock owner state (byte-range locks) + */ +#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) +#define LOCK_HASH_BITS 8 +#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) +#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end: NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1: NFS4_MAX_UINT64; +} + +#define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +static inline unsigned int +lock_ownerstr_hashval(struct inode *inode, u32 cl_id, + struct xdr_netobj *ownername) +{ + return (file_hashval(inode) + cl_id + + opaque_hashval(ownername->data, ownername->len)) + & LOCK_HASH_MASK; +} + +static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; +static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; +static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +static struct nfs4_stateid * +find_stateid(stateid_t *stid, int flags) +{ + struct nfs4_stateid *local; + u32 st_id = stid->si_stateownerid; + u32 f_id = stid->si_fileid; + unsigned int hashval; + + dprintk("NFSD: find_stateid flags 0x%x\n",flags); + if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && + (local->st_stateid.si_fileid == f_id)) + return local; + } + } + + if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && + (local->st_stateid.si_fileid == f_id)) + return local; + } + } + return NULL; +} + +static struct nfs4_delegation * +find_delegation_stateid(struct inode *ino, stateid_t *stid) +{ + struct nfs4_file *fp; + struct nfs4_delegation *dl; + + dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(stid)); + + fp = find_file(ino); + if (!fp) + return NULL; + dl = find_delegation_file(fp, stid); + put_nfs4_file(fp); + return dl; +} + +/* + * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that + * we can't properly handle lock requests that go beyond the (2^63 - 1)-th + * byte, because of sign extension problems. Since NFSv4 calls for 64-bit + * locking, this prevents us from being completely protocol-compliant. The + * real solution to this problem is to start using unsigned file offsets in + * the VFS, but this is a very deep change! 
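+ * E.g. a lock whose offset or computed end is at or beyond 2^63 ends + * up negative in the signed fl_start/fl_end fields; + * nfs4_transform_lock_offset() below simply clamps such values to + * OFFSET_MAX.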
+ */ +static inline void +nfs4_transform_lock_offset(struct file_lock *lock) +{ + if (lock->fl_start < 0) + lock->fl_start = OFFSET_MAX; + if (lock->fl_end < 0) + lock->fl_end = OFFSET_MAX; +} + +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +static const struct lock_manager_operations nfsd_posix_mng_ops = { +}; + +static inline void +nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) +{ + struct nfs4_stateowner *sop; + + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + kref_get(&sop->so_ref); + deny->ld_sop = sop; + deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; + } + deny->ld_start = fl->fl_start; + deny->ld_length = NFS4_MAX_UINT64; + if (fl->fl_end != NFS4_MAX_UINT64) + deny->ld_length = fl->fl_end - fl->fl_start + 1; + deny->ld_type = NFS4_READ_LT; + if (fl->fl_type != F_RDLCK) + deny->ld_type = NFS4_WRITE_LT; +} + +static struct nfs4_stateowner * +find_lockstateowner_str(struct inode *inode, clientid_t *clid, + struct xdr_netobj *owner) +{ + unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); + struct nfs4_stateowner *op; + + list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(op, owner, clid)) + return op; + } + return NULL; +} + +/* + * Alloc a lock owner structure. + * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has + * occurred. + * + * strhashval = lock_ownerstr_hashval + */ + +static struct nfs4_stateowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { + struct nfs4_stateowner *sop; + struct nfs4_replay *rp; + unsigned int idhashval; + + if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + return NULL; + idhashval = lockownerid_hashval(current_ownerid); + INIT_LIST_HEAD(&sop->so_idhash); + INIT_LIST_HEAD(&sop->so_strhash); + INIT_LIST_HEAD(&sop->so_perclient); + INIT_LIST_HEAD(&sop->so_stateids); + INIT_LIST_HEAD(&sop->so_perstateid); + INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ + sop->so_time = 0; + list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&sop->so_perstateid, &open_stp->st_lockowners); + sop->so_is_open_owner = 0; + sop->so_id = current_ownerid++; + sop->so_client = clp; + /* It is the openowner seqid that will be incremented in encode in the + * case of new lockowners; so increment the lock seqid manually: */ + sop->so_seqid = lock->lk_new_lock_seqid + 1; + sop->so_confirmed = 1; + rp = &sop->so_replay; + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; + return sop; +} + +static struct nfs4_stateid * +alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +{ + struct nfs4_stateid *stp; + unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + + stp = nfs4_alloc_stateid(); + if (stp == NULL) + goto out; + INIT_LIST_HEAD(&stp->st_hash); + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); + stp->st_stateowner = 
sop; + get_nfs4_file(fp); + stp->st_file = fp; + stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; + stp->st_access_bmap = 0; + stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; + +out: + return stp; +} + +static int +check_lock_length(u64 offset, u64 length) +{ + return ((length == 0) || ((length != NFS4_MAX_UINT64) && + LOFF_OVERFLOW(offset, length))); +} + +static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +{ + struct nfs4_file *fp = lock_stp->st_file; + int oflag = nfs4_access_to_omode(access); + + if (test_bit(access, &lock_stp->st_access_bmap)) + return; + nfs4_file_get_access(fp, oflag); + __set_bit(access, &lock_stp->st_access_bmap); +} + +/* + * LOCK operation + */ +__be32 +nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lock *lock) +{ + struct nfs4_stateowner *open_sop = NULL; + struct nfs4_stateowner *lock_sop = NULL; + struct nfs4_stateid *lock_stp; + struct nfs4_file *fp; + struct file *filp = NULL; + struct file_lock file_lock; + struct file_lock conflock; + __be32 status = 0; + unsigned int strhashval; + int err; + + dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", + (long long) lock->lk_offset, + (long long) lock->lk_length); + + if (check_lock_length(lock->lk_offset, lock->lk_length)) + return nfserr_inval; + + if ((status = fh_verify(rqstp, &cstate->current_fh, + S_IFREG, NFSD_MAY_LOCK))) { + dprintk("NFSD: nfsd4_lock: permission denied!\n"); + return status; + } + + nfs4_lock_state(); + + if (lock->lk_is_new) { + /* + * Client indicates that this is a new lockowner. + * Use open owner and open stateid to create lock owner and + * lock stateid. + */ + struct nfs4_stateid *open_stp = NULL; + + status = nfserr_stale_clientid; + if (!nfsd4_has_session(cstate) && + STALE_CLIENTID(&lock->lk_new_clientid)) + goto out; + + /* validate and update open stateid and open seqid */ + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_new_open_seqid, + &lock->lk_new_open_stateid, + OPEN_STATE, + &lock->lk_replay_owner, &open_stp, + lock); + if (status) + goto out; + open_sop = lock->lk_replay_owner; + /* create lockowner and lock stateid */ + fp = open_stp->st_file; + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->so_client->cl_clientid.cl_id, + &lock->v.new.owner); + /* XXX: Do we need to check for duplicate stateowners on + * the same file, or should they just be allowed (and + * create new stateids)? 
*/ + status = nfserr_jukebox; + lock_sop = alloc_init_lock_stateowner(strhashval, + open_sop->so_client, open_stp, lock); + if (lock_sop == NULL) + goto out; + lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); + if (lock_stp == NULL) + goto out; + } else { + /* lock (lock owner + lock stateid) already exists */ + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + LOCK_STATE, + &lock->lk_replay_owner, &lock_stp, lock); + if (status) + goto out; + lock_sop = lock->lk_replay_owner; + fp = lock_stp->st_file; + } + /* lock->lk_replay_owner and lock_stp have been created or found */ + + status = nfserr_grace; + if (locks_in_grace() && !lock->lk_reclaim) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace() && lock->lk_reclaim) + goto out; + + locks_init_lock(&file_lock); + switch (lock->lk_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + filp = find_readable_file(lock_stp->st_file); + if (filp) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); + file_lock.fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + filp = find_writeable_file(lock_stp->st_file); + if (filp) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); + file_lock.fl_type = F_WRLCK; + break; + default: + status = nfserr_inval; + goto out; + } + if (!filp) { + status = nfserr_openmode; + goto out; + } + file_lock.fl_owner = (fl_owner_t)lock_sop; + file_lock.fl_pid = current->tgid; + file_lock.fl_file = filp; + file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; + + file_lock.fl_start = lock->lk_offset; + file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); + nfs4_transform_lock_offset(&file_lock); + + /* + * Try to lock the file in the VFS. + * Note: locks.c uses the BKL to protect the inode's lock list. + */ + + err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); + switch (-err) { + case 0: /* success! */ + update_stateid(&lock_stp->st_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + sizeof(stateid_t)); + status = 0; + break; + case (EAGAIN): /* conflock holds conflicting lock */ + status = nfserr_denied; + dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); + nfs4_set_lock_denied(&conflock, &lock->lk_denied); + break; + case (EDEADLK): + status = nfserr_deadlock; + break; + default: + dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); + status = nfserrno(err); + break; + } +out: + if (status && lock->lk_is_new && lock_sop) + release_lockowner(lock_sop); + if (lock->lk_replay_owner) { + nfs4_get_stateowner(lock->lk_replay_owner); + cstate->replay_owner = lock->lk_replay_owner; + } + nfs4_unlock_state(); + return status; +} + +/* + * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, + * so we do a temporary open here just to get an open file to pass to + * vfs_test_lock. (Arguably perhaps test_lock should be done with an + * inode operation.) 
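+ * nfsd_test_lock() below is that helper: it opens the file with + * NFSD_MAY_READ, runs vfs_test_lock() on the resulting struct file, + * and closes it again straight away.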
+ */ +static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +{ + struct file *file; + __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (!err) { + err = nfserrno(vfs_test_lock(file, lock)); + nfsd_close(file); + } + return err; +} + +/* + * LOCKT operation + */ +__be32 +nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lockt *lockt) +{ + struct inode *inode; + struct file_lock file_lock; + __be32 status; + + if (locks_in_grace()) + return nfserr_grace; + + if (check_lock_length(lockt->lt_offset, lockt->lt_length)) + return nfserr_inval; + + lockt->lt_stateowner = NULL; + nfs4_lock_state(); + + status = nfserr_stale_clientid; + if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) + goto out; + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { + dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); + if (status == nfserr_symlink) + status = nfserr_inval; + goto out; + } + + inode = cstate->current_fh.fh_dentry->d_inode; + locks_init_lock(&file_lock); + switch (lockt->lt_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + file_lock.fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + file_lock.fl_type = F_WRLCK; + break; + default: + dprintk("NFSD: nfs4_lockt: bad lock type!\n"); + status = nfserr_inval; + goto out; + } + + lockt->lt_stateowner = find_lockstateowner_str(inode, + &lockt->lt_clientid, &lockt->lt_owner); + if (lockt->lt_stateowner) + file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + file_lock.fl_pid = current->tgid; + file_lock.fl_flags = FL_POSIX; + + file_lock.fl_start = lockt->lt_offset; + file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); + + nfs4_transform_lock_offset(&file_lock); + + status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); + if (status) + goto out; + + if (file_lock.fl_type != F_UNLCK) { + status = nfserr_denied; + nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); + } +out: + nfs4_unlock_state(); + return status; +} + +__be32 +nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_locku *locku) +{ + struct nfs4_stateid *stp; + struct file *filp = NULL; + struct file_lock file_lock; + __be32 status; + int err; + + dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", + (long long) locku->lu_offset, + (long long) locku->lu_length); + + if (check_lock_length(locku->lu_offset, locku->lu_length)) + return nfserr_inval; + + nfs4_lock_state(); + + if ((status = nfs4_preprocess_seqid_op(cstate, + locku->lu_seqid, + &locku->lu_stateid, + LOCK_STATE, + &locku->lu_stateowner, &stp, NULL))) + goto out; + + filp = find_any_file(stp->st_file); + if (!filp) { + status = nfserr_lock_range; + goto out; + } + BUG_ON(!filp); + locks_init_lock(&file_lock); + file_lock.fl_type = F_UNLCK; + file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_pid = current->tgid; + file_lock.fl_file = filp; + file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; + file_lock.fl_start = locku->lu_offset; + + file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); + nfs4_transform_lock_offset(&file_lock); + + /* + * Try to unlock the file in the VFS. + */ + err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL); + if (err) { + dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); + goto out_nfserr; + } + /* + * OK, unlock succeeded; the only thing left to do is update the stateid. 
+ */ + update_stateid(&stp->st_stateid); + memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + +out: + if (locku->lu_stateowner) { + nfs4_get_stateowner(locku->lu_stateowner); + cstate->replay_owner = locku->lu_stateowner; + } + nfs4_unlock_state(); + return status; + +out_nfserr: + status = nfserrno(err); + goto out; +} + +/* + * returns + * 1: locks held by lockowner + * 0: no locks held by lockowner + */ +static int +check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +{ + struct file_lock **flpp; + struct inode *inode = filp->fi_inode; + int status = 0; + + lock_flocks(); + for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { + if ((*flpp)->fl_owner == (fl_owner_t)lowner) { + status = 1; + goto out; + } + } +out: + unlock_flocks(); + return status; +} + +__be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_release_lockowner *rlockowner) +{ + clientid_t *clid = &rlockowner->rl_clientid; + struct nfs4_stateowner *sop; + struct nfs4_stateid *stp; + struct xdr_netobj *owner = &rlockowner->rl_owner; + struct list_head matches; + int i; + __be32 status; + + dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", + clid->cl_boot, clid->cl_id); + + /* XXX check for lease expiration */ + + status = nfserr_stale_clientid; + if (STALE_CLIENTID(clid)) + return status; + + nfs4_lock_state(); + + status = nfserr_locks_held; + /* XXX: we're doing a linear search through all the lockowners. + * Yipes! For now we'll just hope clients aren't really using + * release_lockowner much, but eventually we have to fix these + * data structures. */ + INIT_LIST_HEAD(&matches); + for (i = 0; i < LOCK_HASH_SIZE; i++) { + list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + if (!same_owner_str(sop, owner, clid)) + continue; + list_for_each_entry(stp, &sop->so_stateids, + st_perstateowner) { + if (check_for_locks(stp->st_file, sop)) + goto out; + /* Note: so_perclient unused for lockowners, + * so it's OK to fool with here. */ + list_add(&sop->so_perclient, &matches); + } + } + } + /* Clients probably won't expect us to return with some (but not all) + * of the lockowner state released; so don't release any until all + * have been checked. */ + status = nfs_ok; + while (!list_empty(&matches)) { + sop = list_entry(matches.next, struct nfs4_stateowner, + so_perclient); + /* unhash_stateowner deletes so_perclient only + * for openowners. */ + list_del(&sop->so_perclient); + release_lockowner(sop); + } +out: + nfs4_unlock_state(); + return status; +} + +static inline struct nfs4_client_reclaim * +alloc_reclaim(void) +{ + return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); +} + +int +nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) +{ + unsigned int strhashval = clientstr_hashval(name); + struct nfs4_client *clp; + + clp = find_confirmed_client_by_str(name, strhashval); + return clp ? 1 : 0; +} + +/* + * failure => all reset bets are off, nfserr_no_grace... 
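+ * (If the allocation below fails we return 0 and the client simply + * ends up with no reclaim record, so nfs4_check_open_reclaim() will + * later refuse its reclaim.)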
+ */ +int +nfs4_client_to_reclaim(const char *name) +{ + unsigned int strhashval; + struct nfs4_client_reclaim *crp = NULL; + + dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); + crp = alloc_reclaim(); + if (!crp) + return 0; + strhashval = clientstr_hashval(name); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); + memcpy(crp->cr_recdir, name, HEXDIR_LEN); + reclaim_str_hashtbl_size++; + return 1; +} + +static void +nfs4_release_reclaim(void) +{ + struct nfs4_client_reclaim *crp = NULL; + int i; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&reclaim_str_hashtbl[i])) { + crp = list_entry(reclaim_str_hashtbl[i].next, + struct nfs4_client_reclaim, cr_strhash); + list_del(&crp->cr_strhash); + kfree(crp); + reclaim_str_hashtbl_size--; + } + } + BUG_ON(reclaim_str_hashtbl_size); +} + +/* + * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ +static struct nfs4_client_reclaim * +nfs4_find_reclaim_client(clientid_t *clid) +{ + unsigned int strhashval; + struct nfs4_client *clp; + struct nfs4_client_reclaim *crp = NULL; + + + /* find clientid in conf_id_hashtbl */ + clp = find_confirmed_client(clid); + if (clp == NULL) + return NULL; + + dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", + clp->cl_name.len, clp->cl_name.data, + clp->cl_recdir); + + /* find clp->cl_name in reclaim_str_hashtbl */ + strhashval = clientstr_hashval(clp->cl_recdir); + list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { + if (same_name(crp->cr_recdir, clp->cl_recdir)) { + return crp; + } + } + return NULL; +} + +/* +* Called from OPEN. Look for clientid in reclaim list. +*/ +__be32 +nfs4_check_open_reclaim(clientid_t *clid) +{ + return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; +} + +/* initialization to perform at module load time: */ + +int +nfs4_state_init(void) +{ + int i, status; + + status = nfsd4_init_slabs(); + if (status) + return status; + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + INIT_LIST_HEAD(&conf_id_hashtbl[i]); + INIT_LIST_HEAD(&conf_str_hashtbl[i]); + INIT_LIST_HEAD(&unconf_str_hashtbl[i]); + INIT_LIST_HEAD(&unconf_id_hashtbl[i]); + INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); + } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&sessionid_hashtbl[i]); + for (i = 0; i < FILE_HASH_SIZE; i++) { + INIT_LIST_HEAD(&file_hashtbl[i]); + } + for (i = 0; i < OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&ownerstr_hashtbl[i]); + INIT_LIST_HEAD(&ownerid_hashtbl[i]); + } + for (i = 0; i < STATEID_HASH_SIZE; i++) { + INIT_LIST_HEAD(&stateid_hashtbl[i]); + INIT_LIST_HEAD(&lockstateid_hashtbl[i]); + } + for (i = 0; i < LOCK_HASH_SIZE; i++) { + INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); + INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); + } + memset(&onestateid, ~0, sizeof(stateid_t)); + INIT_LIST_HEAD(&close_lru); + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; + return 0; +} + +static void +nfsd4_load_reboot_recovery_data(void) +{ + int status; + + nfs4_lock_state(); + nfsd4_init_recdir(user_recovery_dirname); + status = nfsd4_recdir_load(); + nfs4_unlock_state(); + if (status) + printk("NFSD: Failure reading reboot recovery data\n"); +} + +/* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has + * the inode cached. 
This becomes an obvious problem the first time a + * client's inode cache approaches the size of the server's total memory. + * + * For now we avoid this problem by imposing a hard limit on the number + * of delegations, which varies according to the server's memory size. + */ +static void +set_max_delegations(void) +{ + /* + * Allow at most 4 delegations per megabyte of RAM. Quick + * estimates suggest that in the worst case (where every delegation + * is for a different inode), a delegation could take about 1.5K, + * giving a worst case usage of about 6% of memory. + */ + max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); +} + +/* initialization to perform when the nfsd service is started: */ + +static int +__nfs4_state_start(void) +{ + int ret; + + boot_time = get_seconds(); + locks_start_grace(&nfsd4_manager); + printk(KERN_INFO "NFSD: starting %ld-second grace period\n", + nfsd4_grace); + ret = set_callback_cred(); + if (ret) + return -ENOMEM; + laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; + ret = nfsd4_create_callback_queue(); + if (ret) + goto out_free_laundry; + queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); + set_max_delegations(); + return 0; +out_free_laundry: + destroy_workqueue(laundry_wq); + return ret; +} + +int +nfs4_state_start(void) +{ + nfsd4_load_reboot_recovery_data(); + return __nfs4_state_start(); +} + +static void +__nfs4_state_shutdown(void) +{ + int i; + struct nfs4_client *clp = NULL; + struct nfs4_delegation *dp = NULL; + struct list_head *pos, *next, reaplist; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&conf_id_hashtbl[i])) { + clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + expire_client(clp); + } + while (!list_empty(&unconf_str_hashtbl[i])) { + clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash); + expire_client(clp); + } + } + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_move(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&recall_lock); + list_for_each_safe(pos, next, &reaplist) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } + + nfsd4_shutdown_recdir(); +} + +void +nfs4_state_shutdown(void) +{ + cancel_delayed_work_sync(&laundromat_work); + destroy_workqueue(laundry_wq); + locks_end_grace(&nfsd4_manager); + nfs4_lock_state(); + nfs4_release_reclaim(); + __nfs4_state_shutdown(); + nfs4_unlock_state(); + nfsd4_destroy_callback_queue(); +} + +/* + * user_recovery_dirname is protected by the nfsd_mutex since it's only + * accessed when nfsd is starting. + */ +static void +nfs4_set_recdir(char *recdir) +{ + strcpy(user_recovery_dirname, recdir); +} + +/* + * Change the NFSv4 recovery directory to recdir. 
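+ * The new path is looked up with kern_path() and must refer to an + * existing directory; otherwise the old setting is kept and an error + * is returned.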
+ */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + nfs4_set_recdir(recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c new file mode 100644 index 00000000..6c740974 --- /dev/null +++ b/fs/nfsd/nfs4xdr.c @@ -0,0 +1,3408 @@ +/* + * Server-side XDR for NFSv4 + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * TODO: Neil Brown made the following observation: We currently + * initially reserve NFSD_BUFSIZE space on the transmit queue and + * never release any of that until the request is complete. + * It would be good to calculate a new maximum response size while + * decoding the COMPOUND, and call svc_reserve with this number + * at the end of nfs4svc_decode_compoundargs. 
+ */ + +#include +#include +#include +#include +#include + +#include "idmap.h" +#include "acl.h" +#include "xdr4.h" +#include "vfs.h" + + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing + * directory in order to indicate to the client that a filesystem boundary is present + * We use a fixed fsid for a referral + */ +#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL +#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL + +static __be32 +check_filename(char *str, int len, __be32 err) +{ + int i; + + if (len == 0) + return nfserr_inval; + if (isdotent(str, len)) + return err; + for (i = 0; i < len; i++) + if (str[i] == '/') + return err; + return 0; +} + +#define DECODE_HEAD \ + __be32 *p; \ + __be32 status +#define DECODE_TAIL \ + status = 0; \ +out: \ + return status; \ +xdr_error: \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + status = nfserr_bad_xdr; \ + goto out + +#define READ32(x) (x) = ntohl(*p++) +#define READ64(x) do { \ + (x) = (u64)ntohl(*p++) << 32; \ + (x) |= ntohl(*p++); \ +} while (0) +#define READTIME(x) do { \ + p++; \ + (x) = ntohl(*p++); \ + p++; \ +} while (0) +#define READMEM(x,nbytes) do { \ + x = (char *)p; \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define SAVEMEM(x,nbytes) do { \ + if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ + savemem(argp, p, nbytes) : \ + (char *)p)) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define COPYMEM(x,nbytes) do { \ + memcpy((x), p, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +} while (0) + +/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ +#define READ_BUF(nbytes) do { \ + if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ + p = argp->p; \ + argp->p += XDR_QUADLEN(nbytes); \ + } else if (!(p = read_buf(argp, nbytes))) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ +} while (0) + +static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) +{ + /* We want more bytes than seem to be available. + * Maybe we need a new page, maybe we have just run out + */ + unsigned int avail = (char *)argp->end - (char *)argp->p; + __be32 *p; + if (avail + argp->pagelen < nbytes) + return NULL; + if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ + return NULL; + /* ok, we can do it with the current plus the next page */ + if (nbytes <= sizeof(argp->tmp)) + p = argp->tmp; + else { + kfree(argp->tmpp); + p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; + + } + /* + * The following memcpy is safe because read_buf is always + * called with nbytes > avail, and the two cases above both + * guarantee p points to at least nbytes bytes. 
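+ * E.g. with avail == 4 bytes left in the current page and + * nbytes == 12, the first memcpy copies those 4 bytes and the second + * copies the remaining 8 from the start of the next page.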
+ */ + memcpy(p, argp->p, avail); + /* step to next page */ + argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + (argp->pagelen>>2); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } + memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); + argp->p += XDR_QUADLEN(nbytes - avail); + return p; +} + +static int zero_clientid(clientid_t *clid) +{ + return (clid->cl_boot == 0) && (clid->cl_id == 0); +} + +static int +defer_free(struct nfsd4_compoundargs *argp, + void (*release)(const void *), void *p) +{ + struct tmpbuf *tb; + + tb = kmalloc(sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + tb->buf = p; + tb->release = release; + tb->next = argp->to_free; + argp->to_free = tb; + return 0; +} + +static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) +{ + if (p == argp->tmp) { + p = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; + memcpy(p, argp->tmp, nbytes); + } else { + BUG_ON(p != argp->tmpp); + argp->tmpp = NULL; + } + if (defer_free(argp, kfree, p)) { + kfree(p); + return NULL; + } else + return (char *)p; +} + +static __be32 +nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +{ + u32 bmlen; + DECODE_HEAD; + + bmval[0] = 0; + bmval[1] = 0; + bmval[2] = 0; + + READ_BUF(4); + READ32(bmlen); + if (bmlen > 1000) + goto xdr_error; + + READ_BUF(bmlen << 2); + if (bmlen > 0) + READ32(bmval[0]); + if (bmlen > 1) + READ32(bmval[1]); + if (bmlen > 2) + READ32(bmval[2]); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, + struct iattr *iattr, struct nfs4_acl **acl) +{ + int expected_len, len = 0; + u32 dummy32; + char *buf; + int host_err; + + DECODE_HEAD; + iattr->ia_valid = 0; + if ((status = nfsd4_decode_bitmap(argp, bmval))) + return status; + + READ_BUF(4); + READ32(expected_len); + + if (bmval[0] & FATTR4_WORD0_SIZE) { + READ_BUF(8); + len += 8; + READ64(iattr->ia_size); + iattr->ia_valid |= ATTR_SIZE; + } + if (bmval[0] & FATTR4_WORD0_ACL) { + int nace; + struct nfs4_ace *ace; + + READ_BUF(4); len += 4; + READ32(nace); + + if (nace > NFS4_ACL_MAX) + return nfserr_resource; + + *acl = nfs4_acl_new(nace); + if (*acl == NULL) { + host_err = -ENOMEM; + goto out_nfserr; + } + defer_free(argp, kfree, *acl); + + (*acl)->naces = nace; + for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { + READ_BUF(16); len += 16; + READ32(ace->type); + READ32(ace->flag); + READ32(ace->access_mask); + READ32(dummy32); + READ_BUF(dummy32); + len += XDR_QUADLEN(dummy32) << 2; + READMEM(buf, dummy32); + ace->whotype = nfs4_acl_get_whotype(buf, dummy32); + status = nfs_ok; + if (ace->whotype != NFS4_ACL_WHO_NAMED) + ace->who = 0; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + buf, dummy32, &ace->who); + else + status = nfsd_map_name_to_uid(argp->rqstp, + buf, dummy32, &ace->who); + if (status) + return status; + } + } else + *acl = NULL; + if (bmval[1] & FATTR4_WORD1_MODE) { + READ_BUF(4); + len += 4; + READ32(iattr->ia_mode); + iattr->ia_mode &= (S_IFMT | S_IALLUGO); + iattr->ia_valid |= ATTR_MODE; + } + if (bmval[1] & FATTR4_WORD1_OWNER) { + READ_BUF(4); + len += 4; + READ32(dummy32); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; + iattr->ia_valid |= ATTR_UID; + } + if (bmval[1] & 
FATTR4_WORD1_OWNER_GROUP) { + READ_BUF(4); + len += 4; + READ32(dummy32); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; + iattr->ia_valid |= ATTR_GID; + } + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + READ_BUF(4); + len += 4; + READ32(dummy32); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + /* We require the high 32 bits of 'seconds' to be 0, and we ignore + all 32 bits of 'nseconds'. */ + READ_BUF(12); + len += 12; + READ32(dummy32); + if (dummy32) + return nfserr_inval; + READ32(iattr->ia_atime.tv_sec); + READ32(iattr->ia_atime.tv_nsec); + if (iattr->ia_atime.tv_nsec >= (u32)1000000000) + return nfserr_inval; + iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_ATIME; + break; + default: + goto xdr_error; + } + } + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + READ_BUF(4); + len += 4; + READ32(dummy32); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + /* We require the high 32 bits of 'seconds' to be 0, and we ignore + all 32 bits of 'nseconds'. */ + READ_BUF(12); + len += 12; + READ32(dummy32); + if (dummy32) + return nfserr_inval; + READ32(iattr->ia_mtime.tv_sec); + READ32(iattr->ia_mtime.tv_nsec); + if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) + return nfserr_inval; + iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_MTIME; + break; + default: + goto xdr_error; + } + } + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) + READ_BUF(expected_len - len); + else if (len != expected_len) + goto xdr_error; + + DECODE_TAIL; + +out_nfserr: + status = nfserrno(host_err); + goto out; +} + +static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(access->ac_req_access); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(bcts->dir); + /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker + * could help us figure out we should be using it. 
*/ + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +{ + DECODE_HEAD; + + close->cl_stateowner = NULL; + READ_BUF(4); + READ32(close->cl_seqid); + return nfsd4_decode_stateid(argp, &close->cl_stateid); + + DECODE_TAIL; +} + + +static __be32 +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +{ + DECODE_HEAD; + + READ_BUF(12); + READ64(commit->co_offset); + READ32(commit->co_count); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(create->cr_type); + switch (create->cr_type) { + case NF4LNK: + READ_BUF(4); + READ32(create->cr_linklen); + READ_BUF(create->cr_linklen); + SAVEMEM(create->cr_linkname, create->cr_linklen); + break; + case NF4BLK: + case NF4CHR: + READ_BUF(8); + READ32(create->cr_specdata1); + READ32(create->cr_specdata2); + break; + case NF4SOCK: + case NF4FIFO: + case NF4DIR: + default: + break; + } + + READ_BUF(4); + READ32(create->cr_namelen); + READ_BUF(create->cr_namelen); + SAVEMEM(create->cr_name, create->cr_namelen); + if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) + return status; + + status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, + &create->cr_acl); + if (status) + goto out; + + DECODE_TAIL; +} + +static inline __be32 +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +{ + return nfsd4_decode_stateid(argp, &dr->dr_stateid); +} + +static inline __be32 +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +{ + return nfsd4_decode_bitmap(argp, getattr->ga_bmval); +} + +static __be32 +nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(link->li_namelen); + READ_BUF(link->li_namelen); + SAVEMEM(link->li_name, link->li_namelen); + if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + DECODE_HEAD; + + lock->lk_replay_owner = NULL; + /* + * type, reclaim(boolean), offset, length, new_lock_owner(boolean) + */ + READ_BUF(28); + READ32(lock->lk_type); + if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) + goto xdr_error; + READ32(lock->lk_reclaim); + READ64(lock->lk_offset); + READ64(lock->lk_length); + READ32(lock->lk_is_new); + + if (lock->lk_is_new) { + READ_BUF(4); + READ32(lock->lk_new_open_seqid); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); + READ32(lock->lk_new_lock_seqid); + COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); + READ32(lock->lk_new_owner.len); + READ_BUF(lock->lk_new_owner.len); + READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); + } else { + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); + READ32(lock->lk_old_lock_seqid); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +{ + DECODE_HEAD; + + READ_BUF(32); + READ32(lockt->lt_type); + if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + goto xdr_error; + READ64(lockt->lt_offset); + READ64(lockt->lt_length); + COPYMEM(&lockt->lt_clientid, 8); + READ32(lockt->lt_owner.len); + 
READ_BUF(lockt->lt_owner.len); + READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +{ + DECODE_HEAD; + + locku->lu_stateowner = NULL; + READ_BUF(8); + READ32(locku->lu_type); + if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) + goto xdr_error; + READ32(locku->lu_seqid); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); + READ64(locku->lu_offset); + READ64(locku->lu_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(lookup->lo_len); + READ_BUF(lookup->lo_len); + SAVEMEM(lookup->lo_name, lookup->lo_len); + if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + DECODE_HEAD; + + memset(open->op_bmval, 0, sizeof(open->op_bmval)); + open->op_iattr.ia_valid = 0; + open->op_stateowner = NULL; + + /* seqid, share_access, share_deny, clientid, ownerlen */ + READ_BUF(16 + sizeof(clientid_t)); + READ32(open->op_seqid); + READ32(open->op_share_access); + READ32(open->op_share_deny); + COPYMEM(&open->op_clientid, sizeof(clientid_t)); + READ32(open->op_owner.len); + + /* owner, open_flag */ + READ_BUF(open->op_owner.len + 4); + SAVEMEM(open->op_owner.data, open->op_owner.len); + READ32(open->op_create); + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + READ_BUF(4); + READ32(open->op_createmode); + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl); + if (status) + goto out; + break; + case NFS4_CREATE_EXCLUSIVE: + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl); + if (status) + goto out; + break; + default: + goto xdr_error; + } + break; + default: + goto xdr_error; + } + + /* open_claim */ + READ_BUF(4); + READ32(open->op_claim_type); + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + READ_BUF(4); + READ32(open->op_fname.len); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) + return status; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + READ_BUF(4); + READ32(open->op_delegate_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); + READ32(open->op_fname.len); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) + return status; + break; + default: + goto xdr_error; + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +{ + DECODE_HEAD; + + open_conf->oc_stateowner = NULL; + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + 
READ_BUF(4); + READ32(open_conf->oc_seqid); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +{ + DECODE_HEAD; + + open_down->od_stateowner = NULL; + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); + READ32(open_down->od_seqid); + READ32(open_down->od_share_access); + READ32(open_down->od_share_deny); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(putfh->pf_fhlen); + if (putfh->pf_fhlen > NFS4_FHSIZE) + goto xdr_error; + READ_BUF(putfh->pf_fhlen); + SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); + READ64(read->rd_offset); + READ32(read->rd_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +{ + DECODE_HEAD; + + READ_BUF(24); + READ64(readdir->rd_cookie); + COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); + READ32(readdir->rd_dircount); /* just in case you needed a useless field... */ + READ32(readdir->rd_maxcount); + if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) + goto out; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(remove->rm_namelen); + READ_BUF(remove->rm_namelen); + SAVEMEM(remove->rm_name, remove->rm_namelen); + if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(rename->rn_snamelen); + READ_BUF(rename->rn_snamelen + 4); + SAVEMEM(rename->rn_sname, rename->rn_snamelen); + READ32(rename->rn_tnamelen); + READ_BUF(rename->rn_tnamelen); + SAVEMEM(rename->rn_tname, rename->rn_tnamelen); + if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent))) + return status; + if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(clientid_t)); + COPYMEM(clientid, sizeof(clientid_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo *secinfo) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(secinfo->si_namelen); + READ_BUF(secinfo->si_namelen); + SAVEMEM(secinfo->si_name, secinfo->si_namelen); + status = check_filename(secinfo->si_name, secinfo->si_namelen, + nfserr_noent); + if (status) + return status; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(sin->sin_style); + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +{ + __be32 status; + + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, + 
&setattr->sa_acl); +} + +static __be32 +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +{ + DECODE_HEAD; + + READ_BUF(12); + COPYMEM(setclientid->se_verf.data, 8); + READ32(setclientid->se_namelen); + + READ_BUF(setclientid->se_namelen + 8); + SAVEMEM(setclientid->se_name, setclientid->se_namelen); + READ32(setclientid->se_callback_prog); + READ32(setclientid->se_callback_netid_len); + + READ_BUF(setclientid->se_callback_netid_len + 4); + SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + READ32(setclientid->se_callback_addr_len); + + READ_BUF(setclientid->se_callback_addr_len + 4); + SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + READ32(setclientid->se_callback_ident); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +{ + DECODE_HEAD; + + READ_BUF(8 + sizeof(nfs4_verifier)); + COPYMEM(&scd_c->sc_clientid, 8); + COPYMEM(&scd_c->sc_confirm, sizeof(nfs4_verifier)); + + DECODE_TAIL; +} + +/* Also used for NVERIFY */ +static __be32 +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +{ +#if 0 + struct nfsd4_compoundargs save = { + .p = argp->p, + .end = argp->end, + .rqstp = argp->rqstp, + }; + u32 ve_bmval[2]; + struct iattr ve_iattr; /* request */ + struct nfs4_acl *ve_acl; /* request */ +#endif + DECODE_HEAD; + + if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) + goto out; + + /* For convenience's sake, we compare raw xdr'd attributes in + * nfsd4_proc_verify; however we still decode here just to return + * correct error in case of bad xdr. */ +#if 0 + status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl); + if (status == nfserr_inval) { + status = nfserrno(status); + goto out; + } +#endif + READ_BUF(4); + READ32(verify->ve_attrlen); + READ_BUF(verify->ve_attrlen); + SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +{ + int avail; + int v; + int len; + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); + READ64(write->wr_offset); + READ32(write->wr_stable_how); + if (write->wr_stable_how > 2) + goto xdr_error; + READ32(write->wr_buflen); + + /* Sorry .. no magic macros for this.. 
* + * READ_BUF(write->wr_buflen); + * SAVEMEM(write->wr_buf, write->wr_buflen); + */ + avail = (char*)argp->end - (char*)argp->p; + if (avail + argp->pagelen < write->wr_buflen) { + dprintk("NFSD: xdr error (%s:%d)\n", + __FILE__, __LINE__); + goto xdr_error; + } + argp->rqstp->rq_vec[0].iov_base = p; + argp->rqstp->rq_vec[0].iov_len = avail; + v = 0; + len = write->wr_buflen; + while (len > argp->rqstp->rq_vec[v].iov_len) { + len -= argp->rqstp->rq_vec[v].iov_len; + v++; + argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen >= PAGE_SIZE) { + argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; + argp->pagelen -= PAGE_SIZE; + } else { + argp->rqstp->rq_vec[v].iov_len = argp->pagelen; + argp->pagelen -= len; + } + } + argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); + argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); + argp->rqstp->rq_vec[v].iov_len = len; + write->wr_vlen = v+1; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +{ + DECODE_HEAD; + + READ_BUF(12); + COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); + READ32(rlockowner->rl_owner.len); + READ_BUF(rlockowner->rl_owner.len); + READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + + if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) + return nfserr_inval; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + int dummy, tmp; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + READ_BUF(4); + READ32(exid->clname.len); + + READ_BUF(exid->clname.len); + SAVEMEM(exid->clname.data, exid->clname.len); + + READ_BUF(4); + READ32(exid->flags); + + /* Ignore state_protect4_a */ + READ_BUF(4); + READ32(exid->spa_how); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* spo_must_allow */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_encr_algs<> */ + READ_BUF(4); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_window and ssp_num_gss_handles */ + READ_BUF(8); + READ32(dummy); + READ32(dummy); + break; + default: + goto xdr_error; + } + + /* Ignore Implementation ID */ + READ_BUF(4); /* nfs_impl_id4 array length */ + READ32(dummy); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + /* nii_domain */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_name */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_date */ + READ_BUF(12); + p += 3; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + DECODE_HEAD; + + u32 dummy; + char *machine_name; + int i; + int nr_secflavs; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + READ32(sess->seqid); + 
READ32(sess->flags); + + /* Fore channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->fore_channel.maxreq_sz); + READ32(sess->fore_channel.maxresp_sz); + READ32(sess->fore_channel.maxresp_cached); + READ32(sess->fore_channel.maxops); + READ32(sess->fore_channel.maxreqs); + READ32(sess->fore_channel.nr_rdma_attrs); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->fore_channel.rdma_attrs); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->back_channel.maxreq_sz); + READ32(sess->back_channel.maxresp_sz); + READ32(sess->back_channel.maxresp_cached); + READ32(sess->back_channel.maxops); + READ32(sess->back_channel.maxreqs); + READ32(sess->back_channel.nr_rdma_attrs); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->back_channel.rdma_attrs); + } else if (sess->back_channel.nr_rdma_attrs > 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(8); + READ32(sess->callback_prog); + + /* callback_sec_params4 */ + READ32(nr_secflavs); + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + READ32(dummy); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + READ32(dummy); + + /* machine name */ + READ32(dummy); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + READ32(sess->uid); + READ32(sess->gid); + + /* more gids */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + READ32(dummy); + /* gcbp_handle_from_server */ + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(seq->seqid); + READ32(seq->slotid); + READ32(seq->maxslots); + READ32(seq->cachethis); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(rc->rca_one_fs); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_notsupp; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = 
(nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, +}; + +static nfsd4_dec nfsd41_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] = 
(nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, +}; + +struct nfsd4_minorversion_ops { + nfsd4_dec *decoders; + int nops; +}; + +static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { + [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, +}; + +static __be32 +nfsd4_decode_compound(struct nfsd4_compoundargs *argp) +{ + DECODE_HEAD; + struct nfsd4_op *op; + struct nfsd4_minorversion_ops *ops; + int i; + + /* + * XXX: According to spec, we should check the tag + * for UTF-8 compliance. I'm postponing this for + * now because it seems that some clients do use + * binary tags. + */ + READ_BUF(4); + READ32(argp->taglen); + READ_BUF(argp->taglen + 8); + SAVEMEM(argp->tag, argp->taglen); + READ32(argp->minorversion); + READ32(argp->opcnt); + + if (argp->taglen > NFSD4_MAX_TAGLEN) + goto xdr_error; + if (argp->opcnt > 100) + goto xdr_error; + + if (argp->opcnt > ARRAY_SIZE(argp->iops)) { + argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + if (!argp->ops) { + argp->ops = argp->iops; + dprintk("nfsd: couldn't allocate room for COMPOUND\n"); + goto xdr_error; + } + } + + if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion)) + argp->opcnt = 0; + + ops = &nfsd4_minorversion[argp->minorversion]; + for (i = 0; i < argp->opcnt; i++) { + op = &argp->ops[i]; + op->replay = NULL; + + /* + * We can't use READ_BUF() here because we need to handle + * a missing opcode as an OP_WRITE + 1. So we need to check + * to see if we're truly at the end of our buffer or if there + * is another page we need to flip to. + */ + + if (argp->p == argp->end) { + if (argp->pagelen < 4) { + /* There isn't an opcode still on the wire */ + op->opnum = OP_WRITE + 1; + op->status = nfserr_bad_xdr; + argp->opcnt = i+1; + break; + } + + /* + * False alarm. We just hit a page boundary, but there + * is still data available. Move pointer across page + * boundary. 
*snip from READ_BUF* + */ + argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + (argp->pagelen>>2); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } + } + op->opnum = ntohl(*argp->p++); + + if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) + op->status = ops->decoders[op->opnum](argp, &op->u); + else { + op->opnum = OP_ILLEGAL; + op->status = nfserr_op_illegal; + } + + if (op->status) { + argp->opcnt = i+1; + break; + } + } + + DECODE_TAIL; +} + +#define WRITE32(n) *p++ = htonl(n) +#define WRITE64(n) do { \ + *p++ = htonl((u32)((n) >> 32)); \ + *p++ = htonl((u32)(n)); \ +} while (0) +#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ + *(p + XDR_QUADLEN(nbytes) -1) = 0; \ + memcpy(p, ptr, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +}} while (0) + +static void write32(__be32 **p, u32 n) +{ + *(*p)++ = n; +} + +static void write64(__be32 **p, u64 n) +{ + write32(p, (u32)(n >> 32)); + write32(p, (u32)n); +} + +static void write_change(__be32 **p, struct kstat *stat, struct inode *inode) +{ + if (IS_I_VERSION(inode)) { + write64(p, inode->i_version); + } else { + write32(p, stat->ctime.tv_sec); + write32(p, stat->ctime.tv_nsec); + } +} + +static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) +{ + write32(p, c->atomic); + if (c->change_supported) { + write64(p, c->before_change); + write64(p, c->after_change); + } else { + write32(p, c->before_ctime_sec); + write32(p, c->before_ctime_nsec); + write32(p, c->after_ctime_sec); + write32(p, c->after_ctime_nsec); + } +} + +#define RESERVE_SPACE(nbytes) do { \ + p = resp->p; \ + BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \ +} while (0) +#define ADJUST_ARGS() resp->p = p + +/* + * Header routine to setup seqid operation replay cache + */ +#define ENCODE_SEQID_OP_HEAD \ + __be32 *save; \ + \ + save = resp->p; + +static bool seqid_mutating_err(__be32 err) +{ + /* rfc 3530 section 8.1.5: */ + return err != nfserr_stale_clientid && + err != nfserr_stale_stateid && + err != nfserr_bad_stateid && + err != nfserr_bad_seqid && + err != nfserr_bad_xdr && + err != nfserr_resource && + err != nfserr_nofilehandle; +} + +/* + * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This + * is where sequence id's are incremented, and the replay cache is filled. + * Note that we increment sequence id's here, at the last moment, so we're sure + * we know whether the error to be returned is a sequence id mutating error. + */ + +#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ + if (seqid_mutating_err(nfserr) && stateowner) { \ + stateowner->so_seqid++; \ + stateowner->so_replay.rp_status = nfserr; \ + stateowner->so_replay.rp_buflen = \ + (((char *)(resp)->p - (char *)save)); \ + memcpy(stateowner->so_replay.rp_buf, save, \ + stateowner->so_replay.rp_buflen); \ + } } while (0); + +/* Encode as an array of strings the string given with components + * separated @sep. 
+ */ +static __be32 nfsd4_encode_components(char sep, char *components, + __be32 **pp, int *buflen) +{ + __be32 *p = *pp; + __be32 *countp = p; + int strlen, count=0; + char *str, *end; + + dprintk("nfsd4_encode_components(%s)\n", components); + if ((*buflen -= 4) < 0) + return nfserr_resource; + WRITE32(0); /* We will fill this in with @count later */ + end = str = components; + while (*end) { + for (; *end && (*end != sep); end++) + ; /* Point to end of component */ + strlen = end - str; + if (strlen) { + if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) + return nfserr_resource; + WRITE32(strlen); + WRITEMEM(str, strlen); + count++; + } + else + end++; + str = end; + } + *pp = p; + p = countp; + WRITE32(count); + return 0; +} + +/* + * encode a location element of a fs_locations structure + */ +static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, + __be32 **pp, int *buflen) +{ + __be32 status; + __be32 *p = *pp; + + status = nfsd4_encode_components(':', location->hosts, &p, buflen); + if (status) + return status; + status = nfsd4_encode_components('/', location->path, &p, buflen); + if (status) + return status; + *pp = p; + return 0; +} + +/* + * Return the path to an export point in the pseudo filesystem namespace + * Returned string is safe to use as long as the caller holds a reference + * to @exp. + */ +static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +{ + struct svc_fh tmp_fh; + char *path = NULL, *rootpath; + size_t rootlen; + + fh_init(&tmp_fh, NFS4_FHSIZE); + *stat = exp_pseudoroot(rqstp, &tmp_fh); + if (*stat) + return NULL; + rootpath = tmp_fh.fh_export->ex_pathname; + + path = exp->ex_pathname; + + rootlen = strlen(rootpath); + if (strncmp(path, rootpath, rootlen)) { + dprintk("nfsd: fs_locations failed;" + "%s is not contained in %s\n", path, rootpath); + *stat = nfserr_notsupp; + path = NULL; + goto out; + } + path += rootlen; +out: + fh_put(&tmp_fh); + return path; +} + +/* + * encode a fs_locations structure + */ +static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, + struct svc_export *exp, + __be32 **pp, int *buflen) +{ + __be32 status; + int i; + __be32 *p = *pp; + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; + char *root = nfsd4_path(rqstp, exp, &status); + + if (status) + return status; + status = nfsd4_encode_components('/', root, &p, buflen); + if (status) + return status; + if ((*buflen -= 4) < 0) + return nfserr_resource; + WRITE32(fslocs->locations_count); + for (i=0; i<fslocs->locations_count; i++) { + status = nfsd4_encode_fs_location4(&fslocs->locations[i], + &p, buflen); + if (status) + return status; + } + *pp = p; + return 0; +} + +static u32 nfs4_ftypes[16] = { + NF4BAD, NF4FIFO, NF4CHR, NF4BAD, + NF4DIR, NF4BAD, NF4BLK, NF4BAD, + NF4REG, NF4BAD, NF4LNK, NF4BAD, + NF4SOCK, NF4BAD, NF4LNK, NF4BAD, +}; + +static __be32 +nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, + __be32 **p, int *buflen) +{ + int status; + + if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) + return nfserr_resource; + if (whotype != NFS4_ACL_WHO_NAMED) + status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); + else if (group) + status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1)); + else + status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1)); + if (status < 0) + return nfserrno(status); + *p = xdr_encode_opaque(*p, NULL, status); + *buflen -= (XDR_QUADLEN(status) << 2) + 4; + BUG_ON(*buflen < 0); + return 0; +} + +static inline __be32 +nfsd4_encode_user(struct svc_rqst
*rqstp, uid_t uid, __be32 **p, int *buflen) +{ + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); +} + +static inline __be32 +nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen) +{ + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); +} + +static inline __be32 +nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, + __be32 **p, int *buflen) +{ + return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); +} + +#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ + FATTR4_WORD0_RDATTR_ERROR) +#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID + +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +{ + /* As per referral draft: */ + if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || + *bmval1 & ~WORD1_ABSENT_FS_ATTRS) { + if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR || + *bmval0 & FATTR4_WORD0_FS_LOCATIONS) + *rdattr_err = NFSERR_MOVED; + else + return nfserr_moved; + } + *bmval0 &= WORD0_ABSENT_FS_ATTRS; + *bmval1 &= WORD1_ABSENT_FS_ATTRS; + return 0; +} + +/* + * Note: @fhp can be NULL; in this case, we might have to compose the filehandle + * ourselves. + * + * @countp is the buffer size in _words_; upon successful return this becomes + * replaced with the number of words written. + */ +__be32 +nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + u32 bmval0 = bmval[0]; + u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; + struct kstat stat; + struct svc_fh tempfh; + struct kstatfs statfs; + int buflen = *countp << 2; + __be32 *attrlenp; + u32 dummy; + u64 dummy64; + u32 rdattr_err = 0; + __be32 *p = buffer; + __be32 status; + int err; + int aclsupport = 0; + struct nfs4_acl *acl = NULL; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + struct path path = { + .mnt = exp->ex_path.mnt, + .dentry = dentry, + }; + + BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); + + if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + if (status) + goto out; + } + + err = vfs_getattr(exp->ex_path.mnt, dentry, &stat); + if (err) + goto out_nfserr; + if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | + FATTR4_WORD0_MAXNAME)) || + (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + FATTR4_WORD1_SPACE_TOTAL))) { + err = vfs_statfs(&path, &statfs); + if (err) + goto out_nfserr; + } + if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + fh_init(&tempfh, NFS4_FHSIZE); + status = fh_compose(&tempfh, exp, dentry, NULL); + if (status) + goto out; + fhp = &tempfh; + } + if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT + | FATTR4_WORD0_SUPPORTED_ATTRS)) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + aclsupport = (err == 0); + if (bmval0 & FATTR4_WORD0_ACL) { + if (err == -EOPNOTSUPP) + bmval0 &= ~FATTR4_WORD0_ACL; + else if (err == -EINVAL) { + status = nfserr_attrnotsupp; + goto out; + } else if (err != 0) + goto out_nfserr; + } + } + + if (bmval2) { + if ((buflen -= 16) < 0) + goto out_resource; + WRITE32(3); + WRITE32(bmval0); + WRITE32(bmval1); + WRITE32(bmval2); + } else if (bmval1) { + if ((buflen -= 12) < 0) + goto 
out_resource; + WRITE32(2); + WRITE32(bmval0); + WRITE32(bmval1); + } else { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE32(1); + WRITE32(bmval0); + } + attrlenp = p++; /* to be backfilled later */ + + if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + + if (!aclsupport) + word0 &= ~FATTR4_WORD0_ACL; + if (!word2) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(2); + WRITE32(word0); + WRITE32(word1); + } else { + if ((buflen -= 16) < 0) + goto out_resource; + WRITE32(3); + WRITE32(word0); + WRITE32(word1); + WRITE32(word2); + } + } + if (bmval0 & FATTR4_WORD0_TYPE) { + if ((buflen -= 4) < 0) + goto out_resource; + dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + if (dummy == NF4BAD) + goto out_serverfault; + WRITE32(dummy); + } + if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { + if ((buflen -= 4) < 0) + goto out_resource; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + WRITE32(NFS4_FH_PERSISTENT); + else + WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); + } + if (bmval0 & FATTR4_WORD0_CHANGE) { + if ((buflen -= 8) < 0) + goto out_resource; + write_change(&p, &stat, dentry->d_inode); + } + if (bmval0 & FATTR4_WORD0_SIZE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64(stat.size); + } + if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(0); + } + if (bmval0 & FATTR4_WORD0_FSID) { + if ((buflen -= 16) < 0) + goto out_resource; + if (exp->ex_fslocs.migrated) { + WRITE64(NFS4_REFERRAL_FSID_MAJOR); + WRITE64(NFS4_REFERRAL_FSID_MINOR); + } else switch(fsid_source(fhp)) { + case FSIDSOURCE_FSID: + WRITE64((u64)exp->ex_fsid); + WRITE64((u64)0); + break; + case FSIDSOURCE_DEV: + WRITE32(0); + WRITE32(MAJOR(stat.dev)); + WRITE32(0); + WRITE32(MINOR(stat.dev)); + break; + case FSIDSOURCE_UUID: + WRITEMEM(exp->ex_uuid, 16); + break; + } + } + if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(0); + } + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(rdattr_err); + } + if (bmval0 & FATTR4_WORD0_ACL) { + struct nfs4_ace *ace; + + if (acl == NULL) { + if ((buflen -= 4) < 0) + goto out_resource; + + WRITE32(0); + goto out_acl; + } + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(acl->naces); + + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + if ((buflen -= 4*3) < 0) + goto out_resource; + WRITE32(ace->type); + WRITE32(ace->flag); + WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(rqstp, ace->whotype, + ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP, + &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + } +out_acl: + if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(aclsupport ? 
+ ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); + } + if (bmval0 & FATTR4_WORD0_CANSETTIME) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_FILEHANDLE) { + buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4; + if (buflen < 0) + goto out_resource; + WRITE32(fhp->fh_handle.fh_size); + WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size); + } + if (bmval0 & FATTR4_WORD0_FILEID) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64(stat.ino); + } + if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_FREE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) statfs.f_files); + } + if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { + status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64(~(u64)0); + } + if (bmval0 & FATTR4_WORD0_MAXLINK) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(255); + } + if (bmval0 & FATTR4_WORD0_MAXNAME) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(statfs.f_namelen); + } + if (bmval0 & FATTR4_WORD0_MAXREAD) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) svc_max_payload(rqstp)); + } + if (bmval0 & FATTR4_WORD0_MAXWRITE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) svc_max_payload(rqstp)); + } + if (bmval1 & FATTR4_WORD1_MODE) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(stat.mode & S_IALLUGO); + } + if (bmval1 & FATTR4_WORD1_NO_TRUNC) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval1 & FATTR4_WORD1_NUMLINKS) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(stat.nlink); + } + if (bmval1 & FATTR4_WORD1_OWNER) { + status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { + status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_RAWDEV) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE32((u32) MAJOR(stat.rdev)); + WRITE32((u32) MINOR(stat.rdev)); + } + if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { + if ((buflen -= 8) < 0) + goto out_resource; + dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_FREE) { + if ((buflen -= 8) < 0) + goto out_resource; + dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { + if ((buflen -= 8) < 0) + goto out_resource; + dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_USED) { + if ((buflen -= 8) < 
0) + goto out_resource; + dummy64 = (u64)stat.blocks << 9; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(stat.atime.tv_sec); + WRITE32(stat.atime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_DELTA) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(1); + WRITE32(0); + } + if (bmval1 & FATTR4_WORD1_TIME_METADATA) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(stat.ctime.tv_sec); + WRITE32(stat.ctime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(stat.mtime.tv_sec); + WRITE32(stat.mtime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + if ((buflen -= 8) < 0) + goto out_resource; + /* + * Get parent's attributes if not ignoring crossmount + * and this is the root of a cross-mounted filesystem. + */ + if (ignore_crossmnt == 0 && + dentry == exp->ex_path.mnt->mnt_root) { + struct path path = exp->ex_path; + path_get(&path); + while (follow_up(&path)) { + if (path.dentry != path.mnt->mnt_root) + break; + } + err = vfs_getattr(path.mnt, path.dentry, &stat); + path_put(&path); + if (err) + goto out_nfserr; + } + WRITE64(stat.ino); + } + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } + + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); + *countp = p - buffer; + status = nfs_ok; + +out: + kfree(acl); + if (fhp == &tempfh) + fh_put(&tempfh); + return status; +out_nfserr: + status = nfserrno(err); + goto out; +out_resource: + *countp = 0; + status = nfserr_resource; + goto out; +out_serverfault: + status = nfserr_serverfault; + goto out; +} + +static inline int attributes_need_mount(u32 *bmval) +{ + if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) + return 1; + if (bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) + return 1; + return 0; +} + +static __be32 +nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, + const char *name, int namlen, __be32 *p, int *buflen) +{ + struct svc_export *exp = cd->rd_fhp->fh_export; + struct dentry *dentry; + __be32 nfserr; + int ignore_crossmnt = 0; + + dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); + if (IS_ERR(dentry)) + return nfserrno(PTR_ERR(dentry)); + if (!dentry->d_inode) { + /* + * nfsd_buffered_readdir drops the i_mutex between + * readdir and calling this callback, leaving a window + * where this directory entry could have gone away. + */ + dput(dentry); + return nfserr_noent; + } + + exp_get(exp); + /* + * In the case of a mountpoint, the client may be asking for + * attributes that are only properties of the underlying filesystem + * as opposed to the cross-mounted file system. In such a case, + * we will not follow the cross mount and will fill the attribtutes + * directly from the mountpoint dentry. + */ + if (nfsd_mountpoint(dentry, exp)) { + int err; + + if (!(exp->ex_flags & NFSEXP_V4ROOT) + && !attributes_need_mount(cd->rd_bmval)) { + ignore_crossmnt = 1; + goto out_encode; + } + /* + * Why the heck aren't we just using nfsd_lookup?? + * Different "."/".." handling? Something else? + * At least, add a comment here to explain.... 
+ */ + err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); + if (err) { + nfserr = nfserrno(err); + goto out_put; + } + nfserr = check_nfsd_access(exp, cd->rd_rqstp); + if (nfserr) + goto out_put; + + } +out_encode: + nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, + cd->rd_rqstp, ignore_crossmnt); +out_put: + dput(dentry); + exp_put(exp); + return nfserr; +} + +static __be32 * +nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) +{ + __be32 *attrlenp; + + if (buflen < 6) + return NULL; + *p++ = htonl(2); + *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ + *p++ = htonl(0); /* bmval1 */ + + attrlenp = p++; + *p++ = nfserr; /* no htonl */ + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); + return p; +} + +static int +nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); + int buflen; + __be32 *p = cd->buffer; + __be32 *cookiep; + __be32 nfserr = nfserr_toosmall; + + /* In nfsv4, "." and ".." never make it onto the wire.. */ + if (name && isdotent(name, namlen)) { + cd->common.err = nfs_ok; + return 0; + } + + if (cd->offset) + xdr_encode_hyper(cd->offset, (u64) offset); + + buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); + if (buflen < 0) + goto fail; + + *p++ = xdr_one; /* mark entry present */ + cookiep = p; + p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_array(p, name, namlen); /* name length & name */ + + nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); + switch (nfserr) { + case nfs_ok: + p += buflen; + break; + case nfserr_resource: + nfserr = nfserr_toosmall; + goto fail; + case nfserr_noent: + goto skip_entry; + default: + /* + * If the client requested the RDATTR_ERROR attribute, + * we stuff the error code into this attribute + * and continue. If this attribute was not requested, + * then in accordance with the spec, we fail the + * entire READDIR operation(!) + */ + if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) + goto fail; + p = nfsd4_encode_rdattr_error(p, buflen, nfserr); + if (p == NULL) { + nfserr = nfserr_toosmall; + goto fail; + } + } + cd->buflen -= (p - cd->buffer); + cd->buffer = p; + cd->offset = cookiep; +skip_entry: + cd->common.err = nfs_ok; + return 0; +fail: + cd->common.err = nfserr; + return -EINVAL; +} + +static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + __be32 *p; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + +static __be32 +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(8); + WRITE32(access->ac_supported); + WRITE32(access->ac_resp_access); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); + WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(bcts->dir); + /* XXX: ? 
*/ + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + return nfserr; +} + + +static __be32 +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(32); + write_cinfo(&p, &create->cr_cinfo); + WRITE32(2); + WRITE32(create->cr_bmval[0]); + WRITE32(create->cr_bmval[1]); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +{ + struct svc_fh *fhp = getattr->ga_fhp; + int buflen; + + if (nfserr) + return nfserr; + + buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); + nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, + resp->p, &buflen, getattr->ga_bmval, + resp->rqstp, 0); + if (!nfserr) + resp->p += buflen; + return nfserr; +} + +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +{ + struct svc_fh *fhp = *fhpp; + unsigned int len; + __be32 *p; + + if (!nfserr) { + len = fhp->fh_handle.fh_size; + RESERVE_SPACE(len + 4); + WRITE32(len); + WRITEMEM(&fhp->fh_handle.fh_base, len); + ADJUST_ARGS(); + } + return nfserr; +} + +/* +* Including all fields other than the name, a LOCK4denied structure requires +* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. +*/ +static void +nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) +{ + __be32 *p; + + RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? 
ld->ld_sop->so_owner.len : 0)); + WRITE64(ld->ld_start); + WRITE64(ld->ld_length); + WRITE32(ld->ld_type); + if (ld->ld_sop) { + WRITEMEM(&ld->ld_clientid, 8); + WRITE32(ld->ld_sop->so_owner.len); + WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); + kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ + WRITE64((u64)0); /* clientid */ + WRITE32(0); /* length of owner name */ + } + ADJUST_ARGS(); +} + +static __be32 +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) + nfsd4_encode_lock_denied(resp, &lock->lk_denied); + + ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + return nfserr; +} + +static __be32 +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +{ + if (nfserr == nfserr_denied) + nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + return nfserr; +} + +static __be32 +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + return nfserr; +} + + +static __be32 +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(20); + write_cinfo(&p, &link->li_cinfo); + ADJUST_ARGS(); + } + return nfserr; +} + + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +{ + __be32 *p; + ENCODE_SEQID_OP_HEAD; + + if (nfserr) + goto out; + + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); + write_cinfo(&p, &open->op_cinfo); + WRITE32(open->op_rflags); + WRITE32(2); + WRITE32(open->op_bmval[0]); + WRITE32(open->op_bmval[1]); + WRITE32(open->op_delegate_type); + ADJUST_ARGS(); + + switch (open->op_delegate_type) { + case NFS4_OPEN_DELEGATE_NONE: + break; + case NFS4_OPEN_DELEGATE_READ: + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); + WRITE32(open->op_recall); + + /* + * TODO: ACE's in delegations + */ + WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + WRITE32(0); + WRITE32(0); + WRITE32(0); /* XXX: is NULL principal ok? */ + ADJUST_ARGS(); + break; + case NFS4_OPEN_DELEGATE_WRITE: + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); + WRITE32(0); + + /* + * TODO: space_limit's in delegations + */ + WRITE32(NFS4_LIMIT_SIZE); + WRITE32(~(u32)0); + WRITE32(~(u32)0); + + /* + * TODO: ACE's in delegations + */ + WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + WRITE32(0); + WRITE32(0); + WRITE32(0); /* XXX: is NULL principal ok? 
*/ + ADJUST_ARGS(); + break; + default: + BUG(); + } + /* XXX save filehandle here */ +out: + ENCODE_SEQID_OP_TAIL(open->op_stateowner); + return nfserr; +} + +static __be32 +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); + + ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + return nfserr; +} + +static __be32 +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); + + ENCODE_SEQID_OP_TAIL(od->od_stateowner); + return nfserr; +} + +static __be32 +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) +{ + u32 eof; + int v, pn; + unsigned long maxcount; + long len; + __be32 *p; + + if (nfserr) + return nfserr; + if (resp->xbuf->page_len) + return nfserr_resource; + + RESERVE_SPACE(8); /* eof flag and byte count */ + + maxcount = svc_max_payload(resp->rqstp); + if (maxcount > read->rd_length) + maxcount = read->rd_length; + + len = maxcount; + v = 0; + while (len > 0) { + pn = resp->rqstp->rq_resused++; + resp->rqstp->rq_vec[v].iov_base = + page_address(resp->rqstp->rq_respages[pn]); + resp->rqstp->rq_vec[v].iov_len = + len < PAGE_SIZE ? len : PAGE_SIZE; + v++; + len -= PAGE_SIZE; + } + read->rd_vlen = v; + + nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; + if (nfserr) + return nfserr; + eof = (read->rd_offset + maxcount >= + read->rd_fhp->fh_dentry->d_inode->i_size); + + WRITE32(eof); + WRITE32(maxcount); + ADJUST_ARGS(); + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; + + /* Use rest of head for padding and remaining ops: */ + resp->xbuf->tail[0].iov_base = p; + resp->xbuf->tail[0].iov_len = 0; + if (maxcount&3) { + RESERVE_SPACE(4); + WRITE32(0); + resp->xbuf->tail[0].iov_base += maxcount&3; + resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + char *page; + __be32 *p; + + if (nfserr) + return nfserr; + if (resp->xbuf->page_len) + return nfserr_resource; + + page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); + + maxcount = PAGE_SIZE; + RESERVE_SPACE(4); + + /* + * XXX: By default, the ->readlink() VFS op will truncate symlinks + * if they would overflow the buffer. Is this kosher in NFSv4? If + * not, one easy fix is: if ->readlink() precisely fills the buffer, + * assume that truncation occurred, and return NFS4ERR_RESOURCE. 
+ */ + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount); + if (nfserr == nfserr_isdir) + return nfserr_inval; + if (nfserr) + return nfserr; + + WRITE32(maxcount); + ADJUST_ARGS(); + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; + + /* Use rest of head for padding and remaining ops: */ + resp->xbuf->tail[0].iov_base = p; + resp->xbuf->tail[0].iov_len = 0; + if (maxcount&3) { + RESERVE_SPACE(4); + WRITE32(0); + resp->xbuf->tail[0].iov_base += maxcount&3; + resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +{ + int maxcount; + loff_t offset; + __be32 *page, *savep, *tailbase; + __be32 *p; + + if (nfserr) + return nfserr; + if (resp->xbuf->page_len) + return nfserr_resource; + + RESERVE_SPACE(8); /* verifier */ + savep = p; + + /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ + WRITE32(0); + WRITE32(0); + ADJUST_ARGS(); + resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; + + maxcount = PAGE_SIZE; + if (maxcount > readdir->rd_maxcount) + maxcount = readdir->rd_maxcount; + + /* + * Convert from bytes to words, account for the two words already + * written, make sure to leave two words at the end for the next + * pointer and eof field. + */ + maxcount = (maxcount >> 2) - 4; + if (maxcount < 0) { + nfserr = nfserr_toosmall; + goto err_no_verf; + } + + page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); + readdir->common.err = 0; + readdir->buflen = maxcount; + readdir->buffer = page; + readdir->offset = NULL; + + offset = readdir->rd_cookie; + nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, + &offset, + &readdir->common, nfsd4_encode_dirent); + if (nfserr == nfs_ok && + readdir->common.err == nfserr_toosmall && + readdir->buffer == page) + nfserr = nfserr_toosmall; + if (nfserr == nfserr_symlink) + nfserr = nfserr_notdir; + if (nfserr) + goto err_no_verf; + + if (readdir->offset) + xdr_encode_hyper(readdir->offset, offset); + + p = readdir->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl(readdir->common.err == nfserr_eof); + resp->xbuf->page_len = ((char*)p) - (char*)page_address( + resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); + + /* Use rest of head for padding and remaining ops: */ + resp->xbuf->tail[0].iov_base = tailbase; + resp->xbuf->tail[0].iov_len = 0; + resp->p = resp->xbuf->tail[0].iov_base; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; + + return 0; +err_no_verf: + p = savep; + ADJUST_ARGS(); + return nfserr; +} + +static __be32 +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(20); + write_cinfo(&p, &remove->rm_cinfo); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(40); + write_cinfo(&p, &rename->rn_sinfo); + write_cinfo(&p, &rename->rn_tinfo); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, + __be32 nfserr,struct svc_export *exp) +{ + int i = 0; + u32 nflavs; + struct exp_flavor_info *flavs; + struct exp_flavor_info def_flavs[2]; + __be32 *p; + + if (nfserr) + goto out; + if 
(exp->ex_nflavors) { + flavs = exp->ex_flavors; + nflavs = exp->ex_nflavors; + } else { /* Handling of some defaults in absence of real secinfo: */ + flavs = def_flavs; + if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) { + nflavs = 2; + flavs[0].pseudoflavor = RPC_AUTH_UNIX; + flavs[1].pseudoflavor = RPC_AUTH_NULL; + } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) { + nflavs = 1; + flavs[0].pseudoflavor + = svcauth_gss_flavor(exp->ex_client); + } else { + nflavs = 1; + flavs[0].pseudoflavor + = exp->ex_client->flavour->flavour; + } + } + + RESERVE_SPACE(4); + WRITE32(nflavs); + ADJUST_ARGS(); + for (i = 0; i < nflavs; i++) { + u32 flav = flavs[i].pseudoflavor; + struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + + if (gm) { + RESERVE_SPACE(4); + WRITE32(RPC_AUTH_GSS); + ADJUST_ARGS(); + RESERVE_SPACE(4 + gm->gm_oid.len); + WRITE32(gm->gm_oid.len); + WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); + ADJUST_ARGS(); + RESERVE_SPACE(4); + WRITE32(0); /* qop */ + ADJUST_ARGS(); + RESERVE_SPACE(4); + WRITE32(gss_pseudoflavor_to_service(gm, flav)); + ADJUST_ARGS(); + gss_mech_put(gm); + } else { + RESERVE_SPACE(4); + WRITE32(flav); + ADJUST_ARGS(); + } + } +out: + if (exp) + exp_put(exp); + return nfserr; +} + +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); +} + +/* + * The SETATTR encode routine is special -- it always encodes a bitmap, + * regardless of the error status. + */ +static __be32 +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +{ + __be32 *p; + + RESERVE_SPACE(12); + if (nfserr) { + WRITE32(2); + WRITE32(0); + WRITE32(0); + } + else { + WRITE32(2); + WRITE32(setattr->sa_bmval[0]); + WRITE32(setattr->sa_bmval[1]); + } + ADJUST_ARGS(); + return nfserr; +} + +static __be32 +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(8 + sizeof(nfs4_verifier)); + WRITEMEM(&scd->se_clientid, 8); + WRITEMEM(&scd->se_confirm, sizeof(nfs4_verifier)); + ADJUST_ARGS(); + } + else if (nfserr == nfserr_clid_inuse) { + RESERVE_SPACE(8); + WRITE32(0); + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(16); + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_exchange_id *exid) +{ + __be32 *p; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + RESERVE_SPACE( + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how (SP4_NONE) */ + + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + 
(XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + + WRITEMEM(&exid->clientid, 8); + WRITE32(exid->seqid); + WRITE32(exid->flags); + + /* state_protect4_r. Currently only support SP4_NONE */ + BUG_ON(exid->spa_how != SP4_NONE); + WRITE32(exid->spa_how); + + /* The server_owner struct */ + WRITE64(minor_id); /* Minor id */ + /* major id */ + WRITE32(major_id_sz); + WRITEMEM(major_id, major_id_sz); + + /* Server scope */ + WRITE32(server_scope_sz); + WRITEMEM(server_scope, server_scope_sz); + + /* Implementation id */ + WRITE32(0); /* zero length nfs_impl_id4 array */ + ADJUST_ARGS(); + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_create_session *sess) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(24); + WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(sess->seqid); + WRITE32(sess->flags); + ADJUST_ARGS(); + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->fore_channel.maxreq_sz); + WRITE32(sess->fore_channel.maxresp_sz); + WRITE32(sess->fore_channel.maxresp_cached); + WRITE32(sess->fore_channel.maxops); + WRITE32(sess->fore_channel.maxreqs); + WRITE32(sess->fore_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->fore_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->fore_channel.rdma_attrs); + ADJUST_ARGS(); + } + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->back_channel.maxreq_sz); + WRITE32(sess->back_channel.maxresp_sz); + WRITE32(sess->back_channel.maxresp_cached); + WRITE32(sess->back_channel.maxops); + WRITE32(sess->back_channel.maxreqs); + WRITE32(sess->back_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->back_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->back_channel.rdma_attrs); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_destroy_session *destroy_session) +{ + return nfserr; +} + +static __be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_sequence *seq) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); + WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(seq->seqid); + WRITE32(seq->slotid); + WRITE32(seq->maxslots); + /* For now: target_maxslots = maxslots */ + WRITE32(seq->maxslots); + WRITE32(seq->status_flags); + + ADJUST_ARGS(); + resp->cstate.datap = p; /* DRC cache data pointer */ + return 0; +} + +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. 
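+ *
+ * (Illustrative note, not part of the original patch: each entry is
+ * invoked by opcode from nfsd4_encode_operation() below, e.g.
+ * nfsd4_enc_ops[OP_READDIR](resp, op->status, &op->u); ops that encode
+ * nothing beyond their status simply map to nfsd4_encode_noop.)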
+ */ +static nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, +}; + +/* + * Calculate the total amount of memory that the compound response has taken + * after encoding the current operation. + * + * pad: add on 8 bytes for the next operation's op_code and status so that + * there is room to cache a failure on the next operation. + * + * Compare this length to the session se_fmaxresp_cached. + * + * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so + * will be at least a page and will therefore hold the xdr_buf head. 
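+ *
+ * (Worked example, illustrative only: for a mid-compound reply with no
+ * page data, 200 bytes already encoded in the head and pad = 8, the
+ * length computed below is 200 + 8 = 208 bytes; once that exceeds
+ * se_fchannel.maxresp_cached the check returns
+ * nfserr_rep_too_big_to_cache.)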
+ */ +static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +{ + int status = 0; + struct xdr_buf *xb = &resp->rqstp->rq_res; + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + struct nfsd4_session *session = NULL; + struct nfsd4_slot *slot = resp->cstate.slot; + u32 length, tlen = 0, pad = 8; + + if (!nfsd4_has_session(&resp->cstate)) + return status; + + session = resp->cstate.session; + if (session == NULL || slot->sl_cachethis == 0) + return status; + + if (resp->opcnt >= args->opcnt) + pad = 0; /* this is the last operation */ + + if (xb->page_len == 0) { + length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; + } else { + if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) + tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; + + length = xb->head[0].iov_len + xb->page_len + tlen + pad; + } + dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, + length, xb->page_len, tlen, pad); + + if (length <= session->se_fchannel.maxresp_cached) + return status; + else + return nfserr_rep_too_big_to_cache; +} + +void +nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +{ + __be32 *statp; + __be32 *p; + + RESERVE_SPACE(8); + WRITE32(op->opnum); + statp = p++; /* to be backfilled at the end */ + ADJUST_ARGS(); + + if (op->opnum == OP_ILLEGAL) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + /* nfsd4_check_drc_limit guarantees enough room for error status */ + if (!op->status && nfsd4_check_drc_limit(resp)) + op->status = nfserr_rep_too_big_to_cache; +status: + /* + * Note: We write the status directly, instead of using WRITE32(), + * since it is already in network byte order. + */ + *statp = op->status; +} + +/* + * Encode the reply stored in the stateowner reply cache + * + * XDR note: do not encode rp->rp_buflen: the buffer contains the + * previously sent already encoded operation. + * + * called with nfs4_lock_state() held + */ +void +nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +{ + __be32 *p; + struct nfs4_replay *rp = op->replay; + + BUG_ON(!rp); + + RESERVE_SPACE(8); + WRITE32(op->opnum); + *p++ = rp->rp_status; /* already xdr'ed */ + ADJUST_ARGS(); + + RESERVE_SPACE(rp->rp_buflen); + WRITEMEM(rp->rp_buf, rp->rp_buflen); + ADJUST_ARGS(); +} + +int +nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) +{ + if (args->ops != args->iops) { + kfree(args->ops); + args->ops = args->iops; + } + kfree(args->tmpp); + args->tmpp = NULL; + while (args->to_free) { + struct tmpbuf *tb = args->to_free; + args->to_free = tb->next; + tb->release(tb->buf); + kfree(tb); + } +} + +int +nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) +{ + __be32 status; + + args->p = p; + args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; + args->pagelist = rqstp->rq_arg.pages; + args->pagelen = rqstp->rq_arg.page_len; + args->tmpp = NULL; + args->to_free = NULL; + args->ops = args->iops; + args->rqstp = rqstp; + + status = nfsd4_decode_compound(args); + if (status) { + nfsd4_release_compoundargs(args); + } + return !status; +} + +int +nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp) +{ + /* + * All that remains is to write the tag and operation count... 
+ */ + struct nfsd4_compound_state *cs = &resp->cstate; + struct kvec *iov; + p = resp->tagp; + *p++ = htonl(resp->taglen); + memcpy(p, resp->tag, resp->taglen); + p += XDR_QUADLEN(resp->taglen); + *p++ = htonl(resp->opcnt); + + if (rqstp->rq_res.page_len) + iov = &rqstp->rq_res.tail[0]; + else + iov = &rqstp->rq_res.head[0]; + iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; + BUG_ON(iov->iov_len > PAGE_SIZE); + if (nfsd4_has_session(cs)) { + if (cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + cs->slot->sl_inuse = false; + } + /* Renew the clientid on success and on replay */ + release_session_client(cs->session); + nfsd4_put_session(cs->session); + } + return 1; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c new file mode 100644 index 00000000..4666a209 --- /dev/null +++ b/fs/nfsd/nfscache.c @@ -0,0 +1,322 @@ +/* + * Request reply cache. This is currently a global cache, but this may + * change in the future and be a per-client cache. + * + * This code is heavily inspired by the 44BSD implementation, although + * it does things a bit differently. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include + +#include "nfsd.h" +#include "cache.h" + +/* Size of reply cache. Common values are: + * 4.3BSD: 128 + * 4.4BSD: 256 + * Solaris2: 1024 + * DEC Unix: 512-4096 + */ +#define CACHESIZE 1024 +#define HASHSIZE 64 + +static struct hlist_head * cache_hash; +static struct list_head lru_head; +static int cache_disabled = 1; + +/* + * Calculate the hash index from an XID. + */ +static inline u32 request_hash(u32 xid) +{ + u32 h = xid; + h ^= (xid >> 24); + return h & (HASHSIZE-1); +} + +static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); + +/* + * locking for the reply cache: + * A cache entry is "single use" if c_state == RC_INPROG + * Otherwise, it when accessing _prev or _next, the lock must be held. + */ +static DEFINE_SPINLOCK(cache_lock); + +int nfsd_reply_cache_init(void) +{ + struct svc_cacherep *rp; + int i; + + INIT_LIST_HEAD(&lru_head); + i = CACHESIZE; + while (i) { + rp = kmalloc(sizeof(*rp), GFP_KERNEL); + if (!rp) + goto out_nomem; + list_add(&rp->c_lru, &lru_head); + rp->c_state = RC_UNUSED; + rp->c_type = RC_NOCACHE; + INIT_HLIST_NODE(&rp->c_hash); + i--; + } + + cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + if (!cache_hash) + goto out_nomem; + + cache_disabled = 0; + return 0; +out_nomem: + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); + nfsd_reply_cache_shutdown(); + return -ENOMEM; +} + +void nfsd_reply_cache_shutdown(void) +{ + struct svc_cacherep *rp; + + while (!list_empty(&lru_head)) { + rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); + if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) + kfree(rp->c_replvec.iov_base); + list_del(&rp->c_lru); + kfree(rp); + } + + cache_disabled = 1; + + kfree (cache_hash); + cache_hash = NULL; +} + +/* + * Move cache entry to end of LRU list + */ +static void +lru_put_end(struct svc_cacherep *rp) +{ + list_move_tail(&rp->c_lru, &lru_head); +} + +/* + * Move a cache entry from one hash list to another + */ +static void +hash_refile(struct svc_cacherep *rp) +{ + hlist_del_init(&rp->c_hash); + hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); +} + +/* + * Try to find an entry matching the current call in the cache. 
When none + * is found, we grab the oldest unlocked entry off the LRU list. + * Note that no operation within the loop may sleep. + */ +int +nfsd_cache_lookup(struct svc_rqst *rqstp, int type) +{ + struct hlist_node *hn; + struct hlist_head *rh; + struct svc_cacherep *rp; + __be32 xid = rqstp->rq_xid; + u32 proto = rqstp->rq_prot, + vers = rqstp->rq_vers, + proc = rqstp->rq_proc; + unsigned long age; + int rtn; + + rqstp->rq_cacherep = NULL; + if (cache_disabled || type == RC_NOCACHE) { + nfsdstats.rcnocache++; + return RC_DOIT; + } + + spin_lock(&cache_lock); + rtn = RC_DOIT; + + rh = &cache_hash[request_hash(xid)]; + hlist_for_each_entry(rp, hn, rh, c_hash) { + if (rp->c_state != RC_UNUSED && + xid == rp->c_xid && proc == rp->c_proc && + proto == rp->c_prot && vers == rp->c_vers && + time_before(jiffies, rp->c_timestamp + 120*HZ) && + memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) { + nfsdstats.rchits++; + goto found_entry; + } + } + nfsdstats.rcmisses++; + + /* This loop shouldn't take more than a few iterations normally */ + { + int safe = 0; + list_for_each_entry(rp, &lru_head, c_lru) { + if (rp->c_state != RC_INPROG) + break; + if (safe++ > CACHESIZE) { + printk("nfsd: loop in repcache LRU list\n"); + cache_disabled = 1; + goto out; + } + } + } + + /* All entries on the LRU are in-progress. This should not happen */ + if (&rp->c_lru == &lru_head) { + static int complaints; + + printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); + if (++complaints > 5) { + printk(KERN_WARNING "nfsd: disabling repcache.\n"); + cache_disabled = 1; + } + goto out; + } + + rqstp->rq_cacherep = rp; + rp->c_state = RC_INPROG; + rp->c_xid = xid; + rp->c_proc = proc; + memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr)); + rp->c_prot = proto; + rp->c_vers = vers; + rp->c_timestamp = jiffies; + + hash_refile(rp); + + /* release any buffer */ + if (rp->c_type == RC_REPLBUFF) { + kfree(rp->c_replvec.iov_base); + rp->c_replvec.iov_base = NULL; + } + rp->c_type = RC_NOCACHE; + out: + spin_unlock(&cache_lock); + return rtn; + +found_entry: + /* We found a matching entry which is either in progress or done. */ + age = jiffies - rp->c_timestamp; + rp->c_timestamp = jiffies; + lru_put_end(rp); + + rtn = RC_DROPIT; + /* Request being processed or excessive rexmits */ + if (rp->c_state == RC_INPROG || age < RC_DELAY) + goto out; + + /* From the hall of fame of impractical attacks: + * Is this a user who tries to snoop on the cache? */ + rtn = RC_DOIT; + if (!rqstp->rq_secure && rp->c_secure) + goto out; + + /* Compose RPC reply header */ + switch (rp->c_type) { + case RC_NOCACHE: + break; + case RC_REPLSTAT: + svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + rtn = RC_REPLY; + break; + case RC_REPLBUFF: + if (!nfsd_cache_append(rqstp, &rp->c_replvec)) + goto out; /* should not happen */ + rtn = RC_REPLY; + break; + default: + printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); + rp->c_state = RC_UNUSED; + } + + goto out; +} + +/* + * Update a cache entry. This is called from nfsd_dispatch when + * the procedure has been executed and the complete reply is in + * rqstp->rq_res. + * + * We're copying around data here rather than swapping buffers because + * the toplevel loop requires max-sized buffers, which would be a waste + * of memory for a cache with a max reply size of 100 bytes (diropokres). + * + * If we should start to use different types of cache entries tailored + * specifically for attrstat and fh's, we may save even more space. 
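+ *
+ * (Illustrative note, not from the original patch: RC_REPLSTAT caches
+ * only the single status word, while RC_REPLBUFF copies the reply body
+ * into c_replvec; the "len > (256 >> 2)" check below caps cached
+ * replies at 256 bytes.)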
+ * + * Also note that a cachetype of RC_NOCACHE can legally be passed when + * nfsd failed to encode a reply that otherwise would have been cached. + * In this case, nfsd_cache_update is called with statp == NULL. + */ +void +nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +{ + struct svc_cacherep *rp; + struct kvec *resv = &rqstp->rq_res.head[0], *cachv; + int len; + + if (!(rp = rqstp->rq_cacherep) || cache_disabled) + return; + + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); + len >>= 2; + + /* Don't cache excessive amounts of data and XDR failures */ + if (!statp || len > (256 >> 2)) { + rp->c_state = RC_UNUSED; + return; + } + + switch (cachetype) { + case RC_REPLSTAT: + if (len != 1) + printk("nfsd: RC_REPLSTAT/reply len %d!\n",len); + rp->c_replstat = *statp; + break; + case RC_REPLBUFF: + cachv = &rp->c_replvec; + cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); + if (!cachv->iov_base) { + spin_lock(&cache_lock); + rp->c_state = RC_UNUSED; + spin_unlock(&cache_lock); + return; + } + cachv->iov_len = len << 2; + memcpy(cachv->iov_base, statp, len << 2); + break; + } + spin_lock(&cache_lock); + lru_put_end(rp); + rp->c_secure = rqstp->rq_secure; + rp->c_type = cachetype; + rp->c_state = RC_DONE; + rp->c_timestamp = jiffies; + spin_unlock(&cache_lock); + return; +} + +/* + * Copy cached reply to current reply buffer. Should always fit. + * FIXME as reply is in a page, we should just attach the page, and + * keep a refcount.... + */ +static int +nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) +{ + struct kvec *vec = &rqstp->rq_res.head[0]; + + if (vec->iov_len + data->iov_len > PAGE_SIZE) { + printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n", + data->iov_len); + return 0; + } + memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); + vec->iov_len += data->iov_len; + return 1; +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c new file mode 100644 index 00000000..2b1449dd --- /dev/null +++ b/fs/nfsd/nfsctl.c @@ -0,0 +1,1527 @@ +/* + * Syscall interface to knfsd. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "idmap.h" +#include "nfsd.h" +#include "cache.h" + +/* + * We have a single directory with several nodes in it. + */ +enum { + NFSD_Root = 1, +#ifdef CONFIG_NFSD_DEPRECATED + NFSD_Svc, + NFSD_Add, + NFSD_Del, + NFSD_Export, + NFSD_Unexport, + NFSD_Getfd, + NFSD_Getfs, +#endif + NFSD_List, + NFSD_Export_features, + NFSD_Fh, + NFSD_FO_UnlockIP, + NFSD_FO_UnlockFS, + NFSD_Threads, + NFSD_Pool_Threads, + NFSD_Pool_Stats, + NFSD_Versions, + NFSD_Ports, + NFSD_MaxBlkSize, + NFSD_SupportedEnctypes, + /* + * The below MUST come last. Otherwise we leave a hole in nfsd_files[] + * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops + */ +#ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, + NFSD_Gracetime, + NFSD_RecoveryDir, +#endif +}; + +/* + * write() for these nodes. 
+ */ +#ifdef CONFIG_NFSD_DEPRECATED +static ssize_t write_svc(struct file *file, char *buf, size_t size); +static ssize_t write_add(struct file *file, char *buf, size_t size); +static ssize_t write_del(struct file *file, char *buf, size_t size); +static ssize_t write_export(struct file *file, char *buf, size_t size); +static ssize_t write_unexport(struct file *file, char *buf, size_t size); +static ssize_t write_getfd(struct file *file, char *buf, size_t size); +static ssize_t write_getfs(struct file *file, char *buf, size_t size); +#endif +static ssize_t write_filehandle(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); +static ssize_t write_threads(struct file *file, char *buf, size_t size); +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); +static ssize_t write_versions(struct file *file, char *buf, size_t size); +static ssize_t write_ports(struct file *file, char *buf, size_t size); +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_V4 +static ssize_t write_leasetime(struct file *file, char *buf, size_t size); +static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +#endif + +static ssize_t (*write_op[])(struct file *, char *, size_t) = { +#ifdef CONFIG_NFSD_DEPRECATED + [NFSD_Svc] = write_svc, + [NFSD_Add] = write_add, + [NFSD_Del] = write_del, + [NFSD_Export] = write_export, + [NFSD_Unexport] = write_unexport, + [NFSD_Getfd] = write_getfd, + [NFSD_Getfs] = write_getfs, +#endif + [NFSD_Fh] = write_filehandle, + [NFSD_FO_UnlockIP] = write_unlock_ip, + [NFSD_FO_UnlockFS] = write_unlock_fs, + [NFSD_Threads] = write_threads, + [NFSD_Pool_Threads] = write_pool_threads, + [NFSD_Versions] = write_versions, + [NFSD_Ports] = write_ports, + [NFSD_MaxBlkSize] = write_maxblksize, +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, +#endif +}; + +static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +{ + ino_t ino = file->f_path.dentry->d_inode->i_ino; + char *data; + ssize_t rv; + + if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + rv = write_op[ino](file, data, size); + if (rv >= 0) { + simple_transaction_set(file, rv); + rv = size; + } + return rv; +} + +static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ +#ifdef CONFIG_NFSD_DEPRECATED + static int warned; + if (file->f_dentry->d_name.name[0] == '.' && !warned) { + printk(KERN_INFO + "Warning: \"%s\" uses deprecated NFSD interface: %s." + " This will be removed in 2.6.40\n", + current->comm, file->f_dentry->d_name.name); + warned = 1; + } +#endif + if (! 
file->private_data) { + /* An attempt to read a transaction file without writing + * causes a 0-byte write so that the file can return + * state information + */ + ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos); + if (rv < 0) + return rv; + } + return simple_transaction_read(file, buf, size, pos); +} + +static const struct file_operations transaction_ops = { + .write = nfsctl_transaction_write, + .read = nfsctl_transaction_read, + .release = simple_transaction_release, + .llseek = default_llseek, +}; + +static int exports_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nfs_exports_op); +} + +static const struct file_operations exports_operations = { + .open = exports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) +static int supported_enctypes_show(struct seq_file *m, void *v) +{ + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); + return 0; +} + +static int supported_enctypes_open(struct inode *inode, struct file *file) +{ + return single_open(file, supported_enctypes_show, NULL); +} + +static struct file_operations supported_enctypes_ops = { + .open = supported_enctypes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ + +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); +extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); + +static const struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nfsd_pool_stats_release, + .owner = THIS_MODULE, +}; + +/*----------------------------------------------------------------------------*/ +/* + * payload - write methods + */ + +#ifdef CONFIG_NFSD_DEPRECATED +/** + * write_svc - Start kernel's NFSD server + * + * Deprecated. /proc/fs/nfsd/threads is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_svc + * svc_port: port number of this + * server's listener + * svc_nthreads: number of threads to start + * size: size in bytes of passed in nfsctl_svc + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ +static ssize_t write_svc(struct file *file, char *buf, size_t size) +{ + struct nfsctl_svc *data; + int err; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_svc*) buf; + err = nfsd_svc(data->svc_port, data->svc_nthreads); + if (err < 0) + return err; + return 0; +} + +/** + * write_add - Add or modify client entry in auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: no. 
of items in cl_addrlist + * cl_addrlist: array of client addresses + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ +static ssize_t write_add(struct file *file, char *buf, size_t size) +{ + struct nfsctl_client *data; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_client *)buf; + return exp_addclient(data); +} + +/** + * write_del - Remove client from auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: ignored + * cl_addrlist: ignored + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ +static ssize_t write_del(struct file *file, char *buf, size_t size) +{ + struct nfsctl_client *data; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_client *)buf; + return exp_delclient(data); +} + +/** + * write_export - Export part or all of a local file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client allowed to access + * this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: fsid to use for this export + * ex_ino: ignored + * ex_flags: export flags for this export + * ex_anon_uid: UID to use for anonymous + * requests + * ex_anon_gid: GID to use for anonymous + * requests + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ +static ssize_t write_export(struct file *file, char *buf, size_t size) +{ + struct nfsctl_export *data; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_export*)buf; + return exp_export(data); +} + +/** + * write_unexport - Unexport a previously exported file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. 
+ * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client no longer allowed + * to access this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: ignored + * ex_ino: ignored + * ex_flags: ignored + * ex_anon_uid: ignored + * ex_anon_gid: ignored + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ +static ssize_t write_unexport(struct file *file, char *buf, size_t size) +{ + struct nfsctl_export *data; + + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_export*)buf; + return exp_unexport(data); +} + +/** + * write_getfs - Get a variable-length NFS file handle by path + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fsparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_maxlen: maximum size of returned file + * handle + * size: size in bytes of passed in nfsctl_fsparm + * Output: + * On success: passed-in buffer filled with a knfsd_fh structure + * (a variable-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. + */ +static ssize_t write_getfs(struct file *file, char *buf, size_t size) +{ + struct nfsctl_fsparm *data; + struct sockaddr_in *sin; + struct auth_domain *clp; + int err = 0; + struct knfsd_fh *res; + struct in6_addr in6; + + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_fsparm*)buf; + err = -EPROTONOSUPPORT; + if (data->gd_addr.sa_family != AF_INET) + goto out; + sin = (struct sockaddr_in *)&data->gd_addr; + if (data->gd_maxlen > NFS3_FHSIZE) + data->gd_maxlen = NFS3_FHSIZE; + + res = (struct knfsd_fh*)buf; + + exp_readlock(); + + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); + + clp = auth_unix_lookup(&init_net, &in6); + if (!clp) + err = -EPERM; + else { + err = exp_rootfh(clp, data->gd_path, res, data->gd_maxlen); + auth_domain_put(clp); + } + exp_readunlock(); + if (err == 0) + err = res->fh_size + offsetof(struct knfsd_fh, fh_base); + out: + return err; +} + +/** + * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fdparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_version: fdparm structure version + * size: size in bytes of passed in nfsctl_fdparm + * Output: + * On success: passed-in buffer filled with nfsctl_res + * (a fixed-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. 
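+ *
+ * (Illustrative note: unlike write_getfs() above, which returns a
+ * variable-length handle of up to NFS3_FHSIZE bytes, this interface
+ * always copies the result into a fixed NFS_FHSIZE-byte buffer and
+ * returns NFS_FHSIZE.)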
+ */ +static ssize_t write_getfd(struct file *file, char *buf, size_t size) +{ + struct nfsctl_fdparm *data; + struct sockaddr_in *sin; + struct auth_domain *clp; + int err = 0; + struct knfsd_fh fh; + char *res; + struct in6_addr in6; + + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_fdparm*)buf; + err = -EPROTONOSUPPORT; + if (data->gd_addr.sa_family != AF_INET) + goto out; + err = -EINVAL; + if (data->gd_version < 2 || data->gd_version > NFSSVC_MAXVERS) + goto out; + + res = buf; + sin = (struct sockaddr_in *)&data->gd_addr; + exp_readlock(); + + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); + + clp = auth_unix_lookup(&init_net, &in6); + if (!clp) + err = -EPERM; + else { + err = exp_rootfh(clp, data->gd_path, &fh, NFS_FHSIZE); + auth_domain_put(clp); + } + exp_readunlock(); + + if (err == 0) { + memset(res,0, NFS_FHSIZE); + memcpy(res, &fh.fh_base, fh.fh_size); + err = NFS_FHSIZE; + } + out: + return err; +} +#endif /* CONFIG_NFSD_DEPRECATED */ + +/** + * write_unlock_ip - Release all locks used by a client + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing a + * presentation format IP address + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) +{ + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); + char *fo_path; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + if (rpc_pton(fo_path, size, sap, salen) == 0) + return -EINVAL; + + return nlmsvc_unlock_all_by_ip(sap); +} + +/** + * write_unlock_fs - Release all locks on a local file system + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing the + * absolute pathname of a local file system + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) +{ + struct path path; + char *fo_path; + int error; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + error = kern_path(fo_path, 0, &path); + if (error) + return error; + + /* + * XXX: Needs better sanity checking. Otherwise we could end up + * releasing locks on the wrong file system. + * + * For example: + * 1. Does the path refer to a directory? + * 2. Is that directory a mount point, or + * 3. Is that directory the root of an exported file system? + */ + error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); + + path_put(&path); + return error; +} + +/** + * write_filehandle - Get a variable-length NFS file handle by path + * + * On input, the buffer contains a '\n'-terminated C string comprised of + * three alphanumeric words separated by whitespace. The string may + * contain escape sequences. 
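+ *
+ * (Hypothetical example, not from the original patch: writing
+ * "example.com /export 64\n" requests a file handle of at most 64
+ * bytes for /export as exported to the client domain "example.com";
+ * the reply is the handle rendered as '\n'-terminated hex text.)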
+ * + * Input: + * buf: + * domain: client domain name + * path: export pathname + * maxsize: numeric maximum size of + * @buf + * size: length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a ASCII hex text version + * of the NFS file handle; + * return code is the size in bytes of the string + * On error: return code is negative errno value + */ +static ssize_t write_filehandle(struct file *file, char *buf, size_t size) +{ + char *dname, *path; + int uninitialized_var(maxsize); + char *mesg = buf; + int len; + struct auth_domain *dom; + struct knfsd_fh fh; + + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + dname = mesg; + len = qword_get(&mesg, dname, size); + if (len <= 0) + return -EINVAL; + + path = dname+len+1; + len = qword_get(&mesg, path, size); + if (len <= 0) + return -EINVAL; + + len = get_int(&mesg, &maxsize); + if (len) + return len; + + if (maxsize < NFS_FHSIZE) + return -EINVAL; + if (maxsize > NFS3_FHSIZE) + maxsize = NFS3_FHSIZE; + + if (qword_get(&mesg, mesg, size)>0) + return -EINVAL; + + /* we have all the words, they are in buf.. */ + dom = unix_domain_find(dname); + if (!dom) + return -ENOMEM; + + len = exp_rootfh(dom, path, &fh, maxsize); + auth_domain_put(dom); + if (len) + return len; + + mesg = buf; + len = SIMPLE_TRANSACTION_LIMIT; + qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); + mesg[-1] = '\n'; + return mesg - buf; +} + +/** + * write_threads - Start NFSD, or report the current number of running threads + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the + * number of NFSD threads to start + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_threads(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + int rv; + if (size > 0) { + int newthreads; + rv = get_int(&mesg, &newthreads); + if (rv) + return rv; + if (newthreads < 0) + return -EINVAL; + rv = nfsd_svc(NFS_PORT, newthreads); + if (rv < 0) + return rv; + } else + rv = nfsd_nrthreads(); + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); +} + +/** + * write_pool_threads - Set or report the current number of threads per pool + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated unsigned integer values + * representing the number of NFSD + * threads to start in each pool + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing integer values representing the + * number of NFSD threads in each pool; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) +{ + /* if size > 0, look for an array of number of threads per 
node + * and apply them then write out number of threads per node as reply + */ + char *mesg = buf; + int i; + int rv; + int len; + int npools; + int *nthreads; + + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(); + if (npools == 0) { + /* + * NFS is shut down. The admin can start it by + * writing to the threads file but NOT the pool_threads + * file, sorry. Report zero threads. + */ + mutex_unlock(&nfsd_mutex); + strcpy(buf, "0\n"); + return strlen(buf); + } + + nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; + if (nthreads == NULL) + goto out_free; + + if (size > 0) { + for (i = 0; i < npools; i++) { + rv = get_int(&mesg, &nthreads[i]); + if (rv == -ENOENT) + break; /* fewer numbers than pools */ + if (rv) + goto out_free; /* syntax error */ + rv = -EINVAL; + if (nthreads[i] < 0) + goto out_free; + } + rv = nfsd_set_nrthreads(i, nthreads); + if (rv) + goto out_free; + } + + rv = nfsd_get_nrthreads(npools, nthreads); + if (rv) + goto out_free; + + mesg = buf; + size = SIMPLE_TRANSACTION_LIMIT; + for (i = 0; i < npools && size > 0; i++) { + snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); + len = strlen(mesg); + size -= len; + mesg += len; + } + rv = mesg - buf; +out_free: + kfree(nthreads); + mutex_unlock(&nfsd_mutex); + return rv; +} + +static ssize_t __write_versions(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *vers, *minorp, sign; + int len, num, remaining; + unsigned minor; + ssize_t tlen = 0; + char *sep; + + if (size>0) { + if (nfsd_serv) + /* Cannot change versions without updating + * nfsd_serv->sv_xdrsize, and reallocing + * rq_argp and rq_resp + */ + return -EBUSY; + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + vers = mesg; + len = qword_get(&mesg, vers, size); + if (len <= 0) return -EINVAL; + do { + sign = *vers; + if (sign == '+' || sign == '-') + num = simple_strtol((vers+1), &minorp, 0); + else + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } + switch(num) { + case 2: + case 3: + case 4: + nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); + break; + default: + return -EINVAL; + } + next: + vers += len + 1; + } while ((len = qword_get(&mesg, vers, size)) > 0); + /* If all get turned off, turn them back on, as + * having no versions is BAD + */ + nfsd_reset_versions(); + } + + /* Now write current state into reply buffer */ + len = 0; + sep = ""; + remaining = SIMPLE_TRANSACTION_LIMIT; + for (num=2 ; num <= 4 ; num++) + if (nfsd_vers(num, NFSD_AVAIL)) { + len = snprintf(buf, remaining, "%s%c%d", sep, + nfsd_vers(num, NFSD_TEST)?'+':'-', + num); + sep = " "; + + if (len > remaining) + break; + remaining -= len; + buf += len; + tlen += len; + } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; + minor++) { + len = snprintf(buf, remaining, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? 
+ '+' : '-', + minor); + + if (len > remaining) + break; + remaining -= len; + buf += len; + tlen += len; + } + + len = snprintf(buf, remaining, "\n"); + if (len > remaining) + return -EINVAL; + return tlen + len; +} + +/** + * write_versions - Set or report the available NFS protocol versions + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing positive or negative integer + * values representing the current status of each + * protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") + * size: non-zero length of C string in @buf + * Output: + * On success: status of zero or more protocol versions has + * been updated; passed-in buffer filled with + * '\n'-terminated C string containing positive + * or negative integer values representing the + * current status of each protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/* + * Zero-length write. Return a list of NFSD's current listener + * transports. + */ +static ssize_t __write_ports_names(char *buf) +{ + if (nfsd_serv == NULL) + return 0; + return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); +} + +/* + * A single 'fd' number was written, in which case it must be for + * a socket of a supported family/protocol, and we use it as an + * nfsd listener. + */ +static ssize_t __write_ports_addfd(char *buf) +{ + char *mesg = buf; + int fd, err; + + err = get_int(&mesg, &fd); + if (err != 0 || fd < 0) + return -EINVAL; + + err = nfsd_create_serv(); + if (err != 0) + return err; + + err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); + if (err < 0) { + svc_destroy(nfsd_serv); + return err; + } + + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; + return err; +} + +/* + * A '-' followed by the 'name' of a socket means we close the socket. + */ +static ssize_t __write_ports_delfd(char *buf) +{ + char *toclose; + int len = 0; + + toclose = kstrdup(buf + 1, GFP_KERNEL); + if (toclose == NULL) + return -ENOMEM; + + if (nfsd_serv != NULL) + len = svc_sock_names(nfsd_serv, buf, + SIMPLE_TRANSACTION_LIMIT, toclose); + kfree(toclose); + return len; +} + +/* + * A transport listener is added by writing it's transport name and + * a port number. 
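+ *
+ * (Hypothetical example, not from the original patch: writing
+ * "tcp 2049\n" to the portlist file creates IPv4 and IPv6 "tcp"
+ * listeners on port 2049 via svc_create_xprt().)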
+ */ +static ssize_t __write_ports_addxprt(char *buf) +{ + char transport[16]; + struct svc_xprt *xprt; + int port, err; + + if (sscanf(buf, "%15s %4u", transport, &port) != 2) + return -EINVAL; + + if (port < 1 || port > USHRT_MAX) + return -EINVAL; + + err = nfsd_create_serv(); + if (err != 0) + return err; + + err = svc_create_xprt(nfsd_serv, transport, &init_net, + PF_INET, port, SVC_SOCK_ANONYMOUS); + if (err < 0) + goto out_err; + + err = svc_create_xprt(nfsd_serv, transport, &init_net, + PF_INET6, port, SVC_SOCK_ANONYMOUS); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_close; + + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; + return 0; +out_close: + xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); + if (xprt != NULL) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + } +out_err: + svc_destroy(nfsd_serv); + return err; +} + +/* + * A transport listener is removed by writing a "-", it's transport + * name, and it's port number. + */ +static ssize_t __write_ports_delxprt(char *buf) +{ + struct svc_xprt *xprt; + char transport[16]; + int port; + + if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) + return -EINVAL; + + if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) + return -EINVAL; + + xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); + if (xprt == NULL) + return -ENOTCONN; + + svc_close_xprt(xprt); + svc_xprt_put(xprt); + return 0; +} + +static ssize_t __write_ports(struct file *file, char *buf, size_t size) +{ + if (size == 0) + return __write_ports_names(buf); + + if (isdigit(buf[0])) + return __write_ports_addfd(buf); + + if (buf[0] == '-' && isdigit(buf[1])) + return __write_ports_delfd(buf); + + if (isalpha(buf[0])) + return __write_ports_addxprt(buf); + + if (buf[0] == '-' && isalpha(buf[1])) + return __write_ports_delxprt(buf); + + return -EINVAL; +} + +/** + * write_ports - Pass a socket file descriptor or transport name to listen on + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with a '\n'-terminated C + * string containing a whitespace-separated list of + * named NFSD listeners; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing a bound + * but unconnected socket that is to be + * used as an NFSD listener; listen(3) + * must be called for a SOCK_STREAM + * socket, otherwise it is ignored + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique alphanumeric name of + * the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by an integer value representing a + * previously passed in socket file + * descriptor + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service no longer listens on that socket; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique name of the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a transport + * name and an unsigned integer value + * representing the port to listen on, + * separated by whitespace + * size: non-zero length of C 
string in @buf + * Output: + * On success: returns zero; NFS service is started + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by a transport name and an unsigned + * integer value representing the port + * to listen on, separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service no longer listens + * on that transport + * On error: return code is a negative errno value + */ +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + + +int nfsd_max_blksize; + +/** + * write_maxblksize - Set or report the current NFS blksize + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of the current NFS blksize + * setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + if (size > 0) { + int bsize; + int rv = get_int(&mesg, &bsize); + if (rv) + return rv; + /* force bsize into allowed range and + * required alignment. + */ + if (bsize < 1024) + bsize = 1024; + if (bsize > NFSSVC_MAXBLKSIZE) + bsize = NFSSVC_MAXBLKSIZE; + bsize &= ~(1024-1); + mutex_lock(&nfsd_mutex); + if (nfsd_serv) { + mutex_unlock(&nfsd_mutex); + return -EBUSY; + } + nfsd_max_blksize = bsize; + mutex_unlock(&nfsd_mutex); + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", + nfsd_max_blksize); +} + +#ifdef CONFIG_NFSD_V4 +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +{ + char *mesg = buf; + int rv, i; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + rv = get_int(&mesg, &i); + if (rv) + return rv; + /* + * Some sanity checking. We don't have a reason for + * these particular numbers, but problems with the + * extremes are: + * - Too short: the briefest network outage may + * cause clients to lose all their locks. Also, + * the frequent polling may be wasteful. + * - Too long: do you really want reboot recovery + * to take more than an hour? Or to make other + * clients wait an hour before being able to + * revoke a dead client's locks? 
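+ *
+ * (Illustrative note: the check below therefore accepts values between
+ * 10 and 3600 seconds, e.g. writing "90\n" to the nfsv4leasetime file
+ * while nfsd is not running sets a 90-second lease.)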
+ */ + if (i < 10 || i > 3600) + return -EINVAL; + *time = i; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); +} + +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __nfsd4_write_time(file, buf, size, time); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/** + * write_leasetime - Set or report the current NFSv4 lease time + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFSv4 lease expiry time + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing unsigned integer value of the + * current lease expiry time; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + return nfsd4_write_time(file, buf, size, &nfsd4_lease); +} + +/** + * write_gracetime - Set or report current NFSv4 grace period time + * + * As above, but sets the time of the NFSv4 grace period. + * + * Note this should never be set to less than the *previous* + * lease-period time, but we don't try to enforce this. (In the common + * case (a new boot), we don't know what the previous lease time was + * anyway.) + */ +static ssize_t write_gracetime(struct file *file, char *buf, size_t size) +{ + return nfsd4_write_time(file, buf, size, &nfsd4_grace); +} + +extern char *nfs4_recoverydir(void); + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *recdir; + int len, status; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; + + status = nfs4_reset_recoverydir(recdir); + if (status) + return status; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", + nfs4_recoverydir()); +} + +/** + * write_recoverydir - Set or report the pathname of the recovery directory + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing the pathname + * of the directory on a local file + * system containing permanent NFSv4 + * recovery data + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing the current recovery pathname setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +#endif + +/*----------------------------------------------------------------------------*/ +/* + * populating the filesystem. 
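+ *
+ * (Illustrative note, not part of the original patch: the magic number
+ * 0x6e667364 passed to simple_fill_super() below is just the ASCII
+ * bytes "nfsd".)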
+ */ + +static int nfsd_fill_super(struct super_block * sb, void * data, int silent) +{ + static struct tree_descr nfsd_files[] = { +#ifdef CONFIG_NFSD_DEPRECATED + [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, + [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, + [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, + [NFSD_Export] = {".export", &transaction_ops, S_IWUSR}, + [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, + [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif + [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, + [NFSD_FO_UnlockIP] = {"unlock_ip", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_FO_UnlockFS] = {"unlock_filesystem", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, + [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) + [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +} + +static struct dentry *nfsd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, nfsd_fill_super); +} + +static struct file_system_type nfsd_fs_type = { + .owner = THIS_MODULE, + .name = "nfsd", + .mount = nfsd_mount, + .kill_sb = kill_litter_super, +}; + +#ifdef CONFIG_PROC_FS +static int create_proc_exports_entry(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/nfs", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("exports", 0, entry, &exports_operations); + if (!entry) + return -ENOMEM; + return 0; +} +#else /* CONFIG_PROC_FS */ +static int create_proc_exports_entry(void) +{ + return 0; +} +#endif + +static int __init init_nfsd(void) +{ + int retval; + printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); + + retval = nfs4_state_init(); /* nfs4 locking state */ + if (retval) + return retval; + nfsd_stat_init(); /* Statistics */ + retval = nfsd_reply_cache_init(); + if (retval) + goto out_free_stat; + retval = nfsd_export_init(); + if (retval) + goto out_free_cache; + nfsd_lockd_init(); /* lockd->nfsd callbacks */ + retval = nfsd_idmap_init(); + if (retval) + goto out_free_lockd; + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; + return 0; +out_free_all: + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); +out_free_idmap: + nfsd_idmap_shutdown(); +out_free_lockd: + nfsd_lockd_shutdown(); + 
nfsd_export_shutdown(); +out_free_cache: + nfsd_reply_cache_shutdown(); +out_free_stat: + nfsd_stat_shutdown(); + nfsd4_free_slabs(); + return retval; +} + +static void __exit exit_nfsd(void) +{ + nfsd_export_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); + nfsd_stat_shutdown(); + nfsd_lockd_shutdown(); + nfsd_idmap_shutdown(); + nfsd4_free_slabs(); + unregister_filesystem(&nfsd_fs_type); +} + +MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); +module_init(init_nfsd) +module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h new file mode 100644 index 00000000..7ecfa242 --- /dev/null +++ b/fs/nfsd/nfsd.h @@ -0,0 +1,340 @@ +/* + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include +#include + +#include +#include +#include +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 1 + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern u32 nfsd_supported_minorversion; +extern struct mutex nfsd_mutex; +extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. + */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(void); +int nfsd_nrpools(void); +int nfsd_get_nrthreads(int n, int *); +int nfsd_set_nrthreads(int n, int *); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(void); + +extern int nfsd_max_blksize; + +static inline int nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned int max_delegations; +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +void nfs4_state_shutdown(void); +void nfs4_reset_lease(time_t leasetime); +int nfs4_reset_recoverydir(char *recdir); +#else +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. 
+ */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout 
cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + +#ifdef CONFIG_NFSD_V4 + +extern time_t nfsd4_lease; +extern time_t nfsd4_grace; + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. 
(GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 + : NFSD4_SUPPORTED_ATTRS_WORD2; +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. 
*/ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ +(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD2 0 + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c new file mode 100644 index 00000000..90c6aa6d --- /dev/null +++ b/fs/nfsd/nfsfh.c @@ -0,0 +1,693 @@ +/* + * NFS server file handle treatment. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * Portions Copyright (C) 1999 G. Allen Morris III + * Extensive rewrite by Neil Brown Southern-Spring 1999 + * ... and again Southern-Winter 2001 to support export_operations + */ + +#include + +#include +#include "nfsd.h" +#include "vfs.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + + +/* + * our acceptability function. + * if NOSUBTREECHECK, accept anything + * if not, require that we can walk up to exp->ex_dentry + * doing some checks on the 'x' bits + */ +static int nfsd_acceptable(void *expv, struct dentry *dentry) +{ + struct svc_export *exp = expv; + int rv; + struct dentry *tdentry; + struct dentry *parent; + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + return 1; + + tdentry = dget(dentry); + while (tdentry != exp->ex_path.dentry && !IS_ROOT(tdentry)) { + /* make sure parents give x permission to user */ + int err; + parent = dget_parent(tdentry); + err = inode_permission(parent->d_inode, MAY_EXEC); + if (err < 0) { + dput(parent); + break; + } + dput(tdentry); + tdentry = parent; + } + if (tdentry != exp->ex_path.dentry) + dprintk("nfsd_acceptable failed at %p %s\n", tdentry, tdentry->d_name.name); + rv = (tdentry == exp->ex_path.dentry); + dput(tdentry); + return rv; +} + +/* Type check. The correct error return for type mismatches does not seem to be + * generally agreed upon. SunOS seems to use EISDIR if file isn't S_IFREG; a + * comment in the NFSv3 spec says this is incorrect (implementation notes for + * the write call). + */ +static inline __be32 +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +{ + /* Type can be negative when creating hardlinks - not to a dir */ + if (type > 0 && (mode & S_IFMT) != type) { + if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) + return nfserr_symlink; + else if (type == S_IFDIR) + return nfserr_notdir; + else if ((mode & S_IFMT) == S_IFDIR) + return nfserr_isdir; + else + return nfserr_inval; + } + if (type < 0 && (mode & S_IFMT) == -type) { + if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) + return nfserr_symlink; + else if (type == -S_IFDIR) + return nfserr_isdir; + else + return nfserr_notdir; + } + return 0; +} + +static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, + struct svc_export *exp) +{ + int flags = nfsexp_flags(rqstp, exp); + + /* Check if the request originated from a secure port. 
 */ + if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + dprintk(KERN_WARNING + "nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return nfserr_perm; + } + + /* Set user creds for this exportpoint */ + return nfserrno(nfsd_setuser(rqstp, exp)); +} + +static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, + struct dentry *dentry, struct svc_export *exp) +{ + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return nfs_ok; + /* + * v2/v3 clients have no need for the V4ROOT export--they use + * the mount protocol instead; also, further V4ROOT checks may be + * in v4-specific code, in which case v2/v3 clients could bypass + * them. + */ + if (!nfsd_v4client(rqstp)) + return nfserr_stale; + /* + * We're exposing only the directories and symlinks that have to be + * traversed on the way to real exports: + */ + if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) && + !S_ISLNK(dentry->d_inode->i_mode))) + return nfserr_stale; + /* + * A pseudoroot export gives permission to access only one + * single directory; the kernel has to make another upcall + * before granting access to anything else under it: + */ + if (unlikely(dentry != exp->ex_path.dentry)) + return nfserr_stale; + return nfs_ok; +} + +/* + * Use the given filehandle to look up the corresponding export and + * dentry. On success, the results are used to set fh_export and + * fh_dentry. + */ +static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; + int fileid_type; + int data_left = fh->fh_size/4; + __be32 error; + + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + if (rqstp->rq_vers == 4 && fh->fh_size == 0) + return nfserr_nofilehandle; + + if (fh->fh_version == 1) { + int len; + + if (--data_left < 0) + return error; + if (fh->fh_auth_type != 0) + return error; + len = key_len(fh->fh_fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + /* deprecated, convert to type 3 */ + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); + fh->fh_fsid[1] = fh->fh_fsid[2]; + } + data_left -= len; + if (data_left < 0) + return error; + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; + dev_t xdev; + ino_t xino; + + if (fh->fh_size != NFS_FHSIZE) + return error; + /* assume old filehandle format */ + xdev = old_decode_dev(fh->ofh_xdev); + xino = u32_to_ino_t(fh->ofh_xino); + mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); + exp = rqst_exp_find(rqstp, FSID_DEV, tfh); + } + + error = nfserr_stale; + if (PTR_ERR(exp) == -ENOENT) + return error; + + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. 
+ */ + struct cred *new = prepare_creds(); + if (!new) + return nfserrno(-ENOMEM); + new->cap_effective = + cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + put_cred(override_creds(new)); + put_cred(new); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } + + /* + * Look up the dentry using the NFS file handle. + */ + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + + if (fh->fh_version != 1) { + sfid.i32.ino = fh->ofh_ino; + sfid.i32.gen = fh->ofh_generation; + sfid.i32.parent_ino = fh->ofh_dirino; + fid = &sfid; + data_left = 3; + if (fh->ofh_dirino == 0) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + } else + fileid_type = fh->fh_fileid_type; + + if (fileid_type == FILEID_ROOT) + dentry = dget(exp->ex_path.dentry); + else { + dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + } + if (dentry == NULL) + goto out; + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) != -EINVAL) + error = nfserrno(PTR_ERR(dentry)); + goto out; + } + + if (S_ISDIR(dentry->d_inode->i_mode) && + (dentry->d_flags & DCACHE_DISCONNECTED)) { + printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + } + + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; +out: + exp_put(exp); + return error; +} + +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. + * + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. + * + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * include/linux/nfsd/nfsd.h. + */ +__be32 +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 error; + + dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); + + if (!fhp->fh_dentry) { + error = nfsd_set_fh_dentry(rqstp, fhp); + if (error) + goto out; + } + dentry = fhp->fh_dentry; + exp = fhp->fh_export; + /* + * We still have to do all these permission checks, even when + * fh_dentry is already set: + * - fh_verify may be called multiple times with different + * "access" arguments (e.g. nfsd_proc_create calls + * fh_verify(...,NFSD_MAY_EXEC) first, then later (in + * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE). 
+ * - in the NFSv4 case, the filehandle may have been filled + * in by fh_compose, and given a dentry, but further + * compound operations performed with that filehandle + * still need permissions checks. In the worst case, a + * mountpoint crossing may have changed the export + * options, and we may now need to use a different uid + * (for example, if different id-squashing options are in + * effect on the new filesystem). + */ + error = check_pseudo_root(rqstp, dentry, exp); + if (error) + goto out; + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + + error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); + if (error) + goto out; + + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + +skip_pseudoflavor_check: + /* Finally, check access permissions. */ + error = nfsd_permission(rqstp, exp, dentry, access); + + if (error) { + dprintk("fh_verify: %s/%s permission failure, " + "acc=%x, error=%d\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + access, ntohl(error)); + } +out: + if (error == nfserr_stale) + nfsdstats.fh_stale++; + return error; +} + + +/* + * Compose a file handle for an NFS reply. + * + * Note that when first composed, the dentry may not yet have + * an inode. In this case a call to fh_update should be made + * before the fh goes out on the wire ... 
+ */ +static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry) +{ + if (dentry != exp->ex_path.dentry) { + struct fid *fid = (struct fid *) + (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1); + int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; + int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); + + fhp->fh_handle.fh_fileid_type = + exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck); + fhp->fh_handle.fh_size += maxsize * 4; + } else { + fhp->fh_handle.fh_fileid_type = FILEID_ROOT; + } +} + +/* + * for composing old style file handles + */ +static inline void _fh_update_old(struct dentry *dentry, + struct svc_export *exp, + struct knfsd_fh *fh) +{ + fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); + fh->ofh_generation = dentry->d_inode->i_generation; + if (S_ISDIR(dentry->d_inode->i_mode) || + (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + fh->ofh_dirino = 0; +} + +static bool is_root_export(struct svc_export *exp) +{ + return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; +} + +static struct super_block *exp_sb(struct svc_export *exp) +{ + return exp->ex_path.dentry->d_inode->i_sb; +} + +static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) +{ + switch (fsid_type) { + case FSID_DEV: + if (!old_valid_dev(exp_sb(exp)->s_dev)) + return 0; + /* FALL THROUGH */ + case FSID_MAJOR_MINOR: + case FSID_ENCODE_DEV: + return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; + case FSID_NUM: + return exp->ex_flags & NFSEXP_FSID; + case FSID_UUID8: + case FSID_UUID16: + if (!is_root_export(exp)) + return 0; + /* fall through */ + case FSID_UUID4_INUM: + case FSID_UUID16_INUM: + return exp->ex_uuid != NULL; + } + return 1; +} + + +static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh) +{ + u8 version; + u8 fsid_type; +retry: + version = 1; + if (ref_fh && ref_fh->fh_export == exp) { + version = ref_fh->fh_handle.fh_version; + fsid_type = ref_fh->fh_handle.fh_fsid_type; + + ref_fh = NULL; + + switch (version) { + case 0xca: + fsid_type = FSID_DEV; + break; + case 1: + break; + default: + goto retry; + } + + /* + * As the fsid -> filesystem mapping was guided by + * user-space, there is no guarantee that the filesystem + * actually supports that fsid type. If it doesn't we + * loop around again without ref_fh set. + */ + if (!fsid_type_ok_for_exp(fsid_type, exp)) + goto retry; + } else if (exp->ex_flags & NFSEXP_FSID) { + fsid_type = FSID_NUM; + } else if (exp->ex_uuid) { + if (fhp->fh_maxsize >= 64) { + if (is_root_export(exp)) + fsid_type = FSID_UUID16; + else + fsid_type = FSID_UUID16_INUM; + } else { + if (is_root_export(exp)) + fsid_type = FSID_UUID8; + else + fsid_type = FSID_UUID4_INUM; + } + } else if (!old_valid_dev(exp_sb(exp)->s_dev)) + /* for newer device numbers, we must use a newer fsid format */ + fsid_type = FSID_ENCODE_DEV; + else + fsid_type = FSID_DEV; + fhp->fh_handle.fh_version = version; + if (version) + fhp->fh_handle.fh_fsid_type = fsid_type; +} + +__be32 +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) +{ + /* ref_fh is a reference file handle. + * if it is non-null and for the same filesystem, then we should compose + * a filehandle which is of the same version, where possible. 
+ * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca + * Then create a 32byte filehandle using nfs_fhbase_old + * + */ + + struct inode * inode = dentry->d_inode; + struct dentry *parent = dentry->d_parent; + __u32 *datap; + dev_t ex_dev = exp_sb(exp)->s_dev; + + dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", + MAJOR(ex_dev), MINOR(ex_dev), + (long) exp->ex_path.dentry->d_inode->i_ino, + parent->d_name.name, dentry->d_name.name, + (inode ? inode->i_ino : 0)); + + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ + set_version_and_fsid_type(fhp, exp, ref_fh); + + if (ref_fh == fhp) + fh_put(ref_fh); + + if (fhp->fh_locked || fhp->fh_dentry) { + printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n", + parent->d_name.name, dentry->d_name.name); + } + if (fhp->fh_maxsize < NFS_FHSIZE) + printk(KERN_ERR "fh_compose: called with maxsize %d! %s/%s\n", + fhp->fh_maxsize, + parent->d_name.name, dentry->d_name.name); + + fhp->fh_dentry = dget(dentry); /* our internal copy */ + fhp->fh_export = exp; + cache_get(&exp->h); + + if (fhp->fh_handle.fh_version == 0xca) { + /* old style filehandle please */ + memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + fhp->fh_handle.ofh_dcookie = 0xfeebbaca; + fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); + fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; + fhp->fh_handle.ofh_xino = + ino_t_to_u32(exp->ex_path.dentry->d_inode->i_ino); + fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); + if (inode) + _fh_update_old(dentry, exp, &fhp->fh_handle); + } else { + int len; + fhp->fh_handle.fh_auth_type = 0; + datap = fhp->fh_handle.fh_auth+0; + mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, + exp->ex_path.dentry->d_inode->i_ino, + exp->ex_fsid, exp->ex_uuid); + + len = key_len(fhp->fh_handle.fh_fsid_type); + datap += len/4; + fhp->fh_handle.fh_size = 4 + len; + + if (inode) + _fh_update(fhp, exp, dentry); + if (fhp->fh_handle.fh_fileid_type == 255) { + fh_put(fhp); + return nfserr_opnotsupp; + } + } + + return 0; +} + +/* + * Update file handle information after changing a dentry. + * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create + */ +__be32 +fh_update(struct svc_fh *fhp) +{ + struct dentry *dentry; + + if (!fhp->fh_dentry) + goto out_bad; + + dentry = fhp->fh_dentry; + if (!dentry->d_inode) + goto out_negative; + if (fhp->fh_handle.fh_version != 1) { + _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); + } else { + if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) + goto out; + + _fh_update(fhp, fhp->fh_export, dentry); + if (fhp->fh_handle.fh_fileid_type == 255) + return nfserr_opnotsupp; + } +out: + return 0; + +out_bad: + printk(KERN_ERR "fh_update: fh not verified!\n"); + goto out; +out_negative: + printk(KERN_ERR "fh_update: %s/%s still negative!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + goto out; +} + +/* + * Release a file handle. 
 + */ +void +fh_put(struct svc_fh *fhp) +{ + struct dentry * dentry = fhp->fh_dentry; + struct svc_export * exp = fhp->fh_export; + if (dentry) { + fh_unlock(fhp); + fhp->fh_dentry = NULL; + dput(dentry); +#ifdef CONFIG_NFSD_V3 + fhp->fh_pre_saved = 0; + fhp->fh_post_saved = 0; +#endif + } + if (exp) { + cache_put(&exp->h, &svc_export_cache); + fhp->fh_export = NULL; + } + return; +} + +/* + * Shorthand for dprintk()'s + */ +char * SVCFH_fmt(struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + + static char buf[80]; + sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x", + fh->fh_size, + fh->fh_base.fh_pad[0], + fh->fh_base.fh_pad[1], + fh->fh_base.fh_pad[2], + fh->fh_base.fh_pad[3], + fh->fh_base.fh_pad[4], + fh->fh_base.fh_pad[5]); + return buf; +} + +enum fsid_source fsid_source(struct svc_fh *fhp) +{ + if (fhp->fh_handle.fh_version != 1) + return FSIDSOURCE_DEV; + switch(fhp->fh_handle.fh_fsid_type) { + case FSID_DEV: + case FSID_ENCODE_DEV: + case FSID_MAJOR_MINOR: + if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV) + return FSIDSOURCE_DEV; + break; + case FSID_NUM: + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + break; + default: + break; + } + /* either a UUID type filehandle, or the filehandle doesn't + * match the export. + */ + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + if (fhp->fh_export->ex_uuid) + return FSIDSOURCE_UUID; + return FSIDSOURCE_DEV; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h new file mode 100644 index 00000000..c16f8d83 --- /dev/null +++ b/fs/nfsd/nfsfh.h @@ -0,0 +1,206 @@ +/* Copyright (C) 1995, 1996, 1997 Olaf Kirch */ + +#ifndef _LINUX_NFSD_FH_INT_H +#define _LINUX_NFSD_FH_INT_H + +#include + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so most of the function disappears. 
+ */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = htonl(MAJOR(dev)); + fsidv[1] = htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Fill in the pre_op attr for the wcc data + */ +static inline void +fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + + inode = fhp->fh_dentry->d_inode; + if (!fhp->fh_pre_saved) { + fhp->fh_pre_mtime = inode->i_mtime; + fhp->fh_pre_ctime = inode->i_ctime; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_saved = 1; + } +} + +extern void fill_post_wcc(struct svc_fh *); +#else +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. 
+ */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %s/%s already locked!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + return; + } + + inode = dentry->d_inode; + mutex_lock_nested(&inode->i_mutex, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = 1; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + if (fhp->fh_locked) { + fill_post_wcc(fhp); + mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + fhp->fh_locked = 0; + } +} + +#endif /* _LINUX_NFSD_FH_INT_H */ diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c new file mode 100644 index 00000000..e15dc45f --- /dev/null +++ b/fs/nfsd/nfsproc.c @@ -0,0 +1,755 @@ +/* + * Process version 2 NFS requests. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#include + +#include "cache.h" +#include "xdr.h" +#include "vfs.h" + +typedef struct svc_rqst svc_rqst; +typedef struct svc_buf svc_buf; + +#define NFSDDBG_FACILITY NFSDDBG_PROC + + +static __be32 +nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +static __be32 +nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp) +{ + if (err) return err; + return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, + &resp->stat)); +} +static __be32 +nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp) +{ + if (err) return err; + return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, + &resp->stat)); +} +/* + * Get a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * Set a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", + SVCFH_fmt(&argp->fh), + argp->attrs.ia_valid, (long) argp->attrs.ia_size); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * Look up a path name component + * Note: the dentry in the resp->fh may be negative if the file + * doesn't exist yet. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + struct nfsd_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LOOKUP %s %.*s\n", + SVCFH_fmt(&argp->fh), argp->len, argp->name); + + fh_init(&resp->fh, NFS_FHSIZE); + nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, + &resp->fh); + + fh_put(&argp->fh); + return nfsd_return_dirop(nfserr, resp); +} + +/* + * Read a symlink. + */ +static __be32 +nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp, + struct nfsd_readlinkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. 
*/ + resp->len = NFS_MAXPATHLEN; + nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + + fh_put(&argp->fh); + return nfserr; +} + +/* + * Read a portion of a file. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, + struct nfsd_readres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READ %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->offset); + + /* Obtain buffer pointer for payload. 19 is 1 word for + * status, 17 words for fattr, and 1 word for the byte count. + */ + + if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_NOTICE + "oversized read request from %s (%d bytes)\n", + svc_print_addr(rqstp, buf, sizeof(buf)), + argp->count); + argp->count = NFSSVC_MAXBLKSIZE_V2; + } + svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); + + resp->count = argp->count; + nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count); + + if (nfserr) return nfserr; + return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, + &resp->stat)); +} + +/* + * Write data to a file + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + int stable = 1; + unsigned long cnt = argp->len; + + dprintk("nfsd: WRITE %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->len, argp->offset); + + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, + argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, + &stable); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * CREATE processing is complicated. The keyword here is `overloaded.' + * The parent directory is kept locked between the check for existence + * and the actual create() call in compliance with VFS protocols. + * N.B. After this call _both_ argp->fh and resp->fh need an fh_put + */ +static __be32 +nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, + struct nfsd_diropres *resp) +{ + svc_fh *dirfhp = &argp->fh; + svc_fh *newfhp = &resp->fh; + struct iattr *attr = &argp->attrs; + struct inode *inode; + struct dentry *dchild; + int type, mode; + __be32 nfserr; + dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); + + dprintk("nfsd: CREATE %s %.*s\n", + SVCFH_fmt(dirfhp), argp->len, argp->name); + + /* First verify the parent file handle */ + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); + if (nfserr) + goto done; /* must fh_put dirfhp even on error */ + + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ + + nfserr = nfserr_acces; + if (!argp->len) + goto done; + nfserr = nfserr_exist; + if (isdotent(argp->name, argp->len)) + goto done; + fh_lock_nested(dirfhp, I_MUTEX_PARENT); + dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); + if (IS_ERR(dchild)) { + nfserr = nfserrno(PTR_ERR(dchild)); + goto out_unlock; + } + fh_init(newfhp, NFS_FHSIZE); + nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); + if (!nfserr && !dchild->d_inode) + nfserr = nfserr_noent; + dput(dchild); + if (nfserr) { + if (nfserr != nfserr_noent) + goto out_unlock; + /* + * If the new file handle wasn't verified, we can't tell + * whether the file exists or not. Time to bail ... 
+ */ + nfserr = nfserr_acces; + if (!newfhp->fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_create: file handle not verified\n"); + goto out_unlock; + } + } + + inode = newfhp->fh_dentry->d_inode; + + /* Unfudge the mode bits */ + if (attr->ia_valid & ATTR_MODE) { + type = attr->ia_mode & S_IFMT; + mode = attr->ia_mode & ~S_IFMT; + if (!type) { + /* no type, so if target exists, assume same as that, + * else assume a file */ + if (inode) { + type = inode->i_mode & S_IFMT; + switch(type) { + case S_IFCHR: + case S_IFBLK: + /* reserve rdev for later checking */ + rdev = inode->i_rdev; + attr->ia_valid |= ATTR_SIZE; + + /* FALLTHROUGH */ + case S_IFIFO: + /* this is probably a permission check.. + * at least IRIX implements perm checking on + * echo thing > device-special-file-or-pipe + * by doing a CREATE with type==0 + */ + nfserr = nfsd_permission(rqstp, + newfhp->fh_export, + newfhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); + if (nfserr && nfserr != nfserr_rofs) + goto out_unlock; + } + } else + type = S_IFREG; + } + } else if (inode) { + type = inode->i_mode & S_IFMT; + mode = inode->i_mode & ~S_IFMT; + } else { + type = S_IFREG; + mode = 0; /* ??? */ + } + + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = mode; + + /* Special treatment for non-regular files according to the + * gospel of sun micro + */ + if (type != S_IFREG) { + if (type != S_IFBLK && type != S_IFCHR) { + rdev = 0; + } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { + /* If you think you've seen the worst, grok this. */ + type = S_IFIFO; + } else { + /* Okay, char or block special */ + if (!rdev) + rdev = wanted; + } + + /* we've used the SIZE information, so discard it */ + attr->ia_valid &= ~ATTR_SIZE; + + /* Make sure the type and device matches */ + nfserr = nfserr_exist; + if (inode && type != (inode->i_mode & S_IFMT)) + goto out_unlock; + } + + nfserr = 0; + if (!inode) { + /* File doesn't exist. Create it and set attrs */ + nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, type, rdev, newfhp); + } else if (type == S_IFREG) { + dprintk("nfsd: existing %s, valid=%x, size=%ld\n", + argp->name, attr->ia_valid, (long) attr->ia_size); + /* File already exists. We ignore all attributes except + * size, so that creat() behaves exactly like + * open(..., O_CREAT|O_TRUNC|O_WRONLY). + */ + attr->ia_valid &= ATTR_SIZE; + if (attr->ia_valid) + nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time_t)0); + } + +out_unlock: + /* We don't really need to unlock, as fh_put does it. */ + fh_unlock(dirfhp); + +done: + fh_put(dirfhp); + return nfsd_return_dirop(nfserr, resp); +} + +static __be32 +nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), + argp->len, argp->name); + + /* Unlink. 
-SIFDIR means file must not be a directory */ + nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); + fh_put(&argp->fh); + return nfserr; +} + +static __be32 +nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RENAME %s %.*s -> \n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); + + nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, + &argp->tfh, argp->tname, argp->tlen); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return nfserr; +} + +static __be32 +nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LINK %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, + &argp->ffh); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return nfserr; +} + +static __be32 +nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp, + void *resp) +{ + struct svc_fh newfh; + __be32 nfserr; + + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_init(&newfh, NFS_FHSIZE); + /* + * Create the link, look up new file and set attrs. + */ + nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, + argp->tname, argp->tlen, + &newfh, &argp->attrs); + + + fh_put(&argp->ffh); + fh_put(&newfh); + return nfserr; +} + +/* + * Make directory. This operation is not idempotent. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp, + struct nfsd_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + if (resp->fh.fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_mkdir: response already verified??\n"); + } + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_init(&resp->fh, NFS_FHSIZE); + nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_put(&argp->fh); + return nfsd_return_dirop(nfserr, resp); +} + +/* + * Remove a directory + */ +static __be32 +nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); + fh_put(&argp->fh); + return nfserr; +} + +/* + * Read a portion of a directory. 
+ */ +static __be32 +nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, + struct nfsd_readdirres *resp) +{ + int count; + __be32 nfserr; + loff_t offset; + + dprintk("nfsd: READDIR %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->cookie); + + /* Shrink to the client read size */ + count = (argp->count >> 2) - 2; + + /* Make sure we've room for the NULL ptr & eof flag */ + count -= 2; + if (count < 0) + count = 0; + + resp->buffer = argp->buffer; + resp->offset = NULL; + resp->buflen = count; + resp->common.err = nfs_ok; + /* Read directory and encode entries on the fly */ + offset = argp->cookie; + nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); + + resp->count = resp->buffer - argp->buffer; + if (resp->offset) + *resp->offset = htonl(offset); + + fh_put(&argp->fh); + return nfserr; +} + +/* + * Get file system info + */ +static __be32 +nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd_statfsres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); + + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); + fh_put(&argp->fh); + return nfserr; +} + +/* + * NFSv2 Server procedures. + * Only the results of non-idempotent operations are cached. + */ +struct nfsd_void { int dummy; }; + +#define ST 1 /* status */ +#define FH 8 /* filehandle */ +#define AT 18 /* attributes */ + +static struct svc_procedure nfsd_procedures2[18] = { + [NFSPROC_NULL] = { + .pc_func = (svc_procfunc) nfsd_proc_null, + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_GETATTR] = { + .pc_func = (svc_procfunc) nfsd_proc_getattr, + .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_SETATTR] = { + .pc_func = (svc_procfunc) nfsd_proc_setattr, + .pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_ROOT] = { + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_LOOKUP] = { + .pc_func = (svc_procfunc) nfsd_proc_lookup, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_READLINK] = { + .pc_func = (svc_procfunc) nfsd_proc_readlink, + .pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres, + .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_ressize = sizeof(struct nfsd_readlinkres), + .pc_cachetype = RC_NOCACHE, + 
.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + }, + [NFSPROC_READ] = { + .pc_func = (svc_procfunc) nfsd_proc_read, + .pc_decode = (kxdrproc_t) nfssvc_decode_readargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_readargs), + .pc_ressize = sizeof(struct nfsd_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + }, + [NFSPROC_WRITECACHE] = { + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_WRITE] = { + .pc_func = (svc_procfunc) nfsd_proc_write, + .pc_decode = (kxdrproc_t) nfssvc_decode_writeargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_CREATE] = { + .pc_func = (svc_procfunc) nfsd_proc_create, + .pc_decode = (kxdrproc_t) nfssvc_decode_createargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_REMOVE] = { + .pc_func = (svc_procfunc) nfsd_proc_remove, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_RENAME] = { + .pc_func = (svc_procfunc) nfsd_proc_rename, + .pc_decode = (kxdrproc_t) nfssvc_decode_renameargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_renameargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_LINK] = { + .pc_func = (svc_procfunc) nfsd_proc_link, + .pc_decode = (kxdrproc_t) nfssvc_decode_linkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_linkargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_SYMLINK] = { + .pc_func = (svc_procfunc) nfsd_proc_symlink, + .pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_symlinkargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_MKDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_mkdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_createargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_RMDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_rmdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_READDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_readdir, + .pc_decode = (kxdrproc_t) 
nfssvc_decode_readdirargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readdirres, + .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_ressize = sizeof(struct nfsd_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFSPROC_STATFS] = { + .pc_func = (svc_procfunc) nfsd_proc_statfs, + .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle, + .pc_encode = (kxdrproc_t) nfssvc_encode_statfsres, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_statfsres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+5, + }, +}; + + +struct svc_version nfsd_version2 = { + .vs_vers = 2, + .vs_nproc = 18, + .vs_proc = nfsd_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS2_SVC_XDRSIZE, +}; + +/* + * Map errnos to NFS errnos. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, +#ifdef EDQUOT + { nfserr_dquot, -EDQUOT }, +#endif + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c new file mode 100644 index 00000000..18743c4d --- /dev/null +++ b/fs/nfsd/nfssvc.c @@ -0,0 +1,663 @@ +/* + * Central processing for nfsd. + * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "nfsd.h" +#include "cache.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_SVC + +extern struct svc_program nfsd_program; +static int nfsd(void *vrqstp); +struct timeval nfssvc_boot; + +/* + * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (outside the lock) nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must be listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. 
In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions + */ +DEFINE_MUTEX(nfsd_mutex); +struct svc_serv *nfsd_serv; + +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned int nfsd_drc_max_mem; +unsigned int nfsd_drc_mem_used; + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) +static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_versions, + .pg_name = "nfsacl", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + +static struct svc_version * nfsd_version[] = { + [2] = &nfsd_version2, +#if defined(CONFIG_NFSD_V3) + [3] = &nfsd_version3, +#endif +#if defined(CONFIG_NFSD_V4) + [4] = &nfsd_version4, +#endif +}; + +#define NFSD_MINVERS 2 +#define NFSD_NRVERS ARRAY_SIZE(nfsd_version) +static struct svc_version *nfsd_versions[NFSD_NRVERS]; + +struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif + .pg_prog = NFS_PROGRAM, /* program number */ + .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ + .pg_vers = nfsd_versions, /* version table */ + .pg_name = "nfsd", /* program name */ + .pg_class = "nfsd", /* authentication class */ + .pg_stats = &nfsd_svcstats, /* version table */ + .pg_authenticate = &svc_set_client, /* export authentication */ + +}; + +u32 nfsd_supported_minorversion; + +int nfsd_vers(int vers, enum vers_op change) +{ + if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + return 0; + switch(change) { + case NFSD_SET: + nfsd_versions[vers] = nfsd_version[vers]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_versions[vers] = nfsd_acl_version[vers]; +#endif + break; + case NFSD_CLEAR: + nfsd_versions[vers] = NULL; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_versions[vers] = NULL; +#endif + break; + case NFSD_TEST: + return nfsd_versions[vers] != NULL; + case NFSD_AVAIL: + return nfsd_version[vers] != NULL; + } + return 0; +} + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + +/* + * Maximum number of nfsd processes + */ +#define NFSD_MAXSERVS 8192 + +int nfsd_nrthreads(void) +{ + int rv = 0; + mutex_lock(&nfsd_mutex); + if (nfsd_serv) + rv = nfsd_serv->sv_nrthreads; + 
mutex_unlock(&nfsd_mutex); + return rv; +} + +static int nfsd_init_socks(int port) +{ + int error; + if (!list_empty(&nfsd_serv->sv_permsocks)) + return 0; + + error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + return 0; +} + +static bool nfsd_up = false; + +static int nfsd_startup(unsigned short port, int nrservs) +{ + int ret; + + if (nfsd_up) + return 0; + /* + * Readahead param cache - will no-op if it already exists. + * (Note therefore results will be suboptimal if number of + * threads is modified after nfsd start.) + */ + ret = nfsd_racache_init(2*nrservs); + if (ret) + return ret; + ret = nfsd_init_socks(port); + if (ret) + goto out_racache; + ret = lockd_up(); + if (ret) + goto out_racache; + ret = nfs4_state_start(); + if (ret) + goto out_lockd; + nfsd_up = true; + return 0; +out_lockd: + lockd_down(); +out_racache: + nfsd_racache_shutdown(); + return ret; +} + +static void nfsd_shutdown(void) +{ + /* + * write_ports can create the server without actually starting + * any threads--if we get shut down before any threads are + * started, then nfsd_last_thread will be run before any of this + * other initialization has been done. + */ + if (!nfsd_up) + return; + nfs4_state_shutdown(); + lockd_down(); + nfsd_racache_shutdown(); + nfsd_up = false; +} + +static void nfsd_last_thread(struct svc_serv *serv) +{ + /* When last nfsd thread exits we need to do some clean-up */ + nfsd_serv = NULL; + nfsd_shutdown(); + + printk(KERN_WARNING "nfsd: last server has exited, flushing export " + "cache\n"); + nfsd_export_flush(); +} + +void nfsd_reset_versions(void) +{ + int found_one = 0; + int i; + + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { + if (nfsd_program.pg_vers[i]) + found_one = 1; + } + + if (!found_one) { + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) + nfsd_program.pg_vers[i] = nfsd_version[i]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; +#endif + } +} + +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + #define NFSD_DRC_SIZE_SHIFT 10 + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); +} + +int nfsd_create_serv(void) +{ + int err = 0; + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nfsd_serv) { + svc_get(nfsd_serv); + return 0; + } + if (nfsd_max_blksize == 0) { + /* choose a suitable default */ + struct sysinfo i; + si_meminfo(&i); + /* Aim for 1/4096 of memory per thread + * This gives 1MB on 4Gig machines + * But only uses 32K on 128M machines. + * Bottom out at 8K on 32M and smaller. + * Of course, this is only a default. 
+ */ + nfsd_max_blksize = NFSSVC_MAXBLKSIZE; + i.totalram <<= PAGE_SHIFT - 12; + while (nfsd_max_blksize > i.totalram && + nfsd_max_blksize >= 8*1024*2) + nfsd_max_blksize /= 2; + } + nfsd_reset_versions(); + + nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + nfsd_last_thread, nfsd, THIS_MODULE); + if (nfsd_serv == NULL) + return -ENOMEM; + + set_max_drc(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ + return err; +} + +int nfsd_nrpools(void) +{ + if (nfsd_serv == NULL) + return 0; + else + return nfsd_serv->sv_nrpools; +} + +int nfsd_get_nrthreads(int n, int *nthreads) +{ + int i = 0; + + if (nfsd_serv != NULL) { + for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; + } + + return 0; +} + +int nfsd_set_nrthreads(int n, int *nthreads) +{ + int i = 0; + int tot = 0; + int err = 0; + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + if (nfsd_serv == NULL || n <= 0) + return 0; + + if (n > nfsd_serv->sv_nrpools) + n = nfsd_serv->sv_nrpools; + + /* enforce a global maximum number of threads */ + tot = 0; + for (i = 0; i < n; i++) { + if (nthreads[i] > NFSD_MAXSERVS) + nthreads[i] = NFSD_MAXSERVS; + tot += nthreads[i]; + } + if (tot > NFSD_MAXSERVS) { + /* total too large: scale down requested numbers */ + for (i = 0; i < n && tot > 0; i++) { + int new = nthreads[i] * NFSD_MAXSERVS / tot; + tot -= (nthreads[i] - new); + nthreads[i] = new; + } + for (i = 0; i < n && tot > 0; i++) { + nthreads[i]--; + tot--; + } + } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + + /* apply the new numbers */ + svc_get(nfsd_serv); + for (i = 0; i < n; i++) { + err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], + nthreads[i]); + if (err) + break; + } + svc_destroy(nfsd_serv); + + return err; +} + +/* + * Adjust the number of threads and return the new number of threads. + * This is also the function that starts the server if necessary, if + * this is the first time nrservs is nonzero. + */ +int +nfsd_svc(unsigned short port, int nrservs) +{ + int error; + bool nfsd_up_before; + + mutex_lock(&nfsd_mutex); + dprintk("nfsd: creating service\n"); + if (nrservs <= 0) + nrservs = 0; + if (nrservs > NFSD_MAXSERVS) + nrservs = NFSD_MAXSERVS; + error = 0; + if (nrservs == 0 && nfsd_serv == NULL) + goto out; + + error = nfsd_create_serv(); + if (error) + goto out; + + nfsd_up_before = nfsd_up; + + error = nfsd_startup(port, nrservs); + if (error) + goto out_destroy; + error = svc_set_num_threads(nfsd_serv, NULL, nrservs); + if (error) + goto out_shutdown; + /* We are holding a reference to nfsd_serv which + * we don't want to count in the return value, + * so subtract 1 + */ + error = nfsd_serv->sv_nrthreads - 1; +out_shutdown: + if (error < 0 && !nfsd_up_before) + nfsd_shutdown(); +out_destroy: + svc_destroy(nfsd_serv); /* Release server */ +out: + mutex_unlock(&nfsd_mutex); + return error; +} + + +/* + * This is the NFS server kernel thread + */ +static int +nfsd(void *vrqstp) +{ + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; + int err, preverr = 0; + + /* Lock module and set up kernel thread */ + mutex_lock(&nfsd_mutex); + + /* At this point, the thread shares current->fs + * with the init process. We need to create files with a + * umask of 0 instead of init's umask. 
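[Aside, not part of the patch: to make the two sizing heuristics above concrete, with illustrative figures and 4 KiB pages. The shift i.totalram <<= PAGE_SHIFT - 12 re-expresses RAM in 4 KiB units, so halving nfsd_max_blksize (in bytes) until it no longer exceeds that number caps it at roughly 1/4096 of RAM, and the >= 8*1024*2 test stops the halving at 8 KiB.]

	128 MiB RAM -> totalram = 32768 (4 KiB units); 1 MiB halves down to 32 KiB
	4 GiB RAM   -> totalram = 1048576; 1 MiB already fits, so it stays at 1 MiB

	set_max_drc(), with NFSD_DRC_SIZE_SHIFT = 10:
	2 GiB of free buffer pages = 524288 pages
	(524288 >> 10) * 4096 = 512 * 4096 = 2 MiB available to the v4.1 DRC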
*/ + if (unshare_fs_struct() < 0) { + printk("Unable to start nfsd thread: out of memory\n"); + goto out; + } + + current->fs->umask = 0; + + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that will bring down the thread + */ + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); + + nfsdstats.th_cnt++; + mutex_unlock(&nfsd_mutex); + + /* + * We want less throttling in balance_dirty_pages() so that nfs to + * localhost doesn't cause nfsd to lock up due to all the client's + * dirty pages. + */ + current->flags |= PF_LESS_THROTTLE; + set_freezable(); + + /* + * The main request loop + */ + for (;;) { + /* + * Find a socket with data available and call its + * recvfrom routine. + */ + while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) + ; + if (err == -EINTR) + break; + else if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, -err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + + + /* Lock the export hash tables for reading. */ + exp_readlock(); + + validate_process_creds(); + svc_process(rqstp); + validate_process_creds(); + + /* Unlock export hash tables */ + exp_readunlock(); + } + + /* Clear signals before calling svc_exit_thread() */ + flush_signals(current); + + mutex_lock(&nfsd_mutex); + nfsdstats.th_cnt --; + +out: + /* Release the thread */ + svc_exit_thread(rqstp); + + /* Release module */ + mutex_unlock(&nfsd_mutex); + module_put_and_exit(0); + return 0; +} + +static __be32 map_new_errors(u32 vers, __be32 nfserr) +{ + if (nfserr == nfserr_jukebox && vers == 2) + return nfserr_dropit; + if (nfserr == nfserr_wrongsec && vers < 4) + return nfserr_acces; + return nfserr; +} + +int +nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + struct svc_procedure *proc; + kxdrproc_t xdr; + __be32 nfserr; + __be32 *nfserrp; + + dprintk("nfsd_dispatch: vers %d proc %d\n", + rqstp->rq_vers, rqstp->rq_proc); + proc = rqstp->rq_procinfo; + + /* Check whether we have this call in the cache. */ + switch (nfsd_cache_lookup(rqstp, proc->pc_cachetype)) { + case RC_INTR: + case RC_DROPIT: + return 0; + case RC_REPLY: + return 1; + case RC_DOIT:; + /* do it */ + } + + /* Decode arguments */ + xdr = proc->pc_decode; + if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { + dprintk("nfsd: failed to decode arguments!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_garbage_args; + return 1; + } + + /* need to grab the location to store the status, as + * nfsv4 does some encoding while processing + */ + nfserrp = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; + rqstp->rq_res.head[0].iov_len += sizeof(__be32); + + /* Now call the procedure handler, and encode NFS status. */ + nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + nfserr = map_new_errors(rqstp->rq_vers, nfserr); + if (nfserr == nfserr_dropit || rqstp->rq_dropme) { + dprintk("nfsd: Dropping request; may be revisited later\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + return 0; + } + + if (rqstp->rq_proc != 0) + *nfserrp++ = nfserr; + + /* Encode result. + * For NFSv2, additional info is never returned in case of an error. + */ + if (!(nfserr && rqstp->rq_vers == 2)) { + xdr = proc->pc_encode; + if (xdr && !xdr(rqstp, nfserrp, + rqstp->rq_resp)) { + /* Failed to encode result. 
Release cache entry */ + dprintk("nfsd: failed to encode result!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_system_err; + return 1; + } + } + + /* Store reply in cache. */ + nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); + return 1; +} + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + int ret; + mutex_lock(&nfsd_mutex); + if (nfsd_serv == NULL) { + mutex_unlock(&nfsd_mutex); + return -ENODEV; + } + /* bump up the psudo refcount while traversing */ + svc_get(nfsd_serv); + ret = svc_pool_stats_open(nfsd_serv, file); + mutex_unlock(&nfsd_mutex); + return ret; +} + +int nfsd_pool_stats_release(struct inode *inode, struct file *file) +{ + int ret = seq_release(inode, file); + mutex_lock(&nfsd_mutex); + /* this function really, really should have been called svc_put() */ + svc_destroy(nfsd_serv); + mutex_unlock(&nfsd_mutex); + return ret; +} diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c new file mode 100644 index 00000000..65ec595e --- /dev/null +++ b/fs/nfsd/nfsxdr.c @@ -0,0 +1,545 @@ +/* + * XDR support for nfsd + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include "xdr.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs_ftypes[] = { + NFNON, NFCHR, NFCHR, NFBAD, + NFDIR, NFBAD, NFBLK, NFBAD, + NFREG, NFBAD, NFLNK, NFBAD, + NFSOCK, NFBAD, NFLNK, NFBAD, +}; + + +/* + * XDR functions for basic NFS types + */ +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + fh_init(fhp, NFS_FHSIZE); + memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + + /* FIXME: Look up export pointer here and verify + * Sun Secure RPC if requested */ + return p + (NFS_FHSIZE >> 2); +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); + return p + (NFS_FHSIZE>> 2); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_pathname(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr(__be32 *p, struct iattr *iap) +{ + u32 tmp, tmp1; + + iap->ia_valid = 0; + + /* Sun client bug compatibility check: some sun clients seem to + * put 0xffff in the mode field when they mean 0xffffffff. + * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. 
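[Aside, not part of the patch: the nfs_ftypes[] table above is indexed by the top four mode bits, (mode & S_IFMT) >> 12, which is how encode_fattr() later turns an inode mode into an NFSv2 file type; note that FIFOs land on the NFCHR slot because NFSv2 has no FIFO type. A small stand-alone illustration; the strings are just labels, not kernel symbols.]

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	static const char *ftype[16] = {
		"NFNON",  "NFCHR", "NFCHR", "NFBAD",
		"NFDIR",  "NFBAD", "NFBLK", "NFBAD",
		"NFREG",  "NFBAD", "NFLNK", "NFBAD",
		"NFSOCK", "NFBAD", "NFLNK", "NFBAD",
	};
	mode_t modes[] = { S_IFREG, S_IFDIR, S_IFLNK, S_IFIFO };
	size_t i;

	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++)
		printf("%#o -> %s\n", (unsigned int)modes[i],
		       ftype[(modes[i] & S_IFMT) >> 12]);
	/* 0100000 -> NFREG, 040000 -> NFDIR, 0120000 -> NFLNK, 010000 -> NFCHR */
	return 0;
}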
+ */ + if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_UID; + iap->ia_uid = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_GID; + iap->ia_gid = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = tmp; + iap->ia_atime.tv_nsec = tmp1 * 1000; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = tmp; + iap->ia_mtime.tv_nsec = tmp1 * 1000; + /* + * Passing the invalid value useconds=1000000 for mtime + * is a Sun convention for "set both mtime and atime to + * current server time". It's needed to make permissions + * checks for the "touch" program across v2 mounts to + * Solaris and Irix boxes work correctly. See description of + * sattr in section 6.1 of "NFS Illustrated" by + * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 + */ + if (tmp1 == 1000000) + iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); + } + return p; +} + +static __be32 * +encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + struct dentry *dentry = fhp->fh_dentry; + int type; + struct timespec time; + u32 f; + + type = (stat->mode & S_IFMT); + + *p++ = htonl(nfs_ftypes[type >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { + *p++ = htonl(NFS_MAXPATHLEN); + } else { + *p++ = htonl((u32) stat->size); + } + *p++ = htonl((u32) stat->blksize); + if (S_ISCHR(type) || S_ISBLK(type)) + *p++ = htonl(new_encode_dev(stat->rdev)); + else + *p++ = htonl(0xffffffff); + *p++ = htonl((u32) stat->blocks); + switch (fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + *p++ = htonl(new_encode_dev(stat->dev)); + break; + case FSIDSOURCE_FSID: + *p++ = htonl((u32) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u32*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; + *p++ = htonl(f); + break; + } + *p++ = htonl((u32) stat->ino); + *p++ = htonl((u32) stat->atime.tv_sec); + *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); + lease_get_mtime(dentry->d_inode, &time); + *p++ = htonl((u32) time.tv_sec); + *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); + *p++ = htonl((u32) stat->ctime.tv_sec); + *p++ = htonl(stat->ctime.tv_nsec ? 
stat->ctime.tv_nsec / 1000 : 0); + + return p; +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct kstat stat; + vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &stat); + return encode_fattr(rqstp, p, fhp, &stat); +} + +/* + * XDR decode functions + */ +int +nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_sattrargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_diropargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readargs *args) +{ + unsigned int len; + int v,pn; + if (!(p = decode_fh(p, &args->fh))) + return 0; + + args->offset = ntohl(*p++); + len = args->count = ntohl(*p++); + p++; /* totalcount - unused */ + + if (len > NFSSVC_MAXBLKSIZE_V2) + len = NFSSVC_MAXBLKSIZE_V2; + + /* set up somewhere to store response. + * We take pages, put them on reslist and include in iovec + */ + v=0; + while (len > 0) { + pn = rqstp->rq_resused++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_writeargs *args) +{ + unsigned int len, hdr, dlen; + int v; + + if (!(p = decode_fh(p, &args->fh))) + return 0; + + p++; /* beginoffset */ + args->offset = ntohl(*p++); /* offset */ + p++; /* totalcount */ + len = args->len = ntohl(*p++); + /* + * The protocol specifies a maximum of 8192 bytes. + */ + if (len > NFSSVC_MAXBLKSIZE_V2) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; + dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + - hdr; + + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. 
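[Aside, not part of the patch: a quick worked example for the READ argument decoding above, assuming 4 KiB pages. The requested count is clamped to NFSSVC_MAXBLKSIZE_V2 and then spread across whole response pages, one rq_vec entry per page, so a 6000-byte read ends up as:]

	count = 6000 (already <= NFSSVC_MAXBLKSIZE_V2, so not clamped)
	rq_vec[0].iov_len = 4096	/* first response page */
	rq_vec[1].iov_len = 1904	/* remainder on a second page */
	args->vlen = 2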
+ */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + v = 0; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; + v++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; + } + rqstp->rq_vec[v].iov_len = len; + args->vlen = v + 1; + return 1; +} + +int +nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_createargs *args) +{ + if ( !(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_renameargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_linkargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_symlinkargs *args) +{ + if ( !(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_pathname(p, &args->tname, &args->tlen))) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readdirargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->cookie = ntohl(*p++); + args->count = ntohl(*p++); + if (args->count > PAGE_SIZE) + args->count = PAGE_SIZE; + + args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +int +nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_diropres *resp) +{ + p = encode_fh(p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readlinkres *resp) +{ + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; +} + +int +nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readres *resp) +{ + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->count); + xdr_ressize_check(rqstp, p); + + /* now update 
rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + } + return 1; +} + +int +nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readdirres *resp) +{ + xdr_ressize_check(rqstp, p); + p = resp->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl((resp->common.err == nfserr_eof)); + rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; + + return 1; +} + +int +nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_statfsres *resp) +{ + struct kstatfs *stat = &resp->stats; + + *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ + *p++ = htonl(stat->f_bsize); + *p++ = htonl(stat->f_blocks); + *p++ = htonl(stat->f_bfree); + *p++ = htonl(stat->f_bavail); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_entry(void *ccdv, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); + __be32 *p = cd->buffer; + int buflen, slen; + + /* + dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", + namlen, name, offset, ino); + */ + + if (offset > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + if (cd->offset) + *cd->offset = htonl(offset); + if (namlen > NFS2_MAXNAMLEN) + namlen = NFS2_MAXNAMLEN;/* truncate filename */ + + slen = XDR_QUADLEN(namlen); + if ((buflen = cd->buflen - slen - 4) < 0) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + if (ino > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + *p++ = xdr_one; /* mark entry present */ + *p++ = htonl((u32) ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + cd->offset = p; /* remember pointer */ + *p++ = htonl(~0U); /* offset of next entry */ + + cd->buflen = buflen; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; +} + +/* + * XDR release functions + */ +int +nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_fhandle *resp) +{ + fh_put(&resp->fh); + return 1; +} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h new file mode 100644 index 00000000..858c7bae --- /dev/null +++ b/fs/nfsd/state.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include +#include +#include "nfsfh.h" + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + u32 so_boot; + u32 so_stateownerid; + u32 so_fileid; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; +#define si_boot si_opaque.so_boot +#define si_stateownerid si_opaque.so_stateownerid +#define si_fileid si_opaque.so_fileid + +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_boot, \ + (s)->si_stateownerid, \ + (s)->si_fileid, \ + (s)->si_generation + +struct nfsd4_callback { + void *cb_op; + struct nfs4_client *cb_clp; + struct list_head cb_per_client; + u32 cb_minorversion; + struct rpc_message cb_msg; + const struct rpc_call_ops *cb_ops; + struct work_struct cb_work; + bool cb_done; +}; + +struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + atomic_t dl_count; /* ref count */ + struct nfs4_client *dl_client; + struct nfs4_file *dl_file; + u32 dl_type; + time_t dl_time; +/* For recall: */ + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; + struct nfsd4_callback dl_recall; +}; + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; + size_t cb_addrlen; + u32 cb_prog; /* used only in 4.0 case; + per-session otherwise */ + u32 cb_ident; /* minorversion 0 only */ + struct svc_xprt *cb_xprt; /* minorversion 1 only */ +}; + +/* Maximum number of slots per session. 
160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 1024 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + bool sl_inuse; + bool sl_cachethis; + u16 sl_opcnt; + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_conn { + struct list_head cn_persession; + struct svc_xprt *cn_xprt; + struct svc_xpt_user cn_xpt_user; + struct nfsd4_session *cn_session; +/* CDFC4_FORE, CDFC4_BACK: */ + unsigned char cn_flags; +}; + +struct nfsd4_session { + struct kref se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; + struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct list_head se_conns; + u32 se_cb_prog; + u32 se_cb_seq_nr; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +static inline void +nfsd4_put_session(struct nfsd4_session *ses) +{ + extern void free_session(struct kref *kref); + kref_put(&ses->se_ref, free_session); +} + +static inline void +nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * o Each nfs4_client is hashed by clientid. + * + * o Each nfs4_clients is also hashed by name + * (the opaque quantity initially sent by the client to identify itself). 
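[Aside, not part of the patch, just arithmetic on the constants above: each session slot may cache at most NFSD_SLOT_CACHE_SIZE bytes, and at most NFSD_CACHE_SIZE_SLOTS_PER_SESSION slots per session count against the reply-cache budget, so

	NFSD_MAX_MEM_PER_SESSION = 32 * 1024 = 32 KiB per session

Measured against the nfsd_drc_max_mem limit set up in nfssvc.c (about 1/1024 of free buffer memory, roughly 2 MiB in the earlier example), that budget admits on the order of 64 such worst-case sessions.]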
+ * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_openowners; + struct list_head cl_delegations; + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + char cl_recdir[HEXDIR_LEN]; /* recovery dir */ + nfs4_verifier cl_verifier; /* generated by client */ + time_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ + char *cl_principal; /* setclientid principal name */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + u32 cl_firststate; /* recovery dir creation */ + u32 cl_minorversion; + + /* for v4.0 and v4.1 callbacks: */ + struct nfs4_cb_conn cl_cb_conn; +#define NFSD4_CLIENT_CB_UPDATE 1 +#define NFSD4_CLIENT_KILL 2 + unsigned long cl_cb_flags; + struct rpc_clnt *cl_cb_client; + u32 cl_cb_ident; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 + int cl_cb_state; + struct nfsd4_callback cl_cb_null; + struct nfsd4_session *cl_cb_session; + struct list_head cl_callbacks; /* list of in-progress callbacks */ + + /* for all client information that callback code might need: */ + spinlock_t cl_lock; + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + /* number of rpc's in progress over an associated session: */ + atomic_t cl_refcount; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ +}; + +static inline void +mark_client_expired(struct nfs4_client *clp) +{ + clp->cl_time = 0; +} + +static inline bool +is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + +/* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volitile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + char cr_recdir[HEXDIR_LEN]; /* recover dir */ +}; + +static inline void +update_stateid(stateid_t *stateid) +{ + stateid->si_generation++; +} + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. 
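[Aside, not part of the patch: the 112-byte estimate a few lines up checks out; summing the OPEN reply pieces listed in the comment gives

	4 + 8 + 20 + 4 + 8 + 4 + 8 + 4 + 20 + ~32 = 112 bytes

which is why rp_ibuf in the replay structure below is sized NFSD4_REPLAY_ISIZE.]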
+ */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + unsigned intrp_allocated; + struct knfsd_fh rp_openfh; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +/* +* nfs4_stateowner can either be an open_owner, or a lock_owner +* +* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] +* for lock_owner +* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] +* for lock_owner +* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client +* struct is reaped. +* so_perfilestate: heads the list of nfs4_stateid (either open or lock) +* and is used to ensure no dangling nfs4_stateid references when we +* release a stateowner. +* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* close is called to reap associated byte-range locks +* so_close_lru: (open) stateowner is placed on this list instead of being +* reaped (when so_perfilestate is empty) to hold the last close replay. +* reaped by laundramat thread after lease period. +*/ +struct nfs4_stateowner { + struct kref so_ref; + struct list_head so_idhash; /* hash by so_id */ + struct list_head so_strhash; /* hash by op_name */ + struct list_head so_perclient; + struct list_head so_stateids; + struct list_head so_perstateid; /* for lockowners only */ + struct list_head so_close_lru; /* tail queue */ + time_t so_time; /* time of placement on so_close_lru */ + int so_is_open_owner; /* 1=openowner,0=lockowner */ + u32 so_id; + struct nfs4_client * so_client; + /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + * sequence id expected from the client: */ + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + int so_confirmed; /* successful OPEN_CONFIRM? */ + struct nfs4_replay so_replay; +}; + +/* +* nfs4_file: a file opened by some number of (open) nfs4_stateowners. +* o fi_perfile list is used to search for conflicting +* share_acces, share_deny on the file. +*/ +struct nfs4_file { + atomic_t fi_ref; + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; + /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ + struct file * fi_fds[3]; + /* + * Each open or lock stateid contributes 1 to either + * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending + * on open or lock mode: + */ + atomic_t fi_access[2]; + struct file *fi_deleg_file; + struct file_lock *fi_lease; + atomic_t fi_delegees; + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; +}; + +/* XXX: for first cut may fall back on returning file that doesn't work + * at all? */ +static inline struct file *find_writeable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + return f->fi_fds[O_RDWR]; +} + +static inline struct file *find_readable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return f->fi_fds[O_RDWR]; +} + +static inline struct file *find_any_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + else if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + else + return f->fi_fds[O_RDONLY]; +} + +/* +* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* +* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* +* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry +* st_perfile: file_hashtbl[] entry. 
+* st_perfile_state: nfs4_stateowner->so_perfilestate +* st_perlockowner: (open stateid) list of lock nfs4_stateowners +* st_access_bmap: used only for open stateid +* st_deny_bmap: used only for open stateid +* st_openstp: open stateid lock stateid was derived from +* +* XXX: open stateids and lock stateids have diverged sufficiently that +* we should consider defining separate structs for the two cases. +*/ + +struct nfs4_stateid { + struct list_head st_hash; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; + unsigned long st_access_bmap; + unsigned long st_deny_bmap; + struct nfs4_stateid * st_openstp; +}; + +/* flags for preprocess_seqid_op() */ +#define HAS_SESSION 0x00000001 +#define CONFIRM 0x00000002 +#define OPEN_STATE 0x00000004 +#define LOCK_STATE 0x00000008 +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 +#define CLOSE_STATE 0x00000040 + +struct nfsd4_compound_state; + +extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filp); +extern void nfs4_lock_state(void); +extern void nfs4_unlock_state(void); +extern int nfs4_in_grace(void); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +extern void nfs4_free_stateowner(struct kref *kref); +extern int set_callback_cred(void); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); +extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd4_do_callback_rpc(struct work_struct *); +extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern int nfsd4_create_callback_queue(void); +extern void nfsd4_destroy_callback_queue(void); +extern void nfsd4_shutdown_callback(struct nfs4_client *); +extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); +extern void nfsd4_init_recdir(char *recdir_name); +extern int nfsd4_recdir_load(void); +extern void nfsd4_shutdown_recdir(void); +extern int nfs4_client_to_reclaim(const char *name); +extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern void nfsd4_recdir_purge_old(void); +extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); +extern void release_session_client(struct nfsd4_session *); + +static inline void +nfs4_put_stateowner(struct nfs4_stateowner *so) +{ + kref_put(&so->so_ref, nfs4_free_stateowner); +} + +static inline void +nfs4_get_stateowner(struct nfs4_stateowner *so) +{ + kref_get(&so->so_ref); +} + +#endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c new file mode 100644 index 00000000..a2e2402b --- /dev/null +++ b/fs/nfsd/stats.c @@ -0,0 +1,104 @@ +/* + * procfs-based user access to knfsd statistics + * + * /proc/net/rpc/nfsd + * + * Format: + * rc + * Statistsics for the reply cache + * fh + * statistics for filehandle lookup + * io + * statistics for IO throughput + * th <10%-20%> <20%-30%> ... <90%-100%> <100%> + * time (seconds) when nfsd thread usage above thresholds + * and number of times that all threads were in use + * ra cache-size <10% <20% <30% ... <100% not-found + * number of times that read-ahead entry was found that deep in + * the cache. 
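[Aside, not part of the patch: the stateowner helpers declared above are kref-based through so_ref, so a hypothetical user that has looked one up pins it for the duration of the operation and drops it afterwards; nfs4_free_stateowner() only runs when the last reference goes away. Sketch only, lookup_stateowner() is a made-up name:]

	struct nfs4_stateowner *sop = lookup_stateowner(clid);	/* hypothetical lookup */

	nfs4_get_stateowner(sop);	/* hold it while we touch so_seqid / so_replay */
	/* ... process the seqid-mutating operation ... */
	nfs4_put_stateowner(sop);	/* last put ends in nfs4_free_stateowner() */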
+ * plus generic RPC stats (see net/sunrpc/stats.c) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include + +#include "nfsd.h" + +struct nfsd_stats nfsdstats; +struct svc_stat nfsd_svcstats = { + .program = &nfsd_program, +}; + +static int nfsd_proc_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", + nfsdstats.rchits, + nfsdstats.rcmisses, + nfsdstats.rcnocache, + nfsdstats.fh_stale, + nfsdstats.fh_lookup, + nfsdstats.fh_anon, + nfsdstats.fh_nocache_dir, + nfsdstats.fh_nocache_nondir, + nfsdstats.io_read, + nfsdstats.io_write); + /* thread usage: */ + seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); + for (i=0; i<10; i++) { + unsigned int jifs = nfsdstats.th_usage[i]; + unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; + seq_printf(seq, " %u.%03u", sec, msec); + } + + /* newline and ra-cache */ + seq_printf(seq, "\nra %u", nfsdstats.ra_size); + for (i=0; i<11; i++) + seq_printf(seq, " %u", nfsdstats.ra_depth[i]); + seq_putc(seq, '\n'); + + /* show my rpc info */ + svc_seq_show(seq, &nfsd_svcstats); + +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + + return 0; +} + +static int nfsd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_proc_show, NULL); +} + +static const struct file_operations nfsd_proc_fops = { + .owner = THIS_MODULE, + .open = nfsd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void +nfsd_stat_init(void) +{ + svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops); +} + +void +nfsd_stat_shutdown(void) +{ + svc_proc_unregister("nfsd"); +} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c new file mode 100644 index 00000000..acf88aea --- /dev/null +++ b/fs/nfsd/vfs.c @@ -0,0 +1,2273 @@ +/* + * File operations used by nfsd. Some of these have been ripped from + * other parts of the kernel because they weren't exported, others + * are partial duplicates with added or changed functionality. + * + * Note that several functions dget() the dentry upon which they want + * to act, most notably those that create directory entries. Response + * dentry's are dput()'d if necessary in the release callback. + * So if you notice code paths that apparently fail to dput() the + * dentry, don't worry--they have been taken care of. + * + * Copyright (C) 1995-1999 Olaf Kirch + * Zerocpy NFS support (C) 2002 Hirokazu Takahashi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + +#ifdef CONFIG_NFSD_V4 +#include "acl.h" +#include "idmap.h" +#endif /* CONFIG_NFSD_V4 */ + +#include "nfsd.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_FILEOP + + +/* + * This is a cache of readahead params that help us choose the proper + * readahead strategy. Initially, we set all readahead parameters to 0 + * and let the VFS handle things. + * If you increase the number of cached files very much, you'll need to + * add a hash table here. 
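[Aside, not part of the patch: the th histogram is printed by nfsd_proc_show() above as seconds with millisecond precision, one value per th_usage[] bucket of accumulated jiffies. A stand-alone illustration of that integer conversion; hz = 250 is only an example, the real HZ is a kernel config value:]

#include <stdio.h>

int main(void)
{
	unsigned int hz = 250;		/* illustrative; kernel HZ is config-dependent */
	unsigned int jifs = 1234;	/* jiffies accumulated in one histogram bucket */
	unsigned int sec = jifs / hz;
	unsigned int msec = (jifs % hz) * 1000 / hz;

	printf("%u.%03u\n", sec, msec);	/* prints 4.936 */
	return 0;
}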
+ */ +struct raparms { + struct raparms *p_next; + unsigned int p_count; + ino_t p_ino; + dev_t p_dev; + int p_set; + struct file_ra_state p_ra; + unsigned int p_hindex; +}; + +struct raparm_hbucket { + struct raparms *pb_head; + spinlock_t pb_lock; +} ____cacheline_aligned_in_smp; + +#define RAPARM_HASH_BITS 4 +#define RAPARM_HASH_SIZE (1<ex_path.mnt), + .dentry = dget(dentry)}; + int err = 0; + + err = follow_down(&path); + if (err < 0) + goto out; + + exp2 = rqst_exp_get_by_name(rqstp, &path); + if (IS_ERR(exp2)) { + err = PTR_ERR(exp2); + /* + * We normally allow NFS clients to continue + * "underneath" a mountpoint that is not exported. + * The exception is V4ROOT, where no traversal is ever + * allowed without an explicit export of the new + * directory. + */ + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) + err = 0; + path_put(&path); + goto out; + } + if (nfsd_v4client(rqstp) || + (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { + /* successfully crossed mount point */ + /* + * This is subtle: path.dentry is *not* on path.mnt + * at this point. The only reason we are safe is that + * original mnt is pinned down by exp, so we should + * put path *before* putting exp + */ + *dpp = path.dentry; + path.dentry = dentry; + *expp = exp2; + exp2 = exp; + } + path_put(&path); + exp_put(exp2); +out: + return err; +} + +static void follow_to_parent(struct path *path) +{ + struct dentry *dp; + + while (path->dentry == path->mnt->mnt_root && follow_up(path)) + ; + dp = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = dp; +} + +static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) +{ + struct svc_export *exp2; + struct path path = {.mnt = mntget((*exp)->ex_path.mnt), + .dentry = dget(dparent)}; + + follow_to_parent(&path); + + exp2 = rqst_exp_parent(rqstp, &path); + if (PTR_ERR(exp2) == -ENOENT) { + *dentryp = dget(dparent); + } else if (IS_ERR(exp2)) { + path_put(&path); + return PTR_ERR(exp2); + } else { + *dentryp = dget(path.dentry); + exp_put(*exp); + *exp = exp2; + } + path_put(&path); + return 0; +} + +/* + * For nfsd purposes, we treat V4ROOT exports as though there was an + * export at *every* directory. + */ +int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) +{ + if (d_mountpoint(dentry)) + return 1; + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return 0; + return dentry->d_inode != NULL; +} + +__be32 +nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, + const char *name, unsigned int len, + struct svc_export **exp_ret, struct dentry **dentry_ret) +{ + struct svc_export *exp; + struct dentry *dparent; + struct dentry *dentry; + int host_err; + + dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + + dparent = fhp->fh_dentry; + exp = fhp->fh_export; + exp_get(exp); + + /* Lookup the name, but don't follow links */ + if (isdotent(name, len)) { + if (len==1) + dentry = dget(dparent); + else if (dparent != exp->ex_path.dentry) + dentry = dget_parent(dparent); + else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) + dentry = dget(dparent); /* .. == . 
just like at / */ + else { + /* checking mountpoint crossing is very different when stepping up */ + host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); + if (host_err) + goto out_nfserr; + } + } else { + fh_lock(fhp); + dentry = lookup_one_len(name, dparent, len); + host_err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_nfserr; + /* + * check if we have crossed a mount point ... + */ + if (nfsd_mountpoint(dentry, exp)) { + if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { + dput(dentry); + goto out_nfserr; + } + } + } + *dentry_ret = dentry; + *exp_ret = exp; + return 0; + +out_nfserr: + exp_put(exp); + return nfserrno(host_err); +} + +/* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put + * + * If the lookup would cross a mountpoint, and the mounted filesystem + * is exported to the client with NFSEXP_NOHIDE, then the lookup is + * accepted as it stands and the mounted directory is + * returned. Otherwise the covered directory is returned. + * NOTE: this mountpoint crossing is not supported properly by all + * clients and is explicitly disallowed for NFSv3 + * NeilBrown + */ +__be32 +nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, + unsigned int len, struct svc_fh *resfh) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); + if (err) + return err; + err = check_nfsd_access(exp, rqstp); + if (err) + goto out; + /* + * Note: we compose the file handle now, but as the + * dentry may be negative, it may need to be updated. + */ + err = fh_compose(resfh, exp, dentry, fhp); + if (!err && !dentry->d_inode) + err = nfserr_noent; +out: + dput(dentry); + exp_put(exp); + return err; +} + +static int nfsd_break_lease(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + return break_lease(inode, O_WRONLY | O_NONBLOCK); +} + +/* + * Commit metadata changes to stable storage. + */ +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + const struct export_operations *export_ops = inode->i_sb->s_export_op; + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); +} + +/* + * Set various file attributes. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + int ftype = 0; + __be32 err; + int host_err; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + goto out; + + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" + * which only requires access, and "set-[ac]time-to-X" which + * requires ownership. 
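[Aside, not part of the patch: nfsd_break_lease() above ties back into the error table in nfsproc.c. The lease is broken with O_NONBLOCK, so while a conflicting lease or delegation is still being recalled the call typically fails with -EWOULDBLOCK, which nfserrno() maps to nfserr_jukebox (and map_new_errors() turns into a dropped request for v2 clients), so the client simply retries later. The usual caller pattern is a sketch like:]

	host_err = nfsd_break_lease(inode);
	if (host_err)				/* e.g. -EWOULDBLOCK during lease recall */
		return nfserrno(host_err);	/* -> nfserr_jukebox; client retries */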
+ * So if it looks like it might be "set both to the same time which + * is close to now", and if inode_change_ok fails, then we + * convert to "set to now" instead of "set to explicit time" + * + * We only call inode_change_ok as the last test as technically + * it is not an interface that we should be using. It is only + * valid if the filesystem does not define it's own i_op->setattr. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. + */ + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } + } + + /* + * The size case is special. + * It changes the file as well as the attributes. + */ + if (iap->ia_valid & ATTR_SIZE) { + if (iap->ia_size < inode->i_size) { + err = nfsd_permission(rqstp, fhp->fh_export, dentry, + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + } + + host_err = get_write_access(inode); + if (host_err) + goto out_nfserr; + + size_change = 1; + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) { + put_write_access(inode); + goto out_nfserr; + } + } + + /* sanitize the mode change */ + if (iap->ia_valid & ATTR_MODE) { + iap->ia_mode &= S_IALLUGO; + iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); + } + + /* Revoke setuid/setgid on chown */ + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { + iap->ia_valid |= ATTR_KILL_PRIV; + if (iap->ia_valid & ATTR_MODE) { + /* we're setting mode too, just clear the s*id bits */ + iap->ia_mode &= ~S_ISUID; + if (iap->ia_mode & S_IXGRP) + iap->ia_mode &= ~S_ISGID; + } else { + /* set ATTR_KILL_* bits and let VFS handle it */ + iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + } + } + + /* Change the attributes. 
*/ + + iap->ia_valid |= ATTR_CTIME; + + err = nfserr_notsync; + if (!check_guard || guardtime == inode->i_ctime.tv_sec) { + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_nfserr; + fh_lock(fhp); + + host_err = notify_change(dentry, iap); + err = nfserrno(host_err); + fh_unlock(fhp); + } + if (size_change) + put_write_access(inode); + if (!err) + commit_metadata(fhp); +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +#if defined(CONFIG_NFSD_V2_ACL) || \ + defined(CONFIG_NFSD_V3_ACL) || \ + defined(CONFIG_NFSD_V4) +static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) +{ + ssize_t buflen; + ssize_t ret; + + buflen = vfs_getxattr(dentry, key, NULL, 0); + if (buflen <= 0) + return buflen; + + *buf = kmalloc(buflen, GFP_KERNEL); + if (!*buf) + return -ENOMEM; + + ret = vfs_getxattr(dentry, key, *buf, buflen); + if (ret < 0) + kfree(*buf); + return ret; +} +#endif + +#if defined(CONFIG_NFSD_V4) +static int +set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) +{ + int len; + size_t buflen; + char *buf = NULL; + int error = 0; + + buflen = posix_acl_xattr_size(pacl->a_count); + buf = kmalloc(buflen, GFP_KERNEL); + error = -ENOMEM; + if (buf == NULL) + goto out; + + len = posix_acl_to_xattr(pacl, buf, buflen); + if (len < 0) { + error = len; + goto out; + } + + error = vfs_setxattr(dentry, key, buf, len, 0); +out: + kfree(buf); + return error; +} + +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) { + return nfserr_attrnotsupp; + } else if (host_error < 0) + goto out_nfserr; + + host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) + host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + +static struct posix_acl * +_get_posix_acl(struct dentry *dentry, char *key) +{ + void *buf = NULL; + struct posix_acl *pacl = NULL; + int buflen; + + buflen = nfsd_getxattr(dentry, key, &buf); + if (!buflen) + buflen = -ENODATA; + if (buflen <= 0) + return ERR_PTR(buflen); + + pacl = posix_acl_from_xattr(buf, buflen); + kfree(buf); + return pacl; +} + +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); + if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(pacl)) { + error = PTR_ERR(pacl); + pacl = NULL; + goto out; + } + + if (S_ISDIR(inode->i_mode)) { + dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); + if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) + dpacl = NULL; + else if (IS_ERR(dpacl)) { + error = 
PTR_ERR(dpacl); + dpacl = NULL; + goto out; + } + flags = NFS4_ACL_DIR; + } + + *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); + if (IS_ERR(*acl)) { + error = PTR_ERR(*acl); + *acl = NULL; + } + out: + posix_acl_release(pacl); + posix_acl_release(dpacl); + return error; +} + +#endif /* defined(CONFIG_NFSD_V4) */ + +#ifdef CONFIG_NFSD_V3 +/* + * Check server access rights to a file system object + */ +struct accessmap { + u32 access; + int how; +}; +static struct accessmap nfs3_regaccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_diraccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_anyaccess[] = { + /* Some clients - Solaris 2.6 at least, make an access call + * to the server to check for access for things like /dev/null + * (which really, the server doesn't care about). So + * We provide simple access checking for them, looking + * mainly at mode bits, and we make sure to ignore read-only + * filesystem checks + */ + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + + { 0, 0 } +}; + +__be32 +nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) +{ + struct accessmap *map; + struct svc_export *export; + struct dentry *dentry; + u32 query, result = 0, sresult = 0; + __be32 error; + + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (error) + goto out; + + export = fhp->fh_export; + dentry = fhp->fh_dentry; + + if (S_ISREG(dentry->d_inode->i_mode)) + map = nfs3_regaccess; + else if (S_ISDIR(dentry->d_inode->i_mode)) + map = nfs3_diraccess; + else + map = nfs3_anyaccess; + + + query = *access; + for (; map->access; map++) { + if (map->access & query) { + __be32 err2; + + sresult |= map->access; + + err2 = nfsd_permission(rqstp, export, dentry, map->how); + switch (err2) { + case nfs_ok: + result |= map->access; + break; + + /* the following error codes just mean the access was not allowed, + * rather than an error occurred */ + case nfserr_rofs: + case nfserr_acces: + case nfserr_perm: + /* simply don't "or" in the access bit. */ + break; + default: + error = err2; + goto out; + } + } + } + *access = result; + if (supported) + *supported = sresult; + + out: + return error; +} +#endif /* CONFIG_NFSD_V3 */ + +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} + +/* + * Open an existing file or directory. + * The access argument indicates the type of open (read/write/lock) + * N.B. 
After this call fhp needs an fh_put + */ +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + int access, struct file **filp) +{ + struct dentry *dentry; + struct inode *inode; + int flags = O_RDONLY|O_LARGEFILE; + __be32 err; + int host_err = 0; + + validate_process_creds(); + + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + */ + err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Disallow write access to files with the append-only bit set + * or any access when mandatory locking enabled + */ + err = nfserr_perm; + if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) + goto out; + /* + * We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ + if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) + goto out; + + if (!inode->i_fop) + goto out; + + host_err = nfsd_open_break_lease(inode, access); + if (host_err) /* NOMEM or WOULDBLOCK */ + goto out_nfserr; + + if (access & NFSD_MAY_WRITE) { + if (access & NFSD_MAY_READ) + flags = O_RDWR|O_LARGEFILE; + else + flags = O_WRONLY|O_LARGEFILE; + } + *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), + flags, current_cred()); + if (IS_ERR(*filp)) + host_err = PTR_ERR(*filp); + else + host_err = ima_file_check(*filp, access); +out_nfserr: + err = nfserrno(host_err); +out: + validate_process_creds(); + return err; +} + +/* + * Close a file. + */ +void +nfsd_close(struct file *filp) +{ + fput(filp); +} + +/* + * Obtain the readahead parameters for the file + * specified by (dev, ino). + */ + +static inline struct raparms * +nfsd_get_raparms(dev_t dev, ino_t ino) +{ + struct raparms *ra, **rap, **frap = NULL; + int depth = 0; + unsigned int hash; + struct raparm_hbucket *rab; + + hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; + rab = &raparm_hash[hash]; + + spin_lock(&rab->pb_lock); + for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { + if (ra->p_ino == ino && ra->p_dev == dev) + goto found; + depth++; + if (ra->p_count == 0) + frap = rap; + } + depth = nfsdstats.ra_size; + if (!frap) { + spin_unlock(&rab->pb_lock); + return NULL; + } + rap = frap; + ra = *frap; + ra->p_dev = dev; + ra->p_ino = ino; + ra->p_set = 0; + ra->p_hindex = hash; +found: + if (rap != &rab->pb_head) { + *rap = ra->p_next; + ra->p_next = rab->pb_head; + rab->pb_head = ra; + } + ra->p_count++; + nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; + spin_unlock(&rab->pb_lock); + return ra; +} + +/* + * Grab and keep cached pages associated with a file in the svc_rqst + * so that they can be passed to the network sendmsg/sendpage routines + * directly. They will be released after the sending has completed. 
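+ *
+ * Each pipe buffer handed to nfsd_splice_actor() below carries one page
+ * cache page; the page is pinned with get_page() and parked in
+ * rqstp->rq_respages, so the reply can point straight at the page cache
+ * instead of copying the file data into the reply buffer.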
+ */ +static int +nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct svc_rqst *rqstp = sd->u.data; + struct page **pp = rqstp->rq_respages + rqstp->rq_resused; + struct page *page = buf->page; + size_t size; + + size = sd->len; + + if (rqstp->rq_res.page_len == 0) { + get_page(page); + put_page(*pp); + *pp = page; + rqstp->rq_resused++; + rqstp->rq_res.page_base = buf->offset; + rqstp->rq_res.page_len = size; + } else if (page != pp[-1]) { + get_page(page); + if (*pp) + put_page(*pp); + *pp = page; + rqstp->rq_resused++; + rqstp->rq_res.page_len += size; + } else + rqstp->rq_res.page_len += size; + + return size; +} + +static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + return __splice_from_pipe(pipe, sd, nfsd_splice_actor); +} + +static __be32 +nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + mm_segment_t oldfs; + __be32 err; + int host_err; + + err = nfserr_perm; + + if (file->f_op->splice_read && rqstp->rq_splice_ok) { + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + + rqstp->rq_resused = 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + } else { + oldfs = get_fs(); + set_fs(KERNEL_DS); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + } + + if (host_err >= 0) { + nfsdstats.io_read += host_err; + *count = host_err; + err = 0; + fsnotify_access(file); + } else + err = nfserrno(host_err); + return err; +} + +static void kill_suid(struct dentry *dentry) +{ + struct iattr ia; + ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + + mutex_lock(&dentry->d_inode->i_mutex); + notify_change(dentry, &ia); + mutex_unlock(&dentry->d_inode->i_mutex); +} + +/* + * Gathered writes: If another process is currently writing to the file, + * there's a high chance this is another nfsd (triggered by a bulk write + * from a client's biod). Rather than syncing the file with each write + * request, we sleep for 10 msec. + * + * I don't know if this roughly approximates C. Juszak's idea of + * gathered writes, but it's a nice and simple solution (IMHO), and it + * seems to work:-) + * + * Note: we do this only in the NFSv2 case, since v3 and higher have a + * better tool (separate unstable writes and commits) for solving this + * problem. 
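+ *
+ * Concretely: if another writer has the file open, or this write hits
+ * the same inode as the previous one, the thread sleeps for 10 msec so
+ * neighbouring writes can accumulate, and the inode is then flushed
+ * with vfs_fsync() only if it is still dirty.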
+ */ +static int wait_for_concurrent_writes(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + static ino_t last_ino; + static dev_t last_dev; + int err = 0; + + if (atomic_read(&inode->i_writecount) > 1 + || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { + dprintk("nfsd: write defer %d\n", task_pid_nr(current)); + msleep(10); + dprintk("nfsd: write resume %d\n", task_pid_nr(current)); + } + + if (inode->i_state & I_DIRTY) { + dprintk("nfsd: write sync %d\n", task_pid_nr(current)); + err = vfs_fsync(file, 0); + } + last_ino = inode->i_ino; + last_dev = inode->i_sb->s_dev; + return err; +} + +static __be32 +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, + unsigned long *cnt, int *stablep) +{ + struct svc_export *exp; + struct dentry *dentry; + struct inode *inode; + mm_segment_t oldfs; + __be32 err = 0; + int host_err; + int stable = *stablep; + int use_wgather; + + dentry = file->f_path.dentry; + inode = dentry->d_inode; + exp = fhp->fh_export; + + /* + * Request sync writes if + * - the sync export option has been set, or + * - the client requested O_SYNC behavior (NFSv3 feature). + * - The file system doesn't support fsync(). + * When NFSv2 gathered writes have been configured for this volume, + * flushing the data to disk is handled separately below. + */ + use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); + + if (!file->f_op->fsync) {/* COMMIT3 cannot work */ + stable = 2; + *stablep = 2; /* FILE_SYNC */ + } + + if (!EX_ISSYNC(exp)) + stable = 0; + if (stable && !use_wgather) { + spin_lock(&file->f_lock); + file->f_flags |= O_SYNC; + spin_unlock(&file->f_lock); + } + + /* Write the data. */ + oldfs = get_fs(); set_fs(KERNEL_DS); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + if (host_err < 0) + goto out_nfserr; + *cnt = host_err; + nfsdstats.io_write += host_err; + fsnotify_modify(file); + + /* clear setuid/setgid flag after write */ + if (inode->i_mode & (S_ISUID | S_ISGID)) + kill_suid(dentry); + + if (stable && use_wgather) + host_err = wait_for_concurrent_writes(file); + +out_nfserr: + dprintk("nfsd: write complete host_err=%d\n", host_err); + if (host_err >= 0) + err = 0; + else + err = nfserrno(host_err); + return err; +} + +/* + * Read data from a file. count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + struct file *file; + struct inode *inode; + struct raparms *ra; + __be32 err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (err) + return err; + + inode = file->f_path.dentry->d_inode; + + /* Get readahead parameters */ + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + + if (ra && ra->p_set) + file->f_ra = ra->p_ra; + + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + + /* Write back readahead params */ + if (ra) { + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); + } + + nfsd_close(file); + return err; +} + +/* As above, but use the provided file descriptor. 
*/ +__be32 +nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, + unsigned long *count) +{ + __be32 err; + + if (file) { + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + } else /* Note file may still be NULL in NFSv4 special stateid case: */ + err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); +out: + return err; +} + +/* + * Write data to a file. + * The stable flag requests synchronous writes. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, + int *stablep) +{ + __be32 err = 0; + + if (file) { + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, + stablep); + } else { + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) + goto out; + + if (cnt) + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, + cnt, stablep); + nfsd_close(file); + } +out: + return err; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Commit all pending writes to stable storage. + * + * Note: we only guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. + * + * Unfortunately we cannot lock the file to make sure we return full WCC + * data to the client, as locking happens lower down in the filesystem. + */ +__be32 +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long count) +{ + struct file *file; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; + + if (offset < 0) + goto out; + if (count != 0) { + end = offset + (loff_t)count - 1; + if (end < offset) + goto out; + } + + err = nfsd_open(rqstp, fhp, S_IFREG, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + if (err) + goto out; + if (EX_ISSYNC(fhp->fh_export)) { + int err2 = vfs_fsync_range(file, offset, end, 0); + + if (err2 != -EINVAL) + err = nfserrno(err2); + else + err = nfserr_notsupp; + } + + nfsd_close(file); +out: + return err; +} +#endif /* CONFIG_NFSD_V3 */ + +static __be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, + struct iattr *iap) +{ + /* + * Mode has already been set earlier in create: + */ + iap->ia_valid &= ~ATTR_MODE; + /* + * Setting uid/gid works only for root. Irix appears to + * send along the gid on create when it tries to implement + * setgid directories via NFS: + */ + if (current_fsuid() != 0) + iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + if (iap->ia_valid) + return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); + return 0; +} + +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + +/* + * Create a file (regular, directory, device, fifo); UNIX sockets + * not yet implemented. 
+ * If the response fh has been verified, the parent directory should + * already be locked. Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + __be32 err2; + int host_err; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + err = nfserr_notdir; + if (!dirp->i_op->lookup) + goto out; + /* + * Check whether the response file handle has been verified yet. + * If it has, the parent directory should already be locked. + */ + if (!resfhp->fh_dentry) { + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + } else { + /* called from nfsd_proc_create */ + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + /* not actually possible */ + printk(KERN_ERR + "nfsd_create: parent %s/%s not locked!\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + err = nfserr_io; + goto out; + } + } + /* + * Make sure the child dentry is still negative ... + */ + err = nfserr_exist; + if (dchild->d_inode) { + dprintk("nfsd_create: dentry %s/%s not negative!\n", + dentry->d_name.name, dchild->d_name.name); + goto out; + } + + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + + err = nfserr_inval; + if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + goto out; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + + /* + * Get the dir op function pointer. + */ + err = 0; + switch (type) { + case S_IFREG: + host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); + break; + case S_IFDIR: + host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + break; + } + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); + goto out_nfserr; + } + + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_setattr already committed the child. Transactional filesystems + * had a chance to commit changes for both parent and child + * simultaneously making the following commit_metadata a noop. + */ + err2 = nfserrno(commit_metadata(fhp)); + if (err2) + err = err2; + mnt_drop_write(fhp->fh_export->ex_path.mnt); + /* + * Update the file handle to get the new inode info. 
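+ * (The filehandle was composed against a dentry that may still have
+ * been negative at the time, so refresh it now that the object exists.)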
+ */ + if (!err) + err = fh_update(resfhp); +out: + if (dchild && !IS_ERR(dchild)) + dput(dchild); + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +#ifdef CONFIG_NFSD_V3 + +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +/* + * NFSv3 and NFSv4 version of nfsd_create + */ +__be32 +do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + struct svc_fh *resfhp, int createmode, u32 *verifier, + int *truncp, int *created) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + __u32 v_mtime=0, v_atime=0; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + /* Get all the sanity checks out of the way before + * we lock the parent. */ + err = nfserr_notdir; + if (!dirp->i_op->lookup) + goto out; + fh_lock_nested(fhp, I_MUTEX_PARENT); + + /* + * Compose the response file handle. + */ + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + + /* If file doesn't exist, check for permissions to create one */ + if (!dchild->d_inode) { + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + } + + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + + if (nfsd_create_is_exclusive(createmode)) { + /* solaris7 gets confused (bugid 4218508) if these have + * the high bit set, so just clear the high bits. If this is + * ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be fixed + * accordingly. + */ + v_mtime = verifier[0]&0x7fffffff; + v_atime = verifier[1]&0x7fffffff; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + if (dchild->d_inode) { + err = 0; + + switch (createmode) { + case NFS3_CREATE_UNCHECKED: + if (! S_ISREG(dchild->d_inode->i_mode)) + err = nfserr_exist; + else if (truncp) { + /* in nfsv4, we need to treat this case a little + * differently. we don't want to truncate the + * file now; this would be wrong if the OPEN + * fails for some other reason. furthermore, + * if the size is nonzero, we should ignore it + * according to spec! 
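+ * (So only a request to truncate to zero is remembered in *truncp here;
+ * the actual truncation is left until the OPEN as a whole succeeds.)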
+ */ + *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; + } + else { + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + } + break; + case NFS3_CREATE_EXCLUSIVE: + if ( dchild->d_inode->i_mtime.tv_sec == v_mtime + && dchild->d_inode->i_atime.tv_sec == v_atime + && dchild->d_inode->i_size == 0 ) + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if ( dchild->d_inode->i_mtime.tv_sec == v_mtime + && dchild->d_inode->i_atime.tv_sec == v_atime + && dchild->d_inode->i_size == 0 ) + goto set_attr; + /* fallthru */ + case NFS3_CREATE_GUARDED: + err = nfserr_exist; + } + mnt_drop_write(fhp->fh_export->ex_path.mnt); + goto out; + } + + host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); + goto out_nfserr; + } + if (created) + *created = 1; + + nfsd_check_ignore_resizing(iap); + + if (nfsd_create_is_exclusive(createmode)) { + /* Cram the verifier into atime/mtime */ + iap->ia_valid = ATTR_MTIME|ATTR_ATIME + | ATTR_MTIME_SET|ATTR_ATIME_SET; + /* XXX someone who knows this better please fix it for nsec */ + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + + set_attr: + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_setattr already committed the child (and possibly also the parent). + */ + if (!err) + err = nfserrno(commit_metadata(fhp)); + + mnt_drop_write(fhp->fh_export->ex_path.mnt); + /* + * Update the filehandle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); + + out: + fh_unlock(fhp); + if (dchild && !IS_ERR(dchild)) + dput(dchild); + return err; + + out_nfserr: + err = nfserrno(host_err); + goto out; +} +#endif /* CONFIG_NFSD_V3 */ + +/* + * Read a symlink. On entry, *lenp must contain the maximum path length that + * fits into the buffer. On return, it contains the true length. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) +{ + struct dentry *dentry; + struct inode *inode; + mm_segment_t oldfs; + __be32 err; + int host_err; + + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + err = nfserr_inval; + if (!inode->i_op->readlink) + goto out; + + touch_atime(fhp->fh_export->ex_path.mnt, dentry); + /* N.B. Why does this call need a get_fs()?? + * Remove the set_fs and watch the fireworks:-) --okir + */ + + oldfs = get_fs(); set_fs(KERNEL_DS); + host_err = inode->i_op->readlink(dentry, buf, *lenp); + set_fs(oldfs); + + if (host_err < 0) + goto out_nfserr; + *lenp = host_err; + err = 0; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a symlink and look up its inode + * N.B. 
After this call _both_ fhp and resfhp need an fh_put + */ +__be32 +nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, + char *path, int plen, + struct svc_fh *resfhp, + struct iattr *iap) +{ + struct dentry *dentry, *dnew; + __be32 err, cerr; + int host_err; + + err = nfserr_noent; + if (!flen || !plen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + fh_lock(fhp); + dentry = fhp->fh_dentry; + dnew = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + + if (unlikely(path[plen] != 0)) { + char *path_alloced = kmalloc(plen+1, GFP_KERNEL); + if (path_alloced == NULL) + host_err = -ENOMEM; + else { + strncpy(path_alloced, path, plen); + path_alloced[plen] = 0; + host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); + kfree(path_alloced); + } + } else + host_err = vfs_symlink(dentry->d_inode, dnew, path); + err = nfserrno(host_err); + if (!err) + err = nfserrno(commit_metadata(fhp)); + fh_unlock(fhp); + + mnt_drop_write(fhp->fh_export->ex_path.mnt); + + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); + dput(dnew); + if (err==0) err = cerr; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a hardlink + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, + char *name, int len, struct svc_fh *tfhp) +{ + struct dentry *ddir, *dnew, *dold; + struct inode *dirp; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + if (err) + goto out; + + err = nfserr_perm; + if (!len) + goto out; + err = nfserr_exist; + if (isdotent(name, len)) + goto out; + + fh_lock_nested(ffhp, I_MUTEX_PARENT); + ddir = ffhp->fh_dentry; + dirp = ddir->d_inode; + + dnew = lookup_one_len(name, ddir, len); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + dold = tfhp->fh_dentry; + + host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); + if (host_err) { + err = nfserrno(host_err); + goto out_dput; + } + err = nfserr_noent; + if (!dold->d_inode) + goto out_drop_write; + host_err = nfsd_break_lease(dold->d_inode); + if (host_err) { + err = nfserrno(host_err); + goto out_drop_write; + } + host_err = vfs_link(dold, dirp, dnew); + if (!host_err) { + err = nfserrno(commit_metadata(ffhp)); + if (!err) + err = nfserrno(commit_metadata(tfhp)); + } else { + if (host_err == -EXDEV && rqstp->rq_vers == 2) + err = nfserr_acces; + else + err = nfserrno(host_err); + } +out_drop_write: + mnt_drop_write(tfhp->fh_export->ex_path.mnt); +out_dput: + dput(dnew); +out_unlock: + fh_unlock(ffhp); +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out_unlock; +} + +/* + * Rename a file + * N.B. 
After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, + struct svc_fh *tfhp, char *tname, int tlen) +{ + struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct inode *fdir, *tdir; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + fdentry = ffhp->fh_dentry; + fdir = fdentry->d_inode; + + tdentry = tfhp->fh_dentry; + tdir = tdentry->d_inode; + + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + if (ffhp->fh_export != tfhp->fh_export) + goto out; + + err = nfserr_perm; + if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) + goto out; + + /* cannot use fh_lock as we need deadlock protective ordering + * so do it by hand */ + trap = lock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = 1; + fill_pre_wcc(ffhp); + fill_pre_wcc(tfhp); + + odentry = lookup_one_len(fname, fdentry, flen); + host_err = PTR_ERR(odentry); + if (IS_ERR(odentry)) + goto out_nfserr; + + host_err = -ENOENT; + if (!odentry->d_inode) + goto out_dput_old; + host_err = -EINVAL; + if (odentry == trap) + goto out_dput_old; + + ndentry = lookup_one_len(tname, tdentry, tlen); + host_err = PTR_ERR(ndentry); + if (IS_ERR(ndentry)) + goto out_dput_old; + host_err = -ENOTEMPTY; + if (ndentry == trap) + goto out_dput_new; + + host_err = -EXDEV; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out_dput_new; + host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt); + if (host_err) + goto out_dput_new; + + host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; + if (ndentry->d_inode) { + host_err = nfsd_break_lease(ndentry->d_inode); + if (host_err) + goto out_drop_write; + } + host_err = vfs_rename(fdir, odentry, tdir, ndentry); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } +out_drop_write: + mnt_drop_write(ffhp->fh_export->ex_path.mnt); + out_dput_new: + dput(ndentry); + out_dput_old: + dput(odentry); + out_nfserr: + err = nfserrno(host_err); + + /* we cannot rely on fh_unlock on the two filehandles, + * as that would do the wrong thing if the two directories + * were the same, so again we do it by hand + */ + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + unlock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = 0; + +out: + return err; +} + +/* + * Unlink a file or directory + * N.B.
After this call fhp needs an fh_put + */ +__be32 +nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + char *fname, int flen) +{ + struct dentry *dentry, *rdentry; + struct inode *dirp; + __be32 err; + int host_err; + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) + goto out; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + rdentry = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(rdentry); + if (IS_ERR(rdentry)) + goto out_nfserr; + + if (!rdentry->d_inode) { + dput(rdentry); + err = nfserr_noent; + goto out; + } + + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_put; + + host_err = nfsd_break_lease(rdentry->d_inode); + if (host_err) + goto out_drop_write; + if (type != S_IFDIR) + host_err = vfs_unlink(dirp, rdentry); + else + host_err = vfs_rmdir(dirp, rdentry); + if (!host_err) + host_err = commit_metadata(fhp); +out_drop_write: + mnt_drop_write(fhp->fh_export->ex_path.mnt); +out_put: + dput(rdentry); + +out_nfserr: + err = nfserrno(host_err); +out: + return err; +} + +/* + * We do this buffering because we must not call back into the file + * system's ->lookup() method from the filldir callback. That may well + * deadlock a number of file systems. + * + * This is based heavily on the implementation of same in XFS. + */ +struct buffered_dirent { + u64 ino; + loff_t offset; + int namlen; + unsigned int d_type; + char name[]; +}; + +struct readdir_data { + char *dirent; + size_t used; + int full; +}; + +static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_data *buf = __buf; + struct buffered_dirent *de = (void *)(buf->dirent + buf->used); + unsigned int reclen; + + reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); + if (buf->used + reclen > PAGE_SIZE) { + buf->full = 1; + return -EINVAL; + } + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += reclen; + + return 0; +} + +static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) +{ + struct readdir_data buf; + struct buffered_dirent *de; + int host_err; + int size; + loff_t offset; + + buf.dirent = (void *)__get_free_page(GFP_KERNEL); + if (!buf.dirent) + return nfserrno(-ENOMEM); + + offset = *offsetp; + + while (1) { + struct inode *dir_inode = file->f_path.dentry->d_inode; + unsigned int reclen; + + cdp->err = nfserr_eof; /* will be cleared on successful read */ + buf.used = 0; + buf.full = 0; + + host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); + if (buf.full) + host_err = 0; + + if (host_err < 0) + break; + + size = buf.used; + + if (!size) + break; + + /* + * Various filldir functions may end up calling back into + * lookup_one_len() and the file system's ->lookup() method. + * These expect i_mutex to be held, as it would within readdir. 
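+ * The directory itself has already been read into our private buffer
+ * by vfs_readdir() above; i_mutex is taken here only so that the nfsd
+ * callback (e.g. READDIRPLUS encoding) may safely call lookup_one_len()
+ * while we replay the buffered entries to it.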
+ */ + host_err = mutex_lock_killable(&dir_inode->i_mutex); + if (host_err) + break; + + de = (struct buffered_dirent *)buf.dirent; + while (size > 0) { + offset = de->offset; + + if (func(cdp, de->name, de->namlen, de->offset, + de->ino, de->d_type)) + break; + + if (cdp->err != nfs_ok) + break; + + reclen = ALIGN(sizeof(*de) + de->namlen, + sizeof(u64)); + size -= reclen; + de = (struct buffered_dirent *)((char *)de + reclen); + } + mutex_unlock(&dir_inode->i_mutex); + if (size > 0) /* We bailed out early */ + break; + + offset = vfs_llseek(file, 0, SEEK_CUR); + } + + free_page((unsigned long)(buf.dirent)); + + if (host_err) + return nfserrno(host_err); + + *offsetp = offset; + return cdp->err; +} + +/* + * Read entries from a directory. + * The NFSv3/4 verifier we ignore for now. + */ +__be32 +nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, + struct readdir_cd *cdp, filldir_t func) +{ + __be32 err; + struct file *file; + loff_t offset = *offsetp; + + err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); + if (err) + goto out; + + offset = vfs_llseek(file, offset, 0); + if (offset < 0) { + err = nfserrno((int)offset); + goto out_close; + } + + err = nfsd_buffered_readdir(file, func, cdp, offsetp); + + if (err == nfserr_eof || err == nfserr_toosmall) + err = nfs_ok; /* can still be found in ->err */ +out_close: + nfsd_close(file); +out: + return err; +} + +/* + * Get file system stats + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) +{ + __be32 err; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); + if (!err) { + struct path path = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + if (vfs_statfs(&path, stat)) + err = nfserr_io; + } + return err; +} + +static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) +{ + return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; +} + +/* + * Check for a user's access permissions to this inode. + */ +__be32 +nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, + struct dentry *dentry, int acc) +{ + struct inode *inode = dentry->d_inode; + int err; + + if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) + return 0; +#if 0 + dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", + acc, + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", + inode->i_mode, + IS_IMMUTABLE(inode)? " immut" : "", + IS_APPEND(inode)? " append" : "", + __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); + dprintk(" owner %d/%d user %d/%d\n", + inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid()); +#endif + + /* Normally we reject any write/sattr etc access on a read-only file + * system. But if it is IRIX doing check on write-access for a + * device special file, we ignore rofs. 
+ */ + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { + if (exp_rdonly(rqstp, exp) || + __mnt_is_readonly(exp->ex_path.mnt)) + return nfserr_rofs; + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) + return nfserr_perm; + } + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) + return nfserr_perm; + + if (acc & NFSD_MAY_LOCK) { + /* If we cannot rely on authentication in NLM requests, + * just allow locks, otherwise require read permission, or + * ownership + */ + if (exp->ex_flags & NFSEXP_NOAUTHNLM) + return 0; + else + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; + } + /* + * The file owner always gets access permission for accesses that + * would normally be checked at open time. This is to make + * file access work even when the client has done a fchmod(fd, 0). + * + * However, `cp foo bar' should fail nevertheless when bar is + * readonly. A sensible way to do this might be to reject all + * attempts to truncate a read-only file, because a creat() call + * always implies file truncation. + * ... but this isn't really fair. A process may reasonably call + * ftruncate on an open file descriptor on a file with perm 000. + * We must trust the client to do permission checking - using "ACCESS" + * with NFSv3. + */ + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && + inode->i_uid == current_fsuid()) + return 0; + + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + + /* Allow read access to binaries even when mode 111 */ + if (err == -EACCES && S_ISREG(inode->i_mode) && + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) + err = inode_permission(inode, MAY_EXEC); + + return err? 
nfserrno(err) : 0; +} + +void +nfsd_racache_shutdown(void) +{ + struct raparms *raparm, *last_raparm; + unsigned int i; + + dprintk("nfsd: freeing readahead buffers.\n"); + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } +} +/* + * Initialize readahead param cache + */ +int +nfsd_racache_init(int cache_size) +{ + int i; + int j = 0; + int nperbucket; + struct raparms **raparm = NULL; + + + if (raparm_hash[0].pb_head) + return 0; + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; + + dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + spin_lock_init(&raparm_hash[i].pb_lock); + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; + } + + nfsdstats.ra_size = cache_size; + return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; +} + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl * +nfsd_get_posix_acl(struct svc_fh *fhp, int type) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + char *name; + void *value = NULL; + ssize_t size; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + size = nfsd_getxattr(fhp->fh_dentry, name, &value); + if (size < 0) + return ERR_PTR(size); + + acl = posix_acl_from_xattr(value, size); + kfree(value); + return acl; +} + +int +nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + char *name; + void *value = NULL; + size_t size; + int error; + + if (!IS_POSIXACL(inode) || + !inode->i_op->setxattr || !inode->i_op->removexattr) + return -EOPNOTSUPP; + switch(type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EOPNOTSUPP; + } + + if (acl && acl->a_count) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + error = posix_acl_to_xattr(acl, value, size); + if (error < 0) + goto getout; + size = error; + } else + size = 0; + + error = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (error) + goto getout; + if (size) + error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); + else { + if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) + error = 0; + else { + error = vfs_removexattr(fhp->fh_dentry, name); + if (error == -ENODATA) + error = 0; + } + } + mnt_drop_write(fhp->fh_export->ex_path.mnt); + +getout: + kfree(value); + return error; +} +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h new file mode 100644 index 00000000..a22e40e2 --- /dev/null +++ b/fs/nfsd/vfs.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +#include "nfsfh.h" + +/* + * Flags for 
nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_MASK 63 + +/* extra hints to permission and open routines: */ +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 +#define NFSD_MAY_NOT_BREAK_LEASE 512 +#define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_READ_IF_EXEC 2048 + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +/* + * Callback function for readdir + */ +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_mountpoint(struct dentry *, struct svc_export *); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, + struct nfs4_acl *); +int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp, int *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file **); +void nfsd_close(struct file *); +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, + loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +int 
nfsd_notify_change(struct inode *, struct iattr *); +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); +int nfsd_sync_dir(struct dentry *dp); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); +int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); +#endif + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h new file mode 100644 index 00000000..53b1863d --- /dev/null +++ b/fs/nfsd/xdr.h @@ -0,0 +1,173 @@ +/* XDR types for nfsd. This is mainly a typing exercise. */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include +#include "nfsd.h" +#include "nfsfh.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + int len; + int vlen; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_attrstat { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + int len; +}; + +struct nfsd_readres { + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. 
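+ *
+ * The union below exists mainly so that sizeof() yields the worst-case
+ * decoded-argument size; NFS2_SVC_XDRSIZE hands that value to the RPC
+ * layer, which sizes the per-request argument buffer from it.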
+ */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd_sattrargs *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd_diropargs *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd_readargs *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd_writeargs *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd_createargs *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd_renameargs *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_readlinkargs *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd_linkargs *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_symlinkargs *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd_readdirargs *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); + +int nfssvc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h new file mode 100644 index 00000000..7df980eb --- /dev/null +++ b/fs/nfsd/xdr3.h @@ -0,0 +1,344 @@ +/* + * XDR types for NFSv3 in nfsd. 
+ * + * Copyright (C) 1996-1998, Olaf Kirch + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + int vlen; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct svc_fh fh; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; 
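+
+/*
+ * Illustrative sketch, not part of the nfsd code: the argument structures
+ * above are direct C mirrors of the XDR wire format in RFC 1813, and the
+ * nfs3svc_decode_*() routines declared further down simply walk the
+ * big-endian request buffer and fill them in.  The stand-alone fragment
+ * under #if 0 shows roughly what that walk looks like for READ3args ->
+ * struct nfsd3_readargs; the helper names and the toy struct are
+ * assumptions made purely for illustration.  (The vlen field of the real
+ * struct is not on the wire at all -- the decoder computes it to describe
+ * how the read data will be scattered into the reply iovec.)
+ */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+struct toy_readargs {
+	unsigned char	fh[64];		/* NFS3_FHSIZE-sized opaque filehandle */
+	uint32_t	fh_len;
+	uint64_t	offset;
+	uint32_t	count;
+};
+
+static uint32_t xdr_u32(const unsigned char **p)
+{
+	uint32_t v = ((uint32_t)(*p)[0] << 24) | ((uint32_t)(*p)[1] << 16) |
+		     ((uint32_t)(*p)[2] << 8)  |  (uint32_t)(*p)[3];
+	*p += 4;
+	return v;
+}
+
+static uint64_t xdr_u64(const unsigned char **p)
+{
+	uint64_t hi = xdr_u32(p);
+	return (hi << 32) | xdr_u32(p);
+}
+
+/* Decode an RFC 1813 READ3args body: nfs_fh3 file, offset3 offset, count3 count. */
+static int decode_read3args(const unsigned char *p, size_t len,
+			    struct toy_readargs *args)
+{
+	const unsigned char *end = p + len;
+	uint32_t padded;
+
+	if (end - p < 4)
+		return -1;
+	args->fh_len = xdr_u32(&p);		/* variable-length opaque */
+	padded = (args->fh_len + 3) & ~3u;	/* XDR pads opaques to 4 bytes */
+	if (args->fh_len > sizeof(args->fh) || (size_t)(end - p) < padded + 12)
+		return -1;
+	memcpy(args->fh, p, args->fh_len);
+	p += padded;
+	args->offset = xdr_u64(&p);		/* 64-bit big-endian offset */
+	args->count  = xdr_u32(&p);
+	return 0;
+}
+#endif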
+ +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd3_sattrargs *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd3_diropargs *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, + struct nfsd3_accessargs *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd3_readargs *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd3_writeargs *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, + struct nfsd3_mknodargs *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd3_renameargs *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkargs *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd3_linkargs *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_symlinkargs *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, + struct nfsd3_commitargs *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, + struct nfsd3_accessres *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkres *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, + struct nfsd3_renameres *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, + struct nfsd3_linkres *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, + struct nfsd3_readdirres *); +int 
nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, + struct nfsd3_fsstatres *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, + struct nfsd3_fsinfores *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, + struct nfsd3_pathconfres *); +int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *, + struct nfsd3_commitres *); + +int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, + struct nfsd3_fhandle_pair *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h new file mode 100644 index 00000000..366401e1 --- /dev/null +++ b/fs/nfsd/xdr4.h @@ -0,0 +1,573 @@ +/* + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + __be32 *datap; + size_t iovlen; + u32 minorversion; + u32 status; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ + struct nfs4_stateowner * cl_stateowner; /* response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 namelen; + char *name; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; +}; +#define cr_linklen u.link.namelen +#define cr_linkname u.link.name +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct nfs4_stateowner *ld_sop; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; + /* The lk_replay_owner is the open owner in the open_to_lock_owner + * case and the lock owner otherwise: */ + struct nfs4_stateowner *lk_replay_owner; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_rflags u.ok.rflags +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfs4_stateowner * lt_stateowner; + struct nfsd4_lock_denied lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + 
u64 lu_offset; + u64 lu_length; + struct nfs4_stateowner *lu_stateowner; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier verf; /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + stateid_t op_stateid; /* response */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + int op_truncate; /* used during processing */ + struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_acl *op_acl; +}; +#define op_iattr iattr +#define op_verf verf + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; + struct nfs4_stateowner * oc_stateowner; /* response */ +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; + u32 od_share_deny; + struct nfs4_stateowner *od_stateowner; +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + u32 se_namelen; /* request */ + char * se_name; /* request */ + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* 
request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + int wr_vlen; + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ +#endif /* not yet */ + u32 status_flags; /* response */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_reclaim_complete { + u32 rca_one_fs; +}; + +struct nfsd4_op { + int opnum; + __be32 status; + union { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_bind_conn_to_session bind_conn_to_session; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; + } u; + struct nfs4_replay * replay; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + __be32 tmp[8]; + __be32 * tmpp; + struct tmpbuf { + struct tmpbuf *next; + void (*release)(const void *); + void *buf; + } *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + __be32 * p; + __be32 * end; + struct xdr_buf * xbuf; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = 
resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); +} + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + +} + +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, + struct nfsd4_compoundargs *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, + struct nfsd4_compoundres *); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid *setclid); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid_confirm *setclientid_confirm); +extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); +extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); +__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_close *close); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_open_downgrade *od); +extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + struct nfsd4_lock *lock); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_lockt *lockt); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_locku *locku); +extern __be32 
+nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_release_lockowner *rlockowner); +extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, clientid_t *clid); +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig new file mode 100644 index 00000000..251da07b --- /dev/null +++ b/fs/nilfs2/Kconfig @@ -0,0 +1,25 @@ +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile new file mode 100644 index 00000000..85c98737 --- /dev/null +++ b/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c new file mode 100644 index 00000000..eed4d7b2 --- /dev/null +++ b/fs/nilfs2/alloc.c @@ -0,0 +1,721 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato . + * Two allocators were unified by Ryusuke Konishi , + * Amagai Yoshiji . 
+ */ + +#include +#include +#include +#include +#include +#include "mdt.h" +#include "alloc.h" + + +/** + * nilfs_palloc_groups_per_desc_block - get the number of groups that a group + * descriptor block can maintain + * @inode: inode of metadata file using this allocator + */ +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +/** + * nilfs_palloc_groups_count - get maximum number of groups + * @inode: inode of metadata file using this allocator + */ +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +/** + * nilfs_palloc_init_blockgroup - initialize private variables for allocator + * @inode: inode of metadata file using this allocator + * @entry_size: size of the persistent object + */ +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +/** + * nilfs_palloc_group - get group number and offset from an entry number + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @offset: pointer to store offset number in the group + */ +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +/** + * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_desc_blkoff() returns block offset of the descriptor + * block which contains a descriptor of the specified group. + */ +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +/** + * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap + * block used to allocate/deallocate entries in the specified group. 
+ */ +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +/** + * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + */ +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +/** + * nilfs_palloc_group_desc_add_entries - adjust count of free entries + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + * @n: delta to be added + */ +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +/** + * nilfs_palloc_entry_blkoff - get block offset of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + */ +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +/** + * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block + * @inode: inode of metadata file + * @bh: buffer head of the buffer to be initialized + * @kaddr: kernel address mapped for the page including the buffer + */ +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, + int create, + void (*init_block)(struct inode *, + struct buffer_head *, + void *), + struct buffer_head **bhp, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + int ret; + + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + get_bh(prev->bh); + *bhp = prev->bh; + spin_unlock(lock); + return 0; + } + spin_unlock(lock); + + ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); + if (!ret) { + spin_lock(lock); + /* + * The following code must be safe for change of the + * cache contents during the get block call. 
+ */ + brelse(prev->bh); + get_bh(*bhp); + prev->bh = *bhp; + prev->blkoff = blkoff; + spin_unlock(lock); + } + return ret; +} + +/** + * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, + bhp, &cache->prev_desc, &cache->lock); +} + +/** + * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp, + &cache->prev_bitmap, &cache->lock); +} + +/** + * nilfs_palloc_get_entry_block - get buffer head of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp, + &cache->prev_entry, &cache->lock); +} + +/** + * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * @inode: inode of metadata file using this allocator + * @group: group number + * @bh: buffer head of the buffer storing the group descriptor block + * @kaddr: kernel address mapped for the page including the buffer + */ +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +/** + * nilfs_palloc_block_get_entry - get kernel address of an entry + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. 
inode number) + * @bh: buffer head of the buffer storing the entry block + * @kaddr: kernel address mapped for the page including the buffer + */ +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +/** + * nilfs_palloc_find_available_slot - find available slot in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @target: offset number of an entry in the group (start point) + * @bitmap: bitmap of the group + * @bsize: size in bits + */ +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +/** + * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups + * in a group descriptor block + * @inode: inode of metadata file using this allocator + * @curr: current group number + * @max: maximum number of groups + */ +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +/** + * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = 
nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + pos = nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +/** + * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_commit_free_entry - finish deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + else + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); 
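+	/* Roll back the reservation: clear the bitmap bit set for this entry
+	 * and, if it was still set, return the slot to the group's free count. */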
+ bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + else + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +/** + * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +/** + * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +/** + * nilfs_palloc_group_is_in - judge if an entry is in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @nr: serial number of the entry (e.g. 
inode number) + */ +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64 first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +/** + * nilfs_palloc_freev - deallocate a set of persistent objects + * @inode: inode of metadata file using this allocator + * @entry_nrs: array of entry numbers to be deallocated + * @nitems: number of entries stored in @entry_nrs + */ +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i = j) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } else { + n++; + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + mark_buffer_dirty(desc_bh); + mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache) +{ + NILFS_MDT(inode)->mi_palloc_cache = cache; + spin_lock_init(&cache->lock); +} + +void nilfs_palloc_clear_cache(struct inode *inode) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + spin_lock(&cache->lock); + brelse(cache->prev_desc.bh); + brelse(cache->prev_bitmap.bh); + brelse(cache->prev_entry.bh); + cache->prev_desc.bh = NULL; + cache->prev_bitmap.bh = NULL; + cache->prev_entry.bh = NULL; + spin_unlock(&cache->lock); +} + +void nilfs_palloc_destroy_cache(struct inode *inode) +{ + nilfs_palloc_clear_cache(inode); + NILFS_MDT(inode)->mi_palloc_cache = NULL; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h new file mode 100644 index 00000000..f5fde36b --- /dev/null +++ b/fs/nilfs2/alloc.h @@ -0,0 +1,100 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato . + * Two allocators were unified by Ryusuke Konishi , + * Amagai Yoshiji . + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include +#include +#include + +/** + * nilfs_palloc_entries_per_group - get the number of entries per group + * @inode: inode of metadata file using this allocator + * + * The number of entries per group is defined by the number of bits + * that a bitmap block can maintain. + */ +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +/** + * nilfs_palloc_req - persistent allocator request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit find_next_zero_bit_le + +/* + * persistent object allocator cache + */ + +struct nilfs_bh_assoc { + unsigned long blkoff; + struct buffer_head *bh; +}; + +struct nilfs_palloc_cache { + spinlock_t lock; + struct nilfs_bh_assoc prev_desc; + struct nilfs_bh_assoc prev_bitmap; + struct nilfs_bh_assoc prev_entry; +}; + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache); +void nilfs_palloc_clear_cache(struct inode *inode); +void nilfs_palloc_destroy_cache(struct inode *inode); + +#endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c new file mode 100644 index 00000000..aadbd0b5 --- /dev/null +++ b/fs/nilfs2/bmap.c @@ -0,0 +1,567 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include "nilfs.h" +#include "bmap.h" +#include "btree.h" +#include "direct.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info; + + return nilfs->ns_dat; +} + +static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, + const char *fname, int err) +{ + struct inode *inode = bmap->b_inode; + + if (err == -EINVAL) { + nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)\n", inode->i_ino); + err = -EIO; + } + return err; +} + +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) { + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + goto out; + } + if (NILFS_BMAP_USE_VBN(bmap)) { + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, + &blocknr); + if (!ret) + *ptrp = blocknr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + +int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); + up_read(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. 
On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exist. + */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +{ + __u64 lastkey; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + up_read(&bmap->b_sem); + + if (ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + else + *key = lastkey; + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. 
+ */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_lookup_dirty_buffers - + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bhp: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * create buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_test_and_clear() is the atomic operation to test and + * clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. 
+ */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) + key++; + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static struct lock_class_key nilfs_bmap_dat_lock_key; +static struct lock_class_key nilfs_bmap_mdt_lock_key; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_ptr_type = NILFS_BMAP_PTR_P; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_ptr_type = NILFS_BMAP_PTR_VS; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + break; + case NILFS_IFILE_INO: + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + /* Fall through */ + default: + bmap->b_ptr_type = NILFS_BMAP_PTR_VM; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap) : nilfs_direct_init(bmap); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. 
+ */ +void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + down_write(&bmap->b_sem); + memcpy(raw_inode->i_bmap, bmap->b_u.u_data, + NILFS_INODE_BMAP_SIZE * sizeof(__le64)); + if (bmap->b_inode->i_ino == NILFS_DAT_INO) + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + + up_write(&bmap->b_sem); +} + +void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) +{ + memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + bmap->b_ptr_type = NILFS_BMAP_PTR_U; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + bmap->b_state = 0; + nilfs_btree_init_gc(bmap); +} + +void nilfs_bmap_save(const struct nilfs_bmap *bmap, + struct nilfs_bmap_store *store) +{ + memcpy(store->data, bmap->b_u.u_data, sizeof(store->data)); + store->last_allocated_key = bmap->b_last_allocated_key; + store->last_allocated_ptr = bmap->b_last_allocated_ptr; + store->state = bmap->b_state; +} + +void nilfs_bmap_restore(struct nilfs_bmap *bmap, + const struct nilfs_bmap_store *store) +{ + memcpy(bmap->b_u.u_data, store->data, sizeof(store->data)); + bmap->b_last_allocated_key = store->last_allocated_key; + bmap->b_last_allocated_ptr = store->last_allocated_ptr; + bmap->b_state = store->state; +} diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h new file mode 100644 index 00000000..40d9f453 --- /dev/null +++ b/fs/nilfs2/bmap.h @@ -0,0 +1,270 @@ +/* + * bmap.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_BMAP_H +#define _NILFS_BMAP_H + +#include +#include +#include +#include +#include "alloc.h" +#include "dat.h" + +#define NILFS_BMAP_INVALID_PTR 0 + +#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? 
-(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, + unsigned); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + /* The following functions are internal use only. */ + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_ptr_type: pointer type + * @b_state: state + * @b_nchildren_per_block: maximum number of child nodes for non-root nodes + */ +struct nilfs_bmap { + union { + __u8 u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_ptr_type; + int b_state; + __u16 b_nchildren_per_block; +}; + +/* pointer type */ +#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. 
LBN) */ +#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single + version) */ +#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple + versions) */ +#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ + +#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + +struct nilfs_bmap_store { + __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)]; + __u64 last_allocated_key; + __u64 last_allocated_ptr; + int state; +}; + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); +int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); +int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); +int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); + +void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *); +void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *); + +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + +/* + * Internal use only + */ +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); + +static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); +} + +static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); + else + bmap->b_last_allocated_ptr--; +} + +static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + return dat ? 
nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; +} + +static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); +} + +static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); +} + +static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key, + __u64 ptr) +{ + bmap->b_last_allocated_key = key; + bmap->b_last_allocated_ptr = ptr; +} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c new file mode 100644 index 00000000..a35ae35e --- /dev/null +++ b/fs/nilfs2/btnode.c @@ -0,0 +1,297 @@ +/* + * btnode.c - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * This file was originally written by Seiji Kihara + * and fully revised by Ryusuke Konishi for + * stabilization and simplification. 
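+ *
+ * The btnode cache is a per-inode address space that holds the B-tree node
+ * blocks of the inode's bmap, indexed by block number.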
+ * + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "dat.h" +#include "page.h" +#include "btnode.h" + +void nilfs_btnode_cache_clear(struct address_space *btnc) +{ + invalidate_mapping_pages(btnc, 0, -1); + truncate_inode_pages(btnc, 0); +} + +struct buffer_head * +nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) +{ + struct inode *inode = NILFS_BTNC_I(btnc); + struct buffer_head *bh; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return NULL; + + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + memset(bh->b_data, 0, 1 << inode->i_blkbits); + bh->b_bdev = inode->i_sb->s_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return bh; +} + +int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, int mode, + struct buffer_head **pbh, sector_t *submit_ptr) +{ + struct buffer_head *bh; + struct inode *inode = NILFS_BTNC_I(btnc); + struct page *page; + int err; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return -ENOMEM; + + err = -EEXIST; /* internal code */ + page = bh->b_page; + + if (buffer_uptodate(bh) || buffer_dirty(bh)) + goto found; + + if (pblocknr == 0) { + pblocknr = blocknr; + if (inode->i_ino != NILFS_DAT_INO) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* blocknr is a virtual block number */ + err = nilfs_dat_translate(nilfs->ns_dat, blocknr, + &pblocknr); + if (unlikely(err)) { + brelse(bh); + goto out_locked; + } + } + } + + if (mode == READA) { + if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { + err = -EBUSY; /* internal code */ + brelse(bh); + goto out_locked; + } + } else { /* mode == READ */ + lock_buffer(bh); + } + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + err = -EEXIST; /* internal code */ + goto found; + } + set_buffer_mapped(bh); + bh->b_bdev = inode->i_sb->s_bdev; + bh->b_blocknr = pblocknr; /* set block address for read */ + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + bh->b_blocknr = blocknr; /* set back to the given block address */ + *submit_ptr = pblocknr; + err = 0; +found: + *pbh = bh; + +out_locked: + unlock_page(page); + page_cache_release(page); + return err; +} + +/** + * nilfs_btnode_delete - delete B-tree node buffer + * @bh: buffer to be deleted + * + * nilfs_btnode_delete() invalidates the specified buffer and delete the page + * including the buffer if the page gets unbusy. + */ +void nilfs_btnode_delete(struct buffer_head *bh) +{ + struct address_space *mapping; + struct page *page = bh->b_page; + pgoff_t index = page_index(page); + int still_dirty; + + page_cache_get(page); + lock_page(page); + wait_on_page_writeback(page); + + nilfs_forget_buffer(bh); + still_dirty = PageDirty(page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if (!still_dirty && mapping) + invalidate_inode_pages2_range(mapping, index, index); +} + +/** + * nilfs_btnode_prepare_change_key + * prepare to move contents of the block for old key to one of new key. + * the old buffer will not be removed, but might be reused for new buffer. + * it might return -ENOMEM because of memory allocation errors, + * and might return -EIO because of disk read errors. 
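+ *
+ * If the block size equals the page size, the old buffer's page is inserted
+ * into the page cache under the new key (the entry for the old key is kept
+ * until the commit step); otherwise a new buffer is created and the data is
+ * copied into it when the change is committed.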
+ */ +int nilfs_btnode_prepare_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh, *nbh; + struct inode *inode = NILFS_BTNC_I(btnc); + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + int err; + + if (oldkey == newkey) + return 0; + + obh = ctxt->bh; + ctxt->newbh = NULL; + + if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + lock_page(obh->b_page); + /* + * We cannot call radix_tree_preload for the kernels older + * than 2.6.23, because it is not exported for modules. + */ +retry: + err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (err) + goto failed_unlock; + /* BUG_ON(oldkey != obh->b_page->index); */ + if (unlikely(oldkey != obh->b_page->index)) + NILFS_PAGE_BUG(obh->b_page, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + + spin_lock_irq(&btnc->tree_lock); + err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); + spin_unlock_irq(&btnc->tree_lock); + /* + * Note: page->index will not change to newkey until + * nilfs_btnode_commit_change_key() will be called. + * To protect the page in intermediate state, the page lock + * is held. + */ + radix_tree_preload_end(); + if (!err) + return 0; + else if (err != -EEXIST) + goto failed_unlock; + + err = invalidate_inode_pages2_range(btnc, newkey, newkey); + if (!err) + goto retry; + /* fallback to copy mode */ + unlock_page(obh->b_page); + } + + nbh = nilfs_btnode_create_block(btnc, newkey); + if (!nbh) + return -ENOMEM; + + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + return 0; + + failed_unlock: + unlock_page(obh->b_page); + return err; +} + +/** + * nilfs_btnode_commit_change_key + * commit the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_commit_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + struct page *opage; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + opage = obh->b_page; + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + mark_buffer_dirty(obh); + + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, oldkey); + radix_tree_tag_set(&btnc->page_tree, newkey, + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&btnc->tree_lock); + + opage->index = obh->b_blocknr = newkey; + unlock_page(opage); + } else { + nilfs_copy_buffer(nbh, obh); + mark_buffer_dirty(nbh); + + nbh->b_blocknr = newkey; + ctxt->bh = nbh; + nilfs_btnode_delete(obh); /* will decrement bh->b_count */ + } +} + +/** + * nilfs_btnode_abort_change_key + * abort the change_key operation prepared by prepare_change_key(). 
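+ * Depending on how the preparation proceeded, this either deletes the page
+ * cache entry inserted under the new key or releases the newly created
+ * buffer.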
+ */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h new file mode 100644 index 00000000..3a4dd2d8 --- /dev/null +++ b/fs/nilfs2/btnode.h @@ -0,0 +1,53 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara + * Revised by Ryusuke Konishi + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include +#include +#include +#include + + +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_clear(struct address_space *); +struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, + __u64 blocknr); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, + struct buffer_head **, sector_t *); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c new file mode 100644 index 00000000..b2e3ff34 --- /dev/null +++ b/fs/nilfs2/btree.c @@ -0,0 +1,2310 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include +#include "nilfs.h" +#include "page.h" +#include "btnode.h" +#include "btree.h" +#include "alloc.h" +#include "dat.h" + +static struct nilfs_btree_path *nilfs_btree_alloc_path(void) +{ + struct nilfs_btree_path *path; + int level = NILFS_BTREE_LEVEL_DATA; + + path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + if (path == NULL) + goto out; + + for (; level < NILFS_BTREE_LEVEL_MAX; level++) { + path[level].bp_bh = NULL; + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } + +out: + return path; +} + +static void nilfs_btree_free_path(struct nilfs_btree_path *path) +{ + int level = NILFS_BTREE_LEVEL_DATA; + + for (; level < NILFS_BTREE_LEVEL_MAX; level++) + brelse(path[level].bp_bh); + + kmem_cache_free(nilfs_btree_path_cache, path); +} + +/* + * B-tree node operations + */ +static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, + __u64 ptr, struct buffer_head **bhp) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh; + + bh = nilfs_btnode_create_block(btnc, ptr); + if (!bh) + return -ENOMEM; + + set_buffer_nilfs_volatile(bh); + *bhp = bh; + return 0; +} + +static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static void +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) +{ + node->bn_flags = flags; +} + +static int nilfs_btree_node_root(const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; +} + +static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static void +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) +{ + node->bn_level = level; +} + +static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static void +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static int nilfs_btree_node_size(const struct nilfs_bmap *btree) +{ + return 1 << btree->b_inode->i_blkbits; +} + +static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) +{ + return btree->b_nchildren_per_block; +} + +static __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(node) ? 
+ 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) +{ + return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); +} + +static __u64 +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) +{ + return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); +} + +static void +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); +} + +static __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, + int ncmax) +{ + return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); +} + +static void +nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, + int ncmax) +{ + *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, + int level, int nchildren, int ncmax, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + for (i = 0; i < nchildren; i++) { + dkeys[i] = cpu_to_le64(keys[i]); + dptrs[i] = cpu_to_le64(ptrs[i]); + } +} + +/* Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n, int lncmax, int rncmax) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); + lnchildren = nilfs_btree_node_get_nchildren(left); + + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); + rnchildren = nilfs_btree_node_get_nchildren(right); + + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); +} + +/* Assume that the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_right(struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n, int lncmax, int rncmax) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); + lnchildren = nilfs_btree_node_get_nchildren(left); + + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); + rnchildren = nilfs_btree_node_get_nchildren(right); + + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); +} + +/* Assume that the buffer head corresponding to node is locked. 
*/ +static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index, + __u64 key, __u64 ptr, int ncmax) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + nchildren = nilfs_btree_node_get_nchildren(node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = cpu_to_le64(key); + dptrs[index] = cpu_to_le64(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(node, nchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index, + __u64 *keyp, __u64 *ptrp, int ncmax) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + key = le64_to_cpu(dkeys[index]); + ptr = le64_to_cpu(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(node, nchildren); +} + +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search */ + low = 0; + high = nilfs_btree_node_get_nchildren(node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index */ + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} + +/** + * nilfs_btree_node_broken - verify consistency of btree node + * @node: btree node block to be examined + * @size: node size (in bytes) + * @blocknr: block number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. 
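+ *
+ * The level, flags, and child count of the node are range checked; callers
+ * normally use nilfs_btree_broken_node_block(), which records a successful
+ * check in the buffer's "checked" flag so that each block is verified only
+ * once after it is read.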
+ */ +static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, + size_t size, sector_t blocknr) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX || + (flags & NILFS_BTREE_NODE_ROOT) || + nchildren < 0 || + nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { + printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " + "level = %d, flags = 0x%x, nchildren = %d\n", + (unsigned long long)blocknr, level, flags, nchildren); + ret = 1; + } + return ret; +} + +int nilfs_btree_broken_node_block(struct buffer_head *bh) +{ + int ret; + + if (buffer_nilfs_checked(bh)) + return 0; + + ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, + bh->b_size, bh->b_blocknr); + if (likely(!ret)) + set_buffer_nilfs_checked(bh); + return ret; +} + +static struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_bmap *btree) +{ + return (struct nilfs_btree_node *)btree->b_u.u_data; +} + +static struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} + +static struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} + +static int nilfs_btree_height(const struct nilfs_bmap *btree) +{ + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; +} + +static struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + int level, int *ncmaxp) +{ + struct nilfs_btree_node *node; + + if (level == nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_root(btree); + *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX; + } else { + node = nilfs_btree_get_nonroot_node(path, level); + *ncmaxp = nilfs_btree_nchildren_per_block(btree); + } + return node; +} + +static int +nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +{ + if (unlikely(nilfs_btree_node_get_level(node) != level)) { + dump_stack(); + printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", + nilfs_btree_node_get_level(node), level); + return 1; + } + return 0; +} + +struct nilfs_btree_readahead_info { + struct nilfs_btree_node *node; /* parent node */ + int max_ra_blocks; /* max nof blocks to read ahead */ + int index; /* current index on the parent node */ + int ncmax; /* nof children in the parent node */ +}; + +static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp, + const struct nilfs_btree_readahead_info *ra) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh, *ra_bh; + sector_t submit_ptr = 0; + int ret; + + ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + if (ret) { + if (ret != -EEXIST) + return ret; + goto out_check; + } + + if (ra) { + int i, n; + __u64 ptr2; + + /* read ahead sibling nodes */ + for (n = ra->max_ra_blocks, i = ra->index + 1; + n > 0 && i < ra->ncmax; n--, i++) { + ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); + + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + &ra_bh, &submit_ptr); + if (likely(!ret || ret == -EEXIST)) + brelse(ra_bh); + else if (ret != -EBUSY) + break; + if (!buffer_locked(bh)) + goto out_no_wait; + } + } + 
+ wait_on_buffer(bh); + + out_no_wait: + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + + out_check: + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + brelse(bh); + return -EINVAL; + } + + *bhp = bh; + return 0; +} + +static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp) +{ + return __nilfs_btree_get_block(btree, ptr, bhp, NULL); +} + +static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel, + int readahead) +{ + struct nilfs_btree_node *node; + struct nilfs_btree_readahead_info p, *ra; + __u64 ptr; + int level, index, found, ncmax, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) + return -ENOENT; + + found = nilfs_btree_node_lookup(node, key, &index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + ncmax = nilfs_btree_nchildren_per_block(btree); + + while (--level >= minlevel) { + ra = NULL; + if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) { + p.node = nilfs_btree_get_node(btree, path, level + 1, + &p.ncmax); + p.index = index; + p.max_ra_blocks = 7; + ra = &p; + } + ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh, + ra); + if (ret < 0) + return ret; + + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; + if (!found) + found = nilfs_btree_node_lookup(node, key, &index); + else + index = 0; + if (index < ncmax) { + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + } else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ncmax, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(node); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + path[level].bp_bh = NULL; + path[level].bp_index = index; + ncmax = nilfs_btree_nchildren_per_block(btree); + + for (level--; level > 0; level--) { + ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; + index = nilfs_btree_node_get_nchildren(node) - 1; + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_lookup(const struct nilfs_bmap *btree, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree_path *path; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); + + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, + __u64 key, __u64 *ptrp, unsigned maxblocks) +{ + struct 
nilfs_btree_path *path; + struct nilfs_btree_node *node; + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int level = NILFS_BTREE_LEVEL_NODE_MIN; + int ret, cnt, index, maxlevel, ncmax; + struct nilfs_btree_readahead_info p; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); + if (ret < 0) + goto out; + + if (NILFS_BMAP_USE_VBN(btree)) { + dat = nilfs_bmap_get_dat(btree); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + goto out; + ptr = blocknr; + } + cnt = 1; + if (cnt == maxblocks) + goto end; + + maxlevel = nilfs_btree_height(btree) - 1; + node = nilfs_btree_get_node(btree, path, level, &ncmax); + index = path[level].bp_index + 1; + for (;;) { + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != + key + cnt) + goto end; + ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + goto out; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt || ++cnt == maxblocks) + goto end; + index++; + continue; + } + if (level == maxlevel) + break; + + /* look-up right sibling node */ + p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); + p.index = path[level + 1].bp_index + 1; + p.max_ra_blocks = 7; + if (p.index >= nilfs_btree_node_get_nchildren(p.node) || + nilfs_btree_node_get_key(p.node, p.index) != key + cnt) + break; + ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); + path[level + 1].bp_index = p.index; + + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + + ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, + &p); + if (ret < 0) + goto out; + node = nilfs_btree_get_nonroot_node(path, level); + ncmax = nilfs_btree_nchildren_per_block(btree); + index = 0; + path[level].bp_index = index; + } + end: + *ptrp = ptr; + ret = cnt; + out: + nilfs_btree_free_path(path); + return ret; +} + +static void nilfs_btree_promote_key(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + nilfs_btree_node_set_key( + nilfs_btree_get_nonroot_node(path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root */ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} + +static void nilfs_btree_do_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + int ncblk; + + if (level < nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, ncblk); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, + 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + } +} + +static void nilfs_btree_carry_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node 
*node, *left; + int nchildren, lnchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + + if (move) { + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_carry_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(right, 0)); + path[level + 1].bp_index--; + + if (move) { + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); + path[level + 1].bp_index++; + } else { + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_split(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + __u64 newkey; + __u64 newptr; + int nchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + newkey = nilfs_btree_node_get_key(right, 0); + newptr = path[level].bp_newreq.bpr_ptr; + + if (move) { + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); + nilfs_btree_node_insert(right, path[level].bp_index, + 
*keyp, *ptrp, ncblk); + + *keyp = nilfs_btree_node_get_key(right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +static void nilfs_btree_grow(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n, ncblk; + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(root); + + nilfs_btree_node_move_right(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); + nilfs_btree_node_set_level(root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} + +static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level, ncmax; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, + path[level].bp_index - 1, + ncmax); + } + + /* parent */ + level = NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncmax); + } + + return NILFS_BMAP_INVALID_PTR; +} + +static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(btree, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return nilfs_bmap_find_target_in_group(btree); +} + +static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ncmax, ncblk, ret; + struct inode *dat = NULL; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block */ + if (NILFS_BMAP_USE_VBN(btree)) { + path[level].bp_newreq.bpr_ptr = + nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(btree); + } + + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_data; + + ncblk = nilfs_btree_nchildren_per_block(btree); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < ncblk) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + 
} + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + pindex = path[level + 1].bp_index; + + /* left sibling */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else { + brelse(bh); + } + } + + /* right sibling */ + if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else { + brelse(bh); + } + } + + /* split */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, + &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_btree_get_new_block(btree, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + sib = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(node) < + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data, + 0, level, 0, ncblk, NULL, NULL); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_btnode_delete(path[level].bp_sib_bh); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + + } + + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + struct inode *dat = NULL; + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (NILFS_BMAP_USE_VBN(btree)) { + nilfs_bmap_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(btree); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + nilfs_bmap_commit_alloc_ptr(btree, + &path[level - 1].bp_newreq, dat); + 
path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); +} + +static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr) +{ + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN, 0); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); + + out: + nilfs_btree_free_path(path); + return ret; +} + +static void nilfs_btree_do_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + int ncblk; + + if (level < nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, ncblk); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + } +} + +static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} + +static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(right, 0)); + path[level + 
1].bp_index--; + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} + +static void nilfs_btree_concat_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(node); + + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btnode_delete(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(left); +} + +static void nilfs_btree_concat_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(right); + + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + nilfs_btnode_delete(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} + +static void nilfs_btree_shrink(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + nilfs_btree_node_delete(root, 0, NULL, NULL, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); + nilfs_btree_node_move_left(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); + + nilfs_btnode_delete(path[level].bp_bh); + path[level].bp_bh = NULL; +} + +static void nilfs_btree_nop(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ +} + +static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats, + struct inode *dat) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, dindex, level, ncmin, ncmax, ncblk, ret; + + ret = 0; + stats->bs_nblocks = 0; + ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + ncblk = nilfs_btree_nchildren_per_block(btree); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(node, dindex, ncblk); + ret = nilfs_bmap_prepare_end_ptr(btree, + &path[level].bp_oldreq, dat); + if (ret < 0) + goto err_out_child_node; + + if (nilfs_btree_node_get_nchildren(node) > ncmin) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + pindex = path[level + 
1].bp_index; + dindex = pindex; + + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* + * When merging right sibling node + * into the current node, pointer to + * the right sibling node must be + * terminated instead. The adjustment + * below is required for that. + */ + dindex = pindex + 1; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + level++; + path[level].bp_op = nilfs_btree_nop; + goto shrink_root_child; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + } + } + + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + +shrink_root_child: + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(node, dindex, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); + if (ret < 0) + goto err_out_child_node; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + brelse(path[level].bp_sib_bh); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int maxlevel, struct inode *dat) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); +} + +static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key) + +{ + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + struct inode *dat; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN, 0); + if (ret < 0) + goto out; + + + dat = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_bmap_get_dat(btree) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level, dat); + nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks); + +out: + nilfs_btree_free_path(path); + return ret; +} + +static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) +{ + struct nilfs_btree_path *path; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(node, nchildren - 2) : 0; + if (bh != NULL) + brelse(bh); + + return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); +} + +static int nilfs_btree_gather_data(struct nilfs_bmap *btree, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, ncmax, i, ret; + + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + ncmax = nilfs_btree_nchildren_per_block(btree); + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + for (i = 0; i < nitems; i++) { + keys[i] = le64_to_cpu(dkeys[i]); + ptrs[i] = le64_to_cpu(dptrs[i]); + } + + if (bh != NULL) + brelse(bh); + + return nitems; +} + +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct inode *dat = NULL; + int ret; + + stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (NILFS_BMAP_USE_VBN(btree)) { + dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(btree); + } + + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat); + if (ret < 0) + goto 
err_out_dreq; + + ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + nilfs_bmap_abort_alloc_ptr(btree, nreq, dat); + err_out_dreq: + nilfs_bmap_abort_alloc_ptr(btree, dreq, dat); + stats->bs_nblocks = 0; + return ret; + +} + +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree_node *node; + struct inode *dat; + __u64 tmpptr; + int ncblk; + + /* free resources */ + if (btree->b_ops->bop_clear != NULL) + btree->b_ops->bop_clear(btree); + + /* ptr must be a pointer to a buffer head. */ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; + nilfs_btree_init(btree); + if (nreq != NULL) { + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); + + /* create child node at level 1 */ + node = (struct nilfs_btree_node *)bh->b_data; + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); + if (!buffer_dirty(bh)) + mark_buffer_dirty(bh); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + + brelse(bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + &keys[0], &tmpptr); + } else { + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + } + + if (NILFS_BMAP_USE_VBN(btree)) + nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - + * @bmap: + * @key: + * @ptr: + * @keys: + * @ptrs: + * @n: + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, int n) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << btree->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, + di, ni, bh); + nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); + return 0; +} + +static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + return 0; +} + +static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + 
int level, struct inode *dat) +{ + struct nilfs_btree_node *parent; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + return ret; + } + } + + return 0; +} + +static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + struct nilfs_btree_node *parent; + int ncmax; + + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->b_ptr_type == NILFS_BMAP_PTR_VS); + + if (buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr, ncmax); +} + +static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); +} + +static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int minlevel, int *maxlevelp, + struct inode *dat) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level, dat); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level, dat); + return ret; +} + +static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel, dat); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree, path, level, dat); +} + +static int nilfs_btree_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct buffer_head *bh) +{ + int maxlevel = 0, ret; + struct nilfs_btree_node 
*parent; + struct inode *dat = nilfs_bmap_get_dat(btree); + __u64 ptr; + int ncmax; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, + path[level + 1].bp_index, + ncmax); + ret = nilfs_dat_mark_dirty(dat, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} + +static int nilfs_btree_propagate(struct nilfs_bmap *btree, + struct buffer_head *bh) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + } else { + key = nilfs_bmap_data_get_key(btree, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = NILFS_BMAP_USE_VBN(btree) ? + nilfs_btree_propagate_v(btree, path, level, bh) : + nilfs_btree_propagate_p(btree, path, level, bh); + + out: + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, + struct buffer_head *bh) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); +} + +static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + if (level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX) { + dump_stack(); + printk(KERN_WARNING + "%s: invalid btree level: %d (key=%llu, ino=%lu, " + "blocknr=%llu)\n", + __func__, level, (unsigned long long)key, + NILFS_BMAP_I(btree)->vfs_inode.i_ino, + (unsigned long long)bh->b_blocknr); + return; + } + + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} + +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, + struct list_head *listp) +{ + struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, 
bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice_tail(&lists[level], listp); +} + +static int nilfs_btree_assign_p(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, + ncmax); + + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); + binfo->bi_dat.bi_level = level; + + return 0; +} + +static int nilfs_btree_assign_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(btree); + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + req.bpr_ptr = ptr; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) + return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + + return 0; +} + +static int nilfs_btree_assign(struct nilfs_bmap *btree, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + } else { + key = nilfs_bmap_data_get_key(btree, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = NILFS_BMAP_USE_VBN(btree) ? 
+ nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : + nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); + + out: + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *node; + __u64 key; + int ret; + + ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, + blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(node, 0); + } else + key = nilfs_bmap_data_get_key(btree, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + + return 0; +} + +static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + mark_buffer_dirty(bh); + brelse(bh); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + + out: + nilfs_btree_free_path(path); + return ret; +} + +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_lookup_contig = nilfs_btree_lookup_contig, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; + +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_lookup_contig = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_last_key = NULL, + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_btree_ops; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); + return 0; +} + +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_btree_ops_gc; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); +} diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h new file mode 100644 index 00000000..22c02e35 --- /dev/null +++ b/fs/nilfs2/btree.h @@ -0,0 +1,77 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include +#include +#include +#include +#include "btnode.h" +#include "bmap.h" + +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; + +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ + NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + +extern struct kmem_cache *nilfs_btree_path_cache; + +int nilfs_btree_init(struct nilfs_bmap *); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, int); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +int nilfs_btree_broken_node_block(struct buffer_head *bh); + +#endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c new file mode 100644 index 00000000..c9b342c8 --- /dev/null +++ b/fs/nilfs2/cpfile.c @@ -0,0 +1,965 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include +#include +#include +#include "mdt.h" +#include "cpfile.h" + + +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* block number from the beginning of the file */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* offset in block */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} + +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} + +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} + +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) - n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} + +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} + +static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} + +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * 
@bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero. + * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + mark_buffer_dirty(cp_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr, KM_USER0); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } + + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint numer + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. 
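+ *
+ * A minimal usage sketch (hypothetical caller; @cpfile is assumed to be the
+ * checkpoint file inode of a mounted filesystem):
+ *
+ *	err = nilfs_cpfile_delete_checkpoints(cpfile, 10, 13);
+ *
+ * On success this removes checkpoints 10, 11 and 12; checkpoint 13 itself is
+ * kept because @end is exclusive.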
+ */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + break; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + WARN_ON(nilfs_checkpoint_snapshot(cp)); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) { + count = + nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps); + if (count == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = + nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR + "%s: cannot delete block\n", + __func__); + break; + } + } + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr, KM_USER0); + } + + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); + ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + void *buf, unsigned cisz, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; cno < cur_cno && n < nci; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + continue; /* skip hole */ + } + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = 
nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, + ci); + ci = (void *)ci + cisz; + n++; + } + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + } + + ret = n; + if (n > 0) { + ci = (void *)ci - cisz; + *cnop = ci->ci_cno + 1; + } + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + void *buf, unsigned cisz, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); + ci = (void *)ci + cisz; + n++; + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + *cnop = curr; + ret = n; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - get information on checkpoints + * @cpfile: inode of checkpoint file + * @cnop: place to pass in the next checkpoint number and to receive the one to continue from + * @mode: %NILFS_CHECKPOINT or %NILFS_SNAPSHOT + * @buf: buffer in which checkpoint information is stored + * @cisz: byte size of one checkpoint info item + * @nci: maximum number of checkpoint info items to get + */ +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + void *buf, unsigned cisz, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci); + case NILFS_SNAPSHOT: + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - delete a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number to delete + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + else if (nilfs_cpinfo_snapshot(&ci)) + return -EBUSY; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64
cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr, KM_USER0); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(curr_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + 
out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + + list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(next_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(next_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. 
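+ *
+ * A minimal calling sketch (hypothetical caller; note the three-way return):
+ *
+ *	ret = nilfs_cpfile_is_snapshot(cpfile, cno);
+ *	if (ret < 0)
+ *		return ret;
+ *	is_snapshot = (ret > 0);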
+ */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + /* CP number is invalid if it's zero or larger than the + largest existing one. */ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @mode: new mode of checkpoint + * + * Description: nilfs_cpfile_change_cpmode() changes the mode of the checkpoint + * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + int ret; + + switch (mode) { + case NILFS_CHECKPOINT: + if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno)) + /* + * Current implementation does not have to protect + * plain read-only mounts since they are exclusive + * with a read/write mount and are protected from the + * cleaner. + */ + ret = -EBUSY; + else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @cpstat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoint information is + * stored in the place pointed to by @cpstat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
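+ *
+ * A minimal usage sketch (hypothetical caller):
+ *
+ *	struct nilfs_cpstat cpstat;
+ *
+ *	err = nilfs_cpfile_get_stat(cpfile, &cpstat);
+ *	if (!err)
+ *		ncheckpoints = cpstat.cs_ncps;
+ *
+ * Here cs_cno is the current checkpoint number, cs_ncps the number of
+ * checkpoints, and cs_nsss the number of snapshots.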
+ */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_read - read or get cpfile inode + * @sb: super block instance + * @cpsize: size of a checkpoint entry + * @raw_inode: on-disk cpfile inode + * @inodep: buffer to store the inode + */ +int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + struct inode *cpfile; + int err; + + cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); + if (unlikely(!cpfile)) + return -ENOMEM; + if (!(cpfile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); + if (err) + goto failed; + + nilfs_mdt_set_entry_size(cpfile, cpsize, + sizeof(struct nilfs_cpfile_header)); + + err = nilfs_read_inode_common(cpfile, raw_inode); + if (err) + goto failed; + + unlock_new_inode(cpfile); + out: + *inodep = cpfile; + return 0; + failed: + iget_failed(cpfile); + return err; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h new file mode 100644 index 00000000..a242b9a3 --- /dev/null +++ b/fs/nilfs2/cpfile.h @@ -0,0 +1,46 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include +#include +#include + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, + size_t); + +int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, + struct nilfs_inode *raw_inode, struct inode **inodep); + +#endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c new file mode 100644 index 00000000..fcc2f869 --- /dev/null +++ b/fs/nilfs2/dat.c @@ -0,0 +1,511 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +struct nilfs_dat_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; + struct nilfs_shadow_map shadow; +}; + +static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) +{ + return (struct nilfs_dat_info *)NILFS_MDT(dat); +} + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +static void nilfs_dat_commit_free(struct inode *dat, + struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = 
nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + +/** + * nilfs_dat_mark_dirty - + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. 
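[Illustrative sketch, not part of this patch: nilfs_dat_prepare_update() and nilfs_dat_commit_update() pair "end the old entry" with "allocate a replacement", which is how a rewritten block receives a fresh virtual block number in one step; nilfs_direct_propagate() in direct.c later in this series follows the same pattern. Variable names here are hypothetical.]

	struct nilfs_palloc_req oldreq, newreq;
	int ret;

	oldreq.pr_entry_nr = vblocknr;
	newreq.pr_entry_nr = vblocknr;	/* initial hint; the prepare step selects the new nr */
	ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
	if (ret < 0)
		return ret;
	nilfs_dat_commit_update(dat, &oldreq, &newreq, dead);
	/* newreq.pr_entry_nr now holds the replacement virtual block number */
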
+ */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + /* + * The given disk block number (blocknr) is not yet written to + * the device at this point. + * + * To prevent nilfs_dat_translate() from returning the + * uncommitted block number, this makes a copy of the entry + * buffer and redirects nilfs_dat_translate() to the copy. + */ + if (!buffer_nilfs_redirected(entry_bh)) { + ret = nilfs_mdt_freeze_buffer(dat, entry_bh); + if (ret) { + brelse(entry_bh); + return ret; + } + } + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return -EINVAL; + } + WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed by @blocknrp. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist. 
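[Illustrative usage, not part of this patch: nilfs_dat_translate() defined just below is the lookup used by direct.c and gcinode.c elsewhere in this series; a minimal caller looks like the following, with hypothetical variable names.]

	sector_t blocknr;
	int err;

	err = nilfs_dat_translate(dat, vblocknr, &blocknr);
	if (err)
		return err;	/* -ENOENT: no disk block currently assigned */
	/* blocknr is the on-disk block backing vblocknr */
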
+ */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh, *bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) { + bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh); + if (bh) { + WARN_ON(!buffer_uptodate(bh)); + brelse(entry_bh); + entry_bh = bh; + } + } + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + struct nilfs_vinfo *vinfo = buf; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + /* last virtual block number in this block */ + first = vinfo->vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo->vi_vblocknr >= first && + vinfo->vi_vblocknr <= last; + j++, n++, vinfo = (void *)vinfo + visz) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo->vi_vblocknr, entry_bh, kaddr); + vinfo->vi_start = le64_to_cpu(entry->de_start); + vinfo->vi_end = le64_to_cpu(entry->de_end); + vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + } + + return nvi; +} + +/** + * nilfs_dat_read - read or get dat inode + * @sb: super block instance + * @entry_size: size of a dat entry + * @raw_inode: on-disk dat inode + * @inodep: buffer to store the inode + */ +int nilfs_dat_read(struct super_block *sb, size_t entry_size, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + static struct lock_class_key dat_lock_key; + struct inode *dat; + struct nilfs_dat_info *di; + int err; + + dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); + if (unlikely(!dat)) + return -ENOMEM; + if (!(dat->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); + if (err) + goto failed; + + err = nilfs_palloc_init_blockgroup(dat, entry_size); + if (err) + goto failed; + + di = NILFS_DAT_I(dat); + lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); + nilfs_palloc_setup_cache(dat, &di->palloc_cache); + nilfs_mdt_setup_shadow_map(dat, &di->shadow); + + err = nilfs_read_inode_common(dat, raw_inode); + if (err) + goto failed; + + unlock_new_inode(dat); + out: + *inodep = dat; + return 0; + failed: + iget_failed(dat); + return err; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h new file mode 100644 index 00000000..cbd8e973 --- /dev/null +++ b/fs/nilfs2/dat.h @@ -0,0 +1,59 @@ +/* + * dat.h - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include +#include +#include + + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); + +int nilfs_dat_read(struct super_block *sb, size_t entry_size, + struct nilfs_inode *raw_inode, struct inode **inodep); + +#endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c new file mode 100644 index 00000000..3a192394 --- /dev/null +++ b/fs/nilfs2/dir.c @@ -0,0 +1,688 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return __block_write_begin(page, pos, to - from, nilfs_get_block); +} + +static void nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) + i_size_write(dir, pos + copied); + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(dir, nr_dirty); + WARN_ON(err); /* do not happen */ + unlock_page(page); +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = nilfs_rec_len_from_disk(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + 
nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<inode)); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + + nilfs_rec_len_from_disk(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + unsigned char *types = NULL; + int ret; + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + goto success; + + types = nilfs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + 
ret = -EIO; + goto done; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + ret = -EIO; + nilfs_put_page(page); + goto done; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < NILFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<inode), d_type); + if (over) { + nilfs_put_page(page); + goto success; + } + } + filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); + } + nilfs_put_page(page); + } + +success: + ret = 0; +done: + return ret; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, const struct qstr *qstr, + struct page **res_page) +{ + const unsigned char *name = qstr->name; + int namelen = qstr->len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, qstr, &page); + if (de) { + res = le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk(page, 
from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +} + +/* + * Parent is locked. + */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = nilfs_rec_len_to_disk(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = nilfs_rec_len_from_disk(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); + de->rec_len = nilfs_rec_len_to_disk(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + nilfs_mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. 
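[Editor's illustration, not part of this patch: nilfs_delete_entry() below removes an entry ext2-style, letting the preceding entry in the same chunk absorb the victim's record length and zeroing the victim's inode field; when the victim is the first entry of its chunk, only the inode field is cleared.]

	/*
	 * before:  | pde (rec_len = a) | victim (rec_len = b) | next ...
	 * after :  | pde (rec_len = a + b) ................... | next ...
	 *          victim->inode == 0, so lookups skip the dead space
	 */
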
+ */ +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + unsigned to = ((char *)dir - kaddr) + + nilfs_rec_len_from_disk(dir->rec_len); + struct nilfs_dir_entry *pde = NULL; + struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + int err; + + while ((char *)de < (char *)dir) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = nilfs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + lock_page(page); + err = nilfs_prepare_chunk(page, from, to); + BUG_ON(err); + if (pde) + pde->rec_len = nilfs_rec_len_to_disk(to - from); + dir->inode = 0; + nilfs_commit_chunk(page, mapping, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +out: + nilfs_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. 
*/ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +const struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_compat_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c new file mode 100644 index 00000000..82f4865e --- /dev/null +++ b/fs/nilfs2/direct.c @@ -0,0 +1,369 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" +#include "dat.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key) +{ + return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *direct, + __u64 key, int level, __u64 *ptrp) +{ + __u64 ptr; + + if (key > NILFS_DIRECT_KEY_MAX || level != 1) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + *ptrp = ptr; + return 0; +} + +static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, + __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int ret, cnt; + + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + if (NILFS_BMAP_USE_VBN(direct)) { + dat = nilfs_bmap_get_dat(direct); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + return ret; + ptr = blocknr; + } + + maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + for (cnt = 1; cnt < maxblocks && + (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != + NILFS_BMAP_INVALID_PTR; + cnt++) { + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + return ret; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt) + break; + } + *ptrp = ptr; + return cnt; +} + +static __u64 
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(direct); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + union nilfs_bmap_ptr_req req; + struct inode *dat = NULL; + struct buffer_head *bh; + int ret; + + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(bmap, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(bmap, key, req.bpr_ptr); + + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); + + nilfs_inode_add_blocks(bmap->b_inode, 1); + } + return ret; +} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + union nilfs_bmap_ptr_req req; + struct inode *dat; + int ret; + + if (key > NILFS_DIRECT_KEY_MAX || + nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(bmap, key); + + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); + nilfs_inode_sub_blocks(bmap->b_inode, 1); + } + return ret; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) +{ + __u64 key, lastkey; + + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *direct, + __u64 *keys, __u64 *ptrs, int nitems) +{ + __u64 key; + __u64 ptr; + int n; + + if (nitems > NILFS_DIRECT_NBLOCKS) + nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, int n) +{ + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + dptrs = nilfs_direct_dptrs(bmap); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? 
+ cpu_to_le64(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap); + return 0; +} + +static int nilfs_direct_propagate(struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; + __u64 key; + __u64 ptr; + int ret; + + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); + ptr = nilfs_direct_get_ptr(bmap, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); + if (ret < 0) + return ret; + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr); + } else + ret = nilfs_dat_mark_dirty(dat, ptr); + + return ret; +} + +static int nilfs_direct_assign_v(struct nilfs_bmap *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct inode *dat = nilfs_bmap_get_dat(direct); + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + } + return ret; +} + +static int nilfs_direct_assign_p(struct nilfs_bmap *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + __u64 key; + __u64 ptr; + + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(bmap, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return NILFS_BMAP_USE_VBN(bmap) ? + nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) : + nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_lookup_contig = nilfs_direct_lookup_contig, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + .bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +int nilfs_direct_init(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_direct_ops; + return 0; +} diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h new file mode 100644 index 00000000..dc643de2 --- /dev/null +++ b/fs/nilfs2/direct.h @@ -0,0 +1,51 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include +#include +#include "bmap.h" + + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h new file mode 100644 index 00000000..a71cc412 --- /dev/null +++ b/fs/nilfs2/export.h @@ -0,0 +1,17 @@ +#ifndef NILFS_EXPORT_H +#define NILFS_EXPORT_H + +#include + +extern const struct export_operations nilfs_export_ops; + +struct nilfs_fid { + u64 cno; + u64 ino; + u32 gen; + + u32 parent_gen; + u64 parent_ino; +} __attribute__ ((packed)); + +#endif diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c new file mode 100644 index 00000000..d7eeca62 --- /dev/null +++ b/fs/nilfs2/file.c @@ -0,0 +1,159 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji , + * Ryusuke Konishi + */ + +#include +#include +#include +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. 
+ */ + struct inode *inode = file->f_mapping->host; + int err; + + if (!nilfs_inode_dirty(inode)) + return 0; + + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, + LLONG_MAX); + else + err = nilfs_construct_segment(inode->i_sb); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct nilfs_transaction_info ti; + int ret; + + if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto mapped; + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + return VM_FAULT_SIGBUS; + + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); + if (ret != VM_FAULT_LOCKED) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); + nilfs_transaction_commit(inode->i_sb); + + mapped: + wait_on_page_writeback(page); + return VM_FAULT_LOCKED; +} + +static const struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &nilfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. + */ +const struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_compat_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations nilfs_file_inode_operations = { + .truncate = nilfs_truncate, + .setattr = nilfs_setattr, + .permission = nilfs_permission, + .fiemap = nilfs_fiemap, +}; + +/* end of file */ diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c new file mode 100644 index 00000000..08a07a21 --- /dev/null +++ b/fs/nilfs2/gcinode.c @@ -0,0 +1,196 @@ +/* + * gcinode.c - dummy inodes to buffer blocks for garbage collection + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara , Amagai Yoshiji , + * and Ryusuke Konishi . + * Revised by Ryusuke Konishi . + * + */ +/* + * This file adds the cache of on-disk blocks to be moved in garbage + * collection. The disk blocks are held with dummy inodes (called + * gcinodes), and this file provides lookup function of the dummy + * inodes and their buffer read function. + * + * Buffers and pages held by the dummy inodes will be released each + * time after they are copied to a new log. Dirty blocks made on the + * current generation and the blocks to be moved by GC never overlap + * because the dirty blocks make a new generation; they rather must be + * written individually. + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btree.h" +#include "btnode.h" +#include "page.h" +#include "mdt.h" +#include "dat.h" +#include "ifile.h" + +/* + * nilfs_gccache_submit_read_data() - add data buffer and submit read request + * @inode - gc inode + * @blkoff - dummy offset treated as the key for the page cache + * @pbn - physical block number of the block + * @vbn - virtual block number of the block, 0 for non-virtual block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_data() registers the data buffer + * specified by @pbn to the GC pagecache with the key @blkoff. + * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. 
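[Illustrative pairing, not part of this patch: the submit helpers below queue the read, and nilfs_gccache_wait_and_mark_dirty(), further down in this file, is what a caller uses to wait for completion and mark the buffer dirty for the next log. A hedged sketch of that pairing, with hypothetical variable names:]

	struct buffer_head *bh;
	int err;

	err = nilfs_gccache_submit_read_data(inode, blkoff, pbn, vbn, &bh);
	if (err)
		return err;
	err = nilfs_gccache_wait_and_mark_dirty(bh);
	/* -EEXIST from the wait helper means the buffer was already dirty */
	brelse(bh);
	return err;
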
+ */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = inode->i_sb->s_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret; + + ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, READ, out_bh, &pbn); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; + } + mark_buffer_dirty(bh); + return 0; +} + +int nilfs_init_gcinode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + inode->i_mode = S_IFREG; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; + + ii->i_flags = 0; + nilfs_bmap_init_gc(ii->i_bmap); + + return 0; +} + +/** + * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes + */ +void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) +{ + struct list_head *head = &nilfs->ns_gc_inodes; + struct nilfs_inode_info *ii; + + while (!list_empty(head)) { + ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); + list_del_init(&ii->i_dirty); + iput(&ii->vfs_inode); + } +} diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c new file mode 100644 index 00000000..684d7630 --- /dev/null +++ b/fs/nilfs2/ifile.c @@ -0,0 +1,201 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji . + * Revised by Ryusuke Konishi . + * + */ + +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "ifile.h" + + +struct nilfs_ifile_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) +{ + return (struct nilfs_ifile_info *)NILFS_MDT(ifile); +} + +/** + * nilfs_ifile_create_inode - create a new disk inode + * @ifile: ifile inode + * @out_ino: pointer to a variable to store inode number + * @out_bh: buffer_head contains newly allocated disk inode + * + * Return Value: On success, 0 is returned and the newly allocated inode + * number is stored in the place pointed by @ino, and buffer_head pointer + * that contains newly allocated disk inode structure is stored in the + * place pointed by @out_bh + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No inode left. + */ +int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, + struct buffer_head **out_bh) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = 0; /* 0 says find free inode from beginning of + a group. dull code!! */ + req.pr_entry_bh = NULL; + + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + nilfs_palloc_commit_alloc_entry(ifile, &req); + mark_buffer_dirty(req.pr_entry_bh); + nilfs_mdt_mark_dirty(ifile); + *out_ino = (ino_t)req.pr_entry_nr; + *out_bh = req.pr_entry_bh; + return 0; +} + +/** + * nilfs_ifile_delete_inode - delete a disk inode + * @ifile: ifile inode + * @ino: inode number + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The inode number @ino have not been allocated. 
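[Illustrative sketch, not part of this patch: a fresh on-disk inode is claimed with nilfs_ifile_create_inode() below and then edited through the map helpers declared in ifile.h; variable names are hypothetical.]

	ino_t ino;
	struct buffer_head *ibh;
	struct nilfs_inode *raw_inode;
	int err;

	err = nilfs_ifile_create_inode(ifile, &ino, &ibh);
	if (err)
		return err;
	raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
	/* ... initialize the new on-disk inode through raw_inode ... */
	nilfs_ifile_unmap_inode(ifile, ino, ibh);
	brelse(ibh);
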
+ */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", + (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) + nilfs_warning(sb, __func__, "unable to read inode: %lu", + (unsigned long) ino); + return err; +} + +/** + * nilfs_ifile_read - read or get ifile inode + * @sb: super block instance + * @root: root object + * @inode_size: size of an inode + * @raw_inode: on-disk ifile inode + * @inodep: buffer to store the inode + */ +int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, + size_t inode_size, struct nilfs_inode *raw_inode, + struct inode **inodep) +{ + struct inode *ifile; + int err; + + ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); + if (unlikely(!ifile)) + return -ENOMEM; + if (!(ifile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, + sizeof(struct nilfs_ifile_info)); + if (err) + goto failed; + + err = nilfs_palloc_init_blockgroup(ifile, inode_size); + if (err) + goto failed; + + nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); + + err = nilfs_read_inode_common(ifile, raw_inode); + if (err) + goto failed; + + unlock_new_inode(ifile); + out: + *inodep = ifile; + return 0; + failed: + iget_failed(ifile); + return err; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h new file mode 100644 index 00000000..59b6f2b5 --- /dev/null +++ b/fs/nilfs2/ifile.h @@ -0,0 +1,56 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji + * Revised by Ryusuke Konishi + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include +#include +#include +#include "mdt.h" +#include "alloc.h" + + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, + size_t inode_size, struct nilfs_inode *raw_inode, + struct inode **inodep); + +#endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c new file mode 100644 index 00000000..b9b45fc2 --- /dev/null +++ b/fs/nilfs2/inode.c @@ -0,0 +1,1064 @@ +/* + * inode.c - NILFS inode operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + +struct nilfs_iget_args { + u64 ino; + __u64 cno; + struct nilfs_root *root; + int for_gc; +}; + +void nilfs_inode_add_blocks(struct inode *inode, int n) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + + inode_add_bytes(inode, (1 << inode->i_blkbits) * n); + if (root) + atomic_add(n, &root->blocks_count); +} + +void nilfs_inode_sub_blocks(struct inode *inode, int n) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + + inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); + if (root) + atomic_sub(n, &root->blocks_count); +} + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. 
+ */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 blknum = 0; + int err = 0, ret; + unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + if (ret >= 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + if (ret > 0) + bh_result->b_size = (ret << inode->i_blkbits); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. */ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. 
+ * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty_buffers(page); + + if (ret) { + struct inode *inode = page->mapping->host; + unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(inode, nr_dirty); + } + return ret; +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + err = block_write_begin(mapping, pos, len, flags, pagep, + nilfs_get_block); + if (unlikely(err)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + + nilfs_transaction_abort(inode->i_sb); + } + return err; +} + +static int nilfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? : copied; +} + +static ssize_t +nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t size; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, nilfs_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely((rw & WRITE) && size < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return size; +} + +const struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +struct inode *nilfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct the_nilfs *nilfs = sb->s_fs_info; + struct inode *inode; + struct nilfs_inode_info *ii; + struct nilfs_root *root; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + root = NILFS_I(dir)->i_root; + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + ii->i_root = root; + + err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic_inc(&root->inodes_count); + inode_init_owner(inode, dir, mode); + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_bmap; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = nilfs_mask_flags( + mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; + nilfs_set_inode_flags(inode); + spin_lock(&nilfs->ns_next_gen_lock); + inode->i_generation = nilfs->ns_next_generation++; + spin_unlock(&nilfs->ns_next_gen_lock); + insert_inode_hash(inode); + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_acl; /* never occur. 
When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + return inode; + + failed_acl: + failed_bmap: + inode->i_nlink = 0; + iput(inode); /* raw_inode will be deleted through + generic_delete_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC); + if (flags & FS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & FS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); + inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) + return -EINVAL; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? + 0 : le32_to_cpu(raw_inode->i_dir_acl); +#endif + ii->i_dir_start_lookup = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + err = nilfs_bmap_read(ii->i_bmap, raw_inode); + if (err < 0) + return err; + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. 
*/ + } + return 0; +} + +static int __nilfs_read_inode(struct super_block *sb, + struct nilfs_root *root, unsigned long ino, + struct inode *inode) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *bh; + struct nilfs_inode *raw_inode; + int err; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); + if (unlikely(err)) + goto bad_inode; + + raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); + + err = nilfs_read_inode_common(inode, raw_inode); + if (err) + goto failed_unmap; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else { + inode->i_op = &nilfs_special_inode_operations; + init_special_inode( + inode, inode->i_mode, + huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + } + nilfs_ifile_unmap_inode(root->ifile, ino, bh); + brelse(bh); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + nilfs_set_inode_flags(inode); + return 0; + + failed_unmap: + nilfs_ifile_unmap_inode(root->ifile, ino, bh); + brelse(bh); + + bad_inode: + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + return err; +} + +static int nilfs_iget_test(struct inode *inode, void *opaque) +{ + struct nilfs_iget_args *args = opaque; + struct nilfs_inode_info *ii; + + if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) + return 0; + + ii = NILFS_I(inode); + if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) + return !args->for_gc; + + return args->for_gc && args->cno == ii->i_cno; +} + +static int nilfs_iget_set(struct inode *inode, void *opaque) +{ + struct nilfs_iget_args *args = opaque; + + inode->i_ino = args->ino; + if (args->for_gc) { + NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; + NILFS_I(inode)->i_cno = args->cno; + NILFS_I(inode)->i_root = NULL; + } else { + if (args->root && args->ino == NILFS_ROOT_INO) + nilfs_get_root(args->root); + NILFS_I(inode)->i_root = args->root; + } + return 0; +} + +struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return ilookup5(sb, ino, nilfs_iget_test, &args); +} + +struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); +} + +struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct inode *inode; + int err; + + inode = nilfs_iget_locked(sb, root, ino); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = __nilfs_read_inode(sb, root, ino, inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, + __u64 cno) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 + }; + struct inode *inode; + int err; + + inode = iget5_locked(sb, ino, nilfs_iget_test, 
nilfs_iget_set, &args); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = nilfs_init_gcinode(inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode, int has_bmap) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + raw_inode->i_uid = cpu_to_le32(inode->i_uid); + raw_inode->i_gid = cpu_to_le32(inode->i_gid); + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le64(inode->i_size); + raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); + + raw_inode->i_flags = cpu_to_le32(ii->i_flags); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + + if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* zero-fill unused portion in the case of super root block */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + } + + if (has_bmap) + nilfs_bmap_write(ii->i_bmap, raw_inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(huge_encode_dev(inode->i_rdev)); + /* When extending inode, nilfs->ns_inode_size should be checked + for substitutions of appended fields */ +} + +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +{ + ino_t ino = inode->i_ino; + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *ifile = ii->i_root->ifile; + struct nilfs_inode *raw_inode; + + raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); + + if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); + set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. 
This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + unsigned long b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; +repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + +failed: + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_mark_inode_dirty(inode); + nilfs_set_file_dirty(inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + /* + * Free resources allocated in nilfs_read_inode(), here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (mdi && mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); + + if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) + nilfs_put_root(ii->i_root); +} + +void nilfs_evict_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + int ret; + + if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + nilfs_clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + /* TODO: some of the following operations may fail. */ + nilfs_truncate_bmap(ii, 0); + nilfs_mark_inode_dirty(inode); + end_writeback(inode); + + ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); + if (!ret) + atomic_dec(&ii->i_root->inodes_count); + + nilfs_clear_inode(inode); + + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. 
*/ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + err = vmtruncate(inode, iattr->ia_size); + if (unlikely(err)) + goto out_err; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) { + err = nilfs_acl_chmod(inode); + if (unlikely(err)) + goto out_err; + } + + return nilfs_transaction_commit(sb); + +out_err: + nilfs_transaction_abort(sb); + return err; +} + +int nilfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + if ((mask & MAY_WRITE) && root && + root->cno != NILFS_CPTREE_CURRENT_CNO) + return -EROFS; /* snapshot is not writable */ + + return generic_permission(inode, mask, flags, NULL); +} + +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&nilfs->ns_inode_lock); + if (ii->i_bh == NULL) { + spin_unlock(&nilfs->ns_inode_lock); + err = nilfs_ifile_get_inode_block(ii->i_root->ifile, + inode->i_ino, pbh); + if (unlikely(err)) + return err; + spin_lock(&nilfs->ns_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&nilfs->ns_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&nilfs->ns_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&nilfs->ns_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&nilfs->ns_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. 
*/ + nilfs_warning(inode->i_sb, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&nilfs->ns_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&nilfs->ns_inode_lock); + return 0; +} + +int nilfs_mark_inode_dirty(struct inode *inode) +{ + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + nilfs_update_inode(inode, ibh); + mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode, int flags) +{ + struct nilfs_transaction_info ti; + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + if (mdi) { + nilfs_mdt_mark_dirty(inode); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} + +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 logical = 0, phys = 0, size = 0; + __u32 flags = 0; + loff_t isize; + sector_t blkoff, end_blkoff; + sector_t delalloc_blkoff; + unsigned long delalloc_blklen; + unsigned int blkbits = inode->i_blkbits; + int ret, n; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + + blkoff = start >> blkbits; + end_blkoff = (start + len - 1) >> blkbits; + + delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, + &delalloc_blkoff); + + do { + __u64 blkphy; + unsigned int maxblocks; + + if (delalloc_blklen && blkoff == delalloc_blkoff) { + if (size) { + /* End of the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + } + if (blkoff > end_blkoff) + break; + + flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; + logical = blkoff << blkbits; + phys = 0; + size = delalloc_blklen << blkbits; + + blkoff = delalloc_blkoff + delalloc_blklen; + delalloc_blklen = nilfs_find_uncommitted_extent( + inode, blkoff, &delalloc_blkoff); + continue; + } + + /* + * Limit the number of blocks that we look up so as + * not to get into the next delayed allocation extent. 
+ */ + maxblocks = INT_MAX; + if (delalloc_blklen) + maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, + maxblocks); + blkphy = 0; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + n = nilfs_bmap_lookup_contig( + NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + if (n < 0) { + int past_eof; + + if (unlikely(n != -ENOENT)) + break; /* error */ + + /* HOLE */ + blkoff++; + past_eof = ((blkoff << blkbits) >= isize); + + if (size) { + /* End of the current extent */ + + if (past_eof) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + size = 0; + } + if (blkoff > end_blkoff || past_eof) + break; + } else { + if (size) { + if (phys && blkphy << blkbits == phys + size) { + /* The current extent goes on */ + size += n << blkbits; + } else { + /* Terminate the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, + flags); + if (ret || blkoff > end_blkoff) + break; + + /* Start another extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + } else { + /* Start a new extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + blkoff += n; + } + cond_resched(); + } while (true); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c new file mode 100644 index 00000000..3e654273 --- /dev/null +++ b/fs/nilfs2/ioctl.c @@ -0,0 +1,863 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include /* capable() */ +#include /* copy_from_user(), copy_to_user() */ +#include +#include /* compat_ptr() */ +#include /* mnt_want_write(), mnt_drop_write() */ +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + + +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? + argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) +{ + unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; + + return put_user(flags, (int __user *)argp); +} + +static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, + void __user *argp) +{ + struct nilfs_transaction_info ti; + unsigned int flags, oldflags; + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)argp)) + return -EFAULT; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + flags = nilfs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + oldflags = NILFS_I(inode)->i_flags; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by the + * relevant capability. 
+ */ + ret = -EPERM; + if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + + ret = nilfs_transaction_begin(inode->i_sb, &ti, 0); + if (ret) + goto out; + + NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) | + (flags & FS_FL_USER_MODIFIABLE); + + nilfs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_mark_inode_dirty(inode); + ret = nilfs_transaction_commit(inode->i_sb); +out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) +{ + return put_user(inode->i_generation, (int __user *)argp); +} + +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + goto out; + + down_read(&inode->i_sb->s_umount); + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + + up_read(&inode->i_sb->s_umount); +out: + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&cno, argp, sizeof(cno))) + goto out; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ +out: + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + size, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size, + nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned 
int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + down_read(&nilfs->ns_segctor_sem); + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) { + up_read(&nilfs->ns_segctor_sem); + return ret; + } + bdescs[i].bd_blocknr = 0; + } + } + up_read(&nilfs->ns_segctor_sem); + return nmembs; +} + +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size != sizeof(struct nilfs_bdesc)) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + if (unlikely(!list_empty(&bh->b_assoc_buffers))) { + printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " + "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? 
"node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + brelse(bh); + return -EEXIST; + } + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +static int nilfs_ioctl_move_blocks(struct super_block *sb, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct the_nilfs *nilfs = sb->s_fs_info; + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_iget_for_gc(sb, ino, cno); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto failed; + } + if (list_empty(&NILFS_I(inode)->i_dirty)) { + /* + * Add the inode to GC inode list. Garbage Collection + * is serialized and no two processes manipulate the + * list simultaneously. + */ + igrab(inode); + list_add(&NILFS_I(inode)->i_dirty, + &nilfs->ns_gc_inodes); + } + + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) { + iput(inode); + goto failed; + } + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + + iput(inode); /* The inode still remains in GC inode list */ + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -EEXIST); + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return ret; +} + +static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + int ret; + + ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs); + + return (ret < 0) ? 
ret : nmembs; +} + +static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void **kbufs) +{ + const char *msg; + int ret; + + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. + */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + return 0; + + failed: + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct nilfs_argv argv[5]; + static const size_t argsz[5] = { + sizeof(struct nilfs_vdesc), + sizeof(struct nilfs_period), + sizeof(__u64), + sizeof(struct nilfs_bdesc), + sizeof(__u64), + }; + void __user *base; + void *kbufs[5]; + struct the_nilfs *nilfs; + size_t len, nsegs; + int n, ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + nsegs = argv[4].v_nmembs; + if (argv[4].v_size != argsz[4]) + goto out; + + /* + * argv[4] points to segment numbers this ioctl cleans. We + * use kmalloc() for its buffer because memory used for the + * segment numbers is enough small. 
+ */ + kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, + nsegs * sizeof(__u64)); + if (IS_ERR(kbufs[4])) { + ret = PTR_ERR(kbufs[4]); + goto out; + } + nilfs = inode->i_sb->s_fs_info; + + for (n = 0; n < 4; n++) { + ret = -EINVAL; + if (argv[n].v_size != argsz[n]) + goto out_free; + + if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) + goto out_free; + + len = argv[n].v_size * argv[n].v_nmembs; + base = (void __user *)(unsigned long)argv[n].v_base; + if (len == 0) { + kbufs[n] = NULL; + continue; + } + + kbufs[n] = vmalloc(len); + if (!kbufs[n]) { + ret = -ENOMEM; + goto out_free; + } + if (copy_from_user(kbufs[n], base, len)) { + ret = -EFAULT; + vfree(kbufs[n]); + goto out_free; + } + } + + /* + * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + nilfs_remove_all_gcinodes(nilfs); + clear_nilfs_gc_running(nilfs); + +out_free: + while (--n >= 0) + vfree(kbufs[n]); + kfree(kbufs[4]); +out: + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + struct the_nilfs *nilfs; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + if (argp != NULL) { + nilfs = inode->i_sb->s_fs_info; + down_read(&nilfs->ns_segctor_sem); + cno = nilfs->ns_cno - 1; + up_read(&nilfs->ns_segctor_sem); + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, + void __user *argp) +{ + __u64 newsize; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_from_user(&newsize, argp, sizeof(newsize))) + goto out_drop_write; + + ret = nilfs_resize_fs(inode->i_sb, newsize); + +out_drop_write: + mnt_drop_write(filp->f_path.mnt); +out: + return ret; +} + +static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 range[2]; + __u64 minseg, maxseg; + unsigned long segbytes; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EFAULT; + if (copy_from_user(range, argp, sizeof(__u64[2]))) + goto out; + + ret = -ERANGE; + if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + goto out; + + segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; + + minseg = range[0] + segbytes - 1; + do_div(minseg, segbytes); + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + do_div(maxseg, segbytes); + maxseg--; + + ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg); +out: + return ret; +} + +static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp, + size_t membsz, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) + +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 
+ struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size < membsz) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return nilfs_ioctl_getflags(inode, argp); + case FS_IOC_SETFLAGS: + return nilfs_ioctl_setflags(inode, filp, argp); + case FS_IOC_GETVERSION: + return nilfs_ioctl_getversion(inode, argp); + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_cpinfo), + nilfs_ioctl_do_get_cpinfo); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_suinfo), + nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_vinfo), + nilfs_ioctl_do_get_vinfo); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + case NILFS_IOCTL_RESIZE: + return nilfs_ioctl_resize(inode, filp, argp); + case NILFS_IOCTL_SET_ALLOC_RANGE: + return nilfs_ioctl_set_alloc_range(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + break; + default: + return -ENOIOCTLCMD; + } + return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c new file mode 100644 index 00000000..800e8d78 --- /dev/null +++ b/fs/nilfs2/mdt.c @@ -0,0 +1,589 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr, KM_USER0); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + nilfs_transaction_begin(sb, &ti, 0); + + err = -ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh)) + goto failed_bh; + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; + + bh->b_bdev = sb->s_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + __u64 blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + map_bh(bh, inode->i_sb, (sector_t)blknum); + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, + int readahead, struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; 
+	int err;
+
+	err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
+	if (err == -EEXIST) /* internal code */
+		goto out;
+
+	if (unlikely(err))
+		goto failed;
+
+	if (readahead) {
+		blkoff = block + 1;
+		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+			err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+			if (likely(!err || err == -EEXIST))
+				brelse(bh);
+			else if (err != -EBUSY)
+				break;
+				/* abort readahead if bmap lookup failed */
+			if (!buffer_locked(first_bh))
+				goto out_no_wait;
+		}
+	}
+
+	wait_on_buffer(first_bh);
+
+ out_no_wait:
+	err = -EIO;
+	if (!buffer_uptodate(first_bh))
+		goto failed_bh;
+ out:
+	*out_bh = first_bh;
+	return 0;
+
+ failed_bh:
+	brelse(first_bh);
+ failed:
+	return err;
+}
+
+/**
+ * nilfs_mdt_get_block - read or create a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @blkoff: block offset
+ * @create: create flag
+ * @init_block: initializer used for newly allocated block
+ * @out_bh: output of a pointer to the buffer_head
+ *
+ * nilfs_mdt_get_block() looks up the specified buffer and tries to create
+ * a new buffer if @create is not zero. On success, the returned buffer is
+ * assured to be either existing or formatted using a buffer lock.
+ * @out_bh is substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ *
+ * %-EROFS - Read only filesystem (for create mode)
+ */
+int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
+			void (*init_block)(struct inode *,
+					   struct buffer_head *, void *),
+			struct buffer_head **out_bh)
+{
+	int ret;
+
+	/* Should be rewritten with merging nilfs_mdt_read_block() */
+ retry:
+	ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
+	if (!create || ret != -ENOENT)
+		return ret;
+
+	ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
+	if (unlikely(ret == -EEXIST)) {
+		/* create = 0; */ /* limit read-create loop retries */
+		goto retry;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_mdt_delete_block - make a hole on the meta data file.
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * Return Value: On success, zero is returned.
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ */
+int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int err;
+
+	err = nilfs_bmap_delete(ii->i_bmap, block);
+	if (!err || err == -ENOENT) {
+		nilfs_mdt_mark_dirty(inode);
+		nilfs_mdt_forget_block(inode, block);
+	}
+	return err;
+}
+
+/**
+ * nilfs_mdt_forget_block - discard dirty state and try to remove the page
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and
+ * tries to release the page including the buffer from a page cache.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EBUSY - page has an active buffer.
+ *
+ * %-ENOENT - page cache has no page addressed by the offset.
+ */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, 0, &bh); + if (unlikely(err)) + return err; + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct super_block *sb; + int err = 0; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + inode = page->mapping->host; + if (!inode) + return 0; + + sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + return err; +} + + +static const struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static const struct inode_operations def_mdt_iops; +static const struct file_operations def_mdt_fops; + + +int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) +{ + struct nilfs_mdt_info *mi; + + mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); + if (!mi) + return -ENOMEM; + + init_rwsem(&mi->mi_sem); + inode->i_private = mi; + + inode->i_mode = S_IFREG; + mapping_set_gfp_mask(inode->i_mapping, gfp_mask); + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + + return 0; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +/** + * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file + * @inode: inode of the metadata file + * @shadow: shadow mapping + */ +int nilfs_mdt_setup_shadow_map(struct inode *inode, + struct nilfs_shadow_map *shadow) +{ + struct nilfs_mdt_info *mi = 
NILFS_MDT(inode); + struct backing_dev_info *bdi = inode->i_sb->s_bdi; + + INIT_LIST_HEAD(&shadow->frozen_buffers); + address_space_init_once(&shadow->frozen_data); + nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + address_space_init_once(&shadow->frozen_btnodes); + nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + mi->mi_shadow = shadow; + return 0; +} + +/** + * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map + * @inode: inode of the metadata file + */ +int nilfs_mdt_save_to_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + int ret; + + ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping); + if (ret) + goto out; + + ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, + &ii->i_btnode_cache); + if (ret) + goto out; + + nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store); + out: + return ret; +} + +int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) +{ + struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; + struct buffer_head *bh_frozen; + struct page *page; + int blkbits = inode->i_blkbits; + + page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); + if (!page) + return -ENOMEM; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, 0); + + bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); + + if (!buffer_uptodate(bh_frozen)) + nilfs_copy_buffer(bh_frozen, bh); + if (list_empty(&bh_frozen->b_assoc_buffers)) { + list_add_tail(&bh_frozen->b_assoc_buffers, + &shadow->frozen_buffers); + set_buffer_nilfs_redirected(bh); + } else { + brelse(bh_frozen); /* already frozen */ + } + + unlock_page(page); + page_cache_release(page); + return 0; +} + +struct buffer_head * +nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) +{ + struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; + struct buffer_head *bh_frozen = NULL; + struct page *page; + int n; + + page = find_lock_page(&shadow->frozen_data, bh->b_page->index); + if (page) { + if (page_has_buffers(page)) { + n = bh_offset(bh) >> inode->i_blkbits; + bh_frozen = nilfs_page_get_nth_block(page, n); + } + unlock_page(page); + page_cache_release(page); + } + return bh_frozen; +} + +static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow) +{ + struct list_head *head = &shadow->frozen_buffers; + struct buffer_head *bh; + + while (!list_empty(head)) { + bh = list_first_entry(head, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); /* drop ref-count to make it releasable */ + } +} + +/** + * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state + * @inode: inode of the metadata file + */ +void nilfs_mdt_restore_from_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + + down_write(&mi->mi_sem); + + if (mi->mi_palloc_cache) + nilfs_palloc_clear_cache(inode); + + nilfs_clear_dirty_pages(inode->i_mapping); + nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); + + nilfs_clear_dirty_pages(&ii->i_btnode_cache); + nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); + + nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); + + up_write(&mi->mi_sem); +} + +/** + * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches + * @inode: 
inode of the metadata file + */ +void nilfs_mdt_clear_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + + down_write(&mi->mi_sem); + nilfs_release_frozen_buffers(shadow); + truncate_inode_pages(&shadow->frozen_data, 0); + truncate_inode_pages(&shadow->frozen_btnodes, 0); + up_write(&mi->mi_sem); +} diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h new file mode 100644 index 00000000..ab20a4ba --- /dev/null +++ b/fs/nilfs2/mdt.h @@ -0,0 +1,110 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include +#include +#include "nilfs.h" +#include "page.h" + +struct nilfs_shadow_map { + struct nilfs_bmap_store bmap_store; + struct address_space frozen_data; + struct address_space frozen_btnodes; + struct list_head frozen_buffers; +}; + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_palloc_cache: persistent object allocator cache + * @mi_shadow: shadow of bmap and page caches + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + struct nilfs_palloc_cache *mi_palloc_cache; + struct nilfs_shadow_map *mi_shadow; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); + +int nilfs_mdt_setup_shadow_map(struct inode *inode, + struct nilfs_shadow_map *shadow); +int nilfs_mdt_save_to_shadow_map(struct inode *inode); +void 
nilfs_mdt_restore_from_shadow_map(struct inode *inode); +void nilfs_mdt_clear_shadow_map(struct inode *inode); +int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh); +struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, + struct buffer_head *bh); + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno; +} + +#define nilfs_mdt_bgl_lock(inode, bg) \ + (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) + +#endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c new file mode 100644 index 00000000..546849b3 --- /dev/null +++ b/fs/nilfs2/namei.c @@ -0,0 +1,594 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji , + * Ryusuke Konishi + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include "nilfs.h" +#include "export.h" + +#define NILFS_FID_SIZE_NON_CONNECTABLE \ + (offsetof(struct nilfs_fid, parent_gen) / 4) +#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4) + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, &dentry->d_name); + inode = NULL; + if (ino) { + inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
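+ *
+ * Like the other directory operations below, the update is wrapped in
+ * nilfs_transaction_begin()/nilfs_transaction_commit() so that it is
+ * committed as a whole or aborted with nilfs_transaction_abort() on
+ * error.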
+ */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + nilfs_mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + nilfs_mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + if (inode->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + ihold(inode); + + err = nilfs_add_nondir(dentry, inode); + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (dir->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inc_nlink(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inc_nlink(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = nilfs_add_link(dentry, inode); + if 
(err) + goto out_fail; + + nilfs_mark_inode_dirty(inode); + d_instantiate(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + drop_nlink(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + iput(inode); +out_dir: + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); + goto out; +} + +static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct nilfs_dir_entry *de; + struct page *page; + int err; + + err = -ENOENT; + de = nilfs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto out; + + inode = dentry->d_inode; + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + drop_nlink(inode); + err = 0; +out: + return err; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = nilfs_do_unlink(dir, dentry); + + if (!err) { + nilfs_mark_inode_dirty(dir); + nilfs_mark_inode_dirty(dentry->d_inode); + err = nilfs_transaction_commit(dir->i_sb); + } else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_do_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_mark_inode_dirty(new_dir); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + drop_nlink(new_inode); + nilfs_mark_inode_dirty(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= NILFS_LINK_MAX) + goto out_dir; + } + err = nilfs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if 
(dir_de) { + inc_nlink(new_dir); + nilfs_mark_inode_dirty(new_dir); + } + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + drop_nlink(old_dir); + } + nilfs_mark_inode_dirty(old_dir); + nilfs_mark_inode_dirty(old_inode); + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +/* + * Export operations + */ +static struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct qstr dotdot = {.name = "..", .len = 2}; + struct nilfs_root *root; + + ino = nilfs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + root = NILFS_I(child->d_inode)->i_root; + + inode = nilfs_iget(child->d_inode->i_sb, root, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(inode); +} + +static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno, + u64 ino, u32 gen) +{ + struct nilfs_root *root; + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) + return ERR_PTR(-ESTALE); + + root = nilfs_lookup_root(sb->s_fs_info, cno); + if (!root) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, root, ino); + nilfs_put_root(root); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (gen && inode->i_generation != gen) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + + if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE && + fh_len != NILFS_FID_SIZE_CONNECTABLE) || + (fh_type != FILEID_NILFS_WITH_PARENT && + fh_type != FILEID_NILFS_WITHOUT_PARENT)) + return NULL; + + return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen); +} + +static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + + if (fh_len != NILFS_FID_SIZE_CONNECTABLE || + fh_type != FILEID_NILFS_WITH_PARENT) + return NULL; + + return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen); +} + +static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, + int connectable) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + struct inode *inode = dentry->d_inode; + struct nilfs_root *root = NILFS_I(inode)->i_root; + int type; + + if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE || + (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE)) + return 255; + + fid->cno = root->cno; + fid->ino = inode->i_ino; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + fid->parent_ino = parent->i_ino; + fid->parent_gen = parent->i_generation; + spin_unlock(&dentry->d_lock); + + type = FILEID_NILFS_WITH_PARENT; + *lenp = NILFS_FID_SIZE_CONNECTABLE; + } else { + type = FILEID_NILFS_WITHOUT_PARENT; + *lenp = NILFS_FID_SIZE_NON_CONNECTABLE; + } + + return type; +} + +const struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = 
nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, + .fiemap = nilfs_fiemap, +}; + +const struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +const struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = nilfs_permission, +}; + +const struct export_operations nilfs_export_ops = { + .encode_fh = nilfs_encode_fh, + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h new file mode 100644 index 00000000..f02b9ad4 --- /dev/null +++ b/fs/nilfs2/nilfs.h @@ -0,0 +1,326 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato + * Ryusuke Konishi + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include +#include +#include +#include +#include +#include "the_nilfs.h" +#include "bmap.h" + +/* + * nilfs inode data in memory + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + struct nilfs_bmap i_bmap_data; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. 
+ */ + struct rw_semaphore xattr_sem; +#endif + struct buffer_head *i_bh; /* i_bh contains a new or dirty + disk inode */ + struct nilfs_root *i_root; + struct inode vfs_inode; +}; + +static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) +{ + return container_of(inode, struct nilfs_inode_info, vfs_inode); +} + +static inline struct nilfs_inode_info * +NILFS_BMAP_I(const struct nilfs_bmap *bmap) +{ + return container_of(bmap, struct nilfs_inode_info, i_bmap_data); +} + +static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) +{ + struct nilfs_inode_info *ii = + container_of(btnc, struct nilfs_inode_info, i_btnode_cache); + return &ii->vfs_inode; +} + +/* + * Dynamic state flags of NILFS on-memory inode (i_state) + */ +enum { + NILFS_I_NEW = 0, /* Inode is newly created */ + NILFS_I_DIRTY, /* The file is dirty */ + NILFS_I_QUEUED, /* inode is in dirty_files list */ + NILFS_I_BUSY, /* inode is grabbed by a segment + constructor */ + NILFS_I_COLLECTED, /* All dirty blocks are collected */ + NILFS_I_UPDATED, /* The file has been written back */ + NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_BMAP, /* has bmap and btnode_cache */ + NILFS_I_GCINODE, /* inode for GC, on memory only */ +}; + +/* + * commit flags for nilfs_commit_super and nilfs_sync_super + */ +enum { + NILFS_SB_COMMIT = 0, /* Commit a super block alternately */ + NILFS_SB_COMMIT_ALL /* Commit both super blocks */ +}; + +/* + * Macros to check inode numbers + */ +#define NILFS_MDT_INO_BITS \ + ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ + 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ + 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) + +#define NILFS_SYS_INO_BITS \ + ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) + +#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) + +#define NILFS_MDT_INODE(sb, ino) \ + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) +#define NILFS_VALID_INODE(sb, ino) \ + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + +/** + * struct nilfs_transaction_info: context information for synchronization + * @ti_magic: Magic number + * @ti_save: Backup of journal_info field of task_struct + * @ti_flags: Flags + * @ti_count: Nest level + * @ti_garbage: List of inode to be put when releasing semaphore + */ +struct nilfs_transaction_info { + u32 ti_magic; + void *ti_save; + /* This should never used. If this happens, + one of other filesystems has a bug. */ + unsigned short ti_flags; + unsigned short ti_count; + struct list_head ti_garbage; +}; + +/* ti_magic */ +#define NILFS_TI_MAGIC 0xd9e392fb + +/* ti_flags */ +#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ +#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the + end of transaction. 
*/ +#define NILFS_TI_GC 0x0004 /* GC context */ +#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ +#define NILFS_TI_WRITER 0x0010 /* Constructor context */ + + +int nilfs_transaction_begin(struct super_block *, + struct nilfs_transaction_info *, int); +int nilfs_transaction_commit(struct super_block *); +void nilfs_transaction_abort(struct super_block *); + +static inline void nilfs_set_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= flag; +} + +static inline int nilfs_test_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) + return 0; + return !!(ti->ti_flags & flag); +} + +static inline int nilfs_doing_gc(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_GC); +} + +static inline int nilfs_doing_construction(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_WRITER); +} + +/* + * function prototype + */ +#ifdef CONFIG_NILFS_POSIX_ACL +#error "NILFS: not yet supported POSIX ACL" +extern int nilfs_acl_chmod(struct inode *); +extern int nilfs_init_acl(struct inode *, struct inode *); +#else +static inline int nilfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) +{ + inode->i_mode &= ~current_umask(); + return 0; +} +#endif + +#define NILFS_ATIME_DISABLE + +/* Flags that should be inherited by new inodes from their parent. */ +#define NILFS_FL_INHERITED \ + (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \ + FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\ + FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
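+ * Directories keep every inherited flag, regular files drop
+ * FS_DIRSYNC_FL and FS_TOPDIR_FL, and other inode types keep only
+ * FS_NODUMP_FL and FS_NOATIME_FL.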
*/ +static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL); + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* dir.c */ +extern int nilfs_add_link(struct dentry *, struct inode *); +extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +extern int nilfs_make_empty(struct inode *, struct inode *); +extern struct nilfs_dir_entry * +nilfs_find_entry(struct inode *, const struct qstr *, struct page **); +extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); +extern int nilfs_empty_dir(struct inode *); +extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); +extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct page *, struct inode *); + +/* file.c */ +extern int nilfs_sync_file(struct file *, int); + +/* ioctl.c */ +long nilfs_ioctl(struct file *, unsigned int, unsigned long); +long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, + void **); + +/* inode.c */ +void nilfs_inode_add_blocks(struct inode *inode, int n); +void nilfs_inode_sub_blocks(struct inode *inode, int n); +extern struct inode *nilfs_new_inode(struct inode *, int); +extern void nilfs_free_inode(struct inode *); +extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void nilfs_set_inode_flags(struct inode *); +extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); +extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +extern struct inode *nilfs_iget_for_gc(struct super_block *sb, + unsigned long ino, __u64 cno); +extern void nilfs_update_inode(struct inode *, struct buffer_head *); +extern void nilfs_truncate(struct inode *); +extern void nilfs_evict_inode(struct inode *); +extern int nilfs_setattr(struct dentry *, struct iattr *); +int nilfs_permission(struct inode *inode, int mask, unsigned int flags); +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); +extern int nilfs_inode_dirty(struct inode *); +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); +extern int nilfs_mark_inode_dirty(struct inode *); +extern void nilfs_dirty_inode(struct inode *, int flags); +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern void nilfs_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_check_feature_compatibility(struct super_block *, + struct nilfs_super_block *); +extern void nilfs_set_log_cursor(struct nilfs_super_block *, + struct the_nilfs *); +struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, + int flip); +int nilfs_commit_super(struct super_block *sb, int flag); +int nilfs_cleanup_super(struct super_block *sb); +int nilfs_resize_fs(struct super_block *sb, __u64 newsize); +int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, + struct nilfs_root **root); +int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gcinode(struct inode *inode); +void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); + +/* + * Inodes and files operations + */ +extern const struct file_operations nilfs_dir_operations; +extern const struct inode_operations nilfs_file_inode_operations; +extern const struct file_operations nilfs_file_operations; +extern const struct address_space_operations nilfs_aops; +extern const struct inode_operations nilfs_dir_inode_operations; +extern const struct inode_operations nilfs_special_inode_operations; +extern const struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c new file mode 100644 index 00000000..65221a04 --- /dev/null +++ b/fs/nilfs2/page.c @@ -0,0 +1,551 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi , + * Seiji Kihara . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page; + struct buffer_head *bh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + return bh; +} + +/** + * nilfs_forget_buffer - discard dirty state + * @inode: owner inode of the buffer + * @bh: buffer head of the buffer to be discarded + */ +void nilfs_forget_buffer(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + + lock_buffer(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_dirty(bh); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + bh->b_blocknr = -1; + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + unlock_buffer(bh); + brelse(bh); +} + +/** + * nilfs_copy_buffer -- copy buffer data and flags + * @dbh: destination buffer + * @sbh: source buffer + */ +void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) +{ + void *kaddr0, *kaddr1; + unsigned long bits; + struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct buffer_head *bh; + + kaddr0 = kmap_atomic(spage, KM_USER0); + kaddr1 = kmap_atomic(dpage, KM_USER1); + memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); + kunmap_atomic(kaddr1, KM_USER1); + kunmap_atomic(kaddr0, KM_USER0); + + dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + + bh = dbh; + bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + while ((bh = bh->b_this_page) != dbh) { + lock_buffer(bh); + bits &= bh->b_state; + unlock_buffer(bh); + } + if (bits & (1UL << BH_Uptodate)) + SetPageUptodate(dpage); + else + ClearPageUptodate(dpage); + if (bits & (1UL << BH_Mapped)) + SetPageMappedToDisk(dpage); + else + ClearPageMappedToDisk(dpage); +} + +/** + * nilfs_page_buffers_clean - check if a page has dirty buffers or not. + * @page: page to be checked + * + * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. + * Otherwise, it returns non-zero value. 
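+ *
+ * Callers such as nilfs_forget_buffer() use this check to decide
+ * whether the page-level dirty flag can be cleared as well.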
+ */ +int nilfs_page_buffers_clean(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh)) + return 0; + bh = bh->b_this_page; + } while (bh != head); + return 1; +} + +void nilfs_page_bug(struct page *page) +{ + struct address_space *m; + unsigned long ino; + + if (unlikely(!page)) { + printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + return; + } + + m = page->mapping; + ino = m ? m->host->i_ino : 0; + + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + "mapping=%p ino=%lu\n", + page, atomic_read(&page->_count), + (unsigned long long)page->index, page->flags, m, ino); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int i = 0; + + bh = head = page_buffers(page); + do { + printk(KERN_CRIT + " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", + i++, bh, atomic_read(&bh->b_count), + (unsigned long long)bh->b_blocknr, bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + } +} + +/** + * nilfs_copy_page -- copy the page with buffers + * @dst: destination page + * @src: source page + * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * + * This function is for both data pages and btnode pages. The dirty flag + * should be treated by caller. The page must not be under i/o. + * Both src and dst page must be locked + */ +static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +{ + struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + unsigned long mask = NILFS_BUFFER_INHERENT_BITS; + + BUG_ON(PageWriteback(dst)); + + sbh = sbufs = page_buffers(src); + if (!page_has_buffers(dst)) + create_empty_buffers(dst, sbh->b_size, 0); + + if (copy_dirty) + mask |= (1UL << BH_Dirty); + + dbh = dbufs = page_buffers(dst); + do { + lock_buffer(sbh); + lock_buffer(dbh); + dbh->b_state = sbh->b_state & mask; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); + + copy_highpage(dst, src); + + if (PageUptodate(src) && !PageUptodate(dst)) + SetPageUptodate(dst); + else if (!PageUptodate(src) && PageUptodate(dst)) + ClearPageUptodate(dst); + if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) + SetPageMappedToDisk(dst); + else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) + ClearPageMappedToDisk(dst); + + do { + unlock_buffer(sbh); + unlock_buffer(dbh); + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); +} + +int nilfs_copy_dirty_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + int err = 0; + + pagevec_init(&pvec, 0); +repeat: + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) + return 0; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + + lock_page(page); + if (unlikely(!PageDirty(page))) + NILFS_PAGE_BUG(page, "inconsistent dirty state"); + + dpage = grab_cache_page(dmap, page->index); + if (unlikely(!dpage)) { + /* No empty page is added to the page cache */ + err = -ENOMEM; + unlock_page(page); + break; + } + if (unlikely(!page_has_buffers(page))) + NILFS_PAGE_BUG(page, + "found empty page in dat page cache"); + + nilfs_copy_page(dpage, page, 1); + __set_page_dirty_nobuffers(dpage); + + unlock_page(dpage); + page_cache_release(dpage); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + if (likely(!err)) + goto repeat; + return err; +} + +/** + * 
nilfs_copy_back_pages -- copy back pages to original cache from shadow cache + * @dmap: destination page cache + * @smap: source page cache + * + * No pages must no be added to the cache during this process. + * This must be ensured by the caller. + */ +void nilfs_copy_back_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i, n; + pgoff_t index = 0; + int err; + + pagevec_init(&pvec, 0); +repeat: + n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + if (!n) + return; + index = pvec.pages[n - 1]->index + 1; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + pgoff_t offset = page->index; + + lock_page(page); + dpage = find_lock_page(dmap, offset); + if (dpage) { + /* override existing page on the destination cache */ + WARN_ON(PageDirty(dpage)); + nilfs_copy_page(dpage, page, 0); + unlock_page(dpage); + page_cache_release(dpage); + } else { + struct page *page2; + + /* move the page to the destination cache */ + spin_lock_irq(&smap->tree_lock); + page2 = radix_tree_delete(&smap->page_tree, offset); + WARN_ON(page2 != page); + + smap->nrpages--; + spin_unlock_irq(&smap->tree_lock); + + spin_lock_irq(&dmap->tree_lock); + err = radix_tree_insert(&dmap->page_tree, offset, page); + if (unlikely(err < 0)) { + WARN_ON(err == -EEXIST); + page->mapping = NULL; + page_cache_release(page); /* for cache */ + } else { + page->mapping = dmap; + dmap->nrpages++; + if (PageDirty(page)) + radix_tree_tag_set(&dmap->page_tree, + offset, + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&dmap->tree_lock); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + goto repeat; +} + +void nilfs_clear_dirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + bh = head = page_buffers(page); + do { + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + bh = bh->b_this_page; + } while (bh != head); + + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +unsigned nilfs_page_count_clean_buffers(struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + struct buffer_head *bh, *head; + unsigned nc = 0; + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + bh->b_size; + if (block_end > from && block_start < to && !buffer_dirty(bh)) + nc++; + } + return nc; +} + +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, + struct backing_dev_info *bdi) +{ + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = bdi; + mapping->a_ops = &empty_aops; +} + +/* + * NILFS2 needs clear_page_dirty() in the following two cases: + * + * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears + * page dirty flags when it copies back pages from the shadow 
cache + * (gcdat->{i_mapping,i_btnode_cache}) to its original cache + * (dat->{i_mapping,i_btnode_cache}). + * + * 2) Some B-tree operations like insertion or deletion may dispose buffers + * in dirty state, and this needs to cancel the dirty state of their pages. + */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} + +/** + * nilfs_find_uncommitted_extent - find extent of uncommitted data + * @inode: inode + * @start_blk: start block offset (in) + * @blkoff: start offset of the found extent (out) + * + * This function searches an extent of buffers marked "delayed" which + * starts from a block offset equal to or larger than @start_blk. If + * such an extent was found, this will store the start offset in + * @blkoff and return its length in blocks. Otherwise, zero is + * returned. + */ +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff) +{ + unsigned int i; + pgoff_t index; + unsigned int nblocks_in_page; + unsigned long length = 0; + sector_t b; + struct pagevec pvec; + struct page *page; + + if (inode->i_mapping->nrpages == 0) + return 0; + + index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + +repeat: + pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, + pvec.pages); + if (pvec.nr == 0) + return length; + + if (length > 0 && pvec.pages[0]->index > index) + goto out; + + b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + i = 0; + do { + page = pvec.pages[i]; + + lock_page(page); + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (b < start_blk) + continue; + if (buffer_delay(bh)) { + if (length == 0) + *blkoff = b; + length++; + } else if (length > 0) { + goto out_locked; + } + } while (++b, bh = bh->b_this_page, bh != head); + } else { + if (length > 0) + goto out_locked; + + b += nblocks_in_page; + } + unlock_page(page); + + } while (++i < pagevec_count(&pvec)); + + index = page->index + 1; + pagevec_release(&pvec); + cond_resched(); + goto repeat; + +out_locked: + unlock_page(page); +out: + pagevec_release(&pvec); + return length; +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h new file mode 100644 index 00000000..fb7de716 --- /dev/null +++ b/fs/nilfs2/page.h @@ -0,0 +1,80 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi , + * Seiji Kihara . + */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, + BH_NILFS_Checked, + BH_NILFS_Redirected, +}; + +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) +BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ +BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ + + +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_pages(struct address_space *); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, + struct backing_dev_info *bdi); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c new file mode 100644 index 00000000..a604ac03 --- /dev/null +++ b/fs/nilfs2/recovery.c @@ -0,0 +1,963 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + "NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +/** + * nilfs_compute_checksum - compute checksum of blocks continuously + * @nilfs: nilfs object + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: number of blocks to be checked + */ +static int nilfs_compute_checksum(struct the_nilfs *nilfs, + struct buffer_head *bhs, u32 *sum, + unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned int blocksize = nilfs->ns_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = crc32_le(nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh; + + bh = __bread(nilfs->ns_bdev, ++start, blocksize); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @nilfs: nilfs object + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = __bread(nilfs->ns_bdev, sr_block, 
nilfs->ns_blocksize); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > nilfs->ns_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (nilfs_compute_checksum( + nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, + sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * nilfs_read_log_header - read summary header of the specified log + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: pointer to return segment summary structure + */ +static struct buffer_head * +nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary **sum) +{ + struct buffer_head *bh_sum; + + bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (bh_sum) + *sum = (struct nilfs_segment_summary *)bh_sum->b_data; + return bh_sum; +} + +/** + * nilfs_validate_log - verify consistency of log + * @nilfs: nilfs object + * @seg_seq: sequence number of segment + * @bh_sum: buffer head of summary block + * @sum: segment summary struct + */ +static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, + struct buffer_head *bh_sum, + struct nilfs_segment_summary *sum) +{ + unsigned long nblock; + u32 crc; + int ret; + + ret = NILFS_SEG_FAIL_MAGIC; + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) + goto out; + + ret = NILFS_SEG_FAIL_SEQ; + if (le64_to_cpu(sum->ss_seq) != seg_seq) + goto out; + + nblock = le32_to_cpu(sum->ss_nblocks); + ret = NILFS_SEG_FAIL_CONSISTENCY; + if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) + /* This limits the number of blocks read in the CRC check */ + goto out; + + ret = NILFS_SEG_FAIL_IO; + if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), + ((u64)nblock << nilfs->ns_blocksize_bits), + bh_sum->b_blocknr, nblock)) + goto out; + + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + if (crc != le32_to_cpu(sum->ss_datasum)) + goto out; + ret = 0; +out: + return ret; +} + +/** + * nilfs_read_summary_info - read an item on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be read + */ +static void *nilfs_read_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = __bread(nilfs->ns_bdev, blocknr + 1, + nilfs->ns_blocksize); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return ptr; +} + +/** + * nilfs_skip_summary_info - skip items on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be skipped + * @count: number of items to be skipped + */ +static void nilfs_skip_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + 
unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, + nilfs->ns_blocksize); + } +} + +/** + * nilfs_scan_dsync_log - get block information of a log written for data sync + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: log summary information + * @head: list head to add nilfs_recovery_block struct + */ +static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary *sum, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + u32 nfinfo, sumbytes; + sector_t blocknr; + ino_t ino; + int err = -EIO; + + nfinfo = le32_to_cpu(sum->ss_nfinfo); + if (!nfinfo) + return 0; + + sumbytes = le32_to_cpu(sum->ss_sumbytes); + blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); + bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu(sum->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for data sync logs */ + nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb; + + rb = list_first_entry(head, struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +struct nilfs_segment_entry { + struct list_head list; + __u64 segnum; +}; + +static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (unlikely(!ent)) + return -ENOMEM; + + ent->segnum = segnum; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, head); + return 0; +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent; + + ent = list_first_entry(head, struct nilfs_segment_entry, list); + list_del(&ent->list); + kfree(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct 
super_block *sb, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + for (i = 1; i < 4; i++) { + err = nilfs_segment_list_add(head, segnum[i]); + if (unlikely(err)) + goto failed; + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. + */ + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum != segnum[0]) { + err = nilfs_sufile_scrap(sufile, ent->segnum); + if (unlikely(err)) + goto failed; + } + list_del(&ent->list); + kfree(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + return err; +} + +static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh_org); + return 0; +} + +static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_root *root, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = nilfs->ns_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sb, root, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + err = block_write_begin(inode->i_mapping, pos, blocksize, + 0, &page, nilfs_get_block); + if (unlikely(err)) { + loff_t isize = inode->i_size; + if (pos + blocksize > isize) + vmtruncate(inode, isize); + goto failed_inode; + } + + err = nilfs_recovery_copy_block(nilfs, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @nilfs: nilfs object + * @sb: super block 
instance + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_root *root, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + unsigned int flags; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + brelse(bh_sum); + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) { + err = -EIO; + goto failed; + } + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + + flags = le16_to_cpu(sum->ss_flags); + if (flags & NILFS_SS_SR) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); + empty_seg = 0; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + if (!(flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = nilfs->ns_ctime; + + switch (state) { + case RF_INIT_ST: + if (!(flags & NILFS_SS_LOGBGN) || + !(flags & NILFS_SS_SYNDT)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!(flags & NILFS_SS_SYNDT)) + goto confused; + + err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, + &dsync_blocks); + if (unlikely(err)) + goto failed; + if (flags & NILFS_SS_LOGEND) { + err = nilfs_recover_dsync_blocks( + nilfs, sb, root, &dsync_blocks, + &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += le32_to_cpu(sum->ss_nblocks); + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sb->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + brelse(bh_sum); + dispose_recovery_list(&dsync_blocks); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). 
", + sb->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint + * @nilfs: nilfs object + * @sb: super block instance + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_recovery_info *ri) +{ + struct nilfs_root *root; + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sb, root, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_log_writer(sb, root); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sb); + nilfs_detach_log_writer(sb); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, ri); + } + + failed: + nilfs_put_root(root); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. 
+ */ +int nilfs_search_super_root(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + sector_t b, end; + unsigned long nblocks; + unsigned int flags; + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + /* Read ahead segment */ + b = seg_start; + while (b <= seg_end) + __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); + + for (;;) { + brelse(bh_sum); + ret = NILFS_SEG_FAIL_IO; + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) + goto failed; + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + + nblocks = le32_to_cpu(sum->ss_nblocks); + pseg_end = pseg_start + nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + flags = le16_to_cpu(sum->ss_flags); + if (!(flags & NILFS_SS_SR) && !scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + + if (pseg_start == seg_start) { + nilfs_get_segment_range(nilfs, nextnum, &b, &end); + while (b <= end) + __breadahead(nilfs->ns_bdev, b++, + nilfs->ns_blocksize); + } + if (!(flags & NILFS_SS_SR)) { + if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (flags & NILFS_SS_LOGEND) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. */ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + sr_pseg_start = pseg_start; + nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. 
+ */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ret = nilfs_segment_list_add(&segments, segnum); + if (unlikely(ret)) + goto failed; + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + brelse(bh_sum); + list_splice_tail(&segments, &ri->ri_used_segments); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + brelse(bh_sum); + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? ret : nilfs_warn_segment_error(ret); +} diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c new file mode 100644 index 00000000..850a7c02 --- /dev/null +++ b/fs/nilfs2/segbuf.c @@ -0,0 +1,536 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include "page.h" +#include "segbuf.h" + + +struct nilfs_write_info { + struct the_nilfs *nilfs; + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; +}; + +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs); +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; + + init_completion(&segbuf->sb_bio_event); + atomic_set(&segbuf->sb_err, 0); + segbuf->sb_nbio = 0; + + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +/** + * nilfs_segbuf_map_cont - map a new log behind a given log + * @segbuf: new segment buffer + * @prev: segment buffer containing a log to be continued + */ +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, 
+ struct nilfs_segment_buffer *prev) +{ + segbuf->sb_segnum = prev->sb_segnum; + segbuf->sb_fseg_start = prev->sb_fseg_start; + segbuf->sb_fseg_end = prev->sb_fseg_end; + segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime, __u64 cno) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + segbuf->sb_sum.cno = cno; + return 0; +} + +/* + * Setup segment summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; + raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); +} + +/* + * CRC calculation routines + */ +static void +nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +static void nilfs_segbuf_fill_in_data_crc(struct 
nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page, KM_USER0); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +static void +nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct nilfs_super_root *raw_sr; + struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; + unsigned srsize; + u32 crc; + + raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + srsize - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + +static void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } +} + +static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; +} + +/* + * Iterators for segment buffers + */ +void nilfs_clear_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) + nilfs_segbuf_clear(segbuf); +} + +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last) +{ + struct nilfs_segment_buffer *n, *segbuf; + + segbuf = list_prepare_entry(last, logs, sb_list); + list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_clear(segbuf); + nilfs_segbuf_free(segbuf); + } +} + +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + ret = nilfs_segbuf_write(segbuf, nilfs); + if (ret) + break; + } + return ret; +} + +int nilfs_wait_on_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + int err, ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + err = nilfs_segbuf_wait(segbuf); + if (err && !ret) + ret = err; + } + return ret; +} + +/** + * nilfs_add_checksums_on_logs - add checksums on the logs + * @logs: list of segment buffers storing target logs + * @seed: checksum seed value + */ +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) { + if (segbuf->sb_super_root) + nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_segment_buffer *segbuf = bio->bi_private; + + if (err == 
-EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + bio_put(bio); + /* to be detected by submit_seg_bio() */ + } + + if (!uptodate) + atomic_inc(&segbuf->sb_err); + + bio_put(bio); + complete(&segbuf->sb_bio_event); +} + +static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (segbuf->sb_nbio > 0 && + bdi_write_congested(segbuf->sb_super->s_bdi)) { + wait_for_completion(&segbuf->sb_bio_event); + segbuf->sb_nbio--; + if (unlikely(atomic_read(&segbuf->sb_err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = segbuf; + bio_get(bio); + submit_bio(mode, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + segbuf->sb_nbio++; + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a new bio for writing log + * @nilfs: nilfs object + * @start: start block number of the bio + * @nr_vecs: request size of page vector. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned. + */ +static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOIO, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = nilfs->ns_bdev; + bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); + } + return bio; +} + +static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->blocknr = segbuf->sb_pseg_start; +} + +static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, + struct buffer_head *bh, int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_segbuf_submit_bio(segbuf, wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_segbuf_write - submit write requests of a log + * @segbuf: buffer storing a log to be written + * @nilfs: nilfs object + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. 
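+ *
+ * Note: the segment summary blocks are submitted before the payload
+ * blocks, and the last bio of the log is submitted with REQ_SYNC
+ * (see below).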
+ */ +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs) +{ + struct nilfs_write_info wi; + struct buffer_head *bh; + int res = 0, rw = WRITE; + + wi.nilfs = nilfs; + nilfs_segbuf_prepare_write(segbuf, &wi); + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi.bio) { + /* + * Last BIO is always sent through the following + * submission. + */ + rw |= REQ_SYNC; + res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); + } + + failed_bio: + return res; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @segbuf: segment buffer + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) +{ + int err = 0; + + if (!segbuf->sb_nbio) + return 0; + + do { + wait_for_completion(&segbuf->sb_bio_event); + } while (--segbuf->sb_nbio > 0); + + if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + } + return err; +} diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h new file mode 100644 index 00000000..b04f08cc --- /dev/null +++ b/fs/nilfs2/segbuf.h @@ -0,0 +1,184 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include +#include +#include +#include + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @cno: Checkpoint number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + __u64 cno; + time_t ctime; + sector_t next; +}; + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_super_root: Pointer to buffer storing a super root block (if exists) + * @sb_nbio: Number of flying bio requests + * @sb_err: I/O error status + * @sb_bio_event: Completion event of log writing + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + struct buffer_head *sb_super_root; + + /* io status */ + int sb_nbio; + atomic_t sb_err; + struct completion sb_bio_event; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + +extern struct kmem_cache *nilfs_segbuf_cachep; + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void 
nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev); +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); + +static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf) +{ + unsigned int flags = segbuf->sb_sum.flags; + + return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND); +} + +static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf) +{ + return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk; +} + +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_clear_logs(struct list_head *logs); +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last); +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); +int nilfs_wait_on_logs(struct list_head *logs); +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed); + +static inline void nilfs_destroy_logs(struct list_head *logs) +{ + nilfs_truncate_logs(logs, NULL); +} + +#endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c new file mode 100644 index 00000000..bb24ab6c --- /dev/null +++ b/fs/nilfs2/segment.c @@ -0,0 +1,2707 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_SUFREED 0x0004 /* segment usages has been freed */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Other definitions + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); + +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). 
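+ * The saved pointer is kept in ti->ti_save and written back to
+ * current->journal_info when the outermost transaction ends.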
+ */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make a segment construction and write tasks + * exclusive. The function is used with nilfs_transaction_commit() in pairs. + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; otherwise a new struct is assigned from a slab. + * + * When @vacancy_check flag is set, this function will check the amount of + * free space, and will wait for the GC to reclaim disk space if low capacity. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) + return 0; + + vfs_check_frozen(sb, SB_FREEZE_WRITE); + + nilfs = sb->s_fs_info; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin(). This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor. If a sync flag is set, it starts construction + * directly. 
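+ *
+ * A minimal sketch of the begin/commit pairing described above, for
+ * illustration only (error handling abbreviated; not copied from an
+ * actual caller):
+ *
+ *	struct nilfs_transaction_info ti;
+ *	int err;
+ *
+ *	err = nilfs_transaction_begin(sb, &ti, 1);
+ *	if (unlikely(err))
+ *		return err;
+ *	... update inodes, directory entries, etc. ...
+ *	err = nilfs_transaction_commit(sb);
+ *	(use nilfs_transaction_abort(sb) instead if the update failed)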
+ */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + if (nilfs->ns_writer) { + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return err; +} + +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); +} + +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } + downgrade_write(&nilfs->ns_segctor_sem); +} + +static void nilfs_transaction_lock(struct super_block *sb, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + INIT_LIST_HEAD(&ti->ti_garbage); + current->journal_info = ti; + + for (;;) { + down_write(&nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(sci); + + up_write(&nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + +static void nilfs_transaction_unlock(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + if (!list_empty(&ti->ti_garbage)) + nilfs_dispose_list(nilfs, &ti->ti_garbage, 0); +} + +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + 
ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC; + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* Size of finfo and binfo is enough small against blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ? 
sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (NILFS_I(inode)->i_root && + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + __u64 cno; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + + if (test_bit(NILFS_I_GCINODE, &ii->i_state)) + cno = ii->i_cno; + else if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) + cno = 0; + else + cno = sci->sc_cno; + + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (err < 0) + return err; + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); +} + +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + 
union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +static struct nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (err < 0) + return err; + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + +static struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + +static struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to per-page; extra dirty buffers + * may be included if blocksize < pagesize. 
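+ * Collection also stops early once 'nlimit' dirty buffers have been gathered.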
+ */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + unlock_page(page); + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) { + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +static void nilfs_dispose_list(struct the_nilfs *nilfs, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&nilfs->ns_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &nilfs->ns_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&nilfs->ns_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + +static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, + struct nilfs_root *root) +{ + int ret = 0; + + if (nilfs_mdt_fetch_dirty(root->ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + sci->sc_nfreesegs == 0 && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int ret = 0; + + if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&nilfs->ns_inode_lock); + if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&nilfs->ns_inode_lock); + return ret; +} + +static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) +{ + 
struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + + nilfs_mdt_clear_dirty(sci->sc_root->ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs->ns_dat); +} + +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* The following code is duplicated with cpfile. But, it is + needed to collect the checkpoint even if it was not newly + created */ + mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic_read(&sci->sc_root->inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic_read(&sci->sc_root->blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sci->sc_root->ifile, + &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + unsigned isz, srsz; + + bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + isz = nilfs->ns_inode_size; + srsz = NILFS_SR_BYTES(isz); + + raw_sr->sr_bytes = cpu_to_le16(srsz); + raw_sr->sr_nongc_ctime + = cpu_to_le64(nilfs_doing_gc() ? 
+ nilfs->ns_nongc_ctime : sci->sc_seg_ctime); + raw_sr->sr_flags = 0; + + nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz), 1); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); +} + +static void nilfs_redirty_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) + clear_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_drop_collected_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) + continue; + + clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + set_bit(NILFS_I_UPDATED, &ii->i_state); + } +} + +static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, + struct inode *inode, + struct list_head *listp, + int (*collect)(struct nilfs_sc_info *, + struct buffer_head *, + struct inode *)) +{ + struct buffer_head *bh, *n; + int err = 0; + + if (collect) { + list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + err = collect(sci, bh, inode); + brelse(bh); + if (unlikely(err)) + goto dispose_buffers; + } + return 0; + } + + dispose_buffers: + while (!list_empty(listp)) { + bh = list_first_entry(listp, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return err; +} + +static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) +{ + /* Remaining number of blocks within segment buffer */ + return sci->sc_segbuf_nblocks - + (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); +} + +static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, + struct inode *inode, + struct nilfs_sc_operations *sc_ops) +{ + LIST_HEAD(data_buffers); + LIST_HEAD(node_buffers); + int err; + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + size_t n, rest = nilfs_segctor_buffer_rest(sci); + + n = nilfs_lookup_dirty_data_buffers( + inode, &data_buffers, rest + 1, 0, LLONG_MAX); + if (n > rest) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, + sc_ops->collect_data); + BUG_ON(!err); /* always receive -E2BIG or true error */ + goto break_or_fail; + } + } + nilfs_lookup_dirty_node_buffers(inode, &node_buffers); + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, sc_ops->collect_data); + if (unlikely(err)) { + /* dispose node list */ + nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, NULL); + goto break_or_fail; + } + sci->sc_stage.flags |= NILFS_CF_NODE; + } + /* Collect node */ + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_node); + if (unlikely(err)) + goto break_or_fail; + + nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_bmap); + if (unlikely(err)) + goto break_or_fail; + + nilfs_segctor_end_finfo(sci, inode); + sci->sc_stage.flags &= ~NILFS_CF_NODE; + + break_or_fail: + return err; +} + +static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, + struct inode *inode) +{ + LIST_HEAD(data_buffers); + size_t n, rest = nilfs_segctor_buffer_rest(sci); + int err; + + n = 
nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, + sci->sc_dsync_start, + sci->sc_dsync_end); + + err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, + nilfs_collect_file_data); + if (!err) { + nilfs_segctor_end_finfo(sci, inode); + BUG_ON(n > rest); + /* always receive -E2BIG or true error if n > rest */ + } + return err; +} + +static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct list_head *head; + struct nilfs_inode_info *ii; + size_t ndone; + int err = 0; + + switch (sci->sc_stage.scnt) { + case NILFS_ST_INIT: + /* Pre-processes */ + sci->sc_stage.flags = 0; + + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { + sci->sc_nblk_inc = 0; + sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; + if (mode == SC_LSEG_DSYNC) { + sci->sc_stage.scnt = NILFS_ST_DSYNC; + goto dsync_mode; + } + } + + sci->sc_stage.dirty_file_ptr = NULL; + sci->sc_stage.gc_inode_ptr = NULL; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DAT; + goto dat_stage; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_GC: + if (nilfs_doing_gc()) { + head = &sci->sc_gc_inodes; + ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, + head, i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + err = nilfs_segctor_scan_file( + sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.gc_inode_ptr = list_entry( + ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } + sci->sc_stage.gc_inode_ptr = NULL; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_FILE: + head = &sci->sc_dirty_files; + ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, + i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + clear_bit(NILFS_I_DIRTY, &ii->i_state); + + err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.dirty_file_ptr = + list_entry(ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ + /* XXX: required ? 
*/ + } + sci->sc_stage.dirty_file_ptr = NULL; + if (mode == SC_FLUSH_FILE) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; + sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; + /* Fall through */ + case NILFS_ST_IFILE: + err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; + /* Creating a checkpoint */ + err = nilfs_segctor_create_checkpoint(sci); + if (unlikely(err)) + break; + /* Fall through */ + case NILFS_ST_CPFILE: + err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SUFILE: + err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, + sci->sc_nfreesegs, &ndone); + if (unlikely(err)) { + nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, ndone, + NULL); + break; + } + sci->sc_stage.flags |= NILFS_CF_SUFREED; + + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_DAT: + dat_stage: + err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, + &nilfs_sc_dat_ops); + if (unlikely(err)) + break; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SR: + if (mode == SC_LSEG_SR) { + /* Appending a super root */ + err = nilfs_segctor_add_super_root(sci); + if (unlikely(err)) + break; + } + /* End of a logical segment */ + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DSYNC: + dsync_mode: + sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; + ii = sci->sc_dsync_inode; + if (!test_bit(NILFS_I_BUSY, &ii->i_state)) + break; + + err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); + if (unlikely(err)) + break; + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DONE: + return 0; + default: + BUG(); + } + + break_or_fail: + return err; +} + +/** + * nilfs_segctor_begin_construction - setup segment buffer to make a new log + * @sci: nilfs_sc_info + * @nilfs: nilfs object + */ +static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *prev; + __u64 nextnum; + int err, alloc = 0; + + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + + if (list_empty(&sci->sc_write_logs)) { + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, + nilfs->ns_pseg_offset, nilfs); + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } + + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nextnum = nilfs->ns_nextnum; + + if (nilfs->ns_segnum == nilfs->ns_nextnum) + /* Start from the head of a new full segment */ + alloc++; + } else { + /* Continue logs */ + prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_segbuf_map_cont(segbuf, prev); + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; + nextnum = prev->sb_nextnum; + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + segbuf->sb_sum.seg_seq++; + alloc++; + } + } + + err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); + if (err) + goto failed; + + if (alloc) { + err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); + if (err) + goto 
failed; + } + nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); + + BUG_ON(!list_empty(&sci->sc_segbufs)); + list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; + return 0; + + failed: + nilfs_segbuf_free(segbuf); + return err; +} + +static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int nadd) +{ + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + __u64 nextnextnum; + LIST_HEAD(list); + int err, ret, i; + + prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + /* + * Since the segment specified with nextnum might be allocated during + * the previous construction, the buffer including its segusage may + * not be dirty. The following call ensures that the buffer is dirty + * and will pin the buffer on memory until the sufile is written. + */ + err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); + if (unlikely(err)) + return err; + + for (i = 0; i < nadd; i++) { + /* extend segment info */ + err = -ENOMEM; + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + goto failed; + + /* map this buffer to region of segment on-disk */ + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; + + /* allocate the next next full segment */ + err = nilfs_sufile_alloc(sufile, &nextnextnum); + if (unlikely(err)) + goto failed_segbuf; + + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; + nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); + + list_add_tail(&segbuf->sb_list, &list); + prev = segbuf; + } + list_splice_tail(&list, &sci->sc_segbufs); + return 0; + + failed_segbuf: + nilfs_segbuf_free(segbuf); + failed: + list_for_each_entry(segbuf, &list, sb_list) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + nilfs_destroy_logs(&list); + return err; +} + +static void nilfs_free_incomplete_logs(struct list_head *logs, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(logs); + if (nilfs->ns_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (atomic_read(&segbuf->sb_err)) { + /* Case 1: The first segment failed */ + if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) + /* Case 1a: Partial segment appended into an existing + segment */ + nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, + segbuf->sb_fseg_end); + else /* Case 1b: New full segment */ + set_nilfs_discontinued(nilfs); + } + + prev = segbuf; + list_for_each_entry_continue(segbuf, logs, sb_list) { + if (prev->sb_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (atomic_read(&segbuf->sb_err) && + segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(sufile, segbuf->sb_segnum); + prev = segbuf; + } +} + +static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + unsigned long live_blocks; + int ret; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + live_blocks = segbuf->sb_sum.nblocks + + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + live_blocks, + sci->sc_seg_ctime); + WARN_ON(ret); /* always succeed because 
the segusage is dirty */ + } +} + +static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(logs); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + segbuf->sb_pseg_start - + segbuf->sb_fseg_start, 0); + WARN_ON(ret); /* always succeed because the segusage is dirty */ + + list_for_each_entry_continue(segbuf, logs, sb_list) { + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + 0, 0); + WARN_ON(ret); /* always succeed */ + } +} + +static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *last, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf = last; + int ret; + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); + } + nilfs_truncate_logs(&sci->sc_segbufs, last); +} + + +static int nilfs_segctor_collect(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int mode) +{ + struct nilfs_cstage prev_stage = sci->sc_stage; + int err, nadd = 1; + + /* Collection retry loop */ + for (;;) { + sci->sc_nblk_this_inc = 0; + sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + err = nilfs_segctor_reset_segment_buffer(sci); + if (unlikely(err)) + goto failed; + + err = nilfs_segctor_collect_blocks(sci, mode); + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (!err) + break; + + if (unlikely(err != -E2BIG)) + goto failed; + + /* The current segment is filled up */ + if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + break; + + nilfs_clear_logs(&sci->sc_segbufs); + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(err); /* do not happen */ + } + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); + sci->sc_stage = prev_stage; + } + nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); + return 0; + + failed: + return err; +} + +static void nilfs_list_replace_buffer(struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); + + list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); + /* The caller must release old_bh */ +} + +static int +nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *segbuf, + int mode) +{ + struct inode *inode = NULL; + sector_t blocknr; + unsigned long nfinfo = segbuf->sb_sum.nfinfo; + unsigned long nblocks = 0, ndatablk = 0; + struct nilfs_sc_operations *sc_op = NULL; + struct nilfs_segsum_pointer ssp; + struct nilfs_finfo *finfo = NULL; + union nilfs_binfo binfo; + struct buffer_head *bh, *bh_org; + ino_t ino = 0; + int err = 0; + + if (!nfinfo) + goto out; + + blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; + ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + ssp.offset = sizeof(struct nilfs_segment_summary); + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + if (bh == segbuf->sb_super_root) + break; + if (!finfo) { + finfo = nilfs_segctor_map_segsum_entry( + sci, &ssp, sizeof(*finfo)); + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + + inode = 
bh->b_page->mapping->host; + + if (mode == SC_LSEG_DSYNC) + sc_op = &nilfs_sc_dsync_ops; + else if (ino == NILFS_DAT_INO) + sc_op = &nilfs_sc_dat_ops; + else /* file blocks */ + sc_op = &nilfs_sc_file_ops; + } + bh_org = bh; + get_bh(bh_org); + err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, + &binfo); + if (bh != bh_org) + nilfs_list_replace_buffer(bh_org, bh); + brelse(bh_org); + if (unlikely(err)) + goto failed_bmap; + + if (ndatablk > 0) + sc_op->write_data_binfo(sci, &ssp, &binfo); + else + sc_op->write_node_binfo(sci, &ssp, &binfo); + + blocknr++; + if (--nblocks == 0) { + finfo = NULL; + if (--nfinfo == 0) + break; + } else if (ndatablk > 0) + ndatablk--; + } + out: + return 0; + + failed_bmap: + return err; +} + +static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); + if (unlikely(err)) + return err; + nilfs_segbuf_fill_in_segsum(segbuf); + } + return 0; +} + +static void nilfs_begin_page_io(struct page *page) +{ + if (!page || PageWriteback(page)) + /* For split b-tree node pages, this function may be called + twice. We ignore the 2nd or later calls by this check. */ + return; + + lock_page(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); +} + +static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_begin_page_io(fs_page); + fs_page = bh->b_page; + } + } + } + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + nilfs_begin_page_io(fs_page); +} + +static int nilfs_segctor_write(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + int ret; + + ret = nilfs_write_logs(&sci->sc_segbufs, nilfs); + list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); + return ret; +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { + /* + * For b-tree node pages, this function may be called twice + * or more because they might be split in a segment. + */ + if (PageDirty(page)) { + /* + * For pages holding split b-tree node buffers, dirty + * flag on the buffers may be cleared discretely. + * In that case, the page is once redirtied for + * remaining buffers, and it must be cancelled if + * all the buffers get cleaned later. 
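+ * That cancellation is what the nilfs_page_buffers_clean() check below does.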
+ */ + lock_page(page); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + return; + } + + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + end_page_writeback(page); +} + +static void nilfs_abort_logs(struct list_head *logs, int err) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct buffer_head *bh; + + if (list_empty(logs)) + return; + + list_for_each_entry(segbuf, logs, sb_list) { + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, err); + fs_page = bh->b_page; + } + } + } + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, err); +} + +static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + LIST_HEAD(logs); + int ret; + + list_splice_tail_init(&sci->sc_write_logs, &logs); + ret = nilfs_wait_on_logs(&logs); + nilfs_abort_logs(&logs, ret ? : err); + + list_splice_tail_init(&sci->sc_segbufs, &logs); + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); + nilfs_free_incomplete_logs(&logs, nilfs); + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(ret); /* do not happen */ + } + + nilfs_destroy_logs(&logs); +} + +static void nilfs_set_next_segment(struct the_nilfs *nilfs, + struct nilfs_segment_buffer *segbuf) +{ + nilfs->ns_segnum = segbuf->sb_segnum; + nilfs->ns_nextnum = segbuf->sb_nextnum; + nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + + segbuf->sb_sum.nblocks; + nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; + nilfs->ns_ctime = segbuf->sb_sum.ctime; +} + +static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int update_sr = false; + + list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + /* + * We assume that the buffers which belong to the same page + * continue over the buffer list. + * Under this assumption, the last BHs of pages is + * identifiable by the discontinuity of bh->b_page + * (page != fs_page). + * + * For B-tree node blocks, however, this assumption is not + * guaranteed. The cleanup code of B-tree node pages needs + * special care. 
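+ * (nilfs_end_page_io() is written to tolerate being called more than once per page.)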
+ */ + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_delay(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_redirected(bh); + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + update_sr = true; + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, 0); + fs_page = bh->b_page; + } + } + + if (!nilfs_segbuf_simplex(segbuf)) { + if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { + set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + sci->sc_lseg_stime = jiffies; + } + if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) + clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + } + } + /* + * Since pages may continue over multiple segment buffers, + * end of the last page must be checked outside of the loop. + */ + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, 0); + + nilfs_drop_collected_inodes(&sci->sc_dirty_files); + + if (nilfs_doing_gc()) + nilfs_drop_collected_inodes(&sci->sc_gc_inodes); + else + nilfs->ns_nongc_ctime = sci->sc_seg_ctime; + + sci->sc_nblk_inc += sci->sc_nblk_this_inc; + + segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_set_next_segment(nilfs, segbuf); + + if (update_sr) { + nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); + set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + nilfs_segctor_clear_metadata_dirty(sci); + } else + clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); +} + +static int nilfs_segctor_wait(struct nilfs_sc_info *sci) +{ + int ret; + + ret = nilfs_wait_on_logs(&sci->sc_write_logs); + if (!ret) { + nilfs_segctor_complete_write(sci); + nilfs_destroy_logs(&sci->sc_write_logs); + } + return ret; +} + +static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + struct inode *ifile = sci->sc_root->ifile; + + spin_lock(&nilfs->ns_inode_lock); + retry: + list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) { + if (!ii->i_bh) { + struct buffer_head *ibh; + int err; + + spin_unlock(&nilfs->ns_inode_lock); + err = nilfs_ifile_get_inode_block( + ifile, ii->vfs_inode.i_ino, &ibh); + if (unlikely(err)) { + nilfs_warning(sci->sc_super, __func__, + "failed to get inode block.\n"); + return err; + } + mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(ifile); + spin_lock(&nilfs->ns_inode_lock); + if (likely(!ii->i_bh)) + ii->i_bh = ibh; + else + brelse(ibh); + goto retry; + } + + clear_bit(NILFS_I_QUEUED, &ii->i_state); + set_bit(NILFS_I_BUSY, &ii->i_state); + list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); + } + spin_unlock(&nilfs->ns_inode_lock); + + return 0; +} + +static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_inode_info *ii, *n; + + spin_lock(&nilfs->ns_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) + continue; + + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_move_tail(&ii->i_dirty, &ti->ti_garbage); + } + spin_unlock(&nilfs->ns_inode_lock); +} + +/* + * Main procedure of segment constructor + */ +static int 
nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int err; + + sci->sc_stage.scnt = NILFS_ST_INIT; + sci->sc_cno = nilfs->ns_cno; + + err = nilfs_segctor_collect_dirty_files(sci, nilfs); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + nilfs_segbuf_empty(sci->sc_curseg)) { + nilfs_segctor_abort_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_segctor_fill_in_file_bmap(sci); + + if (mode == SC_LSEG_SR && + sci->sc_stage.scnt >= NILFS_ST_CPFILE) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + nilfs_segctor_prepare_write(sci); + + nilfs_add_checksums_on_logs(&sci->sc_segbufs, + nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs); + if (unlikely(err)) + goto failed_to_write; + + if (sci->sc_stage.scnt == NILFS_ST_DONE || + nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { + /* + * At this point, we avoid double buffering + * for blocksize < pagesize because page dirty + * flag is turned off during write and dirty + * buffers are not properly collected for + * pages crossing over segments. + */ + err = nilfs_segctor_wait(sci); + if (err) + goto failed_to_write; + } + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_drop_written_files(sci, nilfs); + return err; + + failed_to_write: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_abort_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_segctor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore. + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer.expires = jiffies + sci->sc_interval; + add_timer(&sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. 
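+ * (data files share flush request bit 0; metadata files use the bit of their inode number)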
+ */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); + /* assign bit 0 to data files */ +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. 
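+ * + * A data-only log cannot always be used (e.g. when the inode metadata itself is dirty); in such cases a regular segment construction is performed instead.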
+ */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sb, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + nilfs_test_opt(nilfs, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(nilfs)) { + nilfs_transaction_unlock(sb); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&nilfs->ns_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&nilfs->ns_inode_lock); + nilfs_transaction_unlock(sb); + return 0; + } + spin_unlock(&nilfs->ns_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + + nilfs_transaction_unlock(sb); + return err; +} + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +/** + * nilfs_segctor_accept - record accepted sequence count of log-write requests + * @sci: segment constructor object + */ +static void nilfs_segctor_accept(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + sci->sc_seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + del_timer_sync(&sci->sc_timer); +} + +/** + * nilfs_segctor_notify - notify the result of request to caller threads + * @sci: segment constructor object + * @mode: mode of log forming + * @err: error code to be notified + */ +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + if (mode == SC_LSEG_SR) { + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + sci->sc_seq_done = sci->sc_seq_accepted; + nilfs_segctor_wakeup(sci, err); + sci->sc_flush_request = 0; + } else { + if (mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + /* re-enable timer if checkpoint creation was not done */ + if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer.expires)) + add_timer(&sci->sc_timer); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_segctor_construct - form logs and write them to disk + * @sci: segment constructor object + * @mode: mode of log forming + */ +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct nilfs_super_block **sbp; + int err = 0; + + nilfs_segctor_accept(sci); + + if (nilfs_discontinued(nilfs)) + mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) + err = nilfs_segctor_do_construct(sci, mode); + + if (likely(!err)) { + if (mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + err = -EIO; + sbp = nilfs_prepare_super(sci->sc_super, + nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + err = nilfs_commit_super(sci->sc_super, + NILFS_SB_COMMIT); + } + up_write(&nilfs->ns_sem); + } + } + + nilfs_segctor_notify(sci, mode, err); + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ 
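+ /* Timer callback: 'data' holds the segctord task to wake up */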
+ struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + list_del_init(&ii->i_dirty); + iput(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, + void **kbufs) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_transaction_info ti; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sb, &ti, 1); + + err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); + if (unlikely(err)) + goto out_unlock; + + err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); + if (unlikely(err)) { + nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); + goto out_unlock; + } + + sci->sc_freesegs = kbufs[4]; + sci->sc_nfreesegs = argv[4].v_nmembs; + list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); + + for (;;) { + err = nilfs_segctor_construct(sci, SC_LSEG_SR); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + if (nilfs_test_opt(nilfs, DISCARD)) { + int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, + sci->sc_nfreesegs); + if (ret) { + printk(KERN_WARNING + "NILFS warning: error %d on discard request, " + "turning discards off for the device\n", ret); + nilfs_clear_opt(nilfs, DISCARD); + } + } + + out_unlock: + sci->sc_freesegs = NULL; + sci->sc_nfreesegs = 0; + nilfs_mdt_clear_shadow_map(nilfs->ns_dat); + nilfs_transaction_unlock(sb); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_transaction_info ti; + + nilfs_transaction_lock(sci->sc_super, &ti, 0); + nilfs_segctor_construct(sci, mode); + + /* + * Unclosed segment should be retried. We do this using sc_timer. + * Timeout of sc_timer will invoke complete construction which leads + * to close the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sci->sc_super); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? + SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? + ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. 
+ * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int timeout = 0; + + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + refrigerator(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer.expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer.expires)); + + if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + + /* end sync. 
*/ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) + __acquires(&sci->sc_state_lock) + __releases(&sci->sc_state_lock) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, + struct nilfs_root *root) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_super = sb; + + nilfs_get_root(root); + sci->sc_root = root; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_write_logs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + init_timer(&sci->sc_timer); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (nilfs->ns_interval) + sci->sc_interval = HZ * nilfs->ns_interval; + if (nilfs->ns_watermark) + sci->sc_watermark = nilfs->ns_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + + /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_transaction_info ti; + + nilfs_transaction_lock(sci->sc_super, &ti, 0); + ret = nilfs_segctor_construct(sci, SC_LSEG_SR); + nilfs_transaction_unlock(sci->sc_super); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. 
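+ * (the semaphore is dropped temporarily while remaining logs are written out)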
+ */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int flag; + + up_write(&nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flag || !nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sci->sc_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); + } + + WARN_ON(!list_empty(&sci->sc_segbufs)); + WARN_ON(!list_empty(&sci->sc_write_logs)); + + nilfs_put_root(sci->sc_root); + + down_write(&nilfs->ns_segctor_sem); + + del_timer_sync(&sci->sc_timer); + kfree(sci); +} + +/** + * nilfs_attach_log_writer - attach log writer + * @sb: super block instance + * @root: root object of the current filesystem tree + * + * This allocates a log writer object, initializes it, and starts the + * log writer. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + if (nilfs->ns_writer) { + /* + * This happens if the filesystem was remounted + * read/write after nilfs_error degenerated it into a + * read-only mount. + */ + nilfs_detach_log_writer(sb); + } + + nilfs->ns_writer = nilfs_segctor_new(sb, root); + if (!nilfs->ns_writer) + return -ENOMEM; + + err = nilfs_segctor_start_thread(nilfs->ns_writer); + if (err) { + kfree(nilfs->ns_writer); + nilfs->ns_writer = NULL; + } + return err; +} + +/** + * nilfs_detach_log_writer - destroy log writer + * @sb: super block instance + * + * This kills log writer daemon, frees the log writer object, and + * destroys list of dirty files. + */ +void nilfs_detach_log_writer(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (nilfs->ns_writer) { + nilfs_segctor_destroy(nilfs->ns_writer); + nilfs->ns_writer = NULL; + } + + /* Force to free the list of dirty files */ + spin_lock(&nilfs->ns_inode_lock); + if (!list_empty(&nilfs->ns_dirty_files)) { + list_splice_init(&nilfs->ns_dirty_files, &garbage_list); + nilfs_warning(sb, __func__, + "Hit dirty file after stopped log writer\n"); + } + spin_unlock(&nilfs->ns_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(nilfs, &garbage_list, 1); +} diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h new file mode 100644 index 00000000..38a1d001 --- /dev/null +++ b/fs/nilfs2/segment.h @@ -0,0 +1,246 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include +#include +#include +#include +#include "nilfs.h" + +struct nilfs_root; + +/** + * struct nilfs_recovery_info - Recovery information + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be marked active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_root: root object of the current filesystem tree + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_freesegs: array of segment numbers to be freed + * @sc_nfreesegs: number of segments on @sc_freesegs + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_write_logs: List of segment buffers to hold logs under writing + * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
+ * @sc_curseg: Current segment buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_cno: checkpoint number of current log + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_accept: Accepted request count + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_root *sc_root; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + + __u64 *sc_freesegs; + size_t sc_nfreesegs; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + struct list_head sc_write_logs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; + time_t sc_seg_ctime; + __u64 sc_cno; + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_accepted; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. 
+ It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. + */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + +/* super.c */ +extern struct kmem_cache *nilfs_transaction_cachep; + +/* segment.c */ +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, + void **); + +int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root); +void nilfs_detach_log_writer(struct super_block *sb); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, + struct nilfs_recovery_info *); +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, + struct nilfs_recovery_info *ri); +extern void nilfs_dispose_segment_list(struct list_head *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c new file mode 100644 index 00000000..0a0aba61 --- /dev/null +++ b/fs/nilfs2/sufile.c @@ -0,0 +1,921 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + * Revised by Ryusuke Konishi . 
+ */ + +#include +#include +#include +#include +#include +#include +#include "mdt.h" +#include "sufile.h" + + +struct nilfs_sufile_info { + struct nilfs_mdt_info mi; + unsigned long ncleansegs;/* number of clean segments */ + __u64 allocmin; /* lower limit of allocatable segment range */ + __u64 allocmax; /* upper limit of allocatable segment range */ +}; + +static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) +{ + return (struct nilfs_sufile_info *)NILFS_MDT(sufile); +} + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, + __u64 segnum) +{ + return nilfs_mdt_delete_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum)); +} + +static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, + u64 ncleanadd, u64 ndirtyadd) +{ + struct nilfs_sufile_header *header; + void *kaddr; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, ncleanadd); + le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(header_bh); +} + +/** + * nilfs_sufile_get_ncleansegs - return the number of clean segments + * @sufile: inode of segment usage file + */ +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) +{ + return NILFS_SUI(sufile)->ncleansegs; +} + +/** + * nilfs_sufile_updatev - modify multiple segment usages at a time + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @create: creation flag + * @ndone: place to store number of modified segments on @segnumv + * @dofunc: primitive operation for the update + * + * Description: nilfs_sufile_updatev() repeatedly calls @dofunc + * against the given array of segments. The @dofunc is called with + * buffers of a header block and the sufile block in which the target + * segment usage entry is contained. 
If @ndone is given, the number + * of successfully modified segments from the head is stored in the + * place @ndone points to. + * + * Return Value: On success, zero is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Given segment usage is in hole block (may be returned if + * @create is zero) + * + * %-EINVAL - Invalid segment usage number + */ +int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, + int create, size_t *ndone, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + unsigned long blkoff, prev_blkoff; + __u64 *seg; + size_t nerr = 0, n = 0; + int ret = 0; + + if (unlikely(nsegs == 0)) + goto out; + + down_write(&NILFS_MDT(sufile)->mi_sem); + for (seg = segnumv; seg < segnumv + nsegs; seg++) { + if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING + "%s: invalid segment number: %llu\n", __func__, + (unsigned long long)*seg); + nerr++; + } + } + if (nerr > 0) { + ret = -EINVAL; + goto out_sem; + } + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + seg = segnumv; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + dofunc(sufile, *seg, header_bh, bh); + + if (++seg >= segnumv + nsegs) + break; + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + brelse(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (unlikely(ret < 0)) + goto out_header; + } + brelse(bh); + + out_header: + n = seg - segnumv; + brelse(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + out: + if (ndone) + *ndone = n; + return ret; +} + +int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); + if (!ret) { + dofunc(sufile, segnum, header_bh, bh); + brelse(bh); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_set_alloc_range - limit range of segment to be allocated + * @sufile: inode of segment usage file + * @start: minimum segment number of allocatable region (inclusive) + * @end: maximum segment number of allocatable region (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. 
+ * + * %-ERANGE - invalid segment region + */ +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + __u64 nsegs; + int ret = -ERANGE; + + down_write(&NILFS_MDT(sufile)->mi_sem); + nsegs = nilfs_sufile_get_nsegments(sufile); + + if (start <= end && end < nsegs) { + sui->allocmin = start; + sui->allocmax = end; + ret = 0; + } + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus, cnt; + int ret, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr, KM_USER0); + + nsegments = nilfs_sufile_get_nsegments(sufile); + maxsegnum = sui->allocmax; + segnum = last_alloc + 1; + if (segnum < sui->allocmin || segnum > sui->allocmax) + segnum = sui->allocmin; + + for (cnt = 0; cnt < nsegments; cnt += nsus) { + if (segnum > maxsegnum) { + if (cnt < sui->allocmax - sui->allocmin + 1) { + /* + * wrap around in the limited region. + * if allocation started from + * sui->allocmin, this never happens. 
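+ *
+ * For example, with allocmin = 100, allocmax = 199 and
+ * last_alloc = 150, segments 151..199 are scanned first;
+ * this branch then restarts the scan at segment 100 and
+ * lets it run up to 150 (the old last_alloc).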
+ */ + segnum = sui->allocmin; + maxsegnum = last_alloc; + } else if (segnum > sui->allocmin && + sui->allocmax + 1 < nsegments) { + segnum = sui->allocmax + 1; + maxsegnum = nsegments - 1; + } else if (sui->allocmin > 0) { + segnum = 0; + maxsegnum = sui->allocmin - 1; + } else { + break; /* never happens */ + } + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr, KM_USER0); + + sui->ncleansegs--; + mark_buffer_dirty(header_bh); + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, -1, 1); + NILFS_SUI(sufile)->ncleansegs--; + + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int clean, dirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + su->su_nblocks == cpu_to_le32(0)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + clean = nilfs_segment_usage_clean(su); + dirty = nilfs_segment_usage_dirty(su); + + /* make the segment garbage */ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 
0 : 1); + NILFS_SUI(sufile)->ncleansegs -= clean; + + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int sudirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_clean(su)) { + printk(KERN_WARNING "%s: segment %llu is already clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + WARN_ON(nilfs_segment_usage_error(su)); + WARN_ON(!nilfs_segment_usage_dirty(su)); + + sudirty = nilfs_segment_usage_dirty(su); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + mark_buffer_dirty(su_bh); + + nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + NILFS_SUI(sufile)->ncleansegs++; + + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty + * @sufile: inode of segment usage file + * @segnum: segment number + */ +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *bh; + int ret; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (!ret) { + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); + } + return ret; +} + +/** + * nilfs_sufile_set_segment_usage - set usage of a segment + * @sufile: inode of segment usage file + * @segnum: segment number + * @nblocks: number of live blocks in the segment + * @modtime: modification time (option) + */ +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + if (modtime) + su->su_lastmod = cpu_to_le64(modtime); + su->su_nblocks = cpu_to_le32(nblocks); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @stat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
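+ *
+ * For illustration only (the caller and variable names here are
+ * hypothetical, not taken from this file), a typical use looks like:
+ *
+ *     struct nilfs_sustat sustat;
+ *     int err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
+ *
+ *     if (!err)
+ *             printk(KERN_INFO "NILFS: %llu clean / %llu dirty segments\n",
+ *                    (unsigned long long)sustat.ss_ncleansegs,
+ *                    (unsigned long long)sustat.ss_ndirtysegs);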
+ */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int suclean; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + suclean = nilfs_segment_usage_clean(su); + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr, KM_USER0); + + if (suclean) { + nilfs_sufile_mod_counter(header_bh, -1, 0); + NILFS_SUI(sufile)->ncleansegs--; + } + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ +static int nilfs_sufile_truncate_range(struct inode *sufile, + __u64 start, __u64 end) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su, *su2; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + unsigned long segusages_per_block; + unsigned long nsegs, ncleaned; + __u64 segnum; + void *kaddr; + ssize_t n, nc; + int ret; + int j; + + nsegs = nilfs_sufile_get_nsegments(sufile); + + ret = -EINVAL; + if (start > end || start >= nsegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + ncleaned = 0; + + for (segnum = start; segnum <= end; segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + end - segnum + 1); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_header; + /* hole */ + continue; + } + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + su2 = su; + for (j = 0; j < n; j++, su = (void *)su + susz) { + if ((le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + nilfs_segment_is_active(nilfs, segnum + j)) { + ret = -EBUSY; + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + goto out_header; + } + } + nc = 0; + for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { + if (nilfs_segment_usage_error(su)) { + nilfs_segment_usage_set_clean(su); + nc++; + } + } + kunmap_atomic(kaddr, KM_USER0); + if (nc > 0) { + mark_buffer_dirty(su_bh); + ncleaned += nc; + } + brelse(su_bh); + + if (n == segusages_per_block) { + /* make hole */ + nilfs_sufile_delete_segment_usage_block(sufile, segnum); + } + } + ret = 0; + +out_header: + if (ncleaned > 0) { + NILFS_SUI(sufile)->ncleansegs += ncleaned; + nilfs_sufile_mod_counter(header_bh, ncleaned, 0); + nilfs_mdt_mark_dirty(sufile); + } + brelse(header_bh); +out: + return ret; +} + +/** + * nilfs_sufile_resize - resize segment array + * @sufile: inode of segment usage file + * @newnsegs: new number of segments + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-ENOSPC - Enough free space is not left for shrinking + * + * %-EBUSY - Dirty or active segments exist in the region to be truncated + */ +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + void *kaddr; + unsigned long nsegs, nrsvsegs; + int ret = 0; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nsegs = nilfs_sufile_get_nsegments(sufile); + if (nsegs == newnsegs) + goto out; + + ret = -ENOSPC; + nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); + if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + if (newnsegs > nsegs) { + sui->ncleansegs += newnsegs - nsegs; + } else /* newnsegs < nsegs */ { + ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); + if (ret < 0) + goto out_header; + + sui->ncleansegs -= nsegs - newnsegs; + } + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + nilfs_set_nsegments(nilfs, newnsegs); + +out_header: + brelse(header_bh); +out: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_suinfo - + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @buf: array of suinfo + * @sisz: byte size of suinfo + * @nsi: size of suinfo array + * + * Description: + * + * Return Value: On success, 0 is returned and .... On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
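+ *
+ * Note: as the function body below shows, the value returned on success
+ * is the number of segment usage entries stored in @buf (at most @nsi),
+ * not zero.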
+ */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, + unsigned sisz, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + struct nilfs_suinfo *si = buf; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + memset(si, 0, sisz * n); + si = (void *)si + sisz * n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; + j++, su = (void *)su + susz, si = (void *)si + sisz) { + si->sui_lastmod = le64_to_cpu(su->su_lastmod); + si->sui_nblocks = le32_to_cpu(su->su_nblocks); + si->sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + j)) + si->sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_read - read or get sufile inode + * @sb: super block instance + * @susize: size of a segment usage entry + * @raw_inode: on-disk sufile inode + * @inodep: buffer to store the inode + */ +int nilfs_sufile_read(struct super_block *sb, size_t susize, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + struct inode *sufile; + struct nilfs_sufile_info *sui; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + void *kaddr; + int err; + + sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); + if (unlikely(!sufile)) + return -ENOMEM; + if (!(sufile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); + if (err) + goto failed; + + nilfs_mdt_set_entry_size(sufile, susize, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_read_inode_common(sufile, raw_inode); + if (err) + goto failed; + + err = nilfs_sufile_get_header_block(sufile, &header_bh); + if (err) + goto failed; + + sui = NILFS_SUI(sufile); + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; + sui->allocmin = 0; + + unlock_new_inode(sufile); + out: + *inodep = sufile; + return 0; + failed: + iget_failed(sufile); + return err; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h new file mode 100644 index 00000000..e84bc5b5 --- /dev/null +++ b/fs/nilfs2/sufile.h @@ -0,0 +1,144 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include +#include +#include +#include "mdt.h" + + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments; +} + +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); + +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, + size_t); + +int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +int nilfs_sufile_update(struct inode *, __u64, int, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); + +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); +int nilfs_sufile_read(struct super_block *sb, size_t susize, + struct nilfs_inode *raw_inode, struct inode **inodep); + +/** + * nilfs_sufile_scrap - make a segment garbage + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap); +} + +/** + * nilfs_sufile_free - free segment + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_freev - free segments + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of freed segments + */ +static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, + size_t nsegs, size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_cancel_freev - reallocate freeing segments + * @sufile: inode of segment usage 
file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of cancelled segments + * + * Return Value: On success, 0 is returned. On error, a negative error codes + * is returned. + */ +static inline int nilfs_sufile_cancel_freev(struct inode *sufile, + __u64 *segnumv, size_t nsegs, + size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_cancel_free); +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_set_error); +} + +#endif /* _NILFS_SUFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c new file mode 100644 index 00000000..8351c44a --- /dev/null +++ b/fs/nilfs2/super.c @@ -0,0 +1,1462 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "export.h" +#include "mdt.h" +#include "alloc.h" +#include "btree.h" +#include "btnode.h" +#include "page.h" +#include "cpfile.h" +#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *nilfs_inode_cachep; +struct kmem_cache *nilfs_transaction_cachep; +struct kmem_cache *nilfs_segbuf_cachep; +struct kmem_cache *nilfs_btree_path_cache; + +static int nilfs_setup_super(struct super_block *sb, int is_mount); +static int nilfs_remount(struct super_block *sb, int *flags, char *data); + +static void nilfs_set_error(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + if (sbp[1]) + sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + } + up_write(&nilfs->ns_sem); +} + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * incoherences or defects of meta data on disk. As for sustainable + * errors such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it can + * kill itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + nilfs_set_error(sb); + + if (nilfs_test_opt(nilfs, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(nilfs, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); +} + + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->i_cno = 0; + ii->vfs_inode.i_version = 1; + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + return &ii->vfs_inode; +} + +static void nilfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + + if (mdi) { + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + } + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +void nilfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nilfs_i_callback); +} + +static int nilfs_sync_super(struct super_block *sb, int flag) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + if (nilfs_test_opt(nilfs, BARRIER)) { + err = __sync_dirty_buffer(nilfs->ns_sbh[0], + WRITE_SYNC | WRITE_FLUSH_FUA); + } else { + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + } + + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + /* + * sbp[0] points to newer log than sbp[1], + * so copy sbp[0] to sbp[1] to take over sbp[0]. + */ + memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], + nilfs->ns_sbsize); + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbwcount++; + + /* + * The latest segment becomes trailable from the position + * written in superblock. + */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + if (flag == NILFS_SB_COMMIT_ALL) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) + goto out; + } + if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < + le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) + sbp = nilfs->ns_sbp[1]; + } + + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + out: + return err; +} + +void nilfs_set_log_cursor(struct nilfs_super_block *sbp, + struct the_nilfs *nilfs) +{ + sector_t nfreeblocks; + + /* nilfs->ns_sem must be locked by the caller. */ + nilfs_count_free_blocks(nilfs, &nfreeblocks); + sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); + + spin_lock(&nilfs->ns_last_segment_lock); + sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); +} + +struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, + int flip) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + /* nilfs->ns_sem must be locked by the caller. 
*/ + if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + if (sbp[1] && + sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + } else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sb->s_id); + return NULL; + } + } else if (sbp[1] && + sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + } + + if (flip && sbp[1]) + nilfs_swap_super_block(nilfs); + + return sbp; +} + +int nilfs_commit_super(struct super_block *sb, int flag) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + time_t t; + + /* nilfs->ns_sem must be locked by the caller. */ + t = get_seconds(); + nilfs->ns_sbwtime = t; + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { + sbp[1]->s_wtime = sbp[0]->s_wtime; + sbp[1]->s_sum = 0; + sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[1], + nilfs->ns_sbsize)); + } + clear_nilfs_sb_dirty(nilfs); + return nilfs_sync_super(sb, flag); +} + +/** + * nilfs_cleanup_super() - write filesystem state for cleanup + * @sb: super block instance to be unmounted or degraded to read-only + * + * This function restores state flags in the on-disk super block. + * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the + * filesystem was not clean previously. + */ +int nilfs_cleanup_super(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int flag = NILFS_SB_COMMIT; + int ret = -EIO; + + sbp = nilfs_prepare_super(sb, 0); + if (sbp) { + sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_set_log_cursor(sbp[0], nilfs); + if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { + /* + * make the "clean" flag also to the opposite + * super block if both super blocks point to + * the same checkpoint. + */ + sbp[1]->s_state = sbp[0]->s_state; + flag = NILFS_SB_COMMIT_ALL; + } + ret = nilfs_commit_super(sb, flag); + } + return ret; +} + +/** + * nilfs_move_2nd_super - relocate secondary super block + * @sb: super block instance + * @sb2off: new offset of the secondary super block (in bytes) + */ +static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *nsbh; + struct nilfs_super_block *nsbp; + sector_t blocknr, newblocknr; + unsigned long offset; + int sb2i = -1; /* array index of the secondary superblock */ + int ret = 0; + + /* nilfs->ns_sem must be locked by the caller. 
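+ * (nilfs_resize_fs() below takes ns_sem for writing around this call.)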
*/ + if (nilfs->ns_sbh[1] && + nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 1; + blocknr = nilfs->ns_sbh[1]->b_blocknr; + } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 0; + blocknr = nilfs->ns_sbh[0]->b_blocknr; + } + if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) + goto out; /* super block location is unchanged */ + + /* Get new super block buffer */ + newblocknr = sb2off >> nilfs->ns_blocksize_bits; + offset = sb2off & (nilfs->ns_blocksize - 1); + nsbh = sb_getblk(sb, newblocknr); + if (!nsbh) { + printk(KERN_WARNING + "NILFS warning: unable to move secondary superblock " + "to block %llu\n", (unsigned long long)newblocknr); + ret = -EIO; + goto out; + } + nsbp = (void *)nsbh->b_data + offset; + memset(nsbp, 0, nilfs->ns_blocksize); + + if (sb2i >= 0) { + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + brelse(nilfs->ns_sbh[sb2i]); + nilfs->ns_sbh[sb2i] = nsbh; + nilfs->ns_sbp[sb2i] = nsbp; + } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { + /* secondary super block will be restored to index 1 */ + nilfs->ns_sbh[1] = nsbh; + nilfs->ns_sbp[1] = nsbp; + } else { + brelse(nsbh); + } +out: + return ret; +} + +/** + * nilfs_resize_fs - resize the filesystem + * @sb: super block instance + * @newsize: new size of the filesystem (in bytes) + */ +int nilfs_resize_fs(struct super_block *sb, __u64 newsize) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + __u64 devsize, newnsegs; + loff_t sb2off; + int ret; + + ret = -ERANGE; + devsize = i_size_read(sb->s_bdev->bd_inode); + if (newsize > devsize) + goto out; + + /* + * Write lock is required to protect some functions depending + * on the number of segments, the number of reserved segments, + * and so forth. + */ + down_write(&nilfs->ns_segctor_sem); + + sb2off = NILFS_SB2_OFFSET_BYTES(newsize); + newnsegs = sb2off >> nilfs->ns_blocksize_bits; + do_div(newnsegs, nilfs->ns_blocks_per_segment); + + ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); + up_write(&nilfs->ns_segctor_sem); + if (ret < 0) + goto out; + + ret = nilfs_construct_segment(sb); + if (ret < 0) + goto out; + + down_write(&nilfs->ns_sem); + nilfs_move_2nd_super(sb, sb2off); + ret = -EIO; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + /* + * Drop NILFS_RESIZE_FS flag for compatibility with + * mount-time resize which may be implemented in a + * future release. + */ + sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & + ~NILFS_RESIZE_FS); + sbp[0]->s_dev_size = cpu_to_le64(newsize); + sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + up_write(&nilfs->ns_sem); + + /* + * Reset the range of allocatable segments last. This order + * is important in the case of expansion because the secondary + * superblock must be protected from log write until migration + * completes. 
+ */ + if (!ret) + nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); +out: + return ret; +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + nilfs_detach_log_writer(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + } + + iput(nilfs->ns_sufile); + iput(nilfs->ns_cpfile); + iput(nilfs->ns_dat); + + destroy_nilfs(nilfs); + sb->s_fs_info = NULL; +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int err = 0; + + /* This function is called when super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (nilfs_sb_dirty(nilfs)) { + sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + nilfs_commit_super(sb, NILFS_SB_COMMIT); + } + } + up_write(&nilfs->ns_sem); + + return err; +} + +int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, + struct nilfs_root **rootp) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err = -ENOMEM; + + root = nilfs_find_or_create_root( + nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno); + if (!root) + return err; + + if (root->ifile) + goto reuse; /* already attached checkpoint */ + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + up_read(&nilfs->ns_segctor_sem); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + + err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, + &raw_cp->cp_ifile_inode, &root->ifile); + if (err) + goto failed_bh; + + atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); + atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + + reuse: + *rootp = root; + return 0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_put_root(root); + + return err; +} + +static int nilfs_freeze(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + if (sb->s_flags & MS_RDONLY) + return 0; + + /* Mark super block clean */ + down_write(&nilfs->ns_sem); + err = nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_unfreeze(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return 0; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, false); + up_write(&nilfs->ns_sem); + return 0; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct the_nilfs *nilfs = root->nilfs; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before first segment and after last segment + * are excluded. 
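+ * That is, f_blocks below works out to
+ * ns_blocks_per_segment * ns_nsegments - ns_first_data_block,
+ * with no extra metadata overhead subtracted (overhead is 0 here).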
+ */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing meta data blocks outside segment structure, + * We must count them as the overhead. + */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = atomic_read(&root->inodes_count); + buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_namelen = NILFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; + + if (!nilfs_test_opt(nilfs, BARRIER)) + seq_puts(seq, ",nobarrier"); + if (root->cno != NILFS_CPTREE_CURRENT_CNO) + seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); + if (nilfs_test_opt(nilfs, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (nilfs_test_opt(nilfs, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (nilfs_test_opt(nilfs, STRICT_ORDER)) + seq_puts(seq, ",order=strict"); + if (nilfs_test_opt(nilfs, NORECOVERY)) + seq_puts(seq, ",norecovery"); + if (nilfs_test_opt(nilfs, DISCARD)) + seq_puts(seq, ",discard"); + + return 0; +} + +static const struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + /* .write_inode = nilfs_write_inode, */ + /* .put_inode = nilfs_put_inode, */ + /* .drop_inode = nilfs_drop_inode, */ + .evict_inode = nilfs_evict_inode, + .put_super = nilfs_put_super, + /* .write_super = nilfs_write_super, */ + .sync_fs = nilfs_sync_fs, + .freeze_fs = nilfs_freeze, + .unfreeze_fs = nilfs_unfreeze, + /* .write_super_lockfs */ + /* .unlockfs */ + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + /* .umount_begin */ + .show_options = nilfs_show_options +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, Opt_nodiscard, Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, int is_remount) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + nilfs_set_opt(nilfs, BARRIER); + break; + case Opt_nobarrier: + nilfs_clear_opt(nilfs, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(nilfs, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order 
semantics */ + nilfs_set_opt(nilfs, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (is_remount) { + printk(KERN_ERR + "NILFS: \"%s\" option is invalid " + "for remount.\n", p); + return 0; + } + break; + case Opt_norecovery: + nilfs_set_opt(nilfs, NORECOVERY); + break; + case Opt_discard: + nilfs_set_opt(nilfs, DISCARD); + break; + case Opt_nodiscard: + nilfs_clear_opt(nilfs, DISCARD); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + nilfs->ns_mount_opt = + NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct super_block *sb, int is_mount) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int max_mnt_count; + int mnt_count; + + /* nilfs->ns_sem must be locked by the caller. */ + sbp = nilfs_prepare_super(sb, 0); + if (!sbp) + return -EIO; + + if (!is_mount) + goto skip_mount_setup; + + max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); + mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); + + if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + +skip_mount_setup: + sbp[0]->s_state = + cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); + /* synchronize sbp[1] with sbp[0] */ + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + nilfs_set_default_options(sb, sbp); + + nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); + nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); + nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); + nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb, 0) ? 
-EINVAL : 0 ; +} + +int nilfs_check_feature_compatibility(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + __u64 features; + + features = le64_to_cpu(sbp->s_feature_incompat) & + ~NILFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + features = le64_to_cpu(sbp->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "NILFS: couldn't mount RDWR because of " + "unsupported optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + return 0; +} + +static int nilfs_get_root_dentry(struct super_block *sb, + struct nilfs_root *root, + struct dentry **root_dentry) +{ + struct inode *inode; + struct dentry *dentry; + int ret = 0; + + inode = nilfs_iget(sb, root, NILFS_ROOT_INO); + if (IS_ERR(inode)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + ret = PTR_ERR(inode); + goto out; + } + if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { + iput(inode); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + ret = -EINVAL; + goto out; + } + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + dentry = d_find_alias(inode); + if (!dentry) { + dentry = d_alloc_root(inode); + if (!dentry) { + iput(inode); + ret = -ENOMEM; + goto failed_dentry; + } + } else { + iput(inode); + } + } else { + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto failed_dentry; + } + } + *root_dentry = dentry; + out: + return ret; + + failed_dentry: + printk(KERN_ERR "NILFS: get root dentry failed\n"); + goto out; +} + +static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, + struct dentry **root_dentry) +{ + struct the_nilfs *nilfs = s->s_fs_info; + struct nilfs_root *root; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) { + ret = (ret == -ENOENT) ? -EINVAL : ret; + goto out; + } else if (!ret) { + printk(KERN_ERR "NILFS: The specified checkpoint is " + "not a snapshot (checkpoint number=%llu).\n", + (unsigned long long)cno); + ret = -EINVAL; + goto out; + } + + ret = nilfs_attach_checkpoint(s, cno, false, &root); + if (ret) { + printk(KERN_ERR "NILFS: error loading snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)cno); + goto out; + } + ret = nilfs_get_root_dentry(s, root, root_dentry); + nilfs_put_root(root); + out: + return ret; +} + +static int nilfs_tree_was_touched(struct dentry *root_dentry) +{ + return root_dentry->d_count > 1; +} + +/** + * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint + * @root_dentry: root dentry of the tree to be shrunk + * + * This function returns true if the tree was in-use. 
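+ * + * If the tree has submounts it is left untouched and reported as busy; + * otherwise unused dentries are pruned with shrink_dcache_parent() and + * the root dentry's reference count is checked again.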
+ */ +static int nilfs_try_to_shrink_tree(struct dentry *root_dentry) +{ + if (have_submounts(root_dentry)) + return true; + shrink_dcache_parent(root_dentry); + return nilfs_tree_was_touched(root_dentry); +} + +int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root; + struct inode *inode; + struct dentry *dentry; + int ret; + + if (cno < 0 || cno > nilfs->ns_cno) + return false; + + if (cno >= nilfs_last_cno(nilfs)) + return true; /* protect recent checkpoints */ + + ret = false; + root = nilfs_lookup_root(nilfs, cno); + if (root) { + inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); + if (inode) { + dentry = d_find_alias(inode); + if (dentry) { + if (nilfs_tree_was_touched(dentry)) + ret = nilfs_try_to_shrink_tree(dentry); + dput(dentry); + } + iput(inode); + } + nilfs_put_root(root); + } + return ret; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * + * This function is called exclusively by nilfs->ns_mount_mutex. + * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct the_nilfs *nilfs; + struct nilfs_root *fsroot; + struct backing_dev_info *bdi; + __u64 cno; + int err; + + nilfs = alloc_nilfs(sb->s_bdev); + if (!nilfs) + return -ENOMEM; + + sb->s_fs_info = nilfs; + + err = init_nilfs(nilfs, sb, (char *)data); + if (err) + goto failed_nilfs; + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + sb->s_bdi = bdi ? : &default_backing_dev_info; + + err = load_nilfs(nilfs, sb); + if (err) + goto failed_nilfs; + + cno = nilfs_last_cno(nilfs); + err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); + if (err) { + printk(KERN_ERR "NILFS: error loading last checkpoint " + "(checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_unload; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_log_writer(sb, fsroot); + if (err) + goto failed_checkpoint; + } + + err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root); + if (err) + goto failed_segctor; + + nilfs_put_root(fsroot); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, true); + up_write(&nilfs->ns_sem); + } + + return 0; + + failed_segctor: + nilfs_detach_log_writer(sb); + + failed_checkpoint: + nilfs_put_root(fsroot); + + failed_unload: + iput(nilfs->ns_sufile); + iput(nilfs->ns_cpfile); + iput(nilfs->ns_dat); + + failed_nilfs: + destroy_nilfs(nilfs); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + unsigned long old_sb_flags; + unsigned long old_mount_opt; + int err; + + old_sb_flags = sb->s_flags; + old_mount_opt = nilfs->ns_mount_opt; + + if (!parse_options(data, sb, 1)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + + err = -EINVAL; + + if (!nilfs_valid_fs(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the filesystem is in an " + "incomplete recovery state.\n", sb->s_id); + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down log writer */ + nilfs_detach_log_writer(sb); + sb->s_flags |= MS_RDONLY; + + /* + * 
Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + } else { + __u64 features; + struct nilfs_root *root; + + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) + */ + down_read(&nilfs->ns_sem); + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + up_read(&nilfs->ns_sem); + if (features) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount RDWR because of unsupported optional " + "features (%llx)\n", + sb->s_id, (unsigned long long)features); + err = -EROFS; + goto restore_opts; + } + + sb->s_flags &= ~MS_RDONLY; + + root = NILFS_I(sb->s_root->d_inode)->i_root; + err = nilfs_attach_log_writer(sb, root); + if (err) + goto restore_opts; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, true); + up_write(&nilfs->ns_sem); + } + out: + return 0; + + restore_opts: + sb->s_flags = old_sb_flags; + nilfs->ns_mount_opt = old_mount_opt; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) { + ret++; + } else { + sd->cno = simple_strtoull(args[0].from, + NULL, 0); + /* + * No need to see the end pointer; + * match_token() has done syntax + * checking. 
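+ * A checkpoint number of zero is rejected below; cno 0 is reserved + * for the current tree (NILFS_CPTREE_CURRENT_CNO).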
+ */ + if (sd->cno == 0) + ret++; + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +static struct dentry * +nilfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct nilfs_super_data sd; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + struct dentry *root_dentry; + int err, s_new = false; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(sd.bdev)) + return ERR_CAST(sd.bdev); + + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&sd.bdev->bd_fsfreeze_mutex); + if (sd.bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); + err = -EBUSY; + goto failed; + } + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev); + mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) { + err = PTR_ERR(s); + goto failed; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s_new = true; + + /* New superblock instance created */ + s->s_flags = flags; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto failed_super; + + s->s_flags |= MS_ACTIVE; + } else if (!sd.cno) { + int busy = false; + + if (nilfs_tree_was_touched(s->s_root)) { + busy = nilfs_try_to_shrink_tree(s->s_root); + if (busy && (flags ^ s->s_flags) & MS_RDONLY) { + printk(KERN_ERR "NILFS: the device already " + "has a %s mount.\n", + (s->s_flags & MS_RDONLY) ? + "read-only" : "read/write"); + err = -EBUSY; + goto failed_super; + } + } + if (!busy) { + /* + * Try remount to setup mount states if the current + * tree is not mounted and only snapshots use this sb. 
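+ * (nilfs_remount() switches MS_RDONLY and starts or stops the log + * writer as needed.)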
+ */ + err = nilfs_remount(s, &flags, data); + if (err) + goto failed_super; + } + } + + if (sd.cno) { + err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); + if (err) + goto failed_super; + } else { + root_dentry = dget(s->s_root); + } + + if (!s_new) + blkdev_put(sd.bdev, mode); + + return root_dentry; + + failed_super: + deactivate_locked_super(s); + + failed: + if (!s_new) + blkdev_put(sd.bdev, mode); + return ERR_PTR(err); +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .mount = nilfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void nilfs_inode_init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + address_space_init_once(&ii->i_btnode_cache); + ii->i_bmap = &ii->i_bmap_data; + inode_init_once(&ii->vfs_inode); +} + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +static void nilfs_destroy_cachep(void) +{ + if (nilfs_inode_cachep) + kmem_cache_destroy(nilfs_inode_cachep); + if (nilfs_transaction_cachep) + kmem_cache_destroy(nilfs_transaction_cachep); + if (nilfs_segbuf_cachep) + kmem_cache_destroy(nilfs_segbuf_cachep); + if (nilfs_btree_path_cache) + kmem_cache_destroy(nilfs_btree_path_cache); +} + +static int __init nilfs_init_cachep(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + if (!nilfs_inode_cachep) + goto fail; + + nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!nilfs_transaction_cachep) + goto fail; + + nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); + if (!nilfs_segbuf_cachep) + goto fail; + + nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, + 0, 0, NULL); + if (!nilfs_btree_path_cache) + goto fail; + + return 0; + +fail: + nilfs_destroy_cachep(); + return -ENOMEM; +} + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_cachep(); + if (err) + goto fail; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto free_cachep; + + printk(KERN_INFO "NILFS version 2 loaded\n"); + return 0; + +free_cachep: + nilfs_destroy_cachep(); +fail: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_cachep(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c new file mode 100644 index 00000000..35a89708 --- /dev/null +++ b/fs/nilfs2/the_nilfs.c @@ -0,0 +1,784 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "segbuf.h" + + +static int nilfs_valid_sb(struct nilfs_super_block *sbp); + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + + if (!nilfs_sb_dirty(nilfs)) { + if (nilfs->ns_prev_seq == nilfs->ns_last_seq) + goto stay_cursor; + + set_nilfs_sb_dirty(nilfs); + } + nilfs->ns_prev_seq = nilfs->ns_last_seq; + + stay_cursor: + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate a nilfs object + * @bdev: block device to which the_nilfs is related + * + * Return Value: On success, pointer to the_nilfs is returned. + * On error, NULL is returned. + */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + INIT_LIST_HEAD(&nilfs->ns_dirty_files); + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + spin_lock_init(&nilfs->ns_inode_lock); + spin_lock_init(&nilfs->ns_next_gen_lock); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_cptree = RB_ROOT; + spin_lock_init(&nilfs->ns_cptree_lock); + init_rwsem(&nilfs->ns_segctor_sem); + + return nilfs; +} + +/** + * destroy_nilfs - destroy nilfs object + * @nilfs: nilfs object to be released + */ +void destroy_nilfs(struct the_nilfs *nilfs) +{ + might_sleep(); + if (nilfs_init(nilfs)) { + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct super_block *sb, sector_t sr_block) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_inode *rawi; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size); + err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat); + if (err) + goto failed; + + rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size); + err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile); + if (err) + goto failed_dat; + + rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size); + err = nilfs_sufile_read(sb, segment_usage_size, rawi, + &nilfs->ns_sufile); + if (err) + goto failed_cpfile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + 
return err; + + failed_cpfile: + iput(nilfs->ns_cpfile); + + failed_dat: + iput(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * nilfs_store_log_cursor - load log cursor from a super block + * @nilfs: nilfs object + * @sbp: buffer storing super block to be read + * + * nilfs_store_log_cursor() reads the last position of the log + * containing a super root from a given super block, and initializes + * relevant information on the nilfs object preparatory for log + * scanning and recovery. + */ +static int nilfs_store_log_cursor(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + int ret = 0; + + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_prev_seq = nilfs->ns_last_seq; + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + ret = -EINVAL; + } + return ret; +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be loaded + * @sb: super block instance used to recover past segment + * + * load_nilfs() searches and loads the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must call this exclusively for simultaneous mounts. + */ +int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sb->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + int valid_fs = nilfs_valid_fs(nilfs); + int err; + + if (!valid_fs) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + } + } + + nilfs_init_recovery_info(&ri); + + err = nilfs_search_super_root(nilfs, &ri); + if (unlikely(err)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + int blocksize; + + if (err != -EINVAL) + goto scan_error; + + if (!nilfs_valid_sb(sbp[1])) { + printk(KERN_WARNING + "NILFS warning: unable to fall back to spare " + "super block\n"); + goto scan_error; + } + printk(KERN_INFO + "NILFS: try rollback from an earlier position\n"); + + /* + * restore super block with its spare and reconfigure + * relevant states of the nilfs object.
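+ * The spare super block sits near the end of the device (see + * NILFS_SB2_OFFSET_BYTES) and may point to an older, but consistent, + * log position to roll back to.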
+ */ + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + + /* verify consistency between two super blocks */ + blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + if (blocksize != nilfs->ns_blocksize) { + printk(KERN_WARNING + "NILFS warning: blocksize differs between " + "two super blocks (%d != %d)\n", + blocksize, nilfs->ns_blocksize); + goto scan_error; + } + + err = nilfs_store_log_cursor(nilfs, sbp[0]); + if (err) + goto scan_error; + + /* drop clean flag to allow roll-forward and recovery */ + nilfs->ns_mount_state &= ~NILFS_VALID_FS; + valid_fs = 0; + + err = nilfs_search_super_root(nilfs, &ri); + if (err) + goto scan_error; + } + + err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (valid_fs) + goto skip_recovery; + + if (s_flags & MS_RDONLY) { + __u64 features; + + if (nilfs_test_opt(nilfs, NORECOVERY)) { + printk(KERN_INFO "NILFS: norecovery option specified. " + "skipping roll-forward recovery\n"); + goto skip_recovery; + } + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't proceed with " + "recovery because of unsupported optional " + "features (%llx)\n", + (unsigned long long)features); + err = -EROFS; + goto failed_unload; + } + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed_unload; + } + sb->s_flags &= ~MS_RDONLY; + } else if (nilfs_test_opt(nilfs, NORECOVERY)) { + printk(KERN_ERR "NILFS: recovery cancelled because norecovery " + "option was specified for a read/write mount\n"); + err = -EINVAL; + goto failed_unload; + } + + err = nilfs_salvage_orphan_logs(nilfs, sb, &ri); + if (err) + goto failed_unload; + + down_write(&nilfs->ns_sem); + nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ + err = nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + + if (err) { + printk(KERN_ERR "NILFS: failed to update super block. 
" + "recovery unfinished.\n"); + goto failed_unload; + } + printk(KERN_INFO "NILFS: recovery complete.\n"); + + skip_recovery: + nilfs_clear_recovery_info(&ri); + sb->s_flags = s_flags; + return 0; + + scan_error: + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + + failed_unload: + iput(nilfs->ns_cpfile); + iput(nilfs->ns_sufile); + iput(nilfs->ns_dat); + + failed: + nilfs_clear_recovery_info(&ri); + sb->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +/** + * nilfs_nrsvsegs - calculate the number of reserved segments + * @nilfs: nilfs object + * @nsegs: total number of segments + */ +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) +{ + return max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, + 100)); +} + +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) +{ + nilfs->ns_nsegments = nsegs; + nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { + printk(KERN_ERR "NILFS: unsupported revision " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment.\n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + 
brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock " + "(blocksize = %d)\n", blocksize); + } else if (!sbp[1]) { + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock " + "(blocksize = %d)\n", blocksize); + } + + /* + * Compare two super blocks and set 1 in swp if the secondary + * super block is valid and newer. Otherwise, set 0 in swp. + */ + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && (!valid[0] || + le64_to_cpu(sbp[1]->s_last_cno) > + le64_to_cpu(sbp[0]->s_last_cno)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + valid[1] = 0; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (!valid[!swp]) + printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock (blocksize = %d).\n", blocksize); + if (swp) + nilfs_swap_super_block(nilfs); + + nilfs->ns_sbwcount = 0; + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sb: super block + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. 
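+ * + * Note: the super blocks may be read twice if the block size recorded + * on disk differs from the block size used for the initial read.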
+ */ +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) +{ + struct nilfs_super_block *sbp; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + + blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (blocksize < NILFS_MIN_BLOCK_SIZE || + blocksize > NILFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "filesystem blocksize %d\n", blocksize); + err = -EINVAL; + goto failed_sbh; + } + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails. */ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + nilfs->ns_blocksize = blocksize; + + get_random_bytes(&nilfs->ns_next_generation, + sizeof(nilfs->ns_next_generation)); + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + err = nilfs_store_log_cursor(nilfs, sbp); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, + size_t nsegs) +{ + sector_t seg_start, seg_end; + sector_t start = 0, nblocks = 0; + unsigned int sects_per_block; + __u64 *sn; + int ret = 0; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + for (sn = segnump; sn < segnump + nsegs; sn++) { + nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); + + if (!nblocks) { + start = seg_start; + nblocks = seg_end - seg_start + 1; + } else if (start + nblocks == seg_start) { + nblocks += seg_end - seg_start + 1; + } else { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) + return ret; + nblocks = 0; + } + } + if (nblocks) + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + return ret; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return 0; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + unsigned long ncleansegs, nincsegs; + + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + + return ncleansegs <= 
nilfs->ns_nrsvsegs + nincsegs; +} + +struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) +{ + struct rb_node *n; + struct nilfs_root *root; + + spin_lock(&nilfs->ns_cptree_lock); + n = nilfs->ns_cptree.rb_node; + while (n) { + root = rb_entry(n, struct nilfs_root, rb_node); + + if (cno < root->cno) { + n = n->rb_left; + } else if (cno > root->cno) { + n = n->rb_right; + } else { + atomic_inc(&root->count); + spin_unlock(&nilfs->ns_cptree_lock); + return root; + } + } + spin_unlock(&nilfs->ns_cptree_lock); + + return NULL; +} + +struct nilfs_root * +nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) +{ + struct rb_node **p, *parent; + struct nilfs_root *root, *new; + + root = nilfs_lookup_root(nilfs, cno); + if (root) + return root; + + new = kmalloc(sizeof(*root), GFP_KERNEL); + if (!new) + return NULL; + + spin_lock(&nilfs->ns_cptree_lock); + + p = &nilfs->ns_cptree.rb_node; + parent = NULL; + + while (*p) { + parent = *p; + root = rb_entry(parent, struct nilfs_root, rb_node); + + if (cno < root->cno) { + p = &(*p)->rb_left; + } else if (cno > root->cno) { + p = &(*p)->rb_right; + } else { + atomic_inc(&root->count); + spin_unlock(&nilfs->ns_cptree_lock); + kfree(new); + return root; + } + } + + new->cno = cno; + new->ifile = NULL; + new->nilfs = nilfs; + atomic_set(&new->count, 1); + atomic_set(&new->inodes_count, 0); + atomic_set(&new->blocks_count, 0); + + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &nilfs->ns_cptree); + + spin_unlock(&nilfs->ns_cptree_lock); + + return new; +} + +void nilfs_put_root(struct nilfs_root *root) +{ + if (atomic_dec_and_test(&root->count)) { + struct the_nilfs *nilfs = root->nilfs; + + spin_lock(&nilfs->ns_cptree_lock); + rb_erase(&root->rb_node, &nilfs->ns_cptree); + spin_unlock(&nilfs->ns_cptree_lock); + if (root->ifile) + iput(root->ifile); + + kfree(root); + } +} diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h new file mode 100644 index 00000000..9992b113 --- /dev/null +++ b/fs/nilfs2/the_nilfs.h @@ -0,0 +1,356 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include +#include +#include +#include +#include +#include +#include + +struct nilfs_sc_info; + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ + THE_NILFS_SB_DIRTY, /* super block is dirty */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_bdev: block device + * @ns_sem: semaphore for shared states + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super block + * @ns_sbwcount: write count of super block + * @ns_sbsize: size of valid data in super block + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment. + * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: Number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_prev_seq: base sequence number used to decide if advance log cursor + * @ns_writer: log writer + * @ns_segctor_sem: semaphore protecting log write + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) + * @ns_cptree_lock: lock protecting @ns_cptree + * @ns_dirty_files: list of dirty files + * @ns_inode_lock: lock protecting @ns_dirty_files + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_next_generation: next generation number for inodes + * @ns_next_gen_lock: lock protecting @ns_next_generation + * @ns_mount_opt: mount options + * @ns_resuid: uid for reserved blocks + * @ns_resgid: gid for reserved blocks + * @ns_interval: checkpoint creation interval + * @ns_watermark: watermark for the number of dirty buffers + * @ns_blocksize_bits: bit length of block size + * @ns_blocksize: block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + */ +struct the_nilfs { + unsigned long ns_flags; + + struct block_device *ns_bdev; + struct rw_semaphore ns_sem; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. + * - protecting s_dirt in the super_block struct + * (see nilfs_write_super) and the following fields. 
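+ * + * ns_sbh[1]/ns_sbp[1] hold the spare super block; they may be NULL + * when the spare copy is unreadable or invalid.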
+ */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime; + unsigned ns_sbwcount; + unsigned ns_sbsize; + unsigned ns_mount_state; + + /* + * Following fields are dedicated to a writable FS-instance. + * Except for the period seeking checkpoint, code outside the segment + * constructor must lock a segment semaphore while accessing these + * fields. + * The writable FS-instance is sole during a lifetime of the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + u64 ns_prev_seq; + + struct nilfs_sc_info *ns_writer; + struct rw_semaphore ns_segctor_sem; + + /* + * Following fields are lock free except for the period before + * the_nilfs is initialized. + */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + + /* Checkpoint tree */ + struct rb_root ns_cptree; + spinlock_t ns_cptree_lock; + + /* Dirty inode list */ + struct list_head ns_dirty_files; + spinlock_t ns_inode_lock; + + /* GC inode list */ + struct list_head ns_gc_inodes; + + /* Inode allocator */ + u32 ns_next_generation; + spinlock_t ns_next_gen_lock; + + /* Mount options */ + unsigned long ns_mount_opt; + + uid_t ns_resuid; + gid_t ns_resgid; + unsigned long ns_interval; + unsigned long ns_watermark; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned int ns_blocksize; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; +}; + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) +THE_NILFS_FNS(SB_DIRTY, sb_dirty) + +/* + * Mount option operations + */ +#define nilfs_clear_opt(nilfs, opt) \ + do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(nilfs, opt) \ + do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(nilfs, mask, opt) \ + do { (nilfs)->ns_mount_opt = \ + (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +/** + * struct nilfs_root - nilfs root object + * @cno: checkpoint number + * @rb_node: red-black tree node + * @count: refcount of this structure + * @nilfs: nilfs object + * @ifile: inode file + * @root: root inode + * @inodes_count: number of inodes + * @blocks_count: number of blocks (Reserved) + */ +struct nilfs_root { + __u64 cno; + struct rb_node rb_node; + + atomic_t count; + struct the_nilfs *nilfs; + struct inode *ifile; + + atomic_t inodes_count; + 
atomic_t blocks_count; +}; + +/* Special checkpoint number */ +#define NILFS_CPTREE_CURRENT_CNO 0 + +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 + +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; +} + +static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) +{ + int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); +} + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *bdev); +void destroy_nilfs(struct the_nilfs *nilfs); +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); +int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); +int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); +struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, + __u64 cno); +void nilfs_put_root(struct nilfs_root *root); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void nilfs_get_root(struct nilfs_root *root) +{ + atomic_inc(&root->count); +} + +static inline int nilfs_valid_fs(struct the_nilfs *nilfs) +{ + unsigned valid_fs; + + down_read(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_read(&nilfs->ns_sem); + return valid_fs; +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? 
nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +#endif /* _THE_NILFS_H */ diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig new file mode 100644 index 00000000..a39edc41 --- /dev/null +++ b/fs/nls/Kconfig @@ -0,0 +1,464 @@ +# +# Native language support configuration +# + +menuconfig NLS + tristate "Native language support" + ---help--- + The base Native Language Support. A number of filesystems + depend on it (e.g. FAT, JOLIET, NT, BEOS filesystems), as well + as the ability of some filesystems to use native languages + (NCP, SMB). + + If unsure, say Y. + + To compile this code as a module, choose M here: the module + will be called nls_base. + +if NLS + +config NLS_DEFAULT + string "Default NLS Option" + default "iso8859-1" + ---help--- + The default NLS used when mounting file system. Note, that this is + the NLS used by your console, not the NLS used by a specific file + system (if different) to store data (filenames) on a disk. + Currently, the valid values are: + big5, cp437, cp737, cp775, cp850, cp852, cp855, cp857, cp860, cp861, + cp862, cp863, cp864, cp865, cp866, cp869, cp874, cp932, cp936, + cp949, cp950, cp1251, cp1255, euc-jp, euc-kr, gb2312, iso8859-1, + iso8859-2, iso8859-3, iso8859-4, iso8859-5, iso8859-6, iso8859-7, + iso8859-8, iso8859-9, iso8859-13, iso8859-14, iso8859-15, + koi8-r, koi8-ru, koi8-u, sjis, tis-620, utf8. + If you specify a wrong value, it will use the built-in NLS; + compatible with iso8859-1. + + If unsure, specify it as "iso8859-1". + +config NLS_CODEPAGE_437 + tristate "Codepage 437 (United States, Canada)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used in + the United States and parts of Canada. This is recommended. + +config NLS_CODEPAGE_737 + tristate "Codepage 737 (Greek)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. 
This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used for + Greek. If unsure, say N. + +config NLS_CODEPAGE_775 + tristate "Codepage 775 (Baltic Rim)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used + for the Baltic Rim Languages (Latvian and Lithuanian). If unsure, + say N. + +config NLS_CODEPAGE_850 + tristate "Codepage 850 (Europe)" + ---help--- + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used for + much of Europe -- United Kingdom, Germany, Spain, Italy, and [add + more countries here]. It has some characters useful to many European + languages that are not part of the US codepage 437. + + If unsure, say Y. + +config NLS_CODEPAGE_852 + tristate "Codepage 852 (Central/Eastern Europe)" + ---help--- + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Latin 2 codepage used by DOS + for much of Central and Eastern Europe. It has all the required + characters for these languages: Albanian, Croatian, Czech, English, + Finnish, Hungarian, Irish, German, Polish, Romanian, Serbian (Latin + transcription), Slovak, Slovenian, and Sorbian. + +config NLS_CODEPAGE_855 + tristate "Codepage 855 (Cyrillic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Cyrillic. + +config NLS_CODEPAGE_857 + tristate "Codepage 857 (Turkish)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Turkish. 
+ +config NLS_CODEPAGE_860 + tristate "Codepage 860 (Portuguese)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Portuguese. + +config NLS_CODEPAGE_861 + tristate "Codepage 861 (Icelandic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Icelandic. + +config NLS_CODEPAGE_862 + tristate "Codepage 862 (Hebrew)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Hebrew. + +config NLS_CODEPAGE_863 + tristate "Codepage 863 (Canadian French)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Canadian + French. + +config NLS_CODEPAGE_864 + tristate "Codepage 864 (Arabic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Arabic. + +config NLS_CODEPAGE_865 + tristate "Codepage 865 (Norwegian, Danish)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for the Nordic + European countries. + +config NLS_CODEPAGE_866 + tristate "Codepage 866 (Cyrillic/Russian)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. 
You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for + Cyrillic/Russian. + +config NLS_CODEPAGE_869 + tristate "Codepage 869 (Greek)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Greek. + +config NLS_CODEPAGE_936 + tristate "Simplified Chinese charset (CP936, GB2312)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Simplified + Chinese(GBK). + +config NLS_CODEPAGE_950 + tristate "Traditional Chinese charset (Big5)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Traditional + Chinese(Big5). + +config NLS_CODEPAGE_932 + tristate "Japanese charsets (Shift-JIS, EUC-JP)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Shift-JIS + or EUC-JP. To use EUC-JP, you can use 'euc-jp' as mount option or + NLS Default value during kernel configuration, instead of 'cp932'. + +config NLS_CODEPAGE_949 + tristate "Korean charset (CP949, EUC-KR)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for UHC. + +config NLS_CODEPAGE_874 + tristate "Thai charset (CP874, TIS-620)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. 
You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Thai. + +config NLS_ISO8859_8 + tristate "Hebrew charsets (ISO-8859-8, CP1255)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-8, the Hebrew + character set. + +config NLS_CODEPAGE_1250 + tristate "Windows CP1250 (Slavic/Central European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CDROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Windows CP-1250 + character set, which works for most Latin-written Slavic and Central + European languages: Czech, German, Hungarian, Polish, Rumanian, Croatian, + Slovak, Slovene. + +config NLS_CODEPAGE_1251 + tristate "Windows CP1251 (Bulgarian, Belarusian)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Russian and + Bulgarian and Belarusian. + +config NLS_ASCII + tristate "ASCII (United States)" + help + An ASCII NLS module is needed if you want to override the + DEFAULT NLS with this very basic charset and don't want any + non-ASCII characters to be translated. + +config NLS_ISO8859_1 + tristate "NLS ISO 8859-1 (Latin 1; Western European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 1 character + set, which covers most West European languages such as Albanian, + Catalan, Danish, Dutch, English, Faeroese, Finnish, French, German, + Galician, Irish, Icelandic, Italian, Norwegian, Portuguese, Spanish, + and Swedish. It is also the default for the US. If unsure, say Y. + +config NLS_ISO8859_2 + tristate "NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 2 character + set, which works for most Latin-written Slavic and Central European + languages: Czech, German, Hungarian, Polish, Rumanian, Croatian, + Slovak, Slovene. + +config NLS_ISO8859_3 + tristate "NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. 
Say Y here for the Latin 3 character + set, which is popular with authors of Esperanto, Galician, Maltese, + and Turkish. + +config NLS_ISO8859_4 + tristate "NLS ISO 8859-4 (Latin 4; old Baltic charset)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 4 character + set which introduces letters for Estonian, Latvian, and + Lithuanian. It is an incomplete predecessor of Latin 7. + +config NLS_ISO8859_5 + tristate "NLS ISO 8859-5 (Cyrillic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-5, a Cyrillic + character set with which you can type Bulgarian, Belarusian, + Macedonian, Russian, Serbian, and Ukrainian. Note that the charset + KOI8-R is preferred in Russia. + +config NLS_ISO8859_6 + tristate "NLS ISO 8859-6 (Arabic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-6, the Arabic + character set. + +config NLS_ISO8859_7 + tristate "NLS ISO 8859-7 (Modern Greek)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-7, the Modern + Greek character set. + +config NLS_ISO8859_9 + tristate "NLS ISO 8859-9 (Latin 5; Turkish)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 5 character + set, and it replaces the rarely needed Icelandic letters in Latin 1 + with the Turkish ones. Useful in Turkey. + +config NLS_ISO8859_13 + tristate "NLS ISO 8859-13 (Latin 7; Baltic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 7 character + set, which supports modern Baltic languages including Latvian + and Lithuanian. + +config NLS_ISO8859_14 + tristate "NLS ISO 8859-14 (Latin 8; Celtic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 8 character + set, which adds the last accented vowels for Welsh (aka Cymraeg) + (and Manx Gaelic) that were missing in Latin 1. + has further information. + +config NLS_ISO8859_15 + tristate "NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)" + ---help--- + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. 
Say Y here for the Latin 9 character + set, which covers most West European languages such as Albanian, + Catalan, Danish, Dutch, English, Estonian, Faeroese, Finnish, + French, German, Galician, Irish, Icelandic, Italian, Norwegian, + Portuguese, Spanish, and Swedish. Latin 9 is an update to + Latin 1 (ISO 8859-1) that removes a handful of rarely used + characters and instead adds support for Estonian, corrects the + support for French and Finnish, and adds the new Euro character. + If unsure, say Y. + +config NLS_KOI8_R + tristate "NLS KOI8-R (Russian)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the preferred Russian + character set. + +config NLS_KOI8_U + tristate "NLS KOI8-U/RU (Ukrainian, Belarusian)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the preferred Ukrainian + (koi8-u) and Belarusian (koi8-ru) character sets. + +config NLS_UTF8 + tristate "NLS UTF-8" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the UTF-8 encoding of + the Unicode/ISO9646 universal character set. + +endif # NLS diff --git a/fs/nls/Makefile b/fs/nls/Makefile new file mode 100644 index 00000000..f499dd7c --- /dev/null +++ b/fs/nls/Makefile @@ -0,0 +1,44 @@ +# +# Makefile for native language support +# + +obj-$(CONFIG_NLS) += nls_base.o + +obj-$(CONFIG_NLS_CODEPAGE_437) += nls_cp437.o +obj-$(CONFIG_NLS_CODEPAGE_737) += nls_cp737.o +obj-$(CONFIG_NLS_CODEPAGE_775) += nls_cp775.o +obj-$(CONFIG_NLS_CODEPAGE_850) += nls_cp850.o +obj-$(CONFIG_NLS_CODEPAGE_852) += nls_cp852.o +obj-$(CONFIG_NLS_CODEPAGE_855) += nls_cp855.o +obj-$(CONFIG_NLS_CODEPAGE_857) += nls_cp857.o +obj-$(CONFIG_NLS_CODEPAGE_860) += nls_cp860.o +obj-$(CONFIG_NLS_CODEPAGE_861) += nls_cp861.o +obj-$(CONFIG_NLS_CODEPAGE_862) += nls_cp862.o +obj-$(CONFIG_NLS_CODEPAGE_863) += nls_cp863.o +obj-$(CONFIG_NLS_CODEPAGE_864) += nls_cp864.o +obj-$(CONFIG_NLS_CODEPAGE_865) += nls_cp865.o +obj-$(CONFIG_NLS_CODEPAGE_866) += nls_cp866.o +obj-$(CONFIG_NLS_CODEPAGE_869) += nls_cp869.o +obj-$(CONFIG_NLS_CODEPAGE_874) += nls_cp874.o +obj-$(CONFIG_NLS_CODEPAGE_932) += nls_cp932.o nls_euc-jp.o +obj-$(CONFIG_NLS_CODEPAGE_936) += nls_cp936.o +obj-$(CONFIG_NLS_CODEPAGE_949) += nls_cp949.o +obj-$(CONFIG_NLS_CODEPAGE_950) += nls_cp950.o +obj-$(CONFIG_NLS_CODEPAGE_1250) += nls_cp1250.o +obj-$(CONFIG_NLS_CODEPAGE_1251) += nls_cp1251.o +obj-$(CONFIG_NLS_ASCII) += nls_ascii.o +obj-$(CONFIG_NLS_ISO8859_1) += nls_iso8859-1.o +obj-$(CONFIG_NLS_ISO8859_2) += nls_iso8859-2.o +obj-$(CONFIG_NLS_ISO8859_3) += nls_iso8859-3.o +obj-$(CONFIG_NLS_ISO8859_4) += nls_iso8859-4.o +obj-$(CONFIG_NLS_ISO8859_5) += nls_iso8859-5.o +obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o +obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o +obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o +obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o +obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o +obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o +obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o +obj-$(CONFIG_NLS_KOI8_R) += 
nls_koi8-r.o +obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o +obj-$(CONFIG_NLS_UTF8) += nls_utf8.o diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c new file mode 100644 index 00000000..7020e940 --- /dev/null +++ b/fs/nls/nls_ascii.c @@ -0,0 +1,167 @@ +/* + * linux/fs/nls/nls_ascii.c + * + * Charset ascii translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 
0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "ascii", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_ascii(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_ascii(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_ascii) +module_exit(exit_nls_ascii) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c new file mode 100644 index 00000000..44a88a9f --- /dev/null +++ b/fs/nls/nls_base.c @@ -0,0 +1,524 @@ +/* + * linux/fs/nls/nls_base.c + * + * Native language support--charsets and unicode translations. + * By Gordon Chaffee 1996, 1997 + * + * Unicode based case conversion 1999 by Wolfram Pienkoss + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct nls_table default_table; +static struct nls_table *tables = &default_table; +static DEFINE_SPINLOCK(nls_lock); + +/* + * Sample implementation from Unicode home page. 
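+ * The utf8_table[] that follows drives both conversion directions: for
+ * each sequence length, cmask/cval match the leading byte, shift is six
+ * bits per continuation byte, and lmask/lval give the largest and the
+ * smallest code point valid at that length (so overlong encodings are
+ * rejected).  Original description: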
+ * http://www.stonehand.com/unicode/standard/fss-utf.html + */ +struct utf8_table { + int cmask; + int cval; + int shift; + long lmask; + long lval; +}; + +static const struct utf8_table utf8_table[] = +{ + {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, + {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, + {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, + {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, + {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, + {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, + {0, /* end of table */} +}; + +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) +{ + unsigned long l; + int c0, c, nc; + const struct utf8_table *t; + + nc = 0; + c0 = *s; + l = c0; + for (t = utf8_table; t->cmask; t++) { + nc++; + if ((c0 & t->cmask) == t->cval) { + l &= t->lmask; + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + *pu = (unicode_t) l; + return nc; + } + if (len <= nc) + return -1; + s++; + c = (*s ^ 0x80) & 0xFF; + if (c & 0xC0) + return -1; + l = (l << 6) | c; + } + return -1; +} +EXPORT_SYMBOL(utf8_to_utf32); + +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) +{ + unsigned long l; + int c, nc; + const struct utf8_table *t; + + if (!s) + return 0; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + + nc = 0; + for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { + nc++; + if (l <= t->lmask) { + c = t->shift; + *s = (u8) (t->cval | (l >> c)); + while (c > 0) { + c -= 6; + s++; + *s = (u8) (0x80 | ((l >> c) & 0x3F)); + } + return nc; + } + } + return -1; +} +EXPORT_SYMBOL(utf32_to_utf8); + +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) +{ + u16 *op; + int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) + return -EINVAL; + + if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; + + op = s; + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); + if (size == -1) { + /* Ignore character and move on */ + } else { + op += size; + maxlen -= 
size; + } + } else { + *op++ = (u8) u; + maxlen--; + } + } + return op - s; +} +EXPORT_SYMBOL(utf16s_to_utf8s); + +int register_nls(struct nls_table * nls) +{ + struct nls_table ** tmp = &tables; + + if (nls->next) + return -EBUSY; + + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + spin_unlock(&nls_lock); + return -EBUSY; + } + tmp = &(*tmp)->next; + } + nls->next = tables; + tables = nls; + spin_unlock(&nls_lock); + return 0; +} + +int unregister_nls(struct nls_table * nls) +{ + struct nls_table ** tmp = &tables; + + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + *tmp = nls->next; + spin_unlock(&nls_lock); + return 0; + } + tmp = &(*tmp)->next; + } + spin_unlock(&nls_lock); + return -EINVAL; +} + +static struct nls_table *find_nls(char *charset) +{ + struct nls_table *nls; + spin_lock(&nls_lock); + for (nls = tables; nls; nls = nls->next) { + if (!strcmp(nls->charset, charset)) + break; + if (nls->alias && !strcmp(nls->alias, charset)) + break; + } + if (nls && !try_module_get(nls->owner)) + nls = NULL; + spin_unlock(&nls_lock); + return nls; +} + +struct nls_table *load_nls(char *charset) +{ + return try_then_request_module(find_nls(charset), "nls_%s", charset); +} + +void unload_nls(struct nls_table *nls) +{ + if (nls) + module_put(nls->owner); +} + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 
0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00 +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 
0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + 
return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table default_table = { + .charset = "default", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +/* Returns a simple default translation table */ +struct nls_table *load_nls_default(void) +{ + struct nls_table *default_nls; + + default_nls = load_nls(CONFIG_NLS_DEFAULT); + if (default_nls != NULL) + return default_nls; + else + return &default_table; +} + +EXPORT_SYMBOL(register_nls); +EXPORT_SYMBOL(unregister_nls); +EXPORT_SYMBOL(unload_nls); +EXPORT_SYMBOL(load_nls); +EXPORT_SYMBOL(load_nls_default); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c new file mode 100644 index 00000000..c8471fe7 --- /dev/null +++ b/fs/nls/nls_cp1250.c @@ -0,0 +1,347 @@ +/* + * linux/fs/nls/nls_cp1250.c + * + * Charset cp1250 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x20ac, 0x0000, 0x201a, 0x0000, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x0000, 0x2030, 0x0160, 0x2039, + 0x015a, 0x0164, 0x017d, 0x0179, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x2122, 0x0161, 0x203a, + 0x015b, 0x0165, 0x017e, 0x017a, + /* 0xa0*/ + 0x00a0, 0x02c7, 0x02d8, 0x0141, + 0x00a4, 0x0104, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x015e, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x017b, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x02db, 0x0142, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x0105, 0x015f, 0x00bb, + 0x013d, 0x02dd, 0x013e, 0x017c, + /* 0xc0*/ + 0x0154, 0x00c1, 0x00c2, 0x0102, + 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x011a, 0x00cd, 0x00ce, 0x010e, + /* 0xd0*/ + 0x0110, 0x0143, 0x0147, 0x00d3, + 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, + 0x00dc, 0x00dd, 0x0162, 0x00df, + /* 0xe0*/ + 0x0155, 0x00e1, 0x00e2, 0x0103, + 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x011b, 
0x00ed, 0x00ee, 0x010f, + /* 0xf0*/ + 0x0111, 0x0144, 0x0148, 0x00f3, + 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, + 0x00fc, 0x00fd, 0x0163, 0x02d9, + }; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ + }; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc3, 0xe3, 0xa5, 0xb9, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xbc, 0xbe, 0x00, /* 0x38-0x3f */ + 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ + 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ + 0xd8, 0xf8, 0x8c, 0x9c, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ + 0x8a, 0x9a, 0xde, 0xfe, 0x8d, 0x9d, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ + 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8f, 0x9f, 0xaf, 0xbf, 0x8e, 0x9e, 0x00, /* 0x78-0x7f */ + + }; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ + }; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + }; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + }; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, + }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x00, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xb9, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbe, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ + }; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 
0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x00, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa5, 0xaa, 0xbb, 0xbc, 0xbd, 0xbc, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ + }; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1250", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp1250(void) +{ + return register_nls(&table); +} +static void __exit exit_nls_cp1250(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1250) +module_exit(exit_nls_cp1250) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c new file mode 100644 index 00000000..1939b46e --- /dev/null +++ b/fs/nls/nls_cp1251.c @@ -0,0 +1,302 @@ +/* + * linux/fs/nls/nls_cp1251.c + * + * Charset cp1251 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
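+ *
+ * As in the other single-byte tables here, charset2uni[256] maps each
+ * cp1251 byte to a Unicode code point and page_uni2charset[] maps back
+ * via sparse 256-entry pages indexed by the high byte of the code point;
+ * for example uni2char(0x0416, ...) reads page04[0x16] and stores 0xc6
+ * (CYRILLIC CAPITAL LETTER ZHE).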
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0402, 0x0403, 0x201a, 0x0453, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x20ac, 0x2030, 0x0409, 0x2039, + 0x040a, 0x040c, 0x040b, 0x040f, + /* 0x90*/ + 0x0452, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x2122, 0x0459, 0x203a, + 0x045a, 0x045c, 0x045b, 0x045f, + /* 0xa0*/ + 0x00a0, 0x040e, 0x045e, 0x0408, + 0x00a4, 0x0490, 0x00a6, 0x00a7, + 0x0401, 0x00a9, 0x0404, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x0407, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x0406, 0x0456, + 0x0491, 0x00b5, 0x00b6, 0x00b7, + 0x0451, 0x2116, 0x0454, 0x00bb, + 0x0458, 0x0405, 0x0455, 0x0457, + /* 0xc0*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0xd0*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xe0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xf0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */ + 0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */ + 0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */ + 0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x90, 0x83, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa2, 0xa2, 0xbc, 0xa4, 0xb4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xb8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb3, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x80, 0x81, 0x82, 0x81, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x80, 0x91, 0x92, 
0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa1, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb2, 0xa5, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xa8, 0xb9, 0xaa, 0xbb, 0xa3, 0xbd, 0xbd, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1251", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp1251(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp1251(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1251) +module_exit(exit_nls_cp1251) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c new file mode 100644 index 00000000..8120ae2e --- /dev/null +++ b/fs/nls/nls_cp1255.c @@ -0,0 +1,385 @@ +/* + * linux/fs/nls/nls_cp1255.c + * + * Charset cp1255 translation tables. + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x20ac, 0x0000, 0x201a, 0x0192, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x02c6, 0x2030, 0x0000, 0x2039, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x02dc, 0x2122, 0x0000, 0x203a, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x20aa, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00d7, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x203e, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00f7, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x05b0, 0x05b1, 0x05b2, 0x05b3, + 0x05b4, 0x05b5, 0x05b6, 0x05b7, + 0x05b8, 0x05b9, 0x0000, 0x05bb, + 0x05bc, 0x05bd, 0x05be, 0x05bf, + /* 0xd0*/ + 0x05c0, 0x05c1, 0x05c2, 0x05c3, + 0x05f0, 0x05f1, 0x05f2, 0x05f3, + 0x05f4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2017, + /* 0xe0*/ + 0x05d0, 0x05d1, 0x05d2, 0x05d3, + 0x05d4, 0x05d5, 0x05d6, 0x05d7, + 0x05d8, 0x05d9, 0x05da, 0x05db, + 0x05dc, 0x05dd, 0x05de, 0x05df, + /* 0xf0*/ + 0x05e0, 0x05e1, 0x05e2, 0x05e3, + 0x05e4, 0x05e5, 0x05e6, 0x05e7, + 0x05e8, 0x05e9, 0x05ea, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0x00, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xba, /* 0xf0-0xf7 */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char page05[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xb0-0xb7 */ + 0xc8, 0xc9, 0x00, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xb8-0xbf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xd0-0xd7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xd8-0xdf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xe0-0xe7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xfe, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa4, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, page05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1255", + .alias = "iso8859-8", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp1255(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp1255(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1255) +module_exit(exit_nls_cp1255) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(iso8859-8); diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c new file mode 100644 index 00000000..ff37a462 --- /dev/null +++ b/fs/nls/nls_cp437.c @@ -0,0 +1,388 @@ +/* + * linux/fs/nls/nls_cp437.c + * + * Charset cp437 translation tables. 
+ * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00a2, + 0x00a3, 0x00a5, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 
0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf 
*/ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp437", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp437(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp437(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp437) +module_exit(exit_nls_cp437) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c new file mode 100644 index 00000000..f5576b8b --- 
/dev/null +++ b/fs/nls/nls_cp737.c @@ -0,0 +1,351 @@ +/* + * linux/fs/nls/nls_cp737.c + * + * Charset cp737 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0391, 0x0392, 0x0393, 0x0394, + 0x0395, 0x0396, 0x0397, 0x0398, + 0x0399, 0x039a, 0x039b, 0x039c, + 0x039d, 0x039e, 0x039f, 0x03a0, + /* 0x90*/ + 0x03a1, 0x03a3, 0x03a4, 0x03a5, + 0x03a6, 0x03a7, 0x03a8, 0x03a9, + 0x03b1, 0x03b2, 0x03b3, 0x03b4, + 0x03b5, 0x03b6, 0x03b7, 0x03b8, + /* 0xa0*/ + 0x03b9, 0x03ba, 0x03bb, 0x03bc, + 0x03bd, 0x03be, 0x03bf, 0x03c0, + 0x03c1, 0x03c3, 0x03c2, 0x03c4, + 0x03c5, 0x03c6, 0x03c7, 0x03c8, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03c9, 0x03ac, 0x03ad, 0x03ae, + 0x03ca, 0x03af, 0x03cc, 0x03cd, + 0x03cb, 0x03ce, 0x0386, 0x0388, + 0x0389, 0x038a, 0x038c, 0x038e, + /* 0xf0*/ + 0x038f, 0x00b1, 0x2265, 0x2264, + 0x03aa, 0x03ab, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 
0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xea, 0x00, /* 0x80-0x87 */ + 0xeb, 0xec, 0xed, 0x00, 0xee, 0x00, 0xef, 0xf0, /* 0x88-0x8f */ + 0x00, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, /* 0x90-0x97 */ + 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, /* 0x98-0x9f */ + 0x8f, 0x90, 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, /* 0xa0-0xa7 */ + 0x96, 0x97, 0xf4, 0xf5, 0xe1, 0xe2, 0xe3, 0xe5, /* 0xa8-0xaf */ + 0x00, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, /* 0xb0-0xb7 */ + 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, /* 0xb8-0xbf */ + 0xa7, 0xa8, 0xaa, 0xa9, 0xab, 0xac, 0xad, 0xae, /* 0xc0-0xc7 */ + 0xaf, 0xe0, 0xe4, 0xe8, 0xe6, 0xe7, 0xe9, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 
0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x80-0x87 */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x88-0x8f */ + 0xa8, 0xa9, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xe0, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xe1, 0xe2, 0xe3, 0xe5, 0xe6, 0xe7, /* 0xe8-0xef */ + 0xe9, 0xf1, 0xf2, 0xf3, 0xe4, 0xe8, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x98-0x9f */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa0-0xa7 */ + 0x90, 0x91, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 
0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x97, 0xea, 0xeb, 0xec, 0xf4, 0xed, 0xee, 0xef, /* 0xe0-0xe7 */ + 0xf5, 0xf0, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp737", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp737(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp737(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp737) +module_exit(exit_nls_cp737) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c new file mode 100644 index 00000000..4905635d --- /dev/null +++ b/fs/nls/nls_cp775.c @@ -0,0 +1,320 @@ +/* + * linux/fs/nls/nls_cp775.c + * + * Charset cp775 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0106, 0x00fc, 0x00e9, 0x0101, + 0x00e4, 0x0123, 0x00e5, 0x0107, + 0x0142, 0x0113, 0x0156, 0x0157, + 0x012b, 0x0179, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x014d, + 0x00f6, 0x0122, 0x00a2, 0x015a, + 0x015b, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x00d7, 0x00a4, + /* 0xa0*/ + 0x0100, 0x012a, 0x00f3, 0x017b, + 0x017c, 0x017a, 0x201d, 0x00a6, + 0x00a9, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x0141, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x0104, 0x010c, 0x0118, + 0x0116, 0x2563, 0x2551, 0x2557, + 0x255d, 0x012e, 0x0160, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x0172, 0x016a, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x017d, + /* 0xd0*/ + 0x0105, 0x010d, 0x0119, 0x0117, + 0x012f, 0x0161, 0x0173, 0x016b, + 0x017e, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x014c, 0x0143, + 0x00f5, 0x00d5, 0x00b5, 0x0144, + 0x0136, 0x0137, 0x013b, 0x013c, + 0x0146, 0x0112, 0x0145, 0x2019, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x201c, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x201e, + 0x00b0, 0x2219, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */ + 0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */ + 0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */ + 0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 
0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */ + 0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 
0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */ + 0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */ + 0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */ + 0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp775", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp775(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp775(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp775) +module_exit(exit_nls_cp775) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c new file mode 100644 index 00000000..fe5bdad5 --- /dev/null +++ b/fs/nls/nls_cp850.c @@ -0,0 +1,316 @@ +/* + * linux/fs/nls/nls_cp850.c + * + * Charset cp850 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x00d7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x00c0, + 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x00f0, 0x00d0, 0x00ca, 0x00cb, + 0x00c8, 0x0131, 0x00cd, 0x00ce, + 0x00cf, 0x2518, 0x250c, 0x2588, + 0x2584, 0x00a6, 0x00cc, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x00d2, + 0x00f5, 0x00d5, 0x00b5, 0x00fe, + 0x00de, 0x00da, 0x00db, 0x00d9, + 0x00fd, 0x00dd, 0x00af, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x2017, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0xb8, 0xa6, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0xf7, 0xfb, 0xa7, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ + 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ + 0xd1, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0xed, 0xe8, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0xd0, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0xec, 0xe7, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, /* 0x10-0x17 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 
0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0x88, 0x89, 0x8a, 0xd5, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x8d, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe7, 0xa3, 0x96, 0x97, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 
0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ + 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0xde, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd2, 0xd3, 0xd4, 0x49, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe8, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xed, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp850", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp850(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp850(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp850) +module_exit(exit_nls_cp850) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c new file mode 100644 index 00000000..ceb1c016 --- /dev/null +++ b/fs/nls/nls_cp852.c @@ -0,0 +1,338 @@ +/* + * linux/fs/nls/nls_cp852.c + * + * Charset cp852 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x016f, 0x0107, 0x00e7, + 0x0142, 0x00eb, 0x0150, 0x0151, + 0x00ee, 0x0179, 0x00c4, 0x0106, + /* 0x90*/ + 0x00c9, 0x0139, 0x013a, 0x00f4, + 0x00f6, 0x013d, 0x013e, 0x015a, + 0x015b, 0x00d6, 0x00dc, 0x0164, + 0x0165, 0x0141, 0x00d7, 0x010d, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x0104, 0x0105, 0x017d, 0x017e, + 0x0118, 0x0119, 0x00ac, 0x017a, + 0x010c, 0x015f, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x011a, + 0x015e, 0x2563, 0x2551, 0x2557, + 0x255d, 0x017b, 0x017c, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x0102, 0x0103, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x0111, 0x0110, 0x010e, 0x00cb, + 0x010f, 0x0147, 0x00cd, 0x00ce, + 0x011b, 0x2518, 0x250c, 0x2588, + 0x2584, 0x0162, 0x016e, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x0143, + 0x0144, 0x0148, 0x0160, 0x0161, + 0x0154, 0x00da, 0x0155, 0x0170, + 0x00fd, 0x00dd, 0x0163, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x02dd, 0x02db, 0x02c7, + 0x02d8, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x02d9, 0x0171, + 0x0158, 0x0159, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0x00, 0x00, 0xae, 0xaa, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xf7, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xb5, 0xb6, 0x00, 0x8e, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0xd3, 0x00, 0xd6, 0xd7, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xe0, 0xe2, 0x00, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xe9, 0x00, 0x9a, 0xed, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0xa0, 0x83, 0x00, 0x84, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x00, 0x82, 0x00, 0x89, 0x00, 0xa1, 0x8c, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xa3, 0x00, 0x81, 0xec, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc6, 0xc7, 0xa4, 0xa5, 0x8f, 0x86, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x9f, 0xd2, 0xd4, /* 0x08-0x0f */ + 0xd1, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xa8, 0xa9, 0xb7, 0xd8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x00, /* 0x38-0x3f */ + 0x00, 0x9d, 0x88, 0xe3, 0xe4, 0x00, 0x00, 0xd5, /* 0x40-0x47 */ + 0xe5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x8a, 0x8b, 0x00, 0x00, 0xe8, 0xea, 0x00, 0x00, /* 0x50-0x57 */ + 0xfc, 0xfd, 0x97, 0x98, 0x00, 0x00, 0xb8, 0xad, /* 0x58-0x5f */ + 0xe6, 0xe7, 0xdd, 0xee, 0x9b, 0x9c, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x85, /* 0x68-0x6f */ + 0xeb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8d, 0xab, 0xbd, 0xbe, 0xa6, 0xa7, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf3, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf4, 0xfa, 0x00, 0xf2, 0x00, 0xf1, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xab, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x92, 
0x92, 0x93, 0x94, 0x96, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9c, 0x9c, 0x88, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa9, 0xa9, 0xaa, 0xab, 0x9f, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0xd8, /* 0xb0-0xb7 */ + 0xad, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0xd4, 0x89, 0xd4, 0xe5, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xee, 0x85, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0xe4, 0xe4, 0xe5, 0xe7, 0xe7, /* 0xe0-0xe7 */ + 0xea, 0xa3, 0xea, 0xfb, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xde, 0x8f, 0x80, /* 0x80-0x87 */ + 0x9d, 0xd3, 0x8a, 0x8a, 0xd7, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x91, 0xe2, 0x99, 0x95, 0x95, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9b, 0x9b, 0x9d, 0x9e, 0xac, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa8, 0xaa, 0x8d, 0xac, 0xb8, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd2, 0xd3, 0xd2, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xb7, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe3, 0xd5, 0xe6, 0xe6, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xe8, 0xeb, 0xed, 0xed, 0xdd, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xeb, 0xfc, 0xfc, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + 
*uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp852", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp852(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp852(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp852) +module_exit(exit_nls_cp852) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c new file mode 100644 index 00000000..cc7f5fb2 --- /dev/null +++ b/fs/nls/nls_cp855.c @@ -0,0 +1,300 @@ +/* + * linux/fs/nls/nls_cp855.c + * + * Charset cp855 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0452, 0x0402, 0x0453, 0x0403, + 0x0451, 0x0401, 0x0454, 0x0404, + 0x0455, 0x0405, 0x0456, 0x0406, + 0x0457, 0x0407, 0x0458, 0x0408, + /* 0x90*/ + 0x0459, 0x0409, 0x045a, 0x040a, + 0x045b, 0x040b, 0x045c, 0x040c, + 0x045e, 0x040e, 0x045f, 0x040f, + 0x044e, 0x042e, 0x044a, 0x042a, + /* 0xa0*/ + 0x0430, 0x0410, 0x0431, 0x0411, + 0x0446, 0x0426, 0x0434, 0x0414, + 0x0435, 0x0415, 0x0444, 0x0424, + 0x0433, 0x0413, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x0445, 0x0425, 0x0438, + 0x0418, 0x2563, 0x2551, 0x2557, + 0x255d, 0x0439, 0x0419, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x043a, 0x041a, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x043b, 0x041b, 0x043c, 0x041c, + 0x043d, 0x041d, 0x043e, 0x041e, + 0x043f, 0x2518, 0x250c, 0x2588, + 0x2584, 0x041f, 0x044f, 0x2580, + /* 0xe0*/ + 0x042f, 0x0440, 0x0420, 0x0441, + 0x0421, 0x0442, 0x0422, 0x0443, + 0x0423, 0x0436, 0x0416, 0x0432, + 0x0412, 0x044c, 0x042c, 0x2116, + /* 0xf0*/ + 0x00ad, 0x044b, 0x042b, 0x0437, + 0x0417, 0x0448, 0x0428, 0x044d, + 0x042d, 0x0449, 0x0429, 0x0447, + 0x0427, 0x00a7, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 
0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0xae, 0x00, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page04[256] = { + 0x00, 0x85, 0x81, 0x83, 0x87, 0x89, 0x8b, 0x8d, /* 0x00-0x07 */ + 0x8f, 0x91, 0x93, 0x95, 0x97, 0x00, 0x99, 0x9b, /* 0x08-0x0f */ + 0xa1, 0xa3, 0xec, 0xad, 0xa7, 0xa9, 0xea, 0xf4, /* 0x10-0x17 */ + 0xb8, 0xbe, 0xc7, 0xd1, 0xd3, 0xd5, 0xd7, 0xdd, /* 0x18-0x1f */ + 0xe2, 0xe4, 0xe6, 0xe8, 0xab, 0xb6, 0xa5, 0xfc, /* 0x20-0x27 */ + 0xf6, 0xfa, 0x9f, 0xf2, 0xee, 0xf8, 0x9d, 0xe0, /* 0x28-0x2f */ + 0xa0, 0xa2, 0xeb, 0xac, 0xa6, 0xa8, 0xe9, 0xf3, /* 0x30-0x37 */ + 0xb7, 0xbd, 0xc6, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, /* 0x38-0x3f */ + 0xe1, 0xe3, 0xe5, 0xe7, 0xaa, 0xb5, 0xa4, 0xfb, /* 0x40-0x47 */ + 0xf5, 0xf9, 0x9e, 0xf1, 0xed, 0xf7, 0x9c, 0xde, /* 0x48-0x4f */ + 0x00, 0x84, 0x80, 0x82, 0x86, 0x88, 0x8a, 0x8c, /* 0x50-0x57 */ + 0x8e, 0x90, 0x92, 0x94, 0x96, 0x00, 0x98, 0x9a, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 
0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x80, 0x82, 0x82, 0x84, 0x84, 0x86, 0x86, /* 0x80-0x87 */ + 0x88, 0x88, 0x8a, 0x8a, 0x8c, 0x8c, 0x8e, 0x8e, /* 0x88-0x8f */ + 0x90, 0x90, 0x92, 0x92, 0x94, 0x94, 0x96, 0x96, /* 0x90-0x97 */ + 0x98, 0x98, 0x9a, 0x9a, 0x9c, 0x9c, 0x9e, 0x9e, /* 0x98-0x9f */ + 0xa0, 0xa0, 0xa2, 0xa2, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa8, 0xaa, 0xaa, 0xac, 0xac, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb5, 0xb7, /* 0xb0-0xb7 */ + 0xb7, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0xd2, 0xd2, 0xd4, 0xd4, 0xd6, 0xd6, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xd8, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xde, 0xe1, 0xe1, 0xe3, 0xe3, 0xe5, 0xe5, 0xe7, /* 0xe0-0xe7 */ + 0xe7, 0xe9, 0xe9, 0xeb, 0xeb, 0xed, 0xed, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, /* 0xf0-0xf7 */ + 0xf7, 0xf9, 0xf9, 0xfb, 0xfb, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f 
*/ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x81, 0x81, 0x83, 0x83, 0x85, 0x85, 0x87, 0x87, /* 0x80-0x87 */ + 0x89, 0x89, 0x8b, 0x8b, 0x8d, 0x8d, 0x8f, 0x8f, /* 0x88-0x8f */ + 0x91, 0x91, 0x93, 0x93, 0x95, 0x95, 0x97, 0x97, /* 0x90-0x97 */ + 0x99, 0x99, 0x9b, 0x9b, 0x9d, 0x9d, 0x9f, 0x9f, /* 0x98-0x9f */ + 0xa1, 0xa1, 0xa3, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa9, 0xa9, 0xab, 0xab, 0xad, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb6, 0xb8, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd3, 0xd3, 0xd5, 0xd5, 0xd7, 0xd7, /* 0xd0-0xd7 */ + 0xdd, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xe0, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe2, 0xe2, 0xe4, 0xe4, 0xe6, 0xe6, 0xe8, /* 0xe0-0xe7 */ + 0xe8, 0xea, 0xea, 0xec, 0xec, 0xee, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, 0xf8, /* 0xf0-0xf7 */ + 0xf8, 0xfa, 0xfa, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp855", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp855(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp855(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp855) +module_exit(exit_nls_cp855) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c new file mode 100644 index 00000000..e418e198 --- /dev/null +++ b/fs/nls/nls_cp857.c @@ -0,0 +1,302 @@ +/* + * linux/fs/nls/nls_cp857.c + * + * Charset cp857 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x0131, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x0130, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x015e, 0x015f, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x011e, 0x011f, + 0x00bf, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x00c0, + 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x00ba, 0x00aa, 0x00ca, 0x00cb, + 0x00c8, 0x0000, 0x00cd, 0x00ce, + 0x00cf, 0x2518, 0x250c, 0x2588, + 0x2584, 0x00a6, 0x00cc, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x00d2, + 0x00f5, 0x00d5, 0x00b5, 0x0000, + 0x00d7, 0x00da, 0x00db, 0x00d9, + 0x00ec, 0x00ff, 0x00af, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x0000, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0xb8, 0xd1, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0xf7, 0xfb, 0xd0, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ + 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ + 0x00, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0xe8, /* 0xd0-0xd7 */ + 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0xec, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0xed, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xa7, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x98, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, 0x9f, /* 0x58-0x5f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x69, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9f, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0x88, 0x89, 0x8a, 0x00, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xec, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xa3, 0x96, 0x97, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ + 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0x49, 0x8e, 0x8f, /* 0x88-0x8f */ + 
0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9e, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xde, 0x00, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp857", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp857(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp857(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp857) +module_exit(exit_nls_cp857) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c new file mode 100644 index 00000000..a86c97d1 --- /dev/null +++ b/fs/nls/nls_cp860.c @@ -0,0 +1,365 @@ +/* + * linux/fs/nls/nls_cp860.c + * + * Charset cp860 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+ /* 0x00*/
+ 0x0000, 0x0001, 0x0002, 0x0003,
+ 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b,
+ 0x000c, 0x000d, 0x000e, 0x000f,
+ /* 0x10*/
+ 0x0010, 0x0011, 0x0012, 0x0013,
+ 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b,
+ 0x001c, 0x001d, 0x001e, 0x001f,
+ /* 0x20*/
+ 0x0020, 0x0021, 0x0022, 0x0023,
+ 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b,
+ 0x002c, 0x002d, 0x002e, 0x002f,
+ /* 0x30*/
+ 0x0030, 0x0031, 0x0032, 0x0033,
+ 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b,
+ 0x003c, 0x003d, 0x003e, 0x003f,
+ /* 0x40*/
+ 0x0040, 0x0041, 0x0042, 0x0043,
+ 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b,
+ 0x004c, 0x004d, 0x004e, 0x004f,
+ /* 0x50*/
+ 0x0050, 0x0051, 0x0052, 0x0053,
+ 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b,
+ 0x005c, 0x005d, 0x005e, 0x005f,
+ /* 0x60*/
+ 0x0060, 0x0061, 0x0062, 0x0063,
+ 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b,
+ 0x006c, 0x006d, 0x006e, 0x006f,
+ /* 0x70*/
+ 0x0070, 0x0071, 0x0072, 0x0073,
+ 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b,
+ 0x007c, 0x007d, 0x007e, 0x007f,
+ /* 0x80*/
+ 0x00c7, 0x00fc, 0x00e9, 0x00e2,
+ 0x00e3, 0x00e0, 0x00c1, 0x00e7,
+ 0x00ea, 0x00ca, 0x00e8, 0x00cd,
+ 0x00d4, 0x00ec, 0x00c3, 0x00c2,
+ /* 0x90*/
+ 0x00c9, 0x00c0, 0x00c8, 0x00f4,
+ 0x00f5, 0x00f2, 0x00da, 0x00f9,
+ 0x00cc, 0x00d5, 0x00dc, 0x00a2,
+ 0x00a3, 0x00d9, 0x20a7, 0x00d3,
+ /* 0xa0*/
+ 0x00e1, 0x00ed, 0x00f3, 0x00fa,
+ 0x00f1, 0x00d1, 0x00aa, 0x00ba,
+ 0x00bf, 0x00d2, 0x00ac, 0x00bd,
+ 0x00bc, 0x00a1, 0x00ab, 0x00bb,
+ /* 0xb0*/
+ 0x2591, 0x2592, 0x2593, 0x2502,
+ 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557,
+ 0x255d, 0x255c, 0x255b, 0x2510,
+ /* 0xc0*/
+ 0x2514, 0x2534, 0x252c, 0x251c,
+ 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566,
+ 0x2560, 0x2550, 0x256c, 0x2567,
+ /* 0xd0*/
+ 0x2568, 0x2564, 0x2565, 0x2559,
+ 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588,
+ 0x2584, 0x258c, 0x2590, 0x2580,
+ /* 0xe0*/
+ 0x03b1, 0x00df, 0x0393, 0x03c0,
+ 0x03a3, 0x03c3, 0x00b5, 0x03c4,
+ 0x03a6, 0x0398, 0x03a9, 0x03b4,
+ 0x221e, 0x03c6, 0x03b5, 0x2229,
+ /* 0xf0*/
+ 0x2261, 0x00b1, 0x2265, 0x2264,
+ 0x2320, 0x2321, 0x00f7, 0x2248,
+ 0x00b0, 0x2219, 0x00b7, 0x221a,
+ 0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x91, 0x86, 0x8f, 0x8e, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x92, 0x90, 0x89, 0x00, 0x98, 0x8b, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0xa9, 0x9f, 0x8c, 0x99, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x9d, 0x96, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x84, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x00, 0x8d, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x94, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0xa0, 0x87, /* 0x80-0x87 */ + 0x88, 0x88, 0x8a, 0xa1, 0x93, 0x8d, 0x84, 0x83, /* 0x88-0x8f */ + 0x82, 0x85, 0x8a, 0x93, 0x94, 0x95, 0xa3, 0x97, /* 0x90-0x97 */ + 0x8d, 0x94, 0x81, 0x9b, 0x9c, 0x97, 0x9e, 0xa2, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x95, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x8f, 0x8e, 0x91, 0x86, 0x80, /* 0x80-0x87 */ + 0x89, 0x89, 0x92, 0x8b, 0x8c, 
0x98, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x8c, 0x99, 0xa9, 0x96, 0x9d, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x86, 0x8b, 0x9f, 0x96, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp860", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp860(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp860(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp860) +module_exit(exit_nls_cp860) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c new file mode 100644 index 00000000..bd920227 --- /dev/null +++ b/fs/nls/nls_cp861.c @@ -0,0 +1,388 @@ +/* + * linux/fs/nls/nls_cp861.c + * + * Charset cp861 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+ /* 0x00*/
+ 0x0000, 0x0001, 0x0002, 0x0003,
+ 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b,
+ 0x000c, 0x000d, 0x000e, 0x000f,
+ /* 0x10*/
+ 0x0010, 0x0011, 0x0012, 0x0013,
+ 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b,
+ 0x001c, 0x001d, 0x001e, 0x001f,
+ /* 0x20*/
+ 0x0020, 0x0021, 0x0022, 0x0023,
+ 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b,
+ 0x002c, 0x002d, 0x002e, 0x002f,
+ /* 0x30*/
+ 0x0030, 0x0031, 0x0032, 0x0033,
+ 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b,
+ 0x003c, 0x003d, 0x003e, 0x003f,
+ /* 0x40*/
+ 0x0040, 0x0041, 0x0042, 0x0043,
+ 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b,
+ 0x004c, 0x004d, 0x004e, 0x004f,
+ /* 0x50*/
+ 0x0050, 0x0051, 0x0052, 0x0053,
+ 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b,
+ 0x005c, 0x005d, 0x005e, 0x005f,
+ /* 0x60*/
+ 0x0060, 0x0061, 0x0062, 0x0063,
+ 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b,
+ 0x006c, 0x006d, 0x006e, 0x006f,
+ /* 0x70*/
+ 0x0070, 0x0071, 0x0072, 0x0073,
+ 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b,
+ 0x007c, 0x007d, 0x007e, 0x007f,
+ /* 0x80*/
+ 0x00c7, 0x00fc, 0x00e9, 0x00e2,
+ 0x00e4, 0x00e0, 0x00e5, 0x00e7,
+ 0x00ea, 0x00eb, 0x00e8, 0x00d0,
+ 0x00f0, 0x00de, 0x00c4, 0x00c5,
+ /* 0x90*/
+ 0x00c9, 0x00e6, 0x00c6, 0x00f4,
+ 0x00f6, 0x00fe, 0x00fb, 0x00dd,
+ 0x00fd, 0x00d6, 0x00dc, 0x00f8,
+ 0x00a3, 0x00d8, 0x20a7, 0x0192,
+ /* 0xa0*/
+ 0x00e1, 0x00ed, 0x00f3, 0x00fa,
+ 0x00c1, 0x00cd, 0x00d3, 0x00da,
+ 0x00bf, 0x2310, 0x00ac, 0x00bd,
+ 0x00bc, 0x00a1, 0x00ab, 0x00bb,
+ /* 0xb0*/
+ 0x2591, 0x2592, 0x2593, 0x2502,
+ 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557,
+ 0x255d, 0x255c, 0x255b, 0x2510,
+ /* 0xc0*/
+ 0x2514, 0x2534, 0x252c, 0x251c,
+ 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566,
+ 0x2560, 0x2550, 0x256c, 0x2567,
+ /* 0xd0*/
+ 0x2568, 0x2564, 0x2565, 0x2559,
+ 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588,
+ 0x2584, 0x258c, 0x2590, 0x2580,
+ /* 0xe0*/
+ 0x03b1, 0x00df, 0x0393, 0x03c0,
+ 0x03a3, 0x03c3, 0x00b5, 0x03c4,
+ 0x03a6, 0x0398, 0x03a9, 0x03b4,
+ 0x221e, 0x03c6, 0x03b5, 0x2229,
+ /* 0xf0*/
+ 0x2261, 0x00b1, 0x2265, 0x2264,
+ 0x2320, 0x2321, 0x00f7, 0x2248,
+ 0x00b0, 0x2219, 0x00b7, 0x221a,
+ 0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0xa4, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, /* 0xc8-0xcf */ + 0x8b, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0xa7, 0x00, 0x9a, 0x97, 0x8d, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x8c, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x00, 0xa3, 0x96, 0x81, 0x98, 0x95, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 
0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8c, 0x8c, 0x95, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa0, 0xa1, 0xa2, 0xa3, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 
0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x8b, 0x8b, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x8d, 0x00, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa4, 0xa5, 0xa6, 0xa7, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp861", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp861(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp861(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp861) +module_exit(exit_nls_cp861) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c new file mode 100644 index 00000000..e9b68eb3 --- /dev/null +++ b/fs/nls/nls_cp862.c @@ -0,0 +1,422 @@ +/* + * linux/fs/nls/nls_cp862.c + * + * Charset cp862 translation tables. 
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+ /* 0x00*/
+ 0x0000, 0x0001, 0x0002, 0x0003,
+ 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b,
+ 0x000c, 0x000d, 0x000e, 0x000f,
+ /* 0x10*/
+ 0x0010, 0x0011, 0x0012, 0x0013,
+ 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b,
+ 0x001c, 0x001d, 0x001e, 0x001f,
+ /* 0x20*/
+ 0x0020, 0x0021, 0x0022, 0x0023,
+ 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b,
+ 0x002c, 0x002d, 0x002e, 0x002f,
+ /* 0x30*/
+ 0x0030, 0x0031, 0x0032, 0x0033,
+ 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b,
+ 0x003c, 0x003d, 0x003e, 0x003f,
+ /* 0x40*/
+ 0x0040, 0x0041, 0x0042, 0x0043,
+ 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b,
+ 0x004c, 0x004d, 0x004e, 0x004f,
+ /* 0x50*/
+ 0x0050, 0x0051, 0x0052, 0x0053,
+ 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b,
+ 0x005c, 0x005d, 0x005e, 0x005f,
+ /* 0x60*/
+ 0x0060, 0x0061, 0x0062, 0x0063,
+ 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b,
+ 0x006c, 0x006d, 0x006e, 0x006f,
+ /* 0x70*/
+ 0x0070, 0x0071, 0x0072, 0x0073,
+ 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b,
+ 0x007c, 0x007d, 0x007e, 0x007f,
+ /* 0x80*/
+ 0x05d0, 0x05d1, 0x05d2, 0x05d3,
+ 0x05d4, 0x05d5, 0x05d6, 0x05d7,
+ 0x05d8, 0x05d9, 0x05da, 0x05db,
+ 0x05dc, 0x05dd, 0x05de, 0x05df,
+ /* 0x90*/
+ 0x05e0, 0x05e1, 0x05e2, 0x05e3,
+ 0x05e4, 0x05e5, 0x05e6, 0x05e7,
+ 0x05e8, 0x05e9, 0x05ea, 0x00a2,
+ 0x00a3, 0x00a5, 0x20a7, 0x0192,
+ /* 0xa0*/
+ 0x00e1, 0x00ed, 0x00f3, 0x00fa,
+ 0x00f1, 0x00d1, 0x00aa, 0x00ba,
+ 0x00bf, 0x2310, 0x00ac, 0x00bd,
+ 0x00bc, 0x00a1, 0x00ab, 0x00bb,
+ /* 0xb0*/
+ 0x2591, 0x2592, 0x2593, 0x2502,
+ 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557,
+ 0x255d, 0x255c, 0x255b, 0x2510,
+ /* 0xc0*/
+ 0x2514, 0x2534, 0x252c, 0x251c,
+ 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566,
+ 0x2560, 0x2550, 0x256c, 0x2567,
+ /* 0xd0*/
+ 0x2568, 0x2564, 0x2565, 0x2559,
+ 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588,
+ 0x2584, 0x258c, 0x2590, 0x2580,
+ /* 0xe0*/
+ 0x03b1, 0x00df, 0x0393, 0x03c0,
+ 0x03a3, 0x03c3, 0x00b5, 0x03c4,
+ 0x03a6, 0x0398, 0x03a9, 0x03b4,
+ 0x221e, 0x03c6, 0x03b5, 0x2229,
+ /* 0xf0*/
+ 0x2261, 0x00b1, 0x2265, 0x2264,
+ 0x2320, 0x2321, 0x00f7, 0x2248,
+ 0x00b0, 0x2219, 0x00b7, 0x221a,
+ 0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0xa4, 0x00, 0xa2, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page05[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xd0-0xd7 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xd8-0xdf */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, page05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp862", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp862(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp862(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp862) +module_exit(exit_nls_cp862) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c new file mode 100644 index 00000000..f8a9b07a --- /dev/null +++ b/fs/nls/nls_cp863.c @@ -0,0 +1,382 @@ +/* + * linux/fs/nls/nls_cp863.c + * + * Charset cp863 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00c2, 0x00e0, 0x00b6, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x2017, 0x00c0, 0x00a7, + /* 0x90*/ + 0x00c9, 0x00c8, 0x00ca, 0x00f4, + 0x00cb, 0x00cf, 0x00fb, 0x00f9, + 0x00a4, 0x00d4, 0x00dc, 0x00a2, + 0x00a3, 0x00d9, 0x00db, 0x0192, + /* 0xa0*/ + 0x00a6, 0x00b4, 0x00f3, 0x00fa, + 0x00a8, 0x00b8, 0x00b3, 0x00af, + 0x00ce, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00be, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x9b, 0x9c, 0x98, 0x00, 0xa0, 0x8f, /* 0xa0-0xa7 */ + 0xa4, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0xa7, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xa6, 0xa1, 0xe6, 0x86, 0xfa, /* 0xb0-0xb7 */ + 0xa5, 0x00, 0x00, 0xaf, 0xac, 0xab, 0xad, 0x00, /* 0xb8-0xbf */ + 0x8e, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x91, 0x90, 0x92, 0x94, 0x00, 0x00, 0xa8, 0x95, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x9d, 0x00, 0x9e, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x00, 0x00, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8d, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x83, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x85, 0x8f, /* 0x88-0x8f */ + 0x82, 0x8a, 0x88, 0x93, 0x89, 0x8b, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x93, 0x81, 0x9b, 0x9c, 0x97, 0x96, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x8c, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 
0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x84, 0x84, 0x8e, 0x86, 0x80, /* 0x80-0x87 */ + 0x92, 0x94, 0x91, 0x95, 0xa8, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x99, 0x94, 0x95, 0x9e, 0x9d, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0x00, 0x00, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp863", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp863(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp863(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp863) +module_exit(exit_nls_cp863) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c new file mode 100644 index 00000000..8d31f435 --- /dev/null +++ b/fs/nls/nls_cp864.c @@ -0,0 +1,408 @@ +/* + * linux/fs/nls/nls_cp864.c + * + * Charset cp864 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x066a, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00b0, 0x00b7, 0x2219, 0x221a, + 0x2592, 0x2500, 0x2502, 0x253c, + 0x2524, 0x252c, 0x251c, 0x2534, + 0x2510, 0x250c, 0x2514, 0x2518, + /* 0x90*/ + 0x03b2, 0x221e, 0x03c6, 0x00b1, + 0x00bd, 0x00bc, 0x2248, 0x00ab, + 0x00bb, 0xfef7, 0xfef8, 0x0000, + 0x0000, 0xfefb, 0xfefc, 0x0000, + /* 0xa0*/ + 0x00a0, 0x00ad, 0xfe82, 0x00a3, + 0x00a4, 0xfe84, 0x0000, 0x0000, + 0xfe8e, 0xfe8f, 0xfe95, 0xfe99, + 0x060c, 0xfe9d, 0xfea1, 0xfea5, + /* 0xb0*/ + 0x0660, 0x0661, 0x0662, 0x0663, + 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0xfed1, 0x061b, + 0xfeb1, 0xfeb5, 0xfeb9, 0x061f, + /* 0xc0*/ + 0x00a2, 0xfe80, 0xfe81, 0xfe83, + 0xfe85, 0xfeca, 0xfe8b, 0xfe8d, + 0xfe91, 0xfe93, 0xfe97, 0xfe9b, + 0xfe9f, 0xfea3, 0xfea7, 0xfea9, + /* 0xd0*/ + 0xfeab, 0xfead, 0xfeaf, 0xfeb3, + 0xfeb7, 0xfebb, 0xfebf, 0xfec1, + 0xfec5, 0xfecb, 0xfecf, 0x00a6, + 0x00ac, 0x00f7, 0x00d7, 0xfec9, + /* 0xe0*/ + 0x0640, 0xfed3, 0xfed7, 0xfedb, + 0xfedf, 0xfee3, 0xfee7, 0xfeeb, + 0xfeed, 0xfeef, 0xfef3, 0xfebd, + 0xfecc, 0xfece, 0xfecd, 0xfee1, + /* 0xf0*/ + 0xfe7d, 0x0651, 0xfee5, 0xfee9, + 0xfeec, 0xfef0, 0xfef2, 0xfed0, + 0xfed5, 0xfef5, 0xfef6, 0xfedd, + 0xfed9, 0xfef1, 0x25a0, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */ + 0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page06[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f 
*/ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */ + 0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ +}; + +static const unsigned char page25[256] = { + 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char pagefe[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */ + + 0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */ + 0x00, 0xc8, 0x00, 0xc9, 0x00, 
0xaa, 0x00, 0xca, /* 0x90-0x97 */ + 0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */ + 0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */ + 0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */ + 0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */ + 0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */ + 0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */ + 0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */ + 0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */ + 0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */ + 0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */ + 0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */ + 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, page06, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, pagefe, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + 
if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp864", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp864(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp864(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp864) +module_exit(exit_nls_cp864) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c new file mode 100644 index 00000000..4bd902fe --- /dev/null +++ b/fs/nls/nls_cp865.c @@ -0,0 +1,388 @@ +/* + * linux/fs/nls/nls_cp865.c + * + * Charset cp865 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00a4, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /*
0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x00, 0x9c, 0xaf, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0x00, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 
0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, 
int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp865", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp865(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp865(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp865) +module_exit(exit_nls_cp865) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c new file mode 100644 index 00000000..bdc7cb39 --- /dev/null +++ b/fs/nls/nls_cp866.c @@ -0,0 +1,306 @@ +/* + * linux/fs/nls/nls_cp866.c + * + * Charset cp866 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0x90*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xa0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x0440, 
0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, + /* 0xf0*/ + 0x0401, 0x0451, 0x0404, 0x0454, + 0x0407, 0x0457, 0x040e, 0x045e, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x2116, 0x00a4, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xf0, 0x00, 0x00, 0xf2, 0x00, 0x00, 0xf4, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0x00, /* 0x08-0x0f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x30-0x37 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0x00, 0xf1, 0x00, 0x00, 0xf3, 0x00, 0x00, 0xf5, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0x00, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 
0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x80-0x87 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x88-0x8f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x90-0x97 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, 0xf7, 
/* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xa0-0xa7 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0xe8-0xef */ + 0xf0, 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp866", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp866(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp866(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp866) +module_exit(exit_nls_cp866) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c new file mode 100644 index 00000000..9f283a2b --- /dev/null +++ b/fs/nls/nls_cp869.c @@ -0,0 +1,316 @@ +/* + * linux/fs/nls/nls_cp869.c + * + * Charset cp869 translation tables. 
+ * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0386, 0x0000, + 0x00b7, 0x00ac, 0x00a6, 0x2018, + 0x2019, 0x0388, 0x2015, 0x0389, + /* 0x90*/ + 0x038a, 0x03aa, 0x038c, 0x0000, + 0x0000, 0x038e, 0x03ab, 0x00a9, + 0x038f, 0x00b2, 0x00b3, 0x03ac, + 0x00a3, 0x03ad, 0x03ae, 0x03af, + /* 0xa0*/ + 0x03ca, 0x0390, 0x03cc, 0x03cd, + 0x0391, 0x0392, 0x0393, 0x0394, + 0x0395, 0x0396, 0x0397, 0x00bd, + 0x0398, 0x0399, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x039a, 0x039b, 0x039c, + 0x039d, 0x2563, 0x2551, 0x2557, + 0x255d, 0x039e, 0x039f, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x03a0, 0x03a1, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x03a3, + /* 0xd0*/ + 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03b1, 0x03b2, + 0x03b3, 0x2518, 0x250c, 0x2588, + 0x2584, 0x03b4, 0x03b5, 0x2580, + /* 0xe0*/ + 0x03b6, 0x03b7, 0x03b8, 0x03b9, + 0x03ba, 0x03bb, 0x03bc, 0x03bd, + 0x03be, 0x03bf, 0x03c0, 0x03c1, + 0x03c3, 0x03c2, 0x03c4, 0x0384, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x03c5, 0x03c6, + 0x03c7, 0x00a7, 0x03c8, 0x0385, + 0x00b0, 0x00a8, 0x03c9, 0x03cb, + 0x03b0, 0x03ce, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */ + 0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */ + 0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */ + 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */ + 0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */ + 0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */ + 0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */ + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */ + 0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */ + 0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */ + 0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 
0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */ + 0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */ + 0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */ + 0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */ + 0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */ + 0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 
0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */ + 0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */ + 0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */ + 0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */ + 0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp869", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp869(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp869(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp869) +module_exit(exit_nls_cp869) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c new file mode 100644 index 00000000..0b3c4886 --- /dev/null +++ b/fs/nls/nls_cp874.c @@ -0,0 +1,276 @@ +/* + * linux/fs/nls/nls_cp874.c + * + * Charset cp874 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2026, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xa0*/ + 0x00a0, 0x0e01, 0x0e02, 0x0e03, + 0x0e04, 0x0e05, 0x0e06, 0x0e07, + 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, + 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, + /* 0xb0*/ + 0x0e10, 0x0e11, 0x0e12, 0x0e13, + 0x0e14, 0x0e15, 0x0e16, 0x0e17, + 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, + 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, + /* 0xc0*/ + 0x0e20, 0x0e21, 0x0e22, 0x0e23, + 0x0e24, 0x0e25, 0x0e26, 0x0e27, + 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, + 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, + /* 0xd0*/ + 0x0e30, 0x0e31, 0x0e32, 0x0e33, + 0x0e34, 0x0e35, 0x0e36, 0x0e37, + 0x0e38, 0x0e39, 0x0e3a, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0e3f, + /* 0xe0*/ + 0x0e40, 0x0e41, 0x0e42, 0x0e43, + 0x0e44, 0x0e45, 0x0e46, 0x0e47, + 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, + 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, + /* 0xf0*/ + 0x0e50, 0x0e51, 0x0e52, 0x0e53, + 0x0e54, 0x0e55, 0x0e56, 0x0e57, + 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char page0e[256] = { + 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x08-0x0f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x00, 0x00, 0x93, 0x94, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page0e, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 
0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp874", + .alias = "tis-620", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp874(void) +{ + return register_nls(&table); +} + 
+static void __exit exit_nls_cp874(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp874) +module_exit(exit_nls_cp874) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(tis-620); diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c new file mode 100644 index 00000000..0ffed6f1 --- /dev/null +++ b/fs/nls/nls_cp932.c @@ -0,0 +1,7934 @@ +/* + * linux/fs/nls/nls_cp932.c + * + * Charset cp932 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3000,0x3001,0x3002,0xFF0C,0xFF0E,0x30FB,0xFF1A,0xFF1B,/* 0x40-0x47 */ + 0xFF1F,0xFF01,0x309B,0x309C,0x00B4,0xFF40,0x00A8,0xFF3E,/* 0x48-0x4F */ + 0xFFE3,0xFF3F,0x30FD,0x30FE,0x309D,0x309E,0x3003,0x4EDD,/* 0x50-0x57 */ + 0x3005,0x3006,0x3007,0x30FC,0x2015,0x2010,0xFF0F,0xFF3C,/* 0x58-0x5F */ + 0xFF5E,0x2225,0xFF5C,0x2026,0x2025,0x2018,0x2019,0x201C,/* 0x60-0x67 */ + 0x201D,0xFF08,0xFF09,0x3014,0x3015,0xFF3B,0xFF3D,0xFF5B,/* 0x68-0x6F */ + 0xFF5D,0x3008,0x3009,0x300A,0x300B,0x300C,0x300D,0x300E,/* 0x70-0x77 */ + 0x300F,0x3010,0x3011,0xFF0B,0xFF0D,0x00B1,0x00D7,0x0000,/* 0x78-0x7F */ + + 0x00F7,0xFF1D,0x2260,0xFF1C,0xFF1E,0x2266,0x2267,0x221E,/* 0x80-0x87 */ + 0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFFE5,/* 0x88-0x8F */ + 0xFF04,0xFFE0,0xFFE1,0xFF05,0xFF03,0xFF06,0xFF0A,0xFF20,/* 0x90-0x97 */ + 0x00A7,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0x98-0x9F */ + 0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x203B,0x3012,/* 0xA0-0xA7 */ + 0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB0-0xB7 */ + 0x2208,0x220B,0x2286,0x2287,0x2282,0x2283,0x222A,0x2229,/* 0xB8-0xBF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x2227,0x2228,0xFFE2,0x21D2,0x21D4,0x2200,0x2203,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */ + 0x0000,0x0000,0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,/* 0xD8-0xDF */ + 0x2252,0x226A,0x226B,0x221A,0x223D,0x221D,0x2235,0x222B,/* 0xE0-0xE7 */ + 0x222C,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */ + 0x212B,0x2030,0x266F,0x266D,0x266A,0x2020,0x2021,0x00B6,/* 0xF0-0xF7 */ + 0x0000,0x0000,0x0000,0x0000,0x25EF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xFF10,/* 0x48-0x4F */ + 0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0x50-0x57 */ + 0xFF19,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,/* 0x60-0x67 */ + 0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,/* 0x68-0x6F */ + 0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,/* 0x70-0x77 */ + 0xFF39,0xFF3A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0x80-0x87 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0x88-0x8F */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0x90-0x97 */ + 0xFF58,0xFF59,0xFF5A,0x0000,0x0000,0x0000,0x0000,0x3041,/* 0x98-0x9F */ + 0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,0x3048,0x3049,/* 0xA0-0xA7 */ + 0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,0x3050,0x3051,/* 0xA8-0xAF */ + 0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,0x3058,0x3059,/* 0xB0-0xB7 */ + 0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,0x3060,0x3061,/* 0xB8-0xBF */ + 0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,0x3068,0x3069,/* 0xC0-0xC7 */ + 0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,0x3070,0x3071,/* 0xC8-0xCF */ + 0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,0x3078,0x3079,/* 0xD0-0xD7 */ + 0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,0x3080,0x3081,/* 0xD8-0xDF */ + 0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,0x3088,0x3089,/* 0xE0-0xE7 */ + 0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,0x3090,0x3091,/* 0xE8-0xEF */ + 0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,0x30A8,/* 0x40-0x47 */ + 0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,0x30B0,/* 0x48-0x4F */ + 0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,0x30B8,/* 0x50-0x57 */ + 0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,0x30C0,/* 0x58-0x5F */ + 0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,0x30C8,/* 0x60-0x67 */ + 0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,0x30D0,/* 0x68-0x6F */ + 0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,0x30D8,/* 0x70-0x77 */ + 0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,0x0000,/* 0x78-0x7F */ + + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0x80-0x87 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0x88-0x8F */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0391,/* 0x98-0x9F */ + 0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,0x0398,0x0399,/* 0xA0-0xA7 */ + 
0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,0x03A0,0x03A1,/* 0xA8-0xAF */ + 0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,0x03A9,0x0000,/* 0xB0-0xB7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x03B1,/* 0xB8-0xBF */ + 0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,/* 0xC0-0xC7 */ + 0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,/* 0xC8-0xCF */ + 0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,0x03C9,0x0000,/* 0xD0-0xD7 */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,0x0416,/* 0x40-0x47 */ + 0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,/* 0x48-0x4F */ + 0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,/* 0x50-0x57 */ + 0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,/* 0x58-0x5F */ + 0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,0x0436,/* 0x70-0x77 */ + 0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x0000,/* 0x78-0x7F */ + + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0x80-0x87 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0x88-0x8F */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2500,/* 0x98-0x9F */ + 0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,0x252C,0x2524,/* 0xA0-0xA7 */ + 0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,0x251B,0x2517,/* 0xA8-0xAF */ + 0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,0x252F,0x2528,/* 0xB0-0xB7 */ + 0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,0x2542,0x0000,/* 0xB8-0xBF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,/* 0x40-0x47 */ + 0x2468,0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x246F,/* 0x48-0x4F */ + 0x2470,0x2471,0x2472,0x2473,0x2160,0x2161,0x2162,0x2163,/* 0x50-0x57 */ + 0x2164,0x2165,0x2166,0x2167,0x2168,0x2169,0x0000,0x3349,/* 0x58-0x5F */ + 0x3314,0x3322,0x334D,0x3318,0x3327,0x3303,0x3336,0x3351,/* 0x60-0x67 */ + 0x3357,0x330D,0x3326,0x3323,0x332B,0x334A,0x333B,0x339C,/* 0x68-0x6F */ + 0x339D,0x339E,0x338E,0x338F,0x33C4,0x33A1,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x337B,0x0000,/* 0x78-0x7F */ + + 0x301D,0x301F,0x2116,0x33CD,0x2121,0x32A4,0x32A5,0x32A6,/* 
0x80-0x87 */ + 0x32A7,0x32A8,0x3231,0x3232,0x3239,0x337E,0x337D,0x337C,/* 0x88-0x8F */ + 0x2252,0x2261,0x222B,0x222E,0x2211,0x221A,0x22A5,0x2220,/* 0x90-0x97 */ + 0x221F,0x22BF,0x2235,0x2229,0x222A,0x0000,0x0000,0x0000,/* 0x98-0x9F */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x4E9C,/* 0x98-0x9F */ + 0x5516,0x5A03,0x963F,0x54C0,0x611B,0x6328,0x59F6,0x9022,/* 0xA0-0xA7 */ + 0x8475,0x831C,0x7A50,0x60AA,0x63E1,0x6E25,0x65ED,0x8466,/* 0xA8-0xAF */ + 0x82A6,0x9BF5,0x6893,0x5727,0x65A1,0x6271,0x5B9B,0x59D0,/* 0xB0-0xB7 */ + 0x867B,0x98F4,0x7D62,0x7DBE,0x9B8E,0x6216,0x7C9F,0x88B7,/* 0xB8-0xBF */ + 0x5B89,0x5EB5,0x6309,0x6697,0x6848,0x95C7,0x978D,0x674F,/* 0xC0-0xC7 */ + 0x4EE5,0x4F0A,0x4F4D,0x4F9D,0x5049,0x56F2,0x5937,0x59D4,/* 0xC8-0xCF */ + 0x5A01,0x5C09,0x60DF,0x610F,0x6170,0x6613,0x6905,0x70BA,/* 0xD0-0xD7 */ + 0x754F,0x7570,0x79FB,0x7DAD,0x7DEF,0x80C3,0x840E,0x8863,/* 0xD8-0xDF */ + 0x8B02,0x9055,0x907A,0x533B,0x4E95,0x4EA5,0x57DF,0x80B2,/* 0xE0-0xE7 */ + 0x90C1,0x78EF,0x4E00,0x58F1,0x6EA2,0x9038,0x7A32,0x8328,/* 0xE8-0xEF */ + 0x828B,0x9C2F,0x5141,0x5370,0x54BD,0x54E1,0x56E0,0x59FB,/* 0xF0-0xF7 */ + 0x5F15,0x98F2,0x6DEB,0x80E4,0x852D,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9662,0x9670,0x96A0,0x97FB,0x540B,0x53F3,0x5B87,0x70CF,/* 0x40-0x47 */ + 0x7FBD,0x8FC2,0x96E8,0x536F,0x9D5C,0x7ABA,0x4E11,0x7893,/* 0x48-0x4F */ + 0x81FC,0x6E26,0x5618,0x5504,0x6B1D,0x851A,0x9C3B,0x59E5,/* 0x50-0x57 */ + 
0x53A9,0x6D66,0x74DC,0x958F,0x5642,0x4E91,0x904B,0x96F2,/* 0x58-0x5F */ + 0x834F,0x990C,0x53E1,0x55B6,0x5B30,0x5F71,0x6620,0x66F3,/* 0x60-0x67 */ + 0x6804,0x6C38,0x6CF3,0x6D29,0x745B,0x76C8,0x7A4E,0x9834,/* 0x68-0x6F */ + 0x82F1,0x885B,0x8A60,0x92ED,0x6DB2,0x75AB,0x76CA,0x99C5,/* 0x70-0x77 */ + 0x60A6,0x8B01,0x8D8A,0x95B2,0x698E,0x53AD,0x5186,0x0000,/* 0x78-0x7F */ + + 0x5712,0x5830,0x5944,0x5BB4,0x5EF6,0x6028,0x63A9,0x63F4,/* 0x80-0x87 */ + 0x6CBF,0x6F14,0x708E,0x7114,0x7159,0x71D5,0x733F,0x7E01,/* 0x88-0x8F */ + 0x8276,0x82D1,0x8597,0x9060,0x925B,0x9D1B,0x5869,0x65BC,/* 0x90-0x97 */ + 0x6C5A,0x7525,0x51F9,0x592E,0x5965,0x5F80,0x5FDC,0x62BC,/* 0x98-0x9F */ + 0x65FA,0x6A2A,0x6B27,0x6BB4,0x738B,0x7FC1,0x8956,0x9D2C,/* 0xA0-0xA7 */ + 0x9D0E,0x9EC4,0x5CA1,0x6C96,0x837B,0x5104,0x5C4B,0x61B6,/* 0xA8-0xAF */ + 0x81C6,0x6876,0x7261,0x4E59,0x4FFA,0x5378,0x6069,0x6E29,/* 0xB0-0xB7 */ + 0x7A4F,0x97F3,0x4E0B,0x5316,0x4EEE,0x4F55,0x4F3D,0x4FA1,/* 0xB8-0xBF */ + 0x4F73,0x52A0,0x53EF,0x5609,0x590F,0x5AC1,0x5BB6,0x5BE1,/* 0xC0-0xC7 */ + 0x79D1,0x6687,0x679C,0x67B6,0x6B4C,0x6CB3,0x706B,0x73C2,/* 0xC8-0xCF */ + 0x798D,0x79BE,0x7A3C,0x7B87,0x82B1,0x82DB,0x8304,0x8377,/* 0xD0-0xD7 */ + 0x83EF,0x83D3,0x8766,0x8AB2,0x5629,0x8CA8,0x8FE6,0x904E,/* 0xD8-0xDF */ + 0x971E,0x868A,0x4FC4,0x5CE8,0x6211,0x7259,0x753B,0x81E5,/* 0xE0-0xE7 */ + 0x82BD,0x86FE,0x8CC0,0x96C5,0x9913,0x99D5,0x4ECB,0x4F1A,/* 0xE8-0xEF */ + 0x89E3,0x56DE,0x584A,0x58CA,0x5EFB,0x5FEB,0x602A,0x6094,/* 0xF0-0xF7 */ + 0x6062,0x61D0,0x6212,0x62D0,0x6539,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B41,0x6666,0x68B0,0x6D77,0x7070,0x754C,0x7686,0x7D75,/* 0x40-0x47 */ + 0x82A5,0x87F9,0x958B,0x968E,0x8C9D,0x51F1,0x52BE,0x5916,/* 0x48-0x4F */ + 0x54B3,0x5BB3,0x5D16,0x6168,0x6982,0x6DAF,0x788D,0x84CB,/* 0x50-0x57 */ + 0x8857,0x8A72,0x93A7,0x9AB8,0x6D6C,0x99A8,0x86D9,0x57A3,/* 0x58-0x5F */ + 0x67FF,0x86CE,0x920E,0x5283,0x5687,0x5404,0x5ED3,0x62E1,/* 0x60-0x67 */ + 0x64B9,0x683C,0x6838,0x6BBB,0x7372,0x78BA,0x7A6B,0x899A,/* 0x68-0x6F */ + 0x89D2,0x8D6B,0x8F03,0x90ED,0x95A3,0x9694,0x9769,0x5B66,/* 0x70-0x77 */ + 0x5CB3,0x697D,0x984D,0x984E,0x639B,0x7B20,0x6A2B,0x0000,/* 0x78-0x7F */ + + 0x6A7F,0x68B6,0x9C0D,0x6F5F,0x5272,0x559D,0x6070,0x62EC,/* 0x80-0x87 */ + 0x6D3B,0x6E07,0x6ED1,0x845B,0x8910,0x8F44,0x4E14,0x9C39,/* 0x88-0x8F */ + 0x53F6,0x691B,0x6A3A,0x9784,0x682A,0x515C,0x7AC3,0x84B2,/* 0x90-0x97 */ + 0x91DC,0x938C,0x565B,0x9D28,0x6822,0x8305,0x8431,0x7CA5,/* 0x98-0x9F */ + 0x5208,0x82C5,0x74E6,0x4E7E,0x4F83,0x51A0,0x5BD2,0x520A,/* 0xA0-0xA7 */ + 0x52D8,0x52E7,0x5DFB,0x559A,0x582A,0x59E6,0x5B8C,0x5B98,/* 0xA8-0xAF */ + 0x5BDB,0x5E72,0x5E79,0x60A3,0x611F,0x6163,0x61BE,0x63DB,/* 0xB0-0xB7 */ + 0x6562,0x67D1,0x6853,0x68FA,0x6B3E,0x6B53,0x6C57,0x6F22,/* 0xB8-0xBF */ + 0x6F97,0x6F45,0x74B0,0x7518,0x76E3,0x770B,0x7AFF,0x7BA1,/* 0xC0-0xC7 */ + 0x7C21,0x7DE9,0x7F36,0x7FF0,0x809D,0x8266,0x839E,0x89B3,/* 0xC8-0xCF */ + 
0x8ACC,0x8CAB,0x9084,0x9451,0x9593,0x9591,0x95A2,0x9665,/* 0xD0-0xD7 */ + 0x97D3,0x9928,0x8218,0x4E38,0x542B,0x5CB8,0x5DCC,0x73A9,/* 0xD8-0xDF */ + 0x764C,0x773C,0x5CA9,0x7FEB,0x8D0B,0x96C1,0x9811,0x9854,/* 0xE0-0xE7 */ + 0x9858,0x4F01,0x4F0E,0x5371,0x559C,0x5668,0x57FA,0x5947,/* 0xE8-0xEF */ + 0x5B09,0x5BC4,0x5C90,0x5E0C,0x5E7E,0x5FCC,0x63EE,0x673A,/* 0xF0-0xF7 */ + 0x65D7,0x65E2,0x671F,0x68CB,0x68C4,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A5F,0x5E30,0x6BC5,0x6C17,0x6C7D,0x757F,0x7948,0x5B63,/* 0x40-0x47 */ + 0x7A00,0x7D00,0x5FBD,0x898F,0x8A18,0x8CB4,0x8D77,0x8ECC,/* 0x48-0x4F */ + 0x8F1D,0x98E2,0x9A0E,0x9B3C,0x4E80,0x507D,0x5100,0x5993,/* 0x50-0x57 */ + 0x5B9C,0x622F,0x6280,0x64EC,0x6B3A,0x72A0,0x7591,0x7947,/* 0x58-0x5F */ + 0x7FA9,0x87FB,0x8ABC,0x8B70,0x63AC,0x83CA,0x97A0,0x5409,/* 0x60-0x67 */ + 0x5403,0x55AB,0x6854,0x6A58,0x8A70,0x7827,0x6775,0x9ECD,/* 0x68-0x6F */ + 0x5374,0x5BA2,0x811A,0x8650,0x9006,0x4E18,0x4E45,0x4EC7,/* 0x70-0x77 */ + 0x4F11,0x53CA,0x5438,0x5BAE,0x5F13,0x6025,0x6551,0x0000,/* 0x78-0x7F */ + + 0x673D,0x6C42,0x6C72,0x6CE3,0x7078,0x7403,0x7A76,0x7AAE,/* 0x80-0x87 */ + 0x7B08,0x7D1A,0x7CFE,0x7D66,0x65E7,0x725B,0x53BB,0x5C45,/* 0x88-0x8F */ + 0x5DE8,0x62D2,0x62E0,0x6319,0x6E20,0x865A,0x8A31,0x8DDD,/* 0x90-0x97 */ + 0x92F8,0x6F01,0x79A6,0x9B5A,0x4EA8,0x4EAB,0x4EAC,0x4F9B,/* 0x98-0x9F */ + 0x4FA0,0x50D1,0x5147,0x7AF6,0x5171,0x51F6,0x5354,0x5321,/* 0xA0-0xA7 */ + 0x537F,0x53EB,0x55AC,0x5883,0x5CE1,0x5F37,0x5F4A,0x602F,/* 0xA8-0xAF */ + 0x6050,0x606D,0x631F,0x6559,0x6A4B,0x6CC1,0x72C2,0x72ED,/* 0xB0-0xB7 */ + 0x77EF,0x80F8,0x8105,0x8208,0x854E,0x90F7,0x93E1,0x97FF,/* 0xB8-0xBF */ + 0x9957,0x9A5A,0x4EF0,0x51DD,0x5C2D,0x6681,0x696D,0x5C40,/* 0xC0-0xC7 */ + 0x66F2,0x6975,0x7389,0x6850,0x7C81,0x50C5,0x52E4,0x5747,/* 0xC8-0xCF */ + 0x5DFE,0x9326,0x65A4,0x6B23,0x6B3D,0x7434,0x7981,0x79BD,/* 0xD0-0xD7 */ + 0x7B4B,0x7DCA,0x82B9,0x83CC,0x887F,0x895F,0x8B39,0x8FD1,/* 0xD8-0xDF */ + 0x91D1,0x541F,0x9280,0x4E5D,0x5036,0x53E5,0x533A,0x72D7,/* 0xE0-0xE7 */ + 0x7396,0x77E9,0x82E6,0x8EAF,0x99C6,0x99C8,0x99D2,0x5177,/* 0xE8-0xEF */ + 0x611A,0x865E,0x55B0,0x7A7A,0x5076,0x5BD3,0x9047,0x9685,/* 0xF0-0xF7 */ + 0x4E32,0x6ADB,0x91E7,0x5C51,0x5C48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6398,0x7A9F,0x6C93,0x9774,0x8F61,0x7AAA,0x718A,0x9688,/* 
0x40-0x47 */ + 0x7C82,0x6817,0x7E70,0x6851,0x936C,0x52F2,0x541B,0x85AB,/* 0x48-0x4F */ + 0x8A13,0x7FA4,0x8ECD,0x90E1,0x5366,0x8888,0x7941,0x4FC2,/* 0x50-0x57 */ + 0x50BE,0x5211,0x5144,0x5553,0x572D,0x73EA,0x578B,0x5951,/* 0x58-0x5F */ + 0x5F62,0x5F84,0x6075,0x6176,0x6167,0x61A9,0x63B2,0x643A,/* 0x60-0x67 */ + 0x656C,0x666F,0x6842,0x6E13,0x7566,0x7A3D,0x7CFB,0x7D4C,/* 0x68-0x6F */ + 0x7D99,0x7E4B,0x7F6B,0x830E,0x834A,0x86CD,0x8A08,0x8A63,/* 0x70-0x77 */ + 0x8B66,0x8EFD,0x981A,0x9D8F,0x82B8,0x8FCE,0x9BE8,0x0000,/* 0x78-0x7F */ + + 0x5287,0x621F,0x6483,0x6FC0,0x9699,0x6841,0x5091,0x6B20,/* 0x80-0x87 */ + 0x6C7A,0x6F54,0x7A74,0x7D50,0x8840,0x8A23,0x6708,0x4EF6,/* 0x88-0x8F */ + 0x5039,0x5026,0x5065,0x517C,0x5238,0x5263,0x55A7,0x570F,/* 0x90-0x97 */ + 0x5805,0x5ACC,0x5EFA,0x61B2,0x61F8,0x62F3,0x6372,0x691C,/* 0x98-0x9F */ + 0x6A29,0x727D,0x72AC,0x732E,0x7814,0x786F,0x7D79,0x770C,/* 0xA0-0xA7 */ + 0x80A9,0x898B,0x8B19,0x8CE2,0x8ED2,0x9063,0x9375,0x967A,/* 0xA8-0xAF */ + 0x9855,0x9A13,0x9E78,0x5143,0x539F,0x53B3,0x5E7B,0x5F26,/* 0xB0-0xB7 */ + 0x6E1B,0x6E90,0x7384,0x73FE,0x7D43,0x8237,0x8A00,0x8AFA,/* 0xB8-0xBF */ + 0x9650,0x4E4E,0x500B,0x53E4,0x547C,0x56FA,0x59D1,0x5B64,/* 0xC0-0xC7 */ + 0x5DF1,0x5EAB,0x5F27,0x6238,0x6545,0x67AF,0x6E56,0x72D0,/* 0xC8-0xCF */ + 0x7CCA,0x88B4,0x80A1,0x80E1,0x83F0,0x864E,0x8A87,0x8DE8,/* 0xD0-0xD7 */ + 0x9237,0x96C7,0x9867,0x9F13,0x4E94,0x4E92,0x4F0D,0x5348,/* 0xD8-0xDF */ + 0x5449,0x543E,0x5A2F,0x5F8C,0x5FA1,0x609F,0x68A7,0x6A8E,/* 0xE0-0xE7 */ + 0x745A,0x7881,0x8A9E,0x8AA4,0x8B77,0x9190,0x4E5E,0x9BC9,/* 0xE8-0xEF */ + 0x4EA4,0x4F7C,0x4FAF,0x5019,0x5016,0x5149,0x516C,0x529F,/* 0xF0-0xF7 */ + 0x52B9,0x52FE,0x539A,0x53E3,0x5411,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x540E,0x5589,0x5751,0x57A2,0x597D,0x5B54,0x5B5D,0x5B8F,/* 0x40-0x47 */ + 0x5DE5,0x5DE7,0x5DF7,0x5E78,0x5E83,0x5E9A,0x5EB7,0x5F18,/* 0x48-0x4F */ + 0x6052,0x614C,0x6297,0x62D8,0x63A7,0x653B,0x6602,0x6643,/* 0x50-0x57 */ + 0x66F4,0x676D,0x6821,0x6897,0x69CB,0x6C5F,0x6D2A,0x6D69,/* 0x58-0x5F */ + 0x6E2F,0x6E9D,0x7532,0x7687,0x786C,0x7A3F,0x7CE0,0x7D05,/* 0x60-0x67 */ + 0x7D18,0x7D5E,0x7DB1,0x8015,0x8003,0x80AF,0x80B1,0x8154,/* 0x68-0x6F */ + 0x818F,0x822A,0x8352,0x884C,0x8861,0x8B1B,0x8CA2,0x8CFC,/* 0x70-0x77 */ + 0x90CA,0x9175,0x9271,0x783F,0x92FC,0x95A4,0x964D,0x0000,/* 0x78-0x7F */ + + 0x9805,0x9999,0x9AD8,0x9D3B,0x525B,0x52AB,0x53F7,0x5408,/* 0x80-0x87 */ + 0x58D5,0x62F7,0x6FE0,0x8C6A,0x8F5F,0x9EB9,0x514B,0x523B,/* 0x88-0x8F */ + 0x544A,0x56FD,0x7A40,0x9177,0x9D60,0x9ED2,0x7344,0x6F09,/* 0x90-0x97 */ + 0x8170,0x7511,0x5FFD,0x60DA,0x9AA8,0x72DB,0x8FBC,0x6B64,/* 0x98-0x9F */ + 0x9803,0x4ECA,0x56F0,0x5764,0x58BE,0x5A5A,0x6068,0x61C7,/* 0xA0-0xA7 */ + 0x660F,0x6606,0x6839,0x68B1,0x6DF7,0x75D5,0x7D3A,0x826E,/* 0xA8-0xAF */ + 0x9B42,0x4E9B,0x4F50,0x53C9,0x5506,0x5D6F,0x5DE6,0x5DEE,/* 0xB0-0xB7 */ + 0x67FB,0x6C99,0x7473,0x7802,0x8A50,0x9396,0x88DF,0x5750,/* 0xB8-0xBF */ + 
0x5EA7,0x632B,0x50B5,0x50AC,0x518D,0x6700,0x54C9,0x585E,/* 0xC0-0xC7 */ + 0x59BB,0x5BB0,0x5F69,0x624D,0x63A1,0x683D,0x6B73,0x6E08,/* 0xC8-0xCF */ + 0x707D,0x91C7,0x7280,0x7815,0x7826,0x796D,0x658E,0x7D30,/* 0xD0-0xD7 */ + 0x83DC,0x88C1,0x8F09,0x969B,0x5264,0x5728,0x6750,0x7F6A,/* 0xD8-0xDF */ + 0x8CA1,0x51B4,0x5742,0x962A,0x583A,0x698A,0x80B4,0x54B2,/* 0xE0-0xE7 */ + 0x5D0E,0x57FC,0x7895,0x9DFA,0x4F5C,0x524A,0x548B,0x643E,/* 0xE8-0xEF */ + 0x6628,0x6714,0x67F5,0x7A84,0x7B56,0x7D22,0x932F,0x685C,/* 0xF0-0xF7 */ + 0x9BAD,0x7B39,0x5319,0x518A,0x5237,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5BDF,0x62F6,0x64AE,0x64E6,0x672D,0x6BBA,0x85A9,0x96D1,/* 0x40-0x47 */ + 0x7690,0x9BD6,0x634C,0x9306,0x9BAB,0x76BF,0x6652,0x4E09,/* 0x48-0x4F */ + 0x5098,0x53C2,0x5C71,0x60E8,0x6492,0x6563,0x685F,0x71E6,/* 0x50-0x57 */ + 0x73CA,0x7523,0x7B97,0x7E82,0x8695,0x8B83,0x8CDB,0x9178,/* 0x58-0x5F */ + 0x9910,0x65AC,0x66AB,0x6B8B,0x4ED5,0x4ED4,0x4F3A,0x4F7F,/* 0x60-0x67 */ + 0x523A,0x53F8,0x53F2,0x55E3,0x56DB,0x58EB,0x59CB,0x59C9,/* 0x68-0x6F */ + 0x59FF,0x5B50,0x5C4D,0x5E02,0x5E2B,0x5FD7,0x601D,0x6307,/* 0x70-0x77 */ + 0x652F,0x5B5C,0x65AF,0x65BD,0x65E8,0x679D,0x6B62,0x0000,/* 0x78-0x7F */ + + 0x6B7B,0x6C0F,0x7345,0x7949,0x79C1,0x7CF8,0x7D19,0x7D2B,/* 0x80-0x87 */ + 0x80A2,0x8102,0x81F3,0x8996,0x8A5E,0x8A69,0x8A66,0x8A8C,/* 0x88-0x8F */ + 0x8AEE,0x8CC7,0x8CDC,0x96CC,0x98FC,0x6B6F,0x4E8B,0x4F3C,/* 0x90-0x97 */ + 0x4F8D,0x5150,0x5B57,0x5BFA,0x6148,0x6301,0x6642,0x6B21,/* 0x98-0x9F */ + 0x6ECB,0x6CBB,0x723E,0x74BD,0x75D4,0x78C1,0x793A,0x800C,/* 0xA0-0xA7 */ + 0x8033,0x81EA,0x8494,0x8F9E,0x6C50,0x9E7F,0x5F0F,0x8B58,/* 0xA8-0xAF */ + 0x9D2B,0x7AFA,0x8EF8,0x5B8D,0x96EB,0x4E03,0x53F1,0x57F7,/* 0xB0-0xB7 */ + 0x5931,0x5AC9,0x5BA4,0x6089,0x6E7F,0x6F06,0x75BE,0x8CEA,/* 0xB8-0xBF */ + 0x5B9F,0x8500,0x7BE0,0x5072,0x67F4,0x829D,0x5C61,0x854A,/* 0xC0-0xC7 */ + 0x7E1E,0x820E,0x5199,0x5C04,0x6368,0x8D66,0x659C,0x716E,/* 0xC8-0xCF */ + 0x793E,0x7D17,0x8005,0x8B1D,0x8ECA,0x906E,0x86C7,0x90AA,/* 0xD0-0xD7 */ + 0x501F,0x52FA,0x5C3A,0x6753,0x707C,0x7235,0x914C,0x91C8,/* 0xD8-0xDF */ + 0x932B,0x82E5,0x5BC2,0x5F31,0x60F9,0x4E3B,0x53D6,0x5B88,/* 0xE0-0xE7 */ + 0x624B,0x6731,0x6B8A,0x72E9,0x73E0,0x7A2E,0x816B,0x8DA3,/* 0xE8-0xEF */ + 0x9152,0x9996,0x5112,0x53D7,0x546A,0x5BFF,0x6388,0x6A39,/* 0xF0-0xF7 */ + 0x7DAC,0x9700,0x56DA,0x53CE,0x5468,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5B97,0x5C31,0x5DDE,0x4FEE,0x6101,0x62FE,0x6D32,0x79C0,/* 0x40-0x47 */ + 0x79CB,0x7D42,0x7E4D,0x7FD2,0x81ED,0x821F,0x8490,0x8846,/* 0x48-0x4F */ + 0x8972,0x8B90,0x8E74,0x8F2F,0x9031,0x914B,0x916C,0x96C6,/* 0x50-0x57 */ + 0x919C,0x4EC0,0x4F4F,0x5145,0x5341,0x5F93,0x620E,0x67D4,/* 0x58-0x5F */ + 0x6C41,0x6E0B,0x7363,0x7E26,0x91CD,0x9283,0x53D4,0x5919,/* 0x60-0x67 */ + 0x5BBF,0x6DD1,0x795D,0x7E2E,0x7C9B,0x587E,0x719F,0x51FA,/* 0x68-0x6F */ + 0x8853,0x8FF0,0x4FCA,0x5CFB,0x6625,0x77AC,0x7AE3,0x821C,/* 0x70-0x77 */ + 0x99FF,0x51C6,0x5FAA,0x65EC,0x696F,0x6B89,0x6DF3,0x0000,/* 0x78-0x7F */ + + 0x6E96,0x6F64,0x76FE,0x7D14,0x5DE1,0x9075,0x9187,0x9806,/* 0x80-0x87 */ + 0x51E6,0x521D,0x6240,0x6691,0x66D9,0x6E1A,0x5EB6,0x7DD2,/* 0x88-0x8F */ + 0x7F72,0x66F8,0x85AF,0x85F7,0x8AF8,0x52A9,0x53D9,0x5973,/* 0x90-0x97 */ + 0x5E8F,0x5F90,0x6055,0x92E4,0x9664,0x50B7,0x511F,0x52DD,/* 0x98-0x9F */ + 0x5320,0x5347,0x53EC,0x54E8,0x5546,0x5531,0x5617,0x5968,/* 0xA0-0xA7 */ + 0x59BE,0x5A3C,0x5BB5,0x5C06,0x5C0F,0x5C11,0x5C1A,0x5E84,/* 0xA8-0xAF */ + 0x5E8A,0x5EE0,0x5F70,0x627F,0x6284,0x62DB,0x638C,0x6377,/* 0xB0-0xB7 */ + 0x6607,0x660C,0x662D,0x6676,0x677E,0x68A2,0x6A1F,0x6A35,/* 0xB8-0xBF */ + 0x6CBC,0x6D88,0x6E09,0x6E58,0x713C,0x7126,0x7167,0x75C7,/* 0xC0-0xC7 */ + 0x7701,0x785D,0x7901,0x7965,0x79F0,0x7AE0,0x7B11,0x7CA7,/* 0xC8-0xCF */ + 0x7D39,0x8096,0x83D6,0x848B,0x8549,0x885D,0x88F3,0x8A1F,/* 0xD0-0xD7 */ + 0x8A3C,0x8A54,0x8A73,0x8C61,0x8CDE,0x91A4,0x9266,0x937E,/* 0xD8-0xDF */ + 0x9418,0x969C,0x9798,0x4E0A,0x4E08,0x4E1E,0x4E57,0x5197,/* 0xE0-0xE7 */ + 0x5270,0x57CE,0x5834,0x58CC,0x5B22,0x5E38,0x60C5,0x64FE,/* 0xE8-0xEF */ + 0x6761,0x6756,0x6D44,0x72B6,0x7573,0x7A63,0x84B8,0x8B72,/* 0xF0-0xF7 */ + 0x91B8,0x9320,0x5631,0x57F4,0x98FE,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x62ED,0x690D,0x6B96,0x71ED,0x7E54,0x8077,0x8272,0x89E6,/* 0x40-0x47 */ + 0x98DF,0x8755,0x8FB1,0x5C3B,0x4F38,0x4FE1,0x4FB5,0x5507,/* 0x48-0x4F */ + 0x5A20,0x5BDD,0x5BE9,0x5FC3,0x614E,0x632F,0x65B0,0x664B,/* 0x50-0x57 */ + 0x68EE,0x699B,0x6D78,0x6DF1,0x7533,0x75B9,0x771F,0x795E,/* 0x58-0x5F */ + 0x79E6,0x7D33,0x81E3,0x82AF,0x85AA,0x89AA,0x8A3A,0x8EAB,/* 0x60-0x67 */ + 0x8F9B,0x9032,0x91DD,0x9707,0x4EBA,0x4EC1,0x5203,0x5875,/* 0x68-0x6F */ + 0x58EC,0x5C0B,0x751A,0x5C3D,0x814E,0x8A0A,0x8FC5,0x9663,/* 0x70-0x77 */ + 0x976D,0x7B25,0x8ACF,0x9808,0x9162,0x56F3,0x53A8,0x0000,/* 0x78-0x7F */ + + 0x9017,0x5439,0x5782,0x5E25,0x63A8,0x6C34,0x708A,0x7761,/* 0x80-0x87 */ + 0x7C8B,0x7FE0,0x8870,0x9042,0x9154,0x9310,0x9318,0x968F,/* 0x88-0x8F */ + 0x745E,0x9AC4,0x5D07,0x5D69,0x6570,0x67A2,0x8DA8,0x96DB,/* 0x90-0x97 */ + 0x636E,0x6749,0x6919,0x83C5,0x9817,0x96C0,0x88FE,0x6F84,/* 0x98-0x9F */ + 0x647A,0x5BF8,0x4E16,0x702C,0x755D,0x662F,0x51C4,0x5236,/* 0xA0-0xA7 */ + 0x52E2,0x59D3,0x5F81,0x6027,0x6210,0x653F,0x6574,0x661F,/* 0xA8-0xAF */ + 
0x6674,0x68F2,0x6816,0x6B63,0x6E05,0x7272,0x751F,0x76DB,/* 0xB0-0xB7 */ + 0x7CBE,0x8056,0x58F0,0x88FD,0x897F,0x8AA0,0x8A93,0x8ACB,/* 0xB8-0xBF */ + 0x901D,0x9192,0x9752,0x9759,0x6589,0x7A0E,0x8106,0x96BB,/* 0xC0-0xC7 */ + 0x5E2D,0x60DC,0x621A,0x65A5,0x6614,0x6790,0x77F3,0x7A4D,/* 0xC8-0xCF */ + 0x7C4D,0x7E3E,0x810A,0x8CAC,0x8D64,0x8DE1,0x8E5F,0x78A9,/* 0xD0-0xD7 */ + 0x5207,0x62D9,0x63A5,0x6442,0x6298,0x8A2D,0x7A83,0x7BC0,/* 0xD8-0xDF */ + 0x8AAC,0x96EA,0x7D76,0x820C,0x8749,0x4ED9,0x5148,0x5343,/* 0xE0-0xE7 */ + 0x5360,0x5BA3,0x5C02,0x5C16,0x5DDD,0x6226,0x6247,0x64B0,/* 0xE8-0xEF */ + 0x6813,0x6834,0x6CC9,0x6D45,0x6D17,0x67D3,0x6F5C,0x714E,/* 0xF0-0xF7 */ + 0x717D,0x65CB,0x7A7F,0x7BAD,0x7DDA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E4A,0x7FA8,0x817A,0x821B,0x8239,0x85A6,0x8A6E,0x8CCE,/* 0x40-0x47 */ + 0x8DF5,0x9078,0x9077,0x92AD,0x9291,0x9583,0x9BAE,0x524D,/* 0x48-0x4F */ + 0x5584,0x6F38,0x7136,0x5168,0x7985,0x7E55,0x81B3,0x7CCE,/* 0x50-0x57 */ + 0x564C,0x5851,0x5CA8,0x63AA,0x66FE,0x66FD,0x695A,0x72D9,/* 0x58-0x5F */ + 0x758F,0x758E,0x790E,0x7956,0x79DF,0x7C97,0x7D20,0x7D44,/* 0x60-0x67 */ + 0x8607,0x8A34,0x963B,0x9061,0x9F20,0x50E7,0x5275,0x53CC,/* 0x68-0x6F */ + 0x53E2,0x5009,0x55AA,0x58EE,0x594F,0x723D,0x5B8B,0x5C64,/* 0x70-0x77 */ + 0x531D,0x60E3,0x60F3,0x635C,0x6383,0x633F,0x63BB,0x0000,/* 0x78-0x7F */ + + 0x64CD,0x65E9,0x66F9,0x5DE3,0x69CD,0x69FD,0x6F15,0x71E5,/* 0x80-0x87 */ + 0x4E89,0x75E9,0x76F8,0x7A93,0x7CDF,0x7DCF,0x7D9C,0x8061,/* 0x88-0x8F */ + 0x8349,0x8358,0x846C,0x84BC,0x85FB,0x88C5,0x8D70,0x9001,/* 0x90-0x97 */ + 0x906D,0x9397,0x971C,0x9A12,0x50CF,0x5897,0x618E,0x81D3,/* 0x98-0x9F */ + 0x8535,0x8D08,0x9020,0x4FC3,0x5074,0x5247,0x5373,0x606F,/* 0xA0-0xA7 */ + 0x6349,0x675F,0x6E2C,0x8DB3,0x901F,0x4FD7,0x5C5E,0x8CCA,/* 0xA8-0xAF */ + 0x65CF,0x7D9A,0x5352,0x8896,0x5176,0x63C3,0x5B58,0x5B6B,/* 0xB0-0xB7 */ + 0x5C0A,0x640D,0x6751,0x905C,0x4ED6,0x591A,0x592A,0x6C70,/* 0xB8-0xBF */ + 0x8A51,0x553E,0x5815,0x59A5,0x60F0,0x6253,0x67C1,0x8235,/* 0xC0-0xC7 */ + 0x6955,0x9640,0x99C4,0x9A28,0x4F53,0x5806,0x5BFE,0x8010,/* 0xC8-0xCF */ + 0x5CB1,0x5E2F,0x5F85,0x6020,0x614B,0x6234,0x66FF,0x6CF0,/* 0xD0-0xD7 */ + 0x6EDE,0x80CE,0x817F,0x82D4,0x888B,0x8CB8,0x9000,0x902E,/* 0xD8-0xDF */ + 0x968A,0x9EDB,0x9BDB,0x4EE3,0x53F0,0x5927,0x7B2C,0x918D,/* 0xE0-0xE7 */ + 0x984C,0x9DF9,0x6EDD,0x7027,0x5353,0x5544,0x5B85,0x6258,/* 0xE8-0xEF */ + 0x629E,0x62D3,0x6CA2,0x6FEF,0x7422,0x8A17,0x9438,0x6FC1,/* 0xF0-0xF7 */ + 0x8AFE,0x8338,0x51E7,0x86F8,0x53EA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x53E9,0x4F46,0x9054,0x8FB0,0x596A,0x8131,0x5DFD,0x7AEA,/* 0x40-0x47 */ + 0x8FBF,0x68DA,0x8C37,0x72F8,0x9C48,0x6A3D,0x8AB0,0x4E39,/* 0x48-0x4F */ + 0x5358,0x5606,0x5766,0x62C5,0x63A2,0x65E6,0x6B4E,0x6DE1,/* 0x50-0x57 */ + 0x6E5B,0x70AD,0x77ED,0x7AEF,0x7BAA,0x7DBB,0x803D,0x80C6,/* 0x58-0x5F */ + 0x86CB,0x8A95,0x935B,0x56E3,0x58C7,0x5F3E,0x65AD,0x6696,/* 0x60-0x67 */ + 0x6A80,0x6BB5,0x7537,0x8AC7,0x5024,0x77E5,0x5730,0x5F1B,/* 0x68-0x6F */ + 0x6065,0x667A,0x6C60,0x75F4,0x7A1A,0x7F6E,0x81F4,0x8718,/* 0x70-0x77 */ + 0x9045,0x99B3,0x7BC9,0x755C,0x7AF9,0x7B51,0x84C4,0x0000,/* 0x78-0x7F */ + + 0x9010,0x79E9,0x7A92,0x8336,0x5AE1,0x7740,0x4E2D,0x4EF2,/* 0x80-0x87 */ + 0x5B99,0x5FE0,0x62BD,0x663C,0x67F1,0x6CE8,0x866B,0x8877,/* 0x88-0x8F */ + 0x8A3B,0x914E,0x92F3,0x99D0,0x6A17,0x7026,0x732A,0x82E7,/* 0x90-0x97 */ + 0x8457,0x8CAF,0x4E01,0x5146,0x51CB,0x558B,0x5BF5,0x5E16,/* 0x98-0x9F */ + 0x5E33,0x5E81,0x5F14,0x5F35,0x5F6B,0x5FB4,0x61F2,0x6311,/* 0xA0-0xA7 */ + 0x66A2,0x671D,0x6F6E,0x7252,0x753A,0x773A,0x8074,0x8139,/* 0xA8-0xAF */ + 0x8178,0x8776,0x8ABF,0x8ADC,0x8D85,0x8DF3,0x929A,0x9577,/* 0xB0-0xB7 */ + 0x9802,0x9CE5,0x52C5,0x6357,0x76F4,0x6715,0x6C88,0x73CD,/* 0xB8-0xBF */ + 0x8CC3,0x93AE,0x9673,0x6D25,0x589C,0x690E,0x69CC,0x8FFD,/* 0xC0-0xC7 */ + 0x939A,0x75DB,0x901A,0x585A,0x6802,0x63B4,0x69FB,0x4F43,/* 0xC8-0xCF */ + 0x6F2C,0x67D8,0x8FBB,0x8526,0x7DB4,0x9354,0x693F,0x6F70,/* 0xD0-0xD7 */ + 0x576A,0x58F7,0x5B2C,0x7D2C,0x722A,0x540A,0x91E3,0x9DB4,/* 0xD8-0xDF */ + 0x4EAD,0x4F4E,0x505C,0x5075,0x5243,0x8C9E,0x5448,0x5824,/* 0xE0-0xE7 */ + 0x5B9A,0x5E1D,0x5E95,0x5EAD,0x5EF7,0x5F1F,0x608C,0x62B5,/* 0xE8-0xEF */ + 0x633A,0x63D0,0x68AF,0x6C40,0x7887,0x798E,0x7A0B,0x7DE0,/* 0xF0-0xF7 */ + 0x8247,0x8A02,0x8AE6,0x8E44,0x9013,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x90B8,0x912D,0x91D8,0x9F0E,0x6CE5,0x6458,0x64E2,0x6575,/* 0x40-0x47 */ + 0x6EF4,0x7684,0x7B1B,0x9069,0x93D1,0x6EBA,0x54F2,0x5FB9,/* 0x48-0x4F */ + 0x64A4,0x8F4D,0x8FED,0x9244,0x5178,0x586B,0x5929,0x5C55,/* 0x50-0x57 */ + 0x5E97,0x6DFB,0x7E8F,0x751C,0x8CBC,0x8EE2,0x985B,0x70B9,/* 0x58-0x5F */ + 0x4F1D,0x6BBF,0x6FB1,0x7530,0x96FB,0x514E,0x5410,0x5835,/* 0x60-0x67 */ + 0x5857,0x59AC,0x5C60,0x5F92,0x6597,0x675C,0x6E21,0x767B,/* 0x68-0x6F */ + 0x83DF,0x8CED,0x9014,0x90FD,0x934D,0x7825,0x783A,0x52AA,/* 0x70-0x77 */ + 0x5EA6,0x571F,0x5974,0x6012,0x5012,0x515A,0x51AC,0x0000,/* 0x78-0x7F */ + + 0x51CD,0x5200,0x5510,0x5854,0x5858,0x5957,0x5B95,0x5CF6,/* 0x80-0x87 */ + 0x5D8B,0x60BC,0x6295,0x642D,0x6771,0x6843,0x68BC,0x68DF,/* 0x88-0x8F */ + 0x76D7,0x6DD8,0x6E6F,0x6D9B,0x706F,0x71C8,0x5F53,0x75D8,/* 0x90-0x97 */ + 0x7977,0x7B49,0x7B54,0x7B52,0x7CD6,0x7D71,0x5230,0x8463,/* 0x98-0x9F */ + 
0x8569,0x85E4,0x8A0E,0x8B04,0x8C46,0x8E0F,0x9003,0x900F,/* 0xA0-0xA7 */ + 0x9419,0x9676,0x982D,0x9A30,0x95D8,0x50CD,0x52D5,0x540C,/* 0xA8-0xAF */ + 0x5802,0x5C0E,0x61A7,0x649E,0x6D1E,0x77B3,0x7AE5,0x80F4,/* 0xB0-0xB7 */ + 0x8404,0x9053,0x9285,0x5CE0,0x9D07,0x533F,0x5F97,0x5FB3,/* 0xB8-0xBF */ + 0x6D9C,0x7279,0x7763,0x79BF,0x7BE4,0x6BD2,0x72EC,0x8AAD,/* 0xC0-0xC7 */ + 0x6803,0x6A61,0x51F8,0x7A81,0x6934,0x5C4A,0x9CF6,0x82EB,/* 0xC8-0xCF */ + 0x5BC5,0x9149,0x701E,0x5678,0x5C6F,0x60C7,0x6566,0x6C8C,/* 0xD0-0xD7 */ + 0x8C5A,0x9041,0x9813,0x5451,0x66C7,0x920D,0x5948,0x90A3,/* 0xD8-0xDF */ + 0x5185,0x4E4D,0x51EA,0x8599,0x8B0E,0x7058,0x637A,0x934B,/* 0xE0-0xE7 */ + 0x6962,0x99B4,0x7E04,0x7577,0x5357,0x6960,0x8EDF,0x96E3,/* 0xE8-0xEF */ + 0x6C5D,0x4E8C,0x5C3C,0x5F10,0x8FE9,0x5302,0x8CD1,0x8089,/* 0xF0-0xF7 */ + 0x8679,0x5EFF,0x65E5,0x4E73,0x5165,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5982,0x5C3F,0x97EE,0x4EFB,0x598A,0x5FCD,0x8A8D,0x6FE1,/* 0x40-0x47 */ + 0x79B0,0x7962,0x5BE7,0x8471,0x732B,0x71B1,0x5E74,0x5FF5,/* 0x48-0x4F */ + 0x637B,0x649A,0x71C3,0x7C98,0x4E43,0x5EFC,0x4E4B,0x57DC,/* 0x50-0x57 */ + 0x56A2,0x60A9,0x6FC3,0x7D0D,0x80FD,0x8133,0x81BF,0x8FB2,/* 0x58-0x5F */ + 0x8997,0x86A4,0x5DF4,0x628A,0x64AD,0x8987,0x6777,0x6CE2,/* 0x60-0x67 */ + 0x6D3E,0x7436,0x7834,0x5A46,0x7F75,0x82AD,0x99AC,0x4FF3,/* 0x68-0x6F */ + 0x5EC3,0x62DD,0x6392,0x6557,0x676F,0x76C3,0x724C,0x80CC,/* 0x70-0x77 */ + 0x80BA,0x8F29,0x914D,0x500D,0x57F9,0x5A92,0x6885,0x0000,/* 0x78-0x7F */ + + 0x6973,0x7164,0x72FD,0x8CB7,0x58F2,0x8CE0,0x966A,0x9019,/* 0x80-0x87 */ + 0x877F,0x79E4,0x77E7,0x8429,0x4F2F,0x5265,0x535A,0x62CD,/* 0x88-0x8F */ + 0x67CF,0x6CCA,0x767D,0x7B94,0x7C95,0x8236,0x8584,0x8FEB,/* 0x90-0x97 */ + 0x66DD,0x6F20,0x7206,0x7E1B,0x83AB,0x99C1,0x9EA6,0x51FD,/* 0x98-0x9F */ + 0x7BB1,0x7872,0x7BB8,0x8087,0x7B48,0x6AE8,0x5E61,0x808C,/* 0xA0-0xA7 */ + 0x7551,0x7560,0x516B,0x9262,0x6E8C,0x767A,0x9197,0x9AEA,/* 0xA8-0xAF */ + 0x4F10,0x7F70,0x629C,0x7B4F,0x95A5,0x9CE9,0x567A,0x5859,/* 0xB0-0xB7 */ + 0x86E4,0x96BC,0x4F34,0x5224,0x534A,0x53CD,0x53DB,0x5E06,/* 0xB8-0xBF */ + 0x642C,0x6591,0x677F,0x6C3E,0x6C4E,0x7248,0x72AF,0x73ED,/* 0xC0-0xC7 */ + 0x7554,0x7E41,0x822C,0x85E9,0x8CA9,0x7BC4,0x91C6,0x7169,/* 0xC8-0xCF */ + 0x9812,0x98EF,0x633D,0x6669,0x756A,0x76E4,0x78D0,0x8543,/* 0xD0-0xD7 */ + 0x86EE,0x532A,0x5351,0x5426,0x5983,0x5E87,0x5F7C,0x60B2,/* 0xD8-0xDF */ + 0x6249,0x6279,0x62AB,0x6590,0x6BD4,0x6CCC,0x75B2,0x76AE,/* 0xE0-0xE7 */ + 0x7891,0x79D8,0x7DCB,0x7F77,0x80A5,0x88AB,0x8AB9,0x8CBB,/* 0xE8-0xEF */ + 0x907F,0x975E,0x98DB,0x6A0B,0x7C38,0x5099,0x5C3E,0x5FAE,/* 0xF0-0xF7 */ + 0x6787,0x6BD8,0x7435,0x7709,0x7F8E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9F3B,0x67CA,0x7A17,0x5339,0x758B,0x9AED,0x5F66,0x819D,/* 0x40-0x47 */ + 0x83F1,0x8098,0x5F3C,0x5FC5,0x7562,0x7B46,0x903C,0x6867,/* 0x48-0x4F */ + 0x59EB,0x5A9B,0x7D10,0x767E,0x8B2C,0x4FF5,0x5F6A,0x6A19,/* 0x50-0x57 */ + 0x6C37,0x6F02,0x74E2,0x7968,0x8868,0x8A55,0x8C79,0x5EDF,/* 0x58-0x5F */ + 0x63CF,0x75C5,0x79D2,0x82D7,0x9328,0x92F2,0x849C,0x86ED,/* 0x60-0x67 */ + 0x9C2D,0x54C1,0x5F6C,0x658C,0x6D5C,0x7015,0x8CA7,0x8CD3,/* 0x68-0x6F */ + 0x983B,0x654F,0x74F6,0x4E0D,0x4ED8,0x57E0,0x592B,0x5A66,/* 0x70-0x77 */ + 0x5BCC,0x51A8,0x5E03,0x5E9C,0x6016,0x6276,0x6577,0x0000,/* 0x78-0x7F */ + + 0x65A7,0x666E,0x6D6E,0x7236,0x7B26,0x8150,0x819A,0x8299,/* 0x80-0x87 */ + 0x8B5C,0x8CA0,0x8CE6,0x8D74,0x961C,0x9644,0x4FAE,0x64AB,/* 0x88-0x8F */ + 0x6B66,0x821E,0x8461,0x856A,0x90E8,0x5C01,0x6953,0x98A8,/* 0x90-0x97 */ + 0x847A,0x8557,0x4F0F,0x526F,0x5FA9,0x5E45,0x670D,0x798F,/* 0x98-0x9F */ + 0x8179,0x8907,0x8986,0x6DF5,0x5F17,0x6255,0x6CB8,0x4ECF,/* 0xA0-0xA7 */ + 0x7269,0x9B92,0x5206,0x543B,0x5674,0x58B3,0x61A4,0x626E,/* 0xA8-0xAF */ + 0x711A,0x596E,0x7C89,0x7CDE,0x7D1B,0x96F0,0x6587,0x805E,/* 0xB0-0xB7 */ + 0x4E19,0x4F75,0x5175,0x5840,0x5E63,0x5E73,0x5F0A,0x67C4,/* 0xB8-0xBF */ + 0x4E26,0x853D,0x9589,0x965B,0x7C73,0x9801,0x50FB,0x58C1,/* 0xC0-0xC7 */ + 0x7656,0x78A7,0x5225,0x77A5,0x8511,0x7B86,0x504F,0x5909,/* 0xC8-0xCF */ + 0x7247,0x7BC7,0x7DE8,0x8FBA,0x8FD4,0x904D,0x4FBF,0x52C9,/* 0xD0-0xD7 */ + 0x5A29,0x5F01,0x97AD,0x4FDD,0x8217,0x92EA,0x5703,0x6355,/* 0xD8-0xDF */ + 0x6B69,0x752B,0x88DC,0x8F14,0x7A42,0x52DF,0x5893,0x6155,/* 0xE0-0xE7 */ + 0x620A,0x66AE,0x6BCD,0x7C3F,0x83E9,0x5023,0x4FF8,0x5305,/* 0xE8-0xEF */ + 0x5446,0x5831,0x5949,0x5B9D,0x5CF0,0x5CEF,0x5D29,0x5E96,/* 0xF0-0xF7 */ + 0x62B1,0x6367,0x653E,0x65B9,0x670B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6CD5,0x6CE1,0x70F9,0x7832,0x7E2B,0x80DE,0x82B3,0x840C,/* 0x40-0x47 */ + 0x84EC,0x8702,0x8912,0x8A2A,0x8C4A,0x90A6,0x92D2,0x98FD,/* 0x48-0x4F */ + 0x9CF3,0x9D6C,0x4E4F,0x4EA1,0x508D,0x5256,0x574A,0x59A8,/* 0x50-0x57 */ + 0x5E3D,0x5FD8,0x5FD9,0x623F,0x66B4,0x671B,0x67D0,0x68D2,/* 0x58-0x5F */ + 0x5192,0x7D21,0x80AA,0x81A8,0x8B00,0x8C8C,0x8CBF,0x927E,/* 0x60-0x67 */ + 0x9632,0x5420,0x982C,0x5317,0x50D5,0x535C,0x58A8,0x64B2,/* 0x68-0x6F */ + 0x6734,0x7267,0x7766,0x7A46,0x91E6,0x52C3,0x6CA1,0x6B86,/* 0x70-0x77 */ + 0x5800,0x5E4C,0x5954,0x672C,0x7FFB,0x51E1,0x76C6,0x0000,/* 0x78-0x7F */ + + 0x6469,0x78E8,0x9B54,0x9EBB,0x57CB,0x59B9,0x6627,0x679A,/* 0x80-0x87 */ + 0x6BCE,0x54E9,0x69D9,0x5E55,0x819C,0x6795,0x9BAA,0x67FE,/* 0x88-0x8F */ + 
0x9C52,0x685D,0x4EA6,0x4FE3,0x53C8,0x62B9,0x672B,0x6CAB,/* 0x90-0x97 */ + 0x8FC4,0x4FAD,0x7E6D,0x9EBF,0x4E07,0x6162,0x6E80,0x6F2B,/* 0x98-0x9F */ + 0x8513,0x5473,0x672A,0x9B45,0x5DF3,0x7B95,0x5CAC,0x5BC6,/* 0xA0-0xA7 */ + 0x871C,0x6E4A,0x84D1,0x7A14,0x8108,0x5999,0x7C8D,0x6C11,/* 0xA8-0xAF */ + 0x7720,0x52D9,0x5922,0x7121,0x725F,0x77DB,0x9727,0x9D61,/* 0xB0-0xB7 */ + 0x690B,0x5A7F,0x5A18,0x51A5,0x540D,0x547D,0x660E,0x76DF,/* 0xB8-0xBF */ + 0x8FF7,0x9298,0x9CF4,0x59EA,0x725D,0x6EC5,0x514D,0x68C9,/* 0xC0-0xC7 */ + 0x7DBF,0x7DEC,0x9762,0x9EBA,0x6478,0x6A21,0x8302,0x5984,/* 0xC8-0xCF */ + 0x5B5F,0x6BDB,0x731B,0x76F2,0x7DB2,0x8017,0x8499,0x5132,/* 0xD0-0xD7 */ + 0x6728,0x9ED9,0x76EE,0x6762,0x52FF,0x9905,0x5C24,0x623B,/* 0xD8-0xDF */ + 0x7C7E,0x8CB0,0x554F,0x60B6,0x7D0B,0x9580,0x5301,0x4E5F,/* 0xE0-0xE7 */ + 0x51B6,0x591C,0x723A,0x8036,0x91CE,0x5F25,0x77E2,0x5384,/* 0xE8-0xEF */ + 0x5F79,0x7D04,0x85AC,0x8A33,0x8E8D,0x9756,0x67F3,0x85AE,/* 0xF0-0xF7 */ + 0x9453,0x6109,0x6108,0x6CB9,0x7652,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8AED,0x8F38,0x552F,0x4F51,0x512A,0x52C7,0x53CB,0x5BA5,/* 0x40-0x47 */ + 0x5E7D,0x60A0,0x6182,0x63D6,0x6709,0x67DA,0x6E67,0x6D8C,/* 0x48-0x4F */ + 0x7336,0x7337,0x7531,0x7950,0x88D5,0x8A98,0x904A,0x9091,/* 0x50-0x57 */ + 0x90F5,0x96C4,0x878D,0x5915,0x4E88,0x4F59,0x4E0E,0x8A89,/* 0x58-0x5F */ + 0x8F3F,0x9810,0x50AD,0x5E7C,0x5996,0x5BB9,0x5EB8,0x63DA,/* 0x60-0x67 */ + 0x63FA,0x64C1,0x66DC,0x694A,0x69D8,0x6D0B,0x6EB6,0x7194,/* 0x68-0x6F */ + 0x7528,0x7AAF,0x7F8A,0x8000,0x8449,0x84C9,0x8981,0x8B21,/* 0x70-0x77 */ + 0x8E0A,0x9065,0x967D,0x990A,0x617E,0x6291,0x6B32,0x0000,/* 0x78-0x7F */ + + 0x6C83,0x6D74,0x7FCC,0x7FFC,0x6DC0,0x7F85,0x87BA,0x88F8,/* 0x80-0x87 */ + 0x6765,0x83B1,0x983C,0x96F7,0x6D1B,0x7D61,0x843D,0x916A,/* 0x88-0x8F */ + 0x4E71,0x5375,0x5D50,0x6B04,0x6FEB,0x85CD,0x862D,0x89A7,/* 0x90-0x97 */ + 0x5229,0x540F,0x5C65,0x674E,0x68A8,0x7406,0x7483,0x75E2,/* 0x98-0x9F */ + 0x88CF,0x88E1,0x91CC,0x96E2,0x9678,0x5F8B,0x7387,0x7ACB,/* 0xA0-0xA7 */ + 0x844E,0x63A0,0x7565,0x5289,0x6D41,0x6E9C,0x7409,0x7559,/* 0xA8-0xAF */ + 0x786B,0x7C92,0x9686,0x7ADC,0x9F8D,0x4FB6,0x616E,0x65C5,/* 0xB0-0xB7 */ + 0x865C,0x4E86,0x4EAE,0x50DA,0x4E21,0x51CC,0x5BEE,0x6599,/* 0xB8-0xBF */ + 0x6881,0x6DBC,0x731F,0x7642,0x77AD,0x7A1C,0x7CE7,0x826F,/* 0xC0-0xC7 */ + 0x8AD2,0x907C,0x91CF,0x9675,0x9818,0x529B,0x7DD1,0x502B,/* 0xC8-0xCF */ + 0x5398,0x6797,0x6DCB,0x71D0,0x7433,0x81E8,0x8F2A,0x96A3,/* 0xD0-0xD7 */ + 0x9C57,0x9E9F,0x7460,0x5841,0x6D99,0x7D2F,0x985E,0x4EE4,/* 0xD8-0xDF */ + 0x4F36,0x4F8B,0x51B7,0x52B1,0x5DBA,0x601C,0x73B2,0x793C,/* 0xE0-0xE7 */ + 0x82D3,0x9234,0x96B7,0x96F6,0x970A,0x9E97,0x9F62,0x66A6,/* 0xE8-0xEF */ + 0x6B74,0x5217,0x52A3,0x70C8,0x88C2,0x5EC9,0x604B,0x6190,/* 0xF0-0xF7 */ + 0x6F23,0x7149,0x7C3E,0x7DF4,0x806F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x84EE,0x9023,0x932C,0x5442,0x9B6F,0x6AD3,0x7089,0x8CC2,/* 0x40-0x47 */ + 0x8DEF,0x9732,0x52B4,0x5A41,0x5ECA,0x5F04,0x6717,0x697C,/* 0x48-0x4F */ + 0x6994,0x6D6A,0x6F0F,0x7262,0x72FC,0x7BED,0x8001,0x807E,/* 0x50-0x57 */ + 0x874B,0x90CE,0x516D,0x9E93,0x7984,0x808B,0x9332,0x8AD6,/* 0x58-0x5F */ + 0x502D,0x548C,0x8A71,0x6B6A,0x8CC4,0x8107,0x60D1,0x67A0,/* 0x60-0x67 */ + 0x9DF2,0x4E99,0x4E98,0x9C10,0x8A6B,0x85C1,0x8568,0x6900,/* 0x68-0x6F */ + 0x6E7E,0x7897,0x8155,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x5F0C,/* 0x98-0x9F */ + 0x4E10,0x4E15,0x4E2A,0x4E31,0x4E36,0x4E3C,0x4E3F,0x4E42,/* 0xA0-0xA7 */ + 0x4E56,0x4E58,0x4E82,0x4E85,0x8C6B,0x4E8A,0x8212,0x5F0D,/* 0xA8-0xAF */ + 0x4E8E,0x4E9E,0x4E9F,0x4EA0,0x4EA2,0x4EB0,0x4EB3,0x4EB6,/* 0xB0-0xB7 */ + 0x4ECE,0x4ECD,0x4EC4,0x4EC6,0x4EC2,0x4ED7,0x4EDE,0x4EED,/* 0xB8-0xBF */ + 0x4EDF,0x4EF7,0x4F09,0x4F5A,0x4F30,0x4F5B,0x4F5D,0x4F57,/* 0xC0-0xC7 */ + 0x4F47,0x4F76,0x4F88,0x4F8F,0x4F98,0x4F7B,0x4F69,0x4F70,/* 0xC8-0xCF */ + 0x4F91,0x4F6F,0x4F86,0x4F96,0x5118,0x4FD4,0x4FDF,0x4FCE,/* 0xD0-0xD7 */ + 0x4FD8,0x4FDB,0x4FD1,0x4FDA,0x4FD0,0x4FE4,0x4FE5,0x501A,/* 0xD8-0xDF */ + 0x5028,0x5014,0x502A,0x5025,0x5005,0x4F1C,0x4FF6,0x5021,/* 0xE0-0xE7 */ + 0x5029,0x502C,0x4FFE,0x4FEF,0x5011,0x5006,0x5043,0x5047,/* 0xE8-0xEF */ + 0x6703,0x5055,0x5050,0x5048,0x505A,0x5056,0x506C,0x5078,/* 0xF0-0xF7 */ + 0x5080,0x509A,0x5085,0x50B4,0x50B2,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x50C9,0x50CA,0x50B3,0x50C2,0x50D6,0x50DE,0x50E5,0x50ED,/* 0x40-0x47 */ + 0x50E3,0x50EE,0x50F9,0x50F5,0x5109,0x5101,0x5102,0x5116,/* 0x48-0x4F */ + 0x5115,0x5114,0x511A,0x5121,0x513A,0x5137,0x513C,0x513B,/* 0x50-0x57 */ + 0x513F,0x5140,0x5152,0x514C,0x5154,0x5162,0x7AF8,0x5169,/* 0x58-0x5F */ + 0x516A,0x516E,0x5180,0x5182,0x56D8,0x518C,0x5189,0x518F,/* 0x60-0x67 */ + 0x5191,0x5193,0x5195,0x5196,0x51A4,0x51A6,0x51A2,0x51A9,/* 0x68-0x6F */ + 0x51AA,0x51AB,0x51B3,0x51B1,0x51B2,0x51B0,0x51B5,0x51BD,/* 0x70-0x77 */ + 0x51C5,0x51C9,0x51DB,0x51E0,0x8655,0x51E9,0x51ED,0x0000,/* 0x78-0x7F */ + + 
0x51F0,0x51F5,0x51FE,0x5204,0x520B,0x5214,0x520E,0x5227,/* 0x80-0x87 */ + 0x522A,0x522E,0x5233,0x5239,0x524F,0x5244,0x524B,0x524C,/* 0x88-0x8F */ + 0x525E,0x5254,0x526A,0x5274,0x5269,0x5273,0x527F,0x527D,/* 0x90-0x97 */ + 0x528D,0x5294,0x5292,0x5271,0x5288,0x5291,0x8FA8,0x8FA7,/* 0x98-0x9F */ + 0x52AC,0x52AD,0x52BC,0x52B5,0x52C1,0x52CD,0x52D7,0x52DE,/* 0xA0-0xA7 */ + 0x52E3,0x52E6,0x98ED,0x52E0,0x52F3,0x52F5,0x52F8,0x52F9,/* 0xA8-0xAF */ + 0x5306,0x5308,0x7538,0x530D,0x5310,0x530F,0x5315,0x531A,/* 0xB0-0xB7 */ + 0x5323,0x532F,0x5331,0x5333,0x5338,0x5340,0x5346,0x5345,/* 0xB8-0xBF */ + 0x4E17,0x5349,0x534D,0x51D6,0x535E,0x5369,0x536E,0x5918,/* 0xC0-0xC7 */ + 0x537B,0x5377,0x5382,0x5396,0x53A0,0x53A6,0x53A5,0x53AE,/* 0xC8-0xCF */ + 0x53B0,0x53B6,0x53C3,0x7C12,0x96D9,0x53DF,0x66FC,0x71EE,/* 0xD0-0xD7 */ + 0x53EE,0x53E8,0x53ED,0x53FA,0x5401,0x543D,0x5440,0x542C,/* 0xD8-0xDF */ + 0x542D,0x543C,0x542E,0x5436,0x5429,0x541D,0x544E,0x548F,/* 0xE0-0xE7 */ + 0x5475,0x548E,0x545F,0x5471,0x5477,0x5470,0x5492,0x547B,/* 0xE8-0xEF */ + 0x5480,0x5476,0x5484,0x5490,0x5486,0x54C7,0x54A2,0x54B8,/* 0xF0-0xF7 */ + 0x54A5,0x54AC,0x54C4,0x54C8,0x54A8,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54AB,0x54C2,0x54A4,0x54BE,0x54BC,0x54D8,0x54E5,0x54E6,/* 0x40-0x47 */ + 0x550F,0x5514,0x54FD,0x54EE,0x54ED,0x54FA,0x54E2,0x5539,/* 0x48-0x4F */ + 0x5540,0x5563,0x554C,0x552E,0x555C,0x5545,0x5556,0x5557,/* 0x50-0x57 */ + 0x5538,0x5533,0x555D,0x5599,0x5580,0x54AF,0x558A,0x559F,/* 0x58-0x5F */ + 0x557B,0x557E,0x5598,0x559E,0x55AE,0x557C,0x5583,0x55A9,/* 0x60-0x67 */ + 0x5587,0x55A8,0x55DA,0x55C5,0x55DF,0x55C4,0x55DC,0x55E4,/* 0x68-0x6F */ + 0x55D4,0x5614,0x55F7,0x5616,0x55FE,0x55FD,0x561B,0x55F9,/* 0x70-0x77 */ + 0x564E,0x5650,0x71DF,0x5634,0x5636,0x5632,0x5638,0x0000,/* 0x78-0x7F */ + + 0x566B,0x5664,0x562F,0x566C,0x566A,0x5686,0x5680,0x568A,/* 0x80-0x87 */ + 0x56A0,0x5694,0x568F,0x56A5,0x56AE,0x56B6,0x56B4,0x56C2,/* 0x88-0x8F */ + 0x56BC,0x56C1,0x56C3,0x56C0,0x56C8,0x56CE,0x56D1,0x56D3,/* 0x90-0x97 */ + 0x56D7,0x56EE,0x56F9,0x5700,0x56FF,0x5704,0x5709,0x5708,/* 0x98-0x9F */ + 0x570B,0x570D,0x5713,0x5718,0x5716,0x55C7,0x571C,0x5726,/* 0xA0-0xA7 */ + 0x5737,0x5738,0x574E,0x573B,0x5740,0x574F,0x5769,0x57C0,/* 0xA8-0xAF */ + 0x5788,0x5761,0x577F,0x5789,0x5793,0x57A0,0x57B3,0x57A4,/* 0xB0-0xB7 */ + 0x57AA,0x57B0,0x57C3,0x57C6,0x57D4,0x57D2,0x57D3,0x580A,/* 0xB8-0xBF */ + 0x57D6,0x57E3,0x580B,0x5819,0x581D,0x5872,0x5821,0x5862,/* 0xC0-0xC7 */ + 0x584B,0x5870,0x6BC0,0x5852,0x583D,0x5879,0x5885,0x58B9,/* 0xC8-0xCF */ + 0x589F,0x58AB,0x58BA,0x58DE,0x58BB,0x58B8,0x58AE,0x58C5,/* 0xD0-0xD7 */ + 0x58D3,0x58D1,0x58D7,0x58D9,0x58D8,0x58E5,0x58DC,0x58E4,/* 0xD8-0xDF */ + 0x58DF,0x58EF,0x58FA,0x58F9,0x58FB,0x58FC,0x58FD,0x5902,/* 0xE0-0xE7 */ + 0x590A,0x5910,0x591B,0x68A6,0x5925,0x592C,0x592D,0x5932,/* 0xE8-0xEF */ + 0x5938,0x593E,0x7AD2,0x5955,0x5950,0x594E,0x595A,0x5958,/* 0xF0-0xF7 */ + 
0x5962,0x5960,0x5967,0x596C,0x5969,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5978,0x5981,0x599D,0x4F5E,0x4FAB,0x59A3,0x59B2,0x59C6,/* 0x40-0x47 */ + 0x59E8,0x59DC,0x598D,0x59D9,0x59DA,0x5A25,0x5A1F,0x5A11,/* 0x48-0x4F */ + 0x5A1C,0x5A09,0x5A1A,0x5A40,0x5A6C,0x5A49,0x5A35,0x5A36,/* 0x50-0x57 */ + 0x5A62,0x5A6A,0x5A9A,0x5ABC,0x5ABE,0x5ACB,0x5AC2,0x5ABD,/* 0x58-0x5F */ + 0x5AE3,0x5AD7,0x5AE6,0x5AE9,0x5AD6,0x5AFA,0x5AFB,0x5B0C,/* 0x60-0x67 */ + 0x5B0B,0x5B16,0x5B32,0x5AD0,0x5B2A,0x5B36,0x5B3E,0x5B43,/* 0x68-0x6F */ + 0x5B45,0x5B40,0x5B51,0x5B55,0x5B5A,0x5B5B,0x5B65,0x5B69,/* 0x70-0x77 */ + 0x5B70,0x5B73,0x5B75,0x5B78,0x6588,0x5B7A,0x5B80,0x0000,/* 0x78-0x7F */ + + 0x5B83,0x5BA6,0x5BB8,0x5BC3,0x5BC7,0x5BC9,0x5BD4,0x5BD0,/* 0x80-0x87 */ + 0x5BE4,0x5BE6,0x5BE2,0x5BDE,0x5BE5,0x5BEB,0x5BF0,0x5BF6,/* 0x88-0x8F */ + 0x5BF3,0x5C05,0x5C07,0x5C08,0x5C0D,0x5C13,0x5C20,0x5C22,/* 0x90-0x97 */ + 0x5C28,0x5C38,0x5C39,0x5C41,0x5C46,0x5C4E,0x5C53,0x5C50,/* 0x98-0x9F */ + 0x5C4F,0x5B71,0x5C6C,0x5C6E,0x4E62,0x5C76,0x5C79,0x5C8C,/* 0xA0-0xA7 */ + 0x5C91,0x5C94,0x599B,0x5CAB,0x5CBB,0x5CB6,0x5CBC,0x5CB7,/* 0xA8-0xAF */ + 0x5CC5,0x5CBE,0x5CC7,0x5CD9,0x5CE9,0x5CFD,0x5CFA,0x5CED,/* 0xB0-0xB7 */ + 0x5D8C,0x5CEA,0x5D0B,0x5D15,0x5D17,0x5D5C,0x5D1F,0x5D1B,/* 0xB8-0xBF */ + 0x5D11,0x5D14,0x5D22,0x5D1A,0x5D19,0x5D18,0x5D4C,0x5D52,/* 0xC0-0xC7 */ + 0x5D4E,0x5D4B,0x5D6C,0x5D73,0x5D76,0x5D87,0x5D84,0x5D82,/* 0xC8-0xCF */ + 0x5DA2,0x5D9D,0x5DAC,0x5DAE,0x5DBD,0x5D90,0x5DB7,0x5DBC,/* 0xD0-0xD7 */ + 0x5DC9,0x5DCD,0x5DD3,0x5DD2,0x5DD6,0x5DDB,0x5DEB,0x5DF2,/* 0xD8-0xDF */ + 0x5DF5,0x5E0B,0x5E1A,0x5E19,0x5E11,0x5E1B,0x5E36,0x5E37,/* 0xE0-0xE7 */ + 0x5E44,0x5E43,0x5E40,0x5E4E,0x5E57,0x5E54,0x5E5F,0x5E62,/* 0xE8-0xEF */ + 0x5E64,0x5E47,0x5E75,0x5E76,0x5E7A,0x9EBC,0x5E7F,0x5EA0,/* 0xF0-0xF7 */ + 0x5EC1,0x5EC2,0x5EC8,0x5ED0,0x5ECF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5ED6,0x5EE3,0x5EDD,0x5EDA,0x5EDB,0x5EE2,0x5EE1,0x5EE8,/* 0x40-0x47 */ + 0x5EE9,0x5EEC,0x5EF1,0x5EF3,0x5EF0,0x5EF4,0x5EF8,0x5EFE,/* 0x48-0x4F */ + 0x5F03,0x5F09,0x5F5D,0x5F5C,0x5F0B,0x5F11,0x5F16,0x5F29,/* 0x50-0x57 */ + 0x5F2D,0x5F38,0x5F41,0x5F48,0x5F4C,0x5F4E,0x5F2F,0x5F51,/* 0x58-0x5F */ + 0x5F56,0x5F57,0x5F59,0x5F61,0x5F6D,0x5F73,0x5F77,0x5F83,/* 0x60-0x67 */ + 0x5F82,0x5F7F,0x5F8A,0x5F88,0x5F91,0x5F87,0x5F9E,0x5F99,/* 
0x68-0x6F */ + 0x5F98,0x5FA0,0x5FA8,0x5FAD,0x5FBC,0x5FD6,0x5FFB,0x5FE4,/* 0x70-0x77 */ + 0x5FF8,0x5FF1,0x5FDD,0x60B3,0x5FFF,0x6021,0x6060,0x0000,/* 0x78-0x7F */ + + 0x6019,0x6010,0x6029,0x600E,0x6031,0x601B,0x6015,0x602B,/* 0x80-0x87 */ + 0x6026,0x600F,0x603A,0x605A,0x6041,0x606A,0x6077,0x605F,/* 0x88-0x8F */ + 0x604A,0x6046,0x604D,0x6063,0x6043,0x6064,0x6042,0x606C,/* 0x90-0x97 */ + 0x606B,0x6059,0x6081,0x608D,0x60E7,0x6083,0x609A,0x6084,/* 0x98-0x9F */ + 0x609B,0x6096,0x6097,0x6092,0x60A7,0x608B,0x60E1,0x60B8,/* 0xA0-0xA7 */ + 0x60E0,0x60D3,0x60B4,0x5FF0,0x60BD,0x60C6,0x60B5,0x60D8,/* 0xA8-0xAF */ + 0x614D,0x6115,0x6106,0x60F6,0x60F7,0x6100,0x60F4,0x60FA,/* 0xB0-0xB7 */ + 0x6103,0x6121,0x60FB,0x60F1,0x610D,0x610E,0x6147,0x613E,/* 0xB8-0xBF */ + 0x6128,0x6127,0x614A,0x613F,0x613C,0x612C,0x6134,0x613D,/* 0xC0-0xC7 */ + 0x6142,0x6144,0x6173,0x6177,0x6158,0x6159,0x615A,0x616B,/* 0xC8-0xCF */ + 0x6174,0x616F,0x6165,0x6171,0x615F,0x615D,0x6153,0x6175,/* 0xD0-0xD7 */ + 0x6199,0x6196,0x6187,0x61AC,0x6194,0x619A,0x618A,0x6191,/* 0xD8-0xDF */ + 0x61AB,0x61AE,0x61CC,0x61CA,0x61C9,0x61F7,0x61C8,0x61C3,/* 0xE0-0xE7 */ + 0x61C6,0x61BA,0x61CB,0x7F79,0x61CD,0x61E6,0x61E3,0x61F6,/* 0xE8-0xEF */ + 0x61FA,0x61F4,0x61FF,0x61FD,0x61FC,0x61FE,0x6200,0x6208,/* 0xF0-0xF7 */ + 0x6209,0x620D,0x620C,0x6214,0x621B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x621E,0x6221,0x622A,0x622E,0x6230,0x6232,0x6233,0x6241,/* 0x40-0x47 */ + 0x624E,0x625E,0x6263,0x625B,0x6260,0x6268,0x627C,0x6282,/* 0x48-0x4F */ + 0x6289,0x627E,0x6292,0x6293,0x6296,0x62D4,0x6283,0x6294,/* 0x50-0x57 */ + 0x62D7,0x62D1,0x62BB,0x62CF,0x62FF,0x62C6,0x64D4,0x62C8,/* 0x58-0x5F */ + 0x62DC,0x62CC,0x62CA,0x62C2,0x62C7,0x629B,0x62C9,0x630C,/* 0x60-0x67 */ + 0x62EE,0x62F1,0x6327,0x6302,0x6308,0x62EF,0x62F5,0x6350,/* 0x68-0x6F */ + 0x633E,0x634D,0x641C,0x634F,0x6396,0x638E,0x6380,0x63AB,/* 0x70-0x77 */ + 0x6376,0x63A3,0x638F,0x6389,0x639F,0x63B5,0x636B,0x0000,/* 0x78-0x7F */ + + 0x6369,0x63BE,0x63E9,0x63C0,0x63C6,0x63E3,0x63C9,0x63D2,/* 0x80-0x87 */ + 0x63F6,0x63C4,0x6416,0x6434,0x6406,0x6413,0x6426,0x6436,/* 0x88-0x8F */ + 0x651D,0x6417,0x6428,0x640F,0x6467,0x646F,0x6476,0x644E,/* 0x90-0x97 */ + 0x652A,0x6495,0x6493,0x64A5,0x64A9,0x6488,0x64BC,0x64DA,/* 0x98-0x9F */ + 0x64D2,0x64C5,0x64C7,0x64BB,0x64D8,0x64C2,0x64F1,0x64E7,/* 0xA0-0xA7 */ + 0x8209,0x64E0,0x64E1,0x62AC,0x64E3,0x64EF,0x652C,0x64F6,/* 0xA8-0xAF */ + 0x64F4,0x64F2,0x64FA,0x6500,0x64FD,0x6518,0x651C,0x6505,/* 0xB0-0xB7 */ + 0x6524,0x6523,0x652B,0x6534,0x6535,0x6537,0x6536,0x6538,/* 0xB8-0xBF */ + 0x754B,0x6548,0x6556,0x6555,0x654D,0x6558,0x655E,0x655D,/* 0xC0-0xC7 */ + 0x6572,0x6578,0x6582,0x6583,0x8B8A,0x659B,0x659F,0x65AB,/* 0xC8-0xCF */ + 0x65B7,0x65C3,0x65C6,0x65C1,0x65C4,0x65CC,0x65D2,0x65DB,/* 0xD0-0xD7 */ + 0x65D9,0x65E0,0x65E1,0x65F1,0x6772,0x660A,0x6603,0x65FB,/* 0xD8-0xDF */ + 0x6773,0x6635,0x6636,0x6634,0x661C,0x664F,0x6644,0x6649,/* 0xE0-0xE7 */ + 
0x6641,0x665E,0x665D,0x6664,0x6667,0x6668,0x665F,0x6662,/* 0xE8-0xEF */ + 0x6670,0x6683,0x6688,0x668E,0x6689,0x6684,0x6698,0x669D,/* 0xF0-0xF7 */ + 0x66C1,0x66B9,0x66C9,0x66BE,0x66BC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x66C4,0x66B8,0x66D6,0x66DA,0x66E0,0x663F,0x66E6,0x66E9,/* 0x40-0x47 */ + 0x66F0,0x66F5,0x66F7,0x670F,0x6716,0x671E,0x6726,0x6727,/* 0x48-0x4F */ + 0x9738,0x672E,0x673F,0x6736,0x6741,0x6738,0x6737,0x6746,/* 0x50-0x57 */ + 0x675E,0x6760,0x6759,0x6763,0x6764,0x6789,0x6770,0x67A9,/* 0x58-0x5F */ + 0x677C,0x676A,0x678C,0x678B,0x67A6,0x67A1,0x6785,0x67B7,/* 0x60-0x67 */ + 0x67EF,0x67B4,0x67EC,0x67B3,0x67E9,0x67B8,0x67E4,0x67DE,/* 0x68-0x6F */ + 0x67DD,0x67E2,0x67EE,0x67B9,0x67CE,0x67C6,0x67E7,0x6A9C,/* 0x70-0x77 */ + 0x681E,0x6846,0x6829,0x6840,0x684D,0x6832,0x684E,0x0000,/* 0x78-0x7F */ + + 0x68B3,0x682B,0x6859,0x6863,0x6877,0x687F,0x689F,0x688F,/* 0x80-0x87 */ + 0x68AD,0x6894,0x689D,0x689B,0x6883,0x6AAE,0x68B9,0x6874,/* 0x88-0x8F */ + 0x68B5,0x68A0,0x68BA,0x690F,0x688D,0x687E,0x6901,0x68CA,/* 0x90-0x97 */ + 0x6908,0x68D8,0x6922,0x6926,0x68E1,0x690C,0x68CD,0x68D4,/* 0x98-0x9F */ + 0x68E7,0x68D5,0x6936,0x6912,0x6904,0x68D7,0x68E3,0x6925,/* 0xA0-0xA7 */ + 0x68F9,0x68E0,0x68EF,0x6928,0x692A,0x691A,0x6923,0x6921,/* 0xA8-0xAF */ + 0x68C6,0x6979,0x6977,0x695C,0x6978,0x696B,0x6954,0x697E,/* 0xB0-0xB7 */ + 0x696E,0x6939,0x6974,0x693D,0x6959,0x6930,0x6961,0x695E,/* 0xB8-0xBF */ + 0x695D,0x6981,0x696A,0x69B2,0x69AE,0x69D0,0x69BF,0x69C1,/* 0xC0-0xC7 */ + 0x69D3,0x69BE,0x69CE,0x5BE8,0x69CA,0x69DD,0x69BB,0x69C3,/* 0xC8-0xCF */ + 0x69A7,0x6A2E,0x6991,0x69A0,0x699C,0x6995,0x69B4,0x69DE,/* 0xD0-0xD7 */ + 0x69E8,0x6A02,0x6A1B,0x69FF,0x6B0A,0x69F9,0x69F2,0x69E7,/* 0xD8-0xDF */ + 0x6A05,0x69B1,0x6A1E,0x69ED,0x6A14,0x69EB,0x6A0A,0x6A12,/* 0xE0-0xE7 */ + 0x6AC1,0x6A23,0x6A13,0x6A44,0x6A0C,0x6A72,0x6A36,0x6A78,/* 0xE8-0xEF */ + 0x6A47,0x6A62,0x6A59,0x6A66,0x6A48,0x6A38,0x6A22,0x6A90,/* 0xF0-0xF7 */ + 0x6A8D,0x6AA0,0x6A84,0x6AA2,0x6AA3,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A97,0x8617,0x6ABB,0x6AC3,0x6AC2,0x6AB8,0x6AB3,0x6AAC,/* 0x40-0x47 */ + 0x6ADE,0x6AD1,0x6ADF,0x6AAA,0x6ADA,0x6AEA,0x6AFB,0x6B05,/* 0x48-0x4F */ + 0x8616,0x6AFA,0x6B12,0x6B16,0x9B31,0x6B1F,0x6B38,0x6B37,/* 0x50-0x57 */ + 0x76DC,0x6B39,0x98EE,0x6B47,0x6B43,0x6B49,0x6B50,0x6B59,/* 
0x58-0x5F */ + 0x6B54,0x6B5B,0x6B5F,0x6B61,0x6B78,0x6B79,0x6B7F,0x6B80,/* 0x60-0x67 */ + 0x6B84,0x6B83,0x6B8D,0x6B98,0x6B95,0x6B9E,0x6BA4,0x6BAA,/* 0x68-0x6F */ + 0x6BAB,0x6BAF,0x6BB2,0x6BB1,0x6BB3,0x6BB7,0x6BBC,0x6BC6,/* 0x70-0x77 */ + 0x6BCB,0x6BD3,0x6BDF,0x6BEC,0x6BEB,0x6BF3,0x6BEF,0x0000,/* 0x78-0x7F */ + + 0x9EBE,0x6C08,0x6C13,0x6C14,0x6C1B,0x6C24,0x6C23,0x6C5E,/* 0x80-0x87 */ + 0x6C55,0x6C62,0x6C6A,0x6C82,0x6C8D,0x6C9A,0x6C81,0x6C9B,/* 0x88-0x8F */ + 0x6C7E,0x6C68,0x6C73,0x6C92,0x6C90,0x6CC4,0x6CF1,0x6CD3,/* 0x90-0x97 */ + 0x6CBD,0x6CD7,0x6CC5,0x6CDD,0x6CAE,0x6CB1,0x6CBE,0x6CBA,/* 0x98-0x9F */ + 0x6CDB,0x6CEF,0x6CD9,0x6CEA,0x6D1F,0x884D,0x6D36,0x6D2B,/* 0xA0-0xA7 */ + 0x6D3D,0x6D38,0x6D19,0x6D35,0x6D33,0x6D12,0x6D0C,0x6D63,/* 0xA8-0xAF */ + 0x6D93,0x6D64,0x6D5A,0x6D79,0x6D59,0x6D8E,0x6D95,0x6FE4,/* 0xB0-0xB7 */ + 0x6D85,0x6DF9,0x6E15,0x6E0A,0x6DB5,0x6DC7,0x6DE6,0x6DB8,/* 0xB8-0xBF */ + 0x6DC6,0x6DEC,0x6DDE,0x6DCC,0x6DE8,0x6DD2,0x6DC5,0x6DFA,/* 0xC0-0xC7 */ + 0x6DD9,0x6DE4,0x6DD5,0x6DEA,0x6DEE,0x6E2D,0x6E6E,0x6E2E,/* 0xC8-0xCF */ + 0x6E19,0x6E72,0x6E5F,0x6E3E,0x6E23,0x6E6B,0x6E2B,0x6E76,/* 0xD0-0xD7 */ + 0x6E4D,0x6E1F,0x6E43,0x6E3A,0x6E4E,0x6E24,0x6EFF,0x6E1D,/* 0xD8-0xDF */ + 0x6E38,0x6E82,0x6EAA,0x6E98,0x6EC9,0x6EB7,0x6ED3,0x6EBD,/* 0xE0-0xE7 */ + 0x6EAF,0x6EC4,0x6EB2,0x6ED4,0x6ED5,0x6E8F,0x6EA5,0x6EC2,/* 0xE8-0xEF */ + 0x6E9F,0x6F41,0x6F11,0x704C,0x6EEC,0x6EF8,0x6EFE,0x6F3F,/* 0xF0-0xF7 */ + 0x6EF2,0x6F31,0x6EEF,0x6F32,0x6ECC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6F3E,0x6F13,0x6EF7,0x6F86,0x6F7A,0x6F78,0x6F81,0x6F80,/* 0x40-0x47 */ + 0x6F6F,0x6F5B,0x6FF3,0x6F6D,0x6F82,0x6F7C,0x6F58,0x6F8E,/* 0x48-0x4F */ + 0x6F91,0x6FC2,0x6F66,0x6FB3,0x6FA3,0x6FA1,0x6FA4,0x6FB9,/* 0x50-0x57 */ + 0x6FC6,0x6FAA,0x6FDF,0x6FD5,0x6FEC,0x6FD4,0x6FD8,0x6FF1,/* 0x58-0x5F */ + 0x6FEE,0x6FDB,0x7009,0x700B,0x6FFA,0x7011,0x7001,0x700F,/* 0x60-0x67 */ + 0x6FFE,0x701B,0x701A,0x6F74,0x701D,0x7018,0x701F,0x7030,/* 0x68-0x6F */ + 0x703E,0x7032,0x7051,0x7063,0x7099,0x7092,0x70AF,0x70F1,/* 0x70-0x77 */ + 0x70AC,0x70B8,0x70B3,0x70AE,0x70DF,0x70CB,0x70DD,0x0000,/* 0x78-0x7F */ + + 0x70D9,0x7109,0x70FD,0x711C,0x7119,0x7165,0x7155,0x7188,/* 0x80-0x87 */ + 0x7166,0x7162,0x714C,0x7156,0x716C,0x718F,0x71FB,0x7184,/* 0x88-0x8F */ + 0x7195,0x71A8,0x71AC,0x71D7,0x71B9,0x71BE,0x71D2,0x71C9,/* 0x90-0x97 */ + 0x71D4,0x71CE,0x71E0,0x71EC,0x71E7,0x71F5,0x71FC,0x71F9,/* 0x98-0x9F */ + 0x71FF,0x720D,0x7210,0x721B,0x7228,0x722D,0x722C,0x7230,/* 0xA0-0xA7 */ + 0x7232,0x723B,0x723C,0x723F,0x7240,0x7246,0x724B,0x7258,/* 0xA8-0xAF */ + 0x7274,0x727E,0x7282,0x7281,0x7287,0x7292,0x7296,0x72A2,/* 0xB0-0xB7 */ + 0x72A7,0x72B9,0x72B2,0x72C3,0x72C6,0x72C4,0x72CE,0x72D2,/* 0xB8-0xBF */ + 0x72E2,0x72E0,0x72E1,0x72F9,0x72F7,0x500F,0x7317,0x730A,/* 0xC0-0xC7 */ + 0x731C,0x7316,0x731D,0x7334,0x732F,0x7329,0x7325,0x733E,/* 0xC8-0xCF */ + 0x734E,0x734F,0x9ED8,0x7357,0x736A,0x7368,0x7370,0x7378,/* 0xD0-0xD7 */ + 
0x7375,0x737B,0x737A,0x73C8,0x73B3,0x73CE,0x73BB,0x73C0,/* 0xD8-0xDF */ + 0x73E5,0x73EE,0x73DE,0x74A2,0x7405,0x746F,0x7425,0x73F8,/* 0xE0-0xE7 */ + 0x7432,0x743A,0x7455,0x743F,0x745F,0x7459,0x7441,0x745C,/* 0xE8-0xEF */ + 0x7469,0x7470,0x7463,0x746A,0x7476,0x747E,0x748B,0x749E,/* 0xF0-0xF7 */ + 0x74A7,0x74CA,0x74CF,0x74D4,0x73F1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74E0,0x74E3,0x74E7,0x74E9,0x74EE,0x74F2,0x74F0,0x74F1,/* 0x40-0x47 */ + 0x74F8,0x74F7,0x7504,0x7503,0x7505,0x750C,0x750E,0x750D,/* 0x48-0x4F */ + 0x7515,0x7513,0x751E,0x7526,0x752C,0x753C,0x7544,0x754D,/* 0x50-0x57 */ + 0x754A,0x7549,0x755B,0x7546,0x755A,0x7569,0x7564,0x7567,/* 0x58-0x5F */ + 0x756B,0x756D,0x7578,0x7576,0x7586,0x7587,0x7574,0x758A,/* 0x60-0x67 */ + 0x7589,0x7582,0x7594,0x759A,0x759D,0x75A5,0x75A3,0x75C2,/* 0x68-0x6F */ + 0x75B3,0x75C3,0x75B5,0x75BD,0x75B8,0x75BC,0x75B1,0x75CD,/* 0x70-0x77 */ + 0x75CA,0x75D2,0x75D9,0x75E3,0x75DE,0x75FE,0x75FF,0x0000,/* 0x78-0x7F */ + + 0x75FC,0x7601,0x75F0,0x75FA,0x75F2,0x75F3,0x760B,0x760D,/* 0x80-0x87 */ + 0x7609,0x761F,0x7627,0x7620,0x7621,0x7622,0x7624,0x7634,/* 0x88-0x8F */ + 0x7630,0x763B,0x7647,0x7648,0x7646,0x765C,0x7658,0x7661,/* 0x90-0x97 */ + 0x7662,0x7668,0x7669,0x766A,0x7667,0x766C,0x7670,0x7672,/* 0x98-0x9F */ + 0x7676,0x7678,0x767C,0x7680,0x7683,0x7688,0x768B,0x768E,/* 0xA0-0xA7 */ + 0x7696,0x7693,0x7699,0x769A,0x76B0,0x76B4,0x76B8,0x76B9,/* 0xA8-0xAF */ + 0x76BA,0x76C2,0x76CD,0x76D6,0x76D2,0x76DE,0x76E1,0x76E5,/* 0xB0-0xB7 */ + 0x76E7,0x76EA,0x862F,0x76FB,0x7708,0x7707,0x7704,0x7729,/* 0xB8-0xBF */ + 0x7724,0x771E,0x7725,0x7726,0x771B,0x7737,0x7738,0x7747,/* 0xC0-0xC7 */ + 0x775A,0x7768,0x776B,0x775B,0x7765,0x777F,0x777E,0x7779,/* 0xC8-0xCF */ + 0x778E,0x778B,0x7791,0x77A0,0x779E,0x77B0,0x77B6,0x77B9,/* 0xD0-0xD7 */ + 0x77BF,0x77BC,0x77BD,0x77BB,0x77C7,0x77CD,0x77D7,0x77DA,/* 0xD8-0xDF */ + 0x77DC,0x77E3,0x77EE,0x77FC,0x780C,0x7812,0x7926,0x7820,/* 0xE0-0xE7 */ + 0x792A,0x7845,0x788E,0x7874,0x7886,0x787C,0x789A,0x788C,/* 0xE8-0xEF */ + 0x78A3,0x78B5,0x78AA,0x78AF,0x78D1,0x78C6,0x78CB,0x78D4,/* 0xF0-0xF7 */ + 0x78BE,0x78BC,0x78C5,0x78CA,0x78EC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x78E7,0x78DA,0x78FD,0x78F4,0x7907,0x7912,0x7911,0x7919,/* 0x40-0x47 */ + 0x792C,0x792B,0x7940,0x7960,0x7957,0x795F,0x795A,0x7955,/* 
0x48-0x4F */ + 0x7953,0x797A,0x797F,0x798A,0x799D,0x79A7,0x9F4B,0x79AA,/* 0x50-0x57 */ + 0x79AE,0x79B3,0x79B9,0x79BA,0x79C9,0x79D5,0x79E7,0x79EC,/* 0x58-0x5F */ + 0x79E1,0x79E3,0x7A08,0x7A0D,0x7A18,0x7A19,0x7A20,0x7A1F,/* 0x60-0x67 */ + 0x7980,0x7A31,0x7A3B,0x7A3E,0x7A37,0x7A43,0x7A57,0x7A49,/* 0x68-0x6F */ + 0x7A61,0x7A62,0x7A69,0x9F9D,0x7A70,0x7A79,0x7A7D,0x7A88,/* 0x70-0x77 */ + 0x7A97,0x7A95,0x7A98,0x7A96,0x7AA9,0x7AC8,0x7AB0,0x0000,/* 0x78-0x7F */ + + 0x7AB6,0x7AC5,0x7AC4,0x7ABF,0x9083,0x7AC7,0x7ACA,0x7ACD,/* 0x80-0x87 */ + 0x7ACF,0x7AD5,0x7AD3,0x7AD9,0x7ADA,0x7ADD,0x7AE1,0x7AE2,/* 0x88-0x8F */ + 0x7AE6,0x7AED,0x7AF0,0x7B02,0x7B0F,0x7B0A,0x7B06,0x7B33,/* 0x90-0x97 */ + 0x7B18,0x7B19,0x7B1E,0x7B35,0x7B28,0x7B36,0x7B50,0x7B7A,/* 0x98-0x9F */ + 0x7B04,0x7B4D,0x7B0B,0x7B4C,0x7B45,0x7B75,0x7B65,0x7B74,/* 0xA0-0xA7 */ + 0x7B67,0x7B70,0x7B71,0x7B6C,0x7B6E,0x7B9D,0x7B98,0x7B9F,/* 0xA8-0xAF */ + 0x7B8D,0x7B9C,0x7B9A,0x7B8B,0x7B92,0x7B8F,0x7B5D,0x7B99,/* 0xB0-0xB7 */ + 0x7BCB,0x7BC1,0x7BCC,0x7BCF,0x7BB4,0x7BC6,0x7BDD,0x7BE9,/* 0xB8-0xBF */ + 0x7C11,0x7C14,0x7BE6,0x7BE5,0x7C60,0x7C00,0x7C07,0x7C13,/* 0xC0-0xC7 */ + 0x7BF3,0x7BF7,0x7C17,0x7C0D,0x7BF6,0x7C23,0x7C27,0x7C2A,/* 0xC8-0xCF */ + 0x7C1F,0x7C37,0x7C2B,0x7C3D,0x7C4C,0x7C43,0x7C54,0x7C4F,/* 0xD0-0xD7 */ + 0x7C40,0x7C50,0x7C58,0x7C5F,0x7C64,0x7C56,0x7C65,0x7C6C,/* 0xD8-0xDF */ + 0x7C75,0x7C83,0x7C90,0x7CA4,0x7CAD,0x7CA2,0x7CAB,0x7CA1,/* 0xE0-0xE7 */ + 0x7CA8,0x7CB3,0x7CB2,0x7CB1,0x7CAE,0x7CB9,0x7CBD,0x7CC0,/* 0xE8-0xEF */ + 0x7CC5,0x7CC2,0x7CD8,0x7CD2,0x7CDC,0x7CE2,0x9B3B,0x7CEF,/* 0xF0-0xF7 */ + 0x7CF2,0x7CF4,0x7CF6,0x7CFA,0x7D06,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D02,0x7D1C,0x7D15,0x7D0A,0x7D45,0x7D4B,0x7D2E,0x7D32,/* 0x40-0x47 */ + 0x7D3F,0x7D35,0x7D46,0x7D73,0x7D56,0x7D4E,0x7D72,0x7D68,/* 0x48-0x4F */ + 0x7D6E,0x7D4F,0x7D63,0x7D93,0x7D89,0x7D5B,0x7D8F,0x7D7D,/* 0x50-0x57 */ + 0x7D9B,0x7DBA,0x7DAE,0x7DA3,0x7DB5,0x7DC7,0x7DBD,0x7DAB,/* 0x58-0x5F */ + 0x7E3D,0x7DA2,0x7DAF,0x7DDC,0x7DB8,0x7D9F,0x7DB0,0x7DD8,/* 0x60-0x67 */ + 0x7DDD,0x7DE4,0x7DDE,0x7DFB,0x7DF2,0x7DE1,0x7E05,0x7E0A,/* 0x68-0x6F */ + 0x7E23,0x7E21,0x7E12,0x7E31,0x7E1F,0x7E09,0x7E0B,0x7E22,/* 0x70-0x77 */ + 0x7E46,0x7E66,0x7E3B,0x7E35,0x7E39,0x7E43,0x7E37,0x0000,/* 0x78-0x7F */ + + 0x7E32,0x7E3A,0x7E67,0x7E5D,0x7E56,0x7E5E,0x7E59,0x7E5A,/* 0x80-0x87 */ + 0x7E79,0x7E6A,0x7E69,0x7E7C,0x7E7B,0x7E83,0x7DD5,0x7E7D,/* 0x88-0x8F */ + 0x8FAE,0x7E7F,0x7E88,0x7E89,0x7E8C,0x7E92,0x7E90,0x7E93,/* 0x90-0x97 */ + 0x7E94,0x7E96,0x7E8E,0x7E9B,0x7E9C,0x7F38,0x7F3A,0x7F45,/* 0x98-0x9F */ + 0x7F4C,0x7F4D,0x7F4E,0x7F50,0x7F51,0x7F55,0x7F54,0x7F58,/* 0xA0-0xA7 */ + 0x7F5F,0x7F60,0x7F68,0x7F69,0x7F67,0x7F78,0x7F82,0x7F86,/* 0xA8-0xAF */ + 0x7F83,0x7F88,0x7F87,0x7F8C,0x7F94,0x7F9E,0x7F9D,0x7F9A,/* 0xB0-0xB7 */ + 0x7FA3,0x7FAF,0x7FB2,0x7FB9,0x7FAE,0x7FB6,0x7FB8,0x8B71,/* 0xB8-0xBF */ + 0x7FC5,0x7FC6,0x7FCA,0x7FD5,0x7FD4,0x7FE1,0x7FE6,0x7FE9,/* 0xC0-0xC7 */ + 
0x7FF3,0x7FF9,0x98DC,0x8006,0x8004,0x800B,0x8012,0x8018,/* 0xC8-0xCF */ + 0x8019,0x801C,0x8021,0x8028,0x803F,0x803B,0x804A,0x8046,/* 0xD0-0xD7 */ + 0x8052,0x8058,0x805A,0x805F,0x8062,0x8068,0x8073,0x8072,/* 0xD8-0xDF */ + 0x8070,0x8076,0x8079,0x807D,0x807F,0x8084,0x8086,0x8085,/* 0xE0-0xE7 */ + 0x809B,0x8093,0x809A,0x80AD,0x5190,0x80AC,0x80DB,0x80E5,/* 0xE8-0xEF */ + 0x80D9,0x80DD,0x80C4,0x80DA,0x80D6,0x8109,0x80EF,0x80F1,/* 0xF0-0xF7 */ + 0x811B,0x8129,0x8123,0x812F,0x814B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x968B,0x8146,0x813E,0x8153,0x8151,0x80FC,0x8171,0x816E,/* 0x40-0x47 */ + 0x8165,0x8166,0x8174,0x8183,0x8188,0x818A,0x8180,0x8182,/* 0x48-0x4F */ + 0x81A0,0x8195,0x81A4,0x81A3,0x815F,0x8193,0x81A9,0x81B0,/* 0x50-0x57 */ + 0x81B5,0x81BE,0x81B8,0x81BD,0x81C0,0x81C2,0x81BA,0x81C9,/* 0x58-0x5F */ + 0x81CD,0x81D1,0x81D9,0x81D8,0x81C8,0x81DA,0x81DF,0x81E0,/* 0x60-0x67 */ + 0x81E7,0x81FA,0x81FB,0x81FE,0x8201,0x8202,0x8205,0x8207,/* 0x68-0x6F */ + 0x820A,0x820D,0x8210,0x8216,0x8229,0x822B,0x8238,0x8233,/* 0x70-0x77 */ + 0x8240,0x8259,0x8258,0x825D,0x825A,0x825F,0x8264,0x0000,/* 0x78-0x7F */ + + 0x8262,0x8268,0x826A,0x826B,0x822E,0x8271,0x8277,0x8278,/* 0x80-0x87 */ + 0x827E,0x828D,0x8292,0x82AB,0x829F,0x82BB,0x82AC,0x82E1,/* 0x88-0x8F */ + 0x82E3,0x82DF,0x82D2,0x82F4,0x82F3,0x82FA,0x8393,0x8303,/* 0x90-0x97 */ + 0x82FB,0x82F9,0x82DE,0x8306,0x82DC,0x8309,0x82D9,0x8335,/* 0x98-0x9F */ + 0x8334,0x8316,0x8332,0x8331,0x8340,0x8339,0x8350,0x8345,/* 0xA0-0xA7 */ + 0x832F,0x832B,0x8317,0x8318,0x8385,0x839A,0x83AA,0x839F,/* 0xA8-0xAF */ + 0x83A2,0x8396,0x8323,0x838E,0x8387,0x838A,0x837C,0x83B5,/* 0xB0-0xB7 */ + 0x8373,0x8375,0x83A0,0x8389,0x83A8,0x83F4,0x8413,0x83EB,/* 0xB8-0xBF */ + 0x83CE,0x83FD,0x8403,0x83D8,0x840B,0x83C1,0x83F7,0x8407,/* 0xC0-0xC7 */ + 0x83E0,0x83F2,0x840D,0x8422,0x8420,0x83BD,0x8438,0x8506,/* 0xC8-0xCF */ + 0x83FB,0x846D,0x842A,0x843C,0x855A,0x8484,0x8477,0x846B,/* 0xD0-0xD7 */ + 0x84AD,0x846E,0x8482,0x8469,0x8446,0x842C,0x846F,0x8479,/* 0xD8-0xDF */ + 0x8435,0x84CA,0x8462,0x84B9,0x84BF,0x849F,0x84D9,0x84CD,/* 0xE0-0xE7 */ + 0x84BB,0x84DA,0x84D0,0x84C1,0x84C6,0x84D6,0x84A1,0x8521,/* 0xE8-0xEF */ + 0x84FF,0x84F4,0x8517,0x8518,0x852C,0x851F,0x8515,0x8514,/* 0xF0-0xF7 */ + 0x84FC,0x8540,0x8563,0x8558,0x8548,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x8541,0x8602,0x854B,0x8555,0x8580,0x85A4,0x8588,0x8591,/* 0x40-0x47 */ + 0x858A,0x85A8,0x856D,0x8594,0x859B,0x85EA,0x8587,0x859C,/* 0x48-0x4F */ + 0x8577,0x857E,0x8590,0x85C9,0x85BA,0x85CF,0x85B9,0x85D0,/* 0x50-0x57 */ + 0x85D5,0x85DD,0x85E5,0x85DC,0x85F9,0x860A,0x8613,0x860B,/* 0x58-0x5F */ + 0x85FE,0x85FA,0x8606,0x8622,0x861A,0x8630,0x863F,0x864D,/* 0x60-0x67 */ + 0x4E55,0x8654,0x865F,0x8667,0x8671,0x8693,0x86A3,0x86A9,/* 0x68-0x6F */ + 0x86AA,0x868B,0x868C,0x86B6,0x86AF,0x86C4,0x86C6,0x86B0,/* 0x70-0x77 */ + 0x86C9,0x8823,0x86AB,0x86D4,0x86DE,0x86E9,0x86EC,0x0000,/* 0x78-0x7F */ + + 0x86DF,0x86DB,0x86EF,0x8712,0x8706,0x8708,0x8700,0x8703,/* 0x80-0x87 */ + 0x86FB,0x8711,0x8709,0x870D,0x86F9,0x870A,0x8734,0x873F,/* 0x88-0x8F */ + 0x8737,0x873B,0x8725,0x8729,0x871A,0x8760,0x875F,0x8778,/* 0x90-0x97 */ + 0x874C,0x874E,0x8774,0x8757,0x8768,0x876E,0x8759,0x8753,/* 0x98-0x9F */ + 0x8763,0x876A,0x8805,0x87A2,0x879F,0x8782,0x87AF,0x87CB,/* 0xA0-0xA7 */ + 0x87BD,0x87C0,0x87D0,0x96D6,0x87AB,0x87C4,0x87B3,0x87C7,/* 0xA8-0xAF */ + 0x87C6,0x87BB,0x87EF,0x87F2,0x87E0,0x880F,0x880D,0x87FE,/* 0xB0-0xB7 */ + 0x87F6,0x87F7,0x880E,0x87D2,0x8811,0x8816,0x8815,0x8822,/* 0xB8-0xBF */ + 0x8821,0x8831,0x8836,0x8839,0x8827,0x883B,0x8844,0x8842,/* 0xC0-0xC7 */ + 0x8852,0x8859,0x885E,0x8862,0x886B,0x8881,0x887E,0x889E,/* 0xC8-0xCF */ + 0x8875,0x887D,0x88B5,0x8872,0x8882,0x8897,0x8892,0x88AE,/* 0xD0-0xD7 */ + 0x8899,0x88A2,0x888D,0x88A4,0x88B0,0x88BF,0x88B1,0x88C3,/* 0xD8-0xDF */ + 0x88C4,0x88D4,0x88D8,0x88D9,0x88DD,0x88F9,0x8902,0x88FC,/* 0xE0-0xE7 */ + 0x88F4,0x88E8,0x88F2,0x8904,0x890C,0x890A,0x8913,0x8943,/* 0xE8-0xEF */ + 0x891E,0x8925,0x892A,0x892B,0x8941,0x8944,0x893B,0x8936,/* 0xF0-0xF7 */ + 0x8938,0x894C,0x891D,0x8960,0x895E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8966,0x8964,0x896D,0x896A,0x896F,0x8974,0x8977,0x897E,/* 0x40-0x47 */ + 0x8983,0x8988,0x898A,0x8993,0x8998,0x89A1,0x89A9,0x89A6,/* 0x48-0x4F */ + 0x89AC,0x89AF,0x89B2,0x89BA,0x89BD,0x89BF,0x89C0,0x89DA,/* 0x50-0x57 */ + 0x89DC,0x89DD,0x89E7,0x89F4,0x89F8,0x8A03,0x8A16,0x8A10,/* 0x58-0x5F */ + 0x8A0C,0x8A1B,0x8A1D,0x8A25,0x8A36,0x8A41,0x8A5B,0x8A52,/* 0x60-0x67 */ + 0x8A46,0x8A48,0x8A7C,0x8A6D,0x8A6C,0x8A62,0x8A85,0x8A82,/* 0x68-0x6F */ + 0x8A84,0x8AA8,0x8AA1,0x8A91,0x8AA5,0x8AA6,0x8A9A,0x8AA3,/* 0x70-0x77 */ + 0x8AC4,0x8ACD,0x8AC2,0x8ADA,0x8AEB,0x8AF3,0x8AE7,0x0000,/* 0x78-0x7F */ + + 0x8AE4,0x8AF1,0x8B14,0x8AE0,0x8AE2,0x8AF7,0x8ADE,0x8ADB,/* 0x80-0x87 */ + 0x8B0C,0x8B07,0x8B1A,0x8AE1,0x8B16,0x8B10,0x8B17,0x8B20,/* 0x88-0x8F */ + 0x8B33,0x97AB,0x8B26,0x8B2B,0x8B3E,0x8B28,0x8B41,0x8B4C,/* 0x90-0x97 */ + 0x8B4F,0x8B4E,0x8B49,0x8B56,0x8B5B,0x8B5A,0x8B6B,0x8B5F,/* 0x98-0x9F */ + 0x8B6C,0x8B6F,0x8B74,0x8B7D,0x8B80,0x8B8C,0x8B8E,0x8B92,/* 0xA0-0xA7 */ + 0x8B93,0x8B96,0x8B99,0x8B9A,0x8C3A,0x8C41,0x8C3F,0x8C48,/* 0xA8-0xAF */ + 0x8C4C,0x8C4E,0x8C50,0x8C55,0x8C62,0x8C6C,0x8C78,0x8C7A,/* 0xB0-0xB7 */ + 
0x8C82,0x8C89,0x8C85,0x8C8A,0x8C8D,0x8C8E,0x8C94,0x8C7C,/* 0xB8-0xBF */ + 0x8C98,0x621D,0x8CAD,0x8CAA,0x8CBD,0x8CB2,0x8CB3,0x8CAE,/* 0xC0-0xC7 */ + 0x8CB6,0x8CC8,0x8CC1,0x8CE4,0x8CE3,0x8CDA,0x8CFD,0x8CFA,/* 0xC8-0xCF */ + 0x8CFB,0x8D04,0x8D05,0x8D0A,0x8D07,0x8D0F,0x8D0D,0x8D10,/* 0xD0-0xD7 */ + 0x9F4E,0x8D13,0x8CCD,0x8D14,0x8D16,0x8D67,0x8D6D,0x8D71,/* 0xD8-0xDF */ + 0x8D73,0x8D81,0x8D99,0x8DC2,0x8DBE,0x8DBA,0x8DCF,0x8DDA,/* 0xE0-0xE7 */ + 0x8DD6,0x8DCC,0x8DDB,0x8DCB,0x8DEA,0x8DEB,0x8DDF,0x8DE3,/* 0xE8-0xEF */ + 0x8DFC,0x8E08,0x8E09,0x8DFF,0x8E1D,0x8E1E,0x8E10,0x8E1F,/* 0xF0-0xF7 */ + 0x8E42,0x8E35,0x8E30,0x8E34,0x8E4A,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E47,0x8E49,0x8E4C,0x8E50,0x8E48,0x8E59,0x8E64,0x8E60,/* 0x40-0x47 */ + 0x8E2A,0x8E63,0x8E55,0x8E76,0x8E72,0x8E7C,0x8E81,0x8E87,/* 0x48-0x4F */ + 0x8E85,0x8E84,0x8E8B,0x8E8A,0x8E93,0x8E91,0x8E94,0x8E99,/* 0x50-0x57 */ + 0x8EAA,0x8EA1,0x8EAC,0x8EB0,0x8EC6,0x8EB1,0x8EBE,0x8EC5,/* 0x58-0x5F */ + 0x8EC8,0x8ECB,0x8EDB,0x8EE3,0x8EFC,0x8EFB,0x8EEB,0x8EFE,/* 0x60-0x67 */ + 0x8F0A,0x8F05,0x8F15,0x8F12,0x8F19,0x8F13,0x8F1C,0x8F1F,/* 0x68-0x6F */ + 0x8F1B,0x8F0C,0x8F26,0x8F33,0x8F3B,0x8F39,0x8F45,0x8F42,/* 0x70-0x77 */ + 0x8F3E,0x8F4C,0x8F49,0x8F46,0x8F4E,0x8F57,0x8F5C,0x0000,/* 0x78-0x7F */ + + 0x8F62,0x8F63,0x8F64,0x8F9C,0x8F9F,0x8FA3,0x8FAD,0x8FAF,/* 0x80-0x87 */ + 0x8FB7,0x8FDA,0x8FE5,0x8FE2,0x8FEA,0x8FEF,0x9087,0x8FF4,/* 0x88-0x8F */ + 0x9005,0x8FF9,0x8FFA,0x9011,0x9015,0x9021,0x900D,0x901E,/* 0x90-0x97 */ + 0x9016,0x900B,0x9027,0x9036,0x9035,0x9039,0x8FF8,0x904F,/* 0x98-0x9F */ + 0x9050,0x9051,0x9052,0x900E,0x9049,0x903E,0x9056,0x9058,/* 0xA0-0xA7 */ + 0x905E,0x9068,0x906F,0x9076,0x96A8,0x9072,0x9082,0x907D,/* 0xA8-0xAF */ + 0x9081,0x9080,0x908A,0x9089,0x908F,0x90A8,0x90AF,0x90B1,/* 0xB0-0xB7 */ + 0x90B5,0x90E2,0x90E4,0x6248,0x90DB,0x9102,0x9112,0x9119,/* 0xB8-0xBF */ + 0x9132,0x9130,0x914A,0x9156,0x9158,0x9163,0x9165,0x9169,/* 0xC0-0xC7 */ + 0x9173,0x9172,0x918B,0x9189,0x9182,0x91A2,0x91AB,0x91AF,/* 0xC8-0xCF */ + 0x91AA,0x91B5,0x91B4,0x91BA,0x91C0,0x91C1,0x91C9,0x91CB,/* 0xD0-0xD7 */ + 0x91D0,0x91D6,0x91DF,0x91E1,0x91DB,0x91FC,0x91F5,0x91F6,/* 0xD8-0xDF */ + 0x921E,0x91FF,0x9214,0x922C,0x9215,0x9211,0x925E,0x9257,/* 0xE0-0xE7 */ + 0x9245,0x9249,0x9264,0x9248,0x9295,0x923F,0x924B,0x9250,/* 0xE8-0xEF */ + 0x929C,0x9296,0x9293,0x929B,0x925A,0x92CF,0x92B9,0x92B7,/* 0xF0-0xF7 */ + 0x92E9,0x930F,0x92FA,0x9344,0x932E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9319,0x9322,0x931A,0x9323,0x933A,0x9335,0x933B,0x935C,/* 0x40-0x47 */ + 0x9360,0x937C,0x936E,0x9356,0x93B0,0x93AC,0x93AD,0x9394,/* 0x48-0x4F */ + 0x93B9,0x93D6,0x93D7,0x93E8,0x93E5,0x93D8,0x93C3,0x93DD,/* 0x50-0x57 */ + 0x93D0,0x93C8,0x93E4,0x941A,0x9414,0x9413,0x9403,0x9407,/* 0x58-0x5F */ + 0x9410,0x9436,0x942B,0x9435,0x9421,0x943A,0x9441,0x9452,/* 0x60-0x67 */ + 0x9444,0x945B,0x9460,0x9462,0x945E,0x946A,0x9229,0x9470,/* 0x68-0x6F */ + 0x9475,0x9477,0x947D,0x945A,0x947C,0x947E,0x9481,0x947F,/* 0x70-0x77 */ + 0x9582,0x9587,0x958A,0x9594,0x9596,0x9598,0x9599,0x0000,/* 0x78-0x7F */ + + 0x95A0,0x95A8,0x95A7,0x95AD,0x95BC,0x95BB,0x95B9,0x95BE,/* 0x80-0x87 */ + 0x95CA,0x6FF6,0x95C3,0x95CD,0x95CC,0x95D5,0x95D4,0x95D6,/* 0x88-0x8F */ + 0x95DC,0x95E1,0x95E5,0x95E2,0x9621,0x9628,0x962E,0x962F,/* 0x90-0x97 */ + 0x9642,0x964C,0x964F,0x964B,0x9677,0x965C,0x965E,0x965D,/* 0x98-0x9F */ + 0x965F,0x9666,0x9672,0x966C,0x968D,0x9698,0x9695,0x9697,/* 0xA0-0xA7 */ + 0x96AA,0x96A7,0x96B1,0x96B2,0x96B0,0x96B4,0x96B6,0x96B8,/* 0xA8-0xAF */ + 0x96B9,0x96CE,0x96CB,0x96C9,0x96CD,0x894D,0x96DC,0x970D,/* 0xB0-0xB7 */ + 0x96D5,0x96F9,0x9704,0x9706,0x9708,0x9713,0x970E,0x9711,/* 0xB8-0xBF */ + 0x970F,0x9716,0x9719,0x9724,0x972A,0x9730,0x9739,0x973D,/* 0xC0-0xC7 */ + 0x973E,0x9744,0x9746,0x9748,0x9742,0x9749,0x975C,0x9760,/* 0xC8-0xCF */ + 0x9764,0x9766,0x9768,0x52D2,0x976B,0x9771,0x9779,0x9785,/* 0xD0-0xD7 */ + 0x977C,0x9781,0x977A,0x9786,0x978B,0x978F,0x9790,0x979C,/* 0xD8-0xDF */ + 0x97A8,0x97A6,0x97A3,0x97B3,0x97B4,0x97C3,0x97C6,0x97C8,/* 0xE0-0xE7 */ + 0x97CB,0x97DC,0x97ED,0x9F4F,0x97F2,0x7ADF,0x97F6,0x97F5,/* 0xE8-0xEF */ + 0x980F,0x980C,0x9838,0x9824,0x9821,0x9837,0x983D,0x9846,/* 0xF0-0xF7 */ + 0x984F,0x984B,0x986B,0x986F,0x9870,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9871,0x9874,0x9873,0x98AA,0x98AF,0x98B1,0x98B6,0x98C4,/* 0x40-0x47 */ + 0x98C3,0x98C6,0x98E9,0x98EB,0x9903,0x9909,0x9912,0x9914,/* 0x48-0x4F */ + 0x9918,0x9921,0x991D,0x991E,0x9924,0x9920,0x992C,0x992E,/* 0x50-0x57 */ + 0x993D,0x993E,0x9942,0x9949,0x9945,0x9950,0x994B,0x9951,/* 0x58-0x5F */ + 0x9952,0x994C,0x9955,0x9997,0x9998,0x99A5,0x99AD,0x99AE,/* 0x60-0x67 */ + 0x99BC,0x99DF,0x99DB,0x99DD,0x99D8,0x99D1,0x99ED,0x99EE,/* 0x68-0x6F */ + 0x99F1,0x99F2,0x99FB,0x99F8,0x9A01,0x9A0F,0x9A05,0x99E2,/* 0x70-0x77 */ + 0x9A19,0x9A2B,0x9A37,0x9A45,0x9A42,0x9A40,0x9A43,0x0000,/* 0x78-0x7F */ + + 0x9A3E,0x9A55,0x9A4D,0x9A5B,0x9A57,0x9A5F,0x9A62,0x9A65,/* 0x80-0x87 */ + 0x9A64,0x9A69,0x9A6B,0x9A6A,0x9AAD,0x9AB0,0x9ABC,0x9AC0,/* 0x88-0x8F */ + 0x9ACF,0x9AD1,0x9AD3,0x9AD4,0x9ADE,0x9ADF,0x9AE2,0x9AE3,/* 0x90-0x97 */ + 0x9AE6,0x9AEF,0x9AEB,0x9AEE,0x9AF4,0x9AF1,0x9AF7,0x9AFB,/* 0x98-0x9F */ + 0x9B06,0x9B18,0x9B1A,0x9B1F,0x9B22,0x9B23,0x9B25,0x9B27,/* 0xA0-0xA7 */ + 
0x9B28,0x9B29,0x9B2A,0x9B2E,0x9B2F,0x9B32,0x9B44,0x9B43,/* 0xA8-0xAF */ + 0x9B4F,0x9B4D,0x9B4E,0x9B51,0x9B58,0x9B74,0x9B93,0x9B83,/* 0xB0-0xB7 */ + 0x9B91,0x9B96,0x9B97,0x9B9F,0x9BA0,0x9BA8,0x9BB4,0x9BC0,/* 0xB8-0xBF */ + 0x9BCA,0x9BB9,0x9BC6,0x9BCF,0x9BD1,0x9BD2,0x9BE3,0x9BE2,/* 0xC0-0xC7 */ + 0x9BE4,0x9BD4,0x9BE1,0x9C3A,0x9BF2,0x9BF1,0x9BF0,0x9C15,/* 0xC8-0xCF */ + 0x9C14,0x9C09,0x9C13,0x9C0C,0x9C06,0x9C08,0x9C12,0x9C0A,/* 0xD0-0xD7 */ + 0x9C04,0x9C2E,0x9C1B,0x9C25,0x9C24,0x9C21,0x9C30,0x9C47,/* 0xD8-0xDF */ + 0x9C32,0x9C46,0x9C3E,0x9C5A,0x9C60,0x9C67,0x9C76,0x9C78,/* 0xE0-0xE7 */ + 0x9CE7,0x9CEC,0x9CF0,0x9D09,0x9D08,0x9CEB,0x9D03,0x9D06,/* 0xE8-0xEF */ + 0x9D2A,0x9D26,0x9DAF,0x9D23,0x9D1F,0x9D44,0x9D15,0x9D12,/* 0xF0-0xF7 */ + 0x9D41,0x9D3F,0x9D3E,0x9D46,0x9D48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9D5D,0x9D5E,0x9D64,0x9D51,0x9D50,0x9D59,0x9D72,0x9D89,/* 0x40-0x47 */ + 0x9D87,0x9DAB,0x9D6F,0x9D7A,0x9D9A,0x9DA4,0x9DA9,0x9DB2,/* 0x48-0x4F */ + 0x9DC4,0x9DC1,0x9DBB,0x9DB8,0x9DBA,0x9DC6,0x9DCF,0x9DC2,/* 0x50-0x57 */ + 0x9DD9,0x9DD3,0x9DF8,0x9DE6,0x9DED,0x9DEF,0x9DFD,0x9E1A,/* 0x58-0x5F */ + 0x9E1B,0x9E1E,0x9E75,0x9E79,0x9E7D,0x9E81,0x9E88,0x9E8B,/* 0x60-0x67 */ + 0x9E8C,0x9E92,0x9E95,0x9E91,0x9E9D,0x9EA5,0x9EA9,0x9EB8,/* 0x68-0x6F */ + 0x9EAA,0x9EAD,0x9761,0x9ECC,0x9ECE,0x9ECF,0x9ED0,0x9ED4,/* 0x70-0x77 */ + 0x9EDC,0x9EDE,0x9EDD,0x9EE0,0x9EE5,0x9EE8,0x9EEF,0x0000,/* 0x78-0x7F */ + + 0x9EF4,0x9EF6,0x9EF7,0x9EF9,0x9EFB,0x9EFC,0x9EFD,0x9F07,/* 0x80-0x87 */ + 0x9F08,0x76B7,0x9F15,0x9F21,0x9F2C,0x9F3E,0x9F4A,0x9F52,/* 0x88-0x8F */ + 0x9F54,0x9F63,0x9F5F,0x9F60,0x9F61,0x9F66,0x9F67,0x9F6C,/* 0x90-0x97 */ + 0x9F6A,0x9F77,0x9F72,0x9F76,0x9F95,0x9F9C,0x9FA0,0x582F,/* 0x98-0x9F */ + 0x69C7,0x9059,0x7464,0x51DC,0x7199,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E8A,0x891C,0x9348,0x9288,0x84DC,0x4FC9,0x70BB,0x6631,/* 0x40-0x47 */ + 0x68C8,0x92F9,0x66FB,0x5F45,0x4E28,0x4EE1,0x4EFC,0x4F00,/* 0x48-0x4F */ + 0x4F03,0x4F39,0x4F56,0x4F92,0x4F8A,0x4F9A,0x4F94,0x4FCD,/* 0x50-0x57 */ + 0x5040,0x5022,0x4FFF,0x501E,0x5046,0x5070,0x5042,0x5094,/* 0x58-0x5F */ + 0x50F4,0x50D8,0x514A,0x5164,0x519D,0x51BE,0x51EC,0x5215,/* 0x60-0x67 */ + 0x529C,0x52A6,0x52C0,0x52DB,0x5300,0x5307,0x5324,0x5372,/* 0x68-0x6F */ + 0x5393,0x53B2,0x53DD,0xFA0E,0x549C,0x548A,0x54A9,0x54FF,/* 
0x70-0x77 */ + 0x5586,0x5759,0x5765,0x57AC,0x57C8,0x57C7,0xFA0F,0x0000,/* 0x78-0x7F */ + + 0xFA10,0x589E,0x58B2,0x590B,0x5953,0x595B,0x595D,0x5963,/* 0x80-0x87 */ + 0x59A4,0x59BA,0x5B56,0x5BC0,0x752F,0x5BD8,0x5BEC,0x5C1E,/* 0x88-0x8F */ + 0x5CA6,0x5CBA,0x5CF5,0x5D27,0x5D53,0xFA11,0x5D42,0x5D6D,/* 0x90-0x97 */ + 0x5DB8,0x5DB9,0x5DD0,0x5F21,0x5F34,0x5F67,0x5FB7,0x5FDE,/* 0x98-0x9F */ + 0x605D,0x6085,0x608A,0x60DE,0x60D5,0x6120,0x60F2,0x6111,/* 0xA0-0xA7 */ + 0x6137,0x6130,0x6198,0x6213,0x62A6,0x63F5,0x6460,0x649D,/* 0xA8-0xAF */ + 0x64CE,0x654E,0x6600,0x6615,0x663B,0x6609,0x662E,0x661E,/* 0xB0-0xB7 */ + 0x6624,0x6665,0x6657,0x6659,0xFA12,0x6673,0x6699,0x66A0,/* 0xB8-0xBF */ + 0x66B2,0x66BF,0x66FA,0x670E,0xF929,0x6766,0x67BB,0x6852,/* 0xC0-0xC7 */ + 0x67C0,0x6801,0x6844,0x68CF,0xFA13,0x6968,0xFA14,0x6998,/* 0xC8-0xCF */ + 0x69E2,0x6A30,0x6A6B,0x6A46,0x6A73,0x6A7E,0x6AE2,0x6AE4,/* 0xD0-0xD7 */ + 0x6BD6,0x6C3F,0x6C5C,0x6C86,0x6C6F,0x6CDA,0x6D04,0x6D87,/* 0xD8-0xDF */ + 0x6D6F,0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,/* 0xE0-0xE7 */ + 0x6E5C,0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,/* 0xE8-0xEF */ + 0x7007,0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,/* 0xF0-0xF7 */ + 0x7147,0xFA15,0x71C1,0x71FE,0x72B1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x72BE,0x7324,0xFA16,0x7377,0x73BD,0x73C9,0x73D6,0x73E3,/* 0x40-0x47 */ + 0x73D2,0x7407,0x73F5,0x7426,0x742A,0x7429,0x742E,0x7462,/* 0x48-0x4F */ + 0x7489,0x749F,0x7501,0x756F,0x7682,0x769C,0x769E,0x769B,/* 0x50-0x57 */ + 0x76A6,0xFA17,0x7746,0x52AF,0x7821,0x784E,0x7864,0x787A,/* 0x58-0x5F */ + 0x7930,0xFA18,0xFA19,0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,/* 0x60-0x67 */ + 0x7AE7,0xFA1C,0x7AEB,0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,/* 0x68-0x6F */ + 0x7DA0,0x7DD6,0x7E52,0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,/* 0x70-0x77 */ + 0x837F,0x83C7,0x83F6,0x8448,0x84B4,0x8553,0x8559,0x0000,/* 0x78-0x7F */ + + 0x856B,0xFA1F,0x85B0,0xFA20,0xFA21,0x8807,0x88F5,0x8A12,/* 0x80-0x87 */ + 0x8A37,0x8A79,0x8AA7,0x8ABE,0x8ADF,0xFA22,0x8AF6,0x8B53,/* 0x88-0x8F */ + 0x8B7F,0x8CF0,0x8CF4,0x8D12,0x8D76,0xFA23,0x8ECF,0xFA24,/* 0x90-0x97 */ + 0xFA25,0x9067,0x90DE,0xFA26,0x9115,0x9127,0x91DA,0x91D7,/* 0x98-0x9F */ + 0x91DE,0x91ED,0x91EE,0x91E4,0x91E5,0x9206,0x9210,0x920A,/* 0xA0-0xA7 */ + 0x923A,0x9240,0x923C,0x924E,0x9259,0x9251,0x9239,0x9267,/* 0xA8-0xAF */ + 0x92A7,0x9277,0x9278,0x92E7,0x92D7,0x92D9,0x92D0,0xFA27,/* 0xB0-0xB7 */ + 0x92D5,0x92E0,0x92D3,0x9325,0x9321,0x92FB,0xFA28,0x931E,/* 0xB8-0xBF */ + 0x92FF,0x931D,0x9302,0x9370,0x9357,0x93A4,0x93C6,0x93DE,/* 0xC0-0xC7 */ + 0x93F8,0x9431,0x9445,0x9448,0x9592,0xF9DC,0xFA29,0x969D,/* 0xC8-0xCF */ + 0x96AF,0x9733,0x973B,0x9743,0x974D,0x974F,0x9751,0x9755,/* 0xD0-0xD7 */ + 0x9857,0x9865,0xFA2A,0xFA2B,0x9927,0xFA2C,0x999E,0x9A4E,/* 0xD8-0xDF */ + 0x9AD9,0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,/* 0xE0-0xE7 */ + 0x9D70,0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x2170,/* 0xE8-0xEF */ + 
0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,0x2178,/* 0xF0-0xF7 */ + 0x2179,0xFFE2,0xFFE4,0xFF07,0xFF02,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,/* 0x40-0x47 */ + 0x2178,0x2179,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,/* 0x48-0x4F */ + 0x2166,0x2167,0x2168,0x2169,0xFFE2,0xFFE4,0xFF07,0xFF02,/* 0x50-0x57 */ + 0x3231,0x2116,0x2121,0x2235,0x7E8A,0x891C,0x9348,0x9288,/* 0x58-0x5F */ + 0x84DC,0x4FC9,0x70BB,0x6631,0x68C8,0x92F9,0x66FB,0x5F45,/* 0x60-0x67 */ + 0x4E28,0x4EE1,0x4EFC,0x4F00,0x4F03,0x4F39,0x4F56,0x4F92,/* 0x68-0x6F */ + 0x4F8A,0x4F9A,0x4F94,0x4FCD,0x5040,0x5022,0x4FFF,0x501E,/* 0x70-0x77 */ + 0x5046,0x5070,0x5042,0x5094,0x50F4,0x50D8,0x514A,0x0000,/* 0x78-0x7F */ + + 0x5164,0x519D,0x51BE,0x51EC,0x5215,0x529C,0x52A6,0x52C0,/* 0x80-0x87 */ + 0x52DB,0x5300,0x5307,0x5324,0x5372,0x5393,0x53B2,0x53DD,/* 0x88-0x8F */ + 0xFA0E,0x549C,0x548A,0x54A9,0x54FF,0x5586,0x5759,0x5765,/* 0x90-0x97 */ + 0x57AC,0x57C8,0x57C7,0xFA0F,0xFA10,0x589E,0x58B2,0x590B,/* 0x98-0x9F */ + 0x5953,0x595B,0x595D,0x5963,0x59A4,0x59BA,0x5B56,0x5BC0,/* 0xA0-0xA7 */ + 0x752F,0x5BD8,0x5BEC,0x5C1E,0x5CA6,0x5CBA,0x5CF5,0x5D27,/* 0xA8-0xAF */ + 0x5D53,0xFA11,0x5D42,0x5D6D,0x5DB8,0x5DB9,0x5DD0,0x5F21,/* 0xB0-0xB7 */ + 0x5F34,0x5F67,0x5FB7,0x5FDE,0x605D,0x6085,0x608A,0x60DE,/* 0xB8-0xBF */ + 0x60D5,0x6120,0x60F2,0x6111,0x6137,0x6130,0x6198,0x6213,/* 0xC0-0xC7 */ + 0x62A6,0x63F5,0x6460,0x649D,0x64CE,0x654E,0x6600,0x6615,/* 0xC8-0xCF */ + 0x663B,0x6609,0x662E,0x661E,0x6624,0x6665,0x6657,0x6659,/* 0xD0-0xD7 */ + 0xFA12,0x6673,0x6699,0x66A0,0x66B2,0x66BF,0x66FA,0x670E,/* 0xD8-0xDF */ + 0xF929,0x6766,0x67BB,0x6852,0x67C0,0x6801,0x6844,0x68CF,/* 0xE0-0xE7 */ + 0xFA13,0x6968,0xFA14,0x6998,0x69E2,0x6A30,0x6A6B,0x6A46,/* 0xE8-0xEF */ + 0x6A73,0x6A7E,0x6AE2,0x6AE4,0x6BD6,0x6C3F,0x6C5C,0x6C86,/* 0xF0-0xF7 */ + 0x6C6F,0x6CDA,0x6D04,0x6D87,0x6D6F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,0x6E5C,/* 0x40-0x47 */ + 0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,0x7007,/* 0x48-0x4F */ + 0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,0x7147,/* 0x50-0x57 */ + 0xFA15,0x71C1,0x71FE,0x72B1,0x72BE,0x7324,0xFA16,0x7377,/* 0x58-0x5F */ + 0x73BD,0x73C9,0x73D6,0x73E3,0x73D2,0x7407,0x73F5,0x7426,/* 
0x60-0x67 */ + 0x742A,0x7429,0x742E,0x7462,0x7489,0x749F,0x7501,0x756F,/* 0x68-0x6F */ + 0x7682,0x769C,0x769E,0x769B,0x76A6,0xFA17,0x7746,0x52AF,/* 0x70-0x77 */ + 0x7821,0x784E,0x7864,0x787A,0x7930,0xFA18,0xFA19,0x0000,/* 0x78-0x7F */ + + 0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,0x7AE7,0xFA1C,0x7AEB,/* 0x80-0x87 */ + 0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,0x7DA0,0x7DD6,0x7E52,/* 0x88-0x8F */ + 0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,0x837F,0x83C7,0x83F6,/* 0x90-0x97 */ + 0x8448,0x84B4,0x8553,0x8559,0x856B,0xFA1F,0x85B0,0xFA20,/* 0x98-0x9F */ + 0xFA21,0x8807,0x88F5,0x8A12,0x8A37,0x8A79,0x8AA7,0x8ABE,/* 0xA0-0xA7 */ + 0x8ADF,0xFA22,0x8AF6,0x8B53,0x8B7F,0x8CF0,0x8CF4,0x8D12,/* 0xA8-0xAF */ + 0x8D76,0xFA23,0x8ECF,0xFA24,0xFA25,0x9067,0x90DE,0xFA26,/* 0xB0-0xB7 */ + 0x9115,0x9127,0x91DA,0x91D7,0x91DE,0x91ED,0x91EE,0x91E4,/* 0xB8-0xBF */ + 0x91E5,0x9206,0x9210,0x920A,0x923A,0x9240,0x923C,0x924E,/* 0xC0-0xC7 */ + 0x9259,0x9251,0x9239,0x9267,0x92A7,0x9277,0x9278,0x92E7,/* 0xC8-0xCF */ + 0x92D7,0x92D9,0x92D0,0xFA27,0x92D5,0x92E0,0x92D3,0x9325,/* 0xD0-0xD7 */ + 0x9321,0x92FB,0xFA28,0x931E,0x92FF,0x931D,0x9302,0x9370,/* 0xD8-0xDF */ + 0x9357,0x93A4,0x93C6,0x93DE,0x93F8,0x9431,0x9445,0x9448,/* 0xE0-0xE7 */ + 0x9592,0xF9DC,0xFA29,0x969D,0x96AF,0x9733,0x973B,0x9743,/* 0xE8-0xEF */ + 0x974D,0x974F,0x9751,0x9755,0x9857,0x9865,0xFA2A,0xFA2B,/* 0xF0-0xF7 */ + 0x9927,0xFA2C,0x999E,0x9A4E,0x9AD9,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,0x9D70,/* 0x40-0x47 */ + 0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, NULL, NULL, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, NULL, NULL, c2u_ED, c2u_EE, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, c2u_FA, c2u_FB, c2u_FC, NULL, NULL, NULL, +}; + +static const unsigned char u2c_00hi[256 - 0xA0][2] = { + {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x91}, {0x81, 0x92},/* 0xA0-0xA3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x98},/* 0xA4-0xA7 */ + {0x81, 0x4E}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xA8-0xAB */ + {0x81, 0xCA}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xAC-0xAF */ + {0x81, 0x8B}, {0x81, 0x7D}, {0x00, 0x00}, {0x00, 0x00},/* 0xB0-0xB3 */ + {0x81, 0x4C}, {0x00, 0x00}, {0x81, 0xF7}, {0x00, 0x00},/* 0xB4-0xB7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xB8-0xBB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xBC-0xBF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC0-0xC3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC4-0xC7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC8-0xCB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xCC-0xCF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD0-0xD3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x7E},/* 0xD4-0xD7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD8-0xDB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xDC-0xDF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE0-0xE3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE4-0xE7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE8-0xEB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xEC-0xEF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF0-0xF3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x80},/* 0xF4-0xF7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF8-0xFB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xFC-0xFF */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x90-0x93 */ + 0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x94-0x97 */ + 0x83, 0xA6, 0x83, 0xA7, 0x83, 0xA8, 0x83, 0xA9, /* 0x98-0x9B */ + 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, 0x83, 0xAD, /* 0x9C-0x9F */ + 0x83, 0xAE, 0x83, 0xAF, 0x00, 0x00, 0x83, 0xB0, /* 0xA0-0xA3 */ + 0x83, 0xB1, 0x83, 0xB2, 0x83, 0xB3, 0x83, 0xB4, /* 0xA4-0xA7 */ + 0x83, 0xB5, 0x83, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0xB0-0xB3 */ + 0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0xB4-0xB7 */ + 0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xB8-0xBB */ + 0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xBC-0xBF */ + 0x83, 0xCE, 0x83, 0xCF, 0x00, 0x00, 0x83, 0xD0, /* 0xC0-0xC3 */ + 0x83, 0xD1, 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, /* 0xC4-0xC7 */ + 0x83, 0xD5, 0x83, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0x84, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0x84, 0x43, /* 0x10-0x13 */ + 0x84, 0x44, 0x84, 0x45, 0x84, 0x47, 0x84, 0x48, /* 0x14-0x17 */ + 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, 0x84, 0x4C, /* 0x18-0x1B */ + 0x84, 0x4D, 0x84, 0x4E, 0x84, 0x4F, 0x84, 0x50, /* 0x1C-0x1F */ + 0x84, 0x51, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0x20-0x23 */ + 0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x24-0x27 */ + 0x84, 0x59, 0x84, 0x5A, 0x84, 0x5B, 0x84, 0x5C, /* 0x28-0x2B */ + 0x84, 0x5D, 0x84, 0x5E, 0x84, 0x5F, 0x84, 0x60, /* 0x2C-0x2F */ + 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, 0x84, 0x73, /* 0x30-0x33 */ + 0x84, 0x74, 0x84, 0x75, 0x84, 0x77, 0x84, 0x78, /* 0x34-0x37 */ + 0x84, 0x79, 0x84, 0x7A, 0x84, 0x7B, 0x84, 0x7C, /* 0x38-0x3B */ + 0x84, 0x7D, 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, /* 0x3C-0x3F */ + 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, 0x84, 0x85, /* 0x40-0x43 */ + 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, 0x84, 0x89, /* 0x44-0x47 */ + 0x84, 0x8A, 0x84, 0x8B, 0x84, 0x8C, 0x84, 0x8D, /* 0x48-0x4B */ + 0x84, 0x8E, 0x84, 0x8F, 0x84, 0x90, 0x84, 0x91, /* 0x4C-0x4F */ + 0x00, 0x00, 0x84, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char 
u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x81, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x81, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x81, 0x65, 0x81, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x81, 0x67, 0x81, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x81, 0xF5, 0x81, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x81, 0x64, 0x81, 0x63, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x81, 0xF1, 0x00, 0x00, 0x81, 0x8C, 0x81, 0x8D, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xA6, /* 0x38-0x3B */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x8E, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0x59, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xFA, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xF0, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, 0xFA, 0x4D, /* 0x60-0x63 */ + 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, 0xFA, 0x51, /* 0x64-0x67 */ + 0xFA, 0x52, 0xFA, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEE, 0xEF, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0x70-0x73 */ + 0xEE, 0xF3, 0xEE, 0xF4, 0xEE, 0xF5, 0xEE, 0xF6, /* 0x74-0x77 */ + 0xEE, 0xF7, 0xEE, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xA8, 0x81, 0xAB, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x81, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_22[512] = { + 0x81, 0xCD, 0x00, 0x00, 0x81, 0xDD, 0x81, 0xCE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xDE, /* 0x04-0x07 */ + 0x81, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x81, 0xB9, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x87, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x95, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x81, 0xE5, 0x81, 0x87, 0x87, 0x98, /* 0x1C-0x1F */ + 0x87, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x81, 0x61, 0x00, 0x00, 0x81, 0xC8, /* 0x24-0x27 */ + 0x81, 0xC9, 0x87, 0x9B, 0x87, 0x9C, 0x87, 0x92, /* 0x28-0x2B */ + 0x81, 0xE8, 0x00, 0x00, 0x87, 0x93, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x81, 0x88, 0xFA, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x81, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x90, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x81, 0x82, 0x87, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x85, 0x81, 0x86, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xE1, 0x81, 0xE2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x81, 0xBC, 0x81, 0xBD, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xBA, 0x81, 0xBB, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x87, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x99, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xDC, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x87, 0x40, 0x87, 0x41, 0x87, 0x42, 0x87, 0x43, /* 0x60-0x63 */ + 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, /* 0x64-0x67 */ + 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, /* 0x68-0x6B */ + 0x87, 0x4C, 0x87, 0x4D, 0x87, 0x4E, 0x87, 0x4F, /* 0x6C-0x6F */ + 0x87, 0x50, 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, /* 0x70-0x73 */ +}; + +static const unsigned char u2c_25[512] = { + 0x84, 0x9F, 0x84, 0xAA, 0x84, 0xA0, 0x84, 0xAB, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x84, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAC, /* 0x0C-0x0F */ + 0x84, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAD, /* 0x10-0x13 */ + 0x84, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAF, /* 0x14-0x17 */ + 0x84, 0xA3, 0x00, 0x00, 
0x00, 0x00, 0x84, 0xAE, /* 0x18-0x1B */ + 0x84, 0xA5, 0x84, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x84, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB0, /* 0x20-0x23 */ + 0x84, 0xA7, 0x84, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x84, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB2, /* 0x28-0x2B */ + 0x84, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB6, /* 0x2C-0x2F */ + 0x84, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB1, /* 0x30-0x33 */ + 0x84, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB8, /* 0x34-0x37 */ + 0x84, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB3, /* 0x38-0x3B */ + 0x84, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB9, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x84, 0xBE, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x81, 0xA1, 0x81, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xA3, 0x81, 0xA2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x81, 0xA5, 0x81, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x9F, 0x81, 0x9E, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9B, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x9D, 0x81, 0x9C, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xFC, /* 0xEC-0xEF */ +}; + +static 
const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x81, 0x9A, 0x81, 0x99, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x81, 0x8A, 0x00, 0x00, 0x81, 0x89, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xF4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x81, 0xF3, 0x00, 0x00, 0x81, 0xF2, /* 0x6C-0x6F */ +}; + +static const unsigned char u2c_30[512] = { + 0x81, 0x40, 0x81, 0x41, 0x81, 0x42, 0x81, 0x56, /* 0x00-0x03 */ + 0x00, 0x00, 0x81, 0x58, 0x81, 0x59, 0x81, 0x5A, /* 0x04-0x07 */ + 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, 0x81, 0x74, /* 0x08-0x0B */ + 0x81, 0x75, 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, /* 0x0C-0x0F */ + 0x81, 0x79, 0x81, 0x7A, 0x81, 0xA7, 0x81, 0xAC, /* 0x10-0x13 */ + 0x81, 0x6B, 0x81, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x87, 0x80, 0x00, 0x00, 0x87, 0x81, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, /* 0x40-0x43 */ + 0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x44-0x47 */ + 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, 0x82, 0xA9, /* 0x48-0x4B */ + 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, 0x82, 0xAD, /* 0x4C-0x4F */ + 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, 0x82, 0xB1, /* 0x50-0x53 */ + 0x82, 0xB2, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x54-0x57 */ + 0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x58-0x5B */ + 0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0x5C-0x5F */ + 0x82, 0xBE, 0x82, 0xBF, 0x82, 
0xC0, 0x82, 0xC1, /* 0x60-0x63 */ + 0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0x64-0x67 */ + 0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0x82, 0xC9, /* 0x68-0x6B */ + 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0x82, 0xCD, /* 0x6C-0x6F */ + 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x70-0x73 */ + 0x82, 0xD2, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0x74-0x77 */ + 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, 0x82, 0xD9, /* 0x78-0x7B */ + 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, /* 0x7C-0x7F */ + + 0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0x80-0x83 */ + 0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0x84-0x87 */ + 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, /* 0x88-0x8B */ + 0x82, 0xEA, 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, /* 0x8C-0x8F */ + 0x82, 0xEE, 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, /* 0x90-0x93 */ + 0x83, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x4A, /* 0x98-0x9B */ + 0x81, 0x4B, 0x81, 0x54, 0x81, 0x55, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xA0-0xA3 */ + 0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xA4-0xA7 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xA8-0xAB */ + 0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xAC-0xAF */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0xB0-0xB3 */ + 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, 0x83, 0x56, /* 0xB4-0xB7 */ + 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, 0x83, 0x5A, /* 0xB8-0xBB */ + 0x83, 0x5B, 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, /* 0xBC-0xBF */ + 0x83, 0x5F, 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, /* 0xC0-0xC3 */ + 0x83, 0x63, 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, /* 0xC4-0xC7 */ + 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, /* 0xC8-0xCB */ + 0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, /* 0xCC-0xCF */ + 0x83, 0x6F, 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, /* 0xD0-0xD3 */ + 0x83, 0x73, 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, /* 0xD4-0xD7 */ + 0x83, 0x77, 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, /* 0xD8-0xDB */ + 0x83, 0x7B, 0x83, 0x7C, 0x83, 0x7D, 0x83, 0x7E, /* 0xDC-0xDF */ + 0x83, 0x80, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0xE0-0xE3 */ + 0x83, 0x84, 0x83, 0x85, 0x83, 0x86, 0x83, 0x87, /* 0xE4-0xE7 */ + 0x83, 0x88, 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, /* 0xE8-0xEB */ + 0x83, 0x8C, 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, /* 0xEC-0xEF */ + 0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0xF0-0xF3 */ + 0x83, 0x94, 0x83, 0x95, 0x83, 0x96, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x45, /* 0xF8-0xFB */ + 0x81, 0x5B, 0x81, 0x52, 0x81, 0x53, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFA, 0x58, 0x87, 0x8B, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x87, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x87, 0x85, 0x87, 0x86, 0x87, 0x87, 0x87, 0x88, /* 0xA4-0xA7 */ + 0x87, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x65, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x87, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x87, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x87, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x61, 0x87, 0x6B, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x6A, 0x87, 0x64, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6C, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x66, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6E, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x87, 0x5F, 0x87, 0x6D, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x87, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x87, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x68, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x7E, /* 0x78-0x7B */ + 0x87, 0x8F, 0x87, 0x8E, 0x87, 0x8D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x72, 0x87, 0x73, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x87, 0x6F, 0x87, 0x70, 0x87, 0x71, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x87, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x87, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x87, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ +}; + +static const unsigned char u2c_4E[512] = { + 0x88, 0xEA, 0x92, 0x9A, 0x00, 0x00, 0x8E, 0xB5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9C, /* 0x04-0x07 */ + 0x8F, 0xE4, 0x8E, 0x4F, 0x8F, 0xE3, 0x89, 0xBA, /* 0x08-0x0B */ + 0x00, 0x00, 0x95, 0x73, 0x97, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x98, 0xA0, 0x89, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8A, 0x8E, 0x98, 0xA1, 0x90, 0xA2, 0x99, 0xC0, /* 0x14-0x17 */ + 0x8B, 0x75, 0x95, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xE5, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x97, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xC0, 0x00, 0x00, /* 0x24-0x27 */ + 0xED, 0x4C, 0x00, 0x00, 0x98, 0xA2, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x92, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x98, 0xA3, 0x8B, 0xF8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xA4, 0x00, 0x00, /* 0x34-0x37 */ + 0x8A, 0xDB, 0x92, 0x4F, 0x00, 0x00, 0x8E, 0xE5, /* 0x38-0x3B */ + 0x98, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xA6, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xA7, 0x94, 0x54, /* 0x40-0x43 */ + 0x00, 0x00, 0x8B, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x56, /* 0x48-0x4B */ + 0x00, 0x00, 0x93, 0xE1, 0x8C, 0xC1, 0x96, 0x52, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE5, 0x68, 0x98, 0xA8, 0x8F, 0xE6, /* 0x54-0x57 */ + 0x98, 0xA9, 0x89, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8B, 0xE3, 0x8C, 0xEE, 0x96, 0xE7, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 
0xA4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x97, 0x90, 0x00, 0x00, 0x93, 0xFB, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8B, 0x54, 0x00, 0x00, 0x98, 0xAA, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x98, 0xAB, 0x97, 0xB9, 0x00, 0x00, /* 0x84-0x87 */ + 0x97, 0x5C, 0x91, 0x88, 0x98, 0xAD, 0x8E, 0x96, /* 0x88-0x8B */ + 0x93, 0xF1, 0x00, 0x00, 0x98, 0xB0, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x89, 0x5D, 0x8C, 0xDD, 0x00, 0x00, /* 0x90-0x93 */ + 0x8C, 0xDC, 0x88, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x98, 0x6A, 0x98, 0x69, 0x00, 0x00, 0x8D, 0xB1, /* 0x98-0x9B */ + 0x88, 0x9F, 0x00, 0x00, 0x98, 0xB1, 0x98, 0xB2, /* 0x9C-0x9F */ + 0x98, 0xB3, 0x96, 0x53, 0x98, 0xB4, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8C, 0xF0, 0x88, 0xE5, 0x96, 0x92, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8B, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9D, /* 0xA8-0xAB */ + 0x8B, 0x9E, 0x92, 0xE0, 0x97, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */ + 0x98, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xB6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xB7, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x8F, 0x59, 0x90, 0x6D, 0x98, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x98, 0xBA, 0x00, 0x00, 0x98, 0xBB, 0x8B, 0x77, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA1, 0x89, 0xEE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x98, 0xB9, 0x98, 0xB8, 0x95, 0xA7, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8E, 0x65, 0x8E, 0x64, 0x91, 0xBC, 0x98, 0xBD, /* 0xD4-0xD7 */ + 0x95, 0x74, 0x90, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x81, 0x57, 0x98, 0xBE, 0x98, 0xC0, /* 0xDC-0xDF */ + 0x00, 0x00, 0xED, 0x4D, 0x00, 0x00, 0x91, 0xE3, /* 0xE0-0xE3 */ + 0x97, 0xDF, 0x88, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x98, 0xBF, 0x89, 0xBC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8B, 0xC2, 0x00, 0x00, 0x92, 0x87, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8F, 0x98, 0xC1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x43, /* 0xF8-0xFB */ + 0xED, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0xED, 0x4F, 0x8A, 0xE9, 0x00, 0x00, 0xED, 0x50, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x98, 0xC2, 0x88, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x8C, 0xDE, 0x8A, 0xEA, 0x95, 0x9A, /* 0x0C-0x0F */ + 0x94, 0xB0, 0x8B, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xEF, 0x00, 0x00, /* 0x18-0x1B */ + 0x98, 0xE5, 0x93, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x8C, /* 0x2C-0x2F */ + 0x98, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x94, 0xBA, 0x00, 0x00, 0x97, 
0xE0, 0x00, 0x00, /* 0x34-0x37 */ + 0x90, 0x4C, 0xED, 0x51, 0x8E, 0x66, 0x00, 0x00, /* 0x38-0x3B */ + 0x8E, 0x97, 0x89, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xCF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x41, 0x98, 0xC8, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x88, 0xCA, 0x92, 0xE1, 0x8F, 0x5A, /* 0x4C-0x4F */ + 0x8D, 0xB2, 0x97, 0x43, 0x00, 0x00, 0x91, 0xCC, /* 0x50-0x53 */ + 0x00, 0x00, 0x89, 0xBD, 0xED, 0x52, 0x98, 0xC7, /* 0x54-0x57 */ + 0x00, 0x00, 0x97, 0x5D, 0x98, 0xC3, 0x98, 0xC5, /* 0x58-0x5B */ + 0x8D, 0xEC, 0x98, 0xC6, 0x9B, 0x43, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x98, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xD1, /* 0x6C-0x6F */ + 0x98, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC0, /* 0x70-0x73 */ + 0x00, 0x00, 0x95, 0xB9, 0x98, 0xC9, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xCD, /* 0x78-0x7B */ + 0x8C, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x67, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA4, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xD2, 0x00, 0x00, /* 0x84-0x87 */ + 0x98, 0xCA, 0x00, 0x00, 0xED, 0x54, 0x97, 0xE1, /* 0x88-0x8B */ + 0x00, 0x00, 0x8E, 0x98, 0x00, 0x00, 0x98, 0xCB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x98, 0xD0, 0xED, 0x53, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0x56, 0x00, 0x00, 0x98, 0xD3, 0x00, 0x00, /* 0x94-0x97 */ + 0x98, 0xCC, 0x00, 0x00, 0xED, 0x55, 0x8B, 0x9F, /* 0x98-0x9B */ + 0x00, 0x00, 0x88, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0xA0, 0x89, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x44, /* 0xA8-0xAB */ + 0x00, 0x00, 0x96, 0x99, 0x95, 0x8E, 0x8C, 0xF2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x90, 0x4E, 0x97, 0xB5, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xD6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x57, 0x91, 0xA3, /* 0xC0-0xC3 */ + 0x89, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0x45, 0x8F, 0x72, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xED, 0x57, 0x98, 0xD7, 0x00, 0x00, /* 0xCC-0xCF */ + 0x98, 0xDC, 0x98, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x98, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAD, /* 0xD4-0xD7 */ + 0x98, 0xD8, 0x00, 0x00, 0x98, 0xDB, 0x98, 0xD9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x95, 0xDB, 0x00, 0x00, 0x98, 0xD6, /* 0xDC-0xDF */ + 0x00, 0x00, 0x90, 0x4D, 0x00, 0x00, 0x96, 0x93, /* 0xE0-0xE3 */ + 0x98, 0xDD, 0x98, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x43, 0x98, 0xEB, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x6F, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x95, 0x55, 0x98, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x95, 0xEE, 0x00, 0x00, 0x89, 0xB4, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xEA, 0xED, 0x5A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x98, 0xE4, 0x98, 0xED, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x91, 0x71, 0x00, 
0x00, 0x8C, 0xC2, /* 0x08-0x0B */ + 0x00, 0x00, 0x94, 0x7B, 0x00, 0x00, 0xE0, 0xC5, /* 0x0C-0x0F */ + 0x00, 0x00, 0x98, 0xEC, 0x93, 0x7C, 0x00, 0x00, /* 0x10-0x13 */ + 0x98, 0xE1, 0x00, 0x00, 0x8C, 0xF4, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x8C, 0xF3, 0x98, 0xDF, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x5B, 0x8E, 0xD8, /* 0x1C-0x1F */ + 0x00, 0x00, 0x98, 0xE7, 0xED, 0x59, 0x95, 0xED, /* 0x20-0x23 */ + 0x92, 0x6C, 0x98, 0xE3, 0x8C, 0x91, 0x00, 0x00, /* 0x24-0x27 */ + 0x98, 0xE0, 0x98, 0xE8, 0x98, 0xE2, 0x97, 0xCF, /* 0x28-0x2B */ + 0x98, 0xE9, 0x98, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8C, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xED, 0x58, 0x00, 0x00, 0xED, 0x5E, 0x98, 0xEE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x5C, 0x98, 0xEF, /* 0x44-0x47 */ + 0x98, 0xF3, 0x88, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xCE, /* 0x4C-0x4F */ + 0x98, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x98, 0xF1, 0x98, 0xF5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xF4, 0x00, 0x00, /* 0x58-0x5B */ + 0x92, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x8C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x98, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xED, 0x5D, 0x00, 0x00, 0x8E, 0xC3, 0x00, 0x00, /* 0x70-0x73 */ + 0x91, 0xA4, 0x92, 0xE3, 0x8B, 0xF4, 0x00, 0x00, /* 0x74-0x77 */ + 0x98, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x98, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x98, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8C, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x8E, 0x50, 0x94, 0xF5, 0x98, 0xF9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8D, 0xC3, 0x97, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xFC, 0x99, 0x42, /* 0xB0-0xB3 */ + 0x98, 0xFB, 0x8D, 0xC2, 0x00, 0x00, 0x8F, 0x9D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x58, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x43, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8B, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x99, 0x40, 0x99, 0x41, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x93, 0xAD, 0x00, 0x00, 0x91, 0x9C, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8B, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x96, 0x6C, 0x99, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xED, 0x61, 0x00, 0x00, 0x97, 0xBB, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x45, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x99, 0x48, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x99, 0x46, 0x00, 0x00, 0x91, 0x6D, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x99, 0x47, 0x99, 0x49, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xED, 0x60, 0x99, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x99, 0x4A, 0x00, 0x00, 0x95, 0xC6, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_51[512] = { + 0x8B, 0x56, 0x99, 0x4D, 0x99, 0x4E, 0x00, 0x00, /* 0x00-0x03 */ + 0x89, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x99, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF2, 0x00, 0x00, /* 0x10-0x13 */ + 0x99, 0x51, 0x99, 0x50, 0x99, 0x4F, 0x00, 0x00, /* 0x14-0x17 */ + 0x98, 0xD4, 0x00, 0x00, 0x99, 0x52, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x9E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x99, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x44, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xD7, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x55, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x54, 0x99, 0x57, /* 0x38-0x3B */ + 0x99, 0x56, 0x00, 0x00, 0x00, 0x00, 0x99, 0x58, /* 0x3C-0x3F */ + 0x99, 0x59, 0x88, 0xF2, 0x00, 0x00, 0x8C, 0xB3, /* 0x40-0x43 */ + 0x8C, 0x5A, 0x8F, 0x5B, 0x92, 0x9B, 0x8B, 0xA2, /* 0x44-0x47 */ + 0x90, 0xE6, 0x8C, 0xF5, 0xED, 0x62, 0x8D, 0x8E, /* 0x48-0x4B */ + 0x99, 0x5B, 0x96, 0xC6, 0x93, 0x65, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0x99, 0x00, 0x00, 0x99, 0x5A, 0x00, 0x00, /* 0x50-0x53 */ + 0x99, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x7D, 0x00, 0x00, /* 0x58-0x5B */ + 0x8A, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x5D, 0x00, 0x00, /* 0x60-0x63 */ + 0xED, 0x63, 0x93, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x91, 0x53, 0x99, 0x5F, 0x99, 0x60, 0x94, 0xAA, /* 0x68-0x6B */ + 0x8C, 0xF6, 0x98, 0x5A, 0x99, 0x61, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8B, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x95, 0xBA, 0x91, 0xB4, 0x8B, 0xEF, /* 0x74-0x77 */ + 0x93, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x8C, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x99, 0x62, 0x00, 0x00, 0x99, 0x63, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x93, 0xE0, 0x89, 0x7E, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x99, 0x66, 0x8D, 0xFB, 0x00, 0x00, /* 0x88-0x8B */ + 0x99, 0x65, 0x8D, 0xC4, 0x00, 0x00, 0x99, 0x67, /* 0x8C-0x8F */ + 0xE3, 0xEC, 0x99, 0x68, 0x96, 0x60, 0x99, 0x69, /* 0x90-0x93 */ + 0x00, 0x00, 0x99, 0x6A, 0x99, 0x6B, 0x8F, 0xE7, /* 0x94-0x97 */ + 0x00, 0x00, 0x8E, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xED, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8A, 0xA5, 0x00, 0x00, 0x99, 0x6E, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x99, 0x6C, 0x96, 0xBB, 0x99, 0x6D, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x95, 0x79, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0xA8-0xAB */ + 0x93, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x99, 0x75, 0x99, 0x73, 0x99, 0x74, 0x99, 0x72, /* 0xB0-0xB3 */ + 0x8D, 0xE1, 0x99, 0x76, 0x96, 0xE8, 0x97, 0xE2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x99, 0x77, 0xED, 0x65, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x90, 0xA6, 0x99, 0x78, 0x8F, 0x79, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x99, 0x79, 0x00, 0x00, 0x92, 0x9C, /* 0xC8-0xCB */ + 0x97, 0xBD, 0x93, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xC3, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x7A, /* 0xD8-0xDB */ + 0xEA, 0xA3, 0x8B, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x99, 0x7B, 0x96, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x88, 0x91, 0xFA, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x99, 0x7D, 0x93, 0xE2, 0x00, 0x00, /* 0xE8-0xEB */ + 0xED, 0x66, 0x99, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x99, 0x80, 0x8A, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x99, 0x81, 0x8B, 0xA5, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x93, 0xCA, 0x89, 0x9A, 0x8F, 0x6F, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x94, 0x9F, 0x99, 0x82, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0x93, 0x81, 0x00, 0x00, 0x00, 0x00, 0x90, 0x6E, /* 0x00-0x03 */ + 0x99, 0x83, 0x00, 0x00, 0x95, 0xAA, 0x90, 0xD8, /* 0x04-0x07 */ + 0x8A, 0xA0, 0x00, 0x00, 0x8A, 0xA7, 0x99, 0x84, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x86, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8C, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x99, 0x85, 0xED, 0x67, 0x00, 0x00, 0x97, 0xF1, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x8F, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x94, 0xBB, 0x95, 0xCA, 0x00, 0x00, 0x99, 0x87, /* 0x24-0x27 */ + 0x00, 0x00, 0x97, 0x98, 0x99, 0x88, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x89, 0x00, 0x00, /* 0x2C-0x2F */ + 0x93, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x99, 0x8A, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xA7, 0x8D, 0xFC, /* 0x34-0x37 */ + 0x8C, 0x94, 0x99, 0x8B, 0x8E, 0x68, 0x8D, 0x8F, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xE4, /* 0x40-0x43 */ + 0x99, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x91, 0xA5, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xED, 0x99, 0x8E, /* 0x48-0x4B */ + 0x99, 0x8F, 0x91, 0x4F, 0x00, 0x00, 0x99, 0x8C, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x99, 0x91, 0x00, 0x00, 0x96, 0x55, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x84, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x90, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x95, /* 0x60-0x63 */ + 0x8D, 0xDC, 0x94, 0x8D, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x99, 0x94, 0x99, 0x92, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x9B, /* 0x6C-0x6F */ + 0x8F, 0xE8, 0x99, 0x9B, 0x8A, 0x84, 0x99, 0x95, /* 0x70-0x73 */ + 0x99, 0x93, 0x91, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x99, 0x97, 0x00, 0x00, 0x99, 0x96, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x63, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x80, /* 0x84-0x87 */ + 0x99, 0x9C, 0x97, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x99, 
0x98, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x99, 0x9D, 0x99, 0x9A, 0x00, 0x00, /* 0x90-0x93 */ + 0x99, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xCD, /* 0x98-0x9B */ + 0xED, 0x68, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF7, /* 0x9C-0x9F */ + 0x89, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x97, 0xF2, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8F, 0x95, 0x93, 0x77, 0x8D, 0x85, /* 0xA8-0xAB */ + 0x99, 0xA0, 0x99, 0xA1, 0x00, 0x00, 0xEE, 0x5B, /* 0xAC-0xAF */ + 0x00, 0x00, 0x97, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x98, 0x4A, 0x99, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x8C, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x99, 0xA2, 0x00, 0x00, 0x8A, 0x4E, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0x6A, 0x99, 0xA4, 0x00, 0x00, 0x96, 0x75, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x92, 0xBA, 0x00, 0x00, 0x97, 0x45, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x95, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x99, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x93, 0xAE, 0x00, 0x00, 0x99, 0xA6, /* 0xD4-0xD7 */ + 0x8A, 0xA8, 0x96, 0xB1, 0x00, 0x00, 0xED, 0x6B, /* 0xD8-0xDB */ + 0x00, 0x00, 0x8F, 0x9F, 0x99, 0xA7, 0x95, 0xE5, /* 0xDC-0xDF */ + 0x99, 0xAB, 0x00, 0x00, 0x90, 0xA8, 0x99, 0xA8, /* 0xE0-0xE3 */ + 0x8B, 0xCE, 0x00, 0x00, 0x99, 0xA9, 0x8A, 0xA9, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4D, 0x99, 0xAC, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x99, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x99, 0xAE, 0x99, 0xAF, 0x8E, 0xD9, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF9, 0x96, 0xDC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0xED, 0x6C, 0x96, 0xE6, 0x93, 0xF5, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x95, 0xEF, 0x99, 0xB0, 0xED, 0x6D, /* 0x04-0x07 */ + 0x99, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x99, 0xB3, 0x00, 0x00, 0x99, 0xB5, /* 0x0C-0x0F */ + 0x99, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x99, 0xB6, 0x89, 0xBB, 0x96, 0x6B, /* 0x14-0x17 */ + 0x00, 0x00, 0x8D, 0xFA, 0x99, 0xB7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x91, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8F, 0xA0, 0x8B, 0xA7, 0x00, 0x00, 0x99, 0xB8, /* 0x20-0x23 */ + 0xED, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xD9, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xB9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x99, 0xBA, 0x00, 0x00, 0x99, 0xBB, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x99, 0xBC, 0x95, 0x43, 0x8B, 0xE6, 0x88, 0xE3, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBD, /* 0x3C-0x3F */ + 0x99, 0xBD, 0x8F, 0x5C, 0x00, 0x00, 0x90, 0xE7, /* 0x40-0x43 */ + 0x00, 0x00, 0x99, 0xBF, 0x99, 0xBE, 0x8F, 0xA1, /* 0x44-0x47 */ + 0x8C, 0xDF, 0x99, 0xC1, 0x94, 0xBC, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x99, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x94, 0xDA, 0x91, 0xB2, 0x91, 0xEC, /* 0x50-0x53 */ + 0x8B, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEC, /* 0x54-0x57 */ + 0x92, 0x50, 0x00, 0x00, 0x94, 0x8E, 0x00, 0x00, /* 0x58-0x5B */ + 0x96, 0x6D, 0x00, 0x00, 0x99, 0xC4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x90, 0xE8, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x54, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x99, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xC6, 0x89, 0x4B, /* 0x6C-0x6F */ + 0x88, 0xF3, 0x8A, 0xEB, 0xED, 0x6F, 0x91, 0xA6, /* 0x70-0x73 */ + 0x8B, 0x70, 0x97, 0x91, 0x00, 0x00, 0x99, 0xC9, /* 0x74-0x77 */ + 0x89, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x99, 0xC8, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA8, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x99, 0xCA, 0x00, 0x00, /* 0x80-0x83 */ + 0x96, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x70, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xCB, 0x00, 0x00, /* 0x94-0x97 */ + 0x97, 0xD0, 0x00, 0x00, 0x8C, 0xFA, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xB4, /* 0x9C-0x9F */ + 0x99, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x99, 0xCE, 0x99, 0xCD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x90, 0x7E, 0x89, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x89, 0x7D, 0x99, 0xCF, 0x00, 0x00, /* 0xAC-0xAF */ + 0x99, 0xD0, 0x00, 0x00, 0xED, 0x71, 0x8C, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xD1, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8E, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x51, 0x99, 0xD2, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x96, 0x94, 0x8D, 0xB3, 0x8B, 0x79, 0x97, 0x46, /* 0xC8-0xCB */ + 0x91, 0x6F, 0x94, 0xBD, 0x8E, 0xFB, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8F, 0x66, 0x00, 0x00, 0x8E, 0xE6, 0x8E, 0xF3, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8F, 0x96, 0x00, 0x00, 0x94, 0xBE, /* 0xD8-0xDB */ + 0x00, 0x00, 0xED, 0x72, 0x00, 0x00, 0x99, 0xD5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x89, 0x62, 0x91, 0x70, 0x8C, 0xFB, /* 0xE0-0xE3 */ + 0x8C, 0xC3, 0x8B, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x99, 0xD9, 0x92, 0x40, 0x91, 0xFC, 0x8B, 0xA9, /* 0xE8-0xEB */ + 0x8F, 0xA2, 0x99, 0xDA, 0x99, 0xD8, 0x89, 0xC2, /* 0xEC-0xEF */ + 0x91, 0xE4, 0x8E, 0xB6, 0x8E, 0x6A, 0x89, 0x45, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x90, 0x8D, 0x86, /* 0xF4-0xF7 */ + 0x8E, 0x69, 0x00, 0x00, 0x99, 0xDB, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0x99, 0xDC, 0x00, 0x00, 0x8B, 0x68, /* 0x00-0x03 */ + 0x8A, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8D, 0x87, 0x8B, 0x67, 0x92, 0xDD, 0x89, 0x44, /* 0x08-0x0B */ + 0x93, 0xAF, 0x96, 0xBC, 0x8D, 0x40, 0x97, 0x99, /* 0x0C-0x0F */ + 0x93, 0x66, 0x8C, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4E, /* 0x18-0x1B */ + 0x00, 0x00, 0x99, 0xE5, 0x00, 0x00, 0x8B, 0xE1, /* 0x1C-0x1F */ + 0x96, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xDB, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x99, 0xE4, 0x00, 0x00, 0x8A, 0xDC, /* 0x28-0x2B */ + 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE2, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xE3, 0x00, 0x00, /* 0x34-0x37 */ + 0x8B, 0x7A, 0x90, 0x81, 
0x00, 0x00, 0x95, 0xAB, /* 0x38-0x3B */ + 0x99, 0xE1, 0x99, 0xDD, 0x8C, 0xE1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x99, 0xDE, 0x00, 0x00, 0x98, 0x43, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xF0, 0x00, 0x00, /* 0x44-0x47 */ + 0x92, 0xE6, 0x8C, 0xE0, 0x8D, 0x90, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xE6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x93, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEA, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8E, 0xFC, 0x00, 0x00, 0x8E, 0xF4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x99, 0xED, 0x99, 0xEB, 0x00, 0x00, 0x96, 0xA1, /* 0x70-0x73 */ + 0x00, 0x00, 0x99, 0xE8, 0x99, 0xF1, 0x99, 0xEC, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEF, /* 0x78-0x7B */ + 0x8C, 0xC4, 0x96, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x99, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x99, 0xF2, 0x00, 0x00, 0x99, 0xF4, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x75, 0x8D, 0xEE, /* 0x88-0x8B */ + 0x98, 0x61, 0x00, 0x00, 0x99, 0xE9, 0x99, 0xE7, /* 0x8C-0x8F */ + 0x99, 0xF3, 0x00, 0x00, 0x99, 0xEE, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xED, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xF6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x9A, 0x42, 0x99, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x99, 0xFC, 0xED, 0x76, 0x00, 0x00, 0x9A, 0x40, /* 0xA8-0xAB */ + 0x99, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x5D, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE7, 0x8A, 0x50, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x99, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9A, 0x44, 0x88, 0xF4, 0x9A, 0x43, 0x00, 0x00, /* 0xBC-0xBF */ + 0x88, 0xA3, 0x95, 0x69, 0x9A, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x99, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x99, 0xF5, /* 0xC4-0xC7 */ + 0x99, 0xFB, 0x8D, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9A, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x88, 0xF5, 0x9A, 0x4E, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x9A, 0x46, 0x9A, 0x47, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8F, 0xA3, 0x96, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9A, 0x4C, 0x9A, 0x4B, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x4E, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x4D, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9A, 0x4A, 0x00, 0x00, 0xED, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x89, 0x53, 0x00, 0x00, 0x8D, 0xB4, 0x90, 0x4F, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x9A, 0x48, /* 0x0C-0x0F */ + 0x93, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9A, 0x49, 0x00, 0x00, 0x88, 0xA0, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x53, 0x97, 0x42, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0xA5, 0x00, 0x00, 0x9A, 0x59, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9A, 0x58, 0x9A, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xC1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9A, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x91, 0xED, 0x9A, 0x55, 0x8F, 0xA4, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x9A, 0x52, 0x00, 0x00, 0x00, 0x00, 0x96, 0xE2, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5B, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x56, 0x9A, 0x57, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9A, 0x54, 0x9A, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x51, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x60, /* 0x78-0x7B */ + 0x9A, 0x65, 0x00, 0x00, 0x9A, 0x61, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9A, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x66, /* 0x80-0x83 */ + 0x91, 0x50, 0x00, 0x00, 0xED, 0x78, 0x9A, 0x68, /* 0x84-0x87 */ + 0x00, 0x00, 0x8D, 0x41, 0x9A, 0x5E, 0x92, 0x9D, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x9A, 0x62, 0x9A, 0x5B, 0x8A, 0xAB, 0x00, 0x00, /* 0x98-0x9B */ + 0x8A, 0xEC, 0x8A, 0x85, 0x9A, 0x63, 0x9A, 0x5F, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x96, /* 0xA4-0xA7 */ + 0x9A, 0x69, 0x9A, 0x67, 0x91, 0x72, 0x8B, 0x69, /* 0xA8-0xAB */ + 0x8B, 0xAA, 0x00, 0x00, 0x9A, 0x64, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8B, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x63, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9A, 0x6D, 0x9A, 0x6B, 0x00, 0x00, 0x9A, 0xA5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9A, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9A, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6C, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6B, /* 0xE0-0xE3 */ + 0x9A, 0x6F, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x72, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9A, 0x75, 0x9A, 0x74, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x51, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x89, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9A, 0x71, 0x00, 0x00, 0x9A, 0x73, 0x8F, 0xA6, /* 0x14-0x17 */ + 0x89, 0x52, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x76, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x82, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0xFA, 0x9A, 0x7D, 0x00, 0x00, /* 0x30-0x33 */ + 0x9A, 0x7B, 0x00, 0x00, 0x9A, 0x7C, 0x00, 0x00, /* 0x34-0x37 */ + 0x9A, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x5C, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x91, 0x58, 0x00, 0x00, 0x9A, 0x78, 0x00, 0x00, /* 0x4C-0x4F */ + 0x9A, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x9A, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x9A, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8A, 0xED, 0x00, 0x00, 0x9A, 0x84, 0x9A, 0x80, /* 0x68-0x6B */ + 0x9A, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x95, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x93, 0xD3, 0x00, 0x00, 0x94, 0xB6, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x85, 0x8A, 0x64, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x87, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9A, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9A, 0x88, 0x00, 0x00, 0x94, 0x58, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x9A, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8C, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x9A, 0x8E, 0x00, 0x00, 0x9A, 0x8D, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9A, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9A, 0x93, 0x9A, 0x91, 0x9A, 0x8F, 0x9A, 0x92, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x9A, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x95, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9A, 0x96, 0x00, 0x00, 0x9A, 0x97, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x98, /* 0xD4-0xD7 */ + 0x99, 0x64, 0x00, 0x00, 0x8E, 0xFA, 0x8E, 0x6C, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF1, 0x00, 0x00, /* 0xDC-0xDF */ + 0x88, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x92, 0x63, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x99, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8D, 0xA2, 0x00, 0x00, 0x88, 0xCD, 0x90, 0x7D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0x9A, 0x8C, 0xC5, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x8D, 0x91, 0x00, 0x00, 0x9A, 0x9C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x9A, 0x9B, 0x00, 0x00, 0x00, 0x00, 0x95, 0xDE, /* 0x00-0x03 */ + 0x9A, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9A, 0x9F, 0x9A, 0x9E, 0x00, 0x00, 0x9A, 0xA0, /* 0x08-0x0B */ + 0x00, 0x00, 0x9A, 0xA1, 0x00, 0x00, 0x8C, 0x97, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x80, 0x9A, 0xA2, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA4, 0x00, 0x00, /* 0x14-0x17 */ + 0x9A, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9A, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0x79, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA7, 0x88, 0xB3, /* 0x24-0x27 */ + 0x8D, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8C, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x92, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA8, /* 0x34-0x37 */ + 0x9A, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAB, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9A, 0xAC, 0x00, 0x00, 0x8D, 0xE2, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xCF, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x56, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAA, 0x9A, 0xAD, /* 0x4C-0x4F */ + 0x8D, 0xBF, 0x8D, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xED, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9A, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x8D, 0xA3, 0xED, 0x7A, 0x92, 0x52, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x9A, 0xAE, 0x92, 0xD8, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x90, 0x82, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x9A, 0xB0, 0x9A, 0xB3, 0x00, 0x00, 0x8C, 0x5E, /* 0x88-0x8B */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB4, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9A, 0xB5, 0x00, 0x00, 0x8D, 0x43, 0x8A, 0x5F, /* 0xA0-0xA3 */ + 0x9A, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0xED, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x9A, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9A, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBA, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBB, 0xED, 0x7D, /* 0xC4-0xC7 */ + 0xED, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x96, 0x84, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xE9, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xD0-0xD3 */ + 0x9A, 0xBC, 0x00, 0x00, 0x9A, 0xC0, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x94, 0x57, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE6, /* 0xDC-0xDF */ + 0x95, 0x75, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xC1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x8F, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xB7, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x94, 0x7C, 0x8A, 0xEE, 0x00, 0x00, /* 0xF8-0xFB */ + 0x8D, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0x96, 0x78, 0x00, 0x00, 0x93, 0xB0, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x8C, 0x98, 0x91, 0xCD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBF, 0x9A, 0xC2, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x91, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9A, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x9A, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9A, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x92, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAC, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9F, /* 0x2C-0x2F */ + 0x89, 0x81, 0x95, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x8F, 0xEA, 0x93, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE4, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x9A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x95, 0xBB, 0x97, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF2, 0x9A, 0xC8, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x91, 0x59, 0x9A, 0xCB, 0x00, 0x00, /* 0x50-0x53 */ + 0x93, 0x83, 0x00, 0x00, 0x00, 0x00, 0x93, 0x68, /* 0x54-0x57 */ + 0x93, 0x84, 0x94, 0xB7, 0x92, 0xCB, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 
0x00, 0x9A, 0xC7, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x89, 0x96, 0x00, 0x00, 0x93, 0x55, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x9A, 0xC9, 0x00, 0x00, 0x9A, 0xC5, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x90, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x9A, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAB, /* 0x80-0x83 */ + 0x00, 0x00, 0x9A, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE6, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x9D, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x92, 0xC4, 0x00, 0x00, 0xED, 0x81, 0x9A, 0xD0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD1, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD6, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x82, 0x95, 0xAD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9A, 0xD5, 0x9A, 0xCF, 0x9A, 0xD2, 0x9A, 0xD4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA4, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x95, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9A, 0xD7, 0x00, 0x00, 0x92, 0x64, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF3, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8F, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9A, 0xD9, 0x00, 0x00, 0x9A, 0xD8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x8D, 0x88, 0x00, 0x00, 0x9A, 0xDA, /* 0xD4-0xD7 */ + 0x9A, 0xDC, 0x9A, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9A, 0xDE, 0x00, 0x00, 0x9A, 0xD3, 0x9A, 0xE0, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9A, 0xDF, 0x9A, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6D, /* 0xE8-0xEB */ + 0x90, 0x70, 0x00, 0x00, 0x91, 0x73, 0x9A, 0xE1, /* 0xEC-0xEF */ + 0x90, 0xBA, 0x88, 0xEB, 0x94, 0x84, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xD9, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0xE3, 0x9A, 0xE2, 0x9A, 0xE4, /* 0xF8-0xFB */ + 0x9A, 0xE5, 0x9A, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xE7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x95, 0xCF, 0x9A, 0xE8, 0xED, 0x83, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC4, /* 0x0C-0x0F */ + 0x9A, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x97, 0x5B, 0x8A, 0x4F, 0x00, 0x00, /* 0x14-0x17 */ + 0x99, 0xC7, 0x8F, 0x67, 0x91, 0xBD, 0x9A, 0xEA, /* 0x18-0x1B */ + 0x96, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xB2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x9A, 0xEC, 0x00, 0x00, 0x91, 0xE5, /* 0x24-0x27 */ + 0x00, 0x00, 0x93, 0x56, 0x91, 0xBE, 0x95, 0x76, /* 0x28-0x2B */ + 0x9A, 0xED, 0x9A, 0xEE, 0x89, 0x9B, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xB8, 0x9A, 0xEF, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x88, 0xCE, /* 0x34-0x37 */ + 0x9A, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x89, 0x82, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xEF, /* 0x44-0x47 */ + 0x93, 0xDE, 0x95, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xF5, 0x91, 0x74, /* 0x4C-0x4F */ + 0x9A, 0xF4, 0x8C, 0x5F, 0x00, 0x00, 0xED, 0x84, /* 0x50-0x53 */ + 0x96, 0x7A, 0x9A, 0xF3, 0x00, 0x00, 0x93, 0x85, /* 0x54-0x57 */ + 0x9A, 0xF7, 0x00, 0x00, 0x9A, 0xF6, 0xED, 0x85, /* 0x58-0x5B */ + 0x00, 0x00, 0xED, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x9A, 0xF9, 0x00, 0x00, 0x9A, 0xF8, 0xED, 0x87, /* 0x60-0x63 */ + 0x00, 0x00, 0x89, 0x9C, 0x00, 0x00, 0x9A, 0xFA, /* 0x64-0x67 */ + 0x8F, 0xA7, 0x9A, 0xFC, 0x92, 0x44, 0x00, 0x00, /* 0x68-0x6B */ + 0x9A, 0xFB, 0x00, 0x00, 0x95, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x97, /* 0x70-0x73 */ + 0x93, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9B, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8D, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9B, 0x41, 0x94, 0x40, 0x94, 0xDC, /* 0x80-0x83 */ + 0x96, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x44, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x57, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x64, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x96, 0xAD, 0x00, 0x00, 0x9B, 0xAA, /* 0x98-0x9B */ + 0x00, 0x00, 0x9B, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x45, /* 0xA0-0xA3 */ + 0xED, 0x88, 0x91, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x93, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x96, 0x85, 0xED, 0x89, 0x8D, 0xC8, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xA8, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x47, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8E, 0x6F, 0x00, 0x00, 0x8E, 0x6E, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x88, 0xB7, 0x8C, 0xC6, 0x00, 0x00, 0x90, 0xA9, /* 0xD0-0xD3 */ + 0x88, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9B, 0x4B, 0x9B, 0x4C, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9B, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x89, 0x57, 0x8A, 0xAD, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9B, 0x48, 0x00, 0x00, 0x96, 0xC3, 0x95, 0x50, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xA6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF7, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x70, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x00, 0x00, 0x88, 0xD0, 0x00, 0x00, 0x88, 0xA1, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x9B, 
0x51, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x96, 0xBA, 0x00, 0x00, 0x9B, 0x52, 0x00, 0x00, /* 0x18-0x1B */ + 0x9B, 0x50, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x4E, /* 0x1C-0x1F */ + 0x90, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x9B, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x95, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE2, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x9B, 0x56, 0x9B, 0x57, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0x53, 0x98, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x6B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x9B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA5, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x58, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x77, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x59, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x7D, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x5A, 0x95, 0x51, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9B, 0x5B, 0x9B, 0x5F, 0x9B, 0x5C, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x89, 0xC5, 0x9B, 0x5E, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8E, 0xB9, 0x00, 0x00, 0x9B, 0x5D, /* 0xC8-0xCB */ + 0x8C, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9B, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x64, 0x9B, 0x61, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 
0x00, 0x92, 0x84, 0x00, 0x00, 0x9B, 0x60, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x62, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9B, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x65, 0x9B, 0x66, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8A, 0xF0, 0x00, 0x00, 0x9B, 0x68, /* 0x08-0x0B */ + 0x9B, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x69, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xEC, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6C, 0x00, 0x00, /* 0x28-0x2B */ + 0x92, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x89, 0x64, 0x00, 0x00, 0x9B, 0x6A, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6E, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0x71, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6F, /* 0x40-0x43 */ + 0x00, 0x00, 0x9B, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0x71, 0x9B, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8D, 0x45, 0x9B, 0x73, 0xED, 0x8A, 0x8E, 0x9A, /* 0x54-0x57 */ + 0x91, 0xB6, 0x00, 0x00, 0x9B, 0x74, 0x9B, 0x75, /* 0x58-0x5B */ + 0x8E, 0x79, 0x8D, 0x46, 0x00, 0x00, 0x96, 0xD0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x47, /* 0x60-0x63 */ + 0x8C, 0xC7, 0x9B, 0x76, 0x8A, 0x77, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x9B, 0x77, 0x00, 0x00, 0x91, 0xB7, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x9B, 0x78, 0x9B, 0xA1, 0x00, 0x00, 0x9B, 0x79, /* 0x70-0x73 */ + 0x00, 0x00, 0x9B, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9B, 0x7B, 0x00, 0x00, 0x9B, 0x7D, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9B, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x80, /* 0x80-0x83 */ + 0x00, 0x00, 0x91, 0xEE, 0x00, 0x00, 0x89, 0x46, /* 0x84-0x87 */ + 0x8E, 0xE7, 0x88, 0xC0, 0x00, 0x00, 0x91, 0x76, /* 0x88-0x8B */ + 0x8A, 0xAE, 0x8E, 0xB3, 0x00, 0x00, 0x8D, 0x47, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x93, 0x86, 0x00, 0x00, 0x8F, 0x40, /* 0x94-0x97 */ + 0x8A, 0xAF, 0x92, 0x88, 0x92, 0xE8, 0x88, 0xB6, /* 0x98-0x9B */ + 0x8B, 0x58, 0x95, 0xF3, 0x00, 0x00, 0x8E, 0xC0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x71, 0x90, 0xE9, /* 0xA0-0xA3 */ + 0x8E, 0xBA, 0x97, 0x47, 0x9B, 0x81, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x7B, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x51, /* 0xB0-0xB3 */ + 0x89, 0x83, 0x8F, 0xAA, 0x89, 0xC6, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9B, 
0x82, 0x97, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x68, /* 0xBC-0xBF */ + 0xED, 0x8B, 0x00, 0x00, 0x8E, 0xE2, 0x9B, 0x83, /* 0xC0-0xC3 */ + 0x8A, 0xF1, 0x93, 0xD0, 0x96, 0xA7, 0x9B, 0x84, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9B, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x95, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9B, 0x87, 0x00, 0x00, 0x8A, 0xA6, 0x8B, 0xF5, /* 0xD0-0xD3 */ + 0x9B, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xED, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB0, /* 0xD8-0xDB */ + 0x00, 0x00, 0x90, 0x51, 0x9B, 0x8B, 0x8E, 0x40, /* 0xDC-0xDF */ + 0x00, 0x00, 0x89, 0xC7, 0x9B, 0x8A, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9B, 0x88, 0x9B, 0x8C, 0x9B, 0x89, 0x94, 0x4A, /* 0xE4-0xE7 */ + 0x9E, 0xCB, 0x90, 0x52, 0x00, 0x00, 0x9B, 0x8D, /* 0xE8-0xEB */ + 0xED, 0x8E, 0x00, 0x00, 0x97, 0xBE, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9B, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x90, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x92, 0x9E, 0x9B, 0x8F, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x90, 0xA1, 0x00, 0x00, 0x8E, 0x9B, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xCE, 0x8E, 0xF5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0x95, 0x95, 0x90, 0xEA, 0x00, 0x00, /* 0x00-0x03 */ + 0x8E, 0xCB, 0x9B, 0x91, 0x8F, 0xAB, 0x9B, 0x92, /* 0x04-0x07 */ + 0x9B, 0x93, 0x88, 0xD1, 0x91, 0xB8, 0x90, 0x71, /* 0x08-0x0B */ + 0x00, 0x00, 0x9B, 0x94, 0x93, 0xB1, 0x8F, 0xAC, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8F, 0xAD, 0x00, 0x00, 0x9B, 0x95, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xEB, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xAE, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x8F, 0x00, 0x00, /* 0x1C-0x1F */ + 0x9B, 0x96, 0x00, 0x00, 0x9B, 0x97, 0x00, 0x00, /* 0x20-0x23 */ + 0x96, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x9B, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8B, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9B, 0x99, 0x9B, 0x9A, 0x8E, 0xDA, 0x90, 0x4B, /* 0x38-0x3B */ + 0x93, 0xF2, 0x90, 0x73, 0x94, 0xF6, 0x94, 0x41, /* 0x3C-0x3F */ + 0x8B, 0xC7, 0x9B, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8B, 0x8F, 0x9B, 0x9C, 0x00, 0x00, /* 0x44-0x47 */ + 0x8B, 0xFC, 0x00, 0x00, 0x93, 0xCD, 0x89, 0xAE, /* 0x48-0x4B */ + 0x00, 0x00, 0x8E, 0x72, 0x9B, 0x9D, 0x9B, 0xA0, /* 0x4C-0x4F */ + 0x9B, 0x9F, 0x8B, 0xFB, 0x00, 0x00, 0x9B, 0x9E, /* 0x50-0x53 */ + 0x00, 0x00, 0x93, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xAE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x93, 0x6A, 0x8E, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x91, 0x77, 0x97, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0xA2, 0x00, 0x00, 0x9B, 0xA3, 0x93, 0xD4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8E, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xA5, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x9B, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x9B, 
0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8A, 0xF2, 0x9B, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9B, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x89, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x90, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x91, 0x5A, 0x8A, 0xE2, 0x00, 0x00, 0x9B, 0xAB, /* 0xA8-0xAB */ + 0x96, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x91, 0xD0, 0x00, 0x00, 0x8A, 0x78, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xAD, 0x9B, 0xAF, /* 0xB4-0xB7 */ + 0x8A, 0xDD, 0x00, 0x00, 0xED, 0x91, 0x9B, 0xAC, /* 0xB8-0xBB */ + 0x9B, 0xAE, 0x00, 0x00, 0x9B, 0xB1, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9B, 0xB0, 0x00, 0x00, 0x9B, 0xB2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x93, 0xBB, 0x8B, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x89, 0xE3, 0x9B, 0xB4, 0x9B, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9B, 0xB7, 0x00, 0x00, 0x95, 0xF5, /* 0xEC-0xEF */ + 0x95, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xED, 0x92, 0x93, 0x87, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xB6, 0x8F, 0x73, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x92, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBA, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9B, 0xC1, 0x9B, 0xBB, 0x8A, 0x52, 0x9B, 0xBC, /* 0x14-0x17 */ + 0x9B, 0xC5, 0x9B, 0xC4, 0x9B, 0xC3, 0x9B, 0xBF, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x93, /* 0x24-0x27 */ + 0x00, 0x00, 0x95, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x96, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC9, /* 0x48-0x4B */ + 0x9B, 0xC6, 0x00, 0x00, 0x9B, 0xC8, 0x00, 0x00, /* 0x4C-0x4F */ + 0x97, 0x92, 0x00, 0x00, 0x9B, 0xC7, 0xED, 0x94, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9B, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x90, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0xCA, 0xED, 0x97, 0x00, 0x00, 0x8D, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCB, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCC, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCF, 0x00, 0x00, /* 0x80-0x83 */ + 0x9B, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x88, /* 0x88-0x8B */ + 0x9B, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9B, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x9B, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x9B, 0xD2, 0x00, 0x00, 0x9B, 0xD3, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD6, /* 0xB4-0xB7 */ + 0xED, 0x98, 0xED, 0x99, 0x97, 0xE4, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9B, 0xD7, 0x9B, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9B, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8A, 0xDE, 0x9B, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xED, 0x9A, 0x00, 0x00, 0x9B, 0xDB, 0x9B, 0xDA, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDD, /* 0xD8-0xDB */ + 0x00, 0x00, 0x90, 0xEC, 0x8F, 0x42, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8F, 0x84, 0x00, 0x00, 0x91, 0x83, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x8D, 0x48, 0x8D, 0xB6, 0x8D, 0x49, /* 0xE4-0xE7 */ + 0x8B, 0x90, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDE, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x8C, 0xC8, 0x9B, 0xDF, 0x96, 0xA4, /* 0xF0-0xF3 */ + 0x94, 0x62, 0x9B, 0xE0, 0x00, 0x00, 0x8D, 0x4A, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAA, /* 0xF8-0xFB */ + 0x00, 0x00, 0x92, 0x46, 0x8B, 0xD0, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x73, 0x95, 0x7A, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xBF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE1, /* 0x08-0x0B */ + 0x8A, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x9F, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9B, 0xE3, 0x9B, 0xE2, 0x9B, 0xE5, /* 0x18-0x1B */ + 0x00, 0x00, 0x92, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x90, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x74, /* 0x28-0x2B */ + 0x00, 0x00, 0x90, 0xC8, 0x00, 0x00, 0x91, 0xD1, /* 0x2C-0x2F */ + 0x8B, 0x41, 0x00, 0x00, 0x00, 0x00, 0x92, 0xA0, /* 0x30-0x33 */ + 0x00, 0x00, 
0x00, 0x00, 0x9B, 0xE6, 0x9B, 0xE7, /* 0x34-0x37 */ + 0x8F, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x96, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE9, /* 0x40-0x43 */ + 0x9B, 0xE8, 0x95, 0x9D, 0x00, 0x00, 0x9B, 0xF1, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x96, 0x79, 0x00, 0x00, 0x9B, 0xEB, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x9B, 0xED, 0x96, 0x8B, 0x00, 0x00, 0x9B, 0xEC, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xEE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x94, 0xA6, 0x9B, 0xEF, 0x95, 0xBC, /* 0x60-0x63 */ + 0x9B, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB1, 0x95, 0xBD, /* 0x70-0x73 */ + 0x94, 0x4E, 0x9B, 0xF2, 0x9B, 0xF3, 0x00, 0x00, /* 0x74-0x77 */ + 0x8D, 0x4B, 0x8A, 0xB2, 0x9B, 0xF4, 0x8C, 0xB6, /* 0x78-0x7B */ + 0x97, 0x63, 0x97, 0x48, 0x8A, 0xF4, 0x9B, 0xF6, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x92, 0xA1, 0x00, 0x00, 0x8D, 0x4C, /* 0x80-0x83 */ + 0x8F, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x94, 0xDD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xB0, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x98, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x92, 0xEA, 0x95, 0xF7, 0x93, 0x58, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x4D, 0x00, 0x00, /* 0x98-0x9B */ + 0x95, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9B, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x78, 0x8D, 0xC0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xC9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x92, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x88, 0xC1, 0x8F, 0x8E, 0x8D, 0x4E, /* 0xB4-0xB7 */ + 0x97, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9B, 0xF8, 0x9B, 0xF9, 0x94, 0x70, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x9B, 0xFA, 0x97, 0xF5, 0x98, 0x4C, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xFC, /* 0xCC-0xCF */ + 0x9B, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x66, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x40, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x43, 0x9C, 0x44, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9C, 0x42, 0x00, 0x00, 0x95, 0x5F, /* 0xDC-0xDF */ + 0x8F, 0xB1, 0x9C, 0x46, 0x9C, 0x45, 0x9C, 0x41, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9C, 0x47, 0x9C, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9C, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9C, 0x4C, 0x9C, 0x4A, 0x00, 0x00, 0x9C, 0x4B, /* 0xF0-0xF3 */ + 0x9C, 0x4D, 0x00, 0x00, 0x89, 0x84, 0x92, 0xEC, /* 0xF4-0xF7 */ + 0x9C, 0x4E, 0x00, 0x00, 0x8C, 0x9A, 0x89, 0xF4, /* 0xF8-0xFB */ + 0x94, 0x55, 0x00, 0x00, 0x9C, 0x4F, 0x93, 0xF9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0x95, 0xD9, 0x00, 0x00, 0x9C, 0x50, /* 0x00-0x03 */ + 0x98, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 
0x9C, 0x51, 0x95, 0xBE, 0x9C, 0x54, /* 0x08-0x0B */ + 0x98, 0x9F, 0x98, 0xAF, 0x00, 0x00, 0x8E, 0xAE, /* 0x0C-0x0F */ + 0x93, 0xF3, 0x9C, 0x55, 0x00, 0x00, 0x8B, 0x7C, /* 0x10-0x13 */ + 0x92, 0xA2, 0x88, 0xF8, 0x9C, 0x56, 0x95, 0xA4, /* 0x14-0x17 */ + 0x8D, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6F, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xED, /* 0x1C-0x1F */ + 0x00, 0x00, 0xED, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x96, 0xED, 0x8C, 0xB7, 0x8C, 0xCA, /* 0x24-0x27 */ + 0x00, 0x00, 0x9C, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x9C, 0x58, 0x00, 0x00, 0x9C, 0x5E, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xED, 0x9C, 0x92, 0xA3, 0x00, 0x00, 0x8B, 0xAD, /* 0x34-0x37 */ + 0x9C, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x95, 0x4A, 0x00, 0x00, 0x92, 0x65, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9C, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xED, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x9C, 0x5B, 0x00, 0x00, 0x8B, 0xAE, 0x00, 0x00, /* 0x48-0x4B */ + 0x9C, 0x5C, 0x00, 0x00, 0x9C, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x9C, 0x5F, 0x00, 0x00, 0x93, 0x96, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x60, 0x9C, 0x61, /* 0x54-0x57 */ + 0x00, 0x00, 0x9C, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9C, 0x53, 0x9C, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9C, 0x63, 0x8C, 0x60, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x46, 0xED, 0x9D, /* 0x64-0x67 */ + 0x00, 0x00, 0x8D, 0xCA, 0x95, 0x56, 0x92, 0xA4, /* 0x68-0x6B */ + 0x95, 0x6A, 0x9C, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8F, 0xB2, 0x89, 0x65, 0x00, 0x00, 0x9C, 0x65, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x66, /* 0x74-0x77 */ + 0x00, 0x00, 0x96, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x94, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x69, /* 0x7C-0x7F */ + + 0x89, 0x9D, 0x90, 0xAA, 0x9C, 0x68, 0x9C, 0x67, /* 0x80-0x83 */ + 0x8C, 0x61, 0x91, 0xD2, 0x00, 0x00, 0x9C, 0x6D, /* 0x84-0x87 */ + 0x9C, 0x6B, 0x00, 0x00, 0x9C, 0x6A, 0x97, 0xA5, /* 0x88-0x8B */ + 0x8C, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x99, 0x9C, 0x6C, 0x93, 0x6B, 0x8F, 0x5D, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBE, /* 0x94-0x97 */ + 0x9C, 0x70, 0x9C, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x6E, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9C, 0x71, 0x8C, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x9C, 0x72, 0x95, 0x9C, 0x8F, 0x7A, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x9C, 0x73, 0x94, 0xF7, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBF, /* 0xB0-0xB3 */ + 0x92, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xED, 0x9E, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x93, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9C, 0x74, 0x8B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x53, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x95, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8A, 0xF5, 0x94, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x75, 0x8E, 0x75, /* 0xD4-0xD7 */ + 0x96, 0x59, 0x96, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x89, 0x9E, 0x9C, 0x7A, 0xED, 0x9F, 0x00, 0x00, /* 0xDC-0xDF */ + 
0x92, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9C, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9C, 0xAB, 0x9C, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x94, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x9C, 0x78, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x76, /* 0xF8-0xFB */ + 0x00, 0x00, 0x8D, 0x9A, 0x00, 0x00, 0x9C, 0x7C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x83, 0x9C, 0x89, /* 0x0C-0x0F */ + 0x9C, 0x81, 0x00, 0x00, 0x93, 0x7B, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x9C, 0x86, 0x95, 0x7C, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9C, 0x80, 0x00, 0x00, 0x9C, 0x85, /* 0x18-0x1B */ + 0x97, 0xE5, 0x8E, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0xD3, 0x9C, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x8B, 0x7D, 0x9C, 0x88, 0x90, 0xAB, /* 0x24-0x27 */ + 0x89, 0x85, 0x9C, 0x82, 0x89, 0xF6, 0x9C, 0x87, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAF, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9C, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x8A, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9C, 0x8C, 0x9C, 0x96, 0x9C, 0x94, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x91, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x90, 0x97, 0xF6, /* 0x48-0x4B */ + 0x00, 0x00, 0x9C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0xB0, 0x00, 0x00, 0x8D, 0x50, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x8F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9C, 0x99, 0x9C, 0x8B, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xED, 0xA0, 0x00, 0x00, 0x9C, 0x8F, /* 0x5C-0x5F */ + 0x9C, 0x7E, 0x00, 0x00, 0x89, 0xF8, 0x9C, 0x93, /* 0x60-0x63 */ + 0x9C, 0x95, 0x92, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8D, 0xA6, 0x89, 0xB6, 0x9C, 0x8D, 0x9C, 0x98, /* 0x68-0x6B */ + 0x9C, 0x97, 0x8B, 0xB1, 0x00, 0x00, 0x91, 0xA7, /* 0x6C-0x6F */ + 0x8A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8C, 0x62, 0x00, 0x00, 0x9C, 0x8E, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9C, 0x9A, 0x00, 0x00, 0x9C, 0x9D, /* 0x80-0x83 */ + 0x9C, 0x9F, 0xED, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x8E, 0xBB, 0xED, 0xA2, 0x9C, 0xA5, /* 0x88-0x8B */ + 0x92, 0xEE, 0x9C, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xA3, 0x00, 0x00, /* 0x90-0x93 */ + 0x89, 0xF7, 0x00, 0x00, 0x9C, 0xA1, 0x9C, 0xA2, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9E, 0x9C, 0xA0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE5, /* 0x9C-0x9F */ + 0x97, 0x49, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB3, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x78, 0x9C, 0xA4, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x94, 0x59, 0x88, 0xAB, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xDF, 0x9C, 0x7B, /* 0xB0-0xB3 */ + 
0x9C, 0xAA, 0x9C, 0xAE, 0x96, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x93, 0x89, 0x9C, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8F, 0xEE, 0x9C, 0xAD, 0x93, 0xD5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x98, 0x66, 0x00, 0x00, 0x9C, 0xA9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9C, 0xAF, 0x00, 0x00, 0x8D, 0x9B, 0x00, 0x00, /* 0xD8-0xDB */ + 0x90, 0xC9, 0x00, 0x00, 0xED, 0xA3, 0x88, 0xD2, /* 0xDC-0xDF */ + 0x9C, 0xA8, 0x9C, 0xA6, 0x00, 0x00, 0x91, 0x79, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9C, /* 0xE4-0xE7 */ + 0x8E, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x91, 0xC4, 0x9C, 0xBB, 0xED, 0xA6, 0x91, 0x7A, /* 0xF0-0xF3 */ + 0x9C, 0xB6, 0x00, 0x00, 0x9C, 0xB3, 0x9C, 0xB4, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x8E, 0xE4, 0x9C, 0xB7, 0x9C, 0xBA, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_61[512] = { + 0x9C, 0xB5, 0x8F, 0x44, 0x00, 0x00, 0x9C, 0xB8, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xB2, 0x00, 0x00, /* 0x04-0x07 */ + 0x96, 0xFA, 0x96, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x9C, 0xBC, 0x9C, 0xBD, 0x88, 0xD3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xED, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x9C, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xF0, 0x88, 0xA4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB4, /* 0x1C-0x1F */ + 0xED, 0xA5, 0x9C, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC1, /* 0x24-0x27 */ + 0x9C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x9C, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xED, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9C, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xED, 0xA8, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x9C, 0xC4, 0x9C, 0xC7, 0x9C, 0xBF, 0x9C, 0xC3, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC8, 0x00, 0x00, /* 0x40-0x43 */ + 0x9C, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xBE, /* 0x44-0x47 */ + 0x8E, 0x9C, 0x00, 0x00, 0x9C, 0xC2, 0x91, 0xD4, /* 0x48-0x4B */ + 0x8D, 0x51, 0x9C, 0xB0, 0x90, 0x54, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xD6, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9C, 0xD5, 0x00, 0x00, 0x9C, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x9D, 0x8A, 0xB5, /* 0x60-0x63 */ + 0x00, 0x00, 0x9C, 0xD2, 0x00, 0x00, 0x8C, 0x64, /* 0x64-0x67 */ + 0x8A, 0x53, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xCF, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xB6, 0x9C, 0xD1, /* 0x6C-0x6F */ + 0x88, 0xD4, 0x9C, 0xD3, 0x00, 0x00, 0x9C, 0xCA, /* 0x70-0x73 */ + 0x9C, 0xD0, 0x9C, 0xD7, 0x8C, 0x63, 0x9C, 0xCB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x7C, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x97, 0x4A, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDA, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDE, 0x00, 0x00, /* 0x88-0x8B */ + 
0x00, 0x00, 0x00, 0x00, 0x91, 0x9E, 0x00, 0x00, /* 0x8C-0x8F */ + 0x97, 0xF7, 0x9C, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9C, 0xDC, 0x00, 0x00, 0x9C, 0xD9, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xAA, 0x9C, 0xD8, 0x9C, 0xDD, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x95, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB2, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8C, 0x65, 0x00, 0x00, 0x9C, 0xE0, /* 0xA8-0xAB */ + 0x9C, 0xDB, 0x00, 0x00, 0x9C, 0xE1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x9B, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xAF, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE7, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE8, 0x8D, 0xA7, /* 0xC4-0xC7 */ + 0x9C, 0xE6, 0x9C, 0xE4, 0x9C, 0xE3, 0x9C, 0xEA, /* 0xC8-0xCB */ + 0x9C, 0xE2, 0x9C, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x89, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xEE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xED, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x9C, 0xF1, 0x00, 0x00, 0x9C, 0xEF, 0x9C, 0xE5, /* 0xF4-0xF7 */ + 0x8C, 0x9C, 0x00, 0x00, 0x9C, 0xF0, 0x00, 0x00, /* 0xF8-0xFB */ + 0x9C, 0xF4, 0x9C, 0xF3, 0x9C, 0xF5, 0x9C, 0xF2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0x9C, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9C, 0xF7, 0x9C, 0xF8, 0x95, 0xE8, 0x00, 0x00, /* 0x08-0x0B */ + 0x9C, 0xFA, 0x9C, 0xF9, 0x8F, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x90, 0xAC, 0x89, 0xE4, 0x89, 0xFA, 0xED, 0xAB, /* 0x10-0x13 */ + 0x9C, 0xFB, 0x00, 0x00, 0x88, 0xBD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xCA, 0x9C, 0xFC, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0xC1, 0x9D, 0x40, 0x8C, 0x81, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9D, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xED, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x42, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x43, 0x8B, 0x59, /* 0x2C-0x2F */ + 0x9D, 0x44, 0x00, 0x00, 0x9D, 0x45, 0x9D, 0x46, /* 0x30-0x33 */ + 0x91, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x8C, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x96, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5B, /* 0x3C-0x3F */ + 0x8F, 0x8A, 0x9D, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xEE, /* 0x44-0x47 */ + 0xE7, 0xBB, 0x94, 0xE0, 0x00, 0x00, 0x8E, 0xE8, /* 0x48-0x4B */ + 0x00, 0x00, 0x8D, 0xCB, 0x9D, 0x48, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xC5, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x91, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4B, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x49, 0x00, 0x00, /* 0x5C-0x5F */ + 0x9D, 
0x4C, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4A, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x9D, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xAF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x88, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x7D, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x94, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x9D, 0x4E, 0x00, 0x00, 0x9D, 0x51, 0x8F, 0xB3, /* 0x7C-0x7F */ + + 0x8B, 0x5A, 0x00, 0x00, 0x9D, 0x4F, 0x9D, 0x56, /* 0x80-0x83 */ + 0x8F, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x9D, 0x50, 0x94, 0x63, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x97, 0x7D, 0x9D, 0x52, 0x9D, 0x53, /* 0x90-0x93 */ + 0x9D, 0x57, 0x93, 0x8A, 0x9D, 0x54, 0x8D, 0x52, /* 0x94-0x97 */ + 0x90, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x65, /* 0x98-0x9B */ + 0x94, 0xB2, 0x00, 0x00, 0x91, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xE2, /* 0xA8-0xAB */ + 0x9D, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x95, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x92, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x96, 0x95, 0x00, 0x00, 0x9D, 0x5A, /* 0xB8-0xBB */ + 0x89, 0x9F, 0x92, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x63, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x92, 0x53, 0x9D, 0x5D, 0x9D, 0x64, /* 0xC4-0xC7 */ + 0x9D, 0x5F, 0x9D, 0x66, 0x9D, 0x62, 0x00, 0x00, /* 0xC8-0xCB */ + 0x9D, 0x61, 0x94, 0x8F, 0x00, 0x00, 0x9D, 0x5B, /* 0xCC-0xCF */ + 0x89, 0xFB, 0x9D, 0x59, 0x8B, 0x91, 0x91, 0xF1, /* 0xD0-0xD3 */ + 0x9D, 0x55, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x58, /* 0xD4-0xD7 */ + 0x8D, 0x53, 0x90, 0xD9, 0x00, 0x00, 0x8F, 0xB5, /* 0xD8-0xDB */ + 0x9D, 0x60, 0x94, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8B, 0x92, 0x8A, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8A, 0x87, 0x90, 0x40, 0x9D, 0x68, 0x9D, 0x6D, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0x69, 0x00, 0x00, 0x8C, 0x9D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x9D, 0x6E, 0x8E, 0x41, 0x8D, 0x89, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x45, 0x9D, 0x5C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x00, 0x00, 0x8E, 0x9D, 0x9D, 0x6B, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x77, /* 0x04-0x07 */ + 0x9D, 0x6C, 0x88, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x9D, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x92, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x8B, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x6A, /* 0x24-0x27 */ + 0x88, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC1, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x55, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xF0, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x94, 0xD2, 0x9D, 0x70, 0x91, 0x7D, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x91, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8E, 0x4A, 0x9D, 0x71, 0x00, 0x00, 0x9D, 0x73, /* 0x4C-0x4F */ + 0x9D, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xDF, 0x00, 0x00, 0x92, 0xBB, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x91, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xF9, /* 0x64-0x67 */ + 0x8E, 0xCC, 0x9D, 0x80, 0x00, 0x00, 0x9D, 0x7E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x98, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x9E, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x78, 0x8F, 0xB7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xE6, 0x94, 0x50, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9D, 0x76, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7C, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x8E, 0xF6, 0x9D, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8F, 0xB6, 0x00, 0x00, 0x9D, 0x75, 0x9D, 0x7A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x72, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x74, 0x00, 0x00, /* 0x94-0x97 */ + 0x8C, 0x40, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x7C, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x7C, /* 0x9C-0x9F */ + 0x97, 0xA9, 0x8D, 0xCC, 0x92, 0x54, 0x9D, 0x79, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x90, 0xDA, 0x00, 0x00, 0x8D, 0x54, /* 0xA4-0xA7 */ + 0x90, 0x84, 0x89, 0x86, 0x91, 0x5B, 0x9D, 0x77, /* 0xA8-0xAB */ + 0x8B, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x66, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xCD, 0x9D, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7E, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x81, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9D, 0x83, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB5, /* 0xC0-0xC3 */ + 0x9D, 0x89, 0x00, 0x00, 0x9D, 0x84, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9D, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x60, /* 0xCC-0xCF */ + 0x92, 0xF1, 0x00, 0x00, 0x9D, 0x87, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x67, 0x8A, 0xB7, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x88, 0xAC, 0x00, 0x00, 0x9D, 0x85, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9D, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x89, 0x87, 0xED, 0xAD, 0x9D, 0x88, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x68, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8C, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x91, 0xB9, 0x00, 0x00, 0x9D, 0x93, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8D, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8A, 0x9D, 0x91, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8E, 0x00, 0x00, /* 0x24-0x27 */ + 0x9D, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x94, 0xC0, 0x93, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0x8B, 0x00, 0x00, 0x9D, 0x8F, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x67, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xDB, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x97, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x93, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xED, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x94, /* 0x64-0x67 */ + 0x00, 0x00, 0x96, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x95, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x96, 0x00, 0x00, /* 0x74-0x77 */ + 0x96, 0xCC, 0x00, 0x00, 0x90, 0xA0, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x82, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x9D, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x54, 0x9D, 0x9A, /* 0x90-0x93 */ + 0x00, 0x00, 0x9D, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x51, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xED, 0xAF, 0x93, 0xB3, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x93, 0x50, 0x9D, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x9D, 0x9C, 0x00, 0x00, 0x95, 0x8F, /* 0xA8-0xAB */ + 0x00, 0x00, 0x94, 0x64, 0x8E, 0x42, 0x00, 0x00, /* 0xAC-0xAF */ + 0x90, 0xEF, 0x00, 0x00, 0x96, 0x6F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x8A, 0x68, 0x00, 0x00, 0x9D, 0xA3, /* 0xB8-0xBB */ + 0x9D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x97, 0x69, 0x9D, 0xA5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9D, 0xA1, 0x00, 0x00, 0x9D, 0xA2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x91, 0x80, 0xED, 0xB0, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xA0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9D, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9D, 0xA4, 0x00, 0x00, 0x9D, 0x9F, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9D, 0xA9, 0x9D, 0xAA, 0x93, 0x46, 0x9D, 0xAC, /* 0xE0-0xE3 
*/ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x43, 0x9D, 0xA7, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8B, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xAD, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0xA6, 0x9D, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x9D, 0xB0, 0x00, 0x00, 0x9D, 0xAF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9D, 0xB4, 0x8F, 0xEF, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0x9D, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x9D, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x9D, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0xB6, 0x9D, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB9, /* 0x20-0x23 */ + 0x9D, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x98, 0x9D, 0xBA, /* 0x28-0x2B */ + 0x9D, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x78, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBE, 0x9D, 0xBD, /* 0x34-0x37 */ + 0x9D, 0xBF, 0x89, 0xFC, 0x00, 0x00, 0x8D, 0x55, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xFA, 0x90, 0xAD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8C, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x9D, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9D, 0xC4, 0xED, 0xB1, 0x95, 0x71, /* 0x4C-0x4F */ + 0x00, 0x00, 0x8B, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x9D, 0xC3, 0x9D, 0xC2, 0x94, 0x73, /* 0x54-0x57 */ + 0x9D, 0xC5, 0x8B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9D, 0xC7, 0x9D, 0xC6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB8, 0x8E, 0x55, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xD6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x8C, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x90, 0x94, 0x00, 0x00, 0x9D, 0xC8, 0x00, 0x00, /* 0x70-0x73 */ + 0x90, 0xAE, 0x93, 0x47, 0x00, 0x00, 0x95, 0x7E, /* 0x74-0x77 */ + 0x9D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xB6, /* 0x84-0x87 */ + 0x9B, 0x7C, 0x90, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x95, 0x6B, 0x00, 0x00, 0x8D, 0xD6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x94, 0xE3, 0x94, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x6C, /* 0x94-0x97 */ + 0x00, 0x00, 0x97, 0xBF, 0x00, 0x00, 0x9D, 0xCD, /* 0x98-0x9B */ + 0x8E, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCE, /* 0x9C-0x9F */ + 0x00, 0x00, 0x88, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8B, 0xD2, 0x90, 0xCB, 0x00, 0x00, 0x95, 0x80, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCF, /* 0xA8-0xAB */ + 0x8E, 0x61, 0x92, 0x66, 0x00, 0x00, 0x8E, 0x7A, /* 0xAC-0xAF */ + 0x90, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD0, /* 0xB4-0xB7 */ + 
0x00, 0x00, 0x95, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x89, 0x97, 0x8E, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9D, 0xD3, 0x00, 0x00, 0x9D, 0xD1, /* 0xC0-0xC3 */ + 0x9D, 0xD4, 0x97, 0xB7, 0x9D, 0xD2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF9, /* 0xC8-0xCB */ + 0x9D, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB0, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF8, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9D, 0xD8, 0x00, 0x00, 0x9D, 0xD7, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9D, 0xD9, 0x9D, 0xDA, 0x8A, 0xF9, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0xFA, 0x92, 0x55, 0x8B, 0x8C, /* 0xE4-0xE7 */ + 0x8E, 0x7C, 0x91, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8F, 0x7B, 0x88, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA0, 0x9D, 0xDF, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_66[512] = { + 0xED, 0xB2, 0x00, 0x00, 0x8D, 0x56, 0x9D, 0xDE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA9, 0x8F, 0xB8, /* 0x04-0x07 */ + 0x00, 0x00, 0xED, 0xB5, 0x9D, 0xDD, 0x00, 0x00, /* 0x08-0x0B */ + 0x8F, 0xB9, 0x00, 0x00, 0x96, 0xBE, 0x8D, 0xA8, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xD5, /* 0x10-0x13 */ + 0x90, 0xCC, 0xED, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0xE4, 0x00, 0x00, 0xED, 0xB7, 0x90, 0xAF, /* 0x1C-0x1F */ + 0x89, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xED, 0xB8, 0x8F, 0x74, 0x00, 0x00, 0x96, 0x86, /* 0x24-0x27 */ + 0x8D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8F, 0xBA, 0xED, 0xB6, 0x90, 0xA5, /* 0x2C-0x2F */ + 0x00, 0x00, 0xED, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0xE3, 0x9D, 0xE1, 0x9D, 0xE2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB4, /* 0x38-0x3B */ + 0x92, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x45, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9D, 0xE8, 0x8E, 0x9E, 0x8D, 0x57, /* 0x40-0x43 */ + 0x9D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x9D, 0xE7, 0x00, 0x00, 0x90, 0x57, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xE5, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4E, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBA, /* 0x54-0x57 */ + 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9D, 0xEA, 0x9D, 0xE9, 0x9D, 0xEE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xEF, 0x00, 0x00, /* 0x60-0x63 */ + 0x9D, 0xEB, 0xED, 0xB9, 0x8A, 0x41, 0x9D, 0xEC, /* 0x64-0x67 */ + 0x9D, 0xED, 0x94, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x81, 0x8C, 0x69, /* 0x6C-0x6F */ + 0x9D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x70-0x73 */ + 0x90, 0xB0, 0x00, 0x00, 0x8F, 0xBB, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x71, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8B, 0xC5, 0x00, 0x00, 0x9D, 0xF1, /* 0x80-0x83 */ + 0x9D, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC9, /* 0x84-0x87 */ + 0x9D, 0xF2, 0x9D, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */ + 
0x00, 0x00, 0x8F, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x67, 0x88, 0xC3, /* 0x94-0x97 */ + 0x9D, 0xF6, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x9D, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xED, 0xBF, 0x00, 0x00, 0x92, 0xA8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xEF, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x62, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xE9, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x96, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9E, 0x41, 0x9D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9D, 0xFC, 0x00, 0x00, 0x9D, 0xFB, 0xED, 0xC1, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9D, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9E, 0x40, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9D, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x42, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8F, 0x8C, 0x9E, 0x43, 0x00, 0x00, /* 0xD8-0xDB */ + 0x97, 0x6A, 0x94, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9E, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x46, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9E, 0x48, 0x00, 0x00, 0x8B, 0xC8, 0x89, 0x67, /* 0xF0-0xF3 */ + 0x8D, 0x58, 0x9E, 0x49, 0x00, 0x00, 0x9E, 0x4A, /* 0xF4-0xF7 */ + 0x8F, 0x91, 0x91, 0x82, 0xED, 0xC2, 0xED, 0x4A, /* 0xF8-0xFB */ + 0x99, 0xD6, 0x91, 0x5D, 0x91, 0x5C, 0x91, 0xD6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0x8D, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xF0, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8C, 0x8E, 0x97, 0x4C, 0x00, 0x00, 0x95, 0xFC, /* 0x08-0x0B */ + 0x00, 0x00, 0x95, 0x9E, 0xED, 0xC3, 0x9E, 0x4B, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8D, 0xF1, 0x92, 0xBD, 0x9E, 0x4C, 0x98, 0x4E, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5D, /* 0x18-0x1B */ + 0x00, 0x00, 0x92, 0xA9, 0x9E, 0x4D, 0x8A, 0xFA, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x4E, 0x9E, 0x4F, /* 0x24-0x27 */ + 0x96, 0xD8, 0x00, 0x00, 0x96, 0xA2, 0x96, 0x96, /* 0x28-0x2B */ + 0x96, 0x7B, 0x8E, 0x44, 0x9E, 0x51, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x96, 0x70, 0x00, 0x00, 0x9E, 0x53, 0x9E, 0x56, /* 0x34-0x37 */ + 0x9E, 0x55, 0x00, 0x00, 0x8A, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x8B, 0x80, 0x00, 0x00, 0x9E, 0x52, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9E, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x57, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x90, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x9B, 0x88, 0xC7, /* 0x4C-0x4F */ + 0x8D, 0xDE, 0x91, 0xBA, 0x00, 0x00, 0x8E, 0xDB, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF1, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x93, 0x6D, 0x00, 0x00, 0x9E, 0x58, 0x91, 0xA9, /* 0x5C-0x5F */ + 0x9E, 0x59, 0x8F, 0xF0, 0x96, 0xDB, 0x9E, 0x5B, /* 0x60-0x63 */ + 0x9E, 
0x5C, 0x97, 0x88, 0xED, 0xC5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x61, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x8D, 0x59, 0x00, 0x00, 0x94, 0x74, /* 0x6C-0x6F */ + 0x9E, 0x5E, 0x93, 0x8C, 0x9D, 0xDC, 0x9D, 0xE0, /* 0x70-0x73 */ + 0x00, 0x00, 0x8B, 0x6E, 0x00, 0x00, 0x94, 0x66, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x9E, 0x60, 0x00, 0x00, 0x8F, 0xBC, 0x94, 0xC2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x9E, 0x66, 0x00, 0x00, 0x94, 0xF8, /* 0x84-0x87 */ + 0x00, 0x00, 0x9E, 0x5D, 0x00, 0x00, 0x9E, 0x63, /* 0x88-0x8B */ + 0x9E, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x90, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x96, 0x8D, 0x00, 0x00, 0x97, 0xD1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x87, 0x00, 0x00, /* 0x98-0x9B */ + 0x89, 0xCA, 0x8E, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x98, 0x67, 0x9E, 0x65, 0x90, 0x95, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x64, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x9E, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCD, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x6B, /* 0xB0-0xB3 */ + 0x9E, 0x69, 0x00, 0x00, 0x89, 0xCB, 0x9E, 0x67, /* 0xB4-0xB7 */ + 0x9E, 0x6D, 0x9E, 0x73, 0x00, 0x00, 0xED, 0xC6, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0xC8, 0x91, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x95, 0xBF, 0x00, 0x00, 0x9E, 0x75, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x41, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x74, 0x94, 0x90, /* 0xCC-0xCF */ + 0x96, 0x5E, 0x8A, 0xB9, 0x00, 0x00, 0x90, 0xF5, /* 0xD0-0xD3 */ + 0x8F, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x92, 0xD1, 0x00, 0x00, 0x97, 0x4D, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9E, 0x70, 0x9E, 0x6F, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x71, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9E, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x76, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9E, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9E, 0x6A, 0x00, 0x00, 0x9E, 0x72, 0x9E, 0x68, /* 0xEC-0xEF */ + 0x00, 0x00, 0x92, 0x8C, 0x00, 0x00, 0x96, 0xF6, /* 0xF0-0xF3 */ + 0x8E, 0xC4, 0x8D, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x8F, 0x8A, 0x60, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0xED, 0xC9, 0x92, 0xCC, 0x93, 0xC8, /* 0x00-0x03 */ + 0x89, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF0, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB2, 0x8C, 0x49, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x78, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x8D, 0x5A, 0x8A, 0x9C, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x9E, 0x7A, 0x8A, 0x94, 0x9E, 0x81, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x7D, 0x00, 0x00, /* 0x30-0x33 */ + 0x90, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x8A, 
0x6A, 0x8D, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8A, 0x69, 0x8D, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9E, 0x7B, 0x8C, 0x85, 0x8C, 0x6A, 0x93, 0x8D, /* 0x40-0x43 */ + 0xED, 0xCA, 0x00, 0x00, 0x9E, 0x79, 0x00, 0x00, /* 0x44-0x47 */ + 0x88, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9E, 0x7C, 0x9E, 0x7E, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0xCB, 0x8C, 0x4B, 0xED, 0xC7, 0x8A, 0xBA, /* 0x50-0x53 */ + 0x8B, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x8D, 0xF7, 0x96, 0x91, 0x00, 0x00, 0x8E, 0x56, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x83, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x4F, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x9E, 0x8F, 0x00, 0x00, 0x89, 0xB1, 0x9E, 0x84, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x95, 0x9E, 0x85, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x97, 0xC0, 0x00, 0x00, 0x9E, 0x8C, /* 0x80-0x83 */ + 0x00, 0x00, 0x94, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9E, 0x94, 0x00, 0x00, 0x9E, 0x87, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xB2, /* 0x90-0x93 */ + 0x9E, 0x89, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x5B, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x8B, /* 0x98-0x9B */ + 0x00, 0x00, 0x9E, 0x8A, 0x00, 0x00, 0x9E, 0x86, /* 0x9C-0x9F */ + 0x9E, 0x91, 0x00, 0x00, 0x8F, 0xBD, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xEB, 0x8C, 0xE6, /* 0xA4-0xA7 */ + 0x97, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x9E, 0x88, 0x00, 0x00, 0x92, 0xF2, /* 0xAC-0xAF */ + 0x8A, 0x42, 0x8D, 0xAB, 0x00, 0x00, 0x9E, 0x80, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x9E, 0x90, 0x8A, 0x81, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x9E, 0x8E, 0x9E, 0x92, 0x00, 0x00, /* 0xB8-0xBB */ + 0x93, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x8A, 0xFC, 0x00, 0x00, 0x9E, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xED, 0x48, 0x96, 0xC7, 0x9E, 0x97, 0x8A, 0xFB, /* 0xC8-0xCB */ + 0x00, 0x00, 0x9E, 0x9E, 0x00, 0x00, 0xED, 0xCB, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x5F, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9E, 0x9F, 0x9E, 0xA1, 0x00, 0x00, 0x9E, 0xA5, /* 0xD4-0xD7 */ + 0x9E, 0x99, 0x00, 0x00, 0x92, 0x49, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x8F, /* 0xDC-0xDF */ + 0x9E, 0xA9, 0x9E, 0x9C, 0x00, 0x00, 0x9E, 0xA6, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xA0, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x58, 0x9E, 0xAA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9E, 0xA8, 0x8A, 0xBB, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_69[512] = { + 0x98, 0x6F, 0x9E, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x9E, 0xA4, 0x88, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9E, 0x98, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB8, /* 0x08-0x0B */ + 0x9E, 0x9D, 0x90, 0x41, 0x92, 0xC5, 0x9E, 0x93, /* 0x0C-0x0F */ + 0x00, 
0x00, 0x00, 0x00, 0x9E, 0xA3, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x90, 0x9A, 0x9E, 0xAD, 0x8A, 0x91, /* 0x18-0x1B */ + 0x8C, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9E, 0xAF, 0x9E, 0x9A, 0x9E, 0xAE, /* 0x20-0x23 */ + 0x00, 0x00, 0x9E, 0xA7, 0x9E, 0x9B, 0x00, 0x00, /* 0x24-0x27 */ + 0x9E, 0xAB, 0x00, 0x00, 0x9E, 0xAC, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x9E, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x93, 0xCC, 0x00, 0x00, 0x9E, 0xA2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x9E, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x9E, 0xBB, 0x00, 0x00, 0x92, 0xD6, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x6B, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x96, /* 0x50-0x53 */ + 0x9E, 0xB6, 0x91, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0xBC, 0x91, 0x5E, 0x00, 0x00, /* 0x58-0x5B */ + 0x9E, 0xB3, 0x9E, 0xC0, 0x9E, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x93, 0xED, 0x9E, 0xBE, 0x93, 0xE8, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xED, 0xCD, 0x00, 0x00, 0x9E, 0xC2, 0x9E, 0xB5, /* 0x68-0x6B */ + 0x00, 0x00, 0x8B, 0xC6, 0x9E, 0xB8, 0x8F, 0x7C, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x80, /* 0x70-0x73 */ + 0x9E, 0xBA, 0x8B, 0xC9, 0x00, 0x00, 0x9E, 0xB2, /* 0x74-0x77 */ + 0x9E, 0xB4, 0x9E, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x98, 0x4F, 0x8A, 0x79, 0x9E, 0xB7, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9E, 0xC1, 0x8A, 0x54, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7C, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x9E, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x98, 0x50, 0x9E, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x90, 0x59, /* 0x98-0x9B */ + 0x9E, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9E, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9E, 0xE1, 0x9E, 0xC3, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x9E, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCE, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC9, 0x9E, 0xC6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9E, 0xC7, 0x00, 0x00, 0x9E, 0xCF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA0, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCC, 0x8D, 0x5C, /* 0xC8-0xCB */ + 0x92, 0xC6, 0x91, 0x84, 0x9E, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9E, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x97, 0x6C, 0x96, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9E, 0xCD, 0x9E, 0xD7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDF, /* 0xE4-0xE7 
*/ + 0x9E, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDE, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9E, 0xDD, 0x00, 0x00, 0x92, 0xCE, /* 0xF8-0xFB */ + 0x00, 0x00, 0x91, 0x85, 0x00, 0x00, 0x9E, 0xDB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD9, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x9E, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE6, 0x94, 0xF3, /* 0x08-0x0B */ + 0x9E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE7, 0x9E, 0xEA, /* 0x10-0x13 */ + 0x9E, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x92, 0x94, /* 0x14-0x17 */ + 0x00, 0x00, 0x95, 0x57, 0x00, 0x00, 0x9E, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE2, 0x8F, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x96, 0xCD, 0x9E, 0xF6, 0x9E, 0xE9, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x8C, 0xA0, 0x89, 0xA1, 0x8A, 0x7E, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD1, 0x00, 0x00, /* 0x2C-0x2F */ + 0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x8F, 0xBF, 0x9E, 0xEE, 0x00, 0x00, /* 0x34-0x37 */ + 0x9E, 0xF5, 0x8E, 0xF7, 0x8A, 0x92, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x92, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x9E, 0xEB, 0x00, 0x00, 0xED, 0xD3, 0x9E, 0xF0, /* 0x44-0x47 */ + 0x9E, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x8B, 0x6B, 0x9E, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x40, /* 0x5C-0x5F */ + 0x00, 0x00, 0x93, 0xC9, 0x9E, 0xF1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xF3, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xED, 0xED, 0xD4, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9E, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, 0x8A, 0x80, /* 0x7C-0x7F */ + + 0x92, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x9E, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9E, 0xF8, 0x8C, 0xE7, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9E, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x40, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x9E, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9E, 0xF9, 0x00, 0x00, 0x9E, 0xFB, 0x9E, 0xFC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x4B, 0x00, 0x00, /* 0xA8-0xAB */ + 0x9F, 0x47, 0x00, 0x00, 0x9E, 0x8D, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x46, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9F, 0x45, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x42, /* 0xB8-0xBB */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9E, 0xE8, 0x9F, 0x44, 0x9F, 0x43, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9F, 0x49, 0x00, 0x00, 0x98, 0x45, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x4C, 0x8B, 0xF9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x48, 0x9F, 0x4A, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x94, 0xA5, 0x00, 0x00, 0x9F, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x51, 0x9F, 0x4E, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x97, 0x93, 0x9F, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x52, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x53, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x89, 0x54, 0x00, 0x00, 0x9F, 0x55, /* 0x1C-0x1F */ + 0x8C, 0x87, 0x8E, 0x9F, 0x00, 0x00, 0x8B, 0xD3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xA2, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x7E, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x57, /* 0x34-0x37 */ + 0x9F, 0x56, 0x9F, 0x59, 0x8B, 0x5C, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x8B, 0xD4, 0x8A, 0xBC, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5C, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5B, /* 0x44-0x47 */ + 0x00, 0x00, 0x9F, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x89, 0xCC, 0x00, 0x00, 0x92, 0x56, 0x00, 0x00, /* 0x4C-0x4F */ + 0x9F, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xBD, /* 0x50-0x53 */ + 0x9F, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9F, 0x5F, 0x00, 0x00, 0x9F, 0x61, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x62, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9F, 0x63, 0x8E, 0x7E, 0x90, 0xB3, /* 0x60-0x63 */ + 0x8D, 0x9F, 0x00, 0x00, 0x95, 0x90, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x95, 0xE0, 0x98, 0x63, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x95, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xCE, /* 0x70-0x73 */ + 0x97, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9F, 0x64, 0x9F, 0x65, 0x00, 0x00, 0x8E, 0x80, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x66, /* 0x7C-0x7F */ + + 0x9F, 0x67, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x69, /* 0x80-0x83 */ + 0x9F, 0x68, 0x00, 0x00, 0x96, 0x77, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x8F, 0x7D, 0x8E, 0xEA, 0x8E, 0x63, /* 0x88-0x8B */ + 0x00, 0x00, 0x9F, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 
0x00, 0x00, 0x9F, 0x6C, 0x90, 0x42, 0x00, 0x00, /* 0x94-0x97 */ + 0x9F, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x6D, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x9F, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x6F, 0x9F, 0x70, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x71, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9F, 0x73, 0x9F, 0x72, 0x9F, 0x74, /* 0xB0-0xB3 */ + 0x89, 0xA3, 0x92, 0x69, 0x00, 0x00, 0x9F, 0x75, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x45, 0x8A, 0x6B, /* 0xB8-0xBB */ + 0x9F, 0x76, 0x00, 0x00, 0x00, 0x00, 0x93, 0x61, /* 0xBC-0xBF */ + 0x9A, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8B, 0x42, 0x9F, 0x77, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x78, /* 0xC8-0xCB */ + 0x00, 0x00, 0x95, 0xEA, 0x96, 0x88, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xC5, 0x9F, 0x79, /* 0xD0-0xD3 */ + 0x94, 0xE4, 0x00, 0x00, 0xED, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x94, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD1, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7A, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7C, /* 0xE8-0xEB */ + 0x9F, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7E, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7D, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x81, /* 0x0C-0x0F */ + 0x00, 0x00, 0x96, 0xAF, 0x00, 0x00, 0x9F, 0x82, /* 0x10-0x13 */ + 0x9F, 0x83, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x43, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x84, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x86, /* 0x20-0x23 */ + 0x9F, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x90, 0x85, 0x00, 0x00, 0x00, 0x00, 0x95, 0x58, /* 0x34-0x37 */ + 0x89, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xC3, 0xED, 0xD9, /* 0x3C-0x3F */ + 0x92, 0xF3, 0x8F, 0x60, 0x8B, 0x81, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x9F, 0x88, 0x00, 0x00, 0x8A, 0xBE, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x98, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xDA, 0x93, 0xF0, 0x9F, 0x87, 0x8D, 0x5D, /* 0x5C-0x5F */ + 0x92, 0x72, 0x00, 0x00, 0x9F, 0x89, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x9F, 0x91, 0x00, 0x00, 0x9F, 0x8A, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xDC, /* 0x6C-0x6F */ + 0x91, 0xBF, 0x00, 0x00, 0x8B, 0x82, 0x9F, 0x92, /* 0x70-0x73 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x88, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8B, 0x44, 0x9F, 0x90, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9F, 0x8E, 0x9F, 0x8B, 0x97, 0x80, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xDB, 0x00, 0x00, /* 0x84-0x87 */ + 0x92, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x93, 0xD7, 0x9F, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9F, 0x94, 0x00, 0x00, 0x9F, 0x93, 0x8C, 0x42, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xAB, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x8D, 0xB9, 0x9F, 0x8D, 0x9F, 0x8F, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x96, 0x76, 0x91, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x97, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x9C, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9F, 0x9D, 0x00, 0x00, 0x89, 0xCD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x95, 0xA6, 0x96, 0xFB, 0x9F, 0x9F, 0x8E, 0xA1, /* 0xB8-0xBB */ + 0x8F, 0xC0, 0x9F, 0x98, 0x9F, 0x9E, 0x89, 0x88, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9F, 0x95, 0x9F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x90, 0xF2, 0x94, 0x91, 0x00, 0x00, /* 0xC8-0xCB */ + 0x94, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x97, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x96, 0x40, 0x00, 0x00, 0x9F, 0x99, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9F, 0xA2, 0xED, 0xDD, 0x9F, 0xA0, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9F, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x96, 0x41, 0x94, 0x67, 0x8B, 0x83, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x92, 0x8D, 0x00, 0x00, 0x9F, 0xA3, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xA1, /* 0xEC-0xEF */ + 0x91, 0xD7, 0x9F, 0x96, 0x00, 0x00, 0x89, 0x6A, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x6D, /* 0x08-0x0B */ + 0x9F, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAD, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF4, /* 0x14-0x17 */ + 0x00, 0x00, 0x9F, 0xAA, 0x00, 0x00, 0x97, 0x8C, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xB4, 0x9F, 0xA4, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x92, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0x6B, 0x8D, 0x5E, 0x9F, 0xA7, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x46, 0x9F, 0xAC, /* 0x30-0x33 */ + 0x00, 0x00, 0x9F, 0xAB, 0x9F, 0xA6, 0x00, 0x00, /* 0x34-0x37 */ + 0x9F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x88, /* 0x38-0x3B */ + 0x00, 0x00, 0x9F, 0xA8, 0x94, 0x68, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x97, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x8F, 0xF2, 0x90, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9F, 0xB4, 0x9F, 0xB2, 0x00, 0x00, /* 0x58-0x5B */ + 0x95, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAF, /* 0x60-0x63 */ + 0x9F, 0xB1, 0x00, 0x00, 0x89, 0x59, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x8D, 0x5F, 0x98, 0x51, 0x00, 0x00, /* 0x68-0x6B */ + 0x8A, 0x5C, 0x00, 0x00, 0x95, 0x82, 0xED, 0xE0, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x97, 0x81, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x43, /* 0x74-0x77 */ + 0x90, 0x5A, 0x9F, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x9F, 0xB8, 0x00, 0x00, 0xED, 0xDF, /* 0x84-0x87 */ + 0x8F, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x97, 0x4F, 0x00, 0x00, 0x9F, 0xB5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x9F, 0xB6, 0xED, 0xE1, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x97, 0xDC, 0x00, 0x00, 0x93, 0x93, /* 0x98-0x9B */ + 0x93, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xED, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x55, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x74, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x9F, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9F, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x97, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x97, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9F, 0xC6, 0x9F, 0xC0, 0x9F, 0xBD, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD2, /* 0xC8-0xCB */ + 0x9F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8F, 0x69, 0x9F, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x9F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0x91, 0x9F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xC2, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x92, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9F, 0xC9, 0x00, 0x00, 0x9F, 0xBE, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9F, 0xC4, 0x00, 0x00, 0x9F, 0xCB, 0x88, 0xFA, /* 0xE8-0xEB */ + 0x9F, 0xC1, 0x00, 0x00, 0x9F, 0xCC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x90, 0x5B, 0xED, 0xE5, 0x8F, 0x7E, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x95, 0xA3, 0x00, 0x00, 0x8D, 0xAC, /* 0xF4-0xF7 */ + 0xED, 0xE4, 0x9F, 0xB9, 0x9F, 0xC7, 0x93, 0x59, /* 0xF8-0xFB */ + 0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x90, 0xB4, 0x00, 0x00, 0x8A, 0x89, /* 0x04-0x07 */ + 0x8D, 0xCF, 0x8F, 0xC2, 0x9F, 0xBB, 0x8F, 0x61, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x6B, /* 0x10-0x13 */ + 0x00, 0x00, 0x9F, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9F, 0xD0, 0x8F, 0x8D, 0x8C, 0xB8, /* 0x18-0x1B */ + 0x00, 0x00, 0x9F, 0xDF, 0x00, 0x00, 0x9F, 0xD9, /* 0x1C-0x1F */ + 0x8B, 0x94, 0x93, 0x6E, 0x00, 0x00, 0x9F, 0xD4, /* 0x20-0x23 */ + 0x9F, 0xDD, 0x88, 0xAD, 0x89, 0x51, 0xED, 0xE9, /* 0x24-0x27 */ + 0x00, 
0x00, 0x89, 0xB7, 0x00, 0x00, 0x9F, 0xD6, /* 0x28-0x2B */ + 0x91, 0xAA, 0x9F, 0xCD, 0x9F, 0xCF, 0x8D, 0x60, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9F, 0xE0, 0xED, 0xE7, 0x9F, 0xDB, 0x00, 0x00, /* 0x38-0x3B */ + 0xED, 0xEA, 0x00, 0x00, 0x9F, 0xD3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xDA, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xA9, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9F, 0xD8, 0x9F, 0xDC, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCE, 0x00, 0x00, /* 0x54-0x57 */ + 0x8F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x92, 0x58, /* 0x58-0x5B */ + 0xED, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x4E, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xCE, 0x93, 0x92, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD1, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD7, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x70, 0x8E, 0xBC, /* 0x7C-0x7F */ + + 0x96, 0x9E, 0x00, 0x00, 0x9F, 0xE1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x94, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xED, /* 0x8C-0x8F */ + 0x8C, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x80, 0x00, 0x00, /* 0x94-0x97 */ + 0x9F, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x97, 0xAD, 0x8D, 0x61, 0x00, 0x00, 0x9F, 0xF0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x9F, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE8, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xEA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x6E, 0x9F, 0xE5, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x9F, 0xE7, 0x00, 0x00, 0xED, 0xEB, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xEF, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9F, 0xE9, 0x96, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9F, 0xE4, 0x00, 0x00, 0x8E, 0xA0, /* 0xC8-0xCB */ + 0x9F, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8A, 0x8A, 0x00, 0x00, 0x9F, 0xE6, /* 0xD0-0xD3 */ + 0x9F, 0xEB, 0x9F, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x91, 0xEA, 0x91, 0xD8, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9F, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xFA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x93, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x42, /* 0xF4-0xF7 */ + 0x9F, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xF6, 0x9F, 0xDE, /* 0xFC-0xFF 
*/ +}; + +static const unsigned char u2c_6F[512] = { + 0x00, 0x00, 0x8B, 0x99, 0x95, 0x59, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8D, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x52, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9F, 0xF2, 0x00, 0x00, 0xE0, 0x41, /* 0x10-0x13 */ + 0x89, 0x89, 0x91, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x94, 0x99, 0x00, 0x00, 0x8A, 0xBF, 0x97, 0xF8, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9F, /* 0x28-0x2B */ + 0x92, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9F, 0xF9, 0x9F, 0xFB, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x40, 0x9F, 0xF7, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9F, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8A, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8C, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE0, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x49, /* 0x58-0x5B */ + 0x90, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x83, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x8F, 0x81, 0x00, 0x00, 0xE0, 0x52, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE0, 0x4B, 0x92, 0xAA, 0xE0, 0x48, /* 0x6C-0x6F */ + 0x92, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE0, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE0, 0x45, 0x00, 0x00, 0xE0, 0x44, 0x00, 0x00, /* 0x78-0x7B */ + 0xE0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE0, 0x47, 0xE0, 0x46, 0xE0, 0x4C, 0x00, 0x00, /* 0x80-0x83 */ + 0x90, 0x9F, 0x00, 0x00, 0xE0, 0x43, 0x00, 0x00, /* 0x84-0x87 */ + 0xED, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x4F, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE0, 0x55, 0x00, 0x00, 0xE0, 0x54, /* 0xA0-0xA3 */ + 0xE0, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x59, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x93, 0x62, 0x00, 0x00, 0xE0, 0x53, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xED, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x8C, 0x83, 0x91, 0xF7, 0xE0, 0x51, 0x94, 0x5A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 
0xE0, 0x5D, 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE0, 0x5E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x61, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x5A, /* 0xDC-0xDF */ + 0x8D, 0x8A, 0x94, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9F, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x94, /* 0xE8-0xEB */ + 0xE0, 0x5C, 0x00, 0x00, 0xE0, 0x60, 0x91, 0xF3, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0x5F, 0x00, 0x00, 0xE0, 0x4A, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xED, 0xEE, 0xE8, 0x89, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x64, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x68, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x00, 0x00, 0xE0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xEF, 0x00, 0x00, 0xED, 0xF0, /* 0x04-0x07 */ + 0x00, 0x00, 0xE0, 0x62, 0x00, 0x00, 0xE0, 0x63, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x67, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE0, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x95, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE0, 0x6D, 0x00, 0x00, 0xE0, 0x6A, 0xE0, 0x69, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0x6C, 0x93, 0xD2, 0xE0, 0x6E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x95, 0x91, 0xEB, /* 0x24-0x27 */ + 0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x90, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0x6F, 0x00, 0x00, 0xE0, 0x71, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x70, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x9F, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE0, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x93, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x73, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xCE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x94, /* 0x6C-0x6F */ + 0x8A, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x8B, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x8E, 0xDC, 0x8D, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xED, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x98, 0x46, 0x90, 0x86, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x8A, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x75, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE0, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF3, /* 0xA8-0xAB */ + 0xE0, 0x78, 0x92, 0x59, 0xE0, 0x7B, 0xE0, 0x76, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7A, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE0, 0x79, 0x93, 0x5F, 0x88, 0xD7, 0xED, 0x46, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x97, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7D, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x47, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE0, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE0, 0x7E, 0x00, 0x00, 0xE0, 0x7C, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x96, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE0, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE0, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x89, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE0, 0x84, 0x95, 0xB0, 0x00, 0x00, /* 0x18-0x1B */ + 0xE0, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x96, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xC5, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x52, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8F, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xF7, 0xED, 0xF8, /* 0x44-0x47 */ + 0x00, 0x00, 0x97, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE0, 0x8A, 0x00, 0x00, 0x90, 0xF7, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE0, 0x86, 0xE0, 0x8B, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x89, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x89, 0x00, 0x00, /* 0x60-0x63 */ + 0x94, 0x81, 0xE0, 0x85, 0xE0, 0x88, 0x8F, 0xC6, /* 0x64-0x67 */ + 0x00, 0x00, 0x94, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0x8C, 0x00, 0x00, 0x8E, 0xCF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x90, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE0, 0x87, 0x00, 0x00, 0x8C, 0x46, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x8D, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x97, 0x6F, 0xE0, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6E, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE0, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x94, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x95, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xED, 0xFA, 0x00, 0x00, 0x94, 0x52, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x93, 0x95, 0xE0, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x99, 0x00, 0x00, /* 0xCC-0xCF */ + 0x97, 0xD3, 0x00, 0x00, 0xE0, 0x96, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0x98, 0x89, 0x8D, 0x00, 0x00, 0xE0, 0x93, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x7A, /* 0xDC-0xDF */ + 0xE0, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x91, 0x87, 0x8E, 0x57, 0xE0, 0x9C, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0x9B, 0x90, 0x43, 0x99, 0xD7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE0, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0x9F, 0x00, 0x00, 0xE0, 0x8E, /* 0xF8-0xFB */ + 0xE0, 0x9E, 0x00, 0x00, 0xED, 0xFB, 0xE0, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x9A, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE0, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE0, 0xA4, 0x00, 0x00, 0x92, 0xDC, 0x00, 0x00, /* 0x28-0x2B */ + 0xE0, 0xA6, 0xE0, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0xA7, 0x00, 0x00, 0xE0, 0xA8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x8E, 0xDD, 0x95, 0x83, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEA, 0xE0, 0xA9, /* 0x38-0x3B */ + 0xE0, 0xAA, 0x91, 0x75, 0x8E, 0xA2, 0xE0, 0xAB, /* 0x3C-0x3F */ + 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAD, 0x95, 0xD0, /* 0x44-0x47 */ + 0x94, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAE, /* 0x48-0x4B */ + 0x94, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x92, 0xAB, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE0, 0xAF, 0x89, 0xE5, 0x00, 0x00, 0x8B, 0x8D, /* 0x58-0x5B */ + 0x00, 0x00, 0x96, 0xC4, 0x00, 0x00, 0x96, 0xB4, /* 0x5C-0x5F */ + 0x00, 0x00, 0x89, 0xB2, 0x98, 0x53, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x71, /* 0x64-0x67 */ + 0x00, 0x00, 0x95, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB5, 0x00, 0x00, /* 0x70-0x73 */ + 0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x93, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8C, 0xA1, 0xE0, 0xB1, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8D, 0xD2, 0xE0, 0xB3, 0xE0, 0xB2, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0x5D, 0x00, 0x00, 0xE0, 0xB7, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8C, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x94, 0xC6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xED, 0xFC, 0xE0, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x40, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB6, 0xE0, 0xBB, /* 0xC0-0xC3 */ + 0xE0, 0xBD, 0x00, 0x00, 0xE0, 0xBC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBE, 0x00, 0x00, /* 0xCC-0xCF */ + 0x8C, 0xCF, 0x00, 0x00, 0xE0, 0xBF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE7, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x91, 0x5F, 0x00, 0x00, 0x8D, 0x9D, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE0, 0xC1, 0xE0, 0xC2, 0xE0, 0xC0, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x8E, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x93, 0xC6, 0x8B, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC4, /* 0xF4-0xF7 */ + 0x92, 0x4B, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x98, 0x54, 0x94, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0xE0, 0xC6, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD2, /* 0x18-0x1B */ + 0xE0, 0xC8, 0xE0, 0xCA, 0x00, 0x00, 0x97, 0xC2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xEE, 
0x41, 0xE0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xCD, 0x92, 0x96, 0x94, 0x4C, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA3, 0xE0, 0xCC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE0, 0xCB, 0x00, 0x00, 0x97, 0x50, 0x97, 0x51, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCF, 0x89, 0x8E, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x8D, 0x96, 0x8E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0xE0, 0xD1, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD3, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x62, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xE0, 0xD5, 0x00, 0x00, 0xE0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE0, 0xD6, 0x00, 0x00, 0x8A, 0x6C, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, 0xEE, 0x43, /* 0x74-0x77 */ + 0xE0, 0xD7, 0x00, 0x00, 0xE0, 0xDA, 0xE0, 0xD9, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x8C, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA6, /* 0x84-0x87 */ + 0x00, 0x00, 0x8B, 0xCA, 0x00, 0x00, 0x89, 0xA4, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8A, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xE6, 0xE0, 0xDC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEE, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE0, 0xDF, 0x00, 0x00, 0x89, 0xCF, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE0, 0xDB, 0xEE, 0x45, 0x8E, 0x58, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x92, 0xBF, 0xE0, 0xDD, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x48, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x46, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x47, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE0, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x94, 0xC7, 0xE0, 0xE1, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xEE, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB 
*/ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xBB, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x85, /* 0x00-0x03 */ + 0x00, 0x00, 0xE0, 0xE4, 0x97, 0x9D, 0xEE, 0x49, /* 0x04-0x07 */ + 0x00, 0x00, 0x97, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xF4, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE0, 0xE6, 0xEE, 0x4B, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xEE, 0x4D, 0xEE, 0x4C, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x4E, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x97, 0xD4, /* 0x30-0x33 */ + 0x8B, 0xD5, 0x94, 0xFA, 0x94, 0x69, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEB, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE0, 0xED, 0x8C, 0xE8, 0x89, 0x6C, /* 0x58-0x5B */ + 0xE0, 0xEF, 0x00, 0x00, 0x90, 0x90, 0xE0, 0xEC, /* 0x5C-0x5F */ + 0x97, 0xDA, 0x00, 0x00, 0xEE, 0x4F, 0xE0, 0xF2, /* 0x60-0x63 */ + 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE0, 0xF0, 0xE0, 0xF3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x6C-0x6F */ + 0xE0, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBA, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF4, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x9E, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xEE, 0x50, 0x00, 0x00, 0xE0, 0xF6, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF7, 0xEE, 0x51, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8A, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x8E, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF9, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFA, /* 0xCC-0xCF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x89, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE1, 0x40, 0x00, 0x00, 0x95, 0x5A, 0xE1, 0x41, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA2, 0xE1, 0x42, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x44, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE1, 0x46, 0xE1, 0x47, 0xE1, 0x45, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x72, 0xE1, 0x49, /* 0xF4-0xF7 */ + 0xE1, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_75[512] = { + 0x00, 0x00, 0xEE, 0x52, 0x00, 0x00, 0xE1, 0x4B, /* 0x00-0x03 */ + 0xE1, 0x4A, 0xE1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0x4D, 0xE1, 0x4F, 0xE1, 0x4E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8D, 0x99, 0x00, 0x00, 0xE1, 0x51, /* 0x10-0x13 */ + 0x00, 0x00, 0xE1, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x8A, 0xC3, 0x00, 0x00, 0x90, 0x72, 0x00, 0x00, /* 0x18-0x1B */ + 0x93, 0x5B, 0x00, 0x00, 0xE1, 0x52, 0x90, 0xB6, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x59, /* 0x20-0x23 */ + 0x00, 0x00, 0x89, 0x99, 0xE1, 0x53, 0x00, 0x00, /* 0x24-0x27 */ + 0x97, 0x70, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE1, /* 0x28-0x2B */ + 0xE1, 0x54, 0x00, 0x00, 0x00, 0x00, 0xED, 0x8C, /* 0x2C-0x2F */ + 0x93, 0x63, 0x97, 0x52, 0x8D, 0x62, 0x90, 0x5C, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6A, /* 0x34-0x37 */ + 0x99, 0xB2, 0x00, 0x00, 0x92, 0xAC, 0x89, 0xE6, /* 0x38-0x3B */ + 0xE1, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE1, 0x56, 0x00, 0x00, 0xE1, 0x5B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE1, 0x59, 0xE1, 0x58, 0x9D, 0xC0, /* 0x48-0x4B */ + 0x8A, 0x45, 0xE1, 0x57, 0x00, 0x00, 0x88, 0xD8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x94, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x94, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x97, 0xAF, 0xE1, 0x5C, 0xE1, 0x5A, /* 0x58-0x5B */ + 0x92, 0x7B, 0x90, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x94, 0xA9, 0x00, 0x00, 0x95, 0x4C, 0x00, 0x00, /* 0x60-0x63 */ + 0xE1, 0x5E, 0x97, 0xAA, 0x8C, 0x6C, 0xE1, 0x5F, /* 0x64-0x67 */ + 0x00, 0x00, 0xE1, 0x5D, 0x94, 0xD4, 0xE1, 0x60, /* 0x68-0x6B */ + 0x00, 0x00, 0xE1, 0x61, 0x00, 0x00, 0xEE, 0x53, /* 0x6C-0x6F */ + 0x88, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF4, /* 0x70-0x73 */ + 0xE1, 0x66, 0x00, 0x00, 0xE1, 0x63, 0x93, 0xEB, /* 0x74-0x77 */ + 0xE1, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x45, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x69, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x64, 0xE1, 0x65, /* 0x84-0x87 */ + 0x00, 0x00, 0xE1, 0x68, 0xE1, 0x67, 0x95, 0x44, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x61, 0x91, 0x60, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8B, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE1, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6B, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xE1, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6E, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE1, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x75, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE1, 0x76, 0x94, 0xE6, 0xE1, 0x70, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE1, 0x74, 0x90, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE1, 0x75, 0xE1, 0x73, 0x8E, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6F, 0xE1, 0x71, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x95, 0x61, 0x00, 0x00, 0x8F, 0xC7, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x78, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x79, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8E, 0xA4, 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0x97, 0xE1, 0x7A, 0x00, 0x00, 0x92, 0xC9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x7C, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x9F, 0xE1, 0x7B, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x91, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE1, 0x82, 0x00, 0x00, 0xE1, 0x84, 0xE1, 0x85, /* 0xF0-0xF3 */ + 0x92, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x83, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0x80, 0x00, 0x00, 0xE1, 0x7D, 0xE1, 0x7E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0x00, 0x00, 0xE1, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE1, 0x88, 0x00, 0x00, 0xE1, 0x86, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x89, /* 0x1C-0x1F */ + 0xE1, 0x8B, 0xE1, 0x8C, 0xE1, 0x8D, 0x00, 0x00, /* 0x20-0x23 */ + 0xE1, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x8A, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE1, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE1, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x91, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xC3, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x94, 0xE1, 0x92, /* 0x44-0x47 */ + 0xE1, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8A, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xFC, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xC8, 0x00, 0x00, /* 0x54-0x57 */ + 0xE1, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE1, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE1, 0x97, 0xE1, 0x98, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x9C, /* 0x64-0x67 */ + 0xE1, 0x99, 0xE1, 0x9A, 0xE1, 0x9B, 0x00, 0x00, /* 0x68-0x6B */ + 0xE1, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE1, 0x9E, 0x00, 0x00, 0xE1, 0x9F, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA0, 0x00, 0x00, /* 0x74-0x77 */ + 0xE1, 0xA1, 0x00, 0x00, 0x94, 0xAD, 0x93, 0x6F, /* 0x78-0x7B */ + 0xE1, 
0xA2, 0x94, 0x92, 0x95, 0x53, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE1, 0xA3, 0x00, 0x00, 0xEE, 0x54, 0xE1, 0xA4, /* 0x80-0x83 */ + 0x93, 0x49, 0x00, 0x00, 0x8A, 0x46, 0x8D, 0x63, /* 0x84-0x87 */ + 0xE1, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA6, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8E, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE1, 0xAA, 0xE1, 0xAB, 0xEE, 0x57, /* 0x98-0x9B */ + 0xEE, 0x55, 0x00, 0x00, 0xEE, 0x56, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x58, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xE7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE1, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x89, /* 0xB4-0xB7 */ + 0xE1, 0xAE, 0xE1, 0xAF, 0xE1, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4D, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB1, 0x94, 0x75, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x7E, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x89, 0x6D, 0x00, 0x00, 0x89, 0x76, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB3, 0x93, 0x90, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xB7, /* 0xD8-0xDB */ + 0x9F, 0x58, 0x00, 0x00, 0xE1, 0xB5, 0x96, 0xBF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE1, 0xB6, 0x00, 0x00, 0x8A, 0xC4, /* 0xE0-0xE3 */ + 0x94, 0xD5, 0xE1, 0xB7, 0x00, 0x00, 0xE1, 0xB8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xDA, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xD3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x92, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x91, 0x8A, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x82, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0x8F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE1, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0x04-0x07 */ + 0xE1, 0xBC, 0x94, 0xFB, 0x00, 0x00, 0x8A, 0xC5, /* 0x08-0x0B */ + 0x8C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC1, 0x90, 0x5E, /* 0x1C-0x1F */ + 0x96, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE1, 0xC0, 0xE1, 0xC2, 0xE1, 0xC3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC5, /* 0x34-0x37 */ + 0xE1, 0xC6, 0x00, 0x00, 0x92, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0x8A, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x92, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5A, 0xE1, 0xC7, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xCB, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x90, 0x87, 0x00, 0x00, 0x93, 0xC2, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xCC, 0x96, 0x72, 0x00, 0x00, /* 0x64-0x67 */ + 0xE1, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCA, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE1, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCE, 0xE1, 0xCD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD1, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE1, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x95, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8F, 0x75, 0x97, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD6, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE1, 0xDB, /* 0xB8-0xBB */ + 0xE1, 0xD9, 0xE1, 0xDA, 0x00, 0x00, 0xE1, 0xD8, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDF, 0x96, 0xB5, /* 0xD8-0xDB */ + 0xE1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEE, 0xE1, 0xE1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x92, 0x6D, 0x00, 0x00, 0x94, 0x8A, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x8B, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x92, 0x5A, 0xE1, 0xE2, 0x8B, 0xB8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xCE, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBB, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0x00, 0x00, /* 0x10-0x13 */ + 0x8C, 0xA4, 0x8D, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE1, 0xE7, 0xEE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 
0x00, 0x93, 0x75, 0x8D, 0xD4, 0x8B, 0x6D, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x43, 0x00, 0x00, /* 0x30-0x33 */ + 0x94, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x76, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x7B, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEE, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xB0, /* 0x68-0x6B */ + 0x8D, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xA1, 0x00, 0x00, /* 0x70-0x73 */ + 0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5F, 0x00, 0x00, /* 0x78-0x7B */ + 0xE1, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8C, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xEC, 0x92, 0xF4, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE1, 0xEF, 0x8A, 0x56, 0xE1, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x94, 0xE8, 0x00, 0x00, 0x89, 0x4F, /* 0x90-0x93 */ + 0x00, 0x00, 0x8D, 0xEA, 0x00, 0x00, 0x98, 0x71, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xEE, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF0, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC9, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x90, 0xD7, 0xE1, 0xF2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x6D, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE1, 0xF9, 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8E, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE1, 0xFA, 0xE1, 0xF5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xFB, 0xE1, 0xF6, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x94, 0xD6, 0xE1, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE1, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x41, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x40, /* 0xE4-0xE7 */ + 0x96, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB 
*/ + 0x00, 0x00, 0xE2, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0x8F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x44, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x62, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE2, 0x46, 0xE2, 0x45, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE8, 0xE2, 0x49, /* 0x28-0x2B */ + 0xE2, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEE, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA6, 0x00, 0x00, /* 0x38-0x3B */ + 0x97, 0xE7, 0x00, 0x00, 0x8E, 0xD0, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE2, 0x4A, 0x8C, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x5F, /* 0x44-0x47 */ + 0x8B, 0x46, 0x8E, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x97, 0x53, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x50, /* 0x50-0x53 */ + 0x00, 0x00, 0xE2, 0x4F, 0x91, 0x63, 0xE2, 0x4C, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x4E, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0x6A, 0x90, 0x5F, 0xE2, 0x4D, /* 0x5C-0x5F */ + 0xE2, 0x4B, 0x00, 0x00, 0x94, 0x49, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x8F, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x95, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x8D, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x98, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x51, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x52, /* 0x7C-0x7F */ + + 0xE2, 0x68, 0x8B, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x98, 0x5C, 0x91, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x89, 0xD0, 0x92, 0xF5, 0x95, 0x9F, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xEE, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x66, /* 0x98-0x9B */ + 0x00, 0x00, 0xE2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9A, 0xE2, 0x55, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x57, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x58, 0x00, 0x00, /* 0xAC-0xAF */ + 0x94, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x59, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0x5A, 0xE2, 0x5B, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x8B, 0xD7, 0x89, 0xD1, 0x93, 0xC3, /* 0xBC-0xBF */ + 0x8F, 0x47, 0x8E, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x8F, 0x48, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 
0x00, 0x00, 0x89, 0xC8, 0x95, 0x62, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE2, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x94, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x64, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE2, 0x60, 0x00, 0x00, 0xE2, 0x61, /* 0xE0-0xE3 */ + 0x94, 0x89, 0x00, 0x00, 0x90, 0x60, 0xE2, 0x5E, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x92, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE2, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8F, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDA, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7A[512] = { + 0x8B, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0x62, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF6, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0x63, 0x90, 0xC5, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x96, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x42, /* 0x14-0x17 */ + 0xE2, 0x64, 0xE2, 0x65, 0x92, 0x74, 0x00, 0x00, /* 0x18-0x1B */ + 0x97, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x67, /* 0x1C-0x1F */ + 0xE2, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0x69, 0x88, 0xEE, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6C, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6A, /* 0x38-0x3B */ + 0x89, 0xD2, 0x8C, 0x6D, 0xE2, 0x6B, 0x8D, 0x65, /* 0x3C-0x3F */ + 0x8D, 0x92, 0x00, 0x00, 0x95, 0xE4, 0xE2, 0x6D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x73, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE2, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x90, 0xCF, 0x89, 0x6E, 0x89, 0xB8, /* 0x4C-0x4F */ + 0x88, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6E, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0x70, 0xE2, 0x71, 0x8F, 0xF5, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE2, 0x72, 0x00, 0x00, 0x8A, 0x6E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x8C, 0x8A, 0x00, 0x00, 0x8B, 0x86, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE2, 0x75, 0x8B, 0xF3, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE2, 0x76, 0x00, 0x00, 0x90, 0xFA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x93, 0xCB, 0x00, 0x00, 0x90, 0xDE, /* 0x80-0x83 */ + 0x8D, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE2, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x82, 0x91, 0x8B, /* 0x90-0x93 */ + 0x00, 0x00, 0xE2, 0x79, 0xE2, 0x7B, 0xE2, 0x78, /* 0x94-0x97 */ + 0xE2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x41, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 
0x00, 0x00, 0xE2, 0x7C, 0x8C, 0x45, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x87, 0x97, 0x71, /* 0xAC-0xAF */ + 0xE2, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x80, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x83, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x96, /* 0xC0-0xC3 */ + 0xE2, 0x82, 0xE2, 0x81, 0x00, 0x00, 0xE2, 0x85, /* 0xC4-0xC7 */ + 0xE2, 0x7D, 0x00, 0x00, 0xE2, 0x86, 0x97, 0xA7, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE2, 0x87, 0x00, 0x00, 0xE2, 0x88, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEE, 0x67, 0x9A, 0xF2, 0xE2, 0x8A, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE2, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE2, 0x8B, 0xE2, 0x8C, 0x00, 0x00, /* 0xD8-0xDB */ + 0x97, 0xB3, 0xE2, 0x8D, 0x00, 0x00, 0xE8, 0xED, /* 0xDC-0xDF */ + 0x8F, 0xCD, 0xE2, 0x8E, 0xE2, 0x8F, 0x8F, 0x76, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0xB6, 0xE2, 0x90, 0xEE, 0x68, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x47, 0xEE, 0x6A, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE2, 0x91, 0x00, 0x00, 0x92, 0x5B, /* 0xEC-0xEF */ + 0xE2, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA3, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x99, 0x5E, 0x92, 0x7C, 0x8E, 0xB1, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x93, 0x00, 0x00, /* 0x00-0x03 */ + 0xE2, 0xA0, 0x00, 0x00, 0xE2, 0x96, 0x00, 0x00, /* 0x04-0x07 */ + 0x8B, 0x88, 0x00, 0x00, 0xE2, 0x95, 0xE2, 0xA2, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x94, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8F, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE2, 0x98, 0xE2, 0x99, 0x00, 0x00, 0x93, 0x4A, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x9A, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8A, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x90, 0x79, 0x95, 0x84, 0x00, 0x00, /* 0x24-0x27 */ + 0xE2, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x91, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x97, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0x9B, 0xE2, 0x9D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE2, 0xA4, 0x95, 0x4D, 0x00, 0x00, /* 0x44-0x47 */ + 0x94, 0xA4, 0x93, 0x99, 0x00, 0x00, 0x8B, 0xD8, /* 0x48-0x4B */ + 0xE2, 0xA3, 0xE2, 0xA1, 0x00, 0x00, 0x94, 0xB3, /* 0x4C-0x4F */ + 0xE2, 0x9E, 0x92, 0x7D, 0x93, 0x9B, 0x00, 0x00, /* 0x50-0x53 */ + 0x93, 0x9A, 0x00, 0x00, 0x8D, 0xF4, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE2, 0xA6, 0x00, 0x00, 0xE2, 0xA8, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xAB, 0x00, 0x00, 0xE2, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0xA9, 0xE2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE2, 0xA7, 0xE2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x9F, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xCD, 0x89, 0xD3, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, 0xE2, 0xB5, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, 0x00, 0x00, /* 0x90-0x93 */ + 0x94, 0x93, 0x96, 0xA5, 0x00, 0x00, 0x8E, 0x5A, /* 0x94-0x97 */ + 0xE2, 0xAE, 0xE2, 0xB7, 0xE2, 0xB2, 0x00, 0x00, /* 0x98-0x9B */ + 0xE2, 0xB1, 0xE2, 0xAD, 0xEE, 0x6B, 0xE2, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x8A, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x90, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x94, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x94, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x90, 0xDF, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x94, 0xCD, 0x00, 0x00, 0xE2, 0xBD, 0x95, 0xD1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x92, 0x7A, 0x00, 0x00, 0xE2, 0xB8, /* 0xC8-0xCB */ + 0xE2, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBB, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8E, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x93, 0xC4, 0xE2, 0xC3, 0xE2, 0xC2, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE2, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x98, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCC, 0xE2, 0xC9, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_7C[512] = { + 0xE2, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE2, 0xC0, 0x99, 0xD3, 0xE2, 0xC7, /* 0x10-0x13 */ + 0xE2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0x1C-0x1F */ + 0x00, 0x00, 0x8A, 0xC8, 0x00, 0x00, 0xE2, 0xCD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0xE2, 0xD2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, /* 0x34-0x37 */ + 0x94, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE2, 0xD3, 0x97, 0xFA, 0x95, 0xEB, /* 0x3C-0x3F */ + 0xE2, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD5, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE2, 0xD4, 0x90, 0xD0, 0x00, 0x00, 0xE2, 0xD7, /* 0x4C-0x4F */ + 0xE2, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE2, 0xD6, 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, /* 0x54-0x57 */ + 0xE2, 
0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xDB, /* 0x5C-0x5F */ + 0xE2, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE2, 0xDC, 0xE2, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC4, /* 0x70-0x73 */ + 0x00, 0x00, 0xE2, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xE0, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8B, 0xCC, 0x8C, 0x48, 0xE2, 0xE1, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x95, 0xB2, 0x00, 0x00, 0x90, 0x88, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xE2, 0x00, 0x00, 0x97, 0xB1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x94, 0x94, 0x00, 0x00, 0x91, 0x65, /* 0x94-0x97 */ + 0x94, 0x53, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6C, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xBE, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE2, 0xE7, 0xE2, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE2, 0xE3, 0x8A, 0x9F, 0x00, 0x00, 0x8F, 0xCF, /* 0xA4-0xA7 */ + 0xE2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE6, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE2, 0xE4, 0xE2, 0xEC, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE2, 0xEB, 0xE2, 0xEA, 0xE2, 0xE9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE2, 0xEE, 0x90, 0xB8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE2, 0xEF, 0x00, 0x00, 0xE2, 0xF1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD0, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x57, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x9C, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE2, 0xF4, 0x00, 0x00, 0x95, 0xB3, 0x91, 0x8C, /* 0xDC-0xDF */ + 0x8D, 0x66, 0x00, 0x00, 0xE2, 0xF5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xC6, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF7, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0xF9, 0x00, 0x00, 0xE2, 0xFA, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8E, 0x85, 0x00, 0x00, 0xE2, 0xFB, 0x8C, 0x6E, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8A, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0x8B, 0x49, 0x00, 0x00, 0xE3, 0x40, 0x00, 0x00, /* 0x00-0x03 */ + 0x96, 0xF1, 0x8D, 0x67, 0xE2, 0xFC, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x43, 0x96, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0x94, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x95, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8F, 0x83, 0xE3, 0x42, 0x00, 0x00, 0x8E, 0xD1, /* 0x14-0x17 */ + 0x8D, 0x68, 0x8E, 0x86, 0x8B, 0x89, 0x95, 0xB4, /* 0x18-0x1B */ + 0xE3, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0x66, 0x96, 0x61, 0x8D, 0xF5, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x87, /* 0x28-0x2B */ + 0x92, 
0xDB, 0x00, 0x00, 0xE3, 0x46, 0x97, 0xDD, /* 0x2C-0x2F */ + 0x8D, 0xD7, 0x00, 0x00, 0xE3, 0x47, 0x90, 0x61, /* 0x30-0x33 */ + 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8F, 0xD0, 0x8D, 0xAE, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x48, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x49, 0x8C, 0xBC, /* 0x40-0x43 */ + 0x91, 0x67, 0xE3, 0x44, 0xE3, 0x4A, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x45, /* 0x48-0x4B */ + 0x8C, 0x6F, 0x00, 0x00, 0xE3, 0x4D, 0xE3, 0x51, /* 0x4C-0x4F */ + 0x8C, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x4C, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x55, /* 0x58-0x5B */ + 0xEE, 0x6E, 0x00, 0x00, 0x8D, 0x69, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x97, 0x8D, 0x88, 0xBA, 0xE3, 0x52, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8B, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x50, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x93, 0x9D, 0xE3, 0x4E, 0xE3, 0x4B, /* 0x70-0x73 */ + 0x00, 0x00, 0x8A, 0x47, 0x90, 0xE2, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x8C, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE3, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE3, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x56, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x53, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x8C, 0x70, 0x91, 0xB1, 0xE3, 0x58, /* 0x98-0x9B */ + 0x91, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x65, /* 0x9C-0x9F */ + 0xEE, 0x70, 0x00, 0x00, 0xE3, 0x61, 0xE3, 0x5B, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5F, /* 0xA8-0xAB */ + 0x8E, 0xF8, 0x88, 0xDB, 0xE3, 0x5A, 0xE3, 0x62, /* 0xAC-0xAF */ + 0xE3, 0x66, 0x8D, 0x6A, 0x96, 0xD4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xD4, 0xE3, 0x5C, 0x00, 0x00, 0xEE, 0x6F, /* 0xB4-0xB7 */ + 0xE3, 0x64, 0x00, 0x00, 0xE3, 0x59, 0x92, 0x5D, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE3, 0x5E, 0x88, 0xBB, 0x96, 0xC8, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xD9, 0x94, 0xEA, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x8D, /* 0xCC-0xCF */ + 0x00, 0x00, 0x97, 0xCE, 0x8F, 0x8F, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE3, 0x8E, 0xEE, 0x71, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE3, 0x67, 0x00, 0x00, 0x90, 0xFC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE3, 0x63, 0xE3, 0x68, 0xE3, 0x6A, 0x00, 0x00, /* 0xDC-0xDF */ + 0x92, 0xF7, 0xE3, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE3, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x95, 0xD2, 0x8A, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x96, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6C, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x97, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6B, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x89, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x93, 
0xEA, 0xE3, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x6F, 0xE3, 0x76, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x9B, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xC8, 0xE3, 0x74, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE3, 0x71, 0xE3, 0x77, 0xE3, 0x70, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x63, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x44, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6B, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE3, 0x73, 0xE3, 0x80, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE3, 0x7B, 0x00, 0x00, 0xE3, 0x7E, /* 0x34-0x37 */ + 0x00, 0x00, 0xE3, 0x7C, 0xE3, 0x81, 0xE3, 0x7A, /* 0x38-0x3B */ + 0x00, 0x00, 0xE3, 0x60, 0x90, 0xD1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x94, 0xC9, 0x00, 0x00, 0xE3, 0x7D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x78, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x40, 0x8C, 0x71, /* 0x48-0x4B */ + 0x00, 0x00, 0x8F, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x72, 0x00, 0x00, /* 0x50-0x53 */ + 0x90, 0x44, 0x91, 0x55, 0xE3, 0x84, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE3, 0x86, 0xE3, 0x87, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE3, 0x83, 0xE3, 0x85, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x79, 0xE3, 0x82, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0x8A, 0xE3, 0x89, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x96, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8C, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE3, 0x88, 0x00, 0x00, 0xE3, 0x8C, /* 0x78-0x7B */ + 0xE3, 0x8B, 0xE3, 0x8F, 0x00, 0x00, 0xE3, 0x91, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5B, 0xE3, 0x8D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE3, 0x92, 0xE3, 0x93, 0xED, 0x40, 0x00, 0x00, /* 0x88-0x8B */ + 0xE3, 0x94, 0x00, 0x00, 0xE3, 0x9A, 0x93, 0x5A, /* 0x8C-0x8F */ + 0xE3, 0x96, 0x00, 0x00, 0xE3, 0x95, 0xE3, 0x97, /* 0x90-0x93 */ + 0xE3, 0x98, 0x00, 0x00, 0xE3, 0x99, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x9B, /* 0x98-0x9B */ + 0xE3, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xCA, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 
0x9D, 0x00, 0x00, 0xE3, 0x9E, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE3, 0x9F, 0x00, 0x00, 0xEE, 0x73, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE3, 0xA0, 0xE3, 0xA1, 0xE3, 0xA2, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE3, 0xA3, 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE3, 0xA6, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, /* 0x5C-0x5F */ + 0xE3, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x64-0x67 */ + 0xE3, 0xAA, 0xE3, 0xAB, 0x8D, 0xDF, 0x8C, 0x72, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x75, 0x00, 0x00, /* 0x6C-0x6F */ + 0x94, 0xB1, 0x00, 0x00, 0x8F, 0x90, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x94, 0x6C, 0x00, 0x00, 0x94, 0xEB, /* 0x74-0x77 */ + 0xE3, 0xAD, 0x9C, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, 0xE3, 0xB0, /* 0x80-0x83 */ + 0x00, 0x00, 0x97, 0x85, 0xE3, 0xAF, 0xE3, 0xB2, /* 0x84-0x87 */ + 0xE3, 0xB1, 0x00, 0x00, 0x97, 0x72, 0x00, 0x00, /* 0x88-0x8B */ + 0xE3, 0xB3, 0x00, 0x00, 0x94, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xB7, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xE3, 0xB6, 0xE3, 0xB5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEE, 0x74, 0x00, 0x00, 0xE3, 0xB8, /* 0xA0-0xA3 */ + 0x8C, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x91, 0x41, 0x8B, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xE3, 0xB9, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE3, 0xBE, 0xE3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x89, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x89, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xC0, 0xE3, 0xC1, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xC8-0xCB */ + 0x97, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x4B, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE3, 0xC4, 0xE3, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x90, 0x89, 0xE3, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x8A, 0xE3, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8A, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x96, 0x7C, /* 0xF8-0xFB */ + 0x97, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0x97, 0x73, 0x98, 0x56, 0x00, 0x00, 0x8D, 0x6C, /* 0x00-0x03 */ + 0xE3, 0xCC, 0x8E, 0xD2, 0xE3, 0xCB, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCD, /* 0x08-0x0B */ + 0x8E, 
0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x91, 0xCF, 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x8D, 0x6B, 0x00, 0x00, 0x96, 0xD5, /* 0x14-0x17 */ + 0xE3, 0xCF, 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD5, /* 0x38-0x3B */ + 0x00, 0x00, 0x92, 0x5E, 0x00, 0x00, 0xE3, 0xD4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD7, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB9, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xD9, 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xB7, 0xE3, 0xDB, /* 0x5C-0x5F */ + 0x00, 0x00, 0x91, 0x8F, 0xE3, 0xDC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xFC, /* 0x6C-0x6F */ + 0xE3, 0xE0, 0x00, 0x00, 0xE3, 0xDF, 0xE3, 0xDE, /* 0x70-0x73 */ + 0x92, 0xAE, 0x00, 0x00, 0xE3, 0xE1, 0x90, 0x45, /* 0x74-0x77 */ + 0x00, 0x00, 0xE3, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE3, 0xE3, 0x98, 0x57, 0xE3, 0xE4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE3, 0xE5, 0xE3, 0xE7, 0xE3, 0xE6, 0x94, 0xA3, /* 0x84-0x87 */ + 0x00, 0x00, 0x93, 0xF7, 0x00, 0x00, 0x98, 0x5D, /* 0x88-0x8B */ + 0x94, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE9, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD1, 0x00, 0x00, /* 0x94-0x97 */ + 0x95, 0x49, 0x00, 0x00, 0xE3, 0xEA, 0xE3, 0xE8, /* 0x98-0x9B */ + 0x00, 0x00, 0x8A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x8C, 0xD2, 0x8E, 0x88, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x94, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8C, 0xA8, 0x96, 0x62, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE3, 0xED, 0xE3, 0xEB, 0x00, 0x00, 0x8D, 0x6D, /* 0xAC-0xAF */ + 0x00, 0x00, 0x8D, 0x6E, 0x88, 0xE7, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x8D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x78, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDD, /* 0xC0-0xC3 */ + 0xE3, 0xF2, 0x00, 0x00, 0x92, 0x5F, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x94, 0x77, 0x00, 0x00, 0x91, 0xD9, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE3, 0xF0, 0xE3, 0xF3, 0xE3, 0xEE, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE3, 0xF1, 0x96, 0x45, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8C, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 
*/ + 0x88, 0xFB, 0xE3, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x93, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8B, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE4, 0x45, 0x94, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x89, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x8B, 0xBA, 0x90, 0xC6, 0x98, 0x65, /* 0x04-0x07 */ + 0x96, 0xAC, 0xE3, 0xF5, 0x90, 0xD2, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x72, 0xE3, 0xF8, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFA, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE3, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x92, 0x45, 0x00, 0x00, 0x94, 0x5D, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x92, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x41, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFC, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x74, 0x00, 0x00, /* 0x4C-0x4F */ + 0x95, 0x85, 0xE4, 0x44, 0x00, 0x00, 0xE4, 0x43, /* 0x50-0x53 */ + 0x8D, 0x6F, 0x98, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE4, 0x48, 0xE4, 0x49, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x47, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8D, 0x98, 0xE4, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE4, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x92, 0xB0, 0x95, 0xA0, 0x91, 0x42, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDA, /* 0x7C-0x7F */ + + 0xE4, 0x4E, 0x00, 0x00, 0xE4, 0x4F, 0xE4, 0x4B, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE4, 0x4C, 0x00, 0x00, 0xE4, 0x4D, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x70, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x55, /* 0x90-0x93 */ + 0x00, 0x00, 0xE4, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x86, 0x00, 0x00, /* 0x98-0x9B */ + 0x96, 0x8C, 0x95, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x53, /* 0xA0-0xA3 */ + 0xE4, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x63, 0xE4, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE4, 0x57, 0x00, 0x00, 0x00, 0x00, 0x91, 0x56, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 
0xE4, 0x5A, 0x00, 0x00, 0xE4, 0x5E, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0x5B, 0xE4, 0x59, 0x94, 0x5E, /* 0xBC-0xBF */ + 0xE4, 0x5C, 0x00, 0x00, 0xE4, 0x5D, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE4, 0x64, 0xE4, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE4, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE4, 0x61, 0x00, 0x00, 0x91, 0x9F, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE4, 0x63, 0xE4, 0x62, 0xE4, 0x65, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x66, /* 0xDC-0xDF */ + 0xE4, 0x67, 0x00, 0x00, 0x00, 0x00, 0x90, 0x62, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x89, 0xE7, 0x00, 0x00, 0xE4, 0x68, /* 0xE4-0xE7 */ + 0x97, 0xD5, 0x00, 0x00, 0x8E, 0xA9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x8F, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8A, /* 0xF0-0xF3 */ + 0x92, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x69, 0xE4, 0x6A, /* 0xF8-0xFB */ + 0x89, 0x50, 0x00, 0x00, 0xE4, 0x6B, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0x00, 0x00, 0xE4, 0x6C, 0xE4, 0x6D, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE4, 0x6E, 0x00, 0x00, 0xE4, 0x6F, /* 0x04-0x07 */ + 0x8B, 0xBB, 0x9D, 0xA8, 0xE4, 0x70, 0x00, 0x00, /* 0x08-0x0B */ + 0x90, 0xE3, 0xE4, 0x71, 0x8E, 0xC9, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0x72, 0x00, 0x00, 0x98, 0xAE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0x95, 0xDC, /* 0x14-0x17 */ + 0x8A, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x91, 0x43, /* 0x18-0x1B */ + 0x8F, 0x77, 0x00, 0x00, 0x95, 0x91, 0x8F, 0x4D, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE4, 0x74, 0x8D, 0x71, 0xE4, 0x75, /* 0x28-0x2B */ + 0x94, 0xCA, 0x00, 0x00, 0xE4, 0x84, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x77, /* 0x30-0x33 */ + 0x00, 0x00, 0x91, 0xC7, 0x94, 0x95, 0x8C, 0xBD, /* 0x34-0x37 */ + 0xE4, 0x76, 0x91, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF8, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE4, 0x7A, 0xE4, 0x79, 0xE4, 0x7C, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE4, 0x7B, 0x00, 0x00, 0xE4, 0x7D, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x80, 0x00, 0x00, /* 0x60-0x63 */ + 0xE4, 0x7E, 0x00, 0x00, 0x8A, 0xCD, 0x00, 0x00, /* 0x64-0x67 */ + 0xE4, 0x81, 0x00, 0x00, 0xE4, 0x82, 0xE4, 0x83, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xAF, 0x97, 0xC7, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE4, 0x85, 0x90, 0x46, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x90, 0xE4, 0x86, /* 0x74-0x77 */ + 0xE4, 0x87, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x88, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF0, /* 0x88-0x8B */ + 
0x00, 0x00, 0xE4, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8A, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x95, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x8E, 0xC5, 0x00, 0x00, 0xE4, 0x8C, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x8A, 0x48, 0x88, 0xB0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8B, /* 0xA8-0xAB */ + 0xE4, 0x8E, 0x94, 0x6D, 0x00, 0x00, 0x90, 0x63, /* 0xAC-0xAF */ + 0x00, 0x00, 0x89, 0xD4, 0x00, 0x00, 0x96, 0x46, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8C, 0x7C, 0x8B, 0xDA, 0x00, 0x00, 0xE4, 0x8D, /* 0xB8-0xBB */ + 0x00, 0x00, 0x89, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8A, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x89, 0x91, 0xE4, 0x92, 0x97, 0xE8, /* 0xD0-0xD3 */ + 0x91, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x63, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0x9E, 0x00, 0x00, 0x89, 0xD5, /* 0xD8-0xDB */ + 0xE4, 0x9C, 0x00, 0x00, 0xE4, 0x9A, 0xE4, 0x91, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE4, 0x8F, 0x00, 0x00, 0xE4, 0x90, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x8E, 0xE1, 0x8B, 0xEA, 0x92, 0x97, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x89, 0x70, 0x00, 0x00, 0xE4, 0x94, /* 0xF0-0xF3 */ + 0xE4, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE4, 0x99, 0xE4, 0x95, 0xE4, 0x98, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_83[512] = { + 0x00, 0x00, 0xEE, 0x76, 0x96, 0xCE, 0xE4, 0x97, /* 0x00-0x03 */ + 0x89, 0xD6, 0x8A, 0x9D, 0xE4, 0x9B, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x73, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA1, 0xE4, 0xAA, /* 0x14-0x17 */ + 0xE4, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x88, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB2, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x88, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE4, 0xA3, 0xE4, 0xA2, 0x00, 0x00, /* 0x30-0x33 */ + 0xE4, 0xA0, 0xE4, 0x9F, 0x92, 0x83, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0xF9, 0xE4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE4, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x91, 0x90, 0x8C, 0x74, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x60, /* 0x4C-0x4F */ + 0xE4, 0xA6, 0x00, 0x00, 0x8D, 0x72, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x91, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x77, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB8, /* 0x70-0x73 */ + 0x00, 0x00, 0xE4, 0xB9, 0x00, 0x00, 0x89, 0xD7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xAC, /* 0x78-0x7B */ + 0xE4, 0xB6, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x78, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, 0xE4, 0xB4, /* 0x84-0x87 */ + 0x00, 0x00, 0xE4, 0xBB, 0xE4, 0xB5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB3, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x96, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB1, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAD, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xCE, 0xE4, 0xAF, /* 0x9C-0x9F */ + 0xE4, 0xBA, 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE4, 0xBC, 0x00, 0x00, 0xE4, 0xAE, 0x94, 0x9C, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x97, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x90, 0x9B, 0x00, 0x00, 0xEE, 0x79, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x65, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8B, 0xDB, 0x00, 0x00, 0xE4, 0xC0, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD2, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x8D, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x70, /* 0xDC-0xDF */ + 0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x95, 0xEC, 0x00, 0x00, 0xE4, 0xBF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD8, /* 0xEC-0xEF */ + 0x8C, 0xD4, 0x95, 0x48, 0xE4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xBD, 0x00, 0x00, 0xEE, 0x7A, 0xE4, 0xC6, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD0, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE4, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC2, /* 0x00-0x03 */ + 0x93, 0xB8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC4, /* 0x08-0x0B */ + 0x96, 0x47, 0xE4, 0xCA, 0x88, 0xDE, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE4, 0xCC, 0x00, 0x00, 0xE4, 0xCB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x94, 0x8B, 0xE4, 0xD2, 0x00, 0x00, /* 0x28-0x2B */ + 0xE4, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8A, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE4, 
0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xE4, 0xD3, 0x97, 0x8E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0x7B, 0x97, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xA8, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x98, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x8B, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x95, 0x92, 0xE4, 0xE2, 0x93, 0x9F, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xAF, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE4, 0xD7, /* 0x68-0x6B */ + 0x91, 0x92, 0xE4, 0xD1, 0xE4, 0xD9, 0xE4, 0xDE, /* 0x6C-0x6F */ + 0x00, 0x00, 0x94, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x88, 0xA8, 0x00, 0x00, 0xE4, 0xD6, /* 0x74-0x77 */ + 0x00, 0x00, 0xE4, 0xDF, 0x95, 0x98, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, /* 0x80-0x83 */ + 0xE4, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x8E, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x96, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x95, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE5, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x97, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xEE, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8F, 0xF6, 0xE4, 0xE3, 0x00, 0x00, 0xE4, 0xE8, /* 0xB8-0xBB */ + 0x91, 0x93, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE4, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x92, 0x7E, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x97, 0x75, 0xE4, 0xE1, 0x8A, 0x57, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE4, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE4, 0xEA, 0x96, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0xE6, 0xE4, 0xE9, 0x00, 0x00, /* 0xD8-0xDB */ + 0xED, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x96, 0x48, 0x00, 0x00, 0x98, 0x40, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0x8E, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x95, 0xCC, 0x00, 0x00, 0x96, 0xA0, /* 0x10-0x13 */ + 0xE4, 0xF7, 0xE4, 0xF6, 0x00, 0x00, 0xE4, 0xF2, /* 0x14-0x17 */ + 0xE4, 0xF3, 0x00, 0x00, 0x89, 0x55, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xD3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE4, 0xF4, 0x88, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x91, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x95, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0xF9, 0xE5, 0x40, 0x00, 0x00, 0x94, 0xD7, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xFC, 0x8F, 0xD4, 0x8E, 0xC7, 0xE5, 0x42, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBC, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x7D, /* 0x50-0x53 */ + 0x00, 0x00, 0xE5, 0x43, 0x00, 0x00, 0x95, 0x99, /* 0x54-0x57 */ + 0xE4, 0xFB, 0xEE, 0x7E, 0xE4, 0xD4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x98, 0x6E, 0x93, 0xA0, 0x95, 0x93, 0xEE, 0x80, /* 0x68-0x6B */ + 0x00, 0x00, 0xE5, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x50, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x51, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE5, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x94, 0x96, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x4E, /* 0x84-0x87 */ + 0xE5, 0x46, 0x00, 0x00, 0xE5, 0x48, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE5, 0x52, 0xE5, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE5, 0x4B, 0x00, 0x00, 0x00, 0x00, 0x89, 0x92, /* 0x94-0x97 */ + 0x00, 0x00, 0x93, 0xE3, 0x00, 0x00, 0xE5, 0x4C, /* 0x98-0x9B */ + 0xE5, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE5, 0x45, 0x00, 0x00, 0x91, 0x45, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE5, 0x49, 0x8E, 0x46, 0x90, 0x64, 0x8C, 0x4F, /* 0xA8-0xAB */ + 0x96, 0xF2, 0x00, 0x00, 0x96, 0xF7, 0x8F, 0x92, /* 0xAC-0xAF */ + 0xEE, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE5, 0x56, 0xE5, 0x54, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x98, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE5, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x97, 0x95, 0x00, 0x00, 0xE5, 0x55, /* 0xCC-0xCF */ + 0xE5, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE5, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE5, 0x5B, 0xE5, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 
*/ + 0x93, 0xA1, 0xE5, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x94, 0xCB, 0xE5, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x93, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE5, 0x5C, 0xE5, 0x61, 0x91, 0x94, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x41, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x62, 0x91, 0x68, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5F, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5E, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x50, 0x9F, 0x41, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x64, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x63, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x97, 0x96, 0x00, 0x00, 0xE1, 0xBA, /* 0x2C-0x2F */ + 0xE5, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x66, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE5, 0x67, 0x8C, 0xD5, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE5, 0x69, 0x99, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x95, 0x00, 0x00, /* 0x58-0x5B */ + 0x97, 0xB8, 0x00, 0x00, 0x8B, 0xF1, 0xE5, 0x6A, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6B, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x8E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE5, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x93, 0xF8, 0x00, 0x00, 0x88, 0xB8, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xE1, 0xE5, 0x71, /* 0x88-0x8B */ + 0xE5, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6D, /* 0x90-0x93 */ + 0x00, 0x00, 0x8E, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6E, /* 0xA0-0xA3 */ + 0x94, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x7A, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x74, /* 0xAC-0xAF */ + 0xE5, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x73, 0x00, 0x00, /* 0xB4-0xB7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE5, 0x75, 0x00, 0x00, 0xE5, 0x76, 0x8E, 0xD6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE5, 0x78, 0x00, 0x00, 0x92, 0x60, /* 0xC8-0xCB */ + 0x00, 0x00, 0x8C, 0x75, 0x8A, 0x61, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE5, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8A, 0x5E, 0x00, 0x00, 0xE5, 0x81, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x7C, 0xE5, 0x80, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x94, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE5, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE5, 0x7E, 0x95, 0x67, 0x94, 0xD8, 0xE5, 0x82, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x91, 0xFB, 0xE5, 0x8C, 0x00, 0x00, 0xE5, 0x88, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xE9, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xE5, 0x86, 0x00, 0x00, 0x96, 0x49, 0xE5, 0x87, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x84, 0x00, 0x00, /* 0x04-0x07 */ + 0xE5, 0x85, 0xE5, 0x8A, 0xE5, 0x8D, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE5, 0x89, 0xE5, 0x83, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x92, 0x77, 0x00, 0x00, 0xE5, 0x94, 0x00, 0x00, /* 0x18-0x1B */ + 0x96, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE5, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE5, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x90, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x91, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x8F, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x90, 0xE4, 0x00, 0x00, 0x98, 0x58, /* 0x48-0x4B */ + 0xE5, 0x98, 0x00, 0x00, 0xE5, 0x99, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9F, /* 0x50-0x53 */ + 0x00, 0x00, 0x90, 0x49, 0x00, 0x00, 0xE5, 0x9B, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x96, /* 0x5C-0x5F */ + 0xE5, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA0, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xDA, 0x00, 0x00, /* 0x64-0x67 */ + 0xE5, 0x9C, 0x00, 0x00, 0xE5, 0xA1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9D, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE5, 0x9A, 0x00, 0x00, 0x92, 0xB1, 0x00, 0x00, /* 0x74-0x77 */ + 0xE5, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x88, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA5, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 
0x00, 0x00, 0x97, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x86, 0xE5, 0xB1, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE5, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE5, 0xAD, 0x00, 0x00, 0xE5, 0xB0, 0xE5, 0xAF, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE5, 0xAA, 0x00, 0x00, 0xE5, 0xBB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE5, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB8, 0xE5, 0xB9, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x8A, 0x49, 0x00, 0x00, 0x8B, 0x61, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE5, 0xA2, 0x00, 0x00, 0xEE, 0x85, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xB6, 0xE5, 0xBA, 0xE5, 0xB5, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE5, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE5, 0xBE, 0xE5, 0xBD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE5, 0xC0, 0xE5, 0xBF, 0xE5, 0x79, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC4, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC5, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x8C, 0x8C, 0x00, 0x00, 0xE5, 0xC7, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xC6, 0x00, 0x00, 0x8F, 0x4F, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8D, 0x73, 0x9F, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC8, 0x8F, 0x70, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x58, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xC9, 0x00, 0x00, 0x89, 0x71, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0xD5, 0xE5, 0xCA, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 
0x00, 0x8D, 0x74, 0xE5, 0xCB, 0x88, 0xDF, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x95, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCC, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x90, 0x8A, 0x00, 0x00, 0xE5, 0xD3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE5, 0xD0, 0x00, 0x00, 0x92, 0x8F, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE5, 0xD1, 0xE5, 0xCE, 0x8B, 0xDC, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE5, 0xCD, 0xE5, 0xD4, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x8C, 0x55, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDC, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD6, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xB3, 0xE5, 0xD5, /* 0x94-0x97 */ + 0x00, 0x00, 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD9, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE5, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xED, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE5, 0xDC, 0xE5, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x8C, 0xD1, 0xE5, 0xD2, 0x00, 0x00, 0x88, 0xBF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8D, 0xD9, 0x97, 0xF4, 0xE5, 0xDF, /* 0xC0-0xC3 */ + 0xE5, 0xE0, 0x91, 0x95, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA0, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE5, 0xE1, 0x97, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE5, 0xE2, 0xE5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x95, 0xE2, 0xE5, 0xE4, 0x00, 0x00, 0x8D, 0xBE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x97, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE5, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xEA, 0x8F, 0xD6, /* 0xF0-0xF3 */ + 0xE5, 0xE8, 0xEE, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x97, 0x87, 0xE5, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE5, 0xE7, 0x90, 0xBB, 0x90, 0x9E, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x95, 0xA1, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xED, 0x00, 0x00, /* 0x08-0x0B */ + 0xE5, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x8A, 0x8C, 0x00, 0x00, 0x96, 0x4A, 0xE5, 0xEE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xED, 0x41, 0xE5, 0xFA, 0xE5, 0xF0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF2, 0xE5, 0xF3, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x34-0x37 */ + 0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE5, 0xF4, 0x00, 0x00, 0xE5, 0xEF, /* 0x40-0x43 */ + 0xE5, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE5, 0xF9, 0xE8, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xFC, 0x8B, 0xDD, /* 0x5C-0x5F */ + 0xE5, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x40, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x43, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE6, 0x42, 0x00, 0x00, 0xE6, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x50, 0x00, 0x00, /* 0x70-0x73 */ + 0xE6, 0x45, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x46, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x47, 0x90, 0xBC, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x97, 0x76, 0x00, 0x00, 0xE6, 0x48, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xA2, 0x94, 0x65, /* 0x84-0x87 */ + 0xE6, 0x49, 0x00, 0x00, 0xE6, 0x4A, 0x8C, 0xA9, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x4B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4B, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8B, 0x94, 0x60, /* 0x94-0x97 */ + 0xE6, 0x4C, 0x00, 0x00, 0x8A, 0x6F, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE6, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4F, 0x97, 0x97, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE6, 0x4E, 0x90, 0x65, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x51, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x52, 0x8A, 0xCF, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x53, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE6, 0x54, 0x00, 0x00, 0xE6, 0x55, /* 0xBC-0xBF */ + 0xE6, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x70, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x57, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE6, 0x58, 0xE6, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF0, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x47, 0xE6, 0x5A, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE6, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_8A[512] = { + 0x8C, 0xBE, 0x00, 0x00, 0x92, 0xF9, 0xE6, 0x5D, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8C, 0x76, 0x00, 0x00, 0x90, 0x75, 0x00, 0x00, /* 0x08-0x0B */ + 0xE6, 
0x60, 0x00, 0x00, 0x93, 0xA2, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE6, 0x5F, 0x00, 0x00, 0xEE, 0x87, 0x8C, 0x50, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x5E, 0x91, 0xF5, /* 0x14-0x17 */ + 0x8B, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x61, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0x62, 0x00, 0x00, 0x8F, 0xD7, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8D, /* 0x20-0x23 */ + 0x00, 0x00, 0xE6, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x4B, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x90, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8B, 0x96, 0x00, 0x00, 0x96, 0xF3, /* 0x30-0x33 */ + 0x91, 0x69, 0x00, 0x00, 0xE6, 0x64, 0xEE, 0x88, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x66, 0x92, 0x90, /* 0x38-0x3B */ + 0x8F, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x68, 0x00, 0x00, /* 0x44-0x47 */ + 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8D, 0xBC, 0x91, 0xC0, 0xE6, 0x67, 0x00, 0x00, /* 0x50-0x53 */ + 0x8F, 0xD9, 0x95, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x66, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8C, 0x00, 0x00, /* 0x5C-0x5F */ + 0x89, 0x72, 0x00, 0x00, 0xE6, 0x6D, 0x8C, 0x77, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8E, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x8E, 0x8D, 0x00, 0x00, 0x98, 0x6C, /* 0x68-0x6B */ + 0xE6, 0x6C, 0xE6, 0x6B, 0x91, 0x46, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8B, 0x6C, 0x98, 0x62, 0x8A, 0x59, 0x8F, 0xDA, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xEE, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x6F, 0x00, 0x00, /* 0x80-0x83 */ + 0xE6, 0x70, 0xE6, 0x6E, 0x00, 0x00, 0x8C, 0xD6, /* 0x84-0x87 */ + 0x00, 0x00, 0x97, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8E, 0x8F, 0x94, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE6, 0x73, 0x00, 0x00, 0x90, 0xBE, /* 0x90-0x93 */ + 0x00, 0x00, 0x92, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x97, 0x55, 0x00, 0x00, 0xE6, 0x76, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x90, 0xBD, 0xE6, 0x72, 0x00, 0x00, 0xE6, 0x77, /* 0xA0-0xA3 */ + 0x8C, 0xEB, 0xE6, 0x74, 0xE6, 0x75, 0xEE, 0x8A, /* 0xA4-0xA7 */ + 0xE6, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x90, 0xE0, 0x93, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x92, 0x4E, 0x00, 0x00, 0x89, 0xDB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x94, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x8B, 0x62, 0x00, 0x00, 0xEE, 0x8B, 0x92, 0xB2, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7A, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE6, 0x78, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6B, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xBF, /* 0xC8-0xCB */ + 0x8A, 0xD0, 0xE6, 0x79, 0x00, 0x00, 0x90, 0x7A, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xC8, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x5F, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7B, 0xE6, 0x87, /* 0xD8-0xDB */ + 0x92, 0xB3, 0x00, 0x00, 0xE6, 0x86, 0xEE, 0x8C, /* 0xDC-0xDF */ + 0xE6, 0x83, 0xE6, 0x8B, 0xE6, 0x84, 0x00, 0x00, /* 0xE0-0xE3 
*/ + 0xE6, 0x80, 0x00, 0x00, 0x92, 0xFA, 0xE6, 0x7E, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7C, /* 0xE8-0xEB */ + 0x00, 0x00, 0x97, 0x40, 0x8E, 0x90, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE6, 0x81, 0x00, 0x00, 0xE6, 0x7D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8E, 0xE6, 0x85, /* 0xF4-0xF7 */ + 0x8F, 0x94, 0x00, 0x00, 0x8C, 0xBF, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xF8, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0x96, 0x64, 0x89, 0x79, 0x88, 0xE0, 0x00, 0x00, /* 0x00-0x03 */ + 0x93, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x89, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE6, 0x88, 0x00, 0x00, 0x93, 0xE4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE6, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE6, 0x82, 0x00, 0x00, 0xE6, 0x8C, 0xE6, 0x8E, /* 0x14-0x17 */ + 0x00, 0x00, 0x8C, 0xAA, 0xE6, 0x8A, 0x8D, 0x75, /* 0x18-0x1B */ + 0x00, 0x00, 0x8E, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE6, 0x8F, 0x97, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x92, 0x00, 0x00, /* 0x24-0x27 */ + 0xE6, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x93, /* 0x28-0x2B */ + 0x95, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x90, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8B, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x94, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE6, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE6, 0x97, 0x00, 0x00, 0xE6, 0x99, 0xE6, 0x98, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8F, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9B, 0x00, 0x00, /* 0x54-0x57 */ + 0x8E, 0xAF, 0x00, 0x00, 0xE6, 0x9D, 0xE6, 0x9C, /* 0x58-0x5B */ + 0x95, 0x88, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9F, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x78, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9E, /* 0x68-0x6B */ + 0xE6, 0xA0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA1, /* 0x6C-0x6F */ + 0x8B, 0x63, 0xE3, 0xBF, 0x8F, 0xF7, 0x00, 0x00, /* 0x70-0x73 */ + 0xE6, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEC, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE6, 0xA3, 0x00, 0x00, 0xEE, 0x90, /* 0x7C-0x7F */ + + 0xE6, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCC, 0x00, 0x00, /* 0x88-0x8B */ + 0xE6, 0xA5, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x51, 0x00, 0x00, 0xE6, 0xA7, 0xE6, 0xA8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE6, 0xAA, 0xE6, 0xAB, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x4A, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAE, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xA4, 0x00, 0x00, /* 0x44-0x47 */ + 0xE6, 0xAF, 0x00, 0x00, 0x96, 0x4C, 0x00, 0x00, /* 0x48-0x4B */ + 0xE6, 0xB0, 0x00, 0x00, 0xE6, 0xB1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE6, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xD8, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8F, 0xDB, 0xE6, 0xB4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8B, 0x98, 0xAC, /* 0x68-0x6B */ + 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE6, 0xB6, 0x95, 0x5E, 0xE6, 0xB7, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB8, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE6, 0xB9, 0xE6, 0xBB, 0x00, 0x00, /* 0x88-0x8B */ + 0x96, 0x65, 0xE6, 0xBC, 0xE6, 0xBD, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE6, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE6, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x8A, 0x4C, 0x92, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x95, 0x89, 0x8D, 0xE0, 0x8D, 0x76, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x6E, /* 0xA4-0xA7 */ + 0x89, 0xDD, 0x94, 0xCC, 0xE6, 0xC3, 0x8A, 0xD1, /* 0xA8-0xAB */ + 0x90, 0xD3, 0xE6, 0xC2, 0xE6, 0xC7, 0x92, 0x99, /* 0xAC-0xAF */ + 0x96, 0xE1, 0x00, 0x00, 0xE6, 0xC5, 0xE6, 0xC6, /* 0xB0-0xB3 */ + 0x8B, 0x4D, 0x00, 0x00, 0xE6, 0xC8, 0x94, 0x83, /* 0xB4-0xB7 */ + 0x91, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x94, 0xEF, /* 0xB8-0xBB */ + 0x93, 0x5C, 0xE6, 0xC4, 0x00, 0x00, 0x96, 0x66, /* 0xBC-0xBF */ + 0x89, 0xEA, 0xE6, 0xCA, 0x98, 0x47, 0x92, 0xC0, /* 0xC0-0xC3 */ + 0x98, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x91, /* 0xC4-0xC7 */ + 0xE6, 0xC9, 0x00, 0x00, 0x91, 0xAF, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE6, 0xDA, 0x91, 0x47, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x93, 0xF6, 0x00, 0x00, 0x95, 0x6F, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xCD, 0x8E, 0x5E, /* 0xD8-0xDB */ + 0x8E, 0x92, 0x00, 0x00, 0x8F, 0xDC, 0x00, 0x00, /* 0xDC-0xDF */ + 0x94, 0x85, 0x00, 0x00, 0x8C, 0xAB, 0xE6, 0xCC, /* 0xE0-0xE3 */ + 0xE6, 0xCB, 0x00, 0x00, 0x95, 0x8A, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x93, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 
0xEE, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEE, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xCF, 0xE6, 0xD0, /* 0xF8-0xFB */ + 0x8D, 0x77, 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE6, 0xD1, 0xE6, 0xD2, 0x00, 0x00, 0xE6, 0xD4, /* 0x04-0x07 */ + 0x91, 0xA1, 0x00, 0x00, 0xE6, 0xD3, 0x8A, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0xE6, 0xD6, 0x00, 0x00, 0xE6, 0xD5, /* 0x0C-0x0F */ + 0xE6, 0xD7, 0x00, 0x00, 0xEE, 0x93, 0xE6, 0xD9, /* 0x10-0x13 */ + 0xE6, 0xDB, 0x00, 0x00, 0xE6, 0xDC, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x90, 0xD4, 0x00, 0x00, 0x8E, 0xCD, 0xE6, 0xDD, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x71, /* 0x68-0x6B */ + 0x00, 0x00, 0xE6, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x91, 0x96, 0xE6, 0xDF, 0x00, 0x00, 0xE6, 0xE0, /* 0x70-0x73 */ + 0x95, 0x8B, 0x00, 0x00, 0xEE, 0x94, 0x8B, 0x4E, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE6, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x92, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7A, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x90, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE5, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE4, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, 0x00, 0x00, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, /* 0xC8-0xCB */ + 0xE6, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE6, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, 0xE6, 0xEA, /* 0xD8-0xDB */ + 0x00, 0x00, 0x8B, 0x97, 0x00, 0x00, 0xE6, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x90, 0xD5, 0x00, 0x00, 0xE6, 0xEF, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8C, 0xD7, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xED, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x48, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x91, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE6, 0xF1, 0xE6, 0xF2, 0x97, 0x78, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xA5, /* 0x0C-0x0F */ + 0xE6, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0xF4, 0xE6, 0xF5, 0xE6, 0xF7, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x48, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE6, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE6, 0xFB, 0xE6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF8, 0x00, 0x00, /* 0x40-0x43 */ + 0x92, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x40, /* 0x44-0x47 */ + 0xE7, 0x44, 0xE7, 0x41, 0xE6, 0xFC, 0x00, 0x00, /* 0x48-0x4B */ + 0xE7, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE7, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xD6, /* 0x5C-0x5F */ + 0xE7, 0x47, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x49, /* 0x60-0x63 */ + 0xE7, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4C, 0x00, 0x00, /* 0x70-0x73 */ + 0x8F, 0x52, 0x00, 0x00, 0xE7, 0x4B, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE7, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE7, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE7, 0x51, 0xE7, 0x50, 0x00, 0x00, 0xE7, 0x4F, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x53, 0xE7, 0x52, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE7, 0x55, 0x00, 0x00, 0xE7, 0x54, /* 0x90-0x93 */ + 0xE7, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0xE7, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE7, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x58, 0x90, 0x67, /* 0xA8-0xAB */ + 0xE7, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xEB, /* 0xAC-0xAF */ + 0xE7, 0x5B, 0xE7, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x5E, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE7, 0x5F, 0xE7, 0x5C, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE7, 0x60, 0x00, 0x00, 0x8E, 0xD4, 0xE7, 0x61, /* 0xC8-0xCB */ + 0x8B, 0x4F, 0x8C, 0x52, 0x00, 0x00, 0xEE, 0x96, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xAC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x62, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x5D, 0xE7, 0x63, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x66, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8E, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x65, /* 0xF8-0xFB */ + 0xE7, 0x64, 0x8C, 0x79, 0xE7, 0x67, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x72, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8D, 0xDA, 0xE7, 0x68, 0x00, 0x00, /* 0x08-0x0B */ + 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x6B, 0xE7, 0x6D, /* 0x10-0x13 */ + 0x95, 0xE3, 0xE7, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0x6C, 0x00, 0x00, 0xE7, 0x70, /* 0x18-0x1B */ + 0xE7, 0x6E, 0x8B, 0x50, 0x00, 0x00, 0xE7, 0x6F, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x72, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x94, 0x79, 0x97, 0xD6, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x53, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x97, 0x41, 0xE7, 0x75, 0x00, 0x00, 0xE7, 0x74, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x78, 0x97, 0x60, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x77, 0x00, 0x00, /* 0x40-0x43 */ + 0x8A, 0x8D, 0xE7, 0x76, 0xE7, 0x7B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE7, 0x79, 0x93, 0x51, 0xE7, 0x7C, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x7D, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE7, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8C, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8C, 0x44, 0xE7, 0x80, 0xE7, 0x81, /* 0x60-0x63 */ + 0xE7, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x68, /* 0x98-0x9B */ + 0xE7, 0x83, 0x00, 0x00, 0x8E, 0xAB, 0xE7, 0x84, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x85, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x9F, /* 0xA4-0xA7 */ + 0x99, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE7, 0x86, 0xE3, 0x90, 0xE7, 0x87, /* 0xAC-0xAF */ + 0x92, 0x43, 0x90, 0x4A, 0x94, 0x5F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x88, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xD3, 0x92, 0xD2, /* 0xB8-0xBB */ + 0x8D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x92, 0x48, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x96, 0x98, 0x90, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7D, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8B, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x95, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x89, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8B, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE7, 0x8A, 0x89, 0xDE, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x93, 0xF4, 0xE7, 0x8C, 0x94, 0x97, /* 0xE8-0xEB */ + 0x00, 0x00, 0x93, 0x52, 0x00, 0x00, 0xE7, 0x8D, /* 0xEC-0xEF */ + 0x8F, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE7, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x96, 0xC0, /* 0xF4-0xF7 */ + 0xE7, 0x9E, 0xE7, 0x91, 0xE7, 0x92, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x92, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0x91, 0xDE, 0x91, 0x97, 0x00, 0x00, 0x93, 0xA6, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0x90, 0x8B, 0x74, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x99, /* 0x08-0x0B */ + 0x00, 0x00, 0xE7, 0x96, 0xE7, 0xA3, 0x93, 0xA7, /* 0x0C-0x0F */ + 0x92, 0x80, 0xE7, 0x93, 0x00, 0x00, 0x92, 0xFC, /* 0x10-0x13 */ + 0x93, 0x72, 0xE7, 0x94, 0xE7, 0x98, 0x90, 0x80, /* 0x14-0x17 */ + 0x00, 0x00, 0x94, 0x87, 0x92, 0xCA, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x90, 0xC0, 0xE7, 0x97, 0x91, 0xAC, /* 0x1C-0x1F */ + 0x91, 0xA2, 0xE7, 0x95, 0x88, 0xA7, 0x98, 0x41, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x9A, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xDF, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0x54, 0x90, 0x69, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE7, 0x9C, 0xE7, 0x9B, 0x00, 0x00, /* 0x34-0x37 */ + 0x88, 0xED, 0xE7, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x95, 0x4E, 0x00, 0x00, 0xE7, 0xA5, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 
0x00, 0x93, 0xD9, 0x90, 0x8B, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x92, 0x78, 0x00, 0x00, 0x8B, 0xF6, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0xA4, 0x97, 0x56, 0x89, 0x5E, /* 0x48-0x4B */ + 0x00, 0x00, 0x95, 0xD5, 0x89, 0xDF, 0xE7, 0x9F, /* 0x4C-0x4F */ + 0xE7, 0xA0, 0xE7, 0xA1, 0xE7, 0xA2, 0x93, 0xB9, /* 0x50-0x53 */ + 0x92, 0x42, 0x88, 0xE1, 0xE7, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0xE7, 0xA7, 0xEA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x91, 0xBB, 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, /* 0x5C-0x5F */ + 0x89, 0x93, 0x91, 0x6B, 0x00, 0x00, 0x8C, 0xAD, /* 0x60-0x63 */ + 0x00, 0x00, 0x97, 0x79, 0x00, 0x00, 0xEE, 0x99, /* 0x64-0x67 */ + 0xE7, 0xA9, 0x93, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x91, 0x98, 0x8E, 0xD5, 0xE7, 0xAA, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xAD, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8F, 0x85, 0xE7, 0xAB, 0x91, 0x4A, /* 0x74-0x77 */ + 0x91, 0x49, 0x00, 0x00, 0x88, 0xE2, 0x00, 0x00, /* 0x78-0x7B */ + 0x97, 0xC9, 0xE7, 0xAF, 0x00, 0x00, 0x94, 0xF0, /* 0x7C-0x7F */ + + 0xE7, 0xB1, 0xE7, 0xB0, 0xE7, 0xAE, 0xE2, 0x84, /* 0x80-0x83 */ + 0x8A, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8E, /* 0x84-0x87 */ + 0x00, 0x00, 0xE7, 0xB3, 0xE7, 0xB2, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, /* 0x8C-0x8F */ + 0x00, 0x00, 0x97, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x4D, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE7, 0xB5, 0x00, 0x00, 0x8E, 0xD7, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x93, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x88, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x78, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x59, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9A, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8C, 0x53, 0xE7, 0xB9, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE7, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x95, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x8A, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x97, 0x58, 0x00, 0x00, 0x8B, 0xBD, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x93, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 
0x00, 0xEE, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9D, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x93, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE7, 0xC1, 0x00, 0x00, 0xE7, 0xC0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x93, 0xD1, 0xE7, 0xC2, 0x8F, 0x55, /* 0x48-0x4B */ + 0x8E, 0xDE, 0x94, 0x7A, 0x92, 0x91, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF0, 0x00, 0x00, /* 0x50-0x53 */ + 0x90, 0x8C, 0x00, 0x00, 0xE7, 0xC3, 0x00, 0x00, /* 0x54-0x57 */ + 0xE7, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x7C, 0xE7, 0xC5, /* 0x60-0x63 */ + 0x00, 0x00, 0xE7, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE7, 0xC7, 0x97, 0x8F, 0x00, 0x00, /* 0x68-0x6B */ + 0x8F, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC9, 0xE7, 0xC8, /* 0x70-0x73 */ + 0x00, 0x00, 0x8D, 0x79, 0x00, 0x00, 0x8D, 0x93, /* 0x74-0x77 */ + 0x8E, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x86, /* 0x84-0x87 */ + 0x00, 0x00, 0xE7, 0xCB, 0x00, 0x00, 0xE7, 0xCA, /* 0x88-0x8B */ + 0x00, 0x00, 0x91, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8C, 0xED, 0x00, 0x00, 0x90, 0xC1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xAE, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x8F, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCD, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8F, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD0, 0xE7, 0xCE, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCF, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE7, 0xD2, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8F, 0xF8, 0x00, 0x00, 0xE7, 0xD3, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE7, 0xD4, 0xE7, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xCE, 0x8D, 0xD1, /* 0xC4-0xC7 */ + 0x8E, 0xDF, 0xE7, 0xD6, 0x00, 0x00, 0xE7, 0xD7, /* 0xC8-0xCB */ + 0x97, 0xA2, 0x8F, 0x64, 0x96, 0xEC, 0x97, 0xCA, /* 0xCC-0xCF */ + 0xE7, 0xD8, 0x8B, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0xEE, 0x9F, /* 0xD4-0xD7 */ + 0x93, 0x42, 0x00, 0x00, 0xEE, 0x9E, 0xE7, 0xDC, /* 0xD8-0xDB */ + 0x8A, 0x98, 0x90, 0x6A, 0xEE, 0xA0, 0xE7, 0xDA, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE7, 0xDB, 0x00, 0x00, 0x92, 0xDE, /* 0xE0-0xE3 */ + 0xEE, 0xA3, 0xEE, 0xA4, 0x96, 0x74, 0x8B, 0xFA, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB 
*/ + 0x00, 0x00, 0xEE, 0xA1, 0xEE, 0xA2, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE7, 0xDE, 0xE7, 0xDF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA5, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x93, 0xDD, 0x8A, 0x62, 0x00, 0x00, /* 0x0C-0x0F */ + 0xEE, 0xA6, 0xE7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE7, 0xE2, 0xE7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE8, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE7, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x97, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD8, /* 0x34-0x37 */ + 0x00, 0x00, 0xEE, 0xAE, 0xEE, 0xA8, 0x00, 0x00, /* 0x38-0x3B */ + 0xEE, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xED, /* 0x3C-0x3F */ + 0xEE, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x93, 0x53, 0xE7, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE7, 0xEB, 0xE7, 0xE9, 0x00, 0x00, 0xE7, 0xEE, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0xEF, 0xEE, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE7, /* 0x54-0x57 */ + 0x00, 0x00, 0xEE, 0xAC, 0xE7, 0xF4, 0x89, 0x94, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xAB, 0x00, 0x00, /* 0x60-0x63 */ + 0xE7, 0xEA, 0x00, 0x00, 0x8F, 0xDE, 0xEE, 0xAF, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8D, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB1, /* 0x74-0x77 */ + 0xEE, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x67, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8B, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x65, /* 0x80-0x83 */ + 0x00, 0x00, 0x93, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xED, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x91, 0x4C, 0x00, 0x00, 0xE7, 0xF2, /* 0x90-0x93 */ + 0x00, 0x00, 0xE7, 0xEC, 0xE7, 0xF1, 0x00, 0x00, /* 0x94-0x97 */ + 0x96, 0xC1, 0x00, 0x00, 0x92, 0xB6, 0xE7, 0xF3, /* 0x98-0x9B */ + 0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x91, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE7, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF5, /* 0xCC-0xCF */ + 0xEE, 0xB6, 0x00, 0x00, 0x96, 0x4E, 0xEE, 0xBA, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xEE, 0xB8, 0x00, 0x00, 0xEE, 0xB4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEE, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x8F, 0x9B, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB3, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE7, 0xF8, 0x95, 0xDD, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x89, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x65, 0x92, 0x92, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8B, 0x98, 0xED, 0x49, 0xE7, 0xFA, 0xEE, 0xBD, /* 0xF8-0xFB */ + 0x8D, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC2, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4B, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, /* 0x0C-0x0F */ + 0x90, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x90, 0x8E, 0xE8, 0x40, 0xE8, 0x42, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xEE, 0xC1, 0xEE, 0xBF, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8F, 0xF9, 0xEE, 0xBC, 0xE8, 0x41, 0xE8, 0x43, /* 0x20-0x23 */ + 0x00, 0x00, 0xEE, 0xBB, 0x8B, 0xD1, 0x00, 0x00, /* 0x24-0x27 */ + 0x95, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xE0, /* 0x28-0x2B */ + 0x98, 0x42, 0x00, 0x00, 0xE7, 0xFC, 0x8D, 0xF6, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x5E, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE8, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x44, 0xE8, 0x46, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE7, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x93, 0xE7, /* 0x48-0x4B */ + 0x00, 0x00, 0x93, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x92, 0xD5, 0x00, 0x00, 0xE8, 0x4B, 0xEE, 0xC4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x62, /* 0x58-0x5B */ + 0xE8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x8C, 0x4C, 0x00, 0x00, 0xE8, 0x4A, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8C, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0x49, 0x00, 0x00, 0x8F, 0xDF, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8A, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 
0xE8, 0x4F, 0x00, 0x00, 0x8D, 0xBD, 0x91, 0x99, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xC8, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEE, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x5A, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE8, 0x4D, 0xE8, 0x4E, 0x92, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE8, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE8, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x56, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE8, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE8, 0x58, 0x93, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x51, 0xE8, 0x52, /* 0xD4-0xD7 */ + 0xE8, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE8, 0x57, 0xEE, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8B, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE8, 0x5A, 0xE8, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE8, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEE, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_94[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5E, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5F, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE8, 0x60, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5D, /* 0x10-0x13 */ + 0xE8, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x8F, 0xE0, 0x93, 0xA8, 0xE8, 0x5B, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x62, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE8, 0x63, 0xE8, 0x61, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0xF6, 0x00, 0x00, 0xE8, 0x65, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE8, 0x68, 0xEE, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x8A, 0xD3, 0xE8, 0x67, 0x96, 0xF8, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x73, 0xE8, 0x69, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x6C, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0x6A, 0x00, 0x00, 0xE8, 0x6B, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x6D, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE8, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE8, 0x70, 0x00, 0x00, 0xE8, 0x71, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0x74, 0xE8, 0x72, 0xE8, 0x75, 0xE8, 0x77, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE8, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x96, 0xE5, 0x00, 0x00, 0xE8, 0x78, 0x91, 0x4D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x79, /* 0x84-0x87 */ + 0x00, 0x00, 0x95, 0xC2, 0xE8, 0x7A, 0x8A, 0x4A, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x5B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8A, 0xD5, 0xEE, 0xCC, 0x8A, 0xD4, /* 0x90-0x93 */ + 0xE8, 0x7B, 0x00, 0x00, 0xE8, 0x7C, 0x00, 0x00, /* 0x94-0x97 */ + 0xE8, 0x7D, 0xE8, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE8, 0x80, 0x00, 0x00, 0x8A, 0xD6, 0x8A, 0x74, /* 0xA0-0xA3 */ + 0x8D, 0x7D, 0x94, 0xB4, 0x00, 0x00, 0xE8, 0x82, /* 0xA4-0xA7 */ + 0xE8, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE8, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7B, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE8, 0x86, 0x00, 0x00, 0xE8, 0x85, /* 0xB8-0xBB */ + 0xE8, 
0x84, 0x00, 0x00, 0xE8, 0x87, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x8A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xC5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x88, 0x00, 0x00, /* 0xC8-0xCB */ + 0xE8, 0x8C, 0xE8, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE8, 0x8E, 0xE8, 0x8D, 0xE8, 0x8F, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE8, 0x91, 0xE8, 0x93, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x95, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE8, 0x95, 0x00, 0x00, 0x8D, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x96, 0xE8, 0x97, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x68, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x6A, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xA2, /* 0x3C-0x3F */ + 0x91, 0xC9, 0x00, 0x00, 0xE8, 0x98, 0x00, 0x00, /* 0x40-0x43 */ + 0x95, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x9B, /* 0x48-0x4B */ + 0xE8, 0x99, 0x8D, 0x7E, 0x00, 0x00, 0xE8, 0x9A, /* 0x4C-0x4F */ + 0x8C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC3, /* 0x58-0x5B */ + 0xE8, 0x9D, 0xE8, 0x9F, 0xE8, 0x9E, 0xE8, 0xA0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x40, 0x90, 0x77, /* 0x60-0x63 */ + 0x8F, 0x9C, 0x8A, 0xD7, 0xE8, 0xA1, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x86, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x89, 0x41, 0x00, 0x00, 0xE8, 0xA2, 0x92, 0xC2, /* 0x70-0x73 */ + 0x00, 0x00, 0x97, 0xCB, 0x93, 0xA9, 0xE8, 0x9C, /* 0x74-0x77 */ + 0x97, 0xA4, 0x00, 0x00, 0x8C, 0xAF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x97, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x8B, 0xF7, 0x97, 0xB2, 0x00, 0x00, /* 0x84-0x87 */ + 0x8C, 0x47, 0x00, 0x00, 0x91, 0xE0, 0xE4, 0x40, /* 0x88-0x8B */ + 0x00, 0x00, 0xE8, 0xA4, 0x8A, 0x4B, 0x90, 0x8F, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x8A, 0x75, 0xE8, 0xA6, 0x00, 0x00, 0xE8, 0xA7, /* 0x94-0x97 */ + 0xE8, 0xA5, 0x8C, 0x84, 0x00, 0x00, 0x8D, 0xDB, /* 0x98-0x9B */ + 0x8F, 0xE1, 0xEE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x89, 0x42, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD7, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0xA4-0xA7 */ + 0xE7, 
0xAC, 0x00, 0x00, 0xE8, 0xA8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD0, /* 0xAC-0xAF */ + 0xE8, 0xAC, 0xE8, 0xAA, 0xE8, 0xAB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE8, 0xAD, 0x00, 0x00, 0xE8, 0xAE, 0x97, 0xEA, /* 0xB4-0xB7 */ + 0xE8, 0xAF, 0xE8, 0xB0, 0x00, 0x00, 0x90, 0xC7, /* 0xB8-0xBB */ + 0x94, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x90, 0x9D, 0x8A, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x97, 0x59, 0x89, 0xEB, 0x8F, 0x57, 0x8C, 0xD9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB2, /* 0xC8-0xCB */ + 0x8E, 0x93, 0xE8, 0xB4, 0xE8, 0xB1, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE8, 0xB8, 0xE5, 0xAB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x99, 0xD4, 0x00, 0x00, 0x90, 0x97, /* 0xD8-0xDB */ + 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xA3, 0x93, 0xEF, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x89, 0x4A, 0x00, 0x00, 0x90, 0xE1, 0x8E, 0xB4, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x95, 0xB5, 0x00, 0x00, 0x89, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xEB, 0x97, 0x8B, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE8, 0xB9, 0x00, 0x00, 0x93, 0x64, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_97[512] = { + 0x8E, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE8, 0xBA, 0x00, 0x00, 0xE8, 0xBB, 0x90, 0x6B, /* 0x04-0x07 */ + 0xE8, 0xBC, 0x00, 0x00, 0x97, 0xEC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE8, 0xB7, 0xE8, 0xBE, 0xE8, 0xC0, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, 0xE8, 0xBD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC1, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x91, 0x9A, 0x00, 0x00, 0x89, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB6, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC4, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE8, 0xC5, 0x00, 0x00, 0x98, 0x49, 0xEE, 0xD1, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9E, 0x50, 0xE8, 0xC6, 0x00, 0x00, 0xEE, 0xD2, /* 0x38-0x3B */ + 0x00, 0x00, 0xE8, 0xC7, 0xE8, 0xC8, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCC, 0xEE, 0xD3, /* 0x40-0x43 */ + 0xE8, 0xC9, 0x00, 0x00, 0xE8, 0xCA, 0x00, 0x00, /* 0x44-0x47 */ + 0xE8, 0xCB, 0xE8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xD4, 0x00, 0x00, 0xEE, 0xD5, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEE, 0xD6, 0x90, 0xC2, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xEE, 0xD7, 0x96, 0xF5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x90, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE8, 0xCE, 0x00, 0x00, 0x94, 0xF1, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0xCF, 0xEA, 0x72, 0x96, 0xCA, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xD0, 0x00, 0x00, 0xE8, 0xD1, 0x00, 0x00, /* 0x64-0x67 */ + 0xE8, 0xD2, 0x8A, 0x76, 0x00, 0x00, 0xE8, 0xD4, /* 0x68-0x6B */ + 0x00, 0x00, 0x90, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x8C, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE8, 0xD6, 0xE8, 0xDA, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 
0x00, 0xE8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x8A, 0x93, 0xE8, 0xD7, 0xE8, 0xDB, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDC, /* 0x88-0x8B */ + 0x00, 0x00, 0x88, 0xC6, 0x00, 0x00, 0xE8, 0xDD, /* 0x8C-0x8F */ + 0xE8, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x8F, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xE8, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE2, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE1, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE8, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x91, /* 0xA8-0xAB */ + 0x00, 0x00, 0x95, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE3, /* 0xB0-0xB3 */ + 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE5, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE8, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE8, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xD8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE8, 0xEA, 0x94, 0x42, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEC, 0x89, 0xB9, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE8, 0xEF, 0xE8, 0xEE, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x43, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0x00, 0x00, 0x95, 0xC5, 0x92, 0xB8, 0x8D, 0xA0, /* 0x00-0x03 */ + 0x00, 0x00, 0x8D, 0x80, 0x8F, 0x87, 0x00, 0x00, /* 0x04-0x07 */ + 0x90, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE8, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF0, /* 0x0C-0x0F */ + 0x97, 0x61, 0x8A, 0xE6, 0x94, 0xD0, 0x93, 0xDA, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x9C, /* 0x14-0x17 */ + 0x97, 0xCC, 0x00, 0x00, 0x8C, 0x7A, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x96, 0x6A, 0x93, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x89, 0x6F, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF5, /* 0x34-0x37 */ + 0xE8, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x95, 0x70, /* 0x38-0x3B */ + 0x97, 0x8A, 0xE8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF9, /* 0x48-0x4B */ + 0x91, 0xE8, 0x8A, 0x7A, 0x8A, 0x7B, 0xE8, 0xF8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8A, 0xE7, 
0x8C, 0xB0, 0x00, 0x00, 0xEE, 0xD8, /* 0x54-0x57 */ + 0x8A, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x5E, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xDE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEE, 0xD9, 0x00, 0x00, 0x8C, 0xDA, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFA, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFB, /* 0x6C-0x6F */ + 0xE8, 0xFC, 0xE9, 0x40, 0x00, 0x00, 0xE9, 0x42, /* 0x70-0x73 */ + 0xE9, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x95, 0x97, 0x00, 0x00, 0xE9, 0x43, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x44, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x46, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x48, /* 0xC0-0xC3 */ + 0xE9, 0x47, 0x00, 0x00, 0xE9, 0x49, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xF2, /* 0xD8-0xDB */ + 0xE3, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x90, 0x48, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x51, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE9, 0x4A, 0x00, 0x00, 0xE9, 0x4B, /* 0xE8-0xEB */ + 0x00, 0x00, 0x99, 0xAA, 0x9F, 0x5A, 0x94, 0xD1, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x88, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x8E, 0x94, 0x96, 0x4F, 0x8F, 0xFC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4C, /* 0x00-0x03 */ + 0x00, 0x00, 0x96, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE9, 0x4D, 0x97, 0x7B, 0x00, 0x00, /* 0x08-0x0B */ + 0x89, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x8E, 0x60, 0x00, 0x00, 0xE9, 0x4E, 0x89, 0xEC, /* 0x10-0x13 */ + 0xE9, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE9, 0x52, 0xE9, 0x53, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0x55, 0xE9, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0x54, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDC, /* 0x24-0x27 */ + 0x8A, 0xD9, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE9, 0x56, 0x00, 0x00, 0xE9, 0x57, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x59, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x5A, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE9, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE9, 0x5B, 0x00, 0x00, 0xE9, 0x5E, /* 0x48-0x4B */ + 0xE9, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE9, 0x5D, 0xE9, 0x5F, 0xE9, 0x60, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE9, 0x62, 0x00, 0x00, 0x8B, 0xC0, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF1, 0xE9, 0x63, /* 0x94-0x97 */ + 0xE9, 0x64, 0x8D, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE9, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8A, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x94, 0x6E, 0xE9, 0x66, 0xE9, 0x67, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x79, /* 0xB0-0xB3 */ + 0x93, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x94, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x91, 0xCA, 0x89, 0x77, 0x8B, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x8B, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x92, 0x93, 0xE9, 0x6D, 0x8B, 0xEE, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x89, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE9, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6A, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE9, 0x6B, 0x00, 0x00, 0xE9, 0x69, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x77, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE9, 0x6E, 0xE9, 0x6F, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE9, 0x70, 0xE9, 0x71, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE9, 0x73, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x72, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x78, /* 0xFC-0xFF */ 
+}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xE9, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x52, 0xE9, 0x75, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x9B, 0x8C, 0xB1, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x91, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x79, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x93, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x7A, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x80, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE9, 0x7D, 0x00, 0x00, 0xE9, 0x7C, 0xE9, 0x7E, /* 0x40-0x43 */ + 0x00, 0x00, 0xE9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0x82, 0xEE, 0xDF, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE9, 0x81, 0x00, 0x00, 0xE9, 0x84, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xC1, 0xE9, 0x83, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x85, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x86, 0x00, 0x00, /* 0x60-0x63 */ + 0xE9, 0x88, 0xE9, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE9, 0x89, 0xE9, 0x8B, 0xE9, 0x8A, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8D, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE9, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE9, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8A, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x90, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x90, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0x91, 0x00, 0x00, 0xE9, 0x92, /* 0xD0-0xD3 */ + 
0xE9, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x8D, 0x82, 0xEE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xEE, 0xE1, 0x00, 0x00, 0xE9, 0x94, 0xE9, 0x95, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x96, 0xE9, 0x97, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x98, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xAF, 0xE9, 0x9A, /* 0xE8-0xEB */ + 0x00, 0x00, 0x95, 0x45, 0xE9, 0x9B, 0xE9, 0x99, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE9, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE9, 0x9C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9E, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9F, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA0, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE9, 0xA1, 0x00, 0x00, 0xE9, 0xA2, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA3, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0xE9, 0xA5, /* 0x20-0x23 */ + 0x00, 0x00, 0xE9, 0xA6, 0x00, 0x00, 0xE9, 0xA7, /* 0x24-0x27 */ + 0xE9, 0xA8, 0xE9, 0xA9, 0xE9, 0xAA, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xAB, 0xE9, 0xAC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9F, 0x54, 0xE9, 0xAD, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, /* 0x38-0x3B */ + 0x8B, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x8A, 0x40, 0x8D, 0xB0, 0xE9, 0xAF, /* 0x40-0x43 */ + 0xE9, 0xAE, 0x96, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0xB1, 0xE9, 0xB2, 0xE9, 0xB0, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x96, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE9, 0xB4, 0x00, 0x00, 0x8B, 0x9B, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, 0x00, 0x00, /* 0x70-0x73 */ + 0xE9, 0xB5, 0xEE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xBC, 0xEE, 0xE4, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE9, 0xB8, 0x95, 0xA9, 0xE9, 0xB6, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xBA, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBB, /* 0x9C-0x9F */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE9, 0xBD, 0x00, 0x00, 0x96, 0x8E, 0x8E, 0x4C, /* 0xA8-0xAB */ + 
0x00, 0x00, 0x8D, 0xF8, 0x91, 0x4E, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEE, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, 0xEE, 0xE6, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8C, 0xEF, 0xE9, 0xC0, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC3, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0xC4, 0xE9, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE9, 0xC9, 0x00, 0x00, 0x8E, 0x49, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xE2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE9, 0xCA, 0xE9, 0xC7, 0xE9, 0xC6, /* 0xE0-0xE3 */ + 0xE9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8C, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE9, 0xCE, 0xE9, 0xCD, 0xE9, 0xCC, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x88, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9C[512] = { + 0xEE, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE9, 0xD8, 0x00, 0x00, 0xE9, 0xD4, 0x00, 0x00, /* 0x04-0x07 */ + 0xE9, 0xD5, 0xE9, 0xD1, 0xE9, 0xD7, 0x00, 0x00, /* 0x08-0x0B */ + 0xE9, 0xD3, 0x8A, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x98, 0x6B, 0x00, 0x00, 0xE9, 0xD6, 0xE9, 0xD2, /* 0x10-0x13 */ + 0xE9, 0xD0, 0xE9, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE9, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0xDC, 0xE9, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x95, 0x68, 0xE9, 0xD9, 0x88, 0xF1, /* 0x2C-0x2F */ + 0xE9, 0xDE, 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8A, 0x8F, 0xE9, 0xCB, 0x89, 0x56, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE1, 0xE9, 0xDF, /* 0x44-0x47 */ + 0x92, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x90, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE5, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0x74-0x77 */ + 0xE9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x92, 0xB9, 0x00, 0x00, 0xE9, 0xE8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x94, 0xB5, 0x00, 0x00, 0xE9, 0xED, /* 0xE8-0xEB */ + 0xE9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE9, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x96, 0x50, /* 0xF0-0xF3 */ + 0x96, 0xC2, 0x00, 0x00, 0x93, 0xCE, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEF, 0x93, 0xBC, /* 0x04-0x07 */ + 0xE9, 0xEC, 0xE9, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE9, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x95, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF4, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, 0x00, 0x00, /* 0x24-0x27 */ + 0x8A, 0x9B, 0x00, 0x00, 0xE9, 0xF0, 0x8E, 0xB0, /* 0x28-0x2B */ + 0x89, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x83, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFA, 0xE9, 0xF9, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE9, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE9, 0xF5, 0x00, 0x00, 0xE9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xEA, 0x44, 0xEA, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xEA, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x89, 0x4C, 0xEA, 0x40, 0xEA, 0x41, 0x00, 0x00, /* 0x5C-0x5F */ + 0x8D, 0x94, 0x96, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEA, 
0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE9, /* 0x68-0x6B */ + 0x96, 0x51, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4A, /* 0x6C-0x6F */ + 0xEE, 0xE8, 0x00, 0x00, 0xEA, 0x46, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4B, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x48, /* 0x84-0x87 */ + 0x00, 0x00, 0xEA, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4C, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEA, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0x4E, 0x00, 0x00, 0xEA, 0x49, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x53, 0x00, 0x00, 0xEA, 0x54, 0xEA, 0x52, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xEA, 0x51, 0xEA, 0x57, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0x50, 0x00, 0x00, 0xEA, 0x55, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x56, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x59, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEA, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x5B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEA, 0x5C, 0x00, 0x00, 0xEA, 0x5D, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x68, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEA, 0x5A, 0x91, 0xE9, 0x8D, 0xEB, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xEA, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xEE, 0xEB, 0xEA, 0x5F, 0xEA, 0x60, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x61, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xEA, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x8C, 0xB2, 0xEA, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xEA, 0x64, 0x00, 0x00, 0x8E, 0xAD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEA, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xEA, 0x66, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x67, /* 0x88-0x8B */ + 0xEA, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEA, 0x6B, 0xEA, 0x69, 0x98, 0x5B, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x6A, 0x00, 0x00, 0x97, 0xED, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0x97, 0xD9, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEA, 0x6D, 0x94, 0x9E, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0x6E, 0xEA, 0x70, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x6F, 0x8D, 0x8D, 0x96, 0xCB, 0x96, 0x83, /* 0xB8-0xBB */ + 0x9B, 0xF5, 0x00, 0x00, 0x9F, 0x80, 0x96, 0x9B, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x89, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEA, 0x73, 0x8B, 0x6F, 0xEA, 0x74, 0xEA, 0x75, /* 0xCC-0xCF */ + 0xEA, 0x76, 0xEE, 0xEC, 0x8D, 0x95, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE0, 0xD2, 0x96, 0xD9, 0x00, 0x00, 0x91, 0xE1, /* 0xD8-0xDB */ + 0xEA, 0x78, 0xEA, 0x7A, 0xEA, 0x79, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEA, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEA, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEA, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x7E, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEA, 0x80, 0x00, 0x00, 0xEA, 0x81, 0xEA, 0x82, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEA, 0x83, 0x00, 0x00, 0xEA, 0x84, /* 0xF8-0xFB */ + 0xEA, 0x85, 0xEA, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x87, /* 0x04-0x07 */ + 0xEA, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x00, 0x00, 0x93, 0x43, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xDB, /* 0x10-0x13 */ + 0x00, 0x00, 0xEA, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0x6C, 0xEA, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xEA, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x40, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8D, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8E, 0xE2, 0x56, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0xE8, 0xEB, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8F, 0x00, 0x00, /* 0x50-0x53 */ + 0xEA, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x92, /* 0x5C-0x5F */ + 0xEA, 0x93, 0xEA, 0x94, 0x97, 0xEE, 0xEA, 0x91, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x95, 0xEA, 0x96, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x98, 0x00, 0x00, /* 0x68-0x6B */ + 0xEA, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9A, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9B, 0xEA, 0x99, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x97, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xEA, 0x9D, 0xE2, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEA, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xEE, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_FA[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x73, 0xED, 0x7E, /* 0x0C-0x0F */ + 0xED, 0x80, 0xED, 0x95, 0xED, 0xBC, 0xED, 0xCC, /* 0x10-0x13 */ + 0xED, 0xCE, 0xED, 0xF9, 0xEE, 0x42, 0xEE, 0x59, /* 0x14-0x17 */ + 0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x65, /* 0x18-0x1B */ + 0xEE, 0x69, 0xEE, 0x6C, 0xEE, 0x75, 0xEE, 0x81, /* 0x1C-0x1F */ + 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x8D, 0xEE, 0x95, /* 0x20-0x23 */ + 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x9B, 0xEE, 0xB7, /* 0x24-0x27 */ + 0xEE, 0xBE, 0xEE, 0xCE, 
0xEE, 0xDA, 0xEE, 0xDB, /* 0x28-0x2B */ + 0xEE, 0xDD, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0x81, 0x49, 0xEE, 0xFC, 0x81, 0x94, /* 0x00-0x03 */ + 0x81, 0x90, 0x81, 0x93, 0x81, 0x95, 0xEE, 0xFB, /* 0x04-0x07 */ + 0x81, 0x69, 0x81, 0x6A, 0x81, 0x96, 0x81, 0x7B, /* 0x08-0x0B */ + 0x81, 0x43, 0x81, 0x7C, 0x81, 0x44, 0x81, 0x5E, /* 0x0C-0x0F */ + 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, 0x82, 0x52, /* 0x10-0x13 */ + 0x82, 0x53, 0x82, 0x54, 0x82, 0x55, 0x82, 0x56, /* 0x14-0x17 */ + 0x82, 0x57, 0x82, 0x58, 0x81, 0x46, 0x81, 0x47, /* 0x18-0x1B */ + 0x81, 0x83, 0x81, 0x81, 0x81, 0x84, 0x81, 0x48, /* 0x1C-0x1F */ + 0x81, 0x97, 0x82, 0x60, 0x82, 0x61, 0x82, 0x62, /* 0x20-0x23 */ + 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, 0x82, 0x66, /* 0x24-0x27 */ + 0x82, 0x67, 0x82, 0x68, 0x82, 0x69, 0x82, 0x6A, /* 0x28-0x2B */ + 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0x2C-0x2F */ + 0x82, 0x6F, 0x82, 0x70, 0x82, 0x71, 0x82, 0x72, /* 0x30-0x33 */ + 0x82, 0x73, 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, /* 0x34-0x37 */ + 0x82, 0x77, 0x82, 0x78, 0x82, 0x79, 0x81, 0x6D, /* 0x38-0x3B */ + 0x81, 0x5F, 0x81, 0x6E, 0x81, 0x4F, 0x81, 0x51, /* 0x3C-0x3F */ + 0x81, 0x4D, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x40-0x43 */ + 0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x44-0x47 */ + 0x82, 0x88, 0x82, 0x89, 0x82, 0x8A, 0x82, 0x8B, /* 0x48-0x4B */ + 0x82, 0x8C, 0x82, 0x8D, 0x82, 0x8E, 0x82, 0x8F, /* 0x4C-0x4F */ + 0x82, 0x90, 0x82, 0x91, 0x82, 0x92, 0x82, 0x93, /* 0x50-0x53 */ + 0x82, 0x94, 0x82, 0x95, 0x82, 0x96, 0x82, 0x97, /* 0x54-0x57 */ + 0x82, 0x98, 0x82, 0x99, 0x82, 0x9A, 0x81, 0x6F, /* 0x58-0x5B */ + 0x81, 0x62, 0x81, 0x70, 0x81, 0x60, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0xA1, 0x00, 0xA2, 0x00, 0xA3, /* 0x60-0x63 */ + 0x00, 0xA4, 0x00, 0xA5, 0x00, 0xA6, 0x00, 0xA7, /* 0x64-0x67 */ + 0x00, 0xA8, 0x00, 0xA9, 0x00, 0xAA, 0x00, 0xAB, /* 0x68-0x6B */ + 0x00, 0xAC, 0x00, 0xAD, 0x00, 0xAE, 0x00, 0xAF, /* 0x6C-0x6F */ + 0x00, 0xB0, 0x00, 0xB1, 0x00, 0xB2, 0x00, 0xB3, /* 0x70-0x73 */ + 0x00, 0xB4, 0x00, 0xB5, 0x00, 0xB6, 0x00, 0xB7, /* 0x74-0x77 */ + 0x00, 0xB8, 0x00, 0xB9, 0x00, 0xBA, 0x00, 0xBB, /* 0x78-0x7B */ + 0x00, 0xBC, 0x00, 0xBD, 0x00, 0xBE, 0x00, 0xBF, /* 0x7C-0x7F */ + + 0x00, 0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, /* 0x80-0x83 */ + 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, /* 0x84-0x87 */ + 0x00, 0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, /* 0x88-0x8B */ + 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, /* 0x8C-0x8F */ + 0x00, 0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, /* 0x90-0x93 */ + 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xD7, /* 0x94-0x97 */ + 0x00, 0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, /* 0x98-0x9B */ + 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0xCA, 0x81, 0x50, /* 0xE0-0xE3 */ + 0xEE, 0xFA, 0x81, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, NULL, NULL, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, NULL, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, NULL, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 
0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (ch == 0xFF && 0x61 <= cl && cl <= 0x9F) { + out[0] = cl + 0x40; + return 1; + } + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen < 2) + return -ENAMETOOLONG; + + out[0] = uni2charset[cl*2]; + out[1] 
= uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + return 2; + } else if (ch == 0) { + if (cl <= 0x7F) { + out[0] = cl; + return 1; + } else if (0xA0 <= cl) { + out[0] = u2c_00hi[cl - 0xA0][0]; + out[1] = u2c_00hi[cl - 0xA0][1]; + if (out[0] && out[1]) + return 2; + } + return -EINVAL; + } + else + return -EINVAL; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (rawstring[0] <= 0x7F) { + *uni = rawstring[0]; + return 1; + } + if (0xA1 <= rawstring[0] && rawstring[0] <= 0xDF) { + *uni = 0xFF00 | (rawstring[0] - 0x40); + return 1; + } + + if (boundlen < 2) + return -ENAMETOOLONG; + ch = rawstring[0]; + cl = rawstring[1]; + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + return 2; + } + else + return -EINVAL; +} + +static struct nls_table table = { + .charset = "cp932", + .alias = "sjis", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp932(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp932(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp932) +module_exit(exit_nls_cp932) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(sjis); diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c new file mode 100644 index 00000000..82770301 --- /dev/null +++ b/fs/nls/nls_cp936.c @@ -0,0 +1,11112 @@ +/* + * linux/fs/nls/nls_cp936.c + * + * Charset cp936 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. 
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E02,0x4E04,0x4E05,0x4E06,0x4E0F,0x4E12,0x4E17,0x4E1F,/* 0x40-0x47 */ + 0x4E20,0x4E21,0x4E23,0x4E26,0x4E29,0x4E2E,0x4E2F,0x4E31,/* 0x48-0x4F */ + 0x4E33,0x4E35,0x4E37,0x4E3C,0x4E40,0x4E41,0x4E42,0x4E44,/* 0x50-0x57 */ + 0x4E46,0x4E4A,0x4E51,0x4E55,0x4E57,0x4E5A,0x4E5B,0x4E62,/* 0x58-0x5F */ + 0x4E63,0x4E64,0x4E65,0x4E67,0x4E68,0x4E6A,0x4E6B,0x4E6C,/* 0x60-0x67 */ + 0x4E6D,0x4E6E,0x4E6F,0x4E72,0x4E74,0x4E75,0x4E76,0x4E77,/* 0x68-0x6F */ + 0x4E78,0x4E79,0x4E7A,0x4E7B,0x4E7C,0x4E7D,0x4E7F,0x4E80,/* 0x70-0x77 */ + 0x4E81,0x4E82,0x4E83,0x4E84,0x4E85,0x4E87,0x4E8A,0x0000,/* 0x78-0x7F */ + + 0x4E90,0x4E96,0x4E97,0x4E99,0x4E9C,0x4E9D,0x4E9E,0x4EA3,/* 0x80-0x87 */ + 0x4EAA,0x4EAF,0x4EB0,0x4EB1,0x4EB4,0x4EB6,0x4EB7,0x4EB8,/* 0x88-0x8F */ + 0x4EB9,0x4EBC,0x4EBD,0x4EBE,0x4EC8,0x4ECC,0x4ECF,0x4ED0,/* 0x90-0x97 */ + 0x4ED2,0x4EDA,0x4EDB,0x4EDC,0x4EE0,0x4EE2,0x4EE6,0x4EE7,/* 0x98-0x9F */ + 0x4EE9,0x4EED,0x4EEE,0x4EEF,0x4EF1,0x4EF4,0x4EF8,0x4EF9,/* 0xA0-0xA7 */ + 0x4EFA,0x4EFC,0x4EFE,0x4F00,0x4F02,0x4F03,0x4F04,0x4F05,/* 0xA8-0xAF */ + 0x4F06,0x4F07,0x4F08,0x4F0B,0x4F0C,0x4F12,0x4F13,0x4F14,/* 0xB0-0xB7 */ + 0x4F15,0x4F16,0x4F1C,0x4F1D,0x4F21,0x4F23,0x4F28,0x4F29,/* 0xB8-0xBF */ + 0x4F2C,0x4F2D,0x4F2E,0x4F31,0x4F33,0x4F35,0x4F37,0x4F39,/* 0xC0-0xC7 */ + 0x4F3B,0x4F3E,0x4F3F,0x4F40,0x4F41,0x4F42,0x4F44,0x4F45,/* 0xC8-0xCF */ + 0x4F47,0x4F48,0x4F49,0x4F4A,0x4F4B,0x4F4C,0x4F52,0x4F54,/* 0xD0-0xD7 */ + 0x4F56,0x4F61,0x4F62,0x4F66,0x4F68,0x4F6A,0x4F6B,0x4F6D,/* 0xD8-0xDF */ + 0x4F6E,0x4F71,0x4F72,0x4F75,0x4F77,0x4F78,0x4F79,0x4F7A,/* 0xE0-0xE7 */ + 0x4F7D,0x4F80,0x4F81,0x4F82,0x4F85,0x4F86,0x4F87,0x4F8A,/* 0xE8-0xEF */ + 0x4F8C,0x4F8E,0x4F90,0x4F92,0x4F93,0x4F95,0x4F96,0x4F98,/* 0xF0-0xF7 */ + 0x4F99,0x4F9A,0x4F9C,0x4F9E,0x4F9F,0x4FA1,0x4FA2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4FA4,0x4FAB,0x4FAD,0x4FB0,0x4FB1,0x4FB2,0x4FB3,0x4FB4,/* 0x40-0x47 */ + 0x4FB6,0x4FB7,0x4FB8,0x4FB9,0x4FBA,0x4FBB,0x4FBC,0x4FBD,/* 0x48-0x4F */ + 0x4FBE,0x4FC0,0x4FC1,0x4FC2,0x4FC6,0x4FC7,0x4FC8,0x4FC9,/* 0x50-0x57 */ + 0x4FCB,0x4FCC,0x4FCD,0x4FD2,0x4FD3,0x4FD4,0x4FD5,0x4FD6,/* 0x58-0x5F */ + 0x4FD9,0x4FDB,0x4FE0,0x4FE2,0x4FE4,0x4FE5,0x4FE7,0x4FEB,/* 0x60-0x67 */ + 
0x4FEC,0x4FF0,0x4FF2,0x4FF4,0x4FF5,0x4FF6,0x4FF7,0x4FF9,/* 0x68-0x6F */ + 0x4FFB,0x4FFC,0x4FFD,0x4FFF,0x5000,0x5001,0x5002,0x5003,/* 0x70-0x77 */ + 0x5004,0x5005,0x5006,0x5007,0x5008,0x5009,0x500A,0x0000,/* 0x78-0x7F */ + + 0x500B,0x500E,0x5010,0x5011,0x5013,0x5015,0x5016,0x5017,/* 0x80-0x87 */ + 0x501B,0x501D,0x501E,0x5020,0x5022,0x5023,0x5024,0x5027,/* 0x88-0x8F */ + 0x502B,0x502F,0x5030,0x5031,0x5032,0x5033,0x5034,0x5035,/* 0x90-0x97 */ + 0x5036,0x5037,0x5038,0x5039,0x503B,0x503D,0x503F,0x5040,/* 0x98-0x9F */ + 0x5041,0x5042,0x5044,0x5045,0x5046,0x5049,0x504A,0x504B,/* 0xA0-0xA7 */ + 0x504D,0x5050,0x5051,0x5052,0x5053,0x5054,0x5056,0x5057,/* 0xA8-0xAF */ + 0x5058,0x5059,0x505B,0x505D,0x505E,0x505F,0x5060,0x5061,/* 0xB0-0xB7 */ + 0x5062,0x5063,0x5064,0x5066,0x5067,0x5068,0x5069,0x506A,/* 0xB8-0xBF */ + 0x506B,0x506D,0x506E,0x506F,0x5070,0x5071,0x5072,0x5073,/* 0xC0-0xC7 */ + 0x5074,0x5075,0x5078,0x5079,0x507A,0x507C,0x507D,0x5081,/* 0xC8-0xCF */ + 0x5082,0x5083,0x5084,0x5086,0x5087,0x5089,0x508A,0x508B,/* 0xD0-0xD7 */ + 0x508C,0x508E,0x508F,0x5090,0x5091,0x5092,0x5093,0x5094,/* 0xD8-0xDF */ + 0x5095,0x5096,0x5097,0x5098,0x5099,0x509A,0x509B,0x509C,/* 0xE0-0xE7 */ + 0x509D,0x509E,0x509F,0x50A0,0x50A1,0x50A2,0x50A4,0x50A6,/* 0xE8-0xEF */ + 0x50AA,0x50AB,0x50AD,0x50AE,0x50AF,0x50B0,0x50B1,0x50B3,/* 0xF0-0xF7 */ + 0x50B4,0x50B5,0x50B6,0x50B7,0x50B8,0x50B9,0x50BC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x50BD,0x50BE,0x50BF,0x50C0,0x50C1,0x50C2,0x50C3,0x50C4,/* 0x40-0x47 */ + 0x50C5,0x50C6,0x50C7,0x50C8,0x50C9,0x50CA,0x50CB,0x50CC,/* 0x48-0x4F */ + 0x50CD,0x50CE,0x50D0,0x50D1,0x50D2,0x50D3,0x50D4,0x50D5,/* 0x50-0x57 */ + 0x50D7,0x50D8,0x50D9,0x50DB,0x50DC,0x50DD,0x50DE,0x50DF,/* 0x58-0x5F */ + 0x50E0,0x50E1,0x50E2,0x50E3,0x50E4,0x50E5,0x50E8,0x50E9,/* 0x60-0x67 */ + 0x50EA,0x50EB,0x50EF,0x50F0,0x50F1,0x50F2,0x50F4,0x50F6,/* 0x68-0x6F */ + 0x50F7,0x50F8,0x50F9,0x50FA,0x50FC,0x50FD,0x50FE,0x50FF,/* 0x70-0x77 */ + 0x5100,0x5101,0x5102,0x5103,0x5104,0x5105,0x5108,0x0000,/* 0x78-0x7F */ + + 0x5109,0x510A,0x510C,0x510D,0x510E,0x510F,0x5110,0x5111,/* 0x80-0x87 */ + 0x5113,0x5114,0x5115,0x5116,0x5117,0x5118,0x5119,0x511A,/* 0x88-0x8F */ + 0x511B,0x511C,0x511D,0x511E,0x511F,0x5120,0x5122,0x5123,/* 0x90-0x97 */ + 0x5124,0x5125,0x5126,0x5127,0x5128,0x5129,0x512A,0x512B,/* 0x98-0x9F */ + 0x512C,0x512D,0x512E,0x512F,0x5130,0x5131,0x5132,0x5133,/* 0xA0-0xA7 */ + 0x5134,0x5135,0x5136,0x5137,0x5138,0x5139,0x513A,0x513B,/* 0xA8-0xAF */ + 0x513C,0x513D,0x513E,0x5142,0x5147,0x514A,0x514C,0x514E,/* 0xB0-0xB7 */ + 0x514F,0x5150,0x5152,0x5153,0x5157,0x5158,0x5159,0x515B,/* 0xB8-0xBF */ + 0x515D,0x515E,0x515F,0x5160,0x5161,0x5163,0x5164,0x5166,/* 0xC0-0xC7 */ + 0x5167,0x5169,0x516A,0x516F,0x5172,0x517A,0x517E,0x517F,/* 0xC8-0xCF */ + 0x5183,0x5184,0x5186,0x5187,0x518A,0x518B,0x518E,0x518F,/* 0xD0-0xD7 */ + 0x5190,0x5191,0x5193,0x5194,0x5198,0x519A,0x519D,0x519E,/* 0xD8-0xDF */ + 
0x519F,0x51A1,0x51A3,0x51A6,0x51A7,0x51A8,0x51A9,0x51AA,/* 0xE0-0xE7 */ + 0x51AD,0x51AE,0x51B4,0x51B8,0x51B9,0x51BA,0x51BE,0x51BF,/* 0xE8-0xEF */ + 0x51C1,0x51C2,0x51C3,0x51C5,0x51C8,0x51CA,0x51CD,0x51CE,/* 0xF0-0xF7 */ + 0x51D0,0x51D2,0x51D3,0x51D4,0x51D5,0x51D6,0x51D7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x51D8,0x51D9,0x51DA,0x51DC,0x51DE,0x51DF,0x51E2,0x51E3,/* 0x40-0x47 */ + 0x51E5,0x51E6,0x51E7,0x51E8,0x51E9,0x51EA,0x51EC,0x51EE,/* 0x48-0x4F */ + 0x51F1,0x51F2,0x51F4,0x51F7,0x51FE,0x5204,0x5205,0x5209,/* 0x50-0x57 */ + 0x520B,0x520C,0x520F,0x5210,0x5213,0x5214,0x5215,0x521C,/* 0x58-0x5F */ + 0x521E,0x521F,0x5221,0x5222,0x5223,0x5225,0x5226,0x5227,/* 0x60-0x67 */ + 0x522A,0x522C,0x522F,0x5231,0x5232,0x5234,0x5235,0x523C,/* 0x68-0x6F */ + 0x523E,0x5244,0x5245,0x5246,0x5247,0x5248,0x5249,0x524B,/* 0x70-0x77 */ + 0x524E,0x524F,0x5252,0x5253,0x5255,0x5257,0x5258,0x0000,/* 0x78-0x7F */ + + 0x5259,0x525A,0x525B,0x525D,0x525F,0x5260,0x5262,0x5263,/* 0x80-0x87 */ + 0x5264,0x5266,0x5268,0x526B,0x526C,0x526D,0x526E,0x5270,/* 0x88-0x8F */ + 0x5271,0x5273,0x5274,0x5275,0x5276,0x5277,0x5278,0x5279,/* 0x90-0x97 */ + 0x527A,0x527B,0x527C,0x527E,0x5280,0x5283,0x5284,0x5285,/* 0x98-0x9F */ + 0x5286,0x5287,0x5289,0x528A,0x528B,0x528C,0x528D,0x528E,/* 0xA0-0xA7 */ + 0x528F,0x5291,0x5292,0x5294,0x5295,0x5296,0x5297,0x5298,/* 0xA8-0xAF */ + 0x5299,0x529A,0x529C,0x52A4,0x52A5,0x52A6,0x52A7,0x52AE,/* 0xB0-0xB7 */ + 0x52AF,0x52B0,0x52B4,0x52B5,0x52B6,0x52B7,0x52B8,0x52B9,/* 0xB8-0xBF */ + 0x52BA,0x52BB,0x52BC,0x52BD,0x52C0,0x52C1,0x52C2,0x52C4,/* 0xC0-0xC7 */ + 0x52C5,0x52C6,0x52C8,0x52CA,0x52CC,0x52CD,0x52CE,0x52CF,/* 0xC8-0xCF */ + 0x52D1,0x52D3,0x52D4,0x52D5,0x52D7,0x52D9,0x52DA,0x52DB,/* 0xD0-0xD7 */ + 0x52DC,0x52DD,0x52DE,0x52E0,0x52E1,0x52E2,0x52E3,0x52E5,/* 0xD8-0xDF */ + 0x52E6,0x52E7,0x52E8,0x52E9,0x52EA,0x52EB,0x52EC,0x52ED,/* 0xE0-0xE7 */ + 0x52EE,0x52EF,0x52F1,0x52F2,0x52F3,0x52F4,0x52F5,0x52F6,/* 0xE8-0xEF */ + 0x52F7,0x52F8,0x52FB,0x52FC,0x52FD,0x5301,0x5302,0x5303,/* 0xF0-0xF7 */ + 0x5304,0x5307,0x5309,0x530A,0x530B,0x530C,0x530E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_85[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5311,0x5312,0x5313,0x5314,0x5318,0x531B,0x531C,0x531E,/* 0x40-0x47 */ + 0x531F,0x5322,0x5324,0x5325,0x5327,0x5328,0x5329,0x532B,/* 0x48-0x4F */ + 0x532C,0x532D,0x532F,0x5330,0x5331,0x5332,0x5333,0x5334,/* 
0x50-0x57 */ + 0x5335,0x5336,0x5337,0x5338,0x533C,0x533D,0x5340,0x5342,/* 0x58-0x5F */ + 0x5344,0x5346,0x534B,0x534C,0x534D,0x5350,0x5354,0x5358,/* 0x60-0x67 */ + 0x5359,0x535B,0x535D,0x5365,0x5368,0x536A,0x536C,0x536D,/* 0x68-0x6F */ + 0x5372,0x5376,0x5379,0x537B,0x537C,0x537D,0x537E,0x5380,/* 0x70-0x77 */ + 0x5381,0x5383,0x5387,0x5388,0x538A,0x538E,0x538F,0x0000,/* 0x78-0x7F */ + + 0x5390,0x5391,0x5392,0x5393,0x5394,0x5396,0x5397,0x5399,/* 0x80-0x87 */ + 0x539B,0x539C,0x539E,0x53A0,0x53A1,0x53A4,0x53A7,0x53AA,/* 0x88-0x8F */ + 0x53AB,0x53AC,0x53AD,0x53AF,0x53B0,0x53B1,0x53B2,0x53B3,/* 0x90-0x97 */ + 0x53B4,0x53B5,0x53B7,0x53B8,0x53B9,0x53BA,0x53BC,0x53BD,/* 0x98-0x9F */ + 0x53BE,0x53C0,0x53C3,0x53C4,0x53C5,0x53C6,0x53C7,0x53CE,/* 0xA0-0xA7 */ + 0x53CF,0x53D0,0x53D2,0x53D3,0x53D5,0x53DA,0x53DC,0x53DD,/* 0xA8-0xAF */ + 0x53DE,0x53E1,0x53E2,0x53E7,0x53F4,0x53FA,0x53FE,0x53FF,/* 0xB0-0xB7 */ + 0x5400,0x5402,0x5405,0x5407,0x540B,0x5414,0x5418,0x5419,/* 0xB8-0xBF */ + 0x541A,0x541C,0x5422,0x5424,0x5425,0x542A,0x5430,0x5433,/* 0xC0-0xC7 */ + 0x5436,0x5437,0x543A,0x543D,0x543F,0x5441,0x5442,0x5444,/* 0xC8-0xCF */ + 0x5445,0x5447,0x5449,0x544C,0x544D,0x544E,0x544F,0x5451,/* 0xD0-0xD7 */ + 0x545A,0x545D,0x545E,0x545F,0x5460,0x5461,0x5463,0x5465,/* 0xD8-0xDF */ + 0x5467,0x5469,0x546A,0x546B,0x546C,0x546D,0x546E,0x546F,/* 0xE0-0xE7 */ + 0x5470,0x5474,0x5479,0x547A,0x547E,0x547F,0x5481,0x5483,/* 0xE8-0xEF */ + 0x5485,0x5487,0x5488,0x5489,0x548A,0x548D,0x5491,0x5493,/* 0xF0-0xF7 */ + 0x5497,0x5498,0x549C,0x549E,0x549F,0x54A0,0x54A1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_86[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54A2,0x54A5,0x54AE,0x54B0,0x54B2,0x54B5,0x54B6,0x54B7,/* 0x40-0x47 */ + 0x54B9,0x54BA,0x54BC,0x54BE,0x54C3,0x54C5,0x54CA,0x54CB,/* 0x48-0x4F */ + 0x54D6,0x54D8,0x54DB,0x54E0,0x54E1,0x54E2,0x54E3,0x54E4,/* 0x50-0x57 */ + 0x54EB,0x54EC,0x54EF,0x54F0,0x54F1,0x54F4,0x54F5,0x54F6,/* 0x58-0x5F */ + 0x54F7,0x54F8,0x54F9,0x54FB,0x54FE,0x5500,0x5502,0x5503,/* 0x60-0x67 */ + 0x5504,0x5505,0x5508,0x550A,0x550B,0x550C,0x550D,0x550E,/* 0x68-0x6F */ + 0x5512,0x5513,0x5515,0x5516,0x5517,0x5518,0x5519,0x551A,/* 0x70-0x77 */ + 0x551C,0x551D,0x551E,0x551F,0x5521,0x5525,0x5526,0x0000,/* 0x78-0x7F */ + + 0x5528,0x5529,0x552B,0x552D,0x5532,0x5534,0x5535,0x5536,/* 0x80-0x87 */ + 0x5538,0x5539,0x553A,0x553B,0x553D,0x5540,0x5542,0x5545,/* 0x88-0x8F */ + 0x5547,0x5548,0x554B,0x554C,0x554D,0x554E,0x554F,0x5551,/* 0x90-0x97 */ + 0x5552,0x5553,0x5554,0x5557,0x5558,0x5559,0x555A,0x555B,/* 0x98-0x9F */ + 0x555D,0x555E,0x555F,0x5560,0x5562,0x5563,0x5568,0x5569,/* 0xA0-0xA7 */ + 0x556B,0x556F,0x5570,0x5571,0x5572,0x5573,0x5574,0x5579,/* 0xA8-0xAF */ + 0x557A,0x557D,0x557F,0x5585,0x5586,0x558C,0x558D,0x558E,/* 0xB0-0xB7 */ + 0x5590,0x5592,0x5593,0x5595,0x5596,0x5597,0x559A,0x559B,/* 0xB8-0xBF */ + 0x559E,0x55A0,0x55A1,0x55A2,0x55A3,0x55A4,0x55A5,0x55A6,/* 0xC0-0xC7 */ + 0x55A8,0x55A9,0x55AA,0x55AB,0x55AC,0x55AD,0x55AE,0x55AF,/* 0xC8-0xCF */ + 
0x55B0,0x55B2,0x55B4,0x55B6,0x55B8,0x55BA,0x55BC,0x55BF,/* 0xD0-0xD7 */ + 0x55C0,0x55C1,0x55C2,0x55C3,0x55C6,0x55C7,0x55C8,0x55CA,/* 0xD8-0xDF */ + 0x55CB,0x55CE,0x55CF,0x55D0,0x55D5,0x55D7,0x55D8,0x55D9,/* 0xE0-0xE7 */ + 0x55DA,0x55DB,0x55DE,0x55E0,0x55E2,0x55E7,0x55E9,0x55ED,/* 0xE8-0xEF */ + 0x55EE,0x55F0,0x55F1,0x55F4,0x55F6,0x55F8,0x55F9,0x55FA,/* 0xF0-0xF7 */ + 0x55FB,0x55FC,0x55FF,0x5602,0x5603,0x5604,0x5605,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5606,0x5607,0x560A,0x560B,0x560D,0x5610,0x5611,0x5612,/* 0x40-0x47 */ + 0x5613,0x5614,0x5615,0x5616,0x5617,0x5619,0x561A,0x561C,/* 0x48-0x4F */ + 0x561D,0x5620,0x5621,0x5622,0x5625,0x5626,0x5628,0x5629,/* 0x50-0x57 */ + 0x562A,0x562B,0x562E,0x562F,0x5630,0x5633,0x5635,0x5637,/* 0x58-0x5F */ + 0x5638,0x563A,0x563C,0x563D,0x563E,0x5640,0x5641,0x5642,/* 0x60-0x67 */ + 0x5643,0x5644,0x5645,0x5646,0x5647,0x5648,0x5649,0x564A,/* 0x68-0x6F */ + 0x564B,0x564F,0x5650,0x5651,0x5652,0x5653,0x5655,0x5656,/* 0x70-0x77 */ + 0x565A,0x565B,0x565D,0x565E,0x565F,0x5660,0x5661,0x0000,/* 0x78-0x7F */ + + 0x5663,0x5665,0x5666,0x5667,0x566D,0x566E,0x566F,0x5670,/* 0x80-0x87 */ + 0x5672,0x5673,0x5674,0x5675,0x5677,0x5678,0x5679,0x567A,/* 0x88-0x8F */ + 0x567D,0x567E,0x567F,0x5680,0x5681,0x5682,0x5683,0x5684,/* 0x90-0x97 */ + 0x5687,0x5688,0x5689,0x568A,0x568B,0x568C,0x568D,0x5690,/* 0x98-0x9F */ + 0x5691,0x5692,0x5694,0x5695,0x5696,0x5697,0x5698,0x5699,/* 0xA0-0xA7 */ + 0x569A,0x569B,0x569C,0x569D,0x569E,0x569F,0x56A0,0x56A1,/* 0xA8-0xAF */ + 0x56A2,0x56A4,0x56A5,0x56A6,0x56A7,0x56A8,0x56A9,0x56AA,/* 0xB0-0xB7 */ + 0x56AB,0x56AC,0x56AD,0x56AE,0x56B0,0x56B1,0x56B2,0x56B3,/* 0xB8-0xBF */ + 0x56B4,0x56B5,0x56B6,0x56B8,0x56B9,0x56BA,0x56BB,0x56BD,/* 0xC0-0xC7 */ + 0x56BE,0x56BF,0x56C0,0x56C1,0x56C2,0x56C3,0x56C4,0x56C5,/* 0xC8-0xCF */ + 0x56C6,0x56C7,0x56C8,0x56C9,0x56CB,0x56CC,0x56CD,0x56CE,/* 0xD0-0xD7 */ + 0x56CF,0x56D0,0x56D1,0x56D2,0x56D3,0x56D5,0x56D6,0x56D8,/* 0xD8-0xDF */ + 0x56D9,0x56DC,0x56E3,0x56E5,0x56E6,0x56E7,0x56E8,0x56E9,/* 0xE0-0xE7 */ + 0x56EA,0x56EC,0x56EE,0x56EF,0x56F2,0x56F3,0x56F6,0x56F7,/* 0xE8-0xEF */ + 0x56F8,0x56FB,0x56FC,0x5700,0x5701,0x5702,0x5705,0x5707,/* 0xF0-0xF7 */ + 0x570B,0x570C,0x570D,0x570E,0x570F,0x5710,0x5711,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5712,0x5713,0x5714,0x5715,0x5716,0x5717,0x5718,0x5719,/* 
0x40-0x47 */ + 0x571A,0x571B,0x571D,0x571E,0x5720,0x5721,0x5722,0x5724,/* 0x48-0x4F */ + 0x5725,0x5726,0x5727,0x572B,0x5731,0x5732,0x5734,0x5735,/* 0x50-0x57 */ + 0x5736,0x5737,0x5738,0x573C,0x573D,0x573F,0x5741,0x5743,/* 0x58-0x5F */ + 0x5744,0x5745,0x5746,0x5748,0x5749,0x574B,0x5752,0x5753,/* 0x60-0x67 */ + 0x5754,0x5755,0x5756,0x5758,0x5759,0x5762,0x5763,0x5765,/* 0x68-0x6F */ + 0x5767,0x576C,0x576E,0x5770,0x5771,0x5772,0x5774,0x5775,/* 0x70-0x77 */ + 0x5778,0x5779,0x577A,0x577D,0x577E,0x577F,0x5780,0x0000,/* 0x78-0x7F */ + + 0x5781,0x5787,0x5788,0x5789,0x578A,0x578D,0x578E,0x578F,/* 0x80-0x87 */ + 0x5790,0x5791,0x5794,0x5795,0x5796,0x5797,0x5798,0x5799,/* 0x88-0x8F */ + 0x579A,0x579C,0x579D,0x579E,0x579F,0x57A5,0x57A8,0x57AA,/* 0x90-0x97 */ + 0x57AC,0x57AF,0x57B0,0x57B1,0x57B3,0x57B5,0x57B6,0x57B7,/* 0x98-0x9F */ + 0x57B9,0x57BA,0x57BB,0x57BC,0x57BD,0x57BE,0x57BF,0x57C0,/* 0xA0-0xA7 */ + 0x57C1,0x57C4,0x57C5,0x57C6,0x57C7,0x57C8,0x57C9,0x57CA,/* 0xA8-0xAF */ + 0x57CC,0x57CD,0x57D0,0x57D1,0x57D3,0x57D6,0x57D7,0x57DB,/* 0xB0-0xB7 */ + 0x57DC,0x57DE,0x57E1,0x57E2,0x57E3,0x57E5,0x57E6,0x57E7,/* 0xB8-0xBF */ + 0x57E8,0x57E9,0x57EA,0x57EB,0x57EC,0x57EE,0x57F0,0x57F1,/* 0xC0-0xC7 */ + 0x57F2,0x57F3,0x57F5,0x57F6,0x57F7,0x57FB,0x57FC,0x57FE,/* 0xC8-0xCF */ + 0x57FF,0x5801,0x5803,0x5804,0x5805,0x5808,0x5809,0x580A,/* 0xD0-0xD7 */ + 0x580C,0x580E,0x580F,0x5810,0x5812,0x5813,0x5814,0x5816,/* 0xD8-0xDF */ + 0x5817,0x5818,0x581A,0x581B,0x581C,0x581D,0x581F,0x5822,/* 0xE0-0xE7 */ + 0x5823,0x5825,0x5826,0x5827,0x5828,0x5829,0x582B,0x582C,/* 0xE8-0xEF */ + 0x582D,0x582E,0x582F,0x5831,0x5832,0x5833,0x5834,0x5836,/* 0xF0-0xF7 */ + 0x5837,0x5838,0x5839,0x583A,0x583B,0x583C,0x583D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x583E,0x583F,0x5840,0x5841,0x5842,0x5843,0x5845,0x5846,/* 0x40-0x47 */ + 0x5847,0x5848,0x5849,0x584A,0x584B,0x584E,0x584F,0x5850,/* 0x48-0x4F */ + 0x5852,0x5853,0x5855,0x5856,0x5857,0x5859,0x585A,0x585B,/* 0x50-0x57 */ + 0x585C,0x585D,0x585F,0x5860,0x5861,0x5862,0x5863,0x5864,/* 0x58-0x5F */ + 0x5866,0x5867,0x5868,0x5869,0x586A,0x586D,0x586E,0x586F,/* 0x60-0x67 */ + 0x5870,0x5871,0x5872,0x5873,0x5874,0x5875,0x5876,0x5877,/* 0x68-0x6F */ + 0x5878,0x5879,0x587A,0x587B,0x587C,0x587D,0x587F,0x5882,/* 0x70-0x77 */ + 0x5884,0x5886,0x5887,0x5888,0x588A,0x588B,0x588C,0x0000,/* 0x78-0x7F */ + + 0x588D,0x588E,0x588F,0x5890,0x5891,0x5894,0x5895,0x5896,/* 0x80-0x87 */ + 0x5897,0x5898,0x589B,0x589C,0x589D,0x58A0,0x58A1,0x58A2,/* 0x88-0x8F */ + 0x58A3,0x58A4,0x58A5,0x58A6,0x58A7,0x58AA,0x58AB,0x58AC,/* 0x90-0x97 */ + 0x58AD,0x58AE,0x58AF,0x58B0,0x58B1,0x58B2,0x58B3,0x58B4,/* 0x98-0x9F */ + 0x58B5,0x58B6,0x58B7,0x58B8,0x58B9,0x58BA,0x58BB,0x58BD,/* 0xA0-0xA7 */ + 0x58BE,0x58BF,0x58C0,0x58C2,0x58C3,0x58C4,0x58C6,0x58C7,/* 0xA8-0xAF */ + 0x58C8,0x58C9,0x58CA,0x58CB,0x58CC,0x58CD,0x58CE,0x58CF,/* 0xB0-0xB7 */ + 0x58D0,0x58D2,0x58D3,0x58D4,0x58D6,0x58D7,0x58D8,0x58D9,/* 0xB8-0xBF */ + 
0x58DA,0x58DB,0x58DC,0x58DD,0x58DE,0x58DF,0x58E0,0x58E1,/* 0xC0-0xC7 */ + 0x58E2,0x58E3,0x58E5,0x58E6,0x58E7,0x58E8,0x58E9,0x58EA,/* 0xC8-0xCF */ + 0x58ED,0x58EF,0x58F1,0x58F2,0x58F4,0x58F5,0x58F7,0x58F8,/* 0xD0-0xD7 */ + 0x58FA,0x58FB,0x58FC,0x58FD,0x58FE,0x58FF,0x5900,0x5901,/* 0xD8-0xDF */ + 0x5903,0x5905,0x5906,0x5908,0x5909,0x590A,0x590B,0x590C,/* 0xE0-0xE7 */ + 0x590E,0x5910,0x5911,0x5912,0x5913,0x5917,0x5918,0x591B,/* 0xE8-0xEF */ + 0x591D,0x591E,0x5920,0x5921,0x5922,0x5923,0x5926,0x5928,/* 0xF0-0xF7 */ + 0x592C,0x5930,0x5932,0x5933,0x5935,0x5936,0x593B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x593D,0x593E,0x593F,0x5940,0x5943,0x5945,0x5946,0x594A,/* 0x40-0x47 */ + 0x594C,0x594D,0x5950,0x5952,0x5953,0x5959,0x595B,0x595C,/* 0x48-0x4F */ + 0x595D,0x595E,0x595F,0x5961,0x5963,0x5964,0x5966,0x5967,/* 0x50-0x57 */ + 0x5968,0x5969,0x596A,0x596B,0x596C,0x596D,0x596E,0x596F,/* 0x58-0x5F */ + 0x5970,0x5971,0x5972,0x5975,0x5977,0x597A,0x597B,0x597C,/* 0x60-0x67 */ + 0x597E,0x597F,0x5980,0x5985,0x5989,0x598B,0x598C,0x598E,/* 0x68-0x6F */ + 0x598F,0x5990,0x5991,0x5994,0x5995,0x5998,0x599A,0x599B,/* 0x70-0x77 */ + 0x599C,0x599D,0x599F,0x59A0,0x59A1,0x59A2,0x59A6,0x0000,/* 0x78-0x7F */ + + 0x59A7,0x59AC,0x59AD,0x59B0,0x59B1,0x59B3,0x59B4,0x59B5,/* 0x80-0x87 */ + 0x59B6,0x59B7,0x59B8,0x59BA,0x59BC,0x59BD,0x59BF,0x59C0,/* 0x88-0x8F */ + 0x59C1,0x59C2,0x59C3,0x59C4,0x59C5,0x59C7,0x59C8,0x59C9,/* 0x90-0x97 */ + 0x59CC,0x59CD,0x59CE,0x59CF,0x59D5,0x59D6,0x59D9,0x59DB,/* 0x98-0x9F */ + 0x59DE,0x59DF,0x59E0,0x59E1,0x59E2,0x59E4,0x59E6,0x59E7,/* 0xA0-0xA7 */ + 0x59E9,0x59EA,0x59EB,0x59ED,0x59EE,0x59EF,0x59F0,0x59F1,/* 0xA8-0xAF */ + 0x59F2,0x59F3,0x59F4,0x59F5,0x59F6,0x59F7,0x59F8,0x59FA,/* 0xB0-0xB7 */ + 0x59FC,0x59FD,0x59FE,0x5A00,0x5A02,0x5A0A,0x5A0B,0x5A0D,/* 0xB8-0xBF */ + 0x5A0E,0x5A0F,0x5A10,0x5A12,0x5A14,0x5A15,0x5A16,0x5A17,/* 0xC0-0xC7 */ + 0x5A19,0x5A1A,0x5A1B,0x5A1D,0x5A1E,0x5A21,0x5A22,0x5A24,/* 0xC8-0xCF */ + 0x5A26,0x5A27,0x5A28,0x5A2A,0x5A2B,0x5A2C,0x5A2D,0x5A2E,/* 0xD0-0xD7 */ + 0x5A2F,0x5A30,0x5A33,0x5A35,0x5A37,0x5A38,0x5A39,0x5A3A,/* 0xD8-0xDF */ + 0x5A3B,0x5A3D,0x5A3E,0x5A3F,0x5A41,0x5A42,0x5A43,0x5A44,/* 0xE0-0xE7 */ + 0x5A45,0x5A47,0x5A48,0x5A4B,0x5A4C,0x5A4D,0x5A4E,0x5A4F,/* 0xE8-0xEF */ + 0x5A50,0x5A51,0x5A52,0x5A53,0x5A54,0x5A56,0x5A57,0x5A58,/* 0xF0-0xF7 */ + 0x5A59,0x5A5B,0x5A5C,0x5A5D,0x5A5E,0x5A5F,0x5A60,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A61,0x5A63,0x5A64,0x5A65,0x5A66,0x5A68,0x5A69,0x5A6B,/* 0x40-0x47 */ + 0x5A6C,0x5A6D,0x5A6E,0x5A6F,0x5A70,0x5A71,0x5A72,0x5A73,/* 0x48-0x4F */ + 0x5A78,0x5A79,0x5A7B,0x5A7C,0x5A7D,0x5A7E,0x5A80,0x5A81,/* 0x50-0x57 */ + 0x5A82,0x5A83,0x5A84,0x5A85,0x5A86,0x5A87,0x5A88,0x5A89,/* 0x58-0x5F */ + 0x5A8A,0x5A8B,0x5A8C,0x5A8D,0x5A8E,0x5A8F,0x5A90,0x5A91,/* 0x60-0x67 */ + 0x5A93,0x5A94,0x5A95,0x5A96,0x5A97,0x5A98,0x5A99,0x5A9C,/* 0x68-0x6F */ + 0x5A9D,0x5A9E,0x5A9F,0x5AA0,0x5AA1,0x5AA2,0x5AA3,0x5AA4,/* 0x70-0x77 */ + 0x5AA5,0x5AA6,0x5AA7,0x5AA8,0x5AA9,0x5AAB,0x5AAC,0x0000,/* 0x78-0x7F */ + + 0x5AAD,0x5AAE,0x5AAF,0x5AB0,0x5AB1,0x5AB4,0x5AB6,0x5AB7,/* 0x80-0x87 */ + 0x5AB9,0x5ABA,0x5ABB,0x5ABC,0x5ABD,0x5ABF,0x5AC0,0x5AC3,/* 0x88-0x8F */ + 0x5AC4,0x5AC5,0x5AC6,0x5AC7,0x5AC8,0x5ACA,0x5ACB,0x5ACD,/* 0x90-0x97 */ + 0x5ACE,0x5ACF,0x5AD0,0x5AD1,0x5AD3,0x5AD5,0x5AD7,0x5AD9,/* 0x98-0x9F */ + 0x5ADA,0x5ADB,0x5ADD,0x5ADE,0x5ADF,0x5AE2,0x5AE4,0x5AE5,/* 0xA0-0xA7 */ + 0x5AE7,0x5AE8,0x5AEA,0x5AEC,0x5AED,0x5AEE,0x5AEF,0x5AF0,/* 0xA8-0xAF */ + 0x5AF2,0x5AF3,0x5AF4,0x5AF5,0x5AF6,0x5AF7,0x5AF8,0x5AF9,/* 0xB0-0xB7 */ + 0x5AFA,0x5AFB,0x5AFC,0x5AFD,0x5AFE,0x5AFF,0x5B00,0x5B01,/* 0xB8-0xBF */ + 0x5B02,0x5B03,0x5B04,0x5B05,0x5B06,0x5B07,0x5B08,0x5B0A,/* 0xC0-0xC7 */ + 0x5B0B,0x5B0C,0x5B0D,0x5B0E,0x5B0F,0x5B10,0x5B11,0x5B12,/* 0xC8-0xCF */ + 0x5B13,0x5B14,0x5B15,0x5B18,0x5B19,0x5B1A,0x5B1B,0x5B1C,/* 0xD0-0xD7 */ + 0x5B1D,0x5B1E,0x5B1F,0x5B20,0x5B21,0x5B22,0x5B23,0x5B24,/* 0xD8-0xDF */ + 0x5B25,0x5B26,0x5B27,0x5B28,0x5B29,0x5B2A,0x5B2B,0x5B2C,/* 0xE0-0xE7 */ + 0x5B2D,0x5B2E,0x5B2F,0x5B30,0x5B31,0x5B33,0x5B35,0x5B36,/* 0xE8-0xEF */ + 0x5B38,0x5B39,0x5B3A,0x5B3B,0x5B3C,0x5B3D,0x5B3E,0x5B3F,/* 0xF0-0xF7 */ + 0x5B41,0x5B42,0x5B43,0x5B44,0x5B45,0x5B46,0x5B47,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5B48,0x5B49,0x5B4A,0x5B4B,0x5B4C,0x5B4D,0x5B4E,0x5B4F,/* 0x40-0x47 */ + 0x5B52,0x5B56,0x5B5E,0x5B60,0x5B61,0x5B67,0x5B68,0x5B6B,/* 0x48-0x4F */ + 0x5B6D,0x5B6E,0x5B6F,0x5B72,0x5B74,0x5B76,0x5B77,0x5B78,/* 0x50-0x57 */ + 0x5B79,0x5B7B,0x5B7C,0x5B7E,0x5B7F,0x5B82,0x5B86,0x5B8A,/* 0x58-0x5F */ + 0x5B8D,0x5B8E,0x5B90,0x5B91,0x5B92,0x5B94,0x5B96,0x5B9F,/* 0x60-0x67 */ + 0x5BA7,0x5BA8,0x5BA9,0x5BAC,0x5BAD,0x5BAE,0x5BAF,0x5BB1,/* 0x68-0x6F */ + 0x5BB2,0x5BB7,0x5BBA,0x5BBB,0x5BBC,0x5BC0,0x5BC1,0x5BC3,/* 0x70-0x77 */ + 0x5BC8,0x5BC9,0x5BCA,0x5BCB,0x5BCD,0x5BCE,0x5BCF,0x0000,/* 0x78-0x7F */ + + 0x5BD1,0x5BD4,0x5BD5,0x5BD6,0x5BD7,0x5BD8,0x5BD9,0x5BDA,/* 0x80-0x87 */ + 0x5BDB,0x5BDC,0x5BE0,0x5BE2,0x5BE3,0x5BE6,0x5BE7,0x5BE9,/* 0x88-0x8F */ + 0x5BEA,0x5BEB,0x5BEC,0x5BED,0x5BEF,0x5BF1,0x5BF2,0x5BF3,/* 0x90-0x97 */ + 0x5BF4,0x5BF5,0x5BF6,0x5BF7,0x5BFD,0x5BFE,0x5C00,0x5C02,/* 0x98-0x9F */ + 0x5C03,0x5C05,0x5C07,0x5C08,0x5C0B,0x5C0C,0x5C0D,0x5C0E,/* 0xA0-0xA7 */ + 0x5C10,0x5C12,0x5C13,0x5C17,0x5C19,0x5C1B,0x5C1E,0x5C1F,/* 0xA8-0xAF */ + 
0x5C20,0x5C21,0x5C23,0x5C26,0x5C28,0x5C29,0x5C2A,0x5C2B,/* 0xB0-0xB7 */ + 0x5C2D,0x5C2E,0x5C2F,0x5C30,0x5C32,0x5C33,0x5C35,0x5C36,/* 0xB8-0xBF */ + 0x5C37,0x5C43,0x5C44,0x5C46,0x5C47,0x5C4C,0x5C4D,0x5C52,/* 0xC0-0xC7 */ + 0x5C53,0x5C54,0x5C56,0x5C57,0x5C58,0x5C5A,0x5C5B,0x5C5C,/* 0xC8-0xCF */ + 0x5C5D,0x5C5F,0x5C62,0x5C64,0x5C67,0x5C68,0x5C69,0x5C6A,/* 0xD0-0xD7 */ + 0x5C6B,0x5C6C,0x5C6D,0x5C70,0x5C72,0x5C73,0x5C74,0x5C75,/* 0xD8-0xDF */ + 0x5C76,0x5C77,0x5C78,0x5C7B,0x5C7C,0x5C7D,0x5C7E,0x5C80,/* 0xE0-0xE7 */ + 0x5C83,0x5C84,0x5C85,0x5C86,0x5C87,0x5C89,0x5C8A,0x5C8B,/* 0xE8-0xEF */ + 0x5C8E,0x5C8F,0x5C92,0x5C93,0x5C95,0x5C9D,0x5C9E,0x5C9F,/* 0xF0-0xF7 */ + 0x5CA0,0x5CA1,0x5CA4,0x5CA5,0x5CA6,0x5CA7,0x5CA8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5CAA,0x5CAE,0x5CAF,0x5CB0,0x5CB2,0x5CB4,0x5CB6,0x5CB9,/* 0x40-0x47 */ + 0x5CBA,0x5CBB,0x5CBC,0x5CBE,0x5CC0,0x5CC2,0x5CC3,0x5CC5,/* 0x48-0x4F */ + 0x5CC6,0x5CC7,0x5CC8,0x5CC9,0x5CCA,0x5CCC,0x5CCD,0x5CCE,/* 0x50-0x57 */ + 0x5CCF,0x5CD0,0x5CD1,0x5CD3,0x5CD4,0x5CD5,0x5CD6,0x5CD7,/* 0x58-0x5F */ + 0x5CD8,0x5CDA,0x5CDB,0x5CDC,0x5CDD,0x5CDE,0x5CDF,0x5CE0,/* 0x60-0x67 */ + 0x5CE2,0x5CE3,0x5CE7,0x5CE9,0x5CEB,0x5CEC,0x5CEE,0x5CEF,/* 0x68-0x6F */ + 0x5CF1,0x5CF2,0x5CF3,0x5CF4,0x5CF5,0x5CF6,0x5CF7,0x5CF8,/* 0x70-0x77 */ + 0x5CF9,0x5CFA,0x5CFC,0x5CFD,0x5CFE,0x5CFF,0x5D00,0x0000,/* 0x78-0x7F */ + + 0x5D01,0x5D04,0x5D05,0x5D08,0x5D09,0x5D0A,0x5D0B,0x5D0C,/* 0x80-0x87 */ + 0x5D0D,0x5D0F,0x5D10,0x5D11,0x5D12,0x5D13,0x5D15,0x5D17,/* 0x88-0x8F */ + 0x5D18,0x5D19,0x5D1A,0x5D1C,0x5D1D,0x5D1F,0x5D20,0x5D21,/* 0x90-0x97 */ + 0x5D22,0x5D23,0x5D25,0x5D28,0x5D2A,0x5D2B,0x5D2C,0x5D2F,/* 0x98-0x9F */ + 0x5D30,0x5D31,0x5D32,0x5D33,0x5D35,0x5D36,0x5D37,0x5D38,/* 0xA0-0xA7 */ + 0x5D39,0x5D3A,0x5D3B,0x5D3C,0x5D3F,0x5D40,0x5D41,0x5D42,/* 0xA8-0xAF */ + 0x5D43,0x5D44,0x5D45,0x5D46,0x5D48,0x5D49,0x5D4D,0x5D4E,/* 0xB0-0xB7 */ + 0x5D4F,0x5D50,0x5D51,0x5D52,0x5D53,0x5D54,0x5D55,0x5D56,/* 0xB8-0xBF */ + 0x5D57,0x5D59,0x5D5A,0x5D5C,0x5D5E,0x5D5F,0x5D60,0x5D61,/* 0xC0-0xC7 */ + 0x5D62,0x5D63,0x5D64,0x5D65,0x5D66,0x5D67,0x5D68,0x5D6A,/* 0xC8-0xCF */ + 0x5D6D,0x5D6E,0x5D70,0x5D71,0x5D72,0x5D73,0x5D75,0x5D76,/* 0xD0-0xD7 */ + 0x5D77,0x5D78,0x5D79,0x5D7A,0x5D7B,0x5D7C,0x5D7D,0x5D7E,/* 0xD8-0xDF */ + 0x5D7F,0x5D80,0x5D81,0x5D83,0x5D84,0x5D85,0x5D86,0x5D87,/* 0xE0-0xE7 */ + 0x5D88,0x5D89,0x5D8A,0x5D8B,0x5D8C,0x5D8D,0x5D8E,0x5D8F,/* 0xE8-0xEF */ + 0x5D90,0x5D91,0x5D92,0x5D93,0x5D94,0x5D95,0x5D96,0x5D97,/* 0xF0-0xF7 */ + 0x5D98,0x5D9A,0x5D9B,0x5D9C,0x5D9E,0x5D9F,0x5DA0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5DA1,0x5DA2,0x5DA3,0x5DA4,0x5DA5,0x5DA6,0x5DA7,0x5DA8,/* 0x40-0x47 */ + 0x5DA9,0x5DAA,0x5DAB,0x5DAC,0x5DAD,0x5DAE,0x5DAF,0x5DB0,/* 0x48-0x4F */ + 0x5DB1,0x5DB2,0x5DB3,0x5DB4,0x5DB5,0x5DB6,0x5DB8,0x5DB9,/* 0x50-0x57 */ + 0x5DBA,0x5DBB,0x5DBC,0x5DBD,0x5DBE,0x5DBF,0x5DC0,0x5DC1,/* 0x58-0x5F */ + 0x5DC2,0x5DC3,0x5DC4,0x5DC6,0x5DC7,0x5DC8,0x5DC9,0x5DCA,/* 0x60-0x67 */ + 0x5DCB,0x5DCC,0x5DCE,0x5DCF,0x5DD0,0x5DD1,0x5DD2,0x5DD3,/* 0x68-0x6F */ + 0x5DD4,0x5DD5,0x5DD6,0x5DD7,0x5DD8,0x5DD9,0x5DDA,0x5DDC,/* 0x70-0x77 */ + 0x5DDF,0x5DE0,0x5DE3,0x5DE4,0x5DEA,0x5DEC,0x5DED,0x0000,/* 0x78-0x7F */ + + 0x5DF0,0x5DF5,0x5DF6,0x5DF8,0x5DF9,0x5DFA,0x5DFB,0x5DFC,/* 0x80-0x87 */ + 0x5DFF,0x5E00,0x5E04,0x5E07,0x5E09,0x5E0A,0x5E0B,0x5E0D,/* 0x88-0x8F */ + 0x5E0E,0x5E12,0x5E13,0x5E17,0x5E1E,0x5E1F,0x5E20,0x5E21,/* 0x90-0x97 */ + 0x5E22,0x5E23,0x5E24,0x5E25,0x5E28,0x5E29,0x5E2A,0x5E2B,/* 0x98-0x9F */ + 0x5E2C,0x5E2F,0x5E30,0x5E32,0x5E33,0x5E34,0x5E35,0x5E36,/* 0xA0-0xA7 */ + 0x5E39,0x5E3A,0x5E3E,0x5E3F,0x5E40,0x5E41,0x5E43,0x5E46,/* 0xA8-0xAF */ + 0x5E47,0x5E48,0x5E49,0x5E4A,0x5E4B,0x5E4D,0x5E4E,0x5E4F,/* 0xB0-0xB7 */ + 0x5E50,0x5E51,0x5E52,0x5E53,0x5E56,0x5E57,0x5E58,0x5E59,/* 0xB8-0xBF */ + 0x5E5A,0x5E5C,0x5E5D,0x5E5F,0x5E60,0x5E63,0x5E64,0x5E65,/* 0xC0-0xC7 */ + 0x5E66,0x5E67,0x5E68,0x5E69,0x5E6A,0x5E6B,0x5E6C,0x5E6D,/* 0xC8-0xCF */ + 0x5E6E,0x5E6F,0x5E70,0x5E71,0x5E75,0x5E77,0x5E79,0x5E7E,/* 0xD0-0xD7 */ + 0x5E81,0x5E82,0x5E83,0x5E85,0x5E88,0x5E89,0x5E8C,0x5E8D,/* 0xD8-0xDF */ + 0x5E8E,0x5E92,0x5E98,0x5E9B,0x5E9D,0x5EA1,0x5EA2,0x5EA3,/* 0xE0-0xE7 */ + 0x5EA4,0x5EA8,0x5EA9,0x5EAA,0x5EAB,0x5EAC,0x5EAE,0x5EAF,/* 0xE8-0xEF */ + 0x5EB0,0x5EB1,0x5EB2,0x5EB4,0x5EBA,0x5EBB,0x5EBC,0x5EBD,/* 0xF0-0xF7 */ + 0x5EBF,0x5EC0,0x5EC1,0x5EC2,0x5EC3,0x5EC4,0x5EC5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5EC6,0x5EC7,0x5EC8,0x5ECB,0x5ECC,0x5ECD,0x5ECE,0x5ECF,/* 0x40-0x47 */ + 0x5ED0,0x5ED4,0x5ED5,0x5ED7,0x5ED8,0x5ED9,0x5EDA,0x5EDC,/* 0x48-0x4F */ + 0x5EDD,0x5EDE,0x5EDF,0x5EE0,0x5EE1,0x5EE2,0x5EE3,0x5EE4,/* 0x50-0x57 */ + 0x5EE5,0x5EE6,0x5EE7,0x5EE9,0x5EEB,0x5EEC,0x5EED,0x5EEE,/* 0x58-0x5F */ + 0x5EEF,0x5EF0,0x5EF1,0x5EF2,0x5EF3,0x5EF5,0x5EF8,0x5EF9,/* 0x60-0x67 */ + 0x5EFB,0x5EFC,0x5EFD,0x5F05,0x5F06,0x5F07,0x5F09,0x5F0C,/* 0x68-0x6F */ + 0x5F0D,0x5F0E,0x5F10,0x5F12,0x5F14,0x5F16,0x5F19,0x5F1A,/* 0x70-0x77 */ + 0x5F1C,0x5F1D,0x5F1E,0x5F21,0x5F22,0x5F23,0x5F24,0x0000,/* 0x78-0x7F */ + + 0x5F28,0x5F2B,0x5F2C,0x5F2E,0x5F30,0x5F32,0x5F33,0x5F34,/* 0x80-0x87 */ + 0x5F35,0x5F36,0x5F37,0x5F38,0x5F3B,0x5F3D,0x5F3E,0x5F3F,/* 0x88-0x8F */ + 0x5F41,0x5F42,0x5F43,0x5F44,0x5F45,0x5F46,0x5F47,0x5F48,/* 0x90-0x97 */ + 0x5F49,0x5F4A,0x5F4B,0x5F4C,0x5F4D,0x5F4E,0x5F4F,0x5F51,/* 0x98-0x9F */ + 
0x5F54,0x5F59,0x5F5A,0x5F5B,0x5F5C,0x5F5E,0x5F5F,0x5F60,/* 0xA0-0xA7 */ + 0x5F63,0x5F65,0x5F67,0x5F68,0x5F6B,0x5F6E,0x5F6F,0x5F72,/* 0xA8-0xAF */ + 0x5F74,0x5F75,0x5F76,0x5F78,0x5F7A,0x5F7D,0x5F7E,0x5F7F,/* 0xB0-0xB7 */ + 0x5F83,0x5F86,0x5F8D,0x5F8E,0x5F8F,0x5F91,0x5F93,0x5F94,/* 0xB8-0xBF */ + 0x5F96,0x5F9A,0x5F9B,0x5F9D,0x5F9E,0x5F9F,0x5FA0,0x5FA2,/* 0xC0-0xC7 */ + 0x5FA3,0x5FA4,0x5FA5,0x5FA6,0x5FA7,0x5FA9,0x5FAB,0x5FAC,/* 0xC8-0xCF */ + 0x5FAF,0x5FB0,0x5FB1,0x5FB2,0x5FB3,0x5FB4,0x5FB6,0x5FB8,/* 0xD0-0xD7 */ + 0x5FB9,0x5FBA,0x5FBB,0x5FBE,0x5FBF,0x5FC0,0x5FC1,0x5FC2,/* 0xD8-0xDF */ + 0x5FC7,0x5FC8,0x5FCA,0x5FCB,0x5FCE,0x5FD3,0x5FD4,0x5FD5,/* 0xE0-0xE7 */ + 0x5FDA,0x5FDB,0x5FDC,0x5FDE,0x5FDF,0x5FE2,0x5FE3,0x5FE5,/* 0xE8-0xEF */ + 0x5FE6,0x5FE8,0x5FE9,0x5FEC,0x5FEF,0x5FF0,0x5FF2,0x5FF3,/* 0xF0-0xF7 */ + 0x5FF4,0x5FF6,0x5FF7,0x5FF9,0x5FFA,0x5FFC,0x6007,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6008,0x6009,0x600B,0x600C,0x6010,0x6011,0x6013,0x6017,/* 0x40-0x47 */ + 0x6018,0x601A,0x601E,0x601F,0x6022,0x6023,0x6024,0x602C,/* 0x48-0x4F */ + 0x602D,0x602E,0x6030,0x6031,0x6032,0x6033,0x6034,0x6036,/* 0x50-0x57 */ + 0x6037,0x6038,0x6039,0x603A,0x603D,0x603E,0x6040,0x6044,/* 0x58-0x5F */ + 0x6045,0x6046,0x6047,0x6048,0x6049,0x604A,0x604C,0x604E,/* 0x60-0x67 */ + 0x604F,0x6051,0x6053,0x6054,0x6056,0x6057,0x6058,0x605B,/* 0x68-0x6F */ + 0x605C,0x605E,0x605F,0x6060,0x6061,0x6065,0x6066,0x606E,/* 0x70-0x77 */ + 0x6071,0x6072,0x6074,0x6075,0x6077,0x607E,0x6080,0x0000,/* 0x78-0x7F */ + + 0x6081,0x6082,0x6085,0x6086,0x6087,0x6088,0x608A,0x608B,/* 0x80-0x87 */ + 0x608E,0x608F,0x6090,0x6091,0x6093,0x6095,0x6097,0x6098,/* 0x88-0x8F */ + 0x6099,0x609C,0x609E,0x60A1,0x60A2,0x60A4,0x60A5,0x60A7,/* 0x90-0x97 */ + 0x60A9,0x60AA,0x60AE,0x60B0,0x60B3,0x60B5,0x60B6,0x60B7,/* 0x98-0x9F */ + 0x60B9,0x60BA,0x60BD,0x60BE,0x60BF,0x60C0,0x60C1,0x60C2,/* 0xA0-0xA7 */ + 0x60C3,0x60C4,0x60C7,0x60C8,0x60C9,0x60CC,0x60CD,0x60CE,/* 0xA8-0xAF */ + 0x60CF,0x60D0,0x60D2,0x60D3,0x60D4,0x60D6,0x60D7,0x60D9,/* 0xB0-0xB7 */ + 0x60DB,0x60DE,0x60E1,0x60E2,0x60E3,0x60E4,0x60E5,0x60EA,/* 0xB8-0xBF */ + 0x60F1,0x60F2,0x60F5,0x60F7,0x60F8,0x60FB,0x60FC,0x60FD,/* 0xC0-0xC7 */ + 0x60FE,0x60FF,0x6102,0x6103,0x6104,0x6105,0x6107,0x610A,/* 0xC8-0xCF */ + 0x610B,0x610C,0x6110,0x6111,0x6112,0x6113,0x6114,0x6116,/* 0xD0-0xD7 */ + 0x6117,0x6118,0x6119,0x611B,0x611C,0x611D,0x611E,0x6121,/* 0xD8-0xDF */ + 0x6122,0x6125,0x6128,0x6129,0x612A,0x612C,0x612D,0x612E,/* 0xE0-0xE7 */ + 0x612F,0x6130,0x6131,0x6132,0x6133,0x6134,0x6135,0x6136,/* 0xE8-0xEF */ + 0x6137,0x6138,0x6139,0x613A,0x613B,0x613C,0x613D,0x613E,/* 0xF0-0xF7 */ + 0x6140,0x6141,0x6142,0x6143,0x6144,0x6145,0x6146,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6147,0x6149,0x614B,0x614D,0x614F,0x6150,0x6152,0x6153,/* 0x40-0x47 */ + 0x6154,0x6156,0x6157,0x6158,0x6159,0x615A,0x615B,0x615C,/* 0x48-0x4F */ + 0x615E,0x615F,0x6160,0x6161,0x6163,0x6164,0x6165,0x6166,/* 0x50-0x57 */ + 0x6169,0x616A,0x616B,0x616C,0x616D,0x616E,0x616F,0x6171,/* 0x58-0x5F */ + 0x6172,0x6173,0x6174,0x6176,0x6178,0x6179,0x617A,0x617B,/* 0x60-0x67 */ + 0x617C,0x617D,0x617E,0x617F,0x6180,0x6181,0x6182,0x6183,/* 0x68-0x6F */ + 0x6184,0x6185,0x6186,0x6187,0x6188,0x6189,0x618A,0x618C,/* 0x70-0x77 */ + 0x618D,0x618F,0x6190,0x6191,0x6192,0x6193,0x6195,0x0000,/* 0x78-0x7F */ + + 0x6196,0x6197,0x6198,0x6199,0x619A,0x619B,0x619C,0x619E,/* 0x80-0x87 */ + 0x619F,0x61A0,0x61A1,0x61A2,0x61A3,0x61A4,0x61A5,0x61A6,/* 0x88-0x8F */ + 0x61AA,0x61AB,0x61AD,0x61AE,0x61AF,0x61B0,0x61B1,0x61B2,/* 0x90-0x97 */ + 0x61B3,0x61B4,0x61B5,0x61B6,0x61B8,0x61B9,0x61BA,0x61BB,/* 0x98-0x9F */ + 0x61BC,0x61BD,0x61BF,0x61C0,0x61C1,0x61C3,0x61C4,0x61C5,/* 0xA0-0xA7 */ + 0x61C6,0x61C7,0x61C9,0x61CC,0x61CD,0x61CE,0x61CF,0x61D0,/* 0xA8-0xAF */ + 0x61D3,0x61D5,0x61D6,0x61D7,0x61D8,0x61D9,0x61DA,0x61DB,/* 0xB0-0xB7 */ + 0x61DC,0x61DD,0x61DE,0x61DF,0x61E0,0x61E1,0x61E2,0x61E3,/* 0xB8-0xBF */ + 0x61E4,0x61E5,0x61E7,0x61E8,0x61E9,0x61EA,0x61EB,0x61EC,/* 0xC0-0xC7 */ + 0x61ED,0x61EE,0x61EF,0x61F0,0x61F1,0x61F2,0x61F3,0x61F4,/* 0xC8-0xCF */ + 0x61F6,0x61F7,0x61F8,0x61F9,0x61FA,0x61FB,0x61FC,0x61FD,/* 0xD0-0xD7 */ + 0x61FE,0x6200,0x6201,0x6202,0x6203,0x6204,0x6205,0x6207,/* 0xD8-0xDF */ + 0x6209,0x6213,0x6214,0x6219,0x621C,0x621D,0x621E,0x6220,/* 0xE0-0xE7 */ + 0x6223,0x6226,0x6227,0x6228,0x6229,0x622B,0x622D,0x622F,/* 0xE8-0xEF */ + 0x6230,0x6231,0x6232,0x6235,0x6236,0x6238,0x6239,0x623A,/* 0xF0-0xF7 */ + 0x623B,0x623C,0x6242,0x6244,0x6245,0x6246,0x624A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x624F,0x6250,0x6255,0x6256,0x6257,0x6259,0x625A,0x625C,/* 0x40-0x47 */ + 0x625D,0x625E,0x625F,0x6260,0x6261,0x6262,0x6264,0x6265,/* 0x48-0x4F */ + 0x6268,0x6271,0x6272,0x6274,0x6275,0x6277,0x6278,0x627A,/* 0x50-0x57 */ + 0x627B,0x627D,0x6281,0x6282,0x6283,0x6285,0x6286,0x6287,/* 0x58-0x5F */ + 0x6288,0x628B,0x628C,0x628D,0x628E,0x628F,0x6290,0x6294,/* 0x60-0x67 */ + 0x6299,0x629C,0x629D,0x629E,0x62A3,0x62A6,0x62A7,0x62A9,/* 0x68-0x6F */ + 0x62AA,0x62AD,0x62AE,0x62AF,0x62B0,0x62B2,0x62B3,0x62B4,/* 0x70-0x77 */ + 0x62B6,0x62B7,0x62B8,0x62BA,0x62BE,0x62C0,0x62C1,0x0000,/* 0x78-0x7F */ + + 0x62C3,0x62CB,0x62CF,0x62D1,0x62D5,0x62DD,0x62DE,0x62E0,/* 0x80-0x87 */ + 0x62E1,0x62E4,0x62EA,0x62EB,0x62F0,0x62F2,0x62F5,0x62F8,/* 0x88-0x8F */ + 
0x62F9,0x62FA,0x62FB,0x6300,0x6303,0x6304,0x6305,0x6306,/* 0x90-0x97 */ + 0x630A,0x630B,0x630C,0x630D,0x630F,0x6310,0x6312,0x6313,/* 0x98-0x9F */ + 0x6314,0x6315,0x6317,0x6318,0x6319,0x631C,0x6326,0x6327,/* 0xA0-0xA7 */ + 0x6329,0x632C,0x632D,0x632E,0x6330,0x6331,0x6333,0x6334,/* 0xA8-0xAF */ + 0x6335,0x6336,0x6337,0x6338,0x633B,0x633C,0x633E,0x633F,/* 0xB0-0xB7 */ + 0x6340,0x6341,0x6344,0x6347,0x6348,0x634A,0x6351,0x6352,/* 0xB8-0xBF */ + 0x6353,0x6354,0x6356,0x6357,0x6358,0x6359,0x635A,0x635B,/* 0xC0-0xC7 */ + 0x635C,0x635D,0x6360,0x6364,0x6365,0x6366,0x6368,0x636A,/* 0xC8-0xCF */ + 0x636B,0x636C,0x636F,0x6370,0x6372,0x6373,0x6374,0x6375,/* 0xD0-0xD7 */ + 0x6378,0x6379,0x637C,0x637D,0x637E,0x637F,0x6381,0x6383,/* 0xD8-0xDF */ + 0x6384,0x6385,0x6386,0x638B,0x638D,0x6391,0x6393,0x6394,/* 0xE0-0xE7 */ + 0x6395,0x6397,0x6399,0x639A,0x639B,0x639C,0x639D,0x639E,/* 0xE8-0xEF */ + 0x639F,0x63A1,0x63A4,0x63A6,0x63AB,0x63AF,0x63B1,0x63B2,/* 0xF0-0xF7 */ + 0x63B5,0x63B6,0x63B9,0x63BB,0x63BD,0x63BF,0x63C0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x63C1,0x63C2,0x63C3,0x63C5,0x63C7,0x63C8,0x63CA,0x63CB,/* 0x40-0x47 */ + 0x63CC,0x63D1,0x63D3,0x63D4,0x63D5,0x63D7,0x63D8,0x63D9,/* 0x48-0x4F */ + 0x63DA,0x63DB,0x63DC,0x63DD,0x63DF,0x63E2,0x63E4,0x63E5,/* 0x50-0x57 */ + 0x63E6,0x63E7,0x63E8,0x63EB,0x63EC,0x63EE,0x63EF,0x63F0,/* 0x58-0x5F */ + 0x63F1,0x63F3,0x63F5,0x63F7,0x63F9,0x63FA,0x63FB,0x63FC,/* 0x60-0x67 */ + 0x63FE,0x6403,0x6404,0x6406,0x6407,0x6408,0x6409,0x640A,/* 0x68-0x6F */ + 0x640D,0x640E,0x6411,0x6412,0x6415,0x6416,0x6417,0x6418,/* 0x70-0x77 */ + 0x6419,0x641A,0x641D,0x641F,0x6422,0x6423,0x6424,0x0000,/* 0x78-0x7F */ + + 0x6425,0x6427,0x6428,0x6429,0x642B,0x642E,0x642F,0x6430,/* 0x80-0x87 */ + 0x6431,0x6432,0x6433,0x6435,0x6436,0x6437,0x6438,0x6439,/* 0x88-0x8F */ + 0x643B,0x643C,0x643E,0x6440,0x6442,0x6443,0x6449,0x644B,/* 0x90-0x97 */ + 0x644C,0x644D,0x644E,0x644F,0x6450,0x6451,0x6453,0x6455,/* 0x98-0x9F */ + 0x6456,0x6457,0x6459,0x645A,0x645B,0x645C,0x645D,0x645F,/* 0xA0-0xA7 */ + 0x6460,0x6461,0x6462,0x6463,0x6464,0x6465,0x6466,0x6468,/* 0xA8-0xAF */ + 0x646A,0x646B,0x646C,0x646E,0x646F,0x6470,0x6471,0x6472,/* 0xB0-0xB7 */ + 0x6473,0x6474,0x6475,0x6476,0x6477,0x647B,0x647C,0x647D,/* 0xB8-0xBF */ + 0x647E,0x647F,0x6480,0x6481,0x6483,0x6486,0x6488,0x6489,/* 0xC0-0xC7 */ + 0x648A,0x648B,0x648C,0x648D,0x648E,0x648F,0x6490,0x6493,/* 0xC8-0xCF */ + 0x6494,0x6497,0x6498,0x649A,0x649B,0x649C,0x649D,0x649F,/* 0xD0-0xD7 */ + 0x64A0,0x64A1,0x64A2,0x64A3,0x64A5,0x64A6,0x64A7,0x64A8,/* 0xD8-0xDF */ + 0x64AA,0x64AB,0x64AF,0x64B1,0x64B2,0x64B3,0x64B4,0x64B6,/* 0xE0-0xE7 */ + 0x64B9,0x64BB,0x64BD,0x64BE,0x64BF,0x64C1,0x64C3,0x64C4,/* 0xE8-0xEF */ + 0x64C6,0x64C7,0x64C8,0x64C9,0x64CA,0x64CB,0x64CC,0x64CF,/* 0xF0-0xF7 */ + 0x64D1,0x64D3,0x64D4,0x64D5,0x64D6,0x64D9,0x64DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x64DB,0x64DC,0x64DD,0x64DF,0x64E0,0x64E1,0x64E3,0x64E5,/* 0x40-0x47 */ + 0x64E7,0x64E8,0x64E9,0x64EA,0x64EB,0x64EC,0x64ED,0x64EE,/* 0x48-0x4F */ + 0x64EF,0x64F0,0x64F1,0x64F2,0x64F3,0x64F4,0x64F5,0x64F6,/* 0x50-0x57 */ + 0x64F7,0x64F8,0x64F9,0x64FA,0x64FB,0x64FC,0x64FD,0x64FE,/* 0x58-0x5F */ + 0x64FF,0x6501,0x6502,0x6503,0x6504,0x6505,0x6506,0x6507,/* 0x60-0x67 */ + 0x6508,0x650A,0x650B,0x650C,0x650D,0x650E,0x650F,0x6510,/* 0x68-0x6F */ + 0x6511,0x6513,0x6514,0x6515,0x6516,0x6517,0x6519,0x651A,/* 0x70-0x77 */ + 0x651B,0x651C,0x651D,0x651E,0x651F,0x6520,0x6521,0x0000,/* 0x78-0x7F */ + + 0x6522,0x6523,0x6524,0x6526,0x6527,0x6528,0x6529,0x652A,/* 0x80-0x87 */ + 0x652C,0x652D,0x6530,0x6531,0x6532,0x6533,0x6537,0x653A,/* 0x88-0x8F */ + 0x653C,0x653D,0x6540,0x6541,0x6542,0x6543,0x6544,0x6546,/* 0x90-0x97 */ + 0x6547,0x654A,0x654B,0x654D,0x654E,0x6550,0x6552,0x6553,/* 0x98-0x9F */ + 0x6554,0x6557,0x6558,0x655A,0x655C,0x655F,0x6560,0x6561,/* 0xA0-0xA7 */ + 0x6564,0x6565,0x6567,0x6568,0x6569,0x656A,0x656D,0x656E,/* 0xA8-0xAF */ + 0x656F,0x6571,0x6573,0x6575,0x6576,0x6578,0x6579,0x657A,/* 0xB0-0xB7 */ + 0x657B,0x657C,0x657D,0x657E,0x657F,0x6580,0x6581,0x6582,/* 0xB8-0xBF */ + 0x6583,0x6584,0x6585,0x6586,0x6588,0x6589,0x658A,0x658D,/* 0xC0-0xC7 */ + 0x658E,0x658F,0x6592,0x6594,0x6595,0x6596,0x6598,0x659A,/* 0xC8-0xCF */ + 0x659D,0x659E,0x65A0,0x65A2,0x65A3,0x65A6,0x65A8,0x65AA,/* 0xD0-0xD7 */ + 0x65AC,0x65AE,0x65B1,0x65B2,0x65B3,0x65B4,0x65B5,0x65B6,/* 0xD8-0xDF */ + 0x65B7,0x65B8,0x65BA,0x65BB,0x65BE,0x65BF,0x65C0,0x65C2,/* 0xE0-0xE7 */ + 0x65C7,0x65C8,0x65C9,0x65CA,0x65CD,0x65D0,0x65D1,0x65D3,/* 0xE8-0xEF */ + 0x65D4,0x65D5,0x65D8,0x65D9,0x65DA,0x65DB,0x65DC,0x65DD,/* 0xF0-0xF7 */ + 0x65DE,0x65DF,0x65E1,0x65E3,0x65E4,0x65EA,0x65EB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x65F2,0x65F3,0x65F4,0x65F5,0x65F8,0x65F9,0x65FB,0x65FC,/* 0x40-0x47 */ + 0x65FD,0x65FE,0x65FF,0x6601,0x6604,0x6605,0x6607,0x6608,/* 0x48-0x4F */ + 0x6609,0x660B,0x660D,0x6610,0x6611,0x6612,0x6616,0x6617,/* 0x50-0x57 */ + 0x6618,0x661A,0x661B,0x661C,0x661E,0x6621,0x6622,0x6623,/* 0x58-0x5F */ + 0x6624,0x6626,0x6629,0x662A,0x662B,0x662C,0x662E,0x6630,/* 0x60-0x67 */ + 0x6632,0x6633,0x6637,0x6638,0x6639,0x663A,0x663B,0x663D,/* 0x68-0x6F */ + 0x663F,0x6640,0x6642,0x6644,0x6645,0x6646,0x6647,0x6648,/* 0x70-0x77 */ + 0x6649,0x664A,0x664D,0x664E,0x6650,0x6651,0x6658,0x0000,/* 0x78-0x7F */ + + 
0x6659,0x665B,0x665C,0x665D,0x665E,0x6660,0x6662,0x6663,/* 0x80-0x87 */ + 0x6665,0x6667,0x6669,0x666A,0x666B,0x666C,0x666D,0x6671,/* 0x88-0x8F */ + 0x6672,0x6673,0x6675,0x6678,0x6679,0x667B,0x667C,0x667D,/* 0x90-0x97 */ + 0x667F,0x6680,0x6681,0x6683,0x6685,0x6686,0x6688,0x6689,/* 0x98-0x9F */ + 0x668A,0x668B,0x668D,0x668E,0x668F,0x6690,0x6692,0x6693,/* 0xA0-0xA7 */ + 0x6694,0x6695,0x6698,0x6699,0x669A,0x669B,0x669C,0x669E,/* 0xA8-0xAF */ + 0x669F,0x66A0,0x66A1,0x66A2,0x66A3,0x66A4,0x66A5,0x66A6,/* 0xB0-0xB7 */ + 0x66A9,0x66AA,0x66AB,0x66AC,0x66AD,0x66AF,0x66B0,0x66B1,/* 0xB8-0xBF */ + 0x66B2,0x66B3,0x66B5,0x66B6,0x66B7,0x66B8,0x66BA,0x66BB,/* 0xC0-0xC7 */ + 0x66BC,0x66BD,0x66BF,0x66C0,0x66C1,0x66C2,0x66C3,0x66C4,/* 0xC8-0xCF */ + 0x66C5,0x66C6,0x66C7,0x66C8,0x66C9,0x66CA,0x66CB,0x66CC,/* 0xD0-0xD7 */ + 0x66CD,0x66CE,0x66CF,0x66D0,0x66D1,0x66D2,0x66D3,0x66D4,/* 0xD8-0xDF */ + 0x66D5,0x66D6,0x66D7,0x66D8,0x66DA,0x66DE,0x66DF,0x66E0,/* 0xE0-0xE7 */ + 0x66E1,0x66E2,0x66E3,0x66E4,0x66E5,0x66E7,0x66E8,0x66EA,/* 0xE8-0xEF */ + 0x66EB,0x66EC,0x66ED,0x66EE,0x66EF,0x66F1,0x66F5,0x66F6,/* 0xF0-0xF7 */ + 0x66F8,0x66FA,0x66FB,0x66FD,0x6701,0x6702,0x6703,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6704,0x6705,0x6706,0x6707,0x670C,0x670E,0x670F,0x6711,/* 0x40-0x47 */ + 0x6712,0x6713,0x6716,0x6718,0x6719,0x671A,0x671C,0x671E,/* 0x48-0x4F */ + 0x6720,0x6721,0x6722,0x6723,0x6724,0x6725,0x6727,0x6729,/* 0x50-0x57 */ + 0x672E,0x6730,0x6732,0x6733,0x6736,0x6737,0x6738,0x6739,/* 0x58-0x5F */ + 0x673B,0x673C,0x673E,0x673F,0x6741,0x6744,0x6745,0x6747,/* 0x60-0x67 */ + 0x674A,0x674B,0x674D,0x6752,0x6754,0x6755,0x6757,0x6758,/* 0x68-0x6F */ + 0x6759,0x675A,0x675B,0x675D,0x6762,0x6763,0x6764,0x6766,/* 0x70-0x77 */ + 0x6767,0x676B,0x676C,0x676E,0x6771,0x6774,0x6776,0x0000,/* 0x78-0x7F */ + + 0x6778,0x6779,0x677A,0x677B,0x677D,0x6780,0x6782,0x6783,/* 0x80-0x87 */ + 0x6785,0x6786,0x6788,0x678A,0x678C,0x678D,0x678E,0x678F,/* 0x88-0x8F */ + 0x6791,0x6792,0x6793,0x6794,0x6796,0x6799,0x679B,0x679F,/* 0x90-0x97 */ + 0x67A0,0x67A1,0x67A4,0x67A6,0x67A9,0x67AC,0x67AE,0x67B1,/* 0x98-0x9F */ + 0x67B2,0x67B4,0x67B9,0x67BA,0x67BB,0x67BC,0x67BD,0x67BE,/* 0xA0-0xA7 */ + 0x67BF,0x67C0,0x67C2,0x67C5,0x67C6,0x67C7,0x67C8,0x67C9,/* 0xA8-0xAF */ + 0x67CA,0x67CB,0x67CC,0x67CD,0x67CE,0x67D5,0x67D6,0x67D7,/* 0xB0-0xB7 */ + 0x67DB,0x67DF,0x67E1,0x67E3,0x67E4,0x67E6,0x67E7,0x67E8,/* 0xB8-0xBF */ + 0x67EA,0x67EB,0x67ED,0x67EE,0x67F2,0x67F5,0x67F6,0x67F7,/* 0xC0-0xC7 */ + 0x67F8,0x67F9,0x67FA,0x67FB,0x67FC,0x67FE,0x6801,0x6802,/* 0xC8-0xCF */ + 0x6803,0x6804,0x6806,0x680D,0x6810,0x6812,0x6814,0x6815,/* 0xD0-0xD7 */ + 0x6818,0x6819,0x681A,0x681B,0x681C,0x681E,0x681F,0x6820,/* 0xD8-0xDF */ + 0x6822,0x6823,0x6824,0x6825,0x6826,0x6827,0x6828,0x682B,/* 0xE0-0xE7 */ + 0x682C,0x682D,0x682E,0x682F,0x6830,0x6831,0x6834,0x6835,/* 0xE8-0xEF */ + 0x6836,0x683A,0x683B,0x683F,0x6847,0x684B,0x684D,0x684F,/* 0xF0-0xF7 */ + 
0x6852,0x6856,0x6857,0x6858,0x6859,0x685A,0x685B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x685C,0x685D,0x685E,0x685F,0x686A,0x686C,0x686D,0x686E,/* 0x40-0x47 */ + 0x686F,0x6870,0x6871,0x6872,0x6873,0x6875,0x6878,0x6879,/* 0x48-0x4F */ + 0x687A,0x687B,0x687C,0x687D,0x687E,0x687F,0x6880,0x6882,/* 0x50-0x57 */ + 0x6884,0x6887,0x6888,0x6889,0x688A,0x688B,0x688C,0x688D,/* 0x58-0x5F */ + 0x688E,0x6890,0x6891,0x6892,0x6894,0x6895,0x6896,0x6898,/* 0x60-0x67 */ + 0x6899,0x689A,0x689B,0x689C,0x689D,0x689E,0x689F,0x68A0,/* 0x68-0x6F */ + 0x68A1,0x68A3,0x68A4,0x68A5,0x68A9,0x68AA,0x68AB,0x68AC,/* 0x70-0x77 */ + 0x68AE,0x68B1,0x68B2,0x68B4,0x68B6,0x68B7,0x68B8,0x0000,/* 0x78-0x7F */ + + 0x68B9,0x68BA,0x68BB,0x68BC,0x68BD,0x68BE,0x68BF,0x68C1,/* 0x80-0x87 */ + 0x68C3,0x68C4,0x68C5,0x68C6,0x68C7,0x68C8,0x68CA,0x68CC,/* 0x88-0x8F */ + 0x68CE,0x68CF,0x68D0,0x68D1,0x68D3,0x68D4,0x68D6,0x68D7,/* 0x90-0x97 */ + 0x68D9,0x68DB,0x68DC,0x68DD,0x68DE,0x68DF,0x68E1,0x68E2,/* 0x98-0x9F */ + 0x68E4,0x68E5,0x68E6,0x68E7,0x68E8,0x68E9,0x68EA,0x68EB,/* 0xA0-0xA7 */ + 0x68EC,0x68ED,0x68EF,0x68F2,0x68F3,0x68F4,0x68F6,0x68F7,/* 0xA8-0xAF */ + 0x68F8,0x68FB,0x68FD,0x68FE,0x68FF,0x6900,0x6902,0x6903,/* 0xB0-0xB7 */ + 0x6904,0x6906,0x6907,0x6908,0x6909,0x690A,0x690C,0x690F,/* 0xB8-0xBF */ + 0x6911,0x6913,0x6914,0x6915,0x6916,0x6917,0x6918,0x6919,/* 0xC0-0xC7 */ + 0x691A,0x691B,0x691C,0x691D,0x691E,0x6921,0x6922,0x6923,/* 0xC8-0xCF */ + 0x6925,0x6926,0x6927,0x6928,0x6929,0x692A,0x692B,0x692C,/* 0xD0-0xD7 */ + 0x692E,0x692F,0x6931,0x6932,0x6933,0x6935,0x6936,0x6937,/* 0xD8-0xDF */ + 0x6938,0x693A,0x693B,0x693C,0x693E,0x6940,0x6941,0x6943,/* 0xE0-0xE7 */ + 0x6944,0x6945,0x6946,0x6947,0x6948,0x6949,0x694A,0x694B,/* 0xE8-0xEF */ + 0x694C,0x694D,0x694E,0x694F,0x6950,0x6951,0x6952,0x6953,/* 0xF0-0xF7 */ + 0x6955,0x6956,0x6958,0x6959,0x695B,0x695C,0x695F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6961,0x6962,0x6964,0x6965,0x6967,0x6968,0x6969,0x696A,/* 0x40-0x47 */ + 0x696C,0x696D,0x696F,0x6970,0x6972,0x6973,0x6974,0x6975,/* 0x48-0x4F */ + 0x6976,0x697A,0x697B,0x697D,0x697E,0x697F,0x6981,0x6983,/* 0x50-0x57 */ + 0x6985,0x698A,0x698B,0x698C,0x698E,0x698F,0x6990,0x6991,/* 0x58-0x5F */ + 0x6992,0x6993,0x6996,0x6997,0x6999,0x699A,0x699D,0x699E,/* 0x60-0x67 */ + 0x699F,0x69A0,0x69A1,0x69A2,0x69A3,0x69A4,0x69A5,0x69A6,/* 
0x68-0x6F */ + 0x69A9,0x69AA,0x69AC,0x69AE,0x69AF,0x69B0,0x69B2,0x69B3,/* 0x70-0x77 */ + 0x69B5,0x69B6,0x69B8,0x69B9,0x69BA,0x69BC,0x69BD,0x0000,/* 0x78-0x7F */ + + 0x69BE,0x69BF,0x69C0,0x69C2,0x69C3,0x69C4,0x69C5,0x69C6,/* 0x80-0x87 */ + 0x69C7,0x69C8,0x69C9,0x69CB,0x69CD,0x69CF,0x69D1,0x69D2,/* 0x88-0x8F */ + 0x69D3,0x69D5,0x69D6,0x69D7,0x69D8,0x69D9,0x69DA,0x69DC,/* 0x90-0x97 */ + 0x69DD,0x69DE,0x69E1,0x69E2,0x69E3,0x69E4,0x69E5,0x69E6,/* 0x98-0x9F */ + 0x69E7,0x69E8,0x69E9,0x69EA,0x69EB,0x69EC,0x69EE,0x69EF,/* 0xA0-0xA7 */ + 0x69F0,0x69F1,0x69F3,0x69F4,0x69F5,0x69F6,0x69F7,0x69F8,/* 0xA8-0xAF */ + 0x69F9,0x69FA,0x69FB,0x69FC,0x69FE,0x6A00,0x6A01,0x6A02,/* 0xB0-0xB7 */ + 0x6A03,0x6A04,0x6A05,0x6A06,0x6A07,0x6A08,0x6A09,0x6A0B,/* 0xB8-0xBF */ + 0x6A0C,0x6A0D,0x6A0E,0x6A0F,0x6A10,0x6A11,0x6A12,0x6A13,/* 0xC0-0xC7 */ + 0x6A14,0x6A15,0x6A16,0x6A19,0x6A1A,0x6A1B,0x6A1C,0x6A1D,/* 0xC8-0xCF */ + 0x6A1E,0x6A20,0x6A22,0x6A23,0x6A24,0x6A25,0x6A26,0x6A27,/* 0xD0-0xD7 */ + 0x6A29,0x6A2B,0x6A2C,0x6A2D,0x6A2E,0x6A30,0x6A32,0x6A33,/* 0xD8-0xDF */ + 0x6A34,0x6A36,0x6A37,0x6A38,0x6A39,0x6A3A,0x6A3B,0x6A3C,/* 0xE0-0xE7 */ + 0x6A3F,0x6A40,0x6A41,0x6A42,0x6A43,0x6A45,0x6A46,0x6A48,/* 0xE8-0xEF */ + 0x6A49,0x6A4A,0x6A4B,0x6A4C,0x6A4D,0x6A4E,0x6A4F,0x6A51,/* 0xF0-0xF7 */ + 0x6A52,0x6A53,0x6A54,0x6A55,0x6A56,0x6A57,0x6A5A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A5C,0x6A5D,0x6A5E,0x6A5F,0x6A60,0x6A62,0x6A63,0x6A64,/* 0x40-0x47 */ + 0x6A66,0x6A67,0x6A68,0x6A69,0x6A6A,0x6A6B,0x6A6C,0x6A6D,/* 0x48-0x4F */ + 0x6A6E,0x6A6F,0x6A70,0x6A72,0x6A73,0x6A74,0x6A75,0x6A76,/* 0x50-0x57 */ + 0x6A77,0x6A78,0x6A7A,0x6A7B,0x6A7D,0x6A7E,0x6A7F,0x6A81,/* 0x58-0x5F */ + 0x6A82,0x6A83,0x6A85,0x6A86,0x6A87,0x6A88,0x6A89,0x6A8A,/* 0x60-0x67 */ + 0x6A8B,0x6A8C,0x6A8D,0x6A8F,0x6A92,0x6A93,0x6A94,0x6A95,/* 0x68-0x6F */ + 0x6A96,0x6A98,0x6A99,0x6A9A,0x6A9B,0x6A9C,0x6A9D,0x6A9E,/* 0x70-0x77 */ + 0x6A9F,0x6AA1,0x6AA2,0x6AA3,0x6AA4,0x6AA5,0x6AA6,0x0000,/* 0x78-0x7F */ + + 0x6AA7,0x6AA8,0x6AAA,0x6AAD,0x6AAE,0x6AAF,0x6AB0,0x6AB1,/* 0x80-0x87 */ + 0x6AB2,0x6AB3,0x6AB4,0x6AB5,0x6AB6,0x6AB7,0x6AB8,0x6AB9,/* 0x88-0x8F */ + 0x6ABA,0x6ABB,0x6ABC,0x6ABD,0x6ABE,0x6ABF,0x6AC0,0x6AC1,/* 0x90-0x97 */ + 0x6AC2,0x6AC3,0x6AC4,0x6AC5,0x6AC6,0x6AC7,0x6AC8,0x6AC9,/* 0x98-0x9F */ + 0x6ACA,0x6ACB,0x6ACC,0x6ACD,0x6ACE,0x6ACF,0x6AD0,0x6AD1,/* 0xA0-0xA7 */ + 0x6AD2,0x6AD3,0x6AD4,0x6AD5,0x6AD6,0x6AD7,0x6AD8,0x6AD9,/* 0xA8-0xAF */ + 0x6ADA,0x6ADB,0x6ADC,0x6ADD,0x6ADE,0x6ADF,0x6AE0,0x6AE1,/* 0xB0-0xB7 */ + 0x6AE2,0x6AE3,0x6AE4,0x6AE5,0x6AE6,0x6AE7,0x6AE8,0x6AE9,/* 0xB8-0xBF */ + 0x6AEA,0x6AEB,0x6AEC,0x6AED,0x6AEE,0x6AEF,0x6AF0,0x6AF1,/* 0xC0-0xC7 */ + 0x6AF2,0x6AF3,0x6AF4,0x6AF5,0x6AF6,0x6AF7,0x6AF8,0x6AF9,/* 0xC8-0xCF */ + 0x6AFA,0x6AFB,0x6AFC,0x6AFD,0x6AFE,0x6AFF,0x6B00,0x6B01,/* 0xD0-0xD7 */ + 0x6B02,0x6B03,0x6B04,0x6B05,0x6B06,0x6B07,0x6B08,0x6B09,/* 0xD8-0xDF */ + 0x6B0A,0x6B0B,0x6B0C,0x6B0D,0x6B0E,0x6B0F,0x6B10,0x6B11,/* 0xE0-0xE7 */ + 
0x6B12,0x6B13,0x6B14,0x6B15,0x6B16,0x6B17,0x6B18,0x6B19,/* 0xE8-0xEF */ + 0x6B1A,0x6B1B,0x6B1C,0x6B1D,0x6B1E,0x6B1F,0x6B25,0x6B26,/* 0xF0-0xF7 */ + 0x6B28,0x6B29,0x6B2A,0x6B2B,0x6B2C,0x6B2D,0x6B2E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6B2F,0x6B30,0x6B31,0x6B33,0x6B34,0x6B35,0x6B36,0x6B38,/* 0x40-0x47 */ + 0x6B3B,0x6B3C,0x6B3D,0x6B3F,0x6B40,0x6B41,0x6B42,0x6B44,/* 0x48-0x4F */ + 0x6B45,0x6B48,0x6B4A,0x6B4B,0x6B4D,0x6B4E,0x6B4F,0x6B50,/* 0x50-0x57 */ + 0x6B51,0x6B52,0x6B53,0x6B54,0x6B55,0x6B56,0x6B57,0x6B58,/* 0x58-0x5F */ + 0x6B5A,0x6B5B,0x6B5C,0x6B5D,0x6B5E,0x6B5F,0x6B60,0x6B61,/* 0x60-0x67 */ + 0x6B68,0x6B69,0x6B6B,0x6B6C,0x6B6D,0x6B6E,0x6B6F,0x6B70,/* 0x68-0x6F */ + 0x6B71,0x6B72,0x6B73,0x6B74,0x6B75,0x6B76,0x6B77,0x6B78,/* 0x70-0x77 */ + 0x6B7A,0x6B7D,0x6B7E,0x6B7F,0x6B80,0x6B85,0x6B88,0x0000,/* 0x78-0x7F */ + + 0x6B8C,0x6B8E,0x6B8F,0x6B90,0x6B91,0x6B94,0x6B95,0x6B97,/* 0x80-0x87 */ + 0x6B98,0x6B99,0x6B9C,0x6B9D,0x6B9E,0x6B9F,0x6BA0,0x6BA2,/* 0x88-0x8F */ + 0x6BA3,0x6BA4,0x6BA5,0x6BA6,0x6BA7,0x6BA8,0x6BA9,0x6BAB,/* 0x90-0x97 */ + 0x6BAC,0x6BAD,0x6BAE,0x6BAF,0x6BB0,0x6BB1,0x6BB2,0x6BB6,/* 0x98-0x9F */ + 0x6BB8,0x6BB9,0x6BBA,0x6BBB,0x6BBC,0x6BBD,0x6BBE,0x6BC0,/* 0xA0-0xA7 */ + 0x6BC3,0x6BC4,0x6BC6,0x6BC7,0x6BC8,0x6BC9,0x6BCA,0x6BCC,/* 0xA8-0xAF */ + 0x6BCE,0x6BD0,0x6BD1,0x6BD8,0x6BDA,0x6BDC,0x6BDD,0x6BDE,/* 0xB0-0xB7 */ + 0x6BDF,0x6BE0,0x6BE2,0x6BE3,0x6BE4,0x6BE5,0x6BE6,0x6BE7,/* 0xB8-0xBF */ + 0x6BE8,0x6BE9,0x6BEC,0x6BED,0x6BEE,0x6BF0,0x6BF1,0x6BF2,/* 0xC0-0xC7 */ + 0x6BF4,0x6BF6,0x6BF7,0x6BF8,0x6BFA,0x6BFB,0x6BFC,0x6BFE,/* 0xC8-0xCF */ + 0x6BFF,0x6C00,0x6C01,0x6C02,0x6C03,0x6C04,0x6C08,0x6C09,/* 0xD0-0xD7 */ + 0x6C0A,0x6C0B,0x6C0C,0x6C0E,0x6C12,0x6C17,0x6C1C,0x6C1D,/* 0xD8-0xDF */ + 0x6C1E,0x6C20,0x6C23,0x6C25,0x6C2B,0x6C2C,0x6C2D,0x6C31,/* 0xE0-0xE7 */ + 0x6C33,0x6C36,0x6C37,0x6C39,0x6C3A,0x6C3B,0x6C3C,0x6C3E,/* 0xE8-0xEF */ + 0x6C3F,0x6C43,0x6C44,0x6C45,0x6C48,0x6C4B,0x6C4C,0x6C4D,/* 0xF0-0xF7 */ + 0x6C4E,0x6C4F,0x6C51,0x6C52,0x6C53,0x6C56,0x6C58,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6C59,0x6C5A,0x6C62,0x6C63,0x6C65,0x6C66,0x6C67,0x6C6B,/* 0x40-0x47 */ + 0x6C6C,0x6C6D,0x6C6E,0x6C6F,0x6C71,0x6C73,0x6C75,0x6C77,/* 0x48-0x4F */ + 0x6C78,0x6C7A,0x6C7B,0x6C7C,0x6C7F,0x6C80,0x6C84,0x6C87,/* 0x50-0x57 */ + 0x6C8A,0x6C8B,0x6C8D,0x6C8E,0x6C91,0x6C92,0x6C95,0x6C96,/* 
0x58-0x5F */ + 0x6C97,0x6C98,0x6C9A,0x6C9C,0x6C9D,0x6C9E,0x6CA0,0x6CA2,/* 0x60-0x67 */ + 0x6CA8,0x6CAC,0x6CAF,0x6CB0,0x6CB4,0x6CB5,0x6CB6,0x6CB7,/* 0x68-0x6F */ + 0x6CBA,0x6CC0,0x6CC1,0x6CC2,0x6CC3,0x6CC6,0x6CC7,0x6CC8,/* 0x70-0x77 */ + 0x6CCB,0x6CCD,0x6CCE,0x6CCF,0x6CD1,0x6CD2,0x6CD8,0x0000,/* 0x78-0x7F */ + + 0x6CD9,0x6CDA,0x6CDC,0x6CDD,0x6CDF,0x6CE4,0x6CE6,0x6CE7,/* 0x80-0x87 */ + 0x6CE9,0x6CEC,0x6CED,0x6CF2,0x6CF4,0x6CF9,0x6CFF,0x6D00,/* 0x88-0x8F */ + 0x6D02,0x6D03,0x6D05,0x6D06,0x6D08,0x6D09,0x6D0A,0x6D0D,/* 0x90-0x97 */ + 0x6D0F,0x6D10,0x6D11,0x6D13,0x6D14,0x6D15,0x6D16,0x6D18,/* 0x98-0x9F */ + 0x6D1C,0x6D1D,0x6D1F,0x6D20,0x6D21,0x6D22,0x6D23,0x6D24,/* 0xA0-0xA7 */ + 0x6D26,0x6D28,0x6D29,0x6D2C,0x6D2D,0x6D2F,0x6D30,0x6D34,/* 0xA8-0xAF */ + 0x6D36,0x6D37,0x6D38,0x6D3A,0x6D3F,0x6D40,0x6D42,0x6D44,/* 0xB0-0xB7 */ + 0x6D49,0x6D4C,0x6D50,0x6D55,0x6D56,0x6D57,0x6D58,0x6D5B,/* 0xB8-0xBF */ + 0x6D5D,0x6D5F,0x6D61,0x6D62,0x6D64,0x6D65,0x6D67,0x6D68,/* 0xC0-0xC7 */ + 0x6D6B,0x6D6C,0x6D6D,0x6D70,0x6D71,0x6D72,0x6D73,0x6D75,/* 0xC8-0xCF */ + 0x6D76,0x6D79,0x6D7A,0x6D7B,0x6D7D,0x6D7E,0x6D7F,0x6D80,/* 0xD0-0xD7 */ + 0x6D81,0x6D83,0x6D84,0x6D86,0x6D87,0x6D8A,0x6D8B,0x6D8D,/* 0xD8-0xDF */ + 0x6D8F,0x6D90,0x6D92,0x6D96,0x6D97,0x6D98,0x6D99,0x6D9A,/* 0xE0-0xE7 */ + 0x6D9C,0x6DA2,0x6DA5,0x6DAC,0x6DAD,0x6DB0,0x6DB1,0x6DB3,/* 0xE8-0xEF */ + 0x6DB4,0x6DB6,0x6DB7,0x6DB9,0x6DBA,0x6DBB,0x6DBC,0x6DBD,/* 0xF0-0xF7 */ + 0x6DBE,0x6DC1,0x6DC2,0x6DC3,0x6DC8,0x6DC9,0x6DCA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6DCD,0x6DCE,0x6DCF,0x6DD0,0x6DD2,0x6DD3,0x6DD4,0x6DD5,/* 0x40-0x47 */ + 0x6DD7,0x6DDA,0x6DDB,0x6DDC,0x6DDF,0x6DE2,0x6DE3,0x6DE5,/* 0x48-0x4F */ + 0x6DE7,0x6DE8,0x6DE9,0x6DEA,0x6DED,0x6DEF,0x6DF0,0x6DF2,/* 0x50-0x57 */ + 0x6DF4,0x6DF5,0x6DF6,0x6DF8,0x6DFA,0x6DFD,0x6DFE,0x6DFF,/* 0x58-0x5F */ + 0x6E00,0x6E01,0x6E02,0x6E03,0x6E04,0x6E06,0x6E07,0x6E08,/* 0x60-0x67 */ + 0x6E09,0x6E0B,0x6E0F,0x6E12,0x6E13,0x6E15,0x6E18,0x6E19,/* 0x68-0x6F */ + 0x6E1B,0x6E1C,0x6E1E,0x6E1F,0x6E22,0x6E26,0x6E27,0x6E28,/* 0x70-0x77 */ + 0x6E2A,0x6E2C,0x6E2E,0x6E30,0x6E31,0x6E33,0x6E35,0x0000,/* 0x78-0x7F */ + + 0x6E36,0x6E37,0x6E39,0x6E3B,0x6E3C,0x6E3D,0x6E3E,0x6E3F,/* 0x80-0x87 */ + 0x6E40,0x6E41,0x6E42,0x6E45,0x6E46,0x6E47,0x6E48,0x6E49,/* 0x88-0x8F */ + 0x6E4A,0x6E4B,0x6E4C,0x6E4F,0x6E50,0x6E51,0x6E52,0x6E55,/* 0x90-0x97 */ + 0x6E57,0x6E59,0x6E5A,0x6E5C,0x6E5D,0x6E5E,0x6E60,0x6E61,/* 0x98-0x9F */ + 0x6E62,0x6E63,0x6E64,0x6E65,0x6E66,0x6E67,0x6E68,0x6E69,/* 0xA0-0xA7 */ + 0x6E6A,0x6E6C,0x6E6D,0x6E6F,0x6E70,0x6E71,0x6E72,0x6E73,/* 0xA8-0xAF */ + 0x6E74,0x6E75,0x6E76,0x6E77,0x6E78,0x6E79,0x6E7A,0x6E7B,/* 0xB0-0xB7 */ + 0x6E7C,0x6E7D,0x6E80,0x6E81,0x6E82,0x6E84,0x6E87,0x6E88,/* 0xB8-0xBF */ + 0x6E8A,0x6E8B,0x6E8C,0x6E8D,0x6E8E,0x6E91,0x6E92,0x6E93,/* 0xC0-0xC7 */ + 0x6E94,0x6E95,0x6E96,0x6E97,0x6E99,0x6E9A,0x6E9B,0x6E9D,/* 0xC8-0xCF */ + 0x6E9E,0x6EA0,0x6EA1,0x6EA3,0x6EA4,0x6EA6,0x6EA8,0x6EA9,/* 0xD0-0xD7 */ + 
0x6EAB,0x6EAC,0x6EAD,0x6EAE,0x6EB0,0x6EB3,0x6EB5,0x6EB8,/* 0xD8-0xDF */ + 0x6EB9,0x6EBC,0x6EBE,0x6EBF,0x6EC0,0x6EC3,0x6EC4,0x6EC5,/* 0xE0-0xE7 */ + 0x6EC6,0x6EC8,0x6EC9,0x6ECA,0x6ECC,0x6ECD,0x6ECE,0x6ED0,/* 0xE8-0xEF */ + 0x6ED2,0x6ED6,0x6ED8,0x6ED9,0x6EDB,0x6EDC,0x6EDD,0x6EE3,/* 0xF0-0xF7 */ + 0x6EE7,0x6EEA,0x6EEB,0x6EEC,0x6EED,0x6EEE,0x6EEF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6EF0,0x6EF1,0x6EF2,0x6EF3,0x6EF5,0x6EF6,0x6EF7,0x6EF8,/* 0x40-0x47 */ + 0x6EFA,0x6EFB,0x6EFC,0x6EFD,0x6EFE,0x6EFF,0x6F00,0x6F01,/* 0x48-0x4F */ + 0x6F03,0x6F04,0x6F05,0x6F07,0x6F08,0x6F0A,0x6F0B,0x6F0C,/* 0x50-0x57 */ + 0x6F0D,0x6F0E,0x6F10,0x6F11,0x6F12,0x6F16,0x6F17,0x6F18,/* 0x58-0x5F */ + 0x6F19,0x6F1A,0x6F1B,0x6F1C,0x6F1D,0x6F1E,0x6F1F,0x6F21,/* 0x60-0x67 */ + 0x6F22,0x6F23,0x6F25,0x6F26,0x6F27,0x6F28,0x6F2C,0x6F2E,/* 0x68-0x6F */ + 0x6F30,0x6F32,0x6F34,0x6F35,0x6F37,0x6F38,0x6F39,0x6F3A,/* 0x70-0x77 */ + 0x6F3B,0x6F3C,0x6F3D,0x6F3F,0x6F40,0x6F41,0x6F42,0x0000,/* 0x78-0x7F */ + + 0x6F43,0x6F44,0x6F45,0x6F48,0x6F49,0x6F4A,0x6F4C,0x6F4E,/* 0x80-0x87 */ + 0x6F4F,0x6F50,0x6F51,0x6F52,0x6F53,0x6F54,0x6F55,0x6F56,/* 0x88-0x8F */ + 0x6F57,0x6F59,0x6F5A,0x6F5B,0x6F5D,0x6F5F,0x6F60,0x6F61,/* 0x90-0x97 */ + 0x6F63,0x6F64,0x6F65,0x6F67,0x6F68,0x6F69,0x6F6A,0x6F6B,/* 0x98-0x9F */ + 0x6F6C,0x6F6F,0x6F70,0x6F71,0x6F73,0x6F75,0x6F76,0x6F77,/* 0xA0-0xA7 */ + 0x6F79,0x6F7B,0x6F7D,0x6F7E,0x6F7F,0x6F80,0x6F81,0x6F82,/* 0xA8-0xAF */ + 0x6F83,0x6F85,0x6F86,0x6F87,0x6F8A,0x6F8B,0x6F8F,0x6F90,/* 0xB0-0xB7 */ + 0x6F91,0x6F92,0x6F93,0x6F94,0x6F95,0x6F96,0x6F97,0x6F98,/* 0xB8-0xBF */ + 0x6F99,0x6F9A,0x6F9B,0x6F9D,0x6F9E,0x6F9F,0x6FA0,0x6FA2,/* 0xC0-0xC7 */ + 0x6FA3,0x6FA4,0x6FA5,0x6FA6,0x6FA8,0x6FA9,0x6FAA,0x6FAB,/* 0xC8-0xCF */ + 0x6FAC,0x6FAD,0x6FAE,0x6FAF,0x6FB0,0x6FB1,0x6FB2,0x6FB4,/* 0xD0-0xD7 */ + 0x6FB5,0x6FB7,0x6FB8,0x6FBA,0x6FBB,0x6FBC,0x6FBD,0x6FBE,/* 0xD8-0xDF */ + 0x6FBF,0x6FC1,0x6FC3,0x6FC4,0x6FC5,0x6FC6,0x6FC7,0x6FC8,/* 0xE0-0xE7 */ + 0x6FCA,0x6FCB,0x6FCC,0x6FCD,0x6FCE,0x6FCF,0x6FD0,0x6FD3,/* 0xE8-0xEF */ + 0x6FD4,0x6FD5,0x6FD6,0x6FD7,0x6FD8,0x6FD9,0x6FDA,0x6FDB,/* 0xF0-0xF7 */ + 0x6FDC,0x6FDD,0x6FDF,0x6FE2,0x6FE3,0x6FE4,0x6FE5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FE6,0x6FE7,0x6FE8,0x6FE9,0x6FEA,0x6FEB,0x6FEC,0x6FED,/* 0x40-0x47 */ + 0x6FF0,0x6FF1,0x6FF2,0x6FF3,0x6FF4,0x6FF5,0x6FF6,0x6FF7,/* 
0x48-0x4F */ + 0x6FF8,0x6FF9,0x6FFA,0x6FFB,0x6FFC,0x6FFD,0x6FFE,0x6FFF,/* 0x50-0x57 */ + 0x7000,0x7001,0x7002,0x7003,0x7004,0x7005,0x7006,0x7007,/* 0x58-0x5F */ + 0x7008,0x7009,0x700A,0x700B,0x700C,0x700D,0x700E,0x700F,/* 0x60-0x67 */ + 0x7010,0x7012,0x7013,0x7014,0x7015,0x7016,0x7017,0x7018,/* 0x68-0x6F */ + 0x7019,0x701C,0x701D,0x701E,0x701F,0x7020,0x7021,0x7022,/* 0x70-0x77 */ + 0x7024,0x7025,0x7026,0x7027,0x7028,0x7029,0x702A,0x0000,/* 0x78-0x7F */ + + 0x702B,0x702C,0x702D,0x702E,0x702F,0x7030,0x7031,0x7032,/* 0x80-0x87 */ + 0x7033,0x7034,0x7036,0x7037,0x7038,0x703A,0x703B,0x703C,/* 0x88-0x8F */ + 0x703D,0x703E,0x703F,0x7040,0x7041,0x7042,0x7043,0x7044,/* 0x90-0x97 */ + 0x7045,0x7046,0x7047,0x7048,0x7049,0x704A,0x704B,0x704D,/* 0x98-0x9F */ + 0x704E,0x7050,0x7051,0x7052,0x7053,0x7054,0x7055,0x7056,/* 0xA0-0xA7 */ + 0x7057,0x7058,0x7059,0x705A,0x705B,0x705C,0x705D,0x705F,/* 0xA8-0xAF */ + 0x7060,0x7061,0x7062,0x7063,0x7064,0x7065,0x7066,0x7067,/* 0xB0-0xB7 */ + 0x7068,0x7069,0x706A,0x706E,0x7071,0x7072,0x7073,0x7074,/* 0xB8-0xBF */ + 0x7077,0x7079,0x707A,0x707B,0x707D,0x7081,0x7082,0x7083,/* 0xC0-0xC7 */ + 0x7084,0x7086,0x7087,0x7088,0x708B,0x708C,0x708D,0x708F,/* 0xC8-0xCF */ + 0x7090,0x7091,0x7093,0x7097,0x7098,0x709A,0x709B,0x709E,/* 0xD0-0xD7 */ + 0x709F,0x70A0,0x70A1,0x70A2,0x70A3,0x70A4,0x70A5,0x70A6,/* 0xD8-0xDF */ + 0x70A7,0x70A8,0x70A9,0x70AA,0x70B0,0x70B2,0x70B4,0x70B5,/* 0xE0-0xE7 */ + 0x70B6,0x70BA,0x70BE,0x70BF,0x70C4,0x70C5,0x70C6,0x70C7,/* 0xE8-0xEF */ + 0x70C9,0x70CB,0x70CC,0x70CD,0x70CE,0x70CF,0x70D0,0x70D1,/* 0xF0-0xF7 */ + 0x70D2,0x70D3,0x70D4,0x70D5,0x70D6,0x70D7,0x70DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x70DC,0x70DD,0x70DE,0x70E0,0x70E1,0x70E2,0x70E3,0x70E5,/* 0x40-0x47 */ + 0x70EA,0x70EE,0x70F0,0x70F1,0x70F2,0x70F3,0x70F4,0x70F5,/* 0x48-0x4F */ + 0x70F6,0x70F8,0x70FA,0x70FB,0x70FC,0x70FE,0x70FF,0x7100,/* 0x50-0x57 */ + 0x7101,0x7102,0x7103,0x7104,0x7105,0x7106,0x7107,0x7108,/* 0x58-0x5F */ + 0x710B,0x710C,0x710D,0x710E,0x710F,0x7111,0x7112,0x7114,/* 0x60-0x67 */ + 0x7117,0x711B,0x711C,0x711D,0x711E,0x711F,0x7120,0x7121,/* 0x68-0x6F */ + 0x7122,0x7123,0x7124,0x7125,0x7127,0x7128,0x7129,0x712A,/* 0x70-0x77 */ + 0x712B,0x712C,0x712D,0x712E,0x7132,0x7133,0x7134,0x0000,/* 0x78-0x7F */ + + 0x7135,0x7137,0x7138,0x7139,0x713A,0x713B,0x713C,0x713D,/* 0x80-0x87 */ + 0x713E,0x713F,0x7140,0x7141,0x7142,0x7143,0x7144,0x7146,/* 0x88-0x8F */ + 0x7147,0x7148,0x7149,0x714B,0x714D,0x714F,0x7150,0x7151,/* 0x90-0x97 */ + 0x7152,0x7153,0x7154,0x7155,0x7156,0x7157,0x7158,0x7159,/* 0x98-0x9F */ + 0x715A,0x715B,0x715D,0x715F,0x7160,0x7161,0x7162,0x7163,/* 0xA0-0xA7 */ + 0x7165,0x7169,0x716A,0x716B,0x716C,0x716D,0x716F,0x7170,/* 0xA8-0xAF */ + 0x7171,0x7174,0x7175,0x7176,0x7177,0x7179,0x717B,0x717C,/* 0xB0-0xB7 */ + 0x717E,0x717F,0x7180,0x7181,0x7182,0x7183,0x7185,0x7186,/* 0xB8-0xBF */ + 0x7187,0x7188,0x7189,0x718B,0x718C,0x718D,0x718E,0x7190,/* 0xC0-0xC7 */ + 
0x7191,0x7192,0x7193,0x7195,0x7196,0x7197,0x719A,0x719B,/* 0xC8-0xCF */ + 0x719C,0x719D,0x719E,0x71A1,0x71A2,0x71A3,0x71A4,0x71A5,/* 0xD0-0xD7 */ + 0x71A6,0x71A7,0x71A9,0x71AA,0x71AB,0x71AD,0x71AE,0x71AF,/* 0xD8-0xDF */ + 0x71B0,0x71B1,0x71B2,0x71B4,0x71B6,0x71B7,0x71B8,0x71BA,/* 0xE0-0xE7 */ + 0x71BB,0x71BC,0x71BD,0x71BE,0x71BF,0x71C0,0x71C1,0x71C2,/* 0xE8-0xEF */ + 0x71C4,0x71C5,0x71C6,0x71C7,0x71C8,0x71C9,0x71CA,0x71CB,/* 0xF0-0xF7 */ + 0x71CC,0x71CD,0x71CF,0x71D0,0x71D1,0x71D2,0x71D3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x71D6,0x71D7,0x71D8,0x71D9,0x71DA,0x71DB,0x71DC,0x71DD,/* 0x40-0x47 */ + 0x71DE,0x71DF,0x71E1,0x71E2,0x71E3,0x71E4,0x71E6,0x71E8,/* 0x48-0x4F */ + 0x71E9,0x71EA,0x71EB,0x71EC,0x71ED,0x71EF,0x71F0,0x71F1,/* 0x50-0x57 */ + 0x71F2,0x71F3,0x71F4,0x71F5,0x71F6,0x71F7,0x71F8,0x71FA,/* 0x58-0x5F */ + 0x71FB,0x71FC,0x71FD,0x71FE,0x71FF,0x7200,0x7201,0x7202,/* 0x60-0x67 */ + 0x7203,0x7204,0x7205,0x7207,0x7208,0x7209,0x720A,0x720B,/* 0x68-0x6F */ + 0x720C,0x720D,0x720E,0x720F,0x7210,0x7211,0x7212,0x7213,/* 0x70-0x77 */ + 0x7214,0x7215,0x7216,0x7217,0x7218,0x7219,0x721A,0x0000,/* 0x78-0x7F */ + + 0x721B,0x721C,0x721E,0x721F,0x7220,0x7221,0x7222,0x7223,/* 0x80-0x87 */ + 0x7224,0x7225,0x7226,0x7227,0x7229,0x722B,0x722D,0x722E,/* 0x88-0x8F */ + 0x722F,0x7232,0x7233,0x7234,0x723A,0x723C,0x723E,0x7240,/* 0x90-0x97 */ + 0x7241,0x7242,0x7243,0x7244,0x7245,0x7246,0x7249,0x724A,/* 0x98-0x9F */ + 0x724B,0x724E,0x724F,0x7250,0x7251,0x7253,0x7254,0x7255,/* 0xA0-0xA7 */ + 0x7257,0x7258,0x725A,0x725C,0x725E,0x7260,0x7263,0x7264,/* 0xA8-0xAF */ + 0x7265,0x7268,0x726A,0x726B,0x726C,0x726D,0x7270,0x7271,/* 0xB0-0xB7 */ + 0x7273,0x7274,0x7276,0x7277,0x7278,0x727B,0x727C,0x727D,/* 0xB8-0xBF */ + 0x7282,0x7283,0x7285,0x7286,0x7287,0x7288,0x7289,0x728C,/* 0xC0-0xC7 */ + 0x728E,0x7290,0x7291,0x7293,0x7294,0x7295,0x7296,0x7297,/* 0xC8-0xCF */ + 0x7298,0x7299,0x729A,0x729B,0x729C,0x729D,0x729E,0x72A0,/* 0xD0-0xD7 */ + 0x72A1,0x72A2,0x72A3,0x72A4,0x72A5,0x72A6,0x72A7,0x72A8,/* 0xD8-0xDF */ + 0x72A9,0x72AA,0x72AB,0x72AE,0x72B1,0x72B2,0x72B3,0x72B5,/* 0xE0-0xE7 */ + 0x72BA,0x72BB,0x72BC,0x72BD,0x72BE,0x72BF,0x72C0,0x72C5,/* 0xE8-0xEF */ + 0x72C6,0x72C7,0x72C9,0x72CA,0x72CB,0x72CC,0x72CF,0x72D1,/* 0xF0-0xF7 */ + 0x72D3,0x72D4,0x72D5,0x72D6,0x72D8,0x72DA,0x72DB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3000,0x3001,0x3002,0x00B7,0x02C9,0x02C7,0x00A8,/* 0xA0-0xA7 */ + 0x3003,0x3005,0x2014,0xFF5E,0x2016,0x2026,0x2018,0x2019,/* 0xA8-0xAF */ + 0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */ + 0x300C,0x300D,0x300E,0x300F,0x3016,0x3017,0x3010,0x3011,/* 0xB8-0xBF */ + 0x00B1,0x00D7,0x00F7,0x2236,0x2227,0x2228,0x2211,0x220F,/* 0xC0-0xC7 */ + 0x222A,0x2229,0x2208,0x2237,0x221A,0x22A5,0x2225,0x2220,/* 0xC8-0xCF */ + 0x2312,0x2299,0x222B,0x222E,0x2261,0x224C,0x2248,0x223D,/* 0xD0-0xD7 */ + 0x221D,0x2260,0x226E,0x226F,0x2264,0x2265,0x221E,0x2235,/* 0xD8-0xDF */ + 0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFF04,/* 0xE0-0xE7 */ + 0x00A4,0xFFE0,0xFFE1,0x2030,0x00A7,0x2116,0x2606,0x2605,/* 0xE8-0xEF */ + 0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,0x25A1,0x25A0,0x25B3,/* 0xF0-0xF7 */ + 0x25B2,0x203B,0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */ + 0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x0000,0x2488,0x2489,0x248A,0x248B,0x248C,0x248D,0x248E,/* 0xB0-0xB7 */ + 
0x248F,0x2490,0x2491,0x2492,0x2493,0x2494,0x2495,0x2496,/* 0xB8-0xBF */ + 0x2497,0x2498,0x2499,0x249A,0x249B,0x2474,0x2475,0x2476,/* 0xC0-0xC7 */ + 0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,0x247D,0x247E,/* 0xC8-0xCF */ + 0x247F,0x2480,0x2481,0x2482,0x2483,0x2484,0x2485,0x2486,/* 0xD0-0xD7 */ + 0x2487,0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,/* 0xD8-0xDF */ + 0x2467,0x2468,0x2469,0x0000,0x0000,0x3220,0x3221,0x3222,/* 0xE0-0xE7 */ + 0x3223,0x3224,0x3225,0x3226,0x3227,0x3228,0x3229,0x0000,/* 0xE8-0xEF */ + 0x0000,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xF0-0xF7 */ + 0x2167,0x2168,0x2169,0x216A,0x216B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFF01,0xFF02,0xFF03,0xFFE5,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */ + 0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */ + 0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */ + 0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */ + 0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */ + 0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */ + 0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */ + 0xFF38,0xFF39,0xFF3A,0xFF3B,0xFF3C,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */ + 0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */ + 0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */ + 0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */ + 0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */ + 0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */ + 0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */ + 0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */ + 0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */ + 0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */ + 0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */ + 0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */ + 0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */ + 0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */ + 
0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */ + 0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */ + 0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */ + 0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */ + 0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */ + 0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */ + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xA0-0xA7 */ + 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xA8-0xAF */ + 0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xB0-0xB7 */ + 0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */ + 0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xC0-0xC7 */ + 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xC8-0xCF */ + 0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xD0-0xD7 */ + 0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0xFE35,0xFE36,0xFE39,0xFE3A,0xFE3F,0xFE40,0xFE3D,0xFE3E,/* 0xE0-0xE7 */ + 0xFE41,0xFE42,0xFE43,0xFE44,0x0000,0x0000,0xFE3B,0xFE3C,/* 0xE8-0xEF */ + 0xFE37,0xFE38,0xFE31,0x0000,0xFE33,0xFE34,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */ + 0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */ + 0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */ + 0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */ + 0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */ + 0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */ + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x02CA,0x02CB,0x02D9,0x2013,0x2015,0x2025,0x2035,0x2105,/* 0x40-0x47 */ + 0x2109,0x2196,0x2197,0x2198,0x2199,0x2215,0x221F,0x2223,/* 0x48-0x4F */ + 0x2252,0x2266,0x2267,0x22BF,0x2550,0x2551,0x2552,0x2553,/* 0x50-0x57 */ + 0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255A,0x255B,/* 0x58-0x5F */ + 0x255C,0x255D,0x255E,0x255F,0x2560,0x2561,0x2562,0x2563,/* 0x60-0x67 */ + 0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256A,0x256B,/* 0x68-0x6F */ + 0x256C,0x256D,0x256E,0x256F,0x2570,0x2571,0x2572,0x2573,/* 0x70-0x77 */ + 0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,0x2587,0x0000,/* 0x78-0x7F */ + + 0x2588,0x2589,0x258A,0x258B,0x258C,0x258D,0x258E,0x258F,/* 0x80-0x87 */ + 0x2593,0x2594,0x2595,0x25BC,0x25BD,0x25E2,0x25E3,0x25E4,/* 0x88-0x8F */ + 0x25E5,0x2609,0x2295,0x3012,0x301D,0x301E,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0101,0x00E1,0x01CE,0x00E0,0x0113,0x00E9,0x011B,/* 0xA0-0xA7 */ + 0x00E8,0x012B,0x00ED,0x01D0,0x00EC,0x014D,0x00F3,0x01D2,/* 0xA8-0xAF */ + 0x00F2,0x016B,0x00FA,0x01D4,0x00F9,0x01D6,0x01D8,0x01DA,/* 0xB0-0xB7 */ + 
0x01DC,0x00FC,0x00EA,0x0251,0x0000,0x0144,0x0148,0x0000,/* 0xB8-0xBF */ + 0x0261,0x0000,0x0000,0x0000,0x0000,0x3105,0x3106,0x3107,/* 0xC0-0xC7 */ + 0x3108,0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,/* 0xC8-0xCF */ + 0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,0x3117,/* 0xD0-0xD7 */ + 0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,0x311F,/* 0xD8-0xDF */ + 0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,0x3127,/* 0xE0-0xE7 */ + 0x3128,0x3129,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3021,0x3022,0x3023,0x3024,0x3025,0x3026,0x3027,0x3028,/* 0x40-0x47 */ + 0x3029,0x32A3,0x338E,0x338F,0x339C,0x339D,0x339E,0x33A1,/* 0x48-0x4F */ + 0x33C4,0x33CE,0x33D1,0x33D2,0x33D5,0xFE30,0xFFE2,0xFFE4,/* 0x50-0x57 */ + 0x0000,0x2121,0x3231,0x0000,0x2010,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x30FC,0x309B,0x309C,0x30FD,0x30FE,0x3006,0x309D,0x309E,/* 0x60-0x67 */ + 0xFE49,0xFE4A,0xFE4B,0xFE4C,0xFE4D,0xFE4E,0xFE4F,0xFE50,/* 0x68-0x6F */ + 0xFE51,0xFE52,0xFE54,0xFE55,0xFE56,0xFE57,0xFE59,0xFE5A,/* 0x70-0x77 */ + 0xFE5B,0xFE5C,0xFE5D,0xFE5E,0xFE5F,0xFE60,0xFE61,0x0000,/* 0x78-0x7F */ + + 0xFE62,0xFE63,0xFE64,0xFE65,0xFE66,0xFE68,0xFE69,0xFE6A,/* 0x80-0x87 */ + 0xFE6B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x3007,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0000,0x0000,0x0000,0x2500,0x2501,0x2502,0x2503,/* 0xA0-0xA7 */ + 0x2504,0x2505,0x2506,0x2507,0x2508,0x2509,0x250A,0x250B,/* 0xA8-0xAF */ + 0x250C,0x250D,0x250E,0x250F,0x2510,0x2511,0x2512,0x2513,/* 0xB0-0xB7 */ + 0x2514,0x2515,0x2516,0x2517,0x2518,0x2519,0x251A,0x251B,/* 0xB8-0xBF */ + 0x251C,0x251D,0x251E,0x251F,0x2520,0x2521,0x2522,0x2523,/* 0xC0-0xC7 */ + 0x2524,0x2525,0x2526,0x2527,0x2528,0x2529,0x252A,0x252B,/* 0xC8-0xCF */ + 0x252C,0x252D,0x252E,0x252F,0x2530,0x2531,0x2532,0x2533,/* 0xD0-0xD7 */ + 0x2534,0x2535,0x2536,0x2537,0x2538,0x2539,0x253A,0x253B,/* 0xD8-0xDF */ + 0x253C,0x253D,0x253E,0x253F,0x2540,0x2541,0x2542,0x2543,/* 0xE0-0xE7 */ + 0x2544,0x2545,0x2546,0x2547,0x2548,0x2549,0x254A,0x254B,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x72DC,0x72DD,0x72DF,0x72E2,0x72E3,0x72E4,0x72E5,0x72E6,/* 0x40-0x47 */ + 0x72E7,0x72EA,0x72EB,0x72F5,0x72F6,0x72F9,0x72FD,0x72FE,/* 
0x48-0x4F */ + 0x72FF,0x7300,0x7302,0x7304,0x7305,0x7306,0x7307,0x7308,/* 0x50-0x57 */ + 0x7309,0x730B,0x730C,0x730D,0x730F,0x7310,0x7311,0x7312,/* 0x58-0x5F */ + 0x7314,0x7318,0x7319,0x731A,0x731F,0x7320,0x7323,0x7324,/* 0x60-0x67 */ + 0x7326,0x7327,0x7328,0x732D,0x732F,0x7330,0x7332,0x7333,/* 0x68-0x6F */ + 0x7335,0x7336,0x733A,0x733B,0x733C,0x733D,0x7340,0x7341,/* 0x70-0x77 */ + 0x7342,0x7343,0x7344,0x7345,0x7346,0x7347,0x7348,0x0000,/* 0x78-0x7F */ + + 0x7349,0x734A,0x734B,0x734C,0x734E,0x734F,0x7351,0x7353,/* 0x80-0x87 */ + 0x7354,0x7355,0x7356,0x7358,0x7359,0x735A,0x735B,0x735C,/* 0x88-0x8F */ + 0x735D,0x735E,0x735F,0x7361,0x7362,0x7363,0x7364,0x7365,/* 0x90-0x97 */ + 0x7366,0x7367,0x7368,0x7369,0x736A,0x736B,0x736E,0x7370,/* 0x98-0x9F */ + 0x7371,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7372,0x7373,0x7374,0x7375,0x7376,0x7377,0x7378,0x7379,/* 0x40-0x47 */ + 0x737A,0x737B,0x737C,0x737D,0x737F,0x7380,0x7381,0x7382,/* 0x48-0x4F */ + 0x7383,0x7385,0x7386,0x7388,0x738A,0x738C,0x738D,0x738F,/* 0x50-0x57 */ + 0x7390,0x7392,0x7393,0x7394,0x7395,0x7397,0x7398,0x7399,/* 0x58-0x5F */ + 0x739A,0x739C,0x739D,0x739E,0x73A0,0x73A1,0x73A3,0x73A4,/* 0x60-0x67 */ + 0x73A5,0x73A6,0x73A7,0x73A8,0x73AA,0x73AC,0x73AD,0x73B1,/* 0x68-0x6F */ + 0x73B4,0x73B5,0x73B6,0x73B8,0x73B9,0x73BC,0x73BD,0x73BE,/* 0x70-0x77 */ + 0x73BF,0x73C1,0x73C3,0x73C4,0x73C5,0x73C6,0x73C7,0x0000,/* 0x78-0x7F */ + + 0x73CB,0x73CC,0x73CE,0x73D2,0x73D3,0x73D4,0x73D5,0x73D6,/* 0x80-0x87 */ + 0x73D7,0x73D8,0x73DA,0x73DB,0x73DC,0x73DD,0x73DF,0x73E1,/* 0x88-0x8F */ + 0x73E2,0x73E3,0x73E4,0x73E6,0x73E8,0x73EA,0x73EB,0x73EC,/* 0x90-0x97 */ + 0x73EE,0x73EF,0x73F0,0x73F1,0x73F3,0x73F4,0x73F5,0x73F6,/* 0x98-0x9F */ + 0x73F7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x73F8,0x73F9,0x73FA,0x73FB,0x73FC,0x73FD,0x73FE,0x73FF,/* 0x40-0x47 */ + 0x7400,0x7401,0x7402,0x7404,0x7407,0x7408,0x740B,0x740C,/* 0x48-0x4F */ + 0x740D,0x740E,0x7411,0x7412,0x7413,0x7414,0x7415,0x7416,/* 0x50-0x57 */ + 0x7417,0x7418,0x7419,0x741C,0x741D,0x741E,0x741F,0x7420,/* 0x58-0x5F */ + 0x7421,0x7423,0x7424,0x7427,0x7429,0x742B,0x742D,0x742F,/* 0x60-0x67 */ + 0x7431,0x7432,0x7437,0x7438,0x7439,0x743A,0x743B,0x743D,/* 0x68-0x6F */ + 
0x743E,0x743F,0x7440,0x7442,0x7443,0x7444,0x7445,0x7446,/* 0x70-0x77 */ + 0x7447,0x7448,0x7449,0x744A,0x744B,0x744C,0x744D,0x0000,/* 0x78-0x7F */ + + 0x744E,0x744F,0x7450,0x7451,0x7452,0x7453,0x7454,0x7456,/* 0x80-0x87 */ + 0x7458,0x745D,0x7460,0x7461,0x7462,0x7463,0x7464,0x7465,/* 0x88-0x8F */ + 0x7466,0x7467,0x7468,0x7469,0x746A,0x746B,0x746C,0x746E,/* 0x90-0x97 */ + 0x746F,0x7471,0x7472,0x7473,0x7474,0x7475,0x7478,0x7479,/* 0x98-0x9F */ + 0x747A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x747B,0x747C,0x747D,0x747F,0x7482,0x7484,0x7485,0x7486,/* 0x40-0x47 */ + 0x7488,0x7489,0x748A,0x748C,0x748D,0x748F,0x7491,0x7492,/* 0x48-0x4F */ + 0x7493,0x7494,0x7495,0x7496,0x7497,0x7498,0x7499,0x749A,/* 0x50-0x57 */ + 0x749B,0x749D,0x749F,0x74A0,0x74A1,0x74A2,0x74A3,0x74A4,/* 0x58-0x5F */ + 0x74A5,0x74A6,0x74AA,0x74AB,0x74AC,0x74AD,0x74AE,0x74AF,/* 0x60-0x67 */ + 0x74B0,0x74B1,0x74B2,0x74B3,0x74B4,0x74B5,0x74B6,0x74B7,/* 0x68-0x6F */ + 0x74B8,0x74B9,0x74BB,0x74BC,0x74BD,0x74BE,0x74BF,0x74C0,/* 0x70-0x77 */ + 0x74C1,0x74C2,0x74C3,0x74C4,0x74C5,0x74C6,0x74C7,0x0000,/* 0x78-0x7F */ + + 0x74C8,0x74C9,0x74CA,0x74CB,0x74CC,0x74CD,0x74CE,0x74CF,/* 0x80-0x87 */ + 0x74D0,0x74D1,0x74D3,0x74D4,0x74D5,0x74D6,0x74D7,0x74D8,/* 0x88-0x8F */ + 0x74D9,0x74DA,0x74DB,0x74DD,0x74DF,0x74E1,0x74E5,0x74E7,/* 0x90-0x97 */ + 0x74E8,0x74E9,0x74EA,0x74EB,0x74EC,0x74ED,0x74F0,0x74F1,/* 0x98-0x9F */ + 0x74F2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74F3,0x74F5,0x74F8,0x74F9,0x74FA,0x74FB,0x74FC,0x74FD,/* 0x40-0x47 */ + 0x74FE,0x7500,0x7501,0x7502,0x7503,0x7505,0x7506,0x7507,/* 0x48-0x4F */ + 0x7508,0x7509,0x750A,0x750B,0x750C,0x750E,0x7510,0x7512,/* 0x50-0x57 */ + 0x7514,0x7515,0x7516,0x7517,0x751B,0x751D,0x751E,0x7520,/* 0x58-0x5F */ + 0x7521,0x7522,0x7523,0x7524,0x7526,0x7527,0x752A,0x752E,/* 0x60-0x67 */ + 0x7534,0x7536,0x7539,0x753C,0x753D,0x753F,0x7541,0x7542,/* 0x68-0x6F */ + 0x7543,0x7544,0x7546,0x7547,0x7549,0x754A,0x754D,0x7550,/* 0x70-0x77 */ + 0x7551,0x7552,0x7553,0x7555,0x7556,0x7557,0x7558,0x0000,/* 0x78-0x7F */ + + 0x755D,0x755E,0x755F,0x7560,0x7561,0x7562,0x7563,0x7564,/* 0x80-0x87 */ + 0x7567,0x7568,0x7569,0x756B,0x756C,0x756D,0x756E,0x756F,/* 0x88-0x8F */ + 
0x7570,0x7571,0x7573,0x7575,0x7576,0x7577,0x757A,0x757B,/* 0x90-0x97 */ + 0x757C,0x757D,0x757E,0x7580,0x7581,0x7582,0x7584,0x7585,/* 0x98-0x9F */ + 0x7587,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7588,0x7589,0x758A,0x758C,0x758D,0x758E,0x7590,0x7593,/* 0x40-0x47 */ + 0x7595,0x7598,0x759B,0x759C,0x759E,0x75A2,0x75A6,0x75A7,/* 0x48-0x4F */ + 0x75A8,0x75A9,0x75AA,0x75AD,0x75B6,0x75B7,0x75BA,0x75BB,/* 0x50-0x57 */ + 0x75BF,0x75C0,0x75C1,0x75C6,0x75CB,0x75CC,0x75CE,0x75CF,/* 0x58-0x5F */ + 0x75D0,0x75D1,0x75D3,0x75D7,0x75D9,0x75DA,0x75DC,0x75DD,/* 0x60-0x67 */ + 0x75DF,0x75E0,0x75E1,0x75E5,0x75E9,0x75EC,0x75ED,0x75EE,/* 0x68-0x6F */ + 0x75EF,0x75F2,0x75F3,0x75F5,0x75F6,0x75F7,0x75F8,0x75FA,/* 0x70-0x77 */ + 0x75FB,0x75FD,0x75FE,0x7602,0x7604,0x7606,0x7607,0x0000,/* 0x78-0x7F */ + + 0x7608,0x7609,0x760B,0x760D,0x760E,0x760F,0x7611,0x7612,/* 0x80-0x87 */ + 0x7613,0x7614,0x7616,0x761A,0x761C,0x761D,0x761E,0x7621,/* 0x88-0x8F */ + 0x7623,0x7627,0x7628,0x762C,0x762E,0x762F,0x7631,0x7632,/* 0x90-0x97 */ + 0x7636,0x7637,0x7639,0x763A,0x763B,0x763D,0x7641,0x7642,/* 0x98-0x9F */ + 0x7644,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7645,0x7646,0x7647,0x7648,0x7649,0x764A,0x764B,0x764E,/* 0x40-0x47 */ + 0x764F,0x7650,0x7651,0x7652,0x7653,0x7655,0x7657,0x7658,/* 0x48-0x4F */ + 0x7659,0x765A,0x765B,0x765D,0x765F,0x7660,0x7661,0x7662,/* 0x50-0x57 */ + 0x7664,0x7665,0x7666,0x7667,0x7668,0x7669,0x766A,0x766C,/* 0x58-0x5F */ + 0x766D,0x766E,0x7670,0x7671,0x7672,0x7673,0x7674,0x7675,/* 0x60-0x67 */ + 0x7676,0x7677,0x7679,0x767A,0x767C,0x767F,0x7680,0x7681,/* 0x68-0x6F */ + 0x7683,0x7685,0x7689,0x768A,0x768C,0x768D,0x768F,0x7690,/* 0x70-0x77 */ + 0x7692,0x7694,0x7695,0x7697,0x7698,0x769A,0x769B,0x0000,/* 0x78-0x7F */ + + 0x769C,0x769D,0x769E,0x769F,0x76A0,0x76A1,0x76A2,0x76A3,/* 0x80-0x87 */ + 0x76A5,0x76A6,0x76A7,0x76A8,0x76A9,0x76AA,0x76AB,0x76AC,/* 0x88-0x8F */ + 0x76AD,0x76AF,0x76B0,0x76B3,0x76B5,0x76B6,0x76B7,0x76B8,/* 0x90-0x97 */ + 0x76B9,0x76BA,0x76BB,0x76BC,0x76BD,0x76BE,0x76C0,0x76C1,/* 0x98-0x9F */ + 0x76C3,0x554A,0x963F,0x57C3,0x6328,0x54CE,0x5509,0x54C0,/* 0xA0-0xA7 */ + 0x7691,0x764C,0x853C,0x77EE,0x827E,0x788D,0x7231,0x9698,/* 0xA8-0xAF */ + 0x978D,0x6C28,0x5B89,0x4FFA,0x6309,0x6697,0x5CB8,0x80FA,/* 
0xB0-0xB7 */ + 0x6848,0x80AE,0x6602,0x76CE,0x51F9,0x6556,0x71AC,0x7FF1,/* 0xB8-0xBF */ + 0x8884,0x50B2,0x5965,0x61CA,0x6FB3,0x82AD,0x634C,0x6252,/* 0xC0-0xC7 */ + 0x53ED,0x5427,0x7B06,0x516B,0x75A4,0x5DF4,0x62D4,0x8DCB,/* 0xC8-0xCF */ + 0x9776,0x628A,0x8019,0x575D,0x9738,0x7F62,0x7238,0x767D,/* 0xD0-0xD7 */ + 0x67CF,0x767E,0x6446,0x4F70,0x8D25,0x62DC,0x7A17,0x6591,/* 0xD8-0xDF */ + 0x73ED,0x642C,0x6273,0x822C,0x9881,0x677F,0x7248,0x626E,/* 0xE0-0xE7 */ + 0x62CC,0x4F34,0x74E3,0x534A,0x529E,0x7ECA,0x90A6,0x5E2E,/* 0xE8-0xEF */ + 0x6886,0x699C,0x8180,0x7ED1,0x68D2,0x78C5,0x868C,0x9551,/* 0xF0-0xF7 */ + 0x508D,0x8C24,0x82DE,0x80DE,0x5305,0x8912,0x5265,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x76C4,0x76C7,0x76C9,0x76CB,0x76CC,0x76D3,0x76D5,0x76D9,/* 0x40-0x47 */ + 0x76DA,0x76DC,0x76DD,0x76DE,0x76E0,0x76E1,0x76E2,0x76E3,/* 0x48-0x4F */ + 0x76E4,0x76E6,0x76E7,0x76E8,0x76E9,0x76EA,0x76EB,0x76EC,/* 0x50-0x57 */ + 0x76ED,0x76F0,0x76F3,0x76F5,0x76F6,0x76F7,0x76FA,0x76FB,/* 0x58-0x5F */ + 0x76FD,0x76FF,0x7700,0x7702,0x7703,0x7705,0x7706,0x770A,/* 0x60-0x67 */ + 0x770C,0x770E,0x770F,0x7710,0x7711,0x7712,0x7713,0x7714,/* 0x68-0x6F */ + 0x7715,0x7716,0x7717,0x7718,0x771B,0x771C,0x771D,0x771E,/* 0x70-0x77 */ + 0x7721,0x7723,0x7724,0x7725,0x7727,0x772A,0x772B,0x0000,/* 0x78-0x7F */ + + 0x772C,0x772E,0x7730,0x7731,0x7732,0x7733,0x7734,0x7739,/* 0x80-0x87 */ + 0x773B,0x773D,0x773E,0x773F,0x7742,0x7744,0x7745,0x7746,/* 0x88-0x8F */ + 0x7748,0x7749,0x774A,0x774B,0x774C,0x774D,0x774E,0x774F,/* 0x90-0x97 */ + 0x7752,0x7753,0x7754,0x7755,0x7756,0x7757,0x7758,0x7759,/* 0x98-0x9F */ + 0x775C,0x8584,0x96F9,0x4FDD,0x5821,0x9971,0x5B9D,0x62B1,/* 0xA0-0xA7 */ + 0x62A5,0x66B4,0x8C79,0x9C8D,0x7206,0x676F,0x7891,0x60B2,/* 0xA8-0xAF */ + 0x5351,0x5317,0x8F88,0x80CC,0x8D1D,0x94A1,0x500D,0x72C8,/* 0xB0-0xB7 */ + 0x5907,0x60EB,0x7119,0x88AB,0x5954,0x82EF,0x672C,0x7B28,/* 0xB8-0xBF */ + 0x5D29,0x7EF7,0x752D,0x6CF5,0x8E66,0x8FF8,0x903C,0x9F3B,/* 0xC0-0xC7 */ + 0x6BD4,0x9119,0x7B14,0x5F7C,0x78A7,0x84D6,0x853D,0x6BD5,/* 0xC8-0xCF */ + 0x6BD9,0x6BD6,0x5E01,0x5E87,0x75F9,0x95ED,0x655D,0x5F0A,/* 0xD0-0xD7 */ + 0x5FC5,0x8F9F,0x58C1,0x81C2,0x907F,0x965B,0x97AD,0x8FB9,/* 0xD8-0xDF */ + 0x7F16,0x8D2C,0x6241,0x4FBF,0x53D8,0x535E,0x8FA8,0x8FA9,/* 0xE0-0xE7 */ + 0x8FAB,0x904D,0x6807,0x5F6A,0x8198,0x8868,0x9CD6,0x618B,/* 0xE8-0xEF */ + 0x522B,0x762A,0x5F6C,0x658C,0x6FD2,0x6EE8,0x5BBE,0x6448,/* 0xF0-0xF7 */ + 0x5175,0x51B0,0x67C4,0x4E19,0x79C9,0x997C,0x70B3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x775D,0x775E,0x775F,0x7760,0x7764,0x7767,0x7769,0x776A,/* 0x40-0x47 */ + 0x776D,0x776E,0x776F,0x7770,0x7771,0x7772,0x7773,0x7774,/* 0x48-0x4F */ + 0x7775,0x7776,0x7777,0x7778,0x777A,0x777B,0x777C,0x7781,/* 0x50-0x57 */ + 0x7782,0x7783,0x7786,0x7787,0x7788,0x7789,0x778A,0x778B,/* 0x58-0x5F */ + 0x778F,0x7790,0x7793,0x7794,0x7795,0x7796,0x7797,0x7798,/* 0x60-0x67 */ + 0x7799,0x779A,0x779B,0x779C,0x779D,0x779E,0x77A1,0x77A3,/* 0x68-0x6F */ + 0x77A4,0x77A6,0x77A8,0x77AB,0x77AD,0x77AE,0x77AF,0x77B1,/* 0x70-0x77 */ + 0x77B2,0x77B4,0x77B6,0x77B7,0x77B8,0x77B9,0x77BA,0x0000,/* 0x78-0x7F */ + + 0x77BC,0x77BE,0x77C0,0x77C1,0x77C2,0x77C3,0x77C4,0x77C5,/* 0x80-0x87 */ + 0x77C6,0x77C7,0x77C8,0x77C9,0x77CA,0x77CB,0x77CC,0x77CE,/* 0x88-0x8F */ + 0x77CF,0x77D0,0x77D1,0x77D2,0x77D3,0x77D4,0x77D5,0x77D6,/* 0x90-0x97 */ + 0x77D8,0x77D9,0x77DA,0x77DD,0x77DE,0x77DF,0x77E0,0x77E1,/* 0x98-0x9F */ + 0x77E4,0x75C5,0x5E76,0x73BB,0x83E0,0x64AD,0x62E8,0x94B5,/* 0xA0-0xA7 */ + 0x6CE2,0x535A,0x52C3,0x640F,0x94C2,0x7B94,0x4F2F,0x5E1B,/* 0xA8-0xAF */ + 0x8236,0x8116,0x818A,0x6E24,0x6CCA,0x9A73,0x6355,0x535C,/* 0xB0-0xB7 */ + 0x54FA,0x8865,0x57E0,0x4E0D,0x5E03,0x6B65,0x7C3F,0x90E8,/* 0xB8-0xBF */ + 0x6016,0x64E6,0x731C,0x88C1,0x6750,0x624D,0x8D22,0x776C,/* 0xC0-0xC7 */ + 0x8E29,0x91C7,0x5F69,0x83DC,0x8521,0x9910,0x53C2,0x8695,/* 0xC8-0xCF */ + 0x6B8B,0x60ED,0x60E8,0x707F,0x82CD,0x8231,0x4ED3,0x6CA7,/* 0xD0-0xD7 */ + 0x85CF,0x64CD,0x7CD9,0x69FD,0x66F9,0x8349,0x5395,0x7B56,/* 0xD8-0xDF */ + 0x4FA7,0x518C,0x6D4B,0x5C42,0x8E6D,0x63D2,0x53C9,0x832C,/* 0xE0-0xE7 */ + 0x8336,0x67E5,0x78B4,0x643D,0x5BDF,0x5C94,0x5DEE,0x8BE7,/* 0xE8-0xEF */ + 0x62C6,0x67F4,0x8C7A,0x6400,0x63BA,0x8749,0x998B,0x8C17,/* 0xF0-0xF7 */ + 0x7F20,0x94F2,0x4EA7,0x9610,0x98A4,0x660C,0x7316,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x77E6,0x77E8,0x77EA,0x77EF,0x77F0,0x77F1,0x77F2,0x77F4,/* 0x40-0x47 */ + 0x77F5,0x77F7,0x77F9,0x77FA,0x77FB,0x77FC,0x7803,0x7804,/* 0x48-0x4F */ + 0x7805,0x7806,0x7807,0x7808,0x780A,0x780B,0x780E,0x780F,/* 0x50-0x57 */ + 0x7810,0x7813,0x7815,0x7819,0x781B,0x781E,0x7820,0x7821,/* 0x58-0x5F */ + 0x7822,0x7824,0x7828,0x782A,0x782B,0x782E,0x782F,0x7831,/* 0x60-0x67 */ + 0x7832,0x7833,0x7835,0x7836,0x783D,0x783F,0x7841,0x7842,/* 0x68-0x6F */ + 0x7843,0x7844,0x7846,0x7848,0x7849,0x784A,0x784B,0x784D,/* 0x70-0x77 */ + 0x784F,0x7851,0x7853,0x7854,0x7858,0x7859,0x785A,0x0000,/* 0x78-0x7F */ + + 0x785B,0x785C,0x785E,0x785F,0x7860,0x7861,0x7862,0x7863,/* 0x80-0x87 */ + 0x7864,0x7865,0x7866,0x7867,0x7868,0x7869,0x786F,0x7870,/* 0x88-0x8F */ + 0x7871,0x7872,0x7873,0x7874,0x7875,0x7876,0x7878,0x7879,/* 0x90-0x97 */ + 0x787A,0x787B,0x787D,0x787E,0x787F,0x7880,0x7881,0x7882,/* 0x98-0x9F */ + 
0x7883,0x573A,0x5C1D,0x5E38,0x957F,0x507F,0x80A0,0x5382,/* 0xA0-0xA7 */ + 0x655E,0x7545,0x5531,0x5021,0x8D85,0x6284,0x949E,0x671D,/* 0xA8-0xAF */ + 0x5632,0x6F6E,0x5DE2,0x5435,0x7092,0x8F66,0x626F,0x64A4,/* 0xB0-0xB7 */ + 0x63A3,0x5F7B,0x6F88,0x90F4,0x81E3,0x8FB0,0x5C18,0x6668,/* 0xB8-0xBF */ + 0x5FF1,0x6C89,0x9648,0x8D81,0x886C,0x6491,0x79F0,0x57CE,/* 0xC0-0xC7 */ + 0x6A59,0x6210,0x5448,0x4E58,0x7A0B,0x60E9,0x6F84,0x8BDA,/* 0xC8-0xCF */ + 0x627F,0x901E,0x9A8B,0x79E4,0x5403,0x75F4,0x6301,0x5319,/* 0xD0-0xD7 */ + 0x6C60,0x8FDF,0x5F1B,0x9A70,0x803B,0x9F7F,0x4F88,0x5C3A,/* 0xD8-0xDF */ + 0x8D64,0x7FC5,0x65A5,0x70BD,0x5145,0x51B2,0x866B,0x5D07,/* 0xE0-0xE7 */ + 0x5BA0,0x62BD,0x916C,0x7574,0x8E0C,0x7A20,0x6101,0x7B79,/* 0xE8-0xEF */ + 0x4EC7,0x7EF8,0x7785,0x4E11,0x81ED,0x521D,0x51FA,0x6A71,/* 0xF0-0xF7 */ + 0x53A8,0x8E87,0x9504,0x96CF,0x6EC1,0x9664,0x695A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7884,0x7885,0x7886,0x7888,0x788A,0x788B,0x788F,0x7890,/* 0x40-0x47 */ + 0x7892,0x7894,0x7895,0x7896,0x7899,0x789D,0x789E,0x78A0,/* 0x48-0x4F */ + 0x78A2,0x78A4,0x78A6,0x78A8,0x78A9,0x78AA,0x78AB,0x78AC,/* 0x50-0x57 */ + 0x78AD,0x78AE,0x78AF,0x78B5,0x78B6,0x78B7,0x78B8,0x78BA,/* 0x58-0x5F */ + 0x78BB,0x78BC,0x78BD,0x78BF,0x78C0,0x78C2,0x78C3,0x78C4,/* 0x60-0x67 */ + 0x78C6,0x78C7,0x78C8,0x78CC,0x78CD,0x78CE,0x78CF,0x78D1,/* 0x68-0x6F */ + 0x78D2,0x78D3,0x78D6,0x78D7,0x78D8,0x78DA,0x78DB,0x78DC,/* 0x70-0x77 */ + 0x78DD,0x78DE,0x78DF,0x78E0,0x78E1,0x78E2,0x78E3,0x0000,/* 0x78-0x7F */ + + 0x78E4,0x78E5,0x78E6,0x78E7,0x78E9,0x78EA,0x78EB,0x78ED,/* 0x80-0x87 */ + 0x78EE,0x78EF,0x78F0,0x78F1,0x78F3,0x78F5,0x78F6,0x78F8,/* 0x88-0x8F */ + 0x78F9,0x78FB,0x78FC,0x78FD,0x78FE,0x78FF,0x7900,0x7902,/* 0x90-0x97 */ + 0x7903,0x7904,0x7906,0x7907,0x7908,0x7909,0x790A,0x790B,/* 0x98-0x9F */ + 0x790C,0x7840,0x50A8,0x77D7,0x6410,0x89E6,0x5904,0x63E3,/* 0xA0-0xA7 */ + 0x5DDD,0x7A7F,0x693D,0x4F20,0x8239,0x5598,0x4E32,0x75AE,/* 0xA8-0xAF */ + 0x7A97,0x5E62,0x5E8A,0x95EF,0x521B,0x5439,0x708A,0x6376,/* 0xB0-0xB7 */ + 0x9524,0x5782,0x6625,0x693F,0x9187,0x5507,0x6DF3,0x7EAF,/* 0xB8-0xBF */ + 0x8822,0x6233,0x7EF0,0x75B5,0x8328,0x78C1,0x96CC,0x8F9E,/* 0xC0-0xC7 */ + 0x6148,0x74F7,0x8BCD,0x6B64,0x523A,0x8D50,0x6B21,0x806A,/* 0xC8-0xCF */ + 0x8471,0x56F1,0x5306,0x4ECE,0x4E1B,0x51D1,0x7C97,0x918B,/* 0xD0-0xD7 */ + 0x7C07,0x4FC3,0x8E7F,0x7BE1,0x7A9C,0x6467,0x5D14,0x50AC,/* 0xD8-0xDF */ + 0x8106,0x7601,0x7CB9,0x6DEC,0x7FE0,0x6751,0x5B58,0x5BF8,/* 0xE0-0xE7 */ + 0x78CB,0x64AE,0x6413,0x63AA,0x632B,0x9519,0x642D,0x8FBE,/* 0xE8-0xEF */ + 0x7B54,0x7629,0x6253,0x5927,0x5446,0x6B79,0x50A3,0x6234,/* 0xF0-0xF7 */ + 0x5E26,0x6B86,0x4EE3,0x8D37,0x888B,0x5F85,0x902E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x790D,0x790E,0x790F,0x7910,0x7911,0x7912,0x7914,0x7915,/* 0x40-0x47 */ + 0x7916,0x7917,0x7918,0x7919,0x791A,0x791B,0x791C,0x791D,/* 0x48-0x4F */ + 0x791F,0x7920,0x7921,0x7922,0x7923,0x7925,0x7926,0x7927,/* 0x50-0x57 */ + 0x7928,0x7929,0x792A,0x792B,0x792C,0x792D,0x792E,0x792F,/* 0x58-0x5F */ + 0x7930,0x7931,0x7932,0x7933,0x7935,0x7936,0x7937,0x7938,/* 0x60-0x67 */ + 0x7939,0x793D,0x793F,0x7942,0x7943,0x7944,0x7945,0x7947,/* 0x68-0x6F */ + 0x794A,0x794B,0x794C,0x794D,0x794E,0x794F,0x7950,0x7951,/* 0x70-0x77 */ + 0x7952,0x7954,0x7955,0x7958,0x7959,0x7961,0x7963,0x0000,/* 0x78-0x7F */ + + 0x7964,0x7966,0x7969,0x796A,0x796B,0x796C,0x796E,0x7970,/* 0x80-0x87 */ + 0x7971,0x7972,0x7973,0x7974,0x7975,0x7976,0x7979,0x797B,/* 0x88-0x8F */ + 0x797C,0x797D,0x797E,0x797F,0x7982,0x7983,0x7986,0x7987,/* 0x90-0x97 */ + 0x7988,0x7989,0x798B,0x798C,0x798D,0x798E,0x7990,0x7991,/* 0x98-0x9F */ + 0x7992,0x6020,0x803D,0x62C5,0x4E39,0x5355,0x90F8,0x63B8,/* 0xA0-0xA7 */ + 0x80C6,0x65E6,0x6C2E,0x4F46,0x60EE,0x6DE1,0x8BDE,0x5F39,/* 0xA8-0xAF */ + 0x86CB,0x5F53,0x6321,0x515A,0x8361,0x6863,0x5200,0x6363,/* 0xB0-0xB7 */ + 0x8E48,0x5012,0x5C9B,0x7977,0x5BFC,0x5230,0x7A3B,0x60BC,/* 0xB8-0xBF */ + 0x9053,0x76D7,0x5FB7,0x5F97,0x7684,0x8E6C,0x706F,0x767B,/* 0xC0-0xC7 */ + 0x7B49,0x77AA,0x51F3,0x9093,0x5824,0x4F4E,0x6EF4,0x8FEA,/* 0xC8-0xCF */ + 0x654C,0x7B1B,0x72C4,0x6DA4,0x7FDF,0x5AE1,0x62B5,0x5E95,/* 0xD0-0xD7 */ + 0x5730,0x8482,0x7B2C,0x5E1D,0x5F1F,0x9012,0x7F14,0x98A0,/* 0xD8-0xDF */ + 0x6382,0x6EC7,0x7898,0x70B9,0x5178,0x975B,0x57AB,0x7535,/* 0xE0-0xE7 */ + 0x4F43,0x7538,0x5E97,0x60E6,0x5960,0x6DC0,0x6BBF,0x7889,/* 0xE8-0xEF */ + 0x53FC,0x96D5,0x51CB,0x5201,0x6389,0x540A,0x9493,0x8C03,/* 0xF0-0xF7 */ + 0x8DCC,0x7239,0x789F,0x8776,0x8FED,0x8C0D,0x53E0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7993,0x7994,0x7995,0x7996,0x7997,0x7998,0x7999,0x799B,/* 0x40-0x47 */ + 0x799C,0x799D,0x799E,0x799F,0x79A0,0x79A1,0x79A2,0x79A3,/* 0x48-0x4F */ + 0x79A4,0x79A5,0x79A6,0x79A8,0x79A9,0x79AA,0x79AB,0x79AC,/* 0x50-0x57 */ + 0x79AD,0x79AE,0x79AF,0x79B0,0x79B1,0x79B2,0x79B4,0x79B5,/* 0x58-0x5F */ + 0x79B6,0x79B7,0x79B8,0x79BC,0x79BF,0x79C2,0x79C4,0x79C5,/* 0x60-0x67 */ + 0x79C7,0x79C8,0x79CA,0x79CC,0x79CE,0x79CF,0x79D0,0x79D3,/* 0x68-0x6F */ + 0x79D4,0x79D6,0x79D7,0x79D9,0x79DA,0x79DB,0x79DC,0x79DD,/* 0x70-0x77 */ + 0x79DE,0x79E0,0x79E1,0x79E2,0x79E5,0x79E8,0x79EA,0x0000,/* 0x78-0x7F */ + + 0x79EC,0x79EE,0x79F1,0x79F2,0x79F3,0x79F4,0x79F5,0x79F6,/* 0x80-0x87 */ + 0x79F7,0x79F9,0x79FA,0x79FC,0x79FE,0x79FF,0x7A01,0x7A04,/* 0x88-0x8F */ + 
0x7A05,0x7A07,0x7A08,0x7A09,0x7A0A,0x7A0C,0x7A0F,0x7A10,/* 0x90-0x97 */ + 0x7A11,0x7A12,0x7A13,0x7A15,0x7A16,0x7A18,0x7A19,0x7A1B,/* 0x98-0x9F */ + 0x7A1C,0x4E01,0x76EF,0x53EE,0x9489,0x9876,0x9F0E,0x952D,/* 0xA0-0xA7 */ + 0x5B9A,0x8BA2,0x4E22,0x4E1C,0x51AC,0x8463,0x61C2,0x52A8,/* 0xA8-0xAF */ + 0x680B,0x4F97,0x606B,0x51BB,0x6D1E,0x515C,0x6296,0x6597,/* 0xB0-0xB7 */ + 0x9661,0x8C46,0x9017,0x75D8,0x90FD,0x7763,0x6BD2,0x728A,/* 0xB8-0xBF */ + 0x72EC,0x8BFB,0x5835,0x7779,0x8D4C,0x675C,0x9540,0x809A,/* 0xC0-0xC7 */ + 0x5EA6,0x6E21,0x5992,0x7AEF,0x77ED,0x953B,0x6BB5,0x65AD,/* 0xC8-0xCF */ + 0x7F0E,0x5806,0x5151,0x961F,0x5BF9,0x58A9,0x5428,0x8E72,/* 0xD0-0xD7 */ + 0x6566,0x987F,0x56E4,0x949D,0x76FE,0x9041,0x6387,0x54C6,/* 0xD8-0xDF */ + 0x591A,0x593A,0x579B,0x8EB2,0x6735,0x8DFA,0x8235,0x5241,/* 0xE0-0xE7 */ + 0x60F0,0x5815,0x86FE,0x5CE8,0x9E45,0x4FC4,0x989D,0x8BB9,/* 0xE8-0xEF */ + 0x5A25,0x6076,0x5384,0x627C,0x904F,0x9102,0x997F,0x6069,/* 0xF0-0xF7 */ + 0x800C,0x513F,0x8033,0x5C14,0x9975,0x6D31,0x4E8C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A1D,0x7A1F,0x7A21,0x7A22,0x7A24,0x7A25,0x7A26,0x7A27,/* 0x40-0x47 */ + 0x7A28,0x7A29,0x7A2A,0x7A2B,0x7A2C,0x7A2D,0x7A2E,0x7A2F,/* 0x48-0x4F */ + 0x7A30,0x7A31,0x7A32,0x7A34,0x7A35,0x7A36,0x7A38,0x7A3A,/* 0x50-0x57 */ + 0x7A3E,0x7A40,0x7A41,0x7A42,0x7A43,0x7A44,0x7A45,0x7A47,/* 0x58-0x5F */ + 0x7A48,0x7A49,0x7A4A,0x7A4B,0x7A4C,0x7A4D,0x7A4E,0x7A4F,/* 0x60-0x67 */ + 0x7A50,0x7A52,0x7A53,0x7A54,0x7A55,0x7A56,0x7A58,0x7A59,/* 0x68-0x6F */ + 0x7A5A,0x7A5B,0x7A5C,0x7A5D,0x7A5E,0x7A5F,0x7A60,0x7A61,/* 0x70-0x77 */ + 0x7A62,0x7A63,0x7A64,0x7A65,0x7A66,0x7A67,0x7A68,0x0000,/* 0x78-0x7F */ + + 0x7A69,0x7A6A,0x7A6B,0x7A6C,0x7A6D,0x7A6E,0x7A6F,0x7A71,/* 0x80-0x87 */ + 0x7A72,0x7A73,0x7A75,0x7A7B,0x7A7C,0x7A7D,0x7A7E,0x7A82,/* 0x88-0x8F */ + 0x7A85,0x7A87,0x7A89,0x7A8A,0x7A8B,0x7A8C,0x7A8E,0x7A8F,/* 0x90-0x97 */ + 0x7A90,0x7A93,0x7A94,0x7A99,0x7A9A,0x7A9B,0x7A9E,0x7AA1,/* 0x98-0x9F */ + 0x7AA2,0x8D30,0x53D1,0x7F5A,0x7B4F,0x4F10,0x4E4F,0x9600,/* 0xA0-0xA7 */ + 0x6CD5,0x73D0,0x85E9,0x5E06,0x756A,0x7FFB,0x6A0A,0x77FE,/* 0xA8-0xAF */ + 0x9492,0x7E41,0x51E1,0x70E6,0x53CD,0x8FD4,0x8303,0x8D29,/* 0xB0-0xB7 */ + 0x72AF,0x996D,0x6CDB,0x574A,0x82B3,0x65B9,0x80AA,0x623F,/* 0xB8-0xBF */ + 0x9632,0x59A8,0x4EFF,0x8BBF,0x7EBA,0x653E,0x83F2,0x975E,/* 0xC0-0xC7 */ + 0x5561,0x98DE,0x80A5,0x532A,0x8BFD,0x5420,0x80BA,0x5E9F,/* 0xC8-0xCF */ + 0x6CB8,0x8D39,0x82AC,0x915A,0x5429,0x6C1B,0x5206,0x7EB7,/* 0xD0-0xD7 */ + 0x575F,0x711A,0x6C7E,0x7C89,0x594B,0x4EFD,0x5FFF,0x6124,/* 0xD8-0xDF */ + 0x7CAA,0x4E30,0x5C01,0x67AB,0x8702,0x5CF0,0x950B,0x98CE,/* 0xE0-0xE7 */ + 0x75AF,0x70FD,0x9022,0x51AF,0x7F1D,0x8BBD,0x5949,0x51E4,/* 0xE8-0xEF */ + 0x4F5B,0x5426,0x592B,0x6577,0x80A4,0x5B75,0x6276,0x62C2,/* 0xF0-0xF7 */ + 0x8F90,0x5E45,0x6C1F,0x7B26,0x4F0F,0x4FD8,0x670D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7AA3,0x7AA4,0x7AA7,0x7AA9,0x7AAA,0x7AAB,0x7AAE,0x7AAF,/* 0x40-0x47 */ + 0x7AB0,0x7AB1,0x7AB2,0x7AB4,0x7AB5,0x7AB6,0x7AB7,0x7AB8,/* 0x48-0x4F */ + 0x7AB9,0x7ABA,0x7ABB,0x7ABC,0x7ABD,0x7ABE,0x7AC0,0x7AC1,/* 0x50-0x57 */ + 0x7AC2,0x7AC3,0x7AC4,0x7AC5,0x7AC6,0x7AC7,0x7AC8,0x7AC9,/* 0x58-0x5F */ + 0x7ACA,0x7ACC,0x7ACD,0x7ACE,0x7ACF,0x7AD0,0x7AD1,0x7AD2,/* 0x60-0x67 */ + 0x7AD3,0x7AD4,0x7AD5,0x7AD7,0x7AD8,0x7ADA,0x7ADB,0x7ADC,/* 0x68-0x6F */ + 0x7ADD,0x7AE1,0x7AE2,0x7AE4,0x7AE7,0x7AE8,0x7AE9,0x7AEA,/* 0x70-0x77 */ + 0x7AEB,0x7AEC,0x7AEE,0x7AF0,0x7AF1,0x7AF2,0x7AF3,0x0000,/* 0x78-0x7F */ + + 0x7AF4,0x7AF5,0x7AF6,0x7AF7,0x7AF8,0x7AFB,0x7AFC,0x7AFE,/* 0x80-0x87 */ + 0x7B00,0x7B01,0x7B02,0x7B05,0x7B07,0x7B09,0x7B0C,0x7B0D,/* 0x88-0x8F */ + 0x7B0E,0x7B10,0x7B12,0x7B13,0x7B16,0x7B17,0x7B18,0x7B1A,/* 0x90-0x97 */ + 0x7B1C,0x7B1D,0x7B1F,0x7B21,0x7B22,0x7B23,0x7B27,0x7B29,/* 0x98-0x9F */ + 0x7B2D,0x6D6E,0x6DAA,0x798F,0x88B1,0x5F17,0x752B,0x629A,/* 0xA0-0xA7 */ + 0x8F85,0x4FEF,0x91DC,0x65A7,0x812F,0x8151,0x5E9C,0x8150,/* 0xA8-0xAF */ + 0x8D74,0x526F,0x8986,0x8D4B,0x590D,0x5085,0x4ED8,0x961C,/* 0xB0-0xB7 */ + 0x7236,0x8179,0x8D1F,0x5BCC,0x8BA3,0x9644,0x5987,0x7F1A,/* 0xB8-0xBF */ + 0x5490,0x5676,0x560E,0x8BE5,0x6539,0x6982,0x9499,0x76D6,/* 0xC0-0xC7 */ + 0x6E89,0x5E72,0x7518,0x6746,0x67D1,0x7AFF,0x809D,0x8D76,/* 0xC8-0xCF */ + 0x611F,0x79C6,0x6562,0x8D63,0x5188,0x521A,0x94A2,0x7F38,/* 0xD0-0xD7 */ + 0x809B,0x7EB2,0x5C97,0x6E2F,0x6760,0x7BD9,0x768B,0x9AD8,/* 0xD8-0xDF */ + 0x818F,0x7F94,0x7CD5,0x641E,0x9550,0x7A3F,0x544A,0x54E5,/* 0xE0-0xE7 */ + 0x6B4C,0x6401,0x6208,0x9E3D,0x80F3,0x7599,0x5272,0x9769,/* 0xE8-0xEF */ + 0x845B,0x683C,0x86E4,0x9601,0x9694,0x94EC,0x4E2A,0x5404,/* 0xF0-0xF7 */ + 0x7ED9,0x6839,0x8DDF,0x8015,0x66F4,0x5E9A,0x7FB9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7B2F,0x7B30,0x7B32,0x7B34,0x7B35,0x7B36,0x7B37,0x7B39,/* 0x40-0x47 */ + 0x7B3B,0x7B3D,0x7B3F,0x7B40,0x7B41,0x7B42,0x7B43,0x7B44,/* 0x48-0x4F */ + 0x7B46,0x7B48,0x7B4A,0x7B4D,0x7B4E,0x7B53,0x7B55,0x7B57,/* 0x50-0x57 */ + 0x7B59,0x7B5C,0x7B5E,0x7B5F,0x7B61,0x7B63,0x7B64,0x7B65,/* 0x58-0x5F */ + 0x7B66,0x7B67,0x7B68,0x7B69,0x7B6A,0x7B6B,0x7B6C,0x7B6D,/* 0x60-0x67 */ + 0x7B6F,0x7B70,0x7B73,0x7B74,0x7B76,0x7B78,0x7B7A,0x7B7C,/* 0x68-0x6F */ + 0x7B7D,0x7B7F,0x7B81,0x7B82,0x7B83,0x7B84,0x7B86,0x7B87,/* 0x70-0x77 */ + 0x7B88,0x7B89,0x7B8A,0x7B8B,0x7B8C,0x7B8E,0x7B8F,0x0000,/* 0x78-0x7F */ + + 
0x7B91,0x7B92,0x7B93,0x7B96,0x7B98,0x7B99,0x7B9A,0x7B9B,/* 0x80-0x87 */ + 0x7B9E,0x7B9F,0x7BA0,0x7BA3,0x7BA4,0x7BA5,0x7BAE,0x7BAF,/* 0x88-0x8F */ + 0x7BB0,0x7BB2,0x7BB3,0x7BB5,0x7BB6,0x7BB7,0x7BB9,0x7BBA,/* 0x90-0x97 */ + 0x7BBB,0x7BBC,0x7BBD,0x7BBE,0x7BBF,0x7BC0,0x7BC2,0x7BC3,/* 0x98-0x9F */ + 0x7BC4,0x57C2,0x803F,0x6897,0x5DE5,0x653B,0x529F,0x606D,/* 0xA0-0xA7 */ + 0x9F9A,0x4F9B,0x8EAC,0x516C,0x5BAB,0x5F13,0x5DE9,0x6C5E,/* 0xA8-0xAF */ + 0x62F1,0x8D21,0x5171,0x94A9,0x52FE,0x6C9F,0x82DF,0x72D7,/* 0xB0-0xB7 */ + 0x57A2,0x6784,0x8D2D,0x591F,0x8F9C,0x83C7,0x5495,0x7B8D,/* 0xB8-0xBF */ + 0x4F30,0x6CBD,0x5B64,0x59D1,0x9F13,0x53E4,0x86CA,0x9AA8,/* 0xC0-0xC7 */ + 0x8C37,0x80A1,0x6545,0x987E,0x56FA,0x96C7,0x522E,0x74DC,/* 0xC8-0xCF */ + 0x5250,0x5BE1,0x6302,0x8902,0x4E56,0x62D0,0x602A,0x68FA,/* 0xD0-0xD7 */ + 0x5173,0x5B98,0x51A0,0x89C2,0x7BA1,0x9986,0x7F50,0x60EF,/* 0xD8-0xDF */ + 0x704C,0x8D2F,0x5149,0x5E7F,0x901B,0x7470,0x89C4,0x572D,/* 0xE0-0xE7 */ + 0x7845,0x5F52,0x9F9F,0x95FA,0x8F68,0x9B3C,0x8BE1,0x7678,/* 0xE8-0xEF */ + 0x6842,0x67DC,0x8DEA,0x8D35,0x523D,0x8F8A,0x6EDA,0x68CD,/* 0xF0-0xF7 */ + 0x9505,0x90ED,0x56FD,0x679C,0x88F9,0x8FC7,0x54C8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7BC5,0x7BC8,0x7BC9,0x7BCA,0x7BCB,0x7BCD,0x7BCE,0x7BCF,/* 0x40-0x47 */ + 0x7BD0,0x7BD2,0x7BD4,0x7BD5,0x7BD6,0x7BD7,0x7BD8,0x7BDB,/* 0x48-0x4F */ + 0x7BDC,0x7BDE,0x7BDF,0x7BE0,0x7BE2,0x7BE3,0x7BE4,0x7BE7,/* 0x50-0x57 */ + 0x7BE8,0x7BE9,0x7BEB,0x7BEC,0x7BED,0x7BEF,0x7BF0,0x7BF2,/* 0x58-0x5F */ + 0x7BF3,0x7BF4,0x7BF5,0x7BF6,0x7BF8,0x7BF9,0x7BFA,0x7BFB,/* 0x60-0x67 */ + 0x7BFD,0x7BFF,0x7C00,0x7C01,0x7C02,0x7C03,0x7C04,0x7C05,/* 0x68-0x6F */ + 0x7C06,0x7C08,0x7C09,0x7C0A,0x7C0D,0x7C0E,0x7C10,0x7C11,/* 0x70-0x77 */ + 0x7C12,0x7C13,0x7C14,0x7C15,0x7C17,0x7C18,0x7C19,0x0000,/* 0x78-0x7F */ + + 0x7C1A,0x7C1B,0x7C1C,0x7C1D,0x7C1E,0x7C20,0x7C21,0x7C22,/* 0x80-0x87 */ + 0x7C23,0x7C24,0x7C25,0x7C28,0x7C29,0x7C2B,0x7C2C,0x7C2D,/* 0x88-0x8F */ + 0x7C2E,0x7C2F,0x7C30,0x7C31,0x7C32,0x7C33,0x7C34,0x7C35,/* 0x90-0x97 */ + 0x7C36,0x7C37,0x7C39,0x7C3A,0x7C3B,0x7C3C,0x7C3D,0x7C3E,/* 0x98-0x9F */ + 0x7C42,0x9AB8,0x5B69,0x6D77,0x6C26,0x4EA5,0x5BB3,0x9A87,/* 0xA0-0xA7 */ + 0x9163,0x61A8,0x90AF,0x97E9,0x542B,0x6DB5,0x5BD2,0x51FD,/* 0xA8-0xAF */ + 0x558A,0x7F55,0x7FF0,0x64BC,0x634D,0x65F1,0x61BE,0x608D,/* 0xB0-0xB7 */ + 0x710A,0x6C57,0x6C49,0x592F,0x676D,0x822A,0x58D5,0x568E,/* 0xB8-0xBF */ + 0x8C6A,0x6BEB,0x90DD,0x597D,0x8017,0x53F7,0x6D69,0x5475,/* 0xC0-0xC7 */ + 0x559D,0x8377,0x83CF,0x6838,0x79BE,0x548C,0x4F55,0x5408,/* 0xC8-0xCF */ + 0x76D2,0x8C89,0x9602,0x6CB3,0x6DB8,0x8D6B,0x8910,0x9E64,/* 0xD0-0xD7 */ + 0x8D3A,0x563F,0x9ED1,0x75D5,0x5F88,0x72E0,0x6068,0x54FC,/* 0xD8-0xDF */ + 0x4EA8,0x6A2A,0x8861,0x6052,0x8F70,0x54C4,0x70D8,0x8679,/* 0xE0-0xE7 */ + 0x9E3F,0x6D2A,0x5B8F,0x5F18,0x7EA2,0x5589,0x4FAF,0x7334,/* 0xE8-0xEF */ + 0x543C,0x539A,0x5019,0x540E,0x547C,0x4E4E,0x5FFD,0x745A,/* 0xF0-0xF7 */ + 
0x58F6,0x846B,0x80E1,0x8774,0x72D0,0x7CCA,0x6E56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7C43,0x7C44,0x7C45,0x7C46,0x7C47,0x7C48,0x7C49,0x7C4A,/* 0x40-0x47 */ + 0x7C4B,0x7C4C,0x7C4E,0x7C4F,0x7C50,0x7C51,0x7C52,0x7C53,/* 0x48-0x4F */ + 0x7C54,0x7C55,0x7C56,0x7C57,0x7C58,0x7C59,0x7C5A,0x7C5B,/* 0x50-0x57 */ + 0x7C5C,0x7C5D,0x7C5E,0x7C5F,0x7C60,0x7C61,0x7C62,0x7C63,/* 0x58-0x5F */ + 0x7C64,0x7C65,0x7C66,0x7C67,0x7C68,0x7C69,0x7C6A,0x7C6B,/* 0x60-0x67 */ + 0x7C6C,0x7C6D,0x7C6E,0x7C6F,0x7C70,0x7C71,0x7C72,0x7C75,/* 0x68-0x6F */ + 0x7C76,0x7C77,0x7C78,0x7C79,0x7C7A,0x7C7E,0x7C7F,0x7C80,/* 0x70-0x77 */ + 0x7C81,0x7C82,0x7C83,0x7C84,0x7C85,0x7C86,0x7C87,0x0000,/* 0x78-0x7F */ + + 0x7C88,0x7C8A,0x7C8B,0x7C8C,0x7C8D,0x7C8E,0x7C8F,0x7C90,/* 0x80-0x87 */ + 0x7C93,0x7C94,0x7C96,0x7C99,0x7C9A,0x7C9B,0x7CA0,0x7CA1,/* 0x88-0x8F */ + 0x7CA3,0x7CA6,0x7CA7,0x7CA8,0x7CA9,0x7CAB,0x7CAC,0x7CAD,/* 0x90-0x97 */ + 0x7CAF,0x7CB0,0x7CB4,0x7CB5,0x7CB6,0x7CB7,0x7CB8,0x7CBA,/* 0x98-0x9F */ + 0x7CBB,0x5F27,0x864E,0x552C,0x62A4,0x4E92,0x6CAA,0x6237,/* 0xA0-0xA7 */ + 0x82B1,0x54D7,0x534E,0x733E,0x6ED1,0x753B,0x5212,0x5316,/* 0xA8-0xAF */ + 0x8BDD,0x69D0,0x5F8A,0x6000,0x6DEE,0x574F,0x6B22,0x73AF,/* 0xB0-0xB7 */ + 0x6853,0x8FD8,0x7F13,0x6362,0x60A3,0x5524,0x75EA,0x8C62,/* 0xB8-0xBF */ + 0x7115,0x6DA3,0x5BA6,0x5E7B,0x8352,0x614C,0x9EC4,0x78FA,/* 0xC0-0xC7 */ + 0x8757,0x7C27,0x7687,0x51F0,0x60F6,0x714C,0x6643,0x5E4C,/* 0xC8-0xCF */ + 0x604D,0x8C0E,0x7070,0x6325,0x8F89,0x5FBD,0x6062,0x86D4,/* 0xD0-0xD7 */ + 0x56DE,0x6BC1,0x6094,0x6167,0x5349,0x60E0,0x6666,0x8D3F,/* 0xD8-0xDF */ + 0x79FD,0x4F1A,0x70E9,0x6C47,0x8BB3,0x8BF2,0x7ED8,0x8364,/* 0xE0-0xE7 */ + 0x660F,0x5A5A,0x9B42,0x6D51,0x6DF7,0x8C41,0x6D3B,0x4F19,/* 0xE8-0xEF */ + 0x706B,0x83B7,0x6216,0x60D1,0x970D,0x8D27,0x7978,0x51FB,/* 0xF0-0xF7 */ + 0x573E,0x57FA,0x673A,0x7578,0x7A3D,0x79EF,0x7B95,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7CBF,0x7CC0,0x7CC2,0x7CC3,0x7CC4,0x7CC6,0x7CC9,0x7CCB,/* 0x40-0x47 */ + 0x7CCE,0x7CCF,0x7CD0,0x7CD1,0x7CD2,0x7CD3,0x7CD4,0x7CD8,/* 0x48-0x4F */ + 0x7CDA,0x7CDB,0x7CDD,0x7CDE,0x7CE1,0x7CE2,0x7CE3,0x7CE4,/* 0x50-0x57 */ + 0x7CE5,0x7CE6,0x7CE7,0x7CE9,0x7CEA,0x7CEB,0x7CEC,0x7CED,/* 0x58-0x5F */ + 0x7CEE,0x7CF0,0x7CF1,0x7CF2,0x7CF3,0x7CF4,0x7CF5,0x7CF6,/* 0x60-0x67 */ + 0x7CF7,0x7CF9,0x7CFA,0x7CFC,0x7CFD,0x7CFE,0x7CFF,0x7D00,/* 
0x68-0x6F */ + 0x7D01,0x7D02,0x7D03,0x7D04,0x7D05,0x7D06,0x7D07,0x7D08,/* 0x70-0x77 */ + 0x7D09,0x7D0B,0x7D0C,0x7D0D,0x7D0E,0x7D0F,0x7D10,0x0000,/* 0x78-0x7F */ + + 0x7D11,0x7D12,0x7D13,0x7D14,0x7D15,0x7D16,0x7D17,0x7D18,/* 0x80-0x87 */ + 0x7D19,0x7D1A,0x7D1B,0x7D1C,0x7D1D,0x7D1E,0x7D1F,0x7D21,/* 0x88-0x8F */ + 0x7D23,0x7D24,0x7D25,0x7D26,0x7D28,0x7D29,0x7D2A,0x7D2C,/* 0x90-0x97 */ + 0x7D2D,0x7D2E,0x7D30,0x7D31,0x7D32,0x7D33,0x7D34,0x7D35,/* 0x98-0x9F */ + 0x7D36,0x808C,0x9965,0x8FF9,0x6FC0,0x8BA5,0x9E21,0x59EC,/* 0xA0-0xA7 */ + 0x7EE9,0x7F09,0x5409,0x6781,0x68D8,0x8F91,0x7C4D,0x96C6,/* 0xA8-0xAF */ + 0x53CA,0x6025,0x75BE,0x6C72,0x5373,0x5AC9,0x7EA7,0x6324,/* 0xB0-0xB7 */ + 0x51E0,0x810A,0x5DF1,0x84DF,0x6280,0x5180,0x5B63,0x4F0E,/* 0xB8-0xBF */ + 0x796D,0x5242,0x60B8,0x6D4E,0x5BC4,0x5BC2,0x8BA1,0x8BB0,/* 0xC0-0xC7 */ + 0x65E2,0x5FCC,0x9645,0x5993,0x7EE7,0x7EAA,0x5609,0x67B7,/* 0xC8-0xCF */ + 0x5939,0x4F73,0x5BB6,0x52A0,0x835A,0x988A,0x8D3E,0x7532,/* 0xD0-0xD7 */ + 0x94BE,0x5047,0x7A3C,0x4EF7,0x67B6,0x9A7E,0x5AC1,0x6B7C,/* 0xD8-0xDF */ + 0x76D1,0x575A,0x5C16,0x7B3A,0x95F4,0x714E,0x517C,0x80A9,/* 0xE0-0xE7 */ + 0x8270,0x5978,0x7F04,0x8327,0x68C0,0x67EC,0x78B1,0x7877,/* 0xE8-0xEF */ + 0x62E3,0x6361,0x7B80,0x4FED,0x526A,0x51CF,0x8350,0x69DB,/* 0xF0-0xF7 */ + 0x9274,0x8DF5,0x8D31,0x89C1,0x952E,0x7BAD,0x4EF6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D37,0x7D38,0x7D39,0x7D3A,0x7D3B,0x7D3C,0x7D3D,0x7D3E,/* 0x40-0x47 */ + 0x7D3F,0x7D40,0x7D41,0x7D42,0x7D43,0x7D44,0x7D45,0x7D46,/* 0x48-0x4F */ + 0x7D47,0x7D48,0x7D49,0x7D4A,0x7D4B,0x7D4C,0x7D4D,0x7D4E,/* 0x50-0x57 */ + 0x7D4F,0x7D50,0x7D51,0x7D52,0x7D53,0x7D54,0x7D55,0x7D56,/* 0x58-0x5F */ + 0x7D57,0x7D58,0x7D59,0x7D5A,0x7D5B,0x7D5C,0x7D5D,0x7D5E,/* 0x60-0x67 */ + 0x7D5F,0x7D60,0x7D61,0x7D62,0x7D63,0x7D64,0x7D65,0x7D66,/* 0x68-0x6F */ + 0x7D67,0x7D68,0x7D69,0x7D6A,0x7D6B,0x7D6C,0x7D6D,0x7D6F,/* 0x70-0x77 */ + 0x7D70,0x7D71,0x7D72,0x7D73,0x7D74,0x7D75,0x7D76,0x0000,/* 0x78-0x7F */ + + 0x7D78,0x7D79,0x7D7A,0x7D7B,0x7D7C,0x7D7D,0x7D7E,0x7D7F,/* 0x80-0x87 */ + 0x7D80,0x7D81,0x7D82,0x7D83,0x7D84,0x7D85,0x7D86,0x7D87,/* 0x88-0x8F */ + 0x7D88,0x7D89,0x7D8A,0x7D8B,0x7D8C,0x7D8D,0x7D8E,0x7D8F,/* 0x90-0x97 */ + 0x7D90,0x7D91,0x7D92,0x7D93,0x7D94,0x7D95,0x7D96,0x7D97,/* 0x98-0x9F */ + 0x7D98,0x5065,0x8230,0x5251,0x996F,0x6E10,0x6E85,0x6DA7,/* 0xA0-0xA7 */ + 0x5EFA,0x50F5,0x59DC,0x5C06,0x6D46,0x6C5F,0x7586,0x848B,/* 0xA8-0xAF */ + 0x6868,0x5956,0x8BB2,0x5320,0x9171,0x964D,0x8549,0x6912,/* 0xB0-0xB7 */ + 0x7901,0x7126,0x80F6,0x4EA4,0x90CA,0x6D47,0x9A84,0x5A07,/* 0xB8-0xBF */ + 0x56BC,0x6405,0x94F0,0x77EB,0x4FA5,0x811A,0x72E1,0x89D2,/* 0xC0-0xC7 */ + 0x997A,0x7F34,0x7EDE,0x527F,0x6559,0x9175,0x8F7F,0x8F83,/* 0xC8-0xCF */ + 0x53EB,0x7A96,0x63ED,0x63A5,0x7686,0x79F8,0x8857,0x9636,/* 0xD0-0xD7 */ + 0x622A,0x52AB,0x8282,0x6854,0x6770,0x6377,0x776B,0x7AED,/* 0xD8-0xDF */ + 0x6D01,0x7ED3,0x89E3,0x59D0,0x6212,0x85C9,0x82A5,0x754C,/* 0xE0-0xE7 */ + 
0x501F,0x4ECB,0x75A5,0x8BEB,0x5C4A,0x5DFE,0x7B4B,0x65A4,/* 0xE8-0xEF */ + 0x91D1,0x4ECA,0x6D25,0x895F,0x7D27,0x9526,0x4EC5,0x8C28,/* 0xF0-0xF7 */ + 0x8FDB,0x9773,0x664B,0x7981,0x8FD1,0x70EC,0x6D78,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D99,0x7D9A,0x7D9B,0x7D9C,0x7D9D,0x7D9E,0x7D9F,0x7DA0,/* 0x40-0x47 */ + 0x7DA1,0x7DA2,0x7DA3,0x7DA4,0x7DA5,0x7DA7,0x7DA8,0x7DA9,/* 0x48-0x4F */ + 0x7DAA,0x7DAB,0x7DAC,0x7DAD,0x7DAF,0x7DB0,0x7DB1,0x7DB2,/* 0x50-0x57 */ + 0x7DB3,0x7DB4,0x7DB5,0x7DB6,0x7DB7,0x7DB8,0x7DB9,0x7DBA,/* 0x58-0x5F */ + 0x7DBB,0x7DBC,0x7DBD,0x7DBE,0x7DBF,0x7DC0,0x7DC1,0x7DC2,/* 0x60-0x67 */ + 0x7DC3,0x7DC4,0x7DC5,0x7DC6,0x7DC7,0x7DC8,0x7DC9,0x7DCA,/* 0x68-0x6F */ + 0x7DCB,0x7DCC,0x7DCD,0x7DCE,0x7DCF,0x7DD0,0x7DD1,0x7DD2,/* 0x70-0x77 */ + 0x7DD3,0x7DD4,0x7DD5,0x7DD6,0x7DD7,0x7DD8,0x7DD9,0x0000,/* 0x78-0x7F */ + + 0x7DDA,0x7DDB,0x7DDC,0x7DDD,0x7DDE,0x7DDF,0x7DE0,0x7DE1,/* 0x80-0x87 */ + 0x7DE2,0x7DE3,0x7DE4,0x7DE5,0x7DE6,0x7DE7,0x7DE8,0x7DE9,/* 0x88-0x8F */ + 0x7DEA,0x7DEB,0x7DEC,0x7DED,0x7DEE,0x7DEF,0x7DF0,0x7DF1,/* 0x90-0x97 */ + 0x7DF2,0x7DF3,0x7DF4,0x7DF5,0x7DF6,0x7DF7,0x7DF8,0x7DF9,/* 0x98-0x9F */ + 0x7DFA,0x5C3D,0x52B2,0x8346,0x5162,0x830E,0x775B,0x6676,/* 0xA0-0xA7 */ + 0x9CB8,0x4EAC,0x60CA,0x7CBE,0x7CB3,0x7ECF,0x4E95,0x8B66,/* 0xA8-0xAF */ + 0x666F,0x9888,0x9759,0x5883,0x656C,0x955C,0x5F84,0x75C9,/* 0xB0-0xB7 */ + 0x9756,0x7ADF,0x7ADE,0x51C0,0x70AF,0x7A98,0x63EA,0x7A76,/* 0xB8-0xBF */ + 0x7EA0,0x7396,0x97ED,0x4E45,0x7078,0x4E5D,0x9152,0x53A9,/* 0xC0-0xC7 */ + 0x6551,0x65E7,0x81FC,0x8205,0x548E,0x5C31,0x759A,0x97A0,/* 0xC8-0xCF */ + 0x62D8,0x72D9,0x75BD,0x5C45,0x9A79,0x83CA,0x5C40,0x5480,/* 0xD0-0xD7 */ + 0x77E9,0x4E3E,0x6CAE,0x805A,0x62D2,0x636E,0x5DE8,0x5177,/* 0xD8-0xDF */ + 0x8DDD,0x8E1E,0x952F,0x4FF1,0x53E5,0x60E7,0x70AC,0x5267,/* 0xE0-0xE7 */ + 0x6350,0x9E43,0x5A1F,0x5026,0x7737,0x5377,0x7EE2,0x6485,/* 0xE8-0xEF */ + 0x652B,0x6289,0x6398,0x5014,0x7235,0x89C9,0x51B3,0x8BC0,/* 0xF0-0xF7 */ + 0x7EDD,0x5747,0x83CC,0x94A7,0x519B,0x541B,0x5CFB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7DFB,0x7DFC,0x7DFD,0x7DFE,0x7DFF,0x7E00,0x7E01,0x7E02,/* 0x40-0x47 */ + 0x7E03,0x7E04,0x7E05,0x7E06,0x7E07,0x7E08,0x7E09,0x7E0A,/* 0x48-0x4F */ + 0x7E0B,0x7E0C,0x7E0D,0x7E0E,0x7E0F,0x7E10,0x7E11,0x7E12,/* 0x50-0x57 */ + 0x7E13,0x7E14,0x7E15,0x7E16,0x7E17,0x7E18,0x7E19,0x7E1A,/* 
0x58-0x5F */ + 0x7E1B,0x7E1C,0x7E1D,0x7E1E,0x7E1F,0x7E20,0x7E21,0x7E22,/* 0x60-0x67 */ + 0x7E23,0x7E24,0x7E25,0x7E26,0x7E27,0x7E28,0x7E29,0x7E2A,/* 0x68-0x6F */ + 0x7E2B,0x7E2C,0x7E2D,0x7E2E,0x7E2F,0x7E30,0x7E31,0x7E32,/* 0x70-0x77 */ + 0x7E33,0x7E34,0x7E35,0x7E36,0x7E37,0x7E38,0x7E39,0x0000,/* 0x78-0x7F */ + + 0x7E3A,0x7E3C,0x7E3D,0x7E3E,0x7E3F,0x7E40,0x7E42,0x7E43,/* 0x80-0x87 */ + 0x7E44,0x7E45,0x7E46,0x7E48,0x7E49,0x7E4A,0x7E4B,0x7E4C,/* 0x88-0x8F */ + 0x7E4D,0x7E4E,0x7E4F,0x7E50,0x7E51,0x7E52,0x7E53,0x7E54,/* 0x90-0x97 */ + 0x7E55,0x7E56,0x7E57,0x7E58,0x7E59,0x7E5A,0x7E5B,0x7E5C,/* 0x98-0x9F */ + 0x7E5D,0x4FCA,0x7AE3,0x6D5A,0x90E1,0x9A8F,0x5580,0x5496,/* 0xA0-0xA7 */ + 0x5361,0x54AF,0x5F00,0x63E9,0x6977,0x51EF,0x6168,0x520A,/* 0xA8-0xAF */ + 0x582A,0x52D8,0x574E,0x780D,0x770B,0x5EB7,0x6177,0x7CE0,/* 0xB0-0xB7 */ + 0x625B,0x6297,0x4EA2,0x7095,0x8003,0x62F7,0x70E4,0x9760,/* 0xB8-0xBF */ + 0x5777,0x82DB,0x67EF,0x68F5,0x78D5,0x9897,0x79D1,0x58F3,/* 0xC0-0xC7 */ + 0x54B3,0x53EF,0x6E34,0x514B,0x523B,0x5BA2,0x8BFE,0x80AF,/* 0xC8-0xCF */ + 0x5543,0x57A6,0x6073,0x5751,0x542D,0x7A7A,0x6050,0x5B54,/* 0xD0-0xD7 */ + 0x63A7,0x62A0,0x53E3,0x6263,0x5BC7,0x67AF,0x54ED,0x7A9F,/* 0xD8-0xDF */ + 0x82E6,0x9177,0x5E93,0x88E4,0x5938,0x57AE,0x630E,0x8DE8,/* 0xE0-0xE7 */ + 0x80EF,0x5757,0x7B77,0x4FA9,0x5FEB,0x5BBD,0x6B3E,0x5321,/* 0xE8-0xEF */ + 0x7B50,0x72C2,0x6846,0x77FF,0x7736,0x65F7,0x51B5,0x4E8F,/* 0xF0-0xF7 */ + 0x76D4,0x5CBF,0x7AA5,0x8475,0x594E,0x9B41,0x5080,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E5E,0x7E5F,0x7E60,0x7E61,0x7E62,0x7E63,0x7E64,0x7E65,/* 0x40-0x47 */ + 0x7E66,0x7E67,0x7E68,0x7E69,0x7E6A,0x7E6B,0x7E6C,0x7E6D,/* 0x48-0x4F */ + 0x7E6E,0x7E6F,0x7E70,0x7E71,0x7E72,0x7E73,0x7E74,0x7E75,/* 0x50-0x57 */ + 0x7E76,0x7E77,0x7E78,0x7E79,0x7E7A,0x7E7B,0x7E7C,0x7E7D,/* 0x58-0x5F */ + 0x7E7E,0x7E7F,0x7E80,0x7E81,0x7E83,0x7E84,0x7E85,0x7E86,/* 0x60-0x67 */ + 0x7E87,0x7E88,0x7E89,0x7E8A,0x7E8B,0x7E8C,0x7E8D,0x7E8E,/* 0x68-0x6F */ + 0x7E8F,0x7E90,0x7E91,0x7E92,0x7E93,0x7E94,0x7E95,0x7E96,/* 0x70-0x77 */ + 0x7E97,0x7E98,0x7E99,0x7E9A,0x7E9C,0x7E9D,0x7E9E,0x0000,/* 0x78-0x7F */ + + 0x7EAE,0x7EB4,0x7EBB,0x7EBC,0x7ED6,0x7EE4,0x7EEC,0x7EF9,/* 0x80-0x87 */ + 0x7F0A,0x7F10,0x7F1E,0x7F37,0x7F39,0x7F3B,0x7F3C,0x7F3D,/* 0x88-0x8F */ + 0x7F3E,0x7F3F,0x7F40,0x7F41,0x7F43,0x7F46,0x7F47,0x7F48,/* 0x90-0x97 */ + 0x7F49,0x7F4A,0x7F4B,0x7F4C,0x7F4D,0x7F4E,0x7F4F,0x7F52,/* 0x98-0x9F */ + 0x7F53,0x9988,0x6127,0x6E83,0x5764,0x6606,0x6346,0x56F0,/* 0xA0-0xA7 */ + 0x62EC,0x6269,0x5ED3,0x9614,0x5783,0x62C9,0x5587,0x8721,/* 0xA8-0xAF */ + 0x814A,0x8FA3,0x5566,0x83B1,0x6765,0x8D56,0x84DD,0x5A6A,/* 0xB0-0xB7 */ + 0x680F,0x62E6,0x7BEE,0x9611,0x5170,0x6F9C,0x8C30,0x63FD,/* 0xB8-0xBF */ + 0x89C8,0x61D2,0x7F06,0x70C2,0x6EE5,0x7405,0x6994,0x72FC,/* 0xC0-0xC7 */ + 0x5ECA,0x90CE,0x6717,0x6D6A,0x635E,0x52B3,0x7262,0x8001,/* 0xC8-0xCF */ + 0x4F6C,0x59E5,0x916A,0x70D9,0x6D9D,0x52D2,0x4E50,0x96F7,/* 0xD0-0xD7 */ + 
0x956D,0x857E,0x78CA,0x7D2F,0x5121,0x5792,0x64C2,0x808B,/* 0xD8-0xDF */ + 0x7C7B,0x6CEA,0x68F1,0x695E,0x51B7,0x5398,0x68A8,0x7281,/* 0xE0-0xE7 */ + 0x9ECE,0x7BF1,0x72F8,0x79BB,0x6F13,0x7406,0x674E,0x91CC,/* 0xE8-0xEF */ + 0x9CA4,0x793C,0x8389,0x8354,0x540F,0x6817,0x4E3D,0x5389,/* 0xF0-0xF7 */ + 0x52B1,0x783E,0x5386,0x5229,0x5088,0x4F8B,0x4FD0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F56,0x7F59,0x7F5B,0x7F5C,0x7F5D,0x7F5E,0x7F60,0x7F63,/* 0x40-0x47 */ + 0x7F64,0x7F65,0x7F66,0x7F67,0x7F6B,0x7F6C,0x7F6D,0x7F6F,/* 0x48-0x4F */ + 0x7F70,0x7F73,0x7F75,0x7F76,0x7F77,0x7F78,0x7F7A,0x7F7B,/* 0x50-0x57 */ + 0x7F7C,0x7F7D,0x7F7F,0x7F80,0x7F82,0x7F83,0x7F84,0x7F85,/* 0x58-0x5F */ + 0x7F86,0x7F87,0x7F88,0x7F89,0x7F8B,0x7F8D,0x7F8F,0x7F90,/* 0x60-0x67 */ + 0x7F91,0x7F92,0x7F93,0x7F95,0x7F96,0x7F97,0x7F98,0x7F99,/* 0x68-0x6F */ + 0x7F9B,0x7F9C,0x7FA0,0x7FA2,0x7FA3,0x7FA5,0x7FA6,0x7FA8,/* 0x70-0x77 */ + 0x7FA9,0x7FAA,0x7FAB,0x7FAC,0x7FAD,0x7FAE,0x7FB1,0x0000,/* 0x78-0x7F */ + + 0x7FB3,0x7FB4,0x7FB5,0x7FB6,0x7FB7,0x7FBA,0x7FBB,0x7FBE,/* 0x80-0x87 */ + 0x7FC0,0x7FC2,0x7FC3,0x7FC4,0x7FC6,0x7FC7,0x7FC8,0x7FC9,/* 0x88-0x8F */ + 0x7FCB,0x7FCD,0x7FCF,0x7FD0,0x7FD1,0x7FD2,0x7FD3,0x7FD6,/* 0x90-0x97 */ + 0x7FD7,0x7FD9,0x7FDA,0x7FDB,0x7FDC,0x7FDD,0x7FDE,0x7FE2,/* 0x98-0x9F */ + 0x7FE3,0x75E2,0x7ACB,0x7C92,0x6CA5,0x96B6,0x529B,0x7483,/* 0xA0-0xA7 */ + 0x54E9,0x4FE9,0x8054,0x83B2,0x8FDE,0x9570,0x5EC9,0x601C,/* 0xA8-0xAF */ + 0x6D9F,0x5E18,0x655B,0x8138,0x94FE,0x604B,0x70BC,0x7EC3,/* 0xB0-0xB7 */ + 0x7CAE,0x51C9,0x6881,0x7CB1,0x826F,0x4E24,0x8F86,0x91CF,/* 0xB8-0xBF */ + 0x667E,0x4EAE,0x8C05,0x64A9,0x804A,0x50DA,0x7597,0x71CE,/* 0xC0-0xC7 */ + 0x5BE5,0x8FBD,0x6F66,0x4E86,0x6482,0x9563,0x5ED6,0x6599,/* 0xC8-0xCF */ + 0x5217,0x88C2,0x70C8,0x52A3,0x730E,0x7433,0x6797,0x78F7,/* 0xD0-0xD7 */ + 0x9716,0x4E34,0x90BB,0x9CDE,0x6DCB,0x51DB,0x8D41,0x541D,/* 0xD8-0xDF */ + 0x62CE,0x73B2,0x83F1,0x96F6,0x9F84,0x94C3,0x4F36,0x7F9A,/* 0xE0-0xE7 */ + 0x51CC,0x7075,0x9675,0x5CAD,0x9886,0x53E6,0x4EE4,0x6E9C,/* 0xE8-0xEF */ + 0x7409,0x69B4,0x786B,0x998F,0x7559,0x5218,0x7624,0x6D41,/* 0xF0-0xF7 */ + 0x67F3,0x516D,0x9F99,0x804B,0x5499,0x7B3C,0x7ABF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7FE4,0x7FE7,0x7FE8,0x7FEA,0x7FEB,0x7FEC,0x7FED,0x7FEF,/* 0x40-0x47 */ + 0x7FF2,0x7FF4,0x7FF5,0x7FF6,0x7FF7,0x7FF8,0x7FF9,0x7FFA,/* 
0x48-0x4F */ + 0x7FFD,0x7FFE,0x7FFF,0x8002,0x8007,0x8008,0x8009,0x800A,/* 0x50-0x57 */ + 0x800E,0x800F,0x8011,0x8013,0x801A,0x801B,0x801D,0x801E,/* 0x58-0x5F */ + 0x801F,0x8021,0x8023,0x8024,0x802B,0x802C,0x802D,0x802E,/* 0x60-0x67 */ + 0x802F,0x8030,0x8032,0x8034,0x8039,0x803A,0x803C,0x803E,/* 0x68-0x6F */ + 0x8040,0x8041,0x8044,0x8045,0x8047,0x8048,0x8049,0x804E,/* 0x70-0x77 */ + 0x804F,0x8050,0x8051,0x8053,0x8055,0x8056,0x8057,0x0000,/* 0x78-0x7F */ + + 0x8059,0x805B,0x805C,0x805D,0x805E,0x805F,0x8060,0x8061,/* 0x80-0x87 */ + 0x8062,0x8063,0x8064,0x8065,0x8066,0x8067,0x8068,0x806B,/* 0x88-0x8F */ + 0x806C,0x806D,0x806E,0x806F,0x8070,0x8072,0x8073,0x8074,/* 0x90-0x97 */ + 0x8075,0x8076,0x8077,0x8078,0x8079,0x807A,0x807B,0x807C,/* 0x98-0x9F */ + 0x807D,0x9686,0x5784,0x62E2,0x9647,0x697C,0x5A04,0x6402,/* 0xA0-0xA7 */ + 0x7BD3,0x6F0F,0x964B,0x82A6,0x5362,0x9885,0x5E90,0x7089,/* 0xA8-0xAF */ + 0x63B3,0x5364,0x864F,0x9C81,0x9E93,0x788C,0x9732,0x8DEF,/* 0xB0-0xB7 */ + 0x8D42,0x9E7F,0x6F5E,0x7984,0x5F55,0x9646,0x622E,0x9A74,/* 0xB8-0xBF */ + 0x5415,0x94DD,0x4FA3,0x65C5,0x5C65,0x5C61,0x7F15,0x8651,/* 0xC0-0xC7 */ + 0x6C2F,0x5F8B,0x7387,0x6EE4,0x7EFF,0x5CE6,0x631B,0x5B6A,/* 0xC8-0xCF */ + 0x6EE6,0x5375,0x4E71,0x63A0,0x7565,0x62A1,0x8F6E,0x4F26,/* 0xD0-0xD7 */ + 0x4ED1,0x6CA6,0x7EB6,0x8BBA,0x841D,0x87BA,0x7F57,0x903B,/* 0xD8-0xDF */ + 0x9523,0x7BA9,0x9AA1,0x88F8,0x843D,0x6D1B,0x9A86,0x7EDC,/* 0xE0-0xE7 */ + 0x5988,0x9EBB,0x739B,0x7801,0x8682,0x9A6C,0x9A82,0x561B,/* 0xE8-0xEF */ + 0x5417,0x57CB,0x4E70,0x9EA6,0x5356,0x8FC8,0x8109,0x7792,/* 0xF0-0xF7 */ + 0x9992,0x86EE,0x6EE1,0x8513,0x66FC,0x6162,0x6F2B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x807E,0x8081,0x8082,0x8085,0x8088,0x808A,0x808D,0x808E,/* 0x40-0x47 */ + 0x808F,0x8090,0x8091,0x8092,0x8094,0x8095,0x8097,0x8099,/* 0x48-0x4F */ + 0x809E,0x80A3,0x80A6,0x80A7,0x80A8,0x80AC,0x80B0,0x80B3,/* 0x50-0x57 */ + 0x80B5,0x80B6,0x80B8,0x80B9,0x80BB,0x80C5,0x80C7,0x80C8,/* 0x58-0x5F */ + 0x80C9,0x80CA,0x80CB,0x80CF,0x80D0,0x80D1,0x80D2,0x80D3,/* 0x60-0x67 */ + 0x80D4,0x80D5,0x80D8,0x80DF,0x80E0,0x80E2,0x80E3,0x80E6,/* 0x68-0x6F */ + 0x80EE,0x80F5,0x80F7,0x80F9,0x80FB,0x80FE,0x80FF,0x8100,/* 0x70-0x77 */ + 0x8101,0x8103,0x8104,0x8105,0x8107,0x8108,0x810B,0x0000,/* 0x78-0x7F */ + + 0x810C,0x8115,0x8117,0x8119,0x811B,0x811C,0x811D,0x811F,/* 0x80-0x87 */ + 0x8120,0x8121,0x8122,0x8123,0x8124,0x8125,0x8126,0x8127,/* 0x88-0x8F */ + 0x8128,0x8129,0x812A,0x812B,0x812D,0x812E,0x8130,0x8133,/* 0x90-0x97 */ + 0x8134,0x8135,0x8137,0x8139,0x813A,0x813B,0x813C,0x813D,/* 0x98-0x9F */ + 0x813F,0x8C29,0x8292,0x832B,0x76F2,0x6C13,0x5FD9,0x83BD,/* 0xA0-0xA7 */ + 0x732B,0x8305,0x951A,0x6BDB,0x77DB,0x94C6,0x536F,0x8302,/* 0xA8-0xAF */ + 0x5192,0x5E3D,0x8C8C,0x8D38,0x4E48,0x73AB,0x679A,0x6885,/* 0xB0-0xB7 */ + 0x9176,0x9709,0x7164,0x6CA1,0x7709,0x5A92,0x9541,0x6BCF,/* 0xB8-0xBF */ + 0x7F8E,0x6627,0x5BD0,0x59B9,0x5A9A,0x95E8,0x95F7,0x4EEC,/* 0xC0-0xC7 */ + 
0x840C,0x8499,0x6AAC,0x76DF,0x9530,0x731B,0x68A6,0x5B5F,/* 0xC8-0xCF */ + 0x772F,0x919A,0x9761,0x7CDC,0x8FF7,0x8C1C,0x5F25,0x7C73,/* 0xD0-0xD7 */ + 0x79D8,0x89C5,0x6CCC,0x871C,0x5BC6,0x5E42,0x68C9,0x7720,/* 0xD8-0xDF */ + 0x7EF5,0x5195,0x514D,0x52C9,0x5A29,0x7F05,0x9762,0x82D7,/* 0xE0-0xE7 */ + 0x63CF,0x7784,0x85D0,0x79D2,0x6E3A,0x5E99,0x5999,0x8511,/* 0xE8-0xEF */ + 0x706D,0x6C11,0x62BF,0x76BF,0x654F,0x60AF,0x95FD,0x660E,/* 0xF0-0xF7 */ + 0x879F,0x9E23,0x94ED,0x540D,0x547D,0x8C2C,0x6478,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8140,0x8141,0x8142,0x8143,0x8144,0x8145,0x8147,0x8149,/* 0x40-0x47 */ + 0x814D,0x814E,0x814F,0x8152,0x8156,0x8157,0x8158,0x815B,/* 0x48-0x4F */ + 0x815C,0x815D,0x815E,0x815F,0x8161,0x8162,0x8163,0x8164,/* 0x50-0x57 */ + 0x8166,0x8168,0x816A,0x816B,0x816C,0x816F,0x8172,0x8173,/* 0x58-0x5F */ + 0x8175,0x8176,0x8177,0x8178,0x8181,0x8183,0x8184,0x8185,/* 0x60-0x67 */ + 0x8186,0x8187,0x8189,0x818B,0x818C,0x818D,0x818E,0x8190,/* 0x68-0x6F */ + 0x8192,0x8193,0x8194,0x8195,0x8196,0x8197,0x8199,0x819A,/* 0x70-0x77 */ + 0x819E,0x819F,0x81A0,0x81A1,0x81A2,0x81A4,0x81A5,0x0000,/* 0x78-0x7F */ + + 0x81A7,0x81A9,0x81AB,0x81AC,0x81AD,0x81AE,0x81AF,0x81B0,/* 0x80-0x87 */ + 0x81B1,0x81B2,0x81B4,0x81B5,0x81B6,0x81B7,0x81B8,0x81B9,/* 0x88-0x8F */ + 0x81BC,0x81BD,0x81BE,0x81BF,0x81C4,0x81C5,0x81C7,0x81C8,/* 0x90-0x97 */ + 0x81C9,0x81CB,0x81CD,0x81CE,0x81CF,0x81D0,0x81D1,0x81D2,/* 0x98-0x9F */ + 0x81D3,0x6479,0x8611,0x6A21,0x819C,0x78E8,0x6469,0x9B54,/* 0xA0-0xA7 */ + 0x62B9,0x672B,0x83AB,0x58A8,0x9ED8,0x6CAB,0x6F20,0x5BDE,/* 0xA8-0xAF */ + 0x964C,0x8C0B,0x725F,0x67D0,0x62C7,0x7261,0x4EA9,0x59C6,/* 0xB0-0xB7 */ + 0x6BCD,0x5893,0x66AE,0x5E55,0x52DF,0x6155,0x6728,0x76EE,/* 0xB8-0xBF */ + 0x7766,0x7267,0x7A46,0x62FF,0x54EA,0x5450,0x94A0,0x90A3,/* 0xC0-0xC7 */ + 0x5A1C,0x7EB3,0x6C16,0x4E43,0x5976,0x8010,0x5948,0x5357,/* 0xC8-0xCF */ + 0x7537,0x96BE,0x56CA,0x6320,0x8111,0x607C,0x95F9,0x6DD6,/* 0xD0-0xD7 */ + 0x5462,0x9981,0x5185,0x5AE9,0x80FD,0x59AE,0x9713,0x502A,/* 0xD8-0xDF */ + 0x6CE5,0x5C3C,0x62DF,0x4F60,0x533F,0x817B,0x9006,0x6EBA,/* 0xE0-0xE7 */ + 0x852B,0x62C8,0x5E74,0x78BE,0x64B5,0x637B,0x5FF5,0x5A18,/* 0xE8-0xEF */ + 0x917F,0x9E1F,0x5C3F,0x634F,0x8042,0x5B7D,0x556E,0x954A,/* 0xF0-0xF7 */ + 0x954D,0x6D85,0x60A8,0x67E0,0x72DE,0x51DD,0x5B81,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x81D4,0x81D5,0x81D6,0x81D7,0x81D8,0x81D9,0x81DA,0x81DB,/* 0x40-0x47 */ + 0x81DC,0x81DD,0x81DE,0x81DF,0x81E0,0x81E1,0x81E2,0x81E4,/* 0x48-0x4F */ + 0x81E5,0x81E6,0x81E8,0x81E9,0x81EB,0x81EE,0x81EF,0x81F0,/* 0x50-0x57 */ + 0x81F1,0x81F2,0x81F5,0x81F6,0x81F7,0x81F8,0x81F9,0x81FA,/* 0x58-0x5F */ + 0x81FD,0x81FF,0x8203,0x8207,0x8208,0x8209,0x820A,0x820B,/* 0x60-0x67 */ + 0x820E,0x820F,0x8211,0x8213,0x8215,0x8216,0x8217,0x8218,/* 0x68-0x6F */ + 0x8219,0x821A,0x821D,0x8220,0x8224,0x8225,0x8226,0x8227,/* 0x70-0x77 */ + 0x8229,0x822E,0x8232,0x823A,0x823C,0x823D,0x823F,0x0000,/* 0x78-0x7F */ + + 0x8240,0x8241,0x8242,0x8243,0x8245,0x8246,0x8248,0x824A,/* 0x80-0x87 */ + 0x824C,0x824D,0x824E,0x8250,0x8251,0x8252,0x8253,0x8254,/* 0x88-0x8F */ + 0x8255,0x8256,0x8257,0x8259,0x825B,0x825C,0x825D,0x825E,/* 0x90-0x97 */ + 0x8260,0x8261,0x8262,0x8263,0x8264,0x8265,0x8266,0x8267,/* 0x98-0x9F */ + 0x8269,0x62E7,0x6CDE,0x725B,0x626D,0x94AE,0x7EBD,0x8113,/* 0xA0-0xA7 */ + 0x6D53,0x519C,0x5F04,0x5974,0x52AA,0x6012,0x5973,0x6696,/* 0xA8-0xAF */ + 0x8650,0x759F,0x632A,0x61E6,0x7CEF,0x8BFA,0x54E6,0x6B27,/* 0xB0-0xB7 */ + 0x9E25,0x6BB4,0x85D5,0x5455,0x5076,0x6CA4,0x556A,0x8DB4,/* 0xB8-0xBF */ + 0x722C,0x5E15,0x6015,0x7436,0x62CD,0x6392,0x724C,0x5F98,/* 0xC0-0xC7 */ + 0x6E43,0x6D3E,0x6500,0x6F58,0x76D8,0x78D0,0x76FC,0x7554,/* 0xC8-0xCF */ + 0x5224,0x53DB,0x4E53,0x5E9E,0x65C1,0x802A,0x80D6,0x629B,/* 0xD0-0xD7 */ + 0x5486,0x5228,0x70AE,0x888D,0x8DD1,0x6CE1,0x5478,0x80DA,/* 0xD8-0xDF */ + 0x57F9,0x88F4,0x8D54,0x966A,0x914D,0x4F69,0x6C9B,0x55B7,/* 0xE0-0xE7 */ + 0x76C6,0x7830,0x62A8,0x70F9,0x6F8E,0x5F6D,0x84EC,0x68DA,/* 0xE8-0xEF */ + 0x787C,0x7BF7,0x81A8,0x670B,0x9E4F,0x6367,0x78B0,0x576F,/* 0xF0-0xF7 */ + 0x7812,0x9739,0x6279,0x62AB,0x5288,0x7435,0x6BD7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x826A,0x826B,0x826C,0x826D,0x8271,0x8275,0x8276,0x8277,/* 0x40-0x47 */ + 0x8278,0x827B,0x827C,0x8280,0x8281,0x8283,0x8285,0x8286,/* 0x48-0x4F */ + 0x8287,0x8289,0x828C,0x8290,0x8293,0x8294,0x8295,0x8296,/* 0x50-0x57 */ + 0x829A,0x829B,0x829E,0x82A0,0x82A2,0x82A3,0x82A7,0x82B2,/* 0x58-0x5F */ + 0x82B5,0x82B6,0x82BA,0x82BB,0x82BC,0x82BF,0x82C0,0x82C2,/* 0x60-0x67 */ + 0x82C3,0x82C5,0x82C6,0x82C9,0x82D0,0x82D6,0x82D9,0x82DA,/* 0x68-0x6F */ + 0x82DD,0x82E2,0x82E7,0x82E8,0x82E9,0x82EA,0x82EC,0x82ED,/* 0x70-0x77 */ + 0x82EE,0x82F0,0x82F2,0x82F3,0x82F5,0x82F6,0x82F8,0x0000,/* 0x78-0x7F */ + + 0x82FA,0x82FC,0x82FD,0x82FE,0x82FF,0x8300,0x830A,0x830B,/* 0x80-0x87 */ + 0x830D,0x8310,0x8312,0x8313,0x8316,0x8318,0x8319,0x831D,/* 0x88-0x8F */ + 0x831E,0x831F,0x8320,0x8321,0x8322,0x8323,0x8324,0x8325,/* 0x90-0x97 */ + 0x8326,0x8329,0x832A,0x832E,0x8330,0x8332,0x8337,0x833B,/* 0x98-0x9F */ + 0x833D,0x5564,0x813E,0x75B2,0x76AE,0x5339,0x75DE,0x50FB,/* 0xA0-0xA7 */ + 0x5C41,0x8B6C,0x7BC7,0x504F,0x7247,0x9A97,0x98D8,0x6F02,/* 0xA8-0xAF */ + 0x74E2,0x7968,0x6487,0x77A5,0x62FC,0x9891,0x8D2B,0x54C1,/* 0xB0-0xB7 */ + 
0x8058,0x4E52,0x576A,0x82F9,0x840D,0x5E73,0x51ED,0x74F6,/* 0xB8-0xBF */ + 0x8BC4,0x5C4F,0x5761,0x6CFC,0x9887,0x5A46,0x7834,0x9B44,/* 0xC0-0xC7 */ + 0x8FEB,0x7C95,0x5256,0x6251,0x94FA,0x4EC6,0x8386,0x8461,/* 0xC8-0xCF */ + 0x83E9,0x84B2,0x57D4,0x6734,0x5703,0x666E,0x6D66,0x8C31,/* 0xD0-0xD7 */ + 0x66DD,0x7011,0x671F,0x6B3A,0x6816,0x621A,0x59BB,0x4E03,/* 0xD8-0xDF */ + 0x51C4,0x6F06,0x67D2,0x6C8F,0x5176,0x68CB,0x5947,0x6B67,/* 0xE0-0xE7 */ + 0x7566,0x5D0E,0x8110,0x9F50,0x65D7,0x7948,0x7941,0x9A91,/* 0xE8-0xEF */ + 0x8D77,0x5C82,0x4E5E,0x4F01,0x542F,0x5951,0x780C,0x5668,/* 0xF0-0xF7 */ + 0x6C14,0x8FC4,0x5F03,0x6C7D,0x6CE3,0x8BAB,0x6390,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x833E,0x833F,0x8341,0x8342,0x8344,0x8345,0x8348,0x834A,/* 0x40-0x47 */ + 0x834B,0x834C,0x834D,0x834E,0x8353,0x8355,0x8356,0x8357,/* 0x48-0x4F */ + 0x8358,0x8359,0x835D,0x8362,0x8370,0x8371,0x8372,0x8373,/* 0x50-0x57 */ + 0x8374,0x8375,0x8376,0x8379,0x837A,0x837E,0x837F,0x8380,/* 0x58-0x5F */ + 0x8381,0x8382,0x8383,0x8384,0x8387,0x8388,0x838A,0x838B,/* 0x60-0x67 */ + 0x838C,0x838D,0x838F,0x8390,0x8391,0x8394,0x8395,0x8396,/* 0x68-0x6F */ + 0x8397,0x8399,0x839A,0x839D,0x839F,0x83A1,0x83A2,0x83A3,/* 0x70-0x77 */ + 0x83A4,0x83A5,0x83A6,0x83A7,0x83AC,0x83AD,0x83AE,0x0000,/* 0x78-0x7F */ + + 0x83AF,0x83B5,0x83BB,0x83BE,0x83BF,0x83C2,0x83C3,0x83C4,/* 0x80-0x87 */ + 0x83C6,0x83C8,0x83C9,0x83CB,0x83CD,0x83CE,0x83D0,0x83D1,/* 0x88-0x8F */ + 0x83D2,0x83D3,0x83D5,0x83D7,0x83D9,0x83DA,0x83DB,0x83DE,/* 0x90-0x97 */ + 0x83E2,0x83E3,0x83E4,0x83E6,0x83E7,0x83E8,0x83EB,0x83EC,/* 0x98-0x9F */ + 0x83ED,0x6070,0x6D3D,0x7275,0x6266,0x948E,0x94C5,0x5343,/* 0xA0-0xA7 */ + 0x8FC1,0x7B7E,0x4EDF,0x8C26,0x4E7E,0x9ED4,0x94B1,0x94B3,/* 0xA8-0xAF */ + 0x524D,0x6F5C,0x9063,0x6D45,0x8C34,0x5811,0x5D4C,0x6B20,/* 0xB0-0xB7 */ + 0x6B49,0x67AA,0x545B,0x8154,0x7F8C,0x5899,0x8537,0x5F3A,/* 0xB8-0xBF */ + 0x62A2,0x6A47,0x9539,0x6572,0x6084,0x6865,0x77A7,0x4E54,/* 0xC0-0xC7 */ + 0x4FA8,0x5DE7,0x9798,0x64AC,0x7FD8,0x5CED,0x4FCF,0x7A8D,/* 0xC8-0xCF */ + 0x5207,0x8304,0x4E14,0x602F,0x7A83,0x94A6,0x4FB5,0x4EB2,/* 0xD0-0xD7 */ + 0x79E6,0x7434,0x52E4,0x82B9,0x64D2,0x79BD,0x5BDD,0x6C81,/* 0xD8-0xDF */ + 0x9752,0x8F7B,0x6C22,0x503E,0x537F,0x6E05,0x64CE,0x6674,/* 0xE0-0xE7 */ + 0x6C30,0x60C5,0x9877,0x8BF7,0x5E86,0x743C,0x7A77,0x79CB,/* 0xE8-0xEF */ + 0x4E18,0x90B1,0x7403,0x6C42,0x56DA,0x914B,0x6CC5,0x8D8B,/* 0xF0-0xF7 */ + 0x533A,0x86C6,0x66F2,0x8EAF,0x5C48,0x9A71,0x6E20,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x83EE,0x83EF,0x83F3,0x83F4,0x83F5,0x83F6,0x83F7,0x83FA,/* 0x40-0x47 */ + 0x83FB,0x83FC,0x83FE,0x83FF,0x8400,0x8402,0x8405,0x8407,/* 0x48-0x4F */ + 0x8408,0x8409,0x840A,0x8410,0x8412,0x8413,0x8414,0x8415,/* 0x50-0x57 */ + 0x8416,0x8417,0x8419,0x841A,0x841B,0x841E,0x841F,0x8420,/* 0x58-0x5F */ + 0x8421,0x8422,0x8423,0x8429,0x842A,0x842B,0x842C,0x842D,/* 0x60-0x67 */ + 0x842E,0x842F,0x8430,0x8432,0x8433,0x8434,0x8435,0x8436,/* 0x68-0x6F */ + 0x8437,0x8439,0x843A,0x843B,0x843E,0x843F,0x8440,0x8441,/* 0x70-0x77 */ + 0x8442,0x8443,0x8444,0x8445,0x8447,0x8448,0x8449,0x0000,/* 0x78-0x7F */ + + 0x844A,0x844B,0x844C,0x844D,0x844E,0x844F,0x8450,0x8452,/* 0x80-0x87 */ + 0x8453,0x8454,0x8455,0x8456,0x8458,0x845D,0x845E,0x845F,/* 0x88-0x8F */ + 0x8460,0x8462,0x8464,0x8465,0x8466,0x8467,0x8468,0x846A,/* 0x90-0x97 */ + 0x846E,0x846F,0x8470,0x8472,0x8474,0x8477,0x8479,0x847B,/* 0x98-0x9F */ + 0x847C,0x53D6,0x5A36,0x9F8B,0x8DA3,0x53BB,0x5708,0x98A7,/* 0xA0-0xA7 */ + 0x6743,0x919B,0x6CC9,0x5168,0x75CA,0x62F3,0x72AC,0x5238,/* 0xA8-0xAF */ + 0x529D,0x7F3A,0x7094,0x7638,0x5374,0x9E4A,0x69B7,0x786E,/* 0xB0-0xB7 */ + 0x96C0,0x88D9,0x7FA4,0x7136,0x71C3,0x5189,0x67D3,0x74E4,/* 0xB8-0xBF */ + 0x58E4,0x6518,0x56B7,0x8BA9,0x9976,0x6270,0x7ED5,0x60F9,/* 0xC0-0xC7 */ + 0x70ED,0x58EC,0x4EC1,0x4EBA,0x5FCD,0x97E7,0x4EFB,0x8BA4,/* 0xC8-0xCF */ + 0x5203,0x598A,0x7EAB,0x6254,0x4ECD,0x65E5,0x620E,0x8338,/* 0xD0-0xD7 */ + 0x84C9,0x8363,0x878D,0x7194,0x6EB6,0x5BB9,0x7ED2,0x5197,/* 0xD8-0xDF */ + 0x63C9,0x67D4,0x8089,0x8339,0x8815,0x5112,0x5B7A,0x5982,/* 0xE0-0xE7 */ + 0x8FB1,0x4E73,0x6C5D,0x5165,0x8925,0x8F6F,0x962E,0x854A,/* 0xE8-0xEF */ + 0x745E,0x9510,0x95F0,0x6DA6,0x82E5,0x5F31,0x6492,0x6D12,/* 0xF0-0xF7 */ + 0x8428,0x816E,0x9CC3,0x585E,0x8D5B,0x4E09,0x53C1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x847D,0x847E,0x847F,0x8480,0x8481,0x8483,0x8484,0x8485,/* 0x40-0x47 */ + 0x8486,0x848A,0x848D,0x848F,0x8490,0x8491,0x8492,0x8493,/* 0x48-0x4F */ + 0x8494,0x8495,0x8496,0x8498,0x849A,0x849B,0x849D,0x849E,/* 0x50-0x57 */ + 0x849F,0x84A0,0x84A2,0x84A3,0x84A4,0x84A5,0x84A6,0x84A7,/* 0x58-0x5F */ + 0x84A8,0x84A9,0x84AA,0x84AB,0x84AC,0x84AD,0x84AE,0x84B0,/* 0x60-0x67 */ + 0x84B1,0x84B3,0x84B5,0x84B6,0x84B7,0x84BB,0x84BC,0x84BE,/* 0x68-0x6F */ + 0x84C0,0x84C2,0x84C3,0x84C5,0x84C6,0x84C7,0x84C8,0x84CB,/* 0x70-0x77 */ + 0x84CC,0x84CE,0x84CF,0x84D2,0x84D4,0x84D5,0x84D7,0x0000,/* 0x78-0x7F */ + + 0x84D8,0x84D9,0x84DA,0x84DB,0x84DC,0x84DE,0x84E1,0x84E2,/* 0x80-0x87 */ + 0x84E4,0x84E7,0x84E8,0x84E9,0x84EA,0x84EB,0x84ED,0x84EE,/* 0x88-0x8F */ + 0x84EF,0x84F1,0x84F2,0x84F3,0x84F4,0x84F5,0x84F6,0x84F7,/* 0x90-0x97 */ + 0x84F8,0x84F9,0x84FA,0x84FB,0x84FD,0x84FE,0x8500,0x8501,/* 0x98-0x9F */ + 0x8502,0x4F1E,0x6563,0x6851,0x55D3,0x4E27,0x6414,0x9A9A,/* 0xA0-0xA7 */ + 
0x626B,0x5AC2,0x745F,0x8272,0x6DA9,0x68EE,0x50E7,0x838E,/* 0xA8-0xAF */ + 0x7802,0x6740,0x5239,0x6C99,0x7EB1,0x50BB,0x5565,0x715E,/* 0xB0-0xB7 */ + 0x7B5B,0x6652,0x73CA,0x82EB,0x6749,0x5C71,0x5220,0x717D,/* 0xB8-0xBF */ + 0x886B,0x95EA,0x9655,0x64C5,0x8D61,0x81B3,0x5584,0x6C55,/* 0xC0-0xC7 */ + 0x6247,0x7F2E,0x5892,0x4F24,0x5546,0x8D4F,0x664C,0x4E0A,/* 0xC8-0xCF */ + 0x5C1A,0x88F3,0x68A2,0x634E,0x7A0D,0x70E7,0x828D,0x52FA,/* 0xD0-0xD7 */ + 0x97F6,0x5C11,0x54E8,0x90B5,0x7ECD,0x5962,0x8D4A,0x86C7,/* 0xD8-0xDF */ + 0x820C,0x820D,0x8D66,0x6444,0x5C04,0x6151,0x6D89,0x793E,/* 0xE0-0xE7 */ + 0x8BBE,0x7837,0x7533,0x547B,0x4F38,0x8EAB,0x6DF1,0x5A20,/* 0xE8-0xEF */ + 0x7EC5,0x795E,0x6C88,0x5BA1,0x5A76,0x751A,0x80BE,0x614E,/* 0xF0-0xF7 */ + 0x6E17,0x58F0,0x751F,0x7525,0x7272,0x5347,0x7EF3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8503,0x8504,0x8505,0x8506,0x8507,0x8508,0x8509,0x850A,/* 0x40-0x47 */ + 0x850B,0x850D,0x850E,0x850F,0x8510,0x8512,0x8514,0x8515,/* 0x48-0x4F */ + 0x8516,0x8518,0x8519,0x851B,0x851C,0x851D,0x851E,0x8520,/* 0x50-0x57 */ + 0x8522,0x8523,0x8524,0x8525,0x8526,0x8527,0x8528,0x8529,/* 0x58-0x5F */ + 0x852A,0x852D,0x852E,0x852F,0x8530,0x8531,0x8532,0x8533,/* 0x60-0x67 */ + 0x8534,0x8535,0x8536,0x853E,0x853F,0x8540,0x8541,0x8542,/* 0x68-0x6F */ + 0x8544,0x8545,0x8546,0x8547,0x854B,0x854C,0x854D,0x854E,/* 0x70-0x77 */ + 0x854F,0x8550,0x8551,0x8552,0x8553,0x8554,0x8555,0x0000,/* 0x78-0x7F */ + + 0x8557,0x8558,0x855A,0x855B,0x855C,0x855D,0x855F,0x8560,/* 0x80-0x87 */ + 0x8561,0x8562,0x8563,0x8565,0x8566,0x8567,0x8569,0x856A,/* 0x88-0x8F */ + 0x856B,0x856C,0x856D,0x856E,0x856F,0x8570,0x8571,0x8573,/* 0x90-0x97 */ + 0x8575,0x8576,0x8577,0x8578,0x857C,0x857D,0x857F,0x8580,/* 0x98-0x9F */ + 0x8581,0x7701,0x76DB,0x5269,0x80DC,0x5723,0x5E08,0x5931,/* 0xA0-0xA7 */ + 0x72EE,0x65BD,0x6E7F,0x8BD7,0x5C38,0x8671,0x5341,0x77F3,/* 0xA8-0xAF */ + 0x62FE,0x65F6,0x4EC0,0x98DF,0x8680,0x5B9E,0x8BC6,0x53F2,/* 0xB0-0xB7 */ + 0x77E2,0x4F7F,0x5C4E,0x9A76,0x59CB,0x5F0F,0x793A,0x58EB,/* 0xB8-0xBF */ + 0x4E16,0x67FF,0x4E8B,0x62ED,0x8A93,0x901D,0x52BF,0x662F,/* 0xC0-0xC7 */ + 0x55DC,0x566C,0x9002,0x4ED5,0x4F8D,0x91CA,0x9970,0x6C0F,/* 0xC8-0xCF */ + 0x5E02,0x6043,0x5BA4,0x89C6,0x8BD5,0x6536,0x624B,0x9996,/* 0xD0-0xD7 */ + 0x5B88,0x5BFF,0x6388,0x552E,0x53D7,0x7626,0x517D,0x852C,/* 0xD8-0xDF */ + 0x67A2,0x68B3,0x6B8A,0x6292,0x8F93,0x53D4,0x8212,0x6DD1,/* 0xE0-0xE7 */ + 0x758F,0x4E66,0x8D4E,0x5B70,0x719F,0x85AF,0x6691,0x66D9,/* 0xE8-0xEF */ + 0x7F72,0x8700,0x9ECD,0x9F20,0x5C5E,0x672F,0x8FF0,0x6811,/* 0xF0-0xF7 */ + 0x675F,0x620D,0x7AD6,0x5885,0x5EB6,0x6570,0x6F31,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8582,0x8583,0x8586,0x8588,0x8589,0x858A,0x858B,0x858C,/* 0x40-0x47 */ + 0x858D,0x858E,0x8590,0x8591,0x8592,0x8593,0x8594,0x8595,/* 0x48-0x4F */ + 0x8596,0x8597,0x8598,0x8599,0x859A,0x859D,0x859E,0x859F,/* 0x50-0x57 */ + 0x85A0,0x85A1,0x85A2,0x85A3,0x85A5,0x85A6,0x85A7,0x85A9,/* 0x58-0x5F */ + 0x85AB,0x85AC,0x85AD,0x85B1,0x85B2,0x85B3,0x85B4,0x85B5,/* 0x60-0x67 */ + 0x85B6,0x85B8,0x85BA,0x85BB,0x85BC,0x85BD,0x85BE,0x85BF,/* 0x68-0x6F */ + 0x85C0,0x85C2,0x85C3,0x85C4,0x85C5,0x85C6,0x85C7,0x85C8,/* 0x70-0x77 */ + 0x85CA,0x85CB,0x85CC,0x85CD,0x85CE,0x85D1,0x85D2,0x0000,/* 0x78-0x7F */ + + 0x85D4,0x85D6,0x85D7,0x85D8,0x85D9,0x85DA,0x85DB,0x85DD,/* 0x80-0x87 */ + 0x85DE,0x85DF,0x85E0,0x85E1,0x85E2,0x85E3,0x85E5,0x85E6,/* 0x88-0x8F */ + 0x85E7,0x85E8,0x85EA,0x85EB,0x85EC,0x85ED,0x85EE,0x85EF,/* 0x90-0x97 */ + 0x85F0,0x85F1,0x85F2,0x85F3,0x85F4,0x85F5,0x85F6,0x85F7,/* 0x98-0x9F */ + 0x85F8,0x6055,0x5237,0x800D,0x6454,0x8870,0x7529,0x5E05,/* 0xA0-0xA7 */ + 0x6813,0x62F4,0x971C,0x53CC,0x723D,0x8C01,0x6C34,0x7761,/* 0xA8-0xAF */ + 0x7A0E,0x542E,0x77AC,0x987A,0x821C,0x8BF4,0x7855,0x6714,/* 0xB0-0xB7 */ + 0x70C1,0x65AF,0x6495,0x5636,0x601D,0x79C1,0x53F8,0x4E1D,/* 0xB8-0xBF */ + 0x6B7B,0x8086,0x5BFA,0x55E3,0x56DB,0x4F3A,0x4F3C,0x9972,/* 0xC0-0xC7 */ + 0x5DF3,0x677E,0x8038,0x6002,0x9882,0x9001,0x5B8B,0x8BBC,/* 0xC8-0xCF */ + 0x8BF5,0x641C,0x8258,0x64DE,0x55FD,0x82CF,0x9165,0x4FD7,/* 0xD0-0xD7 */ + 0x7D20,0x901F,0x7C9F,0x50F3,0x5851,0x6EAF,0x5BBF,0x8BC9,/* 0xD8-0xDF */ + 0x8083,0x9178,0x849C,0x7B97,0x867D,0x968B,0x968F,0x7EE5,/* 0xE0-0xE7 */ + 0x9AD3,0x788E,0x5C81,0x7A57,0x9042,0x96A7,0x795F,0x5B59,/* 0xE8-0xEF */ + 0x635F,0x7B0B,0x84D1,0x68AD,0x5506,0x7F29,0x7410,0x7D22,/* 0xF0-0xF7 */ + 0x9501,0x6240,0x584C,0x4ED6,0x5B83,0x5979,0x5854,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x85F9,0x85FA,0x85FC,0x85FD,0x85FE,0x8600,0x8601,0x8602,/* 0x40-0x47 */ + 0x8603,0x8604,0x8606,0x8607,0x8608,0x8609,0x860A,0x860B,/* 0x48-0x4F */ + 0x860C,0x860D,0x860E,0x860F,0x8610,0x8612,0x8613,0x8614,/* 0x50-0x57 */ + 0x8615,0x8617,0x8618,0x8619,0x861A,0x861B,0x861C,0x861D,/* 0x58-0x5F */ + 0x861E,0x861F,0x8620,0x8621,0x8622,0x8623,0x8624,0x8625,/* 0x60-0x67 */ + 0x8626,0x8628,0x862A,0x862B,0x862C,0x862D,0x862E,0x862F,/* 0x68-0x6F */ + 0x8630,0x8631,0x8632,0x8633,0x8634,0x8635,0x8636,0x8637,/* 0x70-0x77 */ + 0x8639,0x863A,0x863B,0x863D,0x863E,0x863F,0x8640,0x0000,/* 0x78-0x7F */ + + 0x8641,0x8642,0x8643,0x8644,0x8645,0x8646,0x8647,0x8648,/* 0x80-0x87 */ + 0x8649,0x864A,0x864B,0x864C,0x8652,0x8653,0x8655,0x8656,/* 0x88-0x8F */ + 0x8657,0x8658,0x8659,0x865B,0x865C,0x865D,0x865F,0x8660,/* 0x90-0x97 */ + 
0x8661,0x8663,0x8664,0x8665,0x8666,0x8667,0x8668,0x8669,/* 0x98-0x9F */ + 0x866A,0x736D,0x631E,0x8E4B,0x8E0F,0x80CE,0x82D4,0x62AC,/* 0xA0-0xA7 */ + 0x53F0,0x6CF0,0x915E,0x592A,0x6001,0x6C70,0x574D,0x644A,/* 0xA8-0xAF */ + 0x8D2A,0x762B,0x6EE9,0x575B,0x6A80,0x75F0,0x6F6D,0x8C2D,/* 0xB0-0xB7 */ + 0x8C08,0x5766,0x6BEF,0x8892,0x78B3,0x63A2,0x53F9,0x70AD,/* 0xB8-0xBF */ + 0x6C64,0x5858,0x642A,0x5802,0x68E0,0x819B,0x5510,0x7CD6,/* 0xC0-0xC7 */ + 0x5018,0x8EBA,0x6DCC,0x8D9F,0x70EB,0x638F,0x6D9B,0x6ED4,/* 0xC8-0xCF */ + 0x7EE6,0x8404,0x6843,0x9003,0x6DD8,0x9676,0x8BA8,0x5957,/* 0xD0-0xD7 */ + 0x7279,0x85E4,0x817E,0x75BC,0x8A8A,0x68AF,0x5254,0x8E22,/* 0xD8-0xDF */ + 0x9511,0x63D0,0x9898,0x8E44,0x557C,0x4F53,0x66FF,0x568F,/* 0xE0-0xE7 */ + 0x60D5,0x6D95,0x5243,0x5C49,0x5929,0x6DFB,0x586B,0x7530,/* 0xE8-0xEF */ + 0x751C,0x606C,0x8214,0x8146,0x6311,0x6761,0x8FE2,0x773A,/* 0xF0-0xF7 */ + 0x8DF3,0x8D34,0x94C1,0x5E16,0x5385,0x542C,0x70C3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x866D,0x866F,0x8670,0x8672,0x8673,0x8674,0x8675,0x8676,/* 0x40-0x47 */ + 0x8677,0x8678,0x8683,0x8684,0x8685,0x8686,0x8687,0x8688,/* 0x48-0x4F */ + 0x8689,0x868E,0x868F,0x8690,0x8691,0x8692,0x8694,0x8696,/* 0x50-0x57 */ + 0x8697,0x8698,0x8699,0x869A,0x869B,0x869E,0x869F,0x86A0,/* 0x58-0x5F */ + 0x86A1,0x86A2,0x86A5,0x86A6,0x86AB,0x86AD,0x86AE,0x86B2,/* 0x60-0x67 */ + 0x86B3,0x86B7,0x86B8,0x86B9,0x86BB,0x86BC,0x86BD,0x86BE,/* 0x68-0x6F */ + 0x86BF,0x86C1,0x86C2,0x86C3,0x86C5,0x86C8,0x86CC,0x86CD,/* 0x70-0x77 */ + 0x86D2,0x86D3,0x86D5,0x86D6,0x86D7,0x86DA,0x86DC,0x0000,/* 0x78-0x7F */ + + 0x86DD,0x86E0,0x86E1,0x86E2,0x86E3,0x86E5,0x86E6,0x86E7,/* 0x80-0x87 */ + 0x86E8,0x86EA,0x86EB,0x86EC,0x86EF,0x86F5,0x86F6,0x86F7,/* 0x88-0x8F */ + 0x86FA,0x86FB,0x86FC,0x86FD,0x86FF,0x8701,0x8704,0x8705,/* 0x90-0x97 */ + 0x8706,0x870B,0x870C,0x870E,0x870F,0x8710,0x8711,0x8714,/* 0x98-0x9F */ + 0x8716,0x6C40,0x5EF7,0x505C,0x4EAD,0x5EAD,0x633A,0x8247,/* 0xA0-0xA7 */ + 0x901A,0x6850,0x916E,0x77B3,0x540C,0x94DC,0x5F64,0x7AE5,/* 0xA8-0xAF */ + 0x6876,0x6345,0x7B52,0x7EDF,0x75DB,0x5077,0x6295,0x5934,/* 0xB0-0xB7 */ + 0x900F,0x51F8,0x79C3,0x7A81,0x56FE,0x5F92,0x9014,0x6D82,/* 0xB8-0xBF */ + 0x5C60,0x571F,0x5410,0x5154,0x6E4D,0x56E2,0x63A8,0x9893,/* 0xC0-0xC7 */ + 0x817F,0x8715,0x892A,0x9000,0x541E,0x5C6F,0x81C0,0x62D6,/* 0xC8-0xCF */ + 0x6258,0x8131,0x9E35,0x9640,0x9A6E,0x9A7C,0x692D,0x59A5,/* 0xD0-0xD7 */ + 0x62D3,0x553E,0x6316,0x54C7,0x86D9,0x6D3C,0x5A03,0x74E6,/* 0xD8-0xDF */ + 0x889C,0x6B6A,0x5916,0x8C4C,0x5F2F,0x6E7E,0x73A9,0x987D,/* 0xE0-0xE7 */ + 0x4E38,0x70F7,0x5B8C,0x7897,0x633D,0x665A,0x7696,0x60CB,/* 0xE8-0xEF */ + 0x5B9B,0x5A49,0x4E07,0x8155,0x6C6A,0x738B,0x4EA1,0x6789,/* 0xF0-0xF7 */ + 0x7F51,0x5F80,0x65FA,0x671B,0x5FD8,0x5984,0x5A01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8719,0x871B,0x871D,0x871F,0x8720,0x8724,0x8726,0x8727,/* 0x40-0x47 */ + 0x8728,0x872A,0x872B,0x872C,0x872D,0x872F,0x8730,0x8732,/* 0x48-0x4F */ + 0x8733,0x8735,0x8736,0x8738,0x8739,0x873A,0x873C,0x873D,/* 0x50-0x57 */ + 0x8740,0x8741,0x8742,0x8743,0x8744,0x8745,0x8746,0x874A,/* 0x58-0x5F */ + 0x874B,0x874D,0x874F,0x8750,0x8751,0x8752,0x8754,0x8755,/* 0x60-0x67 */ + 0x8756,0x8758,0x875A,0x875B,0x875C,0x875D,0x875E,0x875F,/* 0x68-0x6F */ + 0x8761,0x8762,0x8766,0x8767,0x8768,0x8769,0x876A,0x876B,/* 0x70-0x77 */ + 0x876C,0x876D,0x876F,0x8771,0x8772,0x8773,0x8775,0x0000,/* 0x78-0x7F */ + + 0x8777,0x8778,0x8779,0x877A,0x877F,0x8780,0x8781,0x8784,/* 0x80-0x87 */ + 0x8786,0x8787,0x8789,0x878A,0x878C,0x878E,0x878F,0x8790,/* 0x88-0x8F */ + 0x8791,0x8792,0x8794,0x8795,0x8796,0x8798,0x8799,0x879A,/* 0x90-0x97 */ + 0x879B,0x879C,0x879D,0x879E,0x87A0,0x87A1,0x87A2,0x87A3,/* 0x98-0x9F */ + 0x87A4,0x5DCD,0x5FAE,0x5371,0x97E6,0x8FDD,0x6845,0x56F4,/* 0xA0-0xA7 */ + 0x552F,0x60DF,0x4E3A,0x6F4D,0x7EF4,0x82C7,0x840E,0x59D4,/* 0xA8-0xAF */ + 0x4F1F,0x4F2A,0x5C3E,0x7EAC,0x672A,0x851A,0x5473,0x754F,/* 0xB0-0xB7 */ + 0x80C3,0x5582,0x9B4F,0x4F4D,0x6E2D,0x8C13,0x5C09,0x6170,/* 0xB8-0xBF */ + 0x536B,0x761F,0x6E29,0x868A,0x6587,0x95FB,0x7EB9,0x543B,/* 0xC0-0xC7 */ + 0x7A33,0x7D0A,0x95EE,0x55E1,0x7FC1,0x74EE,0x631D,0x8717,/* 0xC8-0xCF */ + 0x6DA1,0x7A9D,0x6211,0x65A1,0x5367,0x63E1,0x6C83,0x5DEB,/* 0xD0-0xD7 */ + 0x545C,0x94A8,0x4E4C,0x6C61,0x8BEC,0x5C4B,0x65E0,0x829C,/* 0xD8-0xDF */ + 0x68A7,0x543E,0x5434,0x6BCB,0x6B66,0x4E94,0x6342,0x5348,/* 0xE0-0xE7 */ + 0x821E,0x4F0D,0x4FAE,0x575E,0x620A,0x96FE,0x6664,0x7269,/* 0xE8-0xEF */ + 0x52FF,0x52A1,0x609F,0x8BEF,0x6614,0x7199,0x6790,0x897F,/* 0xF0-0xF7 */ + 0x7852,0x77FD,0x6670,0x563B,0x5438,0x9521,0x727A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x87A5,0x87A6,0x87A7,0x87A9,0x87AA,0x87AE,0x87B0,0x87B1,/* 0x40-0x47 */ + 0x87B2,0x87B4,0x87B6,0x87B7,0x87B8,0x87B9,0x87BB,0x87BC,/* 0x48-0x4F */ + 0x87BE,0x87BF,0x87C1,0x87C2,0x87C3,0x87C4,0x87C5,0x87C7,/* 0x50-0x57 */ + 0x87C8,0x87C9,0x87CC,0x87CD,0x87CE,0x87CF,0x87D0,0x87D4,/* 0x58-0x5F */ + 0x87D5,0x87D6,0x87D7,0x87D8,0x87D9,0x87DA,0x87DC,0x87DD,/* 0x60-0x67 */ + 0x87DE,0x87DF,0x87E1,0x87E2,0x87E3,0x87E4,0x87E6,0x87E7,/* 0x68-0x6F */ + 0x87E8,0x87E9,0x87EB,0x87EC,0x87ED,0x87EF,0x87F0,0x87F1,/* 0x70-0x77 */ + 0x87F2,0x87F3,0x87F4,0x87F5,0x87F6,0x87F7,0x87F8,0x0000,/* 0x78-0x7F */ + + 0x87FA,0x87FB,0x87FC,0x87FD,0x87FF,0x8800,0x8801,0x8802,/* 0x80-0x87 */ + 
0x8804,0x8805,0x8806,0x8807,0x8808,0x8809,0x880B,0x880C,/* 0x88-0x8F */ + 0x880D,0x880E,0x880F,0x8810,0x8811,0x8812,0x8814,0x8817,/* 0x90-0x97 */ + 0x8818,0x8819,0x881A,0x881C,0x881D,0x881E,0x881F,0x8820,/* 0x98-0x9F */ + 0x8823,0x7A00,0x606F,0x5E0C,0x6089,0x819D,0x5915,0x60DC,/* 0xA0-0xA7 */ + 0x7184,0x70EF,0x6EAA,0x6C50,0x7280,0x6A84,0x88AD,0x5E2D,/* 0xA8-0xAF */ + 0x4E60,0x5AB3,0x559C,0x94E3,0x6D17,0x7CFB,0x9699,0x620F,/* 0xB0-0xB7 */ + 0x7EC6,0x778E,0x867E,0x5323,0x971E,0x8F96,0x6687,0x5CE1,/* 0xB8-0xBF */ + 0x4FA0,0x72ED,0x4E0B,0x53A6,0x590F,0x5413,0x6380,0x9528,/* 0xC0-0xC7 */ + 0x5148,0x4ED9,0x9C9C,0x7EA4,0x54B8,0x8D24,0x8854,0x8237,/* 0xC8-0xCF */ + 0x95F2,0x6D8E,0x5F26,0x5ACC,0x663E,0x9669,0x73B0,0x732E,/* 0xD0-0xD7 */ + 0x53BF,0x817A,0x9985,0x7FA1,0x5BAA,0x9677,0x9650,0x7EBF,/* 0xD8-0xDF */ + 0x76F8,0x53A2,0x9576,0x9999,0x7BB1,0x8944,0x6E58,0x4E61,/* 0xE0-0xE7 */ + 0x7FD4,0x7965,0x8BE6,0x60F3,0x54CD,0x4EAB,0x9879,0x5DF7,/* 0xE8-0xEF */ + 0x6A61,0x50CF,0x5411,0x8C61,0x8427,0x785D,0x9704,0x524A,/* 0xF0-0xF7 */ + 0x54EE,0x56A3,0x9500,0x6D88,0x5BB5,0x6DC6,0x6653,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8824,0x8825,0x8826,0x8827,0x8828,0x8829,0x882A,0x882B,/* 0x40-0x47 */ + 0x882C,0x882D,0x882E,0x882F,0x8830,0x8831,0x8833,0x8834,/* 0x48-0x4F */ + 0x8835,0x8836,0x8837,0x8838,0x883A,0x883B,0x883D,0x883E,/* 0x50-0x57 */ + 0x883F,0x8841,0x8842,0x8843,0x8846,0x8847,0x8848,0x8849,/* 0x58-0x5F */ + 0x884A,0x884B,0x884E,0x884F,0x8850,0x8851,0x8852,0x8853,/* 0x60-0x67 */ + 0x8855,0x8856,0x8858,0x885A,0x885B,0x885C,0x885D,0x885E,/* 0x68-0x6F */ + 0x885F,0x8860,0x8866,0x8867,0x886A,0x886D,0x886F,0x8871,/* 0x70-0x77 */ + 0x8873,0x8874,0x8875,0x8876,0x8878,0x8879,0x887A,0x0000,/* 0x78-0x7F */ + + 0x887B,0x887C,0x8880,0x8883,0x8886,0x8887,0x8889,0x888A,/* 0x80-0x87 */ + 0x888C,0x888E,0x888F,0x8890,0x8891,0x8893,0x8894,0x8895,/* 0x88-0x8F */ + 0x8897,0x8898,0x8899,0x889A,0x889B,0x889D,0x889E,0x889F,/* 0x90-0x97 */ + 0x88A0,0x88A1,0x88A3,0x88A5,0x88A6,0x88A7,0x88A8,0x88A9,/* 0x98-0x9F */ + 0x88AA,0x5C0F,0x5B5D,0x6821,0x8096,0x5578,0x7B11,0x6548,/* 0xA0-0xA7 */ + 0x6954,0x4E9B,0x6B47,0x874E,0x978B,0x534F,0x631F,0x643A,/* 0xA8-0xAF */ + 0x90AA,0x659C,0x80C1,0x8C10,0x5199,0x68B0,0x5378,0x87F9,/* 0xB0-0xB7 */ + 0x61C8,0x6CC4,0x6CFB,0x8C22,0x5C51,0x85AA,0x82AF,0x950C,/* 0xB8-0xBF */ + 0x6B23,0x8F9B,0x65B0,0x5FFB,0x5FC3,0x4FE1,0x8845,0x661F,/* 0xC0-0xC7 */ + 0x8165,0x7329,0x60FA,0x5174,0x5211,0x578B,0x5F62,0x90A2,/* 0xC8-0xCF */ + 0x884C,0x9192,0x5E78,0x674F,0x6027,0x59D3,0x5144,0x51F6,/* 0xD0-0xD7 */ + 0x80F8,0x5308,0x6C79,0x96C4,0x718A,0x4F11,0x4FEE,0x7F9E,/* 0xD8-0xDF */ + 0x673D,0x55C5,0x9508,0x79C0,0x8896,0x7EE3,0x589F,0x620C,/* 0xE0-0xE7 */ + 0x9700,0x865A,0x5618,0x987B,0x5F90,0x8BB8,0x84C4,0x9157,/* 0xE8-0xEF */ + 0x53D9,0x65ED,0x5E8F,0x755C,0x6064,0x7D6E,0x5A7F,0x7EEA,/* 0xF0-0xF7 */ + 0x7EED,0x8F69,0x55A7,0x5BA3,0x60AC,0x65CB,0x7384,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x88AC,0x88AE,0x88AF,0x88B0,0x88B2,0x88B3,0x88B4,0x88B5,/* 0x40-0x47 */ + 0x88B6,0x88B8,0x88B9,0x88BA,0x88BB,0x88BD,0x88BE,0x88BF,/* 0x48-0x4F */ + 0x88C0,0x88C3,0x88C4,0x88C7,0x88C8,0x88CA,0x88CB,0x88CC,/* 0x50-0x57 */ + 0x88CD,0x88CF,0x88D0,0x88D1,0x88D3,0x88D6,0x88D7,0x88DA,/* 0x58-0x5F */ + 0x88DB,0x88DC,0x88DD,0x88DE,0x88E0,0x88E1,0x88E6,0x88E7,/* 0x60-0x67 */ + 0x88E9,0x88EA,0x88EB,0x88EC,0x88ED,0x88EE,0x88EF,0x88F2,/* 0x68-0x6F */ + 0x88F5,0x88F6,0x88F7,0x88FA,0x88FB,0x88FD,0x88FF,0x8900,/* 0x70-0x77 */ + 0x8901,0x8903,0x8904,0x8905,0x8906,0x8907,0x8908,0x0000,/* 0x78-0x7F */ + + 0x8909,0x890B,0x890C,0x890D,0x890E,0x890F,0x8911,0x8914,/* 0x80-0x87 */ + 0x8915,0x8916,0x8917,0x8918,0x891C,0x891D,0x891E,0x891F,/* 0x88-0x8F */ + 0x8920,0x8922,0x8923,0x8924,0x8926,0x8927,0x8928,0x8929,/* 0x90-0x97 */ + 0x892C,0x892D,0x892E,0x892F,0x8931,0x8932,0x8933,0x8935,/* 0x98-0x9F */ + 0x8937,0x9009,0x7663,0x7729,0x7EDA,0x9774,0x859B,0x5B66,/* 0xA0-0xA7 */ + 0x7A74,0x96EA,0x8840,0x52CB,0x718F,0x5FAA,0x65EC,0x8BE2,/* 0xA8-0xAF */ + 0x5BFB,0x9A6F,0x5DE1,0x6B89,0x6C5B,0x8BAD,0x8BAF,0x900A,/* 0xB0-0xB7 */ + 0x8FC5,0x538B,0x62BC,0x9E26,0x9E2D,0x5440,0x4E2B,0x82BD,/* 0xB8-0xBF */ + 0x7259,0x869C,0x5D16,0x8859,0x6DAF,0x96C5,0x54D1,0x4E9A,/* 0xC0-0xC7 */ + 0x8BB6,0x7109,0x54BD,0x9609,0x70DF,0x6DF9,0x76D0,0x4E25,/* 0xC8-0xCF */ + 0x7814,0x8712,0x5CA9,0x5EF6,0x8A00,0x989C,0x960E,0x708E,/* 0xD0-0xD7 */ + 0x6CBF,0x5944,0x63A9,0x773C,0x884D,0x6F14,0x8273,0x5830,/* 0xD8-0xDF */ + 0x71D5,0x538C,0x781A,0x96C1,0x5501,0x5F66,0x7130,0x5BB4,/* 0xE0-0xE7 */ + 0x8C1A,0x9A8C,0x6B83,0x592E,0x9E2F,0x79E7,0x6768,0x626C,/* 0xE8-0xEF */ + 0x4F6F,0x75A1,0x7F8A,0x6D0B,0x9633,0x6C27,0x4EF0,0x75D2,/* 0xF0-0xF7 */ + 0x517B,0x6837,0x6F3E,0x9080,0x8170,0x5996,0x7476,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8938,0x8939,0x893A,0x893B,0x893C,0x893D,0x893E,0x893F,/* 0x40-0x47 */ + 0x8940,0x8942,0x8943,0x8945,0x8946,0x8947,0x8948,0x8949,/* 0x48-0x4F */ + 0x894A,0x894B,0x894C,0x894D,0x894E,0x894F,0x8950,0x8951,/* 0x50-0x57 */ + 0x8952,0x8953,0x8954,0x8955,0x8956,0x8957,0x8958,0x8959,/* 0x58-0x5F */ + 0x895A,0x895B,0x895C,0x895D,0x8960,0x8961,0x8962,0x8963,/* 0x60-0x67 */ + 0x8964,0x8965,0x8967,0x8968,0x8969,0x896A,0x896B,0x896C,/* 0x68-0x6F */ + 0x896D,0x896E,0x896F,0x8970,0x8971,0x8972,0x8973,0x8974,/* 0x70-0x77 */ + 
0x8975,0x8976,0x8977,0x8978,0x8979,0x897A,0x897C,0x0000,/* 0x78-0x7F */ + + 0x897D,0x897E,0x8980,0x8982,0x8984,0x8985,0x8987,0x8988,/* 0x80-0x87 */ + 0x8989,0x898A,0x898B,0x898C,0x898D,0x898E,0x898F,0x8990,/* 0x88-0x8F */ + 0x8991,0x8992,0x8993,0x8994,0x8995,0x8996,0x8997,0x8998,/* 0x90-0x97 */ + 0x8999,0x899A,0x899B,0x899C,0x899D,0x899E,0x899F,0x89A0,/* 0x98-0x9F */ + 0x89A1,0x6447,0x5C27,0x9065,0x7A91,0x8C23,0x59DA,0x54AC,/* 0xA0-0xA7 */ + 0x8200,0x836F,0x8981,0x8000,0x6930,0x564E,0x8036,0x7237,/* 0xA8-0xAF */ + 0x91CE,0x51B6,0x4E5F,0x9875,0x6396,0x4E1A,0x53F6,0x66F3,/* 0xB0-0xB7 */ + 0x814B,0x591C,0x6DB2,0x4E00,0x58F9,0x533B,0x63D6,0x94F1,/* 0xB8-0xBF */ + 0x4F9D,0x4F0A,0x8863,0x9890,0x5937,0x9057,0x79FB,0x4EEA,/* 0xC0-0xC7 */ + 0x80F0,0x7591,0x6C82,0x5B9C,0x59E8,0x5F5D,0x6905,0x8681,/* 0xC8-0xCF */ + 0x501A,0x5DF2,0x4E59,0x77E3,0x4EE5,0x827A,0x6291,0x6613,/* 0xD0-0xD7 */ + 0x9091,0x5C79,0x4EBF,0x5F79,0x81C6,0x9038,0x8084,0x75AB,/* 0xD8-0xDF */ + 0x4EA6,0x88D4,0x610F,0x6BC5,0x5FC6,0x4E49,0x76CA,0x6EA2,/* 0xE0-0xE7 */ + 0x8BE3,0x8BAE,0x8C0A,0x8BD1,0x5F02,0x7FFC,0x7FCC,0x7ECE,/* 0xE8-0xEF */ + 0x8335,0x836B,0x56E0,0x6BB7,0x97F3,0x9634,0x59FB,0x541F,/* 0xF0-0xF7 */ + 0x94F6,0x6DEB,0x5BC5,0x996E,0x5C39,0x5F15,0x9690,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x89A2,0x89A3,0x89A4,0x89A5,0x89A6,0x89A7,0x89A8,0x89A9,/* 0x40-0x47 */ + 0x89AA,0x89AB,0x89AC,0x89AD,0x89AE,0x89AF,0x89B0,0x89B1,/* 0x48-0x4F */ + 0x89B2,0x89B3,0x89B4,0x89B5,0x89B6,0x89B7,0x89B8,0x89B9,/* 0x50-0x57 */ + 0x89BA,0x89BB,0x89BC,0x89BD,0x89BE,0x89BF,0x89C0,0x89C3,/* 0x58-0x5F */ + 0x89CD,0x89D3,0x89D4,0x89D5,0x89D7,0x89D8,0x89D9,0x89DB,/* 0x60-0x67 */ + 0x89DD,0x89DF,0x89E0,0x89E1,0x89E2,0x89E4,0x89E7,0x89E8,/* 0x68-0x6F */ + 0x89E9,0x89EA,0x89EC,0x89ED,0x89EE,0x89F0,0x89F1,0x89F2,/* 0x70-0x77 */ + 0x89F4,0x89F5,0x89F6,0x89F7,0x89F8,0x89F9,0x89FA,0x0000,/* 0x78-0x7F */ + + 0x89FB,0x89FC,0x89FD,0x89FE,0x89FF,0x8A01,0x8A02,0x8A03,/* 0x80-0x87 */ + 0x8A04,0x8A05,0x8A06,0x8A08,0x8A09,0x8A0A,0x8A0B,0x8A0C,/* 0x88-0x8F */ + 0x8A0D,0x8A0E,0x8A0F,0x8A10,0x8A11,0x8A12,0x8A13,0x8A14,/* 0x90-0x97 */ + 0x8A15,0x8A16,0x8A17,0x8A18,0x8A19,0x8A1A,0x8A1B,0x8A1C,/* 0x98-0x9F */ + 0x8A1D,0x5370,0x82F1,0x6A31,0x5A74,0x9E70,0x5E94,0x7F28,/* 0xA0-0xA7 */ + 0x83B9,0x8424,0x8425,0x8367,0x8747,0x8FCE,0x8D62,0x76C8,/* 0xA8-0xAF */ + 0x5F71,0x9896,0x786C,0x6620,0x54DF,0x62E5,0x4F63,0x81C3,/* 0xB0-0xB7 */ + 0x75C8,0x5EB8,0x96CD,0x8E0A,0x86F9,0x548F,0x6CF3,0x6D8C,/* 0xB8-0xBF */ + 0x6C38,0x607F,0x52C7,0x7528,0x5E7D,0x4F18,0x60A0,0x5FE7,/* 0xC0-0xC7 */ + 0x5C24,0x7531,0x90AE,0x94C0,0x72B9,0x6CB9,0x6E38,0x9149,/* 0xC8-0xCF */ + 0x6709,0x53CB,0x53F3,0x4F51,0x91C9,0x8BF1,0x53C8,0x5E7C,/* 0xD0-0xD7 */ + 0x8FC2,0x6DE4,0x4E8E,0x76C2,0x6986,0x865E,0x611A,0x8206,/* 0xD8-0xDF */ + 0x4F59,0x4FDE,0x903E,0x9C7C,0x6109,0x6E1D,0x6E14,0x9685,/* 0xE0-0xE7 */ + 0x4E88,0x5A31,0x96E8,0x4E0E,0x5C7F,0x79B9,0x5B87,0x8BED,/* 0xE8-0xEF */ + 
0x7FBD,0x7389,0x57DF,0x828B,0x90C1,0x5401,0x9047,0x55BB,/* 0xF0-0xF7 */ + 0x5CEA,0x5FA1,0x6108,0x6B32,0x72F1,0x80B2,0x8A89,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A1E,0x8A1F,0x8A20,0x8A21,0x8A22,0x8A23,0x8A24,0x8A25,/* 0x40-0x47 */ + 0x8A26,0x8A27,0x8A28,0x8A29,0x8A2A,0x8A2B,0x8A2C,0x8A2D,/* 0x48-0x4F */ + 0x8A2E,0x8A2F,0x8A30,0x8A31,0x8A32,0x8A33,0x8A34,0x8A35,/* 0x50-0x57 */ + 0x8A36,0x8A37,0x8A38,0x8A39,0x8A3A,0x8A3B,0x8A3C,0x8A3D,/* 0x58-0x5F */ + 0x8A3F,0x8A40,0x8A41,0x8A42,0x8A43,0x8A44,0x8A45,0x8A46,/* 0x60-0x67 */ + 0x8A47,0x8A49,0x8A4A,0x8A4B,0x8A4C,0x8A4D,0x8A4E,0x8A4F,/* 0x68-0x6F */ + 0x8A50,0x8A51,0x8A52,0x8A53,0x8A54,0x8A55,0x8A56,0x8A57,/* 0x70-0x77 */ + 0x8A58,0x8A59,0x8A5A,0x8A5B,0x8A5C,0x8A5D,0x8A5E,0x0000,/* 0x78-0x7F */ + + 0x8A5F,0x8A60,0x8A61,0x8A62,0x8A63,0x8A64,0x8A65,0x8A66,/* 0x80-0x87 */ + 0x8A67,0x8A68,0x8A69,0x8A6A,0x8A6B,0x8A6C,0x8A6D,0x8A6E,/* 0x88-0x8F */ + 0x8A6F,0x8A70,0x8A71,0x8A72,0x8A73,0x8A74,0x8A75,0x8A76,/* 0x90-0x97 */ + 0x8A77,0x8A78,0x8A7A,0x8A7B,0x8A7C,0x8A7D,0x8A7E,0x8A7F,/* 0x98-0x9F */ + 0x8A80,0x6D74,0x5BD3,0x88D5,0x9884,0x8C6B,0x9A6D,0x9E33,/* 0xA0-0xA7 */ + 0x6E0A,0x51A4,0x5143,0x57A3,0x8881,0x539F,0x63F4,0x8F95,/* 0xA8-0xAF */ + 0x56ED,0x5458,0x5706,0x733F,0x6E90,0x7F18,0x8FDC,0x82D1,/* 0xB0-0xB7 */ + 0x613F,0x6028,0x9662,0x66F0,0x7EA6,0x8D8A,0x8DC3,0x94A5,/* 0xB8-0xBF */ + 0x5CB3,0x7CA4,0x6708,0x60A6,0x9605,0x8018,0x4E91,0x90E7,/* 0xC0-0xC7 */ + 0x5300,0x9668,0x5141,0x8FD0,0x8574,0x915D,0x6655,0x97F5,/* 0xC8-0xCF */ + 0x5B55,0x531D,0x7838,0x6742,0x683D,0x54C9,0x707E,0x5BB0,/* 0xD0-0xD7 */ + 0x8F7D,0x518D,0x5728,0x54B1,0x6512,0x6682,0x8D5E,0x8D43,/* 0xD8-0xDF */ + 0x810F,0x846C,0x906D,0x7CDF,0x51FF,0x85FB,0x67A3,0x65E9,/* 0xE0-0xE7 */ + 0x6FA1,0x86A4,0x8E81,0x566A,0x9020,0x7682,0x7076,0x71E5,/* 0xE8-0xEF */ + 0x8D23,0x62E9,0x5219,0x6CFD,0x8D3C,0x600E,0x589E,0x618E,/* 0xF0-0xF7 */ + 0x66FE,0x8D60,0x624E,0x55B3,0x6E23,0x672D,0x8F67,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A81,0x8A82,0x8A83,0x8A84,0x8A85,0x8A86,0x8A87,0x8A88,/* 0x40-0x47 */ + 0x8A8B,0x8A8C,0x8A8D,0x8A8E,0x8A8F,0x8A90,0x8A91,0x8A92,/* 0x48-0x4F */ + 0x8A94,0x8A95,0x8A96,0x8A97,0x8A98,0x8A99,0x8A9A,0x8A9B,/* 0x50-0x57 */ + 0x8A9C,0x8A9D,0x8A9E,0x8A9F,0x8AA0,0x8AA1,0x8AA2,0x8AA3,/* 0x58-0x5F */ + 0x8AA4,0x8AA5,0x8AA6,0x8AA7,0x8AA8,0x8AA9,0x8AAA,0x8AAB,/* 
0x60-0x67 */ + 0x8AAC,0x8AAD,0x8AAE,0x8AAF,0x8AB0,0x8AB1,0x8AB2,0x8AB3,/* 0x68-0x6F */ + 0x8AB4,0x8AB5,0x8AB6,0x8AB7,0x8AB8,0x8AB9,0x8ABA,0x8ABB,/* 0x70-0x77 */ + 0x8ABC,0x8ABD,0x8ABE,0x8ABF,0x8AC0,0x8AC1,0x8AC2,0x0000,/* 0x78-0x7F */ + + 0x8AC3,0x8AC4,0x8AC5,0x8AC6,0x8AC7,0x8AC8,0x8AC9,0x8ACA,/* 0x80-0x87 */ + 0x8ACB,0x8ACC,0x8ACD,0x8ACE,0x8ACF,0x8AD0,0x8AD1,0x8AD2,/* 0x88-0x8F */ + 0x8AD3,0x8AD4,0x8AD5,0x8AD6,0x8AD7,0x8AD8,0x8AD9,0x8ADA,/* 0x90-0x97 */ + 0x8ADB,0x8ADC,0x8ADD,0x8ADE,0x8ADF,0x8AE0,0x8AE1,0x8AE2,/* 0x98-0x9F */ + 0x8AE3,0x94E1,0x95F8,0x7728,0x6805,0x69A8,0x548B,0x4E4D,/* 0xA0-0xA7 */ + 0x70B8,0x8BC8,0x6458,0x658B,0x5B85,0x7A84,0x503A,0x5BE8,/* 0xA8-0xAF */ + 0x77BB,0x6BE1,0x8A79,0x7C98,0x6CBE,0x76CF,0x65A9,0x8F97,/* 0xB0-0xB7 */ + 0x5D2D,0x5C55,0x8638,0x6808,0x5360,0x6218,0x7AD9,0x6E5B,/* 0xB8-0xBF */ + 0x7EFD,0x6A1F,0x7AE0,0x5F70,0x6F33,0x5F20,0x638C,0x6DA8,/* 0xC0-0xC7 */ + 0x6756,0x4E08,0x5E10,0x8D26,0x4ED7,0x80C0,0x7634,0x969C,/* 0xC8-0xCF */ + 0x62DB,0x662D,0x627E,0x6CBC,0x8D75,0x7167,0x7F69,0x5146,/* 0xD0-0xD7 */ + 0x8087,0x53EC,0x906E,0x6298,0x54F2,0x86F0,0x8F99,0x8005,/* 0xD8-0xDF */ + 0x9517,0x8517,0x8FD9,0x6D59,0x73CD,0x659F,0x771F,0x7504,/* 0xE0-0xE7 */ + 0x7827,0x81FB,0x8D1E,0x9488,0x4FA6,0x6795,0x75B9,0x8BCA,/* 0xE8-0xEF */ + 0x9707,0x632F,0x9547,0x9635,0x84B8,0x6323,0x7741,0x5F81,/* 0xF0-0xF7 */ + 0x72F0,0x4E89,0x6014,0x6574,0x62EF,0x6B63,0x653F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8AE4,0x8AE5,0x8AE6,0x8AE7,0x8AE8,0x8AE9,0x8AEA,0x8AEB,/* 0x40-0x47 */ + 0x8AEC,0x8AED,0x8AEE,0x8AEF,0x8AF0,0x8AF1,0x8AF2,0x8AF3,/* 0x48-0x4F */ + 0x8AF4,0x8AF5,0x8AF6,0x8AF7,0x8AF8,0x8AF9,0x8AFA,0x8AFB,/* 0x50-0x57 */ + 0x8AFC,0x8AFD,0x8AFE,0x8AFF,0x8B00,0x8B01,0x8B02,0x8B03,/* 0x58-0x5F */ + 0x8B04,0x8B05,0x8B06,0x8B08,0x8B09,0x8B0A,0x8B0B,0x8B0C,/* 0x60-0x67 */ + 0x8B0D,0x8B0E,0x8B0F,0x8B10,0x8B11,0x8B12,0x8B13,0x8B14,/* 0x68-0x6F */ + 0x8B15,0x8B16,0x8B17,0x8B18,0x8B19,0x8B1A,0x8B1B,0x8B1C,/* 0x70-0x77 */ + 0x8B1D,0x8B1E,0x8B1F,0x8B20,0x8B21,0x8B22,0x8B23,0x0000,/* 0x78-0x7F */ + + 0x8B24,0x8B25,0x8B27,0x8B28,0x8B29,0x8B2A,0x8B2B,0x8B2C,/* 0x80-0x87 */ + 0x8B2D,0x8B2E,0x8B2F,0x8B30,0x8B31,0x8B32,0x8B33,0x8B34,/* 0x88-0x8F */ + 0x8B35,0x8B36,0x8B37,0x8B38,0x8B39,0x8B3A,0x8B3B,0x8B3C,/* 0x90-0x97 */ + 0x8B3D,0x8B3E,0x8B3F,0x8B40,0x8B41,0x8B42,0x8B43,0x8B44,/* 0x98-0x9F */ + 0x8B45,0x5E27,0x75C7,0x90D1,0x8BC1,0x829D,0x679D,0x652F,/* 0xA0-0xA7 */ + 0x5431,0x8718,0x77E5,0x80A2,0x8102,0x6C41,0x4E4B,0x7EC7,/* 0xA8-0xAF */ + 0x804C,0x76F4,0x690D,0x6B96,0x6267,0x503C,0x4F84,0x5740,/* 0xB0-0xB7 */ + 0x6307,0x6B62,0x8DBE,0x53EA,0x65E8,0x7EB8,0x5FD7,0x631A,/* 0xB8-0xBF */ + 0x63B7,0x81F3,0x81F4,0x7F6E,0x5E1C,0x5CD9,0x5236,0x667A,/* 0xC0-0xC7 */ + 0x79E9,0x7A1A,0x8D28,0x7099,0x75D4,0x6EDE,0x6CBB,0x7A92,/* 0xC8-0xCF */ + 0x4E2D,0x76C5,0x5FE0,0x949F,0x8877,0x7EC8,0x79CD,0x80BF,/* 0xD0-0xD7 */ + 0x91CD,0x4EF2,0x4F17,0x821F,0x5468,0x5DDE,0x6D32,0x8BCC,/* 0xD8-0xDF */ + 
0x7CA5,0x8F74,0x8098,0x5E1A,0x5492,0x76B1,0x5B99,0x663C,/* 0xE0-0xE7 */ + 0x9AA4,0x73E0,0x682A,0x86DB,0x6731,0x732A,0x8BF8,0x8BDB,/* 0xE8-0xEF */ + 0x9010,0x7AF9,0x70DB,0x716E,0x62C4,0x77A9,0x5631,0x4E3B,/* 0xF0-0xF7 */ + 0x8457,0x67F1,0x52A9,0x86C0,0x8D2E,0x94F8,0x7B51,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B46,0x8B47,0x8B48,0x8B49,0x8B4A,0x8B4B,0x8B4C,0x8B4D,/* 0x40-0x47 */ + 0x8B4E,0x8B4F,0x8B50,0x8B51,0x8B52,0x8B53,0x8B54,0x8B55,/* 0x48-0x4F */ + 0x8B56,0x8B57,0x8B58,0x8B59,0x8B5A,0x8B5B,0x8B5C,0x8B5D,/* 0x50-0x57 */ + 0x8B5E,0x8B5F,0x8B60,0x8B61,0x8B62,0x8B63,0x8B64,0x8B65,/* 0x58-0x5F */ + 0x8B67,0x8B68,0x8B69,0x8B6A,0x8B6B,0x8B6D,0x8B6E,0x8B6F,/* 0x60-0x67 */ + 0x8B70,0x8B71,0x8B72,0x8B73,0x8B74,0x8B75,0x8B76,0x8B77,/* 0x68-0x6F */ + 0x8B78,0x8B79,0x8B7A,0x8B7B,0x8B7C,0x8B7D,0x8B7E,0x8B7F,/* 0x70-0x77 */ + 0x8B80,0x8B81,0x8B82,0x8B83,0x8B84,0x8B85,0x8B86,0x0000,/* 0x78-0x7F */ + + 0x8B87,0x8B88,0x8B89,0x8B8A,0x8B8B,0x8B8C,0x8B8D,0x8B8E,/* 0x80-0x87 */ + 0x8B8F,0x8B90,0x8B91,0x8B92,0x8B93,0x8B94,0x8B95,0x8B96,/* 0x88-0x8F */ + 0x8B97,0x8B98,0x8B99,0x8B9A,0x8B9B,0x8B9C,0x8B9D,0x8B9E,/* 0x90-0x97 */ + 0x8B9F,0x8BAC,0x8BB1,0x8BBB,0x8BC7,0x8BD0,0x8BEA,0x8C09,/* 0x98-0x9F */ + 0x8C1E,0x4F4F,0x6CE8,0x795D,0x9A7B,0x6293,0x722A,0x62FD,/* 0xA0-0xA7 */ + 0x4E13,0x7816,0x8F6C,0x64B0,0x8D5A,0x7BC6,0x6869,0x5E84,/* 0xA8-0xAF */ + 0x88C5,0x5986,0x649E,0x58EE,0x72B6,0x690E,0x9525,0x8FFD,/* 0xB0-0xB7 */ + 0x8D58,0x5760,0x7F00,0x8C06,0x51C6,0x6349,0x62D9,0x5353,/* 0xB8-0xBF */ + 0x684C,0x7422,0x8301,0x914C,0x5544,0x7740,0x707C,0x6D4A,/* 0xC0-0xC7 */ + 0x5179,0x54A8,0x8D44,0x59FF,0x6ECB,0x6DC4,0x5B5C,0x7D2B,/* 0xC8-0xCF */ + 0x4ED4,0x7C7D,0x6ED3,0x5B50,0x81EA,0x6E0D,0x5B57,0x9B03,/* 0xD0-0xD7 */ + 0x68D5,0x8E2A,0x5B97,0x7EFC,0x603B,0x7EB5,0x90B9,0x8D70,/* 0xD8-0xDF */ + 0x594F,0x63CD,0x79DF,0x8DB3,0x5352,0x65CF,0x7956,0x8BC5,/* 0xE0-0xE7 */ + 0x963B,0x7EC4,0x94BB,0x7E82,0x5634,0x9189,0x6700,0x7F6A,/* 0xE8-0xEF */ + 0x5C0A,0x9075,0x6628,0x5DE6,0x4F50,0x67DE,0x505A,0x4F5C,/* 0xF0-0xF7 */ + 0x5750,0x5EA7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8C38,0x8C39,0x8C3A,0x8C3B,0x8C3C,0x8C3D,0x8C3E,0x8C3F,/* 0x40-0x47 */ + 0x8C40,0x8C42,0x8C43,0x8C44,0x8C45,0x8C48,0x8C4A,0x8C4B,/* 0x48-0x4F */ + 0x8C4D,0x8C4E,0x8C4F,0x8C50,0x8C51,0x8C52,0x8C53,0x8C54,/* 
0x50-0x57 */ + 0x8C56,0x8C57,0x8C58,0x8C59,0x8C5B,0x8C5C,0x8C5D,0x8C5E,/* 0x58-0x5F */ + 0x8C5F,0x8C60,0x8C63,0x8C64,0x8C65,0x8C66,0x8C67,0x8C68,/* 0x60-0x67 */ + 0x8C69,0x8C6C,0x8C6D,0x8C6E,0x8C6F,0x8C70,0x8C71,0x8C72,/* 0x68-0x6F */ + 0x8C74,0x8C75,0x8C76,0x8C77,0x8C7B,0x8C7C,0x8C7D,0x8C7E,/* 0x70-0x77 */ + 0x8C7F,0x8C80,0x8C81,0x8C83,0x8C84,0x8C86,0x8C87,0x0000,/* 0x78-0x7F */ + + 0x8C88,0x8C8B,0x8C8D,0x8C8E,0x8C8F,0x8C90,0x8C91,0x8C92,/* 0x80-0x87 */ + 0x8C93,0x8C95,0x8C96,0x8C97,0x8C99,0x8C9A,0x8C9B,0x8C9C,/* 0x88-0x8F */ + 0x8C9D,0x8C9E,0x8C9F,0x8CA0,0x8CA1,0x8CA2,0x8CA3,0x8CA4,/* 0x90-0x97 */ + 0x8CA5,0x8CA6,0x8CA7,0x8CA8,0x8CA9,0x8CAA,0x8CAB,0x8CAC,/* 0x98-0x9F */ + 0x8CAD,0x4E8D,0x4E0C,0x5140,0x4E10,0x5EFF,0x5345,0x4E15,/* 0xA0-0xA7 */ + 0x4E98,0x4E1E,0x9B32,0x5B6C,0x5669,0x4E28,0x79BA,0x4E3F,/* 0xA8-0xAF */ + 0x5315,0x4E47,0x592D,0x723B,0x536E,0x6C10,0x56DF,0x80E4,/* 0xB0-0xB7 */ + 0x9997,0x6BD3,0x777E,0x9F17,0x4E36,0x4E9F,0x9F10,0x4E5C,/* 0xB8-0xBF */ + 0x4E69,0x4E93,0x8288,0x5B5B,0x556C,0x560F,0x4EC4,0x538D,/* 0xC0-0xC7 */ + 0x539D,0x53A3,0x53A5,0x53AE,0x9765,0x8D5D,0x531A,0x53F5,/* 0xC8-0xCF */ + 0x5326,0x532E,0x533E,0x8D5C,0x5366,0x5363,0x5202,0x5208,/* 0xD0-0xD7 */ + 0x520E,0x522D,0x5233,0x523F,0x5240,0x524C,0x525E,0x5261,/* 0xD8-0xDF */ + 0x525C,0x84AF,0x527D,0x5282,0x5281,0x5290,0x5293,0x5182,/* 0xE0-0xE7 */ + 0x7F54,0x4EBB,0x4EC3,0x4EC9,0x4EC2,0x4EE8,0x4EE1,0x4EEB,/* 0xE8-0xEF */ + 0x4EDE,0x4F1B,0x4EF3,0x4F22,0x4F64,0x4EF5,0x4F25,0x4F27,/* 0xF0-0xF7 */ + 0x4F09,0x4F2B,0x4F5E,0x4F67,0x6538,0x4F5A,0x4F5D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8CAE,0x8CAF,0x8CB0,0x8CB1,0x8CB2,0x8CB3,0x8CB4,0x8CB5,/* 0x40-0x47 */ + 0x8CB6,0x8CB7,0x8CB8,0x8CB9,0x8CBA,0x8CBB,0x8CBC,0x8CBD,/* 0x48-0x4F */ + 0x8CBE,0x8CBF,0x8CC0,0x8CC1,0x8CC2,0x8CC3,0x8CC4,0x8CC5,/* 0x50-0x57 */ + 0x8CC6,0x8CC7,0x8CC8,0x8CC9,0x8CCA,0x8CCB,0x8CCC,0x8CCD,/* 0x58-0x5F */ + 0x8CCE,0x8CCF,0x8CD0,0x8CD1,0x8CD2,0x8CD3,0x8CD4,0x8CD5,/* 0x60-0x67 */ + 0x8CD6,0x8CD7,0x8CD8,0x8CD9,0x8CDA,0x8CDB,0x8CDC,0x8CDD,/* 0x68-0x6F */ + 0x8CDE,0x8CDF,0x8CE0,0x8CE1,0x8CE2,0x8CE3,0x8CE4,0x8CE5,/* 0x70-0x77 */ + 0x8CE6,0x8CE7,0x8CE8,0x8CE9,0x8CEA,0x8CEB,0x8CEC,0x0000,/* 0x78-0x7F */ + + 0x8CED,0x8CEE,0x8CEF,0x8CF0,0x8CF1,0x8CF2,0x8CF3,0x8CF4,/* 0x80-0x87 */ + 0x8CF5,0x8CF6,0x8CF7,0x8CF8,0x8CF9,0x8CFA,0x8CFB,0x8CFC,/* 0x88-0x8F */ + 0x8CFD,0x8CFE,0x8CFF,0x8D00,0x8D01,0x8D02,0x8D03,0x8D04,/* 0x90-0x97 */ + 0x8D05,0x8D06,0x8D07,0x8D08,0x8D09,0x8D0A,0x8D0B,0x8D0C,/* 0x98-0x9F */ + 0x8D0D,0x4F5F,0x4F57,0x4F32,0x4F3D,0x4F76,0x4F74,0x4F91,/* 0xA0-0xA7 */ + 0x4F89,0x4F83,0x4F8F,0x4F7E,0x4F7B,0x4FAA,0x4F7C,0x4FAC,/* 0xA8-0xAF */ + 0x4F94,0x4FE6,0x4FE8,0x4FEA,0x4FC5,0x4FDA,0x4FE3,0x4FDC,/* 0xB0-0xB7 */ + 0x4FD1,0x4FDF,0x4FF8,0x5029,0x504C,0x4FF3,0x502C,0x500F,/* 0xB8-0xBF */ + 0x502E,0x502D,0x4FFE,0x501C,0x500C,0x5025,0x5028,0x507E,/* 0xC0-0xC7 */ + 0x5043,0x5055,0x5048,0x504E,0x506C,0x507B,0x50A5,0x50A7,/* 0xC8-0xCF */ + 
0x50A9,0x50BA,0x50D6,0x5106,0x50ED,0x50EC,0x50E6,0x50EE,/* 0xD0-0xD7 */ + 0x5107,0x510B,0x4EDD,0x6C3D,0x4F58,0x4F65,0x4FCE,0x9FA0,/* 0xD8-0xDF */ + 0x6C46,0x7C74,0x516E,0x5DFD,0x9EC9,0x9998,0x5181,0x5914,/* 0xE0-0xE7 */ + 0x52F9,0x530D,0x8A07,0x5310,0x51EB,0x5919,0x5155,0x4EA0,/* 0xE8-0xEF */ + 0x5156,0x4EB3,0x886E,0x88A4,0x4EB5,0x8114,0x88D2,0x7980,/* 0xF0-0xF7 */ + 0x5B34,0x8803,0x7FB8,0x51AB,0x51B1,0x51BD,0x51BC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8D0E,0x8D0F,0x8D10,0x8D11,0x8D12,0x8D13,0x8D14,0x8D15,/* 0x40-0x47 */ + 0x8D16,0x8D17,0x8D18,0x8D19,0x8D1A,0x8D1B,0x8D1C,0x8D20,/* 0x48-0x4F */ + 0x8D51,0x8D52,0x8D57,0x8D5F,0x8D65,0x8D68,0x8D69,0x8D6A,/* 0x50-0x57 */ + 0x8D6C,0x8D6E,0x8D6F,0x8D71,0x8D72,0x8D78,0x8D79,0x8D7A,/* 0x58-0x5F */ + 0x8D7B,0x8D7C,0x8D7D,0x8D7E,0x8D7F,0x8D80,0x8D82,0x8D83,/* 0x60-0x67 */ + 0x8D86,0x8D87,0x8D88,0x8D89,0x8D8C,0x8D8D,0x8D8E,0x8D8F,/* 0x68-0x6F */ + 0x8D90,0x8D92,0x8D93,0x8D95,0x8D96,0x8D97,0x8D98,0x8D99,/* 0x70-0x77 */ + 0x8D9A,0x8D9B,0x8D9C,0x8D9D,0x8D9E,0x8DA0,0x8DA1,0x0000,/* 0x78-0x7F */ + + 0x8DA2,0x8DA4,0x8DA5,0x8DA6,0x8DA7,0x8DA8,0x8DA9,0x8DAA,/* 0x80-0x87 */ + 0x8DAB,0x8DAC,0x8DAD,0x8DAE,0x8DAF,0x8DB0,0x8DB2,0x8DB6,/* 0x88-0x8F */ + 0x8DB7,0x8DB9,0x8DBB,0x8DBD,0x8DC0,0x8DC1,0x8DC2,0x8DC5,/* 0x90-0x97 */ + 0x8DC7,0x8DC8,0x8DC9,0x8DCA,0x8DCD,0x8DD0,0x8DD2,0x8DD3,/* 0x98-0x9F */ + 0x8DD4,0x51C7,0x5196,0x51A2,0x51A5,0x8BA0,0x8BA6,0x8BA7,/* 0xA0-0xA7 */ + 0x8BAA,0x8BB4,0x8BB5,0x8BB7,0x8BC2,0x8BC3,0x8BCB,0x8BCF,/* 0xA8-0xAF */ + 0x8BCE,0x8BD2,0x8BD3,0x8BD4,0x8BD6,0x8BD8,0x8BD9,0x8BDC,/* 0xB0-0xB7 */ + 0x8BDF,0x8BE0,0x8BE4,0x8BE8,0x8BE9,0x8BEE,0x8BF0,0x8BF3,/* 0xB8-0xBF */ + 0x8BF6,0x8BF9,0x8BFC,0x8BFF,0x8C00,0x8C02,0x8C04,0x8C07,/* 0xC0-0xC7 */ + 0x8C0C,0x8C0F,0x8C11,0x8C12,0x8C14,0x8C15,0x8C16,0x8C19,/* 0xC8-0xCF */ + 0x8C1B,0x8C18,0x8C1D,0x8C1F,0x8C20,0x8C21,0x8C25,0x8C27,/* 0xD0-0xD7 */ + 0x8C2A,0x8C2B,0x8C2E,0x8C2F,0x8C32,0x8C33,0x8C35,0x8C36,/* 0xD8-0xDF */ + 0x5369,0x537A,0x961D,0x9622,0x9621,0x9631,0x962A,0x963D,/* 0xE0-0xE7 */ + 0x963C,0x9642,0x9649,0x9654,0x965F,0x9667,0x966C,0x9672,/* 0xE8-0xEF */ + 0x9674,0x9688,0x968D,0x9697,0x96B0,0x9097,0x909B,0x909D,/* 0xF0-0xF7 */ + 0x9099,0x90AC,0x90A1,0x90B4,0x90B3,0x90B6,0x90BA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8DD5,0x8DD8,0x8DD9,0x8DDC,0x8DE0,0x8DE1,0x8DE2,0x8DE5,/* 
0x40-0x47 */ + 0x8DE6,0x8DE7,0x8DE9,0x8DED,0x8DEE,0x8DF0,0x8DF1,0x8DF2,/* 0x48-0x4F */ + 0x8DF4,0x8DF6,0x8DFC,0x8DFE,0x8DFF,0x8E00,0x8E01,0x8E02,/* 0x50-0x57 */ + 0x8E03,0x8E04,0x8E06,0x8E07,0x8E08,0x8E0B,0x8E0D,0x8E0E,/* 0x58-0x5F */ + 0x8E10,0x8E11,0x8E12,0x8E13,0x8E15,0x8E16,0x8E17,0x8E18,/* 0x60-0x67 */ + 0x8E19,0x8E1A,0x8E1B,0x8E1C,0x8E20,0x8E21,0x8E24,0x8E25,/* 0x68-0x6F */ + 0x8E26,0x8E27,0x8E28,0x8E2B,0x8E2D,0x8E30,0x8E32,0x8E33,/* 0x70-0x77 */ + 0x8E34,0x8E36,0x8E37,0x8E38,0x8E3B,0x8E3C,0x8E3E,0x0000,/* 0x78-0x7F */ + + 0x8E3F,0x8E43,0x8E45,0x8E46,0x8E4C,0x8E4D,0x8E4E,0x8E4F,/* 0x80-0x87 */ + 0x8E50,0x8E53,0x8E54,0x8E55,0x8E56,0x8E57,0x8E58,0x8E5A,/* 0x88-0x8F */ + 0x8E5B,0x8E5C,0x8E5D,0x8E5E,0x8E5F,0x8E60,0x8E61,0x8E62,/* 0x90-0x97 */ + 0x8E63,0x8E64,0x8E65,0x8E67,0x8E68,0x8E6A,0x8E6B,0x8E6E,/* 0x98-0x9F */ + 0x8E71,0x90B8,0x90B0,0x90CF,0x90C5,0x90BE,0x90D0,0x90C4,/* 0xA0-0xA7 */ + 0x90C7,0x90D3,0x90E6,0x90E2,0x90DC,0x90D7,0x90DB,0x90EB,/* 0xA8-0xAF */ + 0x90EF,0x90FE,0x9104,0x9122,0x911E,0x9123,0x9131,0x912F,/* 0xB0-0xB7 */ + 0x9139,0x9143,0x9146,0x520D,0x5942,0x52A2,0x52AC,0x52AD,/* 0xB8-0xBF */ + 0x52BE,0x54FF,0x52D0,0x52D6,0x52F0,0x53DF,0x71EE,0x77CD,/* 0xC0-0xC7 */ + 0x5EF4,0x51F5,0x51FC,0x9B2F,0x53B6,0x5F01,0x755A,0x5DEF,/* 0xC8-0xCF */ + 0x574C,0x57A9,0x57A1,0x587E,0x58BC,0x58C5,0x58D1,0x5729,/* 0xD0-0xD7 */ + 0x572C,0x572A,0x5733,0x5739,0x572E,0x572F,0x575C,0x573B,/* 0xD8-0xDF */ + 0x5742,0x5769,0x5785,0x576B,0x5786,0x577C,0x577B,0x5768,/* 0xE0-0xE7 */ + 0x576D,0x5776,0x5773,0x57AD,0x57A4,0x578C,0x57B2,0x57CF,/* 0xE8-0xEF */ + 0x57A7,0x57B4,0x5793,0x57A0,0x57D5,0x57D8,0x57DA,0x57D9,/* 0xF0-0xF7 */ + 0x57D2,0x57B8,0x57F4,0x57EF,0x57F8,0x57E4,0x57DD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E73,0x8E75,0x8E77,0x8E78,0x8E79,0x8E7A,0x8E7B,0x8E7D,/* 0x40-0x47 */ + 0x8E7E,0x8E80,0x8E82,0x8E83,0x8E84,0x8E86,0x8E88,0x8E89,/* 0x48-0x4F */ + 0x8E8A,0x8E8B,0x8E8C,0x8E8D,0x8E8E,0x8E91,0x8E92,0x8E93,/* 0x50-0x57 */ + 0x8E95,0x8E96,0x8E97,0x8E98,0x8E99,0x8E9A,0x8E9B,0x8E9D,/* 0x58-0x5F */ + 0x8E9F,0x8EA0,0x8EA1,0x8EA2,0x8EA3,0x8EA4,0x8EA5,0x8EA6,/* 0x60-0x67 */ + 0x8EA7,0x8EA8,0x8EA9,0x8EAA,0x8EAD,0x8EAE,0x8EB0,0x8EB1,/* 0x68-0x6F */ + 0x8EB3,0x8EB4,0x8EB5,0x8EB6,0x8EB7,0x8EB8,0x8EB9,0x8EBB,/* 0x70-0x77 */ + 0x8EBC,0x8EBD,0x8EBE,0x8EBF,0x8EC0,0x8EC1,0x8EC2,0x0000,/* 0x78-0x7F */ + + 0x8EC3,0x8EC4,0x8EC5,0x8EC6,0x8EC7,0x8EC8,0x8EC9,0x8ECA,/* 0x80-0x87 */ + 0x8ECB,0x8ECC,0x8ECD,0x8ECF,0x8ED0,0x8ED1,0x8ED2,0x8ED3,/* 0x88-0x8F */ + 0x8ED4,0x8ED5,0x8ED6,0x8ED7,0x8ED8,0x8ED9,0x8EDA,0x8EDB,/* 0x90-0x97 */ + 0x8EDC,0x8EDD,0x8EDE,0x8EDF,0x8EE0,0x8EE1,0x8EE2,0x8EE3,/* 0x98-0x9F */ + 0x8EE4,0x580B,0x580D,0x57FD,0x57ED,0x5800,0x581E,0x5819,/* 0xA0-0xA7 */ + 0x5844,0x5820,0x5865,0x586C,0x5881,0x5889,0x589A,0x5880,/* 0xA8-0xAF */ + 0x99A8,0x9F19,0x61FF,0x8279,0x827D,0x827F,0x828F,0x828A,/* 0xB0-0xB7 */ + 0x82A8,0x8284,0x828E,0x8291,0x8297,0x8299,0x82AB,0x82B8,/* 0xB8-0xBF */ + 
0x82BE,0x82B0,0x82C8,0x82CA,0x82E3,0x8298,0x82B7,0x82AE,/* 0xC0-0xC7 */ + 0x82CB,0x82CC,0x82C1,0x82A9,0x82B4,0x82A1,0x82AA,0x829F,/* 0xC8-0xCF */ + 0x82C4,0x82CE,0x82A4,0x82E1,0x8309,0x82F7,0x82E4,0x830F,/* 0xD0-0xD7 */ + 0x8307,0x82DC,0x82F4,0x82D2,0x82D8,0x830C,0x82FB,0x82D3,/* 0xD8-0xDF */ + 0x8311,0x831A,0x8306,0x8314,0x8315,0x82E0,0x82D5,0x831C,/* 0xE0-0xE7 */ + 0x8351,0x835B,0x835C,0x8308,0x8392,0x833C,0x8334,0x8331,/* 0xE8-0xEF */ + 0x839B,0x835E,0x832F,0x834F,0x8347,0x8343,0x835F,0x8340,/* 0xF0-0xF7 */ + 0x8317,0x8360,0x832D,0x833A,0x8333,0x8366,0x8365,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8EE5,0x8EE6,0x8EE7,0x8EE8,0x8EE9,0x8EEA,0x8EEB,0x8EEC,/* 0x40-0x47 */ + 0x8EED,0x8EEE,0x8EEF,0x8EF0,0x8EF1,0x8EF2,0x8EF3,0x8EF4,/* 0x48-0x4F */ + 0x8EF5,0x8EF6,0x8EF7,0x8EF8,0x8EF9,0x8EFA,0x8EFB,0x8EFC,/* 0x50-0x57 */ + 0x8EFD,0x8EFE,0x8EFF,0x8F00,0x8F01,0x8F02,0x8F03,0x8F04,/* 0x58-0x5F */ + 0x8F05,0x8F06,0x8F07,0x8F08,0x8F09,0x8F0A,0x8F0B,0x8F0C,/* 0x60-0x67 */ + 0x8F0D,0x8F0E,0x8F0F,0x8F10,0x8F11,0x8F12,0x8F13,0x8F14,/* 0x68-0x6F */ + 0x8F15,0x8F16,0x8F17,0x8F18,0x8F19,0x8F1A,0x8F1B,0x8F1C,/* 0x70-0x77 */ + 0x8F1D,0x8F1E,0x8F1F,0x8F20,0x8F21,0x8F22,0x8F23,0x0000,/* 0x78-0x7F */ + + 0x8F24,0x8F25,0x8F26,0x8F27,0x8F28,0x8F29,0x8F2A,0x8F2B,/* 0x80-0x87 */ + 0x8F2C,0x8F2D,0x8F2E,0x8F2F,0x8F30,0x8F31,0x8F32,0x8F33,/* 0x88-0x8F */ + 0x8F34,0x8F35,0x8F36,0x8F37,0x8F38,0x8F39,0x8F3A,0x8F3B,/* 0x90-0x97 */ + 0x8F3C,0x8F3D,0x8F3E,0x8F3F,0x8F40,0x8F41,0x8F42,0x8F43,/* 0x98-0x9F */ + 0x8F44,0x8368,0x831B,0x8369,0x836C,0x836A,0x836D,0x836E,/* 0xA0-0xA7 */ + 0x83B0,0x8378,0x83B3,0x83B4,0x83A0,0x83AA,0x8393,0x839C,/* 0xA8-0xAF */ + 0x8385,0x837C,0x83B6,0x83A9,0x837D,0x83B8,0x837B,0x8398,/* 0xB0-0xB7 */ + 0x839E,0x83A8,0x83BA,0x83BC,0x83C1,0x8401,0x83E5,0x83D8,/* 0xB8-0xBF */ + 0x5807,0x8418,0x840B,0x83DD,0x83FD,0x83D6,0x841C,0x8438,/* 0xC0-0xC7 */ + 0x8411,0x8406,0x83D4,0x83DF,0x840F,0x8403,0x83F8,0x83F9,/* 0xC8-0xCF */ + 0x83EA,0x83C5,0x83C0,0x8426,0x83F0,0x83E1,0x845C,0x8451,/* 0xD0-0xD7 */ + 0x845A,0x8459,0x8473,0x8487,0x8488,0x847A,0x8489,0x8478,/* 0xD8-0xDF */ + 0x843C,0x8446,0x8469,0x8476,0x848C,0x848E,0x8431,0x846D,/* 0xE0-0xE7 */ + 0x84C1,0x84CD,0x84D0,0x84E6,0x84BD,0x84D3,0x84CA,0x84BF,/* 0xE8-0xEF */ + 0x84BA,0x84E0,0x84A1,0x84B9,0x84B4,0x8497,0x84E5,0x84E3,/* 0xF0-0xF7 */ + 0x850C,0x750D,0x8538,0x84F0,0x8539,0x851F,0x853A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F45,0x8F46,0x8F47,0x8F48,0x8F49,0x8F4A,0x8F4B,0x8F4C,/* 0x40-0x47 */ + 0x8F4D,0x8F4E,0x8F4F,0x8F50,0x8F51,0x8F52,0x8F53,0x8F54,/* 0x48-0x4F */ + 0x8F55,0x8F56,0x8F57,0x8F58,0x8F59,0x8F5A,0x8F5B,0x8F5C,/* 0x50-0x57 */ + 0x8F5D,0x8F5E,0x8F5F,0x8F60,0x8F61,0x8F62,0x8F63,0x8F64,/* 0x58-0x5F */ + 0x8F65,0x8F6A,0x8F80,0x8F8C,0x8F92,0x8F9D,0x8FA0,0x8FA1,/* 0x60-0x67 */ + 0x8FA2,0x8FA4,0x8FA5,0x8FA6,0x8FA7,0x8FAA,0x8FAC,0x8FAD,/* 0x68-0x6F */ + 0x8FAE,0x8FAF,0x8FB2,0x8FB3,0x8FB4,0x8FB5,0x8FB7,0x8FB8,/* 0x70-0x77 */ + 0x8FBA,0x8FBB,0x8FBC,0x8FBF,0x8FC0,0x8FC3,0x8FC6,0x0000,/* 0x78-0x7F */ + + 0x8FC9,0x8FCA,0x8FCB,0x8FCC,0x8FCD,0x8FCF,0x8FD2,0x8FD6,/* 0x80-0x87 */ + 0x8FD7,0x8FDA,0x8FE0,0x8FE1,0x8FE3,0x8FE7,0x8FEC,0x8FEF,/* 0x88-0x8F */ + 0x8FF1,0x8FF2,0x8FF4,0x8FF5,0x8FF6,0x8FFA,0x8FFB,0x8FFC,/* 0x90-0x97 */ + 0x8FFE,0x8FFF,0x9007,0x9008,0x900C,0x900E,0x9013,0x9015,/* 0x98-0x9F */ + 0x9018,0x8556,0x853B,0x84FF,0x84FC,0x8559,0x8548,0x8568,/* 0xA0-0xA7 */ + 0x8564,0x855E,0x857A,0x77A2,0x8543,0x8572,0x857B,0x85A4,/* 0xA8-0xAF */ + 0x85A8,0x8587,0x858F,0x8579,0x85AE,0x859C,0x8585,0x85B9,/* 0xB0-0xB7 */ + 0x85B7,0x85B0,0x85D3,0x85C1,0x85DC,0x85FF,0x8627,0x8605,/* 0xB8-0xBF */ + 0x8629,0x8616,0x863C,0x5EFE,0x5F08,0x593C,0x5941,0x8037,/* 0xC0-0xC7 */ + 0x5955,0x595A,0x5958,0x530F,0x5C22,0x5C25,0x5C2C,0x5C34,/* 0xC8-0xCF */ + 0x624C,0x626A,0x629F,0x62BB,0x62CA,0x62DA,0x62D7,0x62EE,/* 0xD0-0xD7 */ + 0x6322,0x62F6,0x6339,0x634B,0x6343,0x63AD,0x63F6,0x6371,/* 0xD8-0xDF */ + 0x637A,0x638E,0x63B4,0x636D,0x63AC,0x638A,0x6369,0x63AE,/* 0xE0-0xE7 */ + 0x63BC,0x63F2,0x63F8,0x63E0,0x63FF,0x63C4,0x63DE,0x63CE,/* 0xE8-0xEF */ + 0x6452,0x63C6,0x63BE,0x6445,0x6441,0x640B,0x641B,0x6420,/* 0xF0-0xF7 */ + 0x640C,0x6426,0x6421,0x645E,0x6484,0x646D,0x6496,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9019,0x901C,0x9023,0x9024,0x9025,0x9027,0x9028,0x9029,/* 0x40-0x47 */ + 0x902A,0x902B,0x902C,0x9030,0x9031,0x9032,0x9033,0x9034,/* 0x48-0x4F */ + 0x9037,0x9039,0x903A,0x903D,0x903F,0x9040,0x9043,0x9045,/* 0x50-0x57 */ + 0x9046,0x9048,0x9049,0x904A,0x904B,0x904C,0x904E,0x9054,/* 0x58-0x5F */ + 0x9055,0x9056,0x9059,0x905A,0x905C,0x905D,0x905E,0x905F,/* 0x60-0x67 */ + 0x9060,0x9061,0x9064,0x9066,0x9067,0x9069,0x906A,0x906B,/* 0x68-0x6F */ + 0x906C,0x906F,0x9070,0x9071,0x9072,0x9073,0x9076,0x9077,/* 0x70-0x77 */ + 0x9078,0x9079,0x907A,0x907B,0x907C,0x907E,0x9081,0x0000,/* 0x78-0x7F */ + + 0x9084,0x9085,0x9086,0x9087,0x9089,0x908A,0x908C,0x908D,/* 0x80-0x87 */ + 0x908E,0x908F,0x9090,0x9092,0x9094,0x9096,0x9098,0x909A,/* 0x88-0x8F */ + 0x909C,0x909E,0x909F,0x90A0,0x90A4,0x90A5,0x90A7,0x90A8,/* 0x90-0x97 */ + 0x90A9,0x90AB,0x90AD,0x90B2,0x90B7,0x90BC,0x90BD,0x90BF,/* 0x98-0x9F */ + 0x90C0,0x647A,0x64B7,0x64B8,0x6499,0x64BA,0x64C0,0x64D0,/* 0xA0-0xA7 */ + 0x64D7,0x64E4,0x64E2,0x6509,0x6525,0x652E,0x5F0B,0x5FD2,/* 0xA8-0xAF */ + 
0x7519,0x5F11,0x535F,0x53F1,0x53FD,0x53E9,0x53E8,0x53FB,/* 0xB0-0xB7 */ + 0x5412,0x5416,0x5406,0x544B,0x5452,0x5453,0x5454,0x5456,/* 0xB8-0xBF */ + 0x5443,0x5421,0x5457,0x5459,0x5423,0x5432,0x5482,0x5494,/* 0xC0-0xC7 */ + 0x5477,0x5471,0x5464,0x549A,0x549B,0x5484,0x5476,0x5466,/* 0xC8-0xCF */ + 0x549D,0x54D0,0x54AD,0x54C2,0x54B4,0x54D2,0x54A7,0x54A6,/* 0xD0-0xD7 */ + 0x54D3,0x54D4,0x5472,0x54A3,0x54D5,0x54BB,0x54BF,0x54CC,/* 0xD8-0xDF */ + 0x54D9,0x54DA,0x54DC,0x54A9,0x54AA,0x54A4,0x54DD,0x54CF,/* 0xE0-0xE7 */ + 0x54DE,0x551B,0x54E7,0x5520,0x54FD,0x5514,0x54F3,0x5522,/* 0xE8-0xEF */ + 0x5523,0x550F,0x5511,0x5527,0x552A,0x5567,0x558F,0x55B5,/* 0xF0-0xF7 */ + 0x5549,0x556D,0x5541,0x5555,0x553F,0x5550,0x553C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x90C2,0x90C3,0x90C6,0x90C8,0x90C9,0x90CB,0x90CC,0x90CD,/* 0x40-0x47 */ + 0x90D2,0x90D4,0x90D5,0x90D6,0x90D8,0x90D9,0x90DA,0x90DE,/* 0x48-0x4F */ + 0x90DF,0x90E0,0x90E3,0x90E4,0x90E5,0x90E9,0x90EA,0x90EC,/* 0x50-0x57 */ + 0x90EE,0x90F0,0x90F1,0x90F2,0x90F3,0x90F5,0x90F6,0x90F7,/* 0x58-0x5F */ + 0x90F9,0x90FA,0x90FB,0x90FC,0x90FF,0x9100,0x9101,0x9103,/* 0x60-0x67 */ + 0x9105,0x9106,0x9107,0x9108,0x9109,0x910A,0x910B,0x910C,/* 0x68-0x6F */ + 0x910D,0x910E,0x910F,0x9110,0x9111,0x9112,0x9113,0x9114,/* 0x70-0x77 */ + 0x9115,0x9116,0x9117,0x9118,0x911A,0x911B,0x911C,0x0000,/* 0x78-0x7F */ + + 0x911D,0x911F,0x9120,0x9121,0x9124,0x9125,0x9126,0x9127,/* 0x80-0x87 */ + 0x9128,0x9129,0x912A,0x912B,0x912C,0x912D,0x912E,0x9130,/* 0x88-0x8F */ + 0x9132,0x9133,0x9134,0x9135,0x9136,0x9137,0x9138,0x913A,/* 0x90-0x97 */ + 0x913B,0x913C,0x913D,0x913E,0x913F,0x9140,0x9141,0x9142,/* 0x98-0x9F */ + 0x9144,0x5537,0x5556,0x5575,0x5576,0x5577,0x5533,0x5530,/* 0xA0-0xA7 */ + 0x555C,0x558B,0x55D2,0x5583,0x55B1,0x55B9,0x5588,0x5581,/* 0xA8-0xAF */ + 0x559F,0x557E,0x55D6,0x5591,0x557B,0x55DF,0x55BD,0x55BE,/* 0xB0-0xB7 */ + 0x5594,0x5599,0x55EA,0x55F7,0x55C9,0x561F,0x55D1,0x55EB,/* 0xB8-0xBF */ + 0x55EC,0x55D4,0x55E6,0x55DD,0x55C4,0x55EF,0x55E5,0x55F2,/* 0xC0-0xC7 */ + 0x55F3,0x55CC,0x55CD,0x55E8,0x55F5,0x55E4,0x8F94,0x561E,/* 0xC8-0xCF */ + 0x5608,0x560C,0x5601,0x5624,0x5623,0x55FE,0x5600,0x5627,/* 0xD0-0xD7 */ + 0x562D,0x5658,0x5639,0x5657,0x562C,0x564D,0x5662,0x5659,/* 0xD8-0xDF */ + 0x565C,0x564C,0x5654,0x5686,0x5664,0x5671,0x566B,0x567B,/* 0xE0-0xE7 */ + 0x567C,0x5685,0x5693,0x56AF,0x56D4,0x56D7,0x56DD,0x56E1,/* 0xE8-0xEF */ + 0x56F5,0x56EB,0x56F9,0x56FF,0x5704,0x570A,0x5709,0x571C,/* 0xF0-0xF7 */ + 0x5E0F,0x5E19,0x5E14,0x5E11,0x5E31,0x5E3B,0x5E3C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9145,0x9147,0x9148,0x9151,0x9153,0x9154,0x9155,0x9156,/* 0x40-0x47 */ + 0x9158,0x9159,0x915B,0x915C,0x915F,0x9160,0x9166,0x9167,/* 0x48-0x4F */ + 0x9168,0x916B,0x916D,0x9173,0x917A,0x917B,0x917C,0x9180,/* 0x50-0x57 */ + 0x9181,0x9182,0x9183,0x9184,0x9186,0x9188,0x918A,0x918E,/* 0x58-0x5F */ + 0x918F,0x9193,0x9194,0x9195,0x9196,0x9197,0x9198,0x9199,/* 0x60-0x67 */ + 0x919C,0x919D,0x919E,0x919F,0x91A0,0x91A1,0x91A4,0x91A5,/* 0x68-0x6F */ + 0x91A6,0x91A7,0x91A8,0x91A9,0x91AB,0x91AC,0x91B0,0x91B1,/* 0x70-0x77 */ + 0x91B2,0x91B3,0x91B6,0x91B7,0x91B8,0x91B9,0x91BB,0x0000,/* 0x78-0x7F */ + + 0x91BC,0x91BD,0x91BE,0x91BF,0x91C0,0x91C1,0x91C2,0x91C3,/* 0x80-0x87 */ + 0x91C4,0x91C5,0x91C6,0x91C8,0x91CB,0x91D0,0x91D2,0x91D3,/* 0x88-0x8F */ + 0x91D4,0x91D5,0x91D6,0x91D7,0x91D8,0x91D9,0x91DA,0x91DB,/* 0x90-0x97 */ + 0x91DD,0x91DE,0x91DF,0x91E0,0x91E1,0x91E2,0x91E3,0x91E4,/* 0x98-0x9F */ + 0x91E5,0x5E37,0x5E44,0x5E54,0x5E5B,0x5E5E,0x5E61,0x5C8C,/* 0xA0-0xA7 */ + 0x5C7A,0x5C8D,0x5C90,0x5C96,0x5C88,0x5C98,0x5C99,0x5C91,/* 0xA8-0xAF */ + 0x5C9A,0x5C9C,0x5CB5,0x5CA2,0x5CBD,0x5CAC,0x5CAB,0x5CB1,/* 0xB0-0xB7 */ + 0x5CA3,0x5CC1,0x5CB7,0x5CC4,0x5CD2,0x5CE4,0x5CCB,0x5CE5,/* 0xB8-0xBF */ + 0x5D02,0x5D03,0x5D27,0x5D26,0x5D2E,0x5D24,0x5D1E,0x5D06,/* 0xC0-0xC7 */ + 0x5D1B,0x5D58,0x5D3E,0x5D34,0x5D3D,0x5D6C,0x5D5B,0x5D6F,/* 0xC8-0xCF */ + 0x5D5D,0x5D6B,0x5D4B,0x5D4A,0x5D69,0x5D74,0x5D82,0x5D99,/* 0xD0-0xD7 */ + 0x5D9D,0x8C73,0x5DB7,0x5DC5,0x5F73,0x5F77,0x5F82,0x5F87,/* 0xD8-0xDF */ + 0x5F89,0x5F8C,0x5F95,0x5F99,0x5F9C,0x5FA8,0x5FAD,0x5FB5,/* 0xE0-0xE7 */ + 0x5FBC,0x8862,0x5F61,0x72AD,0x72B0,0x72B4,0x72B7,0x72B8,/* 0xE8-0xEF */ + 0x72C3,0x72C1,0x72CE,0x72CD,0x72D2,0x72E8,0x72EF,0x72E9,/* 0xF0-0xF7 */ + 0x72F2,0x72F4,0x72F7,0x7301,0x72F3,0x7303,0x72FA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x91E6,0x91E7,0x91E8,0x91E9,0x91EA,0x91EB,0x91EC,0x91ED,/* 0x40-0x47 */ + 0x91EE,0x91EF,0x91F0,0x91F1,0x91F2,0x91F3,0x91F4,0x91F5,/* 0x48-0x4F */ + 0x91F6,0x91F7,0x91F8,0x91F9,0x91FA,0x91FB,0x91FC,0x91FD,/* 0x50-0x57 */ + 0x91FE,0x91FF,0x9200,0x9201,0x9202,0x9203,0x9204,0x9205,/* 0x58-0x5F */ + 0x9206,0x9207,0x9208,0x9209,0x920A,0x920B,0x920C,0x920D,/* 0x60-0x67 */ + 0x920E,0x920F,0x9210,0x9211,0x9212,0x9213,0x9214,0x9215,/* 0x68-0x6F */ + 0x9216,0x9217,0x9218,0x9219,0x921A,0x921B,0x921C,0x921D,/* 0x70-0x77 */ + 0x921E,0x921F,0x9220,0x9221,0x9222,0x9223,0x9224,0x0000,/* 0x78-0x7F */ + + 0x9225,0x9226,0x9227,0x9228,0x9229,0x922A,0x922B,0x922C,/* 0x80-0x87 */ + 0x922D,0x922E,0x922F,0x9230,0x9231,0x9232,0x9233,0x9234,/* 0x88-0x8F */ + 0x9235,0x9236,0x9237,0x9238,0x9239,0x923A,0x923B,0x923C,/* 0x90-0x97 */ + 0x923D,0x923E,0x923F,0x9240,0x9241,0x9242,0x9243,0x9244,/* 0x98-0x9F */ + 
0x9245,0x72FB,0x7317,0x7313,0x7321,0x730A,0x731E,0x731D,/* 0xA0-0xA7 */ + 0x7315,0x7322,0x7339,0x7325,0x732C,0x7338,0x7331,0x7350,/* 0xA8-0xAF */ + 0x734D,0x7357,0x7360,0x736C,0x736F,0x737E,0x821B,0x5925,/* 0xB0-0xB7 */ + 0x98E7,0x5924,0x5902,0x9963,0x9967,0x9968,0x9969,0x996A,/* 0xB8-0xBF */ + 0x996B,0x996C,0x9974,0x9977,0x997D,0x9980,0x9984,0x9987,/* 0xC0-0xC7 */ + 0x998A,0x998D,0x9990,0x9991,0x9993,0x9994,0x9995,0x5E80,/* 0xC8-0xCF */ + 0x5E91,0x5E8B,0x5E96,0x5EA5,0x5EA0,0x5EB9,0x5EB5,0x5EBE,/* 0xD0-0xD7 */ + 0x5EB3,0x8D53,0x5ED2,0x5ED1,0x5EDB,0x5EE8,0x5EEA,0x81BA,/* 0xD8-0xDF */ + 0x5FC4,0x5FC9,0x5FD6,0x5FCF,0x6003,0x5FEE,0x6004,0x5FE1,/* 0xE0-0xE7 */ + 0x5FE4,0x5FFE,0x6005,0x6006,0x5FEA,0x5FED,0x5FF8,0x6019,/* 0xE8-0xEF */ + 0x6035,0x6026,0x601B,0x600F,0x600D,0x6029,0x602B,0x600A,/* 0xF0-0xF7 */ + 0x603F,0x6021,0x6078,0x6079,0x607B,0x607A,0x6042,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9246,0x9247,0x9248,0x9249,0x924A,0x924B,0x924C,0x924D,/* 0x40-0x47 */ + 0x924E,0x924F,0x9250,0x9251,0x9252,0x9253,0x9254,0x9255,/* 0x48-0x4F */ + 0x9256,0x9257,0x9258,0x9259,0x925A,0x925B,0x925C,0x925D,/* 0x50-0x57 */ + 0x925E,0x925F,0x9260,0x9261,0x9262,0x9263,0x9264,0x9265,/* 0x58-0x5F */ + 0x9266,0x9267,0x9268,0x9269,0x926A,0x926B,0x926C,0x926D,/* 0x60-0x67 */ + 0x926E,0x926F,0x9270,0x9271,0x9272,0x9273,0x9275,0x9276,/* 0x68-0x6F */ + 0x9277,0x9278,0x9279,0x927A,0x927B,0x927C,0x927D,0x927E,/* 0x70-0x77 */ + 0x927F,0x9280,0x9281,0x9282,0x9283,0x9284,0x9285,0x0000,/* 0x78-0x7F */ + + 0x9286,0x9287,0x9288,0x9289,0x928A,0x928B,0x928C,0x928D,/* 0x80-0x87 */ + 0x928F,0x9290,0x9291,0x9292,0x9293,0x9294,0x9295,0x9296,/* 0x88-0x8F */ + 0x9297,0x9298,0x9299,0x929A,0x929B,0x929C,0x929D,0x929E,/* 0x90-0x97 */ + 0x929F,0x92A0,0x92A1,0x92A2,0x92A3,0x92A4,0x92A5,0x92A6,/* 0x98-0x9F */ + 0x92A7,0x606A,0x607D,0x6096,0x609A,0x60AD,0x609D,0x6083,/* 0xA0-0xA7 */ + 0x6092,0x608C,0x609B,0x60EC,0x60BB,0x60B1,0x60DD,0x60D8,/* 0xA8-0xAF */ + 0x60C6,0x60DA,0x60B4,0x6120,0x6126,0x6115,0x6123,0x60F4,/* 0xB0-0xB7 */ + 0x6100,0x610E,0x612B,0x614A,0x6175,0x61AC,0x6194,0x61A7,/* 0xB8-0xBF */ + 0x61B7,0x61D4,0x61F5,0x5FDD,0x96B3,0x95E9,0x95EB,0x95F1,/* 0xC0-0xC7 */ + 0x95F3,0x95F5,0x95F6,0x95FC,0x95FE,0x9603,0x9604,0x9606,/* 0xC8-0xCF */ + 0x9608,0x960A,0x960B,0x960C,0x960D,0x960F,0x9612,0x9615,/* 0xD0-0xD7 */ + 0x9616,0x9617,0x9619,0x961A,0x4E2C,0x723F,0x6215,0x6C35,/* 0xD8-0xDF */ + 0x6C54,0x6C5C,0x6C4A,0x6CA3,0x6C85,0x6C90,0x6C94,0x6C8C,/* 0xE0-0xE7 */ + 0x6C68,0x6C69,0x6C74,0x6C76,0x6C86,0x6CA9,0x6CD0,0x6CD4,/* 0xE8-0xEF */ + 0x6CAD,0x6CF7,0x6CF8,0x6CF1,0x6CD7,0x6CB2,0x6CE0,0x6CD6,/* 0xF0-0xF7 */ + 0x6CFA,0x6CEB,0x6CEE,0x6CB1,0x6CD3,0x6CEF,0x6CFE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x92A8,0x92A9,0x92AA,0x92AB,0x92AC,0x92AD,0x92AF,0x92B0,/* 0x40-0x47 */ + 0x92B1,0x92B2,0x92B3,0x92B4,0x92B5,0x92B6,0x92B7,0x92B8,/* 0x48-0x4F */ + 0x92B9,0x92BA,0x92BB,0x92BC,0x92BD,0x92BE,0x92BF,0x92C0,/* 0x50-0x57 */ + 0x92C1,0x92C2,0x92C3,0x92C4,0x92C5,0x92C6,0x92C7,0x92C9,/* 0x58-0x5F */ + 0x92CA,0x92CB,0x92CC,0x92CD,0x92CE,0x92CF,0x92D0,0x92D1,/* 0x60-0x67 */ + 0x92D2,0x92D3,0x92D4,0x92D5,0x92D6,0x92D7,0x92D8,0x92D9,/* 0x68-0x6F */ + 0x92DA,0x92DB,0x92DC,0x92DD,0x92DE,0x92DF,0x92E0,0x92E1,/* 0x70-0x77 */ + 0x92E2,0x92E3,0x92E4,0x92E5,0x92E6,0x92E7,0x92E8,0x0000,/* 0x78-0x7F */ + + 0x92E9,0x92EA,0x92EB,0x92EC,0x92ED,0x92EE,0x92EF,0x92F0,/* 0x80-0x87 */ + 0x92F1,0x92F2,0x92F3,0x92F4,0x92F5,0x92F6,0x92F7,0x92F8,/* 0x88-0x8F */ + 0x92F9,0x92FA,0x92FB,0x92FC,0x92FD,0x92FE,0x92FF,0x9300,/* 0x90-0x97 */ + 0x9301,0x9302,0x9303,0x9304,0x9305,0x9306,0x9307,0x9308,/* 0x98-0x9F */ + 0x9309,0x6D39,0x6D27,0x6D0C,0x6D43,0x6D48,0x6D07,0x6D04,/* 0xA0-0xA7 */ + 0x6D19,0x6D0E,0x6D2B,0x6D4D,0x6D2E,0x6D35,0x6D1A,0x6D4F,/* 0xA8-0xAF */ + 0x6D52,0x6D54,0x6D33,0x6D91,0x6D6F,0x6D9E,0x6DA0,0x6D5E,/* 0xB0-0xB7 */ + 0x6D93,0x6D94,0x6D5C,0x6D60,0x6D7C,0x6D63,0x6E1A,0x6DC7,/* 0xB8-0xBF */ + 0x6DC5,0x6DDE,0x6E0E,0x6DBF,0x6DE0,0x6E11,0x6DE6,0x6DDD,/* 0xC0-0xC7 */ + 0x6DD9,0x6E16,0x6DAB,0x6E0C,0x6DAE,0x6E2B,0x6E6E,0x6E4E,/* 0xC8-0xCF */ + 0x6E6B,0x6EB2,0x6E5F,0x6E86,0x6E53,0x6E54,0x6E32,0x6E25,/* 0xD0-0xD7 */ + 0x6E44,0x6EDF,0x6EB1,0x6E98,0x6EE0,0x6F2D,0x6EE2,0x6EA5,/* 0xD8-0xDF */ + 0x6EA7,0x6EBD,0x6EBB,0x6EB7,0x6ED7,0x6EB4,0x6ECF,0x6E8F,/* 0xE0-0xE7 */ + 0x6EC2,0x6E9F,0x6F62,0x6F46,0x6F47,0x6F24,0x6F15,0x6EF9,/* 0xE8-0xEF */ + 0x6F2F,0x6F36,0x6F4B,0x6F74,0x6F2A,0x6F09,0x6F29,0x6F89,/* 0xF0-0xF7 */ + 0x6F8D,0x6F8C,0x6F78,0x6F72,0x6F7C,0x6F7A,0x6FD1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x930A,0x930B,0x930C,0x930D,0x930E,0x930F,0x9310,0x9311,/* 0x40-0x47 */ + 0x9312,0x9313,0x9314,0x9315,0x9316,0x9317,0x9318,0x9319,/* 0x48-0x4F */ + 0x931A,0x931B,0x931C,0x931D,0x931E,0x931F,0x9320,0x9321,/* 0x50-0x57 */ + 0x9322,0x9323,0x9324,0x9325,0x9326,0x9327,0x9328,0x9329,/* 0x58-0x5F */ + 0x932A,0x932B,0x932C,0x932D,0x932E,0x932F,0x9330,0x9331,/* 0x60-0x67 */ + 0x9332,0x9333,0x9334,0x9335,0x9336,0x9337,0x9338,0x9339,/* 0x68-0x6F */ + 0x933A,0x933B,0x933C,0x933D,0x933F,0x9340,0x9341,0x9342,/* 0x70-0x77 */ + 0x9343,0x9344,0x9345,0x9346,0x9347,0x9348,0x9349,0x0000,/* 0x78-0x7F */ + + 0x934A,0x934B,0x934C,0x934D,0x934E,0x934F,0x9350,0x9351,/* 0x80-0x87 */ + 0x9352,0x9353,0x9354,0x9355,0x9356,0x9357,0x9358,0x9359,/* 0x88-0x8F */ + 
0x935A,0x935B,0x935C,0x935D,0x935E,0x935F,0x9360,0x9361,/* 0x90-0x97 */ + 0x9362,0x9363,0x9364,0x9365,0x9366,0x9367,0x9368,0x9369,/* 0x98-0x9F */ + 0x936B,0x6FC9,0x6FA7,0x6FB9,0x6FB6,0x6FC2,0x6FE1,0x6FEE,/* 0xA0-0xA7 */ + 0x6FDE,0x6FE0,0x6FEF,0x701A,0x7023,0x701B,0x7039,0x7035,/* 0xA8-0xAF */ + 0x704F,0x705E,0x5B80,0x5B84,0x5B95,0x5B93,0x5BA5,0x5BB8,/* 0xB0-0xB7 */ + 0x752F,0x9A9E,0x6434,0x5BE4,0x5BEE,0x8930,0x5BF0,0x8E47,/* 0xB8-0xBF */ + 0x8B07,0x8FB6,0x8FD3,0x8FD5,0x8FE5,0x8FEE,0x8FE4,0x8FE9,/* 0xC0-0xC7 */ + 0x8FE6,0x8FF3,0x8FE8,0x9005,0x9004,0x900B,0x9026,0x9011,/* 0xC8-0xCF */ + 0x900D,0x9016,0x9021,0x9035,0x9036,0x902D,0x902F,0x9044,/* 0xD0-0xD7 */ + 0x9051,0x9052,0x9050,0x9068,0x9058,0x9062,0x905B,0x66B9,/* 0xD8-0xDF */ + 0x9074,0x907D,0x9082,0x9088,0x9083,0x908B,0x5F50,0x5F57,/* 0xE0-0xE7 */ + 0x5F56,0x5F58,0x5C3B,0x54AB,0x5C50,0x5C59,0x5B71,0x5C63,/* 0xE8-0xEF */ + 0x5C66,0x7FBC,0x5F2A,0x5F29,0x5F2D,0x8274,0x5F3C,0x9B3B,/* 0xF0-0xF7 */ + 0x5C6E,0x5981,0x5983,0x598D,0x59A9,0x59AA,0x59A3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x936C,0x936D,0x936E,0x936F,0x9370,0x9371,0x9372,0x9373,/* 0x40-0x47 */ + 0x9374,0x9375,0x9376,0x9377,0x9378,0x9379,0x937A,0x937B,/* 0x48-0x4F */ + 0x937C,0x937D,0x937E,0x937F,0x9380,0x9381,0x9382,0x9383,/* 0x50-0x57 */ + 0x9384,0x9385,0x9386,0x9387,0x9388,0x9389,0x938A,0x938B,/* 0x58-0x5F */ + 0x938C,0x938D,0x938E,0x9390,0x9391,0x9392,0x9393,0x9394,/* 0x60-0x67 */ + 0x9395,0x9396,0x9397,0x9398,0x9399,0x939A,0x939B,0x939C,/* 0x68-0x6F */ + 0x939D,0x939E,0x939F,0x93A0,0x93A1,0x93A2,0x93A3,0x93A4,/* 0x70-0x77 */ + 0x93A5,0x93A6,0x93A7,0x93A8,0x93A9,0x93AA,0x93AB,0x0000,/* 0x78-0x7F */ + + 0x93AC,0x93AD,0x93AE,0x93AF,0x93B0,0x93B1,0x93B2,0x93B3,/* 0x80-0x87 */ + 0x93B4,0x93B5,0x93B6,0x93B7,0x93B8,0x93B9,0x93BA,0x93BB,/* 0x88-0x8F */ + 0x93BC,0x93BD,0x93BE,0x93BF,0x93C0,0x93C1,0x93C2,0x93C3,/* 0x90-0x97 */ + 0x93C4,0x93C5,0x93C6,0x93C7,0x93C8,0x93C9,0x93CB,0x93CC,/* 0x98-0x9F */ + 0x93CD,0x5997,0x59CA,0x59AB,0x599E,0x59A4,0x59D2,0x59B2,/* 0xA0-0xA7 */ + 0x59AF,0x59D7,0x59BE,0x5A05,0x5A06,0x59DD,0x5A08,0x59E3,/* 0xA8-0xAF */ + 0x59D8,0x59F9,0x5A0C,0x5A09,0x5A32,0x5A34,0x5A11,0x5A23,/* 0xB0-0xB7 */ + 0x5A13,0x5A40,0x5A67,0x5A4A,0x5A55,0x5A3C,0x5A62,0x5A75,/* 0xB8-0xBF */ + 0x80EC,0x5AAA,0x5A9B,0x5A77,0x5A7A,0x5ABE,0x5AEB,0x5AB2,/* 0xC0-0xC7 */ + 0x5AD2,0x5AD4,0x5AB8,0x5AE0,0x5AE3,0x5AF1,0x5AD6,0x5AE6,/* 0xC8-0xCF */ + 0x5AD8,0x5ADC,0x5B09,0x5B17,0x5B16,0x5B32,0x5B37,0x5B40,/* 0xD0-0xD7 */ + 0x5C15,0x5C1C,0x5B5A,0x5B65,0x5B73,0x5B51,0x5B53,0x5B62,/* 0xD8-0xDF */ + 0x9A75,0x9A77,0x9A78,0x9A7A,0x9A7F,0x9A7D,0x9A80,0x9A81,/* 0xE0-0xE7 */ + 0x9A85,0x9A88,0x9A8A,0x9A90,0x9A92,0x9A93,0x9A96,0x9A98,/* 0xE8-0xEF */ + 0x9A9B,0x9A9C,0x9A9D,0x9A9F,0x9AA0,0x9AA2,0x9AA3,0x9AA5,/* 0xF0-0xF7 */ + 0x9AA7,0x7E9F,0x7EA1,0x7EA3,0x7EA5,0x7EA8,0x7EA9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x93CE,0x93CF,0x93D0,0x93D1,0x93D2,0x93D3,0x93D4,0x93D5,/* 0x40-0x47 */ + 0x93D7,0x93D8,0x93D9,0x93DA,0x93DB,0x93DC,0x93DD,0x93DE,/* 0x48-0x4F */ + 0x93DF,0x93E0,0x93E1,0x93E2,0x93E3,0x93E4,0x93E5,0x93E6,/* 0x50-0x57 */ + 0x93E7,0x93E8,0x93E9,0x93EA,0x93EB,0x93EC,0x93ED,0x93EE,/* 0x58-0x5F */ + 0x93EF,0x93F0,0x93F1,0x93F2,0x93F3,0x93F4,0x93F5,0x93F6,/* 0x60-0x67 */ + 0x93F7,0x93F8,0x93F9,0x93FA,0x93FB,0x93FC,0x93FD,0x93FE,/* 0x68-0x6F */ + 0x93FF,0x9400,0x9401,0x9402,0x9403,0x9404,0x9405,0x9406,/* 0x70-0x77 */ + 0x9407,0x9408,0x9409,0x940A,0x940B,0x940C,0x940D,0x0000,/* 0x78-0x7F */ + + 0x940E,0x940F,0x9410,0x9411,0x9412,0x9413,0x9414,0x9415,/* 0x80-0x87 */ + 0x9416,0x9417,0x9418,0x9419,0x941A,0x941B,0x941C,0x941D,/* 0x88-0x8F */ + 0x941E,0x941F,0x9420,0x9421,0x9422,0x9423,0x9424,0x9425,/* 0x90-0x97 */ + 0x9426,0x9427,0x9428,0x9429,0x942A,0x942B,0x942C,0x942D,/* 0x98-0x9F */ + 0x942E,0x7EAD,0x7EB0,0x7EBE,0x7EC0,0x7EC1,0x7EC2,0x7EC9,/* 0xA0-0xA7 */ + 0x7ECB,0x7ECC,0x7ED0,0x7ED4,0x7ED7,0x7EDB,0x7EE0,0x7EE1,/* 0xA8-0xAF */ + 0x7EE8,0x7EEB,0x7EEE,0x7EEF,0x7EF1,0x7EF2,0x7F0D,0x7EF6,/* 0xB0-0xB7 */ + 0x7EFA,0x7EFB,0x7EFE,0x7F01,0x7F02,0x7F03,0x7F07,0x7F08,/* 0xB8-0xBF */ + 0x7F0B,0x7F0C,0x7F0F,0x7F11,0x7F12,0x7F17,0x7F19,0x7F1C,/* 0xC0-0xC7 */ + 0x7F1B,0x7F1F,0x7F21,0x7F22,0x7F23,0x7F24,0x7F25,0x7F26,/* 0xC8-0xCF */ + 0x7F27,0x7F2A,0x7F2B,0x7F2C,0x7F2D,0x7F2F,0x7F30,0x7F31,/* 0xD0-0xD7 */ + 0x7F32,0x7F33,0x7F35,0x5E7A,0x757F,0x5DDB,0x753E,0x9095,/* 0xD8-0xDF */ + 0x738E,0x7391,0x73AE,0x73A2,0x739F,0x73CF,0x73C2,0x73D1,/* 0xE0-0xE7 */ + 0x73B7,0x73B3,0x73C0,0x73C9,0x73C8,0x73E5,0x73D9,0x987C,/* 0xE8-0xEF */ + 0x740A,0x73E9,0x73E7,0x73DE,0x73BA,0x73F2,0x740F,0x742A,/* 0xF0-0xF7 */ + 0x745B,0x7426,0x7425,0x7428,0x7430,0x742E,0x742C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x942F,0x9430,0x9431,0x9432,0x9433,0x9434,0x9435,0x9436,/* 0x40-0x47 */ + 0x9437,0x9438,0x9439,0x943A,0x943B,0x943C,0x943D,0x943F,/* 0x48-0x4F */ + 0x9440,0x9441,0x9442,0x9443,0x9444,0x9445,0x9446,0x9447,/* 0x50-0x57 */ + 0x9448,0x9449,0x944A,0x944B,0x944C,0x944D,0x944E,0x944F,/* 0x58-0x5F */ + 0x9450,0x9451,0x9452,0x9453,0x9454,0x9455,0x9456,0x9457,/* 0x60-0x67 */ + 0x9458,0x9459,0x945A,0x945B,0x945C,0x945D,0x945E,0x945F,/* 0x68-0x6F */ + 0x9460,0x9461,0x9462,0x9463,0x9464,0x9465,0x9466,0x9467,/* 0x70-0x77 */ + 0x9468,0x9469,0x946A,0x946C,0x946D,0x946E,0x946F,0x0000,/* 0x78-0x7F */ + + 
0x9470,0x9471,0x9472,0x9473,0x9474,0x9475,0x9476,0x9477,/* 0x80-0x87 */ + 0x9478,0x9479,0x947A,0x947B,0x947C,0x947D,0x947E,0x947F,/* 0x88-0x8F */ + 0x9480,0x9481,0x9482,0x9483,0x9484,0x9491,0x9496,0x9498,/* 0x90-0x97 */ + 0x94C7,0x94CF,0x94D3,0x94D4,0x94DA,0x94E6,0x94FB,0x951C,/* 0x98-0x9F */ + 0x9520,0x741B,0x741A,0x7441,0x745C,0x7457,0x7455,0x7459,/* 0xA0-0xA7 */ + 0x7477,0x746D,0x747E,0x749C,0x748E,0x7480,0x7481,0x7487,/* 0xA8-0xAF */ + 0x748B,0x749E,0x74A8,0x74A9,0x7490,0x74A7,0x74D2,0x74BA,/* 0xB0-0xB7 */ + 0x97EA,0x97EB,0x97EC,0x674C,0x6753,0x675E,0x6748,0x6769,/* 0xB8-0xBF */ + 0x67A5,0x6787,0x676A,0x6773,0x6798,0x67A7,0x6775,0x67A8,/* 0xC0-0xC7 */ + 0x679E,0x67AD,0x678B,0x6777,0x677C,0x67F0,0x6809,0x67D8,/* 0xC8-0xCF */ + 0x680A,0x67E9,0x67B0,0x680C,0x67D9,0x67B5,0x67DA,0x67B3,/* 0xD0-0xD7 */ + 0x67DD,0x6800,0x67C3,0x67B8,0x67E2,0x680E,0x67C1,0x67FD,/* 0xD8-0xDF */ + 0x6832,0x6833,0x6860,0x6861,0x684E,0x6862,0x6844,0x6864,/* 0xE0-0xE7 */ + 0x6883,0x681D,0x6855,0x6866,0x6841,0x6867,0x6840,0x683E,/* 0xE8-0xEF */ + 0x684A,0x6849,0x6829,0x68B5,0x688F,0x6874,0x6877,0x6893,/* 0xF0-0xF7 */ + 0x686B,0x68C2,0x696E,0x68FC,0x691F,0x6920,0x68F9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9527,0x9533,0x953D,0x9543,0x9548,0x954B,0x9555,0x955A,/* 0x40-0x47 */ + 0x9560,0x956E,0x9574,0x9575,0x9577,0x9578,0x9579,0x957A,/* 0x48-0x4F */ + 0x957B,0x957C,0x957D,0x957E,0x9580,0x9581,0x9582,0x9583,/* 0x50-0x57 */ + 0x9584,0x9585,0x9586,0x9587,0x9588,0x9589,0x958A,0x958B,/* 0x58-0x5F */ + 0x958C,0x958D,0x958E,0x958F,0x9590,0x9591,0x9592,0x9593,/* 0x60-0x67 */ + 0x9594,0x9595,0x9596,0x9597,0x9598,0x9599,0x959A,0x959B,/* 0x68-0x6F */ + 0x959C,0x959D,0x959E,0x959F,0x95A0,0x95A1,0x95A2,0x95A3,/* 0x70-0x77 */ + 0x95A4,0x95A5,0x95A6,0x95A7,0x95A8,0x95A9,0x95AA,0x0000,/* 0x78-0x7F */ + + 0x95AB,0x95AC,0x95AD,0x95AE,0x95AF,0x95B0,0x95B1,0x95B2,/* 0x80-0x87 */ + 0x95B3,0x95B4,0x95B5,0x95B6,0x95B7,0x95B8,0x95B9,0x95BA,/* 0x88-0x8F */ + 0x95BB,0x95BC,0x95BD,0x95BE,0x95BF,0x95C0,0x95C1,0x95C2,/* 0x90-0x97 */ + 0x95C3,0x95C4,0x95C5,0x95C6,0x95C7,0x95C8,0x95C9,0x95CA,/* 0x98-0x9F */ + 0x95CB,0x6924,0x68F0,0x690B,0x6901,0x6957,0x68E3,0x6910,/* 0xA0-0xA7 */ + 0x6971,0x6939,0x6960,0x6942,0x695D,0x6984,0x696B,0x6980,/* 0xA8-0xAF */ + 0x6998,0x6978,0x6934,0x69CC,0x6987,0x6988,0x69CE,0x6989,/* 0xB0-0xB7 */ + 0x6966,0x6963,0x6979,0x699B,0x69A7,0x69BB,0x69AB,0x69AD,/* 0xB8-0xBF */ + 0x69D4,0x69B1,0x69C1,0x69CA,0x69DF,0x6995,0x69E0,0x698D,/* 0xC0-0xC7 */ + 0x69FF,0x6A2F,0x69ED,0x6A17,0x6A18,0x6A65,0x69F2,0x6A44,/* 0xC8-0xCF */ + 0x6A3E,0x6AA0,0x6A50,0x6A5B,0x6A35,0x6A8E,0x6A79,0x6A3D,/* 0xD0-0xD7 */ + 0x6A28,0x6A58,0x6A7C,0x6A91,0x6A90,0x6AA9,0x6A97,0x6AAB,/* 0xD8-0xDF */ + 0x7337,0x7352,0x6B81,0x6B82,0x6B87,0x6B84,0x6B92,0x6B93,/* 0xE0-0xE7 */ + 0x6B8D,0x6B9A,0x6B9B,0x6BA1,0x6BAA,0x8F6B,0x8F6D,0x8F71,/* 0xE8-0xEF */ + 0x8F72,0x8F73,0x8F75,0x8F76,0x8F78,0x8F77,0x8F79,0x8F7A,/* 0xF0-0xF7 */ + 
0x8F7C,0x8F7E,0x8F81,0x8F82,0x8F84,0x8F87,0x8F8B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x95CC,0x95CD,0x95CE,0x95CF,0x95D0,0x95D1,0x95D2,0x95D3,/* 0x40-0x47 */ + 0x95D4,0x95D5,0x95D6,0x95D7,0x95D8,0x95D9,0x95DA,0x95DB,/* 0x48-0x4F */ + 0x95DC,0x95DD,0x95DE,0x95DF,0x95E0,0x95E1,0x95E2,0x95E3,/* 0x50-0x57 */ + 0x95E4,0x95E5,0x95E6,0x95E7,0x95EC,0x95FF,0x9607,0x9613,/* 0x58-0x5F */ + 0x9618,0x961B,0x961E,0x9620,0x9623,0x9624,0x9625,0x9626,/* 0x60-0x67 */ + 0x9627,0x9628,0x9629,0x962B,0x962C,0x962D,0x962F,0x9630,/* 0x68-0x6F */ + 0x9637,0x9638,0x9639,0x963A,0x963E,0x9641,0x9643,0x964A,/* 0x70-0x77 */ + 0x964E,0x964F,0x9651,0x9652,0x9653,0x9656,0x9657,0x0000,/* 0x78-0x7F */ + + 0x9658,0x9659,0x965A,0x965C,0x965D,0x965E,0x9660,0x9663,/* 0x80-0x87 */ + 0x9665,0x9666,0x966B,0x966D,0x966E,0x966F,0x9670,0x9671,/* 0x88-0x8F */ + 0x9673,0x9678,0x9679,0x967A,0x967B,0x967C,0x967D,0x967E,/* 0x90-0x97 */ + 0x967F,0x9680,0x9681,0x9682,0x9683,0x9684,0x9687,0x9689,/* 0x98-0x9F */ + 0x968A,0x8F8D,0x8F8E,0x8F8F,0x8F98,0x8F9A,0x8ECE,0x620B,/* 0xA0-0xA7 */ + 0x6217,0x621B,0x621F,0x6222,0x6221,0x6225,0x6224,0x622C,/* 0xA8-0xAF */ + 0x81E7,0x74EF,0x74F4,0x74FF,0x750F,0x7511,0x7513,0x6534,/* 0xB0-0xB7 */ + 0x65EE,0x65EF,0x65F0,0x660A,0x6619,0x6772,0x6603,0x6615,/* 0xB8-0xBF */ + 0x6600,0x7085,0x66F7,0x661D,0x6634,0x6631,0x6636,0x6635,/* 0xC0-0xC7 */ + 0x8006,0x665F,0x6654,0x6641,0x664F,0x6656,0x6661,0x6657,/* 0xC8-0xCF */ + 0x6677,0x6684,0x668C,0x66A7,0x669D,0x66BE,0x66DB,0x66DC,/* 0xD0-0xD7 */ + 0x66E6,0x66E9,0x8D32,0x8D33,0x8D36,0x8D3B,0x8D3D,0x8D40,/* 0xD8-0xDF */ + 0x8D45,0x8D46,0x8D48,0x8D49,0x8D47,0x8D4D,0x8D55,0x8D59,/* 0xE0-0xE7 */ + 0x89C7,0x89CA,0x89CB,0x89CC,0x89CE,0x89CF,0x89D0,0x89D1,/* 0xE8-0xEF */ + 0x726E,0x729F,0x725D,0x7266,0x726F,0x727E,0x727F,0x7284,/* 0xF0-0xF7 */ + 0x728B,0x728D,0x728F,0x7292,0x6308,0x6332,0x63B0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x968C,0x968E,0x9691,0x9692,0x9693,0x9695,0x9696,0x969A,/* 0x40-0x47 */ + 0x969B,0x969D,0x969E,0x969F,0x96A0,0x96A1,0x96A2,0x96A3,/* 0x48-0x4F */ + 0x96A4,0x96A5,0x96A6,0x96A8,0x96A9,0x96AA,0x96AB,0x96AC,/* 0x50-0x57 */ + 0x96AD,0x96AE,0x96AF,0x96B1,0x96B2,0x96B4,0x96B5,0x96B7,/* 0x58-0x5F */ + 0x96B8,0x96BA,0x96BB,0x96BF,0x96C2,0x96C3,0x96C8,0x96CA,/* 0x60-0x67 */ + 0x96CB,0x96D0,0x96D1,0x96D3,0x96D4,0x96D6,0x96D7,0x96D8,/* 
0x68-0x6F */ + 0x96D9,0x96DA,0x96DB,0x96DC,0x96DD,0x96DE,0x96DF,0x96E1,/* 0x70-0x77 */ + 0x96E2,0x96E3,0x96E4,0x96E5,0x96E6,0x96E7,0x96EB,0x0000,/* 0x78-0x7F */ + + 0x96EC,0x96ED,0x96EE,0x96F0,0x96F1,0x96F2,0x96F4,0x96F5,/* 0x80-0x87 */ + 0x96F8,0x96FA,0x96FB,0x96FC,0x96FD,0x96FF,0x9702,0x9703,/* 0x88-0x8F */ + 0x9705,0x970A,0x970B,0x970C,0x9710,0x9711,0x9712,0x9714,/* 0x90-0x97 */ + 0x9715,0x9717,0x9718,0x9719,0x971A,0x971B,0x971D,0x971F,/* 0x98-0x9F */ + 0x9720,0x643F,0x64D8,0x8004,0x6BEA,0x6BF3,0x6BFD,0x6BF5,/* 0xA0-0xA7 */ + 0x6BF9,0x6C05,0x6C07,0x6C06,0x6C0D,0x6C15,0x6C18,0x6C19,/* 0xA8-0xAF */ + 0x6C1A,0x6C21,0x6C29,0x6C24,0x6C2A,0x6C32,0x6535,0x6555,/* 0xB0-0xB7 */ + 0x656B,0x724D,0x7252,0x7256,0x7230,0x8662,0x5216,0x809F,/* 0xB8-0xBF */ + 0x809C,0x8093,0x80BC,0x670A,0x80BD,0x80B1,0x80AB,0x80AD,/* 0xC0-0xC7 */ + 0x80B4,0x80B7,0x80E7,0x80E8,0x80E9,0x80EA,0x80DB,0x80C2,/* 0xC8-0xCF */ + 0x80C4,0x80D9,0x80CD,0x80D7,0x6710,0x80DD,0x80EB,0x80F1,/* 0xD0-0xD7 */ + 0x80F4,0x80ED,0x810D,0x810E,0x80F2,0x80FC,0x6715,0x8112,/* 0xD8-0xDF */ + 0x8C5A,0x8136,0x811E,0x812C,0x8118,0x8132,0x8148,0x814C,/* 0xE0-0xE7 */ + 0x8153,0x8174,0x8159,0x815A,0x8171,0x8160,0x8169,0x817C,/* 0xE8-0xEF */ + 0x817D,0x816D,0x8167,0x584D,0x5AB5,0x8188,0x8182,0x8191,/* 0xF0-0xF7 */ + 0x6ED5,0x81A3,0x81AA,0x81CC,0x6726,0x81CA,0x81BB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9721,0x9722,0x9723,0x9724,0x9725,0x9726,0x9727,0x9728,/* 0x40-0x47 */ + 0x9729,0x972B,0x972C,0x972E,0x972F,0x9731,0x9733,0x9734,/* 0x48-0x4F */ + 0x9735,0x9736,0x9737,0x973A,0x973B,0x973C,0x973D,0x973F,/* 0x50-0x57 */ + 0x9740,0x9741,0x9742,0x9743,0x9744,0x9745,0x9746,0x9747,/* 0x58-0x5F */ + 0x9748,0x9749,0x974A,0x974B,0x974C,0x974D,0x974E,0x974F,/* 0x60-0x67 */ + 0x9750,0x9751,0x9754,0x9755,0x9757,0x9758,0x975A,0x975C,/* 0x68-0x6F */ + 0x975D,0x975F,0x9763,0x9764,0x9766,0x9767,0x9768,0x976A,/* 0x70-0x77 */ + 0x976B,0x976C,0x976D,0x976E,0x976F,0x9770,0x9771,0x0000,/* 0x78-0x7F */ + + 0x9772,0x9775,0x9777,0x9778,0x9779,0x977A,0x977B,0x977D,/* 0x80-0x87 */ + 0x977E,0x977F,0x9780,0x9781,0x9782,0x9783,0x9784,0x9786,/* 0x88-0x8F */ + 0x9787,0x9788,0x9789,0x978A,0x978C,0x978E,0x978F,0x9790,/* 0x90-0x97 */ + 0x9793,0x9795,0x9796,0x9797,0x9799,0x979A,0x979B,0x979C,/* 0x98-0x9F */ + 0x979D,0x81C1,0x81A6,0x6B24,0x6B37,0x6B39,0x6B43,0x6B46,/* 0xA0-0xA7 */ + 0x6B59,0x98D1,0x98D2,0x98D3,0x98D5,0x98D9,0x98DA,0x6BB3,/* 0xA8-0xAF */ + 0x5F40,0x6BC2,0x89F3,0x6590,0x9F51,0x6593,0x65BC,0x65C6,/* 0xB0-0xB7 */ + 0x65C4,0x65C3,0x65CC,0x65CE,0x65D2,0x65D6,0x7080,0x709C,/* 0xB8-0xBF */ + 0x7096,0x709D,0x70BB,0x70C0,0x70B7,0x70AB,0x70B1,0x70E8,/* 0xC0-0xC7 */ + 0x70CA,0x7110,0x7113,0x7116,0x712F,0x7131,0x7173,0x715C,/* 0xC8-0xCF */ + 0x7168,0x7145,0x7172,0x714A,0x7178,0x717A,0x7198,0x71B3,/* 0xD0-0xD7 */ + 0x71B5,0x71A8,0x71A0,0x71E0,0x71D4,0x71E7,0x71F9,0x721D,/* 0xD8-0xDF */ + 0x7228,0x706C,0x7118,0x7166,0x71B9,0x623E,0x623D,0x6243,/* 0xE0-0xE7 */ + 
0x6248,0x6249,0x793B,0x7940,0x7946,0x7949,0x795B,0x795C,/* 0xE8-0xEF */ + 0x7953,0x795A,0x7962,0x7957,0x7960,0x796F,0x7967,0x797A,/* 0xF0-0xF7 */ + 0x7985,0x798A,0x799A,0x79A7,0x79B3,0x5FD1,0x5FD0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x979E,0x979F,0x97A1,0x97A2,0x97A4,0x97A5,0x97A6,0x97A7,/* 0x40-0x47 */ + 0x97A8,0x97A9,0x97AA,0x97AC,0x97AE,0x97B0,0x97B1,0x97B3,/* 0x48-0x4F */ + 0x97B5,0x97B6,0x97B7,0x97B8,0x97B9,0x97BA,0x97BB,0x97BC,/* 0x50-0x57 */ + 0x97BD,0x97BE,0x97BF,0x97C0,0x97C1,0x97C2,0x97C3,0x97C4,/* 0x58-0x5F */ + 0x97C5,0x97C6,0x97C7,0x97C8,0x97C9,0x97CA,0x97CB,0x97CC,/* 0x60-0x67 */ + 0x97CD,0x97CE,0x97CF,0x97D0,0x97D1,0x97D2,0x97D3,0x97D4,/* 0x68-0x6F */ + 0x97D5,0x97D6,0x97D7,0x97D8,0x97D9,0x97DA,0x97DB,0x97DC,/* 0x70-0x77 */ + 0x97DD,0x97DE,0x97DF,0x97E0,0x97E1,0x97E2,0x97E3,0x0000,/* 0x78-0x7F */ + + 0x97E4,0x97E5,0x97E8,0x97EE,0x97EF,0x97F0,0x97F1,0x97F2,/* 0x80-0x87 */ + 0x97F4,0x97F7,0x97F8,0x97F9,0x97FA,0x97FB,0x97FC,0x97FD,/* 0x88-0x8F */ + 0x97FE,0x97FF,0x9800,0x9801,0x9802,0x9803,0x9804,0x9805,/* 0x90-0x97 */ + 0x9806,0x9807,0x9808,0x9809,0x980A,0x980B,0x980C,0x980D,/* 0x98-0x9F */ + 0x980E,0x603C,0x605D,0x605A,0x6067,0x6041,0x6059,0x6063,/* 0xA0-0xA7 */ + 0x60AB,0x6106,0x610D,0x615D,0x61A9,0x619D,0x61CB,0x61D1,/* 0xA8-0xAF */ + 0x6206,0x8080,0x807F,0x6C93,0x6CF6,0x6DFC,0x77F6,0x77F8,/* 0xB0-0xB7 */ + 0x7800,0x7809,0x7817,0x7818,0x7811,0x65AB,0x782D,0x781C,/* 0xB8-0xBF */ + 0x781D,0x7839,0x783A,0x783B,0x781F,0x783C,0x7825,0x782C,/* 0xC0-0xC7 */ + 0x7823,0x7829,0x784E,0x786D,0x7856,0x7857,0x7826,0x7850,/* 0xC8-0xCF */ + 0x7847,0x784C,0x786A,0x789B,0x7893,0x789A,0x7887,0x789C,/* 0xD0-0xD7 */ + 0x78A1,0x78A3,0x78B2,0x78B9,0x78A5,0x78D4,0x78D9,0x78C9,/* 0xD8-0xDF */ + 0x78EC,0x78F2,0x7905,0x78F4,0x7913,0x7924,0x791E,0x7934,/* 0xE0-0xE7 */ + 0x9F9B,0x9EF9,0x9EFB,0x9EFC,0x76F1,0x7704,0x770D,0x76F9,/* 0xE8-0xEF */ + 0x7707,0x7708,0x771A,0x7722,0x7719,0x772D,0x7726,0x7735,/* 0xF0-0xF7 */ + 0x7738,0x7750,0x7751,0x7747,0x7743,0x775A,0x7768,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x980F,0x9810,0x9811,0x9812,0x9813,0x9814,0x9815,0x9816,/* 0x40-0x47 */ + 0x9817,0x9818,0x9819,0x981A,0x981B,0x981C,0x981D,0x981E,/* 0x48-0x4F */ + 0x981F,0x9820,0x9821,0x9822,0x9823,0x9824,0x9825,0x9826,/* 0x50-0x57 */ + 0x9827,0x9828,0x9829,0x982A,0x982B,0x982C,0x982D,0x982E,/* 
0x58-0x5F */ + 0x982F,0x9830,0x9831,0x9832,0x9833,0x9834,0x9835,0x9836,/* 0x60-0x67 */ + 0x9837,0x9838,0x9839,0x983A,0x983B,0x983C,0x983D,0x983E,/* 0x68-0x6F */ + 0x983F,0x9840,0x9841,0x9842,0x9843,0x9844,0x9845,0x9846,/* 0x70-0x77 */ + 0x9847,0x9848,0x9849,0x984A,0x984B,0x984C,0x984D,0x0000,/* 0x78-0x7F */ + + 0x984E,0x984F,0x9850,0x9851,0x9852,0x9853,0x9854,0x9855,/* 0x80-0x87 */ + 0x9856,0x9857,0x9858,0x9859,0x985A,0x985B,0x985C,0x985D,/* 0x88-0x8F */ + 0x985E,0x985F,0x9860,0x9861,0x9862,0x9863,0x9864,0x9865,/* 0x90-0x97 */ + 0x9866,0x9867,0x9868,0x9869,0x986A,0x986B,0x986C,0x986D,/* 0x98-0x9F */ + 0x986E,0x7762,0x7765,0x777F,0x778D,0x777D,0x7780,0x778C,/* 0xA0-0xA7 */ + 0x7791,0x779F,0x77A0,0x77B0,0x77B5,0x77BD,0x753A,0x7540,/* 0xA8-0xAF */ + 0x754E,0x754B,0x7548,0x755B,0x7572,0x7579,0x7583,0x7F58,/* 0xB0-0xB7 */ + 0x7F61,0x7F5F,0x8A48,0x7F68,0x7F74,0x7F71,0x7F79,0x7F81,/* 0xB8-0xBF */ + 0x7F7E,0x76CD,0x76E5,0x8832,0x9485,0x9486,0x9487,0x948B,/* 0xC0-0xC7 */ + 0x948A,0x948C,0x948D,0x948F,0x9490,0x9494,0x9497,0x9495,/* 0xC8-0xCF */ + 0x949A,0x949B,0x949C,0x94A3,0x94A4,0x94AB,0x94AA,0x94AD,/* 0xD0-0xD7 */ + 0x94AC,0x94AF,0x94B0,0x94B2,0x94B4,0x94B6,0x94B7,0x94B8,/* 0xD8-0xDF */ + 0x94B9,0x94BA,0x94BC,0x94BD,0x94BF,0x94C4,0x94C8,0x94C9,/* 0xE0-0xE7 */ + 0x94CA,0x94CB,0x94CC,0x94CD,0x94CE,0x94D0,0x94D1,0x94D2,/* 0xE8-0xEF */ + 0x94D5,0x94D6,0x94D7,0x94D9,0x94D8,0x94DB,0x94DE,0x94DF,/* 0xF0-0xF7 */ + 0x94E0,0x94E2,0x94E4,0x94E5,0x94E7,0x94E8,0x94EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x986F,0x9870,0x9871,0x9872,0x9873,0x9874,0x988B,0x988E,/* 0x40-0x47 */ + 0x9892,0x9895,0x9899,0x98A3,0x98A8,0x98A9,0x98AA,0x98AB,/* 0x48-0x4F */ + 0x98AC,0x98AD,0x98AE,0x98AF,0x98B0,0x98B1,0x98B2,0x98B3,/* 0x50-0x57 */ + 0x98B4,0x98B5,0x98B6,0x98B7,0x98B8,0x98B9,0x98BA,0x98BB,/* 0x58-0x5F */ + 0x98BC,0x98BD,0x98BE,0x98BF,0x98C0,0x98C1,0x98C2,0x98C3,/* 0x60-0x67 */ + 0x98C4,0x98C5,0x98C6,0x98C7,0x98C8,0x98C9,0x98CA,0x98CB,/* 0x68-0x6F */ + 0x98CC,0x98CD,0x98CF,0x98D0,0x98D4,0x98D6,0x98D7,0x98DB,/* 0x70-0x77 */ + 0x98DC,0x98DD,0x98E0,0x98E1,0x98E2,0x98E3,0x98E4,0x0000,/* 0x78-0x7F */ + + 0x98E5,0x98E6,0x98E9,0x98EA,0x98EB,0x98EC,0x98ED,0x98EE,/* 0x80-0x87 */ + 0x98EF,0x98F0,0x98F1,0x98F2,0x98F3,0x98F4,0x98F5,0x98F6,/* 0x88-0x8F */ + 0x98F7,0x98F8,0x98F9,0x98FA,0x98FB,0x98FC,0x98FD,0x98FE,/* 0x90-0x97 */ + 0x98FF,0x9900,0x9901,0x9902,0x9903,0x9904,0x9905,0x9906,/* 0x98-0x9F */ + 0x9907,0x94E9,0x94EB,0x94EE,0x94EF,0x94F3,0x94F4,0x94F5,/* 0xA0-0xA7 */ + 0x94F7,0x94F9,0x94FC,0x94FD,0x94FF,0x9503,0x9502,0x9506,/* 0xA8-0xAF */ + 0x9507,0x9509,0x950A,0x950D,0x950E,0x950F,0x9512,0x9513,/* 0xB0-0xB7 */ + 0x9514,0x9515,0x9516,0x9518,0x951B,0x951D,0x951E,0x951F,/* 0xB8-0xBF */ + 0x9522,0x952A,0x952B,0x9529,0x952C,0x9531,0x9532,0x9534,/* 0xC0-0xC7 */ + 0x9536,0x9537,0x9538,0x953C,0x953E,0x953F,0x9542,0x9535,/* 0xC8-0xCF */ + 0x9544,0x9545,0x9546,0x9549,0x954C,0x954E,0x954F,0x9552,/* 0xD0-0xD7 */ + 
0x9553,0x9554,0x9556,0x9557,0x9558,0x9559,0x955B,0x955E,/* 0xD8-0xDF */ + 0x955F,0x955D,0x9561,0x9562,0x9564,0x9565,0x9566,0x9567,/* 0xE0-0xE7 */ + 0x9568,0x9569,0x956A,0x956B,0x956C,0x956F,0x9571,0x9572,/* 0xE8-0xEF */ + 0x9573,0x953A,0x77E7,0x77EC,0x96C9,0x79D5,0x79ED,0x79E3,/* 0xF0-0xF7 */ + 0x79EB,0x7A06,0x5D47,0x7A03,0x7A02,0x7A1E,0x7A14,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9908,0x9909,0x990A,0x990B,0x990C,0x990E,0x990F,0x9911,/* 0x40-0x47 */ + 0x9912,0x9913,0x9914,0x9915,0x9916,0x9917,0x9918,0x9919,/* 0x48-0x4F */ + 0x991A,0x991B,0x991C,0x991D,0x991E,0x991F,0x9920,0x9921,/* 0x50-0x57 */ + 0x9922,0x9923,0x9924,0x9925,0x9926,0x9927,0x9928,0x9929,/* 0x58-0x5F */ + 0x992A,0x992B,0x992C,0x992D,0x992F,0x9930,0x9931,0x9932,/* 0x60-0x67 */ + 0x9933,0x9934,0x9935,0x9936,0x9937,0x9938,0x9939,0x993A,/* 0x68-0x6F */ + 0x993B,0x993C,0x993D,0x993E,0x993F,0x9940,0x9941,0x9942,/* 0x70-0x77 */ + 0x9943,0x9944,0x9945,0x9946,0x9947,0x9948,0x9949,0x0000,/* 0x78-0x7F */ + + 0x994A,0x994B,0x994C,0x994D,0x994E,0x994F,0x9950,0x9951,/* 0x80-0x87 */ + 0x9952,0x9953,0x9956,0x9957,0x9958,0x9959,0x995A,0x995B,/* 0x88-0x8F */ + 0x995C,0x995D,0x995E,0x995F,0x9960,0x9961,0x9962,0x9964,/* 0x90-0x97 */ + 0x9966,0x9973,0x9978,0x9979,0x997B,0x997E,0x9982,0x9983,/* 0x98-0x9F */ + 0x9989,0x7A39,0x7A37,0x7A51,0x9ECF,0x99A5,0x7A70,0x7688,/* 0xA0-0xA7 */ + 0x768E,0x7693,0x7699,0x76A4,0x74DE,0x74E0,0x752C,0x9E20,/* 0xA8-0xAF */ + 0x9E22,0x9E28,0x9E29,0x9E2A,0x9E2B,0x9E2C,0x9E32,0x9E31,/* 0xB0-0xB7 */ + 0x9E36,0x9E38,0x9E37,0x9E39,0x9E3A,0x9E3E,0x9E41,0x9E42,/* 0xB8-0xBF */ + 0x9E44,0x9E46,0x9E47,0x9E48,0x9E49,0x9E4B,0x9E4C,0x9E4E,/* 0xC0-0xC7 */ + 0x9E51,0x9E55,0x9E57,0x9E5A,0x9E5B,0x9E5C,0x9E5E,0x9E63,/* 0xC8-0xCF */ + 0x9E66,0x9E67,0x9E68,0x9E69,0x9E6A,0x9E6B,0x9E6C,0x9E71,/* 0xD0-0xD7 */ + 0x9E6D,0x9E73,0x7592,0x7594,0x7596,0x75A0,0x759D,0x75AC,/* 0xD8-0xDF */ + 0x75A3,0x75B3,0x75B4,0x75B8,0x75C4,0x75B1,0x75B0,0x75C3,/* 0xE0-0xE7 */ + 0x75C2,0x75D6,0x75CD,0x75E3,0x75E8,0x75E6,0x75E4,0x75EB,/* 0xE8-0xEF */ + 0x75E7,0x7603,0x75F1,0x75FC,0x75FF,0x7610,0x7600,0x7605,/* 0xF0-0xF7 */ + 0x760C,0x7617,0x760A,0x7625,0x7618,0x7615,0x7619,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x998C,0x998E,0x999A,0x999B,0x999C,0x999D,0x999E,0x999F,/* 0x40-0x47 */ + 0x99A0,0x99A1,0x99A2,0x99A3,0x99A4,0x99A6,0x99A7,0x99A9,/* 
0x48-0x4F */ + 0x99AA,0x99AB,0x99AC,0x99AD,0x99AE,0x99AF,0x99B0,0x99B1,/* 0x50-0x57 */ + 0x99B2,0x99B3,0x99B4,0x99B5,0x99B6,0x99B7,0x99B8,0x99B9,/* 0x58-0x5F */ + 0x99BA,0x99BB,0x99BC,0x99BD,0x99BE,0x99BF,0x99C0,0x99C1,/* 0x60-0x67 */ + 0x99C2,0x99C3,0x99C4,0x99C5,0x99C6,0x99C7,0x99C8,0x99C9,/* 0x68-0x6F */ + 0x99CA,0x99CB,0x99CC,0x99CD,0x99CE,0x99CF,0x99D0,0x99D1,/* 0x70-0x77 */ + 0x99D2,0x99D3,0x99D4,0x99D5,0x99D6,0x99D7,0x99D8,0x0000,/* 0x78-0x7F */ + + 0x99D9,0x99DA,0x99DB,0x99DC,0x99DD,0x99DE,0x99DF,0x99E0,/* 0x80-0x87 */ + 0x99E1,0x99E2,0x99E3,0x99E4,0x99E5,0x99E6,0x99E7,0x99E8,/* 0x88-0x8F */ + 0x99E9,0x99EA,0x99EB,0x99EC,0x99ED,0x99EE,0x99EF,0x99F0,/* 0x90-0x97 */ + 0x99F1,0x99F2,0x99F3,0x99F4,0x99F5,0x99F6,0x99F7,0x99F8,/* 0x98-0x9F */ + 0x99F9,0x761B,0x763C,0x7622,0x7620,0x7640,0x762D,0x7630,/* 0xA0-0xA7 */ + 0x763F,0x7635,0x7643,0x763E,0x7633,0x764D,0x765E,0x7654,/* 0xA8-0xAF */ + 0x765C,0x7656,0x766B,0x766F,0x7FCA,0x7AE6,0x7A78,0x7A79,/* 0xB0-0xB7 */ + 0x7A80,0x7A86,0x7A88,0x7A95,0x7AA6,0x7AA0,0x7AAC,0x7AA8,/* 0xB8-0xBF */ + 0x7AAD,0x7AB3,0x8864,0x8869,0x8872,0x887D,0x887F,0x8882,/* 0xC0-0xC7 */ + 0x88A2,0x88C6,0x88B7,0x88BC,0x88C9,0x88E2,0x88CE,0x88E3,/* 0xC8-0xCF */ + 0x88E5,0x88F1,0x891A,0x88FC,0x88E8,0x88FE,0x88F0,0x8921,/* 0xD0-0xD7 */ + 0x8919,0x8913,0x891B,0x890A,0x8934,0x892B,0x8936,0x8941,/* 0xD8-0xDF */ + 0x8966,0x897B,0x758B,0x80E5,0x76B2,0x76B4,0x77DC,0x8012,/* 0xE0-0xE7 */ + 0x8014,0x8016,0x801C,0x8020,0x8022,0x8025,0x8026,0x8027,/* 0xE8-0xEF */ + 0x8029,0x8028,0x8031,0x800B,0x8035,0x8043,0x8046,0x804D,/* 0xF0-0xF7 */ + 0x8052,0x8069,0x8071,0x8983,0x9878,0x9880,0x9883,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x99FA,0x99FB,0x99FC,0x99FD,0x99FE,0x99FF,0x9A00,0x9A01,/* 0x40-0x47 */ + 0x9A02,0x9A03,0x9A04,0x9A05,0x9A06,0x9A07,0x9A08,0x9A09,/* 0x48-0x4F */ + 0x9A0A,0x9A0B,0x9A0C,0x9A0D,0x9A0E,0x9A0F,0x9A10,0x9A11,/* 0x50-0x57 */ + 0x9A12,0x9A13,0x9A14,0x9A15,0x9A16,0x9A17,0x9A18,0x9A19,/* 0x58-0x5F */ + 0x9A1A,0x9A1B,0x9A1C,0x9A1D,0x9A1E,0x9A1F,0x9A20,0x9A21,/* 0x60-0x67 */ + 0x9A22,0x9A23,0x9A24,0x9A25,0x9A26,0x9A27,0x9A28,0x9A29,/* 0x68-0x6F */ + 0x9A2A,0x9A2B,0x9A2C,0x9A2D,0x9A2E,0x9A2F,0x9A30,0x9A31,/* 0x70-0x77 */ + 0x9A32,0x9A33,0x9A34,0x9A35,0x9A36,0x9A37,0x9A38,0x0000,/* 0x78-0x7F */ + + 0x9A39,0x9A3A,0x9A3B,0x9A3C,0x9A3D,0x9A3E,0x9A3F,0x9A40,/* 0x80-0x87 */ + 0x9A41,0x9A42,0x9A43,0x9A44,0x9A45,0x9A46,0x9A47,0x9A48,/* 0x88-0x8F */ + 0x9A49,0x9A4A,0x9A4B,0x9A4C,0x9A4D,0x9A4E,0x9A4F,0x9A50,/* 0x90-0x97 */ + 0x9A51,0x9A52,0x9A53,0x9A54,0x9A55,0x9A56,0x9A57,0x9A58,/* 0x98-0x9F */ + 0x9A59,0x9889,0x988C,0x988D,0x988F,0x9894,0x989A,0x989B,/* 0xA0-0xA7 */ + 0x989E,0x989F,0x98A1,0x98A2,0x98A5,0x98A6,0x864D,0x8654,/* 0xA8-0xAF */ + 0x866C,0x866E,0x867F,0x867A,0x867C,0x867B,0x86A8,0x868D,/* 0xB0-0xB7 */ + 0x868B,0x86AC,0x869D,0x86A7,0x86A3,0x86AA,0x8693,0x86A9,/* 0xB8-0xBF */ + 0x86B6,0x86C4,0x86B5,0x86CE,0x86B0,0x86BA,0x86B1,0x86AF,/* 0xC0-0xC7 */ + 
0x86C9,0x86CF,0x86B4,0x86E9,0x86F1,0x86F2,0x86ED,0x86F3,/* 0xC8-0xCF */ + 0x86D0,0x8713,0x86DE,0x86F4,0x86DF,0x86D8,0x86D1,0x8703,/* 0xD0-0xD7 */ + 0x8707,0x86F8,0x8708,0x870A,0x870D,0x8709,0x8723,0x873B,/* 0xD8-0xDF */ + 0x871E,0x8725,0x872E,0x871A,0x873E,0x8748,0x8734,0x8731,/* 0xE0-0xE7 */ + 0x8729,0x8737,0x873F,0x8782,0x8722,0x877D,0x877E,0x877B,/* 0xE8-0xEF */ + 0x8760,0x8770,0x874C,0x876E,0x878B,0x8753,0x8763,0x877C,/* 0xF0-0xF7 */ + 0x8764,0x8759,0x8765,0x8793,0x87AF,0x87A8,0x87D2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9A5A,0x9A5B,0x9A5C,0x9A5D,0x9A5E,0x9A5F,0x9A60,0x9A61,/* 0x40-0x47 */ + 0x9A62,0x9A63,0x9A64,0x9A65,0x9A66,0x9A67,0x9A68,0x9A69,/* 0x48-0x4F */ + 0x9A6A,0x9A6B,0x9A72,0x9A83,0x9A89,0x9A8D,0x9A8E,0x9A94,/* 0x50-0x57 */ + 0x9A95,0x9A99,0x9AA6,0x9AA9,0x9AAA,0x9AAB,0x9AAC,0x9AAD,/* 0x58-0x5F */ + 0x9AAE,0x9AAF,0x9AB2,0x9AB3,0x9AB4,0x9AB5,0x9AB9,0x9ABB,/* 0x60-0x67 */ + 0x9ABD,0x9ABE,0x9ABF,0x9AC3,0x9AC4,0x9AC6,0x9AC7,0x9AC8,/* 0x68-0x6F */ + 0x9AC9,0x9ACA,0x9ACD,0x9ACE,0x9ACF,0x9AD0,0x9AD2,0x9AD4,/* 0x70-0x77 */ + 0x9AD5,0x9AD6,0x9AD7,0x9AD9,0x9ADA,0x9ADB,0x9ADC,0x0000,/* 0x78-0x7F */ + + 0x9ADD,0x9ADE,0x9AE0,0x9AE2,0x9AE3,0x9AE4,0x9AE5,0x9AE7,/* 0x80-0x87 */ + 0x9AE8,0x9AE9,0x9AEA,0x9AEC,0x9AEE,0x9AF0,0x9AF1,0x9AF2,/* 0x88-0x8F */ + 0x9AF3,0x9AF4,0x9AF5,0x9AF6,0x9AF7,0x9AF8,0x9AFA,0x9AFC,/* 0x90-0x97 */ + 0x9AFD,0x9AFE,0x9AFF,0x9B00,0x9B01,0x9B02,0x9B04,0x9B05,/* 0x98-0x9F */ + 0x9B06,0x87C6,0x8788,0x8785,0x87AD,0x8797,0x8783,0x87AB,/* 0xA0-0xA7 */ + 0x87E5,0x87AC,0x87B5,0x87B3,0x87CB,0x87D3,0x87BD,0x87D1,/* 0xA8-0xAF */ + 0x87C0,0x87CA,0x87DB,0x87EA,0x87E0,0x87EE,0x8816,0x8813,/* 0xB0-0xB7 */ + 0x87FE,0x880A,0x881B,0x8821,0x8839,0x883C,0x7F36,0x7F42,/* 0xB8-0xBF */ + 0x7F44,0x7F45,0x8210,0x7AFA,0x7AFD,0x7B08,0x7B03,0x7B04,/* 0xC0-0xC7 */ + 0x7B15,0x7B0A,0x7B2B,0x7B0F,0x7B47,0x7B38,0x7B2A,0x7B19,/* 0xC8-0xCF */ + 0x7B2E,0x7B31,0x7B20,0x7B25,0x7B24,0x7B33,0x7B3E,0x7B1E,/* 0xD0-0xD7 */ + 0x7B58,0x7B5A,0x7B45,0x7B75,0x7B4C,0x7B5D,0x7B60,0x7B6E,/* 0xD8-0xDF */ + 0x7B7B,0x7B62,0x7B72,0x7B71,0x7B90,0x7BA6,0x7BA7,0x7BB8,/* 0xE0-0xE7 */ + 0x7BAC,0x7B9D,0x7BA8,0x7B85,0x7BAA,0x7B9C,0x7BA2,0x7BAB,/* 0xE8-0xEF */ + 0x7BB4,0x7BD1,0x7BC1,0x7BCC,0x7BDD,0x7BDA,0x7BE5,0x7BE6,/* 0xF0-0xF7 */ + 0x7BEA,0x7C0C,0x7BFE,0x7BFC,0x7C0F,0x7C16,0x7C0B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x9B07,0x9B09,0x9B0A,0x9B0B,0x9B0C,0x9B0D,0x9B0E,0x9B10,/* 0x40-0x47 */ + 0x9B11,0x9B12,0x9B14,0x9B15,0x9B16,0x9B17,0x9B18,0x9B19,/* 0x48-0x4F */ + 0x9B1A,0x9B1B,0x9B1C,0x9B1D,0x9B1E,0x9B20,0x9B21,0x9B22,/* 0x50-0x57 */ + 0x9B24,0x9B25,0x9B26,0x9B27,0x9B28,0x9B29,0x9B2A,0x9B2B,/* 0x58-0x5F */ + 0x9B2C,0x9B2D,0x9B2E,0x9B30,0x9B31,0x9B33,0x9B34,0x9B35,/* 0x60-0x67 */ + 0x9B36,0x9B37,0x9B38,0x9B39,0x9B3A,0x9B3D,0x9B3E,0x9B3F,/* 0x68-0x6F */ + 0x9B40,0x9B46,0x9B4A,0x9B4B,0x9B4C,0x9B4E,0x9B50,0x9B52,/* 0x70-0x77 */ + 0x9B53,0x9B55,0x9B56,0x9B57,0x9B58,0x9B59,0x9B5A,0x0000,/* 0x78-0x7F */ + + 0x9B5B,0x9B5C,0x9B5D,0x9B5E,0x9B5F,0x9B60,0x9B61,0x9B62,/* 0x80-0x87 */ + 0x9B63,0x9B64,0x9B65,0x9B66,0x9B67,0x9B68,0x9B69,0x9B6A,/* 0x88-0x8F */ + 0x9B6B,0x9B6C,0x9B6D,0x9B6E,0x9B6F,0x9B70,0x9B71,0x9B72,/* 0x90-0x97 */ + 0x9B73,0x9B74,0x9B75,0x9B76,0x9B77,0x9B78,0x9B79,0x9B7A,/* 0x98-0x9F */ + 0x9B7B,0x7C1F,0x7C2A,0x7C26,0x7C38,0x7C41,0x7C40,0x81FE,/* 0xA0-0xA7 */ + 0x8201,0x8202,0x8204,0x81EC,0x8844,0x8221,0x8222,0x8223,/* 0xA8-0xAF */ + 0x822D,0x822F,0x8228,0x822B,0x8238,0x823B,0x8233,0x8234,/* 0xB0-0xB7 */ + 0x823E,0x8244,0x8249,0x824B,0x824F,0x825A,0x825F,0x8268,/* 0xB8-0xBF */ + 0x887E,0x8885,0x8888,0x88D8,0x88DF,0x895E,0x7F9D,0x7F9F,/* 0xC0-0xC7 */ + 0x7FA7,0x7FAF,0x7FB0,0x7FB2,0x7C7C,0x6549,0x7C91,0x7C9D,/* 0xC8-0xCF */ + 0x7C9C,0x7C9E,0x7CA2,0x7CB2,0x7CBC,0x7CBD,0x7CC1,0x7CC7,/* 0xD0-0xD7 */ + 0x7CCC,0x7CCD,0x7CC8,0x7CC5,0x7CD7,0x7CE8,0x826E,0x66A8,/* 0xD8-0xDF */ + 0x7FBF,0x7FCE,0x7FD5,0x7FE5,0x7FE1,0x7FE6,0x7FE9,0x7FEE,/* 0xE0-0xE7 */ + 0x7FF3,0x7CF8,0x7D77,0x7DA6,0x7DAE,0x7E47,0x7E9B,0x9EB8,/* 0xE8-0xEF */ + 0x9EB4,0x8D73,0x8D84,0x8D94,0x8D91,0x8DB1,0x8D67,0x8D6D,/* 0xF0-0xF7 */ + 0x8C47,0x8C49,0x914A,0x9150,0x914E,0x914F,0x9164,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B7C,0x9B7D,0x9B7E,0x9B7F,0x9B80,0x9B81,0x9B82,0x9B83,/* 0x40-0x47 */ + 0x9B84,0x9B85,0x9B86,0x9B87,0x9B88,0x9B89,0x9B8A,0x9B8B,/* 0x48-0x4F */ + 0x9B8C,0x9B8D,0x9B8E,0x9B8F,0x9B90,0x9B91,0x9B92,0x9B93,/* 0x50-0x57 */ + 0x9B94,0x9B95,0x9B96,0x9B97,0x9B98,0x9B99,0x9B9A,0x9B9B,/* 0x58-0x5F */ + 0x9B9C,0x9B9D,0x9B9E,0x9B9F,0x9BA0,0x9BA1,0x9BA2,0x9BA3,/* 0x60-0x67 */ + 0x9BA4,0x9BA5,0x9BA6,0x9BA7,0x9BA8,0x9BA9,0x9BAA,0x9BAB,/* 0x68-0x6F */ + 0x9BAC,0x9BAD,0x9BAE,0x9BAF,0x9BB0,0x9BB1,0x9BB2,0x9BB3,/* 0x70-0x77 */ + 0x9BB4,0x9BB5,0x9BB6,0x9BB7,0x9BB8,0x9BB9,0x9BBA,0x0000,/* 0x78-0x7F */ + + 0x9BBB,0x9BBC,0x9BBD,0x9BBE,0x9BBF,0x9BC0,0x9BC1,0x9BC2,/* 0x80-0x87 */ + 0x9BC3,0x9BC4,0x9BC5,0x9BC6,0x9BC7,0x9BC8,0x9BC9,0x9BCA,/* 0x88-0x8F */ + 0x9BCB,0x9BCC,0x9BCD,0x9BCE,0x9BCF,0x9BD0,0x9BD1,0x9BD2,/* 0x90-0x97 */ + 0x9BD3,0x9BD4,0x9BD5,0x9BD6,0x9BD7,0x9BD8,0x9BD9,0x9BDA,/* 0x98-0x9F */ + 0x9BDB,0x9162,0x9161,0x9170,0x9169,0x916F,0x917D,0x917E,/* 0xA0-0xA7 */ + 0x9172,0x9174,0x9179,0x918C,0x9185,0x9190,0x918D,0x9191,/* 0xA8-0xAF */ + 0x91A2,0x91A3,0x91AA,0x91AD,0x91AE,0x91AF,0x91B5,0x91B4,/* 0xB0-0xB7 */ + 
0x91BA,0x8C55,0x9E7E,0x8DB8,0x8DEB,0x8E05,0x8E59,0x8E69,/* 0xB8-0xBF */ + 0x8DB5,0x8DBF,0x8DBC,0x8DBA,0x8DC4,0x8DD6,0x8DD7,0x8DDA,/* 0xC0-0xC7 */ + 0x8DDE,0x8DCE,0x8DCF,0x8DDB,0x8DC6,0x8DEC,0x8DF7,0x8DF8,/* 0xC8-0xCF */ + 0x8DE3,0x8DF9,0x8DFB,0x8DE4,0x8E09,0x8DFD,0x8E14,0x8E1D,/* 0xD0-0xD7 */ + 0x8E1F,0x8E2C,0x8E2E,0x8E23,0x8E2F,0x8E3A,0x8E40,0x8E39,/* 0xD8-0xDF */ + 0x8E35,0x8E3D,0x8E31,0x8E49,0x8E41,0x8E42,0x8E51,0x8E52,/* 0xE0-0xE7 */ + 0x8E4A,0x8E70,0x8E76,0x8E7C,0x8E6F,0x8E74,0x8E85,0x8E8F,/* 0xE8-0xEF */ + 0x8E94,0x8E90,0x8E9C,0x8E9E,0x8C78,0x8C82,0x8C8A,0x8C85,/* 0xF0-0xF7 */ + 0x8C98,0x8C94,0x659B,0x89D6,0x89DE,0x89DA,0x89DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9BDC,0x9BDD,0x9BDE,0x9BDF,0x9BE0,0x9BE1,0x9BE2,0x9BE3,/* 0x40-0x47 */ + 0x9BE4,0x9BE5,0x9BE6,0x9BE7,0x9BE8,0x9BE9,0x9BEA,0x9BEB,/* 0x48-0x4F */ + 0x9BEC,0x9BED,0x9BEE,0x9BEF,0x9BF0,0x9BF1,0x9BF2,0x9BF3,/* 0x50-0x57 */ + 0x9BF4,0x9BF5,0x9BF6,0x9BF7,0x9BF8,0x9BF9,0x9BFA,0x9BFB,/* 0x58-0x5F */ + 0x9BFC,0x9BFD,0x9BFE,0x9BFF,0x9C00,0x9C01,0x9C02,0x9C03,/* 0x60-0x67 */ + 0x9C04,0x9C05,0x9C06,0x9C07,0x9C08,0x9C09,0x9C0A,0x9C0B,/* 0x68-0x6F */ + 0x9C0C,0x9C0D,0x9C0E,0x9C0F,0x9C10,0x9C11,0x9C12,0x9C13,/* 0x70-0x77 */ + 0x9C14,0x9C15,0x9C16,0x9C17,0x9C18,0x9C19,0x9C1A,0x0000,/* 0x78-0x7F */ + + 0x9C1B,0x9C1C,0x9C1D,0x9C1E,0x9C1F,0x9C20,0x9C21,0x9C22,/* 0x80-0x87 */ + 0x9C23,0x9C24,0x9C25,0x9C26,0x9C27,0x9C28,0x9C29,0x9C2A,/* 0x88-0x8F */ + 0x9C2B,0x9C2C,0x9C2D,0x9C2E,0x9C2F,0x9C30,0x9C31,0x9C32,/* 0x90-0x97 */ + 0x9C33,0x9C34,0x9C35,0x9C36,0x9C37,0x9C38,0x9C39,0x9C3A,/* 0x98-0x9F */ + 0x9C3B,0x89E5,0x89EB,0x89EF,0x8A3E,0x8B26,0x9753,0x96E9,/* 0xA0-0xA7 */ + 0x96F3,0x96EF,0x9706,0x9701,0x9708,0x970F,0x970E,0x972A,/* 0xA8-0xAF */ + 0x972D,0x9730,0x973E,0x9F80,0x9F83,0x9F85,0x9F86,0x9F87,/* 0xB0-0xB7 */ + 0x9F88,0x9F89,0x9F8A,0x9F8C,0x9EFE,0x9F0B,0x9F0D,0x96B9,/* 0xB8-0xBF */ + 0x96BC,0x96BD,0x96CE,0x96D2,0x77BF,0x96E0,0x928E,0x92AE,/* 0xC0-0xC7 */ + 0x92C8,0x933E,0x936A,0x93CA,0x938F,0x943E,0x946B,0x9C7F,/* 0xC8-0xCF */ + 0x9C82,0x9C85,0x9C86,0x9C87,0x9C88,0x7A23,0x9C8B,0x9C8E,/* 0xD0-0xD7 */ + 0x9C90,0x9C91,0x9C92,0x9C94,0x9C95,0x9C9A,0x9C9B,0x9C9E,/* 0xD8-0xDF */ + 0x9C9F,0x9CA0,0x9CA1,0x9CA2,0x9CA3,0x9CA5,0x9CA6,0x9CA7,/* 0xE0-0xE7 */ + 0x9CA8,0x9CA9,0x9CAB,0x9CAD,0x9CAE,0x9CB0,0x9CB1,0x9CB2,/* 0xE8-0xEF */ + 0x9CB3,0x9CB4,0x9CB5,0x9CB6,0x9CB7,0x9CBA,0x9CBB,0x9CBC,/* 0xF0-0xF7 */ + 0x9CBD,0x9CC4,0x9CC5,0x9CC6,0x9CC7,0x9CCA,0x9CCB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9C3C,0x9C3D,0x9C3E,0x9C3F,0x9C40,0x9C41,0x9C42,0x9C43,/* 0x40-0x47 */ + 0x9C44,0x9C45,0x9C46,0x9C47,0x9C48,0x9C49,0x9C4A,0x9C4B,/* 0x48-0x4F */ + 0x9C4C,0x9C4D,0x9C4E,0x9C4F,0x9C50,0x9C51,0x9C52,0x9C53,/* 0x50-0x57 */ + 0x9C54,0x9C55,0x9C56,0x9C57,0x9C58,0x9C59,0x9C5A,0x9C5B,/* 0x58-0x5F */ + 0x9C5C,0x9C5D,0x9C5E,0x9C5F,0x9C60,0x9C61,0x9C62,0x9C63,/* 0x60-0x67 */ + 0x9C64,0x9C65,0x9C66,0x9C67,0x9C68,0x9C69,0x9C6A,0x9C6B,/* 0x68-0x6F */ + 0x9C6C,0x9C6D,0x9C6E,0x9C6F,0x9C70,0x9C71,0x9C72,0x9C73,/* 0x70-0x77 */ + 0x9C74,0x9C75,0x9C76,0x9C77,0x9C78,0x9C79,0x9C7A,0x0000,/* 0x78-0x7F */ + + 0x9C7B,0x9C7D,0x9C7E,0x9C80,0x9C83,0x9C84,0x9C89,0x9C8A,/* 0x80-0x87 */ + 0x9C8C,0x9C8F,0x9C93,0x9C96,0x9C97,0x9C98,0x9C99,0x9C9D,/* 0x88-0x8F */ + 0x9CAA,0x9CAC,0x9CAF,0x9CB9,0x9CBE,0x9CBF,0x9CC0,0x9CC1,/* 0x90-0x97 */ + 0x9CC2,0x9CC8,0x9CC9,0x9CD1,0x9CD2,0x9CDA,0x9CDB,0x9CE0,/* 0x98-0x9F */ + 0x9CE1,0x9CCC,0x9CCD,0x9CCE,0x9CCF,0x9CD0,0x9CD3,0x9CD4,/* 0xA0-0xA7 */ + 0x9CD5,0x9CD7,0x9CD8,0x9CD9,0x9CDC,0x9CDD,0x9CDF,0x9CE2,/* 0xA8-0xAF */ + 0x977C,0x9785,0x9791,0x9792,0x9794,0x97AF,0x97AB,0x97A3,/* 0xB0-0xB7 */ + 0x97B2,0x97B4,0x9AB1,0x9AB0,0x9AB7,0x9E58,0x9AB6,0x9ABA,/* 0xB8-0xBF */ + 0x9ABC,0x9AC1,0x9AC0,0x9AC5,0x9AC2,0x9ACB,0x9ACC,0x9AD1,/* 0xC0-0xC7 */ + 0x9B45,0x9B43,0x9B47,0x9B49,0x9B48,0x9B4D,0x9B51,0x98E8,/* 0xC8-0xCF */ + 0x990D,0x992E,0x9955,0x9954,0x9ADF,0x9AE1,0x9AE6,0x9AEF,/* 0xD0-0xD7 */ + 0x9AEB,0x9AFB,0x9AED,0x9AF9,0x9B08,0x9B0F,0x9B13,0x9B1F,/* 0xD8-0xDF */ + 0x9B23,0x9EBD,0x9EBE,0x7E3B,0x9E82,0x9E87,0x9E88,0x9E8B,/* 0xE0-0xE7 */ + 0x9E92,0x93D6,0x9E9D,0x9E9F,0x9EDB,0x9EDC,0x9EDD,0x9EE0,/* 0xE8-0xEF */ + 0x9EDF,0x9EE2,0x9EE9,0x9EE7,0x9EE5,0x9EEA,0x9EEF,0x9F22,/* 0xF0-0xF7 */ + 0x9F2C,0x9F2F,0x9F39,0x9F37,0x9F3D,0x9F3E,0x9F44,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9CE3,0x9CE4,0x9CE5,0x9CE6,0x9CE7,0x9CE8,0x9CE9,0x9CEA,/* 0x40-0x47 */ + 0x9CEB,0x9CEC,0x9CED,0x9CEE,0x9CEF,0x9CF0,0x9CF1,0x9CF2,/* 0x48-0x4F */ + 0x9CF3,0x9CF4,0x9CF5,0x9CF6,0x9CF7,0x9CF8,0x9CF9,0x9CFA,/* 0x50-0x57 */ + 0x9CFB,0x9CFC,0x9CFD,0x9CFE,0x9CFF,0x9D00,0x9D01,0x9D02,/* 0x58-0x5F */ + 0x9D03,0x9D04,0x9D05,0x9D06,0x9D07,0x9D08,0x9D09,0x9D0A,/* 0x60-0x67 */ + 0x9D0B,0x9D0C,0x9D0D,0x9D0E,0x9D0F,0x9D10,0x9D11,0x9D12,/* 0x68-0x6F */ + 0x9D13,0x9D14,0x9D15,0x9D16,0x9D17,0x9D18,0x9D19,0x9D1A,/* 0x70-0x77 */ + 0x9D1B,0x9D1C,0x9D1D,0x9D1E,0x9D1F,0x9D20,0x9D21,0x0000,/* 0x78-0x7F */ + + 0x9D22,0x9D23,0x9D24,0x9D25,0x9D26,0x9D27,0x9D28,0x9D29,/* 0x80-0x87 */ + 0x9D2A,0x9D2B,0x9D2C,0x9D2D,0x9D2E,0x9D2F,0x9D30,0x9D31,/* 0x88-0x8F */ + 0x9D32,0x9D33,0x9D34,0x9D35,0x9D36,0x9D37,0x9D38,0x9D39,/* 0x90-0x97 */ + 0x9D3A,0x9D3B,0x9D3C,0x9D3D,0x9D3E,0x9D3F,0x9D40,0x9D41,/* 0x98-0x9F */ + 0x9D42,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static 
const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9D43,0x9D44,0x9D45,0x9D46,0x9D47,0x9D48,0x9D49,0x9D4A,/* 0x40-0x47 */ + 0x9D4B,0x9D4C,0x9D4D,0x9D4E,0x9D4F,0x9D50,0x9D51,0x9D52,/* 0x48-0x4F */ + 0x9D53,0x9D54,0x9D55,0x9D56,0x9D57,0x9D58,0x9D59,0x9D5A,/* 0x50-0x57 */ + 0x9D5B,0x9D5C,0x9D5D,0x9D5E,0x9D5F,0x9D60,0x9D61,0x9D62,/* 0x58-0x5F */ + 0x9D63,0x9D64,0x9D65,0x9D66,0x9D67,0x9D68,0x9D69,0x9D6A,/* 0x60-0x67 */ + 0x9D6B,0x9D6C,0x9D6D,0x9D6E,0x9D6F,0x9D70,0x9D71,0x9D72,/* 0x68-0x6F */ + 0x9D73,0x9D74,0x9D75,0x9D76,0x9D77,0x9D78,0x9D79,0x9D7A,/* 0x70-0x77 */ + 0x9D7B,0x9D7C,0x9D7D,0x9D7E,0x9D7F,0x9D80,0x9D81,0x0000,/* 0x78-0x7F */ + + 0x9D82,0x9D83,0x9D84,0x9D85,0x9D86,0x9D87,0x9D88,0x9D89,/* 0x80-0x87 */ + 0x9D8A,0x9D8B,0x9D8C,0x9D8D,0x9D8E,0x9D8F,0x9D90,0x9D91,/* 0x88-0x8F */ + 0x9D92,0x9D93,0x9D94,0x9D95,0x9D96,0x9D97,0x9D98,0x9D99,/* 0x90-0x97 */ + 0x9D9A,0x9D9B,0x9D9C,0x9D9D,0x9D9E,0x9D9F,0x9DA0,0x9DA1,/* 0x98-0x9F */ + 0x9DA2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9DA3,0x9DA4,0x9DA5,0x9DA6,0x9DA7,0x9DA8,0x9DA9,0x9DAA,/* 0x40-0x47 */ + 0x9DAB,0x9DAC,0x9DAD,0x9DAE,0x9DAF,0x9DB0,0x9DB1,0x9DB2,/* 0x48-0x4F */ + 0x9DB3,0x9DB4,0x9DB5,0x9DB6,0x9DB7,0x9DB8,0x9DB9,0x9DBA,/* 0x50-0x57 */ + 0x9DBB,0x9DBC,0x9DBD,0x9DBE,0x9DBF,0x9DC0,0x9DC1,0x9DC2,/* 0x58-0x5F */ + 0x9DC3,0x9DC4,0x9DC5,0x9DC6,0x9DC7,0x9DC8,0x9DC9,0x9DCA,/* 0x60-0x67 */ + 0x9DCB,0x9DCC,0x9DCD,0x9DCE,0x9DCF,0x9DD0,0x9DD1,0x9DD2,/* 0x68-0x6F */ + 0x9DD3,0x9DD4,0x9DD5,0x9DD6,0x9DD7,0x9DD8,0x9DD9,0x9DDA,/* 0x70-0x77 */ + 0x9DDB,0x9DDC,0x9DDD,0x9DDE,0x9DDF,0x9DE0,0x9DE1,0x0000,/* 0x78-0x7F */ + + 0x9DE2,0x9DE3,0x9DE4,0x9DE5,0x9DE6,0x9DE7,0x9DE8,0x9DE9,/* 0x80-0x87 */ + 0x9DEA,0x9DEB,0x9DEC,0x9DED,0x9DEE,0x9DEF,0x9DF0,0x9DF1,/* 0x88-0x8F */ + 0x9DF2,0x9DF3,0x9DF4,0x9DF5,0x9DF6,0x9DF7,0x9DF8,0x9DF9,/* 0x90-0x97 */ + 0x9DFA,0x9DFB,0x9DFC,0x9DFD,0x9DFE,0x9DFF,0x9E00,0x9E01,/* 0x98-0x9F */ + 0x9E02,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9E03,0x9E04,0x9E05,0x9E06,0x9E07,0x9E08,0x9E09,0x9E0A,/* 0x40-0x47 */ + 0x9E0B,0x9E0C,0x9E0D,0x9E0E,0x9E0F,0x9E10,0x9E11,0x9E12,/* 0x48-0x4F */ + 0x9E13,0x9E14,0x9E15,0x9E16,0x9E17,0x9E18,0x9E19,0x9E1A,/* 0x50-0x57 */ + 0x9E1B,0x9E1C,0x9E1D,0x9E1E,0x9E24,0x9E27,0x9E2E,0x9E30,/* 0x58-0x5F */ + 0x9E34,0x9E3B,0x9E3C,0x9E40,0x9E4D,0x9E50,0x9E52,0x9E53,/* 0x60-0x67 */ + 0x9E54,0x9E56,0x9E59,0x9E5D,0x9E5F,0x9E60,0x9E61,0x9E62,/* 0x68-0x6F */ + 0x9E65,0x9E6E,0x9E6F,0x9E72,0x9E74,0x9E75,0x9E76,0x9E77,/* 0x70-0x77 */ + 0x9E78,0x9E79,0x9E7A,0x9E7B,0x9E7C,0x9E7D,0x9E80,0x0000,/* 0x78-0x7F */ + + 0x9E81,0x9E83,0x9E84,0x9E85,0x9E86,0x9E89,0x9E8A,0x9E8C,/* 0x80-0x87 */ + 0x9E8D,0x9E8E,0x9E8F,0x9E90,0x9E91,0x9E94,0x9E95,0x9E96,/* 0x88-0x8F */ + 0x9E97,0x9E98,0x9E99,0x9E9A,0x9E9B,0x9E9C,0x9E9E,0x9EA0,/* 0x90-0x97 */ + 0x9EA1,0x9EA2,0x9EA3,0x9EA4,0x9EA5,0x9EA7,0x9EA8,0x9EA9,/* 0x98-0x9F */ + 0x9EAA,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9EAB,0x9EAC,0x9EAD,0x9EAE,0x9EAF,0x9EB0,0x9EB1,0x9EB2,/* 0x40-0x47 */ + 0x9EB3,0x9EB5,0x9EB6,0x9EB7,0x9EB9,0x9EBA,0x9EBC,0x9EBF,/* 0x48-0x4F */ + 0x9EC0,0x9EC1,0x9EC2,0x9EC3,0x9EC5,0x9EC6,0x9EC7,0x9EC8,/* 0x50-0x57 */ + 0x9ECA,0x9ECB,0x9ECC,0x9ED0,0x9ED2,0x9ED3,0x9ED5,0x9ED6,/* 0x58-0x5F */ + 0x9ED7,0x9ED9,0x9EDA,0x9EDE,0x9EE1,0x9EE3,0x9EE4,0x9EE6,/* 0x60-0x67 */ + 0x9EE8,0x9EEB,0x9EEC,0x9EED,0x9EEE,0x9EF0,0x9EF1,0x9EF2,/* 0x68-0x6F */ + 0x9EF3,0x9EF4,0x9EF5,0x9EF6,0x9EF7,0x9EF8,0x9EFA,0x9EFD,/* 0x70-0x77 */ + 0x9EFF,0x9F00,0x9F01,0x9F02,0x9F03,0x9F04,0x9F05,0x0000,/* 0x78-0x7F */ + + 0x9F06,0x9F07,0x9F08,0x9F09,0x9F0A,0x9F0C,0x9F0F,0x9F11,/* 0x80-0x87 */ + 0x9F12,0x9F14,0x9F15,0x9F16,0x9F18,0x9F1A,0x9F1B,0x9F1C,/* 0x88-0x8F */ + 0x9F1D,0x9F1E,0x9F1F,0x9F21,0x9F23,0x9F24,0x9F25,0x9F26,/* 0x90-0x97 */ + 0x9F27,0x9F28,0x9F29,0x9F2A,0x9F2B,0x9F2D,0x9F2E,0x9F30,/* 0x98-0x9F */ + 0x9F31,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9F32,0x9F33,0x9F34,0x9F35,0x9F36,0x9F38,0x9F3A,0x9F3C,/* 
0x40-0x47 */ + 0x9F3F,0x9F40,0x9F41,0x9F42,0x9F43,0x9F45,0x9F46,0x9F47,/* 0x48-0x4F */ + 0x9F48,0x9F49,0x9F4A,0x9F4B,0x9F4C,0x9F4D,0x9F4E,0x9F4F,/* 0x50-0x57 */ + 0x9F52,0x9F53,0x9F54,0x9F55,0x9F56,0x9F57,0x9F58,0x9F59,/* 0x58-0x5F */ + 0x9F5A,0x9F5B,0x9F5C,0x9F5D,0x9F5E,0x9F5F,0x9F60,0x9F61,/* 0x60-0x67 */ + 0x9F62,0x9F63,0x9F64,0x9F65,0x9F66,0x9F67,0x9F68,0x9F69,/* 0x68-0x6F */ + 0x9F6A,0x9F6B,0x9F6C,0x9F6D,0x9F6E,0x9F6F,0x9F70,0x9F71,/* 0x70-0x77 */ + 0x9F72,0x9F73,0x9F74,0x9F75,0x9F76,0x9F77,0x9F78,0x0000,/* 0x78-0x7F */ + + 0x9F79,0x9F7A,0x9F7B,0x9F7C,0x9F7D,0x9F7E,0x9F81,0x9F82,/* 0x80-0x87 */ + 0x9F8D,0x9F8E,0x9F8F,0x9F90,0x9F91,0x9F92,0x9F93,0x9F94,/* 0x88-0x8F */ + 0x9F95,0x9F96,0x9F97,0x9F98,0x9F9C,0x9F9D,0x9F9E,0x9FA1,/* 0x90-0x97 */ + 0x9FA2,0x9FA3,0x9FA4,0x9FA5,0xF92C,0xF979,0xF995,0xF9E7,/* 0x98-0x9F */ + 0xF9F1,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFA0C,0xFA0D,0xFA0E,0xFA0F,0xFA11,0xFA13,0xFA14,0xFA18,/* 0x40-0x47 */ + 0xFA1F,0xFA20,0xFA21,0xFA23,0xFA24,0xFA27,0xFA28,0xFA29,/* 0x48-0x4F */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, + c2u_C8, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, 
c2u_FA, c2u_FB, c2u_FC, c2u_FD, c2u_FE, NULL, +}; + +static const unsigned char u2c_00[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA1, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, /* 0xA4-0xA7 */ + 0xA1, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xA1, 0xE3, 0xA1, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA4, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC1, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA8, 0xA4, 0xA8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA8, 0xA8, 0xA8, 0xA6, 0xA8, 0xBA, 0x00, 0x00, /* 0xE8-0xEB */ + 0xA8, 0xAC, 0xA8, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB0, 0xA8, 0xAE, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC2, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA8, 0xB4, 0xA8, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ + 0xA8, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_01[512] = { + 0xA8, 0xA1, 0xA8, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA5, 0xA8, 0xA5, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA7, 0xA8, 0xA7, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA9, 0xA8, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xA8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xA8, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA8, 0xAD, 0xA8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0xA8, 0xB1, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA8, 0xA3, 0xA8, 0xA3, 0xA8, 0xAB, /* 0xCC-0xCF */ + 0xA8, 0xAB, 0xA8, 0xAF, 0xA8, 0xAF, 0xA8, 0xB3, /* 0xD0-0xD3 */ + 0xA8, 0xB3, 0xA8, 0xB5, 0xA8, 0xB5, 0xA8, 0xB6, /* 0xD4-0xD7 */ + 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB7, 0xA8, 0xB8, /* 0xD8-0xDB */ + 0xA8, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA8, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xA1, 0xA5, 0xA8, 0x40, 0xA8, 0x41, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA8, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA6, 0xA1, 
0xA6, 0xA2, 0xA6, 0xA3, /* 0x90-0x93 */ + 0xA6, 0xA4, 0xA6, 0xA5, 0xA6, 0xA6, 0xA6, 0xA7, /* 0x94-0x97 */ + 0xA6, 0xA8, 0xA6, 0xA9, 0xA6, 0xAA, 0xA6, 0xAB, /* 0x98-0x9B */ + 0xA6, 0xAC, 0xA6, 0xAD, 0xA6, 0xAE, 0xA6, 0xAF, /* 0x9C-0x9F */ + 0xA6, 0xB0, 0xA6, 0xB1, 0x00, 0x00, 0xA6, 0xB2, /* 0xA0-0xA3 */ + 0xA6, 0xB3, 0xA6, 0xB4, 0xA6, 0xB5, 0xA6, 0xB6, /* 0xA4-0xA7 */ + 0xA6, 0xB7, 0xA6, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA6, 0xC1, 0xA6, 0xC2, 0xA6, 0xC3, /* 0xB0-0xB3 */ + 0xA6, 0xC4, 0xA6, 0xC5, 0xA6, 0xC6, 0xA6, 0xC7, /* 0xB4-0xB7 */ + 0xA6, 0xC8, 0xA6, 0xC9, 0xA6, 0xCA, 0xA6, 0xCB, /* 0xB8-0xBB */ + 0xA6, 0xCC, 0xA6, 0xCD, 0xA6, 0xCE, 0xA6, 0xCF, /* 0xBC-0xBF */ + 0xA6, 0xD0, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0xC0-0xC3 */ + 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xD5, 0xA6, 0xD6, /* 0xC4-0xC7 */ + 0xA6, 0xD7, 0xA6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0xA7, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, 0xA7, 0xA4, /* 0x10-0x13 */ + 0xA7, 0xA5, 0xA7, 0xA6, 0xA7, 0xA8, 0xA7, 0xA9, /* 0x14-0x17 */ + 0xA7, 0xAA, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x18-0x1B */ + 0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x1C-0x1F */ + 0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xB5, /* 0x20-0x23 */ + 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, 0xA7, 0xB9, /* 0x24-0x27 */ + 0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xBC, 0xA7, 0xBD, /* 0x28-0x2B */ + 0xA7, 0xBE, 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, /* 0x2C-0x2F */ + 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, 0xA7, 0xD4, /* 0x30-0x33 */ + 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD8, 0xA7, 0xD9, /* 0x34-0x37 */ + 0xA7, 0xDA, 0xA7, 0xDB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x38-0x3B */ + 0xA7, 0xDE, 0xA7, 0xDF, 0xA7, 0xE0, 0xA7, 0xE1, /* 0x3C-0x3F */ + 0xA7, 0xE2, 0xA7, 0xE3, 0xA7, 0xE4, 0xA7, 0xE5, /* 0x40-0x43 */ + 0xA7, 0xE6, 0xA7, 0xE7, 0xA7, 0xE8, 0xA7, 0xE9, /* 0x44-0x47 */ + 0xA7, 0xEA, 0xA7, 0xEB, 0xA7, 0xEC, 0xA7, 0xED, /* 0x48-0x4B */ + 0xA7, 0xEE, 0xA7, 0xEF, 0xA7, 0xF0, 0xA7, 0xF1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA9, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x43, /* 0x10-0x13 */ + 0xA1, 0xAA, 0xA8, 0x44, 0xA1, 0xAC, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA8, 0x45, 0xA1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0xEB, 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, /* 0x30-0x33 */ + 0x00, 0x00, 0xA8, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA3, 0xFE, 0x00, 0x00, /* 0x3C-0x3F */ +}; + +static const unsigned char u2c_21[512] = 
{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE6, /* 0x00-0x03 */ + 0x00, 0x00, 0xA8, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA8, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA9, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, 0xA2, 0xF4, /* 0x60-0x63 */ + 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, 0xA2, 0xF8, /* 0x64-0x67 */ + 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, 0xA2, 0xFC, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA2, 0xA1, 0xA2, 0xA2, 0xA2, 0xA3, 0xA2, 0xA4, /* 0x70-0x73 */ + 0xA2, 0xA5, 0xA2, 0xA6, 0xA2, 0xA7, 0xA2, 0xA8, /* 0x74-0x77 */ + 0xA2, 0xA9, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xFB, 0xA1, 0xFC, 0xA1, 0xFA, 0xA1, 0xFD, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x49, 0xA8, 0x4A, /* 0x94-0x97 */ + 0xA8, 0x4B, 0xA8, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_22[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC7, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA1, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xE3, 0x00, 0x00, 0xA1, 0xCC, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xDE, 0xA8, 0x4E, /* 0x1C-0x1F */ + 0xA1, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x4F, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0xA1, 0xC4, /* 0x24-0x27 */ + 0xA1, 0xC5, 0xA1, 0xC9, 0xA1, 0xC8, 0xA1, 0xD2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD3, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xE0, 0xA1, 0xDF, 0xA1, 0xC3, 0xA1, 0xCB, /* 0x34-0x37 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA1, 0xAB, 0xA1, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xA1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x50, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xD9, 0xA1, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0xDC, 0xA1, 0xDD, 0xA8, 0x51, 0xA8, 0x52, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x53, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD0, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xD9, 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, /* 0x60-0x63 */ + 0xA2, 0xDD, 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, /* 0x64-0x67 */ + 0xA2, 0xE1, 0xA2, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA2, 0xC5, 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, /* 0x74-0x77 */ + 0xA2, 0xC9, 0xA2, 0xCA, 0xA2, 0xCB, 0xA2, 0xCC, /* 0x78-0x7B */ + 0xA2, 0xCD, 0xA2, 0xCE, 0xA2, 0xCF, 0xA2, 0xD0, /* 0x7C-0x7F */ + + 0xA2, 0xD1, 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, /* 0x80-0x83 */ + 0xA2, 0xD5, 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, /* 0x84-0x87 */ + 0xA2, 0xB1, 0xA2, 0xB2, 0xA2, 0xB3, 0xA2, 0xB4, /* 0x88-0x8B */ + 0xA2, 0xB5, 0xA2, 0xB6, 0xA2, 0xB7, 0xA2, 0xB8, /* 0x8C-0x8F */ + 0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x90-0x93 */ + 0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x94-0x97 */ + 0xA2, 0xC1, 0xA2, 0xC2, 0xA2, 0xC3, 0xA2, 0xC4, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_25[512] = { + 0xA9, 0xA4, 0xA9, 0xA5, 0xA9, 0xA6, 0xA9, 0xA7, /* 0x00-0x03 */ + 0xA9, 0xA8, 0xA9, 0xA9, 0xA9, 0xAA, 0xA9, 0xAB, /* 0x04-0x07 */ + 0xA9, 0xAC, 0xA9, 0xAD, 0xA9, 0xAE, 0xA9, 0xAF, /* 0x08-0x0B */ + 0xA9, 0xB0, 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, /* 0x0C-0x0F */ + 0xA9, 0xB4, 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x10-0x13 */ + 0xA9, 0xB8, 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, /* 0x14-0x17 */ + 0xA9, 0xBC, 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, /* 0x18-0x1B */ + 0xA9, 0xC0, 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, /* 0x1C-0x1F */ + 0xA9, 0xC4, 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, /* 0x20-0x23 */ + 0xA9, 0xC8, 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, /* 0x24-0x27 */ + 0xA9, 0xCC, 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, /* 0x28-0x2B */ + 0xA9, 0xD0, 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, /* 0x2C-0x2F */ + 0xA9, 0xD4, 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, /* 0x30-0x33 */ + 0xA9, 0xD8, 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, /* 0x34-0x37 */ + 0xA9, 0xDC, 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, /* 0x38-0x3B */ + 0xA9, 0xE0, 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, /* 0x3C-0x3F */ + 0xA9, 0xE4, 0xA9, 0xE5, 0xA9, 0xE6, 0xA9, 0xE7, /* 0x40-0x43 */ + 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, 0xA9, 0xEB, /* 0x44-0x47 */ + 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, 0xA9, 0xEF, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xA8, 0x54, 0xA8, 0x55, 0xA8, 0x56, 0xA8, 0x57, /* 0x50-0x53 */ + 0xA8, 0x58, 0xA8, 0x59, 0xA8, 0x5A, 0xA8, 0x5B, /* 0x54-0x57 */ + 0xA8, 0x5C, 0xA8, 0x5D, 0xA8, 0x5E, 0xA8, 0x5F, /* 0x58-0x5B */ + 0xA8, 0x60, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x5C-0x5F */ + 0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x60-0x63 */ + 0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x64-0x67 */ + 0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x68-0x6B */ + 0xA8, 0x70, 0xA8, 
0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x6C-0x6F */ + 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, 0xA8, 0x77, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0x80-0x83 */ + 0xA8, 0x7B, 0xA8, 0x7C, 0xA8, 0x7D, 0xA8, 0x7E, /* 0x84-0x87 */ + 0xA8, 0x80, 0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, /* 0x88-0x8B */ + 0xA8, 0x84, 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x88, /* 0x90-0x93 */ + 0xA8, 0x89, 0xA8, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xF6, 0xA1, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA8, 0x8B, 0xA8, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF4, 0xA1, 0xF3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF0, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF2, 0xA1, 0xF1, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x8D, 0xA8, 0x8E, /* 0xE0-0xE3 */ + 0xA8, 0x8F, 0xA8, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA8, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xE2, 0x00, 
0x00, 0xA1, 0xE1, 0x00, 0x00, /* 0x40-0x43 */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xA9, 0xA9, 0x65, 0xA9, 0x96, /* 0x04-0x07 */ + 0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */ + 0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */ + 0xA1, 0xBE, 0xA1, 0xBF, 0xA8, 0x93, 0xA1, 0xFE, /* 0x10-0x13 */ + 0xA1, 0xB2, 0xA1, 0xB3, 0xA1, 0xBC, 0xA1, 0xBD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA8, 0x94, 0xA8, 0x95, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA9, 0x40, 0xA9, 0x41, 0xA9, 0x42, /* 0x20-0x23 */ + 0xA9, 0x43, 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, /* 0x24-0x27 */ + 0xA9, 0x47, 0xA9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x40-0x43 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x44-0x47 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x48-0x4B */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x4C-0x4F */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x50-0x53 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x54-0x57 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x58-0x5B */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x5C-0x5F */ + 0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x60-0x63 */ + 0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x64-0x67 */ + 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x68-0x6B */ + 0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x6C-0x6F */ + 0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x70-0x73 */ + 0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x74-0x77 */ + 0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x78-0x7B */ + 0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x7C-0x7F */ + + 0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x80-0x83 */ + 0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x84-0x87 */ + 0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x88-0x8B */ + 0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x8C-0x8F */ + 0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x61, /* 0x98-0x9B */ + 0xA9, 0x62, 0xA9, 0x66, 0xA9, 0x67, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, /* 0xA0-0xA3 */ + 0xA5, 0xA4, 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, /* 0xA4-0xA7 */ + 0xA5, 0xA8, 0xA5, 0xA9, 0xA5, 0xAA, 0xA5, 0xAB, /* 0xA8-0xAB */ + 0xA5, 0xAC, 0xA5, 0xAD, 0xA5, 0xAE, 0xA5, 0xAF, /* 0xAC-0xAF */ + 0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0xB0-0xB3 */ + 0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0xB4-0xB7 */ + 0xA5, 0xB8, 0xA5, 0xB9, 0xA5, 0xBA, 0xA5, 0xBB, /* 0xB8-0xBB */ + 0xA5, 0xBC, 0xA5, 0xBD, 0xA5, 0xBE, 0xA5, 0xBF, /* 0xBC-0xBF */ + 0xA5, 0xC0, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0xC0-0xC3 */ + 0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0xC4-0xC7 */ + 0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0xC8-0xCB */ + 0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0xCC-0xCF */ + 0xA5, 0xD0, 0xA5, 
0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0xD0-0xD3 */ + 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, 0xA5, 0xD7, /* 0xD4-0xD7 */ + 0xA5, 0xD8, 0xA5, 0xD9, 0xA5, 0xDA, 0xA5, 0xDB, /* 0xD8-0xDB */ + 0xA5, 0xDC, 0xA5, 0xDD, 0xA5, 0xDE, 0xA5, 0xDF, /* 0xDC-0xDF */ + 0xA5, 0xE0, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xE0-0xE3 */ + 0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xE4-0xE7 */ + 0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xE8-0xEB */ + 0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xEC-0xEF */ + 0xA5, 0xF0, 0xA5, 0xF1, 0xA5, 0xF2, 0xA5, 0xF3, /* 0xF0-0xF3 */ + 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xA9, 0x60, 0xA9, 0x63, 0xA9, 0x64, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, /* 0x04-0x07 */ + 0xA8, 0xC8, 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, /* 0x08-0x0B */ + 0xA8, 0xCC, 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, /* 0x0C-0x0F */ + 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, /* 0x10-0x13 */ + 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, /* 0x14-0x17 */ + 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, /* 0x18-0x1B */ + 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, /* 0x1C-0x1F */ + 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, /* 0x20-0x23 */ + 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, 0xA8, 0xE7, /* 0x24-0x27 */ + 0xA8, 0xE8, 0xA8, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBB, 0xB6, 0xFE, /* 0x90-0x93 */ + 0xC8, 0xFD, 0xCB, 0xC4, 0xC9, 0xCF, 0xD6, 0xD0, /* 0x94-0x97 */ + 0xCF, 0xC2, 0xBC, 0xD7, 0xD2, 0xD2, 0xB1, 0xFB, /* 0x98-0x9B */ + 0xB6, 0xA1, 0xCC, 0xEC, 0xB5, 0xD8, 0xC8, 0xCB, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA2, 0xE5, 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, /* 0x20-0x23 */ + 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, 0xA2, 0xEC, /* 0x24-0x27 */ + 0xA2, 0xED, 0xA2, 0xEE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x28-0x2B */ + 0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x2C-0x2F */ + 0xC8, 0xD5, 0xA9, 0x5A, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x30-0x33 */ + 0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x34-0x37 */ + 0xC0, 0xCD, 0xB4, 0xFA, 0xBA, 0xF4, 0xD1, 0xA7, /* 0x38-0x3B */ + 0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0x3C-0x3F */ + 0xBC, 0xC0, 0xD0, 0xDD, 0xD7, 0xD4, 0xD6, 0xC1, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD2, 0xBB, 0xB6, 0xFE, 0xC8, 0xFD, 0xCB, 0xC4, /* 0x80-0x83 */ + 0xCE, 0xE5, 0xC1, 0xF9, 0xC6, 0xDF, 0xB0, 0xCB, /* 0x84-0x87 */ + 0xBE, 0xC5, 0xCA, 0xAE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x88-0x8B */ + 0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x8C-0x8F */ + 0xC8, 0xD5, 0xD6, 0xEA, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x90-0x93 */ + 0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x94-0x97 */ + 0xC0, 0xCD, 0xC3, 0xD8, 0xC4, 0xD0, 0xC5, 0xAE, /* 0x98-0x9B */ + 0xCA, 0xCA, 0xD3, 0xC5, 0x00, 0x00, 0xD7, 0xA2, /* 0x9C-0x9F */ + 0xCF, 0xEE, 0xD0, 0xDD, 0xD0, 0xB4, 0xA9, 0x49, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD2, 0xBD, 0xD7, 0xDA, 0xD1, 0xA7, /* 0xA8-0xAB */ + 0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0xAC-0xAF */ + 0xD2, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x4A, 0xA9, 0x4B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA9, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xA9, 0x52, 0xA9, 0x53, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xA9, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_4E[512] = { + 0xD2, 0xBB, 0xB6, 0xA1, 0x81, 0x40, 0xC6, 0xDF, /* 0x00-0x03 */ + 0x81, 0x41, 0x81, 0x42, 0x81, 0x43, 0xCD, 0xF2, /* 0x04-0x07 */ + 0xD5, 0xC9, 0xC8, 0xFD, 0xC9, 0xCF, 0xCF, 0xC2, /* 0x08-0x0B */ + 0xD8, 0xA2, 0xB2, 0xBB, 0xD3, 0xEB, 0x81, 0x44, /* 0x0C-0x0F */ + 0xD8, 0xA4, 0xB3, 0xF3, 0x81, 0x45, 0xD7, 0xA8, /* 0x10-0x13 */ + 0xC7, 0xD2, 0xD8, 0xA7, 0xCA, 0xC0, 0x81, 0x46, /* 0x14-0x17 */ + 0xC7, 0xF0, 0xB1, 0xFB, 0xD2, 0xB5, 0xB4, 0xD4, /* 0x18-0x1B */ + 0xB6, 0xAB, 0xCB, 0xBF, 0xD8, 0xA9, 
0x81, 0x47, /* 0x1C-0x1F */ + 0x81, 0x48, 0x81, 0x49, 0xB6, 0xAA, 0x81, 0x4A, /* 0x20-0x23 */ + 0xC1, 0xBD, 0xD1, 0xCF, 0x81, 0x4B, 0xC9, 0xA5, /* 0x24-0x27 */ + 0xD8, 0xAD, 0x81, 0x4C, 0xB8, 0xF6, 0xD1, 0xBE, /* 0x28-0x2B */ + 0xE3, 0xDC, 0xD6, 0xD0, 0x81, 0x4D, 0x81, 0x4E, /* 0x2C-0x2F */ + 0xB7, 0xE1, 0x81, 0x4F, 0xB4, 0xAE, 0x81, 0x50, /* 0x30-0x33 */ + 0xC1, 0xD9, 0x81, 0x51, 0xD8, 0xBC, 0x81, 0x52, /* 0x34-0x37 */ + 0xCD, 0xE8, 0xB5, 0xA4, 0xCE, 0xAA, 0xD6, 0xF7, /* 0x38-0x3B */ + 0x81, 0x53, 0xC0, 0xF6, 0xBE, 0xD9, 0xD8, 0xAF, /* 0x3C-0x3F */ + 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, 0xC4, 0xCB, /* 0x40-0x43 */ + 0x81, 0x57, 0xBE, 0xC3, 0x81, 0x58, 0xD8, 0xB1, /* 0x44-0x47 */ + 0xC3, 0xB4, 0xD2, 0xE5, 0x81, 0x59, 0xD6, 0xAE, /* 0x48-0x4B */ + 0xCE, 0xDA, 0xD5, 0xA7, 0xBA, 0xF5, 0xB7, 0xA6, /* 0x4C-0x4F */ + 0xC0, 0xD6, 0x81, 0x5A, 0xC6, 0xB9, 0xC5, 0xD2, /* 0x50-0x53 */ + 0xC7, 0xC7, 0x81, 0x5B, 0xB9, 0xD4, 0x81, 0x5C, /* 0x54-0x57 */ + 0xB3, 0xCB, 0xD2, 0xD2, 0x81, 0x5D, 0x81, 0x5E, /* 0x58-0x5B */ + 0xD8, 0xBF, 0xBE, 0xC5, 0xC6, 0xF2, 0xD2, 0xB2, /* 0x5C-0x5F */ + 0xCF, 0xB0, 0xCF, 0xE7, 0x81, 0x5F, 0x81, 0x60, /* 0x60-0x63 */ + 0x81, 0x61, 0x81, 0x62, 0xCA, 0xE9, 0x81, 0x63, /* 0x64-0x67 */ + 0x81, 0x64, 0xD8, 0xC0, 0x81, 0x65, 0x81, 0x66, /* 0x68-0x6B */ + 0x81, 0x67, 0x81, 0x68, 0x81, 0x69, 0x81, 0x6A, /* 0x6C-0x6F */ + 0xC2, 0xF2, 0xC2, 0xD2, 0x81, 0x6B, 0xC8, 0xE9, /* 0x70-0x73 */ + 0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x74-0x77 */ + 0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, /* 0x78-0x7B */ + 0x81, 0x74, 0x81, 0x75, 0xC7, 0xAC, 0x81, 0x76, /* 0x7C-0x7F */ + + 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, 0x81, 0x7A, /* 0x80-0x83 */ + 0x81, 0x7B, 0x81, 0x7C, 0xC1, 0xCB, 0x81, 0x7D, /* 0x84-0x87 */ + 0xD3, 0xE8, 0xD5, 0xF9, 0x81, 0x7E, 0xCA, 0xC2, /* 0x88-0x8B */ + 0xB6, 0xFE, 0xD8, 0xA1, 0xD3, 0xDA, 0xBF, 0xF7, /* 0x8C-0x8F */ + 0x81, 0x80, 0xD4, 0xC6, 0xBB, 0xA5, 0xD8, 0xC1, /* 0x90-0x93 */ + 0xCE, 0xE5, 0xBE, 0xAE, 0x81, 0x81, 0x81, 0x82, /* 0x94-0x97 */ + 0xD8, 0xA8, 0x81, 0x83, 0xD1, 0xC7, 0xD0, 0xA9, /* 0x98-0x9B */ + 0x81, 0x84, 0x81, 0x85, 0x81, 0x86, 0xD8, 0xBD, /* 0x9C-0x9F */ + 0xD9, 0xEF, 0xCD, 0xF6, 0xBF, 0xBA, 0x81, 0x87, /* 0xA0-0xA3 */ + 0xBD, 0xBB, 0xBA, 0xA5, 0xD2, 0xE0, 0xB2, 0xFA, /* 0xA4-0xA7 */ + 0xBA, 0xE0, 0xC4, 0xB6, 0x81, 0x88, 0xCF, 0xED, /* 0xA8-0xAB */ + 0xBE, 0xA9, 0xCD, 0xA4, 0xC1, 0xC1, 0x81, 0x89, /* 0xAC-0xAF */ + 0x81, 0x8A, 0x81, 0x8B, 0xC7, 0xD7, 0xD9, 0xF1, /* 0xB0-0xB3 */ + 0x81, 0x8C, 0xD9, 0xF4, 0x81, 0x8D, 0x81, 0x8E, /* 0xB4-0xB7 */ + 0x81, 0x8F, 0x81, 0x90, 0xC8, 0xCB, 0xD8, 0xE9, /* 0xB8-0xBB */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0xD2, 0xDA, /* 0xBC-0xBF */ + 0xCA, 0xB2, 0xC8, 0xCA, 0xD8, 0xEC, 0xD8, 0xEA, /* 0xC0-0xC3 */ + 0xD8, 0xC6, 0xBD, 0xF6, 0xC6, 0xCD, 0xB3, 0xF0, /* 0xC4-0xC7 */ + 0x81, 0x94, 0xD8, 0xEB, 0xBD, 0xF1, 0xBD, 0xE9, /* 0xC8-0xCB */ + 0x81, 0x95, 0xC8, 0xD4, 0xB4, 0xD3, 0x81, 0x96, /* 0xCC-0xCF */ + 0x81, 0x97, 0xC2, 0xD8, 0x81, 0x98, 0xB2, 0xD6, /* 0xD0-0xD3 */ + 0xD7, 0xD0, 0xCA, 0xCB, 0xCB, 0xFB, 0xD5, 0xCC, /* 0xD4-0xD7 */ + 0xB8, 0xB6, 0xCF, 0xC9, 0x81, 0x99, 0x81, 0x9A, /* 0xD8-0xDB */ + 0x81, 0x9B, 0xD9, 0xDA, 0xD8, 0xF0, 0xC7, 0xAA, /* 0xDC-0xDF */ + 0x81, 0x9C, 0xD8, 0xEE, 0x81, 0x9D, 0xB4, 0xFA, /* 0xE0-0xE3 */ + 0xC1, 0xEE, 0xD2, 0xD4, 0x81, 0x9E, 0x81, 0x9F, /* 0xE4-0xE7 */ + 0xD8, 0xED, 0x81, 0xA0, 0xD2, 0xC7, 0xD8, 0xEF, /* 0xE8-0xEB */ + 0xC3, 0xC7, 0x81, 0xA1, 0x81, 0xA2, 0x81, 0xA3, /* 0xEC-0xEF */ + 0xD1, 0xF6, 0x81, 0xA4, 0xD6, 0xD9, 0xD8, 0xF2, /* 0xF0-0xF3 */ + 0x81, 0xA5, 0xD8, 0xF5, 
0xBC, 0xFE, 0xBC, 0xDB, /* 0xF4-0xF7 */ + 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, 0xC8, 0xCE, /* 0xF8-0xFB */ + 0x81, 0xA9, 0xB7, 0xDD, 0x81, 0xAA, 0xB7, 0xC2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0x81, 0xAB, 0xC6, 0xF3, 0x81, 0xAC, 0x81, 0xAD, /* 0x00-0x03 */ + 0x81, 0xAE, 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, /* 0x04-0x07 */ + 0x81, 0xB2, 0xD8, 0xF8, 0xD2, 0xC1, 0x81, 0xB3, /* 0x08-0x0B */ + 0x81, 0xB4, 0xCE, 0xE9, 0xBC, 0xBF, 0xB7, 0xFC, /* 0x0C-0x0F */ + 0xB7, 0xA5, 0xD0, 0xDD, 0x81, 0xB5, 0x81, 0xB6, /* 0x10-0x13 */ + 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, 0xD6, 0xDA, /* 0x14-0x17 */ + 0xD3, 0xC5, 0xBB, 0xEF, 0xBB, 0xE1, 0xD8, 0xF1, /* 0x18-0x1B */ + 0x81, 0xBA, 0x81, 0xBB, 0xC9, 0xA1, 0xCE, 0xB0, /* 0x1C-0x1F */ + 0xB4, 0xAB, 0x81, 0xBC, 0xD8, 0xF3, 0x81, 0xBD, /* 0x20-0x23 */ + 0xC9, 0xCB, 0xD8, 0xF6, 0xC2, 0xD7, 0xD8, 0xF7, /* 0x24-0x27 */ + 0x81, 0xBE, 0x81, 0xBF, 0xCE, 0xB1, 0xD8, 0xF9, /* 0x28-0x2B */ + 0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0xB2, 0xAE, /* 0x2C-0x2F */ + 0xB9, 0xC0, 0x81, 0xC3, 0xD9, 0xA3, 0x81, 0xC4, /* 0x30-0x33 */ + 0xB0, 0xE9, 0x81, 0xC5, 0xC1, 0xE6, 0x81, 0xC6, /* 0x34-0x37 */ + 0xC9, 0xEC, 0x81, 0xC7, 0xCB, 0xC5, 0x81, 0xC8, /* 0x38-0x3B */ + 0xCB, 0xC6, 0xD9, 0xA4, 0x81, 0xC9, 0x81, 0xCA, /* 0x3C-0x3F */ + 0x81, 0xCB, 0x81, 0xCC, 0x81, 0xCD, 0xB5, 0xE8, /* 0x40-0x43 */ + 0x81, 0xCE, 0x81, 0xCF, 0xB5, 0xAB, 0x81, 0xD0, /* 0x44-0x47 */ + 0x81, 0xD1, 0x81, 0xD2, 0x81, 0xD3, 0x81, 0xD4, /* 0x48-0x4B */ + 0x81, 0xD5, 0xCE, 0xBB, 0xB5, 0xCD, 0xD7, 0xA1, /* 0x4C-0x4F */ + 0xD7, 0xF4, 0xD3, 0xD3, 0x81, 0xD6, 0xCC, 0xE5, /* 0x50-0x53 */ + 0x81, 0xD7, 0xBA, 0xCE, 0x81, 0xD8, 0xD9, 0xA2, /* 0x54-0x57 */ + 0xD9, 0xDC, 0xD3, 0xE0, 0xD8, 0xFD, 0xB7, 0xF0, /* 0x58-0x5B */ + 0xD7, 0xF7, 0xD8, 0xFE, 0xD8, 0xFA, 0xD9, 0xA1, /* 0x5C-0x5F */ + 0xC4, 0xE3, 0x81, 0xD9, 0x81, 0xDA, 0xD3, 0xB6, /* 0x60-0x63 */ + 0xD8, 0xF4, 0xD9, 0xDD, 0x81, 0xDB, 0xD8, 0xFB, /* 0x64-0x67 */ + 0x81, 0xDC, 0xC5, 0xE5, 0x81, 0xDD, 0x81, 0xDE, /* 0x68-0x6B */ + 0xC0, 0xD0, 0x81, 0xDF, 0x81, 0xE0, 0xD1, 0xF0, /* 0x6C-0x6F */ + 0xB0, 0xDB, 0x81, 0xE1, 0x81, 0xE2, 0xBC, 0xD1, /* 0x70-0x73 */ + 0xD9, 0xA6, 0x81, 0xE3, 0xD9, 0xA5, 0x81, 0xE4, /* 0x74-0x77 */ + 0x81, 0xE5, 0x81, 0xE6, 0x81, 0xE7, 0xD9, 0xAC, /* 0x78-0x7B */ + 0xD9, 0xAE, 0x81, 0xE8, 0xD9, 0xAB, 0xCA, 0xB9, /* 0x7C-0x7F */ + + 0x81, 0xE9, 0x81, 0xEA, 0x81, 0xEB, 0xD9, 0xA9, /* 0x80-0x83 */ + 0xD6, 0xB6, 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, /* 0x84-0x87 */ + 0xB3, 0xDE, 0xD9, 0xA8, 0x81, 0xEF, 0xC0, 0xFD, /* 0x88-0x8B */ + 0x81, 0xF0, 0xCA, 0xCC, 0x81, 0xF1, 0xD9, 0xAA, /* 0x8C-0x8F */ + 0x81, 0xF2, 0xD9, 0xA7, 0x81, 0xF3, 0x81, 0xF4, /* 0x90-0x93 */ + 0xD9, 0xB0, 0x81, 0xF5, 0x81, 0xF6, 0xB6, 0xB1, /* 0x94-0x97 */ + 0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0xB9, 0xA9, /* 0x98-0x9B */ + 0x81, 0xFA, 0xD2, 0xC0, 0x81, 0xFB, 0x81, 0xFC, /* 0x9C-0x9F */ + 0xCF, 0xC0, 0x81, 0xFD, 0x81, 0xFE, 0xC2, 0xC2, /* 0xA0-0xA3 */ + 0x82, 0x40, 0xBD, 0xC4, 0xD5, 0xEC, 0xB2, 0xE0, /* 0xA4-0xA7 */ + 0xC7, 0xC8, 0xBF, 0xEB, 0xD9, 0xAD, 0x82, 0x41, /* 0xA8-0xAB */ + 0xD9, 0xAF, 0x82, 0x42, 0xCE, 0xEA, 0xBA, 0xEE, /* 0xAC-0xAF */ + 0x82, 0x43, 0x82, 0x44, 0x82, 0x45, 0x82, 0x46, /* 0xB0-0xB3 */ + 0x82, 0x47, 0xC7, 0xD6, 0x82, 0x48, 0x82, 0x49, /* 0xB4-0xB7 */ + 0x82, 0x4A, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0xB8-0xBB */ + 0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0xB1, 0xE3, /* 0xBC-0xBF */ + 0x82, 0x51, 0x82, 0x52, 0x82, 0x53, 0xB4, 0xD9, /* 0xC0-0xC3 */ + 0xB6, 0xED, 0xD9, 0xB4, 0x82, 0x54, 0x82, 0x55, /* 0xC4-0xC7 */ + 0x82, 0x56, 0x82, 0x57, 
0xBF, 0xA1, 0x82, 0x58, /* 0xC8-0xCB */ + 0x82, 0x59, 0x82, 0x5A, 0xD9, 0xDE, 0xC7, 0xCE, /* 0xCC-0xCF */ + 0xC0, 0xFE, 0xD9, 0xB8, 0x82, 0x5B, 0x82, 0x5C, /* 0xD0-0xD3 */ + 0x82, 0x5D, 0x82, 0x5E, 0x82, 0x5F, 0xCB, 0xD7, /* 0xD4-0xD7 */ + 0xB7, 0xFD, 0x82, 0x60, 0xD9, 0xB5, 0x82, 0x61, /* 0xD8-0xDB */ + 0xD9, 0xB7, 0xB1, 0xA3, 0xD3, 0xE1, 0xD9, 0xB9, /* 0xDC-0xDF */ + 0x82, 0x62, 0xD0, 0xC5, 0x82, 0x63, 0xD9, 0xB6, /* 0xE0-0xE3 */ + 0x82, 0x64, 0x82, 0x65, 0xD9, 0xB1, 0x82, 0x66, /* 0xE4-0xE7 */ + 0xD9, 0xB2, 0xC1, 0xA9, 0xD9, 0xB3, 0x82, 0x67, /* 0xE8-0xEB */ + 0x82, 0x68, 0xBC, 0xF3, 0xD0, 0xDE, 0xB8, 0xA9, /* 0xEC-0xEF */ + 0x82, 0x69, 0xBE, 0xE3, 0x82, 0x6A, 0xD9, 0xBD, /* 0xF0-0xF3 */ + 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0xF4-0xF7 */ + 0xD9, 0xBA, 0x82, 0x6F, 0xB0, 0xB3, 0x82, 0x70, /* 0xF8-0xFB */ + 0x82, 0x71, 0x82, 0x72, 0xD9, 0xC2, 0x82, 0x73, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x00-0x03 */ + 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, 0x82, 0x7B, /* 0x04-0x07 */ + 0x82, 0x7C, 0x82, 0x7D, 0x82, 0x7E, 0x82, 0x80, /* 0x08-0x0B */ + 0xD9, 0xC4, 0xB1, 0xB6, 0x82, 0x81, 0xD9, 0xBF, /* 0x0C-0x0F */ + 0x82, 0x82, 0x82, 0x83, 0xB5, 0xB9, 0x82, 0x84, /* 0x10-0x13 */ + 0xBE, 0xF3, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x14-0x17 */ + 0xCC, 0xC8, 0xBA, 0xF2, 0xD2, 0xD0, 0x82, 0x88, /* 0x18-0x1B */ + 0xD9, 0xC3, 0x82, 0x89, 0x82, 0x8A, 0xBD, 0xE8, /* 0x1C-0x1F */ + 0x82, 0x8B, 0xB3, 0xAB, 0x82, 0x8C, 0x82, 0x8D, /* 0x20-0x23 */ + 0x82, 0x8E, 0xD9, 0xC5, 0xBE, 0xEB, 0x82, 0x8F, /* 0x24-0x27 */ + 0xD9, 0xC6, 0xD9, 0xBB, 0xC4, 0xDF, 0x82, 0x90, /* 0x28-0x2B */ + 0xD9, 0xBE, 0xD9, 0xC1, 0xD9, 0xC0, 0x82, 0x91, /* 0x2C-0x2F */ + 0x82, 0x92, 0x82, 0x93, 0x82, 0x94, 0x82, 0x95, /* 0x30-0x33 */ + 0x82, 0x96, 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, /* 0x34-0x37 */ + 0x82, 0x9A, 0x82, 0x9B, 0xD5, 0xAE, 0x82, 0x9C, /* 0x38-0x3B */ + 0xD6, 0xB5, 0x82, 0x9D, 0xC7, 0xE3, 0x82, 0x9E, /* 0x3C-0x3F */ + 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, 0xD9, 0xC8, /* 0x40-0x43 */ + 0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0xBC, 0xD9, /* 0x44-0x47 */ + 0xD9, 0xCA, 0x82, 0xA5, 0x82, 0xA6, 0x82, 0xA7, /* 0x48-0x4B */ + 0xD9, 0xBC, 0x82, 0xA8, 0xD9, 0xCB, 0xC6, 0xAB, /* 0x4C-0x4F */ + 0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x50-0x53 */ + 0x82, 0xAD, 0xD9, 0xC9, 0x82, 0xAE, 0x82, 0xAF, /* 0x54-0x57 */ + 0x82, 0xB0, 0x82, 0xB1, 0xD7, 0xF6, 0x82, 0xB2, /* 0x58-0x5B */ + 0xCD, 0xA3, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x5C-0x5F */ + 0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x60-0x63 */ + 0x82, 0xBA, 0xBD, 0xA1, 0x82, 0xBB, 0x82, 0xBC, /* 0x64-0x67 */ + 0x82, 0xBD, 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, /* 0x68-0x6B */ + 0xD9, 0xCC, 0x82, 0xC1, 0x82, 0xC2, 0x82, 0xC3, /* 0x6C-0x6F */ + 0x82, 0xC4, 0x82, 0xC5, 0x82, 0xC6, 0x82, 0xC7, /* 0x70-0x73 */ + 0x82, 0xC8, 0x82, 0xC9, 0xC5, 0xBC, 0xCD, 0xB5, /* 0x74-0x77 */ + 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0xD9, 0xCD, /* 0x78-0x7B */ + 0x82, 0xCD, 0x82, 0xCE, 0xD9, 0xC7, 0xB3, 0xA5, /* 0x7C-0x7F */ + + 0xBF, 0xFE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x80-0x83 */ + 0x82, 0xD2, 0xB8, 0xB5, 0x82, 0xD3, 0x82, 0xD4, /* 0x84-0x87 */ + 0xC0, 0xFC, 0x82, 0xD5, 0x82, 0xD6, 0x82, 0xD7, /* 0x88-0x8B */ + 0x82, 0xD8, 0xB0, 0xF8, 0x82, 0xD9, 0x82, 0xDA, /* 0x8C-0x8F */ + 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, 0x82, 0xDE, /* 0x90-0x93 */ + 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, 0x82, 0xE2, /* 0x94-0x97 */ + 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, 0x82, 0xE6, /* 0x98-0x9B */ + 0x82, 0xE7, 0x82, 0xE8, 
0x82, 0xE9, 0x82, 0xEA, /* 0x9C-0x9F */ + 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, 0xB4, 0xF6, /* 0xA0-0xA3 */ + 0x82, 0xEE, 0xD9, 0xCE, 0x82, 0xEF, 0xD9, 0xCF, /* 0xA4-0xA7 */ + 0xB4, 0xA2, 0xD9, 0xD0, 0x82, 0xF0, 0x82, 0xF1, /* 0xA8-0xAB */ + 0xB4, 0xDF, 0x82, 0xF2, 0x82, 0xF3, 0x82, 0xF4, /* 0xAC-0xAF */ + 0x82, 0xF5, 0x82, 0xF6, 0xB0, 0xC1, 0x82, 0xF7, /* 0xB0-0xB3 */ + 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, 0x82, 0xFB, /* 0xB4-0xB7 */ + 0x82, 0xFC, 0x82, 0xFD, 0xD9, 0xD1, 0xC9, 0xB5, /* 0xB8-0xBB */ + 0x82, 0xFE, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xBC-0xBF */ + 0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xC0-0xC3 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xC4-0xC7 */ + 0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xC8-0xCB */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0xCF, 0xF1, /* 0xCC-0xCF */ + 0x83, 0x52, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0xD0-0xD3 */ + 0x83, 0x56, 0x83, 0x57, 0xD9, 0xD2, 0x83, 0x58, /* 0xD4-0xD7 */ + 0x83, 0x59, 0x83, 0x5A, 0xC1, 0xC5, 0x83, 0x5B, /* 0xD8-0xDB */ + 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, 0x83, 0x5F, /* 0xDC-0xDF */ + 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0xE0-0xE3 */ + 0x83, 0x64, 0x83, 0x65, 0xD9, 0xD6, 0xC9, 0xAE, /* 0xE4-0xE7 */ + 0x83, 0x66, 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, /* 0xE8-0xEB */ + 0xD9, 0xD5, 0xD9, 0xD4, 0xD9, 0xD7, 0x83, 0x6A, /* 0xEC-0xEF */ + 0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0xCB, 0xDB, /* 0xF0-0xF3 */ + 0x83, 0x6E, 0xBD, 0xA9, 0x83, 0x6F, 0x83, 0x70, /* 0xF4-0xF7 */ + 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, 0xC6, 0xA7, /* 0xF8-0xFB */ + 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, 0x83, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, 0x83, 0x7B, /* 0x00-0x03 */ + 0x83, 0x7C, 0x83, 0x7D, 0xD9, 0xD3, 0xD9, 0xD8, /* 0x04-0x07 */ + 0x83, 0x7E, 0x83, 0x80, 0x83, 0x81, 0xD9, 0xD9, /* 0x08-0x0B */ + 0x83, 0x82, 0x83, 0x83, 0x83, 0x84, 0x83, 0x85, /* 0x0C-0x0F */ + 0x83, 0x86, 0x83, 0x87, 0xC8, 0xE5, 0x83, 0x88, /* 0x10-0x13 */ + 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, 0x83, 0x8C, /* 0x14-0x17 */ + 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, 0x83, 0x90, /* 0x18-0x1B */ + 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, 0x83, 0x94, /* 0x1C-0x1F */ + 0x83, 0x95, 0xC0, 0xDC, 0x83, 0x96, 0x83, 0x97, /* 0x20-0x23 */ + 0x83, 0x98, 0x83, 0x99, 0x83, 0x9A, 0x83, 0x9B, /* 0x24-0x27 */ + 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, 0x83, 0x9F, /* 0x28-0x2B */ + 0x83, 0xA0, 0x83, 0xA1, 0x83, 0xA2, 0x83, 0xA3, /* 0x2C-0x2F */ + 0x83, 0xA4, 0x83, 0xA5, 0x83, 0xA6, 0x83, 0xA7, /* 0x30-0x33 */ + 0x83, 0xA8, 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, /* 0x34-0x37 */ + 0x83, 0xAC, 0x83, 0xAD, 0x83, 0xAE, 0x83, 0xAF, /* 0x38-0x3B */ + 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, 0xB6, 0xF9, /* 0x3C-0x3F */ + 0xD8, 0xA3, 0xD4, 0xCA, 0x83, 0xB3, 0xD4, 0xAA, /* 0x40-0x43 */ + 0xD0, 0xD6, 0xB3, 0xE4, 0xD5, 0xD7, 0x83, 0xB4, /* 0x44-0x47 */ + 0xCF, 0xC8, 0xB9, 0xE2, 0x83, 0xB5, 0xBF, 0xCB, /* 0x48-0x4B */ + 0x83, 0xB6, 0xC3, 0xE2, 0x83, 0xB7, 0x83, 0xB8, /* 0x4C-0x4F */ + 0x83, 0xB9, 0xB6, 0xD2, 0x83, 0xBA, 0x83, 0xBB, /* 0x50-0x53 */ + 0xCD, 0xC3, 0xD9, 0xEE, 0xD9, 0xF0, 0x83, 0xBC, /* 0x54-0x57 */ + 0x83, 0xBD, 0x83, 0xBE, 0xB5, 0xB3, 0x83, 0xBF, /* 0x58-0x5B */ + 0xB6, 0xB5, 0x83, 0xC0, 0x83, 0xC1, 0x83, 0xC2, /* 0x5C-0x5F */ + 0x83, 0xC3, 0x83, 0xC4, 0xBE, 0xA4, 0x83, 0xC5, /* 0x60-0x63 */ + 0x83, 0xC6, 0xC8, 0xEB, 0x83, 0xC7, 0x83, 0xC8, /* 0x64-0x67 */ + 0xC8, 0xAB, 0x83, 0xC9, 0x83, 0xCA, 0xB0, 0xCB, /* 0x68-0x6B */ + 0xB9, 0xAB, 0xC1, 0xF9, 0xD9, 0xE2, 0x83, 0xCB, /* 0x6C-0x6F */ + 0xC0, 0xBC, 0xB9, 0xB2, 0x83, 
0xCC, 0xB9, 0xD8, /* 0x70-0x73 */ + 0xD0, 0xCB, 0xB1, 0xF8, 0xC6, 0xE4, 0xBE, 0xDF, /* 0x74-0x77 */ + 0xB5, 0xE4, 0xD7, 0xC8, 0x83, 0xCD, 0xD1, 0xF8, /* 0x78-0x7B */ + 0xBC, 0xE6, 0xCA, 0xDE, 0x83, 0xCE, 0x83, 0xCF, /* 0x7C-0x7F */ + + 0xBC, 0xBD, 0xD9, 0xE6, 0xD8, 0xE7, 0x83, 0xD0, /* 0x80-0x83 */ + 0x83, 0xD1, 0xC4, 0xDA, 0x83, 0xD2, 0x83, 0xD3, /* 0x84-0x87 */ + 0xB8, 0xD4, 0xC8, 0xBD, 0x83, 0xD4, 0x83, 0xD5, /* 0x88-0x8B */ + 0xB2, 0xE1, 0xD4, 0xD9, 0x83, 0xD6, 0x83, 0xD7, /* 0x8C-0x8F */ + 0x83, 0xD8, 0x83, 0xD9, 0xC3, 0xB0, 0x83, 0xDA, /* 0x90-0x93 */ + 0x83, 0xDB, 0xC3, 0xE1, 0xDA, 0xA2, 0xC8, 0xDF, /* 0x94-0x97 */ + 0x83, 0xDC, 0xD0, 0xB4, 0x83, 0xDD, 0xBE, 0xFC, /* 0x98-0x9B */ + 0xC5, 0xA9, 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, /* 0x9C-0x9F */ + 0xB9, 0xDA, 0x83, 0xE1, 0xDA, 0xA3, 0x83, 0xE2, /* 0xA0-0xA3 */ + 0xD4, 0xA9, 0xDA, 0xA4, 0x83, 0xE3, 0x83, 0xE4, /* 0xA4-0xA7 */ + 0x83, 0xE5, 0x83, 0xE6, 0x83, 0xE7, 0xD9, 0xFB, /* 0xA8-0xAB */ + 0xB6, 0xAC, 0x83, 0xE8, 0x83, 0xE9, 0xB7, 0xEB, /* 0xAC-0xAF */ + 0xB1, 0xF9, 0xD9, 0xFC, 0xB3, 0xE5, 0xBE, 0xF6, /* 0xB0-0xB3 */ + 0x83, 0xEA, 0xBF, 0xF6, 0xD2, 0xB1, 0xC0, 0xE4, /* 0xB4-0xB7 */ + 0x83, 0xEB, 0x83, 0xEC, 0x83, 0xED, 0xB6, 0xB3, /* 0xB8-0xBB */ + 0xD9, 0xFE, 0xD9, 0xFD, 0x83, 0xEE, 0x83, 0xEF, /* 0xBC-0xBF */ + 0xBE, 0xBB, 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, /* 0xC0-0xC3 */ + 0xC6, 0xE0, 0x83, 0xF3, 0xD7, 0xBC, 0xDA, 0xA1, /* 0xC4-0xC7 */ + 0x83, 0xF4, 0xC1, 0xB9, 0x83, 0xF5, 0xB5, 0xF2, /* 0xC8-0xCB */ + 0xC1, 0xE8, 0x83, 0xF6, 0x83, 0xF7, 0xBC, 0xF5, /* 0xCC-0xCF */ + 0x83, 0xF8, 0xB4, 0xD5, 0x83, 0xF9, 0x83, 0xFA, /* 0xD0-0xD3 */ + 0x83, 0xFB, 0x83, 0xFC, 0x83, 0xFD, 0x83, 0xFE, /* 0xD4-0xD7 */ + 0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0xC1, 0xDD, /* 0xD8-0xDB */ + 0x84, 0x43, 0xC4, 0xFD, 0x84, 0x44, 0x84, 0x45, /* 0xDC-0xDF */ + 0xBC, 0xB8, 0xB7, 0xB2, 0x84, 0x46, 0x84, 0x47, /* 0xE0-0xE3 */ + 0xB7, 0xEF, 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, /* 0xE4-0xE7 */ + 0x84, 0x4B, 0x84, 0x4C, 0x84, 0x4D, 0xD9, 0xEC, /* 0xE8-0xEB */ + 0x84, 0x4E, 0xC6, 0xBE, 0x84, 0x4F, 0xBF, 0xAD, /* 0xEC-0xEF */ + 0xBB, 0xCB, 0x84, 0x50, 0x84, 0x51, 0xB5, 0xCA, /* 0xF0-0xF3 */ + 0x84, 0x52, 0xDB, 0xC9, 0xD0, 0xD7, 0x84, 0x53, /* 0xF4-0xF7 */ + 0xCD, 0xB9, 0xB0, 0xBC, 0xB3, 0xF6, 0xBB, 0xF7, /* 0xF8-0xFB */ + 0xDB, 0xCA, 0xBA, 0xAF, 0x84, 0x54, 0xD4, 0xE4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xB5, 0xB6, 0xB5, 0xF3, 0xD8, 0xD6, 0xC8, 0xD0, /* 0x00-0x03 */ + 0x84, 0x55, 0x84, 0x56, 0xB7, 0xD6, 0xC7, 0xD0, /* 0x04-0x07 */ + 0xD8, 0xD7, 0x84, 0x57, 0xBF, 0xAF, 0x84, 0x58, /* 0x08-0x0B */ + 0x84, 0x59, 0xDB, 0xBB, 0xD8, 0xD8, 0x84, 0x5A, /* 0x0C-0x0F */ + 0x84, 0x5B, 0xD0, 0xCC, 0xBB, 0xAE, 0x84, 0x5C, /* 0x10-0x13 */ + 0x84, 0x5D, 0x84, 0x5E, 0xEB, 0xBE, 0xC1, 0xD0, /* 0x14-0x17 */ + 0xC1, 0xF5, 0xD4, 0xF2, 0xB8, 0xD5, 0xB4, 0xB4, /* 0x18-0x1B */ + 0x84, 0x5F, 0xB3, 0xF5, 0x84, 0x60, 0x84, 0x61, /* 0x1C-0x1F */ + 0xC9, 0xBE, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x20-0x23 */ + 0xC5, 0xD0, 0x84, 0x65, 0x84, 0x66, 0x84, 0x67, /* 0x24-0x27 */ + 0xC5, 0xD9, 0xC0, 0xFB, 0x84, 0x68, 0xB1, 0xF0, /* 0x28-0x2B */ + 0x84, 0x69, 0xD8, 0xD9, 0xB9, 0xCE, 0x84, 0x6A, /* 0x2C-0x2F */ + 0xB5, 0xBD, 0x84, 0x6B, 0x84, 0x6C, 0xD8, 0xDA, /* 0x30-0x33 */ + 0x84, 0x6D, 0x84, 0x6E, 0xD6, 0xC6, 0xCB, 0xA2, /* 0x34-0x37 */ + 0xC8, 0xAF, 0xC9, 0xB2, 0xB4, 0xCC, 0xBF, 0xCC, /* 0x38-0x3B */ + 0x84, 0x6F, 0xB9, 0xF4, 0x84, 0x70, 0xD8, 0xDB, /* 0x3C-0x3F */ + 0xD8, 0xDC, 0xB6, 0xE7, 0xBC, 0xC1, 0xCC, 0xEA, /* 0x40-0x43 */ + 0x84, 0x71, 0x84, 0x72, 0x84, 
0x73, 0x84, 0x74, /* 0x44-0x47 */ + 0x84, 0x75, 0x84, 0x76, 0xCF, 0xF7, 0x84, 0x77, /* 0x48-0x4B */ + 0xD8, 0xDD, 0xC7, 0xB0, 0x84, 0x78, 0x84, 0x79, /* 0x4C-0x4F */ + 0xB9, 0xD0, 0xBD, 0xA3, 0x84, 0x7A, 0x84, 0x7B, /* 0x50-0x53 */ + 0xCC, 0xDE, 0x84, 0x7C, 0xC6, 0xCA, 0x84, 0x7D, /* 0x54-0x57 */ + 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, 0x84, 0x82, /* 0x58-0x5B */ + 0xD8, 0xE0, 0x84, 0x83, 0xD8, 0xDE, 0x84, 0x84, /* 0x5C-0x5F */ + 0x84, 0x85, 0xD8, 0xDF, 0x84, 0x86, 0x84, 0x87, /* 0x60-0x63 */ + 0x84, 0x88, 0xB0, 0xFE, 0x84, 0x89, 0xBE, 0xE7, /* 0x64-0x67 */ + 0x84, 0x8A, 0xCA, 0xA3, 0xBC, 0xF4, 0x84, 0x8B, /* 0x68-0x6B */ + 0x84, 0x8C, 0x84, 0x8D, 0x84, 0x8E, 0xB8, 0xB1, /* 0x6C-0x6F */ + 0x84, 0x8F, 0x84, 0x90, 0xB8, 0xEE, 0x84, 0x91, /* 0x70-0x73 */ + 0x84, 0x92, 0x84, 0x93, 0x84, 0x94, 0x84, 0x95, /* 0x74-0x77 */ + 0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x78-0x7B */ + 0x84, 0x9A, 0xD8, 0xE2, 0x84, 0x9B, 0xBD, 0xCB, /* 0x7C-0x7F */ + + 0x84, 0x9C, 0xD8, 0xE4, 0xD8, 0xE3, 0x84, 0x9D, /* 0x80-0x83 */ + 0x84, 0x9E, 0x84, 0x9F, 0x84, 0xA0, 0x84, 0xA1, /* 0x84-0x87 */ + 0xC5, 0xFC, 0x84, 0xA2, 0x84, 0xA3, 0x84, 0xA4, /* 0x88-0x8B */ + 0x84, 0xA5, 0x84, 0xA6, 0x84, 0xA7, 0x84, 0xA8, /* 0x8C-0x8F */ + 0xD8, 0xE5, 0x84, 0xA9, 0x84, 0xAA, 0xD8, 0xE6, /* 0x90-0x93 */ + 0x84, 0xAB, 0x84, 0xAC, 0x84, 0xAD, 0x84, 0xAE, /* 0x94-0x97 */ + 0x84, 0xAF, 0x84, 0xB0, 0x84, 0xB1, 0xC1, 0xA6, /* 0x98-0x9B */ + 0x84, 0xB2, 0xC8, 0xB0, 0xB0, 0xEC, 0xB9, 0xA6, /* 0x9C-0x9F */ + 0xBC, 0xD3, 0xCE, 0xF1, 0xDB, 0xBD, 0xC1, 0xD3, /* 0xA0-0xA3 */ + 0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0xA4-0xA7 */ + 0xB6, 0xAF, 0xD6, 0xFA, 0xC5, 0xAC, 0xBD, 0xD9, /* 0xA8-0xAB */ + 0xDB, 0xBE, 0xDB, 0xBF, 0x84, 0xB7, 0x84, 0xB8, /* 0xAC-0xAF */ + 0x84, 0xB9, 0xC0, 0xF8, 0xBE, 0xA2, 0xC0, 0xCD, /* 0xB0-0xB3 */ + 0x84, 0xBA, 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, /* 0xB4-0xB7 */ + 0x84, 0xBE, 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, /* 0xB8-0xBB */ + 0x84, 0xC2, 0x84, 0xC3, 0xDB, 0xC0, 0xCA, 0xC6, /* 0xBC-0xBF */ + 0x84, 0xC4, 0x84, 0xC5, 0x84, 0xC6, 0xB2, 0xAA, /* 0xC0-0xC3 */ + 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, 0xD3, 0xC2, /* 0xC4-0xC7 */ + 0x84, 0xCA, 0xC3, 0xE3, 0x84, 0xCB, 0xD1, 0xAB, /* 0xC8-0xCB */ + 0x84, 0xCC, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0xCC-0xCF */ + 0xDB, 0xC2, 0x84, 0xD0, 0xC0, 0xD5, 0x84, 0xD1, /* 0xD0-0xD3 */ + 0x84, 0xD2, 0x84, 0xD3, 0xDB, 0xC3, 0x84, 0xD4, /* 0xD4-0xD7 */ + 0xBF, 0xB1, 0x84, 0xD5, 0x84, 0xD6, 0x84, 0xD7, /* 0xD8-0xDB */ + 0x84, 0xD8, 0x84, 0xD9, 0x84, 0xDA, 0xC4, 0xBC, /* 0xDC-0xDF */ + 0x84, 0xDB, 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, /* 0xE0-0xE3 */ + 0xC7, 0xDA, 0x84, 0xDF, 0x84, 0xE0, 0x84, 0xE1, /* 0xE4-0xE7 */ + 0x84, 0xE2, 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, /* 0xE8-0xEB */ + 0x84, 0xE6, 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, /* 0xEC-0xEF */ + 0xDB, 0xC4, 0x84, 0xEA, 0x84, 0xEB, 0x84, 0xEC, /* 0xF0-0xF3 */ + 0x84, 0xED, 0x84, 0xEE, 0x84, 0xEF, 0x84, 0xF0, /* 0xF4-0xF7 */ + 0x84, 0xF1, 0xD9, 0xE8, 0xC9, 0xD7, 0x84, 0xF2, /* 0xF8-0xFB */ + 0x84, 0xF3, 0x84, 0xF4, 0xB9, 0xB4, 0xCE, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0xD4, 0xC8, 0x84, 0xF5, 0x84, 0xF6, 0x84, 0xF7, /* 0x00-0x03 */ + 0x84, 0xF8, 0xB0, 0xFC, 0xB4, 0xD2, 0x84, 0xF9, /* 0x04-0x07 */ + 0xD0, 0xD9, 0x84, 0xFA, 0x84, 0xFB, 0x84, 0xFC, /* 0x08-0x0B */ + 0x84, 0xFD, 0xD9, 0xE9, 0x84, 0xFE, 0xDE, 0xCB, /* 0x0C-0x0F */ + 0xD9, 0xEB, 0x85, 0x40, 0x85, 0x41, 0x85, 0x42, /* 0x10-0x13 */ + 0x85, 0x43, 0xD8, 0xB0, 0xBB, 0xAF, 0xB1, 0xB1, /* 0x14-0x17 */ + 0x85, 0x44, 0xB3, 0xD7, 0xD8, 
0xCE, 0x85, 0x45, /* 0x18-0x1B */ + 0x85, 0x46, 0xD4, 0xD1, 0x85, 0x47, 0x85, 0x48, /* 0x1C-0x1F */ + 0xBD, 0xB3, 0xBF, 0xEF, 0x85, 0x49, 0xCF, 0xBB, /* 0x20-0x23 */ + 0x85, 0x4A, 0x85, 0x4B, 0xD8, 0xD0, 0x85, 0x4C, /* 0x24-0x27 */ + 0x85, 0x4D, 0x85, 0x4E, 0xB7, 0xCB, 0x85, 0x4F, /* 0x28-0x2B */ + 0x85, 0x50, 0x85, 0x51, 0xD8, 0xD1, 0x85, 0x52, /* 0x2C-0x2F */ + 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, 0x85, 0x56, /* 0x30-0x33 */ + 0x85, 0x57, 0x85, 0x58, 0x85, 0x59, 0x85, 0x5A, /* 0x34-0x37 */ + 0x85, 0x5B, 0xC6, 0xA5, 0xC7, 0xF8, 0xD2, 0xBD, /* 0x38-0x3B */ + 0x85, 0x5C, 0x85, 0x5D, 0xD8, 0xD2, 0xC4, 0xE4, /* 0x3C-0x3F */ + 0x85, 0x5E, 0xCA, 0xAE, 0x85, 0x5F, 0xC7, 0xA7, /* 0x40-0x43 */ + 0x85, 0x60, 0xD8, 0xA6, 0x85, 0x61, 0xC9, 0xFD, /* 0x44-0x47 */ + 0xCE, 0xE7, 0xBB, 0xDC, 0xB0, 0xEB, 0x85, 0x62, /* 0x48-0x4B */ + 0x85, 0x63, 0x85, 0x64, 0xBB, 0xAA, 0xD0, 0xAD, /* 0x4C-0x4F */ + 0x85, 0x65, 0xB1, 0xB0, 0xD7, 0xE4, 0xD7, 0xBF, /* 0x50-0x53 */ + 0x85, 0x66, 0xB5, 0xA5, 0xC2, 0xF4, 0xC4, 0xCF, /* 0x54-0x57 */ + 0x85, 0x67, 0x85, 0x68, 0xB2, 0xA9, 0x85, 0x69, /* 0x58-0x5B */ + 0xB2, 0xB7, 0x85, 0x6A, 0xB1, 0xE5, 0xDF, 0xB2, /* 0x5C-0x5F */ + 0xD5, 0xBC, 0xBF, 0xA8, 0xC2, 0xAC, 0xD8, 0xD5, /* 0x60-0x63 */ + 0xC2, 0xB1, 0x85, 0x6B, 0xD8, 0xD4, 0xCE, 0xD4, /* 0x64-0x67 */ + 0x85, 0x6C, 0xDA, 0xE0, 0x85, 0x6D, 0xCE, 0xC0, /* 0x68-0x6B */ + 0x85, 0x6E, 0x85, 0x6F, 0xD8, 0xB4, 0xC3, 0xAE, /* 0x6C-0x6F */ + 0xD3, 0xA1, 0xCE, 0xA3, 0x85, 0x70, 0xBC, 0xB4, /* 0x70-0x73 */ + 0xC8, 0xB4, 0xC2, 0xD1, 0x85, 0x71, 0xBE, 0xED, /* 0x74-0x77 */ + 0xD0, 0xB6, 0x85, 0x72, 0xDA, 0xE1, 0x85, 0x73, /* 0x78-0x7B */ + 0x85, 0x74, 0x85, 0x75, 0x85, 0x76, 0xC7, 0xE4, /* 0x7C-0x7F */ + + 0x85, 0x77, 0x85, 0x78, 0xB3, 0xA7, 0x85, 0x79, /* 0x80-0x83 */ + 0xB6, 0xF2, 0xCC, 0xFC, 0xC0, 0xFA, 0x85, 0x7A, /* 0x84-0x87 */ + 0x85, 0x7B, 0xC0, 0xF7, 0x85, 0x7C, 0xD1, 0xB9, /* 0x88-0x8B */ + 0xD1, 0xE1, 0xD8, 0xC7, 0x85, 0x7D, 0x85, 0x7E, /* 0x8C-0x8F */ + 0x85, 0x80, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x90-0x93 */ + 0x85, 0x84, 0xB2, 0xDE, 0x85, 0x85, 0x85, 0x86, /* 0x94-0x97 */ + 0xC0, 0xE5, 0x85, 0x87, 0xBA, 0xF1, 0x85, 0x88, /* 0x98-0x9B */ + 0x85, 0x89, 0xD8, 0xC8, 0x85, 0x8A, 0xD4, 0xAD, /* 0x9C-0x9F */ + 0x85, 0x8B, 0x85, 0x8C, 0xCF, 0xE1, 0xD8, 0xC9, /* 0xA0-0xA3 */ + 0x85, 0x8D, 0xD8, 0xCA, 0xCF, 0xC3, 0x85, 0x8E, /* 0xA4-0xA7 */ + 0xB3, 0xF8, 0xBE, 0xC7, 0x85, 0x8F, 0x85, 0x90, /* 0xA8-0xAB */ + 0x85, 0x91, 0x85, 0x92, 0xD8, 0xCB, 0x85, 0x93, /* 0xAC-0xAF */ + 0x85, 0x94, 0x85, 0x95, 0x85, 0x96, 0x85, 0x97, /* 0xB0-0xB3 */ + 0x85, 0x98, 0x85, 0x99, 0xDB, 0xCC, 0x85, 0x9A, /* 0xB4-0xB7 */ + 0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0xC8, 0xA5, /* 0xB8-0xBB */ + 0x85, 0x9E, 0x85, 0x9F, 0x85, 0xA0, 0xCF, 0xD8, /* 0xBC-0xBF */ + 0x85, 0xA1, 0xC8, 0xFE, 0xB2, 0xCE, 0x85, 0xA2, /* 0xC0-0xC3 */ + 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, 0x85, 0xA6, /* 0xC4-0xC7 */ + 0xD3, 0xD6, 0xB2, 0xE6, 0xBC, 0xB0, 0xD3, 0xD1, /* 0xC8-0xCB */ + 0xCB, 0xAB, 0xB7, 0xB4, 0x85, 0xA7, 0x85, 0xA8, /* 0xCC-0xCF */ + 0x85, 0xA9, 0xB7, 0xA2, 0x85, 0xAA, 0x85, 0xAB, /* 0xD0-0xD3 */ + 0xCA, 0xE5, 0x85, 0xAC, 0xC8, 0xA1, 0xCA, 0xDC, /* 0xD4-0xD7 */ + 0xB1, 0xE4, 0xD0, 0xF0, 0x85, 0xAD, 0xC5, 0xD1, /* 0xD8-0xDB */ + 0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0xDB, 0xC5, /* 0xDC-0xDF */ + 0xB5, 0xFE, 0x85, 0xB1, 0x85, 0xB2, 0xBF, 0xDA, /* 0xE0-0xE3 */ + 0xB9, 0xC5, 0xBE, 0xE4, 0xC1, 0xED, 0x85, 0xB3, /* 0xE4-0xE7 */ + 0xDF, 0xB6, 0xDF, 0xB5, 0xD6, 0xBB, 0xBD, 0xD0, /* 0xE8-0xEB */ + 0xD5, 0xD9, 0xB0, 0xC8, 0xB6, 0xA3, 0xBF, 0xC9, /* 0xEC-0xEF */ + 0xCC, 0xA8, 0xDF, 
0xB3, 0xCA, 0xB7, 0xD3, 0xD2, /* 0xF0-0xF3 */ + 0x85, 0xB4, 0xD8, 0xCF, 0xD2, 0xB6, 0xBA, 0xC5, /* 0xF4-0xF7 */ + 0xCB, 0xBE, 0xCC, 0xBE, 0x85, 0xB5, 0xDF, 0xB7, /* 0xF8-0xFB */ + 0xB5, 0xF0, 0xDF, 0xB4, 0x85, 0xB6, 0x85, 0xB7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_54[512] = { + 0x85, 0xB8, 0xD3, 0xF5, 0x85, 0xB9, 0xB3, 0xD4, /* 0x00-0x03 */ + 0xB8, 0xF7, 0x85, 0xBA, 0xDF, 0xBA, 0x85, 0xBB, /* 0x04-0x07 */ + 0xBA, 0xCF, 0xBC, 0xAA, 0xB5, 0xF5, 0x85, 0xBC, /* 0x08-0x0B */ + 0xCD, 0xAC, 0xC3, 0xFB, 0xBA, 0xF3, 0xC0, 0xF4, /* 0x0C-0x0F */ + 0xCD, 0xC2, 0xCF, 0xF2, 0xDF, 0xB8, 0xCF, 0xC5, /* 0x10-0x13 */ + 0x85, 0xBD, 0xC2, 0xC0, 0xDF, 0xB9, 0xC2, 0xF0, /* 0x14-0x17 */ + 0x85, 0xBE, 0x85, 0xBF, 0x85, 0xC0, 0xBE, 0xFD, /* 0x18-0x1B */ + 0x85, 0xC1, 0xC1, 0xDF, 0xCD, 0xCC, 0xD2, 0xF7, /* 0x1C-0x1F */ + 0xB7, 0xCD, 0xDF, 0xC1, 0x85, 0xC2, 0xDF, 0xC4, /* 0x20-0x23 */ + 0x85, 0xC3, 0x85, 0xC4, 0xB7, 0xF1, 0xB0, 0xC9, /* 0x24-0x27 */ + 0xB6, 0xD6, 0xB7, 0xD4, 0x85, 0xC5, 0xBA, 0xAC, /* 0x28-0x2B */ + 0xCC, 0xFD, 0xBF, 0xD4, 0xCB, 0xB1, 0xC6, 0xF4, /* 0x2C-0x2F */ + 0x85, 0xC6, 0xD6, 0xA8, 0xDF, 0xC5, 0x85, 0xC7, /* 0x30-0x33 */ + 0xCE, 0xE2, 0xB3, 0xB3, 0x85, 0xC8, 0x85, 0xC9, /* 0x34-0x37 */ + 0xCE, 0xFC, 0xB4, 0xB5, 0x85, 0xCA, 0xCE, 0xC7, /* 0x38-0x3B */ + 0xBA, 0xF0, 0x85, 0xCB, 0xCE, 0xE1, 0x85, 0xCC, /* 0x3C-0x3F */ + 0xD1, 0xBD, 0x85, 0xCD, 0x85, 0xCE, 0xDF, 0xC0, /* 0x40-0x43 */ + 0x85, 0xCF, 0x85, 0xD0, 0xB4, 0xF4, 0x85, 0xD1, /* 0x44-0x47 */ + 0xB3, 0xCA, 0x85, 0xD2, 0xB8, 0xE6, 0xDF, 0xBB, /* 0x48-0x4B */ + 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, 0x85, 0xD6, /* 0x4C-0x4F */ + 0xC4, 0xC5, 0x85, 0xD7, 0xDF, 0xBC, 0xDF, 0xBD, /* 0x50-0x53 */ + 0xDF, 0xBE, 0xC5, 0xBB, 0xDF, 0xBF, 0xDF, 0xC2, /* 0x54-0x57 */ + 0xD4, 0xB1, 0xDF, 0xC3, 0x85, 0xD8, 0xC7, 0xBA, /* 0x58-0x5B */ + 0xCE, 0xD8, 0x85, 0xD9, 0x85, 0xDA, 0x85, 0xDB, /* 0x5C-0x5F */ + 0x85, 0xDC, 0x85, 0xDD, 0xC4, 0xD8, 0x85, 0xDE, /* 0x60-0x63 */ + 0xDF, 0xCA, 0x85, 0xDF, 0xDF, 0xCF, 0x85, 0xE0, /* 0x64-0x67 */ + 0xD6, 0xDC, 0x85, 0xE1, 0x85, 0xE2, 0x85, 0xE3, /* 0x68-0x6B */ + 0x85, 0xE4, 0x85, 0xE5, 0x85, 0xE6, 0x85, 0xE7, /* 0x6C-0x6F */ + 0x85, 0xE8, 0xDF, 0xC9, 0xDF, 0xDA, 0xCE, 0xB6, /* 0x70-0x73 */ + 0x85, 0xE9, 0xBA, 0xC7, 0xDF, 0xCE, 0xDF, 0xC8, /* 0x74-0x77 */ + 0xC5, 0xDE, 0x85, 0xEA, 0x85, 0xEB, 0xC9, 0xEB, /* 0x78-0x7B */ + 0xBA, 0xF4, 0xC3, 0xFC, 0x85, 0xEC, 0x85, 0xED, /* 0x7C-0x7F */ + + 0xBE, 0xD7, 0x85, 0xEE, 0xDF, 0xC6, 0x85, 0xEF, /* 0x80-0x83 */ + 0xDF, 0xCD, 0x85, 0xF0, 0xC5, 0xD8, 0x85, 0xF1, /* 0x84-0x87 */ + 0x85, 0xF2, 0x85, 0xF3, 0x85, 0xF4, 0xD5, 0xA6, /* 0x88-0x8B */ + 0xBA, 0xCD, 0x85, 0xF5, 0xBE, 0xCC, 0xD3, 0xBD, /* 0x8C-0x8F */ + 0xB8, 0xC0, 0x85, 0xF6, 0xD6, 0xE4, 0x85, 0xF7, /* 0x90-0x93 */ + 0xDF, 0xC7, 0xB9, 0xBE, 0xBF, 0xA7, 0x85, 0xF8, /* 0x94-0x97 */ + 0x85, 0xF9, 0xC1, 0xFC, 0xDF, 0xCB, 0xDF, 0xCC, /* 0x98-0x9B */ + 0x85, 0xFA, 0xDF, 0xD0, 0x85, 0xFB, 0x85, 0xFC, /* 0x9C-0x9F */ + 0x85, 0xFD, 0x85, 0xFE, 0x86, 0x40, 0xDF, 0xDB, /* 0xA0-0xA3 */ + 0xDF, 0xE5, 0x86, 0x41, 0xDF, 0xD7, 0xDF, 0xD6, /* 0xA4-0xA7 */ + 0xD7, 0xC9, 0xDF, 0xE3, 0xDF, 0xE4, 0xE5, 0xEB, /* 0xA8-0xAB */ + 0xD2, 0xA7, 0xDF, 0xD2, 0x86, 0x42, 0xBF, 0xA9, /* 0xAC-0xAF */ + 0x86, 0x43, 0xD4, 0xDB, 0x86, 0x44, 0xBF, 0xC8, /* 0xB0-0xB3 */ + 0xDF, 0xD4, 0x86, 0x45, 0x86, 0x46, 0x86, 0x47, /* 0xB4-0xB7 */ + 0xCF, 0xCC, 0x86, 0x48, 0x86, 0x49, 0xDF, 0xDD, /* 0xB8-0xBB */ + 0x86, 0x4A, 0xD1, 0xCA, 0x86, 0x4B, 0xDF, 0xDE, /* 0xBC-0xBF */ + 0xB0, 0xA7, 0xC6, 0xB7, 0xDF, 0xD3, 0x86, 0x4C, /* 0xC0-0xC3 */ + 0xBA, 0xE5, 0x86, 
0x4D, 0xB6, 0xDF, 0xCD, 0xDB, /* 0xC4-0xC7 */ + 0xB9, 0xFE, 0xD4, 0xD5, 0x86, 0x4E, 0x86, 0x4F, /* 0xC8-0xCB */ + 0xDF, 0xDF, 0xCF, 0xEC, 0xB0, 0xA5, 0xDF, 0xE7, /* 0xCC-0xCF */ + 0xDF, 0xD1, 0xD1, 0xC6, 0xDF, 0xD5, 0xDF, 0xD8, /* 0xD0-0xD3 */ + 0xDF, 0xD9, 0xDF, 0xDC, 0x86, 0x50, 0xBB, 0xA9, /* 0xD4-0xD7 */ + 0x86, 0x51, 0xDF, 0xE0, 0xDF, 0xE1, 0x86, 0x52, /* 0xD8-0xDB */ + 0xDF, 0xE2, 0xDF, 0xE6, 0xDF, 0xE8, 0xD3, 0xB4, /* 0xDC-0xDF */ + 0x86, 0x53, 0x86, 0x54, 0x86, 0x55, 0x86, 0x56, /* 0xE0-0xE3 */ + 0x86, 0x57, 0xB8, 0xE7, 0xC5, 0xB6, 0xDF, 0xEA, /* 0xE4-0xE7 */ + 0xC9, 0xDA, 0xC1, 0xA8, 0xC4, 0xC4, 0x86, 0x58, /* 0xE8-0xEB */ + 0x86, 0x59, 0xBF, 0xDE, 0xCF, 0xF8, 0x86, 0x5A, /* 0xEC-0xEF */ + 0x86, 0x5B, 0x86, 0x5C, 0xD5, 0xDC, 0xDF, 0xEE, /* 0xF0-0xF3 */ + 0x86, 0x5D, 0x86, 0x5E, 0x86, 0x5F, 0x86, 0x60, /* 0xF4-0xF7 */ + 0x86, 0x61, 0x86, 0x62, 0xB2, 0xB8, 0x86, 0x63, /* 0xF8-0xFB */ + 0xBA, 0xDF, 0xDF, 0xEC, 0x86, 0x64, 0xDB, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x86, 0x65, 0xD1, 0xE4, 0x86, 0x66, 0x86, 0x67, /* 0x00-0x03 */ + 0x86, 0x68, 0x86, 0x69, 0xCB, 0xF4, 0xB4, 0xBD, /* 0x04-0x07 */ + 0x86, 0x6A, 0xB0, 0xA6, 0x86, 0x6B, 0x86, 0x6C, /* 0x08-0x0B */ + 0x86, 0x6D, 0x86, 0x6E, 0x86, 0x6F, 0xDF, 0xF1, /* 0x0C-0x0F */ + 0xCC, 0xC6, 0xDF, 0xF2, 0x86, 0x70, 0x86, 0x71, /* 0x10-0x13 */ + 0xDF, 0xED, 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, /* 0x14-0x17 */ + 0x86, 0x75, 0x86, 0x76, 0x86, 0x77, 0xDF, 0xE9, /* 0x18-0x1B */ + 0x86, 0x78, 0x86, 0x79, 0x86, 0x7A, 0x86, 0x7B, /* 0x1C-0x1F */ + 0xDF, 0xEB, 0x86, 0x7C, 0xDF, 0xEF, 0xDF, 0xF0, /* 0x20-0x23 */ + 0xBB, 0xBD, 0x86, 0x7D, 0x86, 0x7E, 0xDF, 0xF3, /* 0x24-0x27 */ + 0x86, 0x80, 0x86, 0x81, 0xDF, 0xF4, 0x86, 0x82, /* 0x28-0x2B */ + 0xBB, 0xA3, 0x86, 0x83, 0xCA, 0xDB, 0xCE, 0xA8, /* 0x2C-0x2F */ + 0xE0, 0xA7, 0xB3, 0xAA, 0x86, 0x84, 0xE0, 0xA6, /* 0x30-0x33 */ + 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, 0xE0, 0xA1, /* 0x34-0x37 */ + 0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0x38-0x3B */ + 0xDF, 0xFE, 0x86, 0x8C, 0xCD, 0xD9, 0xDF, 0xFC, /* 0x3C-0x3F */ + 0x86, 0x8D, 0xDF, 0xFA, 0x86, 0x8E, 0xBF, 0xD0, /* 0x40-0x43 */ + 0xD7, 0xC4, 0x86, 0x8F, 0xC9, 0xCC, 0x86, 0x90, /* 0x44-0x47 */ + 0x86, 0x91, 0xDF, 0xF8, 0xB0, 0xA1, 0x86, 0x92, /* 0x48-0x4B */ + 0x86, 0x93, 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, /* 0x4C-0x4F */ + 0xDF, 0xFD, 0x86, 0x97, 0x86, 0x98, 0x86, 0x99, /* 0x50-0x53 */ + 0x86, 0x9A, 0xDF, 0xFB, 0xE0, 0xA2, 0x86, 0x9B, /* 0x54-0x57 */ + 0x86, 0x9C, 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, /* 0x58-0x5B */ + 0xE0, 0xA8, 0x86, 0xA0, 0x86, 0xA1, 0x86, 0xA2, /* 0x5C-0x5F */ + 0x86, 0xA3, 0xB7, 0xC8, 0x86, 0xA4, 0x86, 0xA5, /* 0x60-0x63 */ + 0xC6, 0xA1, 0xC9, 0xB6, 0xC0, 0xB2, 0xDF, 0xF5, /* 0x64-0x67 */ + 0x86, 0xA6, 0x86, 0xA7, 0xC5, 0xBE, 0x86, 0xA8, /* 0x68-0x6B */ + 0xD8, 0xC4, 0xDF, 0xF9, 0xC4, 0xF6, 0x86, 0xA9, /* 0x6C-0x6F */ + 0x86, 0xAA, 0x86, 0xAB, 0x86, 0xAC, 0x86, 0xAD, /* 0x70-0x73 */ + 0x86, 0xAE, 0xE0, 0xA3, 0xE0, 0xA4, 0xE0, 0xA5, /* 0x74-0x77 */ + 0xD0, 0xA5, 0x86, 0xAF, 0x86, 0xB0, 0xE0, 0xB4, /* 0x78-0x7B */ + 0xCC, 0xE4, 0x86, 0xB1, 0xE0, 0xB1, 0x86, 0xB2, /* 0x7C-0x7F */ + + 0xBF, 0xA6, 0xE0, 0xAF, 0xCE, 0xB9, 0xE0, 0xAB, /* 0x80-0x83 */ + 0xC9, 0xC6, 0x86, 0xB3, 0x86, 0xB4, 0xC0, 0xAE, /* 0x84-0x87 */ + 0xE0, 0xAE, 0xBA, 0xED, 0xBA, 0xB0, 0xE0, 0xA9, /* 0x88-0x8B */ + 0x86, 0xB5, 0x86, 0xB6, 0x86, 0xB7, 0xDF, 0xF6, /* 0x8C-0x8F */ + 0x86, 0xB8, 0xE0, 0xB3, 0x86, 0xB9, 0x86, 0xBA, /* 0x90-0x93 */ + 0xE0, 0xB8, 0x86, 0xBB, 0x86, 0xBC, 0x86, 0xBD, /* 0x94-0x97 */ + 0xB4, 0xAD, 0xE0, 
0xB9, 0x86, 0xBE, 0x86, 0xBF, /* 0x98-0x9B */ + 0xCF, 0xB2, 0xBA, 0xC8, 0x86, 0xC0, 0xE0, 0xB0, /* 0x9C-0x9F */ + 0x86, 0xC1, 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, /* 0xA0-0xA3 */ + 0x86, 0xC5, 0x86, 0xC6, 0x86, 0xC7, 0xD0, 0xFA, /* 0xA4-0xA7 */ + 0x86, 0xC8, 0x86, 0xC9, 0x86, 0xCA, 0x86, 0xCB, /* 0xA8-0xAB */ + 0x86, 0xCC, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0xAC-0xAF */ + 0x86, 0xD0, 0xE0, 0xAC, 0x86, 0xD1, 0xD4, 0xFB, /* 0xB0-0xB3 */ + 0x86, 0xD2, 0xDF, 0xF7, 0x86, 0xD3, 0xC5, 0xE7, /* 0xB4-0xB7 */ + 0x86, 0xD4, 0xE0, 0xAD, 0x86, 0xD5, 0xD3, 0xF7, /* 0xB8-0xBB */ + 0x86, 0xD6, 0xE0, 0xB6, 0xE0, 0xB7, 0x86, 0xD7, /* 0xBC-0xBF */ + 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, 0x86, 0xDB, /* 0xC0-0xC3 */ + 0xE0, 0xC4, 0xD0, 0xE1, 0x86, 0xDC, 0x86, 0xDD, /* 0xC4-0xC7 */ + 0x86, 0xDE, 0xE0, 0xBC, 0x86, 0xDF, 0x86, 0xE0, /* 0xC8-0xCB */ + 0xE0, 0xC9, 0xE0, 0xCA, 0x86, 0xE1, 0x86, 0xE2, /* 0xCC-0xCF */ + 0x86, 0xE3, 0xE0, 0xBE, 0xE0, 0xAA, 0xC9, 0xA4, /* 0xD0-0xD3 */ + 0xE0, 0xC1, 0x86, 0xE4, 0xE0, 0xB2, 0x86, 0xE5, /* 0xD4-0xD7 */ + 0x86, 0xE6, 0x86, 0xE7, 0x86, 0xE8, 0x86, 0xE9, /* 0xD8-0xDB */ + 0xCA, 0xC8, 0xE0, 0xC3, 0x86, 0xEA, 0xE0, 0xB5, /* 0xDC-0xDF */ + 0x86, 0xEB, 0xCE, 0xCB, 0x86, 0xEC, 0xCB, 0xC3, /* 0xE0-0xE3 */ + 0xE0, 0xCD, 0xE0, 0xC6, 0xE0, 0xC2, 0x86, 0xED, /* 0xE4-0xE7 */ + 0xE0, 0xCB, 0x86, 0xEE, 0xE0, 0xBA, 0xE0, 0xBF, /* 0xE8-0xEB */ + 0xE0, 0xC0, 0x86, 0xEF, 0x86, 0xF0, 0xE0, 0xC5, /* 0xEC-0xEF */ + 0x86, 0xF1, 0x86, 0xF2, 0xE0, 0xC7, 0xE0, 0xC8, /* 0xF0-0xF3 */ + 0x86, 0xF3, 0xE0, 0xCC, 0x86, 0xF4, 0xE0, 0xBB, /* 0xF4-0xF7 */ + 0x86, 0xF5, 0x86, 0xF6, 0x86, 0xF7, 0x86, 0xF8, /* 0xF8-0xFB */ + 0x86, 0xF9, 0xCB, 0xD4, 0xE0, 0xD5, 0x86, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0xE0, 0xD6, 0xE0, 0xD2, 0x86, 0xFB, 0x86, 0xFC, /* 0x00-0x03 */ + 0x86, 0xFD, 0x86, 0xFE, 0x87, 0x40, 0x87, 0x41, /* 0x04-0x07 */ + 0xE0, 0xD0, 0xBC, 0xCE, 0x87, 0x42, 0x87, 0x43, /* 0x08-0x0B */ + 0xE0, 0xD1, 0x87, 0x44, 0xB8, 0xC2, 0xD8, 0xC5, /* 0x0C-0x0F */ + 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, 0x87, 0x48, /* 0x10-0x13 */ + 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, 0x87, 0x4C, /* 0x14-0x17 */ + 0xD0, 0xEA, 0x87, 0x4D, 0x87, 0x4E, 0xC2, 0xEF, /* 0x18-0x1B */ + 0x87, 0x4F, 0x87, 0x50, 0xE0, 0xCF, 0xE0, 0xBD, /* 0x1C-0x1F */ + 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, 0xE0, 0xD4, /* 0x20-0x23 */ + 0xE0, 0xD3, 0x87, 0x54, 0x87, 0x55, 0xE0, 0xD7, /* 0x24-0x27 */ + 0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0x28-0x2B */ + 0xE0, 0xDC, 0xE0, 0xD8, 0x87, 0x5A, 0x87, 0x5B, /* 0x2C-0x2F */ + 0x87, 0x5C, 0xD6, 0xF6, 0xB3, 0xB0, 0x87, 0x5D, /* 0x30-0x33 */ + 0xD7, 0xEC, 0x87, 0x5E, 0xCB, 0xBB, 0x87, 0x5F, /* 0x34-0x37 */ + 0x87, 0x60, 0xE0, 0xDA, 0x87, 0x61, 0xCE, 0xFB, /* 0x38-0x3B */ + 0x87, 0x62, 0x87, 0x63, 0x87, 0x64, 0xBA, 0xD9, /* 0x3C-0x3F */ + 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, 0x87, 0x68, /* 0x40-0x43 */ + 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, 0x87, 0x6C, /* 0x44-0x47 */ + 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, 0x87, 0x70, /* 0x48-0x4B */ + 0xE0, 0xE1, 0xE0, 0xDD, 0xD2, 0xAD, 0x87, 0x71, /* 0x4C-0x4F */ + 0x87, 0x72, 0x87, 0x73, 0x87, 0x74, 0x87, 0x75, /* 0x50-0x53 */ + 0xE0, 0xE2, 0x87, 0x76, 0x87, 0x77, 0xE0, 0xDB, /* 0x54-0x57 */ + 0xE0, 0xD9, 0xE0, 0xDF, 0x87, 0x78, 0x87, 0x79, /* 0x58-0x5B */ + 0xE0, 0xE0, 0x87, 0x7A, 0x87, 0x7B, 0x87, 0x7C, /* 0x5C-0x5F */ + 0x87, 0x7D, 0x87, 0x7E, 0xE0, 0xDE, 0x87, 0x80, /* 0x60-0x63 */ + 0xE0, 0xE4, 0x87, 0x81, 0x87, 0x82, 0x87, 0x83, /* 0x64-0x67 */ + 0xC6, 0xF7, 0xD8, 0xAC, 0xD4, 0xEB, 0xE0, 0xE6, /* 0x68-0x6B */ + 0xCA, 0xC9, 0x87, 0x84, 
0x87, 0x85, 0x87, 0x86, /* 0x6C-0x6F */ + 0x87, 0x87, 0xE0, 0xE5, 0x87, 0x88, 0x87, 0x89, /* 0x70-0x73 */ + 0x87, 0x8A, 0x87, 0x8B, 0xB8, 0xC1, 0x87, 0x8C, /* 0x74-0x77 */ + 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, 0xE0, 0xE7, /* 0x78-0x7B */ + 0xE0, 0xE8, 0x87, 0x90, 0x87, 0x91, 0x87, 0x92, /* 0x7C-0x7F */ + + 0x87, 0x93, 0x87, 0x94, 0x87, 0x95, 0x87, 0x96, /* 0x80-0x83 */ + 0x87, 0x97, 0xE0, 0xE9, 0xE0, 0xE3, 0x87, 0x98, /* 0x84-0x87 */ + 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, 0x87, 0x9C, /* 0x88-0x8B */ + 0x87, 0x9D, 0x87, 0x9E, 0xBA, 0xBF, 0xCC, 0xE7, /* 0x8C-0x8F */ + 0x87, 0x9F, 0x87, 0xA0, 0x87, 0xA1, 0xE0, 0xEA, /* 0x90-0x93 */ + 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, 0x87, 0xA5, /* 0x94-0x97 */ + 0x87, 0xA6, 0x87, 0xA7, 0x87, 0xA8, 0x87, 0xA9, /* 0x98-0x9B */ + 0x87, 0xAA, 0x87, 0xAB, 0x87, 0xAC, 0x87, 0xAD, /* 0x9C-0x9F */ + 0x87, 0xAE, 0x87, 0xAF, 0x87, 0xB0, 0xCF, 0xF9, /* 0xA0-0xA3 */ + 0x87, 0xB1, 0x87, 0xB2, 0x87, 0xB3, 0x87, 0xB4, /* 0xA4-0xA7 */ + 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, 0x87, 0xB8, /* 0xA8-0xAB */ + 0x87, 0xB9, 0x87, 0xBA, 0x87, 0xBB, 0xE0, 0xEB, /* 0xAC-0xAF */ + 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, 0x87, 0xBF, /* 0xB0-0xB3 */ + 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, 0xC8, 0xC2, /* 0xB4-0xB7 */ + 0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0xB8-0xBB */ + 0xBD, 0xC0, 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, /* 0xBC-0xBF */ + 0x87, 0xCA, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0xC0-0xC3 */ + 0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0xC4-0xC7 */ + 0x87, 0xD2, 0x87, 0xD3, 0xC4, 0xD2, 0x87, 0xD4, /* 0xC8-0xCB */ + 0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0xCC-0xCF */ + 0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0xD0-0xD3 */ + 0xE0, 0xEC, 0x87, 0xDD, 0x87, 0xDE, 0xE0, 0xED, /* 0xD4-0xD7 */ + 0x87, 0xDF, 0x87, 0xE0, 0xC7, 0xF4, 0xCB, 0xC4, /* 0xD8-0xDB */ + 0x87, 0xE1, 0xE0, 0xEE, 0xBB, 0xD8, 0xD8, 0xB6, /* 0xDC-0xDF */ + 0xD2, 0xF2, 0xE0, 0xEF, 0xCD, 0xC5, 0x87, 0xE2, /* 0xE0-0xE3 */ + 0xB6, 0xDA, 0x87, 0xE3, 0x87, 0xE4, 0x87, 0xE5, /* 0xE4-0xE7 */ + 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, 0xE0, 0xF1, /* 0xE8-0xEB */ + 0x87, 0xE9, 0xD4, 0xB0, 0x87, 0xEA, 0x87, 0xEB, /* 0xEC-0xEF */ + 0xC0, 0xA7, 0xB4, 0xD1, 0x87, 0xEC, 0x87, 0xED, /* 0xF0-0xF3 */ + 0xCE, 0xA7, 0xE0, 0xF0, 0x87, 0xEE, 0x87, 0xEF, /* 0xF4-0xF7 */ + 0x87, 0xF0, 0xE0, 0xF2, 0xB9, 0xCC, 0x87, 0xF1, /* 0xF8-0xFB */ + 0x87, 0xF2, 0xB9, 0xFA, 0xCD, 0xBC, 0xE0, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, 0xC6, 0xD4, /* 0x00-0x03 */ + 0xE0, 0xF4, 0x87, 0xF6, 0xD4, 0xB2, 0x87, 0xF7, /* 0x04-0x07 */ + 0xC8, 0xA6, 0xE0, 0xF6, 0xE0, 0xF5, 0x87, 0xF8, /* 0x08-0x0B */ + 0x87, 0xF9, 0x87, 0xFA, 0x87, 0xFB, 0x87, 0xFC, /* 0x0C-0x0F */ + 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x40, 0x88, 0x41, /* 0x10-0x13 */ + 0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x14-0x17 */ + 0x88, 0x46, 0x88, 0x47, 0x88, 0x48, 0x88, 0x49, /* 0x18-0x1B */ + 0xE0, 0xF7, 0x88, 0x4A, 0x88, 0x4B, 0xCD, 0xC1, /* 0x1C-0x1F */ + 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, 0xCA, 0xA5, /* 0x20-0x23 */ + 0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x24-0x27 */ + 0xD4, 0xDA, 0xDB, 0xD7, 0xDB, 0xD9, 0x88, 0x53, /* 0x28-0x2B */ + 0xDB, 0xD8, 0xB9, 0xE7, 0xDB, 0xDC, 0xDB, 0xDD, /* 0x2C-0x2F */ + 0xB5, 0xD8, 0x88, 0x54, 0x88, 0x55, 0xDB, 0xDA, /* 0x30-0x33 */ + 0x88, 0x56, 0x88, 0x57, 0x88, 0x58, 0x88, 0x59, /* 0x34-0x37 */ + 0x88, 0x5A, 0xDB, 0xDB, 0xB3, 0xA1, 0xDB, 0xDF, /* 0x38-0x3B */ + 0x88, 0x5B, 0x88, 0x5C, 0xBB, 0xF8, 0x88, 0x5D, /* 0x3C-0x3F */ + 0xD6, 0xB7, 0x88, 0x5E, 
0xDB, 0xE0, 0x88, 0x5F, /* 0x40-0x43 */ + 0x88, 0x60, 0x88, 0x61, 0x88, 0x62, 0xBE, 0xF9, /* 0x44-0x47 */ + 0x88, 0x63, 0x88, 0x64, 0xB7, 0xBB, 0x88, 0x65, /* 0x48-0x4B */ + 0xDB, 0xD0, 0xCC, 0xAE, 0xBF, 0xB2, 0xBB, 0xB5, /* 0x4C-0x4F */ + 0xD7, 0xF8, 0xBF, 0xD3, 0x88, 0x66, 0x88, 0x67, /* 0x50-0x53 */ + 0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0xBF, 0xE9, /* 0x54-0x57 */ + 0x88, 0x6B, 0x88, 0x6C, 0xBC, 0xE1, 0xCC, 0xB3, /* 0x58-0x5B */ + 0xDB, 0xDE, 0xB0, 0xD3, 0xCE, 0xEB, 0xB7, 0xD8, /* 0x5C-0x5F */ + 0xD7, 0xB9, 0xC6, 0xC2, 0x88, 0x6D, 0x88, 0x6E, /* 0x60-0x63 */ + 0xC0, 0xA4, 0x88, 0x6F, 0xCC, 0xB9, 0x88, 0x70, /* 0x64-0x67 */ + 0xDB, 0xE7, 0xDB, 0xE1, 0xC6, 0xBA, 0xDB, 0xE3, /* 0x68-0x6B */ + 0x88, 0x71, 0xDB, 0xE8, 0x88, 0x72, 0xC5, 0xF7, /* 0x6C-0x6F */ + 0x88, 0x73, 0x88, 0x74, 0x88, 0x75, 0xDB, 0xEA, /* 0x70-0x73 */ + 0x88, 0x76, 0x88, 0x77, 0xDB, 0xE9, 0xBF, 0xC0, /* 0x74-0x77 */ + 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, 0xDB, 0xE6, /* 0x78-0x7B */ + 0xDB, 0xE5, 0x88, 0x7B, 0x88, 0x7C, 0x88, 0x7D, /* 0x7C-0x7F */ + + 0x88, 0x7E, 0x88, 0x80, 0xB4, 0xB9, 0xC0, 0xAC, /* 0x80-0x83 */ + 0xC2, 0xA2, 0xDB, 0xE2, 0xDB, 0xE4, 0x88, 0x81, /* 0x84-0x87 */ + 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, 0xD0, 0xCD, /* 0x88-0x8B */ + 0xDB, 0xED, 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, /* 0x8C-0x8F */ + 0x88, 0x88, 0x88, 0x89, 0xC0, 0xDD, 0xDB, 0xF2, /* 0x90-0x93 */ + 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, 0x88, 0x8D, /* 0x94-0x97 */ + 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, 0xB6, 0xE2, /* 0x98-0x9B */ + 0x88, 0x91, 0x88, 0x92, 0x88, 0x93, 0x88, 0x94, /* 0x9C-0x9F */ + 0xDB, 0xF3, 0xDB, 0xD2, 0xB9, 0xB8, 0xD4, 0xAB, /* 0xA0-0xA3 */ + 0xDB, 0xEC, 0x88, 0x95, 0xBF, 0xD1, 0xDB, 0xF0, /* 0xA4-0xA7 */ + 0x88, 0x96, 0xDB, 0xD1, 0x88, 0x97, 0xB5, 0xE6, /* 0xA8-0xAB */ + 0x88, 0x98, 0xDB, 0xEB, 0xBF, 0xE5, 0x88, 0x99, /* 0xAC-0xAF */ + 0x88, 0x9A, 0x88, 0x9B, 0xDB, 0xEE, 0x88, 0x9C, /* 0xB0-0xB3 */ + 0xDB, 0xF1, 0x88, 0x9D, 0x88, 0x9E, 0x88, 0x9F, /* 0xB4-0xB7 */ + 0xDB, 0xF9, 0x88, 0xA0, 0x88, 0xA1, 0x88, 0xA2, /* 0xB8-0xBB */ + 0x88, 0xA3, 0x88, 0xA4, 0x88, 0xA5, 0x88, 0xA6, /* 0xBC-0xBF */ + 0x88, 0xA7, 0x88, 0xA8, 0xB9, 0xA1, 0xB0, 0xA3, /* 0xC0-0xC3 */ + 0x88, 0xA9, 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, /* 0xC4-0xC7 */ + 0x88, 0xAD, 0x88, 0xAE, 0x88, 0xAF, 0xC2, 0xF1, /* 0xC8-0xCB */ + 0x88, 0xB0, 0x88, 0xB1, 0xB3, 0xC7, 0xDB, 0xEF, /* 0xCC-0xCF */ + 0x88, 0xB2, 0x88, 0xB3, 0xDB, 0xF8, 0x88, 0xB4, /* 0xD0-0xD3 */ + 0xC6, 0xD2, 0xDB, 0xF4, 0x88, 0xB5, 0x88, 0xB6, /* 0xD4-0xD7 */ + 0xDB, 0xF5, 0xDB, 0xF7, 0xDB, 0xF6, 0x88, 0xB7, /* 0xD8-0xDB */ + 0x88, 0xB8, 0xDB, 0xFE, 0x88, 0xB9, 0xD3, 0xF2, /* 0xDC-0xDF */ + 0xB2, 0xBA, 0x88, 0xBA, 0x88, 0xBB, 0x88, 0xBC, /* 0xE0-0xE3 */ + 0xDB, 0xFD, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0xE4-0xE7 */ + 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, 0x88, 0xC3, /* 0xE8-0xEB */ + 0x88, 0xC4, 0xDC, 0xA4, 0x88, 0xC5, 0xDB, 0xFB, /* 0xEC-0xEF */ + 0x88, 0xC6, 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, /* 0xF0-0xF3 */ + 0xDB, 0xFA, 0x88, 0xCA, 0x88, 0xCB, 0x88, 0xCC, /* 0xF4-0xF7 */ + 0xDB, 0xFC, 0xC5, 0xE0, 0xBB, 0xF9, 0x88, 0xCD, /* 0xF8-0xFB */ + 0x88, 0xCE, 0xDC, 0xA3, 0x88, 0xCF, 0x88, 0xD0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xDC, 0xA5, 0x88, 0xD1, 0xCC, 0xC3, 0x88, 0xD2, /* 0x00-0x03 */ + 0x88, 0xD3, 0x88, 0xD4, 0xB6, 0xD1, 0xDD, 0xC0, /* 0x04-0x07 */ + 0x88, 0xD5, 0x88, 0xD6, 0x88, 0xD7, 0xDC, 0xA1, /* 0x08-0x0B */ + 0x88, 0xD8, 0xDC, 0xA2, 0x88, 0xD9, 0x88, 0xDA, /* 0x0C-0x0F */ + 0x88, 0xDB, 0xC7, 0xB5, 0x88, 0xDC, 0x88, 0xDD, /* 0x10-0x13 */ + 0x88, 0xDE, 0xB6, 0xE9, 
0x88, 0xDF, 0x88, 0xE0, /* 0x14-0x17 */ + 0x88, 0xE1, 0xDC, 0xA7, 0x88, 0xE2, 0x88, 0xE3, /* 0x18-0x1B */ + 0x88, 0xE4, 0x88, 0xE5, 0xDC, 0xA6, 0x88, 0xE6, /* 0x1C-0x1F */ + 0xDC, 0xA9, 0xB1, 0xA4, 0x88, 0xE7, 0x88, 0xE8, /* 0x20-0x23 */ + 0xB5, 0xCC, 0x88, 0xE9, 0x88, 0xEA, 0x88, 0xEB, /* 0x24-0x27 */ + 0x88, 0xEC, 0x88, 0xED, 0xBF, 0xB0, 0x88, 0xEE, /* 0x28-0x2B */ + 0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x2C-0x2F */ + 0xD1, 0xDF, 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, /* 0x30-0x33 */ + 0x88, 0xF6, 0xB6, 0xC2, 0x88, 0xF7, 0x88, 0xF8, /* 0x34-0x37 */ + 0x88, 0xF9, 0x88, 0xFA, 0x88, 0xFB, 0x88, 0xFC, /* 0x38-0x3B */ + 0x88, 0xFD, 0x88, 0xFE, 0x89, 0x40, 0x89, 0x41, /* 0x3C-0x3F */ + 0x89, 0x42, 0x89, 0x43, 0x89, 0x44, 0x89, 0x45, /* 0x40-0x43 */ + 0xDC, 0xA8, 0x89, 0x46, 0x89, 0x47, 0x89, 0x48, /* 0x44-0x47 */ + 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, 0x89, 0x4C, /* 0x48-0x4B */ + 0xCB, 0xFA, 0xEB, 0xF3, 0x89, 0x4D, 0x89, 0x4E, /* 0x4C-0x4F */ + 0x89, 0x4F, 0xCB, 0xDC, 0x89, 0x50, 0x89, 0x51, /* 0x50-0x53 */ + 0xCB, 0xFE, 0x89, 0x52, 0x89, 0x53, 0x89, 0x54, /* 0x54-0x57 */ + 0xCC, 0xC1, 0x89, 0x55, 0x89, 0x56, 0x89, 0x57, /* 0x58-0x5B */ + 0x89, 0x58, 0x89, 0x59, 0xC8, 0xFB, 0x89, 0x5A, /* 0x5C-0x5F */ + 0x89, 0x5B, 0x89, 0x5C, 0x89, 0x5D, 0x89, 0x5E, /* 0x60-0x63 */ + 0x89, 0x5F, 0xDC, 0xAA, 0x89, 0x60, 0x89, 0x61, /* 0x64-0x67 */ + 0x89, 0x62, 0x89, 0x63, 0x89, 0x64, 0xCC, 0xEE, /* 0x68-0x6B */ + 0xDC, 0xAB, 0x89, 0x65, 0x89, 0x66, 0x89, 0x67, /* 0x6C-0x6F */ + 0x89, 0x68, 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, /* 0x70-0x73 */ + 0x89, 0x6C, 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, /* 0x74-0x77 */ + 0x89, 0x70, 0x89, 0x71, 0x89, 0x72, 0x89, 0x73, /* 0x78-0x7B */ + 0x89, 0x74, 0x89, 0x75, 0xDB, 0xD3, 0x89, 0x76, /* 0x7C-0x7F */ + + 0xDC, 0xAF, 0xDC, 0xAC, 0x89, 0x77, 0xBE, 0xB3, /* 0x80-0x83 */ + 0x89, 0x78, 0xCA, 0xFB, 0x89, 0x79, 0x89, 0x7A, /* 0x84-0x87 */ + 0x89, 0x7B, 0xDC, 0xAD, 0x89, 0x7C, 0x89, 0x7D, /* 0x88-0x8B */ + 0x89, 0x7E, 0x89, 0x80, 0x89, 0x81, 0x89, 0x82, /* 0x8C-0x8F */ + 0x89, 0x83, 0x89, 0x84, 0xC9, 0xCA, 0xC4, 0xB9, /* 0x90-0x93 */ + 0x89, 0x85, 0x89, 0x86, 0x89, 0x87, 0x89, 0x88, /* 0x94-0x97 */ + 0x89, 0x89, 0xC7, 0xBD, 0xDC, 0xAE, 0x89, 0x8A, /* 0x98-0x9B */ + 0x89, 0x8B, 0x89, 0x8C, 0xD4, 0xF6, 0xD0, 0xE6, /* 0x9C-0x9F */ + 0x89, 0x8D, 0x89, 0x8E, 0x89, 0x8F, 0x89, 0x90, /* 0xA0-0xA3 */ + 0x89, 0x91, 0x89, 0x92, 0x89, 0x93, 0x89, 0x94, /* 0xA4-0xA7 */ + 0xC4, 0xAB, 0xB6, 0xD5, 0x89, 0x95, 0x89, 0x96, /* 0xA8-0xAB */ + 0x89, 0x97, 0x89, 0x98, 0x89, 0x99, 0x89, 0x9A, /* 0xAC-0xAF */ + 0x89, 0x9B, 0x89, 0x9C, 0x89, 0x9D, 0x89, 0x9E, /* 0xB0-0xB3 */ + 0x89, 0x9F, 0x89, 0xA0, 0x89, 0xA1, 0x89, 0xA2, /* 0xB4-0xB7 */ + 0x89, 0xA3, 0x89, 0xA4, 0x89, 0xA5, 0x89, 0xA6, /* 0xB8-0xBB */ + 0xDB, 0xD4, 0x89, 0xA7, 0x89, 0xA8, 0x89, 0xA9, /* 0xBC-0xBF */ + 0x89, 0xAA, 0xB1, 0xDA, 0x89, 0xAB, 0x89, 0xAC, /* 0xC0-0xC3 */ + 0x89, 0xAD, 0xDB, 0xD5, 0x89, 0xAE, 0x89, 0xAF, /* 0xC4-0xC7 */ + 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, 0x89, 0xB3, /* 0xC8-0xCB */ + 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, 0x89, 0xB7, /* 0xCC-0xCF */ + 0x89, 0xB8, 0xDB, 0xD6, 0x89, 0xB9, 0x89, 0xBA, /* 0xD0-0xD3 */ + 0x89, 0xBB, 0xBA, 0xBE, 0x89, 0xBC, 0x89, 0xBD, /* 0xD4-0xD7 */ + 0x89, 0xBE, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xD8-0xDB */ + 0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0xDC-0xDF */ + 0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0xE0-0xE3 */ + 0xC8, 0xC0, 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, /* 0xE4-0xE7 */ + 0x89, 0xCD, 0x89, 0xCE, 0x89, 0xCF, 0xCA, 0xBF, /* 0xE8-0xEB */ + 0xC8, 0xC9, 
0x89, 0xD0, 0xD7, 0xB3, 0x89, 0xD1, /* 0xEC-0xEF */ + 0xC9, 0xF9, 0x89, 0xD2, 0x89, 0xD3, 0xBF, 0xC7, /* 0xF0-0xF3 */ + 0x89, 0xD4, 0x89, 0xD5, 0xBA, 0xF8, 0x89, 0xD6, /* 0xF4-0xF7 */ + 0x89, 0xD7, 0xD2, 0xBC, 0x89, 0xD8, 0x89, 0xD9, /* 0xF8-0xFB */ + 0x89, 0xDA, 0x89, 0xDB, 0x89, 0xDC, 0x89, 0xDD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x89, 0xDE, 0x89, 0xDF, 0xE2, 0xBA, 0x89, 0xE0, /* 0x00-0x03 */ + 0xB4, 0xA6, 0x89, 0xE1, 0x89, 0xE2, 0xB1, 0xB8, /* 0x04-0x07 */ + 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, 0x89, 0xE6, /* 0x08-0x0B */ + 0x89, 0xE7, 0xB8, 0xB4, 0x89, 0xE8, 0xCF, 0xC4, /* 0x0C-0x0F */ + 0x89, 0xE9, 0x89, 0xEA, 0x89, 0xEB, 0x89, 0xEC, /* 0x10-0x13 */ + 0xD9, 0xE7, 0xCF, 0xA6, 0xCD, 0xE2, 0x89, 0xED, /* 0x14-0x17 */ + 0x89, 0xEE, 0xD9, 0xED, 0xB6, 0xE0, 0x89, 0xEF, /* 0x18-0x1B */ + 0xD2, 0xB9, 0x89, 0xF0, 0x89, 0xF1, 0xB9, 0xBB, /* 0x1C-0x1F */ + 0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x20-0x23 */ + 0xE2, 0xB9, 0xE2, 0xB7, 0x89, 0xF6, 0xB4, 0xF3, /* 0x24-0x27 */ + 0x89, 0xF7, 0xCC, 0xEC, 0xCC, 0xAB, 0xB7, 0xF2, /* 0x28-0x2B */ + 0x89, 0xF8, 0xD8, 0xB2, 0xD1, 0xEB, 0xBA, 0xBB, /* 0x2C-0x2F */ + 0x89, 0xF9, 0xCA, 0xA7, 0x89, 0xFA, 0x89, 0xFB, /* 0x30-0x33 */ + 0xCD, 0xB7, 0x89, 0xFC, 0x89, 0xFD, 0xD2, 0xC4, /* 0x34-0x37 */ + 0xBF, 0xE4, 0xBC, 0xD0, 0xB6, 0xE1, 0x89, 0xFE, /* 0x38-0x3B */ + 0xDE, 0xC5, 0x8A, 0x40, 0x8A, 0x41, 0x8A, 0x42, /* 0x3C-0x3F */ + 0x8A, 0x43, 0xDE, 0xC6, 0xDB, 0xBC, 0x8A, 0x44, /* 0x40-0x43 */ + 0xD1, 0xD9, 0x8A, 0x45, 0x8A, 0x46, 0xC6, 0xE6, /* 0x44-0x47 */ + 0xC4, 0xCE, 0xB7, 0xEE, 0x8A, 0x47, 0xB7, 0xDC, /* 0x48-0x4B */ + 0x8A, 0x48, 0x8A, 0x49, 0xBF, 0xFC, 0xD7, 0xE0, /* 0x4C-0x4F */ + 0x8A, 0x4A, 0xC6, 0xF5, 0x8A, 0x4B, 0x8A, 0x4C, /* 0x50-0x53 */ + 0xB1, 0xBC, 0xDE, 0xC8, 0xBD, 0xB1, 0xCC, 0xD7, /* 0x54-0x57 */ + 0xDE, 0xCA, 0x8A, 0x4D, 0xDE, 0xC9, 0x8A, 0x4E, /* 0x58-0x5B */ + 0x8A, 0x4F, 0x8A, 0x50, 0x8A, 0x51, 0x8A, 0x52, /* 0x5C-0x5F */ + 0xB5, 0xEC, 0x8A, 0x53, 0xC9, 0xDD, 0x8A, 0x54, /* 0x60-0x63 */ + 0x8A, 0x55, 0xB0, 0xC2, 0x8A, 0x56, 0x8A, 0x57, /* 0x64-0x67 */ + 0x8A, 0x58, 0x8A, 0x59, 0x8A, 0x5A, 0x8A, 0x5B, /* 0x68-0x6B */ + 0x8A, 0x5C, 0x8A, 0x5D, 0x8A, 0x5E, 0x8A, 0x5F, /* 0x6C-0x6F */ + 0x8A, 0x60, 0x8A, 0x61, 0x8A, 0x62, 0xC5, 0xAE, /* 0x70-0x73 */ + 0xC5, 0xAB, 0x8A, 0x63, 0xC4, 0xCC, 0x8A, 0x64, /* 0x74-0x77 */ + 0xBC, 0xE9, 0xCB, 0xFD, 0x8A, 0x65, 0x8A, 0x66, /* 0x78-0x7B */ + 0x8A, 0x67, 0xBA, 0xC3, 0x8A, 0x68, 0x8A, 0x69, /* 0x7C-0x7F */ + + 0x8A, 0x6A, 0xE5, 0xF9, 0xC8, 0xE7, 0xE5, 0xFA, /* 0x80-0x83 */ + 0xCD, 0xFD, 0x8A, 0x6B, 0xD7, 0xB1, 0xB8, 0xBE, /* 0x84-0x87 */ + 0xC2, 0xE8, 0x8A, 0x6C, 0xC8, 0xD1, 0x8A, 0x6D, /* 0x88-0x8B */ + 0x8A, 0x6E, 0xE5, 0xFB, 0x8A, 0x6F, 0x8A, 0x70, /* 0x8C-0x8F */ + 0x8A, 0x71, 0x8A, 0x72, 0xB6, 0xCA, 0xBC, 0xCB, /* 0x90-0x93 */ + 0x8A, 0x73, 0x8A, 0x74, 0xD1, 0xFD, 0xE6, 0xA1, /* 0x94-0x97 */ + 0x8A, 0x75, 0xC3, 0xEE, 0x8A, 0x76, 0x8A, 0x77, /* 0x98-0x9B */ + 0x8A, 0x78, 0x8A, 0x79, 0xE6, 0xA4, 0x8A, 0x7A, /* 0x9C-0x9F */ + 0x8A, 0x7B, 0x8A, 0x7C, 0x8A, 0x7D, 0xE5, 0xFE, /* 0xA0-0xA3 */ + 0xE6, 0xA5, 0xCD, 0xD7, 0x8A, 0x7E, 0x8A, 0x80, /* 0xA4-0xA7 */ + 0xB7, 0xC1, 0xE5, 0xFC, 0xE5, 0xFD, 0xE6, 0xA3, /* 0xA8-0xAB */ + 0x8A, 0x81, 0x8A, 0x82, 0xC4, 0xDD, 0xE6, 0xA8, /* 0xAC-0xAF */ + 0x8A, 0x83, 0x8A, 0x84, 0xE6, 0xA7, 0x8A, 0x85, /* 0xB0-0xB3 */ + 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, 0x8A, 0x89, /* 0xB4-0xB7 */ + 0x8A, 0x8A, 0xC3, 0xC3, 0x8A, 0x8B, 0xC6, 0xDE, /* 0xB8-0xBB */ + 0x8A, 0x8C, 0x8A, 0x8D, 0xE6, 0xAA, 0x8A, 0x8E, /* 0xBC-0xBF */ + 0x8A, 0x8F, 
0x8A, 0x90, 0x8A, 0x91, 0x8A, 0x92, /* 0xC0-0xC3 */ + 0x8A, 0x93, 0x8A, 0x94, 0xC4, 0xB7, 0x8A, 0x95, /* 0xC4-0xC7 */ + 0x8A, 0x96, 0x8A, 0x97, 0xE6, 0xA2, 0xCA, 0xBC, /* 0xC8-0xCB */ + 0x8A, 0x98, 0x8A, 0x99, 0x8A, 0x9A, 0x8A, 0x9B, /* 0xCC-0xCF */ + 0xBD, 0xE3, 0xB9, 0xC3, 0xE6, 0xA6, 0xD0, 0xD5, /* 0xD0-0xD3 */ + 0xCE, 0xAF, 0x8A, 0x9C, 0x8A, 0x9D, 0xE6, 0xA9, /* 0xD4-0xD7 */ + 0xE6, 0xB0, 0x8A, 0x9E, 0xD2, 0xA6, 0x8A, 0x9F, /* 0xD8-0xDB */ + 0xBD, 0xAA, 0xE6, 0xAD, 0x8A, 0xA0, 0x8A, 0xA1, /* 0xDC-0xDF */ + 0x8A, 0xA2, 0x8A, 0xA3, 0x8A, 0xA4, 0xE6, 0xAF, /* 0xE0-0xE3 */ + 0x8A, 0xA5, 0xC0, 0xD1, 0x8A, 0xA6, 0x8A, 0xA7, /* 0xE4-0xE7 */ + 0xD2, 0xCC, 0x8A, 0xA8, 0x8A, 0xA9, 0x8A, 0xAA, /* 0xE8-0xEB */ + 0xBC, 0xA7, 0x8A, 0xAB, 0x8A, 0xAC, 0x8A, 0xAD, /* 0xEC-0xEF */ + 0x8A, 0xAE, 0x8A, 0xAF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xF0-0xF3 */ + 0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xF4-0xF7 */ + 0x8A, 0xB6, 0xE6, 0xB1, 0x8A, 0xB7, 0xD2, 0xF6, /* 0xF8-0xFB */ + 0x8A, 0xB8, 0x8A, 0xB9, 0x8A, 0xBA, 0xD7, 0xCB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x8A, 0xBB, 0xCD, 0xFE, 0x8A, 0xBC, 0xCD, 0xDE, /* 0x00-0x03 */ + 0xC2, 0xA6, 0xE6, 0xAB, 0xE6, 0xAC, 0xBD, 0xBF, /* 0x04-0x07 */ + 0xE6, 0xAE, 0xE6, 0xB3, 0x8A, 0xBD, 0x8A, 0xBE, /* 0x08-0x0B */ + 0xE6, 0xB2, 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, /* 0x0C-0x0F */ + 0x8A, 0xC2, 0xE6, 0xB6, 0x8A, 0xC3, 0xE6, 0xB8, /* 0x10-0x13 */ + 0x8A, 0xC4, 0x8A, 0xC5, 0x8A, 0xC6, 0x8A, 0xC7, /* 0x14-0x17 */ + 0xC4, 0xEF, 0x8A, 0xC8, 0x8A, 0xC9, 0x8A, 0xCA, /* 0x18-0x1B */ + 0xC4, 0xC8, 0x8A, 0xCB, 0x8A, 0xCC, 0xBE, 0xEA, /* 0x1C-0x1F */ + 0xC9, 0xEF, 0x8A, 0xCD, 0x8A, 0xCE, 0xE6, 0xB7, /* 0x20-0x23 */ + 0x8A, 0xCF, 0xB6, 0xF0, 0x8A, 0xD0, 0x8A, 0xD1, /* 0x24-0x27 */ + 0x8A, 0xD2, 0xC3, 0xE4, 0x8A, 0xD3, 0x8A, 0xD4, /* 0x28-0x2B */ + 0x8A, 0xD5, 0x8A, 0xD6, 0x8A, 0xD7, 0x8A, 0xD8, /* 0x2C-0x2F */ + 0x8A, 0xD9, 0xD3, 0xE9, 0xE6, 0xB4, 0x8A, 0xDA, /* 0x30-0x33 */ + 0xE6, 0xB5, 0x8A, 0xDB, 0xC8, 0xA2, 0x8A, 0xDC, /* 0x34-0x37 */ + 0x8A, 0xDD, 0x8A, 0xDE, 0x8A, 0xDF, 0x8A, 0xE0, /* 0x38-0x3B */ + 0xE6, 0xBD, 0x8A, 0xE1, 0x8A, 0xE2, 0x8A, 0xE3, /* 0x3C-0x3F */ + 0xE6, 0xB9, 0x8A, 0xE4, 0x8A, 0xE5, 0x8A, 0xE6, /* 0x40-0x43 */ + 0x8A, 0xE7, 0x8A, 0xE8, 0xC6, 0xC5, 0x8A, 0xE9, /* 0x44-0x47 */ + 0x8A, 0xEA, 0xCD, 0xF1, 0xE6, 0xBB, 0x8A, 0xEB, /* 0x48-0x4B */ + 0x8A, 0xEC, 0x8A, 0xED, 0x8A, 0xEE, 0x8A, 0xEF, /* 0x4C-0x4F */ + 0x8A, 0xF0, 0x8A, 0xF1, 0x8A, 0xF2, 0x8A, 0xF3, /* 0x50-0x53 */ + 0x8A, 0xF4, 0xE6, 0xBC, 0x8A, 0xF5, 0x8A, 0xF6, /* 0x54-0x57 */ + 0x8A, 0xF7, 0x8A, 0xF8, 0xBB, 0xE9, 0x8A, 0xF9, /* 0x58-0x5B */ + 0x8A, 0xFA, 0x8A, 0xFB, 0x8A, 0xFC, 0x8A, 0xFD, /* 0x5C-0x5F */ + 0x8A, 0xFE, 0x8B, 0x40, 0xE6, 0xBE, 0x8B, 0x41, /* 0x60-0x63 */ + 0x8B, 0x42, 0x8B, 0x43, 0x8B, 0x44, 0xE6, 0xBA, /* 0x64-0x67 */ + 0x8B, 0x45, 0x8B, 0x46, 0xC0, 0xB7, 0x8B, 0x47, /* 0x68-0x6B */ + 0x8B, 0x48, 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, /* 0x6C-0x6F */ + 0x8B, 0x4C, 0x8B, 0x4D, 0x8B, 0x4E, 0x8B, 0x4F, /* 0x70-0x73 */ + 0xD3, 0xA4, 0xE6, 0xBF, 0xC9, 0xF4, 0xE6, 0xC3, /* 0x74-0x77 */ + 0x8B, 0x50, 0x8B, 0x51, 0xE6, 0xC4, 0x8B, 0x52, /* 0x78-0x7B */ + 0x8B, 0x53, 0x8B, 0x54, 0x8B, 0x55, 0xD0, 0xF6, /* 0x7C-0x7F */ + + 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, 0x8B, 0x59, /* 0x80-0x83 */ + 0x8B, 0x5A, 0x8B, 0x5B, 0x8B, 0x5C, 0x8B, 0x5D, /* 0x84-0x87 */ + 0x8B, 0x5E, 0x8B, 0x5F, 0x8B, 0x60, 0x8B, 0x61, /* 0x88-0x8B */ + 0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0x8B, 0x65, /* 0x8C-0x8F */ + 0x8B, 0x66, 0x8B, 0x67, 0xC3, 0xBD, 0x8B, 0x68, /* 0x90-0x93 */ + 0x8B, 0x69, 
0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x94-0x97 */ + 0x8B, 0x6D, 0x8B, 0x6E, 0xC3, 0xC4, 0xE6, 0xC2, /* 0x98-0x9B */ + 0x8B, 0x6F, 0x8B, 0x70, 0x8B, 0x71, 0x8B, 0x72, /* 0x9C-0x9F */ + 0x8B, 0x73, 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, /* 0xA0-0xA3 */ + 0x8B, 0x77, 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, /* 0xA4-0xA7 */ + 0x8B, 0x7B, 0x8B, 0x7C, 0xE6, 0xC1, 0x8B, 0x7D, /* 0xA8-0xAB */ + 0x8B, 0x7E, 0x8B, 0x80, 0x8B, 0x81, 0x8B, 0x82, /* 0xAC-0xAF */ + 0x8B, 0x83, 0x8B, 0x84, 0xE6, 0xC7, 0xCF, 0xB1, /* 0xB0-0xB3 */ + 0x8B, 0x85, 0xEB, 0xF4, 0x8B, 0x86, 0x8B, 0x87, /* 0xB4-0xB7 */ + 0xE6, 0xCA, 0x8B, 0x88, 0x8B, 0x89, 0x8B, 0x8A, /* 0xB8-0xBB */ + 0x8B, 0x8B, 0x8B, 0x8C, 0xE6, 0xC5, 0x8B, 0x8D, /* 0xBC-0xBF */ + 0x8B, 0x8E, 0xBC, 0xDE, 0xC9, 0xA9, 0x8B, 0x8F, /* 0xC0-0xC3 */ + 0x8B, 0x90, 0x8B, 0x91, 0x8B, 0x92, 0x8B, 0x93, /* 0xC4-0xC7 */ + 0x8B, 0x94, 0xBC, 0xB5, 0x8B, 0x95, 0x8B, 0x96, /* 0xC8-0xCB */ + 0xCF, 0xD3, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0xCC-0xCF */ + 0x8B, 0x9A, 0x8B, 0x9B, 0xE6, 0xC8, 0x8B, 0x9C, /* 0xD0-0xD3 */ + 0xE6, 0xC9, 0x8B, 0x9D, 0xE6, 0xCE, 0x8B, 0x9E, /* 0xD4-0xD7 */ + 0xE6, 0xD0, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0xD8-0xDB */ + 0xE6, 0xD1, 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, /* 0xDC-0xDF */ + 0xE6, 0xCB, 0xB5, 0xD5, 0x8B, 0xA5, 0xE6, 0xCC, /* 0xE0-0xE3 */ + 0x8B, 0xA6, 0x8B, 0xA7, 0xE6, 0xCF, 0x8B, 0xA8, /* 0xE4-0xE7 */ + 0x8B, 0xA9, 0xC4, 0xDB, 0x8B, 0xAA, 0xE6, 0xC6, /* 0xE8-0xEB */ + 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, 0x8B, 0xAE, /* 0xEC-0xEF */ + 0x8B, 0xAF, 0xE6, 0xCD, 0x8B, 0xB0, 0x8B, 0xB1, /* 0xF0-0xF3 */ + 0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0xF4-0xF7 */ + 0x8B, 0xB6, 0x8B, 0xB7, 0x8B, 0xB8, 0x8B, 0xB9, /* 0xF8-0xFB */ + 0x8B, 0xBA, 0x8B, 0xBB, 0x8B, 0xBC, 0x8B, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5B[512] = { + 0x8B, 0xBE, 0x8B, 0xBF, 0x8B, 0xC0, 0x8B, 0xC1, /* 0x00-0x03 */ + 0x8B, 0xC2, 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, /* 0x04-0x07 */ + 0x8B, 0xC6, 0xE6, 0xD2, 0x8B, 0xC7, 0x8B, 0xC8, /* 0x08-0x0B */ + 0x8B, 0xC9, 0x8B, 0xCA, 0x8B, 0xCB, 0x8B, 0xCC, /* 0x0C-0x0F */ + 0x8B, 0xCD, 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, /* 0x10-0x13 */ + 0x8B, 0xD1, 0x8B, 0xD2, 0xE6, 0xD4, 0xE6, 0xD3, /* 0x14-0x17 */ + 0x8B, 0xD3, 0x8B, 0xD4, 0x8B, 0xD5, 0x8B, 0xD6, /* 0x18-0x1B */ + 0x8B, 0xD7, 0x8B, 0xD8, 0x8B, 0xD9, 0x8B, 0xDA, /* 0x1C-0x1F */ + 0x8B, 0xDB, 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, /* 0x20-0x23 */ + 0x8B, 0xDF, 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, /* 0x24-0x27 */ + 0x8B, 0xE3, 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, /* 0x28-0x2B */ + 0x8B, 0xE7, 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, /* 0x2C-0x2F */ + 0x8B, 0xEB, 0x8B, 0xEC, 0xE6, 0xD5, 0x8B, 0xED, /* 0x30-0x33 */ + 0xD9, 0xF8, 0x8B, 0xEE, 0x8B, 0xEF, 0xE6, 0xD6, /* 0x34-0x37 */ + 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, 0x8B, 0xF3, /* 0x38-0x3B */ + 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, 0x8B, 0xF7, /* 0x3C-0x3F */ + 0xE6, 0xD7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0x40-0x43 */ + 0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0x44-0x47 */ + 0x8C, 0x40, 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, /* 0x48-0x4B */ + 0x8C, 0x44, 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, /* 0x4C-0x4F */ + 0xD7, 0xD3, 0xE6, 0xDD, 0x8C, 0x48, 0xE6, 0xDE, /* 0x50-0x53 */ + 0xBF, 0xD7, 0xD4, 0xD0, 0x8C, 0x49, 0xD7, 0xD6, /* 0x54-0x57 */ + 0xB4, 0xE6, 0xCB, 0xEF, 0xE6, 0xDA, 0xD8, 0xC3, /* 0x58-0x5B */ + 0xD7, 0xCE, 0xD0, 0xA2, 0x8C, 0x4A, 0xC3, 0xCF, /* 0x5C-0x5F */ + 0x8C, 0x4B, 0x8C, 0x4C, 0xE6, 0xDF, 0xBC, 0xBE, /* 0x60-0x63 */ + 0xB9, 0xC2, 0xE6, 0xDB, 0xD1, 0xA7, 0x8C, 0x4D, /* 0x64-0x67 */ + 0x8C, 0x4E, 0xBA, 
0xA2, 0xC2, 0xCF, 0x8C, 0x4F, /* 0x68-0x6B */ + 0xD8, 0xAB, 0x8C, 0x50, 0x8C, 0x51, 0x8C, 0x52, /* 0x6C-0x6F */ + 0xCA, 0xEB, 0xE5, 0xEE, 0x8C, 0x53, 0xE6, 0xDC, /* 0x70-0x73 */ + 0x8C, 0x54, 0xB7, 0xF5, 0x8C, 0x55, 0x8C, 0x56, /* 0x74-0x77 */ + 0x8C, 0x57, 0x8C, 0x58, 0xC8, 0xE6, 0x8C, 0x59, /* 0x78-0x7B */ + 0x8C, 0x5A, 0xC4, 0xF5, 0x8C, 0x5B, 0x8C, 0x5C, /* 0x7C-0x7F */ + + 0xE5, 0xB2, 0xC4, 0xFE, 0x8C, 0x5D, 0xCB, 0xFC, /* 0x80-0x83 */ + 0xE5, 0xB3, 0xD5, 0xAC, 0x8C, 0x5E, 0xD3, 0xEE, /* 0x84-0x87 */ + 0xCA, 0xD8, 0xB0, 0xB2, 0x8C, 0x5F, 0xCB, 0xCE, /* 0x88-0x8B */ + 0xCD, 0xEA, 0x8C, 0x60, 0x8C, 0x61, 0xBA, 0xEA, /* 0x8C-0x8F */ + 0x8C, 0x62, 0x8C, 0x63, 0x8C, 0x64, 0xE5, 0xB5, /* 0x90-0x93 */ + 0x8C, 0x65, 0xE5, 0xB4, 0x8C, 0x66, 0xD7, 0xDA, /* 0x94-0x97 */ + 0xB9, 0xD9, 0xD6, 0xE6, 0xB6, 0xA8, 0xCD, 0xF0, /* 0x98-0x9B */ + 0xD2, 0xCB, 0xB1, 0xA6, 0xCA, 0xB5, 0x8C, 0x67, /* 0x9C-0x9F */ + 0xB3, 0xE8, 0xC9, 0xF3, 0xBF, 0xCD, 0xD0, 0xFB, /* 0xA0-0xA3 */ + 0xCA, 0xD2, 0xE5, 0xB6, 0xBB, 0xC2, 0x8C, 0x68, /* 0xA4-0xA7 */ + 0x8C, 0x69, 0x8C, 0x6A, 0xCF, 0xDC, 0xB9, 0xAC, /* 0xA8-0xAB */ + 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, 0x8C, 0x6E, /* 0xAC-0xAF */ + 0xD4, 0xD7, 0x8C, 0x6F, 0x8C, 0x70, 0xBA, 0xA6, /* 0xB0-0xB3 */ + 0xD1, 0xE7, 0xCF, 0xFC, 0xBC, 0xD2, 0x8C, 0x71, /* 0xB4-0xB7 */ + 0xE5, 0xB7, 0xC8, 0xDD, 0x8C, 0x72, 0x8C, 0x73, /* 0xB8-0xBB */ + 0x8C, 0x74, 0xBF, 0xED, 0xB1, 0xF6, 0xCB, 0xDE, /* 0xBC-0xBF */ + 0x8C, 0x75, 0x8C, 0x76, 0xBC, 0xC5, 0x8C, 0x77, /* 0xC0-0xC3 */ + 0xBC, 0xC4, 0xD2, 0xFA, 0xC3, 0xDC, 0xBF, 0xDC, /* 0xC4-0xC7 */ + 0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x7B, /* 0xC8-0xCB */ + 0xB8, 0xBB, 0x8C, 0x7C, 0x8C, 0x7D, 0x8C, 0x7E, /* 0xCC-0xCF */ + 0xC3, 0xC2, 0x8C, 0x80, 0xBA, 0xAE, 0xD4, 0xA2, /* 0xD0-0xD3 */ + 0x8C, 0x81, 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, /* 0xD4-0xD7 */ + 0x8C, 0x85, 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, /* 0xD8-0xDB */ + 0x8C, 0x89, 0xC7, 0xDE, 0xC4, 0xAF, 0xB2, 0xEC, /* 0xDC-0xDF */ + 0x8C, 0x8A, 0xB9, 0xD1, 0x8C, 0x8B, 0x8C, 0x8C, /* 0xE0-0xE3 */ + 0xE5, 0xBB, 0xC1, 0xC8, 0x8C, 0x8D, 0x8C, 0x8E, /* 0xE4-0xE7 */ + 0xD5, 0xAF, 0x8C, 0x8F, 0x8C, 0x90, 0x8C, 0x91, /* 0xE8-0xEB */ + 0x8C, 0x92, 0x8C, 0x93, 0xE5, 0xBC, 0x8C, 0x94, /* 0xEC-0xEF */ + 0xE5, 0xBE, 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, /* 0xF0-0xF3 */ + 0x8C, 0x98, 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, /* 0xF4-0xF7 */ + 0xB4, 0xE7, 0xB6, 0xD4, 0xCB, 0xC2, 0xD1, 0xB0, /* 0xF8-0xFB */ + 0xB5, 0xBC, 0x8C, 0x9C, 0x8C, 0x9D, 0xCA, 0xD9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x8C, 0x9E, 0xB7, 0xE2, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x00-0x03 */ + 0xC9, 0xE4, 0x8C, 0xA1, 0xBD, 0xAB, 0x8C, 0xA2, /* 0x04-0x07 */ + 0x8C, 0xA3, 0xCE, 0xBE, 0xD7, 0xF0, 0x8C, 0xA4, /* 0x08-0x0B */ + 0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0xD0, 0xA1, /* 0x0C-0x0F */ + 0x8C, 0xA8, 0xC9, 0xD9, 0x8C, 0xA9, 0x8C, 0xAA, /* 0x10-0x13 */ + 0xB6, 0xFB, 0xE6, 0xD8, 0xBC, 0xE2, 0x8C, 0xAB, /* 0x14-0x17 */ + 0xB3, 0xBE, 0x8C, 0xAC, 0xC9, 0xD0, 0x8C, 0xAD, /* 0x18-0x1B */ + 0xE6, 0xD9, 0xB3, 0xA2, 0x8C, 0xAE, 0x8C, 0xAF, /* 0x1C-0x1F */ + 0x8C, 0xB0, 0x8C, 0xB1, 0xDE, 0xCC, 0x8C, 0xB2, /* 0x20-0x23 */ + 0xD3, 0xC8, 0xDE, 0xCD, 0x8C, 0xB3, 0xD2, 0xA2, /* 0x24-0x27 */ + 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, 0x8C, 0xB7, /* 0x28-0x2B */ + 0xDE, 0xCE, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x2C-0x2F */ + 0x8C, 0xBB, 0xBE, 0xCD, 0x8C, 0xBC, 0x8C, 0xBD, /* 0x30-0x33 */ + 0xDE, 0xCF, 0x8C, 0xBE, 0x8C, 0xBF, 0x8C, 0xC0, /* 0x34-0x37 */ + 0xCA, 0xAC, 0xD2, 0xFC, 0xB3, 0xDF, 0xE5, 0xEA, /* 0x38-0x3B */ + 0xC4, 0xE1, 0xBE, 
0xA1, 0xCE, 0xB2, 0xC4, 0xF2, /* 0x3C-0x3F */ + 0xBE, 0xD6, 0xC6, 0xA8, 0xB2, 0xE3, 0x8C, 0xC1, /* 0x40-0x43 */ + 0x8C, 0xC2, 0xBE, 0xD3, 0x8C, 0xC3, 0x8C, 0xC4, /* 0x44-0x47 */ + 0xC7, 0xFC, 0xCC, 0xEB, 0xBD, 0xEC, 0xCE, 0xDD, /* 0x48-0x4B */ + 0x8C, 0xC5, 0x8C, 0xC6, 0xCA, 0xBA, 0xC6, 0xC1, /* 0x4C-0x4F */ + 0xE5, 0xEC, 0xD0, 0xBC, 0x8C, 0xC7, 0x8C, 0xC8, /* 0x50-0x53 */ + 0x8C, 0xC9, 0xD5, 0xB9, 0x8C, 0xCA, 0x8C, 0xCB, /* 0x54-0x57 */ + 0x8C, 0xCC, 0xE5, 0xED, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x58-0x5B */ + 0x8C, 0xCF, 0x8C, 0xD0, 0xCA, 0xF4, 0x8C, 0xD1, /* 0x5C-0x5F */ + 0xCD, 0xC0, 0xC2, 0xC5, 0x8C, 0xD2, 0xE5, 0xEF, /* 0x60-0x63 */ + 0x8C, 0xD3, 0xC2, 0xC4, 0xE5, 0xF0, 0x8C, 0xD4, /* 0x64-0x67 */ + 0x8C, 0xD5, 0x8C, 0xD6, 0x8C, 0xD7, 0x8C, 0xD8, /* 0x68-0x6B */ + 0x8C, 0xD9, 0x8C, 0xDA, 0xE5, 0xF8, 0xCD, 0xCD, /* 0x6C-0x6F */ + 0x8C, 0xDB, 0xC9, 0xBD, 0x8C, 0xDC, 0x8C, 0xDD, /* 0x70-0x73 */ + 0x8C, 0xDE, 0x8C, 0xDF, 0x8C, 0xE0, 0x8C, 0xE1, /* 0x74-0x77 */ + 0x8C, 0xE2, 0xD2, 0xD9, 0xE1, 0xA8, 0x8C, 0xE3, /* 0x78-0x7B */ + 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, 0xD3, 0xEC, /* 0x7C-0x7F */ + + 0x8C, 0xE7, 0xCB, 0xEA, 0xC6, 0xF1, 0x8C, 0xE8, /* 0x80-0x83 */ + 0x8C, 0xE9, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0x84-0x87 */ + 0xE1, 0xAC, 0x8C, 0xED, 0x8C, 0xEE, 0x8C, 0xEF, /* 0x88-0x8B */ + 0xE1, 0xA7, 0xE1, 0xA9, 0x8C, 0xF0, 0x8C, 0xF1, /* 0x8C-0x8F */ + 0xE1, 0xAA, 0xE1, 0xAF, 0x8C, 0xF2, 0x8C, 0xF3, /* 0x90-0x93 */ + 0xB2, 0xED, 0x8C, 0xF4, 0xE1, 0xAB, 0xB8, 0xDA, /* 0x94-0x97 */ + 0xE1, 0xAD, 0xE1, 0xAE, 0xE1, 0xB0, 0xB5, 0xBA, /* 0x98-0x9B */ + 0xE1, 0xB1, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0x9C-0x9F */ + 0x8C, 0xF8, 0x8C, 0xF9, 0xE1, 0xB3, 0xE1, 0xB8, /* 0xA0-0xA3 */ + 0x8C, 0xFA, 0x8C, 0xFB, 0x8C, 0xFC, 0x8C, 0xFD, /* 0xA4-0xA7 */ + 0x8C, 0xFE, 0xD1, 0xD2, 0x8D, 0x40, 0xE1, 0xB6, /* 0xA8-0xAB */ + 0xE1, 0xB5, 0xC1, 0xEB, 0x8D, 0x41, 0x8D, 0x42, /* 0xAC-0xAF */ + 0x8D, 0x43, 0xE1, 0xB7, 0x8D, 0x44, 0xD4, 0xC0, /* 0xB0-0xB3 */ + 0x8D, 0x45, 0xE1, 0xB2, 0x8D, 0x46, 0xE1, 0xBA, /* 0xB4-0xB7 */ + 0xB0, 0xB6, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xB8-0xBB */ + 0x8D, 0x4A, 0xE1, 0xB4, 0x8D, 0x4B, 0xBF, 0xF9, /* 0xBC-0xBF */ + 0x8D, 0x4C, 0xE1, 0xB9, 0x8D, 0x4D, 0x8D, 0x4E, /* 0xC0-0xC3 */ + 0xE1, 0xBB, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xC4-0xC7 */ + 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, 0xE1, 0xBE, /* 0xC8-0xCB */ + 0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xCC-0xCF */ + 0x8D, 0x59, 0x8D, 0x5A, 0xE1, 0xBC, 0x8D, 0x5B, /* 0xD0-0xD3 */ + 0x8D, 0x5C, 0x8D, 0x5D, 0x8D, 0x5E, 0x8D, 0x5F, /* 0xD4-0xD7 */ + 0x8D, 0x60, 0xD6, 0xC5, 0x8D, 0x61, 0x8D, 0x62, /* 0xD8-0xDB */ + 0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xDC-0xDF */ + 0x8D, 0x67, 0xCF, 0xBF, 0x8D, 0x68, 0x8D, 0x69, /* 0xE0-0xE3 */ + 0xE1, 0xBD, 0xE1, 0xBF, 0xC2, 0xCD, 0x8D, 0x6A, /* 0xE4-0xE7 */ + 0xB6, 0xEB, 0x8D, 0x6B, 0xD3, 0xF8, 0x8D, 0x6C, /* 0xE8-0xEB */ + 0x8D, 0x6D, 0xC7, 0xCD, 0x8D, 0x6E, 0x8D, 0x6F, /* 0xEC-0xEF */ + 0xB7, 0xE5, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xF0-0xF3 */ + 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, 0x8D, 0x76, /* 0xF4-0xF7 */ + 0x8D, 0x77, 0x8D, 0x78, 0x8D, 0x79, 0xBE, 0xFE, /* 0xF8-0xFB */ + 0x8D, 0x7A, 0x8D, 0x7B, 0x8D, 0x7C, 0x8D, 0x7D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x8D, 0x7E, 0x8D, 0x80, 0xE1, 0xC0, 0xE1, 0xC1, /* 0x00-0x03 */ + 0x8D, 0x81, 0x8D, 0x82, 0xE1, 0xC7, 0xB3, 0xE7, /* 0x04-0x07 */ + 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, 0x8D, 0x86, /* 0x08-0x0B */ + 0x8D, 0x87, 0x8D, 0x88, 0xC6, 0xE9, 0x8D, 0x89, /* 0x0C-0x0F */ + 0x8D, 0x8A, 0x8D, 
0x8B, 0x8D, 0x8C, 0x8D, 0x8D, /* 0x10-0x13 */ + 0xB4, 0xDE, 0x8D, 0x8E, 0xD1, 0xC2, 0x8D, 0x8F, /* 0x14-0x17 */ + 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, 0xE1, 0xC8, /* 0x18-0x1B */ + 0x8D, 0x93, 0x8D, 0x94, 0xE1, 0xC6, 0x8D, 0x95, /* 0x1C-0x1F */ + 0x8D, 0x96, 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, /* 0x20-0x23 */ + 0xE1, 0xC5, 0x8D, 0x9A, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x24-0x27 */ + 0x8D, 0x9B, 0xB1, 0xC0, 0x8D, 0x9C, 0x8D, 0x9D, /* 0x28-0x2B */ + 0x8D, 0x9E, 0xD5, 0xB8, 0xE1, 0xC4, 0x8D, 0x9F, /* 0x2C-0x2F */ + 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, 0x8D, 0xA3, /* 0x30-0x33 */ + 0xE1, 0xCB, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x34-0x37 */ + 0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x38-0x3B */ + 0x8D, 0xAB, 0xE1, 0xCC, 0xE1, 0xCA, 0x8D, 0xAC, /* 0x3C-0x3F */ + 0x8D, 0xAD, 0x8D, 0xAE, 0x8D, 0xAF, 0x8D, 0xB0, /* 0x40-0x43 */ + 0x8D, 0xB1, 0x8D, 0xB2, 0x8D, 0xB3, 0xEF, 0xFA, /* 0x44-0x47 */ + 0x8D, 0xB4, 0x8D, 0xB5, 0xE1, 0xD3, 0xE1, 0xD2, /* 0x48-0x4B */ + 0xC7, 0xB6, 0x8D, 0xB6, 0x8D, 0xB7, 0x8D, 0xB8, /* 0x4C-0x4F */ + 0x8D, 0xB9, 0x8D, 0xBA, 0x8D, 0xBB, 0x8D, 0xBC, /* 0x50-0x53 */ + 0x8D, 0xBD, 0x8D, 0xBE, 0x8D, 0xBF, 0x8D, 0xC0, /* 0x54-0x57 */ + 0xE1, 0xC9, 0x8D, 0xC1, 0x8D, 0xC2, 0xE1, 0xCE, /* 0x58-0x5B */ + 0x8D, 0xC3, 0xE1, 0xD0, 0x8D, 0xC4, 0x8D, 0xC5, /* 0x5C-0x5F */ + 0x8D, 0xC6, 0x8D, 0xC7, 0x8D, 0xC8, 0x8D, 0xC9, /* 0x60-0x63 */ + 0x8D, 0xCA, 0x8D, 0xCB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x64-0x67 */ + 0x8D, 0xCE, 0xE1, 0xD4, 0x8D, 0xCF, 0xE1, 0xD1, /* 0x68-0x6B */ + 0xE1, 0xCD, 0x8D, 0xD0, 0x8D, 0xD1, 0xE1, 0xCF, /* 0x6C-0x6F */ + 0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x70-0x73 */ + 0xE1, 0xD5, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x74-0x77 */ + 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, 0x8D, 0xDC, /* 0x78-0x7B */ + 0x8D, 0xDD, 0x8D, 0xDE, 0x8D, 0xDF, 0x8D, 0xE0, /* 0x7C-0x7F */ + + 0x8D, 0xE1, 0x8D, 0xE2, 0xE1, 0xD6, 0x8D, 0xE3, /* 0x80-0x83 */ + 0x8D, 0xE4, 0x8D, 0xE5, 0x8D, 0xE6, 0x8D, 0xE7, /* 0x84-0x87 */ + 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, 0x8D, 0xEB, /* 0x88-0x8B */ + 0x8D, 0xEC, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x8C-0x8F */ + 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, 0x8D, 0xF3, /* 0x90-0x93 */ + 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, 0x8D, 0xF7, /* 0x94-0x97 */ + 0x8D, 0xF8, 0xE1, 0xD7, 0x8D, 0xF9, 0x8D, 0xFA, /* 0x98-0x9B */ + 0x8D, 0xFB, 0xE1, 0xD8, 0x8D, 0xFC, 0x8D, 0xFD, /* 0x9C-0x9F */ + 0x8D, 0xFE, 0x8E, 0x40, 0x8E, 0x41, 0x8E, 0x42, /* 0xA0-0xA3 */ + 0x8E, 0x43, 0x8E, 0x44, 0x8E, 0x45, 0x8E, 0x46, /* 0xA4-0xA7 */ + 0x8E, 0x47, 0x8E, 0x48, 0x8E, 0x49, 0x8E, 0x4A, /* 0xA8-0xAB */ + 0x8E, 0x4B, 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, /* 0xAC-0xAF */ + 0x8E, 0x4F, 0x8E, 0x50, 0x8E, 0x51, 0x8E, 0x52, /* 0xB0-0xB3 */ + 0x8E, 0x53, 0x8E, 0x54, 0x8E, 0x55, 0xE1, 0xDA, /* 0xB4-0xB7 */ + 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, 0x8E, 0x59, /* 0xB8-0xBB */ + 0x8E, 0x5A, 0x8E, 0x5B, 0x8E, 0x5C, 0x8E, 0x5D, /* 0xBC-0xBF */ + 0x8E, 0x5E, 0x8E, 0x5F, 0x8E, 0x60, 0x8E, 0x61, /* 0xC0-0xC3 */ + 0x8E, 0x62, 0xE1, 0xDB, 0x8E, 0x63, 0x8E, 0x64, /* 0xC4-0xC7 */ + 0x8E, 0x65, 0x8E, 0x66, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */ + 0x8E, 0x69, 0xCE, 0xA1, 0x8E, 0x6A, 0x8E, 0x6B, /* 0xCC-0xCF */ + 0x8E, 0x6C, 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, /* 0xD0-0xD3 */ + 0x8E, 0x70, 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, /* 0xD4-0xD7 */ + 0x8E, 0x74, 0x8E, 0x75, 0x8E, 0x76, 0xE7, 0xDD, /* 0xD8-0xDB */ + 0x8E, 0x77, 0xB4, 0xA8, 0xD6, 0xDD, 0x8E, 0x78, /* 0xDC-0xDF */ + 0x8E, 0x79, 0xD1, 0xB2, 0xB3, 0xB2, 0x8E, 0x7A, /* 0xE0-0xE3 */ + 0x8E, 0x7B, 0xB9, 0xA4, 0xD7, 0xF3, 0xC7, 0xC9, /* 0xE4-0xE7 */ + 0xBE, 
0xDE, 0xB9, 0xAE, 0x8E, 0x7C, 0xCE, 0xD7, /* 0xE8-0xEB */ + 0x8E, 0x7D, 0x8E, 0x7E, 0xB2, 0xEE, 0xDB, 0xCF, /* 0xEC-0xEF */ + 0x8E, 0x80, 0xBC, 0xBA, 0xD2, 0xD1, 0xCB, 0xC8, /* 0xF0-0xF3 */ + 0xB0, 0xCD, 0x8E, 0x81, 0x8E, 0x82, 0xCF, 0xEF, /* 0xF4-0xF7 */ + 0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xF8-0xFB */ + 0x8E, 0x87, 0xD9, 0xE3, 0xBD, 0xED, 0x8E, 0x88, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x8E, 0x89, 0xB1, 0xD2, 0xCA, 0xD0, 0xB2, 0xBC, /* 0x00-0x03 */ + 0x8E, 0x8A, 0xCB, 0xA7, 0xB7, 0xAB, 0x8E, 0x8B, /* 0x04-0x07 */ + 0xCA, 0xA6, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0x08-0x0B */ + 0xCF, 0xA3, 0x8E, 0x8F, 0x8E, 0x90, 0xE0, 0xF8, /* 0x0C-0x0F */ + 0xD5, 0xCA, 0xE0, 0xFB, 0x8E, 0x91, 0x8E, 0x92, /* 0x10-0x13 */ + 0xE0, 0xFA, 0xC5, 0xC1, 0xCC, 0xFB, 0x8E, 0x93, /* 0x14-0x17 */ + 0xC1, 0xB1, 0xE0, 0xF9, 0xD6, 0xE3, 0xB2, 0xAF, /* 0x18-0x1B */ + 0xD6, 0xC4, 0xB5, 0xDB, 0x8E, 0x94, 0x8E, 0x95, /* 0x1C-0x1F */ + 0x8E, 0x96, 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, /* 0x20-0x23 */ + 0x8E, 0x9A, 0x8E, 0x9B, 0xB4, 0xF8, 0xD6, 0xA1, /* 0x24-0x27 */ + 0x8E, 0x9C, 0x8E, 0x9D, 0x8E, 0x9E, 0x8E, 0x9F, /* 0x28-0x2B */ + 0x8E, 0xA0, 0xCF, 0xAF, 0xB0, 0xEF, 0x8E, 0xA1, /* 0x2C-0x2F */ + 0x8E, 0xA2, 0xE0, 0xFC, 0x8E, 0xA3, 0x8E, 0xA4, /* 0x30-0x33 */ + 0x8E, 0xA5, 0x8E, 0xA6, 0x8E, 0xA7, 0xE1, 0xA1, /* 0x34-0x37 */ + 0xB3, 0xA3, 0x8E, 0xA8, 0x8E, 0xA9, 0xE0, 0xFD, /* 0x38-0x3B */ + 0xE0, 0xFE, 0xC3, 0xB1, 0x8E, 0xAA, 0x8E, 0xAB, /* 0x3C-0x3F */ + 0x8E, 0xAC, 0x8E, 0xAD, 0xC3, 0xDD, 0x8E, 0xAE, /* 0x40-0x43 */ + 0xE1, 0xA2, 0xB7, 0xF9, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x44-0x47 */ + 0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x48-0x4B */ + 0xBB, 0xCF, 0x8E, 0xB5, 0x8E, 0xB6, 0x8E, 0xB7, /* 0x4C-0x4F */ + 0x8E, 0xB8, 0x8E, 0xB9, 0x8E, 0xBA, 0x8E, 0xBB, /* 0x50-0x53 */ + 0xE1, 0xA3, 0xC4, 0xBB, 0x8E, 0xBC, 0x8E, 0xBD, /* 0x54-0x57 */ + 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, 0xE1, 0xA4, /* 0x58-0x5B */ + 0x8E, 0xC1, 0x8E, 0xC2, 0xE1, 0xA5, 0x8E, 0xC3, /* 0x5C-0x5F */ + 0x8E, 0xC4, 0xE1, 0xA6, 0xB4, 0xB1, 0x8E, 0xC5, /* 0x60-0x63 */ + 0x8E, 0xC6, 0x8E, 0xC7, 0x8E, 0xC8, 0x8E, 0xC9, /* 0x64-0x67 */ + 0x8E, 0xCA, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x68-0x6B */ + 0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x6C-0x6F */ + 0x8E, 0xD2, 0x8E, 0xD3, 0xB8, 0xC9, 0xC6, 0xBD, /* 0x70-0x73 */ + 0xC4, 0xEA, 0x8E, 0xD4, 0xB2, 0xA2, 0x8E, 0xD5, /* 0x74-0x77 */ + 0xD0, 0xD2, 0x8E, 0xD6, 0xE7, 0xDB, 0xBB, 0xC3, /* 0x78-0x7B */ + 0xD3, 0xD7, 0xD3, 0xC4, 0x8E, 0xD7, 0xB9, 0xE3, /* 0x7C-0x7F */ + + 0xE2, 0xCF, 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, /* 0x80-0x83 */ + 0xD7, 0xAF, 0x8E, 0xDB, 0xC7, 0xEC, 0xB1, 0xD3, /* 0x84-0x87 */ + 0x8E, 0xDC, 0x8E, 0xDD, 0xB4, 0xB2, 0xE2, 0xD1, /* 0x88-0x8B */ + 0x8E, 0xDE, 0x8E, 0xDF, 0x8E, 0xE0, 0xD0, 0xF2, /* 0x8C-0x8F */ + 0xC2, 0xAE, 0xE2, 0xD0, 0x8E, 0xE1, 0xBF, 0xE2, /* 0x90-0x93 */ + 0xD3, 0xA6, 0xB5, 0xD7, 0xE2, 0xD2, 0xB5, 0xEA, /* 0x94-0x97 */ + 0x8E, 0xE2, 0xC3, 0xED, 0xB8, 0xFD, 0x8E, 0xE3, /* 0x98-0x9B */ + 0xB8, 0xAE, 0x8E, 0xE4, 0xC5, 0xD3, 0xB7, 0xCF, /* 0x9C-0x9F */ + 0xE2, 0xD4, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0xA0-0xA3 */ + 0x8E, 0xE8, 0xE2, 0xD3, 0xB6, 0xC8, 0xD7, 0xF9, /* 0xA4-0xA7 */ + 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, 0x8E, 0xEC, /* 0xA8-0xAB */ + 0x8E, 0xED, 0xCD, 0xA5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0xAC-0xAF */ + 0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0xE2, 0xD8, /* 0xB0-0xB3 */ + 0x8E, 0xF3, 0xE2, 0xD6, 0xCA, 0xFC, 0xBF, 0xB5, /* 0xB4-0xB7 */ + 0xD3, 0xB9, 0xE2, 0xD5, 0x8E, 0xF4, 0x8E, 0xF5, /* 0xB8-0xBB */ + 0x8E, 
0xF6, 0x8E, 0xF7, 0xE2, 0xD7, 0x8E, 0xF8, /* 0xBC-0xBF */ + 0x8E, 0xF9, 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, /* 0xC0-0xC3 */ + 0x8E, 0xFD, 0x8E, 0xFE, 0x8F, 0x40, 0x8F, 0x41, /* 0xC4-0xC7 */ + 0x8F, 0x42, 0xC1, 0xAE, 0xC0, 0xC8, 0x8F, 0x43, /* 0xC8-0xCB */ + 0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0xCC-0xCF */ + 0x8F, 0x48, 0xE2, 0xDB, 0xE2, 0xDA, 0xC0, 0xAA, /* 0xD0-0xD3 */ + 0x8F, 0x49, 0x8F, 0x4A, 0xC1, 0xCE, 0x8F, 0x4B, /* 0xD4-0xD7 */ + 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, 0xE2, 0xDC, /* 0xD8-0xDB */ + 0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0xDC-0xDF */ + 0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0xE0-0xE3 */ + 0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0xE4-0xE7 */ + 0xE2, 0xDD, 0x8F, 0x5B, 0xE2, 0xDE, 0x8F, 0x5C, /* 0xE8-0xEB */ + 0x8F, 0x5D, 0x8F, 0x5E, 0x8F, 0x5F, 0x8F, 0x60, /* 0xEC-0xEF */ + 0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xF0-0xF3 */ + 0xDB, 0xC8, 0x8F, 0x65, 0xD1, 0xD3, 0xCD, 0xA2, /* 0xF4-0xF7 */ + 0x8F, 0x66, 0x8F, 0x67, 0xBD, 0xA8, 0x8F, 0x68, /* 0xF8-0xFB */ + 0x8F, 0x69, 0x8F, 0x6A, 0xDE, 0xC3, 0xD8, 0xA5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0xBF, 0xAA, 0xDB, 0xCD, 0xD2, 0xEC, 0xC6, 0xFA, /* 0x00-0x03 */ + 0xC5, 0xAA, 0x8F, 0x6B, 0x8F, 0x6C, 0x8F, 0x6D, /* 0x04-0x07 */ + 0xDE, 0xC4, 0x8F, 0x6E, 0xB1, 0xD7, 0xDF, 0xAE, /* 0x08-0x0B */ + 0x8F, 0x6F, 0x8F, 0x70, 0x8F, 0x71, 0xCA, 0xBD, /* 0x0C-0x0F */ + 0x8F, 0x72, 0xDF, 0xB1, 0x8F, 0x73, 0xB9, 0xAD, /* 0x10-0x13 */ + 0x8F, 0x74, 0xD2, 0xFD, 0x8F, 0x75, 0xB8, 0xA5, /* 0x14-0x17 */ + 0xBA, 0xEB, 0x8F, 0x76, 0x8F, 0x77, 0xB3, 0xDA, /* 0x18-0x1B */ + 0x8F, 0x78, 0x8F, 0x79, 0x8F, 0x7A, 0xB5, 0xDC, /* 0x1C-0x1F */ + 0xD5, 0xC5, 0x8F, 0x7B, 0x8F, 0x7C, 0x8F, 0x7D, /* 0x20-0x23 */ + 0x8F, 0x7E, 0xC3, 0xD6, 0xCF, 0xD2, 0xBB, 0xA1, /* 0x24-0x27 */ + 0x8F, 0x80, 0xE5, 0xF3, 0xE5, 0xF2, 0x8F, 0x81, /* 0x28-0x2B */ + 0x8F, 0x82, 0xE5, 0xF4, 0x8F, 0x83, 0xCD, 0xE4, /* 0x2C-0x2F */ + 0x8F, 0x84, 0xC8, 0xF5, 0x8F, 0x85, 0x8F, 0x86, /* 0x30-0x33 */ + 0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0x34-0x37 */ + 0x8F, 0x8B, 0xB5, 0xAF, 0xC7, 0xBF, 0x8F, 0x8C, /* 0x38-0x3B */ + 0xE5, 0xF6, 0x8F, 0x8D, 0x8F, 0x8E, 0x8F, 0x8F, /* 0x3C-0x3F */ + 0xEC, 0xB0, 0x8F, 0x90, 0x8F, 0x91, 0x8F, 0x92, /* 0x40-0x43 */ + 0x8F, 0x93, 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, /* 0x44-0x47 */ + 0x8F, 0x97, 0x8F, 0x98, 0x8F, 0x99, 0x8F, 0x9A, /* 0x48-0x4B */ + 0x8F, 0x9B, 0x8F, 0x9C, 0x8F, 0x9D, 0x8F, 0x9E, /* 0x4C-0x4F */ + 0xE5, 0xE6, 0x8F, 0x9F, 0xB9, 0xE9, 0xB5, 0xB1, /* 0x50-0x53 */ + 0x8F, 0xA0, 0xC2, 0xBC, 0xE5, 0xE8, 0xE5, 0xE7, /* 0x54-0x57 */ + 0xE5, 0xE9, 0x8F, 0xA1, 0x8F, 0xA2, 0x8F, 0xA3, /* 0x58-0x5B */ + 0x8F, 0xA4, 0xD2, 0xCD, 0x8F, 0xA5, 0x8F, 0xA6, /* 0x5C-0x5F */ + 0x8F, 0xA7, 0xE1, 0xEA, 0xD0, 0xCE, 0x8F, 0xA8, /* 0x60-0x63 */ + 0xCD, 0xAE, 0x8F, 0xA9, 0xD1, 0xE5, 0x8F, 0xAA, /* 0x64-0x67 */ + 0x8F, 0xAB, 0xB2, 0xCA, 0xB1, 0xEB, 0x8F, 0xAC, /* 0x68-0x6B */ + 0xB1, 0xF2, 0xC5, 0xED, 0x8F, 0xAD, 0x8F, 0xAE, /* 0x6C-0x6F */ + 0xD5, 0xC3, 0xD3, 0xB0, 0x8F, 0xAF, 0xE1, 0xDC, /* 0x70-0x73 */ + 0x8F, 0xB0, 0x8F, 0xB1, 0x8F, 0xB2, 0xE1, 0xDD, /* 0x74-0x77 */ + 0x8F, 0xB3, 0xD2, 0xDB, 0x8F, 0xB4, 0xB3, 0xB9, /* 0x78-0x7B */ + 0xB1, 0xCB, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x7C-0x7F */ + + 0xCD, 0xF9, 0xD5, 0xF7, 0xE1, 0xDE, 0x8F, 0xB8, /* 0x80-0x83 */ + 0xBE, 0xB6, 0xB4, 0xFD, 0x8F, 0xB9, 0xE1, 0xDF, /* 0x84-0x87 */ + 0xBA, 0xDC, 0xE1, 0xE0, 0xBB, 0xB2, 0xC2, 0xC9, /* 0x88-0x8B */ + 0xE1, 0xE1, 0x8F, 0xBA, 0x8F, 0xBB, 0x8F, 0xBC, /* 0x8C-0x8F */ + 0xD0, 
0xEC, 0x8F, 0xBD, 0xCD, 0xBD, 0x8F, 0xBE, /* 0x90-0x93 */ + 0x8F, 0xBF, 0xE1, 0xE2, 0x8F, 0xC0, 0xB5, 0xC3, /* 0x94-0x97 */ + 0xC5, 0xC7, 0xE1, 0xE3, 0x8F, 0xC1, 0x8F, 0xC2, /* 0x98-0x9B */ + 0xE1, 0xE4, 0x8F, 0xC3, 0x8F, 0xC4, 0x8F, 0xC5, /* 0x9C-0x9F */ + 0x8F, 0xC6, 0xD3, 0xF9, 0x8F, 0xC7, 0x8F, 0xC8, /* 0xA0-0xA3 */ + 0x8F, 0xC9, 0x8F, 0xCA, 0x8F, 0xCB, 0x8F, 0xCC, /* 0xA4-0xA7 */ + 0xE1, 0xE5, 0x8F, 0xCD, 0xD1, 0xAD, 0x8F, 0xCE, /* 0xA8-0xAB */ + 0x8F, 0xCF, 0xE1, 0xE6, 0xCE, 0xA2, 0x8F, 0xD0, /* 0xAC-0xAF */ + 0x8F, 0xD1, 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, /* 0xB0-0xB3 */ + 0x8F, 0xD5, 0xE1, 0xE7, 0x8F, 0xD6, 0xB5, 0xC2, /* 0xB4-0xB7 */ + 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, 0x8F, 0xDA, /* 0xB8-0xBB */ + 0xE1, 0xE8, 0xBB, 0xD5, 0x8F, 0xDB, 0x8F, 0xDC, /* 0xBC-0xBF */ + 0x8F, 0xDD, 0x8F, 0xDE, 0x8F, 0xDF, 0xD0, 0xC4, /* 0xC0-0xC3 */ + 0xE2, 0xE0, 0xB1, 0xD8, 0xD2, 0xE4, 0x8F, 0xE0, /* 0xC4-0xC7 */ + 0x8F, 0xE1, 0xE2, 0xE1, 0x8F, 0xE2, 0x8F, 0xE3, /* 0xC8-0xCB */ + 0xBC, 0xC9, 0xC8, 0xCC, 0x8F, 0xE4, 0xE2, 0xE3, /* 0xCC-0xCF */ + 0xEC, 0xFE, 0xEC, 0xFD, 0xDF, 0xAF, 0x8F, 0xE5, /* 0xD0-0xD3 */ + 0x8F, 0xE6, 0x8F, 0xE7, 0xE2, 0xE2, 0xD6, 0xBE, /* 0xD4-0xD7 */ + 0xCD, 0xFC, 0xC3, 0xA6, 0x8F, 0xE8, 0x8F, 0xE9, /* 0xD8-0xDB */ + 0x8F, 0xEA, 0xE3, 0xC3, 0x8F, 0xEB, 0x8F, 0xEC, /* 0xDC-0xDF */ + 0xD6, 0xD2, 0xE2, 0xE7, 0x8F, 0xED, 0x8F, 0xEE, /* 0xE0-0xE3 */ + 0xE2, 0xE8, 0x8F, 0xEF, 0x8F, 0xF0, 0xD3, 0xC7, /* 0xE4-0xE7 */ + 0x8F, 0xF1, 0x8F, 0xF2, 0xE2, 0xEC, 0xBF, 0xEC, /* 0xE8-0xEB */ + 0x8F, 0xF3, 0xE2, 0xED, 0xE2, 0xE5, 0x8F, 0xF4, /* 0xEC-0xEF */ + 0x8F, 0xF5, 0xB3, 0xC0, 0x8F, 0xF6, 0x8F, 0xF7, /* 0xF0-0xF3 */ + 0x8F, 0xF8, 0xC4, 0xEE, 0x8F, 0xF9, 0x8F, 0xFA, /* 0xF4-0xF7 */ + 0xE2, 0xEE, 0x8F, 0xFB, 0x8F, 0xFC, 0xD0, 0xC3, /* 0xF8-0xFB */ + 0x8F, 0xFD, 0xBA, 0xF6, 0xE2, 0xE9, 0xB7, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0xBB, 0xB3, 0xCC, 0xAC, 0xCB, 0xCB, 0xE2, 0xE4, /* 0x00-0x03 */ + 0xE2, 0xE6, 0xE2, 0xEA, 0xE2, 0xEB, 0x8F, 0xFE, /* 0x04-0x07 */ + 0x90, 0x40, 0x90, 0x41, 0xE2, 0xF7, 0x90, 0x42, /* 0x08-0x0B */ + 0x90, 0x43, 0xE2, 0xF4, 0xD4, 0xF5, 0xE2, 0xF3, /* 0x0C-0x0F */ + 0x90, 0x44, 0x90, 0x45, 0xC5, 0xAD, 0x90, 0x46, /* 0x10-0x13 */ + 0xD5, 0xFA, 0xC5, 0xC2, 0xB2, 0xC0, 0x90, 0x47, /* 0x14-0x17 */ + 0x90, 0x48, 0xE2, 0xEF, 0x90, 0x49, 0xE2, 0xF2, /* 0x18-0x1B */ + 0xC1, 0xAF, 0xCB, 0xBC, 0x90, 0x4A, 0x90, 0x4B, /* 0x1C-0x1F */ + 0xB5, 0xA1, 0xE2, 0xF9, 0x90, 0x4C, 0x90, 0x4D, /* 0x20-0x23 */ + 0x90, 0x4E, 0xBC, 0xB1, 0xE2, 0xF1, 0xD0, 0xD4, /* 0x24-0x27 */ + 0xD4, 0xB9, 0xE2, 0xF5, 0xB9, 0xD6, 0xE2, 0xF6, /* 0x28-0x2B */ + 0x90, 0x4F, 0x90, 0x50, 0x90, 0x51, 0xC7, 0xD3, /* 0x2C-0x2F */ + 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, 0x90, 0x55, /* 0x30-0x33 */ + 0x90, 0x56, 0xE2, 0xF0, 0x90, 0x57, 0x90, 0x58, /* 0x34-0x37 */ + 0x90, 0x59, 0x90, 0x5A, 0x90, 0x5B, 0xD7, 0xDC, /* 0x38-0x3B */ + 0xED, 0xA1, 0x90, 0x5C, 0x90, 0x5D, 0xE2, 0xF8, /* 0x3C-0x3F */ + 0x90, 0x5E, 0xED, 0xA5, 0xE2, 0xFE, 0xCA, 0xD1, /* 0x40-0x43 */ + 0x90, 0x5F, 0x90, 0x60, 0x90, 0x61, 0x90, 0x62, /* 0x44-0x47 */ + 0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0xC1, 0xB5, /* 0x48-0x4B */ + 0x90, 0x66, 0xBB, 0xD0, 0x90, 0x67, 0x90, 0x68, /* 0x4C-0x4F */ + 0xBF, 0xD6, 0x90, 0x69, 0xBA, 0xE3, 0x90, 0x6A, /* 0x50-0x53 */ + 0x90, 0x6B, 0xCB, 0xA1, 0x90, 0x6C, 0x90, 0x6D, /* 0x54-0x57 */ + 0x90, 0x6E, 0xED, 0xA6, 0xED, 0xA3, 0x90, 0x6F, /* 0x58-0x5B */ + 0x90, 0x70, 0xED, 0xA2, 0x90, 0x71, 0x90, 0x72, /* 0x5C-0x5F */ + 0x90, 0x73, 0x90, 0x74, 0xBB, 0xD6, 0xED, 0xA7, /* 0x60-0x63 */ + 0xD0, 0xF4, 
0x90, 0x75, 0x90, 0x76, 0xED, 0xA4, /* 0x64-0x67 */ + 0xBA, 0xDE, 0xB6, 0xF7, 0xE3, 0xA1, 0xB6, 0xB2, /* 0x68-0x6B */ + 0xCC, 0xF1, 0xB9, 0xA7, 0x90, 0x77, 0xCF, 0xA2, /* 0x6C-0x6F */ + 0xC7, 0xA1, 0x90, 0x78, 0x90, 0x79, 0xBF, 0xD2, /* 0x70-0x73 */ + 0x90, 0x7A, 0x90, 0x7B, 0xB6, 0xF1, 0x90, 0x7C, /* 0x74-0x77 */ + 0xE2, 0xFA, 0xE2, 0xFB, 0xE2, 0xFD, 0xE2, 0xFC, /* 0x78-0x7B */ + 0xC4, 0xD5, 0xE3, 0xA2, 0x90, 0x7D, 0xD3, 0xC1, /* 0x7C-0x7F */ + + 0x90, 0x7E, 0x90, 0x80, 0x90, 0x81, 0xE3, 0xA7, /* 0x80-0x83 */ + 0xC7, 0xC4, 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, /* 0x84-0x87 */ + 0x90, 0x85, 0xCF, 0xA4, 0x90, 0x86, 0x90, 0x87, /* 0x88-0x8B */ + 0xE3, 0xA9, 0xBA, 0xB7, 0x90, 0x88, 0x90, 0x89, /* 0x8C-0x8F */ + 0x90, 0x8A, 0x90, 0x8B, 0xE3, 0xA8, 0x90, 0x8C, /* 0x90-0x93 */ + 0xBB, 0xDA, 0x90, 0x8D, 0xE3, 0xA3, 0x90, 0x8E, /* 0x94-0x97 */ + 0x90, 0x8F, 0x90, 0x90, 0xE3, 0xA4, 0xE3, 0xAA, /* 0x98-0x9B */ + 0x90, 0x91, 0xE3, 0xA6, 0x90, 0x92, 0xCE, 0xF2, /* 0x9C-0x9F */ + 0xD3, 0xC6, 0x90, 0x93, 0x90, 0x94, 0xBB, 0xBC, /* 0xA0-0xA3 */ + 0x90, 0x95, 0x90, 0x96, 0xD4, 0xC3, 0x90, 0x97, /* 0xA4-0xA7 */ + 0xC4, 0xFA, 0x90, 0x98, 0x90, 0x99, 0xED, 0xA8, /* 0xA8-0xAB */ + 0xD0, 0xFC, 0xE3, 0xA5, 0x90, 0x9A, 0xC3, 0xF5, /* 0xAC-0xAF */ + 0x90, 0x9B, 0xE3, 0xAD, 0xB1, 0xAF, 0x90, 0x9C, /* 0xB0-0xB3 */ + 0xE3, 0xB2, 0x90, 0x9D, 0x90, 0x9E, 0x90, 0x9F, /* 0xB4-0xB7 */ + 0xBC, 0xC2, 0x90, 0xA0, 0x90, 0xA1, 0xE3, 0xAC, /* 0xB8-0xBB */ + 0xB5, 0xBF, 0x90, 0xA2, 0x90, 0xA3, 0x90, 0xA4, /* 0xBC-0xBF */ + 0x90, 0xA5, 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, /* 0xC0-0xC3 */ + 0x90, 0xA9, 0xC7, 0xE9, 0xE3, 0xB0, 0x90, 0xAA, /* 0xC4-0xC7 */ + 0x90, 0xAB, 0x90, 0xAC, 0xBE, 0xAA, 0xCD, 0xEF, /* 0xC8-0xCB */ + 0x90, 0xAD, 0x90, 0xAE, 0x90, 0xAF, 0x90, 0xB0, /* 0xCC-0xCF */ + 0x90, 0xB1, 0xBB, 0xF3, 0x90, 0xB2, 0x90, 0xB3, /* 0xD0-0xD3 */ + 0x90, 0xB4, 0xCC, 0xE8, 0x90, 0xB5, 0x90, 0xB6, /* 0xD4-0xD7 */ + 0xE3, 0xAF, 0x90, 0xB7, 0xE3, 0xB1, 0x90, 0xB8, /* 0xD8-0xDB */ + 0xCF, 0xA7, 0xE3, 0xAE, 0x90, 0xB9, 0xCE, 0xA9, /* 0xDC-0xDF */ + 0xBB, 0xDD, 0x90, 0xBA, 0x90, 0xBB, 0x90, 0xBC, /* 0xE0-0xE3 */ + 0x90, 0xBD, 0x90, 0xBE, 0xB5, 0xEB, 0xBE, 0xE5, /* 0xE4-0xE7 */ + 0xB2, 0xD2, 0xB3, 0xCD, 0x90, 0xBF, 0xB1, 0xB9, /* 0xE8-0xEB */ + 0xE3, 0xAB, 0xB2, 0xD1, 0xB5, 0xAC, 0xB9, 0xDF, /* 0xEC-0xEF */ + 0xB6, 0xE8, 0x90, 0xC0, 0x90, 0xC1, 0xCF, 0xEB, /* 0xF0-0xF3 */ + 0xE3, 0xB7, 0x90, 0xC2, 0xBB, 0xCC, 0x90, 0xC3, /* 0xF4-0xF7 */ + 0x90, 0xC4, 0xC8, 0xC7, 0xD0, 0xCA, 0x90, 0xC5, /* 0xF8-0xFB */ + 0x90, 0xC6, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_61[512] = { + 0xE3, 0xB8, 0xB3, 0xEE, 0x90, 0xCA, 0x90, 0xCB, /* 0x00-0x03 */ + 0x90, 0xCC, 0x90, 0xCD, 0xED, 0xA9, 0x90, 0xCE, /* 0x04-0x07 */ + 0xD3, 0xFA, 0xD3, 0xE4, 0x90, 0xCF, 0x90, 0xD0, /* 0x08-0x0B */ + 0x90, 0xD1, 0xED, 0xAA, 0xE3, 0xB9, 0xD2, 0xE2, /* 0x0C-0x0F */ + 0x90, 0xD2, 0x90, 0xD3, 0x90, 0xD4, 0x90, 0xD5, /* 0x10-0x13 */ + 0x90, 0xD6, 0xE3, 0xB5, 0x90, 0xD7, 0x90, 0xD8, /* 0x14-0x17 */ + 0x90, 0xD9, 0x90, 0xDA, 0xD3, 0xDE, 0x90, 0xDB, /* 0x18-0x1B */ + 0x90, 0xDC, 0x90, 0xDD, 0x90, 0xDE, 0xB8, 0xD0, /* 0x1C-0x1F */ + 0xE3, 0xB3, 0x90, 0xDF, 0x90, 0xE0, 0xE3, 0xB6, /* 0x20-0x23 */ + 0xB7, 0xDF, 0x90, 0xE1, 0xE3, 0xB4, 0xC0, 0xA2, /* 0x24-0x27 */ + 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, 0xE3, 0xBA, /* 0x28-0x2B */ + 0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x2C-0x2F */ + 0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x30-0x33 */ + 0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x34-0x37 */ + 0x90, 0xF1, 
0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x38-0x3B */ + 0x90, 0xF5, 0x90, 0xF6, 0x90, 0xF7, 0xD4, 0xB8, /* 0x3C-0x3F */ + 0x90, 0xF8, 0x90, 0xF9, 0x90, 0xFA, 0x90, 0xFB, /* 0x40-0x43 */ + 0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x40, /* 0x44-0x47 */ + 0xB4, 0xC8, 0x91, 0x41, 0xE3, 0xBB, 0x91, 0x42, /* 0x48-0x4B */ + 0xBB, 0xC5, 0x91, 0x43, 0xC9, 0xF7, 0x91, 0x44, /* 0x4C-0x4F */ + 0x91, 0x45, 0xC9, 0xE5, 0x91, 0x46, 0x91, 0x47, /* 0x50-0x53 */ + 0x91, 0x48, 0xC4, 0xBD, 0x91, 0x49, 0x91, 0x4A, /* 0x54-0x57 */ + 0x91, 0x4B, 0x91, 0x4C, 0x91, 0x4D, 0x91, 0x4E, /* 0x58-0x5B */ + 0x91, 0x4F, 0xED, 0xAB, 0x91, 0x50, 0x91, 0x51, /* 0x5C-0x5F */ + 0x91, 0x52, 0x91, 0x53, 0xC2, 0xFD, 0x91, 0x54, /* 0x60-0x63 */ + 0x91, 0x55, 0x91, 0x56, 0x91, 0x57, 0xBB, 0xDB, /* 0x64-0x67 */ + 0xBF, 0xAE, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x68-0x6B */ + 0x91, 0x5B, 0x91, 0x5C, 0x91, 0x5D, 0x91, 0x5E, /* 0x6C-0x6F */ + 0xCE, 0xBF, 0x91, 0x5F, 0x91, 0x60, 0x91, 0x61, /* 0x70-0x73 */ + 0x91, 0x62, 0xE3, 0xBC, 0x91, 0x63, 0xBF, 0xB6, /* 0x74-0x77 */ + 0x91, 0x64, 0x91, 0x65, 0x91, 0x66, 0x91, 0x67, /* 0x78-0x7B */ + 0x91, 0x68, 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, /* 0x7C-0x7F */ + + 0x91, 0x6C, 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, /* 0x80-0x83 */ + 0x91, 0x70, 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, /* 0x84-0x87 */ + 0x91, 0x74, 0x91, 0x75, 0x91, 0x76, 0xB1, 0xEF, /* 0x88-0x8B */ + 0x91, 0x77, 0x91, 0x78, 0xD4, 0xF7, 0x91, 0x79, /* 0x8C-0x8F */ + 0x91, 0x7A, 0x91, 0x7B, 0x91, 0x7C, 0x91, 0x7D, /* 0x90-0x93 */ + 0xE3, 0xBE, 0x91, 0x7E, 0x91, 0x80, 0x91, 0x81, /* 0x94-0x97 */ + 0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x98-0x9B */ + 0x91, 0x86, 0xED, 0xAD, 0x91, 0x87, 0x91, 0x88, /* 0x9C-0x9F */ + 0x91, 0x89, 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, /* 0xA0-0xA3 */ + 0x91, 0x8D, 0x91, 0x8E, 0x91, 0x8F, 0xE3, 0xBF, /* 0xA4-0xA7 */ + 0xBA, 0xA9, 0xED, 0xAC, 0x91, 0x90, 0x91, 0x91, /* 0xA8-0xAB */ + 0xE3, 0xBD, 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, /* 0xAC-0xAF */ + 0x91, 0x95, 0x91, 0x96, 0x91, 0x97, 0x91, 0x98, /* 0xB0-0xB3 */ + 0x91, 0x99, 0x91, 0x9A, 0x91, 0x9B, 0xE3, 0xC0, /* 0xB4-0xB7 */ + 0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB8-0xBB */ + 0x91, 0xA0, 0x91, 0xA1, 0xBA, 0xB6, 0x91, 0xA2, /* 0xBC-0xBF */ + 0x91, 0xA3, 0x91, 0xA4, 0xB6, 0xAE, 0x91, 0xA5, /* 0xC0-0xC3 */ + 0x91, 0xA6, 0x91, 0xA7, 0x91, 0xA8, 0x91, 0xA9, /* 0xC4-0xC7 */ + 0xD0, 0xB8, 0x91, 0xAA, 0xB0, 0xC3, 0xED, 0xAE, /* 0xC8-0xCB */ + 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, 0x91, 0xAE, /* 0xCC-0xCF */ + 0x91, 0xAF, 0xED, 0xAF, 0xC0, 0xC1, 0x91, 0xB0, /* 0xD0-0xD3 */ + 0xE3, 0xC1, 0x91, 0xB1, 0x91, 0xB2, 0x91, 0xB3, /* 0xD4-0xD7 */ + 0x91, 0xB4, 0x91, 0xB5, 0x91, 0xB6, 0x91, 0xB7, /* 0xD8-0xDB */ + 0x91, 0xB8, 0x91, 0xB9, 0x91, 0xBA, 0x91, 0xBB, /* 0xDC-0xDF */ + 0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xE0-0xE3 */ + 0x91, 0xC0, 0x91, 0xC1, 0xC5, 0xB3, 0x91, 0xC2, /* 0xE4-0xE7 */ + 0x91, 0xC3, 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, /* 0xE8-0xEB */ + 0x91, 0xC7, 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, /* 0xEC-0xEF */ + 0x91, 0xCB, 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, /* 0xF0-0xF3 */ + 0x91, 0xCF, 0xE3, 0xC2, 0x91, 0xD0, 0x91, 0xD1, /* 0xF4-0xF7 */ + 0x91, 0xD2, 0x91, 0xD3, 0x91, 0xD4, 0x91, 0xD5, /* 0xF8-0xFB */ + 0x91, 0xD6, 0x91, 0xD7, 0x91, 0xD8, 0xDC, 0xB2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, 0x91, 0xDC, /* 0x00-0x03 */ + 0x91, 0xDD, 0x91, 0xDE, 0xED, 0xB0, 0x91, 0xDF, /* 0x04-0x07 */ + 0xB8, 0xEA, 0x91, 0xE0, 0xCE, 0xEC, 0xEA, 0xA7, /* 0x08-0x0B */ + 0xD0, 0xE7, 
0xCA, 0xF9, 0xC8, 0xD6, 0xCF, 0xB7, /* 0x0C-0x0F */ + 0xB3, 0xC9, 0xCE, 0xD2, 0xBD, 0xE4, 0x91, 0xE1, /* 0x10-0x13 */ + 0x91, 0xE2, 0xE3, 0xDE, 0xBB, 0xF2, 0xEA, 0xA8, /* 0x14-0x17 */ + 0xD5, 0xBD, 0x91, 0xE3, 0xC6, 0xDD, 0xEA, 0xA9, /* 0x18-0x1B */ + 0x91, 0xE4, 0x91, 0xE5, 0x91, 0xE6, 0xEA, 0xAA, /* 0x1C-0x1F */ + 0x91, 0xE7, 0xEA, 0xAC, 0xEA, 0xAB, 0x91, 0xE8, /* 0x20-0x23 */ + 0xEA, 0xAE, 0xEA, 0xAD, 0x91, 0xE9, 0x91, 0xEA, /* 0x24-0x27 */ + 0x91, 0xEB, 0x91, 0xEC, 0xBD, 0xD8, 0x91, 0xED, /* 0x28-0x2B */ + 0xEA, 0xAF, 0x91, 0xEE, 0xC2, 0xBE, 0x91, 0xEF, /* 0x2C-0x2F */ + 0x91, 0xF0, 0x91, 0xF1, 0x91, 0xF2, 0xB4, 0xC1, /* 0x30-0x33 */ + 0xB4, 0xF7, 0x91, 0xF3, 0x91, 0xF4, 0xBB, 0xA7, /* 0x34-0x37 */ + 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, 0x91, 0xF8, /* 0x38-0x3B */ + 0x91, 0xF9, 0xEC, 0xE6, 0xEC, 0xE5, 0xB7, 0xBF, /* 0x3C-0x3F */ + 0xCB, 0xF9, 0xB1, 0xE2, 0x91, 0xFA, 0xEC, 0xE7, /* 0x40-0x43 */ + 0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0xC9, 0xC8, /* 0x44-0x47 */ + 0xEC, 0xE8, 0xEC, 0xE9, 0x91, 0xFE, 0xCA, 0xD6, /* 0x48-0x4B */ + 0xDE, 0xD0, 0xB2, 0xC5, 0xD4, 0xFA, 0x92, 0x40, /* 0x4C-0x4F */ + 0x92, 0x41, 0xC6, 0xCB, 0xB0, 0xC7, 0xB4, 0xF2, /* 0x50-0x53 */ + 0xC8, 0xD3, 0x92, 0x42, 0x92, 0x43, 0x92, 0x44, /* 0x54-0x57 */ + 0xCD, 0xD0, 0x92, 0x45, 0x92, 0x46, 0xBF, 0xB8, /* 0x58-0x5B */ + 0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x5C-0x5F */ + 0x92, 0x4B, 0x92, 0x4C, 0x92, 0x4D, 0xBF, 0xDB, /* 0x60-0x63 */ + 0x92, 0x4E, 0x92, 0x4F, 0xC7, 0xA4, 0xD6, 0xB4, /* 0x64-0x67 */ + 0x92, 0x50, 0xC0, 0xA9, 0xDE, 0xD1, 0xC9, 0xA8, /* 0x68-0x6B */ + 0xD1, 0xEF, 0xC5, 0xA4, 0xB0, 0xE7, 0xB3, 0xB6, /* 0x6C-0x6F */ + 0xC8, 0xC5, 0x92, 0x51, 0x92, 0x52, 0xB0, 0xE2, /* 0x70-0x73 */ + 0x92, 0x53, 0x92, 0x54, 0xB7, 0xF6, 0x92, 0x55, /* 0x74-0x77 */ + 0x92, 0x56, 0xC5, 0xFA, 0x92, 0x57, 0x92, 0x58, /* 0x78-0x7B */ + 0xB6, 0xF3, 0x92, 0x59, 0xD5, 0xD2, 0xB3, 0xD0, /* 0x7C-0x7F */ + + 0xBC, 0xBC, 0x92, 0x5A, 0x92, 0x5B, 0x92, 0x5C, /* 0x80-0x83 */ + 0xB3, 0xAD, 0x92, 0x5D, 0x92, 0x5E, 0x92, 0x5F, /* 0x84-0x87 */ + 0x92, 0x60, 0xBE, 0xF1, 0xB0, 0xD1, 0x92, 0x61, /* 0x88-0x8B */ + 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, 0x92, 0x65, /* 0x8C-0x8F */ + 0x92, 0x66, 0xD2, 0xD6, 0xCA, 0xE3, 0xD7, 0xA5, /* 0x90-0x93 */ + 0x92, 0x67, 0xCD, 0xB6, 0xB6, 0xB6, 0xBF, 0xB9, /* 0x94-0x97 */ + 0xD5, 0xDB, 0x92, 0x68, 0xB8, 0xA7, 0xC5, 0xD7, /* 0x98-0x9B */ + 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, 0xDE, 0xD2, /* 0x9C-0x9F */ + 0xBF, 0xD9, 0xC2, 0xD5, 0xC7, 0xC0, 0x92, 0x6C, /* 0xA0-0xA3 */ + 0xBB, 0xA4, 0xB1, 0xA8, 0x92, 0x6D, 0x92, 0x6E, /* 0xA4-0xA7 */ + 0xC5, 0xEA, 0x92, 0x6F, 0x92, 0x70, 0xC5, 0xFB, /* 0xA8-0xAB */ + 0xCC, 0xA7, 0x92, 0x71, 0x92, 0x72, 0x92, 0x73, /* 0xAC-0xAF */ + 0x92, 0x74, 0xB1, 0xA7, 0x92, 0x75, 0x92, 0x76, /* 0xB0-0xB3 */ + 0x92, 0x77, 0xB5, 0xD6, 0x92, 0x78, 0x92, 0x79, /* 0xB4-0xB7 */ + 0x92, 0x7A, 0xC4, 0xA8, 0x92, 0x7B, 0xDE, 0xD3, /* 0xB8-0xBB */ + 0xD1, 0xBA, 0xB3, 0xE9, 0x92, 0x7C, 0xC3, 0xF2, /* 0xBC-0xBF */ + 0x92, 0x7D, 0x92, 0x7E, 0xB7, 0xF7, 0x92, 0x80, /* 0xC0-0xC3 */ + 0xD6, 0xF4, 0xB5, 0xA3, 0xB2, 0xF0, 0xC4, 0xB4, /* 0xC4-0xC7 */ + 0xC4, 0xE9, 0xC0, 0xAD, 0xDE, 0xD4, 0x92, 0x81, /* 0xC8-0xCB */ + 0xB0, 0xE8, 0xC5, 0xC4, 0xC1, 0xE0, 0x92, 0x82, /* 0xCC-0xCF */ + 0xB9, 0xD5, 0x92, 0x83, 0xBE, 0xDC, 0xCD, 0xD8, /* 0xD0-0xD3 */ + 0xB0, 0xCE, 0x92, 0x84, 0xCD, 0xCF, 0xDE, 0xD6, /* 0xD4-0xD7 */ + 0xBE, 0xD0, 0xD7, 0xBE, 0xDE, 0xD5, 0xD5, 0xD0, /* 0xD8-0xDB */ + 0xB0, 0xDD, 0x92, 0x85, 0x92, 0x86, 0xC4, 0xE2, /* 0xDC-0xDF */ + 0x92, 0x87, 0x92, 0x88, 0xC2, 0xA3, 0xBC, 0xF0, /* 0xE0-0xE3 */ + 
0x92, 0x89, 0xD3, 0xB5, 0xC0, 0xB9, 0xC5, 0xA1, /* 0xE4-0xE7 */ + 0xB2, 0xA6, 0xD4, 0xF1, 0x92, 0x8A, 0x92, 0x8B, /* 0xE8-0xEB */ + 0xC0, 0xA8, 0xCA, 0xC3, 0xDE, 0xD7, 0xD5, 0xFC, /* 0xEC-0xEF */ + 0x92, 0x8C, 0xB9, 0xB0, 0x92, 0x8D, 0xC8, 0xAD, /* 0xF0-0xF3 */ + 0xCB, 0xA9, 0x92, 0x8E, 0xDE, 0xD9, 0xBF, 0xBD, /* 0xF4-0xF7 */ + 0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0xF8-0xFB */ + 0xC6, 0xB4, 0xD7, 0xA7, 0xCA, 0xB0, 0xC4, 0xC3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x92, 0x93, 0xB3, 0xD6, 0xB9, 0xD2, 0x92, 0x94, /* 0x00-0x03 */ + 0x92, 0x95, 0x92, 0x96, 0x92, 0x97, 0xD6, 0xB8, /* 0x04-0x07 */ + 0xEA, 0xFC, 0xB0, 0xB4, 0x92, 0x98, 0x92, 0x99, /* 0x08-0x0B */ + 0x92, 0x9A, 0x92, 0x9B, 0xBF, 0xE6, 0x92, 0x9C, /* 0x0C-0x0F */ + 0x92, 0x9D, 0xCC, 0xF4, 0x92, 0x9E, 0x92, 0x9F, /* 0x10-0x13 */ + 0x92, 0xA0, 0x92, 0xA1, 0xCD, 0xDA, 0x92, 0xA2, /* 0x14-0x17 */ + 0x92, 0xA3, 0x92, 0xA4, 0xD6, 0xBF, 0xC2, 0xCE, /* 0x18-0x1B */ + 0x92, 0xA5, 0xCE, 0xCE, 0xCC, 0xA2, 0xD0, 0xAE, /* 0x1C-0x1F */ + 0xC4, 0xD3, 0xB5, 0xB2, 0xDE, 0xD8, 0xD5, 0xF5, /* 0x20-0x23 */ + 0xBC, 0xB7, 0xBB, 0xD3, 0x92, 0xA6, 0x92, 0xA7, /* 0x24-0x27 */ + 0xB0, 0xA4, 0x92, 0xA8, 0xC5, 0xB2, 0xB4, 0xEC, /* 0x28-0x2B */ + 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, 0xD5, 0xF1, /* 0x2C-0x2F */ + 0x92, 0xAC, 0x92, 0xAD, 0xEA, 0xFD, 0x92, 0xAE, /* 0x30-0x33 */ + 0x92, 0xAF, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0x34-0x37 */ + 0x92, 0xB3, 0xDE, 0xDA, 0xCD, 0xA6, 0x92, 0xB4, /* 0x38-0x3B */ + 0x92, 0xB5, 0xCD, 0xEC, 0x92, 0xB6, 0x92, 0xB7, /* 0x3C-0x3F */ + 0x92, 0xB8, 0x92, 0xB9, 0xCE, 0xE6, 0xDE, 0xDC, /* 0x40-0x43 */ + 0x92, 0xBA, 0xCD, 0xB1, 0xC0, 0xA6, 0x92, 0xBB, /* 0x44-0x47 */ + 0x92, 0xBC, 0xD7, 0xBD, 0x92, 0xBD, 0xDE, 0xDB, /* 0x48-0x4B */ + 0xB0, 0xC6, 0xBA, 0xB4, 0xC9, 0xD3, 0xC4, 0xF3, /* 0x4C-0x4F */ + 0xBE, 0xE8, 0x92, 0xBE, 0x92, 0xBF, 0x92, 0xC0, /* 0x50-0x53 */ + 0x92, 0xC1, 0xB2, 0xB6, 0x92, 0xC2, 0x92, 0xC3, /* 0x54-0x57 */ + 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, 0x92, 0xC7, /* 0x58-0x5B */ + 0x92, 0xC8, 0x92, 0xC9, 0xC0, 0xCC, 0xCB, 0xF0, /* 0x5C-0x5F */ + 0x92, 0xCA, 0xBC, 0xF1, 0xBB, 0xBB, 0xB5, 0xB7, /* 0x60-0x63 */ + 0x92, 0xCB, 0x92, 0xCC, 0x92, 0xCD, 0xC5, 0xF5, /* 0x64-0x67 */ + 0x92, 0xCE, 0xDE, 0xE6, 0x92, 0xCF, 0x92, 0xD0, /* 0x68-0x6B */ + 0x92, 0xD1, 0xDE, 0xE3, 0xBE, 0xDD, 0x92, 0xD2, /* 0x6C-0x6F */ + 0x92, 0xD3, 0xDE, 0xDF, 0x92, 0xD4, 0x92, 0xD5, /* 0x70-0x73 */ + 0x92, 0xD6, 0x92, 0xD7, 0xB4, 0xB7, 0xBD, 0xDD, /* 0x74-0x77 */ + 0x92, 0xD8, 0x92, 0xD9, 0xDE, 0xE0, 0xC4, 0xED, /* 0x78-0x7B */ + 0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0x7C-0x7F */ + + 0xCF, 0xC6, 0x92, 0xDE, 0xB5, 0xE0, 0x92, 0xDF, /* 0x80-0x83 */ + 0x92, 0xE0, 0x92, 0xE1, 0x92, 0xE2, 0xB6, 0xDE, /* 0x84-0x87 */ + 0xCA, 0xDA, 0xB5, 0xF4, 0xDE, 0xE5, 0x92, 0xE3, /* 0x88-0x8B */ + 0xD5, 0xC6, 0x92, 0xE4, 0xDE, 0xE1, 0xCC, 0xCD, /* 0x8C-0x8F */ + 0xC6, 0xFE, 0x92, 0xE5, 0xC5, 0xC5, 0x92, 0xE6, /* 0x90-0x93 */ + 0x92, 0xE7, 0x92, 0xE8, 0xD2, 0xB4, 0x92, 0xE9, /* 0x94-0x97 */ + 0xBE, 0xF2, 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, /* 0x98-0x9B */ + 0x92, 0xED, 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, /* 0x9C-0x9F */ + 0xC2, 0xD3, 0x92, 0xF1, 0xCC, 0xBD, 0xB3, 0xB8, /* 0xA0-0xA3 */ + 0x92, 0xF2, 0xBD, 0xD3, 0x92, 0xF3, 0xBF, 0xD8, /* 0xA4-0xA7 */ + 0xCD, 0xC6, 0xD1, 0xDA, 0xB4, 0xEB, 0x92, 0xF4, /* 0xA8-0xAB */ + 0xDE, 0xE4, 0xDE, 0xDD, 0xDE, 0xE7, 0x92, 0xF5, /* 0xAC-0xAF */ + 0xEA, 0xFE, 0x92, 0xF6, 0x92, 0xF7, 0xC2, 0xB0, /* 0xB0-0xB3 */ + 0xDE, 0xE2, 0x92, 0xF8, 0x92, 0xF9, 0xD6, 0xC0, /* 0xB4-0xB7 */ + 
0xB5, 0xA7, 0x92, 0xFA, 0xB2, 0xF4, 0x92, 0xFB, /* 0xB8-0xBB */ + 0xDE, 0xE8, 0x92, 0xFC, 0xDE, 0xF2, 0x92, 0xFD, /* 0xBC-0xBF */ + 0x92, 0xFE, 0x93, 0x40, 0x93, 0x41, 0x93, 0x42, /* 0xC0-0xC3 */ + 0xDE, 0xED, 0x93, 0x43, 0xDE, 0xF1, 0x93, 0x44, /* 0xC4-0xC7 */ + 0x93, 0x45, 0xC8, 0xE0, 0x93, 0x46, 0x93, 0x47, /* 0xC8-0xCB */ + 0x93, 0x48, 0xD7, 0xE1, 0xDE, 0xEF, 0xC3, 0xE8, /* 0xCC-0xCF */ + 0xCC, 0xE1, 0x93, 0x49, 0xB2, 0xE5, 0x93, 0x4A, /* 0xD0-0xD3 */ + 0x93, 0x4B, 0x93, 0x4C, 0xD2, 0xBE, 0x93, 0x4D, /* 0xD4-0xD7 */ + 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, 0x93, 0x51, /* 0xD8-0xDB */ + 0x93, 0x52, 0x93, 0x53, 0xDE, 0xEE, 0x93, 0x54, /* 0xDC-0xDF */ + 0xDE, 0xEB, 0xCE, 0xD5, 0x93, 0x55, 0xB4, 0xA7, /* 0xE0-0xE3 */ + 0x93, 0x56, 0x93, 0x57, 0x93, 0x58, 0x93, 0x59, /* 0xE4-0xE7 */ + 0x93, 0x5A, 0xBF, 0xAB, 0xBE, 0xBE, 0x93, 0x5B, /* 0xE8-0xEB */ + 0x93, 0x5C, 0xBD, 0xD2, 0x93, 0x5D, 0x93, 0x5E, /* 0xEC-0xEF */ + 0x93, 0x5F, 0x93, 0x60, 0xDE, 0xE9, 0x93, 0x61, /* 0xF0-0xF3 */ + 0xD4, 0xAE, 0x93, 0x62, 0xDE, 0xDE, 0x93, 0x63, /* 0xF4-0xF7 */ + 0xDE, 0xEA, 0x93, 0x64, 0x93, 0x65, 0x93, 0x66, /* 0xF8-0xFB */ + 0x93, 0x67, 0xC0, 0xBF, 0x93, 0x68, 0xDE, 0xEC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_64[512] = { + 0xB2, 0xF3, 0xB8, 0xE9, 0xC2, 0xA7, 0x93, 0x69, /* 0x00-0x03 */ + 0x93, 0x6A, 0xBD, 0xC1, 0x93, 0x6B, 0x93, 0x6C, /* 0x04-0x07 */ + 0x93, 0x6D, 0x93, 0x6E, 0x93, 0x6F, 0xDE, 0xF5, /* 0x08-0x0B */ + 0xDE, 0xF8, 0x93, 0x70, 0x93, 0x71, 0xB2, 0xAB, /* 0x0C-0x0F */ + 0xB4, 0xA4, 0x93, 0x72, 0x93, 0x73, 0xB4, 0xEA, /* 0x10-0x13 */ + 0xC9, 0xA6, 0x93, 0x74, 0x93, 0x75, 0x93, 0x76, /* 0x14-0x17 */ + 0x93, 0x77, 0x93, 0x78, 0x93, 0x79, 0xDE, 0xF6, /* 0x18-0x1B */ + 0xCB, 0xD1, 0x93, 0x7A, 0xB8, 0xE3, 0x93, 0x7B, /* 0x1C-0x1F */ + 0xDE, 0xF7, 0xDE, 0xFA, 0x93, 0x7C, 0x93, 0x7D, /* 0x20-0x23 */ + 0x93, 0x7E, 0x93, 0x80, 0xDE, 0xF9, 0x93, 0x81, /* 0x24-0x27 */ + 0x93, 0x82, 0x93, 0x83, 0xCC, 0xC2, 0x93, 0x84, /* 0x28-0x2B */ + 0xB0, 0xE1, 0xB4, 0xEE, 0x93, 0x85, 0x93, 0x86, /* 0x2C-0x2F */ + 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, 0x93, 0x8A, /* 0x30-0x33 */ + 0xE5, 0xBA, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x34-0x37 */ + 0x93, 0x8E, 0x93, 0x8F, 0xD0, 0xAF, 0x93, 0x90, /* 0x38-0x3B */ + 0x93, 0x91, 0xB2, 0xEB, 0x93, 0x92, 0xEB, 0xA1, /* 0x3C-0x3F */ + 0x93, 0x93, 0xDE, 0xF4, 0x93, 0x94, 0x93, 0x95, /* 0x40-0x43 */ + 0xC9, 0xE3, 0xDE, 0xF3, 0xB0, 0xDA, 0xD2, 0xA1, /* 0x44-0x47 */ + 0xB1, 0xF7, 0x93, 0x96, 0xCC, 0xAF, 0x93, 0x97, /* 0x48-0x4B */ + 0x93, 0x98, 0x93, 0x99, 0x93, 0x9A, 0x93, 0x9B, /* 0x4C-0x4F */ + 0x93, 0x9C, 0x93, 0x9D, 0xDE, 0xF0, 0x93, 0x9E, /* 0x50-0x53 */ + 0xCB, 0xA4, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x54-0x57 */ + 0xD5, 0xAA, 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, /* 0x58-0x5B */ + 0x93, 0xA5, 0x93, 0xA6, 0xDE, 0xFB, 0x93, 0xA7, /* 0x5C-0x5F */ + 0x93, 0xA8, 0x93, 0xA9, 0x93, 0xAA, 0x93, 0xAB, /* 0x60-0x63 */ + 0x93, 0xAC, 0x93, 0xAD, 0x93, 0xAE, 0xB4, 0xDD, /* 0x64-0x67 */ + 0x93, 0xAF, 0xC4, 0xA6, 0x93, 0xB0, 0x93, 0xB1, /* 0x68-0x6B */ + 0x93, 0xB2, 0xDE, 0xFD, 0x93, 0xB3, 0x93, 0xB4, /* 0x6C-0x6F */ + 0x93, 0xB5, 0x93, 0xB6, 0x93, 0xB7, 0x93, 0xB8, /* 0x70-0x73 */ + 0x93, 0xB9, 0x93, 0xBA, 0x93, 0xBB, 0x93, 0xBC, /* 0x74-0x77 */ + 0xC3, 0xFE, 0xC4, 0xA1, 0xDF, 0xA1, 0x93, 0xBD, /* 0x78-0x7B */ + 0x93, 0xBE, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0x7C-0x7F */ + + 0x93, 0xC2, 0x93, 0xC3, 0xC1, 0xCC, 0x93, 0xC4, /* 0x80-0x83 */ + 0xDE, 0xFC, 0xBE, 0xEF, 0x93, 0xC5, 0xC6, 0xB2, /* 0x84-0x87 */ + 0x93, 0xC6, 0x93, 0xC7, 0x93, 0xC8, 0x93, 0xC9, /* 0x88-0x8B */ + 
0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, 0x93, 0xCD, /* 0x8C-0x8F */ + 0x93, 0xCE, 0xB3, 0xC5, 0xC8, 0xF6, 0x93, 0xCF, /* 0x90-0x93 */ + 0x93, 0xD0, 0xCB, 0xBA, 0xDE, 0xFE, 0x93, 0xD1, /* 0x94-0x97 */ + 0x93, 0xD2, 0xDF, 0xA4, 0x93, 0xD3, 0x93, 0xD4, /* 0x98-0x9B */ + 0x93, 0xD5, 0x93, 0xD6, 0xD7, 0xB2, 0x93, 0xD7, /* 0x9C-0x9F */ + 0x93, 0xD8, 0x93, 0xD9, 0x93, 0xDA, 0x93, 0xDB, /* 0xA0-0xA3 */ + 0xB3, 0xB7, 0x93, 0xDC, 0x93, 0xDD, 0x93, 0xDE, /* 0xA4-0xA7 */ + 0x93, 0xDF, 0xC1, 0xC3, 0x93, 0xE0, 0x93, 0xE1, /* 0xA8-0xAB */ + 0xC7, 0xCB, 0xB2, 0xA5, 0xB4, 0xE9, 0x93, 0xE2, /* 0xAC-0xAF */ + 0xD7, 0xAB, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xB0-0xB3 */ + 0x93, 0xE6, 0xC4, 0xEC, 0x93, 0xE7, 0xDF, 0xA2, /* 0xB4-0xB7 */ + 0xDF, 0xA3, 0x93, 0xE8, 0xDF, 0xA5, 0x93, 0xE9, /* 0xB8-0xBB */ + 0xBA, 0xB3, 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, /* 0xBC-0xBF */ + 0xDF, 0xA6, 0x93, 0xED, 0xC0, 0xDE, 0x93, 0xEE, /* 0xC0-0xC3 */ + 0x93, 0xEF, 0xC9, 0xC3, 0x93, 0xF0, 0x93, 0xF1, /* 0xC4-0xC7 */ + 0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xC8-0xCB */ + 0x93, 0xF6, 0xB2, 0xD9, 0xC7, 0xE6, 0x93, 0xF7, /* 0xCC-0xCF */ + 0xDF, 0xA7, 0x93, 0xF8, 0xC7, 0xDC, 0x93, 0xF9, /* 0xD0-0xD3 */ + 0x93, 0xFA, 0x93, 0xFB, 0x93, 0xFC, 0xDF, 0xA8, /* 0xD4-0xD7 */ + 0xEB, 0xA2, 0x93, 0xFD, 0x93, 0xFE, 0x94, 0x40, /* 0xD8-0xDB */ + 0x94, 0x41, 0x94, 0x42, 0xCB, 0xD3, 0x94, 0x43, /* 0xDC-0xDF */ + 0x94, 0x44, 0x94, 0x45, 0xDF, 0xAA, 0x94, 0x46, /* 0xE0-0xE3 */ + 0xDF, 0xA9, 0x94, 0x47, 0xB2, 0xC1, 0x94, 0x48, /* 0xE4-0xE7 */ + 0x94, 0x49, 0x94, 0x4A, 0x94, 0x4B, 0x94, 0x4C, /* 0xE8-0xEB */ + 0x94, 0x4D, 0x94, 0x4E, 0x94, 0x4F, 0x94, 0x50, /* 0xEC-0xEF */ + 0x94, 0x51, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0xF0-0xF3 */ + 0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0xF4-0xF7 */ + 0x94, 0x59, 0x94, 0x5A, 0x94, 0x5B, 0x94, 0x5C, /* 0xF8-0xFB */ + 0x94, 0x5D, 0x94, 0x5E, 0x94, 0x5F, 0x94, 0x60, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xC5, 0xCA, 0x94, 0x61, 0x94, 0x62, 0x94, 0x63, /* 0x00-0x03 */ + 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, 0x94, 0x67, /* 0x04-0x07 */ + 0x94, 0x68, 0xDF, 0xAB, 0x94, 0x69, 0x94, 0x6A, /* 0x08-0x0B */ + 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, 0x94, 0x6E, /* 0x0C-0x0F */ + 0x94, 0x6F, 0x94, 0x70, 0xD4, 0xDC, 0x94, 0x71, /* 0x10-0x13 */ + 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, 0x94, 0x75, /* 0x14-0x17 */ + 0xC8, 0xC1, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x18-0x1B */ + 0x94, 0x79, 0x94, 0x7A, 0x94, 0x7B, 0x94, 0x7C, /* 0x1C-0x1F */ + 0x94, 0x7D, 0x94, 0x7E, 0x94, 0x80, 0x94, 0x81, /* 0x20-0x23 */ + 0x94, 0x82, 0xDF, 0xAC, 0x94, 0x83, 0x94, 0x84, /* 0x24-0x27 */ + 0x94, 0x85, 0x94, 0x86, 0x94, 0x87, 0xBE, 0xF0, /* 0x28-0x2B */ + 0x94, 0x88, 0x94, 0x89, 0xDF, 0xAD, 0xD6, 0xA7, /* 0x2C-0x2F */ + 0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x30-0x33 */ + 0xEA, 0xB7, 0xEB, 0xB6, 0xCA, 0xD5, 0x94, 0x8E, /* 0x34-0x37 */ + 0xD8, 0xFC, 0xB8, 0xC4, 0x94, 0x8F, 0xB9, 0xA5, /* 0x38-0x3B */ + 0x94, 0x90, 0x94, 0x91, 0xB7, 0xC5, 0xD5, 0xFE, /* 0x3C-0x3F */ + 0x94, 0x92, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x40-0x43 */ + 0x94, 0x96, 0xB9, 0xCA, 0x94, 0x97, 0x94, 0x98, /* 0x44-0x47 */ + 0xD0, 0xA7, 0xF4, 0xCD, 0x94, 0x99, 0x94, 0x9A, /* 0x48-0x4B */ + 0xB5, 0xD0, 0x94, 0x9B, 0x94, 0x9C, 0xC3, 0xF4, /* 0x4C-0x4F */ + 0x94, 0x9D, 0xBE, 0xC8, 0x94, 0x9E, 0x94, 0x9F, /* 0x50-0x53 */ + 0x94, 0xA0, 0xEB, 0xB7, 0xB0, 0xBD, 0x94, 0xA1, /* 0x54-0x57 */ + 0x94, 0xA2, 0xBD, 0xCC, 0x94, 0xA3, 0xC1, 0xB2, /* 0x58-0x5B */ + 0x94, 0xA4, 0xB1, 0xD6, 0xB3, 0xA8, 0x94, 0xA5, /* 0x5C-0x5F */ + 0x94, 
0xA6, 0x94, 0xA7, 0xB8, 0xD2, 0xC9, 0xA2, /* 0x60-0x63 */ + 0x94, 0xA8, 0x94, 0xA9, 0xB6, 0xD8, 0x94, 0xAA, /* 0x64-0x67 */ + 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, 0xEB, 0xB8, /* 0x68-0x6B */ + 0xBE, 0xB4, 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, /* 0x6C-0x6F */ + 0xCA, 0xFD, 0x94, 0xB1, 0xC7, 0xC3, 0x94, 0xB2, /* 0x70-0x73 */ + 0xD5, 0xFB, 0x94, 0xB3, 0x94, 0xB4, 0xB7, 0xF3, /* 0x74-0x77 */ + 0x94, 0xB5, 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, /* 0x78-0x7B */ + 0x94, 0xB9, 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, /* 0x7C-0x7F */ + + 0x94, 0xBD, 0x94, 0xBE, 0x94, 0xBF, 0x94, 0xC0, /* 0x80-0x83 */ + 0x94, 0xC1, 0x94, 0xC2, 0x94, 0xC3, 0xCE, 0xC4, /* 0x84-0x87 */ + 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, 0xD5, 0xAB, /* 0x88-0x8B */ + 0xB1, 0xF3, 0x94, 0xC7, 0x94, 0xC8, 0x94, 0xC9, /* 0x8C-0x8F */ + 0xEC, 0xB3, 0xB0, 0xDF, 0x94, 0xCA, 0xEC, 0xB5, /* 0x90-0x93 */ + 0x94, 0xCB, 0x94, 0xCC, 0x94, 0xCD, 0xB6, 0xB7, /* 0x94-0x97 */ + 0x94, 0xCE, 0xC1, 0xCF, 0x94, 0xCF, 0xF5, 0xFA, /* 0x98-0x9B */ + 0xD0, 0xB1, 0x94, 0xD0, 0x94, 0xD1, 0xD5, 0xE5, /* 0x9C-0x9F */ + 0x94, 0xD2, 0xCE, 0xD3, 0x94, 0xD3, 0x94, 0xD4, /* 0xA0-0xA3 */ + 0xBD, 0xEF, 0xB3, 0xE2, 0x94, 0xD5, 0xB8, 0xAB, /* 0xA4-0xA7 */ + 0x94, 0xD6, 0xD5, 0xB6, 0x94, 0xD7, 0xED, 0xBD, /* 0xA8-0xAB */ + 0x94, 0xD8, 0xB6, 0xCF, 0x94, 0xD9, 0xCB, 0xB9, /* 0xAC-0xAF */ + 0xD0, 0xC2, 0x94, 0xDA, 0x94, 0xDB, 0x94, 0xDC, /* 0xB0-0xB3 */ + 0x94, 0xDD, 0x94, 0xDE, 0x94, 0xDF, 0x94, 0xE0, /* 0xB4-0xB7 */ + 0x94, 0xE1, 0xB7, 0xBD, 0x94, 0xE2, 0x94, 0xE3, /* 0xB8-0xBB */ + 0xEC, 0xB6, 0xCA, 0xA9, 0x94, 0xE4, 0x94, 0xE5, /* 0xBC-0xBF */ + 0x94, 0xE6, 0xC5, 0xD4, 0x94, 0xE7, 0xEC, 0xB9, /* 0xC0-0xC3 */ + 0xEC, 0xB8, 0xC2, 0xC3, 0xEC, 0xB7, 0x94, 0xE8, /* 0xC4-0xC7 */ + 0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0xD0, 0xFD, /* 0xC8-0xCB */ + 0xEC, 0xBA, 0x94, 0xEC, 0xEC, 0xBB, 0xD7, 0xE5, /* 0xCC-0xCF */ + 0x94, 0xED, 0x94, 0xEE, 0xEC, 0xBC, 0x94, 0xEF, /* 0xD0-0xD3 */ + 0x94, 0xF0, 0x94, 0xF1, 0xEC, 0xBD, 0xC6, 0xEC, /* 0xD4-0xD7 */ + 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, 0x94, 0xF5, /* 0xD8-0xDB */ + 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, 0x94, 0xF9, /* 0xDC-0xDF */ + 0xCE, 0xDE, 0x94, 0xFA, 0xBC, 0xC8, 0x94, 0xFB, /* 0xE0-0xE3 */ + 0x94, 0xFC, 0xC8, 0xD5, 0xB5, 0xA9, 0xBE, 0xC9, /* 0xE4-0xE7 */ + 0xD6, 0xBC, 0xD4, 0xE7, 0x94, 0xFD, 0x94, 0xFE, /* 0xE8-0xEB */ + 0xD1, 0xAE, 0xD0, 0xF1, 0xEA, 0xB8, 0xEA, 0xB9, /* 0xEC-0xEF */ + 0xEA, 0xBA, 0xBA, 0xB5, 0x95, 0x40, 0x95, 0x41, /* 0xF0-0xF3 */ + 0x95, 0x42, 0x95, 0x43, 0xCA, 0xB1, 0xBF, 0xF5, /* 0xF4-0xF7 */ + 0x95, 0x44, 0x95, 0x45, 0xCD, 0xFA, 0x95, 0x46, /* 0xF8-0xFB */ + 0x95, 0x47, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0xEA, 0xC0, 0x95, 0x4B, 0xB0, 0xBA, 0xEA, 0xBE, /* 0x00-0x03 */ + 0x95, 0x4C, 0x95, 0x4D, 0xC0, 0xA5, 0x95, 0x4E, /* 0x04-0x07 */ + 0x95, 0x4F, 0x95, 0x50, 0xEA, 0xBB, 0x95, 0x51, /* 0x08-0x0B */ + 0xB2, 0xFD, 0x95, 0x52, 0xC3, 0xF7, 0xBB, 0xE8, /* 0x0C-0x0F */ + 0x95, 0x53, 0x95, 0x54, 0x95, 0x55, 0xD2, 0xD7, /* 0x10-0x13 */ + 0xCE, 0xF4, 0xEA, 0xBF, 0x95, 0x56, 0x95, 0x57, /* 0x14-0x17 */ + 0x95, 0x58, 0xEA, 0xBC, 0x95, 0x59, 0x95, 0x5A, /* 0x18-0x1B */ + 0x95, 0x5B, 0xEA, 0xC3, 0x95, 0x5C, 0xD0, 0xC7, /* 0x1C-0x1F */ + 0xD3, 0xB3, 0x95, 0x5D, 0x95, 0x5E, 0x95, 0x5F, /* 0x20-0x23 */ + 0x95, 0x60, 0xB4, 0xBA, 0x95, 0x61, 0xC3, 0xC1, /* 0x24-0x27 */ + 0xD7, 0xF2, 0x95, 0x62, 0x95, 0x63, 0x95, 0x64, /* 0x28-0x2B */ + 0x95, 0x65, 0xD5, 0xD1, 0x95, 0x66, 0xCA, 0xC7, /* 0x2C-0x2F */ + 0x95, 0x67, 0xEA, 0xC5, 0x95, 0x68, 0x95, 0x69, /* 0x30-0x33 */ + 0xEA, 
0xC4, 0xEA, 0xC7, 0xEA, 0xC6, 0x95, 0x6A, /* 0x34-0x37 */ + 0x95, 0x6B, 0x95, 0x6C, 0x95, 0x6D, 0x95, 0x6E, /* 0x38-0x3B */ + 0xD6, 0xE7, 0x95, 0x6F, 0xCF, 0xD4, 0x95, 0x70, /* 0x3C-0x3F */ + 0x95, 0x71, 0xEA, 0xCB, 0x95, 0x72, 0xBB, 0xCE, /* 0x40-0x43 */ + 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, 0x95, 0x76, /* 0x44-0x47 */ + 0x95, 0x77, 0x95, 0x78, 0x95, 0x79, 0xBD, 0xFA, /* 0x48-0x4B */ + 0xC9, 0xCE, 0x95, 0x7A, 0x95, 0x7B, 0xEA, 0xCC, /* 0x4C-0x4F */ + 0x95, 0x7C, 0x95, 0x7D, 0xC9, 0xB9, 0xCF, 0xFE, /* 0x50-0x53 */ + 0xEA, 0xCA, 0xD4, 0xCE, 0xEA, 0xCD, 0xEA, 0xCF, /* 0x54-0x57 */ + 0x95, 0x7E, 0x95, 0x80, 0xCD, 0xED, 0x95, 0x81, /* 0x58-0x5B */ + 0x95, 0x82, 0x95, 0x83, 0x95, 0x84, 0xEA, 0xC9, /* 0x5C-0x5F */ + 0x95, 0x85, 0xEA, 0xCE, 0x95, 0x86, 0x95, 0x87, /* 0x60-0x63 */ + 0xCE, 0xEE, 0x95, 0x88, 0xBB, 0xDE, 0x95, 0x89, /* 0x64-0x67 */ + 0xB3, 0xBF, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x68-0x6B */ + 0x95, 0x8D, 0x95, 0x8E, 0xC6, 0xD5, 0xBE, 0xB0, /* 0x6C-0x6F */ + 0xCE, 0xFA, 0x95, 0x8F, 0x95, 0x90, 0x95, 0x91, /* 0x70-0x73 */ + 0xC7, 0xE7, 0x95, 0x92, 0xBE, 0xA7, 0xEA, 0xD0, /* 0x74-0x77 */ + 0x95, 0x93, 0x95, 0x94, 0xD6, 0xC7, 0x95, 0x95, /* 0x78-0x7B */ + 0x95, 0x96, 0x95, 0x97, 0xC1, 0xC0, 0x95, 0x98, /* 0x7C-0x7F */ + + 0x95, 0x99, 0x95, 0x9A, 0xD4, 0xDD, 0x95, 0x9B, /* 0x80-0x83 */ + 0xEA, 0xD1, 0x95, 0x9C, 0x95, 0x9D, 0xCF, 0xBE, /* 0x84-0x87 */ + 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, 0x95, 0xA1, /* 0x88-0x8B */ + 0xEA, 0xD2, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x8C-0x8F */ + 0x95, 0xA5, 0xCA, 0xEE, 0x95, 0xA6, 0x95, 0xA7, /* 0x90-0x93 */ + 0x95, 0xA8, 0x95, 0xA9, 0xC5, 0xAF, 0xB0, 0xB5, /* 0x94-0x97 */ + 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, 0x95, 0xAD, /* 0x98-0x9B */ + 0x95, 0xAE, 0xEA, 0xD4, 0x95, 0xAF, 0x95, 0xB0, /* 0x9C-0x9F */ + 0x95, 0xB1, 0x95, 0xB2, 0x95, 0xB3, 0x95, 0xB4, /* 0xA0-0xA3 */ + 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, 0xEA, 0xD3, /* 0xA4-0xA7 */ + 0xF4, 0xDF, 0x95, 0xB8, 0x95, 0xB9, 0x95, 0xBA, /* 0xA8-0xAB */ + 0x95, 0xBB, 0x95, 0xBC, 0xC4, 0xBA, 0x95, 0xBD, /* 0xAC-0xAF */ + 0x95, 0xBE, 0x95, 0xBF, 0x95, 0xC0, 0x95, 0xC1, /* 0xB0-0xB3 */ + 0xB1, 0xA9, 0x95, 0xC2, 0x95, 0xC3, 0x95, 0xC4, /* 0xB4-0xB7 */ + 0x95, 0xC5, 0xE5, 0xDF, 0x95, 0xC6, 0x95, 0xC7, /* 0xB8-0xBB */ + 0x95, 0xC8, 0x95, 0xC9, 0xEA, 0xD5, 0x95, 0xCA, /* 0xBC-0xBF */ + 0x95, 0xCB, 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, /* 0xC0-0xC3 */ + 0x95, 0xCF, 0x95, 0xD0, 0x95, 0xD1, 0x95, 0xD2, /* 0xC4-0xC7 */ + 0x95, 0xD3, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0xC8-0xCB */ + 0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0xCC-0xCF */ + 0x95, 0xDB, 0x95, 0xDC, 0x95, 0xDD, 0x95, 0xDE, /* 0xD0-0xD3 */ + 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, 0x95, 0xE2, /* 0xD4-0xD7 */ + 0x95, 0xE3, 0xCA, 0xEF, 0x95, 0xE4, 0xEA, 0xD6, /* 0xD8-0xDB */ + 0xEA, 0xD7, 0xC6, 0xD8, 0x95, 0xE5, 0x95, 0xE6, /* 0xDC-0xDF */ + 0x95, 0xE7, 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, /* 0xE0-0xE3 */ + 0x95, 0xEB, 0x95, 0xEC, 0xEA, 0xD8, 0x95, 0xED, /* 0xE4-0xE7 */ + 0x95, 0xEE, 0xEA, 0xD9, 0x95, 0xEF, 0x95, 0xF0, /* 0xE8-0xEB */ + 0x95, 0xF1, 0x95, 0xF2, 0x95, 0xF3, 0x95, 0xF4, /* 0xEC-0xEF */ + 0xD4, 0xBB, 0x95, 0xF5, 0xC7, 0xFA, 0xD2, 0xB7, /* 0xF0-0xF3 */ + 0xB8, 0xFC, 0x95, 0xF6, 0x95, 0xF7, 0xEA, 0xC2, /* 0xF4-0xF7 */ + 0x95, 0xF8, 0xB2, 0xDC, 0x95, 0xF9, 0x95, 0xFA, /* 0xF8-0xFB */ + 0xC2, 0xFC, 0x95, 0xFB, 0xD4, 0xF8, 0xCC, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xD7, 0xEE, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0x00-0x03 */ + 0x96, 0x40, 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, /* 0x04-0x07 */ + 0xD4, 
0xC2, 0xD3, 0xD0, 0xEB, 0xC3, 0xC5, 0xF3, /* 0x08-0x0B */ + 0x96, 0x44, 0xB7, 0xFE, 0x96, 0x45, 0x96, 0x46, /* 0x0C-0x0F */ + 0xEB, 0xD4, 0x96, 0x47, 0x96, 0x48, 0x96, 0x49, /* 0x10-0x13 */ + 0xCB, 0xB7, 0xEB, 0xDE, 0x96, 0x4A, 0xC0, 0xCA, /* 0x14-0x17 */ + 0x96, 0x4B, 0x96, 0x4C, 0x96, 0x4D, 0xCD, 0xFB, /* 0x18-0x1B */ + 0x96, 0x4E, 0xB3, 0xAF, 0x96, 0x4F, 0xC6, 0xDA, /* 0x1C-0x1F */ + 0x96, 0x50, 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, /* 0x20-0x23 */ + 0x96, 0x54, 0x96, 0x55, 0xEB, 0xFC, 0x96, 0x56, /* 0x24-0x27 */ + 0xC4, 0xBE, 0x96, 0x57, 0xCE, 0xB4, 0xC4, 0xA9, /* 0x28-0x2B */ + 0xB1, 0xBE, 0xD4, 0xFD, 0x96, 0x58, 0xCA, 0xF5, /* 0x2C-0x2F */ + 0x96, 0x59, 0xD6, 0xEC, 0x96, 0x5A, 0x96, 0x5B, /* 0x30-0x33 */ + 0xC6, 0xD3, 0xB6, 0xE4, 0x96, 0x5C, 0x96, 0x5D, /* 0x34-0x37 */ + 0x96, 0x5E, 0x96, 0x5F, 0xBB, 0xFA, 0x96, 0x60, /* 0x38-0x3B */ + 0x96, 0x61, 0xD0, 0xE0, 0x96, 0x62, 0x96, 0x63, /* 0x3C-0x3F */ + 0xC9, 0xB1, 0x96, 0x64, 0xD4, 0xD3, 0xC8, 0xA8, /* 0x40-0x43 */ + 0x96, 0x65, 0x96, 0x66, 0xB8, 0xCB, 0x96, 0x67, /* 0x44-0x47 */ + 0xE8, 0xBE, 0xC9, 0xBC, 0x96, 0x68, 0x96, 0x69, /* 0x48-0x4B */ + 0xE8, 0xBB, 0x96, 0x6A, 0xC0, 0xEE, 0xD0, 0xD3, /* 0x4C-0x4F */ + 0xB2, 0xC4, 0xB4, 0xE5, 0x96, 0x6B, 0xE8, 0xBC, /* 0x50-0x53 */ + 0x96, 0x6C, 0x96, 0x6D, 0xD5, 0xC8, 0x96, 0x6E, /* 0x54-0x57 */ + 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, 0x96, 0x72, /* 0x58-0x5B */ + 0xB6, 0xC5, 0x96, 0x73, 0xE8, 0xBD, 0xCA, 0xF8, /* 0x5C-0x5F */ + 0xB8, 0xDC, 0xCC, 0xF5, 0x96, 0x74, 0x96, 0x75, /* 0x60-0x63 */ + 0x96, 0x76, 0xC0, 0xB4, 0x96, 0x77, 0x96, 0x78, /* 0x64-0x67 */ + 0xD1, 0xEE, 0xE8, 0xBF, 0xE8, 0xC2, 0x96, 0x79, /* 0x68-0x6B */ + 0x96, 0x7A, 0xBA, 0xBC, 0x96, 0x7B, 0xB1, 0xAD, /* 0x6C-0x6F */ + 0xBD, 0xDC, 0x96, 0x7C, 0xEA, 0xBD, 0xE8, 0xC3, /* 0x70-0x73 */ + 0x96, 0x7D, 0xE8, 0xC6, 0x96, 0x7E, 0xE8, 0xCB, /* 0x74-0x77 */ + 0x96, 0x80, 0x96, 0x81, 0x96, 0x82, 0x96, 0x83, /* 0x78-0x7B */ + 0xE8, 0xCC, 0x96, 0x84, 0xCB, 0xC9, 0xB0, 0xE5, /* 0x7C-0x7F */ + + 0x96, 0x85, 0xBC, 0xAB, 0x96, 0x86, 0x96, 0x87, /* 0x80-0x83 */ + 0xB9, 0xB9, 0x96, 0x88, 0x96, 0x89, 0xE8, 0xC1, /* 0x84-0x87 */ + 0x96, 0x8A, 0xCD, 0xF7, 0x96, 0x8B, 0xE8, 0xCA, /* 0x88-0x8B */ + 0x96, 0x8C, 0x96, 0x8D, 0x96, 0x8E, 0x96, 0x8F, /* 0x8C-0x8F */ + 0xCE, 0xF6, 0x96, 0x90, 0x96, 0x91, 0x96, 0x92, /* 0x90-0x93 */ + 0x96, 0x93, 0xD5, 0xED, 0x96, 0x94, 0xC1, 0xD6, /* 0x94-0x97 */ + 0xE8, 0xC4, 0x96, 0x95, 0xC3, 0xB6, 0x96, 0x96, /* 0x98-0x9B */ + 0xB9, 0xFB, 0xD6, 0xA6, 0xE8, 0xC8, 0x96, 0x97, /* 0x9C-0x9F */ + 0x96, 0x98, 0x96, 0x99, 0xCA, 0xE0, 0xD4, 0xE6, /* 0xA0-0xA3 */ + 0x96, 0x9A, 0xE8, 0xC0, 0x96, 0x9B, 0xE8, 0xC5, /* 0xA4-0xA7 */ + 0xE8, 0xC7, 0x96, 0x9C, 0xC7, 0xB9, 0xB7, 0xE3, /* 0xA8-0xAB */ + 0x96, 0x9D, 0xE8, 0xC9, 0x96, 0x9E, 0xBF, 0xDD, /* 0xAC-0xAF */ + 0xE8, 0xD2, 0x96, 0x9F, 0x96, 0xA0, 0xE8, 0xD7, /* 0xB0-0xB3 */ + 0x96, 0xA1, 0xE8, 0xD5, 0xBC, 0xDC, 0xBC, 0xCF, /* 0xB4-0xB7 */ + 0xE8, 0xDB, 0x96, 0xA2, 0x96, 0xA3, 0x96, 0xA4, /* 0xB8-0xBB */ + 0x96, 0xA5, 0x96, 0xA6, 0x96, 0xA7, 0x96, 0xA8, /* 0xBC-0xBF */ + 0x96, 0xA9, 0xE8, 0xDE, 0x96, 0xAA, 0xE8, 0xDA, /* 0xC0-0xC3 */ + 0xB1, 0xFA, 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, /* 0xC4-0xC7 */ + 0x96, 0xAE, 0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, /* 0xC8-0xCB */ + 0x96, 0xB2, 0x96, 0xB3, 0x96, 0xB4, 0xB0, 0xD8, /* 0xCC-0xCF */ + 0xC4, 0xB3, 0xB8, 0xCC, 0xC6, 0xE2, 0xC8, 0xBE, /* 0xD0-0xD3 */ + 0xC8, 0xE1, 0x96, 0xB5, 0x96, 0xB6, 0x96, 0xB7, /* 0xD4-0xD7 */ + 0xE8, 0xCF, 0xE8, 0xD4, 0xE8, 0xD6, 0x96, 0xB8, /* 0xD8-0xDB */ + 0xB9, 0xF1, 0xE8, 0xD8, 0xD7, 0xF5, 0x96, 0xB9, /* 0xDC-0xDF 
*/ + 0xC4, 0xFB, 0x96, 0xBA, 0xE8, 0xDC, 0x96, 0xBB, /* 0xE0-0xE3 */ + 0x96, 0xBC, 0xB2, 0xE9, 0x96, 0xBD, 0x96, 0xBE, /* 0xE4-0xE7 */ + 0x96, 0xBF, 0xE8, 0xD1, 0x96, 0xC0, 0x96, 0xC1, /* 0xE8-0xEB */ + 0xBC, 0xED, 0x96, 0xC2, 0x96, 0xC3, 0xBF, 0xC2, /* 0xEC-0xEF */ + 0xE8, 0xCD, 0xD6, 0xF9, 0x96, 0xC4, 0xC1, 0xF8, /* 0xF0-0xF3 */ + 0xB2, 0xF1, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0xF4-0xF7 */ + 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, 0x96, 0xCB, /* 0xF8-0xFB */ + 0x96, 0xCC, 0xE8, 0xDF, 0x96, 0xCD, 0xCA, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0xE8, 0xD9, 0x96, 0xCE, 0x96, 0xCF, 0x96, 0xD0, /* 0x00-0x03 */ + 0x96, 0xD1, 0xD5, 0xA4, 0x96, 0xD2, 0xB1, 0xEA, /* 0x04-0x07 */ + 0xD5, 0xBB, 0xE8, 0xCE, 0xE8, 0xD0, 0xB6, 0xB0, /* 0x08-0x0B */ + 0xE8, 0xD3, 0x96, 0xD3, 0xE8, 0xDD, 0xC0, 0xB8, /* 0x0C-0x0F */ + 0x96, 0xD4, 0xCA, 0xF7, 0x96, 0xD5, 0xCB, 0xA8, /* 0x10-0x13 */ + 0x96, 0xD6, 0x96, 0xD7, 0xC6, 0xDC, 0xC0, 0xF5, /* 0x14-0x17 */ + 0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x18-0x1B */ + 0x96, 0xDC, 0xE8, 0xE9, 0x96, 0xDD, 0x96, 0xDE, /* 0x1C-0x1F */ + 0x96, 0xDF, 0xD0, 0xA3, 0x96, 0xE0, 0x96, 0xE1, /* 0x20-0x23 */ + 0x96, 0xE2, 0x96, 0xE3, 0x96, 0xE4, 0x96, 0xE5, /* 0x24-0x27 */ + 0x96, 0xE6, 0xE8, 0xF2, 0xD6, 0xEA, 0x96, 0xE7, /* 0x28-0x2B */ + 0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x2C-0x2F */ + 0x96, 0xEC, 0x96, 0xED, 0xE8, 0xE0, 0xE8, 0xE1, /* 0x30-0x33 */ + 0x96, 0xEE, 0x96, 0xEF, 0x96, 0xF0, 0xD1, 0xF9, /* 0x34-0x37 */ + 0xBA, 0xCB, 0xB8, 0xF9, 0x96, 0xF1, 0x96, 0xF2, /* 0x38-0x3B */ + 0xB8, 0xF1, 0xD4, 0xD4, 0xE8, 0xEF, 0x96, 0xF3, /* 0x3C-0x3F */ + 0xE8, 0xEE, 0xE8, 0xEC, 0xB9, 0xF0, 0xCC, 0xD2, /* 0x40-0x43 */ + 0xE8, 0xE6, 0xCE, 0xA6, 0xBF, 0xF2, 0x96, 0xF4, /* 0x44-0x47 */ + 0xB0, 0xB8, 0xE8, 0xF1, 0xE8, 0xF0, 0x96, 0xF5, /* 0x48-0x4B */ + 0xD7, 0xC0, 0x96, 0xF6, 0xE8, 0xE4, 0x96, 0xF7, /* 0x4C-0x4F */ + 0xCD, 0xA9, 0xC9, 0xA3, 0x96, 0xF8, 0xBB, 0xB8, /* 0x50-0x53 */ + 0xBD, 0xDB, 0xE8, 0xEA, 0x96, 0xF9, 0x96, 0xFA, /* 0x54-0x57 */ + 0x96, 0xFB, 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, /* 0x58-0x5B */ + 0x97, 0x40, 0x97, 0x41, 0x97, 0x42, 0x97, 0x43, /* 0x5C-0x5F */ + 0xE8, 0xE2, 0xE8, 0xE3, 0xE8, 0xE5, 0xB5, 0xB5, /* 0x60-0x63 */ + 0xE8, 0xE7, 0xC7, 0xC5, 0xE8, 0xEB, 0xE8, 0xED, /* 0x64-0x67 */ + 0xBD, 0xB0, 0xD7, 0xAE, 0x97, 0x44, 0xE8, 0xF8, /* 0x68-0x6B */ + 0x97, 0x45, 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, /* 0x6C-0x6F */ + 0x97, 0x49, 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, /* 0x70-0x73 */ + 0xE8, 0xF5, 0x97, 0x4D, 0xCD, 0xB0, 0xE8, 0xF6, /* 0x74-0x77 */ + 0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x78-0x7B */ + 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, 0x97, 0x55, /* 0x7C-0x7F */ + + 0x97, 0x56, 0xC1, 0xBA, 0x97, 0x57, 0xE8, 0xE8, /* 0x80-0x83 */ + 0x97, 0x58, 0xC3, 0xB7, 0xB0, 0xF0, 0x97, 0x59, /* 0x84-0x87 */ + 0x97, 0x5A, 0x97, 0x5B, 0x97, 0x5C, 0x97, 0x5D, /* 0x88-0x8B */ + 0x97, 0x5E, 0x97, 0x5F, 0x97, 0x60, 0xE8, 0xF4, /* 0x8C-0x8F */ + 0x97, 0x61, 0x97, 0x62, 0x97, 0x63, 0xE8, 0xF7, /* 0x90-0x93 */ + 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, 0xB9, 0xA3, /* 0x94-0x97 */ + 0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0x98-0x9B */ + 0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0x9C-0x9F */ + 0x97, 0x6F, 0x97, 0x70, 0xC9, 0xD2, 0x97, 0x71, /* 0xA0-0xA3 */ + 0x97, 0x72, 0x97, 0x73, 0xC3, 0xCE, 0xCE, 0xE0, /* 0xA4-0xA7 */ + 0xC0, 0xE6, 0x97, 0x74, 0x97, 0x75, 0x97, 0x76, /* 0xA8-0xAB */ + 0x97, 0x77, 0xCB, 0xF3, 0x97, 0x78, 0xCC, 0xDD, /* 0xAC-0xAF */ + 0xD0, 0xB5, 0x97, 0x79, 0x97, 0x7A, 0xCA, 0xE1, /* 0xB0-0xB3 */ + 
0x97, 0x7B, 0xE8, 0xF3, 0x97, 0x7C, 0x97, 0x7D, /* 0xB4-0xB7 */ + 0x97, 0x7E, 0x97, 0x80, 0x97, 0x81, 0x97, 0x82, /* 0xB8-0xBB */ + 0x97, 0x83, 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, /* 0xBC-0xBF */ + 0xBC, 0xEC, 0x97, 0x87, 0xE8, 0xF9, 0x97, 0x88, /* 0xC0-0xC3 */ + 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, 0x97, 0x8C, /* 0xC4-0xC7 */ + 0x97, 0x8D, 0xC3, 0xDE, 0x97, 0x8E, 0xC6, 0xE5, /* 0xC8-0xCB */ + 0x97, 0x8F, 0xB9, 0xF7, 0x97, 0x90, 0x97, 0x91, /* 0xCC-0xCF */ + 0x97, 0x92, 0x97, 0x93, 0xB0, 0xF4, 0x97, 0x94, /* 0xD0-0xD3 */ + 0x97, 0x95, 0xD7, 0xD8, 0x97, 0x96, 0x97, 0x97, /* 0xD4-0xD7 */ + 0xBC, 0xAC, 0x97, 0x98, 0xC5, 0xEF, 0x97, 0x99, /* 0xD8-0xDB */ + 0x97, 0x9A, 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, /* 0xDC-0xDF */ + 0xCC, 0xC4, 0x97, 0x9E, 0x97, 0x9F, 0xE9, 0xA6, /* 0xE0-0xE3 */ + 0x97, 0xA0, 0x97, 0xA1, 0x97, 0xA2, 0x97, 0xA3, /* 0xE4-0xE7 */ + 0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE8-0xEB */ + 0x97, 0xA8, 0x97, 0xA9, 0xC9, 0xAD, 0x97, 0xAA, /* 0xEC-0xEF */ + 0xE9, 0xA2, 0xC0, 0xE2, 0x97, 0xAB, 0x97, 0xAC, /* 0xF0-0xF3 */ + 0x97, 0xAD, 0xBF, 0xC3, 0x97, 0xAE, 0x97, 0xAF, /* 0xF4-0xF7 */ + 0x97, 0xB0, 0xE8, 0xFE, 0xB9, 0xD7, 0x97, 0xB1, /* 0xF8-0xFB */ + 0xE8, 0xFB, 0x97, 0xB2, 0x97, 0xB3, 0x97, 0xB4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_69[512] = { + 0x97, 0xB5, 0xE9, 0xA4, 0x97, 0xB6, 0x97, 0xB7, /* 0x00-0x03 */ + 0x97, 0xB8, 0xD2, 0xCE, 0x97, 0xB9, 0x97, 0xBA, /* 0x04-0x07 */ + 0x97, 0xBB, 0x97, 0xBC, 0x97, 0xBD, 0xE9, 0xA3, /* 0x08-0x0B */ + 0x97, 0xBE, 0xD6, 0xB2, 0xD7, 0xB5, 0x97, 0xBF, /* 0x0C-0x0F */ + 0xE9, 0xA7, 0x97, 0xC0, 0xBD, 0xB7, 0x97, 0xC1, /* 0x10-0x13 */ + 0x97, 0xC2, 0x97, 0xC3, 0x97, 0xC4, 0x97, 0xC5, /* 0x14-0x17 */ + 0x97, 0xC6, 0x97, 0xC7, 0x97, 0xC8, 0x97, 0xC9, /* 0x18-0x1B */ + 0x97, 0xCA, 0x97, 0xCB, 0x97, 0xCC, 0xE8, 0xFC, /* 0x1C-0x1F */ + 0xE8, 0xFD, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x20-0x23 */ + 0xE9, 0xA1, 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, /* 0x24-0x27 */ + 0x97, 0xD3, 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, /* 0x28-0x2B */ + 0x97, 0xD7, 0xCD, 0xD6, 0x97, 0xD8, 0x97, 0xD9, /* 0x2C-0x2F */ + 0xD2, 0xAC, 0x97, 0xDA, 0x97, 0xDB, 0x97, 0xDC, /* 0x30-0x33 */ + 0xE9, 0xB2, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x34-0x37 */ + 0x97, 0xE0, 0xE9, 0xA9, 0x97, 0xE1, 0x97, 0xE2, /* 0x38-0x3B */ + 0x97, 0xE3, 0xB4, 0xAA, 0x97, 0xE4, 0xB4, 0xBB, /* 0x3C-0x3F */ + 0x97, 0xE5, 0x97, 0xE6, 0xE9, 0xAB, 0x97, 0xE7, /* 0x40-0x43 */ + 0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x44-0x47 */ + 0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x48-0x4B */ + 0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x4C-0x4F */ + 0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x50-0x53 */ + 0xD0, 0xA8, 0x97, 0xF8, 0x97, 0xF9, 0xE9, 0xA5, /* 0x54-0x57 */ + 0x97, 0xFA, 0x97, 0xFB, 0xB3, 0xFE, 0x97, 0xFC, /* 0x58-0x5B */ + 0x97, 0xFD, 0xE9, 0xAC, 0xC0, 0xE3, 0x97, 0xFE, /* 0x5C-0x5F */ + 0xE9, 0xAA, 0x98, 0x40, 0x98, 0x41, 0xE9, 0xB9, /* 0x60-0x63 */ + 0x98, 0x42, 0x98, 0x43, 0xE9, 0xB8, 0x98, 0x44, /* 0x64-0x67 */ + 0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0xE9, 0xAE, /* 0x68-0x6B */ + 0x98, 0x48, 0x98, 0x49, 0xE8, 0xFA, 0x98, 0x4A, /* 0x6C-0x6F */ + 0x98, 0x4B, 0xE9, 0xA8, 0x98, 0x4C, 0x98, 0x4D, /* 0x70-0x73 */ + 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, 0xBF, 0xAC, /* 0x74-0x77 */ + 0xE9, 0xB1, 0xE9, 0xBA, 0x98, 0x51, 0x98, 0x52, /* 0x78-0x7B */ + 0xC2, 0xA5, 0x98, 0x53, 0x98, 0x54, 0x98, 0x55, /* 0x7C-0x7F */ + + 0xE9, 0xAF, 0x98, 0x56, 0xB8, 0xC5, 0x98, 0x57, /* 0x80-0x83 */ + 0xE9, 0xAD, 0x98, 0x58, 0xD3, 0xDC, 0xE9, 0xB4, /* 0x84-0x87 */ + 
0xE9, 0xB5, 0xE9, 0xB7, 0x98, 0x59, 0x98, 0x5A, /* 0x88-0x8B */ + 0x98, 0x5B, 0xE9, 0xC7, 0x98, 0x5C, 0x98, 0x5D, /* 0x8C-0x8F */ + 0x98, 0x5E, 0x98, 0x5F, 0x98, 0x60, 0x98, 0x61, /* 0x90-0x93 */ + 0xC0, 0xC6, 0xE9, 0xC5, 0x98, 0x62, 0x98, 0x63, /* 0x94-0x97 */ + 0xE9, 0xB0, 0x98, 0x64, 0x98, 0x65, 0xE9, 0xBB, /* 0x98-0x9B */ + 0xB0, 0xF1, 0x98, 0x66, 0x98, 0x67, 0x98, 0x68, /* 0x9C-0x9F */ + 0x98, 0x69, 0x98, 0x6A, 0x98, 0x6B, 0x98, 0x6C, /* 0xA0-0xA3 */ + 0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0xE9, 0xBC, /* 0xA4-0xA7 */ + 0xD5, 0xA5, 0x98, 0x70, 0x98, 0x71, 0xE9, 0xBE, /* 0xA8-0xAB */ + 0x98, 0x72, 0xE9, 0xBF, 0x98, 0x73, 0x98, 0x74, /* 0xAC-0xAF */ + 0x98, 0x75, 0xE9, 0xC1, 0x98, 0x76, 0x98, 0x77, /* 0xB0-0xB3 */ + 0xC1, 0xF1, 0x98, 0x78, 0x98, 0x79, 0xC8, 0xB6, /* 0xB4-0xB7 */ + 0x98, 0x7A, 0x98, 0x7B, 0x98, 0x7C, 0xE9, 0xBD, /* 0xB8-0xBB */ + 0x98, 0x7D, 0x98, 0x7E, 0x98, 0x80, 0x98, 0x81, /* 0xBC-0xBF */ + 0x98, 0x82, 0xE9, 0xC2, 0x98, 0x83, 0x98, 0x84, /* 0xC0-0xC3 */ + 0x98, 0x85, 0x98, 0x86, 0x98, 0x87, 0x98, 0x88, /* 0xC4-0xC7 */ + 0x98, 0x89, 0x98, 0x8A, 0xE9, 0xC3, 0x98, 0x8B, /* 0xC8-0xCB */ + 0xE9, 0xB3, 0x98, 0x8C, 0xE9, 0xB6, 0x98, 0x8D, /* 0xCC-0xCF */ + 0xBB, 0xB1, 0x98, 0x8E, 0x98, 0x8F, 0x98, 0x90, /* 0xD0-0xD3 */ + 0xE9, 0xC0, 0x98, 0x91, 0x98, 0x92, 0x98, 0x93, /* 0xD4-0xD7 */ + 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, 0xBC, 0xF7, /* 0xD8-0xDB */ + 0x98, 0x97, 0x98, 0x98, 0x98, 0x99, 0xE9, 0xC4, /* 0xDC-0xDF */ + 0xE9, 0xC6, 0x98, 0x9A, 0x98, 0x9B, 0x98, 0x9C, /* 0xE0-0xE3 */ + 0x98, 0x9D, 0x98, 0x9E, 0x98, 0x9F, 0x98, 0xA0, /* 0xE4-0xE7 */ + 0x98, 0xA1, 0x98, 0xA2, 0x98, 0xA3, 0x98, 0xA4, /* 0xE8-0xEB */ + 0x98, 0xA5, 0xE9, 0xCA, 0x98, 0xA6, 0x98, 0xA7, /* 0xEC-0xEF */ + 0x98, 0xA8, 0x98, 0xA9, 0xE9, 0xCE, 0x98, 0xAA, /* 0xF0-0xF3 */ + 0x98, 0xAB, 0x98, 0xAC, 0x98, 0xAD, 0x98, 0xAE, /* 0xF4-0xF7 */ + 0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xF8-0xFB */ + 0x98, 0xB3, 0xB2, 0xDB, 0x98, 0xB4, 0xE9, 0xC8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x98, 0xB5, 0x98, 0xB6, 0x98, 0xB7, 0x98, 0xB8, /* 0x00-0x03 */ + 0x98, 0xB9, 0x98, 0xBA, 0x98, 0xBB, 0x98, 0xBC, /* 0x04-0x07 */ + 0x98, 0xBD, 0x98, 0xBE, 0xB7, 0xAE, 0x98, 0xBF, /* 0x08-0x0B */ + 0x98, 0xC0, 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, /* 0x0C-0x0F */ + 0x98, 0xC4, 0x98, 0xC5, 0x98, 0xC6, 0x98, 0xC7, /* 0x10-0x13 */ + 0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0xE9, 0xCB, /* 0x14-0x17 */ + 0xE9, 0xCC, 0x98, 0xCB, 0x98, 0xCC, 0x98, 0xCD, /* 0x18-0x1B */ + 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, 0xD5, 0xC1, /* 0x1C-0x1F */ + 0x98, 0xD1, 0xC4, 0xA3, 0x98, 0xD2, 0x98, 0xD3, /* 0x20-0x23 */ + 0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0x24-0x27 */ + 0xE9, 0xD8, 0x98, 0xD8, 0xBA, 0xE1, 0x98, 0xD9, /* 0x28-0x2B */ + 0x98, 0xDA, 0x98, 0xDB, 0x98, 0xDC, 0xE9, 0xC9, /* 0x2C-0x2F */ + 0x98, 0xDD, 0xD3, 0xA3, 0x98, 0xDE, 0x98, 0xDF, /* 0x30-0x33 */ + 0x98, 0xE0, 0xE9, 0xD4, 0x98, 0xE1, 0x98, 0xE2, /* 0x34-0x37 */ + 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, 0x98, 0xE6, /* 0x38-0x3B */ + 0x98, 0xE7, 0xE9, 0xD7, 0xE9, 0xD0, 0x98, 0xE8, /* 0x3C-0x3F */ + 0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x40-0x43 */ + 0xE9, 0xCF, 0x98, 0xED, 0x98, 0xEE, 0xC7, 0xC1, /* 0x44-0x47 */ + 0x98, 0xEF, 0x98, 0xF0, 0x98, 0xF1, 0x98, 0xF2, /* 0x48-0x4B */ + 0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x4C-0x4F */ + 0xE9, 0xD2, 0x98, 0xF7, 0x98, 0xF8, 0x98, 0xF9, /* 0x50-0x53 */ + 0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x54-0x57 */ + 0xE9, 0xD9, 0xB3, 0xC8, 0x98, 0xFE, 0xE9, 0xD3, /* 0x58-0x5B */ + 0x99, 
0x40, 0x99, 0x41, 0x99, 0x42, 0x99, 0x43, /* 0x5C-0x5F */ + 0x99, 0x44, 0xCF, 0xF0, 0x99, 0x45, 0x99, 0x46, /* 0x60-0x63 */ + 0x99, 0x47, 0xE9, 0xCD, 0x99, 0x48, 0x99, 0x49, /* 0x64-0x67 */ + 0x99, 0x4A, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x68-0x6B */ + 0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x6C-0x6F */ + 0x99, 0x52, 0xB3, 0xF7, 0x99, 0x53, 0x99, 0x54, /* 0x70-0x73 */ + 0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x74-0x77 */ + 0x99, 0x59, 0xE9, 0xD6, 0x99, 0x5A, 0x99, 0x5B, /* 0x78-0x7B */ + 0xE9, 0xDA, 0x99, 0x5C, 0x99, 0x5D, 0x99, 0x5E, /* 0x7C-0x7F */ + + 0xCC, 0xB4, 0x99, 0x5F, 0x99, 0x60, 0x99, 0x61, /* 0x80-0x83 */ + 0xCF, 0xAD, 0x99, 0x62, 0x99, 0x63, 0x99, 0x64, /* 0x84-0x87 */ + 0x99, 0x65, 0x99, 0x66, 0x99, 0x67, 0x99, 0x68, /* 0x88-0x8B */ + 0x99, 0x69, 0x99, 0x6A, 0xE9, 0xD5, 0x99, 0x6B, /* 0x8C-0x8F */ + 0xE9, 0xDC, 0xE9, 0xDB, 0x99, 0x6C, 0x99, 0x6D, /* 0x90-0x93 */ + 0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0xE9, 0xDE, /* 0x94-0x97 */ + 0x99, 0x71, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x98-0x9B */ + 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, 0x99, 0x78, /* 0x9C-0x9F */ + 0xE9, 0xD1, 0x99, 0x79, 0x99, 0x7A, 0x99, 0x7B, /* 0xA0-0xA3 */ + 0x99, 0x7C, 0x99, 0x7D, 0x99, 0x7E, 0x99, 0x80, /* 0xA4-0xA7 */ + 0x99, 0x81, 0xE9, 0xDD, 0x99, 0x82, 0xE9, 0xDF, /* 0xA8-0xAB */ + 0xC3, 0xCA, 0x99, 0x83, 0x99, 0x84, 0x99, 0x85, /* 0xAC-0xAF */ + 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, 0x99, 0x89, /* 0xB0-0xB3 */ + 0x99, 0x8A, 0x99, 0x8B, 0x99, 0x8C, 0x99, 0x8D, /* 0xB4-0xB7 */ + 0x99, 0x8E, 0x99, 0x8F, 0x99, 0x90, 0x99, 0x91, /* 0xB8-0xBB */ + 0x99, 0x92, 0x99, 0x93, 0x99, 0x94, 0x99, 0x95, /* 0xBC-0xBF */ + 0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0xC0-0xC3 */ + 0x99, 0x9A, 0x99, 0x9B, 0x99, 0x9C, 0x99, 0x9D, /* 0xC4-0xC7 */ + 0x99, 0x9E, 0x99, 0x9F, 0x99, 0xA0, 0x99, 0xA1, /* 0xC8-0xCB */ + 0x99, 0xA2, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xCC-0xCF */ + 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, 0x99, 0xA9, /* 0xD0-0xD3 */ + 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, 0x99, 0xAD, /* 0xD4-0xD7 */ + 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, 0x99, 0xB1, /* 0xD8-0xDB */ + 0x99, 0xB2, 0x99, 0xB3, 0x99, 0xB4, 0x99, 0xB5, /* 0xDC-0xDF */ + 0x99, 0xB6, 0x99, 0xB7, 0x99, 0xB8, 0x99, 0xB9, /* 0xE0-0xE3 */ + 0x99, 0xBA, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xE4-0xE7 */ + 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, 0x99, 0xC1, /* 0xE8-0xEB */ + 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, 0x99, 0xC5, /* 0xEC-0xEF */ + 0x99, 0xC6, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xF0-0xF3 */ + 0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xF4-0xF7 */ + 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, 0x99, 0xD1, /* 0xF8-0xFB */ + 0x99, 0xD2, 0x99, 0xD3, 0x99, 0xD4, 0x99, 0xD5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6B[512] = { + 0x99, 0xD6, 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, /* 0x00-0x03 */ + 0x99, 0xDA, 0x99, 0xDB, 0x99, 0xDC, 0x99, 0xDD, /* 0x04-0x07 */ + 0x99, 0xDE, 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE1, /* 0x08-0x0B */ + 0x99, 0xE2, 0x99, 0xE3, 0x99, 0xE4, 0x99, 0xE5, /* 0x0C-0x0F */ + 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, 0x99, 0xE9, /* 0x10-0x13 */ + 0x99, 0xEA, 0x99, 0xEB, 0x99, 0xEC, 0x99, 0xED, /* 0x14-0x17 */ + 0x99, 0xEE, 0x99, 0xEF, 0x99, 0xF0, 0x99, 0xF1, /* 0x18-0x1B */ + 0x99, 0xF2, 0x99, 0xF3, 0x99, 0xF4, 0x99, 0xF5, /* 0x1C-0x1F */ + 0xC7, 0xB7, 0xB4, 0xCE, 0xBB, 0xB6, 0xD0, 0xC0, /* 0x20-0x23 */ + 0xEC, 0xA3, 0x99, 0xF6, 0x99, 0xF7, 0xC5, 0xB7, /* 0x24-0x27 */ + 0x99, 0xF8, 0x99, 0xF9, 0x99, 0xFA, 0x99, 0xFB, /* 0x28-0x2B */ + 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, 0x9A, 0x40, /* 0x2C-0x2F */ + 0x9A, 
0x41, 0x9A, 0x42, 0xD3, 0xFB, 0x9A, 0x43, /* 0x30-0x33 */ + 0x9A, 0x44, 0x9A, 0x45, 0x9A, 0x46, 0xEC, 0xA4, /* 0x34-0x37 */ + 0x9A, 0x47, 0xEC, 0xA5, 0xC6, 0xDB, 0x9A, 0x48, /* 0x38-0x3B */ + 0x9A, 0x49, 0x9A, 0x4A, 0xBF, 0xEE, 0x9A, 0x4B, /* 0x3C-0x3F */ + 0x9A, 0x4C, 0x9A, 0x4D, 0x9A, 0x4E, 0xEC, 0xA6, /* 0x40-0x43 */ + 0x9A, 0x4F, 0x9A, 0x50, 0xEC, 0xA7, 0xD0, 0xAA, /* 0x44-0x47 */ + 0x9A, 0x51, 0xC7, 0xB8, 0x9A, 0x52, 0x9A, 0x53, /* 0x48-0x4B */ + 0xB8, 0xE8, 0x9A, 0x54, 0x9A, 0x55, 0x9A, 0x56, /* 0x4C-0x4F */ + 0x9A, 0x57, 0x9A, 0x58, 0x9A, 0x59, 0x9A, 0x5A, /* 0x50-0x53 */ + 0x9A, 0x5B, 0x9A, 0x5C, 0x9A, 0x5D, 0x9A, 0x5E, /* 0x54-0x57 */ + 0x9A, 0x5F, 0xEC, 0xA8, 0x9A, 0x60, 0x9A, 0x61, /* 0x58-0x5B */ + 0x9A, 0x62, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x5C-0x5F */ + 0x9A, 0x66, 0x9A, 0x67, 0xD6, 0xB9, 0xD5, 0xFD, /* 0x60-0x63 */ + 0xB4, 0xCB, 0xB2, 0xBD, 0xCE, 0xE4, 0xC6, 0xE7, /* 0x64-0x67 */ + 0x9A, 0x68, 0x9A, 0x69, 0xCD, 0xE1, 0x9A, 0x6A, /* 0x68-0x6B */ + 0x9A, 0x6B, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x6C-0x6F */ + 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, 0x9A, 0x72, /* 0x70-0x73 */ + 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, 0x9A, 0x76, /* 0x74-0x77 */ + 0x9A, 0x77, 0xB4, 0xF5, 0x9A, 0x78, 0xCB, 0xC0, /* 0x78-0x7B */ + 0xBC, 0xDF, 0x9A, 0x79, 0x9A, 0x7A, 0x9A, 0x7B, /* 0x7C-0x7F */ + + 0x9A, 0x7C, 0xE9, 0xE2, 0xE9, 0xE3, 0xD1, 0xEA, /* 0x80-0x83 */ + 0xE9, 0xE5, 0x9A, 0x7D, 0xB4, 0xF9, 0xE9, 0xE4, /* 0x84-0x87 */ + 0x9A, 0x7E, 0xD1, 0xB3, 0xCA, 0xE2, 0xB2, 0xD0, /* 0x88-0x8B */ + 0x9A, 0x80, 0xE9, 0xE8, 0x9A, 0x81, 0x9A, 0x82, /* 0x8C-0x8F */ + 0x9A, 0x83, 0x9A, 0x84, 0xE9, 0xE6, 0xE9, 0xE7, /* 0x90-0x93 */ + 0x9A, 0x85, 0x9A, 0x86, 0xD6, 0xB3, 0x9A, 0x87, /* 0x94-0x97 */ + 0x9A, 0x88, 0x9A, 0x89, 0xE9, 0xE9, 0xE9, 0xEA, /* 0x98-0x9B */ + 0x9A, 0x8A, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x9C-0x9F */ + 0x9A, 0x8E, 0xE9, 0xEB, 0x9A, 0x8F, 0x9A, 0x90, /* 0xA0-0xA3 */ + 0x9A, 0x91, 0x9A, 0x92, 0x9A, 0x93, 0x9A, 0x94, /* 0xA4-0xA7 */ + 0x9A, 0x95, 0x9A, 0x96, 0xE9, 0xEC, 0x9A, 0x97, /* 0xA8-0xAB */ + 0x9A, 0x98, 0x9A, 0x99, 0x9A, 0x9A, 0x9A, 0x9B, /* 0xAC-0xAF */ + 0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0xEC, 0xAF, /* 0xB0-0xB3 */ + 0xC5, 0xB9, 0xB6, 0xCE, 0x9A, 0x9F, 0xD2, 0xF3, /* 0xB4-0xB7 */ + 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, 0x9A, 0xA3, /* 0xB8-0xBB */ + 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, 0xB5, 0xEE, /* 0xBC-0xBF */ + 0x9A, 0xA7, 0xBB, 0xD9, 0xEC, 0xB1, 0x9A, 0xA8, /* 0xC0-0xC3 */ + 0x9A, 0xA9, 0xD2, 0xE3, 0x9A, 0xAA, 0x9A, 0xAB, /* 0xC4-0xC7 */ + 0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0xCE, 0xE3, /* 0xC8-0xCB */ + 0x9A, 0xAF, 0xC4, 0xB8, 0x9A, 0xB0, 0xC3, 0xBF, /* 0xCC-0xCF */ + 0x9A, 0xB1, 0x9A, 0xB2, 0xB6, 0xBE, 0xD8, 0xB9, /* 0xD0-0xD3 */ + 0xB1, 0xC8, 0xB1, 0xCF, 0xB1, 0xD1, 0xC5, 0xFE, /* 0xD4-0xD7 */ + 0x9A, 0xB3, 0xB1, 0xD0, 0x9A, 0xB4, 0xC3, 0xAB, /* 0xD8-0xDB */ + 0x9A, 0xB5, 0x9A, 0xB6, 0x9A, 0xB7, 0x9A, 0xB8, /* 0xDC-0xDF */ + 0x9A, 0xB9, 0xD5, 0xB1, 0x9A, 0xBA, 0x9A, 0xBB, /* 0xE0-0xE3 */ + 0x9A, 0xBC, 0x9A, 0xBD, 0x9A, 0xBE, 0x9A, 0xBF, /* 0xE4-0xE7 */ + 0x9A, 0xC0, 0x9A, 0xC1, 0xEB, 0xA4, 0xBA, 0xC1, /* 0xE8-0xEB */ + 0x9A, 0xC2, 0x9A, 0xC3, 0x9A, 0xC4, 0xCC, 0xBA, /* 0xEC-0xEF */ + 0x9A, 0xC5, 0x9A, 0xC6, 0x9A, 0xC7, 0xEB, 0xA5, /* 0xF0-0xF3 */ + 0x9A, 0xC8, 0xEB, 0xA7, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xF4-0xF7 */ + 0x9A, 0xCB, 0xEB, 0xA8, 0x9A, 0xCC, 0x9A, 0xCD, /* 0xF8-0xFB */ + 0x9A, 0xCE, 0xEB, 0xA6, 0x9A, 0xCF, 0x9A, 0xD0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6C[512] = { + 0x9A, 0xD1, 0x9A, 0xD2, 0x9A, 0xD3, 0x9A, 0xD4, /* 0x00-0x03 */ + 0x9A, 
0xD5, 0xEB, 0xA9, 0xEB, 0xAB, 0xEB, 0xAA, /* 0x04-0x07 */ + 0x9A, 0xD6, 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, /* 0x08-0x0B */ + 0x9A, 0xDA, 0xEB, 0xAC, 0x9A, 0xDB, 0xCA, 0xCF, /* 0x0C-0x0F */ + 0xD8, 0xB5, 0xC3, 0xF1, 0x9A, 0xDC, 0xC3, 0xA5, /* 0x10-0x13 */ + 0xC6, 0xF8, 0xEB, 0xAD, 0xC4, 0xCA, 0x9A, 0xDD, /* 0x14-0x17 */ + 0xEB, 0xAE, 0xEB, 0xAF, 0xEB, 0xB0, 0xB7, 0xD5, /* 0x18-0x1B */ + 0x9A, 0xDE, 0x9A, 0xDF, 0x9A, 0xE0, 0xB7, 0xFA, /* 0x1C-0x1F */ + 0x9A, 0xE1, 0xEB, 0xB1, 0xC7, 0xE2, 0x9A, 0xE2, /* 0x20-0x23 */ + 0xEB, 0xB3, 0x9A, 0xE3, 0xBA, 0xA4, 0xD1, 0xF5, /* 0x24-0x27 */ + 0xB0, 0xB1, 0xEB, 0xB2, 0xEB, 0xB4, 0x9A, 0xE4, /* 0x28-0x2B */ + 0x9A, 0xE5, 0x9A, 0xE6, 0xB5, 0xAA, 0xC2, 0xC8, /* 0x2C-0x2F */ + 0xC7, 0xE8, 0x9A, 0xE7, 0xEB, 0xB5, 0x9A, 0xE8, /* 0x30-0x33 */ + 0xCB, 0xAE, 0xE3, 0xDF, 0x9A, 0xE9, 0x9A, 0xEA, /* 0x34-0x37 */ + 0xD3, 0xC0, 0x9A, 0xEB, 0x9A, 0xEC, 0x9A, 0xED, /* 0x38-0x3B */ + 0x9A, 0xEE, 0xD9, 0xDB, 0x9A, 0xEF, 0x9A, 0xF0, /* 0x3C-0x3F */ + 0xCD, 0xA1, 0xD6, 0xAD, 0xC7, 0xF3, 0x9A, 0xF1, /* 0x40-0x43 */ + 0x9A, 0xF2, 0x9A, 0xF3, 0xD9, 0xE0, 0xBB, 0xE3, /* 0x44-0x47 */ + 0x9A, 0xF4, 0xBA, 0xBA, 0xE3, 0xE2, 0x9A, 0xF5, /* 0x48-0x4B */ + 0x9A, 0xF6, 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, /* 0x4C-0x4F */ + 0xCF, 0xAB, 0x9A, 0xFA, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x50-0x53 */ + 0xE3, 0xE0, 0xC9, 0xC7, 0x9A, 0xFD, 0xBA, 0xB9, /* 0x54-0x57 */ + 0x9A, 0xFE, 0x9B, 0x40, 0x9B, 0x41, 0xD1, 0xB4, /* 0x58-0x5B */ + 0xE3, 0xE1, 0xC8, 0xEA, 0xB9, 0xAF, 0xBD, 0xAD, /* 0x5C-0x5F */ + 0xB3, 0xD8, 0xCE, 0xDB, 0x9B, 0x42, 0x9B, 0x43, /* 0x60-0x63 */ + 0xCC, 0xC0, 0x9B, 0x44, 0x9B, 0x45, 0x9B, 0x46, /* 0x64-0x67 */ + 0xE3, 0xE8, 0xE3, 0xE9, 0xCD, 0xF4, 0x9B, 0x47, /* 0x68-0x6B */ + 0x9B, 0x48, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x6C-0x6F */ + 0xCC, 0xAD, 0x9B, 0x4C, 0xBC, 0xB3, 0x9B, 0x4D, /* 0x70-0x73 */ + 0xE3, 0xEA, 0x9B, 0x4E, 0xE3, 0xEB, 0x9B, 0x4F, /* 0x74-0x77 */ + 0x9B, 0x50, 0xD0, 0xDA, 0x9B, 0x51, 0x9B, 0x52, /* 0x78-0x7B */ + 0x9B, 0x53, 0xC6, 0xFB, 0xB7, 0xDA, 0x9B, 0x54, /* 0x7C-0x7F */ + + 0x9B, 0x55, 0xC7, 0xDF, 0xD2, 0xCA, 0xCE, 0xD6, /* 0x80-0x83 */ + 0x9B, 0x56, 0xE3, 0xE4, 0xE3, 0xEC, 0x9B, 0x57, /* 0x84-0x87 */ + 0xC9, 0xF2, 0xB3, 0xC1, 0x9B, 0x58, 0x9B, 0x59, /* 0x88-0x8B */ + 0xE3, 0xE7, 0x9B, 0x5A, 0x9B, 0x5B, 0xC6, 0xE3, /* 0x8C-0x8F */ + 0xE3, 0xE5, 0x9B, 0x5C, 0x9B, 0x5D, 0xED, 0xB3, /* 0x90-0x93 */ + 0xE3, 0xE6, 0x9B, 0x5E, 0x9B, 0x5F, 0x9B, 0x60, /* 0x94-0x97 */ + 0x9B, 0x61, 0xC9, 0xB3, 0x9B, 0x62, 0xC5, 0xE6, /* 0x98-0x9B */ + 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, 0xB9, 0xB5, /* 0x9C-0x9F */ + 0x9B, 0x66, 0xC3, 0xBB, 0x9B, 0x67, 0xE3, 0xE3, /* 0xA0-0xA3 */ + 0xC5, 0xBD, 0xC1, 0xA4, 0xC2, 0xD9, 0xB2, 0xD7, /* 0xA4-0xA7 */ + 0x9B, 0x68, 0xE3, 0xED, 0xBB, 0xA6, 0xC4, 0xAD, /* 0xA8-0xAB */ + 0x9B, 0x69, 0xE3, 0xF0, 0xBE, 0xDA, 0x9B, 0x6A, /* 0xAC-0xAF */ + 0x9B, 0x6B, 0xE3, 0xFB, 0xE3, 0xF5, 0xBA, 0xD3, /* 0xB0-0xB3 */ + 0x9B, 0x6C, 0x9B, 0x6D, 0x9B, 0x6E, 0x9B, 0x6F, /* 0xB4-0xB7 */ + 0xB7, 0xD0, 0xD3, 0xCD, 0x9B, 0x70, 0xD6, 0xCE, /* 0xB8-0xBB */ + 0xD5, 0xD3, 0xB9, 0xC1, 0xD5, 0xB4, 0xD1, 0xD8, /* 0xBC-0xBF */ + 0x9B, 0x71, 0x9B, 0x72, 0x9B, 0x73, 0x9B, 0x74, /* 0xC0-0xC3 */ + 0xD0, 0xB9, 0xC7, 0xF6, 0x9B, 0x75, 0x9B, 0x76, /* 0xC4-0xC7 */ + 0x9B, 0x77, 0xC8, 0xAA, 0xB2, 0xB4, 0x9B, 0x78, /* 0xC8-0xCB */ + 0xC3, 0xDA, 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x7B, /* 0xCC-0xCF */ + 0xE3, 0xEE, 0x9B, 0x7C, 0x9B, 0x7D, 0xE3, 0xFC, /* 0xD0-0xD3 */ + 0xE3, 0xEF, 0xB7, 0xA8, 0xE3, 0xF7, 0xE3, 0xF4, /* 0xD4-0xD7 */ + 0x9B, 0x7E, 0x9B, 0x80, 0x9B, 0x81, 0xB7, 0xBA, /* 0xD8-0xDB 
*/ + 0x9B, 0x82, 0x9B, 0x83, 0xC5, 0xA2, 0x9B, 0x84, /* 0xDC-0xDF */ + 0xE3, 0xF6, 0xC5, 0xDD, 0xB2, 0xA8, 0xC6, 0xFC, /* 0xE0-0xE3 */ + 0x9B, 0x85, 0xC4, 0xE0, 0x9B, 0x86, 0x9B, 0x87, /* 0xE4-0xE7 */ + 0xD7, 0xA2, 0x9B, 0x88, 0xC0, 0xE1, 0xE3, 0xF9, /* 0xE8-0xEB */ + 0x9B, 0x89, 0x9B, 0x8A, 0xE3, 0xFA, 0xE3, 0xFD, /* 0xEC-0xEF */ + 0xCC, 0xA9, 0xE3, 0xF3, 0x9B, 0x8B, 0xD3, 0xBE, /* 0xF0-0xF3 */ + 0x9B, 0x8C, 0xB1, 0xC3, 0xED, 0xB4, 0xE3, 0xF1, /* 0xF4-0xF7 */ + 0xE3, 0xF2, 0x9B, 0x8D, 0xE3, 0xF8, 0xD0, 0xBA, /* 0xF8-0xFB */ + 0xC6, 0xC3, 0xD4, 0xF3, 0xE3, 0xFE, 0x9B, 0x8E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6D[512] = { + 0x9B, 0x8F, 0xBD, 0xE0, 0x9B, 0x90, 0x9B, 0x91, /* 0x00-0x03 */ + 0xE4, 0xA7, 0x9B, 0x92, 0x9B, 0x93, 0xE4, 0xA6, /* 0x04-0x07 */ + 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, 0xD1, 0xF3, /* 0x08-0x0B */ + 0xE4, 0xA3, 0x9B, 0x97, 0xE4, 0xA9, 0x9B, 0x98, /* 0x0C-0x0F */ + 0x9B, 0x99, 0x9B, 0x9A, 0xC8, 0xF7, 0x9B, 0x9B, /* 0x10-0x13 */ + 0x9B, 0x9C, 0x9B, 0x9D, 0x9B, 0x9E, 0xCF, 0xB4, /* 0x14-0x17 */ + 0x9B, 0x9F, 0xE4, 0xA8, 0xE4, 0xAE, 0xC2, 0xE5, /* 0x18-0x1B */ + 0x9B, 0xA0, 0x9B, 0xA1, 0xB6, 0xB4, 0x9B, 0xA2, /* 0x1C-0x1F */ + 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, 0x9B, 0xA6, /* 0x20-0x23 */ + 0x9B, 0xA7, 0xBD, 0xF2, 0x9B, 0xA8, 0xE4, 0xA2, /* 0x24-0x27 */ + 0x9B, 0xA9, 0x9B, 0xAA, 0xBA, 0xE9, 0xE4, 0xAA, /* 0x28-0x2B */ + 0x9B, 0xAB, 0x9B, 0xAC, 0xE4, 0xAC, 0x9B, 0xAD, /* 0x2C-0x2F */ + 0x9B, 0xAE, 0xB6, 0xFD, 0xD6, 0xDE, 0xE4, 0xB2, /* 0x30-0x33 */ + 0x9B, 0xAF, 0xE4, 0xAD, 0x9B, 0xB0, 0x9B, 0xB1, /* 0x34-0x37 */ + 0x9B, 0xB2, 0xE4, 0xA1, 0x9B, 0xB3, 0xBB, 0xEE, /* 0x38-0x3B */ + 0xCD, 0xDD, 0xC7, 0xA2, 0xC5, 0xC9, 0x9B, 0xB4, /* 0x3C-0x3F */ + 0x9B, 0xB5, 0xC1, 0xF7, 0x9B, 0xB6, 0xE4, 0xA4, /* 0x40-0x43 */ + 0x9B, 0xB7, 0xC7, 0xB3, 0xBD, 0xAC, 0xBD, 0xBD, /* 0x44-0x47 */ + 0xE4, 0xA5, 0x9B, 0xB8, 0xD7, 0xC7, 0xB2, 0xE2, /* 0x48-0x4B */ + 0x9B, 0xB9, 0xE4, 0xAB, 0xBC, 0xC3, 0xE4, 0xAF, /* 0x4C-0x4F */ + 0x9B, 0xBA, 0xBB, 0xEB, 0xE4, 0xB0, 0xC5, 0xA8, /* 0x50-0x53 */ + 0xE4, 0xB1, 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, /* 0x54-0x57 */ + 0x9B, 0xBE, 0xD5, 0xE3, 0xBF, 0xA3, 0x9B, 0xBF, /* 0x58-0x5B */ + 0xE4, 0xBA, 0x9B, 0xC0, 0xE4, 0xB7, 0x9B, 0xC1, /* 0x5C-0x5F */ + 0xE4, 0xBB, 0x9B, 0xC2, 0x9B, 0xC3, 0xE4, 0xBD, /* 0x60-0x63 */ + 0x9B, 0xC4, 0x9B, 0xC5, 0xC6, 0xD6, 0x9B, 0xC6, /* 0x64-0x67 */ + 0x9B, 0xC7, 0xBA, 0xC6, 0xC0, 0xCB, 0x9B, 0xC8, /* 0x68-0x6B */ + 0x9B, 0xC9, 0x9B, 0xCA, 0xB8, 0xA1, 0xE4, 0xB4, /* 0x6C-0x6F */ + 0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0x70-0x73 */ + 0xD4, 0xA1, 0x9B, 0xCF, 0x9B, 0xD0, 0xBA, 0xA3, /* 0x74-0x77 */ + 0xBD, 0xFE, 0x9B, 0xD1, 0x9B, 0xD2, 0x9B, 0xD3, /* 0x78-0x7B */ + 0xE4, 0xBC, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0x7C-0x7F */ + + 0x9B, 0xD7, 0x9B, 0xD8, 0xCD, 0xBF, 0x9B, 0xD9, /* 0x80-0x83 */ + 0x9B, 0xDA, 0xC4, 0xF9, 0x9B, 0xDB, 0x9B, 0xDC, /* 0x84-0x87 */ + 0xCF, 0xFB, 0xC9, 0xE6, 0x9B, 0xDD, 0x9B, 0xDE, /* 0x88-0x8B */ + 0xD3, 0xBF, 0x9B, 0xDF, 0xCF, 0xD1, 0x9B, 0xE0, /* 0x8C-0x8F */ + 0x9B, 0xE1, 0xE4, 0xB3, 0x9B, 0xE2, 0xE4, 0xB8, /* 0x90-0x93 */ + 0xE4, 0xB9, 0xCC, 0xE9, 0x9B, 0xE3, 0x9B, 0xE4, /* 0x94-0x97 */ + 0x9B, 0xE5, 0x9B, 0xE6, 0x9B, 0xE7, 0xCC, 0xCE, /* 0x98-0x9B */ + 0x9B, 0xE8, 0xC0, 0xD4, 0xE4, 0xB5, 0xC1, 0xB0, /* 0x9C-0x9F */ + 0xE4, 0xB6, 0xCE, 0xD0, 0x9B, 0xE9, 0xBB, 0xC1, /* 0xA0-0xA3 */ + 0xB5, 0xD3, 0x9B, 0xEA, 0xC8, 0xF3, 0xBD, 0xA7, /* 0xA4-0xA7 */ + 0xD5, 0xC7, 0xC9, 0xAC, 0xB8, 0xA2, 0xE4, 0xCA, /* 0xA8-0xAB */ + 0x9B, 0xEB, 0x9B, 0xEC, 0xE4, 0xCC, 0xD1, 0xC4, /* 0xAC-0xAF */ + 
0x9B, 0xED, 0x9B, 0xEE, 0xD2, 0xBA, 0x9B, 0xEF, /* 0xB0-0xB3 */ + 0x9B, 0xF0, 0xBA, 0xAD, 0x9B, 0xF1, 0x9B, 0xF2, /* 0xB4-0xB7 */ + 0xBA, 0xD4, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xB8-0xBB */ + 0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0xE4, 0xC3, /* 0xBC-0xBF */ + 0xB5, 0xED, 0x9B, 0xF9, 0x9B, 0xFA, 0x9B, 0xFB, /* 0xC0-0xC3 */ + 0xD7, 0xCD, 0xE4, 0xC0, 0xCF, 0xFD, 0xE4, 0xBF, /* 0xC4-0xC7 */ + 0x9B, 0xFC, 0x9B, 0xFD, 0x9B, 0xFE, 0xC1, 0xDC, /* 0xC8-0xCB */ + 0xCC, 0xCA, 0x9C, 0x40, 0x9C, 0x41, 0x9C, 0x42, /* 0xCC-0xCF */ + 0x9C, 0x43, 0xCA, 0xE7, 0x9C, 0x44, 0x9C, 0x45, /* 0xD0-0xD3 */ + 0x9C, 0x46, 0x9C, 0x47, 0xC4, 0xD7, 0x9C, 0x48, /* 0xD4-0xD7 */ + 0xCC, 0xD4, 0xE4, 0xC8, 0x9C, 0x49, 0x9C, 0x4A, /* 0xD8-0xDB */ + 0x9C, 0x4B, 0xE4, 0xC7, 0xE4, 0xC1, 0x9C, 0x4C, /* 0xDC-0xDF */ + 0xE4, 0xC4, 0xB5, 0xAD, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xE0-0xE3 */ + 0xD3, 0xD9, 0x9C, 0x4F, 0xE4, 0xC6, 0x9C, 0x50, /* 0xE4-0xE7 */ + 0x9C, 0x51, 0x9C, 0x52, 0x9C, 0x53, 0xD2, 0xF9, /* 0xE8-0xEB */ + 0xB4, 0xE3, 0x9C, 0x54, 0xBB, 0xB4, 0x9C, 0x55, /* 0xEC-0xEF */ + 0x9C, 0x56, 0xC9, 0xEE, 0x9C, 0x57, 0xB4, 0xBE, /* 0xF0-0xF3 */ + 0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0xBB, 0xEC, /* 0xF4-0xF7 */ + 0x9C, 0x5B, 0xD1, 0xCD, 0x9C, 0x5C, 0xCC, 0xED, /* 0xF8-0xFB */ + 0xED, 0xB5, 0x9C, 0x5D, 0x9C, 0x5E, 0x9C, 0x5F, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0x9C, 0x60, 0x9C, 0x61, 0x9C, 0x62, 0x9C, 0x63, /* 0x00-0x03 */ + 0x9C, 0x64, 0xC7, 0xE5, 0x9C, 0x65, 0x9C, 0x66, /* 0x04-0x07 */ + 0x9C, 0x67, 0x9C, 0x68, 0xD4, 0xA8, 0x9C, 0x69, /* 0x08-0x0B */ + 0xE4, 0xCB, 0xD7, 0xD5, 0xE4, 0xC2, 0x9C, 0x6A, /* 0x0C-0x0F */ + 0xBD, 0xA5, 0xE4, 0xC5, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x10-0x13 */ + 0xD3, 0xE6, 0x9C, 0x6D, 0xE4, 0xC9, 0xC9, 0xF8, /* 0x14-0x17 */ + 0x9C, 0x6E, 0x9C, 0x6F, 0xE4, 0xBE, 0x9C, 0x70, /* 0x18-0x1B */ + 0x9C, 0x71, 0xD3, 0xE5, 0x9C, 0x72, 0x9C, 0x73, /* 0x1C-0x1F */ + 0xC7, 0xFE, 0xB6, 0xC9, 0x9C, 0x74, 0xD4, 0xFC, /* 0x20-0x23 */ + 0xB2, 0xB3, 0xE4, 0xD7, 0x9C, 0x75, 0x9C, 0x76, /* 0x24-0x27 */ + 0x9C, 0x77, 0xCE, 0xC2, 0x9C, 0x78, 0xE4, 0xCD, /* 0x28-0x2B */ + 0x9C, 0x79, 0xCE, 0xBC, 0x9C, 0x7A, 0xB8, 0xDB, /* 0x2C-0x2F */ + 0x9C, 0x7B, 0x9C, 0x7C, 0xE4, 0xD6, 0x9C, 0x7D, /* 0x30-0x33 */ + 0xBF, 0xCA, 0x9C, 0x7E, 0x9C, 0x80, 0x9C, 0x81, /* 0x34-0x37 */ + 0xD3, 0xCE, 0x9C, 0x82, 0xC3, 0xEC, 0x9C, 0x83, /* 0x38-0x3B */ + 0x9C, 0x84, 0x9C, 0x85, 0x9C, 0x86, 0x9C, 0x87, /* 0x3C-0x3F */ + 0x9C, 0x88, 0x9C, 0x89, 0x9C, 0x8A, 0xC5, 0xC8, /* 0x40-0x43 */ + 0xE4, 0xD8, 0x9C, 0x8B, 0x9C, 0x8C, 0x9C, 0x8D, /* 0x44-0x47 */ + 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, 0x9C, 0x91, /* 0x48-0x4B */ + 0x9C, 0x92, 0xCD, 0xC4, 0xE4, 0xCF, 0x9C, 0x93, /* 0x4C-0x4F */ + 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, 0xE4, 0xD4, /* 0x50-0x53 */ + 0xE4, 0xD5, 0x9C, 0x97, 0xBA, 0xFE, 0x9C, 0x98, /* 0x54-0x57 */ + 0xCF, 0xE6, 0x9C, 0x99, 0x9C, 0x9A, 0xD5, 0xBF, /* 0x58-0x5B */ + 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xD2, /* 0x5C-0x5F */ + 0x9C, 0x9E, 0x9C, 0x9F, 0x9C, 0xA0, 0x9C, 0xA1, /* 0x60-0x63 */ + 0x9C, 0xA2, 0x9C, 0xA3, 0x9C, 0xA4, 0x9C, 0xA5, /* 0x64-0x67 */ + 0x9C, 0xA6, 0x9C, 0xA7, 0x9C, 0xA8, 0xE4, 0xD0, /* 0x68-0x6B */ + 0x9C, 0xA9, 0x9C, 0xAA, 0xE4, 0xCE, 0x9C, 0xAB, /* 0x6C-0x6F */ + 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, 0x9C, 0xAF, /* 0x70-0x73 */ + 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, 0x9C, 0xB3, /* 0x74-0x77 */ + 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, 0x9C, 0xB7, /* 0x78-0x7B */ + 0x9C, 0xB8, 0x9C, 0xB9, 0xCD, 0xE5, 0xCA, 0xAA, /* 0x7C-0x7F */ + + 0x9C, 0xBA, 0x9C, 0xBB, 0x9C, 0xBC, 0xC0, 0xA3, /* 0x80-0x83 */ + 
0x9C, 0xBD, 0xBD, 0xA6, 0xE4, 0xD3, 0x9C, 0xBE, /* 0x84-0x87 */ + 0x9C, 0xBF, 0xB8, 0xC8, 0x9C, 0xC0, 0x9C, 0xC1, /* 0x88-0x8B */ + 0x9C, 0xC2, 0x9C, 0xC3, 0x9C, 0xC4, 0xE4, 0xE7, /* 0x8C-0x8F */ + 0xD4, 0xB4, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x90-0x93 */ + 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, 0x9C, 0xCB, /* 0x94-0x97 */ + 0xE4, 0xDB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x98-0x9B */ + 0xC1, 0xEF, 0x9C, 0xCF, 0x9C, 0xD0, 0xE4, 0xE9, /* 0x9C-0x9F */ + 0x9C, 0xD1, 0x9C, 0xD2, 0xD2, 0xE7, 0x9C, 0xD3, /* 0xA0-0xA3 */ + 0x9C, 0xD4, 0xE4, 0xDF, 0x9C, 0xD5, 0xE4, 0xE0, /* 0xA4-0xA7 */ + 0x9C, 0xD6, 0x9C, 0xD7, 0xCF, 0xAA, 0x9C, 0xD8, /* 0xA8-0xAB */ + 0x9C, 0xD9, 0x9C, 0xDA, 0x9C, 0xDB, 0xCB, 0xDD, /* 0xAC-0xAF */ + 0x9C, 0xDC, 0xE4, 0xDA, 0xE4, 0xD1, 0x9C, 0xDD, /* 0xB0-0xB3 */ + 0xE4, 0xE5, 0x9C, 0xDE, 0xC8, 0xDC, 0xE4, 0xE3, /* 0xB4-0xB7 */ + 0x9C, 0xDF, 0x9C, 0xE0, 0xC4, 0xE7, 0xE4, 0xE2, /* 0xB8-0xBB */ + 0x9C, 0xE1, 0xE4, 0xE1, 0x9C, 0xE2, 0x9C, 0xE3, /* 0xBC-0xBF */ + 0x9C, 0xE4, 0xB3, 0xFC, 0xE4, 0xE8, 0x9C, 0xE5, /* 0xC0-0xC3 */ + 0x9C, 0xE6, 0x9C, 0xE7, 0x9C, 0xE8, 0xB5, 0xE1, /* 0xC4-0xC7 */ + 0x9C, 0xE9, 0x9C, 0xEA, 0x9C, 0xEB, 0xD7, 0xCC, /* 0xC8-0xCB */ + 0x9C, 0xEC, 0x9C, 0xED, 0x9C, 0xEE, 0xE4, 0xE6, /* 0xCC-0xCF */ + 0x9C, 0xEF, 0xBB, 0xAC, 0x9C, 0xF0, 0xD7, 0xD2, /* 0xD0-0xD3 */ + 0xCC, 0xCF, 0xEB, 0xF8, 0x9C, 0xF1, 0xE4, 0xE4, /* 0xD4-0xD7 */ + 0x9C, 0xF2, 0x9C, 0xF3, 0xB9, 0xF6, 0x9C, 0xF4, /* 0xD8-0xDB */ + 0x9C, 0xF5, 0x9C, 0xF6, 0xD6, 0xCD, 0xE4, 0xD9, /* 0xDC-0xDF */ + 0xE4, 0xDC, 0xC2, 0xFA, 0xE4, 0xDE, 0x9C, 0xF7, /* 0xE0-0xE3 */ + 0xC2, 0xCB, 0xC0, 0xC4, 0xC2, 0xD0, 0x9C, 0xF8, /* 0xE4-0xE7 */ + 0xB1, 0xF5, 0xCC, 0xB2, 0x9C, 0xF9, 0x9C, 0xFA, /* 0xE8-0xEB */ + 0x9C, 0xFB, 0x9C, 0xFC, 0x9C, 0xFD, 0x9C, 0xFE, /* 0xEC-0xEF */ + 0x9D, 0x40, 0x9D, 0x41, 0x9D, 0x42, 0x9D, 0x43, /* 0xF0-0xF3 */ + 0xB5, 0xCE, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xF4-0xF7 */ + 0x9D, 0x47, 0xE4, 0xEF, 0x9D, 0x48, 0x9D, 0x49, /* 0xF8-0xFB */ + 0x9D, 0x4A, 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x9D, 0x4E, 0x9D, 0x4F, 0xC6, 0xAF, 0x9D, 0x50, /* 0x00-0x03 */ + 0x9D, 0x51, 0x9D, 0x52, 0xC6, 0xE1, 0x9D, 0x53, /* 0x04-0x07 */ + 0x9D, 0x54, 0xE4, 0xF5, 0x9D, 0x55, 0x9D, 0x56, /* 0x08-0x0B */ + 0x9D, 0x57, 0x9D, 0x58, 0x9D, 0x59, 0xC2, 0xA9, /* 0x0C-0x0F */ + 0x9D, 0x5A, 0x9D, 0x5B, 0x9D, 0x5C, 0xC0, 0xEC, /* 0x10-0x13 */ + 0xD1, 0xDD, 0xE4, 0xEE, 0x9D, 0x5D, 0x9D, 0x5E, /* 0x14-0x17 */ + 0x9D, 0x5F, 0x9D, 0x60, 0x9D, 0x61, 0x9D, 0x62, /* 0x18-0x1B */ + 0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0x1C-0x1F */ + 0xC4, 0xAE, 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, /* 0x20-0x23 */ + 0xE4, 0xED, 0x9D, 0x6A, 0x9D, 0x6B, 0x9D, 0x6C, /* 0x24-0x27 */ + 0x9D, 0x6D, 0xE4, 0xF6, 0xE4, 0xF4, 0xC2, 0xFE, /* 0x28-0x2B */ + 0x9D, 0x6E, 0xE4, 0xDD, 0x9D, 0x6F, 0xE4, 0xF0, /* 0x2C-0x2F */ + 0x9D, 0x70, 0xCA, 0xFE, 0x9D, 0x71, 0xD5, 0xC4, /* 0x30-0x33 */ + 0x9D, 0x72, 0x9D, 0x73, 0xE4, 0xF1, 0x9D, 0x74, /* 0x34-0x37 */ + 0x9D, 0x75, 0x9D, 0x76, 0x9D, 0x77, 0x9D, 0x78, /* 0x38-0x3B */ + 0x9D, 0x79, 0x9D, 0x7A, 0xD1, 0xFA, 0x9D, 0x7B, /* 0x3C-0x3F */ + 0x9D, 0x7C, 0x9D, 0x7D, 0x9D, 0x7E, 0x9D, 0x80, /* 0x40-0x43 */ + 0x9D, 0x81, 0x9D, 0x82, 0xE4, 0xEB, 0xE4, 0xEC, /* 0x44-0x47 */ + 0x9D, 0x83, 0x9D, 0x84, 0x9D, 0x85, 0xE4, 0xF2, /* 0x48-0x4B */ + 0x9D, 0x86, 0xCE, 0xAB, 0x9D, 0x87, 0x9D, 0x88, /* 0x4C-0x4F */ + 0x9D, 0x89, 0x9D, 0x8A, 0x9D, 0x8B, 0x9D, 0x8C, /* 0x50-0x53 */ + 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, 0x9D, 0x90, /* 0x54-0x57 */ + 0xC5, 
0xCB, 0x9D, 0x91, 0x9D, 0x92, 0x9D, 0x93, /* 0x58-0x5B */ + 0xC7, 0xB1, 0x9D, 0x94, 0xC2, 0xBA, 0x9D, 0x95, /* 0x5C-0x5F */ + 0x9D, 0x96, 0x9D, 0x97, 0xE4, 0xEA, 0x9D, 0x98, /* 0x60-0x63 */ + 0x9D, 0x99, 0x9D, 0x9A, 0xC1, 0xCA, 0x9D, 0x9B, /* 0x64-0x67 */ + 0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x68-0x6B */ + 0x9D, 0xA0, 0xCC, 0xB6, 0xB3, 0xB1, 0x9D, 0xA1, /* 0x6C-0x6F */ + 0x9D, 0xA2, 0x9D, 0xA3, 0xE4, 0xFB, 0x9D, 0xA4, /* 0x70-0x73 */ + 0xE4, 0xF3, 0x9D, 0xA5, 0x9D, 0xA6, 0x9D, 0xA7, /* 0x74-0x77 */ + 0xE4, 0xFA, 0x9D, 0xA8, 0xE4, 0xFD, 0x9D, 0xA9, /* 0x78-0x7B */ + 0xE4, 0xFC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x7C-0x7F */ + + 0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x80-0x83 */ + 0xB3, 0xCE, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x84-0x87 */ + 0xB3, 0xBA, 0xE4, 0xF7, 0x9D, 0xB4, 0x9D, 0xB5, /* 0x88-0x8B */ + 0xE4, 0xF9, 0xE4, 0xF8, 0xC5, 0xEC, 0x9D, 0xB6, /* 0x8C-0x8F */ + 0x9D, 0xB7, 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, /* 0x90-0x93 */ + 0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBD, 0x9D, 0xBE, /* 0x94-0x97 */ + 0x9D, 0xBF, 0x9D, 0xC0, 0x9D, 0xC1, 0x9D, 0xC2, /* 0x98-0x9B */ + 0xC0, 0xBD, 0x9D, 0xC3, 0x9D, 0xC4, 0x9D, 0xC5, /* 0x9C-0x9F */ + 0x9D, 0xC6, 0xD4, 0xE8, 0x9D, 0xC7, 0x9D, 0xC8, /* 0xA0-0xA3 */ + 0x9D, 0xC9, 0x9D, 0xCA, 0x9D, 0xCB, 0xE5, 0xA2, /* 0xA4-0xA7 */ + 0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0xA8-0xAB */ + 0x9D, 0xD0, 0x9D, 0xD1, 0x9D, 0xD2, 0x9D, 0xD3, /* 0xAC-0xAF */ + 0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xB0, 0xC4, /* 0xB0-0xB3 */ + 0x9D, 0xD7, 0x9D, 0xD8, 0xE5, 0xA4, 0x9D, 0xD9, /* 0xB4-0xB7 */ + 0x9D, 0xDA, 0xE5, 0xA3, 0x9D, 0xDB, 0x9D, 0xDC, /* 0xB8-0xBB */ + 0x9D, 0xDD, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0xBC-0xBF */ + 0xBC, 0xA4, 0x9D, 0xE1, 0xE5, 0xA5, 0x9D, 0xE2, /* 0xC0-0xC3 */ + 0x9D, 0xE3, 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, /* 0xC4-0xC7 */ + 0x9D, 0xE7, 0xE5, 0xA1, 0x9D, 0xE8, 0x9D, 0xE9, /* 0xC8-0xCB */ + 0x9D, 0xEA, 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, /* 0xCC-0xCF */ + 0x9D, 0xEE, 0xE4, 0xFE, 0xB1, 0xF4, 0x9D, 0xEF, /* 0xD0-0xD3 */ + 0x9D, 0xF0, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0xD4-0xD7 */ + 0x9D, 0xF4, 0x9D, 0xF5, 0x9D, 0xF6, 0x9D, 0xF7, /* 0xD8-0xDB */ + 0x9D, 0xF8, 0x9D, 0xF9, 0xE5, 0xA8, 0x9D, 0xFA, /* 0xDC-0xDF */ + 0xE5, 0xA9, 0xE5, 0xA6, 0x9D, 0xFB, 0x9D, 0xFC, /* 0xE0-0xE3 */ + 0x9D, 0xFD, 0x9D, 0xFE, 0x9E, 0x40, 0x9E, 0x41, /* 0xE4-0xE7 */ + 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, 0x9E, 0x45, /* 0xE8-0xEB */ + 0x9E, 0x46, 0x9E, 0x47, 0xE5, 0xA7, 0xE5, 0xAA, /* 0xEC-0xEF */ + 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, 0x9E, 0x4B, /* 0xF0-0xF3 */ + 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, 0x9E, 0x4F, /* 0xF4-0xF7 */ + 0x9E, 0x50, 0x9E, 0x51, 0x9E, 0x52, 0x9E, 0x53, /* 0xF8-0xFB */ + 0x9E, 0x54, 0x9E, 0x55, 0x9E, 0x56, 0x9E, 0x57, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x9E, 0x58, 0x9E, 0x59, 0x9E, 0x5A, 0x9E, 0x5B, /* 0x00-0x03 */ + 0x9E, 0x5C, 0x9E, 0x5D, 0x9E, 0x5E, 0x9E, 0x5F, /* 0x04-0x07 */ + 0x9E, 0x60, 0x9E, 0x61, 0x9E, 0x62, 0x9E, 0x63, /* 0x08-0x0B */ + 0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0x0C-0x0F */ + 0x9E, 0x68, 0xC6, 0xD9, 0x9E, 0x69, 0x9E, 0x6A, /* 0x10-0x13 */ + 0x9E, 0x6B, 0x9E, 0x6C, 0x9E, 0x6D, 0x9E, 0x6E, /* 0x14-0x17 */ + 0x9E, 0x6F, 0x9E, 0x70, 0xE5, 0xAB, 0xE5, 0xAD, /* 0x18-0x1B */ + 0x9E, 0x71, 0x9E, 0x72, 0x9E, 0x73, 0x9E, 0x74, /* 0x1C-0x1F */ + 0x9E, 0x75, 0x9E, 0x76, 0x9E, 0x77, 0xE5, 0xAC, /* 0x20-0x23 */ + 0x9E, 0x78, 0x9E, 0x79, 0x9E, 0x7A, 0x9E, 0x7B, /* 0x24-0x27 */ + 0x9E, 0x7C, 0x9E, 0x7D, 0x9E, 0x7E, 0x9E, 0x80, /* 0x28-0x2B */ + 0x9E, 
0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0x2C-0x2F */ + 0x9E, 0x85, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0x30-0x33 */ + 0x9E, 0x89, 0xE5, 0xAF, 0x9E, 0x8A, 0x9E, 0x8B, /* 0x34-0x37 */ + 0x9E, 0x8C, 0xE5, 0xAE, 0x9E, 0x8D, 0x9E, 0x8E, /* 0x38-0x3B */ + 0x9E, 0x8F, 0x9E, 0x90, 0x9E, 0x91, 0x9E, 0x92, /* 0x3C-0x3F */ + 0x9E, 0x93, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x40-0x43 */ + 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, 0x9E, 0x9A, /* 0x44-0x47 */ + 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, 0x9E, 0x9E, /* 0x48-0x4B */ + 0xB9, 0xE0, 0x9E, 0x9F, 0x9E, 0xA0, 0xE5, 0xB0, /* 0x4C-0x4F */ + 0x9E, 0xA1, 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, /* 0x50-0x53 */ + 0x9E, 0xA5, 0x9E, 0xA6, 0x9E, 0xA7, 0x9E, 0xA8, /* 0x54-0x57 */ + 0x9E, 0xA9, 0x9E, 0xAA, 0x9E, 0xAB, 0x9E, 0xAC, /* 0x58-0x5B */ + 0x9E, 0xAD, 0x9E, 0xAE, 0xE5, 0xB1, 0x9E, 0xAF, /* 0x5C-0x5F */ + 0x9E, 0xB0, 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, /* 0x60-0x63 */ + 0x9E, 0xB4, 0x9E, 0xB5, 0x9E, 0xB6, 0x9E, 0xB7, /* 0x64-0x67 */ + 0x9E, 0xB8, 0x9E, 0xB9, 0x9E, 0xBA, 0xBB, 0xF0, /* 0x68-0x6B */ + 0xEC, 0xE1, 0xC3, 0xF0, 0x9E, 0xBB, 0xB5, 0xC6, /* 0x6C-0x6F */ + 0xBB, 0xD2, 0x9E, 0xBC, 0x9E, 0xBD, 0x9E, 0xBE, /* 0x70-0x73 */ + 0x9E, 0xBF, 0xC1, 0xE9, 0xD4, 0xEE, 0x9E, 0xC0, /* 0x74-0x77 */ + 0xBE, 0xC4, 0x9E, 0xC1, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x78-0x7B */ + 0xD7, 0xC6, 0x9E, 0xC4, 0xD4, 0xD6, 0xB2, 0xD3, /* 0x7C-0x7F */ + + 0xEC, 0xBE, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x80-0x83 */ + 0x9E, 0xC8, 0xEA, 0xC1, 0x9E, 0xC9, 0x9E, 0xCA, /* 0x84-0x87 */ + 0x9E, 0xCB, 0xC2, 0xAF, 0xB4, 0xB6, 0x9E, 0xCC, /* 0x88-0x8B */ + 0x9E, 0xCD, 0x9E, 0xCE, 0xD1, 0xD7, 0x9E, 0xCF, /* 0x8C-0x8F */ + 0x9E, 0xD0, 0x9E, 0xD1, 0xB3, 0xB4, 0x9E, 0xD2, /* 0x90-0x93 */ + 0xC8, 0xB2, 0xBF, 0xBB, 0xEC, 0xC0, 0x9E, 0xD3, /* 0x94-0x97 */ + 0x9E, 0xD4, 0xD6, 0xCB, 0x9E, 0xD5, 0x9E, 0xD6, /* 0x98-0x9B */ + 0xEC, 0xBF, 0xEC, 0xC1, 0x9E, 0xD7, 0x9E, 0xD8, /* 0x9C-0x9F */ + 0x9E, 0xD9, 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, /* 0xA0-0xA3 */ + 0x9E, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, 0x9E, 0xE0, /* 0xA4-0xA7 */ + 0x9E, 0xE1, 0x9E, 0xE2, 0x9E, 0xE3, 0xEC, 0xC5, /* 0xA8-0xAB */ + 0xBE, 0xE6, 0xCC, 0xBF, 0xC5, 0xDA, 0xBE, 0xBC, /* 0xAC-0xAF */ + 0x9E, 0xE4, 0xEC, 0xC6, 0x9E, 0xE5, 0xB1, 0xFE, /* 0xB0-0xB3 */ + 0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0xEC, 0xC4, /* 0xB4-0xB7 */ + 0xD5, 0xA8, 0xB5, 0xE3, 0x9E, 0xE9, 0xEC, 0xC2, /* 0xB8-0xBB */ + 0xC1, 0xB6, 0xB3, 0xE3, 0x9E, 0xEA, 0x9E, 0xEB, /* 0xBC-0xBF */ + 0xEC, 0xC3, 0xCB, 0xB8, 0xC0, 0xC3, 0xCC, 0xFE, /* 0xC0-0xC3 */ + 0x9E, 0xEC, 0x9E, 0xED, 0x9E, 0xEE, 0x9E, 0xEF, /* 0xC4-0xC7 */ + 0xC1, 0xD2, 0x9E, 0xF0, 0xEC, 0xC8, 0x9E, 0xF1, /* 0xC8-0xCB */ + 0x9E, 0xF2, 0x9E, 0xF3, 0x9E, 0xF4, 0x9E, 0xF5, /* 0xCC-0xCF */ + 0x9E, 0xF6, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0xD0-0xD3 */ + 0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xD4-0xD7 */ + 0xBA, 0xE6, 0xC0, 0xD3, 0x9E, 0xFE, 0xD6, 0xF2, /* 0xD8-0xDB */ + 0x9F, 0x40, 0x9F, 0x41, 0x9F, 0x42, 0xD1, 0xCC, /* 0xDC-0xDF */ + 0x9F, 0x43, 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, /* 0xE0-0xE3 */ + 0xBF, 0xBE, 0x9F, 0x47, 0xB7, 0xB3, 0xC9, 0xD5, /* 0xE4-0xE7 */ + 0xEC, 0xC7, 0xBB, 0xE2, 0x9F, 0x48, 0xCC, 0xCC, /* 0xE8-0xEB */ + 0xBD, 0xFD, 0xC8, 0xC8, 0x9F, 0x49, 0xCF, 0xA9, /* 0xEC-0xEF */ + 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, 0x9F, 0x4D, /* 0xF0-0xF3 */ + 0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0xCD, 0xE9, /* 0xF4-0xF7 */ + 0x9F, 0x51, 0xC5, 0xEB, 0x9F, 0x52, 0x9F, 0x53, /* 0xF8-0xFB */ + 0x9F, 0x54, 0xB7, 0xE9, 0x9F, 0x55, 0x9F, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x9F, 
0x57, 0x9F, 0x58, 0x9F, 0x59, 0x9F, 0x5A, /* 0x00-0x03 */ + 0x9F, 0x5B, 0x9F, 0x5C, 0x9F, 0x5D, 0x9F, 0x5E, /* 0x04-0x07 */ + 0x9F, 0x5F, 0xD1, 0xC9, 0xBA, 0xB8, 0x9F, 0x60, /* 0x08-0x0B */ + 0x9F, 0x61, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0x0C-0x0F */ + 0xEC, 0xC9, 0x9F, 0x65, 0x9F, 0x66, 0xEC, 0xCA, /* 0x10-0x13 */ + 0x9F, 0x67, 0xBB, 0xC0, 0xEC, 0xCB, 0x9F, 0x68, /* 0x14-0x17 */ + 0xEC, 0xE2, 0xB1, 0xBA, 0xB7, 0xD9, 0x9F, 0x69, /* 0x18-0x1B */ + 0x9F, 0x6A, 0x9F, 0x6B, 0x9F, 0x6C, 0x9F, 0x6D, /* 0x1C-0x1F */ + 0x9F, 0x6E, 0x9F, 0x6F, 0x9F, 0x70, 0x9F, 0x71, /* 0x20-0x23 */ + 0x9F, 0x72, 0x9F, 0x73, 0xBD, 0xB9, 0x9F, 0x74, /* 0x24-0x27 */ + 0x9F, 0x75, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0x28-0x2B */ + 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x7B, 0xEC, 0xCC, /* 0x2C-0x2F */ + 0xD1, 0xE6, 0xEC, 0xCD, 0x9F, 0x7C, 0x9F, 0x7D, /* 0x30-0x33 */ + 0x9F, 0x7E, 0x9F, 0x80, 0xC8, 0xBB, 0x9F, 0x81, /* 0x34-0x37 */ + 0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0x38-0x3B */ + 0x9F, 0x86, 0x9F, 0x87, 0x9F, 0x88, 0x9F, 0x89, /* 0x3C-0x3F */ + 0x9F, 0x8A, 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, /* 0x40-0x43 */ + 0x9F, 0x8E, 0xEC, 0xD1, 0x9F, 0x8F, 0x9F, 0x90, /* 0x44-0x47 */ + 0x9F, 0x91, 0x9F, 0x92, 0xEC, 0xD3, 0x9F, 0x93, /* 0x48-0x4B */ + 0xBB, 0xCD, 0x9F, 0x94, 0xBC, 0xE5, 0x9F, 0x95, /* 0x4C-0x4F */ + 0x9F, 0x96, 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, /* 0x50-0x53 */ + 0x9F, 0x9A, 0x9F, 0x9B, 0x9F, 0x9C, 0x9F, 0x9D, /* 0x54-0x57 */ + 0x9F, 0x9E, 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, /* 0x58-0x5B */ + 0xEC, 0xCF, 0x9F, 0xA2, 0xC9, 0xB7, 0x9F, 0xA3, /* 0x5C-0x5F */ + 0x9F, 0xA4, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x60-0x63 */ + 0xC3, 0xBA, 0x9F, 0xA8, 0xEC, 0xE3, 0xD5, 0xD5, /* 0x64-0x67 */ + 0xEC, 0xD0, 0x9F, 0xA9, 0x9F, 0xAA, 0x9F, 0xAB, /* 0x68-0x6B */ + 0x9F, 0xAC, 0x9F, 0xAD, 0xD6, 0xF3, 0x9F, 0xAE, /* 0x6C-0x6F */ + 0x9F, 0xAF, 0x9F, 0xB0, 0xEC, 0xD2, 0xEC, 0xCE, /* 0x70-0x73 */ + 0x9F, 0xB1, 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, /* 0x74-0x77 */ + 0xEC, 0xD4, 0x9F, 0xB5, 0xEC, 0xD5, 0x9F, 0xB6, /* 0x78-0x7B */ + 0x9F, 0xB7, 0xC9, 0xBF, 0x9F, 0xB8, 0x9F, 0xB9, /* 0x7C-0x7F */ + + 0x9F, 0xBA, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x80-0x83 */ + 0xCF, 0xA8, 0x9F, 0xBE, 0x9F, 0xBF, 0x9F, 0xC0, /* 0x84-0x87 */ + 0x9F, 0xC1, 0x9F, 0xC2, 0xD0, 0xDC, 0x9F, 0xC3, /* 0x88-0x8B */ + 0x9F, 0xC4, 0x9F, 0xC5, 0x9F, 0xC6, 0xD1, 0xAC, /* 0x8C-0x8F */ + 0x9F, 0xC7, 0x9F, 0xC8, 0x9F, 0xC9, 0x9F, 0xCA, /* 0x90-0x93 */ + 0xC8, 0xDB, 0x9F, 0xCB, 0x9F, 0xCC, 0x9F, 0xCD, /* 0x94-0x97 */ + 0xEC, 0xD6, 0xCE, 0xF5, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x98-0x9B */ + 0x9F, 0xD0, 0x9F, 0xD1, 0x9F, 0xD2, 0xCA, 0xEC, /* 0x9C-0x9F */ + 0xEC, 0xDA, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0xA0-0xA3 */ + 0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0xA4-0xA7 */ + 0xEC, 0xD9, 0x9F, 0xDA, 0x9F, 0xDB, 0x9F, 0xDC, /* 0xA8-0xAB */ + 0xB0, 0xBE, 0x9F, 0xDD, 0x9F, 0xDE, 0x9F, 0xDF, /* 0xAC-0xAF */ + 0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xEC, 0xD7, /* 0xB0-0xB3 */ + 0x9F, 0xE3, 0xEC, 0xD8, 0x9F, 0xE4, 0x9F, 0xE5, /* 0xB4-0xB7 */ + 0x9F, 0xE6, 0xEC, 0xE4, 0x9F, 0xE7, 0x9F, 0xE8, /* 0xB8-0xBB */ + 0x9F, 0xE9, 0x9F, 0xEA, 0x9F, 0xEB, 0x9F, 0xEC, /* 0xBC-0xBF */ + 0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0xC8, 0xBC, /* 0xC0-0xC3 */ + 0x9F, 0xF0, 0x9F, 0xF1, 0x9F, 0xF2, 0x9F, 0xF3, /* 0xC4-0xC7 */ + 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, 0x9F, 0xF7, /* 0xC8-0xCB */ + 0x9F, 0xF8, 0x9F, 0xF9, 0xC1, 0xC7, 0x9F, 0xFA, /* 0xCC-0xCF */ + 0x9F, 0xFB, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xD0-0xD3 */ + 0xEC, 0xDC, 0xD1, 0xE0, 0xA0, 0x40, 0xA0, 0x41, /* 0xD4-0xD7 
*/ + 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, 0xA0, 0x45, /* 0xD8-0xDB */ + 0xA0, 0x46, 0xA0, 0x47, 0xA0, 0x48, 0xA0, 0x49, /* 0xDC-0xDF */ + 0xEC, 0xDB, 0xA0, 0x4A, 0xA0, 0x4B, 0xA0, 0x4C, /* 0xE0-0xE3 */ + 0xA0, 0x4D, 0xD4, 0xEF, 0xA0, 0x4E, 0xEC, 0xDD, /* 0xE4-0xE7 */ + 0xA0, 0x4F, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xE8-0xEB */ + 0xA0, 0x53, 0xA0, 0x54, 0xDB, 0xC6, 0xA0, 0x55, /* 0xEC-0xEF */ + 0xA0, 0x56, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xF0-0xF3 */ + 0xA0, 0x5A, 0xA0, 0x5B, 0xA0, 0x5C, 0xA0, 0x5D, /* 0xF4-0xF7 */ + 0xA0, 0x5E, 0xEC, 0xDE, 0xA0, 0x5F, 0xA0, 0x60, /* 0xF8-0xFB */ + 0xA0, 0x61, 0xA0, 0x62, 0xA0, 0x63, 0xA0, 0x64, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0xA0, 0x65, 0xA0, 0x66, 0xA0, 0x67, 0xA0, 0x68, /* 0x00-0x03 */ + 0xA0, 0x69, 0xA0, 0x6A, 0xB1, 0xAC, 0xA0, 0x6B, /* 0x04-0x07 */ + 0xA0, 0x6C, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0x08-0x0B */ + 0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0x0C-0x0F */ + 0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0x10-0x13 */ + 0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x7B, /* 0x14-0x17 */ + 0xA0, 0x7C, 0xA0, 0x7D, 0xA0, 0x7E, 0xA0, 0x80, /* 0x18-0x1B */ + 0xA0, 0x81, 0xEC, 0xDF, 0xA0, 0x82, 0xA0, 0x83, /* 0x1C-0x1F */ + 0xA0, 0x84, 0xA0, 0x85, 0xA0, 0x86, 0xA0, 0x87, /* 0x20-0x23 */ + 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, 0xA0, 0x8B, /* 0x24-0x27 */ + 0xEC, 0xE0, 0xA0, 0x8C, 0xD7, 0xA6, 0xA0, 0x8D, /* 0x28-0x2B */ + 0xC5, 0xC0, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x2C-0x2F */ + 0xEB, 0xBC, 0xB0, 0xAE, 0xA0, 0x91, 0xA0, 0x92, /* 0x30-0x33 */ + 0xA0, 0x93, 0xBE, 0xF4, 0xB8, 0xB8, 0xD2, 0xAF, /* 0x34-0x37 */ + 0xB0, 0xD6, 0xB5, 0xF9, 0xA0, 0x94, 0xD8, 0xB3, /* 0x38-0x3B */ + 0xA0, 0x95, 0xCB, 0xAC, 0xA0, 0x96, 0xE3, 0xDD, /* 0x3C-0x3F */ + 0xA0, 0x97, 0xA0, 0x98, 0xA0, 0x99, 0xA0, 0x9A, /* 0x40-0x43 */ + 0xA0, 0x9B, 0xA0, 0x9C, 0xA0, 0x9D, 0xC6, 0xAC, /* 0x44-0x47 */ + 0xB0, 0xE6, 0xA0, 0x9E, 0xA0, 0x9F, 0xA0, 0xA0, /* 0x48-0x4B */ + 0xC5, 0xC6, 0xEB, 0xB9, 0xA0, 0xA1, 0xA0, 0xA2, /* 0x4C-0x4F */ + 0xA0, 0xA3, 0xA0, 0xA4, 0xEB, 0xBA, 0xA0, 0xA5, /* 0x50-0x53 */ + 0xA0, 0xA6, 0xA0, 0xA7, 0xEB, 0xBB, 0xA0, 0xA8, /* 0x54-0x57 */ + 0xA0, 0xA9, 0xD1, 0xC0, 0xA0, 0xAA, 0xC5, 0xA3, /* 0x58-0x5B */ + 0xA0, 0xAB, 0xEA, 0xF2, 0xA0, 0xAC, 0xC4, 0xB2, /* 0x5C-0x5F */ + 0xA0, 0xAD, 0xC4, 0xB5, 0xC0, 0xCE, 0xA0, 0xAE, /* 0x60-0x63 */ + 0xA0, 0xAF, 0xA0, 0xB0, 0xEA, 0xF3, 0xC4, 0xC1, /* 0x64-0x67 */ + 0xA0, 0xB1, 0xCE, 0xEF, 0xA0, 0xB2, 0xA0, 0xB3, /* 0x68-0x6B */ + 0xA0, 0xB4, 0xA0, 0xB5, 0xEA, 0xF0, 0xEA, 0xF4, /* 0x6C-0x6F */ + 0xA0, 0xB6, 0xA0, 0xB7, 0xC9, 0xFC, 0xA0, 0xB8, /* 0x70-0x73 */ + 0xA0, 0xB9, 0xC7, 0xA3, 0xA0, 0xBA, 0xA0, 0xBB, /* 0x74-0x77 */ + 0xA0, 0xBC, 0xCC, 0xD8, 0xCE, 0xFE, 0xA0, 0xBD, /* 0x78-0x7B */ + 0xA0, 0xBE, 0xA0, 0xBF, 0xEA, 0xF5, 0xEA, 0xF6, /* 0x7C-0x7F */ + + 0xCF, 0xAC, 0xC0, 0xE7, 0xA0, 0xC0, 0xA0, 0xC1, /* 0x80-0x83 */ + 0xEA, 0xF7, 0xA0, 0xC2, 0xA0, 0xC3, 0xA0, 0xC4, /* 0x84-0x87 */ + 0xA0, 0xC5, 0xA0, 0xC6, 0xB6, 0xBF, 0xEA, 0xF8, /* 0x88-0x8B */ + 0xA0, 0xC7, 0xEA, 0xF9, 0xA0, 0xC8, 0xEA, 0xFA, /* 0x8C-0x8F */ + 0xA0, 0xC9, 0xA0, 0xCA, 0xEA, 0xFB, 0xA0, 0xCB, /* 0x90-0x93 */ + 0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x94-0x97 */ + 0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x98-0x9B */ + 0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xEA, 0xF1, /* 0x9C-0x9F */ + 0xA0, 0xD7, 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, /* 0xA0-0xA3 */ + 0xA0, 0xDB, 0xA0, 0xDC, 0xA0, 0xDD, 0xA0, 0xDE, /* 0xA4-0xA7 */ + 0xA0, 0xDF, 0xA0, 0xE0, 0xA0, 0xE1, 0xA0, 0xE2, /* 0xA8-0xAB */ + 
0xC8, 0xAE, 0xE1, 0xEB, 0xA0, 0xE3, 0xB7, 0xB8, /* 0xAC-0xAF */ + 0xE1, 0xEC, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0xB0-0xB3 */ + 0xE1, 0xED, 0xA0, 0xE7, 0xD7, 0xB4, 0xE1, 0xEE, /* 0xB4-0xB7 */ + 0xE1, 0xEF, 0xD3, 0xCC, 0xA0, 0xE8, 0xA0, 0xE9, /* 0xB8-0xBB */ + 0xA0, 0xEA, 0xA0, 0xEB, 0xA0, 0xEC, 0xA0, 0xED, /* 0xBC-0xBF */ + 0xA0, 0xEE, 0xE1, 0xF1, 0xBF, 0xF1, 0xE1, 0xF0, /* 0xC0-0xC3 */ + 0xB5, 0xD2, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0xC4-0xC7 */ + 0xB1, 0xB7, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0xC8-0xCB */ + 0xA0, 0xF5, 0xE1, 0xF3, 0xE1, 0xF2, 0xA0, 0xF6, /* 0xCC-0xCF */ + 0xBA, 0xFC, 0xA0, 0xF7, 0xE1, 0xF4, 0xA0, 0xF8, /* 0xD0-0xD3 */ + 0xA0, 0xF9, 0xA0, 0xFA, 0xA0, 0xFB, 0xB9, 0xB7, /* 0xD4-0xD7 */ + 0xA0, 0xFC, 0xBE, 0xD1, 0xA0, 0xFD, 0xA0, 0xFE, /* 0xD8-0xDB */ + 0xAA, 0x40, 0xAA, 0x41, 0xC4, 0xFC, 0xAA, 0x42, /* 0xDC-0xDF */ + 0xBA, 0xDD, 0xBD, 0xC6, 0xAA, 0x43, 0xAA, 0x44, /* 0xE0-0xE3 */ + 0xAA, 0x45, 0xAA, 0x46, 0xAA, 0x47, 0xAA, 0x48, /* 0xE4-0xE7 */ + 0xE1, 0xF5, 0xE1, 0xF7, 0xAA, 0x49, 0xAA, 0x4A, /* 0xE8-0xEB */ + 0xB6, 0xC0, 0xCF, 0xC1, 0xCA, 0xA8, 0xE1, 0xF6, /* 0xEC-0xEF */ + 0xD5, 0xF8, 0xD3, 0xFC, 0xE1, 0xF8, 0xE1, 0xFC, /* 0xF0-0xF3 */ + 0xE1, 0xF9, 0xAA, 0x4B, 0xAA, 0x4C, 0xE1, 0xFA, /* 0xF4-0xF7 */ + 0xC0, 0xEA, 0xAA, 0x4D, 0xE1, 0xFE, 0xE2, 0xA1, /* 0xF8-0xFB */ + 0xC0, 0xC7, 0xAA, 0x4E, 0xAA, 0x4F, 0xAA, 0x50, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0xAA, 0x51, 0xE1, 0xFB, 0xAA, 0x52, 0xE1, 0xFD, /* 0x00-0x03 */ + 0xAA, 0x53, 0xAA, 0x54, 0xAA, 0x55, 0xAA, 0x56, /* 0x04-0x07 */ + 0xAA, 0x57, 0xAA, 0x58, 0xE2, 0xA5, 0xAA, 0x59, /* 0x08-0x0B */ + 0xAA, 0x5A, 0xAA, 0x5B, 0xC1, 0xD4, 0xAA, 0x5C, /* 0x0C-0x0F */ + 0xAA, 0x5D, 0xAA, 0x5E, 0xAA, 0x5F, 0xE2, 0xA3, /* 0x10-0x13 */ + 0xAA, 0x60, 0xE2, 0xA8, 0xB2, 0xFE, 0xE2, 0xA2, /* 0x14-0x17 */ + 0xAA, 0x61, 0xAA, 0x62, 0xAA, 0x63, 0xC3, 0xCD, /* 0x18-0x1B */ + 0xB2, 0xC2, 0xE2, 0xA7, 0xE2, 0xA6, 0xAA, 0x64, /* 0x1C-0x1F */ + 0xAA, 0x65, 0xE2, 0xA4, 0xE2, 0xA9, 0xAA, 0x66, /* 0x20-0x23 */ + 0xAA, 0x67, 0xE2, 0xAB, 0xAA, 0x68, 0xAA, 0x69, /* 0x24-0x27 */ + 0xAA, 0x6A, 0xD0, 0xC9, 0xD6, 0xED, 0xC3, 0xA8, /* 0x28-0x2B */ + 0xE2, 0xAC, 0xAA, 0x6B, 0xCF, 0xD7, 0xAA, 0x6C, /* 0x2C-0x2F */ + 0xAA, 0x6D, 0xE2, 0xAE, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x30-0x33 */ + 0xBA, 0xEF, 0xAA, 0x70, 0xAA, 0x71, 0xE9, 0xE0, /* 0x34-0x37 */ + 0xE2, 0xAD, 0xE2, 0xAA, 0xAA, 0x72, 0xAA, 0x73, /* 0x38-0x3B */ + 0xAA, 0x74, 0xAA, 0x75, 0xBB, 0xAB, 0xD4, 0xB3, /* 0x3C-0x3F */ + 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, 0xAA, 0x79, /* 0x40-0x43 */ + 0xAA, 0x7A, 0xAA, 0x7B, 0xAA, 0x7C, 0xAA, 0x7D, /* 0x44-0x47 */ + 0xAA, 0x7E, 0xAA, 0x80, 0xAA, 0x81, 0xAA, 0x82, /* 0x48-0x4B */ + 0xAA, 0x83, 0xE2, 0xB0, 0xAA, 0x84, 0xAA, 0x85, /* 0x4C-0x4F */ + 0xE2, 0xAF, 0xAA, 0x86, 0xE9, 0xE1, 0xAA, 0x87, /* 0x50-0x53 */ + 0xAA, 0x88, 0xAA, 0x89, 0xAA, 0x8A, 0xE2, 0xB1, /* 0x54-0x57 */ + 0xAA, 0x8B, 0xAA, 0x8C, 0xAA, 0x8D, 0xAA, 0x8E, /* 0x58-0x5B */ + 0xAA, 0x8F, 0xAA, 0x90, 0xAA, 0x91, 0xAA, 0x92, /* 0x5C-0x5F */ + 0xE2, 0xB2, 0xAA, 0x93, 0xAA, 0x94, 0xAA, 0x95, /* 0x60-0x63 */ + 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, 0xAA, 0x99, /* 0x64-0x67 */ + 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, 0xAA, 0x9D, /* 0x68-0x6B */ + 0xE2, 0xB3, 0xCC, 0xA1, 0xAA, 0x9E, 0xE2, 0xB4, /* 0x6C-0x6F */ + 0xAA, 0x9F, 0xAA, 0xA0, 0xAB, 0x40, 0xAB, 0x41, /* 0x70-0x73 */ + 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, 0xAB, 0x45, /* 0x74-0x77 */ + 0xAB, 0x46, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x78-0x7B */ + 0xAB, 0x4A, 0xAB, 0x4B, 0xE2, 0xB5, 0xAB, 0x4C, /* 0x7C-0x7F */ + + 
0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0x80-0x83 */ + 0xD0, 0xFE, 0xAB, 0x51, 0xAB, 0x52, 0xC2, 0xCA, /* 0x84-0x87 */ + 0xAB, 0x53, 0xD3, 0xF1, 0xAB, 0x54, 0xCD, 0xF5, /* 0x88-0x8B */ + 0xAB, 0x55, 0xAB, 0x56, 0xE7, 0xE0, 0xAB, 0x57, /* 0x8C-0x8F */ + 0xAB, 0x58, 0xE7, 0xE1, 0xAB, 0x59, 0xAB, 0x5A, /* 0x90-0x93 */ + 0xAB, 0x5B, 0xAB, 0x5C, 0xBE, 0xC1, 0xAB, 0x5D, /* 0x94-0x97 */ + 0xAB, 0x5E, 0xAB, 0x5F, 0xAB, 0x60, 0xC2, 0xEA, /* 0x98-0x9B */ + 0xAB, 0x61, 0xAB, 0x62, 0xAB, 0x63, 0xE7, 0xE4, /* 0x9C-0x9F */ + 0xAB, 0x64, 0xAB, 0x65, 0xE7, 0xE3, 0xAB, 0x66, /* 0xA0-0xA3 */ + 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, 0xAB, 0x6A, /* 0xA4-0xA7 */ + 0xAB, 0x6B, 0xCD, 0xE6, 0xAB, 0x6C, 0xC3, 0xB5, /* 0xA8-0xAB */ + 0xAB, 0x6D, 0xAB, 0x6E, 0xE7, 0xE2, 0xBB, 0xB7, /* 0xAC-0xAF */ + 0xCF, 0xD6, 0xAB, 0x6F, 0xC1, 0xE1, 0xE7, 0xE9, /* 0xB0-0xB3 */ + 0xAB, 0x70, 0xAB, 0x71, 0xAB, 0x72, 0xE7, 0xE8, /* 0xB4-0xB7 */ + 0xAB, 0x73, 0xAB, 0x74, 0xE7, 0xF4, 0xB2, 0xA3, /* 0xB8-0xBB */ + 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, 0xAB, 0x78, /* 0xBC-0xBF */ + 0xE7, 0xEA, 0xAB, 0x79, 0xE7, 0xE6, 0xAB, 0x7A, /* 0xC0-0xC3 */ + 0xAB, 0x7B, 0xAB, 0x7C, 0xAB, 0x7D, 0xAB, 0x7E, /* 0xC4-0xC7 */ + 0xE7, 0xEC, 0xE7, 0xEB, 0xC9, 0xBA, 0xAB, 0x80, /* 0xC8-0xCB */ + 0xAB, 0x81, 0xD5, 0xE4, 0xAB, 0x82, 0xE7, 0xE5, /* 0xCC-0xCF */ + 0xB7, 0xA9, 0xE7, 0xE7, 0xAB, 0x83, 0xAB, 0x84, /* 0xD0-0xD3 */ + 0xAB, 0x85, 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, /* 0xD4-0xD7 */ + 0xAB, 0x89, 0xE7, 0xEE, 0xAB, 0x8A, 0xAB, 0x8B, /* 0xD8-0xDB */ + 0xAB, 0x8C, 0xAB, 0x8D, 0xE7, 0xF3, 0xAB, 0x8E, /* 0xDC-0xDF */ + 0xD6, 0xE9, 0xAB, 0x8F, 0xAB, 0x90, 0xAB, 0x91, /* 0xE0-0xE3 */ + 0xAB, 0x92, 0xE7, 0xED, 0xAB, 0x93, 0xE7, 0xF2, /* 0xE4-0xE7 */ + 0xAB, 0x94, 0xE7, 0xF1, 0xAB, 0x95, 0xAB, 0x96, /* 0xE8-0xEB */ + 0xAB, 0x97, 0xB0, 0xE0, 0xAB, 0x98, 0xAB, 0x99, /* 0xEC-0xEF */ + 0xAB, 0x9A, 0xAB, 0x9B, 0xE7, 0xF5, 0xAB, 0x9C, /* 0xF0-0xF3 */ + 0xAB, 0x9D, 0xAB, 0x9E, 0xAB, 0x9F, 0xAB, 0xA0, /* 0xF4-0xF7 */ + 0xAC, 0x40, 0xAC, 0x41, 0xAC, 0x42, 0xAC, 0x43, /* 0xF8-0xFB */ + 0xAC, 0x44, 0xAC, 0x45, 0xAC, 0x46, 0xAC, 0x47, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0xAC, 0x48, 0xAC, 0x49, 0xAC, 0x4A, 0xC7, 0xF2, /* 0x00-0x03 */ + 0xAC, 0x4B, 0xC0, 0xC5, 0xC0, 0xED, 0xAC, 0x4C, /* 0x04-0x07 */ + 0xAC, 0x4D, 0xC1, 0xF0, 0xE7, 0xF0, 0xAC, 0x4E, /* 0x08-0x0B */ + 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, 0xE7, 0xF6, /* 0x0C-0x0F */ + 0xCB, 0xF6, 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, /* 0x10-0x13 */ + 0xAC, 0x55, 0xAC, 0x56, 0xAC, 0x57, 0xAC, 0x58, /* 0x14-0x17 */ + 0xAC, 0x59, 0xAC, 0x5A, 0xE8, 0xA2, 0xE8, 0xA1, /* 0x18-0x1B */ + 0xAC, 0x5B, 0xAC, 0x5C, 0xAC, 0x5D, 0xAC, 0x5E, /* 0x1C-0x1F */ + 0xAC, 0x5F, 0xAC, 0x60, 0xD7, 0xC1, 0xAC, 0x61, /* 0x20-0x23 */ + 0xAC, 0x62, 0xE7, 0xFA, 0xE7, 0xF9, 0xAC, 0x63, /* 0x24-0x27 */ + 0xE7, 0xFB, 0xAC, 0x64, 0xE7, 0xF7, 0xAC, 0x65, /* 0x28-0x2B */ + 0xE7, 0xFE, 0xAC, 0x66, 0xE7, 0xFD, 0xAC, 0x67, /* 0x2C-0x2F */ + 0xE7, 0xFC, 0xAC, 0x68, 0xAC, 0x69, 0xC1, 0xD5, /* 0x30-0x33 */ + 0xC7, 0xD9, 0xC5, 0xFD, 0xC5, 0xC3, 0xAC, 0x6A, /* 0x34-0x37 */ + 0xAC, 0x6B, 0xAC, 0x6C, 0xAC, 0x6D, 0xAC, 0x6E, /* 0x38-0x3B */ + 0xC7, 0xED, 0xAC, 0x6F, 0xAC, 0x70, 0xAC, 0x71, /* 0x3C-0x3F */ + 0xAC, 0x72, 0xE8, 0xA3, 0xAC, 0x73, 0xAC, 0x74, /* 0x40-0x43 */ + 0xAC, 0x75, 0xAC, 0x76, 0xAC, 0x77, 0xAC, 0x78, /* 0x44-0x47 */ + 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x7B, 0xAC, 0x7C, /* 0x48-0x4B */ + 0xAC, 0x7D, 0xAC, 0x7E, 0xAC, 0x80, 0xAC, 0x81, /* 0x4C-0x4F */ + 0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x50-0x53 */ + 0xAC, 
0x86, 0xE8, 0xA6, 0xAC, 0x87, 0xE8, 0xA5, /* 0x54-0x57 */ + 0xAC, 0x88, 0xE8, 0xA7, 0xBA, 0xF7, 0xE7, 0xF8, /* 0x58-0x5B */ + 0xE8, 0xA4, 0xAC, 0x89, 0xC8, 0xF0, 0xC9, 0xAA, /* 0x5C-0x5F */ + 0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x60-0x63 */ + 0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x64-0x67 */ + 0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x68-0x6B */ + 0xAC, 0x96, 0xE8, 0xA9, 0xAC, 0x97, 0xAC, 0x98, /* 0x6C-0x6F */ + 0xB9, 0xE5, 0xAC, 0x99, 0xAC, 0x9A, 0xAC, 0x9B, /* 0x70-0x73 */ + 0xAC, 0x9C, 0xAC, 0x9D, 0xD1, 0xFE, 0xE8, 0xA8, /* 0x74-0x77 */ + 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, 0xAD, 0x40, /* 0x78-0x7B */ + 0xAD, 0x41, 0xAD, 0x42, 0xE8, 0xAA, 0xAD, 0x43, /* 0x7C-0x7F */ + + 0xE8, 0xAD, 0xE8, 0xAE, 0xAD, 0x44, 0xC1, 0xA7, /* 0x80-0x83 */ + 0xAD, 0x45, 0xAD, 0x46, 0xAD, 0x47, 0xE8, 0xAF, /* 0x84-0x87 */ + 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, 0xE8, 0xB0, /* 0x88-0x8B */ + 0xAD, 0x4B, 0xAD, 0x4C, 0xE8, 0xAC, 0xAD, 0x4D, /* 0x8C-0x8F */ + 0xE8, 0xB4, 0xAD, 0x4E, 0xAD, 0x4F, 0xAD, 0x50, /* 0x90-0x93 */ + 0xAD, 0x51, 0xAD, 0x52, 0xAD, 0x53, 0xAD, 0x54, /* 0x94-0x97 */ + 0xAD, 0x55, 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, /* 0x98-0x9B */ + 0xE8, 0xAB, 0xAD, 0x59, 0xE8, 0xB1, 0xAD, 0x5A, /* 0x9C-0x9F */ + 0xAD, 0x5B, 0xAD, 0x5C, 0xAD, 0x5D, 0xAD, 0x5E, /* 0xA0-0xA3 */ + 0xAD, 0x5F, 0xAD, 0x60, 0xAD, 0x61, 0xE8, 0xB5, /* 0xA4-0xA7 */ + 0xE8, 0xB2, 0xE8, 0xB3, 0xAD, 0x62, 0xAD, 0x63, /* 0xA8-0xAB */ + 0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0xAC-0xAF */ + 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, 0xAD, 0x6B, /* 0xB0-0xB3 */ + 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, 0xAD, 0x6F, /* 0xB4-0xB7 */ + 0xAD, 0x70, 0xAD, 0x71, 0xE8, 0xB7, 0xAD, 0x72, /* 0xB8-0xBB */ + 0xAD, 0x73, 0xAD, 0x74, 0xAD, 0x75, 0xAD, 0x76, /* 0xBC-0xBF */ + 0xAD, 0x77, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0xC0-0xC3 */ + 0xAD, 0x7B, 0xAD, 0x7C, 0xAD, 0x7D, 0xAD, 0x7E, /* 0xC4-0xC7 */ + 0xAD, 0x80, 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, /* 0xC8-0xCB */ + 0xAD, 0x84, 0xAD, 0x85, 0xAD, 0x86, 0xAD, 0x87, /* 0xCC-0xCF */ + 0xAD, 0x88, 0xAD, 0x89, 0xE8, 0xB6, 0xAD, 0x8A, /* 0xD0-0xD3 */ + 0xAD, 0x8B, 0xAD, 0x8C, 0xAD, 0x8D, 0xAD, 0x8E, /* 0xD4-0xD7 */ + 0xAD, 0x8F, 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, /* 0xD8-0xDB */ + 0xB9, 0xCF, 0xAD, 0x93, 0xF0, 0xAC, 0xAD, 0x94, /* 0xDC-0xDF */ + 0xF0, 0xAD, 0xAD, 0x95, 0xC6, 0xB0, 0xB0, 0xEA, /* 0xE0-0xE3 */ + 0xC8, 0xBF, 0xAD, 0x96, 0xCD, 0xDF, 0xAD, 0x97, /* 0xE4-0xE7 */ + 0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xE8-0xEB */ + 0xAD, 0x9C, 0xAD, 0x9D, 0xCE, 0xCD, 0xEA, 0xB1, /* 0xEC-0xEF */ + 0xAD, 0x9E, 0xAD, 0x9F, 0xAD, 0xA0, 0xAE, 0x40, /* 0xF0-0xF3 */ + 0xEA, 0xB2, 0xAE, 0x41, 0xC6, 0xBF, 0xB4, 0xC9, /* 0xF4-0xF7 */ + 0xAE, 0x42, 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, /* 0xF8-0xFB */ + 0xAE, 0x46, 0xAE, 0x47, 0xAE, 0x48, 0xEA, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_75[512] = { + 0xAE, 0x49, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0x00-0x03 */ + 0xD5, 0xE7, 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, /* 0x04-0x07 */ + 0xAE, 0x50, 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, /* 0x08-0x0B */ + 0xAE, 0x54, 0xDD, 0xF9, 0xAE, 0x55, 0xEA, 0xB4, /* 0x0C-0x0F */ + 0xAE, 0x56, 0xEA, 0xB5, 0xAE, 0x57, 0xEA, 0xB6, /* 0x10-0x13 */ + 0xAE, 0x58, 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x5B, /* 0x14-0x17 */ + 0xB8, 0xCA, 0xDF, 0xB0, 0xC9, 0xF5, 0xAE, 0x5C, /* 0x18-0x1B */ + 0xCC, 0xF0, 0xAE, 0x5D, 0xAE, 0x5E, 0xC9, 0xFA, /* 0x1C-0x1F */ + 0xAE, 0x5F, 0xAE, 0x60, 0xAE, 0x61, 0xAE, 0x62, /* 0x20-0x23 */ + 0xAE, 0x63, 0xC9, 0xFB, 0xAE, 0x64, 0xAE, 0x65, /* 0x24-0x27 */ + 0xD3, 
0xC3, 0xCB, 0xA6, 0xAE, 0x66, 0xB8, 0xA6, /* 0x28-0x2B */ + 0xF0, 0xAE, 0xB1, 0xC2, 0xAE, 0x67, 0xE5, 0xB8, /* 0x2C-0x2F */ + 0xCC, 0xEF, 0xD3, 0xC9, 0xBC, 0xD7, 0xC9, 0xEA, /* 0x30-0x33 */ + 0xAE, 0x68, 0xB5, 0xE7, 0xAE, 0x69, 0xC4, 0xD0, /* 0x34-0x37 */ + 0xB5, 0xE9, 0xAE, 0x6A, 0xEE, 0xAE, 0xBB, 0xAD, /* 0x38-0x3B */ + 0xAE, 0x6B, 0xAE, 0x6C, 0xE7, 0xDE, 0xAE, 0x6D, /* 0x3C-0x3F */ + 0xEE, 0xAF, 0xAE, 0x6E, 0xAE, 0x6F, 0xAE, 0x70, /* 0x40-0x43 */ + 0xAE, 0x71, 0xB3, 0xA9, 0xAE, 0x72, 0xAE, 0x73, /* 0x44-0x47 */ + 0xEE, 0xB2, 0xAE, 0x74, 0xAE, 0x75, 0xEE, 0xB1, /* 0x48-0x4B */ + 0xBD, 0xE7, 0xAE, 0x76, 0xEE, 0xB0, 0xCE, 0xB7, /* 0x4C-0x4F */ + 0xAE, 0x77, 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, /* 0x50-0x53 */ + 0xC5, 0xCF, 0xAE, 0x7B, 0xAE, 0x7C, 0xAE, 0x7D, /* 0x54-0x57 */ + 0xAE, 0x7E, 0xC1, 0xF4, 0xDB, 0xCE, 0xEE, 0xB3, /* 0x58-0x5B */ + 0xD0, 0xF3, 0xAE, 0x80, 0xAE, 0x81, 0xAE, 0x82, /* 0x5C-0x5F */ + 0xAE, 0x83, 0xAE, 0x84, 0xAE, 0x85, 0xAE, 0x86, /* 0x60-0x63 */ + 0xAE, 0x87, 0xC2, 0xD4, 0xC6, 0xE8, 0xAE, 0x88, /* 0x64-0x67 */ + 0xAE, 0x89, 0xAE, 0x8A, 0xB7, 0xAC, 0xAE, 0x8B, /* 0x68-0x6B */ + 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, 0xAE, 0x8F, /* 0x6C-0x6F */ + 0xAE, 0x90, 0xAE, 0x91, 0xEE, 0xB4, 0xAE, 0x92, /* 0x70-0x73 */ + 0xB3, 0xEB, 0xAE, 0x93, 0xAE, 0x94, 0xAE, 0x95, /* 0x74-0x77 */ + 0xBB, 0xFB, 0xEE, 0xB5, 0xAE, 0x96, 0xAE, 0x97, /* 0x78-0x7B */ + 0xAE, 0x98, 0xAE, 0x99, 0xAE, 0x9A, 0xE7, 0xDC, /* 0x7C-0x7F */ + + 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, 0xEE, 0xB6, /* 0x80-0x83 */ + 0xAE, 0x9E, 0xAE, 0x9F, 0xBD, 0xAE, 0xAE, 0xA0, /* 0x84-0x87 */ + 0xAF, 0x40, 0xAF, 0x41, 0xAF, 0x42, 0xF1, 0xE2, /* 0x88-0x8B */ + 0xAF, 0x43, 0xAF, 0x44, 0xAF, 0x45, 0xCA, 0xE8, /* 0x8C-0x8F */ + 0xAF, 0x46, 0xD2, 0xC9, 0xF0, 0xDA, 0xAF, 0x47, /* 0x90-0x93 */ + 0xF0, 0xDB, 0xAF, 0x48, 0xF0, 0xDC, 0xC1, 0xC6, /* 0x94-0x97 */ + 0xAF, 0x49, 0xB8, 0xED, 0xBE, 0xCE, 0xAF, 0x4A, /* 0x98-0x9B */ + 0xAF, 0x4B, 0xF0, 0xDE, 0xAF, 0x4C, 0xC5, 0xB1, /* 0x9C-0x9F */ + 0xF0, 0xDD, 0xD1, 0xF1, 0xAF, 0x4D, 0xF0, 0xE0, /* 0xA0-0xA3 */ + 0xB0, 0xCC, 0xBD, 0xEA, 0xAF, 0x4E, 0xAF, 0x4F, /* 0xA4-0xA7 */ + 0xAF, 0x50, 0xAF, 0x51, 0xAF, 0x52, 0xD2, 0xDF, /* 0xA8-0xAB */ + 0xF0, 0xDF, 0xAF, 0x53, 0xB4, 0xAF, 0xB7, 0xE8, /* 0xAC-0xAF */ + 0xF0, 0xE6, 0xF0, 0xE5, 0xC6, 0xA3, 0xF0, 0xE1, /* 0xB0-0xB3 */ + 0xF0, 0xE2, 0xB4, 0xC3, 0xAF, 0x54, 0xAF, 0x55, /* 0xB4-0xB7 */ + 0xF0, 0xE3, 0xD5, 0xEE, 0xAF, 0x56, 0xAF, 0x57, /* 0xB8-0xBB */ + 0xCC, 0xDB, 0xBE, 0xD2, 0xBC, 0xB2, 0xAF, 0x58, /* 0xBC-0xBF */ + 0xAF, 0x59, 0xAF, 0x5A, 0xF0, 0xE8, 0xF0, 0xE7, /* 0xC0-0xC3 */ + 0xF0, 0xE4, 0xB2, 0xA1, 0xAF, 0x5B, 0xD6, 0xA2, /* 0xC4-0xC7 */ + 0xD3, 0xB8, 0xBE, 0xB7, 0xC8, 0xAC, 0xAF, 0x5C, /* 0xC8-0xCB */ + 0xAF, 0x5D, 0xF0, 0xEA, 0xAF, 0x5E, 0xAF, 0x5F, /* 0xCC-0xCF */ + 0xAF, 0x60, 0xAF, 0x61, 0xD1, 0xF7, 0xAF, 0x62, /* 0xD0-0xD3 */ + 0xD6, 0xCC, 0xBA, 0xDB, 0xF0, 0xE9, 0xAF, 0x63, /* 0xD4-0xD7 */ + 0xB6, 0xBB, 0xAF, 0x64, 0xAF, 0x65, 0xCD, 0xB4, /* 0xD8-0xDB */ + 0xAF, 0x66, 0xAF, 0x67, 0xC6, 0xA6, 0xAF, 0x68, /* 0xDC-0xDF */ + 0xAF, 0x69, 0xAF, 0x6A, 0xC1, 0xA1, 0xF0, 0xEB, /* 0xE0-0xE3 */ + 0xF0, 0xEE, 0xAF, 0x6B, 0xF0, 0xED, 0xF0, 0xF0, /* 0xE4-0xE7 */ + 0xF0, 0xEC, 0xAF, 0x6C, 0xBB, 0xBE, 0xF0, 0xEF, /* 0xE8-0xEB */ + 0xAF, 0x6D, 0xAF, 0x6E, 0xAF, 0x6F, 0xAF, 0x70, /* 0xEC-0xEF */ + 0xCC, 0xB5, 0xF0, 0xF2, 0xAF, 0x71, 0xAF, 0x72, /* 0xF0-0xF3 */ + 0xB3, 0xD5, 0xAF, 0x73, 0xAF, 0x74, 0xAF, 0x75, /* 0xF4-0xF7 */ + 0xAF, 0x76, 0xB1, 0xD4, 0xAF, 0x77, 0xAF, 0x78, /* 0xF8-0xFB */ + 0xF0, 0xF3, 0xAF, 0x79, 0xAF, 0x7A, 0xF0, 0xF4, /* 0xFC-0xFF 
*/ +}; + +static const unsigned char u2c_76[512] = { + 0xF0, 0xF6, 0xB4, 0xE1, 0xAF, 0x7B, 0xF0, 0xF1, /* 0x00-0x03 */ + 0xAF, 0x7C, 0xF0, 0xF7, 0xAF, 0x7D, 0xAF, 0x7E, /* 0x04-0x07 */ + 0xAF, 0x80, 0xAF, 0x81, 0xF0, 0xFA, 0xAF, 0x82, /* 0x08-0x0B */ + 0xF0, 0xF8, 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, /* 0x0C-0x0F */ + 0xF0, 0xF5, 0xAF, 0x86, 0xAF, 0x87, 0xAF, 0x88, /* 0x10-0x13 */ + 0xAF, 0x89, 0xF0, 0xFD, 0xAF, 0x8A, 0xF0, 0xF9, /* 0x14-0x17 */ + 0xF0, 0xFC, 0xF0, 0xFE, 0xAF, 0x8B, 0xF1, 0xA1, /* 0x18-0x1B */ + 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, 0xCE, 0xC1, /* 0x1C-0x1F */ + 0xF1, 0xA4, 0xAF, 0x8F, 0xF1, 0xA3, 0xAF, 0x90, /* 0x20-0x23 */ + 0xC1, 0xF6, 0xF0, 0xFB, 0xCA, 0xDD, 0xAF, 0x91, /* 0x24-0x27 */ + 0xAF, 0x92, 0xB4, 0xF1, 0xB1, 0xF1, 0xCC, 0xB1, /* 0x28-0x2B */ + 0xAF, 0x93, 0xF1, 0xA6, 0xAF, 0x94, 0xAF, 0x95, /* 0x2C-0x2F */ + 0xF1, 0xA7, 0xAF, 0x96, 0xAF, 0x97, 0xF1, 0xAC, /* 0x30-0x33 */ + 0xD5, 0xCE, 0xF1, 0xA9, 0xAF, 0x98, 0xAF, 0x99, /* 0x34-0x37 */ + 0xC8, 0xB3, 0xAF, 0x9A, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x38-0x3B */ + 0xF1, 0xA2, 0xAF, 0x9D, 0xF1, 0xAB, 0xF1, 0xA8, /* 0x3C-0x3F */ + 0xF1, 0xA5, 0xAF, 0x9E, 0xAF, 0x9F, 0xF1, 0xAA, /* 0x40-0x43 */ + 0xAF, 0xA0, 0xB0, 0x40, 0xB0, 0x41, 0xB0, 0x42, /* 0x44-0x47 */ + 0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x48-0x4B */ + 0xB0, 0xA9, 0xF1, 0xAD, 0xB0, 0x47, 0xB0, 0x48, /* 0x4C-0x4F */ + 0xB0, 0x49, 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, /* 0x50-0x53 */ + 0xF1, 0xAF, 0xB0, 0x4D, 0xF1, 0xB1, 0xB0, 0x4E, /* 0x54-0x57 */ + 0xB0, 0x4F, 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, /* 0x58-0x5B */ + 0xF1, 0xB0, 0xB0, 0x53, 0xF1, 0xAE, 0xB0, 0x54, /* 0x5C-0x5F */ + 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, 0xD1, 0xA2, /* 0x60-0x63 */ + 0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x5B, /* 0x64-0x67 */ + 0xB0, 0x5C, 0xB0, 0x5D, 0xB0, 0x5E, 0xF1, 0xB2, /* 0x68-0x6B */ + 0xB0, 0x5F, 0xB0, 0x60, 0xB0, 0x61, 0xF1, 0xB3, /* 0x6C-0x6F */ + 0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0x70-0x73 */ + 0xB0, 0x66, 0xB0, 0x67, 0xB0, 0x68, 0xB0, 0x69, /* 0x74-0x77 */ + 0xB9, 0xEF, 0xB0, 0x6A, 0xB0, 0x6B, 0xB5, 0xC7, /* 0x78-0x7B */ + 0xB0, 0x6C, 0xB0, 0xD7, 0xB0, 0xD9, 0xB0, 0x6D, /* 0x7C-0x7F */ + + 0xB0, 0x6E, 0xB0, 0x6F, 0xD4, 0xED, 0xB0, 0x70, /* 0x80-0x83 */ + 0xB5, 0xC4, 0xB0, 0x71, 0xBD, 0xD4, 0xBB, 0xCA, /* 0x84-0x87 */ + 0xF0, 0xA7, 0xB0, 0x72, 0xB0, 0x73, 0xB8, 0xDE, /* 0x88-0x8B */ + 0xB0, 0x74, 0xB0, 0x75, 0xF0, 0xA8, 0xB0, 0x76, /* 0x8C-0x8F */ + 0xB0, 0x77, 0xB0, 0xA8, 0xB0, 0x78, 0xF0, 0xA9, /* 0x90-0x93 */ + 0xB0, 0x79, 0xB0, 0x7A, 0xCD, 0xEE, 0xB0, 0x7B, /* 0x94-0x97 */ + 0xB0, 0x7C, 0xF0, 0xAA, 0xB0, 0x7D, 0xB0, 0x7E, /* 0x98-0x9B */ + 0xB0, 0x80, 0xB0, 0x81, 0xB0, 0x82, 0xB0, 0x83, /* 0x9C-0x9F */ + 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, 0xB0, 0x87, /* 0xA0-0xA3 */ + 0xF0, 0xAB, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xA4-0xA7 */ + 0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xA8-0xAB */ + 0xB0, 0x8F, 0xB0, 0x90, 0xC6, 0xA4, 0xB0, 0x91, /* 0xAC-0xAF */ + 0xB0, 0x92, 0xD6, 0xE5, 0xF1, 0xE4, 0xB0, 0x93, /* 0xB0-0xB3 */ + 0xF1, 0xE5, 0xB0, 0x94, 0xB0, 0x95, 0xB0, 0x96, /* 0xB4-0xB7 */ + 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, 0xB0, 0x9A, /* 0xB8-0xBB */ + 0xB0, 0x9B, 0xB0, 0x9C, 0xB0, 0x9D, 0xC3, 0xF3, /* 0xBC-0xBF */ + 0xB0, 0x9E, 0xB0, 0x9F, 0xD3, 0xDB, 0xB0, 0xA0, /* 0xC0-0xC3 */ + 0xB1, 0x40, 0xD6, 0xD1, 0xC5, 0xE8, 0xB1, 0x41, /* 0xC4-0xC7 */ + 0xD3, 0xAF, 0xB1, 0x42, 0xD2, 0xE6, 0xB1, 0x43, /* 0xC8-0xCB */ + 0xB1, 0x44, 0xEE, 0xC1, 0xB0, 0xBB, 0xD5, 0xB5, /* 0xCC-0xCF */ + 0xD1, 0xCE, 0xBC, 0xE0, 0xBA, 0xD0, 0xB1, 0x45, /* 0xD0-0xD3 */ + 
0xBF, 0xF8, 0xB1, 0x46, 0xB8, 0xC7, 0xB5, 0xC1, /* 0xD4-0xD7 */ + 0xC5, 0xCC, 0xB1, 0x47, 0xB1, 0x48, 0xCA, 0xA2, /* 0xD8-0xDB */ + 0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xC3, 0xCB, /* 0xDC-0xDF */ + 0xB1, 0x4C, 0xB1, 0x4D, 0xB1, 0x4E, 0xB1, 0x4F, /* 0xE0-0xE3 */ + 0xB1, 0x50, 0xEE, 0xC2, 0xB1, 0x51, 0xB1, 0x52, /* 0xE4-0xE7 */ + 0xB1, 0x53, 0xB1, 0x54, 0xB1, 0x55, 0xB1, 0x56, /* 0xE8-0xEB */ + 0xB1, 0x57, 0xB1, 0x58, 0xC4, 0xBF, 0xB6, 0xA2, /* 0xEC-0xEF */ + 0xB1, 0x59, 0xED, 0xEC, 0xC3, 0xA4, 0xB1, 0x5A, /* 0xF0-0xF3 */ + 0xD6, 0xB1, 0xB1, 0x5B, 0xB1, 0x5C, 0xB1, 0x5D, /* 0xF4-0xF7 */ + 0xCF, 0xE0, 0xED, 0xEF, 0xB1, 0x5E, 0xB1, 0x5F, /* 0xF8-0xFB */ + 0xC5, 0xCE, 0xB1, 0x60, 0xB6, 0xDC, 0xB1, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0xB1, 0x62, 0xCA, 0xA1, 0xB1, 0x63, 0xB1, 0x64, /* 0x00-0x03 */ + 0xED, 0xED, 0xB1, 0x65, 0xB1, 0x66, 0xED, 0xF0, /* 0x04-0x07 */ + 0xED, 0xF1, 0xC3, 0xBC, 0xB1, 0x67, 0xBF, 0xB4, /* 0x08-0x0B */ + 0xB1, 0x68, 0xED, 0xEE, 0xB1, 0x69, 0xB1, 0x6A, /* 0x0C-0x0F */ + 0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x10-0x13 */ + 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, 0xB1, 0x72, /* 0x14-0x17 */ + 0xB1, 0x73, 0xED, 0xF4, 0xED, 0xF2, 0xB1, 0x74, /* 0x18-0x1B */ + 0xB1, 0x75, 0xB1, 0x76, 0xB1, 0x77, 0xD5, 0xE6, /* 0x1C-0x1F */ + 0xC3, 0xDF, 0xB1, 0x78, 0xED, 0xF3, 0xB1, 0x79, /* 0x20-0x23 */ + 0xB1, 0x7A, 0xB1, 0x7B, 0xED, 0xF6, 0xB1, 0x7C, /* 0x24-0x27 */ + 0xD5, 0xA3, 0xD1, 0xA3, 0xB1, 0x7D, 0xB1, 0x7E, /* 0x28-0x2B */ + 0xB1, 0x80, 0xED, 0xF5, 0xB1, 0x81, 0xC3, 0xD0, /* 0x2C-0x2F */ + 0xB1, 0x82, 0xB1, 0x83, 0xB1, 0x84, 0xB1, 0x85, /* 0x30-0x33 */ + 0xB1, 0x86, 0xED, 0xF7, 0xBF, 0xF4, 0xBE, 0xEC, /* 0x34-0x37 */ + 0xED, 0xF8, 0xB1, 0x87, 0xCC, 0xF7, 0xB1, 0x88, /* 0x38-0x3B */ + 0xD1, 0xDB, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x3C-0x3F */ + 0xD7, 0xC5, 0xD5, 0xF6, 0xB1, 0x8C, 0xED, 0xFC, /* 0x40-0x43 */ + 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, 0xED, 0xFB, /* 0x44-0x47 */ + 0xB1, 0x90, 0xB1, 0x91, 0xB1, 0x92, 0xB1, 0x93, /* 0x48-0x4B */ + 0xB1, 0x94, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x4C-0x4F */ + 0xED, 0xF9, 0xED, 0xFA, 0xB1, 0x98, 0xB1, 0x99, /* 0x50-0x53 */ + 0xB1, 0x9A, 0xB1, 0x9B, 0xB1, 0x9C, 0xB1, 0x9D, /* 0x54-0x57 */ + 0xB1, 0x9E, 0xB1, 0x9F, 0xED, 0xFD, 0xBE, 0xA6, /* 0x58-0x5B */ + 0xB1, 0xA0, 0xB2, 0x40, 0xB2, 0x41, 0xB2, 0x42, /* 0x5C-0x5F */ + 0xB2, 0x43, 0xCB, 0xAF, 0xEE, 0xA1, 0xB6, 0xBD, /* 0x60-0x63 */ + 0xB2, 0x44, 0xEE, 0xA2, 0xC4, 0xC0, 0xB2, 0x45, /* 0x64-0x67 */ + 0xED, 0xFE, 0xB2, 0x46, 0xB2, 0x47, 0xBD, 0xDE, /* 0x68-0x6B */ + 0xB2, 0xC7, 0xB2, 0x48, 0xB2, 0x49, 0xB2, 0x4A, /* 0x6C-0x6F */ + 0xB2, 0x4B, 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, /* 0x70-0x73 */ + 0xB2, 0x4F, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x74-0x77 */ + 0xB2, 0x53, 0xB6, 0xC3, 0xB2, 0x54, 0xB2, 0x55, /* 0x78-0x7B */ + 0xB2, 0x56, 0xEE, 0xA5, 0xD8, 0xBA, 0xEE, 0xA3, /* 0x7C-0x7F */ + + 0xEE, 0xA6, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x80-0x83 */ + 0xC3, 0xE9, 0xB3, 0xF2, 0xB2, 0x5A, 0xB2, 0x5B, /* 0x84-0x87 */ + 0xB2, 0x5C, 0xB2, 0x5D, 0xB2, 0x5E, 0xB2, 0x5F, /* 0x88-0x8B */ + 0xEE, 0xA7, 0xEE, 0xA4, 0xCF, 0xB9, 0xB2, 0x60, /* 0x8C-0x8F */ + 0xB2, 0x61, 0xEE, 0xA8, 0xC2, 0xF7, 0xB2, 0x62, /* 0x90-0x93 */ + 0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x94-0x97 */ + 0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x98-0x9B */ + 0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xEE, 0xA9, /* 0x9C-0x9F */ + 0xEE, 0xAA, 0xB2, 0x6E, 0xDE, 0xAB, 0xB2, 0x6F, /* 0xA0-0xA3 */ + 0xB2, 0x70, 0xC6, 0xB3, 0xB2, 0x71, 0xC7, 0xC6, /* 0xA4-0xA7 */ + 
0xB2, 0x72, 0xD6, 0xF5, 0xB5, 0xC9, 0xB2, 0x73, /* 0xA8-0xAB */ + 0xCB, 0xB2, 0xB2, 0x74, 0xB2, 0x75, 0xB2, 0x76, /* 0xAC-0xAF */ + 0xEE, 0xAB, 0xB2, 0x77, 0xB2, 0x78, 0xCD, 0xAB, /* 0xB0-0xB3 */ + 0xB2, 0x79, 0xEE, 0xAC, 0xB2, 0x7A, 0xB2, 0x7B, /* 0xB4-0xB7 */ + 0xB2, 0x7C, 0xB2, 0x7D, 0xB2, 0x7E, 0xD5, 0xB0, /* 0xB8-0xBB */ + 0xB2, 0x80, 0xEE, 0xAD, 0xB2, 0x81, 0xF6, 0xC4, /* 0xBC-0xBF */ + 0xB2, 0x82, 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, /* 0xC0-0xC3 */ + 0xB2, 0x86, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xC4-0xC7 */ + 0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xC8-0xCB */ + 0xB2, 0x8E, 0xDB, 0xC7, 0xB2, 0x8F, 0xB2, 0x90, /* 0xCC-0xCF */ + 0xB2, 0x91, 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, /* 0xD0-0xD3 */ + 0xB2, 0x95, 0xB2, 0x96, 0xB2, 0x97, 0xB4, 0xA3, /* 0xD4-0xD7 */ + 0xB2, 0x98, 0xB2, 0x99, 0xB2, 0x9A, 0xC3, 0xAC, /* 0xD8-0xDB */ + 0xF1, 0xE6, 0xB2, 0x9B, 0xB2, 0x9C, 0xB2, 0x9D, /* 0xDC-0xDF */ + 0xB2, 0x9E, 0xB2, 0x9F, 0xCA, 0xB8, 0xD2, 0xD3, /* 0xE0-0xE3 */ + 0xB2, 0xA0, 0xD6, 0xAA, 0xB3, 0x40, 0xEF, 0xF2, /* 0xE4-0xE7 */ + 0xB3, 0x41, 0xBE, 0xD8, 0xB3, 0x42, 0xBD, 0xC3, /* 0xE8-0xEB */ + 0xEF, 0xF3, 0xB6, 0xCC, 0xB0, 0xAB, 0xB3, 0x43, /* 0xEC-0xEF */ + 0xB3, 0x44, 0xB3, 0x45, 0xB3, 0x46, 0xCA, 0xAF, /* 0xF0-0xF3 */ + 0xB3, 0x47, 0xB3, 0x48, 0xED, 0xB6, 0xB3, 0x49, /* 0xF4-0xF7 */ + 0xED, 0xB7, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xF8-0xFB */ + 0xB3, 0x4D, 0xCE, 0xF9, 0xB7, 0xAF, 0xBF, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0xED, 0xB8, 0xC2, 0xEB, 0xC9, 0xB0, 0xB3, 0x4E, /* 0x00-0x03 */ + 0xB3, 0x4F, 0xB3, 0x50, 0xB3, 0x51, 0xB3, 0x52, /* 0x04-0x07 */ + 0xB3, 0x53, 0xED, 0xB9, 0xB3, 0x54, 0xB3, 0x55, /* 0x08-0x0B */ + 0xC6, 0xF6, 0xBF, 0xB3, 0xB3, 0x56, 0xB3, 0x57, /* 0x0C-0x0F */ + 0xB3, 0x58, 0xED, 0xBC, 0xC5, 0xF8, 0xB3, 0x59, /* 0x10-0x13 */ + 0xD1, 0xD0, 0xB3, 0x5A, 0xD7, 0xA9, 0xED, 0xBA, /* 0x14-0x17 */ + 0xED, 0xBB, 0xB3, 0x5B, 0xD1, 0xE2, 0xB3, 0x5C, /* 0x18-0x1B */ + 0xED, 0xBF, 0xED, 0xC0, 0xB3, 0x5D, 0xED, 0xC4, /* 0x1C-0x1F */ + 0xB3, 0x5E, 0xB3, 0x5F, 0xB3, 0x60, 0xED, 0xC8, /* 0x20-0x23 */ + 0xB3, 0x61, 0xED, 0xC6, 0xED, 0xCE, 0xD5, 0xE8, /* 0x24-0x27 */ + 0xB3, 0x62, 0xED, 0xC9, 0xB3, 0x63, 0xB3, 0x64, /* 0x28-0x2B */ + 0xED, 0xC7, 0xED, 0xBE, 0xB3, 0x65, 0xB3, 0x66, /* 0x2C-0x2F */ + 0xC5, 0xE9, 0xB3, 0x67, 0xB3, 0x68, 0xB3, 0x69, /* 0x30-0x33 */ + 0xC6, 0xC6, 0xB3, 0x6A, 0xB3, 0x6B, 0xC9, 0xE9, /* 0x34-0x37 */ + 0xD4, 0xD2, 0xED, 0xC1, 0xED, 0xC2, 0xED, 0xC3, /* 0x38-0x3B */ + 0xED, 0xC5, 0xB3, 0x6C, 0xC0, 0xF9, 0xB3, 0x6D, /* 0x3C-0x3F */ + 0xB4, 0xA1, 0xB3, 0x6E, 0xB3, 0x6F, 0xB3, 0x70, /* 0x40-0x43 */ + 0xB3, 0x71, 0xB9, 0xE8, 0xB3, 0x72, 0xED, 0xD0, /* 0x44-0x47 */ + 0xB3, 0x73, 0xB3, 0x74, 0xB3, 0x75, 0xB3, 0x76, /* 0x48-0x4B */ + 0xED, 0xD1, 0xB3, 0x77, 0xED, 0xCA, 0xB3, 0x78, /* 0x4C-0x4F */ + 0xED, 0xCF, 0xB3, 0x79, 0xCE, 0xF8, 0xB3, 0x7A, /* 0x50-0x53 */ + 0xB3, 0x7B, 0xCB, 0xB6, 0xED, 0xCC, 0xED, 0xCD, /* 0x54-0x57 */ + 0xB3, 0x7C, 0xB3, 0x7D, 0xB3, 0x7E, 0xB3, 0x80, /* 0x58-0x5B */ + 0xB3, 0x81, 0xCF, 0xF5, 0xB3, 0x82, 0xB3, 0x83, /* 0x5C-0x5F */ + 0xB3, 0x84, 0xB3, 0x85, 0xB3, 0x86, 0xB3, 0x87, /* 0x60-0x63 */ + 0xB3, 0x88, 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, /* 0x64-0x67 */ + 0xB3, 0x8C, 0xB3, 0x8D, 0xED, 0xD2, 0xC1, 0xF2, /* 0x68-0x6B */ + 0xD3, 0xB2, 0xED, 0xCB, 0xC8, 0xB7, 0xB3, 0x8E, /* 0x6C-0x6F */ + 0xB3, 0x8F, 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, /* 0x70-0x73 */ + 0xB3, 0x93, 0xB3, 0x94, 0xB3, 0x95, 0xBC, 0xEF, /* 0x74-0x77 */ + 0xB3, 0x96, 0xB3, 0x97, 0xB3, 0x98, 0xB3, 0x99, /* 0x78-0x7B */ + 0xC5, 
0xF0, 0xB3, 0x9A, 0xB3, 0x9B, 0xB3, 0x9C, /* 0x7C-0x7F */ + + 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, 0xB3, 0xA0, /* 0x80-0x83 */ + 0xB4, 0x40, 0xB4, 0x41, 0xB4, 0x42, 0xED, 0xD6, /* 0x84-0x87 */ + 0xB4, 0x43, 0xB5, 0xEF, 0xB4, 0x44, 0xB4, 0x45, /* 0x88-0x8B */ + 0xC2, 0xB5, 0xB0, 0xAD, 0xCB, 0xE9, 0xB4, 0x46, /* 0x8C-0x8F */ + 0xB4, 0x47, 0xB1, 0xAE, 0xB4, 0x48, 0xED, 0xD4, /* 0x90-0x93 */ + 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, 0xCD, 0xEB, /* 0x94-0x97 */ + 0xB5, 0xE2, 0xB4, 0x4C, 0xED, 0xD5, 0xED, 0xD3, /* 0x98-0x9B */ + 0xED, 0xD7, 0xB4, 0x4D, 0xB4, 0x4E, 0xB5, 0xFA, /* 0x9C-0x9F */ + 0xB4, 0x4F, 0xED, 0xD8, 0xB4, 0x50, 0xED, 0xD9, /* 0xA0-0xA3 */ + 0xB4, 0x51, 0xED, 0xDC, 0xB4, 0x52, 0xB1, 0xCC, /* 0xA4-0xA7 */ + 0xB4, 0x53, 0xB4, 0x54, 0xB4, 0x55, 0xB4, 0x56, /* 0xA8-0xAB */ + 0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0xAC-0xAF */ + 0xC5, 0xF6, 0xBC, 0xEE, 0xED, 0xDA, 0xCC, 0xBC, /* 0xB0-0xB3 */ + 0xB2, 0xEA, 0xB4, 0x5B, 0xB4, 0x5C, 0xB4, 0x5D, /* 0xB4-0xB7 */ + 0xB4, 0x5E, 0xED, 0xDB, 0xB4, 0x5F, 0xB4, 0x60, /* 0xB8-0xBB */ + 0xB4, 0x61, 0xB4, 0x62, 0xC4, 0xEB, 0xB4, 0x63, /* 0xBC-0xBF */ + 0xB4, 0x64, 0xB4, 0xC5, 0xB4, 0x65, 0xB4, 0x66, /* 0xC0-0xC3 */ + 0xB4, 0x67, 0xB0, 0xF5, 0xB4, 0x68, 0xB4, 0x69, /* 0xC4-0xC7 */ + 0xB4, 0x6A, 0xED, 0xDF, 0xC0, 0xDA, 0xB4, 0xE8, /* 0xC8-0xCB */ + 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, 0xB4, 0x6E, /* 0xCC-0xCF */ + 0xC5, 0xCD, 0xB4, 0x6F, 0xB4, 0x70, 0xB4, 0x71, /* 0xD0-0xD3 */ + 0xED, 0xDD, 0xBF, 0xC4, 0xB4, 0x72, 0xB4, 0x73, /* 0xD4-0xD7 */ + 0xB4, 0x74, 0xED, 0xDE, 0xB4, 0x75, 0xB4, 0x76, /* 0xD8-0xDB */ + 0xB4, 0x77, 0xB4, 0x78, 0xB4, 0x79, 0xB4, 0x7A, /* 0xDC-0xDF */ + 0xB4, 0x7B, 0xB4, 0x7C, 0xB4, 0x7D, 0xB4, 0x7E, /* 0xE0-0xE3 */ + 0xB4, 0x80, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0xE4-0xE7 */ + 0xC4, 0xA5, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0xE8-0xEB */ + 0xED, 0xE0, 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, /* 0xEC-0xEF */ + 0xB4, 0x8A, 0xB4, 0x8B, 0xED, 0xE1, 0xB4, 0x8C, /* 0xF0-0xF3 */ + 0xED, 0xE3, 0xB4, 0x8D, 0xB4, 0x8E, 0xC1, 0xD7, /* 0xF4-0xF7 */ + 0xB4, 0x8F, 0xB4, 0x90, 0xBB, 0xC7, 0xB4, 0x91, /* 0xF8-0xFB */ + 0xB4, 0x92, 0xB4, 0x93, 0xB4, 0x94, 0xB4, 0x95, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0xB4, 0x96, 0xBD, 0xB8, 0xB4, 0x97, 0xB4, 0x98, /* 0x00-0x03 */ + 0xB4, 0x99, 0xED, 0xE2, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x04-0x07 */ + 0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x08-0x0B */ + 0xB4, 0xA0, 0xB5, 0x40, 0xB5, 0x41, 0xB5, 0x42, /* 0x0C-0x0F */ + 0xB5, 0x43, 0xB5, 0x44, 0xB5, 0x45, 0xED, 0xE4, /* 0x10-0x13 */ + 0xB5, 0x46, 0xB5, 0x47, 0xB5, 0x48, 0xB5, 0x49, /* 0x14-0x17 */ + 0xB5, 0x4A, 0xB5, 0x4B, 0xB5, 0x4C, 0xB5, 0x4D, /* 0x18-0x1B */ + 0xB5, 0x4E, 0xB5, 0x4F, 0xED, 0xE6, 0xB5, 0x50, /* 0x1C-0x1F */ + 0xB5, 0x51, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0x20-0x23 */ + 0xED, 0xE5, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0x24-0x27 */ + 0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x5B, /* 0x28-0x2B */ + 0xB5, 0x5C, 0xB5, 0x5D, 0xB5, 0x5E, 0xB5, 0x5F, /* 0x2C-0x2F */ + 0xB5, 0x60, 0xB5, 0x61, 0xB5, 0x62, 0xB5, 0x63, /* 0x30-0x33 */ + 0xED, 0xE7, 0xB5, 0x64, 0xB5, 0x65, 0xB5, 0x66, /* 0x34-0x37 */ + 0xB5, 0x67, 0xB5, 0x68, 0xCA, 0xBE, 0xEC, 0xEA, /* 0x38-0x3B */ + 0xC0, 0xF1, 0xB5, 0x69, 0xC9, 0xE7, 0xB5, 0x6A, /* 0x3C-0x3F */ + 0xEC, 0xEB, 0xC6, 0xEE, 0xB5, 0x6B, 0xB5, 0x6C, /* 0x40-0x43 */ + 0xB5, 0x6D, 0xB5, 0x6E, 0xEC, 0xEC, 0xB5, 0x6F, /* 0x44-0x47 */ + 0xC6, 0xED, 0xEC, 0xED, 0xB5, 0x70, 0xB5, 0x71, /* 0x48-0x4B */ + 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, 0xB5, 0x75, /* 0x4C-0x4F */ + 0xB5, 
0x76, 0xB5, 0x77, 0xB5, 0x78, 0xEC, 0xF0, /* 0x50-0x53 */ + 0xB5, 0x79, 0xB5, 0x7A, 0xD7, 0xE6, 0xEC, 0xF3, /* 0x54-0x57 */ + 0xB5, 0x7B, 0xB5, 0x7C, 0xEC, 0xF1, 0xEC, 0xEE, /* 0x58-0x5B */ + 0xEC, 0xEF, 0xD7, 0xA3, 0xC9, 0xF1, 0xCB, 0xEE, /* 0x5C-0x5F */ + 0xEC, 0xF4, 0xB5, 0x7D, 0xEC, 0xF2, 0xB5, 0x7E, /* 0x60-0x63 */ + 0xB5, 0x80, 0xCF, 0xE9, 0xB5, 0x81, 0xEC, 0xF6, /* 0x64-0x67 */ + 0xC6, 0xB1, 0xB5, 0x82, 0xB5, 0x83, 0xB5, 0x84, /* 0x68-0x6B */ + 0xB5, 0x85, 0xBC, 0xC0, 0xB5, 0x86, 0xEC, 0xF5, /* 0x6C-0x6F */ + 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, 0xB5, 0x8A, /* 0x70-0x73 */ + 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, 0xB5, 0xBB, /* 0x74-0x77 */ + 0xBB, 0xF6, 0xB5, 0x8E, 0xEC, 0xF7, 0xB5, 0x8F, /* 0x78-0x7B */ + 0xB5, 0x90, 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, /* 0x7C-0x7F */ + + 0xD9, 0xF7, 0xBD, 0xFB, 0xB5, 0x94, 0xB5, 0x95, /* 0x80-0x83 */ + 0xC2, 0xBB, 0xEC, 0xF8, 0xB5, 0x96, 0xB5, 0x97, /* 0x84-0x87 */ + 0xB5, 0x98, 0xB5, 0x99, 0xEC, 0xF9, 0xB5, 0x9A, /* 0x88-0x8B */ + 0xB5, 0x9B, 0xB5, 0x9C, 0xB5, 0x9D, 0xB8, 0xA3, /* 0x8C-0x8F */ + 0xB5, 0x9E, 0xB5, 0x9F, 0xB5, 0xA0, 0xB6, 0x40, /* 0x90-0x93 */ + 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, 0xB6, 0x44, /* 0x94-0x97 */ + 0xB6, 0x45, 0xB6, 0x46, 0xEC, 0xFA, 0xB6, 0x47, /* 0x98-0x9B */ + 0xB6, 0x48, 0xB6, 0x49, 0xB6, 0x4A, 0xB6, 0x4B, /* 0x9C-0x9F */ + 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, 0xB6, 0x4F, /* 0xA0-0xA3 */ + 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, 0xEC, 0xFB, /* 0xA4-0xA7 */ + 0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0xA8-0xAB */ + 0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0xAC-0xAF */ + 0xB6, 0x5B, 0xB6, 0x5C, 0xB6, 0x5D, 0xEC, 0xFC, /* 0xB0-0xB3 */ + 0xB6, 0x5E, 0xB6, 0x5F, 0xB6, 0x60, 0xB6, 0x61, /* 0xB4-0xB7 */ + 0xB6, 0x62, 0xD3, 0xED, 0xD8, 0xAE, 0xC0, 0xEB, /* 0xB8-0xBB */ + 0xB6, 0x63, 0xC7, 0xDD, 0xBA, 0xCC, 0xB6, 0x64, /* 0xBC-0xBF */ + 0xD0, 0xE3, 0xCB, 0xBD, 0xB6, 0x65, 0xCD, 0xBA, /* 0xC0-0xC3 */ + 0xB6, 0x66, 0xB6, 0x67, 0xB8, 0xD1, 0xB6, 0x68, /* 0xC4-0xC7 */ + 0xB6, 0x69, 0xB1, 0xFC, 0xB6, 0x6A, 0xC7, 0xEF, /* 0xC8-0xCB */ + 0xB6, 0x6B, 0xD6, 0xD6, 0xB6, 0x6C, 0xB6, 0x6D, /* 0xCC-0xCF */ + 0xB6, 0x6E, 0xBF, 0xC6, 0xC3, 0xEB, 0xB6, 0x6F, /* 0xD0-0xD3 */ + 0xB6, 0x70, 0xEF, 0xF5, 0xB6, 0x71, 0xB6, 0x72, /* 0xD4-0xD7 */ + 0xC3, 0xD8, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0xD8-0xDB */ + 0xB6, 0x76, 0xB6, 0x77, 0xB6, 0x78, 0xD7, 0xE2, /* 0xDC-0xDF */ + 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x7B, 0xEF, 0xF7, /* 0xE0-0xE3 */ + 0xB3, 0xD3, 0xB6, 0x7C, 0xC7, 0xD8, 0xD1, 0xED, /* 0xE4-0xE7 */ + 0xB6, 0x7D, 0xD6, 0xC8, 0xB6, 0x7E, 0xEF, 0xF8, /* 0xE8-0xEB */ + 0xB6, 0x80, 0xEF, 0xF6, 0xB6, 0x81, 0xBB, 0xFD, /* 0xEC-0xEF */ + 0xB3, 0xC6, 0xB6, 0x82, 0xB6, 0x83, 0xB6, 0x84, /* 0xF0-0xF3 */ + 0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0xF4-0xF7 */ + 0xBD, 0xD5, 0xB6, 0x89, 0xB6, 0x8A, 0xD2, 0xC6, /* 0xF8-0xFB */ + 0xB6, 0x8B, 0xBB, 0xE0, 0xB6, 0x8C, 0xB6, 0x8D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7A[512] = { + 0xCF, 0xA1, 0xB6, 0x8E, 0xEF, 0xFC, 0xEF, 0xFB, /* 0x00-0x03 */ + 0xB6, 0x8F, 0xB6, 0x90, 0xEF, 0xF9, 0xB6, 0x91, /* 0x04-0x07 */ + 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, 0xB3, 0xCC, /* 0x08-0x0B */ + 0xB6, 0x95, 0xC9, 0xD4, 0xCB, 0xB0, 0xB6, 0x96, /* 0x0C-0x0F */ + 0xB6, 0x97, 0xB6, 0x98, 0xB6, 0x99, 0xB6, 0x9A, /* 0x10-0x13 */ + 0xEF, 0xFE, 0xB6, 0x9B, 0xB6, 0x9C, 0xB0, 0xDE, /* 0x14-0x17 */ + 0xB6, 0x9D, 0xB6, 0x9E, 0xD6, 0xC9, 0xB6, 0x9F, /* 0x18-0x1B */ + 0xB6, 0xA0, 0xB7, 0x40, 0xEF, 0xFD, 0xB7, 0x41, /* 0x1C-0x1F */ + 0xB3, 0xED, 0xB7, 0x42, 0xB7, 0x43, 0xF6, 0xD5, /* 0x20-0x23 */ + 0xB7, 
0x44, 0xB7, 0x45, 0xB7, 0x46, 0xB7, 0x47, /* 0x24-0x27 */ + 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, 0xB7, 0x4B, /* 0x28-0x2B */ + 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, 0xB7, 0x4F, /* 0x2C-0x2F */ + 0xB7, 0x50, 0xB7, 0x51, 0xB7, 0x52, 0xCE, 0xC8, /* 0x30-0x33 */ + 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, 0xF0, 0xA2, /* 0x34-0x37 */ + 0xB7, 0x56, 0xF0, 0xA1, 0xB7, 0x57, 0xB5, 0xBE, /* 0x38-0x3B */ + 0xBC, 0xDA, 0xBB, 0xFC, 0xB7, 0x58, 0xB8, 0xE5, /* 0x3C-0x3F */ + 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x5B, 0xB7, 0x5C, /* 0x40-0x43 */ + 0xB7, 0x5D, 0xB7, 0x5E, 0xC4, 0xC2, 0xB7, 0x5F, /* 0x44-0x47 */ + 0xB7, 0x60, 0xB7, 0x61, 0xB7, 0x62, 0xB7, 0x63, /* 0x48-0x4B */ + 0xB7, 0x64, 0xB7, 0x65, 0xB7, 0x66, 0xB7, 0x67, /* 0x4C-0x4F */ + 0xB7, 0x68, 0xF0, 0xA3, 0xB7, 0x69, 0xB7, 0x6A, /* 0x50-0x53 */ + 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, 0xCB, 0xEB, /* 0x54-0x57 */ + 0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x58-0x5B */ + 0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x5C-0x5F */ + 0xB7, 0x76, 0xB7, 0x77, 0xB7, 0x78, 0xB7, 0x79, /* 0x60-0x63 */ + 0xB7, 0x7A, 0xB7, 0x7B, 0xB7, 0x7C, 0xB7, 0x7D, /* 0x64-0x67 */ + 0xB7, 0x7E, 0xB7, 0x80, 0xB7, 0x81, 0xB7, 0x82, /* 0x68-0x6B */ + 0xB7, 0x83, 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, /* 0x6C-0x6F */ + 0xF0, 0xA6, 0xB7, 0x87, 0xB7, 0x88, 0xB7, 0x89, /* 0x70-0x73 */ + 0xD1, 0xA8, 0xB7, 0x8A, 0xBE, 0xBF, 0xC7, 0xEE, /* 0x74-0x77 */ + 0xF1, 0xB6, 0xF1, 0xB7, 0xBF, 0xD5, 0xB7, 0x8B, /* 0x78-0x7B */ + 0xB7, 0x8C, 0xB7, 0x8D, 0xB7, 0x8E, 0xB4, 0xA9, /* 0x7C-0x7F */ + + 0xF1, 0xB8, 0xCD, 0xBB, 0xB7, 0x8F, 0xC7, 0xD4, /* 0x80-0x83 */ + 0xD5, 0xAD, 0xB7, 0x90, 0xF1, 0xB9, 0xB7, 0x91, /* 0x84-0x87 */ + 0xF1, 0xBA, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0x88-0x8B */ + 0xB7, 0x95, 0xC7, 0xCF, 0xB7, 0x96, 0xB7, 0x97, /* 0x8C-0x8F */ + 0xB7, 0x98, 0xD2, 0xA4, 0xD6, 0xCF, 0xB7, 0x99, /* 0x90-0x93 */ + 0xB7, 0x9A, 0xF1, 0xBB, 0xBD, 0xD1, 0xB4, 0xB0, /* 0x94-0x97 */ + 0xBE, 0xBD, 0xB7, 0x9B, 0xB7, 0x9C, 0xB7, 0x9D, /* 0x98-0x9B */ + 0xB4, 0xDC, 0xCE, 0xD1, 0xB7, 0x9E, 0xBF, 0xDF, /* 0x9C-0x9F */ + 0xF1, 0xBD, 0xB7, 0x9F, 0xB7, 0xA0, 0xB8, 0x40, /* 0xA0-0xA3 */ + 0xB8, 0x41, 0xBF, 0xFA, 0xF1, 0xBC, 0xB8, 0x42, /* 0xA4-0xA7 */ + 0xF1, 0xBF, 0xB8, 0x43, 0xB8, 0x44, 0xB8, 0x45, /* 0xA8-0xAB */ + 0xF1, 0xBE, 0xF1, 0xC0, 0xB8, 0x46, 0xB8, 0x47, /* 0xAC-0xAF */ + 0xB8, 0x48, 0xB8, 0x49, 0xB8, 0x4A, 0xF1, 0xC1, /* 0xB0-0xB3 */ + 0xB8, 0x4B, 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, /* 0xB4-0xB7 */ + 0xB8, 0x4F, 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, /* 0xB8-0xBB */ + 0xB8, 0x53, 0xB8, 0x54, 0xB8, 0x55, 0xC1, 0xFE, /* 0xBC-0xBF */ + 0xB8, 0x56, 0xB8, 0x57, 0xB8, 0x58, 0xB8, 0x59, /* 0xC0-0xC3 */ + 0xB8, 0x5A, 0xB8, 0x5B, 0xB8, 0x5C, 0xB8, 0x5D, /* 0xC4-0xC7 */ + 0xB8, 0x5E, 0xB8, 0x5F, 0xB8, 0x60, 0xC1, 0xA2, /* 0xC8-0xCB */ + 0xB8, 0x61, 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, /* 0xCC-0xCF */ + 0xB8, 0x65, 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, /* 0xD0-0xD3 */ + 0xB8, 0x69, 0xB8, 0x6A, 0xCA, 0xFA, 0xB8, 0x6B, /* 0xD4-0xD7 */ + 0xB8, 0x6C, 0xD5, 0xBE, 0xB8, 0x6D, 0xB8, 0x6E, /* 0xD8-0xDB */ + 0xB8, 0x6F, 0xB8, 0x70, 0xBE, 0xBA, 0xBE, 0xB9, /* 0xDC-0xDF */ + 0xD5, 0xC2, 0xB8, 0x71, 0xB8, 0x72, 0xBF, 0xA2, /* 0xE0-0xE3 */ + 0xB8, 0x73, 0xCD, 0xAF, 0xF1, 0xB5, 0xB8, 0x74, /* 0xE4-0xE7 */ + 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, 0xB8, 0x78, /* 0xE8-0xEB */ + 0xB8, 0x79, 0xBD, 0xDF, 0xB8, 0x7A, 0xB6, 0xCB, /* 0xEC-0xEF */ + 0xB8, 0x7B, 0xB8, 0x7C, 0xB8, 0x7D, 0xB8, 0x7E, /* 0xF0-0xF3 */ + 0xB8, 0x80, 0xB8, 0x81, 0xB8, 0x82, 0xB8, 0x83, /* 0xF4-0xF7 */ + 0xB8, 0x84, 0xD6, 0xF1, 0xF3, 0xC3, 0xB8, 0x85, /* 0xF8-0xFB 
*/ + 0xB8, 0x86, 0xF3, 0xC4, 0xB8, 0x87, 0xB8, 0xCD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, 0xF3, 0xC6, /* 0x00-0x03 */ + 0xF3, 0xC7, 0xB8, 0x8B, 0xB0, 0xCA, 0xB8, 0x8C, /* 0x04-0x07 */ + 0xF3, 0xC5, 0xB8, 0x8D, 0xF3, 0xC9, 0xCB, 0xF1, /* 0x08-0x0B */ + 0xB8, 0x8E, 0xB8, 0x8F, 0xB8, 0x90, 0xF3, 0xCB, /* 0x0C-0x0F */ + 0xB8, 0x91, 0xD0, 0xA6, 0xB8, 0x92, 0xB8, 0x93, /* 0x10-0x13 */ + 0xB1, 0xCA, 0xF3, 0xC8, 0xB8, 0x94, 0xB8, 0x95, /* 0x14-0x17 */ + 0xB8, 0x96, 0xF3, 0xCF, 0xB8, 0x97, 0xB5, 0xD1, /* 0x18-0x1B */ + 0xB8, 0x98, 0xB8, 0x99, 0xF3, 0xD7, 0xB8, 0x9A, /* 0x1C-0x1F */ + 0xF3, 0xD2, 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, /* 0x20-0x23 */ + 0xF3, 0xD4, 0xF3, 0xD3, 0xB7, 0xFB, 0xB8, 0x9E, /* 0x24-0x27 */ + 0xB1, 0xBF, 0xB8, 0x9F, 0xF3, 0xCE, 0xF3, 0xCA, /* 0x28-0x2B */ + 0xB5, 0xDA, 0xB8, 0xA0, 0xF3, 0xD0, 0xB9, 0x40, /* 0x2C-0x2F */ + 0xB9, 0x41, 0xF3, 0xD1, 0xB9, 0x42, 0xF3, 0xD5, /* 0x30-0x33 */ + 0xB9, 0x43, 0xB9, 0x44, 0xB9, 0x45, 0xB9, 0x46, /* 0x34-0x37 */ + 0xF3, 0xCD, 0xB9, 0x47, 0xBC, 0xE3, 0xB9, 0x48, /* 0x38-0x3B */ + 0xC1, 0xFD, 0xB9, 0x49, 0xF3, 0xD6, 0xB9, 0x4A, /* 0x3C-0x3F */ + 0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x40-0x43 */ + 0xB9, 0x4F, 0xF3, 0xDA, 0xB9, 0x50, 0xF3, 0xCC, /* 0x44-0x47 */ + 0xB9, 0x51, 0xB5, 0xC8, 0xB9, 0x52, 0xBD, 0xEE, /* 0x48-0x4B */ + 0xF3, 0xDC, 0xB9, 0x53, 0xB9, 0x54, 0xB7, 0xA4, /* 0x4C-0x4F */ + 0xBF, 0xF0, 0xD6, 0xFE, 0xCD, 0xB2, 0xB9, 0x55, /* 0x50-0x53 */ + 0xB4, 0xF0, 0xB9, 0x56, 0xB2, 0xDF, 0xB9, 0x57, /* 0x54-0x57 */ + 0xF3, 0xD8, 0xB9, 0x58, 0xF3, 0xD9, 0xC9, 0xB8, /* 0x58-0x5B */ + 0xB9, 0x59, 0xF3, 0xDD, 0xB9, 0x5A, 0xB9, 0x5B, /* 0x5C-0x5F */ + 0xF3, 0xDE, 0xB9, 0x5C, 0xF3, 0xE1, 0xB9, 0x5D, /* 0x60-0x63 */ + 0xB9, 0x5E, 0xB9, 0x5F, 0xB9, 0x60, 0xB9, 0x61, /* 0x64-0x67 */ + 0xB9, 0x62, 0xB9, 0x63, 0xB9, 0x64, 0xB9, 0x65, /* 0x68-0x6B */ + 0xB9, 0x66, 0xB9, 0x67, 0xF3, 0xDF, 0xB9, 0x68, /* 0x6C-0x6F */ + 0xB9, 0x69, 0xF3, 0xE3, 0xF3, 0xE2, 0xB9, 0x6A, /* 0x70-0x73 */ + 0xB9, 0x6B, 0xF3, 0xDB, 0xB9, 0x6C, 0xBF, 0xEA, /* 0x74-0x77 */ + 0xB9, 0x6D, 0xB3, 0xEF, 0xB9, 0x6E, 0xF3, 0xE0, /* 0x78-0x7B */ + 0xB9, 0x6F, 0xB9, 0x70, 0xC7, 0xA9, 0xB9, 0x71, /* 0x7C-0x7F */ + + 0xBC, 0xF2, 0xB9, 0x72, 0xB9, 0x73, 0xB9, 0x74, /* 0x80-0x83 */ + 0xB9, 0x75, 0xF3, 0xEB, 0xB9, 0x76, 0xB9, 0x77, /* 0x84-0x87 */ + 0xB9, 0x78, 0xB9, 0x79, 0xB9, 0x7A, 0xB9, 0x7B, /* 0x88-0x8B */ + 0xB9, 0x7C, 0xB9, 0xBF, 0xB9, 0x7D, 0xB9, 0x7E, /* 0x8C-0x8F */ + 0xF3, 0xE4, 0xB9, 0x80, 0xB9, 0x81, 0xB9, 0x82, /* 0x90-0x93 */ + 0xB2, 0xAD, 0xBB, 0xFE, 0xB9, 0x83, 0xCB, 0xE3, /* 0x94-0x97 */ + 0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x98-0x9B */ + 0xF3, 0xED, 0xF3, 0xE9, 0xB9, 0x88, 0xB9, 0x89, /* 0x9C-0x9F */ + 0xB9, 0x8A, 0xB9, 0xDC, 0xF3, 0xEE, 0xB9, 0x8B, /* 0xA0-0xA3 */ + 0xB9, 0x8C, 0xB9, 0x8D, 0xF3, 0xE5, 0xF3, 0xE6, /* 0xA4-0xA7 */ + 0xF3, 0xEA, 0xC2, 0xE1, 0xF3, 0xEC, 0xF3, 0xEF, /* 0xA8-0xAB */ + 0xF3, 0xE8, 0xBC, 0xFD, 0xB9, 0x8E, 0xB9, 0x8F, /* 0xAC-0xAF */ + 0xB9, 0x90, 0xCF, 0xE4, 0xB9, 0x91, 0xB9, 0x92, /* 0xB0-0xB3 */ + 0xF3, 0xF0, 0xB9, 0x93, 0xB9, 0x94, 0xB9, 0x95, /* 0xB4-0xB7 */ + 0xF3, 0xE7, 0xB9, 0x96, 0xB9, 0x97, 0xB9, 0x98, /* 0xB8-0xBB */ + 0xB9, 0x99, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0xBC-0xBF */ + 0xB9, 0x9D, 0xF3, 0xF2, 0xB9, 0x9E, 0xB9, 0x9F, /* 0xC0-0xC3 */ + 0xB9, 0xA0, 0xBA, 0x40, 0xD7, 0xAD, 0xC6, 0xAA, /* 0xC4-0xC7 */ + 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, 0xBA, 0x44, /* 0xC8-0xCB */ + 0xF3, 0xF3, 0xBA, 0x45, 0xBA, 0x46, 0xBA, 0x47, /* 0xCC-0xCF */ + 
0xBA, 0x48, 0xF3, 0xF1, 0xBA, 0x49, 0xC2, 0xA8, /* 0xD0-0xD3 */ + 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, 0xBA, 0x4D, /* 0xD4-0xD7 */ + 0xBA, 0x4E, 0xB8, 0xDD, 0xF3, 0xF5, 0xBA, 0x4F, /* 0xD8-0xDB */ + 0xBA, 0x50, 0xF3, 0xF4, 0xBA, 0x51, 0xBA, 0x52, /* 0xDC-0xDF */ + 0xBA, 0x53, 0xB4, 0xDB, 0xBA, 0x54, 0xBA, 0x55, /* 0xE0-0xE3 */ + 0xBA, 0x56, 0xF3, 0xF6, 0xF3, 0xF7, 0xBA, 0x57, /* 0xE4-0xE7 */ + 0xBA, 0x58, 0xBA, 0x59, 0xF3, 0xF8, 0xBA, 0x5A, /* 0xE8-0xEB */ + 0xBA, 0x5B, 0xBA, 0x5C, 0xC0, 0xBA, 0xBA, 0x5D, /* 0xEC-0xEF */ + 0xBA, 0x5E, 0xC0, 0xE9, 0xBA, 0x5F, 0xBA, 0x60, /* 0xF0-0xF3 */ + 0xBA, 0x61, 0xBA, 0x62, 0xBA, 0x63, 0xC5, 0xF1, /* 0xF4-0xF7 */ + 0xBA, 0x64, 0xBA, 0x65, 0xBA, 0x66, 0xBA, 0x67, /* 0xF8-0xFB */ + 0xF3, 0xFB, 0xBA, 0x68, 0xF3, 0xFA, 0xBA, 0x69, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7C[512] = { + 0xBA, 0x6A, 0xBA, 0x6B, 0xBA, 0x6C, 0xBA, 0x6D, /* 0x00-0x03 */ + 0xBA, 0x6E, 0xBA, 0x6F, 0xBA, 0x70, 0xB4, 0xD8, /* 0x04-0x07 */ + 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, 0xF3, 0xFE, /* 0x08-0x0B */ + 0xF3, 0xF9, 0xBA, 0x74, 0xBA, 0x75, 0xF3, 0xFC, /* 0x0C-0x0F */ + 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, 0xBA, 0x79, /* 0x10-0x13 */ + 0xBA, 0x7A, 0xBA, 0x7B, 0xF3, 0xFD, 0xBA, 0x7C, /* 0x14-0x17 */ + 0xBA, 0x7D, 0xBA, 0x7E, 0xBA, 0x80, 0xBA, 0x81, /* 0x18-0x1B */ + 0xBA, 0x82, 0xBA, 0x83, 0xBA, 0x84, 0xF4, 0xA1, /* 0x1C-0x1F */ + 0xBA, 0x85, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0x20-0x23 */ + 0xBA, 0x89, 0xBA, 0x8A, 0xF4, 0xA3, 0xBB, 0xC9, /* 0x24-0x27 */ + 0xBA, 0x8B, 0xBA, 0x8C, 0xF4, 0xA2, 0xBA, 0x8D, /* 0x28-0x2B */ + 0xBA, 0x8E, 0xBA, 0x8F, 0xBA, 0x90, 0xBA, 0x91, /* 0x2C-0x2F */ + 0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0x30-0x33 */ + 0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0x34-0x37 */ + 0xF4, 0xA4, 0xBA, 0x9A, 0xBA, 0x9B, 0xBA, 0x9C, /* 0x38-0x3B */ + 0xBA, 0x9D, 0xBA, 0x9E, 0xBA, 0x9F, 0xB2, 0xBE, /* 0x3C-0x3F */ + 0xF4, 0xA6, 0xF4, 0xA5, 0xBA, 0xA0, 0xBB, 0x40, /* 0x40-0x43 */ + 0xBB, 0x41, 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, /* 0x44-0x47 */ + 0xBB, 0x45, 0xBB, 0x46, 0xBB, 0x47, 0xBB, 0x48, /* 0x48-0x4B */ + 0xBB, 0x49, 0xBC, 0xAE, 0xBB, 0x4A, 0xBB, 0x4B, /* 0x4C-0x4F */ + 0xBB, 0x4C, 0xBB, 0x4D, 0xBB, 0x4E, 0xBB, 0x4F, /* 0x50-0x53 */ + 0xBB, 0x50, 0xBB, 0x51, 0xBB, 0x52, 0xBB, 0x53, /* 0x54-0x57 */ + 0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x58-0x5B */ + 0xBB, 0x58, 0xBB, 0x59, 0xBB, 0x5A, 0xBB, 0x5B, /* 0x5C-0x5F */ + 0xBB, 0x5C, 0xBB, 0x5D, 0xBB, 0x5E, 0xBB, 0x5F, /* 0x60-0x63 */ + 0xBB, 0x60, 0xBB, 0x61, 0xBB, 0x62, 0xBB, 0x63, /* 0x64-0x67 */ + 0xBB, 0x64, 0xBB, 0x65, 0xBB, 0x66, 0xBB, 0x67, /* 0x68-0x6B */ + 0xBB, 0x68, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x6C-0x6F */ + 0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xC3, 0xD7, /* 0x70-0x73 */ + 0xD9, 0xE1, 0xBB, 0x6F, 0xBB, 0x70, 0xBB, 0x71, /* 0x74-0x77 */ + 0xBB, 0x72, 0xBB, 0x73, 0xBB, 0x74, 0xC0, 0xE0, /* 0x78-0x7B */ + 0xF4, 0xCC, 0xD7, 0xD1, 0xBB, 0x75, 0xBB, 0x76, /* 0x7C-0x7F */ + + 0xBB, 0x77, 0xBB, 0x78, 0xBB, 0x79, 0xBB, 0x7A, /* 0x80-0x83 */ + 0xBB, 0x7B, 0xBB, 0x7C, 0xBB, 0x7D, 0xBB, 0x7E, /* 0x84-0x87 */ + 0xBB, 0x80, 0xB7, 0xDB, 0xBB, 0x81, 0xBB, 0x82, /* 0x88-0x8B */ + 0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x8C-0x8F */ + 0xBB, 0x87, 0xF4, 0xCE, 0xC1, 0xA3, 0xBB, 0x88, /* 0x90-0x93 */ + 0xBB, 0x89, 0xC6, 0xC9, 0xBB, 0x8A, 0xB4, 0xD6, /* 0x94-0x97 */ + 0xD5, 0xB3, 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, /* 0x98-0x9B */ + 0xF4, 0xD0, 0xF4, 0xCF, 0xF4, 0xD1, 0xCB, 0xDA, /* 0x9C-0x9F */ + 0xBB, 0x8E, 0xBB, 0x8F, 0xF4, 0xD2, 0xBB, 0x90, /* 0xA0-0xA3 */ + 
0xD4, 0xC1, 0xD6, 0xE0, 0xBB, 0x91, 0xBB, 0x92, /* 0xA4-0xA7 */ + 0xBB, 0x93, 0xBB, 0x94, 0xB7, 0xE0, 0xBB, 0x95, /* 0xA8-0xAB */ + 0xBB, 0x96, 0xBB, 0x97, 0xC1, 0xB8, 0xBB, 0x98, /* 0xAC-0xAF */ + 0xBB, 0x99, 0xC1, 0xBB, 0xF4, 0xD3, 0xBE, 0xAC, /* 0xB0-0xB3 */ + 0xBB, 0x9A, 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, /* 0xB4-0xB7 */ + 0xBB, 0x9E, 0xB4, 0xE2, 0xBB, 0x9F, 0xBB, 0xA0, /* 0xB8-0xBB */ + 0xF4, 0xD4, 0xF4, 0xD5, 0xBE, 0xAB, 0xBC, 0x40, /* 0xBC-0xBF */ + 0xBC, 0x41, 0xF4, 0xD6, 0xBC, 0x42, 0xBC, 0x43, /* 0xC0-0xC3 */ + 0xBC, 0x44, 0xF4, 0xDB, 0xBC, 0x45, 0xF4, 0xD7, /* 0xC4-0xC7 */ + 0xF4, 0xDA, 0xBC, 0x46, 0xBA, 0xFD, 0xBC, 0x47, /* 0xC8-0xCB */ + 0xF4, 0xD8, 0xF4, 0xD9, 0xBC, 0x48, 0xBC, 0x49, /* 0xCC-0xCF */ + 0xBC, 0x4A, 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, /* 0xD0-0xD3 */ + 0xBC, 0x4E, 0xB8, 0xE2, 0xCC, 0xC7, 0xF4, 0xDC, /* 0xD4-0xD7 */ + 0xBC, 0x4F, 0xB2, 0xDA, 0xBC, 0x50, 0xBC, 0x51, /* 0xD8-0xDB */ + 0xC3, 0xD3, 0xBC, 0x52, 0xBC, 0x53, 0xD4, 0xE3, /* 0xDC-0xDF */ + 0xBF, 0xB7, 0xBC, 0x54, 0xBC, 0x55, 0xBC, 0x56, /* 0xE0-0xE3 */ + 0xBC, 0x57, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0xE4-0xE7 */ + 0xF4, 0xDD, 0xBC, 0x5B, 0xBC, 0x5C, 0xBC, 0x5D, /* 0xE8-0xEB */ + 0xBC, 0x5E, 0xBC, 0x5F, 0xBC, 0x60, 0xC5, 0xB4, /* 0xEC-0xEF */ + 0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0xF0-0xF3 */ + 0xBC, 0x65, 0xBC, 0x66, 0xBC, 0x67, 0xBC, 0x68, /* 0xF4-0xF7 */ + 0xF4, 0xE9, 0xBC, 0x69, 0xBC, 0x6A, 0xCF, 0xB5, /* 0xF8-0xFB */ + 0xBC, 0x6B, 0xBC, 0x6C, 0xBC, 0x6D, 0xBC, 0x6E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xBC, 0x6F, 0xBC, 0x70, 0xBC, 0x71, 0xBC, 0x72, /* 0x00-0x03 */ + 0xBC, 0x73, 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, /* 0x04-0x07 */ + 0xBC, 0x77, 0xBC, 0x78, 0xCE, 0xC9, 0xBC, 0x79, /* 0x08-0x0B */ + 0xBC, 0x7A, 0xBC, 0x7B, 0xBC, 0x7C, 0xBC, 0x7D, /* 0x0C-0x0F */ + 0xBC, 0x7E, 0xBC, 0x80, 0xBC, 0x81, 0xBC, 0x82, /* 0x10-0x13 */ + 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, 0xBC, 0x86, /* 0x14-0x17 */ + 0xBC, 0x87, 0xBC, 0x88, 0xBC, 0x89, 0xBC, 0x8A, /* 0x18-0x1B */ + 0xBC, 0x8B, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0x1C-0x1F */ + 0xCB, 0xD8, 0xBC, 0x8F, 0xCB, 0xF7, 0xBC, 0x90, /* 0x20-0x23 */ + 0xBC, 0x91, 0xBC, 0x92, 0xBC, 0x93, 0xBD, 0xF4, /* 0x24-0x27 */ + 0xBC, 0x94, 0xBC, 0x95, 0xBC, 0x96, 0xD7, 0xCF, /* 0x28-0x2B */ + 0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xC0, 0xDB, /* 0x2C-0x2F */ + 0xBC, 0x9A, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0x30-0x33 */ + 0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x40, /* 0x34-0x37 */ + 0xBD, 0x41, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0x38-0x3B */ + 0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0x3C-0x3F */ + 0xBD, 0x49, 0xBD, 0x4A, 0xBD, 0x4B, 0xBD, 0x4C, /* 0x40-0x43 */ + 0xBD, 0x4D, 0xBD, 0x4E, 0xBD, 0x4F, 0xBD, 0x50, /* 0x44-0x47 */ + 0xBD, 0x51, 0xBD, 0x52, 0xBD, 0x53, 0xBD, 0x54, /* 0x48-0x4B */ + 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, 0xBD, 0x58, /* 0x4C-0x4F */ + 0xBD, 0x59, 0xBD, 0x5A, 0xBD, 0x5B, 0xBD, 0x5C, /* 0x50-0x53 */ + 0xBD, 0x5D, 0xBD, 0x5E, 0xBD, 0x5F, 0xBD, 0x60, /* 0x54-0x57 */ + 0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0x58-0x5B */ + 0xBD, 0x65, 0xBD, 0x66, 0xBD, 0x67, 0xBD, 0x68, /* 0x5C-0x5F */ + 0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x60-0x63 */ + 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, 0xBD, 0x70, /* 0x64-0x67 */ + 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, 0xBD, 0x74, /* 0x68-0x6B */ + 0xBD, 0x75, 0xBD, 0x76, 0xD0, 0xF5, 0xBD, 0x77, /* 0x6C-0x6F */ + 0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x7B, /* 0x70-0x73 */ + 0xBD, 0x7C, 0xBD, 0x7D, 0xBD, 0x7E, 0xF4, 0xEA, /* 0x74-0x77 */ + 0xBD, 
0x80, 0xBD, 0x81, 0xBD, 0x82, 0xBD, 0x83, /* 0x78-0x7B */ + 0xBD, 0x84, 0xBD, 0x85, 0xBD, 0x86, 0xBD, 0x87, /* 0x7C-0x7F */ + + 0xBD, 0x88, 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, /* 0x80-0x83 */ + 0xBD, 0x8C, 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, /* 0x84-0x87 */ + 0xBD, 0x90, 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, /* 0x88-0x8B */ + 0xBD, 0x94, 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, /* 0x8C-0x8F */ + 0xBD, 0x98, 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, /* 0x90-0x93 */ + 0xBD, 0x9C, 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, /* 0x94-0x97 */ + 0xBD, 0xA0, 0xBE, 0x40, 0xBE, 0x41, 0xBE, 0x42, /* 0x98-0x9B */ + 0xBE, 0x43, 0xBE, 0x44, 0xBE, 0x45, 0xBE, 0x46, /* 0x9C-0x9F */ + 0xBE, 0x47, 0xBE, 0x48, 0xBE, 0x49, 0xBE, 0x4A, /* 0xA0-0xA3 */ + 0xBE, 0x4B, 0xBE, 0x4C, 0xF4, 0xEB, 0xBE, 0x4D, /* 0xA4-0xA7 */ + 0xBE, 0x4E, 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, /* 0xA8-0xAB */ + 0xBE, 0x52, 0xBE, 0x53, 0xF4, 0xEC, 0xBE, 0x54, /* 0xAC-0xAF */ + 0xBE, 0x55, 0xBE, 0x56, 0xBE, 0x57, 0xBE, 0x58, /* 0xB0-0xB3 */ + 0xBE, 0x59, 0xBE, 0x5A, 0xBE, 0x5B, 0xBE, 0x5C, /* 0xB4-0xB7 */ + 0xBE, 0x5D, 0xBE, 0x5E, 0xBE, 0x5F, 0xBE, 0x60, /* 0xB8-0xBB */ + 0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0xBC-0xBF */ + 0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0xC0-0xC3 */ + 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, 0xBE, 0x6C, /* 0xC4-0xC7 */ + 0xBE, 0x6D, 0xBE, 0x6E, 0xBE, 0x6F, 0xBE, 0x70, /* 0xC8-0xCB */ + 0xBE, 0x71, 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, /* 0xCC-0xCF */ + 0xBE, 0x75, 0xBE, 0x76, 0xBE, 0x77, 0xBE, 0x78, /* 0xD0-0xD3 */ + 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x7B, 0xBE, 0x7C, /* 0xD4-0xD7 */ + 0xBE, 0x7D, 0xBE, 0x7E, 0xBE, 0x80, 0xBE, 0x81, /* 0xD8-0xDB */ + 0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0xDC-0xDF */ + 0xBE, 0x86, 0xBE, 0x87, 0xBE, 0x88, 0xBE, 0x89, /* 0xE0-0xE3 */ + 0xBE, 0x8A, 0xBE, 0x8B, 0xBE, 0x8C, 0xBE, 0x8D, /* 0xE4-0xE7 */ + 0xBE, 0x8E, 0xBE, 0x8F, 0xBE, 0x90, 0xBE, 0x91, /* 0xE8-0xEB */ + 0xBE, 0x92, 0xBE, 0x93, 0xBE, 0x94, 0xBE, 0x95, /* 0xEC-0xEF */ + 0xBE, 0x96, 0xBE, 0x97, 0xBE, 0x98, 0xBE, 0x99, /* 0xF0-0xF3 */ + 0xBE, 0x9A, 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, /* 0xF4-0xF7 */ + 0xBE, 0x9E, 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x40, /* 0xF8-0xFB */ + 0xBF, 0x41, 0xBF, 0x42, 0xBF, 0x43, 0xBF, 0x44, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7E[512] = { + 0xBF, 0x45, 0xBF, 0x46, 0xBF, 0x47, 0xBF, 0x48, /* 0x00-0x03 */ + 0xBF, 0x49, 0xBF, 0x4A, 0xBF, 0x4B, 0xBF, 0x4C, /* 0x04-0x07 */ + 0xBF, 0x4D, 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, /* 0x08-0x0B */ + 0xBF, 0x51, 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, /* 0x0C-0x0F */ + 0xBF, 0x55, 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, /* 0x10-0x13 */ + 0xBF, 0x59, 0xBF, 0x5A, 0xBF, 0x5B, 0xBF, 0x5C, /* 0x14-0x17 */ + 0xBF, 0x5D, 0xBF, 0x5E, 0xBF, 0x5F, 0xBF, 0x60, /* 0x18-0x1B */ + 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, 0xBF, 0x64, /* 0x1C-0x1F */ + 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, 0xBF, 0x68, /* 0x20-0x23 */ + 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, 0xBF, 0x6C, /* 0x24-0x27 */ + 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, 0xBF, 0x70, /* 0x28-0x2B */ + 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, 0xBF, 0x74, /* 0x2C-0x2F */ + 0xBF, 0x75, 0xBF, 0x76, 0xBF, 0x77, 0xBF, 0x78, /* 0x30-0x33 */ + 0xBF, 0x79, 0xBF, 0x7A, 0xBF, 0x7B, 0xBF, 0x7C, /* 0x34-0x37 */ + 0xBF, 0x7D, 0xBF, 0x7E, 0xBF, 0x80, 0xF7, 0xE3, /* 0x38-0x3B */ + 0xBF, 0x81, 0xBF, 0x82, 0xBF, 0x83, 0xBF, 0x84, /* 0x3C-0x3F */ + 0xBF, 0x85, 0xB7, 0xB1, 0xBF, 0x86, 0xBF, 0x87, /* 0x40-0x43 */ + 0xBF, 0x88, 0xBF, 0x89, 0xBF, 0x8A, 0xF4, 0xED, /* 0x44-0x47 */ + 0xBF, 0x8B, 0xBF, 0x8C, 0xBF, 0x8D, 0xBF, 0x8E, /* 0x48-0x4B */ + 0xBF, 
0x8F, 0xBF, 0x90, 0xBF, 0x91, 0xBF, 0x92, /* 0x4C-0x4F */ + 0xBF, 0x93, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0x50-0x53 */ + 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, 0xBF, 0x9A, /* 0x54-0x57 */ + 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, 0xBF, 0x9E, /* 0x58-0x5B */ + 0xBF, 0x9F, 0xBF, 0xA0, 0xC0, 0x40, 0xC0, 0x41, /* 0x5C-0x5F */ + 0xC0, 0x42, 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, /* 0x60-0x63 */ + 0xC0, 0x46, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x64-0x67 */ + 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, 0xC0, 0x4D, /* 0x68-0x6B */ + 0xC0, 0x4E, 0xC0, 0x4F, 0xC0, 0x50, 0xC0, 0x51, /* 0x6C-0x6F */ + 0xC0, 0x52, 0xC0, 0x53, 0xC0, 0x54, 0xC0, 0x55, /* 0x70-0x73 */ + 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, 0xC0, 0x59, /* 0x74-0x77 */ + 0xC0, 0x5A, 0xC0, 0x5B, 0xC0, 0x5C, 0xC0, 0x5D, /* 0x78-0x7B */ + 0xC0, 0x5E, 0xC0, 0x5F, 0xC0, 0x60, 0xC0, 0x61, /* 0x7C-0x7F */ + + 0xC0, 0x62, 0xC0, 0x63, 0xD7, 0xEB, 0xC0, 0x64, /* 0x80-0x83 */ + 0xC0, 0x65, 0xC0, 0x66, 0xC0, 0x67, 0xC0, 0x68, /* 0x84-0x87 */ + 0xC0, 0x69, 0xC0, 0x6A, 0xC0, 0x6B, 0xC0, 0x6C, /* 0x88-0x8B */ + 0xC0, 0x6D, 0xC0, 0x6E, 0xC0, 0x6F, 0xC0, 0x70, /* 0x8C-0x8F */ + 0xC0, 0x71, 0xC0, 0x72, 0xC0, 0x73, 0xC0, 0x74, /* 0x90-0x93 */ + 0xC0, 0x75, 0xC0, 0x76, 0xC0, 0x77, 0xC0, 0x78, /* 0x94-0x97 */ + 0xC0, 0x79, 0xC0, 0x7A, 0xC0, 0x7B, 0xF4, 0xEE, /* 0x98-0x9B */ + 0xC0, 0x7C, 0xC0, 0x7D, 0xC0, 0x7E, 0xE6, 0xF9, /* 0x9C-0x9F */ + 0xBE, 0xC0, 0xE6, 0xFA, 0xBA, 0xEC, 0xE6, 0xFB, /* 0xA0-0xA3 */ + 0xCF, 0xCB, 0xE6, 0xFC, 0xD4, 0xBC, 0xBC, 0xB6, /* 0xA4-0xA7 */ + 0xE6, 0xFD, 0xE6, 0xFE, 0xBC, 0xCD, 0xC8, 0xD2, /* 0xA8-0xAB */ + 0xCE, 0xB3, 0xE7, 0xA1, 0xC0, 0x80, 0xB4, 0xBF, /* 0xAC-0xAF */ + 0xE7, 0xA2, 0xC9, 0xB4, 0xB8, 0xD9, 0xC4, 0xC9, /* 0xB0-0xB3 */ + 0xC0, 0x81, 0xD7, 0xDD, 0xC2, 0xDA, 0xB7, 0xD7, /* 0xB4-0xB7 */ + 0xD6, 0xBD, 0xCE, 0xC6, 0xB7, 0xC4, 0xC0, 0x82, /* 0xB8-0xBB */ + 0xC0, 0x83, 0xC5, 0xA6, 0xE7, 0xA3, 0xCF, 0xDF, /* 0xBC-0xBF */ + 0xE7, 0xA4, 0xE7, 0xA5, 0xE7, 0xA6, 0xC1, 0xB7, /* 0xC0-0xC3 */ + 0xD7, 0xE9, 0xC9, 0xF0, 0xCF, 0xB8, 0xD6, 0xAF, /* 0xC4-0xC7 */ + 0xD6, 0xD5, 0xE7, 0xA7, 0xB0, 0xED, 0xE7, 0xA8, /* 0xC8-0xCB */ + 0xE7, 0xA9, 0xC9, 0xDC, 0xD2, 0xEF, 0xBE, 0xAD, /* 0xCC-0xCF */ + 0xE7, 0xAA, 0xB0, 0xF3, 0xC8, 0xDE, 0xBD, 0xE1, /* 0xD0-0xD3 */ + 0xE7, 0xAB, 0xC8, 0xC6, 0xC0, 0x84, 0xE7, 0xAC, /* 0xD4-0xD7 */ + 0xBB, 0xE6, 0xB8, 0xF8, 0xD1, 0xA4, 0xE7, 0xAD, /* 0xD8-0xDB */ + 0xC2, 0xE7, 0xBE, 0xF8, 0xBD, 0xCA, 0xCD, 0xB3, /* 0xDC-0xDF */ + 0xE7, 0xAE, 0xE7, 0xAF, 0xBE, 0xEE, 0xD0, 0xE5, /* 0xE0-0xE3 */ + 0xC0, 0x85, 0xCB, 0xE7, 0xCC, 0xD0, 0xBC, 0xCC, /* 0xE4-0xE7 */ + 0xE7, 0xB0, 0xBC, 0xA8, 0xD0, 0xF7, 0xE7, 0xB1, /* 0xE8-0xEB */ + 0xC0, 0x86, 0xD0, 0xF8, 0xE7, 0xB2, 0xE7, 0xB3, /* 0xEC-0xEF */ + 0xB4, 0xC2, 0xE7, 0xB4, 0xE7, 0xB5, 0xC9, 0xFE, /* 0xF0-0xF3 */ + 0xCE, 0xAC, 0xC3, 0xE0, 0xE7, 0xB7, 0xB1, 0xC1, /* 0xF4-0xF7 */ + 0xB3, 0xF1, 0xC0, 0x87, 0xE7, 0xB8, 0xE7, 0xB9, /* 0xF8-0xFB */ + 0xD7, 0xDB, 0xD5, 0xC0, 0xE7, 0xBA, 0xC2, 0xCC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7F[512] = { + 0xD7, 0xBA, 0xE7, 0xBB, 0xE7, 0xBC, 0xE7, 0xBD, /* 0x00-0x03 */ + 0xBC, 0xEA, 0xC3, 0xE5, 0xC0, 0xC2, 0xE7, 0xBE, /* 0x04-0x07 */ + 0xE7, 0xBF, 0xBC, 0xA9, 0xC0, 0x88, 0xE7, 0xC0, /* 0x08-0x0B */ + 0xE7, 0xC1, 0xE7, 0xB6, 0xB6, 0xD0, 0xE7, 0xC2, /* 0x0C-0x0F */ + 0xC0, 0x89, 0xE7, 0xC3, 0xE7, 0xC4, 0xBB, 0xBA, /* 0x10-0x13 */ + 0xB5, 0xDE, 0xC2, 0xC6, 0xB1, 0xE0, 0xE7, 0xC5, /* 0x14-0x17 */ + 0xD4, 0xB5, 0xE7, 0xC6, 0xB8, 0xBF, 0xE7, 0xC8, /* 0x18-0x1B */ + 0xE7, 0xC7, 0xB7, 0xEC, 0xC0, 0x8A, 0xE7, 0xC9, /* 0x1C-0x1F */ + 0xB2, 
0xF8, 0xE7, 0xCA, 0xE7, 0xCB, 0xE7, 0xCC, /* 0x20-0x23 */ + 0xE7, 0xCD, 0xE7, 0xCE, 0xE7, 0xCF, 0xE7, 0xD0, /* 0x24-0x27 */ + 0xD3, 0xA7, 0xCB, 0xF5, 0xE7, 0xD1, 0xE7, 0xD2, /* 0x28-0x2B */ + 0xE7, 0xD3, 0xE7, 0xD4, 0xC9, 0xC9, 0xE7, 0xD5, /* 0x2C-0x2F */ + 0xE7, 0xD6, 0xE7, 0xD7, 0xE7, 0xD8, 0xE7, 0xD9, /* 0x30-0x33 */ + 0xBD, 0xC9, 0xE7, 0xDA, 0xF3, 0xBE, 0xC0, 0x8B, /* 0x34-0x37 */ + 0xB8, 0xD7, 0xC0, 0x8C, 0xC8, 0xB1, 0xC0, 0x8D, /* 0x38-0x3B */ + 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, 0xC0, 0x91, /* 0x3C-0x3F */ + 0xC0, 0x92, 0xC0, 0x93, 0xF3, 0xBF, 0xC0, 0x94, /* 0x40-0x43 */ + 0xF3, 0xC0, 0xF3, 0xC1, 0xC0, 0x95, 0xC0, 0x96, /* 0x44-0x47 */ + 0xC0, 0x97, 0xC0, 0x98, 0xC0, 0x99, 0xC0, 0x9A, /* 0x48-0x4B */ + 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, 0xC0, 0x9E, /* 0x4C-0x4F */ + 0xB9, 0xDE, 0xCD, 0xF8, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x50-0x53 */ + 0xD8, 0xE8, 0xBA, 0xB1, 0xC1, 0x40, 0xC2, 0xDE, /* 0x54-0x57 */ + 0xEE, 0xB7, 0xC1, 0x41, 0xB7, 0xA3, 0xC1, 0x42, /* 0x58-0x5B */ + 0xC1, 0x43, 0xC1, 0x44, 0xC1, 0x45, 0xEE, 0xB9, /* 0x5C-0x5F */ + 0xC1, 0x46, 0xEE, 0xB8, 0xB0, 0xD5, 0xC1, 0x47, /* 0x60-0x63 */ + 0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x64-0x67 */ + 0xEE, 0xBB, 0xD5, 0xD6, 0xD7, 0xEF, 0xC1, 0x4C, /* 0x68-0x6B */ + 0xC1, 0x4D, 0xC1, 0x4E, 0xD6, 0xC3, 0xC1, 0x4F, /* 0x6C-0x6F */ + 0xC1, 0x50, 0xEE, 0xBD, 0xCA, 0xF0, 0xC1, 0x51, /* 0x70-0x73 */ + 0xEE, 0xBC, 0xC1, 0x52, 0xC1, 0x53, 0xC1, 0x54, /* 0x74-0x77 */ + 0xC1, 0x55, 0xEE, 0xBE, 0xC1, 0x56, 0xC1, 0x57, /* 0x78-0x7B */ + 0xC1, 0x58, 0xC1, 0x59, 0xEE, 0xC0, 0xC1, 0x5A, /* 0x7C-0x7F */ + + 0xC1, 0x5B, 0xEE, 0xBF, 0xC1, 0x5C, 0xC1, 0x5D, /* 0x80-0x83 */ + 0xC1, 0x5E, 0xC1, 0x5F, 0xC1, 0x60, 0xC1, 0x61, /* 0x84-0x87 */ + 0xC1, 0x62, 0xC1, 0x63, 0xD1, 0xF2, 0xC1, 0x64, /* 0x88-0x8B */ + 0xC7, 0xBC, 0xC1, 0x65, 0xC3, 0xC0, 0xC1, 0x66, /* 0x8C-0x8F */ + 0xC1, 0x67, 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, /* 0x90-0x93 */ + 0xB8, 0xE1, 0xC1, 0x6B, 0xC1, 0x6C, 0xC1, 0x6D, /* 0x94-0x97 */ + 0xC1, 0x6E, 0xC1, 0x6F, 0xC1, 0xE7, 0xC1, 0x70, /* 0x98-0x9B */ + 0xC1, 0x71, 0xF4, 0xC6, 0xD0, 0xDF, 0xF4, 0xC7, /* 0x9C-0x9F */ + 0xC1, 0x72, 0xCF, 0xDB, 0xC1, 0x73, 0xC1, 0x74, /* 0xA0-0xA3 */ + 0xC8, 0xBA, 0xC1, 0x75, 0xC1, 0x76, 0xF4, 0xC8, /* 0xA4-0xA7 */ + 0xC1, 0x77, 0xC1, 0x78, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA8-0xAB */ + 0xC1, 0x7B, 0xC1, 0x7C, 0xC1, 0x7D, 0xF4, 0xC9, /* 0xAC-0xAF */ + 0xF4, 0xCA, 0xC1, 0x7E, 0xF4, 0xCB, 0xC1, 0x80, /* 0xB0-0xB3 */ + 0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xB4-0xB7 */ + 0xD9, 0xFA, 0xB8, 0xFE, 0xC1, 0x85, 0xC1, 0x86, /* 0xB8-0xBB */ + 0xE5, 0xF1, 0xD3, 0xF0, 0xC1, 0x87, 0xF4, 0xE0, /* 0xBC-0xBF */ + 0xC1, 0x88, 0xCE, 0xCC, 0xC1, 0x89, 0xC1, 0x8A, /* 0xC0-0xC3 */ + 0xC1, 0x8B, 0xB3, 0xE1, 0xC1, 0x8C, 0xC1, 0x8D, /* 0xC4-0xC7 */ + 0xC1, 0x8E, 0xC1, 0x8F, 0xF1, 0xB4, 0xC1, 0x90, /* 0xC8-0xCB */ + 0xD2, 0xEE, 0xC1, 0x91, 0xF4, 0xE1, 0xC1, 0x92, /* 0xCC-0xCF */ + 0xC1, 0x93, 0xC1, 0x94, 0xC1, 0x95, 0xC1, 0x96, /* 0xD0-0xD3 */ + 0xCF, 0xE8, 0xF4, 0xE2, 0xC1, 0x97, 0xC1, 0x98, /* 0xD4-0xD7 */ + 0xC7, 0xCC, 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, /* 0xD8-0xDB */ + 0xC1, 0x9C, 0xC1, 0x9D, 0xC1, 0x9E, 0xB5, 0xD4, /* 0xDC-0xDF */ + 0xB4, 0xE4, 0xF4, 0xE4, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xE0-0xE3 */ + 0xC2, 0x40, 0xF4, 0xE3, 0xF4, 0xE5, 0xC2, 0x41, /* 0xE4-0xE7 */ + 0xC2, 0x42, 0xF4, 0xE6, 0xC2, 0x43, 0xC2, 0x44, /* 0xE8-0xEB */ + 0xC2, 0x45, 0xC2, 0x46, 0xF4, 0xE7, 0xC2, 0x47, /* 0xEC-0xEF */ + 0xBA, 0xB2, 0xB0, 0xBF, 0xC2, 0x48, 0xF4, 0xE8, /* 0xF0-0xF3 */ + 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x4C, /* 0xF4-0xF7 
*/ + 0xC2, 0x4D, 0xC2, 0x4E, 0xC2, 0x4F, 0xB7, 0xAD, /* 0xF8-0xFB */ + 0xD2, 0xED, 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xD2, 0xAB, 0xC0, 0xCF, 0xC2, 0x53, 0xBF, 0xBC, /* 0x00-0x03 */ + 0xEB, 0xA3, 0xD5, 0xDF, 0xEA, 0xC8, 0xC2, 0x54, /* 0x04-0x07 */ + 0xC2, 0x55, 0xC2, 0x56, 0xC2, 0x57, 0xF1, 0xF3, /* 0x08-0x0B */ + 0xB6, 0xF8, 0xCB, 0xA3, 0xC2, 0x58, 0xC2, 0x59, /* 0x0C-0x0F */ + 0xC4, 0xCD, 0xC2, 0x5A, 0xF1, 0xE7, 0xC2, 0x5B, /* 0x10-0x13 */ + 0xF1, 0xE8, 0xB8, 0xFB, 0xF1, 0xE9, 0xBA, 0xC4, /* 0x14-0x17 */ + 0xD4, 0xC5, 0xB0, 0xD2, 0xC2, 0x5C, 0xC2, 0x5D, /* 0x18-0x1B */ + 0xF1, 0xEA, 0xC2, 0x5E, 0xC2, 0x5F, 0xC2, 0x60, /* 0x1C-0x1F */ + 0xF1, 0xEB, 0xC2, 0x61, 0xF1, 0xEC, 0xC2, 0x62, /* 0x20-0x23 */ + 0xC2, 0x63, 0xF1, 0xED, 0xF1, 0xEE, 0xF1, 0xEF, /* 0x24-0x27 */ + 0xF1, 0xF1, 0xF1, 0xF0, 0xC5, 0xD5, 0xC2, 0x64, /* 0x28-0x2B */ + 0xC2, 0x65, 0xC2, 0x66, 0xC2, 0x67, 0xC2, 0x68, /* 0x2C-0x2F */ + 0xC2, 0x69, 0xF1, 0xF2, 0xC2, 0x6A, 0xB6, 0xFA, /* 0x30-0x33 */ + 0xC2, 0x6B, 0xF1, 0xF4, 0xD2, 0xAE, 0xDE, 0xC7, /* 0x34-0x37 */ + 0xCB, 0xCA, 0xC2, 0x6C, 0xC2, 0x6D, 0xB3, 0xDC, /* 0x38-0x3B */ + 0xC2, 0x6E, 0xB5, 0xA2, 0xC2, 0x6F, 0xB9, 0xA2, /* 0x3C-0x3F */ + 0xC2, 0x70, 0xC2, 0x71, 0xC4, 0xF4, 0xF1, 0xF5, /* 0x40-0x43 */ + 0xC2, 0x72, 0xC2, 0x73, 0xF1, 0xF6, 0xC2, 0x74, /* 0x44-0x47 */ + 0xC2, 0x75, 0xC2, 0x76, 0xC1, 0xC4, 0xC1, 0xFB, /* 0x48-0x4B */ + 0xD6, 0xB0, 0xF1, 0xF7, 0xC2, 0x77, 0xC2, 0x78, /* 0x4C-0x4F */ + 0xC2, 0x79, 0xC2, 0x7A, 0xF1, 0xF8, 0xC2, 0x7B, /* 0x50-0x53 */ + 0xC1, 0xAA, 0xC2, 0x7C, 0xC2, 0x7D, 0xC2, 0x7E, /* 0x54-0x57 */ + 0xC6, 0xB8, 0xC2, 0x80, 0xBE, 0xDB, 0xC2, 0x81, /* 0x58-0x5B */ + 0xC2, 0x82, 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, /* 0x5C-0x5F */ + 0xC2, 0x86, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x60-0x63 */ + 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, 0xC2, 0x8D, /* 0x64-0x67 */ + 0xC2, 0x8E, 0xF1, 0xF9, 0xB4, 0xCF, 0xC2, 0x8F, /* 0x68-0x6B */ + 0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x6C-0x6F */ + 0xC2, 0x94, 0xF1, 0xFA, 0xC2, 0x95, 0xC2, 0x96, /* 0x70-0x73 */ + 0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x74-0x77 */ + 0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x78-0x7B */ + 0xC2, 0x9F, 0xC2, 0xA0, 0xC3, 0x40, 0xED, 0xB2, /* 0x7C-0x7F */ + + 0xED, 0xB1, 0xC3, 0x41, 0xC3, 0x42, 0xCB, 0xE0, /* 0x80-0x83 */ + 0xD2, 0xDE, 0xC3, 0x43, 0xCB, 0xC1, 0xD5, 0xD8, /* 0x84-0x87 */ + 0xC3, 0x44, 0xC8, 0xE2, 0xC3, 0x45, 0xC0, 0xDF, /* 0x88-0x8B */ + 0xBC, 0xA1, 0xC3, 0x46, 0xC3, 0x47, 0xC3, 0x48, /* 0x8C-0x8F */ + 0xC3, 0x49, 0xC3, 0x4A, 0xC3, 0x4B, 0xEB, 0xC1, /* 0x90-0x93 */ + 0xC3, 0x4C, 0xC3, 0x4D, 0xD0, 0xA4, 0xC3, 0x4E, /* 0x94-0x97 */ + 0xD6, 0xE2, 0xC3, 0x4F, 0xB6, 0xC7, 0xB8, 0xD8, /* 0x98-0x9B */ + 0xEB, 0xC0, 0xB8, 0xCE, 0xC3, 0x50, 0xEB, 0xBF, /* 0x9C-0x9F */ + 0xB3, 0xA6, 0xB9, 0xC9, 0xD6, 0xAB, 0xC3, 0x51, /* 0xA0-0xA3 */ + 0xB7, 0xF4, 0xB7, 0xCA, 0xC3, 0x52, 0xC3, 0x53, /* 0xA4-0xA7 */ + 0xC3, 0x54, 0xBC, 0xE7, 0xB7, 0xBE, 0xEB, 0xC6, /* 0xA8-0xAB */ + 0xC3, 0x55, 0xEB, 0xC7, 0xB0, 0xB9, 0xBF, 0xCF, /* 0xAC-0xAF */ + 0xC3, 0x56, 0xEB, 0xC5, 0xD3, 0xFD, 0xC3, 0x57, /* 0xB0-0xB3 */ + 0xEB, 0xC8, 0xC3, 0x58, 0xC3, 0x59, 0xEB, 0xC9, /* 0xB4-0xB7 */ + 0xC3, 0x5A, 0xC3, 0x5B, 0xB7, 0xCE, 0xC3, 0x5C, /* 0xB8-0xBB */ + 0xEB, 0xC2, 0xEB, 0xC4, 0xC9, 0xF6, 0xD6, 0xD7, /* 0xBC-0xBF */ + 0xD5, 0xCD, 0xD0, 0xB2, 0xEB, 0xCF, 0xCE, 0xB8, /* 0xC0-0xC3 */ + 0xEB, 0xD0, 0xC3, 0x5D, 0xB5, 0xA8, 0xC3, 0x5E, /* 0xC4-0xC7 */ + 0xC3, 0x5F, 0xC3, 0x60, 0xC3, 0x61, 0xC3, 0x62, /* 0xC8-0xCB */ + 
0xB1, 0xB3, 0xEB, 0xD2, 0xCC, 0xA5, 0xC3, 0x63, /* 0xCC-0xCF */ + 0xC3, 0x64, 0xC3, 0x65, 0xC3, 0x66, 0xC3, 0x67, /* 0xD0-0xD3 */ + 0xC3, 0x68, 0xC3, 0x69, 0xC5, 0xD6, 0xEB, 0xD3, /* 0xD4-0xD7 */ + 0xC3, 0x6A, 0xEB, 0xD1, 0xC5, 0xDF, 0xEB, 0xCE, /* 0xD8-0xDB */ + 0xCA, 0xA4, 0xEB, 0xD5, 0xB0, 0xFB, 0xC3, 0x6B, /* 0xDC-0xDF */ + 0xC3, 0x6C, 0xBA, 0xFA, 0xC3, 0x6D, 0xC3, 0x6E, /* 0xE0-0xE3 */ + 0xD8, 0xB7, 0xF1, 0xE3, 0xC3, 0x6F, 0xEB, 0xCA, /* 0xE4-0xE7 */ + 0xEB, 0xCB, 0xEB, 0xCC, 0xEB, 0xCD, 0xEB, 0xD6, /* 0xE8-0xEB */ + 0xE6, 0xC0, 0xEB, 0xD9, 0xC3, 0x70, 0xBF, 0xE8, /* 0xEC-0xEF */ + 0xD2, 0xC8, 0xEB, 0xD7, 0xEB, 0xDC, 0xB8, 0xEC, /* 0xF0-0xF3 */ + 0xEB, 0xD8, 0xC3, 0x71, 0xBD, 0xBA, 0xC3, 0x72, /* 0xF4-0xF7 */ + 0xD0, 0xD8, 0xC3, 0x73, 0xB0, 0xB7, 0xC3, 0x74, /* 0xF8-0xFB */ + 0xEB, 0xDD, 0xC4, 0xDC, 0xC3, 0x75, 0xC3, 0x76, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0xC3, 0x77, 0xC3, 0x78, 0xD6, 0xAC, 0xC3, 0x79, /* 0x00-0x03 */ + 0xC3, 0x7A, 0xC3, 0x7B, 0xB4, 0xE0, 0xC3, 0x7C, /* 0x04-0x07 */ + 0xC3, 0x7D, 0xC2, 0xF6, 0xBC, 0xB9, 0xC3, 0x7E, /* 0x08-0x0B */ + 0xC3, 0x80, 0xEB, 0xDA, 0xEB, 0xDB, 0xD4, 0xE0, /* 0x0C-0x0F */ + 0xC6, 0xEA, 0xC4, 0xD4, 0xEB, 0xDF, 0xC5, 0xA7, /* 0x10-0x13 */ + 0xD9, 0xF5, 0xC3, 0x81, 0xB2, 0xB1, 0xC3, 0x82, /* 0x14-0x17 */ + 0xEB, 0xE4, 0xC3, 0x83, 0xBD, 0xC5, 0xC3, 0x84, /* 0x18-0x1B */ + 0xC3, 0x85, 0xC3, 0x86, 0xEB, 0xE2, 0xC3, 0x87, /* 0x1C-0x1F */ + 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x20-0x23 */ + 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, 0xC3, 0x8F, /* 0x24-0x27 */ + 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, 0xC3, 0x93, /* 0x28-0x2B */ + 0xEB, 0xE3, 0xC3, 0x94, 0xC3, 0x95, 0xB8, 0xAC, /* 0x2C-0x2F */ + 0xC3, 0x96, 0xCD, 0xD1, 0xEB, 0xE5, 0xC3, 0x97, /* 0x30-0x33 */ + 0xC3, 0x98, 0xC3, 0x99, 0xEB, 0xE1, 0xC3, 0x9A, /* 0x34-0x37 */ + 0xC1, 0xB3, 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, /* 0x38-0x3B */ + 0xC3, 0x9E, 0xC3, 0x9F, 0xC6, 0xA2, 0xC3, 0xA0, /* 0x3C-0x3F */ + 0xC4, 0x40, 0xC4, 0x41, 0xC4, 0x42, 0xC4, 0x43, /* 0x40-0x43 */ + 0xC4, 0x44, 0xC4, 0x45, 0xCC, 0xF3, 0xC4, 0x46, /* 0x44-0x47 */ + 0xEB, 0xE6, 0xC4, 0x47, 0xC0, 0xB0, 0xD2, 0xB8, /* 0x48-0x4B */ + 0xEB, 0xE7, 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, /* 0x4C-0x4F */ + 0xB8, 0xAF, 0xB8, 0xAD, 0xC4, 0x4B, 0xEB, 0xE8, /* 0x50-0x53 */ + 0xC7, 0xBB, 0xCD, 0xF3, 0xC4, 0x4C, 0xC4, 0x4D, /* 0x54-0x57 */ + 0xC4, 0x4E, 0xEB, 0xEA, 0xEB, 0xEB, 0xC4, 0x4F, /* 0x58-0x5B */ + 0xC4, 0x50, 0xC4, 0x51, 0xC4, 0x52, 0xC4, 0x53, /* 0x5C-0x5F */ + 0xEB, 0xED, 0xC4, 0x54, 0xC4, 0x55, 0xC4, 0x56, /* 0x60-0x63 */ + 0xC4, 0x57, 0xD0, 0xC8, 0xC4, 0x58, 0xEB, 0xF2, /* 0x64-0x67 */ + 0xC4, 0x59, 0xEB, 0xEE, 0xC4, 0x5A, 0xC4, 0x5B, /* 0x68-0x6B */ + 0xC4, 0x5C, 0xEB, 0xF1, 0xC8, 0xF9, 0xC4, 0x5D, /* 0x6C-0x6F */ + 0xD1, 0xFC, 0xEB, 0xEC, 0xC4, 0x5E, 0xC4, 0x5F, /* 0x70-0x73 */ + 0xEB, 0xE9, 0xC4, 0x60, 0xC4, 0x61, 0xC4, 0x62, /* 0x74-0x77 */ + 0xC4, 0x63, 0xB8, 0xB9, 0xCF, 0xD9, 0xC4, 0xE5, /* 0x78-0x7B */ + 0xEB, 0xEF, 0xEB, 0xF0, 0xCC, 0xDA, 0xCD, 0xC8, /* 0x7C-0x7F */ + + 0xB0, 0xF2, 0xC4, 0x64, 0xEB, 0xF6, 0xC4, 0x65, /* 0x80-0x83 */ + 0xC4, 0x66, 0xC4, 0x67, 0xC4, 0x68, 0xC4, 0x69, /* 0x84-0x87 */ + 0xEB, 0xF5, 0xC4, 0x6A, 0xB2, 0xB2, 0xC4, 0x6B, /* 0x88-0x8B */ + 0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xB8, 0xE0, /* 0x8C-0x8F */ + 0xC4, 0x6F, 0xEB, 0xF7, 0xC4, 0x70, 0xC4, 0x71, /* 0x90-0x93 */ + 0xC4, 0x72, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0x94-0x97 */ + 0xB1, 0xEC, 0xC4, 0x76, 0xC4, 0x77, 0xCC, 0xC5, /* 0x98-0x9B */ + 0xC4, 0xA4, 0xCF, 0xA5, 0xC4, 0x78, 0xC4, 0x79, /* 0x9C-0x9F */ + 
0xC4, 0x7A, 0xC4, 0x7B, 0xC4, 0x7C, 0xEB, 0xF9, /* 0xA0-0xA3 */ + 0xC4, 0x7D, 0xC4, 0x7E, 0xEC, 0xA2, 0xC4, 0x80, /* 0xA4-0xA7 */ + 0xC5, 0xF2, 0xC4, 0x81, 0xEB, 0xFA, 0xC4, 0x82, /* 0xA8-0xAB */ + 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, 0xC4, 0x86, /* 0xAC-0xAF */ + 0xC4, 0x87, 0xC4, 0x88, 0xC4, 0x89, 0xC9, 0xC5, /* 0xB0-0xB3 */ + 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, 0xC4, 0x8D, /* 0xB4-0xB7 */ + 0xC4, 0x8E, 0xC4, 0x8F, 0xE2, 0xDF, 0xEB, 0xFE, /* 0xB8-0xBB */ + 0xC4, 0x90, 0xC4, 0x91, 0xC4, 0x92, 0xC4, 0x93, /* 0xBC-0xBF */ + 0xCD, 0xCE, 0xEC, 0xA1, 0xB1, 0xDB, 0xD3, 0xB7, /* 0xC0-0xC3 */ + 0xC4, 0x94, 0xC4, 0x95, 0xD2, 0xDC, 0xC4, 0x96, /* 0xC4-0xC7 */ + 0xC4, 0x97, 0xC4, 0x98, 0xEB, 0xFD, 0xC4, 0x99, /* 0xC8-0xCB */ + 0xEB, 0xFB, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0xCC-0xCF */ + 0xC4, 0x9D, 0xC4, 0x9E, 0xC4, 0x9F, 0xC4, 0xA0, /* 0xD0-0xD3 */ + 0xC5, 0x40, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0xD4-0xD7 */ + 0xC5, 0x44, 0xC5, 0x45, 0xC5, 0x46, 0xC5, 0x47, /* 0xD8-0xDB */ + 0xC5, 0x48, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0xDC-0xDF */ + 0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xB3, 0xBC, /* 0xE0-0xE3 */ + 0xC5, 0x4F, 0xC5, 0x50, 0xC5, 0x51, 0xEA, 0xB0, /* 0xE4-0xE7 */ + 0xC5, 0x52, 0xC5, 0x53, 0xD7, 0xD4, 0xC5, 0x54, /* 0xE8-0xEB */ + 0xF4, 0xAB, 0xB3, 0xF4, 0xC5, 0x55, 0xC5, 0x56, /* 0xEC-0xEF */ + 0xC5, 0x57, 0xC5, 0x58, 0xC5, 0x59, 0xD6, 0xC1, /* 0xF0-0xF3 */ + 0xD6, 0xC2, 0xC5, 0x5A, 0xC5, 0x5B, 0xC5, 0x5C, /* 0xF4-0xF7 */ + 0xC5, 0x5D, 0xC5, 0x5E, 0xC5, 0x5F, 0xD5, 0xE9, /* 0xF8-0xFB */ + 0xBE, 0xCA, 0xC5, 0x60, 0xF4, 0xA7, 0xC5, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0xD2, 0xA8, 0xF4, 0xA8, 0xF4, 0xA9, 0xC5, 0x62, /* 0x00-0x03 */ + 0xF4, 0xAA, 0xBE, 0xCB, 0xD3, 0xDF, 0xC5, 0x63, /* 0x04-0x07 */ + 0xC5, 0x64, 0xC5, 0x65, 0xC5, 0x66, 0xC5, 0x67, /* 0x08-0x0B */ + 0xC9, 0xE0, 0xC9, 0xE1, 0xC5, 0x68, 0xC5, 0x69, /* 0x0C-0x0F */ + 0xF3, 0xC2, 0xC5, 0x6A, 0xCA, 0xE6, 0xC5, 0x6B, /* 0x10-0x13 */ + 0xCC, 0xF2, 0xC5, 0x6C, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x14-0x17 */ + 0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xE2, 0xB6, /* 0x18-0x1B */ + 0xCB, 0xB4, 0xC5, 0x72, 0xCE, 0xE8, 0xD6, 0xDB, /* 0x1C-0x1F */ + 0xC5, 0x73, 0xF4, 0xAD, 0xF4, 0xAE, 0xF4, 0xAF, /* 0x20-0x23 */ + 0xC5, 0x74, 0xC5, 0x75, 0xC5, 0x76, 0xC5, 0x77, /* 0x24-0x27 */ + 0xF4, 0xB2, 0xC5, 0x78, 0xBA, 0xBD, 0xF4, 0xB3, /* 0x28-0x2B */ + 0xB0, 0xE3, 0xF4, 0xB0, 0xC5, 0x79, 0xF4, 0xB1, /* 0x2C-0x2F */ + 0xBD, 0xA2, 0xB2, 0xD5, 0xC5, 0x7A, 0xF4, 0xB6, /* 0x30-0x33 */ + 0xF4, 0xB7, 0xB6, 0xE6, 0xB2, 0xB0, 0xCF, 0xCF, /* 0x34-0x37 */ + 0xF4, 0xB4, 0xB4, 0xAC, 0xC5, 0x7B, 0xF4, 0xB5, /* 0x38-0x3B */ + 0xC5, 0x7C, 0xC5, 0x7D, 0xF4, 0xB8, 0xC5, 0x7E, /* 0x3C-0x3F */ + 0xC5, 0x80, 0xC5, 0x81, 0xC5, 0x82, 0xC5, 0x83, /* 0x40-0x43 */ + 0xF4, 0xB9, 0xC5, 0x84, 0xC5, 0x85, 0xCD, 0xA7, /* 0x44-0x47 */ + 0xC5, 0x86, 0xF4, 0xBA, 0xC5, 0x87, 0xF4, 0xBB, /* 0x48-0x4B */ + 0xC5, 0x88, 0xC5, 0x89, 0xC5, 0x8A, 0xF4, 0xBC, /* 0x4C-0x4F */ + 0xC5, 0x8B, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x50-0x53 */ + 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, 0xC5, 0x92, /* 0x54-0x57 */ + 0xCB, 0xD2, 0xC5, 0x93, 0xF4, 0xBD, 0xC5, 0x94, /* 0x58-0x5B */ + 0xC5, 0x95, 0xC5, 0x96, 0xC5, 0x97, 0xF4, 0xBE, /* 0x5C-0x5F */ + 0xC5, 0x98, 0xC5, 0x99, 0xC5, 0x9A, 0xC5, 0x9B, /* 0x60-0x63 */ + 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, 0xC5, 0x9F, /* 0x64-0x67 */ + 0xF4, 0xBF, 0xC5, 0xA0, 0xC6, 0x40, 0xC6, 0x41, /* 0x68-0x6B */ + 0xC6, 0x42, 0xC6, 0x43, 0xF4, 0xDE, 0xC1, 0xBC, /* 0x6C-0x6F */ + 0xBC, 0xE8, 0xC6, 0x44, 0xC9, 0xAB, 0xD1, 0xDE, /* 0x70-0x73 */ + 0xE5, 
0xF5, 0xC6, 0x45, 0xC6, 0x46, 0xC6, 0x47, /* 0x74-0x77 */ + 0xC6, 0x48, 0xDC, 0xB3, 0xD2, 0xD5, 0xC6, 0x49, /* 0x78-0x7B */ + 0xC6, 0x4A, 0xDC, 0xB4, 0xB0, 0xAC, 0xDC, 0xB5, /* 0x7C-0x7F */ + + 0xC6, 0x4B, 0xC6, 0x4C, 0xBD, 0xDA, 0xC6, 0x4D, /* 0x80-0x83 */ + 0xDC, 0xB9, 0xC6, 0x4E, 0xC6, 0x4F, 0xC6, 0x50, /* 0x84-0x87 */ + 0xD8, 0xC2, 0xC6, 0x51, 0xDC, 0xB7, 0xD3, 0xF3, /* 0x88-0x8B */ + 0xC6, 0x52, 0xC9, 0xD6, 0xDC, 0xBA, 0xDC, 0xB6, /* 0x8C-0x8F */ + 0xC6, 0x53, 0xDC, 0xBB, 0xC3, 0xA2, 0xC6, 0x54, /* 0x90-0x93 */ + 0xC6, 0x55, 0xC6, 0x56, 0xC6, 0x57, 0xDC, 0xBC, /* 0x94-0x97 */ + 0xDC, 0xC5, 0xDC, 0xBD, 0xC6, 0x58, 0xC6, 0x59, /* 0x98-0x9B */ + 0xCE, 0xDF, 0xD6, 0xA5, 0xC6, 0x5A, 0xDC, 0xCF, /* 0x9C-0x9F */ + 0xC6, 0x5B, 0xDC, 0xCD, 0xC6, 0x5C, 0xC6, 0x5D, /* 0xA0-0xA3 */ + 0xDC, 0xD2, 0xBD, 0xE6, 0xC2, 0xAB, 0xC6, 0x5E, /* 0xA4-0xA7 */ + 0xDC, 0xB8, 0xDC, 0xCB, 0xDC, 0xCE, 0xDC, 0xBE, /* 0xA8-0xAB */ + 0xB7, 0xD2, 0xB0, 0xC5, 0xDC, 0xC7, 0xD0, 0xBE, /* 0xAC-0xAF */ + 0xDC, 0xC1, 0xBB, 0xA8, 0xC6, 0x5F, 0xB7, 0xBC, /* 0xB0-0xB3 */ + 0xDC, 0xCC, 0xC6, 0x60, 0xC6, 0x61, 0xDC, 0xC6, /* 0xB4-0xB7 */ + 0xDC, 0xBF, 0xC7, 0xDB, 0xC6, 0x62, 0xC6, 0x63, /* 0xB8-0xBB */ + 0xC6, 0x64, 0xD1, 0xBF, 0xDC, 0xC0, 0xC6, 0x65, /* 0xBC-0xBF */ + 0xC6, 0x66, 0xDC, 0xCA, 0xC6, 0x67, 0xC6, 0x68, /* 0xC0-0xC3 */ + 0xDC, 0xD0, 0xC6, 0x69, 0xC6, 0x6A, 0xCE, 0xAD, /* 0xC4-0xC7 */ + 0xDC, 0xC2, 0xC6, 0x6B, 0xDC, 0xC3, 0xDC, 0xC8, /* 0xC8-0xCB */ + 0xDC, 0xC9, 0xB2, 0xD4, 0xDC, 0xD1, 0xCB, 0xD5, /* 0xCC-0xCF */ + 0xC6, 0x6C, 0xD4, 0xB7, 0xDC, 0xDB, 0xDC, 0xDF, /* 0xD0-0xD3 */ + 0xCC, 0xA6, 0xDC, 0xE6, 0xC6, 0x6D, 0xC3, 0xE7, /* 0xD4-0xD7 */ + 0xDC, 0xDC, 0xC6, 0x6E, 0xC6, 0x6F, 0xBF, 0xC1, /* 0xD8-0xDB */ + 0xDC, 0xD9, 0xC6, 0x70, 0xB0, 0xFA, 0xB9, 0xB6, /* 0xDC-0xDF */ + 0xDC, 0xE5, 0xDC, 0xD3, 0xC6, 0x71, 0xDC, 0xC4, /* 0xE0-0xE3 */ + 0xDC, 0xD6, 0xC8, 0xF4, 0xBF, 0xE0, 0xC6, 0x72, /* 0xE4-0xE7 */ + 0xC6, 0x73, 0xC6, 0x74, 0xC6, 0x75, 0xC9, 0xBB, /* 0xE8-0xEB */ + 0xC6, 0x76, 0xC6, 0x77, 0xC6, 0x78, 0xB1, 0xBD, /* 0xEC-0xEF */ + 0xC6, 0x79, 0xD3, 0xA2, 0xC6, 0x7A, 0xC6, 0x7B, /* 0xF0-0xF3 */ + 0xDC, 0xDA, 0xC6, 0x7C, 0xC6, 0x7D, 0xDC, 0xD5, /* 0xF4-0xF7 */ + 0xC6, 0x7E, 0xC6, 0xBB, 0xC6, 0x80, 0xDC, 0xDE, /* 0xF8-0xFB */ + 0xC6, 0x81, 0xC6, 0x82, 0xC6, 0x83, 0xC6, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0xC6, 0x85, 0xD7, 0xC2, 0xC3, 0xAF, 0xB7, 0xB6, /* 0x00-0x03 */ + 0xC7, 0xD1, 0xC3, 0xA9, 0xDC, 0xE2, 0xDC, 0xD8, /* 0x04-0x07 */ + 0xDC, 0xEB, 0xDC, 0xD4, 0xC6, 0x86, 0xC6, 0x87, /* 0x08-0x0B */ + 0xDC, 0xDD, 0xC6, 0x88, 0xBE, 0xA5, 0xDC, 0xD7, /* 0x0C-0x0F */ + 0xC6, 0x89, 0xDC, 0xE0, 0xC6, 0x8A, 0xC6, 0x8B, /* 0x10-0x13 */ + 0xDC, 0xE3, 0xDC, 0xE4, 0xC6, 0x8C, 0xDC, 0xF8, /* 0x14-0x17 */ + 0xC6, 0x8D, 0xC6, 0x8E, 0xDC, 0xE1, 0xDD, 0xA2, /* 0x18-0x1B */ + 0xDC, 0xE7, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x91, /* 0x1C-0x1F */ + 0xC6, 0x92, 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x95, /* 0x20-0x23 */ + 0xC6, 0x96, 0xC6, 0x97, 0xC6, 0x98, 0xBC, 0xEB, /* 0x24-0x27 */ + 0xB4, 0xC4, 0xC6, 0x99, 0xC6, 0x9A, 0xC3, 0xA3, /* 0x28-0x2B */ + 0xB2, 0xE7, 0xDC, 0xFA, 0xC6, 0x9B, 0xDC, 0xF2, /* 0x2C-0x2F */ + 0xC6, 0x9C, 0xDC, 0xEF, 0xC6, 0x9D, 0xDC, 0xFC, /* 0x30-0x33 */ + 0xDC, 0xEE, 0xD2, 0xF0, 0xB2, 0xE8, 0xC6, 0x9E, /* 0x34-0x37 */ + 0xC8, 0xD7, 0xC8, 0xE3, 0xDC, 0xFB, 0xC6, 0x9F, /* 0x38-0x3B */ + 0xDC, 0xED, 0xC6, 0xA0, 0xC7, 0x40, 0xC7, 0x41, /* 0x3C-0x3F */ + 0xDC, 0xF7, 0xC7, 0x42, 0xC7, 0x43, 0xDC, 0xF5, /* 0x40-0x43 */ + 0xC7, 0x44, 0xC7, 0x45, 0xBE, 0xA3, 0xDC, 0xF4, /* 0x44-0x47 */ + 0xC7, 
0x46, 0xB2, 0xDD, 0xC7, 0x47, 0xC7, 0x48, /* 0x48-0x4B */ + 0xC7, 0x49, 0xC7, 0x4A, 0xC7, 0x4B, 0xDC, 0xF3, /* 0x4C-0x4F */ + 0xBC, 0xF6, 0xDC, 0xE8, 0xBB, 0xC4, 0xC7, 0x4C, /* 0x50-0x53 */ + 0xC0, 0xF3, 0xC7, 0x4D, 0xC7, 0x4E, 0xC7, 0x4F, /* 0x54-0x57 */ + 0xC7, 0x50, 0xC7, 0x51, 0xBC, 0xD4, 0xDC, 0xE9, /* 0x58-0x5B */ + 0xDC, 0xEA, 0xC7, 0x52, 0xDC, 0xF1, 0xDC, 0xF6, /* 0x5C-0x5F */ + 0xDC, 0xF9, 0xB5, 0xB4, 0xC7, 0x53, 0xC8, 0xD9, /* 0x60-0x63 */ + 0xBB, 0xE7, 0xDC, 0xFE, 0xDC, 0xFD, 0xD3, 0xAB, /* 0x64-0x67 */ + 0xDD, 0xA1, 0xDD, 0xA3, 0xDD, 0xA5, 0xD2, 0xF1, /* 0x68-0x6B */ + 0xDD, 0xA4, 0xDD, 0xA6, 0xDD, 0xA7, 0xD2, 0xA9, /* 0x6C-0x6F */ + 0xC7, 0x54, 0xC7, 0x55, 0xC7, 0x56, 0xC7, 0x57, /* 0x70-0x73 */ + 0xC7, 0x58, 0xC7, 0x59, 0xC7, 0x5A, 0xBA, 0xC9, /* 0x74-0x77 */ + 0xDD, 0xA9, 0xC7, 0x5B, 0xC7, 0x5C, 0xDD, 0xB6, /* 0x78-0x7B */ + 0xDD, 0xB1, 0xDD, 0xB4, 0xC7, 0x5D, 0xC7, 0x5E, /* 0x7C-0x7F */ + + 0xC7, 0x5F, 0xC7, 0x60, 0xC7, 0x61, 0xC7, 0x62, /* 0x80-0x83 */ + 0xC7, 0x63, 0xDD, 0xB0, 0xC6, 0xCE, 0xC7, 0x64, /* 0x84-0x87 */ + 0xC7, 0x65, 0xC0, 0xF2, 0xC7, 0x66, 0xC7, 0x67, /* 0x88-0x8B */ + 0xC7, 0x68, 0xC7, 0x69, 0xC9, 0xAF, 0xC7, 0x6A, /* 0x8C-0x8F */ + 0xC7, 0x6B, 0xC7, 0x6C, 0xDC, 0xEC, 0xDD, 0xAE, /* 0x90-0x93 */ + 0xC7, 0x6D, 0xC7, 0x6E, 0xC7, 0x6F, 0xC7, 0x70, /* 0x94-0x97 */ + 0xDD, 0xB7, 0xC7, 0x71, 0xC7, 0x72, 0xDC, 0xF0, /* 0x98-0x9B */ + 0xDD, 0xAF, 0xC7, 0x73, 0xDD, 0xB8, 0xC7, 0x74, /* 0x9C-0x9F */ + 0xDD, 0xAC, 0xC7, 0x75, 0xC7, 0x76, 0xC7, 0x77, /* 0xA0-0xA3 */ + 0xC7, 0x78, 0xC7, 0x79, 0xC7, 0x7A, 0xC7, 0x7B, /* 0xA4-0xA7 */ + 0xDD, 0xB9, 0xDD, 0xB3, 0xDD, 0xAD, 0xC4, 0xAA, /* 0xA8-0xAB */ + 0xC7, 0x7C, 0xC7, 0x7D, 0xC7, 0x7E, 0xC7, 0x80, /* 0xAC-0xAF */ + 0xDD, 0xA8, 0xC0, 0xB3, 0xC1, 0xAB, 0xDD, 0xAA, /* 0xB0-0xB3 */ + 0xDD, 0xAB, 0xC7, 0x81, 0xDD, 0xB2, 0xBB, 0xF1, /* 0xB4-0xB7 */ + 0xDD, 0xB5, 0xD3, 0xA8, 0xDD, 0xBA, 0xC7, 0x82, /* 0xB8-0xBB */ + 0xDD, 0xBB, 0xC3, 0xA7, 0xC7, 0x83, 0xC7, 0x84, /* 0xBC-0xBF */ + 0xDD, 0xD2, 0xDD, 0xBC, 0xC7, 0x85, 0xC7, 0x86, /* 0xC0-0xC3 */ + 0xC7, 0x87, 0xDD, 0xD1, 0xC7, 0x88, 0xB9, 0xBD, /* 0xC4-0xC7 */ + 0xC7, 0x89, 0xC7, 0x8A, 0xBE, 0xD5, 0xC7, 0x8B, /* 0xC8-0xCB */ + 0xBE, 0xFA, 0xC7, 0x8C, 0xC7, 0x8D, 0xBA, 0xCA, /* 0xCC-0xCF */ + 0xC7, 0x8E, 0xC7, 0x8F, 0xC7, 0x90, 0xC7, 0x91, /* 0xD0-0xD3 */ + 0xDD, 0xCA, 0xC7, 0x92, 0xDD, 0xC5, 0xC7, 0x93, /* 0xD4-0xD7 */ + 0xDD, 0xBF, 0xC7, 0x94, 0xC7, 0x95, 0xC7, 0x96, /* 0xD8-0xDB */ + 0xB2, 0xCB, 0xDD, 0xC3, 0xC7, 0x97, 0xDD, 0xCB, /* 0xDC-0xDF */ + 0xB2, 0xA4, 0xDD, 0xD5, 0xC7, 0x98, 0xC7, 0x99, /* 0xE0-0xE3 */ + 0xC7, 0x9A, 0xDD, 0xBE, 0xC7, 0x9B, 0xC7, 0x9C, /* 0xE4-0xE7 */ + 0xC7, 0x9D, 0xC6, 0xD0, 0xDD, 0xD0, 0xC7, 0x9E, /* 0xE8-0xEB */ + 0xC7, 0x9F, 0xC7, 0xA0, 0xC8, 0x40, 0xC8, 0x41, /* 0xEC-0xEF */ + 0xDD, 0xD4, 0xC1, 0xE2, 0xB7, 0xC6, 0xC8, 0x42, /* 0xF0-0xF3 */ + 0xC8, 0x43, 0xC8, 0x44, 0xC8, 0x45, 0xC8, 0x46, /* 0xF4-0xF7 */ + 0xDD, 0xCE, 0xDD, 0xCF, 0xC8, 0x47, 0xC8, 0x48, /* 0xF8-0xFB */ + 0xC8, 0x49, 0xDD, 0xC4, 0xC8, 0x4A, 0xC8, 0x4B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0xC8, 0x4C, 0xDD, 0xBD, 0xC8, 0x4D, 0xDD, 0xCD, /* 0x00-0x03 */ + 0xCC, 0xD1, 0xC8, 0x4E, 0xDD, 0xC9, 0xC8, 0x4F, /* 0x04-0x07 */ + 0xC8, 0x50, 0xC8, 0x51, 0xC8, 0x52, 0xDD, 0xC2, /* 0x08-0x0B */ + 0xC3, 0xC8, 0xC6, 0xBC, 0xCE, 0xAE, 0xDD, 0xCC, /* 0x0C-0x0F */ + 0xC8, 0x53, 0xDD, 0xC8, 0xC8, 0x54, 0xC8, 0x55, /* 0x10-0x13 */ + 0xC8, 0x56, 0xC8, 0x57, 0xC8, 0x58, 0xC8, 0x59, /* 0x14-0x17 */ + 0xDD, 0xC1, 0xC8, 0x5A, 0xC8, 0x5B, 0xC8, 0x5C, /* 0x18-0x1B */ + 0xDD, 
0xC6, 0xC2, 0xDC, 0xC8, 0x5D, 0xC8, 0x5E, /* 0x1C-0x1F */ + 0xC8, 0x5F, 0xC8, 0x60, 0xC8, 0x61, 0xC8, 0x62, /* 0x20-0x23 */ + 0xD3, 0xA9, 0xD3, 0xAA, 0xDD, 0xD3, 0xCF, 0xF4, /* 0x24-0x27 */ + 0xC8, 0xF8, 0xC8, 0x63, 0xC8, 0x64, 0xC8, 0x65, /* 0x28-0x2B */ + 0xC8, 0x66, 0xC8, 0x67, 0xC8, 0x68, 0xC8, 0x69, /* 0x2C-0x2F */ + 0xC8, 0x6A, 0xDD, 0xE6, 0xC8, 0x6B, 0xC8, 0x6C, /* 0x30-0x33 */ + 0xC8, 0x6D, 0xC8, 0x6E, 0xC8, 0x6F, 0xC8, 0x70, /* 0x34-0x37 */ + 0xDD, 0xC7, 0xC8, 0x71, 0xC8, 0x72, 0xC8, 0x73, /* 0x38-0x3B */ + 0xDD, 0xE0, 0xC2, 0xE4, 0xC8, 0x74, 0xC8, 0x75, /* 0x3C-0x3F */ + 0xC8, 0x76, 0xC8, 0x77, 0xC8, 0x78, 0xC8, 0x79, /* 0x40-0x43 */ + 0xC8, 0x7A, 0xC8, 0x7B, 0xDD, 0xE1, 0xC8, 0x7C, /* 0x44-0x47 */ + 0xC8, 0x7D, 0xC8, 0x7E, 0xC8, 0x80, 0xC8, 0x81, /* 0x48-0x4B */ + 0xC8, 0x82, 0xC8, 0x83, 0xC8, 0x84, 0xC8, 0x85, /* 0x4C-0x4F */ + 0xC8, 0x86, 0xDD, 0xD7, 0xC8, 0x87, 0xC8, 0x88, /* 0x50-0x53 */ + 0xC8, 0x89, 0xC8, 0x8A, 0xC8, 0x8B, 0xD6, 0xF8, /* 0x54-0x57 */ + 0xC8, 0x8C, 0xDD, 0xD9, 0xDD, 0xD8, 0xB8, 0xF0, /* 0x58-0x5B */ + 0xDD, 0xD6, 0xC8, 0x8D, 0xC8, 0x8E, 0xC8, 0x8F, /* 0x5C-0x5F */ + 0xC8, 0x90, 0xC6, 0xCF, 0xC8, 0x91, 0xB6, 0xAD, /* 0x60-0x63 */ + 0xC8, 0x92, 0xC8, 0x93, 0xC8, 0x94, 0xC8, 0x95, /* 0x64-0x67 */ + 0xC8, 0x96, 0xDD, 0xE2, 0xC8, 0x97, 0xBA, 0xF9, /* 0x68-0x6B */ + 0xD4, 0xE1, 0xDD, 0xE7, 0xC8, 0x98, 0xC8, 0x99, /* 0x6C-0x6F */ + 0xC8, 0x9A, 0xB4, 0xD0, 0xC8, 0x9B, 0xDD, 0xDA, /* 0x70-0x73 */ + 0xC8, 0x9C, 0xBF, 0xFB, 0xDD, 0xE3, 0xC8, 0x9D, /* 0x74-0x77 */ + 0xDD, 0xDF, 0xC8, 0x9E, 0xDD, 0xDD, 0xC8, 0x9F, /* 0x78-0x7B */ + 0xC8, 0xA0, 0xC9, 0x40, 0xC9, 0x41, 0xC9, 0x42, /* 0x7C-0x7F */ + + 0xC9, 0x43, 0xC9, 0x44, 0xB5, 0xD9, 0xC9, 0x45, /* 0x80-0x83 */ + 0xC9, 0x46, 0xC9, 0x47, 0xC9, 0x48, 0xDD, 0xDB, /* 0x84-0x87 */ + 0xDD, 0xDC, 0xDD, 0xDE, 0xC9, 0x49, 0xBD, 0xAF, /* 0x88-0x8B */ + 0xDD, 0xE4, 0xC9, 0x4A, 0xDD, 0xE5, 0xC9, 0x4B, /* 0x8C-0x8F */ + 0xC9, 0x4C, 0xC9, 0x4D, 0xC9, 0x4E, 0xC9, 0x4F, /* 0x90-0x93 */ + 0xC9, 0x50, 0xC9, 0x51, 0xC9, 0x52, 0xDD, 0xF5, /* 0x94-0x97 */ + 0xC9, 0x53, 0xC3, 0xC9, 0xC9, 0x54, 0xC9, 0x55, /* 0x98-0x9B */ + 0xCB, 0xE2, 0xC9, 0x56, 0xC9, 0x57, 0xC9, 0x58, /* 0x9C-0x9F */ + 0xC9, 0x59, 0xDD, 0xF2, 0xC9, 0x5A, 0xC9, 0x5B, /* 0xA0-0xA3 */ + 0xC9, 0x5C, 0xC9, 0x5D, 0xC9, 0x5E, 0xC9, 0x5F, /* 0xA4-0xA7 */ + 0xC9, 0x60, 0xC9, 0x61, 0xC9, 0x62, 0xC9, 0x63, /* 0xA8-0xAB */ + 0xC9, 0x64, 0xC9, 0x65, 0xC9, 0x66, 0xD8, 0xE1, /* 0xAC-0xAF */ + 0xC9, 0x67, 0xC9, 0x68, 0xC6, 0xD1, 0xC9, 0x69, /* 0xB0-0xB3 */ + 0xDD, 0xF4, 0xC9, 0x6A, 0xC9, 0x6B, 0xC9, 0x6C, /* 0xB4-0xB7 */ + 0xD5, 0xF4, 0xDD, 0xF3, 0xDD, 0xF0, 0xC9, 0x6D, /* 0xB8-0xBB */ + 0xC9, 0x6E, 0xDD, 0xEC, 0xC9, 0x6F, 0xDD, 0xEF, /* 0xBC-0xBF */ + 0xC9, 0x70, 0xDD, 0xE8, 0xC9, 0x71, 0xC9, 0x72, /* 0xC0-0xC3 */ + 0xD0, 0xEE, 0xC9, 0x73, 0xC9, 0x74, 0xC9, 0x75, /* 0xC4-0xC7 */ + 0xC9, 0x76, 0xC8, 0xD8, 0xDD, 0xEE, 0xC9, 0x77, /* 0xC8-0xCB */ + 0xC9, 0x78, 0xDD, 0xE9, 0xC9, 0x79, 0xC9, 0x7A, /* 0xCC-0xCF */ + 0xDD, 0xEA, 0xCB, 0xF2, 0xC9, 0x7B, 0xDD, 0xED, /* 0xD0-0xD3 */ + 0xC9, 0x7C, 0xC9, 0x7D, 0xB1, 0xCD, 0xC9, 0x7E, /* 0xD4-0xD7 */ + 0xC9, 0x80, 0xC9, 0x81, 0xC9, 0x82, 0xC9, 0x83, /* 0xD8-0xDB */ + 0xC9, 0x84, 0xC0, 0xB6, 0xC9, 0x85, 0xBC, 0xBB, /* 0xDC-0xDF */ + 0xDD, 0xF1, 0xC9, 0x86, 0xC9, 0x87, 0xDD, 0xF7, /* 0xE0-0xE3 */ + 0xC9, 0x88, 0xDD, 0xF6, 0xDD, 0xEB, 0xC9, 0x89, /* 0xE4-0xE7 */ + 0xC9, 0x8A, 0xC9, 0x8B, 0xC9, 0x8C, 0xC9, 0x8D, /* 0xE8-0xEB */ + 0xC5, 0xEE, 0xC9, 0x8E, 0xC9, 0x8F, 0xC9, 0x90, /* 0xEC-0xEF */ + 0xDD, 0xFB, 0xC9, 0x91, 0xC9, 0x92, 0xC9, 0x93, /* 0xF0-0xF3 
*/ + 0xC9, 0x94, 0xC9, 0x95, 0xC9, 0x96, 0xC9, 0x97, /* 0xF4-0xF7 */ + 0xC9, 0x98, 0xC9, 0x99, 0xC9, 0x9A, 0xC9, 0x9B, /* 0xF8-0xFB */ + 0xDE, 0xA4, 0xC9, 0x9C, 0xC9, 0x9D, 0xDE, 0xA3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0xC9, 0x9E, 0xC9, 0x9F, 0xC9, 0xA0, 0xCA, 0x40, /* 0x00-0x03 */ + 0xCA, 0x41, 0xCA, 0x42, 0xCA, 0x43, 0xCA, 0x44, /* 0x04-0x07 */ + 0xCA, 0x45, 0xCA, 0x46, 0xCA, 0x47, 0xCA, 0x48, /* 0x08-0x0B */ + 0xDD, 0xF8, 0xCA, 0x49, 0xCA, 0x4A, 0xCA, 0x4B, /* 0x0C-0x0F */ + 0xCA, 0x4C, 0xC3, 0xEF, 0xCA, 0x4D, 0xC2, 0xFB, /* 0x10-0x13 */ + 0xCA, 0x4E, 0xCA, 0x4F, 0xCA, 0x50, 0xD5, 0xE1, /* 0x14-0x17 */ + 0xCA, 0x51, 0xCA, 0x52, 0xCE, 0xB5, 0xCA, 0x53, /* 0x18-0x1B */ + 0xCA, 0x54, 0xCA, 0x55, 0xCA, 0x56, 0xDD, 0xFD, /* 0x1C-0x1F */ + 0xCA, 0x57, 0xB2, 0xCC, 0xCA, 0x58, 0xCA, 0x59, /* 0x20-0x23 */ + 0xCA, 0x5A, 0xCA, 0x5B, 0xCA, 0x5C, 0xCA, 0x5D, /* 0x24-0x27 */ + 0xCA, 0x5E, 0xCA, 0x5F, 0xCA, 0x60, 0xC4, 0xE8, /* 0x28-0x2B */ + 0xCA, 0xDF, 0xCA, 0x61, 0xCA, 0x62, 0xCA, 0x63, /* 0x2C-0x2F */ + 0xCA, 0x64, 0xCA, 0x65, 0xCA, 0x66, 0xCA, 0x67, /* 0x30-0x33 */ + 0xCA, 0x68, 0xCA, 0x69, 0xCA, 0x6A, 0xC7, 0xBE, /* 0x34-0x37 */ + 0xDD, 0xFA, 0xDD, 0xFC, 0xDD, 0xFE, 0xDE, 0xA2, /* 0x38-0x3B */ + 0xB0, 0xAA, 0xB1, 0xCE, 0xCA, 0x6B, 0xCA, 0x6C, /* 0x3C-0x3F */ + 0xCA, 0x6D, 0xCA, 0x6E, 0xCA, 0x6F, 0xDE, 0xAC, /* 0x40-0x43 */ + 0xCA, 0x70, 0xCA, 0x71, 0xCA, 0x72, 0xCA, 0x73, /* 0x44-0x47 */ + 0xDE, 0xA6, 0xBD, 0xB6, 0xC8, 0xEF, 0xCA, 0x74, /* 0x48-0x4B */ + 0xCA, 0x75, 0xCA, 0x76, 0xCA, 0x77, 0xCA, 0x78, /* 0x4C-0x4F */ + 0xCA, 0x79, 0xCA, 0x7A, 0xCA, 0x7B, 0xCA, 0x7C, /* 0x50-0x53 */ + 0xCA, 0x7D, 0xCA, 0x7E, 0xDE, 0xA1, 0xCA, 0x80, /* 0x54-0x57 */ + 0xCA, 0x81, 0xDE, 0xA5, 0xCA, 0x82, 0xCA, 0x83, /* 0x58-0x5B */ + 0xCA, 0x84, 0xCA, 0x85, 0xDE, 0xA9, 0xCA, 0x86, /* 0x5C-0x5F */ + 0xCA, 0x87, 0xCA, 0x88, 0xCA, 0x89, 0xCA, 0x8A, /* 0x60-0x63 */ + 0xDE, 0xA8, 0xCA, 0x8B, 0xCA, 0x8C, 0xCA, 0x8D, /* 0x64-0x67 */ + 0xDE, 0xA7, 0xCA, 0x8E, 0xCA, 0x8F, 0xCA, 0x90, /* 0x68-0x6B */ + 0xCA, 0x91, 0xCA, 0x92, 0xCA, 0x93, 0xCA, 0x94, /* 0x6C-0x6F */ + 0xCA, 0x95, 0xCA, 0x96, 0xDE, 0xAD, 0xCA, 0x97, /* 0x70-0x73 */ + 0xD4, 0xCC, 0xCA, 0x98, 0xCA, 0x99, 0xCA, 0x9A, /* 0x74-0x77 */ + 0xCA, 0x9B, 0xDE, 0xB3, 0xDE, 0xAA, 0xDE, 0xAE, /* 0x78-0x7B */ + 0xCA, 0x9C, 0xCA, 0x9D, 0xC0, 0xD9, 0xCA, 0x9E, /* 0x7C-0x7F */ + + 0xCA, 0x9F, 0xCA, 0xA0, 0xCB, 0x40, 0xCB, 0x41, /* 0x80-0x83 */ + 0xB1, 0xA1, 0xDE, 0xB6, 0xCB, 0x42, 0xDE, 0xB1, /* 0x84-0x87 */ + 0xCB, 0x43, 0xCB, 0x44, 0xCB, 0x45, 0xCB, 0x46, /* 0x88-0x8B */ + 0xCB, 0x47, 0xCB, 0x48, 0xCB, 0x49, 0xDE, 0xB2, /* 0x8C-0x8F */ + 0xCB, 0x4A, 0xCB, 0x4B, 0xCB, 0x4C, 0xCB, 0x4D, /* 0x90-0x93 */ + 0xCB, 0x4E, 0xCB, 0x4F, 0xCB, 0x50, 0xCB, 0x51, /* 0x94-0x97 */ + 0xCB, 0x52, 0xCB, 0x53, 0xCB, 0x54, 0xD1, 0xA6, /* 0x98-0x9B */ + 0xDE, 0xB5, 0xCB, 0x55, 0xCB, 0x56, 0xCB, 0x57, /* 0x9C-0x9F */ + 0xCB, 0x58, 0xCB, 0x59, 0xCB, 0x5A, 0xCB, 0x5B, /* 0xA0-0xA3 */ + 0xDE, 0xAF, 0xCB, 0x5C, 0xCB, 0x5D, 0xCB, 0x5E, /* 0xA4-0xA7 */ + 0xDE, 0xB0, 0xCB, 0x5F, 0xD0, 0xBD, 0xCB, 0x60, /* 0xA8-0xAB */ + 0xCB, 0x61, 0xCB, 0x62, 0xDE, 0xB4, 0xCA, 0xED, /* 0xAC-0xAF */ + 0xDE, 0xB9, 0xCB, 0x63, 0xCB, 0x64, 0xCB, 0x65, /* 0xB0-0xB3 */ + 0xCB, 0x66, 0xCB, 0x67, 0xCB, 0x68, 0xDE, 0xB8, /* 0xB4-0xB7 */ + 0xCB, 0x69, 0xDE, 0xB7, 0xCB, 0x6A, 0xCB, 0x6B, /* 0xB8-0xBB */ + 0xCB, 0x6C, 0xCB, 0x6D, 0xCB, 0x6E, 0xCB, 0x6F, /* 0xBC-0xBF */ + 0xCB, 0x70, 0xDE, 0xBB, 0xCB, 0x71, 0xCB, 0x72, /* 0xC0-0xC3 */ + 0xCB, 0x73, 0xCB, 0x74, 0xCB, 0x75, 0xCB, 0x76, /* 0xC4-0xC7 */ + 
0xCB, 0x77, 0xBD, 0xE5, 0xCB, 0x78, 0xCB, 0x79, /* 0xC8-0xCB */ + 0xCB, 0x7A, 0xCB, 0x7B, 0xCB, 0x7C, 0xB2, 0xD8, /* 0xCC-0xCF */ + 0xC3, 0xEA, 0xCB, 0x7D, 0xCB, 0x7E, 0xDE, 0xBA, /* 0xD0-0xD3 */ + 0xCB, 0x80, 0xC5, 0xBA, 0xCB, 0x81, 0xCB, 0x82, /* 0xD4-0xD7 */ + 0xCB, 0x83, 0xCB, 0x84, 0xCB, 0x85, 0xCB, 0x86, /* 0xD8-0xDB */ + 0xDE, 0xBC, 0xCB, 0x87, 0xCB, 0x88, 0xCB, 0x89, /* 0xDC-0xDF */ + 0xCB, 0x8A, 0xCB, 0x8B, 0xCB, 0x8C, 0xCB, 0x8D, /* 0xE0-0xE3 */ + 0xCC, 0xD9, 0xCB, 0x8E, 0xCB, 0x8F, 0xCB, 0x90, /* 0xE4-0xE7 */ + 0xCB, 0x91, 0xB7, 0xAA, 0xCB, 0x92, 0xCB, 0x93, /* 0xE8-0xEB */ + 0xCB, 0x94, 0xCB, 0x95, 0xCB, 0x96, 0xCB, 0x97, /* 0xEC-0xEF */ + 0xCB, 0x98, 0xCB, 0x99, 0xCB, 0x9A, 0xCB, 0x9B, /* 0xF0-0xF3 */ + 0xCB, 0x9C, 0xCB, 0x9D, 0xCB, 0x9E, 0xCB, 0x9F, /* 0xF4-0xF7 */ + 0xCB, 0xA0, 0xCC, 0x40, 0xCC, 0x41, 0xD4, 0xE5, /* 0xF8-0xFB */ + 0xCC, 0x42, 0xCC, 0x43, 0xCC, 0x44, 0xDE, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0xCC, 0x45, 0xCC, 0x46, 0xCC, 0x47, 0xCC, 0x48, /* 0x00-0x03 */ + 0xCC, 0x49, 0xDE, 0xBF, 0xCC, 0x4A, 0xCC, 0x4B, /* 0x04-0x07 */ + 0xCC, 0x4C, 0xCC, 0x4D, 0xCC, 0x4E, 0xCC, 0x4F, /* 0x08-0x0B */ + 0xCC, 0x50, 0xCC, 0x51, 0xCC, 0x52, 0xCC, 0x53, /* 0x0C-0x0F */ + 0xCC, 0x54, 0xC4, 0xA2, 0xCC, 0x55, 0xCC, 0x56, /* 0x10-0x13 */ + 0xCC, 0x57, 0xCC, 0x58, 0xDE, 0xC1, 0xCC, 0x59, /* 0x14-0x17 */ + 0xCC, 0x5A, 0xCC, 0x5B, 0xCC, 0x5C, 0xCC, 0x5D, /* 0x18-0x1B */ + 0xCC, 0x5E, 0xCC, 0x5F, 0xCC, 0x60, 0xCC, 0x61, /* 0x1C-0x1F */ + 0xCC, 0x62, 0xCC, 0x63, 0xCC, 0x64, 0xCC, 0x65, /* 0x20-0x23 */ + 0xCC, 0x66, 0xCC, 0x67, 0xCC, 0x68, 0xDE, 0xBE, /* 0x24-0x27 */ + 0xCC, 0x69, 0xDE, 0xC0, 0xCC, 0x6A, 0xCC, 0x6B, /* 0x28-0x2B */ + 0xCC, 0x6C, 0xCC, 0x6D, 0xCC, 0x6E, 0xCC, 0x6F, /* 0x2C-0x2F */ + 0xCC, 0x70, 0xCC, 0x71, 0xCC, 0x72, 0xCC, 0x73, /* 0x30-0x33 */ + 0xCC, 0x74, 0xCC, 0x75, 0xCC, 0x76, 0xCC, 0x77, /* 0x34-0x37 */ + 0xD5, 0xBA, 0xCC, 0x78, 0xCC, 0x79, 0xCC, 0x7A, /* 0x38-0x3B */ + 0xDE, 0xC2, 0xCC, 0x7B, 0xCC, 0x7C, 0xCC, 0x7D, /* 0x3C-0x3F */ + 0xCC, 0x7E, 0xCC, 0x80, 0xCC, 0x81, 0xCC, 0x82, /* 0x40-0x43 */ + 0xCC, 0x83, 0xCC, 0x84, 0xCC, 0x85, 0xCC, 0x86, /* 0x44-0x47 */ + 0xCC, 0x87, 0xCC, 0x88, 0xCC, 0x89, 0xCC, 0x8A, /* 0x48-0x4B */ + 0xCC, 0x8B, 0xF2, 0xAE, 0xBB, 0xA2, 0xC2, 0xB2, /* 0x4C-0x4F */ + 0xC5, 0xB0, 0xC2, 0xC7, 0xCC, 0x8C, 0xCC, 0x8D, /* 0x50-0x53 */ + 0xF2, 0xAF, 0xCC, 0x8E, 0xCC, 0x8F, 0xCC, 0x90, /* 0x54-0x57 */ + 0xCC, 0x91, 0xCC, 0x92, 0xD0, 0xE9, 0xCC, 0x93, /* 0x58-0x5B */ + 0xCC, 0x94, 0xCC, 0x95, 0xD3, 0xDD, 0xCC, 0x96, /* 0x5C-0x5F */ + 0xCC, 0x97, 0xCC, 0x98, 0xEB, 0xBD, 0xCC, 0x99, /* 0x60-0x63 */ + 0xCC, 0x9A, 0xCC, 0x9B, 0xCC, 0x9C, 0xCC, 0x9D, /* 0x64-0x67 */ + 0xCC, 0x9E, 0xCC, 0x9F, 0xCC, 0xA0, 0xB3, 0xE6, /* 0x68-0x6B */ + 0xF2, 0xB0, 0xCD, 0x40, 0xF2, 0xB1, 0xCD, 0x41, /* 0x6C-0x6F */ + 0xCD, 0x42, 0xCA, 0xAD, 0xCD, 0x43, 0xCD, 0x44, /* 0x70-0x73 */ + 0xCD, 0x45, 0xCD, 0x46, 0xCD, 0x47, 0xCD, 0x48, /* 0x74-0x77 */ + 0xCD, 0x49, 0xBA, 0xE7, 0xF2, 0xB3, 0xF2, 0xB5, /* 0x78-0x7B */ + 0xF2, 0xB4, 0xCB, 0xE4, 0xCF, 0xBA, 0xF2, 0xB2, /* 0x7C-0x7F */ + + 0xCA, 0xB4, 0xD2, 0xCF, 0xC2, 0xEC, 0xCD, 0x4A, /* 0x80-0x83 */ + 0xCD, 0x4B, 0xCD, 0x4C, 0xCD, 0x4D, 0xCD, 0x4E, /* 0x84-0x87 */ + 0xCD, 0x4F, 0xCD, 0x50, 0xCE, 0xC3, 0xF2, 0xB8, /* 0x88-0x8B */ + 0xB0, 0xF6, 0xF2, 0xB7, 0xCD, 0x51, 0xCD, 0x52, /* 0x8C-0x8F */ + 0xCD, 0x53, 0xCD, 0x54, 0xCD, 0x55, 0xF2, 0xBE, /* 0x90-0x93 */ + 0xCD, 0x56, 0xB2, 0xCF, 0xCD, 0x57, 0xCD, 0x58, /* 0x94-0x97 */ + 0xCD, 0x59, 0xCD, 0x5A, 0xCD, 0x5B, 0xCD, 0x5C, /* 0x98-0x9B */ + 
0xD1, 0xC1, 0xF2, 0xBA, 0xCD, 0x5D, 0xCD, 0x5E, /* 0x9C-0x9F */ + 0xCD, 0x5F, 0xCD, 0x60, 0xCD, 0x61, 0xF2, 0xBC, /* 0xA0-0xA3 */ + 0xD4, 0xE9, 0xCD, 0x62, 0xCD, 0x63, 0xF2, 0xBB, /* 0xA4-0xA7 */ + 0xF2, 0xB6, 0xF2, 0xBF, 0xF2, 0xBD, 0xCD, 0x64, /* 0xA8-0xAB */ + 0xF2, 0xB9, 0xCD, 0x65, 0xCD, 0x66, 0xF2, 0xC7, /* 0xAC-0xAF */ + 0xF2, 0xC4, 0xF2, 0xC6, 0xCD, 0x67, 0xCD, 0x68, /* 0xB0-0xB3 */ + 0xF2, 0xCA, 0xF2, 0xC2, 0xF2, 0xC0, 0xCD, 0x69, /* 0xB4-0xB7 */ + 0xCD, 0x6A, 0xCD, 0x6B, 0xF2, 0xC5, 0xCD, 0x6C, /* 0xB8-0xBB */ + 0xCD, 0x6D, 0xCD, 0x6E, 0xCD, 0x6F, 0xCD, 0x70, /* 0xBC-0xBF */ + 0xD6, 0xFB, 0xCD, 0x71, 0xCD, 0x72, 0xCD, 0x73, /* 0xC0-0xC3 */ + 0xF2, 0xC1, 0xCD, 0x74, 0xC7, 0xF9, 0xC9, 0xDF, /* 0xC4-0xC7 */ + 0xCD, 0x75, 0xF2, 0xC8, 0xB9, 0xC6, 0xB5, 0xB0, /* 0xC8-0xCB */ + 0xCD, 0x76, 0xCD, 0x77, 0xF2, 0xC3, 0xF2, 0xC9, /* 0xCC-0xCF */ + 0xF2, 0xD0, 0xF2, 0xD6, 0xCD, 0x78, 0xCD, 0x79, /* 0xD0-0xD3 */ + 0xBB, 0xD7, 0xCD, 0x7A, 0xCD, 0x7B, 0xCD, 0x7C, /* 0xD4-0xD7 */ + 0xF2, 0xD5, 0xCD, 0xDC, 0xCD, 0x7D, 0xD6, 0xEB, /* 0xD8-0xDB */ + 0xCD, 0x7E, 0xCD, 0x80, 0xF2, 0xD2, 0xF2, 0xD4, /* 0xDC-0xDF */ + 0xCD, 0x81, 0xCD, 0x82, 0xCD, 0x83, 0xCD, 0x84, /* 0xE0-0xE3 */ + 0xB8, 0xF2, 0xCD, 0x85, 0xCD, 0x86, 0xCD, 0x87, /* 0xE4-0xE7 */ + 0xCD, 0x88, 0xF2, 0xCB, 0xCD, 0x89, 0xCD, 0x8A, /* 0xE8-0xEB */ + 0xCD, 0x8B, 0xF2, 0xCE, 0xC2, 0xF9, 0xCD, 0x8C, /* 0xEC-0xEF */ + 0xD5, 0xDD, 0xF2, 0xCC, 0xF2, 0xCD, 0xF2, 0xCF, /* 0xF0-0xF3 */ + 0xF2, 0xD3, 0xCD, 0x8D, 0xCD, 0x8E, 0xCD, 0x8F, /* 0xF4-0xF7 */ + 0xF2, 0xD9, 0xD3, 0xBC, 0xCD, 0x90, 0xCD, 0x91, /* 0xF8-0xFB */ + 0xCD, 0x92, 0xCD, 0x93, 0xB6, 0xEA, 0xCD, 0x94, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xCA, 0xF1, 0xCD, 0x95, 0xB7, 0xE4, 0xF2, 0xD7, /* 0x00-0x03 */ + 0xCD, 0x96, 0xCD, 0x97, 0xCD, 0x98, 0xF2, 0xD8, /* 0x04-0x07 */ + 0xF2, 0xDA, 0xF2, 0xDD, 0xF2, 0xDB, 0xCD, 0x99, /* 0x08-0x0B */ + 0xCD, 0x9A, 0xF2, 0xDC, 0xCD, 0x9B, 0xCD, 0x9C, /* 0x0C-0x0F */ + 0xCD, 0x9D, 0xCD, 0x9E, 0xD1, 0xD1, 0xF2, 0xD1, /* 0x10-0x13 */ + 0xCD, 0x9F, 0xCD, 0xC9, 0xCD, 0xA0, 0xCE, 0xCF, /* 0x14-0x17 */ + 0xD6, 0xA9, 0xCE, 0x40, 0xF2, 0xE3, 0xCE, 0x41, /* 0x18-0x1B */ + 0xC3, 0xDB, 0xCE, 0x42, 0xF2, 0xE0, 0xCE, 0x43, /* 0x1C-0x1F */ + 0xCE, 0x44, 0xC0, 0xAF, 0xF2, 0xEC, 0xF2, 0xDE, /* 0x20-0x23 */ + 0xCE, 0x45, 0xF2, 0xE1, 0xCE, 0x46, 0xCE, 0x47, /* 0x24-0x27 */ + 0xCE, 0x48, 0xF2, 0xE8, 0xCE, 0x49, 0xCE, 0x4A, /* 0x28-0x2B */ + 0xCE, 0x4B, 0xCE, 0x4C, 0xF2, 0xE2, 0xCE, 0x4D, /* 0x2C-0x2F */ + 0xCE, 0x4E, 0xF2, 0xE7, 0xCE, 0x4F, 0xCE, 0x50, /* 0x30-0x33 */ + 0xF2, 0xE6, 0xCE, 0x51, 0xCE, 0x52, 0xF2, 0xE9, /* 0x34-0x37 */ + 0xCE, 0x53, 0xCE, 0x54, 0xCE, 0x55, 0xF2, 0xDF, /* 0x38-0x3B */ + 0xCE, 0x56, 0xCE, 0x57, 0xF2, 0xE4, 0xF2, 0xEA, /* 0x3C-0x3F */ + 0xCE, 0x58, 0xCE, 0x59, 0xCE, 0x5A, 0xCE, 0x5B, /* 0x40-0x43 */ + 0xCE, 0x5C, 0xCE, 0x5D, 0xCE, 0x5E, 0xD3, 0xAC, /* 0x44-0x47 */ + 0xF2, 0xE5, 0xB2, 0xF5, 0xCE, 0x5F, 0xCE, 0x60, /* 0x48-0x4B */ + 0xF2, 0xF2, 0xCE, 0x61, 0xD0, 0xAB, 0xCE, 0x62, /* 0x4C-0x4F */ + 0xCE, 0x63, 0xCE, 0x64, 0xCE, 0x65, 0xF2, 0xF5, /* 0x50-0x53 */ + 0xCE, 0x66, 0xCE, 0x67, 0xCE, 0x68, 0xBB, 0xC8, /* 0x54-0x57 */ + 0xCE, 0x69, 0xF2, 0xF9, 0xCE, 0x6A, 0xCE, 0x6B, /* 0x58-0x5B */ + 0xCE, 0x6C, 0xCE, 0x6D, 0xCE, 0x6E, 0xCE, 0x6F, /* 0x5C-0x5F */ + 0xF2, 0xF0, 0xCE, 0x70, 0xCE, 0x71, 0xF2, 0xF6, /* 0x60-0x63 */ + 0xF2, 0xF8, 0xF2, 0xFA, 0xCE, 0x72, 0xCE, 0x73, /* 0x64-0x67 */ + 0xCE, 0x74, 0xCE, 0x75, 0xCE, 0x76, 0xCE, 0x77, /* 0x68-0x6B */ + 0xCE, 0x78, 0xCE, 0x79, 0xF2, 0xF3, 0xCE, 0x7A, /* 0x6C-0x6F */ + 0xF2, 
0xF1, 0xCE, 0x7B, 0xCE, 0x7C, 0xCE, 0x7D, /* 0x70-0x73 */ + 0xBA, 0xFB, 0xCE, 0x7E, 0xB5, 0xFB, 0xCE, 0x80, /* 0x74-0x77 */ + 0xCE, 0x81, 0xCE, 0x82, 0xCE, 0x83, 0xF2, 0xEF, /* 0x78-0x7B */ + 0xF2, 0xF7, 0xF2, 0xED, 0xF2, 0xEE, 0xCE, 0x84, /* 0x7C-0x7F */ + + 0xCE, 0x85, 0xCE, 0x86, 0xF2, 0xEB, 0xF3, 0xA6, /* 0x80-0x83 */ + 0xCE, 0x87, 0xF3, 0xA3, 0xCE, 0x88, 0xCE, 0x89, /* 0x84-0x87 */ + 0xF3, 0xA2, 0xCE, 0x8A, 0xCE, 0x8B, 0xF2, 0xF4, /* 0x88-0x8B */ + 0xCE, 0x8C, 0xC8, 0xDA, 0xCE, 0x8D, 0xCE, 0x8E, /* 0x8C-0x8F */ + 0xCE, 0x8F, 0xCE, 0x90, 0xCE, 0x91, 0xF2, 0xFB, /* 0x90-0x93 */ + 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xF3, 0xA5, /* 0x94-0x97 */ + 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, /* 0x98-0x9B */ + 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xC3, 0xF8, /* 0x9C-0x9F */ + 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, /* 0xA0-0xA3 */ + 0xCE, 0xA0, 0xCF, 0x40, 0xCF, 0x41, 0xCF, 0x42, /* 0xA4-0xA7 */ + 0xF2, 0xFD, 0xCF, 0x43, 0xCF, 0x44, 0xF3, 0xA7, /* 0xA8-0xAB */ + 0xF3, 0xA9, 0xF3, 0xA4, 0xCF, 0x45, 0xF2, 0xFC, /* 0xAC-0xAF */ + 0xCF, 0x46, 0xCF, 0x47, 0xCF, 0x48, 0xF3, 0xAB, /* 0xB0-0xB3 */ + 0xCF, 0x49, 0xF3, 0xAA, 0xCF, 0x4A, 0xCF, 0x4B, /* 0xB4-0xB7 */ + 0xCF, 0x4C, 0xCF, 0x4D, 0xC2, 0xDD, 0xCF, 0x4E, /* 0xB8-0xBB */ + 0xCF, 0x4F, 0xF3, 0xAE, 0xCF, 0x50, 0xCF, 0x51, /* 0xBC-0xBF */ + 0xF3, 0xB0, 0xCF, 0x52, 0xCF, 0x53, 0xCF, 0x54, /* 0xC0-0xC3 */ + 0xCF, 0x55, 0xCF, 0x56, 0xF3, 0xA1, 0xCF, 0x57, /* 0xC4-0xC7 */ + 0xCF, 0x58, 0xCF, 0x59, 0xF3, 0xB1, 0xF3, 0xAC, /* 0xC8-0xCB */ + 0xCF, 0x5A, 0xCF, 0x5B, 0xCF, 0x5C, 0xCF, 0x5D, /* 0xCC-0xCF */ + 0xCF, 0x5E, 0xF3, 0xAF, 0xF2, 0xFE, 0xF3, 0xAD, /* 0xD0-0xD3 */ + 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x61, 0xCF, 0x62, /* 0xD4-0xD7 */ + 0xCF, 0x63, 0xCF, 0x64, 0xCF, 0x65, 0xF3, 0xB2, /* 0xD8-0xDB */ + 0xCF, 0x66, 0xCF, 0x67, 0xCF, 0x68, 0xCF, 0x69, /* 0xDC-0xDF */ + 0xF3, 0xB4, 0xCF, 0x6A, 0xCF, 0x6B, 0xCF, 0x6C, /* 0xE0-0xE3 */ + 0xCF, 0x6D, 0xF3, 0xA8, 0xCF, 0x6E, 0xCF, 0x6F, /* 0xE4-0xE7 */ + 0xCF, 0x70, 0xCF, 0x71, 0xF3, 0xB3, 0xCF, 0x72, /* 0xE8-0xEB */ + 0xCF, 0x73, 0xCF, 0x74, 0xF3, 0xB5, 0xCF, 0x75, /* 0xEC-0xEF */ + 0xCF, 0x76, 0xCF, 0x77, 0xCF, 0x78, 0xCF, 0x79, /* 0xF0-0xF3 */ + 0xCF, 0x7A, 0xCF, 0x7B, 0xCF, 0x7C, 0xCF, 0x7D, /* 0xF4-0xF7 */ + 0xCF, 0x7E, 0xD0, 0xB7, 0xCF, 0x80, 0xCF, 0x81, /* 0xF8-0xFB */ + 0xCF, 0x82, 0xCF, 0x83, 0xF3, 0xB8, 0xCF, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xD9, 0xF9, /* 0x00-0x03 */ + 0xCF, 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, /* 0x04-0x07 */ + 0xCF, 0x8C, 0xCF, 0x8D, 0xF3, 0xB9, 0xCF, 0x8E, /* 0x08-0x0B */ + 0xCF, 0x8F, 0xCF, 0x90, 0xCF, 0x91, 0xCF, 0x92, /* 0x0C-0x0F */ + 0xCF, 0x93, 0xCF, 0x94, 0xCF, 0x95, 0xF3, 0xB7, /* 0x10-0x13 */ + 0xCF, 0x96, 0xC8, 0xE4, 0xF3, 0xB6, 0xCF, 0x97, /* 0x14-0x17 */ + 0xCF, 0x98, 0xCF, 0x99, 0xCF, 0x9A, 0xF3, 0xBA, /* 0x18-0x1B */ + 0xCF, 0x9B, 0xCF, 0x9C, 0xCF, 0x9D, 0xCF, 0x9E, /* 0x1C-0x1F */ + 0xCF, 0x9F, 0xF3, 0xBB, 0xB4, 0xC0, 0xCF, 0xA0, /* 0x20-0x23 */ + 0xD0, 0x40, 0xD0, 0x41, 0xD0, 0x42, 0xD0, 0x43, /* 0x24-0x27 */ + 0xD0, 0x44, 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x47, /* 0x28-0x2B */ + 0xD0, 0x48, 0xD0, 0x49, 0xD0, 0x4A, 0xD0, 0x4B, /* 0x2C-0x2F */ + 0xD0, 0x4C, 0xD0, 0x4D, 0xEE, 0xC3, 0xD0, 0x4E, /* 0x30-0x33 */ + 0xD0, 0x4F, 0xD0, 0x50, 0xD0, 0x51, 0xD0, 0x52, /* 0x34-0x37 */ + 0xD0, 0x53, 0xF3, 0xBC, 0xD0, 0x54, 0xD0, 0x55, /* 0x38-0x3B */ + 0xF3, 0xBD, 0xD0, 0x56, 0xD0, 0x57, 0xD0, 0x58, /* 0x3C-0x3F */ + 0xD1, 0xAA, 0xD0, 0x59, 0xD0, 0x5A, 0xD0, 0x5B, /* 0x40-0x43 */ + 0xF4, 
0xAC, 0xD0, 0xC6, 0xD0, 0x5C, 0xD0, 0x5D, /* 0x44-0x47 */ + 0xD0, 0x5E, 0xD0, 0x5F, 0xD0, 0x60, 0xD0, 0x61, /* 0x48-0x4B */ + 0xD0, 0xD0, 0xD1, 0xDC, 0xD0, 0x62, 0xD0, 0x63, /* 0x4C-0x4F */ + 0xD0, 0x64, 0xD0, 0x65, 0xD0, 0x66, 0xD0, 0x67, /* 0x50-0x53 */ + 0xCF, 0xCE, 0xD0, 0x68, 0xD0, 0x69, 0xBD, 0xD6, /* 0x54-0x57 */ + 0xD0, 0x6A, 0xD1, 0xC3, 0xD0, 0x6B, 0xD0, 0x6C, /* 0x58-0x5B */ + 0xD0, 0x6D, 0xD0, 0x6E, 0xD0, 0x6F, 0xD0, 0x70, /* 0x5C-0x5F */ + 0xD0, 0x71, 0xBA, 0xE2, 0xE1, 0xE9, 0xD2, 0xC2, /* 0x60-0x63 */ + 0xF1, 0xC2, 0xB2, 0xB9, 0xD0, 0x72, 0xD0, 0x73, /* 0x64-0x67 */ + 0xB1, 0xED, 0xF1, 0xC3, 0xD0, 0x74, 0xC9, 0xC0, /* 0x68-0x6B */ + 0xB3, 0xC4, 0xD0, 0x75, 0xD9, 0xF2, 0xD0, 0x76, /* 0x6C-0x6F */ + 0xCB, 0xA5, 0xD0, 0x77, 0xF1, 0xC4, 0xD0, 0x78, /* 0x70-0x73 */ + 0xD0, 0x79, 0xD0, 0x7A, 0xD0, 0x7B, 0xD6, 0xD4, /* 0x74-0x77 */ + 0xD0, 0x7C, 0xD0, 0x7D, 0xD0, 0x7E, 0xD0, 0x80, /* 0x78-0x7B */ + 0xD0, 0x81, 0xF1, 0xC5, 0xF4, 0xC0, 0xF1, 0xC6, /* 0x7C-0x7F */ + + 0xD0, 0x82, 0xD4, 0xAC, 0xF1, 0xC7, 0xD0, 0x83, /* 0x80-0x83 */ + 0xB0, 0xC0, 0xF4, 0xC1, 0xD0, 0x84, 0xD0, 0x85, /* 0x84-0x87 */ + 0xF4, 0xC2, 0xD0, 0x86, 0xD0, 0x87, 0xB4, 0xFC, /* 0x88-0x8B */ + 0xD0, 0x88, 0xC5, 0xDB, 0xD0, 0x89, 0xD0, 0x8A, /* 0x8C-0x8F */ + 0xD0, 0x8B, 0xD0, 0x8C, 0xCC, 0xBB, 0xD0, 0x8D, /* 0x90-0x93 */ + 0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0xE4, 0xD0, 0x90, /* 0x94-0x97 */ + 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, /* 0x98-0x9B */ + 0xCD, 0xE0, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, /* 0x9C-0x9F */ + 0xD0, 0x98, 0xD0, 0x99, 0xF1, 0xC8, 0xD0, 0x9A, /* 0xA0-0xA3 */ + 0xD9, 0xF3, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, /* 0xA4-0xA7 */ + 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xB1, 0xBB, /* 0xA8-0xAB */ + 0xD1, 0x40, 0xCF, 0xAE, 0xD1, 0x41, 0xD1, 0x42, /* 0xAC-0xAF */ + 0xD1, 0x43, 0xB8, 0xA4, 0xD1, 0x44, 0xD1, 0x45, /* 0xB0-0xB3 */ + 0xD1, 0x46, 0xD1, 0x47, 0xD1, 0x48, 0xF1, 0xCA, /* 0xB4-0xB7 */ + 0xD1, 0x49, 0xD1, 0x4A, 0xD1, 0x4B, 0xD1, 0x4C, /* 0xB8-0xBB */ + 0xF1, 0xCB, 0xD1, 0x4D, 0xD1, 0x4E, 0xD1, 0x4F, /* 0xBC-0xBF */ + 0xD1, 0x50, 0xB2, 0xC3, 0xC1, 0xD1, 0xD1, 0x51, /* 0xC0-0xC3 */ + 0xD1, 0x52, 0xD7, 0xB0, 0xF1, 0xC9, 0xD1, 0x53, /* 0xC4-0xC7 */ + 0xD1, 0x54, 0xF1, 0xCC, 0xD1, 0x55, 0xD1, 0x56, /* 0xC8-0xCB */ + 0xD1, 0x57, 0xD1, 0x58, 0xF1, 0xCE, 0xD1, 0x59, /* 0xCC-0xCF */ + 0xD1, 0x5A, 0xD1, 0x5B, 0xD9, 0xF6, 0xD1, 0x5C, /* 0xD0-0xD3 */ + 0xD2, 0xE1, 0xD4, 0xA3, 0xD1, 0x5D, 0xD1, 0x5E, /* 0xD4-0xD7 */ + 0xF4, 0xC3, 0xC8, 0xB9, 0xD1, 0x5F, 0xD1, 0x60, /* 0xD8-0xDB */ + 0xD1, 0x61, 0xD1, 0x62, 0xD1, 0x63, 0xF4, 0xC4, /* 0xDC-0xDF */ + 0xD1, 0x64, 0xD1, 0x65, 0xF1, 0xCD, 0xF1, 0xCF, /* 0xE0-0xE3 */ + 0xBF, 0xE3, 0xF1, 0xD0, 0xD1, 0x66, 0xD1, 0x67, /* 0xE4-0xE7 */ + 0xF1, 0xD4, 0xD1, 0x68, 0xD1, 0x69, 0xD1, 0x6A, /* 0xE8-0xEB */ + 0xD1, 0x6B, 0xD1, 0x6C, 0xD1, 0x6D, 0xD1, 0x6E, /* 0xEC-0xEF */ + 0xF1, 0xD6, 0xF1, 0xD1, 0xD1, 0x6F, 0xC9, 0xD1, /* 0xF0-0xF3 */ + 0xC5, 0xE1, 0xD1, 0x70, 0xD1, 0x71, 0xD1, 0x72, /* 0xF4-0xF7 */ + 0xC2, 0xE3, 0xB9, 0xFC, 0xD1, 0x73, 0xD1, 0x74, /* 0xF8-0xFB */ + 0xF1, 0xD3, 0xD1, 0x75, 0xF1, 0xD5, 0xD1, 0x76, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0xD1, 0x77, 0xD1, 0x78, 0xB9, 0xD3, 0xD1, 0x79, /* 0x00-0x03 */ + 0xD1, 0x7A, 0xD1, 0x7B, 0xD1, 0x7C, 0xD1, 0x7D, /* 0x04-0x07 */ + 0xD1, 0x7E, 0xD1, 0x80, 0xF1, 0xDB, 0xD1, 0x81, /* 0x08-0x0B */ + 0xD1, 0x82, 0xD1, 0x83, 0xD1, 0x84, 0xD1, 0x85, /* 0x0C-0x0F */ + 0xBA, 0xD6, 0xD1, 0x86, 0xB0, 0xFD, 0xF1, 0xD9, /* 0x10-0x13 */ + 0xD1, 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, /* 0x14-0x17 */ + 0xD1, 
0x8B, 0xF1, 0xD8, 0xF1, 0xD2, 0xF1, 0xDA, /* 0x18-0x1B */ + 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, 0x8F, /* 0x1C-0x1F */ + 0xD1, 0x90, 0xF1, 0xD7, 0xD1, 0x91, 0xD1, 0x92, /* 0x20-0x23 */ + 0xD1, 0x93, 0xC8, 0xEC, 0xD1, 0x94, 0xD1, 0x95, /* 0x24-0x27 */ + 0xD1, 0x96, 0xD1, 0x97, 0xCD, 0xCA, 0xF1, 0xDD, /* 0x28-0x2B */ + 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, 0x9B, /* 0x2C-0x2F */ + 0xE5, 0xBD, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, /* 0x30-0x33 */ + 0xF1, 0xDC, 0xD1, 0x9F, 0xF1, 0xDE, 0xD1, 0xA0, /* 0x34-0x37 */ + 0xD2, 0x40, 0xD2, 0x41, 0xD2, 0x42, 0xD2, 0x43, /* 0x38-0x3B */ + 0xD2, 0x44, 0xD2, 0x45, 0xD2, 0x46, 0xD2, 0x47, /* 0x3C-0x3F */ + 0xD2, 0x48, 0xF1, 0xDF, 0xD2, 0x49, 0xD2, 0x4A, /* 0x40-0x43 */ + 0xCF, 0xE5, 0xD2, 0x4B, 0xD2, 0x4C, 0xD2, 0x4D, /* 0x44-0x47 */ + 0xD2, 0x4E, 0xD2, 0x4F, 0xD2, 0x50, 0xD2, 0x51, /* 0x48-0x4B */ + 0xD2, 0x52, 0xD2, 0x53, 0xD2, 0x54, 0xD2, 0x55, /* 0x4C-0x4F */ + 0xD2, 0x56, 0xD2, 0x57, 0xD2, 0x58, 0xD2, 0x59, /* 0x50-0x53 */ + 0xD2, 0x5A, 0xD2, 0x5B, 0xD2, 0x5C, 0xD2, 0x5D, /* 0x54-0x57 */ + 0xD2, 0x5E, 0xD2, 0x5F, 0xD2, 0x60, 0xD2, 0x61, /* 0x58-0x5B */ + 0xD2, 0x62, 0xD2, 0x63, 0xF4, 0xC5, 0xBD, 0xF3, /* 0x5C-0x5F */ + 0xD2, 0x64, 0xD2, 0x65, 0xD2, 0x66, 0xD2, 0x67, /* 0x60-0x63 */ + 0xD2, 0x68, 0xD2, 0x69, 0xF1, 0xE0, 0xD2, 0x6A, /* 0x64-0x67 */ + 0xD2, 0x6B, 0xD2, 0x6C, 0xD2, 0x6D, 0xD2, 0x6E, /* 0x68-0x6B */ + 0xD2, 0x6F, 0xD2, 0x70, 0xD2, 0x71, 0xD2, 0x72, /* 0x6C-0x6F */ + 0xD2, 0x73, 0xD2, 0x74, 0xD2, 0x75, 0xD2, 0x76, /* 0x70-0x73 */ + 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, 0xD2, 0x7A, /* 0x74-0x77 */ + 0xD2, 0x7B, 0xD2, 0x7C, 0xD2, 0x7D, 0xF1, 0xE1, /* 0x78-0x7B */ + 0xD2, 0x7E, 0xD2, 0x80, 0xD2, 0x81, 0xCE, 0xF7, /* 0x7C-0x7F */ + + 0xD2, 0x82, 0xD2, 0xAA, 0xD2, 0x83, 0xF1, 0xFB, /* 0x80-0x83 */ + 0xD2, 0x84, 0xD2, 0x85, 0xB8, 0xB2, 0xD2, 0x86, /* 0x84-0x87 */ + 0xD2, 0x87, 0xD2, 0x88, 0xD2, 0x89, 0xD2, 0x8A, /* 0x88-0x8B */ + 0xD2, 0x8B, 0xD2, 0x8C, 0xD2, 0x8D, 0xD2, 0x8E, /* 0x8C-0x8F */ + 0xD2, 0x8F, 0xD2, 0x90, 0xD2, 0x91, 0xD2, 0x92, /* 0x90-0x93 */ + 0xD2, 0x93, 0xD2, 0x94, 0xD2, 0x95, 0xD2, 0x96, /* 0x94-0x97 */ + 0xD2, 0x97, 0xD2, 0x98, 0xD2, 0x99, 0xD2, 0x9A, /* 0x98-0x9B */ + 0xD2, 0x9B, 0xD2, 0x9C, 0xD2, 0x9D, 0xD2, 0x9E, /* 0x9C-0x9F */ + 0xD2, 0x9F, 0xD2, 0xA0, 0xD3, 0x40, 0xD3, 0x41, /* 0xA0-0xA3 */ + 0xD3, 0x42, 0xD3, 0x43, 0xD3, 0x44, 0xD3, 0x45, /* 0xA4-0xA7 */ + 0xD3, 0x46, 0xD3, 0x47, 0xD3, 0x48, 0xD3, 0x49, /* 0xA8-0xAB */ + 0xD3, 0x4A, 0xD3, 0x4B, 0xD3, 0x4C, 0xD3, 0x4D, /* 0xAC-0xAF */ + 0xD3, 0x4E, 0xD3, 0x4F, 0xD3, 0x50, 0xD3, 0x51, /* 0xB0-0xB3 */ + 0xD3, 0x52, 0xD3, 0x53, 0xD3, 0x54, 0xD3, 0x55, /* 0xB4-0xB7 */ + 0xD3, 0x56, 0xD3, 0x57, 0xD3, 0x58, 0xD3, 0x59, /* 0xB8-0xBB */ + 0xD3, 0x5A, 0xD3, 0x5B, 0xD3, 0x5C, 0xD3, 0x5D, /* 0xBC-0xBF */ + 0xD3, 0x5E, 0xBC, 0xFB, 0xB9, 0xDB, 0xD3, 0x5F, /* 0xC0-0xC3 */ + 0xB9, 0xE6, 0xC3, 0xD9, 0xCA, 0xD3, 0xEA, 0xE8, /* 0xC4-0xC7 */ + 0xC0, 0xC0, 0xBE, 0xF5, 0xEA, 0xE9, 0xEA, 0xEA, /* 0xC8-0xCB */ + 0xEA, 0xEB, 0xD3, 0x60, 0xEA, 0xEC, 0xEA, 0xED, /* 0xCC-0xCF */ + 0xEA, 0xEE, 0xEA, 0xEF, 0xBD, 0xC7, 0xD3, 0x61, /* 0xD0-0xD3 */ + 0xD3, 0x62, 0xD3, 0x63, 0xF5, 0xFB, 0xD3, 0x64, /* 0xD4-0xD7 */ + 0xD3, 0x65, 0xD3, 0x66, 0xF5, 0xFD, 0xD3, 0x67, /* 0xD8-0xDB */ + 0xF5, 0xFE, 0xD3, 0x68, 0xF5, 0xFC, 0xD3, 0x69, /* 0xDC-0xDF */ + 0xD3, 0x6A, 0xD3, 0x6B, 0xD3, 0x6C, 0xBD, 0xE2, /* 0xE0-0xE3 */ + 0xD3, 0x6D, 0xF6, 0xA1, 0xB4, 0xA5, 0xD3, 0x6E, /* 0xE4-0xE7 */ + 0xD3, 0x6F, 0xD3, 0x70, 0xD3, 0x71, 0xF6, 0xA2, /* 0xE8-0xEB */ + 0xD3, 0x72, 0xD3, 0x73, 0xD3, 0x74, 0xF6, 0xA3, /* 0xEC-0xEF 
*/ + 0xD3, 0x75, 0xD3, 0x76, 0xD3, 0x77, 0xEC, 0xB2, /* 0xF0-0xF3 */ + 0xD3, 0x78, 0xD3, 0x79, 0xD3, 0x7A, 0xD3, 0x7B, /* 0xF4-0xF7 */ + 0xD3, 0x7C, 0xD3, 0x7D, 0xD3, 0x7E, 0xD3, 0x80, /* 0xF8-0xFB */ + 0xD3, 0x81, 0xD3, 0x82, 0xD3, 0x83, 0xD3, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8A[512] = { + 0xD1, 0xD4, 0xD3, 0x85, 0xD3, 0x86, 0xD3, 0x87, /* 0x00-0x03 */ + 0xD3, 0x88, 0xD3, 0x89, 0xD3, 0x8A, 0xD9, 0xEA, /* 0x04-0x07 */ + 0xD3, 0x8B, 0xD3, 0x8C, 0xD3, 0x8D, 0xD3, 0x8E, /* 0x08-0x0B */ + 0xD3, 0x8F, 0xD3, 0x90, 0xD3, 0x91, 0xD3, 0x92, /* 0x0C-0x0F */ + 0xD3, 0x93, 0xD3, 0x94, 0xD3, 0x95, 0xD3, 0x96, /* 0x10-0x13 */ + 0xD3, 0x97, 0xD3, 0x98, 0xD3, 0x99, 0xD3, 0x9A, /* 0x14-0x17 */ + 0xD3, 0x9B, 0xD3, 0x9C, 0xD3, 0x9D, 0xD3, 0x9E, /* 0x18-0x1B */ + 0xD3, 0x9F, 0xD3, 0xA0, 0xD4, 0x40, 0xD4, 0x41, /* 0x1C-0x1F */ + 0xD4, 0x42, 0xD4, 0x43, 0xD4, 0x44, 0xD4, 0x45, /* 0x20-0x23 */ + 0xD4, 0x46, 0xD4, 0x47, 0xD4, 0x48, 0xD4, 0x49, /* 0x24-0x27 */ + 0xD4, 0x4A, 0xD4, 0x4B, 0xD4, 0x4C, 0xD4, 0x4D, /* 0x28-0x2B */ + 0xD4, 0x4E, 0xD4, 0x4F, 0xD4, 0x50, 0xD4, 0x51, /* 0x2C-0x2F */ + 0xD4, 0x52, 0xD4, 0x53, 0xD4, 0x54, 0xD4, 0x55, /* 0x30-0x33 */ + 0xD4, 0x56, 0xD4, 0x57, 0xD4, 0x58, 0xD4, 0x59, /* 0x34-0x37 */ + 0xD4, 0x5A, 0xD4, 0x5B, 0xD4, 0x5C, 0xD4, 0x5D, /* 0x38-0x3B */ + 0xD4, 0x5E, 0xD4, 0x5F, 0xF6, 0xA4, 0xD4, 0x60, /* 0x3C-0x3F */ + 0xD4, 0x61, 0xD4, 0x62, 0xD4, 0x63, 0xD4, 0x64, /* 0x40-0x43 */ + 0xD4, 0x65, 0xD4, 0x66, 0xD4, 0x67, 0xD4, 0x68, /* 0x44-0x47 */ + 0xEE, 0xBA, 0xD4, 0x69, 0xD4, 0x6A, 0xD4, 0x6B, /* 0x48-0x4B */ + 0xD4, 0x6C, 0xD4, 0x6D, 0xD4, 0x6E, 0xD4, 0x6F, /* 0x4C-0x4F */ + 0xD4, 0x70, 0xD4, 0x71, 0xD4, 0x72, 0xD4, 0x73, /* 0x50-0x53 */ + 0xD4, 0x74, 0xD4, 0x75, 0xD4, 0x76, 0xD4, 0x77, /* 0x54-0x57 */ + 0xD4, 0x78, 0xD4, 0x79, 0xD4, 0x7A, 0xD4, 0x7B, /* 0x58-0x5B */ + 0xD4, 0x7C, 0xD4, 0x7D, 0xD4, 0x7E, 0xD4, 0x80, /* 0x5C-0x5F */ + 0xD4, 0x81, 0xD4, 0x82, 0xD4, 0x83, 0xD4, 0x84, /* 0x60-0x63 */ + 0xD4, 0x85, 0xD4, 0x86, 0xD4, 0x87, 0xD4, 0x88, /* 0x64-0x67 */ + 0xD4, 0x89, 0xD4, 0x8A, 0xD4, 0x8B, 0xD4, 0x8C, /* 0x68-0x6B */ + 0xD4, 0x8D, 0xD4, 0x8E, 0xD4, 0x8F, 0xD4, 0x90, /* 0x6C-0x6F */ + 0xD4, 0x91, 0xD4, 0x92, 0xD4, 0x93, 0xD4, 0x94, /* 0x70-0x73 */ + 0xD4, 0x95, 0xD4, 0x96, 0xD4, 0x97, 0xD4, 0x98, /* 0x74-0x77 */ + 0xD4, 0x99, 0xD5, 0xB2, 0xD4, 0x9A, 0xD4, 0x9B, /* 0x78-0x7B */ + 0xD4, 0x9C, 0xD4, 0x9D, 0xD4, 0x9E, 0xD4, 0x9F, /* 0x7C-0x7F */ + + 0xD4, 0xA0, 0xD5, 0x40, 0xD5, 0x41, 0xD5, 0x42, /* 0x80-0x83 */ + 0xD5, 0x43, 0xD5, 0x44, 0xD5, 0x45, 0xD5, 0x46, /* 0x84-0x87 */ + 0xD5, 0x47, 0xD3, 0xFE, 0xCC, 0xDC, 0xD5, 0x48, /* 0x88-0x8B */ + 0xD5, 0x49, 0xD5, 0x4A, 0xD5, 0x4B, 0xD5, 0x4C, /* 0x8C-0x8F */ + 0xD5, 0x4D, 0xD5, 0x4E, 0xD5, 0x4F, 0xCA, 0xC4, /* 0x90-0x93 */ + 0xD5, 0x50, 0xD5, 0x51, 0xD5, 0x52, 0xD5, 0x53, /* 0x94-0x97 */ + 0xD5, 0x54, 0xD5, 0x55, 0xD5, 0x56, 0xD5, 0x57, /* 0x98-0x9B */ + 0xD5, 0x58, 0xD5, 0x59, 0xD5, 0x5A, 0xD5, 0x5B, /* 0x9C-0x9F */ + 0xD5, 0x5C, 0xD5, 0x5D, 0xD5, 0x5E, 0xD5, 0x5F, /* 0xA0-0xA3 */ + 0xD5, 0x60, 0xD5, 0x61, 0xD5, 0x62, 0xD5, 0x63, /* 0xA4-0xA7 */ + 0xD5, 0x64, 0xD5, 0x65, 0xD5, 0x66, 0xD5, 0x67, /* 0xA8-0xAB */ + 0xD5, 0x68, 0xD5, 0x69, 0xD5, 0x6A, 0xD5, 0x6B, /* 0xAC-0xAF */ + 0xD5, 0x6C, 0xD5, 0x6D, 0xD5, 0x6E, 0xD5, 0x6F, /* 0xB0-0xB3 */ + 0xD5, 0x70, 0xD5, 0x71, 0xD5, 0x72, 0xD5, 0x73, /* 0xB4-0xB7 */ + 0xD5, 0x74, 0xD5, 0x75, 0xD5, 0x76, 0xD5, 0x77, /* 0xB8-0xBB */ + 0xD5, 0x78, 0xD5, 0x79, 0xD5, 0x7A, 0xD5, 0x7B, /* 0xBC-0xBF */ + 0xD5, 0x7C, 0xD5, 0x7D, 0xD5, 0x7E, 0xD5, 0x80, /* 0xC0-0xC3 */ + 
0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, /* 0xC4-0xC7 */ + 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, /* 0xC8-0xCB */ + 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, /* 0xCC-0xCF */ + 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, /* 0xD0-0xD3 */ + 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, /* 0xD4-0xD7 */ + 0xD5, 0x95, 0xD5, 0x96, 0xD5, 0x97, 0xD5, 0x98, /* 0xD8-0xDB */ + 0xD5, 0x99, 0xD5, 0x9A, 0xD5, 0x9B, 0xD5, 0x9C, /* 0xDC-0xDF */ + 0xD5, 0x9D, 0xD5, 0x9E, 0xD5, 0x9F, 0xD5, 0xA0, /* 0xE0-0xE3 */ + 0xD6, 0x40, 0xD6, 0x41, 0xD6, 0x42, 0xD6, 0x43, /* 0xE4-0xE7 */ + 0xD6, 0x44, 0xD6, 0x45, 0xD6, 0x46, 0xD6, 0x47, /* 0xE8-0xEB */ + 0xD6, 0x48, 0xD6, 0x49, 0xD6, 0x4A, 0xD6, 0x4B, /* 0xEC-0xEF */ + 0xD6, 0x4C, 0xD6, 0x4D, 0xD6, 0x4E, 0xD6, 0x4F, /* 0xF0-0xF3 */ + 0xD6, 0x50, 0xD6, 0x51, 0xD6, 0x52, 0xD6, 0x53, /* 0xF4-0xF7 */ + 0xD6, 0x54, 0xD6, 0x55, 0xD6, 0x56, 0xD6, 0x57, /* 0xF8-0xFB */ + 0xD6, 0x58, 0xD6, 0x59, 0xD6, 0x5A, 0xD6, 0x5B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xD6, 0x5C, 0xD6, 0x5D, 0xD6, 0x5E, 0xD6, 0x5F, /* 0x00-0x03 */ + 0xD6, 0x60, 0xD6, 0x61, 0xD6, 0x62, 0xE5, 0xC0, /* 0x04-0x07 */ + 0xD6, 0x63, 0xD6, 0x64, 0xD6, 0x65, 0xD6, 0x66, /* 0x08-0x0B */ + 0xD6, 0x67, 0xD6, 0x68, 0xD6, 0x69, 0xD6, 0x6A, /* 0x0C-0x0F */ + 0xD6, 0x6B, 0xD6, 0x6C, 0xD6, 0x6D, 0xD6, 0x6E, /* 0x10-0x13 */ + 0xD6, 0x6F, 0xD6, 0x70, 0xD6, 0x71, 0xD6, 0x72, /* 0x14-0x17 */ + 0xD6, 0x73, 0xD6, 0x74, 0xD6, 0x75, 0xD6, 0x76, /* 0x18-0x1B */ + 0xD6, 0x77, 0xD6, 0x78, 0xD6, 0x79, 0xD6, 0x7A, /* 0x1C-0x1F */ + 0xD6, 0x7B, 0xD6, 0x7C, 0xD6, 0x7D, 0xD6, 0x7E, /* 0x20-0x23 */ + 0xD6, 0x80, 0xD6, 0x81, 0xF6, 0xA5, 0xD6, 0x82, /* 0x24-0x27 */ + 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, 0x86, /* 0x28-0x2B */ + 0xD6, 0x87, 0xD6, 0x88, 0xD6, 0x89, 0xD6, 0x8A, /* 0x2C-0x2F */ + 0xD6, 0x8B, 0xD6, 0x8C, 0xD6, 0x8D, 0xD6, 0x8E, /* 0x30-0x33 */ + 0xD6, 0x8F, 0xD6, 0x90, 0xD6, 0x91, 0xD6, 0x92, /* 0x34-0x37 */ + 0xD6, 0x93, 0xD6, 0x94, 0xD6, 0x95, 0xD6, 0x96, /* 0x38-0x3B */ + 0xD6, 0x97, 0xD6, 0x98, 0xD6, 0x99, 0xD6, 0x9A, /* 0x3C-0x3F */ + 0xD6, 0x9B, 0xD6, 0x9C, 0xD6, 0x9D, 0xD6, 0x9E, /* 0x40-0x43 */ + 0xD6, 0x9F, 0xD6, 0xA0, 0xD7, 0x40, 0xD7, 0x41, /* 0x44-0x47 */ + 0xD7, 0x42, 0xD7, 0x43, 0xD7, 0x44, 0xD7, 0x45, /* 0x48-0x4B */ + 0xD7, 0x46, 0xD7, 0x47, 0xD7, 0x48, 0xD7, 0x49, /* 0x4C-0x4F */ + 0xD7, 0x4A, 0xD7, 0x4B, 0xD7, 0x4C, 0xD7, 0x4D, /* 0x50-0x53 */ + 0xD7, 0x4E, 0xD7, 0x4F, 0xD7, 0x50, 0xD7, 0x51, /* 0x54-0x57 */ + 0xD7, 0x52, 0xD7, 0x53, 0xD7, 0x54, 0xD7, 0x55, /* 0x58-0x5B */ + 0xD7, 0x56, 0xD7, 0x57, 0xD7, 0x58, 0xD7, 0x59, /* 0x5C-0x5F */ + 0xD7, 0x5A, 0xD7, 0x5B, 0xD7, 0x5C, 0xD7, 0x5D, /* 0x60-0x63 */ + 0xD7, 0x5E, 0xD7, 0x5F, 0xBE, 0xAF, 0xD7, 0x60, /* 0x64-0x67 */ + 0xD7, 0x61, 0xD7, 0x62, 0xD7, 0x63, 0xD7, 0x64, /* 0x68-0x6B */ + 0xC6, 0xA9, 0xD7, 0x65, 0xD7, 0x66, 0xD7, 0x67, /* 0x6C-0x6F */ + 0xD7, 0x68, 0xD7, 0x69, 0xD7, 0x6A, 0xD7, 0x6B, /* 0x70-0x73 */ + 0xD7, 0x6C, 0xD7, 0x6D, 0xD7, 0x6E, 0xD7, 0x6F, /* 0x74-0x77 */ + 0xD7, 0x70, 0xD7, 0x71, 0xD7, 0x72, 0xD7, 0x73, /* 0x78-0x7B */ + 0xD7, 0x74, 0xD7, 0x75, 0xD7, 0x76, 0xD7, 0x77, /* 0x7C-0x7F */ + + 0xD7, 0x78, 0xD7, 0x79, 0xD7, 0x7A, 0xD7, 0x7B, /* 0x80-0x83 */ + 0xD7, 0x7C, 0xD7, 0x7D, 0xD7, 0x7E, 0xD7, 0x80, /* 0x84-0x87 */ + 0xD7, 0x81, 0xD7, 0x82, 0xD7, 0x83, 0xD7, 0x84, /* 0x88-0x8B */ + 0xD7, 0x85, 0xD7, 0x86, 0xD7, 0x87, 0xD7, 0x88, /* 0x8C-0x8F */ + 0xD7, 0x89, 0xD7, 0x8A, 0xD7, 0x8B, 0xD7, 0x8C, /* 0x90-0x93 */ + 0xD7, 0x8D, 0xD7, 0x8E, 0xD7, 0x8F, 0xD7, 0x90, /* 0x94-0x97 */ + 
0xD7, 0x91, 0xD7, 0x92, 0xD7, 0x93, 0xD7, 0x94, /* 0x98-0x9B */ + 0xD7, 0x95, 0xD7, 0x96, 0xD7, 0x97, 0xD7, 0x98, /* 0x9C-0x9F */ + 0xDA, 0xA5, 0xBC, 0xC6, 0xB6, 0xA9, 0xB8, 0xBC, /* 0xA0-0xA3 */ + 0xC8, 0xCF, 0xBC, 0xA5, 0xDA, 0xA6, 0xDA, 0xA7, /* 0xA4-0xA7 */ + 0xCC, 0xD6, 0xC8, 0xC3, 0xDA, 0xA8, 0xC6, 0xFD, /* 0xA8-0xAB */ + 0xD7, 0x99, 0xD1, 0xB5, 0xD2, 0xE9, 0xD1, 0xB6, /* 0xAC-0xAF */ + 0xBC, 0xC7, 0xD7, 0x9A, 0xBD, 0xB2, 0xBB, 0xE4, /* 0xB0-0xB3 */ + 0xDA, 0xA9, 0xDA, 0xAA, 0xD1, 0xC8, 0xDA, 0xAB, /* 0xB4-0xB7 */ + 0xD0, 0xED, 0xB6, 0xEF, 0xC2, 0xDB, 0xD7, 0x9B, /* 0xB8-0xBB */ + 0xCB, 0xCF, 0xB7, 0xED, 0xC9, 0xE8, 0xB7, 0xC3, /* 0xBC-0xBF */ + 0xBE, 0xF7, 0xD6, 0xA4, 0xDA, 0xAC, 0xDA, 0xAD, /* 0xC0-0xC3 */ + 0xC6, 0xC0, 0xD7, 0xE7, 0xCA, 0xB6, 0xD7, 0x9C, /* 0xC4-0xC7 */ + 0xD5, 0xA9, 0xCB, 0xDF, 0xD5, 0xEF, 0xDA, 0xAE, /* 0xC8-0xCB */ + 0xD6, 0xDF, 0xB4, 0xCA, 0xDA, 0xB0, 0xDA, 0xAF, /* 0xCC-0xCF */ + 0xD7, 0x9D, 0xD2, 0xEB, 0xDA, 0xB1, 0xDA, 0xB2, /* 0xD0-0xD3 */ + 0xDA, 0xB3, 0xCA, 0xD4, 0xDA, 0xB4, 0xCA, 0xAB, /* 0xD4-0xD7 */ + 0xDA, 0xB5, 0xDA, 0xB6, 0xB3, 0xCF, 0xD6, 0xEF, /* 0xD8-0xDB */ + 0xDA, 0xB7, 0xBB, 0xB0, 0xB5, 0xAE, 0xDA, 0xB8, /* 0xDC-0xDF */ + 0xDA, 0xB9, 0xB9, 0xEE, 0xD1, 0xAF, 0xD2, 0xE8, /* 0xE0-0xE3 */ + 0xDA, 0xBA, 0xB8, 0xC3, 0xCF, 0xEA, 0xB2, 0xEF, /* 0xE4-0xE7 */ + 0xDA, 0xBB, 0xDA, 0xBC, 0xD7, 0x9E, 0xBD, 0xEB, /* 0xE8-0xEB */ + 0xCE, 0xDC, 0xD3, 0xEF, 0xDA, 0xBD, 0xCE, 0xF3, /* 0xEC-0xEF */ + 0xDA, 0xBE, 0xD3, 0xD5, 0xBB, 0xE5, 0xDA, 0xBF, /* 0xF0-0xF3 */ + 0xCB, 0xB5, 0xCB, 0xD0, 0xDA, 0xC0, 0xC7, 0xEB, /* 0xF4-0xF7 */ + 0xD6, 0xEE, 0xDA, 0xC1, 0xC5, 0xB5, 0xB6, 0xC1, /* 0xF8-0xFB */ + 0xDA, 0xC2, 0xB7, 0xCC, 0xBF, 0xCE, 0xDA, 0xC3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8C[512] = { + 0xDA, 0xC4, 0xCB, 0xAD, 0xDA, 0xC5, 0xB5, 0xF7, /* 0x00-0x03 */ + 0xDA, 0xC6, 0xC1, 0xC2, 0xD7, 0xBB, 0xDA, 0xC7, /* 0x04-0x07 */ + 0xCC, 0xB8, 0xD7, 0x9F, 0xD2, 0xEA, 0xC4, 0xB1, /* 0x08-0x0B */ + 0xDA, 0xC8, 0xB5, 0xFD, 0xBB, 0xD1, 0xDA, 0xC9, /* 0x0C-0x0F */ + 0xD0, 0xB3, 0xDA, 0xCA, 0xDA, 0xCB, 0xCE, 0xBD, /* 0x10-0x13 */ + 0xDA, 0xCC, 0xDA, 0xCD, 0xDA, 0xCE, 0xB2, 0xF7, /* 0x14-0x17 */ + 0xDA, 0xD1, 0xDA, 0xCF, 0xD1, 0xE8, 0xDA, 0xD0, /* 0x18-0x1B */ + 0xC3, 0xD5, 0xDA, 0xD2, 0xD7, 0xA0, 0xDA, 0xD3, /* 0x1C-0x1F */ + 0xDA, 0xD4, 0xDA, 0xD5, 0xD0, 0xBB, 0xD2, 0xA5, /* 0x20-0x23 */ + 0xB0, 0xF9, 0xDA, 0xD6, 0xC7, 0xAB, 0xDA, 0xD7, /* 0x24-0x27 */ + 0xBD, 0xF7, 0xC3, 0xA1, 0xDA, 0xD8, 0xDA, 0xD9, /* 0x28-0x2B */ + 0xC3, 0xFD, 0xCC, 0xB7, 0xDA, 0xDA, 0xDA, 0xDB, /* 0x2C-0x2F */ + 0xC0, 0xBE, 0xC6, 0xD7, 0xDA, 0xDC, 0xDA, 0xDD, /* 0x30-0x33 */ + 0xC7, 0xB4, 0xDA, 0xDE, 0xDA, 0xDF, 0xB9, 0xC8, /* 0x34-0x37 */ + 0xD8, 0x40, 0xD8, 0x41, 0xD8, 0x42, 0xD8, 0x43, /* 0x38-0x3B */ + 0xD8, 0x44, 0xD8, 0x45, 0xD8, 0x46, 0xD8, 0x47, /* 0x3C-0x3F */ + 0xD8, 0x48, 0xBB, 0xED, 0xD8, 0x49, 0xD8, 0x4A, /* 0x40-0x43 */ + 0xD8, 0x4B, 0xD8, 0x4C, 0xB6, 0xB9, 0xF4, 0xF8, /* 0x44-0x47 */ + 0xD8, 0x4D, 0xF4, 0xF9, 0xD8, 0x4E, 0xD8, 0x4F, /* 0x48-0x4B */ + 0xCD, 0xE3, 0xD8, 0x50, 0xD8, 0x51, 0xD8, 0x52, /* 0x4C-0x4F */ + 0xD8, 0x53, 0xD8, 0x54, 0xD8, 0x55, 0xD8, 0x56, /* 0x50-0x53 */ + 0xD8, 0x57, 0xF5, 0xB9, 0xD8, 0x58, 0xD8, 0x59, /* 0x54-0x57 */ + 0xD8, 0x5A, 0xD8, 0x5B, 0xEB, 0xE0, 0xD8, 0x5C, /* 0x58-0x5B */ + 0xD8, 0x5D, 0xD8, 0x5E, 0xD8, 0x5F, 0xD8, 0x60, /* 0x5C-0x5F */ + 0xD8, 0x61, 0xCF, 0xF3, 0xBB, 0xBF, 0xD8, 0x62, /* 0x60-0x63 */ + 0xD8, 0x63, 0xD8, 0x64, 0xD8, 0x65, 0xD8, 0x66, /* 0x64-0x67 */ + 0xD8, 0x67, 0xD8, 0x68, 0xBA, 0xC0, 0xD4, 0xA5, /* 0x68-0x6B */ + 0xD8, 
0x69, 0xD8, 0x6A, 0xD8, 0x6B, 0xD8, 0x6C, /* 0x6C-0x6F */ + 0xD8, 0x6D, 0xD8, 0x6E, 0xD8, 0x6F, 0xE1, 0xD9, /* 0x70-0x73 */ + 0xD8, 0x70, 0xD8, 0x71, 0xD8, 0x72, 0xD8, 0x73, /* 0x74-0x77 */ + 0xF5, 0xF4, 0xB1, 0xAA, 0xB2, 0xF2, 0xD8, 0x74, /* 0x78-0x7B */ + 0xD8, 0x75, 0xD8, 0x76, 0xD8, 0x77, 0xD8, 0x78, /* 0x7C-0x7F */ + + 0xD8, 0x79, 0xD8, 0x7A, 0xF5, 0xF5, 0xD8, 0x7B, /* 0x80-0x83 */ + 0xD8, 0x7C, 0xF5, 0xF7, 0xD8, 0x7D, 0xD8, 0x7E, /* 0x84-0x87 */ + 0xD8, 0x80, 0xBA, 0xD1, 0xF5, 0xF6, 0xD8, 0x81, /* 0x88-0x8B */ + 0xC3, 0xB2, 0xD8, 0x82, 0xD8, 0x83, 0xD8, 0x84, /* 0x8C-0x8F */ + 0xD8, 0x85, 0xD8, 0x86, 0xD8, 0x87, 0xD8, 0x88, /* 0x90-0x93 */ + 0xF5, 0xF9, 0xD8, 0x89, 0xD8, 0x8A, 0xD8, 0x8B, /* 0x94-0x97 */ + 0xF5, 0xF8, 0xD8, 0x8C, 0xD8, 0x8D, 0xD8, 0x8E, /* 0x98-0x9B */ + 0xD8, 0x8F, 0xD8, 0x90, 0xD8, 0x91, 0xD8, 0x92, /* 0x9C-0x9F */ + 0xD8, 0x93, 0xD8, 0x94, 0xD8, 0x95, 0xD8, 0x96, /* 0xA0-0xA3 */ + 0xD8, 0x97, 0xD8, 0x98, 0xD8, 0x99, 0xD8, 0x9A, /* 0xA4-0xA7 */ + 0xD8, 0x9B, 0xD8, 0x9C, 0xD8, 0x9D, 0xD8, 0x9E, /* 0xA8-0xAB */ + 0xD8, 0x9F, 0xD8, 0xA0, 0xD9, 0x40, 0xD9, 0x41, /* 0xAC-0xAF */ + 0xD9, 0x42, 0xD9, 0x43, 0xD9, 0x44, 0xD9, 0x45, /* 0xB0-0xB3 */ + 0xD9, 0x46, 0xD9, 0x47, 0xD9, 0x48, 0xD9, 0x49, /* 0xB4-0xB7 */ + 0xD9, 0x4A, 0xD9, 0x4B, 0xD9, 0x4C, 0xD9, 0x4D, /* 0xB8-0xBB */ + 0xD9, 0x4E, 0xD9, 0x4F, 0xD9, 0x50, 0xD9, 0x51, /* 0xBC-0xBF */ + 0xD9, 0x52, 0xD9, 0x53, 0xD9, 0x54, 0xD9, 0x55, /* 0xC0-0xC3 */ + 0xD9, 0x56, 0xD9, 0x57, 0xD9, 0x58, 0xD9, 0x59, /* 0xC4-0xC7 */ + 0xD9, 0x5A, 0xD9, 0x5B, 0xD9, 0x5C, 0xD9, 0x5D, /* 0xC8-0xCB */ + 0xD9, 0x5E, 0xD9, 0x5F, 0xD9, 0x60, 0xD9, 0x61, /* 0xCC-0xCF */ + 0xD9, 0x62, 0xD9, 0x63, 0xD9, 0x64, 0xD9, 0x65, /* 0xD0-0xD3 */ + 0xD9, 0x66, 0xD9, 0x67, 0xD9, 0x68, 0xD9, 0x69, /* 0xD4-0xD7 */ + 0xD9, 0x6A, 0xD9, 0x6B, 0xD9, 0x6C, 0xD9, 0x6D, /* 0xD8-0xDB */ + 0xD9, 0x6E, 0xD9, 0x6F, 0xD9, 0x70, 0xD9, 0x71, /* 0xDC-0xDF */ + 0xD9, 0x72, 0xD9, 0x73, 0xD9, 0x74, 0xD9, 0x75, /* 0xE0-0xE3 */ + 0xD9, 0x76, 0xD9, 0x77, 0xD9, 0x78, 0xD9, 0x79, /* 0xE4-0xE7 */ + 0xD9, 0x7A, 0xD9, 0x7B, 0xD9, 0x7C, 0xD9, 0x7D, /* 0xE8-0xEB */ + 0xD9, 0x7E, 0xD9, 0x80, 0xD9, 0x81, 0xD9, 0x82, /* 0xEC-0xEF */ + 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, /* 0xF0-0xF3 */ + 0xD9, 0x87, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x8A, /* 0xF4-0xF7 */ + 0xD9, 0x8B, 0xD9, 0x8C, 0xD9, 0x8D, 0xD9, 0x8E, /* 0xF8-0xFB */ + 0xD9, 0x8F, 0xD9, 0x90, 0xD9, 0x91, 0xD9, 0x92, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0xD9, 0x93, 0xD9, 0x94, 0xD9, 0x95, 0xD9, 0x96, /* 0x00-0x03 */ + 0xD9, 0x97, 0xD9, 0x98, 0xD9, 0x99, 0xD9, 0x9A, /* 0x04-0x07 */ + 0xD9, 0x9B, 0xD9, 0x9C, 0xD9, 0x9D, 0xD9, 0x9E, /* 0x08-0x0B */ + 0xD9, 0x9F, 0xD9, 0xA0, 0xDA, 0x40, 0xDA, 0x41, /* 0x0C-0x0F */ + 0xDA, 0x42, 0xDA, 0x43, 0xDA, 0x44, 0xDA, 0x45, /* 0x10-0x13 */ + 0xDA, 0x46, 0xDA, 0x47, 0xDA, 0x48, 0xDA, 0x49, /* 0x14-0x17 */ + 0xDA, 0x4A, 0xDA, 0x4B, 0xDA, 0x4C, 0xDA, 0x4D, /* 0x18-0x1B */ + 0xDA, 0x4E, 0xB1, 0xB4, 0xD5, 0xEA, 0xB8, 0xBA, /* 0x1C-0x1F */ + 0xDA, 0x4F, 0xB9, 0xB1, 0xB2, 0xC6, 0xD4, 0xF0, /* 0x20-0x23 */ + 0xCF, 0xCD, 0xB0, 0xDC, 0xD5, 0xCB, 0xBB, 0xF5, /* 0x24-0x27 */ + 0xD6, 0xCA, 0xB7, 0xB7, 0xCC, 0xB0, 0xC6, 0xB6, /* 0x28-0x2B */ + 0xB1, 0xE1, 0xB9, 0xBA, 0xD6, 0xFC, 0xB9, 0xE1, /* 0x2C-0x2F */ + 0xB7, 0xA1, 0xBC, 0xFA, 0xEA, 0xDA, 0xEA, 0xDB, /* 0x30-0x33 */ + 0xCC, 0xF9, 0xB9, 0xF3, 0xEA, 0xDC, 0xB4, 0xFB, /* 0x34-0x37 */ + 0xC3, 0xB3, 0xB7, 0xD1, 0xBA, 0xD8, 0xEA, 0xDD, /* 0x38-0x3B */ + 0xD4, 0xF4, 0xEA, 0xDE, 0xBC, 0xD6, 0xBB, 0xDF, /* 0x3C-0x3F */ + 0xEA, 
0xDF, 0xC1, 0xDE, 0xC2, 0xB8, 0xD4, 0xDF, /* 0x40-0x43 */ + 0xD7, 0xCA, 0xEA, 0xE0, 0xEA, 0xE1, 0xEA, 0xE4, /* 0x44-0x47 */ + 0xEA, 0xE2, 0xEA, 0xE3, 0xC9, 0xDE, 0xB8, 0xB3, /* 0x48-0x4B */ + 0xB6, 0xC4, 0xEA, 0xE5, 0xCA, 0xEA, 0xC9, 0xCD, /* 0x4C-0x4F */ + 0xB4, 0xCD, 0xDA, 0x50, 0xDA, 0x51, 0xE2, 0xD9, /* 0x50-0x53 */ + 0xC5, 0xE2, 0xEA, 0xE6, 0xC0, 0xB5, 0xDA, 0x52, /* 0x54-0x57 */ + 0xD7, 0xB8, 0xEA, 0xE7, 0xD7, 0xAC, 0xC8, 0xFC, /* 0x58-0x5B */ + 0xD8, 0xD3, 0xD8, 0xCD, 0xD4, 0xDE, 0xDA, 0x53, /* 0x5C-0x5F */ + 0xD4, 0xF9, 0xC9, 0xC4, 0xD3, 0xAE, 0xB8, 0xD3, /* 0x60-0x63 */ + 0xB3, 0xE0, 0xDA, 0x54, 0xC9, 0xE2, 0xF4, 0xF6, /* 0x64-0x67 */ + 0xDA, 0x55, 0xDA, 0x56, 0xDA, 0x57, 0xBA, 0xD5, /* 0x68-0x6B */ + 0xDA, 0x58, 0xF4, 0xF7, 0xDA, 0x59, 0xDA, 0x5A, /* 0x6C-0x6F */ + 0xD7, 0xDF, 0xDA, 0x5B, 0xDA, 0x5C, 0xF4, 0xF1, /* 0x70-0x73 */ + 0xB8, 0xB0, 0xD5, 0xD4, 0xB8, 0xCF, 0xC6, 0xF0, /* 0x74-0x77 */ + 0xDA, 0x5D, 0xDA, 0x5E, 0xDA, 0x5F, 0xDA, 0x60, /* 0x78-0x7B */ + 0xDA, 0x61, 0xDA, 0x62, 0xDA, 0x63, 0xDA, 0x64, /* 0x7C-0x7F */ + + 0xDA, 0x65, 0xB3, 0xC3, 0xDA, 0x66, 0xDA, 0x67, /* 0x80-0x83 */ + 0xF4, 0xF2, 0xB3, 0xAC, 0xDA, 0x68, 0xDA, 0x69, /* 0x84-0x87 */ + 0xDA, 0x6A, 0xDA, 0x6B, 0xD4, 0xBD, 0xC7, 0xF7, /* 0x88-0x8B */ + 0xDA, 0x6C, 0xDA, 0x6D, 0xDA, 0x6E, 0xDA, 0x6F, /* 0x8C-0x8F */ + 0xDA, 0x70, 0xF4, 0xF4, 0xDA, 0x71, 0xDA, 0x72, /* 0x90-0x93 */ + 0xF4, 0xF3, 0xDA, 0x73, 0xDA, 0x74, 0xDA, 0x75, /* 0x94-0x97 */ + 0xDA, 0x76, 0xDA, 0x77, 0xDA, 0x78, 0xDA, 0x79, /* 0x98-0x9B */ + 0xDA, 0x7A, 0xDA, 0x7B, 0xDA, 0x7C, 0xCC, 0xCB, /* 0x9C-0x9F */ + 0xDA, 0x7D, 0xDA, 0x7E, 0xDA, 0x80, 0xC8, 0xA4, /* 0xA0-0xA3 */ + 0xDA, 0x81, 0xDA, 0x82, 0xDA, 0x83, 0xDA, 0x84, /* 0xA4-0xA7 */ + 0xDA, 0x85, 0xDA, 0x86, 0xDA, 0x87, 0xDA, 0x88, /* 0xA8-0xAB */ + 0xDA, 0x89, 0xDA, 0x8A, 0xDA, 0x8B, 0xDA, 0x8C, /* 0xAC-0xAF */ + 0xDA, 0x8D, 0xF4, 0xF5, 0xDA, 0x8E, 0xD7, 0xE3, /* 0xB0-0xB3 */ + 0xC5, 0xBF, 0xF5, 0xC0, 0xDA, 0x8F, 0xDA, 0x90, /* 0xB4-0xB7 */ + 0xF5, 0xBB, 0xDA, 0x91, 0xF5, 0xC3, 0xDA, 0x92, /* 0xB8-0xBB */ + 0xF5, 0xC2, 0xDA, 0x93, 0xD6, 0xBA, 0xF5, 0xC1, /* 0xBC-0xBF */ + 0xDA, 0x94, 0xDA, 0x95, 0xDA, 0x96, 0xD4, 0xBE, /* 0xC0-0xC3 */ + 0xF5, 0xC4, 0xDA, 0x97, 0xF5, 0xCC, 0xDA, 0x98, /* 0xC4-0xC7 */ + 0xDA, 0x99, 0xDA, 0x9A, 0xDA, 0x9B, 0xB0, 0xCF, /* 0xC8-0xCB */ + 0xB5, 0xF8, 0xDA, 0x9C, 0xF5, 0xC9, 0xF5, 0xCA, /* 0xCC-0xCF */ + 0xDA, 0x9D, 0xC5, 0xDC, 0xDA, 0x9E, 0xDA, 0x9F, /* 0xD0-0xD3 */ + 0xDA, 0xA0, 0xDB, 0x40, 0xF5, 0xC5, 0xF5, 0xC6, /* 0xD4-0xD7 */ + 0xDB, 0x41, 0xDB, 0x42, 0xF5, 0xC7, 0xF5, 0xCB, /* 0xD8-0xDB */ + 0xDB, 0x43, 0xBE, 0xE0, 0xF5, 0xC8, 0xB8, 0xFA, /* 0xDC-0xDF */ + 0xDB, 0x44, 0xDB, 0x45, 0xDB, 0x46, 0xF5, 0xD0, /* 0xE0-0xE3 */ + 0xF5, 0xD3, 0xDB, 0x47, 0xDB, 0x48, 0xDB, 0x49, /* 0xE4-0xE7 */ + 0xBF, 0xE7, 0xDB, 0x4A, 0xB9, 0xF2, 0xF5, 0xBC, /* 0xE8-0xEB */ + 0xF5, 0xCD, 0xDB, 0x4B, 0xDB, 0x4C, 0xC2, 0xB7, /* 0xEC-0xEF */ + 0xDB, 0x4D, 0xDB, 0x4E, 0xDB, 0x4F, 0xCC, 0xF8, /* 0xF0-0xF3 */ + 0xDB, 0x50, 0xBC, 0xF9, 0xDB, 0x51, 0xF5, 0xCE, /* 0xF4-0xF7 */ + 0xF5, 0xCF, 0xF5, 0xD1, 0xB6, 0xE5, 0xF5, 0xD2, /* 0xF8-0xFB */ + 0xDB, 0x52, 0xF5, 0xD5, 0xDB, 0x53, 0xDB, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0xDB, 0x55, 0xDB, 0x56, 0xDB, 0x57, 0xDB, 0x58, /* 0x00-0x03 */ + 0xDB, 0x59, 0xF5, 0xBD, 0xDB, 0x5A, 0xDB, 0x5B, /* 0x04-0x07 */ + 0xDB, 0x5C, 0xF5, 0xD4, 0xD3, 0xBB, 0xDB, 0x5D, /* 0x08-0x0B */ + 0xB3, 0xEC, 0xDB, 0x5E, 0xDB, 0x5F, 0xCC, 0xA4, /* 0x0C-0x0F */ + 0xDB, 0x60, 0xDB, 0x61, 0xDB, 0x62, 0xDB, 0x63, /* 0x10-0x13 */ + 0xF5, 
0xD6, 0xDB, 0x64, 0xDB, 0x65, 0xDB, 0x66, /* 0x14-0x17 */ + 0xDB, 0x67, 0xDB, 0x68, 0xDB, 0x69, 0xDB, 0x6A, /* 0x18-0x1B */ + 0xDB, 0x6B, 0xF5, 0xD7, 0xBE, 0xE1, 0xF5, 0xD8, /* 0x1C-0x1F */ + 0xDB, 0x6C, 0xDB, 0x6D, 0xCC, 0xDF, 0xF5, 0xDB, /* 0x20-0x23 */ + 0xDB, 0x6E, 0xDB, 0x6F, 0xDB, 0x70, 0xDB, 0x71, /* 0x24-0x27 */ + 0xDB, 0x72, 0xB2, 0xC8, 0xD7, 0xD9, 0xDB, 0x73, /* 0x28-0x2B */ + 0xF5, 0xD9, 0xDB, 0x74, 0xF5, 0xDA, 0xF5, 0xDC, /* 0x2C-0x2F */ + 0xDB, 0x75, 0xF5, 0xE2, 0xDB, 0x76, 0xDB, 0x77, /* 0x30-0x33 */ + 0xDB, 0x78, 0xF5, 0xE0, 0xDB, 0x79, 0xDB, 0x7A, /* 0x34-0x37 */ + 0xDB, 0x7B, 0xF5, 0xDF, 0xF5, 0xDD, 0xDB, 0x7C, /* 0x38-0x3B */ + 0xDB, 0x7D, 0xF5, 0xE1, 0xDB, 0x7E, 0xDB, 0x80, /* 0x3C-0x3F */ + 0xF5, 0xDE, 0xF5, 0xE4, 0xF5, 0xE5, 0xDB, 0x81, /* 0x40-0x43 */ + 0xCC, 0xE3, 0xDB, 0x82, 0xDB, 0x83, 0xE5, 0xBF, /* 0x44-0x47 */ + 0xB5, 0xB8, 0xF5, 0xE3, 0xF5, 0xE8, 0xCC, 0xA3, /* 0x48-0x4B */ + 0xDB, 0x84, 0xDB, 0x85, 0xDB, 0x86, 0xDB, 0x87, /* 0x4C-0x4F */ + 0xDB, 0x88, 0xF5, 0xE6, 0xF5, 0xE7, 0xDB, 0x89, /* 0x50-0x53 */ + 0xDB, 0x8A, 0xDB, 0x8B, 0xDB, 0x8C, 0xDB, 0x8D, /* 0x54-0x57 */ + 0xDB, 0x8E, 0xF5, 0xBE, 0xDB, 0x8F, 0xDB, 0x90, /* 0x58-0x5B */ + 0xDB, 0x91, 0xDB, 0x92, 0xDB, 0x93, 0xDB, 0x94, /* 0x5C-0x5F */ + 0xDB, 0x95, 0xDB, 0x96, 0xDB, 0x97, 0xDB, 0x98, /* 0x60-0x63 */ + 0xDB, 0x99, 0xDB, 0x9A, 0xB1, 0xC4, 0xDB, 0x9B, /* 0x64-0x67 */ + 0xDB, 0x9C, 0xF5, 0xBF, 0xDB, 0x9D, 0xDB, 0x9E, /* 0x68-0x6B */ + 0xB5, 0xC5, 0xB2, 0xE4, 0xDB, 0x9F, 0xF5, 0xEC, /* 0x6C-0x6F */ + 0xF5, 0xE9, 0xDB, 0xA0, 0xB6, 0xD7, 0xDC, 0x40, /* 0x70-0x73 */ + 0xF5, 0xED, 0xDC, 0x41, 0xF5, 0xEA, 0xDC, 0x42, /* 0x74-0x77 */ + 0xDC, 0x43, 0xDC, 0x44, 0xDC, 0x45, 0xDC, 0x46, /* 0x78-0x7B */ + 0xF5, 0xEB, 0xDC, 0x47, 0xDC, 0x48, 0xB4, 0xDA, /* 0x7C-0x7F */ + + 0xDC, 0x49, 0xD4, 0xEA, 0xDC, 0x4A, 0xDC, 0x4B, /* 0x80-0x83 */ + 0xDC, 0x4C, 0xF5, 0xEE, 0xDC, 0x4D, 0xB3, 0xF9, /* 0x84-0x87 */ + 0xDC, 0x4E, 0xDC, 0x4F, 0xDC, 0x50, 0xDC, 0x51, /* 0x88-0x8B */ + 0xDC, 0x52, 0xDC, 0x53, 0xDC, 0x54, 0xF5, 0xEF, /* 0x8C-0x8F */ + 0xF5, 0xF1, 0xDC, 0x55, 0xDC, 0x56, 0xDC, 0x57, /* 0x90-0x93 */ + 0xF5, 0xF0, 0xDC, 0x58, 0xDC, 0x59, 0xDC, 0x5A, /* 0x94-0x97 */ + 0xDC, 0x5B, 0xDC, 0x5C, 0xDC, 0x5D, 0xDC, 0x5E, /* 0x98-0x9B */ + 0xF5, 0xF2, 0xDC, 0x5F, 0xF5, 0xF3, 0xDC, 0x60, /* 0x9C-0x9F */ + 0xDC, 0x61, 0xDC, 0x62, 0xDC, 0x63, 0xDC, 0x64, /* 0xA0-0xA3 */ + 0xDC, 0x65, 0xDC, 0x66, 0xDC, 0x67, 0xDC, 0x68, /* 0xA4-0xA7 */ + 0xDC, 0x69, 0xDC, 0x6A, 0xDC, 0x6B, 0xC9, 0xED, /* 0xA8-0xAB */ + 0xB9, 0xAA, 0xDC, 0x6C, 0xDC, 0x6D, 0xC7, 0xFB, /* 0xAC-0xAF */ + 0xDC, 0x6E, 0xDC, 0x6F, 0xB6, 0xE3, 0xDC, 0x70, /* 0xB0-0xB3 */ + 0xDC, 0x71, 0xDC, 0x72, 0xDC, 0x73, 0xDC, 0x74, /* 0xB4-0xB7 */ + 0xDC, 0x75, 0xDC, 0x76, 0xCC, 0xC9, 0xDC, 0x77, /* 0xB8-0xBB */ + 0xDC, 0x78, 0xDC, 0x79, 0xDC, 0x7A, 0xDC, 0x7B, /* 0xBC-0xBF */ + 0xDC, 0x7C, 0xDC, 0x7D, 0xDC, 0x7E, 0xDC, 0x80, /* 0xC0-0xC3 */ + 0xDC, 0x81, 0xDC, 0x82, 0xDC, 0x83, 0xDC, 0x84, /* 0xC4-0xC7 */ + 0xDC, 0x85, 0xDC, 0x86, 0xDC, 0x87, 0xDC, 0x88, /* 0xC8-0xCB */ + 0xDC, 0x89, 0xDC, 0x8A, 0xEA, 0xA6, 0xDC, 0x8B, /* 0xCC-0xCF */ + 0xDC, 0x8C, 0xDC, 0x8D, 0xDC, 0x8E, 0xDC, 0x8F, /* 0xD0-0xD3 */ + 0xDC, 0x90, 0xDC, 0x91, 0xDC, 0x92, 0xDC, 0x93, /* 0xD4-0xD7 */ + 0xDC, 0x94, 0xDC, 0x95, 0xDC, 0x96, 0xDC, 0x97, /* 0xD8-0xDB */ + 0xDC, 0x98, 0xDC, 0x99, 0xDC, 0x9A, 0xDC, 0x9B, /* 0xDC-0xDF */ + 0xDC, 0x9C, 0xDC, 0x9D, 0xDC, 0x9E, 0xDC, 0x9F, /* 0xE0-0xE3 */ + 0xDC, 0xA0, 0xDD, 0x40, 0xDD, 0x41, 0xDD, 0x42, /* 0xE4-0xE7 */ + 0xDD, 0x43, 0xDD, 0x44, 0xDD, 0x45, 0xDD, 0x46, /* 0xE8-0xEB 
*/ + 0xDD, 0x47, 0xDD, 0x48, 0xDD, 0x49, 0xDD, 0x4A, /* 0xEC-0xEF */ + 0xDD, 0x4B, 0xDD, 0x4C, 0xDD, 0x4D, 0xDD, 0x4E, /* 0xF0-0xF3 */ + 0xDD, 0x4F, 0xDD, 0x50, 0xDD, 0x51, 0xDD, 0x52, /* 0xF4-0xF7 */ + 0xDD, 0x53, 0xDD, 0x54, 0xDD, 0x55, 0xDD, 0x56, /* 0xF8-0xFB */ + 0xDD, 0x57, 0xDD, 0x58, 0xDD, 0x59, 0xDD, 0x5A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0xDD, 0x5B, 0xDD, 0x5C, 0xDD, 0x5D, 0xDD, 0x5E, /* 0x00-0x03 */ + 0xDD, 0x5F, 0xDD, 0x60, 0xDD, 0x61, 0xDD, 0x62, /* 0x04-0x07 */ + 0xDD, 0x63, 0xDD, 0x64, 0xDD, 0x65, 0xDD, 0x66, /* 0x08-0x0B */ + 0xDD, 0x67, 0xDD, 0x68, 0xDD, 0x69, 0xDD, 0x6A, /* 0x0C-0x0F */ + 0xDD, 0x6B, 0xDD, 0x6C, 0xDD, 0x6D, 0xDD, 0x6E, /* 0x10-0x13 */ + 0xDD, 0x6F, 0xDD, 0x70, 0xDD, 0x71, 0xDD, 0x72, /* 0x14-0x17 */ + 0xDD, 0x73, 0xDD, 0x74, 0xDD, 0x75, 0xDD, 0x76, /* 0x18-0x1B */ + 0xDD, 0x77, 0xDD, 0x78, 0xDD, 0x79, 0xDD, 0x7A, /* 0x1C-0x1F */ + 0xDD, 0x7B, 0xDD, 0x7C, 0xDD, 0x7D, 0xDD, 0x7E, /* 0x20-0x23 */ + 0xDD, 0x80, 0xDD, 0x81, 0xDD, 0x82, 0xDD, 0x83, /* 0x24-0x27 */ + 0xDD, 0x84, 0xDD, 0x85, 0xDD, 0x86, 0xDD, 0x87, /* 0x28-0x2B */ + 0xDD, 0x88, 0xDD, 0x89, 0xDD, 0x8A, 0xDD, 0x8B, /* 0x2C-0x2F */ + 0xDD, 0x8C, 0xDD, 0x8D, 0xDD, 0x8E, 0xDD, 0x8F, /* 0x30-0x33 */ + 0xDD, 0x90, 0xDD, 0x91, 0xDD, 0x92, 0xDD, 0x93, /* 0x34-0x37 */ + 0xDD, 0x94, 0xDD, 0x95, 0xDD, 0x96, 0xDD, 0x97, /* 0x38-0x3B */ + 0xDD, 0x98, 0xDD, 0x99, 0xDD, 0x9A, 0xDD, 0x9B, /* 0x3C-0x3F */ + 0xDD, 0x9C, 0xDD, 0x9D, 0xDD, 0x9E, 0xDD, 0x9F, /* 0x40-0x43 */ + 0xDD, 0xA0, 0xDE, 0x40, 0xDE, 0x41, 0xDE, 0x42, /* 0x44-0x47 */ + 0xDE, 0x43, 0xDE, 0x44, 0xDE, 0x45, 0xDE, 0x46, /* 0x48-0x4B */ + 0xDE, 0x47, 0xDE, 0x48, 0xDE, 0x49, 0xDE, 0x4A, /* 0x4C-0x4F */ + 0xDE, 0x4B, 0xDE, 0x4C, 0xDE, 0x4D, 0xDE, 0x4E, /* 0x50-0x53 */ + 0xDE, 0x4F, 0xDE, 0x50, 0xDE, 0x51, 0xDE, 0x52, /* 0x54-0x57 */ + 0xDE, 0x53, 0xDE, 0x54, 0xDE, 0x55, 0xDE, 0x56, /* 0x58-0x5B */ + 0xDE, 0x57, 0xDE, 0x58, 0xDE, 0x59, 0xDE, 0x5A, /* 0x5C-0x5F */ + 0xDE, 0x5B, 0xDE, 0x5C, 0xDE, 0x5D, 0xDE, 0x5E, /* 0x60-0x63 */ + 0xDE, 0x5F, 0xDE, 0x60, 0xB3, 0xB5, 0xD4, 0xFE, /* 0x64-0x67 */ + 0xB9, 0xEC, 0xD0, 0xF9, 0xDE, 0x61, 0xE9, 0xED, /* 0x68-0x6B */ + 0xD7, 0xAA, 0xE9, 0xEE, 0xC2, 0xD6, 0xC8, 0xED, /* 0x6C-0x6F */ + 0xBA, 0xE4, 0xE9, 0xEF, 0xE9, 0xF0, 0xE9, 0xF1, /* 0x70-0x73 */ + 0xD6, 0xE1, 0xE9, 0xF2, 0xE9, 0xF3, 0xE9, 0xF5, /* 0x74-0x77 */ + 0xE9, 0xF4, 0xE9, 0xF6, 0xE9, 0xF7, 0xC7, 0xE1, /* 0x78-0x7B */ + 0xE9, 0xF8, 0xD4, 0xD8, 0xE9, 0xF9, 0xBD, 0xCE, /* 0x7C-0x7F */ + + 0xDE, 0x62, 0xE9, 0xFA, 0xE9, 0xFB, 0xBD, 0xCF, /* 0x80-0x83 */ + 0xE9, 0xFC, 0xB8, 0xA8, 0xC1, 0xBE, 0xE9, 0xFD, /* 0x84-0x87 */ + 0xB1, 0xB2, 0xBB, 0xD4, 0xB9, 0xF5, 0xE9, 0xFE, /* 0x88-0x8B */ + 0xDE, 0x63, 0xEA, 0xA1, 0xEA, 0xA2, 0xEA, 0xA3, /* 0x8C-0x8F */ + 0xB7, 0xF8, 0xBC, 0xAD, 0xDE, 0x64, 0xCA, 0xE4, /* 0x90-0x93 */ + 0xE0, 0xCE, 0xD4, 0xAF, 0xCF, 0xBD, 0xD5, 0xB7, /* 0x94-0x97 */ + 0xEA, 0xA4, 0xD5, 0xDE, 0xEA, 0xA5, 0xD0, 0xC1, /* 0x98-0x9B */ + 0xB9, 0xBC, 0xDE, 0x65, 0xB4, 0xC7, 0xB1, 0xD9, /* 0x9C-0x9F */ + 0xDE, 0x66, 0xDE, 0x67, 0xDE, 0x68, 0xC0, 0xB1, /* 0xA0-0xA3 */ + 0xDE, 0x69, 0xDE, 0x6A, 0xDE, 0x6B, 0xDE, 0x6C, /* 0xA4-0xA7 */ + 0xB1, 0xE6, 0xB1, 0xE7, 0xDE, 0x6D, 0xB1, 0xE8, /* 0xA8-0xAB */ + 0xDE, 0x6E, 0xDE, 0x6F, 0xDE, 0x70, 0xDE, 0x71, /* 0xAC-0xAF */ + 0xB3, 0xBD, 0xC8, 0xE8, 0xDE, 0x72, 0xDE, 0x73, /* 0xB0-0xB3 */ + 0xDE, 0x74, 0xDE, 0x75, 0xE5, 0xC1, 0xDE, 0x76, /* 0xB4-0xB7 */ + 0xDE, 0x77, 0xB1, 0xDF, 0xDE, 0x78, 0xDE, 0x79, /* 0xB8-0xBB */ + 0xDE, 0x7A, 0xC1, 0xC9, 0xB4, 0xEF, 0xDE, 0x7B, /* 0xBC-0xBF */ + 
0xDE, 0x7C, 0xC7, 0xA8, 0xD3, 0xD8, 0xDE, 0x7D, /* 0xC0-0xC3 */ + 0xC6, 0xF9, 0xD1, 0xB8, 0xDE, 0x7E, 0xB9, 0xFD, /* 0xC4-0xC7 */ + 0xC2, 0xF5, 0xDE, 0x80, 0xDE, 0x81, 0xDE, 0x82, /* 0xC8-0xCB */ + 0xDE, 0x83, 0xDE, 0x84, 0xD3, 0xAD, 0xDE, 0x85, /* 0xCC-0xCF */ + 0xD4, 0xCB, 0xBD, 0xFC, 0xDE, 0x86, 0xE5, 0xC2, /* 0xD0-0xD3 */ + 0xB7, 0xB5, 0xE5, 0xC3, 0xDE, 0x87, 0xDE, 0x88, /* 0xD4-0xD7 */ + 0xBB, 0xB9, 0xD5, 0xE2, 0xDE, 0x89, 0xBD, 0xF8, /* 0xD8-0xDB */ + 0xD4, 0xB6, 0xCE, 0xA5, 0xC1, 0xAC, 0xB3, 0xD9, /* 0xDC-0xDF */ + 0xDE, 0x8A, 0xDE, 0x8B, 0xCC, 0xF6, 0xDE, 0x8C, /* 0xE0-0xE3 */ + 0xE5, 0xC6, 0xE5, 0xC4, 0xE5, 0xC8, 0xDE, 0x8D, /* 0xE4-0xE7 */ + 0xE5, 0xCA, 0xE5, 0xC7, 0xB5, 0xCF, 0xC6, 0xC8, /* 0xE8-0xEB */ + 0xDE, 0x8E, 0xB5, 0xFC, 0xE5, 0xC5, 0xDE, 0x8F, /* 0xEC-0xEF */ + 0xCA, 0xF6, 0xDE, 0x90, 0xDE, 0x91, 0xE5, 0xC9, /* 0xF0-0xF3 */ + 0xDE, 0x92, 0xDE, 0x93, 0xDE, 0x94, 0xC3, 0xD4, /* 0xF4-0xF7 */ + 0xB1, 0xC5, 0xBC, 0xA3, 0xDE, 0x95, 0xDE, 0x96, /* 0xF8-0xFB */ + 0xDE, 0x97, 0xD7, 0xB7, 0xDE, 0x98, 0xDE, 0x99, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xCD, 0xCB, 0xCB, 0xCD, 0xCA, 0xCA, 0xCC, 0xD3, /* 0x00-0x03 */ + 0xE5, 0xCC, 0xE5, 0xCB, 0xC4, 0xE6, 0xDE, 0x9A, /* 0x04-0x07 */ + 0xDE, 0x9B, 0xD1, 0xA1, 0xD1, 0xB7, 0xE5, 0xCD, /* 0x08-0x0B */ + 0xDE, 0x9C, 0xE5, 0xD0, 0xDE, 0x9D, 0xCD, 0xB8, /* 0x0C-0x0F */ + 0xD6, 0xF0, 0xE5, 0xCF, 0xB5, 0xDD, 0xDE, 0x9E, /* 0x10-0x13 */ + 0xCD, 0xBE, 0xDE, 0x9F, 0xE5, 0xD1, 0xB6, 0xBA, /* 0x14-0x17 */ + 0xDE, 0xA0, 0xDF, 0x40, 0xCD, 0xA8, 0xB9, 0xE4, /* 0x18-0x1B */ + 0xDF, 0x41, 0xCA, 0xC5, 0xB3, 0xD1, 0xCB, 0xD9, /* 0x1C-0x1F */ + 0xD4, 0xEC, 0xE5, 0xD2, 0xB7, 0xEA, 0xDF, 0x42, /* 0x20-0x23 */ + 0xDF, 0x43, 0xDF, 0x44, 0xE5, 0xCE, 0xDF, 0x45, /* 0x24-0x27 */ + 0xDF, 0x46, 0xDF, 0x47, 0xDF, 0x48, 0xDF, 0x49, /* 0x28-0x2B */ + 0xDF, 0x4A, 0xE5, 0xD5, 0xB4, 0xFE, 0xE5, 0xD6, /* 0x2C-0x2F */ + 0xDF, 0x4B, 0xDF, 0x4C, 0xDF, 0x4D, 0xDF, 0x4E, /* 0x30-0x33 */ + 0xDF, 0x4F, 0xE5, 0xD3, 0xE5, 0xD4, 0xDF, 0x50, /* 0x34-0x37 */ + 0xD2, 0xDD, 0xDF, 0x51, 0xDF, 0x52, 0xC2, 0xDF, /* 0x38-0x3B */ + 0xB1, 0xC6, 0xDF, 0x53, 0xD3, 0xE2, 0xDF, 0x54, /* 0x3C-0x3F */ + 0xDF, 0x55, 0xB6, 0xDD, 0xCB, 0xEC, 0xDF, 0x56, /* 0x40-0x43 */ + 0xE5, 0xD7, 0xDF, 0x57, 0xDF, 0x58, 0xD3, 0xF6, /* 0x44-0x47 */ + 0xDF, 0x59, 0xDF, 0x5A, 0xDF, 0x5B, 0xDF, 0x5C, /* 0x48-0x4B */ + 0xDF, 0x5D, 0xB1, 0xE9, 0xDF, 0x5E, 0xB6, 0xF4, /* 0x4C-0x4F */ + 0xE5, 0xDA, 0xE5, 0xD8, 0xE5, 0xD9, 0xB5, 0xC0, /* 0x50-0x53 */ + 0xDF, 0x5F, 0xDF, 0x60, 0xDF, 0x61, 0xD2, 0xC5, /* 0x54-0x57 */ + 0xE5, 0xDC, 0xDF, 0x62, 0xDF, 0x63, 0xE5, 0xDE, /* 0x58-0x5B */ + 0xDF, 0x64, 0xDF, 0x65, 0xDF, 0x66, 0xDF, 0x67, /* 0x5C-0x5F */ + 0xDF, 0x68, 0xDF, 0x69, 0xE5, 0xDD, 0xC7, 0xB2, /* 0x60-0x63 */ + 0xDF, 0x6A, 0xD2, 0xA3, 0xDF, 0x6B, 0xDF, 0x6C, /* 0x64-0x67 */ + 0xE5, 0xDB, 0xDF, 0x6D, 0xDF, 0x6E, 0xDF, 0x6F, /* 0x68-0x6B */ + 0xDF, 0x70, 0xD4, 0xE2, 0xD5, 0xDA, 0xDF, 0x71, /* 0x6C-0x6F */ + 0xDF, 0x72, 0xDF, 0x73, 0xDF, 0x74, 0xDF, 0x75, /* 0x70-0x73 */ + 0xE5, 0xE0, 0xD7, 0xF1, 0xDF, 0x76, 0xDF, 0x77, /* 0x74-0x77 */ + 0xDF, 0x78, 0xDF, 0x79, 0xDF, 0x7A, 0xDF, 0x7B, /* 0x78-0x7B */ + 0xDF, 0x7C, 0xE5, 0xE1, 0xDF, 0x7D, 0xB1, 0xDC, /* 0x7C-0x7F */ + + 0xD1, 0xFB, 0xDF, 0x7E, 0xE5, 0xE2, 0xE5, 0xE4, /* 0x80-0x83 */ + 0xDF, 0x80, 0xDF, 0x81, 0xDF, 0x82, 0xDF, 0x83, /* 0x84-0x87 */ + 0xE5, 0xE3, 0xDF, 0x84, 0xDF, 0x85, 0xE5, 0xE5, /* 0x88-0x8B */ + 0xDF, 0x86, 0xDF, 0x87, 0xDF, 0x88, 0xDF, 0x89, /* 0x8C-0x8F */ + 0xDF, 0x8A, 0xD2, 0xD8, 0xDF, 0x8B, 0xB5, 0xCB, /* 0x90-0x93 */ + 
0xDF, 0x8C, 0xE7, 0xDF, 0xDF, 0x8D, 0xDA, 0xF5, /* 0x94-0x97 */ + 0xDF, 0x8E, 0xDA, 0xF8, 0xDF, 0x8F, 0xDA, 0xF6, /* 0x98-0x9B */ + 0xDF, 0x90, 0xDA, 0xF7, 0xDF, 0x91, 0xDF, 0x92, /* 0x9C-0x9F */ + 0xDF, 0x93, 0xDA, 0xFA, 0xD0, 0xCF, 0xC4, 0xC7, /* 0xA0-0xA3 */ + 0xDF, 0x94, 0xDF, 0x95, 0xB0, 0xEE, 0xDF, 0x96, /* 0xA4-0xA7 */ + 0xDF, 0x97, 0xDF, 0x98, 0xD0, 0xB0, 0xDF, 0x99, /* 0xA8-0xAB */ + 0xDA, 0xF9, 0xDF, 0x9A, 0xD3, 0xCA, 0xBA, 0xAA, /* 0xAC-0xAF */ + 0xDB, 0xA2, 0xC7, 0xF1, 0xDF, 0x9B, 0xDA, 0xFC, /* 0xB0-0xB3 */ + 0xDA, 0xFB, 0xC9, 0xDB, 0xDA, 0xFD, 0xDF, 0x9C, /* 0xB4-0xB7 */ + 0xDB, 0xA1, 0xD7, 0xDE, 0xDA, 0xFE, 0xC1, 0xDA, /* 0xB8-0xBB */ + 0xDF, 0x9D, 0xDF, 0x9E, 0xDB, 0xA5, 0xDF, 0x9F, /* 0xBC-0xBF */ + 0xDF, 0xA0, 0xD3, 0xF4, 0xE0, 0x40, 0xE0, 0x41, /* 0xC0-0xC3 */ + 0xDB, 0xA7, 0xDB, 0xA4, 0xE0, 0x42, 0xDB, 0xA8, /* 0xC4-0xC7 */ + 0xE0, 0x43, 0xE0, 0x44, 0xBD, 0xBC, 0xE0, 0x45, /* 0xC8-0xCB */ + 0xE0, 0x46, 0xE0, 0x47, 0xC0, 0xC9, 0xDB, 0xA3, /* 0xCC-0xCF */ + 0xDB, 0xA6, 0xD6, 0xA3, 0xE0, 0x48, 0xDB, 0xA9, /* 0xD0-0xD3 */ + 0xE0, 0x49, 0xE0, 0x4A, 0xE0, 0x4B, 0xDB, 0xAD, /* 0xD4-0xD7 */ + 0xE0, 0x4C, 0xE0, 0x4D, 0xE0, 0x4E, 0xDB, 0xAE, /* 0xD8-0xDB */ + 0xDB, 0xAC, 0xBA, 0xC2, 0xE0, 0x4F, 0xE0, 0x50, /* 0xDC-0xDF */ + 0xE0, 0x51, 0xBF, 0xA4, 0xDB, 0xAB, 0xE0, 0x52, /* 0xE0-0xE3 */ + 0xE0, 0x53, 0xE0, 0x54, 0xDB, 0xAA, 0xD4, 0xC7, /* 0xE4-0xE7 */ + 0xB2, 0xBF, 0xE0, 0x55, 0xE0, 0x56, 0xDB, 0xAF, /* 0xE8-0xEB */ + 0xE0, 0x57, 0xB9, 0xF9, 0xE0, 0x58, 0xDB, 0xB0, /* 0xEC-0xEF */ + 0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x5B, 0xE0, 0x5C, /* 0xF0-0xF3 */ + 0xB3, 0xBB, 0xE0, 0x5D, 0xE0, 0x5E, 0xE0, 0x5F, /* 0xF4-0xF7 */ + 0xB5, 0xA6, 0xE0, 0x60, 0xE0, 0x61, 0xE0, 0x62, /* 0xF8-0xFB */ + 0xE0, 0x63, 0xB6, 0xBC, 0xDB, 0xB1, 0xE0, 0x64, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0xE0, 0x65, 0xE0, 0x66, 0xB6, 0xF5, 0xE0, 0x67, /* 0x00-0x03 */ + 0xDB, 0xB2, 0xE0, 0x68, 0xE0, 0x69, 0xE0, 0x6A, /* 0x04-0x07 */ + 0xE0, 0x6B, 0xE0, 0x6C, 0xE0, 0x6D, 0xE0, 0x6E, /* 0x08-0x0B */ + 0xE0, 0x6F, 0xE0, 0x70, 0xE0, 0x71, 0xE0, 0x72, /* 0x0C-0x0F */ + 0xE0, 0x73, 0xE0, 0x74, 0xE0, 0x75, 0xE0, 0x76, /* 0x10-0x13 */ + 0xE0, 0x77, 0xE0, 0x78, 0xE0, 0x79, 0xE0, 0x7A, /* 0x14-0x17 */ + 0xE0, 0x7B, 0xB1, 0xC9, 0xE0, 0x7C, 0xE0, 0x7D, /* 0x18-0x1B */ + 0xE0, 0x7E, 0xE0, 0x80, 0xDB, 0xB4, 0xE0, 0x81, /* 0x1C-0x1F */ + 0xE0, 0x82, 0xE0, 0x83, 0xDB, 0xB3, 0xDB, 0xB5, /* 0x20-0x23 */ + 0xE0, 0x84, 0xE0, 0x85, 0xE0, 0x86, 0xE0, 0x87, /* 0x24-0x27 */ + 0xE0, 0x88, 0xE0, 0x89, 0xE0, 0x8A, 0xE0, 0x8B, /* 0x28-0x2B */ + 0xE0, 0x8C, 0xE0, 0x8D, 0xE0, 0x8E, 0xDB, 0xB7, /* 0x2C-0x2F */ + 0xE0, 0x8F, 0xDB, 0xB6, 0xE0, 0x90, 0xE0, 0x91, /* 0x30-0x33 */ + 0xE0, 0x92, 0xE0, 0x93, 0xE0, 0x94, 0xE0, 0x95, /* 0x34-0x37 */ + 0xE0, 0x96, 0xDB, 0xB8, 0xE0, 0x97, 0xE0, 0x98, /* 0x38-0x3B */ + 0xE0, 0x99, 0xE0, 0x9A, 0xE0, 0x9B, 0xE0, 0x9C, /* 0x3C-0x3F */ + 0xE0, 0x9D, 0xE0, 0x9E, 0xE0, 0x9F, 0xDB, 0xB9, /* 0x40-0x43 */ + 0xE0, 0xA0, 0xE1, 0x40, 0xDB, 0xBA, 0xE1, 0x41, /* 0x44-0x47 */ + 0xE1, 0x42, 0xD3, 0xCF, 0xF4, 0xFA, 0xC7, 0xF5, /* 0x48-0x4B */ + 0xD7, 0xC3, 0xC5, 0xE4, 0xF4, 0xFC, 0xF4, 0xFD, /* 0x4C-0x4F */ + 0xF4, 0xFB, 0xE1, 0x43, 0xBE, 0xC6, 0xE1, 0x44, /* 0x50-0x53 */ + 0xE1, 0x45, 0xE1, 0x46, 0xE1, 0x47, 0xD0, 0xEF, /* 0x54-0x57 */ + 0xE1, 0x48, 0xE1, 0x49, 0xB7, 0xD3, 0xE1, 0x4A, /* 0x58-0x5B */ + 0xE1, 0x4B, 0xD4, 0xCD, 0xCC, 0xAA, 0xE1, 0x4C, /* 0x5C-0x5F */ + 0xE1, 0x4D, 0xF5, 0xA2, 0xF5, 0xA1, 0xBA, 0xA8, /* 0x60-0x63 */ + 0xF4, 0xFE, 0xCB, 0xD6, 0xE1, 0x4E, 0xE1, 0x4F, /* 0x64-0x67 */ + 0xE1, 
0x50, 0xF5, 0xA4, 0xC0, 0xD2, 0xE1, 0x51, /* 0x68-0x6B */ + 0xB3, 0xEA, 0xE1, 0x52, 0xCD, 0xAA, 0xF5, 0xA5, /* 0x6C-0x6F */ + 0xF5, 0xA3, 0xBD, 0xB4, 0xF5, 0xA8, 0xE1, 0x53, /* 0x70-0x73 */ + 0xF5, 0xA9, 0xBD, 0xCD, 0xC3, 0xB8, 0xBF, 0xE1, /* 0x74-0x77 */ + 0xCB, 0xE1, 0xF5, 0xAA, 0xE1, 0x54, 0xE1, 0x55, /* 0x78-0x7B */ + 0xE1, 0x56, 0xF5, 0xA6, 0xF5, 0xA7, 0xC4, 0xF0, /* 0x7C-0x7F */ + + 0xE1, 0x57, 0xE1, 0x58, 0xE1, 0x59, 0xE1, 0x5A, /* 0x80-0x83 */ + 0xE1, 0x5B, 0xF5, 0xAC, 0xE1, 0x5C, 0xB4, 0xBC, /* 0x84-0x87 */ + 0xE1, 0x5D, 0xD7, 0xED, 0xE1, 0x5E, 0xB4, 0xD7, /* 0x88-0x8B */ + 0xF5, 0xAB, 0xF5, 0xAE, 0xE1, 0x5F, 0xE1, 0x60, /* 0x8C-0x8F */ + 0xF5, 0xAD, 0xF5, 0xAF, 0xD0, 0xD1, 0xE1, 0x61, /* 0x90-0x93 */ + 0xE1, 0x62, 0xE1, 0x63, 0xE1, 0x64, 0xE1, 0x65, /* 0x94-0x97 */ + 0xE1, 0x66, 0xE1, 0x67, 0xC3, 0xD1, 0xC8, 0xA9, /* 0x98-0x9B */ + 0xE1, 0x68, 0xE1, 0x69, 0xE1, 0x6A, 0xE1, 0x6B, /* 0x9C-0x9F */ + 0xE1, 0x6C, 0xE1, 0x6D, 0xF5, 0xB0, 0xF5, 0xB1, /* 0xA0-0xA3 */ + 0xE1, 0x6E, 0xE1, 0x6F, 0xE1, 0x70, 0xE1, 0x71, /* 0xA4-0xA7 */ + 0xE1, 0x72, 0xE1, 0x73, 0xF5, 0xB2, 0xE1, 0x74, /* 0xA8-0xAB */ + 0xE1, 0x75, 0xF5, 0xB3, 0xF5, 0xB4, 0xF5, 0xB5, /* 0xAC-0xAF */ + 0xE1, 0x76, 0xE1, 0x77, 0xE1, 0x78, 0xE1, 0x79, /* 0xB0-0xB3 */ + 0xF5, 0xB7, 0xF5, 0xB6, 0xE1, 0x7A, 0xE1, 0x7B, /* 0xB4-0xB7 */ + 0xE1, 0x7C, 0xE1, 0x7D, 0xF5, 0xB8, 0xE1, 0x7E, /* 0xB8-0xBB */ + 0xE1, 0x80, 0xE1, 0x81, 0xE1, 0x82, 0xE1, 0x83, /* 0xBC-0xBF */ + 0xE1, 0x84, 0xE1, 0x85, 0xE1, 0x86, 0xE1, 0x87, /* 0xC0-0xC3 */ + 0xE1, 0x88, 0xE1, 0x89, 0xE1, 0x8A, 0xB2, 0xC9, /* 0xC4-0xC7 */ + 0xE1, 0x8B, 0xD3, 0xD4, 0xCA, 0xCD, 0xE1, 0x8C, /* 0xC8-0xCB */ + 0xC0, 0xEF, 0xD6, 0xD8, 0xD2, 0xB0, 0xC1, 0xBF, /* 0xCC-0xCF */ + 0xE1, 0x8D, 0xBD, 0xF0, 0xE1, 0x8E, 0xE1, 0x8F, /* 0xD0-0xD3 */ + 0xE1, 0x90, 0xE1, 0x91, 0xE1, 0x92, 0xE1, 0x93, /* 0xD4-0xD7 */ + 0xE1, 0x94, 0xE1, 0x95, 0xE1, 0x96, 0xE1, 0x97, /* 0xD8-0xDB */ + 0xB8, 0xAA, 0xE1, 0x98, 0xE1, 0x99, 0xE1, 0x9A, /* 0xDC-0xDF */ + 0xE1, 0x9B, 0xE1, 0x9C, 0xE1, 0x9D, 0xE1, 0x9E, /* 0xE0-0xE3 */ + 0xE1, 0x9F, 0xE1, 0xA0, 0xE2, 0x40, 0xE2, 0x41, /* 0xE4-0xE7 */ + 0xE2, 0x42, 0xE2, 0x43, 0xE2, 0x44, 0xE2, 0x45, /* 0xE8-0xEB */ + 0xE2, 0x46, 0xE2, 0x47, 0xE2, 0x48, 0xE2, 0x49, /* 0xEC-0xEF */ + 0xE2, 0x4A, 0xE2, 0x4B, 0xE2, 0x4C, 0xE2, 0x4D, /* 0xF0-0xF3 */ + 0xE2, 0x4E, 0xE2, 0x4F, 0xE2, 0x50, 0xE2, 0x51, /* 0xF4-0xF7 */ + 0xE2, 0x52, 0xE2, 0x53, 0xE2, 0x54, 0xE2, 0x55, /* 0xF8-0xFB */ + 0xE2, 0x56, 0xE2, 0x57, 0xE2, 0x58, 0xE2, 0x59, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0xE2, 0x5A, 0xE2, 0x5B, 0xE2, 0x5C, 0xE2, 0x5D, /* 0x00-0x03 */ + 0xE2, 0x5E, 0xE2, 0x5F, 0xE2, 0x60, 0xE2, 0x61, /* 0x04-0x07 */ + 0xE2, 0x62, 0xE2, 0x63, 0xE2, 0x64, 0xE2, 0x65, /* 0x08-0x0B */ + 0xE2, 0x66, 0xE2, 0x67, 0xE2, 0x68, 0xE2, 0x69, /* 0x0C-0x0F */ + 0xE2, 0x6A, 0xE2, 0x6B, 0xE2, 0x6C, 0xE2, 0x6D, /* 0x10-0x13 */ + 0xE2, 0x6E, 0xE2, 0x6F, 0xE2, 0x70, 0xE2, 0x71, /* 0x14-0x17 */ + 0xE2, 0x72, 0xE2, 0x73, 0xE2, 0x74, 0xE2, 0x75, /* 0x18-0x1B */ + 0xE2, 0x76, 0xE2, 0x77, 0xE2, 0x78, 0xE2, 0x79, /* 0x1C-0x1F */ + 0xE2, 0x7A, 0xE2, 0x7B, 0xE2, 0x7C, 0xE2, 0x7D, /* 0x20-0x23 */ + 0xE2, 0x7E, 0xE2, 0x80, 0xE2, 0x81, 0xE2, 0x82, /* 0x24-0x27 */ + 0xE2, 0x83, 0xE2, 0x84, 0xE2, 0x85, 0xE2, 0x86, /* 0x28-0x2B */ + 0xE2, 0x87, 0xE2, 0x88, 0xE2, 0x89, 0xE2, 0x8A, /* 0x2C-0x2F */ + 0xE2, 0x8B, 0xE2, 0x8C, 0xE2, 0x8D, 0xE2, 0x8E, /* 0x30-0x33 */ + 0xE2, 0x8F, 0xE2, 0x90, 0xE2, 0x91, 0xE2, 0x92, /* 0x34-0x37 */ + 0xE2, 0x93, 0xE2, 0x94, 0xE2, 0x95, 0xE2, 0x96, /* 0x38-0x3B */ + 0xE2, 
0x97, 0xE2, 0x98, 0xE2, 0x99, 0xE2, 0x9A, /* 0x3C-0x3F */ + 0xE2, 0x9B, 0xE2, 0x9C, 0xE2, 0x9D, 0xE2, 0x9E, /* 0x40-0x43 */ + 0xE2, 0x9F, 0xE2, 0xA0, 0xE3, 0x40, 0xE3, 0x41, /* 0x44-0x47 */ + 0xE3, 0x42, 0xE3, 0x43, 0xE3, 0x44, 0xE3, 0x45, /* 0x48-0x4B */ + 0xE3, 0x46, 0xE3, 0x47, 0xE3, 0x48, 0xE3, 0x49, /* 0x4C-0x4F */ + 0xE3, 0x4A, 0xE3, 0x4B, 0xE3, 0x4C, 0xE3, 0x4D, /* 0x50-0x53 */ + 0xE3, 0x4E, 0xE3, 0x4F, 0xE3, 0x50, 0xE3, 0x51, /* 0x54-0x57 */ + 0xE3, 0x52, 0xE3, 0x53, 0xE3, 0x54, 0xE3, 0x55, /* 0x58-0x5B */ + 0xE3, 0x56, 0xE3, 0x57, 0xE3, 0x58, 0xE3, 0x59, /* 0x5C-0x5F */ + 0xE3, 0x5A, 0xE3, 0x5B, 0xE3, 0x5C, 0xE3, 0x5D, /* 0x60-0x63 */ + 0xE3, 0x5E, 0xE3, 0x5F, 0xE3, 0x60, 0xE3, 0x61, /* 0x64-0x67 */ + 0xE3, 0x62, 0xE3, 0x63, 0xE3, 0x64, 0xE3, 0x65, /* 0x68-0x6B */ + 0xE3, 0x66, 0xE3, 0x67, 0xE3, 0x68, 0xE3, 0x69, /* 0x6C-0x6F */ + 0xE3, 0x6A, 0xE3, 0x6B, 0xE3, 0x6C, 0xE3, 0x6D, /* 0x70-0x73 */ + 0xBC, 0xF8, 0xE3, 0x6E, 0xE3, 0x6F, 0xE3, 0x70, /* 0x74-0x77 */ + 0xE3, 0x71, 0xE3, 0x72, 0xE3, 0x73, 0xE3, 0x74, /* 0x78-0x7B */ + 0xE3, 0x75, 0xE3, 0x76, 0xE3, 0x77, 0xE3, 0x78, /* 0x7C-0x7F */ + + 0xE3, 0x79, 0xE3, 0x7A, 0xE3, 0x7B, 0xE3, 0x7C, /* 0x80-0x83 */ + 0xE3, 0x7D, 0xE3, 0x7E, 0xE3, 0x80, 0xE3, 0x81, /* 0x84-0x87 */ + 0xE3, 0x82, 0xE3, 0x83, 0xE3, 0x84, 0xE3, 0x85, /* 0x88-0x8B */ + 0xE3, 0x86, 0xE3, 0x87, 0xF6, 0xC6, 0xE3, 0x88, /* 0x8C-0x8F */ + 0xE3, 0x89, 0xE3, 0x8A, 0xE3, 0x8B, 0xE3, 0x8C, /* 0x90-0x93 */ + 0xE3, 0x8D, 0xE3, 0x8E, 0xE3, 0x8F, 0xE3, 0x90, /* 0x94-0x97 */ + 0xE3, 0x91, 0xE3, 0x92, 0xE3, 0x93, 0xE3, 0x94, /* 0x98-0x9B */ + 0xE3, 0x95, 0xE3, 0x96, 0xE3, 0x97, 0xE3, 0x98, /* 0x9C-0x9F */ + 0xE3, 0x99, 0xE3, 0x9A, 0xE3, 0x9B, 0xE3, 0x9C, /* 0xA0-0xA3 */ + 0xE3, 0x9D, 0xE3, 0x9E, 0xE3, 0x9F, 0xE3, 0xA0, /* 0xA4-0xA7 */ + 0xE4, 0x40, 0xE4, 0x41, 0xE4, 0x42, 0xE4, 0x43, /* 0xA8-0xAB */ + 0xE4, 0x44, 0xE4, 0x45, 0xF6, 0xC7, 0xE4, 0x46, /* 0xAC-0xAF */ + 0xE4, 0x47, 0xE4, 0x48, 0xE4, 0x49, 0xE4, 0x4A, /* 0xB0-0xB3 */ + 0xE4, 0x4B, 0xE4, 0x4C, 0xE4, 0x4D, 0xE4, 0x4E, /* 0xB4-0xB7 */ + 0xE4, 0x4F, 0xE4, 0x50, 0xE4, 0x51, 0xE4, 0x52, /* 0xB8-0xBB */ + 0xE4, 0x53, 0xE4, 0x54, 0xE4, 0x55, 0xE4, 0x56, /* 0xBC-0xBF */ + 0xE4, 0x57, 0xE4, 0x58, 0xE4, 0x59, 0xE4, 0x5A, /* 0xC0-0xC3 */ + 0xE4, 0x5B, 0xE4, 0x5C, 0xE4, 0x5D, 0xE4, 0x5E, /* 0xC4-0xC7 */ + 0xF6, 0xC8, 0xE4, 0x5F, 0xE4, 0x60, 0xE4, 0x61, /* 0xC8-0xCB */ + 0xE4, 0x62, 0xE4, 0x63, 0xE4, 0x64, 0xE4, 0x65, /* 0xCC-0xCF */ + 0xE4, 0x66, 0xE4, 0x67, 0xE4, 0x68, 0xE4, 0x69, /* 0xD0-0xD3 */ + 0xE4, 0x6A, 0xE4, 0x6B, 0xE4, 0x6C, 0xE4, 0x6D, /* 0xD4-0xD7 */ + 0xE4, 0x6E, 0xE4, 0x6F, 0xE4, 0x70, 0xE4, 0x71, /* 0xD8-0xDB */ + 0xE4, 0x72, 0xE4, 0x73, 0xE4, 0x74, 0xE4, 0x75, /* 0xDC-0xDF */ + 0xE4, 0x76, 0xE4, 0x77, 0xE4, 0x78, 0xE4, 0x79, /* 0xE0-0xE3 */ + 0xE4, 0x7A, 0xE4, 0x7B, 0xE4, 0x7C, 0xE4, 0x7D, /* 0xE4-0xE7 */ + 0xE4, 0x7E, 0xE4, 0x80, 0xE4, 0x81, 0xE4, 0x82, /* 0xE8-0xEB */ + 0xE4, 0x83, 0xE4, 0x84, 0xE4, 0x85, 0xE4, 0x86, /* 0xEC-0xEF */ + 0xE4, 0x87, 0xE4, 0x88, 0xE4, 0x89, 0xE4, 0x8A, /* 0xF0-0xF3 */ + 0xE4, 0x8B, 0xE4, 0x8C, 0xE4, 0x8D, 0xE4, 0x8E, /* 0xF4-0xF7 */ + 0xE4, 0x8F, 0xE4, 0x90, 0xE4, 0x91, 0xE4, 0x92, /* 0xF8-0xFB */ + 0xE4, 0x93, 0xE4, 0x94, 0xE4, 0x95, 0xE4, 0x96, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0xE4, 0x97, 0xE4, 0x98, 0xE4, 0x99, 0xE4, 0x9A, /* 0x00-0x03 */ + 0xE4, 0x9B, 0xE4, 0x9C, 0xE4, 0x9D, 0xE4, 0x9E, /* 0x04-0x07 */ + 0xE4, 0x9F, 0xE4, 0xA0, 0xE5, 0x40, 0xE5, 0x41, /* 0x08-0x0B */ + 0xE5, 0x42, 0xE5, 0x43, 0xE5, 0x44, 0xE5, 0x45, /* 0x0C-0x0F */ + 0xE5, 
0x46, 0xE5, 0x47, 0xE5, 0x48, 0xE5, 0x49, /* 0x10-0x13 */ + 0xE5, 0x4A, 0xE5, 0x4B, 0xE5, 0x4C, 0xE5, 0x4D, /* 0x14-0x17 */ + 0xE5, 0x4E, 0xE5, 0x4F, 0xE5, 0x50, 0xE5, 0x51, /* 0x18-0x1B */ + 0xE5, 0x52, 0xE5, 0x53, 0xE5, 0x54, 0xE5, 0x55, /* 0x1C-0x1F */ + 0xE5, 0x56, 0xE5, 0x57, 0xE5, 0x58, 0xE5, 0x59, /* 0x20-0x23 */ + 0xE5, 0x5A, 0xE5, 0x5B, 0xE5, 0x5C, 0xE5, 0x5D, /* 0x24-0x27 */ + 0xE5, 0x5E, 0xE5, 0x5F, 0xE5, 0x60, 0xE5, 0x61, /* 0x28-0x2B */ + 0xE5, 0x62, 0xE5, 0x63, 0xE5, 0x64, 0xE5, 0x65, /* 0x2C-0x2F */ + 0xE5, 0x66, 0xE5, 0x67, 0xE5, 0x68, 0xE5, 0x69, /* 0x30-0x33 */ + 0xE5, 0x6A, 0xE5, 0x6B, 0xE5, 0x6C, 0xE5, 0x6D, /* 0x34-0x37 */ + 0xE5, 0x6E, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x71, /* 0x38-0x3B */ + 0xE5, 0x72, 0xE5, 0x73, 0xF6, 0xC9, 0xE5, 0x74, /* 0x3C-0x3F */ + 0xE5, 0x75, 0xE5, 0x76, 0xE5, 0x77, 0xE5, 0x78, /* 0x40-0x43 */ + 0xE5, 0x79, 0xE5, 0x7A, 0xE5, 0x7B, 0xE5, 0x7C, /* 0x44-0x47 */ + 0xE5, 0x7D, 0xE5, 0x7E, 0xE5, 0x80, 0xE5, 0x81, /* 0x48-0x4B */ + 0xE5, 0x82, 0xE5, 0x83, 0xE5, 0x84, 0xE5, 0x85, /* 0x4C-0x4F */ + 0xE5, 0x86, 0xE5, 0x87, 0xE5, 0x88, 0xE5, 0x89, /* 0x50-0x53 */ + 0xE5, 0x8A, 0xE5, 0x8B, 0xE5, 0x8C, 0xE5, 0x8D, /* 0x54-0x57 */ + 0xE5, 0x8E, 0xE5, 0x8F, 0xE5, 0x90, 0xE5, 0x91, /* 0x58-0x5B */ + 0xE5, 0x92, 0xE5, 0x93, 0xE5, 0x94, 0xE5, 0x95, /* 0x5C-0x5F */ + 0xE5, 0x96, 0xE5, 0x97, 0xE5, 0x98, 0xE5, 0x99, /* 0x60-0x63 */ + 0xE5, 0x9A, 0xE5, 0x9B, 0xE5, 0x9C, 0xE5, 0x9D, /* 0x64-0x67 */ + 0xE5, 0x9E, 0xE5, 0x9F, 0xF6, 0xCA, 0xE5, 0xA0, /* 0x68-0x6B */ + 0xE6, 0x40, 0xE6, 0x41, 0xE6, 0x42, 0xE6, 0x43, /* 0x6C-0x6F */ + 0xE6, 0x44, 0xE6, 0x45, 0xE6, 0x46, 0xE6, 0x47, /* 0x70-0x73 */ + 0xE6, 0x48, 0xE6, 0x49, 0xE6, 0x4A, 0xE6, 0x4B, /* 0x74-0x77 */ + 0xE6, 0x4C, 0xE6, 0x4D, 0xE6, 0x4E, 0xE6, 0x4F, /* 0x78-0x7B */ + 0xE6, 0x50, 0xE6, 0x51, 0xE6, 0x52, 0xE6, 0x53, /* 0x7C-0x7F */ + + 0xE6, 0x54, 0xE6, 0x55, 0xE6, 0x56, 0xE6, 0x57, /* 0x80-0x83 */ + 0xE6, 0x58, 0xE6, 0x59, 0xE6, 0x5A, 0xE6, 0x5B, /* 0x84-0x87 */ + 0xE6, 0x5C, 0xE6, 0x5D, 0xE6, 0x5E, 0xE6, 0x5F, /* 0x88-0x8B */ + 0xE6, 0x60, 0xE6, 0x61, 0xE6, 0x62, 0xF6, 0xCC, /* 0x8C-0x8F */ + 0xE6, 0x63, 0xE6, 0x64, 0xE6, 0x65, 0xE6, 0x66, /* 0x90-0x93 */ + 0xE6, 0x67, 0xE6, 0x68, 0xE6, 0x69, 0xE6, 0x6A, /* 0x94-0x97 */ + 0xE6, 0x6B, 0xE6, 0x6C, 0xE6, 0x6D, 0xE6, 0x6E, /* 0x98-0x9B */ + 0xE6, 0x6F, 0xE6, 0x70, 0xE6, 0x71, 0xE6, 0x72, /* 0x9C-0x9F */ + 0xE6, 0x73, 0xE6, 0x74, 0xE6, 0x75, 0xE6, 0x76, /* 0xA0-0xA3 */ + 0xE6, 0x77, 0xE6, 0x78, 0xE6, 0x79, 0xE6, 0x7A, /* 0xA4-0xA7 */ + 0xE6, 0x7B, 0xE6, 0x7C, 0xE6, 0x7D, 0xE6, 0x7E, /* 0xA8-0xAB */ + 0xE6, 0x80, 0xE6, 0x81, 0xE6, 0x82, 0xE6, 0x83, /* 0xAC-0xAF */ + 0xE6, 0x84, 0xE6, 0x85, 0xE6, 0x86, 0xE6, 0x87, /* 0xB0-0xB3 */ + 0xE6, 0x88, 0xE6, 0x89, 0xE6, 0x8A, 0xE6, 0x8B, /* 0xB4-0xB7 */ + 0xE6, 0x8C, 0xE6, 0x8D, 0xE6, 0x8E, 0xE6, 0x8F, /* 0xB8-0xBB */ + 0xE6, 0x90, 0xE6, 0x91, 0xE6, 0x92, 0xE6, 0x93, /* 0xBC-0xBF */ + 0xE6, 0x94, 0xE6, 0x95, 0xE6, 0x96, 0xE6, 0x97, /* 0xC0-0xC3 */ + 0xE6, 0x98, 0xE6, 0x99, 0xE6, 0x9A, 0xE6, 0x9B, /* 0xC4-0xC7 */ + 0xE6, 0x9C, 0xE6, 0x9D, 0xF6, 0xCB, 0xE6, 0x9E, /* 0xC8-0xCB */ + 0xE6, 0x9F, 0xE6, 0xA0, 0xE7, 0x40, 0xE7, 0x41, /* 0xCC-0xCF */ + 0xE7, 0x42, 0xE7, 0x43, 0xE7, 0x44, 0xE7, 0x45, /* 0xD0-0xD3 */ + 0xE7, 0x46, 0xE7, 0x47, 0xF7, 0xE9, 0xE7, 0x48, /* 0xD4-0xD7 */ + 0xE7, 0x49, 0xE7, 0x4A, 0xE7, 0x4B, 0xE7, 0x4C, /* 0xD8-0xDB */ + 0xE7, 0x4D, 0xE7, 0x4E, 0xE7, 0x4F, 0xE7, 0x50, /* 0xDC-0xDF */ + 0xE7, 0x51, 0xE7, 0x52, 0xE7, 0x53, 0xE7, 0x54, /* 0xE0-0xE3 */ + 0xE7, 0x55, 0xE7, 0x56, 0xE7, 0x57, 0xE7, 0x58, /* 0xE4-0xE7 
*/ + 0xE7, 0x59, 0xE7, 0x5A, 0xE7, 0x5B, 0xE7, 0x5C, /* 0xE8-0xEB */ + 0xE7, 0x5D, 0xE7, 0x5E, 0xE7, 0x5F, 0xE7, 0x60, /* 0xEC-0xEF */ + 0xE7, 0x61, 0xE7, 0x62, 0xE7, 0x63, 0xE7, 0x64, /* 0xF0-0xF3 */ + 0xE7, 0x65, 0xE7, 0x66, 0xE7, 0x67, 0xE7, 0x68, /* 0xF4-0xF7 */ + 0xE7, 0x69, 0xE7, 0x6A, 0xE7, 0x6B, 0xE7, 0x6C, /* 0xF8-0xFB */ + 0xE7, 0x6D, 0xE7, 0x6E, 0xE7, 0x6F, 0xE7, 0x70, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_94[512] = { + 0xE7, 0x71, 0xE7, 0x72, 0xE7, 0x73, 0xE7, 0x74, /* 0x00-0x03 */ + 0xE7, 0x75, 0xE7, 0x76, 0xE7, 0x77, 0xE7, 0x78, /* 0x04-0x07 */ + 0xE7, 0x79, 0xE7, 0x7A, 0xE7, 0x7B, 0xE7, 0x7C, /* 0x08-0x0B */ + 0xE7, 0x7D, 0xE7, 0x7E, 0xE7, 0x80, 0xE7, 0x81, /* 0x0C-0x0F */ + 0xE7, 0x82, 0xE7, 0x83, 0xE7, 0x84, 0xE7, 0x85, /* 0x10-0x13 */ + 0xE7, 0x86, 0xE7, 0x87, 0xE7, 0x88, 0xE7, 0x89, /* 0x14-0x17 */ + 0xE7, 0x8A, 0xE7, 0x8B, 0xE7, 0x8C, 0xE7, 0x8D, /* 0x18-0x1B */ + 0xE7, 0x8E, 0xE7, 0x8F, 0xE7, 0x90, 0xE7, 0x91, /* 0x1C-0x1F */ + 0xE7, 0x92, 0xE7, 0x93, 0xE7, 0x94, 0xE7, 0x95, /* 0x20-0x23 */ + 0xE7, 0x96, 0xE7, 0x97, 0xE7, 0x98, 0xE7, 0x99, /* 0x24-0x27 */ + 0xE7, 0x9A, 0xE7, 0x9B, 0xE7, 0x9C, 0xE7, 0x9D, /* 0x28-0x2B */ + 0xE7, 0x9E, 0xE7, 0x9F, 0xE7, 0xA0, 0xE8, 0x40, /* 0x2C-0x2F */ + 0xE8, 0x41, 0xE8, 0x42, 0xE8, 0x43, 0xE8, 0x44, /* 0x30-0x33 */ + 0xE8, 0x45, 0xE8, 0x46, 0xE8, 0x47, 0xE8, 0x48, /* 0x34-0x37 */ + 0xE8, 0x49, 0xE8, 0x4A, 0xE8, 0x4B, 0xE8, 0x4C, /* 0x38-0x3B */ + 0xE8, 0x4D, 0xE8, 0x4E, 0xF6, 0xCD, 0xE8, 0x4F, /* 0x3C-0x3F */ + 0xE8, 0x50, 0xE8, 0x51, 0xE8, 0x52, 0xE8, 0x53, /* 0x40-0x43 */ + 0xE8, 0x54, 0xE8, 0x55, 0xE8, 0x56, 0xE8, 0x57, /* 0x44-0x47 */ + 0xE8, 0x58, 0xE8, 0x59, 0xE8, 0x5A, 0xE8, 0x5B, /* 0x48-0x4B */ + 0xE8, 0x5C, 0xE8, 0x5D, 0xE8, 0x5E, 0xE8, 0x5F, /* 0x4C-0x4F */ + 0xE8, 0x60, 0xE8, 0x61, 0xE8, 0x62, 0xE8, 0x63, /* 0x50-0x53 */ + 0xE8, 0x64, 0xE8, 0x65, 0xE8, 0x66, 0xE8, 0x67, /* 0x54-0x57 */ + 0xE8, 0x68, 0xE8, 0x69, 0xE8, 0x6A, 0xE8, 0x6B, /* 0x58-0x5B */ + 0xE8, 0x6C, 0xE8, 0x6D, 0xE8, 0x6E, 0xE8, 0x6F, /* 0x5C-0x5F */ + 0xE8, 0x70, 0xE8, 0x71, 0xE8, 0x72, 0xE8, 0x73, /* 0x60-0x63 */ + 0xE8, 0x74, 0xE8, 0x75, 0xE8, 0x76, 0xE8, 0x77, /* 0x64-0x67 */ + 0xE8, 0x78, 0xE8, 0x79, 0xE8, 0x7A, 0xF6, 0xCE, /* 0x68-0x6B */ + 0xE8, 0x7B, 0xE8, 0x7C, 0xE8, 0x7D, 0xE8, 0x7E, /* 0x6C-0x6F */ + 0xE8, 0x80, 0xE8, 0x81, 0xE8, 0x82, 0xE8, 0x83, /* 0x70-0x73 */ + 0xE8, 0x84, 0xE8, 0x85, 0xE8, 0x86, 0xE8, 0x87, /* 0x74-0x77 */ + 0xE8, 0x88, 0xE8, 0x89, 0xE8, 0x8A, 0xE8, 0x8B, /* 0x78-0x7B */ + 0xE8, 0x8C, 0xE8, 0x8D, 0xE8, 0x8E, 0xE8, 0x8F, /* 0x7C-0x7F */ + + 0xE8, 0x90, 0xE8, 0x91, 0xE8, 0x92, 0xE8, 0x93, /* 0x80-0x83 */ + 0xE8, 0x94, 0xEE, 0xC4, 0xEE, 0xC5, 0xEE, 0xC6, /* 0x84-0x87 */ + 0xD5, 0xEB, 0xB6, 0xA4, 0xEE, 0xC8, 0xEE, 0xC7, /* 0x88-0x8B */ + 0xEE, 0xC9, 0xEE, 0xCA, 0xC7, 0xA5, 0xEE, 0xCB, /* 0x8C-0x8F */ + 0xEE, 0xCC, 0xE8, 0x95, 0xB7, 0xB0, 0xB5, 0xF6, /* 0x90-0x93 */ + 0xEE, 0xCD, 0xEE, 0xCF, 0xE8, 0x96, 0xEE, 0xCE, /* 0x94-0x97 */ + 0xE8, 0x97, 0xB8, 0xC6, 0xEE, 0xD0, 0xEE, 0xD1, /* 0x98-0x9B */ + 0xEE, 0xD2, 0xB6, 0xDB, 0xB3, 0xAE, 0xD6, 0xD3, /* 0x9C-0x9F */ + 0xC4, 0xC6, 0xB1, 0xB5, 0xB8, 0xD6, 0xEE, 0xD3, /* 0xA0-0xA3 */ + 0xEE, 0xD4, 0xD4, 0xBF, 0xC7, 0xD5, 0xBE, 0xFB, /* 0xA4-0xA7 */ + 0xCE, 0xD9, 0xB9, 0xB3, 0xEE, 0xD6, 0xEE, 0xD5, /* 0xA8-0xAB */ + 0xEE, 0xD8, 0xEE, 0xD7, 0xC5, 0xA5, 0xEE, 0xD9, /* 0xAC-0xAF */ + 0xEE, 0xDA, 0xC7, 0xAE, 0xEE, 0xDB, 0xC7, 0xAF, /* 0xB0-0xB3 */ + 0xEE, 0xDC, 0xB2, 0xA7, 0xEE, 0xDD, 0xEE, 0xDE, /* 0xB4-0xB7 */ + 0xEE, 0xDF, 0xEE, 0xE0, 0xEE, 0xE1, 0xD7, 0xEA, /* 0xB8-0xBB */ + 
0xEE, 0xE2, 0xEE, 0xE3, 0xBC, 0xD8, 0xEE, 0xE4, /* 0xBC-0xBF */ + 0xD3, 0xCB, 0xCC, 0xFA, 0xB2, 0xAC, 0xC1, 0xE5, /* 0xC0-0xC3 */ + 0xEE, 0xE5, 0xC7, 0xA6, 0xC3, 0xAD, 0xE8, 0x98, /* 0xC4-0xC7 */ + 0xEE, 0xE6, 0xEE, 0xE7, 0xEE, 0xE8, 0xEE, 0xE9, /* 0xC8-0xCB */ + 0xEE, 0xEA, 0xEE, 0xEB, 0xEE, 0xEC, 0xE8, 0x99, /* 0xCC-0xCF */ + 0xEE, 0xED, 0xEE, 0xEE, 0xEE, 0xEF, 0xE8, 0x9A, /* 0xD0-0xD3 */ + 0xE8, 0x9B, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0xD4-0xD7 */ + 0xEE, 0xF4, 0xEE, 0xF3, 0xE8, 0x9C, 0xEE, 0xF5, /* 0xD8-0xDB */ + 0xCD, 0xAD, 0xC2, 0xC1, 0xEE, 0xF6, 0xEE, 0xF7, /* 0xDC-0xDF */ + 0xEE, 0xF8, 0xD5, 0xA1, 0xEE, 0xF9, 0xCF, 0xB3, /* 0xE0-0xE3 */ + 0xEE, 0xFA, 0xEE, 0xFB, 0xE8, 0x9D, 0xEE, 0xFC, /* 0xE4-0xE7 */ + 0xEE, 0xFD, 0xEF, 0xA1, 0xEE, 0xFE, 0xEF, 0xA2, /* 0xE8-0xEB */ + 0xB8, 0xF5, 0xC3, 0xFA, 0xEF, 0xA3, 0xEF, 0xA4, /* 0xEC-0xEF */ + 0xBD, 0xC2, 0xD2, 0xBF, 0xB2, 0xF9, 0xEF, 0xA5, /* 0xF0-0xF3 */ + 0xEF, 0xA6, 0xEF, 0xA7, 0xD2, 0xF8, 0xEF, 0xA8, /* 0xF4-0xF7 */ + 0xD6, 0xFD, 0xEF, 0xA9, 0xC6, 0xCC, 0xE8, 0x9E, /* 0xF8-0xFB */ + 0xEF, 0xAA, 0xEF, 0xAB, 0xC1, 0xB4, 0xEF, 0xAC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_95[512] = { + 0xCF, 0xFA, 0xCB, 0xF8, 0xEF, 0xAE, 0xEF, 0xAD, /* 0x00-0x03 */ + 0xB3, 0xFA, 0xB9, 0xF8, 0xEF, 0xAF, 0xEF, 0xB0, /* 0x04-0x07 */ + 0xD0, 0xE2, 0xEF, 0xB1, 0xEF, 0xB2, 0xB7, 0xE6, /* 0x08-0x0B */ + 0xD0, 0xBF, 0xEF, 0xB3, 0xEF, 0xB4, 0xEF, 0xB5, /* 0x0C-0x0F */ + 0xC8, 0xF1, 0xCC, 0xE0, 0xEF, 0xB6, 0xEF, 0xB7, /* 0x10-0x13 */ + 0xEF, 0xB8, 0xEF, 0xB9, 0xEF, 0xBA, 0xD5, 0xE0, /* 0x14-0x17 */ + 0xEF, 0xBB, 0xB4, 0xED, 0xC3, 0xAA, 0xEF, 0xBC, /* 0x18-0x1B */ + 0xE8, 0x9F, 0xEF, 0xBD, 0xEF, 0xBE, 0xEF, 0xBF, /* 0x1C-0x1F */ + 0xE8, 0xA0, 0xCE, 0xFD, 0xEF, 0xC0, 0xC2, 0xE0, /* 0x20-0x23 */ + 0xB4, 0xB8, 0xD7, 0xB6, 0xBD, 0xF5, 0xE9, 0x40, /* 0x24-0x27 */ + 0xCF, 0xC7, 0xEF, 0xC3, 0xEF, 0xC1, 0xEF, 0xC2, /* 0x28-0x2B */ + 0xEF, 0xC4, 0xB6, 0xA7, 0xBC, 0xFC, 0xBE, 0xE2, /* 0x2C-0x2F */ + 0xC3, 0xCC, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x41, /* 0x30-0x33 */ + 0xEF, 0xC7, 0xEF, 0xCF, 0xEF, 0xC8, 0xEF, 0xC9, /* 0x34-0x37 */ + 0xEF, 0xCA, 0xC7, 0xC2, 0xEF, 0xF1, 0xB6, 0xCD, /* 0x38-0x3B */ + 0xEF, 0xCB, 0xE9, 0x42, 0xEF, 0xCC, 0xEF, 0xCD, /* 0x3C-0x3F */ + 0xB6, 0xC6, 0xC3, 0xBE, 0xEF, 0xCE, 0xE9, 0x43, /* 0x40-0x43 */ + 0xEF, 0xD0, 0xEF, 0xD1, 0xEF, 0xD2, 0xD5, 0xF2, /* 0x44-0x47 */ + 0xE9, 0x44, 0xEF, 0xD3, 0xC4, 0xF7, 0xE9, 0x45, /* 0x48-0x4B */ + 0xEF, 0xD4, 0xC4, 0xF8, 0xEF, 0xD5, 0xEF, 0xD6, /* 0x4C-0x4F */ + 0xB8, 0xE4, 0xB0, 0xF7, 0xEF, 0xD7, 0xEF, 0xD8, /* 0x50-0x53 */ + 0xEF, 0xD9, 0xE9, 0x46, 0xEF, 0xDA, 0xEF, 0xDB, /* 0x54-0x57 */ + 0xEF, 0xDC, 0xEF, 0xDD, 0xE9, 0x47, 0xEF, 0xDE, /* 0x58-0x5B */ + 0xBE, 0xB5, 0xEF, 0xE1, 0xEF, 0xDF, 0xEF, 0xE0, /* 0x5C-0x5F */ + 0xE9, 0x48, 0xEF, 0xE2, 0xEF, 0xE3, 0xC1, 0xCD, /* 0x60-0x63 */ + 0xEF, 0xE4, 0xEF, 0xE5, 0xEF, 0xE6, 0xEF, 0xE7, /* 0x64-0x67 */ + 0xEF, 0xE8, 0xEF, 0xE9, 0xEF, 0xEA, 0xEF, 0xEB, /* 0x68-0x6B */ + 0xEF, 0xEC, 0xC0, 0xD8, 0xE9, 0x49, 0xEF, 0xED, /* 0x6C-0x6F */ + 0xC1, 0xAD, 0xEF, 0xEE, 0xEF, 0xEF, 0xEF, 0xF0, /* 0x70-0x73 */ + 0xE9, 0x4A, 0xE9, 0x4B, 0xCF, 0xE2, 0xE9, 0x4C, /* 0x74-0x77 */ + 0xE9, 0x4D, 0xE9, 0x4E, 0xE9, 0x4F, 0xE9, 0x50, /* 0x78-0x7B */ + 0xE9, 0x51, 0xE9, 0x52, 0xE9, 0x53, 0xB3, 0xA4, /* 0x7C-0x7F */ + + 0xE9, 0x54, 0xE9, 0x55, 0xE9, 0x56, 0xE9, 0x57, /* 0x80-0x83 */ + 0xE9, 0x58, 0xE9, 0x59, 0xE9, 0x5A, 0xE9, 0x5B, /* 0x84-0x87 */ + 0xE9, 0x5C, 0xE9, 0x5D, 0xE9, 0x5E, 0xE9, 0x5F, /* 0x88-0x8B */ + 0xE9, 0x60, 0xE9, 0x61, 0xE9, 0x62, 0xE9, 0x63, /* 0x8C-0x8F */ + 
0xE9, 0x64, 0xE9, 0x65, 0xE9, 0x66, 0xE9, 0x67, /* 0x90-0x93 */ + 0xE9, 0x68, 0xE9, 0x69, 0xE9, 0x6A, 0xE9, 0x6B, /* 0x94-0x97 */ + 0xE9, 0x6C, 0xE9, 0x6D, 0xE9, 0x6E, 0xE9, 0x6F, /* 0x98-0x9B */ + 0xE9, 0x70, 0xE9, 0x71, 0xE9, 0x72, 0xE9, 0x73, /* 0x9C-0x9F */ + 0xE9, 0x74, 0xE9, 0x75, 0xE9, 0x76, 0xE9, 0x77, /* 0xA0-0xA3 */ + 0xE9, 0x78, 0xE9, 0x79, 0xE9, 0x7A, 0xE9, 0x7B, /* 0xA4-0xA7 */ + 0xE9, 0x7C, 0xE9, 0x7D, 0xE9, 0x7E, 0xE9, 0x80, /* 0xA8-0xAB */ + 0xE9, 0x81, 0xE9, 0x82, 0xE9, 0x83, 0xE9, 0x84, /* 0xAC-0xAF */ + 0xE9, 0x85, 0xE9, 0x86, 0xE9, 0x87, 0xE9, 0x88, /* 0xB0-0xB3 */ + 0xE9, 0x89, 0xE9, 0x8A, 0xE9, 0x8B, 0xE9, 0x8C, /* 0xB4-0xB7 */ + 0xE9, 0x8D, 0xE9, 0x8E, 0xE9, 0x8F, 0xE9, 0x90, /* 0xB8-0xBB */ + 0xE9, 0x91, 0xE9, 0x92, 0xE9, 0x93, 0xE9, 0x94, /* 0xBC-0xBF */ + 0xE9, 0x95, 0xE9, 0x96, 0xE9, 0x97, 0xE9, 0x98, /* 0xC0-0xC3 */ + 0xE9, 0x99, 0xE9, 0x9A, 0xE9, 0x9B, 0xE9, 0x9C, /* 0xC4-0xC7 */ + 0xE9, 0x9D, 0xE9, 0x9E, 0xE9, 0x9F, 0xE9, 0xA0, /* 0xC8-0xCB */ + 0xEA, 0x40, 0xEA, 0x41, 0xEA, 0x42, 0xEA, 0x43, /* 0xCC-0xCF */ + 0xEA, 0x44, 0xEA, 0x45, 0xEA, 0x46, 0xEA, 0x47, /* 0xD0-0xD3 */ + 0xEA, 0x48, 0xEA, 0x49, 0xEA, 0x4A, 0xEA, 0x4B, /* 0xD4-0xD7 */ + 0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x4E, 0xEA, 0x4F, /* 0xD8-0xDB */ + 0xEA, 0x50, 0xEA, 0x51, 0xEA, 0x52, 0xEA, 0x53, /* 0xDC-0xDF */ + 0xEA, 0x54, 0xEA, 0x55, 0xEA, 0x56, 0xEA, 0x57, /* 0xE0-0xE3 */ + 0xEA, 0x58, 0xEA, 0x59, 0xEA, 0x5A, 0xEA, 0x5B, /* 0xE4-0xE7 */ + 0xC3, 0xC5, 0xE3, 0xC5, 0xC9, 0xC1, 0xE3, 0xC6, /* 0xE8-0xEB */ + 0xEA, 0x5C, 0xB1, 0xD5, 0xCE, 0xCA, 0xB4, 0xB3, /* 0xEC-0xEF */ + 0xC8, 0xF2, 0xE3, 0xC7, 0xCF, 0xD0, 0xE3, 0xC8, /* 0xF0-0xF3 */ + 0xBC, 0xE4, 0xE3, 0xC9, 0xE3, 0xCA, 0xC3, 0xC6, /* 0xF4-0xF7 */ + 0xD5, 0xA2, 0xC4, 0xD6, 0xB9, 0xEB, 0xCE, 0xC5, /* 0xF8-0xFB */ + 0xE3, 0xCB, 0xC3, 0xF6, 0xE3, 0xCC, 0xEA, 0x5D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_96[512] = { + 0xB7, 0xA7, 0xB8, 0xF3, 0xBA, 0xD2, 0xE3, 0xCD, /* 0x00-0x03 */ + 0xE3, 0xCE, 0xD4, 0xC4, 0xE3, 0xCF, 0xEA, 0x5E, /* 0x04-0x07 */ + 0xE3, 0xD0, 0xD1, 0xCB, 0xE3, 0xD1, 0xE3, 0xD2, /* 0x08-0x0B */ + 0xE3, 0xD3, 0xE3, 0xD4, 0xD1, 0xD6, 0xE3, 0xD5, /* 0x0C-0x0F */ + 0xB2, 0xFB, 0xC0, 0xBB, 0xE3, 0xD6, 0xEA, 0x5F, /* 0x10-0x13 */ + 0xC0, 0xAB, 0xE3, 0xD7, 0xE3, 0xD8, 0xE3, 0xD9, /* 0x14-0x17 */ + 0xEA, 0x60, 0xE3, 0xDA, 0xE3, 0xDB, 0xEA, 0x61, /* 0x18-0x1B */ + 0xB8, 0xB7, 0xDA, 0xE2, 0xEA, 0x62, 0xB6, 0xD3, /* 0x1C-0x1F */ + 0xEA, 0x63, 0xDA, 0xE4, 0xDA, 0xE3, 0xEA, 0x64, /* 0x20-0x23 */ + 0xEA, 0x65, 0xEA, 0x66, 0xEA, 0x67, 0xEA, 0x68, /* 0x24-0x27 */ + 0xEA, 0x69, 0xEA, 0x6A, 0xDA, 0xE6, 0xEA, 0x6B, /* 0x28-0x2B */ + 0xEA, 0x6C, 0xEA, 0x6D, 0xC8, 0xEE, 0xEA, 0x6E, /* 0x2C-0x2F */ + 0xEA, 0x6F, 0xDA, 0xE5, 0xB7, 0xC0, 0xD1, 0xF4, /* 0x30-0x33 */ + 0xD2, 0xF5, 0xD5, 0xF3, 0xBD, 0xD7, 0xEA, 0x70, /* 0x34-0x37 */ + 0xEA, 0x71, 0xEA, 0x72, 0xEA, 0x73, 0xD7, 0xE8, /* 0x38-0x3B */ + 0xDA, 0xE8, 0xDA, 0xE7, 0xEA, 0x74, 0xB0, 0xA2, /* 0x3C-0x3F */ + 0xCD, 0xD3, 0xEA, 0x75, 0xDA, 0xE9, 0xEA, 0x76, /* 0x40-0x43 */ + 0xB8, 0xBD, 0xBC, 0xCA, 0xC2, 0xBD, 0xC2, 0xA4, /* 0x44-0x47 */ + 0xB3, 0xC2, 0xDA, 0xEA, 0xEA, 0x77, 0xC2, 0xAA, /* 0x48-0x4B */ + 0xC4, 0xB0, 0xBD, 0xB5, 0xEA, 0x78, 0xEA, 0x79, /* 0x4C-0x4F */ + 0xCF, 0xDE, 0xEA, 0x7A, 0xEA, 0x7B, 0xEA, 0x7C, /* 0x50-0x53 */ + 0xDA, 0xEB, 0xC9, 0xC2, 0xEA, 0x7D, 0xEA, 0x7E, /* 0x54-0x57 */ + 0xEA, 0x80, 0xEA, 0x81, 0xEA, 0x82, 0xB1, 0xDD, /* 0x58-0x5B */ + 0xEA, 0x83, 0xEA, 0x84, 0xEA, 0x85, 0xDA, 0xEC, /* 0x5C-0x5F */ + 0xEA, 0x86, 0xB6, 0xB8, 0xD4, 0xBA, 0xEA, 0x87, /* 0x60-0x63 */ + 0xB3, 
0xFD, 0xEA, 0x88, 0xEA, 0x89, 0xDA, 0xED, /* 0x64-0x67 */ + 0xD4, 0xC9, 0xCF, 0xD5, 0xC5, 0xE3, 0xEA, 0x8A, /* 0x68-0x6B */ + 0xDA, 0xEE, 0xEA, 0x8B, 0xEA, 0x8C, 0xEA, 0x8D, /* 0x6C-0x6F */ + 0xEA, 0x8E, 0xEA, 0x8F, 0xDA, 0xEF, 0xEA, 0x90, /* 0x70-0x73 */ + 0xDA, 0xF0, 0xC1, 0xEA, 0xCC, 0xD5, 0xCF, 0xDD, /* 0x74-0x77 */ + 0xEA, 0x91, 0xEA, 0x92, 0xEA, 0x93, 0xEA, 0x94, /* 0x78-0x7B */ + 0xEA, 0x95, 0xEA, 0x96, 0xEA, 0x97, 0xEA, 0x98, /* 0x7C-0x7F */ + + 0xEA, 0x99, 0xEA, 0x9A, 0xEA, 0x9B, 0xEA, 0x9C, /* 0x80-0x83 */ + 0xEA, 0x9D, 0xD3, 0xE7, 0xC2, 0xA1, 0xEA, 0x9E, /* 0x84-0x87 */ + 0xDA, 0xF1, 0xEA, 0x9F, 0xEA, 0xA0, 0xCB, 0xE5, /* 0x88-0x8B */ + 0xEB, 0x40, 0xDA, 0xF2, 0xEB, 0x41, 0xCB, 0xE6, /* 0x8C-0x8F */ + 0xD2, 0xFE, 0xEB, 0x42, 0xEB, 0x43, 0xEB, 0x44, /* 0x90-0x93 */ + 0xB8, 0xF4, 0xEB, 0x45, 0xEB, 0x46, 0xDA, 0xF3, /* 0x94-0x97 */ + 0xB0, 0xAF, 0xCF, 0xB6, 0xEB, 0x47, 0xEB, 0x48, /* 0x98-0x9B */ + 0xD5, 0xCF, 0xEB, 0x49, 0xEB, 0x4A, 0xEB, 0x4B, /* 0x9C-0x9F */ + 0xEB, 0x4C, 0xEB, 0x4D, 0xEB, 0x4E, 0xEB, 0x4F, /* 0xA0-0xA3 */ + 0xEB, 0x50, 0xEB, 0x51, 0xEB, 0x52, 0xCB, 0xED, /* 0xA4-0xA7 */ + 0xEB, 0x53, 0xEB, 0x54, 0xEB, 0x55, 0xEB, 0x56, /* 0xA8-0xAB */ + 0xEB, 0x57, 0xEB, 0x58, 0xEB, 0x59, 0xEB, 0x5A, /* 0xAC-0xAF */ + 0xDA, 0xF4, 0xEB, 0x5B, 0xEB, 0x5C, 0xE3, 0xC4, /* 0xB0-0xB3 */ + 0xEB, 0x5D, 0xEB, 0x5E, 0xC1, 0xA5, 0xEB, 0x5F, /* 0xB4-0xB7 */ + 0xEB, 0x60, 0xF6, 0xBF, 0xEB, 0x61, 0xEB, 0x62, /* 0xB8-0xBB */ + 0xF6, 0xC0, 0xF6, 0xC1, 0xC4, 0xD1, 0xEB, 0x63, /* 0xBC-0xBF */ + 0xC8, 0xB8, 0xD1, 0xE3, 0xEB, 0x64, 0xEB, 0x65, /* 0xC0-0xC3 */ + 0xD0, 0xDB, 0xD1, 0xC5, 0xBC, 0xAF, 0xB9, 0xCD, /* 0xC4-0xC7 */ + 0xEB, 0x66, 0xEF, 0xF4, 0xEB, 0x67, 0xEB, 0x68, /* 0xC8-0xCB */ + 0xB4, 0xC6, 0xD3, 0xBA, 0xF6, 0xC2, 0xB3, 0xFB, /* 0xCC-0xCF */ + 0xEB, 0x69, 0xEB, 0x6A, 0xF6, 0xC3, 0xEB, 0x6B, /* 0xD0-0xD3 */ + 0xEB, 0x6C, 0xB5, 0xF1, 0xEB, 0x6D, 0xEB, 0x6E, /* 0xD4-0xD7 */ + 0xEB, 0x6F, 0xEB, 0x70, 0xEB, 0x71, 0xEB, 0x72, /* 0xD8-0xDB */ + 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x75, 0xEB, 0x76, /* 0xDC-0xDF */ + 0xF6, 0xC5, 0xEB, 0x77, 0xEB, 0x78, 0xEB, 0x79, /* 0xE0-0xE3 */ + 0xEB, 0x7A, 0xEB, 0x7B, 0xEB, 0x7C, 0xEB, 0x7D, /* 0xE4-0xE7 */ + 0xD3, 0xEA, 0xF6, 0xA7, 0xD1, 0xA9, 0xEB, 0x7E, /* 0xE8-0xEB */ + 0xEB, 0x80, 0xEB, 0x81, 0xEB, 0x82, 0xF6, 0xA9, /* 0xEC-0xEF */ + 0xEB, 0x83, 0xEB, 0x84, 0xEB, 0x85, 0xF6, 0xA8, /* 0xF0-0xF3 */ + 0xEB, 0x86, 0xEB, 0x87, 0xC1, 0xE3, 0xC0, 0xD7, /* 0xF4-0xF7 */ + 0xEB, 0x88, 0xB1, 0xA2, 0xEB, 0x89, 0xEB, 0x8A, /* 0xF8-0xFB */ + 0xEB, 0x8B, 0xEB, 0x8C, 0xCE, 0xED, 0xEB, 0x8D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_97[512] = { + 0xD0, 0xE8, 0xF6, 0xAB, 0xEB, 0x8E, 0xEB, 0x8F, /* 0x00-0x03 */ + 0xCF, 0xF6, 0xEB, 0x90, 0xF6, 0xAA, 0xD5, 0xF0, /* 0x04-0x07 */ + 0xF6, 0xAC, 0xC3, 0xB9, 0xEB, 0x91, 0xEB, 0x92, /* 0x08-0x0B */ + 0xEB, 0x93, 0xBB, 0xF4, 0xF6, 0xAE, 0xF6, 0xAD, /* 0x0C-0x0F */ + 0xEB, 0x94, 0xEB, 0x95, 0xEB, 0x96, 0xC4, 0xDE, /* 0x10-0x13 */ + 0xEB, 0x97, 0xEB, 0x98, 0xC1, 0xD8, 0xEB, 0x99, /* 0x14-0x17 */ + 0xEB, 0x9A, 0xEB, 0x9B, 0xEB, 0x9C, 0xEB, 0x9D, /* 0x18-0x1B */ + 0xCB, 0xAA, 0xEB, 0x9E, 0xCF, 0xBC, 0xEB, 0x9F, /* 0x1C-0x1F */ + 0xEB, 0xA0, 0xEC, 0x40, 0xEC, 0x41, 0xEC, 0x42, /* 0x20-0x23 */ + 0xEC, 0x43, 0xEC, 0x44, 0xEC, 0x45, 0xEC, 0x46, /* 0x24-0x27 */ + 0xEC, 0x47, 0xEC, 0x48, 0xF6, 0xAF, 0xEC, 0x49, /* 0x28-0x2B */ + 0xEC, 0x4A, 0xF6, 0xB0, 0xEC, 0x4B, 0xEC, 0x4C, /* 0x2C-0x2F */ + 0xF6, 0xB1, 0xEC, 0x4D, 0xC2, 0xB6, 0xEC, 0x4E, /* 0x30-0x33 */ + 0xEC, 0x4F, 0xEC, 0x50, 0xEC, 0x51, 0xEC, 0x52, /* 0x34-0x37 */ + 0xB0, 
0xD4, 0xC5, 0xF9, 0xEC, 0x53, 0xEC, 0x54, /* 0x38-0x3B */ + 0xEC, 0x55, 0xEC, 0x56, 0xF6, 0xB2, 0xEC, 0x57, /* 0x3C-0x3F */ + 0xEC, 0x58, 0xEC, 0x59, 0xEC, 0x5A, 0xEC, 0x5B, /* 0x40-0x43 */ + 0xEC, 0x5C, 0xEC, 0x5D, 0xEC, 0x5E, 0xEC, 0x5F, /* 0x44-0x47 */ + 0xEC, 0x60, 0xEC, 0x61, 0xEC, 0x62, 0xEC, 0x63, /* 0x48-0x4B */ + 0xEC, 0x64, 0xEC, 0x65, 0xEC, 0x66, 0xEC, 0x67, /* 0x4C-0x4F */ + 0xEC, 0x68, 0xEC, 0x69, 0xC7, 0xE0, 0xF6, 0xA6, /* 0x50-0x53 */ + 0xEC, 0x6A, 0xEC, 0x6B, 0xBE, 0xB8, 0xEC, 0x6C, /* 0x54-0x57 */ + 0xEC, 0x6D, 0xBE, 0xB2, 0xEC, 0x6E, 0xB5, 0xE5, /* 0x58-0x5B */ + 0xEC, 0x6F, 0xEC, 0x70, 0xB7, 0xC7, 0xEC, 0x71, /* 0x5C-0x5F */ + 0xBF, 0xBF, 0xC3, 0xD2, 0xC3, 0xE6, 0xEC, 0x72, /* 0x60-0x63 */ + 0xEC, 0x73, 0xD8, 0xCC, 0xEC, 0x74, 0xEC, 0x75, /* 0x64-0x67 */ + 0xEC, 0x76, 0xB8, 0xEF, 0xEC, 0x77, 0xEC, 0x78, /* 0x68-0x6B */ + 0xEC, 0x79, 0xEC, 0x7A, 0xEC, 0x7B, 0xEC, 0x7C, /* 0x6C-0x6F */ + 0xEC, 0x7D, 0xEC, 0x7E, 0xEC, 0x80, 0xBD, 0xF9, /* 0x70-0x73 */ + 0xD1, 0xA5, 0xEC, 0x81, 0xB0, 0xD0, 0xEC, 0x82, /* 0x74-0x77 */ + 0xEC, 0x83, 0xEC, 0x84, 0xEC, 0x85, 0xEC, 0x86, /* 0x78-0x7B */ + 0xF7, 0xB0, 0xEC, 0x87, 0xEC, 0x88, 0xEC, 0x89, /* 0x7C-0x7F */ + + 0xEC, 0x8A, 0xEC, 0x8B, 0xEC, 0x8C, 0xEC, 0x8D, /* 0x80-0x83 */ + 0xEC, 0x8E, 0xF7, 0xB1, 0xEC, 0x8F, 0xEC, 0x90, /* 0x84-0x87 */ + 0xEC, 0x91, 0xEC, 0x92, 0xEC, 0x93, 0xD0, 0xAC, /* 0x88-0x8B */ + 0xEC, 0x94, 0xB0, 0xB0, 0xEC, 0x95, 0xEC, 0x96, /* 0x8C-0x8F */ + 0xEC, 0x97, 0xF7, 0xB2, 0xF7, 0xB3, 0xEC, 0x98, /* 0x90-0x93 */ + 0xF7, 0xB4, 0xEC, 0x99, 0xEC, 0x9A, 0xEC, 0x9B, /* 0x94-0x97 */ + 0xC7, 0xCA, 0xEC, 0x9C, 0xEC, 0x9D, 0xEC, 0x9E, /* 0x98-0x9B */ + 0xEC, 0x9F, 0xEC, 0xA0, 0xED, 0x40, 0xED, 0x41, /* 0x9C-0x9F */ + 0xBE, 0xCF, 0xED, 0x42, 0xED, 0x43, 0xF7, 0xB7, /* 0xA0-0xA3 */ + 0xED, 0x44, 0xED, 0x45, 0xED, 0x46, 0xED, 0x47, /* 0xA4-0xA7 */ + 0xED, 0x48, 0xED, 0x49, 0xED, 0x4A, 0xF7, 0xB6, /* 0xA8-0xAB */ + 0xED, 0x4B, 0xB1, 0xDE, 0xED, 0x4C, 0xF7, 0xB5, /* 0xAC-0xAF */ + 0xED, 0x4D, 0xED, 0x4E, 0xF7, 0xB8, 0xED, 0x4F, /* 0xB0-0xB3 */ + 0xF7, 0xB9, 0xED, 0x50, 0xED, 0x51, 0xED, 0x52, /* 0xB4-0xB7 */ + 0xED, 0x53, 0xED, 0x54, 0xED, 0x55, 0xED, 0x56, /* 0xB8-0xBB */ + 0xED, 0x57, 0xED, 0x58, 0xED, 0x59, 0xED, 0x5A, /* 0xBC-0xBF */ + 0xED, 0x5B, 0xED, 0x5C, 0xED, 0x5D, 0xED, 0x5E, /* 0xC0-0xC3 */ + 0xED, 0x5F, 0xED, 0x60, 0xED, 0x61, 0xED, 0x62, /* 0xC4-0xC7 */ + 0xED, 0x63, 0xED, 0x64, 0xED, 0x65, 0xED, 0x66, /* 0xC8-0xCB */ + 0xED, 0x67, 0xED, 0x68, 0xED, 0x69, 0xED, 0x6A, /* 0xCC-0xCF */ + 0xED, 0x6B, 0xED, 0x6C, 0xED, 0x6D, 0xED, 0x6E, /* 0xD0-0xD3 */ + 0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xED, 0x72, /* 0xD4-0xD7 */ + 0xED, 0x73, 0xED, 0x74, 0xED, 0x75, 0xED, 0x76, /* 0xD8-0xDB */ + 0xED, 0x77, 0xED, 0x78, 0xED, 0x79, 0xED, 0x7A, /* 0xDC-0xDF */ + 0xED, 0x7B, 0xED, 0x7C, 0xED, 0x7D, 0xED, 0x7E, /* 0xE0-0xE3 */ + 0xED, 0x80, 0xED, 0x81, 0xCE, 0xA4, 0xC8, 0xCD, /* 0xE4-0xE7 */ + 0xED, 0x82, 0xBA, 0xAB, 0xE8, 0xB8, 0xE8, 0xB9, /* 0xE8-0xEB */ + 0xE8, 0xBA, 0xBE, 0xC2, 0xED, 0x83, 0xED, 0x84, /* 0xEC-0xEF */ + 0xED, 0x85, 0xED, 0x86, 0xED, 0x87, 0xD2, 0xF4, /* 0xF0-0xF3 */ + 0xED, 0x88, 0xD4, 0xCF, 0xC9, 0xD8, 0xED, 0x89, /* 0xF4-0xF7 */ + 0xED, 0x8A, 0xED, 0x8B, 0xED, 0x8C, 0xED, 0x8D, /* 0xF8-0xFB */ + 0xED, 0x8E, 0xED, 0x8F, 0xED, 0x90, 0xED, 0x91, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xED, 0x92, 0xED, 0x93, 0xED, 0x94, 0xED, 0x95, /* 0x00-0x03 */ + 0xED, 0x96, 0xED, 0x97, 0xED, 0x98, 0xED, 0x99, /* 0x04-0x07 */ + 0xED, 0x9A, 0xED, 0x9B, 0xED, 0x9C, 0xED, 0x9D, /* 0x08-0x0B */ + 0xED, 
0x9E, 0xED, 0x9F, 0xED, 0xA0, 0xEE, 0x40, /* 0x0C-0x0F */ + 0xEE, 0x41, 0xEE, 0x42, 0xEE, 0x43, 0xEE, 0x44, /* 0x10-0x13 */ + 0xEE, 0x45, 0xEE, 0x46, 0xEE, 0x47, 0xEE, 0x48, /* 0x14-0x17 */ + 0xEE, 0x49, 0xEE, 0x4A, 0xEE, 0x4B, 0xEE, 0x4C, /* 0x18-0x1B */ + 0xEE, 0x4D, 0xEE, 0x4E, 0xEE, 0x4F, 0xEE, 0x50, /* 0x1C-0x1F */ + 0xEE, 0x51, 0xEE, 0x52, 0xEE, 0x53, 0xEE, 0x54, /* 0x20-0x23 */ + 0xEE, 0x55, 0xEE, 0x56, 0xEE, 0x57, 0xEE, 0x58, /* 0x24-0x27 */ + 0xEE, 0x59, 0xEE, 0x5A, 0xEE, 0x5B, 0xEE, 0x5C, /* 0x28-0x2B */ + 0xEE, 0x5D, 0xEE, 0x5E, 0xEE, 0x5F, 0xEE, 0x60, /* 0x2C-0x2F */ + 0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x64, /* 0x30-0x33 */ + 0xEE, 0x65, 0xEE, 0x66, 0xEE, 0x67, 0xEE, 0x68, /* 0x34-0x37 */ + 0xEE, 0x69, 0xEE, 0x6A, 0xEE, 0x6B, 0xEE, 0x6C, /* 0x38-0x3B */ + 0xEE, 0x6D, 0xEE, 0x6E, 0xEE, 0x6F, 0xEE, 0x70, /* 0x3C-0x3F */ + 0xEE, 0x71, 0xEE, 0x72, 0xEE, 0x73, 0xEE, 0x74, /* 0x40-0x43 */ + 0xEE, 0x75, 0xEE, 0x76, 0xEE, 0x77, 0xEE, 0x78, /* 0x44-0x47 */ + 0xEE, 0x79, 0xEE, 0x7A, 0xEE, 0x7B, 0xEE, 0x7C, /* 0x48-0x4B */ + 0xEE, 0x7D, 0xEE, 0x7E, 0xEE, 0x80, 0xEE, 0x81, /* 0x4C-0x4F */ + 0xEE, 0x82, 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x85, /* 0x50-0x53 */ + 0xEE, 0x86, 0xEE, 0x87, 0xEE, 0x88, 0xEE, 0x89, /* 0x54-0x57 */ + 0xEE, 0x8A, 0xEE, 0x8B, 0xEE, 0x8C, 0xEE, 0x8D, /* 0x58-0x5B */ + 0xEE, 0x8E, 0xEE, 0x8F, 0xEE, 0x90, 0xEE, 0x91, /* 0x5C-0x5F */ + 0xEE, 0x92, 0xEE, 0x93, 0xEE, 0x94, 0xEE, 0x95, /* 0x60-0x63 */ + 0xEE, 0x96, 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x99, /* 0x64-0x67 */ + 0xEE, 0x9A, 0xEE, 0x9B, 0xEE, 0x9C, 0xEE, 0x9D, /* 0x68-0x6B */ + 0xEE, 0x9E, 0xEE, 0x9F, 0xEE, 0xA0, 0xEF, 0x40, /* 0x6C-0x6F */ + 0xEF, 0x41, 0xEF, 0x42, 0xEF, 0x43, 0xEF, 0x44, /* 0x70-0x73 */ + 0xEF, 0x45, 0xD2, 0xB3, 0xB6, 0xA5, 0xC7, 0xEA, /* 0x74-0x77 */ + 0xF1, 0xFC, 0xCF, 0xEE, 0xCB, 0xB3, 0xD0, 0xEB, /* 0x78-0x7B */ + 0xE7, 0xEF, 0xCD, 0xE7, 0xB9, 0xCB, 0xB6, 0xD9, /* 0x7C-0x7F */ + + 0xF1, 0xFD, 0xB0, 0xE4, 0xCB, 0xCC, 0xF1, 0xFE, /* 0x80-0x83 */ + 0xD4, 0xA4, 0xC2, 0xAD, 0xC1, 0xEC, 0xC6, 0xC4, /* 0x84-0x87 */ + 0xBE, 0xB1, 0xF2, 0xA1, 0xBC, 0xD5, 0xEF, 0x46, /* 0x88-0x8B */ + 0xF2, 0xA2, 0xF2, 0xA3, 0xEF, 0x47, 0xF2, 0xA4, /* 0x8C-0x8F */ + 0xD2, 0xC3, 0xC6, 0xB5, 0xEF, 0x48, 0xCD, 0xC7, /* 0x90-0x93 */ + 0xF2, 0xA5, 0xEF, 0x49, 0xD3, 0xB1, 0xBF, 0xC5, /* 0x94-0x97 */ + 0xCC, 0xE2, 0xEF, 0x4A, 0xF2, 0xA6, 0xF2, 0xA7, /* 0x98-0x9B */ + 0xD1, 0xD5, 0xB6, 0xEE, 0xF2, 0xA8, 0xF2, 0xA9, /* 0x9C-0x9F */ + 0xB5, 0xDF, 0xF2, 0xAA, 0xF2, 0xAB, 0xEF, 0x4B, /* 0xA0-0xA3 */ + 0xB2, 0xFC, 0xF2, 0xAC, 0xF2, 0xAD, 0xC8, 0xA7, /* 0xA4-0xA7 */ + 0xEF, 0x4C, 0xEF, 0x4D, 0xEF, 0x4E, 0xEF, 0x4F, /* 0xA8-0xAB */ + 0xEF, 0x50, 0xEF, 0x51, 0xEF, 0x52, 0xEF, 0x53, /* 0xAC-0xAF */ + 0xEF, 0x54, 0xEF, 0x55, 0xEF, 0x56, 0xEF, 0x57, /* 0xB0-0xB3 */ + 0xEF, 0x58, 0xEF, 0x59, 0xEF, 0x5A, 0xEF, 0x5B, /* 0xB4-0xB7 */ + 0xEF, 0x5C, 0xEF, 0x5D, 0xEF, 0x5E, 0xEF, 0x5F, /* 0xB8-0xBB */ + 0xEF, 0x60, 0xEF, 0x61, 0xEF, 0x62, 0xEF, 0x63, /* 0xBC-0xBF */ + 0xEF, 0x64, 0xEF, 0x65, 0xEF, 0x66, 0xEF, 0x67, /* 0xC0-0xC3 */ + 0xEF, 0x68, 0xEF, 0x69, 0xEF, 0x6A, 0xEF, 0x6B, /* 0xC4-0xC7 */ + 0xEF, 0x6C, 0xEF, 0x6D, 0xEF, 0x6E, 0xEF, 0x6F, /* 0xC8-0xCB */ + 0xEF, 0x70, 0xEF, 0x71, 0xB7, 0xE7, 0xEF, 0x72, /* 0xCC-0xCF */ + 0xEF, 0x73, 0xEC, 0xA9, 0xEC, 0xAA, 0xEC, 0xAB, /* 0xD0-0xD3 */ + 0xEF, 0x74, 0xEC, 0xAC, 0xEF, 0x75, 0xEF, 0x76, /* 0xD4-0xD7 */ + 0xC6, 0xAE, 0xEC, 0xAD, 0xEC, 0xAE, 0xEF, 0x77, /* 0xD8-0xDB */ + 0xEF, 0x78, 0xEF, 0x79, 0xB7, 0xC9, 0xCA, 0xB3, /* 0xDC-0xDF */ + 0xEF, 0x7A, 0xEF, 0x7B, 0xEF, 0x7C, 0xEF, 0x7D, /* 0xE0-0xE3 
*/ + 0xEF, 0x7E, 0xEF, 0x80, 0xEF, 0x81, 0xE2, 0xB8, /* 0xE4-0xE7 */ + 0xF7, 0xCF, 0xEF, 0x82, 0xEF, 0x83, 0xEF, 0x84, /* 0xE8-0xEB */ + 0xEF, 0x85, 0xEF, 0x86, 0xEF, 0x87, 0xEF, 0x88, /* 0xEC-0xEF */ + 0xEF, 0x89, 0xEF, 0x8A, 0xEF, 0x8B, 0xEF, 0x8C, /* 0xF0-0xF3 */ + 0xEF, 0x8D, 0xEF, 0x8E, 0xEF, 0x8F, 0xEF, 0x90, /* 0xF4-0xF7 */ + 0xEF, 0x91, 0xEF, 0x92, 0xEF, 0x93, 0xEF, 0x94, /* 0xF8-0xFB */ + 0xEF, 0x95, 0xEF, 0x96, 0xEF, 0x97, 0xEF, 0x98, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0xEF, 0x99, 0xEF, 0x9A, 0xEF, 0x9B, 0xEF, 0x9C, /* 0x00-0x03 */ + 0xEF, 0x9D, 0xEF, 0x9E, 0xEF, 0x9F, 0xEF, 0xA0, /* 0x04-0x07 */ + 0xF0, 0x40, 0xF0, 0x41, 0xF0, 0x42, 0xF0, 0x43, /* 0x08-0x0B */ + 0xF0, 0x44, 0xF7, 0xD0, 0xF0, 0x45, 0xF0, 0x46, /* 0x0C-0x0F */ + 0xB2, 0xCD, 0xF0, 0x47, 0xF0, 0x48, 0xF0, 0x49, /* 0x10-0x13 */ + 0xF0, 0x4A, 0xF0, 0x4B, 0xF0, 0x4C, 0xF0, 0x4D, /* 0x14-0x17 */ + 0xF0, 0x4E, 0xF0, 0x4F, 0xF0, 0x50, 0xF0, 0x51, /* 0x18-0x1B */ + 0xF0, 0x52, 0xF0, 0x53, 0xF0, 0x54, 0xF0, 0x55, /* 0x1C-0x1F */ + 0xF0, 0x56, 0xF0, 0x57, 0xF0, 0x58, 0xF0, 0x59, /* 0x20-0x23 */ + 0xF0, 0x5A, 0xF0, 0x5B, 0xF0, 0x5C, 0xF0, 0x5D, /* 0x24-0x27 */ + 0xF0, 0x5E, 0xF0, 0x5F, 0xF0, 0x60, 0xF0, 0x61, /* 0x28-0x2B */ + 0xF0, 0x62, 0xF0, 0x63, 0xF7, 0xD1, 0xF0, 0x64, /* 0x2C-0x2F */ + 0xF0, 0x65, 0xF0, 0x66, 0xF0, 0x67, 0xF0, 0x68, /* 0x30-0x33 */ + 0xF0, 0x69, 0xF0, 0x6A, 0xF0, 0x6B, 0xF0, 0x6C, /* 0x34-0x37 */ + 0xF0, 0x6D, 0xF0, 0x6E, 0xF0, 0x6F, 0xF0, 0x70, /* 0x38-0x3B */ + 0xF0, 0x71, 0xF0, 0x72, 0xF0, 0x73, 0xF0, 0x74, /* 0x3C-0x3F */ + 0xF0, 0x75, 0xF0, 0x76, 0xF0, 0x77, 0xF0, 0x78, /* 0x40-0x43 */ + 0xF0, 0x79, 0xF0, 0x7A, 0xF0, 0x7B, 0xF0, 0x7C, /* 0x44-0x47 */ + 0xF0, 0x7D, 0xF0, 0x7E, 0xF0, 0x80, 0xF0, 0x81, /* 0x48-0x4B */ + 0xF0, 0x82, 0xF0, 0x83, 0xF0, 0x84, 0xF0, 0x85, /* 0x4C-0x4F */ + 0xF0, 0x86, 0xF0, 0x87, 0xF0, 0x88, 0xF0, 0x89, /* 0x50-0x53 */ + 0xF7, 0xD3, 0xF7, 0xD2, 0xF0, 0x8A, 0xF0, 0x8B, /* 0x54-0x57 */ + 0xF0, 0x8C, 0xF0, 0x8D, 0xF0, 0x8E, 0xF0, 0x8F, /* 0x58-0x5B */ + 0xF0, 0x90, 0xF0, 0x91, 0xF0, 0x92, 0xF0, 0x93, /* 0x5C-0x5F */ + 0xF0, 0x94, 0xF0, 0x95, 0xF0, 0x96, 0xE2, 0xBB, /* 0x60-0x63 */ + 0xF0, 0x97, 0xBC, 0xA2, 0xF0, 0x98, 0xE2, 0xBC, /* 0x64-0x67 */ + 0xE2, 0xBD, 0xE2, 0xBE, 0xE2, 0xBF, 0xE2, 0xC0, /* 0x68-0x6B */ + 0xE2, 0xC1, 0xB7, 0xB9, 0xD2, 0xFB, 0xBD, 0xA4, /* 0x6C-0x6F */ + 0xCA, 0xCE, 0xB1, 0xA5, 0xCB, 0xC7, 0xF0, 0x99, /* 0x70-0x73 */ + 0xE2, 0xC2, 0xB6, 0xFC, 0xC8, 0xC4, 0xE2, 0xC3, /* 0x74-0x77 */ + 0xF0, 0x9A, 0xF0, 0x9B, 0xBD, 0xC8, 0xF0, 0x9C, /* 0x78-0x7B */ + 0xB1, 0xFD, 0xE2, 0xC4, 0xF0, 0x9D, 0xB6, 0xF6, /* 0x7C-0x7F */ + + 0xE2, 0xC5, 0xC4, 0xD9, 0xF0, 0x9E, 0xF0, 0x9F, /* 0x80-0x83 */ + 0xE2, 0xC6, 0xCF, 0xDA, 0xB9, 0xDD, 0xE2, 0xC7, /* 0x84-0x87 */ + 0xC0, 0xA1, 0xF0, 0xA0, 0xE2, 0xC8, 0xB2, 0xF6, /* 0x88-0x8B */ + 0xF1, 0x40, 0xE2, 0xC9, 0xF1, 0x41, 0xC1, 0xF3, /* 0x8C-0x8F */ + 0xE2, 0xCA, 0xE2, 0xCB, 0xC2, 0xF8, 0xE2, 0xCC, /* 0x90-0x93 */ + 0xE2, 0xCD, 0xE2, 0xCE, 0xCA, 0xD7, 0xD8, 0xB8, /* 0x94-0x97 */ + 0xD9, 0xE5, 0xCF, 0xE3, 0xF1, 0x42, 0xF1, 0x43, /* 0x98-0x9B */ + 0xF1, 0x44, 0xF1, 0x45, 0xF1, 0x46, 0xF1, 0x47, /* 0x9C-0x9F */ + 0xF1, 0x48, 0xF1, 0x49, 0xF1, 0x4A, 0xF1, 0x4B, /* 0xA0-0xA3 */ + 0xF1, 0x4C, 0xF0, 0xA5, 0xF1, 0x4D, 0xF1, 0x4E, /* 0xA4-0xA7 */ + 0xDC, 0xB0, 0xF1, 0x4F, 0xF1, 0x50, 0xF1, 0x51, /* 0xA8-0xAB */ + 0xF1, 0x52, 0xF1, 0x53, 0xF1, 0x54, 0xF1, 0x55, /* 0xAC-0xAF */ + 0xF1, 0x56, 0xF1, 0x57, 0xF1, 0x58, 0xF1, 0x59, /* 0xB0-0xB3 */ + 0xF1, 0x5A, 0xF1, 0x5B, 0xF1, 0x5C, 0xF1, 0x5D, /* 0xB4-0xB7 */ + 
0xF1, 0x5E, 0xF1, 0x5F, 0xF1, 0x60, 0xF1, 0x61, /* 0xB8-0xBB */ + 0xF1, 0x62, 0xF1, 0x63, 0xF1, 0x64, 0xF1, 0x65, /* 0xBC-0xBF */ + 0xF1, 0x66, 0xF1, 0x67, 0xF1, 0x68, 0xF1, 0x69, /* 0xC0-0xC3 */ + 0xF1, 0x6A, 0xF1, 0x6B, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xC4-0xC7 */ + 0xF1, 0x6E, 0xF1, 0x6F, 0xF1, 0x70, 0xF1, 0x71, /* 0xC8-0xCB */ + 0xF1, 0x72, 0xF1, 0x73, 0xF1, 0x74, 0xF1, 0x75, /* 0xCC-0xCF */ + 0xF1, 0x76, 0xF1, 0x77, 0xF1, 0x78, 0xF1, 0x79, /* 0xD0-0xD3 */ + 0xF1, 0x7A, 0xF1, 0x7B, 0xF1, 0x7C, 0xF1, 0x7D, /* 0xD4-0xD7 */ + 0xF1, 0x7E, 0xF1, 0x80, 0xF1, 0x81, 0xF1, 0x82, /* 0xD8-0xDB */ + 0xF1, 0x83, 0xF1, 0x84, 0xF1, 0x85, 0xF1, 0x86, /* 0xDC-0xDF */ + 0xF1, 0x87, 0xF1, 0x88, 0xF1, 0x89, 0xF1, 0x8A, /* 0xE0-0xE3 */ + 0xF1, 0x8B, 0xF1, 0x8C, 0xF1, 0x8D, 0xF1, 0x8E, /* 0xE4-0xE7 */ + 0xF1, 0x8F, 0xF1, 0x90, 0xF1, 0x91, 0xF1, 0x92, /* 0xE8-0xEB */ + 0xF1, 0x93, 0xF1, 0x94, 0xF1, 0x95, 0xF1, 0x96, /* 0xEC-0xEF */ + 0xF1, 0x97, 0xF1, 0x98, 0xF1, 0x99, 0xF1, 0x9A, /* 0xF0-0xF3 */ + 0xF1, 0x9B, 0xF1, 0x9C, 0xF1, 0x9D, 0xF1, 0x9E, /* 0xF4-0xF7 */ + 0xF1, 0x9F, 0xF1, 0xA0, 0xF2, 0x40, 0xF2, 0x41, /* 0xF8-0xFB */ + 0xF2, 0x42, 0xF2, 0x43, 0xF2, 0x44, 0xF2, 0x45, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0xF2, 0x46, 0xF2, 0x47, 0xF2, 0x48, 0xF2, 0x49, /* 0x00-0x03 */ + 0xF2, 0x4A, 0xF2, 0x4B, 0xF2, 0x4C, 0xF2, 0x4D, /* 0x04-0x07 */ + 0xF2, 0x4E, 0xF2, 0x4F, 0xF2, 0x50, 0xF2, 0x51, /* 0x08-0x0B */ + 0xF2, 0x52, 0xF2, 0x53, 0xF2, 0x54, 0xF2, 0x55, /* 0x0C-0x0F */ + 0xF2, 0x56, 0xF2, 0x57, 0xF2, 0x58, 0xF2, 0x59, /* 0x10-0x13 */ + 0xF2, 0x5A, 0xF2, 0x5B, 0xF2, 0x5C, 0xF2, 0x5D, /* 0x14-0x17 */ + 0xF2, 0x5E, 0xF2, 0x5F, 0xF2, 0x60, 0xF2, 0x61, /* 0x18-0x1B */ + 0xF2, 0x62, 0xF2, 0x63, 0xF2, 0x64, 0xF2, 0x65, /* 0x1C-0x1F */ + 0xF2, 0x66, 0xF2, 0x67, 0xF2, 0x68, 0xF2, 0x69, /* 0x20-0x23 */ + 0xF2, 0x6A, 0xF2, 0x6B, 0xF2, 0x6C, 0xF2, 0x6D, /* 0x24-0x27 */ + 0xF2, 0x6E, 0xF2, 0x6F, 0xF2, 0x70, 0xF2, 0x71, /* 0x28-0x2B */ + 0xF2, 0x72, 0xF2, 0x73, 0xF2, 0x74, 0xF2, 0x75, /* 0x2C-0x2F */ + 0xF2, 0x76, 0xF2, 0x77, 0xF2, 0x78, 0xF2, 0x79, /* 0x30-0x33 */ + 0xF2, 0x7A, 0xF2, 0x7B, 0xF2, 0x7C, 0xF2, 0x7D, /* 0x34-0x37 */ + 0xF2, 0x7E, 0xF2, 0x80, 0xF2, 0x81, 0xF2, 0x82, /* 0x38-0x3B */ + 0xF2, 0x83, 0xF2, 0x84, 0xF2, 0x85, 0xF2, 0x86, /* 0x3C-0x3F */ + 0xF2, 0x87, 0xF2, 0x88, 0xF2, 0x89, 0xF2, 0x8A, /* 0x40-0x43 */ + 0xF2, 0x8B, 0xF2, 0x8C, 0xF2, 0x8D, 0xF2, 0x8E, /* 0x44-0x47 */ + 0xF2, 0x8F, 0xF2, 0x90, 0xF2, 0x91, 0xF2, 0x92, /* 0x48-0x4B */ + 0xF2, 0x93, 0xF2, 0x94, 0xF2, 0x95, 0xF2, 0x96, /* 0x4C-0x4F */ + 0xF2, 0x97, 0xF2, 0x98, 0xF2, 0x99, 0xF2, 0x9A, /* 0x50-0x53 */ + 0xF2, 0x9B, 0xF2, 0x9C, 0xF2, 0x9D, 0xF2, 0x9E, /* 0x54-0x57 */ + 0xF2, 0x9F, 0xF2, 0xA0, 0xF3, 0x40, 0xF3, 0x41, /* 0x58-0x5B */ + 0xF3, 0x42, 0xF3, 0x43, 0xF3, 0x44, 0xF3, 0x45, /* 0x5C-0x5F */ + 0xF3, 0x46, 0xF3, 0x47, 0xF3, 0x48, 0xF3, 0x49, /* 0x60-0x63 */ + 0xF3, 0x4A, 0xF3, 0x4B, 0xF3, 0x4C, 0xF3, 0x4D, /* 0x64-0x67 */ + 0xF3, 0x4E, 0xF3, 0x4F, 0xF3, 0x50, 0xF3, 0x51, /* 0x68-0x6B */ + 0xC2, 0xED, 0xD4, 0xA6, 0xCD, 0xD4, 0xD1, 0xB1, /* 0x6C-0x6F */ + 0xB3, 0xDB, 0xC7, 0xFD, 0xF3, 0x52, 0xB2, 0xB5, /* 0x70-0x73 */ + 0xC2, 0xBF, 0xE6, 0xE0, 0xCA, 0xBB, 0xE6, 0xE1, /* 0x74-0x77 */ + 0xE6, 0xE2, 0xBE, 0xD4, 0xE6, 0xE3, 0xD7, 0xA4, /* 0x78-0x7B */ + 0xCD, 0xD5, 0xE6, 0xE5, 0xBC, 0xDD, 0xE6, 0xE4, /* 0x7C-0x7F */ + + 0xE6, 0xE6, 0xE6, 0xE7, 0xC2, 0xEE, 0xF3, 0x53, /* 0x80-0x83 */ + 0xBD, 0xBE, 0xE6, 0xE8, 0xC2, 0xE6, 0xBA, 0xA7, /* 0x84-0x87 */ + 0xE6, 0xE9, 0xF3, 0x54, 0xE6, 0xEA, 0xB3, 0xD2, /* 0x88-0x8B */ + 
0xD1, 0xE9, 0xF3, 0x55, 0xF3, 0x56, 0xBF, 0xA5, /* 0x8C-0x8F */ + 0xE6, 0xEB, 0xC6, 0xEF, 0xE6, 0xEC, 0xE6, 0xED, /* 0x90-0x93 */ + 0xF3, 0x57, 0xF3, 0x58, 0xE6, 0xEE, 0xC6, 0xAD, /* 0x94-0x97 */ + 0xE6, 0xEF, 0xF3, 0x59, 0xC9, 0xA7, 0xE6, 0xF0, /* 0x98-0x9B */ + 0xE6, 0xF1, 0xE6, 0xF2, 0xE5, 0xB9, 0xE6, 0xF3, /* 0x9C-0x9F */ + 0xE6, 0xF4, 0xC2, 0xE2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */ + 0xD6, 0xE8, 0xE6, 0xF7, 0xF3, 0x5A, 0xE6, 0xF8, /* 0xA4-0xA7 */ + 0xB9, 0xC7, 0xF3, 0x5B, 0xF3, 0x5C, 0xF3, 0x5D, /* 0xA8-0xAB */ + 0xF3, 0x5E, 0xF3, 0x5F, 0xF3, 0x60, 0xF3, 0x61, /* 0xAC-0xAF */ + 0xF7, 0xBB, 0xF7, 0xBA, 0xF3, 0x62, 0xF3, 0x63, /* 0xB0-0xB3 */ + 0xF3, 0x64, 0xF3, 0x65, 0xF7, 0xBE, 0xF7, 0xBC, /* 0xB4-0xB7 */ + 0xBA, 0xA1, 0xF3, 0x66, 0xF7, 0xBF, 0xF3, 0x67, /* 0xB8-0xBB */ + 0xF7, 0xC0, 0xF3, 0x68, 0xF3, 0x69, 0xF3, 0x6A, /* 0xBC-0xBF */ + 0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xC4, 0xF3, 0x6B, /* 0xC0-0xC3 */ + 0xF3, 0x6C, 0xF7, 0xC3, 0xF3, 0x6D, 0xF3, 0x6E, /* 0xC4-0xC7 */ + 0xF3, 0x6F, 0xF3, 0x70, 0xF3, 0x71, 0xF7, 0xC5, /* 0xC8-0xCB */ + 0xF7, 0xC6, 0xF3, 0x72, 0xF3, 0x73, 0xF3, 0x74, /* 0xCC-0xCF */ + 0xF3, 0x75, 0xF7, 0xC7, 0xF3, 0x76, 0xCB, 0xE8, /* 0xD0-0xD3 */ + 0xF3, 0x77, 0xF3, 0x78, 0xF3, 0x79, 0xF3, 0x7A, /* 0xD4-0xD7 */ + 0xB8, 0xDF, 0xF3, 0x7B, 0xF3, 0x7C, 0xF3, 0x7D, /* 0xD8-0xDB */ + 0xF3, 0x7E, 0xF3, 0x80, 0xF3, 0x81, 0xF7, 0xD4, /* 0xDC-0xDF */ + 0xF3, 0x82, 0xF7, 0xD5, 0xF3, 0x83, 0xF3, 0x84, /* 0xE0-0xE3 */ + 0xF3, 0x85, 0xF3, 0x86, 0xF7, 0xD6, 0xF3, 0x87, /* 0xE4-0xE7 */ + 0xF3, 0x88, 0xF3, 0x89, 0xF3, 0x8A, 0xF7, 0xD8, /* 0xE8-0xEB */ + 0xF3, 0x8B, 0xF7, 0xDA, 0xF3, 0x8C, 0xF7, 0xD7, /* 0xEC-0xEF */ + 0xF3, 0x8D, 0xF3, 0x8E, 0xF3, 0x8F, 0xF3, 0x90, /* 0xF0-0xF3 */ + 0xF3, 0x91, 0xF3, 0x92, 0xF3, 0x93, 0xF3, 0x94, /* 0xF4-0xF7 */ + 0xF3, 0x95, 0xF7, 0xDB, 0xF3, 0x96, 0xF7, 0xD9, /* 0xF8-0xFB */ + 0xF3, 0x97, 0xF3, 0x98, 0xF3, 0x99, 0xF3, 0x9A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9B[512] = { + 0xF3, 0x9B, 0xF3, 0x9C, 0xF3, 0x9D, 0xD7, 0xD7, /* 0x00-0x03 */ + 0xF3, 0x9E, 0xF3, 0x9F, 0xF3, 0xA0, 0xF4, 0x40, /* 0x04-0x07 */ + 0xF7, 0xDC, 0xF4, 0x41, 0xF4, 0x42, 0xF4, 0x43, /* 0x08-0x0B */ + 0xF4, 0x44, 0xF4, 0x45, 0xF4, 0x46, 0xF7, 0xDD, /* 0x0C-0x0F */ + 0xF4, 0x47, 0xF4, 0x48, 0xF4, 0x49, 0xF7, 0xDE, /* 0x10-0x13 */ + 0xF4, 0x4A, 0xF4, 0x4B, 0xF4, 0x4C, 0xF4, 0x4D, /* 0x14-0x17 */ + 0xF4, 0x4E, 0xF4, 0x4F, 0xF4, 0x50, 0xF4, 0x51, /* 0x18-0x1B */ + 0xF4, 0x52, 0xF4, 0x53, 0xF4, 0x54, 0xF7, 0xDF, /* 0x1C-0x1F */ + 0xF4, 0x55, 0xF4, 0x56, 0xF4, 0x57, 0xF7, 0xE0, /* 0x20-0x23 */ + 0xF4, 0x58, 0xF4, 0x59, 0xF4, 0x5A, 0xF4, 0x5B, /* 0x24-0x27 */ + 0xF4, 0x5C, 0xF4, 0x5D, 0xF4, 0x5E, 0xF4, 0x5F, /* 0x28-0x2B */ + 0xF4, 0x60, 0xF4, 0x61, 0xF4, 0x62, 0xDB, 0xCB, /* 0x2C-0x2F */ + 0xF4, 0x63, 0xF4, 0x64, 0xD8, 0xAA, 0xF4, 0x65, /* 0x30-0x33 */ + 0xF4, 0x66, 0xF4, 0x67, 0xF4, 0x68, 0xF4, 0x69, /* 0x34-0x37 */ + 0xF4, 0x6A, 0xF4, 0x6B, 0xF4, 0x6C, 0xE5, 0xF7, /* 0x38-0x3B */ + 0xB9, 0xED, 0xF4, 0x6D, 0xF4, 0x6E, 0xF4, 0x6F, /* 0x3C-0x3F */ + 0xF4, 0x70, 0xBF, 0xFD, 0xBB, 0xEA, 0xF7, 0xC9, /* 0x40-0x43 */ + 0xC6, 0xC7, 0xF7, 0xC8, 0xF4, 0x71, 0xF7, 0xCA, /* 0x44-0x47 */ + 0xF7, 0xCC, 0xF7, 0xCB, 0xF4, 0x72, 0xF4, 0x73, /* 0x48-0x4B */ + 0xF4, 0x74, 0xF7, 0xCD, 0xF4, 0x75, 0xCE, 0xBA, /* 0x4C-0x4F */ + 0xF4, 0x76, 0xF7, 0xCE, 0xF4, 0x77, 0xF4, 0x78, /* 0x50-0x53 */ + 0xC4, 0xA7, 0xF4, 0x79, 0xF4, 0x7A, 0xF4, 0x7B, /* 0x54-0x57 */ + 0xF4, 0x7C, 0xF4, 0x7D, 0xF4, 0x7E, 0xF4, 0x80, /* 0x58-0x5B */ + 0xF4, 0x81, 0xF4, 0x82, 0xF4, 0x83, 0xF4, 0x84, /* 0x5C-0x5F */ + 0xF4, 
0x85, 0xF4, 0x86, 0xF4, 0x87, 0xF4, 0x88, /* 0x60-0x63 */ + 0xF4, 0x89, 0xF4, 0x8A, 0xF4, 0x8B, 0xF4, 0x8C, /* 0x64-0x67 */ + 0xF4, 0x8D, 0xF4, 0x8E, 0xF4, 0x8F, 0xF4, 0x90, /* 0x68-0x6B */ + 0xF4, 0x91, 0xF4, 0x92, 0xF4, 0x93, 0xF4, 0x94, /* 0x6C-0x6F */ + 0xF4, 0x95, 0xF4, 0x96, 0xF4, 0x97, 0xF4, 0x98, /* 0x70-0x73 */ + 0xF4, 0x99, 0xF4, 0x9A, 0xF4, 0x9B, 0xF4, 0x9C, /* 0x74-0x77 */ + 0xF4, 0x9D, 0xF4, 0x9E, 0xF4, 0x9F, 0xF4, 0xA0, /* 0x78-0x7B */ + 0xF5, 0x40, 0xF5, 0x41, 0xF5, 0x42, 0xF5, 0x43, /* 0x7C-0x7F */ + + 0xF5, 0x44, 0xF5, 0x45, 0xF5, 0x46, 0xF5, 0x47, /* 0x80-0x83 */ + 0xF5, 0x48, 0xF5, 0x49, 0xF5, 0x4A, 0xF5, 0x4B, /* 0x84-0x87 */ + 0xF5, 0x4C, 0xF5, 0x4D, 0xF5, 0x4E, 0xF5, 0x4F, /* 0x88-0x8B */ + 0xF5, 0x50, 0xF5, 0x51, 0xF5, 0x52, 0xF5, 0x53, /* 0x8C-0x8F */ + 0xF5, 0x54, 0xF5, 0x55, 0xF5, 0x56, 0xF5, 0x57, /* 0x90-0x93 */ + 0xF5, 0x58, 0xF5, 0x59, 0xF5, 0x5A, 0xF5, 0x5B, /* 0x94-0x97 */ + 0xF5, 0x5C, 0xF5, 0x5D, 0xF5, 0x5E, 0xF5, 0x5F, /* 0x98-0x9B */ + 0xF5, 0x60, 0xF5, 0x61, 0xF5, 0x62, 0xF5, 0x63, /* 0x9C-0x9F */ + 0xF5, 0x64, 0xF5, 0x65, 0xF5, 0x66, 0xF5, 0x67, /* 0xA0-0xA3 */ + 0xF5, 0x68, 0xF5, 0x69, 0xF5, 0x6A, 0xF5, 0x6B, /* 0xA4-0xA7 */ + 0xF5, 0x6C, 0xF5, 0x6D, 0xF5, 0x6E, 0xF5, 0x6F, /* 0xA8-0xAB */ + 0xF5, 0x70, 0xF5, 0x71, 0xF5, 0x72, 0xF5, 0x73, /* 0xAC-0xAF */ + 0xF5, 0x74, 0xF5, 0x75, 0xF5, 0x76, 0xF5, 0x77, /* 0xB0-0xB3 */ + 0xF5, 0x78, 0xF5, 0x79, 0xF5, 0x7A, 0xF5, 0x7B, /* 0xB4-0xB7 */ + 0xF5, 0x7C, 0xF5, 0x7D, 0xF5, 0x7E, 0xF5, 0x80, /* 0xB8-0xBB */ + 0xF5, 0x81, 0xF5, 0x82, 0xF5, 0x83, 0xF5, 0x84, /* 0xBC-0xBF */ + 0xF5, 0x85, 0xF5, 0x86, 0xF5, 0x87, 0xF5, 0x88, /* 0xC0-0xC3 */ + 0xF5, 0x89, 0xF5, 0x8A, 0xF5, 0x8B, 0xF5, 0x8C, /* 0xC4-0xC7 */ + 0xF5, 0x8D, 0xF5, 0x8E, 0xF5, 0x8F, 0xF5, 0x90, /* 0xC8-0xCB */ + 0xF5, 0x91, 0xF5, 0x92, 0xF5, 0x93, 0xF5, 0x94, /* 0xCC-0xCF */ + 0xF5, 0x95, 0xF5, 0x96, 0xF5, 0x97, 0xF5, 0x98, /* 0xD0-0xD3 */ + 0xF5, 0x99, 0xF5, 0x9A, 0xF5, 0x9B, 0xF5, 0x9C, /* 0xD4-0xD7 */ + 0xF5, 0x9D, 0xF5, 0x9E, 0xF5, 0x9F, 0xF5, 0xA0, /* 0xD8-0xDB */ + 0xF6, 0x40, 0xF6, 0x41, 0xF6, 0x42, 0xF6, 0x43, /* 0xDC-0xDF */ + 0xF6, 0x44, 0xF6, 0x45, 0xF6, 0x46, 0xF6, 0x47, /* 0xE0-0xE3 */ + 0xF6, 0x48, 0xF6, 0x49, 0xF6, 0x4A, 0xF6, 0x4B, /* 0xE4-0xE7 */ + 0xF6, 0x4C, 0xF6, 0x4D, 0xF6, 0x4E, 0xF6, 0x4F, /* 0xE8-0xEB */ + 0xF6, 0x50, 0xF6, 0x51, 0xF6, 0x52, 0xF6, 0x53, /* 0xEC-0xEF */ + 0xF6, 0x54, 0xF6, 0x55, 0xF6, 0x56, 0xF6, 0x57, /* 0xF0-0xF3 */ + 0xF6, 0x58, 0xF6, 0x59, 0xF6, 0x5A, 0xF6, 0x5B, /* 0xF4-0xF7 */ + 0xF6, 0x5C, 0xF6, 0x5D, 0xF6, 0x5E, 0xF6, 0x5F, /* 0xF8-0xFB */ + 0xF6, 0x60, 0xF6, 0x61, 0xF6, 0x62, 0xF6, 0x63, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9C[512] = { + 0xF6, 0x64, 0xF6, 0x65, 0xF6, 0x66, 0xF6, 0x67, /* 0x00-0x03 */ + 0xF6, 0x68, 0xF6, 0x69, 0xF6, 0x6A, 0xF6, 0x6B, /* 0x04-0x07 */ + 0xF6, 0x6C, 0xF6, 0x6D, 0xF6, 0x6E, 0xF6, 0x6F, /* 0x08-0x0B */ + 0xF6, 0x70, 0xF6, 0x71, 0xF6, 0x72, 0xF6, 0x73, /* 0x0C-0x0F */ + 0xF6, 0x74, 0xF6, 0x75, 0xF6, 0x76, 0xF6, 0x77, /* 0x10-0x13 */ + 0xF6, 0x78, 0xF6, 0x79, 0xF6, 0x7A, 0xF6, 0x7B, /* 0x14-0x17 */ + 0xF6, 0x7C, 0xF6, 0x7D, 0xF6, 0x7E, 0xF6, 0x80, /* 0x18-0x1B */ + 0xF6, 0x81, 0xF6, 0x82, 0xF6, 0x83, 0xF6, 0x84, /* 0x1C-0x1F */ + 0xF6, 0x85, 0xF6, 0x86, 0xF6, 0x87, 0xF6, 0x88, /* 0x20-0x23 */ + 0xF6, 0x89, 0xF6, 0x8A, 0xF6, 0x8B, 0xF6, 0x8C, /* 0x24-0x27 */ + 0xF6, 0x8D, 0xF6, 0x8E, 0xF6, 0x8F, 0xF6, 0x90, /* 0x28-0x2B */ + 0xF6, 0x91, 0xF6, 0x92, 0xF6, 0x93, 0xF6, 0x94, /* 0x2C-0x2F */ + 0xF6, 0x95, 0xF6, 0x96, 0xF6, 0x97, 0xF6, 0x98, /* 0x30-0x33 */ + 0xF6, 
0x99, 0xF6, 0x9A, 0xF6, 0x9B, 0xF6, 0x9C, /* 0x34-0x37 */ + 0xF6, 0x9D, 0xF6, 0x9E, 0xF6, 0x9F, 0xF6, 0xA0, /* 0x38-0x3B */ + 0xF7, 0x40, 0xF7, 0x41, 0xF7, 0x42, 0xF7, 0x43, /* 0x3C-0x3F */ + 0xF7, 0x44, 0xF7, 0x45, 0xF7, 0x46, 0xF7, 0x47, /* 0x40-0x43 */ + 0xF7, 0x48, 0xF7, 0x49, 0xF7, 0x4A, 0xF7, 0x4B, /* 0x44-0x47 */ + 0xF7, 0x4C, 0xF7, 0x4D, 0xF7, 0x4E, 0xF7, 0x4F, /* 0x48-0x4B */ + 0xF7, 0x50, 0xF7, 0x51, 0xF7, 0x52, 0xF7, 0x53, /* 0x4C-0x4F */ + 0xF7, 0x54, 0xF7, 0x55, 0xF7, 0x56, 0xF7, 0x57, /* 0x50-0x53 */ + 0xF7, 0x58, 0xF7, 0x59, 0xF7, 0x5A, 0xF7, 0x5B, /* 0x54-0x57 */ + 0xF7, 0x5C, 0xF7, 0x5D, 0xF7, 0x5E, 0xF7, 0x5F, /* 0x58-0x5B */ + 0xF7, 0x60, 0xF7, 0x61, 0xF7, 0x62, 0xF7, 0x63, /* 0x5C-0x5F */ + 0xF7, 0x64, 0xF7, 0x65, 0xF7, 0x66, 0xF7, 0x67, /* 0x60-0x63 */ + 0xF7, 0x68, 0xF7, 0x69, 0xF7, 0x6A, 0xF7, 0x6B, /* 0x64-0x67 */ + 0xF7, 0x6C, 0xF7, 0x6D, 0xF7, 0x6E, 0xF7, 0x6F, /* 0x68-0x6B */ + 0xF7, 0x70, 0xF7, 0x71, 0xF7, 0x72, 0xF7, 0x73, /* 0x6C-0x6F */ + 0xF7, 0x74, 0xF7, 0x75, 0xF7, 0x76, 0xF7, 0x77, /* 0x70-0x73 */ + 0xF7, 0x78, 0xF7, 0x79, 0xF7, 0x7A, 0xF7, 0x7B, /* 0x74-0x77 */ + 0xF7, 0x7C, 0xF7, 0x7D, 0xF7, 0x7E, 0xF7, 0x80, /* 0x78-0x7B */ + 0xD3, 0xE3, 0xF7, 0x81, 0xF7, 0x82, 0xF6, 0xCF, /* 0x7C-0x7F */ + + 0xF7, 0x83, 0xC2, 0xB3, 0xF6, 0xD0, 0xF7, 0x84, /* 0x80-0x83 */ + 0xF7, 0x85, 0xF6, 0xD1, 0xF6, 0xD2, 0xF6, 0xD3, /* 0x84-0x87 */ + 0xF6, 0xD4, 0xF7, 0x86, 0xF7, 0x87, 0xF6, 0xD6, /* 0x88-0x8B */ + 0xF7, 0x88, 0xB1, 0xAB, 0xF6, 0xD7, 0xF7, 0x89, /* 0x8C-0x8F */ + 0xF6, 0xD8, 0xF6, 0xD9, 0xF6, 0xDA, 0xF7, 0x8A, /* 0x90-0x93 */ + 0xF6, 0xDB, 0xF6, 0xDC, 0xF7, 0x8B, 0xF7, 0x8C, /* 0x94-0x97 */ + 0xF7, 0x8D, 0xF7, 0x8E, 0xF6, 0xDD, 0xF6, 0xDE, /* 0x98-0x9B */ + 0xCF, 0xCA, 0xF7, 0x8F, 0xF6, 0xDF, 0xF6, 0xE0, /* 0x9C-0x9F */ + 0xF6, 0xE1, 0xF6, 0xE2, 0xF6, 0xE3, 0xF6, 0xE4, /* 0xA0-0xA3 */ + 0xC0, 0xF0, 0xF6, 0xE5, 0xF6, 0xE6, 0xF6, 0xE7, /* 0xA4-0xA7 */ + 0xF6, 0xE8, 0xF6, 0xE9, 0xF7, 0x90, 0xF6, 0xEA, /* 0xA8-0xAB */ + 0xF7, 0x91, 0xF6, 0xEB, 0xF6, 0xEC, 0xF7, 0x92, /* 0xAC-0xAF */ + 0xF6, 0xED, 0xF6, 0xEE, 0xF6, 0xEF, 0xF6, 0xF0, /* 0xB0-0xB3 */ + 0xF6, 0xF1, 0xF6, 0xF2, 0xF6, 0xF3, 0xF6, 0xF4, /* 0xB4-0xB7 */ + 0xBE, 0xA8, 0xF7, 0x93, 0xF6, 0xF5, 0xF6, 0xF6, /* 0xB8-0xBB */ + 0xF6, 0xF7, 0xF6, 0xF8, 0xF7, 0x94, 0xF7, 0x95, /* 0xBC-0xBF */ + 0xF7, 0x96, 0xF7, 0x97, 0xF7, 0x98, 0xC8, 0xFA, /* 0xC0-0xC3 */ + 0xF6, 0xF9, 0xF6, 0xFA, 0xF6, 0xFB, 0xF6, 0xFC, /* 0xC4-0xC7 */ + 0xF7, 0x99, 0xF7, 0x9A, 0xF6, 0xFD, 0xF6, 0xFE, /* 0xC8-0xCB */ + 0xF7, 0xA1, 0xF7, 0xA2, 0xF7, 0xA3, 0xF7, 0xA4, /* 0xCC-0xCF */ + 0xF7, 0xA5, 0xF7, 0x9B, 0xF7, 0x9C, 0xF7, 0xA6, /* 0xD0-0xD3 */ + 0xF7, 0xA7, 0xF7, 0xA8, 0xB1, 0xEE, 0xF7, 0xA9, /* 0xD4-0xD7 */ + 0xF7, 0xAA, 0xF7, 0xAB, 0xF7, 0x9D, 0xF7, 0x9E, /* 0xD8-0xDB */ + 0xF7, 0xAC, 0xF7, 0xAD, 0xC1, 0xDB, 0xF7, 0xAE, /* 0xDC-0xDF */ + 0xF7, 0x9F, 0xF7, 0xA0, 0xF7, 0xAF, 0xF8, 0x40, /* 0xE0-0xE3 */ + 0xF8, 0x41, 0xF8, 0x42, 0xF8, 0x43, 0xF8, 0x44, /* 0xE4-0xE7 */ + 0xF8, 0x45, 0xF8, 0x46, 0xF8, 0x47, 0xF8, 0x48, /* 0xE8-0xEB */ + 0xF8, 0x49, 0xF8, 0x4A, 0xF8, 0x4B, 0xF8, 0x4C, /* 0xEC-0xEF */ + 0xF8, 0x4D, 0xF8, 0x4E, 0xF8, 0x4F, 0xF8, 0x50, /* 0xF0-0xF3 */ + 0xF8, 0x51, 0xF8, 0x52, 0xF8, 0x53, 0xF8, 0x54, /* 0xF4-0xF7 */ + 0xF8, 0x55, 0xF8, 0x56, 0xF8, 0x57, 0xF8, 0x58, /* 0xF8-0xFB */ + 0xF8, 0x59, 0xF8, 0x5A, 0xF8, 0x5B, 0xF8, 0x5C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9D[512] = { + 0xF8, 0x5D, 0xF8, 0x5E, 0xF8, 0x5F, 0xF8, 0x60, /* 0x00-0x03 */ + 0xF8, 0x61, 0xF8, 0x62, 0xF8, 0x63, 0xF8, 0x64, /* 0x04-0x07 */ + 0xF8, 
0x65, 0xF8, 0x66, 0xF8, 0x67, 0xF8, 0x68, /* 0x08-0x0B */ + 0xF8, 0x69, 0xF8, 0x6A, 0xF8, 0x6B, 0xF8, 0x6C, /* 0x0C-0x0F */ + 0xF8, 0x6D, 0xF8, 0x6E, 0xF8, 0x6F, 0xF8, 0x70, /* 0x10-0x13 */ + 0xF8, 0x71, 0xF8, 0x72, 0xF8, 0x73, 0xF8, 0x74, /* 0x14-0x17 */ + 0xF8, 0x75, 0xF8, 0x76, 0xF8, 0x77, 0xF8, 0x78, /* 0x18-0x1B */ + 0xF8, 0x79, 0xF8, 0x7A, 0xF8, 0x7B, 0xF8, 0x7C, /* 0x1C-0x1F */ + 0xF8, 0x7D, 0xF8, 0x7E, 0xF8, 0x80, 0xF8, 0x81, /* 0x20-0x23 */ + 0xF8, 0x82, 0xF8, 0x83, 0xF8, 0x84, 0xF8, 0x85, /* 0x24-0x27 */ + 0xF8, 0x86, 0xF8, 0x87, 0xF8, 0x88, 0xF8, 0x89, /* 0x28-0x2B */ + 0xF8, 0x8A, 0xF8, 0x8B, 0xF8, 0x8C, 0xF8, 0x8D, /* 0x2C-0x2F */ + 0xF8, 0x8E, 0xF8, 0x8F, 0xF8, 0x90, 0xF8, 0x91, /* 0x30-0x33 */ + 0xF8, 0x92, 0xF8, 0x93, 0xF8, 0x94, 0xF8, 0x95, /* 0x34-0x37 */ + 0xF8, 0x96, 0xF8, 0x97, 0xF8, 0x98, 0xF8, 0x99, /* 0x38-0x3B */ + 0xF8, 0x9A, 0xF8, 0x9B, 0xF8, 0x9C, 0xF8, 0x9D, /* 0x3C-0x3F */ + 0xF8, 0x9E, 0xF8, 0x9F, 0xF8, 0xA0, 0xF9, 0x40, /* 0x40-0x43 */ + 0xF9, 0x41, 0xF9, 0x42, 0xF9, 0x43, 0xF9, 0x44, /* 0x44-0x47 */ + 0xF9, 0x45, 0xF9, 0x46, 0xF9, 0x47, 0xF9, 0x48, /* 0x48-0x4B */ + 0xF9, 0x49, 0xF9, 0x4A, 0xF9, 0x4B, 0xF9, 0x4C, /* 0x4C-0x4F */ + 0xF9, 0x4D, 0xF9, 0x4E, 0xF9, 0x4F, 0xF9, 0x50, /* 0x50-0x53 */ + 0xF9, 0x51, 0xF9, 0x52, 0xF9, 0x53, 0xF9, 0x54, /* 0x54-0x57 */ + 0xF9, 0x55, 0xF9, 0x56, 0xF9, 0x57, 0xF9, 0x58, /* 0x58-0x5B */ + 0xF9, 0x59, 0xF9, 0x5A, 0xF9, 0x5B, 0xF9, 0x5C, /* 0x5C-0x5F */ + 0xF9, 0x5D, 0xF9, 0x5E, 0xF9, 0x5F, 0xF9, 0x60, /* 0x60-0x63 */ + 0xF9, 0x61, 0xF9, 0x62, 0xF9, 0x63, 0xF9, 0x64, /* 0x64-0x67 */ + 0xF9, 0x65, 0xF9, 0x66, 0xF9, 0x67, 0xF9, 0x68, /* 0x68-0x6B */ + 0xF9, 0x69, 0xF9, 0x6A, 0xF9, 0x6B, 0xF9, 0x6C, /* 0x6C-0x6F */ + 0xF9, 0x6D, 0xF9, 0x6E, 0xF9, 0x6F, 0xF9, 0x70, /* 0x70-0x73 */ + 0xF9, 0x71, 0xF9, 0x72, 0xF9, 0x73, 0xF9, 0x74, /* 0x74-0x77 */ + 0xF9, 0x75, 0xF9, 0x76, 0xF9, 0x77, 0xF9, 0x78, /* 0x78-0x7B */ + 0xF9, 0x79, 0xF9, 0x7A, 0xF9, 0x7B, 0xF9, 0x7C, /* 0x7C-0x7F */ + + 0xF9, 0x7D, 0xF9, 0x7E, 0xF9, 0x80, 0xF9, 0x81, /* 0x80-0x83 */ + 0xF9, 0x82, 0xF9, 0x83, 0xF9, 0x84, 0xF9, 0x85, /* 0x84-0x87 */ + 0xF9, 0x86, 0xF9, 0x87, 0xF9, 0x88, 0xF9, 0x89, /* 0x88-0x8B */ + 0xF9, 0x8A, 0xF9, 0x8B, 0xF9, 0x8C, 0xF9, 0x8D, /* 0x8C-0x8F */ + 0xF9, 0x8E, 0xF9, 0x8F, 0xF9, 0x90, 0xF9, 0x91, /* 0x90-0x93 */ + 0xF9, 0x92, 0xF9, 0x93, 0xF9, 0x94, 0xF9, 0x95, /* 0x94-0x97 */ + 0xF9, 0x96, 0xF9, 0x97, 0xF9, 0x98, 0xF9, 0x99, /* 0x98-0x9B */ + 0xF9, 0x9A, 0xF9, 0x9B, 0xF9, 0x9C, 0xF9, 0x9D, /* 0x9C-0x9F */ + 0xF9, 0x9E, 0xF9, 0x9F, 0xF9, 0xA0, 0xFA, 0x40, /* 0xA0-0xA3 */ + 0xFA, 0x41, 0xFA, 0x42, 0xFA, 0x43, 0xFA, 0x44, /* 0xA4-0xA7 */ + 0xFA, 0x45, 0xFA, 0x46, 0xFA, 0x47, 0xFA, 0x48, /* 0xA8-0xAB */ + 0xFA, 0x49, 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, /* 0xAC-0xAF */ + 0xFA, 0x4D, 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, /* 0xB0-0xB3 */ + 0xFA, 0x51, 0xFA, 0x52, 0xFA, 0x53, 0xFA, 0x54, /* 0xB4-0xB7 */ + 0xFA, 0x55, 0xFA, 0x56, 0xFA, 0x57, 0xFA, 0x58, /* 0xB8-0xBB */ + 0xFA, 0x59, 0xFA, 0x5A, 0xFA, 0x5B, 0xFA, 0x5C, /* 0xBC-0xBF */ + 0xFA, 0x5D, 0xFA, 0x5E, 0xFA, 0x5F, 0xFA, 0x60, /* 0xC0-0xC3 */ + 0xFA, 0x61, 0xFA, 0x62, 0xFA, 0x63, 0xFA, 0x64, /* 0xC4-0xC7 */ + 0xFA, 0x65, 0xFA, 0x66, 0xFA, 0x67, 0xFA, 0x68, /* 0xC8-0xCB */ + 0xFA, 0x69, 0xFA, 0x6A, 0xFA, 0x6B, 0xFA, 0x6C, /* 0xCC-0xCF */ + 0xFA, 0x6D, 0xFA, 0x6E, 0xFA, 0x6F, 0xFA, 0x70, /* 0xD0-0xD3 */ + 0xFA, 0x71, 0xFA, 0x72, 0xFA, 0x73, 0xFA, 0x74, /* 0xD4-0xD7 */ + 0xFA, 0x75, 0xFA, 0x76, 0xFA, 0x77, 0xFA, 0x78, /* 0xD8-0xDB */ + 0xFA, 0x79, 0xFA, 0x7A, 0xFA, 0x7B, 0xFA, 0x7C, /* 0xDC-0xDF 
*/ + 0xFA, 0x7D, 0xFA, 0x7E, 0xFA, 0x80, 0xFA, 0x81, /* 0xE0-0xE3 */ + 0xFA, 0x82, 0xFA, 0x83, 0xFA, 0x84, 0xFA, 0x85, /* 0xE4-0xE7 */ + 0xFA, 0x86, 0xFA, 0x87, 0xFA, 0x88, 0xFA, 0x89, /* 0xE8-0xEB */ + 0xFA, 0x8A, 0xFA, 0x8B, 0xFA, 0x8C, 0xFA, 0x8D, /* 0xEC-0xEF */ + 0xFA, 0x8E, 0xFA, 0x8F, 0xFA, 0x90, 0xFA, 0x91, /* 0xF0-0xF3 */ + 0xFA, 0x92, 0xFA, 0x93, 0xFA, 0x94, 0xFA, 0x95, /* 0xF4-0xF7 */ + 0xFA, 0x96, 0xFA, 0x97, 0xFA, 0x98, 0xFA, 0x99, /* 0xF8-0xFB */ + 0xFA, 0x9A, 0xFA, 0x9B, 0xFA, 0x9C, 0xFA, 0x9D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0xFA, 0x9E, 0xFA, 0x9F, 0xFA, 0xA0, 0xFB, 0x40, /* 0x00-0x03 */ + 0xFB, 0x41, 0xFB, 0x42, 0xFB, 0x43, 0xFB, 0x44, /* 0x04-0x07 */ + 0xFB, 0x45, 0xFB, 0x46, 0xFB, 0x47, 0xFB, 0x48, /* 0x08-0x0B */ + 0xFB, 0x49, 0xFB, 0x4A, 0xFB, 0x4B, 0xFB, 0x4C, /* 0x0C-0x0F */ + 0xFB, 0x4D, 0xFB, 0x4E, 0xFB, 0x4F, 0xFB, 0x50, /* 0x10-0x13 */ + 0xFB, 0x51, 0xFB, 0x52, 0xFB, 0x53, 0xFB, 0x54, /* 0x14-0x17 */ + 0xFB, 0x55, 0xFB, 0x56, 0xFB, 0x57, 0xFB, 0x58, /* 0x18-0x1B */ + 0xFB, 0x59, 0xFB, 0x5A, 0xFB, 0x5B, 0xC4, 0xF1, /* 0x1C-0x1F */ + 0xF0, 0xAF, 0xBC, 0xA6, 0xF0, 0xB0, 0xC3, 0xF9, /* 0x20-0x23 */ + 0xFB, 0x5C, 0xC5, 0xB8, 0xD1, 0xBB, 0xFB, 0x5D, /* 0x24-0x27 */ + 0xF0, 0xB1, 0xF0, 0xB2, 0xF0, 0xB3, 0xF0, 0xB4, /* 0x28-0x2B */ + 0xF0, 0xB5, 0xD1, 0xBC, 0xFB, 0x5E, 0xD1, 0xEC, /* 0x2C-0x2F */ + 0xFB, 0x5F, 0xF0, 0xB7, 0xF0, 0xB6, 0xD4, 0xA7, /* 0x30-0x33 */ + 0xFB, 0x60, 0xCD, 0xD2, 0xF0, 0xB8, 0xF0, 0xBA, /* 0x34-0x37 */ + 0xF0, 0xB9, 0xF0, 0xBB, 0xF0, 0xBC, 0xFB, 0x61, /* 0x38-0x3B */ + 0xFB, 0x62, 0xB8, 0xEB, 0xF0, 0xBD, 0xBA, 0xE8, /* 0x3C-0x3F */ + 0xFB, 0x63, 0xF0, 0xBE, 0xF0, 0xBF, 0xBE, 0xE9, /* 0x40-0x43 */ + 0xF0, 0xC0, 0xB6, 0xEC, 0xF0, 0xC1, 0xF0, 0xC2, /* 0x44-0x47 */ + 0xF0, 0xC3, 0xF0, 0xC4, 0xC8, 0xB5, 0xF0, 0xC5, /* 0x48-0x4B */ + 0xF0, 0xC6, 0xFB, 0x64, 0xF0, 0xC7, 0xC5, 0xF4, /* 0x4C-0x4F */ + 0xFB, 0x65, 0xF0, 0xC8, 0xFB, 0x66, 0xFB, 0x67, /* 0x50-0x53 */ + 0xFB, 0x68, 0xF0, 0xC9, 0xFB, 0x69, 0xF0, 0xCA, /* 0x54-0x57 */ + 0xF7, 0xBD, 0xFB, 0x6A, 0xF0, 0xCB, 0xF0, 0xCC, /* 0x58-0x5B */ + 0xF0, 0xCD, 0xFB, 0x6B, 0xF0, 0xCE, 0xFB, 0x6C, /* 0x5C-0x5F */ + 0xFB, 0x6D, 0xFB, 0x6E, 0xFB, 0x6F, 0xF0, 0xCF, /* 0x60-0x63 */ + 0xBA, 0xD7, 0xFB, 0x70, 0xF0, 0xD0, 0xF0, 0xD1, /* 0x64-0x67 */ + 0xF0, 0xD2, 0xF0, 0xD3, 0xF0, 0xD4, 0xF0, 0xD5, /* 0x68-0x6B */ + 0xF0, 0xD6, 0xF0, 0xD8, 0xFB, 0x71, 0xFB, 0x72, /* 0x6C-0x6F */ + 0xD3, 0xA5, 0xF0, 0xD7, 0xFB, 0x73, 0xF0, 0xD9, /* 0x70-0x73 */ + 0xFB, 0x74, 0xFB, 0x75, 0xFB, 0x76, 0xFB, 0x77, /* 0x74-0x77 */ + 0xFB, 0x78, 0xFB, 0x79, 0xFB, 0x7A, 0xFB, 0x7B, /* 0x78-0x7B */ + 0xFB, 0x7C, 0xFB, 0x7D, 0xF5, 0xBA, 0xC2, 0xB9, /* 0x7C-0x7F */ + + 0xFB, 0x7E, 0xFB, 0x80, 0xF7, 0xE4, 0xFB, 0x81, /* 0x80-0x83 */ + 0xFB, 0x82, 0xFB, 0x83, 0xFB, 0x84, 0xF7, 0xE5, /* 0x84-0x87 */ + 0xF7, 0xE6, 0xFB, 0x85, 0xFB, 0x86, 0xF7, 0xE7, /* 0x88-0x8B */ + 0xFB, 0x87, 0xFB, 0x88, 0xFB, 0x89, 0xFB, 0x8A, /* 0x8C-0x8F */ + 0xFB, 0x8B, 0xFB, 0x8C, 0xF7, 0xE8, 0xC2, 0xB4, /* 0x90-0x93 */ + 0xFB, 0x8D, 0xFB, 0x8E, 0xFB, 0x8F, 0xFB, 0x90, /* 0x94-0x97 */ + 0xFB, 0x91, 0xFB, 0x92, 0xFB, 0x93, 0xFB, 0x94, /* 0x98-0x9B */ + 0xFB, 0x95, 0xF7, 0xEA, 0xFB, 0x96, 0xF7, 0xEB, /* 0x9C-0x9F */ + 0xFB, 0x97, 0xFB, 0x98, 0xFB, 0x99, 0xFB, 0x9A, /* 0xA0-0xA3 */ + 0xFB, 0x9B, 0xFB, 0x9C, 0xC2, 0xF3, 0xFB, 0x9D, /* 0xA4-0xA7 */ + 0xFB, 0x9E, 0xFB, 0x9F, 0xFB, 0xA0, 0xFC, 0x40, /* 0xA8-0xAB */ + 0xFC, 0x41, 0xFC, 0x42, 0xFC, 0x43, 0xFC, 0x44, /* 0xAC-0xAF */ + 0xFC, 0x45, 0xFC, 0x46, 0xFC, 0x47, 0xFC, 0x48, /* 0xB0-0xB3 */ + 
0xF4, 0xF0, 0xFC, 0x49, 0xFC, 0x4A, 0xFC, 0x4B, /* 0xB4-0xB7 */ + 0xF4, 0xEF, 0xFC, 0x4C, 0xFC, 0x4D, 0xC2, 0xE9, /* 0xB8-0xBB */ + 0xFC, 0x4E, 0xF7, 0xE1, 0xF7, 0xE2, 0xFC, 0x4F, /* 0xBC-0xBF */ + 0xFC, 0x50, 0xFC, 0x51, 0xFC, 0x52, 0xFC, 0x53, /* 0xC0-0xC3 */ + 0xBB, 0xC6, 0xFC, 0x54, 0xFC, 0x55, 0xFC, 0x56, /* 0xC4-0xC7 */ + 0xFC, 0x57, 0xD9, 0xE4, 0xFC, 0x58, 0xFC, 0x59, /* 0xC8-0xCB */ + 0xFC, 0x5A, 0xCA, 0xF2, 0xC0, 0xE8, 0xF0, 0xA4, /* 0xCC-0xCF */ + 0xFC, 0x5B, 0xBA, 0xDA, 0xFC, 0x5C, 0xFC, 0x5D, /* 0xD0-0xD3 */ + 0xC7, 0xAD, 0xFC, 0x5E, 0xFC, 0x5F, 0xFC, 0x60, /* 0xD4-0xD7 */ + 0xC4, 0xAC, 0xFC, 0x61, 0xFC, 0x62, 0xF7, 0xEC, /* 0xD8-0xDB */ + 0xF7, 0xED, 0xF7, 0xEE, 0xFC, 0x63, 0xF7, 0xF0, /* 0xDC-0xDF */ + 0xF7, 0xEF, 0xFC, 0x64, 0xF7, 0xF1, 0xFC, 0x65, /* 0xE0-0xE3 */ + 0xFC, 0x66, 0xF7, 0xF4, 0xFC, 0x67, 0xF7, 0xF3, /* 0xE4-0xE7 */ + 0xFC, 0x68, 0xF7, 0xF2, 0xF7, 0xF5, 0xFC, 0x69, /* 0xE8-0xEB */ + 0xFC, 0x6A, 0xFC, 0x6B, 0xFC, 0x6C, 0xF7, 0xF6, /* 0xEC-0xEF */ + 0xFC, 0x6D, 0xFC, 0x6E, 0xFC, 0x6F, 0xFC, 0x70, /* 0xF0-0xF3 */ + 0xFC, 0x71, 0xFC, 0x72, 0xFC, 0x73, 0xFC, 0x74, /* 0xF4-0xF7 */ + 0xFC, 0x75, 0xED, 0xE9, 0xFC, 0x76, 0xED, 0xEA, /* 0xF8-0xFB */ + 0xED, 0xEB, 0xFC, 0x77, 0xF6, 0xBC, 0xFC, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0xFC, 0x79, 0xFC, 0x7A, 0xFC, 0x7B, 0xFC, 0x7C, /* 0x00-0x03 */ + 0xFC, 0x7D, 0xFC, 0x7E, 0xFC, 0x80, 0xFC, 0x81, /* 0x04-0x07 */ + 0xFC, 0x82, 0xFC, 0x83, 0xFC, 0x84, 0xF6, 0xBD, /* 0x08-0x0B */ + 0xFC, 0x85, 0xF6, 0xBE, 0xB6, 0xA6, 0xFC, 0x86, /* 0x0C-0x0F */ + 0xD8, 0xBE, 0xFC, 0x87, 0xFC, 0x88, 0xB9, 0xC4, /* 0x10-0x13 */ + 0xFC, 0x89, 0xFC, 0x8A, 0xFC, 0x8B, 0xD8, 0xBB, /* 0x14-0x17 */ + 0xFC, 0x8C, 0xDC, 0xB1, 0xFC, 0x8D, 0xFC, 0x8E, /* 0x18-0x1B */ + 0xFC, 0x8F, 0xFC, 0x90, 0xFC, 0x91, 0xFC, 0x92, /* 0x1C-0x1F */ + 0xCA, 0xF3, 0xFC, 0x93, 0xF7, 0xF7, 0xFC, 0x94, /* 0x20-0x23 */ + 0xFC, 0x95, 0xFC, 0x96, 0xFC, 0x97, 0xFC, 0x98, /* 0x24-0x27 */ + 0xFC, 0x99, 0xFC, 0x9A, 0xFC, 0x9B, 0xFC, 0x9C, /* 0x28-0x2B */ + 0xF7, 0xF8, 0xFC, 0x9D, 0xFC, 0x9E, 0xF7, 0xF9, /* 0x2C-0x2F */ + 0xFC, 0x9F, 0xFC, 0xA0, 0xFD, 0x40, 0xFD, 0x41, /* 0x30-0x33 */ + 0xFD, 0x42, 0xFD, 0x43, 0xFD, 0x44, 0xF7, 0xFB, /* 0x34-0x37 */ + 0xFD, 0x45, 0xF7, 0xFA, 0xFD, 0x46, 0xB1, 0xC7, /* 0x38-0x3B */ + 0xFD, 0x47, 0xF7, 0xFC, 0xF7, 0xFD, 0xFD, 0x48, /* 0x3C-0x3F */ + 0xFD, 0x49, 0xFD, 0x4A, 0xFD, 0x4B, 0xFD, 0x4C, /* 0x40-0x43 */ + 0xF7, 0xFE, 0xFD, 0x4D, 0xFD, 0x4E, 0xFD, 0x4F, /* 0x44-0x47 */ + 0xFD, 0x50, 0xFD, 0x51, 0xFD, 0x52, 0xFD, 0x53, /* 0x48-0x4B */ + 0xFD, 0x54, 0xFD, 0x55, 0xFD, 0x56, 0xFD, 0x57, /* 0x4C-0x4F */ + 0xC6, 0xEB, 0xEC, 0xB4, 0xFD, 0x58, 0xFD, 0x59, /* 0x50-0x53 */ + 0xFD, 0x5A, 0xFD, 0x5B, 0xFD, 0x5C, 0xFD, 0x5D, /* 0x54-0x57 */ + 0xFD, 0x5E, 0xFD, 0x5F, 0xFD, 0x60, 0xFD, 0x61, /* 0x58-0x5B */ + 0xFD, 0x62, 0xFD, 0x63, 0xFD, 0x64, 0xFD, 0x65, /* 0x5C-0x5F */ + 0xFD, 0x66, 0xFD, 0x67, 0xFD, 0x68, 0xFD, 0x69, /* 0x60-0x63 */ + 0xFD, 0x6A, 0xFD, 0x6B, 0xFD, 0x6C, 0xFD, 0x6D, /* 0x64-0x67 */ + 0xFD, 0x6E, 0xFD, 0x6F, 0xFD, 0x70, 0xFD, 0x71, /* 0x68-0x6B */ + 0xFD, 0x72, 0xFD, 0x73, 0xFD, 0x74, 0xFD, 0x75, /* 0x6C-0x6F */ + 0xFD, 0x76, 0xFD, 0x77, 0xFD, 0x78, 0xFD, 0x79, /* 0x70-0x73 */ + 0xFD, 0x7A, 0xFD, 0x7B, 0xFD, 0x7C, 0xFD, 0x7D, /* 0x74-0x77 */ + 0xFD, 0x7E, 0xFD, 0x80, 0xFD, 0x81, 0xFD, 0x82, /* 0x78-0x7B */ + 0xFD, 0x83, 0xFD, 0x84, 0xFD, 0x85, 0xB3, 0xDD, /* 0x7C-0x7F */ + + 0xF6, 0xB3, 0xFD, 0x86, 0xFD, 0x87, 0xF6, 0xB4, /* 0x80-0x83 */ + 0xC1, 0xE4, 0xF6, 0xB5, 0xF6, 0xB6, 0xF6, 0xB7, /* 0x84-0x87 */ + 
0xF6, 0xB8, 0xF6, 0xB9, 0xF6, 0xBA, 0xC8, 0xA3, /* 0x88-0x8B */ + 0xF6, 0xBB, 0xFD, 0x88, 0xFD, 0x89, 0xFD, 0x8A, /* 0x8C-0x8F */ + 0xFD, 0x8B, 0xFD, 0x8C, 0xFD, 0x8D, 0xFD, 0x8E, /* 0x90-0x93 */ + 0xFD, 0x8F, 0xFD, 0x90, 0xFD, 0x91, 0xFD, 0x92, /* 0x94-0x97 */ + 0xFD, 0x93, 0xC1, 0xFA, 0xB9, 0xA8, 0xED, 0xE8, /* 0x98-0x9B */ + 0xFD, 0x94, 0xFD, 0x95, 0xFD, 0x96, 0xB9, 0xEA, /* 0x9C-0x9F */ + 0xD9, 0xDF, 0xFD, 0x97, 0xFD, 0x98, 0xFD, 0x99, /* 0xA0-0xA3 */ + 0xFD, 0x9A, 0xFD, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xD8, 0x4D, 0xB8, 0xFC, 0xDC, 0x87, 0xD9, 0x5A, /* 0x00-0x03 */ + 0xBB, 0xAC, 0xB4, 0xAE, 0xBE, 0xE4, 0xFD, 0x94, /* 0x04-0x07 */ + 0xFD, 0x94, 0xC6, 0xF5, 0xBD, 0xF0, 0xC0, 0xAE, /* 0x08-0x0B */ + 0xC4, 0xCE, 0x91, 0xD0, 0xB0, 0x5D, 0xC1, 0x5F, /* 0x0C-0x0F */ + 0xCC, 0x7D, 0xC2, 0xDD, 0xC2, 0xE3, 0xDF, 0x89, /* 0x10-0x13 */ + 0x98, 0xB7, 0xC2, 0xE5, 0xC0, 0xD3, 0xE7, 0xF3, /* 0x14-0x17 */ + 0xC2, 0xE4, 0xC0, 0xD2, 0xF1, 0x98, 0x81, 0x79, /* 0x18-0x1B */ + 0xC2, 0xD1, 0x99, 0xDA, 0xA0, 0x80, 0xCC, 0x6D, /* 0x1C-0x1F */ + 0xFB, 0x5B, 0x8D, 0xB9, 0x9E, 0x45, 0xCB, 0x7B, /* 0x20-0x23 */ + 0xD2, 0x68, 0xC0, 0xAD, 0xC5, 0x44, 0xCF, 0x9E, /* 0x24-0x27 */ + 0xC0, 0xC8, 0xC0, 0xCA, 0xC0, 0xCB, 0xC0, 0xC7, /* 0x28-0x2B */ + 0xFD, 0x9C, 0x81, 0xED, 0xC0, 0xE4, 0x84, 0xDA, /* 0x2C-0x2F */ + 0x93, 0xEF, 0x99, 0xA9, 0xA0, 0x74, 0xB1, 0x52, /* 0x30-0x33 */ + 0xC0, 0xCF, 0xCC, 0x4A, 0xCC, 0x94, 0xC2, 0xB7, /* 0x34-0x37 */ + 0xC2, 0xB6, 0xF4, 0x94, 0xFA, 0x98, 0xC2, 0xB5, /* 0x38-0x3B */ + 0xB5, 0x93, 0xBE, 0x47, 0xC7, 0x8A, 0xE4, 0x9B, /* 0x3C-0x3F */ + 0xC2, 0xB9, 0xD5, 0x93, 0x89, 0xC5, 0xC5, 0xAA, /* 0x40-0x43 */ + 0xBB, 0x5C, 0xC3, 0x40, 0xC0, 0xCE, 0xC0, 0xDA, /* 0x44-0x47 */ + 0xD9, 0x54, 0xC0, 0xD7, 0x89, 0xBE, 0x8C, 0xD2, /* 0x48-0x4B */ + 0x98, 0xC7, 0x9C, 0x49, 0xC2, 0xA9, 0xC0, 0xDB, /* 0x4C-0x4F */ + 0xBF, 0x7C, 0xC2, 0xAA, 0xC0, 0xD5, 0xC0, 0xDF, /* 0x50-0x53 */ + 0x84, 0x43, 0xC1, 0xE8, 0xB6, 0xA0, 0xBE, 0x63, /* 0x54-0x57 */ + 0xC1, 0xE2, 0xC1, 0xEA, 0xD7, 0x78, 0x92, 0x82, /* 0x58-0x5B */ + 0x98, 0xB7, 0xD6, 0x5A, 0xB5, 0xA4, 0x8C, 0x8E, /* 0x5C-0x5F */ + 0xC5, 0xAD, 0xC2, 0xCA, 0xAE, 0x90, 0xB1, 0xB1, /* 0x60-0x63 */ + 0xB4, 0x91, 0xB1, 0xE3, 0x8F, 0xCD, 0xB2, 0xBB, /* 0x64-0x67 */ + 0xC3, 0xDA, 0x94, 0xB5, 0xCB, 0xF7, 0x85, 0xA2, /* 0x68-0x6B */ + 0xC8, 0xFB, 0xCA, 0xA1, 0xC8, 0x7E, 0xD5, 0x66, /* 0x6C-0x6F */ + 0x9A, 0xA2, 0xB3, 0xBD, 0xC9, 0xF2, 0xCA, 0xB0, /* 0x70-0x73 */ + 0xC8, 0xF4, 0xC2, 0xD3, 0xC2, 0xD4, 0xC1, 0xC1, /* 0x74-0x77 */ + 0x83, 0xC9, 0xFD, 0x9D, 0xC1, 0xBA, 0xBC, 0x5A, /* 0x78-0x7B */ + 0xC1, 0xBC, 0xD5, 0x8F, 0xC1, 0xBF, 0x84, 0xEE, /* 0x7C-0x7F */ + + 0x85, 0xCE, 0xC5, 0xAE, 0x8F, 0x5D, 0xC2, 0xC3, /* 0x80-0x83 */ + 0x9E, 0x56, 0xB5, 0x5A, 0xE9, 0x82, 0xF3, 0x50, /* 0x84-0x87 */ + 0xFB, 0x90, 0xC0, 0xE8, 0xC1, 0xA6, 0x95, 0xD1, /* 0x88-0x8B */ + 0x9A, 0x76, 0xDE, 0x5D, 0xC4, 0xEA, 0x91, 0x7A, /* 0x8C-0x8F */ + 0x91, 0xD9, 0x93, 0xD3, 0x9D, 0x69, 0x9F, 0x92, /* 0x90-0x93 */ + 0xAD, 0x49, 0xFD, 0x9E, 0xBE, 0x9A, 0xC2, 0x93, /* 0x94-0x97 */ + 0xDD, 0x82, 0xC9, 0x8F, 0xDF, 0x42, 0xE5, 0x80, /* 0x98-0x9B */ + 0xC1, 0xD0, 0xC1, 0xD3, 0xD1, 0xCA, 0xC1, 0xD2, /* 0x9C-0x9F */ + 0xC1, 0xD1, 0xD5, 0x66, 0xC1, 0xAE, 0xC4, 0xEE, /* 0xA0-0xA3 */ + 0xC4, 0xED, 0x9A, 0x9A, 0xBA, 0x9F, 0xAB, 0x43, /* 0xA4-0xA7 */ + 0xC1, 0xEE, 0xE0, 0xF2, 0x8C, 0x8E, 0x8E, 0x58, /* 0xA8-0xAB */ + 0xC1, 0xAF, 0xC1, 
0xE1, 0xAC, 0x93, 0xC1, 0xE7, /* 0xAC-0xAF */ + 0xF1, 0xF6, 0xE2, 0x8F, 0xC1, 0xE3, 0xEC, 0x60, /* 0xB0-0xB3 */ + 0xEE, 0x49, 0xC0, 0xFD, 0xB6, 0x59, 0xF5, 0xB7, /* 0xB4-0xB7 */ + 0xEB, 0x60, 0x90, 0xBA, 0xC1, 0xCB, 0xC1, 0xC5, /* 0xB8-0xBB */ + 0xE5, 0xBC, 0xC4, 0xF2, 0xC1, 0xCF, 0x98, 0xB7, /* 0xBC-0xBF */ + 0xC1, 0xC7, 0xAF, 0x9F, 0xDE, 0xA4, 0xDF, 0x7C, /* 0xC0-0xC3 */ + 0xFD, 0x88, 0x95, 0x9E, 0xC8, 0xEE, 0x84, 0xA2, /* 0xC4-0xC7 */ + 0x96, 0x83, 0xC1, 0xF8, 0xC1, 0xF7, 0xC1, 0xEF, /* 0xC8-0xCB */ + 0xC1, 0xF0, 0xC1, 0xF4, 0xC1, 0xF2, 0xBC, 0x7E, /* 0xCC-0xCF */ + 0xEE, 0x90, 0xC1, 0xF9, 0xC2, 0xBE, 0xEA, 0x91, /* 0xD0-0xD3 */ + 0x82, 0x90, 0x8D, 0x91, 0x9C, 0x53, 0xDD, 0x86, /* 0xD4-0xD7 */ + 0xC2, 0xC9, 0x90, 0xFC, 0xC0, 0xF5, 0xC2, 0xCA, /* 0xD8-0xDB */ + 0xC2, 0xA1, 0xC0, 0xFB, 0xC0, 0xF4, 0xC2, 0xC4, /* 0xDC-0xDF */ + 0xD2, 0xD7, 0xC0, 0xEE, 0xC0, 0xE6, 0xC4, 0xE0, /* 0xE0-0xE3 */ + 0xC0, 0xED, 0xC1, 0xA1, 0xEE, 0xBE, 0xFD, 0x9F, /* 0xE4-0xE7 */ + 0xD1, 0x65, 0xC0, 0xEF, 0xEB, 0x78, 0xC4, 0xE4, /* 0xE8-0xEB */ + 0xC4, 0xE7, 0xC1, 0xDF, 0x9F, 0xFB, 0xAD, 0x55, /* 0xEC-0xEF */ + 0xCC, 0x41, 0xFD, 0xA0, 0xF7, 0x5B, 0xF7, 0xEB, /* 0xF0-0xF3 */ + 0xC1, 0xD6, 0xC1, 0xDC, 0xC5, 0x52, 0xC1, 0xA2, /* 0xF4-0xF7 */ + 0xF3, 0xD2, 0xC1, 0xA3, 0xA0, 0xEE, 0xD6, 0xCB, /* 0xF8-0xFB */ + 0xD7, 0x52, 0xCA, 0xB2, 0xB2, 0xE8, 0xB4, 0xCC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xC7, 0xD0, 0xB6, 0xC8, 0xCD, 0xD8, 0xCC, 0xC7, /* 0x00-0x03 */ + 0xD5, 0xAC, 0xB6, 0xB4, 0xB1, 0xA9, 0xDD, 0x97, /* 0x04-0x07 */ + 0xD0, 0xD0, 0xBD, 0xB5, 0xD2, 0x8A, 0xC0, 0xAA, /* 0x08-0x0B */ + 0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42, 0xFE, 0x43, /* 0x0C-0x0F */ + 0x89, 0x56, 0xFE, 0x44, 0xC7, 0xE7, 0xFE, 0x45, /* 0x10-0x13 */ + 0xFE, 0x46, 0x84, 0x44, 0xD8, 0x69, 0xD2, 0xE6, /* 0x14-0x17 */ + 0xFE, 0x47, 0xC9, 0xF1, 0xCF, 0xE9, 0xB8, 0xA3, /* 0x18-0x1B */ + 0xBE, 0xB8, 0xBE, 0xAB, 0xD3, 0xF0, 0xFE, 0x48, /* 0x1C-0x1F */ + 0xFE, 0x49, 0xFE, 0x4A, 0xD6, 0x54, 0xFE, 0x4B, /* 0x20-0x23 */ + 0xFE, 0x4C, 0xD2, 0xDD, 0xB6, 0xBC, 0xFE, 0x4D, /* 0x24-0x27 */ + 0xFE, 0x4E, 0xFE, 0x4F, 0xEF, 0x88, 0xEF, 0x95, /* 0x28-0x2B */ + 0xF0, 0x5E, 0xFA, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FE[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA9, 0x55, 0xA6, 0xF2, 0x00, 0x00, 0xA6, 0xF4, /* 0x30-0x33 */ + 0xA6, 0xF5, 0xA6, 0xE0, 0xA6, 0xE1, 0xA6, 0xF0, /* 0x34-0x37 */ + 0xA6, 0xF1, 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xEE, /* 0x38-0x3B */ + 0xA6, 0xEF, 0xA6, 0xE6, 0xA6, 0xE7, 0xA6, 0xE4, /* 0x3C-0x3F */ + 0xA6, 0xE5, 0xA6, 0xE8, 0xA6, 0xE9, 0xA6, 0xEA, /* 0x40-0x43 */ + 0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA9, 0x68, 0xA9, 0x69, 0xA9, 0x6A, /* 0x48-0x4B */ + 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, 
0xA9, 0x6E, /* 0x4C-0x4F */ + 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, 0x00, 0x00, /* 0x50-0x53 */ + 0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0x54-0x57 */ + 0x00, 0x00, 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, /* 0x58-0x5B */ + 0xA9, 0x79, 0xA9, 0x7A, 0xA9, 0x7B, 0xA9, 0x7C, /* 0x5C-0x5F */ + 0xA9, 0x7D, 0xA9, 0x7E, 0xA9, 0x80, 0xA9, 0x81, /* 0x60-0x63 */ + 0xA9, 0x82, 0xA9, 0x83, 0xA9, 0x84, 0x00, 0x00, /* 0x64-0x67 */ + 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, 0xA9, 0x88, /* 0x68-0x6B */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */ + 0xA1, 0xE7, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */ + 0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */ + 0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */ + 0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */ + 0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */ + 0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */ + 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */ + 0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */ + 0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */ + 0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */ + 0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */ + 0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */ + 0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */ + 0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */ + 0xA3, 0xDC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */ + 0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */ + 0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */ + 0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */ + 0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */ + 0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */ + 0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */ + 0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */ + 0xA3, 0xFC, 0xA3, 0xFD, 0xA1, 0xAB, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA1, 0xE9, 0xA1, 0xEA, 0xA9, 0x56, 0xA3, 0xFE, /* 0xE0-0xE3 */ + 0xA9, 0x57, 0xA3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + u2c_00, u2c_01, u2c_02, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, u2c_FE, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int 
boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni&0xFF;
+	unsigned char ch = (uni>>8)&0xFF;
+	unsigned char out0, out1;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (uni == 0x20ac) {	/* Euro symbol. The only exception with a non-ASCII Unicode value */
+		out[0] = 0x80;
+		return 1;
+	}
+
+	if (ch == 0) {	/* handle the U00 plane */
+		/* if (cl == 0) return -EINVAL; */	/* U+0000 is legal in cp936 */
+		out0 = u2c_00[cl*2];
+		out1 = u2c_00[cl*2+1];
+		if (out0 == 0x00 && out1 == 0x00) {
+			if (cl < 0x80) {
+				out[0] = cl;
+				return 1;
+			}
+			return -EINVAL;
+		} else {
+			if (boundlen <= 1)
+				return -ENAMETOOLONG;
+			out[0] = out0;
+			out[1] = out1;
+			return 2;
+		}
+	}
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset) {
+		if (boundlen <= 1)
+			return -ENAMETOOLONG;
+		out[0] = uni2charset[cl*2];
+		out[1] = uni2charset[cl*2+1];
+		if (out[0] == 0x00 && out[1] == 0x00)
+			return -EINVAL;
+		return 2;
+	} else
+		return -EINVAL;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+		    wchar_t *uni)
+{
+	unsigned char ch, cl;
+	const wchar_t *charset2uni;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (boundlen == 1) {
+		if (rawstring[0] == 0x80) {	/* Euro symbol. The only exception with a non-ASCII Unicode value */
+			*uni = 0x20ac;
+		} else {
+			*uni = rawstring[0];
+		}
+		return 1;
+	}
+
+	ch = rawstring[0];
+	cl = rawstring[1];
+
+	charset2uni = page_charset2uni[ch];
+	if (charset2uni && cl) {
+		*uni = charset2uni[cl];
+		if (*uni == 0x0000)
+			return -EINVAL;
+		n = 2;
+	} else {
+		if (ch == 0x80) {	/* Euro symbol. The only exception with a non-ASCII Unicode value */
+			*uni = 0x20ac;
+		} else {
+			*uni = ch;
+		}
+		n = 1;
+	}
+	return n;
+}
+
+static struct nls_table table = {
+	.charset = "cp936",
+	.alias = "gb2312",
+	.uni2char = uni2char,
+	.char2uni = char2uni,
+	.charset2lower = charset2lower,
+	.charset2upper = charset2upper,
+	.owner = THIS_MODULE,
+};
+
+static int __init init_nls_cp936(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp936(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp936)
+module_exit(exit_nls_cp936)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(gb2312);
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
new file mode 100644
index 00000000..8a7a2fe8
--- /dev/null
+++ b/fs/nls/nls_cp949.c
@@ -0,0 +1,13947 @@
+/*
+ * linux/fs/nls/nls_cp949.c
+ *
+ * Charset cp949 translation tables.
+ * This translation table was generated automatically, the
+ * original table can be downloaded from the Microsoft website.
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAC02,0xAC03,0xAC05,0xAC06,0xAC0B,0xAC0C,0xAC0D,/* 0x40-0x47 */ + 0xAC0E,0xAC0F,0xAC18,0xAC1E,0xAC1F,0xAC21,0xAC22,0xAC23,/* 0x48-0x4F */ + 0xAC25,0xAC26,0xAC27,0xAC28,0xAC29,0xAC2A,0xAC2B,0xAC2E,/* 0x50-0x57 */ + 0xAC32,0xAC33,0xAC34,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAC35,0xAC36,0xAC37,0xAC3A,0xAC3B,0xAC3D,0xAC3E,/* 0x60-0x67 */ + 0xAC3F,0xAC41,0xAC42,0xAC43,0xAC44,0xAC45,0xAC46,0xAC47,/* 0x68-0x6F */ + 0xAC48,0xAC49,0xAC4A,0xAC4C,0xAC4E,0xAC4F,0xAC50,0xAC51,/* 0x70-0x77 */ + 0xAC52,0xAC53,0xAC55,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAC56,0xAC57,0xAC59,0xAC5A,0xAC5B,0xAC5D,0xAC5E,/* 0x80-0x87 */ + 0xAC5F,0xAC60,0xAC61,0xAC62,0xAC63,0xAC64,0xAC65,0xAC66,/* 0x88-0x8F */ + 0xAC67,0xAC68,0xAC69,0xAC6A,0xAC6B,0xAC6C,0xAC6D,0xAC6E,/* 0x90-0x97 */ + 0xAC6F,0xAC72,0xAC73,0xAC75,0xAC76,0xAC79,0xAC7B,0xAC7C,/* 0x98-0x9F */ + 0xAC7D,0xAC7E,0xAC7F,0xAC82,0xAC87,0xAC88,0xAC8D,0xAC8E,/* 0xA0-0xA7 */ + 0xAC8F,0xAC91,0xAC92,0xAC93,0xAC95,0xAC96,0xAC97,0xAC98,/* 0xA8-0xAF */ + 0xAC99,0xAC9A,0xAC9B,0xAC9E,0xACA2,0xACA3,0xACA4,0xACA5,/* 0xB0-0xB7 */ + 0xACA6,0xACA7,0xACAB,0xACAD,0xACAE,0xACB1,0xACB2,0xACB3,/* 0xB8-0xBF */ + 0xACB4,0xACB5,0xACB6,0xACB7,0xACBA,0xACBE,0xACBF,0xACC0,/* 0xC0-0xC7 */ + 0xACC2,0xACC3,0xACC5,0xACC6,0xACC7,0xACC9,0xACCA,0xACCB,/* 0xC8-0xCF */ + 0xACCD,0xACCE,0xACCF,0xACD0,0xACD1,0xACD2,0xACD3,0xACD4,/* 0xD0-0xD7 */ + 0xACD6,0xACD8,0xACD9,0xACDA,0xACDB,0xACDC,0xACDD,0xACDE,/* 0xD8-0xDF */ + 0xACDF,0xACE2,0xACE3,0xACE5,0xACE6,0xACE9,0xACEB,0xACED,/* 0xE0-0xE7 */ + 0xACEE,0xACF2,0xACF4,0xACF7,0xACF8,0xACF9,0xACFA,0xACFB,/* 0xE8-0xEF */ + 0xACFE,0xACFF,0xAD01,0xAD02,0xAD03,0xAD05,0xAD07,0xAD08,/* 0xF0-0xF7 */ + 0xAD09,0xAD0A,0xAD0B,0xAD0E,0xAD10,0xAD12,0xAD13,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAD14,0xAD15,0xAD16,0xAD17,0xAD19,0xAD1A,0xAD1B,/* 0x40-0x47 */ + 0xAD1D,0xAD1E,0xAD1F,0xAD21,0xAD22,0xAD23,0xAD24,0xAD25,/* 0x48-0x4F */ + 0xAD26,0xAD27,0xAD28,0xAD2A,0xAD2B,0xAD2E,0xAD2F,0xAD30,/* 0x50-0x57 */ + 0xAD31,0xAD32,0xAD33,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAD36,0xAD37,0xAD39,0xAD3A,0xAD3B,0xAD3D,0xAD3E,/* 0x60-0x67 */ + 
0xAD3F,0xAD40,0xAD41,0xAD42,0xAD43,0xAD46,0xAD48,0xAD4A,/* 0x68-0x6F */ + 0xAD4B,0xAD4C,0xAD4D,0xAD4E,0xAD4F,0xAD51,0xAD52,0xAD53,/* 0x70-0x77 */ + 0xAD55,0xAD56,0xAD57,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAD59,0xAD5A,0xAD5B,0xAD5C,0xAD5D,0xAD5E,0xAD5F,/* 0x80-0x87 */ + 0xAD60,0xAD62,0xAD64,0xAD65,0xAD66,0xAD67,0xAD68,0xAD69,/* 0x88-0x8F */ + 0xAD6A,0xAD6B,0xAD6E,0xAD6F,0xAD71,0xAD72,0xAD77,0xAD78,/* 0x90-0x97 */ + 0xAD79,0xAD7A,0xAD7E,0xAD80,0xAD83,0xAD84,0xAD85,0xAD86,/* 0x98-0x9F */ + 0xAD87,0xAD8A,0xAD8B,0xAD8D,0xAD8E,0xAD8F,0xAD91,0xAD92,/* 0xA0-0xA7 */ + 0xAD93,0xAD94,0xAD95,0xAD96,0xAD97,0xAD98,0xAD99,0xAD9A,/* 0xA8-0xAF */ + 0xAD9B,0xAD9E,0xAD9F,0xADA0,0xADA1,0xADA2,0xADA3,0xADA5,/* 0xB0-0xB7 */ + 0xADA6,0xADA7,0xADA8,0xADA9,0xADAA,0xADAB,0xADAC,0xADAD,/* 0xB8-0xBF */ + 0xADAE,0xADAF,0xADB0,0xADB1,0xADB2,0xADB3,0xADB4,0xADB5,/* 0xC0-0xC7 */ + 0xADB6,0xADB8,0xADB9,0xADBA,0xADBB,0xADBC,0xADBD,0xADBE,/* 0xC8-0xCF */ + 0xADBF,0xADC2,0xADC3,0xADC5,0xADC6,0xADC7,0xADC9,0xADCA,/* 0xD0-0xD7 */ + 0xADCB,0xADCC,0xADCD,0xADCE,0xADCF,0xADD2,0xADD4,0xADD5,/* 0xD8-0xDF */ + 0xADD6,0xADD7,0xADD8,0xADD9,0xADDA,0xADDB,0xADDD,0xADDE,/* 0xE0-0xE7 */ + 0xADDF,0xADE1,0xADE2,0xADE3,0xADE5,0xADE6,0xADE7,0xADE8,/* 0xE8-0xEF */ + 0xADE9,0xADEA,0xADEB,0xADEC,0xADED,0xADEE,0xADEF,0xADF0,/* 0xF0-0xF7 */ + 0xADF1,0xADF2,0xADF3,0xADF4,0xADF5,0xADF6,0xADF7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xADFA,0xADFB,0xADFD,0xADFE,0xAE02,0xAE03,0xAE04,/* 0x40-0x47 */ + 0xAE05,0xAE06,0xAE07,0xAE0A,0xAE0C,0xAE0E,0xAE0F,0xAE10,/* 0x48-0x4F */ + 0xAE11,0xAE12,0xAE13,0xAE15,0xAE16,0xAE17,0xAE18,0xAE19,/* 0x50-0x57 */ + 0xAE1A,0xAE1B,0xAE1C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAE1D,0xAE1E,0xAE1F,0xAE20,0xAE21,0xAE22,0xAE23,/* 0x60-0x67 */ + 0xAE24,0xAE25,0xAE26,0xAE27,0xAE28,0xAE29,0xAE2A,0xAE2B,/* 0x68-0x6F */ + 0xAE2C,0xAE2D,0xAE2E,0xAE2F,0xAE32,0xAE33,0xAE35,0xAE36,/* 0x70-0x77 */ + 0xAE39,0xAE3B,0xAE3C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAE3D,0xAE3E,0xAE3F,0xAE42,0xAE44,0xAE47,0xAE48,/* 0x80-0x87 */ + 0xAE49,0xAE4B,0xAE4F,0xAE51,0xAE52,0xAE53,0xAE55,0xAE57,/* 0x88-0x8F */ + 0xAE58,0xAE59,0xAE5A,0xAE5B,0xAE5E,0xAE62,0xAE63,0xAE64,/* 0x90-0x97 */ + 0xAE66,0xAE67,0xAE6A,0xAE6B,0xAE6D,0xAE6E,0xAE6F,0xAE71,/* 0x98-0x9F */ + 0xAE72,0xAE73,0xAE74,0xAE75,0xAE76,0xAE77,0xAE7A,0xAE7E,/* 0xA0-0xA7 */ + 0xAE7F,0xAE80,0xAE81,0xAE82,0xAE83,0xAE86,0xAE87,0xAE88,/* 0xA8-0xAF */ + 0xAE89,0xAE8A,0xAE8B,0xAE8D,0xAE8E,0xAE8F,0xAE90,0xAE91,/* 0xB0-0xB7 */ + 0xAE92,0xAE93,0xAE94,0xAE95,0xAE96,0xAE97,0xAE98,0xAE99,/* 0xB8-0xBF */ + 0xAE9A,0xAE9B,0xAE9C,0xAE9D,0xAE9E,0xAE9F,0xAEA0,0xAEA1,/* 0xC0-0xC7 */ + 0xAEA2,0xAEA3,0xAEA4,0xAEA5,0xAEA6,0xAEA7,0xAEA8,0xAEA9,/* 0xC8-0xCF */ + 0xAEAA,0xAEAB,0xAEAC,0xAEAD,0xAEAE,0xAEAF,0xAEB0,0xAEB1,/* 0xD0-0xD7 */ + 0xAEB2,0xAEB3,0xAEB4,0xAEB5,0xAEB6,0xAEB7,0xAEB8,0xAEB9,/* 0xD8-0xDF */ + 
0xAEBA,0xAEBB,0xAEBF,0xAEC1,0xAEC2,0xAEC3,0xAEC5,0xAEC6,/* 0xE0-0xE7 */ + 0xAEC7,0xAEC8,0xAEC9,0xAECA,0xAECB,0xAECE,0xAED2,0xAED3,/* 0xE8-0xEF */ + 0xAED4,0xAED5,0xAED6,0xAED7,0xAEDA,0xAEDB,0xAEDD,0xAEDE,/* 0xF0-0xF7 */ + 0xAEDF,0xAEE0,0xAEE1,0xAEE2,0xAEE3,0xAEE4,0xAEE5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAEE6,0xAEE7,0xAEE9,0xAEEA,0xAEEC,0xAEEE,0xAEEF,/* 0x40-0x47 */ + 0xAEF0,0xAEF1,0xAEF2,0xAEF3,0xAEF5,0xAEF6,0xAEF7,0xAEF9,/* 0x48-0x4F */ + 0xAEFA,0xAEFB,0xAEFD,0xAEFE,0xAEFF,0xAF00,0xAF01,0xAF02,/* 0x50-0x57 */ + 0xAF03,0xAF04,0xAF05,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAF06,0xAF09,0xAF0A,0xAF0B,0xAF0C,0xAF0E,0xAF0F,/* 0x60-0x67 */ + 0xAF11,0xAF12,0xAF13,0xAF14,0xAF15,0xAF16,0xAF17,0xAF18,/* 0x68-0x6F */ + 0xAF19,0xAF1A,0xAF1B,0xAF1C,0xAF1D,0xAF1E,0xAF1F,0xAF20,/* 0x70-0x77 */ + 0xAF21,0xAF22,0xAF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAF24,0xAF25,0xAF26,0xAF27,0xAF28,0xAF29,0xAF2A,/* 0x80-0x87 */ + 0xAF2B,0xAF2E,0xAF2F,0xAF31,0xAF33,0xAF35,0xAF36,0xAF37,/* 0x88-0x8F */ + 0xAF38,0xAF39,0xAF3A,0xAF3B,0xAF3E,0xAF40,0xAF44,0xAF45,/* 0x90-0x97 */ + 0xAF46,0xAF47,0xAF4A,0xAF4B,0xAF4C,0xAF4D,0xAF4E,0xAF4F,/* 0x98-0x9F */ + 0xAF51,0xAF52,0xAF53,0xAF54,0xAF55,0xAF56,0xAF57,0xAF58,/* 0xA0-0xA7 */ + 0xAF59,0xAF5A,0xAF5B,0xAF5E,0xAF5F,0xAF60,0xAF61,0xAF62,/* 0xA8-0xAF */ + 0xAF63,0xAF66,0xAF67,0xAF68,0xAF69,0xAF6A,0xAF6B,0xAF6C,/* 0xB0-0xB7 */ + 0xAF6D,0xAF6E,0xAF6F,0xAF70,0xAF71,0xAF72,0xAF73,0xAF74,/* 0xB8-0xBF */ + 0xAF75,0xAF76,0xAF77,0xAF78,0xAF7A,0xAF7B,0xAF7C,0xAF7D,/* 0xC0-0xC7 */ + 0xAF7E,0xAF7F,0xAF81,0xAF82,0xAF83,0xAF85,0xAF86,0xAF87,/* 0xC8-0xCF */ + 0xAF89,0xAF8A,0xAF8B,0xAF8C,0xAF8D,0xAF8E,0xAF8F,0xAF92,/* 0xD0-0xD7 */ + 0xAF93,0xAF94,0xAF96,0xAF97,0xAF98,0xAF99,0xAF9A,0xAF9B,/* 0xD8-0xDF */ + 0xAF9D,0xAF9E,0xAF9F,0xAFA0,0xAFA1,0xAFA2,0xAFA3,0xAFA4,/* 0xE0-0xE7 */ + 0xAFA5,0xAFA6,0xAFA7,0xAFA8,0xAFA9,0xAFAA,0xAFAB,0xAFAC,/* 0xE8-0xEF */ + 0xAFAD,0xAFAE,0xAFAF,0xAFB0,0xAFB1,0xAFB2,0xAFB3,0xAFB4,/* 0xF0-0xF7 */ + 0xAFB5,0xAFB6,0xAFB7,0xAFBA,0xAFBB,0xAFBD,0xAFBE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_85[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAFBF,0xAFC1,0xAFC2,0xAFC3,0xAFC4,0xAFC5,0xAFC6,/* 0x40-0x47 */ + 0xAFCA,0xAFCC,0xAFCF,0xAFD0,0xAFD1,0xAFD2,0xAFD3,0xAFD5,/* 0x48-0x4F */ + 0xAFD6,0xAFD7,0xAFD8,0xAFD9,0xAFDA,0xAFDB,0xAFDD,0xAFDE,/* 
0x50-0x57 */ + 0xAFDF,0xAFE0,0xAFE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAFE2,0xAFE3,0xAFE4,0xAFE5,0xAFE6,0xAFE7,0xAFEA,/* 0x60-0x67 */ + 0xAFEB,0xAFEC,0xAFED,0xAFEE,0xAFEF,0xAFF2,0xAFF3,0xAFF5,/* 0x68-0x6F */ + 0xAFF6,0xAFF7,0xAFF9,0xAFFA,0xAFFB,0xAFFC,0xAFFD,0xAFFE,/* 0x70-0x77 */ + 0xAFFF,0xB002,0xB003,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB005,0xB006,0xB007,0xB008,0xB009,0xB00A,0xB00B,/* 0x80-0x87 */ + 0xB00D,0xB00E,0xB00F,0xB011,0xB012,0xB013,0xB015,0xB016,/* 0x88-0x8F */ + 0xB017,0xB018,0xB019,0xB01A,0xB01B,0xB01E,0xB01F,0xB020,/* 0x90-0x97 */ + 0xB021,0xB022,0xB023,0xB024,0xB025,0xB026,0xB027,0xB029,/* 0x98-0x9F */ + 0xB02A,0xB02B,0xB02C,0xB02D,0xB02E,0xB02F,0xB030,0xB031,/* 0xA0-0xA7 */ + 0xB032,0xB033,0xB034,0xB035,0xB036,0xB037,0xB038,0xB039,/* 0xA8-0xAF */ + 0xB03A,0xB03B,0xB03C,0xB03D,0xB03E,0xB03F,0xB040,0xB041,/* 0xB0-0xB7 */ + 0xB042,0xB043,0xB046,0xB047,0xB049,0xB04B,0xB04D,0xB04F,/* 0xB8-0xBF */ + 0xB050,0xB051,0xB052,0xB056,0xB058,0xB05A,0xB05B,0xB05C,/* 0xC0-0xC7 */ + 0xB05E,0xB05F,0xB060,0xB061,0xB062,0xB063,0xB064,0xB065,/* 0xC8-0xCF */ + 0xB066,0xB067,0xB068,0xB069,0xB06A,0xB06B,0xB06C,0xB06D,/* 0xD0-0xD7 */ + 0xB06E,0xB06F,0xB070,0xB071,0xB072,0xB073,0xB074,0xB075,/* 0xD8-0xDF */ + 0xB076,0xB077,0xB078,0xB079,0xB07A,0xB07B,0xB07E,0xB07F,/* 0xE0-0xE7 */ + 0xB081,0xB082,0xB083,0xB085,0xB086,0xB087,0xB088,0xB089,/* 0xE8-0xEF */ + 0xB08A,0xB08B,0xB08E,0xB090,0xB092,0xB093,0xB094,0xB095,/* 0xF0-0xF7 */ + 0xB096,0xB097,0xB09B,0xB09D,0xB09E,0xB0A3,0xB0A4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_86[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB0A5,0xB0A6,0xB0A7,0xB0AA,0xB0B0,0xB0B2,0xB0B6,/* 0x40-0x47 */ + 0xB0B7,0xB0B9,0xB0BA,0xB0BB,0xB0BD,0xB0BE,0xB0BF,0xB0C0,/* 0x48-0x4F */ + 0xB0C1,0xB0C2,0xB0C3,0xB0C6,0xB0CA,0xB0CB,0xB0CC,0xB0CD,/* 0x50-0x57 */ + 0xB0CE,0xB0CF,0xB0D2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB0D3,0xB0D5,0xB0D6,0xB0D7,0xB0D9,0xB0DA,0xB0DB,/* 0x60-0x67 */ + 0xB0DC,0xB0DD,0xB0DE,0xB0DF,0xB0E1,0xB0E2,0xB0E3,0xB0E4,/* 0x68-0x6F */ + 0xB0E6,0xB0E7,0xB0E8,0xB0E9,0xB0EA,0xB0EB,0xB0EC,0xB0ED,/* 0x70-0x77 */ + 0xB0EE,0xB0EF,0xB0F0,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB0F1,0xB0F2,0xB0F3,0xB0F4,0xB0F5,0xB0F6,0xB0F7,/* 0x80-0x87 */ + 0xB0F8,0xB0F9,0xB0FA,0xB0FB,0xB0FC,0xB0FD,0xB0FE,0xB0FF,/* 0x88-0x8F */ + 0xB100,0xB101,0xB102,0xB103,0xB104,0xB105,0xB106,0xB107,/* 0x90-0x97 */ + 0xB10A,0xB10D,0xB10E,0xB10F,0xB111,0xB114,0xB115,0xB116,/* 0x98-0x9F */ + 0xB117,0xB11A,0xB11E,0xB11F,0xB120,0xB121,0xB122,0xB126,/* 0xA0-0xA7 */ + 0xB127,0xB129,0xB12A,0xB12B,0xB12D,0xB12E,0xB12F,0xB130,/* 0xA8-0xAF */ + 0xB131,0xB132,0xB133,0xB136,0xB13A,0xB13B,0xB13C,0xB13D,/* 0xB0-0xB7 */ + 0xB13E,0xB13F,0xB142,0xB143,0xB145,0xB146,0xB147,0xB149,/* 0xB8-0xBF */ + 0xB14A,0xB14B,0xB14C,0xB14D,0xB14E,0xB14F,0xB152,0xB153,/* 0xC0-0xC7 */ + 0xB156,0xB157,0xB159,0xB15A,0xB15B,0xB15D,0xB15E,0xB15F,/* 0xC8-0xCF */ + 
0xB161,0xB162,0xB163,0xB164,0xB165,0xB166,0xB167,0xB168,/* 0xD0-0xD7 */ + 0xB169,0xB16A,0xB16B,0xB16C,0xB16D,0xB16E,0xB16F,0xB170,/* 0xD8-0xDF */ + 0xB171,0xB172,0xB173,0xB174,0xB175,0xB176,0xB177,0xB17A,/* 0xE0-0xE7 */ + 0xB17B,0xB17D,0xB17E,0xB17F,0xB181,0xB183,0xB184,0xB185,/* 0xE8-0xEF */ + 0xB186,0xB187,0xB18A,0xB18C,0xB18E,0xB18F,0xB190,0xB191,/* 0xF0-0xF7 */ + 0xB195,0xB196,0xB197,0xB199,0xB19A,0xB19B,0xB19D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB19E,0xB19F,0xB1A0,0xB1A1,0xB1A2,0xB1A3,0xB1A4,/* 0x40-0x47 */ + 0xB1A5,0xB1A6,0xB1A7,0xB1A9,0xB1AA,0xB1AB,0xB1AC,0xB1AD,/* 0x48-0x4F */ + 0xB1AE,0xB1AF,0xB1B0,0xB1B1,0xB1B2,0xB1B3,0xB1B4,0xB1B5,/* 0x50-0x57 */ + 0xB1B6,0xB1B7,0xB1B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB1B9,0xB1BA,0xB1BB,0xB1BC,0xB1BD,0xB1BE,0xB1BF,/* 0x60-0x67 */ + 0xB1C0,0xB1C1,0xB1C2,0xB1C3,0xB1C4,0xB1C5,0xB1C6,0xB1C7,/* 0x68-0x6F */ + 0xB1C8,0xB1C9,0xB1CA,0xB1CB,0xB1CD,0xB1CE,0xB1CF,0xB1D1,/* 0x70-0x77 */ + 0xB1D2,0xB1D3,0xB1D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB1D6,0xB1D7,0xB1D8,0xB1D9,0xB1DA,0xB1DB,0xB1DE,/* 0x80-0x87 */ + 0xB1E0,0xB1E1,0xB1E2,0xB1E3,0xB1E4,0xB1E5,0xB1E6,0xB1E7,/* 0x88-0x8F */ + 0xB1EA,0xB1EB,0xB1ED,0xB1EE,0xB1EF,0xB1F1,0xB1F2,0xB1F3,/* 0x90-0x97 */ + 0xB1F4,0xB1F5,0xB1F6,0xB1F7,0xB1F8,0xB1FA,0xB1FC,0xB1FE,/* 0x98-0x9F */ + 0xB1FF,0xB200,0xB201,0xB202,0xB203,0xB206,0xB207,0xB209,/* 0xA0-0xA7 */ + 0xB20A,0xB20D,0xB20E,0xB20F,0xB210,0xB211,0xB212,0xB213,/* 0xA8-0xAF */ + 0xB216,0xB218,0xB21A,0xB21B,0xB21C,0xB21D,0xB21E,0xB21F,/* 0xB0-0xB7 */ + 0xB221,0xB222,0xB223,0xB224,0xB225,0xB226,0xB227,0xB228,/* 0xB8-0xBF */ + 0xB229,0xB22A,0xB22B,0xB22C,0xB22D,0xB22E,0xB22F,0xB230,/* 0xC0-0xC7 */ + 0xB231,0xB232,0xB233,0xB235,0xB236,0xB237,0xB238,0xB239,/* 0xC8-0xCF */ + 0xB23A,0xB23B,0xB23D,0xB23E,0xB23F,0xB240,0xB241,0xB242,/* 0xD0-0xD7 */ + 0xB243,0xB244,0xB245,0xB246,0xB247,0xB248,0xB249,0xB24A,/* 0xD8-0xDF */ + 0xB24B,0xB24C,0xB24D,0xB24E,0xB24F,0xB250,0xB251,0xB252,/* 0xE0-0xE7 */ + 0xB253,0xB254,0xB255,0xB256,0xB257,0xB259,0xB25A,0xB25B,/* 0xE8-0xEF */ + 0xB25D,0xB25E,0xB25F,0xB261,0xB262,0xB263,0xB264,0xB265,/* 0xF0-0xF7 */ + 0xB266,0xB267,0xB26A,0xB26B,0xB26C,0xB26D,0xB26E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB26F,0xB270,0xB271,0xB272,0xB273,0xB276,0xB277,/* 
0x40-0x47 */ + 0xB278,0xB279,0xB27A,0xB27B,0xB27D,0xB27E,0xB27F,0xB280,/* 0x48-0x4F */ + 0xB281,0xB282,0xB283,0xB286,0xB287,0xB288,0xB28A,0xB28B,/* 0x50-0x57 */ + 0xB28C,0xB28D,0xB28E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB28F,0xB292,0xB293,0xB295,0xB296,0xB297,0xB29B,/* 0x60-0x67 */ + 0xB29C,0xB29D,0xB29E,0xB29F,0xB2A2,0xB2A4,0xB2A7,0xB2A8,/* 0x68-0x6F */ + 0xB2A9,0xB2AB,0xB2AD,0xB2AE,0xB2AF,0xB2B1,0xB2B2,0xB2B3,/* 0x70-0x77 */ + 0xB2B5,0xB2B6,0xB2B7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB2B8,0xB2B9,0xB2BA,0xB2BB,0xB2BC,0xB2BD,0xB2BE,/* 0x80-0x87 */ + 0xB2BF,0xB2C0,0xB2C1,0xB2C2,0xB2C3,0xB2C4,0xB2C5,0xB2C6,/* 0x88-0x8F */ + 0xB2C7,0xB2CA,0xB2CB,0xB2CD,0xB2CE,0xB2CF,0xB2D1,0xB2D3,/* 0x90-0x97 */ + 0xB2D4,0xB2D5,0xB2D6,0xB2D7,0xB2DA,0xB2DC,0xB2DE,0xB2DF,/* 0x98-0x9F */ + 0xB2E0,0xB2E1,0xB2E3,0xB2E7,0xB2E9,0xB2EA,0xB2F0,0xB2F1,/* 0xA0-0xA7 */ + 0xB2F2,0xB2F6,0xB2FC,0xB2FD,0xB2FE,0xB302,0xB303,0xB305,/* 0xA8-0xAF */ + 0xB306,0xB307,0xB309,0xB30A,0xB30B,0xB30C,0xB30D,0xB30E,/* 0xB0-0xB7 */ + 0xB30F,0xB312,0xB316,0xB317,0xB318,0xB319,0xB31A,0xB31B,/* 0xB8-0xBF */ + 0xB31D,0xB31E,0xB31F,0xB320,0xB321,0xB322,0xB323,0xB324,/* 0xC0-0xC7 */ + 0xB325,0xB326,0xB327,0xB328,0xB329,0xB32A,0xB32B,0xB32C,/* 0xC8-0xCF */ + 0xB32D,0xB32E,0xB32F,0xB330,0xB331,0xB332,0xB333,0xB334,/* 0xD0-0xD7 */ + 0xB335,0xB336,0xB337,0xB338,0xB339,0xB33A,0xB33B,0xB33C,/* 0xD8-0xDF */ + 0xB33D,0xB33E,0xB33F,0xB340,0xB341,0xB342,0xB343,0xB344,/* 0xE0-0xE7 */ + 0xB345,0xB346,0xB347,0xB348,0xB349,0xB34A,0xB34B,0xB34C,/* 0xE8-0xEF */ + 0xB34D,0xB34E,0xB34F,0xB350,0xB351,0xB352,0xB353,0xB357,/* 0xF0-0xF7 */ + 0xB359,0xB35A,0xB35D,0xB360,0xB361,0xB362,0xB363,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB366,0xB368,0xB36A,0xB36C,0xB36D,0xB36F,0xB372,/* 0x40-0x47 */ + 0xB373,0xB375,0xB376,0xB377,0xB379,0xB37A,0xB37B,0xB37C,/* 0x48-0x4F */ + 0xB37D,0xB37E,0xB37F,0xB382,0xB386,0xB387,0xB388,0xB389,/* 0x50-0x57 */ + 0xB38A,0xB38B,0xB38D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB38E,0xB38F,0xB391,0xB392,0xB393,0xB395,0xB396,/* 0x60-0x67 */ + 0xB397,0xB398,0xB399,0xB39A,0xB39B,0xB39C,0xB39D,0xB39E,/* 0x68-0x6F */ + 0xB39F,0xB3A2,0xB3A3,0xB3A4,0xB3A5,0xB3A6,0xB3A7,0xB3A9,/* 0x70-0x77 */ + 0xB3AA,0xB3AB,0xB3AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB3AE,0xB3AF,0xB3B0,0xB3B1,0xB3B2,0xB3B3,0xB3B4,/* 0x80-0x87 */ + 0xB3B5,0xB3B6,0xB3B7,0xB3B8,0xB3B9,0xB3BA,0xB3BB,0xB3BC,/* 0x88-0x8F */ + 0xB3BD,0xB3BE,0xB3BF,0xB3C0,0xB3C1,0xB3C2,0xB3C3,0xB3C6,/* 0x90-0x97 */ + 0xB3C7,0xB3C9,0xB3CA,0xB3CD,0xB3CF,0xB3D1,0xB3D2,0xB3D3,/* 0x98-0x9F */ + 0xB3D6,0xB3D8,0xB3DA,0xB3DC,0xB3DE,0xB3DF,0xB3E1,0xB3E2,/* 0xA0-0xA7 */ + 0xB3E3,0xB3E5,0xB3E6,0xB3E7,0xB3E9,0xB3EA,0xB3EB,0xB3EC,/* 0xA8-0xAF */ + 0xB3ED,0xB3EE,0xB3EF,0xB3F0,0xB3F1,0xB3F2,0xB3F3,0xB3F4,/* 0xB0-0xB7 */ + 0xB3F5,0xB3F6,0xB3F7,0xB3F8,0xB3F9,0xB3FA,0xB3FB,0xB3FD,/* 0xB8-0xBF */ + 
0xB3FE,0xB3FF,0xB400,0xB401,0xB402,0xB403,0xB404,0xB405,/* 0xC0-0xC7 */ + 0xB406,0xB407,0xB408,0xB409,0xB40A,0xB40B,0xB40C,0xB40D,/* 0xC8-0xCF */ + 0xB40E,0xB40F,0xB411,0xB412,0xB413,0xB414,0xB415,0xB416,/* 0xD0-0xD7 */ + 0xB417,0xB419,0xB41A,0xB41B,0xB41D,0xB41E,0xB41F,0xB421,/* 0xD8-0xDF */ + 0xB422,0xB423,0xB424,0xB425,0xB426,0xB427,0xB42A,0xB42C,/* 0xE0-0xE7 */ + 0xB42D,0xB42E,0xB42F,0xB430,0xB431,0xB432,0xB433,0xB435,/* 0xE8-0xEF */ + 0xB436,0xB437,0xB438,0xB439,0xB43A,0xB43B,0xB43C,0xB43D,/* 0xF0-0xF7 */ + 0xB43E,0xB43F,0xB440,0xB441,0xB442,0xB443,0xB444,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB445,0xB446,0xB447,0xB448,0xB449,0xB44A,0xB44B,/* 0x40-0x47 */ + 0xB44C,0xB44D,0xB44E,0xB44F,0xB452,0xB453,0xB455,0xB456,/* 0x48-0x4F */ + 0xB457,0xB459,0xB45A,0xB45B,0xB45C,0xB45D,0xB45E,0xB45F,/* 0x50-0x57 */ + 0xB462,0xB464,0xB466,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB467,0xB468,0xB469,0xB46A,0xB46B,0xB46D,0xB46E,/* 0x60-0x67 */ + 0xB46F,0xB470,0xB471,0xB472,0xB473,0xB474,0xB475,0xB476,/* 0x68-0x6F */ + 0xB477,0xB478,0xB479,0xB47A,0xB47B,0xB47C,0xB47D,0xB47E,/* 0x70-0x77 */ + 0xB47F,0xB481,0xB482,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB483,0xB484,0xB485,0xB486,0xB487,0xB489,0xB48A,/* 0x80-0x87 */ + 0xB48B,0xB48C,0xB48D,0xB48E,0xB48F,0xB490,0xB491,0xB492,/* 0x88-0x8F */ + 0xB493,0xB494,0xB495,0xB496,0xB497,0xB498,0xB499,0xB49A,/* 0x90-0x97 */ + 0xB49B,0xB49C,0xB49E,0xB49F,0xB4A0,0xB4A1,0xB4A2,0xB4A3,/* 0x98-0x9F */ + 0xB4A5,0xB4A6,0xB4A7,0xB4A9,0xB4AA,0xB4AB,0xB4AD,0xB4AE,/* 0xA0-0xA7 */ + 0xB4AF,0xB4B0,0xB4B1,0xB4B2,0xB4B3,0xB4B4,0xB4B6,0xB4B8,/* 0xA8-0xAF */ + 0xB4BA,0xB4BB,0xB4BC,0xB4BD,0xB4BE,0xB4BF,0xB4C1,0xB4C2,/* 0xB0-0xB7 */ + 0xB4C3,0xB4C5,0xB4C6,0xB4C7,0xB4C9,0xB4CA,0xB4CB,0xB4CC,/* 0xB8-0xBF */ + 0xB4CD,0xB4CE,0xB4CF,0xB4D1,0xB4D2,0xB4D3,0xB4D4,0xB4D6,/* 0xC0-0xC7 */ + 0xB4D7,0xB4D8,0xB4D9,0xB4DA,0xB4DB,0xB4DE,0xB4DF,0xB4E1,/* 0xC8-0xCF */ + 0xB4E2,0xB4E5,0xB4E7,0xB4E8,0xB4E9,0xB4EA,0xB4EB,0xB4EE,/* 0xD0-0xD7 */ + 0xB4F0,0xB4F2,0xB4F3,0xB4F4,0xB4F5,0xB4F6,0xB4F7,0xB4F9,/* 0xD8-0xDF */ + 0xB4FA,0xB4FB,0xB4FC,0xB4FD,0xB4FE,0xB4FF,0xB500,0xB501,/* 0xE0-0xE7 */ + 0xB502,0xB503,0xB504,0xB505,0xB506,0xB507,0xB508,0xB509,/* 0xE8-0xEF */ + 0xB50A,0xB50B,0xB50C,0xB50D,0xB50E,0xB50F,0xB510,0xB511,/* 0xF0-0xF7 */ + 0xB512,0xB513,0xB516,0xB517,0xB519,0xB51A,0xB51D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB51E,0xB51F,0xB520,0xB521,0xB522,0xB523,0xB526,/* 0x40-0x47 */ + 0xB52B,0xB52C,0xB52D,0xB52E,0xB52F,0xB532,0xB533,0xB535,/* 0x48-0x4F */ + 0xB536,0xB537,0xB539,0xB53A,0xB53B,0xB53C,0xB53D,0xB53E,/* 0x50-0x57 */ + 0xB53F,0xB542,0xB546,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB547,0xB548,0xB549,0xB54A,0xB54E,0xB54F,0xB551,/* 0x60-0x67 */ + 0xB552,0xB553,0xB555,0xB556,0xB557,0xB558,0xB559,0xB55A,/* 0x68-0x6F */ + 0xB55B,0xB55E,0xB562,0xB563,0xB564,0xB565,0xB566,0xB567,/* 0x70-0x77 */ + 0xB568,0xB569,0xB56A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB56B,0xB56C,0xB56D,0xB56E,0xB56F,0xB570,0xB571,/* 0x80-0x87 */ + 0xB572,0xB573,0xB574,0xB575,0xB576,0xB577,0xB578,0xB579,/* 0x88-0x8F */ + 0xB57A,0xB57B,0xB57C,0xB57D,0xB57E,0xB57F,0xB580,0xB581,/* 0x90-0x97 */ + 0xB582,0xB583,0xB584,0xB585,0xB586,0xB587,0xB588,0xB589,/* 0x98-0x9F */ + 0xB58A,0xB58B,0xB58C,0xB58D,0xB58E,0xB58F,0xB590,0xB591,/* 0xA0-0xA7 */ + 0xB592,0xB593,0xB594,0xB595,0xB596,0xB597,0xB598,0xB599,/* 0xA8-0xAF */ + 0xB59A,0xB59B,0xB59C,0xB59D,0xB59E,0xB59F,0xB5A2,0xB5A3,/* 0xB0-0xB7 */ + 0xB5A5,0xB5A6,0xB5A7,0xB5A9,0xB5AC,0xB5AD,0xB5AE,0xB5AF,/* 0xB8-0xBF */ + 0xB5B2,0xB5B6,0xB5B7,0xB5B8,0xB5B9,0xB5BA,0xB5BE,0xB5BF,/* 0xC0-0xC7 */ + 0xB5C1,0xB5C2,0xB5C3,0xB5C5,0xB5C6,0xB5C7,0xB5C8,0xB5C9,/* 0xC8-0xCF */ + 0xB5CA,0xB5CB,0xB5CE,0xB5D2,0xB5D3,0xB5D4,0xB5D5,0xB5D6,/* 0xD0-0xD7 */ + 0xB5D7,0xB5D9,0xB5DA,0xB5DB,0xB5DC,0xB5DD,0xB5DE,0xB5DF,/* 0xD8-0xDF */ + 0xB5E0,0xB5E1,0xB5E2,0xB5E3,0xB5E4,0xB5E5,0xB5E6,0xB5E7,/* 0xE0-0xE7 */ + 0xB5E8,0xB5E9,0xB5EA,0xB5EB,0xB5ED,0xB5EE,0xB5EF,0xB5F0,/* 0xE8-0xEF */ + 0xB5F1,0xB5F2,0xB5F3,0xB5F4,0xB5F5,0xB5F6,0xB5F7,0xB5F8,/* 0xF0-0xF7 */ + 0xB5F9,0xB5FA,0xB5FB,0xB5FC,0xB5FD,0xB5FE,0xB5FF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB600,0xB601,0xB602,0xB603,0xB604,0xB605,0xB606,/* 0x40-0x47 */ + 0xB607,0xB608,0xB609,0xB60A,0xB60B,0xB60C,0xB60D,0xB60E,/* 0x48-0x4F */ + 0xB60F,0xB612,0xB613,0xB615,0xB616,0xB617,0xB619,0xB61A,/* 0x50-0x57 */ + 0xB61B,0xB61C,0xB61D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB61E,0xB61F,0xB620,0xB621,0xB622,0xB623,0xB624,/* 0x60-0x67 */ + 0xB626,0xB627,0xB628,0xB629,0xB62A,0xB62B,0xB62D,0xB62E,/* 0x68-0x6F */ + 0xB62F,0xB630,0xB631,0xB632,0xB633,0xB635,0xB636,0xB637,/* 0x70-0x77 */ + 0xB638,0xB639,0xB63A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB63B,0xB63C,0xB63D,0xB63E,0xB63F,0xB640,0xB641,/* 0x80-0x87 */ + 0xB642,0xB643,0xB644,0xB645,0xB646,0xB647,0xB649,0xB64A,/* 0x88-0x8F */ + 0xB64B,0xB64C,0xB64D,0xB64E,0xB64F,0xB650,0xB651,0xB652,/* 0x90-0x97 */ + 0xB653,0xB654,0xB655,0xB656,0xB657,0xB658,0xB659,0xB65A,/* 0x98-0x9F */ + 0xB65B,0xB65C,0xB65D,0xB65E,0xB65F,0xB660,0xB661,0xB662,/* 0xA0-0xA7 */ + 0xB663,0xB665,0xB666,0xB667,0xB669,0xB66A,0xB66B,0xB66C,/* 0xA8-0xAF */ + 
0xB66D,0xB66E,0xB66F,0xB670,0xB671,0xB672,0xB673,0xB674,/* 0xB0-0xB7 */ + 0xB675,0xB676,0xB677,0xB678,0xB679,0xB67A,0xB67B,0xB67C,/* 0xB8-0xBF */ + 0xB67D,0xB67E,0xB67F,0xB680,0xB681,0xB682,0xB683,0xB684,/* 0xC0-0xC7 */ + 0xB685,0xB686,0xB687,0xB688,0xB689,0xB68A,0xB68B,0xB68C,/* 0xC8-0xCF */ + 0xB68D,0xB68E,0xB68F,0xB690,0xB691,0xB692,0xB693,0xB694,/* 0xD0-0xD7 */ + 0xB695,0xB696,0xB697,0xB698,0xB699,0xB69A,0xB69B,0xB69E,/* 0xD8-0xDF */ + 0xB69F,0xB6A1,0xB6A2,0xB6A3,0xB6A5,0xB6A6,0xB6A7,0xB6A8,/* 0xE0-0xE7 */ + 0xB6A9,0xB6AA,0xB6AD,0xB6AE,0xB6AF,0xB6B0,0xB6B2,0xB6B3,/* 0xE8-0xEF */ + 0xB6B4,0xB6B5,0xB6B6,0xB6B7,0xB6B8,0xB6B9,0xB6BA,0xB6BB,/* 0xF0-0xF7 */ + 0xB6BC,0xB6BD,0xB6BE,0xB6BF,0xB6C0,0xB6C1,0xB6C2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB6C3,0xB6C4,0xB6C5,0xB6C6,0xB6C7,0xB6C8,0xB6C9,/* 0x40-0x47 */ + 0xB6CA,0xB6CB,0xB6CC,0xB6CD,0xB6CE,0xB6CF,0xB6D0,0xB6D1,/* 0x48-0x4F */ + 0xB6D2,0xB6D3,0xB6D5,0xB6D6,0xB6D7,0xB6D8,0xB6D9,0xB6DA,/* 0x50-0x57 */ + 0xB6DB,0xB6DC,0xB6DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB6DE,0xB6DF,0xB6E0,0xB6E1,0xB6E2,0xB6E3,0xB6E4,/* 0x60-0x67 */ + 0xB6E5,0xB6E6,0xB6E7,0xB6E8,0xB6E9,0xB6EA,0xB6EB,0xB6EC,/* 0x68-0x6F */ + 0xB6ED,0xB6EE,0xB6EF,0xB6F1,0xB6F2,0xB6F3,0xB6F5,0xB6F6,/* 0x70-0x77 */ + 0xB6F7,0xB6F9,0xB6FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB6FB,0xB6FC,0xB6FD,0xB6FE,0xB6FF,0xB702,0xB703,/* 0x80-0x87 */ + 0xB704,0xB706,0xB707,0xB708,0xB709,0xB70A,0xB70B,0xB70C,/* 0x88-0x8F */ + 0xB70D,0xB70E,0xB70F,0xB710,0xB711,0xB712,0xB713,0xB714,/* 0x90-0x97 */ + 0xB715,0xB716,0xB717,0xB718,0xB719,0xB71A,0xB71B,0xB71C,/* 0x98-0x9F */ + 0xB71D,0xB71E,0xB71F,0xB720,0xB721,0xB722,0xB723,0xB724,/* 0xA0-0xA7 */ + 0xB725,0xB726,0xB727,0xB72A,0xB72B,0xB72D,0xB72E,0xB731,/* 0xA8-0xAF */ + 0xB732,0xB733,0xB734,0xB735,0xB736,0xB737,0xB73A,0xB73C,/* 0xB0-0xB7 */ + 0xB73D,0xB73E,0xB73F,0xB740,0xB741,0xB742,0xB743,0xB745,/* 0xB8-0xBF */ + 0xB746,0xB747,0xB749,0xB74A,0xB74B,0xB74D,0xB74E,0xB74F,/* 0xC0-0xC7 */ + 0xB750,0xB751,0xB752,0xB753,0xB756,0xB757,0xB758,0xB759,/* 0xC8-0xCF */ + 0xB75A,0xB75B,0xB75C,0xB75D,0xB75E,0xB75F,0xB761,0xB762,/* 0xD0-0xD7 */ + 0xB763,0xB765,0xB766,0xB767,0xB769,0xB76A,0xB76B,0xB76C,/* 0xD8-0xDF */ + 0xB76D,0xB76E,0xB76F,0xB772,0xB774,0xB776,0xB777,0xB778,/* 0xE0-0xE7 */ + 0xB779,0xB77A,0xB77B,0xB77E,0xB77F,0xB781,0xB782,0xB783,/* 0xE8-0xEF */ + 0xB785,0xB786,0xB787,0xB788,0xB789,0xB78A,0xB78B,0xB78E,/* 0xF0-0xF7 */ + 0xB793,0xB794,0xB795,0xB79A,0xB79B,0xB79D,0xB79E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB79F,0xB7A1,0xB7A2,0xB7A3,0xB7A4,0xB7A5,0xB7A6,/* 0x40-0x47 */ + 0xB7A7,0xB7AA,0xB7AE,0xB7AF,0xB7B0,0xB7B1,0xB7B2,0xB7B3,/* 0x48-0x4F */ + 0xB7B6,0xB7B7,0xB7B9,0xB7BA,0xB7BB,0xB7BC,0xB7BD,0xB7BE,/* 0x50-0x57 */ + 0xB7BF,0xB7C0,0xB7C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB7C2,0xB7C3,0xB7C4,0xB7C5,0xB7C6,0xB7C8,0xB7CA,/* 0x60-0x67 */ + 0xB7CB,0xB7CC,0xB7CD,0xB7CE,0xB7CF,0xB7D0,0xB7D1,0xB7D2,/* 0x68-0x6F */ + 0xB7D3,0xB7D4,0xB7D5,0xB7D6,0xB7D7,0xB7D8,0xB7D9,0xB7DA,/* 0x70-0x77 */ + 0xB7DB,0xB7DC,0xB7DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB7DE,0xB7DF,0xB7E0,0xB7E1,0xB7E2,0xB7E3,0xB7E4,/* 0x80-0x87 */ + 0xB7E5,0xB7E6,0xB7E7,0xB7E8,0xB7E9,0xB7EA,0xB7EB,0xB7EE,/* 0x88-0x8F */ + 0xB7EF,0xB7F1,0xB7F2,0xB7F3,0xB7F5,0xB7F6,0xB7F7,0xB7F8,/* 0x90-0x97 */ + 0xB7F9,0xB7FA,0xB7FB,0xB7FE,0xB802,0xB803,0xB804,0xB805,/* 0x98-0x9F */ + 0xB806,0xB80A,0xB80B,0xB80D,0xB80E,0xB80F,0xB811,0xB812,/* 0xA0-0xA7 */ + 0xB813,0xB814,0xB815,0xB816,0xB817,0xB81A,0xB81C,0xB81E,/* 0xA8-0xAF */ + 0xB81F,0xB820,0xB821,0xB822,0xB823,0xB826,0xB827,0xB829,/* 0xB0-0xB7 */ + 0xB82A,0xB82B,0xB82D,0xB82E,0xB82F,0xB830,0xB831,0xB832,/* 0xB8-0xBF */ + 0xB833,0xB836,0xB83A,0xB83B,0xB83C,0xB83D,0xB83E,0xB83F,/* 0xC0-0xC7 */ + 0xB841,0xB842,0xB843,0xB845,0xB846,0xB847,0xB848,0xB849,/* 0xC8-0xCF */ + 0xB84A,0xB84B,0xB84C,0xB84D,0xB84E,0xB84F,0xB850,0xB852,/* 0xD0-0xD7 */ + 0xB854,0xB855,0xB856,0xB857,0xB858,0xB859,0xB85A,0xB85B,/* 0xD8-0xDF */ + 0xB85E,0xB85F,0xB861,0xB862,0xB863,0xB865,0xB866,0xB867,/* 0xE0-0xE7 */ + 0xB868,0xB869,0xB86A,0xB86B,0xB86E,0xB870,0xB872,0xB873,/* 0xE8-0xEF */ + 0xB874,0xB875,0xB876,0xB877,0xB879,0xB87A,0xB87B,0xB87D,/* 0xF0-0xF7 */ + 0xB87E,0xB87F,0xB880,0xB881,0xB882,0xB883,0xB884,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB885,0xB886,0xB887,0xB888,0xB889,0xB88A,0xB88B,/* 0x40-0x47 */ + 0xB88C,0xB88E,0xB88F,0xB890,0xB891,0xB892,0xB893,0xB894,/* 0x48-0x4F */ + 0xB895,0xB896,0xB897,0xB898,0xB899,0xB89A,0xB89B,0xB89C,/* 0x50-0x57 */ + 0xB89D,0xB89E,0xB89F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB8A0,0xB8A1,0xB8A2,0xB8A3,0xB8A4,0xB8A5,0xB8A6,/* 0x60-0x67 */ + 0xB8A7,0xB8A9,0xB8AA,0xB8AB,0xB8AC,0xB8AD,0xB8AE,0xB8AF,/* 0x68-0x6F */ + 0xB8B1,0xB8B2,0xB8B3,0xB8B5,0xB8B6,0xB8B7,0xB8B9,0xB8BA,/* 0x70-0x77 */ + 0xB8BB,0xB8BC,0xB8BD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB8BE,0xB8BF,0xB8C2,0xB8C4,0xB8C6,0xB8C7,0xB8C8,/* 0x80-0x87 */ + 0xB8C9,0xB8CA,0xB8CB,0xB8CD,0xB8CE,0xB8CF,0xB8D1,0xB8D2,/* 0x88-0x8F */ + 0xB8D3,0xB8D5,0xB8D6,0xB8D7,0xB8D8,0xB8D9,0xB8DA,0xB8DB,/* 0x90-0x97 */ + 0xB8DC,0xB8DE,0xB8E0,0xB8E2,0xB8E3,0xB8E4,0xB8E5,0xB8E6,/* 0x98-0x9F */ + 
0xB8E7,0xB8EA,0xB8EB,0xB8ED,0xB8EE,0xB8EF,0xB8F1,0xB8F2,/* 0xA0-0xA7 */ + 0xB8F3,0xB8F4,0xB8F5,0xB8F6,0xB8F7,0xB8FA,0xB8FC,0xB8FE,/* 0xA8-0xAF */ + 0xB8FF,0xB900,0xB901,0xB902,0xB903,0xB905,0xB906,0xB907,/* 0xB0-0xB7 */ + 0xB908,0xB909,0xB90A,0xB90B,0xB90C,0xB90D,0xB90E,0xB90F,/* 0xB8-0xBF */ + 0xB910,0xB911,0xB912,0xB913,0xB914,0xB915,0xB916,0xB917,/* 0xC0-0xC7 */ + 0xB919,0xB91A,0xB91B,0xB91C,0xB91D,0xB91E,0xB91F,0xB921,/* 0xC8-0xCF */ + 0xB922,0xB923,0xB924,0xB925,0xB926,0xB927,0xB928,0xB929,/* 0xD0-0xD7 */ + 0xB92A,0xB92B,0xB92C,0xB92D,0xB92E,0xB92F,0xB930,0xB931,/* 0xD8-0xDF */ + 0xB932,0xB933,0xB934,0xB935,0xB936,0xB937,0xB938,0xB939,/* 0xE0-0xE7 */ + 0xB93A,0xB93B,0xB93E,0xB93F,0xB941,0xB942,0xB943,0xB945,/* 0xE8-0xEF */ + 0xB946,0xB947,0xB948,0xB949,0xB94A,0xB94B,0xB94D,0xB94E,/* 0xF0-0xF7 */ + 0xB950,0xB952,0xB953,0xB954,0xB955,0xB956,0xB957,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB95A,0xB95B,0xB95D,0xB95E,0xB95F,0xB961,0xB962,/* 0x40-0x47 */ + 0xB963,0xB964,0xB965,0xB966,0xB967,0xB96A,0xB96C,0xB96E,/* 0x48-0x4F */ + 0xB96F,0xB970,0xB971,0xB972,0xB973,0xB976,0xB977,0xB979,/* 0x50-0x57 */ + 0xB97A,0xB97B,0xB97D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB97E,0xB97F,0xB980,0xB981,0xB982,0xB983,0xB986,/* 0x60-0x67 */ + 0xB988,0xB98B,0xB98C,0xB98F,0xB990,0xB991,0xB992,0xB993,/* 0x68-0x6F */ + 0xB994,0xB995,0xB996,0xB997,0xB998,0xB999,0xB99A,0xB99B,/* 0x70-0x77 */ + 0xB99C,0xB99D,0xB99E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB99F,0xB9A0,0xB9A1,0xB9A2,0xB9A3,0xB9A4,0xB9A5,/* 0x80-0x87 */ + 0xB9A6,0xB9A7,0xB9A8,0xB9A9,0xB9AA,0xB9AB,0xB9AE,0xB9AF,/* 0x88-0x8F */ + 0xB9B1,0xB9B2,0xB9B3,0xB9B5,0xB9B6,0xB9B7,0xB9B8,0xB9B9,/* 0x90-0x97 */ + 0xB9BA,0xB9BB,0xB9BE,0xB9C0,0xB9C2,0xB9C3,0xB9C4,0xB9C5,/* 0x98-0x9F */ + 0xB9C6,0xB9C7,0xB9CA,0xB9CB,0xB9CD,0xB9D3,0xB9D4,0xB9D5,/* 0xA0-0xA7 */ + 0xB9D6,0xB9D7,0xB9DA,0xB9DC,0xB9DF,0xB9E0,0xB9E2,0xB9E6,/* 0xA8-0xAF */ + 0xB9E7,0xB9E9,0xB9EA,0xB9EB,0xB9ED,0xB9EE,0xB9EF,0xB9F0,/* 0xB0-0xB7 */ + 0xB9F1,0xB9F2,0xB9F3,0xB9F6,0xB9FB,0xB9FC,0xB9FD,0xB9FE,/* 0xB8-0xBF */ + 0xB9FF,0xBA02,0xBA03,0xBA04,0xBA05,0xBA06,0xBA07,0xBA09,/* 0xC0-0xC7 */ + 0xBA0A,0xBA0B,0xBA0C,0xBA0D,0xBA0E,0xBA0F,0xBA10,0xBA11,/* 0xC8-0xCF */ + 0xBA12,0xBA13,0xBA14,0xBA16,0xBA17,0xBA18,0xBA19,0xBA1A,/* 0xD0-0xD7 */ + 0xBA1B,0xBA1C,0xBA1D,0xBA1E,0xBA1F,0xBA20,0xBA21,0xBA22,/* 0xD8-0xDF */ + 0xBA23,0xBA24,0xBA25,0xBA26,0xBA27,0xBA28,0xBA29,0xBA2A,/* 0xE0-0xE7 */ + 0xBA2B,0xBA2C,0xBA2D,0xBA2E,0xBA2F,0xBA30,0xBA31,0xBA32,/* 0xE8-0xEF */ + 0xBA33,0xBA34,0xBA35,0xBA36,0xBA37,0xBA3A,0xBA3B,0xBA3D,/* 0xF0-0xF7 */ + 0xBA3E,0xBA3F,0xBA41,0xBA43,0xBA44,0xBA45,0xBA46,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBA47,0xBA4A,0xBA4C,0xBA4F,0xBA50,0xBA51,0xBA52,/* 0x40-0x47 */ + 0xBA56,0xBA57,0xBA59,0xBA5A,0xBA5B,0xBA5D,0xBA5E,0xBA5F,/* 0x48-0x4F */ + 0xBA60,0xBA61,0xBA62,0xBA63,0xBA66,0xBA6A,0xBA6B,0xBA6C,/* 0x50-0x57 */ + 0xBA6D,0xBA6E,0xBA6F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBA72,0xBA73,0xBA75,0xBA76,0xBA77,0xBA79,0xBA7A,/* 0x60-0x67 */ + 0xBA7B,0xBA7C,0xBA7D,0xBA7E,0xBA7F,0xBA80,0xBA81,0xBA82,/* 0x68-0x6F */ + 0xBA86,0xBA88,0xBA89,0xBA8A,0xBA8B,0xBA8D,0xBA8E,0xBA8F,/* 0x70-0x77 */ + 0xBA90,0xBA91,0xBA92,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBA93,0xBA94,0xBA95,0xBA96,0xBA97,0xBA98,0xBA99,/* 0x80-0x87 */ + 0xBA9A,0xBA9B,0xBA9C,0xBA9D,0xBA9E,0xBA9F,0xBAA0,0xBAA1,/* 0x88-0x8F */ + 0xBAA2,0xBAA3,0xBAA4,0xBAA5,0xBAA6,0xBAA7,0xBAAA,0xBAAD,/* 0x90-0x97 */ + 0xBAAE,0xBAAF,0xBAB1,0xBAB3,0xBAB4,0xBAB5,0xBAB6,0xBAB7,/* 0x98-0x9F */ + 0xBABA,0xBABC,0xBABE,0xBABF,0xBAC0,0xBAC1,0xBAC2,0xBAC3,/* 0xA0-0xA7 */ + 0xBAC5,0xBAC6,0xBAC7,0xBAC9,0xBACA,0xBACB,0xBACC,0xBACD,/* 0xA8-0xAF */ + 0xBACE,0xBACF,0xBAD0,0xBAD1,0xBAD2,0xBAD3,0xBAD4,0xBAD5,/* 0xB0-0xB7 */ + 0xBAD6,0xBAD7,0xBADA,0xBADB,0xBADC,0xBADD,0xBADE,0xBADF,/* 0xB8-0xBF */ + 0xBAE0,0xBAE1,0xBAE2,0xBAE3,0xBAE4,0xBAE5,0xBAE6,0xBAE7,/* 0xC0-0xC7 */ + 0xBAE8,0xBAE9,0xBAEA,0xBAEB,0xBAEC,0xBAED,0xBAEE,0xBAEF,/* 0xC8-0xCF */ + 0xBAF0,0xBAF1,0xBAF2,0xBAF3,0xBAF4,0xBAF5,0xBAF6,0xBAF7,/* 0xD0-0xD7 */ + 0xBAF8,0xBAF9,0xBAFA,0xBAFB,0xBAFD,0xBAFE,0xBAFF,0xBB01,/* 0xD8-0xDF */ + 0xBB02,0xBB03,0xBB05,0xBB06,0xBB07,0xBB08,0xBB09,0xBB0A,/* 0xE0-0xE7 */ + 0xBB0B,0xBB0C,0xBB0E,0xBB10,0xBB12,0xBB13,0xBB14,0xBB15,/* 0xE8-0xEF */ + 0xBB16,0xBB17,0xBB19,0xBB1A,0xBB1B,0xBB1D,0xBB1E,0xBB1F,/* 0xF0-0xF7 */ + 0xBB21,0xBB22,0xBB23,0xBB24,0xBB25,0xBB26,0xBB27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBB28,0xBB2A,0xBB2C,0xBB2D,0xBB2E,0xBB2F,0xBB30,/* 0x40-0x47 */ + 0xBB31,0xBB32,0xBB33,0xBB37,0xBB39,0xBB3A,0xBB3F,0xBB40,/* 0x48-0x4F */ + 0xBB41,0xBB42,0xBB43,0xBB46,0xBB48,0xBB4A,0xBB4B,0xBB4C,/* 0x50-0x57 */ + 0xBB4E,0xBB51,0xBB52,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBB53,0xBB55,0xBB56,0xBB57,0xBB59,0xBB5A,0xBB5B,/* 0x60-0x67 */ + 0xBB5C,0xBB5D,0xBB5E,0xBB5F,0xBB60,0xBB62,0xBB64,0xBB65,/* 0x68-0x6F */ + 0xBB66,0xBB67,0xBB68,0xBB69,0xBB6A,0xBB6B,0xBB6D,0xBB6E,/* 0x70-0x77 */ + 0xBB6F,0xBB70,0xBB71,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBB72,0xBB73,0xBB74,0xBB75,0xBB76,0xBB77,0xBB78,/* 0x80-0x87 */ + 0xBB79,0xBB7A,0xBB7B,0xBB7C,0xBB7D,0xBB7E,0xBB7F,0xBB80,/* 0x88-0x8F */ + 
0xBB81,0xBB82,0xBB83,0xBB84,0xBB85,0xBB86,0xBB87,0xBB89,/* 0x90-0x97 */ + 0xBB8A,0xBB8B,0xBB8D,0xBB8E,0xBB8F,0xBB91,0xBB92,0xBB93,/* 0x98-0x9F */ + 0xBB94,0xBB95,0xBB96,0xBB97,0xBB98,0xBB99,0xBB9A,0xBB9B,/* 0xA0-0xA7 */ + 0xBB9C,0xBB9D,0xBB9E,0xBB9F,0xBBA0,0xBBA1,0xBBA2,0xBBA3,/* 0xA8-0xAF */ + 0xBBA5,0xBBA6,0xBBA7,0xBBA9,0xBBAA,0xBBAB,0xBBAD,0xBBAE,/* 0xB0-0xB7 */ + 0xBBAF,0xBBB0,0xBBB1,0xBBB2,0xBBB3,0xBBB5,0xBBB6,0xBBB8,/* 0xB8-0xBF */ + 0xBBB9,0xBBBA,0xBBBB,0xBBBC,0xBBBD,0xBBBE,0xBBBF,0xBBC1,/* 0xC0-0xC7 */ + 0xBBC2,0xBBC3,0xBBC5,0xBBC6,0xBBC7,0xBBC9,0xBBCA,0xBBCB,/* 0xC8-0xCF */ + 0xBBCC,0xBBCD,0xBBCE,0xBBCF,0xBBD1,0xBBD2,0xBBD4,0xBBD5,/* 0xD0-0xD7 */ + 0xBBD6,0xBBD7,0xBBD8,0xBBD9,0xBBDA,0xBBDB,0xBBDC,0xBBDD,/* 0xD8-0xDF */ + 0xBBDE,0xBBDF,0xBBE0,0xBBE1,0xBBE2,0xBBE3,0xBBE4,0xBBE5,/* 0xE0-0xE7 */ + 0xBBE6,0xBBE7,0xBBE8,0xBBE9,0xBBEA,0xBBEB,0xBBEC,0xBBED,/* 0xE8-0xEF */ + 0xBBEE,0xBBEF,0xBBF0,0xBBF1,0xBBF2,0xBBF3,0xBBF4,0xBBF5,/* 0xF0-0xF7 */ + 0xBBF6,0xBBF7,0xBBFA,0xBBFB,0xBBFD,0xBBFE,0xBC01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBC03,0xBC04,0xBC05,0xBC06,0xBC07,0xBC0A,0xBC0E,/* 0x40-0x47 */ + 0xBC10,0xBC12,0xBC13,0xBC19,0xBC1A,0xBC20,0xBC21,0xBC22,/* 0x48-0x4F */ + 0xBC23,0xBC26,0xBC28,0xBC2A,0xBC2B,0xBC2C,0xBC2E,0xBC2F,/* 0x50-0x57 */ + 0xBC32,0xBC33,0xBC35,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBC36,0xBC37,0xBC39,0xBC3A,0xBC3B,0xBC3C,0xBC3D,/* 0x60-0x67 */ + 0xBC3E,0xBC3F,0xBC42,0xBC46,0xBC47,0xBC48,0xBC4A,0xBC4B,/* 0x68-0x6F */ + 0xBC4E,0xBC4F,0xBC51,0xBC52,0xBC53,0xBC54,0xBC55,0xBC56,/* 0x70-0x77 */ + 0xBC57,0xBC58,0xBC59,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBC5A,0xBC5B,0xBC5C,0xBC5E,0xBC5F,0xBC60,0xBC61,/* 0x80-0x87 */ + 0xBC62,0xBC63,0xBC64,0xBC65,0xBC66,0xBC67,0xBC68,0xBC69,/* 0x88-0x8F */ + 0xBC6A,0xBC6B,0xBC6C,0xBC6D,0xBC6E,0xBC6F,0xBC70,0xBC71,/* 0x90-0x97 */ + 0xBC72,0xBC73,0xBC74,0xBC75,0xBC76,0xBC77,0xBC78,0xBC79,/* 0x98-0x9F */ + 0xBC7A,0xBC7B,0xBC7C,0xBC7D,0xBC7E,0xBC7F,0xBC80,0xBC81,/* 0xA0-0xA7 */ + 0xBC82,0xBC83,0xBC86,0xBC87,0xBC89,0xBC8A,0xBC8D,0xBC8F,/* 0xA8-0xAF */ + 0xBC90,0xBC91,0xBC92,0xBC93,0xBC96,0xBC98,0xBC9B,0xBC9C,/* 0xB0-0xB7 */ + 0xBC9D,0xBC9E,0xBC9F,0xBCA2,0xBCA3,0xBCA5,0xBCA6,0xBCA9,/* 0xB8-0xBF */ + 0xBCAA,0xBCAB,0xBCAC,0xBCAD,0xBCAE,0xBCAF,0xBCB2,0xBCB6,/* 0xC0-0xC7 */ + 0xBCB7,0xBCB8,0xBCB9,0xBCBA,0xBCBB,0xBCBE,0xBCBF,0xBCC1,/* 0xC8-0xCF */ + 0xBCC2,0xBCC3,0xBCC5,0xBCC6,0xBCC7,0xBCC8,0xBCC9,0xBCCA,/* 0xD0-0xD7 */ + 0xBCCB,0xBCCC,0xBCCE,0xBCD2,0xBCD3,0xBCD4,0xBCD6,0xBCD7,/* 0xD8-0xDF */ + 0xBCD9,0xBCDA,0xBCDB,0xBCDD,0xBCDE,0xBCDF,0xBCE0,0xBCE1,/* 0xE0-0xE7 */ + 0xBCE2,0xBCE3,0xBCE4,0xBCE5,0xBCE6,0xBCE7,0xBCE8,0xBCE9,/* 0xE8-0xEF */ + 0xBCEA,0xBCEB,0xBCEC,0xBCED,0xBCEE,0xBCEF,0xBCF0,0xBCF1,/* 0xF0-0xF7 */ + 0xBCF2,0xBCF3,0xBCF7,0xBCF9,0xBCFA,0xBCFB,0xBCFD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBCFE,0xBCFF,0xBD00,0xBD01,0xBD02,0xBD03,0xBD06,/* 0x40-0x47 */ + 0xBD08,0xBD0A,0xBD0B,0xBD0C,0xBD0D,0xBD0E,0xBD0F,0xBD11,/* 0x48-0x4F */ + 0xBD12,0xBD13,0xBD15,0xBD16,0xBD17,0xBD18,0xBD19,0xBD1A,/* 0x50-0x57 */ + 0xBD1B,0xBD1C,0xBD1D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBD1E,0xBD1F,0xBD20,0xBD21,0xBD22,0xBD23,0xBD25,/* 0x60-0x67 */ + 0xBD26,0xBD27,0xBD28,0xBD29,0xBD2A,0xBD2B,0xBD2D,0xBD2E,/* 0x68-0x6F */ + 0xBD2F,0xBD30,0xBD31,0xBD32,0xBD33,0xBD34,0xBD35,0xBD36,/* 0x70-0x77 */ + 0xBD37,0xBD38,0xBD39,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBD3A,0xBD3B,0xBD3C,0xBD3D,0xBD3E,0xBD3F,0xBD41,/* 0x80-0x87 */ + 0xBD42,0xBD43,0xBD44,0xBD45,0xBD46,0xBD47,0xBD4A,0xBD4B,/* 0x88-0x8F */ + 0xBD4D,0xBD4E,0xBD4F,0xBD51,0xBD52,0xBD53,0xBD54,0xBD55,/* 0x90-0x97 */ + 0xBD56,0xBD57,0xBD5A,0xBD5B,0xBD5C,0xBD5D,0xBD5E,0xBD5F,/* 0x98-0x9F */ + 0xBD60,0xBD61,0xBD62,0xBD63,0xBD65,0xBD66,0xBD67,0xBD69,/* 0xA0-0xA7 */ + 0xBD6A,0xBD6B,0xBD6C,0xBD6D,0xBD6E,0xBD6F,0xBD70,0xBD71,/* 0xA8-0xAF */ + 0xBD72,0xBD73,0xBD74,0xBD75,0xBD76,0xBD77,0xBD78,0xBD79,/* 0xB0-0xB7 */ + 0xBD7A,0xBD7B,0xBD7C,0xBD7D,0xBD7E,0xBD7F,0xBD82,0xBD83,/* 0xB8-0xBF */ + 0xBD85,0xBD86,0xBD8B,0xBD8C,0xBD8D,0xBD8E,0xBD8F,0xBD92,/* 0xC0-0xC7 */ + 0xBD94,0xBD96,0xBD97,0xBD98,0xBD9B,0xBD9D,0xBD9E,0xBD9F,/* 0xC8-0xCF */ + 0xBDA0,0xBDA1,0xBDA2,0xBDA3,0xBDA5,0xBDA6,0xBDA7,0xBDA8,/* 0xD0-0xD7 */ + 0xBDA9,0xBDAA,0xBDAB,0xBDAC,0xBDAD,0xBDAE,0xBDAF,0xBDB1,/* 0xD8-0xDF */ + 0xBDB2,0xBDB3,0xBDB4,0xBDB5,0xBDB6,0xBDB7,0xBDB9,0xBDBA,/* 0xE0-0xE7 */ + 0xBDBB,0xBDBC,0xBDBD,0xBDBE,0xBDBF,0xBDC0,0xBDC1,0xBDC2,/* 0xE8-0xEF */ + 0xBDC3,0xBDC4,0xBDC5,0xBDC6,0xBDC7,0xBDC8,0xBDC9,0xBDCA,/* 0xF0-0xF7 */ + 0xBDCB,0xBDCC,0xBDCD,0xBDCE,0xBDCF,0xBDD0,0xBDD1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBDD2,0xBDD3,0xBDD6,0xBDD7,0xBDD9,0xBDDA,0xBDDB,/* 0x40-0x47 */ + 0xBDDD,0xBDDE,0xBDDF,0xBDE0,0xBDE1,0xBDE2,0xBDE3,0xBDE4,/* 0x48-0x4F */ + 0xBDE5,0xBDE6,0xBDE7,0xBDE8,0xBDEA,0xBDEB,0xBDEC,0xBDED,/* 0x50-0x57 */ + 0xBDEE,0xBDEF,0xBDF1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBDF2,0xBDF3,0xBDF5,0xBDF6,0xBDF7,0xBDF9,0xBDFA,/* 0x60-0x67 */ + 0xBDFB,0xBDFC,0xBDFD,0xBDFE,0xBDFF,0xBE01,0xBE02,0xBE04,/* 0x68-0x6F */ + 0xBE06,0xBE07,0xBE08,0xBE09,0xBE0A,0xBE0B,0xBE0E,0xBE0F,/* 0x70-0x77 */ + 0xBE11,0xBE12,0xBE13,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 
0x0000,0xBE15,0xBE16,0xBE17,0xBE18,0xBE19,0xBE1A,0xBE1B,/* 0x80-0x87 */ + 0xBE1E,0xBE20,0xBE21,0xBE22,0xBE23,0xBE24,0xBE25,0xBE26,/* 0x88-0x8F */ + 0xBE27,0xBE28,0xBE29,0xBE2A,0xBE2B,0xBE2C,0xBE2D,0xBE2E,/* 0x90-0x97 */ + 0xBE2F,0xBE30,0xBE31,0xBE32,0xBE33,0xBE34,0xBE35,0xBE36,/* 0x98-0x9F */ + 0xBE37,0xBE38,0xBE39,0xBE3A,0xBE3B,0xBE3C,0xBE3D,0xBE3E,/* 0xA0-0xA7 */ + 0xBE3F,0xBE40,0xBE41,0xBE42,0xBE43,0xBE46,0xBE47,0xBE49,/* 0xA8-0xAF */ + 0xBE4A,0xBE4B,0xBE4D,0xBE4F,0xBE50,0xBE51,0xBE52,0xBE53,/* 0xB0-0xB7 */ + 0xBE56,0xBE58,0xBE5C,0xBE5D,0xBE5E,0xBE5F,0xBE62,0xBE63,/* 0xB8-0xBF */ + 0xBE65,0xBE66,0xBE67,0xBE69,0xBE6B,0xBE6C,0xBE6D,0xBE6E,/* 0xC0-0xC7 */ + 0xBE6F,0xBE72,0xBE76,0xBE77,0xBE78,0xBE79,0xBE7A,0xBE7E,/* 0xC8-0xCF */ + 0xBE7F,0xBE81,0xBE82,0xBE83,0xBE85,0xBE86,0xBE87,0xBE88,/* 0xD0-0xD7 */ + 0xBE89,0xBE8A,0xBE8B,0xBE8E,0xBE92,0xBE93,0xBE94,0xBE95,/* 0xD8-0xDF */ + 0xBE96,0xBE97,0xBE9A,0xBE9B,0xBE9C,0xBE9D,0xBE9E,0xBE9F,/* 0xE0-0xE7 */ + 0xBEA0,0xBEA1,0xBEA2,0xBEA3,0xBEA4,0xBEA5,0xBEA6,0xBEA7,/* 0xE8-0xEF */ + 0xBEA9,0xBEAA,0xBEAB,0xBEAC,0xBEAD,0xBEAE,0xBEAF,0xBEB0,/* 0xF0-0xF7 */ + 0xBEB1,0xBEB2,0xBEB3,0xBEB4,0xBEB5,0xBEB6,0xBEB7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBEB8,0xBEB9,0xBEBA,0xBEBB,0xBEBC,0xBEBD,0xBEBE,/* 0x40-0x47 */ + 0xBEBF,0xBEC0,0xBEC1,0xBEC2,0xBEC3,0xBEC4,0xBEC5,0xBEC6,/* 0x48-0x4F */ + 0xBEC7,0xBEC8,0xBEC9,0xBECA,0xBECB,0xBECC,0xBECD,0xBECE,/* 0x50-0x57 */ + 0xBECF,0xBED2,0xBED3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBED5,0xBED6,0xBED9,0xBEDA,0xBEDB,0xBEDC,0xBEDD,/* 0x60-0x67 */ + 0xBEDE,0xBEDF,0xBEE1,0xBEE2,0xBEE6,0xBEE7,0xBEE8,0xBEE9,/* 0x68-0x6F */ + 0xBEEA,0xBEEB,0xBEED,0xBEEE,0xBEEF,0xBEF0,0xBEF1,0xBEF2,/* 0x70-0x77 */ + 0xBEF3,0xBEF4,0xBEF5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBEF6,0xBEF7,0xBEF8,0xBEF9,0xBEFA,0xBEFB,0xBEFC,/* 0x80-0x87 */ + 0xBEFD,0xBEFE,0xBEFF,0xBF00,0xBF02,0xBF03,0xBF04,0xBF05,/* 0x88-0x8F */ + 0xBF06,0xBF07,0xBF0A,0xBF0B,0xBF0C,0xBF0D,0xBF0E,0xBF0F,/* 0x90-0x97 */ + 0xBF10,0xBF11,0xBF12,0xBF13,0xBF14,0xBF15,0xBF16,0xBF17,/* 0x98-0x9F */ + 0xBF1A,0xBF1E,0xBF1F,0xBF20,0xBF21,0xBF22,0xBF23,0xBF24,/* 0xA0-0xA7 */ + 0xBF25,0xBF26,0xBF27,0xBF28,0xBF29,0xBF2A,0xBF2B,0xBF2C,/* 0xA8-0xAF */ + 0xBF2D,0xBF2E,0xBF2F,0xBF30,0xBF31,0xBF32,0xBF33,0xBF34,/* 0xB0-0xB7 */ + 0xBF35,0xBF36,0xBF37,0xBF38,0xBF39,0xBF3A,0xBF3B,0xBF3C,/* 0xB8-0xBF */ + 0xBF3D,0xBF3E,0xBF3F,0xBF42,0xBF43,0xBF45,0xBF46,0xBF47,/* 0xC0-0xC7 */ + 0xBF49,0xBF4A,0xBF4B,0xBF4C,0xBF4D,0xBF4E,0xBF4F,0xBF52,/* 0xC8-0xCF */ + 0xBF53,0xBF54,0xBF56,0xBF57,0xBF58,0xBF59,0xBF5A,0xBF5B,/* 0xD0-0xD7 */ + 0xBF5C,0xBF5D,0xBF5E,0xBF5F,0xBF60,0xBF61,0xBF62,0xBF63,/* 0xD8-0xDF */ + 0xBF64,0xBF65,0xBF66,0xBF67,0xBF68,0xBF69,0xBF6A,0xBF6B,/* 0xE0-0xE7 */ + 0xBF6C,0xBF6D,0xBF6E,0xBF6F,0xBF70,0xBF71,0xBF72,0xBF73,/* 0xE8-0xEF */ + 0xBF74,0xBF75,0xBF76,0xBF77,0xBF78,0xBF79,0xBF7A,0xBF7B,/* 0xF0-0xF7 */ + 
0xBF7C,0xBF7D,0xBF7E,0xBF7F,0xBF80,0xBF81,0xBF82,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBF83,0xBF84,0xBF85,0xBF86,0xBF87,0xBF88,0xBF89,/* 0x40-0x47 */ + 0xBF8A,0xBF8B,0xBF8C,0xBF8D,0xBF8E,0xBF8F,0xBF90,0xBF91,/* 0x48-0x4F */ + 0xBF92,0xBF93,0xBF95,0xBF96,0xBF97,0xBF98,0xBF99,0xBF9A,/* 0x50-0x57 */ + 0xBF9B,0xBF9C,0xBF9D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBF9E,0xBF9F,0xBFA0,0xBFA1,0xBFA2,0xBFA3,0xBFA4,/* 0x60-0x67 */ + 0xBFA5,0xBFA6,0xBFA7,0xBFA8,0xBFA9,0xBFAA,0xBFAB,0xBFAC,/* 0x68-0x6F */ + 0xBFAD,0xBFAE,0xBFAF,0xBFB1,0xBFB2,0xBFB3,0xBFB4,0xBFB5,/* 0x70-0x77 */ + 0xBFB6,0xBFB7,0xBFB8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBFB9,0xBFBA,0xBFBB,0xBFBC,0xBFBD,0xBFBE,0xBFBF,/* 0x80-0x87 */ + 0xBFC0,0xBFC1,0xBFC2,0xBFC3,0xBFC4,0xBFC6,0xBFC7,0xBFC8,/* 0x88-0x8F */ + 0xBFC9,0xBFCA,0xBFCB,0xBFCE,0xBFCF,0xBFD1,0xBFD2,0xBFD3,/* 0x90-0x97 */ + 0xBFD5,0xBFD6,0xBFD7,0xBFD8,0xBFD9,0xBFDA,0xBFDB,0xBFDD,/* 0x98-0x9F */ + 0xBFDE,0xBFE0,0xBFE2,0xBFE3,0xBFE4,0xBFE5,0xBFE6,0xBFE7,/* 0xA0-0xA7 */ + 0xBFE8,0xBFE9,0xBFEA,0xBFEB,0xBFEC,0xBFED,0xBFEE,0xBFEF,/* 0xA8-0xAF */ + 0xBFF0,0xBFF1,0xBFF2,0xBFF3,0xBFF4,0xBFF5,0xBFF6,0xBFF7,/* 0xB0-0xB7 */ + 0xBFF8,0xBFF9,0xBFFA,0xBFFB,0xBFFC,0xBFFD,0xBFFE,0xBFFF,/* 0xB8-0xBF */ + 0xC000,0xC001,0xC002,0xC003,0xC004,0xC005,0xC006,0xC007,/* 0xC0-0xC7 */ + 0xC008,0xC009,0xC00A,0xC00B,0xC00C,0xC00D,0xC00E,0xC00F,/* 0xC8-0xCF */ + 0xC010,0xC011,0xC012,0xC013,0xC014,0xC015,0xC016,0xC017,/* 0xD0-0xD7 */ + 0xC018,0xC019,0xC01A,0xC01B,0xC01C,0xC01D,0xC01E,0xC01F,/* 0xD8-0xDF */ + 0xC020,0xC021,0xC022,0xC023,0xC024,0xC025,0xC026,0xC027,/* 0xE0-0xE7 */ + 0xC028,0xC029,0xC02A,0xC02B,0xC02C,0xC02D,0xC02E,0xC02F,/* 0xE8-0xEF */ + 0xC030,0xC031,0xC032,0xC033,0xC034,0xC035,0xC036,0xC037,/* 0xF0-0xF7 */ + 0xC038,0xC039,0xC03A,0xC03B,0xC03D,0xC03E,0xC03F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC040,0xC041,0xC042,0xC043,0xC044,0xC045,0xC046,/* 0x40-0x47 */ + 0xC047,0xC048,0xC049,0xC04A,0xC04B,0xC04C,0xC04D,0xC04E,/* 0x48-0x4F */ + 0xC04F,0xC050,0xC052,0xC053,0xC054,0xC055,0xC056,0xC057,/* 0x50-0x57 */ + 0xC059,0xC05A,0xC05B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC05D,0xC05E,0xC05F,0xC061,0xC062,0xC063,0xC064,/* 0x60-0x67 */ + 0xC065,0xC066,0xC067,0xC06A,0xC06B,0xC06C,0xC06D,0xC06E,/* 
0x68-0x6F */ + 0xC06F,0xC070,0xC071,0xC072,0xC073,0xC074,0xC075,0xC076,/* 0x70-0x77 */ + 0xC077,0xC078,0xC079,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC07A,0xC07B,0xC07C,0xC07D,0xC07E,0xC07F,0xC080,/* 0x80-0x87 */ + 0xC081,0xC082,0xC083,0xC084,0xC085,0xC086,0xC087,0xC088,/* 0x88-0x8F */ + 0xC089,0xC08A,0xC08B,0xC08C,0xC08D,0xC08E,0xC08F,0xC092,/* 0x90-0x97 */ + 0xC093,0xC095,0xC096,0xC097,0xC099,0xC09A,0xC09B,0xC09C,/* 0x98-0x9F */ + 0xC09D,0xC09E,0xC09F,0xC0A2,0xC0A4,0xC0A6,0xC0A7,0xC0A8,/* 0xA0-0xA7 */ + 0xC0A9,0xC0AA,0xC0AB,0xC0AE,0xC0B1,0xC0B2,0xC0B7,0xC0B8,/* 0xA8-0xAF */ + 0xC0B9,0xC0BA,0xC0BB,0xC0BE,0xC0C2,0xC0C3,0xC0C4,0xC0C6,/* 0xB0-0xB7 */ + 0xC0C7,0xC0CA,0xC0CB,0xC0CD,0xC0CE,0xC0CF,0xC0D1,0xC0D2,/* 0xB8-0xBF */ + 0xC0D3,0xC0D4,0xC0D5,0xC0D6,0xC0D7,0xC0DA,0xC0DE,0xC0DF,/* 0xC0-0xC7 */ + 0xC0E0,0xC0E1,0xC0E2,0xC0E3,0xC0E6,0xC0E7,0xC0E9,0xC0EA,/* 0xC8-0xCF */ + 0xC0EB,0xC0ED,0xC0EE,0xC0EF,0xC0F0,0xC0F1,0xC0F2,0xC0F3,/* 0xD0-0xD7 */ + 0xC0F6,0xC0F8,0xC0FA,0xC0FB,0xC0FC,0xC0FD,0xC0FE,0xC0FF,/* 0xD8-0xDF */ + 0xC101,0xC102,0xC103,0xC105,0xC106,0xC107,0xC109,0xC10A,/* 0xE0-0xE7 */ + 0xC10B,0xC10C,0xC10D,0xC10E,0xC10F,0xC111,0xC112,0xC113,/* 0xE8-0xEF */ + 0xC114,0xC116,0xC117,0xC118,0xC119,0xC11A,0xC11B,0xC121,/* 0xF0-0xF7 */ + 0xC122,0xC125,0xC128,0xC129,0xC12A,0xC12B,0xC12E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC132,0xC133,0xC134,0xC135,0xC137,0xC13A,0xC13B,/* 0x40-0x47 */ + 0xC13D,0xC13E,0xC13F,0xC141,0xC142,0xC143,0xC144,0xC145,/* 0x48-0x4F */ + 0xC146,0xC147,0xC14A,0xC14E,0xC14F,0xC150,0xC151,0xC152,/* 0x50-0x57 */ + 0xC153,0xC156,0xC157,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC159,0xC15A,0xC15B,0xC15D,0xC15E,0xC15F,0xC160,/* 0x60-0x67 */ + 0xC161,0xC162,0xC163,0xC166,0xC16A,0xC16B,0xC16C,0xC16D,/* 0x68-0x6F */ + 0xC16E,0xC16F,0xC171,0xC172,0xC173,0xC175,0xC176,0xC177,/* 0x70-0x77 */ + 0xC179,0xC17A,0xC17B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC17C,0xC17D,0xC17E,0xC17F,0xC180,0xC181,0xC182,/* 0x80-0x87 */ + 0xC183,0xC184,0xC186,0xC187,0xC188,0xC189,0xC18A,0xC18B,/* 0x88-0x8F */ + 0xC18F,0xC191,0xC192,0xC193,0xC195,0xC197,0xC198,0xC199,/* 0x90-0x97 */ + 0xC19A,0xC19B,0xC19E,0xC1A0,0xC1A2,0xC1A3,0xC1A4,0xC1A6,/* 0x98-0x9F */ + 0xC1A7,0xC1AA,0xC1AB,0xC1AD,0xC1AE,0xC1AF,0xC1B1,0xC1B2,/* 0xA0-0xA7 */ + 0xC1B3,0xC1B4,0xC1B5,0xC1B6,0xC1B7,0xC1B8,0xC1B9,0xC1BA,/* 0xA8-0xAF */ + 0xC1BB,0xC1BC,0xC1BE,0xC1BF,0xC1C0,0xC1C1,0xC1C2,0xC1C3,/* 0xB0-0xB7 */ + 0xC1C5,0xC1C6,0xC1C7,0xC1C9,0xC1CA,0xC1CB,0xC1CD,0xC1CE,/* 0xB8-0xBF */ + 0xC1CF,0xC1D0,0xC1D1,0xC1D2,0xC1D3,0xC1D5,0xC1D6,0xC1D9,/* 0xC0-0xC7 */ + 0xC1DA,0xC1DB,0xC1DC,0xC1DD,0xC1DE,0xC1DF,0xC1E1,0xC1E2,/* 0xC8-0xCF */ + 0xC1E3,0xC1E5,0xC1E6,0xC1E7,0xC1E9,0xC1EA,0xC1EB,0xC1EC,/* 0xD0-0xD7 */ + 0xC1ED,0xC1EE,0xC1EF,0xC1F2,0xC1F4,0xC1F5,0xC1F6,0xC1F7,/* 0xD8-0xDF */ + 0xC1F8,0xC1F9,0xC1FA,0xC1FB,0xC1FE,0xC1FF,0xC201,0xC202,/* 0xE0-0xE7 */ + 
0xC203,0xC205,0xC206,0xC207,0xC208,0xC209,0xC20A,0xC20B,/* 0xE8-0xEF */ + 0xC20E,0xC210,0xC212,0xC213,0xC214,0xC215,0xC216,0xC217,/* 0xF0-0xF7 */ + 0xC21A,0xC21B,0xC21D,0xC21E,0xC221,0xC222,0xC223,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC224,0xC225,0xC226,0xC227,0xC22A,0xC22C,0xC22E,/* 0x40-0x47 */ + 0xC230,0xC233,0xC235,0xC236,0xC237,0xC238,0xC239,0xC23A,/* 0x48-0x4F */ + 0xC23B,0xC23C,0xC23D,0xC23E,0xC23F,0xC240,0xC241,0xC242,/* 0x50-0x57 */ + 0xC243,0xC244,0xC245,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC246,0xC247,0xC249,0xC24A,0xC24B,0xC24C,0xC24D,/* 0x60-0x67 */ + 0xC24E,0xC24F,0xC252,0xC253,0xC255,0xC256,0xC257,0xC259,/* 0x68-0x6F */ + 0xC25A,0xC25B,0xC25C,0xC25D,0xC25E,0xC25F,0xC261,0xC262,/* 0x70-0x77 */ + 0xC263,0xC264,0xC266,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC267,0xC268,0xC269,0xC26A,0xC26B,0xC26E,0xC26F,/* 0x80-0x87 */ + 0xC271,0xC272,0xC273,0xC275,0xC276,0xC277,0xC278,0xC279,/* 0x88-0x8F */ + 0xC27A,0xC27B,0xC27E,0xC280,0xC282,0xC283,0xC284,0xC285,/* 0x90-0x97 */ + 0xC286,0xC287,0xC28A,0xC28B,0xC28C,0xC28D,0xC28E,0xC28F,/* 0x98-0x9F */ + 0xC291,0xC292,0xC293,0xC294,0xC295,0xC296,0xC297,0xC299,/* 0xA0-0xA7 */ + 0xC29A,0xC29C,0xC29E,0xC29F,0xC2A0,0xC2A1,0xC2A2,0xC2A3,/* 0xA8-0xAF */ + 0xC2A6,0xC2A7,0xC2A9,0xC2AA,0xC2AB,0xC2AE,0xC2AF,0xC2B0,/* 0xB0-0xB7 */ + 0xC2B1,0xC2B2,0xC2B3,0xC2B6,0xC2B8,0xC2BA,0xC2BB,0xC2BC,/* 0xB8-0xBF */ + 0xC2BD,0xC2BE,0xC2BF,0xC2C0,0xC2C1,0xC2C2,0xC2C3,0xC2C4,/* 0xC0-0xC7 */ + 0xC2C5,0xC2C6,0xC2C7,0xC2C8,0xC2C9,0xC2CA,0xC2CB,0xC2CC,/* 0xC8-0xCF */ + 0xC2CD,0xC2CE,0xC2CF,0xC2D0,0xC2D1,0xC2D2,0xC2D3,0xC2D4,/* 0xD0-0xD7 */ + 0xC2D5,0xC2D6,0xC2D7,0xC2D8,0xC2D9,0xC2DA,0xC2DB,0xC2DE,/* 0xD8-0xDF */ + 0xC2DF,0xC2E1,0xC2E2,0xC2E5,0xC2E6,0xC2E7,0xC2E8,0xC2E9,/* 0xE0-0xE7 */ + 0xC2EA,0xC2EE,0xC2F0,0xC2F2,0xC2F3,0xC2F4,0xC2F5,0xC2F7,/* 0xE8-0xEF */ + 0xC2FA,0xC2FD,0xC2FE,0xC2FF,0xC301,0xC302,0xC303,0xC304,/* 0xF0-0xF7 */ + 0xC305,0xC306,0xC307,0xC30A,0xC30B,0xC30E,0xC30F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC310,0xC311,0xC312,0xC316,0xC317,0xC319,0xC31A,/* 0x40-0x47 */ + 0xC31B,0xC31D,0xC31E,0xC31F,0xC320,0xC321,0xC322,0xC323,/* 0x48-0x4F */ + 0xC326,0xC327,0xC32A,0xC32B,0xC32C,0xC32D,0xC32E,0xC32F,/* 0x50-0x57 */ + 0xC330,0xC331,0xC332,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x58-0x5F */ + 0x0000,0xC333,0xC334,0xC335,0xC336,0xC337,0xC338,0xC339,/* 0x60-0x67 */ + 0xC33A,0xC33B,0xC33C,0xC33D,0xC33E,0xC33F,0xC340,0xC341,/* 0x68-0x6F */ + 0xC342,0xC343,0xC344,0xC346,0xC347,0xC348,0xC349,0xC34A,/* 0x70-0x77 */ + 0xC34B,0xC34C,0xC34D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC34E,0xC34F,0xC350,0xC351,0xC352,0xC353,0xC354,/* 0x80-0x87 */ + 0xC355,0xC356,0xC357,0xC358,0xC359,0xC35A,0xC35B,0xC35C,/* 0x88-0x8F */ + 0xC35D,0xC35E,0xC35F,0xC360,0xC361,0xC362,0xC363,0xC364,/* 0x90-0x97 */ + 0xC365,0xC366,0xC367,0xC36A,0xC36B,0xC36D,0xC36E,0xC36F,/* 0x98-0x9F */ + 0xC371,0xC373,0xC374,0xC375,0xC376,0xC377,0xC37A,0xC37B,/* 0xA0-0xA7 */ + 0xC37E,0xC37F,0xC380,0xC381,0xC382,0xC383,0xC385,0xC386,/* 0xA8-0xAF */ + 0xC387,0xC389,0xC38A,0xC38B,0xC38D,0xC38E,0xC38F,0xC390,/* 0xB0-0xB7 */ + 0xC391,0xC392,0xC393,0xC394,0xC395,0xC396,0xC397,0xC398,/* 0xB8-0xBF */ + 0xC399,0xC39A,0xC39B,0xC39C,0xC39D,0xC39E,0xC39F,0xC3A0,/* 0xC0-0xC7 */ + 0xC3A1,0xC3A2,0xC3A3,0xC3A4,0xC3A5,0xC3A6,0xC3A7,0xC3A8,/* 0xC8-0xCF */ + 0xC3A9,0xC3AA,0xC3AB,0xC3AC,0xC3AD,0xC3AE,0xC3AF,0xC3B0,/* 0xD0-0xD7 */ + 0xC3B1,0xC3B2,0xC3B3,0xC3B4,0xC3B5,0xC3B6,0xC3B7,0xC3B8,/* 0xD8-0xDF */ + 0xC3B9,0xC3BA,0xC3BB,0xC3BC,0xC3BD,0xC3BE,0xC3BF,0xC3C1,/* 0xE0-0xE7 */ + 0xC3C2,0xC3C3,0xC3C4,0xC3C5,0xC3C6,0xC3C7,0xC3C8,0xC3C9,/* 0xE8-0xEF */ + 0xC3CA,0xC3CB,0xC3CC,0xC3CD,0xC3CE,0xC3CF,0xC3D0,0xC3D1,/* 0xF0-0xF7 */ + 0xC3D2,0xC3D3,0xC3D4,0xC3D5,0xC3D6,0xC3D7,0xC3DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC3DB,0xC3DD,0xC3DE,0xC3E1,0xC3E3,0xC3E4,0xC3E5,/* 0x40-0x47 */ + 0xC3E6,0xC3E7,0xC3EA,0xC3EB,0xC3EC,0xC3EE,0xC3EF,0xC3F0,/* 0x48-0x4F */ + 0xC3F1,0xC3F2,0xC3F3,0xC3F6,0xC3F7,0xC3F9,0xC3FA,0xC3FB,/* 0x50-0x57 */ + 0xC3FC,0xC3FD,0xC3FE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC3FF,0xC400,0xC401,0xC402,0xC403,0xC404,0xC405,/* 0x60-0x67 */ + 0xC406,0xC407,0xC409,0xC40A,0xC40B,0xC40C,0xC40D,0xC40E,/* 0x68-0x6F */ + 0xC40F,0xC411,0xC412,0xC413,0xC414,0xC415,0xC416,0xC417,/* 0x70-0x77 */ + 0xC418,0xC419,0xC41A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC41B,0xC41C,0xC41D,0xC41E,0xC41F,0xC420,0xC421,/* 0x80-0x87 */ + 0xC422,0xC423,0xC425,0xC426,0xC427,0xC428,0xC429,0xC42A,/* 0x88-0x8F */ + 0xC42B,0xC42D,0xC42E,0xC42F,0xC431,0xC432,0xC433,0xC435,/* 0x90-0x97 */ + 0xC436,0xC437,0xC438,0xC439,0xC43A,0xC43B,0xC43E,0xC43F,/* 0x98-0x9F */ + 0xC440,0xC441,0xC442,0xC443,0xC444,0xC445,0xC446,0xC447,/* 0xA0-0xA7 */ + 0xC449,0xC44A,0xC44B,0xC44C,0xC44D,0xC44E,0xC44F,0xC450,/* 0xA8-0xAF */ + 0xC451,0xC452,0xC453,0xC454,0xC455,0xC456,0xC457,0xC458,/* 0xB0-0xB7 */ + 0xC459,0xC45A,0xC45B,0xC45C,0xC45D,0xC45E,0xC45F,0xC460,/* 0xB8-0xBF */ + 0xC461,0xC462,0xC463,0xC466,0xC467,0xC469,0xC46A,0xC46B,/* 0xC0-0xC7 */ + 0xC46D,0xC46E,0xC46F,0xC470,0xC471,0xC472,0xC473,0xC476,/* 0xC8-0xCF */ + 0xC477,0xC478,0xC47A,0xC47B,0xC47C,0xC47D,0xC47E,0xC47F,/* 0xD0-0xD7 */ + 
0xC481,0xC482,0xC483,0xC484,0xC485,0xC486,0xC487,0xC488,/* 0xD8-0xDF */ + 0xC489,0xC48A,0xC48B,0xC48C,0xC48D,0xC48E,0xC48F,0xC490,/* 0xE0-0xE7 */ + 0xC491,0xC492,0xC493,0xC495,0xC496,0xC497,0xC498,0xC499,/* 0xE8-0xEF */ + 0xC49A,0xC49B,0xC49D,0xC49E,0xC49F,0xC4A0,0xC4A1,0xC4A2,/* 0xF0-0xF7 */ + 0xC4A3,0xC4A4,0xC4A5,0xC4A6,0xC4A7,0xC4A8,0xC4A9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC4AA,0xC4AB,0xC4AC,0xC4AD,0xC4AE,0xC4AF,0xC4B0,/* 0x40-0x47 */ + 0xC4B1,0xC4B2,0xC4B3,0xC4B4,0xC4B5,0xC4B6,0xC4B7,0xC4B9,/* 0x48-0x4F */ + 0xC4BA,0xC4BB,0xC4BD,0xC4BE,0xC4BF,0xC4C0,0xC4C1,0xC4C2,/* 0x50-0x57 */ + 0xC4C3,0xC4C4,0xC4C5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC4C6,0xC4C7,0xC4C8,0xC4C9,0xC4CA,0xC4CB,0xC4CC,/* 0x60-0x67 */ + 0xC4CD,0xC4CE,0xC4CF,0xC4D0,0xC4D1,0xC4D2,0xC4D3,0xC4D4,/* 0x68-0x6F */ + 0xC4D5,0xC4D6,0xC4D7,0xC4D8,0xC4D9,0xC4DA,0xC4DB,0xC4DC,/* 0x70-0x77 */ + 0xC4DD,0xC4DE,0xC4DF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC4E0,0xC4E1,0xC4E2,0xC4E3,0xC4E4,0xC4E5,0xC4E6,/* 0x80-0x87 */ + 0xC4E7,0xC4E8,0xC4EA,0xC4EB,0xC4EC,0xC4ED,0xC4EE,0xC4EF,/* 0x88-0x8F */ + 0xC4F2,0xC4F3,0xC4F5,0xC4F6,0xC4F7,0xC4F9,0xC4FB,0xC4FC,/* 0x90-0x97 */ + 0xC4FD,0xC4FE,0xC502,0xC503,0xC504,0xC505,0xC506,0xC507,/* 0x98-0x9F */ + 0xC508,0xC509,0xC50A,0xC50B,0xC50D,0xC50E,0xC50F,0xC511,/* 0xA0-0xA7 */ + 0xC512,0xC513,0xC515,0xC516,0xC517,0xC518,0xC519,0xC51A,/* 0xA8-0xAF */ + 0xC51B,0xC51D,0xC51E,0xC51F,0xC520,0xC521,0xC522,0xC523,/* 0xB0-0xB7 */ + 0xC524,0xC525,0xC526,0xC527,0xC52A,0xC52B,0xC52D,0xC52E,/* 0xB8-0xBF */ + 0xC52F,0xC531,0xC532,0xC533,0xC534,0xC535,0xC536,0xC537,/* 0xC0-0xC7 */ + 0xC53A,0xC53C,0xC53E,0xC53F,0xC540,0xC541,0xC542,0xC543,/* 0xC8-0xCF */ + 0xC546,0xC547,0xC54B,0xC54F,0xC550,0xC551,0xC552,0xC556,/* 0xD0-0xD7 */ + 0xC55A,0xC55B,0xC55C,0xC55F,0xC562,0xC563,0xC565,0xC566,/* 0xD8-0xDF */ + 0xC567,0xC569,0xC56A,0xC56B,0xC56C,0xC56D,0xC56E,0xC56F,/* 0xE0-0xE7 */ + 0xC572,0xC576,0xC577,0xC578,0xC579,0xC57A,0xC57B,0xC57E,/* 0xE8-0xEF */ + 0xC57F,0xC581,0xC582,0xC583,0xC585,0xC586,0xC588,0xC589,/* 0xF0-0xF7 */ + 0xC58A,0xC58B,0xC58E,0xC590,0xC592,0xC593,0xC594,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC596,0xC599,0xC59A,0xC59B,0xC59D,0xC59E,0xC59F,/* 0x40-0x47 */ + 0xC5A1,0xC5A2,0xC5A3,0xC5A4,0xC5A5,0xC5A6,0xC5A7,0xC5A8,/* 
0x48-0x4F */ + 0xC5AA,0xC5AB,0xC5AC,0xC5AD,0xC5AE,0xC5AF,0xC5B0,0xC5B1,/* 0x50-0x57 */ + 0xC5B2,0xC5B3,0xC5B6,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC5B7,0xC5BA,0xC5BF,0xC5C0,0xC5C1,0xC5C2,0xC5C3,/* 0x60-0x67 */ + 0xC5CB,0xC5CD,0xC5CF,0xC5D2,0xC5D3,0xC5D5,0xC5D6,0xC5D7,/* 0x68-0x6F */ + 0xC5D9,0xC5DA,0xC5DB,0xC5DC,0xC5DD,0xC5DE,0xC5DF,0xC5E2,/* 0x70-0x77 */ + 0xC5E4,0xC5E6,0xC5E7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC5E8,0xC5E9,0xC5EA,0xC5EB,0xC5EF,0xC5F1,0xC5F2,/* 0x80-0x87 */ + 0xC5F3,0xC5F5,0xC5F8,0xC5F9,0xC5FA,0xC5FB,0xC602,0xC603,/* 0x88-0x8F */ + 0xC604,0xC609,0xC60A,0xC60B,0xC60D,0xC60E,0xC60F,0xC611,/* 0x90-0x97 */ + 0xC612,0xC613,0xC614,0xC615,0xC616,0xC617,0xC61A,0xC61D,/* 0x98-0x9F */ + 0xC61E,0xC61F,0xC620,0xC621,0xC622,0xC623,0xC626,0xC627,/* 0xA0-0xA7 */ + 0xC629,0xC62A,0xC62B,0xC62F,0xC631,0xC632,0xC636,0xC638,/* 0xA8-0xAF */ + 0xC63A,0xC63C,0xC63D,0xC63E,0xC63F,0xC642,0xC643,0xC645,/* 0xB0-0xB7 */ + 0xC646,0xC647,0xC649,0xC64A,0xC64B,0xC64C,0xC64D,0xC64E,/* 0xB8-0xBF */ + 0xC64F,0xC652,0xC656,0xC657,0xC658,0xC659,0xC65A,0xC65B,/* 0xC0-0xC7 */ + 0xC65E,0xC65F,0xC661,0xC662,0xC663,0xC664,0xC665,0xC666,/* 0xC8-0xCF */ + 0xC667,0xC668,0xC669,0xC66A,0xC66B,0xC66D,0xC66E,0xC670,/* 0xD0-0xD7 */ + 0xC672,0xC673,0xC674,0xC675,0xC676,0xC677,0xC67A,0xC67B,/* 0xD8-0xDF */ + 0xC67D,0xC67E,0xC67F,0xC681,0xC682,0xC683,0xC684,0xC685,/* 0xE0-0xE7 */ + 0xC686,0xC687,0xC68A,0xC68C,0xC68E,0xC68F,0xC690,0xC691,/* 0xE8-0xEF */ + 0xC692,0xC693,0xC696,0xC697,0xC699,0xC69A,0xC69B,0xC69D,/* 0xF0-0xF7 */ + 0xC69E,0xC69F,0xC6A0,0xC6A1,0xC6A2,0xC6A3,0xC6A6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC6A8,0xC6AA,0xC6AB,0xC6AC,0xC6AD,0xC6AE,0xC6AF,/* 0x40-0x47 */ + 0xC6B2,0xC6B3,0xC6B5,0xC6B6,0xC6B7,0xC6BB,0xC6BC,0xC6BD,/* 0x48-0x4F */ + 0xC6BE,0xC6BF,0xC6C2,0xC6C4,0xC6C6,0xC6C7,0xC6C8,0xC6C9,/* 0x50-0x57 */ + 0xC6CA,0xC6CB,0xC6CE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC6CF,0xC6D1,0xC6D2,0xC6D3,0xC6D5,0xC6D6,0xC6D7,/* 0x60-0x67 */ + 0xC6D8,0xC6D9,0xC6DA,0xC6DB,0xC6DE,0xC6DF,0xC6E2,0xC6E3,/* 0x68-0x6F */ + 0xC6E4,0xC6E5,0xC6E6,0xC6E7,0xC6EA,0xC6EB,0xC6ED,0xC6EE,/* 0x70-0x77 */ + 0xC6EF,0xC6F1,0xC6F2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC6F3,0xC6F4,0xC6F5,0xC6F6,0xC6F7,0xC6FA,0xC6FB,/* 0x80-0x87 */ + 0xC6FC,0xC6FE,0xC6FF,0xC700,0xC701,0xC702,0xC703,0xC706,/* 0x88-0x8F */ + 0xC707,0xC709,0xC70A,0xC70B,0xC70D,0xC70E,0xC70F,0xC710,/* 0x90-0x97 */ + 0xC711,0xC712,0xC713,0xC716,0xC718,0xC71A,0xC71B,0xC71C,/* 0x98-0x9F */ + 0xC71D,0xC71E,0xC71F,0xC722,0xC723,0xC725,0xC726,0xC727,/* 0xA0-0xA7 */ + 0xC729,0xC72A,0xC72B,0xC72C,0xC72D,0xC72E,0xC72F,0xC732,/* 0xA8-0xAF */ + 0xC734,0xC736,0xC738,0xC739,0xC73A,0xC73B,0xC73E,0xC73F,/* 0xB0-0xB7 */ + 0xC741,0xC742,0xC743,0xC745,0xC746,0xC747,0xC748,0xC749,/* 0xB8-0xBF */ + 0xC74B,0xC74E,0xC750,0xC759,0xC75A,0xC75B,0xC75D,0xC75E,/* 0xC0-0xC7 */ + 
0xC75F,0xC761,0xC762,0xC763,0xC764,0xC765,0xC766,0xC767,/* 0xC8-0xCF */ + 0xC769,0xC76A,0xC76C,0xC76D,0xC76E,0xC76F,0xC770,0xC771,/* 0xD0-0xD7 */ + 0xC772,0xC773,0xC776,0xC777,0xC779,0xC77A,0xC77B,0xC77F,/* 0xD8-0xDF */ + 0xC780,0xC781,0xC782,0xC786,0xC78B,0xC78C,0xC78D,0xC78F,/* 0xE0-0xE7 */ + 0xC792,0xC793,0xC795,0xC799,0xC79B,0xC79C,0xC79D,0xC79E,/* 0xE8-0xEF */ + 0xC79F,0xC7A2,0xC7A7,0xC7A8,0xC7A9,0xC7AA,0xC7AB,0xC7AE,/* 0xF0-0xF7 */ + 0xC7AF,0xC7B1,0xC7B2,0xC7B3,0xC7B5,0xC7B6,0xC7B7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC7B8,0xC7B9,0xC7BA,0xC7BB,0xC7BE,0xC7C2,0xC7C3,/* 0x40-0x47 */ + 0xC7C4,0xC7C5,0xC7C6,0xC7C7,0xC7CA,0xC7CB,0xC7CD,0xC7CF,/* 0x48-0x4F */ + 0xC7D1,0xC7D2,0xC7D3,0xC7D4,0xC7D5,0xC7D6,0xC7D7,0xC7D9,/* 0x50-0x57 */ + 0xC7DA,0xC7DB,0xC7DC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC7DE,0xC7DF,0xC7E0,0xC7E1,0xC7E2,0xC7E3,0xC7E5,/* 0x60-0x67 */ + 0xC7E6,0xC7E7,0xC7E9,0xC7EA,0xC7EB,0xC7ED,0xC7EE,0xC7EF,/* 0x68-0x6F */ + 0xC7F0,0xC7F1,0xC7F2,0xC7F3,0xC7F4,0xC7F5,0xC7F6,0xC7F7,/* 0x70-0x77 */ + 0xC7F8,0xC7F9,0xC7FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC7FB,0xC7FC,0xC7FD,0xC7FE,0xC7FF,0xC802,0xC803,/* 0x80-0x87 */ + 0xC805,0xC806,0xC807,0xC809,0xC80B,0xC80C,0xC80D,0xC80E,/* 0x88-0x8F */ + 0xC80F,0xC812,0xC814,0xC817,0xC818,0xC819,0xC81A,0xC81B,/* 0x90-0x97 */ + 0xC81E,0xC81F,0xC821,0xC822,0xC823,0xC825,0xC826,0xC827,/* 0x98-0x9F */ + 0xC828,0xC829,0xC82A,0xC82B,0xC82E,0xC830,0xC832,0xC833,/* 0xA0-0xA7 */ + 0xC834,0xC835,0xC836,0xC837,0xC839,0xC83A,0xC83B,0xC83D,/* 0xA8-0xAF */ + 0xC83E,0xC83F,0xC841,0xC842,0xC843,0xC844,0xC845,0xC846,/* 0xB0-0xB7 */ + 0xC847,0xC84A,0xC84B,0xC84E,0xC84F,0xC850,0xC851,0xC852,/* 0xB8-0xBF */ + 0xC853,0xC855,0xC856,0xC857,0xC858,0xC859,0xC85A,0xC85B,/* 0xC0-0xC7 */ + 0xC85C,0xC85D,0xC85E,0xC85F,0xC860,0xC861,0xC862,0xC863,/* 0xC8-0xCF */ + 0xC864,0xC865,0xC866,0xC867,0xC868,0xC869,0xC86A,0xC86B,/* 0xD0-0xD7 */ + 0xC86C,0xC86D,0xC86E,0xC86F,0xC872,0xC873,0xC875,0xC876,/* 0xD8-0xDF */ + 0xC877,0xC879,0xC87B,0xC87C,0xC87D,0xC87E,0xC87F,0xC882,/* 0xE0-0xE7 */ + 0xC884,0xC888,0xC889,0xC88A,0xC88E,0xC88F,0xC890,0xC891,/* 0xE8-0xEF */ + 0xC892,0xC893,0xC895,0xC896,0xC897,0xC898,0xC899,0xC89A,/* 0xF0-0xF7 */ + 0xC89B,0xC89C,0xC89E,0xC8A0,0xC8A2,0xC8A3,0xC8A4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0xC8A5,0xC8A6,0xC8A7,0xC8A9,0xC8AA,0xC8AB,0xC8AC,/* 0x40-0x47 */ + 0xC8AD,0xC8AE,0xC8AF,0xC8B0,0xC8B1,0xC8B2,0xC8B3,0xC8B4,/* 0x48-0x4F */ + 0xC8B5,0xC8B6,0xC8B7,0xC8B8,0xC8B9,0xC8BA,0xC8BB,0xC8BE,/* 0x50-0x57 */ + 0xC8BF,0xC8C0,0xC8C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC8C2,0xC8C3,0xC8C5,0xC8C6,0xC8C7,0xC8C9,0xC8CA,/* 0x60-0x67 */ + 0xC8CB,0xC8CD,0xC8CE,0xC8CF,0xC8D0,0xC8D1,0xC8D2,0xC8D3,/* 0x68-0x6F */ + 0xC8D6,0xC8D8,0xC8DA,0xC8DB,0xC8DC,0xC8DD,0xC8DE,0xC8DF,/* 0x70-0x77 */ + 0xC8E2,0xC8E3,0xC8E5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC8E6,0xC8E7,0xC8E8,0xC8E9,0xC8EA,0xC8EB,0xC8EC,/* 0x80-0x87 */ + 0xC8ED,0xC8EE,0xC8EF,0xC8F0,0xC8F1,0xC8F2,0xC8F3,0xC8F4,/* 0x88-0x8F */ + 0xC8F6,0xC8F7,0xC8F8,0xC8F9,0xC8FA,0xC8FB,0xC8FE,0xC8FF,/* 0x90-0x97 */ + 0xC901,0xC902,0xC903,0xC907,0xC908,0xC909,0xC90A,0xC90B,/* 0x98-0x9F */ + 0xC90E,0x3000,0x3001,0x3002,0x00B7,0x2025,0x2026,0x00A8,/* 0xA0-0xA7 */ + 0x3003,0x00AD,0x2015,0x2225,0xFF3C,0x223C,0x2018,0x2019,/* 0xA8-0xAF */ + 0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */ + 0x300C,0x300D,0x300E,0x300F,0x3010,0x3011,0x00B1,0x00D7,/* 0xB8-0xBF */ + 0x00F7,0x2260,0x2264,0x2265,0x221E,0x2234,0x00B0,0x2032,/* 0xC0-0xC7 */ + 0x2033,0x2103,0x212B,0xFFE0,0xFFE1,0xFFE5,0x2642,0x2640,/* 0xC8-0xCF */ + 0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,0x2252,0x00A7,/* 0xD0-0xD7 */ + 0x203B,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0xD8-0xDF */ + 0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x2192,0x2190,/* 0xE0-0xE7 */ + 0x2191,0x2193,0x2194,0x3013,0x226A,0x226B,0x221A,0x223D,/* 0xE8-0xEF */ + 0x221D,0x2235,0x222B,0x222C,0x2208,0x220B,0x2286,0x2287,/* 0xF0-0xF7 */ + 0x2282,0x2283,0x222A,0x2229,0x2227,0x2228,0xFFE2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC910,0xC912,0xC913,0xC914,0xC915,0xC916,0xC917,/* 0x40-0x47 */ + 0xC919,0xC91A,0xC91B,0xC91C,0xC91D,0xC91E,0xC91F,0xC920,/* 0x48-0x4F */ + 0xC921,0xC922,0xC923,0xC924,0xC925,0xC926,0xC927,0xC928,/* 0x50-0x57 */ + 0xC929,0xC92A,0xC92B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC92D,0xC92E,0xC92F,0xC930,0xC931,0xC932,0xC933,/* 0x60-0x67 */ + 0xC935,0xC936,0xC937,0xC938,0xC939,0xC93A,0xC93B,0xC93C,/* 0x68-0x6F */ + 0xC93D,0xC93E,0xC93F,0xC940,0xC941,0xC942,0xC943,0xC944,/* 0x70-0x77 */ + 0xC945,0xC946,0xC947,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC948,0xC949,0xC94A,0xC94B,0xC94C,0xC94D,0xC94E,/* 0x80-0x87 */ + 0xC94F,0xC952,0xC953,0xC955,0xC956,0xC957,0xC959,0xC95A,/* 0x88-0x8F */ + 0xC95B,0xC95C,0xC95D,0xC95E,0xC95F,0xC962,0xC964,0xC965,/* 0x90-0x97 */ + 0xC966,0xC967,0xC968,0xC969,0xC96A,0xC96B,0xC96D,0xC96E,/* 0x98-0x9F */ + 0xC96F,0x21D2,0x21D4,0x2200,0x2203,0x00B4,0xFF5E,0x02C7,/* 0xA0-0xA7 */ + 0x02D8,0x02DD,0x02DA,0x02D9,0x00B8,0x02DB,0x00A1,0x00BF,/* 0xA8-0xAF */ + 0x02D0,0x222E,0x2211,0x220F,0x00A4,0x2109,0x2030,0x25C1,/* 0xB0-0xB7 */ + 
0x25C0,0x25B7,0x25B6,0x2664,0x2660,0x2661,0x2665,0x2667,/* 0xB8-0xBF */ + 0x2663,0x2299,0x25C8,0x25A3,0x25D0,0x25D1,0x2592,0x25A4,/* 0xC0-0xC7 */ + 0x25A5,0x25A8,0x25A7,0x25A6,0x25A9,0x2668,0x260F,0x260E,/* 0xC8-0xCF */ + 0x261C,0x261E,0x00B6,0x2020,0x2021,0x2195,0x2197,0x2199,/* 0xD0-0xD7 */ + 0x2196,0x2198,0x266D,0x2669,0x266A,0x266C,0x327F,0x321C,/* 0xD8-0xDF */ + 0x2116,0x33C7,0x2122,0x33C2,0x33D8,0x2121,0x20AC,0x00AE,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC971,0xC972,0xC973,0xC975,0xC976,0xC977,0xC978,/* 0x40-0x47 */ + 0xC979,0xC97A,0xC97B,0xC97D,0xC97E,0xC97F,0xC980,0xC981,/* 0x48-0x4F */ + 0xC982,0xC983,0xC984,0xC985,0xC986,0xC987,0xC98A,0xC98B,/* 0x50-0x57 */ + 0xC98D,0xC98E,0xC98F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC991,0xC992,0xC993,0xC994,0xC995,0xC996,0xC997,/* 0x60-0x67 */ + 0xC99A,0xC99C,0xC99E,0xC99F,0xC9A0,0xC9A1,0xC9A2,0xC9A3,/* 0x68-0x6F */ + 0xC9A4,0xC9A5,0xC9A6,0xC9A7,0xC9A8,0xC9A9,0xC9AA,0xC9AB,/* 0x70-0x77 */ + 0xC9AC,0xC9AD,0xC9AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC9AF,0xC9B0,0xC9B1,0xC9B2,0xC9B3,0xC9B4,0xC9B5,/* 0x80-0x87 */ + 0xC9B6,0xC9B7,0xC9B8,0xC9B9,0xC9BA,0xC9BB,0xC9BC,0xC9BD,/* 0x88-0x8F */ + 0xC9BE,0xC9BF,0xC9C2,0xC9C3,0xC9C5,0xC9C6,0xC9C9,0xC9CB,/* 0x90-0x97 */ + 0xC9CC,0xC9CD,0xC9CE,0xC9CF,0xC9D2,0xC9D4,0xC9D7,0xC9D8,/* 0x98-0x9F */ + 0xC9DB,0xFF01,0xFF02,0xFF03,0xFF04,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */ + 0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */ + 0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */ + 0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */ + 0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */ + 0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */ + 0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */ + 0xFF38,0xFF39,0xFF3A,0xFF3B,0xFFE6,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */ + 0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */ + 0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC9DE,0xC9DF,0xC9E1,0xC9E3,0xC9E5,0xC9E6,0xC9E8,/* 
0x40-0x47 */ + 0xC9E9,0xC9EA,0xC9EB,0xC9EE,0xC9F2,0xC9F3,0xC9F4,0xC9F5,/* 0x48-0x4F */ + 0xC9F6,0xC9F7,0xC9FA,0xC9FB,0xC9FD,0xC9FE,0xC9FF,0xCA01,/* 0x50-0x57 */ + 0xCA02,0xCA03,0xCA04,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCA05,0xCA06,0xCA07,0xCA0A,0xCA0E,0xCA0F,0xCA10,/* 0x60-0x67 */ + 0xCA11,0xCA12,0xCA13,0xCA15,0xCA16,0xCA17,0xCA19,0xCA1A,/* 0x68-0x6F */ + 0xCA1B,0xCA1C,0xCA1D,0xCA1E,0xCA1F,0xCA20,0xCA21,0xCA22,/* 0x70-0x77 */ + 0xCA23,0xCA24,0xCA25,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCA26,0xCA27,0xCA28,0xCA2A,0xCA2B,0xCA2C,0xCA2D,/* 0x80-0x87 */ + 0xCA2E,0xCA2F,0xCA30,0xCA31,0xCA32,0xCA33,0xCA34,0xCA35,/* 0x88-0x8F */ + 0xCA36,0xCA37,0xCA38,0xCA39,0xCA3A,0xCA3B,0xCA3C,0xCA3D,/* 0x90-0x97 */ + 0xCA3E,0xCA3F,0xCA40,0xCA41,0xCA42,0xCA43,0xCA44,0xCA45,/* 0x98-0x9F */ + 0xCA46,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,0x3137,/* 0xA0-0xA7 */ + 0x3138,0x3139,0x313A,0x313B,0x313C,0x313D,0x313E,0x313F,/* 0xA8-0xAF */ + 0x3140,0x3141,0x3142,0x3143,0x3144,0x3145,0x3146,0x3147,/* 0xB0-0xB7 */ + 0x3148,0x3149,0x314A,0x314B,0x314C,0x314D,0x314E,0x314F,/* 0xB8-0xBF */ + 0x3150,0x3151,0x3152,0x3153,0x3154,0x3155,0x3156,0x3157,/* 0xC0-0xC7 */ + 0x3158,0x3159,0x315A,0x315B,0x315C,0x315D,0x315E,0x315F,/* 0xC8-0xCF */ + 0x3160,0x3161,0x3162,0x3163,0x3164,0x3165,0x3166,0x3167,/* 0xD0-0xD7 */ + 0x3168,0x3169,0x316A,0x316B,0x316C,0x316D,0x316E,0x316F,/* 0xD8-0xDF */ + 0x3170,0x3171,0x3172,0x3173,0x3174,0x3175,0x3176,0x3177,/* 0xE0-0xE7 */ + 0x3178,0x3179,0x317A,0x317B,0x317C,0x317D,0x317E,0x317F,/* 0xE8-0xEF */ + 0x3180,0x3181,0x3182,0x3183,0x3184,0x3185,0x3186,0x3187,/* 0xF0-0xF7 */ + 0x3188,0x3189,0x318A,0x318B,0x318C,0x318D,0x318E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCA47,0xCA48,0xCA49,0xCA4A,0xCA4B,0xCA4E,0xCA4F,/* 0x40-0x47 */ + 0xCA51,0xCA52,0xCA53,0xCA55,0xCA56,0xCA57,0xCA58,0xCA59,/* 0x48-0x4F */ + 0xCA5A,0xCA5B,0xCA5E,0xCA62,0xCA63,0xCA64,0xCA65,0xCA66,/* 0x50-0x57 */ + 0xCA67,0xCA69,0xCA6A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCA6B,0xCA6C,0xCA6D,0xCA6E,0xCA6F,0xCA70,0xCA71,/* 0x60-0x67 */ + 0xCA72,0xCA73,0xCA74,0xCA75,0xCA76,0xCA77,0xCA78,0xCA79,/* 0x68-0x6F */ + 0xCA7A,0xCA7B,0xCA7C,0xCA7E,0xCA7F,0xCA80,0xCA81,0xCA82,/* 0x70-0x77 */ + 0xCA83,0xCA85,0xCA86,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCA87,0xCA88,0xCA89,0xCA8A,0xCA8B,0xCA8C,0xCA8D,/* 0x80-0x87 */ + 0xCA8E,0xCA8F,0xCA90,0xCA91,0xCA92,0xCA93,0xCA94,0xCA95,/* 0x88-0x8F */ + 0xCA96,0xCA97,0xCA99,0xCA9A,0xCA9B,0xCA9C,0xCA9D,0xCA9E,/* 0x90-0x97 */ + 0xCA9F,0xCAA0,0xCAA1,0xCAA2,0xCAA3,0xCAA4,0xCAA5,0xCAA6,/* 0x98-0x9F */ + 0xCAA7,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */ + 0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,0x2167,/* 0xB0-0xB7 */ + 0x2168,0x2169,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */ + 
0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xC0-0xC7 */ + 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xC8-0xCF */ + 0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xD0-0xD7 */ + 0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xE0-0xE7 */ + 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xE8-0xEF */ + 0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xF0-0xF7 */ + 0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCAA8,0xCAA9,0xCAAA,0xCAAB,0xCAAC,0xCAAD,0xCAAE,/* 0x40-0x47 */ + 0xCAAF,0xCAB0,0xCAB1,0xCAB2,0xCAB3,0xCAB4,0xCAB5,0xCAB6,/* 0x48-0x4F */ + 0xCAB7,0xCAB8,0xCAB9,0xCABA,0xCABB,0xCABE,0xCABF,0xCAC1,/* 0x50-0x57 */ + 0xCAC2,0xCAC3,0xCAC5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCAC6,0xCAC7,0xCAC8,0xCAC9,0xCACA,0xCACB,0xCACE,/* 0x60-0x67 */ + 0xCAD0,0xCAD2,0xCAD4,0xCAD5,0xCAD6,0xCAD7,0xCADA,0xCADB,/* 0x68-0x6F */ + 0xCADC,0xCADD,0xCADE,0xCADF,0xCAE1,0xCAE2,0xCAE3,0xCAE4,/* 0x70-0x77 */ + 0xCAE5,0xCAE6,0xCAE7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCAE8,0xCAE9,0xCAEA,0xCAEB,0xCAED,0xCAEE,0xCAEF,/* 0x80-0x87 */ + 0xCAF0,0xCAF1,0xCAF2,0xCAF3,0xCAF5,0xCAF6,0xCAF7,0xCAF8,/* 0x88-0x8F */ + 0xCAF9,0xCAFA,0xCAFB,0xCAFC,0xCAFD,0xCAFE,0xCAFF,0xCB00,/* 0x90-0x97 */ + 0xCB01,0xCB02,0xCB03,0xCB04,0xCB05,0xCB06,0xCB07,0xCB09,/* 0x98-0x9F */ + 0xCB0A,0x2500,0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,/* 0xA0-0xA7 */ + 0x252C,0x2524,0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,/* 0xA8-0xAF */ + 0x251B,0x2517,0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,/* 0xB0-0xB7 */ + 0x252F,0x2528,0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,/* 0xB8-0xBF */ + 0x2542,0x2512,0x2511,0x251A,0x2519,0x2516,0x2515,0x250E,/* 0xC0-0xC7 */ + 0x250D,0x251E,0x251F,0x2521,0x2522,0x2526,0x2527,0x2529,/* 0xC8-0xCF */ + 0x252A,0x252D,0x252E,0x2531,0x2532,0x2535,0x2536,0x2539,/* 0xD0-0xD7 */ + 0x253A,0x253D,0x253E,0x2540,0x2541,0x2543,0x2544,0x2545,/* 0xD8-0xDF */ + 0x2546,0x2547,0x2548,0x2549,0x254A,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCB0B,0xCB0C,0xCB0D,0xCB0E,0xCB0F,0xCB11,0xCB12,/* 0x40-0x47 */ + 0xCB13,0xCB15,0xCB16,0xCB17,0xCB19,0xCB1A,0xCB1B,0xCB1C,/* 
0x48-0x4F */ + 0xCB1D,0xCB1E,0xCB1F,0xCB22,0xCB23,0xCB24,0xCB25,0xCB26,/* 0x50-0x57 */ + 0xCB27,0xCB28,0xCB29,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCB2A,0xCB2B,0xCB2C,0xCB2D,0xCB2E,0xCB2F,0xCB30,/* 0x60-0x67 */ + 0xCB31,0xCB32,0xCB33,0xCB34,0xCB35,0xCB36,0xCB37,0xCB38,/* 0x68-0x6F */ + 0xCB39,0xCB3A,0xCB3B,0xCB3C,0xCB3D,0xCB3E,0xCB3F,0xCB40,/* 0x70-0x77 */ + 0xCB42,0xCB43,0xCB44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCB45,0xCB46,0xCB47,0xCB4A,0xCB4B,0xCB4D,0xCB4E,/* 0x80-0x87 */ + 0xCB4F,0xCB51,0xCB52,0xCB53,0xCB54,0xCB55,0xCB56,0xCB57,/* 0x88-0x8F */ + 0xCB5A,0xCB5B,0xCB5C,0xCB5E,0xCB5F,0xCB60,0xCB61,0xCB62,/* 0x90-0x97 */ + 0xCB63,0xCB65,0xCB66,0xCB67,0xCB68,0xCB69,0xCB6A,0xCB6B,/* 0x98-0x9F */ + 0xCB6C,0x3395,0x3396,0x3397,0x2113,0x3398,0x33C4,0x33A3,/* 0xA0-0xA7 */ + 0x33A4,0x33A5,0x33A6,0x3399,0x339A,0x339B,0x339C,0x339D,/* 0xA8-0xAF */ + 0x339E,0x339F,0x33A0,0x33A1,0x33A2,0x33CA,0x338D,0x338E,/* 0xB0-0xB7 */ + 0x338F,0x33CF,0x3388,0x3389,0x33C8,0x33A7,0x33A8,0x33B0,/* 0xB8-0xBF */ + 0x33B1,0x33B2,0x33B3,0x33B4,0x33B5,0x33B6,0x33B7,0x33B8,/* 0xC0-0xC7 */ + 0x33B9,0x3380,0x3381,0x3382,0x3383,0x3384,0x33BA,0x33BB,/* 0xC8-0xCF */ + 0x33BC,0x33BD,0x33BE,0x33BF,0x3390,0x3391,0x3392,0x3393,/* 0xD0-0xD7 */ + 0x3394,0x2126,0x33C0,0x33C1,0x338A,0x338B,0x338C,0x33D6,/* 0xD8-0xDF */ + 0x33C5,0x33AD,0x33AE,0x33AF,0x33DB,0x33A9,0x33AA,0x33AB,/* 0xE0-0xE7 */ + 0x33AC,0x33DD,0x33D0,0x33D3,0x33C3,0x33C9,0x33DC,0x33C6,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCB6D,0xCB6E,0xCB6F,0xCB70,0xCB71,0xCB72,0xCB73,/* 0x40-0x47 */ + 0xCB74,0xCB75,0xCB76,0xCB77,0xCB7A,0xCB7B,0xCB7C,0xCB7D,/* 0x48-0x4F */ + 0xCB7E,0xCB7F,0xCB80,0xCB81,0xCB82,0xCB83,0xCB84,0xCB85,/* 0x50-0x57 */ + 0xCB86,0xCB87,0xCB88,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCB89,0xCB8A,0xCB8B,0xCB8C,0xCB8D,0xCB8E,0xCB8F,/* 0x60-0x67 */ + 0xCB90,0xCB91,0xCB92,0xCB93,0xCB94,0xCB95,0xCB96,0xCB97,/* 0x68-0x6F */ + 0xCB98,0xCB99,0xCB9A,0xCB9B,0xCB9D,0xCB9E,0xCB9F,0xCBA0,/* 0x70-0x77 */ + 0xCBA1,0xCBA2,0xCBA3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCBA4,0xCBA5,0xCBA6,0xCBA7,0xCBA8,0xCBA9,0xCBAA,/* 0x80-0x87 */ + 0xCBAB,0xCBAC,0xCBAD,0xCBAE,0xCBAF,0xCBB0,0xCBB1,0xCBB2,/* 0x88-0x8F */ + 0xCBB3,0xCBB4,0xCBB5,0xCBB6,0xCBB7,0xCBB9,0xCBBA,0xCBBB,/* 0x90-0x97 */ + 0xCBBC,0xCBBD,0xCBBE,0xCBBF,0xCBC0,0xCBC1,0xCBC2,0xCBC3,/* 0x98-0x9F */ + 0xCBC4,0x00C6,0x00D0,0x00AA,0x0126,0x0000,0x0132,0x0000,/* 0xA0-0xA7 */ + 0x013F,0x0141,0x00D8,0x0152,0x00BA,0x00DE,0x0166,0x014A,/* 0xA8-0xAF */ + 0x0000,0x3260,0x3261,0x3262,0x3263,0x3264,0x3265,0x3266,/* 0xB0-0xB7 */ + 0x3267,0x3268,0x3269,0x326A,0x326B,0x326C,0x326D,0x326E,/* 0xB8-0xBF */ + 0x326F,0x3270,0x3271,0x3272,0x3273,0x3274,0x3275,0x3276,/* 0xC0-0xC7 */ + 0x3277,0x3278,0x3279,0x327A,0x327B,0x24D0,0x24D1,0x24D2,/* 0xC8-0xCF */ + 0x24D3,0x24D4,0x24D5,0x24D6,0x24D7,0x24D8,0x24D9,0x24DA,/* 0xD0-0xD7 */ + 
0x24DB,0x24DC,0x24DD,0x24DE,0x24DF,0x24E0,0x24E1,0x24E2,/* 0xD8-0xDF */ + 0x24E3,0x24E4,0x24E5,0x24E6,0x24E7,0x24E8,0x24E9,0x2460,/* 0xE0-0xE7 */ + 0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,0x2468,/* 0xE8-0xEF */ + 0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x00BD,0x2153,/* 0xF0-0xF7 */ + 0x2154,0x00BC,0x00BE,0x215B,0x215C,0x215D,0x215E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCBC5,0xCBC6,0xCBC7,0xCBC8,0xCBC9,0xCBCA,0xCBCB,/* 0x40-0x47 */ + 0xCBCC,0xCBCD,0xCBCE,0xCBCF,0xCBD0,0xCBD1,0xCBD2,0xCBD3,/* 0x48-0x4F */ + 0xCBD5,0xCBD6,0xCBD7,0xCBD8,0xCBD9,0xCBDA,0xCBDB,0xCBDC,/* 0x50-0x57 */ + 0xCBDD,0xCBDE,0xCBDF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCBE0,0xCBE1,0xCBE2,0xCBE3,0xCBE5,0xCBE6,0xCBE8,/* 0x60-0x67 */ + 0xCBEA,0xCBEB,0xCBEC,0xCBED,0xCBEE,0xCBEF,0xCBF0,0xCBF1,/* 0x68-0x6F */ + 0xCBF2,0xCBF3,0xCBF4,0xCBF5,0xCBF6,0xCBF7,0xCBF8,0xCBF9,/* 0x70-0x77 */ + 0xCBFA,0xCBFB,0xCBFC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCBFD,0xCBFE,0xCBFF,0xCC00,0xCC01,0xCC02,0xCC03,/* 0x80-0x87 */ + 0xCC04,0xCC05,0xCC06,0xCC07,0xCC08,0xCC09,0xCC0A,0xCC0B,/* 0x88-0x8F */ + 0xCC0E,0xCC0F,0xCC11,0xCC12,0xCC13,0xCC15,0xCC16,0xCC17,/* 0x90-0x97 */ + 0xCC18,0xCC19,0xCC1A,0xCC1B,0xCC1E,0xCC1F,0xCC20,0xCC23,/* 0x98-0x9F */ + 0xCC24,0x00E6,0x0111,0x00F0,0x0127,0x0131,0x0133,0x0138,/* 0xA0-0xA7 */ + 0x0140,0x0142,0x00F8,0x0153,0x00DF,0x00FE,0x0167,0x014B,/* 0xA8-0xAF */ + 0x0149,0x3200,0x3201,0x3202,0x3203,0x3204,0x3205,0x3206,/* 0xB0-0xB7 */ + 0x3207,0x3208,0x3209,0x320A,0x320B,0x320C,0x320D,0x320E,/* 0xB8-0xBF */ + 0x320F,0x3210,0x3211,0x3212,0x3213,0x3214,0x3215,0x3216,/* 0xC0-0xC7 */ + 0x3217,0x3218,0x3219,0x321A,0x321B,0x249C,0x249D,0x249E,/* 0xC8-0xCF */ + 0x249F,0x24A0,0x24A1,0x24A2,0x24A3,0x24A4,0x24A5,0x24A6,/* 0xD0-0xD7 */ + 0x24A7,0x24A8,0x24A9,0x24AA,0x24AB,0x24AC,0x24AD,0x24AE,/* 0xD8-0xDF */ + 0x24AF,0x24B0,0x24B1,0x24B2,0x24B3,0x24B4,0x24B5,0x2474,/* 0xE0-0xE7 */ + 0x2475,0x2476,0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,/* 0xE8-0xEF */ + 0x247D,0x247E,0x247F,0x2480,0x2481,0x2482,0x00B9,0x00B2,/* 0xF0-0xF7 */ + 0x00B3,0x2074,0x207F,0x2081,0x2082,0x2083,0x2084,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCC25,0xCC26,0xCC2A,0xCC2B,0xCC2D,0xCC2F,0xCC31,/* 0x40-0x47 */ + 0xCC32,0xCC33,0xCC34,0xCC35,0xCC36,0xCC37,0xCC3A,0xCC3F,/* 
0x48-0x4F */ + 0xCC40,0xCC41,0xCC42,0xCC43,0xCC46,0xCC47,0xCC49,0xCC4A,/* 0x50-0x57 */ + 0xCC4B,0xCC4D,0xCC4E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCC4F,0xCC50,0xCC51,0xCC52,0xCC53,0xCC56,0xCC5A,/* 0x60-0x67 */ + 0xCC5B,0xCC5C,0xCC5D,0xCC5E,0xCC5F,0xCC61,0xCC62,0xCC63,/* 0x68-0x6F */ + 0xCC65,0xCC67,0xCC69,0xCC6A,0xCC6B,0xCC6C,0xCC6D,0xCC6E,/* 0x70-0x77 */ + 0xCC6F,0xCC71,0xCC72,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCC73,0xCC74,0xCC76,0xCC77,0xCC78,0xCC79,0xCC7A,/* 0x80-0x87 */ + 0xCC7B,0xCC7C,0xCC7D,0xCC7E,0xCC7F,0xCC80,0xCC81,0xCC82,/* 0x88-0x8F */ + 0xCC83,0xCC84,0xCC85,0xCC86,0xCC87,0xCC88,0xCC89,0xCC8A,/* 0x90-0x97 */ + 0xCC8B,0xCC8C,0xCC8D,0xCC8E,0xCC8F,0xCC90,0xCC91,0xCC92,/* 0x98-0x9F */ + 0xCC93,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */ + 0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */ + 0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */ + 0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */ + 0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */ + 0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */ + 0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */ + 0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */ + 0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */ + 0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */ + 0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCC94,0xCC95,0xCC96,0xCC97,0xCC9A,0xCC9B,0xCC9D,/* 0x40-0x47 */ + 0xCC9E,0xCC9F,0xCCA1,0xCCA2,0xCCA3,0xCCA4,0xCCA5,0xCCA6,/* 0x48-0x4F */ + 0xCCA7,0xCCAA,0xCCAE,0xCCAF,0xCCB0,0xCCB1,0xCCB2,0xCCB3,/* 0x50-0x57 */ + 0xCCB6,0xCCB7,0xCCB9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCCBA,0xCCBB,0xCCBD,0xCCBE,0xCCBF,0xCCC0,0xCCC1,/* 0x60-0x67 */ + 0xCCC2,0xCCC3,0xCCC6,0xCCC8,0xCCCA,0xCCCB,0xCCCC,0xCCCD,/* 0x68-0x6F */ + 0xCCCE,0xCCCF,0xCCD1,0xCCD2,0xCCD3,0xCCD5,0xCCD6,0xCCD7,/* 0x70-0x77 */ + 0xCCD8,0xCCD9,0xCCDA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCCDB,0xCCDC,0xCCDD,0xCCDE,0xCCDF,0xCCE0,0xCCE1,/* 0x80-0x87 */ + 0xCCE2,0xCCE3,0xCCE5,0xCCE6,0xCCE7,0xCCE8,0xCCE9,0xCCEA,/* 0x88-0x8F */ + 0xCCEB,0xCCED,0xCCEE,0xCCEF,0xCCF1,0xCCF2,0xCCF3,0xCCF4,/* 0x90-0x97 */ + 0xCCF5,0xCCF6,0xCCF7,0xCCF8,0xCCF9,0xCCFA,0xCCFB,0xCCFC,/* 0x98-0x9F */ + 0xCCFD,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */ + 0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */ + 0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */ + 0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */ + 0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */ + 0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */ + 
0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */ + 0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */ + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCCFE,0xCCFF,0xCD00,0xCD02,0xCD03,0xCD04,0xCD05,/* 0x40-0x47 */ + 0xCD06,0xCD07,0xCD0A,0xCD0B,0xCD0D,0xCD0E,0xCD0F,0xCD11,/* 0x48-0x4F */ + 0xCD12,0xCD13,0xCD14,0xCD15,0xCD16,0xCD17,0xCD1A,0xCD1C,/* 0x50-0x57 */ + 0xCD1E,0xCD1F,0xCD20,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCD21,0xCD22,0xCD23,0xCD25,0xCD26,0xCD27,0xCD29,/* 0x60-0x67 */ + 0xCD2A,0xCD2B,0xCD2D,0xCD2E,0xCD2F,0xCD30,0xCD31,0xCD32,/* 0x68-0x6F */ + 0xCD33,0xCD34,0xCD35,0xCD36,0xCD37,0xCD38,0xCD3A,0xCD3B,/* 0x70-0x77 */ + 0xCD3C,0xCD3D,0xCD3E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCD3F,0xCD40,0xCD41,0xCD42,0xCD43,0xCD44,0xCD45,/* 0x80-0x87 */ + 0xCD46,0xCD47,0xCD48,0xCD49,0xCD4A,0xCD4B,0xCD4C,0xCD4D,/* 0x88-0x8F */ + 0xCD4E,0xCD4F,0xCD50,0xCD51,0xCD52,0xCD53,0xCD54,0xCD55,/* 0x90-0x97 */ + 0xCD56,0xCD57,0xCD58,0xCD59,0xCD5A,0xCD5B,0xCD5D,0xCD5E,/* 0x98-0x9F */ + 0xCD5F,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */ + 0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */ + 0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */ + 0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */ + 0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */ + 0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */ + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCD61,0xCD62,0xCD63,0xCD65,0xCD66,0xCD67,0xCD68,/* 0x40-0x47 */ + 0xCD69,0xCD6A,0xCD6B,0xCD6E,0xCD70,0xCD72,0xCD73,0xCD74,/* 0x48-0x4F */ + 0xCD75,0xCD76,0xCD77,0xCD79,0xCD7A,0xCD7B,0xCD7C,0xCD7D,/* 
0x50-0x57 */ + 0xCD7E,0xCD7F,0xCD80,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCD81,0xCD82,0xCD83,0xCD84,0xCD85,0xCD86,0xCD87,/* 0x60-0x67 */ + 0xCD89,0xCD8A,0xCD8B,0xCD8C,0xCD8D,0xCD8E,0xCD8F,0xCD90,/* 0x68-0x6F */ + 0xCD91,0xCD92,0xCD93,0xCD96,0xCD97,0xCD99,0xCD9A,0xCD9B,/* 0x70-0x77 */ + 0xCD9D,0xCD9E,0xCD9F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCDA0,0xCDA1,0xCDA2,0xCDA3,0xCDA6,0xCDA8,0xCDAA,/* 0x80-0x87 */ + 0xCDAB,0xCDAC,0xCDAD,0xCDAE,0xCDAF,0xCDB1,0xCDB2,0xCDB3,/* 0x88-0x8F */ + 0xCDB4,0xCDB5,0xCDB6,0xCDB7,0xCDB8,0xCDB9,0xCDBA,0xCDBB,/* 0x90-0x97 */ + 0xCDBC,0xCDBD,0xCDBE,0xCDBF,0xCDC0,0xCDC1,0xCDC2,0xCDC3,/* 0x98-0x9F */ + 0xCDC5,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCDC6,0xCDC7,0xCDC8,0xCDC9,0xCDCA,0xCDCB,0xCDCD,/* 0x40-0x47 */ + 0xCDCE,0xCDCF,0xCDD1,0xCDD2,0xCDD3,0xCDD4,0xCDD5,0xCDD6,/* 0x48-0x4F */ + 0xCDD7,0xCDD8,0xCDD9,0xCDDA,0xCDDB,0xCDDC,0xCDDD,0xCDDE,/* 0x50-0x57 */ + 0xCDDF,0xCDE0,0xCDE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCDE2,0xCDE3,0xCDE4,0xCDE5,0xCDE6,0xCDE7,0xCDE9,/* 0x60-0x67 */ + 0xCDEA,0xCDEB,0xCDED,0xCDEE,0xCDEF,0xCDF1,0xCDF2,0xCDF3,/* 0x68-0x6F */ + 0xCDF4,0xCDF5,0xCDF6,0xCDF7,0xCDFA,0xCDFC,0xCDFE,0xCDFF,/* 0x70-0x77 */ + 0xCE00,0xCE01,0xCE02,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCE03,0xCE05,0xCE06,0xCE07,0xCE09,0xCE0A,0xCE0B,/* 0x80-0x87 */ + 0xCE0D,0xCE0E,0xCE0F,0xCE10,0xCE11,0xCE12,0xCE13,0xCE15,/* 0x88-0x8F */ + 0xCE16,0xCE17,0xCE18,0xCE1A,0xCE1B,0xCE1C,0xCE1D,0xCE1E,/* 0x90-0x97 */ + 0xCE1F,0xCE22,0xCE23,0xCE25,0xCE26,0xCE27,0xCE29,0xCE2A,/* 0x98-0x9F */ + 0xCE2B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCE2C,0xCE2D,0xCE2E,0xCE2F,0xCE32,0xCE34,0xCE36,/* 0x40-0x47 */ + 0xCE37,0xCE38,0xCE39,0xCE3A,0xCE3B,0xCE3C,0xCE3D,0xCE3E,/* 0x48-0x4F */ + 0xCE3F,0xCE40,0xCE41,0xCE42,0xCE43,0xCE44,0xCE45,0xCE46,/* 0x50-0x57 */ + 0xCE47,0xCE48,0xCE49,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCE4A,0xCE4B,0xCE4C,0xCE4D,0xCE4E,0xCE4F,0xCE50,/* 0x60-0x67 */ + 0xCE51,0xCE52,0xCE53,0xCE54,0xCE55,0xCE56,0xCE57,0xCE5A,/* 0x68-0x6F */ + 0xCE5B,0xCE5D,0xCE5E,0xCE62,0xCE63,0xCE64,0xCE65,0xCE66,/* 0x70-0x77 */ + 
0xCE67,0xCE6A,0xCE6C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCE6E,0xCE6F,0xCE70,0xCE71,0xCE72,0xCE73,0xCE76,/* 0x80-0x87 */ + 0xCE77,0xCE79,0xCE7A,0xCE7B,0xCE7D,0xCE7E,0xCE7F,0xCE80,/* 0x88-0x8F */ + 0xCE81,0xCE82,0xCE83,0xCE86,0xCE88,0xCE8A,0xCE8B,0xCE8C,/* 0x90-0x97 */ + 0xCE8D,0xCE8E,0xCE8F,0xCE92,0xCE93,0xCE95,0xCE96,0xCE97,/* 0x98-0x9F */ + 0xCE99,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCE9A,0xCE9B,0xCE9C,0xCE9D,0xCE9E,0xCE9F,0xCEA2,/* 0x40-0x47 */ + 0xCEA6,0xCEA7,0xCEA8,0xCEA9,0xCEAA,0xCEAB,0xCEAE,0xCEAF,/* 0x48-0x4F */ + 0xCEB0,0xCEB1,0xCEB2,0xCEB3,0xCEB4,0xCEB5,0xCEB6,0xCEB7,/* 0x50-0x57 */ + 0xCEB8,0xCEB9,0xCEBA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCEBB,0xCEBC,0xCEBD,0xCEBE,0xCEBF,0xCEC0,0xCEC2,/* 0x60-0x67 */ + 0xCEC3,0xCEC4,0xCEC5,0xCEC6,0xCEC7,0xCEC8,0xCEC9,0xCECA,/* 0x68-0x6F */ + 0xCECB,0xCECC,0xCECD,0xCECE,0xCECF,0xCED0,0xCED1,0xCED2,/* 0x70-0x77 */ + 0xCED3,0xCED4,0xCED5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCED6,0xCED7,0xCED8,0xCED9,0xCEDA,0xCEDB,0xCEDC,/* 0x80-0x87 */ + 0xCEDD,0xCEDE,0xCEDF,0xCEE0,0xCEE1,0xCEE2,0xCEE3,0xCEE6,/* 0x88-0x8F */ + 0xCEE7,0xCEE9,0xCEEA,0xCEED,0xCEEE,0xCEEF,0xCEF0,0xCEF1,/* 0x90-0x97 */ + 0xCEF2,0xCEF3,0xCEF6,0xCEFA,0xCEFB,0xCEFC,0xCEFD,0xCEFE,/* 0x98-0x9F */ + 0xCEFF,0xAC00,0xAC01,0xAC04,0xAC07,0xAC08,0xAC09,0xAC0A,/* 0xA0-0xA7 */ + 0xAC10,0xAC11,0xAC12,0xAC13,0xAC14,0xAC15,0xAC16,0xAC17,/* 0xA8-0xAF */ + 0xAC19,0xAC1A,0xAC1B,0xAC1C,0xAC1D,0xAC20,0xAC24,0xAC2C,/* 0xB0-0xB7 */ + 0xAC2D,0xAC2F,0xAC30,0xAC31,0xAC38,0xAC39,0xAC3C,0xAC40,/* 0xB8-0xBF */ + 0xAC4B,0xAC4D,0xAC54,0xAC58,0xAC5C,0xAC70,0xAC71,0xAC74,/* 0xC0-0xC7 */ + 0xAC77,0xAC78,0xAC7A,0xAC80,0xAC81,0xAC83,0xAC84,0xAC85,/* 0xC8-0xCF */ + 0xAC86,0xAC89,0xAC8A,0xAC8B,0xAC8C,0xAC90,0xAC94,0xAC9C,/* 0xD0-0xD7 */ + 0xAC9D,0xAC9F,0xACA0,0xACA1,0xACA8,0xACA9,0xACAA,0xACAC,/* 0xD8-0xDF */ + 0xACAF,0xACB0,0xACB8,0xACB9,0xACBB,0xACBC,0xACBD,0xACC1,/* 0xE0-0xE7 */ + 0xACC4,0xACC8,0xACCC,0xACD5,0xACD7,0xACE0,0xACE1,0xACE4,/* 0xE8-0xEF */ + 0xACE7,0xACE8,0xACEA,0xACEC,0xACEF,0xACF0,0xACF1,0xACF3,/* 0xF0-0xF7 */ + 0xACF5,0xACF6,0xACFC,0xACFD,0xAD00,0xAD04,0xAD06,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCF02,0xCF03,0xCF05,0xCF06,0xCF07,0xCF09,0xCF0A,/* 
0x40-0x47 */ + 0xCF0B,0xCF0C,0xCF0D,0xCF0E,0xCF0F,0xCF12,0xCF14,0xCF16,/* 0x48-0x4F */ + 0xCF17,0xCF18,0xCF19,0xCF1A,0xCF1B,0xCF1D,0xCF1E,0xCF1F,/* 0x50-0x57 */ + 0xCF21,0xCF22,0xCF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCF25,0xCF26,0xCF27,0xCF28,0xCF29,0xCF2A,0xCF2B,/* 0x60-0x67 */ + 0xCF2E,0xCF32,0xCF33,0xCF34,0xCF35,0xCF36,0xCF37,0xCF39,/* 0x68-0x6F */ + 0xCF3A,0xCF3B,0xCF3C,0xCF3D,0xCF3E,0xCF3F,0xCF40,0xCF41,/* 0x70-0x77 */ + 0xCF42,0xCF43,0xCF44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCF45,0xCF46,0xCF47,0xCF48,0xCF49,0xCF4A,0xCF4B,/* 0x80-0x87 */ + 0xCF4C,0xCF4D,0xCF4E,0xCF4F,0xCF50,0xCF51,0xCF52,0xCF53,/* 0x88-0x8F */ + 0xCF56,0xCF57,0xCF59,0xCF5A,0xCF5B,0xCF5D,0xCF5E,0xCF5F,/* 0x90-0x97 */ + 0xCF60,0xCF61,0xCF62,0xCF63,0xCF66,0xCF68,0xCF6A,0xCF6B,/* 0x98-0x9F */ + 0xCF6C,0xAD0C,0xAD0D,0xAD0F,0xAD11,0xAD18,0xAD1C,0xAD20,/* 0xA0-0xA7 */ + 0xAD29,0xAD2C,0xAD2D,0xAD34,0xAD35,0xAD38,0xAD3C,0xAD44,/* 0xA8-0xAF */ + 0xAD45,0xAD47,0xAD49,0xAD50,0xAD54,0xAD58,0xAD61,0xAD63,/* 0xB0-0xB7 */ + 0xAD6C,0xAD6D,0xAD70,0xAD73,0xAD74,0xAD75,0xAD76,0xAD7B,/* 0xB8-0xBF */ + 0xAD7C,0xAD7D,0xAD7F,0xAD81,0xAD82,0xAD88,0xAD89,0xAD8C,/* 0xC0-0xC7 */ + 0xAD90,0xAD9C,0xAD9D,0xADA4,0xADB7,0xADC0,0xADC1,0xADC4,/* 0xC8-0xCF */ + 0xADC8,0xADD0,0xADD1,0xADD3,0xADDC,0xADE0,0xADE4,0xADF8,/* 0xD0-0xD7 */ + 0xADF9,0xADFC,0xADFF,0xAE00,0xAE01,0xAE08,0xAE09,0xAE0B,/* 0xD8-0xDF */ + 0xAE0D,0xAE14,0xAE30,0xAE31,0xAE34,0xAE37,0xAE38,0xAE3A,/* 0xE0-0xE7 */ + 0xAE40,0xAE41,0xAE43,0xAE45,0xAE46,0xAE4A,0xAE4C,0xAE4D,/* 0xE8-0xEF */ + 0xAE4E,0xAE50,0xAE54,0xAE56,0xAE5C,0xAE5D,0xAE5F,0xAE60,/* 0xF0-0xF7 */ + 0xAE61,0xAE65,0xAE68,0xAE69,0xAE6C,0xAE70,0xAE78,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCF6D,0xCF6E,0xCF6F,0xCF72,0xCF73,0xCF75,0xCF76,/* 0x40-0x47 */ + 0xCF77,0xCF79,0xCF7A,0xCF7B,0xCF7C,0xCF7D,0xCF7E,0xCF7F,/* 0x48-0x4F */ + 0xCF81,0xCF82,0xCF83,0xCF84,0xCF86,0xCF87,0xCF88,0xCF89,/* 0x50-0x57 */ + 0xCF8A,0xCF8B,0xCF8D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCF8E,0xCF8F,0xCF90,0xCF91,0xCF92,0xCF93,0xCF94,/* 0x60-0x67 */ + 0xCF95,0xCF96,0xCF97,0xCF98,0xCF99,0xCF9A,0xCF9B,0xCF9C,/* 0x68-0x6F */ + 0xCF9D,0xCF9E,0xCF9F,0xCFA0,0xCFA2,0xCFA3,0xCFA4,0xCFA5,/* 0x70-0x77 */ + 0xCFA6,0xCFA7,0xCFA9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCFAA,0xCFAB,0xCFAC,0xCFAD,0xCFAE,0xCFAF,0xCFB1,/* 0x80-0x87 */ + 0xCFB2,0xCFB3,0xCFB4,0xCFB5,0xCFB6,0xCFB7,0xCFB8,0xCFB9,/* 0x88-0x8F */ + 0xCFBA,0xCFBB,0xCFBC,0xCFBD,0xCFBE,0xCFBF,0xCFC0,0xCFC1,/* 0x90-0x97 */ + 0xCFC2,0xCFC3,0xCFC5,0xCFC6,0xCFC7,0xCFC8,0xCFC9,0xCFCA,/* 0x98-0x9F */ + 0xCFCB,0xAE79,0xAE7B,0xAE7C,0xAE7D,0xAE84,0xAE85,0xAE8C,/* 0xA0-0xA7 */ + 0xAEBC,0xAEBD,0xAEBE,0xAEC0,0xAEC4,0xAECC,0xAECD,0xAECF,/* 0xA8-0xAF */ + 0xAED0,0xAED1,0xAED8,0xAED9,0xAEDC,0xAEE8,0xAEEB,0xAEED,/* 0xB0-0xB7 */ + 0xAEF4,0xAEF8,0xAEFC,0xAF07,0xAF08,0xAF0D,0xAF10,0xAF2C,/* 0xB8-0xBF */ + 
0xAF2D,0xAF30,0xAF32,0xAF34,0xAF3C,0xAF3D,0xAF3F,0xAF41,/* 0xC0-0xC7 */ + 0xAF42,0xAF43,0xAF48,0xAF49,0xAF50,0xAF5C,0xAF5D,0xAF64,/* 0xC8-0xCF */ + 0xAF65,0xAF79,0xAF80,0xAF84,0xAF88,0xAF90,0xAF91,0xAF95,/* 0xD0-0xD7 */ + 0xAF9C,0xAFB8,0xAFB9,0xAFBC,0xAFC0,0xAFC7,0xAFC8,0xAFC9,/* 0xD8-0xDF */ + 0xAFCB,0xAFCD,0xAFCE,0xAFD4,0xAFDC,0xAFE8,0xAFE9,0xAFF0,/* 0xE0-0xE7 */ + 0xAFF1,0xAFF4,0xAFF8,0xB000,0xB001,0xB004,0xB00C,0xB010,/* 0xE8-0xEF */ + 0xB014,0xB01C,0xB01D,0xB028,0xB044,0xB045,0xB048,0xB04A,/* 0xF0-0xF7 */ + 0xB04C,0xB04E,0xB053,0xB054,0xB055,0xB057,0xB059,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCFCC,0xCFCD,0xCFCE,0xCFCF,0xCFD0,0xCFD1,0xCFD2,/* 0x40-0x47 */ + 0xCFD3,0xCFD4,0xCFD5,0xCFD6,0xCFD7,0xCFD8,0xCFD9,0xCFDA,/* 0x48-0x4F */ + 0xCFDB,0xCFDC,0xCFDD,0xCFDE,0xCFDF,0xCFE2,0xCFE3,0xCFE5,/* 0x50-0x57 */ + 0xCFE6,0xCFE7,0xCFE9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCFEA,0xCFEB,0xCFEC,0xCFED,0xCFEE,0xCFEF,0xCFF2,/* 0x60-0x67 */ + 0xCFF4,0xCFF6,0xCFF7,0xCFF8,0xCFF9,0xCFFA,0xCFFB,0xCFFD,/* 0x68-0x6F */ + 0xCFFE,0xCFFF,0xD001,0xD002,0xD003,0xD005,0xD006,0xD007,/* 0x70-0x77 */ + 0xD008,0xD009,0xD00A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD00B,0xD00C,0xD00D,0xD00E,0xD00F,0xD010,0xD012,/* 0x80-0x87 */ + 0xD013,0xD014,0xD015,0xD016,0xD017,0xD019,0xD01A,0xD01B,/* 0x88-0x8F */ + 0xD01C,0xD01D,0xD01E,0xD01F,0xD020,0xD021,0xD022,0xD023,/* 0x90-0x97 */ + 0xD024,0xD025,0xD026,0xD027,0xD028,0xD029,0xD02A,0xD02B,/* 0x98-0x9F */ + 0xD02C,0xB05D,0xB07C,0xB07D,0xB080,0xB084,0xB08C,0xB08D,/* 0xA0-0xA7 */ + 0xB08F,0xB091,0xB098,0xB099,0xB09A,0xB09C,0xB09F,0xB0A0,/* 0xA8-0xAF */ + 0xB0A1,0xB0A2,0xB0A8,0xB0A9,0xB0AB,0xB0AC,0xB0AD,0xB0AE,/* 0xB0-0xB7 */ + 0xB0AF,0xB0B1,0xB0B3,0xB0B4,0xB0B5,0xB0B8,0xB0BC,0xB0C4,/* 0xB8-0xBF */ + 0xB0C5,0xB0C7,0xB0C8,0xB0C9,0xB0D0,0xB0D1,0xB0D4,0xB0D8,/* 0xC0-0xC7 */ + 0xB0E0,0xB0E5,0xB108,0xB109,0xB10B,0xB10C,0xB110,0xB112,/* 0xC8-0xCF */ + 0xB113,0xB118,0xB119,0xB11B,0xB11C,0xB11D,0xB123,0xB124,/* 0xD0-0xD7 */ + 0xB125,0xB128,0xB12C,0xB134,0xB135,0xB137,0xB138,0xB139,/* 0xD8-0xDF */ + 0xB140,0xB141,0xB144,0xB148,0xB150,0xB151,0xB154,0xB155,/* 0xE0-0xE7 */ + 0xB158,0xB15C,0xB160,0xB178,0xB179,0xB17C,0xB180,0xB182,/* 0xE8-0xEF */ + 0xB188,0xB189,0xB18B,0xB18D,0xB192,0xB193,0xB194,0xB198,/* 0xF0-0xF7 */ + 0xB19C,0xB1A8,0xB1CC,0xB1D0,0xB1D4,0xB1DC,0xB1DD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD02E,0xD02F,0xD030,0xD031,0xD032,0xD033,0xD036,/* 0x40-0x47 */ + 0xD037,0xD039,0xD03A,0xD03B,0xD03D,0xD03E,0xD03F,0xD040,/* 0x48-0x4F */ + 0xD041,0xD042,0xD043,0xD046,0xD048,0xD04A,0xD04B,0xD04C,/* 0x50-0x57 */ + 0xD04D,0xD04E,0xD04F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD051,0xD052,0xD053,0xD055,0xD056,0xD057,0xD059,/* 0x60-0x67 */ + 0xD05A,0xD05B,0xD05C,0xD05D,0xD05E,0xD05F,0xD061,0xD062,/* 0x68-0x6F */ + 0xD063,0xD064,0xD065,0xD066,0xD067,0xD068,0xD069,0xD06A,/* 0x70-0x77 */ + 0xD06B,0xD06E,0xD06F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD071,0xD072,0xD073,0xD075,0xD076,0xD077,0xD078,/* 0x80-0x87 */ + 0xD079,0xD07A,0xD07B,0xD07E,0xD07F,0xD080,0xD082,0xD083,/* 0x88-0x8F */ + 0xD084,0xD085,0xD086,0xD087,0xD088,0xD089,0xD08A,0xD08B,/* 0x90-0x97 */ + 0xD08C,0xD08D,0xD08E,0xD08F,0xD090,0xD091,0xD092,0xD093,/* 0x98-0x9F */ + 0xD094,0xB1DF,0xB1E8,0xB1E9,0xB1EC,0xB1F0,0xB1F9,0xB1FB,/* 0xA0-0xA7 */ + 0xB1FD,0xB204,0xB205,0xB208,0xB20B,0xB20C,0xB214,0xB215,/* 0xA8-0xAF */ + 0xB217,0xB219,0xB220,0xB234,0xB23C,0xB258,0xB25C,0xB260,/* 0xB0-0xB7 */ + 0xB268,0xB269,0xB274,0xB275,0xB27C,0xB284,0xB285,0xB289,/* 0xB8-0xBF */ + 0xB290,0xB291,0xB294,0xB298,0xB299,0xB29A,0xB2A0,0xB2A1,/* 0xC0-0xC7 */ + 0xB2A3,0xB2A5,0xB2A6,0xB2AA,0xB2AC,0xB2B0,0xB2B4,0xB2C8,/* 0xC8-0xCF */ + 0xB2C9,0xB2CC,0xB2D0,0xB2D2,0xB2D8,0xB2D9,0xB2DB,0xB2DD,/* 0xD0-0xD7 */ + 0xB2E2,0xB2E4,0xB2E5,0xB2E6,0xB2E8,0xB2EB,0xB2EC,0xB2ED,/* 0xD8-0xDF */ + 0xB2EE,0xB2EF,0xB2F3,0xB2F4,0xB2F5,0xB2F7,0xB2F8,0xB2F9,/* 0xE0-0xE7 */ + 0xB2FA,0xB2FB,0xB2FF,0xB300,0xB301,0xB304,0xB308,0xB310,/* 0xE8-0xEF */ + 0xB311,0xB313,0xB314,0xB315,0xB31C,0xB354,0xB355,0xB356,/* 0xF0-0xF7 */ + 0xB358,0xB35B,0xB35C,0xB35E,0xB35F,0xB364,0xB365,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD095,0xD096,0xD097,0xD098,0xD099,0xD09A,0xD09B,/* 0x40-0x47 */ + 0xD09C,0xD09D,0xD09E,0xD09F,0xD0A0,0xD0A1,0xD0A2,0xD0A3,/* 0x48-0x4F */ + 0xD0A6,0xD0A7,0xD0A9,0xD0AA,0xD0AB,0xD0AD,0xD0AE,0xD0AF,/* 0x50-0x57 */ + 0xD0B0,0xD0B1,0xD0B2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD0B3,0xD0B6,0xD0B8,0xD0BA,0xD0BB,0xD0BC,0xD0BD,/* 0x60-0x67 */ + 0xD0BE,0xD0BF,0xD0C2,0xD0C3,0xD0C5,0xD0C6,0xD0C7,0xD0CA,/* 0x68-0x6F */ + 0xD0CB,0xD0CC,0xD0CD,0xD0CE,0xD0CF,0xD0D2,0xD0D6,0xD0D7,/* 0x70-0x77 */ + 0xD0D8,0xD0D9,0xD0DA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD0DB,0xD0DE,0xD0DF,0xD0E1,0xD0E2,0xD0E3,0xD0E5,/* 0x80-0x87 */ + 0xD0E6,0xD0E7,0xD0E8,0xD0E9,0xD0EA,0xD0EB,0xD0EE,0xD0F2,/* 0x88-0x8F */ + 0xD0F3,0xD0F4,0xD0F5,0xD0F6,0xD0F7,0xD0F9,0xD0FA,0xD0FB,/* 0x90-0x97 */ + 0xD0FC,0xD0FD,0xD0FE,0xD0FF,0xD100,0xD101,0xD102,0xD103,/* 0x98-0x9F */ + 0xD104,0xB367,0xB369,0xB36B,0xB36E,0xB370,0xB371,0xB374,/* 0xA0-0xA7 */ + 0xB378,0xB380,0xB381,0xB383,0xB384,0xB385,0xB38C,0xB390,/* 0xA8-0xAF */ + 
0xB394,0xB3A0,0xB3A1,0xB3A8,0xB3AC,0xB3C4,0xB3C5,0xB3C8,/* 0xB0-0xB7 */ + 0xB3CB,0xB3CC,0xB3CE,0xB3D0,0xB3D4,0xB3D5,0xB3D7,0xB3D9,/* 0xB8-0xBF */ + 0xB3DB,0xB3DD,0xB3E0,0xB3E4,0xB3E8,0xB3FC,0xB410,0xB418,/* 0xC0-0xC7 */ + 0xB41C,0xB420,0xB428,0xB429,0xB42B,0xB434,0xB450,0xB451,/* 0xC8-0xCF */ + 0xB454,0xB458,0xB460,0xB461,0xB463,0xB465,0xB46C,0xB480,/* 0xD0-0xD7 */ + 0xB488,0xB49D,0xB4A4,0xB4A8,0xB4AC,0xB4B5,0xB4B7,0xB4B9,/* 0xD8-0xDF */ + 0xB4C0,0xB4C4,0xB4C8,0xB4D0,0xB4D5,0xB4DC,0xB4DD,0xB4E0,/* 0xE0-0xE7 */ + 0xB4E3,0xB4E4,0xB4E6,0xB4EC,0xB4ED,0xB4EF,0xB4F1,0xB4F8,/* 0xE8-0xEF */ + 0xB514,0xB515,0xB518,0xB51B,0xB51C,0xB524,0xB525,0xB527,/* 0xF0-0xF7 */ + 0xB528,0xB529,0xB52A,0xB530,0xB531,0xB534,0xB538,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD105,0xD106,0xD107,0xD108,0xD109,0xD10A,0xD10B,/* 0x40-0x47 */ + 0xD10C,0xD10E,0xD10F,0xD110,0xD111,0xD112,0xD113,0xD114,/* 0x48-0x4F */ + 0xD115,0xD116,0xD117,0xD118,0xD119,0xD11A,0xD11B,0xD11C,/* 0x50-0x57 */ + 0xD11D,0xD11E,0xD11F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD120,0xD121,0xD122,0xD123,0xD124,0xD125,0xD126,/* 0x60-0x67 */ + 0xD127,0xD128,0xD129,0xD12A,0xD12B,0xD12C,0xD12D,0xD12E,/* 0x68-0x6F */ + 0xD12F,0xD132,0xD133,0xD135,0xD136,0xD137,0xD139,0xD13B,/* 0x70-0x77 */ + 0xD13C,0xD13D,0xD13E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD13F,0xD142,0xD146,0xD147,0xD148,0xD149,0xD14A,/* 0x80-0x87 */ + 0xD14B,0xD14E,0xD14F,0xD151,0xD152,0xD153,0xD155,0xD156,/* 0x88-0x8F */ + 0xD157,0xD158,0xD159,0xD15A,0xD15B,0xD15E,0xD160,0xD162,/* 0x90-0x97 */ + 0xD163,0xD164,0xD165,0xD166,0xD167,0xD169,0xD16A,0xD16B,/* 0x98-0x9F */ + 0xD16D,0xB540,0xB541,0xB543,0xB544,0xB545,0xB54B,0xB54C,/* 0xA0-0xA7 */ + 0xB54D,0xB550,0xB554,0xB55C,0xB55D,0xB55F,0xB560,0xB561,/* 0xA8-0xAF */ + 0xB5A0,0xB5A1,0xB5A4,0xB5A8,0xB5AA,0xB5AB,0xB5B0,0xB5B1,/* 0xB0-0xB7 */ + 0xB5B3,0xB5B4,0xB5B5,0xB5BB,0xB5BC,0xB5BD,0xB5C0,0xB5C4,/* 0xB8-0xBF */ + 0xB5CC,0xB5CD,0xB5CF,0xB5D0,0xB5D1,0xB5D8,0xB5EC,0xB610,/* 0xC0-0xC7 */ + 0xB611,0xB614,0xB618,0xB625,0xB62C,0xB634,0xB648,0xB664,/* 0xC8-0xCF */ + 0xB668,0xB69C,0xB69D,0xB6A0,0xB6A4,0xB6AB,0xB6AC,0xB6B1,/* 0xD0-0xD7 */ + 0xB6D4,0xB6F0,0xB6F4,0xB6F8,0xB700,0xB701,0xB705,0xB728,/* 0xD8-0xDF */ + 0xB729,0xB72C,0xB72F,0xB730,0xB738,0xB739,0xB73B,0xB744,/* 0xE0-0xE7 */ + 0xB748,0xB74C,0xB754,0xB755,0xB760,0xB764,0xB768,0xB770,/* 0xE8-0xEF */ + 0xB771,0xB773,0xB775,0xB77C,0xB77D,0xB780,0xB784,0xB78C,/* 0xF0-0xF7 */ + 0xB78D,0xB78F,0xB790,0xB791,0xB792,0xB796,0xB797,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD16E,0xD16F,0xD170,0xD171,0xD172,0xD173,0xD174,/* 0x40-0x47 */ + 0xD175,0xD176,0xD177,0xD178,0xD179,0xD17A,0xD17B,0xD17D,/* 0x48-0x4F */ + 0xD17E,0xD17F,0xD180,0xD181,0xD182,0xD183,0xD185,0xD186,/* 0x50-0x57 */ + 0xD187,0xD189,0xD18A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD18B,0xD18C,0xD18D,0xD18E,0xD18F,0xD190,0xD191,/* 0x60-0x67 */ + 0xD192,0xD193,0xD194,0xD195,0xD196,0xD197,0xD198,0xD199,/* 0x68-0x6F */ + 0xD19A,0xD19B,0xD19C,0xD19D,0xD19E,0xD19F,0xD1A2,0xD1A3,/* 0x70-0x77 */ + 0xD1A5,0xD1A6,0xD1A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD1A9,0xD1AA,0xD1AB,0xD1AC,0xD1AD,0xD1AE,0xD1AF,/* 0x80-0x87 */ + 0xD1B2,0xD1B4,0xD1B6,0xD1B7,0xD1B8,0xD1B9,0xD1BB,0xD1BD,/* 0x88-0x8F */ + 0xD1BE,0xD1BF,0xD1C1,0xD1C2,0xD1C3,0xD1C4,0xD1C5,0xD1C6,/* 0x90-0x97 */ + 0xD1C7,0xD1C8,0xD1C9,0xD1CA,0xD1CB,0xD1CC,0xD1CD,0xD1CE,/* 0x98-0x9F */ + 0xD1CF,0xB798,0xB799,0xB79C,0xB7A0,0xB7A8,0xB7A9,0xB7AB,/* 0xA0-0xA7 */ + 0xB7AC,0xB7AD,0xB7B4,0xB7B5,0xB7B8,0xB7C7,0xB7C9,0xB7EC,/* 0xA8-0xAF */ + 0xB7ED,0xB7F0,0xB7F4,0xB7FC,0xB7FD,0xB7FF,0xB800,0xB801,/* 0xB0-0xB7 */ + 0xB807,0xB808,0xB809,0xB80C,0xB810,0xB818,0xB819,0xB81B,/* 0xB8-0xBF */ + 0xB81D,0xB824,0xB825,0xB828,0xB82C,0xB834,0xB835,0xB837,/* 0xC0-0xC7 */ + 0xB838,0xB839,0xB840,0xB844,0xB851,0xB853,0xB85C,0xB85D,/* 0xC8-0xCF */ + 0xB860,0xB864,0xB86C,0xB86D,0xB86F,0xB871,0xB878,0xB87C,/* 0xD0-0xD7 */ + 0xB88D,0xB8A8,0xB8B0,0xB8B4,0xB8B8,0xB8C0,0xB8C1,0xB8C3,/* 0xD8-0xDF */ + 0xB8C5,0xB8CC,0xB8D0,0xB8D4,0xB8DD,0xB8DF,0xB8E1,0xB8E8,/* 0xE0-0xE7 */ + 0xB8E9,0xB8EC,0xB8F0,0xB8F8,0xB8F9,0xB8FB,0xB8FD,0xB904,/* 0xE8-0xEF */ + 0xB918,0xB920,0xB93C,0xB93D,0xB940,0xB944,0xB94C,0xB94F,/* 0xF0-0xF7 */ + 0xB951,0xB958,0xB959,0xB95C,0xB960,0xB968,0xB969,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD1D0,0xD1D1,0xD1D2,0xD1D3,0xD1D4,0xD1D5,0xD1D6,/* 0x40-0x47 */ + 0xD1D7,0xD1D9,0xD1DA,0xD1DB,0xD1DC,0xD1DD,0xD1DE,0xD1DF,/* 0x48-0x4F */ + 0xD1E0,0xD1E1,0xD1E2,0xD1E3,0xD1E4,0xD1E5,0xD1E6,0xD1E7,/* 0x50-0x57 */ + 0xD1E8,0xD1E9,0xD1EA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD1EB,0xD1EC,0xD1ED,0xD1EE,0xD1EF,0xD1F0,0xD1F1,/* 0x60-0x67 */ + 0xD1F2,0xD1F3,0xD1F5,0xD1F6,0xD1F7,0xD1F9,0xD1FA,0xD1FB,/* 0x68-0x6F */ + 0xD1FC,0xD1FD,0xD1FE,0xD1FF,0xD200,0xD201,0xD202,0xD203,/* 0x70-0x77 */ + 0xD204,0xD205,0xD206,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD208,0xD20A,0xD20B,0xD20C,0xD20D,0xD20E,0xD20F,/* 0x80-0x87 */ + 0xD211,0xD212,0xD213,0xD214,0xD215,0xD216,0xD217,0xD218,/* 0x88-0x8F */ + 0xD219,0xD21A,0xD21B,0xD21C,0xD21D,0xD21E,0xD21F,0xD220,/* 0x90-0x97 */ + 0xD221,0xD222,0xD223,0xD224,0xD225,0xD226,0xD227,0xD228,/* 0x98-0x9F */ + 
0xD229,0xB96B,0xB96D,0xB974,0xB975,0xB978,0xB97C,0xB984,/* 0xA0-0xA7 */ + 0xB985,0xB987,0xB989,0xB98A,0xB98D,0xB98E,0xB9AC,0xB9AD,/* 0xA8-0xAF */ + 0xB9B0,0xB9B4,0xB9BC,0xB9BD,0xB9BF,0xB9C1,0xB9C8,0xB9C9,/* 0xB0-0xB7 */ + 0xB9CC,0xB9CE,0xB9CF,0xB9D0,0xB9D1,0xB9D2,0xB9D8,0xB9D9,/* 0xB8-0xBF */ + 0xB9DB,0xB9DD,0xB9DE,0xB9E1,0xB9E3,0xB9E4,0xB9E5,0xB9E8,/* 0xC0-0xC7 */ + 0xB9EC,0xB9F4,0xB9F5,0xB9F7,0xB9F8,0xB9F9,0xB9FA,0xBA00,/* 0xC8-0xCF */ + 0xBA01,0xBA08,0xBA15,0xBA38,0xBA39,0xBA3C,0xBA40,0xBA42,/* 0xD0-0xD7 */ + 0xBA48,0xBA49,0xBA4B,0xBA4D,0xBA4E,0xBA53,0xBA54,0xBA55,/* 0xD8-0xDF */ + 0xBA58,0xBA5C,0xBA64,0xBA65,0xBA67,0xBA68,0xBA69,0xBA70,/* 0xE0-0xE7 */ + 0xBA71,0xBA74,0xBA78,0xBA83,0xBA84,0xBA85,0xBA87,0xBA8C,/* 0xE8-0xEF */ + 0xBAA8,0xBAA9,0xBAAB,0xBAAC,0xBAB0,0xBAB2,0xBAB8,0xBAB9,/* 0xF0-0xF7 */ + 0xBABB,0xBABD,0xBAC4,0xBAC8,0xBAD8,0xBAD9,0xBAFC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD22A,0xD22B,0xD22E,0xD22F,0xD231,0xD232,0xD233,/* 0x40-0x47 */ + 0xD235,0xD236,0xD237,0xD238,0xD239,0xD23A,0xD23B,0xD23E,/* 0x48-0x4F */ + 0xD240,0xD242,0xD243,0xD244,0xD245,0xD246,0xD247,0xD249,/* 0x50-0x57 */ + 0xD24A,0xD24B,0xD24C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD24D,0xD24E,0xD24F,0xD250,0xD251,0xD252,0xD253,/* 0x60-0x67 */ + 0xD254,0xD255,0xD256,0xD257,0xD258,0xD259,0xD25A,0xD25B,/* 0x68-0x6F */ + 0xD25D,0xD25E,0xD25F,0xD260,0xD261,0xD262,0xD263,0xD265,/* 0x70-0x77 */ + 0xD266,0xD267,0xD268,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD269,0xD26A,0xD26B,0xD26C,0xD26D,0xD26E,0xD26F,/* 0x80-0x87 */ + 0xD270,0xD271,0xD272,0xD273,0xD274,0xD275,0xD276,0xD277,/* 0x88-0x8F */ + 0xD278,0xD279,0xD27A,0xD27B,0xD27C,0xD27D,0xD27E,0xD27F,/* 0x90-0x97 */ + 0xD282,0xD283,0xD285,0xD286,0xD287,0xD289,0xD28A,0xD28B,/* 0x98-0x9F */ + 0xD28C,0xBB00,0xBB04,0xBB0D,0xBB0F,0xBB11,0xBB18,0xBB1C,/* 0xA0-0xA7 */ + 0xBB20,0xBB29,0xBB2B,0xBB34,0xBB35,0xBB36,0xBB38,0xBB3B,/* 0xA8-0xAF */ + 0xBB3C,0xBB3D,0xBB3E,0xBB44,0xBB45,0xBB47,0xBB49,0xBB4D,/* 0xB0-0xB7 */ + 0xBB4F,0xBB50,0xBB54,0xBB58,0xBB61,0xBB63,0xBB6C,0xBB88,/* 0xB8-0xBF */ + 0xBB8C,0xBB90,0xBBA4,0xBBA8,0xBBAC,0xBBB4,0xBBB7,0xBBC0,/* 0xC0-0xC7 */ + 0xBBC4,0xBBC8,0xBBD0,0xBBD3,0xBBF8,0xBBF9,0xBBFC,0xBBFF,/* 0xC8-0xCF */ + 0xBC00,0xBC02,0xBC08,0xBC09,0xBC0B,0xBC0C,0xBC0D,0xBC0F,/* 0xD0-0xD7 */ + 0xBC11,0xBC14,0xBC15,0xBC16,0xBC17,0xBC18,0xBC1B,0xBC1C,/* 0xD8-0xDF */ + 0xBC1D,0xBC1E,0xBC1F,0xBC24,0xBC25,0xBC27,0xBC29,0xBC2D,/* 0xE0-0xE7 */ + 0xBC30,0xBC31,0xBC34,0xBC38,0xBC40,0xBC41,0xBC43,0xBC44,/* 0xE8-0xEF */ + 0xBC45,0xBC49,0xBC4C,0xBC4D,0xBC50,0xBC5D,0xBC84,0xBC85,/* 0xF0-0xF7 */ + 0xBC88,0xBC8B,0xBC8C,0xBC8E,0xBC94,0xBC95,0xBC97,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD28D,0xD28E,0xD28F,0xD292,0xD293,0xD294,0xD296,/* 0x40-0x47 */ + 0xD297,0xD298,0xD299,0xD29A,0xD29B,0xD29D,0xD29E,0xD29F,/* 0x48-0x4F */ + 0xD2A1,0xD2A2,0xD2A3,0xD2A5,0xD2A6,0xD2A7,0xD2A8,0xD2A9,/* 0x50-0x57 */ + 0xD2AA,0xD2AB,0xD2AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD2AE,0xD2AF,0xD2B0,0xD2B2,0xD2B3,0xD2B4,0xD2B5,/* 0x60-0x67 */ + 0xD2B6,0xD2B7,0xD2BA,0xD2BB,0xD2BD,0xD2BE,0xD2C1,0xD2C3,/* 0x68-0x6F */ + 0xD2C4,0xD2C5,0xD2C6,0xD2C7,0xD2CA,0xD2CC,0xD2CD,0xD2CE,/* 0x70-0x77 */ + 0xD2CF,0xD2D0,0xD2D1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD2D2,0xD2D3,0xD2D5,0xD2D6,0xD2D7,0xD2D9,0xD2DA,/* 0x80-0x87 */ + 0xD2DB,0xD2DD,0xD2DE,0xD2DF,0xD2E0,0xD2E1,0xD2E2,0xD2E3,/* 0x88-0x8F */ + 0xD2E6,0xD2E7,0xD2E8,0xD2E9,0xD2EA,0xD2EB,0xD2EC,0xD2ED,/* 0x90-0x97 */ + 0xD2EE,0xD2EF,0xD2F2,0xD2F3,0xD2F5,0xD2F6,0xD2F7,0xD2F9,/* 0x98-0x9F */ + 0xD2FA,0xBC99,0xBC9A,0xBCA0,0xBCA1,0xBCA4,0xBCA7,0xBCA8,/* 0xA0-0xA7 */ + 0xBCB0,0xBCB1,0xBCB3,0xBCB4,0xBCB5,0xBCBC,0xBCBD,0xBCC0,/* 0xA8-0xAF */ + 0xBCC4,0xBCCD,0xBCCF,0xBCD0,0xBCD1,0xBCD5,0xBCD8,0xBCDC,/* 0xB0-0xB7 */ + 0xBCF4,0xBCF5,0xBCF6,0xBCF8,0xBCFC,0xBD04,0xBD05,0xBD07,/* 0xB8-0xBF */ + 0xBD09,0xBD10,0xBD14,0xBD24,0xBD2C,0xBD40,0xBD48,0xBD49,/* 0xC0-0xC7 */ + 0xBD4C,0xBD50,0xBD58,0xBD59,0xBD64,0xBD68,0xBD80,0xBD81,/* 0xC8-0xCF */ + 0xBD84,0xBD87,0xBD88,0xBD89,0xBD8A,0xBD90,0xBD91,0xBD93,/* 0xD0-0xD7 */ + 0xBD95,0xBD99,0xBD9A,0xBD9C,0xBDA4,0xBDB0,0xBDB8,0xBDD4,/* 0xD8-0xDF */ + 0xBDD5,0xBDD8,0xBDDC,0xBDE9,0xBDF0,0xBDF4,0xBDF8,0xBE00,/* 0xE0-0xE7 */ + 0xBE03,0xBE05,0xBE0C,0xBE0D,0xBE10,0xBE14,0xBE1C,0xBE1D,/* 0xE8-0xEF */ + 0xBE1F,0xBE44,0xBE45,0xBE48,0xBE4C,0xBE4E,0xBE54,0xBE55,/* 0xF0-0xF7 */ + 0xBE57,0xBE59,0xBE5A,0xBE5B,0xBE60,0xBE61,0xBE64,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD2FB,0xD2FC,0xD2FD,0xD2FE,0xD2FF,0xD302,0xD304,/* 0x40-0x47 */ + 0xD306,0xD307,0xD308,0xD309,0xD30A,0xD30B,0xD30F,0xD311,/* 0x48-0x4F */ + 0xD312,0xD313,0xD315,0xD317,0xD318,0xD319,0xD31A,0xD31B,/* 0x50-0x57 */ + 0xD31E,0xD322,0xD323,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD324,0xD326,0xD327,0xD32A,0xD32B,0xD32D,0xD32E,/* 0x60-0x67 */ + 0xD32F,0xD331,0xD332,0xD333,0xD334,0xD335,0xD336,0xD337,/* 0x68-0x6F */ + 0xD33A,0xD33E,0xD33F,0xD340,0xD341,0xD342,0xD343,0xD346,/* 0x70-0x77 */ + 0xD347,0xD348,0xD349,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD34A,0xD34B,0xD34C,0xD34D,0xD34E,0xD34F,0xD350,/* 0x80-0x87 */ + 0xD351,0xD352,0xD353,0xD354,0xD355,0xD356,0xD357,0xD358,/* 0x88-0x8F */ + 
0xD359,0xD35A,0xD35B,0xD35C,0xD35D,0xD35E,0xD35F,0xD360,/* 0x90-0x97 */ + 0xD361,0xD362,0xD363,0xD364,0xD365,0xD366,0xD367,0xD368,/* 0x98-0x9F */ + 0xD369,0xBE68,0xBE6A,0xBE70,0xBE71,0xBE73,0xBE74,0xBE75,/* 0xA0-0xA7 */ + 0xBE7B,0xBE7C,0xBE7D,0xBE80,0xBE84,0xBE8C,0xBE8D,0xBE8F,/* 0xA8-0xAF */ + 0xBE90,0xBE91,0xBE98,0xBE99,0xBEA8,0xBED0,0xBED1,0xBED4,/* 0xB0-0xB7 */ + 0xBED7,0xBED8,0xBEE0,0xBEE3,0xBEE4,0xBEE5,0xBEEC,0xBF01,/* 0xB8-0xBF */ + 0xBF08,0xBF09,0xBF18,0xBF19,0xBF1B,0xBF1C,0xBF1D,0xBF40,/* 0xC0-0xC7 */ + 0xBF41,0xBF44,0xBF48,0xBF50,0xBF51,0xBF55,0xBF94,0xBFB0,/* 0xC8-0xCF */ + 0xBFC5,0xBFCC,0xBFCD,0xBFD0,0xBFD4,0xBFDC,0xBFDF,0xBFE1,/* 0xD0-0xD7 */ + 0xC03C,0xC051,0xC058,0xC05C,0xC060,0xC068,0xC069,0xC090,/* 0xD8-0xDF */ + 0xC091,0xC094,0xC098,0xC0A0,0xC0A1,0xC0A3,0xC0A5,0xC0AC,/* 0xE0-0xE7 */ + 0xC0AD,0xC0AF,0xC0B0,0xC0B3,0xC0B4,0xC0B5,0xC0B6,0xC0BC,/* 0xE8-0xEF */ + 0xC0BD,0xC0BF,0xC0C0,0xC0C1,0xC0C5,0xC0C8,0xC0C9,0xC0CC,/* 0xF0-0xF7 */ + 0xC0D0,0xC0D8,0xC0D9,0xC0DB,0xC0DC,0xC0DD,0xC0E4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD36A,0xD36B,0xD36C,0xD36D,0xD36E,0xD36F,0xD370,/* 0x40-0x47 */ + 0xD371,0xD372,0xD373,0xD374,0xD375,0xD376,0xD377,0xD378,/* 0x48-0x4F */ + 0xD379,0xD37A,0xD37B,0xD37E,0xD37F,0xD381,0xD382,0xD383,/* 0x50-0x57 */ + 0xD385,0xD386,0xD387,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD388,0xD389,0xD38A,0xD38B,0xD38E,0xD392,0xD393,/* 0x60-0x67 */ + 0xD394,0xD395,0xD396,0xD397,0xD39A,0xD39B,0xD39D,0xD39E,/* 0x68-0x6F */ + 0xD39F,0xD3A1,0xD3A2,0xD3A3,0xD3A4,0xD3A5,0xD3A6,0xD3A7,/* 0x70-0x77 */ + 0xD3AA,0xD3AC,0xD3AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD3AF,0xD3B0,0xD3B1,0xD3B2,0xD3B3,0xD3B5,0xD3B6,/* 0x80-0x87 */ + 0xD3B7,0xD3B9,0xD3BA,0xD3BB,0xD3BD,0xD3BE,0xD3BF,0xD3C0,/* 0x88-0x8F */ + 0xD3C1,0xD3C2,0xD3C3,0xD3C6,0xD3C7,0xD3CA,0xD3CB,0xD3CC,/* 0x90-0x97 */ + 0xD3CD,0xD3CE,0xD3CF,0xD3D1,0xD3D2,0xD3D3,0xD3D4,0xD3D5,/* 0x98-0x9F */ + 0xD3D6,0xC0E5,0xC0E8,0xC0EC,0xC0F4,0xC0F5,0xC0F7,0xC0F9,/* 0xA0-0xA7 */ + 0xC100,0xC104,0xC108,0xC110,0xC115,0xC11C,0xC11D,0xC11E,/* 0xA8-0xAF */ + 0xC11F,0xC120,0xC123,0xC124,0xC126,0xC127,0xC12C,0xC12D,/* 0xB0-0xB7 */ + 0xC12F,0xC130,0xC131,0xC136,0xC138,0xC139,0xC13C,0xC140,/* 0xB8-0xBF */ + 0xC148,0xC149,0xC14B,0xC14C,0xC14D,0xC154,0xC155,0xC158,/* 0xC0-0xC7 */ + 0xC15C,0xC164,0xC165,0xC167,0xC168,0xC169,0xC170,0xC174,/* 0xC8-0xCF */ + 0xC178,0xC185,0xC18C,0xC18D,0xC18E,0xC190,0xC194,0xC196,/* 0xD0-0xD7 */ + 0xC19C,0xC19D,0xC19F,0xC1A1,0xC1A5,0xC1A8,0xC1A9,0xC1AC,/* 0xD8-0xDF */ + 0xC1B0,0xC1BD,0xC1C4,0xC1C8,0xC1CC,0xC1D4,0xC1D7,0xC1D8,/* 0xE0-0xE7 */ + 0xC1E0,0xC1E4,0xC1E8,0xC1F0,0xC1F1,0xC1F3,0xC1FC,0xC1FD,/* 0xE8-0xEF */ + 0xC200,0xC204,0xC20C,0xC20D,0xC20F,0xC211,0xC218,0xC219,/* 0xF0-0xF7 */ + 0xC21C,0xC21F,0xC220,0xC228,0xC229,0xC22B,0xC22D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD3D7,0xD3D9,0xD3DA,0xD3DB,0xD3DC,0xD3DD,0xD3DE,/* 0x40-0x47 */ + 0xD3DF,0xD3E0,0xD3E2,0xD3E4,0xD3E5,0xD3E6,0xD3E7,0xD3E8,/* 0x48-0x4F */ + 0xD3E9,0xD3EA,0xD3EB,0xD3EE,0xD3EF,0xD3F1,0xD3F2,0xD3F3,/* 0x50-0x57 */ + 0xD3F5,0xD3F6,0xD3F7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD3F8,0xD3F9,0xD3FA,0xD3FB,0xD3FE,0xD400,0xD402,/* 0x60-0x67 */ + 0xD403,0xD404,0xD405,0xD406,0xD407,0xD409,0xD40A,0xD40B,/* 0x68-0x6F */ + 0xD40C,0xD40D,0xD40E,0xD40F,0xD410,0xD411,0xD412,0xD413,/* 0x70-0x77 */ + 0xD414,0xD415,0xD416,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD417,0xD418,0xD419,0xD41A,0xD41B,0xD41C,0xD41E,/* 0x80-0x87 */ + 0xD41F,0xD420,0xD421,0xD422,0xD423,0xD424,0xD425,0xD426,/* 0x88-0x8F */ + 0xD427,0xD428,0xD429,0xD42A,0xD42B,0xD42C,0xD42D,0xD42E,/* 0x90-0x97 */ + 0xD42F,0xD430,0xD431,0xD432,0xD433,0xD434,0xD435,0xD436,/* 0x98-0x9F */ + 0xD437,0xC22F,0xC231,0xC232,0xC234,0xC248,0xC250,0xC251,/* 0xA0-0xA7 */ + 0xC254,0xC258,0xC260,0xC265,0xC26C,0xC26D,0xC270,0xC274,/* 0xA8-0xAF */ + 0xC27C,0xC27D,0xC27F,0xC281,0xC288,0xC289,0xC290,0xC298,/* 0xB0-0xB7 */ + 0xC29B,0xC29D,0xC2A4,0xC2A5,0xC2A8,0xC2AC,0xC2AD,0xC2B4,/* 0xB8-0xBF */ + 0xC2B5,0xC2B7,0xC2B9,0xC2DC,0xC2DD,0xC2E0,0xC2E3,0xC2E4,/* 0xC0-0xC7 */ + 0xC2EB,0xC2EC,0xC2ED,0xC2EF,0xC2F1,0xC2F6,0xC2F8,0xC2F9,/* 0xC8-0xCF */ + 0xC2FB,0xC2FC,0xC300,0xC308,0xC309,0xC30C,0xC30D,0xC313,/* 0xD0-0xD7 */ + 0xC314,0xC315,0xC318,0xC31C,0xC324,0xC325,0xC328,0xC329,/* 0xD8-0xDF */ + 0xC345,0xC368,0xC369,0xC36C,0xC370,0xC372,0xC378,0xC379,/* 0xE0-0xE7 */ + 0xC37C,0xC37D,0xC384,0xC388,0xC38C,0xC3C0,0xC3D8,0xC3D9,/* 0xE8-0xEF */ + 0xC3DC,0xC3DF,0xC3E0,0xC3E2,0xC3E8,0xC3E9,0xC3ED,0xC3F4,/* 0xF0-0xF7 */ + 0xC3F5,0xC3F8,0xC408,0xC410,0xC424,0xC42C,0xC430,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD438,0xD439,0xD43A,0xD43B,0xD43C,0xD43D,0xD43E,/* 0x40-0x47 */ + 0xD43F,0xD441,0xD442,0xD443,0xD445,0xD446,0xD447,0xD448,/* 0x48-0x4F */ + 0xD449,0xD44A,0xD44B,0xD44C,0xD44D,0xD44E,0xD44F,0xD450,/* 0x50-0x57 */ + 0xD451,0xD452,0xD453,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD454,0xD455,0xD456,0xD457,0xD458,0xD459,0xD45A,/* 0x60-0x67 */ + 0xD45B,0xD45D,0xD45E,0xD45F,0xD461,0xD462,0xD463,0xD465,/* 0x68-0x6F */ + 0xD466,0xD467,0xD468,0xD469,0xD46A,0xD46B,0xD46C,0xD46E,/* 0x70-0x77 */ + 0xD470,0xD471,0xD472,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 
0x0000,0xD473,0xD474,0xD475,0xD476,0xD477,0xD47A,0xD47B,/* 0x80-0x87 */ + 0xD47D,0xD47E,0xD481,0xD483,0xD484,0xD485,0xD486,0xD487,/* 0x88-0x8F */ + 0xD48A,0xD48C,0xD48E,0xD48F,0xD490,0xD491,0xD492,0xD493,/* 0x90-0x97 */ + 0xD495,0xD496,0xD497,0xD498,0xD499,0xD49A,0xD49B,0xD49C,/* 0x98-0x9F */ + 0xD49D,0xC434,0xC43C,0xC43D,0xC448,0xC464,0xC465,0xC468,/* 0xA0-0xA7 */ + 0xC46C,0xC474,0xC475,0xC479,0xC480,0xC494,0xC49C,0xC4B8,/* 0xA8-0xAF */ + 0xC4BC,0xC4E9,0xC4F0,0xC4F1,0xC4F4,0xC4F8,0xC4FA,0xC4FF,/* 0xB0-0xB7 */ + 0xC500,0xC501,0xC50C,0xC510,0xC514,0xC51C,0xC528,0xC529,/* 0xB8-0xBF */ + 0xC52C,0xC530,0xC538,0xC539,0xC53B,0xC53D,0xC544,0xC545,/* 0xC0-0xC7 */ + 0xC548,0xC549,0xC54A,0xC54C,0xC54D,0xC54E,0xC553,0xC554,/* 0xC8-0xCF */ + 0xC555,0xC557,0xC558,0xC559,0xC55D,0xC55E,0xC560,0xC561,/* 0xD0-0xD7 */ + 0xC564,0xC568,0xC570,0xC571,0xC573,0xC574,0xC575,0xC57C,/* 0xD8-0xDF */ + 0xC57D,0xC580,0xC584,0xC587,0xC58C,0xC58D,0xC58F,0xC591,/* 0xE0-0xE7 */ + 0xC595,0xC597,0xC598,0xC59C,0xC5A0,0xC5A9,0xC5B4,0xC5B5,/* 0xE8-0xEF */ + 0xC5B8,0xC5B9,0xC5BB,0xC5BC,0xC5BD,0xC5BE,0xC5C4,0xC5C5,/* 0xF0-0xF7 */ + 0xC5C6,0xC5C7,0xC5C8,0xC5C9,0xC5CA,0xC5CC,0xC5CE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD49E,0xD49F,0xD4A0,0xD4A1,0xD4A2,0xD4A3,0xD4A4,/* 0x40-0x47 */ + 0xD4A5,0xD4A6,0xD4A7,0xD4A8,0xD4AA,0xD4AB,0xD4AC,0xD4AD,/* 0x48-0x4F */ + 0xD4AE,0xD4AF,0xD4B0,0xD4B1,0xD4B2,0xD4B3,0xD4B4,0xD4B5,/* 0x50-0x57 */ + 0xD4B6,0xD4B7,0xD4B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD4B9,0xD4BA,0xD4BB,0xD4BC,0xD4BD,0xD4BE,0xD4BF,/* 0x60-0x67 */ + 0xD4C0,0xD4C1,0xD4C2,0xD4C3,0xD4C4,0xD4C5,0xD4C6,0xD4C7,/* 0x68-0x6F */ + 0xD4C8,0xD4C9,0xD4CA,0xD4CB,0xD4CD,0xD4CE,0xD4CF,0xD4D1,/* 0x70-0x77 */ + 0xD4D2,0xD4D3,0xD4D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD4D6,0xD4D7,0xD4D8,0xD4D9,0xD4DA,0xD4DB,0xD4DD,/* 0x80-0x87 */ + 0xD4DE,0xD4E0,0xD4E1,0xD4E2,0xD4E3,0xD4E4,0xD4E5,0xD4E6,/* 0x88-0x8F */ + 0xD4E7,0xD4E9,0xD4EA,0xD4EB,0xD4ED,0xD4EE,0xD4EF,0xD4F1,/* 0x90-0x97 */ + 0xD4F2,0xD4F3,0xD4F4,0xD4F5,0xD4F6,0xD4F7,0xD4F9,0xD4FA,/* 0x98-0x9F */ + 0xD4FC,0xC5D0,0xC5D1,0xC5D4,0xC5D8,0xC5E0,0xC5E1,0xC5E3,/* 0xA0-0xA7 */ + 0xC5E5,0xC5EC,0xC5ED,0xC5EE,0xC5F0,0xC5F4,0xC5F6,0xC5F7,/* 0xA8-0xAF */ + 0xC5FC,0xC5FD,0xC5FE,0xC5FF,0xC600,0xC601,0xC605,0xC606,/* 0xB0-0xB7 */ + 0xC607,0xC608,0xC60C,0xC610,0xC618,0xC619,0xC61B,0xC61C,/* 0xB8-0xBF */ + 0xC624,0xC625,0xC628,0xC62C,0xC62D,0xC62E,0xC630,0xC633,/* 0xC0-0xC7 */ + 0xC634,0xC635,0xC637,0xC639,0xC63B,0xC640,0xC641,0xC644,/* 0xC8-0xCF */ + 0xC648,0xC650,0xC651,0xC653,0xC654,0xC655,0xC65C,0xC65D,/* 0xD0-0xD7 */ + 0xC660,0xC66C,0xC66F,0xC671,0xC678,0xC679,0xC67C,0xC680,/* 0xD8-0xDF */ + 0xC688,0xC689,0xC68B,0xC68D,0xC694,0xC695,0xC698,0xC69C,/* 0xE0-0xE7 */ + 0xC6A4,0xC6A5,0xC6A7,0xC6A9,0xC6B0,0xC6B1,0xC6B4,0xC6B8,/* 0xE8-0xEF */ + 0xC6B9,0xC6BA,0xC6C0,0xC6C1,0xC6C3,0xC6C5,0xC6CC,0xC6CD,/* 0xF0-0xF7 */ + 
0xC6D0,0xC6D4,0xC6DC,0xC6DD,0xC6E0,0xC6E1,0xC6E8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD4FE,0xD4FF,0xD500,0xD501,0xD502,0xD503,0xD505,/* 0x40-0x47 */ + 0xD506,0xD507,0xD509,0xD50A,0xD50B,0xD50D,0xD50E,0xD50F,/* 0x48-0x4F */ + 0xD510,0xD511,0xD512,0xD513,0xD516,0xD518,0xD519,0xD51A,/* 0x50-0x57 */ + 0xD51B,0xD51C,0xD51D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD51E,0xD51F,0xD520,0xD521,0xD522,0xD523,0xD524,/* 0x60-0x67 */ + 0xD525,0xD526,0xD527,0xD528,0xD529,0xD52A,0xD52B,0xD52C,/* 0x68-0x6F */ + 0xD52D,0xD52E,0xD52F,0xD530,0xD531,0xD532,0xD533,0xD534,/* 0x70-0x77 */ + 0xD535,0xD536,0xD537,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD538,0xD539,0xD53A,0xD53B,0xD53E,0xD53F,0xD541,/* 0x80-0x87 */ + 0xD542,0xD543,0xD545,0xD546,0xD547,0xD548,0xD549,0xD54A,/* 0x88-0x8F */ + 0xD54B,0xD54E,0xD550,0xD552,0xD553,0xD554,0xD555,0xD556,/* 0x90-0x97 */ + 0xD557,0xD55A,0xD55B,0xD55D,0xD55E,0xD55F,0xD561,0xD562,/* 0x98-0x9F */ + 0xD563,0xC6E9,0xC6EC,0xC6F0,0xC6F8,0xC6F9,0xC6FD,0xC704,/* 0xA0-0xA7 */ + 0xC705,0xC708,0xC70C,0xC714,0xC715,0xC717,0xC719,0xC720,/* 0xA8-0xAF */ + 0xC721,0xC724,0xC728,0xC730,0xC731,0xC733,0xC735,0xC737,/* 0xB0-0xB7 */ + 0xC73C,0xC73D,0xC740,0xC744,0xC74A,0xC74C,0xC74D,0xC74F,/* 0xB8-0xBF */ + 0xC751,0xC752,0xC753,0xC754,0xC755,0xC756,0xC757,0xC758,/* 0xC0-0xC7 */ + 0xC75C,0xC760,0xC768,0xC76B,0xC774,0xC775,0xC778,0xC77C,/* 0xC8-0xCF */ + 0xC77D,0xC77E,0xC783,0xC784,0xC785,0xC787,0xC788,0xC789,/* 0xD0-0xD7 */ + 0xC78A,0xC78E,0xC790,0xC791,0xC794,0xC796,0xC797,0xC798,/* 0xD8-0xDF */ + 0xC79A,0xC7A0,0xC7A1,0xC7A3,0xC7A4,0xC7A5,0xC7A6,0xC7AC,/* 0xE0-0xE7 */ + 0xC7AD,0xC7B0,0xC7B4,0xC7BC,0xC7BD,0xC7BF,0xC7C0,0xC7C1,/* 0xE8-0xEF */ + 0xC7C8,0xC7C9,0xC7CC,0xC7CE,0xC7D0,0xC7D8,0xC7DD,0xC7E4,/* 0xF0-0xF7 */ + 0xC7E8,0xC7EC,0xC800,0xC801,0xC804,0xC808,0xC80A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD564,0xD566,0xD567,0xD56A,0xD56C,0xD56E,0xD56F,/* 0x40-0x47 */ + 0xD570,0xD571,0xD572,0xD573,0xD576,0xD577,0xD579,0xD57A,/* 0x48-0x4F */ + 0xD57B,0xD57D,0xD57E,0xD57F,0xD580,0xD581,0xD582,0xD583,/* 0x50-0x57 */ + 0xD586,0xD58A,0xD58B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD58C,0xD58D,0xD58E,0xD58F,0xD591,0xD592,0xD593,/* 0x60-0x67 */ + 0xD594,0xD595,0xD596,0xD597,0xD598,0xD599,0xD59A,0xD59B,/* 
0x68-0x6F */ + 0xD59C,0xD59D,0xD59E,0xD59F,0xD5A0,0xD5A1,0xD5A2,0xD5A3,/* 0x70-0x77 */ + 0xD5A4,0xD5A6,0xD5A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD5A8,0xD5A9,0xD5AA,0xD5AB,0xD5AC,0xD5AD,0xD5AE,/* 0x80-0x87 */ + 0xD5AF,0xD5B0,0xD5B1,0xD5B2,0xD5B3,0xD5B4,0xD5B5,0xD5B6,/* 0x88-0x8F */ + 0xD5B7,0xD5B8,0xD5B9,0xD5BA,0xD5BB,0xD5BC,0xD5BD,0xD5BE,/* 0x90-0x97 */ + 0xD5BF,0xD5C0,0xD5C1,0xD5C2,0xD5C3,0xD5C4,0xD5C5,0xD5C6,/* 0x98-0x9F */ + 0xD5C7,0xC810,0xC811,0xC813,0xC815,0xC816,0xC81C,0xC81D,/* 0xA0-0xA7 */ + 0xC820,0xC824,0xC82C,0xC82D,0xC82F,0xC831,0xC838,0xC83C,/* 0xA8-0xAF */ + 0xC840,0xC848,0xC849,0xC84C,0xC84D,0xC854,0xC870,0xC871,/* 0xB0-0xB7 */ + 0xC874,0xC878,0xC87A,0xC880,0xC881,0xC883,0xC885,0xC886,/* 0xB8-0xBF */ + 0xC887,0xC88B,0xC88C,0xC88D,0xC894,0xC89D,0xC89F,0xC8A1,/* 0xC0-0xC7 */ + 0xC8A8,0xC8BC,0xC8BD,0xC8C4,0xC8C8,0xC8CC,0xC8D4,0xC8D5,/* 0xC8-0xCF */ + 0xC8D7,0xC8D9,0xC8E0,0xC8E1,0xC8E4,0xC8F5,0xC8FC,0xC8FD,/* 0xD0-0xD7 */ + 0xC900,0xC904,0xC905,0xC906,0xC90C,0xC90D,0xC90F,0xC911,/* 0xD8-0xDF */ + 0xC918,0xC92C,0xC934,0xC950,0xC951,0xC954,0xC958,0xC960,/* 0xE0-0xE7 */ + 0xC961,0xC963,0xC96C,0xC970,0xC974,0xC97C,0xC988,0xC989,/* 0xE8-0xEF */ + 0xC98C,0xC990,0xC998,0xC999,0xC99B,0xC99D,0xC9C0,0xC9C1,/* 0xF0-0xF7 */ + 0xC9C4,0xC9C7,0xC9C8,0xC9CA,0xC9D0,0xC9D1,0xC9D3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD5CA,0xD5CB,0xD5CD,0xD5CE,0xD5CF,0xD5D1,0xD5D3,/* 0x40-0x47 */ + 0xD5D4,0xD5D5,0xD5D6,0xD5D7,0xD5DA,0xD5DC,0xD5DE,0xD5DF,/* 0x48-0x4F */ + 0xD5E0,0xD5E1,0xD5E2,0xD5E3,0xD5E6,0xD5E7,0xD5E9,0xD5EA,/* 0x50-0x57 */ + 0xD5EB,0xD5ED,0xD5EE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD5EF,0xD5F0,0xD5F1,0xD5F2,0xD5F3,0xD5F6,0xD5F8,/* 0x60-0x67 */ + 0xD5FA,0xD5FB,0xD5FC,0xD5FD,0xD5FE,0xD5FF,0xD602,0xD603,/* 0x68-0x6F */ + 0xD605,0xD606,0xD607,0xD609,0xD60A,0xD60B,0xD60C,0xD60D,/* 0x70-0x77 */ + 0xD60E,0xD60F,0xD612,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD616,0xD617,0xD618,0xD619,0xD61A,0xD61B,0xD61D,/* 0x80-0x87 */ + 0xD61E,0xD61F,0xD621,0xD622,0xD623,0xD625,0xD626,0xD627,/* 0x88-0x8F */ + 0xD628,0xD629,0xD62A,0xD62B,0xD62C,0xD62E,0xD62F,0xD630,/* 0x90-0x97 */ + 0xD631,0xD632,0xD633,0xD634,0xD635,0xD636,0xD637,0xD63A,/* 0x98-0x9F */ + 0xD63B,0xC9D5,0xC9D6,0xC9D9,0xC9DA,0xC9DC,0xC9DD,0xC9E0,/* 0xA0-0xA7 */ + 0xC9E2,0xC9E4,0xC9E7,0xC9EC,0xC9ED,0xC9EF,0xC9F0,0xC9F1,/* 0xA8-0xAF */ + 0xC9F8,0xC9F9,0xC9FC,0xCA00,0xCA08,0xCA09,0xCA0B,0xCA0C,/* 0xB0-0xB7 */ + 0xCA0D,0xCA14,0xCA18,0xCA29,0xCA4C,0xCA4D,0xCA50,0xCA54,/* 0xB8-0xBF */ + 0xCA5C,0xCA5D,0xCA5F,0xCA60,0xCA61,0xCA68,0xCA7D,0xCA84,/* 0xC0-0xC7 */ + 0xCA98,0xCABC,0xCABD,0xCAC0,0xCAC4,0xCACC,0xCACD,0xCACF,/* 0xC8-0xCF */ + 0xCAD1,0xCAD3,0xCAD8,0xCAD9,0xCAE0,0xCAEC,0xCAF4,0xCB08,/* 0xD0-0xD7 */ + 0xCB10,0xCB14,0xCB18,0xCB20,0xCB21,0xCB41,0xCB48,0xCB49,/* 0xD8-0xDF */ + 0xCB4C,0xCB50,0xCB58,0xCB59,0xCB5D,0xCB64,0xCB78,0xCB79,/* 0xE0-0xE7 */ + 
0xCB9C,0xCBB8,0xCBD4,0xCBE4,0xCBE7,0xCBE9,0xCC0C,0xCC0D,/* 0xE8-0xEF */ + 0xCC10,0xCC14,0xCC1C,0xCC1D,0xCC21,0xCC22,0xCC27,0xCC28,/* 0xF0-0xF7 */ + 0xCC29,0xCC2C,0xCC2E,0xCC30,0xCC38,0xCC39,0xCC3B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD63D,0xD63E,0xD63F,0xD641,0xD642,0xD643,0xD644,/* 0x40-0x47 */ + 0xD646,0xD647,0xD64A,0xD64C,0xD64E,0xD64F,0xD650,0xD652,/* 0x48-0x4F */ + 0xD653,0xD656,0xD657,0xD659,0xD65A,0xD65B,0xD65D,0xD65E,/* 0x50-0x57 */ + 0xD65F,0xD660,0xD661,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD662,0xD663,0xD664,0xD665,0xD666,0xD668,0xD66A,/* 0x60-0x67 */ + 0xD66B,0xD66C,0xD66D,0xD66E,0xD66F,0xD672,0xD673,0xD675,/* 0x68-0x6F */ + 0xD676,0xD677,0xD678,0xD679,0xD67A,0xD67B,0xD67C,0xD67D,/* 0x70-0x77 */ + 0xD67E,0xD67F,0xD680,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD681,0xD682,0xD684,0xD686,0xD687,0xD688,0xD689,/* 0x80-0x87 */ + 0xD68A,0xD68B,0xD68E,0xD68F,0xD691,0xD692,0xD693,0xD695,/* 0x88-0x8F */ + 0xD696,0xD697,0xD698,0xD699,0xD69A,0xD69B,0xD69C,0xD69E,/* 0x90-0x97 */ + 0xD6A0,0xD6A2,0xD6A3,0xD6A4,0xD6A5,0xD6A6,0xD6A7,0xD6A9,/* 0x98-0x9F */ + 0xD6AA,0xCC3C,0xCC3D,0xCC3E,0xCC44,0xCC45,0xCC48,0xCC4C,/* 0xA0-0xA7 */ + 0xCC54,0xCC55,0xCC57,0xCC58,0xCC59,0xCC60,0xCC64,0xCC66,/* 0xA8-0xAF */ + 0xCC68,0xCC70,0xCC75,0xCC98,0xCC99,0xCC9C,0xCCA0,0xCCA8,/* 0xB0-0xB7 */ + 0xCCA9,0xCCAB,0xCCAC,0xCCAD,0xCCB4,0xCCB5,0xCCB8,0xCCBC,/* 0xB8-0xBF */ + 0xCCC4,0xCCC5,0xCCC7,0xCCC9,0xCCD0,0xCCD4,0xCCE4,0xCCEC,/* 0xC0-0xC7 */ + 0xCCF0,0xCD01,0xCD08,0xCD09,0xCD0C,0xCD10,0xCD18,0xCD19,/* 0xC8-0xCF */ + 0xCD1B,0xCD1D,0xCD24,0xCD28,0xCD2C,0xCD39,0xCD5C,0xCD60,/* 0xD0-0xD7 */ + 0xCD64,0xCD6C,0xCD6D,0xCD6F,0xCD71,0xCD78,0xCD88,0xCD94,/* 0xD8-0xDF */ + 0xCD95,0xCD98,0xCD9C,0xCDA4,0xCDA5,0xCDA7,0xCDA9,0xCDB0,/* 0xE0-0xE7 */ + 0xCDC4,0xCDCC,0xCDD0,0xCDE8,0xCDEC,0xCDF0,0xCDF8,0xCDF9,/* 0xE8-0xEF */ + 0xCDFB,0xCDFD,0xCE04,0xCE08,0xCE0C,0xCE14,0xCE19,0xCE20,/* 0xF0-0xF7 */ + 0xCE21,0xCE24,0xCE28,0xCE30,0xCE31,0xCE33,0xCE35,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD6AB,0xD6AD,0xD6AE,0xD6AF,0xD6B1,0xD6B2,0xD6B3,/* 0x40-0x47 */ + 0xD6B4,0xD6B5,0xD6B6,0xD6B7,0xD6B8,0xD6BA,0xD6BC,0xD6BD,/* 0x48-0x4F */ + 0xD6BE,0xD6BF,0xD6C0,0xD6C1,0xD6C2,0xD6C3,0xD6C6,0xD6C7,/* 0x50-0x57 */ + 0xD6C9,0xD6CA,0xD6CB,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x58-0x5F */ + 0x0000,0xD6CD,0xD6CE,0xD6CF,0xD6D0,0xD6D2,0xD6D3,0xD6D5,/* 0x60-0x67 */ + 0xD6D6,0xD6D8,0xD6DA,0xD6DB,0xD6DC,0xD6DD,0xD6DE,0xD6DF,/* 0x68-0x6F */ + 0xD6E1,0xD6E2,0xD6E3,0xD6E5,0xD6E6,0xD6E7,0xD6E9,0xD6EA,/* 0x70-0x77 */ + 0xD6EB,0xD6EC,0xD6ED,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD6EE,0xD6EF,0xD6F1,0xD6F2,0xD6F3,0xD6F4,0xD6F6,/* 0x80-0x87 */ + 0xD6F7,0xD6F8,0xD6F9,0xD6FA,0xD6FB,0xD6FE,0xD6FF,0xD701,/* 0x88-0x8F */ + 0xD702,0xD703,0xD705,0xD706,0xD707,0xD708,0xD709,0xD70A,/* 0x90-0x97 */ + 0xD70B,0xD70C,0xD70D,0xD70E,0xD70F,0xD710,0xD712,0xD713,/* 0x98-0x9F */ + 0xD714,0xCE58,0xCE59,0xCE5C,0xCE5F,0xCE60,0xCE61,0xCE68,/* 0xA0-0xA7 */ + 0xCE69,0xCE6B,0xCE6D,0xCE74,0xCE75,0xCE78,0xCE7C,0xCE84,/* 0xA8-0xAF */ + 0xCE85,0xCE87,0xCE89,0xCE90,0xCE91,0xCE94,0xCE98,0xCEA0,/* 0xB0-0xB7 */ + 0xCEA1,0xCEA3,0xCEA4,0xCEA5,0xCEAC,0xCEAD,0xCEC1,0xCEE4,/* 0xB8-0xBF */ + 0xCEE5,0xCEE8,0xCEEB,0xCEEC,0xCEF4,0xCEF5,0xCEF7,0xCEF8,/* 0xC0-0xC7 */ + 0xCEF9,0xCF00,0xCF01,0xCF04,0xCF08,0xCF10,0xCF11,0xCF13,/* 0xC8-0xCF */ + 0xCF15,0xCF1C,0xCF20,0xCF24,0xCF2C,0xCF2D,0xCF2F,0xCF30,/* 0xD0-0xD7 */ + 0xCF31,0xCF38,0xCF54,0xCF55,0xCF58,0xCF5C,0xCF64,0xCF65,/* 0xD8-0xDF */ + 0xCF67,0xCF69,0xCF70,0xCF71,0xCF74,0xCF78,0xCF80,0xCF85,/* 0xE0-0xE7 */ + 0xCF8C,0xCFA1,0xCFA8,0xCFB0,0xCFC4,0xCFE0,0xCFE1,0xCFE4,/* 0xE8-0xEF */ + 0xCFE8,0xCFF0,0xCFF1,0xCFF3,0xCFF5,0xCFFC,0xD000,0xD004,/* 0xF0-0xF7 */ + 0xD011,0xD018,0xD02D,0xD034,0xD035,0xD038,0xD03C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD715,0xD716,0xD717,0xD71A,0xD71B,0xD71D,0xD71E,/* 0x40-0x47 */ + 0xD71F,0xD721,0xD722,0xD723,0xD724,0xD725,0xD726,0xD727,/* 0x48-0x4F */ + 0xD72A,0xD72C,0xD72E,0xD72F,0xD730,0xD731,0xD732,0xD733,/* 0x50-0x57 */ + 0xD736,0xD737,0xD739,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD73A,0xD73B,0xD73D,0xD73E,0xD73F,0xD740,0xD741,/* 0x60-0x67 */ + 0xD742,0xD743,0xD745,0xD746,0xD748,0xD74A,0xD74B,0xD74C,/* 0x68-0x6F */ + 0xD74D,0xD74E,0xD74F,0xD752,0xD753,0xD755,0xD75A,0xD75B,/* 0x70-0x77 */ + 0xD75C,0xD75D,0xD75E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD75F,0xD762,0xD764,0xD766,0xD767,0xD768,0xD76A,/* 0x80-0x87 */ + 0xD76B,0xD76D,0xD76E,0xD76F,0xD771,0xD772,0xD773,0xD775,/* 0x88-0x8F */ + 0xD776,0xD777,0xD778,0xD779,0xD77A,0xD77B,0xD77E,0xD77F,/* 0x90-0x97 */ + 0xD780,0xD782,0xD783,0xD784,0xD785,0xD786,0xD787,0xD78A,/* 0x98-0x9F */ + 0xD78B,0xD044,0xD045,0xD047,0xD049,0xD050,0xD054,0xD058,/* 0xA0-0xA7 */ + 0xD060,0xD06C,0xD06D,0xD070,0xD074,0xD07C,0xD07D,0xD081,/* 0xA8-0xAF */ + 0xD0A4,0xD0A5,0xD0A8,0xD0AC,0xD0B4,0xD0B5,0xD0B7,0xD0B9,/* 0xB0-0xB7 */ + 0xD0C0,0xD0C1,0xD0C4,0xD0C8,0xD0C9,0xD0D0,0xD0D1,0xD0D3,/* 0xB8-0xBF */ + 0xD0D4,0xD0D5,0xD0DC,0xD0DD,0xD0E0,0xD0E4,0xD0EC,0xD0ED,/* 0xC0-0xC7 */ + 0xD0EF,0xD0F0,0xD0F1,0xD0F8,0xD10D,0xD130,0xD131,0xD134,/* 0xC8-0xCF */ + 0xD138,0xD13A,0xD140,0xD141,0xD143,0xD144,0xD145,0xD14C,/* 0xD0-0xD7 */ + 
0xD14D,0xD150,0xD154,0xD15C,0xD15D,0xD15F,0xD161,0xD168,/* 0xD8-0xDF */ + 0xD16C,0xD17C,0xD184,0xD188,0xD1A0,0xD1A1,0xD1A4,0xD1A8,/* 0xE0-0xE7 */ + 0xD1B0,0xD1B1,0xD1B3,0xD1B5,0xD1BA,0xD1BC,0xD1C0,0xD1D8,/* 0xE8-0xEF */ + 0xD1F4,0xD1F8,0xD207,0xD209,0xD210,0xD22C,0xD22D,0xD230,/* 0xF0-0xF7 */ + 0xD234,0xD23C,0xD23D,0xD23F,0xD241,0xD248,0xD25C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD78D,0xD78E,0xD78F,0xD791,0xD792,0xD793,0xD794,/* 0x40-0x47 */ + 0xD795,0xD796,0xD797,0xD79A,0xD79C,0xD79E,0xD79F,0xD7A0,/* 0x48-0x4F */ + 0xD7A1,0xD7A2,0xD7A3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD264,0xD280,0xD281,0xD284,0xD288,0xD290,0xD291,/* 0xA0-0xA7 */ + 0xD295,0xD29C,0xD2A0,0xD2A4,0xD2AC,0xD2B1,0xD2B8,0xD2B9,/* 0xA8-0xAF */ + 0xD2BC,0xD2BF,0xD2C0,0xD2C2,0xD2C8,0xD2C9,0xD2CB,0xD2D4,/* 0xB0-0xB7 */ + 0xD2D8,0xD2DC,0xD2E4,0xD2E5,0xD2F0,0xD2F1,0xD2F4,0xD2F8,/* 0xB8-0xBF */ + 0xD300,0xD301,0xD303,0xD305,0xD30C,0xD30D,0xD30E,0xD310,/* 0xC0-0xC7 */ + 0xD314,0xD316,0xD31C,0xD31D,0xD31F,0xD320,0xD321,0xD325,/* 0xC8-0xCF */ + 0xD328,0xD329,0xD32C,0xD330,0xD338,0xD339,0xD33B,0xD33C,/* 0xD0-0xD7 */ + 0xD33D,0xD344,0xD345,0xD37C,0xD37D,0xD380,0xD384,0xD38C,/* 0xD8-0xDF */ + 0xD38D,0xD38F,0xD390,0xD391,0xD398,0xD399,0xD39C,0xD3A0,/* 0xE0-0xE7 */ + 0xD3A8,0xD3A9,0xD3AB,0xD3AD,0xD3B4,0xD3B8,0xD3BC,0xD3C4,/* 0xE8-0xEF */ + 0xD3C5,0xD3C8,0xD3C9,0xD3D0,0xD3D8,0xD3E1,0xD3E3,0xD3EC,/* 0xF0-0xF7 */ + 0xD3ED,0xD3F0,0xD3F4,0xD3FC,0xD3FD,0xD3FF,0xD401,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD408,0xD41D,0xD440,0xD444,0xD45C,0xD460,0xD464,/* 0xA0-0xA7 */ + 0xD46D,0xD46F,0xD478,0xD479,0xD47C,0xD47F,0xD480,0xD482,/* 0xA8-0xAF */ + 0xD488,0xD489,0xD48B,0xD48D,0xD494,0xD4A9,0xD4CC,0xD4D0,/* 0xB0-0xB7 */ + 0xD4D4,0xD4DC,0xD4DF,0xD4E8,0xD4EC,0xD4F0,0xD4F8,0xD4FB,/* 0xB8-0xBF */ + 0xD4FD,0xD504,0xD508,0xD50C,0xD514,0xD515,0xD517,0xD53C,/* 0xC0-0xC7 */ + 0xD53D,0xD540,0xD544,0xD54C,0xD54D,0xD54F,0xD551,0xD558,/* 0xC8-0xCF */ + 0xD559,0xD55C,0xD560,0xD565,0xD568,0xD569,0xD56B,0xD56D,/* 0xD0-0xD7 */ + 0xD574,0xD575,0xD578,0xD57C,0xD584,0xD585,0xD587,0xD588,/* 0xD8-0xDF */ + 0xD589,0xD590,0xD5A5,0xD5C8,0xD5C9,0xD5CC,0xD5D0,0xD5D2,/* 0xE0-0xE7 */ + 0xD5D8,0xD5D9,0xD5DB,0xD5DD,0xD5E4,0xD5E5,0xD5E8,0xD5EC,/* 0xE8-0xEF */ + 0xD5F4,0xD5F5,0xD5F7,0xD5F9,0xD600,0xD601,0xD604,0xD608,/* 0xF0-0xF7 */ + 0xD610,0xD611,0xD613,0xD614,0xD615,0xD61C,0xD620,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD624,0xD62D,0xD638,0xD639,0xD63C,0xD640,0xD645,/* 0xA0-0xA7 */ + 0xD648,0xD649,0xD64B,0xD64D,0xD651,0xD654,0xD655,0xD658,/* 0xA8-0xAF */ + 0xD65C,0xD667,0xD669,0xD670,0xD671,0xD674,0xD683,0xD685,/* 0xB0-0xB7 */ + 0xD68C,0xD68D,0xD690,0xD694,0xD69D,0xD69F,0xD6A1,0xD6A8,/* 0xB8-0xBF */ + 0xD6AC,0xD6B0,0xD6B9,0xD6BB,0xD6C4,0xD6C5,0xD6C8,0xD6CC,/* 0xC0-0xC7 */ + 
0xD6D1,0xD6D4,0xD6D7,0xD6D9,0xD6E0,0xD6E4,0xD6E8,0xD6F0,/* 0xC8-0xCF */ + 0xD6F5,0xD6FC,0xD6FD,0xD700,0xD704,0xD711,0xD718,0xD719,/* 0xD0-0xD7 */ + 0xD71C,0xD720,0xD728,0xD729,0xD72B,0xD72D,0xD734,0xD735,/* 0xD8-0xDF */ + 0xD738,0xD73C,0xD744,0xD747,0xD749,0xD750,0xD751,0xD754,/* 0xE0-0xE7 */ + 0xD756,0xD757,0xD758,0xD759,0xD760,0xD761,0xD763,0xD765,/* 0xE8-0xEF */ + 0xD769,0xD76C,0xD770,0xD774,0xD77C,0xD77D,0xD781,0xD788,/* 0xF0-0xF7 */ + 0xD789,0xD78C,0xD790,0xD798,0xD799,0xD79B,0xD79D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4F3D,0x4F73,0x5047,0x50F9,0x52A0,0x53EF,0x5475,/* 0xA0-0xA7 */ + 0x54E5,0x5609,0x5AC1,0x5BB6,0x6687,0x67B6,0x67B7,0x67EF,/* 0xA8-0xAF */ + 0x6B4C,0x73C2,0x75C2,0x7A3C,0x82DB,0x8304,0x8857,0x8888,/* 0xB0-0xB7 */ + 0x8A36,0x8CC8,0x8DCF,0x8EFB,0x8FE6,0x99D5,0x523B,0x5374,/* 0xB8-0xBF */ + 0x5404,0x606A,0x6164,0x6BBC,0x73CF,0x811A,0x89BA,0x89D2,/* 0xC0-0xC7 */ + 0x95A3,0x4F83,0x520A,0x58BE,0x5978,0x59E6,0x5E72,0x5E79,/* 0xC8-0xCF */ + 0x61C7,0x63C0,0x6746,0x67EC,0x687F,0x6F97,0x764E,0x770B,/* 0xD0-0xD7 */ + 0x78F5,0x7A08,0x7AFF,0x7C21,0x809D,0x826E,0x8271,0x8AEB,/* 0xD8-0xDF */ + 0x9593,0x4E6B,0x559D,0x66F7,0x6E34,0x78A3,0x7AED,0x845B,/* 0xE0-0xE7 */ + 0x8910,0x874E,0x97A8,0x52D8,0x574E,0x582A,0x5D4C,0x611F,/* 0xE8-0xEF */ + 0x61BE,0x6221,0x6562,0x67D1,0x6A44,0x6E1B,0x7518,0x75B3,/* 0xF0-0xF7 */ + 0x76E3,0x77B0,0x7D3A,0x90AF,0x9451,0x9452,0x9F95,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5323,0x5CAC,0x7532,0x80DB,0x9240,0x9598,0x525B,/* 0xA0-0xA7 */ + 0x5808,0x59DC,0x5CA1,0x5D17,0x5EB7,0x5F3A,0x5F4A,0x6177,/* 0xA8-0xAF */ + 0x6C5F,0x757A,0x7586,0x7CE0,0x7D73,0x7DB1,0x7F8C,0x8154,/* 0xB0-0xB7 */ + 0x8221,0x8591,0x8941,0x8B1B,0x92FC,0x964D,0x9C47,0x4ECB,/* 0xB8-0xBF */ + 0x4EF7,0x500B,0x51F1,0x584F,0x6137,0x613E,0x6168,0x6539,/* 0xC0-0xC7 */ + 0x69EA,0x6F11,0x75A5,0x7686,0x76D6,0x7B87,0x82A5,0x84CB,/* 0xC8-0xCF */ + 0xF900,0x93A7,0x958B,0x5580,0x5BA2,0x5751,0xF901,0x7CB3,/* 0xD0-0xD7 */ + 0x7FB9,0x91B5,0x5028,0x53BB,0x5C45,0x5DE8,0x62D2,0x636E,/* 0xD8-0xDF */ + 0x64DA,0x64E7,0x6E20,0x70AC,0x795B,0x8DDD,0x8E1E,0xF902,/* 0xE0-0xE7 */ + 0x907D,0x9245,0x92F8,0x4E7E,0x4EF6,0x5065,0x5DFE,0x5EFA,/* 0xE8-0xEF */ + 0x6106,0x6957,0x8171,0x8654,0x8E47,0x9375,0x9A2B,0x4E5E,/* 0xF0-0xF7 */ + 0x5091,0x6770,0x6840,0x5109,0x528D,0x5292,0x6AA2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77BC,0x9210,0x9ED4,0x52AB,0x602F,0x8FF2,0x5048,/* 0xA0-0xA7 */ + 0x61A9,0x63ED,0x64CA,0x683C,0x6A84,0x6FC0,0x8188,0x89A1,/* 0xA8-0xAF */ + 0x9694,0x5805,0x727D,0x72AC,0x7504,0x7D79,0x7E6D,0x80A9,/* 0xB0-0xB7 */ + 
0x898B,0x8B74,0x9063,0x9D51,0x6289,0x6C7A,0x6F54,0x7D50,/* 0xB8-0xBF */ + 0x7F3A,0x8A23,0x517C,0x614A,0x7B9D,0x8B19,0x9257,0x938C,/* 0xC0-0xC7 */ + 0x4EAC,0x4FD3,0x501E,0x50BE,0x5106,0x52C1,0x52CD,0x537F,/* 0xC8-0xCF */ + 0x5770,0x5883,0x5E9A,0x5F91,0x6176,0x61AC,0x64CE,0x656C,/* 0xD0-0xD7 */ + 0x666F,0x66BB,0x66F4,0x6897,0x6D87,0x7085,0x70F1,0x749F,/* 0xD8-0xDF */ + 0x74A5,0x74CA,0x75D9,0x786C,0x78EC,0x7ADF,0x7AF6,0x7D45,/* 0xE0-0xE7 */ + 0x7D93,0x8015,0x803F,0x811B,0x8396,0x8B66,0x8F15,0x9015,/* 0xE8-0xEF */ + 0x93E1,0x9803,0x9838,0x9A5A,0x9BE8,0x4FC2,0x5553,0x583A,/* 0xF0-0xF7 */ + 0x5951,0x5B63,0x5C46,0x60B8,0x6212,0x6842,0x68B0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x68E8,0x6EAA,0x754C,0x7678,0x78CE,0x7A3D,0x7CFB,/* 0xA0-0xA7 */ + 0x7E6B,0x7E7C,0x8A08,0x8AA1,0x8C3F,0x968E,0x9DC4,0x53E4,/* 0xA8-0xAF */ + 0x53E9,0x544A,0x5471,0x56FA,0x59D1,0x5B64,0x5C3B,0x5EAB,/* 0xB0-0xB7 */ + 0x62F7,0x6537,0x6545,0x6572,0x66A0,0x67AF,0x69C1,0x6CBD,/* 0xB8-0xBF */ + 0x75FC,0x7690,0x777E,0x7A3F,0x7F94,0x8003,0x80A1,0x818F,/* 0xC0-0xC7 */ + 0x82E6,0x82FD,0x83F0,0x85C1,0x8831,0x88B4,0x8AA5,0xF903,/* 0xC8-0xCF */ + 0x8F9C,0x932E,0x96C7,0x9867,0x9AD8,0x9F13,0x54ED,0x659B,/* 0xD0-0xD7 */ + 0x66F2,0x688F,0x7A40,0x8C37,0x9D60,0x56F0,0x5764,0x5D11,/* 0xD8-0xDF */ + 0x6606,0x68B1,0x68CD,0x6EFE,0x7428,0x889E,0x9BE4,0x6C68,/* 0xE0-0xE7 */ + 0xF904,0x9AA8,0x4F9B,0x516C,0x5171,0x529F,0x5B54,0x5DE5,/* 0xE8-0xEF */ + 0x6050,0x606D,0x62F1,0x63A7,0x653B,0x73D9,0x7A7A,0x86A3,/* 0xF0-0xF7 */ + 0x8CA2,0x978F,0x4E32,0x5BE1,0x6208,0x679C,0x74DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79D1,0x83D3,0x8A87,0x8AB2,0x8DE8,0x904E,0x934B,/* 0xA0-0xA7 */ + 0x9846,0x5ED3,0x69E8,0x85FF,0x90ED,0xF905,0x51A0,0x5B98,/* 0xA8-0xAF */ + 0x5BEC,0x6163,0x68FA,0x6B3E,0x704C,0x742F,0x74D8,0x7BA1,/* 0xB0-0xB7 */ + 0x7F50,0x83C5,0x89C0,0x8CAB,0x95DC,0x9928,0x522E,0x605D,/* 0xB8-0xBF */ + 0x62EC,0x9002,0x4F8A,0x5149,0x5321,0x58D9,0x5EE3,0x66E0,/* 0xC0-0xC7 */ + 0x6D38,0x709A,0x72C2,0x73D6,0x7B50,0x80F1,0x945B,0x5366,/* 0xC8-0xCF */ + 0x639B,0x7F6B,0x4E56,0x5080,0x584A,0x58DE,0x602A,0x6127,/* 0xD0-0xD7 */ + 0x62D0,0x69D0,0x9B41,0x5B8F,0x7D18,0x80B1,0x8F5F,0x4EA4,/* 0xD8-0xDF */ + 0x50D1,0x54AC,0x55AC,0x5B0C,0x5DA0,0x5DE7,0x652A,0x654E,/* 0xE0-0xE7 */ + 0x6821,0x6A4B,0x72E1,0x768E,0x77EF,0x7D5E,0x7FF9,0x81A0,/* 0xE8-0xEF */ + 0x854E,0x86DF,0x8F03,0x8F4E,0x90CA,0x9903,0x9A55,0x9BAB,/* 0xF0-0xF7 */ + 0x4E18,0x4E45,0x4E5D,0x4EC7,0x4FF1,0x5177,0x52FE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5340,0x53E3,0x53E5,0x548E,0x5614,0x5775,0x57A2,/* 0xA0-0xA7 */ + 
0x5BC7,0x5D87,0x5ED0,0x61FC,0x62D8,0x6551,0x67B8,0x67E9,/* 0xA8-0xAF */ + 0x69CB,0x6B50,0x6BC6,0x6BEC,0x6C42,0x6E9D,0x7078,0x72D7,/* 0xB0-0xB7 */ + 0x7396,0x7403,0x77BF,0x77E9,0x7A76,0x7D7F,0x8009,0x81FC,/* 0xB8-0xBF */ + 0x8205,0x820A,0x82DF,0x8862,0x8B33,0x8CFC,0x8EC0,0x9011,/* 0xC0-0xC7 */ + 0x90B1,0x9264,0x92B6,0x99D2,0x9A45,0x9CE9,0x9DD7,0x9F9C,/* 0xC8-0xCF */ + 0x570B,0x5C40,0x83CA,0x97A0,0x97AB,0x9EB4,0x541B,0x7A98,/* 0xD0-0xD7 */ + 0x7FA4,0x88D9,0x8ECD,0x90E1,0x5800,0x5C48,0x6398,0x7A9F,/* 0xD8-0xDF */ + 0x5BAE,0x5F13,0x7A79,0x7AAE,0x828E,0x8EAC,0x5026,0x5238,/* 0xE0-0xE7 */ + 0x52F8,0x5377,0x5708,0x62F3,0x6372,0x6B0A,0x6DC3,0x7737,/* 0xE8-0xEF */ + 0x53A5,0x7357,0x8568,0x8E76,0x95D5,0x673A,0x6AC3,0x6F70,/* 0xF0-0xF7 */ + 0x8A6D,0x8ECC,0x994B,0xF906,0x6677,0x6B78,0x8CB4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9B3C,0xF907,0x53EB,0x572D,0x594E,0x63C6,0x69FB,/* 0xA0-0xA7 */ + 0x73EA,0x7845,0x7ABA,0x7AC5,0x7CFE,0x8475,0x898F,0x8D73,/* 0xA8-0xAF */ + 0x9035,0x95A8,0x52FB,0x5747,0x7547,0x7B60,0x83CC,0x921E,/* 0xB0-0xB7 */ + 0xF908,0x6A58,0x514B,0x524B,0x5287,0x621F,0x68D8,0x6975,/* 0xB8-0xBF */ + 0x9699,0x50C5,0x52A4,0x52E4,0x61C3,0x65A4,0x6839,0x69FF,/* 0xC0-0xC7 */ + 0x747E,0x7B4B,0x82B9,0x83EB,0x89B2,0x8B39,0x8FD1,0x9949,/* 0xC8-0xCF */ + 0xF909,0x4ECA,0x5997,0x64D2,0x6611,0x6A8E,0x7434,0x7981,/* 0xD0-0xD7 */ + 0x79BD,0x82A9,0x887E,0x887F,0x895F,0xF90A,0x9326,0x4F0B,/* 0xD8-0xDF */ + 0x53CA,0x6025,0x6271,0x6C72,0x7D1A,0x7D66,0x4E98,0x5162,/* 0xE0-0xE7 */ + 0x77DC,0x80AF,0x4F01,0x4F0E,0x5176,0x5180,0x55DC,0x5668,/* 0xE8-0xEF */ + 0x573B,0x57FA,0x57FC,0x5914,0x5947,0x5993,0x5BC4,0x5C90,/* 0xF0-0xF7 */ + 0x5D0E,0x5DF1,0x5E7E,0x5FCC,0x6280,0x65D7,0x65E3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x671E,0x671F,0x675E,0x68CB,0x68C4,0x6A5F,0x6B3A,/* 0xA0-0xA7 */ + 0x6C23,0x6C7D,0x6C82,0x6DC7,0x7398,0x7426,0x742A,0x7482,/* 0xA8-0xAF */ + 0x74A3,0x7578,0x757F,0x7881,0x78EF,0x7941,0x7947,0x7948,/* 0xB0-0xB7 */ + 0x797A,0x7B95,0x7D00,0x7DBA,0x7F88,0x8006,0x802D,0x808C,/* 0xB8-0xBF */ + 0x8A18,0x8B4F,0x8C48,0x8D77,0x9321,0x9324,0x98E2,0x9951,/* 0xC0-0xC7 */ + 0x9A0E,0x9A0F,0x9A65,0x9E92,0x7DCA,0x4F76,0x5409,0x62EE,/* 0xC8-0xCF */ + 0x6854,0x91D1,0x55AB,0x513A,0xF90B,0xF90C,0x5A1C,0x61E6,/* 0xD0-0xD7 */ + 0xF90D,0x62CF,0x62FF,0xF90E,0xF90F,0xF910,0xF911,0xF912,/* 0xD8-0xDF */ + 0xF913,0x90A3,0xF914,0xF915,0xF916,0xF917,0xF918,0x8AFE,/* 0xE0-0xE7 */ + 0xF919,0xF91A,0xF91B,0xF91C,0x6696,0xF91D,0x7156,0xF91E,/* 0xE8-0xEF */ + 0xF91F,0x96E3,0xF920,0x634F,0x637A,0x5357,0xF921,0x678F,/* 0xF0-0xF7 */ + 0x6960,0x6E73,0xF922,0x7537,0xF923,0xF924,0xF925,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7D0D,0xF926,0xF927,0x8872,0x56CA,0x5A18,0xF928,/* 0xA0-0xA7 */ + 0xF929,0xF92A,0xF92B,0xF92C,0x4E43,0xF92D,0x5167,0x5948,/* 0xA8-0xAF */ + 0x67F0,0x8010,0xF92E,0x5973,0x5E74,0x649A,0x79CA,0x5FF5,/* 0xB0-0xB7 */ + 0x606C,0x62C8,0x637B,0x5BE7,0x5BD7,0x52AA,0xF92F,0x5974,/* 0xB8-0xBF */ + 0x5F29,0x6012,0xF930,0xF931,0xF932,0x7459,0xF933,0xF934,/* 0xC0-0xC7 */ + 0xF935,0xF936,0xF937,0xF938,0x99D1,0xF939,0xF93A,0xF93B,/* 0xC8-0xCF */ + 0xF93C,0xF93D,0xF93E,0xF93F,0xF940,0xF941,0xF942,0xF943,/* 0xD0-0xD7 */ + 0x6FC3,0xF944,0xF945,0x81BF,0x8FB2,0x60F1,0xF946,0xF947,/* 0xD8-0xDF */ + 0x8166,0xF948,0xF949,0x5C3F,0xF94A,0xF94B,0xF94C,0xF94D,/* 0xE0-0xE7 */ + 0xF94E,0xF94F,0xF950,0xF951,0x5AE9,0x8A25,0x677B,0x7D10,/* 0xE8-0xEF */ + 0xF952,0xF953,0xF954,0xF955,0xF956,0xF957,0x80FD,0xF958,/* 0xF0-0xF7 */ + 0xF959,0x5C3C,0x6CE5,0x533F,0x6EBA,0x591A,0x8336,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4E39,0x4EB6,0x4F46,0x55AE,0x5718,0x58C7,0x5F56,/* 0xA0-0xA7 */ + 0x65B7,0x65E6,0x6A80,0x6BB5,0x6E4D,0x77ED,0x7AEF,0x7C1E,/* 0xA8-0xAF */ + 0x7DDE,0x86CB,0x8892,0x9132,0x935B,0x64BB,0x6FBE,0x737A,/* 0xB0-0xB7 */ + 0x75B8,0x9054,0x5556,0x574D,0x61BA,0x64D4,0x66C7,0x6DE1,/* 0xB8-0xBF */ + 0x6E5B,0x6F6D,0x6FB9,0x75F0,0x8043,0x81BD,0x8541,0x8983,/* 0xC0-0xC7 */ + 0x8AC7,0x8B5A,0x931F,0x6C93,0x7553,0x7B54,0x8E0F,0x905D,/* 0xC8-0xCF */ + 0x5510,0x5802,0x5858,0x5E62,0x6207,0x649E,0x68E0,0x7576,/* 0xD0-0xD7 */ + 0x7CD6,0x87B3,0x9EE8,0x4EE3,0x5788,0x576E,0x5927,0x5C0D,/* 0xD8-0xDF */ + 0x5CB1,0x5E36,0x5F85,0x6234,0x64E1,0x73B3,0x81FA,0x888B,/* 0xE0-0xE7 */ + 0x8CB8,0x968A,0x9EDB,0x5B85,0x5FB7,0x60B3,0x5012,0x5200,/* 0xE8-0xEF */ + 0x5230,0x5716,0x5835,0x5857,0x5C0E,0x5C60,0x5CF6,0x5D8B,/* 0xF0-0xF7 */ + 0x5EA6,0x5F92,0x60BC,0x6311,0x6389,0x6417,0x6843,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x68F9,0x6AC2,0x6DD8,0x6E21,0x6ED4,0x6FE4,0x71FE,/* 0xA0-0xA7 */ + 0x76DC,0x7779,0x79B1,0x7A3B,0x8404,0x89A9,0x8CED,0x8DF3,/* 0xA8-0xAF */ + 0x8E48,0x9003,0x9014,0x9053,0x90FD,0x934D,0x9676,0x97DC,/* 0xB0-0xB7 */ + 0x6BD2,0x7006,0x7258,0x72A2,0x7368,0x7763,0x79BF,0x7BE4,/* 0xB8-0xBF */ + 0x7E9B,0x8B80,0x58A9,0x60C7,0x6566,0x65FD,0x66BE,0x6C8C,/* 0xC0-0xC7 */ + 0x711E,0x71C9,0x8C5A,0x9813,0x4E6D,0x7A81,0x4EDD,0x51AC,/* 0xC8-0xCF */ + 0x51CD,0x52D5,0x540C,0x61A7,0x6771,0x6850,0x68DF,0x6D1E,/* 0xD0-0xD7 */ + 0x6F7C,0x75BC,0x77B3,0x7AE5,0x80F4,0x8463,0x9285,0x515C,/* 0xD8-0xDF */ + 0x6597,0x675C,0x6793,0x75D8,0x7AC7,0x8373,0xF95A,0x8C46,/* 0xE0-0xE7 */ + 0x9017,0x982D,0x5C6F,0x81C0,0x829A,0x9041,0x906F,0x920D,/* 0xE8-0xEF */ + 0x5F97,0x5D9D,0x6A59,0x71C8,0x767B,0x7B49,0x85E4,0x8B04,/* 0xF0-0xF7 */ + 0x9127,0x9A30,0x5587,0x61F6,0xF95B,0x7669,0x7F85,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x863F,0x87BA,0x88F8,0x908F,0xF95C,0x6D1B,0x70D9,/* 0xA0-0xA7 */ + 0x73DE,0x7D61,0x843D,0xF95D,0x916A,0x99F1,0xF95E,0x4E82,/* 0xA8-0xAF */ + 0x5375,0x6B04,0x6B12,0x703E,0x721B,0x862D,0x9E1E,0x524C,/* 0xB0-0xB7 */ + 0x8FA3,0x5D50,0x64E5,0x652C,0x6B16,0x6FEB,0x7C43,0x7E9C,/* 0xB8-0xBF */ + 0x85CD,0x8964,0x89BD,0x62C9,0x81D8,0x881F,0x5ECA,0x6717,/* 0xC0-0xC7 */ + 0x6D6A,0x72FC,0x7405,0x746F,0x8782,0x90DE,0x4F86,0x5D0D,/* 0xC8-0xCF */ + 0x5FA0,0x840A,0x51B7,0x63A0,0x7565,0x4EAE,0x5006,0x5169,/* 0xD0-0xD7 */ + 0x51C9,0x6881,0x6A11,0x7CAE,0x7CB1,0x7CE7,0x826F,0x8AD2,/* 0xD8-0xDF */ + 0x8F1B,0x91CF,0x4FB6,0x5137,0x52F5,0x5442,0x5EEC,0x616E,/* 0xE0-0xE7 */ + 0x623E,0x65C5,0x6ADA,0x6FFE,0x792A,0x85DC,0x8823,0x95AD,/* 0xE8-0xEF */ + 0x9A62,0x9A6A,0x9E97,0x9ECE,0x529B,0x66C6,0x6B77,0x701D,/* 0xF0-0xF7 */ + 0x792B,0x8F62,0x9742,0x6190,0x6200,0x6523,0x6F23,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7149,0x7489,0x7DF4,0x806F,0x84EE,0x8F26,0x9023,/* 0xA0-0xA7 */ + 0x934A,0x51BD,0x5217,0x52A3,0x6D0C,0x70C8,0x88C2,0x5EC9,/* 0xA8-0xAF */ + 0x6582,0x6BAE,0x6FC2,0x7C3E,0x7375,0x4EE4,0x4F36,0x56F9,/* 0xB0-0xB7 */ + 0xF95F,0x5CBA,0x5DBA,0x601C,0x73B2,0x7B2D,0x7F9A,0x7FCE,/* 0xB8-0xBF */ + 0x8046,0x901E,0x9234,0x96F6,0x9748,0x9818,0x9F61,0x4F8B,/* 0xC0-0xC7 */ + 0x6FA7,0x79AE,0x91B4,0x96B7,0x52DE,0xF960,0x6488,0x64C4,/* 0xC8-0xCF */ + 0x6AD3,0x6F5E,0x7018,0x7210,0x76E7,0x8001,0x8606,0x865C,/* 0xD0-0xD7 */ + 0x8DEF,0x8F05,0x9732,0x9B6F,0x9DFA,0x9E75,0x788C,0x797F,/* 0xD8-0xDF */ + 0x7DA0,0x83C9,0x9304,0x9E7F,0x9E93,0x8AD6,0x58DF,0x5F04,/* 0xE0-0xE7 */ + 0x6727,0x7027,0x74CF,0x7C60,0x807E,0x5121,0x7028,0x7262,/* 0xE8-0xEF */ + 0x78CA,0x8CC2,0x8CDA,0x8CF4,0x96F7,0x4E86,0x50DA,0x5BEE,/* 0xF0-0xF7 */ + 0x5ED6,0x6599,0x71CE,0x7642,0x77AD,0x804A,0x84FC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x907C,0x9B27,0x9F8D,0x58D8,0x5A41,0x5C62,0x6A13,/* 0xA0-0xA7 */ + 0x6DDA,0x6F0F,0x763B,0x7D2F,0x7E37,0x851E,0x8938,0x93E4,/* 0xA8-0xAF */ + 0x964B,0x5289,0x65D2,0x67F3,0x69B4,0x6D41,0x6E9C,0x700F,/* 0xB0-0xB7 */ + 0x7409,0x7460,0x7559,0x7624,0x786B,0x8B2C,0x985E,0x516D,/* 0xB8-0xBF */ + 0x622E,0x9678,0x4F96,0x502B,0x5D19,0x6DEA,0x7DB8,0x8F2A,/* 0xC0-0xC7 */ + 0x5F8B,0x6144,0x6817,0xF961,0x9686,0x52D2,0x808B,0x51DC,/* 0xC8-0xCF */ + 0x51CC,0x695E,0x7A1C,0x7DBE,0x83F1,0x9675,0x4FDA,0x5229,/* 0xD0-0xD7 */ + 0x5398,0x540F,0x550E,0x5C65,0x60A7,0x674E,0x68A8,0x6D6C,/* 0xD8-0xDF */ + 0x7281,0x72F8,0x7406,0x7483,0xF962,0x75E2,0x7C6C,0x7F79,/* 0xE0-0xE7 */ + 0x7FB8,0x8389,0x88CF,0x88E1,0x91CC,0x91D0,0x96E2,0x9BC9,/* 0xE8-0xEF */ + 0x541D,0x6F7E,0x71D0,0x7498,0x85FA,0x8EAA,0x96A3,0x9C57,/* 0xF0-0xF7 */ + 0x9E9F,0x6797,0x6DCB,0x7433,0x81E8,0x9716,0x782C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7ACB,0x7B20,0x7C92,0x6469,0x746A,0x75F2,0x78BC,/* 0xA0-0xA7 */ + 0x78E8,0x99AC,0x9B54,0x9EBB,0x5BDE,0x5E55,0x6F20,0x819C,/* 0xA8-0xAF */ + 0x83AB,0x9088,0x4E07,0x534D,0x5A29,0x5DD2,0x5F4E,0x6162,/* 0xB0-0xB7 */ + 0x633D,0x6669,0x66FC,0x6EFF,0x6F2B,0x7063,0x779E,0x842C,/* 0xB8-0xBF */ + 0x8513,0x883B,0x8F13,0x9945,0x9C3B,0x551C,0x62B9,0x672B,/* 0xC0-0xC7 */ + 0x6CAB,0x8309,0x896A,0x977A,0x4EA1,0x5984,0x5FD8,0x5FD9,/* 0xC8-0xCF */ + 0x671B,0x7DB2,0x7F54,0x8292,0x832B,0x83BD,0x8F1E,0x9099,/* 0xD0-0xD7 */ + 0x57CB,0x59B9,0x5A92,0x5BD0,0x6627,0x679A,0x6885,0x6BCF,/* 0xD8-0xDF */ + 0x7164,0x7F75,0x8CB7,0x8CE3,0x9081,0x9B45,0x8108,0x8C8A,/* 0xE0-0xE7 */ + 0x964C,0x9A40,0x9EA5,0x5B5F,0x6C13,0x731B,0x76F2,0x76DF,/* 0xE8-0xEF */ + 0x840C,0x51AA,0x8993,0x514D,0x5195,0x52C9,0x68C9,0x6C94,/* 0xF0-0xF7 */ + 0x7704,0x7720,0x7DBF,0x7DEC,0x9762,0x9EB5,0x6EC5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8511,0x51A5,0x540D,0x547D,0x660E,0x669D,0x6927,/* 0xA0-0xA7 */ + 0x6E9F,0x76BF,0x7791,0x8317,0x84C2,0x879F,0x9169,0x9298,/* 0xA8-0xAF */ + 0x9CF4,0x8882,0x4FAE,0x5192,0x52DF,0x59C6,0x5E3D,0x6155,/* 0xB0-0xB7 */ + 0x6478,0x6479,0x66AE,0x67D0,0x6A21,0x6BCD,0x6BDB,0x725F,/* 0xB8-0xBF */ + 0x7261,0x7441,0x7738,0x77DB,0x8017,0x82BC,0x8305,0x8B00,/* 0xC0-0xC7 */ + 0x8B28,0x8C8C,0x6728,0x6C90,0x7267,0x76EE,0x7766,0x7A46,/* 0xC8-0xCF */ + 0x9DA9,0x6B7F,0x6C92,0x5922,0x6726,0x8499,0x536F,0x5893,/* 0xD0-0xD7 */ + 0x5999,0x5EDF,0x63CF,0x6634,0x6773,0x6E3A,0x732B,0x7AD7,/* 0xD8-0xDF */ + 0x82D7,0x9328,0x52D9,0x5DEB,0x61AE,0x61CB,0x620A,0x62C7,/* 0xE0-0xE7 */ + 0x64AB,0x65E0,0x6959,0x6B66,0x6BCB,0x7121,0x73F7,0x755D,/* 0xE8-0xEF */ + 
0x7E46,0x821E,0x8302,0x856A,0x8AA3,0x8CBF,0x9727,0x9D61,/* 0xF0-0xF7 */ + 0x58A8,0x9ED8,0x5011,0x520E,0x543B,0x554F,0x6587,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6C76,0x7D0A,0x7D0B,0x805E,0x868A,0x9580,0x96EF,/* 0xA0-0xA7 */ + 0x52FF,0x6C95,0x7269,0x5473,0x5A9A,0x5C3E,0x5D4B,0x5F4C,/* 0xA8-0xAF */ + 0x5FAE,0x672A,0x68B6,0x6963,0x6E3C,0x6E44,0x7709,0x7C73,/* 0xB0-0xB7 */ + 0x7F8E,0x8587,0x8B0E,0x8FF7,0x9761,0x9EF4,0x5CB7,0x60B6,/* 0xB8-0xBF */ + 0x610D,0x61AB,0x654F,0x65FB,0x65FC,0x6C11,0x6CEF,0x739F,/* 0xC0-0xC7 */ + 0x73C9,0x7DE1,0x9594,0x5BC6,0x871C,0x8B10,0x525D,0x535A,/* 0xC8-0xCF */ + 0x62CD,0x640F,0x64B2,0x6734,0x6A38,0x6CCA,0x73C0,0x749E,/* 0xD0-0xD7 */ + 0x7B94,0x7C95,0x7E1B,0x818A,0x8236,0x8584,0x8FEB,0x96F9,/* 0xD8-0xDF */ + 0x99C1,0x4F34,0x534A,0x53CD,0x53DB,0x62CC,0x642C,0x6500,/* 0xE0-0xE7 */ + 0x6591,0x69C3,0x6CEE,0x6F58,0x73ED,0x7554,0x7622,0x76E4,/* 0xE8-0xEF */ + 0x76FC,0x78D0,0x78FB,0x792C,0x7D46,0x822C,0x87E0,0x8FD4,/* 0xF0-0xF7 */ + 0x9812,0x98EF,0x52C3,0x62D4,0x64A5,0x6E24,0x6F51,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x767C,0x8DCB,0x91B1,0x9262,0x9AEE,0x9B43,0x5023,/* 0xA0-0xA7 */ + 0x508D,0x574A,0x59A8,0x5C28,0x5E47,0x5F77,0x623F,0x653E,/* 0xA8-0xAF */ + 0x65B9,0x65C1,0x6609,0x678B,0x699C,0x6EC2,0x78C5,0x7D21,/* 0xB0-0xB7 */ + 0x80AA,0x8180,0x822B,0x82B3,0x84A1,0x868C,0x8A2A,0x8B17,/* 0xB8-0xBF */ + 0x90A6,0x9632,0x9F90,0x500D,0x4FF3,0xF963,0x57F9,0x5F98,/* 0xC0-0xC7 */ + 0x62DC,0x6392,0x676F,0x6E43,0x7119,0x76C3,0x80CC,0x80DA,/* 0xC8-0xCF */ + 0x88F4,0x88F5,0x8919,0x8CE0,0x8F29,0x914D,0x966A,0x4F2F,/* 0xD0-0xD7 */ + 0x4F70,0x5E1B,0x67CF,0x6822,0x767D,0x767E,0x9B44,0x5E61,/* 0xD8-0xDF */ + 0x6A0A,0x7169,0x71D4,0x756A,0xF964,0x7E41,0x8543,0x85E9,/* 0xE0-0xE7 */ + 0x98DC,0x4F10,0x7B4F,0x7F70,0x95A5,0x51E1,0x5E06,0x68B5,/* 0xE8-0xEF */ + 0x6C3E,0x6C4E,0x6CDB,0x72AF,0x7BC4,0x8303,0x6CD5,0x743A,/* 0xF0-0xF7 */ + 0x50FB,0x5288,0x58C1,0x64D8,0x6A97,0x74A7,0x7656,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x78A7,0x8617,0x95E2,0x9739,0xF965,0x535E,0x5F01,/* 0xA0-0xA7 */ + 0x8B8A,0x8FA8,0x8FAF,0x908A,0x5225,0x77A5,0x9C49,0x9F08,/* 0xA8-0xAF */ + 0x4E19,0x5002,0x5175,0x5C5B,0x5E77,0x661E,0x663A,0x67C4,/* 0xB0-0xB7 */ + 0x68C5,0x70B3,0x7501,0x75C5,0x79C9,0x7ADD,0x8F27,0x9920,/* 0xB8-0xBF */ + 0x9A08,0x4FDD,0x5821,0x5831,0x5BF6,0x666E,0x6B65,0x6D11,/* 0xC0-0xC7 */ + 0x6E7A,0x6F7D,0x73E4,0x752B,0x83E9,0x88DC,0x8913,0x8B5C,/* 0xC8-0xCF */ + 0x8F14,0x4F0F,0x50D5,0x5310,0x535C,0x5B93,0x5FA9,0x670D,/* 0xD0-0xD7 */ + 0x798F,0x8179,0x832F,0x8514,0x8907,0x8986,0x8F39,0x8F3B,/* 0xD8-0xDF */ + 
0x99A5,0x9C12,0x672C,0x4E76,0x4FF8,0x5949,0x5C01,0x5CEF,/* 0xE0-0xE7 */ + 0x5CF0,0x6367,0x68D2,0x70FD,0x71A2,0x742B,0x7E2B,0x84EC,/* 0xE8-0xEF */ + 0x8702,0x9022,0x92D2,0x9CF3,0x4E0D,0x4ED8,0x4FEF,0x5085,/* 0xF0-0xF7 */ + 0x5256,0x526F,0x5426,0x5490,0x57E0,0x592B,0x5A66,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5B5A,0x5B75,0x5BCC,0x5E9C,0xF966,0x6276,0x6577,/* 0xA0-0xA7 */ + 0x65A7,0x6D6E,0x6EA5,0x7236,0x7B26,0x7C3F,0x7F36,0x8150,/* 0xA8-0xAF */ + 0x8151,0x819A,0x8240,0x8299,0x83A9,0x8A03,0x8CA0,0x8CE6,/* 0xB0-0xB7 */ + 0x8CFB,0x8D74,0x8DBA,0x90E8,0x91DC,0x961C,0x9644,0x99D9,/* 0xB8-0xBF */ + 0x9CE7,0x5317,0x5206,0x5429,0x5674,0x58B3,0x5954,0x596E,/* 0xC0-0xC7 */ + 0x5FFF,0x61A4,0x626E,0x6610,0x6C7E,0x711A,0x76C6,0x7C89,/* 0xC8-0xCF */ + 0x7CDE,0x7D1B,0x82AC,0x8CC1,0x96F0,0xF967,0x4F5B,0x5F17,/* 0xD0-0xD7 */ + 0x5F7F,0x62C2,0x5D29,0x670B,0x68DA,0x787C,0x7E43,0x9D6C,/* 0xD8-0xDF */ + 0x4E15,0x5099,0x5315,0x532A,0x5351,0x5983,0x5A62,0x5E87,/* 0xE0-0xE7 */ + 0x60B2,0x618A,0x6249,0x6279,0x6590,0x6787,0x69A7,0x6BD4,/* 0xE8-0xEF */ + 0x6BD6,0x6BD7,0x6BD8,0x6CB8,0xF968,0x7435,0x75FA,0x7812,/* 0xF0-0xF7 */ + 0x7891,0x79D5,0x79D8,0x7C83,0x7DCB,0x7FE1,0x80A5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x813E,0x81C2,0x83F2,0x871A,0x88E8,0x8AB9,0x8B6C,/* 0xA0-0xA7 */ + 0x8CBB,0x9119,0x975E,0x98DB,0x9F3B,0x56AC,0x5B2A,0x5F6C,/* 0xA8-0xAF */ + 0x658C,0x6AB3,0x6BAF,0x6D5C,0x6FF1,0x7015,0x725D,0x73AD,/* 0xB0-0xB7 */ + 0x8CA7,0x8CD3,0x983B,0x6191,0x6C37,0x8058,0x9A01,0x4E4D,/* 0xB8-0xBF */ + 0x4E8B,0x4E9B,0x4ED5,0x4F3A,0x4F3C,0x4F7F,0x4FDF,0x50FF,/* 0xC0-0xC7 */ + 0x53F2,0x53F8,0x5506,0x55E3,0x56DB,0x58EB,0x5962,0x5A11,/* 0xC8-0xCF */ + 0x5BEB,0x5BFA,0x5C04,0x5DF3,0x5E2B,0x5F99,0x601D,0x6368,/* 0xD0-0xD7 */ + 0x659C,0x65AF,0x67F6,0x67FB,0x68AD,0x6B7B,0x6C99,0x6CD7,/* 0xD8-0xDF */ + 0x6E23,0x7009,0x7345,0x7802,0x793E,0x7940,0x7960,0x79C1,/* 0xE0-0xE7 */ + 0x7BE9,0x7D17,0x7D72,0x8086,0x820D,0x838E,0x84D1,0x86C7,/* 0xE8-0xEF */ + 0x88DF,0x8A50,0x8A5E,0x8B1D,0x8CDC,0x8D66,0x8FAD,0x90AA,/* 0xF0-0xF7 */ + 0x98FC,0x99DF,0x9E9D,0x524A,0xF969,0x6714,0xF96A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5098,0x522A,0x5C71,0x6563,0x6C55,0x73CA,0x7523,/* 0xA0-0xA7 */ + 0x759D,0x7B97,0x849C,0x9178,0x9730,0x4E77,0x6492,0x6BBA,/* 0xA8-0xAF */ + 0x715E,0x85A9,0x4E09,0xF96B,0x6749,0x68EE,0x6E17,0x829F,/* 0xB0-0xB7 */ + 0x8518,0x886B,0x63F7,0x6F81,0x9212,0x98AF,0x4E0A,0x50B7,/* 0xB8-0xBF */ + 0x50CF,0x511F,0x5546,0x55AA,0x5617,0x5B40,0x5C19,0x5CE0,/* 0xC0-0xC7 */ + 0x5E38,0x5E8A,0x5EA0,0x5EC2,0x60F3,0x6851,0x6A61,0x6E58,/* 0xC8-0xCF */ + 
0x723D,0x7240,0x72C0,0x76F8,0x7965,0x7BB1,0x7FD4,0x88F3,/* 0xD0-0xD7 */ + 0x89F4,0x8A73,0x8C61,0x8CDE,0x971C,0x585E,0x74BD,0x8CFD,/* 0xD8-0xDF */ + 0x55C7,0xF96C,0x7A61,0x7D22,0x8272,0x7272,0x751F,0x7525,/* 0xE0-0xE7 */ + 0xF96D,0x7B19,0x5885,0x58FB,0x5DBC,0x5E8F,0x5EB6,0x5F90,/* 0xE8-0xEF */ + 0x6055,0x6292,0x637F,0x654D,0x6691,0x66D9,0x66F8,0x6816,/* 0xF0-0xF7 */ + 0x68F2,0x7280,0x745E,0x7B6E,0x7D6E,0x7DD6,0x7F72,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x80E5,0x8212,0x85AF,0x897F,0x8A93,0x901D,0x92E4,/* 0xA0-0xA7 */ + 0x9ECD,0x9F20,0x5915,0x596D,0x5E2D,0x60DC,0x6614,0x6673,/* 0xA8-0xAF */ + 0x6790,0x6C50,0x6DC5,0x6F5F,0x77F3,0x78A9,0x84C6,0x91CB,/* 0xB0-0xB7 */ + 0x932B,0x4ED9,0x50CA,0x5148,0x5584,0x5B0B,0x5BA3,0x6247,/* 0xB8-0xBF */ + 0x657E,0x65CB,0x6E32,0x717D,0x7401,0x7444,0x7487,0x74BF,/* 0xC0-0xC7 */ + 0x766C,0x79AA,0x7DDA,0x7E55,0x7FA8,0x817A,0x81B3,0x8239,/* 0xC8-0xCF */ + 0x861A,0x87EC,0x8A75,0x8DE3,0x9078,0x9291,0x9425,0x994D,/* 0xD0-0xD7 */ + 0x9BAE,0x5368,0x5C51,0x6954,0x6CC4,0x6D29,0x6E2B,0x820C,/* 0xD8-0xDF */ + 0x859B,0x893B,0x8A2D,0x8AAA,0x96EA,0x9F67,0x5261,0x66B9,/* 0xE0-0xE7 */ + 0x6BB2,0x7E96,0x87FE,0x8D0D,0x9583,0x965D,0x651D,0x6D89,/* 0xE8-0xEF */ + 0x71EE,0xF96E,0x57CE,0x59D3,0x5BAC,0x6027,0x60FA,0x6210,/* 0xF0-0xF7 */ + 0x661F,0x665F,0x7329,0x73F9,0x76DB,0x7701,0x7B6C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8056,0x8072,0x8165,0x8AA0,0x9192,0x4E16,0x52E2,/* 0xA0-0xA7 */ + 0x6B72,0x6D17,0x7A05,0x7B39,0x7D30,0xF96F,0x8CB0,0x53EC,/* 0xA8-0xAF */ + 0x562F,0x5851,0x5BB5,0x5C0F,0x5C11,0x5DE2,0x6240,0x6383,/* 0xB0-0xB7 */ + 0x6414,0x662D,0x68B3,0x6CBC,0x6D88,0x6EAF,0x701F,0x70A4,/* 0xB8-0xBF */ + 0x71D2,0x7526,0x758F,0x758E,0x7619,0x7B11,0x7BE0,0x7C2B,/* 0xC0-0xC7 */ + 0x7D20,0x7D39,0x852C,0x856D,0x8607,0x8A34,0x900D,0x9061,/* 0xC8-0xCF */ + 0x90B5,0x92B7,0x97F6,0x9A37,0x4FD7,0x5C6C,0x675F,0x6D91,/* 0xD0-0xD7 */ + 0x7C9F,0x7E8C,0x8B16,0x8D16,0x901F,0x5B6B,0x5DFD,0x640D,/* 0xD8-0xDF */ + 0x84C0,0x905C,0x98E1,0x7387,0x5B8B,0x609A,0x677E,0x6DDE,/* 0xE0-0xE7 */ + 0x8A1F,0x8AA6,0x9001,0x980C,0x5237,0xF970,0x7051,0x788E,/* 0xE8-0xEF */ + 0x9396,0x8870,0x91D7,0x4FEE,0x53D7,0x55FD,0x56DA,0x5782,/* 0xF0-0xF7 */ + 0x58FD,0x5AC2,0x5B88,0x5CAB,0x5CC0,0x5E25,0x6101,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x620D,0x624B,0x6388,0x641C,0x6536,0x6578,0x6A39,/* 0xA0-0xA7 */ + 0x6B8A,0x6C34,0x6D19,0x6F31,0x71E7,0x72E9,0x7378,0x7407,/* 0xA8-0xAF */ + 0x74B2,0x7626,0x7761,0x79C0,0x7A57,0x7AEA,0x7CB9,0x7D8F,/* 0xB0-0xB7 */ + 0x7DAC,0x7E61,0x7F9E,0x8129,0x8331,0x8490,0x84DA,0x85EA,/* 0xB8-0xBF */ + 
0x8896,0x8AB0,0x8B90,0x8F38,0x9042,0x9083,0x916C,0x9296,/* 0xC0-0xC7 */ + 0x92B9,0x968B,0x96A7,0x96A8,0x96D6,0x9700,0x9808,0x9996,/* 0xC8-0xCF */ + 0x9AD3,0x9B1A,0x53D4,0x587E,0x5919,0x5B70,0x5BBF,0x6DD1,/* 0xD0-0xD7 */ + 0x6F5A,0x719F,0x7421,0x74B9,0x8085,0x83FD,0x5DE1,0x5F87,/* 0xD8-0xDF */ + 0x5FAA,0x6042,0x65EC,0x6812,0x696F,0x6A53,0x6B89,0x6D35,/* 0xE0-0xE7 */ + 0x6DF3,0x73E3,0x76FE,0x77AC,0x7B4D,0x7D14,0x8123,0x821C,/* 0xE8-0xEF */ + 0x8340,0x84F4,0x8563,0x8A62,0x8AC4,0x9187,0x931E,0x9806,/* 0xF0-0xF7 */ + 0x99B4,0x620C,0x8853,0x8FF0,0x9265,0x5D07,0x5D27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5D69,0x745F,0x819D,0x8768,0x6FD5,0x62FE,0x7FD2,/* 0xA0-0xA7 */ + 0x8936,0x8972,0x4E1E,0x4E58,0x50E7,0x52DD,0x5347,0x627F,/* 0xA8-0xAF */ + 0x6607,0x7E69,0x8805,0x965E,0x4F8D,0x5319,0x5636,0x59CB,/* 0xB0-0xB7 */ + 0x5AA4,0x5C38,0x5C4E,0x5C4D,0x5E02,0x5F11,0x6043,0x65BD,/* 0xB8-0xBF */ + 0x662F,0x6642,0x67BE,0x67F4,0x731C,0x77E2,0x793A,0x7FC5,/* 0xC0-0xC7 */ + 0x8494,0x84CD,0x8996,0x8A66,0x8A69,0x8AE1,0x8C55,0x8C7A,/* 0xC8-0xCF */ + 0x57F4,0x5BD4,0x5F0F,0x606F,0x62ED,0x690D,0x6B96,0x6E5C,/* 0xD0-0xD7 */ + 0x7184,0x7BD2,0x8755,0x8B58,0x8EFE,0x98DF,0x98FE,0x4F38,/* 0xD8-0xDF */ + 0x4F81,0x4FE1,0x547B,0x5A20,0x5BB8,0x613C,0x65B0,0x6668,/* 0xE0-0xE7 */ + 0x71FC,0x7533,0x795E,0x7D33,0x814E,0x81E3,0x8398,0x85AA,/* 0xE8-0xEF */ + 0x85CE,0x8703,0x8A0A,0x8EAB,0x8F9B,0xF971,0x8FC5,0x5931,/* 0xF0-0xF7 */ + 0x5BA4,0x5BE6,0x6089,0x5BE9,0x5C0B,0x5FC3,0x6C81,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF972,0x6DF1,0x700B,0x751A,0x82AF,0x8AF6,0x4EC0,/* 0xA0-0xA7 */ + 0x5341,0xF973,0x96D9,0x6C0F,0x4E9E,0x4FC4,0x5152,0x555E,/* 0xA8-0xAF */ + 0x5A25,0x5CE8,0x6211,0x7259,0x82BD,0x83AA,0x86FE,0x8859,/* 0xB0-0xB7 */ + 0x8A1D,0x963F,0x96C5,0x9913,0x9D09,0x9D5D,0x580A,0x5CB3,/* 0xB8-0xBF */ + 0x5DBD,0x5E44,0x60E1,0x6115,0x63E1,0x6A02,0x6E25,0x9102,/* 0xC0-0xC7 */ + 0x9354,0x984E,0x9C10,0x9F77,0x5B89,0x5CB8,0x6309,0x664F,/* 0xC8-0xCF */ + 0x6848,0x773C,0x96C1,0x978D,0x9854,0x9B9F,0x65A1,0x8B01,/* 0xD0-0xD7 */ + 0x8ECB,0x95BC,0x5535,0x5CA9,0x5DD6,0x5EB5,0x6697,0x764C,/* 0xD8-0xDF */ + 0x83F4,0x95C7,0x58D3,0x62BC,0x72CE,0x9D28,0x4EF0,0x592E,/* 0xE0-0xE7 */ + 0x600F,0x663B,0x6B83,0x79E7,0x9D26,0x5393,0x54C0,0x57C3,/* 0xE8-0xEF */ + 0x5D16,0x611B,0x66D6,0x6DAF,0x788D,0x827E,0x9698,0x9744,/* 0xF0-0xF7 */ + 0x5384,0x627C,0x6396,0x6DB2,0x7E0A,0x814B,0x984D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6AFB,0x7F4C,0x9DAF,0x9E1A,0x4E5F,0x503B,0x51B6,/* 0xA0-0xA7 */ + 0x591C,0x60F9,0x63F6,0x6930,0x723A,0x8036,0xF974,0x91CE,/* 0xA8-0xAF */ + 
0x5F31,0xF975,0xF976,0x7D04,0x82E5,0x846F,0x84BB,0x85E5,/* 0xB0-0xB7 */ + 0x8E8D,0xF977,0x4F6F,0xF978,0xF979,0x58E4,0x5B43,0x6059,/* 0xB8-0xBF */ + 0x63DA,0x6518,0x656D,0x6698,0xF97A,0x694A,0x6A23,0x6D0B,/* 0xC0-0xC7 */ + 0x7001,0x716C,0x75D2,0x760D,0x79B3,0x7A70,0xF97B,0x7F8A,/* 0xC8-0xCF */ + 0xF97C,0x8944,0xF97D,0x8B93,0x91C0,0x967D,0xF97E,0x990A,/* 0xD0-0xD7 */ + 0x5704,0x5FA1,0x65BC,0x6F01,0x7600,0x79A6,0x8A9E,0x99AD,/* 0xD8-0xDF */ + 0x9B5A,0x9F6C,0x5104,0x61B6,0x6291,0x6A8D,0x81C6,0x5043,/* 0xE0-0xE7 */ + 0x5830,0x5F66,0x7109,0x8A00,0x8AFA,0x5B7C,0x8616,0x4FFA,/* 0xE8-0xEF */ + 0x513C,0x56B4,0x5944,0x63A9,0x6DF9,0x5DAA,0x696D,0x5186,/* 0xF0-0xF7 */ + 0x4E88,0x4F59,0xF97F,0xF980,0xF981,0x5982,0xF982,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF983,0x6B5F,0x6C5D,0xF984,0x74B5,0x7916,0xF985,/* 0xA0-0xA7 */ + 0x8207,0x8245,0x8339,0x8F3F,0x8F5D,0xF986,0x9918,0xF987,/* 0xA8-0xAF */ + 0xF988,0xF989,0x4EA6,0xF98A,0x57DF,0x5F79,0x6613,0xF98B,/* 0xB0-0xB7 */ + 0xF98C,0x75AB,0x7E79,0x8B6F,0xF98D,0x9006,0x9A5B,0x56A5,/* 0xB8-0xBF */ + 0x5827,0x59F8,0x5A1F,0x5BB4,0xF98E,0x5EF6,0xF98F,0xF990,/* 0xC0-0xC7 */ + 0x6350,0x633B,0xF991,0x693D,0x6C87,0x6CBF,0x6D8E,0x6D93,/* 0xC8-0xCF */ + 0x6DF5,0x6F14,0xF992,0x70DF,0x7136,0x7159,0xF993,0x71C3,/* 0xD0-0xD7 */ + 0x71D5,0xF994,0x784F,0x786F,0xF995,0x7B75,0x7DE3,0xF996,/* 0xD8-0xDF */ + 0x7E2F,0xF997,0x884D,0x8EDF,0xF998,0xF999,0xF99A,0x925B,/* 0xE0-0xE7 */ + 0xF99B,0x9CF6,0xF99C,0xF99D,0xF99E,0x6085,0x6D85,0xF99F,/* 0xE8-0xEF */ + 0x71B1,0xF9A0,0xF9A1,0x95B1,0x53AD,0xF9A2,0xF9A3,0xF9A4,/* 0xF0-0xF7 */ + 0x67D3,0xF9A5,0x708E,0x7130,0x7430,0x8276,0x82D2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF9A6,0x95BB,0x9AE5,0x9E7D,0x66C4,0xF9A7,0x71C1,/* 0xA0-0xA7 */ + 0x8449,0xF9A8,0xF9A9,0x584B,0xF9AA,0xF9AB,0x5DB8,0x5F71,/* 0xA8-0xAF */ + 0xF9AC,0x6620,0x668E,0x6979,0x69AE,0x6C38,0x6CF3,0x6E36,/* 0xB0-0xB7 */ + 0x6F41,0x6FDA,0x701B,0x702F,0x7150,0x71DF,0x7370,0xF9AD,/* 0xB8-0xBF */ + 0x745B,0xF9AE,0x74D4,0x76C8,0x7A4E,0x7E93,0xF9AF,0xF9B0,/* 0xC0-0xC7 */ + 0x82F1,0x8A60,0x8FCE,0xF9B1,0x9348,0xF9B2,0x9719,0xF9B3,/* 0xC8-0xCF */ + 0xF9B4,0x4E42,0x502A,0xF9B5,0x5208,0x53E1,0x66F3,0x6C6D,/* 0xD0-0xD7 */ + 0x6FCA,0x730A,0x777F,0x7A62,0x82AE,0x85DD,0x8602,0xF9B6,/* 0xD8-0xDF */ + 0x88D4,0x8A63,0x8B7D,0x8C6B,0xF9B7,0x92B3,0xF9B8,0x9713,/* 0xE0-0xE7 */ + 0x9810,0x4E94,0x4F0D,0x4FC9,0x50B2,0x5348,0x543E,0x5433,/* 0xE8-0xEF */ + 0x55DA,0x5862,0x58BA,0x5967,0x5A1B,0x5BE4,0x609F,0xF9B9,/* 0xF0-0xF7 */ + 0x61CA,0x6556,0x65FF,0x6664,0x68A7,0x6C5A,0x6FB3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 
0x0000,0x70CF,0x71AC,0x7352,0x7B7D,0x8708,0x8AA4,0x9C32,/* 0xA0-0xA7 */ + 0x9F07,0x5C4B,0x6C83,0x7344,0x7389,0x923A,0x6EAB,0x7465,/* 0xA8-0xAF */ + 0x761F,0x7A69,0x7E15,0x860A,0x5140,0x58C5,0x64C1,0x74EE,/* 0xB0-0xB7 */ + 0x7515,0x7670,0x7FC1,0x9095,0x96CD,0x9954,0x6E26,0x74E6,/* 0xB8-0xBF */ + 0x7AA9,0x7AAA,0x81E5,0x86D9,0x8778,0x8A1B,0x5A49,0x5B8C,/* 0xC0-0xC7 */ + 0x5B9B,0x68A1,0x6900,0x6D63,0x73A9,0x7413,0x742C,0x7897,/* 0xC8-0xCF */ + 0x7DE9,0x7FEB,0x8118,0x8155,0x839E,0x8C4C,0x962E,0x9811,/* 0xD0-0xD7 */ + 0x66F0,0x5F80,0x65FA,0x6789,0x6C6A,0x738B,0x502D,0x5A03,/* 0xD8-0xDF */ + 0x6B6A,0x77EE,0x5916,0x5D6C,0x5DCD,0x7325,0x754F,0xF9BA,/* 0xE0-0xE7 */ + 0xF9BB,0x50E5,0x51F9,0x582F,0x592D,0x5996,0x59DA,0x5BE5,/* 0xE8-0xEF */ + 0xF9BC,0xF9BD,0x5DA2,0x62D7,0x6416,0x6493,0x64FE,0xF9BE,/* 0xF0-0xF7 */ + 0x66DC,0xF9BF,0x6A48,0xF9C0,0x71FF,0x7464,0xF9C1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7A88,0x7AAF,0x7E47,0x7E5E,0x8000,0x8170,0xF9C2,/* 0xA0-0xA7 */ + 0x87EF,0x8981,0x8B20,0x9059,0xF9C3,0x9080,0x9952,0x617E,/* 0xA8-0xAF */ + 0x6B32,0x6D74,0x7E1F,0x8925,0x8FB1,0x4FD1,0x50AD,0x5197,/* 0xB0-0xB7 */ + 0x52C7,0x57C7,0x5889,0x5BB9,0x5EB8,0x6142,0x6995,0x6D8C,/* 0xB8-0xBF */ + 0x6E67,0x6EB6,0x7194,0x7462,0x7528,0x752C,0x8073,0x8338,/* 0xC0-0xC7 */ + 0x84C9,0x8E0A,0x9394,0x93DE,0xF9C4,0x4E8E,0x4F51,0x5076,/* 0xC8-0xCF */ + 0x512A,0x53C8,0x53CB,0x53F3,0x5B87,0x5BD3,0x5C24,0x611A,/* 0xD0-0xD7 */ + 0x6182,0x65F4,0x725B,0x7397,0x7440,0x76C2,0x7950,0x7991,/* 0xD8-0xDF */ + 0x79B9,0x7D06,0x7FBD,0x828B,0x85D5,0x865E,0x8FC2,0x9047,/* 0xE0-0xE7 */ + 0x90F5,0x91EA,0x9685,0x96E8,0x96E9,0x52D6,0x5F67,0x65ED,/* 0xE8-0xEF */ + 0x6631,0x682F,0x715C,0x7A36,0x90C1,0x980A,0x4E91,0xF9C5,/* 0xF0-0xF7 */ + 0x6A52,0x6B9E,0x6F90,0x7189,0x8018,0x82B8,0x8553,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x904B,0x9695,0x96F2,0x97FB,0x851A,0x9B31,0x4E90,/* 0xA0-0xA7 */ + 0x718A,0x96C4,0x5143,0x539F,0x54E1,0x5713,0x5712,0x57A3,/* 0xA8-0xAF */ + 0x5A9B,0x5AC4,0x5BC3,0x6028,0x613F,0x63F4,0x6C85,0x6D39,/* 0xB0-0xB7 */ + 0x6E72,0x6E90,0x7230,0x733F,0x7457,0x82D1,0x8881,0x8F45,/* 0xB8-0xBF */ + 0x9060,0xF9C6,0x9662,0x9858,0x9D1B,0x6708,0x8D8A,0x925E,/* 0xC0-0xC7 */ + 0x4F4D,0x5049,0x50DE,0x5371,0x570D,0x59D4,0x5A01,0x5C09,/* 0xC8-0xCF */ + 0x6170,0x6690,0x6E2D,0x7232,0x744B,0x7DEF,0x80C3,0x840E,/* 0xD0-0xD7 */ + 0x8466,0x853F,0x875F,0x885B,0x8918,0x8B02,0x9055,0x97CB,/* 0xD8-0xDF */ + 0x9B4F,0x4E73,0x4F91,0x5112,0x516A,0xF9C7,0x552F,0x55A9,/* 0xE0-0xE7 */ + 0x5B7A,0x5BA5,0x5E7C,0x5E7D,0x5EBE,0x60A0,0x60DF,0x6108,/* 0xE8-0xEF */ + 0x6109,0x63C4,0x6538,0x6709,0xF9C8,0x67D4,0x67DA,0xF9C9,/* 0xF0-0xF7 */ + 0x6961,0x6962,0x6CB9,0x6D27,0xF9CA,0x6E38,0xF9CB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6FE1,0x7336,0x7337,0xF9CC,0x745C,0x7531,0xF9CD,/* 0xA0-0xA7 */ + 0x7652,0xF9CE,0xF9CF,0x7DAD,0x81FE,0x8438,0x88D5,0x8A98,/* 0xA8-0xAF */ + 0x8ADB,0x8AED,0x8E30,0x8E42,0x904A,0x903E,0x907A,0x9149,/* 0xB0-0xB7 */ + 0x91C9,0x936E,0xF9D0,0xF9D1,0x5809,0xF9D2,0x6BD3,0x8089,/* 0xB8-0xBF */ + 0x80B2,0xF9D3,0xF9D4,0x5141,0x596B,0x5C39,0xF9D5,0xF9D6,/* 0xC0-0xC7 */ + 0x6F64,0x73A7,0x80E4,0x8D07,0xF9D7,0x9217,0x958F,0xF9D8,/* 0xC8-0xCF */ + 0xF9D9,0xF9DA,0xF9DB,0x807F,0x620E,0x701C,0x7D68,0x878D,/* 0xD0-0xD7 */ + 0xF9DC,0x57A0,0x6069,0x6147,0x6BB7,0x8ABE,0x9280,0x96B1,/* 0xD8-0xDF */ + 0x4E59,0x541F,0x6DEB,0x852D,0x9670,0x97F3,0x98EE,0x63D6,/* 0xE0-0xE7 */ + 0x6CE3,0x9091,0x51DD,0x61C9,0x81BA,0x9DF9,0x4F9D,0x501A,/* 0xE8-0xEF */ + 0x5100,0x5B9C,0x610F,0x61FF,0x64EC,0x6905,0x6BC5,0x7591,/* 0xF0-0xF7 */ + 0x77E3,0x7FA9,0x8264,0x858F,0x87FB,0x8863,0x8ABC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8B70,0x91AB,0x4E8C,0x4EE5,0x4F0A,0xF9DD,0xF9DE,/* 0xA0-0xA7 */ + 0x5937,0x59E8,0xF9DF,0x5DF2,0x5F1B,0x5F5B,0x6021,0xF9E0,/* 0xA8-0xAF */ + 0xF9E1,0xF9E2,0xF9E3,0x723E,0x73E5,0xF9E4,0x7570,0x75CD,/* 0xB0-0xB7 */ + 0xF9E5,0x79FB,0xF9E6,0x800C,0x8033,0x8084,0x82E1,0x8351,/* 0xB8-0xBF */ + 0xF9E7,0xF9E8,0x8CBD,0x8CB3,0x9087,0xF9E9,0xF9EA,0x98F4,/* 0xC0-0xC7 */ + 0x990C,0xF9EB,0xF9EC,0x7037,0x76CA,0x7FCA,0x7FCC,0x7FFC,/* 0xC8-0xCF */ + 0x8B1A,0x4EBA,0x4EC1,0x5203,0x5370,0xF9ED,0x54BD,0x56E0,/* 0xD0-0xD7 */ + 0x59FB,0x5BC5,0x5F15,0x5FCD,0x6E6E,0xF9EE,0xF9EF,0x7D6A,/* 0xD8-0xDF */ + 0x8335,0xF9F0,0x8693,0x8A8D,0xF9F1,0x976D,0x9777,0xF9F2,/* 0xE0-0xE7 */ + 0xF9F3,0x4E00,0x4F5A,0x4F7E,0x58F9,0x65E5,0x6EA2,0x9038,/* 0xE8-0xEF */ + 0x93B0,0x99B9,0x4EFB,0x58EC,0x598A,0x59D9,0x6041,0xF9F4,/* 0xF0-0xF7 */ + 0xF9F5,0x7A14,0xF9F6,0x834F,0x8CC3,0x5165,0x5344,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF9F7,0xF9F8,0xF9F9,0x4ECD,0x5269,0x5B55,0x82BF,/* 0xA0-0xA7 */ + 0x4ED4,0x523A,0x54A8,0x59C9,0x59FF,0x5B50,0x5B57,0x5B5C,/* 0xA8-0xAF */ + 0x6063,0x6148,0x6ECB,0x7099,0x716E,0x7386,0x74F7,0x75B5,/* 0xB0-0xB7 */ + 0x78C1,0x7D2B,0x8005,0x81EA,0x8328,0x8517,0x85C9,0x8AEE,/* 0xB8-0xBF */ + 0x8CC7,0x96CC,0x4F5C,0x52FA,0x56BC,0x65AB,0x6628,0x707C,/* 0xC0-0xC7 */ + 0x70B8,0x7235,0x7DBD,0x828D,0x914C,0x96C0,0x9D72,0x5B71,/* 0xC8-0xCF */ + 0x68E7,0x6B98,0x6F7A,0x76DE,0x5C91,0x66AB,0x6F5B,0x7BB4,/* 0xD0-0xD7 */ + 0x7C2A,0x8836,0x96DC,0x4E08,0x4ED7,0x5320,0x5834,0x58BB,/* 0xD8-0xDF */ + 0x58EF,0x596C,0x5C07,0x5E33,0x5E84,0x5F35,0x638C,0x66B2,/* 0xE0-0xE7 */ + 0x6756,0x6A1F,0x6AA3,0x6B0C,0x6F3F,0x7246,0xF9FA,0x7350,/* 0xE8-0xEF */ + 0x748B,0x7AE0,0x7CA7,0x8178,0x81DF,0x81E7,0x838A,0x846C,/* 0xF0-0xF7 */ + 0x8523,0x8594,0x85CF,0x88DD,0x8D13,0x91AC,0x9577,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x969C,0x518D,0x54C9,0x5728,0x5BB0,0x624D,0x6750,/* 0xA0-0xA7 */ + 0x683D,0x6893,0x6E3D,0x6ED3,0x707D,0x7E21,0x88C1,0x8CA1,/* 0xA8-0xAF */ + 0x8F09,0x9F4B,0x9F4E,0x722D,0x7B8F,0x8ACD,0x931A,0x4F47,/* 0xB0-0xB7 */ + 0x4F4E,0x5132,0x5480,0x59D0,0x5E95,0x62B5,0x6775,0x696E,/* 0xB8-0xBF */ + 0x6A17,0x6CAE,0x6E1A,0x72D9,0x732A,0x75BD,0x7BB8,0x7D35,/* 0xC0-0xC7 */ + 0x82E7,0x83F9,0x8457,0x85F7,0x8A5B,0x8CAF,0x8E87,0x9019,/* 0xC8-0xCF */ + 0x90B8,0x96CE,0x9F5F,0x52E3,0x540A,0x5AE1,0x5BC2,0x6458,/* 0xD0-0xD7 */ + 0x6575,0x6EF4,0x72C4,0xF9FB,0x7684,0x7A4D,0x7B1B,0x7C4D,/* 0xD8-0xDF */ + 0x7E3E,0x7FDF,0x837B,0x8B2B,0x8CCA,0x8D64,0x8DE1,0x8E5F,/* 0xE0-0xE7 */ + 0x8FEA,0x8FF9,0x9069,0x93D1,0x4F43,0x4F7A,0x50B3,0x5168,/* 0xE8-0xEF */ + 0x5178,0x524D,0x526A,0x5861,0x587C,0x5960,0x5C08,0x5C55,/* 0xF0-0xF7 */ + 0x5EDB,0x609B,0x6230,0x6813,0x6BBF,0x6C08,0x6FB1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x714E,0x7420,0x7530,0x7538,0x7551,0x7672,0x7B4C,/* 0xA0-0xA7 */ + 0x7B8B,0x7BAD,0x7BC6,0x7E8F,0x8A6E,0x8F3E,0x8F49,0x923F,/* 0xA8-0xAF */ + 0x9293,0x9322,0x942B,0x96FB,0x985A,0x986B,0x991E,0x5207,/* 0xB0-0xB7 */ + 0x622A,0x6298,0x6D59,0x7664,0x7ACA,0x7BC0,0x7D76,0x5360,/* 0xB8-0xBF */ + 0x5CBE,0x5E97,0x6F38,0x70B9,0x7C98,0x9711,0x9B8E,0x9EDE,/* 0xC0-0xC7 */ + 0x63A5,0x647A,0x8776,0x4E01,0x4E95,0x4EAD,0x505C,0x5075,/* 0xC8-0xCF */ + 0x5448,0x59C3,0x5B9A,0x5E40,0x5EAD,0x5EF7,0x5F81,0x60C5,/* 0xD0-0xD7 */ + 0x633A,0x653F,0x6574,0x65CC,0x6676,0x6678,0x67FE,0x6968,/* 0xD8-0xDF */ + 0x6A89,0x6B63,0x6C40,0x6DC0,0x6DE8,0x6E1F,0x6E5E,0x701E,/* 0xE0-0xE7 */ + 0x70A1,0x738E,0x73FD,0x753A,0x775B,0x7887,0x798E,0x7A0B,/* 0xE8-0xEF */ + 0x7A7D,0x7CBE,0x7D8E,0x8247,0x8A02,0x8AEA,0x8C9E,0x912D,/* 0xF0-0xF7 */ + 
0x914A,0x91D8,0x9266,0x92CC,0x9320,0x9706,0x9756,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x975C,0x9802,0x9F0E,0x5236,0x5291,0x557C,0x5824,/* 0xA0-0xA7 */ + 0x5E1D,0x5F1F,0x608C,0x63D0,0x68AF,0x6FDF,0x796D,0x7B2C,/* 0xA8-0xAF */ + 0x81CD,0x85BA,0x88FD,0x8AF8,0x8E44,0x918D,0x9664,0x969B,/* 0xB0-0xB7 */ + 0x973D,0x984C,0x9F4A,0x4FCE,0x5146,0x51CB,0x52A9,0x5632,/* 0xB8-0xBF */ + 0x5F14,0x5F6B,0x63AA,0x64CD,0x65E9,0x6641,0x66FA,0x66F9,/* 0xC0-0xC7 */ + 0x671D,0x689D,0x68D7,0x69FD,0x6F15,0x6F6E,0x7167,0x71E5,/* 0xC8-0xCF */ + 0x722A,0x74AA,0x773A,0x7956,0x795A,0x79DF,0x7A20,0x7A95,/* 0xD0-0xD7 */ + 0x7C97,0x7CDF,0x7D44,0x7E70,0x8087,0x85FB,0x86A4,0x8A54,/* 0xD8-0xDF */ + 0x8ABF,0x8D99,0x8E81,0x9020,0x906D,0x91E3,0x963B,0x96D5,/* 0xE0-0xE7 */ + 0x9CE5,0x65CF,0x7C07,0x8DB3,0x93C3,0x5B58,0x5C0A,0x5352,/* 0xE8-0xEF */ + 0x62D9,0x731D,0x5027,0x5B97,0x5F9E,0x60B0,0x616B,0x68D5,/* 0xF0-0xF7 */ + 0x6DD9,0x742E,0x7A2E,0x7D42,0x7D9C,0x7E31,0x816B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8E2A,0x8E35,0x937E,0x9418,0x4F50,0x5750,0x5DE6,/* 0xA0-0xA7 */ + 0x5EA7,0x632B,0x7F6A,0x4E3B,0x4F4F,0x4F8F,0x505A,0x59DD,/* 0xA8-0xAF */ + 0x80C4,0x546A,0x5468,0x55FE,0x594F,0x5B99,0x5DDE,0x5EDA,/* 0xB0-0xB7 */ + 0x665D,0x6731,0x67F1,0x682A,0x6CE8,0x6D32,0x6E4A,0x6F8D,/* 0xB8-0xBF */ + 0x70B7,0x73E0,0x7587,0x7C4C,0x7D02,0x7D2C,0x7DA2,0x821F,/* 0xC0-0xC7 */ + 0x86DB,0x8A3B,0x8A85,0x8D70,0x8E8A,0x8F33,0x9031,0x914E,/* 0xC8-0xCF */ + 0x9152,0x9444,0x99D0,0x7AF9,0x7CA5,0x4FCA,0x5101,0x51C6,/* 0xD0-0xD7 */ + 0x57C8,0x5BEF,0x5CFB,0x6659,0x6A3D,0x6D5A,0x6E96,0x6FEC,/* 0xD8-0xDF */ + 0x710C,0x756F,0x7AE3,0x8822,0x9021,0x9075,0x96CB,0x99FF,/* 0xE0-0xE7 */ + 0x8301,0x4E2D,0x4EF2,0x8846,0x91CD,0x537D,0x6ADB,0x696B,/* 0xE8-0xEF */ + 0x6C41,0x847A,0x589E,0x618E,0x66FE,0x62EF,0x70DD,0x7511,/* 0xF0-0xF7 */ + 0x75C7,0x7E52,0x84B8,0x8B49,0x8D08,0x4E4B,0x53EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54AB,0x5730,0x5740,0x5FD7,0x6301,0x6307,0x646F,/* 0xA0-0xA7 */ + 0x652F,0x65E8,0x667A,0x679D,0x67B3,0x6B62,0x6C60,0x6C9A,/* 0xA8-0xAF */ + 0x6F2C,0x77E5,0x7825,0x7949,0x7957,0x7D19,0x80A2,0x8102,/* 0xB0-0xB7 */ + 0x81F3,0x829D,0x82B7,0x8718,0x8A8C,0xF9FC,0x8D04,0x8DBE,/* 0xB8-0xBF */ + 0x9072,0x76F4,0x7A19,0x7A37,0x7E54,0x8077,0x5507,0x55D4,/* 0xC0-0xC7 */ + 0x5875,0x632F,0x6422,0x6649,0x664B,0x686D,0x699B,0x6B84,/* 0xC8-0xCF */ + 0x6D25,0x6EB1,0x73CD,0x7468,0x74A1,0x755B,0x75B9,0x76E1,/* 0xD0-0xD7 */ + 0x771E,0x778B,0x79E6,0x7E09,0x7E1D,0x81FB,0x852F,0x8897,/* 0xD8-0xDF */ + 0x8A3A,0x8CD1,0x8EEB,0x8FB0,0x9032,0x93AD,0x9663,0x9673,/* 0xE0-0xE7 */ + 
0x9707,0x4F84,0x53F1,0x59EA,0x5AC9,0x5E19,0x684E,0x74C6,/* 0xE8-0xEF */ + 0x75BE,0x79E9,0x7A92,0x81A3,0x86ED,0x8CEA,0x8DCC,0x8FED,/* 0xF0-0xF7 */ + 0x659F,0x6715,0xF9FD,0x57F7,0x6F57,0x7DDD,0x8F2F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x93F6,0x96C6,0x5FB5,0x61F2,0x6F84,0x4E14,0x4F98,/* 0xA0-0xA7 */ + 0x501F,0x53C9,0x55DF,0x5D6F,0x5DEE,0x6B21,0x6B64,0x78CB,/* 0xA8-0xAF */ + 0x7B9A,0xF9FE,0x8E49,0x8ECA,0x906E,0x6349,0x643E,0x7740,/* 0xB0-0xB7 */ + 0x7A84,0x932F,0x947F,0x9F6A,0x64B0,0x6FAF,0x71E6,0x74A8,/* 0xB8-0xBF */ + 0x74DA,0x7AC4,0x7C12,0x7E82,0x7CB2,0x7E98,0x8B9A,0x8D0A,/* 0xC0-0xC7 */ + 0x947D,0x9910,0x994C,0x5239,0x5BDF,0x64E6,0x672D,0x7D2E,/* 0xC8-0xCF */ + 0x50ED,0x53C3,0x5879,0x6158,0x6159,0x61FA,0x65AC,0x7AD9,/* 0xD0-0xD7 */ + 0x8B92,0x8B96,0x5009,0x5021,0x5275,0x5531,0x5A3C,0x5EE0,/* 0xD8-0xDF */ + 0x5F70,0x6134,0x655E,0x660C,0x6636,0x66A2,0x69CD,0x6EC4,/* 0xE0-0xE7 */ + 0x6F32,0x7316,0x7621,0x7A93,0x8139,0x8259,0x83D6,0x84BC,/* 0xE8-0xEF */ + 0x50B5,0x57F0,0x5BC0,0x5BE8,0x5F69,0x63A1,0x7826,0x7DB5,/* 0xF0-0xF7 */ + 0x83DC,0x8521,0x91C7,0x91F5,0x518A,0x67F5,0x7B56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8CAC,0x51C4,0x59BB,0x60BD,0x8655,0x501C,0xF9FF,/* 0xA0-0xA7 */ + 0x5254,0x5C3A,0x617D,0x621A,0x62D3,0x64F2,0x65A5,0x6ECC,/* 0xA8-0xAF */ + 0x7620,0x810A,0x8E60,0x965F,0x96BB,0x4EDF,0x5343,0x5598,/* 0xB0-0xB7 */ + 0x5929,0x5DDD,0x64C5,0x6CC9,0x6DFA,0x7394,0x7A7F,0x821B,/* 0xB8-0xBF */ + 0x85A6,0x8CE4,0x8E10,0x9077,0x91E7,0x95E1,0x9621,0x97C6,/* 0xC0-0xC7 */ + 0x51F8,0x54F2,0x5586,0x5FB9,0x64A4,0x6F88,0x7DB4,0x8F1F,/* 0xC8-0xCF */ + 0x8F4D,0x9435,0x50C9,0x5C16,0x6CBE,0x6DFB,0x751B,0x77BB,/* 0xD0-0xD7 */ + 0x7C3D,0x7C64,0x8A79,0x8AC2,0x581E,0x59BE,0x5E16,0x6377,/* 0xD8-0xDF */ + 0x7252,0x758A,0x776B,0x8ADC,0x8CBC,0x8F12,0x5EF3,0x6674,/* 0xE0-0xE7 */ + 0x6DF8,0x807D,0x83C1,0x8ACB,0x9751,0x9BD6,0xFA00,0x5243,/* 0xE8-0xEF */ + 0x66FF,0x6D95,0x6EEF,0x7DE0,0x8AE6,0x902E,0x905E,0x9AD4,/* 0xF0-0xF7 */ + 0x521D,0x527F,0x54E8,0x6194,0x6284,0x62DB,0x68A2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6912,0x695A,0x6A35,0x7092,0x7126,0x785D,0x7901,/* 0xA0-0xA7 */ + 0x790E,0x79D2,0x7A0D,0x8096,0x8278,0x82D5,0x8349,0x8549,/* 0xA8-0xAF */ + 0x8C82,0x8D85,0x9162,0x918B,0x91AE,0x4FC3,0x56D1,0x71ED,/* 0xB0-0xB7 */ + 0x77D7,0x8700,0x89F8,0x5BF8,0x5FD6,0x6751,0x90A8,0x53E2,/* 0xB8-0xBF */ + 0x585A,0x5BF5,0x60A4,0x6181,0x6460,0x7E3D,0x8070,0x8525,/* 0xC0-0xC7 */ + 0x9283,0x64AE,0x50AC,0x5D14,0x6700,0x589C,0x62BD,0x63A8,/* 0xC8-0xCF */ + 0x690E,0x6978,0x6A1E,0x6E6B,0x76BA,0x79CB,0x82BB,0x8429,/* 0xD0-0xD7 */ + 
0x8ACF,0x8DA8,0x8FFD,0x9112,0x914B,0x919C,0x9310,0x9318,/* 0xD8-0xDF */ + 0x939A,0x96DB,0x9A36,0x9C0D,0x4E11,0x755C,0x795D,0x7AFA,/* 0xE0-0xE7 */ + 0x7B51,0x7BC9,0x7E2E,0x84C4,0x8E59,0x8E74,0x8EF8,0x9010,/* 0xE8-0xEF */ + 0x6625,0x693F,0x7443,0x51FA,0x672E,0x9EDC,0x5145,0x5FE0,/* 0xF0-0xF7 */ + 0x6C96,0x87F2,0x885D,0x8877,0x60B4,0x81B5,0x8403,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8D05,0x53D6,0x5439,0x5634,0x5A36,0x5C31,0x708A,/* 0xA0-0xA7 */ + 0x7FE0,0x805A,0x8106,0x81ED,0x8DA3,0x9189,0x9A5F,0x9DF2,/* 0xA8-0xAF */ + 0x5074,0x4EC4,0x53A0,0x60FB,0x6E2C,0x5C64,0x4F88,0x5024,/* 0xB0-0xB7 */ + 0x55E4,0x5CD9,0x5E5F,0x6065,0x6894,0x6CBB,0x6DC4,0x71BE,/* 0xB8-0xBF */ + 0x75D4,0x75F4,0x7661,0x7A1A,0x7A49,0x7DC7,0x7DFB,0x7F6E,/* 0xC0-0xC7 */ + 0x81F4,0x86A9,0x8F1C,0x96C9,0x99B3,0x9F52,0x5247,0x52C5,/* 0xC8-0xCF */ + 0x98ED,0x89AA,0x4E03,0x67D2,0x6F06,0x4FB5,0x5BE2,0x6795,/* 0xD0-0xD7 */ + 0x6C88,0x6D78,0x741B,0x7827,0x91DD,0x937C,0x87C4,0x79E4,/* 0xD8-0xDF */ + 0x7A31,0x5FEB,0x4ED6,0x54A4,0x553E,0x58AE,0x59A5,0x60F0,/* 0xE0-0xE7 */ + 0x6253,0x62D6,0x6736,0x6955,0x8235,0x9640,0x99B1,0x99DD,/* 0xE8-0xEF */ + 0x502C,0x5353,0x5544,0x577C,0xFA01,0x6258,0xFA02,0x64E2,/* 0xF0-0xF7 */ + 0x666B,0x67DD,0x6FC1,0x6FEF,0x7422,0x7438,0x8A17,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9438,0x5451,0x5606,0x5766,0x5F48,0x619A,0x6B4E,/* 0xA0-0xA7 */ + 0x7058,0x70AD,0x7DBB,0x8A95,0x596A,0x812B,0x63A2,0x7708,/* 0xA8-0xAF */ + 0x803D,0x8CAA,0x5854,0x642D,0x69BB,0x5B95,0x5E11,0x6E6F,/* 0xB0-0xB7 */ + 0xFA03,0x8569,0x514C,0x53F0,0x592A,0x6020,0x614B,0x6B86,/* 0xB8-0xBF */ + 0x6C70,0x6CF0,0x7B1E,0x80CE,0x82D4,0x8DC6,0x90B0,0x98B1,/* 0xC0-0xC7 */ + 0xFA04,0x64C7,0x6FA4,0x6491,0x6504,0x514E,0x5410,0x571F,/* 0xC8-0xCF */ + 0x8A0E,0x615F,0x6876,0xFA05,0x75DB,0x7B52,0x7D71,0x901A,/* 0xD0-0xD7 */ + 0x5806,0x69CC,0x817F,0x892A,0x9000,0x9839,0x5078,0x5957,/* 0xD8-0xDF */ + 0x59AC,0x6295,0x900F,0x9B2A,0x615D,0x7279,0x95D6,0x5761,/* 0xE0-0xE7 */ + 0x5A46,0x5DF4,0x628A,0x64AD,0x64FA,0x6777,0x6CE2,0x6D3E,/* 0xE8-0xEF */ + 0x722C,0x7436,0x7834,0x7F77,0x82AD,0x8DDB,0x9817,0x5224,/* 0xF0-0xF7 */ + 0x5742,0x677F,0x7248,0x74E3,0x8CA9,0x8FA6,0x9211,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x962A,0x516B,0x53ED,0x634C,0x4F69,0x5504,0x6096,/* 0xA0-0xA7 */ + 0x6557,0x6C9B,0x6D7F,0x724C,0x72FD,0x7A17,0x8987,0x8C9D,/* 0xA8-0xAF */ + 0x5F6D,0x6F8E,0x70F9,0x81A8,0x610E,0x4FBF,0x504F,0x6241,/* 0xB0-0xB7 */ + 0x7247,0x7BC7,0x7DE8,0x7FE9,0x904D,0x97AD,0x9A19,0x8CB6,/* 0xB8-0xBF */ + 0x576A,0x5E73,0x67B0,0x840D,0x8A55,0x5420,0x5B16,0x5E63,/* 0xC0-0xC7 */ + 
0x5EE2,0x5F0A,0x6583,0x80BA,0x853D,0x9589,0x965B,0x4F48,/* 0xC8-0xCF */ + 0x5305,0x530D,0x530F,0x5486,0x54FA,0x5703,0x5E03,0x6016,/* 0xD0-0xD7 */ + 0x629B,0x62B1,0x6355,0xFA06,0x6CE1,0x6D66,0x75B1,0x7832,/* 0xD8-0xDF */ + 0x80DE,0x812F,0x82DE,0x8461,0x84B2,0x888D,0x8912,0x900B,/* 0xE0-0xE7 */ + 0x92EA,0x98FD,0x9B91,0x5E45,0x66B4,0x66DD,0x7011,0x7206,/* 0xE8-0xEF */ + 0xFA07,0x4FF5,0x527D,0x5F6A,0x6153,0x6753,0x6A19,0x6F02,/* 0xF0-0xF7 */ + 0x74E2,0x7968,0x8868,0x8C79,0x98C7,0x98C4,0x9A43,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54C1,0x7A1F,0x6953,0x8AF7,0x8C4A,0x98A8,0x99AE,/* 0xA0-0xA7 */ + 0x5F7C,0x62AB,0x75B2,0x76AE,0x88AB,0x907F,0x9642,0x5339,/* 0xA8-0xAF */ + 0x5F3C,0x5FC5,0x6CCC,0x73CC,0x7562,0x758B,0x7B46,0x82FE,/* 0xB0-0xB7 */ + 0x999D,0x4E4F,0x903C,0x4E0B,0x4F55,0x53A6,0x590F,0x5EC8,/* 0xB8-0xBF */ + 0x6630,0x6CB3,0x7455,0x8377,0x8766,0x8CC0,0x9050,0x971E,/* 0xC0-0xC7 */ + 0x9C15,0x58D1,0x5B78,0x8650,0x8B14,0x9DB4,0x5BD2,0x6068,/* 0xC8-0xCF */ + 0x608D,0x65F1,0x6C57,0x6F22,0x6FA3,0x701A,0x7F55,0x7FF0,/* 0xD0-0xD7 */ + 0x9591,0x9592,0x9650,0x97D3,0x5272,0x8F44,0x51FD,0x542B,/* 0xD8-0xDF */ + 0x54B8,0x5563,0x558A,0x6ABB,0x6DB5,0x7DD8,0x8266,0x929C,/* 0xE0-0xE7 */ + 0x9677,0x9E79,0x5408,0x54C8,0x76D2,0x86E4,0x95A4,0x95D4,/* 0xE8-0xEF */ + 0x965C,0x4EA2,0x4F09,0x59EE,0x5AE6,0x5DF7,0x6052,0x6297,/* 0xF0-0xF7 */ + 0x676D,0x6841,0x6C86,0x6E2F,0x7F38,0x809B,0x822A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFA08,0xFA09,0x9805,0x4EA5,0x5055,0x54B3,0x5793,/* 0xA0-0xA7 */ + 0x595A,0x5B69,0x5BB3,0x61C8,0x6977,0x6D77,0x7023,0x87F9,/* 0xA8-0xAF */ + 0x89E3,0x8A72,0x8AE7,0x9082,0x99ED,0x9AB8,0x52BE,0x6838,/* 0xB0-0xB7 */ + 0x5016,0x5E78,0x674F,0x8347,0x884C,0x4EAB,0x5411,0x56AE,/* 0xB8-0xBF */ + 0x73E6,0x9115,0x97FF,0x9909,0x9957,0x9999,0x5653,0x589F,/* 0xC0-0xC7 */ + 0x865B,0x8A31,0x61B2,0x6AF6,0x737B,0x8ED2,0x6B47,0x96AA,/* 0xC8-0xCF */ + 0x9A57,0x5955,0x7200,0x8D6B,0x9769,0x4FD4,0x5CF4,0x5F26,/* 0xD0-0xD7 */ + 0x61F8,0x665B,0x6CEB,0x70AB,0x7384,0x73B9,0x73FE,0x7729,/* 0xD8-0xDF */ + 0x774D,0x7D43,0x7D62,0x7E23,0x8237,0x8852,0xFA0A,0x8CE2,/* 0xE0-0xE7 */ + 0x9249,0x986F,0x5B51,0x7A74,0x8840,0x9801,0x5ACC,0x4FE0,/* 0xE8-0xEF */ + 0x5354,0x593E,0x5CFD,0x633E,0x6D79,0x72F9,0x8105,0x8107,/* 0xF0-0xF7 */ + 0x83A2,0x92CF,0x9830,0x4EA8,0x5144,0x5211,0x578B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5F62,0x6CC2,0x6ECE,0x7005,0x7050,0x70AF,0x7192,/* 0xA0-0xA7 */ + 0x73E9,0x7469,0x834A,0x87A2,0x8861,0x9008,0x90A2,0x93A3,/* 0xA8-0xAF */ + 0x99A8,0x516E,0x5F57,0x60E0,0x6167,0x66B3,0x8559,0x8E4A,/* 0xB0-0xB7 */ + 
0x91AF,0x978B,0x4E4E,0x4E92,0x547C,0x58D5,0x58FA,0x597D,/* 0xB8-0xBF */ + 0x5CB5,0x5F27,0x6236,0x6248,0x660A,0x6667,0x6BEB,0x6D69,/* 0xC0-0xC7 */ + 0x6DCF,0x6E56,0x6EF8,0x6F94,0x6FE0,0x6FE9,0x705D,0x72D0,/* 0xC8-0xCF */ + 0x7425,0x745A,0x74E0,0x7693,0x795C,0x7CCA,0x7E1E,0x80E1,/* 0xD0-0xD7 */ + 0x82A6,0x846B,0x84BF,0x864E,0x865F,0x8774,0x8B77,0x8C6A,/* 0xD8-0xDF */ + 0x93AC,0x9800,0x9865,0x60D1,0x6216,0x9177,0x5A5A,0x660F,/* 0xE0-0xE7 */ + 0x6DF7,0x6E3E,0x743F,0x9B42,0x5FFD,0x60DA,0x7B0F,0x54C4,/* 0xE8-0xEF */ + 0x5F18,0x6C5E,0x6CD3,0x6D2A,0x70D8,0x7D05,0x8679,0x8A0C,/* 0xF0-0xF7 */ + 0x9D3B,0x5316,0x548C,0x5B05,0x6A3A,0x706B,0x7575,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x798D,0x79BE,0x82B1,0x83EF,0x8A71,0x8B41,0x8CA8,/* 0xA0-0xA7 */ + 0x9774,0xFA0B,0x64F4,0x652B,0x78BA,0x78BB,0x7A6B,0x4E38,/* 0xA8-0xAF */ + 0x559A,0x5950,0x5BA6,0x5E7B,0x60A3,0x63DB,0x6B61,0x6665,/* 0xB0-0xB7 */ + 0x6853,0x6E19,0x7165,0x74B0,0x7D08,0x9084,0x9A69,0x9C25,/* 0xB8-0xBF */ + 0x6D3B,0x6ED1,0x733E,0x8C41,0x95CA,0x51F0,0x5E4C,0x5FA8,/* 0xC0-0xC7 */ + 0x604D,0x60F6,0x6130,0x614C,0x6643,0x6644,0x69A5,0x6CC1,/* 0xC8-0xCF */ + 0x6E5F,0x6EC9,0x6F62,0x714C,0x749C,0x7687,0x7BC1,0x7C27,/* 0xD0-0xD7 */ + 0x8352,0x8757,0x9051,0x968D,0x9EC3,0x532F,0x56DE,0x5EFB,/* 0xD8-0xDF */ + 0x5F8A,0x6062,0x6094,0x61F7,0x6666,0x6703,0x6A9C,0x6DEE,/* 0xE0-0xE7 */ + 0x6FAE,0x7070,0x736A,0x7E6A,0x81BE,0x8334,0x86D4,0x8AA8,/* 0xE8-0xEF */ + 0x8CC4,0x5283,0x7372,0x5B96,0x6A6B,0x9404,0x54EE,0x5686,/* 0xF0-0xF7 */ + 0x5B5D,0x6548,0x6585,0x66C9,0x689F,0x6D8D,0x6DC6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x723B,0x80B4,0x9175,0x9A4D,0x4FAF,0x5019,0x539A,/* 0xA0-0xA7 */ + 0x540E,0x543C,0x5589,0x55C5,0x5E3F,0x5F8C,0x673D,0x7166,/* 0xA8-0xAF */ + 0x73DD,0x9005,0x52DB,0x52F3,0x5864,0x58CE,0x7104,0x718F,/* 0xB0-0xB7 */ + 0x71FB,0x85B0,0x8A13,0x6688,0x85A8,0x55A7,0x6684,0x714A,/* 0xB8-0xBF */ + 0x8431,0x5349,0x5599,0x6BC1,0x5F59,0x5FBD,0x63EE,0x6689,/* 0xC0-0xC7 */ + 0x7147,0x8AF1,0x8F1D,0x9EBE,0x4F11,0x643A,0x70CB,0x7566,/* 0xC8-0xCF */ + 0x8667,0x6064,0x8B4E,0x9DF8,0x5147,0x51F6,0x5308,0x6D36,/* 0xD0-0xD7 */ + 0x80F8,0x9ED1,0x6615,0x6B23,0x7098,0x75D5,0x5403,0x5C79,/* 0xD8-0xDF */ + 0x7D07,0x8A16,0x6B20,0x6B3D,0x6B46,0x5438,0x6070,0x6D3D,/* 0xE0-0xE7 */ + 0x7FD5,0x8208,0x50D6,0x51DE,0x559C,0x566B,0x56CD,0x59EC,/* 0xE8-0xEF */ + 0x5B09,0x5E0C,0x6199,0x6198,0x6231,0x665E,0x66E6,0x7199,/* 0xF0-0xF7 */ + 0x71B9,0x71BA,0x72A7,0x79A7,0x7A00,0x7FB2,0x8A70,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, + c2u_C8, NULL, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, 
c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, NULL, NULL, +}; + +static const unsigned char u2c_01[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA9, 0xA2, 0xA9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xA9, 0xA4, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA9, 0xA5, 0xA8, 0xA6, 0xA9, 0xA6, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xA9, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA8, /* 0x3C-0x3F */ + 0xA9, 0xA8, 0xA8, 0xA9, 0xA9, 0xA9, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA9, 0xB0, 0xA8, 0xAF, 0xA9, 0xAF, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAB, 0xA9, 0xAB, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAE, 0xA9, 0xAE, /* 0x64-0x67 */ +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA7, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xA2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xA2, 0xA8, 0xA2, 0xAB, 0xA2, 0xAA, 0xA2, 0xAD, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA2, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0x90-0x93 */ + 0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0x94-0x97 */ + 0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0x98-0x9B */ + 0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0x9C-0x9F */ + 0xA5, 0xD0, 0xA5, 0xD1, 0x00, 0x00, 0xA5, 0xD2, /* 0xA0-0xA3 */ + 0xA5, 0xD3, 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, /* 0xA4-0xA7 */ + 0xA5, 0xD7, 0xA5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xB0-0xB3 */ + 0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xB4-0xB7 */ + 0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xB8-0xBB */ + 0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xBC-0xBF */ + 0xA5, 0xF0, 0xA5, 0xF1, 0x00, 0x00, 0xA5, 0xF2, /* 0xC0-0xC3 */ + 0xA5, 0xF3, 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, /* 0xC4-0xC7 */ + 0xA5, 0xF7, 0xA5, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0xAC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xAC, 0xA1, 0xAC, 0xA2, 0xAC, 0xA3, 0xAC, 0xA4, /* 0x10-0x13 */ + 0xAC, 0xA5, 0xAC, 0xA6, 0xAC, 0xA8, 0xAC, 0xA9, /* 0x14-0x17 */ + 0xAC, 0xAA, 0xAC, 0xAB, 0xAC, 0xAC, 0xAC, 0xAD, /* 0x18-0x1B */ + 0xAC, 0xAE, 0xAC, 0xAF, 0xAC, 0xB0, 0xAC, 0xB1, /* 0x1C-0x1F */ + 0xAC, 0xB2, 0xAC, 0xB3, 0xAC, 0xB4, 0xAC, 0xB5, /* 0x20-0x23 */ + 0xAC, 0xB6, 0xAC, 0xB7, 0xAC, 0xB8, 0xAC, 0xB9, /* 0x24-0x27 */ + 0xAC, 0xBA, 0xAC, 0xBB, 0xAC, 0xBC, 0xAC, 0xBD, /* 0x28-0x2B */ + 0xAC, 0xBE, 0xAC, 0xBF, 0xAC, 0xC0, 0xAC, 0xC1, /* 0x2C-0x2F */ + 0xAC, 0xD1, 0xAC, 0xD2, 0xAC, 0xD3, 0xAC, 0xD4, /* 0x30-0x33 */ + 0xAC, 0xD5, 0xAC, 0xD6, 0xAC, 0xD8, 0xAC, 0xD9, /* 0x34-0x37 */ + 0xAC, 0xDA, 0xAC, 0xDB, 0xAC, 0xDC, 0xAC, 0xDD, /* 0x38-0x3B */ + 0xAC, 0xDE, 0xAC, 0xDF, 0xAC, 0xE0, 0xAC, 0xE1, /* 0x3C-0x3F */ + 0xAC, 0xE2, 0xAC, 0xE3, 0xAC, 0xE4, 0xAC, 0xE5, /* 0x40-0x43 */ + 0xAC, 0xE6, 0xAC, 0xE7, 0xAC, 0xE8, 0xAC, 0xE9, /* 0x44-0x47 */ + 0xAC, 0xEA, 0xAC, 0xEB, 0xAC, 0xEC, 0xAC, 0xED, /* 0x48-0x4B */ + 0xAC, 0xEE, 0xAC, 0xEF, 0xAC, 0xF0, 0xAC, 0xF1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xAC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 
0x50-0x53 */ +}; + +static const unsigned char u2c_11[512] = { + 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA4, 0xA4, 0xA7, /* 0x00-0x03 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xB1, 0xA4, 0xB2, /* 0x04-0x07 */ + 0xA4, 0xB3, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x08-0x0B */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x0C-0x0F */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0x10-0x13 */ + 0xA4, 0xD5, 0xA4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, /* 0x18-0x1B */ + 0xA4, 0xDE, 0xA4, 0xE1, 0xA4, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0xE3, 0xA4, 0xB4, 0xA4, 0xE4, 0xA4, 0xE5, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE6, /* 0x24-0x27 */ + 0x00, 0x00, 0xA4, 0xE7, 0x00, 0x00, 0xA4, 0xE8, /* 0x28-0x2B */ + 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, 0xA4, 0xEC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xED, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xEE, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB5, 0xA4, 0xB6, /* 0x3C-0x3F */ + 0xA4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xA4, 0xF2, 0xA4, 0xF3, 0xA4, 0xF0, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB8, 0xA4, 0xB9, /* 0x4C-0x4F */ + 0xA4, 0xB8, 0xA4, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xA4, 0xBA, 0xA4, 0xBA, 0x00, 0x00, 0xA4, 0xF4, /* 0x54-0x57 */ + 0xA4, 0xF5, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x60-0x63 */ + 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, 0xA4, 0xC5, /* 0x64-0x67 */ + 0xA4, 0xC6, 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, /* 0x68-0x6B */ + 0xA4, 0xCA, 0xA4, 0xCB, 0xA4, 0xCC, 0xA4, 0xCD, /* 0x6C-0x6F */ + 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, 0xA4, 0xD1, /* 0x70-0x73 */ + 0xA4, 0xD2, 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xA4, 0xF7, 0xA4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xA4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA4, 0xFA, 0xA4, 0xFB, 0x00, 0x00, /* 0x90-0x93 */ + 0xA4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xFD, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, 0xA4, 0xA4, /* 0xA8-0xAB */ + 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, 0xA4, 0xA9, /* 0xAC-0xAF */ + 0xA4, 0xAA, 0xA4, 0xAB, 0xA4, 0xAC, 0xA4, 0xAD, /* 0xB0-0xB3 */ + 0xA4, 0xAE, 0xA4, 0xAF, 0xA4, 0xB0, 0xA4, 0xB1, /* 0xB4-0xB7 */ + 0xA4, 0xB2, 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xB8-0xBB */ + 0xA4, 0xB7, 0xA4, 0xB8, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xBC-0xBF */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD6, 0xA4, 0xD7, /* 0xC4-0xC7 */ + 0xA4, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xA4, 0xD9, 0x00, 0x00, 0xA4, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDB, /* 
0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xA4, 0xDE, 0xA4, 0xDF, 0x00, 0x00, 0xA4, 0xE0, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE8, 0xA4, 0xEA, /* 0xE4-0xE7 */ + 0xA4, 0xEC, 0x00, 0x00, 0xA4, 0xED, 0xA4, 0xEF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA4, 0xB7, 0xA4, 0xF2, 0xA4, 0xF3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xA4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA2, 0xD3, 0xA2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA2, 0xB6, 0x00, 0x00, 0xA1, 0xC7, 0xA1, 0xC8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0xFA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA9, 0xFB, 0xA9, 0xFC, 0xA9, 0xFD, /* 0x80-0x83 */ + 0xA9, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 
0xA8-0xAB */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA2, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA4, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xE0, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA2, 0xE5, 0xA2, 0xE2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xD9, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xCA, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF7, /* 0x50-0x53 */ + 0xA8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xFB, /* 0x58-0x5B */ + 0xA8, 0xFC, 0xA8, 0xFD, 0xA8, 0xFE, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0x60-0x63 */ + 0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0x64-0x67 */ + 0xA5, 0xB8, 0xA5, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, 0xA5, 0xA4, /* 0x70-0x73 */ + 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, 0xA5, 0xA8, /* 0x74-0x77 */ + 0xA5, 0xA9, 0xA5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xE7, 0xA1, 0xE8, 0xA1, 0xE6, 0xA1, 0xE9, /* 0x90-0x93 */ + 0xA1, 0xEA, 0xA2, 0xD5, 0xA2, 0xD8, 0xA2, 0xD6, /* 0x94-0x97 */ + 0xA2, 0xD9, 0xA2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA1, 0x00, 0x00, /* 
0xD0-0xD3 */ + 0xA2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_22[512] = { + 0xA2, 0xA3, 0x00, 0x00, 0xA1, 0xD3, 0xA2, 0xA4, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD4, /* 0x04-0x07 */ + 0xA1, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF5, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xB3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA2, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEE, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA1, 0xF0, 0xA1, 0xC4, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0xA1, 0xFC, /* 0x24-0x27 */ + 0xA1, 0xFD, 0xA1, 0xFB, 0xA1, 0xFA, 0xA1, 0xF2, /* 0x28-0x2B */ + 0xA1, 0xF3, 0x00, 0x00, 0xA2, 0xB1, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xC5, 0xA1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA1, 0xAD, 0xA1, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD6, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xC1, 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0xC2, 0xA1, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, 0xA1, 0xED, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF6, 0xA1, 0xF7, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA2, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD2, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA8, 0xE7, 0xA8, 0xE8, 0xA8, 0xE9, 0xA8, 0xEA, /* 0x60-0x63 */ + 0xA8, 0xEB, 0xA8, 0xEC, 0xA8, 0xED, 0xA8, 0xEE, /* 0x64-0x67 */ + 0xA8, 0xEF, 0xA8, 0xF0, 0xA8, 0xF1, 0xA8, 0xF2, /* 0x68-0x6B */ + 0xA8, 0xF3, 0xA8, 0xF4, 0xA8, 0xF5, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA9, 0xE7, 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, /* 0x74-0x77 */ + 0xA9, 0xEB, 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, /* 0x78-0x7B */ + 0xA9, 0xEF, 0xA9, 0xF0, 0xA9, 0xF1, 0xA9, 0xF2, /* 0x7C-0x7F */ + + 0xA9, 0xF3, 0xA9, 0xF4, 0xA9, 0xF5, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, 0xA9, 0xD0, /* 0x9C-0x9F */ + 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, 0xA9, 0xD4, /* 0xA0-0xA3 */ + 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, 0xA9, 0xD8, /* 0xA4-0xA7 */ + 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, 0xA9, 0xDC, /* 0xA8-0xAB */ + 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, 0xA9, 0xE0, /* 0xAC-0xAF */ + 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, 0xA9, 0xE4, /* 0xB0-0xB3 */ + 0xA9, 0xE5, 0xA9, 0xE6, 0xA8, 0xCD, 0xA8, 0xCE, /* 0xB4-0xB7 */ + 0xA8, 0xCF, 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, /* 0xB8-0xBB */ + 0xA8, 0xD3, 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, /* 0xBC-0xBF */ + 0xA8, 0xD7, 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, /* 0xC0-0xC3 */ + 0xA8, 0xDB, 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, /* 0xC4-0xC7 */ + 0xA8, 0xDF, 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, /* 0xC8-0xCB */ + 0xA8, 0xE3, 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, /* 0xCC-0xCF */ + 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, 0xA8, 0xD0, /* 0xD0-0xD3 */ + 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, 0xA8, 0xD4, /* 0xD4-0xD7 */ + 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, 0xA8, 0xD8, /* 0xD8-0xDB */ + 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, 0xA8, 0xDC, /* 0xDC-0xDF */ + 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, 0xA8, 0xE0, /* 0xE0-0xE3 */ + 0xA8, 
0xE1, 0xA8, 0xE2, 0xA8, 0xE3, 0xA8, 0xE4, /* 0xE4-0xE7 */ + 0xA8, 0xE5, 0xA8, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_25[512] = { + 0xA6, 0xA1, 0xA6, 0xAC, 0xA6, 0xA2, 0xA6, 0xAD, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xA6, 0xA3, 0xA6, 0xC8, 0xA6, 0xC7, 0xA6, 0xAE, /* 0x0C-0x0F */ + 0xA6, 0xA4, 0xA6, 0xC2, 0xA6, 0xC1, 0xA6, 0xAF, /* 0x10-0x13 */ + 0xA6, 0xA6, 0xA6, 0xC6, 0xA6, 0xC5, 0xA6, 0xB1, /* 0x14-0x17 */ + 0xA6, 0xA5, 0xA6, 0xC4, 0xA6, 0xC3, 0xA6, 0xB0, /* 0x18-0x1B */ + 0xA6, 0xA7, 0xA6, 0xBC, 0xA6, 0xC9, 0xA6, 0xCA, /* 0x1C-0x1F */ + 0xA6, 0xB7, 0xA6, 0xCB, 0xA6, 0xCC, 0xA6, 0xB2, /* 0x20-0x23 */ + 0xA6, 0xA9, 0xA6, 0xBE, 0xA6, 0xCD, 0xA6, 0xCE, /* 0x24-0x27 */ + 0xA6, 0xB9, 0xA6, 0xCF, 0xA6, 0xD0, 0xA6, 0xB4, /* 0x28-0x2B */ + 0xA6, 0xA8, 0xA6, 0xD1, 0xA6, 0xD2, 0xA6, 0xB8, /* 0x2C-0x2F */ + 0xA6, 0xBD, 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xB3, /* 0x30-0x33 */ + 0xA6, 0xAA, 0xA6, 0xD5, 0xA6, 0xD6, 0xA6, 0xBA, /* 0x34-0x37 */ + 0xA6, 0xBF, 0xA6, 0xD7, 0xA6, 0xD8, 0xA6, 0xB5, /* 0x38-0x3B */ + 0xA6, 0xAB, 0xA6, 0xD9, 0xA6, 0xDA, 0xA6, 0xBB, /* 0x3C-0x3F */ + 0xA6, 0xDB, 0xA6, 0xDC, 0xA6, 0xC0, 0xA6, 0xDD, /* 0x40-0x43 */ + 0xA6, 0xDE, 0xA6, 0xDF, 0xA6, 0xE0, 0xA6, 0xE1, /* 0x44-0x47 */ + 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xE4, 0xA6, 0xB6, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xC6, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xE1, 0xA1, 0xE0, 0x00, 0x00, 0xA2, 0xC3, /* 0xA0-0xA3 */ + 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xCB, 0xA2, 0xCA, /* 0xA4-0xA7 */ + 0xA2, 0xC9, 0xA2, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE3, 0xA1, 0xE2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xBA, 0xA2, 0xB9, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA1, 0xE5, 0xA1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xA2, 0xB8, 0xA2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDF, 0xA1, 0xDE, /* 0xC4-0xC7 */ + 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, /* 0xC8-0xCB */ + 0x00, 
0x00, 0x00, 0x00, 0xA1, 0xDD, 0xA1, 0xDC, /* 0xCC-0xCF */ + 0xA2, 0xC4, 0xA2, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xD9, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xCF, 0xA2, 0xCE, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA2, 0xD0, 0x00, 0x00, 0xA2, 0xD1, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xCF, 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xBC, 0xA2, 0xBD, 0x00, 0x00, 0xA2, 0xC0, /* 0x60-0x63 */ + 0xA2, 0xBB, 0xA2, 0xBE, 0x00, 0x00, 0xA2, 0xBF, /* 0x64-0x67 */ + 0xA2, 0xCD, 0xA2, 0xDB, 0xA2, 0xDC, 0x00, 0x00, /* 0x68-0x6B */ + 0xA2, 0xDD, 0xA2, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */ + 0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */ + 0xA1, 0xBC, 0xA1, 0xBD, 0x00, 0x00, 0xA1, 0xEB, /* 0x10-0x13 */ + 0xA1, 0xB2, 0xA1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xAA, 0xA1, 0xAA, 0xA2, 0xAA, 0xA3, /* 0x40-0x43 */ + 0xAA, 0xA4, 0xAA, 0xA5, 0xAA, 0xA6, 0xAA, 0xA7, /* 0x44-0x47 */ + 0xAA, 0xA8, 0xAA, 0xA9, 0xAA, 0xAA, 0xAA, 0xAB, /* 0x48-0x4B */ + 0xAA, 0xAC, 0xAA, 0xAD, 0xAA, 0xAE, 0xAA, 0xAF, /* 0x4C-0x4F */ + 0xAA, 0xB0, 0xAA, 0xB1, 0xAA, 0xB2, 0xAA, 0xB3, /* 0x50-0x53 */ + 0xAA, 0xB4, 0xAA, 0xB5, 0xAA, 0xB6, 0xAA, 0xB7, /* 0x54-0x57 */ + 0xAA, 0xB8, 0xAA, 0xB9, 
0xAA, 0xBA, 0xAA, 0xBB, /* 0x58-0x5B */ + 0xAA, 0xBC, 0xAA, 0xBD, 0xAA, 0xBE, 0xAA, 0xBF, /* 0x5C-0x5F */ + 0xAA, 0xC0, 0xAA, 0xC1, 0xAA, 0xC2, 0xAA, 0xC3, /* 0x60-0x63 */ + 0xAA, 0xC4, 0xAA, 0xC5, 0xAA, 0xC6, 0xAA, 0xC7, /* 0x64-0x67 */ + 0xAA, 0xC8, 0xAA, 0xC9, 0xAA, 0xCA, 0xAA, 0xCB, /* 0x68-0x6B */ + 0xAA, 0xCC, 0xAA, 0xCD, 0xAA, 0xCE, 0xAA, 0xCF, /* 0x6C-0x6F */ + 0xAA, 0xD0, 0xAA, 0xD1, 0xAA, 0xD2, 0xAA, 0xD3, /* 0x70-0x73 */ + 0xAA, 0xD4, 0xAA, 0xD5, 0xAA, 0xD6, 0xAA, 0xD7, /* 0x74-0x77 */ + 0xAA, 0xD8, 0xAA, 0xD9, 0xAA, 0xDA, 0xAA, 0xDB, /* 0x78-0x7B */ + 0xAA, 0xDC, 0xAA, 0xDD, 0xAA, 0xDE, 0xAA, 0xDF, /* 0x7C-0x7F */ + + 0xAA, 0xE0, 0xAA, 0xE1, 0xAA, 0xE2, 0xAA, 0xE3, /* 0x80-0x83 */ + 0xAA, 0xE4, 0xAA, 0xE5, 0xAA, 0xE6, 0xAA, 0xE7, /* 0x84-0x87 */ + 0xAA, 0xE8, 0xAA, 0xE9, 0xAA, 0xEA, 0xAA, 0xEB, /* 0x88-0x8B */ + 0xAA, 0xEC, 0xAA, 0xED, 0xAA, 0xEE, 0xAA, 0xEF, /* 0x8C-0x8F */ + 0xAA, 0xF0, 0xAA, 0xF1, 0xAA, 0xF2, 0xAA, 0xF3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xAB, 0xA1, 0xAB, 0xA2, 0xAB, 0xA3, /* 0xA0-0xA3 */ + 0xAB, 0xA4, 0xAB, 0xA5, 0xAB, 0xA6, 0xAB, 0xA7, /* 0xA4-0xA7 */ + 0xAB, 0xA8, 0xAB, 0xA9, 0xAB, 0xAA, 0xAB, 0xAB, /* 0xA8-0xAB */ + 0xAB, 0xAC, 0xAB, 0xAD, 0xAB, 0xAE, 0xAB, 0xAF, /* 0xAC-0xAF */ + 0xAB, 0xB0, 0xAB, 0xB1, 0xAB, 0xB2, 0xAB, 0xB3, /* 0xB0-0xB3 */ + 0xAB, 0xB4, 0xAB, 0xB5, 0xAB, 0xB6, 0xAB, 0xB7, /* 0xB4-0xB7 */ + 0xAB, 0xB8, 0xAB, 0xB9, 0xAB, 0xBA, 0xAB, 0xBB, /* 0xB8-0xBB */ + 0xAB, 0xBC, 0xAB, 0xBD, 0xAB, 0xBE, 0xAB, 0xBF, /* 0xBC-0xBF */ + 0xAB, 0xC0, 0xAB, 0xC1, 0xAB, 0xC2, 0xAB, 0xC3, /* 0xC0-0xC3 */ + 0xAB, 0xC4, 0xAB, 0xC5, 0xAB, 0xC6, 0xAB, 0xC7, /* 0xC4-0xC7 */ + 0xAB, 0xC8, 0xAB, 0xC9, 0xAB, 0xCA, 0xAB, 0xCB, /* 0xC8-0xCB */ + 0xAB, 0xCC, 0xAB, 0xCD, 0xAB, 0xCE, 0xAB, 0xCF, /* 0xCC-0xCF */ + 0xAB, 0xD0, 0xAB, 0xD1, 0xAB, 0xD2, 0xAB, 0xD3, /* 0xD0-0xD3 */ + 0xAB, 0xD4, 0xAB, 0xD5, 0xAB, 0xD6, 0xAB, 0xD7, /* 0xD4-0xD7 */ + 0xAB, 0xD8, 0xAB, 0xD9, 0xAB, 0xDA, 0xAB, 0xDB, /* 0xD8-0xDB */ + 0xAB, 0xDC, 0xAB, 0xDD, 0xAB, 0xDE, 0xAB, 0xDF, /* 0xDC-0xDF */ + 0xAB, 0xE0, 0xAB, 0xE1, 0xAB, 0xE2, 0xAB, 0xE3, /* 0xE0-0xE3 */ + 0xAB, 0xE4, 0xAB, 0xE5, 0xAB, 0xE6, 0xAB, 0xE7, /* 0xE4-0xE7 */ + 0xAB, 0xE8, 0xAB, 0xE9, 0xAB, 0xEA, 0xAB, 0xEB, /* 0xE8-0xEB */ + 0xAB, 0xEC, 0xAB, 0xED, 0xAB, 0xEE, 0xAB, 0xEF, /* 0xEC-0xEF */ + 0xAB, 0xF0, 0xAB, 0xF1, 0xAB, 0xF2, 0xAB, 0xF3, /* 0xF0-0xF3 */ + 0xAB, 0xF4, 0xAB, 0xF5, 0xAB, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x30-0x33 */ + 0xA4, 0xA4, 0xA4, 0xA5, 
0xA4, 0xA6, 0xA4, 0xA7, /* 0x34-0x37 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x38-0x3B */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x3C-0x3F */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x40-0x43 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x44-0x47 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x48-0x4B */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x4C-0x4F */ + 0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x50-0x53 */ + 0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x54-0x57 */ + 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x58-0x5B */ + 0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x5C-0x5F */ + 0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x60-0x63 */ + 0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x64-0x67 */ + 0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x68-0x6B */ + 0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x6C-0x6F */ + 0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x70-0x73 */ + 0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x74-0x77 */ + 0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x78-0x7B */ + 0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x7C-0x7F */ + + 0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x80-0x83 */ + 0xA4, 0xF4, 0xA4, 0xF5, 0xA4, 0xF6, 0xA4, 0xF7, /* 0x84-0x87 */ + 0xA4, 0xF8, 0xA4, 0xF9, 0xA4, 0xFA, 0xA4, 0xFB, /* 0x88-0x8B */ + 0xA4, 0xFC, 0xA4, 0xFD, 0xA4, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE9, 0xEC, 0xA3, /* 0x90-0x93 */ + 0xDF, 0xB2, 0xDE, 0xCC, 0xDF, 0xBE, 0xF1, 0xE9, /* 0x94-0x97 */ + 0xF9, 0xBB, 0xCB, 0xA3, 0xEB, 0xE0, 0xDC, 0xB0, /* 0x98-0x9B */ + 0xEF, 0xCB, 0xF4, 0xB8, 0xF2, 0xA2, 0xEC, 0xD1, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, 0xA9, 0xB4, /* 0x00-0x03 */ + 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, 0xA9, 0xB8, /* 0x04-0x07 */ + 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, 0xA9, 0xBC, /* 0x08-0x0B */ + 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, 0xA9, 0xC0, /* 0x0C-0x0F */ + 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, 0xA9, 0xC4, /* 0x10-0x13 */ + 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, 0xA9, 0xC8, /* 0x14-0x17 */ + 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, 0xA9, 0xCC, /* 0x18-0x1B */ + 0xA2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x20-0x23 */ + 0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x24-0x27 */ + 0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x28-0x2B */ + 0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x2C-0x2F */ + 0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x30-0x33 */ + 0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x34-0x37 */ + 0xD6, 0xCC, 0xD3, 0xDB, 0xFB, 0xBC, 0xF9, 0xCA, /* 0x38-0x3B */ + 0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0x3C-0x3F */ + 0xF0, 0xAE, 0xFD, 0xCC, 0xED, 0xBB, 0xF2, 0xB8, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA8, 0xB1, 0xA8, 0xB2, 0xA8, 0xB3, 0xA8, 0xB4, /* 0x60-0x63 */ + 0xA8, 0xB5, 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB8, /* 0x64-0x67 */ + 0xA8, 0xB9, 0xA8, 0xBA, 
0xA8, 0xBB, 0xA8, 0xBC, /* 0x68-0x6B */ + 0xA8, 0xBD, 0xA8, 0xBE, 0xA8, 0xBF, 0xA8, 0xC0, /* 0x6C-0x6F */ + 0xA8, 0xC1, 0xA8, 0xC2, 0xA8, 0xC3, 0xA8, 0xC4, /* 0x70-0x73 */ + 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, 0xA8, 0xC8, /* 0x74-0x77 */ + 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, 0xA8, 0xCC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xDE, /* 0x7C-0x7F */ + + 0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x80-0x83 */ + 0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x84-0x87 */ + 0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x88-0x8B */ + 0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x8C-0x8F */ + 0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x90-0x93 */ + 0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x94-0x97 */ + 0xD6, 0xCC, 0xDD, 0xFA, 0xD1, 0xFB, 0xD2, 0xB3, /* 0x98-0x9B */ + 0xEE, 0xEA, 0xE9, 0xD0, 0xEC, 0xD4, 0xF1, 0xBC, /* 0x9C-0x9F */ + 0xFA, 0xA3, 0xFD, 0xCC, 0xDE, 0xD0, 0xEF, 0xE1, /* 0xA0-0xA3 */ + 0xDF, 0xBE, 0xF1, 0xE9, 0xF9, 0xBB, 0xF1, 0xA7, /* 0xA4-0xA7 */ + 0xE9, 0xD3, 0xEC, 0xA2, 0xF0, 0xF3, 0xF9, 0xCA, /* 0xA8-0xAB */ + 0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0xAC-0xAF */ + 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xA7, 0xC9, 0xA7, 0xCA, 0xA7, 0xCB, 0xA7, 0xCC, /* 0x80-0x83 */ + 0xA7, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xA7, 0xBA, 0xA7, 0xBB, 
0xA7, 0xDC, 0xA7, 0xDD, /* 0x88-0x8B */ + 0xA7, 0xDE, 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x8C-0x8F */ + 0xA7, 0xD4, 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD7, /* 0x90-0x93 */ + 0xA7, 0xD8, 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, /* 0x94-0x97 */ + 0xA7, 0xA5, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x98-0x9B */ + 0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x9C-0x9F */ + 0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xA7, /* 0xA0-0xA3 */ + 0xA7, 0xA8, 0xA7, 0xA9, 0xA7, 0xAA, 0xA7, 0xBD, /* 0xA4-0xA7 */ + 0xA7, 0xBE, 0xA7, 0xE5, 0xA7, 0xE6, 0xA7, 0xE7, /* 0xA8-0xAB */ + 0xA7, 0xE8, 0xA7, 0xE1, 0xA7, 0xE2, 0xA7, 0xE3, /* 0xAC-0xAF */ + 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, 0xA7, 0xC2, /* 0xB0-0xB3 */ + 0xA7, 0xC3, 0xA7, 0xC4, 0xA7, 0xC5, 0xA7, 0xC6, /* 0xB4-0xB7 */ + 0xA7, 0xC7, 0xA7, 0xC8, 0xA7, 0xCE, 0xA7, 0xCF, /* 0xB8-0xBB */ + 0xA7, 0xD0, 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, /* 0xBC-0xBF */ + 0xA7, 0xDA, 0xA7, 0xDB, 0xA2, 0xE3, 0xA7, 0xEC, /* 0xC0-0xC3 */ + 0xA7, 0xA6, 0xA7, 0xE0, 0xA7, 0xEF, 0xA2, 0xE1, /* 0xC4-0xC7 */ + 0xA7, 0xBC, 0xA7, 0xED, 0xA7, 0xB5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB9, /* 0xCC-0xCF */ + 0xA7, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xEB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xDF, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xA2, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xE4, /* 0xD8-0xDB */ + 0xA7, 0xEE, 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_4E[512] = { + 0xEC, 0xE9, 0xEF, 0xCB, 0x00, 0x00, 0xF6, 0xD2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB2, /* 0x04-0x07 */ + 0xED, 0xDB, 0xDF, 0xB2, 0xDF, 0xBE, 0xF9, 0xBB, /* 0x08-0x0B */ + 0x00, 0x00, 0xDC, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF3, 0xA6, 0xDD, 0xE0, 0xE1, 0xA6, 0x00, 0x00, /* 0x14-0x17 */ + 0xCE, 0xF8, 0xDC, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xF1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xFA, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFC, 0xAF, 0xD3, 0xA1, 0x00, 0x00, 0xF1, 0xAB, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD1, 0xD2, 0xAC, /* 0x40-0x43 */ + 0x00, 0x00, 0xCE, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0xDE, 0xBF, 0xFB, 0xBA, 0xF9, 0xB9, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD2, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xAB, 0xEB, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xCE, 0xFA, 0xCB, 0xF7, 0xE5, 0xA5, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE1, /* 0x68-0x6B */ + 0x00, 0x00, 0xD4, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE1, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE3, 0xDF, 0xAD, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 
0xEB, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xAF, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF5, 0x00, 0x00, /* 0x84-0x87 */ + 0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC0, /* 0x88-0x8B */ + 0xEC, 0xA3, 0x00, 0x00, 0xE9, 0xCD, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xA7, 0xE9, 0xF6, 0xFB, 0xBB, 0x00, 0x00, /* 0x90-0x93 */ + 0xE7, 0xE9, 0xEF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD0, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC1, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD8, 0xCC, 0xF9, 0xF1, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCE, 0xDF, 0xFA, 0xA4, 0xE6, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFA, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBD, /* 0xA8-0xAB */ + 0xCC, 0xC8, 0xEF, 0xCD, 0xD5, 0xD5, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE4, 0xA7, 0xEC, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF6, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xD1, 0xCB, 0xBF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xED, 0xA8, 0xDE, 0xC2, 0xF6, 0xE2, 0xED, 0xDC, /* 0xD4-0xD7 */ + 0xDC, 0xF5, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xD4, 0xCE, 0x00, 0x00, 0xF4, 0xB5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDB, /* 0xE0-0xE3 */ + 0xD6, 0xB5, 0xEC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE4, 0xE6, 0x00, 0x00, 0xF1, 0xEA, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEC, 0xCB, 0xC0, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_4F[512] = { + 0x00, 0x00, 0xD0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF9, 0xF2, 0xEC, 0xA5, 0xD0, 0xDF, /* 0x08-0x0B */ + 0x00, 0x00, 0xE7, 0xEA, 0xD0, 0xEB, 0xDC, 0xD1, /* 0x0C-0x0F */ + 0xDB, 0xE9, 0xFD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD7, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xDA, 0xE1, 0x00, 0x00, 0xD6, 0xB6, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0xDF, 0x00, 0x00, 0xDE, 0xC3, 0x00, 0x00, /* 0x38-0x3B */ + 0xDE, 0xC4, 0xCA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xEC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA3, 0xEE, 0xB7, /* 0x44-0x47 */ + 0xF8, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEA, 0xC8, 0xEE, 0xB8, 0xF1, 0xAC, /* 0x4C-0x4F */ + 0xF1, 0xA5, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF9, 0xBC, 0x00, 
0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xF9, 0xEC, 0xEA, 0xDD, 0xD6, /* 0x58-0x5B */ + 0xED, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xF8, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBA, /* 0x6C-0x6F */ + 0xDB, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA2, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCD, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xED, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xEB, 0xDE, 0xC5, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE3, 0xE0, 0x00, 0x00, 0xCA, 0xC9, /* 0x80-0x83 */ + 0xF2, 0xE9, 0x00, 0x00, 0xD5, 0xCE, 0x00, 0x00, /* 0x84-0x87 */ + 0xF6, 0xB6, 0x00, 0x00, 0xCE, 0xC2, 0xD6, 0xC7, /* 0x88-0x8B */ + 0x00, 0x00, 0xE3, 0xB4, 0x00, 0x00, 0xF1, 0xAD, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xC2, 0x00, 0x00, /* 0x94-0x97 */ + 0xF3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xEA, /* 0x98-0x9B */ + 0x00, 0x00, 0xEB, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB2, 0xFD, 0xA5, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF6, 0xD5, 0xD5, 0xE2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB5, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF5, 0xF5, 0xB5, /* 0xC0-0xC3 */ + 0xE4, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE7, 0xEB, 0xF1, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBB, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0xB5, 0x00, 0x00, 0xCC, 0xC9, /* 0xD0-0xD3 */ + 0xFA, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xD6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDC, 0xC1, 0x00, 0x00, 0xDE, 0xC6, /* 0xDC-0xDF */ + 0xFA, 0xEF, 0xE3, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, 0xDC, 0xF6, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0xFC, 0x00, 0x00, 0xDB, 0xC4, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF8, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDC, 0xE4, 0x00, 0x00, 0xE5, 0xEF, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_50[512] = { + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB1, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xD6, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF3, 0xDA, 0x00, 0x00, 0xCB, 0xC1, /* 0x08-0x0B */ + 0x00, 0x00, 0xDB, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD9, 0xFA, 0xD3, 0xEE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB8, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xFD, 0xA6, 0xEB, 0xEF, 0x00, 0x00, /* 0x18-0x1B */ + 0xF4, 0xA6, 0x00, 0x00, 0xCC, 0xCA, 0xF3, 0xA8, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF3, 0xDB, 0x00, 0x00, 0xDB, 0xA7, /* 0x20-0x23 */ + 0xF6, 0xB7, 0x00, 0x00, 0xCF, 0xE6, 0xF0, 0xF2, /* 0x24-0x27 */ + 0xCB, 0xDA, 0x00, 0x00, 0xE7, 0xD2, 0xD7, 0xC3, /* 0x28-0x2B */ + 0xF6, 0xF0, 0xE8, 0xDE, 0x00, 
0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE7, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA3, /* 0x44-0x47 */ + 0xCC, 0xA7, 0xEA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB6, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xFA, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x58-0x5B */ + 0xEF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xCB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xF6, 0xB0, 0xEF, 0xCF, 0xE9, 0xCF, 0x00, 0x00, /* 0x74-0x77 */ + 0xF7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCE, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xDC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDB, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCB, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xDF, 0xA1, 0xDD, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF5, 0xCA, 0xE9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEC, 0xEE, 0xEE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF3, 0xF0, 0x00, 0x00, 0xDF, 0xBF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD0, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF4, 0xD2, 0xE0, 0xBA, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC0, /* 0xCC-0xCF */ + 0x00, 0x00, 0xCE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDC, 0xD2, 0xFD, 0xEA, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0xE9, 0x00, 0x00, 0xE3, 0xAC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCA, 0xA4, 0x00, 0x00, 0xDB, 0xF8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0xEB, 0xF0, 0xF1, 0xD6, 0x00, 
0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xE2, 0x00, 0x00, 0xCC, 0xCC, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE3, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC1, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xD0, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB9, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xD3, 0x00, 0x00, /* 0x38-0x3B */ + 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE8, 0xB4, 0xEB, 0xC3, 0x00, 0x00, 0xEA, 0xAA, /* 0x40-0x43 */ + 0xFA, 0xFC, 0xF5, 0xF6, 0xF0, 0xBC, 0xFD, 0xD4, /* 0x44-0x47 */ + 0xE0, 0xBB, 0xCE, 0xC3, 0x00, 0x00, 0xD0, 0xBA, /* 0x48-0x4B */ + 0xF7, 0xBA, 0xD8, 0xF3, 0xF7, 0xCD, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAE, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEC, 0xFD, 0x00, 0x00, 0xD2, 0xAE, /* 0x64-0x67 */ + 0xEE, 0xEF, 0xD5, 0xD7, 0xEA, 0xE4, 0xF8, 0xA2, /* 0x68-0x6B */ + 0xCD, 0xEB, 0xD7, 0xBF, 0xFB, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCD, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xDC, 0xB2, 0xD0, 0xEC, 0xCE, 0xFD, /* 0x74-0x77 */ + 0xEE, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCC, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFC, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xEE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xD8, 0xF4, 0x00, 0x00, 0xE9, 0xB7, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD4, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, 0xD5, 0xD2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD6, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF4, 0xA2, 0x00, 0x00, 0xF1, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD5, 0xD8, 0x00, 0x00, 0xF0, 0xBD, /* 0xC8-0xCB */ + 0xD7, 0xD0, 0xD4, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD7, 0xCF, 0xEB, 0xEA, 0xFD, 0xEB, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xDB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xFC, 0xC5, 0xCB, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD5, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF4, 0xC8, 0xE8, 0xEA, 0xF5, 0xF3, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF9, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xD3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD3, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC2, 0xEF, 0xB7, /* 0x04-0x07 */ + 0xE7, 0xD4, 0x00, 0x00, 0xCA, 0xCA, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFB, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xFA, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xF4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xF7, 0xF7, 0xDC, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xD7, 0xD7, 0xDF, 0xA2, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xBE, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD3, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA4, 0xE1, 0xEC, /* 0x34-0x37 */ + 0xCF, 0xE7, 0xF3, 0xCB, 0xED, 0xA9, 0xCA, 0xBE, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCE, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xFB, 0xD0, 0xBB, /* 0x48-0x4B */ + 0xD5, 0xB7, 0xEE, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF4, 0xA8, 0x00, 0x00, 0xDC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA7, /* 0x58-0x5B */ + 0x00, 0x00, 0xDA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE0, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xED, 0xA5, 0xEE, 0xF2, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF9, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDC, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF3, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF8, 0xF2, 0x00, 0x00, 0xF4, 0xF9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF1, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBC, /* 0x84-0x87 */ + 0xDB, 0xF9, 0xD7, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xCB, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF0, 0xA5, 0xCB, 0xFD, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF4, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xED, /* 0x9C-0x9F */ + 0xCA, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAB, /* 0xA0-0xA3 */ + 0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xF0, 0xBE, 0xD2, 0xBD, 0xCC, 0xA4, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCC, 0xCD, 0x00, 0x00, 0xDA, 0xFA, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xCF, 0x00, 0x00, 0xE9, 0xB8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD8, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xD4, 0xD1, 0xE9, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCA, 0xEB, 0xD9, 0xE2, 0x00, 0x00, 0xFD, 0xB2, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE3, 0xAD, 0xD6, 0xCC, 0xD9, 0xB4, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0xEE, 0xD3, /* 0xE0-0xE3 */ + 0xD0, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB3, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xCF, 0xE8, 0x00, 0x00, 0xED, 0xC3, 0xD0, 0xB2, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFE, 0xDA, 0xA8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xF8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xFD, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xF8, 0xD1, 0x00, 0x00, 0xF8, 0xD2, /* 0x0C-0x0F */ + 0xDC, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xDD, 0xE2, 0xFB, 0xF9, 0xDD, 0xC1, /* 0x14-0x17 */ + 0x00, 0x00, 0xE3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xED, 0xDD, 0xCE, 0xC4, 0x00, 0x00, 0xCB, 0xA1, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDD, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFB, /* 0x3C-0x3F */ + 0xCF, 0xA1, 0xE4, 0xA8, 0x00, 0x00, 0xF4, 0xB6, /* 0x40-0x43 */ + 0xEC, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, /* 0x44-0x47 */ + 0xE7, 0xED, 0xFD, 0xC1, 0xDA, 0xE2, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD8, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDD, 0xE4, 0xF0, 0xEF, 0xF6, 0xF1, /* 0x50-0x53 */ + 0xFA, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF5, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xCF, 0x00, 0x00, /* 0x58-0x5B */ + 0xDC, 0xD4, 0x00, 0x00, 0xDC, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEF, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCF, 0x00, 0x00, /* 0x64-0x67 */ + 0xE0, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD6, /* 0x6C-0x6F */ + 0xEC, 0xD4, 0xEA, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCA, 0xBF, 0xD5, 0xB0, 0x00, 0x00, 0xCF, 0xE9, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF1, 0xED, 0x00, 0x00, 0xCC, 0xCF, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD7, 0xD8, 0x00, 0x00, 0xFD, 0xA7, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAB, /* 0x9C-0x9F */ + 0xF6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCF, 0xF0, 0xF9, 0xBD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE6, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE9, 0xD1, 0xF3, 0xA9, 0xD0, 0xE0, 0xE9, 0xD2, /* 0xC8-0xCB */ + 0x00, 0x00, 0xDA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE2, 0xD2, 0x00, 0x00, 0xF6, 0xA2, 0xE1, 0xF4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE7, 0xD5, 0xF5, 0xBF, 0xCF, 0xA2, /* 0xE0-0xE3 */ + 0xCD, 0xAF, 0xCF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCD, 0xB0, 0xF1, 0xFE, 0xD0, 0xA3, /* 0xE8-0xEB */ + 0xE1, 0xAF, 0xF8, 0xA3, 0x00, 0x00, 0xCA, 0xA6, /* 0xEC-0xEF */ + 0xF7, 0xBB, 0xF2, 0xEA, 0xDE, 0xC8, 0xE9, 0xD3, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDE, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xDE, /* 0x00-0x03 */ + 0xCA, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF9, 0xEA, 0xD1, 0xCE, 0xEE, 0xD4, 0x00, 0x00, /* 0x08-0x0B */ + 0xD4, 0xD2, 0xD9, 0xA3, 0xFD, 0xA8, 0xD7, 0xD9, /* 0x0C-0x0F */ + 0xF7, 0xCE, 0xFA, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0xD7, 0xF0, 0x00, 0x00, 0xEB, 0xE1, /* 0x1C-0x1F */ + 0xF8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xFA, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xDD, 0xC3, 0x00, 0x00, 0xF9, 0xDF, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEF, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFD, 0xE5, 0xF6, 0xA3, 0x00, 0x00, 0xD9, 0xFC, /* 0x38-0x3B */ + 0xFD, 0xA9, 0x00, 0x00, 0xE7, 0xEE, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE5, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xEF, 0xD0, 0x00, 0x00, 0xCD, 0xB1, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xF7, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF1, 0xB2, 0x00, 0x00, 0xF1, 0xB1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCD, 0xB2, 0x00, 0x00, 0xDA, 0xAB, /* 0x70-0x73 */ + 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE2, /* 0x78-0x7B */ + 0xFB, 0xBC, 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xFB, 0xFA, 0x00, 0x00, 0xCF, 0xA4, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF6, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xED, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA1, /* 0xA8-0xAB */ + 0xCE, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF9, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE4, 0xEE, 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xFB, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xF9, 0xEB, 0xEE, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEA, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xCA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF4, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCD, 0xD6, 0xFC, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD4, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF8, 0xA6, 0x00, 0x00, 0xDE, 0xCA, 0xF2, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDA, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xD8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE6, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF3, 0xDD, 
0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF6, 0xF2, 0x00, 0x00, 0xDF, 0xC2, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFD, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xBA, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE1, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF0, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCB, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0xBC, 0x00, 0x00, 0xF4, 0xCA, 0xD4, 0xFA, /* 0x84-0x87 */ + 0x00, 0x00, 0xFD, 0xAA, 0xF9, 0xE2, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xF4, 0xB7, 0xFD, 0xC2, 0xFC, 0xB0, 0x00, 0x00, /* 0x98-0x9B */ + 0xFD, 0xEC, 0xCA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBD, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0xE7, 0xDF, 0xC3, 0xD1, 0xD2, /* 0xA8-0xAB */ + 0xCE, 0xE2, 0x00, 0x00, 0xD3, 0xA4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xFD, 0xAB, 0x00, 0x00, 0xDF, 0xE0, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF2, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF0, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD0, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCB, /* 0xE0-0xE3 */ + 0xF6, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xF5, 0xF1, 0xB3, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 
0xF7, 0xA3, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCA, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xCF, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC4, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB0, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBF, 0x00, 0x00, /* 0x30-0x33 */ + 0xF6, 0xA4, 0x00, 0x00, 0xE3, 0xB6, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD0, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xED, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xDD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF7, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xDE, 0xAD, 0x00, 0x00, 0xFA, 0xBF, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xFD, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF5, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF6, 0xDE, 0xCC, /* 0xD8-0xDB */ + 0x00, 0x00, 
0x00, 0x00, 0xFC, 0xDE, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEC, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xCD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD6, 0xB7, 0xCD, 0xB3, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_57[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD5, /* 0x00-0x03 */ + 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCF, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD0, /* 0x08-0x0B */ + 0x00, 0x00, 0xEA, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAE, 0xEA, 0xAD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF1, 0x00, 0x00, /* 0x14-0x17 */ + 0xD3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xCF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xEE, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD0, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF2, 0xA3, 0x00, 0x00, 0xF7, 0xF8, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA9, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD3, 0xBB, 0xCA, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF1, 0xA6, 0xCB, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xCD, 0xDE, 0x00, 0x00, 0xF7, 0xA4, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC0, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDD, 0x00, 0x00, /* 0x6C-0x6F */ + 0xCC, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xCF, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF7, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD3, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xFE, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA7, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEB, 0xD9, 0x00, 0x00, 0xCF, 0xA7, 0xEA, 0xAF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, /* 0xC4-0xC7 */ + 0xF1, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD8, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF2, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB4, /* 0xDC-0xDF */ + 0xDC, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF3, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDB, 0xC6, 0xD0, 0xF1, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD0, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xCF, 0xDC, 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xCC, 0xB1, 0xF7, 0xD8, 0x00, 0x00, /* 0x04-0x07 */ + 0xCB, 0xA8, 0xEB, 0xBC, 0xE4, 0xBE, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDC, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xDC, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xF0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC0, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xED, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEB, /* 0x2C-0x2F */ + 0xE5, 0xE8, 0xDC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xED, 0xDE, 0xD3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD4, 0xE7, 0xAB, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC3, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF7, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF3, /* 0x54-0x57 */ + 0xD3, 0xD2, 0x00, 0x00, 0xF5, 0xC0, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xDD, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xEE, 0xF3, 0xE7, 0xF1, 0x00, 0x00, /* 0x60-0x63 */ + 0xFD, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xEE, 0xF4, 0x00, 0x00, 0xE2, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD1, /* 0x80-0x83 */ + 0x00, 0x00, 0xDF, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 
0xE9, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD7, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF5, 0xCD, 0x00, 0x00, 0xF1, 0xF2, 0xFA, 0xC7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xD9, 0xF8, 0xD4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE5, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF2, 0xED, 0xDF, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE8, 0xB5, 0x00, 0x00, 0xD3, 0xA6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB5, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF9, 0xC9, 0x00, 0x00, 0xE4, 0xE2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xFB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD7, 0xA4, 0xCE, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD5, 0xD6, 0xE6, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCD, /* 0xE8-0xEB */ + 0xEC, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE0, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEC, 0xEC, 0xFB, 0xBE, 0xDF, 0xEB, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBE, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD0, 0xF3, 0xE0, 0xAA, 0xE8, 0xE2, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0xD4, 0xD2, 0xFD, 0x00, 0x00, /* 0x18-0x1B */ + 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDE, /* 0x24-0x27 */ + 0x00, 0x00, 0xF4, 0xB8, 0xF7, 0xBC, 0xDC, 0xFD, /* 0x28-0x2B */ + 0x00, 0x00, 0xE8, 0xEC, 0xE4, 0xE7, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA8, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF4, /* 0x44-0x47 */ + 0xD2, 0xAF, 0xDC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA5, 0xF1, 0xB4, /* 0x4C-0x4F */ + 0xFC, 0xB1, 0xCC, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xDD, 0xC6, 0xFA, 0xD1, 0x00, 0x00, 0xF7, 0xDF, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA8, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEE, 0xF5, 0x00, 0x00, 0xDE, 0xCE, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF3, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xAC, 0xEB, 0xC4, /* 0x68-0x6B */ + 0xED, 0xE1, 0xE0, 0xAB, 0xDD, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB3, /* 0x70-0x73 */ + 0xD2, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xFB, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xFD, 0xDD, 0xE5, /* 0x80-0x83 */ + 0xD8, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xED, 0xD0, 0xD2, /* 0x94-0x97 */ + 0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF6, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xDB, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF7, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD8, 0xD9, 0x00, 0x00, 0xF4, 0xA3, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDD, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0xAB, 0x00, 0x00, 0xE3, 0xB7, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xEE, 0xBB, 0xCD, 0xB4, 0x00, 0x00, 0xE0, 0xF3, /* 0xD0-0xD3 */ + 0xEA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEC, 0xF5, 0xE8, 0xEE, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCB, 0xA9, 0xF1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCD, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEC, 0xA9, 0x00, 0x00, 0xF2, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */ + 0xFD, 0xEF, 0x00, 0x00, 0xF9, 0xF3, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x00, 0x00, 0xEA, 0xCE, 0x00, 0x00, 0xE8, 0xDF, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF4, /* 0x18-0x1B */ + 0xD1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC2, /* 0x1C-0x1F */ + 0xE3, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xD8, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA5, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xF3, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD7, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xE6, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE6, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xFE, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xDA, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAC, 0xEA, 0xB0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCA, 0xAA, 0xE1, 0xF9, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xFA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF4, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xD2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xFD, 0xF0, 0x00, 0x00, 0xE0, 0xBD, /* 0x08-0x0B */ + 0xCE, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC6, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDF, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xED, 0xAD, 0xFA, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCD, 0xEE, 0xED, 0xA6, 0x00, 0x00, 0xED, 0xAE, /* 0x54-0x57 */ + 0xF0, 0xED, 0x00, 0x00, 0xDD, 0xA1, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xAF, 0xFC, 0xF8, 0x00, 0x00, 0xD8, 0xEB, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF9, /* 0x60-0x63 */ + 0xCD, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFA, 0xA9, 0x00, 0x00, 0xE1, 0xDD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0xD5, 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xDD, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xF9, 0xCA, 0x00, 0x00, 0xEA, 0xE8, 0x00, 0x00, /* 0x78-0x7B */ + 0xE5, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xEB, 0x00, 0x00, 0xE9, 0xD4, /* 0x84-0x87 */ + 0xE1, 0xFA, 0xE4, 0xCC, 0x00, 0x00, 0xE1, 0xE4, /* 0x88-0x8B */ + 0xE8, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xB5, 0xFC, 0xF3, 0xF0, 0xF3, /* 0x94-0x97 */ + 0xCE, 0xAF, 0xF1, 0xB5, 0xEF, 0xD2, 0xE8, 0xC8, /* 0x98-0x9B */ + 0xEB, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD4, 0xE0, 0xBE, /* 0xA0-0xA3 */ + 0xE3, 0xF8, 0xEA, 0xE9, 0xFC, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0xF4, 0x00, 0x00, 0xCF, 0xE0, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAA, /* 0xB0-0xB3 */ + 0xE6, 0xC3, 0xE1, 0xB2, 0xCA, 0xAB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE3, 0xE4, 0xE9, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD6, /* 0xBC-0xBF */ + 0xF3, 0xF2, 0x00, 0x00, 0xEE, 0xD6, 0xEA, 0xB2, /* 0xC0-0xC3 */ + 0xD0, 0xF6, 0xEC, 0xD9, 0xDA, 0xCB, 0xCF, 0xA8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD8, 0xDB, 0x00, 0x00, 0xF9, 0xCE, 0xE9, 0xD5, /* 0xD0-0xD3 */ + 0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAC, 0xF3, 0xCC, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCD, 0xFB, 0xF6, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE7, 0xF5, 0xE8, 0xEF, 0xE3, 0xF9, 0xD2, 0xBB, /* 0xE4-0xE7 */ + 0xF3, 0xF3, 0xE3, 0xFB, 0x00, 0x00, 0xDE, 0xD0, /* 0xE8-0xEB */ + 0xCE, 0xB0, 0x00, 0x00, 0xD6, 0xF7, 0xF1, 0xD9, /* 0xEC-0xEF */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF5, 0xC1, 0xDC, 0xC4, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xBB, 0x00, 0x00, 0xDE, 0xD1, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0xDC, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xDE, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE2, /* 0x04-0x07 */ + 0xEE, 0xF6, 0xEA, 0xCF, 0xF0, 0xEE, 0xE3, 0xFC, /* 0x08-0x0B */ + 0x00, 0x00, 0xD3, 0xDF, 0xD3, 0xF4, 0xE1, 0xB3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD3, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDF, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xDB, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF6, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0xB9, 0xEB, 0xC5, 0xF4, 0xA9, 0xCD, 0xB6, /* 0x38-0x3B */ + 0xD2, 0xF9, 0x00, 0x00, 0xDA, 0xAD, 0xD2, 0xE3, /* 0x3C-0x3F */ + 0xCF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCB, 0xDC, 0xCC, 0xFA, 0x00, 0x00, /* 0x44-0x47 */ + 0xCF, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0x48-0x4B */ + 0x00, 0x00, 0xE3, 0xBB, 0xE3, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE0, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xEE, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD3, 0xF5, 0x00, 0x00, 0xD7, 0xA6, 0x00, 0x00, /* 0x60-0x63 */ + 0xF6, 0xB5, 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xEA, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xFD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD0, 0xF7, 0xED, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCB, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE1, 0xFB, /* 0xA8-0xAB */ + 0xCB, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD3, 0xE0, 0x00, 0x00, 0xE4, 0xBF, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xFB, 0xC0, 0x00, 0x00, 0xDA, 0xBE, /* 0xB4-0xB7 */ + 0xE4, 0xCD, 0x00, 0x00, 0xD6, 0xB9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xC0, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF6, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE7, /* 0xEC-0xEF */ + 0xDC, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xFA, 0xD6, 0x00, 0x00, 0xD3, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDA, /* 0xF8-0xFB */ + 0x00, 0x00, 0xFA, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFD, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xD5, 0xCF, 0xD0, 0xF8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xCD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF5, 0xCB, 0x00, 0x00, 0xE4, 0xF0, 0xCB, 0xAB, /* 0x14-0x17 */ + 0x00, 0x00, 0xD7, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFE, /* 0x24-0x27 */ + 0x00, 0x00, 0xDD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAE, /* 0x48-0x4B */ + 0xCA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD5, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA9, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF7, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 
0x00, 0xD4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xE4, 0x00, 0x00, 0xE8, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF5, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE7, 0xAE, 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */ + 0xDF, 0xEC, 0xE4, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF4, 0xB9, 0xF1, 0xB6, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE2, 0xDE, 0xE1, 0xB5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xCD, 0xEF, 0xF1, 0xA7, 0xCE, 0xE5, /* 0xE4-0xE7 */ + 0xCB, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE3, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD0, 0xF9, 0xEC, 0xAB, 0xDE, 0xD3, /* 0xF0-0xF3 */ + 0xF7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xDE, 0xCB, 0xEE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xF8, 0xD6, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xEE, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xFD, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF7, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDE, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF2, 0xED, 0x00, 0x00, 0xDB, 0xD9, /* 0x18-0x1B */ + 0x00, 0x00, 0xF0, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD4, /* 0x28-0x2B */ + 0x00, 0x00, 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE1, 0x00, 0x00, /* 0x34-0x37 */ + 0xDF, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xD9, 0xB6, 0x00, 0x00, 0xFD, 0xAC, /* 0x3C-0x3F */ + 0xEF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE4, 0xC1, 0xF8, 0xEB, 0x00, 0x00, 0xDB, 0xAC, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xFC, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xD8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDB, 0xDF, 0xD3, 0xD3, 0xF8, 0xC7, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 
0x00, 0x00, 0xCA, 0xCE, 0xF8, 0xC1, /* 0x70-0x73 */ + 0xD2, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB4, /* 0x74-0x77 */ + 0xFA, 0xB9, 0xCA, 0xCF, 0x00, 0x00, 0xFC, 0xB3, /* 0x78-0x7B */ + 0xEA, 0xEA, 0xEA, 0xEB, 0xD0, 0xFA, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xED, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE7, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC9, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xED, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEE, 0xBC, 0x00, 0x00, 0xEF, 0xC1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD2, 0x00, 0x00, /* 0x98-0x9B */ + 0xDD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDF, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF8, 0xF1, 0xA8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB7, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0xDD, 0xDF, 0xEE, 0xCB, 0xAC, /* 0xB4-0xB7 */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEC, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xCB, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xF9, 0xBF, 0xD6, 0xAF, 0xD5, 0xC6, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xCF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xB7, 0xEE, 0xF8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD9, /* 0xDC-0xDF */ + 0xF3, 0xDF, 0x00, 0x00, 0xF8, 0xC8, 0xCE, 0xC6, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD5, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE6, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC5, 0xEF, 0xD5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEF, 0xFC, 0xDF, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0xDC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD2, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, 0xCF, 0xE1, /* 0x10-0x13 */ + 0xF0, 0xC0, 0xEC, 0xDA, 0x00, 0x00, 0xDD, 0xD7, /* 0x14-0x17 */ + 0xFB, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xAC, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA9, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD7, 0xFB, 0xC1, /* 0x24-0x27 */ + 0x00, 0x00, 0xD2, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE5, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xED, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0xA5, 
0x00, 0x00, 0xCB, 0xAE, 0x00, 0x00, /* 0x48-0x4B */ + 0xDA, 0xAF, 0x00, 0x00, 0xD8, 0xB6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA7, 0xFB, 0xB2, /* 0x54-0x57 */ + 0x00, 0x00, 0xFD, 0xC4, 0x00, 0x00, 0xEC, 0xAD, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xA1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE9, 0xE9, 0xEE, /* 0x64-0x67 */ + 0x00, 0x00, 0xF3, 0xF4, 0xF8, 0xF3, 0xF0, 0xC1, /* 0x68-0x6B */ + 0xDE, 0xAF, 0xF8, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF3, 0xE0, 0xE7, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAD, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF9, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD8, /* 0x7C-0x7F */ + + 0xE8, 0xD9, 0xEF, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xE2, 0x00, 0x00, 0xE2, 0xDF, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE0, 0xD7, 0xC8, /* 0x88-0x8B */ + 0xFD, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDF, 0xEF, 0xCC, 0xD3, 0xD3, 0xF9, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF0, /* 0x94-0x97 */ + 0xDB, 0xC7, 0xDE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF4, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD5, 0xD0, 0xE5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFC, 0xC7, 0xDC, 0xD6, 0xE2, 0xE0, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB0, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF3, 0xA3, 0x00, 0x00, 0xD3, 0xEC, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xFD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xD0, 0xFB, 0xEC, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xBC, 0xF2, 0xA4, /* 0xD4-0xD7 */ + 0xD8, 0xCE, 0xD8, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF5, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xD2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xFB, 0xEC, 0x00, 0x00, 0xDD, 0xC8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE8, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xC1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD7, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xD6, 0xBB, 
0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF7, 0xBD, 0xEC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD0, 0xE1, 0x00, 0x00, 0xE0, 0xF5, /* 0x24-0x27 */ + 0xEA, 0xB3, 0x00, 0x00, 0xCE, 0xD6, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xA5, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEC, 0xF6, 0xE2, 0xE1, 0xE3, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFC, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCD, 0xF0, 0x00, 0x00, 0xF9, 0xF6, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xDF, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xCE, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE1, 0xED, 0xB0, /* 0x60-0x63 */ + 0xFD, 0xD1, 0xF6, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF9, 0xCF, 0xEB, 0xDA, 0xCA, 0xC1, 0x00, 0x00, /* 0x68-0x6B */ + 0xD2, 0xB8, 0xCD, 0xF1, 0x00, 0x00, 0xE3, 0xD3, /* 0x6C-0x6F */ + 0xFD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE3, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xF0, 0xAA, 0xF9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xFC, 0xE2, 0x00, 0x00, 0xF8, 0xA7, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0xEE, 0xF9, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF6, /* 0x9C-0x9F */ + 0xEA, 0xED, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xB4, /* 0xA0-0xA3 */ + 0xF5, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF0, 0xF5, 0x00, 0x00, 0xDD, 0xE8, 0xD3, 0xED, /* 0xB0-0xB3 */ + 0xF5, 0xFC, 0x00, 0x00, 0xDA, 0xBF, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD3, 0xFA, 0xF4, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEF, 0xD7, 0x00, 0x00, 0xD4, 0xC3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFB, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xED, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE0, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEE, /* 0xDC-0xDF */ + 0xFB, 0xB3, 0xE4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF6, 0xE7, 0xD2, 0xDD, 0x00, 0x00, 0xDF, 0xCC, /* 0xF0-0xF3 */ + 
0x00, 0x00, 0x00, 0x00, 0xFC, 0xC9, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE5, 0xA9, 0xE0, 0xF6, 0xF6, 0xB3, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_61[512] = { + 0x00, 0x00, 0xE1, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, 0x00, 0x00, /* 0x04-0x07 */ + 0xEA, 0xEF, 0xEA, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xDA, 0xC0, 0xF8, 0xB4, 0xEB, 0xF2, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xD7, 0xE4, 0xF1, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xEF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xFC, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xF3, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC4, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xE3, 0xE5, 0x00, 0x00, 0xCB, 0xC5, 0xEA, 0xB4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBD, 0x00, 0x00, /* 0x40-0x43 */ + 0xD7, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xDB, /* 0x44-0x47 */ + 0xED, 0xB1, 0x00, 0x00, 0xCC, 0xC3, 0xF7, 0xBE, /* 0x48-0x4B */ + 0xFC, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF4, /* 0x50-0x53 */ + 0x00, 0x00, 0xD9, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF3, 0xD3, 0xF3, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF7, 0xE4, 0x00, 0x00, 0xF7, 0xD1, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB7, 0xCE, 0xB1, /* 0x60-0x63 */ + 0xCA, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB4, /* 0x64-0x67 */ + 0xCB, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF6, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE7, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEA, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD4, 0xCB, 0xAF, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF4, 0xAA, 0xE9, 0xAF, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xF5, 0xC3, 0xE9, 0xD8, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE9, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD5, 0xFB, 0xDE, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xF4, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xFD, 0xF3, 0xFD, 0xF2, 0xF7, 0xA6, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xDD, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD3, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCC, 0xA8, 0x00, 0x00, 0xDA, 0xC1, /* 0xA8-0xAB */ + 0xCC, 0xD5, 0x00, 0x00, 0xD9, 0xE4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xBC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD0, /* 0xC4-0xC7 */ + 0xFA, 0xAB, 0xEB, 0xEB, 0xE7, 0xF8, 0xD9, 0xE5, /* 0xC8-0xCB */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA4, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xFB, 0xFC, 0xE3, /* 0xF4-0xF7 */ + 0xFA, 0xD8, 0x00, 0x00, 0xF3, 0xD5, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0xD5, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD4, /* 0x04-0x07 */ + 0xCD, 0xFC, 0x00, 0x00, 0xD9, 0xE6, 0x00, 0x00, /* 0x08-0x0B */ + 0xE2, 0xF9, 0xE2, 0xA1, 0xEB, 0xD4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE0, 0xF7, 0xE4, 0xB2, 0xCC, 0xFC, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xE4, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xAB, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBD, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB8, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xC0, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEE, 0xFA, 0xFD, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD3, 0xE3, 0x00, 0x00, 0xFB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE8, 0xDB, 0xAE, /* 0x3C-0x3F */ + 0xE1, 0xB6, 0xF8, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBF, /* 0x44-0x47 */ + 0xFB, 0xC3, 0xDD, 0xEA, 0x00, 0x00, 0xE2, 0xA2, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE8, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF6, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xCA, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xD0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDD, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAF, /* 0x7C-0x7F */ + + 0xD0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xCC, 0xBC, 0xF7, 0xEA, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE5, 0xE4, 0xDF, 0xF1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xE1, 0x00, 0x00, 0xF9, 0xF7, /* 0x94-0x97 */ + 0xEF, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE4, 0xE3, 0xF5, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD9, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE7, /* 0xC4-0xC7 */ + 0xD2, 0xB9, 0xD5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDA, 0xE5, 0xDA, 0xD0, 0x00, 0x00, 0xD1, 0xD9, /* 0xCC-0xCF */ + 0xCE, 0xD8, 0x00, 0x00, 0xCB, 0xDE, 0xF4, 0xAC, /* 0xD0-0xD3 */ + 0xDA, 0xFB, 0x00, 0x00, 0xF6, 0xE9, 0xE8, 0xF3, /* 0xD4-0xD7 */ + 0xCF, 0xAC, 0xF0, 0xF0, 0x00, 0x00, 0xF4, 0xFD, /* 0xD8-0xDB */ + 0xDB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCE, 0xC0, 0xE3, 0xD4, 0xD1, 0xCF, 0xF1, 0xF5, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCD, 0xF2, 0x00, 0x00, 0xCF, 0xEB, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA6, 0xD1, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x00, 0x00, 0xF2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD8, 0xE6, 0xC9, /* 0x38-0x3B */ + 0x00, 0x00, 0xD8, 0xB8, 0xFA, 0xF3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF8, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF3, /* 0x4C-0x4F */ + 0xE6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF8, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE9, /* 0x64-0x67 */ + 0xDE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEC, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDF, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF4, 0xD2, 0xBA, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE2, 0xA3, 0xD3, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, 0x00, 0x00, /* 0x94-0x97 */ + 0xCF, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD5, 0xD3, 0xF3, 0xF5, 0xF7, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEF, 0xC8, 0x00, 0x00, 0xCD, 0xF3, /* 0xA4-0xA7 */ + 0xF5, 0xCF, 0xE5, 0xF3, 0xF0, 0xC2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCA, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xF1, 0x00, 0x00, 0xD0, 0xA6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDA, /* 0xCC-0xCF */ + 0xF0, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE7, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC0, 0xFC, 0xB5, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE4, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCC, 0xA9, 0xFD, 0xC6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEA, 0xB5, 0x00, 0x00, 0xE5, 0xAA, 0xDF, 0xBA, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0xDF, 0x00, 0x00, 0xDA, 0xD1, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE1, 0xB8, 0x00, 0x00, 0xE8, 0xF4, 0xD3, 0xFD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xE2, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCA, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xDA, 0xE6, 0xF7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCD, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB6, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xEE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xF5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA7, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xD9, 0xB8, 0xD9, 0xB9, 0xEF, 0xC9, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF7, 0xCB, 0xDF, 0xAE, 0xE8, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB5, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF4, 0xCC, 0xDA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE8, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF7, 0xEB, 0xF5, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF3, 0xBC, 0x00, 0x00, 0xDA, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB5, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD6, 0xCF, 0xF4, 0xBA, 0x00, 0x00, 0xF7, 0xC9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xAA, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF0, 0xC3, 0xCC, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD3, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDB, 0xFB, 0x00, 0x00, 0xCB, 0xE0, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD3, 0xE4, 0xF6, 0xF7, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD5, 0xBA, 0xF3, 0xCD, 0xCB, 0xE1, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xEB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xAD, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xFC, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF6, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xDA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF7, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFD, /* 0x20-0x23 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE6, 0xFC, 0xAB, /* 0x28-0x2B */ + 0xD5, 0xBB, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xA5, 0xCD, 0xB9, /* 0x34-0x37 */ + 0xEA, 0xF2, 0xCB, 0xC7, 0x00, 0x00, 0xCD, 0xF4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAF, 0xEF, 0xD9, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCD, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xFC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xDF, 0xF3, 0xCE, 0xE7, 0xDA, 0xC2, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCF, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, 0xF8, 0xA8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xE2, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xF2, 0xDF, 0xA4, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC4, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xCC, 0xD7, 0xE5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBB, 0x00, 0x00, /* 0x70-0x73 */ + 0xEF, 0xDA, 0xEE, 0xD8, 0x00, 0x00, 0xDD, 0xA7, /* 0x74-0x77 */ + 0xE2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC0, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xB0, 0xF8, 0xCA, /* 0x80-0x83 */ + 0x00, 0x00, 0xFC, 0xFA, 0x00, 0x00, 0xD9, 0xFE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xDE, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDD, 0xEC, 0xDA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE0, /* 0x94-0x97 */ + 0x00, 0x00, 0xD6, 0xF9, 0x00, 0x00, 0xCD, 0xD7, /* 0x98-0x9B */ + 0xDE, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF8, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD0, 0xC5, 0xF4, 0xAE, 0x00, 0x00, 0xDD, 0xA8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC5, /* 0xA8-0xAB */ + 0xF3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD9, /* 0xAC-0xAF */ + 0xE3, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDB, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE5, 0xDA, 0xE3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDB, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC1, /* 0xC8-0xCB */ + 0xEF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE9, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFD, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEC, 0xED, 0xD3, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF2, 0xA9, 0xF0, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE2, 0xE2, 0xE9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xF9, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE9, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDA, 0xDA, 0xC3, /* 0xF8-0xFB 
*/ + 0xDA, 0xC4, 0xD4, 0xC5, 0x00, 0x00, 0xE7, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xE3, 0xB0, /* 0x04-0x07 */ + 0x00, 0x00, 0xDB, 0xB2, 0xFB, 0xC4, 0x00, 0x00, /* 0x08-0x0B */ + 0xF3, 0xE3, 0x00, 0x00, 0xD9, 0xA5, 0xFB, 0xE7, /* 0x0C-0x0F */ + 0xDD, 0xCB, 0xD0, 0xD4, 0x00, 0x00, 0xE6, 0xB6, /* 0x10-0x13 */ + 0xE0, 0xAE, 0xFD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB5, 0xE0, 0xF8, /* 0x1C-0x1F */ + 0xE7, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF5, 0xF0, 0x00, 0x00, 0xD8, 0xDC, /* 0x24-0x27 */ + 0xED, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, 0xE3, 0xC0, /* 0x2C-0x2F */ + 0xF9, 0xC0, 0xE9, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD9, 0xDB, 0x00, 0x00, 0xF3, 0xE4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB6, 0xE4, 0xE9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xF0, 0xC5, 0xE3, 0xC1, 0xFC, 0xCC, /* 0x40-0x43 */ + 0xFC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF2, 0xCB, 0x00, 0x00, 0xF2, 0xCC, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF1, 0xDB, 0x00, 0x00, 0xFA, 0xD9, /* 0x58-0x5B */ + 0x00, 0x00, 0xF1, 0xB8, 0xFD, 0xF5, 0xE0, 0xF9, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE7, 0xFB, 0xFC, 0xB7, 0xFC, 0xE4, 0xFB, 0xC5, /* 0x64-0x67 */ + 0xE3, 0xE7, 0xD8, 0xB9, 0x00, 0x00, 0xF6, 0xF8, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, 0xCC, 0xD8, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAF, /* 0x70-0x73 */ + 0xF4, 0xE7, 0x00, 0x00, 0xEF, 0xDC, 0xCF, 0xFC, /* 0x74-0x77 */ + 0xEF, 0xDD, 0x00, 0x00, 0xF2, 0xAA, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xFD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAC, /* 0x84-0x87 */ + 0xFD, 0xBB, 0xFD, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB2, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xD1, 0xDF, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xE4, 0xDE, /* 0x94-0x97 */ + 0xE5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xD9, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCD, 0xBC, 0x00, 0x00, 0xF3, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xE7, 0xFB, 0xB5, /* 0xB0-0xB3 */ + 0xF8, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0xE7, 0x00, 0x00, 0xCC, 0xD9, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE7, 0xA5, 0x00, 0x00, 0xD5, 0xF5, 0xD3, 0xBE, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xFC, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF2, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDF, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0xF8, 0xF8, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xF6, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE8, 0xD8, 0x00, 0x00, 0xCD, 0xD8, 0xE7, 0xD6, /* 0xF0-0xF3 */ + 0xCC, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE3, /* 0xF4-0xF7 */ + 0xDF, 0xF6, 0xF0, 0xC7, 0xF0, 0xC6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD8, 0xBA, 0x00, 0x00, 0xF1, 0xF4, 0xF4, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xF5, 0xCC, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xEA, 0xC5, 0xEA, 0xF3, 0x00, 0x00, 0xDD, 0xDB, /* 0x08-0x0B */ + 0x00, 0x00, 0xDC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xDE, 0xFD, 0xF2, 0xF9, 0x00, 0x00, 0xD5, 0xC7, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD0, /* 0x18-0x1B */ + 0x00, 0x00, 0xF0, 0xC8, 0xD1, 0xA1, 0xD1, 0xA2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD4, 0xD6, 0xE8, /* 0x24-0x27 */ + 0xD9, 0xCA, 0x00, 0x00, 0xDA, 0xB1, 0xD8, 0xC7, /* 0x28-0x2B */ + 0xDC, 0xE2, 0xF3, 0xCE, 0xF5, 0xF4, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF1, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xDA, 0xD3, 0x00, 0x00, 0xF6, 0xEA, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF5, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD2, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xDF, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDD, 0xFA, 0xBA, /* 0x4C-0x4F */ + 0xEE, 0xA7, 0xF5, 0xBD, 0x00, 0x00, 0xF8, 0xF5, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xE8, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xE1, 0x00, 0x00, 0xD1, 0xA3, 0xE1, 0xD6, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0xF8, 0x00, 0x00, 0xDB, 0xCA, /* 0x6C-0x6F */ + 0xCB, 0xF9, 0xD4, 0xD4, 0x00, 0x00, 0xD9, 0xDC, /* 0x70-0x73 */ + 0x00, 0x00, 0xEE, 0xBE, 0x00, 0x00, 0xF7, 0xED, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xEE, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0xF7, 0xF9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xED, /* 0x84-0x87 */ + 0x00, 0x00, 0xE8, 0xDB, 0x00, 0x00, 0xDB, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF7, /* 0x8C-0x8F */ + 0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE2, /* 0x90-0x93 */ + 0x00, 0x00, 0xF6, 0xD7, 0x00, 0x00, 0xD7, 0xF9, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xDD, 0x00, 0x00, /* 0x98-0x9B */ + 0xCD, 0xFD, 0xF2, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBD, /* 0xAC-0xAF */ + 0xF8, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xAC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAD, 0xCA, 0xAE, /* 0xB4-0xB7 */ + 0xCF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xDA, /* 0xCC-0xCF */ + 0xD9, 0xBB, 0xCA, 0xF3, 0xF6, 0xD3, 0xE6, 0xF8, /* 0xD0-0xD3 */ + 0xEA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCA, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAF, /* 0xEC-0xEF */ + 0xD2, 0xB0, 0xF1, 0xBA, 0x00, 0x00, 0xD7, 0xB3, /* 0xF0-0xF3 */ + 0xE3, 0xC3, 0xF3, 0xFD, 0xDE, 0xDA, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE3, 0xEE, 0xFB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF7, 0xD7, 0xCA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCE, 0xE8, 0xDB, 0xDB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBB, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFA, 0xB7, 0xD0, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xCC, 0xAB, 0xEE, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCB, 0xFA, 0xF9, 0xF9, 0xCC, 0xFD, 0xD3, 0xFE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xEE, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD4, 0xD5, 0xDF, 0xCD, 0x00, 0x00, 0xFC, 0xB8, /* 0x50-0x53 */ + 0xD1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF2, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD2, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD8, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD9, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA9, /* 0x90-0x93 */ + 0xF6, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xDB, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF0, 0xC9, 0x00, 0x00, 0xFC, 0xFC, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE8, 0xC9, 0xF4, 0xFE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFC, /* 0xA4-0xA7 */ + 0xD7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xDC, 0x00, 0x00, 0xF0, 0xAC, /* 0xAC-0xAF */ + 0xCC, 0xFE, 0xCD, 0xE1, 0x00, 0x00, 0xE1, 0xBA, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xDB, 0xEF, 0xDA, 0xB2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD1, 0xA5, 0xDC, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD8, 0xF6, 0x00, 0x00, 0xD1, 0xA4, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF0, 0xF7, 0x00, 0x00, 0xF0, 0xCA, /* 0xD4-0xD7 */ + 0xD0, 0xBE, 0x00, 0x00, 0xDD, 0xDC, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD6, /* 0xDC-0xDF */ + 0xD3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, /* 0xE4-0xE7 */ + 0xCD, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD4, 0xA1, 0xCE, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_69[512] = { + 0xE8, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xEB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE3, 0xD5, 0xF5, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE5, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE6, 0xCB, 0x00, 0x00, 0xF5, 0xF1, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC5, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA3, /* 0x50-0x53 */ + 0xE0, 0xDB, 0xF6, 0xEB, 0x00, 0x00, 0xCB, 0xF1, /* 0x54-0x57 */ + 0x00, 0x00, 0xD9, 0xEA, 0xF5, 0xA2, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD1, 0xF8, 0xEA, 0xF8, 0xEA, 0xF9, 0xDA, 0xB3, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEF, /* 0x68-0x6B */ + 0x00, 0x00, 0xE5, 0xF6, 0xEE, 0xBF, 0xE2, 0xE4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD0, 0xBF, 0x00, 0x00, 0xFA, 0xAC, /* 0x74-0x77 */ + 0xF5, 0xD1, 0xE7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCE, /* 0x98-0x9B */ + 0xDB, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xFC, 0xCE, 0x00, 0x00, 0xDD, 0xEE, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xD7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xB4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCD, 0xBE, 0x00, 0x00, 0xDA, 0xE9, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, /* 0xC8-0xCB */ + 0xF7, 0xD9, 0xF3, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xCE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCE, 0xAA, 0x00, 0x00, 0xCB, 0xC8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF0, 0xCB, 0x00, 0x00, 0xD0, 0xC7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xE0, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0xD7, 0xA7, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD2, 0xED, 0xE9, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD9, 0xBC, 0x00, 0x00, 0xE5, 0xC6, /* 0x20-0x23 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF5, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xDA, 0xD4, 0xE2, 0xA7, 0xFB, 0xFC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF1, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xCA, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE9, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF8, 0xE2, 0xE5, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD0, 0xB9, 0xD4, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA6, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF4, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD3, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xCC, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xE5, 0xD0, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFC, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xFC, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFE, 0xED, 0xEA, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE3, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xA2, 0xCF, 0xF6, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD0, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEA, 0xF1, 0xEE, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCB, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA1, /* 0xF8-0xFB 
*/ +}; + +static const unsigned char u2c_6B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD5, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xED, 0x00, 0x00, /* 0x08-0x0B */ + 0xED, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBC, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xFD, 0xE2, 0xF3, 0xAD, 0x00, 0x00, 0xFD, 0xDB, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xE3, 0xCE, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xE4, 0xFA, 0xCE, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCA, 0xB0, 0x00, 0x00, 0xF7, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCF, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xFC, 0xB6, 0xF2, 0xAD, 0xEF, 0xE1, /* 0x60-0x63 */ + 0xF3, 0xAE, 0xDC, 0xC6, 0xD9, 0xEB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE0, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF6, /* 0x74-0x77 */ + 0xCF, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDD, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD1, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEA, /* 0x80-0x83 */ + 0xF2, 0xCF, 0x00, 0x00, 0xF7, 0xBF, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE2, 0xE6, 0xE2, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF9, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xB1, 0xDE, 0xB2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xD3, 0xAB, 0x00, 0x00, 0xEB, 0xDC, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xAF, 0x00, 0x00, /* 0xB8-0xBB */ + 0xCA, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xFC, /* 0xBC-0xBF */ + 0x00, 0x00, 0xFD, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEB, 0xF6, 0xCF, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEC, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD9, 0xBD, 0x00, 0x00, 0xD8, 0xDF, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xB8, 0xEB, 0xBE, /* 0xD0-0xD3 */ + 
0xDD, 0xEF, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xF1, /* 0xD4-0xD7 */ + 0xDD, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBE, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC6, /* 0xE8-0xEB */ + 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_6C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xEE, 0xFD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAB, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDA, 0xC5, 0x00, 0x00, 0xD8, 0xEC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA8, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE2, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xBC, /* 0x34-0x37 */ + 0xE7, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF0, 0x00, 0x00, /* 0x3C-0x3F */ + 0xEF, 0xE2, 0xF1, 0xF0, 0xCF, 0xB4, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xDF, 0xA5, 0x00, 0x00, 0xF9, 0xD2, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFD, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0xA3, 0xFB, 0xF1, 0xCB, 0xB0, /* 0x5C-0x5F */ + 0xF2, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xCD, 0xE7, 0x00, 0x00, 0xE8, 0xDC, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF7, 0xC0, 0x00, 0x00, 0xD0, 0xE3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xBD, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xD1, 0xA9, 0xDD, 0xCC, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE3, 0xFE, 0xD1, 0xAA, 0xE8, 0xAA, /* 0x80-0x83 */ + 0x00, 0x00, 0xEA, 0xB6, 0xF9, 0xFA, 0xE6, 0xCC, /* 0x84-0x87 */ + 0xF6, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD4, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD9, 0xCB, 0x00, 0x00, 0xD9, 0xD2, 0xD3, 0xCB, /* 0x90-0x93 */ + 0xD8, 0xF7, 0xDA, 0xA9, 0xF5, 0xF8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDE, 0xDE, 0xF2, 0xAF, 0xF8, 0xA9, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC8, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 
0xDD, 0xF3, 0xEA, 0xFA, 0x00, 0x00, 0xF6, 0xBD, /* 0xB8-0xBB */ + 0xE1, 0xBB, 0xCD, 0xBF, 0xF4, 0xD4, 0xE6, 0xCD, /* 0xBC-0xBF */ + 0x00, 0x00, 0xFC, 0xCF, 0xFB, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE0, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF4, 0xBB, 0xDA, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDB, 0xF6, 0x00, 0x00, 0xDE, 0xDF, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF8, 0xDC, 0xF7, 0xEE, 0xEB, 0xE8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF1, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDA, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xC6, /* 0xEC-0xEF */ + 0xF7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC7, /* 0x08-0x0B */ + 0xD6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDC, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0xAA, 0x00, 0x00, 0xD5, 0xA6, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF2, 0xD0, 0x00, 0x00, 0xEA, 0xFB, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xDD, 0xFB, 0xF3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBD, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0xE7, 0xFD, 0xD7, 0x00, 0x00, /* 0x34-0x37 */ + 0xCE, 0xC8, 0xEA, 0xB7, 0x00, 0x00, 0xFC, 0xC0, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xE7, 0xF7, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD7, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xEF, 0xBA, 0xF1, 0xDD, 0x00, 0x00, /* 0x58-0x5B */ + 0xDE, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCB, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDD, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFB, 0xC7, 0xD5, 0xC8, 0x00, 0x00, /* 0x68-0x6B */ + 0xD7, 0xDF, 0x00, 0x00, 0xDD, 0xA9, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAD, /* 0x74-0x77 */ + 0xF6, 0xD9, 0xFA, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xEE, 0x00, 0x00, 0xCC, 0xDC, /* 0x84-0x87 */ + 0xE1, 0xBC, 0xE0, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE9, 0xBF, 0xFC, 0xFD, 0xE6, 0xCE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE6, 0xCF, /* 0x90-0x93 */ + 0x00, 0x00, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEE, /* 0xC0-0xC3 */ + 0xF6, 0xBE, 0xE0, 0xB2, 0xFC, 0xFE, 0xD1, 0xAB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFA, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC8, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD4, 0xA3, 0xF0, 0xF8, 0xD7, 0xA8, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEF, 0xE4, 0x00, 0x00, 0xD7, 0xC5, 0xEB, 0xE2, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE4, 0xA2, 0x00, 0x00, 0xE2, 0xE8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE6, 0xD0, 0x00, 0x00, 0xFB, 0xE8, /* 0xF4-0xF7 */ + 0xF4, 0xE8, 0xE5, 0xF4, 0xF4, 0xBC, 0xF4, 0xD5, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB6, /* 0x14-0x17 */ + 0x00, 0x00, 0xFC, 0xB9, 0xEE, 0xC2, 0xCA, 0xF5, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE5, /* 0x1C-0x1F */ + 0xCB, 0xE2, 0xD4, 0xA4, 0x00, 0x00, 0xDE, 0xE0, /* 0x20-0x23 */ + 0xDA, 0xFD, 0xE4, 0xC6, 0xE8, 0xBE, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0x28-0x2B */ + 0xF6, 0xB4, 0xEA, 0xD2, 0x00, 0x00, 0xF9, 0xFB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC2, 0x00, 0x00, /* 0x30-0x33 */ + 0xCA, 0xE4, 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, /* 0x34-0x37 */ + 0xEA, 0xFD, 0x00, 0x00, 0xD9, 0xDD, 0x00, 0x00, /* 0x38-0x3B */ + 0xDA, 0xB4, 0xEE, 0xAA, 0xFB, 0xE9, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCB, /* 0x40-0x43 */ + 0xDA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBE, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD3, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC9, 0x00, 0x00, /* 0x54-0x57 */ + 0xDF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC0, /* 0x58-0x5B */ + 0xE3, 0xD7, 0x00, 0x00, 0xEF, 0xE6, 0xFC, 0xD0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC0, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xDC, 0xF7, 0xB7, /* 0x6C-0x6F */ + 0x00, 
0x00, 0x00, 0x00, 0xEA, 0xB8, 0xD1, 0xF9, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC8, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDE, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xD7, 0xB6, 0xCF, 0xB5, 0x00, 0x00, 0xD9, 0xA8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xEE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDD, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA2, 0xE8, 0xAE, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xB5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF3, 0xE7, 0xD8, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xFC, 0xD1, 0x00, 0x00, 0xED, 0xB2, /* 0xC8-0xCB */ + 0xF4, 0xAF, 0x00, 0x00, 0xFB, 0xA3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFC, 0xC1, 0x00, 0x00, 0xEE, 0xAB, /* 0xD0-0xD3 */ + 0xD4, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFB, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE3, 0xD8, 0xBB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x00, 0x00, 0xE5, 0xDB, 0xF8, 0xF7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xD4, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA9, /* 0x0C-0x0F */ + 0x00, 0x00, 0xCB, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE6, 0xD1, 0xF0, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xAE, 0x00, 0x00, 0xF9, 0xD3, 0xD5, 0xFE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBC, /* 0x28-0x2B */ + 0xF2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0xAB, 0xF3, 0xE8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEF, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xEC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCC, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFC, /* 0x54-0x57 */ + 0xDA, 0xEB, 0x00, 0x00, 0xE2, 0xD8, 0xED, 0xD6, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD1, 0xE0, 0xB3, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD2, 0x00, 0x00, /* 0x60-0x63 */ + 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xD3, 0xC1, 0xF0, 0xCD, 0x00, 0x00, /* 0x6C-0x6F */ + 0xCF, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, 0x00, 0x00, /* 0x78-0x7B */ + 0xD4, 0xD8, 0xDC, 0xC9, 0xD7, 0xF1, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xDF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xF4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xF1, 0xBF, 0xF8, 0xB1, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE9, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xFB, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD5, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD4, /* 0xA0-0xA3 */ + 0xF7, 0xCA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE8, 0xF3, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEE, 0xFE, 0x00, 0x00, 0xE7, 0xFE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD3, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCC, 0xAD, 0xF6, 0xFA, 0xD6, 0xB2, 0xD2, 0xD8, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD8, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB9, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xAD, /* 0xDC-0xDF */ + 0xFB, 0xCC, 0xEB, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xFB, 0xCD, 0x00, 0x00, 0xD5, 0xBD, /* 0xE8-0xEB */ + 0xF1, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFB, /* 0xEC-0xEF */ + 0x00, 0x00, 0xDE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEB, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x00, 0x00, 0xE5, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFB, 0xA4, 0xD4, 0xB9, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xDE, 0xE1, 0x00, 0x00, 0xE4, 0xA3, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB7, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xDE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD6, 
0xD2, 0x00, 0x00, 0xF9, 0xD5, 0xE7, 0xBA, /* 0x18-0x1B */ + 0xEB, 0xD5, 0xD5, 0xF7, 0xEF, 0xE7, 0xE1, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE9, /* 0x24-0x27 */ + 0xD6, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCE, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xFB, 0xA5, 0xE1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF7, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xFB, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xFD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xFC, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCF, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xED, 0xC7, 0xEE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xCC, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFA, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA4, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xFD, 0xDC, 0xED, 0xB3, 0xCE, 0xC9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEF, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDB, /* 0xA8-0xAB */ + 0xCB, 0xE3, 0xF7, 0xA9, 0x00, 0x00, 0xFB, 0xA6, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xC0, /* 0xB4-0xB7 */ + 0xED, 0xC8, 0xEF, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD6, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA1, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xFB, 0xF4, 0xD5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF1, 0xF6, 0x00, 0x00, 0xE6, 0xD3, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF 
*/ + 0x00, 0x00, 0xCC, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xF8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDC, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xFD, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE5, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDB, 0xCC, 0xDD, 0xCD, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC8, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD9, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA5, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xC8, /* 0x44-0x47 */ + 0x00, 0x00, 0xD6, 0xA1, 0xFD, 0xBF, 0x00, 0x00, /* 0x48-0x4B */ + 0xFC, 0xD3, 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE6, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE9, 0xF2, 0x00, 0x00, 0xDF, 0xB0, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xD8, 0xE0, 0xFC, 0xBA, 0xFD, 0xAF, 0xF0, 0xCE, /* 0x64-0x67 */ + 0x00, 0x00, 0xDB, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE5, 0xC9, 0x00, 0x00, 0xED, 0xB4, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE3, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE9, 0xFB, 0xEA, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xA7, 0x00, 0x00, /* 0x90-0x93 */ + 0xE9, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xFD, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD9, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xFD, 0xF8, 0xFD, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE7, 0xA7, 0x00, 0x00, 0xE6, 0xD7, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD4, 0xF3, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFA, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD7, 0xF2, 0x00, 0x00, 0xE1, 0xC0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDB, 0xE2, 0xE6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF0, 0xCF, 0xF3, 0xBE, 0xE2, 0xAC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF5, 0xB7, 0xE0, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB8, /* 0xF8-0xFB */ + 0xE3, 0xE8, 0x00, 0x00, 0xD4, 0xA7, 0xE8, 0xFC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0xFA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xEF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD6, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD0, 0x00, 0x00, /* 0x28-0x2B */ + 0xF7, 0xF0, 0xEE, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEA, 0xBA, 0x00, 0x00, 0xEA, 0xD3, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xED, 0xC9, 0xDD, 0xAB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, 0xFD, 0xA1, /* 0x38-0x3B */ + 0x00, 0x00, 0xDF, 0xD0, 0xEC, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xED, 0xF8, 0xB8, /* 0x44-0x47 */ + 0xF7, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF8, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD4, 0xBA, 0xE4, 0xB3, 0x00, 0x00, 0xE9, 0xDA, /* 0x58-0x5B */ + 0x00, 0x00, 0xDE, 0xB6, 0x00, 0x00, 0xD9, 0xBF, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD9, 0xC0, 0xD6, 0xEF, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCC, /* 0x64-0x67 */ + 0x00, 0x00, 0xDA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE5, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xCC, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDF, 0xF9, 0xD7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xBB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFA, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xCC, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDF, 0xD2, 0x00, 0x00, 0xCE, 0xCA, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEE, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, 0x00, 0x00, /* 0xCC-0xCF */ + 0xFB, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB7, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE2, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD7, 0xE1, 0xFA, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD5, 0xC9, 0xF8, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xE9, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xED, /* 0x18-0x1B */ + 0xE3, 0xC4, 0xF0, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xFA, 0xEE, 0xC4, 0xD9, 0xDE, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xA2, 0xEB, 0xA3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC2, 0xEA, 0xBB, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE8, 0xAB, 0xDE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xED, 0xEF, 0x00, 0x00, 0xE8, 0xA3, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF1, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD4, 0xBC, 0x00, 0x00, 0xFC, 0xEA, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE7, 0xBE, 0x00, 0x00, 0xFC, 0xF2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE2, 0xAE, 0x00, 0x00, 0xD3, 0xB7, 0xFA, 0xCC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xFA, 0xDC, 0x00, 0x00, 0xED, 0xB5, 0xE1, 0xE3, /* 0x84-0x87 */ + 0x00, 0x00, 0xE8, 0xAC, 0x00, 0x00, 0xE8, 0xDD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE9, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xF4, 0xBD, 0x00, 0x00, 0xCF, 0xB8, 0xE9, 0xDB, /* 0x94-0x97 */ + 0xD1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xC7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xC9, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xBC, 0xD3, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xFA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDA, 0xD6, 0x00, 0x00, 0xCA, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDA, 0xC8, 0xDF, 0xA6, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF9, 0xB3, 0xF2, 0xD2, 0x00, 0x00, 0xCA, 0xC4, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xFD, 0xB0, 0xD5, 0xA8, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF1, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE9, /* 0xE0-0xE3 */ + 0xDC, 0xCA, 0xEC, 0xB4, 0xFA, 0xC0, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xFB, 0xA8, 0xD0, 0xA8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xDA, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEE, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xEF, 0xEA, 0xFA, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0x00, 0x00, 0xE0, 0xC4, 0x00, 0x00, 0xCF, 0xB9, /* 0x00-0x03 */ + 0x00, 0x00, 0xD5, 0xCA, 0xD7, 0xE2, 0xE2, 0xAF, /* 0x04-0x07 */ + 0x00, 0x00, 0xD7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xEF, 0xA2, 0xE2, 0xDA, 0xF6, 0xFC, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xFB, 0xD0, 0xD1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */ + 0xCD, 0xE4, 0x00, 0x00, 0xD1, 0xAE, 0xDC, 0xED, /* 0x28-0x2B */ + 0xE8, 0xCE, 0x00, 0x00, 0xF0, 0xF9, 0xCE, 0xB5, /* 0x2C-0x2F */ + 0xE6, 0xFC, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFB, /* 0x30-0x33 */ + 0xD0, 0xD6, 0xDD, 0xF5, 0xF7, 0xF1, 0x00, 0x00, /* 0x34-0x37 */ + 0xF6, 0xFD, 0x00, 0x00, 0xDB, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEA, /* 0x3C-0x3F */ + 0xE9, 
0xDC, 0xD9, 0xC1, 0x00, 0x00, 0xF5, 0xF2, /* 0x40-0x43 */ + 0xE0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF9, 0xC2, 0x00, 0x00, 0xEA, 0xBC, /* 0x54-0x57 */ + 0x00, 0x00, 0xD2, 0xC5, 0xFB, 0xD1, 0xE7, 0xC0, /* 0x58-0x5B */ + 0xEB, 0xA5, 0x00, 0x00, 0xDF, 0xFA, 0xE3, 0xA2, /* 0x5C-0x5F */ + 0xD7, 0xB9, 0x00, 0x00, 0xE9, 0xC3, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xFD, 0xE8, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF2, 0xD3, 0xFB, 0xA9, 0xD8, 0xA5, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC8, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xAF, 0xD7, 0xE3, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC6, /* 0x84-0x87 */ + 0x00, 0x00, 0xD6, 0xA2, 0x00, 0x00, 0xED, 0xF0, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD7, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xFC, 0xD4, 0x00, 0x00, 0xDA, 0xD7, 0xCC, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xF2, 0xD4, 0x00, 0x00, 0xD1, 0xB0, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCC, 0xE0, 0x00, 0x00, 0xDB, 0xFD, /* 0xA4-0xA7 */ + 0xF3, 0xBF, 0x00, 0x00, 0xF0, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xFC, 0xBB, 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE6, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xDF, 0xDE, 0x00, 0x00, 0xE0, 0xC7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xEF, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE1, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE7, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCE, 0xB6, 0x00, 0x00, 0xF3, 0xC0, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCD, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xFB, 0xD2, 0x00, 0x00, 0xF8, 0xF8, 0xF7, 0xFB, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB6, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_75[512] = { + 0x00, 0x00, 0xDC, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xCC, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF1, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xCA, 0xF6, 0x00, 0x00, 0xE4, 0xA4, 0xF4, 0xD6, /* 0x18-0x1B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE6, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0xDF, 0xE7, 0xE1, 0xC1, 0x00, 0x00, /* 0x24-0x27 */ + 0xE9, 0xC4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xCB, /* 0x28-0x2B */ + 0xE9, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEF, 0xA3, 0xEB, 0xA6, 0xCB, 0xA3, 0xE3, 0xE9, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, /* 0x34-0x37 */ + 0xEF, 0xA4, 0x00, 0x00, 0xEF, 0xEB, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB4, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEF, 0xA5, 0x00, 0x00, 0xD3, 0xCC, /* 0x50-0x53 */ + 0xDA, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD7, 0xBA, 0x00, 0x00, 0xF2, 0xD5, /* 0x58-0x5B */ + 0xF5, 0xE5, 0xD9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD5, 0xD4, 0xFD, 0xCF, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xE3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE1, /* 0x6C-0x6F */ + 0xEC, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xFB, 0xFE, 0xD3, 0xD7, 0x00, 0x00, /* 0x74-0x77 */ + 0xD1, 0xB1, 0x00, 0x00, 0xCB, 0xB1, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xB2, 0xF1, 0xC2, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE1, 0xF9, 0xB5, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEB, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xDF, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF8, 0xDE, 0xF9, 0xAA, 0xCA, 0xF7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xED, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD3, 0xB8, 0xF2, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD4, 0xD9, 0xEE, 0xC5, 0xF2, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDC, 0xBB, 0x00, 0x00, 0xF1, 0xF8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xEC, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF6, 0xC0, 0xFD, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD4, 0xE3, 0xCC, 0xE2, 0x00, 0x00, 0xF7, 0xD4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xD3, 0xC3, 0x00, 0x00, 0xD8, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 
*/ + 0xF6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCD, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xE5, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB0, /* 0x1C-0x1F */ + 0xF4, 0xB0, 0xF3, 0xEA, 0xDA, 0xEE, 0x00, 0x00, /* 0x20-0x23 */ + 0xD7, 0xBB, 0x00, 0x00, 0xE2, 0xB1, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAA, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFB, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE4, 0xDF, 0x00, 0x00, 0xCA, 0xD6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xA8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF6, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEF, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD4, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE8, 0xB9, 0x00, 0x00, 0xEF, 0xA6, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF4, /* 0x78-0x7B */ + 0xDB, 0xA1, 0xDB, 0xDC, 0xDB, 0xDD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xEE, 0xDC, 0x00, 0x00, 0xCB, 0xCB, 0xFC, 0xD5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xEB, 0x00, 0x00, /* 0x8C-0x8F */ + 0xCD, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAB, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD4, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA9, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDD, 0xDB, 0xCD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xCE, 0x00, 0x00, /* 0xC4-0xC7 */ + 
0xE7, 0xC3, 0x00, 0x00, 0xEC, 0xCC, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xEC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFC, /* 0xD8-0xDB */ + 0xD4, 0xA8, 0x00, 0x00, 0xED, 0xD3, 0xD8, 0xEF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF2, 0xD7, 0x00, 0x00, 0xCA, 0xF8, /* 0xE0-0xE3 */ + 0xDA, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD4, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCD, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xEE, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDA, 0xF0, 0x00, 0x00, 0xE2, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0xE0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF7, 0xAF, 0xDA, 0xB6, 0x00, 0x00, 0xCA, 0xD7, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD8, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xFA, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEF, /* 0x34-0x37 */ + 0xD9, 0xC2, 0x00, 0x00, 0xF0, 0xD2, 0x00, 0x00, /* 0x38-0x3B */ + 0xE4, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF3, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEC, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0xB2, 0x00, 0x00, 0xD4, 0xBD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCE, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xD4, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC2, 0xE7, 0xDA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD9, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD9, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 
0x00, 0x00, 0x00, 0x00, 0xD8, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE2, 0xEB, 0xD6, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCA, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xDA, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD7, /* 0xB8-0xBB */ + 0xCC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBA, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB8, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xC3, /* 0xD8-0xDB */ + 0xD0, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC5, 0xEB, 0xF8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF2, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xD3, 0xAD, 0xE8, 0xE1, 0xCE, 0xEC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE3, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF2, 0xB2, 0xF3, 0xF6, 0xF6, 0xDB, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDF, 0x00, 0x00, /* 0x30-0x33 */ + 0xF7, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD0, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDA, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF5, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0x68-0x6B */ + 0xCC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xDD, 
0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xED, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD6, 0xDE, 0xE4, 0xF4, 0xE1, 0xEF, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xDD, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCF, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE5, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xAC, 0xFC, 0xAD, /* 0xB8-0xBB */ + 0xD8, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xED, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF0, 0xF3, 0xAF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA5, 0x00, 0x00, /* 0xCC-0xCF */ + 0xDA, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xD8, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCC, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB4, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xCA, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF2, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0xF5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEC, 0xD5, 0xF8, /* 0x28-0x2B */ + 0xDA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDE, 0xE5, 0xD1, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB6, /* 0x44-0x47 */ + 0xD1, 0xB7, 0xF2, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE9, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 
0x00, 0x00, 0x00, 0xF0, 0xD3, 0xF2, 0xB4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, 0xCB, 0xE4, /* 0x58-0x5B */ + 0xFB, 0xD4, 0xF5, 0xE6, 0xE3, 0xEA, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDE, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xDF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB8, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDF, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xFC, 0xA1, 0xEF, 0xEE, 0xDC, 0xD8, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE9, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, 0xFD, 0xFB, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD4, 0xAA, 0x00, 0x00, 0xE5, 0xCC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD0, 0xD8, 0xFC, 0xA2, 0xD4, 0xBE, /* 0xBC-0xBF */ + 0xE2, 0xB3, 0xDE, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDC, 0xBC, 0xD2, 0xB6, 0xF5, 0xD5, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xCE, 0xA1, 0xF5, 0xA9, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDD, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDD, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF6, 0xDF, 0x00, 0x00, 0xF2, 0xDA, 0xE4, 0xEB, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xF2, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB9, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7A[512] = { + 0xFD, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCA, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEF, /* 0x08-0x0B */ + 0x00, 0x00, 0xF5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xEC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAD, /* 0x14-0x17 */ + 0x00, 0x00, 0xF2, 0xC2, 0xF6, 0xC3, 0x00, 0x00, /* 0x18-0x1B */ + 0xD7, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA2, /* 0x1C-0x1F */ + 0xF0, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 
0x00, 0x00, 0x00, 0xF0, 0xFA, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF6, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0xF2, 0xC3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAB, /* 0x38-0x3B */ + 0xCA, 0xB3, 0xCD, 0xA6, 0x00, 0x00, 0xCD, 0xC3, /* 0x3C-0x3F */ + 0xCD, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCF, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xDD, 0xE7, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xE2, 0xE7, 0xDB, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE8, 0xB1, 0x00, 0x00, 0xFC, 0xAE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE5, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFA, 0xEB, 0x00, 0x00, 0xCF, 0xBC, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xCF, 0xE2, 0xCD, 0xF6, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xEF, 0xF0, 0x00, 0x00, 0xF4, 0xBE, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF2, 0xF3, 0xEB, /* 0x90-0x93 */ + 0x00, 0x00, 0xF0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCF, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE8, 0xC0, 0xE8, 0xC1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xE3, 0xE9, 0xA2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAA, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF3, 0xC1, 0xD0, 0xAB, 0x00, 0x00, 0xD4, 0xE4, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xBC, 0xD8, 0xA1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDF, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF3, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDC, 0xBD, 0x00, 0x00, 0xCC, 0xE5, /* 0xDC-0xDF */ + 0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE2, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD4, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB5, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, 0xD3, 0xAE, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xF1, 0xD3, 0xF5, 0xE7, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEE, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDF, 0xE9, 0x00, 0x00, 0xEE, 0xDE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xC2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xAC, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xF0, 0xAF, 0xD6, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB6, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xD4, 0xF5, 0x00, 0x00, 0xD0, 0xC9, /* 0x48-0x4B */ + 0xEF, 0xA7, 0xE2, 0xEC, 0x00, 0x00, 0xDB, 0xEA, /* 0x4C-0x4F */ + 0xCE, 0xCC, 0xF5, 0xE8, 0xF7, 0xD5, 0x00, 0x00, /* 0x50-0x53 */ + 0xD3, 0xCD, 0x00, 0x00, 0xF3, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD0, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xFE, 0x00, 0x00, 0xDF, 0xFB, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xA8, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB4, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xDA, 0xD8, 0xD1, 0xB9, 0x00, 0x00, 0xDF, 0xA9, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB0, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCC, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEF, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xDF, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xBD, 0xFC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDB, 0xF4, 0x00, 0x00, 0xEF, 0xAA, 0xF8, 0xB9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD9, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE1, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDE, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_7C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEA, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xAF, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD8, 0xE1, 0xC7, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF4, 0xD8, 0xD6, 0xB3, 0xDD, 0xAD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF1, 0xC3, 0xEE, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xF4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xD7, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB7, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xFB, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xDD, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xA3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xDA, 0xD9, 0x00, 0x00, 0xF0, 0xD8, /* 0x94-0x97 */ + 0xEF, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF1, 0xD4, 0x00, 0x00, 0xED, 0xF2, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDB, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD5, 0xDC, 0xF3, 0xC4, 0xCB, 0xD7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF1, 0x00, 0x00, /* 0xBC-0xBF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD0, 0xF0, 0xD9, /* 0xDC-0xDF */ + 0xCB, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDD, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xD1, 0xBA, 0x00, 0x00, 0xF1, 0xC4, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xB3, 0xFB, 0xF5, 0xE9, 0xE1, 0xFD, 0xE0, /* 0x04-0x07 */ + 0xFC, 0xBC, 0x00, 0x00, 0xDA, 0xA2, 0xDA, 0xA3, /* 0x08-0x0B */ + 0x00, 0x00, 0xD2, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE9, /* 0x14-0x17 */ + 0xCE, 0xDC, 0xF2, 0xB5, 0xD0, 0xE4, 0xDD, 0xD1, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE1, 0xC8, 0xDB, 0xB7, 0xDF, 0xE3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB9, /* 0x28-0x2B */ + 0xF1, 0xC5, 0x00, 0x00, 0xF3, 0xCF, 0xD7, 0xAB, /* 0x2C-0x2F */ + 0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEB, /* 0x30-0x33 */ + 0x00, 0x00, 0xEE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE1, 0xC9, 0xCA, 0xFA, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFB, 0xFA, 0xE1, /* 0x40-0x43 */ + 0xF0, 0xDA, 0xCC, 0xE7, 0xDA, 0xF4, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xED, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD5, 0xA9, 0xFA, 0xE2, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE5, 0x00, 0x00, /* 0x64-0x67 */ + 0xEB, 0xD6, 0x00, 0x00, 0xEC, 0xDF, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFC, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF7, 0xD6, 0xDE, 0xEA, 0xCB, 0xB4, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xBE, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xCC, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF2, 0xE2, 0xB7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE8, /* 0x90-0x93 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD6, 0xE0, 0x00, 0x00, 0xF1, 0xC6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE2, 0xB8, 0xEB, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCB, 0xB5, 0xD8, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF4, 0xCE, 0xF3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD7, 0xC6, 0x00, 0x00, 0xD1, 0xBB, 0xF7, 0xAA, /* 0xB8-0xBB */ + 0x00, 0x00, 0xED, 0xCA, 0xD7, 0xD3, 0xD8, 0xFA, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCC, 0xDD, 0xFC, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFD, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xF9, 0xE5, 0x00, 0x00, 0xE0, 0xCA, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF2, 0xFD, 0xD3, 0xB0, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF4, 0xF3, 0xDA, 0xC9, 0x00, 0x00, 0xE6, 0xDE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF8, 0xBA, 0xE8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD8, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD5, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC6, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF2, 0xDB, 0xE4, 0xFC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0xF2, 0xDC, 0xFB, 0xD6, 0xE9, 0xB2, /* 0x1C-0x1F */ + 0x00, 0x00, 0xEE, 0xAD, 0x00, 0x00, 0xFA, 0xE3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEE, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xEA, 0xE6, 0xE0, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAC, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF5, 0xC5, 0xEE, 0xE0, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDB, 0xE5, 0x00, 0x00, 0xDD, 0xDE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF0, 0xE9, 0xA3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x50-0x53 */ + 0xF2, 0xC4, 0xE0, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xB1, 0xFC, 0xEB, 0xCD, 0xA8, /* 0x68-0x6B */ + 0x00, 
0x00, 0xCC, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF0, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCD, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC3, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE1, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x94-0x97 */ + 0xF3, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC0, /* 0x98-0x9B */ + 0xD5, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xAE, 0x00, 0x00, /* 0x34-0x37 */ + 0xF9, 0xFC, 0x00, 0x00, 0xCC, 0xC0, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCE, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD8, 0xD2, 0xF9, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAA, 0xCE, 0xD1, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDB, 0xEB, 0x00, 0x00, 0xDF, 0xFE, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, 0xF7, 0xF3, /* 0x74-0x77 */ + 0x00, 0x00, 0xD7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD1, 0xBC, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x88-0x8B */ + 0xCB, 0xB6, 0x00, 0x00, 0xDA, 0xB8, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xBE, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE0, 0xCC, 0xEB, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFD, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD7, 0xE8, 0xCB, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE8, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCD, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEC, 0xCE, 0x00, 0x00, 0xD6, 0xBF, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDF, 0xD6, 0xFD, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE1, /* 0xDC-0xDF */ + 0xF6, 0xA8, 0xDD, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xF8, 0xBB, 0x00, 0x00, 0xE8, 0xD1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF9, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xEC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xE9, 0xA5, 0xD6, 0xD5, 0x00, 0x00, 0xCD, 0xC5, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xBA, 0xD1, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xEC, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0xD9, 0xC4, /* 0x14-0x17 */ + 0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xBC, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAD, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF7, 0xB0, 0x00, 0x00, 0xCC, 0xEA, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC4, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC0, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFD, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA1, 0x00, 0x00, /* 0x54-0x57 */ + 0xDE, 0xBD, 0x00, 0x00, 0xF6, 0xA9, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA4, /* 0x6C-0x6F */ + 0xF5, 0xC6, 0x00, 0x00, 0xE1, 0xA2, 0xE9, 0xC6, /* 0x70-0x73 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xF2, 0xC5, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF4, 0xE9, 0xD6, 0xEC, 0xEB, 0xD3, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xEC, 0xBD, 0xE2, 0xDC, 0xDE, 0xEB, 0xF0, 0xDC, /* 0x84-0x87 */ + 0x00, 0x00, 0xEB, 0xBF, 0x00, 0x00, 0xD7, 0xCE, /* 0x88-0x8B */ + 0xD1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xAB, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFD, /* 0x98-0x9B */ + 0x00, 0x00, 0xCA, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCD, 0xC6, 0xF2, 0xB6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDD, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCC, 0xB7, 0xDB, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE9, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCE, 0xDD, 0xEB, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xFD, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCB, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, /* 0xC0-0xC3 */ + 0xF1, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDB, 0xCE, 0x00, 0x00, 0xF7, 0xC3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCF, 0xCB, 0xA4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE0, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xFB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xEB, 0xCA, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD4, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFD, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xD2, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFA, 0xF6, 0xF6, 0xAA, 0xFA, 0xF7, /* 0x04-0x07 */ + 0xD8, 0xE6, 0x00, 0x00, 0xF4, 0xB1, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE8, 0xD2, 0x00, 0x00, 0xCA, 0xC5, 0xCC, 0xEB, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xEE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE2, 0xBB, 0x00, 0x00, 0xF7, 0xAD, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE1, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xF3, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xE4, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDD, 0xAF, 0xDD, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCB, 0xB7, 0xE8, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xA3, 0xD2, 0xE0, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE9, 0xA6, 0xCB, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xED, 0xF3, 0xDC, 0xD9, 0xE0, 0xCD, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDA, /* 0x7C-0x7F */ + + 0xDB, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xCC, 0xAE, 0x00, 0x00, 0xDA, 0xDB, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB1, 0x00, 0x00, /* 0x98-0x9B */ + 0xD8, 0xAF, 0xE3, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF3, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF8, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF5, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xEC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD3, 0xC5, 0xFC, 0xEC, 0xD2, 0xDB, /* 0xBC-0xBF */ + 0xD4, 0xEB, 0x00, 0x00, 0xDE, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xED, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0xED, 0xF5, /* 0xE4-0xE7 */ + 0xD7, 0xFC, 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB8, /* 0xF0-0xF3 */ + 0xF6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE6, 0xF2, 0xDD, /* 0xF8-0xFB */ + 0xCF, 0xBF, 0x00, 0x00, 0xEB, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xCF, 0xC0, 0x00, 0x00, 0xE6, 0xA8, /* 0x04-0x07 */ + 0xFD, 0xE9, 0x00, 0x00, 0xCF, 0xC1, 0x00, 0x00, /* 0x08-0x0B */ + 0xE0, 0xDF, 0xDE, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBF, /* 0x18-0x1B */ + 0xE2, 0xEF, 
0x00, 0x00, 0xD9, 0xF1, 0xF1, 0xC7, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCB, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, 0xDB, 0xBA, /* 0x28-0x2B */ + 0xDA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0xEC, 0xDA, 0xDC, 0xFA, 0xE4, /* 0x34-0x37 */ + 0x00, 0x00, 0xE0, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDD, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, 0xEF, 0xF3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF3, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEB, 0xFA, 0x00, 0x00, 0xF9, 0xE6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDD, 0xD5, 0xDE, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCA, 0xDE, 0xDF, 0xE4, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFD, 0x00, 0x00, /* 0x74-0x77 */ + 0xF5, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, /* 0x88-0x8B */ + 0x00, 0x00, 0xED, 0xCB, 0xCF, 0xE4, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDD, 0xB3, 0xD4, 0xEC, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF2, 0xB9, 0x00, 0x00, 0xDF, 0xB7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xCE, 0xFB, 0xD8, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD0, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xDD, 0xD2, 0xF7, 0xF4, 0xE7, 0xDC, 0xE4, 0xA5, /* 0xAC-0xAF */ + 0x00, 0x00, 0xFC, 0xA3, 0x00, 0x00, 0xDB, 0xBB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBA, /* 0xB4-0xB7 */ + 0xE9, 0xFD, 0xD0, 0xCA, 0x00, 0x00, 0xF5, 0xD6, /* 0xB8-0xBB */ + 0xD9, 0xC5, 0xE4, 0xB4, 0x00, 0x00, 0xED, 0xA7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEA, 0xBD, 0xE6, 0xFE, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF7, 0xC4, 0xF5, 0xAD, 0x00, 0x00, 0xD9, 0xE0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE2, 0xCF, 0xC2, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEC, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE5, 0xB4, 0xCD, 0xC8, 0xEE, 0xC8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE7, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xCD, 0xC9, 0xF9, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0x00, 0x00, 0xF1, 0xE8, 0xD9, 0xF2, 0xDB, 0xF5, /* 0x00-0x03 */ + 0xCA, 0xB5, 0xD9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xD8, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAB, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xED, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD4, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDA, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xFC, 0xED, 0xEC, 0xE0, 0xD2, 0xFE, 0x00, 0x00, /* 0x34-0x37 */ + 0xE9, 0xC7, 0xE6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBB, /* 0x44-0x47 */ + 0x00, 0x00, 0xF5, 0xAE, 0xFB, 0xAA, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xFB, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEC, 0xBF, 0xFC, 0xD8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE5, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC3, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE2, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xD7, 0xE9, 0xED, 0xF6, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xED, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0x94-0x97 */ + 0xE3, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xDD, 0xB4, 0xE4, 0xB5, 0xD8, 0xB0, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCE, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 
0x00, 0x00, 0xD6, 0xE1, 0xCF, 0xD2, 0x00, 0x00, /* 0xC8-0xCB */ + 0xD0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xEE, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF3, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDC, 0xCC, 0x00, 0x00, 0xD0, 0xCB, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xA4, /* 0xEC-0xEF */ + 0xCD, 0xCA, 0xD7, 0xD4, 0xDE, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFE, /* 0x00-0x03 */ + 0xD4, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xD1, 0x00, 0x00, /* 0x08-0x0B */ + 0xD8, 0xF0, 0xF8, 0xC3, 0xEA, 0xD7, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD8, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFD, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xD5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCA, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF8, 0xE3, 0x00, 0x00, 0xD4, 0xDD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD8, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD9, /* 0x68-0x6B */ + 0xED, 0xF7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD0, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF1, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xD9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 
0xDF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xDB, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB6, /* 0xB8-0xBB */ + 0xF3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDA, /* 0xBC-0xBF */ + 0xE1, 0xE0, 0x00, 0x00, 0xD9, 0xAC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF5, 0xEB, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE9, 0xC8, 0x00, 0x00, 0xCB, 0xCF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xDC, 0xEF, 0x00, 0x00, 0xD6, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD6, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD9, 0xA1, 0x00, 0x00, 0xD8, 0xC0, /* 0x10-0x13 */ + 0xDC, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x14-0x17 */ + 0xDF, 0xB8, 0x00, 0x00, 0xEA, 0xA5, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAD, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF3, 0xF9, 0x00, 0x00, 0xED, 0xF8, /* 0x20-0x23 */ + 0x00, 0x00, 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE1, 0xCA, 0xEB, 0xE3, 0x00, 0x00, 0xF2, 0xDE, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF8, 0xCC, 0x00, 0x00, 0xEA, 0xD9, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD3, 0xC6, 0x00, 0x00, 0xDB, 0xE6, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF5, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF0, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFE, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xFB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xCF, 0xF2, 0xF7, 0xB9, 0xD9, 0xF3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE1, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xDA, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xFB, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCB, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFD, 0xBC, 0xDF, 0xB1, 0xE3, 0xEF, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0xAC-0xAF */ + 0xFD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCD, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD5, 0xC0, 0xE3, 0xF0, 0xED, 0xFA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD5, 0xED, 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xF6, 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDB, 0xE7, 0xE2, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF4, 0xF0, 0xDD, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xDE, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD6, 0xE1, 0xCC, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xEE, 0xDC, 0xA2, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD5, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA1, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCB, 0xF3, 0xF4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC8, /* 0x58-0x5B */ + 0xD6, 0xD7, 0x00, 0x00, 0xE9, 0xE5, 0xFB, 0xDC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD0, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xFB, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA5, 0x00, 0x00, /* 0x88-0x8B */ + 0xDB, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xF7, /* 0xA0-0xA3 */ + 0xF0, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xF6, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xEF, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xFC, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE8, 0xC3, 0x00, 0x00, 0xF1, 0xC8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF1, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF9, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF2, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB6, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xF5, 0xB9, 0x00, 0x00, 0xDC, 0xF0, 0xE3, 0xF1, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE8, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF2, 
0xBB, 0x00, 0x00, 0xDE, 0xA4, 0x00, 0x00, /* 0x18-0x1B */ + 0xDA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE9, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, 0xFC, 0xD9, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDA, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFB, 0xDD, 0x00, 0x00, 0xEF, 0xCA, 0x00, 0x00, /* 0x74-0x77 */ + 0xE8, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCC, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xEB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAD, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA2, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF6, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDA, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA8, /* 0xEC-0xEF 
*/ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xFA, 0xAF, 0x00, 0x00, 0xEB, 0xFC, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE3, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC5, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE3, 0xD5, 0xEE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xCD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD9, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC1, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xFA, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEB, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xFA, 0xBC, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE5, 0xE2, 0xFA, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB6, /* 0x54-0x57 */ + 0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0xEA, 0xDB, /* 0x58-0x5B */ + 0x00, 0x00, 0xF5, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xFB, 0xAC, 0xCF, 0xC3, 0xEB, 0xFD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB9, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE1, 0xF1, 0x00, 0x00, 0xD2, 0xA4, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDA, 0xD0, 0xDB, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEA, 0xBE, 0xD9, 0xB1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xCA, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE7, /* 0x88-0x8B */ + 0x00, 0x00, 0xF8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, 0xF2, 0xDF, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCD, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xEE, 0xAE, 0xD6, 0xAE, 0x00, 0x00, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE7, 0xE0, 0xEB, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCF, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDC, 0xCD, 0xED, 0xFB, 0x00, 0x00, 0xDE, 0xF0, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD7, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xD7, /* 0xF0-0xF3 */ + 0xDB, 0xD0, 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD5, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF0, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDC, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xCA, 0xE8, 0x00, 0x00, 0xF8, 0xE6, 0xDC, 0xCE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xEA, 0xDC, 0xDB, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDB, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, 0x00, 0x00, /* 0x34-0x37 */ + 0xD7, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE1, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCB, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xD5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xCA, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA9, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE9, 0xA9, 0x00, 0x00, 0xD3, 0xC7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0xF8, 0xAE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xB8, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAE, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCA, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCC, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD4, 0xAD, 0xF6, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xCC, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC6, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB0, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_8A[512] = { + 0xE5, 0xEB, 0x00, 0x00, 0xEF, 0xF4, 0xDD, 0xB5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCD, 0xAA, 0x00, 0x00, 0xE3, 0xF2, 0x00, 0x00, /* 0x08-0x0B */ + 0xFB, 0xF7, 0x00, 0x00, 0xF7, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBA, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xE1, 0xF6, 0xFE, /* 0x14-0x17 */ + 0xD1, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC5, /* 0x18-0x1B */ + 0x00, 0x00, 0xE4, 0xB8, 0x00, 0x00, 0xE1, 0xE8, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC1, /* 0x20-0x23 */ + 0x00, 0x00, 0xD2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xBE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE1, 0xCD, 0x00, 0x00, 0xCA, 0xB8, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE0, 0xF1, 0xC9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDE, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF0, 0xDF, 0xF8, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCC, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xF2, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE7, 0xC9, 0x00, 0x00, 0xE2, 0xF3, 0xE7, 0xE1, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xCF, 0xF8, 0xEF, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */ + 0xFD, 
0xFE, 0xFC, 0xA5, 0xFA, 0xB1, 0xDF, 0xD9, /* 0x70-0x73 */ + 0x00, 0x00, 0xE0, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xF1, 0xCA, 0x00, 0x00, 0xCE, 0xA3, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xF2, 0xBC, 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA5, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xEB, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE1, 0xA4, 0xCD, 0xAB, 0x00, 0x00, 0xD9, 0xF4, /* 0xA0-0xA3 */ + 0xE8, 0xA6, 0xCD, 0xCE, 0xE1, 0xE9, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFC, 0xEF, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE2, 0xC1, 0x00, 0x00, 0xCE, 0xA4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEB, 0xFE, 0x00, 0x00, 0xEB, 0xDD, 0xF0, 0xE0, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDB, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE2, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEB, /* 0xC8-0xCB */ + 0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0xF5, 0xD8, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE5, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB0, /* 0xD8-0xDB */ + 0xF4, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE3, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF4, 0xFA, 0xB2, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF5, 0xCA, 0xDF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEB, 0xB1, 0xED, 0xBF, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xFD, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA6, 0xF9, 0xA4, /* 0xF4-0xF7 */ + 0xF0, 0xB3, 0x00, 0x00, 0xE5, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xE7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xD9, 0xC7, 0xE4, 0xD7, 0xEA, 0xDD, 0x00, 0x00, /* 0x00-0x03 */ + 0xD4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBA, 0x00, 0x00, /* 0x0C-0x0F */ + 0xDA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF9, 0xCC, 0x00, 0x00, 0xE1, 0xDA, 0xDB, 0xBF, /* 0x14-0x17 */ + 0x00, 0x00, 0xCC, 0xC5, 0xEC, 0xD0, 0xCB, 0xBB, /* 0x18-0x1B */ + 0x00, 0x00, 0xDE, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, /* 0x28-0x2B */ + 0xD7, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC4, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xD0, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xFC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF1, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD2, 0xD1, 0xC1, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xDB, 0x00, 0x00, 0xD3, 0xC9, 0x00, 0x00, /* 0x58-0x5B */ + 0xDC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xDE, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xBB, /* 0x6C-0x6F */ + 0xEC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCC, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDE, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE7, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD4, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xC2, 0x00, 0x00, 0xF3, 0xD8, 0xE5, 0xD3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD9, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC6, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xDB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xFC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE7, 0x00, 0x00, /* 0x44-0x47 */ + 0xD1, 0xC2, 0x00, 0x00, 0xF9, 0xA5, 0x00, 0x00, /* 0x48-0x4B */ + 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xCA, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDF, 0xE7, 0xE3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF8, 0xFB, 0xE3, 0xCF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB0, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */ + 0xD9, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF8, 0xAF, 0xEF, 0xF6, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDD, 0xB6, 0xEE, 0xAF, 0xCD, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB8, /* 0xA4-0xA7 */ + 0xFC, 0xA7, 0xF7, 0xFC, 0xF7, 0xB1, 0xCE, 0xBB, /* 0xA8-0xAB */ + 0xF4, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCD, /* 0xAC-0xAF */ + 0xE1, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC3, /* 0xB0-0xB3 */ + 0xCF, 0xFE, 0x00, 0x00, 0xF8, 0xBF, 0xD8, 0xE2, /* 0xB4-0xB7 */ + 0xD3, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA8, /* 0xB8-0xBB */ + 0xF4, 0xE4, 0xEC, 0xC2, 0x00, 0x00, 0xD9, 0xF5, /* 0xBC-0xBF */ + 0xF9, 0xC5, 0xDD, 0xD3, 0xD6, 0xF1, 0xEC, 0xFC, /* 0xC0-0xC3 */ + 0xFC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, /* 0xC4-0xC7 */ + 0xCA, 0xB9, 0x00, 0x00, 0xEE, 0xE4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF2, 0xE1, 0x00, 0x00, 0xDE, 0xB9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF2, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDE, 0xF4, 0x00, 0x00, 0xDF, 0xDB, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDB, 0xD3, 0x00, 0x00, 0xFA, 0xE7, 0xD8, 0xE3, /* 0xE0-0xE3 */ + 0xF4, 0xC1, 0x00, 0x00, 0xDD, 0xB7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF5, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB8, /* 0xF8-0xFB */ + 0xCF, 0xC5, 0xDF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF2, 0xBE, 0xF6, 0xA1, 0x00, 0x00, 0xEB, 0xCB, /* 0x04-0x07 */ + 0xF1, 0xFC, 0x00, 0x00, 0xF3, 0xC7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDB, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEE, 0xE5, 0x00, 0x00, 0xDE, 0xF5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF1, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAF, /* 0x70-0x73 */ + 0xDD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC3, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xF5, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC6, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xF0, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF5, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xC5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA2, /* 0xC8-0xCB */ + 0xF2, 0xF6, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF5, /* 0xD8-0xDB */ + 0x00, 0x00, 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEE, 0xE6, 0x00, 0x00, 0xE0, 0xD3, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAF, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_8E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xCE, /* 0x0C-0x0F */ + 0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE6, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA1, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEB, 
0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF1, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB3, 0x00, 0x00, /* 0x40-0x43 */ + 0xF0, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF4, /* 0x44-0x47 */ + 0xD4, 0xB0, 0xF3, 0xB2, 0xFB, 0xB7, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF5, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE7, /* 0x5C-0x5F */ + 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xF5, 0xED, 0x00, 0x00, 0xCF, 0xF3, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xF0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCC, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF5, 0xE3, 0xF3, /* 0xA8-0xAB */ + 0xCF, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCF, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB3, 0xE4, 0xD8, /* 0xC8-0xCB */ + 0xCF, 0xF9, 0xCF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE2, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF2, /* 0x00-0x03 */ + 0x00, 
0x00, 0xD6, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xEE, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE5, 0xD8, 0xC2, /* 0x10-0x13 */ + 0xDC, 0xD0, 0xCC, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE0, /* 0x18-0x1B */ + 0xF6, 0xCA, 0xFD, 0xCA, 0xD8, 0xD6, 0xF4, 0xCF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA6, 0xDC, 0xBE, /* 0x24-0x27 */ + 0x00, 0x00, 0xDB, 0xD4, 0xD7, 0xC7, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFE, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCD, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE2, 0xC3, 0xDC, 0xDE, 0x00, 0x00, 0xDC, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAD, 0xE6, 0xAB, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF9, 0xDD, 0xEA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xF4, 0xD0, 0xCE, 0xF3, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, 0xCE, 0xDE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF9, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, /* 0x98-0x9B */ + 0xCD, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB8, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xFD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xDC, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xF6, 0x00, 0x00, 0xDC, 0xAA, /* 0xAC-0xAF */ + 0xF2, 0xE3, 0xE9, 0xB4, 0xD2, 0xDC, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDA, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBC, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE8, 0xDA, 0xDE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF2, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE2, 0xFB, 0x00, 0x00, 0xCC, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEE, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xF7, 0xDC, 0xE1, 0xEA, 0xCE, 0xC1, 0xD4, 0xB1, /* 0x00-0x03 */ + 0x00, 0x00, 0xFD, 0xB1, 0xE6, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0xFB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE7, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0xCE, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */ + 0xF5, 0xEF, 0xCF, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD4, 0xB2, 0xCC, 0xEF, 0x00, 0x00, 0xD4, 0xE8, /* 0x14-0x17 */ + 0x00, 0x00, 0xEE, 0xCF, 0xF7, 0xD7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0xA6, 0xD6, 0xC1, 0xE1, 0xDC, /* 0x1C-0x1F */ + 0xF0, 0xE3, 0xF1, 0xE4, 0xDC, 0xF1, 0xD6, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF5, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF1, 0xCE, 0xF2, 0xE4, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEC, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xF9, 0xBA, 0x00, 0x00, 0xEB, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD4, 0xED, 0xE2, 0xC4, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE7, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB4, 0xEA, 0xA1, /* 0x48-0x4B */ + 0x00, 0x00, 0xF8, 0xBC, 0xCE, 0xA6, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xC6, 0xFC, 0xDA, 0x00, 0x00, 0xD4, 0xB3, /* 0x50-0x53 */ + 0xD3, 0xB9, 0xEA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE9, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE1, 0xE1, 0xD3, 0xCF, 0xF4, 0xF6, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEA, 0xC0, 0xE1, 0xCF, 0x00, 0x00, 0xCC, 0xBA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF0, 0xE4, 0xF3, 0xB4, 0xD4, 0xEE, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC0, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF1, 0xE5, 0x00, 0x00, 0xF4, 0xC3, /* 0x74-0x77 */ + 0xE0, 0xD4, 0x00, 0x00, 0xEB, 0xB6, 0x00, 0x00, /* 0x78-0x7B */ + 0xD7, 0xA1, 0xCB, 0xE8, 0x00, 0x00, 0xF9, 0xAD, /* 0x7C-0x7F */ + + 0xE9, 0xAD, 0xD8, 0xE4, 0xFA, 0xB3, 0xE2, 0xC5, /* 0x80-0x83 */ + 0xFC, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC4, /* 0x84-0x87 */ + 0xD8, 0xB1, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA4, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xD8, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAE, 0xD1, 0xE1, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF5, 0xBE, 0x00, 0x00, 0xDE, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xFB, /* 0xAC-0xAF */ + 
0xF7, 0xC6, 0xCF, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEE, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE9, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCD, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDD, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE9, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xD4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xDB, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xFA, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDE, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF8, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB3, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xEB, 0xB7, 0xEF, 0xF8, 0xF5, 0xDC, /* 0x48-0x4B */ + 0xED, 0xCC, 0xDB, 0xD5, 0xF1, 0xCF, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD0, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB2, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD9, 0xAE, 0xD5, 0xAC, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xFD, 0xA3, 0x00, 0x00, 0xFB, 0xE5, /* 0x74-0x77 */ + 0xDF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF5, /* 0x84-0x87 */ + 0x00, 0x00, 0xF6, 0xAD, 0x00, 0x00, 0xF5, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0xF0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF5, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA2, /* 0xA8-0xAB */ + 0xED, 0xFD, 0x00, 0x00, 0xF5, 0xB4, 0xFB, 0xB8, /* 0xAC-0xAF */ + 0x00, 0x00, 0xDB, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xD6, 0xCA, 0xCB, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE5, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xEB, 0xB8, 0x00, 0x00, 0xE0, 0xB7, /* 0xC8-0xCB */ + 0xD7, 0xEC, 0xF1, 0xEC, 0xE5, 0xAF, 0xD5, 0xE1, /* 0xCC-0xCF */ + 0xD7, 0xED, 0xD1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF2, /* 0xD4-0xD7 */ + 0xEF, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDD, 0xBC, 0xF6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE5, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC4, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_92[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xD4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xCC, 0xA2, 0xF7, 0xFE, 0xDF, 0xBC, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD6, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAF, /* 0x3C-0x3F */ + 0xCB, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xFA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC6, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 
0x00, 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, /* 0x60-0x63 */ + 0xCF, 0xC9, 0xE2, 0xFC, 0xEF, 0xFA, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEB, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xC8, /* 0x80-0x83 */ + 0x00, 0x00, 0xD4, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0xD5, 0x00, 0x00, 0xEF, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC7, 0x00, 0x00, /* 0x94-0x97 */ + 0xD9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCA, 0xE1, 0xD1, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEF, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF9, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xCB, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCB, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF5, 0xDF, 0x00, 0x00, 0xEE, 0xB6, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, 0xD3, 0xCA, /* 0x1C-0x1F */ + 0xEF, 0xFC, 0xD1, 0xC4, 0xEF, 0xB1, 0x00, 0x00, /* 0x20-0x23 */ + 0xD1, 0xC5, 0x00, 0x00, 0xD0, 0xDE, 0x00, 0x00, /* 0x24-0x27 */ + 0xD9, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD1, 0xF3, 0xB9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE7, 0xCC, 0x00, 0x00, 0xD6, 0xA8, 0xCE, 0xA7, /* 0x48-0x4B */ + 0x00, 0x00, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB4, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB9, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xCB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF6, 0xDD, 0x00, 0x00, 0xF1, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xCC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE9, 0xCA, 0x00, 0x00, 0xE1, 0xF0, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE0, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xFB, 0xE0, 0xF2, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEC, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEE, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xCB, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD7, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_94[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xFC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE0, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF4, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xF7, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF1, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCA, 0xFC, 0xCA, 0xFD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCE, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF3, 0xC8, 0x00, 0x00, 0xF3, 0xBA, /* 0x7C-0x7F */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFE, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDA, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEC, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0xCB, 0xD2, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCE, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF9, 0xD8, 0xF9, 0xD9, 0xCA, 0xE0, /* 0x90-0x93 */ + 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCB, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC8, /* 0xA0-0xA3 */ + 0xF9, 0xEE, 0xDB, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xD0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xD5, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE6, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA2, /* 0xB8-0xBB */ + 0xE4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF9, 0xEF, 0xCF, 0xF4, 0xF7, 0xE6, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCE, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF4, 0xC5, 0xDC, 0xA3, 0x00, 0x00, /* 0xE0-0xE3 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xDD, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF4, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xA1, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD6, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC1, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB9, /* 0x3C-0x3F */ + 0xF6, 0xED, 0x00, 0x00, 0xF9, 0xAE, 0x00, 0x00, /* 0x40-0x43 */ + 0xDD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB0, /* 0x48-0x4B */ + 0xD8, 0xE8, 0xCB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCE, /* 0x58-0x5B */ + 0xF9, 0xF0, 0xE0, 0xED, 0xE3, 0xB3, 0xF4, 0xB3, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC2, 0xF2, 0xE6, /* 0x60-0x63 */ + 0xF0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD6, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEB, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE7, /* 0x70-0x73 */ + 0x00, 0x00, 0xD7, 0xD5, 0xD4, 0xB6, 0xF9, 0xE8, /* 0x74-0x77 */ + 0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE9, 0xEA, 0xD7, 0xCC, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE9, 0xE2, 0xC9, /* 0x88-0x8B */ + 0x00, 0x00, 0xFC, 0xDB, 0xCD, 0xAD, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCC, 0xB0, 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE4, 0xF6, 0xD0, 0xC0, 0x00, 0x00, 0xF0, 0xB7, /* 0x98-0x9B */ + 0xEE, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF6, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0xA4-0xA7 */ + 0xE2, 0xCB, 0x00, 0x00, 0xFA, 0xCF, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xCB, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xB4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0xCD, 0xE4, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xA9, 0xE4, 0xBA, 0xF3, 0xA2, 0xCD, 0xD2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF6, 0xCB, 0x00, 0x00, 0xF1, 0xE6, /* 0xC8-0xCB */ + 0xED, 0xC1, 0xE8, 0xBC, 0xEE, 0xD1, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF0, 0xE7, 0xE2, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0xAA, 0x00, 0x00, 0xF5, 0xE1, /* 0xD8-0xDB */ + 0xED, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEE, 0xD1, 0xF1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE9, 0xEB, 0xE9, 0xEC, 0xE0, 0xE4, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA7, /* 0xEC-0xEF */ + 0xDD, 0xD4, 0x00, 0x00, 0xEA, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC3, 0xD6, 0xF4, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDA, 0xDF, 0x00, 0x00, 0xEF, 0xB3, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_97[512] = { + 0xE2, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xFD, 0xF2, 0xE8, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xEF, 0xC5, 0x00, 0x00, 0xE7, 0xE7, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xDF, 0xDC, 0x00, 0x00, 0xF9, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF6, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xDF, 0xAC, 0x00, 0x00, 0xD6, 0xDA, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xDC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF0, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFA, 0x00, 0x00, /* 0x40-0x43 */ + 0xE4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD6, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xF4, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0xA1, 0x00, 0x00, 0xDE, 0xAA, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDA, 0xBC, 0xD8, 0xFC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFA, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xEC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFC, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE6, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xCB, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB9, /* 0x88-0x8B */ + 0x00, 0x00, 0xE4, 0xD3, 0x00, 0x00, 0xCD, 0xF9, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xCA, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD4, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF8, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD4, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA4, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xFB, 0xE1, 
0xFA, 0xED, 0xF0, 0xA2, 0xCC, 0xF1, /* 0x00-0x03 */ + 0x00, 0x00, 0xFA, 0xA3, 0xE2, 0xF7, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0xCE, 0x00, 0x00, 0xE9, 0xF5, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE7, 0xE8, 0xE8, 0xD7, 0xDA, 0xF8, 0xD4, 0xCB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF6, /* 0x14-0x17 */ + 0xD6, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD4, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xFA, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xCC, 0xF2, 0xF7, 0xDD, 0x00, 0x00, 0xDE, 0xBA, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF0, 0xB9, 0xE4, 0xFE, 0xE4, 0xC9, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE4, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xEA, 0xC3, 0x00, 0x00, 0xEF, 0xB4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xFB, 0xE2, 0x00, 0x00, 0xCD, 0xD3, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE9, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF7, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF8, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAB, /* 0xD8-0xDB */ + 0xDB, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDD, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE1, 0xE2, 0xD1, 0xC6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xD0, 0xEB, 0xE6, 0xDA, 0xF9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDE, 0xF8, 0xF8, 0xE9, 0xE3, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xFA, 0xC3, 0xE5, 0xD7, 0x00, 0x00, /* 0x08-0x0B */ + 0xEC, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF3, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE6, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xDC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xD0, 0xCF, 0x00, 0x00, 0xCF, 0xFA, /* 0x48-0x4B */ + 0xF3, 0xCA, 0xE0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD1, 0xC7, 0xE9, 0xAE, 0x00, 0x00, /* 0x50-0x53 */ + 0xE8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xFA, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDC, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFB, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 
0xD8, 0xA9, 0xE5, 0xDF, 0xF9, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF6, 0xEE, 0x00, 0x00, 0xF6, 0xCC, /* 0xB0-0xB3 */ + 0xE2, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xF1, 0xD2, 0xD2, 0xCC, 0xCF, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xCA, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDD, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF6, 0xEF, 0x00, 0x00, 0xDE, 0xF9, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xFA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD5, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xDE, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xDC, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC8, 0xD1, 0xC9, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF6, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE2, 0xE1, 0xD3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFE, /* 0x40-0x43 */ + 0x00, 0x00, 0xCF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xCE, 0xF6, 0x00, 0x00, 0xFA, 0xD0, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF3, 0xE6, 0xBE, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF0, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD1, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFC, 0xBE, 0xD5, 0xF1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xCD, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xFA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0xD0-0xD3 */ + 0xF4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCD, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA2, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xEA, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xD0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCE, 0xDA, 0xFB, 0xEB, 0xDB, 0xA6, /* 0x40-0x43 */ + 0xDB, 0xDE, 0xD8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE0, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE0, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xC6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD5, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF7, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD7, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xCD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCC, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_9C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xF5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0xCA, 0x00, 0x00, 0xDC, 0xE1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xF9, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xFC, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA7, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xBE, /* 0x44-0x47 */ + 0x00, 0x00, 0xDC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF7, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF0, 0xE8, 0x00, 0x00, 0xDD, 0xC0, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF3, /* 0xF0-0xF3 */ + 0xD9, 0xB0, 0x00, 0x00, 0xE6, 0xE9, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0x24-0x27 */ + 0xE4, 
0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCC, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE4, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xCD, 0xDC, 0xD9, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xDD, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xCE, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xCD, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFD, 0xD3, 0xEB, 0xED, 0xD6, 0xDC, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_9E[512] = { + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF9, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE7, 0xA4, 0x00, 0x00, 0xD6, 0xE3, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCB, 0xD6, 0xE4, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF2, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xDE, 0xFA, 0x00, 0x00, 0xD7, 0xF8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCF, 0xD5, 0xD8, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDC, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE0, 0xA8, 0xD5, 0xF3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFD, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xCC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 
*/ + 0xD9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xEA, /* 0xD8-0xDB */ + 0xF5, 0xF5, 0x00, 0x00, 0xEF, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xD3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA8, /* 0x04-0x07 */ + 0xDC, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA3, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAC, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBA, 0xEE, 0xB1, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB2, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD6, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xBB, 0x00, 0x00, /* 0x68-0x6B */ + 0xE5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xD7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDB, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xCA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xCF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_AC[512] = { + 0xB0, 0xA1, 0xB0, 0xA2, 0x81, 0x41, 0x81, 0x42, /* 0x00-0x03 */ + 0xB0, 0xA3, 0x81, 0x43, 0x81, 0x44, 0xB0, 0xA4, /* 0x04-0x07 */ + 0xB0, 0xA5, 0xB0, 0xA6, 0xB0, 0xA7, 0x81, 0x45, /* 0x08-0x0B */ + 0x81, 0x46, 0x81, 0x47, 0x81, 0x48, 0x81, 0x49, /* 0x0C-0x0F */ + 0xB0, 0xA8, 
0xB0, 0xA9, 0xB0, 0xAA, 0xB0, 0xAB, /* 0x10-0x13 */ + 0xB0, 0xAC, 0xB0, 0xAD, 0xB0, 0xAE, 0xB0, 0xAF, /* 0x14-0x17 */ + 0x81, 0x4A, 0xB0, 0xB0, 0xB0, 0xB1, 0xB0, 0xB2, /* 0x18-0x1B */ + 0xB0, 0xB3, 0xB0, 0xB4, 0x81, 0x4B, 0x81, 0x4C, /* 0x1C-0x1F */ + 0xB0, 0xB5, 0x81, 0x4D, 0x81, 0x4E, 0x81, 0x4F, /* 0x20-0x23 */ + 0xB0, 0xB6, 0x81, 0x50, 0x81, 0x51, 0x81, 0x52, /* 0x24-0x27 */ + 0x81, 0x53, 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, /* 0x28-0x2B */ + 0xB0, 0xB7, 0xB0, 0xB8, 0x81, 0x57, 0xB0, 0xB9, /* 0x2C-0x2F */ + 0xB0, 0xBA, 0xB0, 0xBB, 0x81, 0x58, 0x81, 0x59, /* 0x30-0x33 */ + 0x81, 0x5A, 0x81, 0x61, 0x81, 0x62, 0x81, 0x63, /* 0x34-0x37 */ + 0xB0, 0xBC, 0xB0, 0xBD, 0x81, 0x64, 0x81, 0x65, /* 0x38-0x3B */ + 0xB0, 0xBE, 0x81, 0x66, 0x81, 0x67, 0x81, 0x68, /* 0x3C-0x3F */ + 0xB0, 0xBF, 0x81, 0x69, 0x81, 0x6A, 0x81, 0x6B, /* 0x40-0x43 */ + 0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x44-0x47 */ + 0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0xB0, 0xC0, /* 0x48-0x4B */ + 0x81, 0x73, 0xB0, 0xC1, 0x81, 0x74, 0x81, 0x75, /* 0x4C-0x4F */ + 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, /* 0x50-0x53 */ + 0xB0, 0xC2, 0x81, 0x7A, 0x81, 0x81, 0x81, 0x82, /* 0x54-0x57 */ + 0xB0, 0xC3, 0x81, 0x83, 0x81, 0x84, 0x81, 0x85, /* 0x58-0x5B */ + 0xB0, 0xC4, 0x81, 0x86, 0x81, 0x87, 0x81, 0x88, /* 0x5C-0x5F */ + 0x81, 0x89, 0x81, 0x8A, 0x81, 0x8B, 0x81, 0x8C, /* 0x60-0x63 */ + 0x81, 0x8D, 0x81, 0x8E, 0x81, 0x8F, 0x81, 0x90, /* 0x64-0x67 */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0x81, 0x94, /* 0x68-0x6B */ + 0x81, 0x95, 0x81, 0x96, 0x81, 0x97, 0x81, 0x98, /* 0x6C-0x6F */ + 0xB0, 0xC5, 0xB0, 0xC6, 0x81, 0x99, 0x81, 0x9A, /* 0x70-0x73 */ + 0xB0, 0xC7, 0x81, 0x9B, 0x81, 0x9C, 0xB0, 0xC8, /* 0x74-0x77 */ + 0xB0, 0xC9, 0x81, 0x9D, 0xB0, 0xCA, 0x81, 0x9E, /* 0x78-0x7B */ + 0x81, 0x9F, 0x81, 0xA0, 0x81, 0xA1, 0x81, 0xA2, /* 0x7C-0x7F */ + + 0xB0, 0xCB, 0xB0, 0xCC, 0x81, 0xA3, 0xB0, 0xCD, /* 0x80-0x83 */ + 0xB0, 0xCE, 0xB0, 0xCF, 0xB0, 0xD0, 0x81, 0xA4, /* 0x84-0x87 */ + 0x81, 0xA5, 0xB0, 0xD1, 0xB0, 0xD2, 0xB0, 0xD3, /* 0x88-0x8B */ + 0xB0, 0xD4, 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, /* 0x8C-0x8F */ + 0xB0, 0xD5, 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xAB, /* 0x90-0x93 */ + 0xB0, 0xD6, 0x81, 0xAC, 0x81, 0xAD, 0x81, 0xAE, /* 0x94-0x97 */ + 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, 0x81, 0xB2, /* 0x98-0x9B */ + 0xB0, 0xD7, 0xB0, 0xD8, 0x81, 0xB3, 0xB0, 0xD9, /* 0x9C-0x9F */ + 0xB0, 0xDA, 0xB0, 0xDB, 0x81, 0xB4, 0x81, 0xB5, /* 0xA0-0xA3 */ + 0x81, 0xB6, 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, /* 0xA4-0xA7 */ + 0xB0, 0xDC, 0xB0, 0xDD, 0xB0, 0xDE, 0x81, 0xBA, /* 0xA8-0xAB */ + 0xB0, 0xDF, 0x81, 0xBB, 0x81, 0xBC, 0xB0, 0xE0, /* 0xAC-0xAF */ + 0xB0, 0xE1, 0x81, 0xBD, 0x81, 0xBE, 0x81, 0xBF, /* 0xB0-0xB3 */ + 0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0x81, 0xC3, /* 0xB4-0xB7 */ + 0xB0, 0xE2, 0xB0, 0xE3, 0x81, 0xC4, 0xB0, 0xE4, /* 0xB8-0xBB */ + 0xB0, 0xE5, 0xB0, 0xE6, 0x81, 0xC5, 0x81, 0xC6, /* 0xBC-0xBF */ + 0x81, 0xC7, 0xB0, 0xE7, 0x81, 0xC8, 0x81, 0xC9, /* 0xC0-0xC3 */ + 0xB0, 0xE8, 0x81, 0xCA, 0x81, 0xCB, 0x81, 0xCC, /* 0xC4-0xC7 */ + 0xB0, 0xE9, 0x81, 0xCD, 0x81, 0xCE, 0x81, 0xCF, /* 0xC8-0xCB */ + 0xB0, 0xEA, 0x81, 0xD0, 0x81, 0xD1, 0x81, 0xD2, /* 0xCC-0xCF */ + 0x81, 0xD3, 0x81, 0xD4, 0x81, 0xD5, 0x81, 0xD6, /* 0xD0-0xD3 */ + 0x81, 0xD7, 0xB0, 0xEB, 0x81, 0xD8, 0xB0, 0xEC, /* 0xD4-0xD7 */ + 0x81, 0xD9, 0x81, 0xDA, 0x81, 0xDB, 0x81, 0xDC, /* 0xD8-0xDB */ + 0x81, 0xDD, 0x81, 0xDE, 0x81, 0xDF, 0x81, 0xE0, /* 0xDC-0xDF */ + 0xB0, 0xED, 0xB0, 0xEE, 0x81, 0xE1, 0x81, 0xE2, /* 0xE0-0xE3 */ + 0xB0, 0xEF, 0x81, 0xE3, 0x81, 0xE4, 0xB0, 0xF0, /* 0xE4-0xE7 */ + 
0xB0, 0xF1, 0x81, 0xE5, 0xB0, 0xF2, 0x81, 0xE6, /* 0xE8-0xEB */ + 0xB0, 0xF3, 0x81, 0xE7, 0x81, 0xE8, 0xB0, 0xF4, /* 0xEC-0xEF */ + 0xB0, 0xF5, 0xB0, 0xF6, 0x81, 0xE9, 0xB0, 0xF7, /* 0xF0-0xF3 */ + 0x81, 0xEA, 0xB0, 0xF8, 0xB0, 0xF9, 0x81, 0xEB, /* 0xF4-0xF7 */ + 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, 0x81, 0xEF, /* 0xF8-0xFB */ + 0xB0, 0xFA, 0xB0, 0xFB, 0x81, 0xF0, 0x81, 0xF1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AD[512] = { + 0xB0, 0xFC, 0x81, 0xF2, 0x81, 0xF3, 0x81, 0xF4, /* 0x00-0x03 */ + 0xB0, 0xFD, 0x81, 0xF5, 0xB0, 0xFE, 0x81, 0xF6, /* 0x04-0x07 */ + 0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0x81, 0xFA, /* 0x08-0x0B */ + 0xB1, 0xA1, 0xB1, 0xA2, 0x81, 0xFB, 0xB1, 0xA3, /* 0x0C-0x0F */ + 0x81, 0xFC, 0xB1, 0xA4, 0x81, 0xFD, 0x81, 0xFE, /* 0x10-0x13 */ + 0x82, 0x41, 0x82, 0x42, 0x82, 0x43, 0x82, 0x44, /* 0x14-0x17 */ + 0xB1, 0xA5, 0x82, 0x45, 0x82, 0x46, 0x82, 0x47, /* 0x18-0x1B */ + 0xB1, 0xA6, 0x82, 0x48, 0x82, 0x49, 0x82, 0x4A, /* 0x1C-0x1F */ + 0xB1, 0xA7, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0x20-0x23 */ + 0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, /* 0x24-0x27 */ + 0x82, 0x52, 0xB1, 0xA8, 0x82, 0x53, 0x82, 0x54, /* 0x28-0x2B */ + 0xB1, 0xA9, 0xB1, 0xAA, 0x82, 0x55, 0x82, 0x56, /* 0x2C-0x2F */ + 0x82, 0x57, 0x82, 0x58, 0x82, 0x59, 0x82, 0x5A, /* 0x30-0x33 */ + 0xB1, 0xAB, 0xB1, 0xAC, 0x82, 0x61, 0x82, 0x62, /* 0x34-0x37 */ + 0xB1, 0xAD, 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, /* 0x38-0x3B */ + 0xB1, 0xAE, 0x82, 0x66, 0x82, 0x67, 0x82, 0x68, /* 0x3C-0x3F */ + 0x82, 0x69, 0x82, 0x6A, 0x82, 0x6B, 0x82, 0x6C, /* 0x40-0x43 */ + 0xB1, 0xAF, 0xB1, 0xB0, 0x82, 0x6D, 0xB1, 0xB1, /* 0x44-0x47 */ + 0x82, 0x6E, 0xB1, 0xB2, 0x82, 0x6F, 0x82, 0x70, /* 0x48-0x4B */ + 0x82, 0x71, 0x82, 0x72, 0x82, 0x73, 0x82, 0x74, /* 0x4C-0x4F */ + 0xB1, 0xB3, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x50-0x53 */ + 0xB1, 0xB4, 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, /* 0x54-0x57 */ + 0xB1, 0xB5, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x58-0x5B */ + 0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x5C-0x5F */ + 0x82, 0x88, 0xB1, 0xB6, 0x82, 0x89, 0xB1, 0xB7, /* 0x60-0x63 */ + 0x82, 0x8A, 0x82, 0x8B, 0x82, 0x8C, 0x82, 0x8D, /* 0x64-0x67 */ + 0x82, 0x8E, 0x82, 0x8F, 0x82, 0x90, 0x82, 0x91, /* 0x68-0x6B */ + 0xB1, 0xB8, 0xB1, 0xB9, 0x82, 0x92, 0x82, 0x93, /* 0x6C-0x6F */ + 0xB1, 0xBA, 0x82, 0x94, 0x82, 0x95, 0xB1, 0xBB, /* 0x70-0x73 */ + 0xB1, 0xBC, 0xB1, 0xBD, 0xB1, 0xBE, 0x82, 0x96, /* 0x74-0x77 */ + 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, 0xB1, 0xBF, /* 0x78-0x7B */ + 0xB1, 0xC0, 0xB1, 0xC1, 0x82, 0x9A, 0xB1, 0xC2, /* 0x7C-0x7F */ + + 0x82, 0x9B, 0xB1, 0xC3, 0xB1, 0xC4, 0x82, 0x9C, /* 0x80-0x83 */ + 0x82, 0x9D, 0x82, 0x9E, 0x82, 0x9F, 0x82, 0xA0, /* 0x84-0x87 */ + 0xB1, 0xC5, 0xB1, 0xC6, 0x82, 0xA1, 0x82, 0xA2, /* 0x88-0x8B */ + 0xB1, 0xC7, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x8C-0x8F */ + 0xB1, 0xC8, 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, /* 0x90-0x93 */ + 0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x94-0x97 */ + 0x82, 0xAD, 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, /* 0x98-0x9B */ + 0xB1, 0xC9, 0xB1, 0xCA, 0x82, 0xB1, 0x82, 0xB2, /* 0x9C-0x9F */ + 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, 0x82, 0xB6, /* 0xA0-0xA3 */ + 0xB1, 0xCB, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0xA4-0xA7 */ + 0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0xA8-0xAB */ + 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, 0x82, 0xC1, /* 0xAC-0xAF */ + 0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0xB0-0xB3 */ + 0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0xB1, 0xCC, /* 0xB4-0xB7 */ + 0x82, 0xC9, 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, /* 0xB8-0xBB */ + 
0x82, 0xCD, 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, /* 0xBC-0xBF */ + 0xB1, 0xCD, 0xB1, 0xCE, 0x82, 0xD1, 0x82, 0xD2, /* 0xC0-0xC3 */ + 0xB1, 0xCF, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0xC4-0xC7 */ + 0xB1, 0xD0, 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, /* 0xC8-0xCB */ + 0x82, 0xD9, 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, /* 0xCC-0xCF */ + 0xB1, 0xD1, 0xB1, 0xD2, 0x82, 0xDD, 0xB1, 0xD3, /* 0xD0-0xD3 */ + 0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0xD4-0xD7 */ + 0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0xD8-0xDB */ + 0xB1, 0xD4, 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, /* 0xDC-0xDF */ + 0xB1, 0xD5, 0x82, 0xE9, 0x82, 0xEA, 0x82, 0xEB, /* 0xE0-0xE3 */ + 0xB1, 0xD6, 0x82, 0xEC, 0x82, 0xED, 0x82, 0xEE, /* 0xE4-0xE7 */ + 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, 0x82, 0xF2, /* 0xE8-0xEB */ + 0x82, 0xF3, 0x82, 0xF4, 0x82, 0xF5, 0x82, 0xF6, /* 0xEC-0xEF */ + 0x82, 0xF7, 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, /* 0xF0-0xF3 */ + 0x82, 0xFB, 0x82, 0xFC, 0x82, 0xFD, 0x82, 0xFE, /* 0xF4-0xF7 */ + 0xB1, 0xD7, 0xB1, 0xD8, 0x83, 0x41, 0x83, 0x42, /* 0xF8-0xFB */ + 0xB1, 0xD9, 0x83, 0x43, 0x83, 0x44, 0xB1, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AE[512] = { + 0xB1, 0xDB, 0xB1, 0xDC, 0x83, 0x45, 0x83, 0x46, /* 0x00-0x03 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0x04-0x07 */ + 0xB1, 0xDD, 0xB1, 0xDE, 0x83, 0x4B, 0xB1, 0xDF, /* 0x08-0x0B */ + 0x83, 0x4C, 0xB1, 0xE0, 0x83, 0x4D, 0x83, 0x4E, /* 0x0C-0x0F */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0x10-0x13 */ + 0xB1, 0xE1, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0x14-0x17 */ + 0x83, 0x56, 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, /* 0x18-0x1B */ + 0x83, 0x5A, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0x1C-0x1F */ + 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, 0x83, 0x67, /* 0x20-0x23 */ + 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, 0x83, 0x6B, /* 0x24-0x27 */ + 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, 0x83, 0x6F, /* 0x28-0x2B */ + 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, /* 0x2C-0x2F */ + 0xB1, 0xE2, 0xB1, 0xE3, 0x83, 0x74, 0x83, 0x75, /* 0x30-0x33 */ + 0xB1, 0xE4, 0x83, 0x76, 0x83, 0x77, 0xB1, 0xE5, /* 0x34-0x37 */ + 0xB1, 0xE6, 0x83, 0x78, 0xB1, 0xE7, 0x83, 0x79, /* 0x38-0x3B */ + 0x83, 0x7A, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0x3C-0x3F */ + 0xB1, 0xE8, 0xB1, 0xE9, 0x83, 0x84, 0xB1, 0xEA, /* 0x40-0x43 */ + 0x83, 0x85, 0xB1, 0xEB, 0xB1, 0xEC, 0x83, 0x86, /* 0x44-0x47 */ + 0x83, 0x87, 0x83, 0x88, 0xB1, 0xED, 0x83, 0x89, /* 0x48-0x4B */ + 0xB1, 0xEE, 0xB1, 0xEF, 0xB1, 0xF0, 0x83, 0x8A, /* 0x4C-0x4F */ + 0xB1, 0xF1, 0x83, 0x8B, 0x83, 0x8C, 0x83, 0x8D, /* 0x50-0x53 */ + 0xB1, 0xF2, 0x83, 0x8E, 0xB1, 0xF3, 0x83, 0x8F, /* 0x54-0x57 */ + 0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0x58-0x5B */ + 0xB1, 0xF4, 0xB1, 0xF5, 0x83, 0x94, 0xB1, 0xF6, /* 0x5C-0x5F */ + 0xB1, 0xF7, 0xB1, 0xF8, 0x83, 0x95, 0x83, 0x96, /* 0x60-0x63 */ + 0x83, 0x97, 0xB1, 0xF9, 0x83, 0x98, 0x83, 0x99, /* 0x64-0x67 */ + 0xB1, 0xFA, 0xB1, 0xFB, 0x83, 0x9A, 0x83, 0x9B, /* 0x68-0x6B */ + 0xB1, 0xFC, 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, /* 0x6C-0x6F */ + 0xB1, 0xFD, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x70-0x73 */ + 0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x74-0x77 */ + 0xB1, 0xFE, 0xB2, 0xA1, 0x83, 0xA6, 0xB2, 0xA2, /* 0x78-0x7B */ + 0xB2, 0xA3, 0xB2, 0xA4, 0x83, 0xA7, 0x83, 0xA8, /* 0x7C-0x7F */ + + 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, /* 0x80-0x83 */ + 0xB2, 0xA5, 0xB2, 0xA6, 0x83, 0xAD, 0x83, 0xAE, /* 0x84-0x87 */ + 0x83, 0xAF, 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, /* 0x88-0x8B */ + 0xB2, 0xA7, 0x83, 0xB3, 0x83, 0xB4, 0x83, 0xB5, /* 0x8C-0x8F */ + 
0x83, 0xB6, 0x83, 0xB7, 0x83, 0xB8, 0x83, 0xB9, /* 0x90-0x93 */ + 0x83, 0xBA, 0x83, 0xBB, 0x83, 0xBC, 0x83, 0xBD, /* 0x94-0x97 */ + 0x83, 0xBE, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0x98-0x9B */ + 0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0x9C-0x9F */ + 0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xA0-0xA3 */ + 0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xA4-0xA7 */ + 0x83, 0xCE, 0x83, 0xCF, 0x83, 0xD0, 0x83, 0xD1, /* 0xA8-0xAB */ + 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, 0x83, 0xD5, /* 0xAC-0xAF */ + 0x83, 0xD6, 0x83, 0xD7, 0x83, 0xD8, 0x83, 0xD9, /* 0xB0-0xB3 */ + 0x83, 0xDA, 0x83, 0xDB, 0x83, 0xDC, 0x83, 0xDD, /* 0xB4-0xB7 */ + 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, 0x83, 0xE1, /* 0xB8-0xBB */ + 0xB2, 0xA8, 0xB2, 0xA9, 0xB2, 0xAA, 0x83, 0xE2, /* 0xBC-0xBF */ + 0xB2, 0xAB, 0x83, 0xE3, 0x83, 0xE4, 0x83, 0xE5, /* 0xC0-0xC3 */ + 0xB2, 0xAC, 0x83, 0xE6, 0x83, 0xE7, 0x83, 0xE8, /* 0xC4-0xC7 */ + 0x83, 0xE9, 0x83, 0xEA, 0x83, 0xEB, 0x83, 0xEC, /* 0xC8-0xCB */ + 0xB2, 0xAD, 0xB2, 0xAE, 0x83, 0xED, 0xB2, 0xAF, /* 0xCC-0xCF */ + 0xB2, 0xB0, 0xB2, 0xB1, 0x83, 0xEE, 0x83, 0xEF, /* 0xD0-0xD3 */ + 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, 0x83, 0xF3, /* 0xD4-0xD7 */ + 0xB2, 0xB2, 0xB2, 0xB3, 0x83, 0xF4, 0x83, 0xF5, /* 0xD8-0xDB */ + 0xB2, 0xB4, 0x83, 0xF6, 0x83, 0xF7, 0x83, 0xF8, /* 0xDC-0xDF */ + 0x83, 0xF9, 0x83, 0xFA, 0x83, 0xFB, 0x83, 0xFC, /* 0xE0-0xE3 */ + 0x83, 0xFD, 0x83, 0xFE, 0x84, 0x41, 0x84, 0x42, /* 0xE4-0xE7 */ + 0xB2, 0xB5, 0x84, 0x43, 0x84, 0x44, 0xB2, 0xB6, /* 0xE8-0xEB */ + 0x84, 0x45, 0xB2, 0xB7, 0x84, 0x46, 0x84, 0x47, /* 0xEC-0xEF */ + 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, /* 0xF0-0xF3 */ + 0xB2, 0xB8, 0x84, 0x4C, 0x84, 0x4D, 0x84, 0x4E, /* 0xF4-0xF7 */ + 0xB2, 0xB9, 0x84, 0x4F, 0x84, 0x50, 0x84, 0x51, /* 0xF8-0xFB */ + 0xB2, 0xBA, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AF[512] = { + 0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x00-0x03 */ + 0x84, 0x59, 0x84, 0x5A, 0x84, 0x61, 0xB2, 0xBB, /* 0x04-0x07 */ + 0xB2, 0xBC, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x08-0x0B */ + 0x84, 0x65, 0xB2, 0xBD, 0x84, 0x66, 0x84, 0x67, /* 0x0C-0x0F */ + 0xB2, 0xBE, 0x84, 0x68, 0x84, 0x69, 0x84, 0x6A, /* 0x10-0x13 */ + 0x84, 0x6B, 0x84, 0x6C, 0x84, 0x6D, 0x84, 0x6E, /* 0x14-0x17 */ + 0x84, 0x6F, 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, /* 0x18-0x1B */ + 0x84, 0x73, 0x84, 0x74, 0x84, 0x75, 0x84, 0x76, /* 0x1C-0x1F */ + 0x84, 0x77, 0x84, 0x78, 0x84, 0x79, 0x84, 0x7A, /* 0x20-0x23 */ + 0x84, 0x81, 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, /* 0x24-0x27 */ + 0x84, 0x85, 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, /* 0x28-0x2B */ + 0xB2, 0xBF, 0xB2, 0xC0, 0x84, 0x89, 0x84, 0x8A, /* 0x2C-0x2F */ + 0xB2, 0xC1, 0x84, 0x8B, 0xB2, 0xC2, 0x84, 0x8C, /* 0x30-0x33 */ + 0xB2, 0xC3, 0x84, 0x8D, 0x84, 0x8E, 0x84, 0x8F, /* 0x34-0x37 */ + 0x84, 0x90, 0x84, 0x91, 0x84, 0x92, 0x84, 0x93, /* 0x38-0x3B */ + 0xB2, 0xC4, 0xB2, 0xC5, 0x84, 0x94, 0xB2, 0xC6, /* 0x3C-0x3F */ + 0x84, 0x95, 0xB2, 0xC7, 0xB2, 0xC8, 0xB2, 0xC9, /* 0x40-0x43 */ + 0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x44-0x47 */ + 0xB2, 0xCA, 0xB2, 0xCB, 0x84, 0x9A, 0x84, 0x9B, /* 0x48-0x4B */ + 0x84, 0x9C, 0x84, 0x9D, 0x84, 0x9E, 0x84, 0x9F, /* 0x4C-0x4F */ + 0xB2, 0xCC, 0x84, 0xA0, 0x84, 0xA1, 0x84, 0xA2, /* 0x50-0x53 */ + 0x84, 0xA3, 0x84, 0xA4, 0x84, 0xA5, 0x84, 0xA6, /* 0x54-0x57 */ + 0x84, 0xA7, 0x84, 0xA8, 0x84, 0xA9, 0x84, 0xAA, /* 0x58-0x5B */ + 0xB2, 0xCD, 0xB2, 0xCE, 0x84, 0xAB, 0x84, 0xAC, /* 0x5C-0x5F */ + 0x84, 0xAD, 0x84, 0xAE, 0x84, 0xAF, 0x84, 0xB0, /* 0x60-0x63 */ + 0xB2, 
0xCF, 0xB2, 0xD0, 0x84, 0xB1, 0x84, 0xB2, /* 0x64-0x67 */ + 0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0x68-0x6B */ + 0x84, 0xB7, 0x84, 0xB8, 0x84, 0xB9, 0x84, 0xBA, /* 0x6C-0x6F */ + 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, 0x84, 0xBE, /* 0x70-0x73 */ + 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, 0x84, 0xC2, /* 0x74-0x77 */ + 0x84, 0xC3, 0xB2, 0xD1, 0x84, 0xC4, 0x84, 0xC5, /* 0x78-0x7B */ + 0x84, 0xC6, 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, /* 0x7C-0x7F */ + + 0xB2, 0xD2, 0x84, 0xCA, 0x84, 0xCB, 0x84, 0xCC, /* 0x80-0x83 */ + 0xB2, 0xD3, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0x84-0x87 */ + 0xB2, 0xD4, 0x84, 0xD0, 0x84, 0xD1, 0x84, 0xD2, /* 0x88-0x8B */ + 0x84, 0xD3, 0x84, 0xD4, 0x84, 0xD5, 0x84, 0xD6, /* 0x8C-0x8F */ + 0xB2, 0xD5, 0xB2, 0xD6, 0x84, 0xD7, 0x84, 0xD8, /* 0x90-0x93 */ + 0x84, 0xD9, 0xB2, 0xD7, 0x84, 0xDA, 0x84, 0xDB, /* 0x94-0x97 */ + 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, 0x84, 0xDF, /* 0x98-0x9B */ + 0xB2, 0xD8, 0x84, 0xE0, 0x84, 0xE1, 0x84, 0xE2, /* 0x9C-0x9F */ + 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, 0x84, 0xE6, /* 0xA0-0xA3 */ + 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, 0x84, 0xEA, /* 0xA4-0xA7 */ + 0x84, 0xEB, 0x84, 0xEC, 0x84, 0xED, 0x84, 0xEE, /* 0xA8-0xAB */ + 0x84, 0xEF, 0x84, 0xF0, 0x84, 0xF1, 0x84, 0xF2, /* 0xAC-0xAF */ + 0x84, 0xF3, 0x84, 0xF4, 0x84, 0xF5, 0x84, 0xF6, /* 0xB0-0xB3 */ + 0x84, 0xF7, 0x84, 0xF8, 0x84, 0xF9, 0x84, 0xFA, /* 0xB4-0xB7 */ + 0xB2, 0xD9, 0xB2, 0xDA, 0x84, 0xFB, 0x84, 0xFC, /* 0xB8-0xBB */ + 0xB2, 0xDB, 0x84, 0xFD, 0x84, 0xFE, 0x85, 0x41, /* 0xBC-0xBF */ + 0xB2, 0xDC, 0x85, 0x42, 0x85, 0x43, 0x85, 0x44, /* 0xC0-0xC3 */ + 0x85, 0x45, 0x85, 0x46, 0x85, 0x47, 0xB2, 0xDD, /* 0xC4-0xC7 */ + 0xB2, 0xDE, 0xB2, 0xDF, 0x85, 0x48, 0xB2, 0xE0, /* 0xC8-0xCB */ + 0x85, 0x49, 0xB2, 0xE1, 0xB2, 0xE2, 0x85, 0x4A, /* 0xCC-0xCF */ + 0x85, 0x4B, 0x85, 0x4C, 0x85, 0x4D, 0x85, 0x4E, /* 0xD0-0xD3 */ + 0xB2, 0xE3, 0x85, 0x4F, 0x85, 0x50, 0x85, 0x51, /* 0xD4-0xD7 */ + 0x85, 0x52, 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, /* 0xD8-0xDB */ + 0xB2, 0xE4, 0x85, 0x56, 0x85, 0x57, 0x85, 0x58, /* 0xDC-0xDF */ + 0x85, 0x59, 0x85, 0x5A, 0x85, 0x61, 0x85, 0x62, /* 0xE0-0xE3 */ + 0x85, 0x63, 0x85, 0x64, 0x85, 0x65, 0x85, 0x66, /* 0xE4-0xE7 */ + 0xB2, 0xE5, 0xB2, 0xE6, 0x85, 0x67, 0x85, 0x68, /* 0xE8-0xEB */ + 0x85, 0x69, 0x85, 0x6A, 0x85, 0x6B, 0x85, 0x6C, /* 0xEC-0xEF */ + 0xB2, 0xE7, 0xB2, 0xE8, 0x85, 0x6D, 0x85, 0x6E, /* 0xF0-0xF3 */ + 0xB2, 0xE9, 0x85, 0x6F, 0x85, 0x70, 0x85, 0x71, /* 0xF4-0xF7 */ + 0xB2, 0xEA, 0x85, 0x72, 0x85, 0x73, 0x85, 0x74, /* 0xF8-0xFB */ + 0x85, 0x75, 0x85, 0x76, 0x85, 0x77, 0x85, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B0[512] = { + 0xB2, 0xEB, 0xB2, 0xEC, 0x85, 0x79, 0x85, 0x7A, /* 0x00-0x03 */ + 0xB2, 0xED, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x04-0x07 */ + 0x85, 0x84, 0x85, 0x85, 0x85, 0x86, 0x85, 0x87, /* 0x08-0x0B */ + 0xB2, 0xEE, 0x85, 0x88, 0x85, 0x89, 0x85, 0x8A, /* 0x0C-0x0F */ + 0xB2, 0xEF, 0x85, 0x8B, 0x85, 0x8C, 0x85, 0x8D, /* 0x10-0x13 */ + 0xB2, 0xF0, 0x85, 0x8E, 0x85, 0x8F, 0x85, 0x90, /* 0x14-0x17 */ + 0x85, 0x91, 0x85, 0x92, 0x85, 0x93, 0x85, 0x94, /* 0x18-0x1B */ + 0xB2, 0xF1, 0xB2, 0xF2, 0x85, 0x95, 0x85, 0x96, /* 0x1C-0x1F */ + 0x85, 0x97, 0x85, 0x98, 0x85, 0x99, 0x85, 0x9A, /* 0x20-0x23 */ + 0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0x85, 0x9E, /* 0x24-0x27 */ + 0xB2, 0xF3, 0x85, 0x9F, 0x85, 0xA0, 0x85, 0xA1, /* 0x28-0x2B */ + 0x85, 0xA2, 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, /* 0x2C-0x2F */ + 0x85, 0xA6, 0x85, 0xA7, 0x85, 0xA8, 0x85, 0xA9, /* 0x30-0x33 */ + 0x85, 0xAA, 0x85, 0xAB, 0x85, 0xAC, 0x85, 0xAD, /* 0x34-0x37 */ + 0x85, 
0xAE, 0x85, 0xAF, 0x85, 0xB0, 0x85, 0xB1, /* 0x38-0x3B */ + 0x85, 0xB2, 0x85, 0xB3, 0x85, 0xB4, 0x85, 0xB5, /* 0x3C-0x3F */ + 0x85, 0xB6, 0x85, 0xB7, 0x85, 0xB8, 0x85, 0xB9, /* 0x40-0x43 */ + 0xB2, 0xF4, 0xB2, 0xF5, 0x85, 0xBA, 0x85, 0xBB, /* 0x44-0x47 */ + 0xB2, 0xF6, 0x85, 0xBC, 0xB2, 0xF7, 0x85, 0xBD, /* 0x48-0x4B */ + 0xB2, 0xF8, 0x85, 0xBE, 0xB2, 0xF9, 0x85, 0xBF, /* 0x4C-0x4F */ + 0x85, 0xC0, 0x85, 0xC1, 0x85, 0xC2, 0xB2, 0xFA, /* 0x50-0x53 */ + 0xB2, 0xFB, 0xB2, 0xFC, 0x85, 0xC3, 0xB2, 0xFD, /* 0x54-0x57 */ + 0x85, 0xC4, 0xB2, 0xFE, 0x85, 0xC5, 0x85, 0xC6, /* 0x58-0x5B */ + 0x85, 0xC7, 0xB3, 0xA1, 0x85, 0xC8, 0x85, 0xC9, /* 0x5C-0x5F */ + 0x85, 0xCA, 0x85, 0xCB, 0x85, 0xCC, 0x85, 0xCD, /* 0x60-0x63 */ + 0x85, 0xCE, 0x85, 0xCF, 0x85, 0xD0, 0x85, 0xD1, /* 0x64-0x67 */ + 0x85, 0xD2, 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, /* 0x68-0x6B */ + 0x85, 0xD6, 0x85, 0xD7, 0x85, 0xD8, 0x85, 0xD9, /* 0x6C-0x6F */ + 0x85, 0xDA, 0x85, 0xDB, 0x85, 0xDC, 0x85, 0xDD, /* 0x70-0x73 */ + 0x85, 0xDE, 0x85, 0xDF, 0x85, 0xE0, 0x85, 0xE1, /* 0x74-0x77 */ + 0x85, 0xE2, 0x85, 0xE3, 0x85, 0xE4, 0x85, 0xE5, /* 0x78-0x7B */ + 0xB3, 0xA2, 0xB3, 0xA3, 0x85, 0xE6, 0x85, 0xE7, /* 0x7C-0x7F */ + + 0xB3, 0xA4, 0x85, 0xE8, 0x85, 0xE9, 0x85, 0xEA, /* 0x80-0x83 */ + 0xB3, 0xA5, 0x85, 0xEB, 0x85, 0xEC, 0x85, 0xED, /* 0x84-0x87 */ + 0x85, 0xEE, 0x85, 0xEF, 0x85, 0xF0, 0x85, 0xF1, /* 0x88-0x8B */ + 0xB3, 0xA6, 0xB3, 0xA7, 0x85, 0xF2, 0xB3, 0xA8, /* 0x8C-0x8F */ + 0x85, 0xF3, 0xB3, 0xA9, 0x85, 0xF4, 0x85, 0xF5, /* 0x90-0x93 */ + 0x85, 0xF6, 0x85, 0xF7, 0x85, 0xF8, 0x85, 0xF9, /* 0x94-0x97 */ + 0xB3, 0xAA, 0xB3, 0xAB, 0xB3, 0xAC, 0x85, 0xFA, /* 0x98-0x9B */ + 0xB3, 0xAD, 0x85, 0xFB, 0x85, 0xFC, 0xB3, 0xAE, /* 0x9C-0x9F */ + 0xB3, 0xAF, 0xB3, 0xB0, 0xB3, 0xB1, 0x85, 0xFD, /* 0xA0-0xA3 */ + 0x85, 0xFE, 0x86, 0x41, 0x86, 0x42, 0x86, 0x43, /* 0xA4-0xA7 */ + 0xB3, 0xB2, 0xB3, 0xB3, 0x86, 0x44, 0xB3, 0xB4, /* 0xA8-0xAB */ + 0xB3, 0xB5, 0xB3, 0xB6, 0xB3, 0xB7, 0xB3, 0xB8, /* 0xAC-0xAF */ + 0x86, 0x45, 0xB3, 0xB9, 0x86, 0x46, 0xB3, 0xBA, /* 0xB0-0xB3 */ + 0xB3, 0xBB, 0xB3, 0xBC, 0x86, 0x47, 0x86, 0x48, /* 0xB4-0xB7 */ + 0xB3, 0xBD, 0x86, 0x49, 0x86, 0x4A, 0x86, 0x4B, /* 0xB8-0xBB */ + 0xB3, 0xBE, 0x86, 0x4C, 0x86, 0x4D, 0x86, 0x4E, /* 0xBC-0xBF */ + 0x86, 0x4F, 0x86, 0x50, 0x86, 0x51, 0x86, 0x52, /* 0xC0-0xC3 */ + 0xB3, 0xBF, 0xB3, 0xC0, 0x86, 0x53, 0xB3, 0xC1, /* 0xC4-0xC7 */ + 0xB3, 0xC2, 0xB3, 0xC3, 0x86, 0x54, 0x86, 0x55, /* 0xC8-0xCB */ + 0x86, 0x56, 0x86, 0x57, 0x86, 0x58, 0x86, 0x59, /* 0xCC-0xCF */ + 0xB3, 0xC4, 0xB3, 0xC5, 0x86, 0x5A, 0x86, 0x61, /* 0xD0-0xD3 */ + 0xB3, 0xC6, 0x86, 0x62, 0x86, 0x63, 0x86, 0x64, /* 0xD4-0xD7 */ + 0xB3, 0xC7, 0x86, 0x65, 0x86, 0x66, 0x86, 0x67, /* 0xD8-0xDB */ + 0x86, 0x68, 0x86, 0x69, 0x86, 0x6A, 0x86, 0x6B, /* 0xDC-0xDF */ + 0xB3, 0xC8, 0x86, 0x6C, 0x86, 0x6D, 0x86, 0x6E, /* 0xE0-0xE3 */ + 0x86, 0x6F, 0xB3, 0xC9, 0x86, 0x70, 0x86, 0x71, /* 0xE4-0xE7 */ + 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, 0x86, 0x75, /* 0xE8-0xEB */ + 0x86, 0x76, 0x86, 0x77, 0x86, 0x78, 0x86, 0x79, /* 0xEC-0xEF */ + 0x86, 0x7A, 0x86, 0x81, 0x86, 0x82, 0x86, 0x83, /* 0xF0-0xF3 */ + 0x86, 0x84, 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, /* 0xF4-0xF7 */ + 0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0xF8-0xFB */ + 0x86, 0x8C, 0x86, 0x8D, 0x86, 0x8E, 0x86, 0x8F, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B1[512] = { + 0x86, 0x90, 0x86, 0x91, 0x86, 0x92, 0x86, 0x93, /* 0x00-0x03 */ + 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, 0x86, 0x97, /* 0x04-0x07 */ + 0xB3, 0xCA, 0xB3, 0xCB, 0x86, 0x98, 0xB3, 0xCC, /* 0x08-0x0B */ + 0xB3, 
0xCD, 0x86, 0x99, 0x86, 0x9A, 0x86, 0x9B, /* 0x0C-0x0F */ + 0xB3, 0xCE, 0x86, 0x9C, 0xB3, 0xCF, 0xB3, 0xD0, /* 0x10-0x13 */ + 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, 0x86, 0xA0, /* 0x14-0x17 */ + 0xB3, 0xD1, 0xB3, 0xD2, 0x86, 0xA1, 0xB3, 0xD3, /* 0x18-0x1B */ + 0xB3, 0xD4, 0xB3, 0xD5, 0x86, 0xA2, 0x86, 0xA3, /* 0x1C-0x1F */ + 0x86, 0xA4, 0x86, 0xA5, 0x86, 0xA6, 0xB3, 0xD6, /* 0x20-0x23 */ + 0xB3, 0xD7, 0xB3, 0xD8, 0x86, 0xA7, 0x86, 0xA8, /* 0x24-0x27 */ + 0xB3, 0xD9, 0x86, 0xA9, 0x86, 0xAA, 0x86, 0xAB, /* 0x28-0x2B */ + 0xB3, 0xDA, 0x86, 0xAC, 0x86, 0xAD, 0x86, 0xAE, /* 0x2C-0x2F */ + 0x86, 0xAF, 0x86, 0xB0, 0x86, 0xB1, 0x86, 0xB2, /* 0x30-0x33 */ + 0xB3, 0xDB, 0xB3, 0xDC, 0x86, 0xB3, 0xB3, 0xDD, /* 0x34-0x37 */ + 0xB3, 0xDE, 0xB3, 0xDF, 0x86, 0xB4, 0x86, 0xB5, /* 0x38-0x3B */ + 0x86, 0xB6, 0x86, 0xB7, 0x86, 0xB8, 0x86, 0xB9, /* 0x3C-0x3F */ + 0xB3, 0xE0, 0xB3, 0xE1, 0x86, 0xBA, 0x86, 0xBB, /* 0x40-0x43 */ + 0xB3, 0xE2, 0x86, 0xBC, 0x86, 0xBD, 0x86, 0xBE, /* 0x44-0x47 */ + 0xB3, 0xE3, 0x86, 0xBF, 0x86, 0xC0, 0x86, 0xC1, /* 0x48-0x4B */ + 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, 0x86, 0xC5, /* 0x4C-0x4F */ + 0xB3, 0xE4, 0xB3, 0xE5, 0x86, 0xC6, 0x86, 0xC7, /* 0x50-0x53 */ + 0xB3, 0xE6, 0xB3, 0xE7, 0x86, 0xC8, 0x86, 0xC9, /* 0x54-0x57 */ + 0xB3, 0xE8, 0x86, 0xCA, 0x86, 0xCB, 0x86, 0xCC, /* 0x58-0x5B */ + 0xB3, 0xE9, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0x5C-0x5F */ + 0xB3, 0xEA, 0x86, 0xD0, 0x86, 0xD1, 0x86, 0xD2, /* 0x60-0x63 */ + 0x86, 0xD3, 0x86, 0xD4, 0x86, 0xD5, 0x86, 0xD6, /* 0x64-0x67 */ + 0x86, 0xD7, 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, /* 0x68-0x6B */ + 0x86, 0xDB, 0x86, 0xDC, 0x86, 0xDD, 0x86, 0xDE, /* 0x6C-0x6F */ + 0x86, 0xDF, 0x86, 0xE0, 0x86, 0xE1, 0x86, 0xE2, /* 0x70-0x73 */ + 0x86, 0xE3, 0x86, 0xE4, 0x86, 0xE5, 0x86, 0xE6, /* 0x74-0x77 */ + 0xB3, 0xEB, 0xB3, 0xEC, 0x86, 0xE7, 0x86, 0xE8, /* 0x78-0x7B */ + 0xB3, 0xED, 0x86, 0xE9, 0x86, 0xEA, 0x86, 0xEB, /* 0x7C-0x7F */ + + 0xB3, 0xEE, 0x86, 0xEC, 0xB3, 0xEF, 0x86, 0xED, /* 0x80-0x83 */ + 0x86, 0xEE, 0x86, 0xEF, 0x86, 0xF0, 0x86, 0xF1, /* 0x84-0x87 */ + 0xB3, 0xF0, 0xB3, 0xF1, 0x86, 0xF2, 0xB3, 0xF2, /* 0x88-0x8B */ + 0x86, 0xF3, 0xB3, 0xF3, 0x86, 0xF4, 0x86, 0xF5, /* 0x8C-0x8F */ + 0x86, 0xF6, 0x86, 0xF7, 0xB3, 0xF4, 0xB3, 0xF5, /* 0x90-0x93 */ + 0xB3, 0xF6, 0x86, 0xF8, 0x86, 0xF9, 0x86, 0xFA, /* 0x94-0x97 */ + 0xB3, 0xF7, 0x86, 0xFB, 0x86, 0xFC, 0x86, 0xFD, /* 0x98-0x9B */ + 0xB3, 0xF8, 0x86, 0xFE, 0x87, 0x41, 0x87, 0x42, /* 0x9C-0x9F */ + 0x87, 0x43, 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, /* 0xA0-0xA3 */ + 0x87, 0x47, 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, /* 0xA4-0xA7 */ + 0xB3, 0xF9, 0x87, 0x4B, 0x87, 0x4C, 0x87, 0x4D, /* 0xA8-0xAB */ + 0x87, 0x4E, 0x87, 0x4F, 0x87, 0x50, 0x87, 0x51, /* 0xAC-0xAF */ + 0x87, 0x52, 0x87, 0x53, 0x87, 0x54, 0x87, 0x55, /* 0xB0-0xB3 */ + 0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0xB4-0xB7 */ + 0x87, 0x5A, 0x87, 0x61, 0x87, 0x62, 0x87, 0x63, /* 0xB8-0xBB */ + 0x87, 0x64, 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, /* 0xBC-0xBF */ + 0x87, 0x68, 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, /* 0xC0-0xC3 */ + 0x87, 0x6C, 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, /* 0xC4-0xC7 */ + 0x87, 0x70, 0x87, 0x71, 0x87, 0x72, 0x87, 0x73, /* 0xC8-0xCB */ + 0xB3, 0xFA, 0x87, 0x74, 0x87, 0x75, 0x87, 0x76, /* 0xCC-0xCF */ + 0xB3, 0xFB, 0x87, 0x77, 0x87, 0x78, 0x87, 0x79, /* 0xD0-0xD3 */ + 0xB3, 0xFC, 0x87, 0x7A, 0x87, 0x81, 0x87, 0x82, /* 0xD4-0xD7 */ + 0x87, 0x83, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0xD8-0xDB */ + 0xB3, 0xFD, 0xB3, 0xFE, 0x87, 0x87, 0xB4, 0xA1, /* 0xDC-0xDF */ + 0x87, 0x88, 0x87, 0x89, 0x87, 0x8A, 0x87, 0x8B, /* 0xE0-0xE3 
*/ + 0x87, 0x8C, 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, /* 0xE4-0xE7 */ + 0xB4, 0xA2, 0xB4, 0xA3, 0x87, 0x90, 0x87, 0x91, /* 0xE8-0xEB */ + 0xB4, 0xA4, 0x87, 0x92, 0x87, 0x93, 0x87, 0x94, /* 0xEC-0xEF */ + 0xB4, 0xA5, 0x87, 0x95, 0x87, 0x96, 0x87, 0x97, /* 0xF0-0xF3 */ + 0x87, 0x98, 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, /* 0xF4-0xF7 */ + 0x87, 0x9C, 0xB4, 0xA6, 0x87, 0x9D, 0xB4, 0xA7, /* 0xF8-0xFB */ + 0x87, 0x9E, 0xB4, 0xA8, 0x87, 0x9F, 0x87, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B2[512] = { + 0x87, 0xA1, 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, /* 0x00-0x03 */ + 0xB4, 0xA9, 0xB4, 0xAA, 0x87, 0xA5, 0x87, 0xA6, /* 0x04-0x07 */ + 0xB4, 0xAB, 0x87, 0xA7, 0x87, 0xA8, 0xB4, 0xAC, /* 0x08-0x0B */ + 0xB4, 0xAD, 0x87, 0xA9, 0x87, 0xAA, 0x87, 0xAB, /* 0x0C-0x0F */ + 0x87, 0xAC, 0x87, 0xAD, 0x87, 0xAE, 0x87, 0xAF, /* 0x10-0x13 */ + 0xB4, 0xAE, 0xB4, 0xAF, 0x87, 0xB0, 0xB4, 0xB0, /* 0x14-0x17 */ + 0x87, 0xB1, 0xB4, 0xB1, 0x87, 0xB2, 0x87, 0xB3, /* 0x18-0x1B */ + 0x87, 0xB4, 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, /* 0x1C-0x1F */ + 0xB4, 0xB2, 0x87, 0xB8, 0x87, 0xB9, 0x87, 0xBA, /* 0x20-0x23 */ + 0x87, 0xBB, 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, /* 0x24-0x27 */ + 0x87, 0xBF, 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, /* 0x28-0x2B */ + 0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0x2C-0x2F */ + 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, 0x87, 0xCA, /* 0x30-0x33 */ + 0xB4, 0xB3, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0x34-0x37 */ + 0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0x38-0x3B */ + 0xB4, 0xB4, 0x87, 0xD2, 0x87, 0xD3, 0x87, 0xD4, /* 0x3C-0x3F */ + 0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0x40-0x43 */ + 0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0x44-0x47 */ + 0x87, 0xDD, 0x87, 0xDE, 0x87, 0xDF, 0x87, 0xE0, /* 0x48-0x4B */ + 0x87, 0xE1, 0x87, 0xE2, 0x87, 0xE3, 0x87, 0xE4, /* 0x4C-0x4F */ + 0x87, 0xE5, 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, /* 0x50-0x53 */ + 0x87, 0xE9, 0x87, 0xEA, 0x87, 0xEB, 0x87, 0xEC, /* 0x54-0x57 */ + 0xB4, 0xB5, 0x87, 0xED, 0x87, 0xEE, 0x87, 0xEF, /* 0x58-0x5B */ + 0xB4, 0xB6, 0x87, 0xF0, 0x87, 0xF1, 0x87, 0xF2, /* 0x5C-0x5F */ + 0xB4, 0xB7, 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, /* 0x60-0x63 */ + 0x87, 0xF6, 0x87, 0xF7, 0x87, 0xF8, 0x87, 0xF9, /* 0x64-0x67 */ + 0xB4, 0xB8, 0xB4, 0xB9, 0x87, 0xFA, 0x87, 0xFB, /* 0x68-0x6B */ + 0x87, 0xFC, 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x41, /* 0x6C-0x6F */ + 0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x70-0x73 */ + 0xB4, 0xBA, 0xB4, 0xBB, 0x88, 0x46, 0x88, 0x47, /* 0x74-0x77 */ + 0x88, 0x48, 0x88, 0x49, 0x88, 0x4A, 0x88, 0x4B, /* 0x78-0x7B */ + 0xB4, 0xBC, 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, /* 0x7C-0x7F */ + + 0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x80-0x83 */ + 0xB4, 0xBD, 0xB4, 0xBE, 0x88, 0x53, 0x88, 0x54, /* 0x84-0x87 */ + 0x88, 0x55, 0xB4, 0xBF, 0x88, 0x56, 0x88, 0x57, /* 0x88-0x8B */ + 0x88, 0x58, 0x88, 0x59, 0x88, 0x5A, 0x88, 0x61, /* 0x8C-0x8F */ + 0xB4, 0xC0, 0xB4, 0xC1, 0x88, 0x62, 0x88, 0x63, /* 0x90-0x93 */ + 0xB4, 0xC2, 0x88, 0x64, 0x88, 0x65, 0x88, 0x66, /* 0x94-0x97 */ + 0xB4, 0xC3, 0xB4, 0xC4, 0xB4, 0xC5, 0x88, 0x67, /* 0x98-0x9B */ + 0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0x88, 0x6B, /* 0x9C-0x9F */ + 0xB4, 0xC6, 0xB4, 0xC7, 0x88, 0x6C, 0xB4, 0xC8, /* 0xA0-0xA3 */ + 0x88, 0x6D, 0xB4, 0xC9, 0xB4, 0xCA, 0x88, 0x6E, /* 0xA4-0xA7 */ + 0x88, 0x6F, 0x88, 0x70, 0xB4, 0xCB, 0x88, 0x71, /* 0xA8-0xAB */ + 0xB4, 0xCC, 0x88, 0x72, 0x88, 0x73, 0x88, 0x74, /* 0xAC-0xAF */ + 0xB4, 0xCD, 0x88, 0x75, 0x88, 0x76, 0x88, 0x77, /* 0xB0-0xB3 */ + 0xB4, 0xCE, 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, /* 0xB4-0xB7 */ + 
0x88, 0x81, 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, /* 0xB8-0xBB */ + 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, 0x88, 0x88, /* 0xBC-0xBF */ + 0x88, 0x89, 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, /* 0xC0-0xC3 */ + 0x88, 0x8D, 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, /* 0xC4-0xC7 */ + 0xB4, 0xCF, 0xB4, 0xD0, 0x88, 0x91, 0x88, 0x92, /* 0xC8-0xCB */ + 0xB4, 0xD1, 0x88, 0x93, 0x88, 0x94, 0x88, 0x95, /* 0xCC-0xCF */ + 0xB4, 0xD2, 0x88, 0x96, 0xB4, 0xD3, 0x88, 0x97, /* 0xD0-0xD3 */ + 0x88, 0x98, 0x88, 0x99, 0x88, 0x9A, 0x88, 0x9B, /* 0xD4-0xD7 */ + 0xB4, 0xD4, 0xB4, 0xD5, 0x88, 0x9C, 0xB4, 0xD6, /* 0xD8-0xDB */ + 0x88, 0x9D, 0xB4, 0xD7, 0x88, 0x9E, 0x88, 0x9F, /* 0xDC-0xDF */ + 0x88, 0xA0, 0x88, 0xA1, 0xB4, 0xD8, 0x88, 0xA2, /* 0xE0-0xE3 */ + 0xB4, 0xD9, 0xB4, 0xDA, 0xB4, 0xDB, 0x88, 0xA3, /* 0xE4-0xE7 */ + 0xB4, 0xDC, 0x88, 0xA4, 0x88, 0xA5, 0xB4, 0xDD, /* 0xE8-0xEB */ + 0xB4, 0xDE, 0xB4, 0xDF, 0xB4, 0xE0, 0xB4, 0xE1, /* 0xEC-0xEF */ + 0x88, 0xA6, 0x88, 0xA7, 0x88, 0xA8, 0xB4, 0xE2, /* 0xF0-0xF3 */ + 0xB4, 0xE3, 0xB4, 0xE4, 0x88, 0xA9, 0xB4, 0xE5, /* 0xF4-0xF7 */ + 0xB4, 0xE6, 0xB4, 0xE7, 0xB4, 0xE8, 0xB4, 0xE9, /* 0xF8-0xFB */ + 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, 0xB4, 0xEA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B3[512] = { + 0xB4, 0xEB, 0xB4, 0xEC, 0x88, 0xAD, 0x88, 0xAE, /* 0x00-0x03 */ + 0xB4, 0xED, 0x88, 0xAF, 0x88, 0xB0, 0x88, 0xB1, /* 0x04-0x07 */ + 0xB4, 0xEE, 0x88, 0xB2, 0x88, 0xB3, 0x88, 0xB4, /* 0x08-0x0B */ + 0x88, 0xB5, 0x88, 0xB6, 0x88, 0xB7, 0x88, 0xB8, /* 0x0C-0x0F */ + 0xB4, 0xEF, 0xB4, 0xF0, 0x88, 0xB9, 0xB4, 0xF1, /* 0x10-0x13 */ + 0xB4, 0xF2, 0xB4, 0xF3, 0x88, 0xBA, 0x88, 0xBB, /* 0x14-0x17 */ + 0x88, 0xBC, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0x18-0x1B */ + 0xB4, 0xF4, 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, /* 0x1C-0x1F */ + 0x88, 0xC3, 0x88, 0xC4, 0x88, 0xC5, 0x88, 0xC6, /* 0x20-0x23 */ + 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, 0x88, 0xCA, /* 0x24-0x27 */ + 0x88, 0xCB, 0x88, 0xCC, 0x88, 0xCD, 0x88, 0xCE, /* 0x28-0x2B */ + 0x88, 0xCF, 0x88, 0xD0, 0x88, 0xD1, 0x88, 0xD2, /* 0x2C-0x2F */ + 0x88, 0xD3, 0x88, 0xD4, 0x88, 0xD5, 0x88, 0xD6, /* 0x30-0x33 */ + 0x88, 0xD7, 0x88, 0xD8, 0x88, 0xD9, 0x88, 0xDA, /* 0x34-0x37 */ + 0x88, 0xDB, 0x88, 0xDC, 0x88, 0xDD, 0x88, 0xDE, /* 0x38-0x3B */ + 0x88, 0xDF, 0x88, 0xE0, 0x88, 0xE1, 0x88, 0xE2, /* 0x3C-0x3F */ + 0x88, 0xE3, 0x88, 0xE4, 0x88, 0xE5, 0x88, 0xE6, /* 0x40-0x43 */ + 0x88, 0xE7, 0x88, 0xE8, 0x88, 0xE9, 0x88, 0xEA, /* 0x44-0x47 */ + 0x88, 0xEB, 0x88, 0xEC, 0x88, 0xED, 0x88, 0xEE, /* 0x48-0x4B */ + 0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x4C-0x4F */ + 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, 0x88, 0xF6, /* 0x50-0x53 */ + 0xB4, 0xF5, 0xB4, 0xF6, 0xB4, 0xF7, 0x88, 0xF7, /* 0x54-0x57 */ + 0xB4, 0xF8, 0x88, 0xF8, 0x88, 0xF9, 0xB4, 0xF9, /* 0x58-0x5B */ + 0xB4, 0xFA, 0x88, 0xFA, 0xB4, 0xFB, 0xB4, 0xFC, /* 0x5C-0x5F */ + 0x88, 0xFB, 0x88, 0xFC, 0x88, 0xFD, 0x88, 0xFE, /* 0x60-0x63 */ + 0xB4, 0xFD, 0xB4, 0xFE, 0x89, 0x41, 0xB5, 0xA1, /* 0x64-0x67 */ + 0x89, 0x42, 0xB5, 0xA2, 0x89, 0x43, 0xB5, 0xA3, /* 0x68-0x6B */ + 0x89, 0x44, 0x89, 0x45, 0xB5, 0xA4, 0x89, 0x46, /* 0x6C-0x6F */ + 0xB5, 0xA5, 0xB5, 0xA6, 0x89, 0x47, 0x89, 0x48, /* 0x70-0x73 */ + 0xB5, 0xA7, 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, /* 0x74-0x77 */ + 0xB5, 0xA8, 0x89, 0x4C, 0x89, 0x4D, 0x89, 0x4E, /* 0x78-0x7B */ + 0x89, 0x4F, 0x89, 0x50, 0x89, 0x51, 0x89, 0x52, /* 0x7C-0x7F */ + + 0xB5, 0xA9, 0xB5, 0xAA, 0x89, 0x53, 0xB5, 0xAB, /* 0x80-0x83 */ + 0xB5, 0xAC, 0xB5, 0xAD, 0x89, 0x54, 0x89, 0x55, /* 0x84-0x87 */ + 0x89, 0x56, 0x89, 0x57, 0x89, 0x58, 0x89, 0x59, /* 0x88-0x8B */ + 
0xB5, 0xAE, 0x89, 0x5A, 0x89, 0x61, 0x89, 0x62, /* 0x8C-0x8F */ + 0xB5, 0xAF, 0x89, 0x63, 0x89, 0x64, 0x89, 0x65, /* 0x90-0x93 */ + 0xB5, 0xB0, 0x89, 0x66, 0x89, 0x67, 0x89, 0x68, /* 0x94-0x97 */ + 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, 0x89, 0x6C, /* 0x98-0x9B */ + 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, 0x89, 0x70, /* 0x9C-0x9F */ + 0xB5, 0xB1, 0xB5, 0xB2, 0x89, 0x71, 0x89, 0x72, /* 0xA0-0xA3 */ + 0x89, 0x73, 0x89, 0x74, 0x89, 0x75, 0x89, 0x76, /* 0xA4-0xA7 */ + 0xB5, 0xB3, 0x89, 0x77, 0x89, 0x78, 0x89, 0x79, /* 0xA8-0xAB */ + 0xB5, 0xB4, 0x89, 0x7A, 0x89, 0x81, 0x89, 0x82, /* 0xAC-0xAF */ + 0x89, 0x83, 0x89, 0x84, 0x89, 0x85, 0x89, 0x86, /* 0xB0-0xB3 */ + 0x89, 0x87, 0x89, 0x88, 0x89, 0x89, 0x89, 0x8A, /* 0xB4-0xB7 */ + 0x89, 0x8B, 0x89, 0x8C, 0x89, 0x8D, 0x89, 0x8E, /* 0xB8-0xBB */ + 0x89, 0x8F, 0x89, 0x90, 0x89, 0x91, 0x89, 0x92, /* 0xBC-0xBF */ + 0x89, 0x93, 0x89, 0x94, 0x89, 0x95, 0x89, 0x96, /* 0xC0-0xC3 */ + 0xB5, 0xB5, 0xB5, 0xB6, 0x89, 0x97, 0x89, 0x98, /* 0xC4-0xC7 */ + 0xB5, 0xB7, 0x89, 0x99, 0x89, 0x9A, 0xB5, 0xB8, /* 0xC8-0xCB */ + 0xB5, 0xB9, 0x89, 0x9B, 0xB5, 0xBA, 0x89, 0x9C, /* 0xCC-0xCF */ + 0xB5, 0xBB, 0x89, 0x9D, 0x89, 0x9E, 0x89, 0x9F, /* 0xD0-0xD3 */ + 0xB5, 0xBC, 0xB5, 0xBD, 0x89, 0xA0, 0xB5, 0xBE, /* 0xD4-0xD7 */ + 0x89, 0xA1, 0xB5, 0xBF, 0x89, 0xA2, 0xB5, 0xC0, /* 0xD8-0xDB */ + 0x89, 0xA3, 0xB5, 0xC1, 0x89, 0xA4, 0x89, 0xA5, /* 0xDC-0xDF */ + 0xB5, 0xC2, 0x89, 0xA6, 0x89, 0xA7, 0x89, 0xA8, /* 0xE0-0xE3 */ + 0xB5, 0xC3, 0x89, 0xA9, 0x89, 0xAA, 0x89, 0xAB, /* 0xE4-0xE7 */ + 0xB5, 0xC4, 0x89, 0xAC, 0x89, 0xAD, 0x89, 0xAE, /* 0xE8-0xEB */ + 0x89, 0xAF, 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, /* 0xEC-0xEF */ + 0x89, 0xB3, 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, /* 0xF0-0xF3 */ + 0x89, 0xB7, 0x89, 0xB8, 0x89, 0xB9, 0x89, 0xBA, /* 0xF4-0xF7 */ + 0x89, 0xBB, 0x89, 0xBC, 0x89, 0xBD, 0x89, 0xBE, /* 0xF8-0xFB */ + 0xB5, 0xC5, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B4[512] = { + 0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0x00-0x03 */ + 0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0x04-0x07 */ + 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, 0x89, 0xCD, /* 0x08-0x0B */ + 0x89, 0xCE, 0x89, 0xCF, 0x89, 0xD0, 0x89, 0xD1, /* 0x0C-0x0F */ + 0xB5, 0xC6, 0x89, 0xD2, 0x89, 0xD3, 0x89, 0xD4, /* 0x10-0x13 */ + 0x89, 0xD5, 0x89, 0xD6, 0x89, 0xD7, 0x89, 0xD8, /* 0x14-0x17 */ + 0xB5, 0xC7, 0x89, 0xD9, 0x89, 0xDA, 0x89, 0xDB, /* 0x18-0x1B */ + 0xB5, 0xC8, 0x89, 0xDC, 0x89, 0xDD, 0x89, 0xDE, /* 0x1C-0x1F */ + 0xB5, 0xC9, 0x89, 0xDF, 0x89, 0xE0, 0x89, 0xE1, /* 0x20-0x23 */ + 0x89, 0xE2, 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, /* 0x24-0x27 */ + 0xB5, 0xCA, 0xB5, 0xCB, 0x89, 0xE6, 0xB5, 0xCC, /* 0x28-0x2B */ + 0x89, 0xE7, 0x89, 0xE8, 0x89, 0xE9, 0x89, 0xEA, /* 0x2C-0x2F */ + 0x89, 0xEB, 0x89, 0xEC, 0x89, 0xED, 0x89, 0xEE, /* 0x30-0x33 */ + 0xB5, 0xCD, 0x89, 0xEF, 0x89, 0xF0, 0x89, 0xF1, /* 0x34-0x37 */ + 0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x38-0x3B */ + 0x89, 0xF6, 0x89, 0xF7, 0x89, 0xF8, 0x89, 0xF9, /* 0x3C-0x3F */ + 0x89, 0xFA, 0x89, 0xFB, 0x89, 0xFC, 0x89, 0xFD, /* 0x40-0x43 */ + 0x89, 0xFE, 0x8A, 0x41, 0x8A, 0x42, 0x8A, 0x43, /* 0x44-0x47 */ + 0x8A, 0x44, 0x8A, 0x45, 0x8A, 0x46, 0x8A, 0x47, /* 0x48-0x4B */ + 0x8A, 0x48, 0x8A, 0x49, 0x8A, 0x4A, 0x8A, 0x4B, /* 0x4C-0x4F */ + 0xB5, 0xCE, 0xB5, 0xCF, 0x8A, 0x4C, 0x8A, 0x4D, /* 0x50-0x53 */ + 0xB5, 0xD0, 0x8A, 0x4E, 0x8A, 0x4F, 0x8A, 0x50, /* 0x54-0x57 */ + 0xB5, 0xD1, 0x8A, 0x51, 0x8A, 0x52, 0x8A, 0x53, /* 0x58-0x5B */ + 0x8A, 0x54, 0x8A, 0x55, 0x8A, 0x56, 0x8A, 0x57, /* 0x5C-0x5F */ + 0xB5, 
0xD2, 0xB5, 0xD3, 0x8A, 0x58, 0xB5, 0xD4, /* 0x60-0x63 */ + 0x8A, 0x59, 0xB5, 0xD5, 0x8A, 0x5A, 0x8A, 0x61, /* 0x64-0x67 */ + 0x8A, 0x62, 0x8A, 0x63, 0x8A, 0x64, 0x8A, 0x65, /* 0x68-0x6B */ + 0xB5, 0xD6, 0x8A, 0x66, 0x8A, 0x67, 0x8A, 0x68, /* 0x6C-0x6F */ + 0x8A, 0x69, 0x8A, 0x6A, 0x8A, 0x6B, 0x8A, 0x6C, /* 0x70-0x73 */ + 0x8A, 0x6D, 0x8A, 0x6E, 0x8A, 0x6F, 0x8A, 0x70, /* 0x74-0x77 */ + 0x8A, 0x71, 0x8A, 0x72, 0x8A, 0x73, 0x8A, 0x74, /* 0x78-0x7B */ + 0x8A, 0x75, 0x8A, 0x76, 0x8A, 0x77, 0x8A, 0x78, /* 0x7C-0x7F */ + + 0xB5, 0xD7, 0x8A, 0x79, 0x8A, 0x7A, 0x8A, 0x81, /* 0x80-0x83 */ + 0x8A, 0x82, 0x8A, 0x83, 0x8A, 0x84, 0x8A, 0x85, /* 0x84-0x87 */ + 0xB5, 0xD8, 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, /* 0x88-0x8B */ + 0x8A, 0x89, 0x8A, 0x8A, 0x8A, 0x8B, 0x8A, 0x8C, /* 0x8C-0x8F */ + 0x8A, 0x8D, 0x8A, 0x8E, 0x8A, 0x8F, 0x8A, 0x90, /* 0x90-0x93 */ + 0x8A, 0x91, 0x8A, 0x92, 0x8A, 0x93, 0x8A, 0x94, /* 0x94-0x97 */ + 0x8A, 0x95, 0x8A, 0x96, 0x8A, 0x97, 0x8A, 0x98, /* 0x98-0x9B */ + 0x8A, 0x99, 0xB5, 0xD9, 0x8A, 0x9A, 0x8A, 0x9B, /* 0x9C-0x9F */ + 0x8A, 0x9C, 0x8A, 0x9D, 0x8A, 0x9E, 0x8A, 0x9F, /* 0xA0-0xA3 */ + 0xB5, 0xDA, 0x8A, 0xA0, 0x8A, 0xA1, 0x8A, 0xA2, /* 0xA4-0xA7 */ + 0xB5, 0xDB, 0x8A, 0xA3, 0x8A, 0xA4, 0x8A, 0xA5, /* 0xA8-0xAB */ + 0xB5, 0xDC, 0x8A, 0xA6, 0x8A, 0xA7, 0x8A, 0xA8, /* 0xAC-0xAF */ + 0x8A, 0xA9, 0x8A, 0xAA, 0x8A, 0xAB, 0x8A, 0xAC, /* 0xB0-0xB3 */ + 0x8A, 0xAD, 0xB5, 0xDD, 0x8A, 0xAE, 0xB5, 0xDE, /* 0xB4-0xB7 */ + 0x8A, 0xAF, 0xB5, 0xDF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xB8-0xBB */ + 0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xBC-0xBF */ + 0xB5, 0xE0, 0x8A, 0xB6, 0x8A, 0xB7, 0x8A, 0xB8, /* 0xC0-0xC3 */ + 0xB5, 0xE1, 0x8A, 0xB9, 0x8A, 0xBA, 0x8A, 0xBB, /* 0xC4-0xC7 */ + 0xB5, 0xE2, 0x8A, 0xBC, 0x8A, 0xBD, 0x8A, 0xBE, /* 0xC8-0xCB */ + 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, 0x8A, 0xC2, /* 0xCC-0xCF */ + 0xB5, 0xE3, 0x8A, 0xC3, 0x8A, 0xC4, 0x8A, 0xC5, /* 0xD0-0xD3 */ + 0x8A, 0xC6, 0xB5, 0xE4, 0x8A, 0xC7, 0x8A, 0xC8, /* 0xD4-0xD7 */ + 0x8A, 0xC9, 0x8A, 0xCA, 0x8A, 0xCB, 0x8A, 0xCC, /* 0xD8-0xDB */ + 0xB5, 0xE5, 0xB5, 0xE6, 0x8A, 0xCD, 0x8A, 0xCE, /* 0xDC-0xDF */ + 0xB5, 0xE7, 0x8A, 0xCF, 0x8A, 0xD0, 0xB5, 0xE8, /* 0xE0-0xE3 */ + 0xB5, 0xE9, 0x8A, 0xD1, 0xB5, 0xEA, 0x8A, 0xD2, /* 0xE4-0xE7 */ + 0x8A, 0xD3, 0x8A, 0xD4, 0x8A, 0xD5, 0x8A, 0xD6, /* 0xE8-0xEB */ + 0xB5, 0xEB, 0xB5, 0xEC, 0x8A, 0xD7, 0xB5, 0xED, /* 0xEC-0xEF */ + 0x8A, 0xD8, 0xB5, 0xEE, 0x8A, 0xD9, 0x8A, 0xDA, /* 0xF0-0xF3 */ + 0x8A, 0xDB, 0x8A, 0xDC, 0x8A, 0xDD, 0x8A, 0xDE, /* 0xF4-0xF7 */ + 0xB5, 0xEF, 0x8A, 0xDF, 0x8A, 0xE0, 0x8A, 0xE1, /* 0xF8-0xFB */ + 0x8A, 0xE2, 0x8A, 0xE3, 0x8A, 0xE4, 0x8A, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B5[512] = { + 0x8A, 0xE6, 0x8A, 0xE7, 0x8A, 0xE8, 0x8A, 0xE9, /* 0x00-0x03 */ + 0x8A, 0xEA, 0x8A, 0xEB, 0x8A, 0xEC, 0x8A, 0xED, /* 0x04-0x07 */ + 0x8A, 0xEE, 0x8A, 0xEF, 0x8A, 0xF0, 0x8A, 0xF1, /* 0x08-0x0B */ + 0x8A, 0xF2, 0x8A, 0xF3, 0x8A, 0xF4, 0x8A, 0xF5, /* 0x0C-0x0F */ + 0x8A, 0xF6, 0x8A, 0xF7, 0x8A, 0xF8, 0x8A, 0xF9, /* 0x10-0x13 */ + 0xB5, 0xF0, 0xB5, 0xF1, 0x8A, 0xFA, 0x8A, 0xFB, /* 0x14-0x17 */ + 0xB5, 0xF2, 0x8A, 0xFC, 0x8A, 0xFD, 0xB5, 0xF3, /* 0x18-0x1B */ + 0xB5, 0xF4, 0x8A, 0xFE, 0x8B, 0x41, 0x8B, 0x42, /* 0x1C-0x1F */ + 0x8B, 0x43, 0x8B, 0x44, 0x8B, 0x45, 0x8B, 0x46, /* 0x20-0x23 */ + 0xB5, 0xF5, 0xB5, 0xF6, 0x8B, 0x47, 0xB5, 0xF7, /* 0x24-0x27 */ + 0xB5, 0xF8, 0xB5, 0xF9, 0xB5, 0xFA, 0x8B, 0x48, /* 0x28-0x2B */ + 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, 0x8B, 0x4C, /* 0x2C-0x2F */ + 0xB5, 0xFB, 0xB5, 0xFC, 0x8B, 0x4D, 0x8B, 0x4E, /* 0x30-0x33 */ + 0xB5, 
0xFD, 0x8B, 0x4F, 0x8B, 0x50, 0x8B, 0x51, /* 0x34-0x37 */ + 0xB5, 0xFE, 0x8B, 0x52, 0x8B, 0x53, 0x8B, 0x54, /* 0x38-0x3B */ + 0x8B, 0x55, 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, /* 0x3C-0x3F */ + 0xB6, 0xA1, 0xB6, 0xA2, 0x8B, 0x59, 0xB6, 0xA3, /* 0x40-0x43 */ + 0xB6, 0xA4, 0xB6, 0xA5, 0x8B, 0x5A, 0x8B, 0x61, /* 0x44-0x47 */ + 0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0xB6, 0xA6, /* 0x48-0x4B */ + 0xB6, 0xA7, 0xB6, 0xA8, 0x8B, 0x65, 0x8B, 0x66, /* 0x4C-0x4F */ + 0xB6, 0xA9, 0x8B, 0x67, 0x8B, 0x68, 0x8B, 0x69, /* 0x50-0x53 */ + 0xB6, 0xAA, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x54-0x57 */ + 0x8B, 0x6D, 0x8B, 0x6E, 0x8B, 0x6F, 0x8B, 0x70, /* 0x58-0x5B */ + 0xB6, 0xAB, 0xB6, 0xAC, 0x8B, 0x71, 0xB6, 0xAD, /* 0x5C-0x5F */ + 0xB6, 0xAE, 0xB6, 0xAF, 0x8B, 0x72, 0x8B, 0x73, /* 0x60-0x63 */ + 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, 0x8B, 0x77, /* 0x64-0x67 */ + 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, 0x8B, 0x81, /* 0x68-0x6B */ + 0x8B, 0x82, 0x8B, 0x83, 0x8B, 0x84, 0x8B, 0x85, /* 0x6C-0x6F */ + 0x8B, 0x86, 0x8B, 0x87, 0x8B, 0x88, 0x8B, 0x89, /* 0x70-0x73 */ + 0x8B, 0x8A, 0x8B, 0x8B, 0x8B, 0x8C, 0x8B, 0x8D, /* 0x74-0x77 */ + 0x8B, 0x8E, 0x8B, 0x8F, 0x8B, 0x90, 0x8B, 0x91, /* 0x78-0x7B */ + 0x8B, 0x92, 0x8B, 0x93, 0x8B, 0x94, 0x8B, 0x95, /* 0x7C-0x7F */ + + 0x8B, 0x96, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0x80-0x83 */ + 0x8B, 0x9A, 0x8B, 0x9B, 0x8B, 0x9C, 0x8B, 0x9D, /* 0x84-0x87 */ + 0x8B, 0x9E, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0x88-0x8B */ + 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, 0x8B, 0xA5, /* 0x8C-0x8F */ + 0x8B, 0xA6, 0x8B, 0xA7, 0x8B, 0xA8, 0x8B, 0xA9, /* 0x90-0x93 */ + 0x8B, 0xAA, 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, /* 0x94-0x97 */ + 0x8B, 0xAE, 0x8B, 0xAF, 0x8B, 0xB0, 0x8B, 0xB1, /* 0x98-0x9B */ + 0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0x9C-0x9F */ + 0xB6, 0xB0, 0xB6, 0xB1, 0x8B, 0xB6, 0x8B, 0xB7, /* 0xA0-0xA3 */ + 0xB6, 0xB2, 0x8B, 0xB8, 0x8B, 0xB9, 0x8B, 0xBA, /* 0xA4-0xA7 */ + 0xB6, 0xB3, 0x8B, 0xBB, 0xB6, 0xB4, 0xB6, 0xB5, /* 0xA8-0xAB */ + 0x8B, 0xBC, 0x8B, 0xBD, 0x8B, 0xBE, 0x8B, 0xBF, /* 0xAC-0xAF */ + 0xB6, 0xB6, 0xB6, 0xB7, 0x8B, 0xC0, 0xB6, 0xB8, /* 0xB0-0xB3 */ + 0xB6, 0xB9, 0xB6, 0xBA, 0x8B, 0xC1, 0x8B, 0xC2, /* 0xB4-0xB7 */ + 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, 0xB6, 0xBB, /* 0xB8-0xBB */ + 0xB6, 0xBC, 0xB6, 0xBD, 0x8B, 0xC6, 0x8B, 0xC7, /* 0xBC-0xBF */ + 0xB6, 0xBE, 0x8B, 0xC8, 0x8B, 0xC9, 0x8B, 0xCA, /* 0xC0-0xC3 */ + 0xB6, 0xBF, 0x8B, 0xCB, 0x8B, 0xCC, 0x8B, 0xCD, /* 0xC4-0xC7 */ + 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, 0x8B, 0xD1, /* 0xC8-0xCB */ + 0xB6, 0xC0, 0xB6, 0xC1, 0x8B, 0xD2, 0xB6, 0xC2, /* 0xCC-0xCF */ + 0xB6, 0xC3, 0xB6, 0xC4, 0x8B, 0xD3, 0x8B, 0xD4, /* 0xD0-0xD3 */ + 0x8B, 0xD5, 0x8B, 0xD6, 0x8B, 0xD7, 0x8B, 0xD8, /* 0xD4-0xD7 */ + 0xB6, 0xC5, 0x8B, 0xD9, 0x8B, 0xDA, 0x8B, 0xDB, /* 0xD8-0xDB */ + 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, 0x8B, 0xDF, /* 0xDC-0xDF */ + 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, 0x8B, 0xE3, /* 0xE0-0xE3 */ + 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, 0x8B, 0xE7, /* 0xE4-0xE7 */ + 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, 0x8B, 0xEB, /* 0xE8-0xEB */ + 0xB6, 0xC6, 0x8B, 0xEC, 0x8B, 0xED, 0x8B, 0xEE, /* 0xEC-0xEF */ + 0x8B, 0xEF, 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, /* 0xF0-0xF3 */ + 0x8B, 0xF3, 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, /* 0xF4-0xF7 */ + 0x8B, 0xF7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0xF8-0xFB */ + 0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B6[512] = { + 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, 0x8C, 0x44, /* 0x00-0x03 */ + 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, 0x8C, 0x48, /* 0x04-0x07 */ + 0x8C, 
0x49, 0x8C, 0x4A, 0x8C, 0x4B, 0x8C, 0x4C, /* 0x08-0x0B */ + 0x8C, 0x4D, 0x8C, 0x4E, 0x8C, 0x4F, 0x8C, 0x50, /* 0x0C-0x0F */ + 0xB6, 0xC7, 0xB6, 0xC8, 0x8C, 0x51, 0x8C, 0x52, /* 0x10-0x13 */ + 0xB6, 0xC9, 0x8C, 0x53, 0x8C, 0x54, 0x8C, 0x55, /* 0x14-0x17 */ + 0xB6, 0xCA, 0x8C, 0x56, 0x8C, 0x57, 0x8C, 0x58, /* 0x18-0x1B */ + 0x8C, 0x59, 0x8C, 0x5A, 0x8C, 0x61, 0x8C, 0x62, /* 0x1C-0x1F */ + 0x8C, 0x63, 0x8C, 0x64, 0x8C, 0x65, 0x8C, 0x66, /* 0x20-0x23 */ + 0x8C, 0x67, 0xB6, 0xCB, 0x8C, 0x68, 0x8C, 0x69, /* 0x24-0x27 */ + 0x8C, 0x6A, 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, /* 0x28-0x2B */ + 0xB6, 0xCC, 0x8C, 0x6E, 0x8C, 0x6F, 0x8C, 0x70, /* 0x2C-0x2F */ + 0x8C, 0x71, 0x8C, 0x72, 0x8C, 0x73, 0x8C, 0x74, /* 0x30-0x33 */ + 0xB6, 0xCD, 0x8C, 0x75, 0x8C, 0x76, 0x8C, 0x77, /* 0x34-0x37 */ + 0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x81, /* 0x38-0x3B */ + 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, 0x8C, 0x85, /* 0x3C-0x3F */ + 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, 0x8C, 0x89, /* 0x40-0x43 */ + 0x8C, 0x8A, 0x8C, 0x8B, 0x8C, 0x8C, 0x8C, 0x8D, /* 0x44-0x47 */ + 0xB6, 0xCE, 0x8C, 0x8E, 0x8C, 0x8F, 0x8C, 0x90, /* 0x48-0x4B */ + 0x8C, 0x91, 0x8C, 0x92, 0x8C, 0x93, 0x8C, 0x94, /* 0x4C-0x4F */ + 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, 0x8C, 0x98, /* 0x50-0x53 */ + 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, 0x8C, 0x9C, /* 0x54-0x57 */ + 0x8C, 0x9D, 0x8C, 0x9E, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x58-0x5B */ + 0x8C, 0xA1, 0x8C, 0xA2, 0x8C, 0xA3, 0x8C, 0xA4, /* 0x5C-0x5F */ + 0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0x8C, 0xA8, /* 0x60-0x63 */ + 0xB6, 0xCF, 0x8C, 0xA9, 0x8C, 0xAA, 0x8C, 0xAB, /* 0x64-0x67 */ + 0xB6, 0xD0, 0x8C, 0xAC, 0x8C, 0xAD, 0x8C, 0xAE, /* 0x68-0x6B */ + 0x8C, 0xAF, 0x8C, 0xB0, 0x8C, 0xB1, 0x8C, 0xB2, /* 0x6C-0x6F */ + 0x8C, 0xB3, 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, /* 0x70-0x73 */ + 0x8C, 0xB7, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x74-0x77 */ + 0x8C, 0xBB, 0x8C, 0xBC, 0x8C, 0xBD, 0x8C, 0xBE, /* 0x78-0x7B */ + 0x8C, 0xBF, 0x8C, 0xC0, 0x8C, 0xC1, 0x8C, 0xC2, /* 0x7C-0x7F */ + + 0x8C, 0xC3, 0x8C, 0xC4, 0x8C, 0xC5, 0x8C, 0xC6, /* 0x80-0x83 */ + 0x8C, 0xC7, 0x8C, 0xC8, 0x8C, 0xC9, 0x8C, 0xCA, /* 0x84-0x87 */ + 0x8C, 0xCB, 0x8C, 0xCC, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x88-0x8B */ + 0x8C, 0xCF, 0x8C, 0xD0, 0x8C, 0xD1, 0x8C, 0xD2, /* 0x8C-0x8F */ + 0x8C, 0xD3, 0x8C, 0xD4, 0x8C, 0xD5, 0x8C, 0xD6, /* 0x90-0x93 */ + 0x8C, 0xD7, 0x8C, 0xD8, 0x8C, 0xD9, 0x8C, 0xDA, /* 0x94-0x97 */ + 0x8C, 0xDB, 0x8C, 0xDC, 0x8C, 0xDD, 0x8C, 0xDE, /* 0x98-0x9B */ + 0xB6, 0xD1, 0xB6, 0xD2, 0x8C, 0xDF, 0x8C, 0xE0, /* 0x9C-0x9F */ + 0xB6, 0xD3, 0x8C, 0xE1, 0x8C, 0xE2, 0x8C, 0xE3, /* 0xA0-0xA3 */ + 0xB6, 0xD4, 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, /* 0xA4-0xA7 */ + 0x8C, 0xE7, 0x8C, 0xE8, 0x8C, 0xE9, 0xB6, 0xD5, /* 0xA8-0xAB */ + 0xB6, 0xD6, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0xAC-0xAF */ + 0x8C, 0xED, 0xB6, 0xD7, 0x8C, 0xEE, 0x8C, 0xEF, /* 0xB0-0xB3 */ + 0x8C, 0xF0, 0x8C, 0xF1, 0x8C, 0xF2, 0x8C, 0xF3, /* 0xB4-0xB7 */ + 0x8C, 0xF4, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0xB8-0xBB */ + 0x8C, 0xF8, 0x8C, 0xF9, 0x8C, 0xFA, 0x8C, 0xFB, /* 0xBC-0xBF */ + 0x8C, 0xFC, 0x8C, 0xFD, 0x8C, 0xFE, 0x8D, 0x41, /* 0xC0-0xC3 */ + 0x8D, 0x42, 0x8D, 0x43, 0x8D, 0x44, 0x8D, 0x45, /* 0xC4-0xC7 */ + 0x8D, 0x46, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xC8-0xCB */ + 0x8D, 0x4A, 0x8D, 0x4B, 0x8D, 0x4C, 0x8D, 0x4D, /* 0xCC-0xCF */ + 0x8D, 0x4E, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xD0-0xD3 */ + 0xB6, 0xD8, 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, /* 0xD4-0xD7 */ + 0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xD8-0xDB */ + 0x8D, 0x59, 0x8D, 0x5A, 0x8D, 0x61, 0x8D, 0x62, /* 0xDC-0xDF 
*/ + 0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xE0-0xE3 */ + 0x8D, 0x67, 0x8D, 0x68, 0x8D, 0x69, 0x8D, 0x6A, /* 0xE4-0xE7 */ + 0x8D, 0x6B, 0x8D, 0x6C, 0x8D, 0x6D, 0x8D, 0x6E, /* 0xE8-0xEB */ + 0x8D, 0x6F, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xEC-0xEF */ + 0xB6, 0xD9, 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, /* 0xF0-0xF3 */ + 0xB6, 0xDA, 0x8D, 0x76, 0x8D, 0x77, 0x8D, 0x78, /* 0xF4-0xF7 */ + 0xB6, 0xDB, 0x8D, 0x79, 0x8D, 0x7A, 0x8D, 0x81, /* 0xF8-0xFB */ + 0x8D, 0x82, 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B7[512] = { + 0xB6, 0xDC, 0xB6, 0xDD, 0x8D, 0x86, 0x8D, 0x87, /* 0x00-0x03 */ + 0x8D, 0x88, 0xB6, 0xDE, 0x8D, 0x89, 0x8D, 0x8A, /* 0x04-0x07 */ + 0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, 0x8D, 0x8E, /* 0x08-0x0B */ + 0x8D, 0x8F, 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, /* 0x0C-0x0F */ + 0x8D, 0x93, 0x8D, 0x94, 0x8D, 0x95, 0x8D, 0x96, /* 0x10-0x13 */ + 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, 0x8D, 0x9A, /* 0x14-0x17 */ + 0x8D, 0x9B, 0x8D, 0x9C, 0x8D, 0x9D, 0x8D, 0x9E, /* 0x18-0x1B */ + 0x8D, 0x9F, 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, /* 0x1C-0x1F */ + 0x8D, 0xA3, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x20-0x23 */ + 0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x24-0x27 */ + 0xB6, 0xDF, 0xB6, 0xE0, 0x8D, 0xAB, 0x8D, 0xAC, /* 0x28-0x2B */ + 0xB6, 0xE1, 0x8D, 0xAD, 0x8D, 0xAE, 0xB6, 0xE2, /* 0x2C-0x2F */ + 0xB6, 0xE3, 0x8D, 0xAF, 0x8D, 0xB0, 0x8D, 0xB1, /* 0x30-0x33 */ + 0x8D, 0xB2, 0x8D, 0xB3, 0x8D, 0xB4, 0x8D, 0xB5, /* 0x34-0x37 */ + 0xB6, 0xE4, 0xB6, 0xE5, 0x8D, 0xB6, 0xB6, 0xE6, /* 0x38-0x3B */ + 0x8D, 0xB7, 0x8D, 0xB8, 0x8D, 0xB9, 0x8D, 0xBA, /* 0x3C-0x3F */ + 0x8D, 0xBB, 0x8D, 0xBC, 0x8D, 0xBD, 0x8D, 0xBE, /* 0x40-0x43 */ + 0xB6, 0xE7, 0x8D, 0xBF, 0x8D, 0xC0, 0x8D, 0xC1, /* 0x44-0x47 */ + 0xB6, 0xE8, 0x8D, 0xC2, 0x8D, 0xC3, 0x8D, 0xC4, /* 0x48-0x4B */ + 0xB6, 0xE9, 0x8D, 0xC5, 0x8D, 0xC6, 0x8D, 0xC7, /* 0x4C-0x4F */ + 0x8D, 0xC8, 0x8D, 0xC9, 0x8D, 0xCA, 0x8D, 0xCB, /* 0x50-0x53 */ + 0xB6, 0xEA, 0xB6, 0xEB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x54-0x57 */ + 0x8D, 0xCE, 0x8D, 0xCF, 0x8D, 0xD0, 0x8D, 0xD1, /* 0x58-0x5B */ + 0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x5C-0x5F */ + 0xB6, 0xEC, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x60-0x63 */ + 0xB6, 0xED, 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, /* 0x64-0x67 */ + 0xB6, 0xEE, 0x8D, 0xDC, 0x8D, 0xDD, 0x8D, 0xDE, /* 0x68-0x6B */ + 0x8D, 0xDF, 0x8D, 0xE0, 0x8D, 0xE1, 0x8D, 0xE2, /* 0x6C-0x6F */ + 0xB6, 0xEF, 0xB6, 0xF0, 0x8D, 0xE3, 0xB6, 0xF1, /* 0x70-0x73 */ + 0x8D, 0xE4, 0xB6, 0xF2, 0x8D, 0xE5, 0x8D, 0xE6, /* 0x74-0x77 */ + 0x8D, 0xE7, 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, /* 0x78-0x7B */ + 0xB6, 0xF3, 0xB6, 0xF4, 0x8D, 0xEB, 0x8D, 0xEC, /* 0x7C-0x7F */ + + 0xB6, 0xF5, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x80-0x83 */ + 0xB6, 0xF6, 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, /* 0x84-0x87 */ + 0x8D, 0xF3, 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, /* 0x88-0x8B */ + 0xB6, 0xF7, 0xB6, 0xF8, 0x8D, 0xF7, 0xB6, 0xF9, /* 0x8C-0x8F */ + 0xB6, 0xFA, 0xB6, 0xFB, 0xB6, 0xFC, 0x8D, 0xF8, /* 0x90-0x93 */ + 0x8D, 0xF9, 0x8D, 0xFA, 0xB6, 0xFD, 0xB6, 0xFE, /* 0x94-0x97 */ + 0xB7, 0xA1, 0xB7, 0xA2, 0x8D, 0xFB, 0x8D, 0xFC, /* 0x98-0x9B */ + 0xB7, 0xA3, 0x8D, 0xFD, 0x8D, 0xFE, 0x8E, 0x41, /* 0x9C-0x9F */ + 0xB7, 0xA4, 0x8E, 0x42, 0x8E, 0x43, 0x8E, 0x44, /* 0xA0-0xA3 */ + 0x8E, 0x45, 0x8E, 0x46, 0x8E, 0x47, 0x8E, 0x48, /* 0xA4-0xA7 */ + 0xB7, 0xA5, 0xB7, 0xA6, 0x8E, 0x49, 0xB7, 0xA7, /* 0xA8-0xAB */ + 0xB7, 0xA8, 0xB7, 0xA9, 0x8E, 0x4A, 0x8E, 0x4B, /* 0xAC-0xAF */ + 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, 0x8E, 0x4F, /* 0xB0-0xB3 */ + 
0xB7, 0xAA, 0xB7, 0xAB, 0x8E, 0x50, 0x8E, 0x51, /* 0xB4-0xB7 */ + 0xB7, 0xAC, 0x8E, 0x52, 0x8E, 0x53, 0x8E, 0x54, /* 0xB8-0xBB */ + 0x8E, 0x55, 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, /* 0xBC-0xBF */ + 0x8E, 0x59, 0x8E, 0x5A, 0x8E, 0x61, 0x8E, 0x62, /* 0xC0-0xC3 */ + 0x8E, 0x63, 0x8E, 0x64, 0x8E, 0x65, 0xB7, 0xAD, /* 0xC4-0xC7 */ + 0x8E, 0x66, 0xB7, 0xAE, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */ + 0x8E, 0x69, 0x8E, 0x6A, 0x8E, 0x6B, 0x8E, 0x6C, /* 0xCC-0xCF */ + 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, 0x8E, 0x70, /* 0xD0-0xD3 */ + 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, 0x8E, 0x74, /* 0xD4-0xD7 */ + 0x8E, 0x75, 0x8E, 0x76, 0x8E, 0x77, 0x8E, 0x78, /* 0xD8-0xDB */ + 0x8E, 0x79, 0x8E, 0x7A, 0x8E, 0x81, 0x8E, 0x82, /* 0xDC-0xDF */ + 0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xE0-0xE3 */ + 0x8E, 0x87, 0x8E, 0x88, 0x8E, 0x89, 0x8E, 0x8A, /* 0xE4-0xE7 */ + 0x8E, 0x8B, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0xE8-0xEB */ + 0xB7, 0xAF, 0xB7, 0xB0, 0x8E, 0x8F, 0x8E, 0x90, /* 0xEC-0xEF */ + 0xB7, 0xB1, 0x8E, 0x91, 0x8E, 0x92, 0x8E, 0x93, /* 0xF0-0xF3 */ + 0xB7, 0xB2, 0x8E, 0x94, 0x8E, 0x95, 0x8E, 0x96, /* 0xF4-0xF7 */ + 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, 0x8E, 0x9A, /* 0xF8-0xFB */ + 0xB7, 0xB3, 0xB7, 0xB4, 0x8E, 0x9B, 0xB7, 0xB5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B8[512] = { + 0xB7, 0xB6, 0xB7, 0xB7, 0x8E, 0x9C, 0x8E, 0x9D, /* 0x00-0x03 */ + 0x8E, 0x9E, 0x8E, 0x9F, 0x8E, 0xA0, 0xB7, 0xB8, /* 0x04-0x07 */ + 0xB7, 0xB9, 0xB7, 0xBA, 0x8E, 0xA1, 0x8E, 0xA2, /* 0x08-0x0B */ + 0xB7, 0xBB, 0x8E, 0xA3, 0x8E, 0xA4, 0x8E, 0xA5, /* 0x0C-0x0F */ + 0xB7, 0xBC, 0x8E, 0xA6, 0x8E, 0xA7, 0x8E, 0xA8, /* 0x10-0x13 */ + 0x8E, 0xA9, 0x8E, 0xAA, 0x8E, 0xAB, 0x8E, 0xAC, /* 0x14-0x17 */ + 0xB7, 0xBD, 0xB7, 0xBE, 0x8E, 0xAD, 0xB7, 0xBF, /* 0x18-0x1B */ + 0x8E, 0xAE, 0xB7, 0xC0, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x1C-0x1F */ + 0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x20-0x23 */ + 0xB7, 0xC1, 0xB7, 0xC2, 0x8E, 0xB5, 0x8E, 0xB6, /* 0x24-0x27 */ + 0xB7, 0xC3, 0x8E, 0xB7, 0x8E, 0xB8, 0x8E, 0xB9, /* 0x28-0x2B */ + 0xB7, 0xC4, 0x8E, 0xBA, 0x8E, 0xBB, 0x8E, 0xBC, /* 0x2C-0x2F */ + 0x8E, 0xBD, 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, /* 0x30-0x33 */ + 0xB7, 0xC5, 0xB7, 0xC6, 0x8E, 0xC1, 0xB7, 0xC7, /* 0x34-0x37 */ + 0xB7, 0xC8, 0xB7, 0xC9, 0x8E, 0xC2, 0x8E, 0xC3, /* 0x38-0x3B */ + 0x8E, 0xC4, 0x8E, 0xC5, 0x8E, 0xC6, 0x8E, 0xC7, /* 0x3C-0x3F */ + 0xB7, 0xCA, 0x8E, 0xC8, 0x8E, 0xC9, 0x8E, 0xCA, /* 0x40-0x43 */ + 0xB7, 0xCB, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x44-0x47 */ + 0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x48-0x4B */ + 0x8E, 0xD2, 0x8E, 0xD3, 0x8E, 0xD4, 0x8E, 0xD5, /* 0x4C-0x4F */ + 0x8E, 0xD6, 0xB7, 0xCC, 0x8E, 0xD7, 0xB7, 0xCD, /* 0x50-0x53 */ + 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, 0x8E, 0xDB, /* 0x54-0x57 */ + 0x8E, 0xDC, 0x8E, 0xDD, 0x8E, 0xDE, 0x8E, 0xDF, /* 0x58-0x5B */ + 0xB7, 0xCE, 0xB7, 0xCF, 0x8E, 0xE0, 0x8E, 0xE1, /* 0x5C-0x5F */ + 0xB7, 0xD0, 0x8E, 0xE2, 0x8E, 0xE3, 0x8E, 0xE4, /* 0x60-0x63 */ + 0xB7, 0xD1, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0x64-0x67 */ + 0x8E, 0xE8, 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, /* 0x68-0x6B */ + 0xB7, 0xD2, 0xB7, 0xD3, 0x8E, 0xEC, 0xB7, 0xD4, /* 0x6C-0x6F */ + 0x8E, 0xED, 0xB7, 0xD5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0x70-0x73 */ + 0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0x8E, 0xF3, /* 0x74-0x77 */ + 0xB7, 0xD6, 0x8E, 0xF4, 0x8E, 0xF5, 0x8E, 0xF6, /* 0x78-0x7B */ + 0xB7, 0xD7, 0x8E, 0xF7, 0x8E, 0xF8, 0x8E, 0xF9, /* 0x7C-0x7F */ + + 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, 0x8E, 0xFD, /* 0x80-0x83 */ + 0x8E, 0xFE, 0x8F, 0x41, 0x8F, 0x42, 0x8F, 0x43, /* 0x84-0x87 */ + 
0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0x88-0x8B */ + 0x8F, 0x48, 0xB7, 0xD8, 0x8F, 0x49, 0x8F, 0x4A, /* 0x8C-0x8F */ + 0x8F, 0x4B, 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, /* 0x90-0x93 */ + 0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0x94-0x97 */ + 0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0x98-0x9B */ + 0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0x9C-0x9F */ + 0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xA0-0xA3 */ + 0x8F, 0x65, 0x8F, 0x66, 0x8F, 0x67, 0x8F, 0x68, /* 0xA4-0xA7 */ + 0xB7, 0xD9, 0x8F, 0x69, 0x8F, 0x6A, 0x8F, 0x6B, /* 0xA8-0xAB */ + 0x8F, 0x6C, 0x8F, 0x6D, 0x8F, 0x6E, 0x8F, 0x6F, /* 0xAC-0xAF */ + 0xB7, 0xDA, 0x8F, 0x70, 0x8F, 0x71, 0x8F, 0x72, /* 0xB0-0xB3 */ + 0xB7, 0xDB, 0x8F, 0x73, 0x8F, 0x74, 0x8F, 0x75, /* 0xB4-0xB7 */ + 0xB7, 0xDC, 0x8F, 0x76, 0x8F, 0x77, 0x8F, 0x78, /* 0xB8-0xBB */ + 0x8F, 0x79, 0x8F, 0x7A, 0x8F, 0x81, 0x8F, 0x82, /* 0xBC-0xBF */ + 0xB7, 0xDD, 0xB7, 0xDE, 0x8F, 0x83, 0xB7, 0xDF, /* 0xC0-0xC3 */ + 0x8F, 0x84, 0xB7, 0xE0, 0x8F, 0x85, 0x8F, 0x86, /* 0xC4-0xC7 */ + 0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0xC8-0xCB */ + 0xB7, 0xE1, 0x8F, 0x8B, 0x8F, 0x8C, 0x8F, 0x8D, /* 0xCC-0xCF */ + 0xB7, 0xE2, 0x8F, 0x8E, 0x8F, 0x8F, 0x8F, 0x90, /* 0xD0-0xD3 */ + 0xB7, 0xE3, 0x8F, 0x91, 0x8F, 0x92, 0x8F, 0x93, /* 0xD4-0xD7 */ + 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, 0x8F, 0x97, /* 0xD8-0xDB */ + 0x8F, 0x98, 0xB7, 0xE4, 0x8F, 0x99, 0xB7, 0xE5, /* 0xDC-0xDF */ + 0x8F, 0x9A, 0xB7, 0xE6, 0x8F, 0x9B, 0x8F, 0x9C, /* 0xE0-0xE3 */ + 0x8F, 0x9D, 0x8F, 0x9E, 0x8F, 0x9F, 0x8F, 0xA0, /* 0xE4-0xE7 */ + 0xB7, 0xE7, 0xB7, 0xE8, 0x8F, 0xA1, 0x8F, 0xA2, /* 0xE8-0xEB */ + 0xB7, 0xE9, 0x8F, 0xA3, 0x8F, 0xA4, 0x8F, 0xA5, /* 0xEC-0xEF */ + 0xB7, 0xEA, 0x8F, 0xA6, 0x8F, 0xA7, 0x8F, 0xA8, /* 0xF0-0xF3 */ + 0x8F, 0xA9, 0x8F, 0xAA, 0x8F, 0xAB, 0x8F, 0xAC, /* 0xF4-0xF7 */ + 0xB7, 0xEB, 0xB7, 0xEC, 0x8F, 0xAD, 0xB7, 0xED, /* 0xF8-0xFB */ + 0x8F, 0xAE, 0xB7, 0xEE, 0x8F, 0xAF, 0x8F, 0xB0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B9[512] = { + 0x8F, 0xB1, 0x8F, 0xB2, 0x8F, 0xB3, 0x8F, 0xB4, /* 0x00-0x03 */ + 0xB7, 0xEF, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x04-0x07 */ + 0x8F, 0xB8, 0x8F, 0xB9, 0x8F, 0xBA, 0x8F, 0xBB, /* 0x08-0x0B */ + 0x8F, 0xBC, 0x8F, 0xBD, 0x8F, 0xBE, 0x8F, 0xBF, /* 0x0C-0x0F */ + 0x8F, 0xC0, 0x8F, 0xC1, 0x8F, 0xC2, 0x8F, 0xC3, /* 0x10-0x13 */ + 0x8F, 0xC4, 0x8F, 0xC5, 0x8F, 0xC6, 0x8F, 0xC7, /* 0x14-0x17 */ + 0xB7, 0xF0, 0x8F, 0xC8, 0x8F, 0xC9, 0x8F, 0xCA, /* 0x18-0x1B */ + 0x8F, 0xCB, 0x8F, 0xCC, 0x8F, 0xCD, 0x8F, 0xCE, /* 0x1C-0x1F */ + 0xB7, 0xF1, 0x8F, 0xCF, 0x8F, 0xD0, 0x8F, 0xD1, /* 0x20-0x23 */ + 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, 0x8F, 0xD5, /* 0x24-0x27 */ + 0x8F, 0xD6, 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, /* 0x28-0x2B */ + 0x8F, 0xDA, 0x8F, 0xDB, 0x8F, 0xDC, 0x8F, 0xDD, /* 0x2C-0x2F */ + 0x8F, 0xDE, 0x8F, 0xDF, 0x8F, 0xE0, 0x8F, 0xE1, /* 0x30-0x33 */ + 0x8F, 0xE2, 0x8F, 0xE3, 0x8F, 0xE4, 0x8F, 0xE5, /* 0x34-0x37 */ + 0x8F, 0xE6, 0x8F, 0xE7, 0x8F, 0xE8, 0x8F, 0xE9, /* 0x38-0x3B */ + 0xB7, 0xF2, 0xB7, 0xF3, 0x8F, 0xEA, 0x8F, 0xEB, /* 0x3C-0x3F */ + 0xB7, 0xF4, 0x8F, 0xEC, 0x8F, 0xED, 0x8F, 0xEE, /* 0x40-0x43 */ + 0xB7, 0xF5, 0x8F, 0xEF, 0x8F, 0xF0, 0x8F, 0xF1, /* 0x44-0x47 */ + 0x8F, 0xF2, 0x8F, 0xF3, 0x8F, 0xF4, 0x8F, 0xF5, /* 0x48-0x4B */ + 0xB7, 0xF6, 0x8F, 0xF6, 0x8F, 0xF7, 0xB7, 0xF7, /* 0x4C-0x4F */ + 0x8F, 0xF8, 0xB7, 0xF8, 0x8F, 0xF9, 0x8F, 0xFA, /* 0x50-0x53 */ + 0x8F, 0xFB, 0x8F, 0xFC, 0x8F, 0xFD, 0x8F, 0xFE, /* 0x54-0x57 */ + 0xB7, 0xF9, 0xB7, 0xFA, 0x90, 0x41, 0x90, 0x42, /* 0x58-0x5B */ + 0xB7, 
0xFB, 0x90, 0x43, 0x90, 0x44, 0x90, 0x45, /* 0x5C-0x5F */ + 0xB7, 0xFC, 0x90, 0x46, 0x90, 0x47, 0x90, 0x48, /* 0x60-0x63 */ + 0x90, 0x49, 0x90, 0x4A, 0x90, 0x4B, 0x90, 0x4C, /* 0x64-0x67 */ + 0xB7, 0xFD, 0xB7, 0xFE, 0x90, 0x4D, 0xB8, 0xA1, /* 0x68-0x6B */ + 0x90, 0x4E, 0xB8, 0xA2, 0x90, 0x4F, 0x90, 0x50, /* 0x6C-0x6F */ + 0x90, 0x51, 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, /* 0x70-0x73 */ + 0xB8, 0xA3, 0xB8, 0xA4, 0x90, 0x55, 0x90, 0x56, /* 0x74-0x77 */ + 0xB8, 0xA5, 0x90, 0x57, 0x90, 0x58, 0x90, 0x59, /* 0x78-0x7B */ + 0xB8, 0xA6, 0x90, 0x5A, 0x90, 0x61, 0x90, 0x62, /* 0x7C-0x7F */ + + 0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0x90, 0x66, /* 0x80-0x83 */ + 0xB8, 0xA7, 0xB8, 0xA8, 0x90, 0x67, 0xB8, 0xA9, /* 0x84-0x87 */ + 0x90, 0x68, 0xB8, 0xAA, 0xB8, 0xAB, 0x90, 0x69, /* 0x88-0x8B */ + 0x90, 0x6A, 0xB8, 0xAC, 0xB8, 0xAD, 0x90, 0x6B, /* 0x8C-0x8F */ + 0x90, 0x6C, 0x90, 0x6D, 0x90, 0x6E, 0x90, 0x6F, /* 0x90-0x93 */ + 0x90, 0x70, 0x90, 0x71, 0x90, 0x72, 0x90, 0x73, /* 0x94-0x97 */ + 0x90, 0x74, 0x90, 0x75, 0x90, 0x76, 0x90, 0x77, /* 0x98-0x9B */ + 0x90, 0x78, 0x90, 0x79, 0x90, 0x7A, 0x90, 0x81, /* 0x9C-0x9F */ + 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, 0x90, 0x85, /* 0xA0-0xA3 */ + 0x90, 0x86, 0x90, 0x87, 0x90, 0x88, 0x90, 0x89, /* 0xA4-0xA7 */ + 0x90, 0x8A, 0x90, 0x8B, 0x90, 0x8C, 0x90, 0x8D, /* 0xA8-0xAB */ + 0xB8, 0xAE, 0xB8, 0xAF, 0x90, 0x8E, 0x90, 0x8F, /* 0xAC-0xAF */ + 0xB8, 0xB0, 0x90, 0x90, 0x90, 0x91, 0x90, 0x92, /* 0xB0-0xB3 */ + 0xB8, 0xB1, 0x90, 0x93, 0x90, 0x94, 0x90, 0x95, /* 0xB4-0xB7 */ + 0x90, 0x96, 0x90, 0x97, 0x90, 0x98, 0x90, 0x99, /* 0xB8-0xBB */ + 0xB8, 0xB2, 0xB8, 0xB3, 0x90, 0x9A, 0xB8, 0xB4, /* 0xBC-0xBF */ + 0x90, 0x9B, 0xB8, 0xB5, 0x90, 0x9C, 0x90, 0x9D, /* 0xC0-0xC3 */ + 0x90, 0x9E, 0x90, 0x9F, 0x90, 0xA0, 0x90, 0xA1, /* 0xC4-0xC7 */ + 0xB8, 0xB6, 0xB8, 0xB7, 0x90, 0xA2, 0x90, 0xA3, /* 0xC8-0xCB */ + 0xB8, 0xB8, 0x90, 0xA4, 0xB8, 0xB9, 0xB8, 0xBA, /* 0xCC-0xCF */ + 0xB8, 0xBB, 0xB8, 0xBC, 0xB8, 0xBD, 0x90, 0xA5, /* 0xD0-0xD3 */ + 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, 0x90, 0xA9, /* 0xD4-0xD7 */ + 0xB8, 0xBE, 0xB8, 0xBF, 0x90, 0xAA, 0xB8, 0xC0, /* 0xD8-0xDB */ + 0x90, 0xAB, 0xB8, 0xC1, 0xB8, 0xC2, 0x90, 0xAC, /* 0xDC-0xDF */ + 0x90, 0xAD, 0xB8, 0xC3, 0x90, 0xAE, 0xB8, 0xC4, /* 0xE0-0xE3 */ + 0xB8, 0xC5, 0xB8, 0xC6, 0x90, 0xAF, 0x90, 0xB0, /* 0xE4-0xE7 */ + 0xB8, 0xC7, 0x90, 0xB1, 0x90, 0xB2, 0x90, 0xB3, /* 0xE8-0xEB */ + 0xB8, 0xC8, 0x90, 0xB4, 0x90, 0xB5, 0x90, 0xB6, /* 0xEC-0xEF */ + 0x90, 0xB7, 0x90, 0xB8, 0x90, 0xB9, 0x90, 0xBA, /* 0xF0-0xF3 */ + 0xB8, 0xC9, 0xB8, 0xCA, 0x90, 0xBB, 0xB8, 0xCB, /* 0xF4-0xF7 */ + 0xB8, 0xCC, 0xB8, 0xCD, 0xB8, 0xCE, 0x90, 0xBC, /* 0xF8-0xFB */ + 0x90, 0xBD, 0x90, 0xBE, 0x90, 0xBF, 0x90, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BA[512] = { + 0xB8, 0xCF, 0xB8, 0xD0, 0x90, 0xC1, 0x90, 0xC2, /* 0x00-0x03 */ + 0x90, 0xC3, 0x90, 0xC4, 0x90, 0xC5, 0x90, 0xC6, /* 0x04-0x07 */ + 0xB8, 0xD1, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0x08-0x0B */ + 0x90, 0xCA, 0x90, 0xCB, 0x90, 0xCC, 0x90, 0xCD, /* 0x0C-0x0F */ + 0x90, 0xCE, 0x90, 0xCF, 0x90, 0xD0, 0x90, 0xD1, /* 0x10-0x13 */ + 0x90, 0xD2, 0xB8, 0xD2, 0x90, 0xD3, 0x90, 0xD4, /* 0x14-0x17 */ + 0x90, 0xD5, 0x90, 0xD6, 0x90, 0xD7, 0x90, 0xD8, /* 0x18-0x1B */ + 0x90, 0xD9, 0x90, 0xDA, 0x90, 0xDB, 0x90, 0xDC, /* 0x1C-0x1F */ + 0x90, 0xDD, 0x90, 0xDE, 0x90, 0xDF, 0x90, 0xE0, /* 0x20-0x23 */ + 0x90, 0xE1, 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, /* 0x24-0x27 */ + 0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x28-0x2B */ + 0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x2C-0x2F */ + 0x90, 
0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x30-0x33 */ + 0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x34-0x37 */ + 0xB8, 0xD3, 0xB8, 0xD4, 0x90, 0xF5, 0x90, 0xF6, /* 0x38-0x3B */ + 0xB8, 0xD5, 0x90, 0xF7, 0x90, 0xF8, 0x90, 0xF9, /* 0x3C-0x3F */ + 0xB8, 0xD6, 0x90, 0xFA, 0xB8, 0xD7, 0x90, 0xFB, /* 0x40-0x43 */ + 0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x41, /* 0x44-0x47 */ + 0xB8, 0xD8, 0xB8, 0xD9, 0x91, 0x42, 0xB8, 0xDA, /* 0x48-0x4B */ + 0x91, 0x43, 0xB8, 0xDB, 0xB8, 0xDC, 0x91, 0x44, /* 0x4C-0x4F */ + 0x91, 0x45, 0x91, 0x46, 0x91, 0x47, 0xB8, 0xDD, /* 0x50-0x53 */ + 0xB8, 0xDE, 0xB8, 0xDF, 0x91, 0x48, 0x91, 0x49, /* 0x54-0x57 */ + 0xB8, 0xE0, 0x91, 0x4A, 0x91, 0x4B, 0x91, 0x4C, /* 0x58-0x5B */ + 0xB8, 0xE1, 0x91, 0x4D, 0x91, 0x4E, 0x91, 0x4F, /* 0x5C-0x5F */ + 0x91, 0x50, 0x91, 0x51, 0x91, 0x52, 0x91, 0x53, /* 0x60-0x63 */ + 0xB8, 0xE2, 0xB8, 0xE3, 0x91, 0x54, 0xB8, 0xE4, /* 0x64-0x67 */ + 0xB8, 0xE5, 0xB8, 0xE6, 0x91, 0x55, 0x91, 0x56, /* 0x68-0x6B */ + 0x91, 0x57, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x6C-0x6F */ + 0xB8, 0xE7, 0xB8, 0xE8, 0x91, 0x61, 0x91, 0x62, /* 0x70-0x73 */ + 0xB8, 0xE9, 0x91, 0x63, 0x91, 0x64, 0x91, 0x65, /* 0x74-0x77 */ + 0xB8, 0xEA, 0x91, 0x66, 0x91, 0x67, 0x91, 0x68, /* 0x78-0x7B */ + 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, 0x91, 0x6C, /* 0x7C-0x7F */ + + 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, 0xB8, 0xEB, /* 0x80-0x83 */ + 0xB8, 0xEC, 0xB8, 0xED, 0x91, 0x70, 0xB8, 0xEE, /* 0x84-0x87 */ + 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, 0x91, 0x74, /* 0x88-0x8B */ + 0xB8, 0xEF, 0x91, 0x75, 0x91, 0x76, 0x91, 0x77, /* 0x8C-0x8F */ + 0x91, 0x78, 0x91, 0x79, 0x91, 0x7A, 0x91, 0x81, /* 0x90-0x93 */ + 0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x94-0x97 */ + 0x91, 0x86, 0x91, 0x87, 0x91, 0x88, 0x91, 0x89, /* 0x98-0x9B */ + 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, 0x91, 0x8D, /* 0x9C-0x9F */ + 0x91, 0x8E, 0x91, 0x8F, 0x91, 0x90, 0x91, 0x91, /* 0xA0-0xA3 */ + 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, 0x91, 0x95, /* 0xA4-0xA7 */ + 0xB8, 0xF0, 0xB8, 0xF1, 0x91, 0x96, 0xB8, 0xF2, /* 0xA8-0xAB */ + 0xB8, 0xF3, 0x91, 0x97, 0x91, 0x98, 0x91, 0x99, /* 0xAC-0xAF */ + 0xB8, 0xF4, 0x91, 0x9A, 0xB8, 0xF5, 0x91, 0x9B, /* 0xB0-0xB3 */ + 0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB4-0xB7 */ + 0xB8, 0xF6, 0xB8, 0xF7, 0x91, 0xA0, 0xB8, 0xF8, /* 0xB8-0xBB */ + 0x91, 0xA1, 0xB8, 0xF9, 0x91, 0xA2, 0x91, 0xA3, /* 0xBC-0xBF */ + 0x91, 0xA4, 0x91, 0xA5, 0x91, 0xA6, 0x91, 0xA7, /* 0xC0-0xC3 */ + 0xB8, 0xFA, 0x91, 0xA8, 0x91, 0xA9, 0x91, 0xAA, /* 0xC4-0xC7 */ + 0xB8, 0xFB, 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, /* 0xC8-0xCB */ + 0x91, 0xAE, 0x91, 0xAF, 0x91, 0xB0, 0x91, 0xB1, /* 0xCC-0xCF */ + 0x91, 0xB2, 0x91, 0xB3, 0x91, 0xB4, 0x91, 0xB5, /* 0xD0-0xD3 */ + 0x91, 0xB6, 0x91, 0xB7, 0x91, 0xB8, 0x91, 0xB9, /* 0xD4-0xD7 */ + 0xB8, 0xFC, 0xB8, 0xFD, 0x91, 0xBA, 0x91, 0xBB, /* 0xD8-0xDB */ + 0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xDC-0xDF */ + 0x91, 0xC0, 0x91, 0xC1, 0x91, 0xC2, 0x91, 0xC3, /* 0xE0-0xE3 */ + 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, 0x91, 0xC7, /* 0xE4-0xE7 */ + 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, 0x91, 0xCB, /* 0xE8-0xEB */ + 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, 0x91, 0xCF, /* 0xEC-0xEF */ + 0x91, 0xD0, 0x91, 0xD1, 0x91, 0xD2, 0x91, 0xD3, /* 0xF0-0xF3 */ + 0x91, 0xD4, 0x91, 0xD5, 0x91, 0xD6, 0x91, 0xD7, /* 0xF4-0xF7 */ + 0x91, 0xD8, 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, /* 0xF8-0xFB */ + 0xB8, 0xFE, 0x91, 0xDC, 0x91, 0xDD, 0x91, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BB[512] = { + 0xB9, 0xA1, 0x91, 0xDF, 0x91, 0xE0, 0x91, 0xE1, /* 0x00-0x03 */ + 0xB9, 
0xA2, 0x91, 0xE2, 0x91, 0xE3, 0x91, 0xE4, /* 0x04-0x07 */ + 0x91, 0xE5, 0x91, 0xE6, 0x91, 0xE7, 0x91, 0xE8, /* 0x08-0x0B */ + 0x91, 0xE9, 0xB9, 0xA3, 0x91, 0xEA, 0xB9, 0xA4, /* 0x0C-0x0F */ + 0x91, 0xEB, 0xB9, 0xA5, 0x91, 0xEC, 0x91, 0xED, /* 0x10-0x13 */ + 0x91, 0xEE, 0x91, 0xEF, 0x91, 0xF0, 0x91, 0xF1, /* 0x14-0x17 */ + 0xB9, 0xA6, 0x91, 0xF2, 0x91, 0xF3, 0x91, 0xF4, /* 0x18-0x1B */ + 0xB9, 0xA7, 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, /* 0x1C-0x1F */ + 0xB9, 0xA8, 0x91, 0xF8, 0x91, 0xF9, 0x91, 0xFA, /* 0x20-0x23 */ + 0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0x91, 0xFE, /* 0x24-0x27 */ + 0x92, 0x41, 0xB9, 0xA9, 0x92, 0x42, 0xB9, 0xAA, /* 0x28-0x2B */ + 0x92, 0x43, 0x92, 0x44, 0x92, 0x45, 0x92, 0x46, /* 0x2C-0x2F */ + 0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x30-0x33 */ + 0xB9, 0xAB, 0xB9, 0xAC, 0xB9, 0xAD, 0x92, 0x4B, /* 0x34-0x37 */ + 0xB9, 0xAE, 0x92, 0x4C, 0x92, 0x4D, 0xB9, 0xAF, /* 0x38-0x3B */ + 0xB9, 0xB0, 0xB9, 0xB1, 0xB9, 0xB2, 0x92, 0x4E, /* 0x3C-0x3F */ + 0x92, 0x4F, 0x92, 0x50, 0x92, 0x51, 0x92, 0x52, /* 0x40-0x43 */ + 0xB9, 0xB3, 0xB9, 0xB4, 0x92, 0x53, 0xB9, 0xB5, /* 0x44-0x47 */ + 0x92, 0x54, 0xB9, 0xB6, 0x92, 0x55, 0x92, 0x56, /* 0x48-0x4B */ + 0x92, 0x57, 0xB9, 0xB7, 0x92, 0x58, 0xB9, 0xB8, /* 0x4C-0x4F */ + 0xB9, 0xB9, 0x92, 0x59, 0x92, 0x5A, 0x92, 0x61, /* 0x50-0x53 */ + 0xB9, 0xBA, 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, /* 0x54-0x57 */ + 0xB9, 0xBB, 0x92, 0x65, 0x92, 0x66, 0x92, 0x67, /* 0x58-0x5B */ + 0x92, 0x68, 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, /* 0x5C-0x5F */ + 0x92, 0x6C, 0xB9, 0xBC, 0x92, 0x6D, 0xB9, 0xBD, /* 0x60-0x63 */ + 0x92, 0x6E, 0x92, 0x6F, 0x92, 0x70, 0x92, 0x71, /* 0x64-0x67 */ + 0x92, 0x72, 0x92, 0x73, 0x92, 0x74, 0x92, 0x75, /* 0x68-0x6B */ + 0xB9, 0xBE, 0x92, 0x76, 0x92, 0x77, 0x92, 0x78, /* 0x6C-0x6F */ + 0x92, 0x79, 0x92, 0x7A, 0x92, 0x81, 0x92, 0x82, /* 0x70-0x73 */ + 0x92, 0x83, 0x92, 0x84, 0x92, 0x85, 0x92, 0x86, /* 0x74-0x77 */ + 0x92, 0x87, 0x92, 0x88, 0x92, 0x89, 0x92, 0x8A, /* 0x78-0x7B */ + 0x92, 0x8B, 0x92, 0x8C, 0x92, 0x8D, 0x92, 0x8E, /* 0x7C-0x7F */ + + 0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0x80-0x83 */ + 0x92, 0x93, 0x92, 0x94, 0x92, 0x95, 0x92, 0x96, /* 0x84-0x87 */ + 0xB9, 0xBF, 0x92, 0x97, 0x92, 0x98, 0x92, 0x99, /* 0x88-0x8B */ + 0xB9, 0xC0, 0x92, 0x9A, 0x92, 0x9B, 0x92, 0x9C, /* 0x8C-0x8F */ + 0xB9, 0xC1, 0x92, 0x9D, 0x92, 0x9E, 0x92, 0x9F, /* 0x90-0x93 */ + 0x92, 0xA0, 0x92, 0xA1, 0x92, 0xA2, 0x92, 0xA3, /* 0x94-0x97 */ + 0x92, 0xA4, 0x92, 0xA5, 0x92, 0xA6, 0x92, 0xA7, /* 0x98-0x9B */ + 0x92, 0xA8, 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, /* 0x9C-0x9F */ + 0x92, 0xAC, 0x92, 0xAD, 0x92, 0xAE, 0x92, 0xAF, /* 0xA0-0xA3 */ + 0xB9, 0xC2, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0xA4-0xA7 */ + 0xB9, 0xC3, 0x92, 0xB3, 0x92, 0xB4, 0x92, 0xB5, /* 0xA8-0xAB */ + 0xB9, 0xC4, 0x92, 0xB6, 0x92, 0xB7, 0x92, 0xB8, /* 0xAC-0xAF */ + 0x92, 0xB9, 0x92, 0xBA, 0x92, 0xBB, 0x92, 0xBC, /* 0xB0-0xB3 */ + 0xB9, 0xC5, 0x92, 0xBD, 0x92, 0xBE, 0xB9, 0xC6, /* 0xB4-0xB7 */ + 0x92, 0xBF, 0x92, 0xC0, 0x92, 0xC1, 0x92, 0xC2, /* 0xB8-0xBB */ + 0x92, 0xC3, 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, /* 0xBC-0xBF */ + 0xB9, 0xC7, 0x92, 0xC7, 0x92, 0xC8, 0x92, 0xC9, /* 0xC0-0xC3 */ + 0xB9, 0xC8, 0x92, 0xCA, 0x92, 0xCB, 0x92, 0xCC, /* 0xC4-0xC7 */ + 0xB9, 0xC9, 0x92, 0xCD, 0x92, 0xCE, 0x92, 0xCF, /* 0xC8-0xCB */ + 0x92, 0xD0, 0x92, 0xD1, 0x92, 0xD2, 0x92, 0xD3, /* 0xCC-0xCF */ + 0xB9, 0xCA, 0x92, 0xD4, 0x92, 0xD5, 0xB9, 0xCB, /* 0xD0-0xD3 */ + 0x92, 0xD6, 0x92, 0xD7, 0x92, 0xD8, 0x92, 0xD9, /* 0xD4-0xD7 */ + 0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0xD8-0xDB 
*/ + 0x92, 0xDE, 0x92, 0xDF, 0x92, 0xE0, 0x92, 0xE1, /* 0xDC-0xDF */ + 0x92, 0xE2, 0x92, 0xE3, 0x92, 0xE4, 0x92, 0xE5, /* 0xE0-0xE3 */ + 0x92, 0xE6, 0x92, 0xE7, 0x92, 0xE8, 0x92, 0xE9, /* 0xE4-0xE7 */ + 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, 0x92, 0xED, /* 0xE8-0xEB */ + 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, 0x92, 0xF1, /* 0xEC-0xEF */ + 0x92, 0xF2, 0x92, 0xF3, 0x92, 0xF4, 0x92, 0xF5, /* 0xF0-0xF3 */ + 0x92, 0xF6, 0x92, 0xF7, 0x92, 0xF8, 0x92, 0xF9, /* 0xF4-0xF7 */ + 0xB9, 0xCC, 0xB9, 0xCD, 0x92, 0xFA, 0x92, 0xFB, /* 0xF8-0xFB */ + 0xB9, 0xCE, 0x92, 0xFC, 0x92, 0xFD, 0xB9, 0xCF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BC[512] = { + 0xB9, 0xD0, 0x92, 0xFE, 0xB9, 0xD1, 0x93, 0x41, /* 0x00-0x03 */ + 0x93, 0x42, 0x93, 0x43, 0x93, 0x44, 0x93, 0x45, /* 0x04-0x07 */ + 0xB9, 0xD2, 0xB9, 0xD3, 0x93, 0x46, 0xB9, 0xD4, /* 0x08-0x0B */ + 0xB9, 0xD5, 0xB9, 0xD6, 0x93, 0x47, 0xB9, 0xD7, /* 0x0C-0x0F */ + 0x93, 0x48, 0xB9, 0xD8, 0x93, 0x49, 0x93, 0x4A, /* 0x10-0x13 */ + 0xB9, 0xD9, 0xB9, 0xDA, 0xB9, 0xDB, 0xB9, 0xDC, /* 0x14-0x17 */ + 0xB9, 0xDD, 0x93, 0x4B, 0x93, 0x4C, 0xB9, 0xDE, /* 0x18-0x1B */ + 0xB9, 0xDF, 0xB9, 0xE0, 0xB9, 0xE1, 0xB9, 0xE2, /* 0x1C-0x1F */ + 0x93, 0x4D, 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, /* 0x20-0x23 */ + 0xB9, 0xE3, 0xB9, 0xE4, 0x93, 0x51, 0xB9, 0xE5, /* 0x24-0x27 */ + 0x93, 0x52, 0xB9, 0xE6, 0x93, 0x53, 0x93, 0x54, /* 0x28-0x2B */ + 0x93, 0x55, 0xB9, 0xE7, 0x93, 0x56, 0x93, 0x57, /* 0x2C-0x2F */ + 0xB9, 0xE8, 0xB9, 0xE9, 0x93, 0x58, 0x93, 0x59, /* 0x30-0x33 */ + 0xB9, 0xEA, 0x93, 0x5A, 0x93, 0x61, 0x93, 0x62, /* 0x34-0x37 */ + 0xB9, 0xEB, 0x93, 0x63, 0x93, 0x64, 0x93, 0x65, /* 0x38-0x3B */ + 0x93, 0x66, 0x93, 0x67, 0x93, 0x68, 0x93, 0x69, /* 0x3C-0x3F */ + 0xB9, 0xEC, 0xB9, 0xED, 0x93, 0x6A, 0xB9, 0xEE, /* 0x40-0x43 */ + 0xB9, 0xEF, 0xB9, 0xF0, 0x93, 0x6B, 0x93, 0x6C, /* 0x44-0x47 */ + 0x93, 0x6D, 0xB9, 0xF1, 0x93, 0x6E, 0x93, 0x6F, /* 0x48-0x4B */ + 0xB9, 0xF2, 0xB9, 0xF3, 0x93, 0x70, 0x93, 0x71, /* 0x4C-0x4F */ + 0xB9, 0xF4, 0x93, 0x72, 0x93, 0x73, 0x93, 0x74, /* 0x50-0x53 */ + 0x93, 0x75, 0x93, 0x76, 0x93, 0x77, 0x93, 0x78, /* 0x54-0x57 */ + 0x93, 0x79, 0x93, 0x7A, 0x93, 0x81, 0x93, 0x82, /* 0x58-0x5B */ + 0x93, 0x83, 0xB9, 0xF5, 0x93, 0x84, 0x93, 0x85, /* 0x5C-0x5F */ + 0x93, 0x86, 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, /* 0x60-0x63 */ + 0x93, 0x8A, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x64-0x67 */ + 0x93, 0x8E, 0x93, 0x8F, 0x93, 0x90, 0x93, 0x91, /* 0x68-0x6B */ + 0x93, 0x92, 0x93, 0x93, 0x93, 0x94, 0x93, 0x95, /* 0x6C-0x6F */ + 0x93, 0x96, 0x93, 0x97, 0x93, 0x98, 0x93, 0x99, /* 0x70-0x73 */ + 0x93, 0x9A, 0x93, 0x9B, 0x93, 0x9C, 0x93, 0x9D, /* 0x74-0x77 */ + 0x93, 0x9E, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x78-0x7B */ + 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, 0x93, 0xA5, /* 0x7C-0x7F */ + + 0x93, 0xA6, 0x93, 0xA7, 0x93, 0xA8, 0x93, 0xA9, /* 0x80-0x83 */ + 0xB9, 0xF6, 0xB9, 0xF7, 0x93, 0xAA, 0x93, 0xAB, /* 0x84-0x87 */ + 0xB9, 0xF8, 0x93, 0xAC, 0x93, 0xAD, 0xB9, 0xF9, /* 0x88-0x8B */ + 0xB9, 0xFA, 0x93, 0xAE, 0xB9, 0xFB, 0x93, 0xAF, /* 0x8C-0x8F */ + 0x93, 0xB0, 0x93, 0xB1, 0x93, 0xB2, 0x93, 0xB3, /* 0x90-0x93 */ + 0xB9, 0xFC, 0xB9, 0xFD, 0x93, 0xB4, 0xB9, 0xFE, /* 0x94-0x97 */ + 0x93, 0xB5, 0xBA, 0xA1, 0xBA, 0xA2, 0x93, 0xB6, /* 0x98-0x9B */ + 0x93, 0xB7, 0x93, 0xB8, 0x93, 0xB9, 0x93, 0xBA, /* 0x9C-0x9F */ + 0xBA, 0xA3, 0xBA, 0xA4, 0x93, 0xBB, 0x93, 0xBC, /* 0xA0-0xA3 */ + 0xBA, 0xA5, 0x93, 0xBD, 0x93, 0xBE, 0xBA, 0xA6, /* 0xA4-0xA7 */ + 0xBA, 0xA7, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0xA8-0xAB */ + 0x93, 0xC2, 0x93, 0xC3, 0x93, 0xC4, 0x93, 0xC5, /* 0xAC-0xAF */ + 
0xBA, 0xA8, 0xBA, 0xA9, 0x93, 0xC6, 0xBA, 0xAA, /* 0xB0-0xB3 */ + 0xBA, 0xAB, 0xBA, 0xAC, 0x93, 0xC7, 0x93, 0xC8, /* 0xB4-0xB7 */ + 0x93, 0xC9, 0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, /* 0xB8-0xBB */ + 0xBA, 0xAD, 0xBA, 0xAE, 0x93, 0xCD, 0x93, 0xCE, /* 0xBC-0xBF */ + 0xBA, 0xAF, 0x93, 0xCF, 0x93, 0xD0, 0x93, 0xD1, /* 0xC0-0xC3 */ + 0xBA, 0xB0, 0x93, 0xD2, 0x93, 0xD3, 0x93, 0xD4, /* 0xC4-0xC7 */ + 0x93, 0xD5, 0x93, 0xD6, 0x93, 0xD7, 0x93, 0xD8, /* 0xC8-0xCB */ + 0x93, 0xD9, 0xBA, 0xB1, 0x93, 0xDA, 0xBA, 0xB2, /* 0xCC-0xCF */ + 0xBA, 0xB3, 0xBA, 0xB4, 0x93, 0xDB, 0x93, 0xDC, /* 0xD0-0xD3 */ + 0x93, 0xDD, 0xBA, 0xB5, 0x93, 0xDE, 0x93, 0xDF, /* 0xD4-0xD7 */ + 0xBA, 0xB6, 0x93, 0xE0, 0x93, 0xE1, 0x93, 0xE2, /* 0xD8-0xDB */ + 0xBA, 0xB7, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xDC-0xDF */ + 0x93, 0xE6, 0x93, 0xE7, 0x93, 0xE8, 0x93, 0xE9, /* 0xE0-0xE3 */ + 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, 0x93, 0xED, /* 0xE4-0xE7 */ + 0x93, 0xEE, 0x93, 0xEF, 0x93, 0xF0, 0x93, 0xF1, /* 0xE8-0xEB */ + 0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xEC-0xEF */ + 0x93, 0xF6, 0x93, 0xF7, 0x93, 0xF8, 0x93, 0xF9, /* 0xF0-0xF3 */ + 0xBA, 0xB8, 0xBA, 0xB9, 0xBA, 0xBA, 0x93, 0xFA, /* 0xF4-0xF7 */ + 0xBA, 0xBB, 0x93, 0xFB, 0x93, 0xFC, 0x93, 0xFD, /* 0xF8-0xFB */ + 0xBA, 0xBC, 0x93, 0xFE, 0x94, 0x41, 0x94, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BD[512] = { + 0x94, 0x43, 0x94, 0x44, 0x94, 0x45, 0x94, 0x46, /* 0x00-0x03 */ + 0xBA, 0xBD, 0xBA, 0xBE, 0x94, 0x47, 0xBA, 0xBF, /* 0x04-0x07 */ + 0x94, 0x48, 0xBA, 0xC0, 0x94, 0x49, 0x94, 0x4A, /* 0x08-0x0B */ + 0x94, 0x4B, 0x94, 0x4C, 0x94, 0x4D, 0x94, 0x4E, /* 0x0C-0x0F */ + 0xBA, 0xC1, 0x94, 0x4F, 0x94, 0x50, 0x94, 0x51, /* 0x10-0x13 */ + 0xBA, 0xC2, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0x14-0x17 */ + 0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0x18-0x1B */ + 0x94, 0x59, 0x94, 0x5A, 0x94, 0x61, 0x94, 0x62, /* 0x1C-0x1F */ + 0x94, 0x63, 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, /* 0x20-0x23 */ + 0xBA, 0xC3, 0x94, 0x67, 0x94, 0x68, 0x94, 0x69, /* 0x24-0x27 */ + 0x94, 0x6A, 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, /* 0x28-0x2B */ + 0xBA, 0xC4, 0x94, 0x6E, 0x94, 0x6F, 0x94, 0x70, /* 0x2C-0x2F */ + 0x94, 0x71, 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, /* 0x30-0x33 */ + 0x94, 0x75, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x34-0x37 */ + 0x94, 0x79, 0x94, 0x7A, 0x94, 0x81, 0x94, 0x82, /* 0x38-0x3B */ + 0x94, 0x83, 0x94, 0x84, 0x94, 0x85, 0x94, 0x86, /* 0x3C-0x3F */ + 0xBA, 0xC5, 0x94, 0x87, 0x94, 0x88, 0x94, 0x89, /* 0x40-0x43 */ + 0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x44-0x47 */ + 0xBA, 0xC6, 0xBA, 0xC7, 0x94, 0x8E, 0x94, 0x8F, /* 0x48-0x4B */ + 0xBA, 0xC8, 0x94, 0x90, 0x94, 0x91, 0x94, 0x92, /* 0x4C-0x4F */ + 0xBA, 0xC9, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x50-0x53 */ + 0x94, 0x96, 0x94, 0x97, 0x94, 0x98, 0x94, 0x99, /* 0x54-0x57 */ + 0xBA, 0xCA, 0xBA, 0xCB, 0x94, 0x9A, 0x94, 0x9B, /* 0x58-0x5B */ + 0x94, 0x9C, 0x94, 0x9D, 0x94, 0x9E, 0x94, 0x9F, /* 0x5C-0x5F */ + 0x94, 0xA0, 0x94, 0xA1, 0x94, 0xA2, 0x94, 0xA3, /* 0x60-0x63 */ + 0xBA, 0xCC, 0x94, 0xA4, 0x94, 0xA5, 0x94, 0xA6, /* 0x64-0x67 */ + 0xBA, 0xCD, 0x94, 0xA7, 0x94, 0xA8, 0x94, 0xA9, /* 0x68-0x6B */ + 0x94, 0xAA, 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, /* 0x6C-0x6F */ + 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, 0x94, 0xB1, /* 0x70-0x73 */ + 0x94, 0xB2, 0x94, 0xB3, 0x94, 0xB4, 0x94, 0xB5, /* 0x74-0x77 */ + 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, 0x94, 0xB9, /* 0x78-0x7B */ + 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, 0x94, 0xBD, /* 0x7C-0x7F */ + + 0xBA, 0xCE, 0xBA, 0xCF, 0x94, 0xBE, 0x94, 0xBF, /* 0x80-0x83 */ + 
0xBA, 0xD0, 0x94, 0xC0, 0x94, 0xC1, 0xBA, 0xD1, /* 0x84-0x87 */ + 0xBA, 0xD2, 0xBA, 0xD3, 0xBA, 0xD4, 0x94, 0xC2, /* 0x88-0x8B */ + 0x94, 0xC3, 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, /* 0x8C-0x8F */ + 0xBA, 0xD5, 0xBA, 0xD6, 0x94, 0xC7, 0xBA, 0xD7, /* 0x90-0x93 */ + 0x94, 0xC8, 0xBA, 0xD8, 0x94, 0xC9, 0x94, 0xCA, /* 0x94-0x97 */ + 0x94, 0xCB, 0xBA, 0xD9, 0xBA, 0xDA, 0x94, 0xCC, /* 0x98-0x9B */ + 0xBA, 0xDB, 0x94, 0xCD, 0x94, 0xCE, 0x94, 0xCF, /* 0x9C-0x9F */ + 0x94, 0xD0, 0x94, 0xD1, 0x94, 0xD2, 0x94, 0xD3, /* 0xA0-0xA3 */ + 0xBA, 0xDC, 0x94, 0xD4, 0x94, 0xD5, 0x94, 0xD6, /* 0xA4-0xA7 */ + 0x94, 0xD7, 0x94, 0xD8, 0x94, 0xD9, 0x94, 0xDA, /* 0xA8-0xAB */ + 0x94, 0xDB, 0x94, 0xDC, 0x94, 0xDD, 0x94, 0xDE, /* 0xAC-0xAF */ + 0xBA, 0xDD, 0x94, 0xDF, 0x94, 0xE0, 0x94, 0xE1, /* 0xB0-0xB3 */ + 0x94, 0xE2, 0x94, 0xE3, 0x94, 0xE4, 0x94, 0xE5, /* 0xB4-0xB7 */ + 0xBA, 0xDE, 0x94, 0xE6, 0x94, 0xE7, 0x94, 0xE8, /* 0xB8-0xBB */ + 0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0x94, 0xEC, /* 0xBC-0xBF */ + 0x94, 0xED, 0x94, 0xEE, 0x94, 0xEF, 0x94, 0xF0, /* 0xC0-0xC3 */ + 0x94, 0xF1, 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, /* 0xC4-0xC7 */ + 0x94, 0xF5, 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, /* 0xC8-0xCB */ + 0x94, 0xF9, 0x94, 0xFA, 0x94, 0xFB, 0x94, 0xFC, /* 0xCC-0xCF */ + 0x94, 0xFD, 0x94, 0xFE, 0x95, 0x41, 0x95, 0x42, /* 0xD0-0xD3 */ + 0xBA, 0xDF, 0xBA, 0xE0, 0x95, 0x43, 0x95, 0x44, /* 0xD4-0xD7 */ + 0xBA, 0xE1, 0x95, 0x45, 0x95, 0x46, 0x95, 0x47, /* 0xD8-0xDB */ + 0xBA, 0xE2, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xDC-0xDF */ + 0x95, 0x4B, 0x95, 0x4C, 0x95, 0x4D, 0x95, 0x4E, /* 0xE0-0xE3 */ + 0x95, 0x4F, 0x95, 0x50, 0x95, 0x51, 0x95, 0x52, /* 0xE4-0xE7 */ + 0x95, 0x53, 0xBA, 0xE3, 0x95, 0x54, 0x95, 0x55, /* 0xE8-0xEB */ + 0x95, 0x56, 0x95, 0x57, 0x95, 0x58, 0x95, 0x59, /* 0xEC-0xEF */ + 0xBA, 0xE4, 0x95, 0x5A, 0x95, 0x61, 0x95, 0x62, /* 0xF0-0xF3 */ + 0xBA, 0xE5, 0x95, 0x63, 0x95, 0x64, 0x95, 0x65, /* 0xF4-0xF7 */ + 0xBA, 0xE6, 0x95, 0x66, 0x95, 0x67, 0x95, 0x68, /* 0xF8-0xFB */ + 0x95, 0x69, 0x95, 0x6A, 0x95, 0x6B, 0x95, 0x6C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BE[512] = { + 0xBA, 0xE7, 0x95, 0x6D, 0x95, 0x6E, 0xBA, 0xE8, /* 0x00-0x03 */ + 0x95, 0x6F, 0xBA, 0xE9, 0x95, 0x70, 0x95, 0x71, /* 0x04-0x07 */ + 0x95, 0x72, 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, /* 0x08-0x0B */ + 0xBA, 0xEA, 0xBA, 0xEB, 0x95, 0x76, 0x95, 0x77, /* 0x0C-0x0F */ + 0xBA, 0xEC, 0x95, 0x78, 0x95, 0x79, 0x95, 0x7A, /* 0x10-0x13 */ + 0xBA, 0xED, 0x95, 0x81, 0x95, 0x82, 0x95, 0x83, /* 0x14-0x17 */ + 0x95, 0x84, 0x95, 0x85, 0x95, 0x86, 0x95, 0x87, /* 0x18-0x1B */ + 0xBA, 0xEE, 0xBA, 0xEF, 0x95, 0x88, 0xBA, 0xF0, /* 0x1C-0x1F */ + 0x95, 0x89, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x20-0x23 */ + 0x95, 0x8D, 0x95, 0x8E, 0x95, 0x8F, 0x95, 0x90, /* 0x24-0x27 */ + 0x95, 0x91, 0x95, 0x92, 0x95, 0x93, 0x95, 0x94, /* 0x28-0x2B */ + 0x95, 0x95, 0x95, 0x96, 0x95, 0x97, 0x95, 0x98, /* 0x2C-0x2F */ + 0x95, 0x99, 0x95, 0x9A, 0x95, 0x9B, 0x95, 0x9C, /* 0x30-0x33 */ + 0x95, 0x9D, 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, /* 0x34-0x37 */ + 0x95, 0xA1, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x38-0x3B */ + 0x95, 0xA5, 0x95, 0xA6, 0x95, 0xA7, 0x95, 0xA8, /* 0x3C-0x3F */ + 0x95, 0xA9, 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, /* 0x40-0x43 */ + 0xBA, 0xF1, 0xBA, 0xF2, 0x95, 0xAD, 0x95, 0xAE, /* 0x44-0x47 */ + 0xBA, 0xF3, 0x95, 0xAF, 0x95, 0xB0, 0x95, 0xB1, /* 0x48-0x4B */ + 0xBA, 0xF4, 0x95, 0xB2, 0xBA, 0xF5, 0x95, 0xB3, /* 0x4C-0x4F */ + 0x95, 0xB4, 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, /* 0x50-0x53 */ + 0xBA, 0xF6, 0xBA, 0xF7, 0x95, 0xB8, 0xBA, 0xF8, /* 0x54-0x57 */ + 0x95, 
0xB9, 0xBA, 0xF9, 0xBA, 0xFA, 0xBA, 0xFB, /* 0x58-0x5B */ + 0x95, 0xBA, 0x95, 0xBB, 0x95, 0xBC, 0x95, 0xBD, /* 0x5C-0x5F */ + 0xBA, 0xFC, 0xBA, 0xFD, 0x95, 0xBE, 0x95, 0xBF, /* 0x60-0x63 */ + 0xBA, 0xFE, 0x95, 0xC0, 0x95, 0xC1, 0x95, 0xC2, /* 0x64-0x67 */ + 0xBB, 0xA1, 0x95, 0xC3, 0xBB, 0xA2, 0x95, 0xC4, /* 0x68-0x6B */ + 0x95, 0xC5, 0x95, 0xC6, 0x95, 0xC7, 0x95, 0xC8, /* 0x6C-0x6F */ + 0xBB, 0xA3, 0xBB, 0xA4, 0x95, 0xC9, 0xBB, 0xA5, /* 0x70-0x73 */ + 0xBB, 0xA6, 0xBB, 0xA7, 0x95, 0xCA, 0x95, 0xCB, /* 0x74-0x77 */ + 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, 0xBB, 0xA8, /* 0x78-0x7B */ + 0xBB, 0xA9, 0xBB, 0xAA, 0x95, 0xCF, 0x95, 0xD0, /* 0x7C-0x7F */ + + 0xBB, 0xAB, 0x95, 0xD1, 0x95, 0xD2, 0x95, 0xD3, /* 0x80-0x83 */ + 0xBB, 0xAC, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0x84-0x87 */ + 0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0x88-0x8B */ + 0xBB, 0xAD, 0xBB, 0xAE, 0x95, 0xDB, 0xBB, 0xAF, /* 0x8C-0x8F */ + 0xBB, 0xB0, 0xBB, 0xB1, 0x95, 0xDC, 0x95, 0xDD, /* 0x90-0x93 */ + 0x95, 0xDE, 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, /* 0x94-0x97 */ + 0xBB, 0xB2, 0xBB, 0xB3, 0x95, 0xE2, 0x95, 0xE3, /* 0x98-0x9B */ + 0x95, 0xE4, 0x95, 0xE5, 0x95, 0xE6, 0x95, 0xE7, /* 0x9C-0x9F */ + 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, 0x95, 0xEB, /* 0xA0-0xA3 */ + 0x95, 0xEC, 0x95, 0xED, 0x95, 0xEE, 0x95, 0xEF, /* 0xA4-0xA7 */ + 0xBB, 0xB4, 0x95, 0xF0, 0x95, 0xF1, 0x95, 0xF2, /* 0xA8-0xAB */ + 0x95, 0xF3, 0x95, 0xF4, 0x95, 0xF5, 0x95, 0xF6, /* 0xAC-0xAF */ + 0x95, 0xF7, 0x95, 0xF8, 0x95, 0xF9, 0x95, 0xFA, /* 0xB0-0xB3 */ + 0x95, 0xFB, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0xB4-0xB7 */ + 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, 0x96, 0x44, /* 0xB8-0xBB */ + 0x96, 0x45, 0x96, 0x46, 0x96, 0x47, 0x96, 0x48, /* 0xBC-0xBF */ + 0x96, 0x49, 0x96, 0x4A, 0x96, 0x4B, 0x96, 0x4C, /* 0xC0-0xC3 */ + 0x96, 0x4D, 0x96, 0x4E, 0x96, 0x4F, 0x96, 0x50, /* 0xC4-0xC7 */ + 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, 0x96, 0x54, /* 0xC8-0xCB */ + 0x96, 0x55, 0x96, 0x56, 0x96, 0x57, 0x96, 0x58, /* 0xCC-0xCF */ + 0xBB, 0xB5, 0xBB, 0xB6, 0x96, 0x59, 0x96, 0x5A, /* 0xD0-0xD3 */ + 0xBB, 0xB7, 0x96, 0x61, 0x96, 0x62, 0xBB, 0xB8, /* 0xD4-0xD7 */ + 0xBB, 0xB9, 0x96, 0x63, 0x96, 0x64, 0x96, 0x65, /* 0xD8-0xDB */ + 0x96, 0x66, 0x96, 0x67, 0x96, 0x68, 0x96, 0x69, /* 0xDC-0xDF */ + 0xBB, 0xBA, 0x96, 0x6A, 0x96, 0x6B, 0xBB, 0xBB, /* 0xE0-0xE3 */ + 0xBB, 0xBC, 0xBB, 0xBD, 0x96, 0x6C, 0x96, 0x6D, /* 0xE4-0xE7 */ + 0x96, 0x6E, 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, /* 0xE8-0xEB */ + 0xBB, 0xBE, 0x96, 0x72, 0x96, 0x73, 0x96, 0x74, /* 0xEC-0xEF */ + 0x96, 0x75, 0x96, 0x76, 0x96, 0x77, 0x96, 0x78, /* 0xF0-0xF3 */ + 0x96, 0x79, 0x96, 0x7A, 0x96, 0x81, 0x96, 0x82, /* 0xF4-0xF7 */ + 0x96, 0x83, 0x96, 0x84, 0x96, 0x85, 0x96, 0x86, /* 0xF8-0xFB */ + 0x96, 0x87, 0x96, 0x88, 0x96, 0x89, 0x96, 0x8A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BF[512] = { + 0x96, 0x8B, 0xBB, 0xBF, 0x96, 0x8C, 0x96, 0x8D, /* 0x00-0x03 */ + 0x96, 0x8E, 0x96, 0x8F, 0x96, 0x90, 0x96, 0x91, /* 0x04-0x07 */ + 0xBB, 0xC0, 0xBB, 0xC1, 0x96, 0x92, 0x96, 0x93, /* 0x08-0x0B */ + 0x96, 0x94, 0x96, 0x95, 0x96, 0x96, 0x96, 0x97, /* 0x0C-0x0F */ + 0x96, 0x98, 0x96, 0x99, 0x96, 0x9A, 0x96, 0x9B, /* 0x10-0x13 */ + 0x96, 0x9C, 0x96, 0x9D, 0x96, 0x9E, 0x96, 0x9F, /* 0x14-0x17 */ + 0xBB, 0xC2, 0xBB, 0xC3, 0x96, 0xA0, 0xBB, 0xC4, /* 0x18-0x1B */ + 0xBB, 0xC5, 0xBB, 0xC6, 0x96, 0xA1, 0x96, 0xA2, /* 0x1C-0x1F */ + 0x96, 0xA3, 0x96, 0xA4, 0x96, 0xA5, 0x96, 0xA6, /* 0x20-0x23 */ + 0x96, 0xA7, 0x96, 0xA8, 0x96, 0xA9, 0x96, 0xAA, /* 0x24-0x27 */ + 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, 0x96, 0xAE, /* 0x28-0x2B */ + 0x96, 
0xAF, 0x96, 0xB0, 0x96, 0xB1, 0x96, 0xB2, /* 0x2C-0x2F */ + 0x96, 0xB3, 0x96, 0xB4, 0x96, 0xB5, 0x96, 0xB6, /* 0x30-0x33 */ + 0x96, 0xB7, 0x96, 0xB8, 0x96, 0xB9, 0x96, 0xBA, /* 0x34-0x37 */ + 0x96, 0xBB, 0x96, 0xBC, 0x96, 0xBD, 0x96, 0xBE, /* 0x38-0x3B */ + 0x96, 0xBF, 0x96, 0xC0, 0x96, 0xC1, 0x96, 0xC2, /* 0x3C-0x3F */ + 0xBB, 0xC7, 0xBB, 0xC8, 0x96, 0xC3, 0x96, 0xC4, /* 0x40-0x43 */ + 0xBB, 0xC9, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0x44-0x47 */ + 0xBB, 0xCA, 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, /* 0x48-0x4B */ + 0x96, 0xCB, 0x96, 0xCC, 0x96, 0xCD, 0x96, 0xCE, /* 0x4C-0x4F */ + 0xBB, 0xCB, 0xBB, 0xCC, 0x96, 0xCF, 0x96, 0xD0, /* 0x50-0x53 */ + 0x96, 0xD1, 0xBB, 0xCD, 0x96, 0xD2, 0x96, 0xD3, /* 0x54-0x57 */ + 0x96, 0xD4, 0x96, 0xD5, 0x96, 0xD6, 0x96, 0xD7, /* 0x58-0x5B */ + 0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x5C-0x5F */ + 0x96, 0xDC, 0x96, 0xDD, 0x96, 0xDE, 0x96, 0xDF, /* 0x60-0x63 */ + 0x96, 0xE0, 0x96, 0xE1, 0x96, 0xE2, 0x96, 0xE3, /* 0x64-0x67 */ + 0x96, 0xE4, 0x96, 0xE5, 0x96, 0xE6, 0x96, 0xE7, /* 0x68-0x6B */ + 0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x6C-0x6F */ + 0x96, 0xEC, 0x96, 0xED, 0x96, 0xEE, 0x96, 0xEF, /* 0x70-0x73 */ + 0x96, 0xF0, 0x96, 0xF1, 0x96, 0xF2, 0x96, 0xF3, /* 0x74-0x77 */ + 0x96, 0xF4, 0x96, 0xF5, 0x96, 0xF6, 0x96, 0xF7, /* 0x78-0x7B */ + 0x96, 0xF8, 0x96, 0xF9, 0x96, 0xFA, 0x96, 0xFB, /* 0x7C-0x7F */ + + 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, 0x97, 0x41, /* 0x80-0x83 */ + 0x97, 0x42, 0x97, 0x43, 0x97, 0x44, 0x97, 0x45, /* 0x84-0x87 */ + 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, 0x97, 0x49, /* 0x88-0x8B */ + 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, 0x97, 0x4D, /* 0x8C-0x8F */ + 0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x90-0x93 */ + 0xBB, 0xCE, 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, /* 0x94-0x97 */ + 0x97, 0x55, 0x97, 0x56, 0x97, 0x57, 0x97, 0x58, /* 0x98-0x9B */ + 0x97, 0x59, 0x97, 0x5A, 0x97, 0x61, 0x97, 0x62, /* 0x9C-0x9F */ + 0x97, 0x63, 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, /* 0xA0-0xA3 */ + 0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0xA4-0xA7 */ + 0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0xA8-0xAB */ + 0x97, 0x6F, 0x97, 0x70, 0x97, 0x71, 0x97, 0x72, /* 0xAC-0xAF */ + 0xBB, 0xCF, 0x97, 0x73, 0x97, 0x74, 0x97, 0x75, /* 0xB0-0xB3 */ + 0x97, 0x76, 0x97, 0x77, 0x97, 0x78, 0x97, 0x79, /* 0xB4-0xB7 */ + 0x97, 0x7A, 0x97, 0x81, 0x97, 0x82, 0x97, 0x83, /* 0xB8-0xBB */ + 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, 0x97, 0x87, /* 0xBC-0xBF */ + 0x97, 0x88, 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, /* 0xC0-0xC3 */ + 0x97, 0x8C, 0xBB, 0xD0, 0x97, 0x8D, 0x97, 0x8E, /* 0xC4-0xC7 */ + 0x97, 0x8F, 0x97, 0x90, 0x97, 0x91, 0x97, 0x92, /* 0xC8-0xCB */ + 0xBB, 0xD1, 0xBB, 0xD2, 0x97, 0x93, 0x97, 0x94, /* 0xCC-0xCF */ + 0xBB, 0xD3, 0x97, 0x95, 0x97, 0x96, 0x97, 0x97, /* 0xD0-0xD3 */ + 0xBB, 0xD4, 0x97, 0x98, 0x97, 0x99, 0x97, 0x9A, /* 0xD4-0xD7 */ + 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, 0x97, 0x9E, /* 0xD8-0xDB */ + 0xBB, 0xD5, 0x97, 0x9F, 0x97, 0xA0, 0xBB, 0xD6, /* 0xDC-0xDF */ + 0x97, 0xA1, 0xBB, 0xD7, 0x97, 0xA2, 0x97, 0xA3, /* 0xE0-0xE3 */ + 0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE4-0xE7 */ + 0x97, 0xA8, 0x97, 0xA9, 0x97, 0xAA, 0x97, 0xAB, /* 0xE8-0xEB */ + 0x97, 0xAC, 0x97, 0xAD, 0x97, 0xAE, 0x97, 0xAF, /* 0xEC-0xEF */ + 0x97, 0xB0, 0x97, 0xB1, 0x97, 0xB2, 0x97, 0xB3, /* 0xF0-0xF3 */ + 0x97, 0xB4, 0x97, 0xB5, 0x97, 0xB6, 0x97, 0xB7, /* 0xF4-0xF7 */ + 0x97, 0xB8, 0x97, 0xB9, 0x97, 0xBA, 0x97, 0xBB, /* 0xF8-0xFB */ + 0x97, 0xBC, 0x97, 0xBD, 0x97, 0xBE, 0x97, 0xBF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C0[512] = { + 0x97, 
0xC0, 0x97, 0xC1, 0x97, 0xC2, 0x97, 0xC3, /* 0x00-0x03 */ + 0x97, 0xC4, 0x97, 0xC5, 0x97, 0xC6, 0x97, 0xC7, /* 0x04-0x07 */ + 0x97, 0xC8, 0x97, 0xC9, 0x97, 0xCA, 0x97, 0xCB, /* 0x08-0x0B */ + 0x97, 0xCC, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x0C-0x0F */ + 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, 0x97, 0xD3, /* 0x10-0x13 */ + 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, 0x97, 0xD7, /* 0x14-0x17 */ + 0x97, 0xD8, 0x97, 0xD9, 0x97, 0xDA, 0x97, 0xDB, /* 0x18-0x1B */ + 0x97, 0xDC, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x1C-0x1F */ + 0x97, 0xE0, 0x97, 0xE1, 0x97, 0xE2, 0x97, 0xE3, /* 0x20-0x23 */ + 0x97, 0xE4, 0x97, 0xE5, 0x97, 0xE6, 0x97, 0xE7, /* 0x24-0x27 */ + 0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x28-0x2B */ + 0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x2C-0x2F */ + 0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x30-0x33 */ + 0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x34-0x37 */ + 0x97, 0xF8, 0x97, 0xF9, 0x97, 0xFA, 0x97, 0xFB, /* 0x38-0x3B */ + 0xBB, 0xD8, 0x97, 0xFC, 0x97, 0xFD, 0x97, 0xFE, /* 0x3C-0x3F */ + 0x98, 0x41, 0x98, 0x42, 0x98, 0x43, 0x98, 0x44, /* 0x40-0x43 */ + 0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0x98, 0x48, /* 0x44-0x47 */ + 0x98, 0x49, 0x98, 0x4A, 0x98, 0x4B, 0x98, 0x4C, /* 0x48-0x4B */ + 0x98, 0x4D, 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, /* 0x4C-0x4F */ + 0x98, 0x51, 0xBB, 0xD9, 0x98, 0x52, 0x98, 0x53, /* 0x50-0x53 */ + 0x98, 0x54, 0x98, 0x55, 0x98, 0x56, 0x98, 0x57, /* 0x54-0x57 */ + 0xBB, 0xDA, 0x98, 0x58, 0x98, 0x59, 0x98, 0x5A, /* 0x58-0x5B */ + 0xBB, 0xDB, 0x98, 0x61, 0x98, 0x62, 0x98, 0x63, /* 0x5C-0x5F */ + 0xBB, 0xDC, 0x98, 0x64, 0x98, 0x65, 0x98, 0x66, /* 0x60-0x63 */ + 0x98, 0x67, 0x98, 0x68, 0x98, 0x69, 0x98, 0x6A, /* 0x64-0x67 */ + 0xBB, 0xDD, 0xBB, 0xDE, 0x98, 0x6B, 0x98, 0x6C, /* 0x68-0x6B */ + 0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0x98, 0x70, /* 0x6C-0x6F */ + 0x98, 0x71, 0x98, 0x72, 0x98, 0x73, 0x98, 0x74, /* 0x70-0x73 */ + 0x98, 0x75, 0x98, 0x76, 0x98, 0x77, 0x98, 0x78, /* 0x74-0x77 */ + 0x98, 0x79, 0x98, 0x7A, 0x98, 0x81, 0x98, 0x82, /* 0x78-0x7B */ + 0x98, 0x83, 0x98, 0x84, 0x98, 0x85, 0x98, 0x86, /* 0x7C-0x7F */ + + 0x98, 0x87, 0x98, 0x88, 0x98, 0x89, 0x98, 0x8A, /* 0x80-0x83 */ + 0x98, 0x8B, 0x98, 0x8C, 0x98, 0x8D, 0x98, 0x8E, /* 0x84-0x87 */ + 0x98, 0x8F, 0x98, 0x90, 0x98, 0x91, 0x98, 0x92, /* 0x88-0x8B */ + 0x98, 0x93, 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, /* 0x8C-0x8F */ + 0xBB, 0xDF, 0xBB, 0xE0, 0x98, 0x97, 0x98, 0x98, /* 0x90-0x93 */ + 0xBB, 0xE1, 0x98, 0x99, 0x98, 0x9A, 0x98, 0x9B, /* 0x94-0x97 */ + 0xBB, 0xE2, 0x98, 0x9C, 0x98, 0x9D, 0x98, 0x9E, /* 0x98-0x9B */ + 0x98, 0x9F, 0x98, 0xA0, 0x98, 0xA1, 0x98, 0xA2, /* 0x9C-0x9F */ + 0xBB, 0xE3, 0xBB, 0xE4, 0x98, 0xA3, 0xBB, 0xE5, /* 0xA0-0xA3 */ + 0x98, 0xA4, 0xBB, 0xE6, 0x98, 0xA5, 0x98, 0xA6, /* 0xA4-0xA7 */ + 0x98, 0xA7, 0x98, 0xA8, 0x98, 0xA9, 0x98, 0xAA, /* 0xA8-0xAB */ + 0xBB, 0xE7, 0xBB, 0xE8, 0x98, 0xAB, 0xBB, 0xE9, /* 0xAC-0xAF */ + 0xBB, 0xEA, 0x98, 0xAC, 0x98, 0xAD, 0xBB, 0xEB, /* 0xB0-0xB3 */ + 0xBB, 0xEC, 0xBB, 0xED, 0xBB, 0xEE, 0x98, 0xAE, /* 0xB4-0xB7 */ + 0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xB8-0xBB */ + 0xBB, 0xEF, 0xBB, 0xF0, 0x98, 0xB3, 0xBB, 0xF1, /* 0xBC-0xBF */ + 0xBB, 0xF2, 0xBB, 0xF3, 0x98, 0xB4, 0x98, 0xB5, /* 0xC0-0xC3 */ + 0x98, 0xB6, 0xBB, 0xF4, 0x98, 0xB7, 0x98, 0xB8, /* 0xC4-0xC7 */ + 0xBB, 0xF5, 0xBB, 0xF6, 0x98, 0xB9, 0x98, 0xBA, /* 0xC8-0xCB */ + 0xBB, 0xF7, 0x98, 0xBB, 0x98, 0xBC, 0x98, 0xBD, /* 0xCC-0xCF */ + 0xBB, 0xF8, 0x98, 0xBE, 0x98, 0xBF, 0x98, 0xC0, /* 0xD0-0xD3 */ + 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, 0x98, 0xC4, /* 0xD4-0xD7 
*/ + 0xBB, 0xF9, 0xBB, 0xFA, 0x98, 0xC5, 0xBB, 0xFB, /* 0xD8-0xDB */ + 0xBB, 0xFC, 0xBB, 0xFD, 0x98, 0xC6, 0x98, 0xC7, /* 0xDC-0xDF */ + 0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0x98, 0xCB, /* 0xE0-0xE3 */ + 0xBB, 0xFE, 0xBC, 0xA1, 0x98, 0xCC, 0x98, 0xCD, /* 0xE4-0xE7 */ + 0xBC, 0xA2, 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, /* 0xE8-0xEB */ + 0xBC, 0xA3, 0x98, 0xD1, 0x98, 0xD2, 0x98, 0xD3, /* 0xEC-0xEF */ + 0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0xF0-0xF3 */ + 0xBC, 0xA4, 0xBC, 0xA5, 0x98, 0xD8, 0xBC, 0xA6, /* 0xF4-0xF7 */ + 0x98, 0xD9, 0xBC, 0xA7, 0x98, 0xDA, 0x98, 0xDB, /* 0xF8-0xFB */ + 0x98, 0xDC, 0x98, 0xDD, 0x98, 0xDE, 0x98, 0xDF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C1[512] = { + 0xBC, 0xA8, 0x98, 0xE0, 0x98, 0xE1, 0x98, 0xE2, /* 0x00-0x03 */ + 0xBC, 0xA9, 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, /* 0x04-0x07 */ + 0xBC, 0xAA, 0x98, 0xE6, 0x98, 0xE7, 0x98, 0xE8, /* 0x08-0x0B */ + 0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x0C-0x0F */ + 0xBC, 0xAB, 0x98, 0xED, 0x98, 0xEE, 0x98, 0xEF, /* 0x10-0x13 */ + 0x98, 0xF0, 0xBC, 0xAC, 0x98, 0xF1, 0x98, 0xF2, /* 0x14-0x17 */ + 0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x18-0x1B */ + 0xBC, 0xAD, 0xBC, 0xAE, 0xBC, 0xAF, 0xBC, 0xB0, /* 0x1C-0x1F */ + 0xBC, 0xB1, 0x98, 0xF7, 0x98, 0xF8, 0xBC, 0xB2, /* 0x20-0x23 */ + 0xBC, 0xB3, 0x98, 0xF9, 0xBC, 0xB4, 0xBC, 0xB5, /* 0x24-0x27 */ + 0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x28-0x2B */ + 0xBC, 0xB6, 0xBC, 0xB7, 0x98, 0xFE, 0xBC, 0xB8, /* 0x2C-0x2F */ + 0xBC, 0xB9, 0xBC, 0xBA, 0x99, 0x41, 0x99, 0x42, /* 0x30-0x33 */ + 0x99, 0x43, 0x99, 0x44, 0xBC, 0xBB, 0x99, 0x45, /* 0x34-0x37 */ + 0xBC, 0xBC, 0xBC, 0xBD, 0x99, 0x46, 0x99, 0x47, /* 0x38-0x3B */ + 0xBC, 0xBE, 0x99, 0x48, 0x99, 0x49, 0x99, 0x4A, /* 0x3C-0x3F */ + 0xBC, 0xBF, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x40-0x43 */ + 0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x44-0x47 */ + 0xBC, 0xC0, 0xBC, 0xC1, 0x99, 0x52, 0xBC, 0xC2, /* 0x48-0x4B */ + 0xBC, 0xC3, 0xBC, 0xC4, 0x99, 0x53, 0x99, 0x54, /* 0x4C-0x4F */ + 0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x50-0x53 */ + 0xBC, 0xC5, 0xBC, 0xC6, 0x99, 0x59, 0x99, 0x5A, /* 0x54-0x57 */ + 0xBC, 0xC7, 0x99, 0x61, 0x99, 0x62, 0x99, 0x63, /* 0x58-0x5B */ + 0xBC, 0xC8, 0x99, 0x64, 0x99, 0x65, 0x99, 0x66, /* 0x5C-0x5F */ + 0x99, 0x67, 0x99, 0x68, 0x99, 0x69, 0x99, 0x6A, /* 0x60-0x63 */ + 0xBC, 0xC9, 0xBC, 0xCA, 0x99, 0x6B, 0xBC, 0xCB, /* 0x64-0x67 */ + 0xBC, 0xCC, 0xBC, 0xCD, 0x99, 0x6C, 0x99, 0x6D, /* 0x68-0x6B */ + 0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0x6C-0x6F */ + 0xBC, 0xCE, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x70-0x73 */ + 0xBC, 0xCF, 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, /* 0x74-0x77 */ + 0xBC, 0xD0, 0x99, 0x78, 0x99, 0x79, 0x99, 0x7A, /* 0x78-0x7B */ + 0x99, 0x81, 0x99, 0x82, 0x99, 0x83, 0x99, 0x84, /* 0x7C-0x7F */ + + 0x99, 0x85, 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, /* 0x80-0x83 */ + 0x99, 0x89, 0xBC, 0xD1, 0x99, 0x8A, 0x99, 0x8B, /* 0x84-0x87 */ + 0x99, 0x8C, 0x99, 0x8D, 0x99, 0x8E, 0x99, 0x8F, /* 0x88-0x8B */ + 0xBC, 0xD2, 0xBC, 0xD3, 0xBC, 0xD4, 0x99, 0x90, /* 0x8C-0x8F */ + 0xBC, 0xD5, 0x99, 0x91, 0x99, 0x92, 0x99, 0x93, /* 0x90-0x93 */ + 0xBC, 0xD6, 0x99, 0x94, 0xBC, 0xD7, 0x99, 0x95, /* 0x94-0x97 */ + 0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0x98-0x9B */ + 0xBC, 0xD8, 0xBC, 0xD9, 0x99, 0x9A, 0xBC, 0xDA, /* 0x9C-0x9F */ + 0x99, 0x9B, 0xBC, 0xDB, 0x99, 0x9C, 0x99, 0x9D, /* 0xA0-0xA3 */ + 0x99, 0x9E, 0xBC, 0xDC, 0x99, 0x9F, 0x99, 0xA0, /* 0xA4-0xA7 */ + 0xBC, 0xDD, 0xBC, 0xDE, 0x99, 0xA1, 0x99, 0xA2, /* 0xA8-0xAB */ + 
0xBC, 0xDF, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xAC-0xAF */ + 0xBC, 0xE0, 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, /* 0xB0-0xB3 */ + 0x99, 0xA9, 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, /* 0xB4-0xB7 */ + 0x99, 0xAD, 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, /* 0xB8-0xBB */ + 0x99, 0xB1, 0xBC, 0xE1, 0x99, 0xB2, 0x99, 0xB3, /* 0xBC-0xBF */ + 0x99, 0xB4, 0x99, 0xB5, 0x99, 0xB6, 0x99, 0xB7, /* 0xC0-0xC3 */ + 0xBC, 0xE2, 0x99, 0xB8, 0x99, 0xB9, 0x99, 0xBA, /* 0xC4-0xC7 */ + 0xBC, 0xE3, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xC8-0xCB */ + 0xBC, 0xE4, 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, /* 0xCC-0xCF */ + 0x99, 0xC1, 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, /* 0xD0-0xD3 */ + 0xBC, 0xE5, 0x99, 0xC5, 0x99, 0xC6, 0xBC, 0xE6, /* 0xD4-0xD7 */ + 0xBC, 0xE7, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xD8-0xDB */ + 0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xDC-0xDF */ + 0xBC, 0xE8, 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, /* 0xE0-0xE3 */ + 0xBC, 0xE9, 0x99, 0xD1, 0x99, 0xD2, 0x99, 0xD3, /* 0xE4-0xE7 */ + 0xBC, 0xEA, 0x99, 0xD4, 0x99, 0xD5, 0x99, 0xD6, /* 0xE8-0xEB */ + 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, 0x99, 0xDA, /* 0xEC-0xEF */ + 0xBC, 0xEB, 0xBC, 0xEC, 0x99, 0xDB, 0xBC, 0xED, /* 0xF0-0xF3 */ + 0x99, 0xDC, 0x99, 0xDD, 0x99, 0xDE, 0x99, 0xDF, /* 0xF4-0xF7 */ + 0x99, 0xE0, 0x99, 0xE1, 0x99, 0xE2, 0x99, 0xE3, /* 0xF8-0xFB */ + 0xBC, 0xEE, 0xBC, 0xEF, 0x99, 0xE4, 0x99, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C2[512] = { + 0xBC, 0xF0, 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, /* 0x00-0x03 */ + 0xBC, 0xF1, 0x99, 0xE9, 0x99, 0xEA, 0x99, 0xEB, /* 0x04-0x07 */ + 0x99, 0xEC, 0x99, 0xED, 0x99, 0xEE, 0x99, 0xEF, /* 0x08-0x0B */ + 0xBC, 0xF2, 0xBC, 0xF3, 0x99, 0xF0, 0xBC, 0xF4, /* 0x0C-0x0F */ + 0x99, 0xF1, 0xBC, 0xF5, 0x99, 0xF2, 0x99, 0xF3, /* 0x10-0x13 */ + 0x99, 0xF4, 0x99, 0xF5, 0x99, 0xF6, 0x99, 0xF7, /* 0x14-0x17 */ + 0xBC, 0xF6, 0xBC, 0xF7, 0x99, 0xF8, 0x99, 0xF9, /* 0x18-0x1B */ + 0xBC, 0xF8, 0x99, 0xFA, 0x99, 0xFB, 0xBC, 0xF9, /* 0x1C-0x1F */ + 0xBC, 0xFA, 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, /* 0x20-0x23 */ + 0x9A, 0x41, 0x9A, 0x42, 0x9A, 0x43, 0x9A, 0x44, /* 0x24-0x27 */ + 0xBC, 0xFB, 0xBC, 0xFC, 0x9A, 0x45, 0xBC, 0xFD, /* 0x28-0x2B */ + 0x9A, 0x46, 0xBC, 0xFE, 0x9A, 0x47, 0xBD, 0xA1, /* 0x2C-0x2F */ + 0x9A, 0x48, 0xBD, 0xA2, 0xBD, 0xA3, 0x9A, 0x49, /* 0x30-0x33 */ + 0xBD, 0xA4, 0x9A, 0x4A, 0x9A, 0x4B, 0x9A, 0x4C, /* 0x34-0x37 */ + 0x9A, 0x4D, 0x9A, 0x4E, 0x9A, 0x4F, 0x9A, 0x50, /* 0x38-0x3B */ + 0x9A, 0x51, 0x9A, 0x52, 0x9A, 0x53, 0x9A, 0x54, /* 0x3C-0x3F */ + 0x9A, 0x55, 0x9A, 0x56, 0x9A, 0x57, 0x9A, 0x58, /* 0x40-0x43 */ + 0x9A, 0x59, 0x9A, 0x5A, 0x9A, 0x61, 0x9A, 0x62, /* 0x44-0x47 */ + 0xBD, 0xA5, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x48-0x4B */ + 0x9A, 0x66, 0x9A, 0x67, 0x9A, 0x68, 0x9A, 0x69, /* 0x4C-0x4F */ + 0xBD, 0xA6, 0xBD, 0xA7, 0x9A, 0x6A, 0x9A, 0x6B, /* 0x50-0x53 */ + 0xBD, 0xA8, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x54-0x57 */ + 0xBD, 0xA9, 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, /* 0x58-0x5B */ + 0x9A, 0x72, 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, /* 0x5C-0x5F */ + 0xBD, 0xAA, 0x9A, 0x76, 0x9A, 0x77, 0x9A, 0x78, /* 0x60-0x63 */ + 0x9A, 0x79, 0xBD, 0xAB, 0x9A, 0x7A, 0x9A, 0x81, /* 0x64-0x67 */ + 0x9A, 0x82, 0x9A, 0x83, 0x9A, 0x84, 0x9A, 0x85, /* 0x68-0x6B */ + 0xBD, 0xAC, 0xBD, 0xAD, 0x9A, 0x86, 0x9A, 0x87, /* 0x6C-0x6F */ + 0xBD, 0xAE, 0x9A, 0x88, 0x9A, 0x89, 0x9A, 0x8A, /* 0x70-0x73 */ + 0xBD, 0xAF, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x74-0x77 */ + 0x9A, 0x8E, 0x9A, 0x8F, 0x9A, 0x90, 0x9A, 0x91, /* 0x78-0x7B */ + 0xBD, 0xB0, 0xBD, 0xB1, 0x9A, 0x92, 0xBD, 0xB2, /* 0x7C-0x7F */ + + 
0x9A, 0x93, 0xBD, 0xB3, 0x9A, 0x94, 0x9A, 0x95, /* 0x80-0x83 */ + 0x9A, 0x96, 0x9A, 0x97, 0x9A, 0x98, 0x9A, 0x99, /* 0x84-0x87 */ + 0xBD, 0xB4, 0xBD, 0xB5, 0x9A, 0x9A, 0x9A, 0x9B, /* 0x88-0x8B */ + 0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0x9A, 0x9F, /* 0x8C-0x8F */ + 0xBD, 0xB6, 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, /* 0x90-0x93 */ + 0x9A, 0xA3, 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, /* 0x94-0x97 */ + 0xBD, 0xB7, 0x9A, 0xA7, 0x9A, 0xA8, 0xBD, 0xB8, /* 0x98-0x9B */ + 0x9A, 0xA9, 0xBD, 0xB9, 0x9A, 0xAA, 0x9A, 0xAB, /* 0x9C-0x9F */ + 0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0x9A, 0xAF, /* 0xA0-0xA3 */ + 0xBD, 0xBA, 0xBD, 0xBB, 0x9A, 0xB0, 0x9A, 0xB1, /* 0xA4-0xA7 */ + 0xBD, 0xBC, 0x9A, 0xB2, 0x9A, 0xB3, 0x9A, 0xB4, /* 0xA8-0xAB */ + 0xBD, 0xBD, 0xBD, 0xBE, 0x9A, 0xB5, 0x9A, 0xB6, /* 0xAC-0xAF */ + 0x9A, 0xB7, 0x9A, 0xB8, 0x9A, 0xB9, 0x9A, 0xBA, /* 0xB0-0xB3 */ + 0xBD, 0xBF, 0xBD, 0xC0, 0x9A, 0xBB, 0xBD, 0xC1, /* 0xB4-0xB7 */ + 0x9A, 0xBC, 0xBD, 0xC2, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xB8-0xBB */ + 0x9A, 0xBF, 0x9A, 0xC0, 0x9A, 0xC1, 0x9A, 0xC2, /* 0xBC-0xBF */ + 0x9A, 0xC3, 0x9A, 0xC4, 0x9A, 0xC5, 0x9A, 0xC6, /* 0xC0-0xC3 */ + 0x9A, 0xC7, 0x9A, 0xC8, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xC4-0xC7 */ + 0x9A, 0xCB, 0x9A, 0xCC, 0x9A, 0xCD, 0x9A, 0xCE, /* 0xC8-0xCB */ + 0x9A, 0xCF, 0x9A, 0xD0, 0x9A, 0xD1, 0x9A, 0xD2, /* 0xCC-0xCF */ + 0x9A, 0xD3, 0x9A, 0xD4, 0x9A, 0xD5, 0x9A, 0xD6, /* 0xD0-0xD3 */ + 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, 0x9A, 0xDA, /* 0xD4-0xD7 */ + 0x9A, 0xDB, 0x9A, 0xDC, 0x9A, 0xDD, 0x9A, 0xDE, /* 0xD8-0xDB */ + 0xBD, 0xC3, 0xBD, 0xC4, 0x9A, 0xDF, 0x9A, 0xE0, /* 0xDC-0xDF */ + 0xBD, 0xC5, 0x9A, 0xE1, 0x9A, 0xE2, 0xBD, 0xC6, /* 0xE0-0xE3 */ + 0xBD, 0xC7, 0x9A, 0xE3, 0x9A, 0xE4, 0x9A, 0xE5, /* 0xE4-0xE7 */ + 0x9A, 0xE6, 0x9A, 0xE7, 0x9A, 0xE8, 0xBD, 0xC8, /* 0xE8-0xEB */ + 0xBD, 0xC9, 0xBD, 0xCA, 0x9A, 0xE9, 0xBD, 0xCB, /* 0xEC-0xEF */ + 0x9A, 0xEA, 0xBD, 0xCC, 0x9A, 0xEB, 0x9A, 0xEC, /* 0xF0-0xF3 */ + 0x9A, 0xED, 0x9A, 0xEE, 0xBD, 0xCD, 0x9A, 0xEF, /* 0xF4-0xF7 */ + 0xBD, 0xCE, 0xBD, 0xCF, 0x9A, 0xF0, 0xBD, 0xD0, /* 0xF8-0xFB */ + 0xBD, 0xD1, 0x9A, 0xF1, 0x9A, 0xF2, 0x9A, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C3[512] = { + 0xBD, 0xD2, 0x9A, 0xF4, 0x9A, 0xF5, 0x9A, 0xF6, /* 0x00-0x03 */ + 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, 0x9A, 0xFA, /* 0x04-0x07 */ + 0xBD, 0xD3, 0xBD, 0xD4, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x08-0x0B */ + 0xBD, 0xD5, 0xBD, 0xD6, 0x9A, 0xFD, 0x9A, 0xFE, /* 0x0C-0x0F */ + 0x9B, 0x41, 0x9B, 0x42, 0x9B, 0x43, 0xBD, 0xD7, /* 0x10-0x13 */ + 0xBD, 0xD8, 0xBD, 0xD9, 0x9B, 0x44, 0x9B, 0x45, /* 0x14-0x17 */ + 0xBD, 0xDA, 0x9B, 0x46, 0x9B, 0x47, 0x9B, 0x48, /* 0x18-0x1B */ + 0xBD, 0xDB, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x1C-0x1F */ + 0x9B, 0x4C, 0x9B, 0x4D, 0x9B, 0x4E, 0x9B, 0x4F, /* 0x20-0x23 */ + 0xBD, 0xDC, 0xBD, 0xDD, 0x9B, 0x50, 0x9B, 0x51, /* 0x24-0x27 */ + 0xBD, 0xDE, 0xBD, 0xDF, 0x9B, 0x52, 0x9B, 0x53, /* 0x28-0x2B */ + 0x9B, 0x54, 0x9B, 0x55, 0x9B, 0x56, 0x9B, 0x57, /* 0x2C-0x2F */ + 0x9B, 0x58, 0x9B, 0x59, 0x9B, 0x5A, 0x9B, 0x61, /* 0x30-0x33 */ + 0x9B, 0x62, 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, /* 0x34-0x37 */ + 0x9B, 0x66, 0x9B, 0x67, 0x9B, 0x68, 0x9B, 0x69, /* 0x38-0x3B */ + 0x9B, 0x6A, 0x9B, 0x6B, 0x9B, 0x6C, 0x9B, 0x6D, /* 0x3C-0x3F */ + 0x9B, 0x6E, 0x9B, 0x6F, 0x9B, 0x70, 0x9B, 0x71, /* 0x40-0x43 */ + 0x9B, 0x72, 0xBD, 0xE0, 0x9B, 0x73, 0x9B, 0x74, /* 0x44-0x47 */ + 0x9B, 0x75, 0x9B, 0x76, 0x9B, 0x77, 0x9B, 0x78, /* 0x48-0x4B */ + 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x81, 0x9B, 0x82, /* 0x4C-0x4F */ + 0x9B, 0x83, 0x9B, 0x84, 0x9B, 0x85, 0x9B, 0x86, /* 0x50-0x53 */ + 0x9B, 
0x87, 0x9B, 0x88, 0x9B, 0x89, 0x9B, 0x8A, /* 0x54-0x57 */ + 0x9B, 0x8B, 0x9B, 0x8C, 0x9B, 0x8D, 0x9B, 0x8E, /* 0x58-0x5B */ + 0x9B, 0x8F, 0x9B, 0x90, 0x9B, 0x91, 0x9B, 0x92, /* 0x5C-0x5F */ + 0x9B, 0x93, 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, /* 0x60-0x63 */ + 0x9B, 0x97, 0x9B, 0x98, 0x9B, 0x99, 0x9B, 0x9A, /* 0x64-0x67 */ + 0xBD, 0xE1, 0xBD, 0xE2, 0x9B, 0x9B, 0x9B, 0x9C, /* 0x68-0x6B */ + 0xBD, 0xE3, 0x9B, 0x9D, 0x9B, 0x9E, 0x9B, 0x9F, /* 0x6C-0x6F */ + 0xBD, 0xE4, 0x9B, 0xA0, 0xBD, 0xE5, 0x9B, 0xA1, /* 0x70-0x73 */ + 0x9B, 0xA2, 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, /* 0x74-0x77 */ + 0xBD, 0xE6, 0xBD, 0xE7, 0x9B, 0xA6, 0x9B, 0xA7, /* 0x78-0x7B */ + 0xBD, 0xE8, 0xBD, 0xE9, 0x9B, 0xA8, 0x9B, 0xA9, /* 0x7C-0x7F */ + + 0x9B, 0xAA, 0x9B, 0xAB, 0x9B, 0xAC, 0x9B, 0xAD, /* 0x80-0x83 */ + 0xBD, 0xEA, 0x9B, 0xAE, 0x9B, 0xAF, 0x9B, 0xB0, /* 0x84-0x87 */ + 0xBD, 0xEB, 0x9B, 0xB1, 0x9B, 0xB2, 0x9B, 0xB3, /* 0x88-0x8B */ + 0xBD, 0xEC, 0x9B, 0xB4, 0x9B, 0xB5, 0x9B, 0xB6, /* 0x8C-0x8F */ + 0x9B, 0xB7, 0x9B, 0xB8, 0x9B, 0xB9, 0x9B, 0xBA, /* 0x90-0x93 */ + 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, 0x9B, 0xBE, /* 0x94-0x97 */ + 0x9B, 0xBF, 0x9B, 0xC0, 0x9B, 0xC1, 0x9B, 0xC2, /* 0x98-0x9B */ + 0x9B, 0xC3, 0x9B, 0xC4, 0x9B, 0xC5, 0x9B, 0xC6, /* 0x9C-0x9F */ + 0x9B, 0xC7, 0x9B, 0xC8, 0x9B, 0xC9, 0x9B, 0xCA, /* 0xA0-0xA3 */ + 0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0xA4-0xA7 */ + 0x9B, 0xCF, 0x9B, 0xD0, 0x9B, 0xD1, 0x9B, 0xD2, /* 0xA8-0xAB */ + 0x9B, 0xD3, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0xAC-0xAF */ + 0x9B, 0xD7, 0x9B, 0xD8, 0x9B, 0xD9, 0x9B, 0xDA, /* 0xB0-0xB3 */ + 0x9B, 0xDB, 0x9B, 0xDC, 0x9B, 0xDD, 0x9B, 0xDE, /* 0xB4-0xB7 */ + 0x9B, 0xDF, 0x9B, 0xE0, 0x9B, 0xE1, 0x9B, 0xE2, /* 0xB8-0xBB */ + 0x9B, 0xE3, 0x9B, 0xE4, 0x9B, 0xE5, 0x9B, 0xE6, /* 0xBC-0xBF */ + 0xBD, 0xED, 0x9B, 0xE7, 0x9B, 0xE8, 0x9B, 0xE9, /* 0xC0-0xC3 */ + 0x9B, 0xEA, 0x9B, 0xEB, 0x9B, 0xEC, 0x9B, 0xED, /* 0xC4-0xC7 */ + 0x9B, 0xEE, 0x9B, 0xEF, 0x9B, 0xF0, 0x9B, 0xF1, /* 0xC8-0xCB */ + 0x9B, 0xF2, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xCC-0xCF */ + 0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0x9B, 0xF9, /* 0xD0-0xD3 */ + 0x9B, 0xFA, 0x9B, 0xFB, 0x9B, 0xFC, 0x9B, 0xFD, /* 0xD4-0xD7 */ + 0xBD, 0xEE, 0xBD, 0xEF, 0x9B, 0xFE, 0x9C, 0x41, /* 0xD8-0xDB */ + 0xBD, 0xF0, 0x9C, 0x42, 0x9C, 0x43, 0xBD, 0xF1, /* 0xDC-0xDF */ + 0xBD, 0xF2, 0x9C, 0x44, 0xBD, 0xF3, 0x9C, 0x45, /* 0xE0-0xE3 */ + 0x9C, 0x46, 0x9C, 0x47, 0x9C, 0x48, 0x9C, 0x49, /* 0xE4-0xE7 */ + 0xBD, 0xF4, 0xBD, 0xF5, 0x9C, 0x4A, 0x9C, 0x4B, /* 0xE8-0xEB */ + 0x9C, 0x4C, 0xBD, 0xF6, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xEC-0xEF */ + 0x9C, 0x4F, 0x9C, 0x50, 0x9C, 0x51, 0x9C, 0x52, /* 0xF0-0xF3 */ + 0xBD, 0xF7, 0xBD, 0xF8, 0x9C, 0x53, 0x9C, 0x54, /* 0xF4-0xF7 */ + 0xBD, 0xF9, 0x9C, 0x55, 0x9C, 0x56, 0x9C, 0x57, /* 0xF8-0xFB */ + 0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0x9C, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C4[512] = { + 0x9C, 0x62, 0x9C, 0x63, 0x9C, 0x64, 0x9C, 0x65, /* 0x00-0x03 */ + 0x9C, 0x66, 0x9C, 0x67, 0x9C, 0x68, 0x9C, 0x69, /* 0x04-0x07 */ + 0xBD, 0xFA, 0x9C, 0x6A, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x08-0x0B */ + 0x9C, 0x6D, 0x9C, 0x6E, 0x9C, 0x6F, 0x9C, 0x70, /* 0x0C-0x0F */ + 0xBD, 0xFB, 0x9C, 0x71, 0x9C, 0x72, 0x9C, 0x73, /* 0x10-0x13 */ + 0x9C, 0x74, 0x9C, 0x75, 0x9C, 0x76, 0x9C, 0x77, /* 0x14-0x17 */ + 0x9C, 0x78, 0x9C, 0x79, 0x9C, 0x7A, 0x9C, 0x81, /* 0x18-0x1B */ + 0x9C, 0x82, 0x9C, 0x83, 0x9C, 0x84, 0x9C, 0x85, /* 0x1C-0x1F */ + 0x9C, 0x86, 0x9C, 0x87, 0x9C, 0x88, 0x9C, 0x89, /* 0x20-0x23 */ + 0xBD, 0xFC, 0x9C, 0x8A, 0x9C, 0x8B, 0x9C, 0x8C, /* 0x24-0x27 */ + 0x9C, 
0x8D, 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, /* 0x28-0x2B */ + 0xBD, 0xFD, 0x9C, 0x91, 0x9C, 0x92, 0x9C, 0x93, /* 0x2C-0x2F */ + 0xBD, 0xFE, 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, /* 0x30-0x33 */ + 0xBE, 0xA1, 0x9C, 0x97, 0x9C, 0x98, 0x9C, 0x99, /* 0x34-0x37 */ + 0x9C, 0x9A, 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, /* 0x38-0x3B */ + 0xBE, 0xA2, 0xBE, 0xA3, 0x9C, 0x9E, 0x9C, 0x9F, /* 0x3C-0x3F */ + 0x9C, 0xA0, 0x9C, 0xA1, 0x9C, 0xA2, 0x9C, 0xA3, /* 0x40-0x43 */ + 0x9C, 0xA4, 0x9C, 0xA5, 0x9C, 0xA6, 0x9C, 0xA7, /* 0x44-0x47 */ + 0xBE, 0xA4, 0x9C, 0xA8, 0x9C, 0xA9, 0x9C, 0xAA, /* 0x48-0x4B */ + 0x9C, 0xAB, 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, /* 0x4C-0x4F */ + 0x9C, 0xAF, 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, /* 0x50-0x53 */ + 0x9C, 0xB3, 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, /* 0x54-0x57 */ + 0x9C, 0xB7, 0x9C, 0xB8, 0x9C, 0xB9, 0x9C, 0xBA, /* 0x58-0x5B */ + 0x9C, 0xBB, 0x9C, 0xBC, 0x9C, 0xBD, 0x9C, 0xBE, /* 0x5C-0x5F */ + 0x9C, 0xBF, 0x9C, 0xC0, 0x9C, 0xC1, 0x9C, 0xC2, /* 0x60-0x63 */ + 0xBE, 0xA5, 0xBE, 0xA6, 0x9C, 0xC3, 0x9C, 0xC4, /* 0x64-0x67 */ + 0xBE, 0xA7, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x68-0x6B */ + 0xBE, 0xA8, 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, /* 0x6C-0x6F */ + 0x9C, 0xCB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x70-0x73 */ + 0xBE, 0xA9, 0xBE, 0xAA, 0x9C, 0xCF, 0x9C, 0xD0, /* 0x74-0x77 */ + 0x9C, 0xD1, 0xBE, 0xAB, 0x9C, 0xD2, 0x9C, 0xD3, /* 0x78-0x7B */ + 0x9C, 0xD4, 0x9C, 0xD5, 0x9C, 0xD6, 0x9C, 0xD7, /* 0x7C-0x7F */ + + 0xBE, 0xAC, 0x9C, 0xD8, 0x9C, 0xD9, 0x9C, 0xDA, /* 0x80-0x83 */ + 0x9C, 0xDB, 0x9C, 0xDC, 0x9C, 0xDD, 0x9C, 0xDE, /* 0x84-0x87 */ + 0x9C, 0xDF, 0x9C, 0xE0, 0x9C, 0xE1, 0x9C, 0xE2, /* 0x88-0x8B */ + 0x9C, 0xE3, 0x9C, 0xE4, 0x9C, 0xE5, 0x9C, 0xE6, /* 0x8C-0x8F */ + 0x9C, 0xE7, 0x9C, 0xE8, 0x9C, 0xE9, 0x9C, 0xEA, /* 0x90-0x93 */ + 0xBE, 0xAD, 0x9C, 0xEB, 0x9C, 0xEC, 0x9C, 0xED, /* 0x94-0x97 */ + 0x9C, 0xEE, 0x9C, 0xEF, 0x9C, 0xF0, 0x9C, 0xF1, /* 0x98-0x9B */ + 0xBE, 0xAE, 0x9C, 0xF2, 0x9C, 0xF3, 0x9C, 0xF4, /* 0x9C-0x9F */ + 0x9C, 0xF5, 0x9C, 0xF6, 0x9C, 0xF7, 0x9C, 0xF8, /* 0xA0-0xA3 */ + 0x9C, 0xF9, 0x9C, 0xFA, 0x9C, 0xFB, 0x9C, 0xFC, /* 0xA4-0xA7 */ + 0x9C, 0xFD, 0x9C, 0xFE, 0x9D, 0x41, 0x9D, 0x42, /* 0xA8-0xAB */ + 0x9D, 0x43, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xAC-0xAF */ + 0x9D, 0x47, 0x9D, 0x48, 0x9D, 0x49, 0x9D, 0x4A, /* 0xB0-0xB3 */ + 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, 0x9D, 0x4E, /* 0xB4-0xB7 */ + 0xBE, 0xAF, 0x9D, 0x4F, 0x9D, 0x50, 0x9D, 0x51, /* 0xB8-0xBB */ + 0xBE, 0xB0, 0x9D, 0x52, 0x9D, 0x53, 0x9D, 0x54, /* 0xBC-0xBF */ + 0x9D, 0x55, 0x9D, 0x56, 0x9D, 0x57, 0x9D, 0x58, /* 0xC0-0xC3 */ + 0x9D, 0x59, 0x9D, 0x5A, 0x9D, 0x61, 0x9D, 0x62, /* 0xC4-0xC7 */ + 0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0xC8-0xCB */ + 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, 0x9D, 0x6A, /* 0xCC-0xCF */ + 0x9D, 0x6B, 0x9D, 0x6C, 0x9D, 0x6D, 0x9D, 0x6E, /* 0xD0-0xD3 */ + 0x9D, 0x6F, 0x9D, 0x70, 0x9D, 0x71, 0x9D, 0x72, /* 0xD4-0xD7 */ + 0x9D, 0x73, 0x9D, 0x74, 0x9D, 0x75, 0x9D, 0x76, /* 0xD8-0xDB */ + 0x9D, 0x77, 0x9D, 0x78, 0x9D, 0x79, 0x9D, 0x7A, /* 0xDC-0xDF */ + 0x9D, 0x81, 0x9D, 0x82, 0x9D, 0x83, 0x9D, 0x84, /* 0xE0-0xE3 */ + 0x9D, 0x85, 0x9D, 0x86, 0x9D, 0x87, 0x9D, 0x88, /* 0xE4-0xE7 */ + 0x9D, 0x89, 0xBE, 0xB1, 0x9D, 0x8A, 0x9D, 0x8B, /* 0xE8-0xEB */ + 0x9D, 0x8C, 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, /* 0xEC-0xEF */ + 0xBE, 0xB2, 0xBE, 0xB3, 0x9D, 0x90, 0x9D, 0x91, /* 0xF0-0xF3 */ + 0xBE, 0xB4, 0x9D, 0x92, 0x9D, 0x93, 0x9D, 0x94, /* 0xF4-0xF7 */ + 0xBE, 0xB5, 0x9D, 0x95, 0xBE, 0xB6, 0x9D, 0x96, /* 0xF8-0xFB */ + 0x9D, 0x97, 0x9D, 0x98, 0x9D, 0x99, 0xBE, 0xB7, /* 0xFC-0xFF 
*/ +}; + +static const unsigned char u2c_C5[512] = { + 0xBE, 0xB8, 0xBE, 0xB9, 0x9D, 0x9A, 0x9D, 0x9B, /* 0x00-0x03 */ + 0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x04-0x07 */ + 0x9D, 0xA0, 0x9D, 0xA1, 0x9D, 0xA2, 0x9D, 0xA3, /* 0x08-0x0B */ + 0xBE, 0xBA, 0x9D, 0xA4, 0x9D, 0xA5, 0x9D, 0xA6, /* 0x0C-0x0F */ + 0xBE, 0xBB, 0x9D, 0xA7, 0x9D, 0xA8, 0x9D, 0xA9, /* 0x10-0x13 */ + 0xBE, 0xBC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x14-0x17 */ + 0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x18-0x1B */ + 0xBE, 0xBD, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x1C-0x1F */ + 0x9D, 0xB4, 0x9D, 0xB5, 0x9D, 0xB6, 0x9D, 0xB7, /* 0x20-0x23 */ + 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, 0x9D, 0xBB, /* 0x24-0x27 */ + 0xBE, 0xBE, 0xBE, 0xBF, 0x9D, 0xBC, 0x9D, 0xBD, /* 0x28-0x2B */ + 0xBE, 0xC0, 0x9D, 0xBE, 0x9D, 0xBF, 0x9D, 0xC0, /* 0x2C-0x2F */ + 0xBE, 0xC1, 0x9D, 0xC1, 0x9D, 0xC2, 0x9D, 0xC3, /* 0x30-0x33 */ + 0x9D, 0xC4, 0x9D, 0xC5, 0x9D, 0xC6, 0x9D, 0xC7, /* 0x34-0x37 */ + 0xBE, 0xC2, 0xBE, 0xC3, 0x9D, 0xC8, 0xBE, 0xC4, /* 0x38-0x3B */ + 0x9D, 0xC9, 0xBE, 0xC5, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x3C-0x3F */ + 0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0x40-0x43 */ + 0xBE, 0xC6, 0xBE, 0xC7, 0x9D, 0xD0, 0x9D, 0xD1, /* 0x44-0x47 */ + 0xBE, 0xC8, 0xBE, 0xC9, 0xBE, 0xCA, 0x9D, 0xD2, /* 0x48-0x4B */ + 0xBE, 0xCB, 0xBE, 0xCC, 0xBE, 0xCD, 0x9D, 0xD3, /* 0x4C-0x4F */ + 0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xBE, 0xCE, /* 0x50-0x53 */ + 0xBE, 0xCF, 0xBE, 0xD0, 0x9D, 0xD7, 0xBE, 0xD1, /* 0x54-0x57 */ + 0xBE, 0xD2, 0xBE, 0xD3, 0x9D, 0xD8, 0x9D, 0xD9, /* 0x58-0x5B */ + 0x9D, 0xDA, 0xBE, 0xD4, 0xBE, 0xD5, 0x9D, 0xDB, /* 0x5C-0x5F */ + 0xBE, 0xD6, 0xBE, 0xD7, 0x9D, 0xDC, 0x9D, 0xDD, /* 0x60-0x63 */ + 0xBE, 0xD8, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0x64-0x67 */ + 0xBE, 0xD9, 0x9D, 0xE1, 0x9D, 0xE2, 0x9D, 0xE3, /* 0x68-0x6B */ + 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, 0x9D, 0xE7, /* 0x6C-0x6F */ + 0xBE, 0xDA, 0xBE, 0xDB, 0x9D, 0xE8, 0xBE, 0xDC, /* 0x70-0x73 */ + 0xBE, 0xDD, 0xBE, 0xDE, 0x9D, 0xE9, 0x9D, 0xEA, /* 0x74-0x77 */ + 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, 0x9D, 0xEE, /* 0x78-0x7B */ + 0xBE, 0xDF, 0xBE, 0xE0, 0x9D, 0xEF, 0x9D, 0xF0, /* 0x7C-0x7F */ + + 0xBE, 0xE1, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0x80-0x83 */ + 0xBE, 0xE2, 0x9D, 0xF4, 0x9D, 0xF5, 0xBE, 0xE3, /* 0x84-0x87 */ + 0x9D, 0xF6, 0x9D, 0xF7, 0x9D, 0xF8, 0x9D, 0xF9, /* 0x88-0x8B */ + 0xBE, 0xE4, 0xBE, 0xE5, 0x9D, 0xFA, 0xBE, 0xE6, /* 0x8C-0x8F */ + 0x9D, 0xFB, 0xBE, 0xE7, 0x9D, 0xFC, 0x9D, 0xFD, /* 0x90-0x93 */ + 0x9D, 0xFE, 0xBE, 0xE8, 0x9E, 0x41, 0xBE, 0xE9, /* 0x94-0x97 */ + 0xBE, 0xEA, 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, /* 0x98-0x9B */ + 0xBE, 0xEB, 0x9E, 0x45, 0x9E, 0x46, 0x9E, 0x47, /* 0x9C-0x9F */ + 0xBE, 0xEC, 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, /* 0xA0-0xA3 */ + 0x9E, 0x4B, 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, /* 0xA4-0xA7 */ + 0x9E, 0x4F, 0xBE, 0xED, 0x9E, 0x50, 0x9E, 0x51, /* 0xA8-0xAB */ + 0x9E, 0x52, 0x9E, 0x53, 0x9E, 0x54, 0x9E, 0x55, /* 0xAC-0xAF */ + 0x9E, 0x56, 0x9E, 0x57, 0x9E, 0x58, 0x9E, 0x59, /* 0xB0-0xB3 */ + 0xBE, 0xEE, 0xBE, 0xEF, 0x9E, 0x5A, 0x9E, 0x61, /* 0xB4-0xB7 */ + 0xBE, 0xF0, 0xBE, 0xF1, 0x9E, 0x62, 0xBE, 0xF2, /* 0xB8-0xBB */ + 0xBE, 0xF3, 0xBE, 0xF4, 0xBE, 0xF5, 0x9E, 0x63, /* 0xBC-0xBF */ + 0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0xC0-0xC3 */ + 0xBE, 0xF6, 0xBE, 0xF7, 0xBE, 0xF8, 0xBE, 0xF9, /* 0xC4-0xC7 */ + 0xBE, 0xFA, 0xBE, 0xFB, 0xBE, 0xFC, 0x9E, 0x68, /* 0xC8-0xCB */ + 0xBE, 0xFD, 0x9E, 0x69, 0xBE, 0xFE, 0x9E, 0x6A, /* 0xCC-0xCF */ + 0xBF, 0xA1, 0xBF, 0xA2, 0x9E, 0x6B, 0x9E, 0x6C, /* 0xD0-0xD3 */ + 
0xBF, 0xA3, 0x9E, 0x6D, 0x9E, 0x6E, 0x9E, 0x6F, /* 0xD4-0xD7 */ + 0xBF, 0xA4, 0x9E, 0x70, 0x9E, 0x71, 0x9E, 0x72, /* 0xD8-0xDB */ + 0x9E, 0x73, 0x9E, 0x74, 0x9E, 0x75, 0x9E, 0x76, /* 0xDC-0xDF */ + 0xBF, 0xA5, 0xBF, 0xA6, 0x9E, 0x77, 0xBF, 0xA7, /* 0xE0-0xE3 */ + 0x9E, 0x78, 0xBF, 0xA8, 0x9E, 0x79, 0x9E, 0x7A, /* 0xE4-0xE7 */ + 0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0xE8-0xEB */ + 0xBF, 0xA9, 0xBF, 0xAA, 0xBF, 0xAB, 0x9E, 0x85, /* 0xEC-0xEF */ + 0xBF, 0xAC, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0xF0-0xF3 */ + 0xBF, 0xAD, 0x9E, 0x89, 0xBF, 0xAE, 0xBF, 0xAF, /* 0xF4-0xF7 */ + 0x9E, 0x8A, 0x9E, 0x8B, 0x9E, 0x8C, 0x9E, 0x8D, /* 0xF8-0xFB */ + 0xBF, 0xB0, 0xBF, 0xB1, 0xBF, 0xB2, 0xBF, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C6[512] = { + 0xBF, 0xB4, 0xBF, 0xB5, 0x9E, 0x8E, 0x9E, 0x8F, /* 0x00-0x03 */ + 0x9E, 0x90, 0xBF, 0xB6, 0xBF, 0xB7, 0xBF, 0xB8, /* 0x04-0x07 */ + 0xBF, 0xB9, 0x9E, 0x91, 0x9E, 0x92, 0x9E, 0x93, /* 0x08-0x0B */ + 0xBF, 0xBA, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x0C-0x0F */ + 0xBF, 0xBB, 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, /* 0x10-0x13 */ + 0x9E, 0x9A, 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, /* 0x14-0x17 */ + 0xBF, 0xBC, 0xBF, 0xBD, 0x9E, 0x9E, 0xBF, 0xBE, /* 0x18-0x1B */ + 0xBF, 0xBF, 0x9E, 0x9F, 0x9E, 0xA0, 0x9E, 0xA1, /* 0x1C-0x1F */ + 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, 0x9E, 0xA5, /* 0x20-0x23 */ + 0xBF, 0xC0, 0xBF, 0xC1, 0x9E, 0xA6, 0x9E, 0xA7, /* 0x24-0x27 */ + 0xBF, 0xC2, 0x9E, 0xA8, 0x9E, 0xA9, 0x9E, 0xAA, /* 0x28-0x2B */ + 0xBF, 0xC3, 0xBF, 0xC4, 0xBF, 0xC5, 0x9E, 0xAB, /* 0x2C-0x2F */ + 0xBF, 0xC6, 0x9E, 0xAC, 0x9E, 0xAD, 0xBF, 0xC7, /* 0x30-0x33 */ + 0xBF, 0xC8, 0xBF, 0xC9, 0x9E, 0xAE, 0xBF, 0xCA, /* 0x34-0x37 */ + 0x9E, 0xAF, 0xBF, 0xCB, 0x9E, 0xB0, 0xBF, 0xCC, /* 0x38-0x3B */ + 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, 0x9E, 0xB4, /* 0x3C-0x3F */ + 0xBF, 0xCD, 0xBF, 0xCE, 0x9E, 0xB5, 0x9E, 0xB6, /* 0x40-0x43 */ + 0xBF, 0xCF, 0x9E, 0xB7, 0x9E, 0xB8, 0x9E, 0xB9, /* 0x44-0x47 */ + 0xBF, 0xD0, 0x9E, 0xBA, 0x9E, 0xBB, 0x9E, 0xBC, /* 0x48-0x4B */ + 0x9E, 0xBD, 0x9E, 0xBE, 0x9E, 0xBF, 0x9E, 0xC0, /* 0x4C-0x4F */ + 0xBF, 0xD1, 0xBF, 0xD2, 0x9E, 0xC1, 0xBF, 0xD3, /* 0x50-0x53 */ + 0xBF, 0xD4, 0xBF, 0xD5, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x54-0x57 */ + 0x9E, 0xC4, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x58-0x5B */ + 0xBF, 0xD6, 0xBF, 0xD7, 0x9E, 0xC8, 0x9E, 0xC9, /* 0x5C-0x5F */ + 0xBF, 0xD8, 0x9E, 0xCA, 0x9E, 0xCB, 0x9E, 0xCC, /* 0x60-0x63 */ + 0x9E, 0xCD, 0x9E, 0xCE, 0x9E, 0xCF, 0x9E, 0xD0, /* 0x64-0x67 */ + 0x9E, 0xD1, 0x9E, 0xD2, 0x9E, 0xD3, 0x9E, 0xD4, /* 0x68-0x6B */ + 0xBF, 0xD9, 0x9E, 0xD5, 0x9E, 0xD6, 0xBF, 0xDA, /* 0x6C-0x6F */ + 0x9E, 0xD7, 0xBF, 0xDB, 0x9E, 0xD8, 0x9E, 0xD9, /* 0x70-0x73 */ + 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, 0x9E, 0xDD, /* 0x74-0x77 */ + 0xBF, 0xDC, 0xBF, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, /* 0x78-0x7B */ + 0xBF, 0xDE, 0x9E, 0xE0, 0x9E, 0xE1, 0x9E, 0xE2, /* 0x7C-0x7F */ + + 0xBF, 0xDF, 0x9E, 0xE3, 0x9E, 0xE4, 0x9E, 0xE5, /* 0x80-0x83 */ + 0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0x9E, 0xE9, /* 0x84-0x87 */ + 0xBF, 0xE0, 0xBF, 0xE1, 0x9E, 0xEA, 0xBF, 0xE2, /* 0x88-0x8B */ + 0x9E, 0xEB, 0xBF, 0xE3, 0x9E, 0xEC, 0x9E, 0xED, /* 0x8C-0x8F */ + 0x9E, 0xEE, 0x9E, 0xEF, 0x9E, 0xF0, 0x9E, 0xF1, /* 0x90-0x93 */ + 0xBF, 0xE4, 0xBF, 0xE5, 0x9E, 0xF2, 0x9E, 0xF3, /* 0x94-0x97 */ + 0xBF, 0xE6, 0x9E, 0xF4, 0x9E, 0xF5, 0x9E, 0xF6, /* 0x98-0x9B */ + 0xBF, 0xE7, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0x9C-0x9F */ + 0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xA0-0xA3 */ + 0xBF, 0xE8, 0xBF, 0xE9, 0x9E, 0xFE, 0xBF, 0xEA, /* 0xA4-0xA7 */ + 
0x9F, 0x41, 0xBF, 0xEB, 0x9F, 0x42, 0x9F, 0x43, /* 0xA8-0xAB */ + 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, 0x9F, 0x47, /* 0xAC-0xAF */ + 0xBF, 0xEC, 0xBF, 0xED, 0x9F, 0x48, 0x9F, 0x49, /* 0xB0-0xB3 */ + 0xBF, 0xEE, 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, /* 0xB4-0xB7 */ + 0xBF, 0xEF, 0xBF, 0xF0, 0xBF, 0xF1, 0x9F, 0x4D, /* 0xB8-0xBB */ + 0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0x9F, 0x51, /* 0xBC-0xBF */ + 0xBF, 0xF2, 0xBF, 0xF3, 0x9F, 0x52, 0xBF, 0xF4, /* 0xC0-0xC3 */ + 0x9F, 0x53, 0xBF, 0xF5, 0x9F, 0x54, 0x9F, 0x55, /* 0xC4-0xC7 */ + 0x9F, 0x56, 0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, /* 0xC8-0xCB */ + 0xBF, 0xF6, 0xBF, 0xF7, 0x9F, 0x5A, 0x9F, 0x61, /* 0xCC-0xCF */ + 0xBF, 0xF8, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0xD0-0xD3 */ + 0xBF, 0xF9, 0x9F, 0x65, 0x9F, 0x66, 0x9F, 0x67, /* 0xD4-0xD7 */ + 0x9F, 0x68, 0x9F, 0x69, 0x9F, 0x6A, 0x9F, 0x6B, /* 0xD8-0xDB */ + 0xBF, 0xFA, 0xBF, 0xFB, 0x9F, 0x6C, 0x9F, 0x6D, /* 0xDC-0xDF */ + 0xBF, 0xFC, 0xBF, 0xFD, 0x9F, 0x6E, 0x9F, 0x6F, /* 0xE0-0xE3 */ + 0x9F, 0x70, 0x9F, 0x71, 0x9F, 0x72, 0x9F, 0x73, /* 0xE4-0xE7 */ + 0xBF, 0xFE, 0xC0, 0xA1, 0x9F, 0x74, 0x9F, 0x75, /* 0xE8-0xEB */ + 0xC0, 0xA2, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0xEC-0xEF */ + 0xC0, 0xA3, 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x81, /* 0xF0-0xF3 */ + 0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0xF4-0xF7 */ + 0xC0, 0xA4, 0xC0, 0xA5, 0x9F, 0x86, 0x9F, 0x87, /* 0xF8-0xFB */ + 0x9F, 0x88, 0xC0, 0xA6, 0x9F, 0x89, 0x9F, 0x8A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C7[512] = { + 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, 0x9F, 0x8E, /* 0x00-0x03 */ + 0xC0, 0xA7, 0xC0, 0xA8, 0x9F, 0x8F, 0x9F, 0x90, /* 0x04-0x07 */ + 0xC0, 0xA9, 0x9F, 0x91, 0x9F, 0x92, 0x9F, 0x93, /* 0x08-0x0B */ + 0xC0, 0xAA, 0x9F, 0x94, 0x9F, 0x95, 0x9F, 0x96, /* 0x0C-0x0F */ + 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, 0x9F, 0x9A, /* 0x10-0x13 */ + 0xC0, 0xAB, 0xC0, 0xAC, 0x9F, 0x9B, 0xC0, 0xAD, /* 0x14-0x17 */ + 0x9F, 0x9C, 0xC0, 0xAE, 0x9F, 0x9D, 0x9F, 0x9E, /* 0x18-0x1B */ + 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, 0x9F, 0xA2, /* 0x1C-0x1F */ + 0xC0, 0xAF, 0xC0, 0xB0, 0x9F, 0xA3, 0x9F, 0xA4, /* 0x20-0x23 */ + 0xC0, 0xB1, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x24-0x27 */ + 0xC0, 0xB2, 0x9F, 0xA8, 0x9F, 0xA9, 0x9F, 0xAA, /* 0x28-0x2B */ + 0x9F, 0xAB, 0x9F, 0xAC, 0x9F, 0xAD, 0x9F, 0xAE, /* 0x2C-0x2F */ + 0xC0, 0xB3, 0xC0, 0xB4, 0x9F, 0xAF, 0xC0, 0xB5, /* 0x30-0x33 */ + 0x9F, 0xB0, 0xC0, 0xB6, 0x9F, 0xB1, 0xC0, 0xB7, /* 0x34-0x37 */ + 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, 0x9F, 0xB5, /* 0x38-0x3B */ + 0xC0, 0xB8, 0xC0, 0xB9, 0x9F, 0xB6, 0x9F, 0xB7, /* 0x3C-0x3F */ + 0xC0, 0xBA, 0x9F, 0xB8, 0x9F, 0xB9, 0x9F, 0xBA, /* 0x40-0x43 */ + 0xC0, 0xBB, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x44-0x47 */ + 0x9F, 0xBE, 0x9F, 0xBF, 0xC0, 0xBC, 0x9F, 0xC0, /* 0x48-0x4B */ + 0xC0, 0xBD, 0xC0, 0xBE, 0x9F, 0xC1, 0xC0, 0xBF, /* 0x4C-0x4F */ + 0x9F, 0xC2, 0xC0, 0xC0, 0xC0, 0xC1, 0xC0, 0xC2, /* 0x50-0x53 */ + 0xC0, 0xC3, 0xC0, 0xC4, 0xC0, 0xC5, 0xC0, 0xC6, /* 0x54-0x57 */ + 0xC0, 0xC7, 0x9F, 0xC3, 0x9F, 0xC4, 0x9F, 0xC5, /* 0x58-0x5B */ + 0xC0, 0xC8, 0x9F, 0xC6, 0x9F, 0xC7, 0x9F, 0xC8, /* 0x5C-0x5F */ + 0xC0, 0xC9, 0x9F, 0xC9, 0x9F, 0xCA, 0x9F, 0xCB, /* 0x60-0x63 */ + 0x9F, 0xCC, 0x9F, 0xCD, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x64-0x67 */ + 0xC0, 0xCA, 0x9F, 0xD0, 0x9F, 0xD1, 0xC0, 0xCB, /* 0x68-0x6B */ + 0x9F, 0xD2, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0x6C-0x6F */ + 0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0x70-0x73 */ + 0xC0, 0xCC, 0xC0, 0xCD, 0x9F, 0xDA, 0x9F, 0xDB, /* 0x74-0x77 */ + 0xC0, 0xCE, 0x9F, 0xDC, 0x9F, 0xDD, 0x9F, 0xDE, /* 0x78-0x7B */ + 0xC0, 
0xCF, 0xC0, 0xD0, 0xC0, 0xD1, 0x9F, 0xDF, /* 0x7C-0x7F */ + + 0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xC0, 0xD2, /* 0x80-0x83 */ + 0xC0, 0xD3, 0xC0, 0xD4, 0x9F, 0xE3, 0xC0, 0xD5, /* 0x84-0x87 */ + 0xC0, 0xD6, 0xC0, 0xD7, 0xC0, 0xD8, 0x9F, 0xE4, /* 0x88-0x8B */ + 0x9F, 0xE5, 0x9F, 0xE6, 0xC0, 0xD9, 0x9F, 0xE7, /* 0x8C-0x8F */ + 0xC0, 0xDA, 0xC0, 0xDB, 0x9F, 0xE8, 0x9F, 0xE9, /* 0x90-0x93 */ + 0xC0, 0xDC, 0x9F, 0xEA, 0xC0, 0xDD, 0xC0, 0xDE, /* 0x94-0x97 */ + 0xC0, 0xDF, 0x9F, 0xEB, 0xC0, 0xE0, 0x9F, 0xEC, /* 0x98-0x9B */ + 0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0x9F, 0xF0, /* 0x9C-0x9F */ + 0xC0, 0xE1, 0xC0, 0xE2, 0x9F, 0xF1, 0xC0, 0xE3, /* 0xA0-0xA3 */ + 0xC0, 0xE4, 0xC0, 0xE5, 0xC0, 0xE6, 0x9F, 0xF2, /* 0xA4-0xA7 */ + 0x9F, 0xF3, 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, /* 0xA8-0xAB */ + 0xC0, 0xE7, 0xC0, 0xE8, 0x9F, 0xF7, 0x9F, 0xF8, /* 0xAC-0xAF */ + 0xC0, 0xE9, 0x9F, 0xF9, 0x9F, 0xFA, 0x9F, 0xFB, /* 0xB0-0xB3 */ + 0xC0, 0xEA, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xB4-0xB7 */ + 0xA0, 0x41, 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, /* 0xB8-0xBB */ + 0xC0, 0xEB, 0xC0, 0xEC, 0xA0, 0x45, 0xC0, 0xED, /* 0xBC-0xBF */ + 0xC0, 0xEE, 0xC0, 0xEF, 0xA0, 0x46, 0xA0, 0x47, /* 0xC0-0xC3 */ + 0xA0, 0x48, 0xA0, 0x49, 0xA0, 0x4A, 0xA0, 0x4B, /* 0xC4-0xC7 */ + 0xC0, 0xF0, 0xC0, 0xF1, 0xA0, 0x4C, 0xA0, 0x4D, /* 0xC8-0xCB */ + 0xC0, 0xF2, 0xA0, 0x4E, 0xC0, 0xF3, 0xA0, 0x4F, /* 0xCC-0xCF */ + 0xC0, 0xF4, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xD0-0xD3 */ + 0xA0, 0x53, 0xA0, 0x54, 0xA0, 0x55, 0xA0, 0x56, /* 0xD4-0xD7 */ + 0xC0, 0xF5, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xD8-0xDB */ + 0xA0, 0x5A, 0xC0, 0xF6, 0xA0, 0x61, 0xA0, 0x62, /* 0xDC-0xDF */ + 0xA0, 0x63, 0xA0, 0x64, 0xA0, 0x65, 0xA0, 0x66, /* 0xE0-0xE3 */ + 0xC0, 0xF7, 0xA0, 0x67, 0xA0, 0x68, 0xA0, 0x69, /* 0xE4-0xE7 */ + 0xC0, 0xF8, 0xA0, 0x6A, 0xA0, 0x6B, 0xA0, 0x6C, /* 0xE8-0xEB */ + 0xC0, 0xF9, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0xEC-0xEF */ + 0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0xF0-0xF3 */ + 0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0xF4-0xF7 */ + 0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x81, /* 0xF8-0xFB */ + 0xA0, 0x82, 0xA0, 0x83, 0xA0, 0x84, 0xA0, 0x85, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C8[512] = { + 0xC0, 0xFA, 0xC0, 0xFB, 0xA0, 0x86, 0xA0, 0x87, /* 0x00-0x03 */ + 0xC0, 0xFC, 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, /* 0x04-0x07 */ + 0xC0, 0xFD, 0xA0, 0x8B, 0xC0, 0xFE, 0xA0, 0x8C, /* 0x08-0x0B */ + 0xA0, 0x8D, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x0C-0x0F */ + 0xC1, 0xA1, 0xC1, 0xA2, 0xA0, 0x91, 0xC1, 0xA3, /* 0x10-0x13 */ + 0xA0, 0x92, 0xC1, 0xA4, 0xC1, 0xA5, 0xA0, 0x93, /* 0x14-0x17 */ + 0xA0, 0x94, 0xA0, 0x95, 0xA0, 0x96, 0xA0, 0x97, /* 0x18-0x1B */ + 0xC1, 0xA6, 0xC1, 0xA7, 0xA0, 0x98, 0xA0, 0x99, /* 0x1C-0x1F */ + 0xC1, 0xA8, 0xA0, 0x9A, 0xA0, 0x9B, 0xA0, 0x9C, /* 0x20-0x23 */ + 0xC1, 0xA9, 0xA0, 0x9D, 0xA0, 0x9E, 0xA0, 0x9F, /* 0x24-0x27 */ + 0xA0, 0xA0, 0xA0, 0xA1, 0xA0, 0xA2, 0xA0, 0xA3, /* 0x28-0x2B */ + 0xC1, 0xAA, 0xC1, 0xAB, 0xA0, 0xA4, 0xC1, 0xAC, /* 0x2C-0x2F */ + 0xA0, 0xA5, 0xC1, 0xAD, 0xA0, 0xA6, 0xA0, 0xA7, /* 0x30-0x33 */ + 0xA0, 0xA8, 0xA0, 0xA9, 0xA0, 0xAA, 0xA0, 0xAB, /* 0x34-0x37 */ + 0xC1, 0xAE, 0xA0, 0xAC, 0xA0, 0xAD, 0xA0, 0xAE, /* 0x38-0x3B */ + 0xC1, 0xAF, 0xA0, 0xAF, 0xA0, 0xB0, 0xA0, 0xB1, /* 0x3C-0x3F */ + 0xC1, 0xB0, 0xA0, 0xB2, 0xA0, 0xB3, 0xA0, 0xB4, /* 0x40-0x43 */ + 0xA0, 0xB5, 0xA0, 0xB6, 0xA0, 0xB7, 0xA0, 0xB8, /* 0x44-0x47 */ + 0xC1, 0xB1, 0xC1, 0xB2, 0xA0, 0xB9, 0xA0, 0xBA, /* 0x48-0x4B */ + 0xC1, 0xB3, 0xC1, 0xB4, 0xA0, 0xBB, 0xA0, 0xBC, /* 0x4C-0x4F */ + 0xA0, 
0xBD, 0xA0, 0xBE, 0xA0, 0xBF, 0xA0, 0xC0, /* 0x50-0x53 */ + 0xC1, 0xB5, 0xA0, 0xC1, 0xA0, 0xC2, 0xA0, 0xC3, /* 0x54-0x57 */ + 0xA0, 0xC4, 0xA0, 0xC5, 0xA0, 0xC6, 0xA0, 0xC7, /* 0x58-0x5B */ + 0xA0, 0xC8, 0xA0, 0xC9, 0xA0, 0xCA, 0xA0, 0xCB, /* 0x5C-0x5F */ + 0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x60-0x63 */ + 0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x64-0x67 */ + 0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xA0, 0xD7, /* 0x68-0x6B */ + 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, 0xA0, 0xDB, /* 0x6C-0x6F */ + 0xC1, 0xB6, 0xC1, 0xB7, 0xA0, 0xDC, 0xA0, 0xDD, /* 0x70-0x73 */ + 0xC1, 0xB8, 0xA0, 0xDE, 0xA0, 0xDF, 0xA0, 0xE0, /* 0x74-0x77 */ + 0xC1, 0xB9, 0xA0, 0xE1, 0xC1, 0xBA, 0xA0, 0xE2, /* 0x78-0x7B */ + 0xA0, 0xE3, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0x7C-0x7F */ + + 0xC1, 0xBB, 0xC1, 0xBC, 0xA0, 0xE7, 0xC1, 0xBD, /* 0x80-0x83 */ + 0xA0, 0xE8, 0xC1, 0xBE, 0xC1, 0xBF, 0xC1, 0xC0, /* 0x84-0x87 */ + 0xA0, 0xE9, 0xA0, 0xEA, 0xA0, 0xEB, 0xC1, 0xC1, /* 0x88-0x8B */ + 0xC1, 0xC2, 0xC1, 0xC3, 0xA0, 0xEC, 0xA0, 0xED, /* 0x8C-0x8F */ + 0xA0, 0xEE, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0x90-0x93 */ + 0xC1, 0xC4, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0x94-0x97 */ + 0xA0, 0xF5, 0xA0, 0xF6, 0xA0, 0xF7, 0xA0, 0xF8, /* 0x98-0x9B */ + 0xA0, 0xF9, 0xC1, 0xC5, 0xA0, 0xFA, 0xC1, 0xC6, /* 0x9C-0x9F */ + 0xA0, 0xFB, 0xC1, 0xC7, 0xA0, 0xFC, 0xA0, 0xFD, /* 0xA0-0xA3 */ + 0xA0, 0xFE, 0xA1, 0x41, 0xA1, 0x42, 0xA1, 0x43, /* 0xA4-0xA7 */ + 0xC1, 0xC8, 0xA1, 0x44, 0xA1, 0x45, 0xA1, 0x46, /* 0xA8-0xAB */ + 0xA1, 0x47, 0xA1, 0x48, 0xA1, 0x49, 0xA1, 0x4A, /* 0xAC-0xAF */ + 0xA1, 0x4B, 0xA1, 0x4C, 0xA1, 0x4D, 0xA1, 0x4E, /* 0xB0-0xB3 */ + 0xA1, 0x4F, 0xA1, 0x50, 0xA1, 0x51, 0xA1, 0x52, /* 0xB4-0xB7 */ + 0xA1, 0x53, 0xA1, 0x54, 0xA1, 0x55, 0xA1, 0x56, /* 0xB8-0xBB */ + 0xC1, 0xC9, 0xC1, 0xCA, 0xA1, 0x57, 0xA1, 0x58, /* 0xBC-0xBF */ + 0xA1, 0x59, 0xA1, 0x5A, 0xA1, 0x61, 0xA1, 0x62, /* 0xC0-0xC3 */ + 0xC1, 0xCB, 0xA1, 0x63, 0xA1, 0x64, 0xA1, 0x65, /* 0xC4-0xC7 */ + 0xC1, 0xCC, 0xA1, 0x66, 0xA1, 0x67, 0xA1, 0x68, /* 0xC8-0xCB */ + 0xC1, 0xCD, 0xA1, 0x69, 0xA1, 0x6A, 0xA1, 0x6B, /* 0xCC-0xCF */ + 0xA1, 0x6C, 0xA1, 0x6D, 0xA1, 0x6E, 0xA1, 0x6F, /* 0xD0-0xD3 */ + 0xC1, 0xCE, 0xC1, 0xCF, 0xA1, 0x70, 0xC1, 0xD0, /* 0xD4-0xD7 */ + 0xA1, 0x71, 0xC1, 0xD1, 0xA1, 0x72, 0xA1, 0x73, /* 0xD8-0xDB */ + 0xA1, 0x74, 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x77, /* 0xDC-0xDF */ + 0xC1, 0xD2, 0xC1, 0xD3, 0xA1, 0x78, 0xA1, 0x79, /* 0xE0-0xE3 */ + 0xC1, 0xD4, 0xA1, 0x7A, 0xA1, 0x81, 0xA1, 0x82, /* 0xE4-0xE7 */ + 0xA1, 0x83, 0xA1, 0x84, 0xA1, 0x85, 0xA1, 0x86, /* 0xE8-0xEB */ + 0xA1, 0x87, 0xA1, 0x88, 0xA1, 0x89, 0xA1, 0x8A, /* 0xEC-0xEF */ + 0xA1, 0x8B, 0xA1, 0x8C, 0xA1, 0x8D, 0xA1, 0x8E, /* 0xF0-0xF3 */ + 0xA1, 0x8F, 0xC1, 0xD5, 0xA1, 0x90, 0xA1, 0x91, /* 0xF4-0xF7 */ + 0xA1, 0x92, 0xA1, 0x93, 0xA1, 0x94, 0xA1, 0x95, /* 0xF8-0xFB */ + 0xC1, 0xD6, 0xC1, 0xD7, 0xA1, 0x96, 0xA1, 0x97, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C9[512] = { + 0xC1, 0xD8, 0xA1, 0x98, 0xA1, 0x99, 0xA1, 0x9A, /* 0x00-0x03 */ + 0xC1, 0xD9, 0xC1, 0xDA, 0xC1, 0xDB, 0xA1, 0x9B, /* 0x04-0x07 */ + 0xA1, 0x9C, 0xA1, 0x9D, 0xA1, 0x9E, 0xA1, 0x9F, /* 0x08-0x0B */ + 0xC1, 0xDC, 0xC1, 0xDD, 0xA1, 0xA0, 0xC1, 0xDE, /* 0x0C-0x0F */ + 0xA2, 0x41, 0xC1, 0xDF, 0xA2, 0x42, 0xA2, 0x43, /* 0x10-0x13 */ + 0xA2, 0x44, 0xA2, 0x45, 0xA2, 0x46, 0xA2, 0x47, /* 0x14-0x17 */ + 0xC1, 0xE0, 0xA2, 0x48, 0xA2, 0x49, 0xA2, 0x4A, /* 0x18-0x1B */ + 0xA2, 0x4B, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x1C-0x1F */ + 0xA2, 0x4F, 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, /* 0x20-0x23 */ + 0xA2, 
0x53, 0xA2, 0x54, 0xA2, 0x55, 0xA2, 0x56, /* 0x24-0x27 */ + 0xA2, 0x57, 0xA2, 0x58, 0xA2, 0x59, 0xA2, 0x5A, /* 0x28-0x2B */ + 0xC1, 0xE1, 0xA2, 0x61, 0xA2, 0x62, 0xA2, 0x63, /* 0x2C-0x2F */ + 0xA2, 0x64, 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, /* 0x30-0x33 */ + 0xC1, 0xE2, 0xA2, 0x68, 0xA2, 0x69, 0xA2, 0x6A, /* 0x34-0x37 */ + 0xA2, 0x6B, 0xA2, 0x6C, 0xA2, 0x6D, 0xA2, 0x6E, /* 0x38-0x3B */ + 0xA2, 0x6F, 0xA2, 0x70, 0xA2, 0x71, 0xA2, 0x72, /* 0x3C-0x3F */ + 0xA2, 0x73, 0xA2, 0x74, 0xA2, 0x75, 0xA2, 0x76, /* 0x40-0x43 */ + 0xA2, 0x77, 0xA2, 0x78, 0xA2, 0x79, 0xA2, 0x7A, /* 0x44-0x47 */ + 0xA2, 0x81, 0xA2, 0x82, 0xA2, 0x83, 0xA2, 0x84, /* 0x48-0x4B */ + 0xA2, 0x85, 0xA2, 0x86, 0xA2, 0x87, 0xA2, 0x88, /* 0x4C-0x4F */ + 0xC1, 0xE3, 0xC1, 0xE4, 0xA2, 0x89, 0xA2, 0x8A, /* 0x50-0x53 */ + 0xC1, 0xE5, 0xA2, 0x8B, 0xA2, 0x8C, 0xA2, 0x8D, /* 0x54-0x57 */ + 0xC1, 0xE6, 0xA2, 0x8E, 0xA2, 0x8F, 0xA2, 0x90, /* 0x58-0x5B */ + 0xA2, 0x91, 0xA2, 0x92, 0xA2, 0x93, 0xA2, 0x94, /* 0x5C-0x5F */ + 0xC1, 0xE7, 0xC1, 0xE8, 0xA2, 0x95, 0xC1, 0xE9, /* 0x60-0x63 */ + 0xA2, 0x96, 0xA2, 0x97, 0xA2, 0x98, 0xA2, 0x99, /* 0x64-0x67 */ + 0xA2, 0x9A, 0xA2, 0x9B, 0xA2, 0x9C, 0xA2, 0x9D, /* 0x68-0x6B */ + 0xC1, 0xEA, 0xA2, 0x9E, 0xA2, 0x9F, 0xA2, 0xA0, /* 0x6C-0x6F */ + 0xC1, 0xEB, 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, /* 0x70-0x73 */ + 0xC1, 0xEC, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x74-0x77 */ + 0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x78-0x7B */ + 0xC1, 0xED, 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, /* 0x7C-0x7F */ + + 0xA3, 0x4E, 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, /* 0x80-0x83 */ + 0xA3, 0x52, 0xA3, 0x53, 0xA3, 0x54, 0xA3, 0x55, /* 0x84-0x87 */ + 0xC1, 0xEE, 0xC1, 0xEF, 0xA3, 0x56, 0xA3, 0x57, /* 0x88-0x8B */ + 0xC1, 0xF0, 0xA3, 0x58, 0xA3, 0x59, 0xA3, 0x5A, /* 0x8C-0x8F */ + 0xC1, 0xF1, 0xA3, 0x61, 0xA3, 0x62, 0xA3, 0x63, /* 0x90-0x93 */ + 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, 0xA3, 0x67, /* 0x94-0x97 */ + 0xC1, 0xF2, 0xC1, 0xF3, 0xA3, 0x68, 0xC1, 0xF4, /* 0x98-0x9B */ + 0xA3, 0x69, 0xC1, 0xF5, 0xA3, 0x6A, 0xA3, 0x6B, /* 0x9C-0x9F */ + 0xA3, 0x6C, 0xA3, 0x6D, 0xA3, 0x6E, 0xA3, 0x6F, /* 0xA0-0xA3 */ + 0xA3, 0x70, 0xA3, 0x71, 0xA3, 0x72, 0xA3, 0x73, /* 0xA4-0xA7 */ + 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, 0xA3, 0x77, /* 0xA8-0xAB */ + 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, 0xA3, 0x81, /* 0xAC-0xAF */ + 0xA3, 0x82, 0xA3, 0x83, 0xA3, 0x84, 0xA3, 0x85, /* 0xB0-0xB3 */ + 0xA3, 0x86, 0xA3, 0x87, 0xA3, 0x88, 0xA3, 0x89, /* 0xB4-0xB7 */ + 0xA3, 0x8A, 0xA3, 0x8B, 0xA3, 0x8C, 0xA3, 0x8D, /* 0xB8-0xBB */ + 0xA3, 0x8E, 0xA3, 0x8F, 0xA3, 0x90, 0xA3, 0x91, /* 0xBC-0xBF */ + 0xC1, 0xF6, 0xC1, 0xF7, 0xA3, 0x92, 0xA3, 0x93, /* 0xC0-0xC3 */ + 0xC1, 0xF8, 0xA3, 0x94, 0xA3, 0x95, 0xC1, 0xF9, /* 0xC4-0xC7 */ + 0xC1, 0xFA, 0xA3, 0x96, 0xC1, 0xFB, 0xA3, 0x97, /* 0xC8-0xCB */ + 0xA3, 0x98, 0xA3, 0x99, 0xA3, 0x9A, 0xA3, 0x9B, /* 0xCC-0xCF */ + 0xC1, 0xFC, 0xC1, 0xFD, 0xA3, 0x9C, 0xC1, 0xFE, /* 0xD0-0xD3 */ + 0xA3, 0x9D, 0xC2, 0xA1, 0xC2, 0xA2, 0xA3, 0x9E, /* 0xD4-0xD7 */ + 0xA3, 0x9F, 0xC2, 0xA3, 0xC2, 0xA4, 0xA3, 0xA0, /* 0xD8-0xDB */ + 0xC2, 0xA5, 0xC2, 0xA6, 0xA4, 0x41, 0xA4, 0x42, /* 0xDC-0xDF */ + 0xC2, 0xA7, 0xA4, 0x43, 0xC2, 0xA8, 0xA4, 0x44, /* 0xE0-0xE3 */ + 0xC2, 0xA9, 0xA4, 0x45, 0xA4, 0x46, 0xC2, 0xAA, /* 0xE4-0xE7 */ + 0xA4, 0x47, 0xA4, 0x48, 0xA4, 0x49, 0xA4, 0x4A, /* 0xE8-0xEB */ + 0xC2, 0xAB, 0xC2, 0xAC, 0xA4, 0x4B, 0xC2, 0xAD, /* 0xEC-0xEF */ + 0xC2, 0xAE, 0xC2, 0xAF, 0xA4, 0x4C, 0xA4, 0x4D, /* 0xF0-0xF3 */ + 0xA4, 0x4E, 0xA4, 0x4F, 0xA4, 0x50, 0xA4, 0x51, /* 0xF4-0xF7 */ + 0xC2, 0xB0, 0xC2, 0xB1, 0xA4, 0x52, 0xA4, 0x53, /* 0xF8-0xFB 
*/ + 0xC2, 0xB2, 0xA4, 0x54, 0xA4, 0x55, 0xA4, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CA[512] = { + 0xC2, 0xB3, 0xA4, 0x57, 0xA4, 0x58, 0xA4, 0x59, /* 0x00-0x03 */ + 0xA4, 0x5A, 0xA4, 0x61, 0xA4, 0x62, 0xA4, 0x63, /* 0x04-0x07 */ + 0xC2, 0xB4, 0xC2, 0xB5, 0xA4, 0x64, 0xC2, 0xB6, /* 0x08-0x0B */ + 0xC2, 0xB7, 0xC2, 0xB8, 0xA4, 0x65, 0xA4, 0x66, /* 0x0C-0x0F */ + 0xA4, 0x67, 0xA4, 0x68, 0xA4, 0x69, 0xA4, 0x6A, /* 0x10-0x13 */ + 0xC2, 0xB9, 0xA4, 0x6B, 0xA4, 0x6C, 0xA4, 0x6D, /* 0x14-0x17 */ + 0xC2, 0xBA, 0xA4, 0x6E, 0xA4, 0x6F, 0xA4, 0x70, /* 0x18-0x1B */ + 0xA4, 0x71, 0xA4, 0x72, 0xA4, 0x73, 0xA4, 0x74, /* 0x1C-0x1F */ + 0xA4, 0x75, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0x20-0x23 */ + 0xA4, 0x79, 0xA4, 0x7A, 0xA4, 0x81, 0xA4, 0x82, /* 0x24-0x27 */ + 0xA4, 0x83, 0xC2, 0xBB, 0xA4, 0x84, 0xA4, 0x85, /* 0x28-0x2B */ + 0xA4, 0x86, 0xA4, 0x87, 0xA4, 0x88, 0xA4, 0x89, /* 0x2C-0x2F */ + 0xA4, 0x8A, 0xA4, 0x8B, 0xA4, 0x8C, 0xA4, 0x8D, /* 0x30-0x33 */ + 0xA4, 0x8E, 0xA4, 0x8F, 0xA4, 0x90, 0xA4, 0x91, /* 0x34-0x37 */ + 0xA4, 0x92, 0xA4, 0x93, 0xA4, 0x94, 0xA4, 0x95, /* 0x38-0x3B */ + 0xA4, 0x96, 0xA4, 0x97, 0xA4, 0x98, 0xA4, 0x99, /* 0x3C-0x3F */ + 0xA4, 0x9A, 0xA4, 0x9B, 0xA4, 0x9C, 0xA4, 0x9D, /* 0x40-0x43 */ + 0xA4, 0x9E, 0xA4, 0x9F, 0xA4, 0xA0, 0xA5, 0x41, /* 0x44-0x47 */ + 0xA5, 0x42, 0xA5, 0x43, 0xA5, 0x44, 0xA5, 0x45, /* 0x48-0x4B */ + 0xC2, 0xBC, 0xC2, 0xBD, 0xA5, 0x46, 0xA5, 0x47, /* 0x4C-0x4F */ + 0xC2, 0xBE, 0xA5, 0x48, 0xA5, 0x49, 0xA5, 0x4A, /* 0x50-0x53 */ + 0xC2, 0xBF, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0x54-0x57 */ + 0xA5, 0x4E, 0xA5, 0x4F, 0xA5, 0x50, 0xA5, 0x51, /* 0x58-0x5B */ + 0xC2, 0xC0, 0xC2, 0xC1, 0xA5, 0x52, 0xC2, 0xC2, /* 0x5C-0x5F */ + 0xC2, 0xC3, 0xC2, 0xC4, 0xA5, 0x53, 0xA5, 0x54, /* 0x60-0x63 */ + 0xA5, 0x55, 0xA5, 0x56, 0xA5, 0x57, 0xA5, 0x58, /* 0x64-0x67 */ + 0xC2, 0xC5, 0xA5, 0x59, 0xA5, 0x5A, 0xA5, 0x61, /* 0x68-0x6B */ + 0xA5, 0x62, 0xA5, 0x63, 0xA5, 0x64, 0xA5, 0x65, /* 0x6C-0x6F */ + 0xA5, 0x66, 0xA5, 0x67, 0xA5, 0x68, 0xA5, 0x69, /* 0x70-0x73 */ + 0xA5, 0x6A, 0xA5, 0x6B, 0xA5, 0x6C, 0xA5, 0x6D, /* 0x74-0x77 */ + 0xA5, 0x6E, 0xA5, 0x6F, 0xA5, 0x70, 0xA5, 0x71, /* 0x78-0x7B */ + 0xA5, 0x72, 0xC2, 0xC6, 0xA5, 0x73, 0xA5, 0x74, /* 0x7C-0x7F */ + + 0xA5, 0x75, 0xA5, 0x76, 0xA5, 0x77, 0xA5, 0x78, /* 0x80-0x83 */ + 0xC2, 0xC7, 0xA5, 0x79, 0xA5, 0x7A, 0xA5, 0x81, /* 0x84-0x87 */ + 0xA5, 0x82, 0xA5, 0x83, 0xA5, 0x84, 0xA5, 0x85, /* 0x88-0x8B */ + 0xA5, 0x86, 0xA5, 0x87, 0xA5, 0x88, 0xA5, 0x89, /* 0x8C-0x8F */ + 0xA5, 0x8A, 0xA5, 0x8B, 0xA5, 0x8C, 0xA5, 0x8D, /* 0x90-0x93 */ + 0xA5, 0x8E, 0xA5, 0x8F, 0xA5, 0x90, 0xA5, 0x91, /* 0x94-0x97 */ + 0xC2, 0xC8, 0xA5, 0x92, 0xA5, 0x93, 0xA5, 0x94, /* 0x98-0x9B */ + 0xA5, 0x95, 0xA5, 0x96, 0xA5, 0x97, 0xA5, 0x98, /* 0x9C-0x9F */ + 0xA5, 0x99, 0xA5, 0x9A, 0xA5, 0x9B, 0xA5, 0x9C, /* 0xA0-0xA3 */ + 0xA5, 0x9D, 0xA5, 0x9E, 0xA5, 0x9F, 0xA5, 0xA0, /* 0xA4-0xA7 */ + 0xA6, 0x41, 0xA6, 0x42, 0xA6, 0x43, 0xA6, 0x44, /* 0xA8-0xAB */ + 0xA6, 0x45, 0xA6, 0x46, 0xA6, 0x47, 0xA6, 0x48, /* 0xAC-0xAF */ + 0xA6, 0x49, 0xA6, 0x4A, 0xA6, 0x4B, 0xA6, 0x4C, /* 0xB0-0xB3 */ + 0xA6, 0x4D, 0xA6, 0x4E, 0xA6, 0x4F, 0xA6, 0x50, /* 0xB4-0xB7 */ + 0xA6, 0x51, 0xA6, 0x52, 0xA6, 0x53, 0xA6, 0x54, /* 0xB8-0xBB */ + 0xC2, 0xC9, 0xC2, 0xCA, 0xA6, 0x55, 0xA6, 0x56, /* 0xBC-0xBF */ + 0xC2, 0xCB, 0xA6, 0x57, 0xA6, 0x58, 0xA6, 0x59, /* 0xC0-0xC3 */ + 0xC2, 0xCC, 0xA6, 0x5A, 0xA6, 0x61, 0xA6, 0x62, /* 0xC4-0xC7 */ + 0xA6, 0x63, 0xA6, 0x64, 0xA6, 0x65, 0xA6, 0x66, /* 0xC8-0xCB */ + 0xC2, 0xCD, 0xC2, 0xCE, 0xA6, 0x67, 0xC2, 0xCF, /* 0xCC-0xCF */ + 
0xA6, 0x68, 0xC2, 0xD0, 0xA6, 0x69, 0xC2, 0xD1, /* 0xD0-0xD3 */ + 0xA6, 0x6A, 0xA6, 0x6B, 0xA6, 0x6C, 0xA6, 0x6D, /* 0xD4-0xD7 */ + 0xC2, 0xD2, 0xC2, 0xD3, 0xA6, 0x6E, 0xA6, 0x6F, /* 0xD8-0xDB */ + 0xA6, 0x70, 0xA6, 0x71, 0xA6, 0x72, 0xA6, 0x73, /* 0xDC-0xDF */ + 0xC2, 0xD4, 0xA6, 0x74, 0xA6, 0x75, 0xA6, 0x76, /* 0xE0-0xE3 */ + 0xA6, 0x77, 0xA6, 0x78, 0xA6, 0x79, 0xA6, 0x7A, /* 0xE4-0xE7 */ + 0xA6, 0x81, 0xA6, 0x82, 0xA6, 0x83, 0xA6, 0x84, /* 0xE8-0xEB */ + 0xC2, 0xD5, 0xA6, 0x85, 0xA6, 0x86, 0xA6, 0x87, /* 0xEC-0xEF */ + 0xA6, 0x88, 0xA6, 0x89, 0xA6, 0x8A, 0xA6, 0x8B, /* 0xF0-0xF3 */ + 0xC2, 0xD6, 0xA6, 0x8C, 0xA6, 0x8D, 0xA6, 0x8E, /* 0xF4-0xF7 */ + 0xA6, 0x8F, 0xA6, 0x90, 0xA6, 0x91, 0xA6, 0x92, /* 0xF8-0xFB */ + 0xA6, 0x93, 0xA6, 0x94, 0xA6, 0x95, 0xA6, 0x96, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CB[512] = { + 0xA6, 0x97, 0xA6, 0x98, 0xA6, 0x99, 0xA6, 0x9A, /* 0x00-0x03 */ + 0xA6, 0x9B, 0xA6, 0x9C, 0xA6, 0x9D, 0xA6, 0x9E, /* 0x04-0x07 */ + 0xC2, 0xD7, 0xA6, 0x9F, 0xA6, 0xA0, 0xA7, 0x41, /* 0x08-0x0B */ + 0xA7, 0x42, 0xA7, 0x43, 0xA7, 0x44, 0xA7, 0x45, /* 0x0C-0x0F */ + 0xC2, 0xD8, 0xA7, 0x46, 0xA7, 0x47, 0xA7, 0x48, /* 0x10-0x13 */ + 0xC2, 0xD9, 0xA7, 0x49, 0xA7, 0x4A, 0xA7, 0x4B, /* 0x14-0x17 */ + 0xC2, 0xDA, 0xA7, 0x4C, 0xA7, 0x4D, 0xA7, 0x4E, /* 0x18-0x1B */ + 0xA7, 0x4F, 0xA7, 0x50, 0xA7, 0x51, 0xA7, 0x52, /* 0x1C-0x1F */ + 0xC2, 0xDB, 0xC2, 0xDC, 0xA7, 0x53, 0xA7, 0x54, /* 0x20-0x23 */ + 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x57, 0xA7, 0x58, /* 0x24-0x27 */ + 0xA7, 0x59, 0xA7, 0x5A, 0xA7, 0x61, 0xA7, 0x62, /* 0x28-0x2B */ + 0xA7, 0x63, 0xA7, 0x64, 0xA7, 0x65, 0xA7, 0x66, /* 0x2C-0x2F */ + 0xA7, 0x67, 0xA7, 0x68, 0xA7, 0x69, 0xA7, 0x6A, /* 0x30-0x33 */ + 0xA7, 0x6B, 0xA7, 0x6C, 0xA7, 0x6D, 0xA7, 0x6E, /* 0x34-0x37 */ + 0xA7, 0x6F, 0xA7, 0x70, 0xA7, 0x71, 0xA7, 0x72, /* 0x38-0x3B */ + 0xA7, 0x73, 0xA7, 0x74, 0xA7, 0x75, 0xA7, 0x76, /* 0x3C-0x3F */ + 0xA7, 0x77, 0xC2, 0xDD, 0xA7, 0x78, 0xA7, 0x79, /* 0x40-0x43 */ + 0xA7, 0x7A, 0xA7, 0x81, 0xA7, 0x82, 0xA7, 0x83, /* 0x44-0x47 */ + 0xC2, 0xDE, 0xC2, 0xDF, 0xA7, 0x84, 0xA7, 0x85, /* 0x48-0x4B */ + 0xC2, 0xE0, 0xA7, 0x86, 0xA7, 0x87, 0xA7, 0x88, /* 0x4C-0x4F */ + 0xC2, 0xE1, 0xA7, 0x89, 0xA7, 0x8A, 0xA7, 0x8B, /* 0x50-0x53 */ + 0xA7, 0x8C, 0xA7, 0x8D, 0xA7, 0x8E, 0xA7, 0x8F, /* 0x54-0x57 */ + 0xC2, 0xE2, 0xC2, 0xE3, 0xA7, 0x90, 0xA7, 0x91, /* 0x58-0x5B */ + 0xA7, 0x92, 0xC2, 0xE4, 0xA7, 0x93, 0xA7, 0x94, /* 0x5C-0x5F */ + 0xA7, 0x95, 0xA7, 0x96, 0xA7, 0x97, 0xA7, 0x98, /* 0x60-0x63 */ + 0xC2, 0xE5, 0xA7, 0x99, 0xA7, 0x9A, 0xA7, 0x9B, /* 0x64-0x67 */ + 0xA7, 0x9C, 0xA7, 0x9D, 0xA7, 0x9E, 0xA7, 0x9F, /* 0x68-0x6B */ + 0xA7, 0xA0, 0xA8, 0x41, 0xA8, 0x42, 0xA8, 0x43, /* 0x6C-0x6F */ + 0xA8, 0x44, 0xA8, 0x45, 0xA8, 0x46, 0xA8, 0x47, /* 0x70-0x73 */ + 0xA8, 0x48, 0xA8, 0x49, 0xA8, 0x4A, 0xA8, 0x4B, /* 0x74-0x77 */ + 0xC2, 0xE6, 0xC2, 0xE7, 0xA8, 0x4C, 0xA8, 0x4D, /* 0x78-0x7B */ + 0xA8, 0x4E, 0xA8, 0x4F, 0xA8, 0x50, 0xA8, 0x51, /* 0x7C-0x7F */ + + 0xA8, 0x52, 0xA8, 0x53, 0xA8, 0x54, 0xA8, 0x55, /* 0x80-0x83 */ + 0xA8, 0x56, 0xA8, 0x57, 0xA8, 0x58, 0xA8, 0x59, /* 0x84-0x87 */ + 0xA8, 0x5A, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x88-0x8B */ + 0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x8C-0x8F */ + 0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x90-0x93 */ + 0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x94-0x97 */ + 0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x98-0x9B */ + 0xC2, 0xE8, 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, /* 0x9C-0x9F */ + 0xA8, 0x77, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0xA0-0xA3 */ + 
0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, 0xA8, 0x84, /* 0xA4-0xA7 */ + 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, 0xA8, 0x88, /* 0xA8-0xAB */ + 0xA8, 0x89, 0xA8, 0x8A, 0xA8, 0x8B, 0xA8, 0x8C, /* 0xAC-0xAF */ + 0xA8, 0x8D, 0xA8, 0x8E, 0xA8, 0x8F, 0xA8, 0x90, /* 0xB0-0xB3 */ + 0xA8, 0x91, 0xA8, 0x92, 0xA8, 0x93, 0xA8, 0x94, /* 0xB4-0xB7 */ + 0xC2, 0xE9, 0xA8, 0x95, 0xA8, 0x96, 0xA8, 0x97, /* 0xB8-0xBB */ + 0xA8, 0x98, 0xA8, 0x99, 0xA8, 0x9A, 0xA8, 0x9B, /* 0xBC-0xBF */ + 0xA8, 0x9C, 0xA8, 0x9D, 0xA8, 0x9E, 0xA8, 0x9F, /* 0xC0-0xC3 */ + 0xA8, 0xA0, 0xA9, 0x41, 0xA9, 0x42, 0xA9, 0x43, /* 0xC4-0xC7 */ + 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, 0xA9, 0x47, /* 0xC8-0xCB */ + 0xA9, 0x48, 0xA9, 0x49, 0xA9, 0x4A, 0xA9, 0x4B, /* 0xCC-0xCF */ + 0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0xA9, 0x4F, /* 0xD0-0xD3 */ + 0xC2, 0xEA, 0xA9, 0x50, 0xA9, 0x51, 0xA9, 0x52, /* 0xD4-0xD7 */ + 0xA9, 0x53, 0xA9, 0x54, 0xA9, 0x55, 0xA9, 0x56, /* 0xD8-0xDB */ + 0xA9, 0x57, 0xA9, 0x58, 0xA9, 0x59, 0xA9, 0x5A, /* 0xDC-0xDF */ + 0xA9, 0x61, 0xA9, 0x62, 0xA9, 0x63, 0xA9, 0x64, /* 0xE0-0xE3 */ + 0xC2, 0xEB, 0xA9, 0x65, 0xA9, 0x66, 0xC2, 0xEC, /* 0xE4-0xE7 */ + 0xA9, 0x67, 0xC2, 0xED, 0xA9, 0x68, 0xA9, 0x69, /* 0xE8-0xEB */ + 0xA9, 0x6A, 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, /* 0xEC-0xEF */ + 0xA9, 0x6E, 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, /* 0xF0-0xF3 */ + 0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0xF4-0xF7 */ + 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, 0xA9, 0x79, /* 0xF8-0xFB */ + 0xA9, 0x7A, 0xA9, 0x81, 0xA9, 0x82, 0xA9, 0x83, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CC[512] = { + 0xA9, 0x84, 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, /* 0x00-0x03 */ + 0xA9, 0x88, 0xA9, 0x89, 0xA9, 0x8A, 0xA9, 0x8B, /* 0x04-0x07 */ + 0xA9, 0x8C, 0xA9, 0x8D, 0xA9, 0x8E, 0xA9, 0x8F, /* 0x08-0x0B */ + 0xC2, 0xEE, 0xC2, 0xEF, 0xA9, 0x90, 0xA9, 0x91, /* 0x0C-0x0F */ + 0xC2, 0xF0, 0xA9, 0x92, 0xA9, 0x93, 0xA9, 0x94, /* 0x10-0x13 */ + 0xC2, 0xF1, 0xA9, 0x95, 0xA9, 0x96, 0xA9, 0x97, /* 0x14-0x17 */ + 0xA9, 0x98, 0xA9, 0x99, 0xA9, 0x9A, 0xA9, 0x9B, /* 0x18-0x1B */ + 0xC2, 0xF2, 0xC2, 0xF3, 0xA9, 0x9C, 0xA9, 0x9D, /* 0x1C-0x1F */ + 0xA9, 0x9E, 0xC2, 0xF4, 0xC2, 0xF5, 0xA9, 0x9F, /* 0x20-0x23 */ + 0xA9, 0xA0, 0xAA, 0x41, 0xAA, 0x42, 0xC2, 0xF6, /* 0x24-0x27 */ + 0xC2, 0xF7, 0xC2, 0xF8, 0xAA, 0x43, 0xAA, 0x44, /* 0x28-0x2B */ + 0xC2, 0xF9, 0xAA, 0x45, 0xC2, 0xFA, 0xAA, 0x46, /* 0x2C-0x2F */ + 0xC2, 0xFB, 0xAA, 0x47, 0xAA, 0x48, 0xAA, 0x49, /* 0x30-0x33 */ + 0xAA, 0x4A, 0xAA, 0x4B, 0xAA, 0x4C, 0xAA, 0x4D, /* 0x34-0x37 */ + 0xC2, 0xFC, 0xC2, 0xFD, 0xAA, 0x4E, 0xC2, 0xFE, /* 0x38-0x3B */ + 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, 0xAA, 0x4F, /* 0x3C-0x3F */ + 0xAA, 0x50, 0xAA, 0x51, 0xAA, 0x52, 0xAA, 0x53, /* 0x40-0x43 */ + 0xC3, 0xA4, 0xC3, 0xA5, 0xAA, 0x54, 0xAA, 0x55, /* 0x44-0x47 */ + 0xC3, 0xA6, 0xAA, 0x56, 0xAA, 0x57, 0xAA, 0x58, /* 0x48-0x4B */ + 0xC3, 0xA7, 0xAA, 0x59, 0xAA, 0x5A, 0xAA, 0x61, /* 0x4C-0x4F */ + 0xAA, 0x62, 0xAA, 0x63, 0xAA, 0x64, 0xAA, 0x65, /* 0x50-0x53 */ + 0xC3, 0xA8, 0xC3, 0xA9, 0xAA, 0x66, 0xC3, 0xAA, /* 0x54-0x57 */ + 0xC3, 0xAB, 0xC3, 0xAC, 0xAA, 0x67, 0xAA, 0x68, /* 0x58-0x5B */ + 0xAA, 0x69, 0xAA, 0x6A, 0xAA, 0x6B, 0xAA, 0x6C, /* 0x5C-0x5F */ + 0xC3, 0xAD, 0xAA, 0x6D, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x60-0x63 */ + 0xC3, 0xAE, 0xAA, 0x70, 0xC3, 0xAF, 0xAA, 0x71, /* 0x64-0x67 */ + 0xC3, 0xB0, 0xAA, 0x72, 0xAA, 0x73, 0xAA, 0x74, /* 0x68-0x6B */ + 0xAA, 0x75, 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, /* 0x6C-0x6F */ + 0xC3, 0xB1, 0xAA, 0x79, 0xAA, 0x7A, 0xAA, 0x81, /* 0x70-0x73 */ + 0xAA, 0x82, 0xC3, 0xB2, 0xAA, 0x83, 0xAA, 0x84, /* 0x74-0x77 */ + 0xAA, 
0x85, 0xAA, 0x86, 0xAA, 0x87, 0xAA, 0x88, /* 0x78-0x7B */ + 0xAA, 0x89, 0xAA, 0x8A, 0xAA, 0x8B, 0xAA, 0x8C, /* 0x7C-0x7F */ + + 0xAA, 0x8D, 0xAA, 0x8E, 0xAA, 0x8F, 0xAA, 0x90, /* 0x80-0x83 */ + 0xAA, 0x91, 0xAA, 0x92, 0xAA, 0x93, 0xAA, 0x94, /* 0x84-0x87 */ + 0xAA, 0x95, 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, /* 0x88-0x8B */ + 0xAA, 0x99, 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, /* 0x8C-0x8F */ + 0xAA, 0x9D, 0xAA, 0x9E, 0xAA, 0x9F, 0xAA, 0xA0, /* 0x90-0x93 */ + 0xAB, 0x41, 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, /* 0x94-0x97 */ + 0xC3, 0xB3, 0xC3, 0xB4, 0xAB, 0x45, 0xAB, 0x46, /* 0x98-0x9B */ + 0xC3, 0xB5, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x9C-0x9F */ + 0xC3, 0xB6, 0xAB, 0x4A, 0xAB, 0x4B, 0xAB, 0x4C, /* 0xA0-0xA3 */ + 0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0xA4-0xA7 */ + 0xC3, 0xB7, 0xC3, 0xB8, 0xAB, 0x51, 0xC3, 0xB9, /* 0xA8-0xAB */ + 0xC3, 0xBA, 0xC3, 0xBB, 0xAB, 0x52, 0xAB, 0x53, /* 0xAC-0xAF */ + 0xAB, 0x54, 0xAB, 0x55, 0xAB, 0x56, 0xAB, 0x57, /* 0xB0-0xB3 */ + 0xC3, 0xBC, 0xC3, 0xBD, 0xAB, 0x58, 0xAB, 0x59, /* 0xB4-0xB7 */ + 0xC3, 0xBE, 0xAB, 0x5A, 0xAB, 0x61, 0xAB, 0x62, /* 0xB8-0xBB */ + 0xC3, 0xBF, 0xAB, 0x63, 0xAB, 0x64, 0xAB, 0x65, /* 0xBC-0xBF */ + 0xAB, 0x66, 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, /* 0xC0-0xC3 */ + 0xC3, 0xC0, 0xC3, 0xC1, 0xAB, 0x6A, 0xC3, 0xC2, /* 0xC4-0xC7 */ + 0xAB, 0x6B, 0xC3, 0xC3, 0xAB, 0x6C, 0xAB, 0x6D, /* 0xC8-0xCB */ + 0xAB, 0x6E, 0xAB, 0x6F, 0xAB, 0x70, 0xAB, 0x71, /* 0xCC-0xCF */ + 0xC3, 0xC4, 0xAB, 0x72, 0xAB, 0x73, 0xAB, 0x74, /* 0xD0-0xD3 */ + 0xC3, 0xC5, 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, /* 0xD4-0xD7 */ + 0xAB, 0x78, 0xAB, 0x79, 0xAB, 0x7A, 0xAB, 0x81, /* 0xD8-0xDB */ + 0xAB, 0x82, 0xAB, 0x83, 0xAB, 0x84, 0xAB, 0x85, /* 0xDC-0xDF */ + 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, 0xAB, 0x89, /* 0xE0-0xE3 */ + 0xC3, 0xC6, 0xAB, 0x8A, 0xAB, 0x8B, 0xAB, 0x8C, /* 0xE4-0xE7 */ + 0xAB, 0x8D, 0xAB, 0x8E, 0xAB, 0x8F, 0xAB, 0x90, /* 0xE8-0xEB */ + 0xC3, 0xC7, 0xAB, 0x91, 0xAB, 0x92, 0xAB, 0x93, /* 0xEC-0xEF */ + 0xC3, 0xC8, 0xAB, 0x94, 0xAB, 0x95, 0xAB, 0x96, /* 0xF0-0xF3 */ + 0xAB, 0x97, 0xAB, 0x98, 0xAB, 0x99, 0xAB, 0x9A, /* 0xF4-0xF7 */ + 0xAB, 0x9B, 0xAB, 0x9C, 0xAB, 0x9D, 0xAB, 0x9E, /* 0xF8-0xFB */ + 0xAB, 0x9F, 0xAB, 0xA0, 0xAC, 0x41, 0xAC, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CD[512] = { + 0xAC, 0x43, 0xC3, 0xC9, 0xAC, 0x44, 0xAC, 0x45, /* 0x00-0x03 */ + 0xAC, 0x46, 0xAC, 0x47, 0xAC, 0x48, 0xAC, 0x49, /* 0x04-0x07 */ + 0xC3, 0xCA, 0xC3, 0xCB, 0xAC, 0x4A, 0xAC, 0x4B, /* 0x08-0x0B */ + 0xC3, 0xCC, 0xAC, 0x4C, 0xAC, 0x4D, 0xAC, 0x4E, /* 0x0C-0x0F */ + 0xC3, 0xCD, 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, /* 0x10-0x13 */ + 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, 0xAC, 0x55, /* 0x14-0x17 */ + 0xC3, 0xCE, 0xC3, 0xCF, 0xAC, 0x56, 0xC3, 0xD0, /* 0x18-0x1B */ + 0xAC, 0x57, 0xC3, 0xD1, 0xAC, 0x58, 0xAC, 0x59, /* 0x1C-0x1F */ + 0xAC, 0x5A, 0xAC, 0x61, 0xAC, 0x62, 0xAC, 0x63, /* 0x20-0x23 */ + 0xC3, 0xD2, 0xAC, 0x64, 0xAC, 0x65, 0xAC, 0x66, /* 0x24-0x27 */ + 0xC3, 0xD3, 0xAC, 0x67, 0xAC, 0x68, 0xAC, 0x69, /* 0x28-0x2B */ + 0xC3, 0xD4, 0xAC, 0x6A, 0xAC, 0x6B, 0xAC, 0x6C, /* 0x2C-0x2F */ + 0xAC, 0x6D, 0xAC, 0x6E, 0xAC, 0x6F, 0xAC, 0x70, /* 0x30-0x33 */ + 0xAC, 0x71, 0xAC, 0x72, 0xAC, 0x73, 0xAC, 0x74, /* 0x34-0x37 */ + 0xAC, 0x75, 0xC3, 0xD5, 0xAC, 0x76, 0xAC, 0x77, /* 0x38-0x3B */ + 0xAC, 0x78, 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x81, /* 0x3C-0x3F */ + 0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x40-0x43 */ + 0xAC, 0x86, 0xAC, 0x87, 0xAC, 0x88, 0xAC, 0x89, /* 0x44-0x47 */ + 0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x48-0x4B */ + 0xAC, 
0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x4C-0x4F */ + 0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x50-0x53 */ + 0xAC, 0x96, 0xAC, 0x97, 0xAC, 0x98, 0xAC, 0x99, /* 0x54-0x57 */ + 0xAC, 0x9A, 0xAC, 0x9B, 0xAC, 0x9C, 0xAC, 0x9D, /* 0x58-0x5B */ + 0xC3, 0xD6, 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, /* 0x5C-0x5F */ + 0xC3, 0xD7, 0xAD, 0x41, 0xAD, 0x42, 0xAD, 0x43, /* 0x60-0x63 */ + 0xC3, 0xD8, 0xAD, 0x44, 0xAD, 0x45, 0xAD, 0x46, /* 0x64-0x67 */ + 0xAD, 0x47, 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, /* 0x68-0x6B */ + 0xC3, 0xD9, 0xC3, 0xDA, 0xAD, 0x4B, 0xC3, 0xDB, /* 0x6C-0x6F */ + 0xAD, 0x4C, 0xC3, 0xDC, 0xAD, 0x4D, 0xAD, 0x4E, /* 0x70-0x73 */ + 0xAD, 0x4F, 0xAD, 0x50, 0xAD, 0x51, 0xAD, 0x52, /* 0x74-0x77 */ + 0xC3, 0xDD, 0xAD, 0x53, 0xAD, 0x54, 0xAD, 0x55, /* 0x78-0x7B */ + 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, 0xAD, 0x59, /* 0x7C-0x7F */ + + 0xAD, 0x5A, 0xAD, 0x61, 0xAD, 0x62, 0xAD, 0x63, /* 0x80-0x83 */ + 0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0x84-0x87 */ + 0xC3, 0xDE, 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, /* 0x88-0x8B */ + 0xAD, 0x6B, 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, /* 0x8C-0x8F */ + 0xAD, 0x6F, 0xAD, 0x70, 0xAD, 0x71, 0xAD, 0x72, /* 0x90-0x93 */ + 0xC3, 0xDF, 0xC3, 0xE0, 0xAD, 0x73, 0xAD, 0x74, /* 0x94-0x97 */ + 0xC3, 0xE1, 0xAD, 0x75, 0xAD, 0x76, 0xAD, 0x77, /* 0x98-0x9B */ + 0xC3, 0xE2, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0x9C-0x9F */ + 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, 0xAD, 0x84, /* 0xA0-0xA3 */ + 0xC3, 0xE3, 0xC3, 0xE4, 0xAD, 0x85, 0xC3, 0xE5, /* 0xA4-0xA7 */ + 0xAD, 0x86, 0xC3, 0xE6, 0xAD, 0x87, 0xAD, 0x88, /* 0xA8-0xAB */ + 0xAD, 0x89, 0xAD, 0x8A, 0xAD, 0x8B, 0xAD, 0x8C, /* 0xAC-0xAF */ + 0xC3, 0xE7, 0xAD, 0x8D, 0xAD, 0x8E, 0xAD, 0x8F, /* 0xB0-0xB3 */ + 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, 0xAD, 0x93, /* 0xB4-0xB7 */ + 0xAD, 0x94, 0xAD, 0x95, 0xAD, 0x96, 0xAD, 0x97, /* 0xB8-0xBB */ + 0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xBC-0xBF */ + 0xAD, 0x9C, 0xAD, 0x9D, 0xAD, 0x9E, 0xAD, 0x9F, /* 0xC0-0xC3 */ + 0xC3, 0xE8, 0xAD, 0xA0, 0xAE, 0x41, 0xAE, 0x42, /* 0xC4-0xC7 */ + 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, 0xAE, 0x46, /* 0xC8-0xCB */ + 0xC3, 0xE9, 0xAE, 0x47, 0xAE, 0x48, 0xAE, 0x49, /* 0xCC-0xCF */ + 0xC3, 0xEA, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0xD0-0xD3 */ + 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, 0xAE, 0x50, /* 0xD4-0xD7 */ + 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, 0xAE, 0x54, /* 0xD8-0xDB */ + 0xAE, 0x55, 0xAE, 0x56, 0xAE, 0x57, 0xAE, 0x58, /* 0xDC-0xDF */ + 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x61, 0xAE, 0x62, /* 0xE0-0xE3 */ + 0xAE, 0x63, 0xAE, 0x64, 0xAE, 0x65, 0xAE, 0x66, /* 0xE4-0xE7 */ + 0xC3, 0xEB, 0xAE, 0x67, 0xAE, 0x68, 0xAE, 0x69, /* 0xE8-0xEB */ + 0xC3, 0xEC, 0xAE, 0x6A, 0xAE, 0x6B, 0xAE, 0x6C, /* 0xEC-0xEF */ + 0xC3, 0xED, 0xAE, 0x6D, 0xAE, 0x6E, 0xAE, 0x6F, /* 0xF0-0xF3 */ + 0xAE, 0x70, 0xAE, 0x71, 0xAE, 0x72, 0xAE, 0x73, /* 0xF4-0xF7 */ + 0xC3, 0xEE, 0xC3, 0xEF, 0xAE, 0x74, 0xC3, 0xF0, /* 0xF8-0xFB */ + 0xAE, 0x75, 0xC3, 0xF1, 0xAE, 0x76, 0xAE, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CE[512] = { + 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, 0xAE, 0x81, /* 0x00-0x03 */ + 0xC3, 0xF2, 0xAE, 0x82, 0xAE, 0x83, 0xAE, 0x84, /* 0x04-0x07 */ + 0xC3, 0xF3, 0xAE, 0x85, 0xAE, 0x86, 0xAE, 0x87, /* 0x08-0x0B */ + 0xC3, 0xF4, 0xAE, 0x88, 0xAE, 0x89, 0xAE, 0x8A, /* 0x0C-0x0F */ + 0xAE, 0x8B, 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, /* 0x10-0x13 */ + 0xC3, 0xF5, 0xAE, 0x8F, 0xAE, 0x90, 0xAE, 0x91, /* 0x14-0x17 */ + 0xAE, 0x92, 0xC3, 0xF6, 0xAE, 0x93, 0xAE, 0x94, /* 0x18-0x1B */ + 0xAE, 0x95, 0xAE, 0x96, 0xAE, 0x97, 0xAE, 0x98, /* 0x1C-0x1F */ + 0xC3, 
0xF7, 0xC3, 0xF8, 0xAE, 0x99, 0xAE, 0x9A, /* 0x20-0x23 */ + 0xC3, 0xF9, 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, /* 0x24-0x27 */ + 0xC3, 0xFA, 0xAE, 0x9E, 0xAE, 0x9F, 0xAE, 0xA0, /* 0x28-0x2B */ + 0xAF, 0x41, 0xAF, 0x42, 0xAF, 0x43, 0xAF, 0x44, /* 0x2C-0x2F */ + 0xC3, 0xFB, 0xC3, 0xFC, 0xAF, 0x45, 0xC3, 0xFD, /* 0x30-0x33 */ + 0xAF, 0x46, 0xC3, 0xFE, 0xAF, 0x47, 0xAF, 0x48, /* 0x34-0x37 */ + 0xAF, 0x49, 0xAF, 0x4A, 0xAF, 0x4B, 0xAF, 0x4C, /* 0x38-0x3B */ + 0xAF, 0x4D, 0xAF, 0x4E, 0xAF, 0x4F, 0xAF, 0x50, /* 0x3C-0x3F */ + 0xAF, 0x51, 0xAF, 0x52, 0xAF, 0x53, 0xAF, 0x54, /* 0x40-0x43 */ + 0xAF, 0x55, 0xAF, 0x56, 0xAF, 0x57, 0xAF, 0x58, /* 0x44-0x47 */ + 0xAF, 0x59, 0xAF, 0x5A, 0xAF, 0x61, 0xAF, 0x62, /* 0x48-0x4B */ + 0xAF, 0x63, 0xAF, 0x64, 0xAF, 0x65, 0xAF, 0x66, /* 0x4C-0x4F */ + 0xAF, 0x67, 0xAF, 0x68, 0xAF, 0x69, 0xAF, 0x6A, /* 0x50-0x53 */ + 0xAF, 0x6B, 0xAF, 0x6C, 0xAF, 0x6D, 0xAF, 0x6E, /* 0x54-0x57 */ + 0xC4, 0xA1, 0xC4, 0xA2, 0xAF, 0x6F, 0xAF, 0x70, /* 0x58-0x5B */ + 0xC4, 0xA3, 0xAF, 0x71, 0xAF, 0x72, 0xC4, 0xA4, /* 0x5C-0x5F */ + 0xC4, 0xA5, 0xC4, 0xA6, 0xAF, 0x73, 0xAF, 0x74, /* 0x60-0x63 */ + 0xAF, 0x75, 0xAF, 0x76, 0xAF, 0x77, 0xAF, 0x78, /* 0x64-0x67 */ + 0xC4, 0xA7, 0xC4, 0xA8, 0xAF, 0x79, 0xC4, 0xA9, /* 0x68-0x6B */ + 0xAF, 0x7A, 0xC4, 0xAA, 0xAF, 0x81, 0xAF, 0x82, /* 0x6C-0x6F */ + 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, 0xAF, 0x86, /* 0x70-0x73 */ + 0xC4, 0xAB, 0xC4, 0xAC, 0xAF, 0x87, 0xAF, 0x88, /* 0x74-0x77 */ + 0xC4, 0xAD, 0xAF, 0x89, 0xAF, 0x8A, 0xAF, 0x8B, /* 0x78-0x7B */ + 0xC4, 0xAE, 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, /* 0x7C-0x7F */ + + 0xAF, 0x8F, 0xAF, 0x90, 0xAF, 0x91, 0xAF, 0x92, /* 0x80-0x83 */ + 0xC4, 0xAF, 0xC4, 0xB0, 0xAF, 0x93, 0xC4, 0xB1, /* 0x84-0x87 */ + 0xAF, 0x94, 0xC4, 0xB2, 0xAF, 0x95, 0xAF, 0x96, /* 0x88-0x8B */ + 0xAF, 0x97, 0xAF, 0x98, 0xAF, 0x99, 0xAF, 0x9A, /* 0x8C-0x8F */ + 0xC4, 0xB3, 0xC4, 0xB4, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x90-0x93 */ + 0xC4, 0xB5, 0xAF, 0x9D, 0xAF, 0x9E, 0xAF, 0x9F, /* 0x94-0x97 */ + 0xC4, 0xB6, 0xAF, 0xA0, 0xB0, 0x41, 0xB0, 0x42, /* 0x98-0x9B */ + 0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x9C-0x9F */ + 0xC4, 0xB7, 0xC4, 0xB8, 0xB0, 0x47, 0xC4, 0xB9, /* 0xA0-0xA3 */ + 0xC4, 0xBA, 0xC4, 0xBB, 0xB0, 0x48, 0xB0, 0x49, /* 0xA4-0xA7 */ + 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, 0xB0, 0x4D, /* 0xA8-0xAB */ + 0xC4, 0xBC, 0xC4, 0xBD, 0xB0, 0x4E, 0xB0, 0x4F, /* 0xAC-0xAF */ + 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, 0xB0, 0x53, /* 0xB0-0xB3 */ + 0xB0, 0x54, 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, /* 0xB4-0xB7 */ + 0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x61, /* 0xB8-0xBB */ + 0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0xBC-0xBF */ + 0xB0, 0x66, 0xC4, 0xBE, 0xB0, 0x67, 0xB0, 0x68, /* 0xC0-0xC3 */ + 0xB0, 0x69, 0xB0, 0x6A, 0xB0, 0x6B, 0xB0, 0x6C, /* 0xC4-0xC7 */ + 0xB0, 0x6D, 0xB0, 0x6E, 0xB0, 0x6F, 0xB0, 0x70, /* 0xC8-0xCB */ + 0xB0, 0x71, 0xB0, 0x72, 0xB0, 0x73, 0xB0, 0x74, /* 0xCC-0xCF */ + 0xB0, 0x75, 0xB0, 0x76, 0xB0, 0x77, 0xB0, 0x78, /* 0xD0-0xD3 */ + 0xB0, 0x79, 0xB0, 0x7A, 0xB0, 0x81, 0xB0, 0x82, /* 0xD4-0xD7 */ + 0xB0, 0x83, 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, /* 0xD8-0xDB */ + 0xB0, 0x87, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xDC-0xDF */ + 0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xE0-0xE3 */ + 0xC4, 0xBF, 0xC4, 0xC0, 0xB0, 0x8F, 0xB0, 0x90, /* 0xE4-0xE7 */ + 0xC4, 0xC1, 0xB0, 0x91, 0xB0, 0x92, 0xC4, 0xC2, /* 0xE8-0xEB */ + 0xC4, 0xC3, 0xB0, 0x93, 0xB0, 0x94, 0xB0, 0x95, /* 0xEC-0xEF */ + 0xB0, 0x96, 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, /* 0xF0-0xF3 */ + 0xC4, 0xC4, 0xC4, 0xC5, 0xB0, 0x9A, 0xC4, 0xC6, /* 0xF4-0xF7 
*/ + 0xC4, 0xC7, 0xC4, 0xC8, 0xB0, 0x9B, 0xB0, 0x9C, /* 0xF8-0xFB */ + 0xB0, 0x9D, 0xB0, 0x9E, 0xB0, 0x9F, 0xB0, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CF[512] = { + 0xC4, 0xC9, 0xC4, 0xCA, 0xB1, 0x41, 0xB1, 0x42, /* 0x00-0x03 */ + 0xC4, 0xCB, 0xB1, 0x43, 0xB1, 0x44, 0xB1, 0x45, /* 0x04-0x07 */ + 0xC4, 0xCC, 0xB1, 0x46, 0xB1, 0x47, 0xB1, 0x48, /* 0x08-0x0B */ + 0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xB1, 0x4C, /* 0x0C-0x0F */ + 0xC4, 0xCD, 0xC4, 0xCE, 0xB1, 0x4D, 0xC4, 0xCF, /* 0x10-0x13 */ + 0xB1, 0x4E, 0xC4, 0xD0, 0xB1, 0x4F, 0xB1, 0x50, /* 0x14-0x17 */ + 0xB1, 0x51, 0xB1, 0x52, 0xB1, 0x53, 0xB1, 0x54, /* 0x18-0x1B */ + 0xC4, 0xD1, 0xB1, 0x55, 0xB1, 0x56, 0xB1, 0x57, /* 0x1C-0x1F */ + 0xC4, 0xD2, 0xB1, 0x58, 0xB1, 0x59, 0xB1, 0x5A, /* 0x20-0x23 */ + 0xC4, 0xD3, 0xB1, 0x61, 0xB1, 0x62, 0xB1, 0x63, /* 0x24-0x27 */ + 0xB1, 0x64, 0xB1, 0x65, 0xB1, 0x66, 0xB1, 0x67, /* 0x28-0x2B */ + 0xC4, 0xD4, 0xC4, 0xD5, 0xB1, 0x68, 0xC4, 0xD6, /* 0x2C-0x2F */ + 0xC4, 0xD7, 0xC4, 0xD8, 0xB1, 0x69, 0xB1, 0x6A, /* 0x30-0x33 */ + 0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x34-0x37 */ + 0xC4, 0xD9, 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, /* 0x38-0x3B */ + 0xB1, 0x72, 0xB1, 0x73, 0xB1, 0x74, 0xB1, 0x75, /* 0x3C-0x3F */ + 0xB1, 0x76, 0xB1, 0x77, 0xB1, 0x78, 0xB1, 0x79, /* 0x40-0x43 */ + 0xB1, 0x7A, 0xB1, 0x81, 0xB1, 0x82, 0xB1, 0x83, /* 0x44-0x47 */ + 0xB1, 0x84, 0xB1, 0x85, 0xB1, 0x86, 0xB1, 0x87, /* 0x48-0x4B */ + 0xB1, 0x88, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x4C-0x4F */ + 0xB1, 0x8C, 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, /* 0x50-0x53 */ + 0xC4, 0xDA, 0xC4, 0xDB, 0xB1, 0x90, 0xB1, 0x91, /* 0x54-0x57 */ + 0xC4, 0xDC, 0xB1, 0x92, 0xB1, 0x93, 0xB1, 0x94, /* 0x58-0x5B */ + 0xC4, 0xDD, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x5C-0x5F */ + 0xB1, 0x98, 0xB1, 0x99, 0xB1, 0x9A, 0xB1, 0x9B, /* 0x60-0x63 */ + 0xC4, 0xDE, 0xC4, 0xDF, 0xB1, 0x9C, 0xC4, 0xE0, /* 0x64-0x67 */ + 0xB1, 0x9D, 0xC4, 0xE1, 0xB1, 0x9E, 0xB1, 0x9F, /* 0x68-0x6B */ + 0xB1, 0xA0, 0xB2, 0x41, 0xB2, 0x42, 0xB2, 0x43, /* 0x6C-0x6F */ + 0xC4, 0xE2, 0xC4, 0xE3, 0xB2, 0x44, 0xB2, 0x45, /* 0x70-0x73 */ + 0xC4, 0xE4, 0xB2, 0x46, 0xB2, 0x47, 0xB2, 0x48, /* 0x74-0x77 */ + 0xC4, 0xE5, 0xB2, 0x49, 0xB2, 0x4A, 0xB2, 0x4B, /* 0x78-0x7B */ + 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, 0xB2, 0x4F, /* 0x7C-0x7F */ + + 0xC4, 0xE6, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x80-0x83 */ + 0xB2, 0x53, 0xC4, 0xE7, 0xB2, 0x54, 0xB2, 0x55, /* 0x84-0x87 */ + 0xB2, 0x56, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x88-0x8B */ + 0xC4, 0xE8, 0xB2, 0x5A, 0xB2, 0x61, 0xB2, 0x62, /* 0x8C-0x8F */ + 0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x90-0x93 */ + 0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x94-0x97 */ + 0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xB2, 0x6E, /* 0x98-0x9B */ + 0xB2, 0x6F, 0xB2, 0x70, 0xB2, 0x71, 0xB2, 0x72, /* 0x9C-0x9F */ + 0xB2, 0x73, 0xC4, 0xE9, 0xB2, 0x74, 0xB2, 0x75, /* 0xA0-0xA3 */ + 0xB2, 0x76, 0xB2, 0x77, 0xB2, 0x78, 0xB2, 0x79, /* 0xA4-0xA7 */ + 0xC4, 0xEA, 0xB2, 0x7A, 0xB2, 0x81, 0xB2, 0x82, /* 0xA8-0xAB */ + 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, 0xB2, 0x86, /* 0xAC-0xAF */ + 0xC4, 0xEB, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xB0-0xB3 */ + 0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xB4-0xB7 */ + 0xB2, 0x8E, 0xB2, 0x8F, 0xB2, 0x90, 0xB2, 0x91, /* 0xB8-0xBB */ + 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, 0xB2, 0x95, /* 0xBC-0xBF */ + 0xB2, 0x96, 0xB2, 0x97, 0xB2, 0x98, 0xB2, 0x99, /* 0xC0-0xC3 */ + 0xC4, 0xEC, 0xB2, 0x9A, 0xB2, 0x9B, 0xB2, 0x9C, /* 0xC4-0xC7 */ + 0xB2, 0x9D, 0xB2, 0x9E, 0xB2, 0x9F, 0xB2, 0xA0, /* 0xC8-0xCB */ + 
0xB3, 0x41, 0xB3, 0x42, 0xB3, 0x43, 0xB3, 0x44, /* 0xCC-0xCF */ + 0xB3, 0x45, 0xB3, 0x46, 0xB3, 0x47, 0xB3, 0x48, /* 0xD0-0xD3 */ + 0xB3, 0x49, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xD4-0xD7 */ + 0xB3, 0x4D, 0xB3, 0x4E, 0xB3, 0x4F, 0xB3, 0x50, /* 0xD8-0xDB */ + 0xB3, 0x51, 0xB3, 0x52, 0xB3, 0x53, 0xB3, 0x54, /* 0xDC-0xDF */ + 0xC4, 0xED, 0xC4, 0xEE, 0xB3, 0x55, 0xB3, 0x56, /* 0xE0-0xE3 */ + 0xC4, 0xEF, 0xB3, 0x57, 0xB3, 0x58, 0xB3, 0x59, /* 0xE4-0xE7 */ + 0xC4, 0xF0, 0xB3, 0x5A, 0xB3, 0x61, 0xB3, 0x62, /* 0xE8-0xEB */ + 0xB3, 0x63, 0xB3, 0x64, 0xB3, 0x65, 0xB3, 0x66, /* 0xEC-0xEF */ + 0xC4, 0xF1, 0xC4, 0xF2, 0xB3, 0x67, 0xC4, 0xF3, /* 0xF0-0xF3 */ + 0xB3, 0x68, 0xC4, 0xF4, 0xB3, 0x69, 0xB3, 0x6A, /* 0xF4-0xF7 */ + 0xB3, 0x6B, 0xB3, 0x6C, 0xB3, 0x6D, 0xB3, 0x6E, /* 0xF8-0xFB */ + 0xC4, 0xF5, 0xB3, 0x6F, 0xB3, 0x70, 0xB3, 0x71, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D0[512] = { + 0xC4, 0xF6, 0xB3, 0x72, 0xB3, 0x73, 0xB3, 0x74, /* 0x00-0x03 */ + 0xC4, 0xF7, 0xB3, 0x75, 0xB3, 0x76, 0xB3, 0x77, /* 0x04-0x07 */ + 0xB3, 0x78, 0xB3, 0x79, 0xB3, 0x7A, 0xB3, 0x81, /* 0x08-0x0B */ + 0xB3, 0x82, 0xB3, 0x83, 0xB3, 0x84, 0xB3, 0x85, /* 0x0C-0x0F */ + 0xB3, 0x86, 0xC4, 0xF8, 0xB3, 0x87, 0xB3, 0x88, /* 0x10-0x13 */ + 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, 0xB3, 0x8C, /* 0x14-0x17 */ + 0xC4, 0xF9, 0xB3, 0x8D, 0xB3, 0x8E, 0xB3, 0x8F, /* 0x18-0x1B */ + 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, 0xB3, 0x93, /* 0x1C-0x1F */ + 0xB3, 0x94, 0xB3, 0x95, 0xB3, 0x96, 0xB3, 0x97, /* 0x20-0x23 */ + 0xB3, 0x98, 0xB3, 0x99, 0xB3, 0x9A, 0xB3, 0x9B, /* 0x24-0x27 */ + 0xB3, 0x9C, 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, /* 0x28-0x2B */ + 0xB3, 0xA0, 0xC4, 0xFA, 0xB4, 0x41, 0xB4, 0x42, /* 0x2C-0x2F */ + 0xB4, 0x43, 0xB4, 0x44, 0xB4, 0x45, 0xB4, 0x46, /* 0x30-0x33 */ + 0xC4, 0xFB, 0xC4, 0xFC, 0xB4, 0x47, 0xB4, 0x48, /* 0x34-0x37 */ + 0xC4, 0xFD, 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, /* 0x38-0x3B */ + 0xC4, 0xFE, 0xB4, 0x4C, 0xB4, 0x4D, 0xB4, 0x4E, /* 0x3C-0x3F */ + 0xB4, 0x4F, 0xB4, 0x50, 0xB4, 0x51, 0xB4, 0x52, /* 0x40-0x43 */ + 0xC5, 0xA1, 0xC5, 0xA2, 0xB4, 0x53, 0xC5, 0xA3, /* 0x44-0x47 */ + 0xB4, 0x54, 0xC5, 0xA4, 0xB4, 0x55, 0xB4, 0x56, /* 0x48-0x4B */ + 0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0x4C-0x4F */ + 0xC5, 0xA5, 0xB4, 0x61, 0xB4, 0x62, 0xB4, 0x63, /* 0x50-0x53 */ + 0xC5, 0xA6, 0xB4, 0x64, 0xB4, 0x65, 0xB4, 0x66, /* 0x54-0x57 */ + 0xC5, 0xA7, 0xB4, 0x67, 0xB4, 0x68, 0xB4, 0x69, /* 0x58-0x5B */ + 0xB4, 0x6A, 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, /* 0x5C-0x5F */ + 0xC5, 0xA8, 0xB4, 0x6E, 0xB4, 0x6F, 0xB4, 0x70, /* 0x60-0x63 */ + 0xB4, 0x71, 0xB4, 0x72, 0xB4, 0x73, 0xB4, 0x74, /* 0x64-0x67 */ + 0xB4, 0x75, 0xB4, 0x76, 0xB4, 0x77, 0xB4, 0x78, /* 0x68-0x6B */ + 0xC5, 0xA9, 0xC5, 0xAA, 0xB4, 0x79, 0xB4, 0x7A, /* 0x6C-0x6F */ + 0xC5, 0xAB, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0x70-0x73 */ + 0xC5, 0xAC, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0x74-0x77 */ + 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, 0xB4, 0x8A, /* 0x78-0x7B */ + 0xC5, 0xAD, 0xC5, 0xAE, 0xB4, 0x8B, 0xB4, 0x8C, /* 0x7C-0x7F */ + + 0xB4, 0x8D, 0xC5, 0xAF, 0xB4, 0x8E, 0xB4, 0x8F, /* 0x80-0x83 */ + 0xB4, 0x90, 0xB4, 0x91, 0xB4, 0x92, 0xB4, 0x93, /* 0x84-0x87 */ + 0xB4, 0x94, 0xB4, 0x95, 0xB4, 0x96, 0xB4, 0x97, /* 0x88-0x8B */ + 0xB4, 0x98, 0xB4, 0x99, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x8C-0x8F */ + 0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x90-0x93 */ + 0xB4, 0xA0, 0xB5, 0x41, 0xB5, 0x42, 0xB5, 0x43, /* 0x94-0x97 */ + 0xB5, 0x44, 0xB5, 0x45, 0xB5, 0x46, 0xB5, 0x47, /* 0x98-0x9B */ + 0xB5, 0x48, 0xB5, 0x49, 0xB5, 0x4A, 0xB5, 0x4B, /* 0x9C-0x9F */ + 
0xB5, 0x4C, 0xB5, 0x4D, 0xB5, 0x4E, 0xB5, 0x4F, /* 0xA0-0xA3 */ + 0xC5, 0xB0, 0xC5, 0xB1, 0xB5, 0x50, 0xB5, 0x51, /* 0xA4-0xA7 */ + 0xC5, 0xB2, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0xA8-0xAB */ + 0xC5, 0xB3, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0xAC-0xAF */ + 0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x61, /* 0xB0-0xB3 */ + 0xC5, 0xB4, 0xC5, 0xB5, 0xB5, 0x62, 0xC5, 0xB6, /* 0xB4-0xB7 */ + 0xB5, 0x63, 0xC5, 0xB7, 0xB5, 0x64, 0xB5, 0x65, /* 0xB8-0xBB */ + 0xB5, 0x66, 0xB5, 0x67, 0xB5, 0x68, 0xB5, 0x69, /* 0xBC-0xBF */ + 0xC5, 0xB8, 0xC5, 0xB9, 0xB5, 0x6A, 0xB5, 0x6B, /* 0xC0-0xC3 */ + 0xC5, 0xBA, 0xB5, 0x6C, 0xB5, 0x6D, 0xB5, 0x6E, /* 0xC4-0xC7 */ + 0xC5, 0xBB, 0xC5, 0xBC, 0xB5, 0x6F, 0xB5, 0x70, /* 0xC8-0xCB */ + 0xB5, 0x71, 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, /* 0xCC-0xCF */ + 0xC5, 0xBD, 0xC5, 0xBE, 0xB5, 0x75, 0xC5, 0xBF, /* 0xD0-0xD3 */ + 0xC5, 0xC0, 0xC5, 0xC1, 0xB5, 0x76, 0xB5, 0x77, /* 0xD4-0xD7 */ + 0xB5, 0x78, 0xB5, 0x79, 0xB5, 0x7A, 0xB5, 0x81, /* 0xD8-0xDB */ + 0xC5, 0xC2, 0xC5, 0xC3, 0xB5, 0x82, 0xB5, 0x83, /* 0xDC-0xDF */ + 0xC5, 0xC4, 0xB5, 0x84, 0xB5, 0x85, 0xB5, 0x86, /* 0xE0-0xE3 */ + 0xC5, 0xC5, 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, /* 0xE4-0xE7 */ + 0xB5, 0x8A, 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, /* 0xE8-0xEB */ + 0xC5, 0xC6, 0xC5, 0xC7, 0xB5, 0x8E, 0xC5, 0xC8, /* 0xEC-0xEF */ + 0xC5, 0xC9, 0xC5, 0xCA, 0xB5, 0x8F, 0xB5, 0x90, /* 0xF0-0xF3 */ + 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, 0xB5, 0x94, /* 0xF4-0xF7 */ + 0xC5, 0xCB, 0xB5, 0x95, 0xB5, 0x96, 0xB5, 0x97, /* 0xF8-0xFB */ + 0xB5, 0x98, 0xB5, 0x99, 0xB5, 0x9A, 0xB5, 0x9B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D1[512] = { + 0xB5, 0x9C, 0xB5, 0x9D, 0xB5, 0x9E, 0xB5, 0x9F, /* 0x00-0x03 */ + 0xB5, 0xA0, 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, /* 0x04-0x07 */ + 0xB6, 0x44, 0xB6, 0x45, 0xB6, 0x46, 0xB6, 0x47, /* 0x08-0x0B */ + 0xB6, 0x48, 0xC5, 0xCC, 0xB6, 0x49, 0xB6, 0x4A, /* 0x0C-0x0F */ + 0xB6, 0x4B, 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, /* 0x10-0x13 */ + 0xB6, 0x4F, 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, /* 0x14-0x17 */ + 0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0x18-0x1B */ + 0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0x1C-0x1F */ + 0xB6, 0x61, 0xB6, 0x62, 0xB6, 0x63, 0xB6, 0x64, /* 0x20-0x23 */ + 0xB6, 0x65, 0xB6, 0x66, 0xB6, 0x67, 0xB6, 0x68, /* 0x24-0x27 */ + 0xB6, 0x69, 0xB6, 0x6A, 0xB6, 0x6B, 0xB6, 0x6C, /* 0x28-0x2B */ + 0xB6, 0x6D, 0xB6, 0x6E, 0xB6, 0x6F, 0xB6, 0x70, /* 0x2C-0x2F */ + 0xC5, 0xCD, 0xC5, 0xCE, 0xB6, 0x71, 0xB6, 0x72, /* 0x30-0x33 */ + 0xC5, 0xCF, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0x34-0x37 */ + 0xC5, 0xD0, 0xB6, 0x76, 0xC5, 0xD1, 0xB6, 0x77, /* 0x38-0x3B */ + 0xB6, 0x78, 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x81, /* 0x3C-0x3F */ + 0xC5, 0xD2, 0xC5, 0xD3, 0xB6, 0x82, 0xC5, 0xD4, /* 0x40-0x43 */ + 0xC5, 0xD5, 0xC5, 0xD6, 0xB6, 0x83, 0xB6, 0x84, /* 0x44-0x47 */ + 0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0x48-0x4B */ + 0xC5, 0xD7, 0xC5, 0xD8, 0xB6, 0x89, 0xB6, 0x8A, /* 0x4C-0x4F */ + 0xC5, 0xD9, 0xB6, 0x8B, 0xB6, 0x8C, 0xB6, 0x8D, /* 0x50-0x53 */ + 0xC5, 0xDA, 0xB6, 0x8E, 0xB6, 0x8F, 0xB6, 0x90, /* 0x54-0x57 */ + 0xB6, 0x91, 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, /* 0x58-0x5B */ + 0xC5, 0xDB, 0xC5, 0xDC, 0xB6, 0x95, 0xC5, 0xDD, /* 0x5C-0x5F */ + 0xB6, 0x96, 0xC5, 0xDE, 0xB6, 0x97, 0xB6, 0x98, /* 0x60-0x63 */ + 0xB6, 0x99, 0xB6, 0x9A, 0xB6, 0x9B, 0xB6, 0x9C, /* 0x64-0x67 */ + 0xC5, 0xDF, 0xB6, 0x9D, 0xB6, 0x9E, 0xB6, 0x9F, /* 0x68-0x6B */ + 0xC5, 0xE0, 0xB6, 0xA0, 0xB7, 0x41, 0xB7, 0x42, /* 0x6C-0x6F */ + 0xB7, 0x43, 0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, /* 0x70-0x73 */ + 0xB7, 
0x47, 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, /* 0x74-0x77 */ + 0xB7, 0x4B, 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, /* 0x78-0x7B */ + 0xC5, 0xE1, 0xB7, 0x4F, 0xB7, 0x50, 0xB7, 0x51, /* 0x7C-0x7F */ + + 0xB7, 0x52, 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, /* 0x80-0x83 */ + 0xC5, 0xE2, 0xB7, 0x56, 0xB7, 0x57, 0xB7, 0x58, /* 0x84-0x87 */ + 0xC5, 0xE3, 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x61, /* 0x88-0x8B */ + 0xB7, 0x62, 0xB7, 0x63, 0xB7, 0x64, 0xB7, 0x65, /* 0x8C-0x8F */ + 0xB7, 0x66, 0xB7, 0x67, 0xB7, 0x68, 0xB7, 0x69, /* 0x90-0x93 */ + 0xB7, 0x6A, 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, /* 0x94-0x97 */ + 0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x98-0x9B */ + 0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x9C-0x9F */ + 0xC5, 0xE4, 0xC5, 0xE5, 0xB7, 0x76, 0xB7, 0x77, /* 0xA0-0xA3 */ + 0xC5, 0xE6, 0xB7, 0x78, 0xB7, 0x79, 0xB7, 0x7A, /* 0xA4-0xA7 */ + 0xC5, 0xE7, 0xB7, 0x81, 0xB7, 0x82, 0xB7, 0x83, /* 0xA8-0xAB */ + 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, 0xB7, 0x87, /* 0xAC-0xAF */ + 0xC5, 0xE8, 0xC5, 0xE9, 0xB7, 0x88, 0xC5, 0xEA, /* 0xB0-0xB3 */ + 0xB7, 0x89, 0xC5, 0xEB, 0xB7, 0x8A, 0xB7, 0x8B, /* 0xB4-0xB7 */ + 0xB7, 0x8C, 0xB7, 0x8D, 0xC5, 0xEC, 0xB7, 0x8E, /* 0xB8-0xBB */ + 0xC5, 0xED, 0xB7, 0x8F, 0xB7, 0x90, 0xB7, 0x91, /* 0xBC-0xBF */ + 0xC5, 0xEE, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0xC0-0xC3 */ + 0xB7, 0x95, 0xB7, 0x96, 0xB7, 0x97, 0xB7, 0x98, /* 0xC4-0xC7 */ + 0xB7, 0x99, 0xB7, 0x9A, 0xB7, 0x9B, 0xB7, 0x9C, /* 0xC8-0xCB */ + 0xB7, 0x9D, 0xB7, 0x9E, 0xB7, 0x9F, 0xB7, 0xA0, /* 0xCC-0xCF */ + 0xB8, 0x41, 0xB8, 0x42, 0xB8, 0x43, 0xB8, 0x44, /* 0xD0-0xD3 */ + 0xB8, 0x45, 0xB8, 0x46, 0xB8, 0x47, 0xB8, 0x48, /* 0xD4-0xD7 */ + 0xC5, 0xEF, 0xB8, 0x49, 0xB8, 0x4A, 0xB8, 0x4B, /* 0xD8-0xDB */ + 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, 0xB8, 0x4F, /* 0xDC-0xDF */ + 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, 0xB8, 0x53, /* 0xE0-0xE3 */ + 0xB8, 0x54, 0xB8, 0x55, 0xB8, 0x56, 0xB8, 0x57, /* 0xE4-0xE7 */ + 0xB8, 0x58, 0xB8, 0x59, 0xB8, 0x5A, 0xB8, 0x61, /* 0xE8-0xEB */ + 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, 0xB8, 0x65, /* 0xEC-0xEF */ + 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, 0xB8, 0x69, /* 0xF0-0xF3 */ + 0xC5, 0xF0, 0xB8, 0x6A, 0xB8, 0x6B, 0xB8, 0x6C, /* 0xF4-0xF7 */ + 0xC5, 0xF1, 0xB8, 0x6D, 0xB8, 0x6E, 0xB8, 0x6F, /* 0xF8-0xFB */ + 0xB8, 0x70, 0xB8, 0x71, 0xB8, 0x72, 0xB8, 0x73, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D2[512] = { + 0xB8, 0x74, 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, /* 0x00-0x03 */ + 0xB8, 0x78, 0xB8, 0x79, 0xB8, 0x7A, 0xC5, 0xF2, /* 0x04-0x07 */ + 0xB8, 0x81, 0xC5, 0xF3, 0xB8, 0x82, 0xB8, 0x83, /* 0x08-0x0B */ + 0xB8, 0x84, 0xB8, 0x85, 0xB8, 0x86, 0xB8, 0x87, /* 0x0C-0x0F */ + 0xC5, 0xF4, 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, /* 0x10-0x13 */ + 0xB8, 0x8B, 0xB8, 0x8C, 0xB8, 0x8D, 0xB8, 0x8E, /* 0x14-0x17 */ + 0xB8, 0x8F, 0xB8, 0x90, 0xB8, 0x91, 0xB8, 0x92, /* 0x18-0x1B */ + 0xB8, 0x93, 0xB8, 0x94, 0xB8, 0x95, 0xB8, 0x96, /* 0x1C-0x1F */ + 0xB8, 0x97, 0xB8, 0x98, 0xB8, 0x99, 0xB8, 0x9A, /* 0x20-0x23 */ + 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, 0xB8, 0x9E, /* 0x24-0x27 */ + 0xB8, 0x9F, 0xB8, 0xA0, 0xB9, 0x41, 0xB9, 0x42, /* 0x28-0x2B */ + 0xC5, 0xF5, 0xC5, 0xF6, 0xB9, 0x43, 0xB9, 0x44, /* 0x2C-0x2F */ + 0xC5, 0xF7, 0xB9, 0x45, 0xB9, 0x46, 0xB9, 0x47, /* 0x30-0x33 */ + 0xC5, 0xF8, 0xB9, 0x48, 0xB9, 0x49, 0xB9, 0x4A, /* 0x34-0x37 */ + 0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x38-0x3B */ + 0xC5, 0xF9, 0xC5, 0xFA, 0xB9, 0x4F, 0xC5, 0xFB, /* 0x3C-0x3F */ + 0xB9, 0x50, 0xC5, 0xFC, 0xB9, 0x51, 0xB9, 0x52, /* 0x40-0x43 */ + 0xB9, 0x53, 0xB9, 0x54, 0xB9, 0x55, 0xB9, 0x56, /* 0x44-0x47 */ + 0xC5, 
0xFD, 0xB9, 0x57, 0xB9, 0x58, 0xB9, 0x59, /* 0x48-0x4B */ + 0xB9, 0x5A, 0xB9, 0x61, 0xB9, 0x62, 0xB9, 0x63, /* 0x4C-0x4F */ + 0xB9, 0x64, 0xB9, 0x65, 0xB9, 0x66, 0xB9, 0x67, /* 0x50-0x53 */ + 0xB9, 0x68, 0xB9, 0x69, 0xB9, 0x6A, 0xB9, 0x6B, /* 0x54-0x57 */ + 0xB9, 0x6C, 0xB9, 0x6D, 0xB9, 0x6E, 0xB9, 0x6F, /* 0x58-0x5B */ + 0xC5, 0xFE, 0xB9, 0x70, 0xB9, 0x71, 0xB9, 0x72, /* 0x5C-0x5F */ + 0xB9, 0x73, 0xB9, 0x74, 0xB9, 0x75, 0xB9, 0x76, /* 0x60-0x63 */ + 0xC6, 0xA1, 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x79, /* 0x64-0x67 */ + 0xB9, 0x7A, 0xB9, 0x81, 0xB9, 0x82, 0xB9, 0x83, /* 0x68-0x6B */ + 0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x6C-0x6F */ + 0xB9, 0x88, 0xB9, 0x89, 0xB9, 0x8A, 0xB9, 0x8B, /* 0x70-0x73 */ + 0xB9, 0x8C, 0xB9, 0x8D, 0xB9, 0x8E, 0xB9, 0x8F, /* 0x74-0x77 */ + 0xB9, 0x90, 0xB9, 0x91, 0xB9, 0x92, 0xB9, 0x93, /* 0x78-0x7B */ + 0xB9, 0x94, 0xB9, 0x95, 0xB9, 0x96, 0xB9, 0x97, /* 0x7C-0x7F */ + + 0xC6, 0xA2, 0xC6, 0xA3, 0xB9, 0x98, 0xB9, 0x99, /* 0x80-0x83 */ + 0xC6, 0xA4, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0x84-0x87 */ + 0xC6, 0xA5, 0xB9, 0x9D, 0xB9, 0x9E, 0xB9, 0x9F, /* 0x88-0x8B */ + 0xB9, 0xA0, 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, /* 0x8C-0x8F */ + 0xC6, 0xA6, 0xC6, 0xA7, 0xBA, 0x44, 0xBA, 0x45, /* 0x90-0x93 */ + 0xBA, 0x46, 0xC6, 0xA8, 0xBA, 0x47, 0xBA, 0x48, /* 0x94-0x97 */ + 0xBA, 0x49, 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, /* 0x98-0x9B */ + 0xC6, 0xA9, 0xBA, 0x4D, 0xBA, 0x4E, 0xBA, 0x4F, /* 0x9C-0x9F */ + 0xC6, 0xAA, 0xBA, 0x50, 0xBA, 0x51, 0xBA, 0x52, /* 0xA0-0xA3 */ + 0xC6, 0xAB, 0xBA, 0x53, 0xBA, 0x54, 0xBA, 0x55, /* 0xA4-0xA7 */ + 0xBA, 0x56, 0xBA, 0x57, 0xBA, 0x58, 0xBA, 0x59, /* 0xA8-0xAB */ + 0xC6, 0xAC, 0xBA, 0x5A, 0xBA, 0x61, 0xBA, 0x62, /* 0xAC-0xAF */ + 0xBA, 0x63, 0xC6, 0xAD, 0xBA, 0x64, 0xBA, 0x65, /* 0xB0-0xB3 */ + 0xBA, 0x66, 0xBA, 0x67, 0xBA, 0x68, 0xBA, 0x69, /* 0xB4-0xB7 */ + 0xC6, 0xAE, 0xC6, 0xAF, 0xBA, 0x6A, 0xBA, 0x6B, /* 0xB8-0xBB */ + 0xC6, 0xB0, 0xBA, 0x6C, 0xBA, 0x6D, 0xC6, 0xB1, /* 0xBC-0xBF */ + 0xC6, 0xB2, 0xBA, 0x6E, 0xC6, 0xB3, 0xBA, 0x6F, /* 0xC0-0xC3 */ + 0xBA, 0x70, 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, /* 0xC4-0xC7 */ + 0xC6, 0xB4, 0xC6, 0xB5, 0xBA, 0x74, 0xC6, 0xB6, /* 0xC8-0xCB */ + 0xBA, 0x75, 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, /* 0xCC-0xCF */ + 0xBA, 0x79, 0xBA, 0x7A, 0xBA, 0x81, 0xBA, 0x82, /* 0xD0-0xD3 */ + 0xC6, 0xB7, 0xBA, 0x83, 0xBA, 0x84, 0xBA, 0x85, /* 0xD4-0xD7 */ + 0xC6, 0xB8, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0xD8-0xDB */ + 0xC6, 0xB9, 0xBA, 0x89, 0xBA, 0x8A, 0xBA, 0x8B, /* 0xDC-0xDF */ + 0xBA, 0x8C, 0xBA, 0x8D, 0xBA, 0x8E, 0xBA, 0x8F, /* 0xE0-0xE3 */ + 0xC6, 0xBA, 0xC6, 0xBB, 0xBA, 0x90, 0xBA, 0x91, /* 0xE4-0xE7 */ + 0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0xE8-0xEB */ + 0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0xEC-0xEF */ + 0xC6, 0xBC, 0xC6, 0xBD, 0xBA, 0x9A, 0xBA, 0x9B, /* 0xF0-0xF3 */ + 0xC6, 0xBE, 0xBA, 0x9C, 0xBA, 0x9D, 0xBA, 0x9E, /* 0xF4-0xF7 */ + 0xC6, 0xBF, 0xBA, 0x9F, 0xBA, 0xA0, 0xBB, 0x41, /* 0xF8-0xFB */ + 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, 0xBB, 0x45, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D3[512] = { + 0xC6, 0xC0, 0xC6, 0xC1, 0xBB, 0x46, 0xC6, 0xC2, /* 0x00-0x03 */ + 0xBB, 0x47, 0xC6, 0xC3, 0xBB, 0x48, 0xBB, 0x49, /* 0x04-0x07 */ + 0xBB, 0x4A, 0xBB, 0x4B, 0xBB, 0x4C, 0xBB, 0x4D, /* 0x08-0x0B */ + 0xC6, 0xC4, 0xC6, 0xC5, 0xC6, 0xC6, 0xBB, 0x4E, /* 0x0C-0x0F */ + 0xC6, 0xC7, 0xBB, 0x4F, 0xBB, 0x50, 0xBB, 0x51, /* 0x10-0x13 */ + 0xC6, 0xC8, 0xBB, 0x52, 0xC6, 0xC9, 0xBB, 0x53, /* 0x14-0x17 */ + 0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x18-0x1B */ + 0xC6, 
0xCA, 0xC6, 0xCB, 0xBB, 0x58, 0xC6, 0xCC, /* 0x1C-0x1F */ + 0xC6, 0xCD, 0xC6, 0xCE, 0xBB, 0x59, 0xBB, 0x5A, /* 0x20-0x23 */ + 0xBB, 0x61, 0xC6, 0xCF, 0xBB, 0x62, 0xBB, 0x63, /* 0x24-0x27 */ + 0xC6, 0xD0, 0xC6, 0xD1, 0xBB, 0x64, 0xBB, 0x65, /* 0x28-0x2B */ + 0xC6, 0xD2, 0xBB, 0x66, 0xBB, 0x67, 0xBB, 0x68, /* 0x2C-0x2F */ + 0xC6, 0xD3, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x30-0x33 */ + 0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xBB, 0x6F, /* 0x34-0x37 */ + 0xC6, 0xD4, 0xC6, 0xD5, 0xBB, 0x70, 0xC6, 0xD6, /* 0x38-0x3B */ + 0xC6, 0xD7, 0xC6, 0xD8, 0xBB, 0x71, 0xBB, 0x72, /* 0x3C-0x3F */ + 0xBB, 0x73, 0xBB, 0x74, 0xBB, 0x75, 0xBB, 0x76, /* 0x40-0x43 */ + 0xC6, 0xD9, 0xC6, 0xDA, 0xBB, 0x77, 0xBB, 0x78, /* 0x44-0x47 */ + 0xBB, 0x79, 0xBB, 0x7A, 0xBB, 0x81, 0xBB, 0x82, /* 0x48-0x4B */ + 0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x4C-0x4F */ + 0xBB, 0x87, 0xBB, 0x88, 0xBB, 0x89, 0xBB, 0x8A, /* 0x50-0x53 */ + 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, 0xBB, 0x8E, /* 0x54-0x57 */ + 0xBB, 0x8F, 0xBB, 0x90, 0xBB, 0x91, 0xBB, 0x92, /* 0x58-0x5B */ + 0xBB, 0x93, 0xBB, 0x94, 0xBB, 0x95, 0xBB, 0x96, /* 0x5C-0x5F */ + 0xBB, 0x97, 0xBB, 0x98, 0xBB, 0x99, 0xBB, 0x9A, /* 0x60-0x63 */ + 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, 0xBB, 0x9E, /* 0x64-0x67 */ + 0xBB, 0x9F, 0xBB, 0xA0, 0xBC, 0x41, 0xBC, 0x42, /* 0x68-0x6B */ + 0xBC, 0x43, 0xBC, 0x44, 0xBC, 0x45, 0xBC, 0x46, /* 0x6C-0x6F */ + 0xBC, 0x47, 0xBC, 0x48, 0xBC, 0x49, 0xBC, 0x4A, /* 0x70-0x73 */ + 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, 0xBC, 0x4E, /* 0x74-0x77 */ + 0xBC, 0x4F, 0xBC, 0x50, 0xBC, 0x51, 0xBC, 0x52, /* 0x78-0x7B */ + 0xC6, 0xDB, 0xC6, 0xDC, 0xBC, 0x53, 0xBC, 0x54, /* 0x7C-0x7F */ + + 0xC6, 0xDD, 0xBC, 0x55, 0xBC, 0x56, 0xBC, 0x57, /* 0x80-0x83 */ + 0xC6, 0xDE, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0x84-0x87 */ + 0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0x88-0x8B */ + 0xC6, 0xDF, 0xC6, 0xE0, 0xBC, 0x65, 0xC6, 0xE1, /* 0x8C-0x8F */ + 0xC6, 0xE2, 0xC6, 0xE3, 0xBC, 0x66, 0xBC, 0x67, /* 0x90-0x93 */ + 0xBC, 0x68, 0xBC, 0x69, 0xBC, 0x6A, 0xBC, 0x6B, /* 0x94-0x97 */ + 0xC6, 0xE4, 0xC6, 0xE5, 0xBC, 0x6C, 0xBC, 0x6D, /* 0x98-0x9B */ + 0xC6, 0xE6, 0xBC, 0x6E, 0xBC, 0x6F, 0xBC, 0x70, /* 0x9C-0x9F */ + 0xC6, 0xE7, 0xBC, 0x71, 0xBC, 0x72, 0xBC, 0x73, /* 0xA0-0xA3 */ + 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, 0xBC, 0x77, /* 0xA4-0xA7 */ + 0xC6, 0xE8, 0xC6, 0xE9, 0xBC, 0x78, 0xC6, 0xEA, /* 0xA8-0xAB */ + 0xBC, 0x79, 0xC6, 0xEB, 0xBC, 0x7A, 0xBC, 0x81, /* 0xAC-0xAF */ + 0xBC, 0x82, 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, /* 0xB0-0xB3 */ + 0xC6, 0xEC, 0xBC, 0x86, 0xBC, 0x87, 0xBC, 0x88, /* 0xB4-0xB7 */ + 0xC6, 0xED, 0xBC, 0x89, 0xBC, 0x8A, 0xBC, 0x8B, /* 0xB8-0xBB */ + 0xC6, 0xEE, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0xBC-0xBF */ + 0xBC, 0x8F, 0xBC, 0x90, 0xBC, 0x91, 0xBC, 0x92, /* 0xC0-0xC3 */ + 0xC6, 0xEF, 0xC6, 0xF0, 0xBC, 0x93, 0xBC, 0x94, /* 0xC4-0xC7 */ + 0xC6, 0xF1, 0xC6, 0xF2, 0xBC, 0x95, 0xBC, 0x96, /* 0xC8-0xCB */ + 0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xBC, 0x9A, /* 0xCC-0xCF */ + 0xC6, 0xF3, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0xD0-0xD3 */ + 0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x41, /* 0xD4-0xD7 */ + 0xC6, 0xF4, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0xD8-0xDB */ + 0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0xDC-0xDF */ + 0xBD, 0x49, 0xC6, 0xF5, 0xBD, 0x4A, 0xC6, 0xF6, /* 0xE0-0xE3 */ + 0xBD, 0x4B, 0xBD, 0x4C, 0xBD, 0x4D, 0xBD, 0x4E, /* 0xE4-0xE7 */ + 0xBD, 0x4F, 0xBD, 0x50, 0xBD, 0x51, 0xBD, 0x52, /* 0xE8-0xEB */ + 0xC6, 0xF7, 0xC6, 0xF8, 0xBD, 0x53, 0xBD, 0x54, /* 0xEC-0xEF */ + 0xC6, 0xF9, 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, /* 0xF0-0xF3 
*/ + 0xC6, 0xFA, 0xBD, 0x58, 0xBD, 0x59, 0xBD, 0x5A, /* 0xF4-0xF7 */ + 0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0xF8-0xFB */ + 0xC6, 0xFB, 0xC6, 0xFC, 0xBD, 0x65, 0xC6, 0xFD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D4[512] = { + 0xBD, 0x66, 0xC6, 0xFE, 0xBD, 0x67, 0xBD, 0x68, /* 0x00-0x03 */ + 0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x04-0x07 */ + 0xC7, 0xA1, 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, /* 0x08-0x0B */ + 0xBD, 0x70, 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, /* 0x0C-0x0F */ + 0xBD, 0x74, 0xBD, 0x75, 0xBD, 0x76, 0xBD, 0x77, /* 0x10-0x13 */ + 0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x81, /* 0x14-0x17 */ + 0xBD, 0x82, 0xBD, 0x83, 0xBD, 0x84, 0xBD, 0x85, /* 0x18-0x1B */ + 0xBD, 0x86, 0xC7, 0xA2, 0xBD, 0x87, 0xBD, 0x88, /* 0x1C-0x1F */ + 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, 0xBD, 0x8C, /* 0x20-0x23 */ + 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, 0xBD, 0x90, /* 0x24-0x27 */ + 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, 0xBD, 0x94, /* 0x28-0x2B */ + 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, 0xBD, 0x98, /* 0x2C-0x2F */ + 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, 0xBD, 0x9C, /* 0x30-0x33 */ + 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, 0xBD, 0xA0, /* 0x34-0x37 */ + 0xBE, 0x41, 0xBE, 0x42, 0xBE, 0x43, 0xBE, 0x44, /* 0x38-0x3B */ + 0xBE, 0x45, 0xBE, 0x46, 0xBE, 0x47, 0xBE, 0x48, /* 0x3C-0x3F */ + 0xC7, 0xA3, 0xBE, 0x49, 0xBE, 0x4A, 0xBE, 0x4B, /* 0x40-0x43 */ + 0xC7, 0xA4, 0xBE, 0x4C, 0xBE, 0x4D, 0xBE, 0x4E, /* 0x44-0x47 */ + 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, 0xBE, 0x52, /* 0x48-0x4B */ + 0xBE, 0x53, 0xBE, 0x54, 0xBE, 0x55, 0xBE, 0x56, /* 0x4C-0x4F */ + 0xBE, 0x57, 0xBE, 0x58, 0xBE, 0x59, 0xBE, 0x5A, /* 0x50-0x53 */ + 0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0x54-0x57 */ + 0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0x58-0x5B */ + 0xC7, 0xA5, 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, /* 0x5C-0x5F */ + 0xC7, 0xA6, 0xBE, 0x6C, 0xBE, 0x6D, 0xBE, 0x6E, /* 0x60-0x63 */ + 0xC7, 0xA7, 0xBE, 0x6F, 0xBE, 0x70, 0xBE, 0x71, /* 0x64-0x67 */ + 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, 0xBE, 0x75, /* 0x68-0x6B */ + 0xBE, 0x76, 0xC7, 0xA8, 0xBE, 0x77, 0xC7, 0xA9, /* 0x6C-0x6F */ + 0xBE, 0x78, 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x81, /* 0x70-0x73 */ + 0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0x74-0x77 */ + 0xC7, 0xAA, 0xC7, 0xAB, 0xBE, 0x86, 0xBE, 0x87, /* 0x78-0x7B */ + 0xC7, 0xAC, 0xBE, 0x88, 0xBE, 0x89, 0xC7, 0xAD, /* 0x7C-0x7F */ + + 0xC7, 0xAE, 0xBE, 0x8A, 0xC7, 0xAF, 0xBE, 0x8B, /* 0x80-0x83 */ + 0xBE, 0x8C, 0xBE, 0x8D, 0xBE, 0x8E, 0xBE, 0x8F, /* 0x84-0x87 */ + 0xC7, 0xB0, 0xC7, 0xB1, 0xBE, 0x90, 0xC7, 0xB2, /* 0x88-0x8B */ + 0xBE, 0x91, 0xC7, 0xB3, 0xBE, 0x92, 0xBE, 0x93, /* 0x8C-0x8F */ + 0xBE, 0x94, 0xBE, 0x95, 0xBE, 0x96, 0xBE, 0x97, /* 0x90-0x93 */ + 0xC7, 0xB4, 0xBE, 0x98, 0xBE, 0x99, 0xBE, 0x9A, /* 0x94-0x97 */ + 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, 0xBE, 0x9E, /* 0x98-0x9B */ + 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x41, 0xBF, 0x42, /* 0x9C-0x9F */ + 0xBF, 0x43, 0xBF, 0x44, 0xBF, 0x45, 0xBF, 0x46, /* 0xA0-0xA3 */ + 0xBF, 0x47, 0xBF, 0x48, 0xBF, 0x49, 0xBF, 0x4A, /* 0xA4-0xA7 */ + 0xBF, 0x4B, 0xC7, 0xB5, 0xBF, 0x4C, 0xBF, 0x4D, /* 0xA8-0xAB */ + 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, 0xBF, 0x51, /* 0xAC-0xAF */ + 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, 0xBF, 0x55, /* 0xB0-0xB3 */ + 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, 0xBF, 0x59, /* 0xB4-0xB7 */ + 0xBF, 0x5A, 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, /* 0xB8-0xBB */ + 0xBF, 0x64, 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, /* 0xBC-0xBF */ + 0xBF, 0x68, 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, /* 0xC0-0xC3 */ + 0xBF, 0x6C, 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, /* 0xC4-0xC7 */ + 
0xBF, 0x70, 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, /* 0xC8-0xCB */ + 0xC7, 0xB6, 0xBF, 0x74, 0xBF, 0x75, 0xBF, 0x76, /* 0xCC-0xCF */ + 0xC7, 0xB7, 0xBF, 0x77, 0xBF, 0x78, 0xBF, 0x79, /* 0xD0-0xD3 */ + 0xC7, 0xB8, 0xBF, 0x7A, 0xBF, 0x81, 0xBF, 0x82, /* 0xD4-0xD7 */ + 0xBF, 0x83, 0xBF, 0x84, 0xBF, 0x85, 0xBF, 0x86, /* 0xD8-0xDB */ + 0xC7, 0xB9, 0xBF, 0x87, 0xBF, 0x88, 0xC7, 0xBA, /* 0xDC-0xDF */ + 0xBF, 0x89, 0xBF, 0x8A, 0xBF, 0x8B, 0xBF, 0x8C, /* 0xE0-0xE3 */ + 0xBF, 0x8D, 0xBF, 0x8E, 0xBF, 0x8F, 0xBF, 0x90, /* 0xE4-0xE7 */ + 0xC7, 0xBB, 0xBF, 0x91, 0xBF, 0x92, 0xBF, 0x93, /* 0xE8-0xEB */ + 0xC7, 0xBC, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0xEC-0xEF */ + 0xC7, 0xBD, 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, /* 0xF0-0xF3 */ + 0xBF, 0x9A, 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, /* 0xF4-0xF7 */ + 0xC7, 0xBE, 0xBF, 0x9E, 0xBF, 0x9F, 0xC7, 0xBF, /* 0xF8-0xFB */ + 0xBF, 0xA0, 0xC7, 0xC0, 0xC0, 0x41, 0xC0, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D5[512] = { + 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, 0xC0, 0x46, /* 0x00-0x03 */ + 0xC7, 0xC1, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x04-0x07 */ + 0xC7, 0xC2, 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, /* 0x08-0x0B */ + 0xC7, 0xC3, 0xC0, 0x4D, 0xC0, 0x4E, 0xC0, 0x4F, /* 0x0C-0x0F */ + 0xC0, 0x50, 0xC0, 0x51, 0xC0, 0x52, 0xC0, 0x53, /* 0x10-0x13 */ + 0xC7, 0xC4, 0xC7, 0xC5, 0xC0, 0x54, 0xC7, 0xC6, /* 0x14-0x17 */ + 0xC0, 0x55, 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, /* 0x18-0x1B */ + 0xC0, 0x59, 0xC0, 0x5A, 0xC0, 0x61, 0xC0, 0x62, /* 0x1C-0x1F */ + 0xC0, 0x63, 0xC0, 0x64, 0xC0, 0x65, 0xC0, 0x66, /* 0x20-0x23 */ + 0xC0, 0x67, 0xC0, 0x68, 0xC0, 0x69, 0xC0, 0x6A, /* 0x24-0x27 */ + 0xC0, 0x6B, 0xC0, 0x6C, 0xC0, 0x6D, 0xC0, 0x6E, /* 0x28-0x2B */ + 0xC0, 0x6F, 0xC0, 0x70, 0xC0, 0x71, 0xC0, 0x72, /* 0x2C-0x2F */ + 0xC0, 0x73, 0xC0, 0x74, 0xC0, 0x75, 0xC0, 0x76, /* 0x30-0x33 */ + 0xC0, 0x77, 0xC0, 0x78, 0xC0, 0x79, 0xC0, 0x7A, /* 0x34-0x37 */ + 0xC0, 0x81, 0xC0, 0x82, 0xC0, 0x83, 0xC0, 0x84, /* 0x38-0x3B */ + 0xC7, 0xC7, 0xC7, 0xC8, 0xC0, 0x85, 0xC0, 0x86, /* 0x3C-0x3F */ + 0xC7, 0xC9, 0xC0, 0x87, 0xC0, 0x88, 0xC0, 0x89, /* 0x40-0x43 */ + 0xC7, 0xCA, 0xC0, 0x8A, 0xC0, 0x8B, 0xC0, 0x8C, /* 0x44-0x47 */ + 0xC0, 0x8D, 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, /* 0x48-0x4B */ + 0xC7, 0xCB, 0xC7, 0xCC, 0xC0, 0x91, 0xC7, 0xCD, /* 0x4C-0x4F */ + 0xC0, 0x92, 0xC7, 0xCE, 0xC0, 0x93, 0xC0, 0x94, /* 0x50-0x53 */ + 0xC0, 0x95, 0xC0, 0x96, 0xC0, 0x97, 0xC0, 0x98, /* 0x54-0x57 */ + 0xC7, 0xCF, 0xC7, 0xD0, 0xC0, 0x99, 0xC0, 0x9A, /* 0x58-0x5B */ + 0xC7, 0xD1, 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, /* 0x5C-0x5F */ + 0xC7, 0xD2, 0xC0, 0x9E, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x60-0x63 */ + 0xC1, 0x41, 0xC7, 0xD3, 0xC1, 0x42, 0xC1, 0x43, /* 0x64-0x67 */ + 0xC7, 0xD4, 0xC7, 0xD5, 0xC1, 0x44, 0xC7, 0xD6, /* 0x68-0x6B */ + 0xC1, 0x45, 0xC7, 0xD7, 0xC1, 0x46, 0xC1, 0x47, /* 0x6C-0x6F */ + 0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x70-0x73 */ + 0xC7, 0xD8, 0xC7, 0xD9, 0xC1, 0x4C, 0xC1, 0x4D, /* 0x74-0x77 */ + 0xC7, 0xDA, 0xC1, 0x4E, 0xC1, 0x4F, 0xC1, 0x50, /* 0x78-0x7B */ + 0xC7, 0xDB, 0xC1, 0x51, 0xC1, 0x52, 0xC1, 0x53, /* 0x7C-0x7F */ + + 0xC1, 0x54, 0xC1, 0x55, 0xC1, 0x56, 0xC1, 0x57, /* 0x80-0x83 */ + 0xC7, 0xDC, 0xC7, 0xDD, 0xC1, 0x58, 0xC7, 0xDE, /* 0x84-0x87 */ + 0xC7, 0xDF, 0xC7, 0xE0, 0xC1, 0x59, 0xC1, 0x5A, /* 0x88-0x8B */ + 0xC1, 0x61, 0xC1, 0x62, 0xC1, 0x63, 0xC1, 0x64, /* 0x8C-0x8F */ + 0xC7, 0xE1, 0xC1, 0x65, 0xC1, 0x66, 0xC1, 0x67, /* 0x90-0x93 */ + 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, 0xC1, 0x6B, /* 0x94-0x97 */ + 0xC1, 0x6C, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x6F, /* 0x98-0x9B */ + 
0xC1, 0x70, 0xC1, 0x71, 0xC1, 0x72, 0xC1, 0x73, /* 0x9C-0x9F */ + 0xC1, 0x74, 0xC1, 0x75, 0xC1, 0x76, 0xC1, 0x77, /* 0xA0-0xA3 */ + 0xC1, 0x78, 0xC7, 0xE2, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA4-0xA7 */ + 0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xA8-0xAB */ + 0xC1, 0x85, 0xC1, 0x86, 0xC1, 0x87, 0xC1, 0x88, /* 0xAC-0xAF */ + 0xC1, 0x89, 0xC1, 0x8A, 0xC1, 0x8B, 0xC1, 0x8C, /* 0xB0-0xB3 */ + 0xC1, 0x8D, 0xC1, 0x8E, 0xC1, 0x8F, 0xC1, 0x90, /* 0xB4-0xB7 */ + 0xC1, 0x91, 0xC1, 0x92, 0xC1, 0x93, 0xC1, 0x94, /* 0xB8-0xBB */ + 0xC1, 0x95, 0xC1, 0x96, 0xC1, 0x97, 0xC1, 0x98, /* 0xBC-0xBF */ + 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, 0xC1, 0x9C, /* 0xC0-0xC3 */ + 0xC1, 0x9D, 0xC1, 0x9E, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xC4-0xC7 */ + 0xC7, 0xE3, 0xC7, 0xE4, 0xC2, 0x41, 0xC2, 0x42, /* 0xC8-0xCB */ + 0xC7, 0xE5, 0xC2, 0x43, 0xC2, 0x44, 0xC2, 0x45, /* 0xCC-0xCF */ + 0xC7, 0xE6, 0xC2, 0x46, 0xC7, 0xE7, 0xC2, 0x47, /* 0xD0-0xD3 */ + 0xC2, 0x48, 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, /* 0xD4-0xD7 */ + 0xC7, 0xE8, 0xC7, 0xE9, 0xC2, 0x4C, 0xC7, 0xEA, /* 0xD8-0xDB */ + 0xC2, 0x4D, 0xC7, 0xEB, 0xC2, 0x4E, 0xC2, 0x4F, /* 0xDC-0xDF */ + 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, 0xC2, 0x53, /* 0xE0-0xE3 */ + 0xC7, 0xEC, 0xC7, 0xED, 0xC2, 0x54, 0xC2, 0x55, /* 0xE4-0xE7 */ + 0xC7, 0xEE, 0xC2, 0x56, 0xC2, 0x57, 0xC2, 0x58, /* 0xE8-0xEB */ + 0xC7, 0xEF, 0xC2, 0x59, 0xC2, 0x5A, 0xC2, 0x61, /* 0xEC-0xEF */ + 0xC2, 0x62, 0xC2, 0x63, 0xC2, 0x64, 0xC2, 0x65, /* 0xF0-0xF3 */ + 0xC7, 0xF0, 0xC7, 0xF1, 0xC2, 0x66, 0xC7, 0xF2, /* 0xF4-0xF7 */ + 0xC2, 0x67, 0xC7, 0xF3, 0xC2, 0x68, 0xC2, 0x69, /* 0xF8-0xFB */ + 0xC2, 0x6A, 0xC2, 0x6B, 0xC2, 0x6C, 0xC2, 0x6D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D6[512] = { + 0xC7, 0xF4, 0xC7, 0xF5, 0xC2, 0x6E, 0xC2, 0x6F, /* 0x00-0x03 */ + 0xC7, 0xF6, 0xC2, 0x70, 0xC2, 0x71, 0xC2, 0x72, /* 0x04-0x07 */ + 0xC7, 0xF7, 0xC2, 0x73, 0xC2, 0x74, 0xC2, 0x75, /* 0x08-0x0B */ + 0xC2, 0x76, 0xC2, 0x77, 0xC2, 0x78, 0xC2, 0x79, /* 0x0C-0x0F */ + 0xC7, 0xF8, 0xC7, 0xF9, 0xC2, 0x7A, 0xC7, 0xFA, /* 0x10-0x13 */ + 0xC7, 0xFB, 0xC7, 0xFC, 0xC2, 0x81, 0xC2, 0x82, /* 0x14-0x17 */ + 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, 0xC2, 0x86, /* 0x18-0x1B */ + 0xC7, 0xFD, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x1C-0x1F */ + 0xC7, 0xFE, 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, /* 0x20-0x23 */ + 0xC8, 0xA1, 0xC2, 0x8D, 0xC2, 0x8E, 0xC2, 0x8F, /* 0x24-0x27 */ + 0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x28-0x2B */ + 0xC2, 0x94, 0xC8, 0xA2, 0xC2, 0x95, 0xC2, 0x96, /* 0x2C-0x2F */ + 0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x30-0x33 */ + 0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x34-0x37 */ + 0xC8, 0xA3, 0xC8, 0xA4, 0xC2, 0x9F, 0xC2, 0xA0, /* 0x38-0x3B */ + 0xC8, 0xA5, 0xC3, 0x41, 0xC3, 0x42, 0xC3, 0x43, /* 0x3C-0x3F */ + 0xC8, 0xA6, 0xC3, 0x44, 0xC3, 0x45, 0xC3, 0x46, /* 0x40-0x43 */ + 0xC3, 0x47, 0xC8, 0xA7, 0xC3, 0x48, 0xC3, 0x49, /* 0x44-0x47 */ + 0xC8, 0xA8, 0xC8, 0xA9, 0xC3, 0x4A, 0xC8, 0xAA, /* 0x48-0x4B */ + 0xC3, 0x4B, 0xC8, 0xAB, 0xC3, 0x4C, 0xC3, 0x4D, /* 0x4C-0x4F */ + 0xC3, 0x4E, 0xC8, 0xAC, 0xC3, 0x4F, 0xC3, 0x50, /* 0x50-0x53 */ + 0xC8, 0xAD, 0xC8, 0xAE, 0xC3, 0x51, 0xC3, 0x52, /* 0x54-0x57 */ + 0xC8, 0xAF, 0xC3, 0x53, 0xC3, 0x54, 0xC3, 0x55, /* 0x58-0x5B */ + 0xC8, 0xB0, 0xC3, 0x56, 0xC3, 0x57, 0xC3, 0x58, /* 0x5C-0x5F */ + 0xC3, 0x59, 0xC3, 0x5A, 0xC3, 0x61, 0xC3, 0x62, /* 0x60-0x63 */ + 0xC3, 0x63, 0xC3, 0x64, 0xC3, 0x65, 0xC8, 0xB1, /* 0x64-0x67 */ + 0xC3, 0x66, 0xC8, 0xB2, 0xC3, 0x67, 0xC3, 0x68, /* 0x68-0x6B */ + 0xC3, 0x69, 0xC3, 0x6A, 0xC3, 0x6B, 0xC3, 0x6C, /* 0x6C-0x6F */ + 0xC8, 
0xB3, 0xC8, 0xB4, 0xC3, 0x6D, 0xC3, 0x6E, /* 0x70-0x73 */ + 0xC8, 0xB5, 0xC3, 0x6F, 0xC3, 0x70, 0xC3, 0x71, /* 0x74-0x77 */ + 0xC3, 0x72, 0xC3, 0x73, 0xC3, 0x74, 0xC3, 0x75, /* 0x78-0x7B */ + 0xC3, 0x76, 0xC3, 0x77, 0xC3, 0x78, 0xC3, 0x79, /* 0x7C-0x7F */ + + 0xC3, 0x7A, 0xC3, 0x81, 0xC3, 0x82, 0xC8, 0xB6, /* 0x80-0x83 */ + 0xC3, 0x83, 0xC8, 0xB7, 0xC3, 0x84, 0xC3, 0x85, /* 0x84-0x87 */ + 0xC3, 0x86, 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, /* 0x88-0x8B */ + 0xC8, 0xB8, 0xC8, 0xB9, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x8C-0x8F */ + 0xC8, 0xBA, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, /* 0x90-0x93 */ + 0xC8, 0xBB, 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, /* 0x94-0x97 */ + 0xC3, 0x92, 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, /* 0x98-0x9B */ + 0xC3, 0x96, 0xC8, 0xBC, 0xC3, 0x97, 0xC8, 0xBD, /* 0x9C-0x9F */ + 0xC3, 0x98, 0xC8, 0xBE, 0xC3, 0x99, 0xC3, 0x9A, /* 0xA0-0xA3 */ + 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, /* 0xA4-0xA7 */ + 0xC8, 0xBF, 0xC3, 0x9F, 0xC3, 0xA0, 0xC4, 0x41, /* 0xA8-0xAB */ + 0xC8, 0xC0, 0xC4, 0x42, 0xC4, 0x43, 0xC4, 0x44, /* 0xAC-0xAF */ + 0xC8, 0xC1, 0xC4, 0x45, 0xC4, 0x46, 0xC4, 0x47, /* 0xB0-0xB3 */ + 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, 0xC4, 0x4B, /* 0xB4-0xB7 */ + 0xC4, 0x4C, 0xC8, 0xC2, 0xC4, 0x4D, 0xC8, 0xC3, /* 0xB8-0xBB */ + 0xC4, 0x4E, 0xC4, 0x4F, 0xC4, 0x50, 0xC4, 0x51, /* 0xBC-0xBF */ + 0xC4, 0x52, 0xC4, 0x53, 0xC4, 0x54, 0xC4, 0x55, /* 0xC0-0xC3 */ + 0xC8, 0xC4, 0xC8, 0xC5, 0xC4, 0x56, 0xC4, 0x57, /* 0xC4-0xC7 */ + 0xC8, 0xC6, 0xC4, 0x58, 0xC4, 0x59, 0xC4, 0x5A, /* 0xC8-0xCB */ + 0xC8, 0xC7, 0xC4, 0x61, 0xC4, 0x62, 0xC4, 0x63, /* 0xCC-0xCF */ + 0xC4, 0x64, 0xC8, 0xC8, 0xC4, 0x65, 0xC4, 0x66, /* 0xD0-0xD3 */ + 0xC8, 0xC9, 0xC4, 0x67, 0xC4, 0x68, 0xC8, 0xCA, /* 0xD4-0xD7 */ + 0xC4, 0x69, 0xC8, 0xCB, 0xC4, 0x6A, 0xC4, 0x6B, /* 0xD8-0xDB */ + 0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xC4, 0x6F, /* 0xDC-0xDF */ + 0xC8, 0xCC, 0xC4, 0x70, 0xC4, 0x71, 0xC4, 0x72, /* 0xE0-0xE3 */ + 0xC8, 0xCD, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0xE4-0xE7 */ + 0xC8, 0xCE, 0xC4, 0x76, 0xC4, 0x77, 0xC4, 0x78, /* 0xE8-0xEB */ + 0xC4, 0x79, 0xC4, 0x7A, 0xC4, 0x81, 0xC4, 0x82, /* 0xEC-0xEF */ + 0xC8, 0xCF, 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, /* 0xF0-0xF3 */ + 0xC4, 0x86, 0xC8, 0xD0, 0xC4, 0x87, 0xC4, 0x88, /* 0xF4-0xF7 */ + 0xC4, 0x89, 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, /* 0xF8-0xFB */ + 0xC8, 0xD1, 0xC8, 0xD2, 0xC4, 0x8D, 0xC4, 0x8E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D7[512] = { + 0xC8, 0xD3, 0xC4, 0x8F, 0xC4, 0x90, 0xC4, 0x91, /* 0x00-0x03 */ + 0xC8, 0xD4, 0xC4, 0x92, 0xC4, 0x93, 0xC4, 0x94, /* 0x04-0x07 */ + 0xC4, 0x95, 0xC4, 0x96, 0xC4, 0x97, 0xC4, 0x98, /* 0x08-0x0B */ + 0xC4, 0x99, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0x0C-0x0F */ + 0xC4, 0x9D, 0xC8, 0xD5, 0xC4, 0x9E, 0xC4, 0x9F, /* 0x10-0x13 */ + 0xC4, 0xA0, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0x14-0x17 */ + 0xC8, 0xD6, 0xC8, 0xD7, 0xC5, 0x44, 0xC5, 0x45, /* 0x18-0x1B */ + 0xC8, 0xD8, 0xC5, 0x46, 0xC5, 0x47, 0xC5, 0x48, /* 0x1C-0x1F */ + 0xC8, 0xD9, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0x20-0x23 */ + 0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xC5, 0x4F, /* 0x24-0x27 */ + 0xC8, 0xDA, 0xC8, 0xDB, 0xC5, 0x50, 0xC8, 0xDC, /* 0x28-0x2B */ + 0xC5, 0x51, 0xC8, 0xDD, 0xC5, 0x52, 0xC5, 0x53, /* 0x2C-0x2F */ + 0xC5, 0x54, 0xC5, 0x55, 0xC5, 0x56, 0xC5, 0x57, /* 0x30-0x33 */ + 0xC8, 0xDE, 0xC8, 0xDF, 0xC5, 0x58, 0xC5, 0x59, /* 0x34-0x37 */ + 0xC8, 0xE0, 0xC5, 0x5A, 0xC5, 0x61, 0xC5, 0x62, /* 0x38-0x3B */ + 0xC8, 0xE1, 0xC5, 0x63, 0xC5, 0x64, 0xC5, 0x65, /* 0x3C-0x3F */ + 0xC5, 0x66, 0xC5, 0x67, 0xC5, 0x68, 0xC5, 0x69, /* 0x40-0x43 */ + 0xC8, 
0xE2, 0xC5, 0x6A, 0xC5, 0x6B, 0xC8, 0xE3, /* 0x44-0x47 */ + 0xC5, 0x6C, 0xC8, 0xE4, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x48-0x4B */ + 0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xC5, 0x72, /* 0x4C-0x4F */ + 0xC8, 0xE5, 0xC8, 0xE6, 0xC5, 0x73, 0xC5, 0x74, /* 0x50-0x53 */ + 0xC8, 0xE7, 0xC5, 0x75, 0xC8, 0xE8, 0xC8, 0xE9, /* 0x54-0x57 */ + 0xC8, 0xEA, 0xC8, 0xEB, 0xC5, 0x76, 0xC5, 0x77, /* 0x58-0x5B */ + 0xC5, 0x78, 0xC5, 0x79, 0xC5, 0x7A, 0xC5, 0x81, /* 0x5C-0x5F */ + 0xC8, 0xEC, 0xC8, 0xED, 0xC5, 0x82, 0xC8, 0xEE, /* 0x60-0x63 */ + 0xC5, 0x83, 0xC8, 0xEF, 0xC5, 0x84, 0xC5, 0x85, /* 0x64-0x67 */ + 0xC5, 0x86, 0xC8, 0xF0, 0xC5, 0x87, 0xC5, 0x88, /* 0x68-0x6B */ + 0xC8, 0xF1, 0xC5, 0x89, 0xC5, 0x8A, 0xC5, 0x8B, /* 0x6C-0x6F */ + 0xC8, 0xF2, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x70-0x73 */ + 0xC8, 0xF3, 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, /* 0x74-0x77 */ + 0xC5, 0x92, 0xC5, 0x93, 0xC5, 0x94, 0xC5, 0x95, /* 0x78-0x7B */ + 0xC8, 0xF4, 0xC8, 0xF5, 0xC5, 0x96, 0xC5, 0x97, /* 0x7C-0x7F */ + + 0xC5, 0x98, 0xC8, 0xF6, 0xC5, 0x99, 0xC5, 0x9A, /* 0x80-0x83 */ + 0xC5, 0x9B, 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, /* 0x84-0x87 */ + 0xC8, 0xF7, 0xC8, 0xF8, 0xC5, 0x9F, 0xC5, 0xA0, /* 0x88-0x8B */ + 0xC8, 0xF9, 0xC6, 0x41, 0xC6, 0x42, 0xC6, 0x43, /* 0x8C-0x8F */ + 0xC8, 0xFA, 0xC6, 0x44, 0xC6, 0x45, 0xC6, 0x46, /* 0x90-0x93 */ + 0xC6, 0x47, 0xC6, 0x48, 0xC6, 0x49, 0xC6, 0x4A, /* 0x94-0x97 */ + 0xC8, 0xFB, 0xC8, 0xFC, 0xC6, 0x4B, 0xC8, 0xFD, /* 0x98-0x9B */ + 0xC6, 0x4C, 0xC8, 0xFE, 0xC6, 0x4D, 0xC6, 0x4E, /* 0x9C-0x9F */ + 0xC6, 0x4F, 0xC6, 0x50, 0xC6, 0x51, 0xC6, 0x52, /* 0xA0-0xA3 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xCB, 0xD0, 0xCB, 0xD6, 0xCB, 0xE7, 0xCD, 0xCF, /* 0x00-0x03 */ + 0xCD, 0xE8, 0xCE, 0xAD, 0xCF, 0xFB, 0xD0, 0xA2, /* 0x04-0x07 */ + 0xD0, 0xB8, 0xD0, 0xD0, 0xD0, 0xDD, 0xD1, 0xD4, /* 0x08-0x0B */ + 0xD1, 0xD5, 0xD1, 0xD8, 0xD1, 0xDB, 0xD1, 0xDC, /* 0x0C-0x0F */ + 0xD1, 0xDD, 0xD1, 0xDE, 0xD1, 0xDF, 0xD1, 0xE0, /* 0x10-0x13 */ + 0xD1, 0xE2, 0xD1, 0xE3, 0xD1, 0xE4, 0xD1, 0xE5, /* 0x14-0x17 */ + 0xD1, 0xE6, 0xD1, 0xE8, 0xD1, 0xE9, 0xD1, 0xEA, /* 0x18-0x1B */ + 0xD1, 0xEB, 0xD1, 0xED, 0xD1, 0xEF, 0xD1, 0xF0, /* 0x1C-0x1F */ + 0xD1, 0xF2, 0xD1, 0xF6, 0xD1, 0xFA, 0xD1, 0xFC, /* 0x20-0x23 */ + 0xD1, 0xFD, 0xD1, 0xFE, 0xD2, 0xA2, 0xD2, 0xA3, /* 0x24-0x27 */ + 0xD2, 0xA7, 0xD2, 0xA8, 0xD2, 0xA9, 0xD2, 0xAA, /* 0x28-0x2B */ + 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, 0xB2, 0xD2, 0xBE, /* 0x2C-0x2F */ + 0xD2, 0xC2, 0xD2, 0xC3, 0xD2, 0xC4, 0xD2, 0xC6, /* 0x30-0x33 */ + 0xD2, 0xC7, 0xD2, 0xC8, 0xD2, 0xC9, 0xD2, 0xCA, /* 0x34-0x37 */ + 0xD2, 0xCB, 0xD2, 0xCD, 0xD2, 0xCE, 0xD2, 0xCF, /* 0x38-0x3B */ + 0xD2, 0xD0, 0xD2, 0xD1, 0xD2, 0xD2, 0xD2, 0xD3, /* 0x3C-0x3F */ + 0xD2, 0xD4, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xD7, /* 0x40-0x43 */ + 0xD2, 0xD9, 0xD2, 0xDA, 0xD2, 0xDE, 0xD2, 0xDF, /* 0x44-0x47 */ + 0xD2, 0xE1, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE5, /* 0x48-0x4B */ + 0xD2, 0xE6, 0xD2, 0xE7, 0xD2, 0xE8, 0xD2, 0xE9, /* 0x4C-0x4F */ + 0xD2, 0xEA, 0xD2, 0xEB, 0xD2, 0xF0, 0xD2, 0xF1, /* 0x50-0x53 */ + 0xD2, 0xF2, 0xD2, 0xF3, 0xD2, 0xF4, 0xD2, 0xF5, /* 0x54-0x57 */ + 0xD2, 0xF7, 0xD2, 0xF8, 0xD4, 0xE6, 0xD4, 0xFC, /* 0x58-0x5B */ + 0xD5, 0xA5, 0xD5, 0xAB, 0xD5, 0xAE, 0xD6, 0xB8, /* 0x5C-0x5F */ + 0xD6, 0xCD, 0xD7, 0xCB, 0xD7, 0xE4, 0xDB, 0xC5, /* 0x60-0x63 */ + 0xDB, 0xE4, 0xDC, 0xA5, 0xDD, 0xA5, 0xDD, 0xD5, /* 0x64-0x67 */ + 0xDD, 0xF4, 0xDE, 0xFC, 0xDE, 0xFE, 0xDF, 0xB3, /* 0x68-0x6B */ + 0xDF, 0xE1, 0xDF, 0xE8, 
0xE0, 0xF1, 0xE1, 0xAD, /* 0x6C-0x6F */ + 0xE1, 0xED, 0xE3, 0xF5, 0xE4, 0xA1, 0xE4, 0xA9, /* 0x70-0x73 */ + 0xE5, 0xAE, 0xE5, 0xB1, 0xE5, 0xB2, 0xE5, 0xB9, /* 0x74-0x77 */ + 0xE5, 0xBB, 0xE5, 0xBC, 0xE5, 0xC4, 0xE5, 0xCE, /* 0x78-0x7B */ + 0xE5, 0xD0, 0xE5, 0xD2, 0xE5, 0xD6, 0xE5, 0xFA, /* 0x7C-0x7F */ + + 0xE5, 0xFB, 0xE5, 0xFC, 0xE5, 0xFE, 0xE6, 0xA1, /* 0x80-0x83 */ + 0xE6, 0xA4, 0xE6, 0xA7, 0xE6, 0xAD, 0xE6, 0xAF, /* 0x84-0x87 */ + 0xE6, 0xB0, 0xE6, 0xB1, 0xE6, 0xB3, 0xE6, 0xB7, /* 0x88-0x8B */ + 0xE6, 0xB8, 0xE6, 0xBC, 0xE6, 0xC4, 0xE6, 0xC6, /* 0x8C-0x8F */ + 0xE6, 0xC7, 0xE6, 0xCA, 0xE6, 0xD2, 0xE6, 0xD6, /* 0x90-0x93 */ + 0xE6, 0xD9, 0xE6, 0xDC, 0xE6, 0xDF, 0xE6, 0xE1, /* 0x94-0x97 */ + 0xE6, 0xE4, 0xE6, 0xE5, 0xE6, 0xE6, 0xE6, 0xE8, /* 0x98-0x9B */ + 0xE6, 0xEA, 0xE6, 0xEB, 0xE6, 0xEC, 0xE6, 0xEF, /* 0x9C-0x9F */ + 0xE6, 0xF1, 0xE6, 0xF2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */ + 0xE6, 0xF7, 0xE6, 0xF9, 0xE7, 0xA1, 0xE7, 0xA6, /* 0xA4-0xA7 */ + 0xE7, 0xA9, 0xE7, 0xAA, 0xE7, 0xAC, 0xE7, 0xAD, /* 0xA8-0xAB */ + 0xE7, 0xB0, 0xE7, 0xBF, 0xE7, 0xC1, 0xE7, 0xC6, /* 0xAC-0xAF */ + 0xE7, 0xC7, 0xE7, 0xCB, 0xE7, 0xCD, 0xE7, 0xCF, /* 0xB0-0xB3 */ + 0xE7, 0xD0, 0xE7, 0xD3, 0xE7, 0xDF, 0xE7, 0xE4, /* 0xB4-0xB7 */ + 0xE7, 0xE6, 0xE7, 0xF7, 0xE8, 0xE7, 0xE8, 0xE8, /* 0xB8-0xBB */ + 0xE8, 0xF0, 0xE8, 0xF1, 0xE8, 0xF7, 0xE8, 0xF9, /* 0xBC-0xBF */ + 0xE8, 0xFB, 0xE8, 0xFE, 0xE9, 0xA7, 0xE9, 0xAC, /* 0xC0-0xC3 */ + 0xE9, 0xCC, 0xE9, 0xF7, 0xEA, 0xC1, 0xEA, 0xE5, /* 0xC4-0xC7 */ + 0xEA, 0xF4, 0xEA, 0xF7, 0xEA, 0xFC, 0xEA, 0xFE, /* 0xC8-0xCB */ + 0xEB, 0xA4, 0xEB, 0xA7, 0xEB, 0xA9, 0xEB, 0xAA, /* 0xCC-0xCF */ + 0xEB, 0xBA, 0xEB, 0xBB, 0xEB, 0xBD, 0xEB, 0xC1, /* 0xD0-0xD3 */ + 0xEB, 0xC2, 0xEB, 0xC6, 0xEB, 0xC7, 0xEB, 0xCC, /* 0xD4-0xD7 */ + 0xEB, 0xCF, 0xEB, 0xD0, 0xEB, 0xD1, 0xEB, 0xD2, /* 0xD8-0xDB */ + 0xEB, 0xD8, 0xEC, 0xA6, 0xEC, 0xA7, 0xEC, 0xAA, /* 0xDC-0xDF */ + 0xEC, 0xAF, 0xEC, 0xB0, 0xEC, 0xB1, 0xEC, 0xB2, /* 0xE0-0xE3 */ + 0xEC, 0xB5, 0xEC, 0xB8, 0xEC, 0xBA, 0xEC, 0xC0, /* 0xE4-0xE7 */ + 0xEC, 0xC1, 0xEC, 0xC5, 0xEC, 0xC6, 0xEC, 0xC9, /* 0xE8-0xEB */ + 0xEC, 0xCA, 0xEC, 0xD5, 0xEC, 0xDD, 0xEC, 0xDE, /* 0xEC-0xEF */ + 0xEC, 0xE1, 0xEC, 0xE4, 0xEC, 0xE7, 0xEC, 0xE8, /* 0xF0-0xF3 */ + 0xEC, 0xF7, 0xEC, 0xF8, 0xEC, 0xFA, 0xED, 0xA1, /* 0xF4-0xF7 */ + 0xED, 0xA2, 0xED, 0xA3, 0xED, 0xEE, 0xEE, 0xDB, /* 0xF8-0xFB */ + 0xF2, 0xBD, 0xF2, 0xFA, 0xF3, 0xB1, 0xF4, 0xA7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xF4, 0xEE, 0xF6, 0xF4, 0xF6, 0xF6, 0xF7, 0xB8, /* 0x00-0x03 */ + 0xF7, 0xC8, 0xF7, 0xD3, 0xF8, 0xDB, 0xF8, 0xF0, /* 0x04-0x07 */ + 0xFA, 0xA1, 0xFA, 0xA2, 0xFA, 0xE6, 0xFC, 0xA9, /* 0x08-0x0B */ + 0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0xC0, 0x00, 0x00, 0xF4, 0xE7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xFD, 0xEB, 0x00, 0x00, 0xEC, 0xCC, /* 0x14-0x17 */ + 0x00, 0x00, 0xE3, 0xEA, 0xDF, 0xD4, 0xDC, 0xD8, /* 0x18-0x1B */ + 0xEF, 0xFE, 0xEF, 0xF1, 0xE9, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xB3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xEC, 0xEF, 0xD4, 0xB4, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF9, 0xDE, 0xF8, /* 0x28-0x2B */ + 0xCE, 0xBD, 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */ + 0xA3, 0xA4, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */ + 0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */ + 0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 
0xAF, /* 0x0C-0x0F */ + 0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */ + 0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */ + 0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */ + 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */ + 0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */ + 0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */ + 0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */ + 0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */ + 0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */ + 0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */ + 0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */ + 0xA1, 0xAC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */ + 0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */ + 0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */ + 0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */ + 0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */ + 0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */ + 0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */ + 0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */ + 0xA3, 0xFC, 0xA3, 0xFD, 0xA2, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA4, 0xD4, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0xA0-0xA3 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0xA4-0xA7 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0xA8-0xAB */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0xAC-0xAF */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xB0-0xB3 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0xB4-0xB7 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xB8-0xBB */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, /* 0xC0-0xC3 */ + 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xC5, 0xA4, 0xC6, /* 0xC8-0xCB */ + 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xCB, 0xA4, 0xCC, /* 0xD0-0xD3 */ + 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD2, /* 0xD8-0xDB */ + 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA1, 0xCB, 0xA1, 0xCC, 0xA1, 0xFE, 0xA3, 0xFE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA1, 0xCD, 0xA3, 
0xDC, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, u2c_01, u2c_02, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_11, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_AC, u2c_AD, u2c_AE, u2c_AF, + u2c_B0, u2c_B1, u2c_B2, u2c_B3, u2c_B4, u2c_B5, u2c_B6, u2c_B7, + u2c_B8, u2c_B9, u2c_BA, u2c_BB, u2c_BC, u2c_BD, u2c_BE, u2c_BF, + u2c_C0, u2c_C1, u2c_C2, u2c_C3, u2c_C4, u2c_C5, u2c_C6, u2c_C7, + u2c_C8, u2c_C9, u2c_CA, u2c_CB, u2c_CC, u2c_CD, u2c_CE, u2c_CF, + u2c_D0, u2c_D1, u2c_D2, u2c_D3, u2c_D4, u2c_D5, u2c_D6, u2c_D7, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, NULL, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 
/* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + n = 2; + } else if (ch==0 && cl) { + out[0] = cl; + n = 1; + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return 
-ENAMETOOLONG; + + if (boundlen == 1) { + *uni = rawstring[0]; + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else { + *uni = ch; + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp949", + .alias = "euc-kr", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp949(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp949(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp949) +module_exit(exit_nls_cp949) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(euc-kr); diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c new file mode 100644 index 00000000..ef253682 --- /dev/null +++ b/fs/nls/nls_cp950.c @@ -0,0 +1,9483 @@ +/* + * linux/fs/nls/nls_cp950.c + * + * Charset cp950 translation tables. + * This translation table was generated automatically; the + * original table can be downloaded from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3000,0xFF0C,0x3001,0x3002,0xFF0E,0x2027,0xFF1B,0xFF1A,/* 0x40-0x47 */ + 0xFF1F,0xFF01,0xFE30,0x2026,0x2025,0xFE50,0xFE51,0xFE52,/* 0x48-0x4F */ + 0x00B7,0xFE54,0xFE55,0xFE56,0xFE57,0xFF5C,0x2013,0xFE31,/* 0x50-0x57 */ + 0x2014,0xFE33,0x2574,0xFE34,0xFE4F,0xFF08,0xFF09,0xFE35,/* 0x58-0x5F */ + 0xFE36,0xFF5B,0xFF5D,0xFE37,0xFE38,0x3014,0x3015,0xFE39,/* 0x60-0x67 */ + 0xFE3A,0x3010,0x3011,0xFE3B,0xFE3C,0x300A,0x300B,0xFE3D,/* 0x68-0x6F */ + 0xFE3E,0x3008,0x3009,0xFE3F,0xFE40,0x300C,0x300D,0xFE41,/* 0x70-0x77 */ + 0xFE42,0x300E,0x300F,0xFE43,0xFE44,0xFE59,0xFE5A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFE5B,0xFE5C,0xFE5D,0xFE5E,0x2018,0x2019,0x201C,/* 0xA0-0xA7 */ + 0x201D,0x301D,0x301E,0x2035,0x2032,0xFF03,0xFF06,0xFF0A,/* 0xA8-0xAF */ + 0x203B,0x00A7,0x3003,0x25CB,0x25CF,0x25B3,0x25B2,0x25CE,/* 0xB0-0xB7 */ + 0x2606,0x2605,0x25C7,0x25C6,0x25A1,0x25A0,0x25BD,0x25BC,/* 0xB8-0xBF */ + 0x32A3,0x2105,0x00AF,0xFFE3,0xFF3F,0x02CD,0xFE49,0xFE4A,/* 0xC0-0xC7 */ + 0xFE4D,0xFE4E,0xFE4B,0xFE4C,0xFE5F,0xFE60,0xFE61,0xFF0B,/* 0xC8-0xCF */ + 0xFF0D,0x00D7,0x00F7,0x00B1,0x221A,0xFF1C,0xFF1E,0xFF1D,/* 0xD0-0xD7 */ + 0x2266,0x2267,0x2260,0x221E,0x2252,0x2261,0xFE62,0xFE63,/* 0xD8-0xDF */ + 0xFE64,0xFE65,0xFE66,0xFF5E,0x2229,0x222A,0x22A5,0x2220,/* 0xE0-0xE7 */ + 0x221F,0x22BF,0x33D2,0x33D1,0x222B,0x222E,0x2235,0x2234,/* 0xE8-0xEF */
+ 0x2640,0x2642,0x2295,0x2299,0x2191,0x2193,0x2190,0x2192,/* 0xF0-0xF7 */ + 0x2196,0x2197,0x2199,0x2198,0x2225,0x2223,0xFF0F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFF3C,0x2215,0xFE68,0xFF04,0xFFE5,0x3012,0xFFE0,0xFFE1,/* 0x40-0x47 */ + 0xFF05,0xFF20,0x2103,0x2109,0xFE69,0xFE6A,0xFE6B,0x33D5,/* 0x48-0x4F */ + 0x339C,0x339D,0x339E,0x33CE,0x33A1,0x338E,0x338F,0x33C4,/* 0x50-0x57 */ + 0x00B0,0x5159,0x515B,0x515E,0x515D,0x5161,0x5163,0x55E7,/* 0x58-0x5F */ + 0x74E9,0x7CCE,0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,/* 0x60-0x67 */ + 0x2587,0x2588,0x258F,0x258E,0x258D,0x258C,0x258B,0x258A,/* 0x68-0x6F */ + 0x2589,0x253C,0x2534,0x252C,0x2524,0x251C,0x2594,0x2500,/* 0x70-0x77 */ + 0x2502,0x2595,0x250C,0x2510,0x2514,0x2518,0x256D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x256E,0x2570,0x256F,0x2550,0x255E,0x256A,0x2561,/* 0xA0-0xA7 */ + 0x25E2,0x25E3,0x25E5,0x25E4,0x2571,0x2572,0x2573,0xFF10,/* 0xA8-0xAF */ + 0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0xB0-0xB7 */ + 0xFF19,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xB8-0xBF */ + 0x2167,0x2168,0x2169,0x3021,0x3022,0x3023,0x3024,0x3025,/* 0xC0-0xC7 */ + 0x3026,0x3027,0x3028,0x3029,0x5341,0x5344,0x5345,0xFF21,/* 0xC8-0xCF */ + 0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,0xFF29,/* 0xD0-0xD7 */ + 0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,0xFF31,/* 0xD8-0xDF */ + 0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,0xFF39,/* 0xE0-0xE7 */ + 0xFF3A,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE8-0xEF */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xF0-0xF7 */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFF57,0xFF58,0xFF59,0xFF5A,0x0391,0x0392,0x0393,0x0394,/* 0x40-0x47 */ + 0x0395,0x0396,0x0397,0x0398,0x0399,0x039A,0x039B,0x039C,/* 0x48-0x4F */ + 0x039D,0x039E,0x039F,0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,/* 0x50-0x57 */ + 0x03A6,0x03A7,0x03A8,0x03A9,0x03B1,0x03B2,0x03B3,0x03B4,/* 0x58-0x5F */ + 0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,/* 
0x60-0x67 */ + 0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,/* 0x68-0x6F */ + 0x03C6,0x03C7,0x03C8,0x03C9,0x3105,0x3106,0x3107,0x3108,/* 0x70-0x77 */ + 0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,/* 0xA0-0xA7 */ + 0x3117,0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,/* 0xA8-0xAF */ + 0x311F,0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,/* 0xB0-0xB7 */ + 0x3127,0x3128,0x3129,0x02D9,0x02C9,0x02CA,0x02C7,0x02CB,/* 0xB8-0xBF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0x0000,0x20AC,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E00,0x4E59,0x4E01,0x4E03,0x4E43,0x4E5D,0x4E86,0x4E8C,/* 0x40-0x47 */ + 0x4EBA,0x513F,0x5165,0x516B,0x51E0,0x5200,0x5201,0x529B,/* 0x48-0x4F */ + 0x5315,0x5341,0x535C,0x53C8,0x4E09,0x4E0B,0x4E08,0x4E0A,/* 0x50-0x57 */ + 0x4E2B,0x4E38,0x51E1,0x4E45,0x4E48,0x4E5F,0x4E5E,0x4E8E,/* 0x58-0x5F */ + 0x4EA1,0x5140,0x5203,0x52FA,0x5343,0x53C9,0x53E3,0x571F,/* 0x60-0x67 */ + 0x58EB,0x5915,0x5927,0x5973,0x5B50,0x5B51,0x5B53,0x5BF8,/* 0x68-0x6F */ + 0x5C0F,0x5C22,0x5C38,0x5C71,0x5DDD,0x5DE5,0x5DF1,0x5DF2,/* 0x70-0x77 */ + 0x5DF3,0x5DFE,0x5E72,0x5EFE,0x5F0B,0x5F13,0x624D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4E11,0x4E10,0x4E0D,0x4E2D,0x4E30,0x4E39,0x4E4B,/* 0xA0-0xA7 */ + 0x5C39,0x4E88,0x4E91,0x4E95,0x4E92,0x4E94,0x4EA2,0x4EC1,/* 0xA8-0xAF */ + 0x4EC0,0x4EC3,0x4EC6,0x4EC7,0x4ECD,0x4ECA,0x4ECB,0x4EC4,/* 0xB0-0xB7 */ + 0x5143,0x5141,0x5167,0x516D,0x516E,0x516C,0x5197,0x51F6,/* 0xB8-0xBF */ + 0x5206,0x5207,0x5208,0x52FB,0x52FE,0x52FF,0x5316,0x5339,/* 0xC0-0xC7 */ + 0x5348,0x5347,0x5345,0x535E,0x5384,0x53CB,0x53CA,0x53CD,/* 0xC8-0xCF */ + 0x58EC,0x5929,0x592B,0x592A,0x592D,0x5B54,0x5C11,0x5C24,/* 0xD0-0xD7 */ + 0x5C3A,0x5C6F,0x5DF4,0x5E7B,0x5EFF,0x5F14,0x5F15,0x5FC3,/* 0xD8-0xDF */ + 0x6208,0x6236,0x624B,0x624E,0x652F,0x6587,0x6597,0x65A4,/* 0xE0-0xE7 */ + 0x65B9,0x65E5,0x66F0,0x6708,0x6728,0x6B20,0x6B62,0x6B79,/* 0xE8-0xEF */ + 0x6BCB,0x6BD4,0x6BDB,0x6C0F,0x6C34,0x706B,0x722A,0x7236,/* 0xF0-0xF7 */ + 
0x723B,0x7247,0x7259,0x725B,0x72AC,0x738B,0x4E19,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E16,0x4E15,0x4E14,0x4E18,0x4E3B,0x4E4D,0x4E4F,0x4E4E,/* 0x40-0x47 */ + 0x4EE5,0x4ED8,0x4ED4,0x4ED5,0x4ED6,0x4ED7,0x4EE3,0x4EE4,/* 0x48-0x4F */ + 0x4ED9,0x4EDE,0x5145,0x5144,0x5189,0x518A,0x51AC,0x51F9,/* 0x50-0x57 */ + 0x51FA,0x51F8,0x520A,0x52A0,0x529F,0x5305,0x5306,0x5317,/* 0x58-0x5F */ + 0x531D,0x4EDF,0x534A,0x5349,0x5361,0x5360,0x536F,0x536E,/* 0x60-0x67 */ + 0x53BB,0x53EF,0x53E4,0x53F3,0x53EC,0x53EE,0x53E9,0x53E8,/* 0x68-0x6F */ + 0x53FC,0x53F8,0x53F5,0x53EB,0x53E6,0x53EA,0x53F2,0x53F1,/* 0x70-0x77 */ + 0x53F0,0x53E5,0x53ED,0x53FB,0x56DB,0x56DA,0x5916,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x592E,0x5931,0x5974,0x5976,0x5B55,0x5B83,0x5C3C,/* 0xA0-0xA7 */ + 0x5DE8,0x5DE7,0x5DE6,0x5E02,0x5E03,0x5E73,0x5E7C,0x5F01,/* 0xA8-0xAF */ + 0x5F18,0x5F17,0x5FC5,0x620A,0x6253,0x6254,0x6252,0x6251,/* 0xB0-0xB7 */ + 0x65A5,0x65E6,0x672E,0x672C,0x672A,0x672B,0x672D,0x6B63,/* 0xB8-0xBF */ + 0x6BCD,0x6C11,0x6C10,0x6C38,0x6C41,0x6C40,0x6C3E,0x72AF,/* 0xC0-0xC7 */ + 0x7384,0x7389,0x74DC,0x74E6,0x7518,0x751F,0x7528,0x7529,/* 0xC8-0xCF */ + 0x7530,0x7531,0x7532,0x7533,0x758B,0x767D,0x76AE,0x76BF,/* 0xD0-0xD7 */ + 0x76EE,0x77DB,0x77E2,0x77F3,0x793A,0x79BE,0x7A74,0x7ACB,/* 0xD8-0xDF */ + 0x4E1E,0x4E1F,0x4E52,0x4E53,0x4E69,0x4E99,0x4EA4,0x4EA6,/* 0xE0-0xE7 */ + 0x4EA5,0x4EFF,0x4F09,0x4F19,0x4F0A,0x4F15,0x4F0D,0x4F10,/* 0xE8-0xEF */ + 0x4F11,0x4F0F,0x4EF2,0x4EF6,0x4EFB,0x4EF0,0x4EF3,0x4EFD,/* 0xF0-0xF7 */ + 0x4F01,0x4F0B,0x5149,0x5147,0x5146,0x5148,0x5168,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5171,0x518D,0x51B0,0x5217,0x5211,0x5212,0x520E,0x5216,/* 0x40-0x47 */ + 0x52A3,0x5308,0x5321,0x5320,0x5370,0x5371,0x5409,0x540F,/* 0x48-0x4F */ + 0x540C,0x540A,0x5410,0x5401,0x540B,0x5404,0x5411,0x540D,/* 0x50-0x57 */ + 0x5408,0x5403,0x540E,0x5406,0x5412,0x56E0,0x56DE,0x56DD,/* 0x58-0x5F */ + 0x5733,0x5730,0x5728,0x572D,0x572C,0x572F,0x5729,0x5919,/* 0x60-0x67 */ + 0x591A,0x5937,0x5938,0x5984,0x5978,0x5983,0x597D,0x5979,/* 
0x68-0x6F */ + 0x5982,0x5981,0x5B57,0x5B58,0x5B87,0x5B88,0x5B85,0x5B89,/* 0x70-0x77 */ + 0x5BFA,0x5C16,0x5C79,0x5DDE,0x5E06,0x5E76,0x5E74,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5F0F,0x5F1B,0x5FD9,0x5FD6,0x620E,0x620C,0x620D,/* 0xA0-0xA7 */ + 0x6210,0x6263,0x625B,0x6258,0x6536,0x65E9,0x65E8,0x65EC,/* 0xA8-0xAF */ + 0x65ED,0x66F2,0x66F3,0x6709,0x673D,0x6734,0x6731,0x6735,/* 0xB0-0xB7 */ + 0x6B21,0x6B64,0x6B7B,0x6C16,0x6C5D,0x6C57,0x6C59,0x6C5F,/* 0xB8-0xBF */ + 0x6C60,0x6C50,0x6C55,0x6C61,0x6C5B,0x6C4D,0x6C4E,0x7070,/* 0xC0-0xC7 */ + 0x725F,0x725D,0x767E,0x7AF9,0x7C73,0x7CF8,0x7F36,0x7F8A,/* 0xC8-0xCF */ + 0x7FBD,0x8001,0x8003,0x800C,0x8012,0x8033,0x807F,0x8089,/* 0xD0-0xD7 */ + 0x808B,0x808C,0x81E3,0x81EA,0x81F3,0x81FC,0x820C,0x821B,/* 0xD8-0xDF */ + 0x821F,0x826E,0x8272,0x827E,0x866B,0x8840,0x884C,0x8863,/* 0xE0-0xE7 */ + 0x897F,0x9621,0x4E32,0x4EA8,0x4F4D,0x4F4F,0x4F47,0x4F57,/* 0xE8-0xEF */ + 0x4F5E,0x4F34,0x4F5B,0x4F55,0x4F30,0x4F50,0x4F51,0x4F3D,/* 0xF0-0xF7 */ + 0x4F3A,0x4F38,0x4F43,0x4F54,0x4F3C,0x4F46,0x4F63,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4F5C,0x4F60,0x4F2F,0x4F4E,0x4F36,0x4F59,0x4F5D,0x4F48,/* 0x40-0x47 */ + 0x4F5A,0x514C,0x514B,0x514D,0x5175,0x51B6,0x51B7,0x5225,/* 0x48-0x4F */ + 0x5224,0x5229,0x522A,0x5228,0x52AB,0x52A9,0x52AA,0x52AC,/* 0x50-0x57 */ + 0x5323,0x5373,0x5375,0x541D,0x542D,0x541E,0x543E,0x5426,/* 0x58-0x5F */ + 0x544E,0x5427,0x5446,0x5443,0x5433,0x5448,0x5442,0x541B,/* 0x60-0x67 */ + 0x5429,0x544A,0x5439,0x543B,0x5438,0x542E,0x5435,0x5436,/* 0x68-0x6F */ + 0x5420,0x543C,0x5440,0x5431,0x542B,0x541F,0x542C,0x56EA,/* 0x70-0x77 */ + 0x56F0,0x56E4,0x56EB,0x574A,0x5751,0x5740,0x574D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5747,0x574E,0x573E,0x5750,0x574F,0x573B,0x58EF,/* 0xA0-0xA7 */ + 0x593E,0x599D,0x5992,0x59A8,0x599E,0x59A3,0x5999,0x5996,/* 0xA8-0xAF */ + 0x598D,0x59A4,0x5993,0x598A,0x59A5,0x5B5D,0x5B5C,0x5B5A,/* 0xB0-0xB7 */ + 0x5B5B,0x5B8C,0x5B8B,0x5B8F,0x5C2C,0x5C40,0x5C41,0x5C3F,/* 0xB8-0xBF */ + 0x5C3E,0x5C90,0x5C91,0x5C94,0x5C8C,0x5DEB,0x5E0C,0x5E8F,/* 0xC0-0xC7 */ + 0x5E87,0x5E8A,0x5EF7,0x5F04,0x5F1F,0x5F64,0x5F62,0x5F77,/* 0xC8-0xCF */ + 0x5F79,0x5FD8,0x5FCC,0x5FD7,0x5FCD,0x5FF1,0x5FEB,0x5FF8,/* 0xD0-0xD7 */ + 0x5FEA,0x6212,0x6211,0x6284,0x6297,0x6296,0x6280,0x6276,/* 0xD8-0xDF */ + 0x6289,0x626D,0x628A,0x627C,0x627E,0x6279,0x6273,0x6292,/* 0xE0-0xE7 */ + 
0x626F,0x6298,0x626E,0x6295,0x6293,0x6291,0x6286,0x6539,/* 0xE8-0xEF */ + 0x653B,0x6538,0x65F1,0x66F4,0x675F,0x674E,0x674F,0x6750,/* 0xF0-0xF7 */ + 0x6751,0x675C,0x6756,0x675E,0x6749,0x6746,0x6760,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6753,0x6757,0x6B65,0x6BCF,0x6C42,0x6C5E,0x6C99,0x6C81,/* 0x40-0x47 */ + 0x6C88,0x6C89,0x6C85,0x6C9B,0x6C6A,0x6C7A,0x6C90,0x6C70,/* 0x48-0x4F */ + 0x6C8C,0x6C68,0x6C96,0x6C92,0x6C7D,0x6C83,0x6C72,0x6C7E,/* 0x50-0x57 */ + 0x6C74,0x6C86,0x6C76,0x6C8D,0x6C94,0x6C98,0x6C82,0x7076,/* 0x58-0x5F */ + 0x707C,0x707D,0x7078,0x7262,0x7261,0x7260,0x72C4,0x72C2,/* 0x60-0x67 */ + 0x7396,0x752C,0x752B,0x7537,0x7538,0x7682,0x76EF,0x77E3,/* 0x68-0x6F */ + 0x79C1,0x79C0,0x79BF,0x7A76,0x7CFB,0x7F55,0x8096,0x8093,/* 0x70-0x77 */ + 0x809D,0x8098,0x809B,0x809A,0x80B2,0x826F,0x8292,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x828B,0x828D,0x898B,0x89D2,0x8A00,0x8C37,0x8C46,/* 0xA0-0xA7 */ + 0x8C55,0x8C9D,0x8D64,0x8D70,0x8DB3,0x8EAB,0x8ECA,0x8F9B,/* 0xA8-0xAF */ + 0x8FB0,0x8FC2,0x8FC6,0x8FC5,0x8FC4,0x5DE1,0x9091,0x90A2,/* 0xB0-0xB7 */ + 0x90AA,0x90A6,0x90A3,0x9149,0x91C6,0x91CC,0x9632,0x962E,/* 0xB8-0xBF */ + 0x9631,0x962A,0x962C,0x4E26,0x4E56,0x4E73,0x4E8B,0x4E9B,/* 0xC0-0xC7 */ + 0x4E9E,0x4EAB,0x4EAC,0x4F6F,0x4F9D,0x4F8D,0x4F73,0x4F7F,/* 0xC8-0xCF */ + 0x4F6C,0x4F9B,0x4F8B,0x4F86,0x4F83,0x4F70,0x4F75,0x4F88,/* 0xD0-0xD7 */ + 0x4F69,0x4F7B,0x4F96,0x4F7E,0x4F8F,0x4F91,0x4F7A,0x5154,/* 0xD8-0xDF */ + 0x5152,0x5155,0x5169,0x5177,0x5176,0x5178,0x51BD,0x51FD,/* 0xE0-0xE7 */ + 0x523B,0x5238,0x5237,0x523A,0x5230,0x522E,0x5236,0x5241,/* 0xE8-0xEF */ + 0x52BE,0x52BB,0x5352,0x5354,0x5353,0x5351,0x5366,0x5377,/* 0xF0-0xF7 */ + 0x5378,0x5379,0x53D6,0x53D4,0x53D7,0x5473,0x5475,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5496,0x5478,0x5495,0x5480,0x547B,0x5477,0x5484,0x5492,/* 0x40-0x47 */ + 0x5486,0x547C,0x5490,0x5471,0x5476,0x548C,0x549A,0x5462,/* 0x48-0x4F */ + 0x5468,0x548B,0x547D,0x548E,0x56FA,0x5783,0x5777,0x576A,/* 0x50-0x57 */ + 0x5769,0x5761,0x5766,0x5764,0x577C,0x591C,0x5949,0x5947,/* 
0x58-0x5F */ + 0x5948,0x5944,0x5954,0x59BE,0x59BB,0x59D4,0x59B9,0x59AE,/* 0x60-0x67 */ + 0x59D1,0x59C6,0x59D0,0x59CD,0x59CB,0x59D3,0x59CA,0x59AF,/* 0x68-0x6F */ + 0x59B3,0x59D2,0x59C5,0x5B5F,0x5B64,0x5B63,0x5B97,0x5B9A,/* 0x70-0x77 */ + 0x5B98,0x5B9C,0x5B99,0x5B9B,0x5C1A,0x5C48,0x5C45,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5C46,0x5CB7,0x5CA1,0x5CB8,0x5CA9,0x5CAB,0x5CB1,/* 0xA0-0xA7 */ + 0x5CB3,0x5E18,0x5E1A,0x5E16,0x5E15,0x5E1B,0x5E11,0x5E78,/* 0xA8-0xAF */ + 0x5E9A,0x5E97,0x5E9C,0x5E95,0x5E96,0x5EF6,0x5F26,0x5F27,/* 0xB0-0xB7 */ + 0x5F29,0x5F80,0x5F81,0x5F7F,0x5F7C,0x5FDD,0x5FE0,0x5FFD,/* 0xB8-0xBF */ + 0x5FF5,0x5FFF,0x600F,0x6014,0x602F,0x6035,0x6016,0x602A,/* 0xC0-0xC7 */ + 0x6015,0x6021,0x6027,0x6029,0x602B,0x601B,0x6216,0x6215,/* 0xC8-0xCF */ + 0x623F,0x623E,0x6240,0x627F,0x62C9,0x62CC,0x62C4,0x62BF,/* 0xD0-0xD7 */ + 0x62C2,0x62B9,0x62D2,0x62DB,0x62AB,0x62D3,0x62D4,0x62CB,/* 0xD8-0xDF */ + 0x62C8,0x62A8,0x62BD,0x62BC,0x62D0,0x62D9,0x62C7,0x62CD,/* 0xE0-0xE7 */ + 0x62B5,0x62DA,0x62B1,0x62D8,0x62D6,0x62D7,0x62C6,0x62AC,/* 0xE8-0xEF */ + 0x62CE,0x653E,0x65A7,0x65BC,0x65FA,0x6614,0x6613,0x660C,/* 0xF0-0xF7 */ + 0x6606,0x6602,0x660E,0x6600,0x660F,0x6615,0x660A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6607,0x670D,0x670B,0x676D,0x678B,0x6795,0x6771,0x679C,/* 0x40-0x47 */ + 0x6773,0x6777,0x6787,0x679D,0x6797,0x676F,0x6770,0x677F,/* 0x48-0x4F */ + 0x6789,0x677E,0x6790,0x6775,0x679A,0x6793,0x677C,0x676A,/* 0x50-0x57 */ + 0x6772,0x6B23,0x6B66,0x6B67,0x6B7F,0x6C13,0x6C1B,0x6CE3,/* 0x58-0x5F */ + 0x6CE8,0x6CF3,0x6CB1,0x6CCC,0x6CE5,0x6CB3,0x6CBD,0x6CBE,/* 0x60-0x67 */ + 0x6CBC,0x6CE2,0x6CAB,0x6CD5,0x6CD3,0x6CB8,0x6CC4,0x6CB9,/* 0x68-0x6F */ + 0x6CC1,0x6CAE,0x6CD7,0x6CC5,0x6CF1,0x6CBF,0x6CBB,0x6CE1,/* 0x70-0x77 */ + 0x6CDB,0x6CCA,0x6CAC,0x6CEF,0x6CDC,0x6CD6,0x6CE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7095,0x708E,0x7092,0x708A,0x7099,0x722C,0x722D,/* 0xA0-0xA7 */ + 0x7238,0x7248,0x7267,0x7269,0x72C0,0x72CE,0x72D9,0x72D7,/* 0xA8-0xAF */ + 0x72D0,0x73A9,0x73A8,0x739F,0x73AB,0x73A5,0x753D,0x759D,/* 0xB0-0xB7 */ + 0x7599,0x759A,0x7684,0x76C2,0x76F2,0x76F4,0x77E5,0x77FD,/* 0xB8-0xBF */ + 0x793E,0x7940,0x7941,0x79C9,0x79C8,0x7A7A,0x7A79,0x7AFA,/* 0xC0-0xC7 */ + 0x7CFE,0x7F54,0x7F8C,0x7F8B,0x8005,0x80BA,0x80A5,0x80A2,/* 0xC8-0xCF */ + 0x80B1,0x80A1,0x80AB,0x80A9,0x80B4,0x80AA,0x80AF,0x81E5,/* 0xD0-0xD7 */ + 
0x81FE,0x820D,0x82B3,0x829D,0x8299,0x82AD,0x82BD,0x829F,/* 0xD8-0xDF */ + 0x82B9,0x82B1,0x82AC,0x82A5,0x82AF,0x82B8,0x82A3,0x82B0,/* 0xE0-0xE7 */ + 0x82BE,0x82B7,0x864E,0x8671,0x521D,0x8868,0x8ECB,0x8FCE,/* 0xE8-0xEF */ + 0x8FD4,0x8FD1,0x90B5,0x90B8,0x90B1,0x90B6,0x91C7,0x91D1,/* 0xF0-0xF7 */ + 0x9577,0x9580,0x961C,0x9640,0x963F,0x963B,0x9644,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9642,0x96B9,0x96E8,0x9752,0x975E,0x4E9F,0x4EAD,0x4EAE,/* 0x40-0x47 */ + 0x4FE1,0x4FB5,0x4FAF,0x4FBF,0x4FE0,0x4FD1,0x4FCF,0x4FDD,/* 0x48-0x4F */ + 0x4FC3,0x4FB6,0x4FD8,0x4FDF,0x4FCA,0x4FD7,0x4FAE,0x4FD0,/* 0x50-0x57 */ + 0x4FC4,0x4FC2,0x4FDA,0x4FCE,0x4FDE,0x4FB7,0x5157,0x5192,/* 0x58-0x5F */ + 0x5191,0x51A0,0x524E,0x5243,0x524A,0x524D,0x524C,0x524B,/* 0x60-0x67 */ + 0x5247,0x52C7,0x52C9,0x52C3,0x52C1,0x530D,0x5357,0x537B,/* 0x68-0x6F */ + 0x539A,0x53DB,0x54AC,0x54C0,0x54A8,0x54CE,0x54C9,0x54B8,/* 0x70-0x77 */ + 0x54A6,0x54B3,0x54C7,0x54C2,0x54BD,0x54AA,0x54C1,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54C4,0x54C8,0x54AF,0x54AB,0x54B1,0x54BB,0x54A9,/* 0xA0-0xA7 */ + 0x54A7,0x54BF,0x56FF,0x5782,0x578B,0x57A0,0x57A3,0x57A2,/* 0xA8-0xAF */ + 0x57CE,0x57AE,0x5793,0x5955,0x5951,0x594F,0x594E,0x5950,/* 0xB0-0xB7 */ + 0x59DC,0x59D8,0x59FF,0x59E3,0x59E8,0x5A03,0x59E5,0x59EA,/* 0xB8-0xBF */ + 0x59DA,0x59E6,0x5A01,0x59FB,0x5B69,0x5BA3,0x5BA6,0x5BA4,/* 0xC0-0xC7 */ + 0x5BA2,0x5BA5,0x5C01,0x5C4E,0x5C4F,0x5C4D,0x5C4B,0x5CD9,/* 0xC8-0xCF */ + 0x5CD2,0x5DF7,0x5E1D,0x5E25,0x5E1F,0x5E7D,0x5EA0,0x5EA6,/* 0xD0-0xD7 */ + 0x5EFA,0x5F08,0x5F2D,0x5F65,0x5F88,0x5F85,0x5F8A,0x5F8B,/* 0xD8-0xDF */ + 0x5F87,0x5F8C,0x5F89,0x6012,0x601D,0x6020,0x6025,0x600E,/* 0xE0-0xE7 */ + 0x6028,0x604D,0x6070,0x6068,0x6062,0x6046,0x6043,0x606C,/* 0xE8-0xEF */ + 0x606B,0x606A,0x6064,0x6241,0x62DC,0x6316,0x6309,0x62FC,/* 0xF0-0xF7 */ + 0x62ED,0x6301,0x62EE,0x62FD,0x6307,0x62F1,0x62F7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x62EF,0x62EC,0x62FE,0x62F4,0x6311,0x6302,0x653F,0x6545,/* 0x40-0x47 */ + 0x65AB,0x65BD,0x65E2,0x6625,0x662D,0x6620,0x6627,0x662F,/* 
0x48-0x4F */ + 0x661F,0x6628,0x6631,0x6624,0x66F7,0x67FF,0x67D3,0x67F1,/* 0x50-0x57 */ + 0x67D4,0x67D0,0x67EC,0x67B6,0x67AF,0x67F5,0x67E9,0x67EF,/* 0x58-0x5F */ + 0x67C4,0x67D1,0x67B4,0x67DA,0x67E5,0x67B8,0x67CF,0x67DE,/* 0x60-0x67 */ + 0x67F3,0x67B0,0x67D9,0x67E2,0x67DD,0x67D2,0x6B6A,0x6B83,/* 0x68-0x6F */ + 0x6B86,0x6BB5,0x6BD2,0x6BD7,0x6C1F,0x6CC9,0x6D0B,0x6D32,/* 0x70-0x77 */ + 0x6D2A,0x6D41,0x6D25,0x6D0C,0x6D31,0x6D1E,0x6D17,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6D3B,0x6D3D,0x6D3E,0x6D36,0x6D1B,0x6CF5,0x6D39,/* 0xA0-0xA7 */ + 0x6D27,0x6D38,0x6D29,0x6D2E,0x6D35,0x6D0E,0x6D2B,0x70AB,/* 0xA8-0xAF */ + 0x70BA,0x70B3,0x70AC,0x70AF,0x70AD,0x70B8,0x70AE,0x70A4,/* 0xB0-0xB7 */ + 0x7230,0x7272,0x726F,0x7274,0x72E9,0x72E0,0x72E1,0x73B7,/* 0xB8-0xBF */ + 0x73CA,0x73BB,0x73B2,0x73CD,0x73C0,0x73B3,0x751A,0x752D,/* 0xC0-0xC7 */ + 0x754F,0x754C,0x754E,0x754B,0x75AB,0x75A4,0x75A5,0x75A2,/* 0xC8-0xCF */ + 0x75A3,0x7678,0x7686,0x7687,0x7688,0x76C8,0x76C6,0x76C3,/* 0xD0-0xD7 */ + 0x76C5,0x7701,0x76F9,0x76F8,0x7709,0x770B,0x76FE,0x76FC,/* 0xD8-0xDF */ + 0x7707,0x77DC,0x7802,0x7814,0x780C,0x780D,0x7946,0x7949,/* 0xE0-0xE7 */ + 0x7948,0x7947,0x79B9,0x79BA,0x79D1,0x79D2,0x79CB,0x7A7F,/* 0xE8-0xEF */ + 0x7A81,0x7AFF,0x7AFD,0x7C7D,0x7D02,0x7D05,0x7D00,0x7D09,/* 0xF0-0xF7 */ + 0x7D07,0x7D04,0x7D06,0x7F38,0x7F8E,0x7FBF,0x8004,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8010,0x800D,0x8011,0x8036,0x80D6,0x80E5,0x80DA,0x80C3,/* 0x40-0x47 */ + 0x80C4,0x80CC,0x80E1,0x80DB,0x80CE,0x80DE,0x80E4,0x80DD,/* 0x48-0x4F */ + 0x81F4,0x8222,0x82E7,0x8303,0x8305,0x82E3,0x82DB,0x82E6,/* 0x50-0x57 */ + 0x8304,0x82E5,0x8302,0x8309,0x82D2,0x82D7,0x82F1,0x8301,/* 0x58-0x5F */ + 0x82DC,0x82D4,0x82D1,0x82DE,0x82D3,0x82DF,0x82EF,0x8306,/* 0x60-0x67 */ + 0x8650,0x8679,0x867B,0x867A,0x884D,0x886B,0x8981,0x89D4,/* 0x68-0x6F */ + 0x8A08,0x8A02,0x8A03,0x8C9E,0x8CA0,0x8D74,0x8D73,0x8DB4,/* 0x70-0x77 */ + 0x8ECD,0x8ECC,0x8FF0,0x8FE6,0x8FE2,0x8FEA,0x8FE5,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8FED,0x8FEB,0x8FE4,0x8FE8,0x90CA,0x90CE,0x90C1,/* 0xA0-0xA7 */ + 0x90C3,0x914B,0x914A,0x91CD,0x9582,0x9650,0x964B,0x964C,/* 0xA8-0xAF */ + 0x964D,0x9762,0x9769,0x97CB,0x97ED,0x97F3,0x9801,0x98A8,/* 0xB0-0xB7 */ + 0x98DB,0x98DF,0x9996,0x9999,0x4E58,0x4EB3,0x500C,0x500D,/* 0xB8-0xBF */ + 0x5023,0x4FEF,0x5026,0x5025,0x4FF8,0x5029,0x5016,0x5006,/* 0xC0-0xC7 */ + 
0x503C,0x501F,0x501A,0x5012,0x5011,0x4FFA,0x5000,0x5014,/* 0xC8-0xCF */ + 0x5028,0x4FF1,0x5021,0x500B,0x5019,0x5018,0x4FF3,0x4FEE,/* 0xD0-0xD7 */ + 0x502D,0x502A,0x4FFE,0x502B,0x5009,0x517C,0x51A4,0x51A5,/* 0xD8-0xDF */ + 0x51A2,0x51CD,0x51CC,0x51C6,0x51CB,0x5256,0x525C,0x5254,/* 0xE0-0xE7 */ + 0x525B,0x525D,0x532A,0x537F,0x539F,0x539D,0x53DF,0x54E8,/* 0xE8-0xEF */ + 0x5510,0x5501,0x5537,0x54FC,0x54E5,0x54F2,0x5506,0x54FA,/* 0xF0-0xF7 */ + 0x5514,0x54E9,0x54ED,0x54E1,0x5509,0x54EE,0x54EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54E6,0x5527,0x5507,0x54FD,0x550F,0x5703,0x5704,0x57C2,/* 0x40-0x47 */ + 0x57D4,0x57CB,0x57C3,0x5809,0x590F,0x5957,0x5958,0x595A,/* 0x48-0x4F */ + 0x5A11,0x5A18,0x5A1C,0x5A1F,0x5A1B,0x5A13,0x59EC,0x5A20,/* 0x50-0x57 */ + 0x5A23,0x5A29,0x5A25,0x5A0C,0x5A09,0x5B6B,0x5C58,0x5BB0,/* 0x58-0x5F */ + 0x5BB3,0x5BB6,0x5BB4,0x5BAE,0x5BB5,0x5BB9,0x5BB8,0x5C04,/* 0x60-0x67 */ + 0x5C51,0x5C55,0x5C50,0x5CED,0x5CFD,0x5CFB,0x5CEA,0x5CE8,/* 0x68-0x6F */ + 0x5CF0,0x5CF6,0x5D01,0x5CF4,0x5DEE,0x5E2D,0x5E2B,0x5EAB,/* 0x70-0x77 */ + 0x5EAD,0x5EA7,0x5F31,0x5F92,0x5F91,0x5F90,0x6059,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6063,0x6065,0x6050,0x6055,0x606D,0x6069,0x606F,/* 0xA0-0xA7 */ + 0x6084,0x609F,0x609A,0x608D,0x6094,0x608C,0x6085,0x6096,/* 0xA8-0xAF */ + 0x6247,0x62F3,0x6308,0x62FF,0x634E,0x633E,0x632F,0x6355,/* 0xB0-0xB7 */ + 0x6342,0x6346,0x634F,0x6349,0x633A,0x6350,0x633D,0x632A,/* 0xB8-0xBF */ + 0x632B,0x6328,0x634D,0x634C,0x6548,0x6549,0x6599,0x65C1,/* 0xC0-0xC7 */ + 0x65C5,0x6642,0x6649,0x664F,0x6643,0x6652,0x664C,0x6645,/* 0xC8-0xCF */ + 0x6641,0x66F8,0x6714,0x6715,0x6717,0x6821,0x6838,0x6848,/* 0xD0-0xD7 */ + 0x6846,0x6853,0x6839,0x6842,0x6854,0x6829,0x68B3,0x6817,/* 0xD8-0xDF */ + 0x684C,0x6851,0x683D,0x67F4,0x6850,0x6840,0x683C,0x6843,/* 0xE0-0xE7 */ + 0x682A,0x6845,0x6813,0x6818,0x6841,0x6B8A,0x6B89,0x6BB7,/* 0xE8-0xEF */ + 0x6C23,0x6C27,0x6C28,0x6C26,0x6C24,0x6CF0,0x6D6A,0x6D95,/* 0xF0-0xF7 */ + 0x6D88,0x6D87,0x6D66,0x6D78,0x6D77,0x6D59,0x6D93,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x6D6C,0x6D89,0x6D6E,0x6D5A,0x6D74,0x6D69,0x6D8C,0x6D8A,/* 0x40-0x47 */ + 0x6D79,0x6D85,0x6D65,0x6D94,0x70CA,0x70D8,0x70E4,0x70D9,/* 0x48-0x4F */ + 0x70C8,0x70CF,0x7239,0x7279,0x72FC,0x72F9,0x72FD,0x72F8,/* 0x50-0x57 */ + 0x72F7,0x7386,0x73ED,0x7409,0x73EE,0x73E0,0x73EA,0x73DE,/* 0x58-0x5F */ + 0x7554,0x755D,0x755C,0x755A,0x7559,0x75BE,0x75C5,0x75C7,/* 0x60-0x67 */ + 0x75B2,0x75B3,0x75BD,0x75BC,0x75B9,0x75C2,0x75B8,0x768B,/* 0x68-0x6F */ + 0x76B0,0x76CA,0x76CD,0x76CE,0x7729,0x771F,0x7720,0x7728,/* 0x70-0x77 */ + 0x77E9,0x7830,0x7827,0x7838,0x781D,0x7834,0x7837,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7825,0x782D,0x7820,0x781F,0x7832,0x7955,0x7950,/* 0xA0-0xA7 */ + 0x7960,0x795F,0x7956,0x795E,0x795D,0x7957,0x795A,0x79E4,/* 0xA8-0xAF */ + 0x79E3,0x79E7,0x79DF,0x79E6,0x79E9,0x79D8,0x7A84,0x7A88,/* 0xB0-0xB7 */ + 0x7AD9,0x7B06,0x7B11,0x7C89,0x7D21,0x7D17,0x7D0B,0x7D0A,/* 0xB8-0xBF */ + 0x7D20,0x7D22,0x7D14,0x7D10,0x7D15,0x7D1A,0x7D1C,0x7D0D,/* 0xC0-0xC7 */ + 0x7D19,0x7D1B,0x7F3A,0x7F5F,0x7F94,0x7FC5,0x7FC1,0x8006,/* 0xC8-0xCF */ + 0x8018,0x8015,0x8019,0x8017,0x803D,0x803F,0x80F1,0x8102,/* 0xD0-0xD7 */ + 0x80F0,0x8105,0x80ED,0x80F4,0x8106,0x80F8,0x80F3,0x8108,/* 0xD8-0xDF */ + 0x80FD,0x810A,0x80FC,0x80EF,0x81ED,0x81EC,0x8200,0x8210,/* 0xE0-0xE7 */ + 0x822A,0x822B,0x8228,0x822C,0x82BB,0x832B,0x8352,0x8354,/* 0xE8-0xEF */ + 0x834A,0x8338,0x8350,0x8349,0x8335,0x8334,0x834F,0x8332,/* 0xF0-0xF7 */ + 0x8339,0x8336,0x8317,0x8340,0x8331,0x8328,0x8343,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8654,0x868A,0x86AA,0x8693,0x86A4,0x86A9,0x868C,0x86A3,/* 0x40-0x47 */ + 0x869C,0x8870,0x8877,0x8881,0x8882,0x887D,0x8879,0x8A18,/* 0x48-0x4F */ + 0x8A10,0x8A0E,0x8A0C,0x8A15,0x8A0A,0x8A17,0x8A13,0x8A16,/* 0x50-0x57 */ + 0x8A0F,0x8A11,0x8C48,0x8C7A,0x8C79,0x8CA1,0x8CA2,0x8D77,/* 0x58-0x5F */ + 0x8EAC,0x8ED2,0x8ED4,0x8ECF,0x8FB1,0x9001,0x9006,0x8FF7,/* 0x60-0x67 */ + 0x9000,0x8FFA,0x8FF4,0x9003,0x8FFD,0x9005,0x8FF8,0x9095,/* 0x68-0x6F */ + 0x90E1,0x90DD,0x90E2,0x9152,0x914D,0x914C,0x91D8,0x91DD,/* 0x70-0x77 */ + 0x91D7,0x91DC,0x91D9,0x9583,0x9662,0x9663,0x9661,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x965B,0x965D,0x9664,0x9658,0x965E,0x96BB,0x98E2,/* 0xA0-0xA7 */ + 0x99AC,0x9AA8,0x9AD8,0x9B25,0x9B32,0x9B3C,0x4E7E,0x507A,/* 0xA8-0xAF */ + 0x507D,0x505C,0x5047,0x5043,0x504C,0x505A,0x5049,0x5065,/* 0xB0-0xB7 */ + 
0x5076,0x504E,0x5055,0x5075,0x5074,0x5077,0x504F,0x500F,/* 0xB8-0xBF */ + 0x506F,0x506D,0x515C,0x5195,0x51F0,0x526A,0x526F,0x52D2,/* 0xC0-0xC7 */ + 0x52D9,0x52D8,0x52D5,0x5310,0x530F,0x5319,0x533F,0x5340,/* 0xC8-0xCF */ + 0x533E,0x53C3,0x66FC,0x5546,0x556A,0x5566,0x5544,0x555E,/* 0xD0-0xD7 */ + 0x5561,0x5543,0x554A,0x5531,0x5556,0x554F,0x5555,0x552F,/* 0xD8-0xDF */ + 0x5564,0x5538,0x552E,0x555C,0x552C,0x5563,0x5533,0x5541,/* 0xE0-0xE7 */ + 0x5557,0x5708,0x570B,0x5709,0x57DF,0x5805,0x580A,0x5806,/* 0xE8-0xEF */ + 0x57E0,0x57E4,0x57FA,0x5802,0x5835,0x57F7,0x57F9,0x5920,/* 0xF0-0xF7 */ + 0x5962,0x5A36,0x5A41,0x5A49,0x5A66,0x5A6A,0x5A40,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A3C,0x5A62,0x5A5A,0x5A46,0x5A4A,0x5B70,0x5BC7,0x5BC5,/* 0x40-0x47 */ + 0x5BC4,0x5BC2,0x5BBF,0x5BC6,0x5C09,0x5C08,0x5C07,0x5C60,/* 0x48-0x4F */ + 0x5C5C,0x5C5D,0x5D07,0x5D06,0x5D0E,0x5D1B,0x5D16,0x5D22,/* 0x50-0x57 */ + 0x5D11,0x5D29,0x5D14,0x5D19,0x5D24,0x5D27,0x5D17,0x5DE2,/* 0x58-0x5F */ + 0x5E38,0x5E36,0x5E33,0x5E37,0x5EB7,0x5EB8,0x5EB6,0x5EB5,/* 0x60-0x67 */ + 0x5EBE,0x5F35,0x5F37,0x5F57,0x5F6C,0x5F69,0x5F6B,0x5F97,/* 0x68-0x6F */ + 0x5F99,0x5F9E,0x5F98,0x5FA1,0x5FA0,0x5F9C,0x607F,0x60A3,/* 0x70-0x77 */ + 0x6089,0x60A0,0x60A8,0x60CB,0x60B4,0x60E6,0x60BD,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x60C5,0x60BB,0x60B5,0x60DC,0x60BC,0x60D8,0x60D5,/* 0xA0-0xA7 */ + 0x60C6,0x60DF,0x60B8,0x60DA,0x60C7,0x621A,0x621B,0x6248,/* 0xA8-0xAF */ + 0x63A0,0x63A7,0x6372,0x6396,0x63A2,0x63A5,0x6377,0x6367,/* 0xB0-0xB7 */ + 0x6398,0x63AA,0x6371,0x63A9,0x6389,0x6383,0x639B,0x636B,/* 0xB8-0xBF */ + 0x63A8,0x6384,0x6388,0x6399,0x63A1,0x63AC,0x6392,0x638F,/* 0xC0-0xC7 */ + 0x6380,0x637B,0x6369,0x6368,0x637A,0x655D,0x6556,0x6551,/* 0xC8-0xCF */ + 0x6559,0x6557,0x555F,0x654F,0x6558,0x6555,0x6554,0x659C,/* 0xD0-0xD7 */ + 0x659B,0x65AC,0x65CF,0x65CB,0x65CC,0x65CE,0x665D,0x665A,/* 0xD8-0xDF */ + 0x6664,0x6668,0x6666,0x665E,0x66F9,0x52D7,0x671B,0x6881,/* 0xE0-0xE7 */ + 0x68AF,0x68A2,0x6893,0x68B5,0x687F,0x6876,0x68B1,0x68A7,/* 0xE8-0xEF */ + 0x6897,0x68B0,0x6883,0x68C4,0x68AD,0x6886,0x6885,0x6894,/* 0xF0-0xF7 */ + 0x689D,0x68A8,0x689F,0x68A1,0x6882,0x6B32,0x6BBA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BEB,0x6BEC,0x6C2B,0x6D8E,0x6DBC,0x6DF3,0x6DD9,0x6DB2,/* 0x40-0x47 */ + 0x6DE1,0x6DCC,0x6DE4,0x6DFB,0x6DFA,0x6E05,0x6DC7,0x6DCB,/* 0x48-0x4F */ + 0x6DAF,0x6DD1,0x6DAE,0x6DDE,0x6DF9,0x6DB8,0x6DF7,0x6DF5,/* 0x50-0x57 */ + 0x6DC5,0x6DD2,0x6E1A,0x6DB5,0x6DDA,0x6DEB,0x6DD8,0x6DEA,/* 0x58-0x5F */ + 0x6DF1,0x6DEE,0x6DE8,0x6DC6,0x6DC4,0x6DAA,0x6DEC,0x6DBF,/* 0x60-0x67 */ + 0x6DE6,0x70F9,0x7109,0x710A,0x70FD,0x70EF,0x723D,0x727D,/* 0x68-0x6F */ + 0x7281,0x731C,0x731B,0x7316,0x7313,0x7319,0x7387,0x7405,/* 0x70-0x77 */ + 0x740A,0x7403,0x7406,0x73FE,0x740D,0x74E0,0x74F6,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x74F7,0x751C,0x7522,0x7565,0x7566,0x7562,0x7570,/* 0xA0-0xA7 */ + 0x758F,0x75D4,0x75D5,0x75B5,0x75CA,0x75CD,0x768E,0x76D4,/* 0xA8-0xAF */ + 0x76D2,0x76DB,0x7737,0x773E,0x773C,0x7736,0x7738,0x773A,/* 0xB0-0xB7 */ + 0x786B,0x7843,0x784E,0x7965,0x7968,0x796D,0x79FB,0x7A92,/* 0xB8-0xBF */ + 0x7A95,0x7B20,0x7B28,0x7B1B,0x7B2C,0x7B26,0x7B19,0x7B1E,/* 0xC0-0xC7 */ + 0x7B2E,0x7C92,0x7C97,0x7C95,0x7D46,0x7D43,0x7D71,0x7D2E,/* 0xC8-0xCF */ + 0x7D39,0x7D3C,0x7D40,0x7D30,0x7D33,0x7D44,0x7D2F,0x7D42,/* 0xD0-0xD7 */ + 0x7D32,0x7D31,0x7F3D,0x7F9E,0x7F9A,0x7FCC,0x7FCE,0x7FD2,/* 0xD8-0xDF */ + 0x801C,0x804A,0x8046,0x812F,0x8116,0x8123,0x812B,0x8129,/* 0xE0-0xE7 */ + 0x8130,0x8124,0x8202,0x8235,0x8237,0x8236,0x8239,0x838E,/* 0xE8-0xEF */ + 0x839E,0x8398,0x8378,0x83A2,0x8396,0x83BD,0x83AB,0x8392,/* 0xF0-0xF7 */ + 0x838A,0x8393,0x8389,0x83A0,0x8377,0x837B,0x837C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8386,0x83A7,0x8655,0x5F6A,0x86C7,0x86C0,0x86B6,0x86C4,/* 0x40-0x47 */ + 0x86B5,0x86C6,0x86CB,0x86B1,0x86AF,0x86C9,0x8853,0x889E,/* 0x48-0x4F */ + 0x8888,0x88AB,0x8892,0x8896,0x888D,0x888B,0x8993,0x898F,/* 0x50-0x57 */ + 0x8A2A,0x8A1D,0x8A23,0x8A25,0x8A31,0x8A2D,0x8A1F,0x8A1B,/* 0x58-0x5F */ + 0x8A22,0x8C49,0x8C5A,0x8CA9,0x8CAC,0x8CAB,0x8CA8,0x8CAA,/* 0x60-0x67 */ + 0x8CA7,0x8D67,0x8D66,0x8DBE,0x8DBA,0x8EDB,0x8EDF,0x9019,/* 0x68-0x6F */ + 0x900D,0x901A,0x9017,0x9023,0x901F,0x901D,0x9010,0x9015,/* 0x70-0x77 */ + 0x901E,0x9020,0x900F,0x9022,0x9016,0x901B,0x9014,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x90E8,0x90ED,0x90FD,0x9157,0x91CE,0x91F5,0x91E6,/* 0xA0-0xA7 */ + 
0x91E3,0x91E7,0x91ED,0x91E9,0x9589,0x966A,0x9675,0x9673,/* 0xA8-0xAF */ + 0x9678,0x9670,0x9674,0x9676,0x9677,0x966C,0x96C0,0x96EA,/* 0xB0-0xB7 */ + 0x96E9,0x7AE0,0x7ADF,0x9802,0x9803,0x9B5A,0x9CE5,0x9E75,/* 0xB8-0xBF */ + 0x9E7F,0x9EA5,0x9EBB,0x50A2,0x508D,0x5085,0x5099,0x5091,/* 0xC0-0xC7 */ + 0x5080,0x5096,0x5098,0x509A,0x6700,0x51F1,0x5272,0x5274,/* 0xC8-0xCF */ + 0x5275,0x5269,0x52DE,0x52DD,0x52DB,0x535A,0x53A5,0x557B,/* 0xD0-0xD7 */ + 0x5580,0x55A7,0x557C,0x558A,0x559D,0x5598,0x5582,0x559C,/* 0xD8-0xDF */ + 0x55AA,0x5594,0x5587,0x558B,0x5583,0x55B3,0x55AE,0x559F,/* 0xE0-0xE7 */ + 0x553E,0x55B2,0x559A,0x55BB,0x55AC,0x55B1,0x557E,0x5589,/* 0xE8-0xEF */ + 0x55AB,0x5599,0x570D,0x582F,0x582A,0x5834,0x5824,0x5830,/* 0xF0-0xF7 */ + 0x5831,0x5821,0x581D,0x5820,0x58F9,0x58FA,0x5960,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A77,0x5A9A,0x5A7F,0x5A92,0x5A9B,0x5AA7,0x5B73,0x5B71,/* 0x40-0x47 */ + 0x5BD2,0x5BCC,0x5BD3,0x5BD0,0x5C0A,0x5C0B,0x5C31,0x5D4C,/* 0x48-0x4F */ + 0x5D50,0x5D34,0x5D47,0x5DFD,0x5E45,0x5E3D,0x5E40,0x5E43,/* 0x50-0x57 */ + 0x5E7E,0x5ECA,0x5EC1,0x5EC2,0x5EC4,0x5F3C,0x5F6D,0x5FA9,/* 0x58-0x5F */ + 0x5FAA,0x5FA8,0x60D1,0x60E1,0x60B2,0x60B6,0x60E0,0x611C,/* 0x60-0x67 */ + 0x6123,0x60FA,0x6115,0x60F0,0x60FB,0x60F4,0x6168,0x60F1,/* 0x68-0x6F */ + 0x610E,0x60F6,0x6109,0x6100,0x6112,0x621F,0x6249,0x63A3,/* 0x70-0x77 */ + 0x638C,0x63CF,0x63C0,0x63E9,0x63C9,0x63C6,0x63CD,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x63D2,0x63E3,0x63D0,0x63E1,0x63D6,0x63ED,0x63EE,/* 0xA0-0xA7 */ + 0x6376,0x63F4,0x63EA,0x63DB,0x6452,0x63DA,0x63F9,0x655E,/* 0xA8-0xAF */ + 0x6566,0x6562,0x6563,0x6591,0x6590,0x65AF,0x666E,0x6670,/* 0xB0-0xB7 */ + 0x6674,0x6676,0x666F,0x6691,0x667A,0x667E,0x6677,0x66FE,/* 0xB8-0xBF */ + 0x66FF,0x671F,0x671D,0x68FA,0x68D5,0x68E0,0x68D8,0x68D7,/* 0xC0-0xC7 */ + 0x6905,0x68DF,0x68F5,0x68EE,0x68E7,0x68F9,0x68D2,0x68F2,/* 0xC8-0xCF */ + 0x68E3,0x68CB,0x68CD,0x690D,0x6912,0x690E,0x68C9,0x68DA,/* 0xD0-0xD7 */ + 0x696E,0x68FB,0x6B3E,0x6B3A,0x6B3D,0x6B98,0x6B96,0x6BBC,/* 0xD8-0xDF */ + 0x6BEF,0x6C2E,0x6C2F,0x6C2C,0x6E2F,0x6E38,0x6E54,0x6E21,/* 0xE0-0xE7 */ + 0x6E32,0x6E67,0x6E4A,0x6E20,0x6E25,0x6E23,0x6E1B,0x6E5B,/* 0xE8-0xEF */ + 0x6E58,0x6E24,0x6E56,0x6E6E,0x6E2D,0x6E26,0x6E6F,0x6E34,/* 0xF0-0xF7 */ + 0x6E4D,0x6E3A,0x6E2C,0x6E43,0x6E1D,0x6E3E,0x6ECB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6E89,0x6E19,0x6E4E,0x6E63,0x6E44,0x6E72,0x6E69,0x6E5F,/* 0x40-0x47 */ + 0x7119,0x711A,0x7126,0x7130,0x7121,0x7136,0x716E,0x711C,/* 0x48-0x4F */ + 0x724C,0x7284,0x7280,0x7336,0x7325,0x7334,0x7329,0x743A,/* 0x50-0x57 */ + 0x742A,0x7433,0x7422,0x7425,0x7435,0x7436,0x7434,0x742F,/* 0x58-0x5F */ + 0x741B,0x7426,0x7428,0x7525,0x7526,0x756B,0x756A,0x75E2,/* 0x60-0x67 */ + 0x75DB,0x75E3,0x75D9,0x75D8,0x75DE,0x75E0,0x767B,0x767C,/* 0x68-0x6F */ + 0x7696,0x7693,0x76B4,0x76DC,0x774F,0x77ED,0x785D,0x786C,/* 0x70-0x77 */ + 0x786F,0x7A0D,0x7A08,0x7A0B,0x7A05,0x7A00,0x7A98,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7A97,0x7A96,0x7AE5,0x7AE3,0x7B49,0x7B56,0x7B46,/* 0xA0-0xA7 */ + 0x7B50,0x7B52,0x7B54,0x7B4D,0x7B4B,0x7B4F,0x7B51,0x7C9F,/* 0xA8-0xAF */ + 0x7CA5,0x7D5E,0x7D50,0x7D68,0x7D55,0x7D2B,0x7D6E,0x7D72,/* 0xB0-0xB7 */ + 0x7D61,0x7D66,0x7D62,0x7D70,0x7D73,0x5584,0x7FD4,0x7FD5,/* 0xB8-0xBF */ + 0x800B,0x8052,0x8085,0x8155,0x8154,0x814B,0x8151,0x814E,/* 0xC0-0xC7 */ + 0x8139,0x8146,0x813E,0x814C,0x8153,0x8174,0x8212,0x821C,/* 0xC8-0xCF */ + 0x83E9,0x8403,0x83F8,0x840D,0x83E0,0x83C5,0x840B,0x83C1,/* 0xD0-0xD7 */ + 0x83EF,0x83F1,0x83F4,0x8457,0x840A,0x83F0,0x840C,0x83CC,/* 0xD8-0xDF */ + 0x83FD,0x83F2,0x83CA,0x8438,0x840E,0x8404,0x83DC,0x8407,/* 0xE0-0xE7 */ + 0x83D4,0x83DF,0x865B,0x86DF,0x86D9,0x86ED,0x86D4,0x86DB,/* 0xE8-0xEF */ + 0x86E4,0x86D0,0x86DE,0x8857,0x88C1,0x88C2,0x88B1,0x8983,/* 0xF0-0xF7 */ + 0x8996,0x8A3B,0x8A60,0x8A55,0x8A5E,0x8A3C,0x8A41,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A54,0x8A5B,0x8A50,0x8A46,0x8A34,0x8A3A,0x8A36,0x8A56,/* 0x40-0x47 */ + 0x8C61,0x8C82,0x8CAF,0x8CBC,0x8CB3,0x8CBD,0x8CC1,0x8CBB,/* 0x48-0x4F */ + 0x8CC0,0x8CB4,0x8CB7,0x8CB6,0x8CBF,0x8CB8,0x8D8A,0x8D85,/* 0x50-0x57 */ + 0x8D81,0x8DCE,0x8DDD,0x8DCB,0x8DDA,0x8DD1,0x8DCC,0x8DDB,/* 0x58-0x5F */ + 0x8DC6,0x8EFB,0x8EF8,0x8EFC,0x8F9C,0x902E,0x9035,0x9031,/* 0x60-0x67 */ + 0x9038,0x9032,0x9036,0x9102,0x90F5,0x9109,0x90FE,0x9163,/* 0x68-0x6F */ + 0x9165,0x91CF,0x9214,0x9215,0x9223,0x9209,0x921E,0x920D,/* 0x70-0x77 */ + 0x9210,0x9207,0x9211,0x9594,0x958F,0x958B,0x9591,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9593,0x9592,0x958E,0x968A,0x968E,0x968B,0x967D,/* 0xA0-0xA7 */ + 0x9685,0x9686,0x968D,0x9672,0x9684,0x96C1,0x96C5,0x96C4,/* 0xA8-0xAF */ + 0x96C6,0x96C7,0x96EF,0x96F2,0x97CC,0x9805,0x9806,0x9808,/* 0xB0-0xB7 */ + 0x98E7,0x98EA,0x98EF,0x98E9,0x98F2,0x98ED,0x99AE,0x99AD,/* 0xB8-0xBF */ + 0x9EC3,0x9ECD,0x9ED1,0x4E82,0x50AD,0x50B5,0x50B2,0x50B3,/* 0xC0-0xC7 */ + 0x50C5,0x50BE,0x50AC,0x50B7,0x50BB,0x50AF,0x50C7,0x527F,/* 0xC8-0xCF */ + 0x5277,0x527D,0x52DF,0x52E6,0x52E4,0x52E2,0x52E3,0x532F,/* 0xD0-0xD7 */ + 0x55DF,0x55E8,0x55D3,0x55E6,0x55CE,0x55DC,0x55C7,0x55D1,/* 0xD8-0xDF */ + 0x55E3,0x55E4,0x55EF,0x55DA,0x55E1,0x55C5,0x55C6,0x55E5,/* 0xE0-0xE7 */ + 0x55C9,0x5712,0x5713,0x585E,0x5851,0x5858,0x5857,0x585A,/* 0xE8-0xEF */ + 0x5854,0x586B,0x584C,0x586D,0x584A,0x5862,0x5852,0x584B,/* 0xF0-0xF7 */ + 0x5967,0x5AC1,0x5AC9,0x5ACC,0x5ABE,0x5ABD,0x5ABC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5AB3,0x5AC2,0x5AB2,0x5D69,0x5D6F,0x5E4C,0x5E79,0x5EC9,/* 0x40-0x47 */ + 0x5EC8,0x5F12,0x5F59,0x5FAC,0x5FAE,0x611A,0x610F,0x6148,/* 0x48-0x4F */ + 0x611F,0x60F3,0x611B,0x60F9,0x6101,0x6108,0x614E,0x614C,/* 0x50-0x57 */ + 0x6144,0x614D,0x613E,0x6134,0x6127,0x610D,0x6106,0x6137,/* 0x58-0x5F */ + 0x6221,0x6222,0x6413,0x643E,0x641E,0x642A,0x642D,0x643D,/* 0x60-0x67 */ + 0x642C,0x640F,0x641C,0x6414,0x640D,0x6436,0x6416,0x6417,/* 0x68-0x6F */ + 0x6406,0x656C,0x659F,0x65B0,0x6697,0x6689,0x6687,0x6688,/* 0x70-0x77 */ + 0x6696,0x6684,0x6698,0x668D,0x6703,0x6994,0x696D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x695A,0x6977,0x6960,0x6954,0x6975,0x6930,0x6982,/* 0xA0-0xA7 */ + 0x694A,0x6968,0x696B,0x695E,0x6953,0x6979,0x6986,0x695D,/* 0xA8-0xAF */ + 0x6963,0x695B,0x6B47,0x6B72,0x6BC0,0x6BBF,0x6BD3,0x6BFD,/* 0xB0-0xB7 */ + 0x6EA2,0x6EAF,0x6ED3,0x6EB6,0x6EC2,0x6E90,0x6E9D,0x6EC7,/* 0xB8-0xBF */ + 0x6EC5,0x6EA5,0x6E98,0x6EBC,0x6EBA,0x6EAB,0x6ED1,0x6E96,/* 0xC0-0xC7 */ + 0x6E9C,0x6EC4,0x6ED4,0x6EAA,0x6EA7,0x6EB4,0x714E,0x7159,/* 0xC8-0xCF */ + 0x7169,0x7164,0x7149,0x7167,0x715C,0x716C,0x7166,0x714C,/* 0xD0-0xD7 */ + 0x7165,0x715E,0x7146,0x7168,0x7156,0x723A,0x7252,0x7337,/* 0xD8-0xDF */ + 0x7345,0x733F,0x733E,0x746F,0x745A,0x7455,0x745F,0x745E,/* 0xE0-0xE7 */ + 0x7441,0x743F,0x7459,0x745B,0x745C,0x7576,0x7578,0x7600,/* 0xE8-0xEF */ + 0x75F0,0x7601,0x75F2,0x75F1,0x75FA,0x75FF,0x75F4,0x75F3,/* 0xF0-0xF7 */ + 0x76DE,0x76DF,0x775B,0x776B,0x7766,0x775E,0x7763,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7779,0x776A,0x776C,0x775C,0x7765,0x7768,0x7762,0x77EE,/* 0x40-0x47 */ + 0x788E,0x78B0,0x7897,0x7898,0x788C,0x7889,0x787C,0x7891,/* 0x48-0x4F */ + 0x7893,0x787F,0x797A,0x797F,0x7981,0x842C,0x79BD,0x7A1C,/* 0x50-0x57 */ + 0x7A1A,0x7A20,0x7A14,0x7A1F,0x7A1E,0x7A9F,0x7AA0,0x7B77,/* 0x58-0x5F */ + 0x7BC0,0x7B60,0x7B6E,0x7B67,0x7CB1,0x7CB3,0x7CB5,0x7D93,/* 0x60-0x67 */ + 0x7D79,0x7D91,0x7D81,0x7D8F,0x7D5B,0x7F6E,0x7F69,0x7F6A,/* 0x68-0x6F */ + 0x7F72,0x7FA9,0x7FA8,0x7FA4,0x8056,0x8058,0x8086,0x8084,/* 0x70-0x77 */ + 0x8171,0x8170,0x8178,0x8165,0x816E,0x8173,0x816B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8179,0x817A,0x8166,0x8205,0x8247,0x8482,0x8477,/* 0xA0-0xA7 */ + 0x843D,0x8431,0x8475,0x8466,0x846B,0x8449,0x846C,0x845B,/* 0xA8-0xAF */ + 0x843C,0x8435,0x8461,0x8463,0x8469,0x846D,0x8446,0x865E,/* 0xB0-0xB7 */ + 0x865C,0x865F,0x86F9,0x8713,0x8708,0x8707,0x8700,0x86FE,/* 0xB8-0xBF */ + 0x86FB,0x8702,0x8703,0x8706,0x870A,0x8859,0x88DF,0x88D4,/* 0xC0-0xC7 */ + 0x88D9,0x88DC,0x88D8,0x88DD,0x88E1,0x88CA,0x88D5,0x88D2,/* 0xC8-0xCF */ + 0x899C,0x89E3,0x8A6B,0x8A72,0x8A73,0x8A66,0x8A69,0x8A70,/* 0xD0-0xD7 */ + 0x8A87,0x8A7C,0x8A63,0x8AA0,0x8A71,0x8A85,0x8A6D,0x8A62,/* 0xD8-0xDF */ + 0x8A6E,0x8A6C,0x8A79,0x8A7B,0x8A3E,0x8A68,0x8C62,0x8C8A,/* 0xE0-0xE7 */ + 0x8C89,0x8CCA,0x8CC7,0x8CC8,0x8CC4,0x8CB2,0x8CC3,0x8CC2,/* 0xE8-0xEF */ + 0x8CC5,0x8DE1,0x8DDF,0x8DE8,0x8DEF,0x8DF3,0x8DFA,0x8DEA,/* 0xF0-0xF7 */ + 0x8DE4,0x8DE6,0x8EB2,0x8F03,0x8F09,0x8EFE,0x8F0A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F9F,0x8FB2,0x904B,0x904A,0x9053,0x9042,0x9054,0x903C,/* 0x40-0x47 */ + 0x9055,0x9050,0x9047,0x904F,0x904E,0x904D,0x9051,0x903E,/* 0x48-0x4F */ + 0x9041,0x9112,0x9117,0x916C,0x916A,0x9169,0x91C9,0x9237,/* 0x50-0x57 */ + 0x9257,0x9238,0x923D,0x9240,0x923E,0x925B,0x924B,0x9264,/* 0x58-0x5F */ + 0x9251,0x9234,0x9249,0x924D,0x9245,0x9239,0x923F,0x925A,/* 0x60-0x67 */ + 0x9598,0x9698,0x9694,0x9695,0x96CD,0x96CB,0x96C9,0x96CA,/* 0x68-0x6F */ + 0x96F7,0x96FB,0x96F9,0x96F6,0x9756,0x9774,0x9776,0x9810,/* 0x70-0x77 */ + 0x9811,0x9813,0x980A,0x9812,0x980C,0x98FC,0x98F4,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x98FD,0x98FE,0x99B3,0x99B1,0x99B4,0x9AE1,0x9CE9,/* 0xA0-0xA7 */ + 0x9E82,0x9F0E,0x9F13,0x9F20,0x50E7,0x50EE,0x50E5,0x50D6,/* 0xA8-0xAF */ + 0x50ED,0x50DA,0x50D5,0x50CF,0x50D1,0x50F1,0x50CE,0x50E9,/* 0xB0-0xB7 */ + 0x5162,0x51F3,0x5283,0x5282,0x5331,0x53AD,0x55FE,0x5600,/* 0xB8-0xBF */ + 0x561B,0x5617,0x55FD,0x5614,0x5606,0x5609,0x560D,0x560E,/* 0xC0-0xC7 */ + 0x55F7,0x5616,0x561F,0x5608,0x5610,0x55F6,0x5718,0x5716,/* 0xC8-0xCF */ + 0x5875,0x587E,0x5883,0x5893,0x588A,0x5879,0x5885,0x587D,/* 0xD0-0xD7 */ + 0x58FD,0x5925,0x5922,0x5924,0x596A,0x5969,0x5AE1,0x5AE6,/* 0xD8-0xDF */ + 0x5AE9,0x5AD7,0x5AD6,0x5AD8,0x5AE3,0x5B75,0x5BDE,0x5BE7,/* 0xE0-0xE7 */ + 0x5BE1,0x5BE5,0x5BE6,0x5BE8,0x5BE2,0x5BE4,0x5BDF,0x5C0D,/* 0xE8-0xEF */ + 0x5C62,0x5D84,0x5D87,0x5E5B,0x5E63,0x5E55,0x5E57,0x5E54,/* 0xF0-0xF7 */ + 0x5ED3,0x5ED6,0x5F0A,0x5F46,0x5F70,0x5FB9,0x6147,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x613F,0x614B,0x6177,0x6162,0x6163,0x615F,0x615A,0x6158,/* 0x40-0x47 */ + 0x6175,0x622A,0x6487,0x6458,0x6454,0x64A4,0x6478,0x645F,/* 0x48-0x4F */ + 0x647A,0x6451,0x6467,0x6434,0x646D,0x647B,0x6572,0x65A1,/* 0x50-0x57 */ + 0x65D7,0x65D6,0x66A2,0x66A8,0x669D,0x699C,0x69A8,0x6995,/* 0x58-0x5F */ + 0x69C1,0x69AE,0x69D3,0x69CB,0x699B,0x69B7,0x69BB,0x69AB,/* 0x60-0x67 */ + 0x69B4,0x69D0,0x69CD,0x69AD,0x69CC,0x69A6,0x69C3,0x69A3,/* 0x68-0x6F */ + 0x6B49,0x6B4C,0x6C33,0x6F33,0x6F14,0x6EFE,0x6F13,0x6EF4,/* 0x70-0x77 */ + 0x6F29,0x6F3E,0x6F20,0x6F2C,0x6F0F,0x6F02,0x6F22,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6EFF,0x6EEF,0x6F06,0x6F31,0x6F38,0x6F32,0x6F23,/* 0xA0-0xA7 */ + 0x6F15,0x6F2B,0x6F2F,0x6F88,0x6F2A,0x6EEC,0x6F01,0x6EF2,/* 0xA8-0xAF */ + 0x6ECC,0x6EF7,0x7194,0x7199,0x717D,0x718A,0x7184,0x7192,/* 0xB0-0xB7 */ + 0x723E,0x7292,0x7296,0x7344,0x7350,0x7464,0x7463,0x746A,/* 0xB8-0xBF */ + 0x7470,0x746D,0x7504,0x7591,0x7627,0x760D,0x760B,0x7609,/* 0xC0-0xC7 */ + 0x7613,0x76E1,0x76E3,0x7784,0x777D,0x777F,0x7761,0x78C1,/* 0xC8-0xCF */ + 0x789F,0x78A7,0x78B3,0x78A9,0x78A3,0x798E,0x798F,0x798D,/* 0xD0-0xD7 */ + 0x7A2E,0x7A31,0x7AAA,0x7AA9,0x7AED,0x7AEF,0x7BA1,0x7B95,/* 0xD8-0xDF */ + 0x7B8B,0x7B75,0x7B97,0x7B9D,0x7B94,0x7B8F,0x7BB8,0x7B87,/* 0xE0-0xE7 */ + 0x7B84,0x7CB9,0x7CBD,0x7CBE,0x7DBB,0x7DB0,0x7D9C,0x7DBD,/* 0xE8-0xEF */ + 0x7DBE,0x7DA0,0x7DCA,0x7DB4,0x7DB2,0x7DB1,0x7DBA,0x7DA2,/* 0xF0-0xF7 */ + 0x7DBF,0x7DB5,0x7DB8,0x7DAD,0x7DD2,0x7DC7,0x7DAC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F70,0x7FE0,0x7FE1,0x7FDF,0x805E,0x805A,0x8087,0x8150,/* 0x40-0x47 */ + 0x8180,0x818F,0x8188,0x818A,0x817F,0x8182,0x81E7,0x81FA,/* 0x48-0x4F */ + 0x8207,0x8214,0x821E,0x824B,0x84C9,0x84BF,0x84C6,0x84C4,/* 0x50-0x57 */ + 0x8499,0x849E,0x84B2,0x849C,0x84CB,0x84B8,0x84C0,0x84D3,/* 0x58-0x5F */ + 0x8490,0x84BC,0x84D1,0x84CA,0x873F,0x871C,0x873B,0x8722,/* 0x60-0x67 */ + 0x8725,0x8734,0x8718,0x8755,0x8737,0x8729,0x88F3,0x8902,/* 0x68-0x6F */ + 0x88F4,0x88F9,0x88F8,0x88FD,0x88E8,0x891A,0x88EF,0x8AA6,/* 0x70-0x77 */ + 0x8A8C,0x8A9E,0x8AA3,0x8A8D,0x8AA1,0x8A93,0x8AA4,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8AAA,0x8AA5,0x8AA8,0x8A98,0x8A91,0x8A9A,0x8AA7,/* 0xA0-0xA7 */ + 0x8C6A,0x8C8D,0x8C8C,0x8CD3,0x8CD1,0x8CD2,0x8D6B,0x8D99,/* 0xA8-0xAF */ + 0x8D95,0x8DFC,0x8F14,0x8F12,0x8F15,0x8F13,0x8FA3,0x9060,/* 0xB0-0xB7 */ + 0x9058,0x905C,0x9063,0x9059,0x905E,0x9062,0x905D,0x905B,/* 0xB8-0xBF */ + 0x9119,0x9118,0x911E,0x9175,0x9178,0x9177,0x9174,0x9278,/* 0xC0-0xC7 */ + 0x9280,0x9285,0x9298,0x9296,0x927B,0x9293,0x929C,0x92A8,/* 0xC8-0xCF */ + 0x927C,0x9291,0x95A1,0x95A8,0x95A9,0x95A3,0x95A5,0x95A4,/* 0xD0-0xD7 */ + 0x9699,0x969C,0x969B,0x96CC,0x96D2,0x9700,0x977C,0x9785,/* 0xD8-0xDF */ + 0x97F6,0x9817,0x9818,0x98AF,0x98B1,0x9903,0x9905,0x990C,/* 0xE0-0xE7 */ + 0x9909,0x99C1,0x9AAF,0x9AB0,0x9AE6,0x9B41,0x9B42,0x9CF4,/* 0xE8-0xEF */ + 0x9CF6,0x9CF3,0x9EBC,0x9F3B,0x9F4A,0x5104,0x5100,0x50FB,/* 0xF0-0xF7 */ + 0x50F5,0x50F9,0x5102,0x5108,0x5109,0x5105,0x51DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5287,0x5288,0x5289,0x528D,0x528A,0x52F0,0x53B2,0x562E,/* 0x40-0x47 */ + 0x563B,0x5639,0x5632,0x563F,0x5634,0x5629,0x5653,0x564E,/* 0x48-0x4F */ + 0x5657,0x5674,0x5636,0x562F,0x5630,0x5880,0x589F,0x589E,/* 0x50-0x57 */ + 0x58B3,0x589C,0x58AE,0x58A9,0x58A6,0x596D,0x5B09,0x5AFB,/* 0x58-0x5F */ + 0x5B0B,0x5AF5,0x5B0C,0x5B08,0x5BEE,0x5BEC,0x5BE9,0x5BEB,/* 0x60-0x67 */ + 0x5C64,0x5C65,0x5D9D,0x5D94,0x5E62,0x5E5F,0x5E61,0x5EE2,/* 0x68-0x6F */ + 0x5EDA,0x5EDF,0x5EDD,0x5EE3,0x5EE0,0x5F48,0x5F71,0x5FB7,/* 0x70-0x77 */ + 
0x5FB5,0x6176,0x6167,0x616E,0x615D,0x6155,0x6182,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x617C,0x6170,0x616B,0x617E,0x61A7,0x6190,0x61AB,/* 0xA0-0xA7 */ + 0x618E,0x61AC,0x619A,0x61A4,0x6194,0x61AE,0x622E,0x6469,/* 0xA8-0xAF */ + 0x646F,0x6479,0x649E,0x64B2,0x6488,0x6490,0x64B0,0x64A5,/* 0xB0-0xB7 */ + 0x6493,0x6495,0x64A9,0x6492,0x64AE,0x64AD,0x64AB,0x649A,/* 0xB8-0xBF */ + 0x64AC,0x6499,0x64A2,0x64B3,0x6575,0x6577,0x6578,0x66AE,/* 0xC0-0xC7 */ + 0x66AB,0x66B4,0x66B1,0x6A23,0x6A1F,0x69E8,0x6A01,0x6A1E,/* 0xC8-0xCF */ + 0x6A19,0x69FD,0x6A21,0x6A13,0x6A0A,0x69F3,0x6A02,0x6A05,/* 0xD0-0xD7 */ + 0x69ED,0x6A11,0x6B50,0x6B4E,0x6BA4,0x6BC5,0x6BC6,0x6F3F,/* 0xD8-0xDF */ + 0x6F7C,0x6F84,0x6F51,0x6F66,0x6F54,0x6F86,0x6F6D,0x6F5B,/* 0xE0-0xE7 */ + 0x6F78,0x6F6E,0x6F8E,0x6F7A,0x6F70,0x6F64,0x6F97,0x6F58,/* 0xE8-0xEF */ + 0x6ED5,0x6F6F,0x6F60,0x6F5F,0x719F,0x71AC,0x71B1,0x71A8,/* 0xF0-0xF7 */ + 0x7256,0x729B,0x734E,0x7357,0x7469,0x748B,0x7483,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x747E,0x7480,0x757F,0x7620,0x7629,0x761F,0x7624,0x7626,/* 0x40-0x47 */ + 0x7621,0x7622,0x769A,0x76BA,0x76E4,0x778E,0x7787,0x778C,/* 0x48-0x4F */ + 0x7791,0x778B,0x78CB,0x78C5,0x78BA,0x78CA,0x78BE,0x78D5,/* 0x50-0x57 */ + 0x78BC,0x78D0,0x7A3F,0x7A3C,0x7A40,0x7A3D,0x7A37,0x7A3B,/* 0x58-0x5F */ + 0x7AAF,0x7AAE,0x7BAD,0x7BB1,0x7BC4,0x7BB4,0x7BC6,0x7BC7,/* 0x60-0x67 */ + 0x7BC1,0x7BA0,0x7BCC,0x7CCA,0x7DE0,0x7DF4,0x7DEF,0x7DFB,/* 0x68-0x6F */ + 0x7DD8,0x7DEC,0x7DDD,0x7DE8,0x7DE3,0x7DDA,0x7DDE,0x7DE9,/* 0x70-0x77 */ + 0x7D9E,0x7DD9,0x7DF2,0x7DF9,0x7F75,0x7F77,0x7FAF,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7FE9,0x8026,0x819B,0x819C,0x819D,0x81A0,0x819A,/* 0xA0-0xA7 */ + 0x8198,0x8517,0x853D,0x851A,0x84EE,0x852C,0x852D,0x8513,/* 0xA8-0xAF */ + 0x8511,0x8523,0x8521,0x8514,0x84EC,0x8525,0x84FF,0x8506,/* 0xB0-0xB7 */ + 0x8782,0x8774,0x8776,0x8760,0x8766,0x8778,0x8768,0x8759,/* 0xB8-0xBF */ + 0x8757,0x874C,0x8753,0x885B,0x885D,0x8910,0x8907,0x8912,/* 0xC0-0xC7 */ + 0x8913,0x8915,0x890A,0x8ABC,0x8AD2,0x8AC7,0x8AC4,0x8A95,/* 0xC8-0xCF */ + 0x8ACB,0x8AF8,0x8AB2,0x8AC9,0x8AC2,0x8ABF,0x8AB0,0x8AD6,/* 0xD0-0xD7 */ + 0x8ACD,0x8AB6,0x8AB9,0x8ADB,0x8C4C,0x8C4E,0x8C6C,0x8CE0,/* 0xD8-0xDF */ + 0x8CDE,0x8CE6,0x8CE4,0x8CEC,0x8CED,0x8CE2,0x8CE3,0x8CDC,/* 0xE0-0xE7 */ + 0x8CEA,0x8CE1,0x8D6D,0x8D9F,0x8DA3,0x8E2B,0x8E10,0x8E1D,/* 0xE8-0xEF */ + 
0x8E22,0x8E0F,0x8E29,0x8E1F,0x8E21,0x8E1E,0x8EBA,0x8F1D,/* 0xF0-0xF7 */ + 0x8F1B,0x8F1F,0x8F29,0x8F26,0x8F2A,0x8F1C,0x8F1E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F25,0x9069,0x906E,0x9068,0x906D,0x9077,0x9130,0x912D,/* 0x40-0x47 */ + 0x9127,0x9131,0x9187,0x9189,0x918B,0x9183,0x92C5,0x92BB,/* 0x48-0x4F */ + 0x92B7,0x92EA,0x92AC,0x92E4,0x92C1,0x92B3,0x92BC,0x92D2,/* 0x50-0x57 */ + 0x92C7,0x92F0,0x92B2,0x95AD,0x95B1,0x9704,0x9706,0x9707,/* 0x58-0x5F */ + 0x9709,0x9760,0x978D,0x978B,0x978F,0x9821,0x982B,0x981C,/* 0x60-0x67 */ + 0x98B3,0x990A,0x9913,0x9912,0x9918,0x99DD,0x99D0,0x99DF,/* 0x68-0x6F */ + 0x99DB,0x99D1,0x99D5,0x99D2,0x99D9,0x9AB7,0x9AEE,0x9AEF,/* 0x70-0x77 */ + 0x9B27,0x9B45,0x9B44,0x9B77,0x9B6F,0x9D06,0x9D09,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9D03,0x9EA9,0x9EBE,0x9ECE,0x58A8,0x9F52,0x5112,/* 0xA0-0xA7 */ + 0x5118,0x5114,0x5110,0x5115,0x5180,0x51AA,0x51DD,0x5291,/* 0xA8-0xAF */ + 0x5293,0x52F3,0x5659,0x566B,0x5679,0x5669,0x5664,0x5678,/* 0xB0-0xB7 */ + 0x566A,0x5668,0x5665,0x5671,0x566F,0x566C,0x5662,0x5676,/* 0xB8-0xBF */ + 0x58C1,0x58BE,0x58C7,0x58C5,0x596E,0x5B1D,0x5B34,0x5B78,/* 0xC0-0xC7 */ + 0x5BF0,0x5C0E,0x5F4A,0x61B2,0x6191,0x61A9,0x618A,0x61CD,/* 0xC8-0xCF */ + 0x61B6,0x61BE,0x61CA,0x61C8,0x6230,0x64C5,0x64C1,0x64CB,/* 0xD0-0xD7 */ + 0x64BB,0x64BC,0x64DA,0x64C4,0x64C7,0x64C2,0x64CD,0x64BF,/* 0xD8-0xDF */ + 0x64D2,0x64D4,0x64BE,0x6574,0x66C6,0x66C9,0x66B9,0x66C4,/* 0xE0-0xE7 */ + 0x66C7,0x66B8,0x6A3D,0x6A38,0x6A3A,0x6A59,0x6A6B,0x6A58,/* 0xE8-0xEF */ + 0x6A39,0x6A44,0x6A62,0x6A61,0x6A4B,0x6A47,0x6A35,0x6A5F,/* 0xF0-0xF7 */ + 0x6A48,0x6B59,0x6B77,0x6C05,0x6FC2,0x6FB1,0x6FA1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FC3,0x6FA4,0x6FC1,0x6FA7,0x6FB3,0x6FC0,0x6FB9,0x6FB6,/* 0x40-0x47 */ + 0x6FA6,0x6FA0,0x6FB4,0x71BE,0x71C9,0x71D0,0x71D2,0x71C8,/* 0x48-0x4F */ + 0x71D5,0x71B9,0x71CE,0x71D9,0x71DC,0x71C3,0x71C4,0x7368,/* 0x50-0x57 */ + 0x749C,0x74A3,0x7498,0x749F,0x749E,0x74E2,0x750C,0x750D,/* 0x58-0x5F */ + 0x7634,0x7638,0x763A,0x76E7,0x76E5,0x77A0,0x779E,0x779F,/* 
0x60-0x67 */ + 0x77A5,0x78E8,0x78DA,0x78EC,0x78E7,0x79A6,0x7A4D,0x7A4E,/* 0x68-0x6F */ + 0x7A46,0x7A4C,0x7A4B,0x7ABA,0x7BD9,0x7C11,0x7BC9,0x7BE4,/* 0x70-0x77 */ + 0x7BDB,0x7BE1,0x7BE9,0x7BE6,0x7CD5,0x7CD6,0x7E0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7E11,0x7E08,0x7E1B,0x7E23,0x7E1E,0x7E1D,0x7E09,/* 0xA0-0xA7 */ + 0x7E10,0x7F79,0x7FB2,0x7FF0,0x7FF1,0x7FEE,0x8028,0x81B3,/* 0xA8-0xAF */ + 0x81A9,0x81A8,0x81FB,0x8208,0x8258,0x8259,0x854A,0x8559,/* 0xB0-0xB7 */ + 0x8548,0x8568,0x8569,0x8543,0x8549,0x856D,0x856A,0x855E,/* 0xB8-0xBF */ + 0x8783,0x879F,0x879E,0x87A2,0x878D,0x8861,0x892A,0x8932,/* 0xC0-0xC7 */ + 0x8925,0x892B,0x8921,0x89AA,0x89A6,0x8AE6,0x8AFA,0x8AEB,/* 0xC8-0xCF */ + 0x8AF1,0x8B00,0x8ADC,0x8AE7,0x8AEE,0x8AFE,0x8B01,0x8B02,/* 0xD0-0xD7 */ + 0x8AF7,0x8AED,0x8AF3,0x8AF6,0x8AFC,0x8C6B,0x8C6D,0x8C93,/* 0xD8-0xDF */ + 0x8CF4,0x8E44,0x8E31,0x8E34,0x8E42,0x8E39,0x8E35,0x8F3B,/* 0xE0-0xE7 */ + 0x8F2F,0x8F38,0x8F33,0x8FA8,0x8FA6,0x9075,0x9074,0x9078,/* 0xE8-0xEF */ + 0x9072,0x907C,0x907A,0x9134,0x9192,0x9320,0x9336,0x92F8,/* 0xF0-0xF7 */ + 0x9333,0x932F,0x9322,0x92FC,0x932B,0x9304,0x931A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9310,0x9326,0x9321,0x9315,0x932E,0x9319,0x95BB,0x96A7,/* 0x40-0x47 */ + 0x96A8,0x96AA,0x96D5,0x970E,0x9711,0x9716,0x970D,0x9713,/* 0x48-0x4F */ + 0x970F,0x975B,0x975C,0x9766,0x9798,0x9830,0x9838,0x983B,/* 0x50-0x57 */ + 0x9837,0x982D,0x9839,0x9824,0x9910,0x9928,0x991E,0x991B,/* 0x58-0x5F */ + 0x9921,0x991A,0x99ED,0x99E2,0x99F1,0x9AB8,0x9ABC,0x9AFB,/* 0x60-0x67 */ + 0x9AED,0x9B28,0x9B91,0x9D15,0x9D23,0x9D26,0x9D28,0x9D12,/* 0x68-0x6F */ + 0x9D1B,0x9ED8,0x9ED4,0x9F8D,0x9F9C,0x512A,0x511F,0x5121,/* 0x70-0x77 */ + 0x5132,0x52F5,0x568E,0x5680,0x5690,0x5685,0x5687,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x568F,0x58D5,0x58D3,0x58D1,0x58CE,0x5B30,0x5B2A,/* 0xA0-0xA7 */ + 0x5B24,0x5B7A,0x5C37,0x5C68,0x5DBC,0x5DBA,0x5DBD,0x5DB8,/* 0xA8-0xAF */ + 0x5E6B,0x5F4C,0x5FBD,0x61C9,0x61C2,0x61C7,0x61E6,0x61CB,/* 0xB0-0xB7 */ + 0x6232,0x6234,0x64CE,0x64CA,0x64D8,0x64E0,0x64F0,0x64E6,/* 0xB8-0xBF */ + 0x64EC,0x64F1,0x64E2,0x64ED,0x6582,0x6583,0x66D9,0x66D6,/* 0xC0-0xC7 */ + 0x6A80,0x6A94,0x6A84,0x6AA2,0x6A9C,0x6ADB,0x6AA3,0x6A7E,/* 0xC8-0xCF */ + 0x6A97,0x6A90,0x6AA0,0x6B5C,0x6BAE,0x6BDA,0x6C08,0x6FD8,/* 0xD0-0xD7 */ + 0x6FF1,0x6FDF,0x6FE0,0x6FDB,0x6FE4,0x6FEB,0x6FEF,0x6F80,/* 0xD8-0xDF */ + 
0x6FEC,0x6FE1,0x6FE9,0x6FD5,0x6FEE,0x6FF0,0x71E7,0x71DF,/* 0xE0-0xE7 */ + 0x71EE,0x71E6,0x71E5,0x71ED,0x71EC,0x71F4,0x71E0,0x7235,/* 0xE8-0xEF */ + 0x7246,0x7370,0x7372,0x74A9,0x74B0,0x74A6,0x74A8,0x7646,/* 0xF0-0xF7 */ + 0x7642,0x764C,0x76EA,0x77B3,0x77AA,0x77B0,0x77AC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x77A7,0x77AD,0x77EF,0x78F7,0x78FA,0x78F4,0x78EF,0x7901,/* 0x40-0x47 */ + 0x79A7,0x79AA,0x7A57,0x7ABF,0x7C07,0x7C0D,0x7BFE,0x7BF7,/* 0x48-0x4F */ + 0x7C0C,0x7BE0,0x7CE0,0x7CDC,0x7CDE,0x7CE2,0x7CDF,0x7CD9,/* 0x50-0x57 */ + 0x7CDD,0x7E2E,0x7E3E,0x7E46,0x7E37,0x7E32,0x7E43,0x7E2B,/* 0x58-0x5F */ + 0x7E3D,0x7E31,0x7E45,0x7E41,0x7E34,0x7E39,0x7E48,0x7E35,/* 0x60-0x67 */ + 0x7E3F,0x7E2F,0x7F44,0x7FF3,0x7FFC,0x8071,0x8072,0x8070,/* 0x68-0x6F */ + 0x806F,0x8073,0x81C6,0x81C3,0x81BA,0x81C2,0x81C0,0x81BF,/* 0x70-0x77 */ + 0x81BD,0x81C9,0x81BE,0x81E8,0x8209,0x8271,0x85AA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8584,0x857E,0x859C,0x8591,0x8594,0x85AF,0x859B,/* 0xA0-0xA7 */ + 0x8587,0x85A8,0x858A,0x8667,0x87C0,0x87D1,0x87B3,0x87D2,/* 0xA8-0xAF */ + 0x87C6,0x87AB,0x87BB,0x87BA,0x87C8,0x87CB,0x893B,0x8936,/* 0xB0-0xB7 */ + 0x8944,0x8938,0x893D,0x89AC,0x8B0E,0x8B17,0x8B19,0x8B1B,/* 0xB8-0xBF */ + 0x8B0A,0x8B20,0x8B1D,0x8B04,0x8B10,0x8C41,0x8C3F,0x8C73,/* 0xC0-0xC7 */ + 0x8CFA,0x8CFD,0x8CFC,0x8CF8,0x8CFB,0x8DA8,0x8E49,0x8E4B,/* 0xC8-0xCF */ + 0x8E48,0x8E4A,0x8F44,0x8F3E,0x8F42,0x8F45,0x8F3F,0x907F,/* 0xD0-0xD7 */ + 0x907D,0x9084,0x9081,0x9082,0x9080,0x9139,0x91A3,0x919E,/* 0xD8-0xDF */ + 0x919C,0x934D,0x9382,0x9328,0x9375,0x934A,0x9365,0x934B,/* 0xE0-0xE7 */ + 0x9318,0x937E,0x936C,0x935B,0x9370,0x935A,0x9354,0x95CA,/* 0xE8-0xEF */ + 0x95CB,0x95CC,0x95C8,0x95C6,0x96B1,0x96B8,0x96D6,0x971C,/* 0xF0-0xF7 */ + 0x971E,0x97A0,0x97D3,0x9846,0x98B6,0x9935,0x9A01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x99FF,0x9BAE,0x9BAB,0x9BAA,0x9BAD,0x9D3B,0x9D3F,0x9E8B,/* 0x40-0x47 */ + 0x9ECF,0x9EDE,0x9EDC,0x9EDD,0x9EDB,0x9F3E,0x9F4B,0x53E2,/* 0x48-0x4F */ + 0x5695,0x56AE,0x58D9,0x58D8,0x5B38,0x5F5D,0x61E3,0x6233,/* 
0x50-0x57 */ + 0x64F4,0x64F2,0x64FE,0x6506,0x64FA,0x64FB,0x64F7,0x65B7,/* 0x58-0x5F */ + 0x66DC,0x6726,0x6AB3,0x6AAC,0x6AC3,0x6ABB,0x6AB8,0x6AC2,/* 0x60-0x67 */ + 0x6AAE,0x6AAF,0x6B5F,0x6B78,0x6BAF,0x7009,0x700B,0x6FFE,/* 0x68-0x6F */ + 0x7006,0x6FFA,0x7011,0x700F,0x71FB,0x71FC,0x71FE,0x71F8,/* 0x70-0x77 */ + 0x7377,0x7375,0x74A7,0x74BF,0x7515,0x7656,0x7658,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7652,0x77BD,0x77BF,0x77BB,0x77BC,0x790E,0x79AE,/* 0xA0-0xA7 */ + 0x7A61,0x7A62,0x7A60,0x7AC4,0x7AC5,0x7C2B,0x7C27,0x7C2A,/* 0xA8-0xAF */ + 0x7C1E,0x7C23,0x7C21,0x7CE7,0x7E54,0x7E55,0x7E5E,0x7E5A,/* 0xB0-0xB7 */ + 0x7E61,0x7E52,0x7E59,0x7F48,0x7FF9,0x7FFB,0x8077,0x8076,/* 0xB8-0xBF */ + 0x81CD,0x81CF,0x820A,0x85CF,0x85A9,0x85CD,0x85D0,0x85C9,/* 0xC0-0xC7 */ + 0x85B0,0x85BA,0x85B9,0x85A6,0x87EF,0x87EC,0x87F2,0x87E0,/* 0xC8-0xCF */ + 0x8986,0x89B2,0x89F4,0x8B28,0x8B39,0x8B2C,0x8B2B,0x8C50,/* 0xD0-0xD7 */ + 0x8D05,0x8E59,0x8E63,0x8E66,0x8E64,0x8E5F,0x8E55,0x8EC0,/* 0xD8-0xDF */ + 0x8F49,0x8F4D,0x9087,0x9083,0x9088,0x91AB,0x91AC,0x91D0,/* 0xE0-0xE7 */ + 0x9394,0x938A,0x9396,0x93A2,0x93B3,0x93AE,0x93AC,0x93B0,/* 0xE8-0xEF */ + 0x9398,0x939A,0x9397,0x95D4,0x95D6,0x95D0,0x95D5,0x96E2,/* 0xF0-0xF7 */ + 0x96DC,0x96D9,0x96DB,0x96DE,0x9724,0x97A3,0x97A6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x97AD,0x97F9,0x984D,0x984F,0x984C,0x984E,0x9853,0x98BA,/* 0x40-0x47 */ + 0x993E,0x993F,0x993D,0x992E,0x99A5,0x9A0E,0x9AC1,0x9B03,/* 0x48-0x4F */ + 0x9B06,0x9B4F,0x9B4E,0x9B4D,0x9BCA,0x9BC9,0x9BFD,0x9BC8,/* 0x50-0x57 */ + 0x9BC0,0x9D51,0x9D5D,0x9D60,0x9EE0,0x9F15,0x9F2C,0x5133,/* 0x58-0x5F */ + 0x56A5,0x58DE,0x58DF,0x58E2,0x5BF5,0x9F90,0x5EEC,0x61F2,/* 0x60-0x67 */ + 0x61F7,0x61F6,0x61F5,0x6500,0x650F,0x66E0,0x66DD,0x6AE5,/* 0x68-0x6F */ + 0x6ADD,0x6ADA,0x6AD3,0x701B,0x701F,0x7028,0x701A,0x701D,/* 0x70-0x77 */ + 0x7015,0x7018,0x7206,0x720D,0x7258,0x72A2,0x7378,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x737A,0x74BD,0x74CA,0x74E3,0x7587,0x7586,0x765F,/* 0xA0-0xA7 */ + 0x7661,0x77C7,0x7919,0x79B1,0x7A6B,0x7A69,0x7C3E,0x7C3F,/* 0xA8-0xAF */ + 0x7C38,0x7C3D,0x7C37,0x7C40,0x7E6B,0x7E6D,0x7E79,0x7E69,/* 0xB0-0xB7 */ + 0x7E6A,0x7F85,0x7E73,0x7FB6,0x7FB9,0x7FB8,0x81D8,0x85E9,/* 0xB8-0xBF */ + 0x85DD,0x85EA,0x85D5,0x85E4,0x85E5,0x85F7,0x87FB,0x8805,/* 0xC0-0xC7 */ + 0x880D,0x87F9,0x87FE,0x8960,0x895F,0x8956,0x895E,0x8B41,/* 0xC8-0xCF */ + 
0x8B5C,0x8B58,0x8B49,0x8B5A,0x8B4E,0x8B4F,0x8B46,0x8B59,/* 0xD0-0xD7 */ + 0x8D08,0x8D0A,0x8E7C,0x8E72,0x8E87,0x8E76,0x8E6C,0x8E7A,/* 0xD8-0xDF */ + 0x8E74,0x8F54,0x8F4E,0x8FAD,0x908A,0x908B,0x91B1,0x91AE,/* 0xE0-0xE7 */ + 0x93E1,0x93D1,0x93DF,0x93C3,0x93C8,0x93DC,0x93DD,0x93D6,/* 0xE8-0xEF */ + 0x93E2,0x93CD,0x93D8,0x93E4,0x93D7,0x93E8,0x95DC,0x96B4,/* 0xF0-0xF7 */ + 0x96E3,0x972A,0x9727,0x9761,0x97DC,0x97FB,0x985E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9858,0x985B,0x98BC,0x9945,0x9949,0x9A16,0x9A19,0x9B0D,/* 0x40-0x47 */ + 0x9BE8,0x9BE7,0x9BD6,0x9BDB,0x9D89,0x9D61,0x9D72,0x9D6A,/* 0x48-0x4F */ + 0x9D6C,0x9E92,0x9E97,0x9E93,0x9EB4,0x52F8,0x56A8,0x56B7,/* 0x50-0x57 */ + 0x56B6,0x56B4,0x56BC,0x58E4,0x5B40,0x5B43,0x5B7D,0x5BF6,/* 0x58-0x5F */ + 0x5DC9,0x61F8,0x61FA,0x6518,0x6514,0x6519,0x66E6,0x6727,/* 0x60-0x67 */ + 0x6AEC,0x703E,0x7030,0x7032,0x7210,0x737B,0x74CF,0x7662,/* 0x68-0x6F */ + 0x7665,0x7926,0x792A,0x792C,0x792B,0x7AC7,0x7AF6,0x7C4C,/* 0x70-0x77 */ + 0x7C43,0x7C4D,0x7CEF,0x7CF0,0x8FAE,0x7E7D,0x7E7C,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7E82,0x7F4C,0x8000,0x81DA,0x8266,0x85FB,0x85F9,/* 0xA0-0xA7 */ + 0x8611,0x85FA,0x8606,0x860B,0x8607,0x860A,0x8814,0x8815,/* 0xA8-0xAF */ + 0x8964,0x89BA,0x89F8,0x8B70,0x8B6C,0x8B66,0x8B6F,0x8B5F,/* 0xB0-0xB7 */ + 0x8B6B,0x8D0F,0x8D0D,0x8E89,0x8E81,0x8E85,0x8E82,0x91B4,/* 0xB8-0xBF */ + 0x91CB,0x9418,0x9403,0x93FD,0x95E1,0x9730,0x98C4,0x9952,/* 0xC0-0xC7 */ + 0x9951,0x99A8,0x9A2B,0x9A30,0x9A37,0x9A35,0x9C13,0x9C0D,/* 0xC8-0xCF */ + 0x9E79,0x9EB5,0x9EE8,0x9F2F,0x9F5F,0x9F63,0x9F61,0x5137,/* 0xD0-0xD7 */ + 0x5138,0x56C1,0x56C0,0x56C2,0x5914,0x5C6C,0x5DCD,0x61FC,/* 0xD8-0xDF */ + 0x61FE,0x651D,0x651C,0x6595,0x66E9,0x6AFB,0x6B04,0x6AFA,/* 0xE0-0xE7 */ + 0x6BB2,0x704C,0x721B,0x72A7,0x74D6,0x74D4,0x7669,0x77D3,/* 0xE8-0xEF */ + 0x7C50,0x7E8F,0x7E8C,0x7FBC,0x8617,0x862D,0x861A,0x8823,/* 0xF0-0xF7 */ + 0x8822,0x8821,0x881F,0x896A,0x896C,0x89BD,0x8B74,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B77,0x8B7D,0x8D13,0x8E8A,0x8E8D,0x8E8B,0x8F5F,0x8FAF,/* 
0x40-0x47 */ + 0x91BA,0x942E,0x9433,0x9435,0x943A,0x9438,0x9432,0x942B,/* 0x48-0x4F */ + 0x95E2,0x9738,0x9739,0x9732,0x97FF,0x9867,0x9865,0x9957,/* 0x50-0x57 */ + 0x9A45,0x9A43,0x9A40,0x9A3E,0x9ACF,0x9B54,0x9B51,0x9C2D,/* 0x58-0x5F */ + 0x9C25,0x9DAF,0x9DB4,0x9DC2,0x9DB8,0x9E9D,0x9EEF,0x9F19,/* 0x60-0x67 */ + 0x9F5C,0x9F66,0x9F67,0x513C,0x513B,0x56C8,0x56CA,0x56C9,/* 0x68-0x6F */ + 0x5B7F,0x5DD4,0x5DD2,0x5F4E,0x61FF,0x6524,0x6B0A,0x6B61,/* 0x70-0x77 */ + 0x7051,0x7058,0x7380,0x74E4,0x758A,0x766E,0x766C,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79B3,0x7C60,0x7C5F,0x807E,0x807D,0x81DF,0x8972,/* 0xA0-0xA7 */ + 0x896F,0x89FC,0x8B80,0x8D16,0x8D17,0x8E91,0x8E93,0x8F61,/* 0xA8-0xAF */ + 0x9148,0x9444,0x9451,0x9452,0x973D,0x973E,0x97C3,0x97C1,/* 0xB0-0xB7 */ + 0x986B,0x9955,0x9A55,0x9A4D,0x9AD2,0x9B1A,0x9C49,0x9C31,/* 0xB8-0xBF */ + 0x9C3E,0x9C3B,0x9DD3,0x9DD7,0x9F34,0x9F6C,0x9F6A,0x9F94,/* 0xC0-0xC7 */ + 0x56CC,0x5DD6,0x6200,0x6523,0x652B,0x652A,0x66EC,0x6B10,/* 0xC8-0xCF */ + 0x74DA,0x7ACA,0x7C64,0x7C63,0x7C65,0x7E93,0x7E96,0x7E94,/* 0xD0-0xD7 */ + 0x81E2,0x8638,0x863F,0x8831,0x8B8A,0x9090,0x908F,0x9463,/* 0xD8-0xDF */ + 0x9460,0x9464,0x9768,0x986F,0x995C,0x9A5A,0x9A5B,0x9A57,/* 0xE0-0xE7 */ + 0x9AD3,0x9AD4,0x9AD1,0x9C54,0x9C57,0x9C56,0x9DE5,0x9E9F,/* 0xE8-0xEF */ + 0x9EF4,0x56D1,0x58E9,0x652C,0x705E,0x7671,0x7672,0x77D7,/* 0xF0-0xF7 */ + 0x7F50,0x7F88,0x8836,0x8839,0x8862,0x8B93,0x8B92,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B96,0x8277,0x8D1B,0x91C0,0x946A,0x9742,0x9748,0x9744,/* 0x40-0x47 */ + 0x97C6,0x9870,0x9A5F,0x9B22,0x9B58,0x9C5F,0x9DF9,0x9DFA,/* 0x48-0x4F */ + 0x9E7C,0x9E7D,0x9F07,0x9F77,0x9F72,0x5EF3,0x6B16,0x7063,/* 0x50-0x57 */ + 0x7C6C,0x7C6E,0x883B,0x89C0,0x8EA1,0x91C1,0x9472,0x9470,/* 0x58-0x5F */ + 0x9871,0x995E,0x9AD6,0x9B23,0x9ECC,0x7064,0x77DA,0x8B9A,/* 0x60-0x67 */ + 0x9477,0x97C9,0x9A62,0x9A65,0x7E9C,0x8B9C,0x8EAA,0x91C5,/* 0x68-0x6F */ + 0x947D,0x947E,0x947C,0x9C77,0x9C78,0x9EF7,0x8C54,0x947F,/* 0x70-0x77 */ + 0x9E1A,0x7228,0x9A6A,0x9B31,0x9E1B,0x9E1E,0x7C72,0x0000,/* 0x78-0x7F */ +}; + +static const wchar_t c2u_C9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E42,0x4E5C,0x51F5,0x531A,0x5382,0x4E07,0x4E0C,0x4E47,/* 0x40-0x47 */ + 0x4E8D,0x56D7,0xFA0C,0x5C6E,0x5F73,0x4E0F,0x5187,0x4E0E,/* 0x48-0x4F */ + 0x4E2E,0x4E93,0x4EC2,0x4EC9,0x4EC8,0x5198,0x52FC,0x536C,/* 0x50-0x57 */ + 0x53B9,0x5720,0x5903,0x592C,0x5C10,0x5DFF,0x65E1,0x6BB3,/* 0x58-0x5F */ + 0x6BCC,0x6C14,0x723F,0x4E31,0x4E3C,0x4EE8,0x4EDC,0x4EE9,/* 0x60-0x67 */ + 0x4EE1,0x4EDD,0x4EDA,0x520C,0x531C,0x534C,0x5722,0x5723,/* 0x68-0x6F */ + 0x5917,0x592F,0x5B81,0x5B84,0x5C12,0x5C3B,0x5C74,0x5C73,/* 0x70-0x77 */ + 0x5E04,0x5E80,0x5E82,0x5FC9,0x6209,0x6250,0x6C15,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6C36,0x6C43,0x6C3F,0x6C3B,0x72AE,0x72B0,0x738A,/* 0xA0-0xA7 */ + 0x79B8,0x808A,0x961E,0x4F0E,0x4F18,0x4F2C,0x4EF5,0x4F14,/* 0xA8-0xAF */ + 0x4EF1,0x4F00,0x4EF7,0x4F08,0x4F1D,0x4F02,0x4F05,0x4F22,/* 0xB0-0xB7 */ + 0x4F13,0x4F04,0x4EF4,0x4F12,0x51B1,0x5213,0x5209,0x5210,/* 0xB8-0xBF */ + 0x52A6,0x5322,0x531F,0x534D,0x538A,0x5407,0x56E1,0x56DF,/* 0xC0-0xC7 */ + 0x572E,0x572A,0x5734,0x593C,0x5980,0x597C,0x5985,0x597B,/* 0xC8-0xCF */ + 0x597E,0x5977,0x597F,0x5B56,0x5C15,0x5C25,0x5C7C,0x5C7A,/* 0xD0-0xD7 */ + 0x5C7B,0x5C7E,0x5DDF,0x5E75,0x5E84,0x5F02,0x5F1A,0x5F74,/* 0xD8-0xDF */ + 0x5FD5,0x5FD4,0x5FCF,0x625C,0x625E,0x6264,0x6261,0x6266,/* 0xE0-0xE7 */ + 0x6262,0x6259,0x6260,0x625A,0x6265,0x65EF,0x65EE,0x673E,/* 0xE8-0xEF */ + 0x6739,0x6738,0x673B,0x673A,0x673F,0x673C,0x6733,0x6C18,/* 0xF0-0xF7 */ + 0x6C46,0x6C52,0x6C5C,0x6C4F,0x6C4A,0x6C54,0x6C4B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6C4C,0x7071,0x725E,0x72B4,0x72B5,0x738E,0x752A,0x767F,/* 0x40-0x47 */ + 0x7A75,0x7F51,0x8278,0x827C,0x8280,0x827D,0x827F,0x864D,/* 0x48-0x4F */ + 0x897E,0x9099,0x9097,0x9098,0x909B,0x9094,0x9622,0x9624,/* 0x50-0x57 */ + 0x9620,0x9623,0x4F56,0x4F3B,0x4F62,0x4F49,0x4F53,0x4F64,/* 0x58-0x5F */ + 0x4F3E,0x4F67,0x4F52,0x4F5F,0x4F41,0x4F58,0x4F2D,0x4F33,/* 0x60-0x67 */ + 0x4F3F,0x4F61,0x518F,0x51B9,0x521C,0x521E,0x5221,0x52AD,/* 0x68-0x6F */ + 0x52AE,0x5309,0x5363,0x5372,0x538E,0x538F,0x5430,0x5437,/* 0x70-0x77 */ + 0x542A,0x5454,0x5445,0x5419,0x541C,0x5425,0x5418,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x543D,0x544F,0x5441,0x5428,0x5424,0x5447,0x56EE,/* 0xA0-0xA7 */ + 0x56E7,0x56E5,0x5741,0x5745,0x574C,0x5749,0x574B,0x5752,/* 0xA8-0xAF */ + 
0x5906,0x5940,0x59A6,0x5998,0x59A0,0x5997,0x598E,0x59A2,/* 0xB0-0xB7 */ + 0x5990,0x598F,0x59A7,0x59A1,0x5B8E,0x5B92,0x5C28,0x5C2A,/* 0xB8-0xBF */ + 0x5C8D,0x5C8F,0x5C88,0x5C8B,0x5C89,0x5C92,0x5C8A,0x5C86,/* 0xC0-0xC7 */ + 0x5C93,0x5C95,0x5DE0,0x5E0A,0x5E0E,0x5E8B,0x5E89,0x5E8C,/* 0xC8-0xCF */ + 0x5E88,0x5E8D,0x5F05,0x5F1D,0x5F78,0x5F76,0x5FD2,0x5FD1,/* 0xD0-0xD7 */ + 0x5FD0,0x5FED,0x5FE8,0x5FEE,0x5FF3,0x5FE1,0x5FE4,0x5FE3,/* 0xD8-0xDF */ + 0x5FFA,0x5FEF,0x5FF7,0x5FFB,0x6000,0x5FF4,0x623A,0x6283,/* 0xE0-0xE7 */ + 0x628C,0x628E,0x628F,0x6294,0x6287,0x6271,0x627B,0x627A,/* 0xE8-0xEF */ + 0x6270,0x6281,0x6288,0x6277,0x627D,0x6272,0x6274,0x6537,/* 0xF0-0xF7 */ + 0x65F0,0x65F4,0x65F3,0x65F2,0x65F5,0x6745,0x6747,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6759,0x6755,0x674C,0x6748,0x675D,0x674D,0x675A,0x674B,/* 0x40-0x47 */ + 0x6BD0,0x6C19,0x6C1A,0x6C78,0x6C67,0x6C6B,0x6C84,0x6C8B,/* 0x48-0x4F */ + 0x6C8F,0x6C71,0x6C6F,0x6C69,0x6C9A,0x6C6D,0x6C87,0x6C95,/* 0x50-0x57 */ + 0x6C9C,0x6C66,0x6C73,0x6C65,0x6C7B,0x6C8E,0x7074,0x707A,/* 0x58-0x5F */ + 0x7263,0x72BF,0x72BD,0x72C3,0x72C6,0x72C1,0x72BA,0x72C5,/* 0x60-0x67 */ + 0x7395,0x7397,0x7393,0x7394,0x7392,0x753A,0x7539,0x7594,/* 0x68-0x6F */ + 0x7595,0x7681,0x793D,0x8034,0x8095,0x8099,0x8090,0x8092,/* 0x70-0x77 */ + 0x809C,0x8290,0x828F,0x8285,0x828E,0x8291,0x8293,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x828A,0x8283,0x8284,0x8C78,0x8FC9,0x8FBF,0x909F,/* 0xA0-0xA7 */ + 0x90A1,0x90A5,0x909E,0x90A7,0x90A0,0x9630,0x9628,0x962F,/* 0xA8-0xAF */ + 0x962D,0x4E33,0x4F98,0x4F7C,0x4F85,0x4F7D,0x4F80,0x4F87,/* 0xB0-0xB7 */ + 0x4F76,0x4F74,0x4F89,0x4F84,0x4F77,0x4F4C,0x4F97,0x4F6A,/* 0xB8-0xBF */ + 0x4F9A,0x4F79,0x4F81,0x4F78,0x4F90,0x4F9C,0x4F94,0x4F9E,/* 0xC0-0xC7 */ + 0x4F92,0x4F82,0x4F95,0x4F6B,0x4F6E,0x519E,0x51BC,0x51BE,/* 0xC8-0xCF */ + 0x5235,0x5232,0x5233,0x5246,0x5231,0x52BC,0x530A,0x530B,/* 0xD0-0xD7 */ + 0x533C,0x5392,0x5394,0x5487,0x547F,0x5481,0x5491,0x5482,/* 0xD8-0xDF */ + 0x5488,0x546B,0x547A,0x547E,0x5465,0x546C,0x5474,0x5466,/* 0xE0-0xE7 */ + 0x548D,0x546F,0x5461,0x5460,0x5498,0x5463,0x5467,0x5464,/* 0xE8-0xEF */ + 0x56F7,0x56F9,0x576F,0x5772,0x576D,0x576B,0x5771,0x5770,/* 0xF0-0xF7 */ + 0x5776,0x5780,0x5775,0x577B,0x5773,0x5774,0x5762,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5768,0x577D,0x590C,0x5945,0x59B5,0x59BA,0x59CF,0x59CE,/* 0x40-0x47 */ + 0x59B2,0x59CC,0x59C1,0x59B6,0x59BC,0x59C3,0x59D6,0x59B1,/* 0x48-0x4F */ + 0x59BD,0x59C0,0x59C8,0x59B4,0x59C7,0x5B62,0x5B65,0x5B93,/* 0x50-0x57 */ + 0x5B95,0x5C44,0x5C47,0x5CAE,0x5CA4,0x5CA0,0x5CB5,0x5CAF,/* 0x58-0x5F */ + 0x5CA8,0x5CAC,0x5C9F,0x5CA3,0x5CAD,0x5CA2,0x5CAA,0x5CA7,/* 0x60-0x67 */ + 0x5C9D,0x5CA5,0x5CB6,0x5CB0,0x5CA6,0x5E17,0x5E14,0x5E19,/* 0x68-0x6F */ + 0x5F28,0x5F22,0x5F23,0x5F24,0x5F54,0x5F82,0x5F7E,0x5F7D,/* 0x70-0x77 */ + 0x5FDE,0x5FE5,0x602D,0x6026,0x6019,0x6032,0x600B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6034,0x600A,0x6017,0x6033,0x601A,0x601E,0x602C,/* 0xA0-0xA7 */ + 0x6022,0x600D,0x6010,0x602E,0x6013,0x6011,0x600C,0x6009,/* 0xA8-0xAF */ + 0x601C,0x6214,0x623D,0x62AD,0x62B4,0x62D1,0x62BE,0x62AA,/* 0xB0-0xB7 */ + 0x62B6,0x62CA,0x62AE,0x62B3,0x62AF,0x62BB,0x62A9,0x62B0,/* 0xB8-0xBF */ + 0x62B8,0x653D,0x65A8,0x65BB,0x6609,0x65FC,0x6604,0x6612,/* 0xC0-0xC7 */ + 0x6608,0x65FB,0x6603,0x660B,0x660D,0x6605,0x65FD,0x6611,/* 0xC8-0xCF */ + 0x6610,0x66F6,0x670A,0x6785,0x676C,0x678E,0x6792,0x6776,/* 0xD0-0xD7 */ + 0x677B,0x6798,0x6786,0x6784,0x6774,0x678D,0x678C,0x677A,/* 0xD8-0xDF */ + 0x679F,0x6791,0x6799,0x6783,0x677D,0x6781,0x6778,0x6779,/* 0xE0-0xE7 */ + 0x6794,0x6B25,0x6B80,0x6B7E,0x6BDE,0x6C1D,0x6C93,0x6CEC,/* 0xE8-0xEF */ + 0x6CEB,0x6CEE,0x6CD9,0x6CB6,0x6CD4,0x6CAD,0x6CE7,0x6CB7,/* 0xF0-0xF7 */ + 0x6CD0,0x6CC2,0x6CBA,0x6CC3,0x6CC6,0x6CED,0x6CF2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6CD2,0x6CDD,0x6CB4,0x6C8A,0x6C9D,0x6C80,0x6CDE,0x6CC0,/* 0x40-0x47 */ + 0x6D30,0x6CCD,0x6CC7,0x6CB0,0x6CF9,0x6CCF,0x6CE9,0x6CD1,/* 0x48-0x4F */ + 0x7094,0x7098,0x7085,0x7093,0x7086,0x7084,0x7091,0x7096,/* 0x50-0x57 */ + 0x7082,0x709A,0x7083,0x726A,0x72D6,0x72CB,0x72D8,0x72C9,/* 0x58-0x5F */ + 0x72DC,0x72D2,0x72D4,0x72DA,0x72CC,0x72D1,0x73A4,0x73A1,/* 0x60-0x67 */ + 0x73AD,0x73A6,0x73A2,0x73A0,0x73AC,0x739D,0x74DD,0x74E8,/* 0x68-0x6F */ + 0x753F,0x7540,0x753E,0x758C,0x7598,0x76AF,0x76F3,0x76F1,/* 0x70-0x77 */ + 0x76F0,0x76F5,0x77F8,0x77FC,0x77F9,0x77FB,0x77FA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 
0x0000,0x77F7,0x7942,0x793F,0x79C5,0x7A78,0x7A7B,0x7AFB,/* 0xA0-0xA7 */ + 0x7C75,0x7CFD,0x8035,0x808F,0x80AE,0x80A3,0x80B8,0x80B5,/* 0xA8-0xAF */ + 0x80AD,0x8220,0x82A0,0x82C0,0x82AB,0x829A,0x8298,0x829B,/* 0xB0-0xB7 */ + 0x82B5,0x82A7,0x82AE,0x82BC,0x829E,0x82BA,0x82B4,0x82A8,/* 0xB8-0xBF */ + 0x82A1,0x82A9,0x82C2,0x82A4,0x82C3,0x82B6,0x82A2,0x8670,/* 0xC0-0xC7 */ + 0x866F,0x866D,0x866E,0x8C56,0x8FD2,0x8FCB,0x8FD3,0x8FCD,/* 0xC8-0xCF */ + 0x8FD6,0x8FD5,0x8FD7,0x90B2,0x90B4,0x90AF,0x90B3,0x90B0,/* 0xD0-0xD7 */ + 0x9639,0x963D,0x963C,0x963A,0x9643,0x4FCD,0x4FC5,0x4FD3,/* 0xD8-0xDF */ + 0x4FB2,0x4FC9,0x4FCB,0x4FC1,0x4FD4,0x4FDC,0x4FD9,0x4FBB,/* 0xE0-0xE7 */ + 0x4FB3,0x4FDB,0x4FC7,0x4FD6,0x4FBA,0x4FC0,0x4FB9,0x4FEC,/* 0xE8-0xEF */ + 0x5244,0x5249,0x52C0,0x52C2,0x533D,0x537C,0x5397,0x5396,/* 0xF0-0xF7 */ + 0x5399,0x5398,0x54BA,0x54A1,0x54AD,0x54A5,0x54CF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54C3,0x830D,0x54B7,0x54AE,0x54D6,0x54B6,0x54C5,0x54C6,/* 0x40-0x47 */ + 0x54A0,0x5470,0x54BC,0x54A2,0x54BE,0x5472,0x54DE,0x54B0,/* 0x48-0x4F */ + 0x57B5,0x579E,0x579F,0x57A4,0x578C,0x5797,0x579D,0x579B,/* 0x50-0x57 */ + 0x5794,0x5798,0x578F,0x5799,0x57A5,0x579A,0x5795,0x58F4,/* 0x58-0x5F */ + 0x590D,0x5953,0x59E1,0x59DE,0x59EE,0x5A00,0x59F1,0x59DD,/* 0x60-0x67 */ + 0x59FA,0x59FD,0x59FC,0x59F6,0x59E4,0x59F2,0x59F7,0x59DB,/* 0x68-0x6F */ + 0x59E9,0x59F3,0x59F5,0x59E0,0x59FE,0x59F4,0x59ED,0x5BA8,/* 0x70-0x77 */ + 0x5C4C,0x5CD0,0x5CD8,0x5CCC,0x5CD7,0x5CCB,0x5CDB,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5CDE,0x5CDA,0x5CC9,0x5CC7,0x5CCA,0x5CD6,0x5CD3,/* 0xA0-0xA7 */ + 0x5CD4,0x5CCF,0x5CC8,0x5CC6,0x5CCE,0x5CDF,0x5CF8,0x5DF9,/* 0xA8-0xAF */ + 0x5E21,0x5E22,0x5E23,0x5E20,0x5E24,0x5EB0,0x5EA4,0x5EA2,/* 0xB0-0xB7 */ + 0x5E9B,0x5EA3,0x5EA5,0x5F07,0x5F2E,0x5F56,0x5F86,0x6037,/* 0xB8-0xBF */ + 0x6039,0x6054,0x6072,0x605E,0x6045,0x6053,0x6047,0x6049,/* 0xC0-0xC7 */ + 0x605B,0x604C,0x6040,0x6042,0x605F,0x6024,0x6044,0x6058,/* 0xC8-0xCF */ + 0x6066,0x606E,0x6242,0x6243,0x62CF,0x630D,0x630B,0x62F5,/* 0xD0-0xD7 */ + 0x630E,0x6303,0x62EB,0x62F9,0x630F,0x630C,0x62F8,0x62F6,/* 0xD8-0xDF */ + 0x6300,0x6313,0x6314,0x62FA,0x6315,0x62FB,0x62F0,0x6541,/* 0xE0-0xE7 */ + 0x6543,0x65AA,0x65BF,0x6636,0x6621,0x6632,0x6635,0x661C,/* 0xE8-0xEF */ + 0x6626,0x6622,0x6633,0x662B,0x663A,0x661D,0x6634,0x6639,/* 0xF0-0xF7 */ + 0x662E,0x670F,0x6710,0x67C1,0x67F2,0x67C8,0x67BA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x67DC,0x67BB,0x67F8,0x67D8,0x67C0,0x67B7,0x67C5,0x67EB,/* 0x40-0x47 */ + 0x67E4,0x67DF,0x67B5,0x67CD,0x67B3,0x67F7,0x67F6,0x67EE,/* 0x48-0x4F */ + 0x67E3,0x67C2,0x67B9,0x67CE,0x67E7,0x67F0,0x67B2,0x67FC,/* 0x50-0x57 */ + 0x67C6,0x67ED,0x67CC,0x67AE,0x67E6,0x67DB,0x67FA,0x67C9,/* 0x58-0x5F */ + 0x67CA,0x67C3,0x67EA,0x67CB,0x6B28,0x6B82,0x6B84,0x6BB6,/* 0x60-0x67 */ + 0x6BD6,0x6BD8,0x6BE0,0x6C20,0x6C21,0x6D28,0x6D34,0x6D2D,/* 0x68-0x6F */ + 0x6D1F,0x6D3C,0x6D3F,0x6D12,0x6D0A,0x6CDA,0x6D33,0x6D04,/* 0x70-0x77 */ + 0x6D19,0x6D3A,0x6D1A,0x6D11,0x6D00,0x6D1D,0x6D42,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6D01,0x6D18,0x6D37,0x6D03,0x6D0F,0x6D40,0x6D07,/* 0xA0-0xA7 */ + 0x6D20,0x6D2C,0x6D08,0x6D22,0x6D09,0x6D10,0x70B7,0x709F,/* 0xA8-0xAF */ + 0x70BE,0x70B1,0x70B0,0x70A1,0x70B4,0x70B5,0x70A9,0x7241,/* 0xB0-0xB7 */ + 0x7249,0x724A,0x726C,0x7270,0x7273,0x726E,0x72CA,0x72E4,/* 0xB8-0xBF */ + 0x72E8,0x72EB,0x72DF,0x72EA,0x72E6,0x72E3,0x7385,0x73CC,/* 0xC0-0xC7 */ + 0x73C2,0x73C8,0x73C5,0x73B9,0x73B6,0x73B5,0x73B4,0x73EB,/* 0xC8-0xCF */ + 0x73BF,0x73C7,0x73BE,0x73C3,0x73C6,0x73B8,0x73CB,0x74EC,/* 0xD0-0xD7 */ + 0x74EE,0x752E,0x7547,0x7548,0x75A7,0x75AA,0x7679,0x76C4,/* 0xD8-0xDF */ + 0x7708,0x7703,0x7704,0x7705,0x770A,0x76F7,0x76FB,0x76FA,/* 0xE0-0xE7 */ + 0x77E7,0x77E8,0x7806,0x7811,0x7812,0x7805,0x7810,0x780F,/* 0xE8-0xEF */ + 0x780E,0x7809,0x7803,0x7813,0x794A,0x794C,0x794B,0x7945,/* 0xF0-0xF7 */ + 0x7944,0x79D5,0x79CD,0x79CF,0x79D6,0x79CE,0x7A80,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A7E,0x7AD1,0x7B00,0x7B01,0x7C7A,0x7C78,0x7C79,0x7C7F,/* 0x40-0x47 */ + 0x7C80,0x7C81,0x7D03,0x7D08,0x7D01,0x7F58,0x7F91,0x7F8D,/* 0x48-0x4F */ + 0x7FBE,0x8007,0x800E,0x800F,0x8014,0x8037,0x80D8,0x80C7,/* 0x50-0x57 */ + 0x80E0,0x80D1,0x80C8,0x80C2,0x80D0,0x80C5,0x80E3,0x80D9,/* 0x58-0x5F */ + 0x80DC,0x80CA,0x80D5,0x80C9,0x80CF,0x80D7,0x80E6,0x80CD,/* 0x60-0x67 */ + 0x81FF,0x8221,0x8294,0x82D9,0x82FE,0x82F9,0x8307,0x82E8,/* 0x68-0x6F */ + 0x8300,0x82D5,0x833A,0x82EB,0x82D6,0x82F4,0x82EC,0x82E1,/* 0x70-0x77 */ + 0x82F2,0x82F5,0x830C,0x82FB,0x82F6,0x82F0,0x82EA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x82E4,0x82E0,0x82FA,0x82F3,0x82ED,0x8677,0x8674,/* 0xA0-0xA7 */ + 0x867C,0x8673,0x8841,0x884E,0x8867,0x886A,0x8869,0x89D3,/* 0xA8-0xAF */ + 0x8A04,0x8A07,0x8D72,0x8FE3,0x8FE1,0x8FEE,0x8FE0,0x90F1,/* 0xB0-0xB7 */ + 0x90BD,0x90BF,0x90D5,0x90C5,0x90BE,0x90C7,0x90CB,0x90C8,/* 0xB8-0xBF */ + 0x91D4,0x91D3,0x9654,0x964F,0x9651,0x9653,0x964A,0x964E,/* 0xC0-0xC7 */ + 0x501E,0x5005,0x5007,0x5013,0x5022,0x5030,0x501B,0x4FF5,/* 0xC8-0xCF */ + 0x4FF4,0x5033,0x5037,0x502C,0x4FF6,0x4FF7,0x5017,0x501C,/* 0xD0-0xD7 */ + 0x5020,0x5027,0x5035,0x502F,0x5031,0x500E,0x515A,0x5194,/* 0xD8-0xDF */ + 0x5193,0x51CA,0x51C4,0x51C5,0x51C8,0x51CE,0x5261,0x525A,/* 0xE0-0xE7 */ + 0x5252,0x525E,0x525F,0x5255,0x5262,0x52CD,0x530E,0x539E,/* 0xE8-0xEF */ + 0x5526,0x54E2,0x5517,0x5512,0x54E7,0x54F3,0x54E4,0x551A,/* 0xF0-0xF7 */ + 0x54FF,0x5504,0x5508,0x54EB,0x5511,0x5505,0x54F1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x550A,0x54FB,0x54F7,0x54F8,0x54E0,0x550E,0x5503,0x550B,/* 0x40-0x47 */ + 0x5701,0x5702,0x57CC,0x5832,0x57D5,0x57D2,0x57BA,0x57C6,/* 0x48-0x4F */ + 0x57BD,0x57BC,0x57B8,0x57B6,0x57BF,0x57C7,0x57D0,0x57B9,/* 0x50-0x57 */ + 0x57C1,0x590E,0x594A,0x5A19,0x5A16,0x5A2D,0x5A2E,0x5A15,/* 0x58-0x5F */ + 0x5A0F,0x5A17,0x5A0A,0x5A1E,0x5A33,0x5B6C,0x5BA7,0x5BAD,/* 0x60-0x67 */ + 0x5BAC,0x5C03,0x5C56,0x5C54,0x5CEC,0x5CFF,0x5CEE,0x5CF1,/* 0x68-0x6F */ + 0x5CF7,0x5D00,0x5CF9,0x5E29,0x5E28,0x5EA8,0x5EAE,0x5EAA,/* 0x70-0x77 */ + 0x5EAC,0x5F33,0x5F30,0x5F67,0x605D,0x605A,0x6067,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6041,0x60A2,0x6088,0x6080,0x6092,0x6081,0x609D,/* 0xA0-0xA7 */ + 0x6083,0x6095,0x609B,0x6097,0x6087,0x609C,0x608E,0x6219,/* 0xA8-0xAF */ + 0x6246,0x62F2,0x6310,0x6356,0x632C,0x6344,0x6345,0x6336,/* 0xB0-0xB7 */ + 0x6343,0x63E4,0x6339,0x634B,0x634A,0x633C,0x6329,0x6341,/* 0xB8-0xBF */ + 0x6334,0x6358,0x6354,0x6359,0x632D,0x6347,0x6333,0x635A,/* 0xC0-0xC7 */ + 0x6351,0x6338,0x6357,0x6340,0x6348,0x654A,0x6546,0x65C6,/* 0xC8-0xCF */ + 0x65C3,0x65C4,0x65C2,0x664A,0x665F,0x6647,0x6651,0x6712,/* 0xD0-0xD7 */ + 0x6713,0x681F,0x681A,0x6849,0x6832,0x6833,0x683B,0x684B,/* 0xD8-0xDF */ + 0x684F,0x6816,0x6831,0x681C,0x6835,0x682B,0x682D,0x682F,/* 0xE0-0xE7 */ + 0x684E,0x6844,0x6834,0x681D,0x6812,0x6814,0x6826,0x6828,/* 0xE8-0xEF */ + 0x682E,0x684D,0x683A,0x6825,0x6820,0x6B2C,0x6B2F,0x6B2D,/* 0xF0-0xF7 */ + 0x6B31,0x6B34,0x6B6D,0x8082,0x6B88,0x6BE6,0x6BE4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BE8,0x6BE3,0x6BE2,0x6BE7,0x6C25,0x6D7A,0x6D63,0x6D64,/* 0x40-0x47 */ + 0x6D76,0x6D0D,0x6D61,0x6D92,0x6D58,0x6D62,0x6D6D,0x6D6F,/* 0x48-0x4F */ + 0x6D91,0x6D8D,0x6DEF,0x6D7F,0x6D86,0x6D5E,0x6D67,0x6D60,/* 0x50-0x57 */ + 0x6D97,0x6D70,0x6D7C,0x6D5F,0x6D82,0x6D98,0x6D2F,0x6D68,/* 0x58-0x5F */ + 0x6D8B,0x6D7E,0x6D80,0x6D84,0x6D16,0x6D83,0x6D7B,0x6D7D,/* 0x60-0x67 */ + 0x6D75,0x6D90,0x70DC,0x70D3,0x70D1,0x70DD,0x70CB,0x7F39,/* 0x68-0x6F */ + 0x70E2,0x70D7,0x70D2,0x70DE,0x70E0,0x70D4,0x70CD,0x70C5,/* 0x70-0x77 */ + 0x70C6,0x70C7,0x70DA,0x70CE,0x70E1,0x7242,0x7278,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7277,0x7276,0x7300,0x72FA,0x72F4,0x72FE,0x72F6,/* 0xA0-0xA7 */ + 0x72F3,0x72FB,0x7301,0x73D3,0x73D9,0x73E5,0x73D6,0x73BC,/* 0xA8-0xAF */ + 0x73E7,0x73E3,0x73E9,0x73DC,0x73D2,0x73DB,0x73D4,0x73DD,/* 0xB0-0xB7 */ + 0x73DA,0x73D7,0x73D8,0x73E8,0x74DE,0x74DF,0x74F4,0x74F5,/* 0xB8-0xBF */ + 0x7521,0x755B,0x755F,0x75B0,0x75C1,0x75BB,0x75C4,0x75C0,/* 0xC0-0xC7 */ + 0x75BF,0x75B6,0x75BA,0x768A,0x76C9,0x771D,0x771B,0x7710,/* 0xC8-0xCF */ + 0x7713,0x7712,0x7723,0x7711,0x7715,0x7719,0x771A,0x7722,/* 0xD0-0xD7 */ + 0x7727,0x7823,0x782C,0x7822,0x7835,0x782F,0x7828,0x782E,/* 0xD8-0xDF */ + 0x782B,0x7821,0x7829,0x7833,0x782A,0x7831,0x7954,0x795B,/* 0xE0-0xE7 */ + 0x794F,0x795C,0x7953,0x7952,0x7951,0x79EB,0x79EC,0x79E0,/* 0xE8-0xEF */ + 0x79EE,0x79ED,0x79EA,0x79DC,0x79DE,0x79DD,0x7A86,0x7A89,/* 0xF0-0xF7 */ + 0x7A85,0x7A8B,0x7A8C,0x7A8A,0x7A87,0x7AD8,0x7B10,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7B04,0x7B13,0x7B05,0x7B0F,0x7B08,0x7B0A,0x7B0E,0x7B09,/* 0x40-0x47 */ + 0x7B12,0x7C84,0x7C91,0x7C8A,0x7C8C,0x7C88,0x7C8D,0x7C85,/* 0x48-0x4F */ + 0x7D1E,0x7D1D,0x7D11,0x7D0E,0x7D18,0x7D16,0x7D13,0x7D1F,/* 0x50-0x57 */ + 0x7D12,0x7D0F,0x7D0C,0x7F5C,0x7F61,0x7F5E,0x7F60,0x7F5D,/* 0x58-0x5F */ + 0x7F5B,0x7F96,0x7F92,0x7FC3,0x7FC2,0x7FC0,0x8016,0x803E,/* 0x60-0x67 */ + 0x8039,0x80FA,0x80F2,0x80F9,0x80F5,0x8101,0x80FB,0x8100,/* 0x68-0x6F */ + 0x8201,0x822F,0x8225,0x8333,0x832D,0x8344,0x8319,0x8351,/* 0x70-0x77 */ + 0x8325,0x8356,0x833F,0x8341,0x8326,0x831C,0x8322,0x0000,/* 0x78-0x7F */ + + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8342,0x834E,0x831B,0x832A,0x8308,0x833C,0x834D,/* 0xA0-0xA7 */ + 0x8316,0x8324,0x8320,0x8337,0x832F,0x8329,0x8347,0x8345,/* 0xA8-0xAF */ + 0x834C,0x8353,0x831E,0x832C,0x834B,0x8327,0x8348,0x8653,/* 0xB0-0xB7 */ + 0x8652,0x86A2,0x86A8,0x8696,0x868D,0x8691,0x869E,0x8687,/* 0xB8-0xBF */ + 0x8697,0x8686,0x868B,0x869A,0x8685,0x86A5,0x8699,0x86A1,/* 0xC0-0xC7 */ + 0x86A7,0x8695,0x8698,0x868E,0x869D,0x8690,0x8694,0x8843,/* 0xC8-0xCF */ + 0x8844,0x886D,0x8875,0x8876,0x8872,0x8880,0x8871,0x887F,/* 0xD0-0xD7 */ + 0x886F,0x8883,0x887E,0x8874,0x887C,0x8A12,0x8C47,0x8C57,/* 0xD8-0xDF */ + 0x8C7B,0x8CA4,0x8CA3,0x8D76,0x8D78,0x8DB5,0x8DB7,0x8DB6,/* 0xE0-0xE7 */ + 0x8ED1,0x8ED3,0x8FFE,0x8FF5,0x9002,0x8FFF,0x8FFB,0x9004,/* 0xE8-0xEF */ + 0x8FFC,0x8FF6,0x90D6,0x90E0,0x90D9,0x90DA,0x90E3,0x90DF,/* 0xF0-0xF7 */ + 0x90E5,0x90D8,0x90DB,0x90D7,0x90DC,0x90E4,0x9150,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x914E,0x914F,0x91D5,0x91E2,0x91DA,0x965C,0x965F,0x96BC,/* 0x40-0x47 */ + 0x98E3,0x9ADF,0x9B2F,0x4E7F,0x5070,0x506A,0x5061,0x505E,/* 0x48-0x4F */ + 0x5060,0x5053,0x504B,0x505D,0x5072,0x5048,0x504D,0x5041,/* 0x50-0x57 */ + 0x505B,0x504A,0x5062,0x5015,0x5045,0x505F,0x5069,0x506B,/* 0x58-0x5F */ + 0x5063,0x5064,0x5046,0x5040,0x506E,0x5073,0x5057,0x5051,/* 0x60-0x67 */ + 0x51D0,0x526B,0x526D,0x526C,0x526E,0x52D6,0x52D3,0x532D,/* 0x68-0x6F */ + 0x539C,0x5575,0x5576,0x553C,0x554D,0x5550,0x5534,0x552A,/* 0x70-0x77 */ + 0x5551,0x5562,0x5536,0x5535,0x5530,0x5552,0x5545,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x550C,0x5532,0x5565,0x554E,0x5539,0x5548,0x552D,/* 0xA0-0xA7 */ + 0x553B,0x5540,0x554B,0x570A,0x5707,0x57FB,0x5814,0x57E2,/* 0xA8-0xAF */ + 0x57F6,0x57DC,0x57F4,0x5800,0x57ED,0x57FD,0x5808,0x57F8,/* 0xB0-0xB7 */ + 0x580B,0x57F3,0x57CF,0x5807,0x57EE,0x57E3,0x57F2,0x57E5,/* 0xB8-0xBF */ + 0x57EC,0x57E1,0x580E,0x57FC,0x5810,0x57E7,0x5801,0x580C,/* 0xC0-0xC7 */ + 0x57F1,0x57E9,0x57F0,0x580D,0x5804,0x595C,0x5A60,0x5A58,/* 0xC8-0xCF */ + 0x5A55,0x5A67,0x5A5E,0x5A38,0x5A35,0x5A6D,0x5A50,0x5A5F,/* 0xD0-0xD7 */ + 0x5A65,0x5A6C,0x5A53,0x5A64,0x5A57,0x5A43,0x5A5D,0x5A52,/* 0xD8-0xDF */ + 0x5A44,0x5A5B,0x5A48,0x5A8E,0x5A3E,0x5A4D,0x5A39,0x5A4C,/* 0xE0-0xE7 */ + 0x5A70,0x5A69,0x5A47,0x5A51,0x5A56,0x5A42,0x5A5C,0x5B72,/* 0xE8-0xEF */ + 0x5B6E,0x5BC1,0x5BC0,0x5C59,0x5D1E,0x5D0B,0x5D1D,0x5D1A,/* 0xF0-0xF7 */ + 
0x5D20,0x5D0C,0x5D28,0x5D0D,0x5D26,0x5D25,0x5D0F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5D30,0x5D12,0x5D23,0x5D1F,0x5D2E,0x5E3E,0x5E34,0x5EB1,/* 0x40-0x47 */ + 0x5EB4,0x5EB9,0x5EB2,0x5EB3,0x5F36,0x5F38,0x5F9B,0x5F96,/* 0x48-0x4F */ + 0x5F9F,0x608A,0x6090,0x6086,0x60BE,0x60B0,0x60BA,0x60D3,/* 0x50-0x57 */ + 0x60D4,0x60CF,0x60E4,0x60D9,0x60DD,0x60C8,0x60B1,0x60DB,/* 0x58-0x5F */ + 0x60B7,0x60CA,0x60BF,0x60C3,0x60CD,0x60C0,0x6332,0x6365,/* 0x60-0x67 */ + 0x638A,0x6382,0x637D,0x63BD,0x639E,0x63AD,0x639D,0x6397,/* 0x68-0x6F */ + 0x63AB,0x638E,0x636F,0x6387,0x6390,0x636E,0x63AF,0x6375,/* 0x70-0x77 */ + 0x639C,0x636D,0x63AE,0x637C,0x63A4,0x633B,0x639F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6378,0x6385,0x6381,0x6391,0x638D,0x6370,0x6553,/* 0xA0-0xA7 */ + 0x65CD,0x6665,0x6661,0x665B,0x6659,0x665C,0x6662,0x6718,/* 0xA8-0xAF */ + 0x6879,0x6887,0x6890,0x689C,0x686D,0x686E,0x68AE,0x68AB,/* 0xB0-0xB7 */ + 0x6956,0x686F,0x68A3,0x68AC,0x68A9,0x6875,0x6874,0x68B2,/* 0xB8-0xBF */ + 0x688F,0x6877,0x6892,0x687C,0x686B,0x6872,0x68AA,0x6880,/* 0xC0-0xC7 */ + 0x6871,0x687E,0x689B,0x6896,0x688B,0x68A0,0x6889,0x68A4,/* 0xC8-0xCF */ + 0x6878,0x687B,0x6891,0x688C,0x688A,0x687D,0x6B36,0x6B33,/* 0xD0-0xD7 */ + 0x6B37,0x6B38,0x6B91,0x6B8F,0x6B8D,0x6B8E,0x6B8C,0x6C2A,/* 0xD8-0xDF */ + 0x6DC0,0x6DAB,0x6DB4,0x6DB3,0x6E74,0x6DAC,0x6DE9,0x6DE2,/* 0xE0-0xE7 */ + 0x6DB7,0x6DF6,0x6DD4,0x6E00,0x6DC8,0x6DE0,0x6DDF,0x6DD6,/* 0xE8-0xEF */ + 0x6DBE,0x6DE5,0x6DDC,0x6DDD,0x6DDB,0x6DF4,0x6DCA,0x6DBD,/* 0xF0-0xF7 */ + 0x6DED,0x6DF0,0x6DBA,0x6DD5,0x6DC2,0x6DCF,0x6DC9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6DD0,0x6DF2,0x6DD3,0x6DFD,0x6DD7,0x6DCD,0x6DE3,0x6DBB,/* 0x40-0x47 */ + 0x70FA,0x710D,0x70F7,0x7117,0x70F4,0x710C,0x70F0,0x7104,/* 0x48-0x4F */ + 0x70F3,0x7110,0x70FC,0x70FF,0x7106,0x7113,0x7100,0x70F8,/* 0x50-0x57 */ + 0x70F6,0x710B,0x7102,0x710E,0x727E,0x727B,0x727C,0x727F,/* 0x58-0x5F */ + 0x731D,0x7317,0x7307,0x7311,0x7318,0x730A,0x7308,0x72FF,/* 0x60-0x67 */ + 0x730F,0x731E,0x7388,0x73F6,0x73F8,0x73F5,0x7404,0x7401,/* 
0x68-0x6F */ + 0x73FD,0x7407,0x7400,0x73FA,0x73FC,0x73FF,0x740C,0x740B,/* 0x70-0x77 */ + 0x73F4,0x7408,0x7564,0x7563,0x75CE,0x75D2,0x75CF,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x75CB,0x75CC,0x75D1,0x75D0,0x768F,0x7689,0x76D3,/* 0xA0-0xA7 */ + 0x7739,0x772F,0x772D,0x7731,0x7732,0x7734,0x7733,0x773D,/* 0xA8-0xAF */ + 0x7725,0x773B,0x7735,0x7848,0x7852,0x7849,0x784D,0x784A,/* 0xB0-0xB7 */ + 0x784C,0x7826,0x7845,0x7850,0x7964,0x7967,0x7969,0x796A,/* 0xB8-0xBF */ + 0x7963,0x796B,0x7961,0x79BB,0x79FA,0x79F8,0x79F6,0x79F7,/* 0xC0-0xC7 */ + 0x7A8F,0x7A94,0x7A90,0x7B35,0x7B47,0x7B34,0x7B25,0x7B30,/* 0xC8-0xCF */ + 0x7B22,0x7B24,0x7B33,0x7B18,0x7B2A,0x7B1D,0x7B31,0x7B2B,/* 0xD0-0xD7 */ + 0x7B2D,0x7B2F,0x7B32,0x7B38,0x7B1A,0x7B23,0x7C94,0x7C98,/* 0xD8-0xDF */ + 0x7C96,0x7CA3,0x7D35,0x7D3D,0x7D38,0x7D36,0x7D3A,0x7D45,/* 0xE0-0xE7 */ + 0x7D2C,0x7D29,0x7D41,0x7D47,0x7D3E,0x7D3F,0x7D4A,0x7D3B,/* 0xE8-0xEF */ + 0x7D28,0x7F63,0x7F95,0x7F9C,0x7F9D,0x7F9B,0x7FCA,0x7FCB,/* 0xF0-0xF7 */ + 0x7FCD,0x7FD0,0x7FD1,0x7FC7,0x7FCF,0x7FC9,0x801F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x801E,0x801B,0x8047,0x8043,0x8048,0x8118,0x8125,0x8119,/* 0x40-0x47 */ + 0x811B,0x812D,0x811F,0x812C,0x811E,0x8121,0x8115,0x8127,/* 0x48-0x4F */ + 0x811D,0x8122,0x8211,0x8238,0x8233,0x823A,0x8234,0x8232,/* 0x50-0x57 */ + 0x8274,0x8390,0x83A3,0x83A8,0x838D,0x837A,0x8373,0x83A4,/* 0x58-0x5F */ + 0x8374,0x838F,0x8381,0x8395,0x8399,0x8375,0x8394,0x83A9,/* 0x60-0x67 */ + 0x837D,0x8383,0x838C,0x839D,0x839B,0x83AA,0x838B,0x837E,/* 0x68-0x6F */ + 0x83A5,0x83AF,0x8388,0x8397,0x83B0,0x837F,0x83A6,0x8387,/* 0x70-0x77 */ + 0x83AE,0x8376,0x839A,0x8659,0x8656,0x86BF,0x86B7,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x86C2,0x86C1,0x86C5,0x86BA,0x86B0,0x86C8,0x86B9,/* 0xA0-0xA7 */ + 0x86B3,0x86B8,0x86CC,0x86B4,0x86BB,0x86BC,0x86C3,0x86BD,/* 0xA8-0xAF */ + 0x86BE,0x8852,0x8889,0x8895,0x88A8,0x88A2,0x88AA,0x889A,/* 0xB0-0xB7 */ + 0x8891,0x88A1,0x889F,0x8898,0x88A7,0x8899,0x889B,0x8897,/* 0xB8-0xBF */ + 0x88A4,0x88AC,0x888C,0x8893,0x888E,0x8982,0x89D6,0x89D9,/* 0xC0-0xC7 */ + 0x89D5,0x8A30,0x8A27,0x8A2C,0x8A1E,0x8C39,0x8C3B,0x8C5C,/* 0xC8-0xCF */ + 0x8C5D,0x8C7D,0x8CA5,0x8D7D,0x8D7B,0x8D79,0x8DBC,0x8DC2,/* 0xD0-0xD7 */ + 0x8DB9,0x8DBF,0x8DC1,0x8ED8,0x8EDE,0x8EDD,0x8EDC,0x8ED7,/* 0xD8-0xDF */ + 0x8EE0,0x8EE1,0x9024,0x900B,0x9011,0x901C,0x900C,0x9021,/* 0xE0-0xE7 */ + 
0x90EF,0x90EA,0x90F0,0x90F4,0x90F2,0x90F3,0x90D4,0x90EB,/* 0xE8-0xEF */ + 0x90EC,0x90E9,0x9156,0x9158,0x915A,0x9153,0x9155,0x91EC,/* 0xF0-0xF7 */ + 0x91F4,0x91F1,0x91F3,0x91F8,0x91E4,0x91F9,0x91EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x91EB,0x91F7,0x91E8,0x91EE,0x957A,0x9586,0x9588,0x967C,/* 0x40-0x47 */ + 0x966D,0x966B,0x9671,0x966F,0x96BF,0x976A,0x9804,0x98E5,/* 0x48-0x4F */ + 0x9997,0x509B,0x5095,0x5094,0x509E,0x508B,0x50A3,0x5083,/* 0x50-0x57 */ + 0x508C,0x508E,0x509D,0x5068,0x509C,0x5092,0x5082,0x5087,/* 0x58-0x5F */ + 0x515F,0x51D4,0x5312,0x5311,0x53A4,0x53A7,0x5591,0x55A8,/* 0x60-0x67 */ + 0x55A5,0x55AD,0x5577,0x5645,0x55A2,0x5593,0x5588,0x558F,/* 0x68-0x6F */ + 0x55B5,0x5581,0x55A3,0x5592,0x55A4,0x557D,0x558C,0x55A6,/* 0x70-0x77 */ + 0x557F,0x5595,0x55A1,0x558E,0x570C,0x5829,0x5837,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5819,0x581E,0x5827,0x5823,0x5828,0x57F5,0x5848,/* 0xA0-0xA7 */ + 0x5825,0x581C,0x581B,0x5833,0x583F,0x5836,0x582E,0x5839,/* 0xA8-0xAF */ + 0x5838,0x582D,0x582C,0x583B,0x5961,0x5AAF,0x5A94,0x5A9F,/* 0xB0-0xB7 */ + 0x5A7A,0x5AA2,0x5A9E,0x5A78,0x5AA6,0x5A7C,0x5AA5,0x5AAC,/* 0xB8-0xBF */ + 0x5A95,0x5AAE,0x5A37,0x5A84,0x5A8A,0x5A97,0x5A83,0x5A8B,/* 0xC0-0xC7 */ + 0x5AA9,0x5A7B,0x5A7D,0x5A8C,0x5A9C,0x5A8F,0x5A93,0x5A9D,/* 0xC8-0xCF */ + 0x5BEA,0x5BCD,0x5BCB,0x5BD4,0x5BD1,0x5BCA,0x5BCE,0x5C0C,/* 0xD0-0xD7 */ + 0x5C30,0x5D37,0x5D43,0x5D6B,0x5D41,0x5D4B,0x5D3F,0x5D35,/* 0xD8-0xDF */ + 0x5D51,0x5D4E,0x5D55,0x5D33,0x5D3A,0x5D52,0x5D3D,0x5D31,/* 0xE0-0xE7 */ + 0x5D59,0x5D42,0x5D39,0x5D49,0x5D38,0x5D3C,0x5D32,0x5D36,/* 0xE8-0xEF */ + 0x5D40,0x5D45,0x5E44,0x5E41,0x5F58,0x5FA6,0x5FA5,0x5FAB,/* 0xF0-0xF7 */ + 0x60C9,0x60B9,0x60CC,0x60E2,0x60CE,0x60C4,0x6114,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x60F2,0x610A,0x6116,0x6105,0x60F5,0x6113,0x60F8,0x60FC,/* 0x40-0x47 */ + 0x60FE,0x60C1,0x6103,0x6118,0x611D,0x6110,0x60FF,0x6104,/* 0x48-0x4F */ + 0x610B,0x624A,0x6394,0x63B1,0x63B0,0x63CE,0x63E5,0x63E8,/* 0x50-0x57 */ + 0x63EF,0x63C3,0x649D,0x63F3,0x63CA,0x63E0,0x63F6,0x63D5,/* 
0x58-0x5F */ + 0x63F2,0x63F5,0x6461,0x63DF,0x63BE,0x63DD,0x63DC,0x63C4,/* 0x60-0x67 */ + 0x63D8,0x63D3,0x63C2,0x63C7,0x63CC,0x63CB,0x63C8,0x63F0,/* 0x68-0x6F */ + 0x63D7,0x63D9,0x6532,0x6567,0x656A,0x6564,0x655C,0x6568,/* 0x70-0x77 */ + 0x6565,0x658C,0x659D,0x659E,0x65AE,0x65D0,0x65D2,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x667C,0x666C,0x667B,0x6680,0x6671,0x6679,0x666A,/* 0xA0-0xA7 */ + 0x6672,0x6701,0x690C,0x68D3,0x6904,0x68DC,0x692A,0x68EC,/* 0xA8-0xAF */ + 0x68EA,0x68F1,0x690F,0x68D6,0x68F7,0x68EB,0x68E4,0x68F6,/* 0xB0-0xB7 */ + 0x6913,0x6910,0x68F3,0x68E1,0x6907,0x68CC,0x6908,0x6970,/* 0xB8-0xBF */ + 0x68B4,0x6911,0x68EF,0x68C6,0x6914,0x68F8,0x68D0,0x68FD,/* 0xC0-0xC7 */ + 0x68FC,0x68E8,0x690B,0x690A,0x6917,0x68CE,0x68C8,0x68DD,/* 0xC8-0xCF */ + 0x68DE,0x68E6,0x68F4,0x68D1,0x6906,0x68D4,0x68E9,0x6915,/* 0xD0-0xD7 */ + 0x6925,0x68C7,0x6B39,0x6B3B,0x6B3F,0x6B3C,0x6B94,0x6B97,/* 0xD8-0xDF */ + 0x6B99,0x6B95,0x6BBD,0x6BF0,0x6BF2,0x6BF3,0x6C30,0x6DFC,/* 0xE0-0xE7 */ + 0x6E46,0x6E47,0x6E1F,0x6E49,0x6E88,0x6E3C,0x6E3D,0x6E45,/* 0xE8-0xEF */ + 0x6E62,0x6E2B,0x6E3F,0x6E41,0x6E5D,0x6E73,0x6E1C,0x6E33,/* 0xF0-0xF7 */ + 0x6E4B,0x6E40,0x6E51,0x6E3B,0x6E03,0x6E2E,0x6E5E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6E68,0x6E5C,0x6E61,0x6E31,0x6E28,0x6E60,0x6E71,0x6E6B,/* 0x40-0x47 */ + 0x6E39,0x6E22,0x6E30,0x6E53,0x6E65,0x6E27,0x6E78,0x6E64,/* 0x48-0x4F */ + 0x6E77,0x6E55,0x6E79,0x6E52,0x6E66,0x6E35,0x6E36,0x6E5A,/* 0x50-0x57 */ + 0x7120,0x711E,0x712F,0x70FB,0x712E,0x7131,0x7123,0x7125,/* 0x58-0x5F */ + 0x7122,0x7132,0x711F,0x7128,0x713A,0x711B,0x724B,0x725A,/* 0x60-0x67 */ + 0x7288,0x7289,0x7286,0x7285,0x728B,0x7312,0x730B,0x7330,/* 0x68-0x6F */ + 0x7322,0x7331,0x7333,0x7327,0x7332,0x732D,0x7326,0x7323,/* 0x70-0x77 */ + 0x7335,0x730C,0x742E,0x742C,0x7430,0x742B,0x7416,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x741A,0x7421,0x742D,0x7431,0x7424,0x7423,0x741D,/* 0xA0-0xA7 */ + 0x7429,0x7420,0x7432,0x74FB,0x752F,0x756F,0x756C,0x75E7,/* 0xA8-0xAF */ + 0x75DA,0x75E1,0x75E6,0x75DD,0x75DF,0x75E4,0x75D7,0x7695,/* 0xB0-0xB7 */ + 0x7692,0x76DA,0x7746,0x7747,0x7744,0x774D,0x7745,0x774A,/* 0xB8-0xBF */ + 0x774E,0x774B,0x774C,0x77DE,0x77EC,0x7860,0x7864,0x7865,/* 0xC0-0xC7 */ + 0x785C,0x786D,0x7871,0x786A,0x786E,0x7870,0x7869,0x7868,/* 0xC8-0xCF */ + 0x785E,0x7862,0x7974,0x7973,0x7972,0x7970,0x7A02,0x7A0A,/* 0xD0-0xD7 */ + 
0x7A03,0x7A0C,0x7A04,0x7A99,0x7AE6,0x7AE4,0x7B4A,0x7B3B,/* 0xD8-0xDF */ + 0x7B44,0x7B48,0x7B4C,0x7B4E,0x7B40,0x7B58,0x7B45,0x7CA2,/* 0xE0-0xE7 */ + 0x7C9E,0x7CA8,0x7CA1,0x7D58,0x7D6F,0x7D63,0x7D53,0x7D56,/* 0xE8-0xEF */ + 0x7D67,0x7D6A,0x7D4F,0x7D6D,0x7D5C,0x7D6B,0x7D52,0x7D54,/* 0xF0-0xF7 */ + 0x7D69,0x7D51,0x7D5F,0x7D4E,0x7F3E,0x7F3F,0x7F65,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F66,0x7FA2,0x7FA0,0x7FA1,0x7FD7,0x8051,0x804F,0x8050,/* 0x40-0x47 */ + 0x80FE,0x80D4,0x8143,0x814A,0x8152,0x814F,0x8147,0x813D,/* 0x48-0x4F */ + 0x814D,0x813A,0x81E6,0x81EE,0x81F7,0x81F8,0x81F9,0x8204,/* 0x50-0x57 */ + 0x823C,0x823D,0x823F,0x8275,0x833B,0x83CF,0x83F9,0x8423,/* 0x58-0x5F */ + 0x83C0,0x83E8,0x8412,0x83E7,0x83E4,0x83FC,0x83F6,0x8410,/* 0x60-0x67 */ + 0x83C6,0x83C8,0x83EB,0x83E3,0x83BF,0x8401,0x83DD,0x83E5,/* 0x68-0x6F */ + 0x83D8,0x83FF,0x83E1,0x83CB,0x83CE,0x83D6,0x83F5,0x83C9,/* 0x70-0x77 */ + 0x8409,0x840F,0x83DE,0x8411,0x8406,0x83C2,0x83F3,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x83D5,0x83FA,0x83C7,0x83D1,0x83EA,0x8413,0x83C3,/* 0xA0-0xA7 */ + 0x83EC,0x83EE,0x83C4,0x83FB,0x83D7,0x83E2,0x841B,0x83DB,/* 0xA8-0xAF */ + 0x83FE,0x86D8,0x86E2,0x86E6,0x86D3,0x86E3,0x86DA,0x86EA,/* 0xB0-0xB7 */ + 0x86DD,0x86EB,0x86DC,0x86EC,0x86E9,0x86D7,0x86E8,0x86D1,/* 0xB8-0xBF */ + 0x8848,0x8856,0x8855,0x88BA,0x88D7,0x88B9,0x88B8,0x88C0,/* 0xC0-0xC7 */ + 0x88BE,0x88B6,0x88BC,0x88B7,0x88BD,0x88B2,0x8901,0x88C9,/* 0xC8-0xCF */ + 0x8995,0x8998,0x8997,0x89DD,0x89DA,0x89DB,0x8A4E,0x8A4D,/* 0xD0-0xD7 */ + 0x8A39,0x8A59,0x8A40,0x8A57,0x8A58,0x8A44,0x8A45,0x8A52,/* 0xD8-0xDF */ + 0x8A48,0x8A51,0x8A4A,0x8A4C,0x8A4F,0x8C5F,0x8C81,0x8C80,/* 0xE0-0xE7 */ + 0x8CBA,0x8CBE,0x8CB0,0x8CB9,0x8CB5,0x8D84,0x8D80,0x8D89,/* 0xE8-0xEF */ + 0x8DD8,0x8DD3,0x8DCD,0x8DC7,0x8DD6,0x8DDC,0x8DCF,0x8DD5,/* 0xF0-0xF7 */ + 0x8DD9,0x8DC8,0x8DD7,0x8DC5,0x8EEF,0x8EF7,0x8EFA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8EF9,0x8EE6,0x8EEE,0x8EE5,0x8EF5,0x8EE7,0x8EE8,0x8EF6,/* 0x40-0x47 */ + 0x8EEB,0x8EF1,0x8EEC,0x8EF4,0x8EE9,0x902D,0x9034,0x902F,/* 
0x48-0x4F */ + 0x9106,0x912C,0x9104,0x90FF,0x90FC,0x9108,0x90F9,0x90FB,/* 0x50-0x57 */ + 0x9101,0x9100,0x9107,0x9105,0x9103,0x9161,0x9164,0x915F,/* 0x58-0x5F */ + 0x9162,0x9160,0x9201,0x920A,0x9225,0x9203,0x921A,0x9226,/* 0x60-0x67 */ + 0x920F,0x920C,0x9200,0x9212,0x91FF,0x91FD,0x9206,0x9204,/* 0x68-0x6F */ + 0x9227,0x9202,0x921C,0x9224,0x9219,0x9217,0x9205,0x9216,/* 0x70-0x77 */ + 0x957B,0x958D,0x958C,0x9590,0x9687,0x967E,0x9688,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9689,0x9683,0x9680,0x96C2,0x96C8,0x96C3,0x96F1,/* 0xA0-0xA7 */ + 0x96F0,0x976C,0x9770,0x976E,0x9807,0x98A9,0x98EB,0x9CE6,/* 0xA8-0xAF */ + 0x9EF9,0x4E83,0x4E84,0x4EB6,0x50BD,0x50BF,0x50C6,0x50AE,/* 0xB0-0xB7 */ + 0x50C4,0x50CA,0x50B4,0x50C8,0x50C2,0x50B0,0x50C1,0x50BA,/* 0xB8-0xBF */ + 0x50B1,0x50CB,0x50C9,0x50B6,0x50B8,0x51D7,0x527A,0x5278,/* 0xC0-0xC7 */ + 0x527B,0x527C,0x55C3,0x55DB,0x55CC,0x55D0,0x55CB,0x55CA,/* 0xC8-0xCF */ + 0x55DD,0x55C0,0x55D4,0x55C4,0x55E9,0x55BF,0x55D2,0x558D,/* 0xD0-0xD7 */ + 0x55CF,0x55D5,0x55E2,0x55D6,0x55C8,0x55F2,0x55CD,0x55D9,/* 0xD8-0xDF */ + 0x55C2,0x5714,0x5853,0x5868,0x5864,0x584F,0x584D,0x5849,/* 0xE0-0xE7 */ + 0x586F,0x5855,0x584E,0x585D,0x5859,0x5865,0x585B,0x583D,/* 0xE8-0xEF */ + 0x5863,0x5871,0x58FC,0x5AC7,0x5AC4,0x5ACB,0x5ABA,0x5AB8,/* 0xF0-0xF7 */ + 0x5AB1,0x5AB5,0x5AB0,0x5ABF,0x5AC8,0x5ABB,0x5AC6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5AB7,0x5AC0,0x5ACA,0x5AB4,0x5AB6,0x5ACD,0x5AB9,0x5A90,/* 0x40-0x47 */ + 0x5BD6,0x5BD8,0x5BD9,0x5C1F,0x5C33,0x5D71,0x5D63,0x5D4A,/* 0x48-0x4F */ + 0x5D65,0x5D72,0x5D6C,0x5D5E,0x5D68,0x5D67,0x5D62,0x5DF0,/* 0x50-0x57 */ + 0x5E4F,0x5E4E,0x5E4A,0x5E4D,0x5E4B,0x5EC5,0x5ECC,0x5EC6,/* 0x58-0x5F */ + 0x5ECB,0x5EC7,0x5F40,0x5FAF,0x5FAD,0x60F7,0x6149,0x614A,/* 0x60-0x67 */ + 0x612B,0x6145,0x6136,0x6132,0x612E,0x6146,0x612F,0x614F,/* 0x68-0x6F */ + 0x6129,0x6140,0x6220,0x9168,0x6223,0x6225,0x6224,0x63C5,/* 0x70-0x77 */ + 0x63F1,0x63EB,0x6410,0x6412,0x6409,0x6420,0x6424,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6433,0x6443,0x641F,0x6415,0x6418,0x6439,0x6437,/* 0xA0-0xA7 */ + 0x6422,0x6423,0x640C,0x6426,0x6430,0x6428,0x6441,0x6435,/* 0xA8-0xAF */ + 0x642F,0x640A,0x641A,0x6440,0x6425,0x6427,0x640B,0x63E7,/* 0xB0-0xB7 */ + 0x641B,0x642E,0x6421,0x640E,0x656F,0x6592,0x65D3,0x6686,/* 0xB8-0xBF */ + 0x668C,0x6695,0x6690,0x668B,0x668A,0x6699,0x6694,0x6678,/* 0xC0-0xC7 */ + 
0x6720,0x6966,0x695F,0x6938,0x694E,0x6962,0x6971,0x693F,/* 0xC8-0xCF */ + 0x6945,0x696A,0x6939,0x6942,0x6957,0x6959,0x697A,0x6948,/* 0xD0-0xD7 */ + 0x6949,0x6935,0x696C,0x6933,0x693D,0x6965,0x68F0,0x6978,/* 0xD8-0xDF */ + 0x6934,0x6969,0x6940,0x696F,0x6944,0x6976,0x6958,0x6941,/* 0xE0-0xE7 */ + 0x6974,0x694C,0x693B,0x694B,0x6937,0x695C,0x694F,0x6951,/* 0xE8-0xEF */ + 0x6932,0x6952,0x692F,0x697B,0x693C,0x6B46,0x6B45,0x6B43,/* 0xF0-0xF7 */ + 0x6B42,0x6B48,0x6B41,0x6B9B,0xFA0D,0x6BFB,0x6BFC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BF9,0x6BF7,0x6BF8,0x6E9B,0x6ED6,0x6EC8,0x6E8F,0x6EC0,/* 0x40-0x47 */ + 0x6E9F,0x6E93,0x6E94,0x6EA0,0x6EB1,0x6EB9,0x6EC6,0x6ED2,/* 0x48-0x4F */ + 0x6EBD,0x6EC1,0x6E9E,0x6EC9,0x6EB7,0x6EB0,0x6ECD,0x6EA6,/* 0x50-0x57 */ + 0x6ECF,0x6EB2,0x6EBE,0x6EC3,0x6EDC,0x6ED8,0x6E99,0x6E92,/* 0x58-0x5F */ + 0x6E8E,0x6E8D,0x6EA4,0x6EA1,0x6EBF,0x6EB3,0x6ED0,0x6ECA,/* 0x60-0x67 */ + 0x6E97,0x6EAE,0x6EA3,0x7147,0x7154,0x7152,0x7163,0x7160,/* 0x68-0x6F */ + 0x7141,0x715D,0x7162,0x7172,0x7178,0x716A,0x7161,0x7142,/* 0x70-0x77 */ + 0x7158,0x7143,0x714B,0x7170,0x715F,0x7150,0x7153,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7144,0x714D,0x715A,0x724F,0x728D,0x728C,0x7291,/* 0xA0-0xA7 */ + 0x7290,0x728E,0x733C,0x7342,0x733B,0x733A,0x7340,0x734A,/* 0xA8-0xAF */ + 0x7349,0x7444,0x744A,0x744B,0x7452,0x7451,0x7457,0x7440,/* 0xB0-0xB7 */ + 0x744F,0x7450,0x744E,0x7442,0x7446,0x744D,0x7454,0x74E1,/* 0xB8-0xBF */ + 0x74FF,0x74FE,0x74FD,0x751D,0x7579,0x7577,0x6983,0x75EF,/* 0xC0-0xC7 */ + 0x760F,0x7603,0x75F7,0x75FE,0x75FC,0x75F9,0x75F8,0x7610,/* 0xC8-0xCF */ + 0x75FB,0x75F6,0x75ED,0x75F5,0x75FD,0x7699,0x76B5,0x76DD,/* 0xD0-0xD7 */ + 0x7755,0x775F,0x7760,0x7752,0x7756,0x775A,0x7769,0x7767,/* 0xD8-0xDF */ + 0x7754,0x7759,0x776D,0x77E0,0x7887,0x789A,0x7894,0x788F,/* 0xE0-0xE7 */ + 0x7884,0x7895,0x7885,0x7886,0x78A1,0x7883,0x7879,0x7899,/* 0xE8-0xEF */ + 0x7880,0x7896,0x787B,0x797C,0x7982,0x797D,0x7979,0x7A11,/* 0xF0-0xF7 */ + 0x7A18,0x7A19,0x7A12,0x7A17,0x7A15,0x7A22,0x7A13,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x7A1B,0x7A10,0x7AA3,0x7AA2,0x7A9E,0x7AEB,0x7B66,0x7B64,/* 0x40-0x47 */ + 0x7B6D,0x7B74,0x7B69,0x7B72,0x7B65,0x7B73,0x7B71,0x7B70,/* 0x48-0x4F */ + 0x7B61,0x7B78,0x7B76,0x7B63,0x7CB2,0x7CB4,0x7CAF,0x7D88,/* 0x50-0x57 */ + 0x7D86,0x7D80,0x7D8D,0x7D7F,0x7D85,0x7D7A,0x7D8E,0x7D7B,/* 0x58-0x5F */ + 0x7D83,0x7D7C,0x7D8C,0x7D94,0x7D84,0x7D7D,0x7D92,0x7F6D,/* 0x60-0x67 */ + 0x7F6B,0x7F67,0x7F68,0x7F6C,0x7FA6,0x7FA5,0x7FA7,0x7FDB,/* 0x68-0x6F */ + 0x7FDC,0x8021,0x8164,0x8160,0x8177,0x815C,0x8169,0x815B,/* 0x70-0x77 */ + 0x8162,0x8172,0x6721,0x815E,0x8176,0x8167,0x816F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8144,0x8161,0x821D,0x8249,0x8244,0x8240,0x8242,/* 0xA0-0xA7 */ + 0x8245,0x84F1,0x843F,0x8456,0x8476,0x8479,0x848F,0x848D,/* 0xA8-0xAF */ + 0x8465,0x8451,0x8440,0x8486,0x8467,0x8430,0x844D,0x847D,/* 0xB0-0xB7 */ + 0x845A,0x8459,0x8474,0x8473,0x845D,0x8507,0x845E,0x8437,/* 0xB8-0xBF */ + 0x843A,0x8434,0x847A,0x8443,0x8478,0x8432,0x8445,0x8429,/* 0xC0-0xC7 */ + 0x83D9,0x844B,0x842F,0x8442,0x842D,0x845F,0x8470,0x8439,/* 0xC8-0xCF */ + 0x844E,0x844C,0x8452,0x846F,0x84C5,0x848E,0x843B,0x8447,/* 0xD0-0xD7 */ + 0x8436,0x8433,0x8468,0x847E,0x8444,0x842B,0x8460,0x8454,/* 0xD8-0xDF */ + 0x846E,0x8450,0x870B,0x8704,0x86F7,0x870C,0x86FA,0x86D6,/* 0xE0-0xE7 */ + 0x86F5,0x874D,0x86F8,0x870E,0x8709,0x8701,0x86F6,0x870D,/* 0xE8-0xEF */ + 0x8705,0x88D6,0x88CB,0x88CD,0x88CE,0x88DE,0x88DB,0x88DA,/* 0xF0-0xF7 */ + 0x88CC,0x88D0,0x8985,0x899B,0x89DF,0x89E5,0x89E4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x89E1,0x89E0,0x89E2,0x89DC,0x89E6,0x8A76,0x8A86,0x8A7F,/* 0x40-0x47 */ + 0x8A61,0x8A3F,0x8A77,0x8A82,0x8A84,0x8A75,0x8A83,0x8A81,/* 0x48-0x4F */ + 0x8A74,0x8A7A,0x8C3C,0x8C4B,0x8C4A,0x8C65,0x8C64,0x8C66,/* 0x50-0x57 */ + 0x8C86,0x8C84,0x8C85,0x8CCC,0x8D68,0x8D69,0x8D91,0x8D8C,/* 0x58-0x5F */ + 0x8D8E,0x8D8F,0x8D8D,0x8D93,0x8D94,0x8D90,0x8D92,0x8DF0,/* 0x60-0x67 */ + 0x8DE0,0x8DEC,0x8DF1,0x8DEE,0x8DD0,0x8DE9,0x8DE3,0x8DE2,/* 0x68-0x6F */ + 0x8DE7,0x8DF2,0x8DEB,0x8DF4,0x8F06,0x8EFF,0x8F01,0x8F00,/* 0x70-0x77 */ + 0x8F05,0x8F07,0x8F08,0x8F02,0x8F0B,0x9052,0x903F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9044,0x9049,0x903D,0x9110,0x910D,0x910F,0x9111,/* 0xA0-0xA7 */ + 0x9116,0x9114,0x910B,0x910E,0x916E,0x916F,0x9248,0x9252,/* 0xA8-0xAF */ + 0x9230,0x923A,0x9266,0x9233,0x9265,0x925E,0x9283,0x922E,/* 0xB0-0xB7 */ + 
0x924A,0x9246,0x926D,0x926C,0x924F,0x9260,0x9267,0x926F,/* 0xB8-0xBF */ + 0x9236,0x9261,0x9270,0x9231,0x9254,0x9263,0x9250,0x9272,/* 0xC0-0xC7 */ + 0x924E,0x9253,0x924C,0x9256,0x9232,0x959F,0x959C,0x959E,/* 0xC8-0xCF */ + 0x959B,0x9692,0x9693,0x9691,0x9697,0x96CE,0x96FA,0x96FD,/* 0xD0-0xD7 */ + 0x96F8,0x96F5,0x9773,0x9777,0x9778,0x9772,0x980F,0x980D,/* 0xD8-0xDF */ + 0x980E,0x98AC,0x98F6,0x98F9,0x99AF,0x99B2,0x99B0,0x99B5,/* 0xE0-0xE7 */ + 0x9AAD,0x9AAB,0x9B5B,0x9CEA,0x9CED,0x9CE7,0x9E80,0x9EFD,/* 0xE8-0xEF */ + 0x50E6,0x50D4,0x50D7,0x50E8,0x50F3,0x50DB,0x50EA,0x50DD,/* 0xF0-0xF7 */ + 0x50E4,0x50D3,0x50EC,0x50F0,0x50EF,0x50E3,0x50E0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x51D8,0x5280,0x5281,0x52E9,0x52EB,0x5330,0x53AC,0x5627,/* 0x40-0x47 */ + 0x5615,0x560C,0x5612,0x55FC,0x560F,0x561C,0x5601,0x5613,/* 0x48-0x4F */ + 0x5602,0x55FA,0x561D,0x5604,0x55FF,0x55F9,0x5889,0x587C,/* 0x50-0x57 */ + 0x5890,0x5898,0x5886,0x5881,0x587F,0x5874,0x588B,0x587A,/* 0x58-0x5F */ + 0x5887,0x5891,0x588E,0x5876,0x5882,0x5888,0x587B,0x5894,/* 0x60-0x67 */ + 0x588F,0x58FE,0x596B,0x5ADC,0x5AEE,0x5AE5,0x5AD5,0x5AEA,/* 0x68-0x6F */ + 0x5ADA,0x5AED,0x5AEB,0x5AF3,0x5AE2,0x5AE0,0x5ADB,0x5AEC,/* 0x70-0x77 */ + 0x5ADE,0x5ADD,0x5AD9,0x5AE8,0x5ADF,0x5B77,0x5BE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5BE3,0x5C63,0x5D82,0x5D80,0x5D7D,0x5D86,0x5D7A,/* 0xA0-0xA7 */ + 0x5D81,0x5D77,0x5D8A,0x5D89,0x5D88,0x5D7E,0x5D7C,0x5D8D,/* 0xA8-0xAF */ + 0x5D79,0x5D7F,0x5E58,0x5E59,0x5E53,0x5ED8,0x5ED1,0x5ED7,/* 0xB0-0xB7 */ + 0x5ECE,0x5EDC,0x5ED5,0x5ED9,0x5ED2,0x5ED4,0x5F44,0x5F43,/* 0xB8-0xBF */ + 0x5F6F,0x5FB6,0x612C,0x6128,0x6141,0x615E,0x6171,0x6173,/* 0xC0-0xC7 */ + 0x6152,0x6153,0x6172,0x616C,0x6180,0x6174,0x6154,0x617A,/* 0xC8-0xCF */ + 0x615B,0x6165,0x613B,0x616A,0x6161,0x6156,0x6229,0x6227,/* 0xD0-0xD7 */ + 0x622B,0x642B,0x644D,0x645B,0x645D,0x6474,0x6476,0x6472,/* 0xD8-0xDF */ + 0x6473,0x647D,0x6475,0x6466,0x64A6,0x644E,0x6482,0x645E,/* 0xE0-0xE7 */ + 0x645C,0x644B,0x6453,0x6460,0x6450,0x647F,0x643F,0x646C,/* 0xE8-0xEF */ + 0x646B,0x6459,0x6465,0x6477,0x6573,0x65A0,0x66A1,0x66A0,/* 0xF0-0xF7 */ + 0x669F,0x6705,0x6704,0x6722,0x69B1,0x69B6,0x69C9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x69A0,0x69CE,0x6996,0x69B0,0x69AC,0x69BC,0x6991,0x6999,/* 0x40-0x47 */ + 0x698E,0x69A7,0x698D,0x69A9,0x69BE,0x69AF,0x69BF,0x69C4,/* 0x48-0x4F */ + 0x69BD,0x69A4,0x69D4,0x69B9,0x69CA,0x699A,0x69CF,0x69B3,/* 0x50-0x57 */ + 0x6993,0x69AA,0x69A1,0x699E,0x69D9,0x6997,0x6990,0x69C2,/* 0x58-0x5F */ + 0x69B5,0x69A5,0x69C6,0x6B4A,0x6B4D,0x6B4B,0x6B9E,0x6B9F,/* 0x60-0x67 */ + 0x6BA0,0x6BC3,0x6BC4,0x6BFE,0x6ECE,0x6EF5,0x6EF1,0x6F03,/* 0x68-0x6F */ + 0x6F25,0x6EF8,0x6F37,0x6EFB,0x6F2E,0x6F09,0x6F4E,0x6F19,/* 0x70-0x77 */ + 0x6F1A,0x6F27,0x6F18,0x6F3B,0x6F12,0x6EED,0x6F0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6F36,0x6F73,0x6EF9,0x6EEE,0x6F2D,0x6F40,0x6F30,/* 0xA0-0xA7 */ + 0x6F3C,0x6F35,0x6EEB,0x6F07,0x6F0E,0x6F43,0x6F05,0x6EFD,/* 0xA8-0xAF */ + 0x6EF6,0x6F39,0x6F1C,0x6EFC,0x6F3A,0x6F1F,0x6F0D,0x6F1E,/* 0xB0-0xB7 */ + 0x6F08,0x6F21,0x7187,0x7190,0x7189,0x7180,0x7185,0x7182,/* 0xB8-0xBF */ + 0x718F,0x717B,0x7186,0x7181,0x7197,0x7244,0x7253,0x7297,/* 0xC0-0xC7 */ + 0x7295,0x7293,0x7343,0x734D,0x7351,0x734C,0x7462,0x7473,/* 0xC8-0xCF */ + 0x7471,0x7475,0x7472,0x7467,0x746E,0x7500,0x7502,0x7503,/* 0xD0-0xD7 */ + 0x757D,0x7590,0x7616,0x7608,0x760C,0x7615,0x7611,0x760A,/* 0xD8-0xDF */ + 0x7614,0x76B8,0x7781,0x777C,0x7785,0x7782,0x776E,0x7780,/* 0xE0-0xE7 */ + 0x776F,0x777E,0x7783,0x78B2,0x78AA,0x78B4,0x78AD,0x78A8,/* 0xE8-0xEF */ + 0x787E,0x78AB,0x789E,0x78A5,0x78A0,0x78AC,0x78A2,0x78A4,/* 0xF0-0xF7 */ + 0x7998,0x798A,0x798B,0x7996,0x7995,0x7994,0x7993,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7997,0x7988,0x7992,0x7990,0x7A2B,0x7A4A,0x7A30,0x7A2F,/* 0x40-0x47 */ + 0x7A28,0x7A26,0x7AA8,0x7AAB,0x7AAC,0x7AEE,0x7B88,0x7B9C,/* 0x48-0x4F */ + 0x7B8A,0x7B91,0x7B90,0x7B96,0x7B8D,0x7B8C,0x7B9B,0x7B8E,/* 0x50-0x57 */ + 0x7B85,0x7B98,0x5284,0x7B99,0x7BA4,0x7B82,0x7CBB,0x7CBF,/* 0x58-0x5F */ + 0x7CBC,0x7CBA,0x7DA7,0x7DB7,0x7DC2,0x7DA3,0x7DAA,0x7DC1,/* 0x60-0x67 */ + 0x7DC0,0x7DC5,0x7D9D,0x7DCE,0x7DC4,0x7DC6,0x7DCB,0x7DCC,/* 0x68-0x6F */ + 0x7DAF,0x7DB9,0x7D96,0x7DBC,0x7D9F,0x7DA6,0x7DAE,0x7DA9,/* 0x70-0x77 */ + 0x7DA1,0x7DC9,0x7F73,0x7FE2,0x7FE3,0x7FE5,0x7FDE,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8024,0x805D,0x805C,0x8189,0x8186,0x8183,0x8187,/* 0xA0-0xA7 */ + 
0x818D,0x818C,0x818B,0x8215,0x8497,0x84A4,0x84A1,0x849F,/* 0xA8-0xAF */ + 0x84BA,0x84CE,0x84C2,0x84AC,0x84AE,0x84AB,0x84B9,0x84B4,/* 0xB0-0xB7 */ + 0x84C1,0x84CD,0x84AA,0x849A,0x84B1,0x84D0,0x849D,0x84A7,/* 0xB8-0xBF */ + 0x84BB,0x84A2,0x8494,0x84C7,0x84CC,0x849B,0x84A9,0x84AF,/* 0xC0-0xC7 */ + 0x84A8,0x84D6,0x8498,0x84B6,0x84CF,0x84A0,0x84D7,0x84D4,/* 0xC8-0xCF */ + 0x84D2,0x84DB,0x84B0,0x8491,0x8661,0x8733,0x8723,0x8728,/* 0xD0-0xD7 */ + 0x876B,0x8740,0x872E,0x871E,0x8721,0x8719,0x871B,0x8743,/* 0xD8-0xDF */ + 0x872C,0x8741,0x873E,0x8746,0x8720,0x8732,0x872A,0x872D,/* 0xE0-0xE7 */ + 0x873C,0x8712,0x873A,0x8731,0x8735,0x8742,0x8726,0x8727,/* 0xE8-0xEF */ + 0x8738,0x8724,0x871A,0x8730,0x8711,0x88F7,0x88E7,0x88F1,/* 0xF0-0xF7 */ + 0x88F2,0x88FA,0x88FE,0x88EE,0x88FC,0x88F6,0x88FB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x88F0,0x88EC,0x88EB,0x899D,0x89A1,0x899F,0x899E,0x89E9,/* 0x40-0x47 */ + 0x89EB,0x89E8,0x8AAB,0x8A99,0x8A8B,0x8A92,0x8A8F,0x8A96,/* 0x48-0x4F */ + 0x8C3D,0x8C68,0x8C69,0x8CD5,0x8CCF,0x8CD7,0x8D96,0x8E09,/* 0x50-0x57 */ + 0x8E02,0x8DFF,0x8E0D,0x8DFD,0x8E0A,0x8E03,0x8E07,0x8E06,/* 0x58-0x5F */ + 0x8E05,0x8DFE,0x8E00,0x8E04,0x8F10,0x8F11,0x8F0E,0x8F0D,/* 0x60-0x67 */ + 0x9123,0x911C,0x9120,0x9122,0x911F,0x911D,0x911A,0x9124,/* 0x68-0x6F */ + 0x9121,0x911B,0x917A,0x9172,0x9179,0x9173,0x92A5,0x92A4,/* 0x70-0x77 */ + 0x9276,0x929B,0x927A,0x92A0,0x9294,0x92AA,0x928D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x92A6,0x929A,0x92AB,0x9279,0x9297,0x927F,0x92A3,/* 0xA0-0xA7 */ + 0x92EE,0x928E,0x9282,0x9295,0x92A2,0x927D,0x9288,0x92A1,/* 0xA8-0xAF */ + 0x928A,0x9286,0x928C,0x9299,0x92A7,0x927E,0x9287,0x92A9,/* 0xB0-0xB7 */ + 0x929D,0x928B,0x922D,0x969E,0x96A1,0x96FF,0x9758,0x977D,/* 0xB8-0xBF */ + 0x977A,0x977E,0x9783,0x9780,0x9782,0x977B,0x9784,0x9781,/* 0xC0-0xC7 */ + 0x977F,0x97CE,0x97CD,0x9816,0x98AD,0x98AE,0x9902,0x9900,/* 0xC8-0xCF */ + 0x9907,0x999D,0x999C,0x99C3,0x99B9,0x99BB,0x99BA,0x99C2,/* 0xD0-0xD7 */ + 0x99BD,0x99C7,0x9AB1,0x9AE3,0x9AE7,0x9B3E,0x9B3F,0x9B60,/* 0xD8-0xDF */ + 0x9B61,0x9B5F,0x9CF1,0x9CF2,0x9CF5,0x9EA7,0x50FF,0x5103,/* 0xE0-0xE7 */ + 0x5130,0x50F8,0x5106,0x5107,0x50F6,0x50FE,0x510B,0x510C,/* 0xE8-0xEF */ + 0x50FD,0x510A,0x528B,0x528C,0x52F1,0x52EF,0x5648,0x5642,/* 0xF0-0xF7 */ + 0x564C,0x5635,0x5641,0x564A,0x5649,0x5646,0x5658,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x565A,0x5640,0x5633,0x563D,0x562C,0x563E,0x5638,0x562A,/* 0x40-0x47 */ + 0x563A,0x571A,0x58AB,0x589D,0x58B1,0x58A0,0x58A3,0x58AF,/* 0x48-0x4F */ + 0x58AC,0x58A5,0x58A1,0x58FF,0x5AFF,0x5AF4,0x5AFD,0x5AF7,/* 0x50-0x57 */ + 0x5AF6,0x5B03,0x5AF8,0x5B02,0x5AF9,0x5B01,0x5B07,0x5B05,/* 0x58-0x5F */ + 0x5B0F,0x5C67,0x5D99,0x5D97,0x5D9F,0x5D92,0x5DA2,0x5D93,/* 0x60-0x67 */ + 0x5D95,0x5DA0,0x5D9C,0x5DA1,0x5D9A,0x5D9E,0x5E69,0x5E5D,/* 0x68-0x6F */ + 0x5E60,0x5E5C,0x7DF3,0x5EDB,0x5EDE,0x5EE1,0x5F49,0x5FB2,/* 0x70-0x77 */ + 0x618B,0x6183,0x6179,0x61B1,0x61B0,0x61A2,0x6189,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x619B,0x6193,0x61AF,0x61AD,0x619F,0x6192,0x61AA,/* 0xA0-0xA7 */ + 0x61A1,0x618D,0x6166,0x61B3,0x622D,0x646E,0x6470,0x6496,/* 0xA8-0xAF */ + 0x64A0,0x6485,0x6497,0x649C,0x648F,0x648B,0x648A,0x648C,/* 0xB0-0xB7 */ + 0x64A3,0x649F,0x6468,0x64B1,0x6498,0x6576,0x657A,0x6579,/* 0xB8-0xBF */ + 0x657B,0x65B2,0x65B3,0x66B5,0x66B0,0x66A9,0x66B2,0x66B7,/* 0xC0-0xC7 */ + 0x66AA,0x66AF,0x6A00,0x6A06,0x6A17,0x69E5,0x69F8,0x6A15,/* 0xC8-0xCF */ + 0x69F1,0x69E4,0x6A20,0x69FF,0x69EC,0x69E2,0x6A1B,0x6A1D,/* 0xD0-0xD7 */ + 0x69FE,0x6A27,0x69F2,0x69EE,0x6A14,0x69F7,0x69E7,0x6A40,/* 0xD8-0xDF */ + 0x6A08,0x69E6,0x69FB,0x6A0D,0x69FC,0x69EB,0x6A09,0x6A04,/* 0xE0-0xE7 */ + 0x6A18,0x6A25,0x6A0F,0x69F6,0x6A26,0x6A07,0x69F4,0x6A16,/* 0xE8-0xEF */ + 0x6B51,0x6BA5,0x6BA3,0x6BA2,0x6BA6,0x6C01,0x6C00,0x6BFF,/* 0xF0-0xF7 */ + 0x6C02,0x6F41,0x6F26,0x6F7E,0x6F87,0x6FC6,0x6F92,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6F8D,0x6F89,0x6F8C,0x6F62,0x6F4F,0x6F85,0x6F5A,0x6F96,/* 0x40-0x47 */ + 0x6F76,0x6F6C,0x6F82,0x6F55,0x6F72,0x6F52,0x6F50,0x6F57,/* 0x48-0x4F */ + 0x6F94,0x6F93,0x6F5D,0x6F00,0x6F61,0x6F6B,0x6F7D,0x6F67,/* 0x50-0x57 */ + 0x6F90,0x6F53,0x6F8B,0x6F69,0x6F7F,0x6F95,0x6F63,0x6F77,/* 0x58-0x5F */ + 0x6F6A,0x6F7B,0x71B2,0x71AF,0x719B,0x71B0,0x71A0,0x719A,/* 0x60-0x67 */ + 0x71A9,0x71B5,0x719D,0x71A5,0x719E,0x71A4,0x71A1,0x71AA,/* 0x68-0x6F */ + 0x719C,0x71A7,0x71B3,0x7298,0x729A,0x7358,0x7352,0x735E,/* 0x70-0x77 */ + 0x735F,0x7360,0x735D,0x735B,0x7361,0x735A,0x7359,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7362,0x7487,0x7489,0x748A,0x7486,0x7481,0x747D,/* 0xA0-0xA7 */ + 0x7485,0x7488,0x747C,0x7479,0x7508,0x7507,0x757E,0x7625,/* 0xA8-0xAF */ + 0x761E,0x7619,0x761D,0x761C,0x7623,0x761A,0x7628,0x761B,/* 0xB0-0xB7 */ + 0x769C,0x769D,0x769E,0x769B,0x778D,0x778F,0x7789,0x7788,/* 0xB8-0xBF */ + 0x78CD,0x78BB,0x78CF,0x78CC,0x78D1,0x78CE,0x78D4,0x78C8,/* 0xC0-0xC7 */ + 0x78C3,0x78C4,0x78C9,0x799A,0x79A1,0x79A0,0x799C,0x79A2,/* 0xC8-0xCF */ + 0x799B,0x6B76,0x7A39,0x7AB2,0x7AB4,0x7AB3,0x7BB7,0x7BCB,/* 0xD0-0xD7 */ + 0x7BBE,0x7BAC,0x7BCE,0x7BAF,0x7BB9,0x7BCA,0x7BB5,0x7CC5,/* 0xD8-0xDF */ + 0x7CC8,0x7CCC,0x7CCB,0x7DF7,0x7DDB,0x7DEA,0x7DE7,0x7DD7,/* 0xE0-0xE7 */ + 0x7DE1,0x7E03,0x7DFA,0x7DE6,0x7DF6,0x7DF1,0x7DF0,0x7DEE,/* 0xE8-0xEF */ + 0x7DDF,0x7F76,0x7FAC,0x7FB0,0x7FAD,0x7FED,0x7FEB,0x7FEA,/* 0xF0-0xF7 */ + 0x7FEC,0x7FE6,0x7FE8,0x8064,0x8067,0x81A3,0x819F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x819E,0x8195,0x81A2,0x8199,0x8197,0x8216,0x824F,0x8253,/* 0x40-0x47 */ + 0x8252,0x8250,0x824E,0x8251,0x8524,0x853B,0x850F,0x8500,/* 0x48-0x4F */ + 0x8529,0x850E,0x8509,0x850D,0x851F,0x850A,0x8527,0x851C,/* 0x50-0x57 */ + 0x84FB,0x852B,0x84FA,0x8508,0x850C,0x84F4,0x852A,0x84F2,/* 0x58-0x5F */ + 0x8515,0x84F7,0x84EB,0x84F3,0x84FC,0x8512,0x84EA,0x84E9,/* 0x60-0x67 */ + 0x8516,0x84FE,0x8528,0x851D,0x852E,0x8502,0x84FD,0x851E,/* 0x68-0x6F */ + 0x84F6,0x8531,0x8526,0x84E7,0x84E8,0x84F0,0x84EF,0x84F9,/* 0x70-0x77 */ + 0x8518,0x8520,0x8530,0x850B,0x8519,0x852F,0x8662,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8756,0x8763,0x8764,0x8777,0x87E1,0x8773,0x8758,/* 0xA0-0xA7 */ + 0x8754,0x875B,0x8752,0x8761,0x875A,0x8751,0x875E,0x876D,/* 0xA8-0xAF */ + 0x876A,0x8750,0x874E,0x875F,0x875D,0x876F,0x876C,0x877A,/* 0xB0-0xB7 */ + 0x876E,0x875C,0x8765,0x874F,0x877B,0x8775,0x8762,0x8767,/* 0xB8-0xBF */ + 0x8769,0x885A,0x8905,0x890C,0x8914,0x890B,0x8917,0x8918,/* 0xC0-0xC7 */ + 0x8919,0x8906,0x8916,0x8911,0x890E,0x8909,0x89A2,0x89A4,/* 0xC8-0xCF */ + 0x89A3,0x89ED,0x89F0,0x89EC,0x8ACF,0x8AC6,0x8AB8,0x8AD3,/* 0xD0-0xD7 */ + 0x8AD1,0x8AD4,0x8AD5,0x8ABB,0x8AD7,0x8ABE,0x8AC0,0x8AC5,/* 0xD8-0xDF */ + 0x8AD8,0x8AC3,0x8ABA,0x8ABD,0x8AD9,0x8C3E,0x8C4D,0x8C8F,/* 0xE0-0xE7 */ + 0x8CE5,0x8CDF,0x8CD9,0x8CE8,0x8CDA,0x8CDD,0x8CE7,0x8DA0,/* 0xE8-0xEF */ + 0x8D9C,0x8DA1,0x8D9B,0x8E20,0x8E23,0x8E25,0x8E24,0x8E2E,/* 0xF0-0xF7 */ + 0x8E15,0x8E1B,0x8E16,0x8E11,0x8E19,0x8E26,0x8E27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E14,0x8E12,0x8E18,0x8E13,0x8E1C,0x8E17,0x8E1A,0x8F2C,/* 0x40-0x47 */ + 0x8F24,0x8F18,0x8F1A,0x8F20,0x8F23,0x8F16,0x8F17,0x9073,/* 0x48-0x4F */ + 0x9070,0x906F,0x9067,0x906B,0x912F,0x912B,0x9129,0x912A,/* 0x50-0x57 */ + 0x9132,0x9126,0x912E,0x9185,0x9186,0x918A,0x9181,0x9182,/* 0x58-0x5F */ + 0x9184,0x9180,0x92D0,0x92C3,0x92C4,0x92C0,0x92D9,0x92B6,/* 0x60-0x67 */ + 0x92CF,0x92F1,0x92DF,0x92D8,0x92E9,0x92D7,0x92DD,0x92CC,/* 0x68-0x6F */ + 0x92EF,0x92C2,0x92E8,0x92CA,0x92C8,0x92CE,0x92E6,0x92CD,/* 0x70-0x77 */ + 0x92D5,0x92C9,0x92E0,0x92DE,0x92E7,0x92D1,0x92D3,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x92B5,0x92E1,0x92C6,0x92B4,0x957C,0x95AC,0x95AB,/* 0xA0-0xA7 */ + 0x95AE,0x95B0,0x96A4,0x96A2,0x96D3,0x9705,0x9708,0x9702,/* 0xA8-0xAF */ + 0x975A,0x978A,0x978E,0x9788,0x97D0,0x97CF,0x981E,0x981D,/* 0xB0-0xB7 */ + 0x9826,0x9829,0x9828,0x9820,0x981B,0x9827,0x98B2,0x9908,/* 0xB8-0xBF */ + 0x98FA,0x9911,0x9914,0x9916,0x9917,0x9915,0x99DC,0x99CD,/* 0xC0-0xC7 */ + 0x99CF,0x99D3,0x99D4,0x99CE,0x99C9,0x99D6,0x99D8,0x99CB,/* 0xC8-0xCF */ + 0x99D7,0x99CC,0x9AB3,0x9AEC,0x9AEB,0x9AF3,0x9AF2,0x9AF1,/* 0xD0-0xD7 */ + 0x9B46,0x9B43,0x9B67,0x9B74,0x9B71,0x9B66,0x9B76,0x9B75,/* 0xD8-0xDF */ + 0x9B70,0x9B68,0x9B64,0x9B6C,0x9CFC,0x9CFA,0x9CFD,0x9CFF,/* 0xE0-0xE7 */ + 0x9CF7,0x9D07,0x9D00,0x9CF9,0x9CFB,0x9D08,0x9D05,0x9D04,/* 0xE8-0xEF */ + 0x9E83,0x9ED3,0x9F0F,0x9F10,0x511C,0x5113,0x5117,0x511A,/* 0xF0-0xF7 */ + 0x5111,0x51DE,0x5334,0x53E1,0x5670,0x5660,0x566E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5673,0x5666,0x5663,0x566D,0x5672,0x565E,0x5677,0x571C,/* 0x40-0x47 */ + 0x571B,0x58C8,0x58BD,0x58C9,0x58BF,0x58BA,0x58C2,0x58BC,/* 0x48-0x4F */ + 0x58C6,0x5B17,0x5B19,0x5B1B,0x5B21,0x5B14,0x5B13,0x5B10,/* 0x50-0x57 */ + 0x5B16,0x5B28,0x5B1A,0x5B20,0x5B1E,0x5BEF,0x5DAC,0x5DB1,/* 0x58-0x5F */ + 0x5DA9,0x5DA7,0x5DB5,0x5DB0,0x5DAE,0x5DAA,0x5DA8,0x5DB2,/* 0x60-0x67 */ + 0x5DAD,0x5DAF,0x5DB4,0x5E67,0x5E68,0x5E66,0x5E6F,0x5EE9,/* 0x68-0x6F */ + 0x5EE7,0x5EE6,0x5EE8,0x5EE5,0x5F4B,0x5FBC,0x619D,0x61A8,/* 0x70-0x77 */ + 0x6196,0x61C5,0x61B4,0x61C6,0x61C1,0x61CC,0x61BA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x61BF,0x61B8,0x618C,0x64D7,0x64D6,0x64D0,0x64CF,/* 0xA0-0xA7 */ + 0x64C9,0x64BD,0x6489,0x64C3,0x64DB,0x64F3,0x64D9,0x6533,/* 0xA8-0xAF */ + 0x657F,0x657C,0x65A2,0x66C8,0x66BE,0x66C0,0x66CA,0x66CB,/* 0xB0-0xB7 */ + 0x66CF,0x66BD,0x66BB,0x66BA,0x66CC,0x6723,0x6A34,0x6A66,/* 0xB8-0xBF */ + 0x6A49,0x6A67,0x6A32,0x6A68,0x6A3E,0x6A5D,0x6A6D,0x6A76,/* 0xC0-0xC7 */ + 0x6A5B,0x6A51,0x6A28,0x6A5A,0x6A3B,0x6A3F,0x6A41,0x6A6A,/* 0xC8-0xCF */ + 0x6A64,0x6A50,0x6A4F,0x6A54,0x6A6F,0x6A69,0x6A60,0x6A3C,/* 0xD0-0xD7 */ + 0x6A5E,0x6A56,0x6A55,0x6A4D,0x6A4E,0x6A46,0x6B55,0x6B54,/* 0xD8-0xDF */ + 0x6B56,0x6BA7,0x6BAA,0x6BAB,0x6BC8,0x6BC7,0x6C04,0x6C03,/* 0xE0-0xE7 */ + 0x6C06,0x6FAD,0x6FCB,0x6FA3,0x6FC7,0x6FBC,0x6FCE,0x6FC8,/* 0xE8-0xEF */ + 0x6F5E,0x6FC4,0x6FBD,0x6F9E,0x6FCA,0x6FA8,0x7004,0x6FA5,/* 0xF0-0xF7 */ + 0x6FAE,0x6FBA,0x6FAC,0x6FAA,0x6FCF,0x6FBF,0x6FB8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FA2,0x6FC9,0x6FAB,0x6FCD,0x6FAF,0x6FB2,0x6FB0,0x71C5,/* 0x40-0x47 */ + 0x71C2,0x71BF,0x71B8,0x71D6,0x71C0,0x71C1,0x71CB,0x71D4,/* 0x48-0x4F */ + 0x71CA,0x71C7,0x71CF,0x71BD,0x71D8,0x71BC,0x71C6,0x71DA,/* 0x50-0x57 */ + 0x71DB,0x729D,0x729E,0x7369,0x7366,0x7367,0x736C,0x7365,/* 0x58-0x5F */ + 0x736B,0x736A,0x747F,0x749A,0x74A0,0x7494,0x7492,0x7495,/* 0x60-0x67 */ + 0x74A1,0x750B,0x7580,0x762F,0x762D,0x7631,0x763D,0x7633,/* 0x68-0x6F */ + 0x763C,0x7635,0x7632,0x7630,0x76BB,0x76E6,0x779A,0x779D,/* 0x70-0x77 */ + 0x77A1,0x779C,0x779B,0x77A2,0x77A3,0x7795,0x7799,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7797,0x78DD,0x78E9,0x78E5,0x78EA,0x78DE,0x78E3,/* 0xA0-0xA7 */ + 0x78DB,0x78E1,0x78E2,0x78ED,0x78DF,0x78E0,0x79A4,0x7A44,/* 0xA8-0xAF */ + 0x7A48,0x7A47,0x7AB6,0x7AB8,0x7AB5,0x7AB1,0x7AB7,0x7BDE,/* 0xB0-0xB7 */ + 0x7BE3,0x7BE7,0x7BDD,0x7BD5,0x7BE5,0x7BDA,0x7BE8,0x7BF9,/* 0xB8-0xBF */ + 0x7BD4,0x7BEA,0x7BE2,0x7BDC,0x7BEB,0x7BD8,0x7BDF,0x7CD2,/* 0xC0-0xC7 */ + 0x7CD4,0x7CD7,0x7CD0,0x7CD1,0x7E12,0x7E21,0x7E17,0x7E0C,/* 0xC8-0xCF */ + 0x7E1F,0x7E20,0x7E13,0x7E0E,0x7E1C,0x7E15,0x7E1A,0x7E22,/* 0xD0-0xD7 */ + 0x7E0B,0x7E0F,0x7E16,0x7E0D,0x7E14,0x7E25,0x7E24,0x7F43,/* 0xD8-0xDF */ + 0x7F7B,0x7F7C,0x7F7A,0x7FB1,0x7FEF,0x802A,0x8029,0x806C,/* 0xE0-0xE7 */ + 0x81B1,0x81A6,0x81AE,0x81B9,0x81B5,0x81AB,0x81B0,0x81AC,/* 0xE8-0xEF */ + 0x81B4,0x81B2,0x81B7,0x81A7,0x81F2,0x8255,0x8256,0x8257,/* 0xF0-0xF7 */ + 0x8556,0x8545,0x856B,0x854D,0x8553,0x8561,0x8558,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8540,0x8546,0x8564,0x8541,0x8562,0x8544,0x8551,0x8547,/* 0x40-0x47 */ + 0x8563,0x853E,0x855B,0x8571,0x854E,0x856E,0x8575,0x8555,/* 0x48-0x4F */ + 0x8567,0x8560,0x858C,0x8566,0x855D,0x8554,0x8565,0x856C,/* 0x50-0x57 */ + 0x8663,0x8665,0x8664,0x879B,0x878F,0x8797,0x8793,0x8792,/* 0x58-0x5F */ + 0x8788,0x8781,0x8796,0x8798,0x8779,0x8787,0x87A3,0x8785,/* 0x60-0x67 */ + 0x8790,0x8791,0x879D,0x8784,0x8794,0x879C,0x879A,0x8789,/* 0x68-0x6F */ + 0x891E,0x8926,0x8930,0x892D,0x892E,0x8927,0x8931,0x8922,/* 0x70-0x77 */ + 0x8929,0x8923,0x892F,0x892C,0x891F,0x89F1,0x8AE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8AE2,0x8AF2,0x8AF4,0x8AF5,0x8ADD,0x8B14,0x8AE4,/* 0xA0-0xA7 */ + 0x8ADF,0x8AF0,0x8AC8,0x8ADE,0x8AE1,0x8AE8,0x8AFF,0x8AEF,/* 0xA8-0xAF */ + 0x8AFB,0x8C91,0x8C92,0x8C90,0x8CF5,0x8CEE,0x8CF1,0x8CF0,/* 0xB0-0xB7 */ + 0x8CF3,0x8D6C,0x8D6E,0x8DA5,0x8DA7,0x8E33,0x8E3E,0x8E38,/* 0xB8-0xBF */ + 0x8E40,0x8E45,0x8E36,0x8E3C,0x8E3D,0x8E41,0x8E30,0x8E3F,/* 0xC0-0xC7 */ + 0x8EBD,0x8F36,0x8F2E,0x8F35,0x8F32,0x8F39,0x8F37,0x8F34,/* 0xC8-0xCF */ + 0x9076,0x9079,0x907B,0x9086,0x90FA,0x9133,0x9135,0x9136,/* 0xD0-0xD7 */ + 0x9193,0x9190,0x9191,0x918D,0x918F,0x9327,0x931E,0x9308,/* 0xD8-0xDF */ + 0x931F,0x9306,0x930F,0x937A,0x9338,0x933C,0x931B,0x9323,/* 0xE0-0xE7 */ + 0x9312,0x9301,0x9346,0x932D,0x930E,0x930D,0x92CB,0x931D,/* 0xE8-0xEF */ + 0x92FA,0x9325,0x9313,0x92F9,0x92F7,0x9334,0x9302,0x9324,/* 0xF0-0xF7 */ + 0x92FF,0x9329,0x9339,0x9335,0x932A,0x9314,0x930C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x930B,0x92FE,0x9309,0x9300,0x92FB,0x9316,0x95BC,0x95CD,/* 0x40-0x47 */ + 0x95BE,0x95B9,0x95BA,0x95B6,0x95BF,0x95B5,0x95BD,0x96A9,/* 0x48-0x4F */ + 0x96D4,0x970B,0x9712,0x9710,0x9799,0x9797,0x9794,0x97F0,/* 0x50-0x57 */ + 0x97F8,0x9835,0x982F,0x9832,0x9924,0x991F,0x9927,0x9929,/* 0x58-0x5F */ + 0x999E,0x99EE,0x99EC,0x99E5,0x99E4,0x99F0,0x99E3,0x99EA,/* 0x60-0x67 */ + 0x99E9,0x99E7,0x9AB9,0x9ABF,0x9AB4,0x9ABB,0x9AF6,0x9AFA,/* 0x68-0x6F */ + 0x9AF9,0x9AF7,0x9B33,0x9B80,0x9B85,0x9B87,0x9B7C,0x9B7E,/* 0x70-0x77 */ + 
0x9B7B,0x9B82,0x9B93,0x9B92,0x9B90,0x9B7A,0x9B95,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9B7D,0x9B88,0x9D25,0x9D17,0x9D20,0x9D1E,0x9D14,/* 0xA0-0xA7 */ + 0x9D29,0x9D1D,0x9D18,0x9D22,0x9D10,0x9D19,0x9D1F,0x9E88,/* 0xA8-0xAF */ + 0x9E86,0x9E87,0x9EAE,0x9EAD,0x9ED5,0x9ED6,0x9EFA,0x9F12,/* 0xB0-0xB7 */ + 0x9F3D,0x5126,0x5125,0x5122,0x5124,0x5120,0x5129,0x52F4,/* 0xB8-0xBF */ + 0x5693,0x568C,0x568D,0x5686,0x5684,0x5683,0x567E,0x5682,/* 0xC0-0xC7 */ + 0x567F,0x5681,0x58D6,0x58D4,0x58CF,0x58D2,0x5B2D,0x5B25,/* 0xC8-0xCF */ + 0x5B32,0x5B23,0x5B2C,0x5B27,0x5B26,0x5B2F,0x5B2E,0x5B7B,/* 0xD0-0xD7 */ + 0x5BF1,0x5BF2,0x5DB7,0x5E6C,0x5E6A,0x5FBE,0x5FBB,0x61C3,/* 0xD8-0xDF */ + 0x61B5,0x61BC,0x61E7,0x61E0,0x61E5,0x61E4,0x61E8,0x61DE,/* 0xE0-0xE7 */ + 0x64EF,0x64E9,0x64E3,0x64EB,0x64E4,0x64E8,0x6581,0x6580,/* 0xE8-0xEF */ + 0x65B6,0x65DA,0x66D2,0x6A8D,0x6A96,0x6A81,0x6AA5,0x6A89,/* 0xF0-0xF7 */ + 0x6A9F,0x6A9B,0x6AA1,0x6A9E,0x6A87,0x6A93,0x6A8E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A95,0x6A83,0x6AA8,0x6AA4,0x6A91,0x6A7F,0x6AA6,0x6A9A,/* 0x40-0x47 */ + 0x6A85,0x6A8C,0x6A92,0x6B5B,0x6BAD,0x6C09,0x6FCC,0x6FA9,/* 0x48-0x4F */ + 0x6FF4,0x6FD4,0x6FE3,0x6FDC,0x6FED,0x6FE7,0x6FE6,0x6FDE,/* 0x50-0x57 */ + 0x6FF2,0x6FDD,0x6FE2,0x6FE8,0x71E1,0x71F1,0x71E8,0x71F2,/* 0x58-0x5F */ + 0x71E4,0x71F0,0x71E2,0x7373,0x736E,0x736F,0x7497,0x74B2,/* 0x60-0x67 */ + 0x74AB,0x7490,0x74AA,0x74AD,0x74B1,0x74A5,0x74AF,0x7510,/* 0x68-0x6F */ + 0x7511,0x7512,0x750F,0x7584,0x7643,0x7648,0x7649,0x7647,/* 0x70-0x77 */ + 0x76A4,0x76E9,0x77B5,0x77AB,0x77B2,0x77B7,0x77B6,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77B4,0x77B1,0x77A8,0x77F0,0x78F3,0x78FD,0x7902,/* 0xA0-0xA7 */ + 0x78FB,0x78FC,0x78F2,0x7905,0x78F9,0x78FE,0x7904,0x79AB,/* 0xA8-0xAF */ + 0x79A8,0x7A5C,0x7A5B,0x7A56,0x7A58,0x7A54,0x7A5A,0x7ABE,/* 0xB0-0xB7 */ + 0x7AC0,0x7AC1,0x7C05,0x7C0F,0x7BF2,0x7C00,0x7BFF,0x7BFB,/* 0xB8-0xBF */ + 0x7C0E,0x7BF4,0x7C0B,0x7BF3,0x7C02,0x7C09,0x7C03,0x7C01,/* 0xC0-0xC7 */ + 0x7BF8,0x7BFD,0x7C06,0x7BF0,0x7BF1,0x7C10,0x7C0A,0x7CE8,/* 0xC8-0xCF */ + 0x7E2D,0x7E3C,0x7E42,0x7E33,0x9848,0x7E38,0x7E2A,0x7E49,/* 0xD0-0xD7 */ + 0x7E40,0x7E47,0x7E29,0x7E4C,0x7E30,0x7E3B,0x7E36,0x7E44,/* 0xD8-0xDF */ + 0x7E3A,0x7F45,0x7F7F,0x7F7E,0x7F7D,0x7FF4,0x7FF2,0x802C,/* 0xE0-0xE7 */ + 0x81BB,0x81C4,0x81CC,0x81CA,0x81C5,0x81C7,0x81BC,0x81E9,/* 0xE8-0xEF */ + 
0x825B,0x825A,0x825C,0x8583,0x8580,0x858F,0x85A7,0x8595,/* 0xF0-0xF7 */ + 0x85A0,0x858B,0x85A3,0x857B,0x85A4,0x859A,0x859E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8577,0x857C,0x8589,0x85A1,0x857A,0x8578,0x8557,0x858E,/* 0x40-0x47 */ + 0x8596,0x8586,0x858D,0x8599,0x859D,0x8581,0x85A2,0x8582,/* 0x48-0x4F */ + 0x8588,0x8585,0x8579,0x8576,0x8598,0x8590,0x859F,0x8668,/* 0x50-0x57 */ + 0x87BE,0x87AA,0x87AD,0x87C5,0x87B0,0x87AC,0x87B9,0x87B5,/* 0x58-0x5F */ + 0x87BC,0x87AE,0x87C9,0x87C3,0x87C2,0x87CC,0x87B7,0x87AF,/* 0x60-0x67 */ + 0x87C4,0x87CA,0x87B4,0x87B6,0x87BF,0x87B8,0x87BD,0x87DE,/* 0x68-0x6F */ + 0x87B2,0x8935,0x8933,0x893C,0x893E,0x8941,0x8952,0x8937,/* 0x70-0x77 */ + 0x8942,0x89AD,0x89AF,0x89AE,0x89F2,0x89F3,0x8B1E,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8B18,0x8B16,0x8B11,0x8B05,0x8B0B,0x8B22,0x8B0F,/* 0xA0-0xA7 */ + 0x8B12,0x8B15,0x8B07,0x8B0D,0x8B08,0x8B06,0x8B1C,0x8B13,/* 0xA8-0xAF */ + 0x8B1A,0x8C4F,0x8C70,0x8C72,0x8C71,0x8C6F,0x8C95,0x8C94,/* 0xB0-0xB7 */ + 0x8CF9,0x8D6F,0x8E4E,0x8E4D,0x8E53,0x8E50,0x8E4C,0x8E47,/* 0xB8-0xBF */ + 0x8F43,0x8F40,0x9085,0x907E,0x9138,0x919A,0x91A2,0x919B,/* 0xC0-0xC7 */ + 0x9199,0x919F,0x91A1,0x919D,0x91A0,0x93A1,0x9383,0x93AF,/* 0xC8-0xCF */ + 0x9364,0x9356,0x9347,0x937C,0x9358,0x935C,0x9376,0x9349,/* 0xD0-0xD7 */ + 0x9350,0x9351,0x9360,0x936D,0x938F,0x934C,0x936A,0x9379,/* 0xD8-0xDF */ + 0x9357,0x9355,0x9352,0x934F,0x9371,0x9377,0x937B,0x9361,/* 0xE0-0xE7 */ + 0x935E,0x9363,0x9367,0x9380,0x934E,0x9359,0x95C7,0x95C0,/* 0xE8-0xEF */ + 0x95C9,0x95C3,0x95C5,0x95B7,0x96AE,0x96B0,0x96AC,0x9720,/* 0xF0-0xF7 */ + 0x971F,0x9718,0x971D,0x9719,0x979A,0x97A1,0x979C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x979E,0x979D,0x97D5,0x97D4,0x97F1,0x9841,0x9844,0x984A,/* 0x40-0x47 */ + 0x9849,0x9845,0x9843,0x9925,0x992B,0x992C,0x992A,0x9933,/* 0x48-0x4F */ + 0x9932,0x992F,0x992D,0x9931,0x9930,0x9998,0x99A3,0x99A1,/* 0x50-0x57 */ + 0x9A02,0x99FA,0x99F4,0x99F7,0x99F9,0x99F8,0x99F6,0x99FB,/* 0x58-0x5F */ + 0x99FD,0x99FE,0x99FC,0x9A03,0x9ABE,0x9AFE,0x9AFD,0x9B01,/* 
0x60-0x67 */ + 0x9AFC,0x9B48,0x9B9A,0x9BA8,0x9B9E,0x9B9B,0x9BA6,0x9BA1,/* 0x68-0x6F */ + 0x9BA5,0x9BA4,0x9B86,0x9BA2,0x9BA0,0x9BAF,0x9D33,0x9D41,/* 0x70-0x77 */ + 0x9D67,0x9D36,0x9D2E,0x9D2F,0x9D31,0x9D38,0x9D30,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9D45,0x9D42,0x9D43,0x9D3E,0x9D37,0x9D40,0x9D3D,/* 0xA0-0xA7 */ + 0x7FF5,0x9D2D,0x9E8A,0x9E89,0x9E8D,0x9EB0,0x9EC8,0x9EDA,/* 0xA8-0xAF */ + 0x9EFB,0x9EFF,0x9F24,0x9F23,0x9F22,0x9F54,0x9FA0,0x5131,/* 0xB0-0xB7 */ + 0x512D,0x512E,0x5698,0x569C,0x5697,0x569A,0x569D,0x5699,/* 0xB8-0xBF */ + 0x5970,0x5B3C,0x5C69,0x5C6A,0x5DC0,0x5E6D,0x5E6E,0x61D8,/* 0xC0-0xC7 */ + 0x61DF,0x61ED,0x61EE,0x61F1,0x61EA,0x61F0,0x61EB,0x61D6,/* 0xC8-0xCF */ + 0x61E9,0x64FF,0x6504,0x64FD,0x64F8,0x6501,0x6503,0x64FC,/* 0xD0-0xD7 */ + 0x6594,0x65DB,0x66DA,0x66DB,0x66D8,0x6AC5,0x6AB9,0x6ABD,/* 0xD8-0xDF */ + 0x6AE1,0x6AC6,0x6ABA,0x6AB6,0x6AB7,0x6AC7,0x6AB4,0x6AAD,/* 0xE0-0xE7 */ + 0x6B5E,0x6BC9,0x6C0B,0x7007,0x700C,0x700D,0x7001,0x7005,/* 0xE8-0xEF */ + 0x7014,0x700E,0x6FFF,0x7000,0x6FFB,0x7026,0x6FFC,0x6FF7,/* 0xF0-0xF7 */ + 0x700A,0x7201,0x71FF,0x71F9,0x7203,0x71FD,0x7376,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74B8,0x74C0,0x74B5,0x74C1,0x74BE,0x74B6,0x74BB,0x74C2,/* 0x40-0x47 */ + 0x7514,0x7513,0x765C,0x7664,0x7659,0x7650,0x7653,0x7657,/* 0x48-0x4F */ + 0x765A,0x76A6,0x76BD,0x76EC,0x77C2,0x77BA,0x78FF,0x790C,/* 0x50-0x57 */ + 0x7913,0x7914,0x7909,0x7910,0x7912,0x7911,0x79AD,0x79AC,/* 0x58-0x5F */ + 0x7A5F,0x7C1C,0x7C29,0x7C19,0x7C20,0x7C1F,0x7C2D,0x7C1D,/* 0x60-0x67 */ + 0x7C26,0x7C28,0x7C22,0x7C25,0x7C30,0x7E5C,0x7E50,0x7E56,/* 0x68-0x6F */ + 0x7E63,0x7E58,0x7E62,0x7E5F,0x7E51,0x7E60,0x7E57,0x7E53,/* 0x70-0x77 */ + 0x7FB5,0x7FB3,0x7FF7,0x7FF8,0x8075,0x81D1,0x81D2,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x81D0,0x825F,0x825E,0x85B4,0x85C6,0x85C0,0x85C3,/* 0xA0-0xA7 */ + 0x85C2,0x85B3,0x85B5,0x85BD,0x85C7,0x85C4,0x85BF,0x85CB,/* 0xA8-0xAF */ + 0x85CE,0x85C8,0x85C5,0x85B1,0x85B6,0x85D2,0x8624,0x85B8,/* 0xB0-0xB7 */ + 0x85B7,0x85BE,0x8669,0x87E7,0x87E6,0x87E2,0x87DB,0x87EB,/* 0xB8-0xBF */ + 0x87EA,0x87E5,0x87DF,0x87F3,0x87E4,0x87D4,0x87DC,0x87D3,/* 0xC0-0xC7 */ + 0x87ED,0x87D8,0x87E3,0x87A4,0x87D7,0x87D9,0x8801,0x87F4,/* 0xC8-0xCF */ + 0x87E8,0x87DD,0x8953,0x894B,0x894F,0x894C,0x8946,0x8950,/* 0xD0-0xD7 */ + 0x8951,0x8949,0x8B2A,0x8B27,0x8B23,0x8B33,0x8B30,0x8B35,/* 0xD8-0xDF */ + 
0x8B47,0x8B2F,0x8B3C,0x8B3E,0x8B31,0x8B25,0x8B37,0x8B26,/* 0xE0-0xE7 */ + 0x8B36,0x8B2E,0x8B24,0x8B3B,0x8B3D,0x8B3A,0x8C42,0x8C75,/* 0xE8-0xEF */ + 0x8C99,0x8C98,0x8C97,0x8CFE,0x8D04,0x8D02,0x8D00,0x8E5C,/* 0xF0-0xF7 */ + 0x8E62,0x8E60,0x8E57,0x8E56,0x8E5E,0x8E65,0x8E67,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E5B,0x8E5A,0x8E61,0x8E5D,0x8E69,0x8E54,0x8F46,0x8F47,/* 0x40-0x47 */ + 0x8F48,0x8F4B,0x9128,0x913A,0x913B,0x913E,0x91A8,0x91A5,/* 0x48-0x4F */ + 0x91A7,0x91AF,0x91AA,0x93B5,0x938C,0x9392,0x93B7,0x939B,/* 0x50-0x57 */ + 0x939D,0x9389,0x93A7,0x938E,0x93AA,0x939E,0x93A6,0x9395,/* 0x58-0x5F */ + 0x9388,0x9399,0x939F,0x938D,0x93B1,0x9391,0x93B2,0x93A4,/* 0x60-0x67 */ + 0x93A8,0x93B4,0x93A3,0x93A5,0x95D2,0x95D3,0x95D1,0x96B3,/* 0x68-0x6F */ + 0x96D7,0x96DA,0x5DC2,0x96DF,0x96D8,0x96DD,0x9723,0x9722,/* 0x70-0x77 */ + 0x9725,0x97AC,0x97AE,0x97A8,0x97AB,0x97A4,0x97AA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x97A2,0x97A5,0x97D7,0x97D9,0x97D6,0x97D8,0x97FA,/* 0xA0-0xA7 */ + 0x9850,0x9851,0x9852,0x98B8,0x9941,0x993C,0x993A,0x9A0F,/* 0xA8-0xAF */ + 0x9A0B,0x9A09,0x9A0D,0x9A04,0x9A11,0x9A0A,0x9A05,0x9A07,/* 0xB0-0xB7 */ + 0x9A06,0x9AC0,0x9ADC,0x9B08,0x9B04,0x9B05,0x9B29,0x9B35,/* 0xB8-0xBF */ + 0x9B4A,0x9B4C,0x9B4B,0x9BC7,0x9BC6,0x9BC3,0x9BBF,0x9BC1,/* 0xC0-0xC7 */ + 0x9BB5,0x9BB8,0x9BD3,0x9BB6,0x9BC4,0x9BB9,0x9BBD,0x9D5C,/* 0xC8-0xCF */ + 0x9D53,0x9D4F,0x9D4A,0x9D5B,0x9D4B,0x9D59,0x9D56,0x9D4C,/* 0xD0-0xD7 */ + 0x9D57,0x9D52,0x9D54,0x9D5F,0x9D58,0x9D5A,0x9E8E,0x9E8C,/* 0xD8-0xDF */ + 0x9EDF,0x9F01,0x9F00,0x9F16,0x9F25,0x9F2B,0x9F2A,0x9F29,/* 0xE0-0xE7 */ + 0x9F28,0x9F4C,0x9F55,0x5134,0x5135,0x5296,0x52F7,0x53B4,/* 0xE8-0xEF */ + 0x56AB,0x56AD,0x56A6,0x56A7,0x56AA,0x56AC,0x58DA,0x58DD,/* 0xF0-0xF7 */ + 0x58DB,0x5912,0x5B3D,0x5B3E,0x5B3F,0x5DC3,0x5E70,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5FBF,0x61FB,0x6507,0x6510,0x650D,0x6509,0x650C,0x650E,/* 0x40-0x47 */ + 0x6584,0x65DE,0x65DD,0x66DE,0x6AE7,0x6AE0,0x6ACC,0x6AD1,/* 0x48-0x4F */ + 0x6AD9,0x6ACB,0x6ADF,0x6ADC,0x6AD0,0x6AEB,0x6ACF,0x6ACD,/* 
0x50-0x57 */ + 0x6ADE,0x6B60,0x6BB0,0x6C0C,0x7019,0x7027,0x7020,0x7016,/* 0x58-0x5F */ + 0x702B,0x7021,0x7022,0x7023,0x7029,0x7017,0x7024,0x701C,/* 0x60-0x67 */ + 0x702A,0x720C,0x720A,0x7207,0x7202,0x7205,0x72A5,0x72A6,/* 0x68-0x6F */ + 0x72A4,0x72A3,0x72A1,0x74CB,0x74C5,0x74B7,0x74C3,0x7516,/* 0x70-0x77 */ + 0x7660,0x77C9,0x77CA,0x77C4,0x77F1,0x791D,0x791B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7921,0x791C,0x7917,0x791E,0x79B0,0x7A67,0x7A68,/* 0xA0-0xA7 */ + 0x7C33,0x7C3C,0x7C39,0x7C2C,0x7C3B,0x7CEC,0x7CEA,0x7E76,/* 0xA8-0xAF */ + 0x7E75,0x7E78,0x7E70,0x7E77,0x7E6F,0x7E7A,0x7E72,0x7E74,/* 0xB0-0xB7 */ + 0x7E68,0x7F4B,0x7F4A,0x7F83,0x7F86,0x7FB7,0x7FFD,0x7FFE,/* 0xB8-0xBF */ + 0x8078,0x81D7,0x81D5,0x8264,0x8261,0x8263,0x85EB,0x85F1,/* 0xC0-0xC7 */ + 0x85ED,0x85D9,0x85E1,0x85E8,0x85DA,0x85D7,0x85EC,0x85F2,/* 0xC8-0xCF */ + 0x85F8,0x85D8,0x85DF,0x85E3,0x85DC,0x85D1,0x85F0,0x85E6,/* 0xD0-0xD7 */ + 0x85EF,0x85DE,0x85E2,0x8800,0x87FA,0x8803,0x87F6,0x87F7,/* 0xD8-0xDF */ + 0x8809,0x880C,0x880B,0x8806,0x87FC,0x8808,0x87FF,0x880A,/* 0xE0-0xE7 */ + 0x8802,0x8962,0x895A,0x895B,0x8957,0x8961,0x895C,0x8958,/* 0xE8-0xEF */ + 0x895D,0x8959,0x8988,0x89B7,0x89B6,0x89F6,0x8B50,0x8B48,/* 0xF0-0xF7 */ + 0x8B4A,0x8B40,0x8B53,0x8B56,0x8B54,0x8B4B,0x8B55,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B51,0x8B42,0x8B52,0x8B57,0x8C43,0x8C77,0x8C76,0x8C9A,/* 0x40-0x47 */ + 0x8D06,0x8D07,0x8D09,0x8DAC,0x8DAA,0x8DAD,0x8DAB,0x8E6D,/* 0x48-0x4F */ + 0x8E78,0x8E73,0x8E6A,0x8E6F,0x8E7B,0x8EC2,0x8F52,0x8F51,/* 0x50-0x57 */ + 0x8F4F,0x8F50,0x8F53,0x8FB4,0x9140,0x913F,0x91B0,0x91AD,/* 0x58-0x5F */ + 0x93DE,0x93C7,0x93CF,0x93C2,0x93DA,0x93D0,0x93F9,0x93EC,/* 0x60-0x67 */ + 0x93CC,0x93D9,0x93A9,0x93E6,0x93CA,0x93D4,0x93EE,0x93E3,/* 0x68-0x6F */ + 0x93D5,0x93C4,0x93CE,0x93C0,0x93D2,0x93E7,0x957D,0x95DA,/* 0x70-0x77 */ + 0x95DB,0x96E1,0x9729,0x972B,0x972C,0x9728,0x9726,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x97B3,0x97B7,0x97B6,0x97DD,0x97DE,0x97DF,0x985C,/* 0xA0-0xA7 */ + 0x9859,0x985D,0x9857,0x98BF,0x98BD,0x98BB,0x98BE,0x9948,/* 0xA8-0xAF */ + 0x9947,0x9943,0x99A6,0x99A7,0x9A1A,0x9A15,0x9A25,0x9A1D,/* 0xB0-0xB7 */ + 0x9A24,0x9A1B,0x9A22,0x9A20,0x9A27,0x9A23,0x9A1E,0x9A1C,/* 0xB8-0xBF */ + 0x9A14,0x9AC2,0x9B0B,0x9B0A,0x9B0E,0x9B0C,0x9B37,0x9BEA,/* 0xC0-0xC7 */ + 0x9BEB,0x9BE0,0x9BDE,0x9BE4,0x9BE6,0x9BE2,0x9BF0,0x9BD4,/* 0xC8-0xCF */ + 
0x9BD7,0x9BEC,0x9BDC,0x9BD9,0x9BE5,0x9BD5,0x9BE1,0x9BDA,/* 0xD0-0xD7 */ + 0x9D77,0x9D81,0x9D8A,0x9D84,0x9D88,0x9D71,0x9D80,0x9D78,/* 0xD8-0xDF */ + 0x9D86,0x9D8B,0x9D8C,0x9D7D,0x9D6B,0x9D74,0x9D75,0x9D70,/* 0xE0-0xE7 */ + 0x9D69,0x9D85,0x9D73,0x9D7B,0x9D82,0x9D6F,0x9D79,0x9D7F,/* 0xE8-0xEF */ + 0x9D87,0x9D68,0x9E94,0x9E91,0x9EC0,0x9EFC,0x9F2D,0x9F40,/* 0xF0-0xF7 */ + 0x9F41,0x9F4D,0x9F56,0x9F57,0x9F58,0x5337,0x56B2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x56B5,0x56B3,0x58E3,0x5B45,0x5DC6,0x5DC7,0x5EEE,0x5EEF,/* 0x40-0x47 */ + 0x5FC0,0x5FC1,0x61F9,0x6517,0x6516,0x6515,0x6513,0x65DF,/* 0x48-0x4F */ + 0x66E8,0x66E3,0x66E4,0x6AF3,0x6AF0,0x6AEA,0x6AE8,0x6AF9,/* 0x50-0x57 */ + 0x6AF1,0x6AEE,0x6AEF,0x703C,0x7035,0x702F,0x7037,0x7034,/* 0x58-0x5F */ + 0x7031,0x7042,0x7038,0x703F,0x703A,0x7039,0x7040,0x703B,/* 0x60-0x67 */ + 0x7033,0x7041,0x7213,0x7214,0x72A8,0x737D,0x737C,0x74BA,/* 0x68-0x6F */ + 0x76AB,0x76AA,0x76BE,0x76ED,0x77CC,0x77CE,0x77CF,0x77CD,/* 0x70-0x77 */ + 0x77F2,0x7925,0x7923,0x7927,0x7928,0x7924,0x7929,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79B2,0x7A6E,0x7A6C,0x7A6D,0x7AF7,0x7C49,0x7C48,/* 0xA0-0xA7 */ + 0x7C4A,0x7C47,0x7C45,0x7CEE,0x7E7B,0x7E7E,0x7E81,0x7E80,/* 0xA8-0xAF */ + 0x7FBA,0x7FFF,0x8079,0x81DB,0x81D9,0x820B,0x8268,0x8269,/* 0xB0-0xB7 */ + 0x8622,0x85FF,0x8601,0x85FE,0x861B,0x8600,0x85F6,0x8604,/* 0xB8-0xBF */ + 0x8609,0x8605,0x860C,0x85FD,0x8819,0x8810,0x8811,0x8817,/* 0xC0-0xC7 */ + 0x8813,0x8816,0x8963,0x8966,0x89B9,0x89F7,0x8B60,0x8B6A,/* 0xC8-0xCF */ + 0x8B5D,0x8B68,0x8B63,0x8B65,0x8B67,0x8B6D,0x8DAE,0x8E86,/* 0xD0-0xD7 */ + 0x8E88,0x8E84,0x8F59,0x8F56,0x8F57,0x8F55,0x8F58,0x8F5A,/* 0xD8-0xDF */ + 0x908D,0x9143,0x9141,0x91B7,0x91B5,0x91B2,0x91B3,0x940B,/* 0xE0-0xE7 */ + 0x9413,0x93FB,0x9420,0x940F,0x9414,0x93FE,0x9415,0x9410,/* 0xE8-0xEF */ + 0x9428,0x9419,0x940D,0x93F5,0x9400,0x93F7,0x9407,0x940E,/* 0xF0-0xF7 */ + 0x9416,0x9412,0x93FA,0x9409,0x93F8,0x940A,0x93FF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x93FC,0x940C,0x93F6,0x9411,0x9406,0x95DE,0x95E0,0x95DF,/* 
0x40-0x47 */ + 0x972E,0x972F,0x97B9,0x97BB,0x97FD,0x97FE,0x9860,0x9862,/* 0x48-0x4F */ + 0x9863,0x985F,0x98C1,0x98C2,0x9950,0x994E,0x9959,0x994C,/* 0x50-0x57 */ + 0x994B,0x9953,0x9A32,0x9A34,0x9A31,0x9A2C,0x9A2A,0x9A36,/* 0x58-0x5F */ + 0x9A29,0x9A2E,0x9A38,0x9A2D,0x9AC7,0x9ACA,0x9AC6,0x9B10,/* 0x60-0x67 */ + 0x9B12,0x9B11,0x9C0B,0x9C08,0x9BF7,0x9C05,0x9C12,0x9BF8,/* 0x68-0x6F */ + 0x9C40,0x9C07,0x9C0E,0x9C06,0x9C17,0x9C14,0x9C09,0x9D9F,/* 0x70-0x77 */ + 0x9D99,0x9DA4,0x9D9D,0x9D92,0x9D98,0x9D90,0x9D9B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9DA0,0x9D94,0x9D9C,0x9DAA,0x9D97,0x9DA1,0x9D9A,/* 0xA0-0xA7 */ + 0x9DA2,0x9DA8,0x9D9E,0x9DA3,0x9DBF,0x9DA9,0x9D96,0x9DA6,/* 0xA8-0xAF */ + 0x9DA7,0x9E99,0x9E9B,0x9E9A,0x9EE5,0x9EE4,0x9EE7,0x9EE6,/* 0xB0-0xB7 */ + 0x9F30,0x9F2E,0x9F5B,0x9F60,0x9F5E,0x9F5D,0x9F59,0x9F91,/* 0xB8-0xBF */ + 0x513A,0x5139,0x5298,0x5297,0x56C3,0x56BD,0x56BE,0x5B48,/* 0xC0-0xC7 */ + 0x5B47,0x5DCB,0x5DCF,0x5EF1,0x61FD,0x651B,0x6B02,0x6AFC,/* 0xC8-0xCF */ + 0x6B03,0x6AF8,0x6B00,0x7043,0x7044,0x704A,0x7048,0x7049,/* 0xD0-0xD7 */ + 0x7045,0x7046,0x721D,0x721A,0x7219,0x737E,0x7517,0x766A,/* 0xD8-0xDF */ + 0x77D0,0x792D,0x7931,0x792F,0x7C54,0x7C53,0x7CF2,0x7E8A,/* 0xE0-0xE7 */ + 0x7E87,0x7E88,0x7E8B,0x7E86,0x7E8D,0x7F4D,0x7FBB,0x8030,/* 0xE8-0xEF */ + 0x81DD,0x8618,0x862A,0x8626,0x861F,0x8623,0x861C,0x8619,/* 0xF0-0xF7 */ + 0x8627,0x862E,0x8621,0x8620,0x8629,0x861E,0x8625,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8829,0x881D,0x881B,0x8820,0x8824,0x881C,0x882B,0x884A,/* 0x40-0x47 */ + 0x896D,0x8969,0x896E,0x896B,0x89FA,0x8B79,0x8B78,0x8B45,/* 0x48-0x4F */ + 0x8B7A,0x8B7B,0x8D10,0x8D14,0x8DAF,0x8E8E,0x8E8C,0x8F5E,/* 0x50-0x57 */ + 0x8F5B,0x8F5D,0x9146,0x9144,0x9145,0x91B9,0x943F,0x943B,/* 0x58-0x5F */ + 0x9436,0x9429,0x943D,0x943C,0x9430,0x9439,0x942A,0x9437,/* 0x60-0x67 */ + 0x942C,0x9440,0x9431,0x95E5,0x95E4,0x95E3,0x9735,0x973A,/* 0x68-0x6F */ + 0x97BF,0x97E1,0x9864,0x98C9,0x98C6,0x98C0,0x9958,0x9956,/* 0x70-0x77 */ + 0x9A39,0x9A3D,0x9A46,0x9A44,0x9A42,0x9A41,0x9A3A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9A3F,0x9ACD,0x9B15,0x9B17,0x9B18,0x9B16,0x9B3A,/* 0xA0-0xA7 */ + 0x9B52,0x9C2B,0x9C1D,0x9C1C,0x9C2C,0x9C23,0x9C28,0x9C29,/* 0xA8-0xAF */ + 0x9C24,0x9C21,0x9DB7,0x9DB6,0x9DBC,0x9DC1,0x9DC7,0x9DCA,/* 0xB0-0xB7 */ + 0x9DCF,0x9DBE,0x9DC5,0x9DC3,0x9DBB,0x9DB5,0x9DCE,0x9DB9,/* 0xB8-0xBF */ + 
0x9DBA,0x9DAC,0x9DC8,0x9DB1,0x9DAD,0x9DCC,0x9DB3,0x9DCD,/* 0xC0-0xC7 */ + 0x9DB2,0x9E7A,0x9E9C,0x9EEB,0x9EEE,0x9EED,0x9F1B,0x9F18,/* 0xC8-0xCF */ + 0x9F1A,0x9F31,0x9F4E,0x9F65,0x9F64,0x9F92,0x4EB9,0x56C6,/* 0xD0-0xD7 */ + 0x56C5,0x56CB,0x5971,0x5B4B,0x5B4C,0x5DD5,0x5DD1,0x5EF2,/* 0xD8-0xDF */ + 0x6521,0x6520,0x6526,0x6522,0x6B0B,0x6B08,0x6B09,0x6C0D,/* 0xE0-0xE7 */ + 0x7055,0x7056,0x7057,0x7052,0x721E,0x721F,0x72A9,0x737F,/* 0xE8-0xEF */ + 0x74D8,0x74D5,0x74D9,0x74D7,0x766D,0x76AD,0x7935,0x79B4,/* 0xF0-0xF7 */ + 0x7A70,0x7A71,0x7C57,0x7C5C,0x7C59,0x7C5B,0x7C5A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7CF4,0x7CF1,0x7E91,0x7F4F,0x7F87,0x81DE,0x826B,0x8634,/* 0x40-0x47 */ + 0x8635,0x8633,0x862C,0x8632,0x8636,0x882C,0x8828,0x8826,/* 0x48-0x4F */ + 0x882A,0x8825,0x8971,0x89BF,0x89BE,0x89FB,0x8B7E,0x8B84,/* 0x50-0x57 */ + 0x8B82,0x8B86,0x8B85,0x8B7F,0x8D15,0x8E95,0x8E94,0x8E9A,/* 0x58-0x5F */ + 0x8E92,0x8E90,0x8E96,0x8E97,0x8F60,0x8F62,0x9147,0x944C,/* 0x60-0x67 */ + 0x9450,0x944A,0x944B,0x944F,0x9447,0x9445,0x9448,0x9449,/* 0x68-0x6F */ + 0x9446,0x973F,0x97E3,0x986A,0x9869,0x98CB,0x9954,0x995B,/* 0x70-0x77 */ + 0x9A4E,0x9A53,0x9A54,0x9A4C,0x9A4F,0x9A48,0x9A4A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9A49,0x9A52,0x9A50,0x9AD0,0x9B19,0x9B2B,0x9B3B,/* 0xA0-0xA7 */ + 0x9B56,0x9B55,0x9C46,0x9C48,0x9C3F,0x9C44,0x9C39,0x9C33,/* 0xA8-0xAF */ + 0x9C41,0x9C3C,0x9C37,0x9C34,0x9C32,0x9C3D,0x9C36,0x9DDB,/* 0xB0-0xB7 */ + 0x9DD2,0x9DDE,0x9DDA,0x9DCB,0x9DD0,0x9DDC,0x9DD1,0x9DDF,/* 0xB8-0xBF */ + 0x9DE9,0x9DD9,0x9DD8,0x9DD6,0x9DF5,0x9DD5,0x9DDD,0x9EB6,/* 0xC0-0xC7 */ + 0x9EF0,0x9F35,0x9F33,0x9F32,0x9F42,0x9F6B,0x9F95,0x9FA2,/* 0xC8-0xCF */ + 0x513D,0x5299,0x58E8,0x58E7,0x5972,0x5B4D,0x5DD8,0x882F,/* 0xD0-0xD7 */ + 0x5F4F,0x6201,0x6203,0x6204,0x6529,0x6525,0x6596,0x66EB,/* 0xD8-0xDF */ + 0x6B11,0x6B12,0x6B0F,0x6BCA,0x705B,0x705A,0x7222,0x7382,/* 0xE0-0xE7 */ + 0x7381,0x7383,0x7670,0x77D4,0x7C67,0x7C66,0x7E95,0x826C,/* 0xE8-0xEF */ + 0x863A,0x8640,0x8639,0x863C,0x8631,0x863B,0x863E,0x8830,/* 0xF0-0xF7 */ + 0x8832,0x882E,0x8833,0x8976,0x8974,0x8973,0x89FE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B8C,0x8B8E,0x8B8B,0x8B88,0x8C45,0x8D19,0x8E98,0x8F64,/* 0x40-0x47 */ + 0x8F63,0x91BC,0x9462,0x9455,0x945D,0x9457,0x945E,0x97C4,/* 0x48-0x4F */ + 0x97C5,0x9800,0x9A56,0x9A59,0x9B1E,0x9B1F,0x9B20,0x9C52,/* 0x50-0x57 */ + 0x9C58,0x9C50,0x9C4A,0x9C4D,0x9C4B,0x9C55,0x9C59,0x9C4C,/* 0x58-0x5F */ + 0x9C4E,0x9DFB,0x9DF7,0x9DEF,0x9DE3,0x9DEB,0x9DF8,0x9DE4,/* 0x60-0x67 */ + 0x9DF6,0x9DE1,0x9DEE,0x9DE6,0x9DF2,0x9DF0,0x9DE2,0x9DEC,/* 0x68-0x6F */ + 0x9DF4,0x9DF3,0x9DE8,0x9DED,0x9EC2,0x9ED0,0x9EF2,0x9EF3,/* 0x70-0x77 */ + 0x9F06,0x9F1C,0x9F38,0x9F37,0x9F36,0x9F43,0x9F4F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9F71,0x9F70,0x9F6E,0x9F6F,0x56D3,0x56CD,0x5B4E,/* 0xA0-0xA7 */ + 0x5C6D,0x652D,0x66ED,0x66EE,0x6B13,0x705F,0x7061,0x705D,/* 0xA8-0xAF */ + 0x7060,0x7223,0x74DB,0x74E5,0x77D5,0x7938,0x79B7,0x79B6,/* 0xB0-0xB7 */ + 0x7C6A,0x7E97,0x7F89,0x826D,0x8643,0x8838,0x8837,0x8835,/* 0xB8-0xBF */ + 0x884B,0x8B94,0x8B95,0x8E9E,0x8E9F,0x8EA0,0x8E9D,0x91BE,/* 0xC0-0xC7 */ + 0x91BD,0x91C2,0x946B,0x9468,0x9469,0x96E5,0x9746,0x9743,/* 0xC8-0xCF */ + 0x9747,0x97C7,0x97E5,0x9A5E,0x9AD5,0x9B59,0x9C63,0x9C67,/* 0xD0-0xD7 */ + 0x9C66,0x9C62,0x9C5E,0x9C60,0x9E02,0x9DFE,0x9E07,0x9E03,/* 0xD8-0xDF */ + 0x9E06,0x9E05,0x9E00,0x9E01,0x9E09,0x9DFF,0x9DFD,0x9E04,/* 0xE0-0xE7 */ + 0x9EA0,0x9F1E,0x9F46,0x9F74,0x9F75,0x9F76,0x56D4,0x652E,/* 0xE8-0xEF */ + 0x65B8,0x6B18,0x6B19,0x6B17,0x6B1A,0x7062,0x7226,0x72AA,/* 0xF0-0xF7 */ + 0x77D8,0x77D9,0x7939,0x7C69,0x7C6B,0x7CF6,0x7E9A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E98,0x7E9B,0x7E99,0x81E0,0x81E1,0x8646,0x8647,0x8648,/* 0x40-0x47 */ + 0x8979,0x897A,0x897C,0x897B,0x89FF,0x8B98,0x8B99,0x8EA5,/* 0x48-0x4F */ + 0x8EA4,0x8EA3,0x946E,0x946D,0x946F,0x9471,0x9473,0x9749,/* 0x50-0x57 */ + 0x9872,0x995F,0x9C68,0x9C6E,0x9C6D,0x9E0B,0x9E0D,0x9E10,/* 0x58-0x5F */ + 0x9E0F,0x9E12,0x9E11,0x9EA1,0x9EF5,0x9F09,0x9F47,0x9F78,/* 0x60-0x67 */ + 0x9F7B,0x9F7A,0x9F79,0x571E,0x7066,0x7C6F,0x883C,0x8DB2,/* 0x68-0x6F */ + 0x8EA6,0x91C3,0x9474,0x9478,0x9476,0x9475,0x9A60,0x9C74,/* 0x70-0x77 */ + 0x9C73,0x9C71,0x9C75,0x9E14,0x9E13,0x9EF6,0x9F0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9FA4,0x7068,0x7065,0x7CF7,0x866A,0x883E,0x883D,/* 0xA0-0xA7 */ + 0x883F,0x8B9E,0x8C9C,0x8EA9,0x8EC9,0x974B,0x9873,0x9874,/* 0xA8-0xAF */ + 
0x98CC,0x9961,0x99AB,0x9A64,0x9A66,0x9A67,0x9B24,0x9E15,/* 0xB0-0xB7 */ + 0x9E17,0x9F48,0x6207,0x6B1E,0x7227,0x864C,0x8EA8,0x9482,/* 0xB8-0xBF */ + 0x9480,0x9481,0x9A69,0x9A68,0x9B2E,0x9E19,0x7229,0x864B,/* 0xC0-0xC7 */ + 0x8B9F,0x9483,0x9C79,0x9EB7,0x7675,0x9A6B,0x9C7A,0x9E1D,/* 0xC8-0xCF */ + 0x7069,0x706A,0x9EA4,0x9F7E,0x9F49,0x9F98,0x7881,0x92B9,/* 0xD0-0xD7 */ + 0x88CF,0x58BB,0x6052,0x7CA7,0x5AFA,0x2554,0x2566,0x2557,/* 0xD8-0xDF */ + 0x2560,0x256C,0x2563,0x255A,0x2569,0x255D,0x2552,0x2564,/* 0xE0-0xE7 */ + 0x2555,0x255E,0x256A,0x2561,0x2558,0x2567,0x255B,0x2553,/* 0xE8-0xEF */ + 0x2565,0x2556,0x255F,0x256B,0x2562,0x2559,0x2568,0x255C,/* 0xF0-0xF7 */ + 0x2551,0x2550,0x256D,0x256E,0x2570,0x256F,0x2593,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, NULL, + NULL, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0xBE, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x90-0x93 */ + 0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x94-0x97 */ + 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, 0xA3, 0x4E, /* 0x98-0x9B */ + 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, 0xA3, 0x52, /* 0x9C-0x9F */ + 0xA3, 0x53, 0xA3, 0x54, 0x00, 0x00, 0xA3, 0x55, /* 0xA0-0xA3 */ + 0xA3, 0x56, 0xA3, 0x57, 0xA3, 0x58, 0xA3, 0x59, /* 0xA4-0xA7 */ + 0xA3, 0x5A, 0xA3, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA3, 0x5C, 0xA3, 0x5D, 0xA3, 0x5E, /* 0xB0-0xB3 */ + 0xA3, 0x5F, 0xA3, 0x60, 0xA3, 0x61, 0xA3, 0x62, /* 0xB4-0xB7 */ + 0xA3, 0x63, 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, /* 0xB8-0xBB */ + 0xA3, 0x67, 0xA3, 0x68, 0xA3, 0x69, 0xA3, 0x6A, /* 0xBC-0xBF */ + 0xA3, 0x6B, 0xA3, 0x6C, 0x00, 0x00, 0xA3, 0x6D, /* 0xC0-0xC3 */ + 0xA3, 0x6E, 0xA3, 0x6F, 0xA3, 0x70, 0xA3, 0x71, /* 0xC4-0xC7 */ + 0xA3, 0x72, 0xA3, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x56, /* 0x10-0x13 */ + 0xA1, 0x58, 0xA2, 0x77, 0xA1, 0xFC, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xA7, 0xA1, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0x45, 0x00, 0x00, /* 0x20-0x23 */ + 0xA3, 0xBB, 0xA1, 0x4C, 0xA1, 0x4B, 0xA1, 0x45, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xAC, 0xA1, 0xB2, /* 0x30-0x33 */ + 0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xA1, 0xB0, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC3, 0x00, 0x00, /* 0x3C-0x3F */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0x4A, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA2, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x60-0x63 */ + 0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x64-0x67 */ + 0xA2, 0xC1, 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xF6, 0xA1, 0xF4, 0xA1, 0xF7, 0xA1, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x94-0x97 */ + 0xA1, 0xFB, 0xA1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_22[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA2, 0x41, 0xA2, 0x42, 0x00, 0x00, /* 0x14-0x17 */ + 0xA2, 0x58, 0x00, 0x00, 0xA1, 0xD4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, 0xA1, 0xE8, /* 0x1C-0x1F */ + 0xA1, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xFD, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, 0xA1, 0xEC, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 
0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDC, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xDA, 0xA1, 0xDD, 0x00, 0x00, 0xA1, 0xDD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xD9, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA1, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE9, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x5B, /* 0x04-0x07 */ +}; + +static const unsigned char u2c_25[512] = { + 0xA2, 0x77, 0x00, 0x00, 0xA2, 0x78, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xA2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xA2, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA2, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA2, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xA2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xA2, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA2, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA2, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xF9, 0xF9, 0xF8, 0xF9, 0xE6, 0xF9, 0xEF, /* 0x50-0x53 */ + 0xF9, 0xDD, 0xF9, 0xE8, 0xF9, 0xF1, 0xF9, 0xDF, /* 0x54-0x57 */ + 0xF9, 0xEC, 0xF9, 0xF5, 0xF9, 0xE3, 0xF9, 0xEE, /* 0x58-0x5B */ + 0xF9, 0xF7, 0xF9, 0xE5, 0xF9, 0xE9, 0xF9, 0xF2, /* 0x5C-0x5F */ + 0xF9, 0xE0, 0xF9, 0xEB, 0xF9, 0xF4, 0xF9, 0xE2, /* 0x60-0x63 */ + 0xF9, 0xE7, 0xF9, 0xF0, 0xF9, 0xDE, 0xF9, 0xED, /* 0x64-0x67 */ + 0xF9, 0xF6, 0xF9, 0xE4, 0xF9, 0xEA, 0xF9, 0xF3, /* 0x68-0x6B */ + 0xF9, 0xE1, 0xA2, 0x7E, 0xA2, 0xA1, 0xA2, 0xA3, /* 0x6C-0x6F */ + 0xA2, 0xA2, 0xA2, 0xAC, 0xA2, 0xAD, 0xA2, 0xAE, /* 0x70-0x73 */ + 0xA1, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA2, 0x62, 0xA2, 0x63, 0xA2, 0x64, /* 0x80-0x83 */ + 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, 0xA2, 0x68, /* 0x84-0x87 */ + 0xA2, 0x69, 0xA2, 0x70, 0xA2, 0x6F, 0xA2, 0x6E, /* 0x88-0x8B */ + 0xA2, 0x6D, 0xA2, 0x6C, 0xA2, 0x6B, 0xA2, 0x6A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, /* 0x90-0x93 */ + 0xA2, 0x76, 0xA2, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xBD, 0xA1, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB6, 0xA1, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA1, 0xBF, 0xA1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xBB, 0xA1, 0xBA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB3, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB7, 0xA1, 0xB4, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA8, 0xA2, 0xA9, /* 0xE0-0xE3 */ + 0xA2, 0xAB, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xB9, 0xA1, 0xB8, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xF0, 0xA1, 0xF2, 0xA1, 0xF1, 0x00, 0x00, /* 0x40-0x43 */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0x40, 0xA1, 0x42, 0xA1, 0x43, 0xA1, 0xB2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0x71, 0xA1, 0x72, 0xA1, 0x6D, 0xA1, 0x6E, /* 0x08-0x0B */ + 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x79, 0xA1, 0x7A, /* 0x0C-0x0F */ + 0xA1, 0x69, 0xA1, 0x6A, 0xA2, 0x45, 0x00, 0x00, /* 0x10-0x13 */ + 0xA1, 0x65, 0xA1, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xE3, 0xA1, 0xA9, 0xA1, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA2, 0xC3, 0xA2, 0xC4, 0xA2, 0xC5, /* 0x20-0x23 */ + 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xC9, /* 0x24-0x27 */ + 0xA2, 0xCA, 0xA2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, /* 0x04-0x07 */ + 0xA3, 0x77, 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, /* 0x08-0x0B */ + 0xA3, 0x7B, 0xA3, 0x7C, 0xA3, 0x7D, 0xA3, 0x7E, /* 0x0C-0x0F */ + 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, 0xA3, 0xA4, /* 0x10-0x13 */ + 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, 0xA3, 0xA8, /* 0x14-0x17 */ + 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, 0xA3, 0xAC, /* 0x18-0x1B */ + 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, 0xA3, 0xB0, /* 0x1C-0x1F */ + 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, 0xA3, 0xB4, /* 0x20-0x23 */ + 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, 0xA3, 0xB8, /* 0x24-0x27 */ + 0xA3, 0xB9, 0xA3, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x40, 0xA4, 0x47, /* 0x90-0x93 */ + 0xA4, 0x54, 0xA5, 0x7C, 0xA4, 0x57, 0xA4, 0xA4, /* 0x94-0x97 */ + 0xA4, 0x55, 0xA5, 0xD2, 0xA4, 0x41, 0xA4, 0xFE, /* 0x98-0x9B */ + 0xA4, 0x42, 0xA4, 0xD1, 0xA6, 0x61, 0xA4, 0x48, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x20-0x23 */ + 0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x24-0x27 */ + 0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x28-0x2B */ + 0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x2C-0x2F */ + 0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x30-0x33 */ + 0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x34-0x37 */ + 0xB3, 0xD2, 0xA5, 0x4E, 0xA9, 0x49, 0xBE, 0xC7, /* 0x38-0x3B */ + 0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0x3C-0x3F */ + 0xB2, 0xBD, 0xA5, 0xF0, 0xA6, 0xDB, 0xA6, 0xDC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x80-0x83 */ + 0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x84-0x87 */ + 0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x88-0x8B */ + 0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x8C-0x8F */ + 0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x90-0x93 */ + 0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x94-0x97 */ + 0xB3, 0xD2, 0xAF, 0xB5, 0xA8, 0x6B, 0xA4, 0x6B, /* 0x98-0x9B */ + 0xBE, 0x41, 0xC0, 0x75, 0xA6, 0x4C, 0xAA, 0x60, /* 0x9C-0x9F */ + 0xB6, 0xB5, 0xA5, 0xF0, 0xBC, 0x67, 0xA1, 0xC0, /* 0xA0-0xA3 */ + 0xA4, 0x57, 0xA4, 0xA4, 0xA4, 0x55, 0xA5, 0xAA, /* 0xA4-0xA7 */ + 0xA5, 0x6B, 0xC2, 0xE5, 0xA9, 0x76, 0xBE, 0xC7, /* 0xA8-0xAB */ + 0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0xAC-0xAF */ + 0xA9, 
0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0x55, 0xA2, 0x56, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA2, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0x53, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 
0x00, 0xA1, 0xEB, 0xA1, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xA2, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_4E[512] = { + 0xA4, 0x40, 0xA4, 0x42, 0x00, 0x00, 0xA4, 0x43, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x45, /* 0x04-0x07 */ + 0xA4, 0x56, 0xA4, 0x54, 0xA4, 0x57, 0xA4, 0x55, /* 0x08-0x0B */ + 0xC9, 0x46, 0xA4, 0xA3, 0xC9, 0x4F, 0xC9, 0x4D, /* 0x0C-0x0F */ + 0xA4, 0xA2, 0xA4, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xA5, 0x42, 0xA5, 0x41, 0xA5, 0x40, 0x00, 0x00, /* 0x14-0x17 */ + 0xA5, 0x43, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xE0, 0xA5, 0xE1, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0x58, /* 0x28-0x2B */ + 0x00, 0x00, 0xA4, 0xA4, 0xC9, 0x50, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA4, 0xA5, 0xC9, 0x63, 0xA6, 0xEA, 0xCB, 0xB1, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xA4, 0x59, 0xA4, 0xA6, 0x00, 0x00, 0xA5, 0x44, /* 0x38-0x3B */ + 0xC9, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0x40, 0xA4, 0x44, /* 0x40-0x43 */ + 0x00, 0x00, 0xA4, 0x5B, 0x00, 0x00, 0xC9, 0x47, /* 0x44-0x47 */ + 0xA4, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xA7, /* 0x48-0x4B */ + 0x00, 0x00, 0xA5, 0x45, 0xA5, 0x47, 0xA5, 0x46, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xE2, 0xA5, 0xE3, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC4, 0x00, 0x00, /* 0x54-0x57 */ + 0xAD, 0xBC, 0xA4, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xC9, 0x41, 0xA4, 0x45, 0xA4, 0x5E, 0xA4, 0x5D, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xA5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC5, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xAE, 0xD4, 0x4B, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xC3, 0xDC, 0xB1, /* 0x80-0x83 */ + 0xDC, 0xB2, 0x00, 0x00, 0xA4, 0x46, 0x00, 0x00, /* 0x84-0x87 */ + 0xA4, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC6, /* 0x88-0x8B */ + 0xA4, 0x47, 0xC9, 0x48, 0xA4, 0x5F, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA4, 0xAA, 0xA4, 0xAC, 0xC9, 0x51, /* 0x90-0x93 */ + 0xA4, 0xAD, 0xA4, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA5, 0xE5, 0x00, 0x00, 0xA8, 0xC7, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC8, 0xAB, 0x45, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA4, 0x60, 0xA4, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA5, 0xE6, 0xA5, 0xE8, 0xA5, 0xE7, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC9, /* 0xA8-0xAB */ + 0xA8, 0xCA, 0xAB, 0x46, 0xAB, 0x47, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF6, 0xD6, 0xA4, 0x48, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xA4, 0xB0, 0xA4, 0xAF, 0xC9, 0x52, 0xA4, 0xB1, /* 0xC0-0xC3 */ + 0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xC4-0xC7 */ + 0xC9, 0x54, 0xC9, 0x53, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xC8-0xCB */ + 0x00, 
0x00, 0xA4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA5, 0x4A, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0xD4-0xD7 */ + 0xA5, 0x49, 0xA5, 0x50, 0xC9, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC9, 0x66, 0xC9, 0x69, 0xA5, 0x51, 0xA5, 0x61, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC9, 0x68, 0x00, 0x00, 0xA5, 0x4E, /* 0xE0-0xE3 */ + 0xA5, 0x4F, 0xA5, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xC9, 0x65, 0xC9, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA5, 0xF5, 0xC9, 0xB0, 0xA5, 0xF2, 0xA5, 0xF6, /* 0xF0-0xF3 */ + 0xC9, 0xBA, 0xC9, 0xAE, 0xA5, 0xF3, 0xC9, 0xB2, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0xF4, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA5, 0xF7, 0x00, 0x00, 0xA5, 0xE9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0xC9, 0xB1, 0xA5, 0xF8, 0xC9, 0xB5, 0x00, 0x00, /* 0x00-0x03 */ + 0xC9, 0xB9, 0xC9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xC9, 0xB3, 0xA5, 0xEA, 0xA5, 0xEC, 0xA5, 0xF9, /* 0x08-0x0B */ + 0x00, 0x00, 0xA5, 0xEE, 0xC9, 0xAB, 0xA5, 0xF1, /* 0x0C-0x0F */ + 0xA5, 0xEF, 0xA5, 0xF0, 0xC9, 0xBB, 0xC9, 0xB8, /* 0x10-0x13 */ + 0xC9, 0xAF, 0xA5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xC9, 0xAC, 0xA5, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xC9, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xB7, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xC9, 0xAD, 0xCA, 0x66, 0x00, 0x00, 0xA7, 0x42, /* 0x2C-0x2F */ + 0xA6, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x67, /* 0x30-0x33 */ + 0xA6, 0xF1, 0x00, 0x00, 0xA7, 0x44, 0x00, 0x00, /* 0x34-0x37 */ + 0xA6, 0xF9, 0x00, 0x00, 0xA6, 0xF8, 0xCA, 0x5B, /* 0x38-0x3B */ + 0xA6, 0xFC, 0xA6, 0xF7, 0xCA, 0x60, 0xCA, 0x68, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCA, 0x64, 0x00, 0x00, 0xA6, 0xFA, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xFD, 0xA6, 0xEE, /* 0x44-0x47 */ + 0xA7, 0x47, 0xCA, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCB, 0xBD, 0xA6, 0xEC, 0xA7, 0x43, 0xA6, 0xED, /* 0x4C-0x4F */ + 0xA6, 0xF5, 0xA6, 0xF6, 0xCA, 0x62, 0xCA, 0x5E, /* 0x50-0x53 */ + 0xA6, 0xFB, 0xA6, 0xF3, 0xCA, 0x5A, 0xA6, 0xEF, /* 0x54-0x57 */ + 0xCA, 0x65, 0xA7, 0x45, 0xA7, 0x48, 0xA6, 0xF2, /* 0x58-0x5B */ + 0xA7, 0x40, 0xA7, 0x46, 0xA6, 0xF0, 0xCA, 0x63, /* 0x5C-0x5F */ + 0xA7, 0x41, 0xCA, 0x69, 0xCA, 0x5C, 0xA6, 0xFE, /* 0x60-0x63 */ + 0xCA, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x61, /* 0x64-0x67 */ + 0x00, 0x00, 0xA8, 0xD8, 0xCB, 0xBF, 0xCB, 0xCB, /* 0x68-0x6B */ + 0xA8, 0xD0, 0x00, 0x00, 0xCB, 0xCC, 0xA8, 0xCB, /* 0x6C-0x6F */ + 0xA8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xCE, /* 0x70-0x73 */ + 0xCB, 0xB9, 0xA8, 0xD6, 0xCB, 0xB8, 0xCB, 0xBC, /* 0x74-0x77 */ + 0xCB, 0xC3, 0xCB, 0xC1, 0xA8, 0xDE, 0xA8, 0xD9, /* 0x78-0x7B */ + 0xCB, 0xB3, 0xCB, 0xB5, 0xA8, 0xDB, 0xA8, 0xCF, /* 0x7C-0x7F */ + + 0xCB, 0xB6, 0xCB, 0xC2, 0xCB, 0xC9, 0xA8, 0xD4, /* 0x80-0x83 */ + 0xCB, 0xBB, 0xCB, 0xB4, 0xA8, 0xD3, 0xCB, 0xB7, /* 0x84-0x87 */ + 0xA8, 0xD7, 0xCB, 0xBA, 0x00, 0x00, 0xA8, 0xD2, /* 0x88-0x8B */ + 0x00, 0x00, 0xA8, 0xCD, 0x00, 0x00, 0xA8, 0xDC, /* 0x8C-0x8F */ + 0xCB, 0xC4, 0xA8, 0xDD, 0xCB, 0xC8, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0xC6, 0xCB, 0xCA, 0xA8, 0xDA, 0xCB, 0xBE, /* 0x94-0x97 */ + 0xCB, 0xB2, 0x00, 0x00, 0xCB, 0xC0, 0xA8, 0xD1, /* 0x98-0x9B */ + 0xCB, 0xC5, 0xA8, 0xCC, 0xCB, 0xC7, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x56, 0xAB, 0x4A, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xCD, 0xE8, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xAB, 0x49, 0xAB, 0x51, 0xAB, 0x5D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xCD, 0xEE, 0xCD, 0xEC, 0xCD, 0xE7, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x4B, /* 0xBC-0xBF */ + 0xCD, 0xED, 0xCD, 0xE3, 0xAB, 0x59, 0xAB, 0x50, /* 0xC0-0xC3 */ + 0xAB, 0x58, 0xCD, 0xDE, 0x00, 0x00, 0xCD, 0xEA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCD, 0xE1, 0xAB, 0x54, 0xCD, 0xE2, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xDD, 0xAB, 0x5B, 0xAB, 0x4E, /* 0xCC-0xCF */ + 0xAB, 0x57, 0xAB, 0x4D, 0x00, 0x00, 0xCD, 0xDF, /* 0xD0-0xD3 */ + 0xCD, 0xE4, 0x00, 0x00, 0xCD, 0xEB, 0xAB, 0x55, /* 0xD4-0xD7 */ + 0xAB, 0x52, 0xCD, 0xE6, 0xAB, 0x5A, 0xCD, 0xE9, /* 0xD8-0xDB */ + 0xCD, 0xE5, 0xAB, 0x4F, 0xAB, 0x5C, 0xAB, 0x53, /* 0xDC-0xDF */ + 0xAB, 0x4C, 0xAB, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCD, 0xEF, 0x00, 0x00, 0xAD, 0xD7, 0xAD, 0xC1, /* 0xEC-0xEF */ + 0x00, 0x00, 0xAD, 0xD1, 0x00, 0x00, 0xAD, 0xD6, /* 0xF0-0xF3 */ + 0xD0, 0xD0, 0xD0, 0xCF, 0xD0, 0xD4, 0xD0, 0xD5, /* 0xF4-0xF7 */ + 0xAD, 0xC4, 0x00, 0x00, 0xAD, 0xCD, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xDA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0xAD, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xD0, 0xC9, 0xAD, 0xC7, 0xD0, 0xCA, /* 0x04-0x07 */ + 0x00, 0x00, 0xAD, 0xDC, 0x00, 0x00, 0xAD, 0xD3, /* 0x08-0x0B */ + 0xAD, 0xBE, 0xAD, 0xBF, 0xD0, 0xDD, 0xB0, 0xBF, /* 0x0C-0x0F */ + 0x00, 0x00, 0xAD, 0xCC, 0xAD, 0xCB, 0xD0, 0xCB, /* 0x10-0x13 */ + 0xAD, 0xCF, 0xD4, 0x5B, 0xAD, 0xC6, 0xD0, 0xD6, /* 0x14-0x17 */ + 0xAD, 0xD5, 0xAD, 0xD4, 0xAD, 0xCA, 0xD0, 0xCE, /* 0x18-0x1B */ + 0xD0, 0xD7, 0x00, 0x00, 0xD0, 0xC8, 0xAD, 0xC9, /* 0x1C-0x1F */ + 0xD0, 0xD8, 0xAD, 0xD2, 0xD0, 0xCC, 0xAD, 0xC0, /* 0x20-0x23 */ + 0x00, 0x00, 0xAD, 0xC3, 0xAD, 0xC2, 0xD0, 0xD9, /* 0x24-0x27 */ + 0xAD, 0xD0, 0xAD, 0xC5, 0xAD, 0xD9, 0xAD, 0xDB, /* 0x28-0x2B */ + 0xD0, 0xD3, 0xAD, 0xD8, 0x00, 0x00, 0xD0, 0xDB, /* 0x2C-0x2F */ + 0xD0, 0xCD, 0xD0, 0xDC, 0x00, 0x00, 0xD0, 0xD1, /* 0x30-0x33 */ + 0x00, 0x00, 0xD0, 0xDA, 0x00, 0x00, 0xD0, 0xD2, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xAD, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD4, 0x63, 0xD4, 0x57, 0x00, 0x00, 0xB0, 0xB3, /* 0x40-0x43 */ + 0x00, 0x00, 0xD4, 0x5C, 0xD4, 0x62, 0xB0, 0xB2, /* 0x44-0x47 */ + 0xD4, 0x55, 0xB0, 0xB6, 0xD4, 0x59, 0xD4, 0x52, /* 0x48-0x4B */ + 0xB0, 0xB4, 0xD4, 0x56, 0xB0, 0xB9, 0xB0, 0xBE, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD4, 0x67, 0x00, 0x00, 0xD4, 0x51, /* 0x50-0x53 */ + 0x00, 0x00, 0xB0, 0xBA, 0x00, 0x00, 0xD4, 0x66, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xB5, 0xD4, 0x58, /* 0x58-0x5B */ + 0xB0, 0xB1, 0xD4, 0x53, 0xD4, 0x4F, 0xD4, 0x5D, /* 0x5C-0x5F */ + 0xD4, 0x50, 0xD4, 0x4E, 0xD4, 0x5A, 0xD4, 0x60, /* 0x60-0x63 */ + 0xD4, 0x61, 0xB0, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD8, 0x5B, 0xD4, 0x5E, 0xD4, 0x4D, 0xD4, 0x5F, /* 0x68-0x6B */ + 0x00, 0x00, 0xB0, 0xC1, 0xD4, 0x64, 0xB0, 0xC0, /* 0x6C-0x6F */ + 0xD4, 0x4C, 0x00, 0x00, 0xD4, 0x54, 0xD4, 0x65, /* 0x70-0x73 */ + 0xB0, 0xBC, 
0xB0, 0xBB, 0xB0, 0xB8, 0xB0, 0xBD, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xAF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xB0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xB3, 0xC8, 0x00, 0x00, 0xD8, 0x5E, 0xD8, 0x57, /* 0x80-0x83 */ + 0x00, 0x00, 0xB3, 0xC5, 0x00, 0x00, 0xD8, 0x5F, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x55, /* 0x88-0x8B */ + 0xD8, 0x58, 0xB3, 0xC4, 0xD8, 0x59, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB3, 0xC7, 0xD8, 0x5D, 0x00, 0x00, /* 0x90-0x93 */ + 0xD8, 0x53, 0xD8, 0x52, 0xB3, 0xC9, 0x00, 0x00, /* 0x94-0x97 */ + 0xB3, 0xCA, 0xB3, 0xC6, 0xB3, 0xCB, 0xD8, 0x51, /* 0x98-0x9B */ + 0xD8, 0x5C, 0xD8, 0x5A, 0xD8, 0x54, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC3, 0xD8, 0x56, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xB6, 0xCA, 0xB6, 0xC4, 0xDC, 0xB7, 0xB6, 0xCD, /* 0xAC-0xAF */ + 0xDC, 0xBD, 0xDC, 0xC0, 0xB6, 0xC6, 0xB6, 0xC7, /* 0xB0-0xB3 */ + 0xDC, 0xBA, 0xB6, 0xC5, 0xDC, 0xC3, 0xB6, 0xCB, /* 0xB4-0xB7 */ + 0xDC, 0xC4, 0x00, 0x00, 0xDC, 0xBF, 0xB6, 0xCC, /* 0xB8-0xBB */ + 0x00, 0x00, 0xDC, 0xB4, 0xB6, 0xC9, 0xDC, 0xB5, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDC, 0xBE, 0xDC, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xB8, 0xB6, 0xC8, 0xDC, 0xB6, 0xB6, 0xCE, /* 0xC4-0xC7 */ + 0xDC, 0xBB, 0xDC, 0xC2, 0xDC, 0xB9, 0xDC, 0xC1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xB6, 0xB9, 0xB3, /* 0xCC-0xCF */ + 0x00, 0x00, 0xB9, 0xB4, 0x00, 0x00, 0xE0, 0xF9, /* 0xD0-0xD3 */ + 0xE0, 0xF1, 0xB9, 0xB2, 0xB9, 0xAF, 0xE0, 0xF2, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xB1, 0xE0, 0xF5, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE0, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE0, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFD, /* 0xE0-0xE3 */ + 0xE0, 0xF8, 0xB9, 0xAE, 0xE0, 0xF0, 0xB9, 0xAC, /* 0xE4-0xE7 */ + 0xE0, 0xF3, 0xB9, 0xB7, 0xE0, 0xF6, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0xFA, 0xB9, 0xB0, 0xB9, 0xAD, 0xE0, 0xFC, /* 0xEC-0xEF */ + 0xE0, 0xFB, 0xB9, 0xB5, 0x00, 0x00, 0xE0, 0xF4, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xBB, 0xF8, 0xE4, 0xEC, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE4, 0xE9, 0xBB, 0xF9, 0x00, 0x00, 0xBB, 0xF7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE4, 0xF0, 0xE4, 0xED, 0xE4, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0xBB, 0xF6, 0x00, 0x00, 0xBB, 0xFA, 0xE4, 0xE7, /* 0x00-0x03 */ + 0xBB, 0xF5, 0xBB, 0xFD, 0xE4, 0xEA, 0xE4, 0xEB, /* 0x04-0x07 */ + 0xBB, 0xFB, 0xBB, 0xFC, 0xE4, 0xF1, 0xE4, 0xEE, /* 0x08-0x0B */ + 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xBE, 0xAA, 0xE8, 0xF8, 0xBE, 0xA7, 0xE8, 0xF5, /* 0x10-0x13 */ + 0xBE, 0xA9, 0xBE, 0xAB, 0x00, 0x00, 0xE8, 0xF6, /* 0x14-0x17 */ + 0xBE, 0xA8, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x18-0x1B */ + 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x76, /* 0x1C-0x1F */ + 0xEC, 0xBD, 0xC0, 0x77, 0xEC, 0xBB, 0x00, 0x00, /* 0x20-0x23 */ + 0xEC, 0xBC, 0xEC, 0xBA, 0xEC, 0xB9, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xEC, 0xBE, 0xC0, 0x75, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xB8, 0xEF, 0xB9, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE4, 0xE8, 0xEF, 0xB7, 0xC0, 0x78, 0xC3, 0x5F, /* 0x30-0x33 */ + 0xF1, 0xEB, 0xF1, 0xEC, 0x00, 0x00, 0xC4, 0xD7, /* 0x34-0x37 */ + 0xC4, 0xD8, 0xF5, 0xC1, 0xF5, 0xC0, 0xC5, 0x6C, /* 0x38-0x3B */ + 0xC5, 0x6B, 0xF7, 0xD0, 0x00, 0x00, 0xA4, 0x49, /* 0x3C-0x3F */ + 0xA4, 0x61, 0xA4, 0xB9, 0x00, 0x00, 0xA4, 0xB8, /* 0x40-0x43 */ + 0xA5, 0x53, 0xA5, 0x52, 0xA5, 0xFC, 0xA5, 0xFB, /* 0x44-0x47 */ + 0xA5, 0xFD, 
0xA5, 0xFA, 0x00, 0x00, 0xA7, 0x4A, /* 0x48-0x4B */ + 0xA7, 0x49, 0xA7, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xE0, 0x00, 0x00, /* 0x50-0x53 */ + 0xA8, 0xDF, 0xA8, 0xE1, 0x00, 0x00, 0xAB, 0x5E, /* 0x54-0x57 */ + 0x00, 0x00, 0xA2, 0x59, 0xD0, 0xDE, 0xA2, 0x5A, /* 0x58-0x5B */ + 0xB0, 0xC2, 0xA2, 0x5C, 0xA2, 0x5B, 0xD8, 0x60, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA2, 0x5D, 0xB9, 0xB8, 0xA2, 0x5E, /* 0x60-0x63 */ + 0x00, 0x00, 0xA4, 0x4A, 0x00, 0x00, 0xA4, 0xBA, /* 0x64-0x67 */ + 0xA5, 0xFE, 0xA8, 0xE2, 0x00, 0x00, 0xA4, 0x4B, /* 0x68-0x6B */ + 0xA4, 0xBD, 0xA4, 0xBB, 0xA4, 0xBC, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xA6, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xA7, 0x4C, 0xA8, 0xE4, 0xA8, 0xE3, /* 0x74-0x77 */ + 0xA8, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xAD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xBE, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x4E, /* 0x84-0x87 */ + 0x00, 0x00, 0xA5, 0x54, 0xA5, 0x55, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xA6, 0x41, 0x00, 0x00, 0xCA, 0x6A, /* 0x8C-0x8F */ + 0x00, 0x00, 0xAB, 0x60, 0xAB, 0x5F, 0xD0, 0xE0, /* 0x90-0x93 */ + 0xD0, 0xDF, 0xB0, 0xC3, 0x00, 0x00, 0xA4, 0xBE, /* 0x94-0x97 */ + 0xC9, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAB, 0x61, 0x00, 0x00, 0xAD, 0xE0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xAD, 0xDE, 0xAD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0xAD, 0x00, 0x00, /* 0xA8-0xAB */ + 0xA5, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xA6, 0x42, 0xC9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x4D, 0xA7, 0x4E, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xCA, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xCB, 0xCE, 0xA8, 0xE6, 0xCB, 0xCF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD0, 0xE2, 0xD0, 0xE3, 0xAD, 0xE3, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD0, 0xE4, 0x00, 0x00, 0xD0, 0xE1, 0xAD, 0xE4, /* 0xC8-0xCB */ + 0xAD, 0xE2, 0xAD, 0xE1, 0xD0, 0xE5, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD4, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD8, 0x61, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, /* 0xD4-0xD7 */ + 0xE1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xBB, 0xFE, 0xBE, 0xAE, 0xE8, 0xF9, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA4, 0x4C, 0xA4, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xB0, 0xC4, 0xB3, 0xCD, 0x00, 0x00, 0xB9, 0xB9, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC9, 0x42, 0xA4, 0xBF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA5, 0x59, 0xA5, 0x57, 0xA5, 0x58, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA8, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xA4, 0x4D, 0xA4, 0x4E, 0x00, 0x00, 0xA4, 0x62, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x04-0x07 */ + 0xA4, 0xC2, 0xC9, 0xBE, 0xA5, 0x5A, 0x00, 0x00, /* 0x08-0x0B */ + 0xC9, 0x6B, 0x00, 0x00, 0xA6, 0x46, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC9, 0xBF, 0xA6, 0x44, 0xA6, 0x45, 0xC9, 0xBD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0x47, 0xA6, 0x43, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xCA, 0x6C, 
0xAA, 0xEC, 0xCA, 0x6D, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xA7, 0x50, 0xA7, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xA7, 0x53, 0xA7, 0x51, 0xA7, 0x52, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA8, 0xEC, 0xCB, 0xD4, 0xCB, 0xD1, 0xCB, 0xD2, /* 0x30-0x33 */ + 0x00, 0x00, 0xCB, 0xD0, 0xA8, 0xEE, 0xA8, 0xEA, /* 0x34-0x37 */ + 0xA8, 0xE9, 0x00, 0x00, 0xA8, 0xEB, 0xA8, 0xE8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xA8, 0xEF, 0x00, 0x00, 0xAB, 0x63, /* 0x40-0x43 */ + 0xCD, 0xF0, 0x00, 0x00, 0xCB, 0xD3, 0xAB, 0x68, /* 0x44-0x47 */ + 0x00, 0x00, 0xCD, 0xF1, 0xAB, 0x64, 0xAB, 0x67, /* 0x48-0x4B */ + 0xAB, 0x66, 0xAB, 0x65, 0xAB, 0x62, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE8, 0x00, 0x00, /* 0x50-0x53 */ + 0xAD, 0xE7, 0xD0, 0xEB, 0xAD, 0xE5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0xAD, 0xE8, /* 0x58-0x5B */ + 0xAD, 0xE6, 0xAD, 0xE9, 0xD0, 0xE9, 0xD0, 0xEA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD0, 0xE6, 0xD0, 0xEC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xB3, 0xD1, 0xB0, 0xC5, 0xD4, 0x69, /* 0x68-0x6B */ + 0xD4, 0x6B, 0xD4, 0x6A, 0xD4, 0x6C, 0xB0, 0xC6, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xCE, 0x00, 0x00, /* 0x70-0x73 */ + 0xB3, 0xCF, 0xB3, 0xD0, 0x00, 0x00, 0xB6, 0xD0, /* 0x74-0x77 */ + 0xDC, 0xC7, 0x00, 0x00, 0xDC, 0xC6, 0xDC, 0xC8, /* 0x78-0x7B */ + 0xDC, 0xC9, 0xB6, 0xD1, 0x00, 0x00, 0xB6, 0xCF, /* 0x7C-0x7F */ + + 0xE1, 0x41, 0xE1, 0x42, 0xB9, 0xBB, 0xB9, 0xBA, /* 0x80-0x83 */ + 0xE3, 0x5A, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x40, /* 0x84-0x87 */ + 0xBC, 0x41, 0xBC, 0x42, 0xBC, 0x44, 0xE4, 0xF2, /* 0x88-0x8B */ + 0xE4, 0xF3, 0xBC, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBE, 0xAF, 0x00, 0x00, 0xBE, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xED, 0xF5, 0xC3, /* 0x94-0x97 */ + 0xF5, 0xC2, 0xF7, 0xD1, 0x00, 0x00, 0xA4, 0x4F, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x5C, /* 0x9C-0x9F */ + 0xA5, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x48, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x54, /* 0xA8-0xAB */ + 0xA7, 0x57, 0xCA, 0x6F, 0xCA, 0x70, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF1, /* 0xB8-0xBB */ + 0xCB, 0xD5, 0x00, 0x00, 0xA8, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCD, 0xF2, 0xAB, 0x6C, 0xCD, 0xF3, 0xAB, 0x6B, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x69, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xAB, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xC7, 0xD4, 0x6E, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xB0, 0xCA, 0xD4, 0x6D, 0xB1, 0xE5, /* 0xD4-0xD7 */ + 0xB0, 0xC9, 0xB0, 0xC8, 0x00, 0x00, 0xB3, 0xD4, /* 0xD8-0xDB */ + 0x00, 0x00, 0xB3, 0xD3, 0xB3, 0xD2, 0xB6, 0xD2, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xD5, 0xB6, 0xD6, /* 0xE0-0xE3 */ + 0xB6, 0xD4, 0x00, 0x00, 0xB6, 0xD3, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0xE1, 0x44, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0xEC-0xEF */ + 0xBC, 0x45, 0xE4, 0xF4, 0x00, 0x00, 0xBE, 0xB1, /* 0xF0-0xF3 */ + 
0xEC, 0xBF, 0xC0, 0x79, 0x00, 0x00, 0xF1, 0xEE, /* 0xF4-0xF7 */ + 0xC4, 0x55, 0x00, 0x00, 0xA4, 0x63, 0xA4, 0xC3, /* 0xF8-0xFB */ + 0xC9, 0x56, 0x00, 0x00, 0xA4, 0xC4, 0xA4, 0xC5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA5, 0x5D, 0xA5, 0x5E, 0x00, 0x00, /* 0x04-0x07 */ + 0xA6, 0x49, 0xCA, 0x71, 0xCB, 0xD6, 0xCB, 0xD7, /* 0x08-0x0B */ + 0x00, 0x00, 0xAB, 0x6D, 0xD0, 0xEE, 0xB0, 0xCC, /* 0x0C-0x0F */ + 0xB0, 0xCB, 0xD8, 0x63, 0xD8, 0x62, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA4, 0x50, 0xA4, 0xC6, 0xA5, 0x5F, /* 0x14-0x17 */ + 0x00, 0x00, 0xB0, 0xCD, 0xC9, 0x43, 0x00, 0x00, /* 0x18-0x1B */ + 0xC9, 0x6C, 0xA5, 0x60, 0x00, 0x00, 0xC9, 0xC2, /* 0x1C-0x1F */ + 0xA6, 0x4B, 0xA6, 0x4A, 0xC9, 0xC1, 0xA7, 0x58, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEA, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD4, 0x6F, 0x00, 0x00, 0xB6, 0xD7, /* 0x2C-0x2F */ + 0xE1, 0x45, 0xB9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFD, /* 0x34-0x37 */ + 0x00, 0x00, 0xA4, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xCB, 0xD8, 0xCD, 0xF4, 0xB0, 0xD0, 0xB0, 0xCE, /* 0x3C-0x3F */ + 0xB0, 0xCF, 0xA4, 0x51, 0x00, 0x00, 0xA4, 0x64, /* 0x40-0x43 */ + 0xA2, 0xCD, 0xA4, 0xCA, 0x00, 0x00, 0xA4, 0xC9, /* 0x44-0x47 */ + 0xA4, 0xC8, 0xA5, 0x63, 0xA5, 0x62, 0x00, 0x00, /* 0x48-0x4B */ + 0xC9, 0x6D, 0xC9, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA8, 0xF5, 0xA8, 0xF2, 0xA8, 0xF4, /* 0x50-0x53 */ + 0xA8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x6E, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD5, 0x00, 0x00, /* 0x58-0x5B */ + 0xA4, 0x52, 0x00, 0x00, 0xA4, 0xCB, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA5, 0x65, 0xA5, 0x64, 0x00, 0x00, 0xCA, 0x72, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xC9, 0x57, 0x00, 0x00, 0xA5, 0x67, 0xA5, 0x66, /* 0x6C-0x6F */ + 0xA6, 0x4C, 0xA6, 0x4D, 0xCA, 0x73, 0xA7, 0x59, /* 0x70-0x73 */ + 0x00, 0x00, 0xA7, 0x5A, 0x00, 0x00, 0xA8, 0xF7, /* 0x74-0x77 */ + 0xA8, 0xF8, 0xA8, 0xF9, 0x00, 0x00, 0xAB, 0x6F, /* 0x78-0x7B */ + 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEB, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xC9, 0x44, 0x00, 0x00, /* 0x80-0x83 */ + 0xA4, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xC4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0x74, 0xCA, 0x75, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD9, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0xDA, 0x00, 0x00, 0xCD, 0xF7, 0xCD, 0xF6, /* 0x94-0x97 */ + 0xCD, 0xF9, 0xCD, 0xF8, 0xAB, 0x70, 0x00, 0x00, /* 0x98-0x9B */ + 0xD4, 0x70, 0xAD, 0xED, 0xD0, 0xEF, 0xAD, 0xEC, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD8, 0x64, 0xB3, 0xD6, 0x00, 0x00, 0xD8, 0x65, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE1, 0x46, 0xB9, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF1, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xC9, 0x58, 0x00, 0x00, 0xA5, 0x68, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 
0xA4, 0x53, 0xA4, 0x65, 0xA4, 0xCE, 0xA4, 0xCD, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA4, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA8, 0xFB, 0x00, 0x00, 0xA8, 0xFA, 0xA8, 0xFC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x71, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE8, 0xFB, 0xC2, 0x4F, 0xA4, 0x66, /* 0xE0-0xE3 */ + 0xA5, 0x6A, 0xA5, 0x79, 0xA5, 0x74, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA5, 0x6F, 0xA5, 0x6E, 0xA5, 0x75, 0xA5, 0x73, /* 0xE8-0xEB */ + 0xA5, 0x6C, 0xA5, 0x7A, 0xA5, 0x6D, 0xA5, 0x69, /* 0xEC-0xEF */ + 0xA5, 0x78, 0xA5, 0x77, 0xA5, 0x76, 0xA5, 0x6B, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xA5, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA5, 0x71, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7B, /* 0xF8-0xFB */ + 0xA5, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0xA6, 0x53, 0x00, 0x00, 0xA6, 0x59, /* 0x00-0x03 */ + 0xA6, 0x55, 0x00, 0x00, 0xA6, 0x5B, 0xC9, 0xC5, /* 0x04-0x07 */ + 0xA6, 0x58, 0xA6, 0x4E, 0xA6, 0x51, 0xA6, 0x54, /* 0x08-0x0B */ + 0xA6, 0x50, 0xA6, 0x57, 0xA6, 0x5A, 0xA6, 0x4F, /* 0x0C-0x0F */ + 0xA6, 0x52, 0xA6, 0x56, 0xA6, 0x5C, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xCA, 0x7E, 0xCA, 0x7B, 0x00, 0x00, 0xA7, 0x67, /* 0x18-0x1B */ + 0xCA, 0x7C, 0xA7, 0x5B, 0xA7, 0x5D, 0xA7, 0x75, /* 0x1C-0x1F */ + 0xA7, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xCA, 0xA5, 0xCA, 0x7D, 0xA7, 0x5F, 0xA7, 0x61, /* 0x24-0x27 */ + 0xCA, 0xA4, 0xA7, 0x68, 0xCA, 0x78, 0xA7, 0x74, /* 0x28-0x2B */ + 0xA7, 0x76, 0xA7, 0x5C, 0xA7, 0x6D, 0x00, 0x00, /* 0x2C-0x2F */ + 0xCA, 0x76, 0xA7, 0x73, 0x00, 0x00, 0xA7, 0x64, /* 0x30-0x33 */ + 0x00, 0x00, 0xA7, 0x6E, 0xA7, 0x6F, 0xCA, 0x77, /* 0x34-0x37 */ + 0xA7, 0x6C, 0xA7, 0x6A, 0x00, 0x00, 0xA7, 0x6B, /* 0x38-0x3B */ + 0xA7, 0x71, 0xCA, 0xA1, 0xA7, 0x5E, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA7, 0x72, 0xCA, 0xA3, 0xA7, 0x66, 0xA7, 0x63, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0x7A, 0xA7, 0x62, 0xCA, 0xA6, /* 0x44-0x47 */ + 0xA7, 0x65, 0x00, 0x00, 0xA7, 0x69, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x60, 0xCA, 0xA2, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCA, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xCB, 0xEB, 0xCB, 0xEA, 0xA9, 0x4F, 0xCB, 0xED, /* 0x60-0x63 */ + 0xCB, 0xEF, 0xCB, 0xE4, 0xCB, 0xE7, 0xCB, 0xEE, /* 0x64-0x67 */ + 0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE1, /* 0x68-0x6B */ + 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE9, /* 0x6C-0x6F */ + 0xCE, 0x49, 0xA9, 0x4B, 0xCE, 0x4D, 0xA8, 0xFD, /* 0x70-0x73 */ + 0xCB, 0xE6, 0xA8, 0xFE, 0xA9, 0x4C, 0xA9, 0x45, /* 0x74-0x77 */ + 0xA9, 0x41, 0x00, 0x00, 0xCB, 0xE2, 0xA9, 0x44, /* 0x78-0x7B */ + 0xA9, 0x49, 0xA9, 0x52, 0xCB, 0xE3, 0xCB, 0xDC, /* 0x7C-0x7F */ + + 0xA9, 0x43, 0xCB, 0xDD, 0xCB, 0xDF, 0x00, 0x00, /* 0x80-0x83 */ + 0xA9, 0x46, 0x00, 0x00, 0xA9, 0x48, 0xCB, 0xDB, /* 0x84-0x87 */ + 0xCB, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, /* 0x88-0x8B */ + 0xA9, 0x4D, 0xCB, 0xE8, 0xA9, 0x53, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA9, 0x4A, 0xCB, 0xDE, 0xA9, 0x47, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA9, 0x42, 0xA9, 0x40, 0x00, 0x00, /* 0x94-0x97 */ + 0xCB, 0xEC, 0x00, 0x00, 0xA9, 0x4E, 0x00, 0x00, /* 0x98-0x9B */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0x48, 0xCD, 0xFB, 0xCE, 0x4B, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCD, 0xFD, 0xAB, 0x78, 0xAB, 0xA8, /* 0xA4-0xA7 */ + 0xAB, 0x74, 0xAB, 0xA7, 0xAB, 0x7D, 0xAB, 0xA4, /* 0xA8-0xAB */ + 0xAB, 0x72, 0xCD, 0xFC, 0xCE, 0x43, 0xAB, 0xA3, /* 0xAC-0xAF */ + 0xCE, 0x4F, 0xAB, 0xA5, 0x00, 0x00, 0xAB, 0x79, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x45, 0xCE, 0x42, /* 0xB4-0xB7 */ + 0xAB, 0x77, 0x00, 0x00, 0xCD, 0xFA, 0xAB, 0xA6, /* 0xB8-0xBB */ + 0xCE, 0x4A, 0xAB, 0x7C, 0xCE, 0x4C, 0xAB, 0xA9, /* 0xBC-0xBF */ + 0xAB, 0x73, 0xAB, 0x7E, 0xAB, 0x7B, 0xCE, 0x40, /* 0xC0-0xC3 */ + 0xAB, 0xA1, 0xCE, 0x46, 0xCE, 0x47, 0xAB, 0x7A, /* 0xC4-0xC7 */ + 0xAB, 0xA2, 0xAB, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x75, 0xCD, 0xFE, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x4E, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD1, 0x44, 0xAD, 0xFB, 0xD0, 0xF1, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD0, 0xF6, 0xAD, 0xF4, 0xAE, 0x40, 0xD0, 0xF4, /* 0xE4-0xE7 */ + 0xAD, 0xEF, 0xAD, 0xF9, 0xAD, 0xFE, 0xD0, 0xFB, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xFA, 0xAD, 0xFD, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD0, 0xFE, 0xAD, 0xF5, 0xD0, 0xF5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x42, /* 0xF4-0xF7 */ + 0xD1, 0x43, 0x00, 0x00, 0xAD, 0xF7, 0xD1, 0x41, /* 0xF8-0xFB */ + 0xAD, 0xF3, 0xAE, 0x43, 0x00, 0x00, 0xD0, 0xF8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0xAD, 0xF1, 0x00, 0x00, 0xD1, 0x46, /* 0x00-0x03 */ + 0xD0, 0xF9, 0xD0, 0xFD, 0xAD, 0xF6, 0xAE, 0x42, /* 0x04-0x07 */ + 0xD0, 0xFA, 0xAD, 0xFC, 0xD1, 0x40, 0xD1, 0x47, /* 0x08-0x0B */ + 0xD4, 0xA1, 0x00, 0x00, 0xD1, 0x45, 0xAE, 0x44, /* 0x0C-0x0F */ + 0xAD, 0xF0, 0xD0, 0xFC, 0xD0, 0xF3, 0x00, 0x00, /* 0x10-0x13 */ + 0xAD, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF2, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, 0xAE, 0x41, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0x77, 0x00, 0x00, /* 0x28-0x2B */ + 0xB0, 0xE4, 0xD4, 0xA7, 0xB0, 0xE2, 0xB0, 0xDF, /* 0x2C-0x2F */ + 0xD4, 0x7C, 0xB0, 0xDB, 0xD4, 0xA2, 0xB0, 0xE6, /* 0x30-0x33 */ + 0xD4, 0x76, 0xD4, 0x7B, 0xD4, 0x7A, 0xAD, 0xF2, /* 0x34-0x37 */ + 0xB0, 0xE1, 0xD4, 0xA5, 0x00, 0x00, 0xD4, 0xA8, /* 0x38-0x3B */ + 0xD4, 0x73, 0x00, 0x00, 0xB3, 0xE8, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD4, 0xA9, 0xB0, 0xE7, 0x00, 0x00, 0xB0, 0xD9, /* 0x40-0x43 */ + 0xB0, 0xD6, 0xD4, 0x7E, 0xB0, 0xD3, 0x00, 0x00, /* 0x44-0x47 */ + 0xD4, 0xA6, 0x00, 0x00, 0xB0, 0xDA, 0xD4, 0xAA, /* 0x48-0x4B */ + 0x00, 0x00, 0xD4, 0x74, 0xD4, 0xA4, 0xB0, 0xDD, /* 0x4C-0x4F */ + 0xD4, 0x75, 0xD4, 0x78, 0xD4, 0x7D, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xB0, 0xDE, 0xB0, 0xDC, 0xB0, 0xE8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xB0, 0xE3, 0x00, 0x00, 0xB0, 0xD7, 0xB1, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB0, 0xD8, 0xD4, 0x79, 0xB0, 0xE5, /* 0x60-0x63 */ + 0xB0, 0xE0, 0xD4, 0xA3, 0xB0, 0xD5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD4, 0x71, 0xD4, 0x72, 0xD8, 0x6A, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD7, /* 0x78-0x7B */ + 0xB3, 0xDA, 0xD8, 0x75, 0xB3, 0xEE, 0xD8, 0x78, /* 0x7C-0x7F */ + + 0xB3, 0xD8, 0xD8, 0x71, 0xB3, 0xDE, 0xB3, 0xE4, /* 0x80-0x83 */ + 0xB5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xE2, /* 0x84-0x87 */ + 0xD8, 0x6E, 0xB3, 0xEF, 0xB3, 0xDB, 0xB3, 0xE3, /* 0x88-0x8B */ + 0xD8, 0x76, 0xDC, 0xD7, 0xD8, 0x7B, 0xD8, 0x6F, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD8, 0x66, 0xD8, 0x73, 0xD8, 0x6D, /* 0x90-0x93 */ + 0xB3, 0xE1, 0xD8, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xB3, 0xDD, 0xB3, 0xF1, 0xB3, 0xEA, 0x00, 0x00, /* 0x98-0x9B */ + 0xB3, 0xDF, 0xB3, 0xDC, 0x00, 0x00, 0xB3, 0xE7, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD8, 0x7A, 0xD8, 0x6C, 0xD8, 0x72, /* 0xA0-0xA3 */ + 0xD8, 0x74, 0xD8, 0x68, 0xD8, 0x77, 0xB3, 0xD9, /* 0xA4-0xA7 */ + 0xD8, 0x67, 0x00, 0x00, 0xB3, 0xE0, 0xB3, 0xF0, /* 0xA8-0xAB */ + 0xB3, 0xEC, 0xD8, 0x69, 0xB3, 0xE6, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB3, 0xED, 0xB3, 0xE9, 0xB3, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xD8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xEB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0xBC-0xBF */ + 0xDC, 0xD1, 0x00, 0x00, 0xDC, 0xE0, 0xDC, 0xCA, /* 0xC0-0xC3 */ + 0xDC, 0xD3, 0xB6, 0xE5, 0xB6, 0xE6, 0xB6, 0xDE, /* 0xC4-0xC7 */ + 0xDC, 0xDC, 0xB6, 0xE8, 0xDC, 0xCF, 0xDC, 0xCE, /* 0xC8-0xCB */ + 0xDC, 0xCC, 0xDC, 0xDE, 0xB6, 0xDC, 0xDC, 0xD8, /* 0xCC-0xCF */ + 0xDC, 0xCD, 0xB6, 0xDF, 0xDC, 0xD6, 0xB6, 0xDA, /* 0xD0-0xD3 */ + 0xDC, 0xD2, 0xDC, 0xD9, 0xDC, 0xDB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDC, 0xDF, 0xB6, 0xE3, 0xDC, 0xCB, /* 0xD8-0xDB */ + 0xB6, 0xDD, 0xDC, 0xD0, 0x00, 0x00, 0xB6, 0xD8, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB6, 0xE4, 0xDC, 0xDA, 0xB6, 0xE0, /* 0xE0-0xE3 */ + 0xB6, 0xE1, 0xB6, 0xE7, 0xB6, 0xDB, 0xA2, 0x5F, /* 0xE4-0xE7 */ + 0xB6, 0xD9, 0xDC, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xCD, 0xB9, 0xC8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE1, 0x55, 0xE1, 0x51, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0x4B, 0xB9, 0xC2, 0xB9, 0xBE, 0xE1, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0xB9, 0xBF, 0xE1, 0x4E, 0xE1, 0x50, 0x00, 0x00, /* 0x00-0x03 */ + 0xE1, 0x53, 0x00, 0x00, 0xB9, 0xC4, 0x00, 0x00, /* 0x04-0x07 */ + 0xB9, 0xCB, 0xB9, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0x49, 0xB9, 0xC6, 0xB9, 0xC7, 0xE1, 0x4C, /* 0x0C-0x0F */ + 0xB9, 0xCC, 0x00, 0x00, 0xE1, 0x4A, 0xE1, 0x4F, /* 0x10-0x13 */ + 0xB9, 0xC3, 0xE1, 0x48, 0xB9, 0xC9, 0xB9, 0xC1, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB9, 0xC0, /* 0x18-0x1B */ + 0xE1, 0x4D, 0xE1, 0x52, 0x00, 0x00, 0xB9, 0xCA, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x47, /* 0x24-0x27 */ + 0x00, 0x00, 0xBC, 0x4D, 0xE5, 0x47, 0x00, 0x00, /* 0x28-0x2B */ + 0xE5, 0x44, 0x00, 0x00, 0xBC, 0x47, 0xBC, 0x53, /* 0x2C-0x2F */ + 0xBC, 0x54, 0x00, 0x00, 0xBC, 0x4A, 0xE5, 0x42, /* 0x30-0x33 */ + 0xBC, 0x4C, 0xE4, 0xF9, 0xBC, 0x52, 0x00, 0x00, /* 0x34-0x37 */ + 0xE5, 0x46, 0xBC, 0x49, 0xE5, 0x48, 0xBC, 0x48, /* 0x38-0x3B */ + 0x00, 0x00, 0xE5, 0x43, 0xE5, 0x45, 0xBC, 0x4B, /* 0x3C-0x3F */ + 0xE5, 0x41, 0xE4, 0xFA, 0xE4, 0xF7, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0xD8, 0x6B, 0xE4, 0xFD, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xF6, 0xE4, 0xFC, 0xE4, 0xFB, 0x00, 0x00, /* 0x48-0x4B */ + 0xE4, 0xF8, 0x00, 0x00, 0xBC, 0x4F, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x50, /* 0x54-0x57 */ + 0xE4, 0xFE, 0xBE, 0xB2, 0xE5, 0x40, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0xFD, 0x00, 0x00, 0xBE, 0xBE, 0xE9, 0x42, /* 0x60-0x63 */ + 0xBE, 0xB6, 0xBE, 0xBA, 0xE9, 0x41, 0x00, 0x00, /* 0x64-0x67 */ + 0xBE, 0xB9, 0xBE, 0xB5, 0xBE, 0xB8, 0xBE, 0xB3, /* 0x68-0x6B */ + 0xBE, 0xBD, 0xE9, 0x43, 0xE8, 0xFE, 0xBE, 0xBC, /* 0x6C-0x6F */ + 0xE8, 0xFC, 0xBE, 0xBB, 0xE9, 0x44, 0xE9, 0x40, /* 0x70-0x73 */ + 0xBC, 0x51, 0x00, 0x00, 0xBE, 0xBF, 0xE9, 0x46, /* 0x74-0x77 */ + 0xBE, 0xB7, 0xBE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC6, 0xEC, 0xC8, /* 0x7C-0x7F */ + + 0xC0, 0x7B, 0xEC, 0xC9, 0xEC, 0xC7, 0xEC, 0xC5, /* 0x80-0x83 */ + 0xEC, 0xC4, 0xC0, 0x7D, 0xEC, 0xC3, 0xC0, 0x7E, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xEC, 0xC1, 0xEC, 0xC2, 0xC0, 0x7A, 0xC0, 0xA1, /* 0x8C-0x8F */ + 0xC0, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC0, /* 0x90-0x93 */ + 0x00, 0x00, 0xC2, 0x50, 0x00, 0x00, 0xEF, 0xBC, /* 0x94-0x97 */ + 0xEF, 0xBA, 0xEF, 0xBF, 0xEF, 0xBD, 0x00, 0x00, /* 0x98-0x9B */ + 0xEF, 0xBB, 0xEF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xC3, 0x60, 0xF1, 0xF2, 0xF1, 0xF3, /* 0xA4-0xA7 */ + 0xC4, 0x56, 0x00, 0x00, 0xF1, 0xF4, 0xF1, 0xF0, /* 0xA8-0xAB */ + 0xF1, 0xF5, 0xF1, 0xF1, 0xC2, 0x51, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFE, 0xF4, 0x41, /* 0xB0-0xB3 */ + 0xC4, 0x59, 0xF4, 0x40, 0xC4, 0x58, 0xC4, 0x57, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xC4, 0x5A, 0xF5, 0xC5, 0xF5, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC4, 0xDA, 0xC4, 0xD9, 0xC4, 0xDB, 0xF5, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xD8, 0xF6, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xC5, 0x6D, 0xC5, 0x6F, 0xC5, 0x6E, 0xF6, 0xD9, /* 0xC8-0xCB */ + 0xC5, 0xC8, 0xF8, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC5, 0xF1, 0x00, 0x00, 0xF8, 0xA5, /* 0xD0-0xD3 */ + 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x49, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7D, 0xA5, 0x7C, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA6, 0x5F, 0xA6, 0x5E, 0xC9, 0xC7, /* 0xDC-0xDF */ + 0xA6, 0x5D, 0xC9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xA7, 0x79, 0xCA, 0xA9, 0x00, 0x00, 0xCA, 0xA8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x77, 0xA7, 0x7A, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA7, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCB, 0xF1, 0xA9, 0x54, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x00, 0x00, 0xD1, 0x48, 0xD1, 0x49, 0xAE, 0x45, /* 0x00-0x03 */ + 0xAE, 0x46, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAC, /* 0x04-0x07 */ + 0xB0, 0xE9, 0xB0, 0xEB, 0xD4, 0xAB, 0xB0, 0xEA, /* 0x08-0x0B */ + 0xD8, 0x7C, 0xB3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE9, 0xB6, 0xEA, /* 0x10-0x13 */ + 0xDC, 0xE1, 0x00, 0x00, 0xB9, 0xCF, 0x00, 0x00, /* 0x14-0x17 */ + 0xB9, 
0xCE, 0x00, 0x00, 0xE5, 0x49, 0xE9, 0x48, /* 0x18-0x1B */ + 0xE9, 0x47, 0x00, 0x00, 0xF9, 0x6B, 0xA4, 0x67, /* 0x1C-0x1F */ + 0xC9, 0x59, 0x00, 0x00, 0xC9, 0x6E, 0xC9, 0x6F, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xA6, 0x62, 0xA6, 0x66, 0xC9, 0xC9, 0x00, 0x00, /* 0x28-0x2B */ + 0xA6, 0x64, 0xA6, 0x63, 0xC9, 0xC8, 0xA6, 0x65, /* 0x2C-0x2F */ + 0xA6, 0x61, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x60, /* 0x30-0x33 */ + 0xC9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA7, 0x7D, 0xCA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0xAB, 0x00, 0x00, 0xA7, 0xA1, /* 0x44-0x47 */ + 0x00, 0x00, 0xCA, 0xAD, 0xA7, 0x7B, 0xCA, 0xAE, /* 0x48-0x4B */ + 0xCA, 0xAC, 0xA7, 0x7E, 0xA7, 0xA2, 0xA7, 0xA5, /* 0x4C-0x4F */ + 0xA7, 0xA4, 0xA7, 0x7C, 0xCA, 0xAF, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA9, 0x59, 0xCB, 0xFE, 0x00, 0x00, /* 0x60-0x63 */ + 0xA9, 0x5B, 0x00, 0x00, 0xA9, 0x5A, 0x00, 0x00, /* 0x64-0x67 */ + 0xCC, 0x40, 0xA9, 0x58, 0xA9, 0x57, 0xCB, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0xCB, 0xF4, 0x00, 0x00, 0xCB, 0xF2, /* 0x6C-0x6F */ + 0xCB, 0xF7, 0xCB, 0xF6, 0xCB, 0xF3, 0xCB, 0xFC, /* 0x70-0x73 */ + 0xCB, 0xFD, 0xCB, 0xFA, 0xCB, 0xF8, 0xA9, 0x56, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFB, /* 0x78-0x7B */ + 0xA9, 0x5C, 0xCC, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCB, 0xF9, 0x00, 0x00, 0xAB, 0xAB, 0xA9, 0x55, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAC, /* 0x88-0x8B */ + 0xCE, 0x54, 0x00, 0x00, 0x00, 0x00, 0xCE, 0x5A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB2, /* 0x90-0x93 */ + 0xCE, 0x58, 0xCE, 0x5E, 0x00, 0x00, 0xCE, 0x55, /* 0x94-0x97 */ + 0xCE, 0x59, 0xCE, 0x5B, 0xCE, 0x5D, 0xCE, 0x57, /* 0x98-0x9B */ + 0x00, 0x00, 0xCE, 0x56, 0xCE, 0x51, 0xCE, 0x52, /* 0x9C-0x9F */ + 0xAB, 0xAD, 0x00, 0x00, 0xAB, 0xAF, 0xAB, 0xAE, /* 0xA0-0xA3 */ + 0xCE, 0x53, 0xCE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xCE, 0x50, 0xD1, 0x53, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD1, 0x52, 0xD1, 0x57, 0xD1, 0x4E, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD1, 0x51, 0xD1, 0x50, 0x00, 0x00, 0xD1, 0x54, /* 0xBC-0xBF */ + 0x00, 0x00, 0xD1, 0x58, 0xAE, 0x47, 0xAE, 0x4A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0x4F, 0xD1, 0x55, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x49, /* 0xC8-0xCB */ + 0xD1, 0x4A, 0x00, 0x00, 0xAB, 0xB0, 0xD4, 0xBA, /* 0xCC-0xCF */ + 0xD1, 0x56, 0x00, 0x00, 0xD1, 0x4D, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xAE, 0x48, 0xD1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xEC, /* 0xDC-0xDF */ + 0xB0, 0xF0, 0xD4, 0xC1, 0xD4, 0xAF, 0xD4, 0xBD, /* 0xE0-0xE3 */ + 0xB0, 0xF1, 0xD4, 0xBF, 0x00, 0x00, 0xD4, 0xC5, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD4, 0xC0, 0xD4, 0xB4, 0xD4, 0xBC, 0x00, 0x00, /* 0xEC-0xEF 
*/ + 0xD4, 0xCA, 0xD4, 0xC8, 0xD4, 0xBE, 0xD4, 0xB9, /* 0xF0-0xF3 */ + 0xD4, 0xB2, 0xD8, 0xA6, 0xD4, 0xB0, 0xB0, 0xF5, /* 0xF4-0xF7 */ + 0xD4, 0xB7, 0xB0, 0xF6, 0xB0, 0xF2, 0xD4, 0xAD, /* 0xF8-0xFB */ + 0xD4, 0xC3, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xD4, 0xB3, 0xD4, 0xC6, 0xB0, 0xF3, 0x00, 0x00, /* 0x00-0x03 */ + 0xD4, 0xCC, 0xB0, 0xED, 0xB0, 0xEF, 0xD4, 0xBB, /* 0x04-0x07 */ + 0xD4, 0xB6, 0xAE, 0x4B, 0xB0, 0xEE, 0xD4, 0xB8, /* 0x08-0x0B */ + 0xD4, 0xC7, 0xD4, 0xCB, 0xD4, 0xC2, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD4, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD8, 0xA1, 0x00, 0x00, 0xD8, 0xAA, /* 0x18-0x1B */ + 0xD8, 0xA9, 0xB3, 0xFA, 0xD8, 0xA2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB3, 0xFB, 0xB3, 0xF9, 0x00, 0x00, 0xD8, 0xA4, /* 0x20-0x23 */ + 0xB3, 0xF6, 0xD8, 0xA8, 0x00, 0x00, 0xD8, 0xA3, /* 0x24-0x27 */ + 0xD8, 0xA5, 0xD8, 0x7D, 0xB3, 0xF4, 0x00, 0x00, /* 0x28-0x2B */ + 0xD8, 0xB2, 0xD8, 0xB1, 0xD8, 0xAE, 0xB3, 0xF3, /* 0x2C-0x2F */ + 0xB3, 0xF7, 0xB3, 0xF8, 0xD1, 0x4B, 0xD8, 0xAB, /* 0x30-0x33 */ + 0xB3, 0xF5, 0xB0, 0xF4, 0xD8, 0xAD, 0xD8, 0x7E, /* 0x34-0x37 */ + 0xD8, 0xB0, 0xD8, 0xAF, 0x00, 0x00, 0xD8, 0xB3, /* 0x38-0x3B */ + 0x00, 0x00, 0xDC, 0xEF, 0x00, 0x00, 0xD8, 0xAC, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD8, 0xA7, 0xDC, 0xE7, 0xB6, 0xF4, 0xB6, 0xF7, /* 0x48-0x4B */ + 0xB6, 0xF2, 0xDC, 0xE6, 0xDC, 0xEA, 0xDC, 0xE5, /* 0x4C-0x4F */ + 0x00, 0x00, 0xB6, 0xEC, 0xB6, 0xF6, 0xDC, 0xE2, /* 0x50-0x53 */ + 0xB6, 0xF0, 0xDC, 0xE9, 0x00, 0x00, 0xB6, 0xEE, /* 0x54-0x57 */ + 0xB6, 0xED, 0xDC, 0xEC, 0xB6, 0xEF, 0xDC, 0xEE, /* 0x58-0x5B */ + 0x00, 0x00, 0xDC, 0xEB, 0xB6, 0xEB, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF5, 0xDC, 0xF0, /* 0x60-0x63 */ + 0xDC, 0xE4, 0xDC, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF1, /* 0x68-0x6B */ + 0x00, 0x00, 0xB6, 0xF3, 0x00, 0x00, 0xDC, 0xE8, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE1, 0x5D, 0xB9, 0xD0, 0xE1, 0x63, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB9, 0xD5, 0xE1, 0x5F, 0xE1, 0x66, /* 0x78-0x7B */ + 0xE1, 0x57, 0xB9, 0xD7, 0xB9, 0xD1, 0xE1, 0x5C, /* 0x7C-0x7F */ + + 0xBC, 0x55, 0xE1, 0x5B, 0xE1, 0x64, 0xB9, 0xD2, /* 0x80-0x83 */ + 0x00, 0x00, 0xB9, 0xD6, 0xE1, 0x5A, 0xE1, 0x60, /* 0x84-0x87 */ + 0xE1, 0x65, 0xE1, 0x56, 0xB9, 0xD4, 0xE1, 0x5E, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x62, 0xE1, 0x68, /* 0x8C-0x8F */ + 0xE1, 0x58, 0xE1, 0x61, 0x00, 0x00, 0xB9, 0xD3, /* 0x90-0x93 */ + 0xE1, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE1, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xBC, 0x59, 0xE5, 0x4B, 0xBC, 0x57, 0xBC, 0x56, /* 0x9C-0x9F */ + 0xE5, 0x4D, 0xE5, 0x52, 0x00, 0x00, 0xE5, 0x4E, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE5, 0x51, 0xBC, 0x5C, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBE, 0xA5, 0xBC, 0x5B, 0x00, 0x00, 0xE5, 0x4A, /* 0xA8-0xAB */ + 0xE5, 0x50, 0x00, 0x00, 0xBC, 0x5A, 0xE5, 0x4F, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE5, 0x4C, 0x00, 0x00, 0xBC, 0x58, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4D, 0xF9, 0xD9, /* 0xB8-0xBB */ + 0xE9, 0x4F, 0xE9, 0x4A, 0xBE, 0xC1, 0xE9, 0x4C, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBE, 0xC0, 0xE9, 0x4E, 0x00, 0x00, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0xBE, 0xC3, 0xE9, 0x50, 0xBE, 0xC2, /* 0xC4-0xC7 */ + 0xE9, 0x49, 0xE9, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xA5, 0xEC, 0xCC, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC0, 0xA4, 0xEC, 0xCD, 0xC0, 0xA3, /* 0xD0-0xD3 */ + 0xEC, 0xCB, 0xC0, 0xA2, 0xEC, 0xCA, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC2, 0x53, 0xC2, 0x52, 0xF1, 0xF6, 0xF1, 0xF8, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF1, 0xF7, 0xC3, 0x61, 0xC3, 0x62, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0x63, 0xF4, 0x42, /* 0xE0-0xE3 */ + 0xC4, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD3, /* 0xE4-0xE7 */ + 0xF7, 0xD2, 0xC5, 0xF2, 0x00, 0x00, 0xA4, 0x68, /* 0xE8-0xEB */ + 0xA4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA7, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xCE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB3, 0xFC, 0xB3, 0xFD, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDC, 0xF2, 0xB9, 0xD8, 0xE1, 0x69, 0xE5, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x5A, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB0, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xCC, 0x42, 0xCE, 0x60, 0xD1, 0x59, 0xAE, 0x4C, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x10-0x13 */ + 0xC4, 0xDC, 0xA4, 0x69, 0xA5, 0x7E, 0xC9, 0x70, /* 0x14-0x17 */ + 0x00, 0x00, 0xA6, 0x67, 0xA6, 0x68, 0x00, 0x00, /* 0x18-0x1B */ + 0xA9, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB0, 0xF7, 0x00, 0x00, 0xB9, 0xDA, 0x00, 0x00, /* 0x20-0x23 */ + 0xB9, 0xDB, 0xB9, 0xD9, 0x00, 0x00, 0xA4, 0x6A, /* 0x24-0x27 */ + 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD3, 0xA4, 0xD2, /* 0x28-0x2B */ + 0xC9, 0x5B, 0xA4, 0xD4, 0xA5, 0xA1, 0xC9, 0x71, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA5, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x69, /* 0x34-0x37 */ + 0xA6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xC9, 0xCB, 0x00, 0x00, 0xA7, 0xA8, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xA9, 0x61, 0xCC, 0x43, 0x00, 0x00, 0xA9, 0x5F, /* 0x44-0x47 */ + 0xA9, 0x60, 0xA9, 0x5E, 0xD1, 0x5A, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB6, 0xAB, 0xB5, /* 0x4C-0x4F */ + 0xAB, 0xB7, 0xAB, 0xB4, 0x00, 0x00, 0xCE, 0x61, /* 0x50-0x53 */ + 0xA9, 0x62, 0xAB, 0xB3, 0x00, 0x00, 0xAE, 0x4D, /* 0x54-0x57 */ + 0xAE, 0x4E, 0x00, 0x00, 0xAE, 0x4F, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB3, 0xFE, 0xD8, 0xB4, 0xB0, 0xF8, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF8, /* 0x64-0x67 */ + 0x00, 0x00, 0xB9, 0xDD, 0xB9, 0xDC, 0xE1, 0x6A, /* 0x68-0x6B */ + 0x00, 0x00, 0xBC, 0x5D, 0xBE, 0xC4, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEF, 0xC0, 0xF6, 0xDA, 0xF7, 0xD4, 0xA4, 0x6B, /* 0x70-0x73 */ + 0xA5, 0xA3, 0x00, 0x00, 0xA5, 0xA4, 0xC9, 0xD1, /* 0x74-0x77 */ + 0xA6, 0x6C, 0xA6, 0x6F, 0x00, 0x00, 0xC9, 0xCF, /* 0x78-0x7B */ + 0xC9, 0xCD, 0xA6, 0x6E, 0xC9, 0xD0, 0xC9, 0xD2, /* 0x7C-0x7F */ + + 0xC9, 0xCC, 0xA6, 0x71, 0xA6, 0x70, 0xA6, 0x6D, /* 0x80-0x83 */ + 0xA6, 0x6B, 0xC9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB3, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xA7, 0xB0, 0xCA, 0xB6, 0xCA, 0xB9, /* 0x8C-0x8F */ + 0xCA, 0xB8, 0x00, 0x00, 0xA7, 0xAA, 0xA7, 0xB2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xAF, 0xCA, 0xB5, /* 0x94-0x97 */ + 
0xCA, 0xB3, 0xA7, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xA7, 0xA9, 0xA7, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCA, 0xB4, 0xCA, 0xBB, 0xCA, 0xB7, 0xA7, 0xAD, /* 0xA0-0xA3 */ + 0xA7, 0xB1, 0xA7, 0xB4, 0xCA, 0xB2, 0xCA, 0xBA, /* 0xA4-0xA7 */ + 0xA7, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x67, 0xA9, 0x6F, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCC, 0x4F, 0xCC, 0x48, 0xA9, 0x70, /* 0xB0-0xB3 */ + 0xCC, 0x53, 0xCC, 0x44, 0xCC, 0x4B, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xA9, 0x66, 0xCC, 0x45, 0xA9, 0x64, /* 0xB8-0xBB */ + 0xCC, 0x4C, 0xCC, 0x50, 0xA9, 0x63, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCC, 0x51, 0xCC, 0x4A, 0x00, 0x00, 0xCC, 0x4D, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xA9, 0x72, 0xA9, 0x69, 0xCC, 0x54, /* 0xC4-0xC7 */ + 0xCC, 0x52, 0x00, 0x00, 0xA9, 0x6E, 0xA9, 0x6C, /* 0xC8-0xCB */ + 0xCC, 0x49, 0xA9, 0x6B, 0xCC, 0x47, 0xCC, 0x46, /* 0xCC-0xCF */ + 0xA9, 0x6A, 0xA9, 0x68, 0xA9, 0x71, 0xA9, 0x6D, /* 0xD0-0xD3 */ + 0xA9, 0x65, 0x00, 0x00, 0xCC, 0x4E, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xAB, 0xB9, 0x00, 0x00, 0xAB, 0xC0, 0xCE, 0x6F, /* 0xD8-0xDB */ + 0xAB, 0xB8, 0xCE, 0x67, 0xCE, 0x63, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCE, 0x73, 0xCE, 0x62, 0x00, 0x00, 0xAB, 0xBB, /* 0xE0-0xE3 */ + 0xCE, 0x6C, 0xAB, 0xBE, 0xAB, 0xC1, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAB, 0xBC, 0xCE, 0x70, 0xAB, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0xAE, 0x56, 0xCE, 0x76, 0xCE, 0x64, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0x66, 0xCE, 0x6D, 0xCE, 0x71, /* 0xF0-0xF3 */ + 0xCE, 0x75, 0xCE, 0x72, 0xCE, 0x6B, 0xCE, 0x6E, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x68, 0xAB, 0xC3, /* 0xF8-0xFB */ + 0xCE, 0x6A, 0xCE, 0x69, 0xCE, 0x74, 0xAB, 0xBA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0xCE, 0x65, 0xAB, 0xC2, 0x00, 0x00, 0xAB, 0xBD, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xAE, 0x5C, 0xD1, 0x62, 0x00, 0x00, /* 0x08-0x0B */ + 0xAE, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x60, /* 0x0C-0x0F */ + 0x00, 0x00, 0xAE, 0x50, 0x00, 0x00, 0xAE, 0x55, /* 0x10-0x13 */ + 0x00, 0x00, 0xD1, 0x5F, 0xD1, 0x5C, 0xD1, 0x61, /* 0x14-0x17 */ + 0xAE, 0x51, 0xD1, 0x5B, 0x00, 0x00, 0xAE, 0x54, /* 0x18-0x1B */ + 0xAE, 0x52, 0x00, 0x00, 0xD1, 0x63, 0xAE, 0x53, /* 0x1C-0x1F */ + 0xAE, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x58, /* 0x20-0x23 */ + 0x00, 0x00, 0xAE, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xAE, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0x5D, 0xD1, 0x5E, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x64, /* 0x30-0x33 */ + 0x00, 0x00, 0xD4, 0xD4, 0xB0, 0xF9, 0xD8, 0xC2, /* 0x34-0x37 */ + 0xD4, 0xD3, 0xD4, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB1, 0x40, 0x00, 0x00, 0xD4, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0xB0, 0xFE, 0xB0, 0xFA, 0xD4, 0xED, 0xD4, 0xDD, /* 0x40-0x43 */ + 0xD4, 0xE0, 0x00, 0x00, 0xB1, 0x43, 0xD4, 0xEA, /* 0x44-0x47 */ + 0xD4, 0xE2, 0xB0, 0xFB, 0xB1, 0x44, 0x00, 0x00, /* 0x48-0x4B */ + 0xD4, 0xE7, 0xD4, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD4, 0xD6, 0xD4, 0xEB, 0xD4, 0xDF, 0xD4, 0xDA, /* 0x50-0x53 */ + 0x00, 0x00, 0xD4, 0xD0, 0xD4, 0xEC, 0xD4, 0xDC, /* 0x54-0x57 */ + 0xD4, 0xCF, 0x00, 0x00, 0xB1, 0x42, 0xD4, 0xE1, /* 0x58-0x5B */ + 0xD4, 0xEE, 0xD4, 0xDE, 0xD4, 0xD2, 0xD4, 0xD7, /* 0x5C-0x5F */ + 0xD4, 0xCE, 0x00, 0x00, 0xB1, 0x41, 0x00, 0x00, /* 0x60-0x63 */ + 0xD4, 0xDB, 0xD4, 0xD8, 0xB0, 0xFC, 0xD4, 0xD1, /* 0x64-0x67 */ + 0x00, 0x00, 0xD4, 0xE9, 0xB0, 0xFD, 0x00, 0x00, /* 0x68-0x6B */ + 0xD4, 
0xD9, 0xD4, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xD4, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x40, /* 0x74-0x77 */ + 0xD8, 0xBB, 0x00, 0x00, 0xD8, 0xB8, 0xD8, 0xC9, /* 0x78-0x7B */ + 0xD8, 0xBD, 0xD8, 0xCA, 0x00, 0x00, 0xB4, 0x42, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC6, /* 0x80-0x83 */ + 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, 0xD8, 0xC7, /* 0x88-0x8B */ + 0xD8, 0xCB, 0x00, 0x00, 0xD4, 0xE3, 0xD8, 0xCD, /* 0x8C-0x8F */ + 0xDD, 0x47, 0x00, 0x00, 0xB4, 0x43, 0xD8, 0xCE, /* 0x90-0x93 */ + 0xD8, 0xB6, 0xD8, 0xC0, 0x00, 0x00, 0xD8, 0xC5, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0x41, 0xB4, 0x44, /* 0x98-0x9B */ + 0xD8, 0xCC, 0xD8, 0xCF, 0xD8, 0xBA, 0xD8, 0xB7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB9, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xBE, 0xD8, 0xBC, 0xB4, 0x45, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD8, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD8, 0xBF, 0x00, 0x00, 0xD8, 0xC1, 0xD8, 0xB5, /* 0xAC-0xAF */ + 0xDC, 0xFA, 0xDC, 0xF8, 0xB7, 0x42, 0xB7, 0x40, /* 0xB0-0xB3 */ + 0xDD, 0x43, 0xDC, 0xF9, 0xDD, 0x44, 0xDD, 0x40, /* 0xB4-0xB7 */ + 0xDC, 0xF7, 0xDD, 0x46, 0xDC, 0xF6, 0xDC, 0xFD, /* 0xB8-0xBB */ + 0xB6, 0xFE, 0xB6, 0xFD, 0xB6, 0xFC, 0xDC, 0xFB, /* 0xBC-0xBF */ + 0xDD, 0x41, 0xB6, 0xF9, 0xB7, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xF4, 0x00, 0x00, 0xDC, 0xFE, 0xDC, 0xF3, /* 0xC4-0xC7 */ + 0xDC, 0xFC, 0xB6, 0xFA, 0xDD, 0x42, 0xDC, 0xF5, /* 0xC8-0xCB */ + 0xB6, 0xFB, 0xDD, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE1, 0x6E, 0xB9, 0xE2, 0xB9, 0xE1, /* 0xD4-0xD7 */ + 0xB9, 0xE3, 0xE1, 0x7A, 0xE1, 0x70, 0xE1, 0x76, /* 0xD8-0xDB */ + 0xE1, 0x6B, 0xE1, 0x79, 0xE1, 0x78, 0xE1, 0x7C, /* 0xDC-0xDF */ + 0xE1, 0x75, 0xB9, 0xDE, 0xE1, 0x74, 0xB9, 0xE4, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE1, 0x6D, 0xB9, 0xDF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE1, 0x7B, 0xB9, 0xE0, 0xE1, 0x6F, 0xE1, 0x72, /* 0xE8-0xEB */ + 0xE1, 0x77, 0xE1, 0x71, 0xE1, 0x6C, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x73, /* 0xF0-0xF3 */ + 0xE5, 0x55, 0xBC, 0x61, 0xE5, 0x58, 0xE5, 0x57, /* 0xF4-0xF7 */ + 0xE5, 0x5A, 0xE5, 0x5C, 0xF9, 0xDC, 0xBC, 0x5F, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE5, 0x56, 0x00, 0x00, 0xE5, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5B, 0xE5, 0x59, /* 0x00-0x03 */ + 0x00, 0x00, 0xE5, 0x5F, 0x00, 0x00, 0xE5, 0x5E, /* 0x04-0x07 */ + 0xBC, 0x63, 0xBC, 0x5E, 0x00, 0x00, 0xBC, 0x60, /* 0x08-0x0B */ + 0xBC, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, /* 0x0C-0x0F */ + 0xE9, 0x57, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x56, /* 0x10-0x13 */ + 0xE9, 0x55, 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x51, /* 0x14-0x17 */ + 0x00, 0x00, 0xE9, 0x52, 0xE9, 0x5A, 0xE9, 0x53, /* 0x18-0x1B */ + 0x00, 0x00, 0xBE, 0xC5, 0xE9, 0x5C, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0x5B, 0xE9, 0x54, 0x00, 0x00, 0xEC, 0xD1, /* 0x20-0x23 */ + 0xC0, 0xA8, 0xEC, 0xCF, 0xEC, 0xD4, 0xEC, 0xD3, /* 0x24-0x27 */ + 0xE9, 0x59, 0x00, 0x00, 0xC0, 0xA7, 0x00, 0x00, /* 0x28-0x2B */ + 0xEC, 0xD2, 0xEC, 0xCE, 0xEC, 0xD6, 0xEC, 0xD5, /* 0x2C-0x2F */ + 0xC0, 0xA6, 0x00, 0x00, 0xEC, 0xD0, 0x00, 0x00, /* 0x30-0x33 */ + 0xBE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC2, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEF, 0xC1, 0xF1, 0xFA, 0xF1, 0xFB, 0xF1, 0xFC, /* 0x3C-0x3F */ + 0xC4, 
0x5C, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x5D, /* 0x40-0x43 */ + 0x00, 0x00, 0xF4, 0x43, 0x00, 0x00, 0xF5, 0xC8, /* 0x44-0x47 */ + 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDB, /* 0x48-0x4B */ + 0xF6, 0xDC, 0xF7, 0xD5, 0xF8, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */ + 0xA4, 0x6C, 0xA4, 0x6D, 0x00, 0x00, 0xA4, 0x6E, /* 0x50-0x53 */ + 0xA4, 0xD5, 0xA5, 0xA5, 0xC9, 0xD3, 0xA6, 0x72, /* 0x54-0x57 */ + 0xA6, 0x73, 0x00, 0x00, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x58-0x5B */ + 0xA7, 0xB6, 0xA7, 0xB5, 0x00, 0x00, 0xA9, 0x73, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x55, 0xA9, 0x75, /* 0x60-0x63 */ + 0xA9, 0x74, 0xCC, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xAB, 0xC4, 0x00, 0x00, 0xAE, 0x5D, /* 0x68-0x6B */ + 0xD1, 0x65, 0x00, 0x00, 0xD4, 0xF0, 0x00, 0x00, /* 0x6C-0x6F */ + 0xB1, 0x45, 0xB4, 0x47, 0xD4, 0xEF, 0xB4, 0x46, /* 0x70-0x73 */ + 0x00, 0x00, 0xB9, 0xE5, 0x00, 0x00, 0xE1, 0x7D, /* 0x74-0x77 */ + 0xBE, 0xC7, 0x00, 0x00, 0xC0, 0xA9, 0xEC, 0xD7, /* 0x78-0x7B */ + 0x00, 0x00, 0xC4, 0x5E, 0x00, 0x00, 0xC5, 0x70, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xC9, 0x72, 0x00, 0x00, 0xA5, 0xA6, /* 0x80-0x83 */ + 0xC9, 0x73, 0xA6, 0x76, 0x00, 0x00, 0xA6, 0x74, /* 0x84-0x87 */ + 0xA6, 0x75, 0xA6, 0x77, 0x00, 0x00, 0xA7, 0xBA, /* 0x88-0x8B */ + 0xA7, 0xB9, 0x00, 0x00, 0xCA, 0xBC, 0xA7, 0xBB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBD, 0xCC, 0x57, /* 0x90-0x93 */ + 0x00, 0x00, 0xCC, 0x58, 0x00, 0x00, 0xA9, 0x76, /* 0x94-0x97 */ + 0xA9, 0x78, 0xA9, 0x7A, 0xA9, 0x77, 0xA9, 0x7B, /* 0x98-0x9B */ + 0xA9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xC8, 0xAB, 0xC5, /* 0xA0-0xA3 */ + 0xAB, 0xC7, 0xAB, 0xC9, 0xAB, 0xC6, 0xD1, 0x66, /* 0xA4-0xA7 */ + 0xCE, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD1, 0x68, 0xD1, 0x67, 0xAE, 0x63, 0x00, 0x00, /* 0xAC-0xAF */ + 0xAE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x60, /* 0xB0-0xB3 */ + 0xAE, 0x62, 0xAE, 0x64, 0xAE, 0x61, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAE, 0x66, 0xAE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4A, /* 0xBC-0xBF */ + 0xD4, 0xF2, 0xD4, 0xF1, 0xB1, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB1, 0x48, 0xB1, 0x47, 0xB1, 0x4B, 0xB1, 0x46, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD5, 0xD8, 0xD2, /* 0xC8-0xCB */ + 0xB4, 0x49, 0xD8, 0xD1, 0xD8, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */ + 0xB4, 0x4B, 0xD8, 0xD4, 0xB4, 0x48, 0xB4, 0x4A, /* 0xD0-0xD3 */ + 0xD8, 0xD3, 0x00, 0x00, 0xDD, 0x48, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDD, 0x49, 0xDD, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xE6, 0xB9, 0xEE, /* 0xDC-0xDF */ + 0xE1, 0x7E, 0xB9, 0xE8, 0xB9, 0xEC, 0xE1, 0xA1, /* 0xE0-0xE3 */ + 0xB9, 0xED, 0xB9, 0xE9, 0xB9, 0xEA, 0xB9, 0xE7, /* 0xE4-0xE7 */ + 0xB9, 0xEB, 0xBC, 0x66, 0xD8, 0xD0, 0xBC, 0x67, /* 0xE8-0xEB */ + 0xBC, 0x65, 0x00, 0x00, 0xBC, 0x64, 0xE9, 0x5D, /* 0xEC-0xEF */ + 0xBE, 0xC8, 0xEC, 0xD8, 0xEC, 0xD9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC3, 0x64, 0xC4, 0x5F, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA4, 0x6F, 0x00, 0x00, 0xA6, 0x78, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0xAB, 0xCA, 0x00, 0x00, 0xD1, 0x69, /* 0x00-0x03 */ + 0xAE, 0x67, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4E, /* 0x04-0x07 */ + 0xB1, 0x4D, 0xB1, 0x4C, 0xB4, 0x4C, 0xB4, 0x4D, /* 0x08-0x0B */ + 0xD8, 0xD7, 0xB9, 0xEF, 0xBE, 0xC9, 0xA4, 0x70, /* 0x0C-0x0F */ + 0xC9, 0x5C, 0xA4, 0xD6, 0xC9, 0x74, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 
0x00, 0xC9, 0xD4, 0xA6, 0x79, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x7C, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0x4B, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x71, 0x00, 0x00, /* 0x20-0x23 */ + 0xA4, 0xD7, 0xC9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCA, 0xBE, 0x00, 0x00, 0xCA, 0xBF, 0x00, 0x00, /* 0x28-0x2B */ + 0xA7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD8, 0xD8, 0xB4, 0x4E, 0x00, 0x00, 0xDD, 0x4C, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xAA, /* 0x34-0x37 */ + 0xA4, 0x72, 0xA4, 0xA8, 0xA4, 0xD8, 0xC9, 0x75, /* 0x38-0x3B */ + 0xA5, 0xA7, 0x00, 0x00, 0xA7, 0xC0, 0xA7, 0xBF, /* 0x3C-0x3F */ + 0xA7, 0xBD, 0xA7, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xCC, 0x59, 0xA9, 0x7E, 0xA9, 0xA1, 0xCC, 0x5A, /* 0x44-0x47 */ + 0xA9, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xCE, /* 0x48-0x4B */ + 0xCE, 0x78, 0xAB, 0xCD, 0xAB, 0xCB, 0xAB, 0xCC, /* 0x4C-0x4F */ + 0xAE, 0x6A, 0xAE, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD1, 0x6B, 0xAE, 0x69, 0xD1, 0x6A, 0x00, 0x00, /* 0x54-0x57 */ + 0xAE, 0x5E, 0xD4, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xB1, 0x50, 0xB1, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB1, 0x4F, 0x00, 0x00, 0xB9, 0xF0, 0xE1, 0xA2, /* 0x60-0x63 */ + 0xBC, 0x68, 0xBC, 0x69, 0x00, 0x00, 0xE5, 0x61, /* 0x64-0x67 */ + 0xC0, 0xAB, 0xEF, 0xC2, 0xEF, 0xC3, 0x00, 0x00, /* 0x68-0x6B */ + 0xC4, 0xDD, 0xF8, 0xA8, 0xC9, 0x4B, 0xA4, 0xD9, /* 0x6C-0x6F */ + 0x00, 0x00, 0xA4, 0x73, 0x00, 0x00, 0xC9, 0x77, /* 0x70-0x73 */ + 0xC9, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xA6, 0x7A, 0xC9, 0xD7, 0xC9, 0xD8, /* 0x78-0x7B */ + 0xC9, 0xD6, 0x00, 0x00, 0xC9, 0xD9, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0x84-0x87 */ + 0xCA, 0xC2, 0xCA, 0xC4, 0xCA, 0xC6, 0xCA, 0xC3, /* 0x88-0x8B */ + 0xA7, 0xC4, 0xCA, 0xC0, 0x00, 0x00, 0xCA, 0xC1, /* 0x8C-0x8F */ + 0xA7, 0xC1, 0xA7, 0xC2, 0xCA, 0xC5, 0xCA, 0xC8, /* 0x90-0x93 */ + 0xA7, 0xC3, 0xCA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCC, 0x68, 0x00, 0x00, 0xCC, 0x62, /* 0x9C-0x9F */ + 0xCC, 0x5D, 0xA9, 0xA3, 0xCC, 0x65, 0xCC, 0x63, /* 0xA0-0xA3 */ + 0xCC, 0x5C, 0xCC, 0x69, 0xCC, 0x6C, 0xCC, 0x67, /* 0xA4-0xA7 */ + 0xCC, 0x60, 0xA9, 0xA5, 0xCC, 0x66, 0xA9, 0xA6, /* 0xA8-0xAB */ + 0xCC, 0x61, 0xCC, 0x64, 0xCC, 0x5B, 0xCC, 0x5F, /* 0xAC-0xAF */ + 0xCC, 0x6B, 0xA9, 0xA7, 0x00, 0x00, 0xA9, 0xA8, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xCC, 0x5E, 0xCC, 0x6A, 0xA9, 0xA2, /* 0xB4-0xB7 */ + 0xA9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, 0xCE, 0xA4, /* 0xC4-0xC7 */ + 0xCE, 0xAA, 0xCE, 0xA3, 0xCE, 0xA5, 0xCE, 0x7D, /* 0xC8-0xCB */ + 0xCE, 0x7B, 0x00, 0x00, 0xCE, 0xAC, 0xCE, 0xA9, /* 0xCC-0xCF */ + 0xCE, 0x79, 0x00, 0x00, 0xAB, 0xD0, 0xCE, 0xA7, /* 0xD0-0xD3 */ + 0xCE, 0xA8, 0x00, 0x00, 0xCE, 0xA6, 0xCE, 0x7C, /* 0xD4-0xD7 */ + 0xCE, 0x7A, 0xAB, 0xCF, 0xCE, 0xA2, 0xCE, 0x7E, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA1, 0xCE, 0xAD, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAE, 0x6F, 0x00, 0x00, 0xAE, 0x6E, 0x00, 0x00, /* 0xE8-0xEB 
*/ + 0xD1, 0x6C, 0xAE, 0x6B, 0xD1, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */ + 0xAE, 0x70, 0xD1, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xAE, 0x73, 0x00, 0x00, 0xAE, 0x71, 0xD1, 0x70, /* 0xF4-0xF7 */ + 0xCE, 0xAE, 0xD1, 0x72, 0x00, 0x00, 0xAE, 0x6D, /* 0xF8-0xFB */ + 0x00, 0x00, 0xAE, 0x6C, 0x00, 0x00, 0xD1, 0x6D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0xD1, 0x71, 0xAE, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0x53, 0xB1, 0x52, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF5, /* 0x08-0x0B */ + 0xD4, 0xF9, 0xD4, 0xFB, 0xB1, 0x54, 0xD4, 0xFE, /* 0x0C-0x0F */ + 0x00, 0x00, 0xB1, 0x58, 0xD5, 0x41, 0x00, 0x00, /* 0x10-0x13 */ + 0xB1, 0x5A, 0x00, 0x00, 0xB1, 0x56, 0xB1, 0x5E, /* 0x14-0x17 */ + 0x00, 0x00, 0xB1, 0x5B, 0xD4, 0xF7, 0xB1, 0x55, /* 0x18-0x1B */ + 0x00, 0x00, 0xD4, 0xF6, 0xD4, 0xF4, 0xD5, 0x43, /* 0x1C-0x1F */ + 0xD4, 0xF8, 0x00, 0x00, 0xB1, 0x57, 0xD5, 0x42, /* 0x20-0x23 */ + 0xB1, 0x5C, 0xD4, 0xFD, 0xD4, 0xFC, 0xB1, 0x5D, /* 0x24-0x27 */ + 0xD4, 0xFA, 0xB1, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x44, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD5, 0x40, 0xD8, 0xE7, 0xD8, 0xEE, 0xD8, 0xE3, /* 0x30-0x33 */ + 0xB4, 0x51, 0xD8, 0xDF, 0xD8, 0xEF, 0xD8, 0xD9, /* 0x34-0x37 */ + 0xD8, 0xEC, 0xD8, 0xEA, 0xD8, 0xE4, 0x00, 0x00, /* 0x38-0x3B */ + 0xD8, 0xED, 0xD8, 0xE6, 0x00, 0x00, 0xD8, 0xDE, /* 0x3C-0x3F */ + 0xD8, 0xF0, 0xD8, 0xDC, 0xD8, 0xE9, 0xD8, 0xDA, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, 0xB4, 0x52, /* 0x44-0x47 */ + 0x00, 0x00, 0xD8, 0xEB, 0xDD, 0x4F, 0xD8, 0xDD, /* 0x48-0x4B */ + 0xB4, 0x4F, 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xB4, 0x50, 0xD8, 0xE0, 0xD8, 0xE5, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xD8, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD8, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x53, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x56, 0xDD, 0x4E, /* 0x60-0x63 */ + 0x00, 0x00, 0xDD, 0x50, 0x00, 0x00, 0xDD, 0x55, /* 0x64-0x67 */ + 0xDD, 0x54, 0xB7, 0x43, 0x00, 0x00, 0xD8, 0xDB, /* 0x68-0x6B */ + 0xDD, 0x52, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDD, 0x4D, 0xDD, 0x51, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x74-0x77 */ + 0x00, 0x00, 0xE1, 0xB0, 0xE1, 0xA7, 0x00, 0x00, /* 0x78-0x7B */ + 0xE1, 0xAE, 0xE1, 0xA5, 0xE1, 0xAD, 0xE1, 0xB1, /* 0x7C-0x7F */ + + 0xE1, 0xA4, 0xE1, 0xA8, 0xE1, 0xA3, 0x00, 0x00, /* 0x80-0x83 */ + 0xB9, 0xF1, 0x00, 0x00, 0xE1, 0xA6, 0xB9, 0xF2, /* 0x84-0x87 */ + 0xE1, 0xAC, 0xE1, 0xAB, 0xE1, 0xAA, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x65, 0xE5, 0x67, /* 0x90-0x93 */ + 0xBC, 0x6B, 0xE5, 0x68, 0x00, 0x00, 0xE5, 0x63, /* 0x94-0x97 */ + 0x00, 0x00, 0xE5, 0x62, 0xE5, 0x6C, 0x00, 0x00, /* 0x98-0x9B */ + 0xE5, 0x6A, 0xBC, 0x6A, 0xE5, 0x6D, 0xE5, 0x64, /* 0x9C-0x9F */ + 0xE5, 0x69, 0xE5, 0x6B, 0xE5, 0x66, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x61, /* 0xA4-0xA7 */ + 0xE9, 0x66, 0xE9, 0x60, 0xE9, 0x65, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE9, 0x5E, 0xE9, 0x68, 0xE9, 0x64, 0xE9, 0x69, /* 0xAC-0xAF */ + 0xE9, 0x63, 0xE9, 0x5F, 0xE9, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE9, 0x6A, 0xE9, 0x62, 0x00, 0x00, 0xEC, 0xDA, /* 0xB4-0xB7 */ + 0xC0, 0xAF, 0x00, 0x00, 0xC0, 0xAD, 0x00, 0x00, /* 0xB8-0xBB */ + 0xC0, 0xAC, 0xC0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 
0xEF, 0xC4, 0x00, 0x00, 0xF1, 0x72, 0xF1, 0xFD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0x44, 0xF4, 0x45, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC4, 0x60, 0x00, 0x00, 0xF5, 0xC9, /* 0xC8-0xCB */ + 0x00, 0x00, 0xC4, 0xDE, 0x00, 0x00, 0xF5, 0xCA, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF6, 0xDE, 0xC5, 0x72, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC5, 0x71, 0xF6, 0xDD, 0xC5, 0xC9, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xF7, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA4, 0x74, 0xA6, 0x7B, 0xC9, 0xDA, /* 0xDC-0xDF */ + 0xCA, 0xCA, 0xA8, 0xB5, 0xB1, 0x5F, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA4, 0x75, 0xA5, 0xAA, 0xA5, 0xA9, /* 0xE4-0xE7 */ + 0xA5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xAE, 0x74, 0x00, 0x00, /* 0xEC-0xEF */ + 0xDD, 0x57, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0xF0-0xF3 */ + 0xA4, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCE, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xB4, 0x53, 0xA4, 0x79, 0xC9, 0x5D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xAB, 0xA5, 0xAC, /* 0x00-0x03 */ + 0xC9, 0x78, 0x00, 0x00, 0xA6, 0x7C, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0x08-0x0B */ + 0xA7, 0xC6, 0x00, 0x00, 0xCA, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA9, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xCC, 0x6E, 0xA9, 0xAC, 0xA9, 0xAB, 0xCC, 0x6D, /* 0x14-0x17 */ + 0xA9, 0xA9, 0xCC, 0x6F, 0xA9, 0xAA, 0xA9, 0xAD, /* 0x18-0x1B */ + 0x00, 0x00, 0xAB, 0xD2, 0x00, 0x00, 0xAB, 0xD4, /* 0x1C-0x1F */ + 0xCE, 0xB3, 0xCE, 0xB0, 0xCE, 0xB1, 0xCE, 0xB2, /* 0x20-0x23 */ + 0xCE, 0xB4, 0xAB, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD1, 0x74, 0xD1, 0x73, 0x00, 0x00, 0xAE, 0x76, /* 0x28-0x2B */ + 0x00, 0x00, 0xAE, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x62, /* 0x30-0x33 */ + 0xD5, 0x46, 0x00, 0x00, 0xB1, 0x61, 0xB1, 0x63, /* 0x34-0x37 */ + 0xB1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB4, 0x55, 0xD5, 0x45, 0x00, 0x00, /* 0x3C-0x3F */ + 0xB4, 0x56, 0xD8, 0xF3, 0x00, 0x00, 0xB4, 0x57, /* 0x40-0x43 */ + 0xD8, 0xF2, 0xB4, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x5A, 0xDD, 0x5C, /* 0x48-0x4B */ + 0xB7, 0x45, 0xDD, 0x5B, 0xDD, 0x59, 0xDD, 0x58, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, /* 0x50-0x53 */ + 0xB9, 0xF7, 0xB9, 0xF5, 0x00, 0x00, 0xB9, 0xF6, /* 0x54-0x57 */ + 0xE1, 0xB2, 0xE1, 0xB3, 0x00, 0x00, 0xB9, 0xF3, /* 0x58-0x5B */ + 0xE5, 0x71, 0xE5, 0x6F, 0x00, 0x00, 0xBC, 0x6D, /* 0x5C-0x5F */ + 0xE5, 0x70, 0xBC, 0x6E, 0xBC, 0x6C, 0xB9, 0xF4, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6D, 0xE9, 0x6B, /* 0x64-0x67 */ + 0xE9, 0x6C, 0xE5, 0x6E, 0xEC, 0xDC, 0xC0, 0xB0, /* 0x68-0x6B */ + 0xEC, 0xDB, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x6E, /* 0x6C-0x6F */ + 0xF1, 0xFE, 0x00, 0x00, 0xA4, 0x7A, 0xA5, 0xAD, /* 0x70-0x73 */ + 0xA6, 0x7E, 0xC9, 0xDB, 0xA6, 0x7D, 0x00, 0x00, /* 0x74-0x77 */ + 0xA9, 0xAF, 0xB7, 0x46, 0x00, 0x00, 0xA4, 0xDB, /* 0x78-0x7B */ + 0xA5, 0xAE, 0xAB, 0xD5, 0xB4, 0x58, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xC9, 0x79, 0x00, 0x00, 0xC9, 0x7A, 0x00, 0x00, /* 0x80-0x83 */ + 0xC9, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC8, /* 0x84-0x87 */ + 0xCA, 0xD0, 0xCA, 0xCE, 0xA7, 0xC9, 0xCA, 0xCD, /* 0x88-0x8B */ + 0xCA, 0xCF, 0xCA, 0xD1, 0x00, 0x00, 0xA7, 0xC7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 
0x00, 0x00, 0xA9, 0xB3, 0xA9, 0xB4, 0xA9, 0xB1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xB0, 0xCE, 0xB8, /* 0x98-0x9B */ + 0xA9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAB, 0xD6, 0x00, 0x00, 0xCE, 0xB7, 0xCE, 0xB9, /* 0xA0-0xA3 */ + 0xCE, 0xB6, 0xCE, 0xBA, 0xAB, 0xD7, 0xAE, 0x79, /* 0xA4-0xA7 */ + 0xD1, 0x75, 0x00, 0x00, 0xD1, 0x77, 0xAE, 0x77, /* 0xA8-0xAB */ + 0xD1, 0x78, 0xAE, 0x78, 0xD1, 0x76, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCE, 0xB5, 0xD5, 0x47, 0xD5, 0x4A, 0xD5, 0x4B, /* 0xB0-0xB3 */ + 0xD5, 0x48, 0xB1, 0x67, 0xB1, 0x66, 0xB1, 0x64, /* 0xB4-0xB7 */ + 0xB1, 0x65, 0xD5, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0x68, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xB4, 0x5A, 0xB4, 0x5B, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB4, 0x5C, 0xDD, 0x5D, 0xDD, 0x5F, 0xDD, 0x61, /* 0xC4-0xC7 */ + 0xB7, 0x48, 0xB7, 0x47, 0xB4, 0x59, 0xDD, 0x60, /* 0xC8-0xCB */ + 0xDD, 0x5E, 0x00, 0x00, 0xE1, 0xB8, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE1, 0xB6, 0xE1, 0xBC, 0xB9, 0xF8, /* 0xD0-0xD3 */ + 0xE1, 0xBD, 0xE1, 0xBA, 0xB9, 0xF9, 0xE1, 0xB7, /* 0xD4-0xD7 */ + 0xE1, 0xB5, 0xE1, 0xBB, 0xBC, 0x70, 0xE5, 0x73, /* 0xD8-0xDB */ + 0xE1, 0xB9, 0xBC, 0x72, 0xE5, 0x74, 0xBC, 0x71, /* 0xDC-0xDF */ + 0xBC, 0x74, 0xE5, 0x75, 0xBC, 0x6F, 0xBC, 0x73, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE9, 0x73, 0xE9, 0x71, 0xE9, 0x70, /* 0xE4-0xE7 */ + 0xE9, 0x72, 0xE9, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xC3, 0x66, 0x00, 0x00, 0xF4, 0x46, 0xF4, 0x47, /* 0xEC-0xEF */ + 0x00, 0x00, 0xF5, 0xCB, 0xF6, 0xDF, 0xC6, 0x55, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xB5, 0xA7, 0xCA, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD8, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x7B, 0xA4, 0xDC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0xA5, 0xAF, 0xC9, 0xDD, 0x00, 0x00, /* 0x00-0x03 */ + 0xA7, 0xCB, 0xCA, 0xD2, 0x00, 0x00, 0xCE, 0xBB, /* 0x04-0x07 */ + 0xAB, 0xD9, 0x00, 0x00, 0xB9, 0xFA, 0xA4, 0x7C, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xA1, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x49, 0xA4, 0x7D, /* 0x10-0x13 */ + 0xA4, 0xDD, 0xA4, 0xDE, 0x00, 0x00, 0xA5, 0xB1, /* 0x14-0x17 */ + 0xA5, 0xB0, 0x00, 0x00, 0xC9, 0xDE, 0xA6, 0xA2, /* 0x18-0x1B */ + 0x00, 0x00, 0xCA, 0xD3, 0x00, 0x00, 0xA7, 0xCC, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x71, 0xCC, 0x72, /* 0x20-0x23 */ + 0xCC, 0x73, 0x00, 0x00, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x24-0x27 */ + 0xCC, 0x70, 0xA9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xAB, 0xDA, 0xCE, 0xBC, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD1, 0x7A, 0xAE, 0x7A, 0x00, 0x00, 0xD1, 0x79, /* 0x30-0x33 */ + 0x00, 0x00, 0xB1, 0x69, 0xD5, 0x4C, 0xB1, 0x6A, /* 0x34-0x37 */ + 0xD5, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB4, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDD, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBF, /* 0x40-0x43 */ + 0xE1, 0xBE, 0x00, 0x00, 0xB9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xBC, 0x75, 0xE5, 0x76, 0xBE, 0xCA, 0xE9, 0x74, /* 0x48-0x4B */ + 0xC0, 0xB1, 0x00, 0x00, 0xC5, 0x73, 0xF7, 0xD8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCC, 0x74, 0x00, 0x00, 0xCE, 0xBD, 0xB1, 0x6B, /* 0x54-0x57 */ + 0xD8, 0xF4, 0xB7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xC2, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xCE, 0x00, 0x00, /* 0x60-0x63 */ + 0xA7, 0xCD, 0xAB, 0xDB, 0x00, 0x00, 0xD1, 0x7B, /* 0x64-0x67 */ + 0x00, 
0x00, 0xB1, 0x6D, 0xB3, 0x43, 0xB1, 0x6E, /* 0x68-0x6B */ + 0xB1, 0x6C, 0xB4, 0x5E, 0x00, 0x00, 0xE1, 0xC0, /* 0x6C-0x6F */ + 0xB9, 0xFC, 0xBC, 0x76, 0x00, 0x00, 0xC9, 0x4C, /* 0x70-0x73 */ + 0xC9, 0xDF, 0x00, 0x00, 0xCA, 0xD5, 0xA7, 0xCF, /* 0x74-0x77 */ + 0xCA, 0xD4, 0xA7, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xA9, 0xBC, 0xCC, 0x77, 0xCC, 0x76, 0xA9, 0xBB, /* 0x7C-0x7F */ + + 0xA9, 0xB9, 0xA9, 0xBA, 0xCC, 0x75, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xAB, 0xDD, 0xCE, 0xBE, 0xAB, 0xE0, /* 0x84-0x87 */ + 0xAB, 0xDC, 0xAB, 0xE2, 0xAB, 0xDE, 0xAB, 0xDF, /* 0x88-0x8B */ + 0xAB, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xAE, 0x7D, 0xAE, 0x7C, 0xAE, 0x7B, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x4F, 0xB1, 0x6F, /* 0x94-0x97 */ + 0xB1, 0x72, 0xB1, 0x70, 0x00, 0x00, 0xD5, 0x4E, /* 0x98-0x9B */ + 0xB1, 0x75, 0x00, 0x00, 0xB1, 0x71, 0xD5, 0x50, /* 0x9C-0x9F */ + 0xB1, 0x74, 0xB1, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xF6, 0xD8, 0xF5, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB4, 0x61, 0xB4, 0x5F, 0xB4, 0x60, 0xD8, 0xF7, /* 0xA8-0xAB */ + 0xB7, 0x4B, 0xDD, 0x64, 0xB7, 0x4C, 0xDD, 0x63, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x77, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xBC, 0x78, 0xE1, 0xC1, 0xBC, 0x77, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xB9, 0xFD, 0x00, 0x00, 0xEC, 0xDE, /* 0xB8-0xBB */ + 0xE9, 0x75, 0xC0, 0xB2, 0xEC, 0xDD, 0xF2, 0x40, /* 0xBC-0xBF */ + 0xF4, 0x48, 0xF4, 0x49, 0x00, 0x00, 0xA4, 0xDF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xA5, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xA7, 0xD2, 0xA7, 0xD4, 0x00, 0x00, 0xC9, 0xE2, /* 0xCC-0xCF */ + 0xCA, 0xD8, 0xCA, 0xD7, 0xCA, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC9, 0xE1, 0xC9, 0xE0, 0xA6, 0xA4, 0xA7, 0xD3, /* 0xD4-0xD7 */ + 0xA7, 0xD1, 0xA6, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA9, 0xBD, 0xCC, 0x78, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA9, 0xBE, 0xCA, 0xDD, 0x00, 0x00, 0xCA, 0xDF, /* 0xE0-0xE3 */ + 0xCA, 0xDE, 0xCC, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCA, 0xDA, 0x00, 0x00, 0xA7, 0xD8, 0xA7, 0xD6, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCA, 0xD9, 0xCA, 0xDB, 0xCA, 0xE1, /* 0xEC-0xEF */ + 0x00, 0x00, 0xA7, 0xD5, 0x00, 0x00, 0xCA, 0xDC, /* 0xF0-0xF3 */ + 0xCA, 0xE5, 0xA9, 0xC0, 0x00, 0x00, 0xCA, 0xE2, /* 0xF4-0xF7 */ + 0xA7, 0xD7, 0x00, 0x00, 0xCA, 0xE0, 0xCA, 0xE3, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA9, 0xBF, 0x00, 0x00, 0xA9, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0xCA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCC, 0xAF, 0xCC, 0xA2, 0xCC, 0x7E, /* 0x08-0x0B */ + 0xCC, 0xAE, 0xCC, 0xA9, 0xAB, 0xE7, 0xA9, 0xC2, /* 0x0C-0x0F */ + 0xCC, 0xAA, 0xCC, 0xAD, 0xAB, 0xE3, 0xCC, 0xAC, /* 0x10-0x13 */ + 0xA9, 0xC3, 0xA9, 0xC8, 0xA9, 0xC6, 0xCC, 0xA3, /* 0x14-0x17 */ + 0x00, 0x00, 0xCC, 0x7C, 0xCC, 0xA5, 0xA9, 0xCD, /* 0x18-0x1B */ + 0xCC, 0xB0, 0xAB, 0xE4, 0xCC, 0xA6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xAB, 0xE5, 0xA9, 0xC9, 0xCC, 0xA8, 0x00, 0x00, /* 0x20-0x23 */ + 0xCE, 0xCD, 0xAB, 0xE6, 0xCC, 0x7B, 0xA9, 0xCA, /* 0x24-0x27 */ + 0xAB, 0xE8, 0xA9, 0xCB, 0xA9, 0xC7, 0xA9, 0xCC, /* 0x28-0x2B */ + 0xCC, 0xA7, 0xCC, 0x7A, 0xCC, 0xAB, 0xA9, 0xC4, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x7D, 0xCC, 0xA4, /* 0x30-0x33 */ + 0xCC, 0xA1, 0xA9, 0xC5, 0x00, 0x00, 0xCE, 0xBF, /* 0x34-0x37 */ + 0x00, 0x00, 0xCE, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCE, 0xCA, 0xD1, 0xA1, 0xCE, 0xCB, 0xAB, 0xEE, /* 0x40-0x43 */ + 0xCE, 0xCE, 0xCE, 0xC4, 0xAB, 0xED, 0xCE, 0xC6, /* 0x44-0x47 */ + 0x00, 0x00, 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCE, 0xC9, 0xAB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xAE, 0xA3, 0x00, 0x00, 0xF9, 0xDA, 0xCE, 0xC5, /* 0x50-0x53 */ + 0xCE, 0xC1, 0xAE, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xCE, 0xCF, 0xAE, 0x7E, 0xD1, 0x7D, 0xCE, 0xC8, /* 0x58-0x5B */ + 0x00, 0x00, 0xD1, 0x7C, 0xCE, 0xC3, 0xCE, 0xCC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xEC, 0xAE, 0xA1, /* 0x60-0x63 */ + 0xAB, 0xF2, 0xAE, 0xA2, 0xCE, 0xD0, 0xD1, 0x7E, /* 0x64-0x67 */ + 0xAB, 0xEB, 0xAE, 0xA6, 0xAB, 0xF1, 0xAB, 0xF0, /* 0x68-0x6B */ + 0xAB, 0xEF, 0xAE, 0xA5, 0xCE, 0xD1, 0xAE, 0xA7, /* 0x6C-0x6F */ + 0xAB, 0xEA, 0x00, 0x00, 0xCE, 0xC2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x76, /* 0x7C-0x7F */ + + 0xD1, 0xA4, 0xD1, 0xA6, 0x00, 0x00, 0xD1, 0xA8, /* 0x80-0x83 */ + 0xAE, 0xA8, 0xAE, 0xAE, 0xD5, 0x53, 0xD1, 0xAC, /* 0x84-0x87 */ + 0xD1, 0xA3, 0xB1, 0x78, 0xD5, 0x51, 0x00, 0x00, /* 0x88-0x8B */ + 0xAE, 0xAD, 0xAE, 0xAB, 0xD1, 0xAE, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD5, 0x52, 0x00, 0x00, 0xD1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */ + 0xAE, 0xAC, 0xD1, 0xA9, 0xAE, 0xAF, 0xD1, 0xAB, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xAE, 0xAA, 0xD1, 0xAA, /* 0x98-0x9B */ + 0xD1, 0xAD, 0xD1, 0xA7, 0x00, 0x00, 0xAE, 0xA9, /* 0x9C-0x9F */ + 0xB1, 0x79, 0x00, 0x00, 0xD1, 0xA2, 0xB1, 0x77, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB1, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xD5, 0x55, 0xD5, 0x5E, 0xB4, 0x64, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xB1, 0x7C, 0xB1, 0xA3, 0xB4, 0x65, 0xD5, 0x60, /* 0xB4-0xB7 */ + 0xB1, 0xAA, 0xD8, 0xF9, 0xD5, 0x56, 0xB1, 0xA2, /* 0xB8-0xBB */ + 0xB1, 0xA5, 0xB1, 0x7E, 0xD5, 0x54, 0xD5, 0x62, /* 0xBC-0xBF */ + 0xD5, 0x65, 0xD9, 0x49, 0x00, 0x00, 0xD5, 0x63, /* 0xC0-0xC3 */ + 0xD8, 0xFD, 0xB1, 0xA1, 0xB1, 0xA8, 0xB1, 0xAC, /* 0xC4-0xC7 */ + 0xD5, 0x5D, 0xD8, 0xF8, 0xD5, 0x61, 0xB1, 0x7B, /* 0xC8-0xCB */ + 0xD8, 0xFA, 0xD5, 0x64, 0xD8, 0xFC, 0xD5, 0x59, /* 0xCC-0xCF */ + 0x00, 0x00, 0xB4, 0x62, 0x00, 0x00, 0xD5, 0x57, /* 0xD0-0xD3 */ + 0xD5, 0x58, 0xB1, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xB1, 0xA6, 0xD5, 0x5B, 0xB1, 0xAB, 0xD5, 0x5F, /* 0xD8-0xDB */ + 0xB1, 0xA4, 0xD5, 0x5C, 0x00, 0x00, 0xB1, 0xA9, /* 0xDC-0xDF */ + 0xB4, 0x66, 0xB4, 0x63, 0xD8, 0xFB, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD5, 0x5A, 0x00, 0x00, 0xB1, 0x7D, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xB4, 0x6B, 0xB4, 0x6F, 0xD9, 0x40, 0xB7, 0x51, /* 0xF0-0xF3 */ + 0xB4, 0x6D, 0xD9, 0x44, 0xB4, 0x71, 0xDD, 0x65, /* 0xF4-0xF7 */ + 0xD9, 0x46, 0xB7, 0x53, 0xB4, 0x69, 0xB4, 0x6C, /* 0xF8-0xFB */ + 0xD9, 0x47, 0x00, 0x00, 0xD9, 0x48, 0xD9, 0x4E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_61[512] = { + 0xB4, 0x73, 0xB7, 0x54, 0x00, 0x00, 0xD9, 0x4A, /* 0x00-0x03 */ + 0xD9, 0x4F, 0xD9, 0x43, 0xB7, 0x5E, 0x00, 0x00, /* 0x04-0x07 */ + 0xB7, 0x55, 0xB4, 0x72, 0xD9, 0x41, 0xD9, 0x50, /* 0x08-0x0B */ + 0x00, 0x00, 0xB7, 0x5D, 0xB4, 0x70, 0xB7, 0x4E, /* 0x0C-0x0F */ + 0xD9, 
0x4D, 0x00, 0x00, 0xB4, 0x74, 0xD9, 0x45, /* 0x10-0x13 */ + 0xD8, 0xFE, 0xB4, 0x6A, 0xD9, 0x42, 0x00, 0x00, /* 0x14-0x17 */ + 0xD9, 0x4B, 0x00, 0x00, 0xB7, 0x4D, 0xB7, 0x52, /* 0x18-0x1B */ + 0xB4, 0x67, 0xD9, 0x4C, 0x00, 0x00, 0xB7, 0x50, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x68, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5C, /* 0x24-0x27 */ + 0xE1, 0xC3, 0xDD, 0x70, 0x00, 0x00, 0xDD, 0x68, /* 0x28-0x2B */ + 0xE1, 0xC2, 0x00, 0x00, 0xDD, 0x6C, 0xDD, 0x6E, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x6B, 0x00, 0x00, /* 0x30-0x33 */ + 0xB7, 0x5B, 0x00, 0x00, 0xDD, 0x6A, 0xB7, 0x5F, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5A, 0xBA, 0x40, /* 0x3C-0x3F */ + 0xDD, 0x71, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xB7, 0x58, 0xDD, 0x69, 0xDD, 0x6D, 0xB9, 0xFE, /* 0x44-0x47 */ + 0xB7, 0x4F, 0xDD, 0x66, 0xDD, 0x67, 0xBA, 0x41, /* 0x48-0x4B */ + 0xB7, 0x57, 0xB7, 0x59, 0xB7, 0x56, 0xDD, 0x6F, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xC9, /* 0x50-0x53 */ + 0xE1, 0xCE, 0xBC, 0x7D, 0xE1, 0xD5, 0x00, 0x00, /* 0x54-0x57 */ + 0xBA, 0x47, 0x00, 0x00, 0xBA, 0x46, 0xE1, 0xD0, /* 0x58-0x5B */ + 0x00, 0x00, 0xBC, 0x7C, 0xE1, 0xC5, 0xBA, 0x45, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE1, 0xD4, 0xBA, 0x43, 0xBA, 0x44, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xD1, 0xE5, 0xAA, 0xBC, 0x7A, /* 0x64-0x67 */ + 0xB4, 0x6E, 0x00, 0x00, 0xE1, 0xD3, 0xBC, 0xA3, /* 0x68-0x6B */ + 0xE1, 0xCB, 0x00, 0x00, 0xBC, 0x7B, 0x00, 0x00, /* 0x6C-0x6F */ + 0xBC, 0xA2, 0xE1, 0xC6, 0xE1, 0xCA, 0xE1, 0xC7, /* 0x70-0x73 */ + 0xE1, 0xCD, 0xBA, 0x48, 0xBC, 0x79, 0xBA, 0x42, /* 0x74-0x77 */ + 0x00, 0x00, 0xE5, 0x7A, 0xE1, 0xCF, 0x00, 0x00, /* 0x78-0x7B */ + 0xBC, 0xA1, 0x00, 0x00, 0xBC, 0xA4, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE1, 0xCC, 0x00, 0x00, 0xBC, 0x7E, 0xE5, 0x79, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE5, 0x7E, 0xBE, 0xCE, 0xE5, 0x78, /* 0x88-0x8B */ + 0xE9, 0xA3, 0xE5, 0xA9, 0xBC, 0xA8, 0x00, 0x00, /* 0x8C-0x8F */ + 0xBC, 0xA6, 0xBE, 0xCC, 0xE5, 0xA6, 0xE5, 0xA2, /* 0x90-0x93 */ + 0xBC, 0xAC, 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xAA, 0xE5, 0xA1, /* 0x98-0x9B */ + 0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0xE5, 0xA5, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE5, 0xA8, 0xE5, 0x7D, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xBC, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xA5, /* 0xA4-0xA7 */ + 0xE9, 0x77, 0xBE, 0xCD, 0xE5, 0xA7, 0xBC, 0xA7, /* 0xA8-0xAB */ + 0xBC, 0xA9, 0xE5, 0xA4, 0xBC, 0xAD, 0xE5, 0xA3, /* 0xAC-0xAF */ + 0xE5, 0x7C, 0xE5, 0x7B, 0xBE, 0xCB, 0xE5, 0xAB, /* 0xB0-0xB3 */ + 0xE9, 0x7A, 0xEC, 0xE0, 0xBE, 0xD0, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE9, 0xA2, 0x00, 0x00, 0xE9, 0x7E, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEC, 0xE1, 0x00, 0x00, 0xBE, 0xD1, 0xE9, 0xA1, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE9, 0x7C, 0xC0, 0xB4, 0xEC, 0xDF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE9, 0x79, 0xE9, 0x7B, 0xC0, 0xB5, /* 0xC4-0xC7 */ + 0xBE, 0xD3, 0xC0, 0xB3, 0xBE, 0xD2, 0xC0, 0xB7, /* 0xC8-0xCB */ + 0xE9, 0x7D, 0xBE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xCF, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE7, 0xEF, 0xC8, /* 0xDC-0xDF */ + 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x56, /* 0xE0-0xE3 */ + 0xEC, 0xE5, 0xEC, 0xE4, 0xC0, 0xB6, 0xEC, 0xE2, /* 0xE4-0xE7 
*/ + 0xEC, 0xE6, 0xEF, 0xD0, 0xEF, 0xCC, 0xEF, 0xCE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEF, 0xC9, 0xEF, 0xCA, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEF, 0xCD, 0xEF, 0xCB, 0xC3, 0x67, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC3, 0x6A, 0xC3, 0x69, 0xC3, 0x68, /* 0xF4-0xF7 */ + 0xC4, 0x61, 0xF4, 0x4A, 0xC4, 0x62, 0xF2, 0x41, /* 0xF8-0xFB */ + 0xC4, 0xDF, 0xF5, 0xCC, 0xC4, 0xE0, 0xC5, 0x74, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0xC5, 0xCA, 0xF7, 0xD9, 0x00, 0x00, 0xF7, 0xDA, /* 0x00-0x03 */ + 0xF7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBA, /* 0x04-0x07 */ + 0xA4, 0xE0, 0xC9, 0x7C, 0xA5, 0xB3, 0x00, 0x00, /* 0x08-0x0B */ + 0xA6, 0xA6, 0xA6, 0xA7, 0xA6, 0xA5, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA6, 0xA8, 0xA7, 0xDA, 0xA7, 0xD9, 0x00, 0x00, /* 0x10-0x13 */ + 0xCC, 0xB1, 0xA9, 0xCF, 0xA9, 0xCE, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD1, 0xAF, 0xB1, 0xAD, 0xB1, 0xAE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x75, /* 0x1C-0x1F */ + 0xDD, 0x72, 0xB7, 0x60, 0xB7, 0x61, 0xDD, 0x74, /* 0x20-0x23 */ + 0xDD, 0x76, 0xDD, 0x75, 0x00, 0x00, 0xE1, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0xE1, 0xD6, 0xBA, 0x49, 0xE1, 0xD8, /* 0x28-0x2B */ + 0x00, 0x00, 0xE5, 0xAC, 0xBC, 0xAE, 0x00, 0x00, /* 0x2C-0x2F */ + 0xBE, 0xD4, 0x00, 0x00, 0xC0, 0xB8, 0xC2, 0x57, /* 0x30-0x33 */ + 0xC0, 0xB9, 0x00, 0x00, 0xA4, 0xE1, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xCC, 0xB2, 0xA9, 0xD1, 0xA9, 0xD0, /* 0x3C-0x3F */ + 0xA9, 0xD2, 0xAB, 0xF3, 0xCE, 0xD2, 0xCE, 0xD3, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB0, 0xAE, 0xB0, /* 0x44-0x47 */ + 0xB1, 0xAF, 0xB4, 0x76, 0xD9, 0x51, 0xA4, 0xE2, /* 0x48-0x4B */ + 0x00, 0x00, 0xA4, 0x7E, 0xA4, 0xE3, 0x00, 0x00, /* 0x4C-0x4F */ + 0xC9, 0x7D, 0xA5, 0xB7, 0xA5, 0xB6, 0xA5, 0xB4, /* 0x50-0x53 */ + 0xA5, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xA6, 0xAB, 0xC9, 0xE9, 0xC9, 0xEB, 0xA6, 0xAA, /* 0x58-0x5B */ + 0xC9, 0xE3, 0x00, 0x00, 0xC9, 0xE4, 0x00, 0x00, /* 0x5C-0x5F */ + 0xC9, 0xEA, 0xC9, 0xE6, 0xC9, 0xE8, 0xA6, 0xA9, /* 0x60-0x63 */ + 0xC9, 0xE5, 0xC9, 0xEC, 0xC9, 0xE7, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xA7, 0xE1, 0xA7, 0xEA, 0xA7, 0xE8, /* 0x6C-0x6F */ + 0xCA, 0xF0, 0xCA, 0xED, 0xCA, 0xF5, 0xA7, 0xE6, /* 0x70-0x73 */ + 0xCA, 0xF6, 0x00, 0x00, 0xA7, 0xDF, 0xCA, 0xF3, /* 0x74-0x77 */ + 0x00, 0x00, 0xA7, 0xE5, 0xCA, 0xEF, 0xCA, 0xEE, /* 0x78-0x7B */ + 0xA7, 0xE3, 0xCA, 0xF4, 0xA7, 0xE4, 0xA9, 0xD3, /* 0x7C-0x7F */ + + 0xA7, 0xDE, 0xCA, 0xF1, 0x00, 0x00, 0xCA, 0xE7, /* 0x80-0x83 */ + 0xA7, 0xDB, 0x00, 0x00, 0xA7, 0xEE, 0xCA, 0xEC, /* 0x84-0x87 */ + 0xCA, 0xF2, 0xA7, 0xE0, 0xA7, 0xE2, 0x00, 0x00, /* 0x88-0x8B */ + 0xCA, 0xE8, 0x00, 0x00, 0xCA, 0xE9, 0xCA, 0xEA, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA7, 0xED, 0xA7, 0xE7, 0xA7, 0xEC, /* 0x90-0x93 */ + 0xCA, 0xEB, 0xA7, 0xEB, 0xA7, 0xDD, 0xA7, 0xDC, /* 0x94-0x97 */ + 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA9, 0xE1, 0xCC, 0xBE, 0xCC, 0xB7, 0xA9, 0xDC, /* 0xA8-0xAB */ + 0xA9, 0xEF, 0xCC, 0xB3, 0xCC, 0xBA, 0xCC, 0xBC, /* 0xAC-0xAF */ + 0xCC, 0xBF, 0xA9, 0xEA, 0x00, 0x00, 0xCC, 0xBB, /* 0xB0-0xB3 */ + 0xCC, 0xB4, 0xA9, 0xE8, 0xCC, 0xB8, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCC, 0xC0, 0xA9, 0xD9, 0x00, 0x00, 0xCC, 0xBD, /* 0xB8-0xBB */ + 
0xA9, 0xE3, 0xA9, 0xE2, 0xCC, 0xB6, 0xA9, 0xD7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xD8, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA9, 0xD6, 0x00, 0x00, 0xA9, 0xEE, 0xA9, 0xE6, /* 0xC4-0xC7 */ + 0xA9, 0xE0, 0xA9, 0xD4, 0xCC, 0xB9, 0xA9, 0xDF, /* 0xC8-0xCB */ + 0xA9, 0xD5, 0xA9, 0xE7, 0xA9, 0xF0, 0xCE, 0xD4, /* 0xCC-0xCF */ + 0xA9, 0xE4, 0xCC, 0xB5, 0xA9, 0xDA, 0xA9, 0xDD, /* 0xD0-0xD3 */ + 0xA9, 0xDE, 0x00, 0x00, 0xA9, 0xEC, 0xA9, 0xED, /* 0xD4-0xD7 */ + 0xA9, 0xEB, 0xA9, 0xE5, 0xA9, 0xE9, 0xA9, 0xDB, /* 0xD8-0xDB */ + 0xAB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDA, /* 0xE8-0xEB */ + 0xAC, 0x41, 0xAB, 0xF8, 0xAB, 0xFA, 0xAC, 0x40, /* 0xEC-0xEF */ + 0xCE, 0xE6, 0xAB, 0xFD, 0xD1, 0xB1, 0xAE, 0xB1, /* 0xF0-0xF3 */ + 0xAC, 0x43, 0xCE, 0xD7, 0xCE, 0xDF, 0xAB, 0xFE, /* 0xF4-0xF7 */ + 0xCE, 0xDE, 0xCE, 0xDB, 0xCE, 0xE3, 0xCE, 0xE5, /* 0xF8-0xFB */ + 0xAB, 0xF7, 0xAB, 0xFB, 0xAC, 0x42, 0xAE, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0xCE, 0xE0, 0xAB, 0xF9, 0xAC, 0x45, 0xCE, 0xD9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xFC, /* 0x04-0x07 */ + 0xAE, 0xB2, 0xAB, 0xF6, 0x00, 0x00, 0xCE, 0xD6, /* 0x08-0x0B */ + 0xCE, 0xDD, 0xCE, 0xD5, 0xCE, 0xD8, 0xCE, 0xDC, /* 0x0C-0x0F */ + 0xD1, 0xB2, 0xAC, 0x44, 0x00, 0x00, 0xCE, 0xE1, /* 0x10-0x13 */ + 0xCE, 0xE2, 0xCE, 0xE4, 0xAB, 0xF5, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xAE, 0xC1, 0xD1, 0xBE, 0xAE, 0xBF, 0xAE, 0xC0, /* 0x28-0x2B */ + 0xD1, 0xB4, 0xD1, 0xC4, 0x00, 0x00, 0xAE, 0xB6, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x66, 0xD1, 0xC6, /* 0x30-0x33 */ + 0xD1, 0xC0, 0x00, 0x00, 0xD1, 0xB7, 0x00, 0x00, /* 0x34-0x37 */ + 0xD1, 0xC9, 0xD1, 0xBA, 0xAE, 0xBC, 0xD5, 0x7D, /* 0x38-0x3B */ + 0xD1, 0xBD, 0xAE, 0xBE, 0xAE, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD1, 0xCB, 0xD1, 0xBF, 0xAE, 0xB8, 0xD1, 0xB8, /* 0x40-0x43 */ + 0xD1, 0xB5, 0xD1, 0xB6, 0xAE, 0xB9, 0xD1, 0xC5, /* 0x44-0x47 */ + 0xD1, 0xCC, 0xAE, 0xBB, 0xD1, 0xBC, 0xD1, 0xBB, /* 0x48-0x4B */ + 0xAE, 0xC3, 0xAE, 0xC2, 0xAE, 0xB4, 0xAE, 0xBA, /* 0x4C-0x4F */ + 0xAE, 0xBD, 0xD1, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD1, 0xC2, 0xAE, 0xB7, 0xD1, 0xB3, 0xD1, 0xCA, /* 0x54-0x57 */ + 0xD1, 0xC1, 0xD1, 0xC3, 0xD1, 0xC7, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD5, 0x67, 0x00, 0x00, 0xB1, 0xB7, /* 0x64-0x67 */ + 0xB1, 0xCB, 0xB1, 0xCA, 0x00, 0x00, 0xB1, 0xBF, /* 0x68-0x6B */ + 0x00, 0x00, 0xD5, 0x79, 0xD5, 0x75, 0xD5, 0x72, /* 0x6C-0x6F */ + 0xD5, 0xA6, 0xB1, 0xBA, 0xB1, 0xB2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD5, 0x77, 0xB4, 0xA8, 0xB1, 0xB6, /* 0x74-0x77 */ + 0xD5, 0xA1, 0x00, 0x00, 0xB1, 0xCC, 0xB1, 0xC9, /* 0x78-0x7B */ + 0xD5, 0x7B, 0xD5, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xB1, 0xC8, 0xD5, 0xA3, 0xD5, 0x69, 0xB1, 0xBD, /* 0x80-0x83 */ + 0xB1, 0xC1, 0xD5, 0xA2, 0x00, 0x00, 0xD5, 0x73, /* 0x84-0x87 */ + 0xB1, 0xC2, 0xB1, 0xBC, 0xD5, 0x68, 0x00, 0x00, /* 0x88-0x8B */ + 0xB4, 0x78, 0xD5, 0xA5, 0xD5, 0x71, 0xB1, 0xC7, /* 0x8C-0x8F */ + 
0xD5, 0x74, 0xD5, 0xA4, 0xB1, 0xC6, 0x00, 0x00, /* 0x90-0x93 */ + 0xD9, 0x52, 0x00, 0x00, 0xB1, 0xB3, 0xD5, 0x6F, /* 0x94-0x97 */ + 0xB1, 0xB8, 0xB1, 0xC3, 0x00, 0x00, 0xB1, 0xBE, /* 0x98-0x9B */ + 0xD5, 0x78, 0xD5, 0x6E, 0xD5, 0x6C, 0xD5, 0x7E, /* 0x9C-0x9F */ + 0xB1, 0xB0, 0xB1, 0xC4, 0xB1, 0xB4, 0xB4, 0x77, /* 0xA0-0xA3 */ + 0xD5, 0x7C, 0xB1, 0xB5, 0x00, 0x00, 0xB1, 0xB1, /* 0xA4-0xA7 */ + 0xB1, 0xC0, 0xB1, 0xBB, 0xB1, 0xB9, 0xD5, 0x70, /* 0xA8-0xAB */ + 0xB1, 0xC5, 0xD5, 0x6D, 0xD5, 0x7A, 0xD5, 0x76, /* 0xAC-0xAF */ + 0xD9, 0x54, 0xD9, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD5, 0x6B, 0xD9, 0x64, 0x00, 0x00, /* 0xBC-0xBF */ + 0xB4, 0x7A, 0x00, 0x00, 0xD9, 0x6A, 0xD9, 0x59, /* 0xC0-0xC3 */ + 0xD9, 0x67, 0xDD, 0x77, 0xB4, 0x7D, 0xD9, 0x6B, /* 0xC4-0xC7 */ + 0xD9, 0x6E, 0xB4, 0x7C, 0xD9, 0x5C, 0xD9, 0x6D, /* 0xC8-0xCB */ + 0xD9, 0x6C, 0xB4, 0x7E, 0xD9, 0x55, 0xB4, 0x79, /* 0xCC-0xCF */ + 0xB4, 0xA3, 0x00, 0x00, 0xB4, 0xA1, 0xD9, 0x69, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xD9, 0x5F, 0xB4, 0xA5, 0xD9, 0x70, /* 0xD4-0xD7 */ + 0xD9, 0x68, 0xD9, 0x71, 0xB4, 0xAD, 0xB4, 0xAB, /* 0xD8-0xDB */ + 0xD9, 0x66, 0xD9, 0x65, 0x00, 0x00, 0xD9, 0x63, /* 0xDC-0xDF */ + 0xD9, 0x5D, 0xB4, 0xA4, 0x00, 0x00, 0xB4, 0xA2, /* 0xE0-0xE3 */ + 0xD1, 0xB9, 0xD9, 0x56, 0x00, 0x00, 0xDD, 0xB7, /* 0xE4-0xE7 */ + 0xD9, 0x57, 0xB4, 0x7B, 0xB4, 0xAA, 0xDD, 0x79, /* 0xE8-0xEB */ + 0x00, 0x00, 0xB4, 0xA6, 0xB4, 0xA7, 0xD9, 0x58, /* 0xEC-0xEF */ + 0xD9, 0x6F, 0xDD, 0x78, 0xD9, 0x60, 0xD9, 0x5B, /* 0xF0-0xF3 */ + 0xB4, 0xA9, 0xD9, 0x61, 0xD9, 0x5E, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x70, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xDD, 0x7C, 0xDD, 0xB1, 0xDD, 0xB6, /* 0x08-0x0B */ + 0xDD, 0xAA, 0xB7, 0x6C, 0xDD, 0xBB, 0xB7, 0x69, /* 0x0C-0x0F */ + 0xDD, 0x7A, 0x00, 0x00, 0xDD, 0x7B, 0xB7, 0x62, /* 0x10-0x13 */ + 0xB7, 0x6B, 0xDD, 0xA4, 0xB7, 0x6E, 0xB7, 0x6F, /* 0x14-0x17 */ + 0xDD, 0xA5, 0x00, 0x00, 0xDD, 0xB2, 0xDD, 0xB8, /* 0x18-0x1B */ + 0xB7, 0x6A, 0x00, 0x00, 0xB7, 0x64, 0xDD, 0xA3, /* 0x1C-0x1F */ + 0xDD, 0x7D, 0xDD, 0xBA, 0xDD, 0xA8, 0xDD, 0xA9, /* 0x20-0x23 */ + 0xDD, 0x7E, 0xDD, 0xB4, 0xDD, 0xAB, 0xDD, 0xB5, /* 0x24-0x27 */ + 0xDD, 0xAD, 0x00, 0x00, 0xB7, 0x65, 0xE1, 0xD9, /* 0x28-0x2B */ + 0xB7, 0x68, 0xB7, 0x66, 0xDD, 0xB9, 0xDD, 0xB0, /* 0x2C-0x2F */ + 0xDD, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA1, /* 0x30-0x33 */ + 0xBA, 0x53, 0xDD, 0xAF, 0xB7, 0x6D, 0xDD, 0xA7, /* 0x34-0x37 */ + 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB7, 0x67, 0xB7, 0x63, 0xE1, 0xEE, /* 0x3C-0x3F */ + 0xDD, 0xB3, 0xDD, 0xAE, 0x00, 0x00, 0xDD, 0xA2, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE9, /* 0x48-0x4B */ + 0x00, 0x00, 0xE1, 0xDA, 0xE1, 0xE5, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE1, 0xEC, 0xBA, 0x51, 0xB4, 0xAC, 0xE1, 0xEA, /* 0x50-0x53 */ + 0xBA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xBA, 0x4B, 0xE1, 0xF1, 0x00, 0x00, 0xE1, 0xDB, /* 0x58-0x5B */ + 0xE1, 0xE8, 0xE1, 0xDC, 0xE1, 0xE7, 0xBA, 0x4F, /* 0x5C-0x5F */ + 0xE1, 0xEB, 0xD9, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0xE1, 0xF2, 0xE1, 0xE3, 0xBA, 0x52, /* 0x64-0x67 */ + 0xE5, 0xBA, 0xBC, 0xAF, 0x00, 0x00, 0xE1, 0xF0, /* 0x68-0x6B */ + 0xE1, 0xEF, 0xBA, 0x54, 0xE5, 0xAD, 0xBC, 0xB0, /* 0x6C-0x6F */ + 0xE5, 0xAE, 0x00, 0x00, 0xE1, 0xDF, 0xE1, 0xE0, /* 0x70-0x73 */ + 0xE1, 0xDD, 0xE1, 0xE2, 0xE1, 0xDE, 0xE1, 0xF3, /* 0x74-0x77 */ + 0xBA, 0x4E, 0xBC, 0xB1, 0xBA, 0x50, 0xBA, 0x55, /* 0x78-0x7B */ + 0x00, 0x00, 0xE1, 0xE1, 0x00, 0x00, 0xE1, 0xED, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE5, 0xB1, 0x00, 0x00, 0xBA, 0x4A, /* 0x84-0x87 */ + 0xBC, 0xB4, 0xE9, 0xAA, 0xE5, 0xB6, 0xE5, 0xB5, /* 0x88-0x8B */ + 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB4, /* 0x8C-0x8F */ + 0xBC, 0xB5, 0x00, 0x00, 0xBC, 0xBB, 0xBC, 0xB8, /* 0x90-0x93 */ + 0x00, 0x00, 0xBC, 0xB9, 0xE5, 0xAF, 0xE5, 0xB2, /* 0x94-0x97 */ + 0xE5, 0xBC, 0xBC, 0xC1, 0xBC, 0xBF, 0x00, 0x00, /* 0x98-0x9B */ + 0xE5, 0xB3, 0xD9, 0x5A, 0xBC, 0xB2, 0xE5, 0xB9, /* 0x9C-0x9F */ + 0xE5, 0xB0, 0x00, 0x00, 0xBC, 0xC2, 0xE5, 0xB8, /* 0xA0-0xA3 */ + 0xBA, 0x4D, 0xBC, 0xB7, 0xE1, 0xE4, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xBC, 0xBA, 0x00, 0x00, 0xBC, 0xBE, /* 0xA8-0xAB */ + 0xBC, 0xC0, 0xBC, 0xBD, 0xBC, 0xBC, 0x00, 0x00, /* 0xAC-0xAF */ + 0xBC, 0xB6, 0xE5, 0xBB, 0xBC, 0xB3, 0xBC, 0xC3, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xD8, /* 0xB8-0xBB */ + 0xBE, 0xD9, 0xE9, 0xA9, 0xBE, 0xE2, 0xBE, 0xDF, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBE, 0xD6, 0xBE, 0xDD, 0xE9, 0xAB, /* 0xC0-0xC3 */ + 0xBE, 0xDB, 0xBE, 0xD5, 0x00, 0x00, 0xBE, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE9, 0xA8, 0xC0, 0xBB, 0xBE, 0xD7, /* 0xC8-0xCB */ + 0x00, 0x00, 0xBE, 0xDE, 0xC0, 0xBA, 0xE9, 0xA7, /* 0xCC-0xCF */ + 0xE9, 0xA6, 0x00, 0x00, 0xBE, 0xE0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xBE, 0xE1, 0x00, 0x00, 0xE9, 0xA5, 0xE9, 0xA4, /* 0xD4-0xD7 */ + 0xC0, 0xBC, 0xE9, 0xAE, 0xBE, 0xDA, 0xE9, 0xAC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xC0, 0xBD, 0x00, 0x00, 0xC0, 0xC2, 0xEC, 0xEA, /* 0xE0-0xE3 */ + 0xEC, 0xEC, 0x00, 0x00, 0xC0, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEC, 0xED, 0xEC, 0xE9, 0x00, 0x00, 0xEC, 0xEB, /* 0xE8-0xEB */ + 0xC0, 0xC0, 0xC0, 0xC3, 0x00, 0x00, 0xEC, 0xE8, /* 0xEC-0xEF */ + 0xC0, 0xBE, 0xC0, 0xC1, 0xC2, 0x59, 0xE9, 0xAD, /* 0xF0-0xF3 */ + 0xC2, 0x58, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x5E, /* 0xF4-0xF7 */ + 0xEF, 0xD4, 0x00, 0x00, 0xC2, 0x5C, 0xC2, 0x5D, /* 0xF8-0xFB */ + 0xEF, 0xD7, 0xEF, 0xD3, 0xC2, 0x5A, 0xEF, 0xD1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xC3, 0x6B, 0xEF, 0xD5, 0x00, 0x00, 0xEF, 0xD6, /* 0x00-0x03 */ + 0xEF, 0xD2, 0x00, 0x00, 0xC2, 0x5B, 0xF2, 0x42, /* 0x04-0x07 */ + 0x00, 0x00, 0xF2, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF2, 0x46, 0xF2, 0x44, 0xF2, 0x47, 0xC3, 0x6C, /* 0x0C-0x0F */ + 0xF2, 0x43, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x4E, /* 0x10-0x13 */ + 0xC4, 0x64, 0xF4, 0x4D, 0xF4, 0x4C, 0xF4, 0x4B, /* 0x14-0x17 */ + 0xC4, 0x63, 0xC4, 0x65, 0x00, 0x00, 0xF5, 0xCD, /* 0x18-0x1B */ + 0xC4, 0xE2, 0xC4, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF6, 0xE1, 0xF6, 0xE0, 0xF6, 0xE3, 0xC5, 0xCB, /* 0x20-0x23 */ + 0xC5, 0x75, 0xF7, 0xDD, 0xF6, 0xE2, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xF7, 0xDC, 0xC5, 0xCD, 0xC5, 0xCC, /* 0x28-0x2B */ + 0xC5, 0xF3, 0xF8, 0xA9, 0xF8, 0xEF, 0xA4, 0xE4, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0x72, 0xE9, 0xAF, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xAC, 0xCA, 0xF7, /* 0x34-0x37 */ + 0xA7, 
0xF1, 0xA7, 0xEF, 0x00, 0x00, 0xA7, 0xF0, /* 0x38-0x3B */ + 0x00, 0x00, 0xCC, 0xC1, 0xA9, 0xF1, 0xAC, 0x46, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCE, 0xE7, 0x00, 0x00, 0xCE, 0xE8, /* 0x40-0x43 */ + 0x00, 0x00, 0xAC, 0x47, 0xD1, 0xCE, 0x00, 0x00, /* 0x44-0x47 */ + 0xAE, 0xC4, 0xAE, 0xC5, 0xD1, 0xCD, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xD3, /* 0x4C-0x4F */ + 0x00, 0x00, 0xB1, 0xCF, 0x00, 0x00, 0xD5, 0xA7, /* 0x50-0x53 */ + 0xB1, 0xD6, 0xB1, 0xD5, 0xB1, 0xCE, 0xB1, 0xD1, /* 0x54-0x57 */ + 0xB1, 0xD4, 0xB1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD9, 0x76, 0xB1, 0xCD, 0xB4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0xB1, 0xB4, 0xB2, /* 0x60-0x63 */ + 0xD9, 0x75, 0xD9, 0x78, 0xB4, 0xB0, 0xD9, 0x73, /* 0x64-0x67 */ + 0xD9, 0x77, 0x00, 0x00, 0xD9, 0x74, 0x00, 0x00, /* 0x68-0x6B */ + 0xB7, 0x71, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBC, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0x56, 0xE1, 0xF4, /* 0x70-0x73 */ + 0xBE, 0xE3, 0xBC, 0xC4, 0xE5, 0xBD, 0xBC, 0xC5, /* 0x74-0x77 */ + 0xBC, 0xC6, 0xE5, 0xBF, 0xE5, 0xBE, 0xE5, 0xC0, /* 0x78-0x7B */ + 0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, /* 0x7C-0x7F */ + + 0xEC, 0xEF, 0xEC, 0xEE, 0xC0, 0xC4, 0xC0, 0xC5, /* 0x80-0x83 */ + 0xF2, 0x48, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xB4, 0xB4, 0xB4, 0xB3, 0xDD, 0xBD, 0x00, 0x00, /* 0x90-0x93 */ + 0xEF, 0xD8, 0xC4, 0xE3, 0xF7, 0xDE, 0xA4, 0xE6, /* 0x94-0x97 */ + 0x00, 0x00, 0xAE, 0xC6, 0x00, 0x00, 0xB1, 0xD8, /* 0x98-0x9B */ + 0xB1, 0xD7, 0xD9, 0x7A, 0xD9, 0x7B, 0xB7, 0x72, /* 0x9C-0x9F */ + 0xE1, 0xF5, 0xBA, 0x57, 0xE9, 0xB2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA4, 0xE7, 0xA5, 0xB8, 0x00, 0x00, 0xA9, 0xF2, /* 0xA4-0xA7 */ + 0xCC, 0xC2, 0x00, 0x00, 0xCE, 0xE9, 0xAC, 0x48, /* 0xA8-0xAB */ + 0xB1, 0xD9, 0x00, 0x00, 0xD9, 0x7C, 0xB4, 0xB5, /* 0xAC-0xAF */ + 0xB7, 0x73, 0x00, 0x00, 0xE5, 0xC1, 0xE5, 0xC2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF0, 0xC2, 0x5F, /* 0xB4-0xB7 */ + 0xF8, 0xF0, 0xA4, 0xE8, 0x00, 0x00, 0xCC, 0xC3, /* 0xB8-0xBB */ + 0xA9, 0xF3, 0xAC, 0x49, 0x00, 0x00, 0xCE, 0xEA, /* 0xBC-0xBF */ + 0x00, 0x00, 0xAE, 0xC7, 0xD1, 0xD2, 0xD1, 0xD0, /* 0xC0-0xC3 */ + 0xD1, 0xD1, 0xAE, 0xC8, 0xD1, 0xCF, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xDB, /* 0xC8-0xCB */ + 0xB1, 0xDC, 0xD5, 0xA8, 0xB1, 0xDD, 0xB1, 0xDA, /* 0xCC-0xCF */ + 0xD9, 0x7D, 0x00, 0x00, 0xD9, 0x7E, 0xDD, 0xBE, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0x59, 0xBA, 0x58, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF1, 0xEF, 0xD9, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF2, 0x4A, 0xF2, 0x49, 0xF4, 0x4F, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC9, 0x5E, 0xAC, 0x4A, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA4, 0xE9, 0xA5, 0xB9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA6, 0xAE, 0xA6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xA6, 0xAF, 0xA6, 0xB0, 0xC9, 0xEE, 0xC9, 0xED, /* 0xEC-0xEF */ + 0xCA, 0xF8, 0xA7, 0xF2, 0xCA, 0xFB, 0xCA, 0xFA, /* 0xF0-0xF3 */ + 0xCA, 0xF9, 0xCA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xF4, 0xCC, 0xC9, /* 0xF8-0xFB */ + 0xCC, 0xC5, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0xA9, 0xFB, 0x00, 0x00, 0xA9, 0xF9, 0xCC, 0xCA, /* 0x00-0x03 */ + 0xCC, 0xC6, 0xCC, 0xCD, 0xA9, 0xF8, 0xAA, 0x40, /* 0x04-0x07 */ + 0xCC, 0xC8, 0xCC, 0xC4, 0xA9, 0xFE, 0xCC, 0xCB, /* 0x08-0x0B */ + 0xA9, 
0xF7, 0xCC, 0xCC, 0xA9, 0xFA, 0xA9, 0xFC, /* 0x0C-0x0F */ + 0xCC, 0xD0, 0xCC, 0xCF, 0xCC, 0xC7, 0xA9, 0xF6, /* 0x10-0x13 */ + 0xA9, 0xF5, 0xA9, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xCE, 0xEF, 0xCE, 0xF5, 0x00, 0x00, 0xAC, 0x50, /* 0x1C-0x1F */ + 0xAC, 0x4D, 0xCE, 0xEC, 0xCE, 0xF1, 0x00, 0x00, /* 0x20-0x23 */ + 0xAC, 0x53, 0xAC, 0x4B, 0xCE, 0xF0, 0xAC, 0x4E, /* 0x24-0x27 */ + 0xAC, 0x51, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF3, /* 0x28-0x2B */ + 0x00, 0x00, 0xAC, 0x4C, 0xCE, 0xF8, 0xAC, 0x4F, /* 0x2C-0x2F */ + 0x00, 0x00, 0xAC, 0x52, 0xCE, 0xED, 0xCE, 0xF2, /* 0x30-0x33 */ + 0xCE, 0xF6, 0xCE, 0xEE, 0xCE, 0xEB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xCE, 0xF7, 0xCE, 0xF4, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xAE, 0xD0, 0xAE, 0xC9, 0xAE, 0xCC, /* 0x40-0x43 */ + 0x00, 0x00, 0xAE, 0xCF, 0x00, 0x00, 0xD1, 0xD5, /* 0x44-0x47 */ + 0x00, 0x00, 0xAE, 0xCA, 0xD1, 0xD3, 0x00, 0x00, /* 0x48-0x4B */ + 0xAE, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xAE, 0xCB, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD1, 0xD6, 0xAE, 0xCD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD5, 0xAC, 0xB1, 0xDF, 0xD5, 0xAB, /* 0x58-0x5B */ + 0xD5, 0xAD, 0xB1, 0xDE, 0xB1, 0xE3, 0xD1, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD5, 0xAA, 0xD5, 0xAE, 0x00, 0x00, /* 0x60-0x63 */ + 0xB1, 0xE0, 0xD5, 0xA9, 0xB1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */ + 0xB1, 0xE1, 0x00, 0x00, 0xD9, 0xA7, 0x00, 0x00, /* 0x68-0x6B */ + 0xD9, 0xA2, 0x00, 0x00, 0xB4, 0xB6, 0xB4, 0xBA, /* 0x6C-0x6F */ + 0xB4, 0xB7, 0xD9, 0xA5, 0xD9, 0xA8, 0x00, 0x00, /* 0x70-0x73 */ + 0xB4, 0xB8, 0x00, 0x00, 0xB4, 0xB9, 0xB4, 0xBE, /* 0x74-0x77 */ + 0xDD, 0xC7, 0xD9, 0xA6, 0xB4, 0xBC, 0xD9, 0xA3, /* 0x78-0x7B */ + 0xD9, 0xA1, 0x00, 0x00, 0xB4, 0xBD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xB7, 0x79, 0x00, 0x00, 0xDD, 0xBF, 0xB7, 0x76, /* 0x84-0x87 */ + 0xB7, 0x77, 0xB7, 0x75, 0xDD, 0xC4, 0xDD, 0xC3, /* 0x88-0x8B */ + 0xDD, 0xC0, 0xB7, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDD, 0xC2, 0xB4, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xDD, 0xC6, 0xDD, 0xC1, 0xB7, 0x78, 0xB7, 0x74, /* 0x94-0x97 */ + 0xB7, 0x7A, 0xDD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xBA, 0x5C, 0x00, 0x00, 0xE1, 0xF8, /* 0x9C-0x9F */ + 0xE1, 0xF7, 0xE1, 0xF6, 0xBA, 0x5A, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBA, 0x5B, 0xE5, 0xC5, 0xE5, 0xC8, 0xBC, 0xC8, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xC7, 0xE5, 0xC9, /* 0xAC-0xAF */ + 0xE5, 0xC4, 0xBC, 0xCA, 0xE5, 0xC6, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBC, 0xC9, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC7, /* 0xB4-0xB7 */ + 0xBE, 0xE9, 0xBE, 0xE6, 0xE9, 0xBB, 0xE9, 0xBA, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xB4, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBE, 0xE7, 0x00, 0x00, 0xBE, 0xE4, 0xBE, 0xE8, /* 0xC4-0xC7 */ + 0xE9, 0xB3, 0xBE, 0xE5, 0xE9, 0xB6, 0xE9, 0xB7, /* 0xC8-0xCB */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB8, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xC7, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEF, 0xDC, 0xC0, 0xC6, 0xEF, 0xDA, 0xEF, 0xDB, /* 0xD8-0xDB */ + 0xC2, 0x60, 0xC3, 0x6E, 0xF2, 0x4B, 0x00, 0x00, /* 0xDC-0xDF */ + 0xC3, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x51, /* 0xE0-0xE3 
*/ + 0xF4, 0x52, 0x00, 0x00, 0xC4, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF4, 0x50, 0xC4, 0xE4, 0x00, 0x00, 0xF7, 0xDF, /* 0xE8-0xEB */ + 0xC5, 0xCE, 0xF8, 0xAA, 0xF8, 0xAB, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA4, 0xEA, 0x00, 0x00, 0xA6, 0xB1, 0xA6, 0xB2, /* 0xF0-0xF3 */ + 0xA7, 0xF3, 0x00, 0x00, 0xCC, 0xD1, 0xAC, 0x54, /* 0xF4-0xF7 */ + 0xAE, 0xD1, 0xB1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xB0, 0xD2, 0x00, 0x00, 0xB4, 0xBF, 0xB4, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xB3, 0xCC, 0xD9, 0xA9, 0x00, 0x00, 0xB7, 0x7C, /* 0x00-0x03 */ + 0xE1, 0xFA, 0xE1, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA4, 0xEB, 0xA6, 0xB3, 0xCC, 0xD2, 0xAA, 0x42, /* 0x08-0x0B */ + 0x00, 0x00, 0xAA, 0x41, 0x00, 0x00, 0xCE, 0xF9, /* 0x0C-0x0F */ + 0xCE, 0xFA, 0x00, 0x00, 0xD1, 0xD7, 0xD1, 0xD8, /* 0x10-0x13 */ + 0xAE, 0xD2, 0xAE, 0xD3, 0x00, 0x00, 0xAE, 0xD4, /* 0x14-0x17 */ + 0xD5, 0xAF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xE6, /* 0x18-0x1B */ + 0x00, 0x00, 0xB4, 0xC2, 0x00, 0x00, 0xB4, 0xC1, /* 0x1C-0x1F */ + 0xDD, 0xC8, 0xDF, 0x7A, 0xE1, 0xFB, 0xE9, 0xBD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x61, 0xC4, 0x67, /* 0x24-0x27 */ + 0xA4, 0xEC, 0x00, 0x00, 0xA5, 0xBC, 0xA5, 0xBD, /* 0x28-0x2B */ + 0xA5, 0xBB, 0xA5, 0xBE, 0xA5, 0xBA, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA6, 0xB6, 0x00, 0x00, 0xC9, 0xF6, /* 0x30-0x33 */ + 0xA6, 0xB5, 0xA6, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC9, 0xF1, 0xC9, 0xF0, 0xC9, 0xF3, 0xC9, 0xF2, /* 0x38-0x3B */ + 0xC9, 0xF5, 0xA6, 0xB4, 0xC9, 0xEF, 0xC9, 0xF4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0xFD, 0xA7, 0xFD, 0xCA, 0xFE, /* 0x44-0x47 */ + 0xCB, 0x43, 0xA7, 0xFC, 0x00, 0x00, 0xCB, 0x47, /* 0x48-0x4B */ + 0xCB, 0x42, 0xCB, 0x45, 0xA7, 0xF5, 0xA7, 0xF6, /* 0x4C-0x4F */ + 0xA7, 0xF7, 0xA7, 0xF8, 0x00, 0x00, 0xA8, 0x40, /* 0x50-0x53 */ + 0x00, 0x00, 0xCB, 0x41, 0xA7, 0xFA, 0xA8, 0x41, /* 0x54-0x57 */ + 0x00, 0x00, 0xCB, 0x40, 0xCB, 0x46, 0x00, 0x00, /* 0x58-0x5B */ + 0xA7, 0xF9, 0xCB, 0x44, 0xA7, 0xFB, 0xA7, 0xF4, /* 0x5C-0x5F */ + 0xA7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0x57, 0x00, 0x00, /* 0x68-0x6B */ + 0xCC, 0xD4, 0xAA, 0x43, 0x00, 0x00, 0xAA, 0x4D, /* 0x6C-0x6F */ + 0xAA, 0x4E, 0xAA, 0x46, 0xAA, 0x58, 0xAA, 0x48, /* 0x70-0x73 */ + 0xCC, 0xDC, 0xAA, 0x53, 0xCC, 0xD7, 0xAA, 0x49, /* 0x74-0x77 */ + 0xCC, 0xE6, 0xCC, 0xE7, 0xCC, 0xDF, 0xCC, 0xD8, /* 0x78-0x7B */ + 0xAA, 0x56, 0xCC, 0xE4, 0xAA, 0x51, 0xAA, 0x4F, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xCC, 0xE5, 0x00, 0x00, 0xCC, 0xE3, /* 0x80-0x83 */ + 0xCC, 0xDB, 0xCC, 0xD3, 0xCC, 0xDA, 0xAA, 0x4A, /* 0x84-0x87 */ + 0x00, 0x00, 0xAA, 0x50, 0x00, 0x00, 0xAA, 0x44, /* 0x88-0x8B */ + 0xCC, 0xDE, 0xCC, 0xDD, 0xCC, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */ + 0xAA, 0x52, 0xCC, 0xE1, 0xCC, 0xD6, 0xAA, 0x55, /* 0x90-0x93 */ + 0xCC, 0xE8, 0xAA, 0x45, 0x00, 0x00, 0xAA, 0x4C, /* 0x94-0x97 */ + 0xCC, 0xD9, 0xCC, 0xE2, 0xAA, 0x54, 0x00, 0x00, /* 0x98-0x9B */ + 0xAA, 0x47, 0xAA, 0x4B, 0x00, 0x00, 0xCC, 0xE0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x5B, 0xAC, 0x5C, /* 0xAC-0xAF */ + 0xAC, 0x69, 0x00, 0x00, 0xCF, 0x56, 0xCF, 0x4C, /* 0xB0-0xB3 */ + 0xAC, 0x62, 0xCF, 0x4A, 0xAC, 0x5B, 0xCF, 0x45, /* 0xB4-0xB7 */ + 
0xAC, 0x65, 0xCF, 0x52, 0xCE, 0xFE, 0xCF, 0x41, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCF, 0x44, 0xCE, 0xFB, 0xCF, 0x51, 0xCF, 0x61, /* 0xC0-0xC3 */ + 0xAC, 0x60, 0xCF, 0x46, 0xCF, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xCE, 0xFD, 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x63, /* 0xC8-0xCB */ + 0xCF, 0x5A, 0xCF, 0x4B, 0xCF, 0x53, 0xAC, 0x66, /* 0xCC-0xCF */ + 0xAC, 0x59, 0xAC, 0x61, 0xAC, 0x6D, 0xAC, 0x56, /* 0xD0-0xD3 */ + 0xAC, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCF, 0x43, 0xAC, 0x6A, 0xAC, 0x63, 0xCF, 0x5D, /* 0xD8-0xDB */ + 0xCF, 0x40, 0xAC, 0x6C, 0xAC, 0x67, 0xCF, 0x49, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0x6B, 0xCF, 0x50, /* 0xE0-0xE3 */ + 0xCF, 0x48, 0xAC, 0x64, 0xCF, 0x5C, 0xCF, 0x54, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xAC, 0x5E, 0xCF, 0x62, 0xCF, 0x47, /* 0xE8-0xEB */ + 0xAC, 0x5A, 0xCF, 0x59, 0xCF, 0x4F, 0xAC, 0x5F, /* 0xEC-0xEF */ + 0xCF, 0x55, 0xAC, 0x57, 0xCE, 0xFC, 0xAC, 0x68, /* 0xF0-0xF3 */ + 0xAE, 0xE3, 0xAC, 0x5D, 0xCF, 0x4E, 0xCF, 0x4D, /* 0xF4-0xF7 */ + 0xCF, 0x42, 0x00, 0x00, 0xCF, 0x5E, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCF, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAC, 0x55, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xAE, 0xEA, /* 0x10-0x13 */ + 0xD1, 0xED, 0x00, 0x00, 0xD1, 0xE1, 0xAE, 0xDF, /* 0x14-0x17 */ + 0xAE, 0xEB, 0x00, 0x00, 0xD1, 0xDA, 0x00, 0x00, /* 0x18-0x1B */ + 0xD1, 0xE3, 0xD1, 0xEB, 0x00, 0x00, 0xD1, 0xD9, /* 0x1C-0x1F */ + 0xD1, 0xF4, 0xAE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD1, 0xF3, 0xD1, 0xEE, 0x00, 0x00, /* 0x24-0x27 */ + 0xD1, 0xEF, 0xAE, 0xDD, 0xAE, 0xE8, 0xD1, 0xE5, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0xE6, 0xD1, 0xF0, 0xD1, 0xE7, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD1, 0xE2, 0xD1, 0xDC, 0xD1, 0xDD, /* 0x30-0x33 */ + 0xD1, 0xEA, 0xD1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xAE, 0xD6, 0xAE, 0xDA, 0xD1, 0xF2, 0xD1, 0xDE, /* 0x38-0x3B */ + 0xAE, 0xE6, 0xAE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xAE, 0xE5, 0xAE, 0xEC, 0xAE, 0xDB, 0xAE, 0xE7, /* 0x40-0x43 */ + 0xD1, 0xE9, 0xAE, 0xE9, 0xAE, 0xD8, 0x00, 0x00, /* 0x44-0x47 */ + 0xAE, 0xD7, 0xD1, 0xDB, 0x00, 0x00, 0xD1, 0xDF, /* 0x48-0x4B */ + 0xAE, 0xE0, 0xD1, 0xF1, 0xD1, 0xE8, 0xD1, 0xE0, /* 0x4C-0x4F */ + 0xAE, 0xE4, 0xAE, 0xE1, 0x00, 0x00, 0xAE, 0xD9, /* 0x50-0x53 */ + 0xAE, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC4, /* 0x68-0x6B */ + 0x00, 0x00, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, 0xB9, /* 0x6C-0x6F */ + 0x00, 0x00, 0xD5, 0xC8, 0xD5, 0xC5, 0x00, 0x00, /* 0x70-0x73 */ + 0xD5, 0xBE, 0xD5, 0xBD, 0xB1, 0xED, 0xD5, 0xC1, /* 0x74-0x77 */ + 0xD5, 0xD0, 0xD5, 0xB0, 0x00, 0x00, 0xD5, 0xD1, /* 0x78-0x7B */ + 0xD5, 0xC3, 0xD5, 0xD5, 0xD5, 0xC9, 0xB1, 0xEC, /* 0x7C-0x7F */ + + 0xD5, 0xC7, 0xB1, 0xE7, 0xB1, 0xFC, 0xB1, 0xF2, /* 0x80-0x83 */ + 0x00, 0x00, 0xB1, 0xF6, 0xB1, 0xF5, 0xD5, 0xB1, /* 0x84-0x87 */ + 0x00, 0x00, 0xD5, 0xCE, 0xD5, 0xD4, 0xD5, 0xCC, /* 0x88-0x8B */ + 
0xD5, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC0, /* 0x8C-0x8F */ + 0xD5, 0xB2, 0xD5, 0xD2, 0xD5, 0xC2, 0xB1, 0xEA, /* 0x90-0x93 */ + 0xB1, 0xF7, 0x00, 0x00, 0xD5, 0xCB, 0xB1, 0xF0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCA, /* 0x98-0x9B */ + 0xD5, 0xB3, 0xB1, 0xF8, 0x00, 0x00, 0xB1, 0xFA, /* 0x9C-0x9F */ + 0xD5, 0xCD, 0xB1, 0xFB, 0xB1, 0xE9, 0xD5, 0xBA, /* 0xA0-0xA3 */ + 0xD5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xEF, /* 0xA4-0xA7 */ + 0xB1, 0xF9, 0xD5, 0xBC, 0xD5, 0xC6, 0xD5, 0xB7, /* 0xA8-0xAB */ + 0xD5, 0xBB, 0xB1, 0xF4, 0xD5, 0xB6, 0xB1, 0xE8, /* 0xAC-0xAF */ + 0xB1, 0xF1, 0xB1, 0xEE, 0xD5, 0xBF, 0xAE, 0xDE, /* 0xB0-0xB3 */ + 0xD9, 0xC0, 0xB1, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB1, 0xF3, 0x00, 0x00, 0xD9, 0xC3, 0xD9, 0xD9, /* 0xC4-0xC7 */ + 0xD9, 0xCE, 0xB4, 0xD6, 0x00, 0x00, 0xB4, 0xD1, /* 0xC8-0xCB */ + 0xD9, 0xBD, 0xB4, 0xD2, 0xD9, 0xCD, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD9, 0xC6, 0xD9, 0xD3, 0xB4, 0xCE, 0xD9, 0xAB, /* 0xD0-0xD3 */ + 0xD9, 0xD5, 0xB4, 0xC4, 0xD9, 0xB3, 0xB4, 0xC7, /* 0xD4-0xD7 */ + 0xB4, 0xC6, 0x00, 0x00, 0xB4, 0xD7, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD9, 0xAD, 0xD9, 0xCF, 0xD9, 0xD0, 0xB4, 0xC9, /* 0xDC-0xDF */ + 0xB4, 0xC5, 0xD9, 0xBB, 0x00, 0x00, 0xB4, 0xD0, /* 0xE0-0xE3 */ + 0xD9, 0xB6, 0x00, 0x00, 0xD9, 0xD1, 0xB4, 0xCC, /* 0xE4-0xE7 */ + 0xD9, 0xC9, 0xD9, 0xD6, 0xD9, 0xB0, 0xD9, 0xB5, /* 0xE8-0xEB */ + 0xD9, 0xAF, 0x00, 0x00, 0xB4, 0xCB, 0xD9, 0xC2, /* 0xEC-0xEF */ + 0xDD, 0xDE, 0xD9, 0xB1, 0xB4, 0xCF, 0xD9, 0xBA, /* 0xF0-0xF3 */ + 0xD9, 0xD2, 0xB4, 0xCA, 0xD9, 0xB7, 0xD9, 0xB4, /* 0xF4-0xF7 */ + 0xD9, 0xC5, 0xB4, 0xCD, 0xB4, 0xC3, 0xB4, 0xD9, /* 0xF8-0xFB */ + 0xD9, 0xC8, 0xD9, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_69[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD9, 0xAC, 0xB4, 0xC8, 0xD9, 0xD4, 0xD9, 0xBC, /* 0x04-0x07 */ + 0xD9, 0xBE, 0x00, 0x00, 0xD9, 0xCB, 0xD9, 0xCA, /* 0x08-0x0B */ + 0xD9, 0xAA, 0xB4, 0xD3, 0xB4, 0xD5, 0xD9, 0xB2, /* 0x0C-0x0F */ + 0xD9, 0xB9, 0xD9, 0xC1, 0xB4, 0xD4, 0xD9, 0xB8, /* 0x10-0x13 */ + 0xD9, 0xC4, 0xD9, 0xD7, 0x00, 0x00, 0xD9, 0xCC, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF2, /* 0x2C-0x2F */ + 0xB7, 0xA6, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xDB, /* 0x30-0x33 */ + 0xDD, 0xE0, 0xDD, 0xD9, 0x00, 0x00, 0xDD, 0xEC, /* 0x34-0x37 */ + 0xDD, 0xCB, 0xDD, 0xD2, 0x00, 0x00, 0xDD, 0xEA, /* 0x38-0x3B */ + 0xDD, 0xF4, 0xDD, 0xDC, 0x00, 0x00, 0xDD, 0xCF, /* 0x3C-0x3F */ + 0xDD, 0xE2, 0xDD, 0xE7, 0xDD, 0xD3, 0x00, 0x00, /* 0x40-0x43 */ + 0xDD, 0xE4, 0xDD, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xDD, 0xD7, 0xDD, 0xD8, 0xB7, 0xA8, 0xDD, 0xEB, /* 0x48-0x4B */ + 0xDD, 0xE9, 0x00, 0x00, 0xDD, 0xCC, 0xDD, 0xEE, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDD, 0xEF, 0xDD, 0xF1, 0xB7, 0xAC, /* 0x50-0x53 */ + 0xB7, 0xA4, 0x00, 0x00, 0xD5, 0xB8, 0xDD, 0xD4, /* 0x54-0x57 */ + 0xDD, 0xE6, 0xDD, 0xD5, 0xB7, 0xA1, 0xB7, 0xB1, /* 0x58-0x5B */ + 0xDD, 0xED, 0xB7, 0xAF, 0xB7, 0xAB, 0xDD, 0xCA, /* 0x5C-0x5F */ + 0xB7, 
0xA3, 0x00, 0x00, 0xDD, 0xCD, 0xB7, 0xB0, /* 0x60-0x63 */ + 0x00, 0x00, 0xDD, 0xDD, 0xDD, 0xC9, 0x00, 0x00, /* 0x64-0x67 */ + 0xB7, 0xA9, 0xDD, 0xE1, 0xDD, 0xD1, 0xB7, 0xAA, /* 0x68-0x6B */ + 0xDD, 0xDA, 0xB7, 0x7E, 0xB4, 0xD8, 0xDD, 0xE3, /* 0x6C-0x6F */ + 0xD9, 0xBF, 0xDD, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xDD, 0xE8, 0xB7, 0xA5, 0xDD, 0xE5, 0xB7, 0xA2, /* 0x74-0x77 */ + 0xDD, 0xDF, 0xB7, 0xAD, 0xDD, 0xD6, 0xDD, 0xF3, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xA7, 0xDE, 0xC6, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xAE, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE2, 0x4A, 0xE2, 0x48, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0x5E, 0xE2, 0x46, 0x00, 0x00, 0xE2, 0x58, /* 0x90-0x93 */ + 0xB7, 0x7D, 0xBA, 0x5F, 0xE2, 0x42, 0xE2, 0x5D, /* 0x94-0x97 */ + 0x00, 0x00, 0xE2, 0x47, 0xE2, 0x55, 0xBA, 0x64, /* 0x98-0x9B */ + 0xBA, 0x5D, 0x00, 0x00, 0xE2, 0x5B, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE2, 0x40, 0xE2, 0x5A, 0x00, 0x00, 0xBA, 0x6F, /* 0xA0-0xA3 */ + 0xE2, 0x51, 0xE2, 0x61, 0xBA, 0x6D, 0xE2, 0x49, /* 0xA4-0xA7 */ + 0xBA, 0x5E, 0xE2, 0x4B, 0xE2, 0x59, 0xBA, 0x67, /* 0xA8-0xAB */ + 0xE2, 0x44, 0xBA, 0x6B, 0xBA, 0x61, 0xE2, 0x4D, /* 0xAC-0xAF */ + 0xE2, 0x43, 0xE1, 0xFC, 0x00, 0x00, 0xE2, 0x57, /* 0xB0-0xB3 */ + 0xBA, 0x68, 0xE2, 0x60, 0xE1, 0xFD, 0xBA, 0x65, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, 0xBA, 0x66, /* 0xB8-0xBB */ + 0xE2, 0x45, 0xE2, 0x50, 0xE2, 0x4C, 0xE2, 0x4E, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBA, 0x60, 0xE2, 0x5F, 0xBA, 0x6E, /* 0xC0-0xC3 */ + 0xE2, 0x4F, 0x00, 0x00, 0xE2, 0x62, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE1, 0xFE, 0xE2, 0x54, 0xBA, 0x63, /* 0xC8-0xCB */ + 0xBA, 0x6C, 0xBA, 0x6A, 0xE2, 0x41, 0xE2, 0x56, /* 0xCC-0xCF */ + 0xBA, 0x69, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x62, /* 0xD0-0xD3 */ + 0xE2, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE5, 0xD1, 0xE5, 0xCD, 0xE5, 0xE1, 0xE5, 0xDE, /* 0xE4-0xE7 */ + 0xBC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE5, /* 0xE8-0xEB */ + 0xE5, 0xD4, 0xBC, 0xD8, 0xE5, 0xDB, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE5, 0xD0, 0xE5, 0xDA, 0xBC, 0xD5, /* 0xF0-0xF3 */ + 0xE5, 0xEE, 0x00, 0x00, 0xE5, 0xEB, 0xE5, 0xDD, /* 0xF4-0xF7 */ + 0xE5, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE2, /* 0xF8-0xFB */ + 0xE5, 0xE4, 0xBC, 0xD1, 0xE5, 0xD8, 0xE5, 0xD3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0xE5, 0xCA, 0xBC, 0xCE, 0xBC, 0xD6, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xE7, 0xBC, 0xD7, 0xE5, 0xCB, 0xE5, 0xED, /* 0x04-0x07 */ + 0xE5, 0xE0, 0xE5, 0xE6, 0xBC, 0xD4, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, 0xE5, 0xEA, /* 0x0C-0x0F */ + 0x00, 0x00, 0xBC, 0xD9, 0x00, 0x00, 0xBC, 0xD3, /* 0x10-0x13 */ + 0xE5, 0xDC, 0xE5, 0xCF, 0xE5, 0xEF, 0xE5, 0xCC, /* 0x14-0x17 */ + 0xE5, 0xE8, 0xBC, 0xD0, 0x00, 0x00, 0xE5, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0xE5, 0xD7, 0xBC, 0xCF, 0xBC, 0xCC, /* 0x1C-0x1F */ + 0xE5, 0xD2, 0xBC, 0xD2, 0x00, 0x00, 0xBC, 0xCB, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0xE9, 0xE5, 0xEC, 0xE5, 0xD9, /* 0x24-0x27 */ + 0xE9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0x30-0x33 */ + 0xE9, 
0xBE, 0xBE, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xBE, 0xEB, 0xBE, 0xF0, 0xBE, 0xEC, 0xE9, 0xCC, /* 0x38-0x3B */ + 0xE9, 0xD7, 0xBE, 0xEA, 0xE9, 0xC4, 0xE9, 0xCD, /* 0x3C-0x3F */ + 0xE5, 0xDF, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xBE, 0xF1, 0x00, 0x00, 0xE9, 0xDD, 0xBE, 0xF5, /* 0x44-0x47 */ + 0xBE, 0xF8, 0xE9, 0xC0, 0x00, 0x00, 0xBE, 0xF4, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0xDB, 0xE9, 0xDC, 0xE9, 0xD2, /* 0x4C-0x4F */ + 0xE9, 0xD1, 0xE9, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE9, 0xD3, 0xE9, 0xDA, 0xE9, 0xD9, 0x00, 0x00, /* 0x54-0x57 */ + 0xBE, 0xEF, 0xBE, 0xED, 0xE9, 0xCB, 0xE9, 0xC8, /* 0x58-0x5B */ + 0x00, 0x00, 0xE9, 0xC5, 0xE9, 0xD8, 0xBE, 0xF7, /* 0x5C-0x5F */ + 0xE9, 0xD6, 0xBE, 0xF3, 0xBE, 0xF2, 0x00, 0x00, /* 0x60-0x63 */ + 0xE9, 0xD0, 0x00, 0x00, 0xE9, 0xBF, 0xE9, 0xC1, /* 0x64-0x67 */ + 0xE9, 0xC3, 0xE9, 0xD5, 0xE9, 0xCF, 0xBE, 0xEE, /* 0x68-0x6B */ + 0x00, 0x00, 0xE9, 0xC6, 0x00, 0x00, 0xE9, 0xD4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC7, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xCF, 0xED, 0x45, /* 0x7C-0x7F */ + + 0xC0, 0xC8, 0xEC, 0xF5, 0x00, 0x00, 0xED, 0x41, /* 0x80-0x83 */ + 0xC0, 0xCA, 0xED, 0x48, 0x00, 0x00, 0xEC, 0xFC, /* 0x84-0x87 */ + 0x00, 0x00, 0xEC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xED, 0x49, 0xEC, 0xF3, 0xEC, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */ + 0xC0, 0xD1, 0xED, 0x44, 0xED, 0x4A, 0xEC, 0xFD, /* 0x90-0x93 */ + 0xC0, 0xC9, 0xED, 0x40, 0xEC, 0xF4, 0xC0, 0xD0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x47, 0xEC, 0xF9, /* 0x98-0x9B */ + 0xC0, 0xCC, 0x00, 0x00, 0xEC, 0xFB, 0xEC, 0xF8, /* 0x9C-0x9F */ + 0xC0, 0xD2, 0xEC, 0xFA, 0xC0, 0xCB, 0xC0, 0xCE, /* 0xA0-0xA3 */ + 0xED, 0x43, 0xEC, 0xF6, 0xED, 0x46, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC2, 0x63, 0xEF, 0xE7, 0xC2, 0x68, 0xC2, 0x69, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x62, /* 0xB0-0xB3 */ + 0xEF, 0xE6, 0x00, 0x00, 0xEF, 0xE3, 0xEF, 0xE4, /* 0xB4-0xB7 */ + 0xC2, 0x66, 0xEF, 0xDE, 0xEF, 0xE2, 0xC2, 0x65, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x67, 0xC2, 0x64, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEF, 0xDD, 0xEF, 0xE1, 0xEF, 0xE5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0x51, /* 0xC8-0xCB */ + 0xF2, 0x4E, 0xF2, 0x57, 0x00, 0x00, 0xF2, 0x56, /* 0xCC-0xCF */ + 0xF2, 0x54, 0xF2, 0x4F, 0x00, 0x00, 0xC3, 0x72, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF2, 0x50, 0xC3, 0x71, 0xC0, 0xCD, /* 0xD8-0xDB */ + 0xF2, 0x53, 0xC3, 0x70, 0xF2, 0x58, 0xF2, 0x52, /* 0xDC-0xDF */ + 0xF2, 0x4D, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xC3, 0x6F, 0x00, 0x00, 0xF2, 0x4C, /* 0xE4-0xE7 */ + 0xF4, 0x56, 0x00, 0x00, 0xF4, 0x55, 0xF2, 0x55, /* 0xE8-0xEB */ + 0xC4, 0x68, 0x00, 0x00, 0xF4, 0x59, 0xF4, 0x5A, /* 0xEC-0xEF */ + 0xF4, 0x54, 0xF4, 0x58, 0x00, 0x00, 0xF4, 0x53, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xD1, 0xF4, 0x57, 0xC4, 0xE7, 0xC4, 0xE5, /* 0xF8-0xFB */ + 0xF5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6B[512] = { + 0xF5, 0xD2, 0x00, 0x00, 0xF5, 0xCE, 0xF5, 0xD0, /* 0x00-0x03 */ + 0xC4, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF6, 
0xE5, 0xF6, 0xE6, 0xC5, 0x76, 0xF6, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */ + 0xC5, 0xCF, 0xF7, 0xE0, 0xF7, 0xE1, 0xF8, 0xAC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x56, 0xF8, 0xF3, /* 0x14-0x17 */ + 0xF8, 0xF1, 0xF8, 0xF2, 0xF8, 0xF4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBB, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0xED, 0xA6, 0xB8, 0x00, 0x00, 0xAA, 0x59, /* 0x20-0x23 */ + 0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCF, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD1, 0xF5, 0xD1, 0xF7, 0x00, 0x00, 0xD1, 0xF6, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD1, 0xF8, 0xB1, 0xFD, 0xD5, 0xD7, /* 0x30-0x33 */ + 0xD1, 0xF9, 0x00, 0x00, 0xD5, 0xD6, 0xD5, 0xD8, /* 0x34-0x37 */ + 0xD5, 0xD9, 0xD9, 0xDA, 0xB4, 0xDB, 0xD9, 0xDB, /* 0x38-0x3B */ + 0xD9, 0xDD, 0xB4, 0xDC, 0xB4, 0xDA, 0xD9, 0xDC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDD, 0xFA, 0xDD, 0xF8, 0xDD, 0xF7, /* 0x40-0x43 */ + 0x00, 0x00, 0xDD, 0xF6, 0xDD, 0xF5, 0xB7, 0xB2, /* 0x44-0x47 */ + 0xDD, 0xF9, 0xBA, 0x70, 0xE2, 0x63, 0xE2, 0x65, /* 0x48-0x4B */ + 0xBA, 0x71, 0xE2, 0x64, 0xBC, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xBC, 0xDA, 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE9, 0xDF, 0xE9, 0xDE, 0xE9, 0xE0, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xBE, 0xF9, 0x00, 0x00, 0xED, 0x4B, /* 0x58-0x5B */ + 0xC0, 0xD3, 0x00, 0x00, 0xEF, 0xE8, 0xC2, 0x6A, /* 0x5C-0x5F */ + 0xF2, 0x59, 0xC5, 0x77, 0xA4, 0xEE, 0xA5, 0xBF, /* 0x60-0x63 */ + 0xA6, 0xB9, 0xA8, 0x42, 0xAA, 0x5A, 0xAA, 0x5B, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0x6E, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xD1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xB3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD1, 0xBE, 0xFA, /* 0x74-0x77 */ + 0xC2, 0x6B, 0xA4, 0xEF, 0x00, 0x00, 0xA6, 0xBA, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEB, 0xAA, 0x5C, /* 0x7C-0x7F */ + + 0xCC, 0xEA, 0x00, 0x00, 0xCF, 0x65, 0xAC, 0x6F, /* 0x80-0x83 */ + 0xCF, 0x66, 0x00, 0x00, 0xAC, 0x70, 0x00, 0x00, /* 0x84-0x87 */ + 0xD1, 0xFC, 0xAE, 0xEE, 0xAE, 0xED, 0x00, 0x00, /* 0x88-0x8B */ + 0xD5, 0xDE, 0xD5, 0xDC, 0xD5, 0xDD, 0xD5, 0xDB, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xD9, 0xDE, 0xD9, 0xE1, 0xB4, 0xDE, 0xD9, 0xDF, /* 0x94-0x97 */ + 0xB4, 0xDD, 0xD9, 0xE0, 0x00, 0x00, 0xDD, 0xFB, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x66, 0xE2, 0x67, /* 0x9C-0x9F */ + 0xE2, 0x68, 0x00, 0x00, 0xE5, 0xF3, 0xE5, 0xF2, /* 0xA0-0xA3 */ + 0xBC, 0xDC, 0xE5, 0xF1, 0xE5, 0xF4, 0xE9, 0xE1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0xE9, 0xE3, /* 0xA8-0xAB */ + 0x00, 0x00, 0xED, 0x4C, 0xC0, 0xD4, 0xC2, 0x6C, /* 0xAC-0xAF */ + 0xF2, 0x5A, 0x00, 0x00, 0xC4, 0xE8, 0xC9, 0x5F, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xAC, 0x71, 0xCF, 0x67, 0xAE, 0xEF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0xFE, 0x00, 0x00, /* 0xB8-0xBB */ + 0xB4, 0xDF, 0xD9, 0xE2, 0x00, 0x00, 0xB7, 0xB5, /* 0xBC-0xBF */ + 0xB7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x69, /* 0xC0-0xC3 */ + 0xE2, 0x6A, 0xBC, 0xDD, 0xBC, 0xDE, 0xE9, 0xE5, /* 0xC4-0xC7 */ + 0xE9, 0xE4, 0xEF, 0xE9, 0xF7, 0xE3, 0xA4, 0xF0, /* 0xC8-0xCB */ + 0xC9, 0x60, 0xA5, 0xC0, 0x00, 0x00, 0xA8, 0x43, /* 0xCC-0xCF */ + 0xCB, 0x48, 0x00, 0x00, 0xAC, 0x72, 0xB7, 0xB6, /* 0xD0-0xD3 */ + 0xA4, 0xF1, 0x00, 0x00, 0xCF, 0x68, 0xAC, 0x73, /* 0xD4-0xD7 */ + 0xCF, 0x69, 0x00, 0x00, 0xC0, 0xD5, 0xA4, 0xF2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0xDC-0xDF 
*/ + 0xCF, 0x6A, 0x00, 0x00, 0xD2, 0x42, 0xD2, 0x41, /* 0xE0-0xE3 */ + 0xD1, 0xFE, 0x00, 0x00, 0xD1, 0xFD, 0xD2, 0x43, /* 0xE4-0xE7 */ + 0xD2, 0x40, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x40, /* 0xE8-0xEB */ + 0xB2, 0x41, 0x00, 0x00, 0x00, 0x00, 0xB4, 0xE0, /* 0xEC-0xEF */ + 0xD9, 0xE3, 0x00, 0x00, 0xD9, 0xE4, 0xD9, 0xE5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0x41, /* 0xF4-0xF7 */ + 0xDE, 0x42, 0xDE, 0x40, 0x00, 0x00, 0xDD, 0xFD, /* 0xF8-0xFB */ + 0xDD, 0xFE, 0xB7, 0xB7, 0xE2, 0x6B, 0xE5, 0xF7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6C[512] = { + 0xE5, 0xF6, 0xE5, 0xF5, 0xE5, 0xF8, 0xE9, 0xE7, /* 0x00-0x03 */ + 0xE9, 0xE6, 0xBE, 0xFB, 0xE9, 0xE8, 0x00, 0x00, /* 0x04-0x07 */ + 0xC0, 0xD6, 0xED, 0x4D, 0x00, 0x00, 0xEF, 0xEA, /* 0x08-0x0B */ + 0xF2, 0x5B, 0xF6, 0xE7, 0x00, 0x00, 0xA4, 0xF3, /* 0x0C-0x0F */ + 0xA5, 0xC2, 0xA5, 0xC1, 0x00, 0x00, 0xAA, 0x5D, /* 0x10-0x13 */ + 0xC9, 0x61, 0xC9, 0x7E, 0xA6, 0xBB, 0x00, 0x00, /* 0x14-0x17 */ + 0xC9, 0xF7, 0xCB, 0x49, 0xCB, 0x4A, 0xAA, 0x5E, /* 0x18-0x1B */ + 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, 0xAC, 0x74, /* 0x1C-0x1F */ + 0xCF, 0x6B, 0xCF, 0x6C, 0x00, 0x00, 0xAE, 0xF0, /* 0x20-0x23 */ + 0xAE, 0xF4, 0xD2, 0x44, 0xAE, 0xF3, 0xAE, 0xF1, /* 0x24-0x27 */ + 0xAE, 0xF2, 0x00, 0x00, 0xD5, 0xDF, 0xB2, 0x42, /* 0x28-0x2B */ + 0xB4, 0xE3, 0x00, 0x00, 0xB4, 0xE1, 0xB4, 0xE2, /* 0x2C-0x2F */ + 0xD9, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x72, /* 0x30-0x33 */ + 0xA4, 0xF4, 0x00, 0x00, 0xC9, 0xA1, 0x00, 0x00, /* 0x34-0x37 */ + 0xA5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xC9, 0xA4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xC6, 0xC9, 0xA3, /* 0x3C-0x3F */ + 0xA5, 0xC5, 0xA5, 0xC4, 0xA8, 0x44, 0xC9, 0xA2, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xF8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xFC, 0xC9, 0xFE, /* 0x48-0x4B */ + 0xCA, 0x40, 0xA6, 0xC5, 0xA6, 0xC6, 0xC9, 0xFB, /* 0x4C-0x4F */ + 0xA6, 0xC1, 0x00, 0x00, 0xC9, 0xF9, 0x00, 0x00, /* 0x50-0x53 */ + 0xC9, 0xFD, 0xA6, 0xC2, 0x00, 0x00, 0xA6, 0xBD, /* 0x54-0x57 */ + 0x00, 0x00, 0xA6, 0xBE, 0x00, 0x00, 0xA6, 0xC4, /* 0x58-0x5B */ + 0xC9, 0xFA, 0xA6, 0xBC, 0xA8, 0x45, 0xA6, 0xBF, /* 0x5C-0x5F */ + 0xA6, 0xC0, 0xA6, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xCB, 0x5B, 0xCB, 0x59, 0xCB, 0x4C, /* 0x64-0x67 */ + 0xA8, 0x51, 0xCB, 0x53, 0xA8, 0x4C, 0xCB, 0x4D, /* 0x68-0x6B */ + 0x00, 0x00, 0xCB, 0x55, 0x00, 0x00, 0xCB, 0x52, /* 0x6C-0x6F */ + 0xA8, 0x4F, 0xCB, 0x51, 0xA8, 0x56, 0xCB, 0x5A, /* 0x70-0x73 */ + 0xA8, 0x58, 0x00, 0x00, 0xA8, 0x5A, 0x00, 0x00, /* 0x74-0x77 */ + 0xCB, 0x4B, 0x00, 0x00, 0xA8, 0x4D, 0xCB, 0x5C, /* 0x78-0x7B */ + 0x00, 0x00, 0xA8, 0x54, 0xA8, 0x57, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCD, 0x45, 0xA8, 0x47, 0xA8, 0x5E, 0xA8, 0x55, /* 0x80-0x83 */ + 0xCB, 0x4E, 0xA8, 0x4A, 0xA8, 0x59, 0xCB, 0x56, /* 0x84-0x87 */ + 0xA8, 0x48, 0xA8, 0x49, 0xCD, 0x43, 0xCB, 0x4F, /* 0x88-0x8B */ + 0xA8, 0x50, 0xA8, 0x5B, 0xCB, 0x5D, 0xCB, 0x50, /* 0x8C-0x8F */ + 0xA8, 0x4E, 0x00, 0x00, 0xA8, 0x53, 0xCC, 0xEE, /* 0x90-0x93 */ + 0xA8, 0x5C, 0xCB, 0x57, 0xA8, 0x52, 0x00, 0x00, /* 0x94-0x97 */ + 0xA8, 0x5D, 0xA8, 0x46, 0xCB, 0x54, 0xA8, 0x4B, /* 0x98-0x9B */ + 0xCB, 0x58, 0xCD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0x6A, /* 0xA8-0xAB */ + 0xAA, 0x7A, 0xCC, 0xF5, 0xAA, 0x71, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCD, 0x4B, 0xAA, 0x62, 0x00, 0x00, 0xAA, 0x65, /* 0xB0-0xB3 */ + 
0xCD, 0x42, 0x00, 0x00, 0xCC, 0xF3, 0xCC, 0xF7, /* 0xB4-0xB7 */ + 0xAA, 0x6D, 0xAA, 0x6F, 0xCC, 0xFA, 0xAA, 0x76, /* 0xB8-0xBB */ + 0xAA, 0x68, 0xAA, 0x66, 0xAA, 0x67, 0xAA, 0x75, /* 0xBC-0xBF */ + 0xCD, 0x47, 0xAA, 0x70, 0xCC, 0xF9, 0xCC, 0xFB, /* 0xC0-0xC3 */ + 0xAA, 0x6E, 0xAA, 0x73, 0xCC, 0xFC, 0xCD, 0x4A, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xAC, 0x75, 0xAA, 0x79, 0x00, 0x00, /* 0xC8-0xCB */ + 0xAA, 0x63, 0xCD, 0x49, 0x00, 0x00, 0xCD, 0x4D, /* 0xCC-0xCF */ + 0xCC, 0xF8, 0xCD, 0x4F, 0xCD, 0x40, 0xAA, 0x6C, /* 0xD0-0xD3 */ + 0xCC, 0xF4, 0xAA, 0x6B, 0xAA, 0x7D, 0xAA, 0x72, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCC, 0xF2, 0xCF, 0x75, 0xAA, 0x78, /* 0xD8-0xDB */ + 0xAA, 0x7C, 0xCD, 0x41, 0xCD, 0x46, 0x00, 0x00, /* 0xDC-0xDF */ + 0xAA, 0x7E, 0xAA, 0x77, 0xAA, 0x69, 0xAA, 0x5F, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0x64, 0x00, 0x00, 0xCC, 0xF6, /* 0xE4-0xE7 */ + 0xAA, 0x60, 0xCD, 0x4E, 0x00, 0x00, 0xCC, 0xF0, /* 0xE8-0xEB */ + 0xCC, 0xEF, 0xCC, 0xFD, 0xCC, 0xF1, 0xAA, 0x7B, /* 0xEC-0xEF */ + 0xAE, 0xF5, 0xAA, 0x74, 0xCC, 0xFE, 0xAA, 0x61, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xAC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCD, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6D[512] = { + 0xCF, 0x7C, 0xCF, 0xA1, 0x00, 0x00, 0xCF, 0xA4, /* 0x00-0x03 */ + 0xCF, 0x77, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA7, /* 0x04-0x07 */ + 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0x74, 0xAC, 0x76, /* 0x08-0x0B */ + 0xAC, 0x7B, 0xD2, 0x49, 0xAC, 0xAD, 0xCF, 0xA5, /* 0x0C-0x0F */ + 0xCF, 0xAD, 0xCF, 0x7B, 0xCF, 0x73, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0x64, 0xAC, 0x7E, /* 0x14-0x17 */ + 0xCF, 0xA2, 0xCF, 0x78, 0xCF, 0x7A, 0xAC, 0xA5, /* 0x18-0x1B */ + 0x00, 0x00, 0xCF, 0x7D, 0xAC, 0x7D, 0xCF, 0x70, /* 0x1C-0x1F */ + 0xCF, 0xA8, 0x00, 0x00, 0xCF, 0xAB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xAC, 0x7A, 0x00, 0x00, 0xAC, 0xA8, /* 0x24-0x27 */ + 0xCF, 0x6D, 0xAC, 0xAA, 0xAC, 0x78, 0xAC, 0xAE, /* 0x28-0x2B */ + 0xCF, 0xA9, 0xCF, 0x6F, 0xAC, 0xAB, 0xD2, 0x5E, /* 0x2C-0x2F */ + 0xCD, 0x48, 0xAC, 0x7C, 0xAC, 0x77, 0xCF, 0x76, /* 0x30-0x33 */ + 0xCF, 0x6E, 0xAC, 0xAC, 0xAC, 0xA4, 0xCF, 0xA3, /* 0x34-0x37 */ + 0xAC, 0xA9, 0xAC, 0xA7, 0xCF, 0x79, 0xAC, 0xA1, /* 0x38-0x3B */ + 0xCF, 0x71, 0xAC, 0xA2, 0xAC, 0xA3, 0xCF, 0x72, /* 0x3C-0x3F */ + 0xCF, 0xA6, 0xAC, 0x79, 0xCF, 0x7E, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD2, 0x4C, 0xAE, 0xFD, 0xAF, 0x43, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0x55, 0xD2, 0x5B, /* 0x5C-0x5F */ + 0xD2, 0x57, 0xD2, 0x4A, 0xD2, 0x4D, 0xD2, 0x46, /* 0x60-0x63 */ + 0xD2, 0x47, 0xAF, 0x4A, 0xAE, 0xFA, 0xD2, 0x56, /* 0x64-0x67 */ + 0xD2, 0x5F, 0xAF, 0x45, 0xAE, 0xF6, 0x00, 0x00, /* 0x68-0x6B */ + 0xAF, 0x40, 0xD2, 0x4E, 0xAF, 0x42, 0xD2, 0x4F, /* 0x6C-0x6F */ + 0xD2, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xAF, 0x44, 0xD2, 0x68, 0xD2, 0x48, 0xAE, 0xFC, /* 0x74-0x77 */ + 0xAE, 0xFB, 0xAF, 0x48, 0xD2, 0x45, 0xD2, 0x66, /* 0x78-0x7B */ + 0xD2, 0x5A, 0xD2, 0x67, 0xD2, 0x61, 0xD2, 0x53, /* 0x7C-0x7F */ + + 0xD2, 0x62, 0x00, 0x00, 0xD2, 0x5C, 0xD2, 0x65, /* 0x80-0x83 */ + 0xD2, 0x63, 0xAF, 0x49, 0xD2, 0x54, 0xAE, 0xF9, /* 0x84-0x87 */ + 0xAE, 0xF8, 0xAF, 0x41, 0xAF, 0x47, 0xD2, 0x60, /* 0x88-0x8B */ + 
0xAF, 0x46, 0xD2, 0x51, 0xB2, 0x43, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD2, 0x69, 0xD2, 0x50, 0xD2, 0x4B, 0xAE, 0xFE, /* 0x90-0x93 */ + 0xAF, 0x4B, 0xAE, 0xF7, 0x00, 0x00, 0xD2, 0x58, /* 0x94-0x97 */ + 0xD2, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x65, 0xD5, 0xE1, /* 0xA8-0xAB */ + 0xD5, 0xE5, 0x00, 0x00, 0xB2, 0x52, 0xB2, 0x50, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x47, 0xD5, 0xE3, /* 0xB0-0xB3 */ + 0xD5, 0xE2, 0xB2, 0x5B, 0x00, 0x00, 0xD5, 0xE8, /* 0xB4-0xB7 */ + 0xB2, 0x55, 0x00, 0x00, 0xD5, 0xFA, 0xD6, 0x47, /* 0xB8-0xBB */ + 0xB2, 0x44, 0xD5, 0xF7, 0xD5, 0xF0, 0xB2, 0x67, /* 0xBC-0xBF */ + 0xD5, 0xE0, 0x00, 0x00, 0xD5, 0xFC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB2, 0x64, 0xB2, 0x58, 0xB2, 0x63, 0xB2, 0x4E, /* 0xC4-0xC7 */ + 0xD5, 0xEC, 0xD5, 0xFE, 0xD5, 0xF6, 0xB2, 0x4F, /* 0xC8-0xCB */ + 0xB2, 0x49, 0xD6, 0x45, 0x00, 0x00, 0xD5, 0xFD, /* 0xCC-0xCF */ + 0xD6, 0x40, 0xB2, 0x51, 0xB2, 0x59, 0xD6, 0x42, /* 0xD0-0xD3 */ + 0xD5, 0xEA, 0xD5, 0xFB, 0xD5, 0xEF, 0xD6, 0x44, /* 0xD4-0xD7 */ + 0xB2, 0x5E, 0xB2, 0x46, 0xB2, 0x5C, 0xD5, 0xF4, /* 0xD8-0xDB */ + 0xD5, 0xF2, 0xD5, 0xF3, 0xB2, 0x53, 0xD5, 0xEE, /* 0xDC-0xDF */ + 0xD5, 0xED, 0xB2, 0x48, 0xD5, 0xE7, 0xD6, 0x46, /* 0xE0-0xE3 */ + 0xB2, 0x4A, 0xD5, 0xF1, 0xB2, 0x68, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB2, 0x62, 0xD5, 0xE6, 0xB2, 0x5F, 0xB2, 0x5D, /* 0xE8-0xEB */ + 0xB2, 0x66, 0xD5, 0xF8, 0xB2, 0x61, 0xD2, 0x52, /* 0xEC-0xEF */ + 0xD5, 0xF9, 0xB2, 0x60, 0xD6, 0x41, 0xB2, 0x45, /* 0xF0-0xF3 */ + 0xD5, 0xF5, 0xB2, 0x57, 0xD5, 0xE9, 0xB2, 0x56, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB2, 0x54, 0xB2, 0x4C, 0xB2, 0x4B, /* 0xF8-0xFB */ + 0xD9, 0xE7, 0xD6, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0xD5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFC, /* 0x00-0x03 */ + 0x00, 0x00, 0xB2, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xB5, 0x41, 0xB2, 0x5A, 0xB4, 0xEE, /* 0x18-0x1B */ + 0xD9, 0xF6, 0xB4, 0xFC, 0x00, 0x00, 0xD9, 0xEA, /* 0x1C-0x1F */ + 0xB4, 0xEB, 0xB4, 0xE7, 0xDA, 0x49, 0xB4, 0xED, /* 0x20-0x23 */ + 0xB4, 0xF1, 0xB4, 0xEC, 0xB4, 0xF5, 0xDA, 0x4D, /* 0x24-0x27 */ + 0xDA, 0x44, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF1, /* 0x28-0x2B */ + 0xB4, 0xFA, 0xB4, 0xF4, 0xD9, 0xFD, 0xB4, 0xE4, /* 0x2C-0x2F */ + 0xDA, 0x4A, 0xDA, 0x43, 0xB4, 0xE8, 0xD9, 0xF7, /* 0x30-0x33 */ + 0xB4, 0xF7, 0xDA, 0x55, 0xDA, 0x56, 0x00, 0x00, /* 0x34-0x37 */ + 0xB4, 0xE5, 0xDA, 0x48, 0xB4, 0xF9, 0xD9, 0xFB, /* 0x38-0x3B */ + 0xD9, 0xED, 0xD9, 0xEE, 0xB4, 0xFD, 0xD9, 0xF2, /* 0x3C-0x3F */ + 0xD9, 0xF9, 0xD9, 0xF3, 0x00, 0x00, 0xB4, 0xFB, /* 0x40-0x43 */ + 0xB5, 0x44, 0xD9, 0xEF, 0xD9, 0xE8, 0xD9, 0xE9, /* 0x44-0x47 */ + 0x00, 0x00, 0xD9, 0xEB, 0xB4, 0xEA, 0xD9, 0xF8, /* 0x48-0x4B */ + 0x00, 0x00, 0xB4, 0xF8, 0xB5, 0x42, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD9, 0xFA, 0xDA, 0x53, 0xDA, 0x4B, /* 0x50-0x53 */ + 0xB4, 0xE6, 0xDA, 0x51, 0xB4, 0xF2, 0x00, 0x00, /* 0x54-0x57 */ + 0xB4, 0xF0, 0x00, 0x00, 0xDA, 0x57, 0xB4, 0xEF, /* 0x58-0x5B */ + 0xDA, 0x41, 0xD9, 0xF4, 0xD9, 0xFE, 0xB5, 0x47, /* 0x5C-0x5F */ + 0xDA, 
0x45, 0xDA, 0x42, 0xD9, 0xF0, 0xB5, 0x43, /* 0x60-0x63 */ + 0xDA, 0x4F, 0xDA, 0x4C, 0xDA, 0x54, 0xB4, 0xE9, /* 0x64-0x67 */ + 0xDA, 0x40, 0xB5, 0x46, 0x00, 0x00, 0xDA, 0x47, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0xF3, 0xB4, 0xF6, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDA, 0x46, 0xB5, 0x45, 0xD9, 0xF5, /* 0x70-0x73 */ + 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDA, 0x50, /* 0x74-0x77 */ + 0xDA, 0x4E, 0xDA, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD9, 0xEC, 0xB5, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDE, 0x61, 0xDE, 0x60, 0xDE, 0x46, /* 0x8C-0x8F */ + 0xB7, 0xBD, 0x00, 0x00, 0xDE, 0x5F, 0xDE, 0x49, /* 0x90-0x93 */ + 0xDE, 0x4A, 0x00, 0x00, 0xB7, 0xC7, 0xDE, 0x68, /* 0x94-0x97 */ + 0xB7, 0xC2, 0xDE, 0x5E, 0x00, 0x00, 0xDE, 0x43, /* 0x98-0x9B */ + 0xB7, 0xC8, 0xB7, 0xBE, 0xDE, 0x52, 0xDE, 0x48, /* 0x9C-0x9F */ + 0xDE, 0x4B, 0xDE, 0x63, 0xB7, 0xB8, 0xDE, 0x6A, /* 0xA0-0xA3 */ + 0xDE, 0x62, 0xB7, 0xC1, 0xDE, 0x57, 0xB7, 0xCC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xCB, 0xB7, 0xC5, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0x69, 0xB7, 0xB9, /* 0xAC-0xAF */ + 0xDE, 0x55, 0xDE, 0x4C, 0xDE, 0x59, 0xDE, 0x65, /* 0xB0-0xB3 */ + 0xB7, 0xCD, 0x00, 0x00, 0xB7, 0xBB, 0xDE, 0x54, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDE, 0x4D, 0xB7, 0xC4, 0x00, 0x00, /* 0xB8-0xBB */ + 0xB7, 0xC3, 0xDE, 0x50, 0xDE, 0x5A, 0xDE, 0x64, /* 0xBC-0xBF */ + 0xDE, 0x47, 0xDE, 0x51, 0xB7, 0xBC, 0xDE, 0x5B, /* 0xC0-0xC3 */ + 0xB7, 0xC9, 0xB7, 0xC0, 0xDE, 0x4E, 0xB7, 0xBF, /* 0xC4-0xC7 */ + 0xDE, 0x45, 0xDE, 0x53, 0xDE, 0x67, 0xB4, 0xFE, /* 0xC8-0xCB */ + 0xBA, 0xB0, 0xDE, 0x56, 0xE2, 0x6C, 0xDE, 0x58, /* 0xCC-0xCF */ + 0xDE, 0x66, 0xB7, 0xC6, 0xDE, 0x4F, 0xB7, 0xBA, /* 0xD0-0xD3 */ + 0xB7, 0xCA, 0xBC, 0xF0, 0xDE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDE, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDE, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xAA, /* 0xE8-0xEB */ + 0xBA, 0xAD, 0xE2, 0x7D, 0xE2, 0xA4, 0xBA, 0xA2, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE2, 0x6E, 0xBA, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xBA, 0x77, 0xE2, 0x6D, 0xE2, 0xB0, 0xBA, 0xB1, /* 0xF4-0xF7 */ + 0xE2, 0x71, 0xE2, 0xA3, 0x00, 0x00, 0xE2, 0x73, /* 0xF8-0xFB */ + 0xE2, 0xB3, 0xE2, 0xAF, 0xBA, 0x75, 0xBA, 0xA1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0xE6, 0x53, 0xBA, 0xAE, 0xBA, 0x7D, 0xE2, 0x6F, /* 0x00-0x03 */ + 0x00, 0x00, 0xE2, 0xAE, 0xBA, 0xA3, 0xE2, 0xAB, /* 0x04-0x07 */ + 0xE2, 0xB8, 0xE2, 0x75, 0xE2, 0x7E, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0xB6, 0xE2, 0xAC, 0xBA, 0x7C, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x7C, 0xBA, 0x76, /* 0x10-0x13 */ + 0xBA, 0x74, 0xBA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE2, 0x7A, 0xE2, 0x77, 0xE2, 0x78, 0x00, 0x00, /* 0x18-0x1B */ + 0xE2, 0xB2, 0x00, 0x00, 0xE2, 0xB7, 0xE2, 0xB5, /* 0x1C-0x1F */ + 0xBA, 0x7A, 0xE2, 0xB9, 0xBA, 0x7E, 0xBA, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0xE2, 0x70, 0xE5, 0xFA, 0xE2, 0x79, /* 0x24-0x27 */ + 0x00, 0x00, 0xBA, 0x78, 0xBA, 0xAC, 0xBA, 0xA9, /* 0x28-0x2B */ + 0xBA, 0x7B, 0xE2, 0xA5, 0xE2, 0x74, 0xBA, 0xAA, /* 0x2C-0x2F */ + 0xE2, 0xA7, 0xBA, 0xA4, 0xBA, 0xA6, 0xBA, 0x73, /* 0x30-0x33 */ + 0x00, 
0x00, 0xE2, 0xA9, 0xE2, 0xA1, 0xE2, 0x72, /* 0x34-0x37 */ + 0xBA, 0xA5, 0xE2, 0xB1, 0xE2, 0xB4, 0xE2, 0x7B, /* 0x38-0x3B */ + 0xE2, 0xA8, 0x00, 0x00, 0xBA, 0x79, 0xBC, 0xDF, /* 0x3C-0x3F */ + 0xE2, 0xA6, 0xE5, 0xF9, 0x00, 0x00, 0xE2, 0xAD, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x76, 0xE6, 0x44, /* 0x4C-0x4F */ + 0xE6, 0x4E, 0xBC, 0xE2, 0xE6, 0x4D, 0xE6, 0x59, /* 0x50-0x53 */ + 0xBC, 0xE4, 0xE6, 0x4B, 0x00, 0x00, 0xE6, 0x4F, /* 0x54-0x57 */ + 0xBC, 0xEF, 0x00, 0x00, 0xE6, 0x46, 0xBC, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0x52, 0xE9, 0xF0, 0xBC, 0xF3, /* 0x5C-0x5F */ + 0xBC, 0xF2, 0xE6, 0x54, 0xE6, 0x43, 0xE6, 0x5E, /* 0x60-0x63 */ + 0xBC, 0xED, 0x00, 0x00, 0xBC, 0xE3, 0xE6, 0x57, /* 0x64-0x67 */ + 0x00, 0x00, 0xE6, 0x5B, 0xE6, 0x60, 0xE6, 0x55, /* 0x68-0x6B */ + 0xE6, 0x49, 0xBC, 0xE6, 0xBC, 0xE9, 0xBC, 0xF1, /* 0x6C-0x6F */ + 0xBC, 0xEC, 0x00, 0x00, 0xE6, 0x4C, 0xE2, 0xA2, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x48, 0xE6, 0x5F, /* 0x74-0x77 */ + 0xBC, 0xE8, 0x00, 0x00, 0xBC, 0xEB, 0xE6, 0x61, /* 0x78-0x7B */ + 0xBC, 0xE0, 0xE6, 0x56, 0xE5, 0xFB, 0xE6, 0x5C, /* 0x7C-0x7F */ + + 0xC0, 0xDF, 0x00, 0x00, 0xE6, 0x4A, 0x00, 0x00, /* 0x80-0x83 */ + 0xBC, 0xE1, 0xE6, 0x45, 0xBC, 0xE5, 0xE5, 0xFC, /* 0x84-0x87 */ + 0xBA, 0xAB, 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x5A, /* 0x88-0x8B */ + 0xE6, 0x42, 0xE6, 0x40, 0xBC, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE6, 0x58, 0x00, 0x00, 0xE5, 0xFE, 0xE6, 0x51, /* 0x90-0x93 */ + 0xE6, 0x50, 0xE6, 0x5D, 0xE6, 0x47, 0xBC, 0xEE, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0x00, 0x00, /* 0x9C-0x9F */ + 0xBF, 0x49, 0xBE, 0xFE, 0xEA, 0x40, 0xE9, 0xEB, /* 0xA0-0xA3 */ + 0xBF, 0x41, 0xE9, 0xF7, 0xBF, 0x48, 0xBF, 0x43, /* 0xA4-0xA7 */ + 0xE9, 0xF5, 0xED, 0x4F, 0xE9, 0xFB, 0xEA, 0x42, /* 0xA8-0xAB */ + 0xE9, 0xFA, 0xE9, 0xE9, 0xE9, 0xF8, 0xEA, 0x44, /* 0xAC-0xAF */ + 0xEA, 0x46, 0xBE, 0xFD, 0xEA, 0x45, 0xBF, 0x44, /* 0xB0-0xB3 */ + 0xBF, 0x4A, 0x00, 0x00, 0xBF, 0x47, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE9, 0xFE, 0xBF, 0x46, 0xE9, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0xED, 0xE9, 0xF2, 0x00, 0x00, 0xE9, 0xFD, /* 0xBC-0xBF */ + 0xBF, 0x45, 0xBF, 0x42, 0xBE, 0xFC, 0xBF, 0x40, /* 0xC0-0xC3 */ + 0xE9, 0xF1, 0x00, 0x00, 0xE5, 0xFD, 0xE9, 0xEC, /* 0xC4-0xC7 */ + 0xE9, 0xEF, 0xEA, 0x41, 0xE9, 0xF4, 0xE9, 0xEA, /* 0xC8-0xCB */ + 0xED, 0x4E, 0xEA, 0x43, 0xE9, 0xEE, 0xE9, 0xFC, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xED, 0x51, 0xC0, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC0, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xDB, /* 0xD8-0xDB */ + 0xED, 0x53, 0xED, 0x59, 0xED, 0x57, 0xC0, 0xD9, /* 0xDC-0xDF */ + 0xC0, 0xDA, 0xC0, 0xE1, 0xED, 0x5A, 0xED, 0x52, /* 0xE0-0xE3 */ + 0xC0, 0xDC, 0x00, 0x00, 0xED, 0x56, 0xED, 0x55, /* 0xE4-0xE7 */ + 0xED, 0x5B, 0xC0, 0xE2, 0x00, 0x00, 0xC0, 0xDD, /* 0xE8-0xEB */ + 0xC0, 0xE0, 0xED, 0x54, 0xC0, 0xE4, 0xC0, 0xDE, /* 0xEC-0xEF */ + 0xC0, 0xE5, 0xC0, 0xD8, 0xED, 0x58, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xED, 0x50, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF7, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x71, 0xEF, 0xF4, /* 0xF8-0xFB */ + 0xEF, 0xF6, 0x00, 0x00, 0xC2, 0x6F, 0xEF, 0xF2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0xEF, 0xF3, 0xEF, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE9, 0xF6, 0xEF, 0xEF, 0xC2, 0x70, 0xEF, 0xEB, /* 0x04-0x07 */ + 0x00, 
0x00, 0xC2, 0x6D, 0xEF, 0xF8, 0xC2, 0x6E, /* 0x08-0x0B */ + 0xEF, 0xEC, 0xEF, 0xED, 0xEF, 0xF1, 0xC2, 0x73, /* 0x0C-0x0F */ + 0x00, 0x00, 0xC2, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xEF, 0xF0, 0xC3, 0x78, 0xF2, 0x5F, 0xF2, 0x65, /* 0x14-0x17 */ + 0xC3, 0x79, 0xF2, 0x5C, 0xC3, 0x76, 0xC3, 0x73, /* 0x18-0x1B */ + 0xF2, 0x67, 0xC3, 0x77, 0x00, 0x00, 0xC3, 0x74, /* 0x1C-0x1F */ + 0xF2, 0x5E, 0xF2, 0x61, 0xF2, 0x62, 0xF2, 0x63, /* 0x20-0x23 */ + 0xF2, 0x66, 0x00, 0x00, 0xEF, 0xF5, 0xF2, 0x5D, /* 0x24-0x27 */ + 0xC3, 0x75, 0xF2, 0x64, 0xF2, 0x68, 0xF2, 0x60, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x5D, /* 0x2C-0x2F */ + 0xC4, 0x6A, 0xF4, 0x60, 0xC4, 0x6B, 0xF4, 0x68, /* 0x30-0x33 */ + 0xF4, 0x5F, 0xF4, 0x5C, 0x00, 0x00, 0xF4, 0x5E, /* 0x34-0x37 */ + 0xF4, 0x62, 0xF4, 0x65, 0xF4, 0x64, 0xF4, 0x67, /* 0x38-0x3B */ + 0xF4, 0x5B, 0x00, 0x00, 0xC4, 0x69, 0xF4, 0x63, /* 0x3C-0x3F */ + 0xF4, 0x66, 0xF4, 0x69, 0xF4, 0x61, 0xF5, 0xD3, /* 0x40-0x43 */ + 0xF5, 0xD4, 0xF5, 0xD8, 0xF5, 0xD9, 0x00, 0x00, /* 0x44-0x47 */ + 0xF5, 0xD6, 0xF5, 0xD7, 0xF5, 0xD5, 0x00, 0x00, /* 0x48-0x4B */ + 0xC4, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC5, 0x78, 0xF6, 0xEB, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF6, 0xE8, 0xF6, 0xE9, 0xF6, 0xEA, /* 0x54-0x57 */ + 0xC5, 0x79, 0x00, 0x00, 0xF7, 0xE5, 0xF7, 0xE4, /* 0x58-0x5B */ + 0x00, 0x00, 0xF8, 0xAF, 0xC5, 0xF4, 0xF8, 0xAD, /* 0x5C-0x5F */ + 0xF8, 0xB0, 0xF8, 0xAE, 0xF8, 0xF5, 0xC6, 0x57, /* 0x60-0x63 */ + 0xC6, 0x65, 0xF9, 0xA3, 0xF9, 0x6C, 0x00, 0x00, /* 0x64-0x67 */ + 0xF9, 0xA2, 0xF9, 0xD0, 0xF9, 0xD1, 0xA4, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA6, 0xC7, 0xCA, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCB, 0x5E, 0x00, 0x00, 0xA8, 0x5F, 0x00, 0x00, /* 0x74-0x77 */ + 0xA8, 0x62, 0x00, 0x00, 0xCB, 0x5F, 0x00, 0x00, /* 0x78-0x7B */ + 0xA8, 0x60, 0xA8, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xCD, 0x58, 0xCD, 0x5A, /* 0x80-0x83 */ + 0xCD, 0x55, 0xCD, 0x52, 0xCD, 0x54, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xA4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xA2, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCD, 0x56, 0xAA, 0xA3, 0xCD, 0x53, /* 0x90-0x93 */ + 0xCD, 0x50, 0xAA, 0xA1, 0xCD, 0x57, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0x51, 0xAA, 0xA5, 0xCD, 0x59, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xAC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCF, 0xB6, 0x00, 0x00, 0xAC, 0xAF, /* 0xA8-0xAB */ + 0xAC, 0xB2, 0xAC, 0xB4, 0xAC, 0xB6, 0xAC, 0xB3, /* 0xAC-0xAF */ + 0xCF, 0xB2, 0xCF, 0xB1, 0x00, 0x00, 0xAC, 0xB1, /* 0xB0-0xB3 */ + 0xCF, 0xB4, 0xCF, 0xB5, 0x00, 0x00, 0xCF, 0xAE, /* 0xB4-0xB7 */ + 0xAC, 0xB5, 0x00, 0x00, 0xAC, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, /* 0xC4-0xC7 */ + 0xAF, 0x50, 0x00, 0x00, 0xAF, 0x4C, 0xD2, 0x6E, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD2, 0x76, 0xD2, 0x7B, 0xAF, 0x51, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD2, 0x6C, 0xD2, 0x72, 0xD2, 0x6B, /* 0xD0-0xD3 */ + 0xD2, 0x75, 0x00, 0x00, 0x00, 0x00, 0xD2, 0x71, /* 0xD4-0xD7 */ + 0xAF, 0x4D, 0xAF, 0x4F, 0xD2, 0x7A, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD2, 0x6A, 0xD2, 0x6D, 0xD2, 0x73, 0x00, 0x00, /* 0xDC-0xDF 
*/ + 0xD2, 0x74, 0xD2, 0x7C, 0xD2, 0x70, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xAF, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x6D, /* 0xEC-0xEF */ + 0xD6, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x50, /* 0xF0-0xF3 */ + 0xD6, 0x4C, 0x00, 0x00, 0xD6, 0x58, 0xD6, 0x4A, /* 0xF4-0xF7 */ + 0xD6, 0x57, 0xB2, 0x69, 0xD6, 0x48, 0xDA, 0x5B, /* 0xF8-0xFB */ + 0xD6, 0x52, 0xB2, 0x6C, 0x00, 0x00, 0xD6, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0xD6, 0x56, 0x00, 0x00, 0xD6, 0x5A, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0x4F, 0x00, 0x00, 0xD6, 0x54, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xB2, 0x6A, 0xB2, 0x6B, 0xD6, 0x59, /* 0x08-0x0B */ + 0xD6, 0x4D, 0xD6, 0x49, 0xD6, 0x5B, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD6, 0x51, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x55, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x4B, /* 0x14-0x17 */ + 0x00, 0x00, 0xB5, 0x48, 0xB5, 0x49, 0xDA, 0x65, /* 0x18-0x1B */ + 0xB5, 0x4F, 0x00, 0x00, 0xDA, 0x59, 0xDA, 0x62, /* 0x1C-0x1F */ + 0xDA, 0x58, 0xB5, 0x4C, 0xDA, 0x60, 0xDA, 0x5E, /* 0x20-0x23 */ + 0x00, 0x00, 0xDA, 0x5F, 0xB5, 0x4A, 0x00, 0x00, /* 0x24-0x27 */ + 0xDA, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x5C, 0xDA, 0x5A, /* 0x2C-0x2F */ + 0xB5, 0x4B, 0xDA, 0x5D, 0xDA, 0x61, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x4D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x64, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDE, 0x70, 0xDE, 0x77, 0xDE, 0x79, /* 0x40-0x43 */ + 0xDE, 0xA1, 0x00, 0x00, 0xB7, 0xDA, 0xDE, 0x6B, /* 0x44-0x47 */ + 0x00, 0x00, 0xB7, 0xD2, 0x00, 0x00, 0xDE, 0x7A, /* 0x48-0x4B */ + 0xB7, 0xD7, 0xDE, 0xA2, 0xB7, 0xCE, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDE, 0x7D, 0x00, 0x00, 0xDE, 0x6D, 0xDE, 0x7E, /* 0x50-0x53 */ + 0xDE, 0x6C, 0x00, 0x00, 0xB7, 0xDC, 0x00, 0x00, /* 0x54-0x57 */ + 0xDE, 0x78, 0xB7, 0xCF, 0xDE, 0xA3, 0x00, 0x00, /* 0x58-0x5B */ + 0xB7, 0xD4, 0xDE, 0x71, 0xB7, 0xD9, 0xDE, 0x7C, /* 0x5C-0x5F */ + 0xDE, 0x6F, 0xDE, 0x76, 0xDE, 0x72, 0xDE, 0x6E, /* 0x60-0x63 */ + 0xB7, 0xD1, 0xB7, 0xD8, 0xB7, 0xD6, 0xB7, 0xD3, /* 0x64-0x67 */ + 0xB7, 0xDB, 0xB7, 0xD0, 0xDE, 0x75, 0x00, 0x00, /* 0x68-0x6B */ + 0xB7, 0xD5, 0x00, 0x00, 0xB5, 0x4E, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDE, 0x7B, 0x00, 0x00, 0xDE, 0x73, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xDE, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC1, /* 0x78-0x7B */ + 0x00, 0x00, 0xBA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE2, 0xBD, 0xE2, 0xC3, 0xE2, 0xBF, 0x00, 0x00, /* 0x80-0x83 */ + 0xBA, 0xB6, 0xE2, 0xBE, 0xE2, 0xC2, 0xE2, 0xBA, /* 0x84-0x87 */ + 0x00, 0x00, 0xE2, 0xBC, 0xBA, 0xB5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, /* 0x8C-0x8F */ + 0xE2, 0xBB, 0x00, 0x00, 0xBA, 0xB7, 0x00, 0x00, /* 0x90-0x93 */ + 0xBA, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC4, /* 0x94-0x97 */ + 0x00, 0x00, 0xBA, 0xB3, 0xE6, 0x67, 0xE6, 0x64, /* 0x98-0x9B */ + 0xE6, 0x70, 0xE6, 0x6A, 0xE6, 0x6C, 0xBC, 0xF4, /* 0x9C-0x9F */ + 0xE6, 0x66, 0xE6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE6, 0x6D, 0xE6, 0x6B, 0x00, 0x00, 0xE6, 0x71, /* 0xA4-0xA7 */ + 0xBC, 0xF7, 0xE6, 0x68, 0xE6, 0x6F, 0x00, 0x00, /* 0xA8-0xAB */ + 0xBC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x63, /* 0xAC-0xAF */ + 0xE6, 0x65, 0xBC, 0xF6, 0xE6, 0x62, 0xE6, 0x72, /* 0xB0-0xB3 */ + 
0x00, 0x00, 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x4A, 0xBF, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEA, 0x55, 0xEA, 0x53, 0xBF, 0x4B, 0xEA, 0x49, /* 0xBC-0xBF */ + 0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x48, 0xBF, 0x55, /* 0xC0-0xC3 */ + 0xBF, 0x56, 0xEA, 0x47, 0xEA, 0x56, 0xEA, 0x51, /* 0xC4-0xC7 */ + 0xBF, 0x4F, 0xBF, 0x4C, 0xEA, 0x50, 0xEA, 0x4E, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0x52, 0xEA, 0x52, /* 0xCC-0xCF */ + 0xBF, 0x4D, 0x00, 0x00, 0xBF, 0x4E, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0x4F, 0xBF, 0x50, 0xEA, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEA, 0x54, 0xBF, 0x53, 0xEA, 0x57, 0xEA, 0x58, /* 0xD8-0xDB */ + 0xBF, 0x54, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xE7, /* 0xDC-0xDF */ + 0xC0, 0xEE, 0xED, 0x5C, 0xED, 0x62, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xED, 0x60, 0xC0, 0xEA, 0xC0, 0xE9, 0xC0, 0xE6, /* 0xE4-0xE7 */ + 0xED, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xC0, 0xEC, 0xC0, 0xEB, 0xC0, 0xE8, 0x00, 0x00, /* 0xEC-0xEF */ + 0xED, 0x61, 0xED, 0x5D, 0xED, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xC0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xC2, 0x77, 0xEF, 0xFB, 0x00, 0x00, 0xC2, 0x74, /* 0xF8-0xFB */ + 0xC2, 0x75, 0xEF, 0xFD, 0xC2, 0x76, 0xEF, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0x00, 0x00, 0xEF, 0xF9, 0xF2, 0x6C, 0xEF, 0xFC, /* 0x00-0x03 */ + 0x00, 0x00, 0xF2, 0x6D, 0xC3, 0x7A, 0xF2, 0x6B, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0x6A, 0x00, 0x00, /* 0x08-0x0B */ + 0xF2, 0x69, 0xC3, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC4, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x6A, /* 0x10-0x13 */ + 0xF4, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF5, 0xDC, 0xF5, 0xDB, 0xC4, 0xEA, /* 0x18-0x1B */ + 0x00, 0x00, 0xF5, 0xDA, 0xF6, 0xEC, 0xF6, 0xED, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE6, 0xF8, 0xB1, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF6, 0xF9, 0xBC, /* 0x24-0x27 */ + 0xC6, 0x79, 0xF9, 0xC6, 0xA4, 0xF6, 0x00, 0x00, /* 0x28-0x2B */ + 0xAA, 0xA6, 0xAA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xAC, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xC0, 0xEF, 0xA4, 0xF7, 0x00, 0x00, /* 0x34-0x37 */ + 0xAA, 0xA8, 0xAF, 0x52, 0xB7, 0xDD, 0xA4, 0xF8, /* 0x38-0x3B */ + 0x00, 0x00, 0xB2, 0x6E, 0xBA, 0xB8, 0xC9, 0x62, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCF, 0xB7, 0xD2, 0x7D, 0x00, 0x00, /* 0x40-0x43 */ + 0xE2, 0xC5, 0x00, 0x00, 0xC0, 0xF0, 0xA4, 0xF9, /* 0x44-0x47 */ + 0xAA, 0xA9, 0xCF, 0xB8, 0xCF, 0xB9, 0xDA, 0x66, /* 0x48-0x4B */ + 0xB5, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA4, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xDE, 0xE2, 0xC6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */ + 0xC3, 0x7C, 0xA4, 0xFA, 0xDA, 0x67, 0xA4, 0xFB, /* 0x58-0x5B */ + 0x00, 0x00, 0xA6, 0xC9, 0xCA, 0x42, 0xA6, 0xC8, /* 0x5C-0x5F */ + 0xA8, 0x65, 0xA8, 0x64, 0xA8, 0x63, 0xCB, 0x60, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xAA, /* 0x64-0x67 */ + 0x00, 0x00, 0xAA, 0xAB, 0xCD, 0x5B, 0x00, 0x00, /* 0x68-0x6B */ + 0xCF, 0xBA, 0x00, 0x00, 0xCF, 0xBD, 0xAC, 0xBA, /* 0x6C-0x6F */ + 0xCF, 0xBB, 0x00, 0x00, 0xAC, 0xB9, 0xCF, 0xBC, /* 0x70-0x73 */ + 0xAC, 0xBB, 0x00, 0x00, 0xD2, 0xA2, 0xD2, 0xA1, /* 0x74-0x77 */ + 0xD2, 0x7E, 0xAF, 0x53, 0x00, 0x00, 0xD6, 0x5D, /* 0x78-0x7B */ + 0xD6, 0x5E, 0xB2, 0x6F, 0xD6, 0x5C, 0xD6, 0x5F, /* 0x7C-0x7F */ + + 0xB5, 0x52, 0xB2, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xB5, 0x51, 0xDA, 0x6B, 0xDA, 0x6A, 0x00, 0x00, /* 0x84-0x87 */ + 
0xDA, 0x68, 0xDA, 0x69, 0x00, 0x00, 0xDA, 0x6C, /* 0x88-0x8B */ + 0xDE, 0xA6, 0xDE, 0xA5, 0xDE, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDE, 0xA8, 0xDE, 0xA7, 0xBA, 0xB9, 0xE2, 0xC9, /* 0x90-0x93 */ + 0x00, 0x00, 0xE2, 0xC8, 0xBA, 0xBA, 0xE2, 0xC7, /* 0x94-0x97 */ + 0xE6, 0x73, 0x00, 0x00, 0xE6, 0x74, 0xBC, 0xF9, /* 0x98-0x9B */ + 0x00, 0x00, 0xEA, 0x59, 0xEA, 0x5A, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xF2, 0x72, 0xC3, 0x7D, 0xF2, 0x71, /* 0xA0-0xA3 */ + 0xF2, 0x70, 0xF2, 0x6E, 0xF2, 0x6F, 0xC4, 0xEB, /* 0xA4-0xA7 */ + 0xF4, 0x6C, 0xF6, 0xEE, 0xF8, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */ + 0xA4, 0xFC, 0x00, 0x00, 0xC9, 0xA5, 0xA5, 0xC7, /* 0xAC-0xAF */ + 0xC9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCA, 0x43, 0xCA, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0x66, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xCB, 0x62, 0x00, 0x00, 0xCB, 0x61, /* 0xBC-0xBF */ + 0xAA, 0xAC, 0xCB, 0x65, 0xA8, 0x67, 0xCB, 0x63, /* 0xC0-0xC3 */ + 0xA8, 0x66, 0xCB, 0x67, 0xCB, 0x64, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCD, 0x5F, 0xCF, 0xBE, 0xCD, 0x5D, /* 0xC8-0xCB */ + 0xCD, 0x64, 0x00, 0x00, 0xAA, 0xAD, 0x00, 0x00, /* 0xCC-0xCF */ + 0xAA, 0xB0, 0xCD, 0x65, 0xCD, 0x61, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xCD, 0x62, 0x00, 0x00, 0xCD, 0x5C, 0xAA, 0xAF, /* 0xD4-0xD7 */ + 0xCD, 0x5E, 0xAA, 0xAE, 0xCD, 0x63, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCD, 0x60, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC2, /* 0xDC-0xDF */ + 0xAC, 0xBD, 0xAC, 0xBE, 0x00, 0x00, 0xCF, 0xC5, /* 0xE0-0xE3 */ + 0xCF, 0xBF, 0x00, 0x00, 0xCF, 0xC4, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCF, 0xC0, 0xAC, 0xBC, 0xCF, 0xC3, 0xCF, 0xC1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA8, /* 0xF0-0xF3 */ + 0xD2, 0xA5, 0x00, 0x00, 0xD2, 0xA7, 0xAF, 0x58, /* 0xF4-0xF7 */ + 0xAF, 0x57, 0xAF, 0x55, 0xD2, 0xA4, 0xD2, 0xA9, /* 0xF8-0xFB */ + 0xAF, 0x54, 0xAF, 0x56, 0xD2, 0xA6, 0xD6, 0x67, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0xD2, 0xA3, 0xD2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x62, /* 0x04-0x07 */ + 0xD6, 0x66, 0x00, 0x00, 0xD6, 0x65, 0xDA, 0x6E, /* 0x08-0x0B */ + 0xDA, 0x79, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x68, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD6, 0x63, 0xDA, 0x6D, 0xB2, 0x74, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x73, 0xD6, 0x61, /* 0x14-0x17 */ + 0xD6, 0x64, 0xB2, 0x75, 0x00, 0x00, 0xB2, 0x72, /* 0x18-0x1B */ + 0xB2, 0x71, 0xD6, 0x60, 0xD6, 0x69, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x70, 0xDA, 0x77, /* 0x20-0x23 */ + 0x00, 0x00, 0xB5, 0x54, 0xDA, 0x76, 0xDA, 0x73, /* 0x24-0x27 */ + 0x00, 0x00, 0xB5, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xDA, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xDA, 0x6F, 0xDA, 0x71, 0xDA, 0x74, 0xDA, 0x72, /* 0x30-0x33 */ + 0xB5, 0x55, 0xDA, 0x78, 0xB5, 0x53, 0xB7, 0xDF, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xDE, 0xAC, /* 0x38-0x3B */ + 0xDE, 0xAA, 0x00, 0x00, 0xB7, 0xE2, 0xB7, 0xE1, /* 0x3C-0x3F */ + 0xDE, 0xAE, 0x00, 0x00, 0xDE, 0xAB, 0xE2, 0xCA, /* 0x40-0x43 */ + 0xBA, 0xBB, 0xB7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xDE, 0xB0, 0xDE, 0xAF, 0x00, 0x00, /* 0x48-0x4B */ + 0xE2, 0xCD, 0xE2, 0xCB, 0xBC, 0xFA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xBA, 0xBC, 0xE2, 0xCC, 0xE6, 0x76, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xFB, /* 0x54-0x57 */ + 0xE6, 0x75, 0xE6, 0x7E, 0xE6, 0x7D, 0xE6, 0x7B, /* 0x58-0x5B */ + 0x00, 
0x00, 0xE6, 0x7A, 0xE6, 0x77, 0xE6, 0x78, /* 0x5C-0x5F */ + 0xE6, 0x79, 0xE6, 0x7C, 0xE6, 0xA1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEA, 0x5F, 0xEA, 0x5C, 0xEA, 0x5D, /* 0x64-0x67 */ + 0xBF, 0x57, 0xEA, 0x5B, 0xEA, 0x61, 0xEA, 0x60, /* 0x68-0x6B */ + 0xEA, 0x5E, 0x00, 0x00, 0xED, 0x64, 0xED, 0x65, /* 0x6C-0x6F */ + 0xC0, 0xF1, 0x00, 0x00, 0xC0, 0xF2, 0xED, 0x63, /* 0x70-0x73 */ + 0x00, 0x00, 0xC2, 0x79, 0xEF, 0xFE, 0xC2, 0x78, /* 0x74-0x77 */ + 0xC3, 0x7E, 0x00, 0x00, 0xC3, 0xA1, 0xC4, 0x6D, /* 0x78-0x7B */ + 0xF4, 0x6E, 0xF4, 0x6D, 0xF5, 0xDD, 0xF6, 0xEF, /* 0x7C-0x7F */ + + 0xC5, 0x7A, 0xF7, 0xE8, 0xF7, 0xE7, 0xF7, 0xE9, /* 0x80-0x83 */ + 0xA5, 0xC8, 0xCF, 0xC6, 0xAF, 0x59, 0xB2, 0x76, /* 0x84-0x87 */ + 0xD6, 0x6A, 0xA5, 0xC9, 0xC9, 0xA7, 0xA4, 0xFD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0x45, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0x6C, 0xCB, 0x6A, /* 0x90-0x93 */ + 0xCB, 0x6B, 0xCB, 0x68, 0xA8, 0x68, 0xCB, 0x69, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCD, 0x6D, 0x00, 0x00, 0xAA, 0xB3, /* 0x9C-0x9F */ + 0xCD, 0x6B, 0xCD, 0x67, 0xCD, 0x6A, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCD, 0x66, 0xAA, 0xB5, 0xCD, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xAA, 0xB2, 0xAA, 0xB1, 0x00, 0x00, 0xAA, 0xB4, /* 0xA8-0xAB */ + 0xCD, 0x6C, 0xCD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xC2, 0xAC, 0xC5, /* 0xB0-0xB3 */ + 0xCF, 0xCE, 0xCF, 0xCD, 0xCF, 0xCC, 0xAC, 0xBF, /* 0xB4-0xB7 */ + 0xCF, 0xD5, 0xCF, 0xCB, 0x00, 0x00, 0xAC, 0xC1, /* 0xB8-0xBB */ + 0xD2, 0xAF, 0x00, 0x00, 0xCF, 0xD2, 0xCF, 0xD0, /* 0xBC-0xBF */ + 0xAC, 0xC4, 0x00, 0x00, 0xCF, 0xC8, 0xCF, 0xD3, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCF, 0xCA, 0xCF, 0xD4, 0xCF, 0xD1, /* 0xC4-0xC7 */ + 0xCF, 0xC9, 0x00, 0x00, 0xAC, 0xC0, 0xCF, 0xD6, /* 0xC8-0xCB */ + 0xCF, 0xC7, 0xAC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB4, 0xD2, 0xAB, /* 0xD0-0xD3 */ + 0xD2, 0xB6, 0x00, 0x00, 0xD2, 0xAE, 0xD2, 0xB9, /* 0xD4-0xD7 */ + 0xD2, 0xBA, 0xD2, 0xAC, 0xD2, 0xB8, 0xD2, 0xB5, /* 0xD8-0xDB */ + 0xD2, 0xB3, 0xD2, 0xB7, 0xAF, 0x5F, 0x00, 0x00, /* 0xDC-0xDF */ + 0xAF, 0x5D, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD2, 0xAD, 0x00, 0x00, 0xD2, 0xB0, /* 0xE4-0xE7 */ + 0xD2, 0xBB, 0xD2, 0xB2, 0xAF, 0x5E, 0xCF, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAF, 0x5A, 0xAF, 0x5C, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0x78, 0xD6, 0x6D, 0xD6, 0x6B, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD6, 0x6C, 0x00, 0x00, 0xD6, 0x73, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD6, 0x74, 0xD6, 0x70, 0xB2, 0x7B, 0xD6, 0x75, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0xD6, 0x72, 0xD6, 0x6F, 0x00, 0x00, 0xB2, 0x79, /* 0x00-0x03 */ + 0xD6, 0x6E, 0xB2, 0x77, 0xB2, 0x7A, 0xD6, 0x71, /* 0x04-0x07 */ + 0xD6, 0x79, 0xAF, 0x5B, 0xB2, 0x78, 0xD6, 0x77, /* 0x08-0x0B */ + 0xD6, 0x76, 0xB2, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x7E, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0xB5, 0x60, /* 0x18-0x1B */ + 0x00, 0x00, 0xDA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xDA, 0xA9, 0xDA, 0xA2, 0xB5, 0x5A, 0xDA, 0xA6, /* 0x20-0x23 */ + 0xDA, 0xA5, 0xB5, 0x5B, 0xB5, 0x61, 0x00, 0x00, /* 0x24-0x27 */ + 0xB5, 0x62, 0xDA, 0xA8, 0xB5, 0x58, 0xDA, 0x7D, /* 0x28-0x2B */ + 0xDA, 0x7B, 0xDA, 0xA3, 0xDA, 0x7A, 0xB5, 0x5F, /* 0x2C-0x2F */ + 0xDA, 
0x7C, 0xDA, 0xA4, 0xDA, 0xAA, 0xB5, 0x59, /* 0x30-0x33 */ + 0xB5, 0x5E, 0xB5, 0x5C, 0xB5, 0x5D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x57, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0xE9, /* 0x3C-0x3F */ + 0xDE, 0xB7, 0xB7, 0xE8, 0xDE, 0xBB, 0x00, 0x00, /* 0x40-0x43 */ + 0xDE, 0xB1, 0x00, 0x00, 0xDE, 0xBC, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB2, 0xDE, 0xB3, /* 0x48-0x4B */ + 0x00, 0x00, 0xDE, 0xBD, 0xDE, 0xBA, 0xDE, 0xB8, /* 0x4C-0x4F */ + 0xDE, 0xB9, 0xDE, 0xB5, 0xDE, 0xB4, 0x00, 0x00, /* 0x50-0x53 */ + 0xDE, 0xBE, 0xB7, 0xE5, 0x00, 0x00, 0xDE, 0xB6, /* 0x54-0x57 */ + 0x00, 0x00, 0xB7, 0xEA, 0xB7, 0xE4, 0xB7, 0xEB, /* 0x58-0x5B */ + 0xB7, 0xEC, 0x00, 0x00, 0xB7, 0xE7, 0xB7, 0xE6, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, 0xBA, 0xBE, /* 0x60-0x63 */ + 0xBA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD3, /* 0x64-0x67 */ + 0x00, 0x00, 0xBC, 0xFC, 0xBA, 0xBF, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xBA, 0xC1, 0xE2, 0xD4, 0xB7, 0xE3, /* 0x6C-0x6F */ + 0xBA, 0xC0, 0xE2, 0xD0, 0xE2, 0xD2, 0xE2, 0xCF, /* 0x70-0x73 */ + 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0xAA, 0xE6, 0xA7, 0xBD, 0x40, 0xEA, 0x62, /* 0x7C-0x7F */ + + 0xBD, 0x41, 0xE6, 0xA6, 0x00, 0x00, 0xBC, 0xFE, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xA8, 0xE6, 0xA5, 0xE6, 0xA2, /* 0x84-0x87 */ + 0xE6, 0xA9, 0xE6, 0xA3, 0xE6, 0xA4, 0xBC, 0xFD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xED, 0x69, 0x00, 0x00, 0xEA, 0x66, 0x00, 0x00, /* 0x90-0x93 */ + 0xEA, 0x65, 0xEA, 0x67, 0x00, 0x00, 0xED, 0x66, /* 0x94-0x97 */ + 0xBF, 0x5A, 0x00, 0x00, 0xEA, 0x63, 0x00, 0x00, /* 0x98-0x9B */ + 0xBF, 0x58, 0x00, 0x00, 0xBF, 0x5C, 0xBF, 0x5B, /* 0x9C-0x9F */ + 0xEA, 0x64, 0xEA, 0x68, 0x00, 0x00, 0xBF, 0x59, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xED, 0x6D, 0xC0, 0xF5, 0xC2, 0x7A, /* 0xA4-0xA7 */ + 0xC0, 0xF6, 0xC0, 0xF3, 0xED, 0x6A, 0xED, 0x68, /* 0xA8-0xAB */ + 0x00, 0x00, 0xED, 0x6B, 0x00, 0x00, 0xED, 0x6E, /* 0xAC-0xAF */ + 0xC0, 0xF4, 0xED, 0x6C, 0xED, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF0, 0x42, 0xF0, 0x45, 0xF2, 0x75, /* 0xB4-0xB7 */ + 0xF0, 0x40, 0x00, 0x00, 0xF4, 0x6F, 0xF0, 0x46, /* 0xB8-0xBB */ + 0x00, 0x00, 0xC3, 0xA2, 0xF0, 0x44, 0xC2, 0x7B, /* 0xBC-0xBF */ + 0xF0, 0x41, 0xF0, 0x43, 0xF0, 0x47, 0xF2, 0x76, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF2, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA3, 0xF2, 0x73, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x6E, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC4, 0xED, 0xF6, 0xF1, 0xC4, 0xEC, 0xF6, 0xF3, /* 0xD4-0xD7 */ + 0xF6, 0xF0, 0xF6, 0xF2, 0xC5, 0xD0, 0xF8, 0xB2, /* 0xD8-0xDB */ + 0xA5, 0xCA, 0xCD, 0x6E, 0xD2, 0xBC, 0xD2, 0xBD, /* 0xDC-0xDF */ + 0xB2, 0x7D, 0xDE, 0xBF, 0xBF, 0x5D, 0xC3, 0xA4, /* 0xE0-0xE3 */ + 0xC5, 0x7B, 0xF8, 0xB3, 0xA5, 0xCB, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCD, 0x6F, 0xA2, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCF, 0xD7, 0x00, 0x00, 0xCF, 0xD8, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD2, 0xBE, 0xD2, 0xBF, 0xB2, 0x7E, 0xB2, 0xA1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAB, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDE, 0xC2, 0xDE, 0xC1, 0xDE, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_75[512] = { + 0xE2, 0xD5, 0x00, 0x00, 0xE2, 0xD6, 0xE2, 0xD7, /* 0x00-0x03 */ + 0xBA, 
0xC2, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAD, /* 0x04-0x07 */ + 0xE6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x69, /* 0x08-0x0B */ + 0xBF, 0x5E, 0xBF, 0x5F, 0x00, 0x00, 0xED, 0x72, /* 0x0C-0x0F */ + 0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xF0, 0x49, /* 0x10-0x13 */ + 0xF0, 0x48, 0xC2, 0x7C, 0xF2, 0x77, 0xF5, 0xDE, /* 0x14-0x17 */ + 0xA5, 0xCC, 0x00, 0x00, 0xAC, 0xC6, 0x00, 0x00, /* 0x18-0x1B */ + 0xB2, 0xA2, 0xDE, 0xC3, 0x00, 0x00, 0xA5, 0xCD, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD2, 0xC0, 0xB2, 0xA3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xB5, 0x63, 0xB5, 0x64, 0x00, 0x00, /* 0x24-0x27 */ + 0xA5, 0xCE, 0xA5, 0xCF, 0xCA, 0x46, 0xA8, 0x6A, /* 0x28-0x2B */ + 0xA8, 0x69, 0xAC, 0xC7, 0xCF, 0xD9, 0xDA, 0xAC, /* 0x2C-0x2F */ + 0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x6B, /* 0x34-0x37 */ + 0xA8, 0x6C, 0xCB, 0x6E, 0xCB, 0x6D, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xAA, 0xB6, 0xCD, 0x72, 0xCD, 0x70, /* 0x3C-0x3F */ + 0xCD, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDA, /* 0x44-0x47 */ + 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCB, /* 0x48-0x4B */ + 0xAC, 0xC9, 0x00, 0x00, 0xAC, 0xCA, 0xAC, 0xC8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xAF, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xAF, 0x64, 0xAF, 0x63, 0xD2, 0xC1, /* 0x58-0x5B */ + 0xAF, 0x62, 0xAF, 0x61, 0x00, 0x00, 0xD2, 0xC2, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA6, 0xD6, 0x7B, /* 0x60-0x63 */ + 0xD6, 0x7A, 0xB2, 0xA4, 0xB2, 0xA5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x66, 0xB5, 0x65, /* 0x68-0x6B */ + 0xDA, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAD, /* 0x6C-0x6F */ + 0xB2, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xED, 0xDE, 0xC5, /* 0x74-0x77 */ + 0xB7, 0xEE, 0xDE, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE2, 0xD8, 0xE6, 0xAE, 0xBD, 0x42, /* 0x7C-0x7F */ + + 0xEA, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xED, 0x73, 0x00, 0x00, 0xC3, 0xA6, 0xC3, 0xA5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0x7C, 0xA5, 0xD4, /* 0x88-0x8B */ + 0xCD, 0x73, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA8, /* 0x8C-0x8F */ + 0xE2, 0xD9, 0xBA, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0x6F, 0xCB, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0x74, 0xAA, 0xB8, 0xAA, 0xB9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xAA, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCF, 0xAC, 0xD0, /* 0xA0-0xA3 */ + 0xAC, 0xCD, 0xAC, 0xCE, 0x00, 0x00, 0xCF, 0xDC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDD, 0xAC, 0xCC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xD2, 0xC3, 0x00, 0x00, 0xAF, 0x68, 0xAF, 0x69, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xB2, 0xAB, 0xD2, 0xC9, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAF, 0x6E, 0xAF, 0x6C, 0xD2, 0xCA, 0xD2, 0xC5, /* 0xB8-0xBB */ + 0xAF, 0x6B, 0xAF, 0x6A, 0xAF, 0x65, 0xD2, 0xC8, /* 0xBC-0xBF */ + 0xD2, 0xC7, 0xD2, 0xC4, 0xAF, 0x6D, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD2, 0xC6, 0xAF, 0x66, 0x00, 0x00, 0xAF, 0x67, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xAC, 0xD6, 0xA1, /* 0xC8-0xCB */ + 0xD6, 0xA2, 0xB2, 0xAD, 0xD6, 0x7C, 0xD6, 0x7E, /* 0xCC-0xCF */ + 0xD6, 0xA4, 0xD6, 0xA3, 0xD6, 0x7D, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB2, 0xA9, 0xB2, 0xAA, 0x00, 0x00, 0xDA, 0xB6, /* 0xD4-0xD7 */ + 0xB5, 0x6B, 0xB5, 0x6A, 0xDA, 0xB0, 0xB5, 0x68, /* 0xD8-0xDB 
*/ + 0x00, 0x00, 0xDA, 0xB3, 0xB5, 0x6C, 0xDA, 0xB4, /* 0xDC-0xDF */ + 0xB5, 0x6D, 0xDA, 0xB1, 0xB5, 0x67, 0xB5, 0x69, /* 0xE0-0xE3 */ + 0xDA, 0xB5, 0x00, 0x00, 0xDA, 0xB2, 0xDA, 0xAF, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xDE, 0xD2, 0x00, 0x00, 0xDE, 0xC7, /* 0xEC-0xEF */ + 0xB7, 0xF0, 0xB7, 0xF3, 0xB7, 0xF2, 0xB7, 0xF7, /* 0xF0-0xF3 */ + 0xB7, 0xF6, 0xDE, 0xD3, 0xDE, 0xD1, 0xDE, 0xCA, /* 0xF4-0xF7 */ + 0xDE, 0xCE, 0xDE, 0xCD, 0xB7, 0xF4, 0xDE, 0xD0, /* 0xF8-0xFB */ + 0xDE, 0xCC, 0xDE, 0xD4, 0xDE, 0xCB, 0xB7, 0xF5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xB7, 0xEF, 0xB7, 0xF1, 0x00, 0x00, 0xDE, 0xC9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0xDB, 0xBA, 0xC7, 0xE2, 0xDF, 0xBA, 0xC6, /* 0x08-0x0B */ + 0xE2, 0xDC, 0xBA, 0xC5, 0x00, 0x00, 0xDE, 0xC8, /* 0x0C-0x0F */ + 0xDE, 0xCF, 0xE2, 0xDE, 0x00, 0x00, 0xBA, 0xC8, /* 0x10-0x13 */ + 0xE2, 0xE0, 0xE2, 0xDD, 0xE2, 0xDA, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE6, 0xB1, 0xE6, 0xB5, 0xE6, 0xB7, /* 0x18-0x1B */ + 0xE6, 0xB3, 0xE6, 0xB2, 0xE6, 0xB0, 0xBD, 0x45, /* 0x1C-0x1F */ + 0xBD, 0x43, 0xBD, 0x48, 0xBD, 0x49, 0xE6, 0xB4, /* 0x20-0x23 */ + 0xBD, 0x46, 0xE6, 0xAF, 0xBD, 0x47, 0xBA, 0xC4, /* 0x24-0x27 */ + 0xE6, 0xB6, 0xBD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0xEA, 0x6B, /* 0x2C-0x2F */ + 0xEA, 0x73, 0xEA, 0x6D, 0xEA, 0x72, 0xEA, 0x6F, /* 0x30-0x33 */ + 0xBF, 0x60, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xBF, 0x61, 0x00, 0x00, 0xBF, 0x62, 0x00, 0x00, /* 0x38-0x3B */ + 0xEA, 0x70, 0xEA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xF8, 0xED, 0x74, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xF7, 0xED, 0x77, /* 0x44-0x47 */ + 0xED, 0x75, 0xED, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xC0, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF0, 0x4D, 0x00, 0x00, 0xC2, 0xA1, 0xF0, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x7D, 0xF0, 0x4F, /* 0x54-0x57 */ + 0xC2, 0x7E, 0xF0, 0x4C, 0xF0, 0x50, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0x4A, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA7, /* 0x5C-0x5F */ + 0xF2, 0x78, 0xC3, 0xA8, 0xC4, 0x6F, 0x00, 0x00, /* 0x60-0x63 */ + 0xF0, 0x4B, 0xC4, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xC4, 0xEE, 0xF5, 0xDF, 0x00, 0x00, /* 0x68-0x6B */ + 0xC5, 0x7E, 0xF6, 0xF4, 0xC5, 0x7D, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF7, 0xEA, 0xC5, 0xF5, 0xC5, 0xF6, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF9, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xAC, 0xD1, 0xCF, 0xDE, 0x00, 0x00, 0xB5, 0x6E, /* 0x78-0x7B */ + 0xB5, 0x6F, 0xA5, 0xD5, 0xA6, 0xCA, 0xCA, 0x47, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xCB, 0x71, 0xA8, 0x6D, 0x00, 0x00, /* 0x80-0x83 */ + 0xAA, 0xBA, 0x00, 0x00, 0xAC, 0xD2, 0xAC, 0xD3, /* 0x84-0x87 */ + 0xAC, 0xD4, 0xD6, 0xA6, 0xD2, 0xCB, 0xAF, 0x6F, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xAE, 0xD6, 0xA5, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB8, 0xB5, 0x71, /* 0x90-0x93 */ + 0x00, 0x00, 0xDA, 0xB7, 0xB5, 0x70, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDE, 0xD5, 0xBD, 0x4A, 0xE6, 0xBB, /* 0x98-0x9B */ + 0xE6, 0xB8, 0xE6, 0xB9, 0xE6, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xED, 0x78, 0x00, 0x00, 0xF0, 0x51, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0x71, 0xF4, 0x70, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF6, 0xF5, 0xA5, 0xD6, 0xCD, 0x75, /* 0xAC-0xAF */ + 
0xAF, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xB5, 0x72, 0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE2, 0xE1, 0x00, 0x00, 0xBD, 0x4B, 0xEA, 0x74, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF0, 0x52, 0xF4, 0x72, 0xA5, 0xD7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xBB, 0xAC, 0xD7, /* 0xC0-0xC3 */ + 0xCF, 0xDF, 0xAC, 0xD8, 0xAC, 0xD6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xAC, 0xD5, 0xD2, 0xCC, 0xAF, 0x71, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xAF, 0x72, 0xAF, 0x73, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB0, 0xD6, 0xA7, /* 0xD0-0xD3 */ + 0xB2, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, 0xB2, 0xB1, /* 0xD8-0xDB */ + 0xB5, 0x73, 0xDE, 0xD7, 0xB7, 0xF8, 0xB7, 0xF9, /* 0xDC-0xDF */ + 0x00, 0x00, 0xBA, 0xC9, 0x00, 0x00, 0xBA, 0xCA, /* 0xE0-0xE3 */ + 0xBD, 0x4C, 0xBF, 0x64, 0xEA, 0x75, 0xBF, 0x63, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xED, 0x79, 0xC0, 0xFA, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF0, 0x53, 0xF4, 0x73, 0xA5, 0xD8, 0xA8, 0x6E, /* 0xEC-0xEF */ + 0xCD, 0x78, 0xCD, 0x77, 0xAA, 0xBC, 0xCD, 0x76, /* 0xF0-0xF3 */ + 0xAA, 0xBD, 0xCD, 0x79, 0x00, 0x00, 0xCF, 0xE5, /* 0xF4-0xF7 */ + 0xAC, 0xDB, 0xAC, 0xDA, 0xCF, 0xE7, 0xCF, 0xE6, /* 0xF8-0xFB */ + 0xAC, 0xDF, 0x00, 0x00, 0xAC, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0xAC, 0xD9, 0x00, 0x00, 0xCF, 0xE1, /* 0x00-0x03 */ + 0xCF, 0xE2, 0xCF, 0xE3, 0x00, 0x00, 0xAC, 0xE0, /* 0x04-0x07 */ + 0xCF, 0xE0, 0xAC, 0xDC, 0xCF, 0xE4, 0xAC, 0xDD, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xCF, 0xD2, 0xD3, 0xD2, 0xD1, 0xD2, 0xD0, /* 0x10-0x13 */ + 0x00, 0x00, 0xD2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xCE, /* 0x18-0x1B */ + 0x00, 0x00, 0xD2, 0xCD, 0x00, 0x00, 0xAF, 0x75, /* 0x1C-0x1F */ + 0xAF, 0x76, 0x00, 0x00, 0xD2, 0xD7, 0xD2, 0xD2, /* 0x20-0x23 */ + 0x00, 0x00, 0xD6, 0xB0, 0x00, 0x00, 0xD2, 0xD8, /* 0x24-0x27 */ + 0xAF, 0x77, 0xAF, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD6, 0xAA, 0x00, 0x00, 0xD6, 0xA9, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD6, 0xAB, 0xD6, 0xAC, 0xD6, 0xAE, /* 0x30-0x33 */ + 0xD6, 0xAD, 0xD6, 0xB2, 0xB2, 0xB5, 0xB2, 0xB2, /* 0x34-0x37 */ + 0xB2, 0xB6, 0xD6, 0xA8, 0xB2, 0xB7, 0xD6, 0xB1, /* 0x38-0x3B */ + 0xB2, 0xB4, 0xD6, 0xAF, 0xB2, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDA, 0xBC, 0xDA, 0xBE, 0xDA, 0xBA, 0xDA, 0xBB, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBF, 0xDA, 0xC1, /* 0x48-0x4B */ + 0xDA, 0xC2, 0xDA, 0xBD, 0xDA, 0xC0, 0xB5, 0x74, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, 0x00, 0x00, /* 0x50-0x53 */ + 0xDE, 0xE0, 0xDE, 0xD8, 0xDE, 0xDC, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xDE, 0xE1, 0xDE, 0xDD, 0xB7, 0xFA, /* 0x58-0x5B */ + 0xB8, 0x43, 0x00, 0x00, 0xB7, 0xFD, 0xDE, 0xD9, /* 0x5C-0x5F */ + 0xDE, 0xDA, 0xBA, 0xCE, 0xB8, 0x46, 0xB7, 0xFE, /* 0x60-0x63 */ + 0x00, 0x00, 0xB8, 0x44, 0xB7, 0xFC, 0xDE, 0xDF, /* 0x64-0x67 */ + 0xB8, 0x45, 0xDE, 0xDE, 0xB8, 0x41, 0xB7, 0xFB, /* 0x68-0x6B */ + 0xB8, 0x42, 0xDE, 0xE2, 0xE2, 0xE6, 0xE2, 0xE8, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE2, 0xE3, 0xBA, 0xCC, 0xE2, 0xE9, 0xBA, 0xCD, /* 0x7C-0x7F */ + + 0xE2, 0xE7, 0xE2, 0xE2, 0xE2, 0xE5, 0xE2, 0xEA, /* 0x80-0x83 */ + 
0xBA, 0xCB, 0xE2, 0xE4, 0x00, 0x00, 0xBD, 0x4E, /* 0x84-0x87 */ + 0xE6, 0xBF, 0xE6, 0xBE, 0x00, 0x00, 0xBD, 0x51, /* 0x88-0x8B */ + 0xBD, 0x4F, 0xE6, 0xBC, 0xBD, 0x4D, 0xE6, 0xBD, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBD, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x7D, 0x00, 0x00, 0xEA, 0xA1, /* 0x94-0x97 */ + 0x00, 0x00, 0xEA, 0x7E, 0xEA, 0x76, 0xEA, 0x7A, /* 0x98-0x9B */ + 0xEA, 0x79, 0xEA, 0x77, 0xBF, 0x66, 0xBF, 0x67, /* 0x9C-0x9F */ + 0xBF, 0x65, 0xEA, 0x78, 0xEA, 0x7B, 0xEA, 0x7C, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xBF, 0x68, 0x00, 0x00, 0xC1, 0x40, /* 0xA4-0xA7 */ + 0xED, 0xA3, 0x00, 0x00, 0xC0, 0xFC, 0xED, 0x7B, /* 0xA8-0xAB */ + 0xC0, 0xFE, 0xC1, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xC0, 0xFD, 0xED, 0xA2, 0xED, 0x7C, 0xC0, 0xFB, /* 0xB0-0xB3 */ + 0xED, 0xA1, 0xED, 0x7A, 0xED, 0x7E, 0xED, 0x7D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0x55, 0xC2, 0xA4, /* 0xB8-0xBB */ + 0xC2, 0xA5, 0xC2, 0xA2, 0x00, 0x00, 0xC2, 0xA3, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0x54, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF2, 0x79, 0xF2, 0x7A, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF4, 0x74, 0xF4, 0x77, 0xF4, 0x75, 0xF4, 0x76, /* 0xCC-0xCF */ + 0xF5, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xEF, /* 0xD0-0xD3 */ + 0xF7, 0xEB, 0xF8, 0xB4, 0x00, 0x00, 0xC5, 0xF7, /* 0xD4-0xD7 */ + 0xF8, 0xF8, 0xF8, 0xF9, 0xC6, 0x66, 0xA5, 0xD9, /* 0xD8-0xDB */ + 0xAC, 0xE1, 0x00, 0x00, 0xDA, 0xC3, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDE, 0xE3, 0x00, 0x00, 0xA5, 0xDA, 0xA8, 0x6F, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0xBE, 0x00, 0x00, 0xCF, 0xE8, /* 0xE4-0xE7 */ + 0xCF, 0xE9, 0xAF, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xDA, 0xC4, 0xB5, 0x75, 0xB8, 0x47, 0xC1, 0x42, /* 0xEC-0xEF */ + 0xED, 0xA4, 0xF2, 0x7C, 0xF4, 0x78, 0xA5, 0xDB, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA1, /* 0xF4-0xF7 */ + 0xCD, 0x7A, 0xCD, 0x7C, 0xCD, 0x7E, 0xCD, 0x7D, /* 0xF8-0xFB */ + 0xCD, 0x7B, 0xAA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xE2, 0xCF, 0xF2, /* 0x00-0x03 */ + 0x00, 0x00, 0xCF, 0xED, 0xCF, 0xEA, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCF, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xAC, 0xE4, 0xAC, 0xE5, 0xCF, 0xF0, 0xCF, 0xEF, /* 0x0C-0x0F */ + 0xCF, 0xEE, 0xCF, 0xEB, 0xCF, 0xEC, 0xCF, 0xF3, /* 0x10-0x13 */ + 0xAC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xAF, 0x7C, 0x00, 0x00, 0xAF, 0xA4, /* 0x1C-0x1F */ + 0xAF, 0xA3, 0xD2, 0xE1, 0xD2, 0xDB, 0xD2, 0xD9, /* 0x20-0x23 */ + 0x00, 0x00, 0xAF, 0xA1, 0xD6, 0xB9, 0xAF, 0x7A, /* 0x24-0x27 */ + 0xD2, 0xDE, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE0, /* 0x28-0x2B */ + 0xD2, 0xDA, 0xAF, 0xA2, 0xD2, 0xDF, 0xD2, 0xDD, /* 0x2C-0x2F */ + 0xAF, 0x79, 0xD2, 0xE5, 0xAF, 0xA5, 0xD2, 0xE3, /* 0x30-0x33 */ + 0xAF, 0x7D, 0xD2, 0xDC, 0x00, 0x00, 0xAF, 0x7E, /* 0x34-0x37 */ + 0xAF, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB9, /* 0x40-0x43 */ + 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD6, 0xB3, 0xD6, 0xB5, 0xD6, 0xB7, 0x00, 0x00, /* 0x48-0x4B */ + 0xD6, 0xB8, 0xD6, 0xB6, 0xB2, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD6, 0xBB, 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xDA, 0xC8, 0xB5, 0x76, 0xDA, 0xD0, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDA, 0xC5, 0x00, 0x00, 0xDA, 0xD1, 0x00, 0x00, /* 0x60-0x63 */ + 0xDA, 0xC6, 0xDA, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDA, 0xCF, 0xDA, 0xCE, 0xDA, 0xCB, 0xB2, 0xB8, /* 0x68-0x6B */ + 0xB5, 0x77, 0xDA, 0xC9, 0xDA, 0xCC, 0xB5, 0x78, /* 0x6C-0x6F */ + 0xDA, 0xCD, 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0xDE, 0xF2, /* 0x78-0x7B */ + 0xB8, 0x4E, 0x00, 0x00, 0xE2, 0xF0, 0xB8, 0x51, /* 0x7C-0x7F */ + + 0xDE, 0xF0, 0xF9, 0xD6, 0x00, 0x00, 0xDE, 0xED, /* 0x80-0x83 */ + 0xDE, 0xE8, 0xDE, 0xEA, 0xDE, 0xEB, 0xDE, 0xE4, /* 0x84-0x87 */ + 0x00, 0x00, 0xB8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xB8, 0x4C, 0x00, 0x00, 0xB8, 0x48, 0xDE, 0xE7, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB8, 0x4F, 0x00, 0x00, 0xB8, 0x50, /* 0x90-0x93 */ + 0xDE, 0xE6, 0xDE, 0xE9, 0xDE, 0xF1, 0xB8, 0x4A, /* 0x94-0x97 */ + 0xB8, 0x4B, 0xDE, 0xEF, 0xDE, 0xE5, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, 0xBA, 0xD0, /* 0x9C-0x9F */ + 0xE2, 0xF4, 0xDE, 0xEC, 0xE2, 0xF6, 0xBA, 0xD4, /* 0xA0-0xA3 */ + 0xE2, 0xF7, 0xE2, 0xF3, 0x00, 0x00, 0xBA, 0xD1, /* 0xA4-0xA7 */ + 0xE2, 0xEF, 0xBA, 0xD3, 0xE2, 0xEC, 0xE2, 0xF1, /* 0xA8-0xAB */ + 0xE2, 0xF5, 0xE2, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xB8, 0x49, 0x00, 0x00, 0xE2, 0xEB, 0xBA, 0xD2, /* 0xB0-0xB3 */ + 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0x54, 0xE6, 0xC1, /* 0xB8-0xBB */ + 0xBD, 0x58, 0x00, 0x00, 0xBD, 0x56, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBA, 0xCF, 0x00, 0x00, 0xE6, 0xC8, /* 0xC0-0xC3 */ + 0xE6, 0xC9, 0xBD, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE6, 0xC7, 0xE6, 0xCA, 0xBD, 0x55, 0xBD, 0x52, /* 0xC8-0xCB */ + 0xE6, 0xC3, 0xE6, 0xC0, 0xE6, 0xC5, 0xE6, 0xC2, /* 0xCC-0xCF */ + 0xBD, 0x59, 0xE6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE6, 0xC6, 0xBD, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0x6A, 0xEA, 0xA8, /* 0xD8-0xDB */ + 0x00, 0x00, 0xEA, 0xA2, 0xEA, 0xA6, 0xEA, 0xAC, /* 0xDC-0xDF */ + 0xEA, 0xAD, 0xEA, 0xA9, 0xEA, 0xAA, 0xEA, 0xA7, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0xBF, 0x6C, /* 0xE4-0xE7 */ + 0xBF, 0x69, 0xEA, 0xA3, 0xEA, 0xA5, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBF, 0x6B, 0xEA, 0xAB, 0x00, 0x00, 0xC1, 0x46, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xAA, 0xED, 0xA5, /* 0xF0-0xF3 */ + 0xC1, 0x45, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x43, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xED, 0xAC, 0xC1, 0x44, 0xED, 0xA8, /* 0xF8-0xFB */ + 0xED, 0xA9, 0xED, 0xA6, 0xED, 0xAD, 0xF0, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0xC1, 0x47, 0xED, 0xA7, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xAE, 0xED, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF0, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF0, 0x57, 0x00, 0x00, 0xC2, 0xA6, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF0, 0x5B, 0xF0, 0x5D, 0xF0, 0x5C, 0xF0, 0x58, /* 0x10-0x13 */ + 0xF0, 0x59, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA3, /* 0x14-0x17 */ + 0x00, 0x00, 0xC3, 0xAA, 0x00, 0x00, 0xF2, 0x7E, /* 0x18-0x1B */ + 0xF2, 0xA2, 0xF2, 0x7D, 0xF2, 0xA4, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF2, 0xA1, 0x00, 0x00, 0xF4, 0x7A, /* 0x20-0x23 */ + 0xF4, 0x7D, 0xF4, 0x79, 0xC4, 0x71, 0xF4, 0x7B, /* 0x24-0x27 */ + 0xF4, 0x7C, 0xF4, 0x7E, 0xC4, 0x72, 0xC4, 0x74, /* 0x28-0x2B */ + 0xC4, 
0x73, 0xF5, 0xE1, 0x00, 0x00, 0xF5, 0xE3, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF5, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xF8, 0xB5, 0xF8, 0xFA, 0xA5, 0xDC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xCB, 0x72, 0xAA, 0xC0, 0xCD, 0xA3, /* 0x3C-0x3F */ + 0xAA, 0xC1, 0xAA, 0xC2, 0xCD, 0xA2, 0x00, 0x00, /* 0x40-0x43 */ + 0xCF, 0xF8, 0xCF, 0xF7, 0xAC, 0xE6, 0xAC, 0xE9, /* 0x44-0x47 */ + 0xAC, 0xE8, 0xAC, 0xE7, 0xCF, 0xF4, 0xCF, 0xF6, /* 0x48-0x4B */ + 0xCF, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xE8, /* 0x4C-0x4F */ + 0xAF, 0xA7, 0xD2, 0xEC, 0xD2, 0xEB, 0xD2, 0xEA, /* 0x50-0x53 */ + 0xD2, 0xE6, 0xAF, 0xA6, 0xAF, 0xAA, 0xAF, 0xAD, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xAF, 0xAE, 0xD2, 0xE7, /* 0x58-0x5B */ + 0xD2, 0xE9, 0xAF, 0xAC, 0xAF, 0xAB, 0xAF, 0xA9, /* 0x5C-0x5F */ + 0xAF, 0xA8, 0xD6, 0xC2, 0x00, 0x00, 0xD6, 0xC0, /* 0x60-0x63 */ + 0xD6, 0xBC, 0xB2, 0xBB, 0x00, 0x00, 0xD6, 0xBD, /* 0x64-0x67 */ + 0xB2, 0xBC, 0xD6, 0xBE, 0xD6, 0xBF, 0xD6, 0xC1, /* 0x68-0x6B */ + 0x00, 0x00, 0xB2, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDA, 0xD5, 0x00, 0x00, 0xDA, 0xD4, 0xDA, 0xD3, /* 0x70-0x73 */ + 0xDA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDE, 0xF6, 0xB8, 0x52, 0x00, 0x00, /* 0x78-0x7B */ + 0xDE, 0xF3, 0xDE, 0xF5, 0x00, 0x00, 0xB8, 0x53, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xB8, 0x54, 0xDE, 0xF4, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE3, 0x41, 0x00, 0x00, 0xE2, 0xF9, 0xE2, 0xFA, /* 0x88-0x8B */ + 0x00, 0x00, 0xBA, 0xD7, 0xBA, 0xD5, 0xBA, 0xD6, /* 0x8C-0x8F */ + 0xE3, 0x43, 0x00, 0x00, 0xE3, 0x42, 0xE2, 0xFE, /* 0x90-0x93 */ + 0xE2, 0xFD, 0xE2, 0xFC, 0xE2, 0xFB, 0xE3, 0x40, /* 0x94-0x97 */ + 0xE2, 0xF8, 0x00, 0x00, 0xE6, 0xCB, 0xE6, 0xD0, /* 0x98-0x9B */ + 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE6, 0xCD, 0xE6, 0xCC, 0xE6, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEA, 0xAE, 0x00, 0x00, 0xBF, 0x6D, 0xC1, 0x48, /* 0xA4-0xA7 */ + 0xED, 0xB0, 0x00, 0x00, 0xC1, 0x49, 0xED, 0xAF, /* 0xA8-0xAB */ + 0xF0, 0x5F, 0xF0, 0x5E, 0xC2, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF2, 0xA5, 0xC3, 0xAB, 0xF4, 0xA1, 0xC5, 0xA1, /* 0xB0-0xB3 */ + 0xF6, 0xF7, 0x00, 0x00, 0xF8, 0xB7, 0xF8, 0xB6, /* 0xB4-0xB7 */ + 0xC9, 0xA8, 0xAC, 0xEA, 0xAC, 0xEB, 0xD6, 0xC3, /* 0xB8-0xBB */ + 0x00, 0x00, 0xB8, 0x56, 0xA5, 0xDD, 0xA8, 0x72, /* 0xBC-0xBF */ + 0xA8, 0x71, 0xA8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xAA, 0xC4, 0xAA, 0xC3, 0x00, 0x00, 0xAC, 0xEE, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCF, 0xFA, 0xCF, 0xFD, 0xCF, 0xFB, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAC, 0xEC, 0xAC, 0xED, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xCF, 0xF9, 0xCF, 0xFC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xAF, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD2, 0xF3, 0xD2, 0xF5, 0xD2, 0xF4, 0xAF, 0xB2, /* 0xDC-0xDF */ + 0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xAF, 0xB0, /* 0xE0-0xE3 */ + 0xAF, 0xAF, 0x00, 0x00, 0xAF, 0xB3, 0xAF, 0xB1, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xAF, 0xB4, 0xD2, 0xF2, 0xD2, 0xED, /* 0xE8-0xEB */ + 0xD2, 0xEE, 0xD2, 0xF1, 0xD2, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC6, 0xD6, 0xC7, /* 0xF4-0xF7 */ + 0xD6, 0xC5, 0x00, 0x00, 0xD6, 0xC4, 0xB2, 0xBE, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7A[512] = { + 0xB5, 
0x7D, 0x00, 0x00, 0xDA, 0xD6, 0xDA, 0xD8, /* 0x00-0x03 */ + 0xDA, 0xDA, 0xB5, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xB5, 0x7A, 0x00, 0x00, 0xDA, 0xD7, 0xB5, 0x7B, /* 0x08-0x0B */ + 0xDA, 0xD9, 0xB5, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xDF, 0x41, 0xDE, 0xF7, 0xDE, 0xFA, 0xDE, 0xFE, /* 0x10-0x13 */ + 0xB8, 0x5A, 0xDE, 0xFC, 0x00, 0x00, 0xDE, 0xFB, /* 0x14-0x17 */ + 0xDE, 0xF8, 0xDE, 0xF9, 0xB8, 0x58, 0xDF, 0x40, /* 0x18-0x1B */ + 0xB8, 0x57, 0x00, 0x00, 0xB8, 0x5C, 0xB8, 0x5B, /* 0x1C-0x1F */ + 0xB8, 0x59, 0x00, 0x00, 0xDE, 0xFD, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, /* 0x24-0x27 */ + 0xE3, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x44, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0xD8, 0xE3, 0x47, /* 0x2C-0x2F */ + 0xE3, 0x46, 0xBA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBD, 0x5E, /* 0x34-0x37 */ + 0x00, 0x00, 0xE6, 0xD2, 0x00, 0x00, 0xBD, 0x5F, /* 0x38-0x3B */ + 0xBD, 0x5B, 0xBD, 0x5D, 0x00, 0x00, 0xBD, 0x5A, /* 0x3C-0x3F */ + 0xBD, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xEA, 0xAF, 0x00, 0x00, 0xBF, 0x70, 0xEA, 0xB1, /* 0x44-0x47 */ + 0xEA, 0xB0, 0x00, 0x00, 0xE3, 0x45, 0xBF, 0x72, /* 0x48-0x4B */ + 0xBF, 0x71, 0xBF, 0x6E, 0xBF, 0x6F, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xED, 0xB5, 0x00, 0x00, 0xED, 0xB3, 0xC1, 0x4A, /* 0x54-0x57 */ + 0xED, 0xB4, 0x00, 0x00, 0xED, 0xB6, 0xED, 0xB2, /* 0x58-0x5B */ + 0xED, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x60, /* 0x5C-0x5F */ + 0xC2, 0xAA, 0xC2, 0xA8, 0xC2, 0xA9, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x64-0x67 */ + 0xF2, 0xA7, 0xC3, 0xAD, 0x00, 0x00, 0xC3, 0xAC, /* 0x68-0x6B */ + 0xF4, 0xA3, 0xF4, 0xA4, 0xF4, 0xA2, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF6, 0xF8, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA5, 0xDE, 0xCA, 0x48, 0xA8, 0x73, 0x00, 0x00, /* 0x74-0x77 */ + 0xCD, 0xA5, 0xAA, 0xC6, 0xAA, 0xC5, 0xCD, 0xA6, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x40, 0xAC, 0xEF, /* 0x7C-0x7F */ + + 0xCF, 0xFE, 0xAC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xAF, 0xB6, 0xD2, 0xF8, 0xD2, 0xF6, 0xD2, 0xFC, /* 0x84-0x87 */ + 0xAF, 0xB7, 0xD2, 0xF7, 0xD2, 0xFB, 0xD2, 0xF9, /* 0x88-0x8B */ + 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0x8C-0x8F */ + 0xD6, 0xCA, 0x00, 0x00, 0xB2, 0xBF, 0x00, 0x00, /* 0x90-0x93 */ + 0xD6, 0xC9, 0xB2, 0xC0, 0xB5, 0xA2, 0xB5, 0xA1, /* 0x94-0x97 */ + 0xB5, 0x7E, 0xDA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0x44, 0xB8, 0x5D, /* 0x9C-0x9F */ + 0xB8, 0x5E, 0x00, 0x00, 0xDF, 0x43, 0xDF, 0x42, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE3, 0x4A, 0xBA, 0xDB, 0xBA, 0xDA, 0xE3, 0x4B, /* 0xA8-0xAB */ + 0xE3, 0x4C, 0x00, 0x00, 0xBD, 0x61, 0xBD, 0x60, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEA, 0xB5, 0xE6, 0xD3, 0xE6, 0xD5, /* 0xB0-0xB3 */ + 0xE6, 0xD4, 0xEA, 0xB4, 0xEA, 0xB2, 0xEA, 0xB6, /* 0xB4-0xB7 */ + 0xEA, 0xB3, 0x00, 0x00, 0xBF, 0x73, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xB7, 0xC1, 0x4B, /* 0xBC-0xBF */ + 0xED, 0xB8, 0xED, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xC2, 0xAB, 0xC2, 0xAC, 0x00, 0x00, 0xC4, 0x75, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD1, 0xA5, 0xDF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD0, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 
*/ + 0xD2, 0xFD, 0xAF, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xBA, /* 0xDC-0xDF */ + 0xB3, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xA4, /* 0xE0-0xE3 */ + 0xDA, 0xDD, 0xB5, 0xA3, 0xDA, 0xDC, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x45, /* 0xE8-0xEB */ + 0x00, 0x00, 0xBA, 0xDC, 0xE3, 0x4D, 0xBA, 0xDD, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xC4, 0x76, 0xF4, 0xA5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA6, 0xCB, 0xAA, 0xC7, 0xCD, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xAC, 0xF2, 0x00, 0x00, 0xAC, 0xF1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0xD0, 0x42, 0xD0, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD3, 0x40, 0xD3, 0x42, 0xAF, 0xB9, 0x00, 0x00, /* 0x04-0x07 */ + 0xD3, 0x44, 0xD3, 0x47, 0xD3, 0x45, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0x46, 0xD3, 0x43, /* 0x0C-0x0F */ + 0xD2, 0xFE, 0xAF, 0xBA, 0xD3, 0x48, 0xD3, 0x41, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD6, 0xD3, 0xB2, 0xC6, 0xD6, 0xDC, 0xB2, 0xC3, /* 0x18-0x1B */ + 0x00, 0x00, 0xD6, 0xD5, 0xB2, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB2, 0xC1, 0x00, 0x00, 0xD6, 0xD0, 0xD6, 0xDD, /* 0x20-0x23 */ + 0xD6, 0xD1, 0xD6, 0xCE, 0xB2, 0xC5, 0x00, 0x00, /* 0x24-0x27 */ + 0xB2, 0xC2, 0x00, 0x00, 0xD6, 0xD4, 0xD6, 0xD7, /* 0x28-0x2B */ + 0xB2, 0xC4, 0xD6, 0xD8, 0xB2, 0xC8, 0xD6, 0xD9, /* 0x2C-0x2F */ + 0xD6, 0xCF, 0xD6, 0xD6, 0xD6, 0xDA, 0xD6, 0xD2, /* 0x30-0x33 */ + 0xD6, 0xCD, 0xD6, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xD6, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDA, 0xE0, 0xDA, 0xE6, 0xB5, 0xA7, 0xD6, 0xCC, /* 0x44-0x47 */ + 0xDA, 0xE1, 0xB5, 0xA5, 0xDA, 0xDE, 0xB5, 0xAC, /* 0x48-0x4B */ + 0xDA, 0xE2, 0xB5, 0xAB, 0xDA, 0xE3, 0xB5, 0xAD, /* 0x4C-0x4F */ + 0xB5, 0xA8, 0xB5, 0xAE, 0xB5, 0xA9, 0x00, 0x00, /* 0x50-0x53 */ + 0xB5, 0xAA, 0x00, 0x00, 0xB5, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0xDA, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB8, 0x61, 0xDF, 0x50, 0x00, 0x00, 0xDF, 0x53, /* 0x60-0x63 */ + 0xDF, 0x47, 0xDF, 0x4C, 0xDF, 0x46, 0xB8, 0x63, /* 0x64-0x67 */ + 0x00, 0x00, 0xDF, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xDF, 0x48, 0xB8, 0x62, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDF, 0x4F, 0xDF, 0x4E, 0xDF, 0x4B, 0xDF, 0x4D, /* 0x70-0x73 */ + 0xDF, 0x49, 0xBA, 0xE1, 0xDF, 0x52, 0xB8, 0x5F, /* 0x74-0x77 */ + 0xDF, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, 0x00, 0x00, /* 0x80-0x83 */ + 0xBA, 0xE8, 0xE3, 0x58, 0x00, 0x00, 0xBA, 0xE7, /* 0x84-0x87 */ + 0xE3, 0x4E, 0x00, 0x00, 0xE3, 0x50, 0xBA, 0xE0, /* 0x88-0x8B */ + 0xE3, 0x55, 0xE3, 0x54, 0xE3, 0x57, 0xBA, 0xE5, /* 0x8C-0x8F */ + 0xE3, 0x52, 0xE3, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xBA, 0xE4, 0xBA, 0xDF, 0xE3, 0x53, 0xBA, 0xE2, /* 0x94-0x97 */ + 0xE3, 0x59, 0xE3, 0x5B, 0x00, 0x00, 0xE3, 0x56, /* 0x98-0x9B */ + 0xE3, 0x4F, 0xBA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xBD, 0x69, 0xBA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 
0xE6, 0xD9, 0xBD, 0x62, 0x00, 0x00, 0xE6, 0xDB, /* 0xAC-0xAF */ + 0x00, 0x00, 0xBD, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBD, 0x65, 0xE6, 0xDE, 0x00, 0x00, 0xE6, 0xD6, /* 0xB4-0xB7 */ + 0xBA, 0xE6, 0xE6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xB8, 0x60, 0xBD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBD, 0x64, 0x00, 0x00, 0xBD, 0x66, 0xBD, 0x67, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xBF, 0x76, 0xE6, 0xDD, 0xE6, 0xD7, /* 0xC8-0xCB */ + 0xBD, 0x6A, 0x00, 0x00, 0xE6, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0xC0, 0xEA, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEA, 0xC5, 0xBF, 0x74, 0xEA, 0xBD, 0xBF, 0x78, /* 0xD8-0xDB */ + 0xEA, 0xC3, 0xEA, 0xBA, 0xEA, 0xB7, 0xEA, 0xC6, /* 0xDC-0xDF */ + 0xC1, 0x51, 0xBF, 0x79, 0xEA, 0xC2, 0xEA, 0xB8, /* 0xE0-0xE3 */ + 0xBF, 0x77, 0xEA, 0xBC, 0xBF, 0x7B, 0xEA, 0xB9, /* 0xE4-0xE7 */ + 0xEA, 0xBE, 0xBF, 0x7A, 0xEA, 0xC1, 0xEA, 0xC4, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xED, 0xCB, 0xED, 0xCC, 0xED, 0xBC, 0xED, 0xC3, /* 0xF0-0xF3 */ + 0xED, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x4F, /* 0xF4-0xF7 */ + 0xED, 0xC8, 0xEA, 0xBF, 0x00, 0x00, 0xED, 0xBF, /* 0xF8-0xFB */ + 0x00, 0x00, 0xED, 0xC9, 0xC1, 0x4E, 0xED, 0xBE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7C[512] = { + 0xED, 0xBD, 0xED, 0xC7, 0xED, 0xC4, 0xED, 0xC6, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xBA, 0xED, 0xCA, 0xC1, 0x4C, /* 0x04-0x07 */ + 0x00, 0x00, 0xED, 0xC5, 0xED, 0xCE, 0xED, 0xC2, /* 0x08-0x0B */ + 0xC1, 0x50, 0xC1, 0x4D, 0xED, 0xC0, 0xED, 0xBB, /* 0x0C-0x0F */ + 0xED, 0xCD, 0xBF, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF0, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xF0, 0x61, 0xF0, 0x67, 0xC2, 0xB0, 0xF0, 0x65, /* 0x1C-0x1F */ + 0xF0, 0x64, 0xC2, 0xB2, 0xF0, 0x6A, 0xC2, 0xB1, /* 0x20-0x23 */ + 0x00, 0x00, 0xF0, 0x6B, 0xF0, 0x68, 0xC2, 0xAE, /* 0x24-0x27 */ + 0xF0, 0x69, 0xF0, 0x62, 0xC2, 0xAF, 0xC2, 0xAD, /* 0x28-0x2B */ + 0xF2, 0xAB, 0xF0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF0, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xB2, /* 0x34-0x37 */ + 0xC3, 0xB0, 0xF2, 0xAA, 0x00, 0x00, 0xF2, 0xAC, /* 0x38-0x3B */ + 0xF2, 0xA9, 0xC3, 0xB1, 0xC3, 0xAE, 0xC3, 0xAF, /* 0x3C-0x3F */ + 0xC3, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x78, /* 0x40-0x43 */ + 0x00, 0x00, 0xF4, 0xAA, 0x00, 0x00, 0xF4, 0xA9, /* 0x44-0x47 */ + 0xF4, 0xA7, 0xF4, 0xA6, 0xF4, 0xA8, 0x00, 0x00, /* 0x48-0x4B */ + 0xC4, 0x77, 0xC4, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xC4, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE5, /* 0x50-0x53 */ + 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFA, /* 0x54-0x57 */ + 0x00, 0x00, 0xF6, 0xFC, 0xF6, 0xFE, 0xF6, 0xFD, /* 0x58-0x5B */ + 0xF6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xA3, /* 0x5C-0x5F */ + 0xC5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD3, /* 0x60-0x63 */ + 0xC5, 0xD2, 0xC5, 0xD4, 0xF7, 0xED, 0xF7, 0xEC, /* 0x64-0x67 */ + 0x00, 0x00, 0xF8, 0xFB, 0xF8, 0xB8, 0xF8, 0xFC, /* 0x68-0x6B */ + 0xC6, 0x58, 0x00, 0x00, 0xC6, 0x59, 0xF9, 0x6D, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x7E, 0xA6, 0xCC, /* 0x70-0x73 */ + 0x00, 0x00, 0xCD, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x44, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xAC, 0xF3, 0x00, 0x00, 0xD0, 0x47, /* 0x7C-0x7F */ + + 
0xD0, 0x48, 0xD0, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xD3, 0x49, 0xD3, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD3, 0x4D, 0xAF, 0xBB, 0xD3, 0x4B, 0x00, 0x00, /* 0x88-0x8B */ + 0xD3, 0x4C, 0xD3, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD3, 0x4A, 0xB2, 0xC9, 0x00, 0x00, /* 0x90-0x93 */ + 0xD6, 0xDE, 0xB2, 0xCB, 0xD6, 0xE0, 0xB2, 0xCA, /* 0x94-0x97 */ + 0xD6, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE8, 0xB5, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xE7, 0xD6, 0xE1, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xB5, 0xB0, 0x00, 0x00, 0xF9, 0xDB, /* 0xA4-0xA7 */ + 0xDA, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x56, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB8, 0x64, 0xDF, 0x54, 0xB8, 0x65, /* 0xB0-0xB3 */ + 0xDF, 0x55, 0xB8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xBA, 0xE9, 0xE3, 0x61, 0xE3, 0x5E, /* 0xB8-0xBB */ + 0xE3, 0x60, 0xBA, 0xEA, 0xBA, 0xEB, 0xE3, 0x5F, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE6, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE6, 0xE0, 0x00, 0x00, 0xBD, 0x6B, 0xE6, 0xE2, /* 0xC8-0xCB */ + 0xE6, 0xE1, 0x00, 0x00, 0xA2, 0x61, 0x00, 0x00, /* 0xCC-0xCF */ + 0xEA, 0xCA, 0xEA, 0xCB, 0xEA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0xC8, 0xBF, 0x7C, 0xBF, 0x7D, 0xEA, 0xC9, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xC1, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC1, 0x53, 0xC1, 0x58, 0xC1, 0x54, 0xC1, 0x56, /* 0xDC-0xDF */ + 0xC1, 0x52, 0x00, 0x00, 0xC1, 0x55, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0xB3, /* 0xE4-0xE7 */ + 0xED, 0xCF, 0x00, 0x00, 0xF2, 0xAE, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF2, 0xAD, 0x00, 0x00, 0xF4, 0xAB, 0xC4, 0x7A, /* 0xEC-0xEF */ + 0xC4, 0x7B, 0xF7, 0x41, 0xF5, 0xE6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF7, 0x40, 0x00, 0x00, 0xF8, 0xFD, 0xF9, 0xA4, /* 0xF4-0xF7 */ + 0xA6, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x74, /* 0xF8-0xFB */ + 0x00, 0x00, 0xCD, 0xA9, 0xAA, 0xC8, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xAC, 0xF6, 0xD0, 0x4C, 0xAC, 0xF4, 0xD0, 0x4A, /* 0x00-0x03 */ + 0xAC, 0xF9, 0xAC, 0xF5, 0xAC, 0xFA, 0xAC, 0xF8, /* 0x04-0x07 */ + 0xD0, 0x4B, 0xAC, 0xF7, 0xAF, 0xBF, 0xAF, 0xBE, /* 0x08-0x0B */ + 0xD3, 0x5A, 0xAF, 0xC7, 0xD3, 0x53, 0xD3, 0x59, /* 0x0C-0x0F */ + 0xAF, 0xC3, 0xD3, 0x52, 0xD3, 0x58, 0xD3, 0x56, /* 0x10-0x13 */ + 0xAF, 0xC2, 0xAF, 0xC4, 0xD3, 0x55, 0xAF, 0xBD, /* 0x14-0x17 */ + 0xD3, 0x54, 0xAF, 0xC8, 0xAF, 0xC5, 0xAF, 0xC9, /* 0x18-0x1B */ + 0xAF, 0xC6, 0xD3, 0x51, 0xD3, 0x50, 0xD3, 0x57, /* 0x1C-0x1F */ + 0xAF, 0xC0, 0xAF, 0xBC, 0xAF, 0xC1, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD6, 0xF0, 0xD6, 0xE9, 0x00, 0x00, 0xB5, 0xB5, /* 0x28-0x2B */ + 0xD6, 0xE8, 0x00, 0x00, 0xB2, 0xCF, 0xB2, 0xD6, /* 0x2C-0x2F */ + 0xB2, 0xD3, 0xB2, 0xD9, 0xB2, 0xD8, 0xB2, 0xD4, /* 0x30-0x33 */ + 0x00, 0x00, 0xD6, 0xE2, 0xD6, 0xE5, 0x00, 0x00, /* 0x34-0x37 */ + 0xD6, 0xE4, 0xB2, 0xD0, 0xD6, 0xE6, 0xD6, 0xEF, /* 0x38-0x3B */ + 0xB2, 0xD1, 0xD6, 0xE3, 0xD6, 0xEC, 0xD6, 0xED, /* 0x3C-0x3F */ + 0xB2, 0xD2, 0xD6, 0xEA, 0xB2, 0xD7, 0xB2, 0xCD, /* 0x40-0x43 */ + 0xB2, 0xD5, 0xD6, 0xE7, 0xB2, 0xCC, 0xD6, 0xEB, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEE, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xFB, 0xDA, 0xF2, /* 0x4C-0x4F */ + 0xB5, 0xB2, 0xDA, 0xF9, 0xDA, 0xF6, 0xDA, 0xEE, /* 0x50-0x53 */ + 0xDA, 
0xF7, 0xB5, 0xB4, 0xDA, 0xEF, 0x00, 0x00, /* 0x54-0x57 */ + 0xDA, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xB8, 0x6C, /* 0x58-0x5B */ + 0xDA, 0xF4, 0x00, 0x00, 0xB5, 0xB1, 0xDA, 0xFA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB5, 0xB8, 0xB5, 0xBA, 0xDA, 0xED, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0xB9, 0xDA, 0xF0, /* 0x64-0x67 */ + 0xB5, 0xB3, 0xDA, 0xF8, 0xDA, 0xF1, 0xDA, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0xDA, 0xF3, 0xB5, 0xB6, 0xDA, 0xEC, /* 0x6C-0x6F */ + 0xB5, 0xBB, 0xB2, 0xCE, 0xB5, 0xB7, 0xB5, 0xBC, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0x68, 0xDF, 0x5D, 0xDF, 0x5F, /* 0x78-0x7B */ + 0xDF, 0x61, 0xDF, 0x65, 0x00, 0x00, 0xDF, 0x5B, /* 0x7C-0x7F */ + + 0xDF, 0x59, 0xB8, 0x6A, 0x00, 0x00, 0xDF, 0x60, /* 0x80-0x83 */ + 0xDF, 0x64, 0xDF, 0x5C, 0xDF, 0x58, 0x00, 0x00, /* 0x84-0x87 */ + 0xDF, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xDF, 0x62, 0xDF, 0x5A, 0xDF, 0x5E, 0xB8, 0x6B, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB8, 0x69, 0xDF, 0x66, 0xB8, 0x67, /* 0x90-0x93 */ + 0xDF, 0x63, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xBA, 0xEE, 0xE3, 0x6A, 0xBD, 0x78, 0xE3, 0x74, /* 0x9C-0x9F */ + 0xBA, 0xF1, 0xE3, 0x78, 0xBA, 0xF7, 0xE3, 0x65, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x62, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE3, 0x77, 0xE3, 0x66, 0x00, 0x00, /* 0xA8-0xAB */ + 0xBA, 0xFE, 0xBA, 0xFB, 0xE3, 0x76, 0xE3, 0x70, /* 0xAC-0xAF */ + 0xBA, 0xED, 0xBA, 0xF5, 0xBA, 0xF4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBA, 0xF3, 0xBA, 0xF9, 0x00, 0x00, 0xE3, 0x63, /* 0xB4-0xB7 */ + 0xBA, 0xFA, 0xE3, 0x71, 0xBA, 0xF6, 0xBA, 0xEC, /* 0xB8-0xBB */ + 0xE3, 0x73, 0xBA, 0xEF, 0xBA, 0xF0, 0xBA, 0xF8, /* 0xBC-0xBF */ + 0xE3, 0x68, 0xE3, 0x67, 0xE3, 0x64, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE3, 0x6C, 0xE3, 0x69, 0xE3, 0x6D, 0xBA, 0xFD, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE3, 0x79, 0xBA, 0xF2, 0xE3, 0x6E, /* 0xC8-0xCB */ + 0xE3, 0x6F, 0x00, 0x00, 0xE3, 0x6B, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0xFC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0xD4-0xD7 */ + 0xBD, 0x70, 0xBD, 0x79, 0xBD, 0x75, 0xE6, 0xE4, /* 0xD8-0xDB */ + 0x00, 0x00, 0xBD, 0x72, 0xBD, 0x76, 0xE6, 0xF0, /* 0xDC-0xDF */ + 0xBD, 0x6C, 0xE6, 0xE8, 0x00, 0x00, 0xBD, 0x74, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, 0xE6, 0xE6, /* 0xE4-0xE7 */ + 0xBD, 0x73, 0xBD, 0x77, 0xE6, 0xE5, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBD, 0x71, 0x00, 0x00, 0xE6, 0xEF, 0xBD, 0x6E, /* 0xEC-0xEF */ + 0xE6, 0xEE, 0xE6, 0xED, 0xBD, 0x7A, 0xE5, 0x72, /* 0xF0-0xF3 */ + 0xBD, 0x6D, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xE3, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xBD, 0x7B, 0xE6, 0xEA, 0xBD, 0x6F, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xBF, 0xA2, 0xBF, 0xA7, 0xBF, 0x7E, 0xEA, 0xD8, /* 0x08-0x0B */ + 0xEA, 0xCF, 0xEA, 0xDB, 0xEA, 0xD3, 0xEA, 0xD9, /* 0x0C-0x0F */ + 0xBF, 0xA8, 0xBF, 0xA1, 0xEA, 0xCC, 0xEA, 0xD2, /* 0x10-0x13 */ + 0xEA, 0xDC, 0xEA, 0xD5, 0xEA, 0xDA, 0xEA, 0xCE, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, 0xBF, 0xA3, /* 0x18-0x1B */ + 0xEA, 0xD4, 0xBF, 0xA6, 0xBF, 0xA5, 0xEA, 0xD0, /* 0x1C-0x1F */ + 0xEA, 0xD1, 0xEA, 0xCD, 0xEA, 0xD7, 0xBF, 0xA4, /* 0x20-0x23 */ + 0xEA, 0xDE, 0xEA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xED, 0xDA, 0xED, 0xD6, 0xC1, 0x5F, /* 0x28-0x2B */ + 0x00, 
0x00, 0xED, 0xD0, 0xC1, 0x59, 0xC1, 0x69, /* 0x2C-0x2F */ + 0xED, 0xDC, 0xC1, 0x61, 0xC1, 0x5D, 0xED, 0xD3, /* 0x30-0x33 */ + 0xC1, 0x64, 0xC1, 0x67, 0xED, 0xDE, 0xC1, 0x5C, /* 0x34-0x37 */ + 0xED, 0xD5, 0xC1, 0x65, 0xED, 0xE0, 0xED, 0xDD, /* 0x38-0x3B */ + 0xED, 0xD1, 0xC1, 0x60, 0xC1, 0x5A, 0xC1, 0x68, /* 0x3C-0x3F */ + 0xED, 0xD8, 0xC1, 0x63, 0xED, 0xD2, 0xC1, 0x5E, /* 0x40-0x43 */ + 0xED, 0xDF, 0xC1, 0x62, 0xC1, 0x5B, 0xED, 0xD9, /* 0x44-0x47 */ + 0xC1, 0x66, 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xED, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF0, 0x6E, 0xF0, 0x74, 0xC2, 0xB9, 0xF0, 0x77, /* 0x50-0x53 */ + 0xC2, 0xB4, 0xC2, 0xB5, 0xF0, 0x6F, 0xF0, 0x76, /* 0x54-0x57 */ + 0xF0, 0x71, 0xC2, 0xBA, 0xC2, 0xB7, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0x6D, 0x00, 0x00, 0xC2, 0xB6, 0xF0, 0x73, /* 0x5C-0x5F */ + 0xF0, 0x75, 0xC2, 0xB8, 0xF0, 0x72, 0xF0, 0x70, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF2, 0xB8, 0xC3, 0xB7, 0xC3, 0xB8, 0xC3, 0xB4, /* 0x68-0x6B */ + 0x00, 0x00, 0xC3, 0xB5, 0x00, 0x00, 0xF2, 0xB4, /* 0x6C-0x6F */ + 0xF2, 0xB2, 0x00, 0x00, 0xF2, 0xB6, 0xC3, 0xBA, /* 0x70-0x73 */ + 0xF2, 0xB7, 0xF2, 0xB0, 0xF2, 0xAF, 0xF2, 0xB3, /* 0x74-0x77 */ + 0xF2, 0xB1, 0xC3, 0xB6, 0xF2, 0xB5, 0xF4, 0xAC, /* 0x78-0x7B */ + 0xC4, 0x7E, 0xC4, 0x7D, 0xF4, 0xAD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xF4, 0xAF, 0xF4, 0xAE, 0xC4, 0xA1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xEB, 0xF5, 0xE8, /* 0x84-0x87 */ + 0xF5, 0xE9, 0x00, 0x00, 0xF5, 0xE7, 0xF5, 0xEA, /* 0x88-0x8B */ + 0xC4, 0xF2, 0xF5, 0xEC, 0x00, 0x00, 0xC4, 0xF1, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF7, 0x42, 0x00, 0x00, 0xC5, 0xD5, /* 0x90-0x93 */ + 0xC5, 0xD7, 0xF7, 0xEE, 0xC5, 0xD6, 0xF8, 0xB9, /* 0x94-0x97 */ + 0xF9, 0x40, 0xF9, 0x42, 0xF8, 0xFE, 0xF9, 0x41, /* 0x98-0x9B */ + 0xC6, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xCE, 0x00, 0x00, /* 0x34-0x37 */ + 0xAC, 0xFB, 0xD2, 0x6F, 0xAF, 0xCA, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB2, 0xDA, 0xDA, 0xFC, 0xDA, 0xFD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0x40-0x43 */ + 0xC1, 0x6A, 0xED, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xC2, 0xBB, 0x00, 0x00, 0xF2, 0xBA, 0xF2, 0xB9, /* 0x48-0x4B */ + 0xC4, 0xA2, 0xF5, 0xED, 0x00, 0x00, 0xF7, 0x43, /* 0x4C-0x4F */ + 0xC5, 0xF8, 0xCA, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xAA, 0xC9, 0xA8, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x60, /* 0x58-0x5B */ + 0xD3, 0x5B, 0xD3, 0x5F, 0xD3, 0x5D, 0xAF, 0xCB, /* 0x5C-0x5F */ + 0xD3, 
0x5E, 0xD3, 0x5C, 0x00, 0x00, 0xD6, 0xF1, /* 0x60-0x63 */ + 0x00, 0x00, 0xDA, 0xFE, 0xDB, 0x40, 0xDF, 0x69, /* 0x64-0x67 */ + 0xDF, 0x6A, 0xB8, 0x6E, 0xB8, 0x6F, 0xDF, 0x68, /* 0x68-0x6B */ + 0xDF, 0x6B, 0xDF, 0x67, 0xB8, 0x6D, 0x00, 0x00, /* 0x6C-0x6F */ + 0xBB, 0x40, 0x00, 0x00, 0xB8, 0x70, 0xE3, 0x7A, /* 0x70-0x73 */ + 0x00, 0x00, 0xBD, 0x7C, 0xE6, 0xF1, 0xBD, 0x7D, /* 0x74-0x77 */ + 0x00, 0x00, 0xBF, 0xA9, 0xEA, 0xE2, 0xEA, 0xE0, /* 0x78-0x7B */ + 0xEA, 0xE1, 0xED, 0xE4, 0xED, 0xE3, 0xED, 0xE2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBB, /* 0x80-0x83 */ + 0x00, 0x00, 0xC3, 0xB9, 0xF2, 0xBC, 0xF7, 0x44, /* 0x84-0x87 */ + 0xC5, 0xF9, 0xF8, 0xBA, 0xA6, 0xCF, 0xAA, 0xCB, /* 0x88-0x8B */ + 0xAA, 0xCA, 0xD0, 0x4F, 0xAC, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD0, 0x4E, 0xD3, 0x62, 0x00, 0x00, /* 0x90-0x93 */ + 0xAF, 0xCC, 0xD6, 0xF2, 0xD3, 0x61, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xDC, 0xD6, 0xF5, /* 0x98-0x9B */ + 0xD6, 0xF3, 0xD6, 0xF4, 0xB2, 0xDB, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDB, 0x42, 0xDB, 0x43, 0xDB, 0x41, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xB8, 0x73, 0xDF, 0x6D, 0xDF, 0x6C, 0xDF, 0x6E, /* 0xA4-0xA7 */ + 0xB8, 0x72, 0xB8, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0xF2, 0xE6, 0xF4, 0x00, 0x00, 0xBD, 0x7E, /* 0xAC-0xAF */ + 0xE6, 0xF3, 0xEA, 0xE3, 0xBF, 0xAA, 0xF0, 0x79, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF0, 0x78, 0xC3, 0xBB, 0xF2, 0xBD, /* 0xB4-0xB7 */ + 0xC3, 0xBD, 0xC3, 0xBC, 0xF4, 0xB0, 0xF5, 0xEE, /* 0xB8-0xBB */ + 0xC4, 0xF3, 0xA6, 0xD0, 0xD0, 0x50, 0xAC, 0xFD, /* 0xBC-0xBF */ + 0xD3, 0x65, 0xAF, 0xCE, 0xD3, 0x64, 0xD3, 0x63, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xAF, 0xCD, 0x00, 0x00, 0xD6, 0xFB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD6, 0xFD, 0xD6, 0xF6, 0xD6, 0xF7, /* 0xC8-0xCB */ + 0xB2, 0xDD, 0xD6, 0xF8, 0xB2, 0xDE, 0xD6, 0xFC, /* 0xCC-0xCF */ + 0xD6, 0xF9, 0xD6, 0xFA, 0xB2, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB5, 0xBE, 0xB5, 0xBF, 0x00, 0x00, 0xDB, 0x44, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x6F, /* 0xD8-0xDB */ + 0xDF, 0x70, 0x00, 0x00, 0xE3, 0x7E, 0xBB, 0x43, /* 0xDC-0xDF */ + 0xBB, 0x41, 0xBB, 0x42, 0xE3, 0x7B, 0xE3, 0x7C, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE3, 0x7D, 0xE6, 0xF9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE6, 0xFA, 0xBD, 0xA1, 0xE6, 0xF7, 0xE6, 0xF6, /* 0xE8-0xEB */ + 0xE6, 0xF8, 0xE6, 0xF5, 0xBF, 0xAD, 0xEA, 0xE4, /* 0xEC-0xEF */ + 0xBF, 0xAB, 0xBF, 0xAC, 0xED, 0xE6, 0xC1, 0x6B, /* 0xF0-0xF3 */ + 0xED, 0xE5, 0xEF, 0xA8, 0x00, 0x00, 0xF0, 0x7A, /* 0xF4-0xF7 */ + 0xF0, 0x7B, 0xC2, 0xBC, 0x00, 0x00, 0xC2, 0xBD, /* 0xF8-0xFB */ + 0xC1, 0x6C, 0xF2, 0xBE, 0xF2, 0xBF, 0xF4, 0xB1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xC4, 0xA3, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0x00-0x03 */ + 0xAC, 0xFE, 0xAA, 0xCC, 0xAF, 0xCF, 0xD0, 0x51, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xC0, /* 0x08-0x0B */ + 0xA6, 0xD3, 0xAD, 0x41, 0xD0, 0x52, 0xD0, 0x53, /* 0x0C-0x0F */ + 0xAD, 0x40, 0xAD, 0x42, 0xA6, 0xD4, 0x00, 0x00, /* 0x10-0x13 */ + 0xD0, 0x54, 0xAF, 0xD1, 0xD3, 0x66, 0xAF, 0xD3, /* 0x14-0x17 */ + 0xAF, 0xD0, 0xAF, 0xD2, 0x00, 0x00, 0xD7, 0x41, /* 0x18-0x1B */ + 0xB2, 0xE0, 0x00, 0x00, 0xD7, 0x40, 0xD6, 0xFE, /* 0x1C-0x1F */ + 0x00, 0x00, 0xDF, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE3, 0xA1, 0x00, 0x00, 0xBD, 0xA2, 0x00, 0x00, /* 0x24-0x27 */ + 0xBF, 0xAE, 0xEA, 0xE6, 0xEA, 0xE5, 0x00, 0x00, /* 0x28-0x2B */ + 0xED, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF5, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xD5, /* 0x30-0x33 */ + 0xCB, 
0x73, 0xCD, 0xAA, 0xAD, 0x43, 0xD0, 0x55, /* 0x34-0x37 */ + 0x00, 0x00, 0xD3, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xAF, 0xD4, 0xD3, 0x67, 0xAF, 0xD5, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x43, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xE2, 0xD7, 0x42, /* 0x44-0x47 */ + 0xD7, 0x44, 0x00, 0x00, 0xB2, 0xE1, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x46, /* 0x4C-0x4F */ + 0xDB, 0x47, 0xDB, 0x45, 0xB5, 0xC1, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0x74, 0x00, 0x00, /* 0x54-0x57 */ + 0xB8, 0x75, 0x00, 0x00, 0xBB, 0x45, 0x00, 0x00, /* 0x58-0x5B */ + 0xE3, 0xA3, 0xE3, 0xA2, 0xBB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFC, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xEA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x70, /* 0x6C-0x6F */ + 0xC1, 0x6F, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x71, /* 0x70-0x73 */ + 0x00, 0x00, 0xF0, 0x7C, 0xC2, 0xBF, 0xC2, 0xBE, /* 0x74-0x77 */ + 0xF2, 0xC0, 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xC5, 0xA5, 0xC5, 0xA4, 0xA6, 0xD6, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, 0x00, 0x00, /* 0x80-0x83 */ + 0xB8, 0x77, 0xB5, 0xC2, 0xB8, 0x76, 0xBB, 0x46, /* 0x84-0x87 */ + 0x00, 0x00, 0xA6, 0xD7, 0xC9, 0xA9, 0xA6, 0xD8, /* 0x88-0x8B */ + 0xA6, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAB, /* 0x8C-0x8F */ + 0xCB, 0x76, 0x00, 0x00, 0xCB, 0x77, 0xA8, 0x77, /* 0x90-0x93 */ + 0x00, 0x00, 0xCB, 0x74, 0xA8, 0x76, 0x00, 0x00, /* 0x94-0x97 */ + 0xA8, 0x79, 0xCB, 0x75, 0xA8, 0x7B, 0xA8, 0x7A, /* 0x98-0x9B */ + 0xCB, 0x78, 0xA8, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xAA, 0xD1, 0xAA, 0xCF, 0xCD, 0xAD, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xAA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xAA, 0xD3, 0xAA, 0xD5, 0xAA, 0xD2, /* 0xA8-0xAB */ + 0x00, 0x00, 0xCD, 0xB0, 0xCD, 0xAC, 0xAA, 0xD6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xAA, 0xD0, 0xA8, 0x7C, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xAA, 0xD4, 0xCD, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCD, 0xAE, 0x00, 0x00, 0xAA, 0xCD, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x5B, 0xAD, 0x47, /* 0xC0-0xC3 */ + 0xAD, 0x48, 0xD0, 0x5D, 0x00, 0x00, 0xD0, 0x57, /* 0xC4-0xC7 */ + 0xD0, 0x5A, 0xD0, 0x63, 0xD0, 0x61, 0x00, 0x00, /* 0xC8-0xCB */ + 0xAD, 0x49, 0xD0, 0x67, 0xAD, 0x4C, 0xD0, 0x64, /* 0xCC-0xCF */ + 0xD0, 0x5C, 0xD0, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDB, 0x49, 0xD0, 0x62, 0xAD, 0x44, 0xD0, 0x65, /* 0xD4-0xD7 */ + 0xD0, 0x56, 0xD0, 0x5F, 0xAD, 0x46, 0xAD, 0x4B, /* 0xD8-0xDB */ + 0xD0, 0x60, 0xAD, 0x4F, 0xAD, 0x4D, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD0, 0x58, 0xAD, 0x4A, 0x00, 0x00, 0xD0, 0x5E, /* 0xE0-0xE3 */ + 0xAD, 0x4E, 0xAD, 0x45, 0xD0, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAF, 0xDA, 0x00, 0x00, 0xAF, 0xE3, /* 0xEC-0xEF */ + 0xAF, 0xD8, 0xAF, 0xD6, 0xD3, 0x6A, 0xAF, 0xDE, /* 0xF0-0xF3 */ + 0xAF, 0xDB, 0xD3, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xAF, 0xDD, 0xD3, 0x6B, 0xD3, 0x69, 0xD3, 0x6E, /* 0xF8-0xFB */ + 0xAF, 0xE2, 0xAF, 0xE0, 0xDB, 0x48, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0xD3, 0x6F, 0xD3, 0x6D, 0xAF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xAF, 0xD9, 0xAF, 0xDC, 0x00, 0x00, /* 0x04-0x07 */ + 0xAF, 
0xDF, 0x00, 0x00, 0xAF, 0xE1, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xD7, 0x4E, 0xB2, 0xE4, 0x00, 0x00, /* 0x14-0x17 */ + 0xD7, 0x45, 0xD7, 0x47, 0x00, 0x00, 0xD7, 0x48, /* 0x18-0x1B */ + 0x00, 0x00, 0xD7, 0x50, 0xD7, 0x4C, 0xD7, 0x4A, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD7, 0x4D, 0xD7, 0x51, 0xB2, 0xE5, /* 0x20-0x23 */ + 0xB2, 0xE9, 0xD7, 0x46, 0x00, 0x00, 0xD7, 0x4F, /* 0x24-0x27 */ + 0x00, 0x00, 0xB2, 0xE7, 0x00, 0x00, 0xB2, 0xE6, /* 0x28-0x2B */ + 0xD7, 0x4B, 0xD7, 0x49, 0x00, 0x00, 0xB2, 0xE3, /* 0x2C-0x2F */ + 0xB2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xB5, 0xC8, 0xDB, 0x51, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xDB, 0x4F, 0xB5, 0xCA, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4A, /* 0x40-0x43 */ + 0xDF, 0xA1, 0x00, 0x00, 0xB5, 0xC9, 0xDB, 0x4E, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4B, 0xB5, 0xC5, /* 0x48-0x4B */ + 0xB5, 0xCB, 0xDB, 0x50, 0xB5, 0xC7, 0xDB, 0x4D, /* 0x4C-0x4F */ + 0xBB, 0x47, 0xB5, 0xC6, 0xDB, 0x4C, 0xB5, 0xCC, /* 0x50-0x53 */ + 0xB5, 0xC4, 0xB5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x77, /* 0x58-0x5B */ + 0xDF, 0x75, 0x00, 0x00, 0xDF, 0x7B, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDF, 0x73, 0xDF, 0xA2, 0xDF, 0x78, 0x00, 0x00, /* 0x60-0x63 */ + 0xDF, 0x72, 0xB8, 0x7B, 0xB8, 0xA3, 0xDF, 0x7D, /* 0x64-0x67 */ + 0x00, 0x00, 0xDF, 0x76, 0x00, 0x00, 0xB8, 0x7E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0x7C, 0xDF, 0x7E, /* 0x6C-0x6F */ + 0xB8, 0x79, 0xB8, 0x78, 0xDF, 0x79, 0xB8, 0x7D, /* 0x70-0x73 */ + 0xB5, 0xCD, 0x00, 0x00, 0xDF, 0x7C, 0xDF, 0x74, /* 0x74-0x77 */ + 0xB8, 0x7A, 0xB8, 0xA1, 0xB8, 0xA2, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x4C, /* 0x7C-0x7F */ + + 0xBB, 0x48, 0x00, 0x00, 0xBB, 0x4D, 0xE3, 0xA6, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA5, 0xE3, 0xA7, /* 0x84-0x87 */ + 0xBB, 0x4A, 0xE3, 0xA4, 0xBB, 0x4B, 0xE3, 0xAA, /* 0x88-0x8B */ + 0xE3, 0xA9, 0xE3, 0xA8, 0x00, 0x00, 0xBB, 0x49, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE7, 0x41, 0x00, 0x00, 0xE7, 0x44, /* 0x94-0x97 */ + 0xBD, 0xA8, 0xE7, 0x43, 0xBD, 0xA7, 0xBD, 0xA3, /* 0x98-0x9B */ + 0xBD, 0xA4, 0xBD, 0xA5, 0xE7, 0x40, 0xE6, 0xFE, /* 0x9C-0x9F */ + 0xBD, 0xA6, 0x00, 0x00, 0xE7, 0x42, 0xE6, 0xFD, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE9, 0xEA, 0xF3, /* 0xA4-0xA7 */ + 0xBF, 0xB1, 0xBF, 0xB0, 0x00, 0x00, 0xEA, 0xED, /* 0xA8-0xAB */ + 0xEA, 0xEF, 0x00, 0x00, 0xEA, 0xEA, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEA, 0xEE, 0xEA, 0xE8, 0xEA, 0xF1, 0xBF, 0xAF, /* 0xB0-0xB3 */ + 0xEA, 0xF0, 0xEA, 0xEC, 0x00, 0x00, 0xEA, 0xF2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEA, 0xEB, 0xC1, 0x74, 0xED, 0xE8, /* 0xB8-0xBB */ + 0xED, 0xEE, 0xC1, 0x78, 0xC1, 0x7A, 0xC1, 0x77, /* 0xBC-0xBF */ + 0xC1, 0x76, 0x00, 0x00, 0xC1, 0x75, 0xC1, 0x73, /* 0xC0-0xC3 */ + 0xED, 0xE9, 0xED, 0xEC, 0xC1, 0x72, 0xED, 0xED, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC1, 0x79, 0xED, 0xEB, 0x00, 0x00, /* 0xC8-0xCB */ + 0xED, 0xEA, 0xC2, 0xC0, 0x00, 0x00, 0xC2, 0xC1, /* 0xCC-0xCF */ + 0xF0, 0xA1, 0xF0, 0x7D, 0xF0, 0x7E, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF2, 0xC2, 0x00, 0x00, 0xF2, 0xC1, /* 0xD4-0xD7 */ + 0xC3, 0xBE, 0xF4, 0xB4, 0xC4, 0xA4, 0xF4, 0xB3, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF5, 0xF0, 0xF7, 0x45, 0xC5, 0xA6, /* 0xDC-0xDF 
*/ + 0xF9, 0x43, 0xF9, 0x44, 0xC5, 0xD8, 0xA6, 0xDA, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0xD7, 0xDB, 0x52, 0xBB, 0x4E, /* 0xE4-0xE7 */ + 0xC1, 0x7B, 0xED, 0xEF, 0xA6, 0xDB, 0x00, 0x00, /* 0xE8-0xEB */ + 0xAF, 0xE5, 0xAF, 0xE4, 0xDB, 0x53, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xF4, 0xA6, 0xDC, /* 0xF0-0xF3 */ + 0xAD, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x54, /* 0xF4-0xF7 */ + 0xDB, 0x55, 0xDB, 0x56, 0xBB, 0x4F, 0xBF, 0xB2, /* 0xF8-0xFB */ + 0xA6, 0xDD, 0x00, 0x00, 0xAA, 0xD8, 0xD0, 0x68, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0xAF, 0xE6, 0xD3, 0x70, 0xB2, 0xEA, 0x00, 0x00, /* 0x00-0x03 */ + 0xDB, 0x57, 0xB8, 0xA4, 0x00, 0x00, 0xBB, 0x50, /* 0x04-0x07 */ + 0xBF, 0xB3, 0xC1, 0x7C, 0xC2, 0xC2, 0xF4, 0xB5, /* 0x08-0x0B */ + 0xA6, 0xDE, 0xAA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xAF, 0xE7, 0xD7, 0x52, 0xB5, 0xCE, 0x00, 0x00, /* 0x10-0x13 */ + 0xBB, 0x51, 0xE3, 0xAB, 0xE7, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xDF, /* 0x18-0x1B */ + 0xB5, 0xCF, 0xDF, 0xA3, 0xBB, 0x52, 0xA6, 0xE0, /* 0x1C-0x1F */ + 0xCD, 0xB1, 0xD0, 0x69, 0xAD, 0x51, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD3, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xAF, 0xEA, 0x00, 0x00, 0xAF, 0xE8, 0xAF, 0xE9, /* 0x28-0x2B */ + 0xAF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x71, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0x57, 0xD7, 0x54, /* 0x30-0x33 */ + 0xD7, 0x56, 0xB2, 0xEB, 0xB2, 0xED, 0xB2, 0xEC, /* 0x34-0x37 */ + 0xD7, 0x53, 0xB2, 0xEE, 0xD7, 0x55, 0x00, 0x00, /* 0x38-0x3B */ + 0xDB, 0x58, 0xDB, 0x59, 0x00, 0x00, 0xDB, 0x5A, /* 0x3C-0x3F */ + 0xDF, 0xA6, 0x00, 0x00, 0xDF, 0xA7, 0x00, 0x00, /* 0x40-0x43 */ + 0xDF, 0xA5, 0xDF, 0xA8, 0x00, 0x00, 0xB8, 0xA5, /* 0x44-0x47 */ + 0x00, 0x00, 0xDF, 0xA4, 0x00, 0x00, 0xBB, 0x53, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4A, 0xE7, 0x46, /* 0x4C-0x4F */ + 0xE7, 0x49, 0xE7, 0x4B, 0xE7, 0x48, 0xE7, 0x47, /* 0x50-0x53 */ + 0x00, 0x00, 0xEA, 0xF5, 0xEA, 0xF6, 0xEA, 0xF7, /* 0x54-0x57 */ + 0xBF, 0xB4, 0xBF, 0xB5, 0xED, 0xF1, 0xED, 0xF0, /* 0x58-0x5B */ + 0xED, 0xF2, 0x00, 0x00, 0xF0, 0xA3, 0xF0, 0xA2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF2, 0xC4, 0x00, 0x00, 0xF2, 0xC5, /* 0x60-0x63 */ + 0xF2, 0xC3, 0x00, 0x00, 0xC4, 0xA5, 0x00, 0x00, /* 0x64-0x67 */ + 0xF4, 0xB6, 0xF4, 0xB7, 0x00, 0x00, 0xF7, 0x46, /* 0x68-0x6B */ + 0xF7, 0xEF, 0xF8, 0xBB, 0xA6, 0xE1, 0xA8, 0x7D, /* 0x6C-0x6F */ + 0x00, 0x00, 0xC1, 0x7D, 0xA6, 0xE2, 0x00, 0x00, /* 0x70-0x73 */ + 0xD7, 0x58, 0xDB, 0x5B, 0x00, 0x00, 0xC6, 0x41, /* 0x74-0x77 */ + 0xCA, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCA, 0x4B, 0xCA, 0x4D, 0xA6, 0xE3, 0xCA, 0x4E, /* 0x7C-0x7F */ + + 0xCA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA2, /* 0x80-0x83 */ + 0xCB, 0xA3, 0xCB, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA1, 0xA8, 0xA1, /* 0x88-0x8B */ + 0x00, 0x00, 0xA8, 0xA2, 0xCB, 0x7C, 0xCB, 0x7A, /* 0x8C-0x8F */ + 0xCB, 0x79, 0xCB, 0x7D, 0xA8, 0x7E, 0xCB, 0x7E, /* 0x90-0x93 */ + 0xD0, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0xB6, 0xAA, 0xDC, 0xCD, 0xB5, 0xCD, 0xB7, /* 0x98-0x9B */ + 0x00, 0x00, 0xAA, 0xDB, 0xCD, 0xBC, 0xAA, 0xDF, /* 0x9C-0x9F */ + 0xCD, 0xB2, 0xCD, 0xC0, 0xCD, 0xC6, 0xAA, 0xE6, /* 0xA0-0xA3 */ + 0xCD, 0xC3, 0xAA, 0xE3, 0x00, 0x00, 0xCD, 0xB9, /* 0xA4-0xA7 */ + 0xCD, 0xBF, 0xCD, 0xC1, 0x00, 0x00, 0xCD, 0xB4, /* 0xA8-0xAB */ + 0xAA, 0xE2, 0xAA, 0xDD, 0xCD, 0xBA, 0xAA, 0xE4, /* 0xAC-0xAF */ + 0xAA, 0xE7, 0xAA, 0xE1, 0x00, 0x00, 0xAA, 0xDA, /* 0xB0-0xB3 */ + 
0xCD, 0xBE, 0xCD, 0xB8, 0xCD, 0xC5, 0xAA, 0xE9, /* 0xB4-0xB7 */ + 0xAA, 0xE5, 0xAA, 0xE0, 0xCD, 0xBD, 0xAF, 0xEC, /* 0xB8-0xBB */ + 0xCD, 0xBB, 0xAA, 0xDE, 0xAA, 0xE8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCD, 0xB3, 0x00, 0x00, 0xCD, 0xC2, 0xCD, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAD, 0x62, 0xAD, 0x5C, 0xAD, 0x64, /* 0xD0-0xD3 */ + 0xAD, 0x61, 0xD0, 0x71, 0xD0, 0x74, 0xAD, 0x5D, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xD0, 0x6B, 0x00, 0x00, 0xAD, 0x56, /* 0xD8-0xDB */ + 0xAD, 0x60, 0x00, 0x00, 0xAD, 0x63, 0xAD, 0x65, /* 0xDC-0xDF */ + 0xD0, 0xA2, 0xD0, 0x77, 0x00, 0x00, 0xAD, 0x55, /* 0xE0-0xE3 */ + 0xD0, 0xA1, 0xAD, 0x59, 0xAD, 0x57, 0xAD, 0x52, /* 0xE4-0xE7 */ + 0xD0, 0x6F, 0x00, 0x00, 0xD0, 0x7E, 0xD0, 0x73, /* 0xE8-0xEB */ + 0xD0, 0x76, 0xD0, 0xA5, 0x00, 0x00, 0xAD, 0x66, /* 0xEC-0xEF */ + 0xD0, 0x7D, 0xAD, 0x5E, 0xD0, 0x78, 0xD0, 0xA4, /* 0xF0-0xF3 */ + 0xD0, 0x75, 0xD0, 0x79, 0xD0, 0x7C, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD0, 0x6D, 0xD0, 0xA3, 0xD0, 0x7B, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x6C, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0xD0, 0x70, 0xAD, 0x5F, 0xAD, 0x5A, 0xAD, 0x53, /* 0x00-0x03 */ + 0xAD, 0x58, 0xAD, 0x54, 0xAD, 0x67, 0xD0, 0x6E, /* 0x04-0x07 */ + 0xD3, 0xA5, 0xAD, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xD0, 0x7A, 0xCE, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, 0xAF, 0xFA, /* 0x14-0x17 */ + 0x00, 0x00, 0xD3, 0x76, 0x00, 0x00, 0xD3, 0xA3, /* 0x18-0x1B */ + 0xD3, 0x7D, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD3, 0xAA, 0x00, 0x00, 0xD3, 0x7E, 0x00, 0x00, /* 0x20-0x23 */ + 0xD3, 0xA9, 0xD3, 0x78, 0xD3, 0x7C, 0xD3, 0xB5, /* 0x24-0x27 */ + 0xAF, 0xFD, 0xD3, 0xAD, 0xD3, 0xA4, 0xAF, 0xED, /* 0x28-0x2B */ + 0xD3, 0xB3, 0xD3, 0x74, 0x00, 0x00, 0xD3, 0xAC, /* 0x2C-0x2F */ + 0x00, 0x00, 0xAF, 0xFC, 0xAF, 0xF7, 0xD3, 0x73, /* 0x30-0x33 */ + 0xAF, 0xF5, 0xAF, 0xF4, 0xAF, 0xF9, 0xD3, 0xAB, /* 0x34-0x37 */ + 0xAF, 0xF1, 0xAF, 0xF8, 0xD0, 0x72, 0xDB, 0x5C, /* 0x38-0x3B */ + 0xD3, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x7A, /* 0x3C-0x3F */ + 0xAF, 0xFB, 0xD3, 0x7B, 0xD3, 0xA1, 0xAF, 0xFE, /* 0x40-0x43 */ + 0xD3, 0x75, 0xD3, 0xAF, 0x00, 0x00, 0xD3, 0xAE, /* 0x44-0x47 */ + 0xD3, 0xB6, 0xAF, 0xF3, 0xAF, 0xF0, 0xD3, 0xB4, /* 0x48-0x4B */ + 0xD3, 0xB0, 0xD3, 0xA7, 0xD3, 0xA2, 0xAF, 0xF6, /* 0x4C-0x4F */ + 0xAF, 0xF2, 0xD3, 0x77, 0xAF, 0xEE, 0xD3, 0xB1, /* 0x50-0x53 */ + 0xAF, 0xEF, 0x00, 0x00, 0xD3, 0x79, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x5E, /* 0x70-0x73 */ + 0xD7, 0x60, 0xD7, 0x65, 0xD7, 0x79, 0xB2, 0xFC, /* 0x74-0x77 */ + 0xB2, 0xF2, 0x00, 0x00, 0xD7, 0x5D, 0xB2, 0xFD, /* 0x78-0x7B */ + 0xB2, 0xFE, 0xD7, 0x68, 0xD7, 0x6F, 0xD7, 0x75, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD7, 0x62, 0x00, 0x00, 0xD7, 0x69, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x40, 0xD7, 0x77, /* 0x84-0x87 */ + 
0xD7, 0x72, 0xB2, 0xFA, 0xB2, 0xF8, 0xD7, 0x6E, /* 0x88-0x8B */ + 0xD7, 0x6A, 0xD7, 0x5C, 0xB2, 0xEF, 0xD7, 0x61, /* 0x8C-0x8F */ + 0xD7, 0x59, 0x00, 0x00, 0xB2, 0xF7, 0xB2, 0xF9, /* 0x90-0x93 */ + 0xD7, 0x66, 0xD7, 0x63, 0xB2, 0xF4, 0xD7, 0x73, /* 0x94-0x97 */ + 0xB2, 0xF1, 0xD7, 0x64, 0xD7, 0x7A, 0xD7, 0x6C, /* 0x98-0x9B */ + 0x00, 0x00, 0xD7, 0x6B, 0xB2, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */ + 0xB2, 0xFB, 0x00, 0x00, 0xB2, 0xF3, 0xD7, 0x5A, /* 0xA0-0xA3 */ + 0xD7, 0x5F, 0xD7, 0x70, 0xD7, 0x76, 0xB3, 0x41, /* 0xA4-0xA7 */ + 0xD7, 0x5B, 0xD7, 0x67, 0xD7, 0x6D, 0xB2, 0xF6, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0x78, 0xD7, 0x71, /* 0xAC-0xAF */ + 0xD7, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xB2, 0xF5, 0x00, 0x00, 0xDB, 0x6C, /* 0xBC-0xBF */ + 0xDB, 0x60, 0xB5, 0xD7, 0xDB, 0x7D, 0xDB, 0xA7, /* 0xC0-0xC3 */ + 0xDB, 0xAA, 0xB5, 0xD5, 0xDB, 0x68, 0xDB, 0xA3, /* 0xC4-0xC7 */ + 0xDB, 0x69, 0xDB, 0x77, 0xB5, 0xE2, 0xDB, 0x73, /* 0xC8-0xCB */ + 0xB5, 0xDF, 0x00, 0x00, 0xDB, 0x74, 0xDB, 0x5D, /* 0xCC-0xCF */ + 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB5, 0xE8, 0xDB, 0xA1, 0xDB, 0x75, 0xDB, 0xAC, /* 0xD4-0xD7 */ + 0xDB, 0x70, 0xDF, 0xC8, 0x00, 0x00, 0xDB, 0xAF, /* 0xD8-0xDB */ + 0xB5, 0xE6, 0xDB, 0x6E, 0xDB, 0x7A, 0xB5, 0xE9, /* 0xDC-0xDF */ + 0xB5, 0xD4, 0xDB, 0x72, 0xDB, 0xAD, 0xDB, 0x6B, /* 0xE0-0xE3 */ + 0xDB, 0x64, 0xDB, 0x6F, 0x00, 0x00, 0xDB, 0x63, /* 0xE4-0xE7 */ + 0xDB, 0x61, 0xB5, 0xD0, 0xDB, 0xA5, 0xDB, 0x6A, /* 0xE8-0xEB */ + 0xDB, 0xA8, 0x00, 0x00, 0xDB, 0xA9, 0xB5, 0xD8, /* 0xEC-0xEF */ + 0xB5, 0xDD, 0xB5, 0xD9, 0xB5, 0xE1, 0xDB, 0x7E, /* 0xF0-0xF3 */ + 0xB5, 0xDA, 0xDB, 0x76, 0xDB, 0x66, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xB5, 0xD2, 0xDB, 0x5E, 0xDB, 0xA2, 0xDB, 0xAB, /* 0xF8-0xFB */ + 0xDB, 0x65, 0xB5, 0xE0, 0xDB, 0xB0, 0xDB, 0x71, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0xDB, 0x6D, 0x00, 0x00, 0xB5, 0xD1, /* 0x00-0x03 */ + 0xB5, 0xE5, 0x00, 0x00, 0xDB, 0x7C, 0xB5, 0xE7, /* 0x04-0x07 */ + 0x00, 0x00, 0xDB, 0x78, 0xB5, 0xDC, 0xB5, 0xD6, /* 0x08-0x0B */ + 0xB5, 0xDE, 0xB5, 0xD3, 0xB5, 0xE4, 0xDB, 0x79, /* 0x0C-0x0F */ + 0xDB, 0x67, 0xDB, 0x7B, 0xDB, 0x62, 0xDB, 0xA6, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x5F, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xDF, 0xC7, 0x00, 0x00, 0xDF, 0xDD, /* 0x28-0x2B */ + 0xB8, 0x55, 0xDF, 0xCC, 0x00, 0x00, 0xDF, 0xCA, /* 0x2C-0x2F */ + 0xDF, 0xB5, 0xB8, 0xA9, 0xDF, 0xC5, 0xDF, 0xD9, /* 0x30-0x33 */ + 0xDF, 0xC1, 0xB8, 0xB1, 0xDF, 0xD8, 0xDF, 0xBF, /* 0x34-0x37 */ + 0xB5, 0xE3, 0xDF, 0xCF, 0xDF, 0xC0, 0xDF, 0xD6, /* 0x38-0x3B */ + 0xB8, 0xB0, 0xB8, 0xA8, 0x00, 0x00, 0xDF, 0xAA, /* 0x3C-0x3F */ + 0xDF, 0xB2, 0x00, 0x00, 0xDF, 0xCB, 0xDF, 0xC3, /* 0x40-0x43 */ + 0xDF, 0xDC, 0xDF, 0xC6, 0xB8, 0xB6, 0xDF, 0xD7, /* 0x44-0x47 */ + 0x00, 0x00, 0xB8, 0xAD, 0x00, 0x00, 0xDF, 0xC9, /* 0x48-0x4B */ + 0xDF, 0xD1, 0xDF, 0xB6, 0xDF, 0xD0, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDF, 0xE1, 0xDF, 0xB1, 0xDF, 0xD2, 0x00, 0x00, /* 0x50-0x53 */ + 0xDF, 0xDF, 0x00, 0x00, 0xDF, 0xAB, 0xB5, 0xDB, /* 0x54-0x57 */ + 0x00, 0x00, 0xDF, 0xB9, 0xDF, 0xB8, 0xB8, 0xAF, /* 0x58-0x5B */ + 0x00, 
0x00, 0xDF, 0xBC, 0xDF, 0xBE, 0xDF, 0xCD, /* 0x5C-0x5F */ + 0xDF, 0xDE, 0xB8, 0xB2, 0x00, 0x00, 0xB8, 0xB3, /* 0x60-0x63 */ + 0x00, 0x00, 0xDF, 0xB0, 0xB8, 0xAB, 0xDF, 0xB4, /* 0x64-0x67 */ + 0xDF, 0xDA, 0xB8, 0xB4, 0x00, 0x00, 0xB8, 0xAC, /* 0x68-0x6B */ + 0xB8, 0xAE, 0xB8, 0xB5, 0xDF, 0xE0, 0xDF, 0xD3, /* 0x6C-0x6F */ + 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBB, /* 0x70-0x73 */ + 0xDF, 0xBA, 0xB8, 0xAA, 0xDF, 0xAC, 0xB8, 0xA7, /* 0x74-0x77 */ + 0xDF, 0xC4, 0xDF, 0xAD, 0xDF, 0xC2, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xDF, 0xB7, 0xDF, 0xDB, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xA6, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB3, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDF, 0xAF, 0xDF, 0xD5, 0xDF, 0xAE, /* 0x8C-0x8F */ + 0xBB, 0x60, 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x94-0x97 */ + 0xE3, 0xCA, 0xBB, 0x58, 0xE3, 0xBB, 0xE3, 0xC5, /* 0x98-0x9B */ + 0xBB, 0x5B, 0xE3, 0xBE, 0xBB, 0x59, 0xE3, 0xAF, /* 0x9C-0x9F */ + 0xE3, 0xCD, 0xE3, 0xAE, 0xE3, 0xC1, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBF, /* 0xA4-0xA7 */ + 0xE3, 0xC8, 0xE3, 0xC6, 0xE3, 0xBA, 0xE3, 0xB5, /* 0xA8-0xAB */ + 0xE3, 0xB3, 0x00, 0x00, 0xE3, 0xB4, 0xE3, 0xC7, /* 0xAC-0xAF */ + 0xE3, 0xD2, 0xE3, 0xBC, 0xBB, 0x5A, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE3, 0xB7, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xBB, 0x5D, 0xE3, 0xB6, 0xE3, 0xB0, 0xE3, 0xC0, /* 0xB8-0xBB */ + 0xBB, 0x61, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x55, /* 0xBC-0xBF */ + 0xBB, 0x5E, 0xE3, 0xB8, 0xE3, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBB, 0x57, 0xDF, 0xD4, 0xBB, 0x56, 0xE3, 0xC3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xBB, 0x54, 0xBB, 0x63, 0xBB, 0x5C, /* 0xC8-0xCB */ + 0xE3, 0xC4, 0xE3, 0xB9, 0xE3, 0xB1, 0xE3, 0xCC, /* 0xCC-0xCF */ + 0xE3, 0xBD, 0xBB, 0x62, 0xE3, 0xD0, 0xBB, 0x5F, /* 0xD0-0xD3 */ + 0xE3, 0xCF, 0x00, 0x00, 0xE3, 0xC9, 0xE3, 0xCE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD1, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0xE4-0xE7 */ + 0xE7, 0x74, 0xE7, 0x67, 0xE7, 0x66, 0xE7, 0x62, /* 0xE8-0xEB */ + 0xBD, 0xB4, 0x00, 0x00, 0xBD, 0xAC, 0xE7, 0x76, /* 0xEC-0xEF */ + 0xE7, 0x75, 0xDF, 0xA9, 0xE7, 0x5F, 0xE7, 0x63, /* 0xF0-0xF3 */ + 0xE7, 0x5D, 0x00, 0x00, 0xE7, 0x70, 0xE7, 0x61, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE7, 0x77, 0xE7, 0x5A, 0xE7, 0x58, /* 0xF8-0xFB */ + 0xE7, 0x64, 0xE7, 0x6E, 0xE7, 0x69, 0xBD, 0xB6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0xE7, 0x4F, 0x00, 0x00, 0xE7, 0x6D, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xB7, 0xDF, 0xBD, /* 0x04-0x07 */ + 0xE7, 0x5B, 0xE7, 0x52, 0xE7, 0x55, 0xE7, 0x7B, /* 0x08-0x0B */ + 0xE7, 0x5C, 0xE7, 0x53, 0xE7, 0x51, 0xE7, 0x4E, /* 0x0C-0x0F */ + 0x00, 0x00, 0xBD, 0xB0, 0xE7, 0x65, 0xBD, 0xAF, /* 0x10-0x13 */ + 0xBD, 0xB3, 0xE7, 0x60, 0xE7, 0x68, 0xBD, 0xA9, /* 0x14-0x17 */ + 0xE7, 0x78, 0xE7, 0x7C, 0xBD, 0xAB, 0x00, 0x00, /* 0x18-0x1B */ + 0xE7, 0x57, 0xE7, 0x6B, 0xE7, 0x6F, 0xE7, 0x54, /* 0x1C-0x1F */ + 0xE7, 0x79, 0xBD, 0xB2, 0x00, 0x00, 0xBD, 0xB1, /* 0x20-0x23 */ + 0xE7, 0x4C, 0xBD, 0xB5, 0xE7, 0x72, 0xE7, 0x56, /* 0x24-0x27 */ + 0xE7, 0x6A, 0xE7, 0x50, 0xE7, 0x5E, 0xE7, 0x59, /* 0x28-0x2B */ + 0xBD, 0xAD, 0xBD, 0xAE, 0xE7, 0x6C, 0xE7, 0x7D, /* 0x2C-0x2F */ + 0xE7, 
0x7A, 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4D, /* 0x38-0x3B */ + 0x00, 0x00, 0xBD, 0xAA, 0xEB, 0x49, 0x00, 0x00, /* 0x3C-0x3F */ + 0xEB, 0x40, 0xEB, 0x43, 0x00, 0x00, 0xBF, 0xBB, /* 0x40-0x43 */ + 0xEB, 0x45, 0xEA, 0xF9, 0xEB, 0x41, 0xEB, 0x47, /* 0x44-0x47 */ + 0xBF, 0xB8, 0xBF, 0xBC, 0xBF, 0xB6, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEA, 0xFB, 0xEB, 0x4C, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEB, 0x46, 0x00, 0x00, 0xEA, 0xFC, /* 0x50-0x53 */ + 0xEB, 0x55, 0xEB, 0x4F, 0xEA, 0xF8, 0xEE, 0x46, /* 0x54-0x57 */ + 0xEA, 0xFE, 0xBF, 0xB7, 0x00, 0x00, 0xEB, 0x4A, /* 0x58-0x5B */ + 0x00, 0x00, 0xEB, 0x54, 0xBF, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEB, 0x51, 0xEA, 0xFD, 0xEB, 0x44, 0xEB, 0x48, /* 0x60-0x63 */ + 0xEB, 0x42, 0xEB, 0x56, 0xEB, 0x53, 0xEB, 0x50, /* 0x64-0x67 */ + 0xBF, 0xB9, 0xBF, 0xBA, 0xBF, 0xBE, 0xEA, 0xFA, /* 0x68-0x6B */ + 0xEB, 0x57, 0xBF, 0xBD, 0xEB, 0x4D, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xEB, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xEB, 0x4E, 0xEE, 0x53, 0xEE, 0x40, /* 0x74-0x77 */ + 0xEE, 0x45, 0xEE, 0x52, 0xEE, 0x44, 0xED, 0xFB, /* 0x78-0x7B */ + 0xEE, 0x41, 0x00, 0x00, 0xC1, 0xA2, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xED, 0xF4, 0xEE, 0x4D, 0xEE, 0x4F, 0xED, 0xF3, /* 0x80-0x83 */ + 0xC1, 0xA1, 0xEE, 0x51, 0xEE, 0x49, 0xC1, 0xA8, /* 0x84-0x87 */ + 0xEE, 0x50, 0xEE, 0x42, 0xC1, 0xAA, 0xED, 0xF9, /* 0x88-0x8B */ + 0xEB, 0x52, 0xEE, 0x4A, 0xEE, 0x47, 0xED, 0xF5, /* 0x8C-0x8F */ + 0xEE, 0x55, 0xC1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xC1, 0xA5, 0xED, 0xF7, 0xEE, 0x48, 0x00, 0x00, /* 0x94-0x97 */ + 0xEE, 0x54, 0xEE, 0x4B, 0xED, 0xFD, 0xC1, 0xA7, /* 0x98-0x9B */ + 0xC1, 0xA3, 0xEE, 0x4C, 0xED, 0xFE, 0xEE, 0x56, /* 0x9C-0x9F */ + 0xED, 0xF8, 0xEE, 0x43, 0xEE, 0x4E, 0xED, 0xFA, /* 0xA0-0xA3 */ + 0xED, 0xFC, 0x00, 0x00, 0xC2, 0xCB, 0xED, 0xF6, /* 0xA4-0xA7 */ + 0xC1, 0xA9, 0xC2, 0xC4, 0xC1, 0x7E, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xA6, /* 0xAC-0xAF */ + 0xC2, 0xC8, 0xF0, 0xB3, 0x00, 0x00, 0xF0, 0xA9, /* 0xB0-0xB3 */ + 0xF0, 0xA4, 0xF0, 0xAA, 0xF0, 0xB4, 0xF0, 0xB8, /* 0xB4-0xB7 */ + 0xF0, 0xB7, 0xC2, 0xCA, 0xC2, 0xC9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF0, 0xAB, 0xF0, 0xB9, 0xF0, 0xAE, /* 0xBC-0xBF */ + 0xF0, 0xA6, 0x00, 0x00, 0xF0, 0xA8, 0xF0, 0xA7, /* 0xC0-0xC3 */ + 0xF0, 0xAD, 0xF0, 0xB2, 0xF0, 0xA5, 0xF0, 0xAC, /* 0xC4-0xC7 */ + 0xF0, 0xB1, 0xC2, 0xC7, 0x00, 0x00, 0xF0, 0xAF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xC2, 0xC5, 0xF0, 0xB0, 0xC2, 0xC3, /* 0xCC-0xCF */ + 0xC2, 0xC6, 0xF2, 0xD5, 0xF0, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xC3, 0xC2, 0x00, 0x00, 0xF2, 0xCD, /* 0xD4-0xD7 */ + 0xF2, 0xD1, 0xF2, 0xC9, 0xF2, 0xCC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF2, 0xD4, 0xC3, 0xC0, 0xF2, 0xD9, 0xF2, 0xD2, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF2, 0xCA, 0xF2, 0xDA, 0xF2, 0xD3, /* 0xE0-0xE3 */ + 0xC3, 0xC3, 0xC3, 0xC4, 0xF2, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF2, 0xCB, 0xC3, 0xBF, 0xC3, 0xC1, 0xF2, 0xC6, /* 0xE8-0xEB */ + 0xF2, 0xCE, 0xF2, 0xC8, 0x00, 0x00, 0xF2, 0xD8, /* 0xEC-0xEF */ + 0xF2, 0xD6, 0xF2, 0xC7, 0xF2, 0xCF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBE, 0xC3, 0xC5, /* 0xF4-0xF7 */ + 0xF2, 0xD0, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xA6, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF4, 0xC3, 0xF4, 0xBB, 0xF4, 0xB9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0xF4, 0xBD, 0xF4, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF4, 
0xBF, 0xF4, 0xC1, 0xC4, 0xAA, 0xC4, 0xAC, /* 0x04-0x07 */ + 0x00, 0x00, 0xF4, 0xC0, 0xC4, 0xAD, 0xC4, 0xAB, /* 0x08-0x0B */ + 0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xC4, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xF4, /* 0x14-0x17 */ + 0xF5, 0xF1, 0xF5, 0xF7, 0xC4, 0xF6, 0xF4, 0xBC, /* 0x18-0x1B */ + 0xF5, 0xF6, 0x00, 0x00, 0xF5, 0xFD, 0xF5, 0xF4, /* 0x1C-0x1F */ + 0xF5, 0xFB, 0xF5, 0xFA, 0xF4, 0xB8, 0xF5, 0xF5, /* 0x20-0x23 */ + 0xF0, 0xB6, 0xF5, 0xFE, 0xF5, 0xF3, 0xF5, 0xF8, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0xFC, 0xF5, 0xF2, 0x00, 0x00, /* 0x28-0x2B */ + 0xF7, 0x4A, 0xC4, 0xF5, 0xF5, 0xF9, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF7, 0xF4, 0xF7, 0x4B, 0xF7, 0x49, /* 0x30-0x33 */ + 0xF7, 0x47, 0xF7, 0x48, 0xF7, 0x4C, 0x00, 0x00, /* 0x34-0x37 */ + 0xC5, 0xD9, 0xF7, 0xF2, 0xF7, 0xF0, 0xF7, 0xF5, /* 0x38-0x3B */ + 0xF7, 0xF3, 0x00, 0x00, 0xF7, 0xF6, 0xC5, 0xDA, /* 0x3C-0x3F */ + 0xF7, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xBC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x45, 0xF9, 0x46, /* 0x44-0x47 */ + 0xF9, 0x47, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC7, /* 0x48-0x4B */ + 0xF9, 0xBD, 0xCA, 0x4F, 0xAA, 0xEA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xAD, 0x68, 0x00, 0x00, 0xD3, 0xB8, 0xD3, 0xB7, /* 0x50-0x53 */ + 0xB0, 0x40, 0xB3, 0x42, 0xD7, 0x7C, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD7, 0x7B, 0x00, 0x00, 0xB5, 0xEA, /* 0x58-0x5B */ + 0xB8, 0xB8, 0x00, 0x00, 0xB8, 0xB7, 0xB8, 0xB9, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE3, 0xD4, 0xE7, 0x7E, 0xEB, 0x58, /* 0x60-0x63 */ + 0xEB, 0x5A, 0xEB, 0x59, 0x00, 0x00, 0xC1, 0xAB, /* 0x64-0x67 */ + 0xEE, 0x57, 0xF0, 0xBA, 0xF9, 0xA5, 0xA6, 0xE4, /* 0x68-0x6B */ + 0x00, 0x00, 0xCD, 0xC9, 0xCD, 0xCA, 0xCD, 0xC8, /* 0x6C-0x6F */ + 0xCD, 0xC7, 0xAA, 0xEB, 0x00, 0x00, 0xD0, 0xA9, /* 0x70-0x73 */ + 0xD0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA6, /* 0x74-0x77 */ + 0x00, 0x00, 0xAD, 0x69, 0xAD, 0x6B, 0xAD, 0x6A, /* 0x78-0x7B */ + 0xD0, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xC4, 0xD3, 0xC1, 0xD3, 0xBF, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0x41, 0xD3, 0xC2, /* 0x88-0x8B */ + 0xB0, 0x46, 0xD3, 0xBC, 0xD3, 0xCB, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD3, 0xCD, 0xD3, 0xBD, 0x00, 0x00, 0xB0, 0x43, /* 0x90-0x93 */ + 0xD3, 0xCE, 0xD3, 0xC9, 0xD3, 0xBB, 0xD3, 0xC0, /* 0x94-0x97 */ + 0xD3, 0xCA, 0xD3, 0xC6, 0xD3, 0xC3, 0x00, 0x00, /* 0x98-0x9B */ + 0xB0, 0x48, 0xD3, 0xCC, 0xD3, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD3, 0xC7, 0xD3, 0xB9, 0xB0, 0x47, /* 0xA0-0xA3 */ + 0xB0, 0x44, 0xD3, 0xC5, 0x00, 0x00, 0xD3, 0xC8, /* 0xA4-0xA7 */ + 0xD3, 0xBA, 0xB0, 0x45, 0xB0, 0x42, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4C, /* 0xAC-0xAF */ + 0xD7, 0xA5, 0xB3, 0x4B, 0x00, 0x00, 0xD7, 0xA8, /* 0xB0-0xB3 */ + 0xD7, 0xAB, 0xB3, 0x48, 0xB3, 0x46, 0xD7, 0x7E, /* 0xB4-0xB7 */ + 0xD7, 0xA9, 0xD7, 0xA7, 0xD7, 0xA4, 0xD7, 0xAC, /* 0xB8-0xBB */ + 0xD7, 0xAD, 0xD7, 0xAF, 0xD7, 0xB0, 0xD7, 0x7D, /* 0xBC-0xBF */ + 0xB3, 0x45, 0xD7, 0xA2, 0xD7, 0xA1, 0xD7, 0xAE, /* 0xC0-0xC3 */ + 0xB3, 0x47, 0xD7, 0xA3, 0xB3, 0x49, 0xB3, 0x44, /* 0xC4-0xC7 */ + 0xD7, 0xA6, 0xB3, 0x4D, 0x00, 0x00, 0xB3, 0x4A, /* 0xC8-0xCB */ + 0xD7, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xB5, 0xF1, 0xDB, 0xBF, 0x00, 0x00, 0xDB, 0xB4, /* 0xD0-0xD3 */ + 0xB5, 0xEE, 0x00, 0x00, 0xDF, 0xE7, 0xDB, 0xBD, /* 0xD4-0xD7 */ + 0xDB, 0xB1, 0xB5, 0xEC, 0xDB, 0xB6, 0xB5, 0xEF, /* 0xD8-0xDB 
*/ + 0xDB, 0xBA, 0xDB, 0xB8, 0xB5, 0xF2, 0xB5, 0xEB, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xB2, 0xDB, 0xB5, /* 0xE0-0xE3 */ + 0xB5, 0xF0, 0x00, 0x00, 0xDB, 0xB3, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDB, 0xBE, 0xDB, 0xBC, 0xDB, 0xB7, 0xDB, 0xB9, /* 0xE8-0xEB */ + 0xDB, 0xBB, 0xB5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xDF, 0xE8, 0xDF, 0xEE, 0xDF, 0xE4, /* 0xF4-0xF7 */ + 0xDF, 0xEA, 0xB8, 0xBA, 0xDF, 0xE6, 0xB8, 0xC0, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xBF, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xB8, 0xBE, 0xDF, 0xED, 0xB8, 0xC1, 0xB8, 0xC2, /* 0x00-0x03 */ + 0xDF, 0xE3, 0xDF, 0xF0, 0xB8, 0xC3, 0xB8, 0xBD, /* 0x04-0x07 */ + 0xB8, 0xBC, 0xDF, 0xEC, 0xB8, 0xC4, 0xDF, 0xE2, /* 0x08-0x0B */ + 0xDF, 0xE5, 0xDF, 0xEF, 0xDF, 0xEB, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE3, 0xF4, 0xE3, 0xE9, 0xB8, 0xBB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xBB, 0x6A, 0xE3, 0xDD, 0xE3, 0xF2, 0xE3, 0xDE, /* 0x18-0x1B */ + 0xBB, 0x65, 0x00, 0x00, 0xE3, 0xDB, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE3, 0xE4, 0xE3, 0xDC, 0xBB, 0x67, 0xE3, 0xD6, /* 0x20-0x23 */ + 0xE3, 0xF1, 0xBB, 0x68, 0xE3, 0xEE, 0xE3, 0xEF, /* 0x24-0x27 */ + 0xE3, 0xD7, 0xBB, 0x6D, 0xE3, 0xE6, 0x00, 0x00, /* 0x28-0x2B */ + 0xE3, 0xE0, 0xE3, 0xE7, 0xE3, 0xDA, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE3, 0xF3, 0xE3, 0xEB, 0xE3, 0xE5, 0xE3, 0xD5, /* 0x30-0x33 */ + 0xBB, 0x69, 0xE3, 0xEC, 0x00, 0x00, 0xBB, 0x6C, /* 0x34-0x37 */ + 0xE3, 0xF0, 0x00, 0x00, 0xE3, 0xEA, 0xBB, 0x66, /* 0x38-0x3B */ + 0xE3, 0xE8, 0x00, 0x00, 0xE3, 0xE2, 0xBB, 0x64, /* 0x3C-0x3F */ + 0xE3, 0xD9, 0xE3, 0xE1, 0xE3, 0xED, 0xE3, 0xDF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE3, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xBD, 0xC1, 0xDF, 0xE9, 0xE7, 0xB2, 0xE7, 0xBB, /* 0x4C-0x4F */ + 0xE7, 0xB1, 0xE7, 0xAD, 0xE7, 0xAA, 0xBD, 0xC2, /* 0x50-0x53 */ + 0xE7, 0xA8, 0xBB, 0x6B, 0xE7, 0xA1, 0xBD, 0xC0, /* 0x54-0x57 */ + 0xE7, 0xA7, 0xBD, 0xBF, 0xE7, 0xAC, 0xE7, 0xA9, /* 0x58-0x5B */ + 0xE7, 0xB9, 0xE7, 0xB4, 0xE7, 0xAE, 0xE7, 0xB3, /* 0x5C-0x5F */ + 0xBD, 0xBB, 0xE7, 0xAB, 0xE7, 0xBE, 0xE7, 0xA2, /* 0x60-0x63 */ + 0xE7, 0xA3, 0xE7, 0xBA, 0xBD, 0xBC, 0xE7, 0xBF, /* 0x64-0x67 */ + 0xBD, 0xBE, 0xE7, 0xC0, 0xE7, 0xB0, 0xE3, 0xD8, /* 0x68-0x6B */ + 0xE7, 0xB6, 0xE7, 0xAF, 0xE7, 0xB8, 0xE7, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA6, /* 0x70-0x73 */ + 0xBD, 0xB9, 0xE7, 0xBD, 0xBD, 0xBA, 0xE7, 0xA4, /* 0x74-0x77 */ + 0xBD, 0xBD, 0xEB, 0x64, 0xE7, 0xB7, 0xE7, 0xBC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEB, 0x61, 0xBD, 0xB8, 0xBF, 0xC0, /* 0x80-0x83 */ + 0xEB, 0x6B, 0xEB, 0x67, 0x00, 0x00, 0xEB, 0x65, /* 0x84-0x87 */ + 0xEB, 0x60, 0xEB, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xBF, 0xC4, 0x00, 0x00, 0xEB, 0x5C, /* 0x8C-0x8F */ + 0xEB, 0x68, 0xEB, 0x69, 0xEB, 0x5F, 0xEB, 0x5E, /* 0x90-0x93 */ + 0xEB, 0x6C, 0x00, 0x00, 0xEB, 0x62, 0xEB, 0x5D, /* 0x94-0x97 */ + 0xEB, 0x63, 0x00, 0x00, 0xEB, 0x6E, 0xEB, 0x5B, /* 0x98-0x9B */ + 0xEB, 0x6D, 0xEB, 0x6A, 0xBF, 0xC2, 0xBF, 0xC1, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xC3, 0xEB, 0x66, /* 0xA0-0xA3 */ + 0xF0, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x59, 0xC1, 0xB1, /* 0xA8-0xAB */ + 0xEE, 0x5D, 0xEE, 0x5A, 0xEE, 0x61, 0xEE, 0x67, /* 0xAC-0xAF */ + 
0xEE, 0x5C, 0x00, 0x00, 0xEE, 0x70, 0xC1, 0xAE, /* 0xB0-0xB3 */ + 0xEE, 0x6A, 0xEE, 0x5F, 0xEE, 0x6B, 0xEE, 0x66, /* 0xB4-0xB7 */ + 0xEE, 0x6D, 0xEE, 0x5E, 0xC1, 0xB3, 0xC1, 0xB2, /* 0xB8-0xBB */ + 0xEE, 0x60, 0xEE, 0x6E, 0xEE, 0x58, 0xEE, 0x6C, /* 0xBC-0xBF */ + 0xC1, 0xAC, 0x00, 0x00, 0xEE, 0x64, 0xEE, 0x63, /* 0xC0-0xC3 */ + 0xEE, 0x68, 0xEE, 0x5B, 0xC1, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xC1, 0xB4, 0xEE, 0x62, 0xEE, 0x69, 0xC1, 0xB5, /* 0xC8-0xCB */ + 0xEE, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC1, 0xAD, 0xC1, 0xAF, 0xF0, 0xC7, /* 0xD0-0xD3 */ + 0xF0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xCC, /* 0xD4-0xD7 */ + 0xF0, 0xC9, 0xF0, 0xCD, 0x00, 0x00, 0xF0, 0xBE, /* 0xD8-0xDB */ + 0xF0, 0xC6, 0xF0, 0xD1, 0xEE, 0x6F, 0xF0, 0xC2, /* 0xDC-0xDF */ + 0xC2, 0xCF, 0xE7, 0xA5, 0xF0, 0xBD, 0xF0, 0xCA, /* 0xE0-0xE3 */ + 0xF0, 0xC4, 0xF0, 0xC1, 0xF0, 0xBC, 0xF0, 0xBB, /* 0xE4-0xE7 */ + 0xF0, 0xD0, 0x00, 0x00, 0xF0, 0xC0, 0xF0, 0xBF, /* 0xE8-0xEB */ + 0xC2, 0xCD, 0xF0, 0xC8, 0x00, 0x00, 0xC2, 0xCC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0xCE, 0xF0, 0xC3, /* 0xF0-0xF3 */ + 0xF0, 0xCF, 0x00, 0x00, 0xF2, 0xDE, 0xF2, 0xDF, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xC3, 0xC9, 0xF2, 0xDC, 0xC3, 0xC6, /* 0xF8-0xFB */ + 0xF2, 0xE4, 0x00, 0x00, 0xC3, 0xCA, 0xF2, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0xF2, 0xDB, 0xF0, 0xCE, 0xF2, 0xE8, 0xF2, 0xDD, /* 0x00-0x03 */ + 0x00, 0x00, 0xC3, 0xC7, 0xF2, 0xE3, 0x00, 0x00, /* 0x04-0x07 */ + 0xF2, 0xE5, 0xF2, 0xE0, 0xF2, 0xE7, 0xF2, 0xE2, /* 0x08-0x0B */ + 0xF2, 0xE1, 0xC3, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF4, 0xC5, 0xF4, 0xC6, 0x00, 0x00, 0xF4, 0xC8, /* 0x10-0x13 */ + 0xC4, 0xAE, 0xC4, 0xAF, 0xF4, 0xC9, 0xF4, 0xC7, /* 0x14-0x17 */ + 0x00, 0x00, 0xF4, 0xC4, 0x00, 0x00, 0xF6, 0x42, /* 0x18-0x1B */ + 0xF6, 0x45, 0xF6, 0x41, 0x00, 0x00, 0xC4, 0xFA, /* 0x1C-0x1F */ + 0xF6, 0x43, 0xC4, 0xF9, 0xC4, 0xF8, 0xC4, 0xF7, /* 0x20-0x23 */ + 0xF6, 0x44, 0xF7, 0x51, 0xF7, 0x4F, 0x00, 0x00, /* 0x24-0x27 */ + 0xF7, 0x4E, 0xF6, 0x40, 0xF7, 0x50, 0xF6, 0x46, /* 0x28-0x2B */ + 0xF7, 0x4D, 0x00, 0x00, 0xF7, 0xF9, 0xF7, 0xD7, /* 0x2C-0x2F */ + 0xF7, 0xF7, 0xC5, 0xDB, 0xF7, 0xF8, 0xF7, 0xFA, /* 0x30-0x33 */ + 0x00, 0x00, 0xF8, 0xBF, 0xC5, 0xFA, 0xF8, 0xBE, /* 0x34-0x37 */ + 0xF8, 0xBD, 0xC5, 0xFB, 0x00, 0x00, 0xC6, 0x5A, /* 0x38-0x3B */ + 0xF9, 0x6E, 0xF9, 0xA7, 0xF9, 0xA6, 0xF9, 0xA8, /* 0x3C-0x3F */ + 0xA6, 0xE5, 0xD0, 0xAA, 0x00, 0x00, 0xD3, 0xCF, /* 0x40-0x43 */ + 0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xDB, 0xC0, 0x00, 0x00, 0xF6, 0x47, 0xF8, 0xC0, /* 0x48-0x4B */ + 0xA6, 0xE6, 0xAD, 0x6C, 0xD0, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB1, 0xB3, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0xDB, 0xC2, 0xDB, 0xC1, 0xB5, 0xF3, /* 0x54-0x57 */ + 0x00, 0x00, 0xB8, 0xC5, 0xE7, 0xC1, 0xBD, 0xC3, /* 0x58-0x5B */ + 0x00, 0x00, 0xBD, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xBF, 0xC5, 0xC5, 0xFC, 0xA6, 0xE7, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, /* 0x64-0x67 */ + 0xAA, 0xED, 0xD0, 0xAE, 0xD0, 0xAD, 0xAD, 0x6D, /* 0x68-0x6B */ + 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, 0xD3, 0xD8, /* 0x6C-0x6F */ + 0xB0, 0x49, 0xD3, 0xD6, 0xD3, 0xD4, 0x00, 0x00, /* 0x70-0x73 */ + 0xD3, 0xDB, 0xD3, 0xD2, 0xD3, 0xD3, 0xB0, 0x4A, /* 0x74-0x77 */ + 0x00, 0x00, 0xB0, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xD3, 0xDC, 0xB0, 0x4D, 0xD3, 0xDA, 0xD3, 0xD7, /* 0x7C-0x7F */ + + 0xD3, 0xD5, 0xB0, 0x4B, 0xB0, 0x4C, 0xD3, 0xD9, /* 0x80-0x83 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xB3, 0x50, 0xD7, 0xB2, 0x00, 0x00, 0xB3, 0x55, /* 0x88-0x8B */ + 0xD7, 0xC2, 0xB3, 0x54, 0xD7, 0xC4, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD7, 0xB8, 0xB3, 0x52, 0xD7, 0xC3, /* 0x90-0x93 */ + 0x00, 0x00, 0xD7, 0xB3, 0xB3, 0x53, 0xD7, 0xBF, /* 0x94-0x97 */ + 0xD7, 0xBB, 0xD7, 0xBD, 0xD7, 0xB7, 0xD7, 0xBE, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4F, 0xD7, 0xBA, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD7, 0xB9, 0xD7, 0xB5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD7, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0xA4-0xA7 */ + 0xD7, 0xB4, 0x00, 0x00, 0xD7, 0xB6, 0xB3, 0x51, /* 0xA8-0xAB */ + 0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB5, 0xF6, 0xDB, 0xCD, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0xDB, 0xCB, /* 0xB4-0xB7 */ + 0xDB, 0xC6, 0xDB, 0xC5, 0xDB, 0xC3, 0x00, 0x00, /* 0xB8-0xBB */ + 0xDB, 0xCA, 0xDB, 0xCC, 0xDB, 0xC8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDB, 0xC7, 0xB5, 0xF4, 0xB5, 0xF5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDB, 0xCF, 0xB8, 0xCD, 0xDF, 0xF2, /* 0xC8-0xCB */ + 0xDF, 0xF8, 0xDF, 0xF3, 0xDF, 0xF4, 0xF9, 0xD8, /* 0xCC-0xCF */ + 0xDF, 0xF9, 0x00, 0x00, 0xB8, 0xCF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB8, 0xC7, 0xB8, 0xCE, 0xDF, 0xF1, 0xDB, 0xC4, /* 0xD4-0xD7 */ + 0xB8, 0xCA, 0xB8, 0xC8, 0xDF, 0xF7, 0xDF, 0xF6, /* 0xD8-0xDB */ + 0xB8, 0xC9, 0xB8, 0xCB, 0xDF, 0xF5, 0xB8, 0xC6, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xE4-0xE7 */ + 0xBB, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, /* 0xE8-0xEB */ + 0xE4, 0x41, 0x00, 0x00, 0xE3, 0xFB, 0xBB, 0x76, /* 0xEC-0xEF */ + 0xE4, 0x40, 0xE3, 0xF7, 0xE3, 0xF8, 0xBB, 0x6E, /* 0xF0-0xF3 */ + 0xBB, 0x70, 0x00, 0x00, 0xE3, 0xFD, 0xE3, 0xF5, /* 0xF4-0xF7 */ + 0xBB, 0x72, 0xBB, 0x71, 0xE3, 0xF9, 0xE3, 0xFE, /* 0xF8-0xFB */ + 0xE3, 0xFC, 0xBB, 0x73, 0xE3, 0xFA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0xDB, 0xCE, 0xBB, 0x6F, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0xC2, 0xE7, 0xC9, 0xBD, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0xE7, 0xCD, 0xBD, 0xCA, 0xE7, 0xC5, /* 0x08-0x0B */ + 0xE7, 0xC3, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */ + 0xBD, 0xC5, 0xE7, 0xCB, 0xBD, 0xC7, 0xBD, 0xC8, /* 0x10-0x13 */ + 0xE7, 0xC4, 0xBD, 0xC9, 0xE7, 0xCA, 0xE7, 0xC6, /* 0x14-0x17 */ + 0xE7, 0xC7, 0xE7, 0xC8, 0xBB, 0x75, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0x70, 0xEB, 0x7C, /* 0x1C-0x1F */ + 0x00, 0x00, 0xBF, 0xCA, 0xEB, 0x77, 0xEB, 0x79, /* 0x20-0x23 */ + 0x00, 0x00, 0xBF, 0xC8, 0xEB, 0x71, 0xEB, 0x75, /* 0x24-0x27 */ + 0x00, 0x00, 0xEB, 0x78, 0xBF, 0xC6, 0xBF, 0xC9, /* 0x28-0x2B */ + 0xEB, 0x7B, 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x7A, /* 0x2C-0x2F */ + 0xEB, 0x72, 0xEB, 0x76, 0xBF, 0xC7, 0xEE, 0x72, /* 0x30-0x33 */ + 0x00, 0x00, 0xEE, 0x71, 0xC1, 0xB7, 0xEE, 0x77, /* 0x34-0x37 */ + 0xC1, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xB6, /* 0x38-0x3B */ + 0xEE, 0x73, 0xC1, 0xBA, 0xEE, 0x74, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEE, 0x75, 0xEE, 0x78, 0x00, 0x00, /* 0x40-0x43 */ + 0xC1, 0xB8, 0x00, 0x00, 0xF0, 0xD6, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF0, 0xD9, 0x00, 0x00, 0xF0, 0xD3, /* 0x48-0x4B */ + 0xF0, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, /* 0x4C-0x4F */ + 0xF0, 0xD7, 0xF0, 0xD8, 0xEE, 0x76, 0xF0, 0xD2, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xCD, 0xF2, 0xEC, /* 0x54-0x57 */ + 0xF2, 
0xEF, 0xF2, 0xF1, 0xF2, 0xEA, 0xF2, 0xEB, /* 0x58-0x5B */ + 0xF2, 0xEE, 0xF2, 0xF0, 0xC3, 0xCE, 0xC3, 0xCC, /* 0x5C-0x5F */ + 0xC3, 0xCB, 0xF2, 0xED, 0xF2, 0xE9, 0xF4, 0xCA, /* 0x60-0x63 */ + 0xC4, 0xB0, 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xF6, 0x49, 0xC4, 0xFB, 0xF6, 0x4B, /* 0x68-0x6B */ + 0xC4, 0xFC, 0xF6, 0x48, 0xF6, 0x4A, 0xC5, 0xA8, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF7, 0x52, 0xC5, 0xA7, 0xF7, 0xFD, /* 0x70-0x73 */ + 0xF7, 0xFC, 0x00, 0x00, 0xF7, 0xFB, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF9, 0x48, 0xF9, 0x49, 0xF9, 0x4B, /* 0x78-0x7B */ + 0xF9, 0x4A, 0x00, 0x00, 0xCA, 0x50, 0xA6, 0xE8, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xAD, 0x6E, 0xD7, 0xC5, 0xB5, 0xF7, /* 0x80-0x83 */ + 0x00, 0x00, 0xDF, 0xFA, 0xC2, 0xD0, 0x00, 0x00, /* 0x84-0x87 */ + 0xF2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x57, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x56, /* 0x90-0x93 */ + 0x00, 0x00, 0xDB, 0xD0, 0xB5, 0xF8, 0xDB, 0xD2, /* 0x94-0x97 */ + 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFB, /* 0x98-0x9B */ + 0xB8, 0xD0, 0xE4, 0x43, 0xE4, 0x46, 0xE4, 0x45, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0x44, 0xE7, 0xCE, 0xE7, 0xD0, /* 0xA0-0xA3 */ + 0xE7, 0xCF, 0x00, 0x00, 0xBF, 0xCC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCB, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC1, 0xBB, 0xEE, 0x79, 0xEE, 0x7B, 0xEE, 0x7A, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF4, 0xF2, 0xF3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF4, 0xCC, 0xC4, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xC4, 0xFD, 0xF7, 0x54, 0xF7, 0x53, /* 0xBC-0xBF */ + 0xC6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xD0, 0xAF, /* 0xD0-0xD3 */ + 0xAD, 0x6F, 0xD7, 0xC8, 0xD7, 0xC6, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xD7, 0xC7, 0xDB, 0xD4, 0xDB, 0xD5, /* 0xD8-0xDB */ + 0xE0, 0x43, 0xDB, 0xD3, 0x00, 0x00, 0xDF, 0xFC, /* 0xDC-0xDF */ + 0xE0, 0x41, 0xE0, 0x40, 0xE0, 0x42, 0xB8, 0xD1, /* 0xE0-0xE3 */ + 0xDF, 0xFE, 0xDF, 0xFD, 0xE0, 0x44, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE4, 0x49, 0xE4, 0x47, 0x00, 0x00, 0xE4, 0x48, /* 0xE8-0xEB */ + 0xE7, 0xD3, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE7, 0xD2, 0xEB, 0x7D, 0xEE, 0x7C, 0xEE, 0x7D, /* 0xF0-0xF3 */ + 0xC2, 0xD2, 0x00, 0x00, 0xF2, 0xF5, 0xF4, 0xCD, /* 0xF4-0xF7 */ + 0xC4, 0xB2, 0x00, 0x00, 0xF6, 0x4C, 0xF7, 0x55, /* 0xF8-0xFB */ + 0xC5, 0xA9, 0x00, 0x00, 0xF7, 0xFE, 0xF9, 0x4C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8A[512] = { + 0xA8, 0xA5, 0x00, 0x00, 0xAD, 0x71, 0xAD, 0x72, /* 0x00-0x03 */ + 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB1, /* 0x04-0x07 */ + 0xAD, 0x70, 0x00, 0x00, 0xB0, 0x54, 0x00, 0x00, /* 0x08-0x0B */ + 0xB0, 0x52, 0x00, 0x00, 0xB0, 0x51, 0xB0, 0x58, /* 0x0C-0x0F */ + 0xB0, 0x50, 0xB0, 0x59, 0xD3, 0xDD, 0xB0, 0x56, /* 0x10-0x13 */ + 0x00, 0x00, 0xB0, 0x53, 0xB0, 0x57, 0xB0, 0x55, /* 0x14-0x17 */ + 0xB0, 0x4F, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x5F, /* 0x18-0x1B */ + 0x00, 0x00, 0xB3, 0x59, 0xD7, 0xCC, 0xB3, 0x5E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x60, 0xB3, 0x5A, /* 0x20-0x23 */ + 0x00, 0x00, 0xB3, 0x5B, 0x00, 0x00, 0xD7, 0xCA, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x58, 0x00, 0x00, /* 0x28-0x2B */ + 0xD7, 
0xCB, 0xB3, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD7, 0xC9, 0xB3, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xB6, 0x44, 0x00, 0x00, 0xB6, 0x46, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xDB, 0xD8, 0xB6, 0x45, 0xB5, 0xF9, /* 0x38-0x3B */ + 0xB5, 0xFD, 0x00, 0x00, 0xB8, 0xE4, 0xE0, 0x49, /* 0x3C-0x3F */ + 0xDB, 0xDA, 0xB5, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDB, 0xDD, 0xDB, 0xDE, 0xB6, 0x43, 0x00, 0x00, /* 0x44-0x47 */ + 0xDB, 0xE0, 0x00, 0x00, 0xDB, 0xE2, 0x00, 0x00, /* 0x48-0x4B */ + 0xDB, 0xE3, 0xDB, 0xD7, 0xDB, 0xD6, 0xDB, 0xE4, /* 0x4C-0x4F */ + 0xB6, 0x42, 0xDB, 0xE1, 0xDB, 0xDF, 0x00, 0x00, /* 0x50-0x53 */ + 0xB6, 0x40, 0xB5, 0xFB, 0xB6, 0x47, 0xDB, 0xDB, /* 0x54-0x57 */ + 0xDB, 0xDC, 0xDB, 0xD9, 0x00, 0x00, 0xB6, 0x41, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0xFC, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB5, 0xFA, 0xE0, 0x48, 0xB8, 0xDF, 0xB8, 0xDA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xD5, 0x00, 0x00, /* 0x64-0x67 */ + 0xB8, 0xE5, 0xB8, 0xD6, 0x00, 0x00, 0xB8, 0xD2, /* 0x68-0x6B */ + 0xB8, 0xE1, 0xB8, 0xDE, 0xB8, 0xE0, 0x00, 0x00, /* 0x6C-0x6F */ + 0xB8, 0xD7, 0xB8, 0xDC, 0xB8, 0xD3, 0xB8, 0xD4, /* 0x70-0x73 */ + 0xE0, 0x50, 0xE0, 0x4D, 0xE0, 0x45, 0xE0, 0x4A, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0xE2, 0xE0, 0x51, 0xB8, 0xE3, /* 0x78-0x7B */ + 0xB8, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x47, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE0, 0x4F, 0xE0, 0x4B, 0xE0, 0x4E, /* 0x80-0x83 */ + 0xE0, 0x4C, 0xB8, 0xDD, 0xE0, 0x46, 0xB8, 0xD8, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x4C, /* 0x88-0x8B */ + 0xBB, 0x78, 0xBB, 0x7B, 0x00, 0x00, 0xE4, 0x4E, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBB, 0xA5, 0xE4, 0x4D, 0xBB, 0x7D, /* 0x90-0x93 */ + 0x00, 0x00, 0xBD, 0xCF, 0xE4, 0x4F, 0x00, 0x00, /* 0x94-0x97 */ + 0xBB, 0xA4, 0xE4, 0x4B, 0xBB, 0xA6, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0x79, 0x00, 0x00, /* 0x9C-0x9F */ + 0xB8, 0xDB, 0xBB, 0x7C, 0x00, 0x00, 0xBB, 0x7A, /* 0xA0-0xA3 */ + 0xBB, 0x7E, 0xBB, 0xA2, 0xBB, 0x77, 0xBB, 0xA7, /* 0xA4-0xA7 */ + 0xBB, 0xA3, 0x00, 0x00, 0xBB, 0xA1, 0xE4, 0x4A, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xBD, 0xD6, 0x00, 0x00, 0xBD, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xD9, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE7, 0xD6, 0xBD, 0xDA, 0xE7, 0xE2, 0xE7, 0xDB, /* 0xB8-0xBB */ + 0xBD, 0xCB, 0xE7, 0xE3, 0xE7, 0xDD, 0xBD, 0xD5, /* 0xBC-0xBF */ + 0xE7, 0xDE, 0x00, 0x00, 0xBD, 0xD4, 0xE7, 0xE1, /* 0xC0-0xC3 */ + 0xBD, 0xCE, 0xE7, 0xDF, 0xE7, 0xD5, 0xBD, 0xCD, /* 0xC4-0xC7 */ + 0xEB, 0xAA, 0xBD, 0xD3, 0x00, 0x00, 0xBD, 0xD0, /* 0xC8-0xCB */ + 0x00, 0x00, 0xBD, 0xD8, 0x00, 0x00, 0xE7, 0xD4, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE7, 0xD8, 0xBD, 0xCC, 0xE7, 0xD7, /* 0xD0-0xD3 */ + 0xE7, 0xD9, 0xE7, 0xDA, 0xBD, 0xD7, 0xE7, 0xDC, /* 0xD4-0xD7 */ + 0xE7, 0xE0, 0xE7, 0xE4, 0x00, 0x00, 0xBD, 0xDB, /* 0xD8-0xDB */ + 0xBF, 0xD2, 0xEB, 0xA5, 0xEB, 0xAB, 0xEB, 0xA8, /* 0xDC-0xDF */ + 0xEB, 0x7E, 0xEB, 0xAC, 0xEB, 0xA1, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xEB, 0xA7, 0x00, 0x00, 0xBF, 0xCD, 0xBF, 0xD3, /* 0xE4-0xE7 */ + 0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xBF, 0xD9, 0xBF, 0xD4, 0xEB, 0xAF, /* 0xEC-0xEF */ + 0xEB, 0xA9, 0xBF, 0xD0, 0xEB, 0xA2, 0xBF, 0xDA, /* 0xF0-0xF3 */ + 0xEB, 0xA3, 0xEB, 0xA4, 0xBF, 0xDB, 0xBF, 0xD8, /* 0xF4-0xF7 */ + 0xBD, 0xD1, 0x00, 0x00, 0xBF, 0xCE, 0xEB, 0xB0, /* 0xF8-0xFB */ + 0xBF, 0xDC, 0x00, 0x00, 0xBF, 0xD5, 0xEB, 0xAE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xBF, 
0xD1, 0xBF, 0xD6, 0xBF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */ + 0xC1, 0xC3, 0xEE, 0xA4, 0xEE, 0xAD, 0xEE, 0xAA, /* 0x04-0x07 */ + 0xEE, 0xAC, 0x00, 0x00, 0xC1, 0xC0, 0xEE, 0xA5, /* 0x08-0x0B */ + 0x00, 0x00, 0xEE, 0xAB, 0xC1, 0xBC, 0xEE, 0xA7, /* 0x0C-0x0F */ + 0xC1, 0xC4, 0xEE, 0xA3, 0xEE, 0xA8, 0xEE, 0xAF, /* 0x10-0x13 */ + 0xEB, 0xA6, 0xEE, 0xA9, 0xEE, 0xA2, 0xC1, 0xBD, /* 0x14-0x17 */ + 0xEE, 0xA1, 0xC1, 0xBE, 0xEE, 0xB0, 0xC1, 0xBF, /* 0x18-0x1B */ + 0xEE, 0xAE, 0xC1, 0xC2, 0xEE, 0x7E, 0x00, 0x00, /* 0x1C-0x1F */ + 0xC1, 0xC1, 0x00, 0x00, 0xEE, 0xA6, 0xF0, 0xDC, /* 0x20-0x23 */ + 0xF0, 0xEA, 0xF0, 0xE5, 0xF0, 0xE7, 0xF0, 0xDB, /* 0x24-0x27 */ + 0xC2, 0xD3, 0x00, 0x00, 0xF0, 0xDA, 0xC2, 0xD6, /* 0x28-0x2B */ + 0xC2, 0xD5, 0x00, 0x00, 0xF0, 0xE9, 0xF0, 0xE1, /* 0x2C-0x2F */ + 0xF0, 0xDE, 0xF0, 0xE4, 0x00, 0x00, 0xF0, 0xDD, /* 0x30-0x33 */ + 0x00, 0x00, 0xF0, 0xDF, 0xF0, 0xE8, 0xF0, 0xE6, /* 0x34-0x37 */ + 0x00, 0x00, 0xC2, 0xD4, 0xF0, 0xED, 0xF0, 0xEB, /* 0x38-0x3B */ + 0xF0, 0xE2, 0xF0, 0xEC, 0xF0, 0xE3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF2, 0xF9, 0xC3, 0xCF, 0xF3, 0x41, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xF6, 0x4F, 0xC3, 0xD6, 0xF0, 0xE0, /* 0x44-0x47 */ + 0xF2, 0xF7, 0xC3, 0xD2, 0xF2, 0xF8, 0xF2, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xD4, 0xC3, 0xD5, /* 0x4C-0x4F */ + 0xF2, 0xF6, 0xF3, 0x40, 0xF3, 0x42, 0xF2, 0xFA, /* 0x50-0x53 */ + 0xF2, 0xFC, 0xF2, 0xFE, 0xF2, 0xFB, 0xF3, 0x43, /* 0x54-0x57 */ + 0xC3, 0xD1, 0xC3, 0xD7, 0xC3, 0xD3, 0x00, 0x00, /* 0x58-0x5B */ + 0xC3, 0xD0, 0xF4, 0xD0, 0x00, 0x00, 0xC4, 0xB7, /* 0x5C-0x5F */ + 0xF4, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD2, /* 0x60-0x63 */ + 0x00, 0x00, 0xF4, 0xD3, 0xC4, 0xB5, 0xF4, 0xD4, /* 0x64-0x67 */ + 0xF4, 0xD1, 0x00, 0x00, 0xF4, 0xCF, 0xC4, 0xB8, /* 0x68-0x6B */ + 0xC4, 0xB4, 0xF4, 0xD5, 0x00, 0x00, 0xC4, 0xB6, /* 0x6C-0x6F */ + 0xC4, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xC4, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x40, /* 0x74-0x77 */ + 0xF6, 0x4E, 0xF6, 0x4D, 0xF6, 0x50, 0xF6, 0x51, /* 0x78-0x7B */ + 0x00, 0x00, 0xC5, 0x41, 0xF7, 0x56, 0xF7, 0x5B, /* 0x7C-0x7F */ + + 0xC5, 0xAA, 0x00, 0x00, 0xF7, 0x58, 0x00, 0x00, /* 0x80-0x83 */ + 0xF7, 0x57, 0xF7, 0x5A, 0xF7, 0x59, 0x00, 0x00, /* 0x84-0x87 */ + 0xF8, 0x43, 0x00, 0x00, 0xC5, 0xDC, 0xF8, 0x42, /* 0x88-0x8B */ + 0xF8, 0x40, 0x00, 0x00, 0xF8, 0x41, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xFE, 0xC5, 0xFD, /* 0x90-0x93 */ + 0xF8, 0xC1, 0xF8, 0xC2, 0xC6, 0x40, 0x00, 0x00, /* 0x94-0x97 */ + 0xF9, 0x4D, 0xF9, 0x4E, 0xC6, 0x67, 0x00, 0x00, /* 0x98-0x9B */ + 0xC6, 0x6D, 0x00, 0x00, 0xF9, 0xA9, 0xF9, 0xC8, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA6, /* 0x34-0x37 */ + 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, 0xD7, 0xCE, /* 0x38-0x3B */ + 0xE0, 0x52, 0xE4, 0x50, 0xE7, 0xE5, 0xC1, 0xC6, /* 0x3C-0x3F */ + 0x00, 0x00, 0xC1, 0xC5, 0xF0, 0xEE, 0xF3, 0x44, /* 0x40-0x43 */ + 0x00, 0x00, 0xF8, 0x44, 0xA8, 0xA7, 0xD3, 0xDE, /* 0x44-0x47 */ + 0xB0, 0x5A, 0xB3, 0x61, 0xE0, 0x54, 0xE0, 0x53, /* 0x48-0x4B */ + 0xBD, 0xDC, 0xE7, 0xE6, 0xBD, 0xDD, 0xEE, 0xB1, /* 0x4C-0x4F */ + 0xC2, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xC6, 0x76, 0xA8, 0xA8, 0xCD, 0xCB, 0xD3, 0xDF, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x62, 0x00, 0x00, /* 0x58-0x5B */ + 0xD7, 0xCF, 0xD7, 0xD0, 0x00, 0x00, 0xDB, 0xE5, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB6, 0x48, 0xB8, 0xE6, 0x00, 0x00, /* 0x60-0x63 */ + 0xE0, 0x56, 0xE0, 0x55, 0xE0, 0x57, 0x00, 0x00, /* 0x64-0x67 */ + 0xE4, 0x51, 0xE4, 0x52, 0xBB, 0xA8, 0xBF, 0xDD, /* 0x68-0x6B */ + 0xBD, 0xDE, 0xBF, 0xDE, 0x00, 0x00, 0xEE, 0xB5, /* 0x6C-0x6F */ + 0xEE, 0xB2, 0xEE, 0xB4, 0xEE, 0xB3, 0xC1, 0xC7, /* 0x70-0x73 */ + 0x00, 0x00, 0xF0, 0xEF, 0xF3, 0x46, 0xF3, 0x45, /* 0x74-0x77 */ + 0xCB, 0xA4, 0xB0, 0x5C, 0xB0, 0x5B, 0xD3, 0xE0, /* 0x78-0x7B */ + 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDB, 0xE7, 0xDB, 0xE6, 0xB6, 0x49, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x58, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xB8, 0xE8, 0xB8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */ + 0xBB, 0xAA, 0xBB, 0xA9, 0x00, 0x00, 0xE7, 0xE7, /* 0x8C-0x8F */ + 0xEB, 0xB3, 0xEB, 0xB1, 0xEB, 0xB2, 0xBF, 0xDF, /* 0x90-0x93 */ + 0xEE, 0xB7, 0xEE, 0xB6, 0x00, 0x00, 0xF0, 0xF2, /* 0x94-0x97 */ + 0xF0, 0xF1, 0xF0, 0xF0, 0xF3, 0x47, 0x00, 0x00, /* 0x98-0x9B */ + 0xF9, 0xAA, 0xA8, 0xA9, 0xAD, 0x73, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAD, 0x74, 0xB0, 0x5D, 0xB0, 0x5E, 0xD3, 0xE2, /* 0xA0-0xA3 */ + 0xD3, 0xE1, 0xD7, 0xD2, 0x00, 0x00, 0xB3, 0x68, /* 0xA4-0xA7 */ + 0xB3, 0x66, 0xB3, 0x63, 0xB3, 0x67, 0xB3, 0x65, /* 0xA8-0xAB */ + 0xB3, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x4A, /* 0xAC-0xAF */ + 0xDB, 0xEA, 0x00, 0x00, 0xB8, 0xED, 0xB6, 0x4C, /* 0xB0-0xB3 */ + 0xB6, 0x51, 0xDB, 0xEC, 0xB6, 0x53, 0xB6, 0x52, /* 0xB4-0xB7 */ + 0xB6, 0x55, 0xDB, 0xEB, 0xDB, 0xE8, 0xB6, 0x4F, /* 0xB8-0xBB */ + 0xB6, 0x4B, 0xB6, 0x4D, 0xDB, 0xE9, 0xB6, 0x54, /* 0xBC-0xBF */ + 0xB6, 0x50, 0xB6, 0x4E, 0xB8, 0xEF, 0xB8, 0xEE, /* 0xC0-0xC3 */ + 0xB8, 0xEC, 0xB8, 0xF0, 0x00, 0x00, 0xB8, 0xEA, /* 0xC4-0xC7 */ + 0xB8, 0xEB, 0x00, 0x00, 0xB8, 0xE9, 0x00, 0x00, /* 0xC8-0xCB */ + 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0xCC-0xCF */ + 0x00, 0x00, 0xBB, 0xAC, 0xBB, 0xAD, 0xBB, 0xAB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE4, 0x53, 0x00, 0x00, 0xE4, 0x55, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE7, 0xEA, 0xE7, 0xEC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xBD, 0xE7, 0xE7, 0xED, 0xBD, 0xE0, 0xE7, 0xE9, /* 0xDC-0xDF */ + 0xBD, 0xDF, 0xBD, 0xE9, 0xBD, 0xE5, 0xBD, 0xE6, /* 0xE0-0xE3 */ + 0xBD, 0xE2, 0xE7, 0xE8, 0xBD, 0xE1, 0xE7, 0xEE, /* 0xE4-0xE7 */ + 0xE7, 0xEB, 0x00, 0x00, 0xBD, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBD, 0xE3, 0xBD, 0xE4, 0xEB, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEB, 0xB7, 0xEB, 0xB6, 0x00, 0x00, 0xEB, 0xB8, /* 0xF0-0xF3 */ + 0xBF, 0xE0, 0xEB, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xC1, 0xCB, 0xEE, 0xB8, 0xC1, 0xC8, 0xC1, 0xCC, /* 0xF8-0xFB */ + 0xC1, 0xCA, 0xC1, 0xC9, 0xF0, 0xF3, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0xF0, 0xF6, 0x00, 0x00, 0xF0, 0xF5, 0x00, 0x00, /* 0x00-0x03 */ + 0xF0, 0xF4, 0xC2, 0xD8, 0xF3, 0x48, 0xF3, 0x49, /* 0x04-0x07 */ + 0xC3, 
0xD8, 0xF3, 0x4A, 0xC3, 0xD9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xC4, 0xBA, 0x00, 0x00, 0xC4, 0xB9, /* 0x0C-0x0F */ + 0xF6, 0x52, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x42, /* 0x10-0x13 */ + 0xF6, 0x53, 0xF7, 0x5C, 0xC5, 0xAB, 0xC5, 0xAC, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0x45, 0x00, 0x00, 0xC6, 0x42, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA8, 0xAA, 0x00, 0x00, 0xB3, 0x6A, 0xB3, 0x69, /* 0x64-0x67 */ + 0xE0, 0x5C, 0xE0, 0x5D, 0x00, 0x00, 0xBB, 0xAE, /* 0x68-0x6B */ + 0xEB, 0xB9, 0xBD, 0xEA, 0xEB, 0xBA, 0xEE, 0xB9, /* 0x6C-0x6F */ + 0xA8, 0xAB, 0x00, 0x00, 0xD0, 0xB2, 0xAD, 0x76, /* 0x70-0x73 */ + 0xAD, 0x75, 0x00, 0x00, 0xD3, 0xE3, 0xB0, 0x5F, /* 0x74-0x77 */ + 0xD3, 0xE4, 0xD7, 0xD5, 0x00, 0x00, 0xD7, 0xD4, /* 0x78-0x7B */ + 0x00, 0x00, 0xD7, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDB, 0xEE, 0xB6, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xDB, 0xED, 0xB6, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xDB, 0xEF, 0xB6, 0x56, 0x00, 0x00, /* 0x88-0x8B */ + 0xE0, 0x5F, 0xE0, 0x62, 0xE0, 0x60, 0xE0, 0x61, /* 0x8C-0x8F */ + 0xE0, 0x65, 0xE0, 0x5E, 0xE0, 0x66, 0xE0, 0x63, /* 0x90-0x93 */ + 0xE0, 0x64, 0xBB, 0xB0, 0xE4, 0x56, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xBB, 0xAF, 0x00, 0x00, 0xE7, 0xF2, /* 0x98-0x9B */ + 0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xBD, 0xEB, /* 0x9C-0x9F */ + 0xE7, 0xEF, 0xE7, 0xF1, 0x00, 0x00, 0xBD, 0xEC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEB, 0xBB, 0x00, 0x00, 0xEB, 0xBC, /* 0xA4-0xA7 */ + 0xC1, 0xCD, 0x00, 0x00, 0xF3, 0x4C, 0xF3, 0x4E, /* 0xA8-0xAB */ + 0xF3, 0x4B, 0xF3, 0x4D, 0xF4, 0xD6, 0xF6, 0x54, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x6F, 0xA8, 0xAC, /* 0xB0-0xB3 */ + 0xAD, 0x77, 0xD3, 0xE5, 0xD3, 0xE7, 0xD3, 0xE6, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD7, 0xD8, 0xB3, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD7, 0xD6, 0x00, 0x00, 0xB3, 0x6B, 0xD7, 0xD9, /* 0xBC-0xBF */ + 0x00, 0x00, 0xD7, 0xDA, 0xD7, 0xD7, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDB, 0xFB, 0xB6, 0x60, 0xDB, 0xF3, /* 0xC4-0xC7 */ + 0xDB, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x5B, /* 0xC8-0xCB */ + 0xB6, 0x5E, 0xDB, 0xF2, 0xB6, 0x59, 0xDB, 0xF6, /* 0xCC-0xCF */ + 0xE0, 0x6C, 0xB6, 0x5D, 0x00, 0x00, 0xDB, 0xF1, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDB, 0xF7, 0xDB, 0xF4, 0xDB, 0xFA, /* 0xD4-0xD7 */ + 0xDB, 0xF0, 0xDB, 0xF8, 0xB6, 0x5C, 0xB6, 0x5F, /* 0xD8-0xDB */ + 0xDB, 0xF5, 0xB6, 0x5A, 0x00, 0x00, 0xB8, 0xF2, /* 0xDC-0xDF 
*/ + 0xE0, 0x68, 0xB8, 0xF1, 0xE0, 0x6F, 0xE0, 0x6E, /* 0xE0-0xE3 */ + 0xB8, 0xF8, 0x00, 0x00, 0xB8, 0xF9, 0xE0, 0x70, /* 0xE4-0xE7 */ + 0xB8, 0xF3, 0xE0, 0x6D, 0xB8, 0xF7, 0xE0, 0x72, /* 0xE8-0xEB */ + 0xE0, 0x69, 0x00, 0x00, 0xE0, 0x6B, 0xB8, 0xF4, /* 0xEC-0xEF */ + 0xE0, 0x67, 0xE0, 0x6A, 0xE0, 0x71, 0xB8, 0xF5, /* 0xF0-0xF3 */ + 0xE0, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xBB, 0xB1, 0xE4, 0x5B, 0xE4, 0x61, 0xE4, 0x59, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0xE4, 0x62, 0x00, 0x00, 0xE4, 0x58, 0xE4, 0x5D, /* 0x00-0x03 */ + 0xE4, 0x63, 0xE4, 0x60, 0xE4, 0x5F, 0xE4, 0x5E, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0x57, 0xE4, 0x5C, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE4, 0x5A, 0x00, 0x00, 0xBD, 0xF1, /* 0x0C-0x0F */ + 0xBD, 0xEE, 0xE7, 0xFB, 0xE8, 0x41, 0xE8, 0x43, /* 0x10-0x13 */ + 0xE8, 0x40, 0xE7, 0xF8, 0xE7, 0xFA, 0xE8, 0x45, /* 0x14-0x17 */ + 0xE8, 0x42, 0xE7, 0xFC, 0xE8, 0x46, 0xE7, 0xF9, /* 0x18-0x1B */ + 0xE8, 0x44, 0xBD, 0xEF, 0xBD, 0xF5, 0xBD, 0xF3, /* 0x1C-0x1F */ + 0xE7, 0xF3, 0xBD, 0xF4, 0xBD, 0xF0, 0xE7, 0xF4, /* 0x20-0x23 */ + 0xE7, 0xF6, 0xE7, 0xF5, 0xE7, 0xFD, 0xE7, 0xFE, /* 0x24-0x27 */ + 0x00, 0x00, 0xBD, 0xF2, 0x00, 0x00, 0xBD, 0xED, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEB, 0xC6, 0xBF, 0xE2, 0x00, 0x00, 0xEB, 0xBD, /* 0x30-0x33 */ + 0xBF, 0xE3, 0xBF, 0xE6, 0xEB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xBF, 0xBF, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEB, 0xC3, 0xEB, 0xC4, 0xEB, 0xBE, 0xEB, 0xC7, /* 0x3C-0x3F */ + 0xEB, 0xC0, 0xEB, 0xC5, 0xBF, 0xE4, 0x00, 0x00, /* 0x40-0x43 */ + 0xBF, 0xE1, 0xEB, 0xC1, 0x00, 0x00, 0xEE, 0xBF, /* 0x44-0x47 */ + 0xC1, 0xD0, 0xC1, 0xCE, 0xC1, 0xD1, 0xC1, 0xCF, /* 0x48-0x4B */ + 0xEE, 0xBE, 0xEE, 0xBB, 0xEE, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xBC, /* 0x50-0x53 */ + 0xF1, 0x45, 0xC2, 0xDE, 0xF0, 0xFB, 0xF0, 0xFA, /* 0x54-0x57 */ + 0x00, 0x00, 0xC2, 0xD9, 0xF1, 0x41, 0xF1, 0x40, /* 0x58-0x5B */ + 0xF0, 0xF7, 0xF1, 0x43, 0xF0, 0xFC, 0xC2, 0xDD, /* 0x5C-0x5F */ + 0xF0, 0xF9, 0xF1, 0x42, 0xF0, 0xF8, 0xC2, 0xDA, /* 0x60-0x63 */ + 0xC2, 0xDC, 0xF0, 0xFD, 0xC2, 0xDB, 0xF0, 0xFE, /* 0x64-0x67 */ + 0x00, 0x00, 0xF1, 0x44, 0xF3, 0x52, 0x00, 0x00, /* 0x68-0x6B */ + 0xC3, 0xDE, 0xF3, 0x4F, 0x00, 0x00, 0xF3, 0x53, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xDB, 0xF3, 0x51, /* 0x70-0x73 */ + 0xC3, 0xE0, 0x00, 0x00, 0xC3, 0xDD, 0x00, 0x00, /* 0x74-0x77 */ + 0xF3, 0x50, 0x00, 0x00, 0xC3, 0xDF, 0xF3, 0x54, /* 0x78-0x7B */ + 0xC3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xC4, 0xBC, 0xC4, 0xBE, 0x00, 0x00, /* 0x80-0x83 */ + 0xF4, 0xD9, 0xC4, 0xBD, 0xF4, 0xD7, 0xC3, 0xDC, /* 0x84-0x87 */ + 0xF4, 0xD8, 0xC4, 0xBB, 0xC5, 0x43, 0xC5, 0x45, /* 0x88-0x8B */ + 0xF6, 0x56, 0xC5, 0x44, 0xF6, 0x55, 0x00, 0x00, /* 0x8C-0x8F */ + 0xF7, 0x61, 0xC5, 0xAD, 0xF7, 0x60, 0xC5, 0xAE, /* 0x90-0x93 */ + 0xF7, 0x5E, 0xF7, 0x5D, 0xF7, 0x62, 0xF7, 0x63, /* 0x94-0x97 */ + 0xF8, 0x46, 0x00, 0x00, 0xF7, 0x5F, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF8, 0xC6, 0xF8, 0xC3, 0xF8, 0xC4, /* 0x9C-0x9F */ + 0xF8, 0xC5, 0xC6, 0x5C, 0x00, 0x00, 0xF9, 0x51, /* 0xA0-0xA3 */ + 0xF9, 0x50, 0xF9, 0x4F, 0xF9, 0x70, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF9, 0xBE, 0xF9, 0xAB, 0xC6, 0x6E, 0xA8, 0xAD, /* 0xA8-0xAB */ + 0xB0, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xFA, 0x00, 0x00, /* 0xB0-0xB3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xF6, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC2, 0xDF, 0x00, 0x00, 0xF3, 0x55, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF9, 0xAC, 0xA8, 0xAE, 0xAA, 0xEE, /* 0xC8-0xCB */ + 0xAD, 0x79, 0xAD, 0x78, 0x00, 0x00, 0xB0, 0x63, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD3, 0xE8, 0xB0, 0x61, 0xD3, 0xE9, /* 0xD0-0xD3 */ + 0xB0, 0x62, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDF, /* 0xD4-0xD7 */ + 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x6D, /* 0xD8-0xDB */ + 0xD7, 0xDE, 0xD7, 0xDD, 0xD7, 0xDC, 0xB3, 0x6E, /* 0xDC-0xDF */ + 0xD7, 0xE0, 0xD7, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xDC, 0x43, 0xDC, 0x41, 0xDC, 0x45, /* 0xE4-0xE7 */ + 0xDC, 0x46, 0xDC, 0x4C, 0x00, 0x00, 0xDC, 0x48, /* 0xE8-0xEB */ + 0xDC, 0x4A, 0x00, 0x00, 0xDC, 0x42, 0xDB, 0xFC, /* 0xEC-0xEF */ + 0x00, 0x00, 0xDC, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDC, 0x4B, 0xDC, 0x44, 0xDC, 0x47, 0xDB, 0xFD, /* 0xF4-0xF7 */ + 0xB6, 0x62, 0xDC, 0x40, 0xDB, 0xFE, 0xB6, 0x61, /* 0xF8-0xFB */ + 0xB6, 0x63, 0x00, 0x00, 0xB8, 0xFD, 0xE0, 0x75, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0xE0, 0x77, 0xE0, 0x76, 0xE0, 0x7B, 0xB8, 0xFB, /* 0x00-0x03 */ + 0x00, 0x00, 0xE0, 0x78, 0xE0, 0x74, 0xE0, 0x79, /* 0x04-0x07 */ + 0xE0, 0x7A, 0xB8, 0xFC, 0xB8, 0xFE, 0xE0, 0x7C, /* 0x08-0x0B */ + 0x00, 0x00, 0xE4, 0x67, 0xE4, 0x66, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0x64, 0xE4, 0x65, 0xBB, 0xB3, 0xBB, 0xB5, /* 0x10-0x13 */ + 0xBB, 0xB2, 0xBB, 0xB4, 0xE8, 0x4D, 0xE8, 0x4E, /* 0x14-0x17 */ + 0xE8, 0x49, 0x00, 0x00, 0xE8, 0x4A, 0xBD, 0xF8, /* 0x18-0x1B */ + 0xBD, 0xFD, 0xBD, 0xF7, 0xBD, 0xFE, 0xBD, 0xF9, /* 0x1C-0x1F */ + 0xE8, 0x4B, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x4C, /* 0x20-0x23 */ + 0xE8, 0x48, 0xBE, 0x40, 0xBD, 0xFB, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xBD, 0xFA, 0xBD, 0xFC, 0x00, 0x00, /* 0x28-0x2B */ + 0xE8, 0x47, 0x00, 0x00, 0xEB, 0xCA, 0xBF, 0xE8, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCC, 0xBF, 0xEA, /* 0x30-0x33 */ + 0xEB, 0xCF, 0xEB, 0xCB, 0xEB, 0xC9, 0xEB, 0xCE, /* 0x34-0x37 */ + 0xBF, 0xE9, 0xEB, 0xCD, 0x00, 0x00, 0xBF, 0xE7, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xC1, 0xD3, 0xC1, 0xD6, /* 0x3C-0x3F */ + 0xEE, 0xC1, 0x00, 0x00, 0xC1, 0xD4, 0xEE, 0xC0, /* 0x40-0x43 */ + 0xC1, 0xD2, 0xC1, 0xD5, 0xF1, 0x46, 0xF1, 0x47, /* 0x44-0x47 */ + 0xF1, 0x48, 0xC2, 0xE0, 0x00, 0x00, 0xF1, 0x49, /* 0x48-0x4B */ + 0x00, 0x00, 0xC2, 0xE1, 0xC3, 0xE2, 0xF3, 0x58, /* 0x4C-0x4F */ + 0xF3, 0x59, 0xF3, 0x57, 0xF3, 0x56, 0xF3, 0x5A, /* 0x50-0x53 */ + 0xC3, 0xE1, 0xF4, 0xDD, 0xF4, 0xDB, 0xF4, 0xDC, /* 0x54-0x57 */ + 0xF4, 0xDE, 0xF4, 0xDA, 0xF4, 0xDF, 0xF6, 0x58, /* 0x58-0x5B */ + 0x00, 0x00, 0xF6, 0x59, 0xF6, 0x57, 0xC5, 0x46, /* 0x5C-0x5F */ + 0xF7, 0x64, 0xC5, 0xAF, 0xF7, 0x65, 0xF8, 0x48, /* 0x60-0x63 */ + 0xF8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAF, /* 0x98-0x9B */ + 0xB6, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x40, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xB6, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xEC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xC3, 0xE3, 0xC4, 0x7C, 0xC5, 0x47, /* 0xAC-0xAF */ + 0xA8, 0xB0, 0xB0, 0x64, 0xB9, 0x41, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF3, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA8, 0xB4, 0xA8, 0xB3, 0xA8, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCB, 0xA5, 0x00, 0x00, 0xCD, 0xCD, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xCF, 0xAA, 0xEF, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAA, 0xF1, 0xCD, 0xCC, 0xCD, 0xCE, /* 0xD0-0xD3 */ + 0xAA, 0xF0, 0xCD, 0xD1, 0xCD, 0xD0, 0xCD, 0xD2, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD0, 0xB6, 0xD0, 0xB4, 0xAD, 0x7C, 0xD0, 0xB3, /* 0xE0-0xE3 */ + 0xAD, 0xA3, 0xAD, 0x7E, 0xAD, 0x7B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAD, 0xA4, 0x00, 0x00, 0xAD, 0x7D, 0xAD, 0xA2, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xA1, 0xD0, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0xAD, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xB0, 0x6A, 0xD3, 0xEB, 0xD3, 0xF1, 0xB0, 0x67, /* 0xF4-0xF7 */ + 0xB0, 0x6E, 0x00, 0x00, 0xB0, 0x69, 0xD3, 0xEE, /* 0xF8-0xFB */ + 0xD3, 0xF0, 0xB0, 0x6C, 0xD3, 0xEA, 0xD3, 0xED, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xB0, 0x68, 0xB0, 0x65, 0xD3, 0xEC, 0xB0, 0x6B, /* 0x00-0x03 */ + 0xD3, 0xEF, 0xB0, 0x6D, 0xB0, 0x66, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE3, /* 0x08-0x0B */ + 0xD7, 0xE6, 0xB3, 0x70, 0x00, 0x00, 0xB3, 0x7A, /* 0x0C-0x0F */ + 0xB3, 0x76, 0xD7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xB3, 0x7E, 0xB3, 0x77, 0xB3, 0x7C, 0xB3, 0x72, /* 0x14-0x17 */ + 0x00, 0x00, 0xB3, 0x6F, 0xB3, 0x71, 0xB3, 0x7D, /* 0x18-0x1B */ + 0xD7, 0xE5, 0xB3, 0x75, 0xB3, 0x78, 0xB3, 0x74, /* 0x1C-0x1F */ + 0xB3, 0x79, 0xD7, 0xE7, 0xB3, 0x7B, 0xB3, 0x73, /* 0x20-0x23 */ + 0xD7, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xDC, 0x4D, 0xB6, 0x65, 0xDC, 0x4F, /* 0x2C-0x2F */ + 0x00, 0x00, 0xB6, 0x67, 0xB6, 0x69, 0x00, 0x00, /* 0x30-0x33 */ + 0xDC, 0x4E, 0xB6, 0x66, 0xB6, 0x6A, 0x00, 0x00, /* 0x34-0x37 */ + 0xB6, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB9, 0x47, 0xE0, 0xA3, 0xB9, 0x4F, 0xE0, 0x7E, /* 0x3C-0x3F */ + 0x00, 0x00, 0xB9, 0x50, 0xB9, 0x45, 0x00, 0x00, /* 0x40-0x43 */ + 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x4A, /* 0x44-0x47 */ + 0x00, 0x00, 0xE0, 0xA2, 0xB9, 0x43, 0xB9, 0x42, /* 0x48-0x4B */ + 0x00, 0x00, 0xB9, 0x4D, 0xB9, 0x4C, 0xB9, 0x4B, /* 0x4C-0x4F */ + 0xB9, 0x49, 0xB9, 0x4E, 0xE0, 0x7D, 0xB9, 0x44, /* 0x50-0x53 */ + 0xB9, 0x46, 0xB9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xBB, 0xB8, 0xBB, 0xBB, 0x00, 0x00, 0xBB, 0xBF, /* 0x58-0x5B */ + 0xBB, 
0xB9, 0xBB, 0xBE, 0xBB, 0xBC, 0x00, 0x00, /* 0x5C-0x5F */ + 0xBB, 0xB7, 0x00, 0x00, 0xBB, 0xBD, 0xBB, 0xBA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x52, /* 0x64-0x67 */ + 0xBE, 0x43, 0xBE, 0x41, 0x00, 0x00, 0xE8, 0x53, /* 0x68-0x6B */ + 0x00, 0x00, 0xBE, 0x44, 0xBE, 0x42, 0xE8, 0x51, /* 0x6C-0x6F */ + 0xE8, 0x50, 0x00, 0x00, 0xBF, 0xF0, 0xE8, 0x4F, /* 0x70-0x73 */ + 0xBF, 0xEE, 0xBF, 0xED, 0xEB, 0xD0, 0xBE, 0x45, /* 0x74-0x77 */ + 0xBF, 0xEF, 0xEB, 0xD1, 0xBF, 0xF2, 0xEB, 0xD2, /* 0x78-0x7B */ + 0xBF, 0xF1, 0xC1, 0xD8, 0xEE, 0xC3, 0xC1, 0xD7, /* 0x7C-0x7F */ + + 0xC1, 0xDC, 0xC1, 0xDA, 0xC1, 0xDB, 0xC2, 0xE3, /* 0x80-0x83 */ + 0xC1, 0xD9, 0xEE, 0xC2, 0xEB, 0xD3, 0xC2, 0xE2, /* 0x84-0x87 */ + 0xC2, 0xE4, 0x00, 0x00, 0xC3, 0xE4, 0xC3, 0xE5, /* 0x88-0x8B */ + 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, 0xC5, 0xDE, /* 0x8C-0x8F */ + 0xC5, 0xDD, 0xA8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCA, 0x55, 0xB0, 0x6F, 0x00, 0x00, 0xCA, 0x52, /* 0x94-0x97 */ + 0xCA, 0x53, 0xCA, 0x51, 0x00, 0x00, 0xCA, 0x54, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xAA, 0xCB, 0xA7, /* 0x9C-0x9F */ + 0xCB, 0xAC, 0xCB, 0xA8, 0xA8, 0xB7, 0xA8, 0xBA, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xA9, 0xA8, 0xB9, 0xCB, 0xAB, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0xAC-0xAF */ + 0xCD, 0xD7, 0xAA, 0xF4, 0xCD, 0xD3, 0xCD, 0xD6, /* 0xB0-0xB3 */ + 0xCD, 0xD4, 0xAA, 0xF2, 0xAA, 0xF5, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB9, /* 0xBC-0xBF */ + 0x00, 0x00, 0xAD, 0xA7, 0x00, 0x00, 0xAD, 0xA8, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD0, 0xBB, 0x00, 0x00, 0xD0, 0xBD, /* 0xC4-0xC7 */ + 0xD0, 0xBF, 0x00, 0x00, 0xAD, 0xA5, 0xD0, 0xBE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xA6, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD7, 0xEE, 0xD0, 0xBA, 0xD3, 0xF2, 0xD3, 0xFB, /* 0xD4-0xD7 */ + 0xD3, 0xF9, 0xD3, 0xF4, 0xD3, 0xF5, 0xD3, 0xFA, /* 0xD8-0xDB */ + 0xD3, 0xFC, 0xB0, 0x71, 0x00, 0x00, 0xD3, 0xF7, /* 0xDC-0xDF */ + 0xD3, 0xF3, 0xB0, 0x70, 0xB0, 0x72, 0xD3, 0xF6, /* 0xE0-0xE3 */ + 0xD3, 0xFD, 0xD3, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB3, 0xA1, 0xD7, 0xF1, 0xD7, 0xE9, 0xD7, 0xEF, /* 0xE8-0xEB */ + 0xD7, 0xF0, 0xB3, 0xA2, 0x00, 0x00, 0xD7, 0xE8, /* 0xEC-0xEF */ + 0xD7, 0xEA, 0xD0, 0xB7, 0xD7, 0xEC, 0xD7, 0xED, /* 0xF0-0xF3 */ + 0xD7, 0xEB, 0xB6, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDC, 0x56, 0xEB, 0xD4, 0xDC, 0x57, /* 0xF8-0xFB */ + 0xDC, 0x54, 0xB3, 0xA3, 0xB6, 0x6E, 0xDC, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0xDC, 0x59, 0xDC, 0x58, 0xB6, 0x6B, 0xDC, 0x5C, /* 0x00-0x03 */ + 0xDC, 0x52, 0xDC, 0x5B, 0xDC, 0x50, 0xDC, 0x5A, /* 0x04-0x07 */ + 0xDC, 0x55, 0xB6, 0x6D, 0x00, 0x00, 0xE0, 0xAA, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xA5, 0xE0, 0xAB, 0xE0, 0xA6, /* 0x0C-0x0F */ + 0xE0, 0xA4, 0xE0, 0xA7, 0xB9, 0x51, 0x00, 0x00, /* 0x10-0x13 */ + 0xE0, 0xA9, 0x00, 0x00, 0xE0, 0xA8, 0xB9, 0x52, /* 0x14-0x17 */ + 0xBB, 0xC1, 0xBB, 0xC0, 0xE4, 0x6E, 0xE4, 0x71, /* 0x18-0x1B */ + 0xE4, 0x69, 0xE4, 0x6D, 0xBB, 0xC2, 0xE4, 0x6C, /* 0x1C-0x1F */ + 0xE4, 0x6A, 0xE4, 0x70, 0xE4, 0x6B, 0xE4, 0x68, /* 0x20-0x23 */ + 0xE4, 0x6F, 0x00, 0x00, 0xE8, 0x59, 0xBE, 0x48, /* 0x24-0x27 */ + 0xF1, 0x4A, 0xE8, 0x56, 0xE8, 0x57, 0xE8, 0x55, /* 0x28-0x2B */ + 0xDC, 0x51, 0xBE, 0x47, 0xE8, 0x5A, 0xE8, 0x54, /* 0x2C-0x2F */ + 0xBE, 
0x46, 0xBE, 0x49, 0xE8, 0x58, 0xEB, 0xD5, /* 0x30-0x33 */ + 0xBF, 0xF3, 0xEB, 0xD6, 0xEB, 0xD7, 0x00, 0x00, /* 0x34-0x37 */ + 0xEE, 0xC4, 0xC1, 0xDD, 0xF1, 0x4B, 0xF1, 0x4C, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0x4D, 0xF3, 0x5D, /* 0x3C-0x3F */ + 0xF3, 0x5C, 0xF4, 0xE2, 0x00, 0x00, 0xF4, 0xE1, /* 0x40-0x43 */ + 0xF6, 0x5B, 0xF6, 0x5C, 0xF6, 0x5A, 0xF7, 0x66, /* 0x44-0x47 */ + 0xC5, 0xB0, 0xA8, 0xBB, 0xAD, 0xAA, 0xAD, 0xA9, /* 0x48-0x4B */ + 0xB0, 0x75, 0xB0, 0x74, 0xD4, 0x40, 0xD4, 0x41, /* 0x4C-0x4F */ + 0xD3, 0xFE, 0x00, 0x00, 0xB0, 0x73, 0xD7, 0xF5, /* 0x50-0x53 */ + 0x00, 0x00, 0xD7, 0xF6, 0xD7, 0xF2, 0xB3, 0xA4, /* 0x54-0x57 */ + 0xD7, 0xF3, 0x00, 0x00, 0xD7, 0xF4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0x5F, /* 0x5C-0x5F */ + 0xDC, 0x61, 0xDC, 0x5D, 0xDC, 0x60, 0xB6, 0x6F, /* 0x60-0x63 */ + 0xDC, 0x5E, 0xB6, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDD, 0x73, 0xB9, 0x55, 0xB9, 0x54, 0x00, 0x00, /* 0x68-0x6B */ + 0xB9, 0x53, 0x00, 0x00, 0xE0, 0xAC, 0xE0, 0xAD, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0xE4, 0x75, /* 0x70-0x73 */ + 0xBB, 0xC6, 0xBB, 0xC3, 0x00, 0x00, 0xBB, 0xC5, /* 0x74-0x77 */ + 0xBB, 0xC4, 0xE4, 0x74, 0xE4, 0x72, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE8, 0x61, 0xE8, 0x5E, 0xE8, 0x5F, 0xBE, 0x4D, /* 0x80-0x83 */ + 0xE8, 0x60, 0xE8, 0x5B, 0xE8, 0x5C, 0xBE, 0x4A, /* 0x84-0x87 */ + 0x00, 0x00, 0xBE, 0x4B, 0xE8, 0x5D, 0xBE, 0x4C, /* 0x88-0x8B */ + 0x00, 0x00, 0xEB, 0xDB, 0x00, 0x00, 0xEB, 0xDC, /* 0x8C-0x8F */ + 0xEB, 0xD9, 0xEB, 0xDA, 0xBF, 0xF4, 0xEB, 0xD8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xEE, 0xC8, 0xEE, 0xC5, 0xEE, 0xC7, /* 0x98-0x9B */ + 0xC1, 0xE0, 0xEE, 0xCB, 0xC1, 0xDF, 0xEE, 0xC9, /* 0x9C-0x9F */ + 0xEE, 0xCC, 0xEE, 0xCA, 0xEE, 0xC6, 0xC1, 0xDE, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF1, 0x4F, 0x00, 0x00, 0xF1, 0x50, /* 0xA4-0xA7 */ + 0xF1, 0x4E, 0x00, 0x00, 0xF1, 0x52, 0xC2, 0xE5, /* 0xA8-0xAB */ + 0xC2, 0xE6, 0xF3, 0x5F, 0xC3, 0xE7, 0xF1, 0x51, /* 0xAC-0xAF */ + 0xF3, 0x5E, 0xC3, 0xE6, 0xF4, 0xE5, 0xF4, 0xE6, /* 0xB0-0xB3 */ + 0xC4, 0xBF, 0xF4, 0xE4, 0x00, 0x00, 0xF4, 0xE3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF6, 0x5D, 0xC5, 0x48, 0x00, 0x00, /* 0xB8-0xBB */ + 0xF8, 0x49, 0xF8, 0xC8, 0xF8, 0xC7, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC6, 0x43, 0xC6, 0x5D, 0xF8, 0xC9, 0xF9, 0x71, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xC6, 0x6F, 0xA8, 0xBC, 0xAA, 0xF6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xB9, 0x56, 0x00, 0x00, 0xC4, 0xC0, /* 0xC8-0xCB */ + 0xA8, 0xBD, 0xAD, 0xAB, 0xB3, 0xA5, 0xB6, 0x71, /* 0xCC-0xCF */ + 0xC2, 0xE7, 0xAA, 0xF7, 0x00, 0x00, 0xD0, 0xC1, /* 0xD0-0xD3 */ + 0xD0, 0xC0, 0xD4, 0x42, 0x00, 0x00, 0xB0, 0x78, /* 0xD4-0xD7 */ + 0xB0, 0x76, 0xB0, 0x7A, 0xD4, 0x44, 0x00, 0x00, /* 0xD8-0xDB */ + 0xB0, 0x79, 0xB0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0x43, 0xB3, 0xA8, /* 0xE0-0xE3 */ + 0xD7, 0xFC, 0x00, 0x00, 0xB3, 0xA7, 0xB3, 0xA9, /* 0xE4-0xE7 */ + 0xD8, 0x42, 0xB3, 0xAB, 0xD7, 0xFE, 0xD8, 0x40, /* 0xE8-0xEB */ + 0xD7, 0xF7, 0xB3, 0xAA, 0xD8, 0x43, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD7, 0xF9, 0x00, 0x00, 0xD7, 0xFA, /* 0xF0-0xF3 */ + 0xD7, 0xF8, 0xB3, 0xA6, 0x00, 0x00, 0xD8, 0x41, /* 0xF4-0xF7 */ + 0xD7, 0xFB, 0xD7, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDC, 0x6D, 0x00, 0x00, 0xDC, 0x6C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0xDC, 0x6A, 0xDC, 0x62, 0xDC, 0x71, 0xDC, 0x65, /* 0x00-0x03 */ + 0xDC, 
0x6F, 0xDC, 0x76, 0xDC, 0x6E, 0xB6, 0x79, /* 0x04-0x07 */ + 0x00, 0x00, 0xB6, 0x75, 0xDC, 0x63, 0x00, 0x00, /* 0x08-0x0B */ + 0xDC, 0x69, 0xB6, 0x77, 0x00, 0x00, 0xDC, 0x68, /* 0x0C-0x0F */ + 0xB6, 0x78, 0xB6, 0x7A, 0xDC, 0x6B, 0x00, 0x00, /* 0x10-0x13 */ + 0xB6, 0x72, 0xB6, 0x73, 0xDC, 0x77, 0xDC, 0x75, /* 0x14-0x17 */ + 0x00, 0x00, 0xDC, 0x74, 0xDC, 0x66, 0x00, 0x00, /* 0x18-0x1B */ + 0xDC, 0x72, 0x00, 0x00, 0xB6, 0x76, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x74, /* 0x20-0x23 */ + 0xDC, 0x73, 0xDC, 0x64, 0xDC, 0x67, 0xDC, 0x70, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE4, 0xBA, 0xE0, 0xB7, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0xB0, 0xE0, 0xC3, 0xE0, 0xCC, 0xE0, 0xB3, /* 0x30-0x33 */ + 0xB9, 0x61, 0x00, 0x00, 0xE0, 0xC0, 0xB9, 0x57, /* 0x34-0x37 */ + 0xB9, 0x59, 0xB9, 0x65, 0xE0, 0xB1, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB9, 0x5A, 0xB9, 0x5C, 0xB9, 0x66, /* 0x3C-0x3F */ + 0xB9, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xB9, 0x64, 0xE0, 0xB9, 0x00, 0x00, /* 0x44-0x47 */ + 0xE0, 0xAE, 0xB9, 0x62, 0xE0, 0xB8, 0xB9, 0x5E, /* 0x48-0x4B */ + 0xE0, 0xCA, 0xB9, 0x63, 0xE0, 0xC8, 0xE0, 0xBC, /* 0x4C-0x4F */ + 0xE0, 0xC6, 0xB9, 0x60, 0xE0, 0xAF, 0xE0, 0xC9, /* 0x50-0x53 */ + 0xE0, 0xC4, 0x00, 0x00, 0xE0, 0xCB, 0xB9, 0x58, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0x67, 0xB9, 0x5D, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE0, 0xBD, 0xE0, 0xC1, 0x00, 0x00, 0xE0, 0xC5, /* 0x60-0x63 */ + 0xB9, 0x5F, 0xE0, 0xB4, 0xE0, 0xB2, 0xE0, 0xBE, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xBB, 0xE0, 0xBA, 0x00, 0x00, 0xE0, 0xBF, /* 0x6C-0x6F */ + 0xE0, 0xC2, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x78, 0x00, 0x00, /* 0x74-0x77 */ + 0xBB, 0xC7, 0xE4, 0xA4, 0xE4, 0x7A, 0xBB, 0xCC, /* 0x78-0x7B */ + 0xBB, 0xD0, 0xE4, 0xAD, 0xE4, 0xB5, 0xE4, 0xA6, /* 0x7C-0x7F */ + + 0xBB, 0xC8, 0x00, 0x00, 0xE4, 0xAA, 0xE0, 0xB6, /* 0x80-0x83 */ + 0x00, 0x00, 0xBB, 0xC9, 0xE4, 0xB1, 0xE4, 0xB6, /* 0x84-0x87 */ + 0xE4, 0xAE, 0x00, 0x00, 0xE4, 0xB0, 0xE4, 0xB9, /* 0x88-0x8B */ + 0xE4, 0xB2, 0xE4, 0x7E, 0xE4, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBB, 0xD1, 0x00, 0x00, 0xBB, 0xCD, /* 0x90-0x93 */ + 0xE4, 0x7C, 0xE4, 0xAB, 0xBB, 0xCB, 0xE4, 0xA5, /* 0x94-0x97 */ + 0xBB, 0xCA, 0xE4, 0xB3, 0xE4, 0xA2, 0xE4, 0x79, /* 0x98-0x9B */ + 0xBB, 0xCE, 0xE4, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE4, 0x7B, 0xE4, 0xAF, 0xE4, 0xAC, 0xE4, 0xA7, /* 0xA0-0xA3 */ + 0xE4, 0x77, 0xE4, 0x76, 0xE4, 0xA1, 0xE4, 0xB4, /* 0xA4-0xA7 */ + 0xBB, 0xCF, 0xE4, 0xB7, 0xE4, 0x7D, 0xE4, 0xA3, /* 0xA8-0xAB */ + 0xBE, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0x5A, 0xBE, 0x55, /* 0xB0-0xB3 */ + 0xE8, 0xA4, 0xE8, 0xA1, 0xE8, 0x67, 0xBE, 0x50, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF9, 0xD7, 0x00, 0x00, 0xBE, 0x4F, /* 0xB8-0xBB */ + 0xBE, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE8, 0x65, 0xBE, 0x54, 0xE8, 0x71, 0xE8, 0x63, /* 0xC0-0xC3 */ + 0xE8, 0x64, 0xBE, 0x4E, 0xE8, 0xA3, 0xBE, 0x58, /* 0xC4-0xC7 */ + 0xE8, 0x74, 0xE8, 0x79, 0xE8, 0x73, 0xEB, 0xEE, /* 0xC8-0xCB */ + 0xE8, 0x6F, 0xE8, 0x77, 0xE8, 0x75, 0xE8, 0x68, /* 0xCC-0xCF */ + 0xE8, 0x62, 0xE8, 0x7D, 0xBE, 0x57, 0xE8, 0x7E, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE8, 0x78, 0x00, 0x00, 0xE8, 0x6D, /* 0xD4-0xD7 */ + 0xE8, 0x6B, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB 
*/ + 0x00, 0x00, 0xE8, 0x6E, 0xE8, 0x7B, 0xE8, 0x6A, /* 0xDC-0xDF */ + 0xE8, 0x7A, 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xBE, 0x53, 0x00, 0x00, 0xE8, 0x76, 0xE8, 0x7C, /* 0xE4-0xE7 */ + 0xE8, 0x72, 0xE8, 0x6C, 0xBE, 0x51, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, 0xE8, 0x70, /* 0xEC-0xEF */ + 0xBE, 0x59, 0xE8, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF4, /* 0xF4-0xF7 */ + 0xBF, 0xF7, 0xEB, 0xF3, 0xEB, 0xF0, 0xEC, 0x44, /* 0xF8-0xFB */ + 0xBF, 0xFB, 0x00, 0x00, 0xEC, 0x41, 0xEB, 0xF8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0xEC, 0x43, 0xEB, 0xE9, 0xEB, 0xF6, 0x00, 0x00, /* 0x00-0x03 */ + 0xBF, 0xFD, 0x00, 0x00, 0xEB, 0xE1, 0x00, 0x00, /* 0x04-0x07 */ + 0xEB, 0xDF, 0xEC, 0x42, 0x00, 0x00, 0xEC, 0x40, /* 0x08-0x0B */ + 0xEB, 0xFE, 0xEB, 0xED, 0xEB, 0xEC, 0xEB, 0xE2, /* 0x0C-0x0F */ + 0xC0, 0x40, 0x00, 0x00, 0xEB, 0xE8, 0xEB, 0xF2, /* 0x10-0x13 */ + 0xEB, 0xFD, 0xC0, 0x43, 0xEC, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0xC1, 0xE8, 0xC0, 0x45, 0xBF, 0xFE, 0xEB, 0xE6, /* 0x18-0x1B */ + 0x00, 0x00, 0xEB, 0xEF, 0xEB, 0xDE, 0xEB, 0xE0, /* 0x1C-0x1F */ + 0xBF, 0xF5, 0xC0, 0x42, 0xBF, 0xFA, 0xEB, 0xE7, /* 0x20-0x23 */ + 0xEB, 0xF7, 0xEB, 0xF1, 0xC0, 0x41, 0xEB, 0xDD, /* 0x24-0x27 */ + 0xC1, 0xE3, 0xEB, 0xF9, 0xEB, 0xFC, 0xBF, 0xFC, /* 0x28-0x2B */ + 0x00, 0x00, 0xEB, 0xEB, 0xC0, 0x44, 0xBF, 0xF9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xF8, /* 0x30-0x33 */ + 0xEB, 0xF5, 0xEB, 0xFB, 0xBF, 0xF6, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xE4, 0xEB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xEA, 0xEE, 0xD2, /* 0x44-0x47 */ + 0x00, 0x00, 0xEE, 0xD7, 0xC1, 0xE5, 0xC1, 0xE7, /* 0x48-0x4B */ + 0xEE, 0xDD, 0xC1, 0xE1, 0xEE, 0xEC, 0xEE, 0xE3, /* 0x4C-0x4F */ + 0xEE, 0xD8, 0xEE, 0xD9, 0xEE, 0xE2, 0x00, 0x00, /* 0x50-0x53 */ + 0xC1, 0xEE, 0xEE, 0xE1, 0xEE, 0xD1, 0xEE, 0xE0, /* 0x54-0x57 */ + 0xEE, 0xD4, 0xEE, 0xED, 0xC1, 0xED, 0xC1, 0xEB, /* 0x58-0x5B */ + 0xEE, 0xD5, 0x00, 0x00, 0xEE, 0xE8, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEE, 0xDA, 0xEE, 0xE7, 0x00, 0x00, 0xEE, 0xE9, /* 0x60-0x63 */ + 0xEE, 0xD0, 0xC1, 0xE6, 0x00, 0x00, 0xEE, 0xEA, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x68-0x6B */ + 0xC1, 0xEA, 0xEE, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xC1, 0xEC, 0xEE, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xC1, 0xE4, 0xEE, 0xD6, 0xEE, 0xE5, /* 0x74-0x77 */ + 0x00, 0x00, 0xEE, 0xDF, 0xEB, 0xE3, 0xEE, 0xE6, /* 0x78-0x7B */ + 0xEE, 0xD3, 0x00, 0x00, 0xC1, 0xE9, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEE, 0xEB, 0x00, 0x00, 0xC1, 0xE2, 0xEE, 0xCE, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xF1, 0x60, 0xF1, 0x59, 0xC2, 0xE9, 0x00, 0x00, /* 0x88-0x8B */ + 0xF1, 0x54, 0xF1, 0x63, 0xF1, 0x5B, 0xEE, 0xDC, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF1, 0x65, 0xF1, 0x55, 0x00, 0x00, /* 0x90-0x93 */ + 0xC2, 0xE8, 0xF1, 0x5F, 0xC2, 0xEA, 0xC2, 0xF2, /* 0x94-0x97 */ + 0xC2, 0xF0, 0xF1, 0x61, 0xC2, 0xF1, 0xF1, 0x57, /* 0x98-0x9B */ + 0x00, 0x00, 0xF1, 0x58, 0xF1, 0x5D, 0xF1, 0x62, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEE, 0xCD, 0xC2, 0xEB, 0xF1, 0x6A, /* 0xA0-0xA3 */ + 0xF1, 0x67, 0xF1, 0x6B, 0xF1, 0x5E, 0xF1, 0x5A, /* 0xA4-0xA7 */ + 0xF1, 0x68, 0xF3, 0x6A, 0xF1, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC2, 0xEE, 0x00, 0x00, 0xC2, 0xED, 0xEE, 0xCF, /* 0xAC-0xAF */ + 
0xC2, 0xEF, 0xF1, 0x64, 0xF1, 0x66, 0xC2, 0xEC, /* 0xB0-0xB3 */ + 0xF1, 0x69, 0xF1, 0x53, 0x00, 0x00, 0xF1, 0x56, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xF3, 0x73, 0x00, 0x00, 0xF3, 0x63, 0xC3, 0xEB, /* 0xC0-0xC3 */ + 0xF3, 0x71, 0x00, 0x00, 0x00, 0x00, 0xF3, 0x61, /* 0xC4-0xC7 */ + 0xC3, 0xEC, 0x00, 0x00, 0xF3, 0x6C, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF3, 0x68, 0xC3, 0xF1, 0xF3, 0x72, 0xF3, 0x62, /* 0xCC-0xCF */ + 0xF3, 0x65, 0xC3, 0xE9, 0xF3, 0x74, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF3, 0x6D, 0xF3, 0x70, 0xC3, 0xEF, 0xC3, 0xF4, /* 0xD4-0xD7 */ + 0xC3, 0xF2, 0xF3, 0x69, 0xF3, 0x64, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC3, 0xED, 0xC3, 0xEE, 0xF3, 0x60, 0xC3, 0xEA, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC3, 0xE8, 0xC3, 0xF0, 0xF3, 0x6F, /* 0xE0-0xE3 */ + 0xC3, 0xF3, 0x00, 0x00, 0xF3, 0x6B, 0xF3, 0x75, /* 0xE4-0xE7 */ + 0xC3, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF3, 0x67, 0x00, 0x00, 0xF3, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF4, 0xF3, 0xF5, 0x42, 0xF4, 0xF5, /* 0xF4-0xF7 */ + 0xF4, 0xFC, 0xF3, 0x66, 0xF4, 0xFA, 0xF4, 0xE9, /* 0xF8-0xFB */ + 0xF5, 0x40, 0xC4, 0xC3, 0xF4, 0xED, 0xF4, 0xFE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_94[512] = { + 0xF4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xC2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x44, 0xF4, 0xF6, /* 0x04-0x07 */ + 0x00, 0x00, 0xF4, 0xFB, 0xF4, 0xFD, 0xF4, 0xE7, /* 0x08-0x0B */ + 0xF5, 0x41, 0xF4, 0xF2, 0xF4, 0xF7, 0xF4, 0xEB, /* 0x0C-0x0F */ + 0xF4, 0xEF, 0xF5, 0x43, 0xF4, 0xF9, 0xF4, 0xE8, /* 0x10-0x13 */ + 0xF4, 0xEC, 0xF4, 0xEE, 0xF4, 0xF8, 0x00, 0x00, /* 0x14-0x17 */ + 0xC4, 0xC1, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF4, 0xF0, 0xF6, 0x61, 0xF6, 0x66, 0xC5, 0x4F, /* 0x28-0x2B */ + 0xF6, 0x68, 0x00, 0x00, 0xC5, 0x49, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF6, 0x64, 0xF6, 0x6A, 0xC5, 0x4E, 0xC5, 0x4A, /* 0x30-0x33 */ + 0x00, 0x00, 0xC5, 0x4B, 0xF6, 0x60, 0xF6, 0x67, /* 0x34-0x37 */ + 0xC5, 0x4D, 0xF6, 0x65, 0xC5, 0x4C, 0xF6, 0x5F, /* 0x38-0x3B */ + 0xF6, 0x63, 0xF6, 0x62, 0x00, 0x00, 0xF6, 0x5E, /* 0x3C-0x3F */ + 0xF6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xC5, 0xB1, 0xF7, 0x6D, 0xF7, 0x70, 0xF7, 0x6C, /* 0x44-0x47 */ + 0xF7, 0x6E, 0xF7, 0x6F, 0xF7, 0x69, 0xF7, 0x6A, /* 0x48-0x4B */ + 0xF7, 0x67, 0x00, 0x00, 0x00, 0x00, 0xF7, 0x6B, /* 0x4C-0x4F */ + 0xF7, 0x68, 0xC5, 0xB2, 0xC5, 0xB3, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF8, 0x4B, 0x00, 0x00, 0xF8, 0x4D, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF8, 0x4C, 0xF8, 0x4E, 0x00, 0x00, /* 0x5C-0x5F */ + 0xC5, 0xE0, 0x00, 0x00, 0xF8, 0x4A, 0xC5, 0xDF, /* 0x60-0x63 */ + 0xC5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xCB, 0xF8, 0xCC, 0xC6, 0x44, 0xF8, 0xCA, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0x53, 0xF9, 0x52, 0xF9, 0x54, /* 0x6C-0x6F */ + 0xC6, 0x5F, 0xF9, 0x55, 0xC6, 0x5E, 0xF9, 0x56, /* 0x70-0x73 */ + 0xF9, 0x72, 0xF9, 0x75, 0xF9, 0x74, 0xC6, 0x68, /* 0x74-0x77 */ + 0xF9, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xC6, 0x72, 0xC6, 0x70, 0xC6, 0x71, 0xC6, 0x77, /* 0x7C-0x7F */ + + 0xF9, 0xC0, 0xF9, 0xC1, 0xF9, 0xBF, 0xF9, 0xC9, /* 0x80-0x83 */ +}; + 
+static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xF8, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0x44, 0xDC, 0x78, /* 0x78-0x7B */ + 0xE8, 0xA5, 0xF3, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xAA, 0xF9, 0x00, 0x00, 0xAD, 0xAC, 0xB0, 0x7B, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0x45, 0x00, 0x00, /* 0x84-0x87 */ + 0xD8, 0x46, 0xB3, 0xAC, 0x00, 0x00, 0xB6, 0x7D, /* 0x88-0x8B */ + 0xDC, 0x7A, 0xDC, 0x79, 0xB6, 0xA3, 0xB6, 0x7C, /* 0x8C-0x8F */ + 0xDC, 0x7B, 0xB6, 0x7E, 0xB6, 0xA2, 0xB6, 0xA1, /* 0x90-0x93 */ + 0xB6, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xB9, 0x68, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, /* 0x98-0x9B */ + 0xE0, 0xCE, 0x00, 0x00, 0xE0, 0xCF, 0xE0, 0xCD, /* 0x9C-0x9F */ + 0x00, 0x00, 0xBB, 0xD2, 0x00, 0x00, 0xBB, 0xD5, /* 0xA0-0xA3 */ + 0xBB, 0xD7, 0xBB, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBB, 0xD3, 0xBB, 0xD4, 0x00, 0x00, 0xE8, 0xA7, /* 0xA8-0xAB */ + 0xE8, 0xA6, 0xBE, 0x5B, 0xE8, 0xA8, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE8, 0xA9, 0xBE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xEC, 0x4D, 0xEC, 0x4B, 0xEE, 0xF3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEC, 0x49, 0xEC, 0x4A, 0xC0, 0x46, /* 0xB8-0xBB */ + 0xEC, 0x46, 0xEC, 0x4E, 0xEC, 0x48, 0xEC, 0x4C, /* 0xBC-0xBF */ + 0xEE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xF1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEE, 0xF2, 0xC1, 0xF3, 0xEE, 0xEE, /* 0xC4-0xC7 */ + 0xC1, 0xF2, 0xEE, 0xF0, 0xC1, 0xEF, 0xC1, 0xF0, /* 0xC8-0xCB */ + 0xC1, 0xF1, 0xEC, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xC2, 0xF5, 0xF1, 0x6E, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xD0-0xD3 */ + 0xC2, 
0xF3, 0xC2, 0xF6, 0xC2, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0x77, 0xF3, 0x78, /* 0xD8-0xDB */ + 0xC3, 0xF6, 0x00, 0x00, 0xF5, 0x45, 0xF5, 0x47, /* 0xDC-0xDF */ + 0xF5, 0x46, 0xC4, 0xC4, 0xC5, 0x50, 0xF6, 0x6D, /* 0xE0-0xE3 */ + 0xF6, 0x6C, 0xF6, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xAA, 0xFA, 0x00, 0x00, 0xC9, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0xCA, 0x58, 0xA6, 0xE9, 0xCA, 0x56, 0xCA, 0x59, /* 0x20-0x23 */ + 0xCA, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCB, 0xAE, 0x00, 0x00, 0xA8, 0xC1, 0x00, 0x00, /* 0x28-0x2B */ + 0xA8, 0xC2, 0xCB, 0xB0, 0xA8, 0xBF, 0xCB, 0xAF, /* 0x2C-0x2F */ + 0xCB, 0xAD, 0xA8, 0xC0, 0xA8, 0xBE, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xCD, 0xD8, 0xCD, 0xDB, 0xAA, 0xFD, /* 0x38-0x3B */ + 0xCD, 0xDA, 0xCD, 0xD9, 0x00, 0x00, 0xAA, 0xFC, /* 0x3C-0x3F */ + 0xAA, 0xFB, 0x00, 0x00, 0xAB, 0x40, 0xCD, 0xDC, /* 0x40-0x43 */ + 0xAA, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC6, 0xAD, 0xAE, /* 0x48-0x4B */ + 0xAD, 0xAF, 0xAD, 0xB0, 0xD0, 0xC7, 0xD0, 0xC3, /* 0x4C-0x4F */ + 0xAD, 0xAD, 0xD0, 0xC4, 0x00, 0x00, 0xD0, 0xC5, /* 0x50-0x53 */ + 0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xB0, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA1, /* 0x58-0x5B */ + 0xD4, 0x45, 0xB0, 0xA2, 0xB0, 0xA5, 0xD4, 0x46, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB0, 0x7E, 0xB0, 0x7C, 0xB0, 0x7D, /* 0x60-0x63 */ + 0xB0, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xAD, 0xD8, 0x49, /* 0x68-0x6B */ + 0xB3, 0xB5, 0xD8, 0x48, 0x00, 0x00, 0xD8, 0x4B, /* 0x6C-0x6F */ + 0xB3, 0xB1, 0xD8, 0x4A, 0xB6, 0xAB, 0xB3, 0xAF, /* 0x70-0x73 */ + 0xB3, 0xB2, 0xB3, 0xAE, 0xB3, 0xB3, 0xB3, 0xB4, /* 0x74-0x77 */ + 0xB3, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xD8, 0x47, 0xB6, 0xA7, 0xDC, 0x7D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA2, /* 0x80-0x83 */ + 0xB6, 0xAC, 0xB6, 0xA8, 0xB6, 0xA9, 0xDC, 0x7C, /* 0x84-0x87 */ + 0xDC, 0x7E, 0xDC, 0xA1, 0xB6, 0xA4, 0xB6, 0xA6, /* 0x88-0x8B */ + 0x00, 0x00, 0xB6, 0xAA, 0xB6, 0xA5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0xD3, 0xE0, 0xD1, 0xE0, 0xD2, /* 0x90-0x93 */ + 0xB9, 0x6A, 0xB9, 0x6B, 0x00, 0x00, 0xE0, 0xD4, /* 0x94-0x97 */ + 0xB9, 0x69, 0xBB, 0xD8, 0x00, 0x00, 0xBB, 0xDA, /* 0x98-0x9B */ + 0xBB, 0xD9, 0x00, 0x00, 0xE4, 0xBB, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xBC, 0xE8, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x47, /* 0xA4-0xA7 */ + 0xC0, 0x48, 0xEC, 0x4F, 0xC0, 0x49, 0x00, 0x00, /* 0xA8-0xAB */ + 0xEE, 0xF6, 0x00, 0x00, 0xEE, 0xF4, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEE, 0xF5, 0xC1, 0xF4, 0x00, 0x00, 0xF1, 0x6F, /* 0xB0-0xB3 */ + 0xC3, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xC1, 0xF5, 0xAB, 0x41, 0x00, 0x00, 0xB0, 0xA6, /* 0xB8-0xBB */ + 0xD4, 0x47, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x4C, /* 0xBC-0xBF */ + 0xB3, 
0xB6, 0xB6, 0xAD, 0xDC, 0xA4, 0xDC, 0xA6, /* 0xC0-0xC3 */ + 0xB6, 0xAF, 0xB6, 0xAE, 0xB6, 0xB0, 0xB6, 0xB1, /* 0xC4-0xC7 */ + 0xDC, 0xA5, 0xB9, 0x6E, 0xB9, 0x6F, 0xB9, 0x6D, /* 0xC8-0xCB */ + 0xBB, 0xDB, 0xB9, 0x6C, 0xE0, 0xD5, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xDC, 0xE8, 0xAC, /* 0xD0-0xD3 */ + 0xEC, 0x50, 0xC0, 0x4A, 0xC1, 0xF6, 0xF1, 0x70, /* 0xD4-0xD7 */ + 0xF1, 0x74, 0xC2, 0xF9, 0xF1, 0x71, 0xC2, 0xFA, /* 0xD8-0xDB */ + 0xC2, 0xF8, 0xF1, 0x75, 0xC2, 0xFB, 0xF1, 0x73, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF3, 0x79, 0xC2, 0xF7, 0xC3, 0xF8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAB, 0x42, 0xB3, 0xB8, 0xB3, 0xB7, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xB2, /* 0xEC-0xEF */ + 0xDC, 0xA8, 0xDC, 0xA7, 0xB6, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE0, 0xD9, 0xB9, 0x73, 0xB9, 0x70, /* 0xF4-0xF7 */ + 0xE0, 0xD8, 0xB9, 0x72, 0xE0, 0xD6, 0xB9, 0x71, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE0, 0xD7, 0x00, 0x00, 0xE4, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_97[512] = { + 0xBB, 0xDD, 0x00, 0x00, 0xE8, 0xAF, 0x00, 0x00, /* 0x00-0x03 */ + 0xBE, 0x5D, 0xE8, 0xAD, 0xBE, 0x5E, 0xBE, 0x5F, /* 0x04-0x07 */ + 0xE8, 0xAE, 0xBE, 0x60, 0x00, 0x00, 0xEC, 0x51, /* 0x08-0x0B */ + 0x00, 0x00, 0xC0, 0x4E, 0xC0, 0x4B, 0xC0, 0x50, /* 0x0C-0x0F */ + 0xEC, 0x53, 0xC0, 0x4C, 0xEC, 0x52, 0xC0, 0x4F, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x4D, 0x00, 0x00, /* 0x14-0x17 */ + 0xEE, 0xF9, 0xEE, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xC1, 0xF7, 0xEE, 0xFA, 0xC1, 0xF8, 0xEE, 0xF8, /* 0x1C-0x1F */ + 0xEE, 0xF7, 0x00, 0x00, 0xF1, 0x77, 0xF1, 0x76, /* 0x20-0x23 */ + 0xC2, 0xFC, 0xF1, 0x78, 0xF3, 0x7E, 0xC3, 0xFA, /* 0x24-0x27 */ + 0xF3, 0x7D, 0xF3, 0x7A, 0xC3, 0xF9, 0xF3, 0x7B, /* 0x28-0x2B */ + 0xF3, 0x7C, 0x00, 0x00, 0xF5, 0x48, 0xF5, 0x49, /* 0x2C-0x2F */ + 0xC4, 0xC5, 0x00, 0x00, 0xC5, 0x53, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC5, 0x51, 0xC5, 0x52, 0xF6, 0x6F, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xC5, 0xB4, 0xC5, 0xB5, 0xF7, 0x71, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x45, 0xF8, 0xCF, /* 0x40-0x43 */ + 0xC6, 0x47, 0x00, 0x00, 0xF8, 0xCE, 0xF8, 0xD0, /* 0x44-0x47 */ + 0xC6, 0x46, 0xF9, 0x57, 0x00, 0x00, 0xF9, 0xAD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x43, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0x74, 0x00, 0x00, /* 0x54-0x57 */ + 0xE4, 0xBE, 0x00, 0x00, 0xE8, 0xB0, 0xC0, 0x51, /* 0x58-0x5B */ + 0xC0, 0x52, 0x00, 0x00, 0xAB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */ + 0xBE, 0x61, 0xC3, 0xFB, 0xAD, 0xB1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x53, 0x00, 0x00, /* 0x64-0x67 */ + 0xC5, 0xE2, 0xAD, 0xB2, 0xD8, 0x4D, 0x00, 0x00, /* 0x68-0x6B */ + 0xDC, 0xA9, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDC, 0xAA, 0x00, 0x00, 0xE0, 0xDD, 0xE0, 0xDA, /* 0x70-0x73 */ + 0xB9, 0x75, 0x00, 0x00, 0xB9, 0x76, 0xE0, 0xDB, /* 0x74-0x77 */ + 0xE0, 0xDC, 0x00, 0x00, 0xE4, 0xC0, 0xE4, 0xC5, /* 0x78-0x7B */ + 0xBB, 0xDE, 0xE4, 0xBF, 0xE4, 0xC1, 0xE4, 0xC8, /* 0x7C-0x7F */ + + 0xE4, 0xC3, 0xE4, 0xC7, 0xE4, 0xC4, 0xE4, 0xC2, /* 0x80-0x83 */ + 0xE4, 0xC6, 0xBB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB1, 0xBE, 0x63, /* 0x88-0x8B */ + 0x00, 0x00, 0xBE, 0x62, 0xE8, 0xB2, 0xBE, 0x64, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xEC, 
0x56, 0x00, 0x00, 0x00, 0x00, 0xEC, 0x55, /* 0x94-0x97 */ + 0xC0, 0x54, 0xEC, 0x54, 0xEE, 0xFC, 0x00, 0x00, /* 0x98-0x9B */ + 0xEE, 0xFE, 0xEF, 0x41, 0xEF, 0x40, 0x00, 0x00, /* 0x9C-0x9F */ + 0xC1, 0xF9, 0xEE, 0xFD, 0xF1, 0xA1, 0xC2, 0xFD, /* 0xA0-0xA3 */ + 0xF1, 0x7D, 0xF1, 0xA2, 0xC2, 0xFE, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF1, 0x7B, 0x00, 0x00, 0xF1, 0x7E, 0xF1, 0x7C, /* 0xA8-0xAB */ + 0xF1, 0x79, 0xC3, 0x40, 0xF1, 0x7A, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA3, 0xF3, 0xA2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF5, 0x4A, 0x00, 0x00, 0xF5, 0x4B, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0x70, /* 0xBC-0xBF */ + 0x00, 0x00, 0xC5, 0xB7, 0x00, 0x00, 0xC5, 0xB6, /* 0xC0-0xC3 */ + 0xF8, 0x4F, 0xF8, 0x50, 0xC6, 0x48, 0xF8, 0xD1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC6, 0x69, 0x00, 0x00, 0xAD, 0xB3, /* 0xC8-0xCB */ + 0xB6, 0xB4, 0xE4, 0xCA, 0xE4, 0xC9, 0xE8, 0xB5, /* 0xCC-0xCF */ + 0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFA, /* 0xD0-0xD3 */ + 0xEF, 0x43, 0xEF, 0x42, 0xF1, 0xA5, 0xF1, 0xA3, /* 0xD4-0xD7 */ + 0xF1, 0xA6, 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC3, 0xFC, 0xF3, 0xA4, 0xF3, 0xA5, 0xF3, 0xA6, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF6, 0x71, 0x00, 0x00, 0xF7, 0x72, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF8, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEC, 0x57, 0xEF, 0x44, 0x00, 0x00, 0xAD, 0xB5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xE0, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEC, 0x58, 0xC3, 0x41, 0xF1, 0xA7, 0xC3, 0xFD, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF5, 0x4C, 0xF5, 0x4D, 0xC5, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xF8, 0x51, 0xAD, 0xB6, 0xB3, 0xBB, 0xB3, 0xBC, /* 0x00-0x03 */ + 0xD8, 0x4E, 0xB6, 0xB5, 0xB6, 0xB6, 0xDC, 0xAC, /* 0x04-0x07 */ + 0xB6, 0xB7, 0x00, 0x00, 0xB9, 0x7A, 0x00, 0x00, /* 0x08-0x0B */ + 0xB9, 0x7C, 0xE0, 0xDF, 0xE0, 0xE0, 0xE0, 0xDE, /* 0x0C-0x0F */ + 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x7B, 0xB9, 0x79, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, 0xBB, 0xE1, /* 0x14-0x17 */ + 0xBB, 0xE2, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBC, /* 0x18-0x1B */ + 0xBE, 0x67, 0xE8, 0xB7, 0xE8, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE8, 0xBB, 0xBE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xC0, 0x5B, 0x00, 0x00, 0xE8, 0xB8, 0xE8, 0xBD, /* 0x24-0x27 */ + 0xE8, 0xBA, 0xE8, 0xB9, 0x00, 0x00, 0xBE, 0x66, /* 0x28-0x2B */ + 0x00, 0x00, 0xC0, 0x59, 0x00, 0x00, 0xEC, 0x5A, /* 0x2C-0x2F */ + 0xC0, 0x55, 0x00, 0x00, 0xEC, 0x5B, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xEC, 0x59, 0x00, 0x00, 0xC0, 0x58, /* 0x34-0x37 */ + 0xC0, 0x56, 0xC0, 0x5A, 0x00, 0x00, 0xC0, 0x57, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEF, 0x45, 0x00, 0x00, 0xEF, 0x4A, /* 0x40-0x43 */ + 0xEF, 0x46, 0xEF, 0x49, 0xC1, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xED, 0xD4, 0xEF, 0x48, 0xEF, 0x47, 0x00, 0x00, /* 0x48-0x4B */ + 0xC3, 0x44, 0xC3, 0x42, 0xC3, 0x45, 0xC3, 0x43, /* 0x4C-0x4F */ + 0xF1, 0xA8, 0xF1, 0xA9, 0xF1, 0xAA, 0xC3, 0x46, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0x54-0x57 */ + 0xC4, 0x40, 0xF3, 0xA8, 0x00, 0x00, 0xC4, 0x41, /* 0x58-0x5B */ + 0xF3, 0xA7, 0xF3, 0xA9, 0xC3, 0xFE, 0xF5, 0x51, /* 0x5C-0x5F */ + 0xF5, 0x4E, 0x00, 0x00, 0xF5, 0x4F, 0xF5, 0x50, /* 0x60-0x63 */ + 0xF6, 0x72, 0xC5, 0x56, 0x00, 0x00, 0xC5, 0x55, /* 0x64-0x67 */ + 0x00, 0x00, 
0xF7, 0x74, 0xF7, 0x73, 0xC5, 0xB8, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xE3, /* 0x6C-0x6F */ + 0xC6, 0x49, 0xC6, 0x60, 0xF9, 0x58, 0xF9, 0xAE, /* 0x70-0x73 */ + 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xAD, 0xB7, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0xE1, 0xE4, 0xCC, 0xE4, 0xCD, 0xBB, 0xE3, /* 0xAC-0xAF */ + 0x00, 0x00, 0xBB, 0xE4, 0xE8, 0xBE, 0xBE, 0x68, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFC, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xAB, 0x00, 0x00, 0xC3, 0x47, 0xF3, 0xAD, /* 0xB8-0xBB */ + 0xC4, 0x42, 0xF3, 0xAC, 0xF3, 0xAE, 0xF3, 0xAB, /* 0xBC-0xBF */ + 0xF6, 0x75, 0xF5, 0x52, 0xF5, 0x53, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xC4, 0xC6, 0x00, 0x00, 0xF6, 0x74, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF6, 0x73, 0x00, 0x00, 0xF7, 0x75, /* 0xC8-0xCB */ + 0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB9, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA7, 0xD4, 0x48, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD8, 0x4F, 0x00, 0x00, 0xB6, 0xB8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xB6, 0xBB, 0xB6, 0xB9, 0xDC, 0xAE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xB6, 0xBD, 0x00, 0x00, 0xB6, 0xBA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xBC, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xB9, 0x7E, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0xE3, 0xE8, 0xC0, 0x00, 0x00, /* 0xF8-0xFB */ + 0xB9, 0x7D, 0xB9, 0xA1, 0xB9, 0xA2, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0xE4, 0xCF, 0x00, 0x00, 0xE4, 0xCE, 0xBB, 0xE5, /* 0x00-0x03 */ + 0x00, 0x00, 0xBB, 0xE6, 0x00, 0x00, 0xE4, 0xD0, /* 0x04-0x07 */ + 0xE8, 0xBF, 0xBB, 0xE8, 0xBE, 0x69, 0x00, 0x00, /* 0x08-0x0B */ + 0xBB, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC0, 0x5C, 0xE8, 0xC1, 0xBE, 0x6B, 0xBE, 0x6A, /* 0x10-0x13 */ + 0xE8, 0xC2, 0xE8, 0xC5, 0xE8, 0xC3, 0xE8, 0xC4, /* 0x14-0x17 */ + 0xBE, 0x6C, 0x00, 0x00, 0xC0, 0x61, 0xC0, 0x5F, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x5E, 0xEC, 0x5D, /* 0x1C-0x1F */ + 0x00, 0x00, 0xC0, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xEC, 0x5C, 0xEF, 0x4B, 0x00, 0x00, 0xEC, 0x5E, /* 0x24-0x27 */ + 0xC0, 0x5D, 0xEC, 0x5F, 0xEF, 0x4E, 0xEF, 0x4C, /* 0x28-0x2B */ + 0xEF, 0x4D, 0xEF, 0x52, 0xC3, 0x4B, 0xEF, 0x51, /* 0x2C-0x2F */ + 0xEF, 0x54, 0xEF, 0x53, 0xEF, 0x50, 0xEF, 0x4F, /* 0x30-0x33 */ + 0x00, 0x00, 0xC1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x38-0x3B */ + 0xF1, 0xAD, 
0xC3, 0x4A, 0xC3, 0x48, 0xC3, 0x49, /* 0x3C-0x3F */ + 0x00, 0x00, 0xF1, 0xAC, 0x00, 0x00, 0xF3, 0xB1, /* 0x40-0x43 */ + 0x00, 0x00, 0xC4, 0x43, 0x00, 0x00, 0xF3, 0xB0, /* 0x44-0x47 */ + 0xF3, 0xAF, 0xC4, 0x44, 0x00, 0x00, 0xF5, 0x58, /* 0x48-0x4B */ + 0xF5, 0x57, 0x00, 0x00, 0xF5, 0x55, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF5, 0x54, 0xC4, 0xC8, 0xC4, 0xC7, 0xF5, 0x59, /* 0x50-0x53 */ + 0xF7, 0x76, 0xC5, 0xB9, 0xF6, 0x77, 0xC5, 0x57, /* 0x54-0x57 */ + 0xF6, 0x76, 0xF5, 0x56, 0x00, 0x00, 0xF7, 0x77, /* 0x58-0x5B */ + 0xC5, 0xE4, 0x00, 0x00, 0xC6, 0x61, 0xF9, 0x59, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBA, 0xD8, 0x50, /* 0x94-0x97 */ + 0xEF, 0x55, 0xAD, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xE4, 0xD2, 0xE4, 0xD1, 0xEC, 0x60, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEF, 0x57, 0x00, 0x00, 0xEF, 0x56, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xC3, 0x4C, 0xF3, 0xB2, 0xF3, 0xB3, /* 0xA4-0xA7 */ + 0xC4, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB2, /* 0xA8-0xAB */ + 0xB0, 0xA8, 0xB6, 0xBF, 0xB6, 0xBE, 0xE0, 0xE4, /* 0xAC-0xAF */ + 0xE0, 0xE6, 0xB9, 0xA4, 0xE0, 0xE5, 0xB9, 0xA3, /* 0xB0-0xB3 */ + 0xB9, 0xA5, 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE4, 0xD4, 0xE4, 0xD6, 0xE4, 0xD5, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBB, 0xE9, 0xE4, 0xD7, 0xE4, 0xD3, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0xE8, 0xCF, /* 0xC8-0xCB */ + 0xE8, 0xD1, 0xE8, 0xC7, 0xE8, 0xCB, 0xE8, 0xC8, /* 0xCC-0xCF */ + 0xBE, 0x6E, 0xBE, 0x71, 0xBE, 0x73, 0xE8, 0xC9, /* 0xD0-0xD3 */ + 0xE8, 0xCA, 0xBE, 0x72, 0xE8, 0xCD, 0xE8, 0xD0, /* 0xD4-0xD7 */ + 0xE8, 0xCE, 0xBE, 0x74, 0x00, 0x00, 0xBE, 0x70, /* 0xD8-0xDB */ + 0xE8, 0xC6, 0xBE, 0x6D, 0x00, 0x00, 0xBE, 0x6F, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x63, 0xEC, 0x66, /* 0xE0-0xE3 */ + 0xEC, 0x64, 0xEC, 0x63, 0x00, 0x00, 0xEC, 0x69, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xEC, 0x68, 0xEC, 0x67, 0x00, 0x00, /* 0xE8-0xEB */ + 0xEC, 0x62, 0xC0, 0x62, 0xEC, 0x61, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEC, 0x65, 0xC0, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEF, 0x5A, 0x00, 0x00, 0xEF, 0x5E, 0xEF, 0x5B, /* 0xF4-0xF7 */ + 0xEF, 0x5D, 0xEF, 0x5C, 0xEF, 0x59, 0xEF, 0x5F, /* 0xF8-0xFB */ + 0xEF, 0x62, 0xEF, 0x60, 0xEF, 0x61, 0xC2, 0x40, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xC1, 0xFE, 0xEF, 0x58, 0xEF, 0x63, /* 0x00-0x03 */ + 0xF1, 0xB3, 0xF1, 0xB6, 0xF1, 0xB8, 0xF1, 0xB7, /* 0x04-0x07 */ + 0x00, 0x00, 0xF1, 0xB1, 0xF1, 0xB5, 0xF1, 0xB0, /* 0x08-0x0B */ + 0x00, 0x00, 0xF1, 0xB2, 0xC3, 0x4D, 0xF1, 0xAF, /* 0x0C-0x0F */ + 0x00, 0x00, 
0xF1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF3, 0xC0, 0xF3, 0xB5, 0xC4, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xC4, 0x46, 0xF3, 0xB4, 0xF3, 0xB9, /* 0x18-0x1B */ + 0xF3, 0xBF, 0xF3, 0xB7, 0xF3, 0xBE, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF3, 0xBB, 0x00, 0x00, 0xF3, 0xBA, 0xF3, 0xBD, /* 0x20-0x23 */ + 0xF3, 0xB8, 0xF3, 0xB6, 0x00, 0x00, 0xF3, 0xBC, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0x60, 0xF5, 0x5E, 0xC4, 0xCA, /* 0x28-0x2B */ + 0xF5, 0x5D, 0xF5, 0x63, 0xF5, 0x61, 0x00, 0x00, /* 0x2C-0x2F */ + 0xC4, 0xCB, 0xF5, 0x5C, 0xF5, 0x5A, 0x00, 0x00, /* 0x30-0x33 */ + 0xF5, 0x5B, 0xC4, 0xCD, 0xF5, 0x5F, 0xC4, 0xCC, /* 0x34-0x37 */ + 0xF5, 0x62, 0xF6, 0x78, 0xF6, 0x7E, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF6, 0x79, 0xC5, 0x5B, 0xF6, 0xA1, /* 0x3C-0x3F */ + 0xC5, 0x5A, 0xF6, 0x7D, 0xF6, 0x7C, 0xC5, 0x59, /* 0x40-0x43 */ + 0xF6, 0x7B, 0xC5, 0x58, 0xF6, 0x7A, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0x7D, 0xF7, 0xA1, 0xF7, 0x7E, 0x00, 0x00, /* 0x48-0x4B */ + 0xF7, 0x7B, 0xC5, 0xBB, 0xF7, 0x78, 0xF7, 0x7C, /* 0x4C-0x4F */ + 0xF7, 0xA3, 0x00, 0x00, 0xF7, 0xA2, 0xF7, 0x79, /* 0x50-0x53 */ + 0xF7, 0x7A, 0xC5, 0xBA, 0xF8, 0x52, 0xC5, 0xE7, /* 0x54-0x57 */ + 0x00, 0x00, 0xF8, 0x53, 0xC5, 0xE5, 0xC5, 0xE6, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0xC6, 0x4A, /* 0x5C-0x5F */ + 0xF9, 0x76, 0x00, 0x00, 0xC6, 0x6A, 0x00, 0x00, /* 0x60-0x63 */ + 0xF9, 0xB3, 0xC6, 0x6B, 0xF9, 0xB4, 0xF9, 0xB5, /* 0x64-0x67 */ + 0xF9, 0xC3, 0xF9, 0xC2, 0xC6, 0x7A, 0xF9, 0xCD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, 0xBB, 0xEA, /* 0xAC-0xAF */ + 0xBB, 0xEB, 0xE4, 0xDA, 0x00, 0x00, 0xE8, 0xD2, /* 0xB0-0xB3 */ + 0xEC, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x75, /* 0xB4-0xB7 */ + 0xC0, 0x65, 0xEC, 0x6A, 0x00, 0x00, 0xEC, 0x6D, /* 0xB8-0xBB */ + 0xC0, 0x66, 0x00, 0x00, 0xEF, 0x64, 0xEC, 0x6B, /* 0xBC-0xBF */ + 0xF1, 0xB9, 0xC3, 0x4E, 0xF3, 0xC1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x66, 0xF5, 0x64, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x65, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF6, 0xA2, 0x00, 0x00, 0xC5, 0x5C, /* 0xCC-0xCF */ + 0xF7, 0xA4, 0xC5, 0xEA, 0xC5, 0xBC, 0xC5, 0xE8, /* 0xD0-0xD3 */ + 0xC5, 0xE9, 0xF8, 0xD4, 0xC6, 0x62, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xB0, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF1, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xD4, 0x49, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB9, 0xA6, 0x00, 0x00, 0xE4, 0xDB, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xEC, 0xE4, 0xDC, /* 0xE4-0xE7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, /* 0xE8-0xEB */ + 0xE8, 0xD3, 0xC0, 0x68, 0xBE, 0x76, 0xBE, 0x77, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE8, 0xD7, 0xE8, 0xD6, 0xE8, 0xD5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0x6E, 0xEC, 0x71, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEC, 0x70, 0xEC, 0x6F, 0xC0, 0x67, /* 0xF8-0xFB */ + 0xEF, 0x68, 0xEF, 0x66, 0xEF, 0x65, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0xEF, 0x67, 0x00, 0x00, 0xC3, 0x4F, /* 0x00-0x03 */ + 0xF1, 0xBC, 0xF1, 0xBD, 0xC3, 0x50, 0x00, 0x00, /* 0x04-0x07 */ + 0xF1, 0xBB, 0x00, 0x00, 0xF3, 0xC3, 0xF3, 0xC2, /* 0x08-0x0B */ + 0xF3, 0xC5, 0xC4, 0x47, 0xF3, 0xC4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0x67, 0xF5, 0x69, 0xF5, 0x68, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xF6, 0xA3, 0xF6, 0xA6, 0xF6, 0xA4, /* 0x14-0x17 */ + 0xF6, 0xA5, 0xF7, 0xA5, 0xC5, 0xBD, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0x54, 0xF8, 0x55, /* 0x1C-0x1F */ + 0xF8, 0x56, 0x00, 0x00, 0xC6, 0x4B, 0xC6, 0x63, /* 0x20-0x23 */ + 0xF9, 0xB6, 0xB0, 0xAB, 0x00, 0x00, 0xBE, 0x78, /* 0x24-0x27 */ + 0xC0, 0x69, 0xF1, 0xBE, 0x00, 0x00, 0xF7, 0xA6, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0xD4, 0x4A, /* 0x2C-0x2F */ + 0x00, 0x00, 0xC6, 0x7B, 0xB0, 0xAC, 0xEC, 0x72, /* 0x30-0x33 */ + 0x00, 0x00, 0xF1, 0xBF, 0x00, 0x00, 0xF3, 0xC6, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0xF7, 0xA7, /* 0x38-0x3B */ + 0xB0, 0xAD, 0x00, 0x00, 0xE4, 0xDD, 0xE4, 0xDE, /* 0x3C-0x3F */ + 0x00, 0x00, 0xBB, 0xED, 0xBB, 0xEE, 0xE8, 0xD9, /* 0x40-0x43 */ + 0xBE, 0x7A, 0xBE, 0x79, 0xE8, 0xD8, 0x00, 0x00, /* 0x44-0x47 */ + 0xEF, 0x69, 0x00, 0x00, 0xF1, 0xC0, 0xF1, 0xC2, /* 0x48-0x4B */ + 0xF1, 0xC1, 0xC3, 0x53, 0xC3, 0x52, 0xC3, 0x51, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC5, 0x5E, 0xF6, 0xA8, 0x00, 0x00, /* 0x50-0x53 */ + 0xC5, 0x5D, 0xF7, 0xA9, 0xF7, 0xA8, 0x00, 0x00, /* 0x54-0x57 */ + 0xC6, 0x4C, 0xF8, 0xD5, 0xB3, 0xBD, 0xE0, 0xEA, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0x5C-0x5F */ + 0xE4, 0xDF, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xE2, 0x00, 0x00, 0xE8, 0xDD, 0xE8, 0xDA, /* 0x64-0x67 */ + 0xE8, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x7C, /* 0x6C-0x6F */ + 0xE8, 0xE0, 0xE8, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE8, 0xDB, 0xE8, 0xDF, 0xE8, 0xDE, 0xBE, 0x7B, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0x7D, 0xEC, 0x78, /* 0x78-0x7B */ + 0xEC, 0x76, 0xEC, 0xA1, 0xEC, 0x77, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEC, 0x73, 0x00, 0x00, 0xEC, 0x79, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xEC, 0x74, 0xEF, 0x72, 0xEC, 0x75, /* 0x84-0x87 */ + 0xEC, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEC, 0x7C, 0xC0, 0x6A, 0xEC, 0x7B, 0xEC, 0x7A, /* 0x90-0x93 */ + 0x00, 0x00, 0xEC, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x6A, 0xEF, 0x6D, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x6C, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEF, 0x74, 0xEF, 0x6F, 0xEF, 0x73, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEF, 0x71, 0xEF, 0x70, 0xEF, 0x6E, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xEF, 0x6B, 0x00, 0x00, 0xC2, 0x43, 0xC2, 0x42, /* 0xA8-0xAB */ + 0x00, 0x00, 0xC2, 0x44, 0xC2, 0x41, 0xEF, 0x75, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF1, 0xC8, 0xF1, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xC9, 0xF1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 
0x00, 0x00, 0xF1, 0xCE, 0x00, 0x00, 0xF1, 0xC6, /* 0xBC-0xBF */ + 0xC3, 0x58, 0xF1, 0xC7, 0x00, 0x00, 0xF1, 0xC5, /* 0xC0-0xC3 */ + 0xF1, 0xCC, 0x00, 0x00, 0xF1, 0xC4, 0xF1, 0xC3, /* 0xC4-0xC7 */ + 0xC3, 0x57, 0xC3, 0x55, 0xC3, 0x54, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCA, /* 0xD0-0xD3 */ + 0xF3, 0xCF, 0xF3, 0xD5, 0xC4, 0x4A, 0xF3, 0xD0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF3, 0xD3, 0xF3, 0xD7, 0xC4, 0x4B, /* 0xD8-0xDB */ + 0xF3, 0xD2, 0x00, 0x00, 0xF3, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF3, 0xC9, 0xF3, 0xD6, 0xF3, 0xCD, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF3, 0xCB, 0xF3, 0xD4, 0xF3, 0xCC, 0xC4, 0x49, /* 0xE4-0xE7 */ + 0xC4, 0x48, 0x00, 0x00, 0xF3, 0xC7, 0xF3, 0xC8, /* 0xE8-0xEB */ + 0xF3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF3, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6C, /* 0xF4-0xF7 */ + 0xF5, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xC3, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xF5, 0x6D, 0xF5, 0x73, 0xF5, 0x71, /* 0x04-0x07 */ + 0xF5, 0x6B, 0xF5, 0x76, 0x00, 0x00, 0xF5, 0x6A, /* 0x08-0x0B */ + 0x00, 0x00, 0xC4, 0xCF, 0xF5, 0x72, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6E, 0xC4, 0xCE, /* 0x10-0x13 */ + 0xF5, 0x75, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x74, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xF6, 0xAB, 0xF6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF6, 0xB1, 0x00, 0x00, 0xF6, 0xAD, /* 0x20-0x23 */ + 0xF6, 0xB0, 0xC5, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF6, 0xAE, 0xF6, 0xAF, 0x00, 0x00, 0xF6, 0xA9, /* 0x28-0x2B */ + 0xF6, 0xAC, 0xC5, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xC5, 0xBF, 0xF7, 0xB4, 0xF7, 0xAF, /* 0x30-0x33 */ + 0xF7, 0xB3, 0x00, 0x00, 0xF7, 0xB6, 0xF7, 0xB2, /* 0x34-0x37 */ + 0x00, 0x00, 0xF7, 0xAE, 0x00, 0x00, 0xC5, 0xC1, /* 0x38-0x3B */ + 0xF7, 0xB1, 0xF7, 0xB5, 0xC5, 0xC0, 0xF7, 0xAC, /* 0x3C-0x3F */ + 0xF5, 0x70, 0xF7, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF7, 0xAD, 0x00, 0x00, 0xF7, 0xAA, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0xAB, 0xC5, 0xBE, 0xF8, 0x5A, 0xF8, 0x5C, /* 0x48-0x4B */ + 0xF8, 0x5F, 0xF8, 0x5B, 0xF8, 0x60, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF8, 0x59, 0x00, 0x00, 0xF8, 0x57, 0x00, 0x00, /* 0x50-0x53 */ + 0xC5, 0xEB, 0xF8, 0x5D, 0xC5, 0xED, 0xC5, 0xEC, /* 0x54-0x57 */ + 0xF8, 0x58, 0xF8, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDA, 0xC6, 0x4D, /* 0x5C-0x5F */ + 0xF8, 0xDB, 0x00, 0x00, 0xF8, 0xD9, 0xF8, 0xD6, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, 0xF8, 0xD7, /* 0x64-0x67 */ + 0xF9, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0x5C, 0xF9, 0x5B, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF9, 0x79, 0x00, 0x00, 0xF9, 0x78, /* 0x70-0x73 */ + 0xF9, 0x77, 0xF9, 0x7A, 0x00, 0x00, 0xC6, 0x73, /* 0x74-0x77 */ + 0xC6, 0x74, 0xF9, 0xCA, 0xF9, 0xCE, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xB3, 0xBE, 0xDC, 0xAF, 0xE0, 0xED, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xB9, 0xA7, 0xE0, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE0, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE4, 0xE2, 0xE4, 0xE3, 0xBB, 0xF1, /* 0xF0-0xF3 */ + 0xBB, 0xEF, 0xE4, 0xE4, 0xBB, 0xF0, 0xE8, 0xE8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE8, 0xEB, 0xE8, 0xE5, 0xE8, 0xEC, /* 0xF8-0xFB */ + 0xE8, 0xE4, 0xE8, 0xE6, 0x00, 0x00, 0xE8, 0xE7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9D[512] = { + 0xE8, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA1, /* 0x00-0x03 */ + 0xE8, 0xEF, 0xE8, 0xEE, 0xBE, 0x7D, 0xE8, 0xE9, /* 0x04-0x07 */ + 0xE8, 0xED, 0xBE, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xEC, 0xAC, 0x00, 0x00, 0xC0, 0x6F, 0x00, 0x00, /* 0x10-0x13 */ + 0xEC, 0xA7, 0xC0, 0x6B, 0x00, 0x00, 0xEC, 0xA4, /* 0x14-0x17 */ + 0xEC, 0xAA, 0xEC, 0xAD, 0x00, 0x00, 0xC0, 0x70, /* 0x18-0x1B */ + 0x00, 0x00, 0xEC, 0xA9, 0xEC, 0xA6, 0xEC, 0xAE, /* 0x1C-0x1F */ + 0xEC, 0xA5, 0x00, 0x00, 0xEC, 0xAB, 0xC0, 0x6C, /* 0x20-0x23 */ + 0x00, 0x00, 0xEC, 0xA3, 0xC0, 0x6D, 0x00, 0x00, /* 0x24-0x27 */ + 0xC0, 0x6E, 0xEC, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xA9, 0xEF, 0x7A, 0xEF, 0x7B, /* 0x2C-0x2F */ + 0xEF, 0x7E, 0xEF, 0x7C, 0x00, 0x00, 0xEF, 0x76, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x79, 0xEF, 0xA5, /* 0x34-0x37 */ + 0xEF, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x45, /* 0x38-0x3B */ + 0x00, 0x00, 0xEF, 0xA7, 0xEF, 0xA4, 0xC2, 0x46, /* 0x3C-0x3F */ + 0xEF, 0xA6, 0xEF, 0x77, 0xEF, 0xA2, 0xEF, 0xA3, /* 0x40-0x43 */ + 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD2, 0xF1, 0xD4, /* 0x48-0x4B */ + 0xF1, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC3, 0x59, 0xF1, 0xD9, 0xF1, 0xD0, /* 0x50-0x53 */ + 0xF1, 0xDA, 0x00, 0x00, 0xF1, 0xD6, 0xF1, 0xD8, /* 0x54-0x57 */ + 0xF1, 0xDC, 0xF1, 0xD5, 0xF1, 0xDD, 0xF1, 0xD3, /* 0x58-0x5B */ + 0xF1, 0xCF, 0xC3, 0x5A, 0x00, 0x00, 0xF1, 0xDB, /* 0x5C-0x5F */ + 0xC3, 0x5B, 0xC4, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0x78, /* 0x64-0x67 */ + 0xF3, 0xF1, 0xF3, 0xE8, 0xC4, 0x4F, 0xF3, 0xE4, /* 0x68-0x6B */ + 0xC4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xED, /* 0x6C-0x6F */ + 0xF3, 0xE7, 0xF3, 0xDD, 0xC4, 0x4E, 0xF3, 0xEA, /* 0x70-0x73 */ + 0xF3, 0xE5, 0xF3, 0xE6, 0x00, 0x00, 0xF3, 0xD8, /* 0x74-0x77 */ + 0xF3, 0xDF, 0xF3, 0xEE, 0x00, 0x00, 0xF3, 0xEB, /* 0x78-0x7B */ + 0x00, 0x00, 0xF3, 0xE3, 0x00, 0x00, 0xF3, 0xEF, /* 0x7C-0x7F */ + + 0xF3, 0xDE, 0xF3, 0xD9, 0xF3, 0xEC, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xDB, 0xF3, 0xE9, 0xF3, 0xE0, 0xF3, 0xF0, /* 0x84-0x87 */ + 0xF3, 0xDC, 0xC4, 0x4C, 0xF3, 0xDA, 0xF3, 0xE1, /* 0x88-0x8B */ + 0xF3, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xF5, 0x7D, 0x00, 0x00, 0xF5, 0x7B, 0x00, 0x00, /* 0x90-0x93 */ + 0xF5, 0xA2, 0x00, 0x00, 0xF5, 0xAE, 0xF5, 0xA5, /* 0x94-0x97 */ + 0xF5, 0x7C, 0xF5, 0x78, 0xF5, 0xA7, 0xF5, 0x7E, /* 0x98-0x9B */ + 0xF5, 0xA3, 0xF5, 0x7A, 0xF5, 0xAA, 0xF5, 0x77, /* 0x9C-0x9F */ + 0xF5, 0xA1, 0xF5, 0xA6, 0xF5, 0xA8, 0xF5, 0xAB, /* 0xA0-0xA3 */ + 0xF5, 0x79, 0x00, 0x00, 0xF5, 0xAF, 0xF5, 0xB0, /* 0xA4-0xA7 */ + 0xF5, 0xA9, 0xF5, 0xAD, 0xF5, 0xA4, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF6, 0xC1, 0xF6, 0xC4, 0x00, 0x00, 0xC5, 0x61, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF6, 0xC3, 0xF6, 0xC8, 0xF6, 0xC6, /* 0xB0-0xB3 */ + 0xC5, 0x62, 0xF6, 0xBD, 0xF6, 0xB3, 0xF6, 0xB2, /* 0xB4-0xB7 */ + 0xC5, 0x64, 0xF6, 0xBF, 0xF6, 0xC0, 0xF6, 0xBC, /* 0xB8-0xBB */ + 0xF6, 0xB4, 0x00, 0x00, 0xF6, 0xB9, 0xF5, 0xAC, /* 0xBC-0xBF */ + 0x00, 0x00, 0xF6, 0xB5, 0xC5, 0x63, 0xF6, 0xBB, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xBA, 0x00, 0x00, 0xF6, 0xB6, /* 0xC4-0xC7 */ + 0xF6, 0xC2, 0x00, 0x00, 0xF6, 0xB7, 0xF7, 0xBB, /* 0xC8-0xCB */ + 0xF6, 0xC5, 0xF6, 0xC7, 0xF6, 0xBE, 0xF6, 0xB8, /* 0xCC-0xCF */ + 0xF7, 0xBC, 0xF7, 0xBE, 0xF7, 0xB8, 0xC5, 0xC2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF7, 0xC5, 0xF7, 0xC3, 0xC5, 0xC3, /* 0xD4-0xD7 */ + 0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xBA, 0xF7, 0xB7, /* 0xD8-0xDB */ + 0xF7, 0xBD, 0xF7, 0xC6, 0xF7, 0xB9, 0xF7, 0xBF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF8, 0x69, 0xF8, 0x6E, 0xF8, 0x64, /* 0xE0-0xE3 */ + 0xF8, 0x67, 0xC5, 0xEE, 0xF8, 0x6B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF8, 0x72, 0xF7, 0xC0, 0x00, 0x00, 0xF8, 0x65, /* 0xE8-0xEB */ + 0xF8, 0x6F, 0xF8, 0x73, 0xF8, 0x6A, 0xF8, 0x63, /* 0xEC-0xEF */ + 0xF8, 0x6D, 0x00, 0x00, 0xF8, 0x6C, 0xF8, 0x71, /* 0xF0-0xF3 */ + 0xF8, 0x70, 0xF7, 0xC4, 0xF8, 0x68, 0xF8, 0x62, /* 0xF4-0xF7 */ + 0xF8, 0x66, 0xC6, 0x4E, 0xC6, 0x4F, 0xF8, 0x61, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF8, 0xE6, 0xF8, 0xDD, 0xF8, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0xF8, 0xE2, 0xF8, 0xE3, 0xF8, 0xDC, 0xF8, 0xDF, /* 0x00-0x03 */ + 0xF8, 0xE7, 0xF8, 0xE1, 0xF8, 0xE0, 0xF8, 0xDE, /* 0x04-0x07 */ + 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, 0xF9, 0x5D, /* 0x08-0x0B */ + 0x00, 0x00, 0xF9, 0x5E, 0x00, 0x00, 0xF9, 0x60, /* 0x0C-0x0F */ + 0xF9, 0x5F, 0xF9, 0x62, 0xF9, 0x61, 0xF9, 0x7C, /* 0x10-0x13 */ + 0xF9, 0x7B, 0xF9, 0xB7, 0x00, 0x00, 0xF9, 0xB8, /* 0x14-0x17 */ + 0x00, 0x00, 0xF9, 0xC5, 0xC6, 0x78, 0xC6, 0x7C, /* 0x18-0x1B */ + 0x00, 0x00, 0xF9, 0xCF, 0xC6, 0x7D, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xB3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xC4, 0xD0, 0xF6, 0xC9, 0x00, 0x00, /* 0x78-0x7B */ + 0xC6, 0x50, 0xC6, 0x51, 0x00, 0x00, 0xB3, 0xC0, /* 0x7C-0x7F */ + + 0xE0, 0xEE, 0x00, 0x00, 0xB9, 0xA8, 0xE8, 0xF0, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB0, 0xEC, 0xB1, /* 0x84-0x87 */ + 0xEC, 0xAF, 0xEF, 0xAB, 0xEF, 0xAA, 0xC2, 0x47, /* 0x88-0x8B */ + 0xF1, 0xDF, 0xEF, 0xAC, 0xF1, 0xDE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF3, 0xF3, 0xC4, 0x51, 0xC4, 0x53, /* 0x90-0x93 */ + 0xF3, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x52, /* 0x94-0x97 */ + 0x00, 0x00, 0xF5, 0xB1, 0xF5, 0xB3, 0xF5, 0xB2, /* 0x98-0x9B */ + 0xF6, 0xCA, 0xC5, 0x65, 0x00, 0x00, 0xC5, 0xEF, /* 0x9C-0x9F */ + 0xF8, 0xE8, 0xF9, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF9, 0xD2, 0xB3, 0xC1, 0x00, 0x00, 0xE4, 0xE5, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xBE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEC, 0xB3, 0xEC, 0xB2, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEF, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xC4, 0x54, 0xC4, 0xD1, 0xF7, 0xC7, 0xF9, 0xCB, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC2, /* 0xB8-0xBB */ + 0xBB, 0xF2, 0x00, 0x00, 0xBE, 0xA3, 0x00, 0x00, /* 0xBC-0xBF */ + 0xF3, 0xF4, 0x00, 0x00, 0xF8, 0x74, 0xB6, 0xC0, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xC6, 0x64, 0xB6, 0xC1, 0xBE, 0xA4, 0xC2, 0x48, /* 0xCC-0xCF */ + 0xF8, 0x75, 0xB6, 0xC2, 0x00, 0x00, 0xE8, 0xF1, /* 0xD0-0xD3 */ + 0xC0, 0x72, 0xEC, 0xB4, 0xEC, 0xB5, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC0, 0x71, 0x00, 0x00, 0xEF, 0xAF, 0xC2, 0x4C, /* 0xD8-0xDB */ + 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x49, 0xF1, 0xE0, /* 0xDC-0xDF */ + 0xC3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF5, 0xB5, 0xF5, 0xB4, 0xF5, 0xB7, 0xF5, 0xB6, /* 0xE4-0xE7 */ + 0xC4, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCB, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xCD, 0xF6, 0xCC, 0xC5, 0x66, /* 0xEC-0xEF */ + 0xF7, 0xC8, 0x00, 0x00, 0xF8, 0x76, 0xF8, 0x77, /* 0xF0-0xF3 */ + 0xC5, 0xF0, 0xF9, 0x64, 0xF9, 0x7D, 0xC6, 0x75, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDC, 0xB0, 0xEC, 0xB6, 0xEF, 0xB0, /* 0xF8-0xFB */ + 0xF3, 0xF5, 0xE0, 0xEF, 0x00, 0x00, 0xEF, 0xB1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0xF1, 0xE2, 0xF1, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0x78, 0xC6, 0x52, /* 0x04-0x07 */ + 0x00, 0x00, 0xF9, 0x65, 0xF9, 0x7E, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x00, 0x00, 0xB9, 0xA9, 0xE8, 0xF2, /* 0x0C-0x0F */ + 0xE8, 0xF3, 0x00, 0x00, 0xEC, 0xB7, 0xB9, 0xAA, /* 0x10-0x13 */ + 0x00, 0x00, 0xC3, 0x5D, 0xF1, 0xE3, 0x00, 0x00, /* 0x14-0x17 */ + 0xF6, 0xCF, 0xC5, 0x67, 0xF6, 0xD0, 0xF6, 0xCE, /* 0x18-0x1B */ + 0xF8, 0x79, 0x00, 0x00, 0xF8, 0xE9, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB9, 0xAB, 0x00, 0x00, 0xEF, 0xB4, 0xEF, 0xB3, /* 0x20-0x23 */ + 0xEF, 0xB2, 0xF1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF1, 0xE8, 0xF1, 0xE7, 0xF1, 0xE6, 0xF1, 0xE5, /* 0x28-0x2B */ + 0xC3, 0x5E, 0xF3, 0xF6, 0xF5, 0xB9, 0xC4, 0xD3, /* 0x2C-0x2F */ + 0xF5, 0xB8, 0xF6, 0xD1, 0xF7, 0xCB, 0xF7, 0xCA, /* 0x30-0x33 */ + 0xC5, 0xC4, 0xF7, 0xC9, 0xF8, 0x7C, 0xF8, 0x7B, /* 0x34-0x37 */ + 0xF8, 0x7A, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xF3, /* 0x38-0x3B */ + 0x00, 0x00, 0xEC, 0xB8, 0xC2, 0x4D, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF3, 0xF7, 0xF3, 0xF8, 0xF7, 0xCC, 0xF8, 0x7D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xEA, 0xF9, 0x66, /* 0x44-0x47 */ + 0xF9, 0xB9, 0xF9, 0xD4, 0xBB, 0xF4, 0xC2, 0x4E, /* 0x48-0x4B */ + 0xF1, 0xE9, 0xF3, 0xF9, 0xF6, 0xD2, 0xF8, 0x7E, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA6, 0x00, 0x00, /* 0x50-0x53 */ + 0xEF, 0xB5, 0xF1, 0xEA, 0xF3, 0xFA, 0xF3, 0xFB, /* 0x54-0x57 */ + 0xF3, 0xFC, 0xF5, 0xBE, 0x00, 0x00, 0xF5, 0xBA, /* 0x58-0x5B */ + 0xC5, 0x68, 0xF5, 0xBD, 0xF5, 0xBC, 0xC4, 0xD4, /* 0x5C-0x5F */ + 0xF5, 0xBB, 0xC4, 0xD6, 0x00, 0x00, 0xC4, 0xD5, /* 0x60-0x63 */ + 0xF6, 0xD4, 0xF6, 0xD3, 0xC5, 0x69, 0xC5, 0x6A, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xC6, 0xF7, 0xCD, /* 0x68-0x6B */ + 0xC5, 0xC5, 0x00, 0x00, 0xF8, 0xA3, 0xF8, 0xA4, /* 0x6C-0x6F */ + 0xF8, 0xA2, 0xF8, 0xA1, 0xC6, 0x54, 0x00, 0x00, /* 0x70-0x73 */ + 0xF8, 0xEB, 0xF8, 0xEC, 0xF8, 0xED, 0xC6, 0x53, /* 0x74-0x77 */ + 0xF9, 0x67, 0xF9, 0x6A, 0xF9, 0x69, 0xF9, 0x68, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xC0, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xC3, 0x65, 0xF5, 0xBF, 0xF6, 0xD5, 0x00, 0x00, /* 0x90-0x93 */ + 0xC5, 0xC7, 0xF7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xF9, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xC0, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEF, 0xB6, 0x00, 0x00, 0xF7, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xB0, 0x5A, 0xA7, 0xF3, 0xA8, 0xAE, 0xB8, 0xEB, /* 0x00-0x03 */ + 0xB7, 0xC6, 0xA6, 0xEA, 0xA5, 0x79, 0xC0, 0x74, /* 0x04-0x07 */ + 0xC0, 0x74, 0xAB, 0xB4, 0xAA, 0xF7, 0xB3, 0xE2, /* 0x08-0x0B */ + 0xA9, 0x60, 0xC3, 0x69, 0xC4, 0xEE, 0xC3, 0xB9, /* 0x0C-0x0F */ + 0xC5, 0xDA, 0xC1, 0xB3, 0xBB, 0x72, 0xC5, 0xDE, /* 0x10-0x13 */ + 0xBC, 0xD6, 0xAC, 0xA5, 0xAF, 0x4F, 0xAF, 0x5F, /* 0x14-0x17 */ + 0xB8, 0xA8, 0xB9, 0x54, 0xC0, 0x64, 0xB6, 0xC3, /* 0x18-0x1B */ + 0xA7, 0x5A, 0xC4, 0xE6, 0xC4, 0xEA, 0xC4, 0xF5, /* 0x1C-0x1F */ + 0xC6, 0x7D, 0xB4, 0x50, 0xC0, 0xDD, 0xC2, 0xC5, /* 0x20-0x23 */ + 0xC4, 0xB0, 0xA9, 0xD4, 0xC3, 0xBE, 0xC4, 0xFA, /* 0x24-0x27 */ + 0xB4, 0x59, 0xAE, 0xD4, 0xAE, 0xF6, 0xAF, 0x54, /* 0x28-0x2B */ + 0x00, 0x00, 0xA8, 0xD3, 0xA7, 0x4E, 0xB3, 0xD2, /* 0x2C-0x2F */ + 0xBE, 0xDB, 0xC3, 0x72, 
0xC4, 0x6C, 0xBF, 0x63, /* 0x30-0x33 */ + 0xA6, 0xD1, 0xC4, 0xAA, 0xB8, 0xB8, 0xB8, 0xF4, /* 0x34-0x37 */ + 0xC5, 0x53, 0xBE, 0x7C, 0xC6, 0x4F, 0xB8, 0x4C, /* 0x38-0x3B */ + 0xB8, 0x53, 0xBA, 0xF1, 0xDB, 0x77, 0xBF, 0xFD, /* 0x3C-0x3F */ + 0xB3, 0xC0, 0xBD, 0xD7, 0xC3, 0x62, 0xA7, 0xCB, /* 0x40-0x43 */ + 0xC5, 0xA2, 0xC5, 0xA4, 0xA8, 0x63, 0xBD, 0x55, /* 0x44-0x47 */ + 0xB8, 0xEF, 0xB9, 0x70, 0xC2, 0x53, 0xB9, 0xF0, /* 0x48-0x4B */ + 0xBC, 0xD3, 0xB2, 0x5C, 0xBA, 0x7C, 0xB2, 0xD6, /* 0x4C-0x4F */ + 0xC1, 0x5C, 0xAD, 0xAE, 0xB0, 0xC7, 0xA6, 0xD8, /* 0x50-0x53 */ + 0xBB, 0xFE, 0xAD, 0xE2, 0xB8, 0x57, 0xBA, 0xF0, /* 0x54-0x57 */ + 0xB5, 0xD9, 0xB3, 0xAE, 0xC5, 0xAA, 0xCE, 0xD4, /* 0x58-0x5B */ + 0xBC, 0xD6, 0xBF, 0xD5, 0xA4, 0xA6, 0xB9, 0xE7, /* 0x5C-0x5F */ + 0xAB, 0xE3, 0xB2, 0x76, 0xB2, 0xA7, 0xA5, 0x5F, /* 0x60-0x63 */ + 0xED, 0xA8, 0xAB, 0x4B, 0xB4, 0x5F, 0xA4, 0xA3, /* 0x64-0x67 */ + 0xAA, 0x63, 0xBC, 0xC6, 0xAF, 0xC1, 0xB0, 0xD1, /* 0x68-0x6B */ + 0xB6, 0xEB, 0xAC, 0xD9, 0xB8, 0xAD, 0xBB, 0xA1, /* 0x6C-0x6F */ + 0xB1, 0xFE, 0xA8, 0xB0, 0xA8, 0x48, 0xAC, 0x42, /* 0x70-0x73 */ + 0xAD, 0x59, 0xB1, 0xB0, 0xB2, 0xA4, 0xAB, 0x47, /* 0x74-0x77 */ + 0xA8, 0xE2, 0x00, 0x00, 0xB1, 0xE7, 0xC2, 0xB3, /* 0x78-0x7B */ + 0xA8, 0x7D, 0xBD, 0xCC, 0xB6, 0x71, 0xC0, 0x79, /* 0x7C-0x7F */ + + 0xA7, 0x66, 0xA4, 0x6B, 0xC3, 0x66, 0xAE, 0xC8, /* 0x80-0x83 */ + 0xC2, 0x6F, 0xC4, 0x72, 0xBE, 0x5B, 0xC6, 0x7A, /* 0x84-0x87 */ + 0xC4, 0x52, 0xBE, 0xA4, 0xA4, 0x4F, 0xBE, 0xE4, /* 0x88-0x8B */ + 0xBE, 0xFA, 0xF7, 0x65, 0xA6, 0x7E, 0xBC, 0xA6, /* 0x8C-0x8F */ + 0xC5, 0xCA, 0xBC, 0xBF, 0xBA, 0xA7, 0xB7, 0xD2, /* 0x90-0x93 */ + 0xE6, 0xA3, 0x00, 0x00, 0xBD, 0x6D, 0xC1, 0x70, /* 0x94-0x97 */ + 0xBD, 0xFB, 0xBD, 0xAC, 0xB3, 0x73, 0xC1, 0xE5, /* 0x98-0x9B */ + 0xA6, 0x43, 0xA6, 0x48, 0xAB, 0x7C, 0xAF, 0x50, /* 0x9C-0x9F */ + 0xB5, 0xF5, 0xBB, 0xA1, 0xB7, 0x47, 0xA9, 0xC0, /* 0xA0-0xA3 */ + 0xB1, 0xC9, 0xC0, 0xD4, 0xC3, 0xAE, 0xC2, 0x79, /* 0xA4-0xA7 */ + 0xA5, 0x4F, 0xCB, 0xF1, 0xB9, 0xE7, 0xC0, 0xAD, /* 0xA8-0xAB */ + 0xCC, 0xB0, 0xAC, 0xC2, 0xBC, 0xFC, 0xB2, 0xDC, /* 0xAC-0xAF */ + 0xB2, 0xE2, 0xB9, 0x61, 0xB9, 0x73, 0xC6, 0x46, /* 0xB0-0xB3 */ + 0xBB, 0xE2, 0xA8, 0xD2, 0xC2, 0xA7, 0xC4, 0xBF, /* 0xB4-0xB7 */ + 0xC1, 0xF5, 0xB4, 0x63, 0xA4, 0x46, 0xB9, 0xB1, /* 0xB8-0xBB */ + 0xBC, 0x64, 0xA7, 0xBF, 0xAE, 0xC6, 0xBC, 0xD6, /* 0xBC-0xBF */ + 0xBF, 0x52, 0xC0, 0xF8, 0xE7, 0x64, 0xBF, 0xF1, /* 0xC0-0xC3 */ + 0xC0, 0x73, 0xB7, 0x77, 0xA8, 0xBF, 0xBC, 0x42, /* 0xC4-0xC7 */ + 0xCC, 0xD8, 0xAC, 0x68, 0xAC, 0x79, 0xB7, 0xC8, /* 0xC8-0xCB */ + 0xAF, 0x5B, 0xAF, 0x64, 0xB2, 0xB8, 0xAF, 0xC3, /* 0xCC-0xCF */ + 0xC3, 0xFE, 0xA4, 0xBB, 0xBC, 0xAE, 0xB3, 0xB0, /* 0xD0-0xD3 */ + 0xAD, 0xDB, 0xB1, 0x5B, 0xB2, 0x5F, 0xBD, 0xFC, /* 0xD4-0xD7 */ + 0xAB, 0xDF, 0xB7, 0x58, 0xAE, 0xDF, 0xB2, 0x76, /* 0xD8-0xDB */ + 0xB6, 0xA9, 0xA7, 0x51, 0xA6, 0x4F, 0xBC, 0x69, /* 0xDC-0xDF */ + 0xA9, 0xF6, 0xA7, 0xF5, 0xB1, 0xF9, 0xAA, 0x64, /* 0xE0-0xE3 */ + 0xB2, 0x7A, 0xB5, 0x67, 0xBF, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB8, 0xCC, 0xA8, 0xBD, 0xC2, 0xF7, 0xB0, 0xCE, /* 0xE8-0xEB */ + 0xB7, 0xC4, 0xA7, 0x5B, 0xBF, 0x4D, 0xBF, 0x5A, /* 0xEC-0xEF */ + 0xC4, 0xA9, 0x00, 0x00, 0xC5, 0xEC, 0xC5, 0xEF, /* 0xF0-0xF3 */ + 0xAA, 0x4C, 0xB2, 0x4F, 0xC1, 0x7B, 0xA5, 0xDF, /* 0xF4-0xF7 */ + 0xB2, 0xC1, 0xB2, 0xC9, 0xAA, 0xAC, 0xAA, 0xA5, /* 0xF8-0xFB */ + 0xC3, 0xD1, 0xA4, 0xB0, 0xAF, 0xF9, 0xA8, 0xEB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xA4, 0xC1, 0xAB, 0xD7, 0xA9, 0xDD, 0xBF, 0x7D, /* 0x00-0x03 */ + 0xA6, 0x76, 0xAC, 0x7D, 
0xBC, 0xC9, 0xBF, 0xE7, /* 0x04-0x07 */ + 0xA6, 0xE6, 0xAD, 0xB0, 0xA8, 0xA3, 0xB9, 0xF8, /* 0x08-0x0B */ + 0xC9, 0x4A, 0xDD, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xB6, 0xEF, 0x00, 0x00, 0xB4, 0xB8, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xF9, 0xBD, 0xDE, 0xAF, 0x71, /* 0x14-0x17 */ + 0x00, 0x00, 0xAF, 0xAB, 0xB2, 0xBB, 0xBA, 0xD6, /* 0x18-0x1B */ + 0xB9, 0x74, 0xBA, 0xEB, 0xA6, 0xD0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xD1, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xB6, 0x68, 0xB3, 0xA3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xBA, 0xB9, 0x7D, /* 0x28-0x2B */ + 0xC0, 0x5D, 0xC5, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FE[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0x4A, 0xA1, 0x57, 0x00, 0x00, 0xA1, 0x59, /* 0x30-0x33 */ + 0xA1, 0x5B, 0xA1, 0x5F, 0xA1, 0x60, 0xA1, 0x63, /* 0x34-0x37 */ + 0xA1, 0x64, 0xA1, 0x67, 0xA1, 0x68, 0xA1, 0x6B, /* 0x38-0x3B */ + 0xA1, 0x6C, 0xA1, 0x6F, 0xA1, 0x70, 0xA1, 0x73, /* 0x3C-0x3F */ + 0xA1, 0x74, 0xA1, 0x77, 0xA1, 0x78, 0xA1, 0x7B, /* 0x40-0x43 */ + 0xA1, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA1, 0xC6, 0xA1, 0xC7, 0xA1, 0xCA, /* 0x48-0x4B */ + 0xA1, 0xCB, 0xA1, 0xC8, 0xA1, 0xC9, 0xA1, 0x5C, /* 0x4C-0x4F */ + 0xA1, 0x4D, 0xA1, 0x4E, 0xA1, 0x4F, 0x00, 0x00, /* 0x50-0x53 */ + 0xA1, 0x51, 0xA1, 0x52, 0xA1, 0x53, 0xA1, 0x54, /* 0x54-0x57 */ + 0x00, 0x00, 0xA1, 0x7D, 0xA1, 0x7E, 0xA1, 0xA1, /* 0x58-0x5B */ + 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA4, 0xA1, 0xCC, /* 0x5C-0x5F */ + 0xA1, 0xCD, 0xA1, 0xCE, 0xA1, 0xDE, 0xA1, 0xDF, /* 0x60-0x63 */ + 0xA1, 0xE0, 0xA1, 0xE1, 0xA1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */ + 0xA2, 0x42, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x68-0x6B */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA1, 0x49, 0xA1, 0xA8, 0xA1, 0xAD, /* 0x00-0x03 */ + 0xA2, 0x43, 0xA2, 0x48, 0xA1, 0xAE, 0xA1, 0xA6, /* 0x04-0x07 */ + 0xA1, 0x5D, 0xA1, 0x5E, 0xA1, 0xAF, 0xA1, 0xCF, /* 0x08-0x0B */ + 0xA1, 0x41, 0xA1, 0xD0, 0xA1, 0x44, 0xA1, 0xFE, /* 0x0C-0x0F */ + 0xA2, 0xAF, 0xA2, 0xB0, 0xA2, 0xB1, 0xA2, 0xB2, /* 0x10-0x13 */ + 0xA2, 0xB3, 0xA2, 0xB4, 0xA2, 0xB5, 0xA2, 0xB6, /* 0x14-0x17 */ + 0xA2, 0xB7, 0xA2, 0xB8, 0xA1, 0x47, 0xA1, 0x46, /* 0x18-0x1B */ + 0xA1, 0xD5, 0xA1, 0xD7, 0xA1, 0xD6, 0xA1, 0x48, /* 0x1C-0x1F */ + 0xA2, 0x49, 0xA2, 0xCF, 0xA2, 0xD0, 0xA2, 0xD1, /* 0x20-0x23 */ + 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, 0xA2, 0xD5, /* 0x24-0x27 */ + 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, 0xA2, 0xD9, /* 0x28-0x2B */ + 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, 0xA2, 0xDD, /* 0x2C-0x2F */ + 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, 0xA2, 0xE1, /* 0x30-0x33 */ + 0xA2, 0xE2, 0xA2, 0xE3, 0xA2, 0xE4, 0xA2, 0xE5, /* 0x34-0x37 */ + 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, 0xA1, 
0x65, /* 0x38-0x3B */ + 0xA2, 0x40, 0xA1, 0x66, 0xA1, 0x73, 0xA1, 0xC4, /* 0x3C-0x3F */ + 0xA1, 0xA5, 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, /* 0x40-0x43 */ + 0xA2, 0xEC, 0xA2, 0xED, 0xA2, 0xEE, 0xA2, 0xEF, /* 0x44-0x47 */ + 0xA2, 0xF0, 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, /* 0x48-0x4B */ + 0xA2, 0xF4, 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, /* 0x4C-0x4F */ + 0xA2, 0xF8, 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, /* 0x50-0x53 */ + 0xA2, 0xFC, 0xA2, 0xFD, 0xA2, 0xFE, 0xA3, 0x40, /* 0x54-0x57 */ + 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, 0xA1, 0x61, /* 0x58-0x5B */ + 0xA1, 0x55, 0xA1, 0x62, 0xA1, 0xE3, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA2, 0x46, 0xA2, 0x47, 0x00, 0x00, 0xA1, 0xC3, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA2, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, NULL, u2c_02, u2c_03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, NULL, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, 
u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, u2c_FE, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + n = 2; + } else if (ch==0 && cl) { + out[0] = cl; + n = 1; + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (boundlen == 1) { + *uni = rawstring[0]; + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else{ + *uni = ch; + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp950", + .alias = "big5", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp950(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp950(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp950) +module_exit(exit_nls_cp950) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(big5); diff --git 
a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c new file mode 100644 index 00000000..7424929a --- /dev/null +++ b/fs/nls/nls_euc-jp.c @@ -0,0 +1,581 @@ +/* + * linux/fs/nls/nls_euc-jp.c + * + * Added `OSF/JVC Recommended Code Set Conversion Specification + * between Japanese EUC and Shift-JIS' support: + * (http://www.opengroup.or.jp/jvc/cde/sjis-euc-e.html) + */ + +#include +#include +#include +#include +#include + +static struct nls_table *p_nls; + +#define IS_SJIS_LOW_BYTE(l) ((0x40 <= (l)) && ((l) <= 0xFC) && ((l) != 0x7F)) +/* JIS X 0208 (include NEC spesial characters) */ +#define IS_SJIS_JISX0208(h, l) ((((0x81 <= (h)) && ((h) <= 0x9F)) \ + || ((0xE0 <= (h)) && ((h) <= 0xEA))) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_JISX0201KANA(c) ((0xA1 <= (c)) && ((c) <= 0xDF)) +#define IS_SJIS_UDC_LOW(h, l) (((0xF0 <= (h)) && ((h) <= 0xF4)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_UDC_HI(h, l) (((0xF5 <= (h)) && ((h) <= 0xF9)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_IBM(h, l) (((0xFA <= (h)) && ((h) <= 0xFC)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_NECIBM(h, l) (((0xED <= (h)) && ((h) <= 0xEE)) \ + && IS_SJIS_LOW_BYTE(l)) +#define MAP_SJIS2EUC(sjis_hi, sjis_lo, sjis_p, euc_hi, euc_lo, euc_p) { \ + if ((sjis_lo) >= 0x9F) { \ + (euc_hi) = (sjis_hi) * 2 - (((sjis_p) * 2 - (euc_p)) - 1); \ + (euc_lo) = (sjis_lo) + 2; \ + } else { \ + (euc_hi) = (sjis_hi) * 2 - ((sjis_p) * 2 - (euc_p)); \ + (euc_lo) = (sjis_lo) + ((sjis_lo) >= 0x7F ? 0x60 : 0x61); \ + } \ +} while(0) + +#define SS2 (0x8E) /* Single Shift 2 */ +#define SS3 (0x8F) /* Single Shift 3 */ +#define IS_EUC_BYTE(c) ((0xA1 <= (c)) && ((c) <= 0xFE)) +#define IS_EUC_JISX0208(h, l) (IS_EUC_BYTE(h) && IS_EUC_BYTE(l)) +#define IS_EUC_JISX0201KANA(h, l) (((h) == SS2) && (0xA1 <= (l) && (l) <= 0xDF)) +#define IS_EUC_UDC_LOW(h, l) (((0xF5 <= (h)) && ((h) <= 0xFE)) \ + && IS_EUC_BYTE(l)) +#define IS_EUC_UDC_HI(h, l) IS_EUC_UDC_LOW(h, l) /* G3 block */ +#define MAP_EUC2SJIS(euc_hi, euc_lo, euc_p, sjis_hi, sjis_lo, sjis_p) { \ + if ((euc_hi) & 1) { \ + (sjis_hi) = (euc_hi) / 2 + ((sjis_p) - (euc_p) / 2); \ + (sjis_lo) = (euc_lo) - ((euc_lo) >= 0xE0 ? 
0x60 : 0x61); \ + } else { \ + (sjis_hi) = (euc_hi) / 2 + (((sjis_p) - (euc_p) / 2) - 1); \ + (sjis_lo) = (euc_lo) - 2; \ + } \ +} while(0) + +/* SJIS IBM extended characters to EUC map */ +static const unsigned char sjisibm2euc_map[][2] = { + {0xF3, 0xF3}, {0xF3, 0xF4}, {0xF3, 0xF5}, {0xF3, 0xF6}, {0xF3, 0xF7}, + {0xF3, 0xF8}, {0xF3, 0xF9}, {0xF3, 0xFA}, {0xF3, 0xFB}, {0xF3, 0xFC}, + {0xF3, 0xFD}, {0xF3, 0xFE}, {0xF4, 0xA1}, {0xF4, 0xA2}, {0xF4, 0xA3}, + {0xF4, 0xA4}, {0xF4, 0xA5}, {0xF4, 0xA6}, {0xF4, 0xA7}, {0xF4, 0xA8}, + {0xA2, 0xCC}, {0xA2, 0xC3}, {0xF4, 0xA9}, {0xF4, 0xAA}, {0xF4, 0xAB}, + {0xF4, 0xAC}, {0xF4, 0xAD}, {0xA2, 0xE8}, {0xD4, 0xE3}, {0xDC, 0xDF}, + {0xE4, 0xE9}, {0xE3, 0xF8}, {0xD9, 0xA1}, {0xB1, 0xBB}, {0xF4, 0xAE}, + {0xC2, 0xAD}, {0xC3, 0xFC}, {0xE4, 0xD0}, {0xC2, 0xBF}, {0xBC, 0xF4}, + {0xB0, 0xA9}, {0xB0, 0xC8}, {0xF4, 0xAF}, {0xB0, 0xD2}, {0xB0, 0xD4}, + {0xB0, 0xE3}, {0xB0, 0xEE}, {0xB1, 0xA7}, {0xB1, 0xA3}, {0xB1, 0xAC}, + {0xB1, 0xA9}, {0xB1, 0xBE}, {0xB1, 0xDF}, {0xB1, 0xD8}, {0xB1, 0xC8}, + {0xB1, 0xD7}, {0xB1, 0xE3}, {0xB1, 0xF4}, {0xB1, 0xE1}, {0xB2, 0xA3}, + {0xF4, 0xB0}, {0xB2, 0xBB}, {0xB2, 0xE6}, {0x00, 0x00}, {0xB2, 0xED}, + {0xB2, 0xF5}, {0xB2, 0xFC}, {0xF4, 0xB1}, {0xB3, 0xB5}, {0xB3, 0xD8}, + {0xB3, 0xDB}, {0xB3, 0xE5}, {0xB3, 0xEE}, {0xB3, 0xFB}, {0xF4, 0xB2}, + {0xF4, 0xB3}, {0xB4, 0xC0}, {0xB4, 0xC7}, {0xB4, 0xD0}, {0xB4, 0xDE}, + {0xF4, 0xB4}, {0xB5, 0xAA}, {0xF4, 0xB5}, {0xB5, 0xAF}, {0xB5, 0xC4}, + {0xB5, 0xE8}, {0xF4, 0xB6}, {0xB7, 0xC2}, {0xB7, 0xE4}, {0xB7, 0xE8}, + {0xB7, 0xE7}, {0xF4, 0xB7}, {0xF4, 0xB8}, {0xF4, 0xB9}, {0xB8, 0xCE}, + {0xB8, 0xE1}, {0xB8, 0xF5}, {0xB8, 0xF7}, {0xB8, 0xF8}, {0xB8, 0xFC}, + {0xB9, 0xAF}, {0xB9, 0xB7}, {0xBA, 0xBE}, {0xBA, 0xDB}, {0xCD, 0xAA}, + {0xBA, 0xE1}, {0xF4, 0xBA}, {0xBA, 0xEB}, {0xBB, 0xB3}, {0xBB, 0xB8}, + {0xF4, 0xBB}, {0xBB, 0xCA}, {0xF4, 0xBC}, {0xF4, 0xBD}, {0xBB, 0xD0}, + {0xBB, 0xDE}, {0xBB, 0xF4}, {0xBB, 0xF5}, {0xBB, 0xF9}, {0xBC, 0xE4}, + {0xBC, 0xED}, {0xBC, 0xFE}, {0xF4, 0xBE}, {0xBD, 0xC2}, {0xBD, 0xE7}, + {0xF4, 0xBF}, {0xBD, 0xF0}, {0xBE, 0xB0}, {0xBE, 0xAC}, {0xF4, 0xC0}, + {0xBE, 0xB3}, {0xBE, 0xBD}, {0xBE, 0xCD}, {0xBE, 0xC9}, {0xBE, 0xE4}, + {0xBF, 0xA8}, {0xBF, 0xC9}, {0xC0, 0xC4}, {0xC0, 0xE4}, {0xC0, 0xF4}, + {0xC1, 0xA6}, {0xF4, 0xC1}, {0xC1, 0xF5}, {0xC1, 0xFC}, {0xF4, 0xC2}, + {0xC1, 0xF8}, {0xC2, 0xAB}, {0xC2, 0xA1}, {0xC2, 0xA5}, {0xF4, 0xC3}, + {0xC2, 0xB8}, {0xC2, 0xBA}, {0xF4, 0xC4}, {0xC2, 0xC4}, {0xC2, 0xD2}, + {0xC2, 0xD7}, {0xC2, 0xDB}, {0xC2, 0xDE}, {0xC2, 0xED}, {0xC2, 0xF0}, + {0xF4, 0xC5}, {0xC3, 0xA1}, {0xC3, 0xB5}, {0xC3, 0xC9}, {0xC3, 0xB9}, + {0xF4, 0xC6}, {0xC3, 0xD8}, {0xC3, 0xFE}, {0xF4, 0xC7}, {0xC4, 0xCC}, + {0xF4, 0xC8}, {0xC4, 0xD9}, {0xC4, 0xEA}, {0xC4, 0xFD}, {0xF4, 0xC9}, + {0xC5, 0xA7}, {0xC5, 0xB5}, {0xC5, 0xB6}, {0xF4, 0xCA}, {0xC5, 0xD5}, + {0xC6, 0xB8}, {0xC6, 0xD7}, {0xC6, 0xE0}, {0xC6, 0xEA}, {0xC6, 0xE3}, + {0xC7, 0xA1}, {0xC7, 0xAB}, {0xC7, 0xC7}, {0xC7, 0xC3}, {0xC7, 0xCB}, + {0xC7, 0xCF}, {0xC7, 0xD9}, {0xF4, 0xCB}, {0xF4, 0xCC}, {0xC7, 0xE6}, + {0xC7, 0xEE}, {0xC7, 0xFC}, {0xC7, 0xEB}, {0xC7, 0xF0}, {0xC8, 0xB1}, + {0xC8, 0xE5}, {0xC8, 0xF8}, {0xC9, 0xA6}, {0xC9, 0xAB}, {0xC9, 0xAD}, + {0xF4, 0xCD}, {0xC9, 0xCA}, {0xC9, 0xD3}, {0xC9, 0xE9}, {0xC9, 0xE3}, + {0xC9, 0xFC}, {0xC9, 0xF4}, {0xC9, 0xF5}, {0xF4, 0xCE}, {0xCA, 0xB3}, + {0xCA, 0xBD}, {0xCA, 0xEF}, {0xCA, 0xF1}, {0xCB, 0xAE}, {0xF4, 0xCF}, + {0xCB, 0xCA}, {0xCB, 0xE6}, {0xCB, 0xEA}, {0xCB, 0xF0}, {0xCB, 0xF4}, + {0xCB, 0xEE}, {0xCC, 0xA5}, {0xCB, 0xF9}, {0xCC, 0xAB}, {0xCC, 0xAE}, 
+ {0xCC, 0xAD}, {0xCC, 0xB2}, {0xCC, 0xC2}, {0xCC, 0xD0}, {0xCC, 0xD9}, + {0xF4, 0xD0}, {0xCD, 0xBB}, {0xF4, 0xD1}, {0xCE, 0xBB}, {0xF4, 0xD2}, + {0xCE, 0xBA}, {0xCE, 0xC3}, {0xF4, 0xD3}, {0xCE, 0xF2}, {0xB3, 0xDD}, + {0xCF, 0xD5}, {0xCF, 0xE2}, {0xCF, 0xE9}, {0xCF, 0xED}, {0xF4, 0xD4}, + {0xF4, 0xD5}, {0xF4, 0xD6}, {0x00, 0x00}, {0xF4, 0xD7}, {0xD0, 0xE5}, + {0xF4, 0xD8}, {0xD0, 0xE9}, {0xD1, 0xE8}, {0xF4, 0xD9}, {0xF4, 0xDA}, + {0xD1, 0xEC}, {0xD2, 0xBB}, {0xF4, 0xDB}, {0xD3, 0xE1}, {0xD3, 0xE8}, + {0xD4, 0xA7}, {0xF4, 0xDC}, {0xF4, 0xDD}, {0xD4, 0xD4}, {0xD4, 0xF2}, + {0xD5, 0xAE}, {0xF4, 0xDE}, {0xD7, 0xDE}, {0xF4, 0xDF}, {0xD8, 0xA2}, + {0xD8, 0xB7}, {0xD8, 0xC1}, {0xD8, 0xD1}, {0xD8, 0xF4}, {0xD9, 0xC6}, + {0xD9, 0xC8}, {0xD9, 0xD1}, {0xF4, 0xE0}, {0xF4, 0xE1}, {0xF4, 0xE2}, + {0xF4, 0xE3}, {0xF4, 0xE4}, {0xDC, 0xD3}, {0xDD, 0xC8}, {0xDD, 0xD4}, + {0xDD, 0xEA}, {0xDD, 0xFA}, {0xDE, 0xA4}, {0xDE, 0xB0}, {0xF4, 0xE5}, + {0xDE, 0xB5}, {0xDE, 0xCB}, {0xF4, 0xE6}, {0xDF, 0xB9}, {0xF4, 0xE7}, + {0xDF, 0xC3}, {0xF4, 0xE8}, {0xF4, 0xE9}, {0xE0, 0xD9}, {0xF4, 0xEA}, + {0xF4, 0xEB}, {0xE1, 0xE2}, {0xF4, 0xEC}, {0xF4, 0xED}, {0xF4, 0xEE}, + {0xE2, 0xC7}, {0xE3, 0xA8}, {0xE3, 0xA6}, {0xE3, 0xA9}, {0xE3, 0xAF}, + {0xE3, 0xB0}, {0xE3, 0xAA}, {0xE3, 0xAB}, {0xE3, 0xBC}, {0xE3, 0xC1}, + {0xE3, 0xBF}, {0xE3, 0xD5}, {0xE3, 0xD8}, {0xE3, 0xD6}, {0xE3, 0xDF}, + {0xE3, 0xE3}, {0xE3, 0xE1}, {0xE3, 0xD4}, {0xE3, 0xE9}, {0xE4, 0xA6}, + {0xE3, 0xF1}, {0xE3, 0xF2}, {0xE4, 0xCB}, {0xE4, 0xC1}, {0xE4, 0xC3}, + {0xE4, 0xBE}, {0xF4, 0xEF}, {0xE4, 0xC0}, {0xE4, 0xC7}, {0xE4, 0xBF}, + {0xE4, 0xE0}, {0xE4, 0xDE}, {0xE4, 0xD1}, {0xF4, 0xF0}, {0xE4, 0xDC}, + {0xE4, 0xD2}, {0xE4, 0xDB}, {0xE4, 0xD4}, {0xE4, 0xFA}, {0xE4, 0xEF}, + {0xE5, 0xB3}, {0xE5, 0xBF}, {0xE5, 0xC9}, {0xE5, 0xD0}, {0xE5, 0xE2}, + {0xE5, 0xEA}, {0xE5, 0xEB}, {0xF4, 0xF1}, {0xF4, 0xF2}, {0xF4, 0xF3}, + {0xE6, 0xE8}, {0xE6, 0xEF}, {0xE7, 0xAC}, {0xF4, 0xF4}, {0xE7, 0xAE}, + {0xF4, 0xF5}, {0xE7, 0xB1}, {0xF4, 0xF6}, {0xE7, 0xB2}, {0xE8, 0xB1}, + {0xE8, 0xB6}, {0xF4, 0xF7}, {0xF4, 0xF8}, {0xE8, 0xDD}, {0xF4, 0xF9}, + {0xF4, 0xFA}, {0xE9, 0xD1}, {0xF4, 0xFB}, {0xE9, 0xED}, {0xEA, 0xCD}, + {0xF4, 0xFC}, {0xEA, 0xDB}, {0xEA, 0xE6}, {0xEA, 0xEA}, {0xEB, 0xA5}, + {0xEB, 0xFB}, {0xEB, 0xFA}, {0xF4, 0xFD}, {0xEC, 0xD6}, {0xF4, 0xFE}, +}; + +#define IS_EUC_IBM2JISX0208(h, l) \ + (((h) == 0xA2 && (l) == 0xCC) || ((h) == 0xA2 && (l) == 0xE8)) + +/* EUC to SJIS IBM extended characters map (G3 JIS X 0212 block) */ +static struct { + unsigned short euc; + unsigned char sjis[2]; +} euc2sjisibm_jisx0212_map[] = { + {0xA2C3, {0xFA, 0x55}}, {0xB0A9, {0xFA, 0x68}}, {0xB0C8, {0xFA, 0x69}}, + {0xB0D2, {0xFA, 0x6B}}, {0xB0D4, {0xFA, 0x6C}}, {0xB0E3, {0xFA, 0x6D}}, + {0xB0EE, {0xFA, 0x6E}}, {0xB1A3, {0xFA, 0x70}}, {0xB1A7, {0xFA, 0x6F}}, + {0xB1A9, {0xFA, 0x72}}, {0xB1AC, {0xFA, 0x71}}, {0xB1BB, {0xFA, 0x61}}, + {0xB1BE, {0xFA, 0x73}}, {0xB1C8, {0xFA, 0x76}}, {0xB1D7, {0xFA, 0x77}}, + {0xB1D8, {0xFA, 0x75}}, {0xB1DF, {0xFA, 0x74}}, {0xB1E1, {0xFA, 0x7A}}, + {0xB1E3, {0xFA, 0x78}}, {0xB1F4, {0xFA, 0x79}}, {0xB2A3, {0xFA, 0x7B}}, + {0xB2BB, {0xFA, 0x7D}}, {0xB2E6, {0xFA, 0x7E}}, {0xB2ED, {0xFA, 0x80}}, + {0xB2F5, {0xFA, 0x81}}, {0xB2FC, {0xFA, 0x82}}, {0xB3B5, {0xFA, 0x84}}, + {0xB3D8, {0xFA, 0x85}}, {0xB3DB, {0xFA, 0x86}}, {0xB3DD, {0xFB, 0x77}}, + {0xB3E5, {0xFA, 0x87}}, {0xB3EE, {0xFA, 0x88}}, {0xB3FB, {0xFA, 0x89}}, + {0xB4C0, {0xFA, 0x8C}}, {0xB4C7, {0xFA, 0x8D}}, {0xB4D0, {0xFA, 0x8E}}, + {0xB4DE, {0xFA, 0x8F}}, {0xB5AA, {0xFA, 0x91}}, {0xB5AF, {0xFA, 0x93}}, + {0xB5C4, 
{0xFA, 0x94}}, {0xB5E8, {0xFA, 0x95}}, {0xB7C2, {0xFA, 0x97}}, + {0xB7E4, {0xFA, 0x98}}, {0xB7E7, {0xFA, 0x9A}}, {0xB7E8, {0xFA, 0x99}}, + {0xB8CE, {0xFA, 0x9E}}, {0xB8E1, {0xFA, 0x9F}}, {0xB8F5, {0xFA, 0xA0}}, + {0xB8F7, {0xFA, 0xA1}}, {0xB8F8, {0xFA, 0xA2}}, {0xB8FC, {0xFA, 0xA3}}, + {0xB9AF, {0xFA, 0xA4}}, {0xB9B7, {0xFA, 0xA5}}, {0xBABE, {0xFA, 0xA6}}, + {0xBADB, {0xFA, 0xA7}}, {0xBAE1, {0xFA, 0xA9}}, {0xBAEB, {0xFA, 0xAB}}, + {0xBBB3, {0xFA, 0xAC}}, {0xBBB8, {0xFA, 0xAD}}, {0xBBCA, {0xFA, 0xAF}}, + {0xBBD0, {0xFA, 0xB2}}, {0xBBDE, {0xFA, 0xB3}}, {0xBBF4, {0xFA, 0xB4}}, + {0xBBF5, {0xFA, 0xB5}}, {0xBBF9, {0xFA, 0xB6}}, {0xBCE4, {0xFA, 0xB7}}, + {0xBCED, {0xFA, 0xB8}}, {0xBCF4, {0xFA, 0x67}}, {0xBCFE, {0xFA, 0xB9}}, + {0xBDC2, {0xFA, 0xBB}}, {0xBDE7, {0xFA, 0xBC}}, {0xBDF0, {0xFA, 0xBE}}, + {0xBEAC, {0xFA, 0xC0}}, {0xBEB0, {0xFA, 0xBF}}, {0xBEB3, {0xFA, 0xC2}}, + {0xBEBD, {0xFA, 0xC3}}, {0xBEC9, {0xFA, 0xC5}}, {0xBECD, {0xFA, 0xC4}}, + {0xBEE4, {0xFA, 0xC6}}, {0xBFA8, {0xFA, 0xC7}}, {0xBFC9, {0xFA, 0xC8}}, + {0xC0C4, {0xFA, 0xC9}}, {0xC0E4, {0xFA, 0xCA}}, {0xC0F4, {0xFA, 0xCB}}, + {0xC1A6, {0xFA, 0xCC}}, {0xC1F5, {0xFA, 0xCE}}, {0xC1F8, {0xFA, 0xD1}}, + {0xC1FC, {0xFA, 0xCF}}, {0xC2A1, {0xFA, 0xD3}}, {0xC2A5, {0xFA, 0xD4}}, + {0xC2AB, {0xFA, 0xD2}}, {0xC2AD, {0xFA, 0x63}}, {0xC2B8, {0xFA, 0xD6}}, + {0xC2BA, {0xFA, 0xD7}}, {0xC2BF, {0xFA, 0x66}}, {0xC2C4, {0xFA, 0xD9}}, + {0xC2D2, {0xFA, 0xDA}}, {0xC2D7, {0xFA, 0xDB}}, {0xC2DB, {0xFA, 0xDC}}, + {0xC2DE, {0xFA, 0xDD}}, {0xC2ED, {0xFA, 0xDE}}, {0xC2F0, {0xFA, 0xDF}}, + {0xC3A1, {0xFA, 0xE1}}, {0xC3B5, {0xFA, 0xE2}}, {0xC3B9, {0xFA, 0xE4}}, + {0xC3C9, {0xFA, 0xE3}}, {0xC3D8, {0xFA, 0xE6}}, {0xC3FC, {0xFA, 0x64}}, + {0xC3FE, {0xFA, 0xE7}}, {0xC4CC, {0xFA, 0xE9}}, {0xC4D9, {0xFA, 0xEB}}, + {0xC4EA, {0xFA, 0xEC}}, {0xC4FD, {0xFA, 0xED}}, {0xC5A7, {0xFA, 0xEF}}, + {0xC5B5, {0xFA, 0xF0}}, {0xC5B6, {0xFA, 0xF1}}, {0xC5D5, {0xFA, 0xF3}}, + {0xC6B8, {0xFA, 0xF4}}, {0xC6D7, {0xFA, 0xF5}}, {0xC6E0, {0xFA, 0xF6}}, + {0xC6E3, {0xFA, 0xF8}}, {0xC6EA, {0xFA, 0xF7}}, {0xC7A1, {0xFA, 0xF9}}, + {0xC7AB, {0xFA, 0xFA}}, {0xC7C3, {0xFA, 0xFC}}, {0xC7C7, {0xFA, 0xFB}}, + {0xC7CB, {0xFB, 0x40}}, {0xC7CF, {0xFB, 0x41}}, {0xC7D9, {0xFB, 0x42}}, + {0xC7E6, {0xFB, 0x45}}, {0xC7EB, {0xFB, 0x48}}, {0xC7EE, {0xFB, 0x46}}, + {0xC7F0, {0xFB, 0x49}}, {0xC7FC, {0xFB, 0x47}}, {0xC8B1, {0xFB, 0x4A}}, + {0xC8E5, {0xFB, 0x4B}}, {0xC8F8, {0xFB, 0x4C}}, {0xC9A6, {0xFB, 0x4D}}, + {0xC9AB, {0xFB, 0x4E}}, {0xC9AD, {0xFB, 0x4F}}, {0xC9CA, {0xFB, 0x51}}, + {0xC9D3, {0xFB, 0x52}}, {0xC9E3, {0xFB, 0x54}}, {0xC9E9, {0xFB, 0x53}}, + {0xC9F4, {0xFB, 0x56}}, {0xC9F5, {0xFB, 0x57}}, {0xC9FC, {0xFB, 0x55}}, + {0xCAB3, {0xFB, 0x59}}, {0xCABD, {0xFB, 0x5A}}, {0xCAEF, {0xFB, 0x5B}}, + {0xCAF1, {0xFB, 0x5C}}, {0xCBAE, {0xFB, 0x5D}}, {0xCBCA, {0xFB, 0x5F}}, + {0xCBE6, {0xFB, 0x60}}, {0xCBEA, {0xFB, 0x61}}, {0xCBEE, {0xFB, 0x64}}, + {0xCBF0, {0xFB, 0x62}}, {0xCBF4, {0xFB, 0x63}}, {0xCBF9, {0xFB, 0x66}}, + {0xCCA5, {0xFB, 0x65}}, {0xCCAB, {0xFB, 0x67}}, {0xCCAD, {0xFB, 0x69}}, + {0xCCAE, {0xFB, 0x68}}, {0xCCB2, {0xFB, 0x6A}}, {0xCCC2, {0xFB, 0x6B}}, + {0xCCD0, {0xFB, 0x6C}}, {0xCCD9, {0xFB, 0x6D}}, {0xCDAA, {0xFA, 0xA8}}, + {0xCDBB, {0xFB, 0x6F}}, {0xCEBA, {0xFB, 0x73}}, {0xCEBB, {0xFB, 0x71}}, + {0xCEC3, {0xFB, 0x74}}, {0xCEF2, {0xFB, 0x76}}, {0xCFD5, {0xFB, 0x78}}, + {0xCFE2, {0xFB, 0x79}}, {0xCFE9, {0xFB, 0x7A}}, {0xCFED, {0xFB, 0x7B}}, + {0xD0E5, {0xFB, 0x81}}, {0xD0E9, {0xFB, 0x83}}, {0xD1E8, {0xFB, 0x84}}, + {0xD1EC, {0xFB, 0x87}}, {0xD2BB, {0xFB, 0x88}}, {0xD3E1, {0xFB, 0x8A}}, + {0xD3E8, 
{0xFB, 0x8B}}, {0xD4A7, {0xFB, 0x8C}}, {0xD4D4, {0xFB, 0x8F}}, + {0xD4E3, {0xFA, 0x5C}}, {0xD4F2, {0xFB, 0x90}}, {0xD5AE, {0xFB, 0x91}}, + {0xD7DE, {0xFB, 0x93}}, {0xD8A2, {0xFB, 0x95}}, {0xD8B7, {0xFB, 0x96}}, + {0xD8C1, {0xFB, 0x97}}, {0xD8D1, {0xFB, 0x98}}, {0xD8F4, {0xFB, 0x99}}, + {0xD9A1, {0xFA, 0x60}}, {0xD9C6, {0xFB, 0x9A}}, {0xD9C8, {0xFB, 0x9B}}, + {0xD9D1, {0xFB, 0x9C}}, {0xDCD3, {0xFB, 0xA2}}, {0xDCDF, {0xFA, 0x5D}}, + {0xDDC8, {0xFB, 0xA3}}, {0xDDD4, {0xFB, 0xA4}}, {0xDDEA, {0xFB, 0xA5}}, + {0xDDFA, {0xFB, 0xA6}}, {0xDEA4, {0xFB, 0xA7}}, {0xDEB0, {0xFB, 0xA8}}, + {0xDEB5, {0xFB, 0xAA}}, {0xDECB, {0xFB, 0xAB}}, {0xDFB9, {0xFB, 0xAD}}, + {0xDFC3, {0xFB, 0xAF}}, {0xE0D9, {0xFB, 0xB2}}, {0xE1E2, {0xFB, 0xB5}}, + {0xE2C7, {0xFB, 0xB9}}, {0xE3A6, {0xFB, 0xBB}}, {0xE3A8, {0xFB, 0xBA}}, + {0xE3A9, {0xFB, 0xBC}}, {0xE3AA, {0xFB, 0xBF}}, {0xE3AB, {0xFB, 0xC0}}, + {0xE3AF, {0xFB, 0xBD}}, {0xE3B0, {0xFB, 0xBE}}, {0xE3BC, {0xFB, 0xC1}}, + {0xE3BF, {0xFB, 0xC3}}, {0xE3C1, {0xFB, 0xC2}}, {0xE3D4, {0xFB, 0xCA}}, + {0xE3D5, {0xFB, 0xC4}}, {0xE3D6, {0xFB, 0xC6}}, {0xE3D8, {0xFB, 0xC5}}, + {0xE3DF, {0xFB, 0xC7}}, {0xE3E1, {0xFB, 0xC9}}, {0xE3E3, {0xFB, 0xC8}}, + {0xE3E9, {0xFB, 0xCB}}, {0xE3F1, {0xFB, 0xCD}}, {0xE3F2, {0xFB, 0xCE}}, + {0xE3F8, {0xFA, 0x5F}}, {0xE4A6, {0xFB, 0xCC}}, {0xE4BE, {0xFB, 0xD2}}, + {0xE4BF, {0xFB, 0xD6}}, {0xE4C0, {0xFB, 0xD4}}, {0xE4C1, {0xFB, 0xD0}}, + {0xE4C3, {0xFB, 0xD1}}, {0xE4C7, {0xFB, 0xD5}}, {0xE4CB, {0xFB, 0xCF}}, + {0xE4D0, {0xFA, 0x65}}, {0xE4D1, {0xFB, 0xD9}}, {0xE4D2, {0xFB, 0xDC}}, + {0xE4D4, {0xFB, 0xDE}}, {0xE4DB, {0xFB, 0xDD}}, {0xE4DC, {0xFB, 0xDB}}, + {0xE4DE, {0xFB, 0xD8}}, {0xE4E0, {0xFB, 0xD7}}, {0xE4E9, {0xFA, 0x5E}}, + {0xE4EF, {0xFB, 0xE0}}, {0xE4FA, {0xFB, 0xDF}}, {0xE5B3, {0xFB, 0xE1}}, + {0xE5BF, {0xFB, 0xE2}}, {0xE5C9, {0xFB, 0xE3}}, {0xE5D0, {0xFB, 0xE4}}, + {0xE5E2, {0xFB, 0xE5}}, {0xE5EA, {0xFB, 0xE6}}, {0xE5EB, {0xFB, 0xE7}}, + {0xE6E8, {0xFB, 0xEB}}, {0xE6EF, {0xFB, 0xEC}}, {0xE7AC, {0xFB, 0xED}}, + {0xE7AE, {0xFB, 0xEF}}, {0xE7B1, {0xFB, 0xF1}}, {0xE7B2, {0xFB, 0xF3}}, + {0xE8B1, {0xFB, 0xF4}}, {0xE8B6, {0xFB, 0xF5}}, {0xE8DD, {0xFB, 0xF8}}, + {0xE9D1, {0xFB, 0xFB}}, {0xE9ED, {0xFC, 0x40}}, {0xEACD, {0xFC, 0x41}}, + {0xEADB, {0xFC, 0x43}}, {0xEAE6, {0xFC, 0x44}}, {0xEAEA, {0xFC, 0x45}}, + {0xEBA5, {0xFC, 0x46}}, {0xEBFA, {0xFC, 0x48}}, {0xEBFB, {0xFC, 0x47}}, + {0xECD6, {0xFC, 0x4A}}, +}; + +/* EUC to SJIS IBM extended characters map (G3 Upper block) */ +static const unsigned char euc2sjisibm_g3upper_map[][2] = { + {0xFA, 0x40}, {0xFA, 0x41}, {0xFA, 0x42}, {0xFA, 0x43}, {0xFA, 0x44}, + {0xFA, 0x45}, {0xFA, 0x46}, {0xFA, 0x47}, {0xFA, 0x48}, {0xFA, 0x49}, + {0xFA, 0x4A}, {0xFA, 0x4B}, {0xFA, 0x4C}, {0xFA, 0x4D}, {0xFA, 0x4E}, + {0xFA, 0x4F}, {0xFA, 0x50}, {0xFA, 0x51}, {0xFA, 0x52}, {0xFA, 0x53}, + {0xFA, 0x56}, {0xFA, 0x57}, {0xFA, 0x58}, {0xFA, 0x59}, {0xFA, 0x5A}, + {0xFA, 0x62}, {0xFA, 0x6A}, {0xFA, 0x7C}, {0xFA, 0x83}, {0xFA, 0x8A}, + {0xFA, 0x8B}, {0xFA, 0x90}, {0xFA, 0x92}, {0xFA, 0x96}, {0xFA, 0x9B}, + {0xFA, 0x9C}, {0xFA, 0x9D}, {0xFA, 0xAA}, {0xFA, 0xAE}, {0xFA, 0xB0}, + {0xFA, 0xB1}, {0xFA, 0xBA}, {0xFA, 0xBD}, {0xFA, 0xC1}, {0xFA, 0xCD}, + {0xFA, 0xD0}, {0xFA, 0xD5}, {0xFA, 0xD8}, {0xFA, 0xE0}, {0xFA, 0xE5}, + {0xFA, 0xE8}, {0xFA, 0xEA}, {0xFA, 0xEE}, {0xFA, 0xF2}, {0xFB, 0x43}, + {0xFB, 0x44}, {0xFB, 0x50}, {0xFB, 0x58}, {0xFB, 0x5E}, {0xFB, 0x6E}, + {0xFB, 0x70}, {0xFB, 0x72}, {0xFB, 0x75}, {0xFB, 0x7C}, {0xFB, 0x7D}, + {0xFB, 0x7E}, {0xFB, 0x80}, {0xFB, 0x82}, {0xFB, 0x85}, {0xFB, 0x86}, + {0xFB, 0x89}, {0xFB, 0x8D}, 
{0xFB, 0x8E}, {0xFB, 0x92}, {0xFB, 0x94}, + {0xFB, 0x9D}, {0xFB, 0x9E}, {0xFB, 0x9F}, {0xFB, 0xA0}, {0xFB, 0xA1}, + {0xFB, 0xA9}, {0xFB, 0xAC}, {0xFB, 0xAE}, {0xFB, 0xB0}, {0xFB, 0xB1}, + {0xFB, 0xB3}, {0xFB, 0xB4}, {0xFB, 0xB6}, {0xFB, 0xB7}, {0xFB, 0xB8}, + {0xFB, 0xD3}, {0xFB, 0xDA}, {0xFB, 0xE8}, {0xFB, 0xE9}, {0xFB, 0xEA}, + {0xFB, 0xEE}, {0xFB, 0xF0}, {0xFB, 0xF2}, {0xFB, 0xF6}, {0xFB, 0xF7}, + {0xFB, 0xF9}, {0xFB, 0xFA}, {0xFB, 0xFC}, {0xFC, 0x42}, {0xFC, 0x49}, + {0xFC, 0x4B}, +}; + +static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, + const unsigned char sjis_lo); +static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int sjisnec2sjisibm(unsigned char *sjisibm, + const unsigned char sjisnec_hi, + const unsigned char sjisnec_lo); + +/* SJIS IBM extended characters to EUC */ +static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, + const unsigned char sjis_lo) +{ + int index; + + index = ((sjis_hi - 0xFA) * (0xFD - 0x40)) + (sjis_lo - 0x40); + if (IS_EUC_IBM2JISX0208(sjisibm2euc_map[index][0], + sjisibm2euc_map[index][1])) { + euc[0] = sjisibm2euc_map[index][0]; + euc[1] = sjisibm2euc_map[index][1]; + return 2; + } else { + euc[0] = SS3; + euc[1] = sjisibm2euc_map[index][0]; + euc[2] = sjisibm2euc_map[index][1]; + return 3; + } +} + +/* EUC to SJIS IBM extended characters (G3 JIS X 0212 block) */ +static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int index, min_index, max_index; + unsigned short euc; + + min_index = 0; + max_index = ARRAY_SIZE(euc2sjisibm_jisx0212_map) - 1; + euc = (euc_hi << 8) | euc_lo; + + while (min_index <= max_index) { + index = (min_index + max_index) / 2; + if (euc < euc2sjisibm_jisx0212_map[index].euc) + max_index = index - 1; + else + min_index = index + 1; + if (euc == euc2sjisibm_jisx0212_map[index].euc) { + sjis[0] = euc2sjisibm_jisx0212_map[index].sjis[0]; + sjis[1] = euc2sjisibm_jisx0212_map[index].sjis[1]; + return 3; + } + } + return 0; +} + +/* EUC to SJIS IBM extended characters (G3 Upper block) */ +static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int index; + + if (euc_hi == 0xF3) + index = ((euc_hi << 8) | euc_lo) - 0xF3F3; + else + index = ((euc_hi << 8) | euc_lo) - 0xF4A1 + 12; + + if ((index < 0) || (index >= ARRAY_SIZE(euc2sjisibm_g3upper_map))) + return 0; + + sjis[0] = euc2sjisibm_g3upper_map[index][0]; + sjis[1] = euc2sjisibm_g3upper_map[index][1]; + + return 3; +} + +/* EUC to SJIS IBM extended characters (G3 block) */ +static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int n; + +#if 0 + if ((euc_hi == 0xA2) && (euc_lo == 0xCC)) { + sjis[0] = 0xFA; + sjis[1] = 0x54; + return 2; + } else if ((euc_hi == 0xA2) && (euc_lo == 0xE8)) { + sjis[0] = 0xFA; + sjis[1] = 0x5B; + return 2; + } +#endif + if ((n = euc2sjisibm_g3upper(sjis, euc_hi, euc_lo))) { + return n; + } else if ((n = euc2sjisibm_jisx0212(sjis, euc_hi, euc_lo))) { + return n; + } + + return 0; +} + +/* NEC/IBM extended characters to IBM extended characters */ +static inline int sjisnec2sjisibm(unsigned char *sjisibm, + 
const unsigned char sjisnec_hi, + const unsigned char sjisnec_lo) +{ + int count; + + if (! IS_SJIS_NECIBM(sjisnec_hi, sjisnec_lo)) + return 0; + + if ((sjisnec_hi == 0xEE) && (sjisnec_lo == 0xF9)) { + sjisibm[0] = 0x81; + sjisibm[1] = 0xCA; + return 2; + } + + if ((sjisnec_hi == 0xEE) && (sjisnec_lo >= 0xEF)) { + count = (sjisnec_hi << 8 | sjisnec_lo) + - (sjisnec_lo <= 0xF9 ? 0xEEEF : (0xEEEF - 10)); + } else { + count = (sjisnec_hi - 0xED) * (0xFC - 0x40) + + (sjisnec_lo - 0x40) + (0x5C - 0x40); + if (sjisnec_lo >= 0x7F) + count--; + } + + sjisibm[0] = 0xFA + (count / (0xFC - 0x40)); + sjisibm[1] = 0x40 + (count % (0xFC - 0x40)); + if (sjisibm[1] >= 0x7F) + sjisibm[1]++; + + return 2; +} + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + int n; + + if (!p_nls) + return -EINVAL; + if ((n = p_nls->uni2char(uni, out, boundlen)) < 0) + return n; + + /* translate SJIS into EUC-JP */ + if (n == 1) { + if (IS_SJIS_JISX0201KANA(out[0])) { + /* JIS X 0201 KANA */ + if (boundlen < 2) + return -ENAMETOOLONG; + + out[1] = out[0]; + out[0] = SS2; + return 2; + } + } else if (n == 2) { + /* NEC/IBM extended characters to IBM extended characters */ + sjisnec2sjisibm(out, out[0], out[1]); + + if (IS_SJIS_UDC_LOW(out[0], out[1])) { + /* User defined characters half low */ + MAP_SJIS2EUC(out[0], out[1], 0xF0, out[0], out[1], 0xF5); + } else if (IS_SJIS_UDC_HI(out[0], out[1])) { + /* User defined characters half high */ + unsigned char ch, cl; + + if (boundlen < 3) + return -ENAMETOOLONG; + + n = 3; ch = out[0]; cl = out[1]; + out[0] = SS3; + MAP_SJIS2EUC(ch, cl, 0xF5, out[1], out[2], 0xF5); + } else if (IS_SJIS_IBM(out[0], out[1])) { + /* IBM extended characters */ + unsigned char euc[3], i; + + n = sjisibm2euc(euc, out[0], out[1]); + if (boundlen < n) + return -ENAMETOOLONG; + for (i = 0; i < n; i++) + out[i] = euc[i]; + } else if (IS_SJIS_JISX0208(out[0], out[1])) { + /* JIS X 0208 (include NEC special characters) */ + out[0] = (out[0]^0xA0)*2 + 0x5F; + if (out[1] > 0x9E) + out[0]++; + + if (out[1] < 0x7F) + out[1] = out[1] + 0x61; + else if (out[1] < 0x9F) + out[1] = out[1] + 0x60; + else + out[1] = out[1] + 0x02; + } else { + /* Invalid characters */ + return -EINVAL; + } + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char sjis_temp[2]; + int euc_offset, n; + + if ( !p_nls ) + return -EINVAL; + if (boundlen <= 0) + return -ENAMETOOLONG; + + /* translate EUC-JP into SJIS */ + if (rawstring[0] > 0x7F) { + if (rawstring[0] == SS3) { + if (boundlen < 3) + return -EINVAL; + euc_offset = 3; + + if (IS_EUC_UDC_HI(rawstring[1], rawstring[2])) { + /* User defined characters half high */ + MAP_EUC2SJIS(rawstring[1], rawstring[2], 0xF5, + sjis_temp[0], sjis_temp[1], 0xF5); + } else if (euc2sjisibm(sjis_temp,rawstring[1],rawstring[2])) { + /* IBM extended characters */ + } else { + /* JIS X 0212 and Invalid characters*/ + return -EINVAL; + + /* 'GETA' with SJIS coding */ + /* sjis_temp[0] = 0x81; */ + /* sjis_temp[1] = 0xAC; */ + } + } else { + if (boundlen < 2) + return -EINVAL; + euc_offset = 2; + + if (IS_EUC_JISX0201KANA(rawstring[0], rawstring[1])) { + /* JIS X 0201 KANA */ + sjis_temp[0] = rawstring[1]; + sjis_temp[1] = 0x00; + } else if (IS_EUC_UDC_LOW(rawstring[0], rawstring[1])) { + /* User defined characters half low */ + MAP_EUC2SJIS(rawstring[0], rawstring[1], 0xF5, + sjis_temp[0], sjis_temp[1], 0xF0); + } else if (IS_EUC_JISX0208(rawstring[0], rawstring[1])) { + /* JIS X 
0208 (include NEC spesial characters) */ + sjis_temp[0] = ((rawstring[0]-0x5f)/2) ^ 0xA0; + if (!(rawstring[0] & 1)) + sjis_temp[1] = rawstring[1] - 0x02; + else if (rawstring[1] < 0xE0) + sjis_temp[1] = rawstring[1] - 0x61; + else + sjis_temp[1] = rawstring[1] - 0x60; + } else { + /* Invalid characters */ + return -EINVAL; + } + } + } else { + euc_offset = 1; + + /* JIS X 0201 ROMAJI */ + sjis_temp[0] = rawstring[0]; + sjis_temp[1] = 0x00; + } + + if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0) + return n; + + return euc_offset; +} + +static struct nls_table table = { + .charset = "euc-jp", + .uni2char = uni2char, + .char2uni = char2uni, + .owner = THIS_MODULE, +}; + +static int __init init_nls_euc_jp(void) +{ + p_nls = load_nls("cp932"); + + if (p_nls) { + table.charset2upper = p_nls->charset2upper; + table.charset2lower = p_nls->charset2lower; + return register_nls(&table); + } + + return -EINVAL; +} + +static void __exit exit_nls_euc_jp(void) +{ + unregister_nls(&table); + unload_nls(p_nls); +} + +module_init(init_nls_euc_jp) +module_exit(exit_nls_euc_jp) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c new file mode 100644 index 00000000..7b951bb5 --- /dev/null +++ b/fs/nls/nls_iso8859-1.c @@ -0,0 +1,258 @@ +/* + * linux/fs/nls/nls_iso8859-1.c + * + * Charset iso8859-1 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 
0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 
0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xf8-0xff */ +}; 
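/*
 * A minimal stand-alone sketch of the two-level page lookup that the NLS
 * uni2char() helpers in this patch rely on: the high byte of a Unicode
 * code point selects a 256-entry page from page_uni2charset[], the low
 * byte indexes into that page, and a 0x00 entry (or a missing page) means
 * the code point has no mapping in the charset.  For the double-byte
 * charsets (e.g. cp950 above) each page entry holds two output bytes
 * instead of one.  Everything named demo_* below is purely illustrative
 * and is not part of the charset tables in this patch; the block is a
 * user-space demo, not module code.
 */
#if 0	/* illustrative user-space sketch, never built with the module */
#include <stdio.h>

/* Tiny hypothetical page: identity-map just 'A' and 'B' for the demo. */
static const unsigned char demo_page00[256] = {
	['A'] = 'A', ['B'] = 'B',
};

/* Page directory: only page 0x00 exists, all other pages are unmapped. */
static const unsigned char *const demo_pages[256] = { demo_page00, };

/* Same shape as uni2char() below, with kernel error codes replaced by -1. */
static int demo_uni2char(unsigned int uni, unsigned char *out)
{
	const unsigned char *page = demo_pages[(uni >> 8) & 0xFF];
	unsigned char cl = uni & 0xFF;

	if (!page || !page[cl])
		return -1;	/* no mapping (the real helper returns -EINVAL) */
	*out = page[cl];
	return 1;		/* number of output bytes produced */
}

int main(void)
{
	unsigned char c;

	if (demo_uni2char(0x0041, &c) == 1)	/* U+0041 LATIN CAPITAL LETTER A */
		printf("U+0041 -> 0x%02X\n", c);
	if (demo_uni2char(0x20AC, &c) < 0)	/* U+20AC EURO SIGN: page 0x20 missing */
		printf("U+20AC has no mapping in this demo charset\n");
	return 0;
}
#endif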
+ +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-1", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_1(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_1(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_1) +module_exit(exit_nls_iso8859_1) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c new file mode 100644 index 00000000..c4d52ea9 --- /dev/null +++ b/fs/nls/nls_iso8859-13.c @@ -0,0 +1,286 @@ +/* + * linux/fs/nls/nls_iso8859-13.c + * + * Charset iso8859-13 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x201d, 0x00a2, 0x00a3, + 0x00a4, 0x201e, 0x00a6, 0x00a7, + 0x00d8, 0x00a9, 0x0156, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00c6, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x201c, 0x00b5, 0x00b6, 0x00b7, + 0x00f8, 0x00b9, 0x0157, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00e6, + /* 0xc0*/ + 0x0104, 0x012e, 0x0100, 0x0106, + 0x00c4, 0x00c5, 0x0118, 0x0112, + 0x010c, 0x00c9, 0x0179, 0x0116, + 0x0122, 0x0136, 0x012a, 0x013b, + /* 0xd0*/ + 0x0160, 0x0143, 0x0145, 0x00d3, + 0x014c, 0x00d5, 0x00d6, 0x00d7, + 0x0172, 0x0141, 0x015a, 0x016a, + 0x00dc, 0x017b, 0x017d, 0x00df, + /* 0xe0*/ + 0x0105, 0x012f, 0x0101, 
0x0107, + 0x00e4, 0x00e5, 0x0119, 0x0113, + 0x010d, 0x00e9, 0x017a, 0x0117, + 0x0123, 0x0137, 0x012b, 0x013c, + /* 0xf0*/ + 0x0161, 0x0144, 0x0146, 0x00f3, + 0x014d, 0x00f5, 0x00f6, 0x00f7, + 0x0173, 0x0142, 0x015b, 0x016b, + 0x00fc, 0x017c, 0x017e, 0x2019, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0xc4, 0xc5, 0xaf, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0x00, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xa8, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0xe4, 0xe5, 0xbf, 0x00, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0x00, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xb8, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xc2, 0xe2, 0x00, 0x00, 0xc0, 0xe0, 0xc3, 0xe3, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0xc7, 0xe7, 0x00, 0x00, 0xcb, 0xeb, /* 0x10-0x17 */ + 0xc6, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xce, 0xee, 0x00, 0x00, 0xc1, 0xe1, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xed, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0xcf, 0xef, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xd9, 0xf9, 0xd1, 0xf1, 0xd2, 0xf2, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xd4, 0xf4, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xba, /* 0x50-0x57 */ + 0x00, 0x00, 0xda, 0xfa, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xd8, 0xf8, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0xca, 0xea, 0xdd, 0xfd, 0xde, 0xfe, 0x00, /* 0x78-0x7f */ +}; + +static const 
unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xff, 0x00, 0x00, 0xb4, 0xa1, 0xa5, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 
/* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-13", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_13(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_13(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_13) +module_exit(exit_nls_iso8859_13) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c new file mode 100644 index 00000000..dc02600c --- /dev/null +++ b/fs/nls/nls_iso8859-14.c @@ -0,0 +1,342 @@ +/* + * linux/fs/nls/nls_iso8859-14.c + * + * Charset iso8859-14 translation tables. + * + * Generated automatically from the Unicode and charset table + * provided by the Unicode Organisation at + * http://www.unicode.org/ + * The Unicode to charset table has only exact mappings. 
+ * + * Rhys Jones, Swansea University Computer Society + * rhys@sucs.swan.ac.uk + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x1e02, 0x1e03, 0x00a3, + 0x010a, 0x010b, 0x1e0a, 0x00a7, + 0x1e80, 0x00a9, 0x1e82, 0x1e0b, + 0x1ef2, 0x00ad, 0x00ae, 0x0178, + /* 0xb0*/ + 0x1e1e, 0x1e1f, 0x0120, 0x0121, + 0x1e40, 0x1e41, 0x00b6, 0x1e56, + 0x1e81, 0x1e57, 0x1e83, 0x1e60, + 0x1ef3, 0x1e84, 0x1e85, 0x1e61, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x0174, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x1e6a, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x0176, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x0175, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x1e6b, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x0177, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 
0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0x00, 0x00, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb6, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0x00, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x00, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0x00, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */ + 0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */ + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page1e[256] = { + 0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */ + 0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */ + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page1e, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 
0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa2, 0xa2, 0xa3, 0xab, 0xab, 0xab, 0xa7, /* 0xa0-0xa7 */ + 0xb8, 0xa9, 0xba, 0xab, 0xbc, 0xad, 0xae, 0xff, /* 0xa8-0xaf */ + 0xb1, 0xb1, 0xb3, 0xb3, 0xb5, 0xb5, 0xb6, 0xb9, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbf, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa1, 0xa3, 0xa6, 0xa6, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xa6, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb0, 0xb2, 0xb2, 0xb4, 0xb4, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xa8, 0xb7, 0xaa, 0xbb, 0xac, 0xbd, 0xbd, 0xbb, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xaf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int 
boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-14", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_14(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_14(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_14) +module_exit(exit_nls_iso8859_14) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c new file mode 100644 index 00000000..3c7dfc83 --- /dev/null +++ b/fs/nls/nls_iso8859-15.c @@ -0,0 +1,308 @@ +/* + * linux/fs/nls/nls_iso8859-15.c + * + * Charset iso8859-15 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x20ac, 0x00a5, 0x0160, 0x00a7, + 0x0161, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x017d, 0x00b5, 0x00b6, 0x00b7, + 0x017e, 0x00b9, 0x00ba, 0x00bb, + 0x0152, 0x0153, 0x0178, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 
0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0xa5, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0xb9, 0xba, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xbc, 0xbd, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa6, 0xa8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xbe, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb8, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb8, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbd, 0xbd, 0xff, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 
0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa6, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb4, 0xb9, 0xba, 0xbb, 0xbc, 0xbc, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xbe, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-15", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_15(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_15(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_15) +module_exit(exit_nls_iso8859_15) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c new file mode 100644 index 00000000..a2d2197e --- /dev/null +++ b/fs/nls/nls_iso8859-2.c @@ -0,0 +1,309 @@ +/* + * linux/fs/nls/nls_iso8859-2.c + * + * Charset iso8859-2 translation tables. 
+ * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0104, 0x02d8, 0x0141, + 0x00a4, 0x013d, 0x015a, 0x00a7, + 0x00a8, 0x0160, 0x015e, 0x0164, + 0x0179, 0x00ad, 0x017d, 0x017b, + /* 0xb0*/ + 0x00b0, 0x0105, 0x02db, 0x0142, + 0x00b4, 0x013e, 0x015b, 0x02c7, + 0x00b8, 0x0161, 0x015f, 0x0165, + 0x017a, 0x02dd, 0x017e, 0x017c, + /* 0xc0*/ + 0x0154, 0x00c1, 0x00c2, 0x0102, + 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x011a, 0x00cd, 0x00ce, 0x010e, + /* 0xd0*/ + 0x0110, 0x0143, 0x0147, 0x00d3, + 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, + 0x00dc, 0x00dd, 0x0162, 0x00df, + /* 0xe0*/ + 0x0155, 0x00e1, 0x00e2, 0x0103, + 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x011b, 0x00ed, 0x00ee, 0x010f, + /* 0xf0*/ + 0x0111, 0x0144, 0x0148, 0x00f3, + 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, + 0x00fc, 0x00fd, 0x0163, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc3, 0xe3, 0xa1, 0xb1, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xa5, 0xb5, 0x00, /* 0x38-0x3f */ + 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ + 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ + 0xd8, 0xf8, 0xa6, 0xb6, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ + 0xa9, 0xb9, 0xde, 0xfe, 0xab, 0xbb, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ + 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0xac, 0xbc, 0xaf, 0xbf, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 
0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-2", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_2(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_2(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_2) +module_exit(exit_nls_iso8859_2) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c new file mode 100644 index 00000000..a61e0daa --- /dev/null +++ b/fs/nls/nls_iso8859-3.c @@ -0,0 +1,309 @@ +/* + * linux/fs/nls/nls_iso8859-3.c + * + * Charset iso8859-3 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0126, 0x02d8, 0x00a3, + 0x00a4, 0x0000, 0x0124, 0x00a7, + 0x00a8, 0x0130, 0x015e, 0x011e, + 0x0134, 0x00ad, 0x0000, 0x017b, + /* 0xb0*/ + 0x00b0, 0x0127, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x0125, 0x00b7, + 0x00b8, 0x0131, 0x015f, 0x011f, + 0x0135, 0x00bd, 0x0000, 0x017c, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x0000, + 0x00c4, 0x010a, 0x0108, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x0000, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x0120, 0x00d6, 0x00d7, + 0x011c, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x016c, 0x015c, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x0000, + 0x00e4, 0x010b, 0x0109, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x0000, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x0121, 0x00f6, 0x00f7, + 0x011d, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x016d, 0x015d, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0x00, 0xb2, 0xb3, 0xb4, 0xb5, 0x00, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0xc6, 0xe6, 0xc5, 0xe5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xab, 0xbb, /* 0x18-0x1f */ + 0xd5, 0xf5, 0x00, 0x00, 0xa6, 0xb6, 0xa1, 0xb1, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xa9, 0xb9, 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xaa, 0xba, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xdd, 0xfd, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0xaf, 0xbf, 0x00, 0x00, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xa3, 0xa4, 0x00, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x69, 0xba, 0xbb, 0xbc, 0xad, 0x00, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0x00, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xb3, 0xb4, 0x00, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x49, 0xaa, 0xab, 0xac, 0xbd, 0x00, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-3", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_3(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_3(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_3) +module_exit(exit_nls_iso8859_3) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c new file mode 100644 index 00000000..e8ff5554 --- /dev/null +++ b/fs/nls/nls_iso8859-4.c @@ -0,0 +1,309 @@ +/* + * linux/fs/nls/nls_iso8859-4.c + * + * Charset iso8859-4 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0104, 0x0138, 0x0156, + 0x00a4, 0x0128, 0x013b, 0x00a7, + 0x00a8, 0x0160, 0x0112, 0x0122, + 0x0166, 0x00ad, 0x017d, 0x00af, + /* 0xb0*/ + 0x00b0, 0x0105, 0x02db, 0x0157, + 0x00b4, 0x0129, 0x013c, 0x02c7, + 0x00b8, 0x0161, 0x0113, 0x0123, + 0x0167, 0x014a, 0x017e, 0x014b, + /* 0xc0*/ + 0x0100, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x012e, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x0116, 0x00cd, 0x00ce, 0x012a, + /* 0xd0*/ + 0x0110, 0x0145, 0x014c, 0x0136, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x0172, 0x00da, 0x00db, + 0x00dc, 0x0168, 0x016a, 0x00df, + /* 0xe0*/ + 0x0101, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x012f, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x0117, 0x00ed, 0x00ee, 0x012b, + /* 0xf0*/ + 0x0111, 0x0146, 0x014d, 0x0137, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x0173, 0x00fa, 0x00fb, + 0x00fc, 0x0169, 0x016b, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0x00, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0x00, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0x00, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xc0, 0xe0, 0x00, 0x00, 0xa1, 0xb1, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0xd0, 0xf0, 0xaa, 0xba, 0x00, 0x00, 0xcc, 0xec, /* 0x10-0x17 */ + 0xca, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xab, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0xa5, 0xb5, 0xcf, 0xef, 0x00, 0x00, 0xc7, 0xe7, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd3, 0xf3, /* 0x30-0x37 */ + 0xa2, 0x00, 0x00, 0xa6, 0xb6, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xd1, 0xf1, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0xbd, 0xbf, 0xd2, 0xf2, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa3, 0xb3, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0xac, 0xbc, /* 0x60-0x67 */ + 0xdd, 0xfd, 0xde, 0xfe, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xd9, 0xf9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0xff, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-4", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_4(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_4(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_4) +module_exit(exit_nls_iso8859_4) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c new file mode 100644 index 00000000..4721e893 --- /dev/null +++ b/fs/nls/nls_iso8859-5.c @@ -0,0 +1,273 @@ +/* + * linux/fs/nls/nls_iso8859-5.c + * + * Charset iso8859-5 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0401, 0x0402, 0x0403, + 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, + 0x040c, 0x00ad, 0x040e, 0x040f, + /* 0xb0*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0xc0*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xd0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xe0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, + /* 0xf0*/ + 0x2116, 0x0451, 0x0452, 0x0453, + 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045a, 0x045b, + 0x045c, 0x00a7, 0x045e, 0x045f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, /* 0x08-0x0f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0xfe, 0xff, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xa0-0xa7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xad, 0xfe, 0xff, /* 0xa8-0xaf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xb0-0xb7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 
0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xd0-0xd7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xf0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xf0-0xf7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xfd, 0xae, 0xaf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-5", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_5(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_5(void) +{ + unregister_nls(&table); +} + 
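+/*
+ * Minimal usage sketch (assuming the kernel's standard load_nls()/
+ * unload_nls() helpers and the nls_table ops defined above; "name",
+ * "len" and "i" stand for a hypothetical caller's byte buffer and
+ * offset): a filesystem looks the table up by the "iso8859-5" name
+ * registered by this module and converts one character at a time, e.g.
+ *
+ *	struct nls_table *nls = load_nls("iso8859-5");
+ *	unsigned char out[NLS_MAX_CHARSET_SIZE];
+ *	wchar_t uc;
+ *
+ *	if (nls) {
+ *		if (nls->char2uni(&name[i], len - i, &uc) > 0)
+ *			nls->uni2char(uc, out, sizeof(out));
+ *		unload_nls(nls);
+ *	}
+ */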
+module_init(init_nls_iso8859_5) +module_exit(exit_nls_iso8859_5) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c new file mode 100644 index 00000000..01a517d6 --- /dev/null +++ b/fs/nls/nls_iso8859-6.c @@ -0,0 +1,264 @@ +/* + * linux/fs/nls/nls_iso8859-6.c + * + * Charset iso8859-6 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0660, 0x0661, 0x0662, 0x0663, + 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0000, 0x0000, 0x0000, + 0x00a4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x060c, 0x00ad, 0x0000, 0x0000, + /* 0xb0*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x061b, + 0x0000, 0x0000, 0x0000, 0x061f, + /* 0xc0*/ + 0x0000, 0x0621, 0x0622, 0x0623, + 0x0624, 0x0625, 0x0626, 0x0627, + 0x0628, 0x0629, 0x062a, 0x062b, + 0x062c, 0x062d, 0x062e, 0x062f, + /* 0xd0*/ + 0x0630, 0x0631, 0x0632, 0x0633, + 0x0634, 0x0635, 0x0636, 0x0637, + 0x0638, 0x0639, 0x063a, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xe0*/ + 0x0640, 0x0641, 0x0642, 0x0643, + 0x0644, 0x0645, 0x0646, 0x0647, + 0x0648, 0x0649, 0x064a, 0x064b, + 0x064c, 0x064d, 0x064e, 0x064f, + /* 0xf0*/ + 0x0650, 0x0651, 0x0652, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 
0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page06[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x60-0x67 */ + 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, page06, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ 
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-6", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init 
init_nls_iso8859_6(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_6(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_6) +module_exit(exit_nls_iso8859_6) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c new file mode 100644 index 00000000..2d27b93e --- /dev/null +++ b/fs/nls/nls_iso8859-7.c @@ -0,0 +1,318 @@ +/* + * linux/fs/nls/nls_iso8859-7.c + * + * Charset iso8859-7 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x02bd, 0x02bc, 0x00a3, + 0x0000, 0x0000, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x0000, 0x00ab, + 0x00ac, 0x00ad, 0x0000, 0x2015, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x0384, 0x0385, 0x0386, 0x00b7, + 0x0388, 0x0389, 0x038a, 0x00bb, + 0x038c, 0x00bd, 0x038e, 0x038f, + /* 0xc0*/ + 0x0390, 0x0391, 0x0392, 0x0393, + 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, + 0x039c, 0x039d, 0x039e, 0x039f, + /* 0xd0*/ + 0x03a0, 0x03a1, 0x0000, 0x03a3, + 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, + 0x03ac, 0x03ad, 0x03ae, 0x03af, + /* 0xe0*/ + 0x03b0, 0x03b1, 0x03b2, 0x03b3, + 0x03b4, 0x03b5, 0x03b6, 0x03b7, + 0x03b8, 0x03b9, 0x03ba, 0x03bb, + 0x03bc, 0x03bd, 0x03be, 0x03bf, + /* 0xf0*/ + 0x03c0, 0x03c1, 0x03c2, 0x03c3, + 0x03c4, 0x03c5, 0x03c6, 0x03c7, + 0x03c8, 0x03c9, 0x03ca, 0x03cb, + 0x03cc, 0x03cd, 0x03ce, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 
0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0x00, 0x00, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0xa2, 0xa1, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0x00, /* 0x80-0x87 */ + 0xb8, 0xb9, 0xba, 0x00, 0xbc, 0x00, 0xbe, 0xbf, /* 0x88-0x8f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x90-0x97 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x98-0x9f */ + 0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xa0-0xa7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xa8-0xaf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xb0-0xb7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xb8-0xbf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xc0-0xc7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xdc, 0xb7, /* 0xb0-0xb7 */ + 0xdd, 0xde, 0xdf, 0xbb, 0xfc, 0xbd, 0xfd, 0xfe, /* 0xb8-0xbf */ + 0xc0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xb6, 0xb8, 0xb9, 0xba, /* 0xd8-0xdf */ + 0xe0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd3, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xbc, 0xbe, 0xbf, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-7", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_7(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_7(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_7) +module_exit(exit_nls_iso8859_7) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c new file mode 100644 index 
00000000..694bf070 --- /dev/null +++ b/fs/nls/nls_iso8859-9.c @@ -0,0 +1,273 @@ +/* + * linux/fs/nls/nls_iso8859-9.c + * + * Charset iso8859-9 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x011e, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x0130, 0x015e, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x011f, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x0131, 0x015f, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 
0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xdd, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, /* 0x58-0x5f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 
0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x49, 0xde, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; 
+} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-9", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_9(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_9(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_9) +module_exit(exit_nls_iso8859_9) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c new file mode 100644 index 00000000..43875310 --- /dev/null +++ b/fs/nls/nls_koi8-r.c @@ -0,0 +1,324 @@ +/* + * linux/fs/nls/nls_koi8-r.c + * + * Charset koi8-r translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x2500, 0x2502, 0x250c, 0x2510, + 0x2514, 0x2518, 0x251c, 0x2524, + 0x252c, 0x2534, 0x253c, 0x2580, + 0x2584, 0x2588, 0x258c, 0x2590, + /* 0x90*/ + 0x2591, 0x2592, 0x2593, 0x2320, + 0x25a0, 0x2219, 0x221a, 0x2248, + 0x2264, 0x2265, 0x00a0, 0x2321, + 0x00b0, 0x00b2, 0x00b7, 0x00f7, + /* 0xa0*/ + 0x2550, 0x2551, 0x2552, 0x0451, + 0x2553, 0x2554, 0x2555, 0x2556, + 0x2557, 0x2558, 0x2559, 0x255a, + 0x255b, 0x255c, 0x255d, 0x255e, + /* 0xb0*/ + 0x255f, 0x2560, 0x2561, 0x0401, + 0x2562, 0x2563, 0x2564, 0x2565, + 0x2566, 0x2567, 0x2568, 0x2569, + 0x256a, 0x256b, 0x256c, 0x00a9, + /* 0xc0*/ + 0x044e, 0x0430, 0x0431, 0x0446, + 0x0434, 0x0435, 0x0444, 0x0433, + 0x0445, 0x0438, 0x0439, 0x043a, + 0x043b, 0x043c, 0x043d, 0x043e, + /* 0xd0*/ + 0x043f, 0x044f, 0x0440, 0x0441, + 0x0442, 0x0443, 0x0436, 0x0432, + 0x044c, 0x044b, 0x0437, 0x0448, + 0x044d, 0x0449, 0x0447, 0x044a, + /* 0xe0*/ + 0x042e, 0x0410, 0x0411, 0x0426, + 0x0414, 0x0415, 0x0424, 0x0413, + 0x0425, 0x0418, 0x0419, 0x041a, + 0x041b, 0x041c, 0x041d, 0x041e, + /* 0xf0*/ + 0x041f, 0x042f, 0x0420, 0x0421, + 0x0422, 0x0423, 0x0416, 0x0412, + 0x042c, 0x042b, 0x0417, 0x0428, + 0x042d, 0x0429, 0x0427, 0x042a, +}; + +static const 
unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ + 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ + 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ + 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ + 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ + 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ + 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ + 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, /* 0x50-0x57 */ + 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ + 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, /* 0x60-0x67 */ + 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 
0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int 
char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "koi8-r", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_koi8_r(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_koi8_r(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_koi8_r) +module_exit(exit_nls_koi8_r) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c new file mode 100644 index 00000000..e7bc1d75 --- /dev/null +++ b/fs/nls/nls_koi8-ru.c @@ -0,0 +1,83 @@ +/* + * linux/fs/nls/nls_koi8-ru.c + * + * Charset koi8-ru translation based on charset koi8-u. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static struct nls_table *p_nls; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + if (boundlen <= 0) + return -ENAMETOOLONG; + + if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) { + /* koi8-ru and koi8-u differ only on two characters */ + if (uni == 0x040e) + out[0] = 0xbe; + else if (uni == 0x045e) + out[0] = 0xae; + else if (uni == 0x255d || uni == 0x256c) + return 0; + else + return p_nls->uni2char(uni, out, boundlen); + return 1; + } + else + /* fast path */ + return p_nls->uni2char(uni, out, boundlen); +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + int n; + + if ((*rawstring & 0xef) != 0xae) { + /* koi8-ru and koi8-u differ only on two characters */ + *uni = (*rawstring & 0x10) ? 0x040e : 0x045e; + return 1; + } + + n = p_nls->char2uni(rawstring, boundlen, uni); + return n; +} + +static struct nls_table table = { + .charset = "koi8-ru", + .uni2char = uni2char, + .char2uni = char2uni, + .owner = THIS_MODULE, +}; + +static int __init init_nls_koi8_ru(void) +{ + p_nls = load_nls("koi8-u"); + + if (p_nls) { + table.charset2upper = p_nls->charset2upper; + table.charset2lower = p_nls->charset2lower; + return register_nls(&table); + } + + return -EINVAL; +} + +static void __exit exit_nls_koi8_ru(void) +{ + unregister_nls(&table); + unload_nls(p_nls); +} + +module_init(init_nls_koi8_ru) +module_exit(exit_nls_koi8_ru) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c new file mode 100644 index 00000000..8c9f0292 --- /dev/null +++ b/fs/nls/nls_koi8-u.c @@ -0,0 +1,331 @@ +/* + * linux/fs/nls/nls_koi8-u.c + * + * Charset koi8-u translation tables. + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x2500, 0x2502, 0x250c, 0x2510, + 0x2514, 0x2518, 0x251c, 0x2524, + 0x252c, 0x2534, 0x253c, 0x2580, + 0x2584, 0x2588, 0x258c, 0x2590, + /* 0x90*/ + 0x2591, 0x2592, 0x2593, 0x2320, + 0x25a0, 0x2219, 0x221a, 0x2248, + 0x2264, 0x2265, 0x00a0, 0x2321, + 0x00b0, 0x00b2, 0x00b7, 0x00f7, + /* 0xa0*/ + 0x2550, 0x2551, 0x2552, 0x0451, + 0x0454, 0x2554, 0x0456, 0x0457, + 0x2557, 0x2558, 0x2559, 0x255a, + 0x255b, 0x0491, 0x255d, 0x255e, + /* 0xb0*/ + 0x255f, 0x2560, 0x2561, 0x0401, + 0x0404, 0x2563, 0x0406, 0x0407, + 0x2566, 0x2567, 0x2568, 0x2569, + 0x256a, 0x0490, 0x256c, 0x00a9, + /* 0xc0*/ + 0x044e, 0x0430, 0x0431, 0x0446, + 0x0434, 0x0435, 0x0444, 0x0433, + 0x0445, 0x0438, 0x0439, 0x043a, + 0x043b, 0x043c, 0x043d, 0x043e, + /* 0xd0*/ + 0x043f, 0x044f, 0x0440, 0x0441, + 0x0442, 0x0443, 0x0436, 0x0432, + 0x044c, 0x044b, 0x0437, 0x0448, + 0x044d, 0x0449, 0x0447, 0x044a, + /* 0xe0*/ + 0x042e, 0x0410, 0x0411, 0x0426, + 0x0414, 0x0415, 0x0424, 0x0413, + 0x0425, 0x0418, 0x0419, 0x041a, + 0x041b, 0x041c, 0x041d, 0x041e, + /* 0xf0*/ + 0x041f, 0x042f, 0x0420, 0x0421, + 0x0422, 0x0423, 0x0416, 0x0412, + 0x042c, 0x042b, 0x0417, 0x0428, + 0x042d, 0x0429, 0x0427, 0x042a, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xb3, 0x00, 0x00, 0xb4, 0x00, 0xb6, 0xb7, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ + 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ + 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ + 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ + 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ + 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ + 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ + 0x00, 0xa3, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xbd, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x93, 
0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xa0, 0xa1, 0xa2, 0x00, 0xa5, 0x00, 0x00, 0xa8, /* 0x50-0x57 */ + 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ + 0xb1, 0xb2, 0x00, 0xb5, 0x00, 0x00, 0xb8, 0xb9, /* 0x60-0x67 */ + 0xba, 0xbb, 0xbc, 0x00, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xa4, 0xb5, 0xa6, 0xa7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 
0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xb4, 0xa5, 0xb6, 0xb7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "koi8-u", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_koi8_u(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_koi8_u(void) +{ + unregister_nls(&table); +} + 
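The koi8-u module above, like every single-byte nls_*.c table in this patch, resolves the Unicode-to-charset direction with a two-level lookup: the high byte of the code point selects a page array through page_uni2charset[], the low byte indexes into that page, and a zero entry means "no mapping" (uni2char() then returns -EINVAL); the charset-to-Unicode direction is a flat charset2uni[] lookup. A minimal standalone sketch of that scheme, written as ordinary userspace C with two illustrative entries instead of the real generated tables, could look like this:

#include <stdio.h>

/* Illustrative pages only; the real tables are generated from unicode.org data. */
static const unsigned char demo_page00[256] = { [0x41] = 0x41 };  /* U+0041 'A' -> 0x41 */
static const unsigned char demo_page04[256] = { [0x10] = 0xe1 };  /* U+0410 -> 0xe1 in koi8 */

static const unsigned char *const demo_uni2charset[256] = {
	[0x00] = demo_page00,
	[0x04] = demo_page04,		/* all other pages: no mapping */
};

/* Same shape as the kernel uni2char(): a zero table entry means "no mapping". */
static int demo_uni2char(unsigned int uni, unsigned char *out)
{
	const unsigned char *page = demo_uni2charset[(uni >> 8) & 0xff];

	if (page && page[uni & 0xff]) {
		*out = page[uni & 0xff];
		return 1;
	}
	return -1;			/* the kernel code returns -EINVAL here */
}

int main(void)
{
	unsigned char c;

	if (demo_uni2char(0x0410, &c) == 1)
		printf("U+0410 -> 0x%02x\n", c);	/* prints 0xe1 */
	return 0;
}

Compiled on its own, the sketch prints "U+0410 -> 0xe1", which matches the page04[0x10] entry and the charset2uni[0xe1] entry in the tables above.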
+module_init(init_nls_koi8_u)
+module_exit(exit_nls_koi8_u)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
new file mode 100644
index 00000000..0d60a44a
--- /dev/null
+++ b/fs/nls/nls_utf8.c
@@ -0,0 +1,68 @@
+/*
+ * Module for handling utf8 just like any other charset.
+ * By Urban Widmark 2000
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static unsigned char identity[256];
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
+		*out = '?';
+		return -EINVAL;
+	}
+	return n;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	int n;
+	unicode_t u;
+
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
+		*uni = 0x003f;	/* ? */
+		return -EINVAL;
+	}
+	*uni = (wchar_t) u;
+	return n;
+}
+
+static struct nls_table table = {
+	.charset = "utf8",
+	.uni2char = uni2char,
+	.char2uni = char2uni,
+	.charset2lower = identity,	/* no conversion */
+	.charset2upper = identity,
+	.owner = THIS_MODULE,
+};
+
+static int __init init_nls_utf8(void)
+{
+	int i;
+	for (i = 0; i < 256; i++)
+		identity[i] = i;
+
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_utf8(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_utf8)
+module_exit(exit_nls_utf8)
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/no-block.c b/fs/no-block.c
new file mode 100644
index 00000000..6e40e42a
--- /dev/null
+++ b/fs/no-block.c
@@ -0,0 +1,23 @@
+/* no-block.c: implementation of routines required for non-BLOCK configuration
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+static int no_blkdev_open(struct inode * inode, struct file * filp)
+{
+	return -ENODEV;
+}
+
+const struct file_operations def_blk_fops = {
+	.open = no_blkdev_open,
+	.llseek = noop_llseek,
+};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 00000000..22c629ee
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,6 @@
+config FSNOTIFY
+	def_bool n
+
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 00000000..ae5f33a6
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
+ mark.o vfsmount_mark.o
+
+obj-y += dnotify/
+obj-y += inotify/
+obj-y += fanotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 00000000..f9c1ca13
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,11 @@
+config DNOTIFY
+	bool "Dnotify support"
+	select FSNOTIFY
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space. There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
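The DNOTIFY option above gates the kernel half of the old fcntl()-based directory-notification interface, which fs/notify/dnotify/dnotify.c below reimplements on top of fsnotify. As a point of reference, a userspace consumer of that interface looks roughly like the following standalone example (the watched directory "/tmp" and the chosen event mask are only illustrative):

#define _GNU_SOURCE		/* for F_NOTIFY and the DN_* flags */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_event;

static void on_sigio(int sig)
{
	(void)sig;
	got_event = 1;
}

int main(void)
{
	int fd = open("/tmp", O_RDONLY);	/* example directory only */

	if (fd < 0)
		return 1;
	signal(SIGIO, on_sigio);		/* dnotify signals SIGIO by default */
	/* Ask for create and delete events, repeatedly (DN_MULTISHOT). */
	if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MULTISHOT) < 0)
		return 1;
	while (!got_event)
		pause();
	printf("directory contents changed\n");
	close(fd);
	return 0;
}

Each such fcntl(F_NOTIFY) call lands in fcntl_dirnotify() below, which allocates a dnotify_struct for the caller and chains it onto the directory inode's fsnotify mark; DN_MULTISHOT keeps the watch armed after the first SIGIO instead of tearing it down.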
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile new file mode 100644 index 00000000..f145251d --- /dev/null +++ b/fs/notify/dnotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DNOTIFY) += dnotify.o diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c new file mode 100644 index 00000000..3344bdd5 --- /dev/null +++ b/fs/notify/dnotify/dnotify.c @@ -0,0 +1,411 @@ +/* + * Directory notifications for Linux. + * + * Copyright (C) 2000,2001,2002 Stephen Rothwell + * + * Copyright (C) 2009 Eric Paris + * dnotify was largly rewritten to use the new fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int dir_notify_enable __read_mostly = 1; + +static struct kmem_cache *dnotify_struct_cache __read_mostly; +static struct kmem_cache *dnotify_mark_cache __read_mostly; +static struct fsnotify_group *dnotify_group __read_mostly; +static DEFINE_MUTEX(dnotify_mark_mutex); + +/* + * dnotify will attach one of these to each inode (i_fsnotify_marks) which + * is being watched by dnotify. If multiple userspace applications are watching + * the same directory with dnotify their information is chained in dn + */ +struct dnotify_mark { + struct fsnotify_mark fsn_mark; + struct dnotify_struct *dn; +}; + +/* + * When a process starts or stops watching an inode the set of events which + * dnotify cares about for that inode may change. This function runs the + * list of everything receiving dnotify events about this directory and calculates + * the set of all those events. After it updates what dnotify is interested in + * it calls the fsnotify function so it can update the set of all events relevant + * to this inode. + */ +static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) +{ + __u32 new_mask, old_mask; + struct dnotify_struct *dn; + struct dnotify_mark *dn_mark = container_of(fsn_mark, + struct dnotify_mark, + fsn_mark); + + assert_spin_locked(&fsn_mark->lock); + + old_mask = fsn_mark->mask; + new_mask = 0; + for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) + new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); + fsnotify_set_mark_mask_locked(fsn_mark, new_mask); + + if (old_mask == new_mask) + return; + + if (fsn_mark->i.inode) + fsnotify_recalc_inode_mask(fsn_mark->i.inode); +} + +/* + * Mains fsnotify call where events are delivered to dnotify. + * Find the dnotify mark on the relevant inode, run the list of dnotify structs + * on that mark and determine which of them has expressed interest in receiving + * events of this type. When found send the correct process and signal and + * destroy the dnotify struct if it was not registered to receive multiple + * events. 
+ */ +static int dnotify_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) +{ + struct dnotify_mark *dn_mark; + struct inode *to_tell; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct fown_struct *fown; + __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + + BUG_ON(vfsmount_mark); + + to_tell = event->to_tell; + + dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); + + spin_lock(&inode_mark->lock); + prev = &dn_mark->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & test_mask) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & FS_DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(inode_mark); + } + } + + spin_unlock(&inode_mark->lock); + + return 0; +} + +/* + * Given an inode and mask determine if dnotify would be interested in sending + * userspace notification for that pair. + */ +static bool dnotify_should_send_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return false; + + return true; +} + +static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct dnotify_mark *dn_mark = container_of(fsn_mark, + struct dnotify_mark, + fsn_mark); + + BUG_ON(dn_mark->dn); + + kmem_cache_free(dnotify_mark_cache, dn_mark); +} + +static struct fsnotify_ops dnotify_fsnotify_ops = { + .handle_event = dnotify_handle_event, + .should_send_event = dnotify_should_send_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, +}; + +/* + * Called every time a file is closed. Looks first for a dnotify mark on the + * inode. If one is found run all of the ->dn structures attached to that + * mark for one relevant to this process closing the file and remove that + * dnotify_struct. If that was the last dnotify_struct also remove the + * fsnotify_mark. 
+ */ +void dnotify_flush(struct file *filp, fl_owner_t id) +{ + struct fsnotify_mark *fsn_mark; + struct dnotify_mark *dn_mark; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct inode *inode; + + inode = filp->f_path.dentry->d_inode; + if (!S_ISDIR(inode->i_mode)) + return; + + fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + if (!fsn_mark) + return; + dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); + + mutex_lock(&dnotify_mark_mutex); + + spin_lock(&fsn_mark->lock); + prev = &dn_mark->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(fsn_mark); + break; + } + prev = &dn->dn_next; + } + + spin_unlock(&fsn_mark->lock); + + /* nothing else could have found us thanks to the dnotify_mark_mutex */ + if (dn_mark->dn == NULL) + fsnotify_destroy_mark(fsn_mark); + + mutex_unlock(&dnotify_mark_mutex); + + fsnotify_put_mark(fsn_mark); +} + +/* this conversion is done only at watch creation */ +static __u32 convert_arg(unsigned long arg) +{ + __u32 new_mask = FS_EVENT_ON_CHILD; + + if (arg & DN_MULTISHOT) + new_mask |= FS_DN_MULTISHOT; + if (arg & DN_DELETE) + new_mask |= (FS_DELETE | FS_MOVED_FROM); + if (arg & DN_MODIFY) + new_mask |= FS_MODIFY; + if (arg & DN_ACCESS) + new_mask |= FS_ACCESS; + if (arg & DN_ATTRIB) + new_mask |= FS_ATTRIB; + if (arg & DN_RENAME) + new_mask |= FS_DN_RENAME; + if (arg & DN_CREATE) + new_mask |= (FS_CREATE | FS_MOVED_TO); + + return new_mask; +} + +/* + * If multiple processes watch the same inode with dnotify there is only one + * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct + * onto that mark. This function either attaches the new dnotify_struct onto + * that list, or it |= the mask onto an existing dnofiy_struct. + */ +static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, + fl_owner_t id, int fd, struct file *filp, __u32 mask) +{ + struct dnotify_struct *odn; + + odn = dn_mark->dn; + while (odn != NULL) { + /* adding more events to existing dnofiy_struct? */ + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= mask; + return -EEXIST; + } + odn = odn->dn_next; + } + + dn->dn_mask = mask; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + dn->dn_next = dn_mark->dn; + dn_mark->dn = dn; + + return 0; +} + +/* + * When a process calls fcntl to attach a dnotify watch to a directory it ends + * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be + * attached to the fsnotify_mark. 
+ */ +int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +{ + struct dnotify_mark *new_dn_mark, *dn_mark; + struct fsnotify_mark *new_fsn_mark, *fsn_mark; + struct dnotify_struct *dn; + struct inode *inode; + fl_owner_t id = current->files; + struct file *f; + int destroy = 0, error = 0; + __u32 mask; + + /* we use these to tell if we need to kfree */ + new_fsn_mark = NULL; + dn = NULL; + + if (!dir_notify_enable) { + error = -EINVAL; + goto out_err; + } + + /* a 0 mask means we are explicitly removing the watch */ + if ((arg & ~DN_MULTISHOT) == 0) { + dnotify_flush(filp, id); + error = 0; + goto out_err; + } + + /* dnotify only works on directories */ + inode = filp->f_path.dentry->d_inode; + if (!S_ISDIR(inode->i_mode)) { + error = -ENOTDIR; + goto out_err; + } + + /* expect most fcntl to add new rather than augment old */ + dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); + if (!dn) { + error = -ENOMEM; + goto out_err; + } + + /* new fsnotify mark, we expect most fcntl calls to add a new mark */ + new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); + if (!new_dn_mark) { + error = -ENOMEM; + goto out_err; + } + + /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ + mask = convert_arg(arg); + + /* set up the new_fsn_mark and new_dn_mark */ + new_fsn_mark = &new_dn_mark->fsn_mark; + fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + new_fsn_mark->mask = mask; + new_dn_mark->dn = NULL; + + /* this is needed to prevent the fcntl/close race described below */ + mutex_lock(&dnotify_mark_mutex); + + /* add the new_fsn_mark or find an old one. */ + fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + if (fsn_mark) { + dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); + spin_lock(&fsn_mark->lock); + } else { + fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); + spin_lock(&new_fsn_mark->lock); + fsn_mark = new_fsn_mark; + dn_mark = new_dn_mark; + /* we used new_fsn_mark, so don't free it */ + new_fsn_mark = NULL; + } + + rcu_read_lock(); + f = fcheck(fd); + rcu_read_unlock(); + + /* if (f != filp) means that we lost a race and another task/thread + * actually closed the fd we are still playing with before we grabbed + * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the + * only time we clean up the marks we need to get our mark off + * the list. */ + if (f != filp) { + /* if we added ourselves, shoot ourselves, it's possible that + * the flush actually did shoot this fsn_mark. That's fine too + * since multiple calls to destroy_mark is perfectly safe, if + * we found a dn_mark already attached to the inode, just sod + * off silently as the flush at close time dealt with it. + */ + if (dn_mark == new_dn_mark) + destroy = 1; + goto out; + } + + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + if (error) { + /* if we added, we must shoot */ + if (dn_mark == new_dn_mark) + destroy = 1; + goto out; + } + + error = attach_dn(dn, dn_mark, id, fd, filp, mask); + /* !error means that we attached the dn to the dn_mark, so don't free it */ + if (!error) + dn = NULL; + /* -EEXIST means that we didn't add this new dn and used an old one. 
+ * that isn't an error (and the unused dn should be freed) */ + else if (error == -EEXIST) + error = 0; + + dnotify_recalc_inode_mask(fsn_mark); +out: + spin_unlock(&fsn_mark->lock); + + if (destroy) + fsnotify_destroy_mark(fsn_mark); + + mutex_unlock(&dnotify_mark_mutex); + fsnotify_put_mark(fsn_mark); +out_err: + if (new_fsn_mark) + fsnotify_put_mark(new_fsn_mark); + if (dn) + kmem_cache_free(dnotify_struct_cache, dn); + return error; +} + +static int __init dnotify_init(void) +{ + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + + dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); + if (IS_ERR(dnotify_group)) + panic("unable to allocate fsnotify group for dnotify\n"); + return 0; +} + +module_init(dnotify_init) diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig new file mode 100644 index 00000000..7dceff00 --- /dev/null +++ b/fs/notify/fanotify/Kconfig @@ -0,0 +1,26 @@ +config FANOTIFY + bool "Filesystem wide access notification" + select FSNOTIFY + select ANON_INODES + default n + ---help--- + Say Y here to enable fanotify suport. fanotify is a file access + notification system which differs from inotify in that it sends + an open file descriptor to the userspace listener along with + the event. + + If unsure, say Y. + +config FANOTIFY_ACCESS_PERMISSIONS + bool "fanotify permissions checking" + depends on FANOTIFY + depends on SECURITY + default n + ---help--- + Say Y here is you want fanotify listeners to be able to make permissions + decisions concerning filesystem events. This is used by some fanotify + listeners which need to scan files before allowing the system access to + use those files. This is used by some anti-malware vendors and by some + hierarchical storage managent systems. + + If unsure, say N. diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile new file mode 100644 index 00000000..0999213e --- /dev/null +++ b/fs/notify/fanotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c new file mode 100644 index 00000000..f35794b9 --- /dev/null +++ b/fs/notify/fanotify/fanotify.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include /* UINT_MAX */ +#include +#include +#include +#include + +static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +{ + pr_debug("%s: old=%p new=%p\n", __func__, old, new); + + if (old->to_tell == new->to_tell && + old->data_type == new->data_type && + old->tgid == new->tgid) { + switch (old->data_type) { + case (FSNOTIFY_EVENT_PATH): + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; + case (FSNOTIFY_EVENT_NONE): + return true; + default: + BUG(); + }; + } + return false; +} + +/* and the list better be locked by something too! 
*/ +static struct fsnotify_event *fanotify_merge(struct list_head *list, + struct fsnotify_event *event) +{ + struct fsnotify_event_holder *test_holder; + struct fsnotify_event *test_event = NULL; + struct fsnotify_event *new_event; + + pr_debug("%s: list=%p event=%p\n", __func__, list, event); + + + list_for_each_entry_reverse(test_holder, list, event_list) { + if (should_merge(test_holder->event, event)) { + test_event = test_holder->event; + break; + } + } + + if (!test_event) + return NULL; + + fsnotify_get_event(test_event); + + /* if they are exactly the same we are done */ + if (test_event->mask == event->mask) + return test_event; + + /* + * if the refcnt == 2 this is the only queue + * for this event and so we can update the mask + * in place. + */ + if (atomic_read(&test_event->refcnt) == 2) { + test_event->mask |= event->mask; + return test_event; + } + + new_event = fsnotify_clone_event(test_event); + + /* done with test_event */ + fsnotify_put_event(test_event); + + /* couldn't allocate memory, merge was not possible */ + if (unlikely(!new_event)) + return ERR_PTR(-ENOMEM); + + /* build new event and replace it on the list */ + new_event->mask = (test_event->mask | event->mask); + fsnotify_replace_event(test_holder, new_event); + + /* we hold a reference on new_event from clone_event */ + return new_event; +} + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +static int fanotify_get_response_from_access(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + int ret; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + wait_event(group->fanotify_data.access_waitq, event->response || + atomic_read(&group->fanotify_data.bypass_perm)); + + if (!event->response) /* bypass_perm set */ + return 0; + + /* userspace responded, convert to something usable */ + spin_lock(&event->lock); + switch (event->response) { + case FAN_ALLOW: + ret = 0; + break; + case FAN_DENY: + default: + ret = -EPERM; + } + event->response = 0; + spin_unlock(&event->lock); + + pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, + group, event, ret); + + return ret; +} +#endif + +static int fanotify_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + struct fsnotify_event *event) +{ + int ret = 0; + struct fsnotify_event *notify_event = NULL; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); + if (IS_ERR(notify_event)) + return PTR_ERR(notify_event); + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + /* if we merged we need to wait on the new event */ + if (notify_event) + event = notify_event; + ret = fanotify_get_response_from_access(group, event); + } +#endif + + if (notify_event) + fsnotify_put_event(notify_event); + + return ret; +} + +static bool fanotify_should_send_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmnt_mark, + __u32 
event_mask, void *data, int data_type) +{ + __u32 marks_mask, marks_ignored_mask; + struct path *path = data; + + pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " + "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, + inode_mark, vfsmnt_mark, event_mask, data, data_type); + + /* if we don't have enough info to send an event to userspace say no */ + if (data_type != FSNOTIFY_EVENT_PATH) + return false; + + /* sorry, fanotify only gives a damn about files and dirs */ + if (!S_ISREG(path->dentry->d_inode->i_mode) && + !S_ISDIR(path->dentry->d_inode->i_mode)) + return false; + + if (inode_mark && vfsmnt_mark) { + marks_mask = (vfsmnt_mark->mask | inode_mark->mask); + marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); + } else if (inode_mark) { + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! + */ + if ((event_mask & FS_EVENT_ON_CHILD) && + !(inode_mark->mask & FS_EVENT_ON_CHILD)) + return false; + marks_mask = inode_mark->mask; + marks_ignored_mask = inode_mark->ignored_mask; + } else if (vfsmnt_mark) { + marks_mask = vfsmnt_mark->mask; + marks_ignored_mask = vfsmnt_mark->ignored_mask; + } else { + BUG(); + } + + if (S_ISDIR(path->dentry->d_inode->i_mode) && + (marks_ignored_mask & FS_ISDIR)) + return false; + + if (event_mask & marks_mask & ~marks_ignored_mask) + return true; + + return false; +} + +static void fanotify_free_group_priv(struct fsnotify_group *group) +{ + struct user_struct *user; + + user = group->fanotify_data.user; + atomic_dec(&user->fanotify_listeners); + free_uid(user); +} + +const struct fsnotify_ops fanotify_fsnotify_ops = { + .handle_event = fanotify_handle_event, + .should_send_event = fanotify_should_send_event, + .free_group_priv = fanotify_free_group_priv, + .free_event_priv = NULL, + .freeing_mark = NULL, +}; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c new file mode 100644 index 00000000..9fde1c00 --- /dev/null +++ b/fs/notify/fanotify/fanotify_user.c @@ -0,0 +1,891 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 +#define FANOTIFY_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 + +extern const struct fsnotify_ops fanotify_fsnotify_ops; + +static struct kmem_cache *fanotify_mark_cache __read_mostly; +static struct kmem_cache *fanotify_response_event_cache __read_mostly; + +struct fanotify_response_event { + struct list_head list; + __s32 fd; + struct fsnotify_event *event; +}; + +/* + * Get an fsnotify notification event if one exists and is small + * enough to fit in "count". Return an error pointer if the count + * is not large enough. + * + * Called with the group->notification_mutex held. 
+ */ +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + pr_debug("%s: group=%p count=%zd\n", __func__, group, count); + + if (fsnotify_notify_queue_is_empty(group)) + return NULL; + + if (FAN_EVENT_METADATA_LEN > count) + return ERR_PTR(-EINVAL); + + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + return fsnotify_remove_notify_event(group); +} + +static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) +{ + int client_fd; + struct dentry *dentry; + struct vfsmount *mnt; + struct file *new_file; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + client_fd = get_unused_fd(); + if (client_fd < 0) + return client_fd; + + if (event->data_type != FSNOTIFY_EVENT_PATH) { + WARN_ON(1); + put_unused_fd(client_fd); + return -EINVAL; + } + + /* + * we need a new file handle for the userspace program so it can read even if it was + * originally opened O_WRONLY. + */ + dentry = dget(event->path.dentry); + mnt = mntget(event->path.mnt); + /* it's possible this event was an overflow event. in that case dentry and mnt + * are NULL; That's fine, just don't call dentry open */ + if (dentry && mnt) + new_file = dentry_open(dentry, mnt, + group->fanotify_data.f_flags | FMODE_NONOTIFY, + current_cred()); + else + new_file = ERR_PTR(-EOVERFLOW); + if (IS_ERR(new_file)) { + /* + * we still send an event even if we can't open the file. this + * can happen when say tasks are gone and we try to open their + * /proc files or we try to open a WRONLY file like in sysfs + * we just send the errno to userspace since there isn't much + * else we can do. + */ + put_unused_fd(client_fd); + client_fd = PTR_ERR(new_file); + } else { + fd_install(client_fd, new_file); + } + + return client_fd; +} + +static int fill_event_metadata(struct fsnotify_group *group, + struct fanotify_event_metadata *metadata, + struct fsnotify_event *event) +{ + int ret = 0; + + pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, + group, metadata, event); + + metadata->event_len = FAN_EVENT_METADATA_LEN; + metadata->metadata_len = FAN_EVENT_METADATA_LEN; + metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->pid = pid_vnr(event->tgid); + if (unlikely(event->mask & FAN_Q_OVERFLOW)) + metadata->fd = FAN_NOFD; + else { + metadata->fd = create_fd(group, event); + if (metadata->fd < 0) + ret = metadata->fd; + } + + return ret; +} + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, + __s32 fd) +{ + struct fanotify_response_event *re, *return_re = NULL; + + mutex_lock(&group->fanotify_data.access_mutex); + list_for_each_entry(re, &group->fanotify_data.access_list, list) { + if (re->fd != fd) + continue; + + list_del_init(&re->list); + return_re = re; + break; + } + mutex_unlock(&group->fanotify_data.access_mutex); + + pr_debug("%s: found return_re=%p\n", __func__, return_re); + + return return_re; +} + +static int process_access_response(struct fsnotify_group *group, + struct fanotify_response *response_struct) +{ + struct fanotify_response_event *re; + __s32 fd = response_struct->fd; + __u32 response = response_struct->response; + + pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, + fd, response); + /* + * make sure the response is valid, if invalid we do nothing and either + * userspace can send a valid response 
or we will clean it up after the + * timeout + */ + switch (response) { + case FAN_ALLOW: + case FAN_DENY: + break; + default: + return -EINVAL; + } + + if (fd < 0) + return -EINVAL; + + re = dequeue_re(group, fd); + if (!re) + return -ENOENT; + + re->event->response = response; + + wake_up(&group->fanotify_data.access_waitq); + + kmem_cache_free(fanotify_response_event_cache, re); + + return 0; +} + +static int prepare_for_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + struct fanotify_response_event *re; + + if (!(event->mask & FAN_ALL_PERM_EVENTS)) + return 0; + + re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL); + if (!re) + return -ENOMEM; + + re->event = event; + re->fd = fd; + + mutex_lock(&group->fanotify_data.access_mutex); + + if (atomic_read(&group->fanotify_data.bypass_perm)) { + mutex_unlock(&group->fanotify_data.access_mutex); + kmem_cache_free(fanotify_response_event_cache, re); + event->response = FAN_ALLOW; + return 0; + } + + list_add_tail(&re->list, &group->fanotify_data.access_list); + mutex_unlock(&group->fanotify_data.access_mutex); + + return 0; +} + +static void remove_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + struct fanotify_response_event *re; + + if (!(event->mask & FAN_ALL_PERM_EVENTS)) + return; + + re = dequeue_re(group, fd); + if (!re) + return; + + BUG_ON(re->event != event); + + kmem_cache_free(fanotify_response_event_cache, re); + + return; +} +#else +static int prepare_for_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + return 0; +} + +static void remove_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + return; +} +#endif + +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, + char __user *buf) +{ + struct fanotify_event_metadata fanotify_event_metadata; + int fd, ret; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + ret = fill_event_metadata(group, &fanotify_event_metadata, event); + if (ret < 0) + goto out; + + fd = fanotify_event_metadata.fd; + ret = prepare_for_access_response(group, event, fd); + if (ret) + goto out_close_fd; + + ret = -EFAULT; + if (copy_to_user(buf, &fanotify_event_metadata, + fanotify_event_metadata.event_len)) + goto out_kill_access_response; + + return fanotify_event_metadata.event_len; + +out_kill_access_response: + remove_access_response(group, event, fd); +out_close_fd: + if (fd != FAN_NOFD) + sys_close(fd); +out: +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + event->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + } +#endif + return ret; +} + +/* intofiy userspace file descriptor functions */ +static unsigned int fanotify_poll(struct file *file, poll_table *wait) +{ + struct fsnotify_group *group = file->private_data; + int ret = 0; + + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&group->notification_mutex); + + return ret; +} + +static ssize_t fanotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct fsnotify_group *group; + struct fsnotify_event *kevent; + char __user *start; + int ret; + DEFINE_WAIT(wait); + + start = buf; + group = file->private_data; + + pr_debug("%s: group=%p\n", __func__, group); + + while (1) { + 
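As an aside (the kernel read loop continues below): for permission events (FAN_OPEN_PERM/FAN_ACCESS_PERM), prepare_for_access_response() above keeps the event queued until userspace answers by writing a struct fanotify_response back to the fanotify file descriptor; the write() handler later in this file feeds that into process_access_response() shown above. A small sketch of the userspace side, not part of the patch (the helper name is invented; assumes the exported <linux/fanotify.h> header):

#include <stdint.h>
#include <unistd.h>
#include <linux/fanotify.h>

/*
 * Answer one FAN_*_PERM event: event_fd is the fd field from the
 * fanotify_event_metadata record read from fanotify_fd, and verdict is
 * FAN_ALLOW or FAN_DENY; anything else is rejected with -EINVAL.
 */
static int respond(int fanotify_fd, int32_t event_fd, uint32_t verdict)
{
	struct fanotify_response r = {
		.fd = event_fd,
		.response = verdict,
	};

	if (write(fanotify_fd, &r, sizeof(r)) != sizeof(r))
		return -1;
	return 0;
}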
prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); + + if (kevent) { + ret = PTR_ERR(kevent); + if (IS_ERR(kevent)) + break; + ret = copy_event_to_user(group, kevent, buf); + fsnotify_put_event(kevent); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; + } + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + schedule(); + } + + finish_wait(&group->notification_waitq, &wait); + if (start != buf && ret != -EFAULT) + ret = buf - start; + return ret; +} + +static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) +{ +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fsnotify_group *group; + int ret; + + group = file->private_data; + + if (count > sizeof(response)) + count = sizeof(response); + + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + + if (copy_from_user(&response, buf, count)) + return -EFAULT; + + ret = process_access_response(group, &response); + if (ret < 0) + count = ret; + + return count; +#else + return -EINVAL; +#endif +} + +static int fanotify_release(struct inode *ignored, struct file *file) +{ + struct fsnotify_group *group = file->private_data; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response_event *re, *lre; + + mutex_lock(&group->fanotify_data.access_mutex); + + atomic_inc(&group->fanotify_data.bypass_perm); + + list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { + pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, + re, re->event); + + list_del_init(&re->list); + re->event->response = FAN_ALLOW; + + kmem_cache_free(fanotify_response_event_cache, re); + } + mutex_unlock(&group->fanotify_data.access_mutex); + + wake_up(&group->fanotify_data.access_waitq); +#endif + /* matches the fanotify_init->fsnotify_alloc_group */ + fsnotify_put_group(group); + + return 0; +} + +static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct fsnotify_group *group; + struct fsnotify_event_holder *holder; + void __user *p; + int ret = -ENOTTY; + size_t send_len = 0; + + group = file->private_data; + + p = (void __user *) arg; + + switch (cmd) { + case FIONREAD: + mutex_lock(&group->notification_mutex); + list_for_each_entry(holder, &group->notification_list, event_list) + send_len += FAN_EVENT_METADATA_LEN; + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations fanotify_fops = { + .poll = fanotify_poll, + .read = fanotify_read, + .write = fanotify_write, + .fasync = NULL, + .release = fanotify_release, + .unlocked_ioctl = fanotify_ioctl, + .compat_ioctl = fanotify_ioctl, + .llseek = noop_llseek, +}; + +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + +static int fanotify_find_path(int dfd, const char __user *filename, + struct path *path, unsigned int flags) +{ + int ret; + + pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, + dfd, filename, flags); + + if (filename == NULL) { + struct file *file; + int fput_needed; + + ret = -EBADF; + file = fget_light(dfd, &fput_needed); + if (!file) + goto out; + + ret = -ENOTDIR; + if 
((flags & FAN_MARK_ONLYDIR) && + !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) { + fput_light(file, fput_needed); + goto out; + } + + *path = file->f_path; + path_get(path); + fput_light(file, fput_needed); + } else { + unsigned int lookup_flags = 0; + + if (!(flags & FAN_MARK_DONT_FOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (flags & FAN_MARK_ONLYDIR) + lookup_flags |= LOOKUP_DIRECTORY; + + ret = user_path_at(dfd, filename, lookup_flags, path); + if (ret) + goto out; + } + + /* you can only watch an inode if you have read permissions on it */ + ret = inode_permission(path->dentry->d_inode, MAY_READ); + if (ret) + path_put(path); +out: + return ret; +} + +static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, + unsigned int flags) +{ + __u32 oldmask; + + spin_lock(&fsn_mark->lock); + if (!(flags & FAN_MARK_IGNORED_MASK)) { + oldmask = fsn_mark->mask; + fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); + } else { + oldmask = fsn_mark->ignored_mask; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); + } + spin_unlock(&fsn_mark->lock); + + if (!(oldmask & ~mask)) + fsnotify_destroy_mark(fsn_mark); + + return mask & oldmask; +} + +static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark = NULL; + __u32 removed; + + fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + if (!fsn_mark) + return -ENOENT; + + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); + fsnotify_put_mark(fsn_mark); + if (removed & mnt->mnt_fsnotify_mask) + fsnotify_recalc_vfsmount_mask(mnt); + + return 0; +} + +static int fanotify_remove_inode_mark(struct fsnotify_group *group, + struct inode *inode, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark = NULL; + __u32 removed; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) + return -ENOENT; + + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); + /* matches the fsnotify_find_inode_mark() */ + fsnotify_put_mark(fsn_mark); + if (removed & inode->i_fsnotify_mask) + fsnotify_recalc_inode_mask(inode); + + return 0; +} + +static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, + unsigned int flags) +{ + __u32 oldmask = -1; + + spin_lock(&fsn_mark->lock); + if (!(flags & FAN_MARK_IGNORED_MASK)) { + oldmask = fsn_mark->mask; + fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); + } else { + __u32 tmask = fsn_mark->ignored_mask | mask; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + if (flags & FAN_MARK_IGNORED_SURV_MODIFY) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + } + + if (!(flags & FAN_MARK_ONDIR)) { + __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + } + + spin_unlock(&fsn_mark->lock); + + return mask & ~oldmask; +} + +static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark; + __u32 added; + int ret = 0; + + fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + if (!fsn_mark) { + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fsn_mark) + return -ENOMEM; + + fsnotify_init_mark(fsn_mark, fanotify_free_mark); + ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); + if (ret) + goto err; + } + 
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + + if (added & ~mnt->mnt_fsnotify_mask) + fsnotify_recalc_vfsmount_mask(mnt); +err: + fsnotify_put_mark(fsn_mark); + return ret; +} + +static int fanotify_add_inode_mark(struct fsnotify_group *group, + struct inode *inode, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark; + __u32 added; + int ret = 0; + + pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); + + /* + * If some other task has this inode open for write we should not add + * an ignored mark, unless that ignored mark is supposed to survive + * modification changes anyway. + */ + if ((flags & FAN_MARK_IGNORED_MASK) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && + (atomic_read(&inode->i_writecount) > 0)) + return 0; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) { + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fsn_mark) + return -ENOMEM; + + fsnotify_init_mark(fsn_mark, fanotify_free_mark); + ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); + if (ret) + goto err; + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + + if (added & ~inode->i_fsnotify_mask) + fsnotify_recalc_inode_mask(inode); +err: + fsnotify_put_mark(fsn_mark); + return ret; +} + +/* fanotify syscalls */ +SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) +{ + struct fsnotify_group *group; + int f_flags, fd; + struct user_struct *user; + + pr_debug("%s: flags=%d event_f_flags=%d\n", + __func__, flags, event_f_flags); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FAN_ALL_INIT_FLAGS) + return -EINVAL; + + user = get_current_user(); + if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { + free_uid(user); + return -EMFILE; + } + + f_flags = O_RDWR | FMODE_NONOTIFY; + if (flags & FAN_CLOEXEC) + f_flags |= O_CLOEXEC; + if (flags & FAN_NONBLOCK) + f_flags |= O_NONBLOCK; + + /* fsnotify_alloc_group takes a ref. 
Dropped in fanotify_release */ + group = fsnotify_alloc_group(&fanotify_fsnotify_ops); + if (IS_ERR(group)) { + free_uid(user); + return PTR_ERR(group); + } + + group->fanotify_data.user = user; + atomic_inc(&user->fanotify_listeners); + + group->fanotify_data.f_flags = event_f_flags; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + mutex_init(&group->fanotify_data.access_mutex); + init_waitqueue_head(&group->fanotify_data.access_waitq); + INIT_LIST_HEAD(&group->fanotify_data.access_list); + atomic_set(&group->fanotify_data.bypass_perm, 0); +#endif + switch (flags & FAN_ALL_CLASS_BITS) { + case FAN_CLASS_NOTIF: + group->priority = FS_PRIO_0; + break; + case FAN_CLASS_CONTENT: + group->priority = FS_PRIO_1; + break; + case FAN_CLASS_PRE_CONTENT: + group->priority = FS_PRIO_2; + break; + default: + fd = -EINVAL; + goto out_put_group; + } + + if (flags & FAN_UNLIMITED_QUEUE) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->max_events = UINT_MAX; + } else { + group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; + } + + if (flags & FAN_UNLIMITED_MARKS) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->fanotify_data.max_marks = UINT_MAX; + } else { + group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; + } + + fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); + if (fd < 0) + goto out_put_group; + + return fd; + +out_put_group: + fsnotify_put_group(group); + return fd; +} + +SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, + __u64 mask, int dfd, + const char __user * pathname) +{ + struct inode *inode = NULL; + struct vfsmount *mnt = NULL; + struct fsnotify_group *group; + struct file *filp; + struct path path; + int ret, fput_needed; + + pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", + __func__, fanotify_fd, flags, dfd, pathname, mask); + + /* we only use the lower 32 bits as of right now. */ + if (mask & ((__u64)0xffffffff << 32)) + return -EINVAL; + + if (flags & ~FAN_ALL_MARK_FLAGS) + return -EINVAL; + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + case FAN_MARK_ADD: /* fallthrough */ + case FAN_MARK_REMOVE: + if (!mask) + return -EINVAL; + case FAN_MARK_FLUSH: + break; + default: + return -EINVAL; + } + + if (mask & FAN_ONDIR) { + flags |= FAN_MARK_ONDIR; + mask &= ~FAN_ONDIR; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) +#else + if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD)) +#endif + return -EINVAL; + + filp = fget_light(fanotify_fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an fanotify instance */ + ret = -EINVAL; + if (unlikely(filp->f_op != &fanotify_fops)) + goto fput_and_out; + group = filp->private_data; + + /* + * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not + * allowed to set permissions events. 
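As an aside (the fanotify_mark() syscall body continues below): putting the two syscalls implemented here together from the userspace side, a notification-class listener marks a mount and reads events, each of which carries an open file descriptor. This is a sketch only, assuming a libc that exposes fanotify_init()/fanotify_mark() plus the FAN_EVENT_OK/FAN_EVENT_NEXT iteration macros; the watched path is arbitrary. A FAN_CLASS_NOTIF group like this one maps to FS_PRIO_0 and therefore, per the check just below, may not request FAN_OPEN_PERM or FAN_ACCESS_PERM.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(8)));
	ssize_t len;
	int fd;

	/* FAN_CLASS_NOTIF selects FS_PRIO_0 in fanotify_init() above */
	fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF, O_RDONLY);
	if (fd < 0)
		return 1;

	/* watch everything reachable through the mount containing /tmp */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			  FAN_OPEN | FAN_CLOSE_WRITE, AT_FDCWD, "/tmp") < 0)
		return 1;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *md = (void *)buf;

		while (FAN_EVENT_OK(md, len)) {
			printf("mask=%llx pid=%d\n",
			       (unsigned long long)md->mask, (int)md->pid);
			if (md->fd >= 0)
				close(md->fd);	/* every event carries an open fd */
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	return 0;
}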
+ */ + ret = -EINVAL; + if (mask & FAN_ALL_PERM_EVENTS && + group->priority == FS_PRIO_0) + goto fput_and_out; + + ret = fanotify_find_path(dfd, pathname, &path, flags); + if (ret) + goto fput_and_out; + + /* inode held in place by reference to path; group by fget on fd */ + if (!(flags & FAN_MARK_MOUNT)) + inode = path.dentry->d_inode; + else + mnt = path.mnt; + + /* create/update an inode mark */ + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + case FAN_MARK_ADD: + if (flags & FAN_MARK_MOUNT) + ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + else + ret = fanotify_add_inode_mark(group, inode, mask, flags); + break; + case FAN_MARK_REMOVE: + if (flags & FAN_MARK_MOUNT) + ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + else + ret = fanotify_remove_inode_mark(group, inode, mask, flags); + break; + case FAN_MARK_FLUSH: + if (flags & FAN_MARK_MOUNT) + fsnotify_clear_vfsmount_marks_by_group(group); + else + fsnotify_clear_inode_marks_by_group(group); + break; + default: + ret = -EINVAL; + } + + path_put(&path); +fput_and_out: + fput_light(filp, fput_needed); + return ret; +} + +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask, + long dfd, long pathname) +{ + return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags, + mask, (int) dfd, + (const char __user *) pathname); +} +SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); +#endif + +/* + * fanotify_user_setup - Our initialization function. Note that we cannot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). + */ +static int __init fanotify_user_setup(void) +{ + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, + SLAB_PANIC); + + return 0; +} +device_initcall(fanotify_user_setup); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c new file mode 100644 index 00000000..79b47cbb --- /dev/null +++ b/fs/notify/fsnotify.c @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +/* + * Clear all of the marks on an inode when it is being evicted from core + */ +void __fsnotify_inode_delete(struct inode *inode) +{ + fsnotify_clear_marks_by_inode(inode); +} +EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); + +void __fsnotify_vfsmount_delete(struct vfsmount *mnt) +{ + fsnotify_clear_marks_by_mount(mnt); +} + +/* + * Given an inode, first check if we care what happens to our children. Inotify + * and dnotify both tell their parents about events. 
If we care about any event + * on a child we run all of our children and set a dentry flag saying that the + * parent cares. Thus when an event happens on a child it can quickly tell if + * if there is a need to find a parent and send the event to the parent. + */ +void __fsnotify_update_child_dentry_flags(struct inode *inode) +{ + struct dentry *alias; + int watched; + + if (!S_ISDIR(inode->i_mode)) + return; + + /* determine if the children should tell inode about their events */ + watched = fsnotify_inode_watches_children(inode); + + spin_lock(&inode->i_lock); + /* run all of the dentries associated with this inode. Since this is a + * directory, there damn well better only be one item on this list */ + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct dentry *child; + + /* run all of the children of the original inode and fix their + * d_flags to indicate parental interest (their parent is the + * original inode) */ + spin_lock(&alias->d_lock); + list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { + if (!child->d_inode) + continue; + + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (watched) + child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; + else + child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&child->d_lock); + } + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); +} + +/* Notify this dentry's parent about a child's events. */ +int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +{ + struct dentry *parent; + struct inode *p_inode; + int ret = 0; + + if (!dentry) + dentry = path->dentry; + + if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) + return 0; + + parent = dget_parent(dentry); + p_inode = parent->d_inode; + + if (unlikely(!fsnotify_inode_watches_children(p_inode))) + __fsnotify_update_child_dentry_flags(p_inode); + else if (p_inode->i_fsnotify_mask & mask) { + /* we are notifying a parent so come up with the new mask which + * specifies these are events which came from a child. */ + mask |= FS_EVENT_ON_CHILD; + + if (path) + ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, + dentry->d_name.name, 0); + else + ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name, 0); + } + + dput(parent); + + return ret; +} +EXPORT_SYMBOL_GPL(__fsnotify_parent); + +static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, + int data_is, u32 cookie, + const unsigned char *file_name, + struct fsnotify_event **event) +{ + struct fsnotify_group *group = NULL; + __u32 inode_test_mask = 0; + __u32 vfsmount_test_mask = 0; + + if (unlikely(!inode_mark && !vfsmount_mark)) { + BUG(); + return 0; + } + + /* clear ignored on inode modification */ + if (mask & FS_MODIFY) { + if (inode_mark && + !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + inode_mark->ignored_mask = 0; + if (vfsmount_mark && + !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + vfsmount_mark->ignored_mask = 0; + } + + /* does the inode mark tell us to do something? */ + if (inode_mark) { + group = inode_mark->group; + inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); + inode_test_mask &= inode_mark->mask; + inode_test_mask &= ~inode_mark->ignored_mask; + } + + /* does the vfsmount_mark tell us to do something? 
*/ + if (vfsmount_mark) { + vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + group = vfsmount_mark->group; + vfsmount_test_mask &= vfsmount_mark->mask; + vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; + if (inode_mark) + vfsmount_test_mask &= ~inode_mark->ignored_mask; + } + + pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" + " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " data=%p data_is=%d cookie=%d event=%p\n", + __func__, group, to_tell, mnt, mask, inode_mark, + inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + data_is, cookie, *event); + + if (!inode_test_mask && !vfsmount_test_mask) + return 0; + + if (group->ops->should_send_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, + data_is) == false) + return 0; + + if (!*event) { + *event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, + cookie, GFP_KERNEL); + if (!*event) + return -ENOMEM; + } + return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); +} + +/* + * This is the main call to fsnotify. The VFS calls into hook specific functions + * in linux/fsnotify.h. Those functions then in turn call here. Here will call + * out to all of the registered fsnotify_group. Those groups can then use the + * notification event in whatever means they feel necessary. + */ +int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, + const unsigned char *file_name, u32 cookie) +{ + struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; + struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; + struct fsnotify_group *inode_group, *vfsmount_group; + struct fsnotify_event *event = NULL; + struct vfsmount *mnt; + int idx, ret = 0; + /* global tests shouldn't care about events on child only the specific event */ + __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + + if (data_is == FSNOTIFY_EVENT_PATH) + mnt = ((struct path *)data)->mnt; + else + mnt = NULL; + + /* + * if this is a modify event we may need to clear the ignored masks + * otherwise return if neither the inode nor the vfsmount care about + * this type of event. 
+ */ + if (!(mask & FS_MODIFY) && + !(test_mask & to_tell->i_fsnotify_mask) && + !(mnt && test_mask & mnt->mnt_fsnotify_mask)) + return 0; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + + if ((mask & FS_MODIFY) || + (test_mask & to_tell->i_fsnotify_mask)) + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); + + if (mnt && ((mask & FS_MODIFY) || + (test_mask & mnt->mnt_fsnotify_mask))) { + vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, + &fsnotify_mark_srcu); + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); + } + + while (inode_node || vfsmount_node) { + inode_group = vfsmount_group = NULL; + + if (inode_node) { + inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), + struct fsnotify_mark, i.i_list); + inode_group = inode_mark->group; + } + + if (vfsmount_node) { + vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), + struct fsnotify_mark, m.m_list); + vfsmount_group = vfsmount_mark->group; + } + + if (inode_group > vfsmount_group) { + /* handle inode */ + ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, + data_is, cookie, file_name, &event); + /* we didn't use the vfsmount_mark */ + vfsmount_group = NULL; + } else if (vfsmount_group > inode_group) { + ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, + data_is, cookie, file_name, &event); + inode_group = NULL; + } else { + ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, + mask, data, data_is, cookie, file_name, + &event); + } + + if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) + goto out; + + if (inode_group) + inode_node = srcu_dereference(inode_node->next, + &fsnotify_mark_srcu); + if (vfsmount_group) + vfsmount_node = srcu_dereference(vfsmount_node->next, + &fsnotify_mark_srcu); + } + ret = 0; +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + /* + * fsnotify_create_event() took a reference so the event can't be cleaned + * up while we are still trying to add it to lists, drop that one. 
+ */ + if (event) + fsnotify_put_event(event); + + return ret; +} +EXPORT_SYMBOL_GPL(fsnotify); + +static __init int fsnotify_init(void) +{ + int ret; + + BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23); + + ret = init_srcu_struct(&fsnotify_mark_srcu); + if (ret) + panic("initializing fsnotify_mark_srcu"); + + return 0; +} +core_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h new file mode 100644 index 00000000..85e7d2b4 --- /dev/null +++ b/fs/notify/fsnotify.h @@ -0,0 +1,47 @@ +#ifndef __FS_NOTIFY_FSNOTIFY_H_ +#define __FS_NOTIFY_FSNOTIFY_H_ + +#include +#include +#include +#include + +/* destroy all events sitting in this groups notification queue */ +extern void fsnotify_flush_notify(struct fsnotify_group *group); + +/* protects reads of inode and vfsmount marks list */ +extern struct srcu_struct fsnotify_mark_srcu; + +extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, + __u32 mask); +/* add a mark to an inode */ +extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + int allow_dups); +/* add a mark to a vfsmount */ +extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct vfsmount *mnt, + int allow_dups); + +/* final kfree of a group */ +extern void fsnotify_final_destroy_group(struct fsnotify_group *group); + +/* vfsmount specific destruction of a mark */ +extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); +/* inode specific destruction of a mark */ +extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); +/* run the list of all marks associated with inode and flag them to be freed */ +extern void fsnotify_clear_marks_by_inode(struct inode *inode); +/* run the list of all marks associated with vfsmount and flag them to be freed */ +extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* + * update the dentry->d_flags of all of inode's children to indicate if inode cares + * about events that happen to its children. + */ +extern void __fsnotify_update_child_dentry_flags(struct inode *inode); + +/* allocate and destroy and event holder to attach events to notification/access queues */ +extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); +extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); + +#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c new file mode 100644 index 00000000..d309f384 --- /dev/null +++ b/fs/notify/group.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +#include + +/* + * Final freeing of a group + */ +void fsnotify_final_destroy_group(struct fsnotify_group *group) +{ + /* clear the notification queue of all events */ + fsnotify_flush_notify(group); + + if (group->ops->free_group_priv) + group->ops->free_group_priv(group); + + kfree(group); +} + +/* + * Trying to get rid of a group. We need to first get rid of any outstanding + * allocations and then free the group. Remember that fsnotify_clear_marks_by_group + * could miss marks that are being freed by inode and those marks could still + * hold a reference to this group (via group->num_marks) If we get into that + * situtation, the fsnotify_final_destroy_group will get called when that final + * mark is freed. + */ +static void fsnotify_destroy_group(struct fsnotify_group *group) +{ + /* clear all inode marks for this group */ + fsnotify_clear_marks_by_group(group); + + synchronize_srcu(&fsnotify_mark_srcu); + + /* past the point of no return, matches the initial value of 1 */ + if (atomic_dec_and_test(&group->num_marks)) + fsnotify_final_destroy_group(group); +} + +/* + * Drop a reference to a group. Free it if it's through. + */ +void fsnotify_put_group(struct fsnotify_group *group) +{ + if (atomic_dec_and_test(&group->refcnt)) + fsnotify_destroy_group(group); +} + +/* + * Create a new fsnotify_group and hold a reference for the group returned. + */ +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group; + + group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + /* set to 0 when there a no external references to this group */ + atomic_set(&group->refcnt, 1); + /* + * hits 0 when there are no external references AND no marks for + * this group + */ + atomic_set(&group->num_marks, 1); + + mutex_init(&group->notification_mutex); + INIT_LIST_HEAD(&group->notification_list); + init_waitqueue_head(&group->notification_waitq); + group->max_events = UINT_MAX; + + spin_lock_init(&group->mark_lock); + INIT_LIST_HEAD(&group->marks_list); + + group->ops = ops; + + return group; +} diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c new file mode 100644 index 00000000..07ea8d3e --- /dev/null +++ b/fs/notify/inode_mark.c @@ -0,0 +1,315 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +#include "../internal.h" + +/* + * Recalculate the mask of events relevant to a given inode locked. 
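As an aside (inode_mark.c continues below): the group API just shown in group.c (fsnotify_alloc_group() hands back a group holding one reference and an initial num_marks of 1; fsnotify_put_group() drops the reference and tears the group down) is consumed by backends the way dnotify_init() does earlier in this patch. A schematic, non-authoritative skeleton of such a backend, with stub callbacks using the signatures that send_to_group() invokes; the <linux/fsnotify_backend.h> header name is assumed:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fsnotify_backend.h>	/* assumed location of these declarations */

static struct fsnotify_group *example_group;

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_mark *inode_mark,
				struct fsnotify_mark *vfsmount_mark,
				struct fsnotify_event *event)
{
	return 0;		/* consume the event and do nothing */
}

static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *to_tell,
				      struct fsnotify_mark *inode_mark,
				      struct fsnotify_mark *vfsmount_mark,
				      __u32 mask, void *data, int data_type)
{
	return false;		/* a real backend tests its marks' masks here */
}

static const struct fsnotify_ops example_ops = {
	.handle_event		= example_handle_event,
	.should_send_event	= example_should_send_event,
	.free_group_priv	= NULL,
	.free_event_priv	= NULL,
	.freeing_mark		= NULL,
};

static int __init example_init(void)
{
	/* returned with one reference held, like dnotify_group above */
	example_group = fsnotify_alloc_group(&example_ops);
	if (IS_ERR(example_group))
		return PTR_ERR(example_group);
	return 0;
}

static void __exit example_exit(void)
{
	/* drop the initial reference; the group is destroyed once unused */
	fsnotify_put_group(example_group);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");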
+ */ +static void fsnotify_recalc_inode_mask_locked(struct inode *inode) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + __u32 new_mask = 0; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) + new_mask |= mark->mask; + inode->i_fsnotify_mask = new_mask; +} + +/* + * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this inode. + */ +void fsnotify_recalc_inode_mask(struct inode *inode) +{ + spin_lock(&inode->i_lock); + fsnotify_recalc_inode_mask_locked(inode); + spin_unlock(&inode->i_lock); + + __fsnotify_update_child_dentry_flags(inode); +} + +void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) +{ + struct inode *inode = mark->i.inode; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&mark->group->mark_lock); + + spin_lock(&inode->i_lock); + + hlist_del_init_rcu(&mark->i.i_list); + mark->i.inode = NULL; + + /* + * this mark is now off the inode->i_fsnotify_marks list and we + * hold the inode->i_lock, so this is the perfect time to update the + * inode->i_fsnotify_mask + */ + fsnotify_recalc_inode_mask_locked(inode); + + spin_unlock(&inode->i_lock); +} + +/* + * Given an inode, destroy all of the marks associated with that inode. + */ +void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + struct fsnotify_mark *mark, *lmark; + struct hlist_node *pos, *n; + LIST_HEAD(free_list); + + spin_lock(&inode->i_lock); + hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) { + list_add(&mark->i.free_i_list, &free_list); + hlist_del_init_rcu(&mark->i.i_list); + fsnotify_get_mark(mark); + } + spin_unlock(&inode->i_lock); + + list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + } +} + +/* + * Given a group clear all of the inode marks associated with that group. + */ +void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); +} + +/* + * given a group and inode, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + +/* + * given a group and inode, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark *mark; + + spin_lock(&inode->i_lock); + mark = fsnotify_find_inode_mark_locked(group, inode); + spin_unlock(&inode->i_lock); + + return mark; +} + +/* + * If we are setting a mark mask on an inode mark we should pin the inode + * in memory. 
+ */ +void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, + __u32 mask) +{ + struct inode *inode; + + assert_spin_locked(&mark->lock); + + if (mask && + mark->i.inode && + !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { + mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; + inode = igrab(mark->i.inode); + /* + * we shouldn't be able to get here if the inode wasn't + * already safely held in memory. But bug in case it + * ever is wrong. + */ + BUG_ON(!inode); + } +} + +/* + * Attach an initialized mark to a given inode. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group and for which inodes. These + * marks are ordered according to priority, highest number first, and then by + * the group's location in memory. + */ +int fsnotify_add_inode_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + int allow_dups) +{ + struct fsnotify_mark *lmark; + struct hlist_node *node, *last = NULL; + int ret = 0; + + mark->flags |= FSNOTIFY_MARK_FLAG_INODE; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&group->mark_lock); + + spin_lock(&inode->i_lock); + + mark->i.inode = inode; + + /* is mark the first mark? */ + if (hlist_empty(&inode->i_fsnotify_marks)) { + hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks); + goto out; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) { + last = node; + + if ((lmark->group == group) && !allow_dups) { + ret = -EEXIST; + goto out; + } + + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) + continue; + + hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); + goto out; + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_after_rcu(last, &mark->i.i_list); +out: + fsnotify_recalc_inode_mask_locked(inode); + spin_unlock(&inode->i_lock); + + return ret; +} + +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @list: list of inodes being unmounted (sb->s_inodes) + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + */ +void fsnotify_unmount_inodes(struct list_head *list) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inode *need_iput_tmp; + + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + + need_iput_tmp = need_iput; + need_iput = NULL; + + /* In case fsnotify_inode_delete() drops a reference. 
*/ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); + + /* In case the dropping of a reference would nuke next_i. */ + if ((&next_i->i_sb_list != list) && + atomic_read(&next_i->i_count)) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); + } + + /* + * We can safely drop inode_sb_list_lock here because we hold + * references on both inode and next_i. Also no new inodes + * will be added since the umount has begun. + */ + spin_unlock(&inode_sb_list_lock); + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput(inode); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); +} diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig new file mode 100644 index 00000000..b981fc0c --- /dev/null +++ b/fs/notify/inotify/Kconfig @@ -0,0 +1,17 @@ +config INOTIFY_USER + bool "Inotify support for userspace" + select ANON_INODES + select FSNOTIFY + default y + ---help--- + Say Y here to enable inotify support for userspace, including the + associated system calls. Inotify allows monitoring of both files and + directories via a single open fd. Events are read from the file + descriptor, which is also select()- and poll()-able. + Inotify fixes numerous shortcomings in dnotify and introduces several + new features including multiple file events, one-shot support, and + unmount notification. + + For more information, see + + If unsure, say Y. diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile new file mode 100644 index 00000000..a380dabe --- /dev/null +++ b/fs/notify/inotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h new file mode 100644 index 00000000..b6642e4d --- /dev/null +++ b/fs/notify/inotify/inotify.h @@ -0,0 +1,21 @@ +#include +#include +#include /* struct kmem_cache */ + +extern struct kmem_cache *event_priv_cachep; + +struct inotify_event_private_data { + struct fsnotify_event_private_data fsnotify_event_priv_data; + int wd; +}; + +struct inotify_inode_mark { + struct fsnotify_mark fsn_mark; + int wd; +}; + +extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, + struct fsnotify_group *group); +extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); + +extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 00000000..e3cbd746 --- /dev/null +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -0,0 +1,222 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include /* d_unlinked */ +#include /* struct inode */ +#include +#include +#include /* struct path */ +#include /* kmem_* */ +#include +#include + +#include "inotify.h" + +/* + * Check if 2 events contain the same information. We do not compare private data + * but at this moment that isn't a problem for any know fsnotify listeners. + */ +static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +{ + if ((old->mask == new->mask) && + (old->to_tell == new->to_tell) && + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { + switch (old->data_type) { + case (FSNOTIFY_EVENT_INODE): + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (!old->name_len || + !strcmp(old->file_name, new->file_name)) + return true; + break; + case (FSNOTIFY_EVENT_PATH): + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; + break; + case (FSNOTIFY_EVENT_NONE): + if (old->mask & FS_Q_OVERFLOW) + return true; + else if (old->mask & FS_IN_IGNORED) + return false; + return true; + }; + } + return false; +} + +static struct fsnotify_event *inotify_merge(struct list_head *list, + struct fsnotify_event *event) +{ + struct fsnotify_event_holder *last_holder; + struct fsnotify_event *last_event; + + /* and the list better be locked by something too */ + spin_lock(&event->lock); + + last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); + last_event = last_holder->event; + if (event_compare(last_event, event)) + fsnotify_get_event(last_event); + else + last_event = NULL; + + spin_unlock(&event->lock); + + return last_event; +} + +static int inotify_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) +{ + struct inotify_inode_mark *i_mark; + struct inode *to_tell; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + struct fsnotify_event *added_event; + int wd, ret = 0; + + BUG_ON(vfsmount_mark); + + pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, + event, event->to_tell, event->mask); + + to_tell = event->to_tell; + + i_mark = container_of(inode_mark, struct inotify_inode_mark, + fsn_mark); + wd = i_mark->wd; + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + if (unlikely(!event_priv)) + return -ENOMEM; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = wd; + + added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); + if (added_event) { + inotify_free_event_priv(fsn_event_priv); + if (!IS_ERR(added_event)) + fsnotify_put_event(added_event); + else + ret = PTR_ERR(added_event); + } + + if (inode_mark->mask & IN_ONESHOT) + fsnotify_destroy_mark(inode_mark); + + return ret; +} + +static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) +{ + inotify_ignored_and_remove_idr(fsn_mark, group); +} + +static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + 
struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return false; + } + + return true; +} + +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ +static int idr_callback(int id, void *p, void *data) +{ + struct fsnotify_mark *fsn_mark; + struct inotify_inode_mark *i_mark; + static bool warned = false; + + if (warned) + return 0; + + warned = true; + fsn_mark = p; + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (fsn_mark) + printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", + fsn_mark->group, fsn_mark->i.inode, i_mark->wd); + return 0; +} + +static void inotify_free_group_priv(struct fsnotify_group *group) +{ + /* ideally the idr is empty and we won't hit the BUG in the callback */ + idr_for_each(&group->inotify_data.idr, idr_callback, group); + idr_remove_all(&group->inotify_data.idr); + idr_destroy(&group->inotify_data.idr); + atomic_dec(&group->inotify_data.user->inotify_devs); + free_uid(group->inotify_data.user); +} + +void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +{ + struct inotify_event_private_data *event_priv; + + + event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + + kmem_cache_free(event_priv_cachep, event_priv); +} + +const struct fsnotify_ops inotify_fsnotify_ops = { + .handle_event = inotify_handle_event, + .should_send_event = inotify_should_send_event, + .free_group_priv = inotify_free_group_priv, + .free_event_priv = inotify_free_event_priv, + .freeing_mark = inotify_freeing_mark, +}; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c new file mode 100644 index 00000000..8445fbc8 --- /dev/null +++ b/fs/notify/inotify/inotify_user.c @@ -0,0 +1,867 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
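inotify_merge() above only ever compares the incoming event against the most recently queued one (via event_compare()) and reuses that event when they match, so back-to-back identical events collapse into a single queue entry. A simplified sketch of that tail-coalescing idea, with hypothetical types and without the refcounting and locking the real code does:

#include <string.h>

struct ev {
	int wd;
	unsigned int mask;
	const char *name;		/* NULL when the event has no name */
	struct ev *next;
};

static int ev_equal(const struct ev *a, const struct ev *b)
{
	if (a->wd != b->wd || a->mask != b->mask)
		return 0;
	if (!a->name || !b->name)
		return a->name == b->name;
	return strcmp(a->name, b->name) == 0;
}

/* returns 1 if @new duplicated the tail event and was coalesced away */
static int enqueue_or_merge(struct ev **tailp, struct ev *new)
{
	struct ev *tail = *tailp;

	if (tail && ev_equal(tail, new))
		return 1;		/* identical to the last queued event */
	new->next = NULL;
	if (tail)
		tail->next = new;
	*tailp = new;
	return 0;
}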
+ */ + +#include +#include /* struct inode */ +#include +#include +#include /* module_init */ +#include +#include /* roundup() */ +#include /* LOOKUP_FOLLOW */ +#include /* struct user */ +#include /* struct kmem_cache */ +#include +#include +#include +#include +#include +#include + +#include "inotify.h" + +#include + +/* these are configurable via /proc/sys/fs/inotify/ */ +static int inotify_max_user_instances __read_mostly; +static int inotify_max_queued_events __read_mostly; +static int inotify_max_user_watches __read_mostly; + +static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *event_priv_cachep __read_mostly; + +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table inotify_table[] = { + { + .procname = "max_user_instances", + .data = &inotify_max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &inotify_max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_queued_events", + .data = &inotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static inline __u32 inotify_arg_to_mask(u32 arg) +{ + __u32 mask; + + /* + * everything should accept their own ignored, cares about children, + * and should receive events when the inode is unmounted + */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT); + + /* mask off the flags used to open the fd */ + mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + + return mask; +} + +static inline u32 inotify_mask_to_arg(__u32 mask) +{ + return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | + IN_Q_OVERFLOW); +} + +/* intofiy userspace file descriptor functions */ +static unsigned int inotify_poll(struct file *file, poll_table *wait) +{ + struct fsnotify_group *group = file->private_data; + int ret = 0; + + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&group->notification_mutex); + + return ret; +} + +/* + * Get an inotify_kernel_event if one exists and is small + * enough to fit in "count". Return an error pointer if + * not large enough. + * + * Called with the group->notification_mutex held. + */ +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) +{ + size_t event_size = sizeof(struct inotify_event); + struct fsnotify_event *event; + + if (fsnotify_notify_queue_is_empty(group)) + return NULL; + + event = fsnotify_peek_notify_event(group); + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + if (event->name_len) + event_size += roundup(event->name_len + 1, event_size); + + if (event_size > count) + return ERR_PTR(-EINVAL); + + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + fsnotify_remove_notify_event(group); + + return event; +} + +/* + * Copy an event to user space, returning how much we copied. + * + * We already checked that the event size is smaller than the + * buffer we had in "get_one_event()" above. 
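copy_event_to_user() below NUL-terminates the file name and pads it out to a multiple of sizeof(struct inotify_event) (16 bytes: wd, mask, cookie, len), and that padded size is what userspace sees in the len field. A small worked example of the arithmetic, assuming only that fixed 16-byte record size:

#include <stdio.h>

#define EVENT_SIZE 16U	/* sizeof(struct inotify_event): wd + mask + cookie + len */

static unsigned int padded_name_len(unsigned int name_len)
{
	if (!name_len)
		return 0;
	/* name_len + 1 for the '\0', rounded up to a multiple of EVENT_SIZE */
	return ((name_len + 1 + EVENT_SIZE - 1) / EVENT_SIZE) * EVENT_SIZE;
}

int main(void)
{
	/* "foo" -> len = 16, record = 32 bytes; a 17-char name -> len = 32, record = 48 */
	printf("%u %u\n", padded_name_len(3), padded_name_len(17));
	return 0;
}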
+ */ +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, + char __user *buf) +{ + struct inotify_event inotify_event; + struct fsnotify_event_private_data *fsn_priv; + struct inotify_event_private_data *priv; + size_t event_size = sizeof(struct inotify_event); + size_t name_len = 0; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + /* we get the inotify watch descriptor from the event private data */ + spin_lock(&event->lock); + fsn_priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + + if (!fsn_priv) + inotify_event.wd = -1; + else { + priv = container_of(fsn_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + inotify_event.wd = priv->wd; + inotify_free_event_priv(fsn_priv); + } + + /* + * round up event->name_len so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + if (event->name_len) + name_len = roundup(event->name_len + 1, event_size); + inotify_event.len = name_len; + + inotify_event.mask = inotify_mask_to_arg(event->mask); + inotify_event.cookie = event->sync_cookie; + + /* send the main event */ + if (copy_to_user(buf, &inotify_event, event_size)) + return -EFAULT; + + buf += event_size; + + /* + * fsnotify only stores the pathname, so here we have to send the pathname + * and then pad that pathname out to a multiple of sizeof(inotify_event) + * with zeros. I get my zeros from the nul_inotify_event. + */ + if (name_len) { + unsigned int len_to_zero = name_len - event->name_len; + /* copy the path name */ + if (copy_to_user(buf, event->file_name, event->name_len)) + return -EFAULT; + buf += event->name_len; + + /* fill userspace with 0's */ + if (clear_user(buf, len_to_zero)) + return -EFAULT; + buf += len_to_zero; + event_size += name_len; + } + + return event_size; +} + +static ssize_t inotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct fsnotify_group *group; + struct fsnotify_event *kevent; + char __user *start; + int ret; + DEFINE_WAIT(wait); + + start = buf; + group = file->private_data; + + while (1) { + prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); + + pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); + + if (kevent) { + ret = PTR_ERR(kevent); + if (IS_ERR(kevent)) + break; + ret = copy_event_to_user(group, kevent, buf); + fsnotify_put_event(kevent); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; + } + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + ret = -EINTR; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + schedule(); + } + + finish_wait(&group->notification_waitq, &wait); + if (start != buf && ret != -EFAULT) + ret = buf - start; + return ret; +} + +static int inotify_fasync(int fd, struct file *file, int on) +{ + struct fsnotify_group *group = file->private_data; + + return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 
0 : -EIO; +} + +static int inotify_release(struct inode *ignored, struct file *file) +{ + struct fsnotify_group *group = file->private_data; + + pr_debug("%s: group=%p\n", __func__, group); + + fsnotify_clear_marks_by_group(group); + + /* free this group, matching get was inotify_init->fsnotify_obtain_group */ + fsnotify_put_group(group); + + return 0; +} + +static long inotify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fsnotify_group *group; + struct fsnotify_event_holder *holder; + struct fsnotify_event *event; + void __user *p; + int ret = -ENOTTY; + size_t send_len = 0; + + group = file->private_data; + p = (void __user *) arg; + + pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd); + + switch (cmd) { + case FIONREAD: + mutex_lock(&group->notification_mutex); + list_for_each_entry(holder, &group->notification_list, event_list) { + event = holder->event; + send_len += sizeof(struct inotify_event); + if (event->name_len) + send_len += roundup(event->name_len + 1, + sizeof(struct inotify_event)); + } + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations inotify_fops = { + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, + .compat_ioctl = inotify_ioctl, + .llseek = noop_llseek, +}; + + +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; +} + +static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, + int *last_wd, + struct inotify_inode_mark *i_mark) +{ + int ret; + + do { + if (unlikely(!idr_pre_get(idr, GFP_KERNEL))) + return -ENOMEM; + + spin_lock(idr_lock); + ret = idr_get_new_above(idr, i_mark, *last_wd + 1, + &i_mark->wd); + /* we added the mark to the idr, take a reference */ + if (!ret) { + *last_wd = i_mark->wd; + fsnotify_get_mark(&i_mark->fsn_mark); + } + spin_unlock(idr_lock); + } while (ret == -EAGAIN); + + return ret; +} + +static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, + int wd) +{ + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + struct inotify_inode_mark *i_mark; + + assert_spin_locked(idr_lock); + + i_mark = idr_find(idr, wd); + if (i_mark) { + struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark; + + fsnotify_get_mark(fsn_mark); + /* One ref for being in the idr, one ref we just took */ + BUG_ON(atomic_read(&fsn_mark->refcnt) < 2); + } + + return i_mark; +} + +static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, + int wd) +{ + struct inotify_inode_mark *i_mark; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + + spin_lock(idr_lock); + i_mark = inotify_idr_find_locked(group, wd); + spin_unlock(idr_lock); + + return i_mark; +} + +static void do_inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark *i_mark) +{ + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + int wd = i_mark->wd; + + assert_spin_locked(idr_lock); + + idr_remove(idr, 
wd); + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(&i_mark->fsn_mark); +} + +/* + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark *i_mark) +{ + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + struct inotify_inode_mark *found_i_mark = NULL; + int wd; + + spin_lock(idr_lock); + wd = i_mark->wd; + + /* + * does this i_mark think it is in the idr? we shouldn't get called + * if it wasn't.... + */ + if (wd == -1) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + goto out; + } + + /* Lets look in the idr to see if we find it */ + found_i_mark = inotify_idr_find_locked(group, wd); + if (unlikely(!found_i_mark)) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + goto out; + } + + /* + * We found an mark in the idr at the right wd, but it's + * not the mark we were told to remove. eparis seriously + * fucked up somewhere. + */ + if (unlikely(found_i_mark != i_mark)) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " + "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " + "found_i_mark->group=%p found_i_mark->inode=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, + i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, + found_i_mark->fsn_mark.group, + found_i_mark->fsn_mark.i.inode); + goto out; + } + + /* + * One ref for being in the idr + * one ref held by the caller trying to kill us + * one ref grabbed by inotify_idr_find + */ + if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { + printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + /* we can't really recover with bad ref cnting.. */ + BUG(); + } + + do_inotify_remove_from_idr(group, i_mark); +out: + /* match the ref taken by inotify_idr_find_locked() */ + if (found_i_mark) + fsnotify_put_mark(&found_i_mark->fsn_mark); + i_mark->wd = -1; + spin_unlock(idr_lock); +} + +/* + * Send IN_IGNORED for this wd, remove this wd from the idr. 
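The function that follows is what makes watch removal visible to userspace: the last event delivered for a watch descriptor carries IN_IGNORED, whether the watch was dropped explicitly with inotify_rm_watch() or implicitly (the inode going away, unmount). An illustrative userspace sketch of consuming that final event:

#include <sys/inotify.h>
#include <stdio.h>
#include <unistd.h>

/* read until the IN_IGNORED event for @wd arrives, e.g. after inotify_rm_watch() */
static void drain_until_ignored(int fd, int wd)
{
	char buf[4096];
	ssize_t len;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		char *p = buf;

		while (p < buf + len) {
			struct inotify_event *ev = (struct inotify_event *)p;

			if (ev->wd == wd && (ev->mask & IN_IGNORED)) {
				printf("watch %d is gone\n", wd);
				return;
			}
			p += sizeof(*ev) + ev->len;
		}
	}
}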
+ */ +void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, + struct fsnotify_group *group) +{ + struct inotify_inode_mark *i_mark; + struct fsnotify_event *ignored_event, *notify_event; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + int ret; + + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); + if (unlikely(!event_priv)) + goto skip_send_ignore; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = i_mark->wd; + + notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); + if (notify_event) { + if (IS_ERR(notify_event)) + ret = PTR_ERR(notify_event); + else + fsnotify_put_event(notify_event); + inotify_free_event_priv(fsn_event_priv); + } + +skip_send_ignore: + + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + + /* remove this mark from the idr */ + inotify_remove_from_idr(group, i_mark); + + atomic_dec(&group->inotify_data.user->inotify_watches); +} + +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct fsnotify_mark *fsn_mark; + struct inotify_inode_mark *i_mark; + __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!(mask & IN_ALL_EVENTS))) + return -EINVAL; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) + return -ENOENT; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + spin_lock(&fsn_mark->lock); + + old_mask = fsn_mark->mask; + if (add) + fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + else + fsnotify_set_mark_mask_locked(fsn_mark, mask); + new_mask = fsn_mark->mask; + + spin_unlock(&fsn_mark->lock); + + if (old_mask != new_mask) { + /* more bits in old than in new? */ + int dropped = (old_mask & ~new_mask); + /* more bits in this fsn_mark than the inode's mask? 
*/ + int do_inode = (new_mask & ~inode->i_fsnotify_mask); + + /* update the inode with this new fsn_mark */ + if (dropped || do_inode) + fsnotify_recalc_inode_mask(inode); + + } + + /* return the wd */ + ret = i_mark->wd; + + /* match the get from fsnotify_find_mark() */ + fsnotify_put_mark(fsn_mark); + + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark *tmp_i_mark; + __u32 mask; + int ret; + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!(mask & IN_ALL_EVENTS))) + return -EINVAL; + + tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_i_mark)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + tmp_i_mark->fsn_mark.mask = mask; + tmp_i_mark->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; + + ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd, + tmp_i_mark); + if (ret) + goto out_err; + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_i_mark); + goto out_err; + } + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new mark */ + ret = tmp_i_mark->wd; + +out_err: + /* match the ref from fsnotify_init_mark() */ + fsnotify_put_mark(&tmp_i_mark->fsn_mark); + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + +retry: + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + /* + * inotify_new_watch could race with another thread which did an + * inotify_new_watch between the update_existing and the add watch + * here, go back and try to update an existing mark again. + */ + if (ret == -EEXIST) + goto retry; + + return ret; +} + +static struct fsnotify_group *inotify_new_group(unsigned int max_events) +{ + struct fsnotify_group *group; + + group = fsnotify_alloc_group(&inotify_fsnotify_ops); + if (IS_ERR(group)) + return group; + + group->max_events = max_events; + + spin_lock_init(&group->inotify_data.idr_lock); + idr_init(&group->inotify_data.idr); + group->inotify_data.last_wd = 0; + group->inotify_data.fa = NULL; + group->inotify_data.user = get_current_user(); + + if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > + inotify_max_user_instances) { + fsnotify_put_group(group); + return ERR_PTR(-EMFILE); + } + + return group; +} + + +/* inotify syscalls */ +SYSCALL_DEFINE1(inotify_init1, int, flags) +{ + struct fsnotify_group *group; + int ret; + + /* Check the IN_* constants for consistency. 
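Seen from userspace, inotify_update_existing_watch() above gives inotify_add_watch() its update semantics: a second call on the same inode returns the same watch descriptor and replaces the mask, unless IN_MASK_ADD is set, in which case the new bits are OR-ed into the existing mask. An illustrative snippet (the path and masks are arbitrary):

#include <sys/inotify.h>

static void update_watch_example(int fd)
{
	/* first watch on /tmp: reports IN_CREATE */
	int wd1 = inotify_add_watch(fd, "/tmp", IN_CREATE);
	/* same inode, no IN_MASK_ADD: wd2 == wd1, mask replaced, now only IN_DELETE */
	int wd2 = inotify_add_watch(fd, "/tmp", IN_DELETE);
	/* IN_MASK_ADD: wd3 == wd1, mask extended to IN_DELETE | IN_MODIFY */
	int wd3 = inotify_add_watch(fd, "/tmp", IN_MODIFY | IN_MASK_ADD);

	(void)wd1; (void)wd2; (void)wd3;
}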
*/ + BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); + + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) + return -EINVAL; + + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + group = inotify_new_group(inotify_max_queued_events); + if (IS_ERR(group)) + return PTR_ERR(group); + + ret = anon_inode_getfd("inotify", &inotify_fops, group, + O_RDONLY | flags); + if (ret < 0) + fsnotify_put_group(group); + + return ret; +} + +SYSCALL_DEFINE0(inotify_init) +{ + return sys_inotify_init1(0); +} + +SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, + u32, mask) +{ + struct fsnotify_group *group; + struct inode *inode; + struct path path; + struct file *filp; + int ret, fput_needed; + unsigned flags = 0; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(filp->f_op != &inotify_fops)) { + ret = -EINVAL; + goto fput_and_out; + } + + if (!(mask & IN_DONT_FOLLOW)) + flags |= LOOKUP_FOLLOW; + if (mask & IN_ONLYDIR) + flags |= LOOKUP_DIRECTORY; + + ret = inotify_find_inode(pathname, &path, flags); + if (ret) + goto fput_and_out; + + /* inode held in place by reference to path; group by fget on fd */ + inode = path.dentry->d_inode; + group = filp->private_data; + + /* create/update an inode mark */ + ret = inotify_update_watch(group, inode, mask); + path_put(&path); +fput_and_out: + fput_light(filp, fput_needed); + return ret; +} + +SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) +{ + struct fsnotify_group *group; + struct inotify_inode_mark *i_mark; + struct file *filp; + int ret = 0, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + ret = -EINVAL; + if (unlikely(filp->f_op != &inotify_fops)) + goto out; + + group = filp->private_data; + + ret = -EINVAL; + i_mark = inotify_idr_find(group, wd); + if (unlikely(!i_mark)) + goto out; + + ret = 0; + + fsnotify_destroy_mark(&i_mark->fsn_mark); + + /* match ref taken by inotify_idr_find */ + fsnotify_put_mark(&i_mark->fsn_mark); + +out: + fput_light(filp, fput_needed); + return ret; +} + +/* + * inotify_user_setup - Our initialization function. Note that we cannot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). 
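The setup code that follows installs the default limits (128 instances, 8192 watches per user, 16384 queued events), and the sysctl table registered earlier exposes them under /proc/sys/fs/inotify/. Exceeding them shows up in userspace as EMFILE from inotify_init() or ENOSPC from inotify_add_watch(). A small illustrative reader of the current values, using the paths as exported by this patch:

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"max_user_instances", "max_user_watches", "max_queued_events",
	};
	char path[128];
	int i, val;

	for (i = 0; i < 3; i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/fs/inotify/%s", names[i]);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%d", &val) == 1)
				printf("%s = %d\n", names[i], val);
			fclose(f);
		}
	}
	return 0;
}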
+ */ +static int __init inotify_user_setup(void) +{ + BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); + BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(IN_OPEN != FS_OPEN); + BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(IN_CREATE != FS_CREATE); + BUILD_BUG_ON(IN_DELETE != FS_DELETE); + BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); + BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); + BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); + BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); + BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); + BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); + + BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); + + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); + + inotify_max_queued_events = 16384; + inotify_max_user_instances = 128; + inotify_max_user_watches = 8192; + + return 0; +} +module_init(inotify_user_setup); diff --git a/fs/notify/mark.c b/fs/notify/mark.c new file mode 100644 index 00000000..42ed1957 --- /dev/null +++ b/fs/notify/mark.c @@ -0,0 +1,372 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * fsnotify inode mark locking/lifetime/and refcnting + * + * REFCNT: + * The mark->refcnt tells how many "things" in the kernel currently are + * referencing this object. The object typically will live inside the kernel + * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task + * which can find this object holding the appropriete locks, can take a reference + * and the object itself is guaranteed to survive until the reference is dropped. + * + * LOCKING: + * There are 3 spinlocks involved with fsnotify inode marks and they MUST + * be taken in order as follows: + * + * mark->lock + * group->mark_lock + * inode->i_lock + * + * mark->lock protects 2 things, mark->group and mark->inode. You must hold + * that lock to dereference either of these things (they could be NULL even with + * the lock) + * + * group->mark_lock protects the marks_list anchored inside a given group + * and each mark is hooked via the g_list. It also sorta protects the + * free_g_list, which when used is anchored by a private list on the stack of the + * task which held the group->mark_lock. + * + * inode->i_lock protects the i_fsnotify_marks list anchored inside a + * given inode and each mark is hooked via the i_list. (and sorta the + * free_i_list) + * + * + * LIFETIME: + * Inode marks survive between when they are added to an inode and when their + * refcnt==0. 
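The locking rules spelled out in the comment above boil down to one fixed order: mark->lock, then group->mark_lock, then inode->i_lock. A self-contained userspace sketch of that ordering with pthread mutexes and hypothetical types; the real code uses spinlocks, but the deadlock-avoidance argument is the same:

#include <pthread.h>

struct group { pthread_mutex_t mark_lock; };
struct inode { pthread_mutex_t i_lock; };
struct mark {
	pthread_mutex_t lock;
	struct group *group;
	struct inode *inode;
};

static void detach_mark(struct mark *m)
{
	pthread_mutex_lock(&m->lock);			/* 1: mark->lock */
	pthread_mutex_lock(&m->group->mark_lock);	/* 2: group->mark_lock */
	pthread_mutex_lock(&m->inode->i_lock);		/* 3: inode->i_lock */

	/* ... unhook the mark from the group's and the inode's lists ... */

	pthread_mutex_unlock(&m->inode->i_lock);
	pthread_mutex_unlock(&m->group->mark_lock);
	pthread_mutex_unlock(&m->lock);
}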
+ * + * The inode mark can be cleared for a number of different reasons including: + * - The inode is unlinked for the last time. (fsnotify_inode_remove) + * - The inode is being evicted from cache. (fsnotify_inode_delete) + * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) + * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) + * - The fsnotify_group associated with the mark is going away and all such marks + * need to be cleaned up. (fsnotify_clear_marks_by_group) + * + * Worst case we are given an inode and need to clean up all the marks on that + * inode. We take i_lock and walk the i_fsnotify_marks safely. For each + * mark on the list we take a reference (so the mark can't disappear under us). + * We remove that mark form the inode's list of marks and we add this mark to a + * private list anchored on the stack using i_free_list; At this point we no + * longer fear anything finding the mark using the inode's list of marks. + * + * We can safely and locklessly run the private list on the stack of everything + * we just unattached from the original inode. For each mark on the private list + * we grab the mark-> and can thus dereference mark->group and mark->inode. If + * we see the group and inode are not NULL we take those locks. Now holding all + * 3 locks we can completely remove the mark from other tasks finding it in the + * future. Remember, 10 things might already be referencing this mark, but they + * better be holding a ref. We drop our reference we took before we unhooked it + * from the inode. When the ref hits 0 we can free the mark. + * + * Very similarly for freeing by group, except we use free_g_list. + * + * This has the very interesting property of being able to run concurrently with + * any (or all) other directions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +struct srcu_struct fsnotify_mark_srcu; +static DEFINE_SPINLOCK(destroy_lock); +static LIST_HEAD(destroy_list); +static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); + +void fsnotify_get_mark(struct fsnotify_mark *mark) +{ + atomic_inc(&mark->refcnt); +} + +void fsnotify_put_mark(struct fsnotify_mark *mark) +{ + if (atomic_dec_and_test(&mark->refcnt)) + mark->free_mark(mark); +} + +/* + * Any time a mark is getting freed we end up here. + * The caller had better be holding a reference to this mark so we don't actually + * do the final put under the mark->lock + */ +void fsnotify_destroy_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group; + struct inode *inode = NULL; + + spin_lock(&mark->lock); + + group = mark->group; + + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + + spin_lock(&group->mark_lock); + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = mark->i.inode; + fsnotify_destroy_inode_mark(mark); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) + fsnotify_destroy_vfsmount_mark(mark); + else + BUG(); + + list_del_init(&mark->g_list); + + spin_unlock(&group->mark_lock); + spin_unlock(&mark->lock); + + spin_lock(&destroy_lock); + list_add(&mark->destroy_list, &destroy_list); + spin_unlock(&destroy_lock); + wake_up(&destroy_waitq); + + /* + * Some groups like to know that marks are being freed. 
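fsnotify_get_mark()/fsnotify_put_mark() above are the plain reference-count pattern with a destructor supplied at init time, so whichever backend embeds the mark gets to free its containing object on the final put. A generic userspace sketch of that pattern (hypothetical names, C11 atomics standing in for the kernel's atomic_t):

#include <stdatomic.h>

struct obj {
	atomic_int refcnt;
	void (*free_obj)(struct obj *obj);
};

static void obj_init(struct obj *obj, void (*free_obj)(struct obj *))
{
	atomic_init(&obj->refcnt, 1);	/* the creator holds the first reference */
	obj->free_obj = free_obj;
}

static void obj_get(struct obj *obj)
{
	atomic_fetch_add(&obj->refcnt, 1);
}

static void obj_put(struct obj *obj)
{
	if (atomic_fetch_sub(&obj->refcnt, 1) == 1)
		obj->free_obj(obj);	/* last reference dropped: destructor frees it */
}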
This is a + * callback to the group function to let it know that this mark + * is being freed. + */ + if (group->ops->freeing_mark) + group->ops->freeing_mark(mark, group); + + /* + * __fsnotify_update_child_dentry_flags(inode); + * + * I really want to call that, but we can't, we have no idea if the inode + * still exists the second we drop the mark->lock. + * + * The next time an event arrive to this inode from one of it's children + * __fsnotify_parent will see that the inode doesn't care about it's + * children and will update all of these flags then. So really this + * is just a lazy update (and could be a perf win...) + */ + + if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) + iput(inode); + + /* + * We don't necessarily have a ref on mark from caller so the above iput + * may have already destroyed it. Don't touch from now on. + */ + + /* + * it's possible that this group tried to destroy itself, but this + * this mark was simultaneously being freed by inode. If that's the + * case, we finish freeing the group here. + */ + if (unlikely(atomic_dec_and_test(&group->num_marks))) + fsnotify_final_destroy_group(group); +} + +void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) +{ + assert_spin_locked(&mark->lock); + + mark->mask = mask; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) + fsnotify_set_inode_mark_mask_locked(mark, mask); +} + +void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) +{ + assert_spin_locked(&mark->lock); + + mark->ignored_mask = mask; +} + +/* + * Attach an initialized mark to a given group and fs object. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group. + */ +int fsnotify_add_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + struct vfsmount *mnt, int allow_dups) +{ + int ret = 0; + + BUG_ON(inode && mnt); + BUG_ON(!inode && !mnt); + + /* + * LOCKING ORDER!!!! 
+ * mark->lock + * group->mark_lock + * inode->i_lock + */ + spin_lock(&mark->lock); + spin_lock(&group->mark_lock); + + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + + mark->group = group; + list_add(&mark->g_list, &group->marks_list); + atomic_inc(&group->num_marks); + fsnotify_get_mark(mark); /* for i_list and g_list */ + + if (inode) { + ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); + if (ret) + goto err; + } else if (mnt) { + ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); + if (ret) + goto err; + } else { + BUG(); + } + + spin_unlock(&group->mark_lock); + + /* this will pin the object if appropriate */ + fsnotify_set_mark_mask_locked(mark, mark->mask); + + spin_unlock(&mark->lock); + + if (inode) + __fsnotify_update_child_dentry_flags(inode); + + return ret; +err: + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + list_del_init(&mark->g_list); + mark->group = NULL; + atomic_dec(&group->num_marks); + + spin_unlock(&group->mark_lock); + spin_unlock(&mark->lock); + + spin_lock(&destroy_lock); + list_add(&mark->destroy_list, &destroy_list); + spin_unlock(&destroy_lock); + wake_up(&destroy_waitq); + + return ret; +} + +/* + * clear any marks in a group in which mark->flags & flags is true + */ +void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, + unsigned int flags) +{ + struct fsnotify_mark *lmark, *mark; + LIST_HEAD(free_list); + + spin_lock(&group->mark_lock); + list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { + if (mark->flags & flags) { + list_add(&mark->free_g_list, &free_list); + list_del_init(&mark->g_list); + fsnotify_get_mark(mark); + } + } + spin_unlock(&group->mark_lock); + + list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) { + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + } +} + +/* + * Given a group, destroy all of the marks associated with that group. + */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); +} + +void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) +{ + assert_spin_locked(&old->lock); + new->i.inode = old->i.inode; + new->m.mnt = old->m.mnt; + new->group = old->group; + new->mask = old->mask; + new->free_mark = old->free_mark; +} + +/* + * Nothing fancy, just initialize lists and locks and counters. 
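fsnotify_clear_marks_by_group_flags() above uses a common teardown shape: while holding the lock, move every matching entry onto a private list, then drop the lock and do the expensive destruction from that private list. A userspace sketch of the shape, with hypothetical types and a mutex standing in for the group's spinlock:

#include <pthread.h>

struct node {
	unsigned int flags;
	struct node *next;
};

static void clear_matching(pthread_mutex_t *lock, struct node **head,
			   unsigned int flags, void (*destroy)(struct node *))
{
	struct node *keep = NULL, *doomed = NULL, *n, *next;

	pthread_mutex_lock(lock);
	for (n = *head; n; n = next) {
		next = n->next;
		if (n->flags & flags) {
			n->next = doomed;	/* unhook onto the private list */
			doomed = n;
		} else {
			n->next = keep;
			keep = n;
		}
	}
	*head = keep;
	pthread_mutex_unlock(lock);

	for (n = doomed; n; n = next) {		/* teardown runs without the lock */
		next = n->next;
		destroy(n);
	}
}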
+ */ +void fsnotify_init_mark(struct fsnotify_mark *mark, + void (*free_mark)(struct fsnotify_mark *mark)) +{ + memset(mark, 0, sizeof(*mark)); + spin_lock_init(&mark->lock); + atomic_set(&mark->refcnt, 1); + mark->free_mark = free_mark; +} + +static int fsnotify_mark_destroy(void *ignored) +{ + struct fsnotify_mark *mark, *next; + LIST_HEAD(private_destroy_list); + + for (;;) { + spin_lock(&destroy_lock); + /* exchange the list head */ + list_replace_init(&destroy_list, &private_destroy_list); + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + + list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { + list_del_init(&mark->destroy_list); + fsnotify_put_mark(mark); + } + + wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); + } + + return 0; +} + +static int __init fsnotify_mark_init(void) +{ + struct task_struct *thread; + + thread = kthread_run(fsnotify_mark_destroy, NULL, + "fsnotify_mark"); + if (IS_ERR(thread)) + panic("unable to start fsnotify mark destruction thread."); + + return 0; +} +device_initcall(fsnotify_mark_init); diff --git a/fs/notify/notification.c b/fs/notify/notification.c new file mode 100644 index 00000000..f39260f8 --- /dev/null +++ b/fs/notify/notification.c @@ -0,0 +1,464 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Basic idea behind the notification queue: An fsnotify group (like inotify) + * sends the userspace notification about events asyncronously some time after + * the event happened. When inotify gets an event it will need to add that + * event to the group notify queue. Since a single event might need to be on + * multiple group's notification queues we can't add the event directly to each + * queue and instead add a small "event_holder" to each queue. This event_holder + * has a pointer back to the original event. Since the majority of events are + * going to end up on one, and only one, notification queue we embed one + * event_holder into each event. This means we have a single allocation instead + * of always needing two. If the embedded event_holder is already in use by + * another group a new event_holder (from fsnotify_event_holder_cachep) will be + * allocated and used. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +static struct kmem_cache *fsnotify_event_cachep; +static struct kmem_cache *fsnotify_event_holder_cachep; +/* + * This is a magic event we send when the q is too full. Since it doesn't + * hold real event information we just keep one system wide and use it any time + * it is needed. 
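The introductory comment in this file explains why each event embeds one event_holder: the common case of an event landing on exactly one group's queue then needs no extra allocation, and a separate holder is allocated only when the embedded one is already in use. A simplified userspace sketch of that idea (hypothetical types; the real code detects "in use" via list_empty() on the embedded holder and serializes on the queue mutex):

#include <stdlib.h>

struct event;

struct holder {
	struct event *event;		/* NULL while the embedded holder is unused */
	struct holder *next;
};

struct event {
	struct holder embedded;		/* zeroed when the event is allocated */
	int refcnt;
};

/* return the holder to link @ev into a queue with, or NULL on allocation failure */
static struct holder *holder_for(struct event *ev)
{
	struct holder *h;

	if (!ev->embedded.event) {
		ev->embedded.event = ev;	/* common case: reuse the embedded hook */
		return &ev->embedded;
	}
	h = malloc(sizeof(*h));			/* event already queued elsewhere */
	if (h)
		h->event = ev;
	return h;
}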
It's refcnt is set 1 at kernel init time and will never + * get set to 0 so it will never get 'freed' + */ +static struct fsnotify_event *q_overflow_event; +static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); + +/** + * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. + * Called from fsnotify_move, which is inlined into filesystem modules. + */ +u32 fsnotify_get_cookie(void) +{ + return atomic_inc_return(&fsnotify_sync_cookie); +} +EXPORT_SYMBOL_GPL(fsnotify_get_cookie); + +/* return true if the notify queue is empty, false otherwise */ +bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + return list_empty(&group->notification_list) ? true : false; +} + +void fsnotify_get_event(struct fsnotify_event *event) +{ + atomic_inc(&event->refcnt); +} + +void fsnotify_put_event(struct fsnotify_event *event) +{ + if (!event) + return; + + if (atomic_dec_and_test(&event->refcnt)) { + pr_debug("%s: event=%p\n", __func__, event); + + if (event->data_type == FSNOTIFY_EVENT_PATH) + path_put(&event->path); + + BUG_ON(!list_empty(&event->private_data_list)); + + kfree(event->file_name); + put_pid(event->tgid); + kmem_cache_free(fsnotify_event_cachep, event); + } +} + +struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) +{ + return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); +} + +void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) +{ + if (holder) + kmem_cache_free(fsnotify_event_holder_cachep, holder); +} + +/* + * Find the private data that the group previously attached to this event when + * the group added the event to the notification queue (fsnotify_add_notify_event) + */ +struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_event_private_data *lpriv; + struct fsnotify_event_private_data *priv = NULL; + + assert_spin_locked(&event->lock); + + list_for_each_entry(lpriv, &event->private_data_list, event_list) { + if (lpriv->group == group) { + priv = lpriv; + list_del(&priv->event_list); + break; + } + } + return priv; +} + +/* + * Add an event to the group notification queue. The group can later pull this + * event off the queue to deal with. If the event is successfully added to the + * group's notification queue, a reference is taken on event. + */ +struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, + struct fsnotify_event_private_data *priv, + struct fsnotify_event *(*merge)(struct list_head *, + struct fsnotify_event *)) +{ + struct fsnotify_event *return_event = NULL; + struct fsnotify_event_holder *holder = NULL; + struct list_head *list = &group->notification_list; + + pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); + + /* + * There is one fsnotify_event_holder embedded inside each fsnotify_event. + * Check if we expect to be able to use that holder. If not alloc a new + * holder. + * For the overflow event it's possible that something will use the in + * event holder before we get the lock so we may need to jump back and + * alloc a new holder, this can't happen for most events... 
+ */ + if (!list_empty(&event->holder.event_list)) { +alloc_holder: + holder = fsnotify_alloc_event_holder(); + if (!holder) + return ERR_PTR(-ENOMEM); + } + + mutex_lock(&group->notification_mutex); + + if (group->q_len >= group->max_events) { + event = q_overflow_event; + + /* + * we need to return the overflow event + * which means we need a ref + */ + fsnotify_get_event(event); + return_event = event; + + /* sorry, no private data on the overflow event */ + priv = NULL; + } + + if (!list_empty(list) && merge) { + struct fsnotify_event *tmp; + + tmp = merge(list, event); + if (tmp) { + mutex_unlock(&group->notification_mutex); + + if (return_event) + fsnotify_put_event(return_event); + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + return tmp; + } + } + + spin_lock(&event->lock); + + if (list_empty(&event->holder.event_list)) { + if (unlikely(holder)) + fsnotify_destroy_event_holder(holder); + holder = &event->holder; + } else if (unlikely(!holder)) { + /* between the time we checked above and got the lock the in + * event holder was used, go back and get a new one */ + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + + if (return_event) { + fsnotify_put_event(return_event); + return_event = NULL; + } + + goto alloc_holder; + } + + group->q_len++; + holder->event = event; + + fsnotify_get_event(event); + list_add_tail(&holder->event_list, list); + if (priv) + list_add_tail(&priv->event_list, &event->private_data_list); + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + + wake_up(&group->notification_waitq); + return return_event; +} + +/* + * Remove and return the first event from the notification list. There is a + * reference held on this event since it was on the list. It is the responsibility + * of the caller to drop this reference. + */ +struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_holder *holder; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + pr_debug("%s: group=%p\n", __func__, group); + + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + + event = holder->event; + + spin_lock(&event->lock); + holder->event = NULL; + list_del_init(&holder->event_list); + spin_unlock(&event->lock); + + /* event == holder means we are referenced through the in event holder */ + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + + group->q_len--; + + return event; +} + +/* + * This will not remove the event, that must be done with fsnotify_remove_notify_event() + */ +struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_holder *holder; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + event = holder->event; + + return event; +} + +/* + * Called when a group is being torn down to clean up any outstanding + * event notifications. 
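The overflow branch above caps the queue at group->max_events (16384 by default in this patch) and substitutes the shared q_overflow_event, which reaches userspace as IN_Q_OVERFLOW with wd == -1 since the overflow event carries no private data. An illustrative userspace handler:

#include <sys/inotify.h>
#include <stdio.h>

static void handle_event(const struct inotify_event *ev)
{
	if (ev->mask & IN_Q_OVERFLOW) {
		/* events were dropped: resynchronize by rescanning watched paths */
		fprintf(stderr, "inotify queue overflowed (wd=%d), rescanning\n",
			ev->wd);
		return;
	}
	/* ... normal per-wd handling ... */
}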
+ */ +void fsnotify_flush_notify(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_private_data *priv; + + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + event = fsnotify_remove_notify_event(group); + /* if they don't implement free_event_priv they better not have attached any */ + if (group->ops->free_event_priv) { + spin_lock(&event->lock); + priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + if (priv) + group->ops->free_event_priv(priv); + } + fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + } + mutex_unlock(&group->notification_mutex); +} + +static void initialize_event(struct fsnotify_event *event) +{ + INIT_LIST_HEAD(&event->holder.event_list); + atomic_set(&event->refcnt, 1); + + spin_lock_init(&event->lock); + + INIT_LIST_HEAD(&event->private_data_list); +} + +/* + * Caller damn well better be holding whatever mutex is protecting the + * old_holder->event_list and the new_event must be a clean event which + * cannot be found anywhere else in the kernel. + */ +int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, + struct fsnotify_event *new_event) +{ + struct fsnotify_event *old_event = old_holder->event; + struct fsnotify_event_holder *new_holder = &new_event->holder; + + enum event_spinlock_class { + SPINLOCK_OLD, + SPINLOCK_NEW, + }; + + pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); + + /* + * if the new_event's embedded holder is in use someone + * screwed up and didn't give us a clean new event. + */ + BUG_ON(!list_empty(&new_holder->event_list)); + + spin_lock_nested(&old_event->lock, SPINLOCK_OLD); + spin_lock_nested(&new_event->lock, SPINLOCK_NEW); + + new_holder->event = new_event; + list_replace_init(&old_holder->event_list, &new_holder->event_list); + + spin_unlock(&new_event->lock); + spin_unlock(&old_event->lock); + + /* event == holder means we are referenced through the in event holder */ + if (old_holder != &old_event->holder) + fsnotify_destroy_event_holder(old_holder); + + fsnotify_get_event(new_event); /* on the list take reference */ + fsnotify_put_event(old_event); /* off the list, drop reference */ + + return 0; +} + +struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) +{ + struct fsnotify_event *event; + + event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; + + pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); + + memcpy(event, old_event, sizeof(*event)); + initialize_event(event); + + if (event->name_len) { + event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); + if (!event->file_name) { + kmem_cache_free(fsnotify_event_cachep, event); + return NULL; + } + } + event->tgid = get_pid(old_event->tgid); + if (event->data_type == FSNOTIFY_EVENT_PATH) + path_get(&event->path); + + return event; +} + +/* + * fsnotify_create_event - Allocate a new event which will be sent to each + * group's handle_event function if the group was interested in this + * particular event. + * + * @to_tell the inode which is supposed to receive the event (sometimes a + * parent of the inode to which the event happened. + * @mask what actually happened. + * @data pointer to the object which was actually affected + * @data_type flag indication if the data is a file, path, inode, nothing... 
+ * @name the filename, if available + */ +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, + int data_type, const unsigned char *name, + u32 cookie, gfp_t gfp) +{ + struct fsnotify_event *event; + + event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); + if (!event) + return NULL; + + pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", + __func__, event, to_tell, mask, data, data_type); + + initialize_event(event); + + if (name) { + event->file_name = kstrdup(name, gfp); + if (!event->file_name) { + kmem_cache_free(fsnotify_event_cachep, event); + return NULL; + } + event->name_len = strlen(event->file_name); + } + + event->tgid = get_pid(task_tgid(current)); + event->sync_cookie = cookie; + event->to_tell = to_tell; + event->data_type = data_type; + + switch (data_type) { + case FSNOTIFY_EVENT_PATH: { + struct path *path = data; + event->path.dentry = path->dentry; + event->path.mnt = path->mnt; + path_get(&event->path); + break; + } + case FSNOTIFY_EVENT_INODE: + event->inode = data; + break; + case FSNOTIFY_EVENT_NONE: + event->inode = NULL; + event->path.dentry = NULL; + event->path.mnt = NULL; + break; + default: + BUG(); + } + + event->mask = mask; + + return event; +} + +__init int fsnotify_notification_init(void) +{ + fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); + fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); + + q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_KERNEL); + if (!q_overflow_event) + panic("unable to allocate fsnotify q_overflow_event\n"); + + return 0; +} +subsys_initcall(fsnotify_notification_init); + diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c new file mode 100644 index 00000000..e86577d6 --- /dev/null +++ b/fs/notify/vfsmount_mark.c @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + struct fsnotify_mark *mark, *lmark; + struct hlist_node *pos, *n; + LIST_HEAD(free_list); + + spin_lock(&mnt->mnt_root->d_lock); + hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) { + list_add(&mark->m.free_m_list, &free_list); + hlist_del_init_rcu(&mark->m.m_list); + fsnotify_get_mark(mark); + } + spin_unlock(&mnt->mnt_root->d_lock); + + list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + } +} + +void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); +} + +/* + * Recalculate the mask of events relevant to a given vfsmount locked. + */ +static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + __u32 new_mask = 0; + + assert_spin_locked(&mnt->mnt_root->d_lock); + + hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) + new_mask |= mark->mask; + mnt->mnt_fsnotify_mask = new_mask; +} + +/* + * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this mount point + */ +void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) +{ + spin_lock(&mnt->mnt_root->d_lock); + fsnotify_recalc_vfsmount_mask_locked(mnt); + spin_unlock(&mnt->mnt_root->d_lock); +} + +void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) +{ + struct vfsmount *mnt = mark->m.mnt; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&mark->group->mark_lock); + + spin_lock(&mnt->mnt_root->d_lock); + + hlist_del_init_rcu(&mark->m.m_list); + mark->m.mnt = NULL; + + fsnotify_recalc_vfsmount_mask_locked(mnt); + + spin_unlock(&mnt->mnt_root->d_lock); +} + +static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + + assert_spin_locked(&mnt->mnt_root->d_lock); + + hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + +/* + * given a group and vfsmount, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + + spin_lock(&mnt->mnt_root->d_lock); + mark = fsnotify_find_vfsmount_mark_locked(group, mnt); + spin_unlock(&mnt->mnt_root->d_lock); + + return mark; +} + +/* + * Attach an initialized mark to a given group and vfsmount. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which groups. + */ +int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct vfsmount *mnt, + int allow_dups) +{ + struct fsnotify_mark *lmark; + struct hlist_node *node, *last = NULL; + int ret = 0; + + mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&group->mark_lock); + + spin_lock(&mnt->mnt_root->d_lock); + + mark->m.mnt = mnt; + + /* is mark the first mark? 
*/ + if (hlist_empty(&mnt->mnt_fsnotify_marks)) { + hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks); + goto out; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) { + last = node; + + if ((lmark->group == group) && !allow_dups) { + ret = -EEXIST; + goto out; + } + + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) + continue; + + hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); + goto out; + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_after_rcu(last, &mark->m.m_list); +out: + fsnotify_recalc_vfsmount_mask_locked(mnt); + spin_unlock(&mnt->mnt_root->d_lock); + + return ret; +} diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig new file mode 100644 index 00000000..f5a868cc --- /dev/null +++ b/fs/ntfs/Kconfig @@ -0,0 +1,78 @@ +config NTFS_FS + tristate "NTFS file system support" + select NLS + help + NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. + + Saying Y or M here enables read support. There is partial, but + safe, write support available. For write support you must also + say Y to "NTFS write support" below. + + There are also a number of user-space tools available, called + ntfsprogs. These include ntfsundelete and ntfsresize, that work + without NTFS support enabled in the kernel. + + This is a rewrite from scratch of Linux NTFS support and replaced + the old NTFS code starting with Linux 2.5.11. A backport to + the Linux 2.4 kernel series is separately available as a patch + from the project web site. + + For more information see + and . + + To compile this file system support as a module, choose M here: the + module will be called ntfs. + + If you are not using Windows NT, 2000, XP or 2003 in addition to + Linux on your computer it is safe to say N. + +config NTFS_DEBUG + bool "NTFS debugging support" + depends on NTFS_FS + help + If you are experiencing any problems with the NTFS file system, say + Y here. This will result in additional consistency checks to be + performed by the driver as well as additional debugging messages to + be written to the system log. Note that debugging messages are + disabled by default. To enable them, supply the option debug_msgs=1 + at the kernel command line when booting the kernel or as an option + to insmod when loading the ntfs module. Once the driver is active, + you can enable debugging messages by doing (as root): + echo 1 > /proc/sys/fs/ntfs-debug + Replacing the "1" with "0" would disable debug messages. + + If you leave debugging messages disabled, this results in little + overhead, but enabling debug messages results in very significant + slowdown of the system. + + When reporting bugs, please try to have available a full dump of + debugging messages while the misbehaviour was occurring. + +config NTFS_RW + bool "NTFS write support" + depends on NTFS_FS + help + This enables the partial, but safe, write support in the NTFS driver. + + The only supported operation is overwriting existing files, without + changing the file length. No file or directory creation, deletion or + renaming is possible. Note only non-resident files can be written to + so you may find that some very small files (<500 bytes or so) cannot + be written to. 
+ + While we cannot guarantee that it will not damage any data, we have + so far not received a single report where the driver would have + damaged someones data so we assume it is perfectly safe to use. + + Note: While write support is safe in this version (a rewrite from + scratch of the NTFS support), it should be noted that the old NTFS + write support, included in Linux 2.5.10 and before (since 1997), + is not safe. + + This is currently useful with TopologiLinux. TopologiLinux is run + on top of any DOS/Microsoft Windows system without partitioning your + hard disk. Unlike other Linux distributions TopologiLinux does not + need its own partition. For more information see + + + It is perfectly safe to say N here. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile new file mode 100644 index 00000000..30206b23 --- /dev/null +++ b/fs/ntfs/Makefile @@ -0,0 +1,14 @@ +# Rules for making the NTFS driver. + +obj-$(CONFIG_NTFS_FS) += ntfs.o + +ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ + index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ + unistr.o upcase.o + +ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o + +ccflags-y := -DNTFS_VERSION=\"2.1.30\" +ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG +ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW + diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c new file mode 100644 index 00000000..0b1e885b --- /dev/null +++ b/fs/ntfs/aops.c @@ -0,0 +1,1638 @@ +/** + * aops.c - NTFS kernel address space operations and page cache handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "debug.h" +#include "inode.h" +#include "mft.h" +#include "runlist.h" +#include "types.h" +#include "ntfs.h" + +/** + * ntfs_end_buffer_async_read - async io completion for reading attributes + * @bh: buffer head on which io is completed + * @uptodate: whether @bh is now uptodate or not + * + * Asynchronous I/O completion handler for reading pages belonging to the + * attribute address space of an inode. The inodes can either be files or + * directories or they can be fake inodes describing some attribute. + * + * If NInoMstProtected(), perform the post read mst fixups when all IO on the + * page has been completed and mark the page uptodate or set the error bit on + * the page. 
To determine the size of the records that need fixing up, we + * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs + * record size, and index_block_size_bits, to the log(base 2) of the ntfs + * record size. + */ +static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first, *tmp; + struct page *page; + struct inode *vi; + ntfs_inode *ni; + int page_uptodate = 1; + + page = bh->b_page; + vi = page->mapping->host; + ni = NTFS_I(vi); + + if (likely(uptodate)) { + loff_t i_size; + s64 file_ofs, init_size; + + set_buffer_uptodate(bh); + + file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); + read_lock_irqsave(&ni->size_lock, flags); + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + /* Check for the current buffer head overflowing. */ + if (unlikely(file_ofs + bh->b_size > init_size)) { + int ofs; + void *kaddr; + + ofs = 0; + if (file_ofs < init_size) + ofs = init_size - file_ofs; + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); + local_irq_restore(flags); + } + } else { + clear_buffer_uptodate(bh); + SetPageError(page); + ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + if (likely(buffer_locked(tmp))) + goto still_busy; + /* Async buffers must be locked. */ + BUG(); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but we first have to perform the post read mst fixups, if the + * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. + * Note we ignore fixup errors as those are detected when + * map_mft_record() is called which gives us per record granularity + * rather than per page granularity. + */ + if (!NInoMstProtected(ni)) { + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } else { + u8 *kaddr; + unsigned int i, recs; + u32 rec_size; + + rec_size = ni->itype.index.block_size; + recs = PAGE_CACHE_SIZE / rec_size; + /* Should have been verified before we got here... */ + BUG_ON(!recs); + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + for (i = 0; i < recs; i++) + post_read_mst_fixup((NTFS_RECORD*)(kaddr + + i * rec_size), rec_size); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); + local_irq_restore(flags); + flush_dcache_page(page); + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } + unlock_page(page); + return; +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} + +/** + * ntfs_read_block - fill a @page of an address space with data + * @page: page cache page to fill with data + * + * Fill the page @page of the address space belonging to the @page->host inode. 
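+ * The page is split into buffers of device block size and each buffer is looked up in the runlist of the inode to find its location on disk.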
+ * We read each buffer asynchronously and when all buffers are read in, our io + * completion handler ntfs_end_buffer_read_async(), if required, automatically + * applies the mst fixups to the page before finally marking it uptodate and + * unlocking it. + * + * We only enforce allocated_size limit because i_size is checked for in + * generic_file_read(). + * + * Return 0 on success and -errno on error. + * + * Contains an adapted version of fs/buffer.c::block_read_full_page(). + */ +static int ntfs_read_block(struct page *page) +{ + loff_t i_size; + VCN vcn; + LCN lcn; + s64 init_size; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + sector_t iblock, lblock, zblock; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int i, nr; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + /* $MFT/$DATA must have its complete runlist in memory at all times. */ + BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); + + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) { + unlock_page(page); + return -ENOMEM; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* + * We may be racing with truncate. To avoid some of the problems we + * now take a snapshot of the various sizes and use those for the whole + * of the function. In case of an extending truncate it just means we + * may leave some buffers unmapped which are now allocated. This is + * not a problem since these buffers will just get mapped when a write + * occurs. In case of a shrinking truncate, we will detect this later + * on due to the runlist being incomplete and if the page is being + * fully truncated, truncate will throw it away as soon as we unlock + * it so no need to worry what we do with it. + */ + iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); + lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + zblock = (init_size + blocksize - 1) >> blocksize_bits; + + /* Loop through all the buffers in the page. */ + rl = NULL; + nr = i = 0; + do { + int err = 0; + + if (unlikely(buffer_uptodate(bh))) + continue; + if (unlikely(buffer_mapped(bh))) { + arr[nr++] = bh; + continue; + } + bh->b_bdev = vol->sb->s_bdev; + /* Is the block within the allowed limits? */ + if (iblock < lblock) { + bool is_retry = false; + + /* Convert iblock into corresponding vcn and offset. */ + vcn = (VCN)iblock << blocksize_bits >> + vol->cluster_size_bits; + vcn_ofs = ((VCN)iblock << blocksize_bits) & + vol->cluster_size_mask; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + /* Only read initialized data blocks. 
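Blocks beyond the initialized size must read as zeroes, so they are zero-filled in memory (at handle_zblock below) instead of being read from disk.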
*/ + if (iblock < zblock) { + arr[nr++] = bh; + continue; + } + /* Fully non-initialized data block, zero it. */ + goto handle_zblock; + } + /* It is a hole, need to zero it. */ + if (lcn == LCN_HOLE) + goto handle_hole; + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, treat it as a + * hole. This can happen due to concurrent truncate + * for example. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + err = 0; + goto handle_hole; + } + /* Hard error, zero out region. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + SetPageError(page); + ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "offset 0x%x because its location on " + "disk could not be determined%s " + "(error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + } + /* + * Either iblock was outside lblock limits or + * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion + * of the page and set the buffer uptodate. + */ +handle_hole: + bh->b_blocknr = -1UL; + clear_buffer_mapped(bh); +handle_zblock: + zero_user(page, i * blocksize, blocksize); + if (likely(!err)) + set_buffer_uptodate(bh); + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Check we have at least one buffer ready for i/o. */ + if (nr) { + struct buffer_head *tbh; + + /* Lock the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + lock_buffer(tbh); + tbh->b_end_io = ntfs_end_buffer_async_read; + set_buffer_async_read(tbh); + } + /* Finally, start i/o on the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + if (likely(!buffer_uptodate(tbh))) + submit_bh(READ, tbh); + else + ntfs_end_buffer_async_read(tbh, 1); + } + return 0; + } + /* No i/o was scheduled on any of the buffers. */ + if (likely(!PageError(page))) + SetPageUptodate(page); + else /* Signal synchronous i/o error. */ + nr = -EIO; + unlock_page(page); + return nr; +} + +/** + * ntfs_readpage - fill a @page of a @file with data from the device + * @file: open file to which the page @page belongs or NULL + * @page: page cache page to fill with data + * + * For non-resident attributes, ntfs_readpage() fills the @page of the open + * file @file by calling the ntfs version of the generic block_read_full_page() + * function, ntfs_read_block(), which in turn creates and reads in the buffers + * associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the + * data from the mft record (which at this stage is most likely in memory) and + * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as + * even if the mft record is not cached at this point in time, we need to wait + * for it to be read in before we can do the copy. + * + * Return 0 on success and -errno on error. 
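+ * + * Note that access to encrypted attributes is denied with -EACCES and that compressed, non-resident data streams are handed off to ntfs_read_compressed_block().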
+ */ +static int ntfs_readpage(struct file *file, struct page *page) +{ + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + u8 *addr; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + unsigned long flags; + u32 attr_len; + int err = 0; + +retry_readpage: + BUG_ON(!PageLocked(page)); + vi = page->mapping->host; + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + ntfs_debug("Read outside i_size - truncated?"); + goto done; + } + /* + * This can potentially happen because we clear PageUptodate() during + * ntfs_writepage() of MstProtected() attributes. + */ + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + ni = NTFS_I(vi); + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If attribute is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + BUG_ON(ni->type != AT_DATA); + err = -EACCES; + goto err_out; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + return ntfs_read_compressed_block(page); + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* Normal, non-resident data stream. */ + return ntfs_read_block(page); + } + /* + * Attribute is resident, implying it is not compressed or encrypted. + * This also means the attribute is smaller than an mft record and + * hence smaller than a page, so can simply zero out any pages with + * index above 0. Note the attribute can actually be marked compressed + * but if it is resident the actual data is not compressed so we are + * ok to ignore the compressed flag here. + */ + if (unlikely(page->index > 0)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + goto done; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + mrec = map_mft_record(base_ni); + if (IS_ERR(mrec)) { + err = PTR_ERR(mrec); + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the readpage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_readpage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, mrec); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto put_unm_err_out; + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + read_lock_irqsave(&ni->size_lock, flags); + if (unlikely(attr_len > ni->initialized_size)) + attr_len = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate. */ + attr_len = i_size; + } + addr = kmap_atomic(page, KM_USER0); + /* Copy the data to the page. */ + memcpy(addr, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + attr_len); + /* Zero the remainder of the page. 
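The attribute value is smaller than a page, so the area past attr_len has to be cleared explicitly.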
*/ + memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + flush_dcache_page(page); + kunmap_atomic(addr, KM_USER0); +put_unm_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(base_ni); +done: + SetPageUptodate(page); +err_out: + unlock_page(page); + return err; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, non-mst + * protected attributes to their backing store. + * + * For a page with buffers, map and write the dirty buffers asynchronously + * under page writeback. For a page without buffers, create buffers for the + * page, then proceed as above. + * + * If a page doesn't have buffers the page dirty state is definitive. If a page + * does have buffers, the page dirty state is just a hint, and the buffer dirty + * state is definitive. (A hint which has rules: dirty buffers against a clean + * page is illegal. Other combinations are legal and need to be handled. In + * particular a dirty page containing clean buffers for example.) + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_read_block() and __block_write_full_page(). + */ +static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +{ + VCN vcn; + LCN lcn; + s64 initialized_size; + loff_t i_size; + sector_t block, dblock, iblock; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int err; + bool need_end_writeback; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", ni->mft_no, ni->type, page->index); + + BUG_ON(!NInoNonResident(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + if (!page_has_buffers(page)) { + BUG_ON(!PageUptodate(page)); + create_empty_buffers(page, blocksize, + (1 << BH_Uptodate) | (1 << BH_Dirty)); + if (unlikely(!page_has_buffers(page))) { + ntfs_warning(vol->sb, "Error allocating page " + "buffers. Redirtying page so we try " + "again later."); + /* + * Put the page back on mapping->dirty_pages, but leave + * its buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* NOTE: Different naming scheme to ntfs_read_block()! */ + + /* The first block in the page. */ + block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + + /* The first out of bounds block for the data size. */ + dblock = (i_size + blocksize - 1) >> blocksize_bits; + + /* The last (fully or partially) initialized block. */ + iblock = initialized_size >> blocksize_bits; + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. 
+ */ + + /* + * Loop through all the buffers in the page, mapping all the dirty + * buffers to disk addresses and handling any aliases from the + * underlying block device's mapping. + */ + rl = NULL; + err = 0; + do { + bool is_retry = false; + + if (unlikely(block >= dblock)) { + /* + * Mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. The contents of such buffers + * were zeroed by ntfs_writepage(). + * + * FIXME: What about the small race window where + * ntfs_writepage() has not done any clearing because + * the page was within i_size but before we get here, + * vmtruncate() modifies i_size? + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + + /* Clean buffers are not written out, so no need to map them. */ + if (!buffer_dirty(bh)) + continue; + + /* Make sure we have enough initialized size. */ + if (unlikely((block >= iblock) && + (initialized_size < i_size))) { + /* + * If this page is fully outside initialized size, zero + * out all pages between the current initialized size + * and the current page. Just use ntfs_readpage() to do + * the zeroing transparently. + */ + if (block > iblock) { + // TODO: + // For each page do: + // - read_cache_page() + // Again for each page do: + // - wait_on_page_locked() + // - Check (PageUptodate(page) && + // !PageError(page)) + // Update initialized size in the attribute and + // in the inode. + // Again, for each page do: + // __set_page_dirty_buffers(); + // page_cache_release() + // We don't need to wait on the writes. + // Update iblock. + } + /* + * The current page straddles initialized size. Zero + * all non-uptodate buffers and set them uptodate (and + * dirty?). Note, there aren't any non-uptodate buffers + * if the page is uptodate. + * FIXME: For an uptodate page, the buffers may need to + * be written out because they were not initialized on + * disk before. + */ + if (!PageUptodate(page)) { + // TODO: + // Zero any non-uptodate buffers up to i_size. + // Set them uptodate and dirty. + } + // TODO: + // Update initialized size in the attribute and in the + // inode (up to i_size). + // Update iblock. + // FIXME: This is inefficient. Try to batch the two + // size changes to happen in one go. + ntfs_error(vol->sb, "Writing beyond initialized size " + "is not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + // Do NOT set_buffer_new() BUT DO clear buffer range + // outside write request range. + // set_buffer_uptodate() on complete buffers as well as + // set_buffer_dirty(). + } + + /* No need to map buffers that are already mapped. */ + if (buffer_mapped(bh)) + continue; + + /* Unmapped, dirty buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + + /* Convert block into corresponding vcn and offset. */ + vcn = (VCN)block << blocksize_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to point to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + continue; + } + /* It is a hole, need to instantiate it. 
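Unless the buffer contains nothing but zeroes, in which case it can stay sparse and does not need to be written at all.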
*/ + if (lcn == LCN_HOLE) { + u8 *kaddr; + unsigned long *bpos, *bend; + + /* Check if the buffer is zero. */ + kaddr = kmap_atomic(page, KM_USER0); + bpos = (unsigned long *)(kaddr + bh_offset(bh)); + bend = (unsigned long *)((u8*)bpos + blocksize); + do { + if (unlikely(*bpos)) + break; + } while (likely(++bpos < bend)); + kunmap_atomic(kaddr, KM_USER0); + if (bpos == bend) { + /* + * Buffer is zero and sparse, no need to write + * it. + */ + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + continue; + } + // TODO: Instantiate the hole. + // clear_buffer_new(bh); + // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + ntfs_error(vol->sb, "Writing into sparse regions is " + "not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + } + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, truncate has cut it out + * of the runlist. Just clean and clear the buffer and set it + * uptodate so it can get discarded by the VM. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + err = 0; + continue; + } + /* Failed to map the buffer, even after retrying. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, offset 0x%x " + "because its location on disk could not be " + "determined%s (error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + break; + } while (block++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* For the error case, need to reset bh to the beginning. */ + bh = head; + + /* Just an optimization, so ->readpage() is not called later. */ + if (unlikely(!PageUptodate(page))) { + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + bh = head; + break; + } + } while ((bh = bh->b_this_page) != head); + if (uptodate) + SetPageUptodate(page); + } + + /* Setup all mapped, dirty buffers for async write i/o. */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + BUG_ON(!buffer_uptodate(bh)); + mark_buffer_async_write(bh); + } else + unlock_buffer(bh); + } else if (unlikely(err)) { + /* + * For the error case. The buffer may have been set + * dirty during attachment to a dirty page. + */ + if (err != -ENOMEM) + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (unlikely(err)) { + // TODO: Remove the -EOPNOTSUPP check later on... + if (unlikely(err == -EOPNOTSUPP)) + err = 0; + else if (err == -ENOMEM) { + ntfs_warning(vol->sb, "Error allocating memory. " + "Redirtying page so we try again " + "later."); + /* + * Put the page back on mapping->dirty_pages, but + * leave its buffer's dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else + SetPageError(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + + /* Submit the prepared buffers for i/o. 
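If none of them end up being submitted for async write, writeback is ended by hand further down.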
*/ + need_end_writeback = true; + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + need_end_writeback = false; + } + bh = next; + } while (bh != head); + unlock_page(page); + + /* If no i/o was started, need to end_page_writeback(). */ + if (unlikely(need_end_writeback)) + end_page_writeback(page); + + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_write_mst_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, mst protected + * attributes to their backing store. The only supported attributes are index + * allocation and $MFT/$DATA. Both directory inodes and index inodes are + * supported for the index allocation case. + * + * The page must remain locked for the duration of the write because we apply + * the mst fixups, write, and then undo the fixups, so if we were to unlock the + * page before undoing the fixups, any other user of the page will see the + * page contents as corrupt. + * + * We clear the page uptodate flag for the duration of the function to ensure + * exclusion for the $MFT/$DATA case against someone mapping an mft record we + * are about to apply the mst fixups to. + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_write_block(), ntfs_mft_writepage(), and + * write_mft_record_nolock(). + */ +static int ntfs_write_mst_block(struct page *page, + struct writeback_control *wbc) +{ + sector_t block, dblock, rec_block; + struct inode *vi = page->mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + u8 *kaddr; + unsigned int rec_size = ni->itype.index.block_size; + ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size]; + struct buffer_head *bh, *head, *tbh, *rec_start_bh; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + runlist_element *rl; + int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; + unsigned bh_size, rec_size_bits; + bool sync, is_mft, page_is_dirty, rec_is_dirty; + unsigned char bh_size_bits; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", vi->i_ino, ni->type, page->index); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(!NInoMstProtected(ni)); + is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); + /* + * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page + * in its page cache were to be marked dirty. However this should + * never happen with the current driver and considering we do not + * handle this case here we do want to BUG(), at least for now. + */ + BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || + (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; + max_bhs = PAGE_CACHE_SIZE / bh_size; + BUG_ON(!max_bhs); + BUG_ON(max_bhs > MAX_BUF_PER_PAGE); + + /* Were we called for sync purposes? */ + sync = (wbc->sync_mode == WB_SYNC_ALL); + + /* Make sure we have mapped buffers. */ + bh = head = page_buffers(page); + BUG_ON(!bh); + + rec_size_bits = ni->itype.index.block_size_bits; + BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits)); + bhs_per_rec = rec_size >> bh_size_bits; + BUG_ON(!bhs_per_rec); + + /* The first block in the page. */ + rec_block = block = (sector_t)page->index << + (PAGE_CACHE_SHIFT - bh_size_bits); + + /* The first out of bounds block for the data size. 
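Blocks from this one onwards lie beyond i_size and are only cleaned, never written out.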
*/ + dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; + + rl = NULL; + err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; + page_is_dirty = rec_is_dirty = false; + rec_start_bh = NULL; + do { + bool is_retry = false; + + if (likely(block < rec_block)) { + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + /* + * This block is not the first one in the record. We + * ignore the buffer's dirty state because we could + * have raced with a parallel mark_ntfs_record_dirty(). + */ + if (!rec_is_dirty) + continue; + if (unlikely(err2)) { + if (err2 != -ENOMEM) + clear_buffer_dirty(bh); + continue; + } + } else /* if (block == rec_block) */ { + BUG_ON(block > rec_block); + /* This block is the first one in the record. */ + rec_block += bhs_per_rec; + err2 = 0; + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + continue; + } + if (!buffer_dirty(bh)) { + /* Clean records are not written out. */ + rec_is_dirty = false; + continue; + } + rec_is_dirty = true; + rec_start_bh = bh; + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = (VCN)block << bh_size_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> bh_size_bits; + set_buffer_mapped(bh); + } else { + /* + * Remap failed. Retry to map the runlist once + * unless we are working on $MFT which always + * has the whole of its runlist in memory. + */ + if (!is_mft && !is_retry && + lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping + * lock for the duration. + */ + up_read(&ni->runlist.lock); + err2 = ntfs_map_runlist(ni, vcn); + if (likely(!err2)) + goto lock_retry_remap; + if (err2 == -ENOMEM) + page_is_dirty = true; + lcn = err2; + } else { + err2 = -EIO; + if (!rl) + up_read(&ni->runlist.lock); + } + /* Hard error. Abort writing this record. */ + if (!err || err == -ENOMEM) + err = err2; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write ntfs record " + "0x%llx (inode 0x%lx, " + "attribute type 0x%x) because " + "its location on disk could " + "not be determined (error " + "code %lli).", + (long long)block << + bh_size_bits >> + vol->mft_record_size_bits, + ni->mft_no, ni->type, + (long long)lcn); + /* + * If this is not the first buffer, remove the + * buffers in this record from the list of + * buffers to write and clear their dirty bit + * if not error -ENOMEM. + */ + if (rec_start_bh != bh) { + while (bhs[--nr_bhs] != rec_start_bh) + ; + if (err2 != -ENOMEM) { + do { + clear_buffer_dirty( + rec_start_bh); + } while ((rec_start_bh = + rec_start_bh-> + b_this_page) != + bh); + } + } + continue; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + } while (block++, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&ni->runlist.lock); + /* If there were no dirty buffers, we are done. 
*/ + if (!nr_bhs) + goto done; + /* Map the page so we can access its contents. */ + kaddr = kmap(page); + /* Clear the page uptodate flag whilst the mst fixups are applied. */ + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + for (i = 0; i < nr_bhs; i++) { + unsigned int ofs; + + /* Skip buffers which are not at the beginning of records. */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + ofs = bh_offset(tbh); + if (is_mft) { + ntfs_inode *tni; + unsigned long mft_no; + + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + >> rec_size_bits; + /* Check whether to write this mft record. */ + tni = NULL; + if (!ntfs_may_write_mft_record(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), &tni)) { + /* + * The record should not be written. This + * means we need to redirty the page before + * returning. + */ + page_is_dirty = true; + /* + * Remove the buffers in this mft record from + * the list of buffers to write. + */ + do { + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + /* + * The record should be written. If a locked ntfs + * inode was returned, add it to the array of locked + * ntfs inodes. + */ + if (tni) + locked_nis[nr_locked_nis++] = tni; + } + /* Apply the mst protection fixups. */ + err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), + rec_size); + if (unlikely(err2)) { + if (!err || err == -ENOMEM) + err = -EIO; + ntfs_error(vol->sb, "Failed to apply mst fixups " + "(inode 0x%lx, attribute type 0x%x, " + "page index 0x%lx, page offset 0x%x)!" + " Unmount and run chkdsk.", vi->i_ino, + ni->type, page->index, ofs); + /* + * Mark all the buffers in this record clean as we do + * not want to write corrupt data to disk. + */ + do { + clear_buffer_dirty(bhs[i]); + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + nr_recs++; + } + /* If no records are to be written out, we are done. */ + if (!nr_recs) + goto unm_done; + flush_dcache_page(page); + /* Lock buffers and start synchronous write i/o on them. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + if (!trylock_buffer(tbh)) + BUG(); + /* The buffer dirty state is now irrelevant, just clean it. */ + clear_buffer_dirty(tbh); + BUG_ON(!buffer_uptodate(tbh)); + BUG_ON(!buffer_mapped(tbh)); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (is_mft && !sync) + goto do_mirror; +do_wait: + /* Wait on i/o completion of buffers. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_error(vol->sb, "I/O error while writing ntfs " + "record buffer (inode 0x%lx, " + "attribute type 0x%x, page index " + "0x%lx, page offset 0x%lx)! Unmount " + "and run chkdsk.", vi->i_ino, ni->type, + page->index, bh_offset(tbh)); + if (!err || err == -ENOMEM) + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (is_mft && sync) { +do_mirror: + for (i = 0; i < nr_bhs; i++) { + unsigned long mft_no; + unsigned int ofs; + + /* + * Skip buffers which are not at the beginning of + * records. + */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + /* Skip removed buffers (and hence records). */ + if (!tbh) + continue; + ofs = bh_offset(tbh); + /* Get the mft record number. 
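It is the byte offset of the record within the attribute shifted right by the record size bits; only records that fall inside the mft mirror are synced to it.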
*/ + mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + >> rec_size_bits; + if (mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), + sync); + } + if (!sync) + goto do_wait; + } + /* Remove the mst protection fixups again. */ + for (i = 0; i < nr_bhs; i++) { + if (!(i % bhs_per_rec)) { + tbh = bhs[i]; + if (!tbh) + continue; + post_write_mst_fixup((NTFS_RECORD*)(kaddr + + bh_offset(tbh))); + } + } + flush_dcache_page(page); +unm_done: + /* Unlock any locked inodes. */ + while (nr_locked_nis-- > 0) { + ntfs_inode *tni, *base_tni; + + tni = locked_nis[nr_locked_nis]; + /* Get the base inode. */ + mutex_lock(&tni->extent_lock); + if (tni->nr_extents >= 0) + base_tni = tni; + else { + base_tni = tni->ext.base_ntfs_ino; + BUG_ON(!base_tni); + } + mutex_unlock(&tni->extent_lock); + ntfs_debug("Unlocking %s inode 0x%lx.", + tni == base_tni ? "base" : "extent", + tni->mft_no); + mutex_unlock(&tni->mrec_lock); + atomic_dec(&tni->count); + iput(VFS_I(base_tni)); + } + SetPageUptodate(page); + kunmap(page); +done: + if (unlikely(err && err != -ENOMEM)) { + /* + * Set page error if there is only one ntfs record in the page. + * Otherwise we would loose per-record granularity. + */ + if (ni->itype.index.block_size == PAGE_CACHE_SIZE) + SetPageError(page); + NVolSetErrors(vol); + } + if (page_is_dirty) { + ntfs_debug("Page still contains one or more dirty ntfs " + "records. Redirtying the page starting at " + "record 0x%lx.", page->index << + (PAGE_CACHE_SHIFT - rec_size_bits)); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } else { + /* + * Keep the VM happy. This must be done otherwise the + * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though + * the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + } + if (likely(!err)) + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_writepage - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This is called from the VM when it wants to have a dirty ntfs page cache + * page cleaned. The VM has already locked the page and marked it clean. + * + * For non-resident attributes, ntfs_writepage() writes the @page by calling + * the ntfs version of the generic block_write_full_page() function, + * ntfs_write_block(), which in turn if necessary creates and writes the + * buffers associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying + * the data to the mft record (which at this stage is most likely in memory). + * The mft record is then marked dirty and written out asynchronously via the + * vfs inode dirty code path for the inode the mft record belongs to or via the + * vm page dirty code path for the page the mft record is in. + * + * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + loff_t i_size; + struct inode *vi = page->mapping->host; + ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); + char *addr; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + u32 attr_len; + int err; + +retry_writepage: + BUG_ON(!PageLocked(page)); + i_size = i_size_read(vi); + /* Is the page fully outside i_size? 
(truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { + /* + * The page may have dirty, unmapped buffers. Make them + * freeable here, so the page does not leak. + */ + block_invalidatepage(page, 0); + unlock_page(page); + ntfs_debug("Write outside i_size - truncated?"); + return 0; + } + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + unlock_page(page); + BUG_ON(ni->type != AT_DATA); + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + // TODO: Implement and replace this with + // return ntfs_write_compressed_block(page); + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not supported yet. Sorry."); + return -EOPNOTSUPP; + } + // TODO: Implement and remove this check. + if (NInoNonResident(ni) && NInoSparse(ni)) { + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to sparse files is not " + "supported yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* We have to zero every time due to mmap-at-end-of-file. */ + if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) { + /* The page straddles i_size. */ + unsigned int ofs = i_size & ~PAGE_CACHE_MASK; + zero_user_segment(page, ofs, PAGE_CACHE_SIZE); + } + /* Handle mst protected attributes. */ + if (NInoMstProtected(ni)) + return ntfs_write_mst_block(page, wbc); + /* Normal, non-resident data stream. */ + return ntfs_write_block(page, wbc); + } + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * mst protected. This also means the attribute is smaller than an mft + * record and hence smaller than a page, so can simply return error on + * any pages with index above 0. Note the attribute can actually be + * marked compressed but if it is resident the actual data is not + * compressed so we are ok to ignore the compressed flag here. + */ + BUG_ON(page_has_buffers(page)); + BUG_ON(!PageUptodate(page)); + if (unlikely(page->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " + "Aborting write.", page->index); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + return -EIO; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the writepage. 
+ */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_writepage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto err_out; + /* + * Keep the VM happy. This must be done otherwise the radix-tree tag + * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + i_size = i_size_read(vi); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate or a failed truncate. */ + attr_len = i_size; + /* + * If the truncate failed, fix it up now. If a concurrent + * truncate, we do its job, so it does not have to do anything. + */ + err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, + attr_len); + /* Shrinking cannot fail. */ + BUG_ON(err); + } + addr = kmap_atomic(page, KM_USER0); + /* Copy the data from the page to the mft record. */ + memcpy((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + addr, attr_len); + /* Zero out of bounds area in the page cache page. */ + memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + kunmap_atomic(addr, KM_USER0); + flush_dcache_page(page); + flush_dcache_mft_record_page(ctx->ntfs_ino); + /* We are done with the page. */ + end_page_writeback(page); + /* Finally, mark the mft record dirty, so it gets written back. */ + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " + "page so we try again later."); + /* + * Put the page back on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else { + ntfs_error(vi->i_sb, "Resident attribute write failed with " + "error %i.", err); + SetPageError(page); + NVolSetErrors(ni->vol); + } + unlock_page(page); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +#endif /* NTFS_RW */ + +/** + * ntfs_aops - general address space operations for inodes and attributes + */ +const struct address_space_operations ntfs_aops = { + .readpage = ntfs_readpage, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. */ +#endif /* NTFS_RW */ + .migratepage = buffer_migrate_page, /* Move a page cache page from + one physical page to an + other. */ + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_mst_aops - general address space operations for mst protecteed inodes + * and attributes + */ +const struct address_space_operations ntfs_mst_aops = { + .readpage = ntfs_readpage, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. */ + .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty + without touching the buffers + belonging to the page. */ +#endif /* NTFS_RW */ + .migratepage = buffer_migrate_page, /* Move a page cache page from + one physical page to an + other. 
*/ + .error_remove_page = generic_error_remove_page, +}; + +#ifdef NTFS_RW + +/** + * mark_ntfs_record_dirty - mark an ntfs record dirty + * @page: page containing the ntfs record to mark dirty + * @ofs: byte offset within @page at which the ntfs record begins + * + * Set the buffers and the page in which the ntfs record is located dirty. + * + * The latter also marks the vfs inode the ntfs record belongs to dirty + * (I_DIRTY_PAGES only). + * + * If the page does not have buffers, we create them and set them uptodate. + * The page may not be locked which is why we need to handle the buffers under + * the mapping->private_lock. Once the buffers are marked dirty we no longer + * need the lock since try_to_free_buffers() does not free dirty buffers. + */ +void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + struct buffer_head *bh, *head, *buffers_to_free = NULL; + unsigned int end, bh_size, bh_ofs; + + BUG_ON(!PageUptodate(page)); + end = ofs + ni->itype.index.block_size; + bh_size = VFS_I(ni)->i_sb->s_blocksize; + spin_lock(&mapping->private_lock); + if (unlikely(!page_has_buffers(page))) { + spin_unlock(&mapping->private_lock); + bh = head = alloc_page_buffers(page, bh_size, 1); + spin_lock(&mapping->private_lock); + if (likely(!page_has_buffers(page))) { + struct buffer_head *tail; + + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); + } else + buffers_to_free = bh; + } + bh = head = page_buffers(page); + BUG_ON(!bh); + do { + bh_ofs = bh_offset(bh); + if (bh_ofs + bh_size <= ofs) + continue; + if (unlikely(bh_ofs >= end)) + break; + set_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&mapping->private_lock); + __set_page_dirty_nobuffers(page); + if (unlikely(buffers_to_free)) { + do { + bh = buffers_to_free->b_this_page; + free_buffer_head(buffers_to_free); + buffers_to_free = bh; + } while (buffers_to_free); + } +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h new file mode 100644 index 00000000..caecc58f --- /dev/null +++ b/fs/ntfs/aops.h @@ -0,0 +1,107 @@ +/** + * aops.h - Defines for NTFS kernel address space operations and page cache + * handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_AOPS_H +#define _LINUX_NTFS_AOPS_H + +#include +#include +#include +#include + +#include "inode.h" + +/** + * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() + * @page: the page to release + * + * Unpin, unmap and release a page that was obtained from ntfs_map_page(). + */ +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/** + * ntfs_map_page - map a page into accessible memory, reading it if necessary + * @mapping: address space for which to obtain the page + * @index: index into the page cache for @mapping of the page to map + * + * Read a page from the page cache of the address space @mapping at position + * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes. + * + * If the page is not in memory it is loaded from disk first using the readpage + * method defined in the address space operations of @mapping and the page is + * added to the page cache of @mapping in the process. + * + * If the page belongs to an mst protected attribute and it is marked as such + * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no + * error checking is performed. This means the caller has to verify whether + * the ntfs record(s) contained in the page are valid or not using one of the + * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are + * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) + * + * If the page is in high memory it is mapped into memory directly addressible + * by the kernel. + * + * Finally the page count is incremented, thus pinning the page into place. + * + * The above means that page_address(page) can be used on all pages obtained + * with ntfs_map_page() to get the kernel virtual address of the page. + * + * When finished with the page, the caller has to call ntfs_unmap_page() to + * unpin, unmap and release the page. + * + * Note this does not grant exclusive access. If such is desired, the caller + * must provide it independently of the ntfs_{un}map_page() calls by using + * a {rw_}semaphore or other means of serialization. A spin lock cannot be + * used as ntfs_map_page() can block. + * + * The unlocked and uptodate page is returned on success or an encoded error + * on failure. Caller has to test for error using the IS_ERR() macro on the + * return value. If that evaluates to 'true', the negative error code can be + * obtained using PTR_ERR() on the return value of ntfs_map_page(). + */ +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageError(page)) + return page; + ntfs_unmap_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +#ifdef NTFS_RW + +extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c new file mode 100644 index 00000000..f14fde2b --- /dev/null +++ b/fs/ntfs/attrib.c @@ -0,0 +1,2615 @@ +/** + * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "layout.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" +#include "types.h" + +/** + * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * @ctx: active attribute search context if present or NULL if not + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_map_runlist_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Note the runlist can be NULL after this function returns if @vcn is zero and + * the attribute has zero allocated size, i.e. there simply is no runlist. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist will be modified. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. 
+ * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) +{ + VCN end_vcn; + unsigned long flags; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + runlist_element *rl; + struct page *put_this_page = NULL; + int err = 0; + bool ctx_is_temporary, ctx_needs_reset; + ntfs_attr_search_ctx old_ctx = { NULL, }; + + ntfs_debug("Mapping runlist part containing vcn 0x%llx.", + (unsigned long long)vcn); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + if (!ctx) { + ctx_is_temporary = ctx_needs_reset = true; + m = map_mft_record(base_ni); + if (IS_ERR(m)) + return PTR_ERR(m); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + } else { + VCN allocated_size_vcn; + + BUG_ON(IS_ERR(ctx->mrec)); + a = ctx->attr; + BUG_ON(!a->non_resident); + ctx_is_temporary = false; + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + read_lock_irqsave(&ni->size_lock, flags); + allocated_size_vcn = ni->allocated_size >> + ni->vol->cluster_size_bits; + read_unlock_irqrestore(&ni->size_lock, flags); + if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) + end_vcn = allocated_size_vcn - 1; + /* + * If we already have the attribute extent containing @vcn in + * @ctx, no need to look it up again. We slightly cheat in + * that if vcn exceeds the allocated size, we will refuse to + * map the runlist below, so there is definitely no need to get + * the right attribute extent. + */ + if (vcn >= allocated_size_vcn || (a->type == ni->type && + a->name_length == ni->name_len && + !memcmp((u8*)a + le16_to_cpu(a->name_offset), + ni->name, ni->name_len) && + sle64_to_cpu(a->data.non_resident.lowest_vcn) + <= vcn && end_vcn >= vcn)) + ctx_needs_reset = false; + else { + /* Save the old search context. */ + old_ctx = *ctx; + /* + * If the currently mapped (extent) inode is not the + * base inode we will unmap it when we reinitialize the + * search context which means we need to get a + * reference to the page containing the mapped mft + * record so we do not accidentally drop changes to the + * mft record when it has not been marked dirty yet. + */ + if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { + put_this_page = old_ctx.ntfs_ino->page; + page_cache_get(put_this_page); + } + /* + * Reinitialize the search context so we can lookup the + * needed attribute extent. + */ + ntfs_attr_reinit_search_ctx(ctx); + ctx_needs_reset = true; + } + } + if (ctx_needs_reset) { + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + BUG_ON(!ctx->attr->non_resident); + } + a = ctx->attr; + /* + * Only decompress the mapping pairs if @vcn is inside it. Otherwise + * we get into problems when we try to map an out of bounds vcn because + * we then try to map the already mapped runlist fragment and + * ntfs_mapping_pairs_decompress() fails. 
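+	 * Note, a->data.non_resident.highest_vcn is the last vcn described by
+	 * this attribute extent, so the first out of bounds vcn is
+	 * highest_vcn + 1, which is what end_vcn is set to below.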
+ */ + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + if (unlikely(vcn && vcn >= end_vcn)) { + err = -ENOENT; + goto err_out; + } + rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else + ni->runlist.rl = rl; +err_out: + if (ctx_is_temporary) { + if (likely(ctx)) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } else if (ctx_needs_reset) { + /* + * If there is no attribute list, restoring the search context + * is accomplished simply by copying the saved context back over + * the caller supplied context. If there is an attribute list, + * things are more complicated as we need to deal with mapping + * of mft records and resulting potential changes in pointers. + */ + if (NInoAttrList(base_ni)) { + /* + * If the currently mapped (extent) inode is not the + * one we had before, we need to unmap it and map the + * old one. + */ + if (ctx->ntfs_ino != old_ctx.ntfs_ino) { + /* + * If the currently mapped inode is not the + * base inode, unmap it. + */ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != + ctx->base_ntfs_ino) { + unmap_extent_mft_record(ctx->ntfs_ino); + ctx->mrec = ctx->base_mrec; + BUG_ON(!ctx->mrec); + } + /* + * If the old mapped inode is not the base + * inode, map it. + */ + if (old_ctx.base_ntfs_ino && + old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { +retry_map: + ctx->mrec = map_mft_record( + old_ctx.ntfs_ino); + /* + * Something bad has happened. If out + * of memory retry till it succeeds. + * Any other errors are fatal and we + * return the error code in ctx->mrec. + * Let the caller deal with it... We + * just need to fudge things so the + * caller can reinit and/or put the + * search context safely. + */ + if (IS_ERR(ctx->mrec)) { + if (PTR_ERR(ctx->mrec) == + -ENOMEM) { + schedule(); + goto retry_map; + } else + old_ctx.ntfs_ino = + old_ctx. + base_ntfs_ino; + } + } + } + /* Update the changed pointers in the saved context. */ + if (ctx->mrec != old_ctx.mrec) { + if (!IS_ERR(ctx->mrec)) + old_ctx.attr = (ATTR_RECORD*)( + (u8*)ctx->mrec + + ((u8*)old_ctx.attr - + (u8*)old_ctx.mrec)); + old_ctx.mrec = ctx->mrec; + } + } + /* Restore the search context to the saved one. */ + *ctx = old_ctx; + /* + * We drop the reference on the page we took earlier. In the + * case that IS_ERR(ctx->mrec) is true this means we might lose + * some changes to the mft record that had been made between + * the last time it was marked dirty/written out and now. This + * at this stage is not a problem as the mapping error is fatal + * enough that the mft record cannot be written out anyway and + * the caller is very likely to shutdown the whole inode + * immediately and mark the volume dirty for chkdsk to pick up + * the pieces anyway. + */ + if (put_this_page) + page_cache_release(put_this_page); + } + return err; +} + +/** + * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Locking: - The runlist must be unlocked on entry and is unlocked on return. + * - This function takes the runlist lock for writing and may modify + * the runlist. 
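+ *
+ * For illustration only, a minimal caller sketch (error handling trimmed)
+ * looks like:
+ *	err = ntfs_map_runlist(ni, vcn);
+ *	if (err && err != -ENOENT)
+ *		return err;
+ * with -ENOENT then treated as "@vcn is beyond the end of the runlist"
+ * rather than as a failure.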
+ */ +int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +{ + int err = 0; + + down_write(&ni->runlist.lock); + /* Make sure someone else didn't do the work while we were sleeping. */ + if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= + LCN_RL_NOT_MAPPED)) + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + up_write(&ni->runlist.lock); + return err; +} + +/** + * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode + * @ni: ntfs inode of the attribute whose runlist to search + * @vcn: vcn to convert + * @write_locked: true if the runlist is locked for writing + * + * Find the virtual cluster number @vcn in the runlist of the ntfs attribute + * described by the ntfs inode @ni and return the corresponding logical cluster + * number (lcn). + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ========================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. + * LCN_ENOMEM Not enough memory to map runlist. + * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). + * + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is 'false', i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. + */ +LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked) +{ + LCN lcn; + unsigned long flags; + bool is_retry = false; + + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", + ni->mft_no, (unsigned long long)vcn, + write_locked ? "write" : "read"); + BUG_ON(!ni); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return LCN_ENOENT; + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + /* Convert vcn to lcn. If that fails map the runlist and retry once. 
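+	 * Note, if the caller only holds the runlist lock for reading, the
+	 * lock is temporarily upgraded to a write lock around the mapping and
+	 * downgraded again afterwards, and the conversion is redone in case
+	 * another task mapped the needed fragment while the lock was dropped.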
*/ + lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); + if (likely(lcn >= LCN_HOLE)) { + ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); + return lcn; + } + if (lcn != LCN_RL_NOT_MAPPED) { + if (lcn != LCN_ENOENT) + lcn = LCN_EIO; + } else if (!is_retry) { + int err; + + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + if (err == -ENOENT) + lcn = LCN_ENOENT; + else if (err == -ENOMEM) + lcn = LCN_ENOMEM; + else + lcn = LCN_EIO; + } + if (lcn != LCN_ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %lli.", + (long long)lcn); + return lcn; +} + +/** + * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode + * @ni: ntfs inode describing the runlist to search + * @vcn: vcn to find + * @ctx: active attribute search context if present or NULL if not + * + * Find the virtual cluster number @vcn in the runlist described by the ntfs + * inode @ni and return the address of the runlist element containing the @vcn. + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_attr_find_vcn_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * Note you need to distinguish between the lcn of the returned runlist element + * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on + * read and allocate clusters on write. + * + * Return the runlist element containing the @vcn on success and + * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() + * to decide if the return is success or failure and PTR_ERR() to get to the + * error code if IS_ERR() is true. + * + * The possible error return codes are: + * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. + * -ENOMEM - Not enough memory to map runlist. + * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. 
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, + ntfs_attr_search_ctx *ctx) +{ + unsigned long flags; + runlist_element *rl; + int err = 0; + bool is_retry = false; + + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", + ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); + BUG_ON(!ni); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return ERR_PTR(-ENOENT); + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + rl = ni->runlist.rl; + if (likely(rl && vcn >= rl[0].vcn)) { + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) { + ntfs_debug("Done."); + return rl; + } + break; + } + rl++; + } + if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { + if (likely(rl->lcn == LCN_ENOENT)) + err = -ENOENT; + else + err = -EIO; + } + } + if (!err && !is_retry) { + /* + * If the search context is invalid we cannot map the unmapped + * region. + */ + if (IS_ERR(ctx->mrec)) + err = PTR_ERR(ctx->mrec); + else { + /* + * The @vcn is in an unmapped region, map the runlist + * and retry. + */ + err = ntfs_map_runlist_nolock(ni, vcn, ctx); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + } + if (err == -EINVAL) + err = -EIO; + } else if (!err) + err = -EIO; + if (err != -ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %i.", err); + return ERR_PTR(err); +} + +/** + * ntfs_attr_find - find (next) attribute in mft record + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * ntfs_attr_find() takes a search context @ctx as parameter and searches the + * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an + * attribute of @type, optionally @name and @val. + * + * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will + * point to the found attribute. + * + * If the attribute is not found, ntfs_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute before which the attribute being + * searched for would need to be inserted if such an action were to be desired. + * + * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is + * undefined and in particular do not rely on it not changing. + * + * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it + * is 'false', the search begins after @ctx->attr. 
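+ *
+ * This makes it possible to enumerate all matching attributes by calling
+ * ntfs_attr_find() repeatedly with the same @ctx until it returns -ENOENT,
+ * for example (a sketch only; the same loop is used at the end of
+ * ntfs_external_attr_find() below):
+ *	do {
+ *		err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
+ *				ctx);
+ *	} while (!err);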
+ * + * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and + * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record + * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at + * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case + * sensitive. When @name is present, @name_len is the @name length in Unicode + * characters. + * + * If @name is not present (NULL), we assume that the unnamed attribute is + * being searched for. + * + * Finally, the resident attribute value @val is looked for, if present. If + * @val is not present (NULL), @val_len is ignored. + * + * ntfs_attr_find() only searches the specified mft record and it ignores the + * presence of an attribute list attribute (unless it is the one being searched + * for, obviously). If you need to take attribute lists into consideration, + * use ntfs_attr_lookup() instead (see below). This also means that you cannot + * use ntfs_attr_find() to search for extent records of non-resident + * attributes, as extents with lowest_vcn != 0 are usually described by the + * attribute list attribute only. - Note that it is possible that the first + * extent is only in the attribute list while the last extent is in the base + * mft record, so do not rely on being able to find the first extent in the + * base mft record. + * + * Warning: Never use @val when looking for attribute types which can be + * non-resident as this most likely will result in a crash! + */ +static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ATTR_RECORD *a; + ntfs_volume *vol = ctx->ntfs_ino->vol; + ntfschar *upcase = vol->upcase; + u32 upcase_len = vol->upcase_len; + + /* + * Iterate over attributes in mft record starting at @ctx->attr, or the + * attribute following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + a = ctx->attr; + ctx->is_first = false; + } else + a = (ATTR_RECORD*)((u8*)ctx->attr + + le32_to_cpu(ctx->attr->length)); + for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + ctx->attr = a; + if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || + a->type == AT_END)) + return -ENOENT; + if (unlikely(!a->length)) + break; + if (a->type != type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + if (!name) { + /* The search failed if the found attribute is named. */ + if (a->name_length) + return -ENOENT; + } else if (!ntfs_are_names_equal(name, name_len, + (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), + a->name_length, ic, upcase, upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, IGNORE_CASE, + upcase, upcase_len); + /* + * If @name collates before a->name, there is no + * matching attribute. + */ + if (rc == -1) + return -ENOENT; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, CASE_SENSITIVE, + upcase, upcase_len); + if (rc == -1) + return -ENOENT; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. 
If no @val specified, we have found the attribute + * and are done. + */ + if (!val) + return 0; + /* @val is present; compare values. */ + else { + register int rc; + + rc = memcmp(val, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + min_t(u32, val_len, le32_to_cpu( + a->data.resident.value_length))); + /* + * If @val collates before the current attribute's + * value, there is no matching attribute. + */ + if (!rc) { + register u32 avl; + + avl = le32_to_cpu( + a->data.resident.value_length); + if (val_len == avl) + return 0; + if (val_len < avl) + return -ENOENT; + } else if (rc < 0) + return -ENOENT; + } + } + ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); + NVolSetErrors(vol); + return -EIO; +} + +/** + * load_attribute_list - load an attribute list into memory + * @vol: ntfs volume from which to read + * @runlist: runlist of the attribute list + * @al_start: destination buffer + * @size: size of the destination buffer in bytes + * @initialized_size: initialized size of the attribute list + * + * Walk the runlist @runlist and load all clusters from it copying them into + * the linear buffer @al. The maximum number of bytes copied to @al is @size + * bytes. Note, @size does not need to be a multiple of the cluster size. If + * @initialized_size is less than @size, the region in @al between + * @initialized_size and @size will be zeroed and not read from disk. + * + * Return 0 on success or -errno on error. + */ +int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, + const s64 size, const s64 initialized_size) +{ + LCN lcn; + u8 *al = al_start; + u8 *al_end = al + initialized_size; + runlist_element *rl; + struct buffer_head *bh; + struct super_block *sb; + unsigned long block_size; + unsigned long block, max_block; + int err = 0; + unsigned char block_size_bits; + + ntfs_debug("Entering."); + if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || + initialized_size > size) + return -EINVAL; + if (!initialized_size) { + memset(al, 0, size); + return 0; + } + sb = vol->sb; + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + down_read(&runlist->lock); + rl = runlist->rl; + if (!rl) { + ntfs_error(sb, "Cannot read attribute list since runlist is " + "missing."); + goto err_out; + } + /* Read all clusters specified by the runlist one run at a time. */ + while (rl->length) { + lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)rl->vcn, + (unsigned long long)lcn); + /* The attribute list cannot be sparse. */ + if (lcn < 0) { + ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot " + "read attribute list."); + goto err_out; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the run from device in chunks of block_size bytes. */ + max_block = block + (rl->length << vol->cluster_size_bits >> + block_size_bits); + ntfs_debug("max_block = 0x%lx.", max_block); + do { + ntfs_debug("Reading block = 0x%lx.", block); + bh = sb_bread(sb, block); + if (!bh) { + ntfs_error(sb, "sb_bread() failed. Cannot " + "read attribute list."); + goto err_out; + } + if (al + block_size >= al_end) + goto do_final; + memcpy(al, bh->b_data, block_size); + brelse(bh); + al += block_size; + } while (++block < max_block); + rl++; + } + if (initialized_size < size) { +initialize: + memset(al_start + initialized_size, 0, size - initialized_size); + } +done: + up_read(&runlist->lock); + return err; +do_final: + if (al < al_end) { + /* + * Partial block. 
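+		 * Only al_end - al bytes of this last block belong to the
+		 * initialized part of the attribute list, so only those are
+		 * copied below.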
+ * + * Note: The attribute list can be smaller than its allocation + * by multiple clusters. This has been encountered by at least + * two people running Windows XP, thus we cannot do any + * truncation sanity checking here. (AIA) + */ + memcpy(al, bh->b_data, al_end - al); + brelse(bh); + if (initialized_size < size) + goto initialize; + goto done; + } + brelse(bh); + /* Real overflow! */ + ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " + "is truncated."); +err_out: + err = -EIO; + goto done; +} + +/** + * ntfs_external_attr_find - find an attribute in the attribute list of an inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * Find an attribute by searching the attribute list for the corresponding + * attribute list entry. Having found the entry, map the mft record if the + * attribute is in a different mft record/inode, ntfs_attr_find() the attribute + * in there and return it. + * + * On first search @ctx->ntfs_ino must be the base mft record and @ctx must + * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent + * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is + * then the base inode). + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * If the attribute is found, ntfs_external_attr_find() returns 0 and + * @ctx->attr will point to the found attribute. @ctx->mrec will point to the + * mft record in which @ctx->attr is located and @ctx->al_entry will point to + * the attribute list entry for the attribute. + * + * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute in the base mft record before which + * the attribute being searched for would need to be inserted if such an action + * were to be desired. @ctx->mrec will point to the mft record in which + * @ctx->attr is located and @ctx->al_entry will point to the attribute list + * entry of the attribute before which the attribute being searched for would + * need to be inserted if such an action were to be desired. + * + * Thus to insert the not found attribute, one wants to add the attribute to + * @ctx->mrec (the base mft record) and if there is not enough space, the + * attribute should be placed in a newly allocated extent mft record. The + * attribute list entry for the inserted attribute should be inserted in the + * attribute list attribute at @ctx->al_entry. + * + * On actual error, ntfs_external_attr_find() returns -EIO. In this case + * @ctx->attr is undefined and in particular do not rely on it not changing. 
+ */ +static int ntfs_external_attr_find(const ATTR_TYPE type, + const ntfschar *name, const u32 name_len, + const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni, *ni; + ntfs_volume *vol; + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_start, *al_end; + ATTR_RECORD *a; + ntfschar *al_name; + u32 al_name_len; + int err = 0; + static const char *es = " Unmount and run chkdsk."; + + ni = ctx->ntfs_ino; + base_ni = ctx->base_ntfs_ino; + ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); + if (!base_ni) { + /* First call happens with the base mft record. */ + base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; + ctx->base_mrec = ctx->mrec; + } + if (ni == base_ni) + ctx->base_attr = ctx->attr; + if (type == AT_END) + goto not_found; + vol = base_ni->vol; + al_start = base_ni->attr_list; + al_end = al_start + base_ni->attr_list_size; + if (!ctx->al_entry) + ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; + /* + * Iterate over entries in attribute list starting at @ctx->al_entry, + * or the entry following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + al_entry = ctx->al_entry; + ctx->is_first = false; + } else + al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + + le16_to_cpu(ctx->al_entry->length)); + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < base_ni->attr_list || + (u8*)al_entry > al_end) + break; /* Inode is corrupt. */ + ctx->al_entry = al_entry; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto not_found; + if (!al_entry->length) + break; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + break; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) + goto not_found; + if (type != al_entry->type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + al_name_len = al_entry->name_length; + al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); + if (!name) { + if (al_name_len) + goto not_found; + } else if (!ntfs_are_names_equal(al_name, al_name_len, name, + name_len, ic, vol->upcase, vol->upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, IGNORE_CASE, + vol->upcase, vol->upcase_len); + /* + * If @name collates before al_name, there is no + * matching attribute. + */ + if (rc == -1) + goto not_found; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + /* + * FIXME: Reverse engineering showed 0, IGNORE_CASE but + * that is inconsistent with ntfs_attr_find(). The + * subsequent rc checks were also different. Perhaps I + * made a mistake in one of the two. Need to recheck + * which is correct or at least see what is going on... + * (AIA) + */ + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, CASE_SENSITIVE, + vol->upcase, vol->upcase_len); + if (rc == -1) + goto not_found; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. Now check @lowest_vcn. Continue search if the + * next attribute list entry still fits @lowest_vcn. Otherwise + * we have reached the right one or the search has failed. 
+ */ + if (lowest_vcn && (u8*)next_al_entry >= al_start && + (u8*)next_al_entry + 6 < al_end && + (u8*)next_al_entry + le16_to_cpu( + next_al_entry->length) <= al_end && + sle64_to_cpu(next_al_entry->lowest_vcn) <= + lowest_vcn && + next_al_entry->type == al_entry->type && + next_al_entry->name_length == al_name_len && + ntfs_are_names_equal((ntfschar*)((u8*) + next_al_entry + + next_al_entry->name_offset), + next_al_entry->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + continue; + if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { + if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { + ntfs_error(vol->sb, "Found stale mft " + "reference in attribute list " + "of base inode 0x%lx.%s", + base_ni->mft_no, es); + err = -EIO; + break; + } + } else { /* Mft references do not match. */ + /* If there is a mapped record unmap it first. */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + /* Do we want the base record back? */ + if (MREF_LE(al_entry->mft_reference) == + base_ni->mft_no) { + ni = ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + } else { + /* We want an extent record. */ + ctx->mrec = map_extent_mft_record(base_ni, + le64_to_cpu( + al_entry->mft_reference), &ni); + if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to map " + "extent mft record " + "0x%lx of base inode " + "0x%lx.%s", + MREF_LE(al_entry-> + mft_reference), + base_ni->mft_no, es); + err = PTR_ERR(ctx->mrec); + if (err == -ENOENT) + err = -EIO; + /* Cause @ctx to be sanitized below. */ + ni = NULL; + break; + } + ctx->ntfs_ino = ni; + } + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + } + /* + * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the + * mft record containing the attribute represented by the + * current al_entry. + */ + /* + * We could call into ntfs_attr_find() to find the right + * attribute in this mft record but this would be less + * efficient and not quite accurate as ntfs_attr_find() ignores + * the attribute instance numbers for example which become + * important when one plays with attribute lists. Also, + * because a proper match has been found in the attribute list + * entry above, the comparison can now be optimized. So it is + * worth re-implementing a simplified ntfs_attr_find() here. + */ + a = ctx->attr; + /* + * Use a manual loop so we can still use break and continue + * with the same meanings as above. + */ +do_next_attr_loop: + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + if (a->type == AT_END) + break; + if (!a->length) + break; + if (al_entry->instance != a->instance) + goto do_next_attr; + /* + * If the type and/or the name are mismatched between the + * attribute list entry and the attribute record, there is + * corruption so we break and return error EIO. + */ + if (al_entry->type != a->type) + break; + if (!ntfs_are_names_equal((ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), a->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + break; + ctx->attr = a; + /* + * If no @val specified or @val specified and it matches, we + * have found it! + */ + if (!val || (!a->non_resident && le32_to_cpu( + a->data.resident.value_length) == val_len && + !memcmp((u8*)a + + le16_to_cpu(a->data.resident.value_offset), + val, val_len))) { + ntfs_debug("Done, found."); + return 0; + } +do_next_attr: + /* Proceed to the next attribute in the current mft record. 
*/ + a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); + goto do_next_attr_loop; + } + if (!err) { + ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " + "attribute list attribute.%s", base_ni->mft_no, + es); + err = -EIO; + } + if (ni != base_ni) { + if (ni) + unmap_extent_mft_record(ni); + ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + ctx->attr = ctx->base_attr; + } + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +not_found: + /* + * If we were looking for AT_END, we reset the search context @ctx and + * use ntfs_attr_find() to seek to the end of the base mft record. + */ + if (type == AT_END) { + ntfs_attr_reinit_search_ctx(ctx); + return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, + ctx); + } + /* + * The attribute was not found. Before we return, we want to ensure + * @ctx->mrec and @ctx->attr indicate the position at which the + * attribute should be inserted in the base mft record. Since we also + * want to preserve @ctx->al_entry we cannot reinitialize the search + * context using ntfs_attr_reinit_search_ctx() as this would set + * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see + * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve + * @ctx->al_entry as the remaining fields (base_*) are identical to + * their non base_ counterparts and we cannot set @ctx->base_attr + * correctly yet as we do not know what @ctx->attr will be set to by + * the call to ntfs_attr_find() below. + */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + ctx->mrec = ctx->base_mrec; + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + ctx->is_first = true; + ctx->ntfs_ino = base_ni; + ctx->base_ntfs_ino = NULL; + ctx->base_mrec = NULL; + ctx->base_attr = NULL; + /* + * In case there are multiple matches in the base mft record, need to + * keep enumerating until we get an attribute not found response (or + * another error), otherwise we would keep returning the same attribute + * over and over again and all programs using us for enumeration would + * lock up in a tight loop. + */ + do { + err = ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + } while (!err); + ntfs_debug("Done, not found."); + return err; +} + +/** + * ntfs_attr_lookup - find an attribute in an ntfs inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must + * be the base mft record and @ctx must have been obtained from a call to + * ntfs_attr_get_search_ctx(). + * + * This function transparently handles attribute lists and @ctx is used to + * continue searches where they were left off at. + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * Return 0 if the search was successful and -errno if not. + * + * When 0, @ctx->attr is the found attribute and it is in mft record + * @ctx->mrec. 
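+ *
+ * For illustration, a typical single lookup (a sketch only, with @ni being
+ * the base ntfs inode and error handling reduced to a minimum) looks like:
+ *	m = map_mft_record(ni);
+ *	if (IS_ERR(m))
+ *		return PTR_ERR(m);
+ *	ctx = ntfs_attr_get_search_ctx(ni, m);
+ *	if (!ctx) {
+ *		unmap_mft_record(ni);
+ *		return -ENOMEM;
+ *	}
+ *	err = ntfs_attr_lookup(type, name, name_len, CASE_SENSITIVE, 0, NULL,
+ *			0, ctx);
+ *	if (!err)
+ *		... use ctx->attr and ctx->mrec here ...
+ *	ntfs_attr_put_search_ctx(ctx);
+ *	unmap_mft_record(ni);
+ *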
If an attribute list attribute is present, @ctx->al_entry is + * the attribute list entry of the found attribute. + * + * When -ENOENT, @ctx->attr is the attribute which collates just after the + * attribute being searched for, i.e. if one wants to add the attribute to the + * mft record this is the correct place to insert it into. If an attribute + * list attribute is present, @ctx->al_entry is the attribute list entry which + * collates just after the attribute list entry of the attribute being searched + * for, i.e. if one wants to add the attribute to the mft record this is the + * correct place to insert its attribute list entry into. + * + * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is + * then undefined and in particular you should not rely on it not changing. + */ +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering."); + BUG_ON(IS_ERR(ctx->mrec)); + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + /* Sanity check, just for debugging really. */ + BUG_ON(!base_ni); + if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) + return ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, + val, val_len, ctx); +} + +/** + * ntfs_attr_init_search_ctx - initialize an attribute search context + * @ctx: attribute search context to initialize + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Initialize the attribute search context @ctx with @ni and @mrec. + */ +static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, + ntfs_inode *ni, MFT_RECORD *mrec) +{ + *ctx = (ntfs_attr_search_ctx) { + .mrec = mrec, + /* Sanity checks are performed elsewhere. */ + .attr = (ATTR_RECORD*)((u8*)mrec + + le16_to_cpu(mrec->attrs_offset)), + .is_first = true, + .ntfs_ino = ni, + }; +} + +/** + * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context + * @ctx: attribute search context to reinitialize + * + * Reinitialize the attribute search context @ctx, unmapping an associated + * extent mft record if present, and initialize the search context again. + * + * This is used when a search for a new attribute is being started to reset + * the search context to the beginning. + */ +void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (likely(!ctx->base_ntfs_ino)) { + /* No attribute list. */ + ctx->is_first = true; + /* Sanity checks are performed elsewhere. */ + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + /* + * This needs resetting due to ntfs_external_attr_find() which + * can leave it set despite having zeroed ctx->base_ntfs_ino. + */ + ctx->al_entry = NULL; + return; + } /* Attribute list. */ + if (ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); + return; +} + +/** + * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Allocate a new attribute search context, initialize it with @ni and @mrec, + * and return it. 
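+ * When no longer needed, the returned context should be released with
+ * ntfs_attr_put_search_ctx().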
Return NULL if allocation failed. + */ +ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) +{ + ntfs_attr_search_ctx *ctx; + + ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); + if (ctx) + ntfs_attr_init_search_ctx(ctx, ni, mrec); + return ctx; +} + +/** + * ntfs_attr_put_search_ctx - release an attribute search context + * @ctx: attribute search context to free + * + * Release the attribute search context @ctx, unmapping an associated extent + * mft record if present. + */ +void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + kmem_cache_free(ntfs_attr_ctx_cache, ctx); + return; +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to find + * + * Search for the attribute definition record corresponding to the attribute + * @type in the $AttrDef system file. + * + * Return the attribute type definition record if found and NULL if not found. + */ +static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, + const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + BUG_ON(!vol->attrdef); + BUG_ON(!type); + for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < + vol->attrdef_size && ad->type; ++ad) { + /* We have not found it yet, carry on searching. */ + if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) + continue; + /* We found the attribute; return it. */ + if (likely(ad->type == type)) + return ad; + /* We have gone too far already. No point in continuing. */ + break; + } + /* Attribute not found. */ + ntfs_debug("Attribute type 0x%x not found in $AttrDef.", + le32_to_cpu(type)); + return NULL; +} + +/** + * ntfs_attr_size_bounds_check - check a size of an attribute type for validity + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * @size: size which to check + * + * Check whether the @size in bytes is valid for an attribute of @type on the + * ntfs volume @vol. This information is obtained from $AttrDef system file. + * + * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not + * listed in $AttrDef. + */ +int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, + const s64 size) +{ + ATTR_DEF *ad; + + BUG_ON(size < 0); + /* + * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not + * listed in $AttrDef. + */ + if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) + return -ERANGE; + /* Get the $AttrDef entry for the attribute @type. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Do the bounds check. */ + if (((sle64_to_cpu(ad->min_size) > 0) && + size < sle64_to_cpu(ad->min_size)) || + ((sle64_to_cpu(ad->max_size) > 0) && size > + sle64_to_cpu(ad->max_size))) + return -ERANGE; + return 0; +} + +/** + * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be non-resident. This information is obtained from $AttrDef system file. + * + * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and + * -ENOENT if the attribute is not listed in $AttrDef. 
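+ *
+ * For example (an illustrative sketch only, mirroring the check done at the
+ * start of ntfs_attr_make_non_resident() below):
+ *	err = ntfs_attr_can_be_non_resident(vol, ni->type);
+ *	if (err == -EPERM)
+ *		... the attribute has to stay resident ...
+ *	else if (err)
+ *		... the attribute type is not defined on this volume ...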
+ */ +int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + /* Find the attribute definition record in $AttrDef. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Check the flags and return the result. */ + if (ad->flags & ATTR_DEF_RESIDENT) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_can_be_resident - check if an attribute can be resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be resident. This information is derived from our ntfs knowledge and may + * not be completely accurate, especially when user defined attributes are + * present. Basically we allow everything to be resident except for index + * allocation and $EA attributes. + * + * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. + * + * Warning: In the system file $MFT the attribute $Bitmap must be non-resident + * otherwise windows will not boot (blue screen of death)! We cannot + * check for this here as we do not know which inode's $Bitmap is + * being asked about so the caller needs to special case this. + */ +int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + if (type == AT_INDEX_ALLOCATION) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_record_resize - resize an attribute record + * @m: mft record containing attribute record + * @a: attribute record to resize + * @new_size: new size in bytes to which to resize the attribute record @a + * + * Resize the attribute record @a, i.e. the resident part of the attribute, in + * the mft record @m to @new_size bytes. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) +{ + ntfs_debug("Entering for new_size %u.", new_size); + /* Align to 8 bytes if it is not already done. */ + if (new_size & 7) + new_size = (new_size + 7) & ~7; + /* If the actual attribute length has changed, move things around. */ + if (new_size != le32_to_cpu(a->length)) { + u32 new_muse = le32_to_cpu(m->bytes_in_use) - + le32_to_cpu(a->length) + new_size; + /* Not enough space in this mft record. */ + if (new_muse > le32_to_cpu(m->bytes_allocated)) + return -ENOSPC; + /* Move attributes following @a to their new location. */ + memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), + le32_to_cpu(m->bytes_in_use) - ((u8*)a - + (u8*)m) - le32_to_cpu(a->length)); + /* Adjust @m to reflect the change in used space. */ + m->bytes_in_use = cpu_to_le32(new_muse); + /* Adjust @a to reflect the new size. */ + if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) + a->length = cpu_to_le32(new_size); + } + return 0; +} + +/** + * ntfs_resident_attr_value_resize - resize the value of a resident attribute + * @m: mft record containing attribute record + * @a: attribute record whose value to resize + * @new_size: new size in bytes to which to resize the attribute value of @a + * + * Resize the value of the attribute @a in the mft record @m to @new_size bytes. 
+ * If the value is made bigger, the newly allocated space is cleared. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size) +{ + u32 old_size; + + /* Resize the resident part of the attribute record. */ + if (ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + new_size)) + return -ENOSPC; + /* + * The resize succeeded! If we made the attribute value bigger, clear + * the area between the old size and @new_size. + */ + old_size = le32_to_cpu(a->data.resident.value_length); + if (new_size > old_size) + memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + old_size, 0, new_size - old_size); + /* Finally update the length of the attribute value. */ + a->data.resident.value_length = cpu_to_le32(new_size); + return 0; +} + +/** + * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute + * @ni: ntfs inode describing the attribute to convert + * @data_size: size of the resident data to copy to the non-resident attribute + * + * Convert the resident ntfs attribute described by the ntfs inode @ni to a + * non-resident one. + * + * @data_size must be equal to the attribute value size. This is needed since + * we need to know the size before we can map the mft record and our callers + * always know it. The reason we cannot simply read the size from the vfs + * inode i_size is that this is not necessarily uptodate. This happens when + * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). + * + * Return 0 on success and -errno on error. The following error return codes + * are defined: + * -EPERM - The attribute is not allowed to be non-resident. + * -ENOMEM - Not enough memory. + * -ENOSPC - Not enough disk space. + * -EINVAL - Attribute not defined on the volume. + * -EIO - I/o error or other error. + * Note that -ENOSPC is also returned in the case that there is not enough + * space in the mft record to do the conversion. This can happen when the mft + * record is already very full. The caller is responsible for trying to make + * space in the mft record and trying again. FIXME: Do we need a separate + * error return code for this kind of -ENOSPC or is it always worth trying + * again in case the attribute may then fit in a resident state so no need to + * make it non-resident at all? Ho-hum... (AIA) + * + * NOTE to self: No changes in the attribute list are required to move from + * a resident to a non-resident attribute. + * + * Locking: - The caller must hold i_mutex on the inode. + */ +int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) +{ + s64 new_size; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + struct page *page; + runlist_element *rl; + u8 *kaddr; + unsigned long flags; + int mp_size, mp_ofs, name_ofs, arec_size, err, err2; + u32 attr_size; + u8 old_res_attr_flags; + + /* Check that the attribute is allowed to be non-resident. 
*/ + err = ntfs_attr_can_be_non_resident(vol, ni->type); + if (unlikely(err)) { + if (err == -EPERM) + ntfs_debug("Attribute is not allowed to be " + "non-resident."); + else + ntfs_debug("Attribute not defined on the NTFS " + "volume!"); + return err; + } + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (data_size + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + /* + * Will need the page later and since the page lock nests + * outside all ntfs locks, we need to get the page now. + */ + page = find_or_create_page(vi->i_mapping, 0, + mapping_gfp_mask(vi->i_mapping)); + if (unlikely(!page)) + return -ENOMEM; + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, new_size >> + vol->cluster_size_bits, -1, DATA_ZONE, true); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code " + "%i.", (new_size >> + vol->cluster_size_bits) > 1 ? "s" : "", + err); + goto page_err_out; + } + } else { + rl = NULL; + page = NULL; + } + /* Determine the size of the mapping pairs array. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error " + "code %i.", err); + goto rl_err_out; + } + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(NInoNonResident(ni)); + BUG_ON(a->non_resident); + /* + * Calculate new offsets for the name and the mapping pairs array. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + + sizeof(a->data.non_resident.compressed_size) + + 7) & ~7; + else + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + /* + * Determine the size of the resident part of the now non-resident + * attribute record. + */ + arec_size = (mp_ofs + mp_size + 7) & ~7; + /* + * If the page is not uptodate bring it uptodate by copying from the + * attribute value. + */ + attr_size = le32_to_cpu(a->data.resident.value_length); + BUG_ON(attr_size != data_size); + if (page && !PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, (u8*)a + + le16_to_cpu(a->data.resident.value_offset), + attr_size); + memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* Backup the attribute flag. */ + old_res_attr_flags = a->data.resident.flags; + /* Resize the resident part of the attribute record. 
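+	 * arec_size was computed above to hold the non-resident attribute
+	 * header, the (possibly moved) attribute name and the mapping pairs
+	 * array, so resizing the record to it makes room for the conversion
+	 * done below.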
*/ + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + /* + * Convert the resident part of the attribute record to describe a + * non-resident attribute. + */ + a->non_resident = 1; + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + a->name_offset = cpu_to_le16(name_ofs); + /* Setup the fields specific to non-resident attributes. */ + a->data.non_resident.lowest_vcn = 0; + a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> + vol->cluster_size_bits); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + memset(&a->data.non_resident.reserved, 0, + sizeof(a->data.non_resident.reserved)); + a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size = + cpu_to_sle64(attr_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + a->data.non_resident.compression_unit = 0; + if (NInoCompressed(ni) || vol->major_ver < 3) + a->data.non_resident.compression_unit = 4; + a->data.non_resident.compressed_size = + a->data.non_resident.allocated_size; + } else + a->data.non_resident.compression_unit = 0; + /* Generate the mapping pairs array into the attribute record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_debug("Failed to build mapping pairs, error code %i.", + err); + goto undo_err_out; + } + /* Setup the in-memory attribute structure to be non-resident. */ + ni->runlist.rl = rl; + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_size; + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size = ni->allocated_size; + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << (a->data. + non_resident.compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed.block_size) - + 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = ni->allocated_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * This needs to be last since the address space operations ->readpage + * and ->writepage can run concurrently with us as they are not + * serialized on i_mutex. Note, we are not allowed to fail once we flip + * this switch, which is another reason to do this last. + */ + NInoSetNonResident(ni); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + if (page) { + set_page_dirty(page); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + } + ntfs_debug("Done."); + return 0; +undo_err_out: + /* Convert the attribute back into a resident attribute. */ + a->non_resident = 0; + /* Move the attribute name if it exists and update the offset. 
*/ + name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + sizeof(a->data.resident.reserved) + 7) & ~7; + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + a->name_offset = cpu_to_le16(name_ofs); + arec_size = (mp_ofs + attr_size + 7) & ~7; + /* Resize the resident part of the attribute record. */ + err2 = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err2)) { + /* + * This cannot happen (well if memory corruption is at work it + * could happen in theory), but deal with it as well as we can. + * If the old size is too small, truncate the attribute, + * otherwise simply give it a larger allocated size. + * FIXME: Should check whether chkdsk complains when the + * allocated size is much bigger than the resident value size. + */ + arec_size = le32_to_cpu(a->length); + if ((mp_ofs + attr_size) > arec_size) { + err2 = attr_size; + attr_size = arec_size - mp_ofs; + ntfs_error(vol->sb, "Failed to undo partial resident " + "to non-resident attribute " + "conversion. Truncating inode 0x%lx, " + "attribute type 0x%x from %i bytes to " + "%i bytes to maintain metadata " + "consistency. THIS MEANS YOU ARE " + "LOSING %i BYTES DATA FROM THIS %s.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err2, attr_size, err2 - attr_size, + ((ni->type == AT_DATA) && + !ni->name_len) ? "FILE": "ATTRIBUTE"); + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = attr_size; + i_size_write(vi, attr_size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + } + /* Setup the fields specific to resident attributes. */ + a->data.resident.value_length = cpu_to_le32(attr_size); + a->data.resident.value_offset = cpu_to_le16(mp_ofs); + a->data.resident.flags = old_res_attr_flags; + memset(&a->data.resident.reserved, 0, + sizeof(a->data.resident.reserved)); + /* Copy the data from the page back to the attribute value. */ + if (page) { + kaddr = kmap_atomic(page, KM_USER0); + memcpy((u8*)a + mp_ofs, kaddr, attr_size); + kunmap_atomic(kaddr, KM_USER0); + } + /* Setup the allocated size in the ntfs inode in case it changed. */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = arec_size - mp_ofs; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ni->runlist.rl = NULL; + up_write(&ni->runlist.lock); +rl_err_out: + if (rl) { + if (ntfs_cluster_free_from_rl(vol, rl) < 0) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl); +page_err_out: + unlock_page(page); + page_cache_release(page); + } + if (err == -EINVAL) + err = -EIO; + return err; +} + +/** + * ntfs_attr_extend_allocation - extend the allocated space of an attribute + * @ni: ntfs inode of the attribute whose allocation to extend + * @new_alloc_size: new size in bytes to which to extend the allocation to + * @new_data_size: new size in bytes to which to extend the data to + * @data_start: beginning of region which is required to be non-sparse + * + * Extend the allocated space of an attribute described by the ntfs inode @ni + * to @new_alloc_size bytes. 
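Both the resident-to-non-resident conversion above and the allocation extension described here lean on the same power-of-two arithmetic: sizes are rounded up to the next cluster boundary and in-record offsets up to 8-byte alignment with mask expressions such as (x + size - 1) & ~(size - 1). A minimal stand-alone sketch of that idiom (the concrete numbers are illustrative only, not taken from the driver):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const int64_t cluster_size = 4096;	/* must be a power of two */
	int64_t data_size = 5000;

	/* Round a size up to the next cluster boundary. */
	int64_t new_size = (data_size + cluster_size - 1) & ~(cluster_size - 1);
	assert(new_size == 8192);

	/* Round an offset within the attribute record up to 8-byte alignment. */
	unsigned name_ofs = (66 + 7) & ~7;
	assert(name_ofs == 72);

	/* Round a byte position down to the start of its cluster. */
	int64_t start = 6000 & ~(cluster_size - 1);
	assert(start == 4096);
	return 0;
}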
If @data_start is -1, the whole extension may be + * implemented as a hole in the file (as long as both the volume and the ntfs + * inode @ni have sparse support enabled). If @data_start is >= 0, then the + * region between the old allocated size and @data_start - 1 may be made sparse + * but the regions between @data_start and @new_alloc_size must be backed by + * actual clusters. + * + * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size + * of the attribute is extended to @new_data_size. Note that the i_size of the + * vfs inode is not updated. Only the data size in the base attribute record + * is updated. The caller has to update i_size separately if this is required. + * WARNING: It is a BUG() for @new_data_size to be smaller than the old data + * size as well as for @new_data_size to be greater than @new_alloc_size. + * + * For resident attributes this involves resizing the attribute record and if + * necessary moving it and/or other attributes into extent mft records and/or + * converting the attribute to a non-resident attribute which in turn involves + * extending the allocation of a non-resident attribute as described below. + * + * For non-resident attributes this involves allocating clusters in the data + * zone on the volume (except for regions that are being made sparse) and + * extending the run list to describe the allocated clusters as well as + * updating the mapping pairs array of the attribute. This in turn involves + * resizing the attribute record and if necessary moving it and/or other + * attributes into extent mft records and/or splitting the attribute record + * into multiple extent attribute records. + * + * Also, the attribute list attribute is updated if present and in some of the + * above cases (the ones where extent mft records/attributes come into play), + * an attribute list attribute is created if not already present. + * + * Return the new allocated size on success and -errno on error. In the case + * that an error is encountered but a partial extension at least up to + * @data_start (if present) is possible, the allocation is partially extended + * and this is returned. This means the caller must check the returned size to + * determine if the extension was partial. If @data_start is -1 then partial + * allocations are not performed. + * + * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. + * + * Locking: This function takes the runlist lock of @ni for writing as well as + * locking the mft record of the base ntfs inode. These locks are maintained + * throughout execution of the function. These locks are required so that the + * attribute can be resized safely and so that it can for example be converted + * from resident to non-resident safely. + * + * TODO: At present attribute list attribute handling is not implemented. + * + * TODO: At present it is not safe to call this function for anything other + * than the $DATA attribute(s) of an uncompressed and unencrypted file. + */ +s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start) +{ + VCN vcn; + s64 ll, allocated_size, start = data_start; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + runlist_element *rl, *rl2; + unsigned long flags; + int err, mp_size; + u32 attr_len = 0; /* Silence stupid gcc warning. 
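Because the function returns the new allocated size rather than zero on success, and may extend only partially when @data_start is not -1, callers are expected to compare the return value with the size they requested. A hedged caller-side sketch (the helper name example_extend_for_write and its write-path context are invented for illustration):

/*
 * Sketch only: extend the allocation before writing @count bytes at @pos.
 * On a partial extension, shrink the write to what was actually allocated.
 */
static int example_extend_for_write(ntfs_inode *ni, loff_t pos, size_t count)
{
	s64 want = pos + count;
	s64 got = ntfs_attr_extend_allocation(ni, want, -1, pos);

	if (got < 0)
		return got;		/* -errno, nothing usable was allocated */
	if (got < want)
		count = got - pos;	/* partial extension, write less */
	/* ... go on to write at most @count bytes at @pos ... */
	return 0;
}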
*/ + bool mp_rebuilt; + +#ifdef DEBUG + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_allocated_size 0x%llx, " + "new_allocated_size 0x%llx, new_data_size 0x%llx, " + "data_start 0x%llx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)allocated_size, + (unsigned long long)new_alloc_size, + (unsigned long long)new_data_size, + (unsigned long long)start); +#endif +retry_extend: + /* + * For non-resident attributes, @start and @new_size need to be aligned + * to cluster boundaries for allocation purposes. + */ + if (NInoNonResident(ni)) { + if (start > 0) + start &= ~(s64)vol->cluster_size_mask; + new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + } + BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); + /* Check if new size is allowed in $AttrDef. */ + err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); + if (unlikely(err)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the new " + "allocation would exceed the " + "maximum allowed size for " + "this attribute type.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } else { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because this " + "attribute type is not " + "defined on the NTFS volume. " + "Possible corruption! You " + "should run chkdsk!", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + } + /* Translate error code to be POSIX conformant for write(2). */ + if (err == -ERANGE) + err = -EFBIG; + else + err = -EIO; + return err; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* + * We will be modifying both the runlist (if non-resident) and the mft + * record so lock them both down. + */ + down_write(&ni->runlist.lock); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If non-resident, seek to the last extent. If resident, there is + * only one extent, so seek to that. + */ + vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : + 0; + /* + * Abort if someone did the work whilst we waited for the locks. If we + * just converted the attribute from resident to non-resident it is + * likely that exactly this has happened already. We cannot quite + * abort if we need to update the data size. + */ + if (unlikely(new_alloc_size <= allocated_size)) { + ntfs_debug("Allocated size already exceeds requested size."); + new_alloc_size = allocated_size; + if (new_data_size < 0) + goto done; + /* + * We want the first attribute extent so that we can update the + * data size. 
+ */ + vcn = 0; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + /* Use goto to reduce indentation. */ + if (a->non_resident) + goto do_non_resident_extend; + BUG_ON(NInoNonResident(ni)); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + /* + * Extend the attribute record to be able to store the new attribute + * size. ntfs_attr_record_resize() will not do anything if the size is + * not changing. + */ + if (new_alloc_size < vol->mft_record_size && + !ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + + new_alloc_size)) { + /* The resize succeeded! */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + write_unlock_irqrestore(&ni->size_lock, flags); + if (new_data_size >= 0) { + BUG_ON(new_data_size < attr_len); + a->data.resident.value_length = + cpu_to_le32((u32)new_data_size); + } + goto flush_done; + } + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the extension process. + */ + err = ntfs_attr_make_non_resident(ni, attr_len); + if (likely(!err)) + goto retry_extend; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the conversion from resident " + "to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft " + "record/on disk for the non-resident " + "attribute value. This case is not " + "implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not " + "implemented yet."); + } + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. 
Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_extend: + BUG_ON(!NInoNonResident(ni)); + if (new_alloc_size == allocated_size) { + BUG_ON(vcn); + goto alloc_done; + } + /* + * If the data starts after the end of the old allocation, this is a + * $DATA attribute and sparse attributes are enabled on the volume and + * for this inode, then create a sparse region between the old + * allocated size and the start of the data. Otherwise simply proceed + * with filling the whole space between the old allocated size and the + * new allocated size with clusters. + */ + if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || + !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) + goto skip_sparse; + // TODO: This is not implemented yet. We just fill in with real + // clusters for now... + ntfs_debug("Inserting holes is not-implemented yet. Falling back to " + "allocating real clusters instead."); +skip_sparse: + rl = ni->runlist.rl; + if (likely(rl)) { + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* If this attribute extent is not mapped, map it now. */ + if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || + (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && + (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { + if (!rl && !allocated_size) + goto first_alloc; + rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the " + "mapping of a runlist " + "fragment failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err); + if (err != -ENOMEM) + err = -EIO; + goto err_out; + } + ni->runlist.rl = rl; + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* + * We now know the runlist of the last extent is mapped and @rl is at + * the end of the runlist. We want to begin allocating clusters + * starting at the last allocated cluster to reduce fragmentation. If + * there are no valid LCNs in the attribute we let the cluster + * allocator choose the starting cluster. + */ + /* If the last LCN is a hole or simillar seek back to last real LCN. */ + while (rl->lcn < 0 && rl > ni->runlist.rl) + rl--; +first_alloc: + // FIXME: Need to implement partial allocations so at least part of the + // write can be performed when start >= 0. (Needed for POSIX write(2) + // conformance.) + rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, + (new_alloc_size - allocated_size) >> + vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? 
+ rl->lcn + rl->length : -1, DATA_ZONE, true); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the allocation of clusters " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM && err != -ENOSPC) + err = -EIO; + goto err_out; + } + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the runlist merge failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + goto err_out; + } + ni->runlist.rl = rl; + ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - + allocated_size) >> vol->cluster_size_bits); + /* Find the runlist element with which the attribute extent starts. */ + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, ll); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + mp_rebuilt = false; + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + err = mp_size; + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because determining the size for the " + "mapping pairs failed with error code " + "%i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Extend the attribute record to fit the bigger mapping pairs array. */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record, + // possibly by extending this extent partially and filling it + // and creating a new extent for the remainder, or by making + // other attributes non-resident and/or by moving other + // attributes out of this mft record. + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(err)) { + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because building the mapping pairs " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + /* + * We now have extended the allocated size of the attribute. 
Reflect + * this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto restore_undo_alloc; + /* @m is not used any more so no need to set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + /* + * FIXME: This would fail if @ni is a directory, $MFT, or an index, + * since those can have sparse/compressed set. For example can be + * set compressed even though it is not compressed itself and in that + * case the bit means that files are to be created compressed in the + * directory... At present this is ok as this code is only called for + * regular files, and only for their $DATA attribute(s). + * FIXME: The calculation is wrong if we created a hole above. For now + * it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - allocated_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); +alloc_done: + if (new_data_size >= 0) { + BUG_ON(new_data_size < + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_data_size); + } +flush_done: + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + ntfs_debug("Done, new_allocated_size 0x%llx.", + (unsigned long long)new_alloc_size); + return new_alloc_size; +restore_undo_alloc: + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot complete extension of allocation " + "of inode 0x%lx, attribute type 0x%x, because " + "lookup of first attribute extent failed with " + "error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err == -ENOENT) + err = -EIO; + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + allocated_size >> vol->cluster_size_bits, NULL, 0, + ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "attribute in error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + /* + * FIXME: This would fail if @ni is a directory... See above. + * FIXME: The calculation is wrong if we created a hole above. + * For now it does not matter as we never create holes. 
+ */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - + allocated_size; + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * The only thing that is now wrong is the allocated size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return err; + } + ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( + (allocated_size >> vol->cluster_size_bits) - 1); +undo_alloc: + ll = allocated_size >> vol->cluster_size_bits; + if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to release allocated cluster(s) " + "in error code path. Run chkdsk to recover " + "the lost cluster(s)."); + NVolSetErrors(vol); + } + m = ctx->mrec; + a = ctx->attr; + /* + * If the runlist truncation fails and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to %s in error code path. Run " + "chkdsk to recover.", IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist"); + NVolSetErrors(vol); + } else if (mp_rebuilt) { + if (ntfs_attr_record_resize(m, a, attr_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident. + mapping_pairs_offset), attr_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), rl2, ll, -1, + NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +conv_err_out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +/** + * ntfs_attr_set - fill (a part of) an attribute with a byte + * @ni: ntfs inode describing the attribute to fill + * @ofs: offset inside the attribute at which to start to fill + * @cnt: number of bytes to fill + * @val: the unsigned 8-bit value with which to fill the attribute + * + * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at + * byte offset @ofs inside the attribute with the constant byte @val. + * + * This function is effectively like memset() applied to an ntfs attribute. + * Note thie function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. + * + * Return 0 on success and -errno on error. An error code of -ESPIPE means + * that @ofs + @cnt were outside the end of the attribute and no write was + * performed. 
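Given that ntfs_attr_set() is effectively memset() over the attribute's page cache pages, the typical caller zeroes a range that has just been allocated but not yet initialized. A minimal hedged sketch (the helper name and the two size variables are hypothetical):

/* Sketch: zero the region between the old and the new initialized size. */
static int example_zero_gap(ntfs_inode *ni, s64 old_init_size, s64 new_init_size)
{
	int err = ntfs_attr_set(ni, old_init_size,
			new_init_size - old_init_size, 0);
	if (err == -ESPIPE)
		ntfs_debug("Range extends beyond the end of the attribute, "
				"nothing was written.");
	return err;
}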
+ */ +int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) +{ + ntfs_volume *vol = ni->vol; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + pgoff_t idx, end; + unsigned start_ofs, end_ofs, size; + + ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", + (long long)ofs, (long long)cnt, val); + BUG_ON(ofs < 0); + BUG_ON(cnt < 0); + if (!cnt) + goto done; + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + mapping = VFS_I(ni)->i_mapping; + /* Work out the starting index and page offset. */ + idx = ofs >> PAGE_CACHE_SHIFT; + start_ofs = ofs & ~PAGE_CACHE_MASK; + /* Work out the ending index and page offset. */ + end = ofs + cnt; + end_ofs = end & ~PAGE_CACHE_MASK; + /* If the end is outside the inode size return -ESPIPE. */ + if (unlikely(end > i_size_read(VFS_I(ni)))) { + ntfs_error(vol->sb, "Request exceeds end of attribute."); + return -ESPIPE; + } + end >>= PAGE_CACHE_SHIFT; + /* If there is a first partial page, need to do it the slow way. */ + if (start_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read first partial " + "page (error, index 0x%lx).", idx); + return PTR_ERR(page); + } + /* + * If the last page is the same as the first page, need to + * limit the write to the end offset. + */ + size = PAGE_CACHE_SIZE; + if (idx == end) + size = end_ofs; + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + start_ofs, val, size - start_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + if (idx == end) + goto done; + idx++; + } + /* Do the whole pages the fast way. */ + for (; idx < end; idx++) { + /* Find or create the current page. (The page is locked.) */ + page = grab_cache_page(mapping, idx); + if (unlikely(!page)) { + ntfs_error(vol->sb, "Insufficient memory to grab " + "page (index 0x%lx).", idx); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, val, PAGE_CACHE_SIZE); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + /* + * If the page has buffers, mark them uptodate since buffer + * state and not page state is definitive in 2.6 kernels. + */ + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + set_buffer_uptodate(bh); + } while ((bh = bh->b_this_page) != head); + } + /* Now that buffers are uptodate, set the page uptodate, too. */ + SetPageUptodate(page); + /* + * Set the page and all its buffers dirty and mark the inode + * dirty, too. The VM will write the page later on. + */ + set_page_dirty(page); + /* Finally unlock and release the page. */ + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + /* If there is a last partial page, need to do it the slow way. 
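The body of ntfs_attr_set() splits the byte range into a partial first page, a run of whole pages, and a partial last page using the page-cache shift and mask. A small user-space model of that split, assuming 4096-byte pages (the numbers are made up for illustration):

#include <assert.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT	12
#define MODEL_PAGE_SIZE		(1UL << MODEL_PAGE_SHIFT)
#define MODEL_PAGE_MASK		(~(MODEL_PAGE_SIZE - 1))

int main(void)
{
	int64_t ofs = 5000, cnt = 10000;
	int64_t end = ofs + cnt;

	unsigned long idx = ofs >> MODEL_PAGE_SHIFT;	  /* first page index */
	unsigned start_ofs = ofs & ~MODEL_PAGE_MASK;	  /* offset into first page */
	unsigned end_ofs = end & ~MODEL_PAGE_MASK;	  /* bytes used in last page */
	unsigned long end_idx = end >> MODEL_PAGE_SHIFT;  /* last page index */

	assert(idx == 1 && start_ofs == 904);
	assert(end_idx == 3 && end_ofs == 2712);
	/* Pages idx + 1 .. end_idx - 1 (here just page 2) are filled whole. */
	return 0;
}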
*/ + if (end_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read last partial page " + "(error, index 0x%lx).", idx); + return PTR_ERR(page); + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, val, end_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } +done: + ntfs_debug("Done."); + return 0; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h new file mode 100644 index 00000000..3c8b74c9 --- /dev/null +++ b/fs/ntfs/attrib.h @@ -0,0 +1,116 @@ +/* + * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_ATTRIB_H +#define _LINUX_NTFS_ATTRIB_H + +#include "endian.h" +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +/** + * ntfs_attr_search_ctx - used in attribute search functions + * @mrec: buffer containing mft record to search + * @attr: attribute record in @mrec where to begin/continue search + * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after + * + * Structure must be initialized to zero before the first call to one of the + * attribute search functions. Initialize @mrec to point to the mft record to + * search, and @attr to point to the first attribute within @mrec (not necessary + * if calling the _first() functions), and set @is_first to 'true' (not necessary + * if calling the _first() functions). + * + * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', + * the search begins after @attr. This is so that, after the first call to one + * of the search attribute functions, we can call the function again, without + * any modification of the search context, to automagically get the next + * matching attribute. 
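The "call it again to get the next match" convention translates into a simple enumeration loop. A hedged sketch of that pattern, using the declarations below and assuming (as the error handling elsewhere in this patch suggests) that -ENOENT signals the end of the matches; mapping and unmapping of the mft record are left out for brevity, and passing a NULL name is taken here to mean "match on type only":

/* Sketch: visit every attribute of type @type described by @m / @ni. */
static int example_enumerate_attrs(ntfs_inode *ni, MFT_RECORD *m, ATTR_TYPE type)
{
	ntfs_attr_search_ctx *ctx;
	int err;

	ctx = ntfs_attr_get_search_ctx(ni, m);
	if (!ctx)
		return -ENOMEM;
	while (!(err = ntfs_attr_lookup(type, NULL, 0, CASE_SENSITIVE, 0,
			NULL, 0, ctx))) {
		ATTR_RECORD *a = ctx->attr;

		ntfs_debug("Found attribute, length 0x%x.",
				(unsigned)le32_to_cpu(a->length));
	}
	if (err == -ENOENT)
		err = 0;	/* ran off the end of the matching attributes */
	ntfs_attr_put_search_ctx(ctx);
	return err;
}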
+ */ +typedef struct { + MFT_RECORD *mrec; + ATTR_RECORD *attr; + bool is_first; + ntfs_inode *ntfs_ino; + ATTR_LIST_ENTRY *al_entry; + ntfs_inode *base_ntfs_ino; + MFT_RECORD *base_mrec; + ATTR_RECORD *base_attr; +} ntfs_attr_search_ctx; + +extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, + ntfs_attr_search_ctx *ctx); +extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); + +extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked); + +extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, + const VCN vcn, ntfs_attr_search_ctx *ctx); + +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx); + +extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, + const s64 size, const s64 initialized_size); + +static inline s64 ntfs_attr_size(const ATTR_RECORD *a) +{ + if (!a->non_resident) + return (s64)le32_to_cpu(a->data.resident.value_length); + return sle64_to_cpu(a->data.non_resident.data_size); +} + +extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); +extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, + MFT_RECORD *mrec); +extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); + +#ifdef NTFS_RW + +extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, + const ATTR_TYPE type, const s64 size); +extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, + const ATTR_TYPE type); +extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, + const ATTR_TYPE type); + +extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); +extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size); + +extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); + +extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start); + +extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, + const u8 val); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c new file mode 100644 index 00000000..0809cf87 --- /dev/null +++ b/fs/ntfs/bitmap.c @@ -0,0 +1,193 @@ +/* + * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include + +#include "bitmap.h" +#include "debug.h" +#include "aops.h" +#include "ntfs.h" + +/** + * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * @is_rollback: if 'true' this is a rollback operation + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback) +{ + s64 cnt = count; + pgoff_t index, end_index; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + int pos, len; + u8 bit; + + BUG_ON(!vi); + ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " + "value %u.%s", vi->i_ino, (unsigned long long)start_bit, + (unsigned long long)cnt, (unsigned int)value, + is_rollback ? " (rollback)" : ""); + BUG_ON(start_bit < 0); + BUG_ON(cnt < 0); + BUG_ON(value > 1); + /* + * Calculate the indices for the pages containing the first and last + * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. + */ + index = start_bit >> (3 + PAGE_CACHE_SHIFT); + end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT); + + /* Get the page containing the first bit (@start_bit). */ + mapping = vi->i_mapping; + page = ntfs_map_page(mapping, index); + if (IS_ERR(page)) { + if (!is_rollback) + ntfs_error(vi->i_sb, "Failed to map first page (error " + "%li), aborting.", PTR_ERR(page)); + return PTR_ERR(page); + } + kaddr = page_address(page); + + /* Set @pos to the position of the byte containing @start_bit. */ + pos = (start_bit >> 3) & ~PAGE_CACHE_MASK; + + /* Calculate the position of @start_bit in the first byte. */ + bit = start_bit & 7; + + /* If the first byte is partial, modify the appropriate bits in it. */ + if (bit) { + u8 *byte = kaddr + pos; + while ((bit & 7) && cnt) { + cnt--; + if (value) + *byte |= 1 << bit++; + else + *byte &= ~(1 << bit++); + } + /* If we are done, unmap the page and return success. */ + if (!cnt) + goto done; + + /* Update @pos to the new position. */ + pos++; + } + /* + * Depending on @value, modify all remaining whole bytes in the page up + * to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos); + memset(kaddr + pos, value ? 0xff : 0, len); + cnt -= len << 3; + + /* Update @len to point to the first not-done byte in the page. */ + if (cnt < 8) + len += pos; + + /* If we are not in the last page, deal with all subsequent pages. */ + while (index < end_index) { + BUG_ON(cnt <= 0); + + /* Update @index and get the next page. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, ++index); + if (IS_ERR(page)) + goto rollback; + kaddr = page_address(page); + /* + * Depending on @value, modify all remaining whole bytes in the + * page up to @cnt. 
+ */ + len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE); + memset(kaddr, value ? 0xff : 0, len); + cnt -= len << 3; + } + /* + * The currently mapped page is the last one. If the last byte is + * partial, modify the appropriate bits in it. Note, @len is the + * position of the last byte inside the page. + */ + if (cnt) { + u8 *byte; + + BUG_ON(cnt > 7); + + bit = cnt; + byte = kaddr + len; + while (bit--) { + if (value) + *byte |= 1 << bit; + else + *byte &= ~(1 << bit); + } + } +done: + /* We are done. Unmap the page and return success. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +rollback: + /* + * Current state: + * - no pages are mapped + * - @count - @cnt is the number of bits that have been modified + */ + if (is_rollback) + return PTR_ERR(page); + if (count != cnt) + pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, + value ? 0 : 1, true); + else + pos = 0; + if (!pos) { + /* Rollback was successful. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li), aborting.", PTR_ERR(page)); + } else { + /* Rollback failed. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li) and rollback failed (error %i). " + "Aborting and leaving inconsistent metadata. " + "Unmount and run chkdsk.", PTR_ERR(page), pos); + NVolSetErrors(NTFS_SB(vi->i_sb)); + } + return PTR_ERR(page); +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h new file mode 100644 index 00000000..72c9ad8b --- /dev/null +++ b/fs/ntfs/bitmap.h @@ -0,0 +1,118 @@ +/* + * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_BITMAP_H +#define _LINUX_NTFS_BITMAP_H + +#ifdef NTFS_RW + +#include + +#include "types.h" + +extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback); + +/** + * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * Return 0 on success and -errno on error. 
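Locating the run comes down to splitting the bit number three ways, exactly as at the top of __ntfs_bitmap_set_bits_in_run(): a page index (each page holds 8 * PAGE_CACHE_SIZE bits), a byte offset within that page, and a bit offset within that byte. A stand-alone model assuming 4096-byte pages (the start bit is an arbitrary example):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const unsigned page_shift = 12;		/* 4096-byte pages */
	int64_t start_bit = 40005;

	int64_t index = start_bit >> (3 + page_shift);	/* page holding the bit */
	unsigned pos = (start_bit >> 3) & ((1u << page_shift) - 1); /* byte in page */
	unsigned bit = start_bit & 7;			/* bit within that byte */

	/* 40005 = 1 * 32768 + 904 * 8 + 5 */
	assert(index == 1 && pos == 904 && bit == 5);
	return 0;
}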
+ */ +static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, + const s64 start_bit, const s64 count, const u8 value) +{ + return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, + false); +} + +/** + * ntfs_bitmap_set_run - set a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); +} + +/** + * ntfs_bitmap_clear_run - clear a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to clear + * @count: number of bits to clear + * + * Clear @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); +} + +/** + * ntfs_bitmap_set_bit - set a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to set + * + * Set bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_set_run(vi, bit, 1); +} + +/** + * ntfs_bitmap_clear_bit - clear a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to clear + * + * Clear bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_clear_run(vi, bit, 1); +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_BITMAP_H */ diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c new file mode 100644 index 00000000..4a28ab38 --- /dev/null +++ b/fs/ntfs/collate.c @@ -0,0 +1,124 @@ +/* + * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "collate.h" +#include "debug.h" +#include "ntfs.h" + +static int ntfs_collate_binary(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + + ntfs_debug("Entering."); + rc = memcmp(data1, data2, min(data1_len, data2_len)); + if (!rc && (data1_len != data2_len)) { + if (data1_len < data2_len) + rc = -1; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + u32 d1, d2; + + ntfs_debug("Entering."); + // FIXME: We don't really want to bug here. + BUG_ON(data1_len != data2_len); + BUG_ON(data1_len != 4); + d1 = le32_to_cpup(data1); + d2 = le32_to_cpup(data2); + if (d1 < d2) + rc = -1; + else { + if (d1 == d2) + rc = 0; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int, + const void *, const int); + +static ntfs_collate_func_t ntfs_do_collate0x0[3] = { + ntfs_collate_binary, + NULL/*ntfs_collate_file_name*/, + NULL/*ntfs_collate_unicode_string*/, +}; + +static ntfs_collate_func_t ntfs_do_collate0x1[4] = { + ntfs_collate_ntofs_ulong, + NULL/*ntfs_collate_ntofs_sid*/, + NULL/*ntfs_collate_ntofs_security_hash*/, + NULL/*ntfs_collate_ntofs_ulongs*/, +}; + +/** + * ntfs_collate - collate two data items using a specified collation rule + * @vol: ntfs volume to which the data items belong + * @cr: collation rule to use when comparing the items + * @data1: first data item to collate + * @data1_len: length in bytes of @data1 + * @data2: second data item to collate + * @data2_len: length in bytes of @data2 + * + * Collate the two data items @data1 and @data2 using the collation rule @cr + * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before, + * to match, or to collate after @data2. + * + * For speed we use the collation rule @cr as an index into two tables of + * function pointers to call the appropriate collation function. + */ +int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len) { + int i; + + ntfs_debug("Entering."); + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now. + */ + BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); + i = le32_to_cpu(cr); + BUG_ON(i < 0); + if (i <= 0x02) + return ntfs_do_collate0x0[i](vol, data1, data1_len, + data2, data2_len); + BUG_ON(i < 0x10); + i -= 0x10; + if (likely(i <= 3)) + return ntfs_do_collate0x1[i](vol, data1, data1_len, + data2, data2_len); + BUG(); + return 0; +} diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h new file mode 100644 index 00000000..aba83347 --- /dev/null +++ b/fs/ntfs/collate.h @@ -0,0 +1,50 @@ +/* + * collate.h - Defines for NTFS kernel collation handling. Part of the + * Linux-NTFS project. 
+ * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_COLLATE_H +#define _LINUX_NTFS_COLLATE_H + +#include "types.h" +#include "volume.h" + +static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { + int i; + + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we return false for everything else for + * now. + */ + if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) + return false; + i = le32_to_cpu(cr); + if (likely(((i >= 0) && (i <= 0x02)) || + ((i >= 0x10) && (i <= 0x13)))) + return true; + return false; +} + +extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len); + +#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c new file mode 100644 index 00000000..ee4144ce --- /dev/null +++ b/fs/ntfs/compress.c @@ -0,0 +1,969 @@ +/** + * compress.c - NTFS kernel compressed attributes handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "inode.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_compression_constants - enum of constants used in the compression code + */ +typedef enum { + /* Token types and access mask. */ + NTFS_SYMBOL_TOKEN = 0, + NTFS_PHRASE_TOKEN = 1, + NTFS_TOKEN_MASK = 1, + + /* Compression sub-block constants. */ + NTFS_SB_SIZE_MASK = 0x0fff, + NTFS_SB_SIZE = 0x1000, + NTFS_SB_IS_COMPRESSED = 0x8000, + + /* + * The maximum compression block size is by definition 16 * the cluster + * size, with the maximum supported cluster size being 4kiB. 
Thus the + * maximum compression buffer size is 64kiB, so we use this when + * initializing the compression buffer. + */ + NTFS_MAX_CB_SIZE = 64 * 1024, +} ntfs_compression_constants; + +/** + * ntfs_compression_buffer - one buffer for the decompression engine + */ +static u8 *ntfs_compression_buffer = NULL; + +/** + * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer + */ +static DEFINE_SPINLOCK(ntfs_cb_lock); + +/** + * allocate_compression_buffers - allocate the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + * + * Return 0 on success or -ENOMEM if the allocations failed. + */ +int allocate_compression_buffers(void) +{ + BUG_ON(ntfs_compression_buffer); + + ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); + if (!ntfs_compression_buffer) + return -ENOMEM; + return 0; +} + +/** + * free_compression_buffers - free the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + */ +void free_compression_buffers(void) +{ + BUG_ON(!ntfs_compression_buffer); + vfree(ntfs_compression_buffer); + ntfs_compression_buffer = NULL; +} + +/** + * zero_partial_compressed_page - zero out of bounds compressed page region + */ +static void zero_partial_compressed_page(struct page *page, + const s64 initialized_size) +{ + u8 *kp = page_address(page); + unsigned int kp_ofs; + + ntfs_debug("Zeroing page region outside initialized size."); + if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) { + /* + * FIXME: Using clear_page() will become wrong when we get + * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem. + */ + clear_page(kp); + return; + } + kp_ofs = initialized_size & ~PAGE_CACHE_MASK; + memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs); + return; +} + +/** + * handle_bounds_compressed_page - test for&handle out of bounds compressed page + */ +static inline void handle_bounds_compressed_page(struct page *page, + const loff_t i_size, const s64 initialized_size) +{ + if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) && + (initialized_size < i_size)) + zero_partial_compressed_page(page, initialized_size); + return; +} + +/** + * ntfs_decompress - decompress a compression block into an array of pages + * @dest_pages: destination array of pages + * @dest_index: current index into @dest_pages (IN/OUT) + * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) + * @dest_max_index: maximum index into @dest_pages (IN) + * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN) + * @xpage: the target page (-1 if none) (IN) + * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) + * @cb_start: compression block to decompress (IN) + * @cb_size: size of compression block @cb_start in bytes (IN) + * @i_size: file size when we started the read (IN) + * @initialized_size: initialized file size when we started the read (IN) + * + * The caller must have disabled preemption. ntfs_decompress() reenables it when + * the critical section is finished. + * + * This decompresses the compression block @cb_start into the array of + * destination pages @dest_pages starting at index @dest_index into @dest_pages + * and at offset @dest_pos into the page @dest_pages[@dest_index]. + * + * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1. + * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified. + * + * @cb_start is a pointer to the compression block which needs decompressing + * and @cb_size is the size of @cb_start in bytes (8-64kiB). 
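Each sub-block begins with a little-endian 16-bit header: the low 12 bits (NTFS_SB_SIZE_MASK) hold the stored payload size minus one, and the NTFS_SB_IS_COMPRESSED bit marks a compressed sub-block, so the whole sub-block including the header spans (header & NTFS_SB_SIZE_MASK) + 3 bytes, as the range setup further down computes. A stand-alone sketch of that decoding (the header values are invented examples):

#include <assert.h>
#include <stdint.h>

#define NTFS_SB_SIZE_MASK	0x0fff
#define NTFS_SB_SIZE		0x1000
#define NTFS_SB_IS_COMPRESSED	0x8000

int main(void)
{
	/* Compressed sub-block whose stored payload is 1000 bytes. */
	uint16_t hdr = NTFS_SB_IS_COMPRESSED | (1000 - 1);

	unsigned sb_total = (hdr & NTFS_SB_SIZE_MASK) + 3; /* header + payload */
	assert(sb_total == 1002);
	assert(hdr & NTFS_SB_IS_COMPRESSED);

	/* An uncompressed sub-block always carries a full NTFS_SB_SIZE payload. */
	uint16_t raw_hdr = NTFS_SB_SIZE - 1;		   /* 0x0fff, flag clear */
	assert((raw_hdr & NTFS_SB_SIZE_MASK) + 3 - 2 == NTFS_SB_SIZE);
	return 0;
}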
+ * + * Return 0 if success or -EOVERFLOW on error in the compressed stream. + * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was + * completed during the decompression of the compression block (@cb_start). + * + * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up + * unpredicatbly! You have been warned! + * + * Note to hackers: This function may not sleep until it has finished accessing + * the compression block @cb_start as it is a per-CPU buffer. + */ +static int ntfs_decompress(struct page *dest_pages[], int *dest_index, + int *dest_ofs, const int dest_max_index, const int dest_max_ofs, + const int xpage, char *xpage_done, u8 *const cb_start, + const u32 cb_size, const loff_t i_size, + const s64 initialized_size) +{ + /* + * Pointers into the compressed data, i.e. the compression block (cb), + * and the therein contained sub-blocks (sb). + */ + u8 *cb_end = cb_start + cb_size; /* End of cb. */ + u8 *cb = cb_start; /* Current position in cb. */ + u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ + + /* Variables for uncompressed data / destination. */ + struct page *dp; /* Current destination page being worked on. */ + u8 *dp_addr; /* Current pointer into dp. */ + u8 *dp_sb_start; /* Start of current sub-block in dp. */ + u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + + NTFS_SB_SIZE). */ + u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ + u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + + NTFS_SB_SIZE). */ + + /* Variables for tag and token parsing. */ + u8 tag; /* Current tag. */ + int token; /* Loop counter for the eight tokens in tag. */ + + /* Need this because we can't sleep, so need two stages. */ + int completed_pages[dest_max_index - *dest_index + 1]; + int nr_completed_pages = 0; + + /* Default error code. */ + int err = -EOVERFLOW; + + ntfs_debug("Entering, cb_size = 0x%x.", cb_size); +do_next_sb: + ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", + cb - cb_start); + /* + * Have we reached the end of the compression block or the end of the + * decompressed data? The latter can happen for example if the current + * position in the compression block is one byte before its end so the + * first two checks do not detect it. + */ + if (cb == cb_end || !le16_to_cpup((le16*)cb) || + (*dest_index == dest_max_index && + *dest_ofs == dest_max_ofs)) { + int i; + + ntfs_debug("Completed. Returning success (0)."); + err = 0; +return_error: + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize completed pages. */ + if (nr_completed_pages > 0) { + for (i = 0; i < nr_completed_pages; i++) { + int di = completed_pages[i]; + + dp = dest_pages[di]; + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(dp, i_size, + initialized_size); + flush_dcache_page(dp); + kunmap(dp); + SetPageUptodate(dp); + unlock_page(dp); + if (di == xpage) + *xpage_done = 1; + else + page_cache_release(dp); + dest_pages[di] = NULL; + } + } + return err; + } + + /* Setup offsets for the current sub-block destination. */ + do_sb_start = *dest_ofs; + do_sb_end = do_sb_start + NTFS_SB_SIZE; + + /* Check that we are still within allowed boundaries. */ + if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) + goto return_overflow; + + /* Does the minimum size of a compressed sb overflow valid range? 
*/ + if (cb + 6 > cb_end) + goto return_overflow; + + /* Setup the current sub-block source pointers and validate range. */ + cb_sb_start = cb; + cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + + 3; + if (cb_sb_end > cb_end) + goto return_overflow; + + /* Get the current destination page. */ + dp = dest_pages[*dest_index]; + if (!dp) { + /* No page present. Skip decompression of this sub-block. */ + cb = cb_sb_end; + + /* Advance destination position to next sub-block. */ + *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK; + if (!*dest_ofs && (++*dest_index > dest_max_index)) + goto return_overflow; + goto do_next_sb; + } + + /* We have a valid destination page. Setup the destination pointers. */ + dp_addr = (u8*)page_address(dp) + do_sb_start; + + /* Now, we are ready to process the current sub-block (sb). */ + if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { + ntfs_debug("Found uncompressed sub-block."); + /* This sb is not compressed, just copy it into destination. */ + + /* Advance source position to first data byte. */ + cb += 2; + + /* An uncompressed sb must be full size. */ + if (cb_sb_end - cb != NTFS_SB_SIZE) + goto return_overflow; + + /* Copy the block and advance the source position. */ + memcpy(dp_addr, cb, NTFS_SB_SIZE); + cb += NTFS_SB_SIZE; + + /* Advance destination position to next sub-block. */ + *dest_ofs += NTFS_SB_SIZE; + if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) { +finalize_page: + /* + * First stage: add current page index to array of + * completed pages. + */ + completed_pages[nr_completed_pages++] = *dest_index; + if (++*dest_index > dest_max_index) + goto return_overflow; + } + goto do_next_sb; + } + ntfs_debug("Found compressed sub-block."); + /* This sb is compressed, decompress it into destination. */ + + /* Setup destination pointers. */ + dp_sb_start = dp_addr; + dp_sb_end = dp_sb_start + NTFS_SB_SIZE; + + /* Forward to the first tag in the sub-block. */ + cb += 2; +do_next_tag: + if (cb == cb_sb_end) { + /* Check if the decompressed sub-block was not full-length. */ + if (dp_addr < dp_sb_end) { + int nr_bytes = do_sb_end - *dest_ofs; + + ntfs_debug("Filling incomplete sub-block with " + "zeroes."); + /* Zero remainder and update destination position. */ + memset(dp_addr, 0, nr_bytes); + *dest_ofs += nr_bytes; + } + /* We have finished the current sub-block. */ + if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) + goto finalize_page; + goto do_next_sb; + } + + /* Check we are still in range. */ + if (cb > cb_sb_end || dp_addr > dp_sb_end) + goto return_overflow; + + /* Get the next tag and advance to first token. */ + tag = *cb++; + + /* Parse the eight tokens described by the tag. */ + for (token = 0; token < 8; token++, tag >>= 1) { + u16 lg, pt, length, max_non_overlap; + register u16 i; + u8 *dp_back_addr; + + /* Check if we are done / still in range. */ + if (cb >= cb_sb_end || dp_addr > dp_sb_end) + break; + + /* Determine token type and parse appropriately.*/ + if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { + /* + * We have a symbol token, copy the symbol across, and + * advance the source and destination positions. + */ + *dp_addr++ = *cb++; + ++*dest_ofs; + + /* Continue with the next token. */ + continue; + } + + /* + * We have a phrase token. Make sure it is not the first tag in + * the sb as this is illegal and would confuse the code below. + */ + if (dp_addr == dp_sb_start) + goto return_overflow; + + /* + * Determine the number of bytes to go back (p) and the number + * of bytes to copy (l). 
We use an optimized algorithm in which + * we first calculate log2(current destination position in sb), + * which allows determination of l and p in O(1) rather than + * O(n). We just need an arch-optimized log2() function now. + */ + lg = 0; + for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) + lg++; + + /* Get the phrase token into i. */ + pt = le16_to_cpup((le16*)cb); + + /* + * Calculate starting position of the byte sequence in + * the destination using the fact that p = (pt >> (12 - lg)) + 1 + * and make sure we don't go too far back. + */ + dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; + if (dp_back_addr < dp_sb_start) + goto return_overflow; + + /* Now calculate the length of the byte sequence. */ + length = (pt & (0xfff >> lg)) + 3; + + /* Advance destination position and verify it is in range. */ + *dest_ofs += length; + if (*dest_ofs > do_sb_end) + goto return_overflow; + + /* The number of non-overlapping bytes. */ + max_non_overlap = dp_addr - dp_back_addr; + + if (length <= max_non_overlap) { + /* The byte sequence doesn't overlap, just copy it. */ + memcpy(dp_addr, dp_back_addr, length); + + /* Advance destination pointer. */ + dp_addr += length; + } else { + /* + * The byte sequence does overlap, copy non-overlapping + * part and then do a slow byte by byte copy for the + * overlapping part. Also, advance the destination + * pointer. + */ + memcpy(dp_addr, dp_back_addr, max_non_overlap); + dp_addr += max_non_overlap; + dp_back_addr += max_non_overlap; + length -= max_non_overlap; + while (length--) + *dp_addr++ = *dp_back_addr++; + } + + /* Advance source position and continue with the next token. */ + cb += 2; + } + + /* No tokens left in the current tag. Continue with the next tag. */ + goto do_next_tag; + +return_overflow: + ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); + goto return_error; +} + +/** + * ntfs_read_compressed_block - read a compressed block into the page cache + * @page: locked page in the compression block(s) we need to read + * + * When we are called the page has already been verified to be locked and the + * attribute is known to be non-resident, not encrypted, but compressed. + * + * 1. Determine which compression block(s) @page is in. + * 2. Get hold of all pages corresponding to this/these compression block(s). + * 3. Read the (first) compression block. + * 4. Decompress it into the corresponding pages. + * 5. Throw the compressed data away and proceed to 3. for the next compression + * block or return success if no more compression blocks left. + * + * Warning: We have to be careful what we do about existing pages. They might + * have been written to so that we would lose data if we were to just overwrite + * them with the out-of-date uncompressed data. + * + * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at + * the end of the file I think. We need to detect this case and zero the out + * of bounds remainder of the page in question and mark it as handled. At the + * moment we would just return -EIO on such a page. This bug will only become + * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte + * clusters so is probably not going to be seen by anyone. Still this should + * be fixed. (AIA) + * + * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in + * handling sparse and compressed cbs. (AIA) + * + * FIXME: At the moment we don't do any zeroing out in the case that + * initialized_size is less than data_size. 
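The phrase-token arithmetic in ntfs_decompress() above packs both the back distance and the copy length into a single 16-bit token, and the bit split between the two fields depends on how many bytes of the current sub-block have already been produced. A stand-alone sketch of just that calculation; the token value 0x2004 and the 256-byte position are made up for illustration, and the token is assumed to be already in CPU byte order.

#include <stdint.h>
#include <stdio.h>

/*
 * Split phrase token @pt into (distance back, length), given that @pos bytes
 * of the current sub-block have already been decompressed.  @pos must be at
 * least 1, since a phrase token can never be the first token of a sub-block.
 */
static void decode_phrase_token(unsigned int pos, uint16_t pt,
		unsigned int *back, unsigned int *len)
{
	unsigned int lg = 0, i;

	for (i = pos - 1; i >= 0x10; i >>= 1)
		lg++;
	*back = (pt >> (12 - lg)) + 1;		/* bytes to go back */
	*len = (pt & (0xfff >> lg)) + 3;	/* bytes to copy */
}

int main(void)
{
	unsigned int back, len;

	decode_phrase_token(256, 0x2004, &back, &len);
	printf("go back %u bytes, copy %u bytes\n", back, len);	/* 33 and 7 */
	return 0;
}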
This should be safe because of the + * nature of the compression algorithm used. Just in case we check and output + * an error message in read inode if the two sizes are not equal for a + * compressed file. (AIA) + */ +int ntfs_read_compressed_block(struct page *page) +{ + loff_t i_size; + s64 initialized_size; + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags, block_size = sb->s_blocksize; + unsigned char block_size_bits = sb->s_blocksize_bits; + u8 *cb, *cb_pos, *cb_end; + struct buffer_head **bhs; + unsigned long offset, index = page->index; + u32 cb_size = ni->itype.compressed.block_size; + u64 cb_size_mask = cb_size - 1UL; + VCN vcn; + LCN lcn; + /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */ + VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >> + vol->cluster_size_bits; + /* + * The first vcn after the last wanted vcn (minimum alignment is again + * PAGE_CACHE_SIZE. + */ + VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1) + & ~cb_size_mask) >> vol->cluster_size_bits; + /* Number of compression blocks (cbs) in the wanted vcn range. */ + unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits + >> ni->itype.compressed.block_size_bits; + /* + * Number of pages required to store the uncompressed data from all + * compression blocks (cbs) overlapping @page. Due to alignment + * guarantees of start_vcn and end_vcn, no need to round up here. + */ + unsigned int nr_pages = (end_vcn - start_vcn) << + vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + unsigned int xpage, max_page, cur_page, cur_ofs, i; + unsigned int cb_clusters, cb_max_ofs; + int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; + struct page **pages; + unsigned char xpage_done = 0; + + ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " + "%i.", index, cb_size, nr_pages); + /* + * Bad things happen if we get here for anything that is not an + * unnamed $DATA attribute. + */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS); + + /* Allocate memory to store the buffer heads we need. */ + bhs_size = cb_size / block_size * sizeof(struct buffer_head *); + bhs = kmalloc(bhs_size, GFP_NOFS); + + if (unlikely(!pages || !bhs)) { + kfree(bhs); + kfree(pages); + unlock_page(page); + ntfs_error(vol->sb, "Failed to allocate internal buffers."); + return -ENOMEM; + } + + /* + * We have already been given one page, this is the one we must do. + * Once again, the alignment guarantees keep it simple. + */ + offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + xpage = index - offset; + pages[xpage] = page; + /* + * The remaining pages need to be allocated and inserted into the page + * cache, alignment guarantees keep all the below much simpler. (-8 + */ + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(VFS_I(ni)); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + offset; + /* Is the page fully outside i_size? 
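Before any I/O is issued above, the requested page index is converted into the VCN range of the compression blocks that overlap it, and from that range the number of destination pages and the position of the target page. A quick sketch of the same arithmetic with invented geometry (512-byte clusters, 16-cluster / 8 KiB compression blocks, 4 KiB pages, a read of page index 5); none of these values come from a real volume.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12, cluster_bits = 9;	/* 4 KiB pages, 512-byte clusters */
	unsigned long cb_size = 8192, cb_mask = cb_size - 1;	/* 8 KiB compression blocks */
	unsigned long index = 5;				/* page being read */

	long long start_vcn = (((long long)index << page_shift) &
			~(long long)cb_mask) >> cluster_bits;
	long long end_vcn = ((((long long)(index + 1) << page_shift) + cb_size - 1) &
			~(long long)cb_mask) >> cluster_bits;
	unsigned long nr_pages = (unsigned long)((end_vcn - start_vcn) <<
			cluster_bits >> page_shift);
	unsigned long xpage = index - (unsigned long)(start_vcn <<
			cluster_bits >> page_shift);

	/* Prints: vcn 32..48, 2 pages, xpage 1 */
	printf("vcn %lld..%lld, %lu pages, xpage %lu\n",
			start_vcn, end_vcn, nr_pages, xpage);
	return 0;
}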
(truncate in progress) */ + if (xpage >= max_page) { + kfree(bhs); + kfree(pages); + zero_user(page, 0, PAGE_CACHE_SIZE); + ntfs_debug("Compressed read outside i_size - truncated?"); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + if (nr_pages < max_page) + max_page = nr_pages; + for (i = 0; i < max_page; i++, offset++) { + if (i != xpage) + pages[i] = grab_cache_page_nowait(mapping, offset); + page = pages[i]; + if (page) { + /* + * We only (re)read the page if it isn't already read + * in and/or dirty or we would be losing data or at + * least wasting our time. + */ + if (!PageDirty(page) && (!PageUptodate(page) || + PageError(page))) { + ClearPageError(page); + kmap(page); + continue; + } + unlock_page(page); + page_cache_release(page); + pages[i] = NULL; + } + } + + /* + * We have the runlist, and all the destination pages we need to fill. + * Now read the first compression block. + */ + cur_page = 0; + cur_ofs = 0; + cb_clusters = ni->itype.compressed.block_clusters; +do_next_cb: + nr_cbs--; + nr_bhs = 0; + + /* Read all cb buffer heads one cluster at a time. */ + rl = NULL; + for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; + vcn++) { + bool is_retry = false; + + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)vcn, + (unsigned long long)lcn); + if (lcn < 0) { + /* + * When we reach the first sparse cluster we have + * finished with the cb. + */ + if (lcn == LCN_HOLE) + break; + if (is_retry || lcn != LCN_RL_NOT_MAPPED) + goto rl_err; + is_retry = true; + /* + * Attempt to map runlist, dropping lock for the + * duration. + */ + up_read(&ni->runlist.lock); + if (!ntfs_map_runlist(ni, vcn)) + goto lock_retry_remap; + goto map_rl_err; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the lcn from device in chunks of block_size bytes. */ + max_block = block + (vol->cluster_size >> block_size_bits); + do { + ntfs_debug("block = 0x%x.", block); + if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) + goto getblk_err; + nr_bhs++; + } while (++block < max_block); + } + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Setup and initiate io on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (!trylock_buffer(tbh)) + continue; + if (unlikely(buffer_uptodate(tbh))) { + unlock_buffer(tbh); + continue; + } + get_bh(tbh); + tbh->b_end_io = end_buffer_read_sync; + submit_bh(READ, tbh); + } + + /* Wait for io completion on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (buffer_uptodate(tbh)) + continue; + wait_on_buffer(tbh); + /* + * We need an optimization barrier here, otherwise we start + * hitting the below fixup code when accessing a loopback + * mounted ntfs partition. This indicates either there is a + * race condition in the loop driver or, more likely, gcc + * overoptimises the code without the barrier and it doesn't + * do the Right Thing(TM). + */ + barrier(); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_warning(vol->sb, "Buffer is unlocked but not " + "uptodate! 
Unplugging the disk queue " + "and rescheduling."); + get_bh(tbh); + io_schedule(); + put_bh(tbh); + if (unlikely(!buffer_uptodate(tbh))) + goto read_err; + ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); + } + } + + /* + * Get the compression buffer. We must not sleep any more + * until we are finished with it. + */ + spin_lock(&ntfs_cb_lock); + cb = ntfs_compression_buffer; + + BUG_ON(!cb); + + cb_pos = cb; + cb_end = cb + cb_size; + + /* Copy the buffer heads into the contiguous buffer. */ + for (i = 0; i < nr_bhs; i++) { + memcpy(cb_pos, bhs[i]->b_data, block_size); + cb_pos += block_size; + } + + /* Just a precaution. */ + if (cb_pos + 2 <= cb + cb_size) + *(u16*)cb_pos = 0; + + /* Reset cb_pos back to the beginning. */ + cb_pos = cb; + + /* We now have both source (if present) and destination. */ + ntfs_debug("Successfully read the compression block."); + + /* The last page and maximum offset within it for the current cb. */ + cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size; + cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK; + cb_max_page >>= PAGE_CACHE_SHIFT; + + /* Catch end of file inside a compression block. */ + if (cb_max_page > max_page) + cb_max_page = max_page; + + if (vcn == start_vcn - cb_clusters) { + /* Sparse cb, zero out page range overlapping the cb. */ + ntfs_debug("Found sparse compression block."); + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + if (cb_max_ofs) + cb_max_page--; + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + /* + * FIXME: Using clear_page() will become wrong + * when we get PAGE_CACHE_SIZE != PAGE_SIZE but + * for now there is no problem. + */ + if (likely(!cur_ofs)) + clear_page(page_address(page)); + else + memset(page_address(page) + cur_ofs, 0, + PAGE_CACHE_SIZE - + cur_ofs); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur_page == xpage) + xpage_done = 1; + else + page_cache_release(page); + pages[cur_page] = NULL; + } + cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memset(page_address(page) + cur_ofs, 0, + cb_max_ofs - cur_ofs); + /* + * No need to update cb_pos at this stage: + * cb_pos += cb_max_ofs - cur_ofs; + */ + cur_ofs = cb_max_ofs; + } + } else if (vcn == start_vcn) { + /* We can't sleep so we need two stages. */ + unsigned int cur2_page = cur_page; + unsigned int cur_ofs2 = cur_ofs; + u8 *cb_pos2 = cb_pos; + + ntfs_debug("Found uncompressed compression block."); + /* Uncompressed cb, copy it to the destination pages. */ + /* + * TODO: As a big optimization, we could detect this case + * before we read all the pages and use block_read_full_page() + * on all full pages instead (we still have to treat partial + * pages especially but at least we are getting rid of the + * synchronous io for the majority of pages. + * Or if we choose not to do the read-ahead/-behind stuff, we + * could just return block_read_full_page(pages[xpage]) as long + * as PAGE_CACHE_SIZE <= cb_size. + */ + if (cb_max_ofs) + cb_max_page--; + /* First stage: copy data into destination pages. 
*/ + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + PAGE_CACHE_SIZE - cur_ofs); + cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + cb_max_ofs - cur_ofs); + cb_pos += cb_max_ofs - cur_ofs; + cur_ofs = cb_max_ofs; + } + /* We can sleep from now on, so drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize pages. */ + for (; cur2_page < cb_max_page; cur2_page++) { + page = pages[cur2_page]; + if (page) { + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(page, i_size, + initialized_size); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur2_page == xpage) + xpage_done = 1; + else + page_cache_release(page); + pages[cur2_page] = NULL; + } + cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2; + cur_ofs2 = 0; + if (cb_pos2 >= cb_end) + break; + } + } else { + /* Compressed cb, decompress it into the destination page(s). */ + unsigned int prev_cur_page = cur_page; + + ntfs_debug("Found compressed compression block."); + err = ntfs_decompress(pages, &cur_page, &cur_ofs, + cb_max_page, cb_max_ofs, xpage, &xpage_done, + cb_pos, cb_size - (cb_pos - cb), i_size, + initialized_size); + /* + * We can sleep from now on, lock already dropped by + * ntfs_decompress(). + */ + if (err) { + ntfs_error(vol->sb, "ntfs_decompress() failed in inode " + "0x%lx with error code %i. Skipping " + "this compression block.", + ni->mft_no, -err); + /* Release the unfinished pages. */ + for (; prev_cur_page < cur_page; prev_cur_page++) { + page = pages[prev_cur_page]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (prev_cur_page != xpage) + page_cache_release(page); + pages[prev_cur_page] = NULL; + } + } + } + } + + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + /* Do we have more work to do? */ + if (nr_cbs) + goto do_next_cb; + + /* We no longer need the list of buffer heads. */ + kfree(bhs); + + /* Clean up if we have any pages left. Should never happen. */ + for (cur_page = 0; cur_page < max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + ntfs_error(vol->sb, "Still have pages left! " + "Terminating them with extreme " + "prejudice. Inode 0x%lx, page index " + "0x%lx.", ni->mft_no, page->index); + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (cur_page != xpage) + page_cache_release(page); + pages[cur_page] = NULL; + } + } + + /* We no longer need the list of pages. */ + kfree(pages); + + /* If we have completed the requested page, we return success. */ + if (likely(xpage_done)) + return 0; + + ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? + "EOVERFLOW" : (!err ? "EIO" : "unknown error")); + return err < 0 ? err : -EIO; + +read_err: + ntfs_error(vol->sb, "IO error while reading compressed data."); + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + goto err_out; + +map_rl_err: + ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " + "compression block."); + goto err_out; + +rl_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. 
Cannot read " + "compression block."); + goto err_out; + +getblk_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); + +err_out: + kfree(bhs); + for (i = cur_page; i < max_page; i++) { + page = pages[i]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (i != xpage) + page_cache_release(page); + } + } + kfree(pages); + return -EIO; +} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c new file mode 100644 index 00000000..807150e2 --- /dev/null +++ b/fs/ntfs/debug.c @@ -0,0 +1,183 @@ +/* + * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "debug.h" + +/* + * A static buffer to hold the error string being displayed and a spinlock + * to protect concurrent accesses to it. + */ +static char err_buf[1024]; +static DEFINE_SPINLOCK(err_buf_lock); + +/** + * __ntfs_warning - output a warning to the syslog + * @function: name of function outputting the warning + * @sb: super block of mounted ntfs filesystem + * @fmt: warning string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs a warning to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... is printf style format string containing + * the warning string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_warning is being + * called. + * + * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + spin_lock(&err_buf_lock); + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + if (sb) + printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", + sb->s_id, flen ? function : "", err_buf); + else + printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", + flen ? function : "", err_buf); + spin_unlock(&err_buf_lock); +} + +/** + * __ntfs_error - output an error to the syslog + * @function: name of function outputting the error + * @sb: super block of mounted ntfs filesystem + * @fmt: error string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs an error to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... 
is printf style format string containing + * the error string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_error is being + * called. + * + * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + spin_lock(&err_buf_lock); + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + if (sb) + printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", + sb->s_id, flen ? function : "", err_buf); + else + printk(KERN_ERR "NTFS-fs error: %s(): %s\n", + flen ? function : "", err_buf); + spin_unlock(&err_buf_lock); +} + +#ifdef DEBUG + +/* If 1, output debug messages, and if 0, don't. */ +int debug_msgs = 0; + +void __ntfs_debug (const char *file, int line, const char *function, + const char *fmt, ...) +{ + va_list args; + int flen = 0; + + if (!debug_msgs) + return; + if (function) + flen = strlen(function); + spin_lock(&err_buf_lock); + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line, + flen ? function : "", err_buf); + spin_unlock(&err_buf_lock); +} + +/* Dump a runlist. Caller has to provide synchronisation for @rl. */ +void ntfs_debug_dump_runlist(const runlist_element *rl) +{ + int i; + const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", + "LCN_ENOENT ", "LCN_unknown " }; + + if (!debug_msgs) + return; + printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); + if (!rl) { + printk(KERN_DEBUG "Run list not present.\n"); + return; + } + printk(KERN_DEBUG "VCN LCN Run length\n"); + for (i = 0; ; i++) { + LCN lcn = (rl + i)->lcn; + + if (lcn < (LCN)0) { + int index = -lcn - 1; + + if (index > -LCN_ENOENT - 1) + index = 3; + printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", + (long long)(rl + i)->vcn, lcn_str[index], + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + } else + printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", + (long long)(rl + i)->vcn, + (long long)(rl + i)->lcn, + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + if (!(rl + i)->length) + break; + } +} + +#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h new file mode 100644 index 00000000..2142b1c6 --- /dev/null +++ b/fs/ntfs/debug.h @@ -0,0 +1,63 @@ +/* + * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
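Both logging helpers above format into a single static buffer, so the spinlock is what keeps two concurrent callers from interleaving their messages, while the printk_ratelimit() check merely drops excess output in non-debug builds. A user-space analogue of the locked-shared-buffer part of that pattern, with a pthread mutex standing in for the spinlock; all names here are invented for the sketch.

#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>

static char err_buf[1024];
static pthread_mutex_t err_buf_lock = PTHREAD_MUTEX_INITIALIZER;

/* All callers share err_buf, so it is formatted and printed under one lock. */
static void my_warning(const char *function, const char *fmt, ...)
{
	va_list args;

	pthread_mutex_lock(&err_buf_lock);
	va_start(args, fmt);
	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
	va_end(args);
	fprintf(stderr, "fs warning: %s(): %s\n", function, err_buf);
	pthread_mutex_unlock(&err_buf_lock);
}

int main(void)
{
	my_warning(__func__, "falling back to read-only mount (error %d)", -5);
	return 0;
}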
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_DEBUG_H +#define _LINUX_NTFS_DEBUG_H + +#include + +#include "runlist.h" + +#ifdef DEBUG + +extern int debug_msgs; + +extern void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...) __attribute__ ((format (printf, 4, 5))); +/** + * ntfs_debug - write a debug level message to syslog + * @f: a printf format string containing the message + * @...: the variables to substitute into @f + * + * ntfs_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. + */ +#define ntfs_debug(f, a...) \ + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) + +extern void ntfs_debug_dump_runlist(const runlist_element *rl); + +#else /* !DEBUG */ + +#define ntfs_debug(f, a...) do {} while (0) +#define ntfs_debug_dump_runlist(rl) do {} while (0) + +#endif /* !DEBUG */ + +extern void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) + +extern void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) + +#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c new file mode 100644 index 00000000..0f48e7c5 --- /dev/null +++ b/fs/ntfs/dir.c @@ -0,0 +1,1575 @@ +/** + * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +#include "dir.h" +#include "aops.h" +#include "attrib.h" +#include "mft.h" +#include "debug.h" +#include "ntfs.h" + +/** + * The little endian Unicode string $I30 as a global constant. + */ +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * @res: return the found file name if necessary (see below) + * + * Look for an inode with name @uname in the directory with inode @dir_ni. 
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + * + * Note, we look for a case sensitive match first but we also look for a case + * insensitive match at the same time. If we find a case insensitive match, we + * save that for the case that we don't find an exact match, where we return + * the case insensitive match and setup @res (which we allocate!) with the mft + * reference, the file name type, length and with a copy of the little endian + * Unicode file name itself. If we match a file name which is in the DOS name + * space, we only return the mft reference and file name type in @res. + * ntfs_lookup() then uses this to find the long file name in the inode itself. + * This is to avoid polluting the dcache with short file names. We want them to + * work but we don't care for how quickly one can access them. This also fixes + * the dcache aliasing issues. + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len, ntfs_name **res) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + ntfs_name *name = NULL; + + BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); + BUG_ON(NInoAttr(dir_ni)); + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. 
*/ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 1. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." 
+ "sourceforge.net."); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present return -ENOENT, unless + * we have got a matching name cached in name in which case return the + * mft reference associated with it. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + if (name) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. 
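Descending into a child node above means reading the VCN stored in the last eight bytes of the index entry and turning it into a page-cache page index plus an offset inside that page. A minimal sketch of that conversion, assuming 512-byte index VCNs and 4 KiB pages purely for illustration.

#include <stdio.h>

int main(void)
{
	unsigned int vcn_size_bits = 9;	/* 512-byte index vcns (example value) */
	unsigned int page_shift = 12;	/* 4 KiB pages */
	long long vcn = 11;		/* child node VCN taken from the entry */

	unsigned long page_index = (unsigned long)(vcn << vcn_size_bits >> page_shift);
	unsigned long page_ofs = (unsigned long)(vcn << vcn_size_bits) &
			((1UL << page_shift) - 1);

	/* Prints: index block at page 1, offset 0x600 */
	printf("index block at page %lu, offset 0x%lx\n", page_index, page_ofs);
	return 0;
}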
*/ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it2: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. 
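Every index allocation block is treated as untrusted until it passes the checks above. A condensed user-space sketch of the same validations against a heavily simplified header layout; the real INDEX_ALLOCATION layout differs, and 0x18 is the header overhead used in the size check above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct idx_block_hdr {			/* simplified, not the on-disk layout */
	char	 magic[4];		/* "INDX" for a valid block */
	uint64_t block_vcn;		/* VCN this block claims to be */
	uint32_t index_length;		/* bytes of entries in use */
	uint32_t allocated_size;	/* bytes available for entries */
};

static bool index_block_ok(const struct idx_block_hdr *ia,
		uint64_t expected_vcn, uint32_t dir_block_size)
{
	if (memcmp(ia->magic, "INDX", 4))
		return false;			/* bad magic / fixup damage */
	if (ia->block_vcn != expected_vcn)
		return false;			/* wrong block was read */
	if (ia->allocated_size + 0x18 != dir_block_size)
		return false;			/* size disagrees with the directory */
	if (ia->index_length > ia->allocated_size)
		return false;			/* entries would overflow the block */
	return true;
}

int main(void)
{
	struct idx_block_hdr ia = {
		.magic = { 'I', 'N', 'D', 'X' },
		.block_vcn = 11,
		.index_length = 0x100,
		.allocated_size = 0x1000 - 0x18,
	};

	printf("%s\n", index_block_ok(&ia, 11, 0x1000) ? "looks valid" : "corrupt");
	return 0;
}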
+ * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 2. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + unlock_page(page); + ntfs_unmap_page(page); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. 
+ */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* + * No child node present, return -ENOENT, unless we have got a matching + * name cached in name in which case return the mft reference + * associated with it. + */ + if (name) { + unlock_page(page); + ntfs_unmap_page(page); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + if (name) { + kfree(name); + *res = NULL; + } + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#if 0 + +// TODO: (AIA) +// The algorithm embedded in this code will be required for the time when we +// want to support adding of entries to directories, where we require correct +// collation of file names in order not to cause corruption of the filesystem. + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + */ +u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + IGNORE_CASE_BOOL ic; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + + /* Get hold of the mft record for the directory. 
*/ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it: + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. 
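Because the entries in a node are kept in collation order, the loops above never look past the first entry that the wanted name collates before: at that point the name either does not exist in this node or lives in that entry's child subtree. The same control flow in a stand-alone sketch, with strcmp() standing in for ntfs_collate_names() and invented entry names.

#include <stdio.h>
#include <string.h>

/*
 * Scan one sorted node.  Returns the matching slot, or -1 - slot for the
 * slot whose child subtree has to be searched next, mirroring the break-out
 * on rc == -1 in the loops above.
 */
static int search_node(const char *names[], int nr, const char *want)
{
	int i;

	for (i = 0; i < nr; i++) {
		int rc = strcmp(want, names[i]);

		if (rc == 0)
			return i;	/* perfect match */
		if (rc < 0)
			break;		/* want collates before this entry: descend */
		/* rc > 0: keep scanning this node */
	}
	return -1 - i;
}

int main(void)
{
	const char *node[] = { "alpha", "kappa", "omega" };

	/* Prints 1 (match) and -2 (descend into the child of entry 1). */
	printf("%d %d\n", search_node(node, 3, "kappa"),
			search_node(node, 3, "beta"));
	return 0;
}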
+ */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + /* No child node, return -ENOENT. */ + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! 
This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it2: + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. 
Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* No child node, return -ENOENT. */ + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#endif + +/** + * ntfs_filldir - ntfs specific filldir method + * @vol: current ntfs volume + * @fpos: position in the directory + * @ndir: ntfs inode of current directory + * @ia_page: page in which the index allocation buffer @ie is in resides + * @ie: current index entry + * @name: buffer to use for the converted name + * @dirent: vfs filldir callback context + * @filldir: vfs filldir callback + * + * Convert the Unicode @name to the loaded NLS and pass it to the @filldir + * callback. + * + * If @ia_page is not NULL it is the locked page containing the index + * allocation block containing the index entry @ie. + * + * Note, we drop (and then reacquire) the page lock on @ia_page across the + * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup + * since ntfs_lookup() will lock the same page. As an optimization, we do not + * retake the lock if we are returning a non-zero value as ntfs_readdir() + * would need to drop the lock immediately anyway. 
+ */ +static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, + ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, + u8 *name, void *dirent, filldir_t filldir) +{ + unsigned long mref; + int name_len, rc; + unsigned dt_type; + FILE_NAME_TYPE_FLAGS name_type; + + name_type = ie->key.file_name.file_name_type; + if (name_type == FILE_NAME_DOS) { + ntfs_debug("Skipping DOS name space entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { + ntfs_debug("Skipping root directory self reference entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && + !NVolShowSystemFiles(vol)) { + ntfs_debug("Skipping system file."); + return 0; + } + name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, &name, + NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); + if (name_len <= 0) { + ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", + (long long)MREF_LE(ie->data.dir.indexed_file)); + return 0; + } + if (ie->key.file_name.file_attributes & + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) + dt_type = DT_DIR; + else + dt_type = DT_REG; + mref = MREF_LE(ie->data.dir.indexed_file); + /* + * Drop the page lock otherwise we deadlock with NFS when it calls + * ->lookup since ntfs_lookup() will lock the same page. + */ + if (ia_page) + unlock_page(ia_page); + ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " + "0x%lx, DT_%s.", name, name_len, fpos, mref, + dt_type == DT_DIR ? "DIR" : "REG"); + rc = filldir(dirent, name, name_len, fpos, mref, dt_type); + /* Relock the page but not if we are aborting ->readdir. */ + if (!rc && ia_page) + lock_page(ia_page); + return rc; +} + +/* + * We use the same basic approach as the old NTFS driver, i.e. we parse the + * index root entries and then the index allocation entries that are marked + * as in use in the index bitmap. + * + * While this will return the names in random order this doesn't matter for + * ->readdir but OTOH results in a faster ->readdir. + * + * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS + * parts (e.g. ->f_pos and ->i_size, and it also protects against directory + * modifications). + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; + loff_t fpos, i_size; + struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode; + struct super_block *sb = vdir->i_sb; + ntfs_inode *ndir = NTFS_I(vdir); + ntfs_volume *vol = NTFS_SB(sb); + MFT_RECORD *m; + INDEX_ROOT *ir = NULL; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *name = NULL; + int rc, err, ir_pos, cur_bmp_pos; + struct address_space *ia_mapping, *bmp_mapping; + struct page *bmp_page = NULL, *ia_page = NULL; + u8 *kaddr, *bmp, *index_end; + ntfs_attr_search_ctx *ctx; + + fpos = filp->f_pos; + ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", + vdir->i_ino, fpos); + rc = err = 0; + /* Are we at end of dir yet? */ + i_size = i_size_read(vdir); + if (fpos >= i_size + vol->mft_record_size) + goto done; + /* Emulate . and .. for all directories. 
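
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * ntfs_readdir() encodes its directory position as follows: 0 and 1 are the
 * emulated "." and ".." entries, positions below the mft record size are
 * byte offsets into the INDEX_ROOT value, and larger positions are byte
 * offsets into the INDEX_ALLOCATION attribute shifted up by the mft record
 * size.  The helper below only names the region a given position falls
 * into; the 1024-byte mft record size is an assumption for the sketch (it
 * is a per-volume value in reality).
 */
#include <stdint.h>

#define SKETCH_MFT_RECORD_SIZE	1024u	/* assumed for illustration */

static const char *readdir_pos_region(uint64_t fpos, uint64_t ia_size)
{
	if (fpos == 0)
		return ". entry";
	if (fpos == 1)
		return ".. entry";
	if (fpos < SKETCH_MFT_RECORD_SIZE)
		return "offset fpos into the index root";
	if (fpos < ia_size + SKETCH_MFT_RECORD_SIZE)
		return "offset (fpos - mft_record_size) into the index allocation";
	return "end of directory";
}
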
*/ + if (!fpos) { + ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " + "inode 0x%lx, DT_DIR.", vdir->i_ino); + rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR); + if (rc) + goto done; + fpos++; + } + if (fpos == 1) { + ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, " + "inode 0x%lx, DT_DIR.", + (unsigned long)parent_ino(filp->f_path.dentry)); + rc = filldir(dirent, "..", 2, fpos, + parent_ino(filp->f_path.dentry), DT_DIR); + if (rc) + goto done; + fpos++; + } + m = NULL; + ctx = NULL; + /* + * Allocate a buffer to store the current name being processed + * converted to format determined by current NLS. + */ + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); + if (unlikely(!name)) { + err = -ENOMEM; + goto err_out; + } + /* Are we jumping straight into the index allocation attribute? */ + if (fpos >= vol->mft_record_size) + goto skip_index_root; + /* Get hold of the mft record for the directory. */ + m = map_mft_record(ndir); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ndir, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Get the offset into the index root attribute. */ + ir_pos = (s64)fpos; + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_error(sb, "Index root attribute missing in directory " + "inode 0x%lx.", vdir->i_ino); + goto err_out; + } + /* + * Copy the index root attribute value to a buffer so that we can put + * the search context and unmap the mft record before calling the + * filldir() callback. We need to do this because of NFSd which calls + * ->lookup() from its filldir callback() and this causes NTFS to + * deadlock as ntfs_lookup() maps the mft record of the directory and + * we have got it mapped here already. The only solution is for us to + * unmap the mft record here so that a call to ntfs_lookup() is able to + * map the mft record without deadlocking. + */ + rc = le32_to_cpu(ctx->attr->data.resident.value_length); + ir = kmalloc(rc, GFP_NOFS); + if (unlikely(!ir)) { + err = -ENOMEM; + goto err_out; + } + /* Copy the index root value (it has been verified in read_inode). */ + memcpy(ir, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), rc); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ndir); + ctx = NULL; + m = NULL; + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index root entry if continuing previous readdir. */ + if (ir_pos > (u8*)ie - (u8*)ir) + continue; + /* Advance the position even if going to skip the entry. */ + fpos = (u8*)ie - (u8*)ir; + /* Submit the name to the filldir callback. 
*/ + rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, + filldir); + if (rc) { + kfree(ir); + goto abort; + } + } + /* We are done with the index root and can free the buffer. */ + kfree(ir); + ir = NULL; + /* If there is no index allocation attribute we are finished. */ + if (!NInoIndexAllocPresent(ndir)) + goto EOD; + /* Advance fpos to the beginning of the index allocation. */ + fpos = vol->mft_record_size; +skip_index_root: + kaddr = NULL; + prev_ia_pos = -1LL; + /* Get the offset into the index allocation attribute. */ + ia_pos = (s64)fpos - vol->mft_record_size; + ia_mapping = vdir->i_mapping; + ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); + bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); + if (IS_ERR(bmp_vi)) { + ntfs_error(sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bmp_vi); + goto err_out; + } + bmp_mapping = bmp_vi->i_mapping; + /* Get the starting bitmap bit position and sanity check it. */ + bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; + if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { + ntfs_error(sb, "Current index allocation position exceeds " + "index bitmap size."); + goto iput_err_out; + } + /* Get the starting bit position in the current bitmap page. */ + cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1); + bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1); +get_next_bmp_page: + ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", + (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT), + (unsigned long long)bmp_pos & + (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1)); + bmp_page = ntfs_map_page(bmp_mapping, + bmp_pos >> (3 + PAGE_CACHE_SHIFT)); + if (IS_ERR(bmp_page)) { + ntfs_error(sb, "Reading index bitmap failed."); + err = PTR_ERR(bmp_page); + bmp_page = NULL; + goto iput_err_out; + } + bmp = (u8*)page_address(bmp_page); + /* Find next index block in use. */ + while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { +find_next_index_buffer: + cur_bmp_pos++; + /* + * If we have reached the end of the bitmap page, get the next + * page, and put away the old one. + */ + if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) { + ntfs_unmap_page(bmp_page); + bmp_pos += PAGE_CACHE_SIZE * 8; + cur_bmp_pos = 0; + goto get_next_bmp_page; + } + /* If we have reached the end of the bitmap, we are done. */ + if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) + goto unm_EOD; + ia_pos = (bmp_pos + cur_bmp_pos) << + ndir->itype.index.block_size_bits; + } + ntfs_debug("Handling index buffer 0x%llx.", + (unsigned long long)bmp_pos + cur_bmp_pos); + /* If the current index buffer is in the same page we reuse the page. */ + if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) != + (ia_pos & (s64)PAGE_CACHE_MASK)) { + prev_ia_pos = ia_pos; + if (likely(ia_page != NULL)) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + /* + * Map the page cache page containing the current ia_pos, + * reading it from disk if necessary. + */ + ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR(ia_page)) { + ntfs_error(sb, "Reading index allocation data failed."); + err = PTR_ERR(ia_page); + ia_page = NULL; + goto err_out; + } + lock_page(ia_page); + kaddr = (u8*)page_address(ia_page); + } + /* Get the current index buffer. */ + ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK & + ~(s64)(ndir->itype.index.block_size - 1))); + /* Bounds checks. */ + if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) { + ntfs_error(sb, "Out of bounds check failed. 
Corrupt directory " + "inode 0x%lx or driver bug.", vdir->i_ino); + goto err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & + ~(s64)(ndir->itype.index.block_size - 1)) >> + ndir->itype.index.vcn_size_bits)) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug. ", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != + ndir->itype.index.block_size)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino, + le32_to_cpu(ia->index.allocated_size) + 0x18, + ndir->itype.index.block_size); + goto err_out; + } + index_end = (u8*)ia + ndir->itype.index.block_size; + if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + /* The first index entry in this index buffer. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index allocation, offset 0x%llx.", + (unsigned long long)ia_start + + (unsigned long long)((u8*)ie - (u8*)ia)); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index block entry if continuing previous readdir. */ + if (ia_pos - ia_start > (u8*)ie - (u8*)ia) + continue; + /* Advance the position even if going to skip the entry. */ + fpos = (u8*)ie - (u8*)ia + + (sle64_to_cpu(ia->index_block_vcn) << + ndir->itype.index.vcn_size_bits) + + vol->mft_record_size; + /* + * Submit the name to the @filldir callback. Note, + * ntfs_filldir() drops the lock on @ia_page but it retakes it + * before returning, unless a non-zero value is returned in + * which case the page is left unlocked. 
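
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * The deadlock avoidance described above boils down to: release the lock
 * before invoking a callback that may re-enter code taking the same lock,
 * and re-acquire it only if the callback asks us to keep going.  Sketched
 * here with a pthread mutex standing in for the page lock and an arbitrary
 * callback standing in for filldir.
 */
#include <pthread.h>

static int emit_locked_entry(pthread_mutex_t *lock,
			     int (*callback)(void *ctx), void *ctx)
{
	int rc;

	pthread_mutex_unlock(lock);	/* callback may re-enter and take it */
	rc = callback(ctx);
	if (!rc)			/* keep going: retake the lock */
		pthread_mutex_lock(lock);
	return rc;			/* non-zero: abort, lock stays dropped */
}
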
+ */ + rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, + filldir); + if (rc) { + /* @ia_page is already unlocked in this case. */ + ntfs_unmap_page(ia_page); + ntfs_unmap_page(bmp_page); + iput(bmp_vi); + goto abort; + } + } + goto find_next_index_buffer; +unm_EOD: + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + ntfs_unmap_page(bmp_page); + iput(bmp_vi); +EOD: + /* We are finished, set fpos to EOD. */ + fpos = i_size + vol->mft_record_size; +abort: + kfree(name); +done: +#ifdef DEBUG + if (!rc) + ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos); + else + ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.", + rc, fpos); +#endif + filp->f_pos = fpos; + return 0; +err_out: + if (bmp_page) { + ntfs_unmap_page(bmp_page); +iput_err_out: + iput(bmp_vi); + } + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + kfree(ir); + kfree(name); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ndir); + if (!err) + err = -EIO; + ntfs_debug("Failed. Returning error code %i.", -err); + filp->f_pos = fpos; + return err; +} + +/** + * ntfs_dir_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit directory size to the page cache limit on architectures where unsigned + * long is 32-bits. This is the most we can do for now without overflowing the + * page cache page index. Doing it this way means we don't run into problems + * because of existing too large directories. It would be better to allow the + * user to read the accessible part of the directory but I doubt very much + * anyone is going to hit this check on a 32-bit architecture, so there is no + * point in adding the extra complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + */ +static int ntfs_dir_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EFBIG; + } + return 0; +} + +#ifdef NTFS_RW + +/** + * ntfs_dir_fsync - sync a directory to disk + * @filp: directory to be synced + * @dentry: dentry describing the directory to sync + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and + * msync system calls. This function is based on file.c::ntfs_file_fsync(). + * + * Write the mft record and all associated extent mft records as well as the + * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. + * + * If @datasync is true, we do not wait on the inode(s) to be written out + * but we always wait on the page cache pages to be written out. + * + * Note: In the past @filp could be NULL so we ignore it as we don't need it + * anyway. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. We do write the $BITMAP attribute if it is present + * which is the important one for a directory so things are not too bad. + */ +static int ntfs_dir_fsync(struct file *filp, int datasync) +{ + struct inode *bmp_vi, *vi = filp->f_mapping->host; + int err, ret; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(!S_ISDIR(vi->i_mode)); + /* If the bitmap attribute inode is in memory sync it, too. 
*/ + na.mft_no = vi->i_ino; + na.type = AT_BITMAP; + na.name = I30; + na.name_len = 4; + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na); + if (bmp_vi) { + write_inode_now(bmp_vi, !datasync); + iput(bmp_vi); + } + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_dir_ops = { + .llseek = generic_file_llseek, /* Seek inside directory. */ + .read = generic_read_dir, /* Return -EISDIR. */ + .readdir = ntfs_readdir, /* Read directory contents. */ +#ifdef NTFS_RW + .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ + /*.aio_fsync = ,*/ /* Sync all outstanding async + i/o operations on a kiocb. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .open = ntfs_dir_open, /* Open directory. */ +}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h new file mode 100644 index 00000000..aea7582d --- /dev/null +++ b/fs/ntfs/dir.h @@ -0,0 +1,48 @@ +/* + * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2002-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_DIR_H +#define _LINUX_NTFS_DIR_H + +#include "layout.h" +#include "inode.h" +#include "types.h" + +/* + * ntfs_name is used to return the file name to the caller of + * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) + * to be able to deal with dcache aliasing issues. + */ +typedef struct { + MFT_REF mref; + FILE_NAME_TYPE_FLAGS type; + u8 len; + ntfschar name[0]; +} __attribute__ ((__packed__)) ntfs_name; + +/* The little endian Unicode string $I30 as a global constant. */ +extern ntfschar I30[5]; + +extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, + const ntfschar *uname, const int uname_len, ntfs_name **res); + +#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h new file mode 100644 index 00000000..927b5bf0 --- /dev/null +++ b/fs/ntfs/endian.h @@ -0,0 +1,93 @@ +/* + * endian.h - Defines for endianness handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_ENDIAN_H +#define _LINUX_NTFS_ENDIAN_H + +#include +#include "types.h" + +/* + * Signed endianness conversion functions. + */ + +static inline s16 sle16_to_cpu(sle16 x) +{ + return le16_to_cpu((__force le16)x); +} + +static inline s32 sle32_to_cpu(sle32 x) +{ + return le32_to_cpu((__force le32)x); +} + +static inline s64 sle64_to_cpu(sle64 x) +{ + return le64_to_cpu((__force le64)x); +} + +static inline s16 sle16_to_cpup(sle16 *x) +{ + return le16_to_cpu(*(__force le16*)x); +} + +static inline s32 sle32_to_cpup(sle32 *x) +{ + return le32_to_cpu(*(__force le32*)x); +} + +static inline s64 sle64_to_cpup(sle64 *x) +{ + return le64_to_cpu(*(__force le64*)x); +} + +static inline sle16 cpu_to_sle16(s16 x) +{ + return (__force sle16)cpu_to_le16(x); +} + +static inline sle32 cpu_to_sle32(s32 x) +{ + return (__force sle32)cpu_to_le32(x); +} + +static inline sle64 cpu_to_sle64(s64 x) +{ + return (__force sle64)cpu_to_le64(x); +} + +static inline sle16 cpu_to_sle16p(s16 *x) +{ + return (__force sle16)cpu_to_le16(*x); +} + +static inline sle32 cpu_to_sle32p(s32 *x) +{ + return (__force sle32)cpu_to_le32(*x); +} + +static inline sle64 cpu_to_sle64p(s64 *x) +{ + return (__force sle64)cpu_to_le64(*x); +} + +#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c new file mode 100644 index 00000000..f4b1057a --- /dev/null +++ b/fs/ntfs/file.c @@ -0,0 +1,2227 @@ +/* + * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "attrib.h" +#include "bitmap.h" +#include "inode.h" +#include "debug.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_file_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit file size to the page cache limit on architectures where unsigned long + * is 32-bits. This is the most we can do for now without overflowing the page + * cache page index. Doing it this way means we don't run into problems because + * of existing too large files. It would be better to allow the user to read + * the beginning of the file but I doubt very much anyone is going to hit this + * check on a 32-bit architecture, so there is no point in adding the extra + * complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + * + * After the check passes, just call generic_file_open() to do its work. + */ +static int ntfs_file_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EOVERFLOW; + } + return generic_file_open(vi, filp); +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_extend_initialized - extend the initialized size of an attribute + * @ni: ntfs inode of the attribute to extend + * @new_init_size: requested new initialized size in bytes + * @cached_page: store any allocated but unused page here + * @lru_pvec: lru-buffering pagevec of the caller + * + * Extend the initialized size of an attribute described by the ntfs inode @ni + * to @new_init_size bytes. This involves zeroing any non-sparse space between + * the old initialized size and @new_init_size both in the page cache and on + * disk (if relevant complete pages are already uptodate in the page cache then + * these are simply marked dirty). + * + * As a side-effect, the file size (vfs inode->i_size) may be incremented as, + * in the resident attribute case, it is tied to the initialized size and, in + * the non-resident attribute case, it may not fall below the initialized size. + * + * Note that if the attribute is resident, we do not need to touch the page + * cache at all. This is because if the page cache page is not uptodate we + * bring it uptodate later, when doing the write to the mft record since we + * then already have the page mapped. And if the page is uptodate, the + * non-initialized region will already have been zeroed when the page was + * brought uptodate and the region may in fact already have been overwritten + * with new data via mmap() based writes, so we cannot just zero it. And since + * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped + * is unspecified, we choose not to do zeroing and thus we do not need to touch + * the page at all. For a more detailed explanation see ntfs_truncate() in + * fs/ntfs/inode.c. + * + * Return 0 on success and -errno on error. 
In the case that an error is + * encountered it is possible that the initialized size will already have been + * incremented some way towards @new_init_size but it is guaranteed that if + * this is the case, the necessary zeroing will also have happened and that all + * metadata is self-consistent. + * + * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be + * held by the caller. + */ +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) +{ + s64 old_init_size; + loff_t old_i_size; + pgoff_t index, end_index; + unsigned long flags; + struct inode *vi = VFS_I(ni); + ntfs_inode *base_ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *kattr; + int err; + u32 attr_len; + + read_lock_irqsave(&ni->size_lock, flags); + old_init_size = ni->initialized_size; + old_i_size = i_size_read(vi); + BUG_ON(new_init_size > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_initialized_size 0x%llx, " + "new_initialized_size 0x%llx, i_size 0x%llx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)old_init_size, + (unsigned long long)new_init_size, old_i_size); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Use goto to reduce indentation and we need the label below anyway. */ + if (NInoNonResident(ni)) + goto do_non_resident_extend; + BUG_ON(old_init_size != old_i_size); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + BUG_ON(old_i_size != (loff_t)attr_len); + /* + * Do the zeroing in the mft record and update the attribute size in + * the mft record. + */ + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + memset(kattr + attr_len, 0, new_init_size - attr_len); + a->data.resident.value_length = cpu_to_le32((u32)new_init_size); + /* Finally, update the sizes in the vfs and ntfs inodes. */ + write_lock_irqsave(&ni->size_lock, flags); + i_size_write(vi, new_init_size); + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto done; +do_non_resident_extend: + /* + * If the new initialized size @new_init_size exceeds the current file + * size (vfs inode->i_size), we need to extend the file size to the + * new initialized size. 
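
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * A toy model of the sizes discussed above.  Per the surrounding comments,
 * the initialized size never exceeds the allocated size, the data size
 * (i_size) never falls below the initialized size, and every byte between
 * the initialized size and the data size must read back as zero.  Extending
 * the initialized size therefore means zeroing the gap and, if necessary,
 * pushing the data size up with it.
 */
#include <stdint.h>
#include <string.h>

struct toy_attr {
	uint8_t	data[4096];		/* stands in for the attribute data */
	size_t	allocated_size;
	size_t	data_size;		/* what i_size_read() would report */
	size_t	initialized_size;
};

static int toy_extend_initialized(struct toy_attr *a, size_t new_init)
{
	if (new_init > a->allocated_size)
		return -1;			/* never beyond allocated size */
	memset(a->data + a->initialized_size, 0,
	       new_init - a->initialized_size);	/* zero the gap */
	a->initialized_size = new_init;
	if (a->data_size < new_init)
		a->data_size = new_init;	/* i_size >= initialized size */
	return 0;
}
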
+ */ + if (new_init_size > old_i_size) { + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + BUG_ON(old_i_size != (loff_t) + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_init_size); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* Update the file size in the vfs inode. */ + i_size_write(vi, new_init_size); + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + unmap_mft_record(base_ni); + m = NULL; + } + mapping = vi->i_mapping; + index = old_init_size >> PAGE_CACHE_SHIFT; + end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + do { + /* + * Read the page. If the page is not present, this will zero + * the uninitialized regions for us. + */ + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto init_err_out; + } + if (unlikely(PageError(page))) { + page_cache_release(page); + err = -EIO; + goto init_err_out; + } + /* + * Update the initialized size in the ntfs inode. This is + * enough to make ntfs_writepage() work. + */ + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; + if (ni->initialized_size > new_init_size) + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Set the page dirty so it gets written out. */ + set_page_dirty(page); + page_cache_release(page); + /* + * Play nice with the vm and the rest of the system. This is + * very much needed as we can potentially be modifying the + * initialised size from a very small value to a really huge + * value, e.g. + * f = open(somefile, O_TRUNC); + * truncate(f, 10GiB); + * seek(f, 10GiB); + * write(f, 1); + * And this would mean we would be marking dirty hundreds of + * thousands of pages or as in the above example more than + * two and a half million pages! + * + * TODO: For sparse pages could optimize this workload by using + * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This + * would be set in readpage for sparse pages and here we would + * not need to mark dirty any pages which have this bit set. + * The only caveat is that we have to clear the bit everywhere + * where we allocate any clusters that lie in the page or that + * contain the page. + * + * TODO: An even greater optimization would be for us to only + * call readpage() on pages which are not in sparse regions as + * determined from the runlist. This would greatly reduce the + * number of pages we read and make dirty in the case of sparse + * files. + */ + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (++index < end_index); + read_lock_irqsave(&ni->size_lock, flags); + BUG_ON(ni->initialized_size != new_init_size); + read_unlock_irqrestore(&ni->size_lock, flags); + /* Now bring in sync the initialized_size in the mft record. 
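
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * The workload sketched in the comment further up, written out as a
 * runnable userspace program: a one-byte write far past the current end of
 * file forces the filesystem to bring the initialized size forward over the
 * whole gap, dirtying a huge number of pages.  File name and size are
 * arbitrary examples.
 */
#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	off_t far = 10LL << 30;			/* 10 GiB */
	int fd = open("somefile", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	ftruncate(fd, far);			/* extend i_size, still sparse */
	lseek(fd, far, SEEK_SET);
	write(fd, "x", 1);			/* extends the initialized size */
	close(fd);
	return 0;
}
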
*/ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto init_err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto init_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto init_err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); +done: + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", + (unsigned long long)new_init_size, i_size_read(vi)); + return 0; +init_err_out: + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = old_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +/** + * ntfs_fault_in_pages_readable - + * + * Fault a number of userspace pages into pagetables. + * + * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes + * with more than two userspace pages as well as handling the single page case + * elegantly. + * + * If you find this difficult to understand, then think of the while loop being + * the following code, except that we do without the integer variable ret: + * + * do { + * ret = __get_user(c, uaddr); + * uaddr += PAGE_SIZE; + * } while (!ret && uaddr < end); + * + * Note, the final __get_user() may well run out-of-bounds of the user buffer, + * but _not_ out-of-bounds of the page the user buffer belongs to, and since + * this is only a read and not a write, and since it is still in the same page, + * it should not matter and this makes the code much simpler. + */ +static inline void ntfs_fault_in_pages_readable(const char __user *uaddr, + int bytes) +{ + const char __user *end; + volatile char c; + + /* Set @end to the first byte outside the last page we care about. */ + end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes); + + while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) + ; +} + +/** + * ntfs_fault_in_pages_readable_iovec - + * + * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs. + */ +static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, + size_t iov_ofs, int bytes) +{ + do { + const char __user *buf; + unsigned len; + + buf = iov->iov_base + iov_ofs; + len = iov->iov_len - iov_ofs; + if (len > bytes) + len = bytes; + ntfs_fault_in_pages_readable(buf, len); + bytes -= len; + iov++; + iov_ofs = 0; + } while (bytes); +} + +/** + * __ntfs_grab_cache_pages - obtain a number of locked pages + * @mapping: address space mapping from which to obtain page cache pages + * @index: starting index in @mapping at which to begin obtaining pages + * @nr_pages: number of page cache pages to obtain + * @pages: array of pages in which to return the obtained page cache pages + * @cached_page: allocated but as yet unused page + * @lru_pvec: lru-buffering pagevec of caller + * + * Obtain @nr_pages locked page cache pages from the mapping @mapping and + * starting at index @index. + * + * If a page is newly created, add it to lru list + * + * Note, the page locks are obtained in ascending page index order. 
+ */ +static inline int __ntfs_grab_cache_pages(struct address_space *mapping, + pgoff_t index, const unsigned nr_pages, struct page **pages, + struct page **cached_page) +{ + int err, nr; + + BUG_ON(!nr_pages); + err = nr = 0; + do { + pages[nr] = find_lock_page(mapping, index); + if (!pages[nr]) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (unlikely(!*cached_page)) { + err = -ENOMEM; + goto err_out; + } + } + err = add_to_page_cache_lru(*cached_page, mapping, index, + GFP_KERNEL); + if (unlikely(err)) { + if (err == -EEXIST) + continue; + goto err_out; + } + pages[nr] = *cached_page; + *cached_page = NULL; + } + index++; + nr++; + } while (nr < nr_pages); +out: + return err; +err_out: + while (nr > 0) { + unlock_page(pages[--nr]); + page_cache_release(pages[nr]); + } + goto out; +} + +static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) +{ + lock_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + return submit_bh(READ, bh); +} + +/** + * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called for non-resident attributes from ntfs_file_buffered_write() + * with i_mutex held on the inode (@pages[0]->mapping->host). There are + * @nr_pages pages in @pages which are locked but not kmap()ped. The source + * data has not yet been copied into the @pages. + * + * Need to fill any holes with actual clusters, allocate buffers if necessary, + * ensure all the buffers are mapped, and bring uptodate any buffers that are + * only partially being written to. + * + * If @nr_pages is greater than one, we are guaranteed that the cluster size is + * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside + * the same cluster and that they are the entirety of that cluster, and that + * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. + * + * i_size is not to be modified yet. + * + * Return 0 on success or -errno on error. 
+ */ +static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, + unsigned nr_pages, s64 pos, size_t bytes) +{ + VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; + LCN lcn; + s64 bh_pos, vcn_len, end, initialized_size; + sector_t lcn_block; + struct page *page; + struct inode *vi; + ntfs_inode *ni, *base_ni = NULL; + ntfs_volume *vol; + runlist_element *rl, *rl2; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + ATTR_RECORD *a = NULL; + unsigned long flags; + u32 attr_rec_len = 0; + unsigned blocksize, u; + int err, mp_size; + bool rl_write_locked, was_hole, is_retry; + unsigned char blocksize_bits; + struct { + u8 runlist_merged:1; + u8 mft_attr_mapped:1; + u8 mp_rebuilt:1; + u8 attr_switched:1; + } status = { 0, 0, 0, 0 }; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + BUG_ON(!*pages); + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, pages[0]->index, nr_pages, + (long long)pos, bytes); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + u = 0; + do { + page = pages[u]; + BUG_ON(!page); + /* + * create_empty_buffers() will create uptodate/dirty buffers if + * the page is uptodate/dirty. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) + return -ENOMEM; + } + } while (++u < nr_pages); + rl_write_locked = false; + rl = NULL; + err = 0; + vcn = lcn = -1; + vcn_len = 0; + lcn_block = -1; + was_hole = false; + cpos = pos >> vol->cluster_size_bits; + end = pos + bytes; + cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; + /* + * Loop over each page and for each page over each buffer. Use goto to + * reduce indentation. + */ + u = 0; +do_next_page: + page = pages[u]; + bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh = head = page_buffers(page); + do { + VCN cdelta; + s64 bh_end; + unsigned bh_cofs; + + /* Clear buffer_new on all buffers to reinitialise state. */ + if (buffer_new(bh)) + clear_buffer_new(bh); + bh_end = bh_pos + blocksize; + bh_cpos = bh_pos >> vol->cluster_size_bits; + bh_cofs = bh_pos & vol->cluster_size_mask; + if (buffer_mapped(bh)) { + /* + * The buffer is already mapped. If it is uptodate, + * ignore it. + */ + if (buffer_uptodate(bh)) + continue; + /* + * The buffer is not uptodate. If the page is uptodate + * set the buffer uptodate and otherwise ignore it. + */ + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + /* + * Neither the page nor the buffer are uptodate. If + * the buffer is only partially being written to, we + * need to read it in before the write, i.e. now. + */ + if ((bh_pos < pos && bh_end > pos) || + (bh_pos < end && bh_end > end)) { + /* + * If the buffer is fully or partially within + * the initialized size, do an actual read. + * Otherwise, simply zero the buffer. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* Unmapped buffer. Need to map it. 
*/ + bh->b_bdev = vol->sb->s_bdev; + /* + * If the current buffer is in the same clusters as the map + * cache, there is no need to check the runlist again. The + * map cache is made up of @vcn, which is the first cached file + * cluster, @vcn_len which is the number of cached file + * clusters, @lcn is the device cluster corresponding to @vcn, + * and @lcn_block is the block number corresponding to @lcn. + */ + cdelta = bh_cpos - vcn; + if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { +map_buffer_cached: + BUG_ON(lcn < 0); + bh->b_blocknr = lcn_block + + (cdelta << (vol->cluster_size_bits - + blocksize_bits)) + + (bh_cofs >> blocksize_bits); + set_buffer_mapped(bh); + /* + * If the page is uptodate so is the buffer. If the + * buffer is fully outside the write, we ignore it if + * it was already allocated and we mark it dirty so it + * gets written out if we allocated it. On the other + * hand, if we allocated the buffer but we are not + * marking it dirty we set buffer_new so we can do + * error recovery. + */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (unlikely(was_hole)) { + /* We allocated the buffer. */ + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (bh_end <= pos || bh_pos >= end) + mark_buffer_dirty(bh); + else + set_buffer_new(bh); + } + continue; + } + /* Page is _not_ uptodate. */ + if (likely(!was_hole)) { + /* + * Buffer was already allocated. If it is not + * uptodate and is only partially being written + * to, we need to read it in before the write, + * i.e. now. + */ + if (!buffer_uptodate(bh) && bh_pos < end && + bh_end > pos && + (bh_pos < pos || + bh_end > end)) { + /* + * If the buffer is fully or partially + * within the initialized size, do an + * actual read. Otherwise, simply zero + * the buffer. + */ + read_lock_irqsave(&ni->size_lock, + flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, + flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* We allocated the buffer. */ + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + /* + * If the buffer is fully outside the write, zero it, + * set it uptodate, and mark it dirty so it gets + * written out. If it is partially being written to, + * zero region surrounding the write but leave it to + * commit write to do anything else. Finally, if the + * buffer is fully being overwritten, do nothing. + */ + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + mark_buffer_dirty(bh); + continue; + } + set_buffer_new(bh); + if (!buffer_uptodate(bh) && + (bh_pos < pos || bh_end > end)) { + u8 *kaddr; + unsigned pofs; + + kaddr = kmap_atomic(page, KM_USER0); + if (bh_pos < pos) { + pofs = bh_pos & ~PAGE_CACHE_MASK; + memset(kaddr + pofs, 0, pos - bh_pos); + } + if (bh_end > end) { + pofs = end & ~PAGE_CACHE_MASK; + memset(kaddr + pofs, 0, bh_end - end); + } + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + } + continue; + } + /* + * Slow path: this is the first buffer in the cluster. If it + * is outside allocated size and is not uptodate, zero it and + * set it uptodate. 
+ */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos > initialized_size) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + } + continue; + } + is_retry = false; + if (!rl) { + down_read(&ni->runlist.lock); +retry_remap: + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target cluster. */ + while (rl->length && rl[1].vcn <= bh_cpos) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); + if (likely(lcn >= 0)) { + /* + * Successful remap, setup the map cache and + * use that to deal with the buffer. + */ + was_hole = false; + vcn = bh_cpos; + vcn_len = rl[1].vcn - vcn; + lcn_block = lcn << (vol->cluster_size_bits - + blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters touched + * by the write is smaller or equal to the + * number of cached clusters, unlock the + * runlist as the map cache will be used from + * now on. + */ + if (likely(vcn + vcn_len >= cend)) { + if (rl_write_locked) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else + up_read(&ni->runlist.lock); + rl = NULL; + } + goto map_buffer_cached; + } + } else + lcn = LCN_RL_NOT_MAPPED; + /* + * If it is not a hole and not out of bounds, the runlist is + * probably unmapped so try to map it now. + */ + if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { + if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { + /* Attempt to map runlist. */ + if (!rl_write_locked) { + /* + * We need the runlist locked for + * writing, so if it is locked for + * reading relock it now and retry in + * case it changed whilst we dropped + * the lock. + */ + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + err = ntfs_map_runlist_nolock(ni, bh_cpos, + NULL); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + /* + * If @vcn is out of bounds, pretend @lcn is + * LCN_ENOENT. As long as the buffer is out + * of bounds this will work fine. + */ + if (err == -ENOENT) { + lcn = LCN_ENOENT; + err = 0; + goto rl_not_mapped_enoent; + } + } else + err = -EIO; + /* Failed to map the buffer, even after retrying. */ + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "vcn offset 0x%x, because its " + "location on disk could not be " + "determined%s (error code %i).", + ni->mft_no, ni->type, + (unsigned long long)bh_cpos, + (unsigned)bh_pos & + vol->cluster_size_mask, + is_retry ? " even after retrying" : "", + err); + break; + } +rl_not_mapped_enoent: + /* + * The buffer is in a hole or out of bounds. We need to fill + * the hole, unless the buffer is in a cluster which is not + * touched by the write, in which case we just leave the buffer + * unmapped. This can only happen when the cluster size is + * less than the page cache size. + */ + if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) { + bh_cend = (bh_end + vol->cluster_size - 1) >> + vol->cluster_size_bits; + if ((bh_cend <= cpos || bh_cpos >= cend)) { + bh->b_blocknr = -1; + /* + * If the buffer is uptodate we skip it. If it + * is not but the page is uptodate, we can set + * the buffer uptodate. If the page is not + * uptodate, we can clear the buffer and set it + * uptodate. Whether this is worthwhile is + * debatable and this could be removed. 
+ */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + continue; + } + } + /* + * Out of bounds buffer is invalid if it was not really out of + * bounds. + */ + BUG_ON(lcn != LCN_HOLE); + /* + * We need the runlist locked for writing, so if it is locked + * for reading relock it now and retry in case it changed + * whilst we dropped the lock. + */ + BUG_ON(!rl); + if (!rl_write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + /* Find the previous last allocated cluster. */ + BUG_ON(rl->lcn != LCN_HOLE); + lcn = -1; + rl2 = rl; + while (--rl2 >= ni->runlist.rl) { + if (rl2->lcn >= 0) { + lcn = rl2->lcn + rl2->length; + break; + } + } + rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, + false); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + ntfs_debug("Failed to allocate cluster, error code %i.", + err); + break; + } + lcn = rl2->lcn; + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + break; + } + ni->runlist.rl = rl; + status.runlist_merged = 1; + ntfs_debug("Allocated cluster, lcn 0x%llx.", + (unsigned long long)lcn); + /* Map and lock the mft record and get the attribute record. */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + break; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + unmap_mft_record(base_ni); + break; + } + status.mft_attr_mapped = 1; + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + break; + } + m = ctx->mrec; + a = ctx->attr; + /* + * Find the runlist element with which the attribute extent + * starts. Note, we cannot use the _attr_ version because we + * have mapped the mft record. That is ok because we know the + * runlist fragment must be mapped already to have ever gotten + * here, so we can just use the _rl_ version. + */ + vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + /* + * If @highest_vcn is zero, calculate the real highest_vcn + * (which can really be zero). + */ + if (!highest_vcn) + highest_vcn = (sle64_to_cpu( + a->data.non_resident.allocated_size) >> + vol->cluster_size_bits) - 1; + /* + * Determine the size of the mapping pairs array for the new + * extent, i.e. the old extent with the hole filled. + */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, + highest_vcn); + if (unlikely(mp_size <= 0)) { + if (!(err = mp_size)) + err = -EIO; + ntfs_debug("Failed to get size for mapping pairs " + "array, error code %i.", err); + break; + } + /* + * Resize the attribute record to fit the new mapping pairs + * array. 
+ */ + attr_rec_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by using the current attribute + // and fill it with as much of the mapping pairs + // array as possible. Then loop over each attribute + // extent rewriting the mapping pairs arrays as we go + // along and if when we reach the end we have not + // enough space, try to resize the last attribute + // extent and if even that fails, add a new attribute + // extent. + // We could also try to resize at each step in the hope + // that we will not need to rewrite every single extent. + // Note, we may need to decompress some extents to fill + // the runlist as we are walking the extents... + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + break ; + } + status.mp_rebuilt = 1; + /* + * Generate the mapping pairs array directly into the attribute + * record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, vcn, highest_vcn, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " + "attribute type 0x%x, because building " + "the mapping pairs failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + break; + } + /* Update the highest_vcn but only if it was not set. */ + if (unlikely(!a->data.non_resident.highest_vcn)) + a->data.non_resident.highest_vcn = + cpu_to_sle64(highest_vcn); + /* + * If the attribute is sparse/compressed, update the compressed + * size in the ntfs_inode structure and the attribute record. + */ + if (likely(NInoSparse(ni) || NInoCompressed(ni))) { + /* + * If we are not in the first attribute extent, switch + * to it, but first ensure the changes will make it to + * disk later. + */ + if (a->data.non_resident.lowest_vcn) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, + ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + status.attr_switched = 1; + break; + } + /* @m is not used any more so do not set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + /* Successfully filled the hole. */ + status.runlist_merged = 0; + status.mft_attr_mapped = 0; + status.mp_rebuilt = 0; + /* Setup the map cache and use that to deal with the buffer. */ + was_hole = true; + vcn = bh_cpos; + vcn_len = 1; + lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters in the @pages is smaller + * or equal to the number of cached clusters, unlock the + * runlist as the map cache will be used from now on. 
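+	 *
+	 * Concretely, the cached mapping set up just above covers the single
+	 * cluster [vcn, vcn + 1) (vcn_len is 1 here), so once
+	 * vcn + vcn_len >= cend every cluster still to be written lies
+	 * within the cache and the runlist lock can be dropped.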
+ */ + if (likely(vcn + vcn_len >= cend)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + rl = NULL; + } + goto map_buffer_cached; + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* If there are no errors, do the next page. */ + if (likely(!err && ++u < nr_pages)) + goto do_next_page; + /* If there are no errors, release the runlist lock if we took it. */ + if (likely(!err)) { + if (unlikely(rl_write_locked)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else if (unlikely(rl)) + up_read(&ni->runlist.lock); + rl = NULL; + } + /* If we issued read requests, let them complete. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + while (wait_bh > wait) { + bh = *--wait_bh; + wait_on_buffer(bh); + if (likely(buffer_uptodate(bh))) { + page = bh->b_page; + bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); + /* + * If the buffer overflows the initialized size, need + * to zero the overflowing region. + */ + if (unlikely(bh_pos + blocksize > initialized_size)) { + int ofs = 0; + + if (likely(bh_pos < initialized_size)) + ofs = initialized_size - bh_pos; + zero_user_segment(page, bh_offset(bh) + ofs, + blocksize); + } + } else /* if (unlikely(!buffer_uptodate(bh))) */ + err = -EIO; + } + if (likely(!err)) { + /* Clear buffer_new on all buffers. */ + u = 0; + do { + bh = head = page_buffers(pages[u]); + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u < nr_pages); + ntfs_debug("Done."); + return err; + } + if (status.attr_switched) { + /* Get back to the attribute extent we modified. */ + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find required " + "attribute extent of attribute in " + "error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* + * The only thing that is now wrong is the compressed + * size of the base attribute extent which chkdsk + * should be able to fix. + */ + NVolSetErrors(vol); + } else { + m = ctx->mrec; + a = ctx->attr; + status.attr_switched = 0; + } + } + /* + * If the runlist has been modified, need to restore it by punching a + * hole into it and we then need to deallocate the on-disk cluster as + * well. Note, we only modify the runlist if we are able to generate a + * new mapping pairs array, i.e. only when the mapped attribute extent + * is not switched. + */ + if (status.runlist_merged && !status.attr_switched) { + BUG_ON(!rl_write_locked); + /* Make the file cluster we allocated sparse in the runlist. */ + if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { + ntfs_error(vol->sb, "Failed to punch hole into " + "attribute runlist in error code " + "path. Run chkdsk to recover the " + "lost cluster."); + NVolSetErrors(vol); + } else /* if (success) */ { + status.runlist_merged = 0; + /* + * Deallocate the on-disk cluster we allocated but only + * if we succeeded in punching its vcn out of the + * runlist. + */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. 
Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + } + } + /* + * Resize the attribute record to its old size and rebuild the mapping + * pairs array. Note, we only can do this if the runlist has been + * restored to its old state which also implies that the mapped + * attribute extent is not switched. + */ + if (status.mp_rebuilt && !status.runlist_merged) { + if (ntfs_attr_record_resize(m, a, attr_rec_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), attr_rec_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), ni->runlist.rl, + vcn, highest_vcn, NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } + /* Release the mft record and the attribute. */ + if (status.mft_attr_mapped) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } + /* Release the runlist lock. */ + if (rl_write_locked) + up_write(&ni->runlist.lock); + else if (rl) + up_read(&ni->runlist.lock); + /* + * Zero out any newly allocated blocks to avoid exposing stale data. + * If BH_New is set, we know that the block was newly allocated above + * and that it has not been fully zeroed and marked dirty yet. + */ + nr_pages = u; + u = 0; + end = bh_cpos << vol->cluster_size_bits; + do { + page = pages[u]; + bh = head = page_buffers(page); + do { + if (u == nr_pages && + ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh) >= end) + break; + if (!buffer_new(bh)) + continue; + clear_buffer_new(bh); + if (!buffer_uptodate(bh)) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + mark_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u <= nr_pages); + ntfs_error(vol->sb, "Failed. Returning error code %i.", err); + return err; +} + +/* + * Copy as much as we can into the pages and return the number of bytes which + * were successfully copied. If a fault is encountered then clear the pages + * out to (ofs + bytes) and return the number of bytes which were copied. + */ +static inline size_t ntfs_copy_from_user(struct page **pages, + unsigned nr_pages, unsigned ofs, const char __user *buf, + size_t bytes) +{ + struct page **last_page = pages + nr_pages; + char *addr; + size_t total = 0; + unsigned len; + int left; + + do { + len = PAGE_CACHE_SIZE - ofs; + if (len > bytes) + len = bytes; + addr = kmap_atomic(*pages, KM_USER0); + left = __copy_from_user_inatomic(addr + ofs, buf, len); + kunmap_atomic(addr, KM_USER0); + if (unlikely(left)) { + /* Do it the slow way. */ + addr = kmap(*pages); + left = __copy_from_user(addr + ofs, buf, len); + kunmap(*pages); + if (unlikely(left)) + goto err_out; + } + total += len; + bytes -= len; + if (!bytes) + break; + buf += len; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err_out: + total += len - left; + /* Zero the rest of the target like __copy_from_user(). 
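+ *
+ * To spell out the fault handling below: the faulting page keeps the
+ * len - left bytes that did copy, every later page in @pages is zeroed in
+ * full (up to the remaining byte count), and the returned total still only
+ * counts bytes actually copied from user space.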
*/ + while (++pages < last_page) { + bytes -= len; + if (!bytes) + break; + len = PAGE_CACHE_SIZE; + if (len > bytes) + len = bytes; + zero_user(*pages, 0, len); + } + goto out; +} + +static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr, + const struct iovec *iov, size_t iov_ofs, size_t bytes) +{ + size_t total = 0; + + while (1) { + const char __user *buf = iov->iov_base + iov_ofs; + unsigned len; + size_t left; + + len = iov->iov_len - iov_ofs; + if (len > bytes) + len = bytes; + left = __copy_from_user_inatomic(vaddr, buf, len); + total += len; + bytes -= len; + vaddr += len; + if (unlikely(left)) { + total -= left; + break; + } + if (!bytes) + break; + iov++; + iov_ofs = 0; + } + return total; +} + +static inline void ntfs_set_next_iovec(const struct iovec **iovp, + size_t *iov_ofsp, size_t bytes) +{ + const struct iovec *iov = *iovp; + size_t iov_ofs = *iov_ofsp; + + while (bytes) { + unsigned len; + + len = iov->iov_len - iov_ofs; + if (len > bytes) + len = bytes; + bytes -= len; + iov_ofs += len; + if (iov->iov_len == iov_ofs) { + iov++; + iov_ofs = 0; + } + } + *iovp = iov; + *iov_ofsp = iov_ofs; +} + +/* + * This has the same side-effects and return value as ntfs_copy_from_user(). + * The difference is that on a fault we need to memset the remainder of the + * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s + * single-segment behaviour. + * + * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when + * atomic and when not atomic. This is ok because it calls + * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In + * fact, the only difference between __copy_from_user_inatomic() and + * __copy_from_user() is that the latter calls might_sleep() and the former + * should not zero the tail of the buffer on error. And on many architectures + * __copy_from_user_inatomic() is just defined to __copy_from_user() so it + * makes no difference at all on those architectures. + */ +static inline size_t ntfs_copy_from_user_iovec(struct page **pages, + unsigned nr_pages, unsigned ofs, const struct iovec **iov, + size_t *iov_ofs, size_t bytes) +{ + struct page **last_page = pages + nr_pages; + char *addr; + size_t copied, len, total = 0; + + do { + len = PAGE_CACHE_SIZE - ofs; + if (len > bytes) + len = bytes; + addr = kmap_atomic(*pages, KM_USER0); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, + *iov, *iov_ofs, len); + kunmap_atomic(addr, KM_USER0); + if (unlikely(copied != len)) { + /* Do it the slow way. */ + addr = kmap(*pages); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + + ofs, *iov, *iov_ofs, len); + if (unlikely(copied != len)) + goto err_out; + kunmap(*pages); + } + total += len; + ntfs_set_next_iovec(iov, iov_ofs, len); + bytes -= len; + if (!bytes) + break; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err_out: + BUG_ON(copied > len); + /* Zero the rest of the target like __copy_from_user(). 
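+ *
+ * Unlike the single-segment variant above, the tail of the faulting page is
+ * cleared here as well (the memset below), and the iovec cursor is advanced
+ * only by the bytes that really copied, so the caller's position stays
+ * consistent with the returned total.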
*/ + memset(addr + ofs + copied, 0, len - copied); + kunmap(*pages); + total += copied; + ntfs_set_next_iovec(iov, iov_ofs, copied); + while (++pages < last_page) { + bytes -= len; + if (!bytes) + break; + len = PAGE_CACHE_SIZE; + if (len > bytes) + len = bytes; + zero_user(*pages, 0, len); + } + goto out; +} + +static inline void ntfs_flush_dcache_pages(struct page **pages, + unsigned nr_pages) +{ + BUG_ON(!nr_pages); + /* + * Warning: Do not do the decrement at the same time as the call to + * flush_dcache_page() because it is a NULL macro on i386 and hence the + * decrement never happens so the loop never terminates. + */ + do { + --nr_pages; + flush_dcache_page(pages[nr_pages]); + } while (nr_pages > 0); +} + +/** + * ntfs_commit_pages_after_non_resident_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * See description of ntfs_commit_pages_after_write(), below. + */ +static inline int ntfs_commit_pages_after_non_resident_write( + struct page **pages, const unsigned nr_pages, + s64 pos, size_t bytes) +{ + s64 end, initialized_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct buffer_head *bh, *head; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + unsigned long flags; + unsigned blocksize, u; + int err; + + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + blocksize = vi->i_sb->s_blocksize; + end = pos + bytes; + u = 0; + do { + s64 bh_pos; + struct page *page; + bool partial; + + page = pages[u]; + bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh = head = page_buffers(page); + partial = false; + do { + s64 bh_end; + + bh_end = bh_pos + blocksize; + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) + partial = true; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* + * If all buffers are now uptodate but the page is not, set the + * page uptodate. + */ + if (!partial && !PageUptodate(page)) + SetPageUptodate(page); + } while (++u < nr_pages); + /* + * Finally, if we do not need to update initialized_size or i_size we + * are finished. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end <= initialized_size) { + ntfs_debug("Done."); + return 0; + } + /* + * Update initialized_size/i_size as appropriate, both in the inode and + * the mft record. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. 
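+	 * (For attribute inodes the attribute to update is reached via the
+	 * base inode, so it is base_ni, chosen just above, whose mft record
+	 * is mapped and searched here.)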
*/ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + BUG_ON(!NInoNonResident(ni)); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(!a->non_resident); + write_lock_irqsave(&ni->size_lock, flags); + BUG_ON(end > ni->allocated_size); + ni->initialized_size = end; + a->data.non_resident.initialized_size = cpu_to_sle64(end); + if (end > i_size_read(vi)) { + i_size_write(vi, end); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size; + } + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " + "code %i).", err); + if (err != -ENOMEM) + NVolSetErrors(ni->vol); + return err; +} + +/** + * ntfs_commit_pages_after_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called from ntfs_file_buffered_write() with i_mutex held on the inode + * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are + * locked but not kmap()ped. The source data has already been copied into the + * @page. ntfs_prepare_pages_for_non_resident_write() has been called before + * the data was copied (for non-resident attributes only) and it returned + * success. + * + * Need to set uptodate and mark dirty all buffers within the boundary of the + * write. If all buffers in a page are uptodate we set the page uptodate, too. + * + * Setting the buffers dirty ensures that they get written out later when + * ntfs_writepage() is invoked by the VM. + * + * Finally, we need to update i_size and initialized_size as appropriate both + * in the inode and the mft record. + * + * This is modelled after fs/buffer.c::generic_commit_write(), which marks + * buffers uptodate and dirty, sets the page uptodate if all buffers in the + * page are uptodate, and updates i_size if the end of io is beyond i_size. In + * that case, it also marks the inode dirty. + * + * If things have gone as outlined in + * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page + * content modifications here for non-resident attributes. For resident + * attributes we need to do the uptodate bringing here which we combine with + * the copying into the mft record which means we save one atomic kmap. + * + * Return 0 on success or -errno on error. 
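+ *
+ * For orientation, a rough sketch of how the buffered write path below
+ * drives this function (names as used in ntfs_file_buffered_write(); error
+ * handling and the iovec variants omitted):
+ *
+ *	__ntfs_grab_cache_pages(mapping, start_idx, do_pages, pages,
+ *			&cached_page);
+ *	if (NInoNonResident(ni))
+ *		ntfs_prepare_pages_for_non_resident_write(pages, do_pages,
+ *				pos, bytes);
+ *	copied = ntfs_copy_from_user(pages + u, do_pages - u, ofs, buf,
+ *			bytes);
+ *	ntfs_flush_dcache_pages(pages + u, do_pages - u);
+ *	ntfs_commit_pages_after_write(pages, do_pages, pos, bytes);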
+ */ +static int ntfs_commit_pages_after_write(struct page **pages, + const unsigned nr_pages, s64 pos, size_t bytes) +{ + s64 end, initialized_size; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct page *page; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + char *kattr, *kaddr; + unsigned long flags; + u32 attr_len; + int err; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + page = pages[0]; + BUG_ON(!page); + vi = page->mapping->host; + ni = NTFS_I(vi); + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, page->index, nr_pages, + (long long)pos, bytes); + if (NInoNonResident(ni)) + return ntfs_commit_pages_after_non_resident_write(pages, + nr_pages, pos, bytes); + BUG_ON(nr_pages > 1); + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * sparse. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + BUG_ON(NInoNonResident(ni)); + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + i_size = i_size_read(vi); + BUG_ON(attr_len != i_size); + BUG_ON(pos > attr_len); + end = pos + bytes; + BUG_ON(end > le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset)); + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + kaddr = kmap_atomic(page, KM_USER0); + /* Copy the received data from the page to the mft record. */ + memcpy(kattr + pos, kaddr + pos, bytes); + /* Update the attribute length if necessary. */ + if (end > attr_len) { + attr_len = end; + a->data.resident.value_length = cpu_to_le32(attr_len); + } + /* + * If the page is not uptodate, bring the out of bounds area(s) + * uptodate by copying data from the mft record to the page. + */ + if (!PageUptodate(page)) { + if (pos > 0) + memcpy(kaddr, kattr, pos); + if (end < attr_len) + memcpy(kaddr + end, kattr + end, attr_len - end); + /* Zero the region outside the end of the attribute value. */ + memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + flush_dcache_page(page); + SetPageUptodate(page); + } + kunmap_atomic(kaddr, KM_USER0); + /* Update initialized_size/i_size if necessary. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + BUG_ON(end > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + BUG_ON(initialized_size != i_size); + if (end > initialized_size) { + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = end; + i_size_write(vi, end); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Mark the mft record dirty, so it gets written back. 
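+	 * (For a resident attribute this is what actually persists the
+	 * write: the user data was copied into the attribute value inside
+	 * the mft record above, so it is the mft record page, not the data
+	 * page, whose dirtying gets the data to disk.)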
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory required to " + "commit the write."); + if (PageUptodate(page)) { + ntfs_warning(vi->i_sb, "Page is uptodate, setting " + "dirty so the write will be retried " + "later on by the VM."); + /* + * Put the page on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + __set_page_dirty_nobuffers(page); + err = 0; + } else + ntfs_error(vi->i_sb, "Page is not uptodate. Written " + "data has been lost."); + } else { + ntfs_error(vi->i_sb, "Resident attribute commit write failed " + "with error %i.", err); + NVolSetErrors(ni->vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +/** + * ntfs_file_buffered_write - + * + * Locking: The vfs is holding ->i_mutex on the inode. + */ +static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, + loff_t pos, loff_t *ppos, size_t count) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *vi = mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; + struct page *cached_page = NULL; + char __user *buf = NULL; + s64 end, ll; + VCN last_vcn; + LCN lcn; + unsigned long flags; + size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */ + ssize_t status, written; + unsigned nr_pages; + int err; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "pos 0x%llx, count 0x%lx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)pos, (unsigned long)count); + if (unlikely(!count)) + return 0; + BUG_ON(NInoMstProtected(ni)); + /* + * If the attribute is not an index root and it is encrypted or + * compressed, we cannot write to it yet. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + /* + * Reminder for later: Encrypted files are _always_ + * non-resident so that the content can always be + * encrypted. + */ + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + if (NInoCompressed(ni)) { + /* Only unnamed $DATA attribute can be compressed. */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + /* + * Reminder for later: If resident, the data is not + * actually compressed. Only on the switch to non- + * resident does compression kick in. This is in + * contrast to encrypted files (see above). + */ + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not implemented yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* + * If a previous ntfs_truncate() failed, repeat it and abort if it + * fails again. + */ + if (unlikely(NInoTruncateFailed(ni))) { + down_write(&vi->i_alloc_sem); + err = ntfs_truncate(vi); + up_write(&vi->i_alloc_sem); + if (err || NInoTruncateFailed(ni)) { + if (!err) + err = -EIO; + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "ntfs_truncate() failed (error code " + "%i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + return err; + } + } + /* The first byte after the write. 
*/ + end = pos + count; + /* + * If the write goes beyond the allocated size, extend the allocation + * to cover the whole of the write, rounded up to the nearest cluster. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end > ll) { + /* Extend the allocation without changing the data size. */ + ll = ntfs_attr_extend_allocation(ni, end, -1, pos); + if (likely(ll >= 0)) { + BUG_ON(pos >= ll); + /* If the extension was partial truncate the write. */ + if (end > ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "the allocation was only " + "partially extended.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + end = ll; + count = ll - pos; + } + } else { + err = ll; + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* Perform a partial write if possible or fail. */ + if (pos < ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "extending the allocation " + "failed (error code %i).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), err); + end = ll; + count = ll - pos; + } else { + ntfs_error(vol->sb, "Cannot perform write to " + "inode 0x%lx, attribute type " + "0x%x, because extending the " + "allocation failed (error " + "code %i).", vi->i_ino, + (unsigned) + le32_to_cpu(ni->type), err); + return err; + } + } + } + written = 0; + /* + * If the write starts beyond the initialized size, extend it up to the + * beginning of the write and initialize all non-sparse space between + * the old initialized size and the new one. This automatically also + * increments the vfs inode->i_size to keep it above or equal to the + * initialized_size. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (pos > ll) { + err = ntfs_attr_extend_initialized(ni, pos); + if (err < 0) { + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "extending the initialized size " + "failed (error code %i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + status = err; + goto err_out; + } + } + /* + * Determine the number of pages per cluster for non-resident + * attributes. + */ + nr_pages = 1; + if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) + nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; + /* Finally, perform the actual write. */ + last_vcn = -1; + if (likely(nr_segs == 1)) + buf = iov->iov_base; + do { + VCN vcn; + pgoff_t idx, start_idx; + unsigned ofs, do_pages, u; + size_t copied; + + start_idx = idx = pos >> PAGE_CACHE_SHIFT; + ofs = pos & ~PAGE_CACHE_MASK; + bytes = PAGE_CACHE_SIZE - ofs; + do_pages = 1; + if (nr_pages > 1) { + vcn = pos >> vol->cluster_size_bits; + if (vcn != last_vcn) { + last_vcn = vcn; + /* + * Get the lcn of the vcn the write is in. If + * it is a hole, need to lock down all pages in + * the cluster. 
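+			 *
+			 * Illustrative numbers: with 64kiB clusters and 4kiB
+			 * pages, nr_pages is 16 above, so a write landing in
+			 * a hole grabs and prepares all 16 pages of that
+			 * cluster (do_pages is set to nr_pages and start_idx
+			 * is rounded down to the cluster boundary below).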
+ */ + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> + vol->cluster_size_bits, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + status = -EIO; + if (lcn == LCN_ENOMEM) + status = -ENOMEM; + else + ntfs_error(vol->sb, "Cannot " + "perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because the attribute " + "is corrupt.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + break; + } + if (lcn == LCN_HOLE) { + start_idx = (pos & ~(s64) + vol->cluster_size_mask) + >> PAGE_CACHE_SHIFT; + bytes = vol->cluster_size - (pos & + vol->cluster_size_mask); + do_pages = nr_pages; + } + } + } + if (bytes > count) + bytes = count; + /* + * Bring in the user page(s) that we will copy from _first_. + * Otherwise there is a nasty deadlock on copying from the same + * page(s) as we are writing to, without it/them being marked + * up-to-date. Note, at present there is nothing to stop the + * pages being swapped out between us bringing them into memory + * and doing the actual copying. + */ + if (likely(nr_segs == 1)) + ntfs_fault_in_pages_readable(buf, bytes); + else + ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); + /* Get and lock @do_pages starting at index @start_idx. */ + status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, + pages, &cached_page); + if (unlikely(status)) + break; + /* + * For non-resident attributes, we need to fill any holes with + * actual clusters and ensure all bufferes are mapped. We also + * need to bring uptodate any buffers that are only partially + * being written to. + */ + if (NInoNonResident(ni)) { + status = ntfs_prepare_pages_for_non_resident_write( + pages, do_pages, pos, bytes); + if (unlikely(status)) { + loff_t i_size; + + do { + unlock_page(pages[--do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + /* + * The write preparation may have instantiated + * allocated space outside i_size. Trim this + * off again. We can ignore any errors in this + * case as we will just be waisting a bit of + * allocated space, which is not a disaster. + */ + i_size = i_size_read(vi); + if (pos + bytes > i_size) + vmtruncate(vi, i_size); + break; + } + } + u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; + if (likely(nr_segs == 1)) { + copied = ntfs_copy_from_user(pages + u, do_pages - u, + ofs, buf, bytes); + buf += copied; + } else + copied = ntfs_copy_from_user_iovec(pages + u, + do_pages - u, ofs, &iov, &iov_ofs, + bytes); + ntfs_flush_dcache_pages(pages + u, do_pages - u); + status = ntfs_commit_pages_after_write(pages, do_pages, pos, + bytes); + if (likely(!status)) { + written += copied; + count -= copied; + pos += copied; + if (unlikely(copied != bytes)) + status = -EFAULT; + } + do { + unlock_page(pages[--do_pages]); + mark_page_accessed(pages[do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + if (unlikely(status)) + break; + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (count); +err_out: + *ppos = pos; + if (cached_page) + page_cache_release(cached_page); + ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", + written ? "written" : "status", (unsigned long)written, + (long)status); + return written ? 
written : status; +} + +/** + * ntfs_file_aio_write_nolock - + */ +static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos; + size_t count; /* after file limit checks */ + ssize_t written, err; + + count = 0; + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + return err; + pos = *ppos; + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + /* We can write back this queue in page reclaim. */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + if (!count) + goto out; + err = file_remove_suid(file); + if (err) + goto out; + file_update_time(file); + written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, + count); +out: + current->backing_dev_info = NULL; + return written ? written : err; +} + +/** + * ntfs_file_aio_write - + */ +static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + mutex_lock(&inode->i_mutex); + ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} + +/** + * ntfs_file_fsync - sync a file to disk + * @filp: file to be synced + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync + * system calls. This function is inspired by fs/buffer.c::file_fsync(). + * + * If @datasync is false, write the mft record and all associated extent mft + * records as well as the $DATA attribute and then sync the block device. + * + * If @datasync is true and the attribute is non-resident, we skip the writing + * of the mft record and all associated extent mft records (this might still + * happen due to the write_inode_now() call). + * + * Also, if @datasync is true, we do not wait on the inode to be written out + * but we always wait on the page cache pages to be written out. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. + */ +static int ntfs_file_fsync(struct file *filp, int datasync) +{ + struct inode *vi = filp->f_mapping->host; + int err, ret = 0; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(S_ISDIR(vi->i_mode)); + if (!datasync || !NInoNonResident(NTFS_I(vi))) + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + /* + * NOTE: If we were to use mapping->private_list (see ext2 and + * fs/buffer.c) for dirty blocks then we could optimize the below to be + * sync_mapping_buffers(vi->i_mapping). + */ + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? 
"data" : "", vi->i_ino, -ret); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_file_ops = { + .llseek = generic_file_llseek, /* Seek inside file. */ + .read = do_sync_read, /* Read from file. */ + .aio_read = generic_file_aio_read, /* Async read from file. */ +#ifdef NTFS_RW + .write = do_sync_write, /* Write to file. */ + .aio_write = ntfs_file_aio_write, /* Async write to file. */ + /*.release = ,*/ /* Last file is closed. See + fs/ext2/file.c:: + ext2_release_file() for + how to use this to discard + preallocated space for + write opened files. */ + .fsync = ntfs_file_fsync, /* Sync a file to disk. */ + /*.aio_fsync = ,*/ /* Sync all outstanding async + i/o operations on a + kiocb. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .mmap = generic_file_mmap, /* Mmap file. */ + .open = ntfs_file_open, /* Open file. */ + .splice_read = generic_file_splice_read /* Zero-copy data send with + the data source being on + the ntfs partition. We do + not need to care about the + data destination. */ + /*.sendpage = ,*/ /* Zero-copy data send with + the data destination being + on the ntfs partition. We + do not need to care about + the data source. */ +}; + +const struct inode_operations ntfs_file_inode_ops = { +#ifdef NTFS_RW + .truncate = ntfs_truncate_vfs, + .setattr = ntfs_setattr, +#endif /* NTFS_RW */ +}; + +const struct file_operations ntfs_empty_file_ops = {}; + +const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c new file mode 100644 index 00000000..096c1356 --- /dev/null +++ b/fs/ntfs/index.c @@ -0,0 +1,454 @@ +/* + * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include "aops.h" +#include "collate.h" +#include "debug.h" +#include "index.h" +#include "ntfs.h" + +/** + * ntfs_index_ctx_get - allocate and initialize a new index context + * @idx_ni: ntfs index inode with which to initialize the context + * + * Allocate a new index context, initialize it with @idx_ni and return it. + * Return NULL if allocation failed. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) +{ + ntfs_index_context *ictx; + + ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); + if (ictx) + *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; + return ictx; +} + +/** + * ntfs_index_ctx_put - release an index context + * @ictx: index context to free + * + * Release the index context @ictx, releasing all associated resources. + * + * Locking: Caller must hold i_mutex on the index inode. 
+ */ +void ntfs_index_ctx_put(ntfs_index_context *ictx) +{ + if (ictx->entry) { + if (ictx->is_in_root) { + if (ictx->actx) + ntfs_attr_put_search_ctx(ictx->actx); + if (ictx->base_ni) + unmap_mft_record(ictx->base_ni); + } else { + struct page *page = ictx->page; + if (page) { + BUG_ON(!PageLocked(page)); + unlock_page(page); + ntfs_unmap_page(page); + } + } + } + kmem_cache_free(ntfs_index_ctx_cache, ictx); + return; +} + +/** + * ntfs_index_lookup - find a key in an index and return its index entry + * @key: [IN] key for which to search in the index + * @key_len: [IN] length of @key in bytes + * @ictx: [IN/OUT] context describing the index and the returned entry + * + * Before calling ntfs_index_lookup(), @ictx must have been obtained from a + * call to ntfs_index_ctx_get(). + * + * Look for the @key in the index specified by the index lookup context @ictx. + * ntfs_index_lookup() walks the contents of the index looking for the @key. + * + * If the @key is found in the index, 0 is returned and @ictx is setup to + * describe the index entry containing the matching @key. @ictx->entry is the + * index entry and @ictx->data and @ictx->data_len are the index entry data and + * its length in bytes, respectively. + * + * If the @key is not found in the index, -ENOENT is returned and @ictx is + * setup to describe the index entry whose key collates immediately after the + * search @key, i.e. this is the position in the index at which an index entry + * with a key of @key would need to be inserted. + * + * If an error occurs return the negative error code and @ictx is left + * untouched. + * + * When finished with the entry and its data, call ntfs_index_ctx_put() to free + * the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + * + * Locking: - Caller must hold i_mutex on the index inode. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx) +{ + VCN vcn, old_vcn; + ntfs_inode *idx_ni = ictx->idx_ni; + ntfs_volume *vol = idx_ni->vol; + struct super_block *sb = vol->sb; + ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end, *kaddr; + ntfs_attr_search_ctx *actx; + struct address_space *ia_mapping; + struct page *page; + int rc, err = 0; + + ntfs_debug("Entering."); + BUG_ON(!NInoAttr(idx_ni)); + BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); + BUG_ON(idx_ni->nr_extents != -1); + BUG_ON(!base_ni); + BUG_ON(!key); + BUG_ON(key_len <= 0); + if (!ntfs_is_collation_rule_supported( + idx_ni->itype.index.collation_rule)) { + ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " + "Aborting lookup.", le32_to_cpu( + idx_ni->itype.index.collation_rule)); + return -EOPNOTSUPP; + } + /* Get hold of the mft record for the index inode. 
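+	 * (The lookup starts from the AT_INDEX_ROOT attribute, which is
+	 * searched for via the base inode, so it is base_ni's mft record
+	 * that gets mapped here.)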
*/ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return PTR_ERR(m); + } + actx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!actx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, actx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in inode " + "0x%lx.", idx_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it has been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)actx->attr + + le16_to_cpu(actx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) + goto idx_err_out; + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) + goto idx_err_out; + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ir_done: + ictx->is_in_root = true; + ictx->ir = ir; + ictx->actx = actx; + ictx->base_ni = base_ni; + ictx->ia = NULL; + ictx->page = NULL; +done: + ictx->entry = ie; + ictx->data = (u8*)ie + + le16_to_cpu(ie->data.vi.data_offset); + ictx->data_len = le16_to_cpu(ie->data.vi.data_length); + ntfs_debug("Done."); + return err; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ir_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present setup @ictx and return + * -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ir_done; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(idx_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. 
Inode 0x%lx is corrupt or " + "driver bug.", idx_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(idx_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(actx); + unmap_mft_record(base_ni); + m = NULL; + actx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt inode " + "0x%lx or driver bug.", idx_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " + "Corrupt inode 0x%lx. Run chkdsk.", + (long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). Inode " + "0x%lx is corrupt or driver bug.", + (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " + "a size (%u) differing from the index " + "specified size (%u). Inode is corrupt or " + "driver bug.", (unsigned long long)vcn, + idx_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + idx_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + idx_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " + "crosses page boundary. Impossible! Cannot " + "access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + idx_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " + "0x%lx exceeds maximum size.", + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. 
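+		 * (Essentially the same three checks as in the index root
+		 * loop above: the entry must start inside the index block,
+		 * its fixed-size header must fit before index_end, and its
+		 * stated length must not run past index_end.)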
*/ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ia_done: + ictx->is_in_root = false; + ictx->actx = NULL; + ictx->base_ni = NULL; + ictx->ia = ia; + ictx->page = page; + goto done; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ia_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node and if not present return -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ia_done; + } + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in a leaf " + "node in inode 0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* + * If vcn is in the same page cache page as old_vcn we recycle + * the mapped page. + */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", + idx_ni->mft_no); +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (actx) + ntfs_attr_put_search_ctx(actx); + if (m) + unmap_mft_record(base_ni); + return err; +idx_err_out: + ntfs_error(sb, "Corrupt index. Aborting lookup."); + goto err_out; +} diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h new file mode 100644 index 00000000..8745469c --- /dev/null +++ b/fs/ntfs/index.h @@ -0,0 +1,148 @@ +/* + * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS + * project. 
+ * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_INDEX_H +#define _LINUX_NTFS_INDEX_H + +#include + +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "attrib.h" +#include "mft.h" +#include "aops.h" + +/** + * @idx_ni: index inode containing the @entry described by this context + * @entry: index entry (points into @ir or @ia) + * @data: index entry data (points into @entry) + * @data_len: length in bytes of @data + * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia + * @ir: index root if @is_in_root and NULL otherwise + * @actx: attribute search context if @is_in_root and NULL otherwise + * @base_ni: base inode if @is_in_root and NULL otherwise + * @ia: index block if @is_in_root is 'false' and NULL otherwise + * @page: page if @is_in_root is 'false' and NULL otherwise + * + * @idx_ni is the index inode this context belongs to. + * + * @entry is the index entry described by this context. @data and @data_len + * are the index entry data and its length in bytes, respectively. @data + * simply points into @entry. This is probably what the user is interested in. + * + * If @is_in_root is 'true', @entry is in the index root attribute @ir described + * by the attribute search context @actx and the base inode @base_ni. @ia and + * @page are NULL in this case. + * + * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia + * and @page point to the index allocation block and the mapped, locked page it + * is in, respectively. @ir, @actx and @base_ni are NULL in this case. + * + * To obtain a context call ntfs_index_ctx_get(). + * + * We use this context to allow ntfs_index_lookup() to return the found index + * @entry and its @data without having to allocate a buffer and copy the @entry + * and/or its @data into it. + * + * When finished with the @entry and its @data, call ntfs_index_ctx_put() to + * free the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. 
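+ *
+ * A minimal usage sketch (assuming @idx_ni and a key/key_len are already at
+ * hand; error handling omitted):
+ *
+ *	ictx = ntfs_index_ctx_get(idx_ni);
+ *	err = ntfs_index_lookup(key, key_len, ictx);
+ *	if (!err) {
+ *		... use ictx->entry, ictx->data and ictx->data_len ...
+ *	}
+ *	ntfs_index_ctx_put(ictx);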
+ */ +typedef struct { + ntfs_inode *idx_ni; + INDEX_ENTRY *entry; + void *data; + u16 data_len; + bool is_in_root; + INDEX_ROOT *ir; + ntfs_attr_search_ctx *actx; + ntfs_inode *base_ni; + INDEX_ALLOCATION *ia; + struct page *page; +} ntfs_index_context; + +extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); +extern void ntfs_index_ctx_put(ntfs_index_context *ictx); + +extern int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx); + +#ifdef NTFS_RW + +/** + * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries + * @ictx: ntfs index context describing the index entry + * + * Call flush_dcache_page() for the page in which an index entry resides. + * + * This must be called every time an index entry is modified, just after the + * modification. + * + * If the index entry is in the index root attribute, simply flush the page + * containing the mft record containing the index root attribute. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, simply flush the page cache page containing the index block. + */ +static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + flush_dcache_mft_record_page(ictx->actx->ntfs_ino); + else + flush_dcache_page(ictx->page); +} + +/** + * ntfs_index_entry_mark_dirty - mark an index entry dirty + * @ictx: ntfs index context describing the index entry + * + * Mark the index entry described by the index entry context @ictx dirty. + * + * If the index entry is in the index root attribute, simply mark the mft + * record containing the index root attribute dirty. This ensures the mft + * record, and hence the index root attribute, will be written out to disk + * later. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, mark the buffers belonging to the index record as well as the + * page cache page the index block is in dirty. This automatically marks the + * VFS inode of the ntfs index inode to which the index entry belongs dirty, + * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the + * dirty index block, will be written out to disk later. + */ +static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + mark_mft_record_dirty(ictx->actx->ntfs_ino); + else + mark_ntfs_record_dirty(ictx->page, + (u8*)ictx->ia - (u8*)page_address(ictx->page)); +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c new file mode 100644 index 00000000..c05d6dcf --- /dev/null +++ b/fs/ntfs/inode.c @@ -0,0 +1,3112 @@ +/** + * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "bitmap.h" +#include "dir.h" +#include "debug.h" +#include "inode.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "time.h" +#include "ntfs.h" + +/** + * ntfs_test_inode - compare two (possibly fake) inodes for equality + * @vi: vfs inode which to test + * @na: ntfs attribute which is being tested with + * + * Compare the ntfs attribute embedded in the ntfs specific part of the vfs + * inode @vi for equality with the ntfs attribute @na. + * + * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. + * @na->name and @na->name_len are then ignored. + * + * Return 1 if the attributes match and 0 if not. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. + */ +int ntfs_test_inode(struct inode *vi, ntfs_attr *na) +{ + ntfs_inode *ni; + + if (vi->i_ino != na->mft_no) + return 0; + ni = NTFS_I(vi); + /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ + if (likely(!NInoAttr(ni))) { + /* If not looking for a normal inode this is a mismatch. */ + if (unlikely(na->type != AT_UNUSED)) + return 0; + } else { + /* A fake inode describing an attribute. */ + if (ni->type != na->type) + return 0; + if (ni->name_len != na->name_len) + return 0; + if (na->name_len && memcmp(ni->name, na->name, + na->name_len * sizeof(ntfschar))) + return 0; + } + /* Match! */ + return 1; +} + +/** + * ntfs_init_locked_inode - initialize an inode + * @vi: vfs inode to initialize + * @na: ntfs attribute which to initialize @vi to + * + * Initialize the vfs inode @vi with the values from the ntfs attribute @na in + * order to enable ntfs_test_inode() to do its work. + * + * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. + * In that case, @na->name and @na->name_len should be set to NULL and 0, + * respectively. Although that is not strictly necessary as + * ntfs_read_locked_inode() will fill them in later. + * + * Return 0 on success and -errno on error. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. (Hence the GFP_ATOMIC allocation.) + */ +static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) +{ + ntfs_inode *ni = NTFS_I(vi); + + vi->i_ino = na->mft_no; + + ni->type = na->type; + if (na->type == AT_INDEX_ALLOCATION) + NInoSetMstProtected(ni); + + ni->name = na->name; + ni->name_len = na->name_len; + + /* If initializing a normal inode, we are done. */ + if (likely(na->type == AT_UNUSED)) { + BUG_ON(na->name); + BUG_ON(na->name_len); + return 0; + } + + /* It is a fake inode. */ + NInoSetAttr(ni); + + /* + * We have I30 global constant as an optimization as it is the name + * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC + * allocation but that is ok. And most attributes are unnamed anyway, + * thus the fraction of named attributes with name != I30 is actually + * absolutely tiny. 
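+ * (I30 is the global constant holding the Unicode name "$I30" under which
+ * the directory index attributes are stored, which is why it covers nearly
+ * all named attributes encountered in practice.)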
+ */ + if (na->name_len && na->name != I30) { + unsigned int i; + + BUG_ON(!na->name); + i = na->name_len * sizeof(ntfschar); + ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); + if (!ni->name) + return -ENOMEM; + memcpy(ni->name, na->name, i); + ni->name[na->name_len] = 0; + } + return 0; +} + +typedef int (*set_t)(struct inode *, void *); +static int ntfs_read_locked_inode(struct inode *vi); +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); +static int ntfs_read_locked_index_inode(struct inode *base_vi, + struct inode *vi); + +/** + * ntfs_iget - obtain a struct inode corresponding to a specific normal inode + * @sb: super block of mounted volume + * @mft_no: mft record number / inode number to obtain + * + * Obtain the struct inode corresponding to a specific normal inode (i.e. a + * file or directory). + * + * If the inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and initialized, + * and finally ntfs_read_locked_inode() is called to read in the inode and + * fill in the remainder of the inode structure. + * + * Return the struct inode on success. Check the return value with IS_ERR() and + * if true, the function failed and the error code is obtained from PTR_ERR(). + */ +struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = mft_no; + na.type = AT_UNUSED; + na.name = NULL; + na.name_len = 0; + + vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_inode(vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad inodes around if the failure was + * due to ENOMEM. We want to be able to retry again later. + */ + if (unlikely(err == -ENOMEM)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_attr_iget - obtain a struct inode corresponding to an attribute + * @base_vi: vfs base inode containing the attribute + * @type: attribute type + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * + * Obtain the (fake) struct inode corresponding to the attribute specified by + * @type, @name, and @name_len, which is present in the base mft record + * specified by the vfs inode @base_vi. + * + * If the attribute inode is in the cache, it is just returned with an + * increased reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_attr_inode() is called to read the + * attribute and fill in the inode structure. + * + * Note, for index allocation attributes, you need to use ntfs_index_iget() + * instead of ntfs_attr_iget() as working with indices is a lot more complex. + * + * Return the struct inode of the attribute inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + /* Make sure no one calls ntfs_attr_iget() for indices. 
*/ + BUG_ON(type == AT_INDEX_ALLOCATION); + + na.mft_no = base_vi->i_ino; + na.type = type; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_attr_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad attribute inodes around. This also + * simplifies things in that we never need to check for bad attribute + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_index_iget - obtain a struct inode corresponding to an index + * @base_vi: vfs base inode containing the index related attributes + * @name: Unicode name of the index + * @name_len: length of @name in Unicode characters + * + * Obtain the (fake) struct inode corresponding to the index specified by @name + * and @name_len, which is present in the base mft record specified by the vfs + * inode @base_vi. + * + * If the index inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_index_inode() is called to read + * the index related attributes and fill in the inode structure. + * + * Return the struct inode of the index inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = base_vi->i_ino; + na.type = AT_INDEX_ALLOCATION; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_index_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad index inodes around. This also + * simplifies things in that we never need to check for bad index + * inodes elsewhere. 
+ */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +struct inode *ntfs_alloc_big_inode(struct super_block *sb) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return VFS_I(ni); + } + ntfs_error(sb, "Allocation of NTFS big inode structure failed."); + return NULL; +} + +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + +void ntfs_destroy_big_inode(struct inode *inode) +{ + ntfs_inode *ni = NTFS_I(inode); + + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + call_rcu(&inode->i_rcu, ntfs_i_callback); +} + +static inline ntfs_inode *ntfs_alloc_extent_inode(void) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return ni; + } + ntfs_error(NULL, "Allocation of NTFS inode structure failed."); + return NULL; +} + +static void ntfs_destroy_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + kmem_cache_free(ntfs_inode_cache, ni); +} + +/* + * The attribute runlist lock has separate locking rules from the + * normal runlist lock, so split the two lock-classes: + */ +static struct lock_class_key attr_list_rl_lock_class; + +/** + * __ntfs_init_inode - initialize ntfs specific part of an inode + * @sb: super block of mounted volume + * @ni: freshly allocated ntfs inode which to initialize + * + * Initialize an ntfs inode to defaults. + * + * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left + * untouched. Make sure to initialize them elsewhere. + * + * Return zero on success and -ENOMEM on error. + */ +void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) +{ + ntfs_debug("Entering."); + rwlock_init(&ni->size_lock); + ni->initialized_size = ni->allocated_size = 0; + ni->seq_no = 0; + atomic_set(&ni->count, 1); + ni->vol = NTFS_SB(sb); + ntfs_init_runlist(&ni->runlist); + mutex_init(&ni->mrec_lock); + ni->page = NULL; + ni->page_ofs = 0; + ni->attr_list_size = 0; + ni->attr_list = NULL; + ntfs_init_runlist(&ni->attr_list_rl); + lockdep_set_class(&ni->attr_list_rl.lock, + &attr_list_rl_lock_class); + ni->itype.index.block_size = 0; + ni->itype.index.vcn_size = 0; + ni->itype.index.collation_rule = 0; + ni->itype.index.block_size_bits = 0; + ni->itype.index.vcn_size_bits = 0; + mutex_init(&ni->extent_lock); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; +} + +/* + * Extent inodes get MFT-mapped in a nested way, while the base inode + * is still mapped. 
Teach this nesting to the lock validator by creating + * a separate class for nested inode's mrec_lock's: + */ +static struct lock_class_key extent_inode_mrec_lock_key; + +inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no) +{ + ntfs_inode *ni = ntfs_alloc_extent_inode(); + + ntfs_debug("Entering."); + if (likely(ni != NULL)) { + __ntfs_init_inode(sb, ni); + lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); + ni->mft_no = mft_no; + ni->type = AT_UNUSED; + ni->name = NULL; + ni->name_len = 0; + } + return ni; +} + +/** + * ntfs_is_extended_system_file - check if a file is in the $Extend directory + * @ctx: initialized attribute search context + * + * Search all file name attributes in the inode described by the attribute + * search context @ctx and check if any of the names are in the $Extend system + * directory. + * + * Return values: + * 1: file is in $Extend directory + * 0: file is not in $Extend directory + * -errno: failed to determine if the file is in the $Extend directory + */ +static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) +{ + int nr_links, err; + + /* Restart search. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Get number of hard links. */ + nr_links = le16_to_cpu(ctx->mrec->link_count); + + /* Loop through all hard links. */ + while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, + ctx))) { + FILE_NAME_ATTR *file_name_attr; + ATTR_RECORD *attr = ctx->attr; + u8 *p, *p2; + + nr_links--; + /* + * Maximum sanity checking as we are called on an inode that + * we suspect might be corrupt. + */ + p = (u8*)attr + le32_to_cpu(attr->length); + if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_in_use)) { +err_corrupt_attr: + ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " + "attribute. You should run chkdsk."); + return -EIO; + } + if (attr->non_resident) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " + "name. You should run chkdsk."); + return -EIO; + } + if (attr->flags) { + ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " + "invalid flags. You should run " + "chkdsk."); + return -EIO; + } + if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " + "name. You should run chkdsk."); + return -EIO; + } + file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + + le16_to_cpu(attr->data.resident.value_offset)); + p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length); + if (p2 < (u8*)attr || p2 > p) + goto err_corrupt_attr; + /* This attribute is ok, but is it in the $Extend directory? */ + if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) + return 1; /* YES, it's an extended system file. */ + } + if (unlikely(err != -ENOENT)) + return err; + if (unlikely(nr_links)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " + "doesn't match number of name attributes. You " + "should run chkdsk."); + return -EIO; + } + return 0; /* NO, it is not an extended system file. */ +} + +/** + * ntfs_read_locked_inode - read an inode from its device + * @vi: inode to read + * + * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode + * described by @vi into memory from the device. + * + * The only fields in @vi that we need to/can look at when the function is + * called are i_sb, pointing to the mounted device's super block, and i_ino, + * the number of the inode to load. 
+ * + * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino + * for reading and sets up the necessary @vi fields as well as initializing + * the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * i_flags is set to 0 and we have no business touching it. Only an ioctl() + * is allowed to write to them. We should of course be honouring them but + * we need to do that using the IS_* macros defined in include/linux/fs.h. + * In any case ntfs_read_locked_inode() has nothing to do with i_flags. + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_inode(struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + STANDARD_INFORMATION *si; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + /* Setup the generic vfs inode parts now. */ + + /* + * This is for checking whether an inode has changed w.r.t. a file so + * that the file can be updated if necessary (compare with f_version). + */ + vi->i_version = 1; + + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + vi->i_mode = 0; + + /* + * Initialize the ntfs specific part of @vi special casing + * FILE_MFT which we need to do at mount time. + */ + if (vi->i_ino != FILE_MFT) + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + + if (!(m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vi->i_sb, "Inode is not in use!"); + goto unm_err_out; + } + if (m->base_mft_record) { + ntfs_error(vi->i_sb, "Inode is an extent inode!"); + goto unm_err_out; + } + + /* Transfer information from mft record into vfs and ntfs inodes. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* + * FIXME: Keep in mind that link_count is two for files which have both + * a long file name and a short file name as separate entries, so if + * we are hiding short file names this will be too high. Either we need + * to account for the short file names by subtracting them or we need + * to make sure we delete files even though i_nlink is not zero which + * might be tricky due to vfs interactions. Need to think about this + * some more when implementing the unlink command. + */ + vi->i_nlink = le16_to_cpu(m->link_count); + /* + * FIXME: Reparse points can have the directory bit set even though + * they would be S_IFLNK. Need to deal with this further below when we + * implement reparse points / symbolic links but it will do for now. + * Also if not a directory, it could be something else, rather than + * a regular file. But again, will do for now. + */ + /* Everyone gets all permissions. */ + vi->i_mode |= S_IRWXUGO; + /* If read-only, no one gets write permissions. */ + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + if (m->flags & MFT_RECORD_IS_DIRECTORY) { + vi->i_mode |= S_IFDIR; + /* + * Apply the directory permissions mask set in the mount + * options. + */ + vi->i_mode &= ~vol->dmask; + /* Things break without this kludge! */ + if (vi->i_nlink > 1) + vi->i_nlink = 1; + } else { + vi->i_mode |= S_IFREG; + /* Apply the file permissions mask set in the mount options. 
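+ * (Purely as a worked example: starting from the S_IRWXUGO base mode of 0777
+ * set above, a hypothetical mount with fmask=022 would leave a regular file
+ * with mode 0777 & ~022 = 0755.)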
*/ + vi->i_mode &= ~vol->fmask; + } + /* + * Find the standard information attribute in the mft record. At this + * stage we haven't setup the attribute list stuff yet, so this could + * in fact fail if the standard information is in an extent record, but + * I don't think this actually ever happens. + */ + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + /* + * TODO: We should be performing a hot fix here (if the + * recover mount option is set) by creating a new + * attribute. + */ + ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Get the standard information attribute value. */ + si = (STANDARD_INFORMATION*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + + /* Transfer information from the standard information into vi. */ + /* + * Note: The i_?times do not quite map perfectly onto the NTFS times, + * but they are close enough, and in the end it doesn't really matter + * that much... + */ + /* + * mtime is the last change of the data within the file. Not changed + * when only metadata is changed, e.g. a rename doesn't affect mtime. + */ + vi->i_mtime = ntfs2utc(si->last_data_change_time); + /* + * ctime is the last change of the metadata of the file. This obviously + * always changes, when mtime is changed. ctime can be changed on its + * own, mtime is then not changed, e.g. when a file is renamed. + */ + vi->i_ctime = ntfs2utc(si->last_mft_change_time); + /* + * Last access to the data within the file. Not changed during a rename + * for example but changed whenever the file is written to. + */ + vi->i_atime = ntfs2utc(si->last_access_time); + + /* Find the attribute list attribute if present. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(vi->i_sb, "Failed to lookup attribute list " + "attribute."); + goto unm_err_out; + } + } else /* if (!err) */ { + if (vi->i_ino == FILE_MFT) + goto skip_attr_list_load; + ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Attribute list attribute is " + "compressed."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(vi->i_sb, "Not enough memory to allocate " + "buffer for attribute list."); + err = -ENOMEM; + goto unm_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "Attribute list has non " + "zero lowest_vcn."); + goto unm_err_out; + } + /* + * Setup the runlist. No need for locking as we have + * exclusive access to the inode at this time. 
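+ * Loosely speaking, ntfs_mapping_pairs_decompress() below turns the on-disk
+ * mapping pairs array of this attribute into an in-memory runlist, i.e. an
+ * array of elements each mapping a contiguous run of the attribute's virtual
+ * clusters to logical clusters on the volume.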
+ */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(vi->i_sb, "Mapping pairs " + "decompression failed."); + goto unm_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data.non_resident. + initialized_size)))) { + ntfs_error(vi->i_sb, "Failed to load " + "attribute list attribute."); + goto unm_err_out; + } + } else /* if (!a->non_resident) */ { + if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt attribute list " + "in inode."); + goto unm_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + } +skip_attr_list_load: + /* + * If an attribute list is present we now have the attribute list value + * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. + */ + if (S_ISDIR(vi->i_mode)) { + loff_t bvi_size; + ntfs_inode *bni; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + + /* It is a directory, find index root attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + // FIXME: File is corrupt! Hot-fix with empty + // index root attribute if recovery option is + // set. + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " + "resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " + "placed after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted index root just means that the newly + * created files in that directory should be created compressed/ + * encrypted. However index root cannot be both compressed and + * encrypted. 
+ */ + if (a->flags & ATTR_COMPRESSION_MASK) + NInoSetCompressed(ni); + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + ir = (INDEX_ROOT*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Directory index is corrupt."); + goto unm_err_out; + } + if (ir->type != AT_FILE_NAME) { + ntfs_error(vi->i_sb, "Indexed attribute is not " + "$FILE_NAME."); + goto unm_err_out; + } + if (ir->collation_rule != COLLATION_FILE_NAME) { + ntfs_error(vi->i_sb, "Index collation rule is not " + "COLLATION_FILE_NAME."); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (ni->itype.index.block_size & + (ni->itype.index.block_size - 1)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a " + "power of two.", + ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > " + "PAGE_CACHE_SIZE (%ld) is not " + "supported. Sorry.", + ni->itype.index.block_size, + PAGE_CACHE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < " + "NTFS_BLOCK_SIZE (%i) is not " + "supported. Sorry.", + ni->itype.index.block_size, + NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = + ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the directory index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + + /* Setup the index allocation attribute, even if not present. */ + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + goto skip_large_dir_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. 
*/ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " + "attribute is not present but " + "$INDEX_ROOT indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION " + "attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " + "is placed after the mapping pairs " + "array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of " + "$INDEX_ALLOCATION attribute has non " + "zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " + "and/or encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> + ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " + "for index allocation (0x%llx).", + bvi_size << 3, vi->i_size); + goto iput_unm_err_out; + } + /* No longer need the bitmap attribute inode. */ + iput(bvi); +skip_large_dir_stuff: + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_dir_inode_ops; + vi->i_fop = &ntfs_dir_ops; + } else { + /* It is a file. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Setup the data attribute, even if not present. */ + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + + /* Find first extent of the unnamed data attribute. */ + err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); + if (unlikely(err)) { + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + if (err != -ENOENT) { + ntfs_error(vi->i_sb, "Failed to lookup $DATA " + "attribute."); + goto unm_err_out; + } + /* + * FILE_Secure does not have an unnamed $DATA + * attribute, so we special case it here. 
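+ * (Assumed background, not stated here: $Secure keeps its data in the named
+ * $SDS data stream and in the $SDH and $SII indexes rather than in an
+ * unnamed $DATA attribute.)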
+ */ + if (vi->i_ino == FILE_Secure) + goto no_data_attr_special_case; + /* + * Most if not all the system files in the $Extend + * system directory do not have unnamed data + * attributes so we need to check if the parent + * directory of the file is FILE_Extend and if it is + * ignore this error. To do this we need to get the + * name of this inode from the mft record as the name + * contains the back reference to the parent directory. + */ + if (ntfs_is_extended_system_file(ctx) > 0) + goto no_data_attr_special_case; + // FIXME: File is corrupt! Hot-fix with empty data + // attribute if recovery option is set. + ntfs_error(vi->i_sb, "$DATA attribute is missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Setup the state. */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found " + "compressed data but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) + != ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method " + "or corrupt file."); + goto unm_err_out; + } + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed data."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->non_resident) { + NInoSetNonResident(ni); + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found " + "non-standard " + "compression unit (%u " + "instead of 4). " + "Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype. + compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = + 1U << a->data. + non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = + 0; + ni->itype.compressed.block_clusters = + 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident. + compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } else { /* Resident attribute. */ + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu( + a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident data attribute " + "is corrupt (size exceeds " + "allocation)."); + goto unm_err_out; + } + } +no_data_attr_special_case: + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Setup the operations for this inode. 
*/ + vi->i_op = &ntfs_file_inode_ops; + vi->i_fop = &ntfs_file_ops; + } + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else + vi->i_mapping->a_ops = &ntfs_aops; + /* + * The number of 512-byte blocks used on disk (for stat). This is in so + * far inaccurate as it doesn't account for any named streams or other + * special non-resident attributes, but that is how Windows works, too, + * so we are at least consistent with Windows, if not entirely + * consistent with the Linux Way. Doing it the Linux Way would cause a + * significant slowdown as it would involve iterating over all + * attributes in the mft record and adding the allocated/compressed + * sizes of all non-resident attributes present to give us the Linux + * correct size that should go into i_blocks (after division by 512). + */ + if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " + "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_attr_inode - read an attribute inode from its base inode + * @base_vi: base inode + * @vi: attribute inode to read + * + * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the + * attribute inode described by @vi into memory from the base mft record + * described by @base_ni. + * + * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for + * reading and looks up the attribute described by @vi before setting up the + * necessary fields in @vi as well as initializing the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + * + * Note this cannot be called for AT_INDEX_ALLOCATION. + */ +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + + /* Just mirror the values from the base inode. */ + vi->i_version = base_vi->i_version; + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + vi->i_nlink = base_vi->i_nlink; + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the attribute. 
*/ + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto unm_err_out; + a = ctx->attr; + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if ((ni->type != AT_DATA) || (ni->type == AT_DATA && + ni->name_len)) { + ntfs_error(vi->i_sb, "Found compressed " + "non-data or named data " + "attribute. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto unm_err_out; + } + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found compressed " + "attribute but compression is " + "disabled due to cluster size " + "(%i) > 4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) != + ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method."); + goto unm_err_out; + } + } + /* + * The compressed/sparse flag set in an index root just means + * to compress all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is %s. Please " + "report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net", + NInoCompressed(ni) ? "compressed" : + "sparse"); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and compressed " + "data."); + goto unm_err_out; + } + /* + * The encryption flag set in an index root just means to + * encrypt all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is encrypted. " + "Please report you saw this message " + "to linux-ntfs-dev@lists.sourceforge." + "net"); + goto unm_err_out; + } + if (ni->type != AT_DATA) { + ntfs_error(vi->i_sb, "Found encrypted non-data " + "attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (!a->non_resident) { + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the attribute value."); + goto unm_err_out; + } + if (NInoMstProtected(ni)) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is resident. " + "Please report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net"); + goto unm_err_out; + } + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident attribute is corrupt " + "(size exceeds allocation)."); + goto unm_err_out; + } + } else { + NInoSetNonResident(ni); + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the mapping pairs array."); + goto unm_err_out; + } + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found non-standard " + "compression unit (%u instead " + "of 4). 
Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident.compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of attribute has " + "non-zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else + vi->i_mapping->a_ops = &ntfs_aops; + if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode does not go away and attach it to the + * attribute inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + + ntfs_debug("Done."); + return 0; + +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i while reading attribute " + "inode (mft_no 0x%lx, type 0x%x, name_len %i). " + "Marking corrupt inode and base inode 0x%lx as bad. " + "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, + base_vi->i_ino); + make_bad_inode(vi); + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_index_inode - read an index inode from its base inode + * @base_vi: base inode + * @vi: index inode to read + * + * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the + * index inode described by @vi into memory from the base mft record described + * by @base_ni. + * + * ntfs_read_locked_index_inode() maps, pins and locks the base inode for + * reading and looks up the attributes relating to the index described by @vi + * before setting up the necessary fields in @vi as well as initializing the + * ntfs inode. + * + * Note, index inodes are essentially attribute inodes (NInoAttr() is true) + * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they + * are setup like directory inodes since directories are a special case of + * indices ao they need to be treated in much the same way. Most importantly, + * for small indices the index allocation attribute might not actually exist. + * However, the index root attribute always exists but this does not need to + * have an inode associated with it and this is why we define a new inode type + * index. Also, like for directories, we need to have an attribute inode for + * the bitmap attribute corresponding to the index allocation attribute and we + * can store this in the appropriate field of the inode, just like we do for + * normal directory inodes. + * + * Q: What locks are held when the function is called? 
+ * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) +{ + loff_t bvi_size; + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni, *bni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + /* Just mirror the values from the base inode. */ + vi->i_version = base_vi->i_version; + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + vi->i_nlink = base_vi->i_nlink; + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + /* Map the mft record for the base inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the index root attribute. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " + "after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted/sparse index root is not allowed, except for + * directories of course but those are not dealt with here. 
+ */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | + ATTR_IS_SPARSE)) { + ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " + "root attribute."); + goto unm_err_out; + } + ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Index is corrupt."); + goto unm_err_out; + } + if (ir->type) { + ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", + le32_to_cpu(ir->type)); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ntfs_debug("Index collation rule is 0x%x.", + le32_to_cpu(ir->collation_rule)); + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (!is_power_of_2(ni->itype.index.block_size)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " + "two.", ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE " + "(%ld) is not supported. Sorry.", + ni->itype.index.block_size, PAGE_CACHE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " + "(%i) is not supported. Sorry.", + ni->itype.index.block_size, NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + /* Check for presence of index allocation attribute. */ + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + goto skip_large_index_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "not present but $INDEX_ROOT " + "indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs array. 
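+ * (In a well-formed non-resident attribute record the Unicode name, if any,
+ * sits between the attribute header and the mapping pairs array, so a name
+ * starting at or beyond mapping_pairs_offset indicates corruption.)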
+ */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " + "placed after the mapping pairs array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " + "attribute has non zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or " + "encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " + "index allocation (0x%llx).", bvi_size << 3, + vi->i_size); + goto iput_unm_err_out; + } + iput(bvi); +skip_large_index_stuff: + /* Setup the operations for this index inode. */ + vi->i_op = NULL; + vi->i_fop = NULL; + vi->i_mapping->a_ops = &ntfs_mst_aops; + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode doesn't go away and attach it to the + * index inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); +err_out: + ntfs_error(vi->i_sb, "Failed with error code %i while reading index " + "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino, + ni->name_len); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/* + * The MFT inode has special locking, so teach the lock validator + * about this by splitting off the locking rules of the MFT from + * the locking rules of other inodes. The MFT inode can never be + * accessed from the VFS side (or even internally), only by the + * map_mft functions. + */ +static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key; + +/** + * ntfs_read_inode_mount - special read_inode for mount time use only + * @vi: inode to read + * + * Read inode FILE_MFT at mount time, only called with super_block lock + * held from within the read_super() code path. 
+ * + * This function exists because when it is called the page cache for $MFT/$DATA + * is not initialized and hence we cannot get at the contents of mft records + * by calling map_mft_record*(). + * + * Further it needs to cope with the circular references problem, i.e. cannot + * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because + * we do not know where the other extent mft records are yet and again, because + * we cannot call map_mft_record*() yet. Obviously this applies only when an + * attribute list is actually present in $MFT inode. + * + * We solve these problems by starting with the $DATA attribute before anything + * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each + * extent is found, we ntfs_mapping_pairs_decompress() including the implied + * ntfs_runlists_merge(). Each step of the iteration necessarily provides + * sufficient information for the next step to complete. + * + * This should work but there are two possible pit falls (see inline comments + * below), but only time will tell if they are real pits or just smoke... + */ +int ntfs_read_inode_mount(struct inode *vi) +{ + VCN next_vcn, last_vcn, highest_vcn; + s64 block; + struct super_block *sb = vi->i_sb; + ntfs_volume *vol = NTFS_SB(sb); + struct buffer_head *bh; + ntfs_inode *ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + unsigned int i, nr_blocks; + int err; + + ntfs_debug("Entering."); + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + + /* Setup the data attribute. It is special as it is mst protected. */ + NInoSetNonResident(ni); + NInoSetMstProtected(ni); + NInoSetSparseDisabled(ni); + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + /* + * This sets up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + ni->itype.index.block_size = vol->mft_record_size; + ni->itype.index.block_size_bits = vol->mft_record_size_bits; + + /* Very important! Needed to be able to call map_mft_record*(). */ + vol->mft_ino = vi; + + /* Allocate enough memory to read the first mft record. */ + if (vol->mft_record_size > 64 * 1024) { + ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", + vol->mft_record_size); + goto err_out; + } + i = vol->mft_record_size; + if (i < sb->s_blocksize) + i = sb->s_blocksize; + m = (MFT_RECORD*)ntfs_malloc_nofs(i); + if (!m) { + ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); + goto err_out; + } + + /* Determine the first block of the $MFT/$DATA attribute. */ + block = vol->mft_lcn << vol->cluster_size_bits >> + sb->s_blocksize_bits; + nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; + if (!nr_blocks) + nr_blocks = 1; + + /* Load $MFT/$DATA's first mft record. */ + for (i = 0; i < nr_blocks; i++) { + bh = sb_bread(sb, block++); + if (!bh) { + ntfs_error(sb, "Device read failed."); + goto err_out; + } + memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, + sb->s_blocksize); + brelse(bh); + } + + /* Apply the mst fixups. */ + if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { + /* FIXME: Try to use the $MFTMirr now. */ + ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); + goto err_out; + } + + /* Need this to sanity check attribute list references to $MFT. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* Provides readpage() and sync_page() for map_mft_record(). 
*/ + vi->i_mapping->a_ops = &ntfs_mst_aops; + + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + + /* Find the attribute list attribute if present. */ + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(sb, "Failed to lookup attribute list " + "attribute. You should run chkdsk."); + goto put_err_out; + } + } else /* if (!err) */ { + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. " + "You should run chkdsk."; + + ntfs_debug("Attribute list attribute found in $MFT."); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(sb, "Attribute list attribute is " + "compressed.%s", es); + goto put_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(sb, "Not enough memory to allocate buffer " + "for attribute list."); + goto put_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "Attribute list has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Setup the runlist. */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(sb, "Mapping pairs decompression " + "failed with error code %i.", + -err); + goto put_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data. + non_resident.initialized_size)))) { + ntfs_error(sb, "Failed to load attribute list " + "attribute with error code %i.", + -err); + goto put_err_out; + } + } else /* if (!ctx.attr->non_resident) */ { + if ((u8*)a + le16_to_cpu( + a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(sb, "Corrupt attribute list " + "attribute."); + goto put_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + /* The attribute list is now setup in memory. */ + /* + * FIXME: I don't know if this case is actually possible. + * According to logic it is not possible but I have seen too + * many weird things in MS software to rely on logic... Thus we + * perform a manual search and make sure the first $MFT/$DATA + * extent is in the base inode. If it is not we abort with an + * error and if we ever see a report of this error we will need + * to do some magic in order to have the necessary mft record + * loaded and in the right place in the page cache. 
But + * hopefully logic will prevail and this never happens... + */ + al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; + al_end = (u8*)al_entry + ni->attr_list_size; + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < ni->attr_list || + (u8*)al_entry > al_end) + goto em_put_err_out; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto em_put_err_out; + if (!al_entry->length) + goto em_put_err_out; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + goto em_put_err_out; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) + goto em_put_err_out; + if (AT_DATA != al_entry->type) + continue; + /* We want an unnamed attribute. */ + if (al_entry->name_length) + goto em_put_err_out; + /* Want the first entry, i.e. lowest_vcn == 0. */ + if (al_entry->lowest_vcn) + goto em_put_err_out; + /* First entry has to be in the base mft record. */ + if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { + /* MFT references do not match, logic fails. */ + ntfs_error(sb, "BUG: The first $DATA extent " + "of $MFT is not in the base " + "mft record. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto put_err_out; + } else { + /* Sequence numbers must match. */ + if (MSEQNO_LE(al_entry->mft_reference) != + ni->seq_no) + goto em_put_err_out; + /* Got it. All is ok. We can stop now. */ + break; + } + } + } + + ntfs_attr_reinit_search_ctx(ctx); + + /* Now load all attribute extents. */ + a = NULL; + next_vcn = last_vcn = highest_vcn = 0; + while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, + ctx))) { + runlist_element *nrl; + + /* Cache the current attribute. */ + a = ctx->attr; + /* $MFT must be non-resident. */ + if (!a->non_resident) { + ntfs_error(sb, "$MFT must be non-resident but a " + "resident extent was found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + /* $MFT must be uncompressed and unencrypted. */ + if (a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + ntfs_error(sb, "$MFT must be uncompressed, " + "non-sparse, and unencrypted but a " + "compressed/sparse/encrypted extent " + "was found. $MFT is corrupt. Run " + "chkdsk."); + goto put_err_out; + } + /* + * Decompress the mapping pairs array of this extent and merge + * the result into the existing runlist. No need for locking + * as we have exclusive access to the inode at this time and we + * are a mount in progress task, too. + */ + nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(nrl)) { + ntfs_error(sb, "ntfs_mapping_pairs_decompress() " + "failed with error code %ld. $MFT is " + "corrupt.", PTR_ERR(nrl)); + goto put_err_out; + } + ni->runlist.rl = nrl; + + /* Are we in the first extent? */ + if (!next_vcn) { + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Get the last vcn in the $DATA attribute. */ + last_vcn = sle64_to_cpu( + a->data.non_resident.allocated_size) + >> vol->cluster_size_bits; + /* Fill in the inode size. 
*/ + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * Verify the number of mft records does not exceed + * 2^32 - 1. + */ + if ((vi->i_size >> vol->mft_record_size_bits) >= + (1ULL << 32)) { + ntfs_error(sb, "$MFT is too big! Aborting."); + goto put_err_out; + } + /* + * We have got the first extent of the runlist for + * $MFT which means it is now relatively safe to call + * the normal ntfs_read_inode() function. + * Complete reading the inode, this will actually + * re-read the mft record for $MFT, this time entering + * it into the page cache with which we complete the + * kick start of the volume. It should be safe to do + * this now as the first extent of $MFT/$DATA is + * already known and we would hope that we don't need + * further extents in order to find the other + * attributes belonging to $MFT. Only time will tell if + * this is really the case. If not we will have to play + * magic at this point, possibly duplicating a lot of + * ntfs_read_inode() at this point. We will need to + * ensure we do enough of its work to be able to call + * ntfs_read_inode() on extents of $MFT/$DATA. But lets + * hope this never happens... + */ + ntfs_read_locked_inode(vi); + if (is_bad_inode(vi)) { + ntfs_error(sb, "ntfs_read_inode() of $MFT " + "failed. BUG or corrupt $MFT. " + "Run chkdsk and if no errors " + "are found, please report you " + "saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + ntfs_attr_put_search_ctx(ctx); + /* Revert to the safe super operations. */ + ntfs_free(m); + return -1; + } + /* + * Re-initialize some specifics about $MFT's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + vi->i_uid = vi->i_gid = 0; + /* Regular file. No access for anyone. */ + vi->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFT. */ + vi->i_op = &ntfs_empty_inode_ops; + vi->i_fop = &ntfs_empty_file_ops; + } + + /* Get the lowest vcn for the next extent. */ + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + next_vcn = highest_vcn + 1; + + /* Only one extent or error, which we catch below. */ + if (next_vcn <= 0) + break; + + /* Avoid endless loops due to corruption. */ + if (next_vcn < sle64_to_cpu( + a->data.non_resident.lowest_vcn)) { + ntfs_error(sb, "$MFT has corrupt attribute list " + "attribute. Run chkdsk."); + goto put_err_out; + } + } + if (err != -ENOENT) { + ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. " + "$MFT is corrupt. Run chkdsk."); + goto put_err_out; + } + if (!a) { + ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + if (highest_vcn && highest_vcn != last_vcn - 1) { + ntfs_error(sb, "Failed to load the complete runlist for " + "$MFT/$DATA. Driver bug or corrupt $MFT. 
" + "Run chkdsk."); + ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx", + (unsigned long long)highest_vcn, + (unsigned long long)last_vcn - 1); + goto put_err_out; + } + ntfs_attr_put_search_ctx(ctx); + ntfs_debug("Done."); + ntfs_free(m); + + /* + * Split the locking rules of the MFT inode from the + * locking rules of other inodes: + */ + lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key); + lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key); + + return 0; + +em_put_err_out: + ntfs_error(sb, "Couldn't find first extent of $DATA attribute in " + "attribute list. $MFT is corrupt. Run chkdsk."); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +err_out: + ntfs_error(sb, "Failed. Marking inode as bad."); + make_bad_inode(vi); + ntfs_free(m); + return -1; +} + +static void __ntfs_clear_inode(ntfs_inode *ni) +{ + /* Free all alocated memory. */ + down_write(&ni->runlist.lock); + if (ni->runlist.rl) { + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + } + up_write(&ni->runlist.lock); + + if (ni->attr_list) { + ntfs_free(ni->attr_list); + ni->attr_list = NULL; + } + + down_write(&ni->attr_list_rl.lock); + if (ni->attr_list_rl.rl) { + ntfs_free(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + } + up_write(&ni->attr_list_rl.lock); + + if (ni->name_len && ni->name != I30) { + /* Catch bugs... */ + BUG_ON(!ni->name); + kfree(ni->name); + } +} + +void ntfs_clear_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino))) + ntfs_error(ni->vol->sb, "Clearing dirty extent inode! " + "Losing data! This is a BUG!!!"); + // FIXME: Do something!!! + } +#endif /* NTFS_RW */ + + __ntfs_clear_inode(ni); + + /* Bye, bye... */ + ntfs_destroy_extent_inode(ni); +} + +/** + * ntfs_evict_big_inode - clean up the ntfs specific part of an inode + * @vi: vfs inode pending annihilation + * + * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() + * is called, which deallocates all memory belonging to the NTFS specific part + * of the inode and returns. + * + * If the MFT record is dirty, we commit it before doing anything else. + */ +void ntfs_evict_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + truncate_inode_pages(&vi->i_data, 0); + end_writeback(vi); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + bool was_bad = (is_bad_inode(vi)); + + /* Committing the inode also commits all extent inodes. */ + ntfs_commit_inode(vi); + + if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) { + ntfs_error(vi->i_sb, "Failed to commit dirty inode " + "0x%lx. Losing data!", vi->i_ino); + // FIXME: Do something!!! + } + } +#endif /* NTFS_RW */ + + /* No need to lock at this stage as no one else has a reference. */ + if (ni->nr_extents > 0) { + int i; + + for (i = 0; i < ni->nr_extents; i++) + ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]); + kfree(ni->ext.extent_ntfs_inos); + } + + __ntfs_clear_inode(ni); + + if (NInoAttr(ni)) { + /* Release the base inode if we are holding it. 
*/ + if (ni->nr_extents == -1) { + iput(VFS_I(ni->ext.base_ntfs_ino)); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; + } + } + return; +} + +/** + * ntfs_show_options - show mount options in /proc/mounts + * @sf: seq_file in which to write our mount options + * @mnt: vfs mount whose mount options to display + * + * Called by the VFS once for each mounted ntfs volume when someone reads + * /proc/mounts in order to display the NTFS specific mount options of each + * mount. The mount options of the vfs mount @mnt are written to the seq file + * @sf and success is returned. + */ +int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt) +{ + ntfs_volume *vol = NTFS_SB(mnt->mnt_sb); + int i; + + seq_printf(sf, ",uid=%i", vol->uid); + seq_printf(sf, ",gid=%i", vol->gid); + if (vol->fmask == vol->dmask) + seq_printf(sf, ",umask=0%o", vol->fmask); + else { + seq_printf(sf, ",fmask=0%o", vol->fmask); + seq_printf(sf, ",dmask=0%o", vol->dmask); + } + seq_printf(sf, ",nls=%s", vol->nls_map->charset); + if (NVolCaseSensitive(vol)) + seq_printf(sf, ",case_sensitive"); + if (NVolShowSystemFiles(vol)) + seq_printf(sf, ",show_sys_files"); + if (!NVolSparseEnabled(vol)) + seq_printf(sf, ",disable_sparse"); + for (i = 0; on_errors_arr[i].val; i++) { + if (on_errors_arr[i].val & vol->on_errors) + seq_printf(sf, ",errors=%s", on_errors_arr[i].str); + } + seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); + return 0; +} + +#ifdef NTFS_RW + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_truncate - called when the i_size of an ntfs inode is changed + * @vi: inode for which the i_size was changed + * + * We only support i_size changes for normal files at present, i.e. not + * compressed and not encrypted. This is enforced in ntfs_setattr(), see + * below. + * + * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and + * that the change is allowed. + * + * This implies for us that @vi is a file inode rather than a directory, index, + * or attribute inode as well as that @vi is a base inode. + * + * Returns 0 on success or -errno on error. + * + * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for + * writing. The only case in the kernel where ->i_alloc_sem is not held is + * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called + * with the current i_size as the offset. The analogous place in NTFS is in + * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again + * without holding ->i_alloc_sem. + */ +int ntfs_truncate(struct inode *vi) +{ + s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; + VCN highest_vcn; + unsigned long flags; + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + const char *te = " Leaving file length out of sync with i_size."; + int err, mp_size, size_change, alloc_change; + u32 attr_len; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(NInoAttr(ni)); + BUG_ON(S_ISDIR(vi->i_mode)); + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->nr_extents < 0); +retry_truncate: + /* + * Lock the runlist for writing and map the mft record to ensure it is + * safe to mess with the attribute runlist and sizes. 
+ */ + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " + "(error code %d).%s", vi->i_ino, err, te); + ctx = NULL; + m = NULL; + goto old_bad_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vi->i_sb, "Failed to allocate a search context for " + "inode 0x%lx (not enough memory).%s", + vi->i_ino, te); + err = -ENOMEM; + goto old_bad_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(vi->i_sb, "Open attribute is missing from " + "mft record. Inode 0x%lx is corrupt. " + "Run chkdsk.%s", vi->i_ino, te); + err = -EIO; + } else + ntfs_error(vi->i_sb, "Failed to lookup attribute in " + "inode 0x%lx (error code %d).%s", + vi->i_ino, err, te); + goto old_bad_out; + } + m = ctx->mrec; + a = ctx->attr; + /* + * The i_size of the vfs inode is the new size for the attribute value. + */ + new_size = i_size_read(vi); + /* The current size of the attribute value is the old size. */ + old_size = ntfs_attr_size(a); + /* Calculate the new allocated size. */ + if (NInoNonResident(ni)) + new_alloc_size = (new_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + else + new_alloc_size = (new_size + 7) & ~7; + /* The current allocated size is the old allocated size. */ + read_lock_irqsave(&ni->size_lock, flags); + old_alloc_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * The change in the file size. This will be 0 if no change, >0 if the + * size is growing, and <0 if the size is shrinking. + */ + size_change = -1; + if (new_size - old_size >= 0) { + size_change = 1; + if (new_size == old_size) + size_change = 0; + } + /* As above for the allocated size. */ + alloc_change = -1; + if (new_alloc_size - old_alloc_size >= 0) { + alloc_change = 1; + if (new_alloc_size == old_alloc_size) + alloc_change = 0; + } + /* + * If neither the size nor the allocation are being changed there is + * nothing to do. + */ + if (!size_change && !alloc_change) + goto unm_done; + /* If the size is changing, check if new size is allowed in $AttrDef. */ + if (size_change) { + err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); + if (unlikely(err)) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Truncate would cause the " + "inode 0x%lx to %simum size " + "for its attribute type " + "(0x%x). Aborting truncate.", + vi->i_ino, + new_size > old_size ? "exceed " + "the max" : "go under the min", + le32_to_cpu(ni->type)); + err = -EFBIG; + } else { + ntfs_error(vol->sb, "Inode 0x%lx has unknown " + "attribute type 0x%x. " + "Aborting truncate.", + vi->i_ino, + le32_to_cpu(ni->type)); + err = -EIO; + } + /* Reset the vfs inode size to the old size. */ + i_size_write(vi, old_size); + goto err_out; + } + } + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size are not " + "supported yet for %s files, ignoring.", + NInoCompressed(ni) ? "compressed" : + "encrypted"); + err = -EOPNOTSUPP; + goto bad_out; + } + if (a->non_resident) + goto do_non_resident_truncate; + BUG_ON(NInoNonResident(ni)); + /* Resize the attribute record to best fit the new attribute size. */ + if (new_size < vol->mft_record_size && + !ntfs_resident_attr_value_resize(m, a, new_size)) { + /* The resize succeeded! 
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + write_lock_irqsave(&ni->size_lock, flags); + /* Update the sizes in the ntfs inode and all is done. */ + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + /* + * Note ntfs_resident_attr_value_resize() has already done any + * necessary data clearing in the attribute record. When the + * file is being shrunk vmtruncate() will already have cleared + * the top part of the last partial page, i.e. since this is + * the resident case this is the page with index 0. However, + * when the file is being expanded, the page cache page data + * between the old data_size, i.e. old_size, and the new_size + * has not been zeroed. Fortunately, we do not need to zero it + * either since on one hand it will either already be zero due + * to both readpage and writepage clearing partial page data + * beyond i_size in which case there is nothing to do or in the + * case of the file being mmap()ped at the same time, POSIX + * specifies that the behaviour is unspecified thus we do not + * have to do anything. This means that in our implementation + * in the rare case that the file is mmap()ped and a write + * occurred into the mmap()ped region just beyond the file size + * and writepage has not yet been called to write out the page + * (which would clear the area beyond the file size) and we now + * extend the file size to incorporate this dirty region + * outside the file size, a write of the page would result in + * this data being written to disk instead of being cleared. + * Given both POSIX and the Linux mmap(2) man page specify that + * this corner case is undefined, we choose to leave it like + * that as this is much simpler for us as we cannot lock the + * relevant page now since we are holding too many ntfs locks + * which would result in a lock reversal deadlock. + */ + ni->initialized_size = new_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto unm_done; + } + /* If the above resize failed, this must be an attribute extension. */ + BUG_ON(size_change < 0); + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path as it only ever can happen + * once for any given file. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the truncation process. + */ + err = ntfs_attr_make_non_resident(ni, old_size); + if (likely(!err)) + goto retry_truncate; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " + "type 0x%x, because the conversion from " + "resident to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. 
*/ + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft record/on " + "disk for the non-resident attribute value. " + "This case is not implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not implemented " + "yet."); + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_truncate: + BUG_ON(!NInoNonResident(ni)); + if (alloc_change < 0) { + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + if (highest_vcn > 0 && + old_alloc_size >> vol->cluster_size_bits > + highest_vcn + 1) { + /* + * This attribute has multiple extents. Not yet + * supported. + */ + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " + "attribute type 0x%x, because the " + "attribute is highly fragmented (it " + "consists of multiple extents) and " + "this case is not implemented yet.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type)); + err = -EOPNOTSUPP; + goto bad_out; + } + } + /* + * If the size is shrinking, need to reduce the initialized_size and + * the data_size before reducing the allocation. + */ + if (size_change < 0) { + /* + * Make the valid size smaller (i_size is already up-to-date). + */ + write_lock_irqsave(&ni->size_lock, flags); + if (new_size < ni->initialized_size) { + ni->initialized_size = new_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(new_size); + } + a->data.non_resident.data_size = cpu_to_sle64(new_size); + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* If the allocated size is not changing, we are done. */ + if (!alloc_change) + goto unm_done; + /* + * If the size is shrinking it makes no sense for the + * allocation to be growing. + */ + BUG_ON(alloc_change > 0); + } else /* if (size_change >= 0) */ { + /* + * The file size is growing or staying the same but the + * allocation can be shrinking, growing or staying the same. + */ + if (alloc_change > 0) { + /* + * We need to extend the allocation and possibly update + * the data size. If we are updating the data size, + * since we are not touching the initialized_size we do + * not need to worry about the actual data on disk. + * And as far as the page cache is concerned, there + * will be no pages beyond the old data size and any + * partial region in the last page between the old and + * new data size (or the end of the page if the new + * data size is outside the page) does not need to be + * modified as explained above for the resident + * attribute truncate case. 
To do this, we simply drop + * the locks we hold and leave all the work to our + * friendly helper ntfs_attr_extend_allocation(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + err = ntfs_attr_extend_allocation(ni, new_size, + size_change > 0 ? new_size : -1, -1); + /* + * ntfs_attr_extend_allocation() will have done error + * output already. + */ + goto done; + } + if (!alloc_change) + goto alloc_done; + } + /* alloc_change < 0 */ + /* Free the clusters. */ + nr_freed = ntfs_cluster_free(ni, new_alloc_size >> + vol->cluster_size_bits, -1, ctx); + m = ctx->mrec; + a = ctx->attr; + if (unlikely(nr_freed < 0)) { + ntfs_error(vol->sb, "Failed to release cluster(s) (error code " + "%lli). Unmount and run chkdsk to recover " + "the lost cluster(s).", (long long)nr_freed); + NVolSetErrors(vol); + nr_freed = 0; + } + /* Truncate the runlist. */ + err = ntfs_rl_truncate_nolock(vol, &ni->runlist, + new_alloc_size >> vol->cluster_size_bits); + /* + * If the runlist truncation failed and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (unlikely(err || IS_ERR(m))) { + ntfs_error(vol->sb, "Failed to %s (error code %li).%s", + IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist", + IS_ERR(m) ? PTR_ERR(m) : err, es); + err = -EIO; + goto bad_out; + } + /* Get the size for the shrunk mapping pairs array for the runlist. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because determining the " + "size for the mapping pairs failed with error " + "code %i.%s", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), mp_size, es); + err = -EIO; + goto bad_out; + } + /* + * Shrink the attribute record for the new mapping pairs array. Note, + * this cannot fail since we are making the attribute smaller thus by + * definition there is enough space to do so. + */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + BUG_ON(err); + /* + * Generate the mapping pairs array directly into the attribute record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, ni->runlist.rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because building the " + "mapping pairs failed with error code %i.%s", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + err, es); + err = -EIO; + goto bad_out; + } + /* Update the allocated/compressed size as well as the highest vcn. 
*/ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + if (nr_freed) { + ni->itype.compressed.size -= nr_freed << + vol->cluster_size_bits; + BUG_ON(ni->itype.compressed.size < 0); + a->data.non_resident.compressed_size = cpu_to_sle64( + ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * We have shrunk the allocation. If this is a shrinking truncate we + * have already dealt with the initialized_size and the data_size above + * and we are done. If the truncate is only changing the allocation + * and not the data_size, we are also done. If this is an extending + * truncate, need to extend the data_size now which is ensured by the + * fact that @size_change is positive. + */ +alloc_done: + /* + * If the size is growing, need to update it now. If it is shrinking, + * we have already updated it above (before the allocation change). + */ + if (size_change > 0) + a->data.non_resident.data_size = cpu_to_sle64(new_size); + /* Ensure the modified mft record is written out. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +unm_done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +done: + /* Update the mtime and ctime on the base inode. */ + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. one day someone should fix this + * for real. + */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb); + int sync_it = 0; + + if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + sync_it = 1; + VFS_I(base_ni)->i_mtime = now; + VFS_I(base_ni)->i_ctime = now; + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + + if (likely(!err)) { + NInoClearTruncateFailed(ni); + ntfs_debug("Done."); + } + return err; +old_bad_out: + old_size = -1; +bad_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else if (old_size >= 0) + i_size_write(vi, old_size); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +conv_err_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else + i_size_write(vi, old_size); + goto out; +} + +/** + * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value + * @vi: inode for which the i_size was changed + * + * Wrapper for ntfs_truncate() that has no return value. + * + * See ntfs_truncate() description above for details. 
+ */ +void ntfs_truncate_vfs(struct inode *vi) { + ntfs_truncate(vi); +} + +/** + * ntfs_setattr - called from notify_change() when an attribute is being changed + * @dentry: dentry whose attributes to change + * @attr: structure describing the attributes and the changes + * + * We have to trap VFS attempts to truncate the file described by @dentry as + * soon as possible, because we do not implement changes in i_size yet. So we + * abort all i_size changes here. + * + * We also abort all changes of user, group, and mode as we do not implement + * the NTFS ACLs yet. + * + * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also + * called with ->i_alloc_sem held for writing. + */ +int ntfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *vi = dentry->d_inode; + int err; + unsigned int ia_valid = attr->ia_valid; + + err = inode_change_ok(vi, attr); + if (err) + goto out; + /* We do not support NTFS ACLs yet. */ + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { + ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " + "supported yet, ignoring."); + err = -EOPNOTSUPP; + goto out; + } + if (ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(vi)) { + ntfs_inode *ni = NTFS_I(vi); + /* + * FIXME: For now we do not support resizing of + * compressed or encrypted files yet. + */ + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size " + "are not supported yet for " + "%s files, ignoring.", + NInoCompressed(ni) ? + "compressed" : "encrypted"); + err = -EOPNOTSUPP; + } else + err = vmtruncate(vi, attr->ia_size); + if (err || ia_valid == ATTR_SIZE) + goto out; + } else { + /* + * We skipped the truncate but must still update + * timestamps. + */ + ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + } + if (ia_valid & ATTR_ATIME) + vi->i_atime = timespec_trunc(attr->ia_atime, + vi->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + vi->i_mtime = timespec_trunc(attr->ia_mtime, + vi->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + vi->i_ctime = timespec_trunc(attr->ia_ctime, + vi->i_sb->s_time_gran); + mark_inode_dirty(vi); +out: + return err; +} + +/** + * ntfs_write_inode - write out a dirty inode + * @vi: inode to write out + * @sync: if true, write out synchronously + * + * Write out a dirty inode to disk including any extent inodes if present. + * + * If @sync is true, commit the inode to disk and wait for io completion. This + * is done using write_mft_record(). + * + * If @sync is false, just schedule the write to happen but do not wait for i/o + * completion. In 2.6 kernels, scheduling usually happens just by virtue of + * marking the page (and in this case mft record) dirty but we do not implement + * this yet as write_mft_record() largely ignores the @sync parameter and + * always performs synchronous writes. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_write_inode(struct inode *vi, int sync) +{ + sle64 nt; + ntfs_inode *ni = NTFS_I(vi); + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + STANDARD_INFORMATION *si; + int err = 0; + bool modified = false; + + ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", + vi->i_ino); + /* + * Dirty attribute inodes are written via their real inodes so just + * clean them here. Access time updates are taken care off when the + * real inode is written. + */ + if (NInoAttr(ni)) { + NInoClearDirty(ni); + ntfs_debug("Done."); + return 0; + } + /* Map, pin, and lock the mft record belonging to the inode. 
*/ + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + /* Update the access times in the standard information attribute. */ + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Update the access times if they have changed. */ + nt = utc2ntfs(vi->i_mtime); + if (si->last_data_change_time != nt) { + ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_data_change_time), + (long long)sle64_to_cpu(nt)); + si->last_data_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_ctime); + if (si->last_mft_change_time != nt) { + ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_mft_change_time), + (long long)sle64_to_cpu(nt)); + si->last_mft_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_atime); + if (si->last_access_time != nt) { + ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, + (long long)sle64_to_cpu(si->last_access_time), + (long long)sle64_to_cpu(nt)); + si->last_access_time = nt; + modified = true; + } + /* + * If we just modified the standard information attribute we need to + * mark the mft record it is in dirty. We do this manually so that + * mark_inode_dirty() is not called which would redirty the inode and + * hence result in an infinite loop of trying to write the inode. + * There is no need to mark the base inode nor the base mft record + * dirty, since we are going to write this mft record below in any case + * and the base mft record may actually not have been modified so it + * might not need to be written out. + * NOTE: It is not a problem when the inode for $MFT itself is being + * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES + * on the $MFT inode and hence ntfs_write_inode() will not be + * re-invoked because of it which in turn is ok since the dirtied mft + * record will be cleaned and written out to disk below, i.e. before + * this function returns. + */ + if (modified) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + if (!NInoTestSetDirty(ctx->ntfs_ino)) + mark_ntfs_record_dirty(ctx->ntfs_ino->page, + ctx->ntfs_ino->page_ofs); + } + ntfs_attr_put_search_ctx(ctx); + /* Now the access times are updated, write the base mft record. */ + if (NInoDirty(ni)) + err = write_mft_record(ni, m, sync); + /* Write all attached extent mft records. 
*/ + mutex_lock(&ni->extent_lock); + if (ni->nr_extents > 0) { + ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; + int i; + + ntfs_debug("Writing %i extent inodes.", ni->nr_extents); + for (i = 0; i < ni->nr_extents; i++) { + ntfs_inode *tni = extent_nis[i]; + + if (NInoDirty(tni)) { + MFT_RECORD *tm = map_mft_record(tni); + int ret; + + if (IS_ERR(tm)) { + if (!err || err == -ENOMEM) + err = PTR_ERR(tm); + continue; + } + ret = write_mft_record(tni, tm, sync); + unmap_mft_record(tni); + if (unlikely(ret)) { + if (!err || err == -ENOMEM) + err = ret; + } + } + } + } + mutex_unlock(&ni->extent_lock); + unmap_mft_record(ni); + if (unlikely(err)) + goto err_out; + ntfs_debug("Done."); + return 0; +unm_err_out: + unmap_mft_record(ni); +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Not enough memory to write inode. " + "Marking the inode dirty again, so the VFS " + "retries later."); + mark_inode_dirty(vi); + } else { + ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); + NVolSetErrors(ni->vol); + } + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h new file mode 100644 index 00000000..2dabf813 --- /dev/null +++ b/fs/ntfs/inode.h @@ -0,0 +1,321 @@ +/* + * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_INODE_H +#define _LINUX_NTFS_INODE_H + +#include + +#include +#include +#include +#include +#include + +#include "layout.h" +#include "volume.h" +#include "types.h" +#include "runlist.h" +#include "debug.h" + +typedef struct _ntfs_inode ntfs_inode; + +/* + * The NTFS in-memory inode structure. It is just used as an extension to the + * fields already provided in the VFS inode. + */ +struct _ntfs_inode { + rwlock_t size_lock; /* Lock serializing access to inode sizes. */ + s64 initialized_size; /* Copy from the attribute record. */ + s64 allocated_size; /* Copy from the attribute record. */ + unsigned long state; /* NTFS specific flags describing this inode. + See ntfs_inode_state_bits below. */ + unsigned long mft_no; /* Number of the mft record / inode. */ + u16 seq_no; /* Sequence number of the mft record. */ + atomic_t count; /* Inode reference count for book keeping. */ + ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + /* + * If NInoAttr() is true, the below fields describe the attribute which + * this fake inode belongs to. The actual inode of this attribute is + * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see + * below). 
For real inodes, we also set the type (AT_DATA for files and + * AT_INDEX_ALLOCATION for directories), with the name = NULL and + * name_len = 0 for files and name = I30 (global constant) and + * name_len = 4 for directories. + */ + ATTR_TYPE type; /* Attribute type of this fake inode. */ + ntfschar *name; /* Attribute name of this fake inode. */ + u32 name_len; /* Attribute name length of this fake inode. */ + runlist runlist; /* If state has the NI_NonResident bit set, + the runlist of the unnamed data attribute + (if a file) or of the index allocation + attribute (directory) or of the attribute + described by the fake inode (if NInoAttr()). + If runlist.rl is NULL, the runlist has not + been read in yet or has been unmapped. If + NI_NonResident is clear, the attribute is + resident (file and fake inode) or there is + no $I30 index allocation attribute + (small directory). In the latter case + runlist.rl is always NULL.*/ + /* + * The following fields are only valid for real inodes and extent + * inodes. + */ + struct mutex mrec_lock; /* Lock for serializing access to the + mft record belonging to this inode. */ + struct page *page; /* The page containing the mft record of the + inode. This should only be touched by the + (un)map_mft_record*() functions. */ + int page_ofs; /* Offset into the page at which the mft record + begins. This should only be touched by the + (un)map_mft_record*() functions. */ + /* + * Attribute list support (only for use by the attribute lookup + * functions). Setup during read_inode for all inodes with attribute + * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is + * further only valid if NI_AttrListNonResident is set. + */ + u32 attr_list_size; /* Length of attribute list value in bytes. */ + u8 *attr_list; /* Attribute list value itself. */ + runlist attr_list_rl; /* Run list for the attribute list value. */ + union { + struct { /* It is a directory, $MFT, or an index inode. */ + u32 block_size; /* Size of an index block. */ + u32 vcn_size; /* Size of a vcn in this + index. */ + COLLATION_RULE collation_rule; /* The collation rule + for the index. */ + u8 block_size_bits; /* Log2 of the above. */ + u8 vcn_size_bits; /* Log2 of the above. */ + } index; + struct { /* It is a compressed/sparse file/attribute inode. */ + s64 size; /* Copy of compressed_size from + $DATA. */ + u32 block_size; /* Size of a compression block + (cb). */ + u8 block_size_bits; /* Log2 of the size of a cb. */ + u8 block_clusters; /* Number of clusters per cb. */ + } compressed; + } itype; + struct mutex extent_lock; /* Lock for accessing/modifying the + below . */ + s32 nr_extents; /* For a base mft record, the number of attached extent + inodes (0 if none), for extent records and for fake + inodes describing an attribute this is -1. */ + union { /* This union is only used if nr_extents != 0. */ + ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of + the ntfs inodes of the extent + mft records belonging to + this base inode which have + been loaded. */ + ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the + ntfs inode of the base mft + record. For fake inodes, the + real (base) inode to which + the attribute belongs. */ + } ext; +}; + +/* + * Defined bits for the state field in the ntfs_inode structure. + * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only + */ +typedef enum { + NI_Dirty, /* 1: Mft record needs to be written to disk. */ + NI_AttrList, /* 1: Mft record contains an attribute list. 
*/ + NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies + NI_AttrList is set. */ + + NI_Attr, /* 1: Fake inode for attribute i/o. + 0: Real inode or extent inode. */ + + NI_MstProtected, /* 1: Attribute is protected by MST fixups. + 0: Attribute is not protected by fixups. */ + NI_NonResident, /* 1: Unnamed data attr is non-resident (f). + 1: Attribute is non-resident (a). */ + NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is + present (d). */ + NI_Compressed, /* 1: Unnamed data attr is compressed (f). + 1: Create compressed files by default (d). + 1: Attribute is compressed (a). */ + NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). + 1: Create encrypted files by default (d). + 1: Attribute is encrypted (a). */ + NI_Sparse, /* 1: Unnamed data attr is sparse (f). + 1: Create sparse files by default (d). + 1: Attribute is sparse (a). */ + NI_SparseDisabled, /* 1: May not create sparse regions. */ + NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ +} ntfs_inode_state_bits; + +/* + * NOTE: We should be adding dirty mft records to a list somewhere and they + * should be independent of the (ntfs/vfs) inode structure so that an inode can + * be removed but the record can be left dirty for syncing later. + */ + +/* + * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() + * functions. + */ +#define NINO_FNS(flag) \ +static inline int NIno##flag(ntfs_inode *ni) \ +{ \ + return test_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoSet##flag(ntfs_inode *ni) \ +{ \ + set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoClear##flag(ntfs_inode *ni) \ +{ \ + clear_bit(NI_##flag, &(ni)->state); \ +} + +/* + * As above for NInoTestSetFoo() and NInoTestClearFoo(). + */ +#define TAS_NINO_FNS(flag) \ +static inline int NInoTestSet##flag(ntfs_inode *ni) \ +{ \ + return test_and_set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline int NInoTestClear##flag(ntfs_inode *ni) \ +{ \ + return test_and_clear_bit(NI_##flag, &(ni)->state); \ +} + +/* Emit the ntfs inode bitops functions. */ +NINO_FNS(Dirty) +TAS_NINO_FNS(Dirty) +NINO_FNS(AttrList) +NINO_FNS(AttrListNonResident) +NINO_FNS(Attr) +NINO_FNS(MstProtected) +NINO_FNS(NonResident) +NINO_FNS(IndexAllocPresent) +NINO_FNS(Compressed) +NINO_FNS(Encrypted) +NINO_FNS(Sparse) +NINO_FNS(SparseDisabled) +NINO_FNS(TruncateFailed) + +/* + * The full structure containing a ntfs_inode and a vfs struct inode. Used for + * all real and fake inodes but not for extent inodes which lack the vfs struct + * inode. + */ +typedef struct { + ntfs_inode ntfs_inode; + struct inode vfs_inode; /* The vfs inode structure. */ +} big_ntfs_inode; + +/** + * NTFS_I - return the ntfs inode given a vfs inode + * @inode: VFS inode + * + * NTFS_I() returns the ntfs inode associated with the VFS @inode. + */ +static inline ntfs_inode *NTFS_I(struct inode *inode) +{ + return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); +} + +static inline struct inode *VFS_I(ntfs_inode *ni) +{ + return &((big_ntfs_inode *)ni)->vfs_inode; +} + +/** + * ntfs_attr - ntfs in memory attribute structure + * @mft_no: mft record number of the base mft record of this attribute + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * @type: attribute type (see layout.h) + * + * This structure exists only to provide a small structure for the + * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. 
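+ *
+ * As a purely illustrative sketch (not part of this header), a lookup key
+ * for the unnamed $DATA attribute of mft record 0x10 could be filled in as:
+ *
+ *	ntfs_attr na = {
+ *		.mft_no   = 0x10,
+ *		.name     = NULL,
+ *		.name_len = 0,
+ *		.type     = AT_DATA,
+ *	};
+ *
+ * ntfs_test_inode() would then be expected to match these fields against an
+ * existing inode during an ntfs_{attr_}iget() lookup; the initializer above
+ * only shows the shape of the key and is not code taken from the driver.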
+ * + * NOTE: Elements are ordered by size to make the structure as compact as + * possible on all architectures. + */ +typedef struct { + unsigned long mft_no; + ntfschar *name; + u32 name_len; + ATTR_TYPE type; +} ntfs_attr; + +typedef int (*test_t)(struct inode *, void *); + +extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na); + +extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); +extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len); +extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len); + +extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); +extern void ntfs_destroy_big_inode(struct inode *inode); +extern void ntfs_evict_big_inode(struct inode *vi); + +extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); + +static inline void ntfs_init_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + ntfs_debug("Entering."); + __ntfs_init_inode(vi->i_sb, ni); + ni->mft_no = vi->i_ino; +} + +extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no); +extern void ntfs_clear_extent_inode(ntfs_inode *ni); + +extern int ntfs_read_inode_mount(struct inode *vi); + +extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt); + +#ifdef NTFS_RW + +extern int ntfs_truncate(struct inode *vi); +extern void ntfs_truncate_vfs(struct inode *vi); + +extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); + +extern int __ntfs_write_inode(struct inode *vi, int sync); + +static inline void ntfs_commit_inode(struct inode *vi) +{ + if (!is_bad_inode(vi)) + __ntfs_write_inode(vi, 1); + return; +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h new file mode 100644 index 00000000..faece719 --- /dev/null +++ b/fs/ntfs/layout.h @@ -0,0 +1,2435 @@ +/* + * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LAYOUT_H +#define _LINUX_NTFS_LAYOUT_H + +#include +#include +#include +#include + +#include "types.h" + +/* The NTFS oem_id "NTFS " */ +#define magicNTFS cpu_to_le64(0x202020205346544eULL) + +/* + * Location of bootsector on partition: + * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. + * On NT4 and above there is one backup copy of the boot sector to + * be found on the last sector of the partition (not normally accessible + * from within Windows as the bootsector contained number of sectors + * value is one less than the actual value!). 
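+ * (Worked illustration, added here and not part of the original comment: if
+ * the boot sector reports number_of_sectors == 0x1000, the volume really
+ * spans 0x1001 sectors and the backup boot sector is the one at LBA 0x1000,
+ * i.e. at the reported count itself.)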
+ * On versions of NT 3.51 and earlier, the backup copy was located at + * number of sectors/2 (integer divide), i.e. in the middle of the volume. + */ + +/* + * BIOS parameter block (bpb) structure. + */ +typedef struct { + le16 bytes_per_sector; /* Size of a sector in bytes. */ + u8 sectors_per_cluster; /* Size of a cluster in sectors. */ + le16 reserved_sectors; /* zero */ + u8 fats; /* zero */ + le16 root_entries; /* zero */ + le16 sectors; /* zero */ + u8 media_type; /* 0xf8 = hard disk */ + le16 sectors_per_fat; /* zero */ + le16 sectors_per_track; /* irrelevant */ + le16 heads; /* irrelevant */ + le32 hidden_sectors; /* zero */ + le32 large_sectors; /* zero */ +} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; + +/* + * NTFS boot sector structure. + */ +typedef struct { + u8 jump[3]; /* Irrelevant (jump to boot up code).*/ + le64 oem_id; /* Magic "NTFS ". */ + BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ + u8 unused[4]; /* zero, NTFS diskedit.exe states that + this is actually: + __u8 physical_drive; // 0x80 + __u8 current_head; // zero + __u8 extended_boot_signature; + // 0x80 + __u8 unused; // zero + */ +/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives + maximum volume size of 2^63 sectors. + Assuming standard sector size of 512 + bytes, the maximum byte size is + approx. 4.7x10^21 bytes. (-; */ + sle64 mft_lcn; /* Cluster location of mft data. */ + sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ + s8 clusters_per_mft_record; /* Mft record size in clusters. */ + u8 reserved0[3]; /* zero */ + s8 clusters_per_index_record; /* Index block size in clusters. */ + u8 reserved1[3]; /* zero */ + le64 volume_serial_number; /* Irrelevant (serial number). */ + le32 checksum; /* Boot sector checksum. */ +/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ + le16 end_of_sector_marker; /* End of bootsector magic. Always is + 0xaa55 in little endian. */ +/* sizeof() = 512 (0x200) bytes */ +} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; + +/* + * Magic identifiers present at the beginning of all ntfs record containing + * records (like mft records for example). + */ +enum { + /* Found in $MFT/$DATA. */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + + /* Found in $LogFile/$DATA. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ + + /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + + /* Found in all ntfs record containing records. */ + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector + transfer was detected. */ + /* + * Found in $LogFile/$DATA when a page is full of 0xff bytes and is + * thus not initialized. Page must be initialized before using it. + */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ +}; + +typedef le32 NTFS_RECORD_TYPE; + +/* + * Generic magic comparison macros. Finally found a use for the ## preprocessor + * operator! 
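+ * As an added illustration: writing ntfs_is_magic(x, FILE) expands, via
+ * token pasting, to __ntfs_is_magic(x, magic_FILE), so callers can name a
+ * record type without spelling out the magic_* constant.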
(-8 + */ + +static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) +{ + return (x == r); +} +#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) + +static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) +{ + return (*p == r); +} +#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) + +/* + * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. + */ +#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) +#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) +#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) +#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) +#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) +#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) +#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) +#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) + +#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) +#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) +#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) +#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) + +#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) +#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) + +#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) +#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) + +#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) +#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) + +/* + * The Update Sequence Array (usa) is an array of the le16 values which belong + * to the end of each sector protected by the update sequence record in which + * this array is contained. Note that the first entry is the Update Sequence + * Number (usn), a cyclic counter of how many times the protected record has + * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All + * last le16's of each sector have to be equal to the usn (during reading) or + * are set to it (during writing). If they are not, an incomplete multi sector + * transfer has occurred when the data was written. + * The maximum size for the update sequence array is fixed to: + * maximum size = usa_ofs + (usa_count * 2) = 510 bytes + * The 510 bytes comes from the fact that the last le16 in the array has to + * (obviously) finish before the last le16 of the first 512-byte sector. + * This formula can be used as a consistency check in that usa_ofs + + * (usa_count * 2) has to be less than or equal to 510. + */ +typedef struct { + NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record + type and/or status. */ + le16 usa_ofs; /* Offset to the Update Sequence Array (usa) + from the start of the ntfs record. */ + le16 usa_count; /* Number of le16 sized entries in the usa + including the Update Sequence Number (usn), + thus the number of fixups is the usa_count + minus 1. */ +} __attribute__ ((__packed__)) NTFS_RECORD; + +/* + * System files mft record numbers. All these files are always marked as used + * in the bitmap attribute of the mft; presumably in order to avoid accidental + * allocation for random other mft records. Also, the sequence number for each + * of the system files is always equal to their mft record number and it is + * never modified. + */ +typedef enum { + FILE_MFT = 0, /* Master file table (mft). Data attribute + contains the entries and bitmap attribute + records which ones are in use (bit==1). */ + FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records + in data attribute. 
If cluster size > 4kiB, + copy of first N mft records, with + N = cluster_size / mft_record_size. */ + FILE_LogFile = 2, /* Journalling log in data attribute. */ + FILE_Volume = 3, /* Volume name attribute and volume information + attribute (flags and ntfs version). Windows + refers to this file as volume DASD (Direct + Access Storage Device). */ + FILE_AttrDef = 4, /* Array of attribute definitions in data + attribute. */ + FILE_root = 5, /* Root directory. */ + FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in + data attribute. */ + FILE_Boot = 7, /* Boot sector (always at cluster 0) in data + attribute. */ + FILE_BadClus = 8, /* Contains all bad clusters in the non-resident + data attribute. */ + FILE_Secure = 9, /* Shared security descriptors in data attribute + and two indexes into the descriptors. + Appeared in Windows 2000. Before that, this + file was named $Quota but was unused. */ + FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode + characters in data attribute. */ + FILE_Extend = 11, /* Directory containing other system files (eg. + $ObjId, $Quota, $Reparse and $UsnJrnl). This + is new to NTFS3.0. */ + FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ + FILE_reserved13 = 13, + FILE_reserved14 = 14, + FILE_reserved15 = 15, + FILE_first_user = 16, /* First user file, used as test limit for + whether to allow opening a file or not. */ +} NTFS_SYSTEM_FILES; + +/* + * These are the so far known MFT_RECORD_* flags (16-bit) which contain + * information about the mft record in which they are present. + */ +enum { + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), +} __attribute__ ((__packed__)); + +typedef le16 MFT_RECORD_FLAGS; + +/* + * mft references (aka file references or file record segment references) are + * used whenever a structure needs to refer to a record in the mft. + * + * A reference consists of a 48-bit index into the mft and a 16-bit sequence + * number used to detect stale references. + * + * For error reporting purposes we treat the 48-bit index as a signed quantity. + * + * The sequence number is a circular counter (skipping 0) describing how many + * times the referenced mft record has been (re)used. This has to match the + * sequence number of the mft record being referenced, otherwise the reference + * is considered stale and removed (FIXME: only ntfsck or the driver itself?). + * + * If the sequence number is zero it is assumed that no sequence number + * consistency checking should be performed. + * + * FIXME: Since inodes are 32-bit as of now, the driver needs to always check + * for high_part being 0 and if not either BUG(), cause a panic() or handle + * the situation in some other way. This shouldn't be a problem as a volume has + * to become HUGE in order to need more than 32-bits worth of mft records. + * Assuming the standard mft record size of 1kb only the records (never mind + * the non-resident attributes, etc.) would require 4Tb of space on their own + * for the first 32 bits worth of records. This is only if some strange person + * doesn't decide to foul play and make the mft sparse which would be a really + * horrible thing to do as it would trash our current driver implementation. )-: + * Do I hear screams "we want 64-bit inodes!" ?!? (-; + * + * FIXME: The mft zone is defined as the first 12% of the volume. This space is + * reserved so that the mft can grow contiguously and hence doesn't become + * fragmented. 
Volume free space includes the empty part of the mft zone and + * when the volume's free 88% are used up, the mft zone is shrunk by a factor + * of 2, thus making more space available for more files/data. This process is + * repeated every time there is no more free space except for the mft zone until + * there really is no more free space. + */ + +/* + * Typedef the MFT_REF as a 64-bit value for easier handling. + * Also define two unpacking macros to get to the reference (MREF) and + * sequence number (MSEQNO) respectively. + * The _LE versions are to be applied on little endian MFT_REFs. + * Note: The _LE versions will return a CPU endian formatted value! + */ +#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) + +typedef u64 MFT_REF; +typedef le64 leMFT_REF; + +#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ + ((MFT_REF)(m) & MFT_REF_MASK_CPU))) +#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) + +#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) +#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) +#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) +#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) + +#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) +#define ERR_MREF(x) ((u64)((s64)(x))) +#define MREF_ERR(x) ((int)((s64)(x))) + +/* + * The mft record header present at the beginning of every record in the mft. + * This is followed by a sequence of variable length attribute records which + * is terminated by an attribute of type AT_END which is a truncated attribute + * in that it only consists of the attribute type code AT_END and none of the + * other members of the attribute structure are present. + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
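+					   NOTE: As a worked example of the
+					   MREF packing above (the numbers are
+					   purely illustrative):
+						MK_MREF(0x1234, 2)
+							== 0x0002000000001234
+						MREF(0x0002000000001234)  == 0x1234
+						MSEQNO(0x0002000000001234) == 2
+					   i.e. the low 48 bits carry the mft
+					   record number and the high 16 bits
+					   the sequence number.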
+ When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ +/* 42*/ le16 reserved; /* Reserved/alignment. */ +/* 44*/ le32 mft_record_number; /* Number of this mft record. */ +/* sizeof() = 48 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD; + +/* This is the version without the NTFS 3.1+ specific fields. */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
+ When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* sizeof() = 42 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD_OLD; + +/* + * System defined attributes (32-bit). Each attribute type has a corresponding + * attribute name (Unicode string of maximum 64 character length) as described + * by the attribute definitions present in the data attribute of the $AttrDef + * system file. On NTFS 3.0 volumes the names are just as the types are named + * in the below defines exchanging AT_ for the dollar sign ($). If that is not + * a revealing choice of symbol I do not know what is... (-; + */ +enum { + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) +}; + +typedef le32 ATTR_TYPE; + +/* + * The collation rules for sorting views/indexes/etc (32-bit). + * + * COLLATION_BINARY - Collate by binary compare where the first byte is most + * significant. + * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary + * Unicode values, except that when a character can be uppercased, the + * upper case value collates before the lower case one. + * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation + * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea + * what the difference is. Perhaps the difference is that file names + * would treat some special characters in an odd way (see + * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] + * for what I mean but COLLATION_UNICODE_STRING would not give any special + * treatment to any characters at all, but this is speculation. 
+ * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key + * values. E.g. used for $SII index in FILE_Secure, which sorts by + * security_id (le32). + * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. + * E.g. used for $O index in FILE_Extend/$Quota. + * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash + * values and second by ascending security_id values. E.g. used for $SDH + * index in FILE_Secure. + * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending + * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which + * sorts by object_id (16-byte), by splitting up the object_id in four + * le32 values and using them as individual keys. E.g. take the following + * two security_ids, stored as follows on disk: + * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 + * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 + * To compare them, they are split into four le32 values each, like so: + * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 + * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 + * Now, it is apparent why the 2nd object_id collates after the 1st: the + * first le32 value of the 1st object_id is less than the first le32 of + * the 2nd object_id. If the first le32 values of both object_ids were + * equal then the second le32 values would be compared, etc. + */ +enum { + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), +}; + +typedef le32 COLLATION_RULE; + +/* + * The flags (32-bit) describing attribute properties in the attribute + * definition structure. FIXME: This information is based on Regis's + * information and, according to him, it is not certain and probably + * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * name attribute has this flag set and this is the only attribute indexed in + * NT4. + */ +enum { + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be + indexed. */ + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type + can be present multiple times in the + mft records of an inode. */ + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value + must contain at least one non-zero + byte. */ + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be + indexed and the attribute value must be + unique for the attribute type in all of + the mft records of an inode. */ + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be + named and the name must be unique for + the attribute type in all of the mft + records of an inode. */ + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be + resident. */ + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log + modifications to this attribute, + regardless of whether it is resident or + non-resident. Without this, only log + modifications if the attribute is + resident. */ +}; + +typedef le32 ATTR_DEF_FLAGS; + +/* + * The data attribute of FILE_AttrDef contains a sequence of attribute + * definitions for the NTFS volume. With this, it is supposed to be safe for an + * older NTFS driver to mount a volume containing a newer NTFS version without + * damaging it (that's the theory. In practice it's: not damaging it too much). + * Entries are sorted by attribute type. 
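+ * As an illustration, a driver could check an attribute's value size against
+ * its definition entry "ad" roughly like this (sketch only; it assumes that a
+ * non-positive max_size means "no upper limit", which the format does not
+ * spell out):
+ *	if (size < sle64_to_cpu(ad->min_size) ||
+ *	    (sle64_to_cpu(ad->max_size) > 0 &&
+ *	     size > sle64_to_cpu(ad->max_size)))
+ *		the attribute value size is illegal for this attribute type;
+ * with "size" being the attribute value size in bytes in cpu byte order.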
The flags describe whether the + * attribute can be resident/non-resident and possibly other things, but the + * actual bits are unknown. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero + terminated. */ +/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ +/* 84*/ le32 display_rule; /* Default display rule. + FIXME: What does it mean? (AIA) */ +/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ +/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ +/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ +/* 98*/ sle64 max_size; /* Maximum size of attribute. */ +/* sizeof() = 0xa0 or 160 bytes */ +} __attribute__ ((__packed__)) ATTR_DEF; + +/* + * Attribute flags (16-bit). + */ +enum { + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method + mask. Also, first + illegal value. */ + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), +} __attribute__ ((__packed__)); + +typedef le16 ATTR_FLAGS; + +/* + * Attribute compression. + * + * Only the data attribute is ever compressed in the current ntfs driver in + * Windows. Further, compression is only applied when the data attribute is + * non-resident. Finally, to use compression, the maximum allowed cluster size + * on a volume is 4kib. + * + * The compression method is based on independently compressing blocks of X + * clusters, where X is determined from the compression_unit value found in the + * non-resident attribute record header (more precisely: X = 2^compression_unit + * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). + * + * There are three different cases of how a compression block of X clusters + * can be stored: + * + * 1) The data in the block is all zero (a sparse block): + * This is stored as a sparse block in the runlist, i.e. the runlist + * entry has length = X and lcn = -1. The mapping pairs array actually + * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at + * all, which is then interpreted by the driver as lcn = -1. + * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then + * the same principles apply as above, except that the length is not + * restricted to being any particular value. + * + * 2) The data in the block is not compressed: + * This happens when compression doesn't reduce the size of the block + * in clusters. I.e. if compression has a small effect so that the + * compressed data still occupies X clusters, then the uncompressed data + * is stored in the block. + * This case is recognised by the fact that the runlist entry has + * length = X and lcn >= 0. The mapping pairs array stores this as + * normal with a run length of X and some specific delta_lcn, i.e. + * delta_lcn has to be present. + * + * 3) The data in the block is compressed: + * The common case. This case is recognised by the fact that the run + * list entry has length L < X and lcn >= 0. The mapping pairs array + * stores this as normal with a run length of X and some specific + * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is + * immediately followed by a sparse entry with length = X - L and + * lcn = -1. The latter entry is to make up the vcn counting to the + * full compression block size X. + * + * In fact, life is more complicated because adjacent entries of the same type + * can be coalesced. 
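+ * To put concrete numbers on the above: with compression_unit = 4 and a
+ * cluster size of 4096 bytes, one compression block spans (1 << 4) = 16
+ * clusters, i.e. 65536 bytes.  A simplified classification of a single,
+ * non-coalesced runlist entry (length "len" in clusters, starting lcn "lcn")
+ * describing such a block would then be:
+ *	lcn == -1 && len == 16	-> case 1, a sparse block
+ *	lcn >= 0  && len == 16	-> case 2, stored uncompressed
+ *	lcn >= 0  && len <  16	-> case 3, compressed, followed by a sparse
+ *				   entry of length 16 - len to pad the vcns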
This means that one has to keep track of the number of + * clusters handled and work on a basis of X clusters at a time being one + * block. An example: if length L > X this means that this particular runlist + * entry contains a block of length X and part of one or more blocks of length + * L - X. Another example: if length L < X, this does not necessarily mean that + * the block is compressed as it might be that the lcn changes inside the block + * and hence the following runlist entry describes the continuation of the + * potentially compressed block. The block would be compressed if the + * following runlist entry describes at least X - L sparse clusters, thus + * making up the compression block length as described in point 3 above. (Of + * course, there can be several runlist entries with small lengths so that the + * sparse entry does not follow the first data containing entry with + * length < X.) + * + * NOTE: At the end of the compressed attribute value, there most likely is not + * just the right amount of data to make up a compression block, thus this data + * is not even attempted to be compressed. It is just stored as is, unless + * the number of clusters it occupies is reduced when compressed in which case + * it is stored as a compressed compression block, complete with sparse + * clusters at the end. + */ + +/* + * Flags of resident attributes (8-bit). + */ +enum { + RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index + (has implications for deleting and + modifying the attribute). */ +} __attribute__ ((__packed__)); + +typedef u8 RESIDENT_ATTR_FLAGS; + +/* + * Attribute record header. Always aligned to 8-byte boundary. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ +/* 4*/ le32 length; /* Byte size of the resident part of the + attribute (aligned to 8-byte boundary). + Used to get to the next attribute. */ +/* 8*/ u8 non_resident; /* If 0, attribute is resident. + If 1, attribute is non-resident. */ +/* 9*/ u8 name_length; /* Unicode character size of name of attribute. + 0 if unnamed. */ +/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the + beginning of the name from the attribute + record. Note that the name is stored as a + Unicode string. When creating, place offset + just at the end of the record header. Then, + follow with attribute value or mapping pairs + array, resident and non-resident attributes + respectively, aligning to an 8-byte + boundary. */ +/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ +/* 14*/ le16 instance; /* The instance of this attribute record. This + number is unique within this mft record (see + MFT_RECORD/next_attribute_instance notes in + in mft.h for more details). */ +/* 16*/ union { + /* Resident attributes. */ + struct { +/* 16 */ le32 value_length;/* Byte size of attribute value. */ +/* 20 */ le16 value_offset;/* Byte offset of the attribute + value from the start of the + attribute record. When creating, + align to 8-byte boundary if we + have a name present as this might + not have a length of a multiple + of 8-bytes. */ +/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ +/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) resident; + /* Non-resident attributes. */ + struct { +/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number + for this portion of the attribute value or + 0 if this is the only extent (usually the + case). 
- Only when an attribute list is used + does lowest_vcn != 0 ever occur. */ +/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of + the attribute value. - Usually there is only one + portion, so this usually equals the attribute + value size in clusters minus 1. Can be -1 for + zero length files. Can be 0 for "single extent" + attributes. */ +/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the + beginning of the structure to the mapping pairs + array which contains the mappings between the + vcns and the logical cluster numbers (lcns). + When creating, place this at the end of this + record header aligned to 8-byte boundary. */ +/* 34*/ u8 compression_unit; /* The compression unit expressed + as the log to the base 2 of the number of + clusters in a compression unit. 0 means not + compressed. (This effectively limits the + compression unit size to be a power of two + clusters.) WinNT4 only uses a value of 4. + Sparse files have this set to 0 on XPSP2. */ +/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ +/* The sizes below are only used when lowest_vcn is zero, as otherwise it would + be difficult to keep them up-to-date.*/ +/* 40*/ sle64 allocated_size; /* Byte size of disk space + allocated to hold the attribute value. Always + is a multiple of the cluster size. When a file + is compressed, this field is a multiple of the + compression block size (2^compression_unit) and + it represents the logically allocated space + rather than the actual on disk usage. For this + use the compressed_size (see below). */ +/* 48*/ sle64 data_size; /* Byte size of the attribute + value. Can be larger than allocated_size if + attribute value is compressed or sparse. */ +/* 56*/ sle64 initialized_size; /* Byte size of initialized + portion of the attribute value. Usually equals + data_size. */ +/* sizeof(uncompressed attr) = 64*/ +/* 64*/ sle64 compressed_size; /* Byte size of the attribute + value after compression. Only present when + compressed or sparse. Always is a multiple of + the cluster size. Represents the actual amount + of disk space being used on the disk. */ +/* sizeof(compressed attr) = 72*/ + } __attribute__ ((__packed__)) non_resident; + } __attribute__ ((__packed__)) data; +} __attribute__ ((__packed__)) ATTR_RECORD; + +typedef ATTR_RECORD ATTR_REC; + +/* + * File attribute flags (32-bit) appearing in the file_attributes fields of the + * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR + * attributes of MFT_RECORDs and directory index entries. + * + * All of the below flags appear in the directory index entries but only some + * appear in the STANDARD_INFORMATION attribute whilst only some others appear + * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the + * flags appear in all of the above. + */ +enum { + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ + + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), + /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is + reserved for the DOS SUBDIRECTORY flag. 
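+	   NOTE, digressing back to the ATTR_RECORD defined above: to walk the
+	   attributes of an mft record "m" one starts at attrs_offset and
+	   steps by each attribute's length until AT_END is reached, roughly
+	   (sketch only):
+		a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
+		while (a->type != AT_END) {
+			process a here;
+			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
+		}
+	   bounding every step by bytes_in_use to guard against corrupt
+	   records.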
*/ + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), + + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), + + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), + + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), + /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the + FILE_ATTR_DEVICE and preserves everything else. This mask is used + to obtain all flags that are valid for reading. */ + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), + /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the + F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, + F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask + is used to obtain all flags that are valid for setting. */ + /* + * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all + * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION + * attribute of an mft record. + */ + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this is a directory or not, i.e. whether it has + an index root attribute or not. */ + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this file has a view index present (eg. object id + index, quota index, one of the security indexes or the encrypting + filesystem related indexes). */ +}; + +typedef le32 FILE_ATTR_FLAGS; + +/* + * NOTE on times in NTFS: All times are in MS standard time format, i.e. they + * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 + * universal coordinated time (UTC). (In Linux time starts 1st January 1970, + * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) + */ + +/* + * Attribute: Standard information (0x10). + * + * NOTE: Always resident. + * NOTE: Present in all base file records on a volume. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*Ofs*/ +/* 0*/ sle64 creation_time; /* Time file was created. Updated when + a filename is changed(?). */ +/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 24*/ sle64 last_access_time; /* Approximate time when the file was + last accessed (obviously this is not + updated on read-only volumes). In + Windows this is only updated when + accessed if some time delta has + passed since the last update. Also, + last access time updates can be + disabled altogether for speed. */ +/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 36*/ union { + /* NTFS 1.2 */ + struct { + /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte + boundary. 
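+				(As a sketch of the conversion implied by the
+				NOTE on times above: NTFS counts 100-nanosecond
+				units from 1601-01-01, Linux counts seconds
+				from 1970-01-01, and the two epochs are
+				11644473600 seconds apart, so, ignoring
+				sub-second resolution:
+					unix = ntfs / 10000000 - 11644473600;
+					ntfs = (unix + 11644473600) * 10000000;
+				with "ntfs" being the sle64 time value in cpu
+				byte order.)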
*/ + } __attribute__ ((__packed__)) v1; + /* sizeof() = 48 bytes */ + /* NTFS 3.x */ + struct { +/* + * If a volume has been upgraded from a previous NTFS version, then these + * fields are present only if the file has been accessed since the upgrade. + * Recognize the difference by comparing the length of the resident attribute + * value. If it is 48, then the following fields are missing. If it is 72 then + * the fields are present. Maybe just check like this: + * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { + * Assume NTFS 1.2- format. + * If (volume version is 3.x) + * Upgrade attribute to NTFS 3.x format. + * else + * Use NTFS 1.2- format for access. + * } else + * Use NTFS 3.x format for access. + * Only problem is that it might be legal to set the length of the value to + * arbitrarily large values thus spoiling this check. - But chkdsk probably + * views that as a corruption, assuming that it behaves like this for all + * attributes. + */ + /* 36*/ le32 maximum_versions; /* Maximum allowed versions for + file. Zero if version numbering is disabled. */ + /* 40*/ le32 version_number; /* This file's version (if any). + Set to zero if maximum_versions is zero. */ + /* 44*/ le32 class_id; /* Class id from bidirectional + class id index (?). */ + /* 48*/ le32 owner_id; /* Owner_id of the user owning + the file. Translate via $Q index in FILE_Extend + /$Quota to the quota control entry for the user + owning the file. Zero if quotas are disabled. */ + /* 52*/ le32 security_id; /* Security_id for the file. + Translate via $SII index and $SDS data stream + in FILE_Secure to the security descriptor. */ + /* 56*/ le64 quota_charged; /* Byte size of the charge to + the quota for all streams of the file. Note: Is + zero if quotas are disabled. */ + /* 64*/ leUSN usn; /* Last update sequence number + of the file. This is a direct index into the + transaction log file ($UsnJrnl). It is zero if + the usn journal is disabled or this file has + not been subject to logging yet. See usnjrnl.h + for details. */ + } __attribute__ ((__packed__)) v3; + /* sizeof() = 72 bytes (NTFS 3.x) */ + } __attribute__ ((__packed__)) ver; +} __attribute__ ((__packed__)) STANDARD_INFORMATION; + +/* + * Attribute: Attribute list (0x20). + * + * - Can be either resident or non-resident. + * - Value consists of a sequence of variable length, 8-byte aligned, + * ATTR_LIST_ENTRY records. + * - The list is not terminated by anything at all! The only way to know when + * the end is reached is to keep track of the current offset and compare it to + * the attribute value size. + * - The attribute list attribute contains one entry for each attribute of + * the file in which the list is located, except for the list attribute + * itself. The list is sorted: first by attribute type, second by attribute + * name (if present), third by instance number. The extents of one + * non-resident attribute (if present) immediately follow after the initial + * extent. They are ordered by lowest_vcn and have their instace set to zero. + * It is not allowed to have two attributes with all sorting keys equal. + * - Further restrictions: + * - If not resident, the vcn to lcn mapping array has to fit inside the + * base mft record. + * - The attribute list attribute value has a maximum size of 256kb. This + * is imposed by the Windows cache manager. 
+ * - Attribute lists are only used when the attributes of mft record do not + * fit inside the mft record despite all attributes (that can be made + * non-resident) having been made non-resident. This can happen e.g. when: + * - File has a large number of hard links (lots of file name + * attributes present). + * - The mapping pairs array of some non-resident attribute becomes so + * large due to fragmentation that it overflows the mft record. + * - The security descriptor is very complex (not applicable to + * NTFS 3.0 volumes). + * - There are many named streams. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ +/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ +/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the + attribute or 0 if unnamed. */ +/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name + (always set this to where the name would + start even if unnamed). */ +/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion + of the attribute value. This is usually 0. It + is non-zero for the case where one attribute + does not fit into one mft record and thus + several mft records are allocated to hold + this attribute. In the latter case, each mft + record holds one extent of the attribute and + there is one attribute list entry for each + extent. NOTE: This is DEFINITELY a signed + value! The windows driver uses cmp, followed + by jg when comparing this, thus it treats it + as signed. */ +/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding + the ATTR_RECORD for this portion of the + attribute value. */ +/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the + attribute being referenced; otherwise 0. */ +/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use + name_offset to determine the location of the + name. */ +/* sizeof() = 26 + (attribute_name_length * 2) bytes */ +} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; + +/* + * The maximum allowed length for a file name. + */ +#define MAXIMUM_FILE_NAME_LENGTH 255 + +/* + * Possible namespaces for filenames in ntfs (8-bit). + */ +enum { + FILE_NAME_POSIX = 0x00, + /* This is the largest namespace. It is case sensitive and allows all + Unicode characters except for: '\0' and '/'. Beware that in + WinNT/2k/2003 by default files which eg have the same name except + for their case will not be distinguished by the standard utilities + and thus a "del filename" will delete both "filename" and "fileName" + without warning. However if for example Services For Unix (SFU) are + installed and the case sensitive option was enabled at installation + time, then you can create/access/delete such files. + Note that even SFU places restrictions on the filenames beyond the + '\0' and '/' and in particular the following set of characters is + not allowed: '"', '/', '<', '>', '\'. All other characters, + including the ones no allowed in WIN32 namespace are allowed. + Tested with SFU 3.5 (this is now free) running on Windows XP. */ + FILE_NAME_WIN32 = 0x01, + /* The standard WinNT/2k NTFS long filenames. Case insensitive. All + Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', + and '|'. Further, names cannot end with a '.' or a space. */ + FILE_NAME_DOS = 0x02, + /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit + characters greater space, except: '"', '*', '+', ',', '/', ':', ';', + '<', '=', '>', '?', and '\'. 
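+	   NOTE, looping back to the ATTR_LIST_ENTRY defined above: as the
+	   list has no terminator, a reader has to bound the walk by the
+	   attribute list value size, roughly (sketch only, with "value" and
+	   "value_len" being the attribute list attribute value and its byte
+	   size):
+		ofs = 0;
+		while (ofs < value_len) {
+			e = (ATTR_LIST_ENTRY*)(value + ofs);
+			process e here;
+			ofs += le16_to_cpu(e->length);
+		}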
*/ + FILE_NAME_WIN32_AND_DOS = 0x03, + /* 3 means that both the Win32 and the DOS filenames are identical and + hence have been saved in this single filename record. */ +} __attribute__ ((__packed__)); + +typedef u8 FILE_NAME_TYPE_FLAGS; + +/* + * Attribute: Filename (0x30). + * + * NOTE: Always resident. + * NOTE: All fields, except the parent_directory, are only updated when the + * filename is changed. Until then, they just become out of sync with + * reality and the more up to date values are present in the standard + * information attribute. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ leMFT_REF parent_directory; /* Directory this filename is + referenced from. */ +/* 8*/ sle64 creation_time; /* Time file was created. */ +/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 20*/ sle64 last_access_time; /* Time this mft record was last + accessed. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the unnamed data attribute. So + for normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. NOTE: + This is a multiple of the cluster + size. */ +/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed + data attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. */ +/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 3c*/ union { + /* 3c*/ struct { + /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to + pack the extended attributes + (EAs), if such are present.*/ + /* 3e*/ le16 reserved; /* Reserved for alignment. */ + } __attribute__ ((__packed__)) ea; + /* 3c*/ struct { + /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, + present only in reparse + points and only if there are + no EAs. */ + } __attribute__ ((__packed__)) rp; + } __attribute__ ((__packed__)) type; +/* 40*/ u8 file_name_length; /* Length of file name in + (Unicode) characters. */ +/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ +/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ +} __attribute__ ((__packed__)) FILE_NAME_ATTR; + +/* + * GUID structures store globally unique identifiers (GUID). A GUID is a + * 128-bit value consisting of one group of eight hexadecimal digits, followed + * by three groups of four hexadecimal digits each, followed by one group of + * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the + * distributed computing environment (DCE) universally unique identifier (UUID). + * Example of a GUID: + * 1F010768-5A73-BC91-0010A52216A7 + */ +typedef struct { + le32 data1; /* The first eight hexadecimal digits of the GUID. */ + le16 data2; /* The first group of four hexadecimal digits. */ + le16 data3; /* The second group of four hexadecimal digits. */ + u8 data4[8]; /* The first two bytes are the third group of four + hexadecimal digits. The remaining six bytes are the + final 12 hexadecimal digits. 
*/ +} __attribute__ ((__packed__)) GUID; + +/* + * FILE_Extend/$ObjId contains an index named $O. This index contains all + * object_ids present on the volume as the index keys and the corresponding + * mft_record numbers as the index entry data parts. The data part (defined + * below) also contains three other object_ids: + * birth_volume_id - object_id of FILE_Volume on which the file was first + * created. Optional (i.e. can be zero). + * birth_object_id - object_id of file when it was first created. Usually + * equals the object_id. Optional (i.e. can be zero). + * domain_id - Reserved (always zero). + */ +typedef struct { + leMFT_REF mft_reference;/* Mft record containing the object_id in + the index entry key. */ + union { + struct { + GUID birth_volume_id; + GUID birth_object_id; + GUID domain_id; + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; + +/* + * Attribute: Object id (NTFS 3.0+) (0x40). + * + * NOTE: Always resident. + */ +typedef struct { + GUID object_id; /* Unique id assigned to the + file.*/ + /* The following fields are optional. The attribute value size is 16 + bytes, i.e. sizeof(GUID), if these are not present at all. Note, + the entries can be present but one or more (or all) can be zero + meaning that that particular value(s) is(are) not defined. */ + union { + struct { + GUID birth_volume_id; /* Unique id of volume on which + the file was first created.*/ + GUID birth_object_id; /* Unique id of file when it was + first created. */ + GUID domain_id; /* Reserved, zero. */ + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJECT_ID_ATTR; + +/* + * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in + * the SID structure (see below). + */ +//typedef enum { /* SID string prefix. */ +// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ +// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ +// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ +// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ +// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ +// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ +//} IDENTIFIER_AUTHORITIES; + +/* + * These relative identifiers (RIDs) are used with the above identifier + * authorities to make up universal well-known SIDs. + * + * Note: The relative identifier (RID) refers to the portion of a SID, which + * identifies a user or group in relation to the authority that issued the SID. + * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is + * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and + * the relative identifier SECURITY_CREATOR_OWNER_RID (0). + */ +typedef enum { /* Identifier authority. 
*/ + SECURITY_NULL_RID = 0, /* S-1-0 */ + SECURITY_WORLD_RID = 0, /* S-1-1 */ + SECURITY_LOCAL_RID = 0, /* S-1-2 */ + + SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ + SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ + + SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ + SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ + + SECURITY_DIALUP_RID = 1, + SECURITY_NETWORK_RID = 2, + SECURITY_BATCH_RID = 3, + SECURITY_INTERACTIVE_RID = 4, + SECURITY_SERVICE_RID = 6, + SECURITY_ANONYMOUS_LOGON_RID = 7, + SECURITY_PROXY_RID = 8, + SECURITY_ENTERPRISE_CONTROLLERS_RID=9, + SECURITY_SERVER_LOGON_RID = 9, + SECURITY_PRINCIPAL_SELF_RID = 0xa, + SECURITY_AUTHENTICATED_USER_RID = 0xb, + SECURITY_RESTRICTED_CODE_RID = 0xc, + SECURITY_TERMINAL_SERVER_RID = 0xd, + + SECURITY_LOGON_IDS_RID = 5, + SECURITY_LOGON_IDS_RID_COUNT = 3, + + SECURITY_LOCAL_SYSTEM_RID = 0x12, + + SECURITY_NT_NON_UNIQUE = 0x15, + + SECURITY_BUILTIN_DOMAIN_RID = 0x20, + + /* + * Well-known domain relative sub-authority values (RIDs). + */ + + /* Users. */ + DOMAIN_USER_RID_ADMIN = 0x1f4, + DOMAIN_USER_RID_GUEST = 0x1f5, + DOMAIN_USER_RID_KRBTGT = 0x1f6, + + /* Groups. */ + DOMAIN_GROUP_RID_ADMINS = 0x200, + DOMAIN_GROUP_RID_USERS = 0x201, + DOMAIN_GROUP_RID_GUESTS = 0x202, + DOMAIN_GROUP_RID_COMPUTERS = 0x203, + DOMAIN_GROUP_RID_CONTROLLERS = 0x204, + DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, + DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, + DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, + DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, + + /* Aliases. */ + DOMAIN_ALIAS_RID_ADMINS = 0x220, + DOMAIN_ALIAS_RID_USERS = 0x221, + DOMAIN_ALIAS_RID_GUESTS = 0x222, + DOMAIN_ALIAS_RID_POWER_USERS = 0x223, + + DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, + DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, + DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, + DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, + + DOMAIN_ALIAS_RID_REPLICATOR = 0x228, + DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, + DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, +} RELATIVE_IDENTIFIERS; + +/* + * The universal well-known SIDs: + * + * NULL_SID S-1-0-0 + * WORLD_SID S-1-1-0 + * LOCAL_SID S-1-2-0 + * CREATOR_OWNER_SID S-1-3-0 + * CREATOR_GROUP_SID S-1-3-1 + * CREATOR_OWNER_SERVER_SID S-1-3-2 + * CREATOR_GROUP_SERVER_SID S-1-3-3 + * + * (Non-unique IDs) S-1-4 + * + * NT well-known SIDs: + * + * NT_AUTHORITY_SID S-1-5 + * DIALUP_SID S-1-5-1 + * + * NETWORD_SID S-1-5-2 + * BATCH_SID S-1-5-3 + * INTERACTIVE_SID S-1-5-4 + * SERVICE_SID S-1-5-6 + * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) + * PROXY_SID S-1-5-8 + * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) + * SELF_SID S-1-5-10 (self RID) + * AUTHENTICATED_USER_SID S-1-5-11 + * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) + * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) + * + * (Logon IDs) S-1-5-5-X-Y + * + * (NT non-unique IDs) S-1-5-0x15-... + * + * (Built-in domain) S-1-5-0x20 + */ + +/* + * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. + * + * NOTE: This is stored as a big endian number, hence the high_part comes + * before the low_part. + */ +typedef union { + struct { + u16 high_part; /* High 16-bits. */ + u32 low_part; /* Low 32-bits. */ + } __attribute__ ((__packed__)) parts; + u8 value[6]; /* Value as individual bytes. */ +} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; + +/* + * The SID structure is a variable-length structure used to uniquely identify + * users or groups. SID stands for security identifier. + * + * The standard textual representation of the SID is of the form: + * S-R-I-S-S... 
+ * Where: + * - The first "S" is the literal character 'S' identifying the following + * digits as a SID. + * - R is the revision level of the SID expressed as a sequence of digits + * either in decimal or hexadecimal (if the later, prefixed by "0x"). + * - I is the 48-bit identifier_authority, expressed as digits as R above. + * - S... is one or more sub_authority values, expressed as digits as above. + * + * Example SID; the domain-relative SID of the local Administrators group on + * Windows NT/2k: + * S-1-5-32-544 + * This translates to a SID with: + * revision = 1, + * sub_authority_count = 2, + * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY + * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID + * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS + */ +typedef struct { + u8 revision; + u8 sub_authority_count; + SID_IDENTIFIER_AUTHORITY identifier_authority; + le32 sub_authority[1]; /* At least one sub_authority. */ +} __attribute__ ((__packed__)) SID; + +/* + * Current constants for SIDs. + */ +typedef enum { + SID_REVISION = 1, /* Current revision level. */ + SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ + SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in + a future revision. */ +} SID_CONSTANTS; + +/* + * The predefined ACE types (8-bit, see below). + */ +enum { + ACCESS_MIN_MS_ACE_TYPE = 0, + ACCESS_ALLOWED_ACE_TYPE = 0, + ACCESS_DENIED_ACE_TYPE = 1, + SYSTEM_AUDIT_ACE_TYPE = 2, + SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ + ACCESS_MAX_MS_V2_ACE_TYPE = 3, + + ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, + ACCESS_MAX_MS_V3_ACE_TYPE = 4, + + /* The following are Win2k only. */ + ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, + ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, + ACCESS_DENIED_OBJECT_ACE_TYPE = 6, + SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, + SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, + ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, + + ACCESS_MAX_MS_V4_ACE_TYPE = 8, + + /* This one is for WinNT/2k. */ + ACCESS_MAX_MS_ACE_TYPE = 8, +} __attribute__ ((__packed__)); + +typedef u8 ACE_TYPES; + +/* + * The ACE flags (8-bit) for audit and inheritance (see below). + * + * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE + * types to indicate that a message is generated (in Windows!) for successful + * accesses. + * + * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types + * to indicate that a message is generated (in Windows!) for failed accesses. + */ +enum { + /* The inheritance flags. */ + OBJECT_INHERIT_ACE = 0x01, + CONTAINER_INHERIT_ACE = 0x02, + NO_PROPAGATE_INHERIT_ACE = 0x04, + INHERIT_ONLY_ACE = 0x08, + INHERITED_ACE = 0x10, /* Win2k only. */ + VALID_INHERIT_FLAGS = 0x1f, + + /* The audit flags. */ + SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, + FAILED_ACCESS_ACE_FLAG = 0x80, +} __attribute__ ((__packed__)); + +typedef u8 ACE_FLAGS; + +/* + * An ACE is an access-control entry in an access-control list (ACL). + * An ACE defines access to an object for a specific user or group or defines + * the types of access that generate system-administration messages or alarms + * for a specific user or group. The user or group is identified by a security + * identifier (SID). + * + * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), + * which specifies the type and size of the ACE. The format of the subsequent + * data depends on the ACE type. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ACE_TYPES type; /* Type of the ACE. */ +/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. 
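+				(A small worked example for the SID layout
+				above: a SID occupies
+					1 + 1 + 6 + 4 * sub_authority_count
+				bytes on disk, so the S-1-5-32-544 example,
+				having two sub-authorities, takes 8 + 4 * 2 =
+				16 bytes.)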
*/ +/* 2*/ le16 size; /* Size in bytes of the ACE. */ +} __attribute__ ((__packed__)) ACE_HEADER; + +/* + * The access mask (32-bit). Defines the access rights. + * + * The specific rights (bits 0 to 15). These depend on the type of the object + * being secured by the ACE. + */ +enum { + /* Specific rights for files and directories are as follows: */ + + /* Right to read data from the file. (FILE) */ + FILE_READ_DATA = cpu_to_le32(0x00000001), + /* Right to list contents of a directory. (DIRECTORY) */ + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), + + /* Right to write data to the file. (FILE) */ + FILE_WRITE_DATA = cpu_to_le32(0x00000002), + /* Right to create a file in the directory. (DIRECTORY) */ + FILE_ADD_FILE = cpu_to_le32(0x00000002), + + /* Right to append data to the file. (FILE) */ + FILE_APPEND_DATA = cpu_to_le32(0x00000004), + /* Right to create a subdirectory. (DIRECTORY) */ + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), + + /* Right to read extended attributes. (FILE/DIRECTORY) */ + FILE_READ_EA = cpu_to_le32(0x00000008), + + /* Right to write extended attributes. (FILE/DIRECTORY) */ + FILE_WRITE_EA = cpu_to_le32(0x00000010), + + /* Right to execute a file. (FILE) */ + FILE_EXECUTE = cpu_to_le32(0x00000020), + /* Right to traverse the directory. (DIRECTORY) */ + FILE_TRAVERSE = cpu_to_le32(0x00000020), + + /* + * Right to delete a directory and all the files it contains (its + * children), even if the files are read-only. (DIRECTORY) + */ + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), + + /* Right to read file attributes. (FILE/DIRECTORY) */ + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), + + /* Right to change file attributes. (FILE/DIRECTORY) */ + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), + + /* + * The standard rights (bits 16 to 23). These are independent of the + * type of object being secured. + */ + + /* Right to delete the object. */ + DELETE = cpu_to_le32(0x00010000), + + /* + * Right to read the information in the object's security descriptor, + * not including the information in the SACL, i.e. right to read the + * security descriptor and owner. + */ + READ_CONTROL = cpu_to_le32(0x00020000), + + /* Right to modify the DACL in the object's security descriptor. */ + WRITE_DAC = cpu_to_le32(0x00040000), + + /* Right to change the owner in the object's security descriptor. */ + WRITE_OWNER = cpu_to_le32(0x00080000), + + /* + * Right to use the object for synchronization. Enables a process to + * wait until the object is in the signalled state. Some object types + * do not support this access right. + */ + SYNCHRONIZE = cpu_to_le32(0x00100000), + + /* + * The following STANDARD_RIGHTS_* are combinations of the above for + * convenience and are defined by the Win32 API. + */ + + /* These are currently defined to READ_CONTROL. */ + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), + + /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), + + /* + * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and + * SYNCHRONIZE access. + */ + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), + + /* + * The access system ACL and maximum allowed access types (bits 24 to + * 25, bits 26 to 27 are reserved). + */ + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), + + /* + * The generic rights (bits 28 to 31). 
These map onto the standard and + * specific rights. + */ + + /* Read, write, and execute access. */ + GENERIC_ALL = cpu_to_le32(0x10000000), + + /* Execute access. */ + GENERIC_EXECUTE = cpu_to_le32(0x20000000), + + /* + * Write access. For files, this maps onto: + * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | + * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE + * For directories, the mapping has the same numerical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_WRITE = cpu_to_le32(0x40000000), + + /* + * Read access. For files, this maps onto: + * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | + * STANDARD_RIGHTS_READ | SYNCHRONIZE + * For directories, the mapping has the same numberical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_READ = cpu_to_le32(0x80000000), +}; + +typedef le32 ACCESS_MASK; + +/* + * The generic mapping array. Used to denote the mapping of each generic + * access right to a specific access mask. + * + * FIXME: What exactly is this and what is it for? (AIA) + */ +typedef struct { + ACCESS_MASK generic_read; + ACCESS_MASK generic_write; + ACCESS_MASK generic_execute; + ACCESS_MASK generic_all; +} __attribute__ ((__packed__)) GENERIC_MAPPING; + +/* + * The predefined ACE type structures are as defined below. + */ + +/* + * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE + */ +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, + SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; + +/* + * The object ACE flags (32-bit). + */ +enum { + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), +}; + +typedef le32 OBJECT_ACE_FLAGS; + +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ +/* 12*/ GUID object_type; +/* 28*/ GUID inherited_object_type; + +/* 44*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, + ACCESS_DENIED_OBJECT_ACE, + SYSTEM_AUDIT_OBJECT_ACE, + SYSTEM_ALARM_OBJECT_ACE; + +/* + * An ACL is an access-control list (ACL). + * An ACL starts with an ACL header structure, which specifies the size of + * the ACL and the number of ACEs it contains. The ACL header is followed by + * zero or more access control entries (ACEs). The ACL as well as each ACE + * are aligned on 4-byte boundaries. + */ +typedef struct { + u8 revision; /* Revision of this ACL. */ + u8 alignment1; + le16 size; /* Allocated space in bytes for ACL. Includes this + header, the ACEs and the remaining free space. */ + le16 ace_count; /* Number of ACEs in the ACL. */ + le16 alignment2; +/* sizeof() = 8 bytes */ +} __attribute__ ((__packed__)) ACL; + +/* + * Current constants for ACLs. + */ +typedef enum { + /* Current revision. */ + ACL_REVISION = 2, + ACL_REVISION_DS = 4, + + /* History of revisions. 
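+	NOTE, as an illustration of the ACL layout above: the ACEs follow the
+	8-byte ACL header, each starting with an ACE_HEADER whose size field
+	gives the offset to the next one (assuming, as is usual, that size
+	already includes any alignment padding), so a walk can be sketched as:
+		ace = (ACE_HEADER*)((u8*)acl + sizeof(ACL));
+		for (i = 0; i < le16_to_cpu(acl->ace_count); i++) {
+			process ace here;
+			ace = (ACE_HEADER*)((u8*)ace + le16_to_cpu(ace->size));
+		}
+	keeping each step within le16_to_cpu(acl->size) bytes of the start of
+	the ACL.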
*/ + ACL_REVISION1 = 1, + MIN_ACL_REVISION = 2, + ACL_REVISION2 = 2, + ACL_REVISION3 = 3, + ACL_REVISION4 = 4, + MAX_ACL_REVISION = 4, +} ACL_CONSTANTS; + +/* + * The security descriptor control flags (16-bit). + * + * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID + * pointed to by the Owner field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the SID with + * respect to inheritance of an owner. + * + * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in + * the Group field was provided by a defaulting mechanism rather than + * explicitly provided by the original provider of the security + * descriptor. This may affect the treatment of the SID with respect to + * inheritance of a primary group. + * + * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a discretionary ACL. If this flag is set and the + * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is + * explicitly being specified. + * + * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Dacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * DaclPresent flag is not set. + * + * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a system ACL pointed to by the Sacl field. If this + * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then + * an empty (but present) ACL is being specified. + * + * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Sacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * SaclPresent flag is not set. + * + * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security + * descriptor is in self-relative form. In this form, all fields of the + * security descriptor are contiguous in memory and all pointer fields are + * expressed as offsets from the beginning of the security descriptor. + */ +enum { + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), + + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), + + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), + + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) +} __attribute__ ((__packed__)); + +typedef le16 SECURITY_DESCRIPTOR_CONTROL; + +/* + * Self-relative security descriptor. Contains the owner and group SIDs as well + * as the sacl and dacl ACLs inside the security descriptor itself. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. 
*/ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + le32 owner; /* Byte offset to a SID representing an object's + owner. If this is NULL, no owner SID is present in + the descriptor. */ + le32 group; /* Byte offset to a SID representing an object's + primary group. If this is NULL, no primary group + SID is present in the descriptor. */ + le32 sacl; /* Byte offset to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +/* sizeof() = 0x14 bytes */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; + +/* + * Absolute security descriptor. Does not contain the owner and group SIDs, nor + * the sacl and dacl ACLs inside the security descriptor. Instead, it contains + * pointers to these structures in memory. Obviously, absolute security + * descriptors are only useful for in memory representations of security + * descriptors. On disk, a self-relative security descriptor is used. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + SID *owner; /* Points to a SID representing an object's owner. If + this is NULL, no owner SID is present in the + descriptor. */ + SID *group; /* Points to a SID representing an object's primary + group. If this is NULL, no primary group SID is + present in the descriptor. */ + ACL *sacl; /* Points to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + ACL *dacl; /* Points to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; + +/* + * Current constants for security descriptors. + */ +typedef enum { + /* Current revision. */ + SECURITY_DESCRIPTOR_REVISION = 1, + SECURITY_DESCRIPTOR_REVISION1 = 1, + + /* The sizes of both the absolute and relative security descriptors is + the same as pointers, at least on ia32 architecture are 32-bit. */ + SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), +} SECURITY_DESCRIPTOR_CONSTANTS; + +/* + * Attribute: Security descriptor (0x50). A standard self-relative security + * descriptor. + * + * NOTE: Can be resident or non-resident. + * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally + * in FILE_Secure and the correct descriptor is found using the security_id + * from the standard information attribute. + */ +typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; + +/* + * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one + * referenced instance of each unique security descriptor is stored. + * + * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It + * does, however, contain two indexes ($SDH and $SII) as well as a named data + * stream ($SDS). 
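(Illustrative aside, not part of the original header: because a self-relative security descriptor stores byte offsets rather than pointers, the owner, group, sacl and dacl fields are resolved by adding them to the start of the descriptor; a zero offset means the field is absent. The helper name below is hypothetical.)

	static const SID *sd_rel_owner(const SECURITY_DESCRIPTOR_RELATIVE *sd)
	{
		u32 ofs = le32_to_cpu(sd->owner);

		if (!ofs)		// Zero offset: no owner SID present.
			return NULL;
		return (const SID *)((const u8 *)sd + ofs);
	}

	// The group, sacl and dacl offsets are resolved the same way, but the
	// sacl/dacl results are only meaningful when SE_SACL_PRESENT /
	// SE_DACL_PRESENT are set in sd->control.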
+ * + * Every unique security descriptor is assigned a unique security identifier + * (security_id, not to be confused with a SID). The security_id is unique for + * the NTFS volume and is used as an index into the $SII index, which maps + * security_ids to the security descriptor's storage location within the $SDS + * data attribute. The $SII index is sorted by ascending security_id. + * + * A simple hash is computed from each security descriptor. This hash is used + * as an index into the $SDH index, which maps security descriptor hashes to + * the security descriptor's storage location within the $SDS data attribute. + * The $SDH index is sorted by security descriptor hash and is stored in a B+ + * tree. When searching $SDH (with the intent of determining whether or not a + * new security descriptor is already present in the $SDS data stream), if a + * matching hash is found, but the security descriptors do not match, the + * search in the $SDH index is continued, searching for a next matching hash. + * + * When a precise match is found, the security_id coresponding to the security + * descriptor in the $SDS attribute is read from the found $SDH index entry and + * is stored in the $STANDARD_INFORMATION attribute of the file/directory to + * which the security descriptor is being applied. The $STANDARD_INFORMATION + * attribute is present in all base mft records (i.e. in all files and + * directories). + * + * If a match is not found, the security descriptor is assigned a new unique + * security_id and is added to the $SDS data attribute. Then, entries + * referencing the this security descriptor in the $SDS data attribute are + * added to the $SDH and $SII indexes. + * + * Note: Entries are never deleted from FILE_Secure, even if nothing + * references an entry any more. + */ + +/* + * This header precedes each security descriptor in the $SDS data stream. + * This is also the index entry data part of both the $SII and $SDH indexes. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; + +/* + * The $SDS data stream contains the security descriptors, aligned on 16-byte + * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot + * cross 256kib boundaries (this restriction is imposed by the Windows cache + * manager). Each security descriptor is contained in a SDS_ENTRY structure. + * Also, each security descriptor is stored twice in the $SDS stream with a + * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) + * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the + * the first copy of the security descriptor will be at offset 0x51d0 in the + * $SDS data stream and the second copy will be at offset 0x451d0. + */ +typedef struct { +/*Ofs*/ +/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like + unnamed structs. */ + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security + descriptor. 
*/ +} __attribute__ ((__packed__)) SDS_ENTRY; + +/* + * The index entry key used in the $SII index. The collation type is + * COLLATION_NTOFS_ULONG. + */ +typedef struct { + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SII_INDEX_KEY; + +/* + * The index entry key used in the $SDH index. The keys are sorted first by + * hash and then by security_id. The collation rule is + * COLLATION_NTOFS_SECURITY_HASH. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SDH_INDEX_KEY; + +/* + * Attribute: Volume name (0x60). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + */ +typedef struct { + ntfschar name[0]; /* The name of the volume in Unicode. */ +} __attribute__ ((__packed__)) VOLUME_NAME; + +/* + * Possible flags for the volume (16-bit). + */ +enum { + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), + + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), + + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), + + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), + + /* To make our life easier when checking if we must mount read-only. */ + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), +} __attribute__ ((__packed__)); + +typedef le16 VOLUME_FLAGS; + +/* + * Attribute: Volume information (0x70). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses + * NTFS 1.2. I haven't personally seen other values yet. + */ +typedef struct { + le64 reserved; /* Not used (yet?). */ + u8 major_ver; /* Major version of the ntfs format. */ + u8 minor_ver; /* Minor version of the ntfs format. */ + VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ +} __attribute__ ((__packed__)) VOLUME_INFORMATION; + +/* + * Attribute: Data attribute (0x80). + * + * NOTE: Can be resident or non-resident. + * + * Data contents of a file (i.e. the unnamed stream) or of a named stream. + */ +typedef struct { + u8 data[0]; /* The file's data contents. */ +} __attribute__ ((__packed__)) DATA_ATTR; + +/* + * Index header flags (8-bit). + */ +enum { + /* + * When index header is in an index root attribute: + */ + SMALL_INDEX = 0, /* The index is small enough to fit inside the index + root attribute and there is no index allocation + attribute present. */ + LARGE_INDEX = 1, /* The index is too large to fit in the index root + attribute and/or an index allocation attribute is + present. */ + /* + * When index header is in an index block, i.e. is part of index + * allocation attribute: + */ + LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes + branching off it. */ + INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf + node. */ + NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ +} __attribute__ ((__packed__)); + +typedef u8 INDEX_HEADER_FLAGS; + +/* + * This is the header for indexes, describing the INDEX_ENTRY records, which + * follow the INDEX_HEADER. Together the index header and the index entries + * make up a complete index. 
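(Illustrative aside, not from the original source: the VOLUME_MUST_MOUNT_RO_MASK convenience mask defined above is intended for exactly this kind of check against the flags field of the VOLUME_INFORMATION attribute; the helper name is hypothetical.)

	static inline bool volume_needs_ro(const VOLUME_INFORMATION *vi)
	{
		// Any of the "must mount read-only" flag bits forces a
		// read-only mount.
		return (vi->flags & VOLUME_MUST_MOUNT_RO_MASK) != 0;
	}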
+ * + * IMPORTANT NOTE: The offset, length and size structure members are counted + * relative to the start of the index header structure and not relative to the + * start of the index root or index allocation structures themselves. + */ +typedef struct { + le32 entries_offset; /* Byte offset to first INDEX_ENTRY + aligned to 8-byte boundary. */ + le32 index_length; /* Data size of the index in bytes, + i.e. bytes used from allocated + size, aligned to 8-byte boundary. */ + le32 allocated_size; /* Byte size of this index (block), + multiple of 8 bytes. */ + /* NOTE: For the index root attribute, the above two numbers are always + equal, as the attribute is resident and it is resized as needed. In + the case of the index allocation attribute the attribute is not + resident and hence the allocated_size is a fixed value and must + equal the index_block_size specified by the INDEX_ROOT attribute + corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK + belongs to. */ + INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ +} __attribute__ ((__packed__)) INDEX_HEADER; + +/* + * Attribute: Index root (0x90). + * + * NOTE: Always resident. + * + * This is followed by a sequence of index entries (INDEX_ENTRY structures) + * as described by the index header. + * + * When a directory is small enough to fit inside the index root then this + * is the only attribute describing the directory. When the directory is too + * large to fit in the index root, on the other hand, two aditional attributes + * are present: an index allocation attribute, containing sub-nodes of the B+ + * directory tree (see below), and a bitmap attribute, describing which virtual + * cluster numbers (vcns) in the index allocation attribute are in use by an + * index block. + * + * NOTE: The root directory (FILE_root) contains an entry for itself. Other + * dircetories do not contain entries for themselves, though. + */ +typedef struct { + ATTR_TYPE type; /* Type of the indexed attribute. Is + $FILE_NAME for directories, zero + for view indexes. No other values + allowed. */ + COLLATION_RULE collation_rule; /* Collation rule used to sort the + index entries. If type is $FILE_NAME, + this must be COLLATION_FILE_NAME. */ + le32 index_block_size; /* Size of each index block in bytes (in + the index allocation attribute). */ + u8 clusters_per_index_block; /* Cluster size of each index block (in + the index allocation attribute), when + an index block is >= than a cluster, + otherwise this will be the log of + the size (like how the encoding of + the mft record size and the index + record size found in the boot sector + work). Has to be a power of 2. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ + INDEX_HEADER index; /* Index header describing the + following index entries. */ +} __attribute__ ((__packed__)) INDEX_ROOT; + +/* + * Attribute: Index allocation (0xa0). + * + * NOTE: Always non-resident (doesn't make sense to be resident anyway!). + * + * This is an array of index blocks. Each index block starts with an + * INDEX_BLOCK structure containing an index header, followed by a sequence of + * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. + */ +typedef struct { +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ + le16 usa_ofs; /* See NTFS_RECORD definition. */ + le16 usa_count; /* See NTFS_RECORD definition. 
*/ + +/* 8*/ sle64 lsn; /* $LogFile sequence number of the last + modification of this index block. */ +/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. + If the cluster_size on the volume is <= the + index_block_size of the directory, + index_block_vcn counts in units of clusters, + and in units of sectors otherwise. */ +/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ +/* sizeof()= 40 (0x28) bytes */ +/* + * When creating the index block, we place the update sequence array at this + * offset, i.e. before we start with the index entries. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) INDEX_BLOCK; + +typedef INDEX_BLOCK INDEX_ALLOCATION; + +/* + * The system file FILE_Extend/$Reparse contains an index named $R listing + * all reparse points on the volume. The index entry keys are as defined + * below. Note, that there is no index data associated with the index entries. + * + * The index entries are sorted by the index key file_id. The collation rule is + * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the + * primary key / is not a key at all. (AIA) + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + leMFT_REF file_id; /* Mft record of the file containing the + reparse point attribute. */ +} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; + +/* + * Quota flags (32-bit). + * + * The user quota flags. Names explain meaning. + */ +enum { + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), + + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), + /* This is a bit mask for the user quota flags. */ + + /* + * These flags are only present in the quota defaults index entry, i.e. + * in the entry where owner_id = QUOTA_DEFAULTS_ID. + */ + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), + + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), +}; + +typedef le32 QUOTA_FLAGS; + +/* + * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas + * are on a per volume and per user basis. + * + * The $Q index contains one entry for each existing user_id on the volume. The + * index key is the user_id of the user/group owning this quota control entry, + * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the + * owner_id, is found in the standard information attribute. The collation rule + * for $Q is COLLATION_NTOFS_ULONG. + * + * The $O index contains one entry for each user/group who has been assigned + * a quota on that volume. The index key holds the SID of the user_id the + * entry belongs to, i.e. the owner_id. The collation rule for $O is + * COLLATION_NTOFS_SID. + * + * The $O index entry data is the user_id of the user corresponding to the SID. 
+ * This user_id is used as an index into $Q to find the quota control entry + * associated with the SID. + * + * The $Q index entry data is the quota control entry and is defined below. + */ +typedef struct { + le32 version; /* Currently equals 2. */ + QUOTA_FLAGS flags; /* Flags describing this quota entry. */ + le64 bytes_used; /* How many bytes of the quota are in use. */ + sle64 change_time; /* Last time this quota entry was changed. */ + sle64 threshold; /* Soft quota (-1 if not limited). */ + sle64 limit; /* Hard quota (-1 if not limited). */ + sle64 exceeded_time; /* How long the soft quota has been exceeded. */ + SID sid; /* The SID of the user/object associated with + this quota entry. Equals zero for the quota + defaults entry (and in fact on a WinXP + volume, it is not present at all). */ +} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; + +/* + * Predefined owner_id values (32-bit). + */ +enum { + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), +}; + +/* + * Current constants for quota control entries. + */ +typedef enum { + /* Current version. */ + QUOTA_VERSION = 2, +} QUOTA_CONTROL_ENTRY_CONSTANTS; + +/* + * Index entry flags (16-bit). + */ +enum { + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a + sub-node, i.e. a reference to an index block in form of + a virtual cluster number (see below). */ + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last + entry in an index block. The index entry does not + represent a file but it can point to a sub-node. */ + + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force + enum bit width to 16-bit. */ +} __attribute__ ((__packed__)); + +typedef le16 INDEX_ENTRY_FLAGS; + +/* + * This the index entry header (see below). + */ +typedef struct { +/* 0*/ union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; +/* 8*/ le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ +/* 10*/ le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ +/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ +/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ +/* sizeof() = 16 bytes */ +} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; + +/* + * This is an index entry. A sequence of such entries follows each INDEX_HEADER + * structure. Together they make up a complete index. The index follows either + * an index root attribute or an index allocation attribute. + * + * NOTE: Before NTFS 3.0 only filename attributes were indexed. + */ +typedef struct { +/*Ofs*/ +/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ + union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. 
*/ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; + le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ + le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ + INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ + le16 reserved; /* Reserved/align to 8-byte boundary. */ + +/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present + if INDEX_ENTRY_END bit in flags is not set. NOTE: On + NTFS versions before 3.0 the only valid key is the + FILE_NAME_ATTR. On NTFS 3.0+ the following + additional index keys are defined: */ + FILE_NAME_ATTR file_name;/* $I30 index in directories. */ + SII_INDEX_KEY sii; /* $SII index in $Secure. */ + SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ + GUID object_id; /* $O index in FILE_Extend/$ObjId: The + object_id of the mft record found in + the data part of the index. */ + REPARSE_INDEX_KEY reparse; /* $R index in + FILE_Extend/$Reparse. */ + SID sid; /* $O index in FILE_Extend/$Quota: + SID of the owner of the user_id. */ + le32 owner_id; /* $Q index in FILE_Extend/$Quota: + user_id of the owner of the quota + control entry in the data part of + the index. */ + } __attribute__ ((__packed__)) key; + /* The (optional) index data is inserted here when creating. */ + // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last + // eight bytes of this index entry contain the virtual + // cluster number of the index block that holds the + // entries immediately preceding the current entry (the + // vcn references the corresponding cluster in the data + // of the non-resident index allocation attribute). If + // the key_length is zero, then the vcn immediately + // follows the INDEX_ENTRY_HEADER. Regardless of + // key_length, the address of the 8-byte boundary + // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by + // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), + // where sizeof(VCN) can be hardcoded as 8 if wanted. */ +} __attribute__ ((__packed__)) INDEX_ENTRY; + +/* + * Attribute: Bitmap (0xb0). + * + * Contains an array of bits (aka a bitfield). + * + * When used in conjunction with the index allocation attribute, each bit + * corresponds to one index block within the index allocation attribute. Thus + * the number of bits in the bitmap * index block size / cluster size is the + * number of clusters in the index allocation attribute. + */ +typedef struct { + u8 bitmap[0]; /* Array of bits. */ +} __attribute__ ((__packed__)) BITMAP_ATTR; + +/* + * The reparse point tag defines the type of the reparse point. It also + * includes several flags, which further describe the reparse point. + * + * The reparse point tag is an unsigned 32-bit value divided in three parts: + * + * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of + * the reparse point. + * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. + * 3. The most significant three bits are flags describing the reparse point. + * They are defined as follows: + * bit 29: Name surrogate bit. If set, the filename is an alias for + * another object in the system. + * bit 30: High-latency bit. 
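(Illustrative aside, not part of the original header: walking the INDEX_ENTRY records that follow an INDEX_HEADER, using the entry length field, the INDEX_ENTRY_END terminator and the trailing sub-node VCN described in the commented-out note above. The function name is hypothetical and the body assumes the driver's usual endian helpers and an already de-protected index buffer.)

	static void walk_index_entries(const INDEX_HEADER *ih)
	{
		const u8 *start = (const u8 *)ih;
		const INDEX_ENTRY *ie;

		// entries_offset is relative to the start of the INDEX_HEADER.
		ie = (const INDEX_ENTRY *)(start +
				le32_to_cpu(ih->entries_offset));
		for (;;) {
			if (ie->flags & INDEX_ENTRY_NODE) {
				// The sub-node VCN occupies the last eight
				// bytes of the entry.
				VCN vcn = sle64_to_cpu(*(const leVCN *)
						((const u8 *)ie +
						le16_to_cpu(ie->length) -
						sizeof(VCN)));
				// ... descend into the index block at @vcn ...
			}
			if (ie->flags & INDEX_ENTRY_END)
				break;	// Last entry; it carries no key.
			// ... examine ie->key here, e.g. ie->key.file_name ...
			ie = (const INDEX_ENTRY *)((const u8 *)ie +
					le16_to_cpu(ie->length));
		}
	}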
If set, accessing the first byte of data will + * be slow. (E.g. the data is stored on a tape drive.) + * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User + * defined tags have to use zero here. + * + * These are the predefined reparse point tags: + */ +enum { + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), + + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), + + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), + + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), + + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), + + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), + + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), +}; + +/* + * Attribute: Reparse point (0xc0). + * + * NOTE: Can be resident or non-resident. + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + le16 reparse_data_length; /* Byte size of reparse data. */ + le16 reserved; /* Align to 8-byte boundary. */ + u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ +} __attribute__ ((__packed__)) REPARSE_POINT; + +/* + * Attribute: Extended attribute (EA) information (0xd0). + * + * NOTE: Always resident. (Is this true???) + */ +typedef struct { + le16 ea_length; /* Byte size of the packed extended + attributes. */ + le16 need_ea_count; /* The number of extended attributes which have + the NEED_EA bit set. */ + le32 ea_query_length; /* Byte size of the buffer required to query + the extended attributes when calling + ZwQueryEaFile() in Windows NT/2k. I.e. the + byte size of the unpacked extended + attributes. */ +} __attribute__ ((__packed__)) EA_INFORMATION; + +/* + * Extended attribute flags (8-bit). + */ +enum { + NEED_EA = 0x80 /* If set the file to which the EA belongs + cannot be interpreted without understanding + the associates extended attributes. */ +} __attribute__ ((__packed__)); + +typedef u8 EA_FLAGS; + +/* + * Attribute: Extended attribute (EA) (0xe0). + * + * NOTE: Can be resident or non-resident. + * + * Like the attribute list and the index buffer list, the EA attribute value is + * a sequence of EA_ATTR variable length records. + */ +typedef struct { + le32 next_entry_offset; /* Offset to the next EA_ATTR. */ + EA_FLAGS flags; /* Flags describing the EA. */ + u8 ea_name_length; /* Length of the name of the EA in bytes + excluding the '\0' byte terminator. */ + le16 ea_value_length; /* Byte size of the EA's value. */ + u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not + Unicode and it is zero terminated. */ + u8 ea_value[0]; /* The value of the EA. Immediately follows + the name. */ +} __attribute__ ((__packed__)) EA_ATTR; + +/* + * Attribute: Property set (0xf0). + * + * Intended to support Native Structure Storage (NSS) - a feature removed from + * NTFS 3.0 during beta testing. + */ +typedef struct { + /* Irrelevant as feature unused. */ +} __attribute__ ((__packed__)) PROPERTY_SET; + +/* + * Attribute: Logged utility stream (0x100). + * + * NOTE: Can be resident or non-resident. + * + * Operations on this attribute are logged to the journal ($LogFile) like + * normal metadata changes. + * + * Used by the Encrypting File System (EFS). 
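(Illustrative aside, not from the original source: pulling a reparse point tag apart along the bit layout described above; the helper names are hypothetical.)

	static inline bool reparse_tag_is_microsoft(le32 tag)
	{
		return (tag & IO_REPARSE_TAG_IS_MICROSOFT) != 0;	// Bit 31.
	}

	static inline u16 reparse_tag_type(le32 tag)
	{
		// Bits 0 to 15 hold the reparse point type; the reserved and
		// flag bits are masked off.
		return (u16)(le32_to_cpu(tag) & 0xffff);
	}

	// Example: IO_REPARSE_TAG_MOUNT_POINT above has the Microsoft bit set
	// and a type value of 0x0003.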
All encrypted files have this + * attribute with the name $EFS. + */ +typedef struct { + /* Can be anything the creator chooses. */ + /* EFS uses it as follows: */ + // FIXME: Type this info, verifying it along the way. (AIA) +} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; + +#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c new file mode 100644 index 00000000..1711b710 --- /dev/null +++ b/fs/ntfs/lcnalloc.c @@ -0,0 +1,1014 @@ +/* + * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include + +#include "lcnalloc.h" +#include "debug.h" +#include "bitmap.h" +#include "inode.h" +#include "volume.h" +#include "attrib.h" +#include "malloc.h" +#include "aops.h" +#include "ntfs.h" + +/** + * ntfs_cluster_free_from_rl_nolock - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - The volume lcn bitmap must be locked for writing on entry and is + * left locked on return. + */ +int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl) +{ + struct inode *lcnbmp_vi = vol->lcnbmp_ino; + int ret = 0; + + ntfs_debug("Entering."); + if (!rl) + return 0; + for (; rl->length; rl++) { + int err; + + if (rl->lcn < 0) + continue; + err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); + if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) + ret = err; + } + ntfs_debug("Done."); + return ret; +} + +/** + * ntfs_cluster_alloc - allocate clusters on an ntfs volume + * @vol: mounted ntfs volume on which to allocate the clusters + * @start_vcn: vcn to use for the first allocated cluster + * @count: number of clusters to allocate + * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) + * @zone: zone from which to allocate the clusters + * @is_extension: if 'true', this is an attribute extension + * + * Allocate @count clusters preferably starting at cluster @start_lcn or at the + * current allocator position if @start_lcn is -1, on the mounted ntfs volume + * @vol. @zone is either DATA_ZONE for allocation of normal clusters or + * MFT_ZONE for allocation of clusters for the master file table, i.e. the + * $MFT/$DATA attribute. + * + * @start_vcn specifies the vcn of the first allocated cluster. This makes + * merging the resulting runlist with the old runlist easier. 
+ * + * If @is_extension is 'true', the caller is allocating clusters to extend an + * attribute and if it is 'false', the caller is allocating clusters to fill a + * hole in an attribute. Practically the difference is that if @is_extension + * is 'true' the returned runlist will be terminated with LCN_ENOENT and if + * @is_extension is 'false' the runlist will be terminated with + * LCN_RL_NOT_MAPPED. + * + * You need to check the return value with IS_ERR(). If this is false, the + * function was successful and the return value is a runlist describing the + * allocated cluster(s). If IS_ERR() is true, the function failed and + * PTR_ERR() gives you the error code. + * + * Notes on the allocation algorithm + * ================================= + * + * There are two data zones. First is the area between the end of the mft zone + * and the end of the volume, and second is the area between the start of the + * volume and the start of the mft zone. On unmodified/standard NTFS 1.x + * volumes, the second data zone does not exist due to the mft zone being + * expanded to cover the start of the volume in order to reserve space for the + * mft bitmap attribute. + * + * This is not the prettiest function but the complexity stems from the need of + * implementing the mft vs data zoned approach and from the fact that we have + * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we + * need to cope with crossing over boundaries of two buffers. Further, the + * fact that the allocator allows for caller supplied hints as to the location + * of where allocation should begin and the fact that the allocator keeps track + * of where in the data zones the next natural allocation should occur, + * contribute to the complexity of the function. But it should all be + * worthwhile, because this allocator should: 1) be a full implementation of + * the MFT zone approach used by Windows NT, 2) cause reduction in + * fragmentation, and 3) be speedy in allocations (the code is not optimized + * for speed, but the algorithm is, so further speed improvements are probably + * possible). + * + * FIXME: We should be monitoring cluster allocation and increment the MFT zone + * size dynamically but this is something for the future. We will just cause + * heavier fragmentation by not doing it and I am not even sure Windows would + * grow the MFT zone dynamically, so it might even be correct not to do this. + * The overhead in doing dynamic MFT zone expansion would be very large and + * unlikely worth the effort. (AIA) + * + * TODO: I have added in double the required zone position pointer wrap around + * logic which can be optimized to having only one of the two logic sets. + * However, having the double logic will work fine, but if we have only one of + * the sets and we get it wrong somewhere, then we get into trouble, so + * removing the duplicate logic requires _very_ careful consideration of _all_ + * possible code paths. So at least for now, I am leaving the double logic - + * better safe than sorry... (AIA) + * + * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. 
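(Illustrative usage sketch, not part of the original kernel-doc: a typical call allocating @nr clusters from the data zone to extend an attribute, following the IS_ERR()/PTR_ERR() contract described above.)

	runlist_element *rl;

	rl = ntfs_cluster_alloc(vol, 0, nr, -1, DATA_ZONE, true);
	if (IS_ERR(rl))
		return PTR_ERR(rl);
	// On success @rl describes the allocated clusters and, because
	// @is_extension was true, is terminated with LCN_ENOENT.  Merging it
	// into the attribute's runlist and eventually freeing it (e.g. with
	// ntfs_free()) is up to the caller and is not shown here.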
+ */ +runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, + const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension) +{ + LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; + LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; + s64 clusters; + loff_t i_size; + struct inode *lcnbmp_vi; + runlist_element *rl = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *buf, *byte; + int err = 0, rlpos, rlsize, buf_size; + u8 pass, done_zones, search_zone, need_writeback = 0, bit; + + ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " + "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, + (unsigned long long)count, + (unsigned long long)start_lcn, + zone == MFT_ZONE ? "MFT" : "DATA"); + BUG_ON(!vol); + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < 0); + BUG_ON(start_lcn < -1); + BUG_ON(zone < FIRST_ZONE); + BUG_ON(zone > LAST_ZONE); + + /* Return NULL if @count is zero. */ + if (!count) + return NULL; + /* Take the lcnbmp lock for writing. */ + down_write(&vol->lcnbmp_lock); + /* + * If no specific @start_lcn was requested, use the current data zone + * position, otherwise use the requested @start_lcn but make sure it + * lies outside the mft zone. Also set done_zones to 0 (no zones done) + * and pass depending on whether we are starting inside a zone (1) or + * at the beginning of a zone (2). If requesting from the MFT_ZONE, + * we either start at the current position within the mft zone or at + * the specified position. If the latter is out of bounds then we start + * at the beginning of the MFT_ZONE. + */ + done_zones = 0; + pass = 1; + /* + * zone_start and zone_end are the current search range. search_zone + * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of + * volume) and 4 for data zone 2 (start of volume till start of mft + * zone). + */ + zone_start = start_lcn; + if (zone_start < 0) { + if (zone == DATA_ZONE) + zone_start = vol->data1_zone_pos; + else + zone_start = vol->mft_zone_pos; + if (!zone_start) { + /* + * Zone starts at beginning of volume which means a + * single pass is sufficient. + */ + pass = 2; + } + } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && + zone_start < vol->mft_zone_end) { + zone_start = vol->mft_zone_end; + /* + * Starting at beginning of data1_zone which means a single + * pass in this zone is sufficient. + */ + pass = 2; + } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || + zone_start >= vol->mft_zone_end)) { + zone_start = vol->mft_lcn; + if (!vol->mft_zone_end) + zone_start = 0; + /* + * Starting at beginning of volume which means a single pass + * is sufficient. + */ + pass = 2; + } + if (zone == MFT_ZONE) { + zone_end = vol->mft_zone_end; + search_zone = 1; + } else /* if (zone == DATA_ZONE) */ { + /* Skip searching the mft zone. */ + done_zones |= 1; + if (zone_start >= vol->mft_zone_end) { + zone_end = vol->nr_clusters; + search_zone = 2; + } else { + zone_end = vol->mft_zone_start; + search_zone = 4; + } + } + /* + * bmp_pos is the current bit position inside the bitmap. We use + * bmp_initial_pos to determine whether or not to do a zone switch. + */ + bmp_pos = bmp_initial_pos = zone_start; + + /* Loop until all clusters are allocated, i.e. clusters == 0. 
*/ + clusters = count; + rlpos = rlsize = 0; + mapping = lcnbmp_vi->i_mapping; + i_size = i_size_read(lcnbmp_vi); + while (1) { + ntfs_debug("Start of outer while loop: done_zones 0x%x, " + "search_zone %i, pass %i, zone_start 0x%llx, " + "zone_end 0x%llx, bmp_initial_pos 0x%llx, " + "bmp_pos 0x%llx, rlpos %i, rlsize %i.", + done_zones, search_zone, pass, + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_initial_pos, + (unsigned long long)bmp_pos, rlpos, rlsize); + /* Loop until we run out of free clusters. */ + last_read_pos = bmp_pos >> 3; + ntfs_debug("last_read_pos 0x%llx.", + (unsigned long long)last_read_pos); + if (last_read_pos > i_size) { + ntfs_debug("End of attribute reached. " + "Skipping to zone_pass_done."); + goto zone_pass_done; + } + if (likely(page)) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + page = ntfs_map_page(mapping, last_read_pos >> + PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + ntfs_error(vol->sb, "Failed to map page."); + goto out; + } + buf_size = last_read_pos & ~PAGE_CACHE_MASK; + buf = page_address(page) + buf_size; + buf_size = PAGE_CACHE_SIZE - buf_size; + if (unlikely(last_read_pos + buf_size > i_size)) + buf_size = i_size - last_read_pos; + buf_size <<= 3; + lcn = bmp_pos & 7; + bmp_pos &= ~(LCN)7; + ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " + "bmp_pos 0x%llx, need_writeback %i.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + while (lcn < buf_size && lcn + bmp_pos < zone_end) { + byte = buf + (lcn >> 3); + ntfs_debug("In inner while loop: buf_size %i, " + "lcn 0x%llx, bmp_pos 0x%llx, " + "need_writeback %i, byte ofs 0x%x, " + "*byte 0x%x.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + need_writeback, + (unsigned int)(lcn >> 3), + (unsigned int)*byte); + /* Skip full bytes. */ + if (*byte == 0xff) { + lcn = (lcn + 8) & ~(LCN)7; + ntfs_debug("Continuing while loop 1."); + continue; + } + bit = 1 << (lcn & 7); + ntfs_debug("bit 0x%x.", bit); + /* If the bit is already set, go onto the next one. */ + if (*byte & bit) { + lcn++; + ntfs_debug("Continuing while loop 2."); + continue; + } + /* + * Allocate more memory if needed, including space for + * the terminator element. + * ntfs_malloc_nofs() operates on whole pages only. + */ + if ((rlpos + 2) * sizeof(*rl) > rlsize) { + runlist_element *rl2; + + ntfs_debug("Reallocating memory."); + if (!rl) + ntfs_debug("First free bit is at LCN " + "0x%llx.", + (unsigned long long) + (lcn + bmp_pos)); + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + err = -ENOMEM; + ntfs_error(vol->sb, "Failed to " + "allocate memory."); + goto out; + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + ntfs_debug("Reallocated memory, rlsize 0x%x.", + rlsize); + } + /* Allocate the bitmap bit. */ + *byte |= bit; + /* We need to write this bitmap page to disk. */ + need_writeback = 1; + ntfs_debug("*byte 0x%x, need_writeback is set.", + (unsigned int)*byte); + /* + * Coalesce with previous run if adjacent LCNs. + * Otherwise, append a new run. 
+ */ + ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " + "prev_lcn 0x%llx, lcn 0x%llx, " + "bmp_pos 0x%llx, prev_run_len 0x%llx, " + "rlpos %i.", + (unsigned long long)(lcn + bmp_pos), + 1ULL, (unsigned long long)prev_lcn, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + (unsigned long long)prev_run_len, + rlpos); + if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { + ntfs_debug("Coalescing to run (lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos - 1].length = ++prev_run_len; + ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " + "prev_run_len 0x%llx.", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length, + (unsigned long long) + prev_run_len); + } else { + if (likely(rlpos)) { + ntfs_debug("Adding new run, (previous " + "run lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos].vcn = rl[rlpos - 1].vcn + + prev_run_len; + } else { + ntfs_debug("Adding new run, is first " + "run."); + rl[rlpos].vcn = start_vcn; + } + rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; + rl[rlpos].length = prev_run_len = 1; + rlpos++; + } + /* Done? */ + if (!--clusters) { + LCN tc; + /* + * Update the current zone position. Positions + * of already scanned zones have been updated + * during the respective zone switches. + */ + tc = lcn + bmp_pos + 1; + ntfs_debug("Done. Updating current zone " + "position, tc 0x%llx, " + "search_zone %i.", + (unsigned long long)tc, + search_zone); + switch (search_zone) { + case 1: + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + break; + case 2: + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + break; + case 4: + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + break; + default: + BUG(); + } + ntfs_debug("Finished. 
Going to out."); + goto out; + } + lcn++; + } + bmp_pos += buf_size; + ntfs_debug("After inner while loop: buf_size 0x%x, lcn " + "0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + if (bmp_pos < zone_end) { + ntfs_debug("Continuing outer while loop, " + "bmp_pos 0x%llx, zone_end 0x%llx.", + (unsigned long long)bmp_pos, + (unsigned long long)zone_end); + continue; + } +zone_pass_done: /* Finished with the current zone pass. */ + ntfs_debug("At zone_pass_done, pass %i.", pass); + if (pass == 1) { + /* + * Now do pass 2, scanning the first part of the zone + * we omitted in pass 1. + */ + pass = 2; + zone_end = zone_start; + switch (search_zone) { + case 1: /* mft_zone */ + zone_start = vol->mft_zone_start; + break; + case 2: /* data1_zone */ + zone_start = vol->mft_zone_end; + break; + case 4: /* data2_zone */ + zone_start = 0; + break; + default: + BUG(); + } + /* Sanity check. */ + if (zone_end < zone_start) + zone_end = zone_start; + bmp_pos = zone_start; + ntfs_debug("Continuing outer while loop, pass 2, " + "zone_start 0x%llx, zone_end 0x%llx, " + "bmp_pos 0x%llx.", + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_pos); + continue; + } /* pass == 2 */ +done_zones_check: + ntfs_debug("At done_zones_check, search_zone %i, done_zones " + "before 0x%x, done_zones after 0x%x.", + search_zone, done_zones, + done_zones | search_zone); + done_zones |= search_zone; + if (done_zones < 7) { + ntfs_debug("Switching zone."); + /* Now switch to the next zone we haven't done yet. */ + pass = 1; + switch (search_zone) { + case 1: + ntfs_debug("Switching from mft zone to data1 " + "zone."); + /* Update mft zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + } + /* Switch from mft zone to data1 zone. */ +switch_to_data1_zone: search_zone = 2; + zone_start = bmp_initial_pos = + vol->data1_zone_pos; + zone_end = vol->nr_clusters; + if (zone_start == vol->mft_zone_end) + pass = 2; + if (zone_start >= zone_end) { + vol->data1_zone_pos = zone_start = + vol->mft_zone_end; + pass = 2; + } + break; + case 2: + ntfs_debug("Switching from data1 zone to " + "data2 zone."); + /* Update data1 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + } + /* Switch from data1 zone to data2 zone. 
*/ + search_zone = 4; + zone_start = bmp_initial_pos = + vol->data2_zone_pos; + zone_end = vol->mft_zone_start; + if (!zone_start) + pass = 2; + if (zone_start >= zone_end) { + vol->data2_zone_pos = zone_start = + bmp_initial_pos = 0; + pass = 2; + } + break; + case 4: + ntfs_debug("Switching from data2 zone to " + "data1 zone."); + /* Update data2 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + } + /* Switch from data2 zone to data1 zone. */ + goto switch_to_data1_zone; + default: + BUG(); + } + ntfs_debug("After zone switch, search_zone %i, " + "pass %i, bmp_initial_pos 0x%llx, " + "zone_start 0x%llx, zone_end 0x%llx.", + search_zone, pass, + (unsigned long long)bmp_initial_pos, + (unsigned long long)zone_start, + (unsigned long long)zone_end); + bmp_pos = zone_start; + if (zone_start == zone_end) { + ntfs_debug("Empty zone, going to " + "done_zones_check."); + /* Empty zone. Don't bother searching it. */ + goto done_zones_check; + } + ntfs_debug("Continuing outer while loop."); + continue; + } /* done_zones == 7 */ + ntfs_debug("All zones are finished."); + /* + * All zones are finished! If DATA_ZONE, shrink mft zone. If + * MFT_ZONE, we have really run out of space. + */ + mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; + ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " + "0x%llx, mft_zone_size 0x%llx.", + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)mft_zone_size); + if (zone == MFT_ZONE || mft_zone_size <= 0) { + ntfs_debug("No free clusters left, going to out."); + /* Really no more space left on device. */ + err = -ENOSPC; + goto out; + } /* zone == DATA_ZONE && mft_zone_size > 0 */ + ntfs_debug("Shrinking mft zone."); + zone_end = vol->mft_zone_end; + mft_zone_size >>= 1; + if (mft_zone_size > 0) + vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; + else /* mft zone and data2 zone no longer exist. */ + vol->data2_zone_pos = vol->mft_zone_start = + vol->mft_zone_end = 0; + if (vol->mft_zone_pos >= vol->mft_zone_end) { + vol->mft_zone_pos = vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } + bmp_pos = zone_start = bmp_initial_pos = + vol->data1_zone_pos = vol->mft_zone_end; + search_zone = 2; + pass = 2; + done_zones &= ~2; + ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " + "vol->mft_zone_start 0x%llx, " + "vol->mft_zone_end 0x%llx, " + "vol->mft_zone_pos 0x%llx, search_zone 2, " + "pass 2, dones_zones 0x%x, zone_start 0x%llx, " + "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " + "continuing outer while loop.", + (unsigned long long)mft_zone_size, + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)vol->mft_zone_pos, + done_zones, (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)vol->data1_zone_pos); + } + ntfs_debug("After outer while loop."); +out: + ntfs_debug("At out."); + /* Add runlist terminator element. */ + if (likely(rl)) { + rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; + rl[rlpos].lcn = is_extension ? 
LCN_ENOENT : LCN_RL_NOT_MAPPED; + rl[rlpos].length = 0; + } + if (likely(page && !IS_ERR(page))) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + if (likely(!err)) { + up_write(&vol->lcnbmp_lock); + ntfs_debug("Done."); + return rl; + } + ntfs_error(vol->sb, "Failed to allocate clusters, aborting " + "(error %i).", err); + if (rl) { + int err2; + + if (err == -ENOSPC) + ntfs_debug("Not enough space to complete allocation, " + "err -ENOSPC, first free lcn 0x%llx, " + "could allocate up to 0x%llx " + "clusters.", + (unsigned long long)rl[0].lcn, + (unsigned long long)(count - clusters)); + /* Deallocate all allocated clusters. */ + ntfs_debug("Attempting rollback..."); + err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); + if (err2) { + ntfs_error(vol->sb, "Failed to rollback (error %i). " + "Leaving inconsistent metadata! " + "Unmount and run chkdsk.", err2); + NVolSetErrors(vol); + } + /* Free the runlist. */ + ntfs_free(rl); + } else if (err == -ENOSPC) + ntfs_debug("No space left at all, err = -ENOSPC, first free " + "lcn = 0x%llx.", + (long long)vol->data1_zone_pos); + up_write(&vol->lcnbmp_lock); + return ERR_PTR(err); +} + +/** + * __ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * @is_rollback: true if this is a rollback operation + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the vfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when __ntfs_cluster_free() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will + * perform the necessary mapping and unmapping. + * + * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_cluster_free() instead. + * + * Note, __ntfs_cluster_free() does not modify the runlist, so you have to + * remove from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. 
you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, + ntfs_attr_search_ctx *ctx, const bool is_rollback) +{ + s64 delta, to_free, total_freed, real_freed; + ntfs_volume *vol; + struct inode *lcnbmp_vi; + runlist_element *rl; + int err; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " + "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, + (unsigned long long)count, + is_rollback ? " (rollback)" : ""); + vol = ni->vol; + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < -1); + /* + * Lock the lcn bitmap for writing but only if not rolling back. We + * must hold the lock all the way including through rollback otherwise + * rollback is not possible because once we have cleared a bit and + * dropped the lock, anyone could have set the bit again, thus + * allocating the cluster for another use. + */ + if (likely(!is_rollback)) + down_write(&vol->lcnbmp_lock); + + total_freed = real_freed = 0; + + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); + if (IS_ERR(rl)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to find first runlist " + "element (error %li), aborting.", + PTR_ERR(rl)); + err = PTR_ERR(rl); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "First runlist element has " + "invalid lcn, aborting."); + err = -EIO; + goto err_out; + } + /* Find the starting cluster inside the run that needs freeing. */ + delta = start_vcn - rl->vcn; + + /* The number of clusters in this run that need freeing. */ + to_free = rl->length - delta; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in this run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear first run " + "(error %i), aborting.", err); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed = to_free; + }; + /* Go to the next run and adjust the number of clusters left to free. */ + ++rl; + if (count >= 0) + count -= to_free; + + /* Keep track of the total "freed" clusters, including sparse ones. */ + total_freed = to_free; + /* + * Loop over the remaining runs, using @count as a capping value, and + * free them. + */ + for (; rl->length && count != 0; ++rl) { + if (unlikely(rl->lcn < LCN_HOLE)) { + VCN vcn; + + /* Attempt to map runlist. 
*/ + vcn = rl->vcn; + rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (!is_rollback) + ntfs_error(vol->sb, "Failed to map " + "runlist fragment or " + "failed to find " + "subsequent runlist " + "element."); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "Runlist element " + "has invalid lcn " + "(0x%llx).", + (unsigned long long) + rl->lcn); + err = -EIO; + goto err_out; + } + } + /* The number of clusters in this run that need freeing. */ + to_free = rl->length; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in the run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear " + "subsequent run."); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed += to_free; + } + /* Adjust the number of clusters left to free. */ + if (count >= 0) + count -= to_free; + + /* Update the total done clusters. */ + total_freed += to_free; + } + if (likely(!is_rollback)) + up_write(&vol->lcnbmp_lock); + + BUG_ON(count > 0); + + /* We are done. Return the number of actually freed clusters. */ + ntfs_debug("Done."); + return real_freed; +err_out: + if (is_rollback) + return err; + /* If no real clusters were freed, no need to rollback. */ + if (!real_freed) { + up_write(&vol->lcnbmp_lock); + return err; + } + /* + * Attempt to rollback and if that succeeds just return the error code. + * If rollback fails, set the volume errors flag, emit an error + * message, and return the error code. + */ + delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); + if (delta < 0) { + ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " + "inconsistent metadata! Unmount and run " + "chkdsk.", (int)delta); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + ntfs_error(vol->sb, "Aborting (error %i).", err); + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h new file mode 100644 index 00000000..2adb0431 --- /dev/null +++ b/fs/ntfs/lcnalloc.h @@ -0,0 +1,145 @@ +/* + * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LCNALLOC_H +#define _LINUX_NTFS_LCNALLOC_H + +#ifdef NTFS_RW + +#include + +#include "attrib.h" +#include "types.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +typedef enum { + FIRST_ZONE = 0, /* For sanity checking. 
*/ + MFT_ZONE = 0, /* Allocate from $MFT zone. */ + DATA_ZONE = 1, /* Allocate from $DATA zone. */ + LAST_ZONE = 1, /* For sanity checking. */ +} NTFS_CLUSTER_ALLOCATION_ZONES; + +extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, + const VCN start_vcn, const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension); + +extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); + +/** + * ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the ntfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_cluster_free() encounters unmapped runlist + * fragments and allows their mapping. If you do not have the mft record + * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform + * the necessary mapping and unmapping. + * + * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove + * from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. 
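The calling convention documented above can be condensed into a minimal caller sketch; this is an illustrative fragment that assumes @ni, @ctx, @m and @a have been set up by the caller as described, not code from the patch:

        s64 nr;

        nr = ntfs_cluster_free(ni, 0, -1, ctx);  /* free all clusters */
        if (IS_ERR(ctx->mrec))                   /* @ctx is no longer valid */
                return PTR_ERR(ctx->mrec);
        if (nr < 0)                              /* -errno from the free */
                return nr;
        m = ctx->mrec;                           /* re-cache moved pointers */
        a = ctx->attr;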
+ */ +static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx) +{ + return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); +} + +extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl); + +/** + * ntfs_cluster_free_from_rl - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - The caller must have locked the runlist @rl for reading or + * writing. + */ +static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, + const runlist_element *rl) +{ + int ret; + + down_write(&vol->lcnbmp_lock); + ret = ntfs_cluster_free_from_rl_nolock(vol, rl); + up_write(&vol->lcnbmp_lock); + return ret; +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c new file mode 100644 index 00000000..c71de292 --- /dev/null +++ b/fs/ntfs/logfile.c @@ -0,0 +1,863 @@ +/* + * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2002-2007 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "debug.h" +#include "logfile.h" +#include "malloc.h" +#include "volume.h" +#include "ntfs.h" + +/** + * ntfs_check_restart_page_header - check the page header for consistency + * @vi: $LogFile inode to which the restart page header belongs + * @rp: restart page header to check + * @pos: position in @vi at which the restart page header resides + * + * Check the restart page header @rp for consistency and return 'true' if it is + * consistent and 'false' otherwise. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_page_header(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos) +{ + u32 logfile_system_page_size, logfile_log_page_size; + u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; + bool have_usa = true; + + ntfs_debug("Entering."); + /* + * If the system or log page sizes are smaller than the ntfs block size + * or either is not a power of 2 we cannot handle this log file. 
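The power-of-two requirement stated above is enforced just below in two equivalent spellings; as a quick illustration (not part of the patch), both forms reject 6144 and accept 4096:

        /* Non-zero result means "not a power of two", e.g. 6144 & 6143 != 0. */
        not_pow2 = logfile_system_page_size & (logfile_system_page_size - 1);
        /* Equivalent helper form, e.g. is_power_of_2(4096) is true. */
        pow2 = is_power_of_2(logfile_log_page_size);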
+ */ + logfile_system_page_size = le32_to_cpu(rp->system_page_size); + logfile_log_page_size = le32_to_cpu(rp->log_page_size); + if (logfile_system_page_size < NTFS_BLOCK_SIZE || + logfile_log_page_size < NTFS_BLOCK_SIZE || + logfile_system_page_size & + (logfile_system_page_size - 1) || + !is_power_of_2(logfile_log_page_size)) { + ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); + return false; + } + /* + * We must be either at !pos (1st restart page) or at pos = system page + * size (2nd restart page). + */ + if (pos && pos != logfile_system_page_size) { + ntfs_error(vi->i_sb, "Found restart area in incorrect " + "position in $LogFile."); + return false; + } + /* We only know how to handle version 1.1. */ + if (sle16_to_cpu(rp->major_ver) != 1 || + sle16_to_cpu(rp->minor_ver) != 1) { + ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " + "supported. (This driver supports version " + "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), + (int)sle16_to_cpu(rp->minor_ver)); + return false; + } + /* + * If chkdsk has been run the restart page may not be protected by an + * update sequence array. + */ + if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { + have_usa = false; + goto skip_usa_checks; + } + /* Verify the size of the update sequence array. */ + usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); + if (usa_count != le16_to_cpu(rp->usa_count)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array count."); + return false; + } + /* Verify the position of the update sequence array. */ + usa_ofs = le16_to_cpu(rp->usa_ofs); + usa_end = usa_ofs + usa_count * sizeof(u16); + if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || + usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array offset."); + return false; + } +skip_usa_checks: + /* + * Verify the position of the restart area. It must be: + * - aligned to 8-byte boundary, + * - after the update sequence array, and + * - within the system page size. + */ + ra_ofs = le16_to_cpu(rp->restart_area_offset); + if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : + ra_ofs < sizeof(RESTART_PAGE_HEADER)) || + ra_ofs > logfile_system_page_size) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent restart area offset."); + return false; + } + /* + * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn + * set. + */ + if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { + ntfs_error(vi->i_sb, "$LogFile restart page is not modified " + "by chkdsk but a chkdsk LSN is specified."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_restart_area - check the restart area for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose restart area to check + * + * Check the restart area of the restart page @rp for consistency and return + * 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header has already been + * consistency checked. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. 
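As a worked example of the update sequence array sizing rules above, assuming the usual 512-byte NTFS block (NTFS_BLOCK_SIZE_BITS == 9) and a 4096-byte system page (illustrative numbers only):

        usa_count = 1 + (4096 >> 9);            /* = 1 + 8 = 9 entries */
        usa_end   = usa_ofs + 9 * sizeof(u16);  /* = usa_ofs + 18 bytes */
        /* ...which must not reach past NTFS_BLOCK_SIZE - sizeof(u16) = 510. */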
+ */ +static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) +{ + u64 file_size; + RESTART_AREA *ra; + u16 ra_ofs, ra_len, ca_ofs; + u8 fs_bits; + + ntfs_debug("Entering."); + ra_ofs = le16_to_cpu(rp->restart_area_offset); + ra = (RESTART_AREA*)((u8*)rp + ra_ofs); + /* + * Everything before ra->file_size must be before the first word + * protected by an update sequence number. This ensures that it is + * safe to access ra->client_array_offset. + */ + if (ra_ofs + offsetof(RESTART_AREA, file_size) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent file offset."); + return false; + } + /* + * Now that we can access ra->client_array_offset, make sure everything + * up to the log client array is before the first word protected by an + * update sequence number. This ensures we can access all of the + * restart area elements safely. Also, the client array offset must be + * aligned to an 8-byte boundary. + */ + ca_ofs = le16_to_cpu(ra->client_array_offset); + if (((ca_ofs + 7) & ~7) != ca_ofs || + ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent client array offset."); + return false; + } + /* + * The restart area must end within the system page size both when + * calculated manually and as specified by ra->restart_area_length. + * Also, the calculated length must not exceed the specified length. + */ + ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * + sizeof(LOG_CLIENT_RECORD); + if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || + ra_ofs + le16_to_cpu(ra->restart_area_length) > + le32_to_cpu(rp->system_page_size) || + ra_len > le16_to_cpu(ra->restart_area_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " + "of the system page size specified by the " + "restart page header and/or the specified " + "restart area length is inconsistent."); + return false; + } + /* + * The ra->client_free_list and ra->client_in_use_list must be either + * LOGFILE_NO_CLIENT or less than ra->log_clients or they are + * overflowing the client array. + */ + if ((ra->client_free_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_free_list) >= + le16_to_cpu(ra->log_clients)) || + (ra->client_in_use_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_in_use_list) >= + le16_to_cpu(ra->log_clients))) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "overflowing client free and/or in use lists."); + return false; + } + /* + * Check ra->seq_number_bits against ra->file_size for consistency. + * We cannot just use ffs() because the file size is not a power of 2. + */ + file_size = (u64)sle64_to_cpu(ra->file_size); + fs_bits = 0; + while (file_size) { + file_size >>= 1; + fs_bits++; + } + if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent sequence number bits."); + return false; + } + /* The log record header length must be a multiple of 8. */ + if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != + le16_to_cpu(ra->log_record_header_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log record header length."); + return false; + } + /* Dito for the log page data offset. 
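The sequence-number-bits check above becomes clearer with concrete numbers; an illustrative calculation for a hypothetical 4 MiB $LogFile:

        u64 file_size = 0x400000;       /* 4 MiB, i.e. 1 << 22 */
        u8 fs_bits = 0;

        while (file_size) {             /* same loop as above */
                file_size >>= 1;
                fs_bits++;
        }
        /* fs_bits is now 23, so the restart area must specify
           seq_number_bits == 67 - 23 == 44 to be accepted. */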
*/ + if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != + le16_to_cpu(ra->log_page_data_offset)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log page data offset."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_log_client_array - check the log client array for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose log client array to check + * + * Check the log client array of the restart page @rp for consistency and + * return 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header and the restart area have + * already been consistency checked. + * + * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this + * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full + * restart page and the page must be multi sector transfer deprotected. + */ +static bool ntfs_check_log_client_array(struct inode *vi, + RESTART_PAGE_HEADER *rp) +{ + RESTART_AREA *ra; + LOG_CLIENT_RECORD *ca, *cr; + u16 nr_clients, idx; + bool in_free_list, idx_is_first; + + ntfs_debug("Entering."); + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + ca = (LOG_CLIENT_RECORD*)((u8*)ra + + le16_to_cpu(ra->client_array_offset)); + /* + * Check the ra->client_free_list first and then check the + * ra->client_in_use_list. Check each of the log client records in + * each of the lists and check that the array does not overflow the + * ra->log_clients value. Also keep track of the number of records + * visited as there cannot be more than ra->log_clients records and + * that way we detect eventual loops in within a list. + */ + nr_clients = le16_to_cpu(ra->log_clients); + idx = le16_to_cpu(ra->client_free_list); + in_free_list = true; +check_list: + for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, + idx = le16_to_cpu(cr->next_client)) { + if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) + goto err_out; + /* Set @cr to the current log client record. */ + cr = ca + idx; + /* The first log client record must not have a prev_client. */ + if (idx_is_first) { + if (cr->prev_client != LOGFILE_NO_CLIENT) + goto err_out; + idx_is_first = false; + } + } + /* Switch to and check the in use list if we just did the free list. */ + if (in_free_list) { + in_free_list = false; + idx = le16_to_cpu(ra->client_in_use_list); + goto check_list; + } + ntfs_debug("Done."); + return true; +err_out: + ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); + return false; +} + +/** + * ntfs_check_and_load_restart_page - check the restart page for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page to check + * @pos: position in @vi at which the restart page resides + * @wrp: [OUT] copy of the multi sector transfer deprotected restart page + * @lsn: [OUT] set to the current logfile lsn on success + * + * Check the restart page @rp for consistency and return 0 if it is consistent + * and -errno otherwise. The restart page may have been modified by chkdsk in + * which case its magic is CHKD instead of RSTR. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + * + * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a + * copy of the complete multi sector transfer deprotected page. On failure, + * *@wrp is undefined. 
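The log client array walk above avoids infinite loops by bounding the traversal with the declared number of clients rather than by marking visited records; in isolation the pattern looks like this (illustrative sketch, the names head, next[], nr_entries and END_MARKER are hypothetical):

        u16 idx = head;
        u16 budget = nr_entries;

        while (idx != END_MARKER) {
                if (!budget-- || idx >= nr_entries)
                        return false;   /* cycle or out-of-range index */
                idx = next[idx];
        }
        return true;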
+ * + * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current + * logfile lsn according to this restart page. On failure, *@lsn is undefined. + * + * The following error codes are defined: + * -EINVAL - The restart page is inconsistent. + * -ENOMEM - Not enough memory to load the restart page. + * -EIO - Failed to reading from $LogFile. + */ +static int ntfs_check_and_load_restart_page(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, + LSN *lsn) +{ + RESTART_AREA *ra; + RESTART_PAGE_HEADER *trp; + int size, err; + + ntfs_debug("Entering."); + /* Check the restart page header for consistency. */ + if (!ntfs_check_restart_page_header(vi, rp, pos)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + /* Check the restart area for consistency. */ + if (!ntfs_check_restart_area(vi, rp)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * Allocate a buffer to store the whole restart page so we can multi + * sector transfer deprotect it. + */ + trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); + if (!trp) { + ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " + "restart page buffer."); + return -ENOMEM; + } + /* + * Read the whole of the restart page into the buffer. If it fits + * completely inside @rp, just copy it from there. Otherwise map all + * the required pages and copy the data from them. + */ + size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK); + if (size >= le32_to_cpu(rp->system_page_size)) { + memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); + } else { + pgoff_t idx; + struct page *page; + int have_read, to_read; + + /* First copy what we already have in @rp. */ + memcpy(trp, rp, size); + /* Copy the remaining data one page at a time. */ + have_read = size; + to_read = le32_to_cpu(rp->system_page_size) - size; + idx = (pos + size) >> PAGE_CACHE_SHIFT; + BUG_ON((pos + size) & ~PAGE_CACHE_MASK); + do { + page = ntfs_map_page(vi->i_mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vi->i_sb, "Error mapping $LogFile " + "page (index %lu).", idx); + err = PTR_ERR(page); + if (err != -EIO && err != -ENOMEM) + err = -EIO; + goto err_out; + } + size = min_t(int, to_read, PAGE_CACHE_SIZE); + memcpy((u8*)trp + have_read, page_address(page), size); + ntfs_unmap_page(page); + have_read += size; + to_read -= size; + idx++; + } while (to_read > 0); + } + /* + * Perform the multi sector transfer deprotection on the buffer if the + * restart page is protected. + */ + if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) + && post_read_mst_fixup((NTFS_RECORD*)trp, + le32_to_cpu(rp->system_page_size))) { + /* + * A multi sector tranfer error was detected. We only need to + * abort if the restart page contents exceed the multi sector + * transfer fixup of the first sector. + */ + if (le16_to_cpu(rp->restart_area_offset) + + le16_to_cpu(ra->restart_area_length) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "Multi sector transfer error " + "detected in $LogFile restart page."); + err = -EINVAL; + goto err_out; + } + } + /* + * If the restart page is modified by chkdsk or there are no active + * logfile clients, the logfile is consistent. Otherwise, need to + * check the log client records for consistency, too. 
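A concrete trace of the copy-across-pages path above may help (illustrative numbers: 4096-byte page cache pages, an 8192-byte system page, and the second restart page at pos == 8192):

        size = 4096 - (8192 & 4095);    /* = 4096 bytes available in @rp */
        /* 4096 < 8192, so the first 4096 bytes are memcpy()d from @rp,
           then page index (8192 + 4096) >> 12 == 3 is mapped and the
           remaining 4096 bytes are copied from it. */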
+ */ + err = 0; + if (ntfs_is_rstr_record(rp->magic) && + ra->client_in_use_list != LOGFILE_NO_CLIENT) { + if (!ntfs_check_log_client_array(vi, trp)) { + err = -EINVAL; + goto err_out; + } + } + if (lsn) { + if (ntfs_is_rstr_record(rp->magic)) + *lsn = sle64_to_cpu(ra->current_lsn); + else /* if (ntfs_is_chkd_record(rp->magic)) */ + *lsn = sle64_to_cpu(rp->chkdsk_lsn); + } + ntfs_debug("Done."); + if (wrp) + *wrp = trp; + else { +err_out: + ntfs_free(trp); + } + return err; +} + +/** + * ntfs_check_logfile - check the journal for consistency + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: [OUT] on success this is a copy of the current restart page + * + * Check the $LogFile journal for consistency and return 'true' if it is + * consistent and 'false' if not. On success, the current restart page is + * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. + * + * At present we only check the two restart pages and ignore the log record + * pages. + * + * Note that the MstProtected flag is not set on the $LogFile inode and hence + * when reading pages they are not deprotected. This is because we do not know + * if the $LogFile was created on a system with a different page size to ours + * yet and mst deprotection would fail if our page size is smaller. + */ +bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) +{ + s64 size, pos; + LSN rstr1_lsn, rstr2_lsn; + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + struct address_space *mapping = log_vi->i_mapping; + struct page *page = NULL; + u8 *kaddr = NULL; + RESTART_PAGE_HEADER *rstr1_ph = NULL; + RESTART_PAGE_HEADER *rstr2_ph = NULL; + int log_page_size, log_page_mask, err; + bool logfile_is_empty = true; + u8 log_page_bits; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) + goto is_empty; + size = i_size_read(log_vi); + /* Make sure the file doesn't exceed the maximum allowed size. */ + if (size > MaxLogFileSize) + size = MaxLogFileSize; + /* + * Truncate size to a multiple of the page cache size or the default + * log page size if the page cache size is between the default log page + * log page size if the page cache size is between the default log page + * size and twice that. + */ + if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <= + DefaultLogPageSize * 2) + log_page_size = DefaultLogPageSize; + else + log_page_size = PAGE_CACHE_SIZE; + log_page_mask = log_page_size - 1; + /* + * Use ntfs_ffs() instead of ffs() to enable the compiler to + * optimize log_page_size and log_page_bits into constants. + */ + log_page_bits = ntfs_ffs(log_page_size) - 1; + size &= ~(s64)(log_page_size - 1); + /* + * Ensure the log file is big enough to store at least the two restart + * pages and the minimum number of log record pages. + */ + if (size < log_page_size * 2 || (size - log_page_size * 2) >> + log_page_bits < MinLogRecordPages) { + ntfs_error(vol->sb, "$LogFile is too small."); + return false; + } + /* + * Read through the file looking for a restart page. Since the restart + * page header is at the beginning of a page we only need to search at + * what could be the beginning of a page (for each page size) rather + * than scanning the whole file byte by byte. If all potential places + * contain empty and uninitialzed records, the log file can be assumed + * to be empty. 
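Plugging the defaults into the minimum-size check above makes it concrete (DefaultLogPageSize and MinLogRecordPages are defined in logfile.h later in this patch; illustrative arithmetic only):

        /* log_page_size == DefaultLogPageSize == 4096 and
           MinLogRecordPages == 48, so the smallest acceptable $LogFile is
           (2 restart pages + 48 log record pages) * 4096 == 204800 bytes,
           i.e. 200 KiB. */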
+ */ + for (pos = 0; pos < size; pos <<= 1) { + pgoff_t idx = pos >> PAGE_CACHE_SHIFT; + if (!page || page->index != idx) { + if (page) + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Error mapping $LogFile " + "page (index %lu).", idx); + goto err_out; + } + } + kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK); + /* + * A non-empty block means the logfile is not empty while an + * empty block after a non-empty block has been encountered + * means we are done. + */ + if (!ntfs_is_empty_recordp((le32*)kaddr)) + logfile_is_empty = false; + else if (!logfile_is_empty) + break; + /* + * A log record page means there cannot be a restart page after + * this so no need to continue searching. + */ + if (ntfs_is_rcrd_recordp((le32*)kaddr)) + break; + /* If not a (modified by chkdsk) restart page, continue. */ + if (!ntfs_is_rstr_recordp((le32*)kaddr) && + !ntfs_is_chkd_recordp((le32*)kaddr)) { + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * Check the (modified by chkdsk) restart page for consistency + * and get a copy of the complete multi sector transfer + * deprotected restart page. + */ + err = ntfs_check_and_load_restart_page(log_vi, + (RESTART_PAGE_HEADER*)kaddr, pos, + !rstr1_ph ? &rstr1_ph : &rstr2_ph, + !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); + if (!err) { + /* + * If we have now found the first (modified by chkdsk) + * restart page, continue looking for the second one. + */ + if (!pos) { + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * We have now found the second (modified by chkdsk) + * restart page, so we can stop looking. + */ + break; + } + /* + * Error output already done inside the function. Note, we do + * not abort if the restart page was invalid as we might still + * find a valid one further in the file. + */ + if (err != -EINVAL) { + ntfs_unmap_page(page); + goto err_out; + } + /* Continue looking. */ + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + } + if (page) + ntfs_unmap_page(page); + if (logfile_is_empty) { + NVolSetLogFileEmpty(vol); +is_empty: + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + if (!rstr1_ph) { + BUG_ON(rstr2_ph); + ntfs_error(vol->sb, "Did not find any restart pages in " + "$LogFile and it was not empty."); + return false; + } + /* If both restart pages were found, use the more recent one. */ + if (rstr2_ph) { + /* + * If the second restart area is more recent, switch to it. + * Otherwise just throw it away. + */ + if (rstr2_lsn > rstr1_lsn) { + ntfs_debug("Using second restart page as it is more " + "recent."); + ntfs_free(rstr1_ph); + rstr1_ph = rstr2_ph; + /* rstr1_lsn = rstr2_lsn; */ + } else { + ntfs_debug("Using first restart page as it is more " + "recent."); + ntfs_free(rstr2_ph); + } + rstr2_ph = NULL; + } + /* All consistency checks passed. */ + if (rp) + *rp = rstr1_ph; + else + ntfs_free(rstr1_ph); + ntfs_debug("Done."); + return true; +err_out: + if (rstr1_ph) + ntfs_free(rstr1_ph); + return false; +} + +/** + * ntfs_is_logfile_clean - check in the journal if the volume is clean + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: copy of the current restart page + * + * Analyze the $LogFile journal and return 'true' if it indicates the volume was + * shutdown cleanly and 'false' if not. + * + * At present we only look at the two restart pages and ignore the log record + * pages. This is a little bit crude in that there will be a very small number + * of cases where we think that a volume is dirty when in fact it is clean. 
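Written out, the probe positions visited by the scan loop above form a short, sparse sequence (illustrative note):

        /* pos starts at 0; it is then seeded with NTFS_BLOCK_SIZE >> 1 (256)
           and doubled by the loop update, so the offsets examined are
           0, 512, 1024, 2048, 4096, 8192, ... (while < size), which covers
           both legal restart page positions, 0 and rp->system_page_size,
           for any supported system page size. */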
+ * This should only affect volumes that have not been shutdown cleanly but did + * not have any pending, non-check-pointed i/o, i.e. they were completely idle + * at least for the five seconds preceding the unclean shutdown. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and in particular if the $LogFile + * is empty this function requires that NVolLogFileEmpty() is true otherwise an + * empty volume will be reported as dirty. + */ +bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) +{ + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + RESTART_AREA *ra; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + BUG_ON(!rp); + if (!ntfs_is_rstr_record(rp->magic) && + !ntfs_is_chkd_record(rp->magic)) { + ntfs_error(vol->sb, "Restart page buffer is invalid. This is " + "probably a bug in that the $LogFile should " + "have been consistency checked before calling " + "this function."); + return false; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * If the $LogFile has active clients, i.e. it is open, and we do not + * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, + * we assume there was an unclean shutdown. + */ + if (ra->client_in_use_list != LOGFILE_NO_CLIENT && + !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { + ntfs_debug("Done. $LogFile indicates a dirty shutdown."); + return false; + } + /* $LogFile indicates a clean shutdown. */ + ntfs_debug("Done. $LogFile indicates a clean shutdown."); + return true; +} + +/** + * ntfs_empty_logfile - empty the contents of the $LogFile journal + * @log_vi: struct inode of loaded journal $LogFile to empty + * + * Empty the contents of the $LogFile journal @log_vi and return 'true' on + * success and 'false' on error. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() + * has been used to ensure that the $LogFile is clean. + */ +bool ntfs_empty_logfile(struct inode *log_vi) +{ + VCN vcn, end_vcn; + ntfs_inode *log_ni = NTFS_I(log_vi); + ntfs_volume *vol = log_ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags; + unsigned block_size, block_size_bits; + int err; + bool should_wait = true; + + ntfs_debug("Entering."); + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done."); + return true; + } + /* + * We cannot use ntfs_attr_set() because we may be still in the middle + * of a mount operation. Thus we do the emptying by hand by first + * zapping the page cache pages for the $LogFile/$DATA attribute and + * then emptying each of the buffers in each of the clusters specified + * by the runlist by hand. 
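The run-to-block conversion used in the emptying loop that follows is easier to see with concrete numbers (illustrative: 4096-byte clusters, 512-byte device blocks):

        /* cluster_size_bits == 12, block_size_bits == 9:
           a run at lcn 100 of length 3 clusters becomes
             block     = 100 << 12 >> 9       = 800
             end_block = (100 + 3) << 12 >> 9 = 824
           i.e. 24 device blocks, each of which is filled with 0xff bytes. */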
+ */ + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + vcn = 0; + read_lock_irqsave(&log_ni->size_lock, flags); + end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> + vol->cluster_size_bits; + read_unlock_irqrestore(&log_ni->size_lock, flags); + truncate_inode_pages(log_vi->i_mapping, 0); + down_write(&log_ni->runlist.lock); + rl = log_ni->runlist.rl; + if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { +map_vcn: + err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); + if (err) { + ntfs_error(sb, "Failed to map runlist fragment (error " + "%d).", -err); + goto err; + } + rl = log_ni->runlist.rl; + BUG_ON(!rl || vcn < rl->vcn || !rl->length); + } + /* Seek to the runlist element containing @vcn. */ + while (rl->length && vcn >= rl[1].vcn) + rl++; + do { + LCN lcn; + sector_t block, end_block; + s64 len; + + /* + * If this run is not mapped map it now and start again as the + * runlist will have been updated. + */ + lcn = rl->lcn; + if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { + vcn = rl->vcn; + goto map_vcn; + } + /* If this run is not valid abort with an error. */ + if (unlikely(!rl->length || lcn < LCN_HOLE)) + goto rl_err; + /* Skip holes. */ + if (lcn == LCN_HOLE) + continue; + block = lcn << vol->cluster_size_bits >> block_size_bits; + len = rl->length; + if (rl[1].vcn > end_vcn) + len = end_vcn - rl->vcn; + end_block = (lcn + len) << vol->cluster_size_bits >> + block_size_bits; + /* Iterate over the blocks in the run and empty them. */ + do { + struct buffer_head *bh; + + /* Obtain the buffer, possibly not uptodate. */ + bh = sb_getblk(sb, block); + BUG_ON(!bh); + /* Setup buffer i/o submission. */ + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + /* Set the entire contents of the buffer to 0xff. */ + memset(bh->b_data, -1, block_size); + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_dirty(bh)) + clear_buffer_dirty(bh); + /* + * Submit the buffer and wait for i/o to complete but + * only for the first buffer so we do not miss really + * serious i/o errors. Once the first buffer has + * completed ignore errors afterwards as we can assume + * that if one buffer worked all of them will work. + */ + submit_bh(WRITE, bh); + if (should_wait) { + should_wait = false; + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + goto io_err; + } + brelse(bh); + } while (++block < end_block); + } while ((++rl)->vcn < end_vcn); + up_write(&log_ni->runlist.lock); + /* + * Zap the pages again just in case any got instantiated whilst we were + * emptying the blocks by hand. FIXME: We may not have completed + * writing to all the buffer heads yet so this may happen too early. + * We really should use a kernel thread to do the emptying + * asynchronously and then we can also set the volume dirty and output + * an error message if emptying should fail. + */ + truncate_inode_pages(log_vi->i_mapping, 0); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetLogFileEmpty(vol); + ntfs_debug("Done."); + return true; +io_err: + ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); + goto dirty_err; +rl_err: + ntfs_error(sb, "Runlist is corrupt. 
Unmount and run chkdsk."); +dirty_err: + NVolSetErrors(vol); + err = -EIO; +err: + up_write(&log_ni->runlist.lock); + ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", + -err); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h new file mode 100644 index 00000000..aa2b6ac3 --- /dev/null +++ b/fs/ntfs/logfile.h @@ -0,0 +1,309 @@ +/* + * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2000-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LOGFILE_H +#define _LINUX_NTFS_LOGFILE_H + +#ifdef NTFS_RW + +#include + +#include "types.h" +#include "endian.h" +#include "layout.h" + +/* + * Journal ($LogFile) organization: + * + * Two restart areas present in the first two pages (restart pages, one restart + * area in each page). When the volume is dismounted they should be identical, + * except for the update sequence array which usually has a different update + * sequence number. + * + * These are followed by log records organized in pages headed by a log record + * header going up to log file size. Not all pages contain log records when a + * volume is first formatted, but as the volume ages, all records will be used. + * When the log file fills up, the records at the beginning are purged (by + * modifying the oldest_lsn to a higher value presumably) and writing begins + * at the beginning of the file. Effectively, the log file is viewed as a + * circular entity. + * + * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept + * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We + * probably only want to support 1.1 as this seems to be the current version + * and we don't know how that differs from the older versions. The only + * exception is if the journal is clean as marked by the two restart pages + * then it doesn't matter whether we are on an earlier version. We can just + * reinitialize the logfile and start again with version 1.1. + */ + +/* Some $LogFile related constants. */ +#define MaxLogFileSize 0x100000000ULL +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 48 + +/* + * Log file restart page header (begins the restart area). + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ +/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ +/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. + When creating, set this to be immediately + after this header structure (without any + alignment). */ +/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. 
*/ + +/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by + chkdsk. Only used when the magic is changed + to "CHKD". Otherwise this is zero. */ +/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file + was created, has to be >= 512 and a power of + 2. Use this to calculate the required size + of the usa (usa_count) and add it to usa_ofs. + Then verify that the result is less than the + value of the restart_area_offset. */ +/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= + 512 and a power of 2. The default is 4096 + and is used when the system page size is + between 4096 and 8192. Otherwise this is + set to the system page size instead. */ +/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to + the RESTART_AREA. Value has to be aligned + to 8-byte boundary. When creating, set this + to be after the usa. */ +/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major + version is 1. */ +/* 28*/ sle16 major_ver; /* Log file major version. We only support + version 1.1. */ +/* sizeof() = 30 (0x1e) bytes */ +} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; + +/* + * Constant for the log client indices meaning that there are no client records + * in this particular client array. Also inside the client records themselves, + * this means that there are no client records preceding or following this one. + */ +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT_CPU 0xffff + +/* + * These are the so far known RESTART_AREA_* flags (16-bit) which contain + * information about the log file in which they are present. + */ +enum { + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ +} __attribute__ ((__packed__)); + +typedef le16 RESTART_AREA_FLAGS; + +/* + * Log file restart area record. The offset of this record is found by adding + * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found + * in it. See notes at restart_area_offset above. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log + when the restart area was last written. + This happens often but what is the interval? + Is it just fixed time or is it every time a + check point is written or somethine else? + On create set to 0. */ +/* 8*/ le16 log_clients; /* Number of log client records in the array of + log client records which follows this + restart area. Must be 1. */ +/* 10*/ le16 client_free_list; /* The index of the first free log client record + in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + free log client records in the array. + If != LOGFILE_NO_CLIENT, check that + log_clients > client_free_list. On Win2k + and presumably earlier, on a clean volume + this is != LOGFILE_NO_CLIENT, and it should + be 0, i.e. the first (and only) client + record is free and thus the logfile is + closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be LOGFILE_NO_CLIENT. On WinXP + and presumably later, the logfile is always + open, even on clean shutdown so this should + always be LOGFILE_NO_CLIENT. */ +/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client + record in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + in-use log client records in the array. If + != LOGFILE_NO_CLIENT check that log_clients + > client_in_use_list. 
On Win2k and + presumably earlier, on a clean volume this + is LOGFILE_NO_CLIENT, i.e. there are no + client records in use and thus the logfile + is closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be != LOGFILE_NO_CLIENT, and it + should be 0, i.e. the first (and only) + client record is in use. On WinXP and + presumably later, the logfile is always + open, even on clean shutdown so this should + always be 0. */ +/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k + and presumably earlier this is always 0. On + WinXP and presumably later, if the logfile + was shutdown cleanly, the second bit, + RESTART_VOLUME_IS_CLEAN, is set. This bit + is cleared when the volume is mounted by + WinXP and set when the volume is dismounted, + thus if the logfile is dirty, this bit is + clear. Thus we don't need to check the + Windows version to determine if the logfile + is clean. Instead if the logfile is closed, + we know it must be clean. If it is open and + this bit is set, we also know it must be + clean. If on the other hand the logfile is + open and this bit is clear, we can be almost + certain that the logfile is dirty. */ +/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence + number. This is calculated as 67 - the + number of bits required to store the logfile + size in bytes and this can be used in with + the specified file_size as a consistency + check. */ +/* 20*/ le16 restart_area_length;/* Length of the restart area including the + client array. Following checks required if + version matches. Otherwise, skip them. + restart_area_offset + restart_area_length + has to be <= system_page_size. Also, + restart_area_length has to be >= + client_array_offset + (log_clients * + sizeof(log client record)). */ +/* 22*/ le16 client_array_offset;/* Offset from the start of this record to + the first log client record if versions are + matched. When creating, set this to be + after this restart area structure, aligned + to 8-bytes boundary. If the versions do not + match, this is ignored and the offset is + assumed to be (sizeof(RESTART_AREA) + 7) & + ~7, i.e. rounded up to first 8-byte + boundary. Either way, client_array_offset + has to be aligned to an 8-byte boundary. + Also, restart_area_offset + + client_array_offset has to be <= 510. + Finally, client_array_offset + (log_clients + * sizeof(log client record)) has to be <= + system_page_size. On Win2k and presumably + earlier, this is 0x30, i.e. immediately + following this record. On WinXP and + presumably later, this is 0x40, i.e. there + are 16 extra bytes between this record and + the client array. This probably means that + the RESTART_AREA record is actually bigger + in WinXP and later. */ +/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the + restart_area_offset + the offset of the + file_size are > 510 then corruption has + occurred. This is the very first check when + starting with the restart_area as if it + fails it means that some of the above values + will be corrupted by the multi sector + transfer protection. The file_size has to + be rounded down to be a multiple of the + log_page_size in the RESTART_PAGE_HEADER and + then it has to be at least big enough to + store the two restart pages and 48 (0x30) + log record pages. */ +/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including + the log record header. On create set to + 0. 
*/ +/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. + If the version matches then check that the + value of log_record_header_length is a + multiple of 8, i.e. + (log_record_header_length + 7) & ~7 == + log_record_header_length. When creating set + it to sizeof(LOG_RECORD_HEADER), aligned to + 8 bytes. */ +/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record + page. Must be a multiple of 8. On create + set it to immediately after the update + sequence array of the log record page. */ +/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every + time the logfile is restarted which happens + at mount time when the logfile is opened. + When creating set to a random value. Win2k + sets it to the low 32 bits of the current + system time in NTFS format (see time.h). */ +/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ +/* sizeof() = 48 (0x30) bytes */ +} __attribute__ ((__packed__)) RESTART_AREA; + +/* + * Log client record. The offset of this record is found by adding the offset + * of the RESTART_AREA to the client_array_offset value found in it. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create + set to 0. */ +/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart + the volume, i.e. the current position within + the log file. At present, if clean this + should = current_lsn in restart area but it + probably also = current_lsn when dirty most + of the time. At create set to 0. */ +/* 16*/ le16 prev_client; /* The offset to the previous log client record + in the array of log client records. + LOGFILE_NO_CLIENT means there is no previous + client record, i.e. this is the first one. + This is always LOGFILE_NO_CLIENT. */ +/* 18*/ le16 next_client; /* The offset to the next log client record in + the array of log client records. + LOGFILE_NO_CLIENT means there are no next + client records, i.e. this is the last one. + This is always LOGFILE_NO_CLIENT. */ +/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set + to zero every time the logfile is restarted + and it is incremented when the logfile is + closed at dismount time. Thus it is 0 when + dirty and 1 when clean. On WinXP and + presumably later, this is always 0. */ +/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ +/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should + always be 8. */ +/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should + always be "NTFS" with the remaining bytes + set to 0. */ +/* sizeof() = 160 (0xa0) bytes */ +} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; + +extern bool ntfs_check_logfile(struct inode *log_vi, + RESTART_PAGE_HEADER **rp); + +extern bool ntfs_is_logfile_clean(struct inode *log_vi, + const RESTART_PAGE_HEADER *rp); + +extern bool ntfs_empty_logfile(struct inode *log_vi); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h new file mode 100644 index 00000000..a44b14cb --- /dev/null +++ b/fs/ntfs/malloc.h @@ -0,0 +1,96 @@ +/* + * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_MALLOC_H +#define _LINUX_NTFS_MALLOC_H + +#include +#include +#include + +/** + * __ntfs_malloc - allocate memory in multiples of pages + * @size: number of bytes to allocate + * @gfp_mask: extra flags for the allocator + * + * Internal function. You probably want ntfs_malloc_nofs()... + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + * Depending on @gfp_mask the allocation may be guaranteed to succeed. + */ +static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) +{ + if (likely(size <= PAGE_SIZE)) { + BUG_ON(!size); + /* kmalloc() has per-CPU caches so is faster for now. */ + return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); + /* return (void *)__get_free_page(gfp_mask); */ + } + if (likely((size >> PAGE_SHIFT) < totalram_pages)) + return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return NULL; +} + +/** + * ntfs_malloc_nofs - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); +} + +/** + * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs_nofail(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); +} + +static inline void ntfs_free(void *addr) +{ + if (!is_vmalloc_addr(addr)) { + kfree(addr); + /* free_page((unsigned long)addr); */ + return; + } + vfree(addr); +} + +#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c new file mode 100644 index 00000000..382857f9 --- /dev/null +++ b/fs/ntfs/mft.c @@ -0,0 +1,2917 @@ +/** + * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "bitmap.h" +#include "debug.h" +#include "dir.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * map_mft_record_page - map the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to map + * + * This maps the page in which the mft record of the ntfs inode @ni is situated + * and returns a pointer to the mft record within the mapped page. + * + * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() + * contains the negative error code returned. + */ +static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) +{ + loff_t i_size; + ntfs_volume *vol = ni->vol; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + unsigned long index, end_index; + unsigned ofs; + + BUG_ON(ni->page); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. FIXME: We need to check for + * overflowing the unsigned long, but I don't think we would ever get + * here if the volume was that big... + */ + index = (u64)ni->mft_no << vol->mft_record_size_bits >> + PAGE_CACHE_SHIFT; + ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + + i_size = i_size_read(mft_vi); + /* The maximum valid index into the page cache for $MFT's data. */ + end_index = i_size >> PAGE_CACHE_SHIFT; + + /* If the wanted index is out of bounds the mft record doesn't exist. */ + if (unlikely(index >= end_index)) { + if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + + vol->mft_record_size) { + page = ERR_PTR(-ENOENT); + ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " + "which is beyond the end of the mft. " + "This is probably a bug in the ntfs " + "driver.", ni->mft_no); + goto err_out; + } + } + /* Read, map, and pin the page. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (likely(!IS_ERR(page))) { + /* Catch multi sector transfer fixup errors. */ + if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + + ofs)))) { + ni->page = page; + ni->page_ofs = ofs; + return page_address(page) + ofs; + } + ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " + "Run chkdsk.", ni->mft_no); + ntfs_unmap_page(page); + page = ERR_PTR(-EIO); + NVolSetErrors(vol); + } +err_out: + ni->page = NULL; + ni->page_ofs = 0; + return (void*)page; +} + +/** + * map_mft_record - map, pin and lock an mft record + * @ni: ntfs inode whose MFT record to map + * + * First, take the mrec_lock mutex. We might now be sleeping, while waiting + * for the mutex if it was already locked by someone else. + * + * The page of the record is mapped using map_mft_record_page() before being + * returned to the caller. 
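+ *
+ * For illustration, a typical caller that modifies a record does:
+ *
+ *	m = map_mft_record(ni);
+ *	if (IS_ERR(m))
+ *		return PTR_ERR(m);
+ *	... modify the mft record ...
+ *	mark_mft_record_dirty(ni);
+ *	unmap_mft_record(ni);
+ *
+ * The page containing the record is obtained via map_mft_record_page().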
+ * + * This in turn uses ntfs_map_page() to get the page containing the wanted mft + * record (it in turn calls read_cache_page() which reads it in from disk if + * necessary, increments the use count on the page so that it cannot disappear + * under us and returns a reference to the page cache page). + * + * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it + * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed + * and the post-read mst fixups on each mft record in the page have been + * performed, the page gets PG_uptodate set and PG_locked cleared (this is done + * in our asynchronous I/O completion handler end_buffer_read_mft_async()). + * ntfs_map_page() waits for PG_locked to become clear and checks if + * PG_uptodate is set and returns an error code if not. This provides + * sufficient protection against races when reading/using the page. + * + * However there is the write mapping to think about. Doing the above described + * checking here will be fine, because when initiating the write we will set + * PG_locked and clear PG_uptodate making sure nobody is touching the page + * contents. Doing the locking this way means that the commit to disk code in + * the page cache code paths is automatically sufficiently locked with us as + * we will not touch a page that has been locked or is not uptodate. The only + * locking problem then is them locking the page while we are accessing it. + * + * So that code will end up having to own the mrec_lock of all mft + * records/inodes present in the page before I/O can proceed. In that case we + * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be + * accessing anything without owning the mrec_lock mutex. But we do need to + * use them because of the read_cache_page() invocation and the code becomes so + * much simpler this way that it is well worth it. + * + * The mft record is now ours and we return a pointer to it. You need to check + * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return + * the error code. + * + * NOTE: Caller is responsible for setting the mft record dirty before calling + * unmap_mft_record(). This is obviously only necessary if the caller really + * modified the mft record... + * Q: Do we want to recycle one of the VFS inode state bits instead? + * A: No, the inode ones mean we want to change the mft record, not we want to + * write it out. + */ +MFT_RECORD *map_mft_record(ntfs_inode *ni) +{ + MFT_RECORD *m; + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + + /* Serialize access to this mft record. */ + mutex_lock(&ni->mrec_lock); + + m = map_mft_record_page(ni); + if (likely(!IS_ERR(m))) + return m; + + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); + return m; +} + +/** + * unmap_mft_record_page - unmap the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to unmap + * + * This unmaps the page in which the mft record of the ntfs inode @ni is + * situated and returns. This is a NOOP if highmem is not configured. + * + * The unmap happens via ntfs_unmap_page() which in turn decrements the use + * count on the page thus releasing it from the pinned state. + * + * We do not actually unmap the page from memory of course, as that will be + * done by the page cache code itself when memory pressure increases or + * whatever. 
+ */ +static inline void unmap_mft_record_page(ntfs_inode *ni) +{ + BUG_ON(!ni->page); + + // TODO: If dirty, blah... + ntfs_unmap_page(ni->page); + ni->page = NULL; + ni->page_ofs = 0; + return; +} + +/** + * unmap_mft_record - release a mapped mft record + * @ni: ntfs inode whose MFT record to unmap + * + * We release the page mapping and the mrec_lock mutex which unmaps the mft + * record and releases it for others to get hold of. We also release the ntfs + * inode by decrementing the ntfs inode reference count. + * + * NOTE: If caller has modified the mft record, it is imperative to set the mft + * record dirty BEFORE calling unmap_mft_record(). + */ +void unmap_mft_record(ntfs_inode *ni) +{ + struct page *page = ni->page; + + BUG_ON(!page); + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + unmap_mft_record_page(ni); + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + /* + * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to + * ntfs_clear_extent_inode() in the extent inode case, and to the + * caller in the non-extent, yet pure ntfs inode case, to do the actual + * tear down of all structures and freeing of all allocated memory. + */ + return; +} + +/** + * map_extent_mft_record - load an extent inode and attach it to its base + * @base_ni: base ntfs inode + * @mref: mft reference of the extent inode to load + * @ntfs_ino: on successful return, pointer to the ntfs_inode structure + * + * Load the extent mft record @mref and attach it to its base inode @base_ni. + * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise + * PTR_ERR(result) gives the negative error code. + * + * On successful return, @ntfs_ino contains a pointer to the ntfs_inode + * structure of the mapped extent inode. + */ +MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino) +{ + MFT_RECORD *m; + ntfs_inode *ni = NULL; + ntfs_inode **extent_nis = NULL; + int i; + unsigned long mft_no = MREF(mref); + u16 seq_no = MSEQNO(mref); + bool destroy_ni = false; + + ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", + mft_no, base_ni->mft_no); + /* Make sure the base ntfs inode doesn't go away. */ + atomic_inc(&base_ni->count); + /* + * Check if this extent inode has already been added to the base inode, + * in which case just return it. If not found, add it to the base + * inode before returning it. + */ + mutex_lock(&base_ni->extent_lock); + if (base_ni->nr_extents > 0) { + extent_nis = base_ni->ext.extent_ntfs_inos; + for (i = 0; i < base_ni->nr_extents; i++) { + if (mft_no != extent_nis[i]->mft_no) + continue; + ni = extent_nis[i]; + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + break; + } + } + if (likely(ni != NULL)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* We found the record; just have to map and return it. */ + m = map_mft_record(ni); + /* map_mft_record() has incremented this on success. */ + atomic_dec(&ni->count); + if (likely(!IS_ERR(m))) { + /* Verify the sequence number. */ + if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { + ntfs_debug("Done 1."); + *ntfs_ino = ni; + return m; + } + unmap_mft_record(ni); + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. " + "Run chkdsk."); + return ERR_PTR(-EIO); + } +map_err_out: + ntfs_error(base_ni->vol->sb, "Failed to map extent " + "mft record, error code %ld.", -PTR_ERR(m)); + return m; + } + /* Record wasn't there. 
Get a new ntfs inode and initialize it. */ + ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); + if (unlikely(!ni)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + return ERR_PTR(-ENOMEM); + } + ni->vol = base_ni->vol; + ni->seq_no = seq_no; + ni->nr_extents = -1; + ni->ext.base_ntfs_ino = base_ni; + /* Now map the record. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_clear_extent_inode(ni); + goto map_err_out; + } + /* Verify the sequence number if it is present. */ + if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. Run chkdsk."); + destroy_ni = true; + m = ERR_PTR(-EIO); + goto unm_err_out; + } + /* Attach extent inode to base inode, reallocating memory if needed. */ + if (!(base_ni->nr_extents & 3)) { + ntfs_inode **tmp; + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); + + tmp = kmalloc(new_size, GFP_NOFS); + if (unlikely(!tmp)) { + ntfs_error(base_ni->vol->sb, "Failed to allocate " + "internal buffer."); + destroy_ni = true; + m = ERR_PTR(-ENOMEM); + goto unm_err_out; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - + 4 * sizeof(ntfs_inode *)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = tmp; + } + base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_debug("Done 2."); + *ntfs_ino = ni; + return m; +unm_err_out: + unmap_mft_record(ni); + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* + * If the extent inode was not attached to the base inode we need to + * release it or we will leak memory. + */ + if (destroy_ni) + ntfs_clear_extent_inode(ni); + return m; +} + +#ifdef NTFS_RW + +/** + * __mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Internal function. Users should call mark_mft_record_dirty() instead. + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * on the base vfs inode, because even though file data may have been modified, + * it is dirty in the inode meta data rather than the data page cache of the + * inode, and thus there are no data pages that need writing out. Therefore, a + * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. + */ +void __mark_mft_record_dirty(ntfs_inode *ni) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + mark_ntfs_record_dirty(ni->page, ni->page_ofs); + /* Determine the base vfs inode and mark it dirty, too. 
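+	 * (For an extent ntfs inode nr_extents is negative and
+	 * ext.base_ntfs_ino points back at the base inode, as set up by
+	 * map_extent_mft_record().)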
*/ + mutex_lock(&ni->extent_lock); + if (likely(ni->nr_extents >= 0)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC); +} + +static const char *ntfs_please_email = "Please email " + "linux-ntfs-dev@lists.sourceforge.net and say that you saw " + "this message. Thank you."; + +/** + * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, + * bypassing the page cache and the $MFTMirr inode itself. + * + * This function is only for use at umount time when the mft mirror inode has + * already been disposed off. We BUG() if we are called while the mft mirror + * inode is still attached to the volume. + * + * On success return 0. On error return -errno. + * + * NOTE: This function is not implemented yet as I am not convinced it can + * actually be triggered considering the sequence of commits we do in super.c:: + * ntfs_put_super(). But just in case we provide this place holder as the + * alternative would be either to BUG() or to get a NULL pointer dereference + * and Oops. + */ +static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, + const unsigned long mft_no, MFT_RECORD *m) +{ + BUG_ON(vol->mftmirr_ino); + ntfs_error(vol->sb, "Umount time mft mirror syncing is not " + "implemented yet. %s", ntfs_please_email); + return -EOPNOTSUPP; +} + +/** + * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * @sync: if true, wait for i/o completion + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. + * + * On success return 0. On error return -errno and set the volume errors flag + * in the ntfs volume @vol. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync) +{ + struct page *page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + u8 *kmirr; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end, page_ofs; + int i_bhs, nr_bhs, err = 0; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + BUG_ON(!max_bhs); + if (unlikely(!vol->mftmirr_ino)) { + /* This could happen during umount... */ + err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); + if (likely(!err)) + return err; + goto err_out; + } + /* Get the page containing the mirror copy of the mft record @m. 
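+	 * For example, with the usual 1024-byte mft records and 4096-byte
+	 * pages there are four records per page, so mft record 0x23 lives
+	 * at page index 0x23 >> 2 = 0x8 and page offset
+	 * (0x23 << 10) & ~PAGE_CACHE_MASK = 0xc00.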
*/ + page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> + (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map mft mirror page."); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + /* Offset of the mft mirror record inside the page. */ + page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* The address in the page of the mirror copy of the mft record @m. */ + kmirr = page_address(page) + page_ofs; + /* Copy the mst protected mft record to the mirror. */ + memcpy(kmirr, m, vol->mft_record_size); + /* Create uptodate buffers if not present. */ + if (unlikely(!page_has_buffers(page))) { + struct buffer_head *tail; + + bh = head = alloc_page_buffers(page, blocksize, 1); + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); + } + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = kmirr - (u8*)page_address(page); + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mftmirr_ino)-> + runlist.lock); + rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; + /* + * $MFTMirr always has the whole of its runlist + * in memory. + */ + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFTMirr, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft mirror " + "record 0x%lx because its " + "location on disk could not " + "be determined (error code " + "%lli).", mft_no, + (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); + if (likely(!err)) { + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and + * buffer states do not become out of sync. 
+ */ + set_buffer_uptodate(tbh); + } + } + } else /* if (unlikely(err)) */ { + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); + } + /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)kmirr); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + if (likely(!err)) { + ntfs_debug("Done."); + } else { + ntfs_error(vol->sb, "I/O error while writing mft mirror " + "record 0x%lx!", mft_no); +err_out: + ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " + "code %i). Volume will be left marked dirty " + "on umount. Run ntfsfix on the partition " + "after umounting to correct this.", -err); + NVolSetErrors(vol); + } + return err; +} + +/** + * write_mft_record_nolock - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * Write the mapped (extent) mft record @m described by the (regular or extent) + * ntfs inode @ni to backing store. If the mft record @m has a counterpart in + * the mft mirror, that is also updated. + * + * We only write the mft record if the ntfs inode @ni is dirty and the first + * buffer belonging to its mft record is dirty, too. We ignore the dirty state + * of subsequent buffers because we could have raced with + * fs/ntfs/aops.c::mark_ntfs_record_dirty(). + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * However, if the mft record has a counterpart in the mft mirror and @sync is + * true, we write the mft record, wait for i/o completion, and only then write + * the mft mirror copy. This ensures that if the system crashes either the mft + * or the mft mirror will contain a self-consistent mft record @m. If @sync is + * false on the other hand, we start i/o on both and then wait for completion + * on them. This provides a speedup but no longer guarantees that you will end + * up with a self-consistent mft record in the case of a crash but if you asked + * for asynchronous writing you probably do not care about that anyway. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page = ni->page; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + BUG_ON(!max_bhs); + BUG_ON(!PageLocked(page)); + /* + * If the ntfs_inode is clean no need to do anything. If it is dirty, + * mark it as clean now so that it can be redirtied later on if needed. + * There is no danger of races since the caller is holding the locks + * for the mft record @m and the page it is in. 
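+	 * (Should the write fail with -ENOMEM further down, the record is
+	 * simply redirtied and the write is retried later.)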
+ */ + if (!NInoTestClearDirty(ni)) + goto done; + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = ni->page_ofs; + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* + * If this block is not the first one in the record, we ignore + * the buffer's dirty state because we could have raced with a + * parallel mark_ntfs_record_dirty(). + */ + if (block_start == m_start) { + /* This block is the first one in the record. */ + if (!buffer_dirty(bh)) { + BUG_ON(nr_bhs); + /* Clean records are not written out. */ + break; + } + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mft_ino)->runlist.lock); + rl = NTFS_I(vol->mft_ino)->runlist.rl; + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFT, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft record " + "0x%lx because its location " + "on disk could not be " + "determined (error code %lli).", + ni->mft_no, (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mft_ino)->runlist.lock); + if (!nr_bhs) + goto done; + if (unlikely(err)) + goto cleanup_out; + /* Apply the mst protection fixups. */ + err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); + if (err) { + ntfs_error(vol->sb, "Failed to apply mst fixups!"); + goto cleanup_out; + } + flush_dcache_mft_record_page(ni); + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (!sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Remove the mst protection fixups again. 
*/ + post_write_mst_fixup((NTFS_RECORD*)m); + flush_dcache_mft_record_page(ni); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! */ + ntfs_error(vol->sb, "I/O error while writing mft record " + "0x%lx! Marking base inode as bad. You " + "should unmount the volume and run chkdsk.", + ni->mft_no); + goto err_out; + } +done: + ntfs_debug("Done."); + return 0; +cleanup_out: + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); +err_out: + /* + * Current state: all buffers are clean, unlocked, and uptodate. + * The caller should mark the base inode as bad so that no more i/o + * happens. ->clear_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. + */ + if (err == -ENOMEM) { + ntfs_error(vol->sb, "Not enough memory to write mft record. " + "Redirtying so the write is retried later."); + mark_mft_record_dirty(ni); + err = 0; + } else + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_may_write_mft_record - check if an mft record may be written out + * @vol: [IN] ntfs volume on which the mft record to check resides + * @mft_no: [IN] mft record number of the mft record to check + * @m: [IN] mapped mft record to check + * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned + * + * Check if the mapped (base or extent) mft record @m with mft record number + * @mft_no belonging to the ntfs volume @vol may be written out. If necessary + * and possible the ntfs inode of the mft record is locked and the base vfs + * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The + * caller is responsible for unlocking the ntfs inode and unpinning the base + * vfs inode. + * + * Return 'true' if the mft record may be written out and 'false' if not. + * + * The caller has locked the page and cleared the uptodate flag on it which + * means that we can safely write out any dirty mft records that do not have + * their inodes in icache as determined by ilookup5() as anyone + * opening/creating such an inode would block when attempting to map the mft + * record in read_cache_page() until we are finished with the write out. + * + * Here is a description of the tests we perform: + * + * If the inode is found in icache we know the mft record must be a base mft + * record. If it is dirty, we do not write it and return 'false' as the vfs + * inode write paths will result in the access times being updated which would + * cause the base mft record to be redirtied and written out again. (We know + * the access time update will modify the base mft record because Windows + * chkdsk complains if the standard information attribute is not in the base + * mft record.) + * + * If the inode is in icache and not dirty, we attempt to lock the mft record + * and if we find the lock was already taken, it is not safe to write the mft + * record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the mft record, + * which also allows us safe writeout of the mft record. We then set + * @locked_ni to the locked ntfs inode and return 'true'. + * + * Note we cannot just lock the mft record and sleep while waiting for the lock + * because this would deadlock due to lock reversal (normally the mft record is + * locked before the page is locked but we already have the page locked here + * when we try to lock the mft record). + * + * If the inode is not in icache we need to perform further checks. 
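+ *
+ * In short: we return 'false' when an ntfs inode for the record is present
+ * in memory and either is dirty (base records) or has its mrec_lock already
+ * held, we return 'true' and hand the locked inode back in @locked_ni when
+ * we do manage to trylock its mrec_lock, and we return 'true' with
+ * @locked_ni left NULL whenever no matching ntfs inode is in memory at all.
+ * The checks for the not-in-icache case are as follows.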
+ * + * If the mft record is not a FILE record or it is a base mft record, we can + * safely write it and return 'true'. + * + * We now know the mft record is an extent mft record. We check if the inode + * corresponding to its base mft record is in icache and obtain a reference to + * it if it is. If it is not, we can safely write it and return 'true'. + * + * We now have the base inode for the extent mft record. We check if it has an + * ntfs inode for the extent mft record attached and if not it is safe to write + * the extent mft record and we return 'true'. + * + * The ntfs inode for the extent mft record is attached to the base inode so we + * attempt to lock the extent mft record and if we find the lock was already + * taken, it is not safe to write the extent mft record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the extent mft + * record, which also allows us safe writeout of the extent mft record. We + * set the ntfs inode of the extent mft record clean and then set @locked_ni to + * the now locked ntfs inode and return 'true'. + * + * Note, the reason for actually writing dirty mft records here and not just + * relying on the vfs inode dirty code paths is that we can have mft records + * modified without them ever having actual inodes in memory. Also we can have + * dirty mft records with clean ntfs inodes in memory. None of the described + * cases would result in the dirty mft records being written out if we only + * relied on the vfs inode dirty code paths. And these cases can really occur + * during allocation of new mft records and in particular when the + * initialized_size of the $MFT/$DATA attribute is extended and the new space + * is initialized using ntfs_mft_record_format(). The clean inode can then + * appear if the mft record is reused for a new inode before it got written + * out. + */ +bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, + const MFT_RECORD *m, ntfs_inode **locked_ni) +{ + struct super_block *sb = vol->sb; + struct inode *mft_vi = vol->mft_ino; + struct inode *vi; + ntfs_inode *ni, *eni, **extent_nis; + int i; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + /* + * Normally we do not return a locked inode so set @locked_ni to NULL. + */ + BUG_ON(!locked_ni); + *locked_ni = NULL; + /* + * Check if the inode corresponding to this mft record is in the VFS + * inode cache and obtain a reference to it if it is. + */ + ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); + na.mft_no = mft_no; + na.name = NULL; + na.name_len = 0; + na.type = AT_UNUSED; + /* + * Optimize inode 0, i.e. $MFT itself, since we have it in memory and + * we get here for it rather often. + */ + if (!mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else { + /* + * Have to use ilookup5_nowait() since ilookup5() waits for the + * inode lock which causes ntfs to deadlock when a concurrent + * inode write via the inode dirty code paths and the page + * dirty code path of the inode dirty code path when writing + * $MFT occurs. + */ + vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); + } + if (vi) { + ntfs_debug("Base inode 0x%lx is in icache.", mft_no); + /* The inode is in icache. */ + ni = NTFS_I(vi); + /* Take a reference to the ntfs inode. */ + atomic_inc(&ni->count); + /* If the inode is dirty, do not write this record. 
*/ + if (NInoDirty(ni)) { + ntfs_debug("Inode 0x%lx is dirty, do not write it.", + mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Inode 0x%lx is not dirty.", mft_no); + /* The inode is not dirty, try to take the mft record lock. */ + if (unlikely(!mutex_trylock(&ni->mrec_lock))) { + ntfs_debug("Mft record 0x%lx is already locked, do " + "not write it.", mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Managed to lock mft record 0x%lx, write it.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so + * return the locked ntfs inode. + */ + *locked_ni = ni; + return true; + } + ntfs_debug("Inode 0x%lx is not in icache.", mft_no); + /* The inode is not in icache. */ + /* Write the record if it is not a mft record (type "FILE"). */ + if (!ntfs_is_mft_record(m->magic)) { + ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", + mft_no); + return true; + } + /* Write the mft record if it is a base inode. */ + if (!m->base_mft_record) { + ntfs_debug("Mft record 0x%lx is a base record, write it.", + mft_no); + return true; + } + /* + * This is an extent mft record. Check if the inode corresponding to + * its base mft record is in icache and obtain a reference to it if it + * is. + */ + na.mft_no = MREF_LE(m->base_mft_record); + ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " + "inode 0x%lx in icache.", mft_no, na.mft_no); + if (!na.mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, + &na); + if (!vi) { + /* + * The base inode is not in icache, write this extent mft + * record. + */ + ntfs_debug("Base inode 0x%lx is not in icache, write the " + "extent record.", na.mft_no); + return true; + } + ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); + /* + * The base inode is in icache. Check if it has the extent inode + * corresponding to this extent mft record attached. + */ + ni = NTFS_I(vi); + mutex_lock(&ni->extent_lock); + if (ni->nr_extents <= 0) { + /* + * The base inode has no attached extent inodes, write this + * extent mft record. + */ + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Base inode 0x%lx has no attached extent inodes, " + "write the extent record.", na.mft_no); + return true; + } + /* Iterate over the attached extent inodes. */ + extent_nis = ni->ext.extent_ntfs_inos; + for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { + if (mft_no == extent_nis[i]->mft_no) { + /* + * Found the extent inode corresponding to this extent + * mft record. + */ + eni = extent_nis[i]; + break; + } + } + /* + * If the extent inode was not attached to the base inode, write this + * extent mft record. + */ + if (!eni) { + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Extent inode 0x%lx is not attached to its base " + "inode 0x%lx, write the extent record.", + mft_no, na.mft_no); + return true; + } + ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", + mft_no, na.mft_no); + /* Take a reference to the extent ntfs inode. */ + atomic_inc(&eni->count); + mutex_unlock(&ni->extent_lock); + /* + * Found the extent inode coresponding to this extent mft record. + * Try to take the mft record lock. 
+ */ + if (unlikely(!mutex_trylock(&eni->mrec_lock))) { + atomic_dec(&eni->count); + iput(vi); + ntfs_debug("Extent mft record 0x%lx is already locked, do " + "not write it.", mft_no); + return false; + } + ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", + mft_no); + if (NInoTestClearDirty(eni)) + ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so return + * the locked extent ntfs inode. + */ + *locked_ni = eni; + return true; +} + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name + * @vol: volume on which to search for a free mft record + * @base_ni: open base inode if allocating an extent mft record or NULL + * + * Search for a free mft record in the mft bitmap attribute on the ntfs volume + * @vol. + * + * If @base_ni is NULL start the search at the default allocator position. + * + * If @base_ni is not NULL start the search at the mft record after the base + * mft record @base_ni. + * + * Return the free mft record on success and -errno on error. An error code of + * -ENOSPC means that there are no free mft records in the currently + * initialized mft bitmap. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, + ntfs_inode *base_ni) +{ + s64 pass_end, ll, data_pos, pass_start, ofs, bit; + unsigned long flags; + struct address_space *mftbmp_mapping; + u8 *buf, *byte; + struct page *page; + unsigned int page_ofs, size; + u8 pass, b; + + ntfs_debug("Searching for free mft record in the currently " + "initialized mft bitmap."); + mftbmp_mapping = vol->mftbmp_ino->i_mapping; + /* + * Set the end of the pass making sure we do not overflow the mft + * bitmap. + */ + read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); + pass_end = NTFS_I(vol->mft_ino)->allocated_size >> + vol->mft_record_size_bits; + read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); + read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; + read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + if (pass_end > ll) + pass_end = ll; + pass = 1; + if (!base_ni) + data_pos = vol->mft_data_pos; + else + data_pos = base_ni->mft_no + 1; + if (data_pos < 24) + data_pos = 24; + if (data_pos >= pass_end) { + data_pos = 24; + pass = 2; + /* This happens on a freshly formatted volume. */ + if (data_pos >= pass_end) + return -ENOSPC; + } + pass_start = data_pos; + ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " + "pass_end 0x%llx, data_pos 0x%llx.", pass, + (long long)pass_start, (long long)pass_end, + (long long)data_pos); + /* Loop until a free mft record is found. */ + for (; pass <= 2;) { + /* Cap size to pass_end. */ + ofs = data_pos >> 3; + page_ofs = ofs & ~PAGE_CACHE_MASK; + size = PAGE_CACHE_SIZE - page_ofs; + ll = ((pass_end + 7) >> 3) - ofs; + if (size > ll) + size = ll; + size <<= 3; + /* + * If we are still within the active pass, search the next page + * for a zero bit. 
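+		 * For example, if the byte under inspection is 0xf7 (bit 3
+		 * clear), ffz() yields 3 and, subject to the range checks
+		 * below, mft record data_pos + (bit & ~7) + 3 is allocated
+		 * by setting that bit and dirtying the bitmap page.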
+ */ + if (size) { + page = ntfs_map_page(mftbmp_mapping, + ofs >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read mft " + "bitmap, aborting."); + return PTR_ERR(page); + } + buf = (u8*)page_address(page) + page_ofs; + bit = data_pos & 7; + data_pos &= ~7ull; + ntfs_debug("Before inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + for (; bit < size && data_pos + bit < pass_end; + bit &= ~7ull, bit += 8) { + byte = buf + (bit >> 3); + if (*byte == 0xff) + continue; + b = ffz((unsigned long)*byte); + if (b < 8 && b >= (bit & 7)) { + ll = data_pos + (bit & ~7ull) + b; + if (unlikely(ll > (1ll << 32))) { + ntfs_unmap_page(page); + return -ENOSPC; + } + *byte |= 1 << b; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done. (Found and " + "allocated mft record " + "0x%llx.)", + (long long)ll); + return ll; + } + } + ntfs_debug("After inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + data_pos += size; + ntfs_unmap_page(page); + /* + * If the end of the pass has not been reached yet, + * continue searching the mft bitmap for a zero bit. + */ + if (data_pos < pass_end) + continue; + } + /* Do the next pass. */ + if (++pass == 2) { + /* + * Starting the second pass, in which we scan the first + * part of the zone which we omitted earlier. + */ + pass_end = pass_start; + data_pos = pass_start = 24; + ntfs_debug("pass %i, pass_start 0x%llx, pass_end " + "0x%llx.", pass, (long long)pass_start, + (long long)pass_end); + if (data_pos >= pass_end) + break; + } + } + /* No free mft records in currently initialized mft bitmap. */ + ntfs_debug("Done. (No free mft records left in currently initialized " + "mft bitmap.)"); + return -ENOSPC; +} + +/** + * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for + * writing and releases it before returning. + * - This function takes vol->lcnbmp_lock for writing and releases it + * before returning. + */ +static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + s64 ll; + unsigned long flags; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni; + runlist_element *rl, *rl2 = NULL; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + u8 *b, tb; + struct { + u8 added_cluster:1; + u8 added_run:1; + u8 mp_rebuilt:1; + } status = { 0, 0, 0 }; + + ntfs_debug("Extending mft bitmap allocation."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + /* + * Determine the last lcn of the mft bitmap. The allocated size of the + * mft bitmap cannot be zero so we are ok to do this. 
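+	 * For example, with 4096-byte clusters and an allocated_size of
+	 * 8192 bytes the last vcn is (8192 - 1) >> 12 = 1; the run
+	 * containing that vcn then yields the candidate cluster
+	 * rl->lcn + rl->length tried below.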
+ */ + down_write(&mftbmp_ni->runlist.lock); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ll = mftbmp_ni->allocated_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft bitmap attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", + (long long)lcn); + /* + * Attempt to get the cluster following the last allocated cluster by + * hand as it may be in the MFT zone so the allocator would not give it + * to us. + */ + ll = lcn >> 3; + page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, + ll >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to read from lcn bitmap."); + return PTR_ERR(page); + } + b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK); + tb = 1 << (lcn & 7ull); + down_write(&vol->lcnbmp_lock); + if (*b != 0xff && !(*b & tb)) { + /* Next cluster is free, allocate it. */ + *b |= tb; + flush_dcache_page(page); + set_page_dirty(page); + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Update the mft bitmap runlist. */ + rl->length++; + rl[1].vcn++; + status.added_cluster = 1; + ntfs_debug("Appending one cluster to mft bitmap."); + } else { + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Allocate a cluster from the DATA_ZONE. */ + rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, + true); + if (IS_ERR(rl2)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to allocate a cluster for " + "the mft bitmap."); + return PTR_ERR(rl2); + } + rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft " + "bitmap."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to dealocate " + "allocated cluster.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mftbmp_ni->runlist.rl = rl; + status.added_run = 1; + ntfs_debug("Adding one run to mft bitmap."); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + } + /* + * Update the attribute record as well. Note: @rl is the last + * (non-terminator) runlist element of mft bitmap. + */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. 
*/ + for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft bitmap attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft bitmap attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: It will need to be a special mft record and if none of + // those are available it gets rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft bitmap attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + status.mp_rebuilt = 1; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array for " + "mft bitmap attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft bitmap allocated_size by one cluster. + * Reflect this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft bitmap attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + a->data.non_resident.allocated_size = + cpu_to_sle64(mftbmp_ni->allocated_size); + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. 
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute.%s", es); + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + a = ctx->attr; + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); +undo_alloc: + if (status.added_cluster) { + /* Truncate the last run in the runlist by one cluster. */ + rl->length--; + rl[1].vcn--; + } else if (status.added_run) { + lcn = rl->lcn; + /* Remove the last run from the runlist. */ + rl->lcn = rl[1].lcn; + rl->length = 0; + } + /* Deallocate the cluster. */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + if (status.mp_rebuilt) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the initialized portion of the mft bitmap attribute on the ntfs + * volume @vol by 8 bytes. + * + * Note: Only changes initialized_size and data_size, i.e. requires that + * allocated_size is big enough to fit the new initialized_size. + * + * Return 0 on success and -error on error. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) +{ + s64 old_data_size, old_initialized_size; + unsigned long flags; + struct inode *mftbmp_vi; + ntfs_inode *mft_ni, *mftbmp_ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + ATTR_RECORD *a; + int ret; + + ntfs_debug("Extending mft bitmap initiailized (and data) size."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_vi = vol->mftbmp_ino; + mftbmp_ni = NTFS_I(mftbmp_vi); + /* Get the attribute record. 
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + return PTR_ERR(mrec); + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto unm_err_out; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto put_err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = i_size_read(mftbmp_vi); + old_initialized_size = mftbmp_ni->initialized_size; + /* + * We can simply update the initialized_size before filling the space + * with zeroes because the caller is holding the mft bitmap lock for + * writing which ensures that no one else is trying to access the data. + */ + mftbmp_ni->initialized_size += 8; + a->data.non_resident.initialized_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + if (mftbmp_ni->initialized_size > old_data_size) { + i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + /* Initialize the mft bitmap attribute value with zeroes. */ + ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); + if (likely(!ret)) { + ntfs_debug("Done. (Wrote eight initialized bytes to mft " + "bitmap."); + return 0; + } + ntfs_error(vol->sb, "Failed to write to mft bitmap."); + /* Try to recover from the error. 
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record.%s", es); + NVolSetErrors(vol); + return ret; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context.%s", es); + NVolSetErrors(vol); + goto unm_err_out; + } + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute.%s", es); + NVolSetErrors(vol); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(mft_ni); + goto err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->initialized_size = old_initialized_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(old_initialized_size); + if (i_size_read(mftbmp_vi) != old_data_size) { + i_size_write(mftbmp_vi, old_data_size); + a->data.non_resident.data_size = cpu_to_sle64(old_data_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(mftbmp_vi), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ +err_out: + return ret; +} + +/** + * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute + * @vol: volume on which to extend the mft data attribute + * + * Extend the mft data attribute on the ntfs volume @vol by 16 mft records + * worth of clusters or if not enough space for this by one mft record worth + * of clusters. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for + * writing and releases it before returning. + * - This function calls functions which take vol->lcnbmp_lock for + * writing and release it before returning. + */ +static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + VCN old_last_vcn; + s64 min_nr, nr, ll; + unsigned long flags; + ntfs_inode *mft_ni; + runlist_element *rl, *rl2; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + bool mp_rebuilt = false; + + ntfs_debug("Extending mft data allocation."); + mft_ni = NTFS_I(vol->mft_ino); + /* + * Determine the preferred allocation location, i.e. the last lcn of + * the mft data attribute. The allocated size of the mft data + * attribute cannot be zero so we are ok to do this. 
+ */ + down_write(&mft_ni->runlist.lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mft_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft data attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); + /* Minimum allocation is one mft record worth of clusters. */ + min_nr = vol->mft_record_size >> vol->cluster_size_bits; + if (!min_nr) + min_nr = 1; + /* Want to allocate 16 mft records worth of clusters. */ + nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; + if (!nr) + nr = min_nr; + /* Ensure we do not go above 2^32-1 mft records. */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + nr = min_nr; + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + ntfs_warning(vol->sb, "Cannot allocate mft record " + "because the maximum number of inodes " + "(2^32) has already been reached."); + up_write(&mft_ni->runlist.lock); + return -ENOSPC; + } + } + ntfs_debug("Trying mft data allocation with %s cluster count %lli.", + nr > min_nr ? "default" : "minimal", (long long)nr); + old_last_vcn = rl[1].vcn; + do { + rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, + true); + if (likely(!IS_ERR(rl2))) + break; + if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { + ntfs_error(vol->sb, "Failed to allocate the minimal " + "number of clusters (%lli) for the " + "mft data attribute.", (long long)nr); + up_write(&mft_ni->runlist.lock); + return PTR_ERR(rl2); + } + /* + * There is not enough space to do the allocation, but there + * might be enough space to do a minimal allocation so try that + * before failing. + */ + nr = min_nr; + ntfs_debug("Retrying mft data allocation with minimal cluster " + "count %lli.", (long long)nr); + } while (1); + rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft data " + "attribute."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to dealocate clusters " + "from the mft data attribute.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mft_ni->runlist.rl = rl; + ntfs_debug("Allocated %lli clusters.", (long long)nr); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + /* Update the attribute record as well. 
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft data attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft data attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: Use the special reserved mft records and ensure that + // this extent is not required to find the mft record in + // question. If no free special records left we would need to + // move an existing record away, insert ours in its place, and + // then place the moved record into the newly allocated space + // and we would then need to update all references to this mft + // record appropriately. This is rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft data attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array of " + "mft data attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft data allocated_size by nr clusters. + * Reflect this in the ntfs_inode structure and the attribute record. + * @rl is the last (non-terminator) runlist element of mft data + * attribute. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. 
+ */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, + mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, + ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft data attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + a->data.non_resident.allocated_size = + cpu_to_sle64(mft_ni->allocated_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute.%s", es); + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + ctx->attr->data.non_resident.highest_vcn = + cpu_to_sle64(old_last_vcn - 1); +undo_alloc: + if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to free clusters from mft data " + "attribute.%s", es); + NVolSetErrors(vol); + } + a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { + ntfs_error(vol->sb, "Failed to truncate mft data attribute " + "runlist.%s", es); + NVolSetErrors(vol); + } + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); + NVolSetErrors(vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_record_layout - layout an mft record into a memory buffer + * @vol: volume to which the mft record will belong + * @mft_no: mft reference specifying the mft record number + * @m: destination buffer of size >= @vol->mft_record_size bytes + * + * Layout an empty, unused mft record with the mft record number @mft_no into + * the buffer @m. The volume @vol is needed because the mft record structure + * was modified in NTFS 3.1 so we need to know which volume version this mft + * record will be used on. + * + * Return 0 on success and -errno on error. 
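+ * + * For illustration, assuming the NTFS 3.1+ record header (sizeof(MFT_RECORD)) + * is 48 bytes, 1024-byte mft records, and a 512-byte NTFS_BLOCK_SIZE, the + * layout below produces usa_ofs = (48 + 1) & ~1 = 48, usa_count = 1024 / 512 + + * 1 = 3, attrs_offset = (48 + 3 * 2 + 7) & ~7 = 56, and bytes_in_use = 56 + 8 = + * 64 for the empty record.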
+ */ +static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, + MFT_RECORD *m) +{ + ATTR_RECORD *a; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + if (mft_no >= (1ll << 32)) { + ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " + "maximum of 2^32.", (long long)mft_no); + return -ERANGE; + } + /* Start by clearing the whole mft record to gives us a clean slate. */ + memset(m, 0, vol->mft_record_size); + /* Aligned to 2-byte boundary. */ + if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); + else { + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); + /* + * Set the NTFS 3.1+ specific fields while we know that the + * volume version is 3.1+. + */ + m->reserved = 0; + m->mft_record_number = cpu_to_le32((u32)mft_no); + } + m->magic = magic_FILE; + if (vol->mft_record_size >= NTFS_BLOCK_SIZE) + m->usa_count = cpu_to_le16(vol->mft_record_size / + NTFS_BLOCK_SIZE + 1); + else { + m->usa_count = cpu_to_le16(1); + ntfs_warning(vol->sb, "Sector size is bigger than mft record " + "size. Setting usa_count to 1. If chkdsk " + "reports this as corruption, please email " + "linux-ntfs-dev@lists.sourceforge.net stating " + "that you saw this message and that the " + "modified filesystem created was corrupt. " + "Thank you."); + } + /* Set the update sequence number to 1. */ + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); + m->lsn = 0; + m->sequence_number = cpu_to_le16(1); + m->link_count = 0; + /* + * Place the attributes straight after the update sequence array, + * aligned to 8-byte boundary. + */ + m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + + (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); + m->flags = 0; + /* + * Using attrs_offset plus eight bytes (for the termination attribute). + * attrs_offset is already aligned to 8-byte boundary, so no need to + * align again. + */ + m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); + m->bytes_allocated = cpu_to_le32(vol->mft_record_size); + m->base_mft_record = 0; + m->next_attr_instance = 0; + /* Add the termination attribute. */ + a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); + a->type = AT_END; + a->length = 0; + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_format - format an mft record on an ntfs volume + * @vol: volume on which to format the mft record + * @mft_no: mft record number to format + * + * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused + * mft record into the appropriate place of the mft data attribute. This is + * used when extending the mft data attribute. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) +{ + loff_t i_size; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + MFT_RECORD *m; + pgoff_t index, end_index; + unsigned int ofs; + int err; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. + */ + index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; + ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* The maximum valid index into the page cache for $MFT's data. 
*/ + i_size = i_size_read(mft_vi); + end_index = i_size >> PAGE_CACHE_SHIFT; + if (unlikely(index >= end_index)) { + if (unlikely(index > end_index || ofs + vol->mft_record_size >= + (i_size & ~PAGE_CACHE_MASK))) { + ntfs_error(vol->sb, "Tried to format non-existing mft " + "record 0x%llx.", (long long)mft_no); + return -ENOENT; + } + } + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing mft record " + "to format 0x%llx.", (long long)mft_no); + return PTR_ERR(page); + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + err = ntfs_mft_record_layout(vol, mft_no, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", + (long long)mft_no); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + return err; + } + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + /* + * Make sure the mft record is written out to disk. We could use + * ilookup5() to check if an inode is in icache and so on but this is + * unnecessary as ntfs_writepage() will write the dirty record anyway. + */ + mark_ntfs_record_dirty(page, ofs); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume + * @vol: [IN] volume on which to allocate the mft record + * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 + * @base_ni: [IN] open base inode if allocating an extent mft record or NULL + * @mrec: [OUT] on successful return this is the mapped mft record + * + * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. + * + * If @base_ni is NULL make the mft record a base mft record, i.e. a file or + * directory inode, and allocate it at the default allocator position. In + * this case @mode is the file mode as given to us by the caller. We in + * particular use @mode to distinguish whether a file or a directory is being + * created (S_ISDIR(mode) and S_ISREG(mode), respectively). + * + * If @base_ni is not NULL make the allocated mft record an extent record, + * allocate it starting at the mft record after the base mft record and attach + * the allocated and opened ntfs inode to the base inode @base_ni. In this + * case @mode must be 0 as it is meaningless for extent inodes. + * + * You need to check the return value with IS_ERR(). If false, the function + * was successful and the return value is the now opened ntfs inode of the + * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, + * and locked mft record. If IS_ERR() is true, the function failed and the + * error code is obtained from PTR_ERR(return value). *@mrec is undefined in + * this case. + * + * Allocation strategy: + * + * To find a free mft record, we scan the mft bitmap for a zero bit. To + * optimize this we start scanning at the place specified by @base_ni or if + * @base_ni is NULL we start where we last stopped and we perform wrap around + * when we reach the end. Note, we do not try to allocate mft records below + * number 24 because numbers 0 to 15 are the defined system files anyway and 16 + * to 24 are special in that they are used for storing extension mft records + * for the $DATA attribute of $MFT.
This is required to avoid the possibility + * of creating a runlist with a circular dependency which once written to disk + * can never be read in again. Windows will only use records 16 to 24 for + * normal files if the volume is completely out of space. We never use them + * which means that when the volume is really out of space we cannot create any + * more files while Windows can still create up to 8 small files. We can start + * doing this at some later time, it does not matter much for now. + * + * When scanning the mft bitmap, we only search up to the last allocated mft + * record. If there are no free records left in the range 24 to the number of + * allocated mft records, then we extend the $MFT/$DATA attribute in order to + * create free mft records. We extend the allocated size of $MFT/$DATA by 16 + * records at a time or one cluster, if cluster size is above 16kiB. If there + * is not sufficient space to do this, we try to extend by a single mft record + * or one cluster, if cluster size is above the mft record size. + * + * No matter how many mft records we allocate, we initialize only the first + * allocated mft record, incrementing mft data size and initialized size + * accordingly, open an ntfs_inode for it and return it to the caller, unless + * there are fewer than 24 mft records, in which case we allocate and initialize + * mft records until we reach record 24 which we consider as the first free mft + * record for use by normal files. + * + * If during any stage we overflow the initialized data in the mft bitmap, we + * extend the initialized size (and data size) by 8 bytes, allocating another + * cluster if required. The bitmap data size has to be at least equal to the + * number of mft records in the mft, but it can be bigger, in which case the + * superfluous bits are padded with zeroes. + * + * Thus, when we return successfully (IS_ERR() is false), we will have: + * - initialized / extended the mft bitmap if necessary, + * - initialized / extended the mft data if necessary, + * - set the bit corresponding to the mft record being allocated in the + * mft bitmap, + * - opened an ntfs_inode for the allocated mft record, and we will have + * - returned the ntfs_inode as well as the allocated, mapped, pinned, and + * locked mft record. + * + * On error, the volume will be left in a consistent state and no record will + * be allocated. If rolling back a partial operation fails, we may leave some + * inconsistent metadata in which case we call NVolSetErrors() so the volume is + * left dirty when unmounted. + * + * Note, this function cannot make use of most of the normal functions, like + * for example for attribute resizing, etc, because when the run list overflows + * the base mft record and an attribute list is used, it is very important that + * the extension mft records used to store the $DATA attribute of $MFT can be + * reached without having to read the information contained inside them, as + * this would make it impossible to find them in the first place after the + * volume is unmounted. $MFT/$BITMAP probably does not need to follow this + * rule because the bitmap is not essential for finding the mft records, but on + * the other hand, handling the bitmap in this special way would make life + * easier because otherwise there might be circular invocations of functions + * when reading the bitmap.
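+ * + * For illustration, assuming 1024-byte mft records and 4kiB clusters, one such + * extension of $MFT/$DATA by 16 records adds 16 * 1024 / 4096 = 4 clusters, + * while extending the initialized size of the bitmap by 8 bytes covers 8 * 8 = + * 64 further mft records, since the bitmap tracks one bit per record.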
+ */ +ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec) +{ + s64 ll, bit, old_data_initialized, old_data_size; + unsigned long flags; + struct inode *vi; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni, *ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + pgoff_t index; + unsigned int ofs; + int err; + le16 seq_no, usn; + bool record_formatted = false; + + if (base_ni) { + ntfs_debug("Entering (allocating an extent mft record for " + "base mft record 0x%llx).", + (long long)base_ni->mft_no); + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(mode); + } else + ntfs_debug("Entering (allocating a base mft record)."); + if (mode) { + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(base_ni); + /* We only support creation of normal files and directories. */ + if (!S_ISREG(mode) && !S_ISDIR(mode)) + return ERR_PTR(-EOPNOTSUPP); + } + BUG_ON(!mrec); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + down_write(&vol->mftbmp_lock); + bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); + if (bit >= 0) { + ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", + (long long)bit); + goto have_alloc_rec; + } + if (bit != -ENOSPC) { + up_write(&vol->mftbmp_lock); + return ERR_PTR(bit); + } + /* + * No free mft records left. If the mft bitmap already covers more + * than the currently used mft records, the next records are all free, + * so we can simply allocate the first unused mft record. + * Note: We also have to make sure that the mft bitmap at least covers + * the first 24 mft records as they are special and whilst they may not + * be in use, we do not allocate from them. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->initialized_size >> vol->mft_record_size_bits; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_initialized = mftbmp_ni->initialized_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized << 3 > ll && old_data_initialized > 3) { + bit = ll; + if (bit < 24) + bit = 24; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + ntfs_debug("Found free record (#2), bit 0x%llx.", + (long long)bit); + goto found_free_rec; + } + /* + * The mft bitmap needs to be expanded until it covers the first unused + * mft record that we can allocate. + * Note: The smallest mft record we allocate is mft record 24. + */ + bit = old_data_initialized << 3; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = mftbmp_ni->allocated_size; + ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)old_data_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)old_data_initialized); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized + 8 > old_data_size) { + /* Need to extend bitmap by one more cluster. 
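+		 * For illustration, the bitmap tracks one bit per mft record, so +		 * 0x800 initialized bytes (old_data_initialized << 3) cover 0x4000 +		 * records; as initialized_size + 8 would exceed allocated_size here, +		 * the bitmap allocation itself has to grow below before it can be +		 * zero-extended.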
*/ + ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); + err = ntfs_mft_bitmap_extend_allocation_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + } + /* + * We now have sufficient allocated space, extend the initialized_size + * as well as the data_size if necessary and fill the new space with + * zeroes. + */ + err = ntfs_mft_bitmap_extend_initialized_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after initialized extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); +found_free_rec: + /* @bit is the found free mft record, allocate it in the mft bitmap. */ + ntfs_debug("At found_free_rec."); + err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); + up_write(&vol->mftbmp_lock); + goto err_out; + } + ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); +have_alloc_rec: + /* + * The mft bitmap is now uptodate. Deal with mft data attribute now. + * Note, we keep hold of the mft bitmap lock for writing until all + * modifications to the mft data attribute are complete, too, as they + * will impact decisions for mft bitmap and mft record allocation done + * by a parallel allocation and if the lock is not maintained a + * parallel allocation could allocate the same mft record as this one. + */ + ll = (bit + 1) << vol->mft_record_size_bits; + read_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (ll <= old_data_initialized) { + ntfs_debug("Allocated mft record already initialized."); + goto mft_rec_already_initialized; + } + ntfs_debug("Initializing allocated mft record."); + /* + * The mft record is outside the initialized data. Extend the mft data + * attribute until it covers the allocated record. The loop is only + * actually traversed more than once when a freshly formatted volume is + * first written to so it optimizes away nicely in the common case. 
+ */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data before extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + while (ll > mft_ni->allocated_size) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); + err = ntfs_mft_data_extend_allocation_nolock(vol); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to extend mft data " + "allocation."); + goto undo_mftbmp_alloc_nolock; + } + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + } + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* + * Extend mft data initialized size (and data size of course) to reach + * the allocated mft record, formatting the mft records allong the way. + * Note: We only modify the ntfs_inode structure as that is all that is + * needed by ntfs_mft_record_format(). We will update the attribute + * record itself in one fell swoop later on. + */ + write_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + old_data_size = vol->mft_ino->i_size; + while (ll > mft_ni->initialized_size) { + s64 new_initialized_size, mft_no; + + new_initialized_size = mft_ni->initialized_size + + vol->mft_record_size; + mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; + if (new_initialized_size > i_size_read(vol->mft_ino)) + i_size_write(vol->mft_ino, new_initialized_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_debug("Initializing mft record 0x%llx.", + (long long)mft_no); + err = ntfs_mft_record_format(vol, mft_no); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to format mft record."); + goto undo_data_init; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = new_initialized_size; + } + write_unlock_irqrestore(&mft_ni->size_lock, flags); + record_formatted = true; + /* Update the mft data attribute record to reflect the new sizes. */ + m = map_mft_record(mft_ni); + if (IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to map mft record."); + err = PTR_ERR(m); + goto undo_data_init; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + err = -ENOMEM; + unmap_mft_record(mft_ni); + goto undo_data_init; + } + err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft data attribute."); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + goto undo_data_init; + } + a = ctx->attr; + read_lock_irqsave(&mft_ni->size_lock, flags); + a->data.non_resident.initialized_size = + cpu_to_sle64(mft_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. 
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after mft record initialization: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); + BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); +mft_rec_already_initialized: + /* + * We can finally drop the mft bitmap lock as the mft data attribute + * has been fully updated. The only disparity left is that the + * allocated mft record still needs to be marked as in use to match the + * set bit in the mft bitmap but this is actually not a problem since + * this mft record is not referenced from anywhere yet and the fact + * that it is allocated in the mft bitmap means that no-one will try to + * allocate it either. + */ + up_write(&vol->mftbmp_lock); + /* + * We now have allocated and initialized the mft record. Calculate the + * index of and the offset within the page cache page the record is in. + */ + index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; + ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(vol->mft_ino->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing allocated " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(page); + goto undo_mftbmp_alloc; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + /* If we just formatted the mft record no need to do it again. */ + if (!record_formatted) { + /* Sanity check that the mft record is really not in use. */ + if (ntfs_is_file_record(m->magic) && + (m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vol->sb, "Mft record 0x%llx was marked " + "free in mft bitmap but is marked " + "used itself. Corrupt filesystem. " + "Unmount and run chkdsk.", + (long long)bit); + err = -EIO; + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + NVolSetErrors(vol); + goto undo_mftbmp_alloc; + } + /* + * We need to (re-)format the mft record, preserving the + * sequence number if it is not zero as well as the update + * sequence number if it is not zero or -1 (0xffff). This + * means we do not need to care whether or not something went + * wrong with the previous mft record. + */ + seq_no = m->sequence_number; + usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); + err = ntfs_mft_record_layout(vol, bit, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout allocated mft " + "record 0x%llx.", (long long)bit); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + if (seq_no) + m->sequence_number = seq_no; + if (usn && le16_to_cpu(usn) != 0xffff) + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; + } + /* Set the mft record itself in use. */ + m->flags |= MFT_RECORD_IN_USE; + if (S_ISDIR(mode)) + m->flags |= MFT_RECORD_IS_DIRECTORY; + flush_dcache_page(page); + SetPageUptodate(page); + if (base_ni) { + MFT_RECORD *m_tmp; + + /* + * Setup the base mft record in the extent mft record. 
This + * completes initialization of the allocated extent mft record + * and we can simply use it with map_extent_mft_record(). + */ + m->base_mft_record = MK_LE_MREF(base_ni->mft_no, + base_ni->seq_no); + /* + * Allocate an extent inode structure for the new mft record, + * attach it to the base inode @base_ni and map, pin, and lock + * its, i.e. the allocated, mft record. + */ + m_tmp = map_extent_mft_record(base_ni, bit, &ni); + if (IS_ERR(m_tmp)) { + ntfs_error(vol->sb, "Failed to map allocated extent " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(m_tmp); + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + BUG_ON(m != m_tmp); + /* + * Make sure the allocated mft record is written out to disk. + * No need to set the inode dirty because the caller is going + * to do that anyway after finishing with the new extent mft + * record (e.g. at a minimum a new attribute will be added to + * the mft record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + /* + * Need to unmap the page since map_extent_mft_record() mapped + * it as well so we have it mapped twice at the moment. + */ + ntfs_unmap_page(page); + } else { + /* + * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink + * is set to 1 but the mft record->link_count is 0. The caller + * needs to bear this in mind. + */ + vi = new_inode(vol->sb); + if (unlikely(!vi)) { + err = -ENOMEM; + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + vi->i_ino = bit; + /* + * This is for checking whether an inode has changed w.r.t. a + * file so that the file can be updated if necessary (compare + * with f_version). + */ + vi->i_version = 1; + + /* The owner and group come from the ntfs volume. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + /* + * Set the appropriate mode, attribute type, and name. For + * directories, also setup the index values to the defaults. + */ + if (S_ISDIR(mode)) { + vi->i_mode = S_IFDIR | S_IRWXUGO; + vi->i_mode &= ~vol->dmask; + + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + ni->itype.index.block_size = 4096; + ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; + ni->itype.index.collation_rule = COLLATION_FILE_NAME; + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = + vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = + vol->sector_size_bits; + } + } else { + vi->i_mode = S_IFREG | S_IRWXUGO; + vi->i_mode &= ~vol->fmask; + + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + } + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + + /* Set the inode times to the current time. */ + vi->i_atime = vi->i_mtime = vi->i_ctime = + current_fs_time(vi->i_sb); + /* + * Set the file size to 0, the ntfs inode sizes are set to 0 by + * the call to ntfs_init_big_inode() below. 
+ */ + vi->i_size = 0; + vi->i_blocks = 0; + + /* Set the sequence number. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + /* + * Manually map, pin, and lock the mft record as we already + * have its page mapped and it is very easy to do. + */ + atomic_inc(&ni->count); + mutex_lock(&ni->mrec_lock); + ni->page = page; + ni->page_ofs = ofs; + /* + * Make sure the allocated mft record is written out to disk. + * NOTE: We do not set the ntfs inode dirty because this would + * fail in ntfs_write_inode() because the inode does not have a + * standard information attribute yet. Also, there is no need + * to set the inode dirty because the caller is going to do + * that anyway after finishing with the new mft record (e.g. at + * a minimum some new attributes will be added to the mft + * record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + + /* Add the inode to the inode hash for the superblock. */ + insert_inode_hash(vi); + + /* Update the default mft allocation position. */ + vol->mft_data_pos = bit + 1; + } + /* + * Return the opened, allocated inode of the allocated mft record as + * well as the mapped, pinned, and locked mft record. + */ + ntfs_debug("Returning opened, allocated %sinode 0x%llx.", + base_ni ? "extent " : "", (long long)bit); + *mrec = m; + return ni; +undo_data_init: + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = old_data_initialized; + i_size_write(vol->mft_ino, old_data_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + goto undo_mftbmp_alloc_nolock; +undo_mftbmp_alloc: + down_write(&vol->mftbmp_lock); +undo_mftbmp_alloc_nolock: + if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->mftbmp_lock); +err_out: + return ERR_PTR(err); +max_err_out: + ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " + "number of inodes (2^32) has already been reached."); + up_write(&vol->mftbmp_lock); + return ERR_PTR(-ENOSPC); +} + +/** + * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume + * @ni: ntfs inode of the mapped extent mft record to free + * @m: mapped extent mft record of the ntfs inode @ni + * + * Free the mapped extent mft record @m of the extent ntfs inode @ni. + * + * Note that this function unmaps the mft record and closes and destroys @ni + * internally and hence you cannot use either @ni nor @m any more after this + * function returns success. + * + * On success return 0 and on error return -errno. @ni and @m are still valid + * in this case and have not been freed. + * + * For some errors an error message is displayed and the success code 0 is + * returned and the volume is then left dirty on umount. This makes sense in + * case we could not rollback the changes that were already done since the + * caller no longer wants to reference this mft record so it does not matter to + * the caller if something is wrong with it as long as it is properly detached + * from the base inode. 
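+ * + * For illustration, an extent mft record freed with sequence number 5 is + * written back with sequence number 6, a record at 0xffff wraps around to 1, + * and a zero sequence number is left untouched, matching the update done below + * before the record is written out and its bit is cleared in $MFT/$BITMAP.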
+ */ +int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) +{ + unsigned long mft_no = ni->mft_no; + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + ntfs_inode **extent_nis; + int i, err; + le16 old_seq_no; + u16 seq_no; + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + + mutex_lock(&ni->extent_lock); + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + + BUG_ON(base_ni->nr_extents <= 0); + + ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", + mft_no, base_ni->mft_no); + + mutex_lock(&base_ni->extent_lock); + + /* Make sure we are holding the only reference to the extent inode. */ + if (atomic_read(&ni->count) > 2) { + ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " + "not freeing.", base_ni->mft_no); + mutex_unlock(&base_ni->extent_lock); + return -EBUSY; + } + + /* Dissociate the ntfs inode from the base inode. */ + extent_nis = base_ni->ext.extent_ntfs_inos; + err = -ENOENT; + for (i = 0; i < base_ni->nr_extents; i++) { + if (ni != extent_nis[i]) + continue; + extent_nis += i; + base_ni->nr_extents--; + memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * + sizeof(ntfs_inode*)); + err = 0; + break; + } + + mutex_unlock(&base_ni->extent_lock); + + if (unlikely(err)) { + ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " + "its base inode 0x%lx.", mft_no, + base_ni->mft_no); + BUG(); + } + + /* + * The extent inode is no longer attached to the base inode so no one + * can get a reference to it any more. + */ + + /* Mark the mft record as not in use. */ + m->flags &= ~MFT_RECORD_IN_USE; + + /* Increment the sequence number, skipping zero, if it is not zero. */ + old_seq_no = m->sequence_number; + seq_no = le16_to_cpu(old_seq_no); + if (seq_no == 0xffff) + seq_no = 1; + else if (seq_no) + seq_no++; + m->sequence_number = cpu_to_le16(seq_no); + + /* + * Set the ntfs inode dirty and write it out. We do not need to worry + * about the base inode here since whatever caused the extent mft + * record to be freed is guaranteed to do it already. + */ + NInoSetDirty(ni); + err = write_mft_record(ni, m, 0); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " + "freeing.", mft_no); + goto rollback; + } +rollback_error: + /* Unmap and throw away the now freed extent inode. */ + unmap_extent_mft_record(ni); + ntfs_clear_extent_inode(ni); + + /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ + down_write(&vol->mftbmp_lock); + err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); + up_write(&vol->mftbmp_lock); + if (unlikely(err)) { + /* + * The extent inode is gone but we failed to deallocate it in + * the mft bitmap. Just emit a warning and leave the volume + * dirty on umount. + */ + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + return 0; +rollback: + /* Rollback what we did... 
*/ + mutex_lock(&base_ni->extent_lock); + extent_nis = base_ni->ext.extent_ntfs_inos; + if (!(base_ni->nr_extents & 3)) { + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); + + extent_nis = kmalloc(new_size, GFP_NOFS); + if (unlikely(!extent_nis)) { + ntfs_error(vol->sb, "Failed to allocate internal " + "buffer during rollback.%s", es); + mutex_unlock(&base_ni->extent_lock); + NVolSetErrors(vol); + goto rollback_error; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, + new_size - 4 * sizeof(ntfs_inode*)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = extent_nis; + } + m->flags |= MFT_RECORD_IN_USE; + m->sequence_number = old_seq_no; + extent_nis[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + mark_mft_record_dirty(ni); + return err; +} +#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h new file mode 100644 index 00000000..b52bf87b --- /dev/null +++ b/fs/ntfs/mft.h @@ -0,0 +1,124 @@ +/* + * mft.h - Defines for mft record handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_MFT_H +#define _LINUX_NTFS_MFT_H + +#include +#include +#include + +#include "inode.h" + +extern MFT_RECORD *map_mft_record(ntfs_inode *ni); +extern void unmap_mft_record(ntfs_inode *ni); + +extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino); + +static inline void unmap_extent_mft_record(ntfs_inode *ni) +{ + unmap_mft_record(ni); + return; +} + +#ifdef NTFS_RW + +/** + * flush_dcache_mft_record_page - flush_dcache_page() for mft records + * @ni: ntfs inode structure of mft record + * + * Call flush_dcache_page() for the page in which an mft record resides. + * + * This must be called every time an mft record is modified, just after the + * modification. + */ +static inline void flush_dcache_mft_record_page(ntfs_inode *ni) +{ + flush_dcache_page(ni->page); +} + +extern void __mark_mft_record_dirty(ntfs_inode *ni); + +/** + * mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: Do not do anything if the mft record is already marked dirty. 
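+ * + * The usual pattern in mft.c is to modify the mapped mft record and then call + * flush_dcache_mft_record_page() followed by mark_mft_record_dirty() so that + * the change makes it to disk.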
+ */ +static inline void mark_mft_record_dirty(ntfs_inode *ni) +{ + if (!NInoTestSetDirty(ni)) + __mark_mft_record_dirty(ni); +} + +extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync); + +extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); + +/** + * write_mft_record - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * This is just a wrapper for write_mft_record_nolock() (see mft.c), which + * locks the page for the duration of the write. This ensures that there are + * no race conditions between writing the mft record via the dirty inode code + * paths and via the page cache write back code paths or between writing + * neighbouring mft records residing in the same page. + * + * Locking the page also serializes us against ->readpage() if the page is not + * uptodate. + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + */ +static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + struct page *page = ni->page; + int err; + + BUG_ON(!page); + lock_page(page); + err = write_mft_record_nolock(ni, m, sync); + unlock_page(page); + return err; +} + +extern bool ntfs_may_write_mft_record(ntfs_volume *vol, + const unsigned long mft_no, const MFT_RECORD *m, + ntfs_inode **locked_ni); + +extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec); +extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c new file mode 100644 index 00000000..5a858d83 --- /dev/null +++ b/fs/ntfs/mst.c @@ -0,0 +1,203 @@ +/* + * mst.c - NTFS multi sector transfer protection handling code. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "ntfs.h" + +/** + * post_read_mst_fixup - deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * @size: size in bytes of @b + * + * Perform the necessary post read multi sector transfer fixup and detect the + * presence of incomplete multi sector transfers. - In that case, overwrite the + * magic of the ntfs record header being processed with "BAAD" (in memory only!) + * and abort processing. + * + * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). 
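+ * + * For illustration, a 1024-byte mft record is protected as two 512-byte NTFS + * blocks: usa_count is 3 (the update sequence number plus one fixup per + * block), on disk the last le16 of each block holds the usn and the displaced + * values live in the update sequence array, so deprotection verifies both + * trailing words against the usn and then restores the saved words.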
+ * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not protected at all and hence doesn't need to + * be fixed up. Thus, we return success and not failure in this case. This is + * in contrast to pre_write_mst_fixup(), see below. + */ +int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + u16 usa_ofs, usa_count, usn; + u16 *usa_pos, *data_pos; + + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return 0; + /* Position of usn in update sequence array. */ + usa_pos = (u16*)b + usa_ofs/sizeof(u16); + /* + * The update sequence number which has to be equal to each of the + * u16 values before they are fixed up. Note no need to care for + * endianness since we are comparing and moving data for on disk + * structures which means the data is consistent. - If it is + * consistenty the wrong endianness it doesn't make any difference. + */ + usn = *usa_pos; + /* + * Position in protected data of first u16 that needs fixing up. + */ + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* + * Check for incomplete multi sector transfer(s). + */ + while (usa_count--) { + if (*data_pos != usn) { + /* + * Incomplete multi sector transfer detected! )-: + * Set the magic to "BAAD" and return failure. + * Note that magic_BAAD is already converted to le32. + */ + b->magic = magic_BAAD; + return -EINVAL; + } + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + /* Re-setup the variables. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + return 0; +} + +/** + * pre_write_mst_fixup - apply multi sector transfer protection + * @b: pointer to the data to protect + * @size: size in bytes of @b + * + * Perform the necessary pre write multi sector transfer fixup on the data + * pointer to by @b of @size. + * + * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed + * (assumed not needed). This is in contrast to post_read_mst_fixup() above. + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not subject to protection and hence doesn't need + * to be fixed up. This means that you have to create a valid update sequence + * array header in the ntfs record before calling this function, otherwise it + * will fail (the header needs to contain the position of the update sequence + * array together with the number of elements in the array). You also need to + * initialise the update sequence number before calling this function + * otherwise a random word will be used (whatever was in the record at that + * position at that time). + */ +int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + le16 *usa_pos, *data_pos; + u16 usa_ofs, usa_count, usn; + le16 le_usn; + + /* Sanity check + only fixup if it makes sense. */ + if (!b || ntfs_is_baad_record(b->magic) || + ntfs_is_hole_record(b->magic)) + return -EINVAL; + /* Setup the variables. 
*/ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return -EINVAL; + /* Position of usn in update sequence array. */ + usa_pos = (le16*)((u8*)b + usa_ofs); + /* + * Cyclically increment the update sequence number + * (skipping 0 and -1, i.e. 0xffff). + */ + usn = le16_to_cpup(usa_pos) + 1; + if (usn == 0xffff || !usn) + usn = 1; + le_usn = cpu_to_le16(usn); + *usa_pos = le_usn; + /* Position in data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment the position in the usa and save the + * original data from the data buffer into the usa. + */ + *(++usa_pos) = *data_pos; + /* Apply fixup to data. */ + *data_pos = le_usn; + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } + return 0; +} + +/** + * post_write_mst_fixup - fast deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * + * Perform the necessary post write multi sector transfer fixup, not checking + * for any errors, because we assume we have just used pre_write_mst_fixup(), + * thus the data will be fine or we would never have gotten here. + */ +void post_write_mst_fixup(NTFS_RECORD *b) +{ + le16 *usa_pos, *data_pos; + + u16 usa_ofs = le16_to_cpu(b->usa_ofs); + u16 usa_count = le16_to_cpu(b->usa_count) - 1; + + /* Position of usn in update sequence array. */ + usa_pos = (le16*)b + usa_ofs/sizeof(le16); + + /* Position in protected data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } +} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c new file mode 100644 index 00000000..358273e5 --- /dev/null +++ b/fs/ntfs/namei.c @@ -0,0 +1,405 @@ +/* + * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "dir.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_lookup - find the inode represented by a dentry in a directory inode + * @dir_ino: directory inode in which to look for the inode + * @dent: dentry representing the inode to look for + * @nd: lookup nameidata + * + * In short, ntfs_lookup() looks for the inode represented by the dentry @dent + * in the directory inode @dir_ino and if found attaches the inode to the + * dentry @dent. + * + * In more detail, the dentry @dent specifies which inode to look for by + * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() + * converts the name to Unicode and walks the contents of the directory inode + * @dir_ino looking for the converted Unicode name. If the name is found in the + * directory, the corresponding inode is loaded by calling ntfs_iget() on its + * inode number and the inode is associated with the dentry @dent via a call to + * d_splice_alias(). + * + * If the name is not found in the directory, a NULL inode is inserted into the + * dentry @dent via a call to d_add(). The dentry is then termed a negative + * dentry. + * + * Only if an actual error occurs, do we return an error via ERR_PTR(). + * + * In order to handle the case insensitivity issues of NTFS with regards to the + * dcache and the dcache requiring only one dentry per directory, we deal with + * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining + * a case sensitive dcache. This means that we get the full benefit of dcache + * speed when the file/directory is looked up with the same case as returned by + * ->ntfs_readdir() but that a lookup for any other case (or for the short file + * name) will not find anything in dcache and will enter ->ntfs_lookup() + * instead, where we search the directory for a fully matching file name + * (including case) and if that is not found, we search for a file name that + * matches with different case and if that has non-POSIX semantics we return + * that. We actually do only one search (case sensitive) and keep tabs on + * whether we have found a case insensitive match in the process. + * + * To simplify matters for us, we do not treat the short vs long filenames as + * two hard links but instead if the lookup matches a short filename, we + * return the dentry for the corresponding long filename instead. + * + * There are three cases we need to distinguish here: + * + * 1) @dent perfectly matches (i.e. including case) a directory entry with a + * file name in the WIN32 or POSIX namespaces. In this case + * ntfs_lookup_inode_by_name() will return with name set to NULL and we + * just d_splice_alias() @dent. + * 2) @dent matches (not including case) a directory entry with a file name in + * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return + * with name set to point to a kmalloc()ed ntfs_name structure containing + * the properly cased little endian Unicode name. We convert the name to the + * current NLS code page, search if a dentry with this name already exists + * and if so return that instead of @dent. 
At this point things are + * complicated by the possibility of 'disconnected' dentries due to NFS + * which we deal with appropriately (see the code comments). The VFS will + * then destroy the old @dent and use the one we returned. If a dentry is + * not found, we allocate a new one, d_splice_alias() it, and return it as + * above. + * 3) @dent matches either perfectly or not (i.e. we don't care about case) a + * directory entry with a file name in the DOS namespace. In this case + * ntfs_lookup_inode_by_name() will return with name set to point to a + * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) + * of the inode. We use the mft reference to read the inode and to find the + * file name in the WIN32 namespace corresponding to the matched short file + * name. We then convert the name to the current NLS code page, and proceed + * searching for a dentry with this name, etc, as in case 2), above. + * + * Locking: Caller must hold i_mutex on the directory. + */ +static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, + struct nameidata *nd) +{ + ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); + struct inode *dent_inode; + ntfschar *uname; + ntfs_name *name = NULL; + MFT_REF mref; + unsigned long dent_ino; + int uname_len; + + ntfs_debug("Looking up %s in directory inode 0x%lx.", + dent->d_name.name, dir_ino->i_ino); + /* Convert the name of the dentry to Unicode. */ + uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, + &uname); + if (uname_len < 0) { + if (uname_len != -ENAMETOOLONG) + ntfs_error(vol->sb, "Failed to convert name to " + "Unicode."); + return ERR_PTR(uname_len); + } + mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, + &name); + kmem_cache_free(ntfs_name_cache, uname); + if (!IS_ERR_MREF(mref)) { + dent_ino = MREF(mref); + ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); + dent_inode = ntfs_iget(vol->sb, dent_ino); + if (likely(!IS_ERR(dent_inode))) { + /* Consistency check. */ + if (is_bad_inode(dent_inode) || MSEQNO(mref) == + NTFS_I(dent_inode)->seq_no || + dent_ino == FILE_MFT) { + /* Perfect WIN32/POSIX match. -- Case 1. */ + if (!name) { + ntfs_debug("Done. (Case 1.)"); + return d_splice_alias(dent_inode, dent); + } + /* + * We are too indented. Handle imperfect + * matches and short file names further below. + */ + goto handle_name; + } + ntfs_error(vol->sb, "Found stale reference to inode " + "0x%lx (reference sequence number = " + "0x%x, inode sequence number = 0x%x), " + "returning -EIO. Run chkdsk.", + dent_ino, MSEQNO(mref), + NTFS_I(dent_inode)->seq_no); + iput(dent_inode); + dent_inode = ERR_PTR(-EIO); + } else + ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " + "error code %li.", dent_ino, + PTR_ERR(dent_inode)); + kfree(name); + /* Return the error code. */ + return (struct dentry *)dent_inode; + } + /* It is guaranteed that @name is no longer allocated at this point. */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("Entry was not found, adding negative dentry."); + /* The dcache will handle negative entries. */ + d_add(dent, NULL); + ntfs_debug("Done."); + return NULL; + } + ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " + "code %i.", -MREF_ERR(mref)); + return ERR_PTR(MREF_ERR(mref)); + // TODO: Consider moving this lot to a separate function! 
(AIA) +handle_name: + { + MFT_RECORD *m; + ntfs_attr_search_ctx *ctx; + ntfs_inode *ni = NTFS_I(dent_inode); + int err; + struct qstr nls_name; + + nls_name.name = NULL; + if (name->type != FILE_NAME_DOS) { /* Case 2. */ + ntfs_debug("Case 2."); + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&name->name, name->len, + (unsigned char**)&nls_name.name, 0); + kfree(name); + } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ + FILE_NAME_ATTR *fn; + + ntfs_debug("Case 3."); + kfree(name); + + /* Find the WIN32 name corresponding to the matched DOS name. */ + ni = NTFS_I(dent_inode); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + do { + ATTR_RECORD *a; + u32 val_len; + + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, + NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Inode corrupt: No WIN32 " + "namespace counterpart to DOS " + "file name. Run chkdsk."); + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + /* Consistency checks. */ + a = ctx->attr; + if (a->non_resident || a->flags) + goto eio_err_out; + val_len = le32_to_cpu(a->data.resident.value_length); + if (le16_to_cpu(a->data.resident.value_offset) + + val_len > le32_to_cpu(a->length)) + goto eio_err_out; + fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( + ctx->attr->data.resident.value_offset)); + if ((u32)(fn->file_name_length * sizeof(ntfschar) + + sizeof(FILE_NAME_ATTR)) > val_len) + goto eio_err_out; + } while (fn->file_name_type != FILE_NAME_WIN32); + + /* Convert the found WIN32 name to current NLS code page. */ + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&fn->file_name, fn->file_name_length, + (unsigned char**)&nls_name.name, 0); + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + } + m = NULL; + ctx = NULL; + + /* Check if a conversion error occurred. */ + if ((signed)nls_name.len < 0) { + err = (signed)nls_name.len; + goto err_out; + } + nls_name.hash = full_name_hash(nls_name.name, nls_name.len); + + dent = d_add_ci(dent, dent_inode, &nls_name); + kfree(nls_name.name); + return dent; + +eio_err_out: + ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); + err = -EIO; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); + iput(dent_inode); + ntfs_error(vol->sb, "Failed, returning error code %i.", err); + return ERR_PTR(err); + } +} + +/** + * Inode operations for directories. + */ +const struct inode_operations ntfs_dir_inode_ops = { + .lookup = ntfs_lookup, /* VFS: Lookup directory. */ +}; + +/** + * ntfs_get_parent - find the dentry of the parent of a given directory dentry + * @child_dent: dentry of the directory whose parent directory to find + * + * Find the dentry for the parent directory of the directory specified by the + * dentry @child_dent. This function is called from + * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the + * default ->decode_fh() which is export_decode_fh() in the same file. + * + * The code is based on the ext3 ->get_parent() implementation found in + * fs/ext3/namei.c::ext3_get_parent(). + * + * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. + * + * Return the dentry of the parent directory on success or the error code on + * error (IS_ERR() is true). 
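The MREF()/MSEQNO() helpers used in ntfs_lookup() above, and MREF_LE() used in ntfs_get_parent() below, unpack an NTFS file reference: the mft record number sits in the low 48 bits and the sequence number in the high 16 bits. A minimal userspace sketch with illustrative macro names (the driver's real definitions live in layout.h):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the driver's MK_MREF()/MREF()/MSEQNO() macros. */
#define MREF_MASK	0x0000ffffffffffffULL
#define MK_MREF_X(m, s)	(((uint64_t)(s) << 48) | ((uint64_t)(m) & MREF_MASK))
#define MREF_X(x)	((unsigned long)((x) & MREF_MASK))
#define MSEQNO_X(x)	((uint16_t)((x) >> 48))

int main(void)
{
	uint64_t mref = MK_MREF_X(0x1234, 5);	/* record 0x1234, sequence number 5 */

	printf("mft record 0x%lx, seq_no %u\n",
			MREF_X(mref), (unsigned)MSEQNO_X(mref));
	return 0;
}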
+ */ +static struct dentry *ntfs_get_parent(struct dentry *child_dent) +{ + struct inode *vi = child_dent->d_inode; + ntfs_inode *ni = NTFS_I(vi); + MFT_RECORD *mrec; + ntfs_attr_search_ctx *ctx; + ATTR_RECORD *attr; + FILE_NAME_ATTR *fn; + unsigned long parent_ino; + int err; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + /* Get the mft record of the inode belonging to the child dentry. */ + mrec = map_mft_record(ni); + if (IS_ERR(mrec)) + return (struct dentry *)mrec; + /* Find the first file name attribute in the mft record. */ + ctx = ntfs_attr_get_search_ctx(ni, mrec); + if (unlikely(!ctx)) { + unmap_mft_record(ni); + return ERR_PTR(-ENOMEM); + } +try_next: + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + if (err == -ENOENT) + ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " + "file name attribute. Run chkdsk.", + vi->i_ino); + return ERR_PTR(err); + } + attr = ctx->attr; + if (unlikely(attr->non_resident)) + goto try_next; + fn = (FILE_NAME_ATTR *)((u8 *)attr + + le16_to_cpu(attr->data.resident.value_offset)); + if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > + (u8*)attr + le32_to_cpu(attr->length))) + goto try_next; + /* Get the inode number of the parent directory. */ + parent_ino = MREF_LE(fn->parent_directory); + /* Release the search context and the mft record of the child. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + + return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); +} + +static struct inode *ntfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = ntfs_iget(sb, ino); + if (!IS_ERR(inode)) { + if (is_bad_inode(inode) || inode->i_generation != generation) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } + } + + return inode; +} + +static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +/** + * Export operations allowing NFS exporting of mounted NTFS partitions. + * + * We use the default ->encode_fh() for now. Note that they + * use 32 bits to store the inode number which is an unsigned long so on 64-bit + * architectures is usually 64 bits so it would all fail horribly on huge + * volumes. I guess we need to define our own encode and decode fh functions + * that store 64-bit inode numbers at some point but for now we will ignore the + * problem... + * + * We also use the default ->get_name() helper (used by ->decode_fh() via + * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs + * independent. + * + * The default ->get_parent() just returns -EACCES so we have to provide our + * own and the default ->get_dentry() is incompatible with NTFS due to not + * allowing the inode number 0 which is used in NTFS for the system file $MFT + * and due to using iget() whereas NTFS needs ntfs_iget(). + */ +const struct export_operations ntfs_export_ops = { + .get_parent = ntfs_get_parent, /* Find the parent of a given + directory. 
*/ + .fh_to_dentry = ntfs_fh_to_dentry, + .fh_to_parent = ntfs_fh_to_parent, +}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h new file mode 100644 index 00000000..d6a340bf --- /dev/null +++ b/fs/ntfs/ntfs.h @@ -0,0 +1,164 @@ +/* + * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (C) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_H +#define _LINUX_NTFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "volume.h" +#include "layout.h" + +typedef enum { + NTFS_BLOCK_SIZE = 512, + NTFS_BLOCK_SIZE_BITS = 9, + NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ + NTFS_MAX_NAME_LEN = 255, + NTFS_MAX_ATTR_NAME_LEN = 255, + NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE, +} NTFS_CONSTANTS; + +/* Global variables. */ + +/* Slab caches (from super.c). */ +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; + +/* The various operations structs defined throughout the driver files. */ +extern const struct address_space_operations ntfs_aops; +extern const struct address_space_operations ntfs_mst_aops; + +extern const struct file_operations ntfs_file_ops; +extern const struct inode_operations ntfs_file_inode_ops; + +extern const struct file_operations ntfs_dir_ops; +extern const struct inode_operations ntfs_dir_inode_ops; + +extern const struct file_operations ntfs_empty_file_ops; +extern const struct inode_operations ntfs_empty_inode_ops; + +extern const struct export_operations ntfs_export_ops; + +/** + * NTFS_SB - return the ntfs volume given a vfs super block + * @sb: VFS super block + * + * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. + */ +static inline ntfs_volume *NTFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* Declarations of functions and global variables. 
*/ + +/* From fs/ntfs/compress.c */ +extern int ntfs_read_compressed_block(struct page *page); +extern int allocate_compression_buffers(void); +extern void free_compression_buffers(void); + +/* From fs/ntfs/super.c */ +#define default_upcase_len 0x10000 +extern struct mutex ntfs_lock; + +typedef struct { + int val; + char *str; +} option_t; +extern const option_t on_errors_arr[]; + +/* From fs/ntfs/mst.c */ +extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); +extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); +extern void post_write_mst_fixup(NTFS_RECORD *b); + +/* From fs/ntfs/unistr.c */ +extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, + const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size); +extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); +extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size); +extern void ntfs_upcase_name(ntfschar *name, u32 name_len, + const ntfschar *upcase, const u32 upcase_len); +extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs); +extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len); + +/* From fs/ntfs/upcase.c */ +extern ntfschar *generate_default_upcase(void); + +static inline int ntfs_ffs(int x) +{ + int r = 1; + + if (!x) + return 0; + if (!(x & 0xffff)) { + x >>= 16; + r += 16; + } + if (!(x & 0xff)) { + x >>= 8; + r += 8; + } + if (!(x & 0xf)) { + x >>= 4; + r += 4; + } + if (!(x & 3)) { + x >>= 2; + r += 2; + } + if (!(x & 1)) { + x >>= 1; + r += 1; + } + return r; +} + +#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c new file mode 100644 index 00000000..d80e3315 --- /dev/null +++ b/fs/ntfs/quota.c @@ -0,0 +1,117 @@ +/* + * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include "index.h" +#include "quota.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume + * @vol: ntfs volume on which to mark the quotas out of date + * + * Mark the quotas out of date on the ntfs volume @vol and return 'true' on + * success and 'false' on error. + */ +bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) +{ + ntfs_index_context *ictx; + QUOTA_CONTROL_ENTRY *qce; + const le32 qid = QUOTA_DEFAULTS_ID; + int err; + + ntfs_debug("Entering."); + if (NVolQuotaOutOfDate(vol)) + goto done; + if (!vol->quota_ino || !vol->quota_q_ino) { + ntfs_error(vol->sb, "Quota inodes are not open."); + return false; + } + mutex_lock(&vol->quota_q_ino->i_mutex); + ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); + if (!ictx) { + ntfs_error(vol->sb, "Failed to get index context."); + goto err_out; + } + err = ntfs_index_lookup(&qid, sizeof(qid), ictx); + if (err) { + if (err == -ENOENT) + ntfs_error(vol->sb, "Quota defaults entry is not " + "present."); + else + ntfs_error(vol->sb, "Lookup of quota defaults entry " + "failed."); + goto err_out; + } + if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { + ntfs_error(vol->sb, "Quota defaults entry size is invalid. " + "Run chkdsk."); + goto err_out; + } + qce = (QUOTA_CONTROL_ENTRY*)ictx->data; + if (le32_to_cpu(qce->version) != QUOTA_VERSION) { + ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " + "supported.", le32_to_cpu(qce->version)); + goto err_out; + } + ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); + /* If quotas are already marked out of date, no need to do anything. */ + if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) + goto set_done; + /* + * If quota tracking is neither requested, nor enabled and there are no + * pending deletes, no need to mark the quotas out of date. + */ + if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | + QUOTA_FLAG_TRACKING_REQUESTED | + QUOTA_FLAG_PENDING_DELETES))) + goto set_done; + /* + * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. + * This is verified on WinXP to be sufficient to cause windows to + * rescan the volume on boot and update all quota entries. + */ + qce->flags |= QUOTA_FLAG_OUT_OF_DATE; + /* Ensure the modified flags are written to disk. */ + ntfs_index_entry_flush_dcache_page(ictx); + ntfs_index_entry_mark_dirty(ictx); +set_done: + ntfs_index_ctx_put(ictx); + mutex_unlock(&vol->quota_q_ino->i_mutex); + /* + * We set the flag so we do not try to mark the quotas out of date + * again on remount. + */ + NVolSetQuotaOutOfDate(vol); +done: + ntfs_debug("Done."); + return true; +err_out: + if (ictx) + ntfs_index_ctx_put(ictx); + mutex_unlock(&vol->quota_q_ino->i_mutex); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h new file mode 100644 index 00000000..4cbe5594 --- /dev/null +++ b/fs/ntfs/quota.h @@ -0,0 +1,35 @@ +/* + * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the + * Linux-NTFS project. 
+ * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_QUOTA_H +#define _LINUX_NTFS_QUOTA_H + +#ifdef NTFS_RW + +#include "types.h" +#include "volume.h" + +extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c new file mode 100644 index 00000000..eac7d678 --- /dev/null +++ b/fs/ntfs/runlist.c @@ -0,0 +1,1907 @@ +/** + * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "debug.h" +#include "dir.h" +#include "endian.h" +#include "malloc.h" +#include "ntfs.h" + +/** + * ntfs_rl_mm - runlist memmove + * + * It is up to the caller to serialize access to the runlist @base. + */ +static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, + int size) +{ + if (likely((dst != src) && (size > 0))) + memmove(base + dst, base + src, size * sizeof(*base)); +} + +/** + * ntfs_rl_mc - runlist memory copy + * + * It is up to the caller to serialize access to the runlists @dstbase and + * @srcbase. + */ +static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, + runlist_element *srcbase, int src, int size) +{ + if (likely(size > 0)) + memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); +} + +/** + * ntfs_rl_realloc - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * It is up to the caller to serialize access to the runlist @rl. 
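For readers new to the structure the helpers below manipulate: a runlist is an array of (vcn, lcn, length) triples, one per run, terminated by an element whose length is zero and whose lcn is one of the negative special values. A simplified, self-contained sketch follows; the types and constants are placeholders, the real definitions live in runlist.h.

#include <stdio.h>

/* Placeholder types/constants; the driver's are in runlist.h (LCN_HOLE etc.). */
typedef long long VCN_X;
typedef long long LCN_X;
#define LCN_HOLE_X	(-1LL)	/* sparse run, no clusters allocated */
#define LCN_ENOENT_X	(-3LL)	/* terminator: no such vcn */

struct rl_x {
	VCN_X vcn;		/* first virtual cluster of the run */
	LCN_X lcn;		/* first logical cluster, or a special value */
	long long length;	/* run length in clusters, 0 terminates */
};

int main(void)
{
	/* 8 allocated clusters, a 4 cluster hole, 4 more clusters, terminator. */
	struct rl_x rl[] = {
		{ 0, 100, 8 },
		{ 8, LCN_HOLE_X, 4 },
		{ 12, 200, 4 },
		{ 16, LCN_ENOENT_X, 0 },
	};
	VCN_X vcn = 13;
	int i;

	/* Same walk as ntfs_rl_vcn_to_lcn(): find the run containing @vcn. */
	for (i = 0; rl[i].length; i++)
		if (vcn < rl[i + 1].vcn)
			break;
	if (rl[i].length && rl[i].lcn >= 0)
		printf("vcn %lld maps to lcn %lld\n", vcn,
				rl[i].lcn + (vcn - rl[i].vcn));
	return 0;
}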
+ * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs(new_size); + if (unlikely(!new_rl)) + return ERR_PTR(-ENOMEM); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_rl_realloc_nofail - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs_nofail(new_size); + BUG_ON(!new_rl); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_are_rl_mergeable - test if two runlists can be joined together + * @dst: original runlist + * @src: new runlist to test for mergeability with @dst + * + * Test if two runlists can be joined together. For this, their VCNs and LCNs + * must be adjacent. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * Return: true Success, the runlists can be merged. + * false Failure, the runlists cannot be merged. + */ +static inline bool ntfs_are_rl_mergeable(runlist_element *dst, + runlist_element *src) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* We can merge unmapped regions even if they are misaligned. */ + if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) + return true; + /* If the runs are misaligned, we cannot merge them. */ + if ((dst->vcn + dst->length) != src->vcn) + return false; + /* If both runs are non-sparse and contiguous, we can merge them. 
*/ + if ((dst->lcn >= 0) && (src->lcn >= 0) && + ((dst->lcn + dst->length) == src->lcn)) + return true; + /* If we are merging two holes, we can merge them. */ + if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) + return true; + /* Cannot merge. */ + return false; +} + +/** + * __ntfs_rl_merge - merge two runlists without testing if they can be merged + * @dst: original, destination runlist + * @src: new runlist to merge with @dst + * + * Merge the two runlists, writing into the destination runlist @dst. The + * caller must make sure the runlists can be merged or this will corrupt the + * destination runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + */ +static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) +{ + dst->length += src->length; +} + +/** + * ntfs_rl_append - append a runlist after a given element + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: runlist to be inserted into @dst + * @ssize: number of elements in @src (excluding end marker) + * @loc: append the new runlist @src after this element in @dst + * + * Append the runlist @src after element @loc in @dst. Merge the right end of + * the new runlist, if necessary. Adjust the size of the hole before the + * appended runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_append(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool right = false; /* Right end of @src needs merging. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, check if the right hand end needs merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + + /* Space required: @dst size + @src size, less one if we merged. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the right hand end, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + + /* First run after the @src runs that have been inserted. */ + marker = loc + ssize + 1; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the preceding hole. 
*/ + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + + /* We may have changed the length of the file, so fix the end marker */ + if (dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + + return dst; +} + +/** + * ntfs_rl_insert - insert a runlist into another + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: insert the new runlist @src before this element in @dst + * + * Insert the runlist @src before element @loc in the runlist @dst. Merge the + * left end of the new runlist, if necessary. Adjust the size of the hole + * after the inserted runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_insert(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool left = false; /* Left end of @src needs merging. */ + bool disc = false; /* Discontinuity between @dst and @src. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* + * disc => Discontinuity between the end of @dst and the start of @src. + * This means we might need to insert a "not mapped" run. + */ + if (loc == 0) + disc = (src[0].vcn > 0); + else { + s64 merged_length; + + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + + merged_length = dst[loc - 1].length; + if (left) + merged_length += src->length; + + disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); + } + /* + * Space required: @dst size + @src size, less one if we merged, plus + * one if there was a discontinuity. + */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlist. + */ + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * First run after the @src runs that have been inserted. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. And if @disc, then @dst and @src do + * not meet and we need an extra run to fill the gap. + */ + marker = loc + ssize - left + disc; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc, dsize - loc); + ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); + + /* Adjust the VCN of the first run after the insertion... */ + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + /* ... and the length. */ + if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) + dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; + + /* Writing beyond the end of the file and there is a discontinuity. 
*/ + if (disc) { + if (loc > 0) { + dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + } else { + dst[loc].vcn = 0; + dst[loc].length = dst[loc + 1].vcn; + } + dst[loc].lcn = LCN_RL_NOT_MAPPED; + } + return dst; +} + +/** + * ntfs_rl_replace - overwrite a runlist element with another runlist + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst to overwrite with @src + * + * Replace the runlist element @dst at @loc with @src. Merge the left and + * right ends of the inserted runlist, if necessary. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_replace(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + signed delta; + bool left = false; /* Left end of @src needs merging. */ + bool right = false; /* Right end of @src needs merging. */ + int tail; /* Start of tail of @dst. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, see if the left and right ends need merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + if (loc > 0) + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + /* + * Allocate some space. We will need less if the left, right, or both + * ends get merged. The -1 accounts for the run being replaced. + */ + delta = ssize - 1 - left - right; + if (delta > 0) { + dst = ntfs_rl_realloc(dst, dsize, dsize + delta); + if (IS_ERR(dst)) + return dst; + } + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the left and right ends, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * Offset of the tail of @dst. This needs to be moved out of the way + * to make space for the runs to be copied from @src, i.e. the first + * run of the tail of @dst. + * Nominally, @tail equals @loc + 1, i.e. location, skipping the + * replaced run. However, if @right, then one of @dst's runs is + * already merged into @src. + */ + tail = loc + right + 1; + /* + * First run after the @src runs that have been inserted, i.e. where + * the tail of @dst needs to be moved to. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. + */ + marker = loc + ssize - left; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, tail, dsize - tail); + ntfs_rl_mc(dst, loc, src, left, ssize - left); + + /* We may have changed the length of the file, so fix the end marker. 
*/ + if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + return dst; +} + +/** + * ntfs_rl_split - insert a runlist into the centre of a hole + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst at which to split and insert @src + * + * Split the runlist @dst at @loc into two and insert @new in between the two + * fragments. No merging of runlists is necessary. Adjust the size of the + * holes either side. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, + runlist_element *src, int ssize, int loc) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* Space required: @dst size + @src size + one new hole. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the holes either size of @src. */ + dst[loc].length = dst[loc+1].vcn - dst[loc].vcn; + dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length; + dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn; + + return dst; +} + +/** + * ntfs_runlists_merge - merge two runlists into one + * @drl: original runlist to be worked on + * @srl: new runlist to be merged into @drl + * + * First we sanity check the two runlists @srl and @drl to make sure that they + * are sensible and can be merged. The runlist @srl must be either after the + * runlist @drl or completely within a hole (or unmapped region) in @drl. + * + * It is up to the caller to serialize access to the runlists @drl and @srl. + * + * Merging of runlists is necessary in two cases: + * 1. When attribute lists are used and a further extent is being mapped. + * 2. When new clusters are allocated to fill a hole or extend a file. + * + * There are four possible ways @srl can be merged. It can: + * - be inserted at the beginning of a hole, + * - split the hole in two and be inserted between the two fragments, + * - be appended at the end of a hole, or it can + * - replace the whole hole. + * It can also be appended to the end of the runlist, which is just a variant + * of the insert case. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @drl and @srl are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. 
The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The runlists overlap and cannot be merged. + */ +runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl) +{ + int di, si; /* Current index into @[ds]rl. */ + int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ + int dins; /* Index into @drl at which to insert @srl. */ + int dend, send; /* Last index into @[ds]rl. */ + int dfinal, sfinal; /* The last index into @[ds]rl with + lcn >= LCN_HOLE. */ + int marker = 0; + VCN marker_vcn = 0; + +#ifdef DEBUG + ntfs_debug("dst:"); + ntfs_debug_dump_runlist(drl); + ntfs_debug("src:"); + ntfs_debug_dump_runlist(srl); +#endif + + /* Check for silly calling... */ + if (unlikely(!srl)) + return drl; + if (IS_ERR(srl) || IS_ERR(drl)) + return ERR_PTR(-EINVAL); + + /* Check for the case where the first mapping is being done now. */ + if (unlikely(!drl)) { + drl = srl; + /* Complete the source runlist if necessary. */ + if (unlikely(drl[0].vcn)) { + /* Scan to the end of the source runlist. */ + for (dend = 0; likely(drl[dend].length); dend++) + ; + dend++; + drl = ntfs_rl_realloc(drl, dend, dend + 1); + if (IS_ERR(drl)) + return drl; + /* Insert start element at the front of the runlist. */ + ntfs_rl_mm(drl, 1, 0, dend); + drl[0].vcn = 0; + drl[0].lcn = LCN_RL_NOT_MAPPED; + drl[0].length = drl[1].vcn; + } + goto finished; + } + + si = di = 0; + + /* Skip any unmapped start element(s) in the source runlist. */ + while (srl[si].length && srl[si].lcn < LCN_HOLE) + si++; + + /* Can't have an entirely unmapped source runlist. */ + BUG_ON(!srl[si].length); + + /* Record the starting points. */ + sstart = si; + + /* + * Skip forward in @drl until we reach the position where @srl needs to + * be inserted. If we reach the end of @drl, @srl just needs to be + * appended to @drl. + */ + for (; drl[di].length; di++) { + if (drl[di].vcn + drl[di].length > srl[sstart].vcn) + break; + } + dins = di; + + /* Sanity check for illegal overlaps. */ + if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && + (srl[si].lcn >= 0)) { + ntfs_error(NULL, "Run lists overlap. Cannot merge!"); + return ERR_PTR(-ERANGE); + } + + /* Scan to the end of both runlists in order to know their sizes. */ + for (send = si; srl[send].length; send++) + ; + for (dend = di; drl[dend].length; dend++) + ; + + if (srl[send].lcn == LCN_ENOENT) + marker_vcn = srl[marker = send].vcn; + + /* Scan to the last element with lcn >= LCN_HOLE. */ + for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) + ; + for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) + ; + + { + bool start; + bool finish; + int ds = dend + 1; /* Number of elements in drl & srl */ + int ss = sfinal - sstart + 1; + + start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ + (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ + finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ + ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ + (srl[send - 1].vcn + srl[send - 1].length))); + + /* Or we will lose an end marker. 
*/ + if (finish && !drl[dins].length) + ss++; + if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) + finish = false; +#if 0 + ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); + ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); + ntfs_debug("start = %i, finish = %i", start, finish); + ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); +#endif + if (start) { + if (finish) + drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); + } else { + if (finish) + drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); + } + if (IS_ERR(drl)) { + ntfs_error(NULL, "Merge failed."); + return drl; + } + ntfs_free(srl); + if (marker) { + ntfs_debug("Triggering marker code."); + for (ds = dend; drl[ds].length; ds++) + ; + /* We only need to care if @srl ended after @drl. */ + if (drl[ds].vcn <= marker_vcn) { + int slots = 0; + + if (drl[ds].vcn == marker_vcn) { + ntfs_debug("Old marker = 0x%llx, replacing " + "with LCN_ENOENT.", + (unsigned long long) + drl[ds].lcn); + drl[ds].lcn = LCN_ENOENT; + goto finished; + } + /* + * We need to create an unmapped runlist element in + * @drl or extend an existing one before adding the + * ENOENT terminator. + */ + if (drl[ds].lcn == LCN_ENOENT) { + ds--; + slots = 1; + } + if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { + /* Add an unmapped runlist element. */ + if (!slots) { + drl = ntfs_rl_realloc_nofail(drl, ds, + ds + 2); + slots = 2; + } + ds++; + /* Need to set vcn if it isn't set already. */ + if (slots != 1) + drl[ds].vcn = drl[ds - 1].vcn + + drl[ds - 1].length; + drl[ds].lcn = LCN_RL_NOT_MAPPED; + /* We now used up a slot. */ + slots--; + } + drl[ds].length = marker_vcn - drl[ds].vcn; + /* Finally add the ENOENT terminator. */ + ds++; + if (!slots) + drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); + drl[ds].vcn = marker_vcn; + drl[ds].lcn = LCN_ENOENT; + drl[ds].length = (s64)0; + } + } + } + +finished: + /* The merge was completed successfully. */ + ntfs_debug("Merged runlist:"); + ntfs_debug_dump_runlist(drl); + return drl; +} + +/** + * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist + * @vol: ntfs volume on which the attribute resides + * @attr: attribute record whose mapping pairs array to decompress + * @old_rl: optional runlist in which to insert @attr's runlist + * + * It is up to the caller to serialize access to the runlist @old_rl. + * + * Decompress the attribute @attr's mapping pairs array into a runlist. On + * success, return the decompressed runlist. + * + * If @old_rl is not NULL, decompressed runlist is inserted into the + * appropriate place in @old_rl and the resultant, combined runlist is + * returned. The original @old_rl is deallocated. + * + * On error, return -errno. @old_rl is left unmodified in that case. + * + * The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EIO - Corrupt runlist. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The two runlists overlap. + * + * FIXME: For now we take the conceptionally simplest approach of creating the + * new runlist disregarding the already existing one and then splicing the + * two into one, if that is possible (we check for overlap and discard the new + * runlist if overlap present before returning ERR_PTR(-ERANGE)). 
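To make the mapping pairs format concrete before the decompression code below: each pair starts with a header byte whose low nibble is the byte count of the run length and whose high nibble is the byte count of the signed lcn delta (zero for a sparse run on NTFS 3.0+); a zero header byte terminates the array. A minimal userspace decoding sketch over a hand-built buffer (illustrative values only, and unlike ntfs_mapping_pairs_decompress() it omits all corruption checking):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/*
	 * Hand-built example: 0x20 clusters starting at lcn 0x120, then a
	 * 0x10 cluster sparse run, then the terminating zero header byte.
	 */
	uint8_t buf[] = { 0x21, 0x20, 0x20, 0x01, 0x01, 0x10, 0x00 };
	uint8_t *p = buf;
	int64_t vcn = 0, lcn = 0;

	while (*p) {
		int len_len = *p & 0xf;		/* bytes holding the run length */
		int lcn_len = (*p >> 4) & 0xf;	/* bytes holding the signed lcn delta */
		int64_t length = 0, delta;
		int i;

		/* Little endian; the driver reads this as a signed value, which
		   makes no difference for these sample values. */
		for (i = 0; i < len_len; i++)
			length |= (int64_t)p[1 + i] << (8 * i);
		if (!lcn_len) {
			printf("vcn 0x%llx: sparse, 0x%llx clusters\n",
					(unsigned long long)vcn,
					(unsigned long long)length);
		} else {
			/* Sign extend from the most significant delta byte. */
			delta = (int8_t)p[len_len + lcn_len];
			for (i = lcn_len - 1; i > 0; i--)
				delta = (delta << 8) | p[len_len + i];
			lcn += delta;
			printf("vcn 0x%llx: lcn 0x%llx, 0x%llx clusters\n",
					(unsigned long long)vcn,
					(unsigned long long)lcn,
					(unsigned long long)length);
		}
		vcn += length;
		p += 1 + len_len + lcn_len;
	}
	return 0;
}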
+ */ +runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl) +{ + VCN vcn; /* Current vcn. */ + LCN lcn; /* Current lcn. */ + s64 deltaxcn; /* Change in [vl]cn. */ + runlist_element *rl; /* The output runlist. */ + u8 *buf; /* Current position in mapping pairs array. */ + u8 *attr_end; /* End of attribute. */ + int rlsize; /* Size of runlist buffer. */ + u16 rlpos; /* Current runlist position in units of + runlist_elements. */ + u8 b; /* Current byte offset in buf. */ + +#ifdef DEBUG + /* Make sure attr exists and is non-resident. */ + if (!attr || !attr->non_resident || sle64_to_cpu( + attr->data.non_resident.lowest_vcn) < (VCN)0) { + ntfs_error(vol->sb, "Invalid arguments."); + return ERR_PTR(-EINVAL); + } +#endif + /* Start at vcn = lowest_vcn and lcn 0. */ + vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); + lcn = 0; + /* Get start of the mapping pairs array. */ + buf = (u8*)attr + le16_to_cpu( + attr->data.non_resident.mapping_pairs_offset); + attr_end = (u8*)attr + le32_to_cpu(attr->length); + if (unlikely(buf < (u8*)attr || buf > attr_end)) { + ntfs_error(vol->sb, "Corrupt attribute."); + return ERR_PTR(-EIO); + } + /* If the mapping pairs array is valid but empty, nothing to do. */ + if (!vcn && !*buf) + return old_rl; + /* Current position in runlist array. */ + rlpos = 0; + /* Allocate first page and set current runlist size to one page. */ + rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE); + if (unlikely(!rl)) + return ERR_PTR(-ENOMEM); + /* Insert unmapped starting element if necessary. */ + if (vcn) { + rl->vcn = 0; + rl->lcn = LCN_RL_NOT_MAPPED; + rl->length = vcn; + rlpos++; + } + while (buf < attr_end && *buf) { + /* + * Allocate more memory if needed, including space for the + * not-mapped and terminator elements. ntfs_malloc_nofs() + * operates on whole pages only. + */ + if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { + runlist_element *rl2; + + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + ntfs_free(rl); + return ERR_PTR(-ENOMEM); + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + } + /* Enter the current vcn into the current runlist element. */ + rl[rlpos].vcn = vcn; + /* + * Get the change in vcn, i.e. the run length in clusters. + * Doing it this way ensures that we signextend negative values. + * A negative run length doesn't make any sense, but hey, I + * didn't make up the NTFS specs and Windows NT4 treats the run + * length as a signed value so that's how it is... + */ + b = *buf & 0xf; + if (b) { + if (unlikely(buf + b > attr_end)) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + } else { /* The length entry is compulsory. */ + ntfs_error(vol->sb, "Missing length entry in mapping " + "pairs array."); + deltaxcn = (s64)-1; + } + /* + * Assume a negative length to indicate data corruption and + * hence clean-up and return NULL. + */ + if (unlikely(deltaxcn < 0)) { + ntfs_error(vol->sb, "Invalid length in mapping pairs " + "array."); + goto err_out; + } + /* + * Enter the current run length into the current runlist + * element. + */ + rl[rlpos].length = deltaxcn; + /* Increment the current vcn by the current run length. */ + vcn += deltaxcn; + /* + * There might be no lcn change at all, as is the case for + * sparse clusters on NTFS 3.0+, in which case we set the lcn + * to LCN_HOLE. 
+ */ + if (!(*buf & 0xf0)) + rl[rlpos].lcn = LCN_HOLE; + else { + /* Get the lcn change which really can be negative. */ + u8 b2 = *buf & 0xf; + b = b2 + ((*buf >> 4) & 0xf); + if (buf + b > attr_end) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b > b2; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + /* Change the current lcn to its new value. */ + lcn += deltaxcn; +#ifdef DEBUG + /* + * On NTFS 1.2-, apparently can have lcn == -1 to + * indicate a hole. But we haven't verified ourselves + * whether it is really the lcn or the deltaxcn that is + * -1. So if either is found give us a message so we + * can investigate it further! + */ + if (vol->major_ver < 3) { + if (unlikely(deltaxcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn delta == -1"); + if (unlikely(lcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn == -1"); + } +#endif + /* Check lcn is not below -1. */ + if (unlikely(lcn < (LCN)-1)) { + ntfs_error(vol->sb, "Invalid LCN < -1 in " + "mapping pairs array."); + goto err_out; + } + /* Enter the current lcn into the runlist element. */ + rl[rlpos].lcn = lcn; + } + /* Get to the next runlist element. */ + rlpos++; + /* Increment the buffer position to the next mapping pair. */ + buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; + } + if (unlikely(buf >= attr_end)) + goto io_error; + /* + * If there is a highest_vcn specified, it must be equal to the final + * vcn in the runlist - 1, or something has gone badly wrong. + */ + deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { +mpa_err: + ntfs_error(vol->sb, "Corrupt mapping pairs array in " + "non-resident attribute."); + goto err_out; + } + /* Setup not mapped runlist element if this is the base extent. */ + if (!attr->data.non_resident.lowest_vcn) { + VCN max_cluster; + + max_cluster = ((sle64_to_cpu( + attr->data.non_resident.allocated_size) + + vol->cluster_size - 1) >> + vol->cluster_size_bits) - 1; + /* + * A highest_vcn of zero means this is a single extent + * attribute so simply terminate the runlist with LCN_ENOENT). + */ + if (deltaxcn) { + /* + * If there is a difference between the highest_vcn and + * the highest cluster, the runlist is either corrupt + * or, more likely, there are more extents following + * this one. + */ + if (deltaxcn < max_cluster) { + ntfs_debug("More extents to follow; deltaxcn " + "= 0x%llx, max_cluster = " + "0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + rl[rlpos].vcn = vcn; + vcn += rl[rlpos].length = max_cluster - + deltaxcn; + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + rlpos++; + } else if (unlikely(deltaxcn > max_cluster)) { + ntfs_error(vol->sb, "Corrupt attribute. " + "deltaxcn = 0x%llx, " + "max_cluster = 0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + goto mpa_err; + } + } + rl[rlpos].lcn = LCN_ENOENT; + } else /* Not the base extent. There may be more extents to follow. */ + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + + /* Setup terminating runlist element. */ + rl[rlpos].vcn = vcn; + rl[rlpos].length = (s64)0; + /* If no existing runlist was specified, we are done. */ + if (!old_rl) { + ntfs_debug("Mapping pairs array successfully decompressed:"); + ntfs_debug_dump_runlist(rl); + return rl; + } + /* Now combine the new and old runlists checking for overlaps. 
*/ + old_rl = ntfs_runlists_merge(old_rl, rl); + if (likely(!IS_ERR(old_rl))) + return old_rl; + ntfs_free(rl); + ntfs_error(vol->sb, "Failed to merge runlists."); + return old_rl; +io_error: + ntfs_error(vol->sb, "Corrupt attribute."); +err_out: + ntfs_free(rl); + return ERR_PTR(-EIO); +} + +/** + * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist + * @rl: runlist to use for conversion + * @vcn: vcn to convert + * + * Convert the virtual cluster number @vcn of an attribute into a logical + * cluster number (lcn) of a device using the runlist @rl to map vcns to their + * corresponding lcns. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ================================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_RL_NOT_MAPPED This is part of the runlist which has not been + * inserted into the runlist yet. + * LCN_ENOENT There is no such vcn in the attribute. + * + * Locking: - The caller must have locked the runlist (for reading or writing). + * - This function does not touch the lock, nor does it modify the + * runlist. + */ +LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) +{ + int i; + + BUG_ON(vcn < 0); + /* + * If rl is NULL, assume that we have found an unmapped runlist. The + * caller can then attempt to map it and fail appropriately if + * necessary. + */ + if (unlikely(!rl)) + return LCN_RL_NOT_MAPPED; + + /* Catch out of lower bounds vcn. */ + if (unlikely(vcn < rl[0].vcn)) + return LCN_ENOENT; + + for (i = 0; likely(rl[i].length); i++) { + if (unlikely(vcn < rl[i+1].vcn)) { + if (likely(rl[i].lcn >= (LCN)0)) + return rl[i].lcn + (vcn - rl[i].vcn); + return rl[i].lcn; + } + } + /* + * The terminator element is setup to the correct value, i.e. one of + * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. + */ + if (likely(rl[i].lcn < (LCN)0)) + return rl[i].lcn; + /* Just in case... We could replace this with BUG() some day. */ + return LCN_ENOENT; +} + +#ifdef NTFS_RW + +/** + * ntfs_rl_find_vcn_nolock - find a vcn in a runlist + * @rl: runlist to search + * @vcn: vcn to find + * + * Find the virtual cluster number @vcn in the runlist @rl and return the + * address of the runlist element containing the @vcn on success. + * + * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of + * the runlist. + * + * Locking: The runlist must be locked on entry. + */ +runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +{ + BUG_ON(vcn < 0); + if (unlikely(!rl || vcn < rl[0].vcn)) + return NULL; + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) + return rl; + return NULL; + } + rl++; + } + if (likely(rl->lcn == LCN_ENOENT)) + return rl; + return NULL; +} + +/** + * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number + * @n: number for which to get the number of bytes for + * + * Return the number of bytes required to store @n unambiguously as + * a signed number. + * + * This is used in the context of the mapping pairs array to determine how + * many bytes will be needed in the array to store a given logical cluster + * number (lcn) or a specific run length. + * + * Return the number of bytes written. This function cannot fail. 
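A quick numeric illustration of the byte-count rule documented above, as a userspace restatement of the logic in the helper that follows (the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Same logic as ntfs_get_nr_significant_bytes(), restated for userspace. */
static int nr_significant_bytes(int64_t n)
{
	int64_t l = n;
	int i = 0;
	int8_t j;

	do {
		l >>= 8;
		i++;
	} while (l != 0 && l != -1);
	j = (n >> 8 * (i - 1)) & 0xff;
	/* If the top stored byte has the wrong sign, an extra byte is needed. */
	if ((n < 0 && j >= 0) || (n > 0 && j < 0))
		i++;
	return i;
}

int main(void)
{
	/* 0x50 fits in one signed byte; 0x80 does not (it would read as -128). */
	printf("%d %d %d %d\n",
			nr_significant_bytes(0x50),	/* 1 */
			nr_significant_bytes(0x80),	/* 2 */
			nr_significant_bytes(-1),	/* 1 */
			nr_significant_bytes(-0x100));	/* 2 */
	return 0;
}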
+ */ +static inline int ntfs_get_nr_significant_bytes(const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if ((n < 0 && j >= 0) || (n > 0 && j < 0)) + i++; + return i; +} + +/** + * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array + * @vol: ntfs volume (needed for the ntfs version) + * @rl: locked runlist to determine the size of the mapping pairs of + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * + * Walk the locked runlist @rl and calculate the size in bytes of the mapping + * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and + * finishing with vcn @last_vcn. + * + * A @last_vcn of -1 means end of runlist and in that case the size of the + * mapping pairs array corresponding to the runlist starting at vcn @first_vcn + * and finishing at the end of the runlist is determined. + * + * This for example allows us to allocate a buffer of the right size when + * building the mapping pairs array. + * + * If @rl is NULL, just return 1 (for the single terminator byte). + * + * Return the calculated size in bytes on success. On error, return -errno. + * The following error codes are defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn) +{ + LCN prev_lcn; + int rls; + bool the_end = false; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + return 1; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + prev_lcn = 0; + /* Always need the termining zero byte. */ + rls = 1; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length - delta); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(prev_lcn); + } + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. 
*/ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(rl->lcn - + prev_lcn); + prev_lcn = rl->lcn; + } + } + return rls; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + rls = -EINVAL; + else + rls = -EIO; + return rls; +} + +/** + * ntfs_write_significant_bytes - write the significant bytes of a number + * @dst: destination buffer to write to + * @dst_max: pointer to last byte of destination buffer for bounds checking + * @n: number whose significant bytes to write + * + * Store in @dst, the minimum bytes of the number @n which are required to + * identify @n unambiguously as a signed number, taking care not to exceed + * @dest_max, the maximum position within @dst to which we are allowed to + * write. + * + * This is used when building the mapping pairs array of a runlist to compress + * a given logical cluster number (lcn) or a specific run length to the minimum + * size possible. + * + * Return the number of bytes written on success. On error, i.e. the + * destination buffer @dst is too small, return -ENOSPC. + */ +static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, + const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + if (unlikely(dst > dst_max)) + goto err_out; + *dst++ = l & 0xffll; + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if (n < 0 && j >= 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)-1; + } else if (n > 0 && j < 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)0; + } + return i; +err_out: + return -ENOSPC; +} + +/** + * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist + * @vol: ntfs volume (needed for the ntfs version) + * @dst: destination buffer to which to write the mapping pairs array + * @dst_len: size of destination buffer @dst in bytes + * @rl: locked runlist for which to build the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC + * + * Create the mapping pairs array from the locked runlist @rl, starting at vcn + * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. + * @dst_len is the size of @dst in bytes and it should be at least equal to the + * value obtained by calling ntfs_get_size_for_mapping_pairs(). + * + * A @last_vcn of -1 means end of runlist and in that case the mapping pairs + * array corresponding to the runlist starting at vcn @first_vcn and finishing + * at the end of the runlist is created. 
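+ *
+ * Each mapping pair is stored as a header byte whose low nibble is the
+ * number of length bytes and whose high nibble is the number of lcn change
+ * bytes, followed by those bytes as sign-extended, little-endian values
+ * (see ntfs_write_significant_bytes() above).  As an illustrative example,
+ * a fully mapped runlist consisting of the single run { vcn 0, lcn 0x30,
+ * length 0x20 } is encoded as the bytes 0x11 0x20 0x30 followed by the
+ * terminator byte 0x00.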
+ * + * If @rl is NULL, just write a single terminator byte to @dst. + * + * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to + * the first vcn outside the destination buffer. Note that on error, @dst has + * been filled with all the mapping pairs that will fit, thus it can be treated + * as partial success, in that a new attribute extent needs to be created or + * the next extent has to be used and the mapping pairs build has to be + * continued with @first_vcn set to *@stop_vcn. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * -ENOSPC - The destination buffer is too small. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) +{ + LCN prev_lcn; + s8 *dst_max, *dst_next; + int err = -ENOSPC; + bool the_end = false; + s8 len_len, lcn_len; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + BUG_ON(dst_len < 1); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + if (stop_vcn) + *stop_vcn = 0; + /* Terminator byte. */ + *dst = 0; + return 0; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + /* + * @dst_max is used for bounds checking in + * ntfs_write_significant_bytes(). + */ + dst_max = dst + dst_len - 1; + prev_lcn = 0; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length - delta); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. 
*/ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, rl->lcn - prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + prev_lcn = rl->lcn; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + } + /* Success. */ + err = 0; +size_err: + /* Set stop vcn. */ + if (stop_vcn) + *stop_vcn = rl->vcn; + /* Add terminator byte. */ + *dst = 0; + return err; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + err = -EINVAL; + else + err = -EIO; + return err; +} + +/** + * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to truncate + * @new_length: the new length of the runlist in VCNs + * + * Truncate the runlist described by @runlist as well as the memory buffer + * holding the runlist elements to a length of @new_length VCNs. + * + * If @new_length lies within the runlist, the runlist elements with VCNs of + * @new_length and above are discarded. As a special case if @new_length is + * zero, the runlist is discarded and set to NULL. + * + * If @new_length lies beyond the runlist, a sparse runlist element is added to + * the end of the runlist @runlist or if the last runlist element is a sparse + * one already, this is extended. + * + * Note, no checking is done for unmapped runlist elements. It is assumed that + * the caller has mapped any elements that need to be mapped already. + * + * Return 0 on success and -errno on error. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, + const s64 new_length) +{ + runlist_element *rl; + int old_size; + + ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); + BUG_ON(!runlist); + BUG_ON(new_length < 0); + rl = runlist->rl; + if (!new_length) { + ntfs_debug("Freeing runlist."); + runlist->rl = NULL; + if (rl) + ntfs_free(rl); + return 0; + } + if (unlikely(!rl)) { + /* + * Create a runlist consisting of a sparse runlist element of + * length @new_length followed by a terminator runlist element. 
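+ *
+ * The layout set up below is:
+ *   rl[0] = { .vcn = 0,          .lcn = LCN_HOLE,   .length = new_length }
+ *   rl[1] = { .vcn = new_length, .lcn = LCN_ENOENT, .length = 0 }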
+ */ + rl = ntfs_malloc_nofs(PAGE_SIZE); + if (unlikely(!rl)) { + ntfs_error(vol->sb, "Not enough memory to allocate " + "runlist element buffer."); + return -ENOMEM; + } + runlist->rl = rl; + rl[1].length = rl->vcn = 0; + rl->lcn = LCN_HOLE; + rl[1].vcn = rl->length = new_length; + rl[1].lcn = LCN_ENOENT; + return 0; + } + BUG_ON(new_length < rl->vcn); + /* Find @new_length in the runlist. */ + while (likely(rl->length && new_length >= rl[1].vcn)) + rl++; + /* + * If not at the end of the runlist we need to shrink it. + * If at the end of the runlist we need to expand it. + */ + if (rl->length) { + runlist_element *trl; + bool is_end; + + ntfs_debug("Shrinking runlist."); + /* Determine the runlist size. */ + trl = rl + 1; + while (likely(trl->length)) + trl++; + old_size = trl - runlist->rl + 1; + /* Truncate the run. */ + rl->length = new_length - rl->vcn; + /* + * If a run was partially truncated, make the following runlist + * element a terminator. + */ + is_end = false; + if (rl->length) { + rl++; + if (!rl->length) + is_end = true; + rl->vcn = new_length; + rl->length = 0; + } + rl->lcn = LCN_ENOENT; + /* Reallocate memory if necessary. */ + if (!is_end) { + int new_size = rl - runlist->rl + 1; + rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { + ntfs_debug("Expanding runlist."); + /* + * If there is a previous runlist element and it is a sparse + * one, extend it. Otherwise need to add a new, sparse runlist + * element. + */ + if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) + (rl - 1)->length = new_length - (rl - 1)->vcn; + else { + /* Determine the runlist size. */ + old_size = rl - runlist->rl + 1; + /* Reallocate memory if necessary. */ + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size + 1); + if (IS_ERR(rl)) { + ntfs_error(vol->sb, "Failed to expand runlist " + "buffer, aborting."); + return PTR_ERR(rl); + } + runlist->rl = rl; + /* + * Set @rl to the same runlist element in the new + * runlist as before in the old runlist. + */ + rl += old_size - 1; + /* Add a new, sparse runlist element. */ + rl->lcn = LCN_HOLE; + rl->length = new_length - rl->vcn; + /* Add a new terminator runlist element. */ + rl++; + rl->length = 0; + } + rl->vcn = new_length; + rl->lcn = LCN_ENOENT; + } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { + /* Runlist already has same size as requested. */ + rl->lcn = LCN_ENOENT; + } + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_rl_punch_nolock - punch a hole into a runlist + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to punch a hole into + * @start: starting VCN of the hole to be created + * @length: size of the hole to be created in units of clusters + * + * Punch a hole into the runlist @runlist starting at VCN @start and of size + * @length clusters. + * + * Return 0 on success and -errno on error, in which case @runlist has not been + * modified. + * + * If @start and/or @start + @length are outside the runlist return error code + * -ENOENT. + * + * If the runlist contains unmapped or error elements between @start and @start + * + @length return error code -EINVAL. + * + * Locking: The caller must hold @runlist->lock for writing. 
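+ *
+ * As an illustrative example, punching a hole with @start 3 and @length 4
+ * into a fully mapped runlist holding the single run { vcn 0, lcn 500,
+ * length 10 } splits it into the three runs { vcn 0, lcn 500, length 3 },
+ * { vcn 3, lcn LCN_HOLE, length 4 } and { vcn 7, lcn 507, length 3 },
+ * followed by the LCN_ENOENT terminator.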
+ */ +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length) +{ + const VCN end = start + length; + s64 delta; + runlist_element *rl, *rl_end, *rl_real_end, *trl; + int old_size; + bool lcn_fixup = false; + + ntfs_debug("Entering for start 0x%llx, length 0x%llx.", + (long long)start, (long long)length); + BUG_ON(!runlist); + BUG_ON(start < 0); + BUG_ON(length < 0); + BUG_ON(end < 0); + rl = runlist->rl; + if (unlikely(!rl)) { + if (likely(!start && !length)) + return 0; + return -ENOENT; + } + /* Find @start in the runlist. */ + while (likely(rl->length && start >= rl[1].vcn)) + rl++; + rl_end = rl; + /* Find @end in the runlist. */ + while (likely(rl_end->length && end >= rl_end[1].vcn)) { + /* Verify there are no unmapped or error elements. */ + if (unlikely(rl_end->lcn < LCN_HOLE)) + return -EINVAL; + rl_end++; + } + /* Check the last element. */ + if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) + return -EINVAL; + /* This covers @start being out of bounds, too. */ + if (!rl_end->length && end > rl_end->vcn) + return -ENOENT; + if (!length) + return 0; + if (!rl->length) + return -ENOENT; + rl_real_end = rl_end; + /* Determine the runlist size. */ + while (likely(rl_real_end->length)) + rl_real_end++; + old_size = rl_real_end - runlist->rl + 1; + /* If @start is in a hole simply extend the hole. */ + if (rl->lcn == LCN_HOLE) { + /* + * If both @start and @end are in the same sparse run, we are + * done. + */ + if (end <= rl[1].vcn) { + ntfs_debug("Done (requested hole is already sparse)."); + return 0; + } +extend_hole: + /* Extend the hole. */ + rl->length = end - rl->vcn; + /* If @end is in a hole, merge it with the current one. */ + if (rl_end->lcn == LCN_HOLE) { + rl_end++; + rl->length = rl_end->vcn - rl->vcn; + } + /* We have done the hole. Now deal with the remaining tail. */ + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Adjust the beginning of the tail if necessary. */ + if (end > rl->vcn) { + delta = end - rl->vcn; + rl->vcn = end; + rl->length -= delta; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0) + rl->lcn += delta; + } +shrink_allocation: + /* Reallocate memory if the allocation changed. */ + if (rl < rl_end) { + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size - (rl_end - rl)); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + ntfs_debug("Done (extend hole)."); + return 0; + } + /* + * If @start is at the beginning of a run things are easier as there is + * no need to split the first run. + */ + if (start == rl->vcn) { + /* + * @start is at the beginning of a run. + * + * If the previous run is sparse, extend its hole. + * + * If @end is not in the same run, switch the run to be sparse + * and extend the newly created hole. + * + * Thus both of these cases reduce the problem to the above + * case of "@start is in a hole". + */ + if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { + rl--; + goto extend_hole; + } + if (end >= rl[1].vcn) { + rl->lcn = LCN_HOLE; + goto extend_hole; + } + /* + * The final case is when @end is in the same run as @start. + * For this need to split the run into two. One run for the + * sparse region between the beginning of the old run, i.e. 
+ * @start, and @end and one for the remaining non-sparse + * region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } +split_end: + /* Shift all the runs up by one. */ + memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the two split runs. */ + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + rl->vcn += length; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0 || lcn_fixup) + rl->lcn += length; + rl->length -= length; + ntfs_debug("Done (split one)."); + return 0; + } + /* + * @start is neither in a hole nor at the beginning of a run. + * + * If @end is in a hole, things are easier as simply truncating the run + * @start is in to end at @start - 1, deleting all runs after that up + * to @end, and finally extending the beginning of the run @end is in + * to be @start is all that is needed. + */ + if (rl_end->lcn == LCN_HOLE) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Extend the beginning of the run @end is in to be @start. */ + rl->vcn = start; + rl->length = rl[1].vcn - start; + goto shrink_allocation; + } + /* + * If @end is not in a hole there are still two cases to distinguish. + * Either @end is or is not in the same run as @start. + * + * The second case is easier as it can be reduced to an already solved + * problem by truncating the run @start is in to end at @start - 1. + * Then, if @end is in the next run need to split the run into a sparse + * run followed by a non-sparse run (already covered above) and if @end + * is not in the next run switching it to be sparse, again reduces the + * problem to the already covered case of "@start is in a hole". + */ + if (end >= rl[1].vcn) { + /* + * If @end is not in the next run, reduce the problem to the + * case of "@start is in a hole". + */ + if (rl[1].length && end >= rl[2].vcn) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + goto extend_hole; + } + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* + * @end is in the next run, reduce the problem to the case + * where "@start is at the beginning of a run and @end is in + * the same run as @start". + */ + delta = rl->vcn - start; + rl->vcn = start; + if (rl->lcn >= 0) { + rl->lcn -= delta; + /* Need this in case the lcn just became negative. */ + lcn_fixup = true; + } + rl->length += delta; + goto split_end; + } + /* + * The first case from above, i.e. @end is in the same run as @start. + * We need to split the run into three. One run for the non-sparse + * region between the beginning of the old run and @start, one for the + * sparse region between @start and @end, and one for the remaining + * non-sparse region, i.e. between @end and the end of the old run. 
+ */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); + if (IS_ERR(trl)) + goto enomem_out; + old_size += 2; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Shift all the runs up by two. */ + memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the three split runs. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + delta = end - rl->vcn; + rl->vcn = end; + rl->lcn += delta; + rl->length -= delta; + ntfs_debug("Done (split both)."); + return 0; +enomem_out: + ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); + return -ENOMEM; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h new file mode 100644 index 00000000..47728fbb --- /dev/null +++ b/fs/ntfs/runlist.h @@ -0,0 +1,102 @@ +/* + * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_RUNLIST_H +#define _LINUX_NTFS_RUNLIST_H + +#include "types.h" +#include "layout.h" +#include "volume.h" + +/** + * runlist_element - in memory vcn to lcn mapping array element + * @vcn: starting vcn of the current array element + * @lcn: starting lcn of the current array element + * @length: length in clusters of the current array element + * + * The last vcn (in fact the last vcn + 1) is reached when length == 0. + * + * When lcn == -1 this means that the count vcns starting at vcn are not + * physically allocated (i.e. this is a hole / data is sparse). + */ +typedef struct { /* In memory vcn to lcn mapping structure element. */ + VCN vcn; /* vcn = Starting virtual cluster number. */ + LCN lcn; /* lcn = Starting logical cluster number. */ + s64 length; /* Run length in clusters. */ +} runlist_element; + +/** + * runlist - in memory vcn to lcn mapping array including a read/write lock + * @rl: pointer to an array of runlist elements + * @lock: read/write spinlock for serializing access to @rl + * + */ +typedef struct { + runlist_element *rl; + struct rw_semaphore lock; +} runlist; + +static inline void ntfs_init_runlist(runlist *rl) +{ + rl->rl = NULL; + init_rwsem(&rl->lock); +} + +typedef enum { + LCN_HOLE = -1, /* Keep this as highest value or die! 
*/ + LCN_RL_NOT_MAPPED = -2, + LCN_ENOENT = -3, + LCN_ENOMEM = -4, + LCN_EIO = -5, +} LCN_SPECIAL_VALUES; + +extern runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl); + +extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl); + +extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); + +#ifdef NTFS_RW + +extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, + const VCN vcn); + +extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn); + +extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); + +extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, + runlist *const runlist, const s64 new_length); + +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c new file mode 100644 index 00000000..b52706da --- /dev/null +++ b/fs/ntfs/super.c @@ -0,0 +1,3206 @@ +/* + * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001,2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include /* For bdev_logical_block_size(). */ +#include +#include +#include +#include +#include + +#include "sysctl.h" +#include "logfile.h" +#include "quota.h" +#include "usnjrnl.h" +#include "dir.h" +#include "debug.h" +#include "index.h" +#include "inode.h" +#include "aops.h" +#include "layout.h" +#include "malloc.h" +#include "ntfs.h" + +/* Number of mounted filesystems which have compression enabled. */ +static unsigned long ntfs_nr_compression_users; + +/* A global default upcase table and a corresponding reference count. */ +static ntfschar *default_upcase = NULL; +static unsigned long ntfs_nr_upcase_users = 0; + +/* Error constants/strings used in inode.c::ntfs_show_options(). */ +typedef enum { + /* One of these must be present, default is ON_ERRORS_CONTINUE. */ + ON_ERRORS_PANIC = 0x01, + ON_ERRORS_REMOUNT_RO = 0x02, + ON_ERRORS_CONTINUE = 0x04, + /* Optional, can be combined with any of the above. 
*/ + ON_ERRORS_RECOVER = 0x10, +} ON_ERRORS_ACTIONS; + +const option_t on_errors_arr[] = { + { ON_ERRORS_PANIC, "panic" }, + { ON_ERRORS_REMOUNT_RO, "remount-ro", }, + { ON_ERRORS_CONTINUE, "continue", }, + { ON_ERRORS_RECOVER, "recover" }, + { 0, NULL } +}; + +/** + * simple_getbool - + * + * Copied from old ntfs driver (which copied from vfat driver). + */ +static int simple_getbool(char *s, bool *setval) +{ + if (s) { + if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) + *setval = true; + else if (!strcmp(s, "0") || !strcmp(s, "no") || + !strcmp(s, "false")) + *setval = false; + else + return 0; + } else + *setval = true; + return 1; +} + +/** + * parse_options - parse the (re)mount options + * @vol: ntfs volume + * @opt: string containing the (re)mount options + * + * Parse the recognized options in @opt for the ntfs volume described by @vol. + */ +static bool parse_options(ntfs_volume *vol, char *opt) +{ + char *p, *v, *ov; + static char *utf8 = "utf8"; + int errors = 0, sloppy = 0; + uid_t uid = (uid_t)-1; + gid_t gid = (gid_t)-1; + mode_t fmask = (mode_t)-1, dmask = (mode_t)-1; + int mft_zone_multiplier = -1, on_errors = -1; + int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; + struct nls_table *nls_map = NULL, *old_nls; + + /* I am lazy... (-8 */ +#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + variable = default_value; \ + else { \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } \ + } +#define NTFS_GETOPT(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_OCTAL(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 8); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_BOOL(option, variable) \ + if (!strcmp(p, option)) { \ + bool val; \ + if (!simple_getbool(v, &val)) \ + goto needs_bool; \ + variable = val; \ + } +#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ + if (!strcmp(p, option)) { \ + int _i; \ + if (!v || !*v) \ + goto needs_arg; \ + ov = v; \ + if (variable == -1) \ + variable = 0; \ + for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ + if (!strcmp(opt_array[_i].str, v)) { \ + variable |= opt_array[_i].val; \ + break; \ + } \ + if (!opt_array[_i].str || !*opt_array[_i].str) \ + goto needs_val; \ + } + if (!opt || !*opt) + goto no_mount_options; + ntfs_debug("Entering with mount options string: %s", opt); + while ((p = strsep(&opt, ","))) { + if ((v = strchr(p, '='))) + *v++ = 0; + NTFS_GETOPT("uid", uid) + else NTFS_GETOPT("gid", gid) + else NTFS_GETOPT_OCTAL("umask", fmask = dmask) + else NTFS_GETOPT_OCTAL("fmask", fmask) + else NTFS_GETOPT_OCTAL("dmask", dmask) + else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) + else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) + else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) + else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) + else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) + else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, + on_errors_arr) + else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) + ntfs_warning(vol->sb, "Ignoring obsolete option %s.", + p); + else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { + if (!strcmp(p, "iocharset")) + ntfs_warning(vol->sb, "Option iocharset is " + "deprecated. 
Please use " + "option nls= in " + "the future."); + if (!v || !*v) + goto needs_arg; +use_utf8: + old_nls = nls_map; + nls_map = load_nls(v); + if (!nls_map) { + if (!old_nls) { + ntfs_error(vol->sb, "NLS character set " + "%s not found.", v); + return false; + } + ntfs_error(vol->sb, "NLS character set %s not " + "found. Using previous one %s.", + v, old_nls->charset); + nls_map = old_nls; + } else /* nls_map */ { + unload_nls(old_nls); + } + } else if (!strcmp(p, "utf8")) { + bool val = false; + ntfs_warning(vol->sb, "Option utf8 is no longer " + "supported, using option nls=utf8. Please " + "use option nls=utf8 in the future and " + "make sure utf8 is compiled either as a " + "module or into the kernel."); + if (!v || !*v) + val = true; + else if (!simple_getbool(v, &val)) + goto needs_bool; + if (val) { + v = utf8; + goto use_utf8; + } + } else { + ntfs_error(vol->sb, "Unrecognized mount option %s.", p); + if (errors < INT_MAX) + errors++; + } +#undef NTFS_GETOPT_OPTIONS_ARRAY +#undef NTFS_GETOPT_BOOL +#undef NTFS_GETOPT +#undef NTFS_GETOPT_WITH_DEFAULT + } +no_mount_options: + if (errors && !sloppy) + return false; + if (sloppy) + ntfs_warning(vol->sb, "Sloppy option given. Ignoring " + "unrecognized mount option(s) and continuing."); + /* Keep this first! */ + if (on_errors != -1) { + if (!on_errors) { + ntfs_error(vol->sb, "Invalid errors option argument " + "or bug in options parser."); + return false; + } + } + if (nls_map) { + if (vol->nls_map && vol->nls_map != nls_map) { + ntfs_error(vol->sb, "Cannot change NLS character set " + "on remount."); + return false; + } /* else (!vol->nls_map) */ + ntfs_debug("Using NLS character set %s.", nls_map->charset); + vol->nls_map = nls_map; + } else /* (!nls_map) */ { + if (!vol->nls_map) { + vol->nls_map = load_nls_default(); + if (!vol->nls_map) { + ntfs_error(vol->sb, "Failed to load default " + "NLS character set."); + return false; + } + ntfs_debug("Using default NLS character set (%s).", + vol->nls_map->charset); + } + } + if (mft_zone_multiplier != -1) { + if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != + mft_zone_multiplier) { + ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " + "on remount."); + return false; + } + if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { + ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " + "Using default value, i.e. 
1."); + mft_zone_multiplier = 1; + } + vol->mft_zone_multiplier = mft_zone_multiplier; + } + if (!vol->mft_zone_multiplier) + vol->mft_zone_multiplier = 1; + if (on_errors != -1) + vol->on_errors = on_errors; + if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) + vol->on_errors |= ON_ERRORS_CONTINUE; + if (uid != (uid_t)-1) + vol->uid = uid; + if (gid != (gid_t)-1) + vol->gid = gid; + if (fmask != (mode_t)-1) + vol->fmask = fmask; + if (dmask != (mode_t)-1) + vol->dmask = dmask; + if (show_sys_files != -1) { + if (show_sys_files) + NVolSetShowSystemFiles(vol); + else + NVolClearShowSystemFiles(vol); + } + if (case_sensitive != -1) { + if (case_sensitive) + NVolSetCaseSensitive(vol); + else + NVolClearCaseSensitive(vol); + } + if (disable_sparse != -1) { + if (disable_sparse) + NVolClearSparseEnabled(vol); + else { + if (!NVolSparseEnabled(vol) && + vol->major_ver && vol->major_ver < 3) + ntfs_warning(vol->sb, "Not enabling sparse " + "support due to NTFS volume " + "version %i.%i (need at least " + "version 3.0).", vol->major_ver, + vol->minor_ver); + else + NVolSetSparseEnabled(vol); + } + } + return true; +needs_arg: + ntfs_error(vol->sb, "The %s option requires an argument.", p); + return false; +needs_bool: + ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); + return false; +needs_val: + ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); + return false; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_volume_flags - write new flags to the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: new flags value for the volume information flags + * + * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() + * instead (see below). + * + * Replace the volume information flags on the volume @vol with the value + * supplied in @flags. Note, this overwrites the volume information flags, so + * make sure to combine the flags you want to modify with the old flags and use + * the result when calling ntfs_write_volume_flags(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +{ + ntfs_inode *ni = NTFS_I(vol->vol_ino); + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; + int err; + + ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", + le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); + if (vol->vol_flags == flags) + goto done; + BUG_ON(!ni); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto put_unm_err_out; + } + err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (err) + goto put_unm_err_out; + vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + vol->vol_flags = vi->flags = flags; + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +done: + ntfs_debug("Done."); + return 0; +put_unm_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i.", -err); + return err; +} + +/** + * ntfs_set_volume_flags - set bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to set on the volume + * + * Set the bits in @flags in the volume information flags on the volume @vol. 
+ * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags | flags); +} + +/** + * ntfs_clear_volume_flags - clear bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to clear on the volume + * + * Clear the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); + return ntfs_write_volume_flags(vol, flags); +} + +#endif /* NTFS_RW */ + +/** + * ntfs_remount - change the mount options of a mounted ntfs filesystem + * @sb: superblock of mounted ntfs filesystem + * @flags: remount flags + * @opt: remount options string + * + * Change the mount options of an already mounted ntfs filesystem. + * + * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after + * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, + * @sb->s_flags are not changed. + */ +static int ntfs_remount(struct super_block *sb, int *flags, char *opt) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering with remount options string: %s", opt); + +#ifndef NTFS_RW + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= MS_RDONLY; +#else /* NTFS_RW */ + /* + * For the read-write compiled driver, if we are remounting read-write, + * make sure there are no volume errors and that no unsupported volume + * flags are set. Also, empty the logfile journal as it would become + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. Finally, mark the quotas out of date so Windows rescans + * the volume on boot and updates them. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. + */ + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + static const char *es = ". Cannot remount read-write."; + + /* Remounting read-write. */ + if (NVolErrors(vol)) { + ntfs_error(sb, "Volume has errors and is read-only%s", + es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); + return -EROFS; + } + if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_error(sb, "Failed to set dirty bit in volume " + "information flags%s", es); + return -EROFS; + } +#if 0 + // TODO: Enable this code once we start modifying anything that + // is different between NTFS 1.2 and 3.x... + /* Set NT4 compatibility flag on newer NTFS version volumes. 
*/ + if ((vol->major_ver > 1)) { + if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + ntfs_error(sb, "Failed to set NT4 " + "compatibility flag%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } +#endif + if (!ntfs_empty_logfile(vol->logfile_ino)) { + ntfs_error(sb, "Failed to empty journal $LogFile%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_mark_quotas_out_of_date(vol)) { + ntfs_error(sb, "Failed to mark quotas out of date%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_stamp_usnjrnl(vol)) { + ntfs_error(sb, "Failed to stamp transation log " + "($UsnJrnl)%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + // TODO: Deal with *flags. + + if (!parse_options(vol, opt)) + return -EINVAL; + + ntfs_debug("Done."); + return 0; +} + +/** + * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector + * @sb: Super block of the device to which @b belongs. + * @b: Boot sector of device @sb to check. + * @silent: If 'true', all output will be silenced. + * + * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot + * sector. Returns 'true' if it is valid and 'false' if not. + * + * @sb is only needed for warning/error output, i.e. it can be NULL when silent + * is 'true'. + */ +static bool is_boot_sector_ntfs(const struct super_block *sb, + const NTFS_BOOT_SECTOR *b, const bool silent) +{ + /* + * Check that checksum == sum of u32 values from b to the checksum + * field. If checksum is zero, no checking is done. We will work when + * the checksum test fails, since some utilities update the boot sector + * ignoring the checksum which leaves the checksum out-of-date. We + * report a warning if this is the case. + */ + if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { + le32 *u; + u32 i; + + for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) + i += le32_to_cpup(u); + if (le32_to_cpu(b->checksum) != i) + ntfs_warning(sb, "Invalid boot sector checksum."); + } + /* Check OEMidentifier is "NTFS " */ + if (b->oem_id != magicNTFS) + goto not_ntfs; + /* Check bytes per sector value is between 256 and 4096. */ + if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || + le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) + goto not_ntfs; + /* Check sectors per cluster value is valid. */ + switch (b->bpb.sectors_per_cluster) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: + break; + default: + goto not_ntfs; + } + /* Check the cluster size is not above the maximum (64kiB). */ + if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * + b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) + goto not_ntfs; + /* Check reserved/unused fields are really zero. */ + if (le16_to_cpu(b->bpb.reserved_sectors) || + le16_to_cpu(b->bpb.root_entries) || + le16_to_cpu(b->bpb.sectors) || + le16_to_cpu(b->bpb.sectors_per_fat) || + le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) + goto not_ntfs; + /* Check clusters per file mft record value is valid. 
*/ + if ((u8)b->clusters_per_mft_record < 0xe1 || + (u8)b->clusters_per_mft_record > 0xf7) + switch (b->clusters_per_mft_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* Check clusters per index block value is valid. */ + if ((u8)b->clusters_per_index_record < 0xe1 || + (u8)b->clusters_per_index_record > 0xf7) + switch (b->clusters_per_index_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* + * Check for valid end of sector marker. We will work without it, but + * many BIOSes will refuse to boot from a bootsector if the magic is + * incorrect, so we emit a warning. + */ + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) + ntfs_warning(sb, "Invalid end of sector marker."); + return true; +not_ntfs: + return false; +} + +/** + * read_ntfs_boot_sector - read the NTFS boot sector of a device + * @sb: super block of device to read the boot sector from + * @silent: if true, suppress all output + * + * Reads the boot sector from the device and validates it. If that fails, tries + * to read the backup boot sector, first from the end of the device a-la NT4 and + * later and then from the middle of the device a-la NT3.51 and before. + * + * If a valid boot sector is found but it is not the primary boot sector, we + * repair the primary boot sector silently (unless the device is read-only or + * the primary boot sector is not accessible). + * + * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super + * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized + * to their respective values. + * + * Return the unlocked buffer head containing the boot sector or NULL on error. + */ +static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, + const int silent) +{ + const char *read_err_str = "Unable to read %s boot sector."; + struct buffer_head *bh_primary, *bh_backup; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; + + /* Try to read primary boot sector. */ + if ((bh_primary = sb_bread(sb, 0))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_primary->b_data, silent)) + return bh_primary; + if (!silent) + ntfs_error(sb, "Primary boot sector is invalid."); + } else if (!silent) + ntfs_error(sb, read_err_str, "primary"); + if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { + if (bh_primary) + brelse(bh_primary); + if (!silent) + ntfs_error(sb, "Mount option errors=recover not used. " + "Aborting without trying to recover."); + return NULL; + } + /* Try to read NT4+ backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* Try to read NT3.51- backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + if (!silent) + ntfs_error(sb, "Could not find a valid backup boot " + "sector."); + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* We failed. Cleanup and return. 
*/ + if (bh_primary) + brelse(bh_primary); + return NULL; +hotfix_primary_boot_sector: + if (bh_primary) { + /* + * If we managed to read sector zero and the volume is not + * read-only, copy the found, valid backup boot sector to the + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). + */ + if (!(sb->s_flags & MS_RDONLY)) { + ntfs_warning(sb, "Hot-fix: Recovering invalid primary " + "boot sector from backup copy."); + memcpy(bh_primary->b_data, bh_backup->b_data, + NTFS_BLOCK_SIZE); + mark_buffer_dirty(bh_primary); + sync_dirty_buffer(bh_primary); + if (buffer_uptodate(bh_primary)) { + brelse(bh_backup); + return bh_primary; + } + ntfs_error(sb, "Hot-fix: Device write error while " + "recovering primary boot sector."); + } else { + ntfs_warning(sb, "Hot-fix: Recovery of primary boot " + "sector failed: Read-only mount."); + } + brelse(bh_primary); + } + ntfs_warning(sb, "Using backup boot sector."); + return bh_backup; +} + +/** + * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol + * @vol: volume structure to initialise with data from boot sector + * @b: boot sector to parse + * + * Parse the ntfs boot sector @b and store all imporant information therein in + * the ntfs super block @vol. Return 'true' on success and 'false' on error. + */ +static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) +{ + unsigned int sectors_per_cluster_bits, nr_hidden_sects; + int clusters_per_mft_record, clusters_per_index_record; + s64 ll; + + vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); + vol->sector_size_bits = ffs(vol->sector_size) - 1; + ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, + vol->sector_size); + ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, + vol->sector_size_bits); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return false; + } + ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); + sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; + ntfs_debug("sectors_per_cluster_bits = 0x%x", + sectors_per_cluster_bits); + nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); + ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); + vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; + vol->cluster_size_mask = vol->cluster_size - 1; + vol->cluster_size_bits = ffs(vol->cluster_size) - 1; + ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, + vol->cluster_size); + ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. 
" + "Sorry.", vol->cluster_size, vol->sector_size); + return false; + } + clusters_per_mft_record = b->clusters_per_mft_record; + ntfs_debug("clusters_per_mft_record = %i (0x%x)", + clusters_per_mft_record, clusters_per_mft_record); + if (clusters_per_mft_record > 0) + vol->mft_record_size = vol->cluster_size << + (ffs(clusters_per_mft_record) - 1); + else + /* + * When mft_record_size < cluster_size, clusters_per_mft_record + * = -log2(mft_record_size) bytes. mft_record_size normaly is + * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). + */ + vol->mft_record_size = 1 << -clusters_per_mft_record; + vol->mft_record_size_mask = vol->mft_record_size - 1; + vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; + ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, + vol->mft_record_size); + ntfs_debug("vol->mft_record_size_mask = 0x%x", + vol->mft_record_size_mask); + ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", + vol->mft_record_size_bits, vol->mft_record_size_bits); + /* + * We cannot support mft record sizes above the PAGE_CACHE_SIZE since + * we store $MFT/$DATA, the table of mft records in the page cache. + */ + if (vol->mft_record_size > PAGE_CACHE_SIZE) { + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_CACHE_SIZE on your system (%lu). " + "This is not supported. Sorry.", + vol->mft_record_size, PAGE_CACHE_SIZE); + return false; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); + return false; + } + clusters_per_index_record = b->clusters_per_index_record; + ntfs_debug("clusters_per_index_record = %i (0x%x)", + clusters_per_index_record, clusters_per_index_record); + if (clusters_per_index_record > 0) + vol->index_record_size = vol->cluster_size << + (ffs(clusters_per_index_record) - 1); + else + /* + * When index_record_size < cluster_size, + * clusters_per_index_record = -log2(index_record_size) bytes. + * index_record_size normaly equals 4096 bytes, which is + * encoded as 0xF4 (-12 in decimal). + */ + vol->index_record_size = 1 << -clusters_per_index_record; + vol->index_record_size_mask = vol->index_record_size - 1; + vol->index_record_size_bits = ffs(vol->index_record_size) - 1; + ntfs_debug("vol->index_record_size = %i (0x%x)", + vol->index_record_size, vol->index_record_size); + ntfs_debug("vol->index_record_size_mask = 0x%x", + vol->index_record_size_mask); + ntfs_debug("vol->index_record_size_bits = %i (0x%x)", + vol->index_record_size_bits, + vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return false; + } + /* + * Get the size of the volume in clusters and check for 64-bit-ness. + * Windows currently only uses 32 bits to save the clusters so we do + * the same as it is much faster on 32-bit CPUs. + */ + ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; + if ((u64)ll >= 1ULL << 32) { + ntfs_error(vol->sb, "Cannot handle 64-bit clusters. 
Sorry."); + return false; + } + vol->nr_clusters = ll; + ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); + /* + * On an architecture where unsigned long is 32-bits, we restrict the + * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler + * will hopefully optimize the whole check away. + */ + if (sizeof(unsigned long) < 8) { + if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { + ntfs_error(vol->sb, "Volume size (%lluTiB) is too " + "large for this architecture. " + "Maximum supported is 2TiB. Sorry.", + (unsigned long long)ll >> (40 - + vol->cluster_size_bits)); + return false; + } + } + ll = sle64_to_cpu(b->mft_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mft_lcn = ll; + ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); + ll = sle64_to_cpu(b->mftmirr_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mftmirr_lcn = ll; + ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); +#ifdef NTFS_RW + /* + * Work out the size of the mft mirror in number of mft records. If the + * cluster size is less than or equal to the size taken by four mft + * records, the mft mirror stores the first four mft records. If the + * cluster size is bigger than the size taken by four mft records, the + * mft mirror contains as many mft records as will fit into one + * cluster. + */ + if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) + vol->mftmirr_size = 4; + else + vol->mftmirr_size = vol->cluster_size >> + vol->mft_record_size_bits; + ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); +#endif /* NTFS_RW */ + vol->serial_no = le64_to_cpu(b->volume_serial_number); + ntfs_debug("vol->serial_no = 0x%llx", + (unsigned long long)vol->serial_no); + return true; +} + +/** + * ntfs_setup_allocators - initialize the cluster and mft allocators + * @vol: volume structure for which to setup the allocators + * + * Setup the cluster (lcn) and mft allocators to the starting values. + */ +static void ntfs_setup_allocators(ntfs_volume *vol) +{ +#ifdef NTFS_RW + LCN mft_zone_size, mft_lcn; +#endif /* NTFS_RW */ + + ntfs_debug("vol->mft_zone_multiplier = 0x%x", + vol->mft_zone_multiplier); +#ifdef NTFS_RW + /* Determine the size of the MFT zone. */ + mft_zone_size = vol->nr_clusters; + switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ + case 4: + mft_zone_size >>= 1; /* 50% */ + break; + case 3: + mft_zone_size = (mft_zone_size + + (mft_zone_size >> 1)) >> 2; /* 37.5% */ + break; + case 2: + mft_zone_size >>= 2; /* 25% */ + break; + /* case 1: */ + default: + mft_zone_size >>= 3; /* 12.5% */ + break; + } + /* Setup the mft zone. */ + vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; + ntfs_debug("vol->mft_zone_pos = 0x%llx", + (unsigned long long)vol->mft_zone_pos); + /* + * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs + * source) and if the actual mft_lcn is in the expected place or even + * further to the front of the volume, extend the mft_zone to cover the + * beginning of the volume as well. This is in order to protect the + * area reserved for the mft bitmap as well within the mft_zone itself. 
+ * On non-standard volumes we do not protect it as the overhead would + * be higher than the speed increase we would get by doing it. + */ + mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; + if (mft_lcn * vol->cluster_size < 16 * 1024) + mft_lcn = (16 * 1024 + vol->cluster_size - 1) / + vol->cluster_size; + if (vol->mft_zone_start <= mft_lcn) + vol->mft_zone_start = 0; + ntfs_debug("vol->mft_zone_start = 0x%llx", + (unsigned long long)vol->mft_zone_start); + /* + * Need to cap the mft zone on non-standard volumes so that it does + * not point outside the boundaries of the volume. We do this by + * halving the zone size until we are inside the volume. + */ + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + while (vol->mft_zone_end >= vol->nr_clusters) { + mft_zone_size >>= 1; + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + } + ntfs_debug("vol->mft_zone_end = 0x%llx", + (unsigned long long)vol->mft_zone_end); + /* + * Set the current position within each data zone to the start of the + * respective zone. + */ + vol->data1_zone_pos = vol->mft_zone_end; + ntfs_debug("vol->data1_zone_pos = 0x%llx", + (unsigned long long)vol->data1_zone_pos); + vol->data2_zone_pos = 0; + ntfs_debug("vol->data2_zone_pos = 0x%llx", + (unsigned long long)vol->data2_zone_pos); + + /* Set the mft data allocation position to mft record 24. */ + vol->mft_data_pos = 24; + ntfs_debug("vol->mft_data_pos = 0x%llx", + (unsigned long long)vol->mft_data_pos); +#endif /* NTFS_RW */ +} + +#ifdef NTFS_RW + +/** + * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume + * @vol: ntfs super block describing device whose mft mirror to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_mft_mirror(ntfs_volume *vol) +{ + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + + ntfs_debug("Entering."); + /* Get mft mirror inode. */ + tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + /* + * Re-initialize some specifics about $MFTMirr's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + tmp_ino->i_uid = tmp_ino->i_gid = 0; + /* Regular file. No access for anyone. */ + tmp_ino->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFTMirr. */ + tmp_ino->i_op = &ntfs_empty_inode_ops; + tmp_ino->i_fop = &ntfs_empty_file_ops; + /* Put in our special address space operations. */ + tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; + tmp_ni = NTFS_I(tmp_ino); + /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ + NInoSetMstProtected(tmp_ni); + NInoSetSparseDisabled(tmp_ni); + /* + * Set up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + tmp_ni->itype.index.block_size = vol->mft_record_size; + tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; + vol->mftmirr_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * check_mft_mirror - compare contents of the mft mirror with the mft + * @vol: ntfs super block describing device whose mft mirror to check + * + * Return 'true' on success or 'false' on error. + * + * Note, this function also results in the mft mirror runlist being completely + * mapped into memory. The mft mirror write code requires this and will BUG() + * should it find an unmapped runlist element. 
+ */ +static bool check_mft_mirror(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + ntfs_inode *mirr_ni; + struct page *mft_page, *mirr_page; + u8 *kmft, *kmirr; + runlist_element *rl, rl2[2]; + pgoff_t index; + int mrecs_per_page, i; + + ntfs_debug("Entering."); + /* Compare contents of $MFT and $MFTMirr. */ + mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size; + BUG_ON(!mrecs_per_page); + BUG_ON(!vol->mftmirr_size); + mft_page = mirr_page = NULL; + kmft = kmirr = NULL; + index = i = 0; + do { + u32 bytes; + + /* Switch pages if necessary. */ + if (!(i % mrecs_per_page)) { + if (index) { + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + } + /* Get the $MFT page. */ + mft_page = ntfs_map_page(vol->mft_ino->i_mapping, + index); + if (IS_ERR(mft_page)) { + ntfs_error(sb, "Failed to read $MFT."); + return false; + } + kmft = page_address(mft_page); + /* Get the $MFTMirr page. */ + mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, + index); + if (IS_ERR(mirr_page)) { + ntfs_error(sb, "Failed to read $MFTMirr."); + goto mft_unmap_out; + } + kmirr = page_address(mirr_page); + ++index; + } + /* Do not check the record if it is not in use. */ + if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { + /* Make sure the record is ok. */ + if (ntfs_is_baad_recordp((le32*)kmft)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "record %i.", i); +mm_unmap_out: + ntfs_unmap_page(mirr_page); +mft_unmap_out: + ntfs_unmap_page(mft_page); + return false; + } + } + /* Do not check the mirror record if it is not in use. */ + if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { + if (ntfs_is_baad_recordp((le32*)kmirr)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "mirror record %i.", i); + goto mm_unmap_out; + } + } + /* Get the amount of data in the current record. */ + bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmft)) { + bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmirr)) + bytes = vol->mft_record_size; + } + /* Compare the two records. */ + if (memcmp(kmft, kmirr, bytes)) { + ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " + "match. Run ntfsfix or chkdsk.", i); + goto mm_unmap_out; + } + kmft += vol->mft_record_size; + kmirr += vol->mft_record_size; + } while (++i < vol->mftmirr_size); + /* Release the last pages. */ + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + + /* Construct the mft mirror runlist by hand. */ + rl2[0].vcn = 0; + rl2[0].lcn = vol->mftmirr_lcn; + rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + + vol->cluster_size - 1) / vol->cluster_size; + rl2[1].vcn = rl2[0].length; + rl2[1].lcn = LCN_ENOENT; + rl2[1].length = 0; + /* + * Because we have just read all of the mft mirror, we know we have + * mapped the full runlist for it. + */ + mirr_ni = NTFS_I(vol->mftmirr_ino); + down_read(&mirr_ni->runlist.lock); + rl = mirr_ni->runlist.rl; + /* Compare the two runlists. They must be identical. */ + i = 0; + do { + if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || + rl2[i].length != rl[i].length) { + ntfs_error(sb, "$MFTMirr location mismatch. 
" + "Run chkdsk."); + up_read(&mirr_ni->runlist.lock); + return false; + } + } while (rl2[i++].length); + up_read(&mirr_ni->runlist.lock); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_check_logfile - load and check the logfile inode for a volume + * @vol: ntfs super block describing device whose logfile to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_check_logfile(ntfs_volume *vol, + RESTART_PAGE_HEADER **rp) +{ + struct inode *tmp_ino; + + ntfs_debug("Entering."); + tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + if (!ntfs_check_logfile(tmp_ino, rp)) { + iput(tmp_ino); + /* ntfs_check_logfile() will have displayed error output. */ + return false; + } + NInoSetSparseDisabled(NTFS_I(tmp_ino)); + vol->logfile_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +#define NTFS_HIBERFIL_HEADER_SIZE 4096 + +/** + * check_windows_hibernation_status - check if Windows is suspended on a volume + * @vol: ntfs super block of device to check + * + * Check if Windows is hibernated on the ntfs volume @vol. This is done by + * looking for the file hiberfil.sys in the root directory of the volume. If + * the file is not present Windows is definitely not suspended. + * + * If hiberfil.sys exists and is less than 4kiB in size it means Windows is + * definitely suspended (this volume is not the system volume). Caveat: on a + * system with many volumes it is possible that the < 4kiB check is bogus but + * for now this should do fine. + * + * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the + * hiberfil header (which is the first 4kiB). If this begins with "hibr", + * Windows is definitely suspended. If it is completely full of zeroes, + * Windows is definitely not hibernated. Any other case is treated as if + * Windows is suspended. This caters for the above mentioned caveat of a + * system with many volumes where no "hibr" magic would be present and there is + * no zero header. + * + * Return 0 if Windows is not hibernated on the volume, >0 if Windows is + * hibernated on the volume, and -errno on error. + */ +static int check_windows_hibernation_status(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *vi; + ntfs_inode *ni; + struct page *page; + u32 *kaddr, *kend; + ntfs_name *name = NULL; + int ret = 1; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the hibernation file by looking up the + * filename hiberfil.sys in the root directory. + */ + mutex_lock(&vol->root_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, + &name); + mutex_unlock(&vol->root_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + ret = MREF_ERR(mref); + /* If the file does not exist, Windows is not hibernated. */ + if (ret == -ENOENT) { + ntfs_debug("hiberfil.sys not present. Windows is not " + "hibernated on the volume."); + return 0; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "hiberfil.sys."); + return ret; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. 
*/ + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi) || is_bad_inode(vi)) { + if (!IS_ERR(vi)) + iput(vi); + ntfs_error(vol->sb, "Failed to load hiberfil.sys."); + return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; + } + if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " + "Windows is hibernated on the volume. This " + "is not the system volume.", i_size_read(vi)); + goto iput_out; + } + ni = NTFS_I(vi); + page = ntfs_map_page(vi->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); + ret = PTR_ERR(page); + goto iput_out; + } + kaddr = (u32*)page_address(page); + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " + "hibernated on the volume. This is the " + "system volume."); + goto unm_iput_out; + } + kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); + do { + if (unlikely(*kaddr)) { + ntfs_debug("hiberfil.sys is larger than 4kiB " + "(0x%llx), does not contain the " + "\"hibr\" magic, and does not have a " + "zero header. Windows is hibernated " + "on the volume. This is not the " + "system volume.", i_size_read(vi)); + goto unm_iput_out; + } + } while (++kaddr < kend); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not " + "hibernated on the volume. This is the system " + "volume."); + ret = 0; +unm_iput_out: + ntfs_unmap_page(page); +iput_out: + iput(vi); + return ret; +} + +/** + * load_and_init_quota - load and setup the quota file for a volume if present + * @vol: ntfs super block describing device whose quota file to load + * + * Return 'true' on success or 'false' on error. If $Quota is not present, we + * leave vol->quota_ino as NULL and return success. + */ +static bool load_and_init_quota(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_name *name = NULL; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the quota file by looking up the filename + * $Quota in the extended system files directory $Extend. + */ + mutex_lock(&vol->extend_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, + &name); + mutex_unlock(&vol->extend_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, quotas are disabled and have + * never been enabled on this volume, just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$Quota not present. Volume does not have " + "quotas enabled."); + /* + * No need to try to set quotas out of date if they are + * not enabled. + */ + NVolSetQuotaOutOfDate(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for $Quota."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $Quota."); + return false; + } + vol->quota_ino = tmp_ino; + /* Get the $Q index allocation attribute. 
*/ + tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); + return false; + } + vol->quota_q_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_usnjrnl - load and setup the transaction log if present + * @vol: ntfs super block describing device whose usnjrnl file to load + * + * Return 'true' on success or 'false' on error. + * + * If $UsnJrnl is not present or in the process of being disabled, we set + * NVolUsnJrnlStamped() and return success. + * + * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, + * i.e. transaction logging has only just been enabled or the journal has been + * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() + * and return success. + */ +static bool load_and_init_usnjrnl(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + struct page *page; + ntfs_name *name = NULL; + USN_HEADER *uh; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the transaction log file by looking up the + * filename $UsnJrnl in the extended system files directory $Extend. + */ + mutex_lock(&vol->extend_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, + &name); + mutex_unlock(&vol->extend_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, transaction logging is disabled, + * just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$UsnJrnl not present. Volume does not " + "have transaction logging enabled."); +not_enabled: + /* + * No need to try to stamp the transaction log if + * transaction logging is not enabled. + */ + NVolSetUsnJrnlStamped(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "$UsnJrnl."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $UsnJrnl."); + return false; + } + vol->usnjrnl_ino = tmp_ino; + /* + * If the transaction log is in the process of being deleted, we can + * ignore it. + */ + if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { + ntfs_debug("$UsnJrnl in the process of being disabled. " + "Volume does not have transaction logging " + "enabled."); + goto not_enabled; + } + /* Get the $DATA/$Max attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + vol->usnjrnl_max_ino = tmp_ino; + if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { + ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " + "attribute (size is 0x%llx but should be at " + "least 0x%zx bytes).", i_size_read(tmp_ino), + sizeof(USN_HEADER)); + return false; + } + /* Get the $DATA/$J attribute. 
*/ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " + "attribute."); + return false; + } + vol->usnjrnl_j_ino = tmp_ino; + /* Verify $J is non-resident and sparse. */ + tmp_ni = NTFS_I(vol->usnjrnl_j_ino); + if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { + ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " + "and/or not sparse."); + return false; + } + /* Read the USN_HEADER from $DATA/$Max. */ + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + /* Sanity check the $Max. */ + if (unlikely(sle64_to_cpu(uh->allocation_delta) > + sle64_to_cpu(uh->maximum_size))) { + ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " + "maximum size (0x%llx). $UsnJrnl is corrupt.", + (long long)sle64_to_cpu(uh->allocation_delta), + (long long)sle64_to_cpu(uh->maximum_size)); + ntfs_unmap_page(page); + return false; + } + /* + * If the transaction log has been stamped and nothing has been written + * to it since, we do not need to stamp it. + */ + if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= + i_size_read(vol->usnjrnl_j_ino))) { + if (likely(sle64_to_cpu(uh->lowest_valid_usn) == + i_size_read(vol->usnjrnl_j_ino))) { + ntfs_unmap_page(page); + ntfs_debug("$UsnJrnl is enabled but nothing has been " + "logged since it was last stamped. " + "Treating this as if the volume does " + "not have transaction logging " + "enabled."); + goto not_enabled; + } + ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " + "which is out of bounds (0x%llx). $UsnJrnl " + "is corrupt.", + (long long)sle64_to_cpu(uh->lowest_valid_usn), + i_size_read(vol->usnjrnl_j_ino)); + ntfs_unmap_page(page); + return false; + } + ntfs_unmap_page(page); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_attrdef - load the attribute definitions table for a volume + * @vol: ntfs super block describing device whose attrdef to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_attrdef(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + + ntfs_debug("Entering."); + /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ + ino = ntfs_iget(sb, FILE_AttrDef); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto failed; + } + NInoSetSparseDisabled(NTFS_I(ino)); + /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ + i_size = i_size_read(ino); + if (i_size <= 0 || i_size > 0x7fffffff) + goto iput_failed; + vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); + if (!vol->attrdef) + goto iput_failed; + index = 0; + max_index = i_size >> PAGE_CACHE_SHIFT; + size = PAGE_CACHE_SIZE; + while (index < max_index) { + /* Read the attrdef table and copy it into the linear buffer. 
*/ +read_partial_attrdef_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto free_iput_failed; + memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_CACHE_SIZE) { + size = i_size & ~PAGE_CACHE_MASK; + if (size) + goto read_partial_attrdef_page; + } + vol->attrdef_size = i_size; + ntfs_debug("Read %llu bytes from $AttrDef.", i_size); + iput(ino); + return true; +free_iput_failed: + ntfs_free(vol->attrdef); + vol->attrdef = NULL; +iput_failed: + iput(ino); +failed: + ntfs_error(sb, "Failed to initialize attribute definition table."); + return false; +} + +#endif /* NTFS_RW */ + +/** + * load_and_init_upcase - load the upcase table for an ntfs volume + * @vol: ntfs super block describing device whose upcase to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_upcase(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + int i, max; + + ntfs_debug("Entering."); + /* Read upcase table and setup vol->upcase and vol->upcase_len. */ + ino = ntfs_iget(sb, FILE_UpCase); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto upcase_failed; + } + /* + * The upcase size must not be above 64k Unicode characters, must not + * be zero and must be a multiple of sizeof(ntfschar). + */ + i_size = i_size_read(ino); + if (!i_size || i_size & (sizeof(ntfschar) - 1) || + i_size > 64ULL * 1024 * sizeof(ntfschar)) + goto iput_upcase_failed; + vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); + if (!vol->upcase) + goto iput_upcase_failed; + index = 0; + max_index = i_size >> PAGE_CACHE_SHIFT; + size = PAGE_CACHE_SIZE; + while (index < max_index) { + /* Read the upcase table and copy it into the linear buffer. */ +read_partial_upcase_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto iput_upcase_failed; + memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_CACHE_SIZE) { + size = i_size & ~PAGE_CACHE_MASK; + if (size) + goto read_partial_upcase_page; + } + vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; + ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", + i_size, 64 * 1024 * sizeof(ntfschar)); + iput(ino); + mutex_lock(&ntfs_lock); + if (!default_upcase) { + ntfs_debug("Using volume specified $UpCase since default is " + "not present."); + mutex_unlock(&ntfs_lock); + return true; + } + max = default_upcase_len; + if (max > vol->upcase_len) + max = vol->upcase_len; + for (i = 0; i < max; i++) + if (vol->upcase[i] != default_upcase[i]) + break; + if (i == max) { + ntfs_free(vol->upcase); + vol->upcase = default_upcase; + vol->upcase_len = max; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_debug("Volume specified $UpCase matches default. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_debug("Using volume specified $UpCase since it does not match " + "the default."); + return true; +iput_upcase_failed: + iput(ino); + ntfs_free(vol->upcase); + vol->upcase = NULL; +upcase_failed: + mutex_lock(&ntfs_lock); + if (default_upcase) { + vol->upcase = default_upcase; + vol->upcase_len = default_upcase_len; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to load $UpCase from the volume. 
Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to initialize upcase table."); + return false; +} + +/* + * The lcn and mft bitmap inodes are NTFS-internal inodes with + * their own special locking rules: + */ +static struct lock_class_key + lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, + mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; + +/** + * load_system_files - open the system files using normal functions + * @vol: ntfs super block describing device whose system files to load + * + * Open the system files with normal access functions and complete setting up + * the ntfs super block @vol. + * + * Return 'true' on success or 'false' on error. + */ +static bool load_system_files(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; +#ifdef NTFS_RW + RESTART_PAGE_HEADER *rp; + int err; +#endif /* NTFS_RW */ + + ntfs_debug("Entering."); +#ifdef NTFS_RW + /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ + if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { + static const char *es1 = "Failed to load $MFTMirr"; + static const char *es2 = "$MFTMirr does not match $MFT"; + static const char *es3 = ". Run ntfsfix and/or chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + !vol->mftmirr_ino ? es1 : es2, + es3); + goto iput_mirr_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", + !vol->mftmirr_ino ? es1 : es2, es3); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", + !vol->mftmirr_ino ? es1 : es2, es3); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* Get mft bitmap attribute inode. */ + vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); + if (IS_ERR(vol->mftbmp_ino)) { + ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); + goto iput_mirr_err_out; + } + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, + &mftbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, + &mftbmp_mrec_lock_key); + /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ + if (!load_and_init_upcase(vol)) + goto iput_mftbmp_err_out; +#ifdef NTFS_RW + /* + * Read attribute definitions table and setup @vol->attrdef and + * @vol->attrdef_size. + */ + if (!load_and_init_attrdef(vol)) + goto iput_upcase_err_out; +#endif /* NTFS_RW */ + /* + * Get the cluster allocation bitmap inode and verify the size, no + * need for any locking at this stage as we are already running + * exclusively as we are mount in progress task. 
+ */ + vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); + if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { + if (!IS_ERR(vol->lcnbmp_ino)) + iput(vol->lcnbmp_ino); + goto bitmap_failed; + } + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, + &lcnbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, + &lcnbmp_mrec_lock_key); + + NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); + if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { + iput(vol->lcnbmp_ino); +bitmap_failed: + ntfs_error(sb, "Failed to load $Bitmap."); + goto iput_attrdef_err_out; + } + /* + * Get the volume inode and setup our cache of the volume flags and + * version. + */ + vol->vol_ino = ntfs_iget(sb, FILE_Volume); + if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { + if (!IS_ERR(vol->vol_ino)) + iput(vol->vol_ino); +volume_failed: + ntfs_error(sb, "Failed to load $Volume."); + goto iput_lcnbmp_err_out; + } + m = map_mft_record(NTFS_I(vol->vol_ino)); + if (IS_ERR(m)) { +iput_volume_failed: + iput(vol->vol_ino); + goto volume_failed; + } + if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { + ntfs_error(sb, "Failed to get attribute search context."); + goto get_ctx_vol_failed; + } + if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx) || ctx->attr->non_resident || ctx->attr->flags) { +err_put_vol: + ntfs_attr_put_search_ctx(ctx); +get_ctx_vol_failed: + unmap_mft_record(NTFS_I(vol->vol_ino)); + goto iput_volume_failed; + } + vi = (VOLUME_INFORMATION*)((char*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Some bounds checks. */ + if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + + le32_to_cpu(ctx->attr->data.resident.value_length) > + (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) + goto err_put_vol; + /* Copy the volume flags and version to the ntfs_volume structure. */ + vol->vol_flags = vi->flags; + vol->major_ver = vi->major_ver; + vol->minor_ver = vi->minor_ver; + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(NTFS_I(vol->vol_ino)); + printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, + vol->minor_ver); + if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { + ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " + "volume version %i.%i (need at least version " + "3.0).", vol->major_ver, vol->minor_ver); + NVolClearSparseEnabled(vol); + } +#ifdef NTFS_RW + /* Make sure that no unsupported volume flags are set. */ + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + static const char *es1a = "Volume is dirty"; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_vol_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* + * Do not set NVolErrors() because ntfs_remount() re-checks the + * flags which we need to do in case any flags have changed. + */ + } + /* + * Get the inode for the logfile, check it and determine if the volume + * was shutdown cleanly. + */ + rp = NULL; + if (!load_and_check_logfile(vol, &rp) || + !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { + static const char *es1a = "Failed to load $LogFile"; + static const char *es1b = "$LogFile is not clean"; + static const char *es2 = ". Mount in Windows."; + const char *es1; + + es1 = !vol->logfile_ino ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + if (vol->logfile_ino) { + BUG_ON(!rp); + ntfs_free(rp); + } + goto iput_logfile_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + ntfs_free(rp); +#endif /* NTFS_RW */ + /* Get the root directory inode so we can do path lookups. */ + vol->root_ino = ntfs_iget(sb, FILE_root); + if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (!IS_ERR(vol->root_ino)) + iput(vol->root_ino); + ntfs_error(sb, "Failed to load root directory."); + goto iput_logfile_err_out; + } +#ifdef NTFS_RW + /* + * Check if Windows is suspended to disk on the target volume. If it + * is hibernated, we must not write *anything* to the disk so set + * NVolErrors() without setting the dirty volume flag and mount + * read-only. This will prevent read-write remounting and it will also + * prevent all writes. + */ + err = check_windows_hibernation_status(vol); + if (unlikely(err)) { + static const char *es1a = "Failed to determine if Windows is " + "hibernated"; + static const char *es1b = "Windows is hibernated"; + static const char *es2 = ". Run chkdsk."; + const char *es1; + + es1 = err < 0 ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the volume dirty. */ + if (!(sb->s_flags & MS_RDONLY) && + ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + static const char *es1 = "Failed to set dirty bit in volume " + "information flags"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + /* + * Do not set NVolErrors() because ntfs_remount() might manage + * to set the dirty flag in which case all would be well. + */ + } +#if 0 + // TODO: Enable this code once we start modifying anything that is + // different between NTFS 1.2 and 3.x... + /* + * If (still) a read-write mount, set the NT4 compatibility flag on + * newer NTFS version volumes. + */ + if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + static const char *es1 = "Failed to set NT4 compatibility flag"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif + /* If (still) a read-write mount, empty the logfile. */ + if (!(sb->s_flags & MS_RDONLY) && + !ntfs_empty_logfile(vol->logfile_ino)) { + static const char *es1 = "Failed to empty $LogFile"; + static const char *es2 = ". Mount in Windows."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* If on NTFS versions before 3.0, we are done. */ + if (unlikely(vol->major_ver < 3)) + return true; + /* NTFS 3.0+ specific initialization. */ + /* Get the security descriptors inode. */ + vol->secure_ino = ntfs_iget(sb, FILE_Secure); + if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { + if (!IS_ERR(vol->secure_ino)) + iput(vol->secure_ino); + ntfs_error(sb, "Failed to load $Secure."); + goto iput_root_err_out; + } + // TODO: Initialize security. + /* Get the extended system files' directory inode. */ + vol->extend_ino = ntfs_iget(sb, FILE_Extend); + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (!IS_ERR(vol->extend_ino)) + iput(vol->extend_ino); + ntfs_error(sb, "Failed to load $Extend."); + goto iput_sec_err_out; + } +#ifdef NTFS_RW + /* Find the quota file, load it if present, and set it up. */ + if (!load_and_init_quota(vol)) { + static const char *es1 = "Failed to load $Quota"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the quotas out of date. */ + if (!(sb->s_flags & MS_RDONLY) && + !ntfs_mark_quotas_out_of_date(vol)) { + static const char *es1 = "Failed to mark quotas out of date"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. 
*/ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } + /* + * Find the transaction log file ($UsnJrnl), load it if present, check + * it, and set it up. + */ + if (!load_and_init_usnjrnl(vol)) { + static const char *es1 = "Failed to load $UsnJrnl"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, stamp the transaction log. */ + if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) { + static const char *es1 = "Failed to stamp transaction log " + "($UsnJrnl)"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + return true; +#ifdef NTFS_RW +iput_usnjrnl_err_out: + if (vol->usnjrnl_j_ino) + iput(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + iput(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + iput(vol->usnjrnl_ino); +iput_quota_err_out: + if (vol->quota_q_ino) + iput(vol->quota_q_ino); + if (vol->quota_ino) + iput(vol->quota_ino); + iput(vol->extend_ino); +#endif /* NTFS_RW */ +iput_sec_err_out: + iput(vol->secure_ino); +iput_root_err_out: + iput(vol->root_ino); +iput_logfile_err_out: +#ifdef NTFS_RW + if (vol->logfile_ino) + iput(vol->logfile_ino); +iput_vol_err_out: +#endif /* NTFS_RW */ + iput(vol->vol_ino); +iput_lcnbmp_err_out: + iput(vol->lcnbmp_ino); +iput_attrdef_err_out: + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } +#ifdef NTFS_RW +iput_upcase_err_out: +#endif /* NTFS_RW */ + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } +iput_mftbmp_err_out: + iput(vol->mftbmp_ino); +iput_mirr_err_out: +#ifdef NTFS_RW + if (vol->mftmirr_ino) + iput(vol->mftmirr_ino); +#endif /* NTFS_RW */ + return false; +} + +/** + * ntfs_put_super - called by the vfs to unmount a volume + * @sb: vfs superblock of volume to unmount + * + * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when + * the volume is being unmounted (umount system call has been invoked) and it + * releases all inodes and memory belonging to the NTFS specific part of the + * super block. 
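+ *
+ * On a read-write mount all open system inodes are committed first and, if
+ * no volume errors have been recorded, the dirty bit is cleared in the
+ * volume information flags before the inodes are finally released.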
+ */ +static void ntfs_put_super(struct super_block *sb) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering."); + +#ifdef NTFS_RW + /* + * Commit all inodes while they are still open in case some of them + * cause others to be dirtied. + */ + ntfs_commit_inode(vol->vol_ino); + + /* NTFS 3.0+ specific. */ + if (vol->major_ver >= 3) { + if (vol->usnjrnl_j_ino) + ntfs_commit_inode(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + ntfs_commit_inode(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + ntfs_commit_inode(vol->usnjrnl_ino); + if (vol->quota_q_ino) + ntfs_commit_inode(vol->quota_q_ino); + if (vol->quota_ino) + ntfs_commit_inode(vol->quota_ino); + if (vol->extend_ino) + ntfs_commit_inode(vol->extend_ino); + if (vol->secure_ino) + ntfs_commit_inode(vol->secure_ino); + } + + ntfs_commit_inode(vol->root_ino); + + down_write(&vol->lcnbmp_lock); + ntfs_commit_inode(vol->lcnbmp_ino); + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + ntfs_commit_inode(vol->mftbmp_ino); + up_write(&vol->mftbmp_lock); + + if (vol->logfile_ino) + ntfs_commit_inode(vol->logfile_ino); + + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + + /* + * If a read-write mount and no volume errors have occurred, mark the + * volume clean. Also, re-commit all affected inodes. + */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + ntfs_commit_inode(vol->vol_ino); + ntfs_commit_inode(vol->root_ino); + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + } else { + ntfs_warning(sb, "Volume has errors. Leaving volume " + "marked dirty. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + iput(vol->vol_ino); + vol->vol_ino = NULL; + + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + + iput(vol->root_ino); + vol->root_ino = NULL; + + down_write(&vol->lcnbmp_lock); + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; + up_write(&vol->mftbmp_lock); + +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + /* Re-commit the mft mirror and mft just in case. */ + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } + /* + * We should have no dirty inodes left, due to + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. 
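+ * A last commit followed by a synchronous write_inode_now() call below
+ * makes sure $MFT itself has reached disk before its inode is put.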
+ */ + ntfs_commit_inode(vol->mft_ino); + write_inode_now(vol->mft_ino, 1); +#endif /* NTFS_RW */ + + iput(vol->mft_ino); + vol->mft_ino = NULL; + + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + /* + * Destroy the global default upcase table if necessary. Also decrease + * the number of upcase users if we are a user. + */ + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + if (!ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + + unload_nls(vol->nls_map); + + sb->s_fs_info = NULL; + kfree(vol); +} + +/** + * get_nr_free_clusters - return the number of free clusters on a volume + * @vol: ntfs volume for which to obtain free cluster count + * + * Calculate the number of free clusters on the mounted NTFS volume @vol. We + * actually calculate the number of clusters in use instead because this + * allows us to not care about partial pages as these will be just zero filled + * and hence not be counted as allocated clusters. + * + * The only particularity is that clusters beyond the end of the logical ntfs + * volume will be marked as allocated to prevent errors which means we have to + * discount those at the end. This is important as the cluster bitmap always + * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside + * the logical volume and marked in use when they are not as they do not exist. + * + * If any pages cannot be read we assume all clusters in the erroring pages are + * in use. This means we return an underestimate on errors which is better than + * an overestimate. + */ +static s64 get_nr_free_clusters(ntfs_volume *vol) +{ + s64 nr_free = vol->nr_clusters; + struct address_space *mapping = vol->lcnbmp_ino->i_mapping; + struct page *page; + pgoff_t index, max_index; + + ntfs_debug("Entering."); + /* Serialize accesses to the cluster bitmap. */ + down_read(&vol->lcnbmp_lock); + /* + * Convert the number of bits into bytes rounded up, then convert into + * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one + * full and one partial page max_index = 2. + */ + max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_CACHE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page, KM_USER0); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). 
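+ * Each bitmap page accounts for PAGE_CACHE_SIZE * 8 clusters, i.e. 32768
+ * clusters per page with 4kiB pages.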
+ */ + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); + /* + * Fixup for eventual bits outside logical ntfs volume (see function + * description above). + */ + if (vol->nr_clusters & 63) + nr_free += 64 - (vol->nr_clusters & 63); + up_read(&vol->lcnbmp_lock); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * __get_nr_free_mft_records - return the number of free inodes on a volume + * @vol: ntfs volume for which to obtain free inode count + * @nr_free: number of mft records in filesystem + * @max_index: maximum number of pages containing set bits + * + * Calculate the number of free mft records (inodes) on the mounted NTFS + * volume @vol. We actually calculate the number of mft records in use instead + * because this allows us to not care about partial pages as these will be just + * zero filled and hence not be counted as allocated mft record. + * + * If any pages cannot be read we assume all mft records in the erroring pages + * are in use. This means we return an underestimate on errors which is better + * than an overestimate. + * + * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. + */ +static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, + s64 nr_free, const pgoff_t max_index) +{ + struct address_space *mapping = vol->mftbmp_ino->i_mapping; + struct page *page; + pgoff_t index; + + ntfs_debug("Entering."); + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " + "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_CACHE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page, KM_USER0); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", + index - 1); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * ntfs_statfs - return information about mounted NTFS volume + * @dentry: dentry from mounted volume + * @sfs: statfs structure in which to return the information + * + * Return information about the mounted NTFS volume @dentry in the statfs structure + * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is + * called). We interpret the values to be correct of the moment in time at + * which we are called. Most values are variable otherwise and this isn't just + * the free values but the totals as well. 
For example we can increase the + * total number of file nodes if we run out and we can keep doing this until + * there is no more space on the volume left at all. + * + * Called from vfs_statfs which is used to handle the statfs, fstatfs, and + * ustat system calls. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + struct super_block *sb = dentry->d_sb; + s64 size; + ntfs_volume *vol = NTFS_SB(sb); + ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); + pgoff_t max_index; + unsigned long flags; + + ntfs_debug("Entering."); + /* Type of filesystem. */ + sfs->f_type = NTFS_SB_MAGIC; + /* Optimal transfer block size. */ + sfs->f_bsize = PAGE_CACHE_SIZE; + /* + * Total data blocks in filesystem in units of f_bsize and since + * inodes are also stored in data blocs ($MFT is a file) this is just + * the total clusters. + */ + sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT; + /* Free data blocks in filesystem in units of f_bsize. */ + size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT; + if (size < 0LL) + size = 0LL; + /* Free blocks avail to non-superuser, same as above on NTFS. */ + sfs->f_bavail = sfs->f_bfree = size; + /* Serialize accesses to the inode bitmap. */ + down_read(&vol->mftbmp_lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; + /* + * Convert the maximum number of set bits into bytes rounded up, then + * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we + * have one full and one partial page max_index = 2. + */ + max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) + + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Number of inodes in filesystem (at this point in time). */ + sfs->f_files = size; + /* Free inodes in fs (based on current total count). */ + sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); + up_read(&vol->mftbmp_lock); + /* + * File system id. This is extremely *nix flavour dependent and even + * within Linux itself all fs do their own thing. I interpret this to + * mean a unique id associated with the mounted fs and not the id + * associated with the filesystem driver, the latter is already given + * by the filesystem type in sfs->f_type. Thus we use the 64-bit + * volume serial number splitting it into two 32-bit parts. We enter + * the least significant 32-bits in f_fsid[0] and the most significant + * 32-bits in f_fsid[1]. + */ + sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff; + sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff; + /* Maximum length of filenames. */ + sfs->f_namelen = NTFS_MAX_NAME_LEN; + return 0; +} + +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + +/** + * The complete super operations. + */ +static const struct super_operations ntfs_sops = { + .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ + .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ +#ifdef NTFS_RW + //.dirty_inode = NULL, /* VFS: Called from + // __mark_inode_dirty(). */ + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to + disk. */ + //.drop_inode = NULL, /* VFS: Called just after the + // inode reference count has + // been decreased to zero. + // NOTE: The inode lock is + // held. 
See fs/inode.c:: + // generic_drop_inode(). */ + //.delete_inode = NULL, /* VFS: Delete inode from disk. + // Called when i_count becomes + // 0 and i_nlink is also 0. */ + //.write_super = NULL, /* Flush dirty super block to + // disk. */ + //.sync_fs = NULL, /* ? */ + //.write_super_lockfs = NULL, /* ? */ + //.unlockfs = NULL, /* ? */ +#endif /* NTFS_RW */ + .put_super = ntfs_put_super, /* Syscall: umount. */ + .statfs = ntfs_statfs, /* Syscall: statfs */ + .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ + .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is + removed from memory. */ + //.umount_begin = NULL, /* Forced umount. */ + .show_options = ntfs_show_options, /* Show mount options in + proc. */ +}; + +/** + * ntfs_fill_super - mount an ntfs filesystem + * @sb: super block of ntfs filesystem to mount + * @opt: string containing the mount options + * @silent: silence error output + * + * ntfs_fill_super() is called by the VFS to mount the device described by @sb + * with the mount otions in @data with the NTFS filesystem. + * + * If @silent is true, remain silent even if errors are detected. This is used + * during bootup, when the kernel tries to mount the root filesystem with all + * registered filesystems one after the other until one succeeds. This implies + * that all filesystems except the correct one will quite correctly and + * expectedly return an error, but nobody wants to see error messages when in + * fact this is what is supposed to happen. + * + * NOTE: @sb->s_flags contains the mount options flags. + */ +static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) +{ + ntfs_volume *vol; + struct buffer_head *bh; + struct inode *tmp_ino; + int blocksize, result; + + /* + * We do a pretty difficult piece of bootstrap by reading the + * MFT (and other metadata) from disk into memory. We'll only + * release this metadata during umount, so the locking patterns + * observed during bootstrap do not count. So turn off the + * observation of locking patterns (strictly for this context + * only) while mounting NTFS. [The validator is still active + * otherwise, even for this context: it will for example record + * lock class registrations.] + */ + lockdep_off(); + ntfs_debug("Entering."); +#ifndef NTFS_RW + sb->s_flags |= MS_RDONLY; +#endif /* ! NTFS_RW */ + /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ + sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); + vol = NTFS_SB(sb); + if (!vol) { + if (!silent) + ntfs_error(sb, "Allocation of NTFS volume structure " + "failed. Aborting mount..."); + lockdep_on(); + return -ENOMEM; + } + /* Initialize ntfs_volume structure. */ + *vol = (ntfs_volume) { + .sb = sb, + /* + * Default is group and other don't have any access to files or + * directories while owner has full access. Further, files by + * default are not executable but directories are of course + * browseable. + */ + .fmask = 0177, + .dmask = 0077, + }; + init_rwsem(&vol->mftbmp_lock); + init_rwsem(&vol->lcnbmp_lock); + + /* By default, enable sparse support. */ + NVolSetSparseEnabled(vol); + + /* Important to get the mount options dealt with now. */ + if (!parse_options(vol, (char*)opt)) + goto err_out_now; + + /* We support sector sizes up to the PAGE_CACHE_SIZE. */ + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). 
The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_logical_block_size(sb->s_bdev), + PAGE_CACHE_SIZE); + goto err_out_now; + } + /* + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. + */ + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { + if (!silent) + ntfs_error(sb, "Unable to set device block size."); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes. */ + if (!i_size_read(sb->s_bdev->bd_inode)) { + if (!silent) + ntfs_error(sb, "Unable to determine device size."); + goto err_out_now; + } + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + /* Read the boot sector and return unlocked buffer head to it. */ + if (!(bh = read_ntfs_boot_sector(sb, silent))) { + if (!silent) + ntfs_error(sb, "Not an NTFS volume."); + goto err_out_now; + } + /* + * Extract the data from the boot sector and setup the ntfs volume + * using it. + */ + result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); + brelse(bh); + if (!result) { + if (!silent) + ntfs_error(sb, "Unsupported NTFS filesystem."); + goto err_out_now; + } + /* + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. + */ + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); + /* Setup remaining fields in the super block. */ + sb->s_magic = NTFS_SB_MAGIC; + /* + * Ntfs allows 63 bits for the file size, i.e. correct would be: + * sb->s_maxbytes = ~0ULL >> 1; + * But the kernel uses a long as the page cache page index which on + * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel + * defined to the maximum the page cache page index can cope with + * without overflowing the index or to 2^63 - 1, whichever is smaller. + */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Ntfs measures time in 100ns intervals. */ + sb->s_time_gran = 100; + /* + * Now load the metadata required for the page cache and our address + * space operations to function. We do this by setting up a specialised + * read_inode method and then just calling the normal iget() to obtain + * the inode for $MFT which is sufficient to allow our normal inode + * operations and associated address space operations to function. 
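+ * (In the code below the $MFT inode is in fact set up by hand using
+ * new_inode() and insert_inode_hash() and then read via
+ * ntfs_read_inode_mount() rather than through a plain iget() call.)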
+ */ + sb->s_op = &ntfs_sops; + tmp_ino = new_inode(sb); + if (!tmp_ino) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto err_out_now; + } + tmp_ino->i_ino = FILE_MFT; + insert_inode_hash(tmp_ino); + if (ntfs_read_inode_mount(tmp_ino) < 0) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto iput_tmp_ino_err_out_now; + } + mutex_lock(&ntfs_lock); + /* + * The current mount is a compression user if the cluster size is + * less than or equal 4kiB. + */ + if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { + result = allocate_compression_buffers(); + if (result) { + ntfs_error(NULL, "Failed to allocate buffers " + "for compression engine."); + ntfs_nr_compression_users--; + mutex_unlock(&ntfs_lock); + goto iput_tmp_ino_err_out_now; + } + } + /* + * Generate the global default upcase table if necessary. Also + * temporarily increment the number of upcase users to avoid race + * conditions with concurrent (u)mounts. + */ + if (!default_upcase) + default_upcase = generate_default_upcase(); + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + /* + * From now on, ignore @silent parameter. If we fail below this line, + * it will be due to a corrupt fs or a system error, so we report it. + */ + /* + * Open the system files with normal access functions and complete + * setting up the ntfs super block. + */ + if (!load_system_files(vol)) { + ntfs_error(sb, "Failed to load system files."); + goto unl_upcase_iput_tmp_ino_err_out_now; + } + if ((sb->s_root = d_alloc_root(vol->root_ino))) { + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); + ntfs_debug("Exiting, status successful."); + /* Release the default upcase if it has no users. */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + mutex_unlock(&ntfs_lock); + sb->s_export_op = &ntfs_export_ops; + lockdep_on(); + return 0; + } + ntfs_error(sb, "Failed to allocate root directory."); + /* Clean up after the successful load_system_files() call from above. */ + // TODO: Use ntfs_put_super() instead of repeating all this code... + // FIXME: Should mark the volume clean as the error is most likely + // -ENOMEM. + iput(vol->vol_ino); + vol->vol_ino = NULL; + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + iput(vol->root_ino); + vol->root_ino = NULL; + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } +#endif /* NTFS_RW */ + /* Throw away the table of attribute definitions. 
*/ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + if (vol->nls_map) { + unload_nls(vol->nls_map); + vol->nls_map = NULL; + } + /* Error exit code path. */ +unl_upcase_iput_tmp_ino_err_out_now: + /* + * Decrease the number of upcase users and destroy the global default + * upcase table if necessary. + */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); +iput_tmp_ino_err_out_now: + iput(tmp_ino); + if (vol->mft_ino && vol->mft_ino != tmp_ino) + iput(vol->mft_ino); + vol->mft_ino = NULL; + /* Errors at this stage are irrelevant. */ +err_out_now: + sb->s_fs_info = NULL; + kfree(vol); + ntfs_debug("Failed, returning -EINVAL."); + lockdep_on(); + return -EINVAL; +} + +/* + * This is a slab cache to optimize allocations and deallocations of Unicode + * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN + * (255) Unicode characters + a terminating NULL Unicode character. + */ +struct kmem_cache *ntfs_name_cache; + +/* Slab caches for efficient allocation/deallocation of inodes. */ +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; + +/* Init once constructor for the inode slab cache. */ +static void ntfs_big_inode_init_once(void *foo) +{ + ntfs_inode *ni = (ntfs_inode *)foo; + + inode_init_once(VFS_I(ni)); +} + +/* + * Slab caches to optimize allocations and deallocations of attribute search + * contexts and index contexts, respectively. + */ +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; + +/* Driver wide mutex. */ +DEFINE_MUTEX(ntfs_lock); + +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); +} + +static struct file_system_type ntfs_fs_type = { + .owner = THIS_MODULE, + .name = "ntfs", + .mount = ntfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* Stable names for the slab caches. */ +static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; +static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; +static const char ntfs_name_cache_name[] = "ntfs_name_cache"; +static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; +static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; + +static int __init init_ntfs_fs(void) +{ + int err = 0; + + /* This may be ugly but it results in pretty output so who cares. 
(-8 */ + printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" +#ifdef NTFS_RW + "W" +#else + "O" +#endif +#ifdef DEBUG + " DEBUG" +#endif +#ifdef MODULE + " MODULE" +#endif + "].\n"); + + ntfs_debug("Debug messages are enabled."); + + ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, + sizeof(ntfs_index_context), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_index_ctx_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_index_ctx_cache_name); + goto ictx_err_out; + } + ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, + sizeof(ntfs_attr_search_ctx), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_attr_ctx_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); + goto actx_err_out; + } + + ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, + (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ntfs_name_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_name_cache_name); + goto name_err_out; + } + + ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, + sizeof(ntfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!ntfs_inode_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_inode_cache_name); + goto inode_err_out; + } + + ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, + sizeof(big_ntfs_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + ntfs_big_inode_init_once); + if (!ntfs_big_inode_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_big_inode_cache_name); + goto big_inode_err_out; + } + + /* Register the ntfs sysctls. */ + err = ntfs_sysctl(1); + if (err) { + printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); + goto sysctl_err_out; + } + + err = register_filesystem(&ntfs_fs_type); + if (!err) { + ntfs_debug("NTFS driver registered successfully."); + return 0; /* Success! */ + } + printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); + +sysctl_err_out: + kmem_cache_destroy(ntfs_big_inode_cache); +big_inode_err_out: + kmem_cache_destroy(ntfs_inode_cache); +inode_err_out: + kmem_cache_destroy(ntfs_name_cache); +name_err_out: + kmem_cache_destroy(ntfs_attr_ctx_cache); +actx_err_out: + kmem_cache_destroy(ntfs_index_ctx_cache); +ictx_err_out: + if (!err) { + printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " + "registration...\n"); + err = -ENOMEM; + } + return err; +} + +static void __exit exit_ntfs_fs(void) +{ + ntfs_debug("Unregistering NTFS driver."); + + unregister_filesystem(&ntfs_fs_type); + kmem_cache_destroy(ntfs_big_inode_cache); + kmem_cache_destroy(ntfs_inode_cache); + kmem_cache_destroy(ntfs_name_cache); + kmem_cache_destroy(ntfs_attr_ctx_cache); + kmem_cache_destroy(ntfs_index_ctx_cache); + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +} + +MODULE_AUTHOR("Anton Altaparmakov "); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); +MODULE_VERSION(NTFS_VERSION); +MODULE_LICENSE("GPL"); +#ifdef DEBUG +module_param(debug_msgs, bool, 0); +MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); +#endif + +module_init(init_ntfs_fs) +module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c new file mode 100644 index 00000000..79a89184 --- /dev/null +++ b/fs/ntfs/sysctl.c @@ -0,0 +1,83 @@ +/* + * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. 
Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef DEBUG + +#include + +#ifdef CONFIG_SYSCTL + +#include +#include + +#include "sysctl.h" +#include "debug.h" + +/* Definition of the ntfs sysctl. */ +static ctl_table ntfs_sysctls[] = { + { + .procname = "ntfs-debug", + .data = &debug_msgs, /* Data pointer and size. */ + .maxlen = sizeof(debug_msgs), + .mode = 0644, /* Mode, proc handler. */ + .proc_handler = proc_dointvec + }, + {} +}; + +/* Define the parent directory /proc/sys/fs. */ +static ctl_table sysctls_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = ntfs_sysctls + }, + {} +}; + +/* Storage for the sysctls header. */ +static struct ctl_table_header *sysctls_root_table = NULL; + +/** + * ntfs_sysctl - add or remove the debug sysctl + * @add: add (1) or remove (0) the sysctl + * + * Add or remove the debug sysctl. Return 0 on success or -errno on error. + */ +int ntfs_sysctl(int add) +{ + if (add) { + BUG_ON(sysctls_root_table); + sysctls_root_table = register_sysctl_table(sysctls_root); + if (!sysctls_root_table) + return -ENOMEM; + } else { + BUG_ON(!sysctls_root_table); + unregister_sysctl_table(sysctls_root_table); + sysctls_root_table = NULL; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ +#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h new file mode 100644 index 00000000..d4f8ce92 --- /dev/null +++ b/fs/ntfs/sysctl.h @@ -0,0 +1,41 @@ +/* + * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_SYSCTL_H +#define _LINUX_NTFS_SYSCTL_H + + +#if defined(DEBUG) && defined(CONFIG_SYSCTL) + +extern int ntfs_sysctl(int add); + +#else + +/* Just return success. */ +static inline int ntfs_sysctl(int add) +{ + return 0; +} + +#endif /* DEBUG && CONFIG_SYSCTL */ +#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h new file mode 100644 index 00000000..01233989 --- /dev/null +++ b/fs/ntfs/time.h @@ -0,0 +1,100 @@ +/* + * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_TIME_H +#define _LINUX_NTFS_TIME_H + +#include /* For current_kernel_time(). */ +#include /* For do_div(). */ + +#include "endian.h" + +#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) + +/** + * utc2ntfs - convert Linux UTC time to NTFS time + * @ts: Linux UTC time to convert to NTFS time + * + * Convert the Linux UTC time @ts to its corresponding NTFS time and return + * that in little endian format. + * + * Linux stores time in a struct timespec consisting of a time_t (long at + * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second + * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of + * 1-nano-second intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100-nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline sle64 utc2ntfs(const struct timespec ts) +{ + /* + * Convert the seconds to 100ns intervals, add the nano-seconds + * converted to 100ns intervals, and then add the NTFS time offset. + */ + return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + + NTFS_TIME_OFFSET); +} + +/** + * get_current_ntfs_time - get the current time in little endian NTFS format + * + * Get the current time from the Linux kernel, convert it to its corresponding + * NTFS time and return that in little endian format. + */ +static inline sle64 get_current_ntfs_time(void) +{ + return utc2ntfs(current_kernel_time()); +} + +/** + * ntfs2utc - convert NTFS time to Linux time + * @time: NTFS time (little endian) to convert to Linux UTC + * + * Convert the little endian NTFS time @time to its corresponding Linux UTC + * time and return that in cpu format. 
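+ *
+ * As a rough worked example (plain arithmetic, nothing new is defined
+ * here): NTFS_TIME_OFFSET evaluates to (369 * 365 + 89) days * 86400 s/day
+ * * 10^7 hundred-nanosecond intervals per second = 116444736000000000, so
+ * the Linux epoch (tv_sec = 0, tv_nsec = 0) converts via utc2ntfs() to the
+ * NTFS time 116444736000000000, and feeding that value back through
+ * ntfs2utc() yields 1st January 1970, 00:00:00 UTC again.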
+ * + * Linux stores time in a struct timespec consisting of a time_t (long at + * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second + * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of + * 1-nano-second intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100 nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline struct timespec ntfs2utc(const sle64 time) +{ + struct timespec ts; + + /* Subtract the NTFS time offset. */ + u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); + /* + * Convert the time to 1-second intervals and the remainder to + * 1-nano-second intervals. + */ + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h new file mode 100644 index 00000000..8c8053b6 --- /dev/null +++ b/fs/ntfs/types.h @@ -0,0 +1,69 @@ +/* + * types.h - Defines for NTFS Linux kernel driver specific types. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_TYPES_H +#define _LINUX_NTFS_TYPES_H + +#include + +typedef __le16 le16; +typedef __le32 le32; +typedef __le64 le64; +typedef __u16 __bitwise sle16; +typedef __u32 __bitwise sle32; +typedef __u64 __bitwise sle64; + +/* 2-byte Unicode character type. */ +typedef le16 ntfschar; +#define UCHAR_T_SIZE_BITS 1 + +/* + * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN + * and VCN, to allow for type checking and better code readability. + */ +typedef s64 VCN; +typedef sle64 leVCN; +typedef s64 LCN; +typedef sle64 leLCN; + +/* + * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit + * values. We define our own type LSN, to allow for type checking and better + * code readability. + */ +typedef s64 LSN; +typedef sle64 leLSN; + +/* + * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. + * We define our own type USN, to allow for type checking and better code + * readability. + */ +typedef s64 USN; +typedef sle64 leUSN; + +typedef enum { + CASE_SENSITIVE = 0, + IGNORE_CASE = 1, +} IGNORE_CASE_BOOL; + +#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c new file mode 100644 index 00000000..005ca4b0 --- /dev/null +++ b/fs/ntfs/unistr.c @@ -0,0 +1,398 @@ +/* + * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include "types.h" +#include "debug.h" +#include "ntfs.h" + +/* + * IMPORTANT + * ========= + * + * All these routines assume that the Unicode characters are in little endian + * encoding inside the strings!!! + */ + +/* + * This is used by the name collation functions to quickly determine what + * characters are (in)valid. + */ +static const u8 legal_ansi_char_array[0x40] = { + 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, + + 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, +}; + +/** + * ntfs_are_names_equal - compare two Unicode names for equality + * @s1: name to compare to @s2 + * @s1_len: length in Unicode characters of @s1 + * @s2: name to compare to @s1 + * @s2_len: length in Unicode characters of @s2 + * @ic: ignore case bool + * @upcase: upcase table (only if @ic == IGNORE_CASE) + * @upcase_size: length in Unicode characters of @upcase (if present) + * + * Compare the names @s1 and @s2 and return 'true' (1) if the names are + * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, + * the @upcase table is used to performa a case insensitive comparison. + */ +bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size) +{ + if (s1_len != s2_len) + return false; + if (ic == CASE_SENSITIVE) + return !ntfs_ucsncmp(s1, s2, s1_len); + return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); +} + +/** + * ntfs_collate_names - collate two Unicode names + * @name1: first Unicode name to compare + * @name2: second Unicode name to compare + * @err_val: if @name1 contains an invalid character return this value + * @ic: either CASE_SENSITIVE or IGNORE_CASE + * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) + * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) + * + * ntfs_collate_names collates two Unicode names and returns: + * + * -1 if the first name collates before the second one, + * 0 if the names match, + * 1 if the second name collates before the first one, or + * @err_val if an invalid character is found in @name1 during the comparison. + * + * The following characters are considered invalid: '"', '*', '<', '>' and '?'. 
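+ *
+ * For example (using the values in legal_ansi_char_array below): collating
+ * the two character name { 'a', '*' } against { 'a', 'b' } returns @err_val
+ * because '*' (0x002a) has bit 3 set in that table, while collating
+ * { 'a', 'b' } against { 'a', 'c' } simply returns -1.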
+ */ +int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + u32 cnt, min_len; + u16 c1, c2; + + min_len = name1_len; + if (name1_len > name2_len) + min_len = name2_len; + for (cnt = 0; cnt < min_len; ++cnt) { + c1 = le16_to_cpu(*name1++); + c2 = le16_to_cpu(*name2++); + if (ic) { + if (c1 < upcase_len) + c1 = le16_to_cpu(upcase[c1]); + if (c2 < upcase_len) + c2 = le16_to_cpu(upcase[c2]); + } + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + } + if (name1_len < name2_len) + return -1; + if (name1_len == name2_len) + return 0; + /* name1_len > name2_len */ + c1 = le16_to_cpu(*name1); + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + return 1; +} + +/** + * ntfs_ucsncmp - compare two little endian Unicode strings + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * The strings in little endian format and appropriate le16_to_cpu() + * conversion is performed on non-little endian machines. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) +{ + u16 c1, c2; + size_t i; + + for (i = 0; i < n; ++i) { + c1 = le16_to_cpu(s1[i]); + c2 = le16_to_cpu(s2[i]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +/** + * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * @upcase: upcase table + * @upcase_size: upcase table size in Unicode characters + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * ignoring case. The strings in little endian format and appropriate + * le16_to_cpu() conversion is performed on non-little endian machines. + * + * Each character is uppercased using the @upcase table before the comparison. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. 
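+ *
+ * For instance, with the default upcase table built by
+ * generate_default_upcase(), 'a' (0x0061) is folded to 'A' (0x0041), so a
+ * one character comparison of "A" and "a" returns 0 here whereas the case
+ * sensitive ntfs_ucsncmp() returns -1 for the same strings.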
+ */ +int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size) +{ + size_t i; + u16 c1, c2; + + for (i = 0; i < n; ++i) { + if ((c1 = le16_to_cpu(s1[i])) < upcase_size) + c1 = le16_to_cpu(upcase[c1]); + if ((c2 = le16_to_cpu(s2[i])) < upcase_size) + c2 = le16_to_cpu(upcase[c2]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, + const u32 upcase_len) +{ + u32 i; + u16 u; + + for (i = 0; i < name_len; i++) + if ((u = le16_to_cpu(name[i])) < upcase_len) + name[i] = upcase[u]; +} + +void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len) +{ + ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, + file_name_attr->file_name_length, upcase, upcase_len); +} + +int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, + file_name_attr1->file_name_length, + (ntfschar*)&file_name_attr2->file_name, + file_name_attr2->file_name_length, + err_val, ic, upcase, upcase_len); +} + +/** + * ntfs_nlstoucs - convert NLS string to little endian Unicode string + * @vol: ntfs volume which we are working with + * @ins: input NLS string buffer + * @ins_len: length of input string in bytes + * @outs: on return contains the allocated output Unicode string buffer + * + * Convert the input string @ins, which is in whatever format the loaded NLS + * map dictates, into a little endian, 2-byte Unicode string. + * + * This function allocates the string and the caller is responsible for + * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. + * + * On success the function returns the number of Unicode characters written to + * the output string *@outs (>= 0), not counting the terminating Unicode NULL + * character. *@outs is set to the allocated output string buffer. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. Both *@outs and *@outs_len + * are then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs) +{ + struct nls_table *nls = vol->nls_map; + ntfschar *ucs; + wchar_t wc; + int i, o, wc_len; + + /* We do not trust outside sources. 
*/ + if (likely(ins)) { + ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); + if (likely(ucs)) { + for (i = o = 0; i < ins_len; i += wc_len) { + wc_len = nls->char2uni(ins + i, ins_len - i, + &wc); + if (likely(wc_len >= 0 && + o < NTFS_MAX_NAME_LEN)) { + if (likely(wc)) { + ucs[o++] = cpu_to_le16(wc); + continue; + } /* else if (!wc) */ + break; + } /* else if (wc_len < 0 || + o >= NTFS_MAX_NAME_LEN) */ + goto name_err; + } + ucs[o] = 0; + *outs = ucs; + return o; + } /* else if (!ucs) */ + ntfs_error(vol->sb, "Failed to allocate buffer for converted " + "name from ntfs_name_cache."); + return -ENOMEM; + } /* else if (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +name_err: + kmem_cache_free(ntfs_name_cache, ucs); + if (wc_len < 0) { + ntfs_error(vol->sb, "Name using character set %s contains " + "characters that cannot be converted to " + "Unicode.", nls->charset); + i = -EILSEQ; + } else /* if (o >= NTFS_MAX_NAME_LEN) */ { + ntfs_error(vol->sb, "Name is too long (maximum length for a " + "name on NTFS is %d Unicode characters.", + NTFS_MAX_NAME_LEN); + i = -ENAMETOOLONG; + } + return i; +} + +/** + * ntfs_ucstonls - convert little endian Unicode string to NLS string + * @vol: ntfs volume which we are working with + * @ins: input Unicode string buffer + * @ins_len: length of input string in Unicode characters + * @outs: on return contains the (allocated) output NLS string buffer + * @outs_len: length of output string buffer in bytes + * + * Convert the input little endian, 2-byte Unicode string @ins, of length + * @ins_len into the string format dictated by the loaded NLS. + * + * If *@outs is NULL, this function allocates the string and the caller is + * responsible for calling kfree(*@outs); when finished with it. In this case + * @outs_len is ignored and can be 0. + * + * On success the function returns the number of bytes written to the output + * string *@outs (>= 0), not counting the terminating NULL byte. If the output + * string buffer was allocated, *@outs is set to it. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. The contents of *@outs are + * then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len) +{ + struct nls_table *nls = vol->nls_map; + unsigned char *ns; + int i, o, ns_len, wc; + + /* We don't trust outside sources. */ + if (ins) { + ns = *outs; + ns_len = outs_len; + if (ns && !ns_len) { + wc = -ENAMETOOLONG; + goto conversion_err; + } + if (!ns) { + ns_len = ins_len * NLS_MAX_CHARSET_SIZE; + ns = kmalloc(ns_len + 1, GFP_NOFS); + if (!ns) + goto mem_err_out; + } + for (i = o = 0; i < ins_len; i++) { +retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, + ns_len - o); + if (wc > 0) { + o += wc; + continue; + } else if (!wc) + break; + else if (wc == -ENAMETOOLONG && ns != *outs) { + unsigned char *tc; + /* Grow in multiples of 64 bytes. */ + tc = kmalloc((ns_len + 64) & + ~63, GFP_NOFS); + if (tc) { + memcpy(tc, ns, ns_len); + ns_len = ((ns_len + 64) & ~63) - 1; + kfree(ns); + ns = tc; + goto retry; + } /* No memory so goto conversion_error; */ + } /* wc < 0, real error. 
*/ + goto conversion_err; + } + ns[o] = 0; + *outs = ns; + return o; + } /* else (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +conversion_err: + ntfs_error(vol->sb, "Unicode name contains characters that cannot be " + "converted to character set %s. You might want to " + "try to use the mount option nls=utf8.", nls->charset); + if (ns != *outs) + kfree(ns); + if (wc != -ENAMETOOLONG) + wc = -EILSEQ; + return wc; +mem_err_out: + ntfs_error(vol->sb, "Failed to allocate name!"); + return -ENOMEM; +} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c new file mode 100644 index 00000000..e2f72ca9 --- /dev/null +++ b/fs/ntfs/upcase.c @@ -0,0 +1,87 @@ +/* + * upcase.c - Generate the full NTFS Unicode upcase table in little endian. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001 Richard Russon + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS source + * in the file COPYING); if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "malloc.h" +#include "ntfs.h" + +ntfschar *generate_default_upcase(void) +{ + static const int uc_run_table[][3] = { /* Start, End, Add */ + {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, + {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, + {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, + {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, + {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, + {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, + {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, + {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, + {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, + {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, + {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, + {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, + {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, + {0} + }; + + static const int uc_dup_table[][2] = { /* Start, End */ + {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, + {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, + {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, + {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, + {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, + {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, + {0} + }; + + static const int uc_word_table[][2] = { /* Offset, Value */ + {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, + {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, + {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, + 
{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, + {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, + {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, + {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, + {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, + {0} + }; + + int i, r; + ntfschar *uc; + + uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); + if (!uc) + return uc; + memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. */ + for (i = 0; i < default_upcase_len; i++) + uc[i] = cpu_to_le16(i); + for (r = 0; uc_run_table[r][0]; r++) + for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) + le16_add_cpu(&uc[i], uc_run_table[r][2]); + for (r = 0; uc_dup_table[r][0]; r++) + for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) + le16_add_cpu(&uc[i + 1], -1); + for (r = 0; uc_word_table[r][0]; r++) + uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); + return uc; +} diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c new file mode 100644 index 00000000..b2bc0d55 --- /dev/null +++ b/fs/ntfs/usnjrnl.c @@ -0,0 +1,84 @@ +/* + * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include +#include +#include + +#include "aops.h" +#include "debug.h" +#include "endian.h" +#include "time.h" +#include "types.h" +#include "usnjrnl.h" +#include "volume.h" + +/** + * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume + * @vol: ntfs volume on which to stamp the transaction log + * + * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return + * 'true' on success and 'false' on error. + * + * This function assumes that the transaction log has already been loaded and + * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). 
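+ *
+ * Concretely, stamping (see the body below) writes the current NTFS time
+ * into the journal_id field of $UsnJrnl/$DATA/$Max and the current byte
+ * size of $UsnJrnl/$DATA/$J into its lowest_valid_usn field, so if $DATA/$J
+ * is currently 0x1000 bytes long the next USN_RECORD appended to it would
+ * start at usn 0x1000.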
+ */ +bool ntfs_stamp_usnjrnl(ntfs_volume *vol) +{ + ntfs_debug("Entering."); + if (likely(!NVolUsnJrnlStamped(vol))) { + sle64 stamp; + struct page *page; + USN_HEADER *uh; + + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from " + "$UsnJrnl/$DATA/$Max attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + stamp = get_current_ntfs_time(); + ntfs_debug("Stamping transaction log ($UsnJrnl): old " + "journal_id 0x%llx, old lowest_valid_usn " + "0x%llx, new journal_id 0x%llx, new " + "lowest_valid_usn 0x%llx.", + (long long)sle64_to_cpu(uh->journal_id), + (long long)sle64_to_cpu(uh->lowest_valid_usn), + (long long)sle64_to_cpu(stamp), + i_size_read(vol->usnjrnl_j_ino)); + uh->lowest_valid_usn = + cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); + uh->journal_id = stamp; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetUsnJrnlStamped(vol); + } + ntfs_debug("Done."); + return true; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h new file mode 100644 index 00000000..00d8e6bd --- /dev/null +++ b/fs/ntfs/usnjrnl.h @@ -0,0 +1,205 @@ +/* + * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_USNJRNL_H +#define _LINUX_NTFS_USNJRNL_H + +#ifdef NTFS_RW + +#include "types.h" +#include "endian.h" +#include "layout.h" +#include "volume.h" + +/* + * Transaction log ($UsnJrnl) organization: + * + * The transaction log records whenever a file is modified in any way. So for + * example it will record that file "blah" was written to at a particular time + * but not what was written. If will record that a file was deleted or + * created, that a file was truncated, etc. See below for all the reason + * codes used. + * + * The transaction log is in the $Extend directory which is in the root + * directory of each volume. If it is not present it means transaction + * logging is disabled. If it is present it means transaction logging is + * either enabled or in the process of being disabled in which case we can + * ignore it as it will go away as soon as Windows gets its hands on it. + * + * To determine whether the transaction logging is enabled or in the process + * of being disabled, need to check the volume flags in the + * $VOLUME_INFORMATION attribute in the $Volume system file (which is present + * in the root directory and has a fixed mft record number, see layout.h). 
+ * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log + * is in the process of being disabled and if this flag is clear it means the + * transaction log is enabled. + * + * The transaction log consists of two parts; the $DATA/$Max attribute as well + * as the $DATA/$J attribute. $Max is a header describing the transaction + * log whilst $J is the transaction log data itself as a sequence of variable + * sized USN_RECORDs (see below for all the structures). + * + * We do not care about transaction logging at this point in time but we still + * need to let windows know that the transaction log is out of date. To do + * this we need to stamp the transaction log. This involves setting the + * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used + * for the next added USN_RECORD to the $DATA/$J attribute as well as + * generating a new journal_id in $DATA/$Max. + * + * The journal_id is as of the current version (2.0) of the transaction log + * simply the 64-bit timestamp of when the journal was either created or last + * stamped. + * + * To determine the next usn there are two ways. The first is to parse + * $DATA/$J and to find the last USN_RECORD in it and to add its record_length + * to its usn (which is the byte offset in the $DATA/$J attribute). The + * second is simply to take the data size of the attribute. Since the usns + * are simply byte offsets into $DATA/$J, this is exactly the next usn. For + * obvious reasons we use the second method as it is much simpler and faster. + * + * As an aside, note that to actually disable the transaction log, one would + * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go + * through all the mft records on the volume and set the usn field in their + * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need + * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, + * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. + * + * Note that if a volume is unmounted whilst the transaction log is being + * disabled, the process will continue the next time the volume is mounted. + * This is why we can safely mount read-write when we see a transaction log + * in the process of being deleted. + */ + +/* Some $UsnJrnl related constants. */ +#define UsnJrnlMajorVer 2 +#define UsnJrnlMinorVer 0 + +/* + * $DATA/$Max attribute. This is (always?) resident and has a fixed size of + * 32 bytes. It contains the header describing the transaction log. + */ +typedef struct { +/*Ofs*/ +/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J + attribute. */ +/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the + size of the $DATA/$J attribute. */ +/*0x10*/sle64 journal_id; /* Current id of the transaction log. */ +/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the + current journal_id. */ +/* sizeof() = 32 (0x20) bytes */ +} __attribute__ ((__packed__)) USN_HEADER; + +/* + * Reason flags (32-bit). Cumulative flags describing the change(s) to the + * file since it was last opened. 
I think the names speak for themselves but + * if you disagree check out the descriptions in the Linux NTFS project NTFS + * documentation: http://www.linux-ntfs.org/ + */ +enum { + USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), + USN_REASON_CLOSE = cpu_to_le32(0x80000000), +}; + +typedef le32 USN_REASON_FLAGS; + +/* + * Source info flags (32-bit). Information about the source of the change(s) + * to the file. For detailed descriptions of what these mean, see the Linux + * NTFS project NTFS documentation: + * http://www.linux-ntfs.org/ + */ +enum { + USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), +}; + +typedef le32 USN_SOURCE_INFO_FLAGS; + +/* + * $DATA/$J attribute. This is always non-resident, is marked as sparse, and + * is of variabled size. It consists of a sequence of variable size + * USN_RECORDS. The minimum allocated_size is allocation_delta as + * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is + * exceeded by more than allocation_delta bytes, allocation_delta bytes are + * allocated and appended to the $DATA/$J attribute and an equal number of + * bytes at the beginning of the attribute are freed and made sparse. Note the + * making sparse only happens at volume checkpoints and hence the actual + * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. + */ +typedef struct { +/*Ofs*/ +/* 0*/le32 length; /* Byte size of this record (8-byte + aligned). */ +/* 4*/le16 major_ver; /* Major version of the transaction log used + for this record. */ +/* 6*/le16 minor_ver; /* Minor version of the transaction log used + for this record. */ +/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or + directory) described by this record. */ +/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent + directory of the file described by this + record. */ +/*0x18*/leUSN usn; /* The usn of this record. Equals the offset + within the $DATA/$J attribute. */ +/*0x20*/sle64 time; /* Time when this record was created. */ +/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ +/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ +/*0x30*/le32 security_id; /* File security_id copied from + $STANDARD_INFORMATION. 
*/ +/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from + $STANDARD_INFORMATION or $FILE_NAME (not + sure which). */ +/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ +/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the + start of this record. */ +/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use + file_name_offset to determine the location + of the name. */ +/* sizeof() = 60 (0x3c) bytes */ +} __attribute__ ((__packed__)) USN_RECORD; + +extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h new file mode 100644 index 00000000..406ab55d --- /dev/null +++ b/fs/ntfs/volume.h @@ -0,0 +1,177 @@ +/* + * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part + * of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_VOLUME_H +#define _LINUX_NTFS_VOLUME_H + +#include + +#include "types.h" +#include "layout.h" + +/* + * The NTFS in memory super block structure. + */ +typedef struct { + /* + * FIXME: Reorder to have commonly used together element within the + * same cache line, aiming at a cache line size of 32 bytes. Aim for + * 64 bytes for less commonly used together elements. Put most commonly + * used elements to front of structure. Obviously do this only when the + * structure has stabilized... (AIA) + */ + /* Device specifics. */ + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes + sized blocks on the device. */ + /* Configuration provided by user at mount time. */ + unsigned long flags; /* Miscellaneous flags, see below. */ + uid_t uid; /* uid that files will be mounted as. */ + gid_t gid; /* gid that files will be mounted as. */ + mode_t fmask; /* The mask for file permissions. */ + mode_t dmask; /* The mask for directory + permissions. */ + u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ + u8 on_errors; /* What to do on filesystem errors. */ + /* NTFS bootsector provided information. 
*/ + u16 sector_size; /* in bytes */ + u8 sector_size_bits; /* log2(sector_size) */ + u32 cluster_size; /* in bytes */ + u32 cluster_size_mask; /* cluster_size - 1 */ + u8 cluster_size_bits; /* log2(cluster_size) */ + u32 mft_record_size; /* in bytes */ + u32 mft_record_size_mask; /* mft_record_size - 1 */ + u8 mft_record_size_bits; /* log2(mft_record_size) */ + u32 index_record_size; /* in bytes */ + u32 index_record_size_mask; /* index_record_size - 1 */ + u8 index_record_size_bits; /* log2(index_record_size) */ + LCN nr_clusters; /* Volume size in clusters == number of + bits in lcn bitmap. */ + LCN mft_lcn; /* Cluster location of mft data. */ + LCN mftmirr_lcn; /* Cluster location of copy of mft. */ + u64 serial_no; /* The volume serial number. */ + /* Mount specific NTFS information. */ + u32 upcase_len; /* Number of entries in upcase[]. */ + ntfschar *upcase; /* The upcase table. */ + + s32 attrdef_size; /* Size of the attribute definition + table in bytes. */ + ATTR_DEF *attrdef; /* Table of attribute definitions. + Obtained from FILE_AttrDef. */ + +#ifdef NTFS_RW + /* Variables used by the cluster and mft allocators. */ + s64 mft_data_pos; /* Mft record number at which to + allocate the next mft record. */ + LCN mft_zone_start; /* First cluster of the mft zone. */ + LCN mft_zone_end; /* First cluster beyond the mft zone. */ + LCN mft_zone_pos; /* Current position in the mft zone. */ + LCN data1_zone_pos; /* Current position in the first data + zone. */ + LCN data2_zone_pos; /* Current position in the second data + zone. */ +#endif /* NTFS_RW */ + + struct inode *mft_ino; /* The VFS inode of $MFT. */ + + struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ + struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the + mft record bitmap ($MFT/$BITMAP). */ +#ifdef NTFS_RW + struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ + int mftmirr_size; /* Size of mft mirror in mft records. */ + + struct inode *logfile_ino; /* The VFS inode of $LogFile. */ +#endif /* NTFS_RW */ + + struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ + struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the + cluster bitmap ($Bitmap/$DATA). */ + + struct inode *vol_ino; /* The VFS inode of $Volume. */ + VOLUME_FLAGS vol_flags; /* Volume flags. */ + u8 major_ver; /* Ntfs major version of volume. */ + u8 minor_ver; /* Ntfs minor version of volume. */ + + struct inode *root_ino; /* The VFS inode of the root + directory. */ + struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ + only, otherwise NULL). */ + struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ + only, otherwise NULL). */ +#ifdef NTFS_RW + /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *quota_ino; /* The VFS inode of $Quota. */ + struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ + /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ + struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ + struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ +#endif /* NTFS_RW */ + struct nls_table *nls_map; +} ntfs_volume; + +/* + * Defined bits for the flags field in the ntfs_volume structure. + */ +typedef enum { + NV_Errors, /* 1: Volume has errors, prevent remount rw. */ + NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). 
*/ + NV_CaseSensitive, /* 1: Treat file names as case sensitive and + create filenames in the POSIX namespace. + Otherwise be case insensitive but still + create file names in POSIX namespace. */ + NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ + NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ + NV_SparseEnabled, /* 1: May create sparse files. */ +} ntfs_volume_flags; + +/* + * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() + * functions. + */ +#define DEFINE_NVOL_BIT_OPS(flag) \ +static inline int NVol##flag(ntfs_volume *vol) \ +{ \ + return test_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolSet##flag(ntfs_volume *vol) \ +{ \ + set_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolClear##flag(ntfs_volume *vol) \ +{ \ + clear_bit(NV_##flag, &(vol)->flags); \ +} + +/* Emit the ntfs volume bitops functions. */ +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) + +#endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig new file mode 100644 index 00000000..77a8de5f --- /dev/null +++ b/fs/ocfs2/Kconfig @@ -0,0 +1,76 @@ +config OCFS2_FS + tristate "OCFS2 file system support" + depends on NET && SYSFS && CONFIGFS_FS + select JBD2 + select CRC32 + select QUOTA + select QUOTA_TREE + select FS_POSIX_ACL + help + OCFS2 is a general purpose extent based shared disk cluster file + system with many similarities to ext3. It supports 64 bit inode + numbers, and has automatically extending metadata groups which may + also make it attractive for non-clustered use. + + You'll want to install the ocfs2-tools package in order to at least + get "mount.ocfs2". + + Project web page: http://oss.oracle.com/projects/ocfs2 + Tools web page: http://oss.oracle.com/projects/ocfs2-tools + OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + + For more information on OCFS2, see the file + . + +config OCFS2_FS_O2CB + tristate "O2CB Kernelspace Clustering" + depends on OCFS2_FS + default y + help + OCFS2 includes a simple kernelspace clustering package, the OCFS2 + Cluster Base. It only requires a very small userspace component + to configure it. This comes with the standard ocfs2-tools package. + O2CB is limited to maintaining a cluster for OCFS2 file systems. + It cannot manage any other cluster applications. + + It is always safe to say Y here, as the clustering method is + run-time selectable. + +config OCFS2_FS_USERSPACE_CLUSTER + tristate "OCFS2 Userspace Clustering" + depends on OCFS2_FS && DLM + default y + help + This option will allow OCFS2 to use userspace clustering services + in conjunction with the DLM in fs/dlm. If you are using a + userspace cluster manager, say Y here. + + It is safe to say Y, as the clustering method is run-time + selectable. + +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS && DEBUG_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + +config OCFS2_DEBUG_MASKLOG + bool "OCFS2 logging support" + depends on OCFS2_FS + default y + help + The ocfs2 filesystem has an extensive logging system. The system + allows selection of events to log via files in /sys/o2cb/logmask/. 
+ This option will enlarge your kernel, but it allows debugging of + ocfs2 filesystem issues. + +config OCFS2_DEBUG_FS + bool "OCFS2 expensive checks" + depends on OCFS2_FS + default n + help + This option will enable expensive consistency checks. Enable + this option for debugging only as it is likely to decrease + performance of the filesystem. diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile new file mode 100644 index 00000000..f17e58b3 --- /dev/null +++ b/fs/ocfs2/Makefile @@ -0,0 +1,54 @@ +ccflags-y := -Ifs/ocfs2 + +ccflags-y += -DCATCH_BH_JBD_RACES + +obj-$(CONFIG_OCFS2_FS) += \ + ocfs2.o \ + ocfs2_stackglue.o + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o +obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o + +ocfs2-objs := \ + alloc.o \ + aops.o \ + blockcheck.o \ + buffer_head_io.o \ + dcache.o \ + dir.o \ + dlmglue.o \ + export.o \ + extent_map.o \ + file.o \ + heartbeat.o \ + inode.o \ + ioctl.o \ + journal.o \ + localalloc.o \ + locks.o \ + mmap.o \ + namei.o \ + refcounttree.o \ + reservations.o \ + move_extents.o \ + resize.o \ + slot_map.o \ + suballoc.o \ + super.o \ + symlink.o \ + sysfile.o \ + uptodate.o \ + ver.o \ + quota_local.o \ + quota_global.o \ + xattr.o \ + acl.o + +ocfs2_stackglue-objs := stackglue.o +ocfs2_stack_o2cb-objs := stack_o2cb.o +ocfs2_stack_user-objs := stack_user.o + +obj-$(CONFIG_OCFS2_FS) += dlmfs/ +# cluster/ is always needed when OCFS2_FS for masklog support +obj-$(CONFIG_OCFS2_FS) += cluster/ +obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c new file mode 100644 index 00000000..e913ad13 --- /dev/null +++ b/fs/ocfs2/acl.c @@ -0,0 +1,535 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/acl.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from xattr value to acl struct. + */ +static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) +{ + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(struct posix_acl_entry)) + return ERR_PTR(-EINVAL); + + count = size / sizeof(struct posix_acl_entry); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + struct ocfs2_acl_entry *entry = + (struct ocfs2_acl_entry *)value; + + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + value += sizeof(struct posix_acl_entry); + + } + return acl; +} + +/* + * Convert acl struct to xattr value. 
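+ *
+ * Each ACL entry is written out as a struct ocfs2_acl_entry with its
+ * e_tag, e_perm and e_id fields stored little-endian; *size is set to
+ * the byte length of the returned buffer.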
+ */ +static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) +{ + struct ocfs2_acl_entry *entry = NULL; + char *ocfs2_acl; + size_t n; + + *size = acl->a_count * sizeof(struct posix_acl_entry); + + ocfs2_acl = kmalloc(*size, GFP_NOFS); + if (!ocfs2_acl) + return ERR_PTR(-ENOMEM); + + entry = (struct ocfs2_acl_entry *)ocfs2_acl; + for (n = 0; n < acl->a_count; n++, entry++) { + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + } + return ocfs2_acl; +} + +static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, + int type, + struct buffer_head *di_bh) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + "", value, retval); + } + + if (retval > 0) + acl = ocfs2_acl_from_xattr(value, retval); + else if (retval == -ENODATA || retval == 0) + acl = NULL; + else + acl = ERR_PTR(retval); + + kfree(value); + + return acl; +} + + +/* + * Get posix acl. + */ +static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + acl = ERR_PTR(ret); + return acl; + } + + acl = ocfs2_get_acl_nolock(inode, type, di_bh); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return acl; +} + +/* + * Helper function to set i_mode in memory and disk. Some call paths + * will not have di_bh or a journal handle to pass, in which case it + * will create it's own. + */ +static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, umode_t new_mode) +{ + int ret, commit_handle = 0; + struct ocfs2_dinode *di; + + if (di_bh == NULL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } else + get_bh(di_bh); + + if (handle == NULL) { + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_brelse; + } + + commit_handle = 1; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_mode = new_mode; + inode->i_ctime = CURRENT_TIME; + di->i_mode = cpu_to_le16(inode->i_mode); + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + if (commit_handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out_brelse: + brelse(di_bh); +out: + return ret; +} + +/* + * Set the access or default ACL of an inode. 
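+ *
+ * For ACL_TYPE_ACCESS, an ACL that is equivalent to the file mode is
+ * dropped and only i_mode is updated via ocfs2_acl_set_mode().
+ * ACL_TYPE_DEFAULT is only meaningful on directories.  When a journal
+ * handle is passed in, the xattr is written with
+ * ocfs2_xattr_set_handle(); otherwise ocfs2_xattr_set() is used.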
+ */ +static int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + else { + if (ret == 0) + acl = NULL; + + ret = ocfs2_acl_set_mode(inode, di_bh, + handle, mode); + if (ret) + return ret; + + } + } + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = ocfs2_acl_to_xattr(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + if (handle) + ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, + "", value, size, 0, + meta_ac, data_ac); + else + ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + + return ret; +} + +int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct ocfs2_super *osb; + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret = -EAGAIN; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + osb = OCFS2_SB(inode->i_sb); + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return ret; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + + brelse(di_bh); + + if (IS_ERR(acl)) { + mlog_errno(PTR_ERR(acl)); + return PTR_ERR(acl); + } + if (acl) { + ret = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return ret; + } + + return -EAGAIN; +} + +int ocfs2_acl_chmod(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl, *clone; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + clone, NULL, NULL); + posix_acl_release(clone); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. 
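+ *
+ * When the directory carries no default ACL (or ACL support is not
+ * enabled on the mount), the new inode's mode is simply masked with
+ * the current umask.  Otherwise the default ACL is copied onto new
+ * directories and posix_acl_create_masq() derives the access ACL and
+ * the adjusted i_mode for the new inode.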
+ */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + mode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + struct posix_acl *clone; + + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + clone, meta_ac, data_ac); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return ret; +} + +static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, + char *list, + size_t list_len, + const char *name, + size_t name_len, + int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, + char *list, + size_t list_len, + const char *name, + size_t name_len, + int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct posix_acl *acl; + int ret; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ocfs2_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return ret; +} + +static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret = 0; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto cleanup; + } + } else + acl = NULL; + + ret = ocfs2_set_acl(NULL, inode, NULL, type, 
acl, NULL, NULL); + +cleanup: + posix_acl_release(acl); + return ret; +} + +const struct xattr_handler ocfs2_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ocfs2_xattr_list_acl_access, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, +}; + +const struct xattr_handler ocfs2_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ocfs2_xattr_list_acl_default, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, +}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h new file mode 100644 index 00000000..4fe7c9cf --- /dev/null +++ b/fs/ocfs2/acl.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_ACL_H +#define OCFS2_ACL_H + +#include + +struct ocfs2_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +extern int ocfs2_check_acl(struct inode *, int, unsigned int); +extern int ocfs2_acl_chmod(struct inode *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); + +#endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 00000000..76c8165d --- /dev/null +++ b/fs/ocfs2/alloc.c @@ -0,0 +1,7352 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init__extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If this extent tree is supported by an extent map, insert + * a record into the map. + */ + void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + + /* + * If this extent tree is supported by an extent map, truncate the + * map to clusters, + */ + void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, + u32 clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. + */ + enum ocfs2_contig_type + (*eo_extent_contig)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. 
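+ *
+ * Several of the ocfs2_dinode_* callbacks BUG_ON() when
+ * et->et_ops != &ocfs2_dinode_et_ops, which catches extent trees that
+ * were initialized with the wrong operations vector.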
+ */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters); +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, + .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&oi->ip_lock); + oi->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_insert_rec(inode, rec); +} + +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_trunc(inode, clusters); +} + +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb); + + BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)oi->ip_blkno, + rec->e_cpos, oi->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + return 0; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + et->et_root_el = &vb->vb_xv->xr_list; +} + +static 
void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + dx_root->dr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + return le64_to_cpu(dx_root->dr_last_eb_blk); +} + +static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + le32_add_cpu(&dx_root->dr_clusters, clusters); +} + +static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); + + return 0; +} + +static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + et->et_root_el = &dx_root->dr_list; +} + +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, + .eo_update_clusters 
= ocfs2_dx_root_update_clusters, + .eo_sanity_check = ocfs2_dx_root_sanity_check, + .eo_fill_root_el = ocfs2_dx_root_fill_root_el, +}; + +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type +ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + return CONTIG_NONE; +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_extent_contig = ocfs2_refcount_tree_extent_contig, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + ocfs2_journal_access_func access, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + et->et_ci = ci; + et->et_root_journal_access = access; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, + NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, + NULL, &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb) +{ + __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb, + &ocfs2_xattr_value_et_ops); +} + +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, + NULL, &ocfs2_dx_root_et_ops); +} + +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, + 
u32 clusters) +{ + et->et_ops->eo_update_clusters(et, clusters); +} + +static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + if (et->et_ops->eo_extent_map_insert) + et->et_ops->eo_extent_map_insert(et, rec); +} + +static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + if (et->et_ops->eo_extent_map_truncate) + et->et_ops->eo_extent_map_truncate(et, clusters); +} + +static inline int ocfs2_et_root_journal_access(handle_t *handle, + struct ocfs2_extent_tree *et, + int type) +{ + return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, + type); +} + +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *insert_rec) +{ + if (et->et_ops->eo_extent_contig) + return et->et_ops->eo_extent_contig(et, rec, insert_rec); + + return ocfs2_extent_rec_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, insert_rec); +} + +static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(et); + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb); +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; + + if (keep_root) + start = 1; + + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; + + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + else + path_root_access(path) = NULL; + + path->p_tree_depth = depth; +} + +void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * All the elements of src into dest. After this call, src could be freed + * without affecting dest. + * + * Both paths should have the same root. Any non-root elements of dest + * will be freed. + */ +static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_el(dest) != path_root_el(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); + + ocfs2_reinit_path(dest, 1); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + if (dest->p_node[i].bh) + get_bh(dest->p_node[i].bh); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. 
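+ *
+ * Unlike ocfs2_cp_path(), the buffer_head references of the non-root
+ * elements are transferred to dest rather than duplicated, so no extra
+ * get_bh()/brelse() pair is needed.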
+ */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el, + ocfs2_journal_access_func access) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + path_root_access(path) = access; + } + + return path; +} + +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +{ + return ocfs2_new_path(path_root_bh(path), path_root_el(path), + path_root_access(path)); +} + +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +{ + return ocfs2_new_path(et->et_root_bh, et->et_root_el, + et->et_root_journal_access); +} + +/* + * Journal the buffer at depth idx. All idx>0 are extent_blocks, + * otherwise it's the root_access function. + * + * I don't like the way this function's name looks next to + * ocfs2_journal_access_path(), but I don't have a better one. + */ +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) +{ + ocfs2_journal_access_func access = path_root_access(path); + + if (!access) + access = ocfs2_journal_access; + + if (idx) + access = ocfs2_journal_access_eb; + + return access(handle, ci, path->p_node[idx].bh, + OCFS2_JOURNAL_ACCESS_WRITE); +} + +/* + * Convenience function to journal all components in a path. + */ +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_path_bh_journal_access(handle, ci, path, i); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. 
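+ *
+ * For example, with two records covering clusters [0, 8) and [8, 16),
+ * a lookup of v_cluster 10 returns index 1 while a lookup of
+ * v_cluster 20 returns -1.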
+ */ +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); + + rec_end = rec_start + clusters; + + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } + } + + return ret; +} + +/* + * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and + * ocfs2_extent_rec_contig only work properly against leaf nodes! + */ +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + u64 blk_end = le64_to_cpu(ext->e_blkno); + + blk_end += ocfs2_clusters_to_blocks(sb, + le16_to_cpu(ext->e_leaf_clusters)); + + return blkno == blk_end; +} + +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + u32 left_range; + + left_range = le32_to_cpu(left->e_cpos) + + le16_to_cpu(left->e_leaf_clusters); + + return (left_range == le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + /* + * Refuse to coalesce extent records with different flag + * fields - we don't want to mix unwritten extents with user + * data. + */ + if (ext->e_flags != insert_rec->e_flags) + return CONTIG_NONE; + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; +} + +/* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +enum ocfs2_split_type { + SPLIT_NONE = 0, + SPLIT_LEFT, + SPLIT_RIGHT, +}; + +struct ocfs2_insert_type { + enum ocfs2_split_type ins_split; + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_tree_depth; +}; + +struct ocfs2_merge_ctxt { + enum ocfs2_contig_type c_contig_type; + int c_has_empty_extent; + int c_split_covers_rec; +}; + +static int ocfs2_validate_extent_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + /* + * Errors after here are fatal. 
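+ *
+ * The block must carry OCFS2_EXTENT_BLOCK_SIGNATURE, its h_blkno must
+ * match the block number it was read from, and h_fs_generation must
+ * match the mounted filesystem; each mismatch is reported with
+ * ocfs2_error() and fails with -EINVAL.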
+ */ + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + return -EINVAL; + } + + if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extent block #%llu has an invalid " + "h_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + return -EINVAL; + } + + return 0; +} + +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, eb_blkno, &tmp, + ocfs2_validate_extent_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + + +/* + * How many free extents have we got before we need more meta data? + */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et) +{ + int retval; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; + + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); + + if (last_eb_blk) { + retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, + &eb_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + brelse(eb_bh); + + trace_ocfs2_num_free_extents(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(handle_t *handle, + struct ocfs2_extent_tree *et, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + struct ocfs2_extent_block *eb; + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(handle, + meta_ac, + wanted - count, + &suballoc_loc, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. 
*/ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = + cpu_to_le16(meta_ac->ac_alloc_slot); + eb->h_suballoc_loc = cpu_to_le64(suballoc_loc); + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + ocfs2_journal_dirty(handle, bhs[i]); + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + mlog_errno(status); + } + return status; +} + +/* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); +} + +/* + * Change range of the branches in the right most path according to the leaf + * extent block's rightmost record. + */ +static int ocfs2_adjust_rightmost_branch(handle_t *handle, + struct ocfs2_extent_tree *et) +{ + int status; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + path = ocfs2_new_path_from_et(et); + if (!path) { + status = -ENOMEM; + return status; + } + + status = ocfs2_find_path(et->et_ci, path, UINT_MAX); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_extend_trans(handle, path_num_items(path)); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_path(et->et_ci, handle, path); + if (status < 0) { + mlog_errno(status); + goto out; + } + + el = path_leaf_el(path); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1]; + + ocfs2_adjust_rightmost_records(handle, et, path, rec); + +out: + ocfs2_free_path(path); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the root + * structure. + * + * last_eb_bh is required as we have to update it's next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with cluster count == 0. + */ +static int ocfs2_add_branch(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head *eb_bh, + struct buffer_head **last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + u32 new_cpos, root_end; + + BUG_ON(!last_eb_bh || !*last_eb_bh); + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = et->et_root_el; + + /* we never add a branch to a leaf. 
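+ *
+ * One new extent block is needed for every tree level below this
+ * point, so new_blocks below is simply the tree depth recorded in el.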
*/ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + root_end = ocfs2_sum_rightmost_rec(et->et_root_el); + + /* + * If there is a gap before the root end and the real end + * of the righmost leaf block, we need to remove the gap + * between new_cpos and root_end first so that the tree + * is consistent after we add a new branch(it will start + * from new_cpos). + */ + if (root_end > new_cpos) { + trace_ocfs2_adjust_rightmost_branch( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + root_end, new_cpos); + + status = ocfs2_adjust_rightmost_branch(handle, et); + if (status) { + mlog_errno(status); + goto bail; + } + } + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversly, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. */ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); + eb_el = &eb->h_list; + + status = ocfs2_journal_access_eb(handle, et->et_ci, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + /* + * eb_el isn't always an interior node, but even leaf + * nodes want a zero'd flags and reserved field so + * this gets the whole 32 bits regardless of use. + */ + eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + ocfs2_journal_dirty(handle, bh); + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the root_bh, or the extent block passed in. 
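+ *
+ * The record added here points at the topmost block of the new branch
+ * (next_blkno) starting at new_cpos; afterwards last_eb_blk and the
+ * old rightmost leaf's h_next_leaf_blk are updated to point at the
+ * new rightmost leaf.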
*/ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + el->l_recs[i].e_int_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + ocfs2_journal_dirty(handle, *last_eb_bh); + ocfs2_journal_dirty(handle, et->et_root_bh); + if (eb_bh) + ocfs2_journal_dirty(handle, eb_bh); + + /* + * Some callers want to track the rightmost leaf so pass it + * back here. + */ + brelse(*last_eb_bh); + get_bh(new_eb_bhs[0]); + *last_eb_bh = new_eb_bhs[0]; + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. + */ +static int ocfs2_shift_tree_depth(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + u32 new_clusters; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *root_el; + struct ocfs2_extent_list *eb_el; + + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); + + eb_el = &eb->h_list; + root_el = et->et_root_el; + + status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; + + ocfs2_journal_dirty(handle, new_eb_bh); + + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + brelse(new_eb_bh); + + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. 
There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the root_el has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the root_el is full, in + * which case we return > 0 + * + * return status < 0 indicates an error. + */ +static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + *target_bh = NULL; + + el = et->et_root_el; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty " + "extent list (next_free_rec == 0)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent " + "list where extent # %d has no physical " + "block start", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); + status = -EIO; + goto bail; + } + + brelse(bh); + bh = NULL; + + status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + brelse(bh); + + return status; +} + +/* + * Grow a b-tree so that it has more records. + * + * We might shift the tree depth in which case existing paths should + * be considered invalid. + * + * Tree depth after the grow is returned via *final_depth. + * + * *last_eb_bh will be updated by ocfs2_add_branch(). + */ +static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, + int *final_depth, struct buffer_head **last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, shift; + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); + struct buffer_head *bh = NULL; + + BUG_ON(meta_ac == NULL); + + shift = ocfs2_find_branch_target(et, &bh); + if (shift < 0) { + ret = shift; + mlog_errno(ret); + goto out; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + BUG_ON(bh); + trace_ocfs2_grow_tree( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + depth); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). */ + ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + depth++; + if (depth == 1) { + /* + * Special case: we have room now if we shifted from + * tree_depth 0, so no more work needs to be done. + * + * We won't be calling add_branch, so pass + * back *last_eb_bh as the new leaf. 
At depth + * zero, it should always be null so there's + * no reason to brelse. + */ + BUG_ON(*last_eb_bh); + get_bh(bh); + *last_eb_bh = bh; + goto out; + } + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, + meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (final_depth) + *final_depth = depth; + brelse(bh); + return ret; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + trace_ocfs2_rotate_leaf(insert_cpos, insert_index, + has_empty, next_free, + le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) +{ + int size, num_recs = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(num_recs == 0); + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + num_recs--; + size = num_recs * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[0], &el->l_recs[1], size); + memset(&el->l_recs[num_recs], 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(num_recs); + } +} + +/* + * Create an empty extent record . 
+ * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Owner %llu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + left->p_tree_depth, right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; + struct buffer_head *bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has empty extent list at " + "depth %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + + } + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. 
+ */ + range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } + + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; + } + + brelse(bh); + bh = NULL; + ret = ocfs2_read_extent_block(ci, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(ci, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. + * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. 
+ * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); + BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); + left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); + } + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_int_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_int_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_int_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + * - When we've adjusted the last extent record in the left path leaf and the + * 1st extent record in the right path leaf during cross extent block merge. 
+ */ +static void ocfs2_complete_edge_insert(handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. + * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + trace_ocfs2_complete_edge_insert(i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path->p_node[i].el; + idx = le16_to_cpu(left_el->l_next_free_rec) - 1; + left_rec = &el->l_recs[idx]; + + el = right_path->p_node[i].el; + right_rec = &el->l_recs[0]; + + ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, + right_el); + + ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + ocfs2_journal_dirty(handle, right_path->p_node[i].bh); + + /* + * Setup our list pointers now so that the current + * parents become children in the next iteration. + */ + left_el = left_path->p_node[i].el; + right_el = right_path->p_node[i].el; + } + + /* + * At the root node, adjust the two adjacent records which + * begin our path to the leaves. + */ + + el = left_path->p_node[subtree_index].el; + left_el = left_path->p_node[subtree_index + 1].el; + right_el = right_path->p_node[subtree_index + 1].el; + + ocfs2_adjust_root_records(el, left_el, right_el, + left_path->p_node[subtree_index + 1].bh->b_blocknr); + + root_bh = left_path->p_node[subtree_index].bh; + + ocfs2_journal_dirty(handle, root_bh); +} + +static int ocfs2_rotate_subtree_right(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i; + struct buffer_head *right_leaf_bh; + struct buffer_head *left_leaf_bh = NULL; + struct buffer_head *root_bh; + struct ocfs2_extent_list *right_el, *left_el; + struct ocfs2_extent_rec move_rec; + + left_leaf_bh = path_leaf_bh(left_path); + left_el = path_leaf_el(left_path); + + if (left_el->l_next_free_rec != left_el->l_count) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Inode %llu has non-full interior leaf node %llu" + "(next free = %u)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)left_leaf_bh->b_blocknr, + le16_to_cpu(left_el->l_next_free_rec)); + return -EROFS; + } + + /* + * This extent block may already have an empty record, so we + * return early if so. 
+ */ + if (ocfs2_is_empty_extent(&left_el->l_recs[0])) + return 0; + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + right_leaf_bh = path_leaf_bh(right_path); + right_el = path_leaf_el(right_path); + + /* This is a code error, not a disk corruption. */ + mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " + "because rightmost leaf block %llu is empty\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)right_leaf_bh->b_blocknr); + + ocfs2_create_empty_extent(right_el); + + ocfs2_journal_dirty(handle, right_leaf_bh); + + /* Do the copy now. */ + i = le16_to_cpu(left_el->l_next_free_rec) - 1; + move_rec = left_el->l_recs[i]; + right_el->l_recs[0] = move_rec; + + /* + * Clear out the record we just copied and shift everything + * over, leaving an empty extent in the left leaf. + * + * We temporarily subtract from next_free_rec so that the + * shift will lose the tail record (which is now defunct). + */ + le16_add_cpu(&left_el->l_next_free_rec, -1); + ocfs2_shift_records_right(left_el); + memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&left_el->l_next_free_rec, 1); + + ocfs2_journal_dirty(handle, left_leaf_bh); + + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the left of the current one. + * + * Will return zero if the path passed in is already the leftmost path. + */ +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + BUG_ON(path->p_tree_depth == 0); + + *cpos = 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + el = path->p_node[i].el; + + /* + * Find the extent record just before the one in our + * path. + */ + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == 0) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the leftmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The leftmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); + *cpos = *cpos + ocfs2_rec_clusters(el, + &el->l_recs[j - 1]); + *cpos = *cpos - 1; + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. 
+ */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +/* + * Extend the transaction by enough credits to complete the rotation, + * and still leave at least the original number of credits allocated + * to this transaction. + */ +static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + int op_credits, + struct ocfs2_path *path) +{ + int ret = 0; + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; + + if (handle->h_buffer_credits < credits) + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); + + return ret; +} + +/* + * Trap the case where we're inserting into the theoretical range past + * the _actual_ left leaf range. Otherwise, we'll rotate a record + * whose cpos is less than ours into the right leaf. + * + * It's only necessary to look at the rightmost record of the left + * leaf because the logic that calls us should ensure that the + * theoretical ranges in the path components above the leaves are + * correct. + */ +static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, + u32 insert_cpos) +{ + struct ocfs2_extent_list *left_el; + struct ocfs2_extent_rec *rec; + int next_free; + + left_el = path_leaf_el(left_path); + next_free = le16_to_cpu(left_el->l_next_free_rec); + rec = &left_el->l_recs[next_free - 1]; + + if (insert_cpos > le32_to_cpu(rec->e_cpos)) + return 1; + return 0; +} + +static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + unsigned int range; + struct ocfs2_extent_rec *rec; + + if (next_free == 0) + return 0; + + rec = &el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + /* Empty list. */ + if (next_free == 1) + return 0; + rec = &el->l_recs[1]; + } + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + return 1; + return 0; +} + +/* + * Rotate all the records in a btree right one record, starting at insert_cpos. + * + * The path to the rightmost leaf should be passed in. + * + * The array is assumed to be large enough to hold an entire path (tree depth). + * + * Upon successful return from this function: + * + * - The 'right_path' array will contain a path to the leaf block + * whose range contains e_cpos. + * - That leaf block will have a single empty extent in list index 0. + * - In the case that the rotation requires a post-insert update, + * *ret_left_path will contain a valid path which can be passed to + * ocfs2_insert_path(). 
+ */ +static int ocfs2_rotate_tree_right(handle_t *handle, + struct ocfs2_extent_tree *et, + enum ocfs2_split_type split, + u32 insert_cpos, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, start, orig_credits = handle->h_buffer_credits; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + *ret_left_path = NULL; + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_rotate_tree_right( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); + + /* + * What we want to do here is: + * + * 1) Start with the rightmost path. + * + * 2) Determine a path to the leaf block directly to the left + * of that leaf. + * + * 3) Determine the 'subtree root' - the lowest level tree node + * which contains a path to both leaves. + * + * 4) Rotate the subtree. + * + * 5) Find the next subtree by considering the left path to be + * the new right path. + * + * The check at the top of this while loop also accepts + * insert_cpos == cpos because cpos is only a _theoretical_ + * value to get us the left path - insert_cpos might very well + * be filling that hole. + * + * Stop at a cpos of '0' because we either started at the + * leftmost branch (i.e., a tree with one branch and a + * rotation inside of it), or we've gone as far as we can in + * rotating subtrees. + */ + while (cpos && insert_cpos <= cpos) { + trace_ocfs2_rotate_tree_right( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog_bug_on_msg(path_leaf_bh(left_path) == + path_leaf_bh(right_path), + "Owner %llu: error during insert of %u " + "(left path cpos %u) results in two identical " + "paths ending at %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos, + (unsigned long long) + path_leaf_bh(left_path)->b_blocknr); + + if (split == SPLIT_NONE && + ocfs2_rotate_requires_path_adjustment(left_path, + insert_cpos)) { + + /* + * We've rotated the tree as much as we + * should. The rest is up to + * ocfs2_insert_path() to complete, after the + * record insertion. We indicate this + * situation by returning the left path. + * + * The reason we don't adjust the records here + * before the record insert is that an error + * later might break the rule where a parent + * record e_cpos will reflect the actual + * e_cpos of the 1st nonempty record of the + * child list. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + start = ocfs2_find_subtree_root(et, left_path, right_path); + + trace_ocfs2_rotate_subtree(start, + (unsigned long long) + right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, start, + orig_credits, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_right(handle, et, left_path, + right_path, start); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (split != SPLIT_NONE && + ocfs2_leftmost_rec_contains(path_leaf_el(right_path), + insert_cpos)) { + /* + * A rotate moves the rightmost left leaf + * record over to the leftmost right leaf + * slot. 
If we're doing an extent split + * instead of a real insert, then we have to + * check that the extent to be split wasn't + * just moved over. If it was, then we can + * exit here, passing left_path back - + * ocfs2_split_extent() is smart enough to + * search both leaves. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + /* + * There is no need to re-read the next right path + * as we know that it'll be our current left + * path. Optimize by copying values instead. + */ + ocfs2_mv_path(right_path, left_path); + + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(left_path); + +out_ret_path: + return ret; +} + +static int ocfs2_update_edge_lengths(handle_t *handle, + struct ocfs2_extent_tree *et, + int subtree_index, struct ocfs2_path *path) +{ + int i, idx, ret; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + u32 range; + + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Path should always be rightmost. */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + BUG_ON(eb->h_next_leaf_blk != 0ULL); + + el = &eb->h_list; + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + for (i = 0; i < path->p_tree_depth; i++) { + el = path->p_node[i].el; + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + + rec->e_int_clusters = cpu_to_le32(range); + le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, path->p_node[i].bh); + } +out: + return ret; +} + +static void ocfs2_unlink_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path *path, int unlink_start) +{ + int ret, i; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh; + + for(i = unlink_start; i < path_num_items(path); i++) { + bh = path->p_node[i].bh; + + eb = (struct ocfs2_extent_block *)bh->b_data; + /* + * Not all nodes might have had their final count + * decremented by the caller - handle this here. 
+ */ + el = &eb->h_list; + if (le16_to_cpu(el->l_next_free_rec) > 1) { + mlog(ML_ERROR, + "Inode %llu, attempted to remove extent block " + "%llu with %u records\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(el->l_next_free_rec)); + + ocfs2_journal_dirty(handle, bh); + ocfs2_remove_from_cache(et->et_ci, bh); + continue; + } + + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_journal_dirty(handle, bh); + + ret = ocfs2_cache_extent_block_free(dealloc, eb); + if (ret) + mlog_errno(ret); + + ocfs2_remove_from_cache(et->et_ci, bh); + } +} + +static void ocfs2_unlink_subtree(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int i; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + + el = path_leaf_el(left_path); + + eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; + + for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + if (root_el->l_recs[i].e_blkno == eb->h_blkno) + break; + + BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); + + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&root_el->l_next_free_rec, -1); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + eb->h_next_leaf_blk = 0; + + ocfs2_journal_dirty(handle, root_bh); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + ocfs2_unlink_path(handle, et, dealloc, right_path, + subtree_index + 1); +} + +static int ocfs2_rotate_subtree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int *deleted) +{ + int ret, i, del_right_subtree = 0, right_has_empty = 0; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); + struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; + struct ocfs2_extent_block *eb; + + *deleted = 0; + + right_leaf_el = path_leaf_el(right_path); + left_leaf_el = path_leaf_el(left_path); + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) + return 0; + + eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; + if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { + /* + * It's legal for us to proceed if the right leaf is + * the rightmost one and it has an empty extent. There + * are two cases to handle - whether the leaf will be + * empty after removal or not. If the leaf isn't empty + * then just remove the empty extent up front. The + * next block will handle empty leaves by flagging + * them for unlink. + * + * Non rightmost leaves will throw -EAGAIN and the + * caller can manually move the subtree and retry. 
+ */ + + if (eb->h_next_leaf_blk != 0ULL) + return -EAGAIN; + + if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { + ret = ocfs2_journal_access_eb(handle, et->et_ci, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(right_leaf_el); + } else + right_has_empty = 1; + } + + if (eb->h_next_leaf_blk == 0ULL && + le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { + /* + * We have to update i_last_eb_blk during the meta + * data delete. + */ + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + del_right_subtree = 1; + } + + /* + * Getting here with an empty extent in the right path implies + * that it's the rightmost path and will be deleted. + */ + BUG_ON(right_has_empty && !del_right_subtree); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!right_has_empty) { + /* + * Only do this if we're moving a real + * record. Otherwise, the action is delayed until + * after removal of the right path in which case we + * can do a simple shift to remove the empty extent. + */ + ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); + memset(&right_leaf_el->l_recs[0], 0, + sizeof(struct ocfs2_extent_rec)); + } + if (eb->h_next_leaf_blk == 0ULL) { + /* + * Move recs over to get rid of empty extent, decrease + * next_free. This is allowed to remove the last + * extent in our leaf (setting l_next_free_rec to + * zero) - the delete code below won't care. + */ + ocfs2_remove_empty_extent(right_leaf_el); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + + if (del_right_subtree) { + ocfs2_unlink_subtree(handle, et, left_path, right_path, + subtree_index, dealloc); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + + /* + * Removal of the extent in the left leaf was skipped + * above so we could delete the right path + * 1st. + */ + if (right_has_empty) + ocfs2_remove_empty_extent(left_leaf_el); + + ocfs2_journal_dirty(handle, et_root_bh); + + *deleted = 1; + } else + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the right of the current one. + * + * Will return zero if the path passed in is already the rightmost path. + * + * This looks similar, but is subtly different to + * ocfs2_find_cpos_for_left_leaf(). + */ +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + *cpos = 0; + + if (path->p_tree_depth == 0) + return 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. 
*/ + i = path->p_tree_depth - 1; + while (i >= 0) { + int next_free; + + el = path->p_node[i].el; + + /* + * Find the extent record just after the one in our + * path. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == (next_free - 1)) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the rightmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The rightmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path) +{ + int ret; + struct buffer_head *bh = path_leaf_bh(path); + struct ocfs2_extent_list *el = path_leaf_el(path); + + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(el); + ocfs2_journal_dirty(handle, bh); + +out: + return ret; +} + +static int __ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + int orig_credits, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path **empty_extent_path) +{ + int ret, subtree_root, deleted; + u32 right_cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_path *right_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + + *empty_extent_path = NULL; + + ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_cp_path(left_path, path); + + right_path = ocfs2_new_path_from_path(path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (right_cpos) { + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(et, left_path, + right_path); + + trace_ocfs2_rotate_subtree(subtree_root, + (unsigned long long) + right_path->p_node[subtree_root].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + orig_credits, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Caller might still want to make changes to the + * tree root, so re-add it to the journal here. + */ + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_left(handle, et, left_path, + right_path, subtree_root, + dealloc, &deleted); + if (ret == -EAGAIN) { + /* + * The rotation has to temporarily stop due to + * the right subtree having an empty + * extent. Pass it back to the caller for a + * fixup. 
+ */ + *empty_extent_path = right_path; + right_path = NULL; + goto out; + } + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The subtree rotate might have removed records on + * the rightmost edge. If so, then rotation is + * complete. + */ + if (deleted) + break; + + ocfs2_mv_path(left_path, right_path); + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(right_path); + ocfs2_free_path(left_path); + + return ret; +} + +static int ocfs2_remove_rightmost_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, subtree_index; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + + ret = ocfs2_et_sanity_check(et); + if (ret) + goto out; + /* + * There's two ways we handle this depending on + * whether path is the only existing one. + */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos) { + /* + * We have a path to the left of this one - it needs + * an update too. + */ + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + + ocfs2_unlink_subtree(handle, et, left_path, path, + subtree_index, dealloc); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + } else { + /* + * 'path' is also the leftmost path which + * means it must be the only one. This gets + * handled differently because we want to + * revert the root back to having extents + * in-line. + */ + ocfs2_unlink_path(handle, et, dealloc, path, 1); + + el = et->et_root_el; + el->l_tree_depth = 0; + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_et_set_last_eb_blk(et, 0); + } + + ocfs2_journal_dirty(handle, path_root_bh(path)); + +out: + ocfs2_free_path(left_path); + return ret; +} + +/* + * Left rotation of btree records. + * + * In many ways, this is (unsurprisingly) the opposite of right + * rotation. We start at some non-rightmost path containing an empty + * extent in the leaf block. The code works its way to the rightmost + * path by rotating records to the left in every subtree. + * + * This is used by any code which reduces the number of extent records + * in a leaf. After removal, an empty record should be placed in the + * leftmost list position. + * + * This won't handle a length update of the rightmost path records if + * the rightmost tree leaf record is removed so the caller is + * responsible for detecting and correcting that. 
+ */ +static int ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, orig_credits = handle->h_buffer_credits; + struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + el = path_leaf_el(path); + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + if (path->p_tree_depth == 0) { +rightmost_no_delete: + /* + * Inline extents. This is trivially handled, so do + * it up front. + */ + ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Handle rightmost branch now. There's several cases: + * 1) simple rotation leaving records in there. That's trivial. + * 2) rotation requiring a branch delete - there's no more + * records left. Two cases of this: + * a) There are branches to the left. + * b) This is also the leftmost (the only) branch. + * + * 1) is handled via ocfs2_rotate_rightmost_leaf_left() + * 2a) we need the left branch so that we can update it with the unlink + * 2b) we need to bring the root back to inline extents. + */ + + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + el = &eb->h_list; + if (eb->h_next_leaf_blk == 0) { + /* + * This gets a bit tricky if we're going to delete the + * rightmost path. Get the other cases out of the way + * 1st. + */ + if (le16_to_cpu(el->l_next_free_rec) > 1) + goto rightmost_no_delete; + + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = -EIO; + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto out; + } + + /* + * XXX: The caller can not trust "path" any more after + * this as it will have been deleted. What do we do? + * + * In theory the rotate-for-merge code will never get + * here because it'll always ask for a rotate in a + * nonempty list. + */ + + ret = ocfs2_remove_rightmost_path(handle, et, path, + dealloc); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Now we can loop, remembering the path we get from -EAGAIN + * and restarting from there. + */ +try_rotate: + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, + dealloc, &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + while (ret == -EAGAIN) { + tmp_path = restart_path; + restart_path = NULL; + + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, + tmp_path, dealloc, + &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + ocfs2_free_path(tmp_path); + tmp_path = NULL; + + if (ret == 0) + goto try_rotate; + } + +out: + ocfs2_free_path(tmp_path); + ocfs2_free_path(restart_path); + return ret; +} + +static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, + int index) +{ + struct ocfs2_extent_rec *rec = &el->l_recs[index]; + unsigned int size; + + if (rec->e_leaf_clusters == 0) { + /* + * We consumed all of the merged-from record. An empty + * extent cannot exist anywhere but the 1st array + * position, so move things over if the merged-from + * record doesn't occupy that position. + * + * This creates a new empty extent so the caller + * should be smart enough to have removed any existing + * ones. 
+ */ + if (index > 0) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + size = index * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[1], &el->l_recs[0], size); + } + + /* + * Always memset - the caller doesn't check whether it + * created an empty extent, so there could be junk in + * the other fields. + */ + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + } +} + +static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path **ret_right_path) +{ + int ret; + u32 right_cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_extent_list *left_el; + + *ret_right_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(left_path->p_tree_depth == 0); + + left_el = path_leaf_el(left_path); + BUG_ON(left_el->l_next_free_rec != left_el->l_count); + + ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + left_path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the rightmost leaf. */ + BUG_ON(right_cpos == 0); + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_right_path = right_path; +out: + if (ret) + ocfs2_free_path(right_path); + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the beginning of the record "next" to it. + * For index < l_count - 1, the next means the extent rec at index + 1. + * For index == l_count - 1, the "next" means the 1st extent rec of the + * next extent block. + */ +static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + int index) +{ + int ret, next_free, i; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *right_el; + struct ocfs2_path *right_path = NULL; + int subtree_index = 0; + struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct buffer_head *bh = path_leaf_bh(left_path); + struct buffer_head *root_bh = NULL; + + BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); + left_rec = &el->l_recs[index]; + + if (index == le16_to_cpu(el->l_next_free_rec) - 1 && + le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { + /* we meet with a cross extent block merge. 
*/ + ret = ocfs2_get_right_path(et, left_path, &right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_el = path_leaf_el(right_path); + next_free = le16_to_cpu(right_el->l_next_free_rec); + BUG_ON(next_free <= 0); + right_rec = &right_el->l_recs[0]; + if (ocfs2_is_empty_extent(right_rec)) { + BUG_ON(next_free <= 1); + right_rec = &right_el->l_recs[1]; + } + + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(right_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + } else { + BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); + right_rec = &el->l_recs[index + 1]; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, + path_num_items(left_path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); + + le32_add_cpu(&right_rec->e_cpos, -split_clusters); + le64_add_cpu(&right_rec->e_blkno, + -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); + + ocfs2_cleanup_merge(el, index); + + ocfs2_journal_dirty(handle, bh); + if (right_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + } +out: + if (right_path) + ocfs2_free_path(right_path); + return ret; +} + +static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret; + u32 left_cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(right_path->p_tree_depth == 0); + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the leftmost leaf. */ + BUG_ON(left_cpos == 0); + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_left_path = left_path; +out: + if (ret) + ocfs2_free_path(left_path); + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the tail of the record "before" it. + * For index > 0, the "before" means the extent rec at index - 1. + * + * For index == 0, the "before" means the last record of the previous + * extent block. And there is also a situation that we may need to + * remove the rightmost leaf extent block in the right_path and change + * the right path to indicate the new rightmost path. 
+ */ +static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int index) +{ + int ret, i, subtree_index = 0, has_empty_extent = 0; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *el = path_leaf_el(right_path); + struct buffer_head *bh = path_leaf_bh(right_path); + struct buffer_head *root_bh = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *left_el; + + BUG_ON(index < 0); + + right_rec = &el->l_recs[index]; + if (index == 0) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_left_path(et, right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_el = path_leaf_el(left_path); + BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != + le16_to_cpu(left_el->l_count)); + + left_rec = &left_el->l_recs[ + le16_to_cpu(left_el->l_next_free_rec) - 1]; + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(split_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + left_rec = &el->l_recs[index - 1]; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + has_empty_extent = 1; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + path_num_items(right_path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (has_empty_extent && index == 1) { + /* + * The easy case - we can just plop the record right in. + */ + *left_rec = *split_rec; + + has_empty_extent = 0; + } else + le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); + + le32_add_cpu(&right_rec->e_cpos, split_clusters); + le64_add_cpu(&right_rec->e_blkno, + ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); + + ocfs2_cleanup_merge(el, index); + + ocfs2_journal_dirty(handle, bh); + if (left_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + /* + * In the situation that the right_rec is empty and the extent + * block is empty also, ocfs2_complete_edge_insert can't handle + * it and we need to delete the right extent block. + */ + if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && + le16_to_cpu(el->l_next_free_rec) == 1) { + + ret = ocfs2_remove_rightmost_path(handle, et, + right_path, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now the rightmost extent block has been deleted. + * So we use the new rightmost path. 
+ */ + ocfs2_mv_path(right_path, left_path); + left_path = NULL; + } else + ocfs2_complete_edge_insert(handle, left_path, + right_path, subtree_index); + } +out: + if (left_path) + ocfs2_free_path(left_path); + return ret; +} + +static int ocfs2_try_to_merge_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_merge_ctxt *ctxt) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + + BUG_ON(ctxt->c_contig_type == CONTIG_NONE); + + if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* + * The merge code will need to create an empty + * extent to take the place of the newly + * emptied slot. Remove any pre-existing empty + * extents - having more than one in a leaf is + * illegal. + */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + split_index--; + rec = &el->l_recs[split_index]; + } + + if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { + /* + * Left-right contig implies this. + */ + BUG_ON(!ctxt->c_split_covers_rec); + + /* + * Since the leftright insert always covers the entire + * extent, this call will delete the insert record + * entirely, resulting in an empty extent record added to + * the extent block. + * + * Since the adding of an empty extent shifts + * everything back to the right, there's no need to + * update split_index here. + * + * When the split_index is zero, we need to merge it to the + * prevoius extent block. It is more efficient and easier + * if we do merge_right first and merge_left later. + */ + ret = ocfs2_merge_rec_right(path, handle, et, split_rec, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We can only get this from logic error above. + */ + BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + + /* The merge left us with an empty extent, remove it. */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec = &el->l_recs[split_index]; + + /* + * Note that we don't pass split_rec here on purpose - + * we've merged it into the rec already. + */ + ret = ocfs2_merge_rec_left(path, handle, et, rec, + dealloc, split_index); + + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + /* + * Error from this last rotate is not critical, so + * print but don't bubble it up. + */ + if (ret) + mlog_errno(ret); + ret = 0; + } else { + /* + * Merge a record to the left or right. + * + * 'contig_type' is relative to the existing record, + * so for example, if we're "right contig", it's to + * the record on the left (hence the left merge). + */ + if (ctxt->c_contig_type == CONTIG_RIGHT) { + ret = ocfs2_merge_rec_left(path, handle, et, + split_rec, dealloc, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + ret = ocfs2_merge_rec_right(path, handle, + et, split_rec, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (ctxt->c_split_covers_rec) { + /* + * The merge may have left an empty extent in + * our leaf. Try to rotate it away. 
+ */ + ret = ocfs2_rotate_tree_left(handle, et, path, + dealloc); + if (ret) + mlog_errno(ret); + ret = 0; + } + } + +out: + return ret; +} + +static void ocfs2_subtract_from_rec(struct super_block *sb, + enum ocfs2_split_type split, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *split_rec) +{ + u64 len_blocks; + + len_blocks = ocfs2_clusters_to_blocks(sb, + le16_to_cpu(split_rec->e_leaf_clusters)); + + if (split == SPLIT_LEFT) { + /* + * Region is on the left edge of the existing + * record. + */ + le32_add_cpu(&rec->e_cpos, + le16_to_cpu(split_rec->e_leaf_clusters)); + le64_add_cpu(&rec->e_blkno, len_blocks); + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } else { + /* + * Region is on the right edge of the existing + * record. + */ + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (insert->ins_split != SPLIT_NONE) { + i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); + BUG_ON(i == -1); + rec = &el->l_recs[i]; + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + insert->ins_split, rec, + insert_rec); + goto rotate; + } + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le16_add_cpu(&rec->e_leaf_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "owner %llu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + ocfs2_metadata_cache_owner(et->et_ci), + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le16_to_cpu(el->l_recs[i].e_leaf_clusters), + le32_to_cpu(insert_rec->e_cpos), + le16_to_cpu(insert_rec->e_leaf_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + +rotate: + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. 
+ */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + /* + * Update everything except the leaf block. + */ + for (i = 0; i < path->p_tree_depth; i++) { + bh = path->p_node[i].bh; + el = path->p_node[i].el; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has a bad extent list", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + ret = -EIO; + return; + } + + rec = &el->l_recs[next_free - 1]; + + rec->e_int_clusters = insert_rec->e_cpos; + le32_add_cpu(&rec->e_int_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, bh); + } +} + +static int ocfs2_append_rec_to_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, next_free; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * This shouldn't happen for non-trees. The extent rec cluster + * count manipulation below only works for interior nodes. + */ + BUG_ON(right_path->p_tree_depth == 0); + + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_append_rec_to_path( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(insert_rec->e_cpos), + left_cpos); + + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. 
+ */ + } + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); + + *ret_left_path = left_path; + ret = 0; +out: + if (ret != 0) + ocfs2_free_path(left_path); + + return ret; +} + +static void ocfs2_split_record(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *split_rec, + enum ocfs2_split_type split) +{ + int index; + u32 cpos = le32_to_cpu(split_rec->e_cpos); + struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; + struct ocfs2_extent_rec *rec, *tmprec; + + right_el = path_leaf_el(right_path); + if (left_path) + left_el = path_leaf_el(left_path); + + el = right_el; + insert_el = right_el; + index = ocfs2_search_extent_list(el, cpos); + if (index != -1) { + if (index == 0 && left_path) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + + /* + * This typically means that the record + * started in the left path but moved to the + * right as a result of rotation. We either + * move the existing record to the left, or we + * do the later insert there. + * + * In this case, the left path should always + * exist as the rotate code will have passed + * it back for a post-insert update. + */ + + if (split == SPLIT_LEFT) { + /* + * It's a left split. Since we know + * that the rotate code gave us an + * empty extent in the left path, we + * can just do the insert there. + */ + insert_el = left_el; + } else { + /* + * Right split - we have to move the + * existing record over to the left + * leaf. The insert will be into the + * newly created empty extent in the + * right leaf. + */ + tmprec = &right_el->l_recs[index]; + ocfs2_rotate_leaf(left_el, tmprec); + el = left_el; + + memset(tmprec, 0, sizeof(*tmprec)); + index = ocfs2_search_extent_list(left_el, cpos); + BUG_ON(index == -1); + } + } + } else { + BUG_ON(!left_path); + BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); + /* + * Left path is easy - we can just allow the insert to + * happen. + */ + el = left_el; + insert_el = left_el; + index = ocfs2_search_extent_list(el, cpos); + BUG_ON(index == -1); + } + + rec = &el->l_recs[index]; + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + split, rec, split_rec); + ocfs2_rotate_leaf(insert_el, split_rec); +} + +/* + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. + * + * right_path is the path we want to do the actual insert + * in. left_path should only be passed in if we need to update that + * portion of the tree after an edge insert. + */ +static int ocfs2_insert_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret, subtree_index; + struct buffer_head *leaf_bh = path_leaf_bh(right_path); + + if (left_path) { + /* + * There's a chance that left_path got passed back to + * us without being accounted for in the + * journal. Extend our transaction here to be sure we + * can change those blocks. + */ + ret = ocfs2_extend_trans(handle, left_path->p_tree_depth); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* + * Pass both paths to the journal. 
The majority of inserts
+	 * will be touching all components anyway.
+	 */
+	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (insert->ins_split != SPLIT_NONE) {
+		/*
+		 * We could call ocfs2_insert_at_leaf() for some types
+		 * of splits, but it's easier to just let one separate
+		 * function sort it all out.
+		 */
+		ocfs2_split_record(et, left_path, right_path,
+				   insert_rec, insert->ins_split);
+
+		/*
+		 * Split might have modified either leaf and we don't
+		 * have a guarantee that the later edge insert will
+		 * dirty this for us.
+		 */
+		if (left_path)
+			ocfs2_journal_dirty(handle,
+					    path_leaf_bh(left_path));
+	} else
+		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+				     insert);
+
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	if (left_path) {
+		/*
+		 * The rotate code has indicated that we need to fix
+		 * up portions of the tree after the insert.
+		 *
+		 * XXX: Should we extend the transaction here?
+		 */
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
+					   subtree_index);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_do_insert_extent(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_extent_rec *insert_rec,
+				  struct ocfs2_insert_type *type)
+{
+	int ret, rotate = 0;
+	u32 cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *el;
+
+	el = et->et_root_el;
+
+	ret = ocfs2_et_root_journal_access(handle, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (le16_to_cpu(el->l_tree_depth) == 0) {
+		ocfs2_insert_at_leaf(et, insert_rec, el, type);
+		goto out_update_clusters;
+	}
+
+	right_path = ocfs2_new_path_from_et(et);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Determine the path to start with. Rotations need the
+	 * rightmost path, everything else can go directly to the
+	 * target leaf.
+	 */
+	cpos = le32_to_cpu(insert_rec->e_cpos);
+	if (type->ins_appending == APPEND_NONE &&
+	    type->ins_contig == CONTIG_NONE) {
+		rotate = 1;
+		cpos = UINT_MAX;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Rotations and appends need special treatment - they modify
+	 * parts of the tree above them.
+	 *
+	 * Both might pass back a path immediately to the left of the
+	 * one being inserted to. This will cause
+	 * ocfs2_insert_path() to modify the rightmost records of
+	 * left_path to account for an edge insert.
+	 *
+	 * XXX: When modifying this code, keep in mind that an insert
+	 * can wind up skipping both of these two special cases...
+	 */
+	if (rotate) {
+		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
+					      le32_to_cpu(insert_rec->e_cpos),
+					      right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * ocfs2_rotate_tree_right() might have extended the
+		 * transaction without re-journaling our tree root.
+ */ + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (type->ins_appending == APPEND_TAIL + && type->ins_contig != CONTIG_LEFT) { + ret = ocfs2_append_rec_to_path(handle, et, insert_rec, + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_insert_path(handle, et, left_path, right_path, + insert_rec, type); + if (ret) { + mlog_errno(ret); + goto out; + } + +out_update_clusters: + if (type->ins_split == SPLIT_NONE) + ocfs2_et_update_clusters(et, + le16_to_cpu(insert_rec->e_leaf_clusters)); + + ocfs2_journal_dirty(handle, et->et_root_bh); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + + return ret; +} + +static enum ocfs2_contig_type +ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, int index, + struct ocfs2_extent_rec *split_rec) +{ + int status; + enum ocfs2_contig_type ret = CONTIG_NONE; + u32 left_cpos, right_cpos; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_list *new_el; + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct buffer_head *bh; + struct ocfs2_extent_block *eb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + if (index > 0) { + rec = &el->l_recs[index - 1]; + } else if (path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (status) + goto out; + + if (left_cpos != 0) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) + goto out; + + status = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (status) + goto out; + + new_el = path_leaf_el(left_path); + + if (le16_to_cpu(new_el->l_next_free_rec) != + le16_to_cpu(new_el->l_count)) { + bh = path_leaf_bh(left_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + ocfs2_error(sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of " + "%d. It should have " + "matched the l_count of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + status = -EINVAL; + goto out; + } + rec = &new_el->l_recs[ + le16_to_cpu(new_el->l_next_free_rec) - 1]; + } + } + + /* + * We're careful to check for an empty extent record here - + * the merge code will know what to do if it sees one. 
+	 */
+	if (rec) {
+		if (index == 1 && ocfs2_is_empty_extent(rec)) {
+			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+				ret = CONTIG_RIGHT;
+		} else {
+			ret = ocfs2_et_extent_contig(et, rec, split_rec);
+		}
+	}
+
+	rec = NULL;
+	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+		rec = &el->l_recs[index + 1];
+	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+		 path->p_tree_depth > 0) {
+		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+		if (status)
+			goto out;
+
+		if (right_cpos == 0)
+			goto out;
+
+		right_path = ocfs2_new_path_from_path(path);
+		if (!right_path)
+			goto out;
+
+		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+		if (status)
+			goto out;
+
+		new_el = path_leaf_el(right_path);
+		rec = &new_el->l_recs[0];
+		if (ocfs2_is_empty_extent(rec)) {
+			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+				bh = path_leaf_bh(right_path);
+				eb = (struct ocfs2_extent_block *)bh->b_data;
+				ocfs2_error(sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec));
+				status = -EINVAL;
+				goto out;
+			}
+			rec = &new_el->l_recs[1];
+		}
+	}
+
+	if (rec) {
+		enum ocfs2_contig_type contig_type;
+
+		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
+
+		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+			ret = CONTIG_LEFTRIGHT;
+		else if (ret == CONTIG_NONE)
+			ret = contig_type;
+	}
+
+out:
+	if (left_path)
+		ocfs2_free_path(left_path);
+	if (right_path)
+		ocfs2_free_path(right_path);
+
+	return ret;
+}
+
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
+				     struct ocfs2_insert_type *insert,
+				     struct ocfs2_extent_list *el,
+				     struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+						     insert_rec);
+		if (contig_type != CONTIG_NONE) {
+			insert->ins_contig_index = i;
+			break;
+		}
+	}
+	insert->ins_contig = contig_type;
+
+	if (insert->ins_contig != CONTIG_NONE) {
+		struct ocfs2_extent_rec *rec =
+				&el->l_recs[insert->ins_contig_index];
+		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+				   le16_to_cpu(insert_rec->e_leaf_clusters);
+
+		/*
+		 * Caller might want us to limit the size of extents, don't
+		 * calculate contiguousness if we might exceed that limit.
+		 */
+		if (et->et_max_leaf_clusters &&
+		    (len > et->et_max_leaf_clusters))
+			insert->ins_contig = CONTIG_NONE;
+	}
+}
+
+/*
+ * This should only be called against the rightmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the root extent list for trees with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+					struct ocfs2_extent_list *el,
+					struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *rec;
+
+	insert->ins_appending = APPEND_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (!el->l_next_free_rec)
+		goto set_tail_append;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		/* Were all records empty?
*/ + if (le16_to_cpu(el->l_next_free_rec) == 1) + goto set_tail_append; + } + + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + + if (cpos >= + (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) + goto set_tail_append; + + return; + +set_tail_append: + insert->ins_appending = APPEND_TAIL; +} + +/* + * Helper function called at the beginning of an insert. + * + * This computes a few things that are commonly used in the process of + * inserting into the btree: + * - Whether the new extent is contiguous with an existing one. + * - The current tree depth. + * - Whether the insert is an appending one. + * - The total # of free records in the tree. + * + * All of the information is stored on the ocfs2_insert_type + * structure. + */ +static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, + struct buffer_head **last_eb_bh, + struct ocfs2_extent_rec *insert_rec, + int *free_records, + struct ocfs2_insert_type *insert) +{ + int ret; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + struct buffer_head *bh = NULL; + + insert->ins_split = SPLIT_NONE; + + el = et->et_root_el; + insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); + + if (el->l_tree_depth) { + /* + * If we have tree depth, we read in the + * rightmost extent block ahead of time as + * ocfs2_figure_insert_type() and ocfs2_add_branch() + * may want it later. + */ + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + } + + /* + * Unless we have a contiguous insert, we'll need to know if + * there is room left in our allocation tree for another + * extent record. + * + * XXX: This test is simplistic, we can search for empty + * extent records too. + */ + *free_records = le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec); + + if (!insert->ins_tree_depth) { + ocfs2_figure_contig_type(et, insert, el, insert_rec); + ocfs2_figure_appending_type(insert, el, insert_rec); + return 0; + } + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * In the case that we're inserting past what the tree + * currently accounts for, ocfs2_find_path() will return for + * us the rightmost tree path. This is accounted for below in + * the appending code. + */ + ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + /* + * Now that we have the path, there's two things we want to determine: + * 1) Contiguousness (also set contig_index if this is so) + * + * 2) Are we doing an append? We can trivially break this up + * into two types of appends: simple record append, or a + * rotate inside the tail leaf. + */ + ocfs2_figure_contig_type(et, insert, el, insert_rec); + + /* + * The insert code isn't quite ready to deal with all cases of + * left contiguousness. Specifically, if it's an insert into + * the 1st record in a leaf, it will require the adjustment of + * cluster count on the last record of the path directly to it's + * left. For now, just catch that case and fool the layers + * above us. This works just fine for tree_depth == 0, which + * is why we allow that above. 
+	 */
+	if (insert->ins_contig == CONTIG_LEFT &&
+	    insert->ins_contig_index == 0)
+		insert->ins_contig = CONTIG_NONE;
+
+	/*
+	 * Ok, so we can simply compare against last_eb to figure out
+	 * whether the path doesn't exist. This will only happen in
+	 * the case that we're doing a tail append, so maybe we can
+	 * take advantage of that information somehow.
+	 */
+	if (ocfs2_et_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
+		/*
+		 * Ok, ocfs2_find_path() returned us the rightmost
+		 * tree path. This might be an appending insert. There are
+		 * two cases:
+		 *    1) We're doing a true append at the tail:
+		 *	-This might even be off the end of the leaf
+		 *    2) We're "appending" by rotating in the tail
+		 */
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+	}
+
+out:
+	ocfs2_free_path(path);
+
+	if (ret == 0)
+		*last_eb_bh = bh;
+	else
+		brelse(bh);
+	return ret;
+}
+
+/*
+ * Insert an extent into a btree.
+ *
+ * The caller needs to update the owning btree's cluster count.
+ */
+int ocfs2_insert_extent(handle_t *handle,
+			struct ocfs2_extent_tree *et,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			u8 flags,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status;
+	int uninitialized_var(free_records);
+	struct buffer_head *last_eb_bh = NULL;
+	struct ocfs2_insert_type insert = {0, };
+	struct ocfs2_extent_rec rec;
+
+	trace_ocfs2_insert_extent_start(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, new_clusters);
+
+	memset(&rec, 0, sizeof(rec));
+	rec.e_cpos = cpu_to_le32(cpos);
+	rec.e_blkno = cpu_to_le64(start_blk);
+	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+	rec.e_flags = flags;
+	status = ocfs2_et_insert_check(et, &rec);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
+					  &free_records, &insert);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+				  insert.ins_contig_index, free_records,
+				  insert.ins_tree_depth);
+
+	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
+		status = ocfs2_grow_tree(handle, et,
+					 &insert.ins_tree_depth, &last_eb_bh,
+					 meta_ac);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Finally, we can add clusters. This might rotate the tree for us. */
+	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
+	if (status < 0)
+		mlog_errno(status);
+	else
+		ocfs2_et_extent_map_insert(et, &rec);
+
+bail:
+	brelse(last_eb_bh);
+
+	return status;
+}
+
+/*
+ * Allocate and add clusters into the extent b-tree.
+ * The new clusters (clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */ +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0, err = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + err = -1; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + err = -2; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the tree root */ + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_clusters_in_btree( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + bit_off, num_bits); + status = ocfs2_insert_extent(handle, et, *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_journal_dirty(handle, et->et_root_bh); + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + err = clusters_to_add; + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: + if (reason_ret) + *reason_ret = reason; + trace_ocfs2_add_clusters_in_btree_ret(status, reason, err); + return status; +} + +static void ocfs2_make_right_split_rec(struct super_block *sb, + struct ocfs2_extent_rec *split_rec, + u32 cpos, + struct ocfs2_extent_rec *rec) +{ + u32 rec_cpos = le32_to_cpu(rec->e_cpos); + u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); + + memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); + + split_rec->e_cpos = cpu_to_le32(cpos); + split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); + + split_rec->e_blkno = rec->e_blkno; + le64_add_cpu(&split_rec->e_blkno, + ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); + + split_rec->e_flags = rec->e_flags; +} + +static int ocfs2_split_and_insert(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct buffer_head **last_eb_bh, + int split_index, + struct ocfs2_extent_rec *orig_split_rec, + struct ocfs2_alloc_context *meta_ac) +{ + int ret = 0, depth; + unsigned int insert_range, rec_range, do_leftright = 0; + struct ocfs2_extent_rec tmprec; + struct ocfs2_extent_list *rightmost_el; + struct ocfs2_extent_rec rec; + struct ocfs2_extent_rec split_rec = *orig_split_rec; + struct ocfs2_insert_type insert; + struct ocfs2_extent_block *eb; + +leftright: + /* + * 
Store a copy of the record on the stack - it might move + * around as the tree is manipulated below. + */ + rec = path_leaf_el(path)->l_recs[split_index]; + + rightmost_el = et->et_root_el; + + depth = le16_to_cpu(rightmost_el->l_tree_depth); + if (depth) { + BUG_ON(!(*last_eb_bh)); + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + rightmost_el = &eb->h_list; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + ret = ocfs2_grow_tree(handle, et, + &depth, last_eb_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_tree_depth = depth; + + insert_range = le32_to_cpu(split_rec.e_cpos) + + le16_to_cpu(split_rec.e_leaf_clusters); + rec_range = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + + if (split_rec.e_cpos == rec.e_cpos) { + insert.ins_split = SPLIT_LEFT; + } else if (insert_range == rec_range) { + insert.ins_split = SPLIT_RIGHT; + } else { + /* + * Left/right split. We fake this as a right split + * first and then make a second pass as a left split. + */ + insert.ins_split = SPLIT_RIGHT; + + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &tmprec, insert_range, &rec); + + split_rec = tmprec; + + BUG_ON(do_leftright); + do_leftright = 1; + } + + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (do_leftright == 1) { + u32 cpos; + struct ocfs2_extent_list *el; + + do_leftright++; + split_rec = *orig_split_rec; + + ocfs2_reinit_path(path, 1); + + cpos = le32_to_cpu(split_rec.e_cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + split_index = ocfs2_search_extent_list(el, cpos); + goto leftright; + } +out: + + return ret; +} + +static int ocfs2_replace_extent_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + +/* + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. + * + * Care is taken to handle contiguousness so as to not grow the tree. + * + * meta_ac is not strictly necessary - we only truly need it if growth + * of the tree is required. All other cases will degrade into a less + * optimal tree layout. + * + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. + * + * This code is optimized for readability - several passes might be + * made over certain portions of the tree. All of those blocks will + * have been brought into cache (and pinned via the journal), so the + * extra overhead is not expressed in terms of disk reads. 
+ */ +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + struct ocfs2_merge_ctxt ctxt; + struct ocfs2_extent_list *rightmost_el; + + if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || + ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < + (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec); + + /* + * The core merge / split code wants to know how much room is + * left in this allocation tree, so we pass the + * rightmost extent list. + */ + if (path->p_tree_depth) { + struct ocfs2_extent_block *eb; + + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_root_el(path); + + if (rec->e_cpos == split_rec->e_cpos && + rec->e_leaf_clusters == split_rec->e_leaf_clusters) + ctxt.c_split_covers_rec = 1; + else + ctxt.c_split_covers_rec = 0; + + ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); + + trace_ocfs2_split_extent(split_index, ctxt.c_contig_type, + ctxt.c_has_empty_extent, + ctxt.c_split_covers_rec); + + if (ctxt.c_contig_type == CONTIG_NONE) { + if (ctxt.c_split_covers_rec) + ret = ocfs2_replace_extent_rec(handle, et, path, el, + split_index, split_rec); + else + ret = ocfs2_split_and_insert(handle, et, path, + &last_eb_bh, split_index, + split_rec, meta_ac); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_try_to_merge_extent(handle, et, path, + split_index, split_rec, + dealloc, &ctxt); + if (ret) + mlog_errno(ret); + } + +out: + brelse(last_eb_bh); + return ret; +} + +/* + * Change the flags of the already-existing extent at cpos for len clusters. + * + * new_flags: the flags we want to set. + * clear_flags: the flags we want to clear. + * phys: the new physical offset we want this new extent starts from. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. 
+ */ +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) +{ + int ret, index; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); + struct ocfs2_extent_rec split_rec; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + left_path = ocfs2_new_path_from_et(et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + el = path_leaf_el(left_path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); + ret = -EROFS; + goto out; + } + + ret = -EIO; + rec = &el->l_recs[index]; + if (new_flags && (rec->e_flags & new_flags)) { + mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " + "extent that already had them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + new_flags); + goto out; + } + + if (clear_flags && !(rec->e_flags & clear_flags)) { + mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " + "extent that didn't have them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + clear_flags); + goto out; + } + + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); + split_rec.e_cpos = cpu_to_le32(cpos); + split_rec.e_leaf_clusters = cpu_to_le16(len); + split_rec.e_blkno = cpu_to_le64(start_blkno); + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + if (clear_flags) + split_rec.e_flags &= ~clear_flags; + + ret = ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + return ret; + +} + +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_written( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. 
+ */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int index, u32 new_range, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, depth, credits; + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *rightmost_el, *el; + struct ocfs2_extent_rec split_rec; + struct ocfs2_extent_rec *rec; + struct ocfs2_insert_type insert; + + /* + * Setup the record to split before we grow the tree. + */ + el = path_leaf_el(path); + rec = &el->l_recs[index]; + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &split_rec, new_range, rec); + + depth = path->p_tree_depth; + if (depth > 0) { + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_leaf_el(path); + + credits = path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_split = SPLIT_RIGHT; + insert.ins_tree_depth = depth; + + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); + if (ret) + mlog_errno(ret); + +out: + brelse(last_eb_bh); + return ret; +} + +static int ocfs2_truncate_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u32 cpos, u32 len) +{ + int ret; + u32 left_cpos, rec_range, trunc_range; + int wants_rotate = 0, is_rightmost_tree_rec = 0; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_block *eb; + + if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + index--; + } + + if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && + path->p_tree_depth) { + /* + * Check whether this is the rightmost tree record. If + * we remove all of this record or part of its right + * edge then an update of the record lengths above it + * will be required. + */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + if (eb->h_next_leaf_blk == 0) + is_rightmost_tree_rec = 1; + } + + rec = &el->l_recs[index]; + if (index == 0 && path->p_tree_depth && + le32_to_cpu(rec->e_cpos) == cpos) { + /* + * Changing the leftmost offset (via partial or whole + * record truncate) of an interior (or rightmost) path + * means we have to update the subtree that is formed + * by this leaf and the one to it's left. + * + * There are two cases we can skip: + * 1) Path is the leftmost one in our btree. 
+ * 2) The leaf is rightmost and will be empty after + * we remove the extent record - the rotate code + * knows how to update the newly formed edge. + */ + + ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } + + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + trunc_range = cpos + len; + + if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { + int next_free; + + memset(rec, 0, sizeof(*rec)); + ocfs2_cleanup_merge(el, index); + wants_rotate = 1; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (is_rightmost_tree_rec && next_free > 1) { + /* + * We skip the edge update if this path will + * be deleted by the rotate code. + */ + rec = &el->l_recs[next_free - 1]; + ocfs2_adjust_rightmost_records(handle, et, path, + rec); + } + } else if (le32_to_cpu(rec->e_cpos) == cpos) { + /* Remove leftmost portion of the record. */ + le32_add_cpu(&rec->e_cpos, len); + le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); + le16_add_cpu(&rec->e_leaf_clusters, -len); + } else if (rec_range == trunc_range) { + /* Remove rightmost portion of the record */ + le16_add_cpu(&rec->e_leaf_clusters, -len); + if (is_rightmost_tree_rec) + ocfs2_adjust_rightmost_records(handle, et, path, rec); + } else { + /* Caller should have trapped this. */ + mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(rec->e_cpos), + le16_to_cpu(rec->e_leaf_clusters), cpos, len); + BUG(); + } + + if (left_path) { + int subtree_index; + + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + ocfs2_complete_edge_insert(handle, left_path, path, + subtree_index); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + ocfs2_free_path(left_path); + return ret; +} + +int ocfs2_remove_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + u32 rec_range, trunc_range; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + + /* + * XXX: Why are we truncating to 0 instead of wherever this + * affects us? 
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+			    "Owner %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			    cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * We have 3 cases of extent removal:
+	 * 1) Range covers the entire extent rec
+	 * 2) Range begins or ends on one edge of the extent rec
+	 * 3) Range is in the middle of the extent rec (no shared edges)
+	 *
+	 * For case 1 we remove the extent rec and left rotate to
+	 * fill the hole.
+	 *
+	 * For case 2 we just shrink the existing extent rec, with a
+	 * tree update if the shrinking edge is also the edge of an
+	 * extent block.
+	 *
+	 * For case 3 we do a right split to turn the extent rec into
+	 * something case 2 can handle.
+	 */
+	rec = &el->l_recs[index];
+	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	trunc_range = cpos + len;
+
+	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+	trace_ocfs2_remove_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, len, index, le32_to_cpu(rec->e_cpos),
+		ocfs2_rec_clusters(el, rec));
+
+	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		ret = ocfs2_split_tree(handle, et, path, index,
+				       trunc_range, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * The split could have manipulated the tree enough to
+		 * move the record location, so we have to look for it again.
+		 */
+		ocfs2_reinit_path(path, 1);
+
+		ret = ocfs2_find_path(et->et_ci, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		index = ocfs2_search_extent_list(el, cpos);
+		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: split at cpos %u lost record.",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos);
+			ret = -EROFS;
+			goto out;
+		}
+
+		/*
+		 * Double check our values here. If anything is fishy,
+		 * it's easier to catch it at the top level.
+		 */
+		rec = &el->l_recs[index];
+		rec_range = le32_to_cpu(rec->e_cpos) +
+			ocfs2_rec_clusters(el, rec);
+		if (rec_range != trunc_range) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: error after split at cpos %u "
+				    "trunc len %u, existing record is (%u,%u)",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos, len, le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_allocators(), except that it accepts a block
+ * count to reserve some extra blocks, and it only handles metadata
+ * allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */ +static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 extents_to_split, + struct ocfs2_alloc_context **ac, + int extra_blocks) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *ac = NULL; + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + if (extra_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + +out: + if (ret) { + if (*ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + } + + return ret; +} + +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc) +{ + int ret, credits = 0, extra_blocks = 0; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; + + if ((flags & OCFS2_EXT_REFCOUNTED) && len) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac, + extra_blocks); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb) + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(inode->i_sb, len)); + + ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_et_update_clusters(et, -len); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + if (phys_blkno) { + if (flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + phys_blkno), + len, meta_ac, + dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + phys_blkno, len); + if (ret) + mlog_errno(ret); + + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = 
&di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%llu invalid " + "wanted %u, actual %u\n", + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index, + start_cluster, num_clusters); + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. 
+ * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + ocfs2_journal_dirty(handle, tl_bh); + + osb->truncated_clusters += num_clusters; +bail: + return status; +} + +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + handle_t *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tl->tl_used = cpu_to_le16(i); + + ocfs2_journal_dirty(handle, tl_bh); + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. */ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid. */ + if (start_blk) { + trace_ocfs2_replay_truncate_records( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + + osb->truncated_clusters = 0; + +bail: + return status; +} + +/* Expects you to already be holding tl_inode->i_mutex */ +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + handle_t *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_truncate_log_init(). 
It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + num_to_flush = le16_to_cpu(tl->tl_used); + trace_ocfs2_flush_truncate_log( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + num_to_flush); + if (!num_to_flush) { + status = 0; + goto out; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto out; + } + + mutex_lock(&data_alloc_inode->i_mutex); + + status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_unlock; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + brelse(data_alloc_bh); + ocfs2_inode_unlock(data_alloc_inode, 1); + +out_mutex: + mutex_unlock(&data_alloc_inode->i_mutex); + iput(data_alloc_inode); + +out: + return status; +} + +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mutex_lock(&tl_inode->i_mutex); + status = __ocfs2_flush_truncate_log(osb); + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +static void ocfs2_truncate_log_worker(struct work_struct *work) +{ + int status; + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + osb_truncate_log_wq.work); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + else + ocfs2_init_steal_slots(osb); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode) { + /* We want to push off log flushes while truncates are + * still running. */ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, a *tl_copy is set to + * NULL. 
*/ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + *tl_copy = NULL; + + trace_ocfs2_begin_truncate_log_recovery(slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's + * validated by the underlying call to ocfs2_read_inode_block(), + * so any corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + if (le16_to_cpu(tl->tl_used)) { + trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); + + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. */ + tl->tl_used = 0; + + ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); + status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + mlog_errno(status); + } + + return status; +} + +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + handle_t *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + return -EINVAL; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + trace_ocfs2_complete_truncate_log_recovery( + (unsigned long long)le64_to_cpu(tl_copy->i_blkno), + num_recs); + + mutex_lock(&tl_inode->i_mutex); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + ocfs2_commit_trans(osb, handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + if (tl_inode) { + cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct 
buffer_head *tl_bh = NULL; + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. */ + INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, + ocfs2_truncate_log_worker); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + return status; +} + +/* + * Delayed de-allocation of suballocator blocks. + * + * Some sets of block de-allocations might involve multiple suballocator inodes. + * + * The locking for this can get extremely complicated, especially when + * the suballocator inodes to delete from aren't known until deep + * within an unrelated codepath. + * + * ocfs2_extent_block structures are a good example of this - an inode + * btree could have been grown by any number of nodes each allocating + * out of their own suballoc inode. + * + * These structures allow the delay of block de-allocation until a + * later time, when locking of multiple cluster inodes won't cause + * deadlock. + */ + +/* + * Describe a single bit freed from a suballocator. For the block + * suballocators, it represents one block. For the global cluster + * allocator, it represents some clusters and free_bit indicates + * clusters number. + */ +struct ocfs2_cached_block_free { + struct ocfs2_cached_block_free *free_next; + u64 free_bg; + u64 free_blk; + unsigned int free_bit; +}; + +struct ocfs2_per_slot_free_list { + struct ocfs2_per_slot_free_list *f_next_suballocator; + int f_inode_type; + int f_slot; + struct ocfs2_cached_block_free *f_first; +}; + +static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) +{ + int ret; + u64 bg_blkno; + handle_t *handle; + struct inode *inode; + struct buffer_head *di_bh = NULL; + struct ocfs2_cached_block_free *tmp; + + inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&inode->i_mutex); + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + while (head) { + if (head->free_bg) + bg_blkno = head->free_bg; + else + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); + trace_ocfs2_free_cached_blocks( + (unsigned long long)head->free_blk, head->free_bit); + + ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, + head->free_bit, bg_blkno, 1); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + tmp = head; + head = head->free_next; + kfree(tmp); + } + +out_journal: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out_mutex: + mutex_unlock(&inode->i_mutex); + iput(inode); +out: + while(head) { + /* Premature exit may have left some dangling items. 
*/ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit) +{ + int ret = 0; + struct ocfs2_cached_block_free *item; + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = ctxt->c_global_allocator; + + ctxt->c_global_allocator = item; + return ret; +} + +static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, + struct ocfs2_cached_block_free *head) +{ + struct ocfs2_cached_block_free *tmp; + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + int ret = 0; + + mutex_lock(&tl_inode->i_mutex); + + while (head) { + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, + head->free_bit); + + ocfs2_commit_trans(osb, handle); + tmp = head; + head = head->free_next; + kfree(tmp); + + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + mutex_unlock(&tl_inode->i_mutex); + + while (head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + int ret = 0, ret2; + struct ocfs2_per_slot_free_list *fl; + + if (!ctxt) + return 0; + + while (ctxt->c_first_suballocator) { + fl = ctxt->c_first_suballocator; + + if (fl->f_first) { + trace_ocfs2_run_deallocs(fl->f_inode_type, + fl->f_slot); + ret2 = ocfs2_free_cached_blocks(osb, + fl->f_inode_type, + fl->f_slot, + fl->f_first); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + } + + ctxt->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + + if (ctxt->c_global_allocator) { + ret2 = ocfs2_free_cached_clusters(osb, + ctxt->c_global_allocator); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + + ctxt->c_global_allocator = NULL; + } + + return ret; +} + +static struct ocfs2_per_slot_free_list * +ocfs2_find_per_slot_free_list(int type, + int slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == slot) + return fl; + + fl = fl->f_next_suballocator; + } + + fl = kmalloc(sizeof(*fl), GFP_NOFS); + if (fl) { + fl->f_inode_type = type; + fl->f_slot = slot; + fl->f_first = NULL; + fl->f_next_suballocator = ctxt->c_first_suballocator; + + ctxt->c_first_suballocator = fl; + } + return fl; +} + +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, + u64 blkno, unsigned int bit) +{ + int ret; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *item; + + fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); + if (fl == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + trace_ocfs2_cache_block_dealloc(type, slot, + (unsigned long long)suballoc, + (unsigned long long)blkno, bit); + + item->free_bg = 
suballoc; + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = fl->f_first; + + fl->f_first = item; + + ret = 0; +out: + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb) +{ + return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_suballoc_loc), + le64_to_cpu(eb->h_blkno), + le16_to_cpu(eb->h_suballoc_bit)); +} + +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return 0; +} + +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) +{ + int ret, partial = 0; + + ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + if (ret) + mlog_errno(ret); + + if (zero) + zero_user_segment(page, from, to); + + /* + * Need to set the buffers we zero'd into uptodate + * here if they aren't - ocfs2_map_page_blocks() + * might've skipped some + */ + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + } + + if (!partial) + SetPageUptodate(page); + + flush_dcache_page(page); +} + +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, + loff_t end, struct page **pages, + int numpages, u64 phys, handle_t *handle) +{ + int i; + struct page *page; + unsigned int from, to = PAGE_CACHE_SIZE; + struct super_block *sb = inode->i_sb; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + if (numpages == 0) + goto out; + + to = PAGE_CACHE_SIZE; + for(i = 0; i < numpages; i++) { + page = pages[i]; + + from = start & (PAGE_CACHE_SIZE - 1); + if ((end >> PAGE_CACHE_SHIFT) == page->index) + to = end & (PAGE_CACHE_SIZE - 1); + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + + ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, + &phys); + + start = (page->index + 1) << PAGE_CACHE_SHIFT; + } +out: + if (pages) + ocfs2_unlock_and_free_pages(pages, numpages); +} + +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + int numpages, ret = 0; + struct address_space *mapping = inode->i_mapping; + unsigned long index; + loff_t last_page_bytes; + + BUG_ON(start > end); + + numpages = 0; + last_page_bytes = PAGE_ALIGN(end); + index = start >> PAGE_CACHE_SHIFT; + do { + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); + if (!pages[numpages]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + numpages++; + index++; + } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + +out: + if (ret != 0) { + if (pages) + ocfs2_unlock_and_free_pages(pages, numpages); + numpages = 0; + } + + *num = numpages; + + return ret; +} + +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + +/* + * Zero the area past i_size but still within an allocated + * cluster. This avoids exposing nonzero data on subsequent file + * extends. 
+ * + * We need to call this before i_size is updated on the inode because + * otherwise block_write_full_page() will skip writeout of pages past + * i_size. The new_i_size parameter is passed for this reason. + */ +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end) +{ + int ret = 0, numpages; + struct page **pages = NULL; + u64 phys; + unsigned int ext_flags; + struct super_block *sb = inode->i_sb; + + /* + * File systems which don't support sparse files zero on every + * extend. + */ + if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) + return 0; + + pages = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (range_start == range_end) + goto out; + + ret = ocfs2_extent_map_get_blocks(inode, + range_start >> sb->s_blocksize_bits, + &phys, NULL, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Tail is a hole, or is marked unwritten. In either case, we + * can count on read and write to return/push zero's. + */ + if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) + goto out; + + ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, + &numpages); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, + numpages, phys, handle); + + /* + * Initiate writeout of the pages we zero'd here. We don't + * wait on them - the truncate_inode_pages() call later will + * do that for us. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, range_start, + range_end - 1); + if (ret) + mlog_errno(ret); + +out: + if (pages) + kfree(pages); + + return ret; +} + +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) +{ + unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); +} + +void ocfs2_dinode_new_extent_list(struct inode *inode, + struct ocfs2_dinode *di) +{ + ocfs2_zero_dinode_id2_with_xattr(inode, di); + di->id2.i_list.l_tree_depth = 0; + di->id2.i_list.l_next_free_rec = 0; + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); +} + +void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_inline_data *idata = &di->id2.i_data; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + /* + * We clear the entire i_data structure here so that all + * fields can be properly initialized. 
+ */ + ocfs2_zero_dinode_id2_with_xattr(inode, di); + + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); +} + +int ocfs2_convert_inline_data_to_extents(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, i, has_data, num_pages = 0; + handle_t *handle; + u64 uninitialized_var(block); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_alloc_context *data_ac = NULL; + struct page **pages = NULL; + loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; + int did_quota = 0; + + has_data = i_size_read(inode) ? 1 : 0; + + if (has_data) { + pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_inline_to_extents_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (has_data) { + u32 bit_off, num; + unsigned int page_end; + u64 phys; + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, + &num); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Save two copies, one for insert, and one that can + * be changed by ocfs2_map_and_dirty_page() below. + */ + block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + /* + * Non sparse file systems zero on extend, so no need + * to do that now. + */ + if (!ocfs2_sparse_alloc(osb) && + PAGE_CACHE_SIZE < osb->s_clustersize) + end = PAGE_CACHE_SIZE; + + ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This should populate the 1st page for us and mark + * it up to date. + */ + ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + page_end = PAGE_CACHE_SIZE; + if (PAGE_CACHE_SIZE > osb->s_clustersize) + page_end = osb->s_clustersize; + + for (i = 0; i < num_pages; i++) + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, + pages[i], i > 0, &phys); + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_dinode_new_extent_list(inode, di); + + ocfs2_journal_dirty(handle, di_bh); + + if (has_data) { + /* + * An error at this point should be extremely rare. If + * this proves to be false, we could always re-build + * the in-inode data from our pages. 
+ */ + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_blocks = ocfs2_inode_sector_count(inode); + } + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + +out: + if (pages) { + ocfs2_unlock_and_free_pages(pages, num_pages); + kfree(pages); + } + + return ret; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. + * + * WARNING: This will kfree the truncate context + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *di_bh) +{ + int status = 0, i, flags = 0; + u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff; + u64 blkno = 0; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *root_el = &(di->id2.i_list); + u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); + struct ocfs2_extent_tree et; + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + path = ocfs2_new_path(di_bh, &di->id2.i_list, + ocfs2_journal_access_di); + if (!path) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ocfs2_extent_map_trunc(inode, new_highest_cpos); + +start: + /* + * Check that we still have allocation to delete. + */ + if (OCFS2_I(inode)->ip_clusters == 0) { + status = 0; + goto bail; + } + + /* + * Truncate always works against the rightmost tree branch. + */ + status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX); + if (status) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_commit_truncate( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + new_highest_cpos, + OCFS2_I(inode)->ip_clusters, + path->p_tree_depth); + + /* + * By now, el will point to the extent list on the bottom most + * portion of this tree. Only the tail record is considered in + * each pass. + * + * We handle the following cases, in order: + * - empty extent: delete the remaining branch + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (truncate has completed) + */ + el = path_leaf_el(path); + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)path_leaf_bh(path)->b_blocknr); + status = -EROFS; + goto bail; + } + + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + flags = rec->e_flags; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (i == 0 && ocfs2_is_empty_extent(rec)) { + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. 
+ */ + if (root_el->l_tree_depth && rec->e_int_clusters == 0) { + ocfs2_error(inode->i_sb, "Inode %lu has an empty " + "extent record, depth %u\n", inode->i_ino, + le16_to_cpu(root_el->l_tree_depth)); + status = -EROFS; + goto bail; + } + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; + } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { + /* + * Truncate entire record. + */ + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = ocfs2_rec_clusters(el, rec); + blkno = le64_to_cpu(rec->e_blkno); + } else if (range > new_highest_cpos) { + /* + * Partial truncate. it also should be + * the last truncate we're doing. + */ + trunc_cpos = new_highest_cpos; + trunc_len = range - new_highest_cpos; + coff = new_highest_cpos - le32_to_cpu(rec->e_cpos); + blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + } else { + /* + * Truncate completed, leave happily. + */ + status = 0; + goto bail; + } + + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, &dealloc, + refcount_loc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + + /* + * The check above will catch the case where we've truncated + * away all allocation. + */ + goto start; + +bail: + + ocfs2_schedule_truncate_log_flush(osb, 1); + + ocfs2_run_deallocs(osb, &dealloc); + + ocfs2_free_path(path); + + return status; +} + +/* + * 'start' is inclusive, 'end' is not. + */ +int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, + unsigned int start, unsigned int end, int trunc) +{ + int ret; + unsigned int numbytes; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inline_data *idata = &di->id2.i_data; + + if (end > i_size_read(inode)) + end = i_size_read(inode); + + BUG_ON(start >= end); + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || + !ocfs2_supports_inline_data(osb)) { + ocfs2_error(inode->i_sb, + "Inline data flags for inode %llu don't agree! " + "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + le16_to_cpu(di->i_dyn_features), + OCFS2_I(inode)->ip_dyn_features, + osb->s_feature_incompat); + ret = -EROFS; + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + numbytes = end - start; + memset(idata->id_data + start, 0, numbytes); + + /* + * No need to worry about the data page here - it's been + * truncated already and inline data doesn't need it for + * pushing zero's to disk, so we'll let readpage pick it up + * later. 
+ */ + if (trunc) { + i_size_write(inode, start); + di->i_size = cpu_to_le64(start); + } + + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + return ret; +} + +static int ocfs2_trim_extent(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 count) +{ + u64 discard, bcount; + + bcount = ocfs2_clusters_to_blocks(sb, count); + discard = le64_to_cpu(gd->bg_blkno) + + ocfs2_clusters_to_blocks(sb, start); + + trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); + + return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); +} + +static int ocfs2_trim_group(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 max, u32 minbits) +{ + int ret = 0, count = 0, next; + void *bitmap = gd->bg_bitmap; + + if (le16_to_cpu(gd->bg_free_bits_count) < minbits) + return 0; + + trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), + start, max, minbits); + + while (start < max) { + start = ocfs2_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = ocfs2_find_next_bit(bitmap, max, start); + + if ((next - start) >= minbits) { + ret = ocfs2_trim_extent(sb, gd, + start, next - start); + if (ret < 0) { + mlog_errno(ret); + break; + } + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) + break; + } + + if (ret < 0) + count = ret; + + return count; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 start, len, trimmed, first_group, last_group, group; + int ret, cnt; + u32 first_bit, last_bit, minlen; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_dinode *main_bm; + struct ocfs2_group_desc *gd = NULL; + + start = range->start >> osb->s_clustersize_bits; + len = range->len >> osb->s_clustersize_bits; + minlen = range->minlen >> osb->s_clustersize_bits; + trimmed = 0; + + if (!len) { + range->len = 0; + return 0; + } + + if (minlen >= osb->bitmap_cpg) + return -EINVAL; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; + goto out_unlock; + } + + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; + + trace_ocfs2_trim_fs(start, len, minlen); + + /* Determine first and last group to examine based on start and len */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); + last_bit = osb->bitmap_cpg; + + for (group = first_group; group <= last_group;) { + if (first_bit + 
len >= osb->bitmap_cpg) + last_bit = osb->bitmap_cpg; + else + last_bit = first_bit + len; + + ret = ocfs2_read_group_descriptor(main_bm_inode, + main_bm, group, + &gd_bh); + if (ret < 0) { + mlog_errno(ret); + break; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + brelse(gd_bh); + gd_bh = NULL; + if (cnt < 0) { + ret = cnt; + mlog_errno(ret); + break; + } + + trimmed += cnt; + len -= osb->bitmap_cpg - first_bit; + first_bit = 0; + if (group == osb->first_cluster_group_blkno) + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + else + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + } + range->len = trimmed * sb->s_blocksize; +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 0); + brelse(main_bm_bh); +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); +out: + return ret; +} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 00000000..ca381c58 --- /dev/null +++ b/fs/ocfs2/alloc.h @@ -0,0 +1,324 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ALLOC_H +#define OCFS2_ALLOC_H + + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. It needs the ocfs2_caching_info structure associated with + * I/O on the tree. With metadata ecc, we now call different journal_access + * functions for each type of metadata, so it must have the + * root_journal_access function. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. 
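+ *
+ * For example (a rough usage sketch only -- the local names di_bh, cpos,
+ * start_blk and meta_ac below are illustrative), a caller working against
+ * an inode's allocation btree would fill the tree and then operate on it:
+ *
+ *	struct ocfs2_extent_tree et;
+ *
+ *	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ *	ret = ocfs2_insert_extent(handle, &et, cpos, start_blk, 1, 0, meta_ac);
+ *
+ * using the ocfs2_init_*_extent_tree() and extent manipulation helpers
+ * declared below.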
+ */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + struct ocfs2_caching_info *et_ci; + ocfs2_journal_access_func et_root_journal_access; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +struct ocfs2_xattr_value_buf; +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb); +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); + +/* + * Read an extent block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The extent block will be validated + * with ocfs2_validate_extent_block(). + */ +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh); + +struct ocfs2_alloc_context; +int ocfs2_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, + u64 start_blk, + u32 new_clusters, + u8 flags, + struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); +struct ocfs2_cached_dealloc_ctxt; +struct ocfs2_path; +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags); +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc); + +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. 
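+ *
+ * As a rough worked example, an extent tree currently at depth 2 would
+ * reserve 2 + 2 = 4 metadata blocks under this worst-case estimate.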
+ */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) +{ + /* + * Rather than do all the work of determining how much we need + * (involves a ton of reads and locks), just ask for the + * maximal limit. That's a tree depth shift. So, one block for + * level of the tree (current l_tree_depth), one block for the + * new tree_depth==0 extent_block, and one block at the new + * top-of-the tree. + */ + return le16_to_cpu(root_el->l_tree_depth) + 2; +} + +void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); +void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di); +int ocfs2_convert_inline_data_to_extents(struct inode *inode, + struct buffer_head *di_bh); + +int ocfs2_truncate_log_init(struct ocfs2_super *osb); +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel); +int ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy); +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy); +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb); +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters); +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); + +/* + * Process local structure which describes the block unlinks done + * during an operation. This is populated via + * ocfs2_cache_block_dealloc(). + * + * ocfs2_run_deallocs() should be called after the potentially + * de-allocating routines. No journal handles should be open, and most + * locks should have been dropped. + */ +struct ocfs2_cached_dealloc_ctxt { + struct ocfs2_per_slot_free_list *c_first_suballocator; + struct ocfs2_cached_block_free *c_global_allocator; +}; +static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) +{ + c->c_first_suballocator = NULL; + c->c_global_allocator = NULL; +} +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit); +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, u64 blkno, + unsigned int bit); +static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) +{ + return c->c_global_allocator != NULL; +} +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt); + +struct ocfs2_truncate_context { + struct ocfs2_cached_dealloc_ctxt tc_dealloc; + int tc_ext_alloc_locked; /* is it cluster locked? */ + /* these get destroyed once it's passed to ocfs2_commit_truncate. */ + struct buffer_head *tc_last_eb_bh; +}; + +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end); +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, + unsigned int start, unsigned int end, int trunc); + +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh); +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); +/* + * Helper function to look at the # of clusters in an extent record. 
+ */ +static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec) +{ + /* + * Cluster count in extent records is slightly different + * between interior nodes and leaf nodes. This is to support + * unwritten extents which need a flags field in leaf node + * records, thus shrinking the available space for a clusters + * field. + */ + if (el->l_tree_depth) + return le32_to_cpu(rec->e_int_clusters); + else + return le16_to_cpu(rec->e_leaf_clusters); +} + +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) +{ + return !rec->e_leaf_clusters; +} + +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num); +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; + +#define OCFS2_MAX_PATH_DEPTH 5 + +struct ocfs2_path { + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; + +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) + +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root); +void ocfs2_free_path(struct ocfs2_path *path); +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos); +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path); +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et); +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx); +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); +#endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 00000000..ac97bca2 --- /dev/null +++ b/fs/ocfs2/aops.c @@ -0,0 +1,2048 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int status; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *buffer_cache_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + void *kaddr; + + trace_ocfs2_symlink_get_block( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); + + BUG_ON(ocfs2_inode_is_fast_symlink(inode)); + + if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { + mlog(ML_ERROR, "block offset > PATH_MAX: %llu", + (unsigned long long)iblock); + goto bail; + } + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + fe = (struct ocfs2_dinode *) bh->b_data; + + if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(fe->i_clusters))) { + mlog(ML_ERROR, "block offset is outside the allocated size: " + "%llu\n", (unsigned long long)iblock); + goto bail; + } + + /* We don't use the page cache to create symlink data, so if + * need be, copy it over from the buffer cache. */ + if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { + u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + + iblock; + buffer_cache_bh = sb_getblk(osb->sb, blkno); + if (!buffer_cache_bh) { + mlog(ML_ERROR, "couldn't getblock for symlink!\n"); + goto bail; + } + + /* we haven't locked out transactions, so a commit + * could've happened. Since we've got a reference on + * the bh, even if it commits while we're doing the + * copy, the data is still good. */ + if (buffer_jbd(buffer_cache_bh) + && ocfs2_inode_is_new(inode)) { + kaddr = kmap_atomic(bh_result->b_page, KM_USER0); + if (!kaddr) { + mlog(ML_ERROR, "couldn't kmap!\n"); + goto bail; + } + memcpy(kaddr + (bh_result->b_size * iblock), + buffer_cache_bh->b_data, + bh_result->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh_result); + } + brelse(buffer_cache_bh); + } + + map_bh(bh_result, inode->i_sb, + le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); + + err = 0; + +bail: + brelse(bh); + + return err; +} + +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = 0; + unsigned int ext_flags; + u64 max_blocks = bh_result->b_size >> inode->i_blkbits; + u64 p_blkno, count, past_eof; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + inode, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) { + /* this always does I/O for some reason. 
*/ + err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); + goto bail; + } + + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, + &ext_flags); + if (err) { + mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " + "%llu, NULL)\n", err, inode, (unsigned long long)iblock, + (unsigned long long)p_blkno); + goto bail; + } + + if (max_blocks < count) + count = max_blocks; + + /* + * ocfs2 never allocates in this function - the only time we + * need to use BH_New is when we're extending i_size on a file + * system which doesn't support holes, in which case BH_New + * allows __block_write_begin() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. + */ + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } + + /* Treat the unwritten extent as a hole for zeroing purposes. */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + + bh_result->b_size = count << inode->i_blkbits; + + if (!ocfs2_sparse_alloc(osb)) { + if (p_blkno == 0) { + err = -EIO; + mlog(ML_ERROR, + "iblock = %llu p_blkno = %llu blkno=(%llu)\n", + (unsigned long long)iblock, + (unsigned long long)p_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); + dump_stack(); + goto bail; + } + } + + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + + trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + +bail: + if (err < 0) + err = -EIO; + + return err; +} + +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh) +{ + void *kaddr; + loff_t size; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + return -EROFS; + } + + size = i_size_read(inode); + + if (size > PAGE_CACHE_SIZE || + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { + ocfs2_error(inode->i_sb, + "Inode %llu has with inline data has bad size: %Lu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)size); + return -EROFS; + } + + kaddr = kmap_atomic(page, KM_USER0); + if (size) + memcpy(kaddr, di->id2.i_data.id_data, size); + /* Clear the remaining part of the page */ + memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + SetPageUptodate(page); + + return 0; +} + +static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +{ + int ret; + struct buffer_head *di_bh = NULL; + + BUG_ON(!PageLocked(page)); + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_inline_data(inode, page, di_bh); +out: + unlock_page(page); + + brelse(di_bh); + return ret; +} + +static int ocfs2_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start 
= (loff_t)page->index << PAGE_CACHE_SHIFT; + int ret, unlock = 1; + + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, + (page ? page->index : 0)); + + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out; + } + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ret = AOP_TRUNCATED_PAGE; + goto out_inode_unlock; + } + + /* + * i_size might have just been updated as we grabed the meta lock. We + * might now be discovering a truncate that hit on another node. + * block_read_full_page->get_block freaks out if it is asked to read + * beyond the end of a file, so we check here. Callers + * (generic_file_read, vm_ops->fault) are clever enough to check i_size + * and notice that the page they just read isn't needed. + * + * XXX sys_readahead() seems to get that wrong? + */ + if (start >= i_size_read(inode)) { + zero_user(page, 0, PAGE_SIZE); + SetPageUptodate(page); + ret = 0; + goto out_alloc; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + ret = ocfs2_readpage_inline(inode, page); + else + ret = block_read_full_page(page, ocfs2_get_block); + unlock = 0; + +out_alloc: + up_read(&OCFS2_I(inode)->ip_alloc_sem); +out_inode_unlock: + ocfs2_inode_unlock(inode, 0); +out: + if (unlock) + unlock_page(page); + return ret; +} + +/* + * This is used only for read-ahead. Failures or difficult to handle + * situations are safe to ignore. + * + * Right now, we don't bother with BH_Boundary - in-inode extent lists + * are quite large (243 extents on 4k blocks), so most inodes don't + * grow out to a tree. If need be, detecting boundary extents could + * trivially be added in a future version of ocfs2_get_block(). + */ +static int ocfs2_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret, err = -EIO; + struct inode *inode = mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start; + struct page *last; + + /* + * Use the nonblocking flag for the dlm code to avoid page + * lock inversion, but don't bother with retrying. + */ + ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + return err; + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ocfs2_inode_unlock(inode, 0); + return err; + } + + /* + * Don't bother with inline-data. There isn't anything + * to read-ahead in that case anyway... + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto out_unlock; + + /* + * Check whether a remote node truncated this file - we just + * drop out in that case as it's not worth handling here. + */ + last = list_entry(pages->prev, struct page, lru); + start = (loff_t)last->index << PAGE_CACHE_SHIFT; + if (start >= i_size_read(inode)) + goto out_unlock; + + err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + +out_unlock: + up_read(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + + return err; +} + +/* Note: Because we don't support holes, our allocation has + * already happened (allocation writes zeros to the file data) + * so we don't have to worry about ordered writes in + * ocfs2_writepage. + * + * ->writepage is called during the process of invalidating the page cache + * during blocked lock processing. It can't block on any cluster locks + * to during block mapping. It's relying on the fact that the block + * mapping can't have disappeared under the dirty pages that it is + * being asked to write back. 
+ */ +static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + trace_ocfs2_writepage( + (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, + page->index); + + return block_write_full_page(page, ocfs2_get_block, wbc); +} + +/* Taken from ext3. We don't necessarily need the full blown + * functionality yet, but IMHO it's better to cut and paste the whole + * thing so we can avoid introducing our own bugs (and easily pick up + * their fixes when they happen) --Mark */ +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) +{ + sector_t status; + u64 p_blkno = 0; + int err = 0; + struct inode *inode = mapping->host; + + trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)block); + + /* We don't need to lock journal system files, since they aren't + * accessed concurrently from multiple nodes. + */ + if (!INODE_JOURNAL(inode)) { + err = ocfs2_inode_lock(inode, NULL, 0); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + + if (!INODE_JOURNAL(inode)) { + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + } + + if (err) { + mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", + (unsigned long long)block); + mlog_errno(err); + goto bail; + } + +bail: + status = err ? 0 : p_blkno; + + return status; +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + * + * Note that we never bother to allocate blocks here, and thus ignore the + * create argument. + */ +static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + u64 p_blkno, inode_blocks, contig_blocks; + unsigned int ext_flags; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* This function won't even be called if the request isn't all + * nicely aligned and of the right size, so there's no need + * for us to check any of that. 
*/ + + inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + + /* This figures out the size of the next contiguous block, and + * our logical offset */ + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + + /* We should already CoW the refcounted extent in case of create. */ + BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + + /* + * get_more_blocks() expects us to describe a hole by clearing + * the mapped bit on bh_result(). + * + * Consider an unwritten extent as a hole. + */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + else + clear_buffer_mapped(bh_result); + + /* make sure we don't map more than max_blocks blocks here as + that's all the kernel will handle at this point. */ + if (max_blocks < contig_blocks) + contig_blocks = max_blocks; + bh_result->b_size = contig_blocks << blocksize_bits; +bail: + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. Like the core uses + * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from + * truncation on another. + */ +static void ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private, + int ret, + bool is_async) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int level; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + + if (ocfs2_iocb_is_sem_locked(iocb)) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + + ocfs2_iocb_clear_rw_locked(iocb); + + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); +} + +/* + * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen + * from ext3. PageChecked() bits have been removed as OCFS2 does not + * do journalled data. + */ +static void ocfs2_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; + + jbd2_journal_invalidatepage(journal, page, offset); +} + +static int ocfs2_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; + + if (!page_has_buffers(page)) + return 0; + return jbd2_journal_try_to_free_buffers(journal, page, wait); +} + +static ssize_t ocfs2_direct_IO(int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + /* Fallback to buffered I/O if we are appending. 
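+ * Returning 0 without consuming anything is the "fall back" signal:
+ * the write path above us then pushes the data through the normal
+ * buffered write_begin/write_end route, which is also where inline
+ * data and allocation are handled.  (Sketched behaviour of the caller,
+ * not something enforced in this function.)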
*/ + if (i_size_read(inode) <= offset) + return 0; + + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); +} + +static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, + u32 cpos, + unsigned int *start, + unsigned int *end) +{ + unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + unsigned int cpp; + + cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + + cluster_start = cpos % cpp; + cluster_start = cluster_start << osb->s_clustersize_bits; + + cluster_end = cluster_start + osb->s_clustersize; + } + + BUG_ON(cluster_start > PAGE_SIZE); + BUG_ON(cluster_end > PAGE_SIZE); + + if (start) + *start = cluster_start; + if (end) + *end = cluster_end; +} + +/* + * 'from' and 'to' are the region in the page to avoid zeroing. + * + * If pagesize > clustersize, this function will avoid zeroing outside + * of the cluster boundary. + * + * from == to == 0 is code for "zero the entire cluster region" + */ +static void ocfs2_clear_page_regions(struct page *page, + struct ocfs2_super *osb, u32 cpos, + unsigned from, unsigned to) +{ + void *kaddr; + unsigned int cluster_start, cluster_end; + + ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); + + kaddr = kmap_atomic(page, KM_USER0); + + if (from || to) { + if (from > cluster_start) + memset(kaddr + cluster_start, 0, from - cluster_start); + if (to < cluster_end) + memset(kaddr + to, 0, cluster_end - to); + } else { + memset(kaddr + cluster_start, 0, cluster_end - cluster_start); + } + + kunmap_atomic(kaddr, KM_USER0); +} + +/* + * Nonsparse file systems fully allocate before we get to the write + * code. This prevents ocfs2_write() from tagging the write as an + * allocating one, which means ocfs2_map_page_blocks() might try to + * read-in the blocks at the tail of our file. Avoid reading them by + * testing i_size against each block offset. + */ +static int ocfs2_should_read_blk(struct inode *inode, struct page *page, + unsigned int block_start) +{ + u64 offset = page_offset(page) + block_start; + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 1; + + if (i_size_read(inode) > offset) + return 1; + + return 0; +} + +/* + * Some of this taken from __block_write_begin(). We already have our + * mapping by now though, and the entire write will be allocating or + * it won't, so not much need to use BH_New. + * + * This will also skip zeroing, which is handled externally. + */ +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new) +{ + int ret = 0; + struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; + unsigned int block_end, block_start; + unsigned int bsize = 1 << inode->i_blkbits; + + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + head = page_buffers(page); + for (bh = head, block_start = 0; bh != head || !block_start; + bh = bh->b_this_page, block_start += bsize) { + block_end = block_start + bsize; + + clear_buffer_new(bh); + + /* + * Ignore blocks outside of our i/o range - + * they may belong to unallocated clusters. + */ + if (block_start >= to || block_end <= from) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + continue; + } + + /* + * For an allocating write with cluster size >= page + * size, we always write the entire page. 
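+ * (For instance, with 4K pages and a 32K cluster size an allocating
+ *  write of just a few bytes is still mapped from cluster_start to
+ *  cluster_end - which for page size <= cluster size is simply
+ *  0..PAGE_CACHE_SIZE - so every page of the cluster is written out
+ *  in full.  Illustrative numbers only.)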
+ */ + if (new) + set_buffer_new(bh); + + if (!buffer_mapped(bh)) { + map_bh(bh, inode->i_sb, *p_blkno); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_new(bh) && + ocfs2_should_read_blk(inode, page, block_start) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + + *p_blkno = *p_blkno + 1; + } + + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + ret = -EIO; + } + + if (ret == 0 || !new) + return ret; + + /* + * If we get -EIO above, zero out any newly allocated blocks + * to avoid exposing stale data. + */ + bh = head; + block_start = 0; + do { + block_end = block_start + bsize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + + zero_user(page, block_start, bh->b_size); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} + +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#define OCFS2_MAX_CTXT_PAGES 1 +#else +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#endif + +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) + +/* + * Describe the state of a single cluster to be written to. + */ +struct ocfs2_write_cluster_desc { + u32 c_cpos; + u32 c_phys; + /* + * Give this a unique field because c_phys eventually gets + * filled. + */ + unsigned c_new; + unsigned c_unwritten; + unsigned c_needs_zero; +}; + +struct ocfs2_write_ctxt { + /* Logical cluster position / len of write */ + u32 w_cpos; + u32 w_clen; + + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; + + /* + * This is true if page_size > cluster_size. + * + * It triggers a set of special cases during write which might + * have to deal with allocating writes to partial pages. + */ + unsigned int w_large_pages; + + /* + * Pages involved in this write. + * + * w_target_page is the page being written to by the user. + * + * w_pages is an array of pages which always contains + * w_target_page, and in the case of an allocating write with + * page_size < cluster size, it will contain zero'd and mapped + * pages adjacent to w_target_page which need to be written + * out in so that future reads from that region will get + * zero's. + */ + unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + struct page *w_target_page; + + /* + * ocfs2_write_end() uses this to know what the real range to + * write in the target should be. 
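+ *
+ * Roughly speaking, for a plain overwrite of len bytes at pos these
+ * come out as:
+ *
+ *	w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ *	w_target_to   = w_target_from + len;
+ *
+ * while an allocating or zeroing write widens them out towards the
+ * cluster boundaries inside the page - see
+ * ocfs2_set_target_boundaries().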
+ */ + unsigned int w_target_from; + unsigned int w_target_to; + + /* + * We could use journal_current_handle() but this is cleaner, + * IMHO -Mark + */ + handle_t *w_handle; + + struct buffer_head *w_di_bh; + + struct ocfs2_cached_dealloc_ctxt w_dealloc; +}; + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +{ + int i; + + for(i = 0; i < num_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } + } +} + +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + + brelse(wc->w_di_bh); + kfree(wc); +} + +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, + struct ocfs2_super *osb, loff_t pos, + unsigned len, struct buffer_head *di_bh) +{ + u32 cend; + struct ocfs2_write_ctxt *wc; + + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); + if (!wc) + return -ENOMEM; + + wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; + cend = (pos + len - 1) >> osb->s_clustersize_bits; + wc->w_clen = cend - wc->w_cpos + 1; + get_bh(di_bh); + wc->w_di_bh = di_bh; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; + + ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + + *wcp = wc; + + return 0; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + + start = max(from, block_start); + end = min(to, block_end); + + zero_user_segment(page, start, end); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Only called when we have a failure during allocating write to write + * zero's to the newly allocated region. + */ +static void ocfs2_write_failure(struct inode *inode, + struct ocfs2_write_ctxt *wc, + loff_t user_pos, unsigned user_len) +{ + int i; + unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + to = user_pos + user_len; + struct page *tmppage; + + ocfs2_zero_new_buffers(wc->w_target_page, from, to); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + + block_commit_write(tmppage, from, to); + } + } +} + +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, + struct page *page, u32 cpos, + loff_t user_pos, unsigned user_len, + int new) +{ + int ret; + unsigned int map_from = 0, map_to = 0; + unsigned int cluster_start, cluster_end; + unsigned int user_data_from = 0, user_data_to = 0; + + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, + &cluster_start, &cluster_end); + + /* treat the write as new if the a hole/lseek spanned across + * the page boundary. 
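+ * (Illustrative example: i_size is 1000 bytes and the user writes at
+ *  pos 9000.  The page at offset 8192 begins past i_size but at or
+ *  before user_pos, so it is treated as new here and will be mapped
+ *  and zero'd rather than read in from disk.)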
+ */ + new = new | ((i_size_read(inode) <= page_offset(page)) && + (page_offset(page) <= user_pos)); + + if (page == wc->w_target_page) { + map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_to = map_from + user_len; + + if (new) + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, + new); + else + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + map_from, map_to, new); + if (ret) { + mlog_errno(ret); + goto out; + } + + user_data_from = map_from; + user_data_to = map_to; + if (new) { + map_from = cluster_start; + map_to = cluster_end; + } + } else { + /* + * If we haven't allocated the new page yet, we + * shouldn't be writing it out without copying user + * data. This is likely a math error from the caller. + */ + BUG_ON(!new); + + map_from = cluster_start; + map_to = cluster_end; + + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, new); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Parts of newly allocated pages need to be zero'd. + * + * Above, we have also rewritten 'to' and 'from' - as far as + * the rest of the function is concerned, the entire cluster + * range inside of a page needs to be written. + * + * We can skip this if the page is up to date - it's already + * been zero'd from being read in as a hole. + */ + if (new && !PageUptodate(page)) + ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + cpos, user_data_from, user_data_to); + + flush_dcache_page(page); + +out: + return ret; +} + +/* + * This function will only grab one clusters worth of pages. + */ +static int ocfs2_grab_pages_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, + struct page *mmap_page) +{ + int ret = 0, i; + unsigned long start, target_index, end_index, index; + struct inode *inode = mapping->host; + loff_t last_byte; + + target_index = user_pos >> PAGE_CACHE_SHIFT; + + /* + * Figure out how many pages we'll be manipulating here. For + * non allocating write, we just change the one + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. + */ + if (new) { + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; + } else { + wc->w_num_pages = 1; + start = target_index; + } + + for(i = 0; i < wc->w_num_pages; i++) { + index = start + i; + + if (index == target_index && mmap_page) { + /* + * ocfs2_pagemkwrite() is a little different + * and wants us to directly use the page + * passed in. + */ + lock_page(mmap_page); + + if (mmap_page->mapping != mapping) { + unlock_page(mmap_page); + /* + * Sanity check - the locking in + * ocfs2_pagemkwrite() should ensure + * that this code doesn't trigger. 
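+ * (The situation being guarded against is the page having been
+ *  truncated or re-pointed at another mapping between the fault and
+ *  this lock_page().  The locking in ocfs2_pagemkwrite() should
+ *  already rule that out, which is why this is reported as a hard
+ *  -EINVAL rather than retried.)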
+ */ + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + page_cache_get(mmap_page); + wc->w_pages[i] = mmap_page; + } else { + wc->w_pages[i] = find_or_create_page(mapping, index, + GFP_NOFS); + if (!wc->w_pages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + if (index == target_index) + wc->w_target_page = wc->w_pages[i]; + } +out: + return ret; +} + +/* + * Prepare a single cluster for write one cluster into the file. + */ +static int ocfs2_write_cluster(struct address_space *mapping, + u32 phys, unsigned int unwritten, + unsigned int should_zero, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, u32 cpos, + loff_t user_pos, unsigned user_len) +{ + int ret, i, new; + u64 v_blkno, p_blkno; + struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; + + new = phys == 0 ? 1 : 0; + if (new) { + u32 tmp_pos; + + /* + * This is safe to call with the page locks - it won't take + * any additional semaphores or cluster locks. + */ + tmp_pos = cpos; + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); + /* + * This shouldn't happen because we must have already + * calculated the correct meta data allocation required. The + * internal tree allocation code should know how to increase + * transaction credits itself. + * + * If need be, we could handle -EAGAIN for a + * RESTART_TRANS here. + */ + mlog_bug_on_msg(ret == -EAGAIN, + "Inode %llu: EAGAIN return during allocation.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } else if (unwritten) { + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, + wc->w_handle, cpos, 1, phys, + meta_ac, &wc->w_dealloc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (should_zero) + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); + else + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; + + /* + * The only reason this should fail is due to an inability to + * find the extent added. + */ + ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, + NULL); + if (ret < 0) { + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + "at logical block %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_blkno); + goto out; + } + + BUG_ON(p_blkno == 0); + + for(i = 0; i < wc->w_num_pages; i++) { + int tmpret; + + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, + wc->w_pages[i], cpos, + user_pos, user_len, + should_zero); + if (tmpret) { + mlog_errno(tmpret); + if (ret == 0) + ret = tmpret; + } + } + + /* + * We only have cleanup to do in case of allocating write. + */ + if (ret && new) + ocfs2_write_failure(inode, wc, user_pos, user_len); + +out: + + return ret; +} + +static int ocfs2_write_cluster_by_desc(struct address_space *mapping, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len) +{ + int ret, i; + loff_t cluster_off; + unsigned int local_len = len; + struct ocfs2_write_cluster_desc *desc; + struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb); + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + /* + * We have to make sure that the total write passed in + * doesn't extend past a single cluster. 
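+ *
+ * Worked example (illustrative numbers): cluster size 32K, pos 30K,
+ * len 8K.  cluster_off comes out as 30K, so local_len is clipped to
+ * 2K for this descriptor; the remaining 6K is handled on the next
+ * loop iteration once pos has advanced into the following cluster.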
+ */ + local_len = len; + cluster_off = pos & (osb->s_clustersize - 1); + if ((cluster_off + local_len) > osb->s_clustersize) + local_len = osb->s_clustersize - cluster_off; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, + wc, desc->c_cpos, pos, local_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + len -= local_len; + pos += local_len; + } + + ret = 0; +out: + return ret; +} + +/* + * ocfs2_write_end() wants to know which parts of the target page it + * should complete the write on. It's easiest to compute them ahead of + * time when a more complete view of the write is available. + */ +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len, int alloc) +{ + struct ocfs2_write_cluster_desc *desc; + + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_to = wc->w_target_from + len; + + if (alloc == 0) + return; + + /* + * Allocating write - we may have different boundaries based + * on page size and cluster size. + * + * NOTE: We can no longer compute one value from the other as + * the actual write length and user provided length may be + * different. + */ + + if (wc->w_large_pages) { + /* + * We only care about the 1st and last cluster within + * our range and whether they should be zero'd or not. Either + * value may be extended out to the start/end of a + * newly allocated cluster. + */ + desc = &wc->w_desc[0]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + &wc->w_target_from, + NULL); + + desc = &wc->w_desc[wc->w_clen - 1]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + NULL, + &wc->w_target_to); + } else { + wc->w_target_from = 0; + wc->w_target_to = PAGE_CACHE_SIZE; + } +} + +/* + * Populate each single-cluster write descriptor in the write context + * with information about the i/o to be done. + * + * Returns the number of clusters that will have to be allocated, as + * well as a worst case estimate of the number of extent records that + * would have to be created during a write to an unwritten region. + */ +static int ocfs2_populate_write_desc(struct inode *inode, + struct ocfs2_write_ctxt *wc, + unsigned int *clusters_to_alloc, + unsigned int *extents_to_split) +{ + int ret; + struct ocfs2_write_cluster_desc *desc; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 phys = 0; + int i; + + *clusters_to_alloc = 0; + *extents_to_split = 0; + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + desc->c_cpos = wc->w_cpos + i; + + if (num_clusters == 0) { + /* + * Need to look up the next extent record. + */ + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We should already CoW the refcountd extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + + /* + * Assume worst case - that we're writing in + * the middle of the extent. + * + * We can assume that the write proceeds from + * left to right, in which case the extent + * insert code is smart enough to coalesce the + * next splits into the previous records created. + */ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + *extents_to_split = *extents_to_split + 2; + } else if (phys) { + /* + * Only increment phys if it doesn't describe + * a hole. + */ + phys++; + } + + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. 
w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + + desc->c_phys = phys; + if (phys == 0) { + desc->c_new = 1; + desc->c_needs_zero = 1; + *clusters_to_alloc = *clusters_to_alloc + 1; + } + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { + desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } + + num_clusters--; + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_write_begin_inline(struct address_space *mapping, + struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct page *page; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + page = find_or_create_page(mapping, 0, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + /* + * If we don't set w_num_pages then this page won't get unlocked + * and freed on cleanup of the write context. + */ + wc->w_pages[0] = wc->w_target_page = page; + wc->w_num_pages = 1; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + ocfs2_commit_trans(osb, handle); + + mlog_errno(ret); + goto out; + } + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + ocfs2_set_inode_data_inline(inode, di); + + if (!PageUptodate(page)) { + ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (ret) { + ocfs2_commit_trans(osb, handle); + + goto out; + } + } + + wc->w_handle = handle; +out: + return ret; +} + +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (new_size <= le16_to_cpu(di->id2.i_data.id_count)) + return 1; + return 0; +} + +static int ocfs2_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, loff_t pos, + unsigned len, struct page *mmap_page, + struct ocfs2_write_ctxt *wc) +{ + int ret, written = 0; + loff_t end = pos + len; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; + + trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno, + len, (unsigned long long)pos, + oi->ip_dyn_features); + + /* + * Handle inodes which already have inline data 1st. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (mmap_page == NULL && + ocfs2_size_fits_inline_data(wc->w_di_bh, end)) + goto do_inline_write; + + /* + * The write won't fit - we have to give this inode an + * inline extent list now. + */ + ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Check whether the inode can accept inline data. + */ + if (oi->ip_clusters != 0 || i_size_read(inode) != 0) + return 0; + + /* + * Check whether the write can fit. + */ + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) + return 0; + +do_inline_write: + ret = ocfs2_write_begin_inline(mapping, inode, wc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * This signals to the caller that the data can be written + * inline. + */ + written = 1; +out: + return written ? written : ret; +} + +/* + * This function only does anything for file systems which can't + * handle sparse files. 
+ * + * What we want to do here is fill in any hole between the current end + * of allocation and the end of our write. That way the rest of the + * write path can treat it as an non-allocating write, which has no + * special case code for sparse/nonsparse files. + */ +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, + struct ocfs2_write_ctxt *wc) +{ + int ret; + loff_t newsize = pos + len; + + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + + if (newsize <= i_size_read(inode)) + return 0; + + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); + if (ret) + mlog_errno(ret); + + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + + return ret; +} + +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + mutex_lock(&osb->osb_tl_inode->i_mutex); + truncated_clusters = osb->truncated_clusters; + mutex_unlock(&osb->osb_tl_inode->i_mutex); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) +{ + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + struct ocfs2_extent_tree et; + int try_free = 1, ret1; + +try_again: + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ocfs2_supports_inline_data(osb)) { + ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, + mmap_page, wc); + if (ret == 1) { + ret = 0; + goto success; + } + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_check_range_for_refcount(inode, pos, len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } else if (ret == 1) { + clusters_need = wc->w_clen; + ret = ocfs2_refcount_cow(inode, filp, di_bh, + wc->w_cpos, wc->w_clen, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, + 
&extents_to_split); + if (ret) { + mlog_errno(ret); + goto out; + } + clusters_need += clusters_to_alloc; + + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + trace_ocfs2_write_begin_nolock( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), + le32_to_cpu(di->i_clusters), + pos, len, flags, mmap_page, + clusters_to_alloc, extents_to_split); + + /* + * We set w_target_from, w_target_to here so that + * ocfs2_write_end() knows which range in the target page to + * write out. An allocation requires that we write the entire + * cluster range. + */ + if (clusters_to_alloc || extents_to_split) { + /* + * XXX: We are stretching the limits of + * ocfs2_lock_allocators(). It greatly over-estimates + * the work to be done. + */ + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, + clusters_to_alloc); + + } + + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + wc->w_handle = handle; + + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; + } + /* + * We don't want this to fail in ocfs2_write_end(), so do it + * here. + */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_page); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, + len); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + +success: + *pagep = wc->w_target_page; + *fsdata = wc; + return 0; +out_quota: + if (clusters_to_alloc) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_write_ctxt(wc); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ret == -ENOSPC && try_free) { + /* + * Try to free some truncate log so that we can have enough + * clusters to allocate. 
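+ * We only ever make one extra pass: try_free is cleared before
+ * retrying, and ocfs2_try_to_free_truncate_log() only returns 1 after
+ * the truncate log has been flushed and the resulting journal commit
+ * has been waited on, so the goto try_again below cannot loop.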
+ */ + try_free = 0; + + ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); + if (ret1 == 1) + goto try_again; + + if (ret1 < 0) + mlog_errno(ret1); + } + + return ret; +} + +static int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct inode *inode = mapping->host; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, + fsdata, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out_fail; + } + + brelse(di_bh); + + return 0; + +out_fail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, + unsigned len, unsigned *copied, + struct ocfs2_dinode *di, + struct ocfs2_write_ctxt *wc) +{ + void *kaddr; + + if (unlikely(*copied < len)) { + if (!PageUptodate(wc->w_target_page)) { + *copied = 0; + return; + } + } + + kaddr = kmap_atomic(wc->w_target_page, KM_USER0); + memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); + kunmap_atomic(kaddr, KM_USER0); + + trace_ocfs2_write_end_inline( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)pos, *copied, + le16_to_cpu(di->id2.i_data.id_count), + le16_to_cpu(di->i_dyn_features)); +} + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i; + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_write_ctxt *wc = fsdata; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle_t *handle = wc->w_handle; + struct page *tmppage; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); + goto out_write_size; + } + + if (unlikely(copied < len)) { + if (!PageUptodate(wc->w_target_page)) + copied = 0; + + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + start+len); + } + flush_dcache_page(wc->w_target_page); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (tmppage == wc->w_target_page) { + from = wc->w_target_from; + to = wc->w_target_to; + + BUG_ON(from > PAGE_CACHE_SIZE || + to > PAGE_CACHE_SIZE || + to < from); + } else { + /* + * Pages adjacent to the target (if any) imply + * a hole-filling write in which case we want + * to flush their entire range. 
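+ * (E.g. a small write into a freshly allocated 32K cluster with 4K
+ *  pages: the pages other than w_target_page were only mapped and
+ *  zero'd by write_begin, so they are committed over
+ *  0..PAGE_CACHE_SIZE here, while the target page is committed over
+ *  w_target_from..w_target_to.  Illustrative geometry only.)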
+ */ + from = 0; + to = PAGE_CACHE_SIZE; + } + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + block_commit_write(tmppage, from, to); + } + } + +out_write_size: + pos += copied; + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_journal_dirty(handle, wc->w_di_bh); + + ocfs2_commit_trans(osb, handle); + + ocfs2_run_deallocs(osb, &wc->w_dealloc); + + ocfs2_free_write_ctxt(wc); + + return copied; +} + +static int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + struct inode *inode = mapping->host; + + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +const struct address_space_operations ocfs2_aops = { + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = ocfs2_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h new file mode 100644 index 00000000..75cf3ad9 --- /dev/null +++ b/fs/ocfs2/aops.h @@ -0,0 +1,94 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_AOPS_H +#define OCFS2_AOPS_H + +handle_t *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to); + +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new); + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); + +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)); + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page); + +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh); +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); + +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +/* all ocfs2_dio_end_io()'s fault */ +#define ocfs2_iocb_is_rw_locked(iocb) \ + test_bit(0, (unsigned long *)&iocb->private) +static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) +{ + set_bit(0, (unsigned long *)&iocb->private); + if (level) + set_bit(1, (unsigned long *)&iocb->private); + else + clear_bit(1, (unsigned long *)&iocb->private); +} + +/* + * Using a named enum representing lock types in terms of #N bit stored in + * iocb->private, which is going to be used for communication between + * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + */ +enum ocfs2_iocb_lock_bits { + OCFS2_IOCB_RW_LOCK = 0, + OCFS2_IOCB_RW_LOCK_LEVEL, + OCFS2_IOCB_SEM, + OCFS2_IOCB_NUM_LOCKS +}; + +#define ocfs2_iocb_clear_rw_locked(iocb) \ + clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) +#define ocfs2_iocb_rw_locked_level(iocb) \ + test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_sem_locked(iocb) \ + set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_sem_locked(iocb) \ + clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_sem_locked(iocb) \ + test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c new file mode 100644 index 00000000..c7ee03c2 --- /dev/null +++ b/fs/ocfs2/blockcheck.c @@ -0,0 +1,645 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.c + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2006, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "blockcheck.h" + + +/* + * We use the following conventions: + * + * d = # data bits + * p = # parity bits + * c = # total code bits (d + p) + */ + + +/* + * Calculate the bit offset in the hamming code buffer based on the bit's + * offset in the data buffer. Since the hamming code reserves all + * power-of-two bits for parity, the data bit number and the code bit + * number are offset by all the parity bits beforehand. + * + * Recall that bit numbers in hamming code are 1-based. This function + * takes the 0-based data bit from the caller. + * + * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), + * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. + * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 + * in the code buffer. + * + * The caller can pass in *p if it wants to keep track of the most recent + * number of parity bits added. This allows the function to start the + * calculation at the last place. + */ +static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) +{ + unsigned int b, p = 0; + + /* + * Data bits are 0-based, but we're talking code bits, which + * are 1-based. + */ + b = i + 1; + + /* Use the cache if it is there */ + if (p_cache) + p = *p_cache; + b += p; + + /* + * For every power of two below our bit number, bump our bit. + * + * We compare with (b + 1) because we have to compare with what b + * would be _if_ it were bumped up by the parity bit. Capice? + * + * p is set above. + */ + for (; (1 << p) < (b + 1); p++) + b++; + + if (p_cache) + *p_cache = p; + + return b; +} + +/* + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) +{ + unsigned int i, b, p = 0; + + BUG_ON(!d); + + /* + * b is the hamming code bit number. Hamming code specifies a + * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is + * for the algorithm. + * + * The i++ in the for loop is so that the start offset passed + * to ocfs2_find_next_bit_set() is one greater than the previously + * found bit. + */ + for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) + { + /* + * i is the offset in this hunk, nr + i is the total bit + * offset. + */ + b = calc_code_bit(nr + i, &p); + + /* + * Data bits in the resultant code are checked by + * parity bits that are part of the bit number + * representation. Huh? + * + * + * In other words, the parity bit at position 2^k + * checks bits in positions having bit k set in + * their binary representation. Conversely, for + * instance, bit 13, i.e. 1101(2), is checked by + * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. + * + * + * Note that 'k' is the _code_ bit number. 'b' in + * our loop. + */ + parity ^= b; + } + + /* While the data buffer was treated as little endian, the + * return value is in host endian. 
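+ *
+ * (Tiny worked example: a single set data bit at offset 0 maps to code
+ *  bit 3 - calc_code_bit() skips the parity positions 1 and 2 - so the
+ *  value returned here would be 3, i.e. that bit is covered by parity
+ *  bits 1 and 2.)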
*/ + return parity; +} + +u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) +{ + return ocfs2_hamming_encode(0, data, blocksize * 8, 0); +} + +/* + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one hunk, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix) +{ + unsigned int i, b; + + BUG_ON(!d); + + /* + * If the bit to fix has an hweight of 1, it's a parity bit. One + * busted parity bit is its own error. Nothing to do here. + */ + if (hweight32(fix) == 1) + return; + + /* + * nr + d is the bit right past the data hunk we're looking at. + * If fix after that, nothing to do + */ + if (fix >= calc_code_bit(nr + d, NULL)) + return; + + /* + * nr is the offset in the data hunk we're starting at. Let's + * start b at the offset in the code buffer. See hamming_encode() + * for a more detailed description of 'b'. + */ + b = calc_code_bit(nr, NULL); + /* If the fix is before this hunk, nothing to do */ + if (fix < b) + return; + + for (i = 0; i < d; i++, b++) + { + /* Skip past parity bits */ + while (hweight32(b) == 1) + b++; + + /* + * i is the offset in this data hunk. + * nr + i is the offset in the total data buffer. + * b is the offset in the total code buffer. + * + * Thus, when b == fix, bit i in the current hunk needs + * fixing. + */ + if (b == fix) + { + if (ocfs2_test_bit(i, data)) + ocfs2_clear_bit(i, data); + else + ocfs2_set_bit(i, data); + break; + } + } +} + +void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix) +{ + ocfs2_hamming_fix(data, blocksize * 8, 0, fix); +} + + +/* + * Debugfs handling. 
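+ *
+ * With CONFIG_DEBUG_FS the code below creates, under the parent dentry
+ * handed to ocfs2_blockcheck_stats_debugfs_install(), a "blockcheck"
+ * directory containing three read-only u64 files:
+ *
+ *	blockcheck/blocks_checked	<- stats->b_check_count
+ *	blockcheck/checksums_failed	<- stats->b_failure_count
+ *	blockcheck/ecc_recoveries	<- stats->b_recover_count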
+ */ + +#ifdef CONFIG_DEBUG_FS + +static int blockcheck_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); + +static struct dentry *blockcheck_debugfs_create(const char *name, + struct dentry *parent, + u64 *value) +{ + return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, + &blockcheck_fops); +} + +static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ + if (stats) { + debugfs_remove(stats->b_debug_check); + stats->b_debug_check = NULL; + debugfs_remove(stats->b_debug_failure); + stats->b_debug_failure = NULL; + debugfs_remove(stats->b_debug_recover); + stats->b_debug_recover = NULL; + debugfs_remove(stats->b_debug_dir); + stats->b_debug_dir = NULL; + } +} + +static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + int rc = -EINVAL; + + if (!stats) + goto out; + + stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + if (!stats->b_debug_dir) + goto out; + + stats->b_debug_check = + blockcheck_debugfs_create("blocks_checked", + stats->b_debug_dir, + &stats->b_check_count); + + stats->b_debug_failure = + blockcheck_debugfs_create("checksums_failed", + stats->b_debug_dir, + &stats->b_failure_count); + + stats->b_debug_recover = + blockcheck_debugfs_create("ecc_recoveries", + stats->b_debug_dir, + &stats->b_recover_count); + if (stats->b_debug_check && stats->b_debug_failure && + stats->b_debug_recover) + rc = 0; + +out: + if (rc) + ocfs2_blockcheck_debug_remove(stats); + return rc; +} +#else +static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return 0; +} + +static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +/* Always-called wrappers for starting and stopping the debugfs files */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return ocfs2_blockcheck_debug_install(stats, parent); +} + +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) +{ + ocfs2_blockcheck_debug_remove(stats); +} + +static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_check_count++; + new_count = stats->b_check_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Block check count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_failure_count++; + new_count = stats->b_failure_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Checksum failure count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_recover_count++; + new_count = stats->b_recover_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "ECC recovery count has wrapped\n"); +} + + + +/* + * These are the low-level APIs for using the ocfs2_block_check structure. + */ + +/* + * This function generates check information for a block. + * data is the block to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. 
+ * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + u32 crc; + u32 ecc; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + crc = crc32_le(~0, data, blocksize); + ecc = ocfs2_hamming_encode_block(data, blocksize); + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHRT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information. Like _compute, + * the function will take care of zeroing bc before calculating check codes. + * If bc is not a pointer inside data, the caller must have zeroed any + * inline ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) +{ + int rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc; + + ocfs2_blockcheck_inc_check(stats); + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) + goto out; + + ocfs2_blockcheck_inc_failure(stats); + mlog(ML_ERROR, + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + ecc = ocfs2_hamming_encode_block(data, blocksize); + ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); + + /* And check the crc32 again */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); + goto out; + } + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * This function generates check information for a list of buffer_heads. + * bhs is the blocks to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i; + u32 crc, ecc; + + BUG_ON(nr < 0); + + if (!nr) + return; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + /* + * The number of bits in a buffer is obviously b_size*8. 
+ * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHRT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information on a list of + * buffer_heads. Like _compute_bhs, the function will take care of + * zeroing bc before calculating check codes. If bc is not a pointer + * inside data, the caller must have zeroed any inline + * ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) +{ + int i, rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc, fix; + + BUG_ON(nr < 0); + + if (!nr) + return 0; + + ocfs2_blockcheck_inc_check(stats); + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) + goto out; + + ocfs2_blockcheck_inc_failure(stats); + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + for (i = 0, ecc = 0; i < nr; i++) { + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + fix = ecc ^ check.bc_ecc; + for (i = 0; i < nr; i++) { + /* + * Try the fix against each buffer. It will only affect + * one of them. + */ + ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i, fix); + } + + /* And check the crc32 again */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); + goto out; + } + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * These are the main API. They check the superblock flag before + * calling the underlying operations. + * + * They expect the buffer(s) to be in disk format. 
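+ *
+ * A sketched calling pattern (the dinode case, not code from this
+ * file) would be something like:
+ *
+ *	ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ *	... submit the buffer for write ...
+ *
+ * on the way out to disk, and ocfs2_validate_meta_ecc(sb, bh->b_data,
+ * &di->i_check) from the block validate callback once a read
+ * completes, before any field of the structure is trusted.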
+ */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute(data, sb->s_blocksize, bc); +} + +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc, + &osb->osb_ecc_stats); + + return rc; +} + +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute_bhs(bhs, nr, bc); +} + +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc, + &osb->osb_ecc_stats); + + return rc; +} + diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h new file mode 100644 index 00000000..d4b69feb --- /dev/null +++ b/fs/ocfs2/blockcheck.h @@ -0,0 +1,107 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.h + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#ifndef OCFS2_BLOCKCHECK_H +#define OCFS2_BLOCKCHECK_H + + +/* Count errors and error correction from blockcheck.c */ +struct ocfs2_blockcheck_stats { + spinlock_t b_lock; + u64 b_check_count; /* Number of blocks we've checked */ + u64 b_failure_count; /* Number of failed checksums */ + u64 b_recover_count; /* Number of blocks fixed by ecc */ + + /* + * debugfs entries, used if this is passed to + * ocfs2_blockcheck_stats_debugfs_install() + */ + struct dentry *b_debug_dir; /* Parent of the debugfs files */ + struct dentry *b_debug_check; /* Exposes b_check_count */ + struct dentry *b_debug_failure; /* Exposes b_failure_count */ + struct dentry *b_debug_recover; /* Exposes b_recover_count */ +}; + + +/* High level block API */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* Lower level API */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); + +/* Debug Initialization */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); + +/* + * Hamming code functions + */ + +/* + * Encoding hamming code parity bits for a buffer. + * + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, + unsigned int nr); +/* + * Fix a buffer with a bit error. The 'fix' is the original parity + * xor'd with the parity calculated now. + * + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one buffer, use ocfs2_hamming_fix_block(). 
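+ *
+ * Mirroring the two-buffer encode example above, a fix pass over the
+ * same hunks looks like:
+ *
+ *	fix = stored_parity ^ computed_parity;
+ *	ocfs2_hamming_fix(buf1, 512 * 8, 0, fix);
+ *	ocfs2_hamming_fix(buf2, 512 * 8, 512 * 8, fix);
+ *
+ * Only the hunk that actually contains the flipped bit is modified.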
+ */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix); + +/* Convenience wrappers for a single buffer of data */ +extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize); +extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix); +#endif diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c new file mode 100644 index 00000000..5d18ad10 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.c @@ -0,0 +1,429 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * io.c + * + * Buffer cache handling + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "inode.h" +#include "journal.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "ocfs2_trace.h" + +/* + * Bits on bh->b_state used by ocfs2. + * + * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. + */ +enum ocfs2_state_bits { + BH_NeedsValidate = BH_JBDPrivateStart, +}; + +/* Expand the magic b_state functions */ +BUFFER_FNS(NeedsValidate, needs_validate); + +int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, + struct ocfs2_caching_info *ci) +{ + int ret = 0; + + trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci); + + BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); + BUG_ON(buffer_jbd(bh)); + + /* No need to check for a soft readonly file system here. non + * journalled writes are only ever done on system files which + * can get modified during recovery even if read-only. */ + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } + + ocfs2_metadata_cache_io_lock(ci); + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (buffer_uptodate(bh)) { + ocfs2_set_buffer_uptodate(ci, bh); + } else { + /* We don't need to remove the clustered uptodate + * information for this bh as it's not marked locally + * uptodate. 
*/ + ret = -EIO; + put_bh(bh); + mlog_errno(ret); + } + + ocfs2_metadata_cache_io_unlock(ci); +out: + return ret; +} + +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); + + if (!nr) + goto bail; + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + trace_ocfs2_read_blocks_sync_jbd( + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + /* No need to wait on the buffer if it's managed by JBD. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + int i, ignore_cache = 0; + struct buffer_head *bh; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + + trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); + + BUG_ON(!ci); + BUG_ON((flags & OCFS2_BH_READAHEAD) && + (flags & OCFS2_BH_IGNORE_CACHE)); + + if (bhs == NULL) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr < 0) { + mlog(ML_ERROR, "asked to read %d blocks!\n", nr); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr == 0) { + status = 0; + goto bail; + } + + ocfs2_metadata_cache_io_lock(ci); + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(sb, block++); + if (bhs[i] == NULL) { + ocfs2_metadata_cache_io_unlock(ci); + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); + + /* There are three read-ahead cases here which we need to + * be concerned with. All three assume a buffer has + * previously been submitted with OCFS2_BH_READAHEAD + * and it hasn't yet completed I/O. + * + * 1) The current request is sync to disk. This rarely + * happens these days, and never when performance + * matters - the code can just wait on the buffer + * lock and re-submit. + * + * 2) The current request is cached, but not + * readahead. ocfs2_buffer_uptodate() will return + * false anyway, so we'll wind up waiting on the + * buffer lock to do I/O. We re-check the request + * with after getting the lock to avoid a re-submit. + * + * 3) The current request is readahead (and so must + * also be a caching one). 
We short circuit if the + * buffer is locked (under I/O) and if it's in the + * uptodate cache. The re-check from #2 catches the + * case that the previous read-ahead completes just + * before our is-it-in-flight check. + */ + + if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { + trace_ocfs2_read_blocks_from_disk( + (unsigned long long)bh->b_blocknr, + (unsigned long long)ocfs2_metadata_cache_owner(ci)); + /* We're using ignore_cache here to say + * "go to disk" */ + ignore_cache = 1; + } + + trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr, + ignore_cache, buffer_jbd(bh), buffer_dirty(bh)); + + if (buffer_jbd(bh)) { + continue; + } + + if (ignore_cache) { + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + continue; + } + + /* A read-ahead request was made - if the + * buffer is already under read-ahead from a + * previously submitted request than we are + * done here. */ + if ((flags & OCFS2_BH_READAHEAD) + && ocfs2_buffer_read_ahead(ci, bh)) + continue; + + lock_buffer(bh); + if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES + mlog(ML_ERROR, "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); +#else + unlock_buffer(bh); + continue; +#endif + } + + /* Re-check ocfs2_buffer_uptodate() as a + * previously read-ahead buffer may have + * completed I/O while we were waiting for the + * buffer lock. */ + if (!(flags & OCFS2_BH_IGNORE_CACHE) + && !(flags & OCFS2_BH_READAHEAD) + && ocfs2_buffer_uptodate(ci, bh)) { + unlock_buffer(bh); + continue; + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + if (validate) + set_buffer_needs_validate(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + continue; + } + } + + status = 0; + + for (i = (nr - 1); i >= 0; i--) { + bh = bhs[i]; + + if (!(flags & OCFS2_BH_READAHEAD)) { + /* We know this can't have changed as we hold the + * owner sem. Avoid doing any work on the bh if the + * journal has it. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. Don't need to + * remove the clustered uptodate information + * for this bh as it's not marked locally + * uptodate. */ + status = -EIO; + put_bh(bh); + bhs[i] = NULL; + continue; + } + + if (buffer_needs_validate(bh)) { + /* We never set NeedsValidate if the + * buffer was held by the journal, so + * that better not have changed */ + BUG_ON(buffer_jbd(bh)); + clear_buffer_needs_validate(bh); + status = validate(sb, bh); + if (status) { + put_bh(bh); + bhs[i] = NULL; + continue; + } + } + } + + /* Always set the buffer in the cache, even if it was + * a forced read, or read-ahead which hasn't yet + * completed. */ + ocfs2_set_buffer_uptodate(ci, bh); + } + ocfs2_metadata_cache_io_unlock(ci); + + trace_ocfs2_read_blocks_end((unsigned long long)block, nr, + flags, ignore_cache); + +bail: + + return status; +} + +/* Check whether the blkno is the super block or one of the backups. 
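+ * Anything else is a caller bug, so this BUG()s instead of returning
+ * an error.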
*/ +static void ocfs2_check_super_or_backup(struct super_block *sb, + sector_t blkno) +{ + int i; + u64 backup_blkno; + + if (blkno == OCFS2_SUPER_BLOCK_BLKNO) + return; + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + backup_blkno = ocfs2_backup_super_blkno(sb, i); + if (backup_blkno == blkno) + return; + } + + BUG(); +} + +/* + * Write super block and backups doesn't need to collaborate with journal, + * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed + * into this function. + */ +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh) +{ + int ret = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + BUG_ON(buffer_jbd(bh)); + ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + ret = -EIO; + put_bh(bh); + mlog_errno(ret); + } + +out: + return ret; +} diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 00000000..b97bcc6d --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h @@ -0,0 +1,77 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_buffer_head.h + * + * Buffer cache handling functions defined + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_BUFFER_HEAD_IO_H +#define OCFS2_BUFFER_HEAD_IO_H + +#include + +void ocfs2_end_buffer_io_sync(struct buffer_head *bh, + int uptodate); + +int ocfs2_write_block(struct ocfs2_super *osb, + struct buffer_head *bh, + struct ocfs2_caching_info *ci); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); + +/* + * If not NULL, validate() will be called on a buffer that is freshly + * read from disk. It will not be called if the buffer was in cache. + * Note that if validate() is being used for this buffer, it needs to + * be set even for a READAHEAD call, as it marks the buffer for later + * validation. 
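+ *
+ * A typical single-block caller looks roughly like this (my_validate
+ * is a stand-in for a real validator):
+ *
+ *	status = ocfs2_read_blocks(ci, blkno, 1, &bh, 0, my_validate);
+ *	if (status)
+ *		goto out;
+ *
+ * The ocfs2_read_block() wrapper below wraps exactly this call for
+ * the one-buffer case.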
+ */ +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); + +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh); + +#define OCFS2_BH_IGNORE_CACHE 1 +#define OCFS2_BH_READAHEAD 8 + +static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate); + +bail: + return status; +} + +#endif /* OCFS2_BUFFER_HEAD_IO_H */ diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile new file mode 100644 index 00000000..bc8c5e7d --- /dev/null +++ b/fs/ocfs2/cluster/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o + +ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ + quorum.o tcp.o netdebug.o ver.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c new file mode 100644 index 00000000..9a3e6bbf --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.c @@ -0,0 +1,2628 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#include "quorum.h" + +#include "masklog.h" + + +/* + * The first heartbeat pass had one global thread that would serialize all hb + * callback calls. This global serializing sem should only be removed once + * we've made sure that all callees can deal with being called concurrently + * from multiple hb region threads. + */ +static DECLARE_RWSEM(o2hb_callback_sem); + +/* + * multiple hb threads are watching multiple regions. A node is live + * whenever any of the threads sees activity from the node in its region. + */ +static DEFINE_SPINLOCK(o2hb_live_lock); +static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; +static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +static LIST_HEAD(o2hb_node_events); +static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); + +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. 
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + +#define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +#define O2HB_DB_TYPE_REGION_PINNED 7 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; + +#define O2HB_DEBUG_DIR "o2hb" +#define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" +#define O2HB_DEBUG_REGION_PINNED "pinned" + +static struct dentry *o2hb_debug_dir; +static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; + +static LIST_HEAD(o2hb_all_regions); + +static struct o2hb_callback { + struct list_head list; +} o2hb_callbacks[O2HB_NUM_CB]; + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); + +#define O2HB_DEFAULT_BLOCK_BITS 9 + +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + +unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; + +/* + * o2hb_dependent_users tracks the number of registered callbacks that depend + * on heartbeat. o2net and o2dlm are two entities that register this callback. + * However only o2dlm depends on the heartbeat. It does not want the heartbeat + * to stop while a dlm domain is still active. + */ +unsigned int o2hb_dependent_users; + +/* + * In global heartbeat mode, all regions are pinned if there are one or more + * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All + * regions are unpinned if the region count exceeds the cut off or the number + * of dependent users falls to zero. + */ +#define O2HB_PIN_CUT_OFF 3 + +/* + * In local heartbeat mode, we assume the dlm domain name to be the same as + * region uuid. This is true for domains created for the file system but not + * necessarily true for userdlm domains. This is a known limitation. + * + * In global heartbeat mode, we pin/unpin all o2hb regions. This solution + * works for both file system and userdlm domains. + */ +static int o2hb_region_pin(const char *region_uuid); +static void o2hb_region_unpin(const char *region_uuid); + +/* Only sets a new threshold if there are no active regions. 
+ * + * No locking or otherwise interesting code is required for reading + * o2hb_dead_threshold as it can't change once regions are active and + * it's not interesting to anyone until then anyway. */ +static void o2hb_dead_threshold_set(unsigned int threshold) +{ + if (threshold > O2HB_MIN_DEAD_THRESHOLD) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) + o2hb_dead_threshold = threshold; + spin_unlock(&o2hb_live_lock); + } +} + +static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + +struct o2hb_node_event { + struct list_head hn_item; + enum o2hb_callback_type hn_event_type; + struct o2nm_node *hn_node; + int hn_node_num; +}; + +struct o2hb_disk_slot { + struct o2hb_disk_heartbeat_block *ds_raw_block; + u8 ds_node_num; + u64 ds_last_time; + u64 ds_last_generation; + u16 ds_equal_samples; + u16 ds_changed_samples; + struct list_head ds_live_item; +}; + +/* each thread owns a region.. when we're asked to tear down the region + * we ask the thread to stop, who cleans up the region */ +struct o2hb_region { + struct config_item hr_item; + + struct list_head hr_all_item; + unsigned hr_unclean_stop:1, + hr_item_pinned:1, + hr_item_dropped:1; + + /* protected by the hr_callback_sem */ + struct task_struct *hr_task; + + unsigned int hr_blocks; + unsigned long long hr_start_block; + + unsigned int hr_block_bits; + unsigned int hr_block_bytes; + + unsigned int hr_slots_per_page; + unsigned int hr_num_pages; + + struct page **hr_slot_data; + struct block_device *hr_bdev; + struct o2hb_disk_slot *hr_slots; + + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; + + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; + struct dentry *hr_debug_pinned; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; + struct o2hb_debug_buf *hr_db_pinned; + + /* let the person setting up hb wait for it to return until it + * has reached a 'steady' state. This will be fixed when we have + * a more complete api that doesn't lead to this sort of fragility. */ + atomic_t hr_steady_iterations; + + char hr_dev_name[BDEVNAME_SIZE]; + + unsigned int hr_timeout_ms; + + /* randomized as the region goes up and down so that a node + * recognizes a node going up and down in one iteration */ + u64 hr_generation; + + struct delayed_work hr_write_timeout_work; + unsigned long hr_last_timeout_start; + + /* Used during o2hb_check_slot to hold a copy of the block + * being checked because we temporarily have to zero out the + * crc field. 
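+	 * (o2hb_compute_block_crc_le() zeroes and restores hb_cksum on
+	 * this copy rather than on the live slot data.)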
*/ + struct o2hb_disk_heartbeat_block *hr_tmp_block; +}; + +struct o2hb_bio_wait_ctxt { + atomic_t wc_num_reqs; + struct completion wc_io_complete; + int wc_error; +}; + +static int o2hb_pop_count(void *map, int count) +{ + int i = -1, pop = 0; + + while ((i = find_next_bit(map, count, i + 1)) < count) + pop++; + return pop; +} + +static void o2hb_write_timeout(struct work_struct *work) +{ + int failed, quorum; + unsigned long flags; + struct o2hb_region *reg = + container_of(work, struct o2hb_region, + hr_write_timeout_work.work); + + mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " + "milliseconds\n", reg->hr_dev_name, + jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + + o2quo_disk_timeout(); +} + +static void o2hb_arm_write_timeout(struct o2hb_region *reg) +{ + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); + + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } + cancel_delayed_work(®->hr_write_timeout_work); + reg->hr_last_timeout_start = jiffies; + schedule_delayed_work(®->hr_write_timeout_work, + msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); +} + +static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +{ + cancel_delayed_work_sync(®->hr_write_timeout_work); +} + +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) +{ + atomic_set(&wc->wc_num_reqs, 1); + init_completion(&wc->wc_io_complete); + wc->wc_error = 0; +} + +/* Used in error paths too */ +static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, + unsigned int num) +{ + /* sadly atomic_sub_and_test() isn't available on all platforms. The + * good news is that the fast path only completes one at a time */ + while(num--) { + if (atomic_dec_and_test(&wc->wc_num_reqs)) { + BUG_ON(num > 0); + complete(&wc->wc_io_complete); + } + } +} + +static void o2hb_wait_on_io(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc) +{ + o2hb_bio_wait_dec(wc, 1); + wait_for_completion(&wc->wc_io_complete); +} + +static void o2hb_bio_end_io(struct bio *bio, + int error) +{ + struct o2hb_bio_wait_ctxt *wc = bio->bi_private; + + if (error) { + mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } + + o2hb_bio_wait_dec(wc, 1); + bio_put(bio); +} + +/* Setup a Bio to cover I/O against num_slots slots starting at + * start_slot. 
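+ *
+ * The bio works in 512 byte sectors, so slot N starts at sector
+ * (hr_start_block + N) << (hr_block_bits - 9); with 4K heartbeat
+ * blocks that is a left shift by 3.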
*/ +static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc, + unsigned int *current_slot, + unsigned int max_slots) +{ + int len, current_page; + unsigned int vec_len, vec_start; + unsigned int bits = reg->hr_block_bits; + unsigned int spp = reg->hr_slots_per_page; + unsigned int cs = *current_slot; + struct bio *bio; + struct page *page; + + /* Testing has shown this allocation to take long enough under + * GFP_KERNEL that the local node can get fenced. It would be + * nicest if we could pre-allocate these bios and avoid this + * all together. */ + bio = bio_alloc(GFP_ATOMIC, 16); + if (!bio) { + mlog(ML_ERROR, "Could not alloc slots BIO!\n"); + bio = ERR_PTR(-ENOMEM); + goto bail; + } + + /* Must put everything in 512 byte sectors for the bio... */ + bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_bdev = reg->hr_bdev; + bio->bi_private = wc; + bio->bi_end_io = o2hb_bio_end_io; + + vec_start = (cs << bits) % PAGE_CACHE_SIZE; + while(cs < max_slots) { + current_page = cs / spp; + page = reg->hr_slot_data[current_page]; + + vec_len = min(PAGE_CACHE_SIZE - vec_start, + (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + + mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", + current_page, vec_len, vec_start); + + len = bio_add_page(bio, page, vec_len, vec_start); + if (len != vec_len) break; + + cs += vec_len / (PAGE_CACHE_SIZE/spp); + vec_start = 0; + } + +bail: + *current_slot = cs; + return bio; +} + +static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int max_slots) +{ + unsigned int current_slot=0; + int status; + struct o2hb_bio_wait_ctxt wc; + struct bio *bio; + + o2hb_bio_wait_init(&wc); + + while(current_slot < max_slots) { + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail_and_wait; + } + + atomic_inc(&wc.wc_num_reqs); + submit_bio(READ, bio); + } + + status = 0; + +bail_and_wait: + o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; + + return status; +} + +static int o2hb_issue_node_write(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *write_wc) +{ + int status; + unsigned int slot; + struct bio *bio; + + o2hb_bio_wait_init(write_wc); + + slot = o2nm_this_node(); + + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail; + } + + atomic_inc(&write_wc->wc_num_reqs); + submit_bio(WRITE, bio); + + status = 0; +bail: + return status; +} + +static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + __le32 old_cksum; + u32 ret; + + /* We want to compute the block crc with a 0 value in the + * hb_cksum field. Save it off here and replace after the + * crc. 
*/ + old_cksum = hb_block->hb_cksum; + hb_block->hb_cksum = 0; + + ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes); + + hb_block->hb_cksum = old_cksum; + + return ret; +} + +static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) +{ + mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, " + "cksum = 0x%x, generation 0x%llx\n", + (long long)le64_to_cpu(hb_block->hb_seq), + hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum), + (long long)le64_to_cpu(hb_block->hb_generation)); +} + +static int o2hb_verify_crc(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + u32 read, computed; + + read = le32_to_cpu(hb_block->hb_cksum); + computed = o2hb_compute_block_crc_le(reg, hb_block); + + return read == computed; +} + +/* We want to make sure that nobody is heartbeating on top of us -- + * this will help detect an invalid configuration. */ +static void o2hb_check_last_timestamp(struct o2hb_region *reg) +{ + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + char *errstr; + + slot = ®->hr_slots[o2nm_this_node()]; + /* Don't check on our 1st timestamp */ + if (!slot->ds_last_time) + return; + + hb_block = slot->ds_raw_block; + if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && + le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && + hb_block->hb_node == slot->ds_node_num) + return; + +#define ERRSTR1 "Another node is heartbeating on device" +#define ERRSTR2 "Heartbeat generation mismatch on device" +#define ERRSTR3 "Heartbeat sequence mismatch on device" + + if (hb_block->hb_node != slot->ds_node_num) + errstr = ERRSTR1; + else if (le64_to_cpu(hb_block->hb_generation) != + slot->ds_last_generation) + errstr = ERRSTR2; + else + errstr = ERRSTR3; + + mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + slot->ds_node_num, (unsigned long long)slot->ds_last_generation, + (unsigned long long)slot->ds_last_time, hb_block->hb_node, + (unsigned long long)le64_to_cpu(hb_block->hb_generation), + (unsigned long long)le64_to_cpu(hb_block->hb_seq)); +} + +static inline void o2hb_prepare_block(struct o2hb_region *reg, + u64 generation) +{ + int node_num; + u64 cputime; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + slot = ®->hr_slots[node_num]; + + hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; + memset(hb_block, 0, reg->hr_block_bytes); + /* TODO: time stuff */ + cputime = CURRENT_TIME.tv_sec; + if (!cputime) + cputime = 1; + + hb_block->hb_seq = cpu_to_le64(cputime); + hb_block->hb_node = node_num; + hb_block->hb_generation = cpu_to_le64(generation); + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); + + /* This step must always happen last! 
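+	 * The checksum covers every other field of the block, so
+	 * anything written after it would make the block fail
+	 * o2hb_verify_crc() on the readers.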
*/ + hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, + hb_block)); + + mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n", + (long long)generation, + le32_to_cpu(hb_block->hb_cksum)); +} + +static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, + struct o2nm_node *node, + int idx) +{ + struct list_head *iter; + struct o2hb_callback_func *f; + + list_for_each(iter, &hbcall->list) { + f = list_entry(iter, struct o2hb_callback_func, hc_item); + mlog(ML_HEARTBEAT, "calling funcs %p\n", f); + (f->hc_func)(node, idx, f->hc_data); + } +} + +/* Will run the list in order until we process the passed event */ +static void o2hb_run_event_list(struct o2hb_node_event *queued_event) +{ + int empty; + struct o2hb_callback *hbcall; + struct o2hb_node_event *event; + + spin_lock(&o2hb_live_lock); + empty = list_empty(&queued_event->hn_item); + spin_unlock(&o2hb_live_lock); + if (empty) + return; + + /* Holding callback sem assures we don't alter the callback + * lists when doing this, and serializes ourselves with other + * processes wanting callbacks. */ + down_write(&o2hb_callback_sem); + + spin_lock(&o2hb_live_lock); + while (!list_empty(&o2hb_node_events) + && !list_empty(&queued_event->hn_item)) { + event = list_entry(o2hb_node_events.next, + struct o2hb_node_event, + hn_item); + list_del_init(&event->hn_item); + spin_unlock(&o2hb_live_lock); + + mlog(ML_HEARTBEAT, "Node %s event for %d\n", + event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN", + event->hn_node_num); + + hbcall = hbcall_from_type(event->hn_event_type); + + /* We should *never* have gotten on to the list with a + * bad type... This isn't something that we should try + * to recover from. */ + BUG_ON(IS_ERR(hbcall)); + + o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num); + + spin_lock(&o2hb_live_lock); + } + spin_unlock(&o2hb_live_lock); + + up_write(&o2hb_callback_sem); +} + +static void o2hb_queue_node_event(struct o2hb_node_event *event, + enum o2hb_callback_type type, + struct o2nm_node *node, + int node_num) +{ + assert_spin_locked(&o2hb_live_lock); + + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + + event->hn_event_type = type; + event->hn_node = node; + event->hn_node_num = node_num; + + mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", + type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num); + + list_add_tail(&event->hn_item, &o2hb_node_events); +} + +static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) +{ + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return; + + spin_lock(&o2hb_live_lock); + if (!list_empty(&slot->ds_live_item)) { + mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n", + slot->ds_node_num); + + list_del_init(&slot->ds_live_item); + + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + } + } + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + o2nm_node_put(node); +} + +static void o2hb_set_quorum_device(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + assert_spin_locked(&o2hb_live_lock); + + if (!o2hb_global_heartbeat_active()) + return; + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + return; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. 
In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + return; + + if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) + return; + + printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", + config_item_name(®->hr_item)); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + + /* + * If global heartbeat active, unpin all regions if the + * region count > CUT_OFF + */ + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) + o2hb_region_unpin(NULL); +} + +static int o2hb_check_slot(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + int changed = 0, gen_changed = 0; + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; + u64 cputime; + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; + unsigned int slot_dead_ms; + int tmp; + + memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); + + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } + + if (!o2hb_verify_crc(reg, hb_block)) { + /* all paths from here will drop o2hb_live_lock for + * us. */ + spin_lock(&o2hb_live_lock); + + /* Don't print an error on the console in this case - + * a freshly formatted heartbeat area will not have a + * crc set on it. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* The node is live but pushed out a bad crc. We + * consider it a transient miss but don't populate any + * other values as they may be junk. */ + mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", + slot->ds_node_num, reg->hr_dev_name); + o2hb_dump_slot(hb_block); + + slot->ds_equal_samples++; + goto fire_callbacks; + } + + /* we don't care if these wrap.. the state transitions below + * clear at the right places */ + cputime = le64_to_cpu(hb_block->hb_seq); + if (slot->ds_last_time != cputime) + slot->ds_changed_samples++; + else + slot->ds_equal_samples++; + slot->ds_last_time = cputime; + + /* The node changed heartbeat generations. We assume this to + * mean it dropped off but came back before we timed out. We + * want to consider it down for the time being but don't want + * to lose any changed_samples state we might build up to + * considering it live again. 
*/ + if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { + gen_changed = 1; + slot->ds_equal_samples = 0; + mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx " + "to 0x%llx)\n", slot->ds_node_num, + (long long)slot->ds_last_generation, + (long long)le64_to_cpu(hb_block->hb_generation)); + } + + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + + mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x " + "seq %llu last %llu changed %u equal %u\n", + slot->ds_node_num, (long long)slot->ds_last_generation, + le32_to_cpu(hb_block->hb_cksum), + (unsigned long long)le64_to_cpu(hb_block->hb_seq), + (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, + slot->ds_equal_samples); + + spin_lock(&o2hb_live_lock); + +fire_callbacks: + /* dead nodes only come to life after some number of + * changes at any time during their dead time */ + if (list_empty(&slot->ds_live_item) && + slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { + mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", + slot->ds_node_num, (long long)slot->ds_last_generation); + + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + + /* first on the list generates a callback */ + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); + set_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, + slot->ds_node_num); + + changed = 1; + } + + list_add_tail(&slot->ds_live_item, + &o2hb_live_slots[slot->ds_node_num]); + + slot->ds_equal_samples = 0; + + /* We want to be sure that all nodes agree on the + * number of milliseconds before a node will be + * considered dead. The self-fencing timeout is + * computed from this value, and a discrepancy might + * result in heartbeat calling a node dead when it + * hasn't self-fenced yet. */ + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); + if (slot_dead_ms && slot_dead_ms != dead_ms) { + /* TODO: Perhaps we can fail the region here. */ + mlog(ML_ERROR, "Node %d on device %s has a dead count " + "of %u ms, but our count is %u ms.\n" + "Please double check your configuration values " + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + dead_ms); + } + goto out; + } + + /* if the list is dead, we're done.. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* live nodes only go dead after enough consequtive missed + * samples.. reset the missed counter whenever we see + * activity */ + if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { + mlog(ML_HEARTBEAT, "Node %d left my region\n", + slot->ds_node_num); + + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + + /* last off the live_slot generates a callback */ + list_del_init(&slot->ds_live_item); + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); + + changed = 1; + } + + /* We don't clear this because the node is still + * actually writing new blocks. 
*/ + if (!gen_changed) + slot->ds_changed_samples = 0; + goto out; + } + if (slot->ds_changed_samples) { + slot->ds_changed_samples = 0; + slot->ds_equal_samples = 0; + } +out: + o2hb_set_quorum_device(reg, slot); + + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + if (node) + o2nm_node_put(node); + return changed; +} + +/* This could be faster if we just implmented a find_last_bit, but I + * don't think the circumstances warrant it. */ +static int o2hb_highest_node(unsigned long *nodes, + int numbits) +{ + int highest, node; + + highest = numbits; + node = -1; + while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { + if (node >= numbits) + break; + + highest = node; + } + + return highest; +} + +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) +{ + int i, ret, highest_node, change = 0; + unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct o2hb_bio_wait_ctxt write_wc; + + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES) { + mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); + return -EINVAL; + } + + /* No sense in reading the slots of nodes that don't exist + * yet. Of course, if the node definitions have holes in them + * then we're reading an empty slot anyway... Consider this + * best-effort. */ + ret = o2hb_read_slots(reg, highest_node + 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* With an up to date view of the slots, we can check that no + * other node has been improperly configured to heartbeat in + * our slot. */ + o2hb_check_last_timestamp(reg); + + /* fill in the proper info for our next heartbeat */ + o2hb_prepare_block(reg, reg->hr_generation); + + /* And fire off the write. Note that we don't wait on this I/O + * until later. */ + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + i = -1; + while((i = find_next_bit(configured_nodes, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + change |= o2hb_check_slot(reg, ®->hr_slots[i]); + } + + /* + * We have to be sure we've advertised ourselves on disk + * before we can go to steady state. This ensures that + * people we find in our steady state have seen us. + */ + o2hb_wait_on_io(reg, &write_wc); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + + o2hb_arm_write_timeout(reg); + + /* let the person who launched us know when things are steady */ + if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { + if (atomic_dec_and_test(®->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } + + return 0; +} + +/* Subtract b from a, storing the result in a. a *must* have a larger + * value than b. 
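+ * (In practice the function is defensive: if a is earlier than b the
+ * result is clamped to zero rather than going negative.)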
*/ +static void o2hb_tv_subtract(struct timeval *a, + struct timeval *b) +{ + /* just return 0 when a is after b */ + if (a->tv_sec < b->tv_sec || + (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { + a->tv_sec = 0; + a->tv_usec = 0; + return; + } + + a->tv_sec -= b->tv_sec; + a->tv_usec -= b->tv_usec; + while ( a->tv_usec < 0 ) { + a->tv_sec--; + a->tv_usec += 1000000; + } +} + +static unsigned int o2hb_elapsed_msecs(struct timeval *start, + struct timeval *end) +{ + struct timeval res = *end; + + o2hb_tv_subtract(&res, start); + + return res.tv_sec * 1000 + res.tv_usec / 1000; +} + +/* + * we ride the region ref that the region dir holds. before the region + * dir is removed and drops it ref it will wait to tear down this + * thread. + */ +static int o2hb_thread(void *data) +{ + int i, ret; + struct o2hb_region *reg = data; + struct o2hb_bio_wait_ctxt write_wc; + struct timeval before_hb, after_hb; + unsigned int elapsed_msec; + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); + + set_user_nice(current, -20); + + /* Pin node */ + o2nm_depend_this_node(); + + while (!kthread_should_stop() && !reg->hr_unclean_stop) { + /* We track the time spent inside + * o2hb_do_disk_heartbeat so that we avoid more than + * hr_timeout_ms between disk writes. On busy systems + * this should result in a heartbeat which is less + * likely to time itself out. */ + do_gettimeofday(&before_hb); + + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); + + do_gettimeofday(&after_hb); + elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + before_hb.tv_sec, (unsigned long) before_hb.tv_usec, + after_hb.tv_sec, (unsigned long) after_hb.tv_usec, + elapsed_msec); + + if (elapsed_msec < reg->hr_timeout_ms) { + /* the kthread api has blocked signals for us so no + * need to record the return value. */ + msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); + } + } + + o2hb_disarm_write_timeout(reg); + + /* unclean stop is only used in very bad situation */ + for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) + o2hb_shutdown_slot(®->hr_slots[i]); + + /* Explicit down notification - avoid forcing the other nodes + * to timeout on this region when we could just as easily + * write a clear generation - thus indicating to them that + * this node has left this region. + * + * XXX: Should we skip this on unclean_stop? 
*/ + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret == 0) { + o2hb_wait_on_io(reg, &write_wc); + } else { + mlog_errno(ret); + } + + /* Unpin node */ + o2nm_undepend_this_node(); + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); + + return 0; +} + +#ifdef CONFIG_DEBUG_FS +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + char *buf = NULL; + int i = -1; + int out = 0; + + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + jiffies_to_msecs(jiffies - + reg->hr_last_timeout_start)); + goto done; + + case O2HB_DB_TYPE_REGION_PINNED: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + !!reg->hr_item_pinned); + goto done; + + default: + goto done; + } + + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + +done: + i_size_write(inode, out); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct file_operations o2hb_debug_fops = { + .open = o2hb_debug_open, + .release = o2hb_debug_release, + .read = o2hb_debug_read, + .llseek = generic_file_llseek, +}; + +void o2hb_exit(void) +{ + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; 
+ (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; +} + +int o2hb_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) + INIT_LIST_HEAD(&o2hb_callbacks[i].list); + + for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) + INIT_LIST_HEAD(&o2hb_live_slots[i]); + + INIT_LIST_HEAD(&o2hb_node_events); + + memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + + o2hb_dependent_users = 0; + + return o2hb_debug_init(); +} + +/* if we're already in a callback then we're already serialized by the sem */ +static void o2hb_fill_node_map_from_callback(unsigned long *map, + unsigned bytes) +{ + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memcpy(map, &o2hb_live_node_bitmap, bytes); +} + +/* + * get a map of all nodes that are heartbeating in any regions + */ +void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +{ + /* callers want to serialize this map and callbacks so that they + * can trust that they don't miss nodes coming to the party */ + down_read(&o2hb_callback_sem); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(map, bytes); + spin_unlock(&o2hb_live_lock); + up_read(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_fill_node_map); + +/* + * heartbeat configfs bits. The heartbeat set is a default set under + * the cluster set in nodemanager.c. + */ + +static struct o2hb_region *to_o2hb_region(struct config_item *item) +{ + return item ? 
container_of(item, struct o2hb_region, hr_item) : NULL; +} + +/* drop_item only drops its ref after killing the thread, nothing should + * be using the region anymore. this has to clean up any state that + * attributes might have built up. */ +static void o2hb_region_release(struct config_item *item) +{ + int i; + struct page *page; + struct o2hb_region *reg = to_o2hb_region(item); + + if (reg->hr_tmp_block) + kfree(reg->hr_tmp_block); + + if (reg->hr_slot_data) { + for (i = 0; i < reg->hr_num_pages; i++) { + page = reg->hr_slot_data[i]; + if (page) + __free_page(page); + } + kfree(reg->hr_slot_data); + } + + if (reg->hr_bdev) + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + + if (reg->hr_slots) + kfree(reg->hr_slots); + + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_pinned); + debugfs_remove(reg->hr_debug_dir); + + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + spin_unlock(&o2hb_live_lock); + + kfree(reg); +} + +static int o2hb_read_block_input(struct o2hb_region *reg, + const char *page, + size_t count, + unsigned long *ret_bytes, + unsigned int *ret_bits) +{ + unsigned long bytes; + char *p = (char *)page; + + bytes = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* Heartbeat and fs min / max block sizes are the same. */ + if (bytes > 4096 || bytes < 512) + return -ERANGE; + if (hweight16(bytes) != 1) + return -EINVAL; + + if (ret_bytes) + *ret_bytes = bytes; + if (ret_bits) + *ret_bits = ffs(bytes) - 1; + + return 0; +} + +static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%u\n", reg->hr_block_bytes); +} + +static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + int status; + unsigned long block_bytes; + unsigned int block_bits; + + if (reg->hr_bdev) + return -EINVAL; + + status = o2hb_read_block_input(reg, page, count, + &block_bytes, &block_bits); + if (status) + return status; + + reg->hr_block_bytes = (unsigned int)block_bytes; + reg->hr_block_bits = block_bits; + + return count; +} + +static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%llu\n", reg->hr_start_block); +} + +static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoull(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + reg->hr_start_block = tmp; + + return count; +} + +static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%d\n", reg->hr_blocks); +} + +static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > O2NM_MAX_NODES || tmp == 0) + return -ERANGE; + + reg->hr_blocks = (unsigned int)tmp; + + return count; +} + +static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, + char *page) +{ + unsigned int ret = 0; + + if (reg->hr_bdev) + ret = sprintf(page, "%s\n", reg->hr_dev_name); + + return ret; +} + +static void o2hb_init_region_params(struct o2hb_region *reg) +{ 
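+ /* Each heartbeat slot occupies one hr_block_bytes block, so a page of slot data holds PAGE_CACHE_SIZE >> hr_block_bits slots. */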
+ reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; + + mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", + reg->hr_start_block, reg->hr_blocks); + mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n", + reg->hr_block_bytes, reg->hr_block_bits); + mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms); + mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold); +} + +static int o2hb_map_slot_data(struct o2hb_region *reg) +{ + int i, j; + unsigned int last_slot; + unsigned int spp = reg->hr_slots_per_page; + struct page *page; + char *raw; + struct o2hb_disk_slot *slot; + + reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); + if (reg->hr_tmp_block == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slots = kcalloc(reg->hr_blocks, + sizeof(struct o2hb_disk_slot), GFP_KERNEL); + if (reg->hr_slots == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + slot->ds_node_num = i; + INIT_LIST_HEAD(&slot->ds_live_item); + slot->ds_raw_block = NULL; + } + + reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; + mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " + "at %u blocks per page\n", + reg->hr_num_pages, reg->hr_blocks, spp); + + reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), + GFP_KERNEL); + if (!reg->hr_slot_data) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_num_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slot_data[i] = page; + + last_slot = i * spp; + raw = page_address(page); + for (j = 0; + (j < spp) && ((j + last_slot) < reg->hr_blocks); + j++) { + BUG_ON((j + last_slot) >= reg->hr_blocks); + + slot = ®->hr_slots[j + last_slot]; + slot->ds_raw_block = + (struct o2hb_disk_heartbeat_block *) raw; + + raw += reg->hr_block_bytes; + } + } + + return 0; +} + +/* Read in all the slots available and populate the tracking + * structures so that we can start with a baseline idea of what's + * there. */ +static int o2hb_populate_slot_data(struct o2hb_region *reg) +{ + int ret, i; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + ret = o2hb_read_slots(reg, reg->hr_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We only want to get an idea of the values initially in each + * slot, so we do no verification - o2hb_check_slot will + * actually determine if each configured slot is valid and + * whether any values have changed. */ + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; + + /* Only fill the values that o2hb_check_slot uses to + * determine changing slots */ + slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + } + +out: + return ret; +} + +/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + struct task_struct *hb_task; + long fd; + int sectsize; + char *p = (char *)page; + struct file *filp = NULL; + struct inode *inode = NULL; + ssize_t ret = -EINVAL; + int live_threshold; + + if (reg->hr_bdev) + goto out; + + /* We can't heartbeat without having had our node number + * configured yet. 
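+ * o2nm_this_node() returns O2NM_MAX_NODES until a local node has been configured.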
*/ + if (o2nm_this_node() == O2NM_MAX_NODES) + goto out; + + fd = simple_strtol(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + goto out; + + if (fd < 0 || fd >= INT_MAX) + goto out; + + filp = fget(fd); + if (filp == NULL) + goto out; + + if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || + reg->hr_block_bytes == 0) + goto out; + + inode = igrab(filp->f_mapping->host); + if (inode == NULL) + goto out; + + if (!S_ISBLK(inode->i_mode)) + goto out; + + reg->hr_bdev = I_BDEV(filp->f_mapping->host); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); + if (ret) { + reg->hr_bdev = NULL; + goto out; + } + inode = NULL; + + bdevname(reg->hr_bdev, reg->hr_dev_name); + + sectsize = bdev_logical_block_size(reg->hr_bdev); + if (sectsize != reg->hr_block_bytes) { + mlog(ML_ERROR, + "blocksize %u incorrect for device, expected %d", + reg->hr_block_bytes, sectsize); + ret = -EINVAL; + goto out; + } + + o2hb_init_region_params(reg); + + /* Generation of zero is invalid */ + do { + get_random_bytes(®->hr_generation, + sizeof(reg->hr_generation)); + } while (reg->hr_generation == 0); + + ret = o2hb_map_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = o2hb_populate_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + + /* + * A node is considered live after it has beat LIVE_THRESHOLD + * times. We're not steady until we've given them a chance + * _after_ our first read. + * The default threshold is bare minimum so as to limit the delay + * during mounts. For global heartbeat, the threshold doubled for the + * first region. + */ + live_threshold = O2HB_LIVE_THRESHOLD; + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + live_threshold <<= 1; + spin_unlock(&o2hb_live_lock); + } + atomic_set(®->hr_steady_iterations, live_threshold + 1); + + hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", + reg->hr_item.ci_name); + if (IS_ERR(hb_task)) { + ret = PTR_ERR(hb_task); + mlog_errno(ret); + goto out; + } + + spin_lock(&o2hb_live_lock); + reg->hr_task = hb_task; + spin_unlock(&o2hb_live_lock); + + ret = wait_event_interruptible(o2hb_steady_queue, + atomic_read(®->hr_steady_iterations) == 0); + if (ret) { + /* We got interrupted (hello ptrace!). Clean up */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + reg->hr_task = NULL; + spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); + goto out; + } + + /* Ok, we were woken. 
Make sure it wasn't by drop_item() */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); + spin_unlock(&o2hb_live_lock); + + if (hb_task) + ret = count; + else + ret = -EIO; + + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", + config_item_name(®->hr_item)); + +out: + if (filp) + fput(filp); + if (inode) + iput(inode); + if (ret < 0) { + if (reg->hr_bdev) { + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + reg->hr_bdev = NULL; + } + } + return ret; +} + +static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, + char *page) +{ + pid_t pid = 0; + + spin_lock(&o2hb_live_lock); + if (reg->hr_task) + pid = task_pid_nr(reg->hr_task); + spin_unlock(&o2hb_live_lock); + + if (!pid) + return 0; + + return sprintf(page, "%u\n", pid); +} + +struct o2hb_region_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_region *, char *); + ssize_t (*store)(struct o2hb_region *, const char *, size_t); +}; + +static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "block_bytes", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_block_bytes_read, + .store = o2hb_region_block_bytes_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_start_block = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "start_block", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_start_block_read, + .store = o2hb_region_start_block_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_blocks = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "blocks", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_blocks_read, + .store = o2hb_region_blocks_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_dev = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dev", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_dev_read, + .store = o2hb_region_dev_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_pid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "pid", + .ca_mode = S_IRUGO | S_IRUSR }, + .show = o2hb_region_pid_read, +}; + +static struct configfs_attribute *o2hb_region_attrs[] = { + &o2hb_region_attr_block_bytes.attr, + &o2hb_region_attr_start_block.attr, + &o2hb_region_attr_blocks.attr, + &o2hb_region_attr_dev.attr, + &o2hb_region_attr_pid.attr, + NULL, +}; + +static ssize_t o2hb_region_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = 0; + + if (o2hb_region_attr->show) + ret = o2hb_region_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_region_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_region_attr->store) + ret = o2hb_region_attr->store(reg, page, count); + return ret; +} + +static struct configfs_item_operations o2hb_region_item_ops = { + .release = o2hb_region_release, + .show_attribute = o2hb_region_show, + .store_attribute = o2hb_region_store, +}; + +static struct config_item_type o2hb_region_type = { + .ct_item_ops 
= &o2hb_region_item_ops, + .ct_attrs = o2hb_region_attrs, + .ct_owner = THIS_MODULE, +}; + +/* heartbeat set */ + +struct o2hb_heartbeat_group { + struct config_group hs_group; + /* some stuff? */ +}; + +static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2hb_heartbeat_group, hs_group) + : NULL; +} + +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, + &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), + O2NM_MAX_NODES, reg); + if (!reg->hr_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_regnum = + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, + reg->hr_debug_dir, + &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, + 0, O2NM_MAX_NODES, reg); + if (!reg->hr_debug_regnum) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_elapsed_time = + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, + reg->hr_debug_dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, + 0, 0, reg); + if (!reg->hr_debug_elapsed_time) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_pinned = + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, + reg->hr_debug_dir, + &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, + 0, 0, reg); + if (!reg->hr_debug_pinned) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + return ret; +} + +static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name) +{ + struct o2hb_region *reg = NULL; + int ret; + + reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); + if (reg == NULL) + return ERR_PTR(-ENOMEM); + + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { + ret = -ENAMETOOLONG; + goto free; + } + + spin_lock(&o2hb_live_lock); + reg->hr_region_num = 0; + if (o2hb_global_heartbeat_active()) { + reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, + O2NM_MAX_REGIONS); + if (reg->hr_region_num >= O2NM_MAX_REGIONS) { + spin_unlock(&o2hb_live_lock); + ret = -EFBIG; + goto free; + } + set_bit(reg->hr_region_num, o2hb_region_bitmap); + } + list_add_tail(®->hr_all_item, &o2hb_all_regions); + spin_unlock(&o2hb_live_lock); + + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); + if (ret) { + config_item_put(®->hr_item); + goto free; + } + + return ®->hr_item; +free: + kfree(reg); + return ERR_PTR(ret); +} + +static void o2hb_heartbeat_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct task_struct *hb_task; + struct o2hb_region *reg = to_o2hb_region(item); + int quorum_region = 0; + + /* stop the thread when the user removes the region dir */ + spin_lock(&o2hb_live_lock); + if (o2hb_global_heartbeat_active()) { + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + } + hb_task = reg->hr_task; + reg->hr_task = NULL; + reg->hr_item_dropped = 1; + 
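/* with hr_item_dropped set, o2hb_region_pin() will skip this region from now on */ +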
spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); + + /* + * If we're racing a dev_write(), we need to wake them. They will + * check reg->hr_task + */ + if (atomic_read(®->hr_steady_iterations) != 0) { + atomic_set(®->hr_steady_iterations, 0); + wake_up(&o2hb_steady_queue); + } + + if (o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", + config_item_name(®->hr_item)); + + config_item_put(item); + + if (!o2hb_global_heartbeat_active() || !quorum_region) + return; + + /* + * If global heartbeat active and there are dependent users, + * pin all regions if quorum region count <= CUT_OFF + */ + spin_lock(&o2hb_live_lock); + + if (!o2hb_dependent_users) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); +} + +struct o2hb_heartbeat_group_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_heartbeat_group *, char *); + ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); +}; + +static ssize_t o2hb_heartbeat_group_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = 0; + + if (o2hb_heartbeat_group_attr->show) + ret = o2hb_heartbeat_group_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_heartbeat_group_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_heartbeat_group_attr->store) + ret = o2hb_heartbeat_group_attr->store(reg, page, count); + return ret; +} + +static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%u\n", o2hb_dead_threshold); +} + +static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* this will validate ranges for us. */ + o2hb_dead_threshold_set((unsigned int) tmp); + + return count; +} + +static +ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%s\n", + o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); +} + +static +ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, + const char *page, size_t count) +{ + unsigned int i; + int ret; + size_t len; + + len = (page[count - 1] == '\n') ? 
count - 1 : count; + if (!len) + return -EINVAL; + + for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { + if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + continue; + + ret = o2hb_global_hearbeat_mode_set(i); + if (!ret) + printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", + o2hb_heartbeat_mode_desc[i]); + return count; + } + + return -EINVAL; + +} + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dead_threshold", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_threshold_show, + .store = o2hb_heartbeat_group_threshold_store, +}; + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "mode", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_mode_show, + .store = o2hb_heartbeat_group_mode_store, +}; + +static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { + &o2hb_heartbeat_group_attr_threshold.attr, + &o2hb_heartbeat_group_attr_mode.attr, + NULL, +}; + +static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { + .show_attribute = o2hb_heartbeat_group_show, + .store_attribute = o2hb_heartbeat_group_store, +}; + +static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { + .make_item = o2hb_heartbeat_group_make_item, + .drop_item = o2hb_heartbeat_group_drop_item, +}; + +static struct config_item_type o2hb_heartbeat_group_type = { + .ct_group_ops = &o2hb_heartbeat_group_group_ops, + .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_attrs = o2hb_heartbeat_group_attrs, + .ct_owner = THIS_MODULE, +}; + +/* this is just here to avoid touching group in heartbeat.h which the + * entire damn world #includes */ +struct config_group *o2hb_alloc_hb_set(void) +{ + struct o2hb_heartbeat_group *hs = NULL; + struct config_group *ret = NULL; + + hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); + if (hs == NULL) + goto out; + + config_group_init_type_name(&hs->hs_group, "heartbeat", + &o2hb_heartbeat_group_type); + + ret = &hs->hs_group; +out: + if (ret == NULL) + kfree(hs); + return ret; +} + +void o2hb_free_hb_set(struct config_group *group) +{ + struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group); + kfree(hs); +} + +/* hb callback registration and issuing */ + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) +{ + if (type == O2HB_NUM_CB) + return ERR_PTR(-EINVAL); + + return &o2hb_callbacks[type]; +} + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority) +{ + INIT_LIST_HEAD(&hc->hc_item); + hc->hc_func = func; + hc->hc_data = data; + hc->hc_priority = priority; + hc->hc_type = type; + hc->hc_magic = O2HB_CB_MAGIC; +} +EXPORT_SYMBOL_GPL(o2hb_setup_callback); + +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only pin the matching region. In global we pin all the active + * regions. 
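+ * In both cases the caller must already hold o2hb_live_lock.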
+ */ +static int o2hb_region_pin(const char *region_uuid) +{ + int ret = 0, found = 0; + struct o2hb_region *reg; + char *uuid; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + + /* local heartbeat */ + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned || reg->hr_item_dropped) + goto skip_pin; + + /* Ignore ENOENT only for local hb (userdlm domain) */ + ret = o2nm_depend_item(®->hr_item); + if (!ret) { + mlog(ML_CLUSTER, "Pin region %s\n", uuid); + reg->hr_item_pinned = 1; + } else { + if (ret == -ENOENT && found) + ret = 0; + else { + mlog(ML_ERROR, "Pin region %s fails with %d\n", + uuid, ret); + break; + } + } +skip_pin: + if (found) + break; + } + + return ret; +} + +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only unpin the matching region. In global we unpin all the + * active regions. + */ +static void o2hb_region_unpin(const char *region_uuid) +{ + struct o2hb_region *reg; + char *uuid; + int found = 0; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned) { + mlog(ML_CLUSTER, "Unpin region %s\n", uuid); + o2nm_undepend_item(®->hr_item); + reg->hr_item_pinned = 0; + } + if (found) + break; + } +} + +static int o2hb_region_inc_user(const char *region_uuid) +{ + int ret = 0; + + spin_lock(&o2hb_live_lock); + + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + ret = o2hb_region_pin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and this is the first dependent user, + * pin all regions if quorum region count <= CUT_OFF + */ + o2hb_dependent_users++; + if (o2hb_dependent_users > 1) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + ret = o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); + return ret; +} + +void o2hb_region_dec_user(const char *region_uuid) +{ + spin_lock(&o2hb_live_lock); + + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + o2hb_region_unpin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and there are no dependent users, + * unpin all quorum regions + */ + o2hb_dependent_users--; + if (!o2hb_dependent_users) + o2hb_region_unpin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); +} + +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc) +{ + struct o2hb_callback_func *tmp; + struct list_head *iter; + struct o2hb_callback *hbcall; + int ret; + + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + BUG_ON(!list_empty(&hc->hc_item)); + + hbcall = hbcall_from_type(hc->hc_type); + if (IS_ERR(hbcall)) { + ret = PTR_ERR(hbcall); + goto out; + } + + if (region_uuid) { + ret = o2hb_region_inc_user(region_uuid); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + down_write(&o2hb_callback_sem); + + list_for_each(iter, &hbcall->list) { + tmp = list_entry(iter, struct o2hb_callback_func, hc_item); + if (hc->hc_priority < tmp->hc_priority) { + list_add_tail(&hc->hc_item, iter); + break; + } + } + if (list_empty(&hc->hc_item)) + list_add_tail(&hc->hc_item, &hbcall->list); + + up_write(&o2hb_callback_sem); + ret = 0; +out: + mlog(ML_CLUSTER, 
"returning %d on behalf of %p for funcs %p\n", + ret, __builtin_return_address(0), hc); + return ret; +} +EXPORT_SYMBOL_GPL(o2hb_register_callback); + +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc) +{ + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + + mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", + __builtin_return_address(0), hc); + + /* XXX Can this happen _with_ a region reference? */ + if (list_empty(&hc->hc_item)) + return; + + if (region_uuid) + o2hb_region_dec_user(region_uuid); + + down_write(&o2hb_callback_sem); + + list_del_init(&hc->hc_item); + + up_write(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_unregister_callback); + +int o2hb_check_node_heartbeating(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); + +int o2hb_check_node_heartbeating_from_callback(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); + +/* Makes sure our local node is configured with a node number, and is + * heartbeating. */ +int o2hb_check_local_node_heartbeating(void) +{ + u8 node_num; + + /* if this node was set then we have networking */ + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + mlog(ML_HEARTBEAT, "this node has not been configured.\n"); + return 0; + } + + return o2hb_check_node_heartbeating(node_num); +} +EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); + +/* + * this is just a hack until we get the plumbing which flips file systems + * read only and drops the hb ref instead of killing the node dead. + */ +void o2hb_stop_all_regions(void) +{ + struct o2hb_region *reg; + + mlog(ML_ERROR, "stopping heartbeat on all active regions.\n"); + + spin_lock(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) + reg->hr_unclean_stop = 1; + + spin_unlock(&o2hb_live_lock); +} +EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); + +int o2hb_get_all_regions(char *region_uuids, u8 max_regions) +{ + struct o2hb_region *reg; + int numregs = 0; + char *p; + + spin_lock(&o2hb_live_lock); + + p = region_uuids; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); + if (numregs < max_regions) { + memcpy(p, config_item_name(®->hr_item), + O2HB_MAX_REGION_NAME_LEN); + p += O2HB_MAX_REGION_NAME_LEN; + } + numregs++; + } + + spin_unlock(&o2hb_live_lock); + + return numregs; +} +EXPORT_SYMBOL_GPL(o2hb_get_all_regions); + +int o2hb_global_heartbeat_active(void) +{ + return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); +} +EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h new file mode 100644 index 00000000..00ad8e8f --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.h @@ -0,0 +1,89 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_HEARTBEAT_H +#define O2CLUSTER_HEARTBEAT_H + +#include "ocfs2_heartbeat.h" + +#define O2HB_REGION_TIMEOUT_MS 2000 + +#define O2HB_MAX_REGION_NAME_LEN 32 + +/* number of changes to be seen as live */ +#define O2HB_LIVE_THRESHOLD 2 +/* number of equal samples to be seen as dead */ +extern unsigned int o2hb_dead_threshold; +#define O2HB_DEFAULT_DEAD_THRESHOLD 31 +/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ +#define O2HB_MIN_DEAD_THRESHOLD 2 +#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) + +#define O2HB_CB_MAGIC 0x51d1e4ec + +/* callback stuff */ +enum o2hb_callback_type { + O2HB_NODE_DOWN_CB = 0, + O2HB_NODE_UP_CB, + O2HB_NUM_CB +}; + +struct o2nm_node; +typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *); + +struct o2hb_callback_func { + u32 hc_magic; + struct list_head hc_item; + o2hb_cb_func *hc_func; + void *hc_data; + int hc_priority; + enum o2hb_callback_type hc_type; +}; + +struct config_group *o2hb_alloc_hb_set(void); +void o2hb_free_hb_set(struct config_group *group); + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority); +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_fill_node_map(unsigned long *map, + unsigned bytes); +void o2hb_exit(void); +int o2hb_init(void); +int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_from_callback(u8 node_num); +int o2hb_check_local_node_heartbeating(void); +void o2hb_stop_all_regions(void); +int o2hb_get_all_regions(char *region_uuids, u8 numregions); +int o2hb_global_heartbeat_active(void); + +#endif /* O2CLUSTER_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c new file mode 100644 index 00000000..07ac24fd --- /dev/null +++ b/fs/ocfs2/cluster/masklog.c @@ -0,0 +1,155 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include "masklog.h" + +struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); +EXPORT_SYMBOL_GPL(mlog_and_bits); +struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0); +EXPORT_SYMBOL_GPL(mlog_not_bits); + +static ssize_t mlog_mask_show(u64 mask, char *buf) +{ + char *state; + + if (__mlog_test_u64(mask, mlog_and_bits)) + state = "allow"; + else if (__mlog_test_u64(mask, mlog_not_bits)) + state = "deny"; + else + state = "off"; + + return snprintf(buf, PAGE_SIZE, "%s\n", state); +} + +static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) +{ + if (!strnicmp(buf, "allow", 5)) { + __mlog_set_u64(mask, mlog_and_bits); + __mlog_clear_u64(mask, mlog_not_bits); + } else if (!strnicmp(buf, "deny", 4)) { + __mlog_set_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else if (!strnicmp(buf, "off", 3)) { + __mlog_clear_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else + return -EINVAL; + + return count; +} + +struct mlog_attribute { + struct attribute attr; + u64 mask; +}; + +#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) + +#define define_mask(_name) { \ + .attr = { \ + .name = #_name, \ + .mode = S_IRUGO | S_IWUSR, \ + }, \ + .mask = ML_##_name, \ +} + +static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { + define_mask(TCP), + define_mask(MSG), + define_mask(SOCKET), + define_mask(HEARTBEAT), + define_mask(HB_BIO), + define_mask(DLMFS), + define_mask(DLM), + define_mask(DLM_DOMAIN), + define_mask(DLM_THREAD), + define_mask(DLM_MASTER), + define_mask(DLM_RECOVERY), + define_mask(DLM_GLUE), + define_mask(VOTE), + define_mask(CONN), + define_mask(QUORUM), + define_mask(BASTS), + define_mask(CLUSTER), + define_mask(ERROR), + define_mask(NOTICE), + define_mask(KTHREAD), +}; + +static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; + +static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_show(mlog_attr->mask, buf); +} + +static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, + const char *buf, size_t count) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_store(mlog_attr->mask, buf, count); +} + +static const struct sysfs_ops mlog_attr_ops = { + .show = mlog_show, + .store = mlog_store, +}; + +static struct kobj_type mlog_ktype = { + .default_attrs = mlog_attr_ptrs, + .sysfs_ops = &mlog_attr_ops, +}; + +static struct kset mlog_kset = { + .kobj = {.ktype = &mlog_ktype}, +}; + +int mlog_sys_init(struct kset *o2cb_kset) +{ + int i = 0; + + while (mlog_attrs[i].attr.mode) { + mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + i++; + } + mlog_attr_ptrs[i] = NULL; + + kobject_set_name(&mlog_kset.kobj, "logmask"); + mlog_kset.kobj.kset = o2cb_kset; + return kset_register(&mlog_kset); +} + +void mlog_sys_shutdown(void) +{ + kset_unregister(&mlog_kset); +} diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h new file mode 100644 index 00000000..baa2b9ef --- /dev/null +++ b/fs/ocfs2/cluster/masklog.h @@ -0,0 +1,219 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. 
All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_MASKLOG_H +#define O2CLUSTER_MASKLOG_H + +/* + * For now this is a trivial wrapper around printk() that gives the critical + * ability to enable sets of debugging output at run-time. In the future this + * will almost certainly be redirected to relayfs so that it can pay a + * substantially lower heisenberg tax. + * + * Callers associate the message with a bitmask and a global bitmask is + * maintained with help from /proc. If any of the bits match the message is + * output. + * + * We must have efficient bit tests on i386 and it seems gcc still emits crazy + * code for the 64bit compare. It emits very good code for the dual unsigned + * long tests, though, completely avoiding tests that can never pass if the + * caller gives a constant bitmask that fills one of the longs with all 0s. So + * the desire is to have almost all of the calls decided on by comparing just + * one of the longs. This leads to having infrequently given bits that are + * frequently matched in the high bits. + * + * _ERROR and _NOTICE are used for messages that always go to the console and + * have appropriate KERN_ prefixes. We wrap these in our function instead of + * just calling printk() so that this can eventually make its way through + * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. + * The inline tests and macro dance give GCC the opportunity to quite cleverly + * only emit the appropriage printk() when the caller passes in a constant + * mask, as is almost always the case. + * + * All this bitmask nonsense is managed from the files under + * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward + * indication of which bits are allowed (allow) or denied (off/deny). + * ENTRY deny + * EXIT deny + * TCP off + * MSG off + * SOCKET off + * ERROR allow + * NOTICE allow + * + * Writing changes the state of a given bit and requires a strictly formatted + * single write() call: + * + * write(fd, "allow", 5); + * + * Echoing allow/deny/off string into the logmask files can flip the bits + * on or off as expected; here is the bash script for example: + * + * log_mask="/sys/fs/o2cb/log_mask" + * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do + * echo allow >"$log_mask"/"$node" + * done + * + * The debugfs.ocfs2 tool can also flip the bits with the -l option: + * + * debugfs.ocfs2 -l TCP allow + */ + +/* for task_struct */ +#include + +/* bits that are frequently given and infrequently matched in the low word */ +/* NOTE: If you add a flag, you need to also update masklog.c! 
*/ +#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */ +#define ML_MSG 0x0000000000000002ULL /* net network messages */ +#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */ +#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */ +#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */ +#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */ +#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */ +#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */ +#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */ +#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */ +#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */ +#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */ +#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */ +#define ML_CONN 0x0000000000002000ULL /* net connection management */ +#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */ +#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */ +#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */ + +/* bits that are infrequently given and frequently matched in the high word */ +#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ +#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ +#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ + +#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) +#ifndef MLOG_MASK_PREFIX +#define MLOG_MASK_PREFIX 0 +#endif + +/* + * When logging is disabled, force the bit test to 0 for anything other + * than errors and notices, allowing gcc to remove the code completely. + * When enabled, allow all masks. + */ +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) +#define ML_ALLOWED_BITS ~0 +#else +#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) +#endif + +#define MLOG_MAX_BITS 64 + +struct mlog_bits { + unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG]; +}; + +extern struct mlog_bits mlog_and_bits, mlog_not_bits; + +#if BITS_PER_LONG == 32 + +#define __mlog_test_u64(mask, bits) \ + ( (u32)(mask & 0xffffffff) & bits.words[0] || \ + ((u64)(mask) >> 32) & bits.words[1] ) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (u32)(mask & 0xffffffff); \ + bits.words[1] |= (u64)(mask) >> 32; \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~((u32)(mask & 0xffffffff)); \ + bits.words[1] &= ~((u64)(mask) >> 32); \ +} while (0) +#define MLOG_BITS_RHS(mask) { \ + { \ + [0] = (u32)(mask & 0xffffffff), \ + [1] = (u64)(mask) >> 32, \ + } \ +} + +#else /* 32bit long above, 64bit long below */ + +#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (mask); \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~(mask); \ +} while (0) +#define MLOG_BITS_RHS(mask) { { (mask) } } + +#endif + +/* + * smp_processor_id() "helpfully" screams when called outside preemptible + * regions in current kernels. sles doesn't have the variants that don't + * scream. just do this instead of trying to guess which we're building + * against.. *sigh*. + */ +#define __mlog_cpu_guess ({ \ + unsigned long _cpu = get_cpu(); \ + put_cpu(); \ + _cpu; \ +}) + +/* In the following two macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define __mlog_printk(level, fmt, args...) 
\ + printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ + task_pid_nr(current), __mlog_cpu_guess, \ + __PRETTY_FUNCTION__, __LINE__ , ##args) + +#define mlog(mask, fmt, args...) do { \ + u64 __m = MLOG_MASK_PREFIX | (mask); \ + if ((__m & ML_ALLOWED_BITS) && \ + __mlog_test_u64(__m, mlog_and_bits) && \ + !__mlog_test_u64(__m, mlog_not_bits)) { \ + if (__m & ML_ERROR) \ + __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ + else if (__m & ML_NOTICE) \ + __mlog_printk(KERN_NOTICE, fmt , ##args); \ + else __mlog_printk(KERN_INFO, fmt , ##args); \ + } \ +} while (0) + +#define mlog_errno(st) do { \ + int _st = (st); \ + if (_st != -ERESTARTSYS && _st != -EINTR && \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ + mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ +} while (0) + +#define mlog_bug_on_msg(cond, fmt, args...) do { \ + if (cond) { \ + mlog(ML_ERROR, "bug expression: " #cond "\n"); \ + mlog(ML_ERROR, fmt, ##args); \ + BUG(); \ + } \ +} while (0) + +#include +#include +int mlog_sys_init(struct kset *o2cb_subsys); +void mlog_sys_shutdown(void); + +#endif /* O2CLUSTER_MASKLOG_H */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c new file mode 100644 index 00000000..3a583590 --- /dev/null +++ b/fs/ocfs2/cluster/netdebug.c @@ -0,0 +1,543 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * netdebug.c + * + * debug functionality for o2net + * + * Copyright (C) 2005, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifdef CONFIG_DEBUG_FS + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" + +#include "tcp_internal.h" + +#define O2NET_DEBUG_DIR "o2net" +#define SC_DEBUG_NAME "sock_containers" +#define NST_DEBUG_NAME "send_tracking" +#define STATS_DEBUG_NAME "stats" + +#define SHOW_SOCK_CONTAINERS 0 +#define SHOW_SOCK_STATS 1 + +static struct dentry *o2net_dentry; +static struct dentry *sc_dentry; +static struct dentry *nst_dentry; +static struct dentry *stats_dentry; + +static DEFINE_SPINLOCK(o2net_debug_lock); + +static LIST_HEAD(sock_containers); +static LIST_HEAD(send_tracking); + +void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + list_add(&nst->st_net_debug_item, &send_tracking); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + if (!list_empty(&nst->st_net_debug_item)) + list_del_init(&nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +static struct o2net_send_tracking + *next_nst(struct o2net_send_tracking *nst_start) +{ + struct o2net_send_tracking *nst, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(nst, &nst_start->st_net_debug_item, + st_net_debug_item) { + /* discover the head of the list */ + if (&nst->st_net_debug_item == &send_tracking) + break; + + /* use st_task to detect real nsts in the list */ + if (nst->st_task != NULL) { + ret = nst; + break; + } + } + + return ret; +} + +static void *nst_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + spin_unlock(&o2net_debug_lock); + + return nst; +} + +static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + list_del_init(&dummy_nst->st_net_debug_item); + if (nst) + list_add(&dummy_nst->st_net_debug_item, + &nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return nst; /* unused, just needs to be null when done */ +} + +static int nst_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + ktime_t now; + s64 sock, send, status; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + if (!nst) + goto out; + + now = ktime_get(); + sock = ktime_to_us(ktime_sub(now, nst->st_sock_time)); + send = ktime_to_us(ktime_sub(now, nst->st_send_time)); + status = ktime_to_us(ktime_sub(now, nst->st_status_time)); + + /* get_task_comm isn't exported. oh well. 
*/ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lld usecs ago\n" + " send start: %lld usecs ago\n" + " wait start: %lld usecs ago\n", + nst, (unsigned long)task_pid_nr(nst->st_task), + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + (long long)sock, + (long long)send, + (long long)status); + +out: + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void nst_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations nst_seq_ops = { + .start = nst_seq_start, + .next = nst_seq_next, + .stop = nst_seq_stop, + .show = nst_seq_show, +}; + +static int nst_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_send_tracking *dummy_nst; + struct seq_file *seq; + int ret; + + dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); + if (dummy_nst == NULL) { + ret = -ENOMEM; + goto out; + } + dummy_nst->st_task = NULL; + + ret = seq_open(file, &nst_seq_ops); + if (ret) + goto out; + + seq = file->private_data; + seq->private = dummy_nst; + o2net_debug_add_nst(dummy_nst); + + dummy_nst = NULL; + +out: + kfree(dummy_nst); + return ret; +} + +static int nst_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_send_tracking *dummy_nst = seq->private; + + o2net_debug_del_nst(dummy_nst); + return seq_release_private(inode, file); +} + +static const struct file_operations nst_seq_fops = { + .open = nst_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nst_fop_release, +}; + +void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_add(&sc->sc_net_debug_item, &sock_containers); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_del_init(&sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +struct o2net_sock_debug { + int dbg_ctxt; + struct o2net_sock_container *dbg_sock; +}; + +static struct o2net_sock_container + *next_sc(struct o2net_sock_container *sc_start) +{ + struct o2net_sock_container *sc, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(sc, &sc_start->sc_net_debug_item, + sc_net_debug_item) { + /* discover the head of the list miscast as a sc */ + if (&sc->sc_net_debug_item == &sock_containers) + break; + + /* use sc_page to detect real scs in the list */ + if (sc->sc_page != NULL) { + ret = sc; + break; + } + } + + return ret; +} + +static void *sc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + spin_unlock(&o2net_debug_lock); + + return sc; +} + +static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + list_del_init(&dummy_sc->sc_net_debug_item); + if (sc) + list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return sc; /* unused, just needs to be null when done */ +} + +#ifdef CONFIG_OCFS2_FS_STATS +# define sc_send_count(_s) ((_s)->sc_send_count) +# 
define sc_recv_count(_s) ((_s)->sc_recv_count) +# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total)) +# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total)) +# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total)) +# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total)) +#else +# define sc_send_count(_s) (0U) +# define sc_recv_count(_s) (0U) +# define sc_tv_acquiry_total_ns(_s) (0LL) +# define sc_tv_send_total_ns(_s) (0LL) +# define sc_tv_status_total_ns(_s) (0LL) +# define sc_tv_process_total_ns(_s) (0LL) +#endif + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define O2NET_STATS_STR_VERSION 1 +static void sc_show_sock_stats(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + if (!sc) + return; + + seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION, + sc->sc_node->nd_num, (unsigned long)sc_send_count(sc), + (long long)sc_tv_acquiry_total_ns(sc), + (long long)sc_tv_send_total_ns(sc), + (long long)sc_tv_status_total_ns(sc), + (unsigned long)sc_recv_count(sc), + (long long)sc_tv_process_total_ns(sc)); +} + +static void sc_show_sock_container(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + struct inet_sock *inet = NULL; + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (!sc) + return; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->inet_saddr; + daddr = (__force __be32)inet->inet_daddr; + sport = (__force __be16)inet->inet_sport; + dport = (__force __be16)inet->inet_dport; + } + + /* XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %pI4:%u -> " + "%pI4:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lld usecs\n" + " data ready: %lld usecs\n" + " advance start: %lld usecs\n" + " advance stop: %lld usecs\n" + " func start: %lld usecs\n" + " func stop: %lld usecs\n" + " func key: 0x%08x\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + &saddr, inet ? ntohs(sport) : 0, + &daddr, inet ? 
ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + (long long)ktime_to_us(sc->sc_tv_timer), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); +} + +static int sc_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + + if (sc) { + if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS) + sc_show_sock_container(seq, sc); + else + sc_show_sock_stats(seq, sc); + } + + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void sc_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations sc_seq_ops = { + .start = sc_seq_start, + .next = sc_seq_next, + .stop = sc_seq_stop, + .show = sc_seq_show, +}; + +static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +{ + struct o2net_sock_container *dummy_sc; + struct seq_file *seq; + int ret; + + dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); + if (dummy_sc == NULL) { + ret = -ENOMEM; + goto out; + } + dummy_sc->sc_page = NULL; + + ret = seq_open(file, &sc_seq_ops); + if (ret) + goto out; + + seq = file->private_data; + seq->private = sd; + sd->dbg_sock = dummy_sc; + o2net_debug_add_sc(dummy_sc); + + dummy_sc = NULL; + +out: + kfree(dummy_sc); + return ret; +} + +static int sc_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *dummy_sc = sd->dbg_sock; + + o2net_debug_del_sc(dummy_sc); + return seq_release_private(inode, file); +} + +static int stats_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_STATS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + +static const struct file_operations stats_seq_fops = { + .open = stats_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +static int sc_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + +static const struct file_operations sc_seq_fops = { + .open = sc_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +int o2net_debugfs_init(void) +{ + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); + if (!o2net_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &nst_seq_fops); + if (!nst_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &sc_seq_fops); + if (!sc_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &stats_seq_fops); + if (!stats_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + return 0; +bail: + debugfs_remove(stats_dentry); + 
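/* debugfs_remove() is a no-op on a NULL dentry, so entries that were never created are skipped safely */ +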
debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); + return -ENOMEM; +} + +void o2net_debugfs_exit(void) +{ + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c new file mode 100644 index 00000000..bb240647 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.c @@ -0,0 +1,989 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include "tcp.h" +#include "nodemanager.h" +#include "heartbeat.h" +#include "masklog.h" +#include "sys.h" +#include "ver.h" + +/* for now we operate under the assertion that there can be only one + * cluster active at a time. Changing this will require trickling + * cluster references throughout where nodes are looked up */ +struct o2nm_cluster *o2nm_single_cluster = NULL; + +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; + +struct o2nm_node *o2nm_get_node_by_num(u8 node_num) +{ + struct o2nm_node *node = NULL; + + if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL) + goto out; + + read_lock(&o2nm_single_cluster->cl_nodes_lock); + node = o2nm_single_cluster->cl_nodes[node_num]; + if (node) + config_item_get(&node->nd_item); + read_unlock(&o2nm_single_cluster->cl_nodes_lock); +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_num); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes) +{ + struct o2nm_cluster *cluster = o2nm_single_cluster; + + BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); + + if (cluster == NULL) + return -EINVAL; + + read_lock(&cluster->cl_nodes_lock); + memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + read_unlock(&cluster->cl_nodes_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(o2nm_configured_node_map); + +static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster, + __be32 ip_needle, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; + struct rb_node *parent = NULL; + struct o2nm_node *node, *ret = NULL; + + while (*p) { + int cmp; + + parent = *p; + node = rb_entry(parent, struct o2nm_node, nd_ip_node); + + cmp = memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = node; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +struct o2nm_node *o2nm_get_node_by_ip(__be32 
addr) +{ + struct o2nm_node *node = NULL; + struct o2nm_cluster *cluster = o2nm_single_cluster; + + if (cluster == NULL) + goto out; + + read_lock(&cluster->cl_nodes_lock); + node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); + if (node) + config_item_get(&node->nd_item); + read_unlock(&cluster->cl_nodes_lock); + +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip); + +void o2nm_node_put(struct o2nm_node *node) +{ + config_item_put(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_put); + +void o2nm_node_get(struct o2nm_node *node) +{ + config_item_get(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_get); + +u8 o2nm_this_node(void) +{ + u8 node_num = O2NM_MAX_NODES; + + if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local) + node_num = o2nm_single_cluster->cl_local_node; + + return node_num; +} +EXPORT_SYMBOL_GPL(o2nm_this_node); + +/* node configfs bits */ + +static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item) +{ + return item ? + container_of(to_config_group(item), struct o2nm_cluster, + cl_group) + : NULL; +} + +static struct o2nm_node *to_o2nm_node(struct config_item *item) +{ + return item ? container_of(item, struct o2nm_node, nd_item) : NULL; +} + +static void o2nm_node_release(struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + kfree(node); +} + +static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_num); +} + +static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) +{ + /* through the first node_set .parent + * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); +} + +enum { + O2NM_NODE_ATTR_NUM = 0, + O2NM_NODE_ATTR_PORT, + O2NM_NODE_ATTR_ADDRESS, + O2NM_NODE_ATTR_LOCAL, +}; + +static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp >= O2NM_MAX_NODES) + return -ERANGE; + + /* once we're in the cl_nodes tree networking can look us up by + * node number and try to use our address and port attributes + * to connect to this node.. make sure that they've been set + * before writing the node attribute? 
*/ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + write_lock(&cluster->cl_nodes_lock); + if (cluster->cl_nodes[tmp]) + p = NULL; + else { + cluster->cl_nodes[tmp] = node; + node->nd_num = tmp; + set_bit(tmp, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + if (p == NULL) + return -EEXIST; + + return count; +} +static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); +} + +static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, + const char *page, size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u16)-1) + return -ERANGE; + + node->nd_ipv4_port = htons(tmp); + + return count; +} + +static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%pI4\n", &node->nd_ipv4_address); +} + +static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, + const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + int ret, i; + struct rb_node **p, *parent; + unsigned int octets[4]; + __be32 ipv4_addr = 0; + + ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], + &octets[1], &octets[0]); + if (ret != 4) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(octets); i++) { + if (octets[i] > 255) + return -ERANGE; + be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); + } + + ret = 0; + write_lock(&cluster->cl_nodes_lock); + if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&node->nd_ip_node, parent, p); + rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); + } + write_unlock(&cluster->cl_nodes_lock); + if (ret) + return ret; + + memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); + + return count; +} + +static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_local); +} + +static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + ssize_t ret; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + tmp = !!tmp; /* boolean of whether this node wants to be local */ + + /* setting local turns on networking rx for now so we require having + * set everything else first */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + /* the only failure case is trying to set a new local node + * when a different one is already set */ + if (tmp && tmp == cluster->cl_has_local && + cluster->cl_local_node != node->nd_num) + return -EBUSY; + + /* bring up the rx thread if we're setting the new local node. 
*/ + if (tmp && !cluster->cl_has_local) { + ret = o2net_start_listening(node); + if (ret) + return ret; + } + + if (!tmp && cluster->cl_has_local && + cluster->cl_local_node == node->nd_num) { + o2net_stop_listening(node); + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + } + + node->nd_local = tmp; + if (node->nd_local) { + cluster->cl_has_local = tmp; + cluster->cl_local_node = node->nd_num; + } + + return count; +} + +struct o2nm_node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_node *, char *); + ssize_t (*store)(struct o2nm_node *, const char *, size_t); +}; + +static struct o2nm_node_attribute o2nm_node_attr_num = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "num", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_num_read, + .store = o2nm_node_num_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_port", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_port_read, + .store = o2nm_node_ipv4_port_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_address", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_address_read, + .store = o2nm_node_ipv4_address_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_local_read, + .store = o2nm_node_local_write, +}; + +static struct configfs_attribute *o2nm_node_attrs[] = { + [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, + [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, + [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, + [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + NULL, +}; + +static int o2nm_attr_index(struct configfs_attribute *attr) +{ + int i; + for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { + if (attr == o2nm_node_attrs[i]) + return i; + } + BUG(); + return 0; +} + +static ssize_t o2nm_node_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret = 0; + + if (o2nm_node_attr->show) + ret = o2nm_node_attr->show(node, page); + return ret; +} + +static ssize_t o2nm_node_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret; + int attr_index = o2nm_attr_index(attr); + + if (o2nm_node_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + if (test_bit(attr_index, &node->nd_set_attributes)) + return -EBUSY; + + ret = o2nm_node_attr->store(node, page, count); + if (ret < count) + goto out; + + set_bit(attr_index, &node->nd_set_attributes); +out: + return ret; +} + +static struct configfs_item_operations o2nm_node_item_ops = { + .release = o2nm_node_release, + .show_attribute = o2nm_node_show, + .store_attribute = o2nm_node_store, +}; + +static struct config_item_type o2nm_node_type = { + .ct_item_ops = &o2nm_node_item_ops, + .ct_attrs = o2nm_node_attrs, + .ct_owner = THIS_MODULE, +}; + +/* node set */ + +struct o2nm_node_group { + struct config_group ns_group; + /* some stuff? 
*/ +}; + +#if 0 +static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2nm_node_group, ns_group) + : NULL; +} +#endif + +struct o2nm_cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_cluster *, char *); + ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); +}; + +static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, + unsigned int *val) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u32)-1) + return -ERANGE; + + *val = tmp; + + return count; +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_idle_timeout_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change idle timeout after " + "the first peer has agreed to it." + " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val <= cluster->cl_keepalive_delay_ms) { + mlog(ML_NOTICE, "o2net: idle timeout must be larger " + "than keepalive delay\n"); + ret = -EINVAL; + } else { + cluster->cl_idle_timeout_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_keepalive_delay_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change keepalive delay after" + " the first peer has agreed to it." 
+ " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val >= cluster->cl_idle_timeout_ms) { + mlog(ML_NOTICE, "o2net: keepalive delay must be " + "smaller than idle timeout\n"); + ret = -EINVAL; + } else { + cluster->cl_keepalive_delay_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + return o2nm_cluster_attr_write(page, count, + &cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + +static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "idle_timeout_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_idle_timeout_ms_read, + .store = o2nm_cluster_attr_idle_timeout_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "keepalive_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_keepalive_delay_ms_read, + .store = o2nm_cluster_attr_keepalive_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "reconnect_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_reconnect_delay_ms_read, + .store = o2nm_cluster_attr_reconnect_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + +static struct configfs_attribute *o2nm_cluster_attrs[] = { + &o2nm_cluster_attr_idle_timeout_ms.attr, + &o2nm_cluster_attr_keepalive_delay_ms.attr, + &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, + NULL, +}; +static ssize_t o2nm_cluster_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret = 0; + + if (o2nm_cluster_attr->show) + ret = o2nm_cluster_attr->show(cluster, page); + return ret; +} + +static ssize_t o2nm_cluster_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute 
*o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret; + + if (o2nm_cluster_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_cluster_attr->store(cluster, page, count); + if (ret < count) + goto out; +out: + return ret; +} + +static struct config_item *o2nm_node_group_make_item(struct config_group *group, + const char *name) +{ + struct o2nm_node *node = NULL; + + if (strlen(name) > O2NM_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); + if (node == NULL) + return ERR_PTR(-ENOMEM); + + strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ + config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); + spin_lock_init(&node->nd_lock); + + mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name); + + return &node->nd_item; +} + +static void o2nm_node_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); + + o2net_disconnect_node(node); + + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } + + /* XXX call into net to stop this node from trading messages */ + + write_lock(&cluster->cl_nodes_lock); + + /* XXX sloppy */ + if (node->nd_ipv4_address) + rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); + + /* nd_num might be 0 if the node number hasn't been set.. */ + if (cluster->cl_nodes[node->nd_num] == node) { + cluster->cl_nodes[node->nd_num] = NULL; + clear_bit(node->nd_num, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + + mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n", + config_item_name(&node->nd_item)); + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_node_group_group_ops = { + .make_item = o2nm_node_group_make_item, + .drop_item = o2nm_node_group_drop_item, +}; + +static struct config_item_type o2nm_node_group_type = { + .ct_group_ops = &o2nm_node_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster */ + +static void o2nm_cluster_release(struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + + kfree(cluster->cl_group.default_groups); + kfree(cluster); +} + +static struct configfs_item_operations o2nm_cluster_item_ops = { + .release = o2nm_cluster_release, + .show_attribute = o2nm_cluster_show, + .store_attribute = o2nm_cluster_store, +}; + +static struct config_item_type o2nm_cluster_type = { + .ct_item_ops = &o2nm_cluster_item_ops, + .ct_attrs = o2nm_cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +/* cluster set */ + +struct o2nm_cluster_group { + struct configfs_subsystem cs_subsys; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group) +{ + return group ? 
+ container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys) + : NULL; +} +#endif + +static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, + const char *name) +{ + struct o2nm_cluster *cluster = NULL; + struct o2nm_node_group *ns = NULL; + struct config_group *o2hb_group = NULL, *ret = NULL; + void *defs = NULL; + + /* this runs under the parent dir's i_mutex; there can be only + * one caller in here at a time */ + if (o2nm_single_cluster) + return ERR_PTR(-ENOSPC); + + cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); + ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); + defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); + o2hb_group = o2hb_alloc_hb_set(); + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + goto out; + + config_group_init_type_name(&cluster->cl_group, name, + &o2nm_cluster_type); + config_group_init_type_name(&ns->ns_group, "node", + &o2nm_node_group_type); + + cluster->cl_group.default_groups = defs; + cluster->cl_group.default_groups[0] = &ns->ns_group; + cluster->cl_group.default_groups[1] = o2hb_group; + cluster->cl_group.default_groups[2] = NULL; + rwlock_init(&cluster->cl_nodes_lock); + cluster->cl_node_ip_tree = RB_ROOT; + cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; + cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; + cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; + + ret = &cluster->cl_group; + o2nm_single_cluster = cluster; + +out: + if (ret == NULL) { + kfree(cluster); + kfree(ns); + o2hb_free_hb_set(o2hb_group); + kfree(defs); + ret = ERR_PTR(-ENOMEM); + } + + return ret; +} + +static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + int i; + struct config_item *killme; + + BUG_ON(o2nm_single_cluster != cluster); + o2nm_single_cluster = NULL; + + for (i = 0; cluster->cl_group.default_groups[i]; i++) { + killme = &cluster->cl_group.default_groups[i]->cg_item; + cluster->cl_group.default_groups[i] = NULL; + config_item_put(killme); + } + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_cluster_group_group_ops = { + .make_group = o2nm_cluster_group_make_group, + .drop_item = o2nm_cluster_group_drop_item, +}; + +static struct config_item_type o2nm_cluster_group_type = { + .ct_group_ops = &o2nm_cluster_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct o2nm_cluster_group o2nm_cluster_group = { + .cs_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "cluster", + .ci_type = &o2nm_cluster_group_type, + }, + }, + }, +}; + +int o2nm_depend_item(struct config_item *item) +{ + return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); +} + +void o2nm_undepend_item(struct config_item *item) +{ + configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); +} + +int o2nm_depend_this_node(void) +{ + int ret = 0; + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + if (!local_node) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_depend_item(&local_node->nd_item); + o2nm_node_put(local_node); + +out: + return ret; +} + +void o2nm_undepend_this_node(void) +{ + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + BUG_ON(!local_node); + + o2nm_undepend_item(&local_node->nd_item); + o2nm_node_put(local_node); +} + + +static void __exit 
exit_o2nm(void) +{ + /* XXX sync with hb callbacks and shut down hb? */ + o2net_unregister_hb_callbacks(); + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); + o2cb_sys_shutdown(); + + o2net_exit(); + o2hb_exit(); +} + +static int __init init_o2nm(void) +{ + int ret = -1; + + cluster_print_version(); + + ret = o2hb_init(); + if (ret) + goto out; + + ret = o2net_init(); + if (ret) + goto out_o2hb; + + ret = o2net_register_hb_callbacks(); + if (ret) + goto out_o2net; + + config_group_init(&o2nm_cluster_group.cs_subsys.su_group); + mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); + ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); + if (ret) { + printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); + goto out_callbacks; + } + + ret = o2cb_sys_init(); + if (!ret) + goto out; + + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); +out_callbacks: + o2net_unregister_hb_callbacks(); +out_o2net: + o2net_exit(); +out_o2hb: + o2hb_exit(); +out: + return ret; +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_o2nm) +module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h new file mode 100644 index 00000000..09ea2d38 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.h @@ -0,0 +1,88 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * nodemanager.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_NODEMANAGER_H +#define O2CLUSTER_NODEMANAGER_H + +#include "ocfs2_nodemanager.h" + +/* This totally doesn't belong here. */ +#include +#include + +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + +struct o2nm_node { + spinlock_t nd_lock; + struct config_item nd_item; + char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */ + __u8 nd_num; + /* only one address per node, as attributes, for now. */ + __be32 nd_ipv4_address; + __be16 nd_ipv4_port; + struct rb_node nd_ip_node; + /* there can be only one local node for now */ + int nd_local; + + unsigned long nd_set_attributes; +}; + +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + unsigned int cl_idle_timeout_ms; + unsigned int cl_keepalive_delay_ms; + unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; + + /* this bitmap is part of a hack for disk bitmap.. will go eventually. 
- zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +extern struct o2nm_cluster *o2nm_single_cluster; + +u8 o2nm_this_node(void); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes); +struct o2nm_node *o2nm_get_node_by_num(u8 node_num); +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); +void o2nm_node_get(struct o2nm_node *node); +void o2nm_node_put(struct o2nm_node *node); + +int o2nm_depend_item(struct config_item *item); +void o2nm_undepend_item(struct config_item *item); +int o2nm_depend_this_node(void); +void o2nm_undepend_this_node(void); + +#endif /* O2CLUSTER_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h new file mode 100644 index 00000000..3f4151da --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_heartbeat.h + * + * On-disk structures for ocfs2_heartbeat + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_HEARTBEAT_H +#define _OCFS2_HEARTBEAT_H + +struct o2hb_disk_heartbeat_block { + __le64 hb_seq; + __u8 hb_node; + __u8 hb_pad1[3]; + __le32 hb_cksum; + __le64 hb_generation; + __le32 hb_dead_ms; +}; + +#endif /* _OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h new file mode 100644 index 00000000..49b59432 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_nodemanager.h + * + * Header describing the interface between userspace and the kernel + * for the ocfs2_nodemanager module. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
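The lookup helpers declared just above hand back a node with an extra config_item reference (o2nm_get_node_by_num() and o2nm_get_node_by_ip() call config_item_get() under cl_nodes_lock), so every successful lookup must be paired with o2nm_node_put() once the caller is done. A minimal in-kernel sketch of that contract; example_describe_local_node() is a hypothetical caller and is not part of the patch.

static int example_describe_local_node(void)
{
	struct o2nm_node *node;

	node = o2nm_get_node_by_num(o2nm_this_node());
	if (!node)
		return -EINVAL;		/* no local node configured yet */

	printk(KERN_INFO "o2nm example: local node %u is %s\n",
	       node->nd_num, node->nd_name);

	o2nm_node_put(node);		/* drop the reference taken by the lookup */
	return 0;
}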
+ * + */ + +#ifndef _OCFS2_NODEMANAGER_H +#define _OCFS2_NODEMANAGER_H + +#define O2NM_API_VERSION 5 + +#define O2NM_MAX_NODES 255 +#define O2NM_INVALID_NODE_NUM 255 + +/* host name, group name, cluster name all 64 bytes */ +#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN + +/* + * Maximum number of global heartbeat regions allowed. + * **CAUTION** Changing this number will break dlm compatibility. + */ +#define O2NM_MAX_REGIONS 32 + +#endif /* _OCFS2_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c new file mode 100644 index 00000000..8f9cea15 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.c @@ -0,0 +1,331 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* This quorum hack is only here until we transition to some more rational + * approach that is driven from userspace. Honest. No foolin'. + * + * Imagine two nodes lose network connectivity to each other but they're still + * up and operating in every other way. Presumably a network timeout indicates + * that a node is broken and should be recovered. They can't both recover each + * other and both carry on without serialising their access to the file system. + * They need to decide who is authoritative. Now extend that problem to + * arbitrary groups of nodes losing connectivity between each other. + * + * So we declare that a node which has given up on connecting to a majority + * of nodes who are still heartbeating will fence itself. + * + * There are huge opportunities for races here. After we give up on a node's + * connection we need to wait long enough to give heartbeat an opportunity + * to declare the node as truly dead. We also need to be careful with the + * race between when we see a node start heartbeating and when we connect + * to it. + * + * So nodes that are in this transtion put a hold on the quorum decision + * with a counter. As they fall out of this transition they drop the count + * and if they're the last, they fire off the decision. + */ +#include +#include +#include + +#include "heartbeat.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_QUORUM +#include "masklog.h" +#include "quorum.h" + +static struct o2quo_state { + spinlock_t qs_lock; + struct work_struct qs_work; + int qs_pending; + int qs_heartbeating; + unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_connected; + unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_holds; + unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +} o2quo_state; + +/* this is horribly heavy-handed. It should instead flip the file + * system RO and call some userspace script. */ +static void o2quo_fence_self(void) +{ + /* panic spins with interrupts enabled. 
with preempt + * threads can still schedule, etc, etc */ + o2hb_stop_all_regions(); + + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; +} + +/* Indicate that a timeout occurred on a hearbeat region write. The + * other nodes in the cluster may consider us dead at that time so we + * want to "fence" ourselves so that we don't scribble on the disk + * after they think they've recovered us. This can't solve all + * problems related to writeout after recovery but this hack can at + * least close some of those gaps. When we have real fencing, this can + * go away as our node would be fenced externally before other nodes + * begin recovery. */ +void o2quo_disk_timeout(void) +{ + o2quo_fence_self(); +} + +static void o2quo_make_decision(struct work_struct *work) +{ + int quorum; + int lowest_hb, lowest_reachable = 0, fence = 0; + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); + if (lowest_hb != O2NM_MAX_NODES) + lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); + + mlog(0, "heartbeating: %d, connected: %d, " + "lowest: %d (%sreachable)\n", qs->qs_heartbeating, + qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); + + if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || + qs->qs_heartbeating == 1) + goto out; + + if (qs->qs_heartbeating & 1) { + /* the odd numbered cluster case is straight forward -- + * if we can't talk to the majority we're hosed */ + quorum = (qs->qs_heartbeating + 1)/2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + } else { + /* the even numbered cluster adds the possibility of each half + * of the cluster being able to talk amongst themselves.. 
in + * that case we're hosed if we can't talk to the group that has + * the lowest numbered node */ + quorum = qs->qs_heartbeating / 2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + else if ((qs->qs_connected == quorum) && + !lowest_reachable) { + mlog(ML_ERROR, "fencing this node because it is " + "connected to a half-quorum of %u out of %u " + "nodes which doesn't include the lowest active " + "node %u\n", quorum, qs->qs_heartbeating, + lowest_hb); + fence = 1; + } + } + +out: + spin_unlock(&qs->qs_lock); + if (fence) + o2quo_fence_self(); +} + +static void o2quo_set_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (!test_and_set_bit(node, qs->qs_hold_bm)) { + qs->qs_holds++; + mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, + "node %u\n", node); + mlog(0, "node %u, %d total\n", node, qs->qs_holds); + } +} + +static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (test_and_clear_bit(node, qs->qs_hold_bm)) { + mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); + if (--qs->qs_holds == 0) { + if (qs->qs_pending) { + qs->qs_pending = 0; + schedule_work(&qs->qs_work); + } + } + mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", + node, qs->qs_holds); + } +} + +/* as a node comes up we delay the quorum decision until we know the fate of + * the connection. the hold will be droped in conn_up or hb_down. it might be + * perpetuated by con_err until hb_down. if we already have a conn, we might + * be dropping a hold that conn_up got. */ +void o2quo_hb_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating++; + mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); + set_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + if (!test_bit(node, qs->qs_conn_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* hb going down releases any holds we might have had due to this node from + * conn_up, conn_err, or hb_up */ +void o2quo_hb_down(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating--; + mlog_bug_on_msg(qs->qs_heartbeating < 0, + "node %u, %d heartbeating\n", + node, qs->qs_heartbeating); + mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); + clear_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* this tells us that we've decided that the node is still heartbeating + * even though we've lost it's conn. it must only be called after conn_err + * and indicates that we must now make a quorum decision in the future, + * though we might be doing so after waiting for holds to drain. Here + * we'll be dropping the hold from conn_err. */ +void o2quo_hb_still_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + mlog(0, "node %u\n", node); + + qs->qs_pending = 1; + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* This is analogous to hb_up. as a node's connection comes up we delay the + * quorum decision until we see it heartbeating. 
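The arithmetic in o2quo_make_decision() above can be summarised on its own: with an odd number of heartbeating nodes, a strict majority of them must be connected; with an even number, exactly half is only good enough if that half includes the lowest-numbered heartbeating node, which acts as the tie breaker. (The function also returns without fencing when the local node is not in the heartbeat bitmap or is the only node heartbeating.) A stand-alone sketch of just the decision rule follows; o2quo_would_fence() is a hypothetical helper, not part of the patch.

static int o2quo_would_fence(int heartbeating, int connected,
			     int lowest_reachable)
{
	int quorum;

	if (heartbeating & 1) {
		/* odd-sized cluster: need a strict majority connected */
		quorum = (heartbeating + 1) / 2;
		return connected < quorum;
	}

	/* even-sized cluster: half is enough only when the reachable half
	 * includes the lowest-numbered heartbeating node */
	quorum = heartbeating / 2;
	if (connected < quorum)
		return 1;
	return connected == quorum && !lowest_reachable;
}

For example, with four nodes heartbeating the quorum is two: a node connected to only one peer fences itself, and a node connected to exactly two peers survives only if the lowest-numbered heartbeating node is reachable per qs_conn_bm.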
the hold will be droped in + * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if + * it's already heartbeating we we might be dropping a hold that conn_up got. + * */ +void o2quo_conn_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_connected++; + mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); + set_bit(node, qs->qs_conn_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (!test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* we've decided that we won't ever be connecting to the node again. if it's + * still heartbeating we grab a hold that will delay decisions until either the + * node stops heartbeating from hb_down or the caller decides that the node is + * still up and calls still_up */ +void o2quo_conn_err(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + if (test_bit(node, qs->qs_conn_bm)) { + qs->qs_connected--; + mlog_bug_on_msg(qs->qs_connected < 0, + "node %u, connected %d\n", + node, qs->qs_connected); + + clear_bit(node, qs->qs_conn_bm); + } + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +void o2quo_init(void) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock_init(&qs->qs_lock); + INIT_WORK(&qs->qs_work, o2quo_make_decision); +} + +void o2quo_exit(void) +{ + struct o2quo_state *qs = &o2quo_state; + + flush_work_sync(&qs->qs_work); +} diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h new file mode 100644 index 00000000..6649cc6f --- /dev/null +++ b/fs/ocfs2/cluster/quorum.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_QUORUM_H +#define O2CLUSTER_QUORUM_H + +void o2quo_init(void); +void o2quo_exit(void); + +void o2quo_hb_up(u8 node); +void o2quo_hb_down(u8 node); +void o2quo_hb_still_up(u8 node); +void o2quo_conn_up(u8 node); +void o2quo_conn_err(u8 node); +void o2quo_disk_timeout(void); + +#endif /* O2CLUSTER_QUORUM_H */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c new file mode 100644 index 00000000..a4b07730 --- /dev/null +++ b/fs/ocfs2/cluster/sys.c @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.c + * + * OCFS2 cluster sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. 
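The hold counter described in the comments above serializes the quorum decision against nodes that are between states. A hypothetical trace for a single peer whose connection dies while its heartbeat is still seen, matching the functions defined above:

/*
 * o2quo_conn_err(node)     - connection lost; the node is still in qs_hb_bm,
 *                            so a hold is taken and the decision deferred
 * o2quo_hb_still_up(node)  - caller concludes the node really is alive;
 *                            qs_pending is set and conn_err's hold dropped
 * (last hold drains)       - o2quo_clear_hold() sees qs_pending and schedules
 *                            qs_work, which runs o2quo_make_decision()
 * o2quo_hb_down(node)      - if the heartbeat dies instead, this drops any
 *                            remaining hold for the node
 */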
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include + +#include "ocfs2_nodemanager.h" +#include "masklog.h" +#include "sys.h" + + +static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); +} +static struct kobj_attribute attr_version = + __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + +static struct attribute *o2cb_attrs[] = { + &attr_version.attr, + NULL, +}; + +static struct attribute_group o2cb_attr_group = { + .attrs = o2cb_attrs, +}; + +static struct kset *o2cb_kset; + +void o2cb_sys_shutdown(void) +{ + mlog_sys_shutdown(); + kset_unregister(o2cb_kset); +} + +int o2cb_sys_init(void) +{ + int ret; + + o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj); + if (!o2cb_kset) + return -ENOMEM; + + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); + if (ret) + goto error; + + ret = mlog_sys_init(o2cb_kset); + if (ret) + goto error; + return 0; +error: + kset_unregister(o2cb_kset); + return ret; +} diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h new file mode 100644 index 00000000..d66b8ab0 --- /dev/null +++ b/fs/ocfs2/cluster/sys.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.h + * + * Function prototypes for o2cb sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_SYS_H +#define O2CLUSTER_SYS_H + +void o2cb_sys_shutdown(void); +int o2cb_sys_init(void); + +#endif /* O2CLUSTER_SYS_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c new file mode 100644 index 00000000..db5ee4b4 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.c @@ -0,0 +1,2143 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
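Because o2cb_kset is registered under fs_kobj with the name "o2cb", the attribute defined above shows up as /sys/fs/o2cb/interface_revision and reads back O2NM_API_VERSION. A small userspace sketch, assuming sysfs is mounted at /sys and the o2cb module is loaded; main() here is illustrative and not part of the patch.

#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/sys/fs/o2cb/interface_revision", "r");

	if (!f)
		return 1;	/* o2cb not loaded or sysfs not mounted */
	if (fgets(buf, sizeof(buf), f))
		printf("o2cb interface revision: %s", buf);
	fclose(f);
	return 0;
}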
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * ---- + * + * Callers for this were originally written against a very simple synchronus + * API. This implementation reflects those simple callers. Some day I'm sure + * we'll need to move to a more robust posting/callback mechanism. + * + * Transmit calls pass in kernel virtual addresses and block copying this into + * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting + * for a failed socket to timeout. TX callers can also pass in a poniter to an + * 'int' which gets filled with an errno off the wire in response to the + * message they send. + * + * Handlers for unsolicited messages are registered. Each socket has a page + * that incoming data is copied into. First the header, then the data. + * Handlers are called from only one thread with a reference to this per-socket + * page. This page is destroyed after the handler call, so it can't be + * referenced beyond the call. Handlers may block but are discouraged from + * doing so. + * + * Any framing errors (bad magic, large payload lengths) close a connection. + * + * Our sock_container holds the state we associate with a socket. It's current + * framing state is held there as well as the refcounting we do around when it + * is safe to tear down the socket. The socket is only finally torn down from + * the container when the container loses all of its references -- so as long + * as you hold a ref on the container you can trust that the socket is valid + * for use with kernel socket APIs. + * + * Connections are initiated between a pair of nodes when the node with the + * higher node number gets a heartbeat callback which indicates that the lower + * numbered node has started heartbeating. The lower numbered node is passive + * and only accepts the connection if the higher numbered node is heartbeating. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" +#include "quorum.h" + +#include "tcp_internal.h" + +#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" +#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ + &sc->sc_node->nd_ipv4_address, \ + ntohs(sc->sc_node->nd_ipv4_port) + +/* + * In the following two log macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define msglog(hdr, fmt, args...) do { \ + typeof(hdr) __hdr = (hdr); \ + mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ + "key %08x num %u] " fmt, \ + be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ + be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ + be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ + be32_to_cpu(__hdr->msg_num) , ##args); \ +} while (0) + +#define sclog(sc, fmt, args...) 
do { \ + typeof(sc) __sc = (sc); \ + mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ + "pg_off %zu] " fmt, __sc, \ + atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ + ##args); \ +} while (0) + +static DEFINE_RWLOCK(o2net_handler_lock); +static struct rb_root o2net_handler_tree = RB_ROOT; + +static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; + +/* XXX someday we'll need better accounting */ +static struct socket *o2net_listen_sock = NULL; + +/* + * listen work is only queued by the listening socket callbacks on the + * o2net_wq. teardown detaches the callbacks before destroying the workqueue. + * quorum work is queued as sock containers are shutdown.. stop_listening + * tears down all the node's sock containers, preventing future shutdowns + * and queued quroum work, before canceling delayed quorum work and + * destroying the work queue. + */ +static struct workqueue_struct *o2net_wq; +static struct work_struct o2net_listen_work; + +static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; +#define O2NET_HB_PRI 0x1 + +static struct o2net_handshake *o2net_hand; +static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; + +static int o2net_sys_err_translations[O2NET_ERR_MAX] = + {[O2NET_ERR_NONE] = 0, + [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, + [O2NET_ERR_OVERFLOW] = -EOVERFLOW, + [O2NET_ERR_DIED] = -EHOSTDOWN,}; + +/* can't quite avoid *all* internal declarations :/ */ +static void o2net_sc_connect_completed(struct work_struct *work); +static void o2net_rx_until_empty(struct work_struct *work); +static void o2net_shutdown_sc(struct work_struct *work); +static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_sc_send_keep_req(struct work_struct *work); +static void o2net_idle_timer(unsigned long data); +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); + +#ifdef CONFIG_DEBUG_FS +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ + INIT_LIST_HEAD(&nst->st_net_debug_item); + nst->st_task = task; + nst->st_msg_type = msgtype; + nst->st_msg_key = msgkey; + nst->st_node = node; +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ + nst->st_sock_time = ktime_get(); +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ + nst->st_send_time = ktime_get(); +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ + nst->st_status_time = ktime_get(); +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + nst->st_sc = sc; +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ + nst->st_id = msg_id; +} + +static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) +{ + sc->sc_tv_timer = ktime_get(); +} + +static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_data_ready = ktime_get(); +} + +static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_advance_start = ktime_get(); +} + +static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_advance_stop = ktime_get(); +} + +static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_func_start = ktime_get(); +} + +static inline 
void o2net_set_func_stop_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_func_stop = ktime_get(); +} + +#else /* CONFIG_DEBUG_FS */ +# define o2net_init_nst(a, b, c, d, e) +# define o2net_set_nst_sock_time(a) +# define o2net_set_nst_send_time(a) +# define o2net_set_nst_status_time(a) +# define o2net_set_nst_sock_container(a, b) +# define o2net_set_nst_msg_id(a, b) +# define o2net_set_sock_timer(a) +# define o2net_set_data_ready_time(a) +# define o2net_set_advance_start_time(a) +# define o2net_set_advance_stop_time(a) +# define o2net_set_func_start_time(a) +# define o2net_set_func_stop_time(a) +#endif /* CONFIG_DEBUG_FS */ + +#ifdef CONFIG_OCFS2_FS_STATS +static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) +{ + return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); +} + +static void o2net_update_send_stats(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, + ktime_sub(ktime_get(), + nst->st_status_time)); + sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, + ktime_sub(nst->st_status_time, + nst->st_send_time)); + sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, + ktime_sub(nst->st_send_time, + nst->st_sock_time)); + sc->sc_send_count++; +} + +static void o2net_update_recv_stats(struct o2net_sock_container *sc) +{ + sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, + o2net_get_func_run_time(sc)); + sc->sc_recv_count++; +} + +#else + +# define o2net_update_send_stats(a, b) + +# define o2net_update_recv_stats(sc) + +#endif /* CONFIG_OCFS2_FS_STATS */ + +static inline int o2net_reconnect_delay(void) +{ + return o2nm_single_cluster->cl_reconnect_delay_ms; +} + +static inline int o2net_keepalive_delay(void) +{ + return o2nm_single_cluster->cl_keepalive_delay_ms; +} + +static inline int o2net_idle_timeout(void) +{ + return o2nm_single_cluster->cl_idle_timeout_ms; +} + +static inline int o2net_sys_err_to_errno(enum o2net_system_error err) +{ + int trans; + BUG_ON(err >= O2NET_ERR_MAX); + trans = o2net_sys_err_translations[err]; + + /* Just in case we mess up the translation table above */ + BUG_ON(err != O2NET_ERR_NONE && trans == 0); + return trans; +} + +static struct o2net_node * o2net_nn_from_num(u8 node_num) +{ + BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); + return &o2net_nodes[node_num]; +} + +static u8 o2net_num_from_nn(struct o2net_node *nn) +{ + BUG_ON(nn == NULL); + return nn - o2net_nodes; +} + +/* ------------------------------------------------------------ */ + +static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +{ + int ret = 0; + + do { + if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { + ret = -EAGAIN; + break; + } + spin_lock(&nn->nn_lock); + ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); + if (ret == 0) + list_add_tail(&nsw->ns_node_item, + &nn->nn_status_list); + spin_unlock(&nn->nn_lock); + } while (ret == -EAGAIN); + + if (ret == 0) { + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + } + + return ret; +} + +static void o2net_complete_nsw_locked(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + assert_spin_locked(&nn->nn_lock); + + if (!list_empty(&nsw->ns_node_item)) { + list_del_init(&nsw->ns_node_item); + nsw->ns_sys_status = sys_status; + nsw->ns_status = status; + idr_remove(&nn->nn_status_idr, nsw->ns_id); + wake_up(&nsw->ns_wq); + } +} + +static void o2net_complete_nsw(struct 
o2net_node *nn, + struct o2net_status_wait *nsw, + u64 id, enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + if (nsw == NULL) { + if (id > INT_MAX) + goto out; + + nsw = idr_find(&nn->nn_status_idr, id); + if (nsw == NULL) + goto out; + } + + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + +out: + spin_unlock(&nn->nn_lock); + return; +} + +static void o2net_complete_nodes_nsw(struct o2net_node *nn) +{ + struct o2net_status_wait *nsw, *tmp; + unsigned int num_kills = 0; + + assert_spin_locked(&nn->nn_lock); + + list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { + o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); + num_kills++; + } + + mlog(0, "completed %d messages for node %u\n", num_kills, + o2net_num_from_nn(nn)); +} + +static int o2net_nsw_completed(struct o2net_node *nn, + struct o2net_status_wait *nsw) +{ + int completed; + spin_lock(&nn->nn_lock); + completed = list_empty(&nsw->ns_node_item); + spin_unlock(&nn->nn_lock); + return completed; +} + +/* ------------------------------------------------------------ */ + +static void sc_kref_release(struct kref *kref) +{ + struct o2net_sock_container *sc = container_of(kref, + struct o2net_sock_container, sc_kref); + BUG_ON(timer_pending(&sc->sc_idle_timeout)); + + sclog(sc, "releasing\n"); + + if (sc->sc_sock) { + sock_release(sc->sc_sock); + sc->sc_sock = NULL; + } + + o2nm_undepend_item(&sc->sc_node->nd_item); + o2nm_node_put(sc->sc_node); + sc->sc_node = NULL; + + o2net_debug_del_sc(sc); + kfree(sc); +} + +static void sc_put(struct o2net_sock_container *sc) +{ + sclog(sc, "put\n"); + kref_put(&sc->sc_kref, sc_kref_release); +} +static void sc_get(struct o2net_sock_container *sc) +{ + sclog(sc, "get\n"); + kref_get(&sc->sc_kref); +} +static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) +{ + struct o2net_sock_container *sc, *ret = NULL; + struct page *page = NULL; + int status = 0; + + page = alloc_page(GFP_NOFS); + sc = kzalloc(sizeof(*sc), GFP_NOFS); + if (sc == NULL || page == NULL) + goto out; + + kref_init(&sc->sc_kref); + o2nm_node_get(node); + sc->sc_node = node; + + /* pin the node item of the remote node */ + status = o2nm_depend_item(&node->nd_item); + if (status) { + mlog_errno(status); + o2nm_node_put(node); + goto out; + } + INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); + INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); + INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); + INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); + + init_timer(&sc->sc_idle_timeout); + sc->sc_idle_timeout.function = o2net_idle_timer; + sc->sc_idle_timeout.data = (unsigned long)sc; + + sclog(sc, "alloced\n"); + + ret = sc; + sc->sc_page = page; + o2net_debug_add_sc(sc); + sc = NULL; + page = NULL; + +out: + if (page) + __free_page(page); + kfree(sc); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static void o2net_sc_queue_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + sc_get(sc); + if (!queue_work(o2net_wq, work)) + sc_put(sc); +} +static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, + struct delayed_work *work, + int delay) +{ + sc_get(sc); + if (!queue_delayed_work(o2net_wq, work, delay)) + sc_put(sc); +} +static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, + struct delayed_work *work) +{ + if (cancel_delayed_work(work)) + sc_put(sc); +} + +static atomic_t o2net_connected_peers = ATOMIC_INIT(0); + +int 
o2net_num_connected_peers(void) +{ + return atomic_read(&o2net_connected_peers); +} + +static void o2net_set_nn_state(struct o2net_node *nn, + struct o2net_sock_container *sc, + unsigned valid, int err) +{ + int was_valid = nn->nn_sc_valid; + int was_err = nn->nn_persistent_error; + struct o2net_sock_container *old_sc = nn->nn_sc; + + assert_spin_locked(&nn->nn_lock); + + if (old_sc && !sc) + atomic_dec(&o2net_connected_peers); + else if (!old_sc && sc) + atomic_inc(&o2net_connected_peers); + + /* the node num comparison and single connect/accept path should stop + * an non-null sc from being overwritten with another */ + BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); + mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); + mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); + + if (was_valid && !valid && err == 0) + err = -ENOTCONN; + + mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", + o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, + nn->nn_persistent_error, err); + + nn->nn_sc = sc; + nn->nn_sc_valid = valid ? 1 : 0; + nn->nn_persistent_error = err; + + /* mirrors o2net_tx_can_proceed() */ + if (nn->nn_persistent_error || nn->nn_sc_valid) + wake_up(&nn->nn_sc_wq); + + if (!was_err && nn->nn_persistent_error) { + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + } + + if (was_valid && !valid) { + printk(KERN_NOTICE "o2net: no longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + o2net_complete_nodes_nsw(nn); + } + + if (!was_valid && valid) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_connect_expired); + printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "connected to" : "accepted connection from", + SC_NODEF_ARGS(sc)); + } + + /* trigger the connecting worker func as long as we're not valid, + * it will back off if it shouldn't connect. This can be called + * from node config teardown and so needs to be careful about + * the work queue actually being up. */ + if (!valid && o2net_wq) { + unsigned long delay; + /* delay if we're within a RECONNECT_DELAY of the + * last attempt */ + delay = (nn->nn_last_connect_attempt + + msecs_to_jiffies(o2net_reconnect_delay())) + - jiffies; + if (delay > msecs_to_jiffies(o2net_reconnect_delay())) + delay = 0; + mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); + queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); + + /* + * Delay the expired work after idle timeout. + * + * We might have lots of failed connection attempts that run + * through here but we only cancel the connect_expired work when + * a connection attempt succeeds. So only the first enqueue of + * the connect_expired work will do anything. The rest will see + * that it's already queued and do nothing. 
+ */ + delay += msecs_to_jiffies(o2net_idle_timeout()); + queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); + } + + /* keep track of the nn's sc ref for the caller */ + if ((old_sc == NULL) && sc) + sc_get(sc); + if (old_sc && (old_sc != sc)) { + o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); + sc_put(old_sc); + } +} + +/* see o2net_register_callbacks() */ +static void o2net_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + if (sk->sk_user_data) { + struct o2net_sock_container *sc = sk->sk_user_data; + sclog(sc, "data_ready hit\n"); + o2net_set_data_ready_time(sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + ready = sc->sc_data_ready; + } else { + ready = sk->sk_data_ready; + } + read_unlock(&sk->sk_callback_lock); + + ready(sk, bytes); +} + +/* see o2net_register_callbacks() */ +static void o2net_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct o2net_sock_container *sc; + + read_lock(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc == NULL) { + state_change = sk->sk_state_change; + goto out; + } + + sclog(sc, "state_change to %d\n", sk->sk_state); + + state_change = sc->sc_state_change; + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +/* + * we register callbacks so we can queue work on events before calling + * the original callbacks. our callbacks our careful to test user_data + * to discover when they've reaced with o2net_unregister_callbacks(). + */ +static void o2net_register_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + write_lock_bh(&sk->sk_callback_lock); + + /* accepted sockets inherit the old listen socket data ready */ + if (sk->sk_data_ready == o2net_listen_data_ready) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + + BUG_ON(sk->sk_user_data != NULL); + sk->sk_user_data = sc; + sc_get(sc); + + sc->sc_data_ready = sk->sk_data_ready; + sc->sc_state_change = sk->sk_state_change; + sk->sk_data_ready = o2net_data_ready; + sk->sk_state_change = o2net_state_change; + + mutex_init(&sc->sc_send_lock); + + write_unlock_bh(&sk->sk_callback_lock); +} + +static int o2net_unregister_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + int ret = 0; + + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data == sc) { + ret = 1; + sk->sk_user_data = NULL; + sk->sk_data_ready = sc->sc_data_ready; + sk->sk_state_change = sc->sc_state_change; + } + write_unlock_bh(&sk->sk_callback_lock); + + return ret; +} + +/* + * this is a little helper that is called by callers who have seen a problem + * with an sc and want to detach it from the nn if someone already hasn't beat + * them to it. if an error is given then the shutdown will be persistent + * and pending transmits will be canceled. 
+ */ +static void o2net_ensure_shutdown(struct o2net_node *nn, + struct o2net_sock_container *sc, + int err) +{ + spin_lock(&nn->nn_lock); + if (nn->nn_sc == sc) + o2net_set_nn_state(nn, NULL, 0, err); + spin_unlock(&nn->nn_lock); +} + +/* + * This work queue function performs the blocking parts of socket shutdown. A + * few paths lead here. set_nn_state will trigger this callback if it sees an + * sc detached from the nn. state_change will also trigger this callback + * directly when it sees errors. In that case we need to call set_nn_state + * ourselves as state_change couldn't get the nn_lock and call set_nn_state + * itself. + */ +static void o2net_shutdown_sc(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_shutdown_work); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + sclog(sc, "shutting down\n"); + + /* drop the callbacks ref and call shutdown only once */ + if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { + /* we shouldn't flush as we're in the thread, the + * races with pending sc work structs are harmless */ + del_timer_sync(&sc->sc_idle_timeout); + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + sc_put(sc); + kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); + } + + /* not fatal so failed connects before the other guy has our + * heartbeat can be retried */ + o2net_ensure_shutdown(nn, sc, 0); + sc_put(sc); +} + +/* ------------------------------------------------------------ */ + +static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, + u32 key) +{ + int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); + + if (ret == 0) + ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); + + return ret; +} + +static struct o2net_msg_handler * +o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; + struct o2net_msg_handler *nmh, *ret = NULL; + int cmp; + + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + cmp = o2net_handler_cmp(nmh, msg_type, key); + + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = nmh; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +static void o2net_handler_kref_release(struct kref *kref) +{ + struct o2net_msg_handler *nmh; + nmh = container_of(kref, struct o2net_msg_handler, nh_kref); + + kfree(nmh); +} + +static void o2net_handler_put(struct o2net_msg_handler *nmh) +{ + kref_put(&nmh->nh_kref, o2net_handler_kref_release); +} + +/* max_len is protection for the handler func. incoming messages won't + * be given to the handler if their payload is longer than the max. 
*/ +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, + struct list_head *unreg_list) +{ + struct o2net_msg_handler *nmh = NULL; + struct rb_node **p, *parent; + int ret = 0; + + if (max_len > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "max_len for message handler out of range: %u\n", + max_len); + ret = -EINVAL; + goto out; + } + + if (!msg_type) { + mlog(0, "no message type provided: %u, %p\n", msg_type, func); + ret = -EINVAL; + goto out; + + } + if (!func) { + mlog(0, "no message handler provided: %u, %p\n", + msg_type, func); + ret = -EINVAL; + goto out; + } + + nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS); + if (nmh == NULL) { + ret = -ENOMEM; + goto out; + } + + nmh->nh_func = func; + nmh->nh_func_data = data; + nmh->nh_post_func = post_func; + nmh->nh_msg_type = msg_type; + nmh->nh_max_len = max_len; + nmh->nh_key = key; + /* the tree and list get this ref.. they're both removed in + * unregister when this ref is dropped */ + kref_init(&nmh->nh_kref); + INIT_LIST_HEAD(&nmh->nh_unregister_item); + + write_lock(&o2net_handler_lock); + if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&nmh->nh_node, parent, p); + rb_insert_color(&nmh->nh_node, &o2net_handler_tree); + list_add_tail(&nmh->nh_unregister_item, unreg_list); + + mlog(ML_TCP, "registered handler func %p type %u key %08x\n", + func, msg_type, key); + /* we've had some trouble with handlers seemingly vanishing. */ + mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, + &parent) == NULL, + "couldn't find handler we *just* registerd " + "for type %u key %08x\n", msg_type, key); + } + write_unlock(&o2net_handler_lock); + if (ret) + goto out; + +out: + if (ret) + kfree(nmh); + + return ret; +} +EXPORT_SYMBOL_GPL(o2net_register_handler); + +void o2net_unregister_handler_list(struct list_head *list) +{ + struct o2net_msg_handler *nmh, *n; + + write_lock(&o2net_handler_lock); + list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { + mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", + nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); + rb_erase(&nmh->nh_node, &o2net_handler_tree); + list_del_init(&nmh->nh_unregister_item); + kref_put(&nmh->nh_kref, o2net_handler_kref_release); + } + write_unlock(&o2net_handler_lock); +} +EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); + +static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) +{ + struct o2net_msg_handler *nmh; + + read_lock(&o2net_handler_lock); + nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); + if (nmh) + kref_get(&nmh->nh_kref); + read_unlock(&o2net_handler_lock); + + return nmh; +} + +/* ------------------------------------------------------------ */ + +static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) +{ + int ret; + mm_segment_t oldfs; + struct kvec vec = { + .iov_len = len, + .iov_base = data, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&vec, + .msg_flags = MSG_DONTWAIT, + }; + + oldfs = get_fs(); + set_fs(get_ds()); + ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); + set_fs(oldfs); + + return ret; +} + +static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, + size_t veclen, size_t total) +{ + int ret; + mm_segment_t oldfs; + struct msghdr msg = { + .msg_iov = (struct iovec *)vec, + .msg_iovlen = veclen, + }; + + if (sock == NULL) { + ret = -EINVAL; + goto out; + } + + oldfs = get_fs(); + 
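+	/*
+	 * Widen the address-space limit so sock_sendmsg() accepts the
+	 * kernel-space kvec built above; the saved limit in oldfs is
+	 * restored immediately after the call returns.
+	 */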
set_fs(get_ds()); + ret = sock_sendmsg(sock, &msg, total); + set_fs(oldfs); + if (ret != total) { + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, + total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ + goto out; + } + + ret = 0; +out: + if (ret < 0) + mlog(0, "returning error: %d\n", ret); + return ret; +} + +static void o2net_sendpage(struct o2net_sock_container *sc, + void *kmalloced_virt, + size_t size) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + ssize_t ret; + + while (1) { + mutex_lock(&sc->sc_send_lock); + ret = sc->sc_sock->ops->sendpage(sc->sc_sock, + virt_to_page(kmalloced_virt), + (long)kmalloced_virt & ~PAGE_MASK, + size, MSG_DONTWAIT); + mutex_unlock(&sc->sc_send_lock); + if (ret == size) + break; + if (ret == (ssize_t)-EAGAIN) { + mlog(0, "sendpage of size %zu to " SC_NODEF_FMT + " returned EAGAIN\n", size, SC_NODEF_ARGS(sc)); + cond_resched(); + continue; + } + mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT + " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); + o2net_ensure_shutdown(nn, sc, 0); + break; + } +} + +static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key) +{ + memset(msg, 0, sizeof(struct o2net_msg)); + msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); + msg->data_len = cpu_to_be16(data_len); + msg->msg_type = cpu_to_be16(msg_type); + msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); + msg->status = 0; + msg->key = cpu_to_be32(key); +} + +static int o2net_tx_can_proceed(struct o2net_node *nn, + struct o2net_sock_container **sc_ret, + int *error) +{ + int ret = 0; + + spin_lock(&nn->nn_lock); + if (nn->nn_persistent_error) { + ret = 1; + *sc_ret = NULL; + *error = nn->nn_persistent_error; + } else if (nn->nn_sc_valid) { + kref_get(&nn->nn_sc->sc_kref); + + ret = 1; + *sc_ret = nn->nn_sc; + *error = 0; + } + spin_unlock(&nn->nn_lock); + + return ret; +} + +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, + size_t caller_veclen, u8 target_node, int *status) +{ + int ret = 0; + struct o2net_msg *msg = NULL; + size_t veclen, caller_bytes = 0; + struct kvec *vec = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn = o2net_nn_from_num(target_node); + struct o2net_status_wait nsw = { + .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), + }; + struct o2net_send_tracking nst; + + o2net_init_nst(&nst, msg_type, key, current, target_node); + + if (o2net_wq == NULL) { + mlog(0, "attempt to tx without o2netd running\n"); + ret = -ESRCH; + goto out; + } + + if (caller_veclen == 0) { + mlog(0, "bad kvec array length\n"); + ret = -EINVAL; + goto out; + } + + caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); + if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "total payload len %zu too large\n", caller_bytes); + ret = -EINVAL; + goto out; + } + + if (target_node == o2nm_this_node()) { + ret = -ELOOP; + goto out; + } + + o2net_debug_add_nst(&nst); + + o2net_set_nst_sock_time(&nst); + + wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); + if (ret) + goto out; + + o2net_set_nst_sock_container(&nst, sc); + + veclen = caller_veclen + 1; + vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + if (vec == NULL) { + mlog(0, "failed to %zu element kvec!\n", veclen); + ret = -ENOMEM; + goto out; + } + + msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); + if (!msg) { + mlog(0, "failed to allocate a o2net_msg!\n"); + ret = -ENOMEM; + goto out; + } + + o2net_init_msg(msg, caller_bytes, msg_type, key); + + 
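+	/*
+	 * Wire format assembled below: vec[0] carries the o2net_msg header,
+	 * vec[1..caller_veclen] carry the caller's payload.  msg_num is
+	 * filled in from the status-wait id so the remote node's status
+	 * reply can be matched back to this sender in o2net_complete_nsw().
+	 */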
vec[0].iov_len = sizeof(struct o2net_msg); + vec[0].iov_base = msg; + memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); + + ret = o2net_prep_nsw(nn, &nsw); + if (ret) + goto out; + + msg->msg_num = cpu_to_be32(nsw.ns_id); + o2net_set_nst_msg_id(&nst, nsw.ns_id); + + o2net_set_nst_send_time(&nst); + + /* finally, convert the message header to network byte-order + * and send */ + mutex_lock(&sc->sc_send_lock); + ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, + sizeof(struct o2net_msg) + caller_bytes); + mutex_unlock(&sc->sc_send_lock); + msglog(msg, "sending returned %d\n", ret); + if (ret < 0) { + mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); + goto out; + } + + /* wait on other node's handler */ + o2net_set_nst_status_time(&nst); + wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + + o2net_update_send_stats(&nst, sc); + + /* Note that we avoid overwriting the callers status return + * variable if a system error was reported on the other + * side. Callers beware. */ + ret = o2net_sys_err_to_errno(nsw.ns_sys_status); + if (status && !ret) + *status = nsw.ns_status; + + mlog(0, "woken, returning system status %d, user status %d\n", + ret, nsw.ns_status); +out: + o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ + if (sc) + sc_put(sc); + if (vec) + kfree(vec); + if (msg) + kfree(msg); + o2net_complete_nsw(nn, &nsw, 0, 0, 0); + return ret; +} +EXPORT_SYMBOL_GPL(o2net_send_message_vec); + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + return o2net_send_message_vec(msg_type, key, &vec, 1, + target_node, status); +} +EXPORT_SYMBOL_GPL(o2net_send_message); + +static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr, + enum o2net_system_error syserr, int err) +{ + struct kvec vec = { + .iov_base = hdr, + .iov_len = sizeof(struct o2net_msg), + }; + + BUG_ON(syserr >= O2NET_ERR_MAX); + + /* leave other fields intact from the incoming message, msg_num + * in particular */ + hdr->sys_status = cpu_to_be32(syserr); + hdr->status = cpu_to_be32(err); + hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic + hdr->data_len = 0; + + msglog(hdr, "about to send status magic %d\n", err); + /* hdr has been in host byteorder this whole time */ + return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg)); +} + +/* this returns -errno if the header was unknown or too large, etc. 
+ * after this is called the buffer us reused for the next message */ +static int o2net_process_message(struct o2net_sock_container *sc, + struct o2net_msg *hdr) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + int ret = 0, handler_status; + enum o2net_system_error syserr; + struct o2net_msg_handler *nmh = NULL; + void *ret_data = NULL; + + msglog(hdr, "processing message\n"); + + o2net_sc_postpone_idle(sc); + + switch(be16_to_cpu(hdr->magic)) { + case O2NET_MSG_STATUS_MAGIC: + /* special type for returning message status */ + o2net_complete_nsw(nn, NULL, + be32_to_cpu(hdr->msg_num), + be32_to_cpu(hdr->sys_status), + be32_to_cpu(hdr->status)); + goto out; + case O2NET_MSG_KEEP_REQ_MAGIC: + o2net_sendpage(sc, o2net_keep_resp, + sizeof(*o2net_keep_resp)); + goto out; + case O2NET_MSG_KEEP_RESP_MAGIC: + goto out; + case O2NET_MSG_MAGIC: + break; + default: + msglog(hdr, "bad magic\n"); + ret = -EINVAL; + goto out; + break; + } + + /* find a handler for it */ + handler_status = 0; + nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type), + be32_to_cpu(hdr->key)); + if (!nmh) { + mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", + be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); + syserr = O2NET_ERR_NO_HNDLR; + goto out_respond; + } + + syserr = O2NET_ERR_NONE; + + if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) + syserr = O2NET_ERR_OVERFLOW; + + if (syserr != O2NET_ERR_NONE) + goto out_respond; + + o2net_set_func_start_time(sc); + sc->sc_msg_key = be32_to_cpu(hdr->key); + sc->sc_msg_type = be16_to_cpu(hdr->msg_type); + handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + + be16_to_cpu(hdr->data_len), + nmh->nh_func_data, &ret_data); + o2net_set_func_stop_time(sc); + + o2net_update_recv_stats(sc); + +out_respond: + /* this destroys the hdr, so don't use it after this */ + mutex_lock(&sc->sc_send_lock); + ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, + handler_status); + mutex_unlock(&sc->sc_send_lock); + hdr = NULL; + mlog(0, "sending handler status %d, syserr %d returned %d\n", + handler_status, syserr, ret); + + if (nmh) { + BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); + if (nmh->nh_post_func) + (nmh->nh_post_func)(handler_status, nmh->nh_func_data, + ret_data); + } + +out: + if (nmh) + o2net_handler_put(nmh); + return ret; +} + +static int o2net_check_handshake(struct o2net_sock_container *sc) +{ + struct o2net_handshake *hand = page_address(sc->sc_page); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { + mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " + "version %llu but %llu is required, disconnecting\n", + SC_NODEF_ARGS(sc), + (unsigned long long)be64_to_cpu(hand->protocol_version), + O2NET_PROTOCOL_VERSION); + + /* don't bother reconnecting if its the wrong version. */ + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + /* + * Ensure timeouts are consistent with other nodes, otherwise + * we can end up with one node thinking that the other must be down, + * but isn't. This can ultimately cause corruption. + */ + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != + o2net_idle_timeout()) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " + "%u ms, but we use %u ms locally. 
disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout()); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != + o2net_keepalive_delay()) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay()); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != + O2HB_MAX_WRITE_TIMEOUT_MS) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + sc->sc_handshake_ok = 1; + + spin_lock(&nn->nn_lock); + /* set valid and queue the idle timers only if it hasn't been + * shut down already */ + if (nn->nn_sc == sc) { + o2net_sc_reset_idle_timer(sc); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, sc, 1, 0); + } + spin_unlock(&nn->nn_lock); + + /* shift everything up as though it wasn't there */ + sc->sc_page_off -= sizeof(struct o2net_handshake); + if (sc->sc_page_off) + memmove(hand, hand + 1, sc->sc_page_off); + + return 0; +} + +/* this demuxes the queued rx bytes into header or payload bits and calls + * handlers as each full message is read off the socket. it returns -error, + * == 0 eof, or > 0 for progress made.*/ +static int o2net_advance_rx(struct o2net_sock_container *sc) +{ + struct o2net_msg *hdr; + int ret = 0; + void *data; + size_t datalen; + + sclog(sc, "receiving\n"); + o2net_set_advance_start_time(sc); + + if (unlikely(sc->sc_handshake_ok == 0)) { + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + } + + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { + o2net_check_handshake(sc); + if (unlikely(sc->sc_handshake_ok == 0)) + ret = -EPROTO; + } + goto out; + } + + /* do we need more header? */ + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_msg) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) { + sc->sc_page_off += ret; + /* only swab incoming here.. we can + * only get here once as we cross from + * being under to over */ + if (sc->sc_page_off == sizeof(struct o2net_msg)) { + hdr = page_address(sc->sc_page); + if (be16_to_cpu(hdr->data_len) > + O2NET_MAX_PAYLOAD_BYTES) + ret = -EOVERFLOW; + } + } + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + /* oof, still don't have a header */ + goto out; + } + + /* this was swabbed above when we first read it */ + hdr = page_address(sc->sc_page); + + msglog(hdr, "at page_off %zu\n", sc->sc_page_off); + + /* do we need more payload? 
*/ + if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) { + /* need more payload */ + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) - + sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) { + /* we can only get here once, the first time we read + * the payload.. so set ret to progress if the handler + * works out. after calling this the message is toast */ + ret = o2net_process_message(sc, hdr); + if (ret == 0) + ret = 1; + sc->sc_page_off = 0; + } + +out: + sclog(sc, "ret = %d\n", ret); + o2net_set_advance_stop_time(sc); + return ret; +} + +/* this work func is triggerd by data ready. it reads until it can read no + * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing + * our work the work struct will be marked and we'll be called again. */ +static void o2net_rx_until_empty(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, sc_rx_work); + int ret; + + do { + ret = o2net_advance_rx(sc); + } while (ret > 0); + + if (ret <= 0 && ret != -EAGAIN) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + sclog(sc, "saw error %d, closing\n", ret); + /* not permanent so read failed handshake can retry */ + o2net_ensure_shutdown(nn, sc, 0); + } + + sc_put(sc); +} + +static int o2net_set_nodelay(struct socket *sock) +{ + int ret, val = 1; + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Dear unsuspecting programmer, + * + * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level + * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will + * silently turn into SO_DEBUG. + * + * Yours, + * Keeper of hilariously fragile interfaces. + */ + ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); + + set_fs(oldfs); + return ret; +} + +static void o2net_initialize_handshake(void) +{ + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( + o2net_keepalive_delay()); + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( + o2net_reconnect_delay()); +} + +/* ------------------------------------------------------------ */ + +/* called when a connect completes and after a sock is accepted. the + * rx path will see the response and mark the sc valid */ +static void o2net_sc_connect_completed(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_connect_work); + + mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", + (unsigned long long)O2NET_PROTOCOL_VERSION, + (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + + o2net_initialize_handshake(); + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + sc_put(sc); +} + +/* this is called as a work_struct func. */ +static void o2net_sc_send_keep_req(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_keepalive_work.work); + + o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); + sc_put(sc); +} + +/* socket shutdown does a del_timer_sync against this as it tears down. 
+ * we can't start this timer until we've got to the point in sc buildup + * where shutdown is going to be involved */ +static void o2net_idle_timer(unsigned long data) +{ + struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + +#ifdef CONFIG_DEBUG_FS + ktime_t now = ktime_get(); +#endif + + printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " + "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); + +#ifdef CONFIG_DEBUG_FS + mlog(ML_NOTICE, "Here are some times that might help debug the " + "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " + "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", + (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + sc->sc_msg_key, sc->sc_msg_type, + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop)); +#endif + + /* + * Initialize the nn_timeout so that the next connection attempt + * will continue in o2net_start_connect. + */ + atomic_set(&nn->nn_timeout, 1); + + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); +} + +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) +{ + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, + msecs_to_jiffies(o2net_keepalive_delay())); + o2net_set_sock_timer(sc); + mod_timer(&sc->sc_idle_timeout, + jiffies + msecs_to_jiffies(o2net_idle_timeout())); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + /* Only push out an existing timer */ + if (timer_pending(&sc->sc_idle_timeout)) + o2net_sc_reset_idle_timer(sc); +} + +/* this work func is kicked whenever a path sets the nn state which doesn't + * have valid set. This includes seeing hb come up, losing a connection, + * having a connect attempt fail, etc. This centralizes the logic which decides + * if a connect attempt should be made or if we should give up and all future + * transmit attempts should fail */ +static void o2net_start_connect(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_connect_work.work); + struct o2net_sock_container *sc = NULL; + struct o2nm_node *node = NULL, *mynode = NULL; + struct socket *sock = NULL; + struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; + int ret = 0, stop; + unsigned int timeout; + + /* if we're greater we initiate tx, otherwise we accept */ + if (o2nm_this_node() <= o2net_num_from_nn(nn)) + goto out; + + /* watch for racing with tearing a node down */ + node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); + if (node == NULL) { + ret = 0; + goto out; + } + + mynode = o2nm_get_node_by_num(o2nm_this_node()); + if (mynode == NULL) { + ret = 0; + goto out; + } + + spin_lock(&nn->nn_lock); + /* + * see if we already have one pending or have given up. + * For nn_timeout, it is set when we close the connection + * because of the idle time out. So it means that we have + * at least connected to that node successfully once, + * now try to connect to it again. 
+ */ + timeout = atomic_read(&nn->nn_timeout); + stop = (nn->nn_sc || + (nn->nn_persistent_error && + (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); + spin_unlock(&nn->nn_lock); + if (stop) + goto out; + + nn->nn_last_connect_attempt = jiffies; + + sc = sc_alloc(node); + if (sc == NULL) { + mlog(0, "couldn't allocate sc\n"); + ret = -ENOMEM; + goto out; + } + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(0, "can't create socket: %d\n", ret); + goto out; + } + sc->sc_sock = sock; /* freed by sc_kref_release */ + + sock->sk->sk_allocation = GFP_ATOMIC; + + myaddr.sin_family = AF_INET; + myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; + myaddr.sin_port = htons(0); /* any port */ + + ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + sizeof(myaddr)); + if (ret) { + mlog(ML_ERROR, "bind failed with %d at address %pI4\n", + ret, &mynode->nd_ipv4_address); + goto out; + } + + ret = o2net_set_nodelay(sc->sc_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + o2net_register_callbacks(sc->sc_sock->sk, sc); + + spin_lock(&nn->nn_lock); + /* handshake completion will set nn->nn_sc_valid */ + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + remoteaddr.sin_family = AF_INET; + remoteaddr.sin_addr.s_addr = node->nd_ipv4_address; + remoteaddr.sin_port = node->nd_ipv4_port; + + ret = sc->sc_sock->ops->connect(sc->sc_sock, + (struct sockaddr *)&remoteaddr, + sizeof(remoteaddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) + ret = 0; + +out: + if (ret) { + mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " + "with errno %d\n", SC_NODEF_ARGS(sc), ret); + /* 0 err so that another will be queued and attempted + * from set_nn_state */ + if (sc) + o2net_ensure_shutdown(nn, sc, 0); + } + if (sc) + sc_put(sc); + if (node) + o2nm_node_put(node); + if (mynode) + o2nm_node_put(mynode); + + return; +} + +static void o2net_connect_expired(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_connect_expired.work); + + spin_lock(&nn->nn_lock); + if (!nn->nn_sc_valid) { + mlog(ML_ERROR, "no connection established with node %u after " + "%u.%u seconds, giving up and returning errors.\n", + o2net_num_from_nn(nn), + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); + + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + } + spin_unlock(&nn->nn_lock); +} + +static void o2net_still_up(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_still_up.work); + + o2quo_hb_still_up(o2net_num_from_nn(nn)); +} + +/* ------------------------------------------------------------ */ + +void o2net_disconnect_node(struct o2nm_node *node) +{ + struct o2net_node *nn = o2net_nn_from_num(node->nd_num); + + /* don't reconnect until it's heartbeating again */ + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + spin_unlock(&nn->nn_lock); + + if (o2net_wq) { + cancel_delayed_work(&nn->nn_connect_expired); + cancel_delayed_work(&nn->nn_connect_work); + cancel_delayed_work(&nn->nn_still_up); + flush_workqueue(o2net_wq); + } +} + +static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, + void *data) +{ + o2quo_hb_down(node_num); + + if (!node) + return; + + if (node_num != o2nm_this_node()) + o2net_disconnect_node(node); + + BUG_ON(atomic_read(&o2net_connected_peers) < 0); +} + +static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, + void 
*data) +{ + struct o2net_node *nn = o2net_nn_from_num(node_num); + + o2quo_hb_up(node_num); + + BUG_ON(!node); + + /* ensure an immediate connect attempt */ + nn->nn_last_connect_attempt = jiffies - + (msecs_to_jiffies(o2net_reconnect_delay()) + 1); + + if (node_num != o2nm_this_node()) { + /* believe it or not, accept and node hearbeating testing + * can succeed for this node before we got here.. so + * only use set_nn_state to clear the persistent error + * if that hasn't already happened */ + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + if (nn->nn_persistent_error) + o2net_set_nn_state(nn, NULL, 0, 0); + spin_unlock(&nn->nn_lock); + } +} + +void o2net_unregister_hb_callbacks(void) +{ + o2hb_unregister_callback(NULL, &o2net_hb_up); + o2hb_unregister_callback(NULL, &o2net_hb_down); +} + +int o2net_register_hb_callbacks(void) +{ + int ret; + + o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB, + o2net_hb_node_down_cb, NULL, O2NET_HB_PRI); + o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, + o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); + + ret = o2hb_register_callback(NULL, &o2net_hb_up); + if (ret == 0) + ret = o2hb_register_callback(NULL, &o2net_hb_down); + + if (ret) + o2net_unregister_hb_callbacks(); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static int o2net_accept_one(struct socket *sock) +{ + int ret, slen; + struct sockaddr_in sin; + struct socket *new_sock = NULL; + struct o2nm_node *node = NULL; + struct o2nm_node *local_node = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn; + + BUG_ON(sock == NULL); + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; + + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + new_sock->sk->sk_allocation = GFP_ATOMIC; + + ret = o2net_set_nodelay(new_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + slen = sizeof(sin); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, + &slen, 1); + if (ret < 0) + goto out; + + node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); + if (node == NULL) { + mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + if (o2nm_this_node() >= node->nd_num) { + local_node = o2nm_get_node_by_num(o2nm_this_node()); + mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" + "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + /* this happens all the time when the other node sees our heartbeat + * and tries to connect before we see their heartbeat */ + if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { + mlog(ML_CONN, "attempt to connect from node '%s' at " + "%pI4:%d but it isn't heartbeating\n", + node->nd_name, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + nn = o2net_nn_from_num(node->nd_num); + + spin_lock(&nn->nn_lock); + if (nn->nn_sc) + ret = -EBUSY; + else + ret = 0; + spin_unlock(&nn->nn_lock); + if (ret) { + mlog(ML_NOTICE, "attempt to connect from node '%s' at " + "%pI4:%d but it already has an open connection\n", + node->nd_name, 
&sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + goto out; + } + + sc = sc_alloc(node); + if (sc == NULL) { + ret = -ENOMEM; + goto out; + } + + sc->sc_sock = new_sock; + new_sock = NULL; + + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + o2net_register_callbacks(sc->sc_sock->sk, sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + + o2net_initialize_handshake(); + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + +out: + if (new_sock) + sock_release(new_sock); + if (node) + o2nm_node_put(node); + if (local_node) + o2nm_node_put(local_node); + if (sc) + sc_put(sc); + return ret; +} + +static void o2net_accept_many(struct work_struct *work) +{ + struct socket *sock = o2net_listen_sock; + while (o2net_accept_one(sock) == 0) + cond_resched(); +} + +static void o2net_listen_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (ready == NULL) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the acceptor has set up their + * data_ready.. we only want to queue listen work for our listening + * socket */ + if (sk->sk_state == TCP_LISTEN) { + mlog(ML_TCP, "bytes: %d\n", bytes); + queue_work(o2net_wq, &o2net_listen_work); + } + +out: + read_unlock(&sk->sk_callback_lock); + ready(sk, bytes); +} + +static int o2net_open_listening_sock(__be32 addr, __be16 port) +{ + struct socket *sock = NULL; + int ret; + struct sockaddr_in sin = { + .sin_family = PF_INET, + .sin_addr = { .s_addr = addr }, + .sin_port = port, + }; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); + goto out; + } + + sock->sk->sk_allocation = GFP_ATOMIC; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = o2net_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + o2net_listen_sock = sock; + INIT_WORK(&o2net_listen_work, o2net_accept_many); + + sock->sk->sk_reuse = 1; + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) { + mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " + "ret=%d\n", &addr, ntohs(port), ret); + goto out; + } + + ret = sock->ops->listen(sock, 64); + if (ret < 0) { + mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", + &addr, ntohs(port), ret); + } + +out: + if (ret) { + o2net_listen_sock = NULL; + if (sock) + sock_release(sock); + } + return ret; +} + +/* + * called from node manager when we should bring up our network listening + * socket. node manager handles all the serialization to only call this + * once and to match it with o2net_stop_listening(). note, + * o2nm_this_node() doesn't work yet as we're being called while it + * is being set up. + */ +int o2net_start_listening(struct o2nm_node *node) +{ + int ret = 0; + + BUG_ON(o2net_wq != NULL); + BUG_ON(o2net_listen_sock != NULL); + + mlog(ML_KTHREAD, "starting o2net thread...\n"); + o2net_wq = create_singlethread_workqueue("o2net"); + if (o2net_wq == NULL) { + mlog(ML_ERROR, "unable to launch o2net thread\n"); + return -ENOMEM; /* ? 
*/ + } + + ret = o2net_open_listening_sock(node->nd_ipv4_address, + node->nd_ipv4_port); + if (ret) { + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + } else + o2quo_conn_up(node->nd_num); + + return ret; +} + +/* again, o2nm_this_node() doesn't work here as we're involved in + * tearing it down */ +void o2net_stop_listening(struct o2nm_node *node) +{ + struct socket *sock = o2net_listen_sock; + size_t i; + + BUG_ON(o2net_wq == NULL); + BUG_ON(o2net_listen_sock == NULL); + + /* stop the listening socket from generating work */ + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = sock->sk->sk_user_data; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2nm_node *node = o2nm_get_node_by_num(i); + if (node) { + o2net_disconnect_node(node); + o2nm_node_put(node); + } + } + + /* finish all work and tear down the work queue */ + mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + + sock_release(o2net_listen_sock); + o2net_listen_sock = NULL; + + o2quo_conn_err(node->nd_num); +} + +/* ------------------------------------------------------------ */ + +int o2net_init(void) +{ + unsigned long i; + + o2quo_init(); + + if (o2net_debugfs_init()) + return -ENOMEM; + + o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); + o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + return -ENOMEM; + } + + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); + o2net_hand->connector_id = cpu_to_be64(1); + + o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); + o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2net_node *nn = o2net_nn_from_num(i); + + atomic_set(&nn->nn_timeout, 0); + spin_lock_init(&nn->nn_lock); + INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); + INIT_DELAYED_WORK(&nn->nn_connect_expired, + o2net_connect_expired); + INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up); + /* until we see hb from a node we'll return einval */ + nn->nn_persistent_error = -ENOTCONN; + init_waitqueue_head(&nn->nn_sc_wq); + idr_init(&nn->nn_status_idr); + INIT_LIST_HEAD(&nn->nn_status_list); + } + + return 0; +} + +void o2net_exit(void) +{ + o2quo_exit(); + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + o2net_debugfs_exit(); +} diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h new file mode 100644 index 00000000..fd6179eb --- /dev/null +++ b/fs/ocfs2/cluster/tcp.h @@ -0,0 +1,152 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * tcp.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_TCP_H +#define O2CLUSTER_TCP_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include +#endif +#include +#include + +struct o2net_msg +{ + __be16 magic; + __be16 data_len; + __be16 msg_type; + __be16 pad1; + __be32 sys_status; + __be32 status; + __be32 key; + __be32 msg_num; + __u8 buf[0]; +}; + +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +typedef void (o2net_post_msg_handler_func)(int status, void *data, + void *ret_data); + +#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) + +/* same as hb delay, we're waiting for another node to recognize our hb */ +#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 + +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 + + +/* TODO: figure this out.... */ +static inline int o2net_link_down(int err, struct socket *sock) +{ + if (sock) { + if (sock->sk->sk_state != TCP_ESTABLISHED && + sock->sk->sk_state != TCP_CLOSE_WAIT) + return 1; + } + + if (err >= 0) + return 0; + switch (err) { + /* ????????????????????????? */ + case -ERESTARTSYS: + case -EBADF: + /* When the server has died, an ICMP port unreachable + * message prompts ECONNREFUSED. */ + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + return 1; + } + return 0; +} + +enum { + O2NET_DRIVER_UNINITED, + O2NET_DRIVER_READY, +}; + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status); +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, + size_t veclen, u8 target_node, int *status); + +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, + struct list_head *unreg_list); +void o2net_unregister_handler_list(struct list_head *list); + +struct o2nm_node; +int o2net_register_hb_callbacks(void); +void o2net_unregister_hb_callbacks(void); +int o2net_start_listening(struct o2nm_node *node); +void o2net_stop_listening(struct o2nm_node *node); +void o2net_disconnect_node(struct o2nm_node *node); +int o2net_num_connected_peers(void); + +int o2net_init(void); +void o2net_exit(void); + +struct o2net_send_tracking; +struct o2net_sock_container; + +#ifdef CONFIG_DEBUG_FS +int o2net_debugfs_init(void); +void o2net_debugfs_exit(void); +void o2net_debug_add_nst(struct o2net_send_tracking *nst); +void o2net_debug_del_nst(struct o2net_send_tracking *nst); +void o2net_debug_add_sc(struct o2net_sock_container *sc); +void o2net_debug_del_sc(struct o2net_sock_container *sc); +#else +static inline int o2net_debugfs_init(void) +{ + return 0; +} +static inline void o2net_debugfs_exit(void) +{ +} +static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ +} +static inline void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +#endif /* O2CLUSTER_TCP_H */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h new file mode 100644 index 00000000..4cbcb657 --- /dev/null +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -0,0 +1,242 @@ +/* -*- mode: c; 
c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_TCP_INTERNAL_H +#define O2CLUSTER_TCP_INTERNAL_H + +#define O2NET_MSG_MAGIC ((u16)0xfa55) +#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56) +#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) +#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) + +/* we're delaying our quorum decision so that heartbeat will have timed + * out truly dead nodes by the time we come around to making decisions + * on their number */ +#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) + +/* + * This version number represents quite a lot, unfortunately. It not + * only represents the raw network message protocol on the wire but also + * locking semantics of the file system using the protocol. It should + * be somewhere else, I'm sure, but right now it isn't. + * + * With version 11, we separate out the filesystem locking portion. The + * filesystem now has a major.minor version it negotiates. Version 11 + * introduces this negotiation to the o2dlm protocol, and as such the + * version here in tcp_internal.h should not need to be bumped for + * filesystem locking changes. + * + * New in version 11 + * - Negotiation of filesystem locking in the dlm join. + * + * New in version 10: + * - Meta/data locks combined + * + * New in version 9: + * - All votes removed + * + * New in version 8: + * - Replace delete inode votes with a cluster lock + * + * New in version 7: + * - DLM join domain includes the live nodemap + * + * New in version 6: + * - DLM lockres remote refcount fixes. + * + * New in version 5: + * - Network timeout checking protocol + * + * New in version 4: + * - Remove i_generation from lock names for better stat performance. + * + * New in version 3: + * - Replace dentry votes with a cluster lock + * + * New in version 2: + * - full 64 bit i_size in the metadata lock lvbs + * - introduction of "rw" lock and pushing meta/data locking down + */ +#define O2NET_PROTOCOL_VERSION 11ULL +struct o2net_handshake { + __be64 protocol_version; + __be64 connector_id; + __be32 o2hb_heartbeat_timeout_ms; + __be32 o2net_idle_timeout_ms; + __be32 o2net_keepalive_delay_ms; + __be32 o2net_reconnect_delay_ms; +}; + +struct o2net_node { + /* this is never called from int/bh */ + spinlock_t nn_lock; + + /* set the moment an sc is allocated and a connect is started */ + struct o2net_sock_container *nn_sc; + /* _valid is only set after the handshake passes and tx can happen */ + unsigned nn_sc_valid:1; + /* if this is set tx just returns it */ + int nn_persistent_error; + /* It is only set to 1 after the idle time out. */ + atomic_t nn_timeout; + + /* threads waiting for an sc to arrive wait on the wq for generation + * to increase. 
it is increased when a connecting socket succeeds + * or fails or when an accepted socket is attached. */ + wait_queue_head_t nn_sc_wq; + + struct idr nn_status_idr; + struct list_head nn_status_list; + + /* connects are attempted from when heartbeat comes up until either hb + * goes down, the node is unconfigured, no connect attempts succeed + * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work + * is queued from set_nn_state both from hb up and from itself if a + * connect attempt fails and so can be self-arming. shutdown is + * careful to first mark the nn such that no connects will be attempted + * before canceling delayed connect work and flushing the queue. */ + struct delayed_work nn_connect_work; + unsigned long nn_last_connect_attempt; + + /* this is queued as nodes come up and is canceled when a connection is + * established. this expiring gives up on the node and errors out + * transmits */ + struct delayed_work nn_connect_expired; + + /* after we give up on a socket we wait a while before deciding + * that it is still heartbeating and that we should do some + * quorum work */ + struct delayed_work nn_still_up; +}; + +struct o2net_sock_container { + struct kref sc_kref; + /* the next two are valid for the life time of the sc */ + struct socket *sc_sock; + struct o2nm_node *sc_node; + + /* all of these sc work structs hold refs on the sc while they are + * queued. they should not be able to ref a freed sc. the teardown + * race is with o2net_wq destruction in o2net_stop_listening() */ + + /* rx and connect work are generated from socket callbacks. sc + * shutdown removes the callbacks and then flushes the work queue */ + struct work_struct sc_rx_work; + struct work_struct sc_connect_work; + /* shutdown work is triggered in two ways. the simple way is + * for a code path calls ensure_shutdown which gets a lock, removes + * the sc from the nn, and queues the work. in this case the + * work is single-shot. the work is also queued from a sock + * callback, though, and in this case the work will find the sc + * still on the nn and will call ensure_shutdown itself.. this + * ends up triggering the shutdown work again, though nothing + * will be done in that second iteration. so work queue teardown + * has to be careful to remove the sc from the nn before waiting + * on the work queue so that the shutdown work doesn't remove the + * sc and rearm itself. 
+ */ + struct work_struct sc_shutdown_work; + + struct timer_list sc_idle_timeout; + struct delayed_work sc_keepalive_work; + + unsigned sc_handshake_ok:1; + + struct page *sc_page; + size_t sc_page_off; + + /* original handlers for the sockets */ + void (*sc_state_change)(struct sock *sk); + void (*sc_data_ready)(struct sock *sk, int bytes); + + u32 sc_msg_key; + u16 sc_msg_type; + +#ifdef CONFIG_DEBUG_FS + struct list_head sc_net_debug_item; + ktime_t sc_tv_timer; + ktime_t sc_tv_data_ready; + ktime_t sc_tv_advance_start; + ktime_t sc_tv_advance_stop; + ktime_t sc_tv_func_start; + ktime_t sc_tv_func_stop; +#endif +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t sc_tv_acquiry_total; + ktime_t sc_tv_send_total; + ktime_t sc_tv_status_total; + u32 sc_send_count; + u32 sc_recv_count; + ktime_t sc_tv_process_total; +#endif + struct mutex sc_send_lock; +}; + +struct o2net_msg_handler { + struct rb_node nh_node; + u32 nh_max_len; + u32 nh_msg_type; + u32 nh_key; + o2net_msg_handler_func *nh_func; + o2net_msg_handler_func *nh_func_data; + o2net_post_msg_handler_func + *nh_post_func; + struct kref nh_kref; + struct list_head nh_unregister_item; +}; + +enum o2net_system_error { + O2NET_ERR_NONE = 0, + O2NET_ERR_NO_HNDLR, + O2NET_ERR_OVERFLOW, + O2NET_ERR_DIED, + O2NET_ERR_MAX +}; + +struct o2net_status_wait { + enum o2net_system_error ns_sys_status; + s32 ns_status; + int ns_id; + wait_queue_head_t ns_wq; + struct list_head ns_node_item; +}; + +#ifdef CONFIG_DEBUG_FS +/* just for state dumps */ +struct o2net_send_tracking { + struct list_head st_net_debug_item; + struct task_struct *st_task; + struct o2net_sock_container *st_sc; + u32 st_id; + u32 st_msg_type; + u32 st_msg_key; + u8 st_node; + ktime_t st_sock_time; + ktime_t st_send_time; + ktime_t st_status_time; +}; +#else +struct o2net_send_tracking { + u32 dummy; +}; +#endif /* CONFIG_DEBUG_FS */ + +#endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c new file mode 100644 index 00000000..a56eee6a --- /dev/null +++ b/fs/ocfs2/cluster/ver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include + +#include "ver.h" + +#define CLUSTER_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION + +void cluster_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h new file mode 100644 index 00000000..32554c33 --- /dev/null +++ b/fs/ocfs2/cluster/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_VER_H +#define O2CLUSTER_VER_H + +void cluster_print_version(void); + +#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 00000000..e5ba3481 --- /dev/null +++ b/fs/ocfs2/dcache.c @@ -0,0 +1,537 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.c + * + * dentry cache handling code + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "super.h" +#include "ocfs2_trace.h" + +void ocfs2_dentry_attach_gen(struct dentry *dentry) +{ + unsigned long gen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + BUG_ON(dentry->d_inode); + dentry->d_fsdata = (void *)gen; +} + + +static int ocfs2_dentry_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + int ret = 0; /* if all else fails, just return false */ + struct ocfs2_super *osb; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + osb = OCFS2_SB(dentry->d_sb); + + trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, + dentry->d_name.name); + + /* For a negative dentry - + * check the generation number of the parent and compare with the + * one stored in the inode. + */ + if (inode == NULL) { + unsigned long gen = (unsigned long) dentry->d_fsdata; + unsigned long pgen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + + trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, + dentry->d_name.name, + pgen, gen); + if (gen != pgen) + goto bail; + goto valid; + } + + BUG_ON(!osb); + + if (inode == osb->root_inode || is_bad_inode(inode)) + goto bail; + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + trace_ocfs2_dentry_revalidate_delete( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* + * We don't need a cluster lock to test this because once an + * inode nlink hits zero, it never goes back. + */ + if (inode->i_nlink == 0) { + trace_ocfs2_dentry_revalidate_orphaned( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; + } + + /* + * If the last lookup failed to create dentry lock, let us + * redo it. + */ + if (!dentry->d_fsdata) { + trace_ocfs2_dentry_revalidate_nofsdata( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + +valid: + ret = 1; + +bail: + trace_ocfs2_dentry_revalidate_ret(ret); + return ret; +} + +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + int skip_unhashed) +{ + struct inode *parent; + + /* + * ocfs2_lookup() does a d_splice_alias() _before_ attaching + * to the lock data, so we skip those here, otherwise + * ocfs2_dentry_attach_lock() will get its original dentry + * back. + */ + if (!dentry->d_fsdata) + return 0; + + if (!dentry->d_parent) + return 0; + + if (skip_unhashed && d_unhashed(dentry)) + return 0; + + parent = dentry->d_parent->d_inode; + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + return 1; +} + +/* + * Walk the inode alias list, and find a dentry which has a given + * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it + * is looking for a dentry_lock reference. The downconvert thread is + * looking to unhash aliases, so we allow it to skip any that already + * have that property. 
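ocfs2_dentry_attach_gen() and the negative-dentry branch of ocfs2_dentry_revalidate() above implement a simple versioning scheme: a negative dentry remembers the parent directory's lock generation, and becomes stale once that generation moves on. A self-contained sketch of the idea (userspace, hypothetical names, not the kernel structures):

#include <stdio.h>

/* Per-directory generation; the kernel keeps this in ip_dir_lock_gen and
 * bumps it when the directory's cluster lock is dropped, invalidating any
 * cached negative lookups made under the old lock. */
struct dir {
        unsigned long lock_gen;
};

struct negative_dentry {
        unsigned long cached_gen;       /* generation seen at lookup time */
};

static void attach_gen(struct negative_dentry *d, const struct dir *parent)
{
        d->cached_gen = parent->lock_gen;
}

static int still_valid(const struct negative_dentry *d, const struct dir *parent)
{
        return d->cached_gen == parent->lock_gen;
}

int main(void)
{
        struct dir parent = { .lock_gen = 1 };
        struct negative_dentry nd;

        attach_gen(&nd, &parent);
        printf("valid before: %d\n", still_valid(&nd, &parent));
        parent.lock_gen++;              /* another node changed the directory */
        printf("valid after:  %d\n", still_valid(&nd, &parent));
        return 0;
}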
+ */ +struct dentry *ocfs2_find_local_alias(struct inode *inode, + u64 parent_blkno, + int skip_unhashed) +{ + struct list_head *p; + struct dentry *dentry = NULL; + + spin_lock(&inode->i_lock); + list_for_each(p, &inode->i_dentry) { + dentry = list_entry(p, struct dentry, d_alias); + + spin_lock(&dentry->d_lock); + if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { + trace_ocfs2_find_local_alias(dentry->d_name.len, + dentry->d_name.name); + + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + break; + } + spin_unlock(&dentry->d_lock); + + dentry = NULL; + } + + spin_unlock(&inode->i_lock); + + return dentry; +} + +DEFINE_SPINLOCK(dentry_attach_lock); + +/* + * Attach this dentry to a cluster lock. + * + * Dentry locks cover all links in a given directory to a particular + * inode. We do this so that ocfs2 can build a lock name which all + * nodes in the cluster can agree on at all times. Shoving full names + * in the cluster lock won't work due to size restrictions. Covering + * links inside of a directory is a good compromise because it still + * allows us to use the parent directory lock to synchronize + * operations. + * + * Call this function with the parent dir semaphore and the parent dir + * cluster lock held. + * + * The dir semaphore will protect us from having to worry about + * concurrent processes on our node trying to attach a lock at the + * same time. + * + * The dir cluster lock (held at either PR or EX mode) protects us + * from unlink and rename on other nodes. + * + * A dput() can happen asynchronously due to pruning, so we cover + * attaching and detaching the dentry lock with a + * dentry_attach_lock. + * + * A node which has done lookup on a name retains a protected read + * lock until final dput. If the user requests and unlink or rename, + * the protected read is upgraded to an exclusive lock. Other nodes + * who have seen the dentry will then be informed that they need to + * downgrade their lock, which will involve d_delete on the + * dentry. This happens in ocfs2_dentry_convert_worker(). + */ +int ocfs2_dentry_attach_lock(struct dentry *dentry, + struct inode *inode, + u64 parent_blkno) +{ + int ret; + struct dentry *alias; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, dl); + + /* + * Negative dentry. We ignore these for now. + * + * XXX: Could we can improve ocfs2_dentry_revalidate() by + * tracking these? + */ + if (!inode) + return 0; + + if (!dentry->d_inode && dentry->d_fsdata) { + /* Converting a negative dentry to positive + Clear dentry->d_fsdata */ + dentry->d_fsdata = dl = NULL; + } + + if (dl) { + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%.*s\": old parent: %llu, new: %llu\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + return 0; + } + + alias = ocfs2_find_local_alias(inode, parent_blkno, 0); + if (alias) { + /* + * Great, an alias exists, which means we must have a + * dentry lock already. We can just grab the lock off + * the alias and add it to the list. + * + * We're depending here on the fact that this dentry + * was found and exists in the dcache and so must have + * a reference to the dentry_lock because we can't + * race creates. Final dput() cannot happen on it + * since we have it pinned, so our reference is safe. 
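The alias-reuse path described above is what lets every hard link to an inode within one directory share a single cluster lock: the first name allocates the ocfs2_dentry_lock, later names just take a reference on it. A toy refcount model of that sharing (userspace sketch, hypothetical names, no locking):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* One lock object shared by every name in a given directory that points
 * at the same inode (loosely mirrors ocfs2_dentry_lock / dl_count). */
struct dentry_lock {
        uint64_t parent_blkno;
        unsigned int count;
};

/* Attach: reuse an existing alias's lock if one exists, else allocate. */
static struct dentry_lock *attach_lock(struct dentry_lock *existing,
                                       uint64_t parent_blkno)
{
        struct dentry_lock *dl = existing;

        if (!dl) {
                dl = calloc(1, sizeof(*dl));
                if (!dl)
                        exit(1);
                dl->parent_blkno = parent_blkno;
        }
        dl->count++;
        return dl;
}

/* Put: drop the cluster lock only when the last name goes away. */
static void put_lock(struct dentry_lock *dl)
{
        if (--dl->count == 0) {
                printf("last ref gone, dropping lock for dir %llu\n",
                       (unsigned long long)dl->parent_blkno);
                free(dl);
        }
}

int main(void)
{
        struct dentry_lock *a = attach_lock(NULL, 500); /* first lookup */
        struct dentry_lock *b = attach_lock(a, 500);    /* hard link, same dir */

        printf("shared: %d, count: %u\n", a == b, a->count);
        put_lock(b);
        put_lock(a);
        return 0;
}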
+ */ + dl = alias->d_fsdata; + mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n", + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%.*s\": old parent: %llu, new: %llu\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + + trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name, + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + goto out_attach; + } + + /* + * There are no other aliases + */ + dl = kmalloc(sizeof(*dl), GFP_NOFS); + if (!dl) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + dl->dl_count = 0; + /* + * Does this have to happen below, for all attaches, in case + * the struct inode gets blown away by the downconvert thread? + */ + dl->dl_inode = igrab(inode); + dl->dl_parent_blkno = parent_blkno; + ocfs2_dentry_lock_res_init(dl, parent_blkno, inode); + +out_attach: + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = dl; + dl->dl_count++; + spin_unlock(&dentry_attach_lock); + + /* + * This actually gets us our PRMODE level lock. From now on, + * we'll have a notification if one of these names is + * destroyed on another node. + */ + ret = ocfs2_dentry_lock(dentry, 0); + if (!ret) + ocfs2_dentry_unlock(dentry, 0); + else + mlog_errno(ret); + + /* + * In case of error, manually free the allocation and do the iput(). + * We need to do this because error here means no d_instantiate(), + * which means iput() will not be called during dput(dentry). + */ + if (ret < 0 && !alias) { + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); + } + + dput(alias); + + return ret; +} + +DEFINE_SPINLOCK(dentry_list_lock); + +/* We limit the number of dentry locks to drop in one go. We have + * this limit so that we don't starve other users of ocfs2_wq. */ +#define DL_INODE_DROP_COUNT 64 + +/* Drop inode references from dentry locks */ +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) +{ + struct ocfs2_dentry_lock *dl; + + spin_lock(&dentry_list_lock); + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { + dl = osb->dentry_lock_list; + osb->dentry_lock_list = dl->dl_next; + spin_unlock(&dentry_list_lock); + iput(dl->dl_inode); + kfree(dl); + spin_lock(&dentry_list_lock); + } + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + spin_unlock(&dentry_list_lock); +} + +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + +/* + * ocfs2_dentry_iput() and friends. + * + * At this point, our particular dentry is detached from the inodes + * alias list, so there's no way that the locking code can find it. + * + * The interesting stuff happens when we determine that our lock needs + * to go away because this is the last subdir alias in the + * system. 
This function needs to handle a couple things: + * + * 1) Synchronizing lock shutdown with the downconvert threads. This + * is already handled for us via the lockres release drop function + * called in ocfs2_release_dentry_lock() + * + * 2) A race may occur when we're doing our lock shutdown and + * another process wants to create a new dentry lock. Right now we + * let them race, which means that for a very short while, this + * node might have two locks on a lock resource. This should be a + * problem though because one of them is in the process of being + * thrown out. + */ +static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + + /* We leave dropping of inode reference to ocfs2_wq as that can + * possibly lead to inode deletion which gets tricky */ + spin_lock(&dentry_list_lock); + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + dl->dl_next = osb->dentry_lock_list; + osb->dentry_lock_list = dl; + spin_unlock(&dentry_list_lock); +} + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + int unlock; + + BUG_ON(dl->dl_count == 0); + + spin_lock(&dentry_attach_lock); + dl->dl_count--; + unlock = !dl->dl_count; + spin_unlock(&dentry_attach_lock); + + if (unlock) + ocfs2_drop_dentry_lock(osb, dl); +} + +static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + if (!dl) { + /* + * No dentry lock is ok if we're disconnected or + * unhashed. + */ + if (!(dentry->d_flags & DCACHE_DISCONNECTED) && + !d_unhashed(dentry)) { + unsigned long long ino = 0ULL; + if (inode) + ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; + mlog(ML_ERROR, "Dentry is missing cluster lock. " + "inode: %llu, d_flags: 0x%x, d_name: %.*s\n", + ino, dentry->d_flags, dentry->d_name.len, + dentry->d_name.name); + } + + goto out; + } + + mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", + dentry->d_name.len, dentry->d_name.name, + dl->dl_count); + + ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl); + +out: + iput(inode); +} + +/* + * d_move(), but keep the locks in sync. + * + * When we are done, "dentry" will have the parent dir and name of + * "target", which will be thrown away. + * + * We manually update the lock of "dentry" if need be. + * + * "target" doesn't have it's dentry lock touched - we allow the later + * dput() to handle this for us. + * + * This is called during ocfs2_rename(), while holding parent + * directory locks. The dentries have already been deleted on other + * nodes via ocfs2_remote_dentry_delete(). + * + * Normally, the VFS handles the d_move() for the file system, after + * the ->rename() callback. OCFS2 wants to handle this internally, so + * the new lock can be created atomically with respect to the cluster. + */ +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); + struct inode *inode = dentry->d_inode; + + /* + * Move within the same directory, so the actual lock info won't + * change. + * + * XXX: Is there any advantage to dropping the lock here? 
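ocfs2_drop_dentry_lock() above defers the final iput() to a workqueue by chaining the lock onto osb->dentry_lock_list, and __ocfs2_drop_dl_inodes() drains that list in bounded batches so it does not starve other users of ocfs2_wq. A single-threaded sketch of the same push/drain pattern (hypothetical names, no spinlock or workqueue, purely illustrative):

#include <stdio.h>
#include <stdlib.h>

#define DROP_BATCH 64   /* plays the role of DL_INODE_DROP_COUNT */

struct deferred {
        struct deferred *next;
        int inode_id;           /* stands in for the inode reference to put */
};

static struct deferred *pending;        /* head of the deferred-drop list */

static void defer_drop(int inode_id)
{
        struct deferred *d = malloc(sizeof(*d));

        if (!d)
                exit(1);
        d->inode_id = inode_id;
        d->next = pending;      /* push onto the head, like dl_next */
        pending = d;
}

/* One "work item": process at most 'batch' entries, then report whether
 * more remain so the caller can requeue itself. */
static int drop_some(int batch)
{
        while (pending && batch--) {
                struct deferred *d = pending;

                pending = d->next;
                printf("iput(inode %d)\n", d->inode_id);
                free(d);
        }
        return pending != NULL;
}

int main(void)
{
        for (int i = 0; i < 150; i++)
                defer_drop(i);
        while (drop_some(DROP_BATCH))
                ;       /* the kernel requeues the work on ocfs2_wq instead */
        return 0;
}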
+ */ + if (old_dir == new_dir) + goto out_move; + + ocfs2_dentry_lock_put(osb, dentry->d_fsdata); + + dentry->d_fsdata = NULL; + ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno); + if (ret) + mlog_errno(ret); + +out_move: + d_move(dentry, target); +} + +const struct dentry_operations ocfs2_dentry_ops = { + .d_revalidate = ocfs2_dentry_revalidate, + .d_iput = ocfs2_dentry_iput, +}; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 00000000..b79eff70 --- /dev/null +++ b/fs/ocfs2/dcache.h @@ -0,0 +1,69 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DCACHE_H +#define OCFS2_DCACHE_H + +extern const struct dentry_operations ocfs2_dentry_ops; + +struct ocfs2_dentry_lock { + /* Use count of dentry lock */ + unsigned int dl_count; + union { + /* Linked list of dentry locks to release */ + struct ocfs2_dentry_lock *dl_next; + u64 dl_parent_blkno; + }; + + /* + * The ocfs2_dentry_lock keeps an inode reference until + * dl_lockres has been destroyed. This is usually done in + * ->d_iput() anyway, so there should be minimal impact. + */ + struct inode *dl_inode; + struct ocfs2_lock_res dl_lockres; +}; + +int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, + u64 parent_blkno); + +extern spinlock_t dentry_list_lock; + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl); + +void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); + +struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, + int skip_unhashed); + +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir); + +extern spinlock_t dentry_attach_lock; +void ocfs2_dentry_attach_gen(struct dentry *dentry); + +#endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 00000000..8582e3f4 --- /dev/null +++ b/fs/ocfs2/dir.c @@ -0,0 +1,4555 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c + * + * Creates, reads, walks and deletes directory-nodes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linux Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static unsigned char ocfs2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ocfs2_do_extend_dir(struct super_block *sb, + handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh); +static int ocfs2_dir_indexed(struct inode *inode); + +/* + * These are distinct checks because future versions of the file system will + * want to have a trailing dirent structure independent of indexing. + */ +static int ocfs2_supports_dir_trailer(struct inode *dir) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); +} + +/* + * "new' here refers to the point at which we're creating a new + * directory via "mkdir()", but also when we're expanding an inline + * directory. In either case, we don't yet have the indexing bit set + * on the directory, so the standard checks will fail in when metaecc + * is turned off. Only directory-initialization type functions should + * use this then. Everything else wants ocfs2_supports_dir_trailer() + */ +static int ocfs2_new_dir_wants_trailer(struct inode *dir) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + return ocfs2_meta_ecc(osb) || + ocfs2_supports_indexed_dirs(osb); +} + +static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) +{ + return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); +} + +#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) + +/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make + * them more consistent? 
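The helpers above pin the trailer to the last sizeof(struct ocfs2_dir_block_trailer) bytes of every directory block, and the dirent walk simply treats an entry that starts at that offset as the end of usable space. A compact sketch of that placement arithmetic (userspace; the 64-byte trailer size is only an illustrative stand-in for the real struct size):

#include <stdio.h>
#include <stddef.h>

/* The trailer occupies the last 'trailer_size' bytes of a directory block. */
static size_t trailer_off(size_t blocksize, size_t trailer_size)
{
        return blocksize - trailer_size;
}

/* A dirent walk treats an entry starting at the trailer offset as the
 * end of the block rather than a real name. */
static int is_trailer(size_t dirent_off, size_t blocksize, size_t trailer_size)
{
        return dirent_off == trailer_off(blocksize, trailer_size);
}

int main(void)
{
        size_t bs = 4096, tsz = 64;     /* illustrative sizes only */

        printf("trailer begins at byte %zu of a %zu-byte block\n",
               trailer_off(bs, tsz), bs);
        printf("dirent at that offset is the trailer: %d\n",
               is_trailer(trailer_off(bs, tsz), bs, tsz));
        return 0;
}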
*/ +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data) +{ + char *p = data; + + p += blocksize - sizeof(struct ocfs2_dir_block_trailer); + return (struct ocfs2_dir_block_trailer *)p; +} + +/* + * XXX: This is executed once on every dirent. We should consider optimizing + * it. + */ +static int ocfs2_skip_dir_trailer(struct inode *dir, + struct ocfs2_dir_entry *de, + unsigned long offset, + unsigned long blklen) +{ + unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); + + if (!ocfs2_supports_dir_trailer(dir)) + return 0; + + if (offset != toff) + return 0; + + return 1; +} + +static void ocfs2_init_dir_trailer(struct inode *inode, + struct buffer_head *bh, u16 rec_len) +{ + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); + strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); + trailer->db_compat_rec_len = + cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); + trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + trailer->db_blkno = cpu_to_le64(bh->b_blocknr); + trailer->db_free_rec_len = cpu_to_le16(rec_len); +} +/* + * Link an unindexed block with a dir trailer structure into the index free + * list. This function will modify dirdata_bh, but assumes you've already + * passed it to the journal. + */ +static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, + struct buffer_head *dx_root_bh, + struct buffer_head *dirdata_bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer; + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) +{ + return res->dl_prev_leaf_bh == NULL; +} + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) +{ + brelse(res->dl_dx_root_bh); + brelse(res->dl_leaf_bh); + brelse(res->dl_dx_leaf_bh); + brelse(res->dl_prev_leaf_bh); +} + +static int ocfs2_dir_indexed(struct inode *inode) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) + return 1; + return 0; +} + +static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) +{ + return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; +} + +/* + * Hashing code adapted from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void 
ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len, + struct ocfs2_dx_hinfo *hinfo) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + const char *p; + __u32 in[8], buf[4]; + + /* + * XXX: Is this really necessary, if the index is never looked + * at by readdir? Is a hash value of '0' a bad idea? + */ + if ((len == 1 && !strncmp(".", name, 1)) || + (len == 2 && !strncmp("..", name, 2))) { + buf[0] = buf[1] = 0; + goto out; + } + +#ifdef OCFS2_DEBUG_DX_DIRS + /* + * This makes it very easy to debug indexing problems. We + * should never allow this to be selected without hand editing + * this file though. + */ + buf[0] = buf[1] = len; + goto out; +#endif + + memcpy(buf, osb->osb_dx_seed, sizeof(buf)); + + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + +out: + hinfo->major_hash = buf[0]; + hinfo->minor_hash = buf[1]; +} + +/* + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. + */ +static int ocfs2_check_dir_entry(struct inode * dir, + struct ocfs2_dir_entry * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely( + ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; + + if (unlikely(error_msg != NULL)) + mlog(ML_ERROR, "bad entry in directory #%llu: %s - " + "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, + offset, (unsigned long long)le64_to_cpu(de->inode), rlen, + de->name_len); + + return error_msg == NULL ? 
1 : 0; +} + +static inline int ocfs2_match(int len, + const char * const name, + struct ocfs2_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + char *first_de, + unsigned int bytes, + struct ocfs2_dir_entry **res_dir) +{ + struct ocfs2_dir_entry *de; + char *dlimit, *de_buf; + int de_len; + int ret = 0; + + de_buf = first_de; + dlimit = de_buf + bytes; + + while (de_buf < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + de = (struct ocfs2_dir_entry *) de_buf; + + if (de_buf + namelen <= dlimit && + ocfs2_match(namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + ret = -1; + goto bail; + } + *res_dir = de; + ret = 1; + goto bail; + } + + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) { + ret = -1; + goto bail; + } + + de_buf += de_len; + offset += de_len; + } + +bail: + trace_ocfs2_search_dirblock(ret); + return ret; +} + +static struct buffer_head *ocfs2_find_entry_id(const char *name, + int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + int ret, found; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0, + data->id_data, i_size_read(dir), res_dir); + if (found == 1) + return di_bh; + + brelse(di_bh); +out: + return NULL; +} + +static int ocfs2_validate_dir_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(bh, sb); + + + /* + * We don't validate dirents here, that's handled + * in-place when the code walks them. + */ + trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + * + * Note that we are safe to call this even if the directory + * doesn't have a trailer. Filesystems without metaecc will do + * nothing, and filesystems with it will have one. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); + if (rc) + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + return rc; +} + +/* + * Validate a directory trailer. + * + * We check the trailer here rather than in ocfs2_validate_dir_block() + * because that function doesn't have the inode to test. 
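The comment above introduces ocfs2_check_dir_trailer(), which verifies three things about each trailer: its signature, that the block's self-recorded address matches where it was actually read from, and that it belongs to the directory being walked. A userspace sketch of those checks (field layout and the signature string shown are illustrative, not the on-disk format):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Just the fields the trailer check cares about. */
struct trailer {
        char     signature[8];
        uint64_t blkno;         /* block's own address, as recorded on disk */
        uint64_t parent_dinode; /* inode number of the owning directory */
};

static int check_trailer(const struct trailer *t, uint64_t read_from_blkno,
                         uint64_t dir_ino)
{
        /* the real signature is OCFS2_DIR_TRAILER_SIGNATURE */
        if (memcmp(t->signature, "DIRTRL1", 8) != 0)
                return -1;      /* not a trailer at all */
        if (t->blkno != read_from_blkno)
                return -1;      /* block claims to live somewhere else */
        if (t->parent_dinode != dir_ino)
                return -1;      /* block belongs to a different directory */
        return 0;
}

int main(void)
{
        struct trailer t = { "DIRTRL1", 4096, 42 };

        printf("good:        %d\n", check_trailer(&t, 4096, 42));
        printf("wrong block: %d\n", check_trailer(&t, 4097, 42));
        return 0;
}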
+ */ +static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(dir)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } +out: + return rc; +} + +/* + * This function forces all errors to -EIO for consistency with its + * predecessor, ocfs2_bread(). We haven't audited what returning the + * real error codes would do to callers. We log the real codes with + * mlog_errno() before we squash them. + */ +static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, int flags) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, + ocfs2_validate_dir_block); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (!(flags & OCFS2_BH_READAHEAD) && + ocfs2_supports_dir_trailer(inode)) { + rc = ocfs2_check_dir_trailer(inode, tmp); + if (rc) { + if (!*bh) + brelse(tmp); + mlog_errno(rc); + goto out; + } + } + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc ? -EIO : 0; +} + +/* + * Read the block at 'phys' which belongs to this directory + * inode. This function does no virtual->physical block translation - + * what's passed in is assumed to be a valid directory block. 
+ */ +static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, + struct buffer_head **bh) +{ + int ret; + struct buffer_head *tmp = *bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, + ocfs2_validate_dir_block); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ocfs2_supports_dir_trailer(dir)) { + ret = ocfs2_check_dir_trailer(dir, tmp); + if (ret) { + if (!*bh) + brelse(tmp); + mlog_errno(ret); + goto out; + } + } + + if (!ret && !*bh) + *bh = tmp; +out: + return ret; +} + +static int ocfs2_validate_dx_root(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + + BUG_ON(!buffer_uptodate(bh)); + + dx_root = (struct ocfs2_dx_root_block *) bh->b_data; + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index root block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { + ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, + struct buffer_head **dx_root_bh) +{ + int ret; + u64 blkno = le64_to_cpu(di->i_dx_root); + struct buffer_head *tmp = *dx_root_bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_root); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_root_bh) + *dx_root_bh = tmp; + + return ret; +} + +static int ocfs2_validate_dx_leaf(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index leaf block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { + ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", + 7, dx_leaf->dl_signature); + return -EROFS; + } + + return 0; +} + +static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, + struct buffer_head **dx_leaf_bh) +{ + int ret; + struct buffer_head *tmp = *dx_leaf_bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_leaf); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_leaf_bh) + *dx_leaf_bh = tmp; + + return ret; +} + +/* + * Read a series of dx_leaf blocks. This expects all buffer_head + * pointers to be NULL on function entry. 
+ */ +static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, + struct buffer_head **dx_leaf_bhs) +{ + int ret; + + ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, + ocfs2_validate_dx_leaf); + if (ret) + mlog_errno(ret); + + return ret; +} + +static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + + sb = dir->i_sb; + + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + start = OCFS2_I(dir)->ip_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + + bh = NULL; + err = ocfs2_read_dir_block(dir, b++, &bh, + OCFS2_BH_READAHEAD); + bh_use[ra_max] = bh; + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + if (ocfs2_read_dir_block(dir, block, &bh, 0)) { + /* read error, skip block & hope for the best. + * ocfs2_read_dir_block() has released the bh. */ + ocfs2_error(dir->i_sb, "reading directory %llu, " + "offset %lu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + block); + goto next; + } + i = ocfs2_search_dirblock(bh, dir, name, namelen, + block << sb->s_blocksize_bits, + bh->b_data, sb->s_blocksize, + res_dir); + if (i == 1) { + OCFS2_I(dir)->ip_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. 
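ocfs2_find_entry_el() above walks the directory's blocks as a ring: it starts at the block cached in ip_dir_start_lookup, wraps past the end, stops when it returns to where it began, and then re-checks any blocks appended while it was searching. Stripped of readahead and that final grow re-check, the core walk reduces to a sketch like this (hypothetical data, userspace):

#include <stdio.h>
#include <string.h>

#define NBLOCKS 8

/* One fake "directory block" per slot; empty string means no match there. */
static const char *blocks[NBLOCKS] = {
        "", "", "alpha", "", "", "target", "", ""
};

/* Remembered position, like ip_dir_start_lookup: searches begin where the
 * last successful one left off, on the theory that lookups cluster. */
static int start_lookup;

static int find_block(const char *name)
{
        int block = start_lookup % NBLOCKS;
        int scanned;

        for (scanned = 0; scanned < NBLOCKS; scanned++) {
                if (strcmp(blocks[block], name) == 0) {
                        start_lookup = block;   /* remember for next time */
                        return block;
                }
                block = (block + 1) % NBLOCKS;  /* wrap past the end */
        }
        return -1;
}

int main(void)
{
        printf("'target' found in block %d\n", find_block("target"));
        printf("'alpha'  found in block %d\n", find_block("alpha"));
        printf("'nope'   found in block %d\n", find_block("nope"));
        return 0;
}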
+ */ + block = nblocks; + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + + trace_ocfs2_find_entry_el(ret); + return ret; +} + +static int ocfs2_dx_dir_lookup_rec(struct inode *inode, + struct ocfs2_extent_list *el, + u32 major_hash, + u32 *ret_cpos, + u64 *ret_phys_blkno, + unsigned int *ret_clen) +{ + int ret = 0, i, found; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "btree tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= major_hash) { + found = 1; + break; + } + } + + if (!found) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in btree", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + if (ret_phys_blkno) + *ret_phys_blkno = le64_to_cpu(rec->e_blkno); + if (ret_cpos) + *ret_cpos = le32_to_cpu(rec->e_cpos); + if (ret_clen) + *ret_clen = le16_to_cpu(rec->e_leaf_clusters); + +out: + brelse(eb_bh); + return ret; +} + +/* + * Returns the block index, from the start of the cluster which this + * hash belongs too. + */ +static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + u32 minor_hash) +{ + return minor_hash & osb->osb_dx_mask; +} + +static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + struct ocfs2_dx_hinfo *hinfo) +{ + return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); +} + +static int ocfs2_dx_dir_lookup(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_dx_hinfo *hinfo, + u32 *ret_cpos, + u64 *ret_phys_blkno) +{ + int ret = 0; + unsigned int cend, uninitialized_var(clen); + u32 uninitialized_var(cpos); + u64 uninitialized_var(blkno); + u32 name_hash = hinfo->major_hash; + + ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, + &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + cend = cpos + clen; + if (name_hash >= cend) { + /* We want the last cluster */ + blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); + cpos += clen - 1; + } else { + blkno += ocfs2_clusters_to_blocks(inode->i_sb, + name_hash - cpos); + cpos = name_hash; + } + + /* + * We now have the cluster which should hold our entry. To + * find the exact block from the start of the cluster to + * search, we take the lower bits of the hash. 
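ocfs2_dx_dir_lookup() above turns a name hash into a block address in two steps: the major hash picks a cluster out of the index tree's extent record (clamped to the record's last cluster), and the low bits of the minor hash pick a block within that cluster. Under assumed geometry of 32 blocks per cluster and invented record values, the arithmetic looks like:

#include <stdio.h>
#include <stdint.h>

/* Illustrative geometry; the kernel derives these from the superblock,
 * and osb_dx_mask is blocks-per-cluster minus one. */
#define BLOCKS_PER_CLUSTER 32u
#define DX_MASK (BLOCKS_PER_CLUSTER - 1)

/* One extent record of the dx tree: index clusters [cpos, cpos+clen)
 * live at physical block 'phys_blkno'. */
struct dx_rec {
        uint32_t cpos;
        uint32_t clen;
        uint64_t phys_blkno;
};

static uint64_t dx_leaf_blkno(const struct dx_rec *rec,
                              uint32_t major_hash, uint32_t minor_hash)
{
        uint64_t blkno = rec->phys_blkno;
        uint32_t cluster_off;

        /* Clamp to the record's last cluster when the hash runs past it. */
        if (major_hash >= rec->cpos + rec->clen)
                cluster_off = rec->clen - 1;
        else
                cluster_off = major_hash - rec->cpos;

        blkno += (uint64_t)cluster_off * BLOCKS_PER_CLUSTER;

        /* Low bits of the minor hash select the block inside the cluster. */
        return blkno + (minor_hash & DX_MASK);
}

int main(void)
{
        struct dx_rec rec = { .cpos = 16, .clen = 4, .phys_blkno = 8192 };

        printf("leaf block: %llu\n",
               (unsigned long long)dx_leaf_blkno(&rec, 17, 0x1234abcd));
        return 0;
}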
+ */ + blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); + + if (ret_phys_blkno) + *ret_phys_blkno = blkno; + if (ret_cpos) + *ret_cpos = cpos; + +out: + + return ret; +} + +static int ocfs2_dx_dir_search(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dx_root_block *dx_root, + struct ocfs2_dir_lookup_result *res) +{ + int ret, i, found; + u64 uninitialized_var(phys); + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = NULL; + struct buffer_head *dir_ent_bh = NULL; + struct ocfs2_dir_entry *dir_ent = NULL; + struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; + struct ocfs2_extent_list *dr_el; + struct ocfs2_dx_entry_list *entry_list; + + ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); + + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + goto search; + } + + dr_el = &dx_root->dr_list; + + ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno, + namelen, name, hinfo->major_hash, + hinfo->minor_hash, (unsigned long long)phys); + + ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; + + trace_ocfs2_dx_dir_search_leaf_info( + le16_to_cpu(dx_leaf->dl_list.de_num_used), + le16_to_cpu(dx_leaf->dl_list.de_count)); + + entry_list = &dx_leaf->dl_list; + +search: + /* + * Empty leaf is legal, so no need to check for that. + */ + found = 0; + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) + || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) + continue; + + /* + * Search unindexed leaf block now. We're not + * guaranteed to find anything. + */ + ret = ocfs2_read_dir_block_direct(dir, + le64_to_cpu(dx_entry->dx_dirent_blk), + &dir_ent_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: We should check the unindexed block here, + * before using it. + */ + + found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, + 0, dir_ent_bh->b_data, + dir->i_sb->s_blocksize, &dir_ent); + if (found == 1) + break; + + if (found == -1) { + /* This means we found a bad directory entry. 
*/ + ret = -EIO; + mlog_errno(ret); + goto out; + } + + brelse(dir_ent_bh); + dir_ent_bh = NULL; + } + + if (found <= 0) { + ret = -ENOENT; + goto out; + } + + res->dl_leaf_bh = dir_ent_bh; + res->dl_entry = dir_ent; + res->dl_dx_leaf_bh = dx_leaf_bh; + res->dl_dx_entry = dx_entry; + + ret = 0; +out: + if (ret) { + brelse(dx_leaf_bh); + brelse(dir_ent_bh); + } + return ret; +} + +static int ocfs2_find_entry_dx(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + +/* + * Try to find an entry of the provided name within 'dir'. + * + * If nothing was found, -ENOENT is returned. Otherwise, zero is + * returned and the struct 'res' will contain information useful to + * other directory manipulation functions. + * + * Caller can NOT assume anything about the contents of the + * buffer_heads - they are passed back only so that it can be passed + * into any one of the manipulation functions (add entry, delete + * entry, etc). As an example, bh in the extent directory case is a + * data block, in the inline-data case it actually points to an inode, + * in the indexed directory case, multiple buffers are involved. + */ +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, struct ocfs2_dir_lookup_result *lookup) +{ + struct buffer_head *bh; + struct ocfs2_dir_entry *res_dir = NULL; + + if (ocfs2_dir_indexed(dir)) + return ocfs2_find_entry_dx(name, namelen, dir, lookup); + + /* + * The unindexed dir code only uses part of the lookup + * structure, so there's no reason to push it down further + * than this. + */ + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); + else + bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + + if (bh == NULL) + return -ENOENT; + + lookup->dl_leaf_bh = bh; + lookup->dl_entry = res_dir; + return 0; +} + +/* + * Update inode number and type of a previously found directory entry. + */ +int ocfs2_update_entry(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *res, + struct inode *new_entry_inode) +{ + int ret; + ocfs2_journal_access_func access = ocfs2_journal_access_db; + struct ocfs2_dir_entry *de = res->dl_entry; + struct buffer_head *de_bh = res->dl_leaf_bh; + + /* + * The same code works fine for both inline-data and extent + * based directories, so no need to split this up. The only + * difference is the journal_access function. 
+ */ + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + ret = access(handle, INODE_CACHE(dir), de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno); + ocfs2_set_de_type(de, new_entry_inode->i_mode); + + ocfs2_journal_dirty(handle, de_bh); + +out: + return ret; +} + +/* + * __ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh, char *first_de, + unsigned int bytes) +{ + struct ocfs2_dir_entry *de, *pde; + int i, status = -ENOENT; + ocfs2_journal_access_func access = ocfs2_journal_access_db; + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + i = 0; + pde = NULL; + de = (struct ocfs2_dir_entry *) first_de; + while (i < bytes) { + if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (de == de_del) { + status = access(handle, INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (pde) + le16_add_cpu(&pde->rec_len, + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + ocfs2_journal_dirty(handle, bh); + goto bail; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); + } +bail: + return status; +} + +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) +{ + unsigned int hole; + + if (le64_to_cpu(de->inode) == 0) + hole = le16_to_cpu(de->rec_len); + else + hole = le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len); + + return hole; +} + +static int ocfs2_find_max_rec_len(struct super_block *sb, + struct buffer_head *dirblock_bh) +{ + int size, this_hole, largest_hole = 0; + char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; + struct ocfs2_dir_entry *de; + + trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); + size = ocfs2_dir_trailer_blk_off(sb); + limit = start + size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + if (de_buf != trailer) { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + } + + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, + int index) +{ + int num_used = le16_to_cpu(entry_list->de_num_used); + + if (num_used == 1 || index == (num_used - 1)) + goto clear; + + memmove(&entry_list->de_entries[index], + &entry_list->de_entries[index + 1], + (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); +clear: + num_used--; + memset(&entry_list->de_entries[num_used], 0, + sizeof(struct ocfs2_dx_entry)); + entry_list->de_num_used = cpu_to_le16(num_used); +} + +static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, index, max_rec_len, add_to_free_list = 0; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + struct buffer_head *leaf_bh = lookup->dl_leaf_bh; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; + struct ocfs2_dir_block_trailer *trailer; + 
struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * This function gets a bit messy because we might have to + * modify the root block, regardless of whether the indexed + * entries are stored inline. + */ + + /* + * *Only* set 'entry_list' here, based on where we're looking + * for the indexed entries. Later, we might still want to + * journal both blocks, based on free list state. + */ + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + } else { + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; + entry_list = &dx_leaf->dl_list; + } + + /* Neither of these are a disk corruption - that should have + * been caught by lookup, before we got here. */ + BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); + BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); + + index = (char *)dx_entry - (char *)entry_list->de_entries; + index /= sizeof(*dx_entry); + + if (index >= le16_to_cpu(entry_list->de_num_used)) { + mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, + entry_list, dx_entry); + return -EIO; + } + + /* + * We know that removal of this dirent will leave enough room + * for a new one, so add this block to the free list if it + * isn't already there. + */ + trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (trailer->db_free_rec_len == 0) + add_to_free_list = 1; + + /* + * Add the block holding our index into the journal before + * removing the unindexed entry. If we get an error return + * from __ocfs2_delete_entry(), then it hasn't removed the + * entry yet. Likewise, successful return means we *must* + * remove the indexed entry. + * + * We're also careful to journal the root tree block here as + * the entry count needs to be updated. Also, we might be + * adding to the start of the free list. 
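ocfs2_delete_entry_dx() above keeps two pieces of bookkeeping in sync after unlinking a name: it recomputes the largest hole in the leaf block (stored in the trailer's db_free_rec_len) and, if the block previously advertised no free space at all, links it back onto the dx root's free list. A toy model of that recomputation (slot sizes invented; the kernel derives the "used" size from each entry's name length):

#include <stdio.h>
#include <stdint.h>

/* Minimal dirent view: rec_len is the on-disk slot size, used is how much
 * of it the current name needs (0 for an unused slot). */
struct slot {
        uint16_t rec_len;
        uint16_t used;
};

/* Largest reusable hole in one block, as in ocfs2_find_max_rec_len(). */
static uint16_t max_hole(const struct slot *s, int n)
{
        uint16_t best = 0;

        for (int i = 0; i < n; i++) {
                uint16_t hole = s[i].rec_len - s[i].used;

                if (hole > best)
                        best = hole;
        }
        return best;
}

int main(void)
{
        struct slot block[] = { { 16, 16 }, { 40, 40 }, { 64, 64 } };
        uint16_t before, after;

        before = max_hole(block, 3);

        /* "Delete" the middle name: its space becomes a hole in the slot. */
        block[1].used = 0;
        after = max_hole(block, 3);

        printf("free_rec_len: %u -> %u\n", before, after);
        if (before == 0 && after > 0)
                printf("block was full, link it onto the dx free list\n");
        return 0;
}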
+ */ + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + lookup->dl_dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno, + index); + + ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, + leaf_bh, leaf_bh->b_data, leaf_bh->b_size); + if (ret) { + mlog_errno(ret); + goto out; + } + + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + if (add_to_free_list) { + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); + ocfs2_journal_dirty(handle, dx_root_bh); + } + + /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ + ocfs2_journal_dirty(handle, leaf_bh); + + le32_add_cpu(&dx_root->dr_num_entries, -1); + ocfs2_journal_dirty(handle, dx_root_bh); + + ocfs2_dx_list_remove_entry(entry_list, index); + + if (!ocfs2_dx_root_inline(dx_root)) + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); + +out: + return ret; +} + +static inline int ocfs2_delete_entry_id(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data, + i_size_read(dir)); + + brelse(di_bh); +out: + return ret; +} + +static inline int ocfs2_delete_entry_el(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data, + bh->b_size); +} + +/* + * Delete a directory entry. Hide the details of directory + * implementation from the caller. + */ +int ocfs2_delete_entry(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_lookup_result *res) +{ + if (ocfs2_dir_indexed(dir)) + return ocfs2_delete_entry_dx(handle, dir, res); + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_delete_entry_id(handle, dir, res->dl_entry, + res->dl_leaf_bh); + + return ocfs2_delete_entry_el(handle, dir, res->dl_entry, + res->dl_leaf_bh); +} + +/* + * Check whether 'de' has enough room to hold an entry of + * 'new_rec_len' bytes. + */ +static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, + unsigned int new_rec_len) +{ + unsigned int de_really_used; + + /* Check whether this is an empty record with enough space */ + if (le64_to_cpu(de->inode) == 0 && + le16_to_cpu(de->rec_len) >= new_rec_len) + return 1; + + /* + * Record might have free space at the end which we can + * use. 
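ocfs2_dirent_would_fit() above accepts either an unused slot that is simply big enough, or a live slot whose rec_len has enough slack left over after its own entry. A standalone sketch of that test, with OCFS2_DIR_REC_LEN approximated as a 12-byte fixed header plus the name, rounded up to 4 bytes (an assumption based on the ext3-style dirent layout):

#include <stdio.h>
#include <stdint.h>

/* Space a name of 'name_len' bytes needs on disk. */
static uint16_t rec_len_needed(uint8_t name_len)
{
        return (uint16_t)((12 + name_len + 3) & ~3);
}

/* Can a new name needing 'new_len' bytes be carved out of this slot? */
static int would_fit(uint64_t slot_inode, uint16_t slot_rec_len,
                     uint8_t slot_name_len, uint16_t new_len)
{
        /* An unused slot only has to be big enough. */
        if (slot_inode == 0 && slot_rec_len >= new_len)
                return 1;

        /* A live slot must keep its own entry and still have room left. */
        return slot_rec_len >= rec_len_needed(slot_name_len) + new_len;
}

int main(void)
{
        uint16_t need = rec_len_needed(9);      /* e.g. a 9-character name */

        printf("need %u bytes\n", need);
        printf("fits in empty 32-byte slot:      %d\n", would_fit(0, 32, 0, need));
        printf("fits behind 'a' in 64-byte slot: %d\n", would_fit(7, 64, 1, need));
        printf("fits behind 'a' in 32-byte slot: %d\n", would_fit(7, 32, 1, need));
        return 0;
}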
+ */ + de_really_used = OCFS2_DIR_REC_LEN(de->name_len); + if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len)) + return 1; + + return 0; +} + +static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, + struct ocfs2_dx_entry *dx_new_entry) +{ + int i; + + i = le16_to_cpu(dx_leaf->dl_list.de_num_used); + dx_leaf->dl_list.de_entries[i] = *dx_new_entry; + + le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); +} + +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk) +{ + int i; + struct ocfs2_dx_entry *dx_entry; + + i = le16_to_cpu(entry_list->de_num_used); + dx_entry = &entry_list->de_entries[i]; + + memset(dx_entry, 0, sizeof(*dx_entry)); + dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); + dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); + dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); + + le16_add_cpu(&entry_list->de_num_used, 1); +} + +static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct buffer_head *dx_leaf_bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf; + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); + ocfs2_journal_dirty(handle, dx_leaf_bh); + +out: + return ret; +} + +static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct ocfs2_dx_root_block *dx_root) +{ + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); +} + +static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + ocfs2_dx_inline_root_insert(dir, handle, + &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + dx_root); + } else { + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + lookup->dl_dx_leaf_bh); + if (ret) + goto out; + } + + le32_add_cpu(&dx_root->dr_num_entries, 1); + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static void ocfs2_remove_block_from_free_list(struct inode *dir, + handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + struct ocfs2_dir_block_trailer *trailer, *prev; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *bh; + + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + dx_root = (struct ocfs2_dx_root_block *)bh->b_data; + dx_root->dr_free_blk = trailer->db_free_next; + } else { + bh = lookup->dl_prev_leaf_bh; + prev = ocfs2_trailer_from_bh(bh, dir->i_sb); + prev->db_free_next = trailer->db_free_next; + } + + trailer->db_free_rec_len = cpu_to_le16(0); + trailer->db_free_next = cpu_to_le64(0); + + ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); +} + +/* + * This expects that a journal write has been reserved on + * lookup->dl_prev_leaf_bh 
or lookup->dl_dx_root_bh + */ +static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int max_rec_len; + struct ocfs2_dir_block_trailer *trailer; + + /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); + if (max_rec_len) { + /* + * There's still room in this block, so no need to remove it + * from the free list. In this case, we just want to update + * the rec len accounting. + */ + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); + } else { + ocfs2_remove_block_from_free_list(dir, handle, lookup); + } +} + +/* we don't always have a dentry for what we want to add, so people + * like orphan dir can call this instead. + * + * The lookup context must have been filled from + * ocfs2_prepare_dir_for_insert. + */ +int __ocfs2_add_entry(handle_t *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup) +{ + unsigned long offset; + unsigned short rec_len; + struct ocfs2_dir_entry *de, *de1; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; + struct super_block *sb = dir->i_sb; + int retval, status; + unsigned int size = sb->s_blocksize; + struct buffer_head *insert_bh = lookup->dl_leaf_bh; + char *data_start = insert_bh->b_data; + + if (!namelen) + return -EINVAL; + + if (ocfs2_dir_indexed(dir)) { + struct buffer_head *bh; + + /* + * An indexed dir may require that we update the free space + * list. Reserve a write to the previous node in the list so + * that we don't fail later. + * + * XXX: This can be either a dx_root_block, or an unindexed + * directory tree leaf block. + */ + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + retval = ocfs2_journal_access_dr(handle, + INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } else { + bh = lookup->dl_prev_leaf_bh; + retval = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } + if (retval) { + mlog_errno(retval); + return retval; + } + } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + data_start = di->id2.i_data.id_data; + size = i_size_read(dir); + + BUG_ON(insert_bh != parent_fe_bh); + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) data_start; + while (1) { + BUG_ON((char *)de >= (size + data_start)); + + /* These checks should've already been passed by the + * prepare function, but I guess we can leave them + * here anyway. */ + if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + retval = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + retval = -EEXIST; + goto bail; + } + + /* We're guaranteed that we should have space, so we + * can't possibly have hit the trailer...right? */ + mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), + "Hit dir trailer trying to insert %.*s " + "(namelen %d) into directory %llu. 
" + "offset is %lu, trailer offset is %d\n", + namelen, name, namelen, + (unsigned long long)parent_fe_bh->b_blocknr, + offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); + + if (ocfs2_dirent_would_fit(de, rec_len)) { + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + + if (insert_bh == parent_fe_bh) + status = ocfs2_journal_access_di(handle, + INODE_CACHE(dir), + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + else { + status = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_insert(dir, + handle, + lookup); + if (status) { + mlog_errno(status); + goto bail; + } + } + } + + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + de1 = (struct ocfs2_dir_entry *)((char *) de + + OCFS2_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = OCFS2_FT_UNKNOWN; + if (blkno) { + de->inode = cpu_to_le64(blkno); + ocfs2_set_de_type(de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + + if (ocfs2_dir_indexed(dir)) + ocfs2_recalc_free_list(dir, handle, lookup); + + dir->i_version++; + ocfs2_journal_dirty(handle, insert_bh); + retval = 0; + goto bail; + } + + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); + } + + /* when you think about it, the assert above should prevent us + * from ever getting here. */ + retval = -ENOSPC; +bail: + if (retval) + mlog_errno(retval); + + return retval; +} + +static int ocfs2_dir_foreach_blk_id(struct inode *inode, + u64 *f_version, + loff_t *f_pos, void *priv, + filldir_t filldir, int *filldir_err) +{ + int ret, i, filldir_ret; + unsigned long offset = *f_pos; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + struct ocfs2_dir_entry *de; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + while (*f_pos < i_size_read(inode)) { +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (*f_version != inode->i_version) { + for (i = 0; i < i_size_read(inode) && i < offset; ) { + de = (struct ocfs2_dir_entry *) + (data->id_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + *f_pos = offset = i; + *f_version = inode->i_version; + } + + de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); + if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { + /* On error, skip the f_pos to the end. 
*/ + *f_pos = i_size_read(inode); + goto out; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = *f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + + filldir_ret = filldir(priv, de->name, + de->name_len, + *f_pos, + le64_to_cpu(de->inode), + d_type); + if (filldir_ret) { + if (filldir_err) + *filldir_err = filldir_ret; + break; + } + if (version != *f_version) + goto revalidate; + } + *f_pos += le16_to_cpu(de->rec_len); + } + +out: + brelse(di_bh); + + return 0; +} + +/* + * NOTE: This function can be called against unindexed directories, + * and indexed ones. + */ +static int ocfs2_dir_foreach_blk_el(struct inode *inode, + u64 *f_version, + loff_t *f_pos, void *priv, + filldir_t filldir, int *filldir_err) +{ + int error = 0; + unsigned long offset, blk, last_ra_blk = 0; + int i, stored; + struct buffer_head * bh, * tmp; + struct ocfs2_dir_entry * de; + struct super_block * sb = inode->i_sb; + unsigned int ra_sectors = 16; + + stored = 0; + bh = NULL; + + offset = (*f_pos) & (sb->s_blocksize - 1); + + while (!error && !stored && *f_pos < i_size_read(inode)) { + blk = (*f_pos) >> sb->s_blocksize_bits; + if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { + /* Skip the corrupt dirblock and keep trying */ + *f_pos += sb->s_blocksize - offset; + continue; + } + + /* The idea here is to begin with 8k read-ahead and to stay + * 4k ahead of our current position. + * + * TODO: Use the pagecache for this. We just need to + * make sure it's cluster-safe... */ + if (!last_ra_blk + || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { + for (i = ra_sectors >> (sb->s_blocksize_bits - 9); + i > 0; i--) { + tmp = NULL; + if (!ocfs2_read_dir_block(inode, ++blk, &tmp, + OCFS2_BH_READAHEAD)) + brelse(tmp); + } + last_ra_blk = blk; + ra_sectors = 8; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (*f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ocfs2_dir_entry *) (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) + | offset; + *f_version = inode->i_version; + } + + while (!error && *f_pos < i_size_read(inode) + && offset < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + offset); + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; + brelse(bh); + goto out; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. 
+ */ + unsigned long version = *f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + error = filldir(priv, de->name, + de->name_len, + *f_pos, + le64_to_cpu(de->inode), + d_type); + if (error) { + if (filldir_err) + *filldir_err = error; + break; + } + if (version != *f_version) + goto revalidate; + stored ++; + } + *f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse(bh); + bh = NULL; + } + + stored = 0; +out: + return stored; +} + +static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, + loff_t *f_pos, void *priv, filldir_t filldir, + int *filldir_err) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, + filldir, filldir_err); + + return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir, + filldir_err); +} + +/* + * This is intended to be called from inside other kernel functions, + * so we fake some arguments. + */ +int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, + filldir_t filldir) +{ + int ret = 0, filldir_err = 0; + u64 version = inode->i_version; + + while (*f_pos < i_size_read(inode)) { + ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv, + filldir, &filldir_err); + if (ret || filldir_err) + break; + } + + if (ret > 0) + ret = -EIO; + + return 0; +} + +/* + * ocfs2_readdir() + * + */ +int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + int error = 0; + struct inode *inode = filp->f_path.dentry->d_inode; + int lock_level = 0; + + trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); + + error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); + if (lock_level && error >= 0) { + /* We release EX lock which used to update atime + * and get PR lock again to reduce contention + * on commonly accessed directories. */ + ocfs2_inode_unlock(inode, 1); + lock_level = 0; + error = ocfs2_inode_lock(inode, NULL, 0); + } + if (error < 0) { + if (error != -ENOENT) + mlog_errno(error); + /* we haven't got any yet, so propagate the error. */ + goto bail_nolock; + } + + error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, + dirent, filldir, NULL); + + ocfs2_inode_unlock(inode, lock_level); + if (error) + mlog_errno(error); + +bail_nolock: + + return error; +} + +/* + * NOTE: this should always be called with parent dir i_mutex taken. + */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct ocfs2_dir_lookup_result *lookup) +{ + int status = -ENOENT; + + trace_ocfs2_find_files_on_disk(namelen, name, blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_find_entry(name, namelen, inode, lookup); + if (status) + goto leave; + + *blkno = le64_to_cpu(lookup->dl_entry->inode); + + status = 0; +leave: + + return status; +} + +/* + * Convenience function for callers which just want the block number + * mapped to a name and don't require the full dirent info, etc. + */ +int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, + int namelen, u64 *blkno) +{ + int ret; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup); + ocfs2_free_dir_lookup_result(&lookup); + + return ret; +} + +/* Check for a name within a directory. 
+ * + * Return 0 if the name does not exist + * Return -EEXIST if the directory contains the name + * + * Callers should have i_mutex + a cluster lock on dir + */ +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen) +{ + int ret; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + trace_ocfs2_check_dir_for_entry( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); + + ret = -EEXIST; + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) + goto bail; + + ret = 0; +bail: + ocfs2_free_dir_lookup_result(&lookup); + + if (ret) + mlog_errno(ret); + return ret; +} + +struct ocfs2_empty_dir_priv { + unsigned seen_dot; + unsigned seen_dot_dot; + unsigned seen_other; + unsigned dx_dir; +}; +static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, + loff_t pos, u64 ino, unsigned type) +{ + struct ocfs2_empty_dir_priv *p = priv; + + /* + * Check the positions of "." and ".." records to be sure + * they're in the correct place. + * + * Indexed directories don't need to proceed past the first + * two entries, so we end the scan after seeing '..'. Despite + * that, we allow the scan to proceed In the event that we + * have a corrupted indexed directory (no dot or dot dot + * entries). This allows us to double check for existing + * entries which might not have been found in the index. + */ + if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { + p->seen_dot = 1; + return 0; + } + + if (name_len == 2 && !strncmp("..", name, 2) && + pos == OCFS2_DIR_REC_LEN(1)) { + p->seen_dot_dot = 1; + + if (p->dx_dir && p->seen_dot) + return 1; + + return 0; + } + + p->seen_other = 1; + return 1; +} + +static int ocfs2_empty_dir_dx(struct inode *inode, + struct ocfs2_empty_dir_priv *priv) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_dx_root_block *dx_root; + + priv->dx_dir = 1; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (le32_to_cpu(dx_root->dr_num_entries) != 2) + priv->seen_other = 1; + +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + * + * Returns 1 if dir is empty, zero otherwise. + * + * XXX: This is a performance problem for unindexed directories. + */ +int ocfs2_empty_dir(struct inode *inode) +{ + int ret; + loff_t start = 0; + struct ocfs2_empty_dir_priv priv; + + memset(&priv, 0, sizeof(priv)); + + if (ocfs2_dir_indexed(inode)) { + ret = ocfs2_empty_dir_dx(inode, &priv); + if (ret) + mlog_errno(ret); + /* + * We still run ocfs2_dir_foreach to get the checks + * for "." and "..". + */ + } + + ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); + if (ret) + mlog_errno(ret); + + if (!priv.seen_dot || !priv.seen_dot_dot) { + mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* + * XXX: Is it really safe to allow an unlink to continue? + */ + return 1; + } + + return !priv.seen_other; +} + +/* + * Fills "." and ".." dirents in a new directory block. Returns dirent for + * "..", which might be used during creation of a directory with a trailing + * header. It is otherwise safe to ignore the return code. 
+ */ +static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, + struct inode *parent, + char *start, + unsigned int size) +{ + struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; + + de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + de->name_len = 1; + de->rec_len = + cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ocfs2_set_de_type(de, S_IFDIR); + + de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); + de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ocfs2_set_de_type(de, S_IFDIR); + + return de; +} + +/* + * This works together with code in ocfs2_mknod_locked() which sets + * the inline-data flag and initializes the inline-data section. + */ +static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inline_data *data = &di->id2.i_data; + unsigned int size = le16_to_cpu(data->id_count); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); + ocfs2_journal_dirty(handle, di_bh); + + i_size_write(inode, size); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_inode_sector_count(inode); + + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **ret_new_bh) +{ + int status; + unsigned int size = osb->sb->s_blocksize; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de; + + if (ocfs2_new_dir_wants_trailer(inode)) + size = ocfs2_dir_trailer_blk_off(parent->i_sb); + + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, + data_ac, NULL, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, osb->sb->s_blocksize); + + de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); + if (ocfs2_new_dir_wants_trailer(inode)) { + int size = le16_to_cpu(de->rec_len); + + /* + * Figure out the size of the hole left over after + * insertion of '.' and '..'. The trailer wants this + * information. 
+ */ + size -= OCFS2_DIR_REC_LEN(2); + size -= sizeof(struct ocfs2_dir_block_trailer); + + ocfs2_init_dir_trailer(inode, new_bh, size); + } + + ocfs2_journal_dirty(handle, new_bh); + + i_size_write(inode, inode->i_sb->s_blocksize); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_inode_sector_count(inode); + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + if (ret_new_bh) { + *ret_new_bh = new_bh; + new_bh = NULL; + } +bail: + brelse(new_bh); + + return status; +} + +static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dirdata_bh, + struct ocfs2_alloc_context *meta_ac, + int dx_inline, u32 num_entries, + struct buffer_head **ret_dx_root_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + u16 dr_suballoc_bit; + u64 suballoc_loc, dr_blkno; + unsigned int num_bits; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &dr_suballoc_bit, &num_bits, &dr_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_dx_dir_attach_index( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)dr_blkno); + + dx_root_bh = sb_getblk(osb->sb, dr_blkno); + if (dx_root_bh == NULL) { + ret = -EIO; + goto out; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + memset(dx_root, 0, osb->sb->s_blocksize); + strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); + dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); + dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); + dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); + dx_root->dr_blkno = cpu_to_le64(dr_blkno); + dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); + dx_root->dr_num_entries = cpu_to_le32(num_entries); + if (le16_to_cpu(trailer->db_free_rec_len)) + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + else + dx_root->dr_free_blk = cpu_to_le64(0); + + if (dx_inline) { + dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; + dx_root->dr_entries.de_count = + cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); + } else { + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + } + ocfs2_journal_dirty(handle, dx_root_bh); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + di->i_dx_root = cpu_to_le64(dr_blkno); + + spin_lock(&OCFS2_I(dir)->ip_lock); + OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + ocfs2_journal_dirty(handle, di_bh); + + *ret_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + +out: + brelse(dx_root_bh); + return ret; +} + +static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 start_blk) +{ + int ret, i; + struct ocfs2_dx_leaf *dx_leaf; + struct 
buffer_head *bh; + + for (i = 0; i < num_dx_leaves; i++) { + bh = sb_getblk(osb->sb, start_blk + i); + if (bh == NULL) { + ret = -EIO; + goto out; + } + dx_leaves[i] = bh; + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; + + memset(dx_leaf, 0, osb->sb->s_blocksize); + strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); + dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); + dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); + dx_leaf->dl_list.de_count = + cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); + + trace_ocfs2_dx_dir_format_cluster( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(dx_leaf->dl_list.de_count)); + + ocfs2_journal_dirty(handle, bh); + } + + ret = 0; +out: + return ret; +} + +/* + * Allocates and formats a new cluster for use in an indexed dir + * leaf. This version will not do the extent insert, so that it can be + * used by operations which need careful ordering. + */ +static int __ocfs2_dx_dir_new_cluster(struct inode *dir, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 *ret_phys_blkno) +{ + int ret; + u32 phys, num; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + /* + * XXX: For create, this should claim cluster for the index + * *before* the unindexed insert so that we have a better + * chance of contiguousness as the directory grows in number + * of entries. + */ + ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Format the new cluster first. That way, we're inserting + * valid data. 
+ */ + phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); + ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, + num_dx_leaves, phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_phys_blkno = phys_blkno; +out: + return ret; +} + +static int ocfs2_dx_dir_new_cluster(struct inode *dir, + struct ocfs2_extent_tree *et, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves) +{ + int ret; + u64 phys_blkno; + + ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, + num_dx_leaves, &phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, + meta_ac); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, + int *ret_num_leaves) +{ + int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); + struct buffer_head **dx_leaves; + + dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *), + GFP_NOFS); + if (dx_leaves && ret_num_leaves) + *ret_num_leaves = num_dx_leaves; + + return dx_leaves; +} + +static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *leaf_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_hinfo hinfo; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * Our strategy is to create the directory as though it were + * unindexed, then add the index block. This works with very + * little complication since the state of a new directory is a + * very well known quantity. + * + * Essentially, we have two dirents ("." and ".."), in the 1st + * block which need indexing. These are easily inserted into + * the index block. 
+ */ + + ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, + data_ac, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, + meta_ac, 1, 2, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ + ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + + ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + +out: + brelse(dx_root_bh); + brelse(leaf_bh); + return ret; +} + +int ocfs2_fill_new_dir(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) + +{ + BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); + + if (ocfs2_supports_indexed_dirs(osb)) + return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, + data_ac, meta_ac); + + return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, + data_ac, NULL); +} + +static int ocfs2_dx_dir_index_block(struct inode *dir, + handle_t *handle, + struct buffer_head **dx_leaves, + int num_dx_leaves, + u32 *num_dx_entries, + struct buffer_head *dirent_bh) +{ + int ret = 0, namelen, i; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct buffer_head *dx_leaf_bh; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + namelen = de->name_len; + if (!namelen || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); + + i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); + dx_leaf_bh = dx_leaves[i]; + + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, + dirent_blk, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *num_dx_entries = *num_dx_entries + 1; + +inc: + de_buf += le16_to_cpu(de->rec_len); + } + +out: + return ret; +} + +/* + * XXX: This expects dx_root_bh to already be part of the transaction. 
+ */ +static void ocfs2_dx_dir_index_root_block(struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dirent_bh) +{ + char *de_buf, *limit; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_entry *de; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!de->name_len || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); + + trace_ocfs2_dx_dir_index_root_block( + (unsigned long long)dir->i_ino, + hinfo.major_hash, hinfo.minor_hash, + de->name_len, de->name, + le16_to_cpu(dx_root->dr_entries.de_num_used)); + + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, + dirent_blk); + + le32_add_cpu(&dx_root->dr_num_entries, 1); +inc: + de_buf += le16_to_cpu(de->rec_len); + } +} + +/* + * Count the number of inline directory entries in di_bh and compare + * them against the number of entries we can hold in an inline dx root + * block. + */ +static int ocfs2_new_dx_should_be_inline(struct inode *dir, + struct buffer_head *di_bh) +{ + int dirent_count = 0; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (de->name_len && de->inode) + dirent_count++; + + de_buf += le16_to_cpu(de->rec_len); + } + + /* We are careful to leave room for one extra record. */ + return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); +} + +/* + * Expand rec_len of the rightmost dirent in a directory block so that it + * contains the end of our valid space for dirents. We do this during + * expansion from an inline directory to one with extents. The first dir block + * in that case is taken from the inline data portion of the inode block. + * + * This will also return the largest amount of contiguous space for a dirent + * in the block. That value is *not* necessarily the last dirent, even after + * expansion. The directory indexing code wants this value for free space + * accounting. We do this here since we're already walking the entire dir + * block. + * + * We add the dir trailer if this filesystem wants it. + */ +static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, + struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct ocfs2_dir_entry *de; + struct ocfs2_dir_entry *prev_de; + char *de_buf, *limit; + unsigned int new_size = sb->s_blocksize; + unsigned int bytes, this_hole; + unsigned int largest_hole = 0; + + if (ocfs2_new_dir_wants_trailer(dir)) + new_size = ocfs2_dir_trailer_blk_off(sb); + + bytes = new_size - old_size; + + limit = start + old_size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + prev_de = de; + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + le16_add_cpu(&prev_de->rec_len, bytes); + + /* We need to double check this after modification of the final + * dirent. 
*/ + this_hole = ocfs2_figure_dirent_hole(prev_de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +/* + * We allocate enough clusters to fulfill "blocks_wanted", but set + * i_size to exactly one block. Ocfs2_extend_dir() will handle the + * rest automatically for us. + * + * *first_block_bh is a pointer to the 1st data block allocated to the + * directory. + */ +static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, + unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, + struct buffer_head **first_block_bh) +{ + u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; + struct super_block *sb = dir->i_sb; + int ret, i, num_dx_leaves = 0, dx_inline = 0, + credits = ocfs2_inline_to_extents_credits(sb); + u64 dx_insert_blkno, blkno, + bytes = blocks_wanted << sb->s_blocksize_bits; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(dir); + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct buffer_head *dirdata_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + struct ocfs2_extent_tree et; + struct ocfs2_extent_tree dx_et; + int did_quota = 0, bytes_allocated = 0; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); + + alloc = ocfs2_clusters_for_bytes(sb, bytes); + dx_alloc = 0; + + down_write(&oi->ip_alloc_sem); + + if (ocfs2_supports_indexed_dirs(osb)) { + credits += ocfs2_add_dir_index_credits(sb); + + dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); + if (!dx_inline) { + /* Add one more cluster for an index leaf */ + dx_alloc++; + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, + &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + /* This gets us the dx_root */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We should never need more than 2 clusters for the unindexed + * tree - maximum dirent size is far less than one block. In + * fact, the only time we'd need more than one cluster is if + * blocksize == clustersize and the dirent won't fit in the + * extra space that the expansion to a single block gives. As + * of today, that only happens on 4k/4k file systems. + */ + BUG_ON(alloc > 2); + + ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Prepare for worst case allocation scenario of two separate + * extents in the unindexed tree. + */ + if (alloc == 2) + credits += OCFS2_SUBALLOC_ALLOC; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) + goto out_commit; + did_quota = 1; + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Allocate our index cluster first, to maximize the + * possibility that unindexed leaves grow + * contiguously. 
+ */ + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, + dx_leaves, num_dx_leaves, + &dx_insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + + /* + * Try to claim as many clusters as the bitmap can give though + * if we only get one now, that's enough to continue. The rest + * will be claimed after the conversion to extents. + */ + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + + /* + * Operations are carefully ordered so that we set up the new + * data block first. The conversion from inline data to + * extents follows. + */ + blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); + dirdata_bh = sb_getblk(sb, blkno); + if (!dirdata_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh); + + ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); + memset(dirdata_bh->b_data + i_size_read(dir), 0, + sb->s_blocksize - i_size_read(dir)); + i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); + if (ocfs2_new_dir_wants_trailer(dir)) { + /* + * Prepare the dir trailer up front. It will otherwise look + * like a valid dirent. Even if inserting the index fails + * (unlikely), then all we'll have done is given first dir + * block a small amount of fragmentation. + */ + ocfs2_init_dir_trailer(dir, dirdata_bh, i); + } + + ocfs2_journal_dirty(handle, dirdata_bh); + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Dx dirs with an external cluster need to do this up + * front. Inline dx root's get handled later, after + * we've allocated our root block. We get passed back + * a total number of items so that dr_num_entries can + * be correctly set once the dx_root has been + * allocated. + */ + ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, + num_dx_leaves, &num_dx_entries, + dirdata_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set extent, i_size, etc on the directory. After this, the + * inode should contain the same exact dirents as before and + * be fully accessible from system calls. + * + * We let the later dirent insert modify c/mtime - to the user + * the data hasn't changed. + */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_dinode_new_extent_list(dir, di); + + i_size_write(dir, sb->s_blocksize); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + + di->i_size = cpu_to_le64(sb->s_blocksize); + di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + + /* + * This should never fail as our extent list is empty and all + * related blocks have been journaled already. 
+ */ + ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, + 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + + ocfs2_journal_dirty(handle, di_bh); + + if (ocfs2_supports_indexed_dirs(osb)) { + ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, + dirdata_bh, meta_ac, dx_inline, + num_dx_entries, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (dx_inline) { + ocfs2_dx_dir_index_root_block(dir, dx_root_bh, + dirdata_bh); + } else { + ocfs2_init_dx_root_extent_tree(&dx_et, + INODE_CACHE(dir), + dx_root_bh); + ret = ocfs2_insert_extent(handle, &dx_et, 0, + dx_insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + } + } + + /* + * We asked for two clusters, but only got one in the 1st + * pass. Claim the 2nd cluster as a separate extent. + */ + if (alloc > len) { + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, + &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); + + ret = ocfs2_insert_extent(handle, &et, 1, + blkno, len, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + + *first_block_bh = dirdata_bh; + dirdata_bh = NULL; + if (ocfs2_supports_indexed_dirs(osb)) { + unsigned int off; + + if (!dx_inline) { + /* + * We need to return the correct block within the + * cluster which should hold our entry. + */ + off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + &lookup->dl_hinfo); + get_bh(dx_leaves[off]); + lookup->dl_dx_leaf_bh = dx_leaves[off]; + } + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + } + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, bytes_allocated); + + ocfs2_commit_trans(osb, handle); + +out: + up_write(&oi->ip_alloc_sem); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + + brelse(dirdata_bh); + brelse(dx_root_bh); + + return ret; +} + +/* returns a bh of the 1st new block in the allocation. 
*/ +static int ocfs2_do_extend_dir(struct super_block *sb, + handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh) +{ + int status; + int extend, did_quota = 0; + u64 p_blkno, v_blkno; + + spin_lock(&OCFS2_I(dir)->ip_lock); + extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + if (extend) { + u32 offset = OCFS2_I(dir)->ip_clusters; + + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) + goto bail; + did_quota = 1; + + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); + BUG_ON(status == -EAGAIN); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); + status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_bh = sb_getblk(sb, p_blkno); + if (!*new_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (did_quota && status < 0) + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + return status; +} + +/* + * Assumes you already have a cluster lock on the directory. + * + * 'blocks_wanted' is only used if we have an inline directory which + * is to be turned into an extent based one. The size of the dirent to + * insert might be larger than the space gained by growing to just one + * block, so we may have to grow the inode by two blocks in that case. + * + * If the directory is already indexed, dx_root_bh must be provided. + */ +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, + struct buffer_head **new_de_bh) +{ + int status = 0; + int credits, num_free_extents, drop_alloc_sem = 0; + loff_t dir_i_size; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry * de; + struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * This would be a code error as an inline directory should + * never have an index root. + */ + BUG_ON(dx_root_bh); + + status = ocfs2_expand_inline_dir(dir, parent_fe_bh, + blocks_wanted, lookup, + &new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + + /* Expansion from inline to an indexed directory will + * have given us this. */ + dx_root_bh = lookup->dl_dx_root_bh; + + if (blocks_wanted == 1) { + /* + * If the new dirent will fit inside the space + * created by pushing out to one block, then + * we can complete the operation + * here. Otherwise we have to expand i_size + * and format the 2nd block below. + */ + BUG_ON(new_bh == NULL); + goto bail_bh; + } + + /* + * Get rid of 'new_bh' - we want to format the 2nd + * data block and return that instead. 
+ */ + brelse(new_bh); + new_bh = NULL; + + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; + dir_i_size = i_size_read(dir); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + goto do_extend; + } + + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; + dir_i_size = i_size_read(dir); + trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno, + dir_i_size); + + /* dir->i_size is always block aligned. */ + spin_lock(&OCFS2_I(dir)->ip_lock); + if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { + spin_unlock(&OCFS2_I(dir)->ip_lock); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), + parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, &et); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto bail; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(sb, el, 1); + } else { + spin_unlock(&OCFS2_I(dir)->ip_lock); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + } + +do_extend: + if (ocfs2_dir_indexed(dir)) + credits++; /* For attaching the new dirent block to the + * dx_root */ + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, + data_ac, meta_ac, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); + + status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, sb->s_blocksize); + + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = 0; + if (ocfs2_supports_dir_trailer(dir)) { + de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); + + ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_link_trailer(dir, handle, + dx_root_bh, new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + } + } else { + de->rec_len = cpu_to_le16(sb->s_blocksize); + } + ocfs2_journal_dirty(handle, new_bh); + + dir_i_size += dir->i_sb->s_blocksize; + i_size_write(dir, dir_i_size); + dir->i_blocks = ocfs2_inode_sector_count(dir); + status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail_bh: + *new_de_bh = new_bh; + get_bh(*new_de_bh); +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + if (drop_alloc_sem) + up_write(&OCFS2_I(dir)->ip_alloc_sem); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + brelse(new_bh); + + return status; +} + +static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, + const char *name, int namelen, + struct buffer_head **ret_de_bh, + unsigned int *blocks_wanted) +{ + int ret; + struct super_block *sb = dir->i_sb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dir_entry *de, *last_de = NULL; + char *de_buf, *limit; + unsigned long offset = 0; + unsigned 
int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + + /* + * This calculates how many free bytes we'd have in block zero, should + * this function force expansion to an extent tree. + */ + if (ocfs2_new_dir_wants_trailer(dir)) + free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); + else + free_space = dir->i_sb->s_blocksize - i_size_read(dir); + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + rec_len = OCFS2_DIR_REC_LEN(namelen); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) { + ret = -ENOENT; + goto out; + } + if (ocfs2_match(namelen, name, de)) { + ret = -EEXIST; + goto out; + } + /* + * No need to check for a trailing dirent record here as + * they're not used for inline dirs. + */ + + if (ocfs2_dirent_would_fit(de, rec_len)) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. */ + *ret_de_bh = di_bh; + get_bh(*ret_de_bh); + ret = 0; + goto out; + } + + last_de = de; + de_buf += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + } + + /* + * We're going to require expansion of the directory - figure + * out how many blocks we'll need so that a place for the + * dirent can be found. + */ + *blocks_wanted = 1; + new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; + if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) + *blocks_wanted = 2; + + ret = -ENOSPC; +out: + return ret; +} + +static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, + int namelen, struct buffer_head **ret_de_bh) +{ + unsigned long offset; + struct buffer_head *bh = NULL; + unsigned short rec_len; + struct ocfs2_dir_entry *de; + struct super_block *sb = dir->i_sb; + int status; + int blocksize = dir->i_sb->s_blocksize; + + status = ocfs2_read_dir_block(dir, 0, &bh, 0); + if (status) { + mlog_errno(status); + goto bail; + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse(bh); + bh = NULL; + + if (i_size_read(dir) <= offset) { + /* + * Caller will have to expand this + * directory. + */ + status = -ENOSPC; + goto bail; + } + status = ocfs2_read_dir_block(dir, + offset >> sb->s_blocksize_bits, + &bh, 0); + if (status) { + mlog_errno(status); + goto bail; + } + /* move to next block */ + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + status = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + status = -EEXIST; + goto bail; + } + + if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, + blocksize)) + goto next; + + if (ocfs2_dirent_would_fit(de, rec_len)) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. 
*/ + *ret_de_bh = bh; + get_bh(*ret_de_bh); + status = 0; + goto bail; + } +next: + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); + } + + status = 0; +bail: + brelse(bh); + if (status) + mlog_errno(status); + + return status; +} + +static int dx_leaf_sort_cmp(const void *a, const void *b) +{ + const struct ocfs2_dx_entry *entry1 = a; + const struct ocfs2_dx_entry *entry2 = b; + u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash); + u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash); + u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash); + u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash); + + if (major_hash1 > major_hash2) + return 1; + if (major_hash1 < major_hash2) + return -1; + + /* + * It is not strictly necessary to sort by minor + */ + if (minor_hash1 > minor_hash2) + return 1; + if (minor_hash1 < minor_hash2) + return -1; + return 0; +} + +static void dx_leaf_sort_swap(void *a, void *b, int size) +{ + struct ocfs2_dx_entry *entry1 = a; + struct ocfs2_dx_entry *entry2 = b; + struct ocfs2_dx_entry tmp; + + BUG_ON(size != sizeof(*entry1)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; +} + +static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num = le16_to_cpu(dl_list->de_num_used); + + for (i = 0; i < (num - 1); i++) { + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) != + le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash)) + return 0; + } + + return 1; +} + +/* + * Find the optimal value to split this leaf on. This expects the leaf + * entries to be in sorted order. + * + * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is + * the hash we want to insert. + * + * This function is only concerned with the major hash - that which + * determines which cluster an item belongs to. + */ +static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf, + u32 leaf_cpos, u32 insert_hash, + u32 *split_hash) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num_used = le16_to_cpu(dl_list->de_num_used); + int allsame; + + /* + * There's a couple rare, but nasty corner cases we have to + * check for here. All of them involve a leaf where all value + * have the same hash, which is what we look for first. + * + * Most of the time, all of the above is false, and we simply + * pick the median value for a split. + */ + allsame = ocfs2_dx_leaf_same_major(dx_leaf); + if (allsame) { + u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash); + + if (val == insert_hash) { + /* + * No matter where we would choose to split, + * the new entry would want to occupy the same + * block as these. Since there's no space left + * in their existing block, we know there + * won't be space after the split. + */ + return -ENOSPC; + } + + if (val == leaf_cpos) { + /* + * Because val is the same as leaf_cpos (which + * is the smallest value this leaf can have), + * yet is not equal to insert_hash, then we + * know that insert_hash *must* be larger than + * val (and leaf_cpos). At least cpos+1 in value. + * + * We also know then, that there cannot be an + * adjacent extent (otherwise we'd be looking + * at it). Choosing this value gives us a + * chance to get some contiguousness. + */ + *split_hash = leaf_cpos + 1; + return 0; + } + + if (val > insert_hash) { + /* + * val can not be the same as insert hash, and + * also must be larger than leaf_cpos. 
Also, + * we know that there can't be a leaf between + * cpos and val, otherwise the entries with + * hash 'val' would be there. + */ + *split_hash = val; + return 0; + } + + *split_hash = insert_hash; + return 0; + } + + /* + * Since the records are sorted and the checks above + * guaranteed that not all records in this block are the same, + * we simple travel forward, from the median, and pick the 1st + * record whose value is larger than leaf_cpos. + */ + for (i = (num_used / 2); i < num_used; i++) + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > + leaf_cpos) + break; + + BUG_ON(i == num_used); /* Should be impossible */ + *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); + return 0; +} + +/* + * Transfer all entries in orig_dx_leaves whose major hash is equal to or + * larger than split_hash into new_dx_leaves. We use a temporary + * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. + * + * Since the block offset inside a leaf (cluster) is a constant mask + * of minor_hash, we can optimize - an item at block offset X within + * the original cluster, will be at offset X within the new cluster. + */ +static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, + handle_t *handle, + struct ocfs2_dx_leaf *tmp_dx_leaf, + struct buffer_head **orig_dx_leaves, + struct buffer_head **new_dx_leaves, + int num_dx_leaves) +{ + int i, j, num_used; + u32 major_hash; + struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; + struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry *dx_entry; + + tmp_list = &tmp_dx_leaf->dl_list; + + for (i = 0; i < num_dx_leaves; i++) { + orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; + orig_list = &orig_dx_leaf->dl_list; + new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; + new_list = &new_dx_leaf->dl_list; + + num_used = le16_to_cpu(orig_list->de_num_used); + + memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); + tmp_list->de_num_used = cpu_to_le16(0); + memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); + + for (j = 0; j < num_used; j++) { + dx_entry = &orig_list->de_entries[j]; + major_hash = le32_to_cpu(dx_entry->dx_major_hash); + if (major_hash >= split_hash) + ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, + dx_entry); + else + ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, + dx_entry); + } + memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); + + ocfs2_journal_dirty(handle, orig_dx_leaves[i]); + ocfs2_journal_dirty(handle, new_dx_leaves[i]); + } +} + +static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, + struct ocfs2_dx_root_block *dx_root) +{ + int credits = ocfs2_clusters_to_blocks(osb->sb, 2); + + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); + credits += ocfs2_quota_trans_credits(osb->sb); + return credits; +} + +/* + * Find the median value in dx_leaf_bh and allocate a new leaf to move + * half our entries into. 
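+ * + * (Roughly: sort the leaf entries by hash, pick a split point via + * ocfs2_dx_dir_find_leaf_split(), grow the index tree by one cluster + * at cpos == split_hash, then move every entry whose major hash is at + * least split_hash over into the new cluster's leaves.)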
+ */ +static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dx_leaf_bh, + struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, + u64 leaf_blkno) +{ + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + int credits, ret, i, num_used, did_quota = 0; + u32 cpos, split_hash, insert_hash = hinfo->major_hash; + u64 orig_leaves_start; + int num_dx_leaves; + struct buffer_head **orig_dx_leaves = NULL; + struct buffer_head **new_dx_leaves = NULL; + struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; + struct ocfs2_extent_tree et; + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; + + trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, + insert_hash); + + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + /* + * XXX: This is a rather large limit. We should use a more + * realistic value. + */ + if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) + return -ENOSPC; + + num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); + if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { + mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " + "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, num_used); + ret = -EIO; + goto out; + } + + orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!orig_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); + if (!new_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This block is changing anyway, so we can sort it in place. + */ + sort(dx_leaf->dl_list.de_entries, num_used, + sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, + dx_leaf_sort_swap); + + ocfs2_journal_dirty(handle, dx_leaf_bh); + + ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, + &split_hash); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash); + + /* + * We have to carefully order operations here. There are items + * which want to be in the new cluster before insert, but in + * order to put those items in the new cluster, we alter the + * old cluster. A failure to insert gets nasty. + * + * So, start by reserving writes to the old + * cluster. ocfs2_dx_dir_new_cluster will reserve writes on + * the new cluster for us, before inserting it. The insert + * won't happen if there's an error before that. Once the + * insert is done then, we can transfer from one leaf into the + * other without fear of hitting any error. 
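+ * + * (Concretely, the order below is: read the original leaves, allocate + * and insert the new cluster via ocfs2_dx_dir_new_cluster(), reserve + * journal access on both sets of leaves, and only then call + * ocfs2_dx_dir_transfer_leaf().)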
+ */ + + /* + * The leaf transfer wants some scratch space so that we don't + * wind up doing a bunch of expensive memmove(). + */ + tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); + if (!tmp_dx_leaf) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); + ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, + orig_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + for (i = 0; i < num_dx_leaves; i++) { + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + orig_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + new_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, + orig_dx_leaves, new_dx_leaves, num_dx_leaves); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (orig_dx_leaves || new_dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) { + if (orig_dx_leaves) + brelse(orig_dx_leaves[i]); + if (new_dx_leaves) + brelse(new_dx_leaves[i]); + } + kfree(orig_dx_leaves); + kfree(new_dx_leaves); + } + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + kfree(tmp_dx_leaf); + return ret; +} + +static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh, + const char *name, int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, rebalanced = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + u64 blkno; + u32 leaf_cpos; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + +restart_search: + ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, + &leaf_cpos, &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + + if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= + le16_to_cpu(dx_leaf->dl_list.de_count)) { + if (rebalanced) { + /* + * Rebalancing should have provided us with + * space in an appropriate leaf. + * + * XXX: Is this an abnormal condition then? + * Should we print a message here? + */ + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, + &lookup->dl_hinfo, leaf_cpos, + blkno); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* + * Restart the lookup. The rebalance might have + * changed which block our item fits into. Mark our + * progress, so we only execute this once. 
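+ * + * (A single rebalance pass is expected to be enough here - after + * splitting at split_hash, the leaf covering insert_hash has room + * again unless every entry shared our hash, in which case the + * rebalance already returned -ENOSPC.)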
+ */ + brelse(dx_leaf_bh); + dx_leaf_bh = NULL; + rebalanced = 1; + goto restart_search; + } + + lookup->dl_dx_leaf_bh = dx_leaf_bh; + dx_leaf_bh = NULL; + +out: + brelse(dx_leaf_bh); + return ret; +} + +static int ocfs2_search_dx_free_list(struct inode *dir, + struct buffer_head *dx_root_bh, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = -ENOSPC; + struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; + struct ocfs2_dir_block_trailer *db; + u64 next_block; + int rec_len = OCFS2_DIR_REC_LEN(namelen); + struct ocfs2_dx_root_block *dx_root; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + next_block = le64_to_cpu(dx_root->dr_free_blk); + + while (next_block) { + brelse(prev_leaf_bh); + prev_leaf_bh = leaf_bh; + leaf_bh = NULL; + + ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { + lookup->dl_leaf_bh = leaf_bh; + lookup->dl_prev_leaf_bh = prev_leaf_bh; + leaf_bh = NULL; + prev_leaf_bh = NULL; + break; + } + + next_block = le64_to_cpu(db->db_free_next); + } + + if (!next_block) + ret = -ENOSPC; + +out: + + brelse(leaf_bh); + brelse(prev_leaf_bh); + return ret; +} + +static int ocfs2_expand_inline_dx_root(struct inode *dir, + struct buffer_head *dx_root_bh) +{ + int ret, num_dx_leaves, i, j, did_quota = 0; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_extent_tree et; + u64 insert_blkno; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + struct ocfs2_dx_entry *dx_entry; + struct ocfs2_dx_leaf *target_leaf; + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + /* + * We do this up front, before the allocation, so that a + * failure to add the dx_root_bh to the journal won't result + * us losing clusters. 
+ */ + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, + num_dx_leaves, &insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Transfer the entries from our dx_root into the appropriate + * block + */ + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + j = __ocfs2_dx_dir_hash_idx(osb, + le32_to_cpu(dx_entry->dx_minor_hash)); + target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; + + ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); + + /* Each leaf has been passed to the journal already + * via __ocfs2_dx_dir_new_cluster() */ + } + + dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; + memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list)); + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + + /* This should never fail considering we start with an empty + * dx_root. */ + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + did_quota = 0; + + ocfs2_journal_dirty(handle, dx_root_bh); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + return ret; +} + +static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) +{ + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + if (le16_to_cpu(entry_list->de_num_used) >= + le16_to_cpu(entry_list->de_count)) + return -ENOSPC; + + return 0; +} + +static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, + struct buffer_head *di_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, free_dx_root = 1; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct buffer_head *leaf_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) { + ret = -ENOSPC; + mlog_errno(ret); + goto out; + } + + if (ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_inline_dx_has_space(dx_root_bh); + + if (ret == 0) + goto search_el; + + /* + * We ran out of room in the root block. Expand it to + * an extent, then allow ocfs2_find_dir_space_dx to do + * the rest. + */ + ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Insert preparation for an indexed directory is split into two + * steps. The call to find_dir_space_dx reserves room in the index for + * an additional item. If we run out of space there, it's a real error + * we can't continue on. 
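+ * + * (The second step happens below at search_el: find room for the + * name in the unindexed blocks, extending the directory if the free + * list has nothing large enough.)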
+ */ + ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, + namelen, lookup); + if (ret) { + mlog_errno(ret); + goto out; + } + +search_el: + /* + * Next, we need to find space in the unindexed tree. This call + * searches using the free space linked list. If the unindexed tree + * lacks sufficient space, we'll expand it below. The expansion code + * is smart enough to add any new blocks to the free space list. + */ + ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Do this up here - ocfs2_extend_dir might need the dx_root */ + lookup->dl_dx_root_bh = dx_root_bh; + free_dx_root = 0; + + if (ret == -ENOSPC) { + ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We make the assumption here that new leaf blocks are added + * to the front of our free list. + */ + lookup->dl_prev_leaf_bh = NULL; + lookup->dl_leaf_bh = leaf_bh; + } + +out: + if (free_dx_root) + brelse(dx_root_bh); + return ret; +} + +/* + * Get a directory ready for insert. Any directory allocation required + * happens here. Success returns zero, and enough context in the dir + * lookup result that ocfs2_add_entry() will be able complete the task + * with minimal performance impact. + */ +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + unsigned int blocks_wanted = 1; + struct buffer_head *bh = NULL; + + trace_ocfs2_prepare_dir_for_insert( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); + + if (!namelen) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + /* + * Do this up front to reduce confusion. + * + * The directory might start inline, then be turned into an + * indexed one, in which case we'd need to hash deep inside + * ocfs2_find_dir_space_id(). Since + * ocfs2_prepare_dx_dir_for_insert() also needs this hash + * done, there seems no point in spreading out the calls. We + * can optimize away the case where the file system doesn't + * support indexing. + */ + if (ocfs2_supports_indexed_dirs(osb)) + ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); + + if (ocfs2_dir_indexed(dir)) { + ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, + name, namelen, lookup); + if (ret) + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, + namelen, &bh, &blocks_wanted); + } else + ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh); + + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + if (ret == -ENOSPC) { + /* + * We have to expand the directory to add this name. 
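+ * + * (blocks_wanted was set by ocfs2_find_dir_space_id() in the inline + * case - 1 block, or 2 when the new dirent would not also fit in the + * first block after expansion; the extent-list path leaves it at 1.)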
+ */ + BUG_ON(bh); + + ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, + lookup, &bh); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + BUG_ON(!bh); + } + + lookup->dl_leaf_bh = bh; + bh = NULL; +out: + brelse(bh); + return ret; +} + +static int ocfs2_dx_dir_remove_index(struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + struct inode *dx_alloc_inode = NULL; + struct buffer_head *dx_alloc_bh = NULL; + handle_t *handle; + u64 blk; + u16 bit; + u64 bg_blkno; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + dx_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(dx_root->dr_suballoc_slot)); + if (!dx_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&dx_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(dir)->ip_lock); + OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); + di->i_dx_root = cpu_to_le64(0ULL); + + ocfs2_journal_dirty(handle, di_bh); + + blk = le64_to_cpu(dx_root->dr_blkno); + bit = le16_to_cpu(dx_root->dr_suballoc_bit); + if (dx_root->dr_suballoc_loc) + bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, + bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(dx_alloc_inode, 1); + +out_mutex: + mutex_unlock(&dx_alloc_inode->i_mutex); + brelse(dx_alloc_bh); +out: + iput(dx_alloc_inode); + return ret; +} + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) +{ + int ret; + unsigned int uninitialized_var(clen); + u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); + u64 uninitialized_var(blkno); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!ocfs2_dir_indexed(dir)) + return 0; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (ocfs2_dx_root_inline(dx_root)) + goto remove_index; + + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + + /* XXX: What if dr_clusters is too large? 
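+ * + * (The loop below walks from the largest major hash (UINT_MAX) + * downward, removing one extent record per pass until dr_clusters + * reaches zero, so a very large value presumably just means many + * passes.)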
*/ + while (le32_to_cpu(dx_root->dr_clusters)) { + ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list, + major_hash, &cpos, &blkno, &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); + + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, + &dealloc, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos == 0) + break; + + major_hash = cpos - 1; + } + +remove_index: + ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + brelse(dx_root_bh); + return ret; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 00000000..e683f3de --- /dev/null +++ b/fs/ocfs2/dir.h @@ -0,0 +1,117 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DIR_H +#define OCFS2_DIR_H + +struct ocfs2_dx_hinfo { + u32 major_hash; + u32 minor_hash; +}; + +struct ocfs2_dir_lookup_result { + struct buffer_head *dl_leaf_bh; /* Unindexed leaf + * block */ + struct ocfs2_dir_entry *dl_entry; /* Target dirent in + * unindexed leaf */ + + struct buffer_head *dl_dx_root_bh; /* Root of indexed + * tree */ + + struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ + struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in + * indexed leaf */ + struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */ + + struct buffer_head *dl_prev_leaf_bh;/* Previous entry in + * dir free space + * list. NULL if + * previous entry is + * dx root block. 
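+ * + * (Any buffer_head set in this structure holds a reference; callers + * release everything through ocfs2_free_dir_lookup_result().)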
*/ +}; + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); + +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup); +int ocfs2_delete_entry(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_lookup_result *res); +int __ocfs2_add_entry(handle_t *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup); +static inline int ocfs2_add_entry(handle_t *handle, + struct dentry *dentry, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup) +{ + return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + dentry->d_name.name, dentry->d_name.len, + inode, blkno, parent_fe_bh, lookup); +} +int ocfs2_update_entry(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *res, + struct inode *new_entry_inode); + +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen); +int ocfs2_empty_dir(struct inode *inode); + +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct ocfs2_dir_lookup_result *res); +int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, + int namelen, u64 *blkno); +int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); +int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, + filldir_t filldir); +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup); +struct ocfs2_alloc_context; +int ocfs2_fill_new_dir(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac); + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh); + +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data); +#endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile new file mode 100644 index 00000000..c8a044ef --- /dev/null +++ b/fs/ocfs2/dlm/Makefile @@ -0,0 +1,7 @@ +ccflags-y := -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o + +ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h new file mode 100644 index 00000000..3cfa114a --- /dev/null +++ b/fs/ocfs2/dlm/dlmapi.h @@ -0,0 +1,220 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmapi.h + * + * externally exported dlm interfaces + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMAPI_H +#define DLMAPI_H + +struct dlm_lock; +struct dlm_ctxt; + +/* NOTE: changes made to this enum should be reflected in dlmdebug.c */ +enum dlm_status { + DLM_NORMAL = 0, /* 0: request in progress */ + DLM_GRANTED, /* 1: request granted */ + DLM_DENIED, /* 2: request denied */ + DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */ + DLM_WORKING, /* 4: async request in progress */ + DLM_BLOCKED, /* 5: lock request blocked */ + DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by a orphan lock*/ + DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */ + DLM_SYSERR, /* 8: system error */ + DLM_NOSUPPORT, /* 9: unsupported */ + DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */ + DLM_IVLOCKID, /* 11: bad lockid */ + DLM_SYNC, /* 12: synchronous request granted */ + DLM_BADTYPE, /* 13: bad resource type */ + DLM_BADRESOURCE, /* 14: bad resource handle */ + DLM_MAXHANDLES, /* 15: no more resource handles */ + DLM_NOCLINFO, /* 16: can't contact cluster manager */ + DLM_NOLOCKMGR, /* 17: can't contact lock manager */ + DLM_NOPURGED, /* 18: can't contact purge daemon */ + DLM_BADARGS, /* 19: bad api args */ + DLM_VOID, /* 20: no status */ + DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */ + DLM_IVBUFLEN, /* 22: invalid resource name length */ + DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */ + DLM_BADPARAM, /* 24: invalid lock mode specified */ + DLM_VALNOTVALID, /* 25: value block has been invalidated */ + DLM_REJECTED, /* 26: request rejected, unrecognized client */ + DLM_ABORT, /* 27: blocked lock request cancelled */ + DLM_CANCEL, /* 28: conversion request cancelled */ + DLM_IVRESHANDLE, /* 29: invalid resource handle */ + DLM_DEADLOCK, /* 30: deadlock recovery refused this request */ + DLM_DENIED_NOASTS, /* 31: failed to allocate AST */ + DLM_FORWARD, /* 32: request must wait for primary's response */ + DLM_TIMEOUT, /* 33: timeout value for lock has expired */ + DLM_IVGROUPID, /* 34: invalid group specification */ + DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */ + DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */ + DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient pers for device */ + DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */ + + DLM_RECOVERING, /* 39: extension, allows caller to fail a lock + request if it is being recovered */ + DLM_MIGRATING, /* 40: extension, allows caller to fail a lock + request if it is being migrated */ + DLM_MAXSTATS, /* 41: upper limit for return code validation */ +}; + +/* for pretty-printing dlm_status error messages */ +const char *dlm_errmsg(enum dlm_status err); +/* for pretty-printing dlm_status error names */ +const char *dlm_errname(enum dlm_status err); + +/* Eventually the DLM will use standard errno values, but in the + * meantime this lets us track dlm errors as they bubble up. When we + * bring its error reporting into line with the rest of the stack, + * these can just be replaced with calls to mlog_errno. 
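+ * + * (The macro below deliberately skips DLM_RECOVERING, DLM_MIGRATING + * and DLM_FORWARD - those are transient statuses that callers are + * generally expected to retry rather than log as errors.)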
*/ +#define dlm_error(st) do { \ + if ((st) != DLM_RECOVERING && \ + (st) != DLM_MIGRATING && \ + (st) != DLM_FORWARD) \ + mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ +} while (0) + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + +#define DLM_LVB_LEN 64 + +/* Callers are only allowed access to the lvb and status members of + * this struct. */ +struct dlm_lockstatus { + enum dlm_status status; + u32 flags; + struct dlm_lock *lockid; + char lvb[DLM_LVB_LEN]; +}; + +/* Valid lock modes. */ +#define LKM_IVMODE (-1) /* invalid mode */ +#define LKM_NLMODE 0 /* null lock */ +#define LKM_CRMODE 1 /* concurrent read unsupported */ +#define LKM_CWMODE 2 /* concurrent write unsupported */ +#define LKM_PRMODE 3 /* protected read */ +#define LKM_PWMODE 4 /* protected write unsupported */ +#define LKM_EXMODE 5 /* exclusive */ +#define LKM_MAXMODE 5 +#define LKM_MODEMASK 0xff + +/* Flags passed to dlmlock and dlmunlock: + * reserved: flags used by the "real" dlm + * only a few are supported by this dlm + * (U) = unsupported by ocfs2 dlm */ +#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */ +#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */ +#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */ +#define LKM_LOCAL 0x00000080 /* local lock request */ +#define LKM_VALBLK 0x00000100 /* lock value block request */ +#define LKM_NOQUEUE 0x00000200 /* non blocking request */ +#define LKM_CONVERT 0x00000400 /* conversion request */ +#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_UNLOCK 0x00001000 /* deallocate this lock */ +#define LKM_CANCEL 0x00002000 /* cancel conversion request */ +#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ +#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */ +#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */ +#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */ +#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */ +#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */ +#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */ +#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */ +#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */ +#define LKM_FORCE 0x00800000 /* force unlock flag */ +#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate + lock value block (U) */ +/* unused */ +#define LKM_UNUSED1 0x00000001 /* unused */ +#define LKM_UNUSED2 0x00000002 /* unused */ +#define LKM_UNUSED3 0x00000004 /* unused */ +#define LKM_UNUSED4 0x00000008 /* unused */ +#define LKM_UNUSED5 0x02000000 /* unused */ +#define LKM_UNUSED6 0x04000000 /* unused */ +#define LKM_UNUSED7 0x08000000 /* unused */ + +/* ocfs2 extensions: internal only + * should never be used by caller */ +#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated + to another node */ +#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed + should be applied to lockres */ +#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied + from lockres when lock is granted */ +#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock + used to avoid recovery rwsem */ + + +typedef void (dlm_astlockfunc_t)(void *); +typedef void 
(dlm_bastlockfunc_t)(void *, int); +typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status); + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, + int mode, + struct dlm_lockstatus *lksb, + int flags, + const char *name, + int namelen, + dlm_astlockfunc_t *ast, + void *data, + dlm_bastlockfunc_t *bast); + +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, + struct dlm_lockstatus *lksb, + int flags, + dlm_astunlockfunc_t *unlockast, + void *data); + +struct dlm_protocol_version { + u8 pv_major; + u8 pv_minor; +}; +struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key, + struct dlm_protocol_version *fs_proto); + +void dlm_unregister_domain(struct dlm_ctxt *dlm); + +void dlm_print_one_lock(struct dlm_lock *lockid); + +typedef void (dlm_eviction_func)(int, void *); +struct dlm_eviction_cb { + struct list_head ec_item; + dlm_eviction_func *ec_func; + void *ec_data; +}; +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data); +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb); +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb); + +#endif /* DLMAPI_H */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c new file mode 100644 index 00000000..3a3ed4bb --- /dev/null +++ b/fs/ocfs2/dlm/dlmast.c @@ -0,0 +1,502 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmast.c + * + * AST and BAST functionality for local and remote nodes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock); +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); + +/* Should be called as an ast gets queued to see if the new + * lock level will obsolete a pending bast. + * For example, if dlm_thread queued a bast for an EX lock that + * was blocking another EX, but before sending the bast the + * lock owner downconverted to NL, the bast is now obsolete. + * Only the ast should be sent. + * This is needed because the lock and convert paths can queue + * asts out-of-band (not waiting for dlm_thread) in order to + * allow for LKM_NOQUEUE to get immediate responses. 
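+ * + * (Summary of the checks below: if the old bast already went out + * there is nothing to cancel; a new EX level keeps its bast; dropping + * to NL cancels it; dropping to PR cancels it unless the blocked + * request was for EX.)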
*/ +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&lock->spinlock); + + if (lock->ml.highest_blocked == LKM_IVMODE) + return 0; + BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); + + if (lock->bast_pending && + list_empty(&lock->bast_list)) + /* old bast already sent, ok */ + return 0; + + if (lock->ml.type == LKM_EXMODE) + /* EX blocks anything left, any bast still valid */ + return 0; + else if (lock->ml.type == LKM_NLMODE) + /* NL blocks nothing, no reason to send any bast, cancel it */ + return 1; + else if (lock->ml.highest_blocked != LKM_EXMODE) + /* PR only blocks EX */ + return 1; + + return 0; +} + +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + BUG_ON(!dlm); + BUG_ON(!lock); + + res = lock->lockres; + + assert_spin_locked(&dlm->ast_lock); + + if (!list_empty(&lock->ast_list)) { + mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, " + "AST list not empty, pending %d, newlevel %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ast_pending, lock->ml.type); + BUG(); + } + if (lock->ast_pending) + mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + + /* check to see if this ast obsoletes the bast */ + if (dlm_should_cancel_bast(dlm, lock)) { + mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + lock->bast_pending = 0; + list_del_init(&lock->bast_list); + lock->ml.highest_blocked = LKM_IVMODE; + /* removing lock from list, remove a ref. guaranteed + * this won't be the last ref because of the get above, + * so res->spinlock will not be taken here */ + dlm_lock_put(lock); + /* free up the reserved bast that we are cancelling. + * guaranteed that this will not be the last reserved + * ast because *both* an ast and a bast were reserved + * to get to this point. 
the res->spinlock will not be + * taken here */ + dlm_lockres_release_ast(dlm, res); + } + list_add_tail(&lock->ast_list, &dlm->pending_asts); + lock->ast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_ast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + + +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + BUG_ON(!dlm); + BUG_ON(!lock); + + assert_spin_locked(&dlm->ast_lock); + + res = lock->lockres; + + BUG_ON(!list_empty(&lock->bast_list)); + if (lock->bast_pending) + mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + list_add_tail(&lock->bast_list, &dlm->pending_basts); + lock->bast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_bast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct dlm_lockstatus *lksb = lock->lksb; + BUG_ON(!lksb); + + /* only updates if this node masters the lockres */ + spin_lock(&res->spinlock); + if (res->owner == dlm->node_num) { + /* check the lksb flags for the direction */ + if (lksb->flags & DLM_LKSB_GET_LVB) { + mlog(0, "getting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? "master" : + "remote"); + memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); + } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. 
+ */ + } + spin_unlock(&res->spinlock); + + /* reset any lvb flags on the lksb */ + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); +} + +void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + dlm_astlockfunc_t *fn; + struct dlm_lockstatus *lksb; + + mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + lksb = lock->lksb; + fn = lock->ast; + BUG_ON(lock->ml.node != dlm->node_num); + + dlm_update_lvb(dlm, res, lock); + (*fn)(lock->astdata); +} + + +int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + int ret; + struct dlm_lockstatus *lksb; + int lksbflags; + + mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + lksb = lock->lksb; + BUG_ON(lock->ml.node == dlm->node_num); + + lksbflags = lksb->flags; + dlm_update_lvb(dlm, res, lock); + + /* lock request came from another node + * go do the ast over there */ + ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); + return ret; +} + +void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int blocked_type) +{ + dlm_bastlockfunc_t *fn = lock->bast; + + BUG_ON(lock->ml.node != dlm->node_num); + + mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + blocked_type); + + (*fn)(lock->astdata, blocked_type); +} + + + +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + int ret; + unsigned int locklen; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; + char *name; + struct list_head *iter, *head=NULL; + u64 cookie; + u32 flags; + u8 node; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = past->name; + locklen = past->namelen; + cookie = past->cookie; + flags = be32_to_cpu(past->flags); + node = past->node_idx; + + if (locklen > DLM_LOCKID_NAME_MAX) { + ret = DLM_IVBUFLEN; + mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " + "handler!\n", locklen); + goto leave; + } + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", + flags); + ret = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); + + if (past->type != DLM_AST && + past->type != DLM_BAST) { + mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" + "name=%.*s, node=%u\n", past->type, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + ret = DLM_IVLOCKID; + goto leave; + } + + res = dlm_lookup_lockres(dlm, name, locklen); + if (!res) { + mlog(0, "Got %sast for unknown lockres! 
cookie=%u:%llu, " + "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + ret = DLM_IVLOCKID; + goto leave; + } + + /* cannot get a proxy ast message if this node owns it */ + BUG_ON(res->owner == dlm->node_num); + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "Responding with DLM_RECOVERING!\n"); + ret = DLM_RECOVERING; + goto unlock_out; + } + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Responding with DLM_MIGRATING!\n"); + ret = DLM_MIGRATING; + goto unlock_out; + } + /* try convert queue for both ast/bast */ + head = &res->converting; + lock = NULL; + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie == cookie) + goto do_ast; + } + + /* if not on convert, try blocked for ast, granted for bast */ + if (past->type == DLM_AST) + head = &res->blocked; + else + head = &res->granted; + + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie == cookie) + goto do_ast; + } + + mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " + "node=%u\n", past->type == DLM_AST ? "" : "b", + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + + ret = DLM_NORMAL; +unlock_out: + spin_unlock(&res->spinlock); + goto leave; + +do_ast: + ret = DLM_NORMAL; + if (past->type == DLM_AST) { + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->granted); + mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + lock->ml.type, lock->ml.convert_type); + + if (lock->ml.convert_type != LKM_IVMODE) { + lock->ml.type = lock->ml.convert_type; + lock->ml.convert_type = LKM_IVMODE; + } else { + // should already be there.... 
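+ // (no convert was pending, so this ast grants the original + // request and ml.type is already correct)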
+ } + + lock->lksb->status = DLM_NORMAL; + + /* if we requested the lvb, fetch it into our lksb now */ + if (flags & LKM_GET_LVB) { + BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); + memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); + } + } + spin_unlock(&res->spinlock); + + if (past->type == DLM_AST) + dlm_do_local_ast(dlm, res, lock); + else + dlm_do_local_bast(dlm, res, lock, past->blocked_type); + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + return ret; +} + + + +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int msg_type, + int blocked_type, int flags) +{ + int ret = 0; + struct dlm_proxy_ast past; + struct kvec vec[2]; + size_t veclen = 1; + int status; + + mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name, + res->lockname.len, res->lockname.name, lock->ml.node, msg_type, + blocked_type); + + memset(&past, 0, sizeof(struct dlm_proxy_ast)); + past.node_idx = dlm->node_num; + past.type = msg_type; + past.blocked_type = blocked_type; + past.namelen = res->lockname.len; + memcpy(past.name, res->lockname.name, past.namelen); + past.cookie = lock->ml.cookie; + + vec[0].iov_len = sizeof(struct dlm_proxy_ast); + vec[0].iov_base = &past; + if (flags & DLM_LKSB_GET_LVB) { + be32_add_cpu(&past.flags, LKM_GET_LVB); + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, + lock->ml.node, &status); + if (ret < 0) + mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n", + dlm->name, res->lockname.len, res->lockname.name, ret, + lock->ml.node); + else { + if (status == DLM_RECOVERING) { + mlog(ML_ERROR, "sent AST to node %u, it thinks this " + "node is dead!\n", lock->ml.node); + BUG(); + } else if (status == DLM_MIGRATING) { + mlog(ML_ERROR, "sent AST to node %u, it returned " + "DLM_MIGRATING!\n", lock->ml.node); + BUG(); + } else if (status != DLM_NORMAL && status != DLM_IVLOCKID) { + mlog(ML_ERROR, "AST to node %u returned %d!\n", + lock->ml.node, status); + /* ignore it */ + } + ret = 0; + } + return ret; +} diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h new file mode 100644 index 00000000..d602abb5 --- /dev/null +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -0,0 +1,1181 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmcommon.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMCOMMON_H +#define DLMCOMMON_H + +#include + +#define DLM_HB_NODE_DOWN_PRI (0xf000000) +#define DLM_HB_NODE_UP_PRI (0x8000000) + +#define DLM_LOCKID_NAME_MAX 32 + +#define DLM_DOMAIN_NAME_MAX_LEN 255 +#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES +#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes +#define DLM_THREAD_MS 200 // flush at least every 200 ms + +#define DLM_HASH_SIZE_DEFAULT (1 << 17) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) + +enum dlm_mle_type { + DLM_MLE_BLOCK = 0, + DLM_MLE_MASTER = 1, + DLM_MLE_MIGRATION = 2, + DLM_MLE_NUM_TYPES = 3, +}; + +struct dlm_master_list_entry { + struct hlist_node master_hash_node; + struct list_head hb_events; + struct dlm_ctxt *dlm; + spinlock_t spinlock; + wait_queue_head_t wq; + atomic_t woken; + struct kref mle_refs; + int inuse; + unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + u8 master; + u8 new_master; + enum dlm_mle_type type; + struct o2hb_callback_func mle_hb_up; + struct o2hb_callback_func mle_hb_down; + struct dlm_lock_resource *mleres; + unsigned char mname[DLM_LOCKID_NAME_MAX]; + unsigned int mnamelen; + unsigned int mnamehash; +}; + +enum dlm_ast_type { + DLM_AST = 0, + DLM_BAST = 1, + DLM_ASTUNLOCK = 2, +}; + + +#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \ + LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \ + LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE) + +#define DLM_RECOVERY_LOCK_NAME "$RECOVERY" +#define DLM_RECOVERY_LOCK_NAME_LEN 9 + +static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) +{ + if (name_len == DLM_RECOVERY_LOCK_NAME_LEN && + memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0) + return 1; + return 0; +} + +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 + +struct dlm_recovery_ctxt +{ + struct list_head resources; + struct list_head received; + struct list_head node_data; + u8 new_master; + u8 dead_node; + u16 state; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + wait_queue_head_t event; +}; + +enum dlm_ctxt_state { + DLM_CTXT_NEW = 0, + DLM_CTXT_JOINED = 1, + DLM_CTXT_IN_SHUTDOWN = 2, + DLM_CTXT_LEAVING = 3, +}; + +struct dlm_ctxt +{ + struct list_head list; + struct hlist_head **lockres_hash; + struct list_head dirty_list; + struct list_head purge_list; + struct list_head pending_asts; + struct list_head pending_basts; + struct list_head tracking_list; + unsigned int purge_count; + spinlock_t spinlock; + spinlock_t ast_lock; + spinlock_t track_lock; + char *name; + u8 node_num; + u32 key; + u8 joining_node; + wait_queue_head_t dlm_join_events; + unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct dlm_recovery_ctxt reco; + spinlock_t master_lock; + struct hlist_head **master_hash; + struct list_head mle_hb_events; + + /* these give a really vague idea 
of the system load */ + atomic_t mle_tot_count[DLM_MLE_NUM_TYPES]; + atomic_t mle_cur_count[DLM_MLE_NUM_TYPES]; + atomic_t res_tot_count; + atomic_t res_cur_count; + + struct dlm_debug_ctxt *dlm_debug_ctxt; + struct dentry *dlm_debugfs_subroot; + + /* NOTE: Next three are protected by dlm_domain_lock */ + struct kref dlm_refs; + enum dlm_ctxt_state dlm_state; + unsigned int num_joins; + + struct o2hb_callback_func dlm_hb_up; + struct o2hb_callback_func dlm_hb_down; + struct task_struct *dlm_thread_task; + struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; + wait_queue_head_t dlm_thread_wq; + wait_queue_head_t dlm_reco_thread_wq; + wait_queue_head_t ast_wq; + wait_queue_head_t migration_wq; + + struct work_struct dispatched_work; + struct list_head work_list; + spinlock_t work_lock; + struct list_head dlm_domain_handlers; + struct list_head dlm_eviction_callbacks; + + /* The filesystem specifies this at domain registration. We + * cache it here to know what to tell other nodes. */ + struct dlm_protocol_version fs_locking_proto; + /* This is the inter-dlm communication version */ + struct dlm_protocol_version dlm_locking_proto; +}; + +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + +static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm, + unsigned i) +{ + return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + + (i % DLM_BUCKETS_PER_PAGE); +} + +/* these keventd work queue items are for less-frequently + * called functions that cannot be directly called from the + * net message handlers for some reason, usually because + * they need to send net messages of their own. */ +void dlm_dispatch_work(struct work_struct *work); + +struct dlm_lock_resource; +struct dlm_work_item; + +typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *); + +struct dlm_request_all_locks_priv +{ + u8 reco_master; + u8 dead_node; +}; + +struct dlm_mig_lockres_priv +{ + struct dlm_lock_resource *lockres; + u8 real_master; + u8 extra_ref; +}; + +struct dlm_assert_master_priv +{ + struct dlm_lock_resource *lockres; + u8 request_from; + u32 flags; + unsigned ignore_higher:1; +}; + +struct dlm_deref_lockres_priv +{ + struct dlm_lock_resource *deref_res; + u8 deref_node; +}; + +struct dlm_work_item +{ + struct list_head list; + dlm_workfunc_t *func; + struct dlm_ctxt *dlm; + void *data; + union { + struct dlm_request_all_locks_priv ral; + struct dlm_mig_lockres_priv ml; + struct dlm_assert_master_priv am; + struct dlm_deref_lockres_priv dl; + } u; +}; + +static inline void dlm_init_work_item(struct dlm_ctxt *dlm, + struct dlm_work_item *i, + dlm_workfunc_t *f, void *data) +{ + memset(i, 0, sizeof(*i)); + i->func = f; + INIT_LIST_HEAD(&i->list); + i->data = data; + i->dlm = dlm; /* must have already done a dlm_grab on this! 
*/ +} + + + +static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, + u8 node) +{ + assert_spin_locked(&dlm->spinlock); + + dlm->joining_node = node; + wake_up(&dlm->dlm_join_events); +} + +#define DLM_LOCK_RES_UNINITED 0x00000001 +#define DLM_LOCK_RES_RECOVERING 0x00000002 +#define DLM_LOCK_RES_READY 0x00000004 +#define DLM_LOCK_RES_DIRTY 0x00000008 +#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 +#define DLM_LOCK_RES_MIGRATING 0x00000020 +#define DLM_LOCK_RES_DROPPING_REF 0x00000040 +#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 +#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 + +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + +#define DLM_PURGE_INTERVAL_MS (8 * 1000) + +struct dlm_lock_resource +{ + /* WARNING: Please see the comment in dlm_init_lockres before + * adding fields here. */ + struct hlist_node hash_node; + struct qstr lockname; + struct kref refs; + + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ + struct list_head granted; + struct list_head converting; + struct list_head blocked; + struct list_head purge; + + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ + struct list_head dirty; + struct list_head recovering; // dlm_recovery_ctxt.resources list + + /* Added during init and removed during release */ + struct list_head tracking; /* dlm->tracking_list */ + + /* unused lock resources have their last_used stamped and are + * put on a list for the dlm thread to run. */ + unsigned long last_used; + + struct dlm_ctxt *dlm; + + unsigned migration_pending:1; + atomic_t asts_reserved; + spinlock_t spinlock; + wait_queue_head_t wq; + u8 owner; //node which owns the lock resource, or unknown + u16 state; + char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +struct dlm_migratable_lock +{ + __be64 cookie; + + /* these 3 are just padding for the in-memory structure, but + * list and flags are actually used when sent over the wire */ + __be16 pad1; + u8 list; // 0=granted, 1=converting, 2=blocked + u8 flags; + + s8 type; + s8 convert_type; + s8 highest_blocked; + u8 node; +}; // 16 bytes + +struct dlm_lock +{ + struct dlm_migratable_lock ml; + + struct list_head list; + struct list_head ast_list; + struct list_head bast_list; + struct dlm_lock_resource *lockres; + spinlock_t spinlock; + struct kref lock_refs; + + // ast and bast must be callable while holding a spinlock! 
+	dlm_astlockfunc_t *ast;
+	dlm_bastlockfunc_t *bast;
+	void *astdata;
+	struct dlm_lockstatus *lksb;
+	unsigned ast_pending:1,
+		 bast_pending:1,
+		 convert_pending:1,
+		 lock_pending:1,
+		 cancel_pending:1,
+		 unlock_pending:1,
+		 lksb_kernel_allocated:1;
+};
+
+
+#define DLM_LKSB_UNUSED1 0x01
+#define DLM_LKSB_PUT_LVB 0x02
+#define DLM_LKSB_GET_LVB 0x04
+#define DLM_LKSB_UNUSED2 0x08
+#define DLM_LKSB_UNUSED3 0x10
+#define DLM_LKSB_UNUSED4 0x20
+#define DLM_LKSB_UNUSED5 0x40
+#define DLM_LKSB_UNUSED6 0x80
+
+
+enum dlm_lockres_list {
+	DLM_GRANTED_LIST = 0,
+	DLM_CONVERTING_LIST = 1,
+	DLM_BLOCKED_LIST = 2,
+};
+
+static inline int dlm_lvb_is_empty(char *lvb)
+{
+	int i;
+	for (i=0; i<DLM_LVB_LEN; i++)
+		if (lvb[i])
+			return 0;
+	return 1;
+}
+
+static inline struct list_head *
+dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
+{
+	struct list_head *ret = NULL;
+
+	if (idx == DLM_GRANTED_LIST)
+		ret = &res->granted;
+	else if (idx == DLM_CONVERTING_LIST)
+		ret = &res->converting;
+	else if (idx == DLM_BLOCKED_LIST)
+		ret = &res->blocked;
+	else
+		BUG();
+	return ret;
+}
+
+
+
+
+struct dlm_node_iter
+{
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int curnode;
+};
+
+
+enum {
+	DLM_MASTER_REQUEST_MSG = 500,
+	DLM_UNUSED_MSG1 = 501,
+	DLM_ASSERT_MASTER_MSG = 502,
+	DLM_CREATE_LOCK_MSG = 503,
+	DLM_CONVERT_LOCK_MSG = 504,
+	DLM_PROXY_AST_MSG = 505,
+	DLM_UNLOCK_LOCK_MSG = 506,
+	DLM_DEREF_LOCKRES_MSG = 507,
+	DLM_MIGRATE_REQUEST_MSG = 508,
+	DLM_MIG_LOCKRES_MSG = 509,
+	DLM_QUERY_JOIN_MSG = 510,
+	DLM_ASSERT_JOINED_MSG = 511,
+	DLM_CANCEL_JOIN_MSG = 512,
+	DLM_EXIT_DOMAIN_MSG = 513,
+	DLM_MASTER_REQUERY_MSG = 514,
+	DLM_LOCK_REQUEST_MSG = 515,
+	DLM_RECO_DATA_DONE_MSG = 516,
+	DLM_BEGIN_RECO_MSG = 517,
+	DLM_FINALIZE_RECO_MSG = 518,
+	DLM_QUERY_REGION = 519,
+	DLM_QUERY_NODEINFO = 520,
+	DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+};
+
+struct dlm_reco_node_data
+{
+	int state;
+	u8 node_num;
+	struct list_head list;
+};
+
+enum {
+	DLM_RECO_NODE_DATA_DEAD = -1,
+	DLM_RECO_NODE_DATA_INIT = 0,
+	DLM_RECO_NODE_DATA_REQUESTING = 1,
+	DLM_RECO_NODE_DATA_REQUESTED = 2,
+	DLM_RECO_NODE_DATA_RECEIVING = 3,
+	DLM_RECO_NODE_DATA_DONE = 4,
+	DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
+};
+
+
+enum {
+	DLM_MASTER_RESP_NO = 0,
+	DLM_MASTER_RESP_YES = 1,
+	DLM_MASTER_RESP_MAYBE = 2,
+	DLM_MASTER_RESP_ERROR = 3,
+};
+
+
+struct dlm_master_request
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001
+#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002
+
+#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
+#define DLM_ASSERT_MASTER_REQUERY 0x00000002
+#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
+struct dlm_assert_master
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001
+
+struct dlm_migrate_request
+{
+	u8 master;
+	u8 new_master;
+	u8 namelen;
+	u8 pad1;
+	__be32 pad2;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_master_requery
+{
+	u8 pad1;
+	u8 pad2;
+	u8 node_idx;
+	u8 namelen;
+	__be32 pad3;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MRES_RECOVERY 0x01
+#define DLM_MRES_MIGRATION 0x02
+#define DLM_MRES_ALL_DONE 0x04
+
+/*
+ * We would like to get one whole lockres into a single network
+ * message whenever possible.  Generally speaking, there will be
+ * at most one dlm_lock on a lockres for each node in the cluster,
+ * plus (infrequently) any additional locks coming in from userdlm.
+ * + * struct _dlm_lockres_page + * { + * dlm_migratable_lockres mres; + * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS]; + * u8 pad[DLM_MIG_LOCKRES_RESERVED]; + * }; + * + * from ../cluster/tcp.h + * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * (roughly 4080 bytes) + * and sizeof(dlm_migratable_lockres) = 112 bytes + * and sizeof(dlm_migratable_lock) = 16 bytes + * + * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and + * DLM_MIG_LOCKRES_RESERVED=128 means we have this: + * + * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) + + * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED = + * NET_MAX_PAYLOAD_BYTES + * (240 * 16) + 112 + 128 = 4080 + * + * So a lockres would need more than 240 locks before it would + * use more than one network packet to recover. Not too bad. + */ +#define DLM_MAX_MIGRATABLE_LOCKS 240 + +struct dlm_migratable_lockres +{ + u8 master; + u8 lockname_len; + u8 num_locks; // locks sent in this structure + u8 flags; + __be32 total_locks; // locks to be sent for this migration cookie + __be64 mig_cookie; // cookie for this lockres migration + // or zero if not needed + // 16 bytes + u8 lockname[DLM_LOCKID_NAME_MAX]; + // 48 bytes + u8 lvb[DLM_LVB_LEN]; + // 112 bytes + struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 +}; +#define DLM_MIG_LOCKRES_MAX_LEN \ + (sizeof(struct dlm_migratable_lockres) + \ + (sizeof(struct dlm_migratable_lock) * \ + DLM_MAX_MIGRATABLE_LOCKS) ) + +/* from above, 128 bytes + * for some undetermined future use */ +#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ + DLM_MIG_LOCKRES_MAX_LEN) + +struct dlm_create_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_convert_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) + +struct dlm_unlock_lock +{ + __be64 cookie; + + __be32 flags; + __be16 pad1; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) + +struct dlm_proxy_ast +{ + __be64 cookie; + + __be32 flags; + u8 node_idx; + u8 type; + u8 blocked_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) + +#define DLM_MOD_KEY (0x666c6172) +enum dlm_query_join_response_code { + JOIN_DISALLOW = 0, + JOIN_OK = 1, + JOIN_OK_NO_MAP = 2, + JOIN_PROTOCOL_MISMATCH = 3, +}; + +struct dlm_query_join_packet { + u8 code; /* Response code. dlm_minor and fs_minor + are only valid if this is JOIN_OK */ + u8 dlm_minor; /* The minor version of the protocol the + dlm is speaking. */ + u8 fs_minor; /* The minor version of the protocol the + filesystem is speaking. 
*/ + u8 reserved; +}; + +union dlm_query_join_response { + u32 intval; + struct dlm_query_join_packet packet; +}; + +struct dlm_lock_request +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + +struct dlm_reco_data_done +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; + + /* unused for now */ + /* eventually we can use this to attempt + * lvb recovery based on each node's info */ + u8 reco_lvb[DLM_LVB_LEN]; +}; + +struct dlm_begin_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + + +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) + +struct dlm_query_join_request +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + struct dlm_protocol_version dlm_proto; + struct dlm_protocol_version fs_proto; + u8 domain[O2NM_MAX_NAME_LEN]; + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; +}; + +struct dlm_assert_joined +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_cancel_join +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_query_region { + u8 qr_node; + u8 qr_numregions; + u8 qr_namelen; + u8 pad1; + u8 qr_domain[O2NM_MAX_NAME_LEN]; + u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; +}; + +struct dlm_node_info { + u8 ni_nodenum; + u8 pad1; + u16 ni_ipv4_port; + u32 ni_ipv4_address; +}; + +struct dlm_query_nodeinfo { + u8 qn_nodenum; + u8 qn_numnodes; + u8 qn_namelen; + u8 pad1; + u8 qn_domain[O2NM_MAX_NAME_LEN]; + struct dlm_node_info qn_nodes[O2NM_MAX_NODES]; +}; + +struct dlm_exit_domain +{ + u8 node_idx; + u8 pad1[3]; +}; + +struct dlm_finalize_reco +{ + u8 node_idx; + u8 dead_node; + u8 flags; + u8 pad1; + __be32 pad2; +}; + +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +static inline enum dlm_status +__dlm_lockres_state_to_status(struct dlm_lock_resource *res) +{ + enum dlm_status status = DLM_NORMAL; + + assert_spin_locked(&res->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) + status = DLM_RECOVERING; + else if (res->state & DLM_LOCK_RES_MIGRATING) + status = DLM_MIGRATING; + else if (res->state & DLM_LOCK_RES_IN_PROGRESS) + status = DLM_FORWARD; + + return status; +} + +static inline u8 dlm_get_lock_cookie_node(u64 cookie) +{ + u8 ret; + cookie >>= 56; + ret = (u8)(cookie & 0xffULL); + return ret; +} + +static inline unsigned long long dlm_get_lock_cookie_seq(u64 cookie) +{ + unsigned long long ret; + ret = ((unsigned long long)cookie) & 0x00ffffffffffffffULL; + return ret; +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb); +void dlm_lock_get(struct dlm_lock *lock); +void dlm_lock_put(struct dlm_lock *lock); + +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res); + +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_commit_pending_unlock(struct 
dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_launch_thread(struct dlm_ctxt *dlm); +void dlm_complete_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +void dlm_wait_for_recovery(struct dlm_ctxt *dlm); +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); + +void dlm_put(struct dlm_ctxt *dlm); +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); +int dlm_domain_fully_joined(struct dlm_ctxt *dlm); + +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} +void dlm_lockres_put(struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len); + +int dlm_is_host_down(int errno); + +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int namelen, + int flags); +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen); + +#define dlm_lockres_set_refmap_bit(bit,res) \ + __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) +#define dlm_lockres_clear_refmap_bit(bit,res) \ + __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) + +static inline void __dlm_lockres_set_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: setting bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + set_bit(bit, res->refmap); +} + +static inline void __dlm_lockres_clear_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: clearing bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + clear_bit(bit, res->refmap); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line); +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line); +#define dlm_lockres_drop_inflight_ref(d,r) \ + __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref_new(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_do_local_ast(struct 
dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +int dlm_do_remote_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_do_local_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type); +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int msg_type, + int blocked_type, int flags); +static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, + blocked_type, 0); +} + +static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int flags) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, + 0, flags); +} + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res); +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); + +u8 dlm_nm_this_node(struct dlm_ctxt *dlm); +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); + + +int dlm_nm_init(struct dlm_ctxt *dlm); +int dlm_heartbeat_init(struct dlm_ctxt *dlm); +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); + +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +int dlm_finish_migration(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 old_master); +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); + +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_assert_master_post_handler(int status, void *data, void *ret_data); +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master); + + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, + u8 request_from, + u32 flags); + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, + u8 flags); +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +/* will exit holding res->spinlock, but may drop in function */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); +void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); + 
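/*
 * Illustrative caller sketch, condensed from dlmconvert_master() later in
 * this patch (call_ast, lock and dlm stand in for the caller's locals; error
 * handling omitted).  The wait helpers above are called with res->spinlock
 * held and exit still holding it, though they may drop it internally:
 *
 *	spin_lock(&res->spinlock);
 *	__dlm_wait_on_lockres(res);
 *	__dlm_lockres_reserve_ast(res);
 *	res->state |= DLM_LOCK_RES_IN_PROGRESS;
 *
 *	... operate on the lockres ...
 *
 *	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
 *	spin_unlock(&res->spinlock);
 *	wake_up(&res->wq);
 *
 *	the reservation must always be consumed: either queue the ast
 *	or release it
 *	if (call_ast)
 *		dlm_queue_ast(dlm, lock);
 *	else
 *		dlm_lockres_release_ast(dlm, res);
 */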
+/* will exit holding res->spinlock, but may drop in function */ +static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) +{ + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)); +} + +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); + +/* create/destroy slab caches */ +int dlm_init_master_caches(void); +void dlm_destroy_master_caches(void); + +int dlm_init_lock_cache(void); +void dlm_destroy_lock_cache(void); + +int dlm_init_mle_cache(void); +void dlm_destroy_mle_cache(void); + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_clean_master_list(struct dlm_ctxt *dlm, + u8 dead_node); +void dlm_force_free_mles(struct dlm_ctxt *dlm); +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_has_locks(struct dlm_lock_resource *res); +int __dlm_lockres_unused(struct dlm_lock_resource *res); + +static inline const char * dlm_lock_mode_name(int mode) +{ + switch (mode) { + case LKM_EXMODE: + return "EX"; + case LKM_PRMODE: + return "PR"; + case LKM_NLMODE: + return "NL"; + } + return "UNKNOWN"; +} + + +static inline int dlm_lock_compatible(int existing, int request) +{ + /* NO_LOCK compatible with all */ + if (request == LKM_NLMODE || + existing == LKM_NLMODE) + return 1; + + /* EX incompatible with all non-NO_LOCK */ + if (request == LKM_EXMODE) + return 0; + + /* request must be PR, which is compatible with PR */ + if (existing == LKM_PRMODE) + return 1; + + return 0; +} + +static inline int dlm_lock_on_list(struct list_head *head, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, head) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + return 1; + } + return 0; +} + + +static inline enum dlm_status dlm_err_to_dlm_status(int err) +{ + enum dlm_status ret; + if (err == -ENOMEM) + ret = DLM_SYSERR; + else if (err == -ETIMEDOUT || o2net_link_down(err, NULL)) + ret = DLM_NOLOCKMGR; + else if (err == -EINVAL) + ret = DLM_BADPARAM; + else if (err == -ENAMETOOLONG) + ret = DLM_IVBUFLEN; + else + ret = DLM_BADARGS; + return ret; +} + + +static inline void dlm_node_iter_init(unsigned long *map, + struct dlm_node_iter *iter) +{ + memcpy(iter->node_map, map, sizeof(iter->node_map)); + iter->curnode = -1; +} + +static inline int dlm_node_iter_next(struct dlm_node_iter *iter) +{ + int bit; + bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + iter->curnode = bit; + return bit; +} + +static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + res->owner = owner; +} + +static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner != res->owner) + dlm_set_lockres_owner(dlm, res, owner); +} + +#endif /* DLMCOMMON_H */ diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c new file mode 100644 index 00000000..29a886d1 --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -0,0 +1,548 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * 
dlmconvert.c + * + * underlying calls for lock conversion + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +/* NOTE: __dlmconvert_master is the only function in here that + * needs a spinlock held on entry (res->spinlock) and it is the + * only one that holds a lock on exit (res->spinlock). + * All other functions in here need no locks and drop all of + * the locks that they acquire. */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread); +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +/* + * this is only called directly by dlmlock(), and only when the + * local node is the owner of the lockres + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: see __dlmconvert_master + */ +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status; + + spin_lock(&res->spinlock); + /* we are not in a network handler, this is fine */ + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + status = __dlmconvert_master(dlm, res, lock, flags, type, + &call_ast, &kick_thread); + + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + if (status != DLM_NORMAL && status != DLM_NOTQUEUED) + dlm_error(status); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +/* performs lock conversion at the lockres master site + * locking: + * caller needs: res->spinlock + * taken: takes and drops lock->spinlock + * held on exit: res->spinlock + * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED + * call_ast: whether ast should be called for this lock + * kick_thread: whether dlm_kick_thread should be called + */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread) +{ + enum dlm_status status = DLM_NORMAL; + struct list_head *iter; + 
struct dlm_lock *tmplock=NULL; + + assert_spin_locked(&res->spinlock); + + mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n", + lock->ml.type, lock->ml.convert_type, type); + + spin_lock(&lock->spinlock); + + /* already converting? */ + if (lock->ml.convert_type != LKM_IVMODE) { + mlog(ML_ERROR, "attempted to convert a lock with a lock " + "conversion pending\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + /* must be on grant queue to convert */ + if (!dlm_lock_on_list(&res->granted, lock)) { + mlog(ML_ERROR, "attempted to convert a lock not on grant " + "queue\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + if (flags & LKM_VALBLK) { + switch (lock->ml.type) { + case LKM_EXMODE: + /* EX + LKM_VALBLK + convert == set lvb */ + mlog(0, "will set lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + break; + case LKM_PRMODE: + case LKM_NLMODE: + /* refetch if new level is not NL */ + if (type > LKM_NLMODE) { + mlog(0, "will fetch new value into " + "lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } else { + mlog(0, "will NOT fetch new value " + "into lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + flags &= ~(LKM_VALBLK); + } + break; + } + } + + + /* in-place downconvert? */ + if (type <= lock->ml.type) + goto grant; + + /* upconvert from here on */ + status = DLM_NORMAL; + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + continue; + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + /* existing conversion requests take precedence */ + if (!dlm_lock_compatible(tmplock->ml.convert_type, type)) + goto switch_queues; + } + + /* fall thru to grant */ + +grant: + mlog(0, "res %.*s, granting %s lock\n", res->lockname.len, + res->lockname.name, dlm_lock_mode_name(type)); + /* immediately grant the new lock type */ + lock->lksb->status = DLM_NORMAL; + if (lock->ml.node == dlm->node_num) + mlog(0, "doing in-place convert for nonlocal lock\n"); + lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + + status = DLM_NORMAL; + *call_ast = 1; + goto unlock_exit; + +switch_queues: + if (flags & LKM_NOQUEUE) { + mlog(0, "failed to convert NOQUEUE lock %.*s from " + "%d to %d...\n", res->lockname.len, res->lockname.name, + lock->ml.type, type); + status = DLM_NOTQUEUED; + goto unlock_exit; + } + mlog(0, "res %.*s, queueing...\n", res->lockname.len, + res->lockname.name); + + lock->ml.convert_type = type; + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->converting); + +unlock_exit: + spin_unlock(&lock->spinlock); + if (status == DLM_DENIED) { + __dlm_print_one_lock_resource(res); + } + if (status == DLM_NORMAL) + *kick_thread = 1; + return status; +} + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* do not alter lock refcount. switching lists. 
*/ + list_move_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; + lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); +} + +/* messages the master site to do lock conversion + * locking: + * caller needs: none + * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS + * held on exit: none + * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node + */ +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + enum dlm_status status; + + mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, + lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "bailing out early since res is RECOVERING " + "on secondary queue\n"); + /* __dlm_print_one_lock_resource(res); */ + status = DLM_RECOVERING; + goto bail; + } + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + + if (lock->ml.convert_type != LKM_IVMODE) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "converting a remote lock that is already " + "converting! (cookie=%u:%llu, conv=%d)\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.convert_type); + status = DLM_DENIED; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; + /* move lock to local convert queue */ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->converting); + lock->convert_pending = 1; + lock->ml.convert_type = type; + + if (flags & LKM_VALBLK) { + if (lock->ml.type == LKM_EXMODE) { + flags |= LKM_PUT_LVB; + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + } else { + if (lock->ml.convert_type == LKM_NLMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + } + spin_unlock(&res->spinlock); + + /* no locks held here. + * need to wait for a reply as to whether it got queued or not. */ + status = dlm_send_remote_convert_request(dlm, res, lock, flags, type); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->convert_pending = 0; + /* if it failed, move it back to granted queue */ + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_convert(res, lock); + } +bail: + spin_unlock(&res->spinlock); + + /* TODO: should this be a wake_one? 
*/ + /* wake up any IN_PROGRESS waiters */ + wake_up(&res->wq); + + return status; +} + +/* sends DLM_CONVERT_LOCK_MSG to master site + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, status from remote node + */ +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + struct dlm_convert_lock convert; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); + + memset(&convert, 0, sizeof(struct dlm_convert_lock)); + convert.node_idx = dlm->node_num; + convert.requested_type = type; + convert.cookie = lock->ml.cookie; + convert.namelen = res->lockname.len; + convert.flags = cpu_to_be32(flags); + memcpy(convert.name, res->lockname.name, convert.namelen); + + vec[0].iov_len = sizeof(struct dlm_convert_lock); + vec[0].iov_base = &convert; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key, + vec, veclen, res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_RECOVERING) { + mlog(0, "node %u returned DLM_RECOVERING from convert " + "message!\n", res->owner); + } else if (ret == DLM_MIGRATING) { + mlog(0, "node %u returned DLM_MIGRATING from convert " + "message!\n", res->owner); + } else if (ret == DLM_FORWARD) { + mlog(0, "node %u returned DLM_FORWARD from convert " + "message!\n", res->owner); + } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) + dlm_error(ret); + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key, + res->owner); + if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. 
*/ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from convert message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* handler for DLM_CONVERT_LOCK_MSG on master site + * locking: + * caller needs: none + * taken: takes and drop res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, + * status from __dlmconvert_master + */ +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + struct dlm_lockstatus *lksb; + enum dlm_status status = DLM_NORMAL; + u32 flags; + int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + if (cnv->namelen > DLM_LOCKID_NAME_MAX) { + status = DLM_IVBUFLEN; + dlm_error(status); + goto leave; + } + + flags = be32_to_cpu(cnv->flags); + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + status = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == cnv->cookie && + lock->ml.node == cnv->node_idx) { + dlm_lock_get(lock); + break; + } + lock = NULL; + } + spin_unlock(&res->spinlock); + if (!lock) { + status = DLM_IVLOCKID; + mlog(ML_ERROR, "did not find lock to convert on grant queue! 
" + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie))); + dlm_print_one_lock_resource(res); + goto leave; + } + + /* found the lock */ + lksb = lock->lksb; + + /* see if caller needed to get/put lvb */ + if (flags & LKM_PUT_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN); + } else if (flags & LKM_GET_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_GET_LVB; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status == DLM_NORMAL) { + __dlm_lockres_reserve_ast(res); + ast_reserved = 1; + res->state |= DLM_LOCK_RES_IN_PROGRESS; + status = __dlmconvert_master(dlm, res, lock, flags, + cnv->requested_type, + &call_ast, &kick_thread); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + wake = 1; + } + spin_unlock(&res->spinlock); + if (wake) + wake_up(&res->wq); + + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); + } + +leave: + if (lock) + dlm_lock_put(lock); + + /* either queue the ast or release it, if reserved */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else if (ast_reserved) + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h new file mode 100644 index 00000000..b2e3677d --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -0,0 +1,35 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMCONVERT_H +#define DLMCONVERT_H + +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +#endif diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c new file mode 100644 index 00000000..56f82cb9 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -0,0 +1,1017 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.c + * + * debug functionality for the dlm + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len); + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); +} + +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + printk(" refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + +static void __dlm_print_lock(struct dlm_lock *lock) +{ + spin_lock(&lock->spinlock); + + printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, " + "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), " + "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount), + (list_empty(&lock->ast_list) ? 'y' : 'n'), + (lock->ast_pending ? 'y' : 'n'), + (list_empty(&lock->bast_list) ? 'y' : 'n'), + (lock->bast_pending ? 'y' : 'n'), + (lock->convert_pending ? 'y' : 'n'), + (lock->lock_pending ? 'y' : 'n'), + (lock->cancel_pending ? 'y' : 'n'), + (lock->unlock_pending ? 'y' : 'n')); + + spin_unlock(&lock->spinlock); +} + +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + struct list_head *iter2; + struct dlm_lock *lock; + char buf[DLM_LOCKID_NAME_MAX]; + + assert_spin_locked(&res->spinlock); + + stringify_lockname(res->lockname.name, res->lockname.len, + buf, sizeof(buf)); + printk("lockres: %s, owner=%u, state=%u\n", + buf, res->owner, res->state); + printk(" last used: %lu, refcnt: %u, on purge list: %s\n", + res->last_used, atomic_read(&res->refs.refcount), + list_empty(&res->purge) ? "no" : "yes"); + printk(" on dirty list: %s, on reco list: %s, " + "migrating pending: %s\n", + list_empty(&res->dirty) ? "no" : "yes", + list_empty(&res->recovering) ? "no" : "yes", + res->migration_pending ? 
"yes" : "no"); + printk(" inflight locks: %d, asts reserved: %d\n", + res->inflight_locks, atomic_read(&res->asts_reserved)); + dlm_print_lockres_refmap(res); + printk(" granted queue:\n"); + list_for_each(iter2, &res->granted) { + lock = list_entry(iter2, struct dlm_lock, list); + __dlm_print_lock(lock); + } + printk(" converting queue:\n"); + list_for_each(iter2, &res->converting) { + lock = list_entry(iter2, struct dlm_lock, list); + __dlm_print_lock(lock); + } + printk(" blocked queue:\n"); + list_for_each(iter2, &res->blocked) { + lock = list_entry(iter2, struct dlm_lock, list); + __dlm_print_lock(lock); + } +} + +void dlm_print_one_lock(struct dlm_lock *lockid) +{ + dlm_print_one_lock_resource(lockid->lockres); +} +EXPORT_SYMBOL_GPL(dlm_print_one_lock); + +static const char *dlm_errnames[] = { + [DLM_NORMAL] = "DLM_NORMAL", + [DLM_GRANTED] = "DLM_GRANTED", + [DLM_DENIED] = "DLM_DENIED", + [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS", + [DLM_WORKING] = "DLM_WORKING", + [DLM_BLOCKED] = "DLM_BLOCKED", + [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN", + [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD", + [DLM_SYSERR] = "DLM_SYSERR", + [DLM_NOSUPPORT] = "DLM_NOSUPPORT", + [DLM_CANCELGRANT] = "DLM_CANCELGRANT", + [DLM_IVLOCKID] = "DLM_IVLOCKID", + [DLM_SYNC] = "DLM_SYNC", + [DLM_BADTYPE] = "DLM_BADTYPE", + [DLM_BADRESOURCE] = "DLM_BADRESOURCE", + [DLM_MAXHANDLES] = "DLM_MAXHANDLES", + [DLM_NOCLINFO] = "DLM_NOCLINFO", + [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR", + [DLM_NOPURGED] = "DLM_NOPURGED", + [DLM_BADARGS] = "DLM_BADARGS", + [DLM_VOID] = "DLM_VOID", + [DLM_NOTQUEUED] = "DLM_NOTQUEUED", + [DLM_IVBUFLEN] = "DLM_IVBUFLEN", + [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT", + [DLM_BADPARAM] = "DLM_BADPARAM", + [DLM_VALNOTVALID] = "DLM_VALNOTVALID", + [DLM_REJECTED] = "DLM_REJECTED", + [DLM_ABORT] = "DLM_ABORT", + [DLM_CANCEL] = "DLM_CANCEL", + [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE", + [DLM_DEADLOCK] = "DLM_DEADLOCK", + [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS", + [DLM_FORWARD] = "DLM_FORWARD", + [DLM_TIMEOUT] = "DLM_TIMEOUT", + [DLM_IVGROUPID] = "DLM_IVGROUPID", + [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT", + [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH", + [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION", + [DLM_NO_CONTROL_DEVICE ] = "DLM_NO_CONTROL_DEVICE ", + [DLM_RECOVERING] = "DLM_RECOVERING", + [DLM_MIGRATING] = "DLM_MIGRATING", + [DLM_MAXSTATS] = "DLM_MAXSTATS", +}; + +static const char *dlm_errmsgs[] = { + [DLM_NORMAL] = "request in progress", + [DLM_GRANTED] = "request granted", + [DLM_DENIED] = "request denied", + [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", + [DLM_WORKING] = "async request in progress", + [DLM_BLOCKED] = "lock request blocked", + [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", + [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", + [DLM_SYSERR] = "system error", + [DLM_NOSUPPORT] = "unsupported", + [DLM_CANCELGRANT] = "can't cancel convert: already granted", + [DLM_IVLOCKID] = "bad lockid", + [DLM_SYNC] = "synchronous request granted", + [DLM_BADTYPE] = "bad resource type", + [DLM_BADRESOURCE] = "bad resource handle", + [DLM_MAXHANDLES] = "no more resource handles", + [DLM_NOCLINFO] = "can't contact cluster manager", + [DLM_NOLOCKMGR] = "can't contact lock manager", + [DLM_NOPURGED] = "can't contact purge daemon", + [DLM_BADARGS] = "bad api args", + [DLM_VOID] = "no status", + [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", + [DLM_IVBUFLEN] = "invalid resource name length", + [DLM_CVTUNGRANT] = "attempted to 
convert ungranted lock", + [DLM_BADPARAM] = "invalid lock mode specified", + [DLM_VALNOTVALID] = "value block has been invalidated", + [DLM_REJECTED] = "request rejected, unrecognized client", + [DLM_ABORT] = "blocked lock request cancelled", + [DLM_CANCEL] = "conversion request cancelled", + [DLM_IVRESHANDLE] = "invalid resource handle", + [DLM_DEADLOCK] = "deadlock recovery refused this request", + [DLM_DENIED_NOASTS] = "failed to allocate AST", + [DLM_FORWARD] = "request must wait for primary's response", + [DLM_TIMEOUT] = "timeout value for lock has expired", + [DLM_IVGROUPID] = "invalid group specification", + [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", + [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", + [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", + [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", + [DLM_RECOVERING] = "lock resource being recovered", + [DLM_MIGRATING] = "lock resource being migrated", + [DLM_MAXSTATS] = "invalid error number", +}; + +const char *dlm_errmsg(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errmsgs[DLM_MAXSTATS]; + return dlm_errmsgs[err]; +} +EXPORT_SYMBOL_GPL(dlm_errmsg); + +const char *dlm_errname(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errnames[DLM_MAXSTATS]; + return dlm_errnames[err]; +} +EXPORT_SYMBOL_GPL(dlm_errname); + +/* NOTE: This function converts a lockname into a string. It uses knowledge + * of the format of the lockname that should be outside the purview of the dlm. + * We are adding only to make dlm debugging slightly easier. + * + * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. + */ +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len) +{ + int out = 0; + __be64 inode_blkno_be; + +#define OCFS2_DENTRY_LOCK_INO_START 18 + if (*lockname == 'N') { + memcpy((__be64 *)&inode_blkno_be, + (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + out += snprintf(buf + out, len - out, "%.*s%08x", + OCFS2_DENTRY_LOCK_INO_START - 1, lockname, + (unsigned int)be64_to_cpu(inode_blkno_be)); + } else + out += snprintf(buf + out, len - out, "%.*s", + locklen, lockname); + return out; +} + +static int stringify_nodemap(unsigned long *nodemap, int maxnodes, + char *buf, int len) +{ + int out = 0; + int i = -1; + + while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) + out += snprintf(buf + out, len - out, "%d ", i); + + return out; +} + +static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) +{ + int out = 0; + char *mle_type; + + if (mle->type == DLM_MLE_BLOCK) + mle_type = "BLK"; + else if (mle->type == DLM_MLE_MASTER) + mle_type = "MAS"; + else + mle_type = "MIG"; + + out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); + out += snprintf(buf + out, len - out, + "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", + mle_type, mle->master, mle->new_master, + !list_empty(&mle->hb_events), + !!mle->inuse, + atomic_read(&mle->mle_refs.refcount)); + + out += snprintf(buf + out, len - out, "Maybe="); + out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Vote="); + out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Response="); + out += 
stringify_nodemap(mle->response_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Node="); + out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +void dlm_print_one_mle(struct dlm_master_list_entry *mle) +{ + char *buf; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (buf) { + dump_mle(mle, buf, PAGE_SIZE - 1); + free_page((unsigned long)buf); + } +} + +#ifdef CONFIG_DEBUG_FS + +static struct dentry *dlm_debugfs_root = NULL; + +#define DLM_DEBUGFS_DIR "o2dlm" +#define DLM_DEBUGFS_DLM_STATE "dlm_state" +#define DLM_DEBUGFS_LOCKING_STATE "locking_state" +#define DLM_DEBUGFS_MLE_STATE "mle_state" +#define DLM_DEBUGFS_PURGE_LIST "purge_list" + +/* begin - utils funcs */ +static void dlm_debug_free(struct kref *kref) +{ + struct dlm_debug_ctxt *dc; + + dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); + + kfree(dc); +} + +static void dlm_debug_put(struct dlm_debug_ctxt *dc) +{ + if (dc) + kref_put(&dc->debug_refcnt, dlm_debug_free); +} + +static void dlm_debug_get(struct dlm_debug_ctxt *dc) +{ + kref_get(&dc->debug_refcnt); +} + +static int debug_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +static ssize_t debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +/* end - util funcs */ + +/* begin - purge list funcs */ +static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + struct dlm_lock_resource *res; + int out = 0; + unsigned long total = 0; + + out += snprintf(buf + out, len - out, + "Dumping Purgelist for Domain: %s\n", dlm->name); + + spin_lock(&dlm->spinlock); + list_for_each_entry(res, &dlm->purge_list, purge) { + ++total; + if (len - out < 100) + continue; + spin_lock(&res->spinlock); + out += stringify_lockname(res->lockname.name, + res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\t%ld\n", + (jiffies - res->last_used)/HZ); + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); + + return out; +} + +static int debug_purgelist_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_purgelist_fops = { + .open = debug_purgelist_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; +/* end - purge list funcs */ + +/* begin - debug mle funcs */ +static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + struct dlm_master_list_entry *mle; + struct hlist_head *bucket; + struct hlist_node *list; + int i, out = 0; + unsigned long total = 0, longest = 0, bucket_count = 0; + + out += snprintf(buf + out, len - out, + "Dumping MLEs for Domain: %s\n", dlm->name); + + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct 
dlm_master_list_entry, + master_hash_node); + ++total; + ++bucket_count; + if (len - out < 200) + continue; + out += dump_mle(mle, buf + out, len - out); + } + longest = max(longest, bucket_count); + bucket_count = 0; + } + spin_unlock(&dlm->master_lock); + + out += snprintf(buf + out, len - out, + "Total: %ld, Longest: %ld\n", total, longest); + return out; +} + +static int debug_mle_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_mle_fops = { + .open = debug_mle_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; + +/* end - debug mle funcs */ + +/* begin - debug lockres funcs */ +static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) +{ + int out; + +#define DEBUG_LOCK_VERSION 1 + spin_lock(&lock->spinlock); + out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + "%d,%d,%d,%d\n", + DEBUG_LOCK_VERSION, + list_type, lock->ml.type, lock->ml.convert_type, + lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + !list_empty(&lock->ast_list), + !list_empty(&lock->bast_list), + lock->ast_pending, lock->bast_pending, + lock->convert_pending, lock->lock_pending, + lock->cancel_pending, lock->unlock_pending, + atomic_read(&lock->lock_refs.refcount)); + spin_unlock(&lock->spinlock); + + return out; +} + +static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) +{ + struct dlm_lock *lock; + int i; + int out = 0; + + out += snprintf(buf + out, len - out, "NAME:"); + out += stringify_lockname(res->lockname.name, res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + +#define DEBUG_LRES_VERSION 1 + out += snprintf(buf + out, len - out, + "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", + DEBUG_LRES_VERSION, + res->owner, res->state, res->last_used, + !list_empty(&res->purge), + !list_empty(&res->dirty), + !list_empty(&res->recovering), + res->inflight_locks, res->migration_pending, + atomic_read(&res->asts_reserved), + atomic_read(&res->refs.refcount)); + + /* refmap */ + out += snprintf(buf + out, len - out, "RMAP:"); + out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* lvb */ + out += snprintf(buf + out, len - out, "LVBX:"); + for (i = 0; i < DLM_LVB_LEN; i++) + out += snprintf(buf + out, len - out, + "%02x", (unsigned char)res->lvb[i]); + out += snprintf(buf + out, len - out, "\n"); + + /* granted */ + list_for_each_entry(lock, &res->granted, list) + out += dump_lock(lock, 0, buf + out, len - out); + + /* converting */ + list_for_each_entry(lock, &res->converting, list) + out += dump_lock(lock, 1, buf + out, len - out); + + /* blocked */ + list_for_each_entry(lock, &res->blocked, list) + out += dump_lock(lock, 2, buf + out, len - out); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +static void *lockres_seq_start(struct seq_file *m, loff_t *pos) +{ + struct debug_lockres *dl = m->private; + struct dlm_ctxt *dlm = dl->dl_ctxt; + struct dlm_lock_resource *oldres = dl->dl_res; + struct dlm_lock_resource *res = NULL; + struct list_head *track_list; + + 
spin_lock(&dlm->track_lock); + if (oldres) + track_list = &oldres->tracking; + else { + track_list = &dlm->tracking_list; + if (list_empty(track_list)) { + dl = NULL; + spin_unlock(&dlm->track_lock); + goto bail; + } + } + + list_for_each_entry(res, track_list, tracking) { + if (&res->tracking == &dlm->tracking_list) + res = NULL; + else + dlm_lockres_get(res); + break; + } + spin_unlock(&dlm->track_lock); + + if (oldres) + dlm_lockres_put(oldres); + + dl->dl_res = res; + + if (res) { + spin_lock(&res->spinlock); + dump_lockres(res, dl->dl_buf, dl->dl_len - 1); + spin_unlock(&res->spinlock); + } else + dl = NULL; + +bail: + /* passed to seq_show */ + return dl; +} + +static void lockres_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return NULL; +} + +static int lockres_seq_show(struct seq_file *s, void *v) +{ + struct debug_lockres *dl = (struct debug_lockres *)v; + + seq_printf(s, "%s", dl->dl_buf); + + return 0; +} + +static const struct seq_operations debug_lockres_ops = { + .start = lockres_seq_start, + .stop = lockres_seq_stop, + .next = lockres_seq_next, + .show = lockres_seq_show, +}; + +static int debug_lockres_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + int ret = -ENOMEM; + struct seq_file *seq; + struct debug_lockres *dl = NULL; + + dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); + if (!dl) { + mlog_errno(ret); + goto bail; + } + + dl->dl_len = PAGE_SIZE; + dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); + if (!dl->dl_buf) { + mlog_errno(ret); + goto bail; + } + + ret = seq_open(file, &debug_lockres_ops); + if (ret) { + mlog_errno(ret); + goto bail; + } + + seq = file->private_data; + seq->private = dl; + + dlm_grab(dlm); + dl->dl_ctxt = dlm; + + return 0; +bail: + if (dl) + kfree(dl->dl_buf); + kfree(dl); + return ret; +} + +static int debug_lockres_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct debug_lockres *dl = (struct debug_lockres *)seq->private; + + if (dl->dl_res) + dlm_lockres_put(dl->dl_res); + dlm_put(dl->dl_ctxt); + kfree(dl->dl_buf); + return seq_release_private(inode, file); +} + +static const struct file_operations debug_lockres_fops = { + .open = debug_lockres_open, + .release = debug_lockres_release, + .read = seq_read, + .llseek = seq_lseek, +}; +/* end - debug lockres funcs */ + +/* begin - debug state funcs */ +static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + int out = 0; + struct dlm_reco_node_data *node; + char *state; + int cur_mles = 0, tot_mles = 0; + int i; + + spin_lock(&dlm->spinlock); + + switch (dlm->dlm_state) { + case DLM_CTXT_NEW: + state = "NEW"; break; + case DLM_CTXT_JOINED: + state = "JOINED"; break; + case DLM_CTXT_IN_SHUTDOWN: + state = "SHUTDOWN"; break; + case DLM_CTXT_LEAVING: + state = "LEAVING"; break; + default: + state = "UNKNOWN"; break; + } + + /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ + out += snprintf(buf + out, len - out, + "Domain: %s Key: 0x%08x Protocol: %d.%d\n", + dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + + /* Thread Pid: xxx Node: xxx State: xxxxx */ + out += snprintf(buf + out, len - out, + "Thread Pid: %d Node: %d State: %s\n", + task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); + + /* Number of Joins: xxx Joining Node: xxx */ + out += snprintf(buf + out, len - out, + "Number of Joins: %d Joining Node: %d\n", + dlm->num_joins, dlm->joining_node); + + 
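+ /*
+  * A sketch of the text built so far, with invented example values:
+  *
+  *   Domain: myfs Key: 0xdfbac769 Protocol: 1.2
+  *   Thread Pid: 1234 Node: 0 State: JOINED
+  *   Number of Joins: 1 Joining Node: 255
+  *
+  * Every line below is appended the same way, out += snprintf(buf + out,
+  * len - out, ...), into the single zeroed page allocated by
+  * debug_state_open().
+  */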
/* Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Domain Map: "); + out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Exit Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Live Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Live Map: "); + out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Lock Resources: xxx (xxx) */ + out += snprintf(buf + out, len - out, + "Lock Resources: %d (%d)\n", + atomic_read(&dlm->res_cur_count), + atomic_read(&dlm->res_tot_count)); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + tot_mles += atomic_read(&dlm->mle_tot_count[i]); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + cur_mles += atomic_read(&dlm->mle_cur_count[i]); + + /* MLEs: xxx (xxx) */ + out += snprintf(buf + out, len - out, + "MLEs: %d (%d)\n", cur_mles, tot_mles); + + /* Blocking: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Blocking: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); + + /* Mastery: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Mastery: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); + + /* Migration: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Migration: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); + + /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ + out += snprintf(buf + out, len - out, + "Lists: Dirty=%s Purge=%s PendingASTs=%s " + "PendingBASTs=%s\n", + (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), + (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); + + /* Purge Count: xxx Refs: xxx */ + out += snprintf(buf + out, len - out, + "Purge Count: %d Refs: %d\n", dlm->purge_count, + atomic_read(&dlm->dlm_refs.refcount)); + + /* Dead Node: xxx */ + out += snprintf(buf + out, len - out, + "Dead Node: %d\n", dlm->reco.dead_node); + + /* What about DLM_RECO_STATE_FINALIZE? 
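+ * (with the == test below, a domain whose recovery state is in the
+ * FINALIZE stage is simply reported as INACTIVE)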
*/ + if (dlm->reco.state == DLM_RECO_STATE_ACTIVE) + state = "ACTIVE"; + else + state = "INACTIVE"; + + /* Recovery Pid: xxxx Master: xxx State: xxxx */ + out += snprintf(buf + out, len - out, + "Recovery Pid: %d Master: %d State: %s\n", + task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.new_master, state); + + /* Recovery Map: xx xx */ + out += snprintf(buf + out, len - out, "Recovery Map: "); + out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Recovery Node State: */ + out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + list_for_each_entry(node, &dlm->reco.node_data, list) { + switch (node->state) { + case DLM_RECO_NODE_DATA_INIT: + state = "INIT"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + state = "REQUESTING"; + break; + case DLM_RECO_NODE_DATA_DEAD: + state = "DEAD"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + state = "RECEIVING"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + state = "REQUESTED"; + break; + case DLM_RECO_NODE_DATA_DONE: + state = "DONE"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + state = "FINALIZE-SENT"; + break; + default: + state = "BAD"; + break; + } + out += snprintf(buf + out, len - out, "\t%u - %s\n", + node->node_num, state); + } + + spin_unlock(&dlm->spinlock); + + return out; +} + +static int debug_state_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_state_fops = { + .open = debug_state_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; +/* end - debug state funcs */ + +/* files in subroot */ +int dlm_debug_init(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + /* for dumping dlm_ctxt */ + dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_state_fops); + if (!dc->debug_state_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres */ + dc->debug_lockres_dentry = + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_lockres_fops); + if (!dc->debug_lockres_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping mles */ + dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_mle_fops); + if (!dc->debug_mle_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres on the purge list */ + dc->debug_purgelist_dentry = + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_purgelist_fops); + if (!dc->debug_purgelist_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm_debug_get(dc); + return 0; + +bail: + dlm_debug_shutdown(dlm); + return -ENOMEM; +} + +void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + if (dc) { + debugfs_remove(dc->debug_purgelist_dentry); + debugfs_remove(dc->debug_mle_dentry); + debugfs_remove(dc->debug_lockres_dentry); + debugfs_remove(dc->debug_state_dentry); + dlm_debug_put(dc); + } +} + +/* subroot - domain dir */ +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ 
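+ /*
+  * Sketch of the hierarchy this builds, assuming debugfs is mounted at
+  * the usual /sys/kernel/debug:
+  *
+  *   o2dlm/<domain>/dlm_state      - debug_state_fops
+  *   o2dlm/<domain>/locking_state  - debug_lockres_fops
+  *   o2dlm/<domain>/mle_state      - debug_mle_fops
+  *   o2dlm/<domain>/purge_list     - debug_purgelist_fops
+  *
+  * Only the per-domain directory and the debug context are created
+  * here; the four files are added later by dlm_debug_init().
+  */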
+ dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, + dlm_debugfs_root); + if (!dlm->dlm_debugfs_subroot) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), + GFP_KERNEL); + if (!dlm->dlm_debug_ctxt) { + mlog_errno(-ENOMEM); + goto bail; + } + kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); + + return 0; +bail: + dlm_destroy_debugfs_subroot(dlm); + return -ENOMEM; +} + +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ + debugfs_remove(dlm->dlm_debugfs_subroot); +} + +/* debugfs root */ +int dlm_create_debugfs_root(void) +{ + dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); + if (!dlm_debugfs_root) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + return 0; +} + +void dlm_destroy_debugfs_root(void) +{ + debugfs_remove(dlm_debugfs_root); +} +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 00000000..1f27c481 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.h + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDEBUG_H +#define DLMDEBUG_H + +void dlm_print_one_mle(struct dlm_master_list_entry *mle); + +#ifdef CONFIG_DEBUG_FS + +struct dlm_debug_ctxt { + struct kref debug_refcnt; + struct dentry *debug_state_dentry; + struct dentry *debug_lockres_dentry; + struct dentry *debug_mle_dentry; + struct dentry *debug_purgelist_dentry; +}; + +struct debug_lockres { + int dl_len; + char *dl_buf; + struct dlm_ctxt *dl_ctxt; + struct dlm_lock_resource *dl_res; +}; + +int dlm_debug_init(struct dlm_ctxt *dlm); +void dlm_debug_shutdown(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_root(void); +void dlm_destroy_debugfs_root(void); + +#else + +static inline int dlm_debug_init(struct dlm_ctxt *dlm) +{ + return 0; +} +static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ +} +static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ + return 0; +} +static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ +} +static inline int dlm_create_debugfs_root(void) +{ + return 0; +} +static inline void dlm_destroy_debugfs_root(void) +{ +} + +#endif /* CONFIG_DEBUG_FS */ +#endif /* DLMDEBUG_H */ diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 00000000..6ed6b95d --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -0,0 +1,2397 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.c + * + * defines domain join / leave apis + * + * Copyright (C) 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#include "dlmver.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) +#include "cluster/masklog.h" + +/* + * ocfs2 node maps are array of long int, which limits to send them freely + * across the wire due to endianness issues. To workaround this, we convert + * long ints to byte arrays. Following 3 routines are helper functions to + * set/test/copy bits within those array of bytes + */ +static inline void byte_set_bit(u8 nr, u8 map[]) +{ + map[nr >> 3] |= (1UL << (nr & 7)); +} + +static inline int byte_test_bit(u8 nr, u8 map[]) +{ + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; +} + +static inline void byte_copymap(u8 dmap[], unsigned long smap[], + unsigned int sz) +{ + unsigned int nn; + + if (!sz) + return; + + memset(dmap, 0, ((sz + 7) >> 3)); + for (nn = 0 ; nn < sz; nn++) + if (test_bit(nn, smap)) + byte_set_bit(nn, dmap); +} + +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, (unsigned long)DLM_HASH_PAGES, + (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + +/* + * + * spinlock lock ordering: if multiple locks are needed, obey this ordering: + * dlm_domain_lock + * struct dlm_ctxt->spinlock + * struct dlm_lock_resource->spinlock + * struct dlm_ctxt->master_lock + * struct dlm_ctxt->ast_lock + * dlm_master_list_entry->spinlock + * dlm_lock->spinlock + * + */ + +DEFINE_SPINLOCK(dlm_domain_lock); +LIST_HEAD(dlm_domains); +static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); + +/* + * The supported protocol version for DLM communication. Running domains + * will have a negotiated version with the same major number and a minor + * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should + * be used to determine what a running domain is actually using. 
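+ * For example, a node built for the 1.2 protocol defined below that
+ * joins a domain already running 1.1 adopts the smaller minor and
+ * speaks 1.1 for the life of that domain.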
+ * + * New in version 1.1: + * - Message DLM_QUERY_REGION added to support global heartbeat + * - Message DLM_QUERY_NODEINFO added to allow online node removes + * New in version 1.2: + * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + */ +static const struct dlm_protocol_version dlm_protocol = { + .pv_major = 1, + .pv_minor = 2, +}; + +#define DLM_DOMAIN_BACKOFF_MS 200 + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_protocol_compare(struct dlm_protocol_version *existing, + struct dlm_protocol_version *request); + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); + +void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) +{ + if (!hlist_unhashed(&lockres->hash_node)) { + hlist_del_init(&lockres->hash_node); + dlm_lockres_put(lockres); + } +} + +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct hlist_head *bucket; + struct qstr *q; + + assert_spin_locked(&dlm->spinlock); + + q = &res->lockname; + bucket = dlm_lockres_hash(dlm, q->hash); + + /* get a reference for our hashtable */ + dlm_lockres_get(res); + + hlist_add_head(&res->hash_node, bucket); +} + +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct hlist_head *bucket; + struct hlist_node *list; + + mlog(0, "%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + bucket = dlm_lockres_hash(dlm, hash); + + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; + } + return NULL; +} + +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. 
dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog(0, "%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, len, hash); + spin_unlock(&dlm->spinlock); + return res; +} + +static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) +{ + struct dlm_ctxt *tmp = NULL; + struct list_head *iter; + + assert_spin_locked(&dlm_domain_lock); + + /* tmp->name here is always NULL terminated, + * but domain may not be! */ + list_for_each(iter, &dlm_domains) { + tmp = list_entry (iter, struct dlm_ctxt, list); + if (strlen(tmp->name) == len && + memcmp(tmp->name, domain, len)==0) + break; + tmp = NULL; + } + + return tmp; +} + +/* For null terminated domain strings ONLY */ +static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) +{ + assert_spin_locked(&dlm_domain_lock); + + return __dlm_lookup_domain_full(domain, strlen(domain)); +} + + +/* returns true on one of two conditions: + * 1) the domain does not exist + * 2) the domain exists and it's state is "joined" */ +static int dlm_wait_on_domain_helper(const char *domain) +{ + int ret = 0; + struct dlm_ctxt *tmp = NULL; + + spin_lock(&dlm_domain_lock); + + tmp = __dlm_lookup_domain(domain); + if (!tmp) + ret = 1; + else if (tmp->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + return ret; +} + +static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) +{ + dlm_destroy_debugfs_subroot(dlm); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + + if (dlm->name) + kfree(dlm->name); + + kfree(dlm); +} + +/* A little strange - this function will be called while holding + * dlm_domain_lock and is expected to be holding it on the way out. We + * will however drop and reacquire it multiple times */ +static void dlm_ctxt_release(struct kref *kref) +{ + struct dlm_ctxt *dlm; + + dlm = container_of(kref, struct dlm_ctxt, dlm_refs); + + BUG_ON(dlm->num_joins); + BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); + + /* we may still be in the list if we hit an error during join. */ + list_del_init(&dlm->list); + + spin_unlock(&dlm_domain_lock); + + mlog(0, "freeing memory from domain %s\n", dlm->name); + + wake_up(&dlm_domain_events); + + dlm_free_ctxt_mem(dlm); + + spin_lock(&dlm_domain_lock); +} + +void dlm_put(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm_domain_lock); + kref_put(&dlm->dlm_refs, dlm_ctxt_release); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_get(struct dlm_ctxt *dlm) +{ + kref_get(&dlm->dlm_refs); +} + +/* given a questionable reference to a dlm object, gets a reference if + * it can find it in the list, otherwise returns NULL in which case + * you shouldn't trust your pointer. 
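+ * Typical caller pattern, as in the message handlers further down:
+ *
+ *	if (!dlm_grab(dlm))
+ *		return 0;
+ *	...
+ *	dlm_put(dlm);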
*/ +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) +{ + struct list_head *iter; + struct dlm_ctxt *target = NULL; + + spin_lock(&dlm_domain_lock); + + list_for_each(iter, &dlm_domains) { + target = list_entry (iter, struct dlm_ctxt, list); + + if (target == dlm) { + __dlm_get(target); + break; + } + + target = NULL; + } + + spin_unlock(&dlm_domain_lock); + + return target; +} + +int dlm_domain_fully_joined(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm_domain_lock); + ret = (dlm->dlm_state == DLM_CTXT_JOINED) || + (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + +static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) +{ + dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); + + /* We've left the domain. Now we can take ourselves out of the + * list and allow the kref stuff to help us free the + * memory. */ + spin_lock(&dlm_domain_lock); + list_del_init(&dlm->list); + spin_unlock(&dlm_domain_lock); + + /* Wake up anyone waiting for us to remove this domain */ + wake_up(&dlm_domain_events); +} + +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) +{ + int i, num, n, ret = 0; + struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; + + mlog(0, "Migrating locks from domain %s\n", dlm->name); + + num = 0; + spin_lock(&dlm->spinlock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); + dlm_lockres_get(res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. */ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + if (dropped) + __dlm_lockres_calc_usage(dlm, res); + else + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + + dlm_lockres_put(res); + + if (dropped) { + cond_resched_lock(&dlm->spinlock); + goto redo_bucket; + } + } + cond_resched_lock(&dlm->spinlock); + num += n; + } + spin_unlock(&dlm->spinlock); + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } + mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; +} + +static int dlm_no_joining_node(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm->spinlock); + ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; + spin_unlock(&dlm->spinlock); + + return ret; +} + +static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node); + + spin_lock(&dlm->spinlock); + set_bit(node, dlm->exit_domain_map); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) +{ + /* Yikes, a double spinlock! 
I need domain_lock for the dlm + * state and the dlm spinlock for join state... Sorry! */ +again: + spin_lock(&dlm_domain_lock); + spin_lock(&dlm->spinlock); + + if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "Node %d is joining, we wait on it.\n", + dlm->joining_node); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); + goto again; + } + + dlm->dlm_state = DLM_CTXT_LEAVING; + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_print_nodes(struct dlm_ctxt *dlm) +{ + int node = -1; + + assert_spin_locked(&dlm->spinlock); + + printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); + + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + printk("%d ", node); + } + printk("\n"); +} + +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + mlog(0, "%p %u %p", msg, len, data); + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); + + spin_lock(&dlm->spinlock); + clear_bit(node, dlm->domain_map); + clear_bit(node, dlm->exit_domain_map); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, node, 0); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type, + unsigned int node) +{ + int status; + struct dlm_exit_domain leave_msg; + + mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name, + msg_type, node); + + memset(&leave_msg, 0, sizeof(leave_msg)); + leave_msg.node_idx = dlm->node_num; + + status = o2net_send_message(msg_type, dlm->key, &leave_msg, + sizeof(leave_msg), node, NULL); + if (status < 0) + mlog(ML_ERROR, "Error %d sending domain exit message %u " + "to node %u on domain %s\n", status, msg_type, node, + dlm->name); + + return status; +} + +static void dlm_begin_exit_domain(struct dlm_ctxt *dlm) +{ + int node = -1; + + /* Support for begin exit domain was added in 1.2 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor < 2) + return; + + /* + * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely + * informational. Meaning if a node does not receive the message, + * so be it. + */ + spin_lock(&dlm->spinlock); + while (1) { + node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1); + if (node >= O2NM_MAX_NODES) + break; + if (node == dlm->node_num) + continue; + + spin_unlock(&dlm->spinlock); + dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node); + spin_lock(&dlm->spinlock); + } + spin_unlock(&dlm->spinlock); +} + +static void dlm_leave_domain(struct dlm_ctxt *dlm) +{ + int node, clear_node, status; + + /* At this point we've migrated away all our locks and won't + * accept mastership of new ones. The dlm is responsible for + * almost nothing now. We make sure not to confuse any joining + * nodes and then commence shutdown procedure. */ + + spin_lock(&dlm->spinlock); + /* Clear ourselves from the domain map */ + clear_bit(dlm->node_num, dlm->domain_map); + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + 0)) < O2NM_MAX_NODES) { + /* Drop the dlm spinlock. This is safe wrt the domain_map. 
+ * -nodes cannot be added now as the + * query_join_handlers knows to respond with OK_NO_MAP + * -we catch the right network errors if a node is + * removed from the map while we're sending him the + * exit message. */ + spin_unlock(&dlm->spinlock); + + clear_node = 1; + + status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG, + node); + if (status < 0 && + status != -ENOPROTOOPT && + status != -ENOTCONN) { + mlog(ML_NOTICE, "Error %d sending domain exit message " + "to node %d\n", status, node); + + /* Not sure what to do here but lets sleep for + * a bit in case this was a transient + * error... */ + msleep(DLM_DOMAIN_BACKOFF_MS); + clear_node = 0; + } + + spin_lock(&dlm->spinlock); + /* If we're not clearing the node bit then we intend + * to loop back around to try again. */ + if (clear_node) + clear_bit(node, dlm->domain_map); + } + spin_unlock(&dlm->spinlock); +} + +int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +void dlm_unregister_domain(struct dlm_ctxt *dlm) +{ + int leave = 0; + struct dlm_lock_resource *res; + + spin_lock(&dlm_domain_lock); + BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); + BUG_ON(!dlm->num_joins); + + dlm->num_joins--; + if (!dlm->num_joins) { + /* We mark it "in shutdown" now so new register + * requests wait until we've completely left the + * domain. Don't use DLM_CTXT_LEAVING yet as we still + * want new domain joins to communicate with us at + * least until we've completed migration of our + * resources. */ + dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; + leave = 1; + } + spin_unlock(&dlm_domain_lock); + + if (leave) { + mlog(0, "shutting down domain %s\n", dlm->name); + dlm_begin_exit_domain(dlm); + + /* We changed dlm state, notify the thread */ + dlm_kick_thread(dlm, NULL); + + while (dlm_migrate_all_locks(dlm)) { + /* Give dlm_thread time to purge the lockres' */ + msleep(500); + mlog(0, "%s: more migration to do\n", dlm->name); + } + + /* This list should be empty. 
If not, print remaining lockres */ + if (!list_empty(&dlm->tracking_list)) { + mlog(ML_ERROR, "Following lockres' are still on the " + "tracking list:\n"); + list_for_each_entry(res, &dlm->tracking_list, tracking) + dlm_print_one_lock_resource(res); + } + + dlm_mark_domain_leaving(dlm); + dlm_leave_domain(dlm); + dlm_force_free_mles(dlm); + dlm_complete_dlm_shutdown(dlm); + } + dlm_put(dlm); +} +EXPORT_SYMBOL_GPL(dlm_unregister_domain); + +static int dlm_query_join_proto_check(char *proto_type, int node, + struct dlm_protocol_version *ours, + struct dlm_protocol_version *request) +{ + int rc; + struct dlm_protocol_version proto = *request; + + if (!dlm_protocol_compare(ours, &proto)) { + mlog(0, + "node %u wanted to join with %s locking protocol " + "%u.%u, we respond with %u.%u\n", + node, proto_type, + request->pv_major, + request->pv_minor, + proto.pv_major, proto.pv_minor); + request->pv_minor = proto.pv_minor; + rc = 0; + } else { + mlog(ML_NOTICE, + "Node %u wanted to join with %s locking " + "protocol %u.%u, but we have %u.%u, disallowing\n", + node, proto_type, + request->pv_major, + request->pv_minor, + ours->pv_major, + ours->pv_minor); + rc = 1; + } + + return rc; +} + +/* + * struct dlm_query_join_packet is made up of four one-byte fields. They + * are effectively in big-endian order already. However, little-endian + * machines swap them before putting the packet on the wire (because + * query_join's response is a status, and that status is treated as a u32 + * on the wire). Thus, a big-endian and little-endian machines will treat + * this structure differently. + * + * The solution is to have little-endian machines swap the structure when + * converting from the structure to the u32 representation. This will + * result in the structure having the correct format on the wire no matter + * the host endian format. + */ +static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet, + u32 *wire) +{ + union dlm_query_join_response response; + + response.packet = *packet; + *wire = cpu_to_be32(response.intval); +} + +static void dlm_query_join_wire_to_packet(u32 wire, + struct dlm_query_join_packet *packet) +{ + union dlm_query_join_response response; + + response.intval = cpu_to_be32(wire); + *packet = response.packet; +} + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_query_join_request *query; + struct dlm_query_join_packet packet = { + .code = JOIN_DISALLOW, + }; + struct dlm_ctxt *dlm = NULL; + u32 response; + u8 nodenum; + + query = (struct dlm_query_join_request *) msg->buf; + + mlog(0, "node %u wants to join domain %s\n", query->node_idx, + query->domain); + + /* + * If heartbeat doesn't consider the node live, tell it + * to back off and try again. This gives heartbeat a chance + * to catch up. + */ + if (!o2hb_check_node_heartbeating(query->node_idx)) { + mlog(0, "node %u is not in our live map yet\n", + query->node_idx); + + packet.code = JOIN_DISALLOW; + goto respond; + } + + packet.code = JOIN_OK_NO_MAP; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + if (!dlm) + goto unlock_respond; + + /* + * There is a small window where the joining node may not see the + * node(s) that just left but still part of the cluster. DISALLOW + * join request if joining node has different node map. 
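+ * (for instance, if our domain_map still contains node 3 but the
+ * joiner's copied-over live map does not, we answer JOIN_DISALLOW and
+ * let the joiner back off and retry)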
+ */ + nodenum=0; + while (nodenum < O2NM_MAX_NODES) { + if (test_bit(nodenum, dlm->domain_map)) { + if (!byte_test_bit(nodenum, query->node_map)) { + mlog(0, "disallow join as node %u does not " + "have node %u in its nodemap\n", + query->node_idx, nodenum); + packet.code = JOIN_DISALLOW; + goto unlock_respond; + } + } + nodenum++; + } + + /* Once the dlm ctxt is marked as leaving then we don't want + * to be put in someone's domain map. + * Also, explicitly disallow joining at certain troublesome + * times (ie. during recovery). */ + if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + int bit = query->node_idx; + spin_lock(&dlm->spinlock); + + if (dlm->dlm_state == DLM_CTXT_NEW && + dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { + /*If this is a brand new context and we + * haven't started our join process yet, then + * the other node won the race. */ + packet.code = JOIN_OK_NO_MAP; + } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* Disallow parallel joins. */ + packet.code = JOIN_DISALLOW; + } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "node %u trying to join, but recovery " + "is ongoing.\n", bit); + packet.code = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->recovery_map)) { + mlog(0, "node %u trying to join, but it " + "still needs recovery.\n", bit); + packet.code = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->domain_map)) { + mlog(0, "node %u trying to join, but it " + "is still in the domain! needs recovery?\n", + bit); + packet.code = JOIN_DISALLOW; + } else { + /* Alright we're fully a part of this domain + * so we keep some state as to who's joining + * and indicate to him that needs to be fixed + * up. */ + + /* Make sure we speak compatible locking protocols. */ + if (dlm_query_join_proto_check("DLM", bit, + &dlm->dlm_locking_proto, + &query->dlm_proto)) { + packet.code = JOIN_PROTOCOL_MISMATCH; + } else if (dlm_query_join_proto_check("fs", bit, + &dlm->fs_locking_proto, + &query->fs_proto)) { + packet.code = JOIN_PROTOCOL_MISMATCH; + } else { + packet.dlm_minor = query->dlm_proto.pv_minor; + packet.fs_minor = query->fs_proto.pv_minor; + packet.code = JOIN_OK; + __dlm_set_joining_node(dlm, query->node_idx); + } + } + + spin_unlock(&dlm->spinlock); + } +unlock_respond: + spin_unlock(&dlm_domain_lock); + +respond: + mlog(0, "We respond with %u\n", packet.code); + + dlm_query_join_packet_to_wire(&packet, &response); + return response; +} + +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_assert_joined *assert; + struct dlm_ctxt *dlm = NULL; + + assert = (struct dlm_assert_joined *) msg->buf; + + mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, + assert->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); + /* XXX should we consider no dlm ctxt an error? */ + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Alright, this node has officially joined our + * domain. Set him in the map and clean up our + * leftover join state. 
*/ + BUG_ON(dlm->joining_node != assert->node_idx); + set_bit(assert->node_idx, dlm->domain_map); + clear_bit(assert->node_idx, dlm->exit_domain_map); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", + assert->node_idx, dlm->name); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_match_regions(struct dlm_ctxt *dlm, + struct dlm_query_region *qr, + char *local, int locallen) +{ + char *remote = qr->qr_regions; + char *l, *r; + int localnr, i, j, foundit; + int status = 0; + + if (!o2hb_global_heartbeat_active()) { + if (qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Joining node %d has global " + "heartbeat enabled but local node %d does not\n", + qr->qr_domain, qr->qr_node, dlm->node_num); + status = -EINVAL; + } + goto bail; + } + + if (o2hb_global_heartbeat_active() && !qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Local node %d has global " + "heartbeat enabled but joining node %d does not\n", + qr->qr_domain, dlm->node_num, qr->qr_node); + status = -EINVAL; + goto bail; + } + + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r); + r += O2HB_MAX_REGION_NAME_LEN; + } + + localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN); + localnr = o2hb_get_all_regions(local, (u8)localnr); + + /* compare local regions with remote */ + l = local; + for (i = 0; i < localnr; ++i) { + foundit = 0; + r = remote; + for (j = 0; j <= qr->qr_numregions; ++j) { + if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in local node %d but not in joining node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l, + dlm->node_num, qr->qr_node); + goto bail; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + + /* compare remote with local regions */ + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + foundit = 0; + l = local; + for (j = 0; j < localnr; ++j) { + if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in joining node %d but not in local node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r, + qr->qr_node, dlm->node_num); + goto bail; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + +bail: + return status; +} + +static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_region *qr = NULL; + int status, ret = 0, i; + char *p; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); + if (!qr) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + qr->qr_node = dlm->node_num; + qr->qr_namelen = strlen(dlm->name); + memcpy(qr->qr_domain, dlm->name, qr->qr_namelen); + /* if local hb, the numregions will be zero */ + if (o2hb_global_heartbeat_active()) + qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions, + O2NM_MAX_REGIONS); + + p = qr->qr_regions; + for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN) + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p); + + i = -1; + while ((i = find_next_bit(node_map, 
O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending regions to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr, + sizeof(struct dlm_query_region), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "Region mismatch %d, node %d\n", + ret, i); + break; + } + } + +bail: + kfree(qr); + return ret; +} + +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_region *qr; + struct dlm_ctxt *dlm = NULL; + char *local = NULL; + int status = 0; + int locked = 0; + + qr = (struct dlm_query_region *) msg->buf; + + mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, + qr->qr_domain); + + /* buffer used in dlm_mast_regions() */ + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) { + status = -ENOMEM; + goto bail; + } + + status = -EINVAL; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "before join domain\n", qr->qr_node, qr->qr_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qr->qr_node) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but joining node is %d\n", qr->qr_node, qr->qr_domain, + dlm->joining_node); + goto bail; + } + + /* Support for global heartbeat was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but active dlm protocol is %d.%d\n", qr->qr_node, + qr->qr_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + kfree(local); + + return status; +} + +static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn) +{ + struct o2nm_node *local; + struct dlm_node_info *remote; + int i, j; + int status = 0; + + for (j = 0; j < qn->qn_numnodes; ++j) + mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum, + &(qn->qn_nodes[j].ni_ipv4_address), + ntohs(qn->qn_nodes[j].ni_ipv4_port)); + + for (i = 0; i < O2NM_MAX_NODES && !status; ++i) { + local = o2nm_get_node_by_num(i); + remote = NULL; + for (j = 0; j < qn->qn_numnodes; ++j) { + if (qn->qn_nodes[j].ni_nodenum == i) { + remote = &(qn->qn_nodes[j]); + break; + } + } + + if (!local && !remote) + continue; + + if ((local && !remote) || (!local && remote)) + status = -EINVAL; + + if (!status && + ((remote->ni_nodenum != local->nd_num) || + (remote->ni_ipv4_port != local->nd_ipv4_port) || + (remote->ni_ipv4_address != local->nd_ipv4_address))) + status = -EINVAL; + + if (status) { + if (remote && !local) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in joining node %d but not in " + "local node %d\n", qn->qn_domain, + remote->ni_nodenum, + &(remote->ni_ipv4_address), + ntohs(remote->ni_ipv4_port), + qn->qn_nodenum, dlm->node_num); + if (local && !remote) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in local node %d but not in " + "joining node %d\n", qn->qn_domain, + local->nd_num, &(local->nd_ipv4_address), + ntohs(local->nd_ipv4_port), + dlm->node_num, qn->qn_nodenum); + BUG_ON((!local && !remote)); + } + + if (local) + o2nm_node_put(local); + } + + return status; +} + +static int 
dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_nodeinfo *qn = NULL; + struct o2nm_node *node; + int ret = 0, status, count, i; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); + if (!qn) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) { + node = o2nm_get_node_by_num(i); + if (!node) + continue; + qn->qn_nodes[count].ni_nodenum = node->nd_num; + qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port; + qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address; + mlog(0, "Node %3d, %pI4:%u\n", node->nd_num, + &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port)); + ++count; + o2nm_node_put(node); + } + + qn->qn_nodenum = dlm->node_num; + qn->qn_numnodes = count; + qn->qn_namelen = strlen(dlm->name); + memcpy(qn->qn_domain, dlm->name, qn->qn_namelen); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending nodeinfo to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + qn, sizeof(struct dlm_query_nodeinfo), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i); + break; + } + } + +bail: + kfree(qn); + return ret; +} + +static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_nodeinfo *qn; + struct dlm_ctxt *dlm = NULL; + int locked = 0, status = -EINVAL; + + qn = (struct dlm_query_nodeinfo *) msg->buf; + + mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum, + qn->qn_domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s before " + "join domain\n", qn->qn_nodenum, qn->qn_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qn->qn_nodenum) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s but " + "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, + dlm->joining_node); + goto bail; + } + + /* Support for node query was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s " + "but active dlm protocol is %d.%d\n", qn->qn_nodenum, + qn->qn_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_nodes(dlm, qn); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_cancel_join *cancel; + struct dlm_ctxt *dlm = NULL; + + cancel = (struct dlm_cancel_join *) msg->buf; + + mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, + cancel->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); + + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Yikes, this guy wants to cancel his join. No + * problem, we simply cleanup our join state. 
*/ + BUG_ON(dlm->joining_node != cancel->node_idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_cancel_join cancel_msg; + + memset(&cancel_msg, 0, sizeof(cancel_msg)); + cancel_msg.node_idx = dlm->node_num; + cancel_msg.name_len = strlen(dlm->name); + memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); + + status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + &cancel_msg, sizeof(cancel_msg), node, + NULL); + if (status < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + node); + goto bail; + } + +bail: + return status; +} + +/* map_size should be in bytes. */ +static int dlm_send_join_cancels(struct dlm_ctxt *dlm, + unsigned long *node_map, + unsigned int map_size) +{ + int status, tmpstat; + unsigned int node; + + if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * + sizeof(unsigned long))) { + mlog(ML_ERROR, + "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", + map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES)); + return -EINVAL; + } + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + tmpstat = dlm_send_one_join_cancel(dlm, node); + if (tmpstat) { + mlog(ML_ERROR, "Error return %d cancelling join on " + "node %d\n", tmpstat, node); + if (!status) + status = tmpstat; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int dlm_request_join(struct dlm_ctxt *dlm, + int node, + enum dlm_query_join_response_code *response) +{ + int status; + struct dlm_query_join_request join_msg; + struct dlm_query_join_packet packet; + u32 join_resp; + + mlog(0, "querying node %d\n", node); + + memset(&join_msg, 0, sizeof(join_msg)); + join_msg.node_idx = dlm->node_num; + join_msg.name_len = strlen(dlm->name); + memcpy(join_msg.domain, dlm->name, join_msg.name_len); + join_msg.dlm_proto = dlm->dlm_locking_proto; + join_msg.fs_proto = dlm->fs_locking_proto; + + /* copy live node map to join message */ + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); + + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, + sizeof(join_msg), node, &join_resp); + if (status < 0 && status != -ENOPROTOOPT) { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + node); + goto bail; + } + dlm_query_join_wire_to_packet(join_resp, &packet); + + /* -ENOPROTOOPT from the net code means the other side isn't + listening for our message type -- that's fine, it means + his dlm isn't up, so we can consider him a 'yes' but not + joined into the domain. */ + if (status == -ENOPROTOOPT) { + status = 0; + *response = JOIN_OK_NO_MAP; + } else if (packet.code == JOIN_DISALLOW || + packet.code == JOIN_OK_NO_MAP) { + *response = packet.code; + } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. 
At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + *response = packet.code; + } else if (packet.code == JOIN_OK) { + *response = packet.code; + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + } else { + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + } + + mlog(0, "status %d, node %d response is %d\n", status, node, + *response); + +bail: + return status; +} + +static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_assert_joined assert_msg; + + mlog(0, "Sending join assert to node %u\n", node); + + memset(&assert_msg, 0, sizeof(assert_msg)); + assert_msg.node_idx = dlm->node_num; + assert_msg.name_len = strlen(dlm->name); + memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); + + status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + &assert_msg, sizeof(assert_msg), node, + NULL); + if (status < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + node); + + return status; +} + +static void dlm_send_join_asserts(struct dlm_ctxt *dlm, + unsigned long *node_map) +{ + int status, node, live; + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + do { + /* It is very important that this message be + * received so we spin until either the node + * has died or it gets the message. */ + status = dlm_send_one_join_assert(dlm, node); + + spin_lock(&dlm->spinlock); + live = test_bit(node, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + + if (status) { + mlog(ML_ERROR, "Error return %d asserting " + "join on node %d\n", status, node); + + /* give us some time between errors... 
*/ + if (live) + msleep(DLM_DOMAIN_BACKOFF_MS); + } + } while (status && live); + } +} + +struct domain_join_ctxt { + unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +static int dlm_should_restart_join(struct dlm_ctxt *dlm, + struct domain_join_ctxt *ctxt, + enum dlm_query_join_response_code response) +{ + int ret; + + if (response == JOIN_DISALLOW) { + mlog(0, "Latest response of disallow -- should restart\n"); + return 1; + } + + spin_lock(&dlm->spinlock); + /* For now, we restart the process if the node maps have + * changed at all */ + ret = memcmp(ctxt->live_map, dlm->live_nodes_map, + sizeof(dlm->live_nodes_map)); + spin_unlock(&dlm->spinlock); + + if (ret) + mlog(0, "Node maps changed -- should restart\n"); + + return ret; +} + +static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) +{ + int status = 0, tmpstat, node; + struct domain_join_ctxt *ctxt; + enum dlm_query_join_response_code response = JOIN_DISALLOW; + + mlog(0, "%p", dlm); + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* group sem locking should work for us here -- we're already + * registered for heartbeat events so filling this should be + * atomic wrt getting those handlers called. */ + o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + + spin_lock(&dlm->spinlock); + memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); + + __dlm_set_joining_node(dlm, dlm->node_num); + + spin_unlock(&dlm->spinlock); + + node = -1; + while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + status = dlm_request_join(dlm, node, &response); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Ok, either we got a response or the node doesn't have a + * dlm up. */ + if (response == JOIN_OK) + set_bit(node, ctxt->yes_resp_map); + + if (dlm_should_restart_join(dlm, ctxt, response)) { + status = -EAGAIN; + goto bail; + } + } + + mlog(0, "Yay, done querying nodes!\n"); + + /* Yay, everyone agree's we can join the domain. My domain is + * comprised of all nodes who were put in the + * yes_resp_map. Copy that into our domain map and send a join + * assert message to clean up everyone elses state. */ + spin_lock(&dlm->spinlock); + memcpy(dlm->domain_map, ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + set_bit(dlm->node_num, dlm->domain_map); + spin_unlock(&dlm->spinlock); + + /* Support for global heartbeat and node info was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major > 1 || + dlm->dlm_locking_proto.pv_minor > 0) { + status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + status = dlm_send_regions(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + } + + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); + + /* Joined state *must* be set before the joining node + * information, otherwise the query_join handler may read no + * current joiner but a state of NEW and tell joining nodes + * we're not in the domain. */ + spin_lock(&dlm_domain_lock); + dlm->dlm_state = DLM_CTXT_JOINED; + dlm->num_joins++; + spin_unlock(&dlm_domain_lock); + +bail: + spin_lock(&dlm->spinlock); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + if (!status) + __dlm_print_nodes(dlm); + spin_unlock(&dlm->spinlock); + + if (ctxt) { + /* Do we need to send a cancel message to any nodes? 
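+ * Only if the join failed partway through: every node that already
+ * answered JOIN_OK still has us recorded as its joining node, and the
+ * cancel below tells it to clear that state.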
*/ + if (status < 0) { + tmpstat = dlm_send_join_cancels(dlm, + ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + if (tmpstat < 0) + mlog_errno(tmpstat); + } + kfree(ctxt); + } + + mlog(0, "returning %d\n", status); + return status; +} + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) +{ + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down); + o2net_unregister_handler_list(&dlm->dlm_domain_handlers); +} + +static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) +{ + int status; + + mlog(0, "registering handlers.\n"); + + o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, + dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); + if (status) + goto bail; + + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, + sizeof(struct dlm_master_request), + dlm_master_request_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, + sizeof(struct dlm_assert_master), + dlm_assert_master_handler, + dlm, dlm_assert_master_post_handler, + &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, + sizeof(struct dlm_create_lock), + dlm_create_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, + DLM_CONVERT_LOCK_MAX_LEN, + dlm_convert_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, + DLM_UNLOCK_LOCK_MAX_LEN, + dlm_unlock_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, + DLM_PROXY_AST_MAX_LEN, + dlm_proxy_ast_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, + sizeof(struct dlm_migrate_request), + dlm_migrate_request_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, + DLM_MIG_LOCKRES_MAX_LEN, + dlm_mig_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, + sizeof(struct dlm_master_requery), + dlm_master_requery_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, + sizeof(struct dlm_lock_request), + dlm_request_all_locks_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, + sizeof(struct dlm_reco_data_done), + dlm_reco_data_done_handler, + dlm, 
NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, + sizeof(struct dlm_begin_reco), + dlm_begin_reco_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, + sizeof(struct dlm_finalize_reco), + dlm_finalize_reco_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_begin_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + +bail: + if (status) + dlm_unregister_domain_handlers(dlm); + + return status; +} + +static int dlm_join_domain(struct dlm_ctxt *dlm) +{ + int status; + unsigned int backoff; + unsigned int total_backoff = 0; + + BUG_ON(!dlm); + + mlog(0, "Join domain %s\n", dlm->name); + + status = dlm_register_domain_handlers(dlm); + if (status) { + mlog_errno(status); + goto bail; + } + + status = dlm_debug_init(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_recovery_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + do { + status = dlm_try_to_join_domain(dlm); + + /* If we're racing another node to the join, then we + * need to back off temporarily and let them + * complete. */ +#define DLM_JOIN_TIMEOUT_MSECS 90000 + if (status == -EAGAIN) { + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + if (total_backoff > + msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + status = -ERESTARTSYS; + mlog(ML_NOTICE, "Timed out joining dlm domain " + "%s after %u msecs\n", dlm->name, + jiffies_to_msecs(total_backoff)); + goto bail; + } + + /* + * After you! + * No, after you! + * I insist! + * But you first! + * ... 
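+ * (each contender sleeps a jiffies-derived 0-3x multiple of DLM_DOMAIN_BACKOFF_MS before retrying)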
+ */ + backoff = (unsigned int)(jiffies & 0x3); + backoff *= DLM_DOMAIN_BACKOFF_MS; + total_backoff += backoff; + mlog(0, "backoff %d\n", backoff); + msleep(backoff); + } + } while (status == -EAGAIN); + + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + wake_up(&dlm_domain_events); + + if (status) { + dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); + } + + return status; +} + +static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, + u32 key) +{ + int i; + int ret; + struct dlm_ctxt *dlm = NULL; + + dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); + if (!dlm) { + mlog_errno(-ENOMEM); + goto leave; + } + + dlm->name = kstrdup(domain, GFP_KERNEL); + if (dlm->name == NULL) { + mlog_errno(-ENOMEM); + kfree(dlm); + dlm = NULL; + goto leave; + } + + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->lockres_hash) { + mlog_errno(-ENOMEM); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); + + dlm->master_hash = (struct hlist_head **) + dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->master_hash) { + mlog_errno(-ENOMEM); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); + + dlm->key = key; + dlm->node_num = o2nm_this_node(); + + ret = dlm_create_debugfs_subroot(dlm); + if (ret < 0) { + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + spin_lock_init(&dlm->spinlock); + spin_lock_init(&dlm->master_lock); + spin_lock_init(&dlm->ast_lock); + spin_lock_init(&dlm->track_lock); + INIT_LIST_HEAD(&dlm->list); + INIT_LIST_HEAD(&dlm->dirty_list); + INIT_LIST_HEAD(&dlm->reco.resources); + INIT_LIST_HEAD(&dlm->reco.received); + INIT_LIST_HEAD(&dlm->reco.node_data); + INIT_LIST_HEAD(&dlm->purge_list); + INIT_LIST_HEAD(&dlm->dlm_domain_handlers); + INIT_LIST_HEAD(&dlm->tracking_list); + dlm->reco.state = 0; + + INIT_LIST_HEAD(&dlm->pending_asts); + INIT_LIST_HEAD(&dlm->pending_basts); + + mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", + dlm->recovery_map, &(dlm->recovery_map[0])); + + memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); + memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); + memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + + dlm->dlm_thread_task = NULL; + dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; + init_waitqueue_head(&dlm->dlm_thread_wq); + init_waitqueue_head(&dlm->dlm_reco_thread_wq); + init_waitqueue_head(&dlm->reco.event); + init_waitqueue_head(&dlm->ast_wq); + init_waitqueue_head(&dlm->migration_wq); + INIT_LIST_HEAD(&dlm->mle_hb_events); + + dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; + init_waitqueue_head(&dlm->dlm_join_events); + + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + + atomic_set(&dlm->res_tot_count, 0); + atomic_set(&dlm->res_cur_count, 0); + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) { + atomic_set(&dlm->mle_tot_count[i], 0); + atomic_set(&dlm->mle_cur_count[i], 0); + } + + spin_lock_init(&dlm->work_lock); + INIT_LIST_HEAD(&dlm->work_list); + INIT_WORK(&dlm->dispatched_work, 
dlm_dispatch_work); + + kref_init(&dlm->dlm_refs); + dlm->dlm_state = DLM_CTXT_NEW; + + INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); + + mlog(0, "context init: refcount %u\n", + atomic_read(&dlm->dlm_refs.refcount)); + +leave: + return dlm; +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. + */ +static int dlm_protocol_compare(struct dlm_protocol_version *existing, + struct dlm_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +/* + * dlm_register_domain: one-time setup per "domain". + * + * The filesystem passes in the requested locking version via proto. + * If registration was successful, proto will contain the negotiated + * locking protocol. + */ +struct dlm_ctxt * dlm_register_domain(const char *domain, + u32 key, + struct dlm_protocol_version *fs_proto) +{ + int ret; + struct dlm_ctxt *dlm = NULL; + struct dlm_ctxt *new_ctxt = NULL; + + if (strlen(domain) >= O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + mlog(ML_ERROR, "domain name length too long\n"); + goto leave; + } + + if (!o2hb_check_local_node_heartbeating()) { + mlog(ML_ERROR, "the local node has not been configured, or is " + "not heartbeating\n"); + ret = -EPROTO; + goto leave; + } + + mlog(0, "register called for domain \"%s\"\n", domain); + +retry: + dlm = NULL; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + mlog_errno(ret); + goto leave; + } + + spin_lock(&dlm_domain_lock); + + dlm = __dlm_lookup_domain(domain); + if (dlm) { + if (dlm->dlm_state != DLM_CTXT_JOINED) { + spin_unlock(&dlm_domain_lock); + + mlog(0, "This ctxt is not joined yet!\n"); + wait_event_interruptible(dlm_domain_events, + dlm_wait_on_domain_helper( + domain)); + goto retry; + } + + if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { + spin_unlock(&dlm_domain_lock); + mlog(ML_ERROR, + "Requested locking protocol version is not " + "compatible with already registered domain " + "\"%s\"\n", domain); + ret = -EPROTO; + goto leave; + } + + __dlm_get(dlm); + dlm->num_joins++; + + spin_unlock(&dlm_domain_lock); + + ret = 0; + goto leave; + } + + /* doesn't exist */ + if (!new_ctxt) { + spin_unlock(&dlm_domain_lock); + + new_ctxt = dlm_alloc_ctxt(domain, key); + if (new_ctxt) + goto retry; + + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + /* a little variable switch-a-roo here... */ + dlm = new_ctxt; + new_ctxt = NULL; + + /* add the new domain */ + list_add_tail(&dlm->list, &dlm_domains); + spin_unlock(&dlm_domain_lock); + + /* + * Pass the locking protocol version into the join. If the join + * succeeds, it will have the negotiated protocol set. 
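+ * (dlm_request_join() adopts the minor version reported by each responding node)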
+ */ + dlm->dlm_locking_proto = dlm_protocol; + dlm->fs_locking_proto = *fs_proto; + + ret = dlm_join_domain(dlm); + if (ret) { + mlog_errno(ret); + dlm_put(dlm); + goto leave; + } + + /* Tell the caller what locking protocol we negotiated */ + *fs_proto = dlm->fs_locking_proto; + + ret = 0; +leave: + if (new_ctxt) + dlm_free_ctxt_mem(new_ctxt); + + if (ret < 0) + dlm = ERR_PTR(ret); + + return dlm; +} +EXPORT_SYMBOL_GPL(dlm_register_domain); + +static LIST_HEAD(dlm_join_handlers); + +static void dlm_unregister_net_handlers(void) +{ + o2net_unregister_handler_list(&dlm_join_handlers); +} + +static int dlm_register_net_handlers(void) +{ + int status = 0; + + status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_query_join_request), + dlm_query_join_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + sizeof(struct dlm_assert_joined), + dlm_assert_joined_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_cancel_join), + dlm_cancel_join_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY, + sizeof(struct dlm_query_region), + dlm_query_region_handler, + NULL, NULL, &dlm_join_handlers); + + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + sizeof(struct dlm_query_nodeinfo), + dlm_query_nodeinfo_handler, + NULL, NULL, &dlm_join_handlers); +bail: + if (status < 0) + dlm_unregister_net_handlers(); + + return status; +} + +/* Domain eviction callback handling. + * + * The file system requires notification of node death *before* the + * dlm completes its recovery work, otherwise it may be able to + * acquire locks on resources requiring recovery. Since the dlm can + * evict a node from its domain *before* heartbeat fires, a similar + * mechanism is required. */ + +/* Eviction is not expected to happen often, so a per-domain lock is + * not necessary. Eviction callbacks are allowed to sleep for short + * periods of time. 
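+ * A single file-scope rwsem (dlm_callback_sem) protects the callback lists of all domains.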
*/ +static DECLARE_RWSEM(dlm_callback_sem); + +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num) +{ + struct list_head *iter; + struct dlm_eviction_cb *cb; + + down_read(&dlm_callback_sem); + list_for_each(iter, &dlm->dlm_eviction_callbacks) { + cb = list_entry(iter, struct dlm_eviction_cb, ec_item); + + cb->ec_func(node_num, cb->ec_data); + } + up_read(&dlm_callback_sem); +} + +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data) +{ + INIT_LIST_HEAD(&cb->ec_item); + cb->ec_func = f; + cb->ec_data = data; +} +EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); + +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); + +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_del_init(&cb->ec_item); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); + +static int __init dlm_init(void) +{ + int status; + + dlm_print_version(); + + status = dlm_init_mle_cache(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); + goto error; + } + + status = dlm_init_master_caches(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_lockres and " + "o2dlm_lockname slabcaches\n"); + goto error; + } + + status = dlm_init_lock_cache(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_lock slabcache\n"); + goto error; + } + + status = dlm_register_net_handlers(); + if (status) { + mlog(ML_ERROR, "Unable to register network handlers\n"); + goto error; + } + + status = dlm_create_debugfs_root(); + if (status) + goto error; + + return 0; +error: + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); + return -1; +} + +static void __exit dlm_exit (void) +{ + dlm_destroy_debugfs_root(); + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(dlm_init); +module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h new file mode 100644 index 00000000..2f7f60bf --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMDOMAIN_H +#define DLMDOMAIN_H + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +int dlm_joined(struct dlm_ctxt *dlm); +int dlm_shutting_down(struct dlm_ctxt *dlm); +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num); + +#endif diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c new file mode 100644 index 00000000..8d39e0fd --- /dev/null +++ b/fs/ocfs2/dlm/dlmlock.c @@ -0,0 +1,767 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmlock.c + * + * underlying calls for lock creation + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static struct kmem_cache *dlm_lock_cache = NULL; + +static DEFINE_SPINLOCK(dlm_cookie_lock); +static u64 dlm_next_cookie = 1; + +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags); +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie); +static void dlm_lock_release(struct kref *kref); +static void dlm_lock_detach_lockres(struct dlm_lock *lock); + +int dlm_init_lock_cache(void) +{ + dlm_lock_cache = kmem_cache_create("o2dlm_lock", + sizeof(struct dlm_lock), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (dlm_lock_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_lock_cache(void) +{ + if (dlm_lock_cache) + kmem_cache_destroy(dlm_lock_cache); +} + +/* Tell us whether we can grant a new lock request. + * locking: + * caller needs: res->spinlock + * taken: none + * held on exit: none + * returns: 1 if the lock can be granted, 0 otherwise. 
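+ * (grantable only when compatible with every lock already on the granted and converting queues)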
+ */ +static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + if (!dlm_lock_compatible(tmplock->ml.convert_type, + lock->ml.type)) + return 0; + } + + return 1; +} + +/* performs lock creation at the lockres master site + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_NOTQUEUED + */ +static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status = DLM_NORMAL; + + mlog(0, "type=%d\n", lock->ml.type); + + spin_lock(&res->spinlock); + /* if called from dlm_create_lock_handler, need to + * ensure it will not sleep in dlm_wait_on_lockres */ + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL && + lock->ml.node != dlm->node_num) { + /* erf. state changed after lock was dropped. */ + spin_unlock(&res->spinlock); + dlm_error(status); + return status; + } + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + + if (dlm_can_grant_new_lock(res, lock)) { + mlog(0, "I can grant this lock right away\n"); + /* got it right away */ + lock->lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + + /* for the recovery lock, we can't allow the ast + * to be queued since the dlmthread is already + * frozen. 
but the recovery lock is always locked + * with LKM_NOQUEUE so we do not need the ast in + * this special case */ + if (!dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + kick_thread = 1; + call_ast = 1; + } else { + mlog(0, "%s: returning DLM_NORMAL to " + "node %u for reco lock\n", dlm->name, + lock->ml.node); + } + } else { + /* for NOQUEUE request, unless we get the + * lock right away, return DLM_NOTQUEUED */ + if (flags & LKM_NOQUEUE) { + status = DLM_NOTQUEUED; + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + mlog(0, "%s: returning NOTQUEUED to " + "node %u for reco lock\n", dlm->name, + lock->ml.node); + } + } else { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + kick_thread = 1; + } + } + /* reduce the inflight count, this may result in the lockres + * being purged below during calc_usage */ + if (lock->ml.node == dlm->node_num) + dlm_lockres_drop_inflight_ref(dlm, res); + + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + dlm_lockres_calc_usage(dlm, res); + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* remove from local queue if it failed */ + list_del_init(&lock->list); + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; +} + + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_DENIED, DLM_RECOVERING, or net status + */ +static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; + + mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n", + lock->ml.type, res->lockname.len, + res->lockname.name, flags); + + spin_lock(&res->spinlock); + + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + /* add lock to local (secondary) queue */ + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + lock->lock_pending = 1; + spin_unlock(&res->spinlock); + + /* spec seems to say that you will get DLM_NORMAL when the lock + * has been queued, meaning we need to wait for a reply here. */ + status = dlm_send_remote_lock_request(dlm, res, lock, flags); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->lock_pending = 0; + if (status != DLM_NORMAL) { + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; + dlm_error(status); + } + dlm_revert_pending_lock(res, lock); + dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. 
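+ * (the lock is moved onto res->granted immediately below)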
*/ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_move_tail(&lock->list, &res->granted); + } + spin_unlock(&res->spinlock); + + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); + + wake_up(&res->wq); + return status; +} + + +/* for remote lock creation. + * locking: + * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, or net status + */ +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + struct dlm_create_lock create; + int tmpret, status = 0; + enum dlm_status ret; + + memset(&create, 0, sizeof(create)); + create.node_idx = dlm->node_num; + create.requested_type = lock->ml.type; + create.cookie = lock->ml.cookie; + create.namelen = res->lockname.len; + create.flags = cpu_to_be32(flags); + memcpy(create.name, res->lockname.name, create.namelen); + + tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, + sizeof(create), res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, + res->owner); + if (dlm_is_host_down(tmpret)) { + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from lock message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +void dlm_lock_get(struct dlm_lock *lock) +{ + kref_get(&lock->lock_refs); +} + +void dlm_lock_put(struct dlm_lock *lock) +{ + kref_put(&lock->lock_refs, dlm_lock_release); +} + +static void dlm_lock_release(struct kref *kref) +{ + struct dlm_lock *lock; + + lock = container_of(kref, struct dlm_lock, lock_refs); + + BUG_ON(!list_empty(&lock->list)); + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + + dlm_lock_detach_lockres(lock); + + if (lock->lksb_kernel_allocated) { + mlog(0, "freeing kernel-allocated lksb\n"); + kfree(lock->lksb); + } + kmem_cache_free(dlm_lock_cache, lock); +} + +/* associate a lock with its lockres, getting a ref on the lockres */ +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res) +{ + dlm_lockres_get(res); + lock->lockres = res; +} + +/* drop ref on lockres, if there is still one associated with lock */ +static void dlm_lock_detach_lockres(struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + res = lock->lockres; + if (res) { + lock->lockres = NULL; + mlog(0, "removing lock's lockres reference\n"); + dlm_lockres_put(res); + } +} + +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie) +{ + INIT_LIST_HEAD(&newlock->list); + INIT_LIST_HEAD(&newlock->ast_list); + INIT_LIST_HEAD(&newlock->bast_list); + spin_lock_init(&newlock->spinlock); + newlock->ml.type = type; + newlock->ml.convert_type = LKM_IVMODE; + newlock->ml.highest_blocked = LKM_IVMODE; + newlock->ml.node = node; + newlock->ml.pad1 = 0; + newlock->ml.list = 0; + 
newlock->ml.flags = 0; + newlock->ast = NULL; + newlock->bast = NULL; + newlock->astdata = NULL; + newlock->ml.cookie = cpu_to_be64(cookie); + newlock->ast_pending = 0; + newlock->bast_pending = 0; + newlock->convert_pending = 0; + newlock->lock_pending = 0; + newlock->unlock_pending = 0; + newlock->cancel_pending = 0; + newlock->lksb_kernel_allocated = 0; + + kref_init(&newlock->lock_refs); +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb) +{ + struct dlm_lock *lock; + int kernel_allocated = 0; + + lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); + if (!lock) + return NULL; + + if (!lksb) { + /* zero memory only if kernel-allocated */ + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) { + kfree(lock); + return NULL; + } + kernel_allocated = 1; + } + + dlm_init_lock(lock, type, node, cookie); + if (kernel_allocated) + lock->lksb_kernel_allocated = 1; + lock->lksb = lksb; + lksb->lockid = lock; + return lock; +} + +/* handler for lock creation net message + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED + */ +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + enum dlm_status status = DLM_NORMAL; + char *name; + unsigned int namelen; + + BUG_ON(!dlm); + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + name = create->name; + namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX) { + dlm_error(status); + goto leave; + } + + status = DLM_SYSERR; + newlock = dlm_new_lock(create->requested_type, + create->node_idx, + be64_to_cpu(create->cookie), NULL); + if (!newlock) { + dlm_error(status); + goto leave; + } + + lksb = newlock->lksb; + + if (be32_to_cpu(create->flags) & LKM_GET_LVB) { + lksb->flags |= DLM_LKSB_GET_LVB; + mlog(0, "set DLM_LKSB_GET_LVB flag\n"); + } + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, name, namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + mlog(0, "lockres recovering/migrating/in-progress\n"); + goto leave; + } + + dlm_lock_attach_lockres(newlock, res); + + status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags)); +leave: + if (status != DLM_NORMAL) + if (newlock) + dlm_lock_put(newlock); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */ +static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) +{ + u64 tmpnode = node_num; + + /* shift single byte of node num into top 8 bits */ + tmpnode <<= 56; + + spin_lock(&dlm_cookie_lock); + *cookie = (dlm_next_cookie | tmpnode); + if (++dlm_next_cookie & 0xff00000000000000ull) { + mlog(0, "This node's cookie will now wrap!\n"); + dlm_next_cookie = 1; + } + spin_unlock(&dlm_cookie_lock); +} + +enum dlm_status dlmlock(struct dlm_ctxt 
*dlm, int mode, + struct dlm_lockstatus *lksb, int flags, + const char *name, int namelen, dlm_astlockfunc_t *ast, + void *data, dlm_bastlockfunc_t *bast) +{ + enum dlm_status status; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + int convert = 0, recovery = 0; + + /* yes this function is a mess. + * TODO: clean this up. lots of common code in the + * lock and convert paths, especially in the retry blocks */ + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + status = DLM_BADPARAM; + if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) { + dlm_error(status); + goto error; + } + + if (flags & ~LKM_VALID_FLAGS) { + dlm_error(status); + goto error; + } + + convert = (flags & LKM_CONVERT); + recovery = (flags & LKM_RECOVERY); + + if (recovery && + (!dlm_is_recovery_lock(name, namelen) || convert) ) { + dlm_error(status); + goto error; + } + if (convert && (flags & LKM_LOCAL)) { + mlog(ML_ERROR, "strange LOCAL convert request!\n"); + goto error; + } + + if (convert) { + /* CONVERT request */ + + /* if converting, must pass in a valid dlm_lock */ + lock = lksb->lockid; + if (!lock) { + mlog(ML_ERROR, "NULL lock pointer in convert " + "request\n"); + goto error; + } + + res = lock->lockres; + if (!res) { + mlog(ML_ERROR, "NULL lockres pointer in convert " + "request\n"); + goto error; + } + dlm_lockres_get(res); + + /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are + * static after the original lock call. convert requests will + * ensure that everything is the same, or return DLM_BADARGS. + * this means that DLM_DENIED_NOASTS will never be returned. + */ + if (lock->lksb != lksb || lock->ast != ast || + lock->bast != bast || lock->astdata != data) { + status = DLM_BADARGS; + mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lksb, ast, bast, data); + mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lock->lksb, lock->ast, + lock->bast, lock->astdata); + goto error; + } +retry_convert: + dlm_wait_for_recovery(dlm); + + if (res->owner == dlm->node_num) + status = dlmconvert_master(dlm, res, lock, flags, mode); + else + status = dlmconvert_remote(dlm, res, lock, flags, mode); + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* for now, see how this works without sleeping + * and just retry right away. 
I suspect the reco + * or migration will complete fast enough that + * no waiting will be necessary */ + mlog(0, "retrying convert with migration/recovery/" + "in-progress\n"); + msleep(100); + goto retry_convert; + } + } else { + u64 tmpcookie; + + /* LOCK request */ + status = DLM_BADARGS; + if (!name) { + dlm_error(status); + goto error; + } + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) { + dlm_error(status); + goto error; + } + + dlm_get_next_cookie(dlm->node_num, &tmpcookie); + lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb); + if (!lock) { + dlm_error(status); + goto error; + } + + if (!recovery) + dlm_wait_for_recovery(dlm); + + /* find or create the lock resource */ + res = dlm_get_lock_resource(dlm, name, namelen, flags); + if (!res) { + status = DLM_IVLOCKID; + dlm_error(status); + goto error; + } + + mlog(0, "type=%d, flags = 0x%x\n", mode, flags); + mlog(0, "creating lock: lock=%p res=%p\n", lock, res); + + dlm_lock_attach_lockres(lock, res); + lock->ast = ast; + lock->bast = bast; + lock->astdata = data; + +retry_lock: + if (flags & LKM_VALBLK) { + mlog(0, "LKM_VALBLK passed by caller\n"); + + /* LVB requests for non PR, PW or EX locks are + * ignored. */ + if (mode < LKM_PRMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + + if (res->owner == dlm->node_num) + status = dlmlock_master(dlm, res, lock, flags); + else + status = dlmlock_remote(dlm, res, lock, flags); + + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + mlog(0, "retrying lock with migration/" + "recovery/in progress\n"); + msleep(100); + /* no waiting for dlm_reco_thread */ + if (recovery) { + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } else { + dlm_wait_for_recovery(dlm); + goto retry_lock; + } + } + + if (status != DLM_NORMAL) { + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; + if (status != DLM_NOTQUEUED) + dlm_error(status); + goto error; + } + } + +error: + if (status != DLM_NORMAL) { + if (lock && !convert) + dlm_lock_put(lock); + // this is kind of unnecessary + lksb->status = status; + } + + /* put lockres ref from the convert path + * or from dlm_get_lock_resource */ + if (res) + dlm_lockres_put(res); + + return status; +} +EXPORT_SYMBOL_GPL(dlmlock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c new file mode 100644 index 00000000..11eefb8c --- /dev/null +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -0,0 +1,3413 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmmod.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) +#include "cluster/masklog.h" + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); + +static inline int dlm_mle_equal(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + const char *name, + unsigned int namelen) +{ + if (dlm != mle->dlm) + return 0; + + if (namelen != mle->mnamelen || + memcmp(name, mle->mname, namelen) != 0) + return 0; + + return 1; +} + +static struct kmem_cache *dlm_lockres_cache = NULL; +static struct kmem_cache *dlm_lockname_cache = NULL; +static struct kmem_cache *dlm_mle_cache = NULL; + +static void dlm_mle_release(struct kref *kref); +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen); +static void dlm_put_mle(struct dlm_master_list_entry *mle); +static void __dlm_put_mle(struct dlm_master_list_entry *mle); +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen); + +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); + + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked); +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked); +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master); + +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); +static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + + +int dlm_is_host_down(int errno) +{ + switch (errno) { + case -EBADF: + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ETIMEDOUT: + case -ECONNABORTED: + case -ENETDOWN: + case -ENETUNREACH: + case -ENETRESET: + case -ESHUTDOWN: + case -ENOPROTOOPT: + case -EINVAL: /* if returned from our tcp code, + this means there is no socket */ + 
return 1; + } + return 0; +} + + +/* + * MASTER LIST FUNCTIONS + */ + + +/* + * regarding master list entries and heartbeat callbacks: + * + * in order to avoid sleeping and allocation that occurs in + * heartbeat, master list entries are simply attached to the + * dlm's established heartbeat callbacks. the mle is attached + * when it is created, and since the dlm->spinlock is held at + * that time, any heartbeat event will be properly discovered + * by the mle. the mle needs to be detached from the + * dlm->mle_hb_events list as soon as heartbeat events are no + * longer useful to the mle, and before the mle is freed. + * + * as a general rule, heartbeat events are no longer needed by + * the mle once an "answer" regarding the lock master has been + * received. + */ +static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + + list_add_tail(&mle->hb_events, &dlm->mle_hb_events); +} + + +static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + if (!list_empty(&mle->hb_events)) + list_del_init(&mle->hb_events); +} + + +static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + spin_lock(&dlm->spinlock); + __dlm_mle_detach_hb_events(dlm, mle); + spin_unlock(&dlm->spinlock); +} + +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + +/* remove from list and free */ +static void __dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. 
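+ * (a refcount of zero here means an unbalanced put)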
*/ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); +} + + +/* must not have any spinlocks coming in */ +static void dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} + +static inline void dlm_get_mle(struct dlm_master_list_entry *mle) +{ + kref_get(&mle->mle_refs); +} + +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen) +{ + assert_spin_locked(&dlm->spinlock); + + mle->dlm = dlm; + mle->type = type; + INIT_HLIST_NODE(&mle->master_hash_node); + INIT_LIST_HEAD(&mle->hb_events); + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + spin_lock_init(&mle->spinlock); + init_waitqueue_head(&mle->wq); + atomic_set(&mle->woken, 0); + kref_init(&mle->mle_refs); + memset(mle->response_map, 0, sizeof(mle->response_map)); + mle->master = O2NM_MAX_NODES; + mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + if (mle->type == DLM_MLE_MASTER) { + BUG_ON(!res); + mle->mleres = res; + memcpy(mle->mname, res->lockname.name, res->lockname.len); + mle->mnamelen = res->lockname.len; + mle->mnamehash = res->lockname.hash; + } else { + BUG_ON(!name); + mle->mleres = NULL; + memcpy(mle->mname, name, namelen); + mle->mnamelen = namelen; + mle->mnamehash = dlm_lockid_hash(name, namelen); + } + + atomic_inc(&dlm->mle_tot_count[mle->type]); + atomic_inc(&dlm->mle_cur_count[mle->type]); + + /* copy off the node_map and register hb callbacks on our copy */ + memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); + memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + clear_bit(dlm->node_num, mle->vote_map); + clear_bit(dlm->node_num, mle->node_map); + + /* attach the mle to the domain node up/down events */ + __dlm_mle_attach_hb_events(dlm, mle); +} + +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + if (!hlist_unhashed(&mle->master_hash_node)) + hlist_del_init(&mle->master_hash_node); +} + +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + struct hlist_head *bucket; + + assert_spin_locked(&dlm->master_lock); + + bucket = dlm_master_hash(dlm, mle->mnamehash); + hlist_add_head(&mle->master_hash_node, bucket); +} + +/* returns 1 if found, 0 if not */ +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen) +{ + struct dlm_master_list_entry *tmpmle; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int hash; + + assert_spin_locked(&dlm->master_lock); + + hash = dlm_lockid_hash(name, namelen); + bucket = dlm_master_hash(dlm, hash); + hlist_for_each(list, bucket) { + tmpmle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) + continue; + dlm_get_mle(tmpmle); + *mle = tmpmle; + return 1; + } + return 0; +} + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) +{ + struct dlm_master_list_entry *mle; + + assert_spin_locked(&dlm->spinlock); + + list_for_each_entry(mle, 
&dlm->mle_hb_events, hb_events) { + if (node_up) + dlm_mle_node_up(dlm, mle, NULL, idx); + else + dlm_mle_node_down(dlm, mle, NULL, idx); + } +} + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (!test_bit(idx, mle->node_map)) + mlog(0, "node %u already removed from nodemap!\n", idx); + else + clear_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (test_bit(idx, mle->node_map)) + mlog(0, "node %u already in node map!\n", idx); + else + set_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + + +int dlm_init_mle_cache(void) +{ + dlm_mle_cache = kmem_cache_create("o2dlm_mle", + sizeof(struct dlm_master_list_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (dlm_mle_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_mle_cache(void) +{ + if (dlm_mle_cache) + kmem_cache_destroy(dlm_mle_cache); +} + +static void dlm_mle_release(struct kref *kref) +{ + struct dlm_master_list_entry *mle; + struct dlm_ctxt *dlm; + + mle = container_of(kref, struct dlm_master_list_entry, mle_refs); + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, + mle->type); + + /* remove from list if not already */ + __dlm_unlink_mle(dlm, mle); + + /* detach the mle from the domain node up/down events */ + __dlm_mle_detach_hb_events(dlm, mle); + + atomic_dec(&dlm->mle_cur_count[mle->type]); + + /* NOTE: kfree under spinlock here. + * if this is bad, we can move this to a freelist. */ + kmem_cache_free(dlm_mle_cache, mle); +} + + +/* + * LOCK RESOURCE FUNCTIONS + */ + +int dlm_init_master_caches(void) +{ + dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", + sizeof(struct dlm_lock_resource), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockres_cache) + goto bail; + + dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", + DLM_LOCKID_NAME_MAX, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockname_cache) + goto bail; + + return 0; +bail: + dlm_destroy_master_caches(); + return -ENOMEM; +} + +void dlm_destroy_master_caches(void) +{ + if (dlm_lockname_cache) + kmem_cache_destroy(dlm_lockname_cache); + + if (dlm_lockres_cache) + kmem_cache_destroy(dlm_lockres_cache); +} + +static void dlm_lockres_release(struct kref *kref) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + + res = container_of(kref, struct dlm_lock_resource, refs); + dlm = res->dlm; + + /* This should not happen -- all lockres' have a name + * associated with them at init time. */ + BUG_ON(!res->lockname.name); + + mlog(0, "destroying lockres %.*s\n", res->lockname.len, + res->lockname.name); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + atomic_dec(&dlm->res_cur_count); + + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! 
[%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + + /* By the time we're ready to blow this guy away, we shouldn't + * be on any lists. */ + BUG_ON(!hlist_unhashed(&res->hash_node)); + BUG_ON(!list_empty(&res->granted)); + BUG_ON(!list_empty(&res->converting)); + BUG_ON(!list_empty(&res->blocked)); + BUG_ON(!list_empty(&res->dirty)); + BUG_ON(!list_empty(&res->recovering)); + BUG_ON(!list_empty(&res->purge)); + + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); + + kmem_cache_free(dlm_lockres_cache, res); +} + +void dlm_lockres_put(struct dlm_lock_resource *res) +{ + kref_put(&res->refs, dlm_lockres_release); +} + +static void dlm_init_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, unsigned int namelen) +{ + char *qname; + + /* If we memset here, we lose our reference to the kmalloc'd + * res->lockname.name, so be sure to init every field + * correctly! */ + + qname = (char *) res->lockname.name; + memcpy(qname, name, namelen); + + res->lockname.len = namelen; + res->lockname.hash = dlm_lockid_hash(name, namelen); + + init_waitqueue_head(&res->wq); + spin_lock_init(&res->spinlock); + INIT_HLIST_NODE(&res->hash_node); + INIT_LIST_HEAD(&res->granted); + INIT_LIST_HEAD(&res->converting); + INIT_LIST_HEAD(&res->blocked); + INIT_LIST_HEAD(&res->dirty); + INIT_LIST_HEAD(&res->recovering); + INIT_LIST_HEAD(&res->purge); + INIT_LIST_HEAD(&res->tracking); + atomic_set(&res->asts_reserved, 0); + res->migration_pending = 0; + res->inflight_locks = 0; + + res->dlm = dlm; + + kref_init(&res->refs); + + atomic_inc(&dlm->res_tot_count); + atomic_inc(&dlm->res_cur_count); + + /* just for consistency */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&res->spinlock); + + res->state = DLM_LOCK_RES_IN_PROGRESS; + + res->last_used = 0; + + spin_lock(&dlm->spinlock); + list_add_tail(&res->tracking, &dlm->tracking_list); + spin_unlock(&dlm->spinlock); + + memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); +} + +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res = NULL; + + res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); + if (!res) + goto error; + + res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); + if (!res->lockname.name) + goto error; + + dlm_init_lockres(dlm, res, name, namelen); + return res; + +error: + if (res && res->lockname.name) + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); + + if (res) + kmem_cache_free(dlm_lockres_cache, res); + return NULL; +} + +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line) +{ + if (!new_lockres) + assert_spin_locked(&res->spinlock); + + if (!test_bit(dlm->node_num, res->refmap)) { + BUG_ON(res->inflight_locks != 0); + dlm_lockres_set_refmap_bit(dlm->node_num, res); + } + res->inflight_locks++; + mlog(0, "%s:%.*s: inflight++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); +} + +void 
__dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; + mlog(0, "%s:%.*s: inflight--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); + if (res->inflight_locks == 0) + dlm_lockres_clear_refmap_bit(dlm->node_num, res); + wake_up(&res->wq); +} + +/* + * lookup a lock resource by name. + * may already exist in the hashtable. + * lockid is null terminated + * + * if not, allocate enough for the lockres and for + * the temporary structure used in doing the mastering. + * + * also, do a lookup in the dlm->master_list to see + * if another node has begun mastering the same lock. + * if so, there should be a block entry in there + * for this name, and we should *not* attempt to master + * the lock here. need to wait around for that node + * to assert_master (or die). + * + */ +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int namelen, + int flags) +{ + struct dlm_lock_resource *tmpres=NULL, *res=NULL; + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *alloc_mle = NULL; + int blocked = 0; + int ret, nodenum; + struct dlm_node_iter iter; + unsigned int hash; + int tries = 0; + int bit, wait_on_recovery = 0; + int drop_inflight_if_nonlocal = 0; + + BUG_ON(!lockid); + + hash = dlm_lockid_hash(lockid, namelen); + + mlog(0, "get lockres %s (len %d)\n", lockid, namelen); + +lookup: + spin_lock(&dlm->spinlock); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); + if (tmpres) { + int dropping_ref = 0; + + spin_unlock(&dlm->spinlock); + + spin_lock(&tmpres->spinlock); + /* We wait for the other thread that is mastering the resource */ + if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + __dlm_wait_on_lockres(tmpres); + BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + } + + if (tmpres->owner == dlm->node_num) { + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); + dlm_lockres_grab_inflight_ref(dlm, tmpres); + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) + dropping_ref = 1; + spin_unlock(&tmpres->spinlock); + + /* wait until done messaging the master, drop our ref to allow + * the lockres to be purged, start over. */ + if (dropping_ref) { + spin_lock(&tmpres->spinlock); + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + + mlog(0, "found in hash!\n"); + if (res) + dlm_lockres_put(res); + res = tmpres; + goto leave; + } + + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(0, "allocating a new resource\n"); + /* nothing found and we need to allocate one. */ + alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!alloc_mle) + goto leave; + res = dlm_new_lockres(dlm, lockid, namelen); + if (!res) + goto leave; + goto lookup; + } + + mlog(0, "no lockres found, allocated our own: %p\n", res); + + if (flags & LKM_LOCAL) { + /* caller knows it's safe to assume it's not mastered elsewhere + * DONE! 
return right away */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* lockres still marked IN_PROGRESS */ + goto wake_waiters; + } + + /* check master list to see if another node has started mastering it */ + spin_lock(&dlm->master_lock); + + /* if we found a block, wait for lock to be mastered by another node */ + blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); + if (blocked) { + int mig; + if (mle->type == DLM_MLE_MASTER) { + mlog(ML_ERROR, "master entry for nonexistent lock!\n"); + BUG(); + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* master is known, detach */ + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + mle = NULL; + /* this is lame, but we can't wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; + } + } else { + /* go ahead and try to master lock on this node */ + mle = alloc_mle; + /* make sure this does not get freed below */ + alloc_mle = NULL; + dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); + set_bit(dlm->node_num, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + + /* still holding the dlm spinlock, check the recovery map + * to see if there are any nodes that still need to be + * considered. these will not appear in the mle nodemap + * but they might own this lockres. wait on them. */ + bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) { + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " + "recover before lock mastery can begin\n", + dlm->name, namelen, (char *)lockid, bit); + wait_on_recovery = 1; + } + } + + /* at this point there is either a DLM_MLE_BLOCK or a + * DLM_MLE_MASTER on the master list, so it's safe to add the + * lockres to the hashtable. anyone who finds the lock will + * still have to wait on the IN_PROGRESS. */ + + /* finally add the lockres to its hash bucket */ + __dlm_insert_lockres(dlm, res); + /* since this lockres is new it doesn't not require the spinlock */ + dlm_lockres_grab_inflight_ref_new(dlm, res); + + /* if this node does not become the master make sure to drop + * this inflight reference below */ + drop_inflight_if_nonlocal = 1; + + /* get an extra ref on the mle in case this is a BLOCK + * if so, the creator of the BLOCK may try to put the last + * ref at this time in the assert master handler, so we + * need an extra one to keep from a bad ptr deref. 
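+ * the extra ref taken by dlm_get_mle_inuse() below is dropped via
+ * dlm_put_mle_inuse() once mastery has been resolved, just before
+ * the wake_waiters path runs.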
*/ + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +redo_request: + while (wait_on_recovery) { + /* any cluster changes that occurred after dropping the + * dlm spinlock would be detectable be a change on the mle, + * so we only need to clear out the recovery map once. */ + if (dlm_is_recovery_lock(lockid, namelen)) { + mlog(ML_NOTICE, "%s: recovery map is not empty, but " + "must master $RECOVERY lock now\n", dlm->name); + if (!dlm_pre_master_reco_lockres(dlm, res)) + wait_on_recovery = 0; + else { + mlog(0, "%s: waiting 500ms for heartbeat state " + "change\n", dlm->name); + msleep(500); + } + continue; + } + + dlm_kick_recovery_thread(dlm); + msleep(1000); + dlm_wait_for_recovery(dlm); + + spin_lock(&dlm->spinlock); + bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) { + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " + "recover before lock mastery can begin\n", + dlm->name, namelen, (char *)lockid, bit); + wait_on_recovery = 1; + } else + wait_on_recovery = 0; + spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); + } + + /* must wait for lock to be mastered elsewhere */ + if (blocked) + goto wait; + + ret = -EINVAL; + dlm_node_iter_init(mle->vote_map, &iter); + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = dlm_do_master_request(res, mle, nodenum); + if (ret < 0) + mlog_errno(ret); + if (mle->master != O2NM_MAX_NODES) { + /* found a master ! */ + if (mle->master <= nodenum) + break; + /* if our master request has not reached the master + * yet, keep going until it does. this is how the + * master will know that asserts are needed back to + * the lower nodes. */ + mlog(0, "%s:%.*s: requests only up to %u but master " + "is %u, keep going\n", dlm->name, namelen, + lockid, nodenum, mle->master); + } + } + +wait: + /* keep going until the response map includes all nodes */ + ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); + if (ret < 0) { + wait_on_recovery = 1; + mlog(0, "%s:%.*s: node map changed, redo the " + "master request now, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + if (++tries > 20) { + mlog(ML_ERROR, "%s:%.*s: spinning on " + "dlm_wait_for_lock_mastery, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + dlm_print_one_lock_resource(res); + dlm_print_one_mle(mle); + tries = 0; + } + goto redo_request; + } + + mlog(0, "lockres mastered by %u\n", res->owner); + /* make sure we never continue without this */ + BUG_ON(res->owner == O2NM_MAX_NODES); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + /* put the extra ref */ + dlm_put_mle_inuse(mle); + +wake_waiters: + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) + dlm_lockres_drop_inflight_ref(dlm, res); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + +leave: + /* need to free the unused mle */ + if (alloc_mle) + kmem_cache_free(dlm_mle_cache, alloc_mle); + + return res; +} + + +#define DLM_MASTERY_TIMEOUT_MS 5000 + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked) +{ + u8 m; + int ret, bit; + int map_changed, voting_done; + int assert, sleep; + +recheck: + ret = 0; + assert = 0; + + /* check if another node has already become the owner */ + 
spin_lock(&res->spinlock); + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, + res->lockname.len, res->lockname.name, res->owner); + spin_unlock(&res->spinlock); + /* this will cause the master to re-assert across + * the whole cluster, freeing up mles */ + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(res, mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } + } + ret = 0; + goto leave; + } + spin_unlock(&res->spinlock); + + spin_lock(&mle->spinlock); + m = mle->master; + map_changed = (memcmp(mle->vote_map, mle->node_map, + sizeof(mle->vote_map)) != 0); + voting_done = (memcmp(mle->vote_map, mle->response_map, + sizeof(mle->vote_map)) == 0); + + /* restart if we hit any errors */ + if (map_changed) { + int b; + mlog(0, "%s: %.*s: node map changed, restarting\n", + dlm->name, res->lockname.len, res->lockname.name); + ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); + b = (mle->type == DLM_MLE_BLOCK); + if ((*blocked && !b) || (!*blocked && b)) { + mlog(0, "%s:%.*s: status change: old=%d new=%d\n", + dlm->name, res->lockname.len, res->lockname.name, + *blocked, b); + *blocked = b; + } + spin_unlock(&mle->spinlock); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + mlog(0, "%s:%.*s: restart lock mastery succeeded, " + "rechecking now\n", dlm->name, res->lockname.len, + res->lockname.name); + goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } + } + + if (m != O2NM_MAX_NODES) { + /* another node has done an assert! + * all done! */ + sleep = 0; + } else { + sleep = 1; + /* have all nodes responded? */ + if (voting_done && !*blocked) { + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (dlm->node_num <= bit) { + /* my node number is lowest. + * now tell other nodes that I am + * mastering this. */ + mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ + assert = 1; + sleep = 0; + } + /* if voting is done, but we have not received + * an assert master yet, we must sleep */ + } + } + + spin_unlock(&mle->spinlock); + + /* sleep if we haven't finished voting yet */ + if (sleep) { + unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); + + /* + if (atomic_read(&mle->mle_refs.refcount) < 2) + mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, + atomic_read(&mle->mle_refs.refcount), + res->lockname.len, res->lockname.name); + */ + atomic_set(&mle->woken, 0); + (void)wait_event_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + timeo); + if (res->owner == O2NM_MAX_NODES) { + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); + goto recheck; + } + mlog(0, "done waiting, master is %u\n", res->owner); + ret = 0; + goto leave; + } + + ret = 0; /* done */ + if (assert) { + m = dlm->node_num; + mlog(0, "about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, m); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); + if (ret) { + /* This is a failure in the network path, + * not in the response to the assert_master + * (any nonzero response is a BUG on this node). + * Most likely a socket just got disconnected + * due to node death. */ + mlog_errno(ret); + } + /* no longer need to restart lock mastery. 
+ * all living nodes have been contacted. */ + ret = 0; + } + + /* set the lockres owner */ + spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ + dlm_change_lockres_owner(dlm, res, m); + spin_unlock(&res->spinlock); + +leave: + return ret; +} + +struct dlm_bitmap_diff_iter +{ + int curnode; + unsigned long *orig_bm; + unsigned long *cur_bm; + unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +enum dlm_node_state_change +{ + NODE_DOWN = -1, + NODE_NO_CHANGE = 0, + NODE_UP +}; + +static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, + unsigned long *orig_bm, + unsigned long *cur_bm) +{ + unsigned long p1, p2; + int i; + + iter->curnode = -1; + iter->orig_bm = orig_bm; + iter->cur_bm = cur_bm; + + for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { + p1 = *(iter->orig_bm + i); + p2 = *(iter->cur_bm + i); + iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); + } +} + +static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, + enum dlm_node_state_change *state) +{ + int bit; + + if (iter->curnode >= O2NM_MAX_NODES) + return -ENOENT; + + bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, + iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + + /* if it was there in the original then this node died */ + if (test_bit(bit, iter->orig_bm)) + *state = NODE_DOWN; + else + *state = NODE_UP; + + iter->curnode = bit; + return bit; +} + + +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked) +{ + struct dlm_bitmap_diff_iter bdi; + enum dlm_node_state_change sc; + int node; + int ret = 0; + + mlog(0, "something happened such that the " + "master process may need to be restarted!\n"); + + assert_spin_locked(&mle->spinlock); + + dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + while (node >= 0) { + if (sc == NODE_UP) { + /* a node came up. clear any old vote from + * the response map and set it in the vote map + * then restart the mastery. */ + mlog(ML_NOTICE, "node %d up while restarting\n", node); + + /* redo the master request, but only for the new node */ + mlog(0, "sending request to new node\n"); + clear_bit(node, mle->response_map); + set_bit(node, mle->vote_map); + } else { + mlog(ML_ERROR, "node down! %d\n", node); + if (blocked) { + int lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, 0); + + /* act like it was never there */ + clear_bit(node, mle->maybe_map); + + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. 
try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->mleres = res; + } + } + } + + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); + } + ret = -EAGAIN; + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + } + return ret; +} + + +/* + * DLM_MASTER_REQUEST_MSG + * + * returns: 0 on success, + * -errno on a network error + * + * on error, the caller should assume the target node is "dead" + * + */ + +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) +{ + struct dlm_ctxt *dlm = mle->dlm; + struct dlm_master_request request; + int ret, response=0, resend; + + memset(&request, 0, sizeof(request)); + request.node_idx = dlm->node_num; + + BUG_ON(mle->type == DLM_MLE_MIGRATION); + + request.namelen = (u8)mle->mnamelen; + memcpy(request.name, mle->mname, request.namelen); + +again: + ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, + sizeof(request), to, &response); + if (ret < 0) { + if (ret == -ESRCH) { + /* should never happen */ + mlog(ML_ERROR, "TCP stack not ready!\n"); + BUG(); + } else if (ret == -EINVAL) { + mlog(ML_ERROR, "bad args passed to o2net!\n"); + BUG(); + } else if (ret == -ENOMEM) { + mlog(ML_ERROR, "out of memory while trying to send " + "network message! retrying\n"); + /* this is totally crude */ + msleep(50); + goto again; + } else if (!dlm_is_host_down(ret)) { + /* not a network error. bad. */ + mlog_errno(ret); + mlog(ML_ERROR, "unhandled error!"); + BUG(); + } + /* all other errors should be network errors, + * and likely indicate node death */ + mlog(ML_ERROR, "link to %d went down!\n", to); + goto out; + } + + ret = 0; + resend = 0; + spin_lock(&mle->spinlock); + switch (response) { + case DLM_MASTER_RESP_YES: + set_bit(to, mle->response_map); + mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); + mle->master = to; + break; + case DLM_MASTER_RESP_NO: + mlog(0, "node %u not master, response=NO\n", to); + set_bit(to, mle->response_map); + break; + case DLM_MASTER_RESP_MAYBE: + mlog(0, "node %u not master, response=MAYBE\n", to); + set_bit(to, mle->response_map); + set_bit(to, mle->maybe_map); + break; + case DLM_MASTER_RESP_ERROR: + mlog(0, "node %u hit an error, resending\n", to); + resend = 1; + response = 0; + break; + default: + mlog(ML_ERROR, "bad response! %u\n", response); + BUG(); + } + spin_unlock(&mle->spinlock); + if (resend) { + /* this is also totally crude */ + msleep(50); + goto again; + } + +out: + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! 
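+ *
+ * responds DLM_MASTER_RESP_YES when this node owns (or is about to own)
+ * the lockres, NO when another node owns it or this node is merely
+ * blocked, MAYBE while mastery is still being resolved, and
+ * DLM_MASTER_RESP_ERROR when the lockres is recovering/migrating or an
+ * mle cannot be allocated.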
+ */ +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + u8 response = DLM_MASTER_RESP_MAYBE; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; + char *name; + unsigned int namelen, hash; + int found, ret; + int set_maybe; + int dispatch_assert = 0; + + if (!dlm_grab(dlm)) + return DLM_MASTER_RESP_NO; + + if (!dlm_domain_fully_joined(dlm)) { + response = DLM_MASTER_RESP_NO; + goto send_response; + } + + name = request->name; + namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); + + if (namelen > DLM_LOCKID_NAME_MAX) { + response = DLM_IVBUFLEN; + goto send_response; + } + +way_up_top: + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_unlock(&dlm->spinlock); + + /* take care of the easy cases up front */ + spin_lock(&res->spinlock); + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " + "being recovered/migrated\n"); + response = DLM_MASTER_RESP_ERROR; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + if (res->owner == dlm->node_num) { + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, request->node_idx); + dlm_lockres_set_refmap_bit(request->node_idx, res); + spin_unlock(&res->spinlock); + response = DLM_MASTER_RESP_YES; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + + /* this node is the owner. + * there is some extra work that needs to + * happen now. the requesting node has + * caused all nodes up to this one to + * create mles. this node now needs to + * go back and clean those up. */ + dispatch_assert = 1; + goto send_response; + } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + // mlog(0, "node %u is the master\n", res->owner); + response = DLM_MASTER_RESP_NO; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* ok, there is no owner. either this node is + * being blocked, or it is actively trying to + * master this lock. */ + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "lock with no owner should be " + "in-progress!\n"); + BUG(); + } + + // mlog(0, "lockres is in progress...\n"); + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + mlog(ML_ERROR, "no mle found for this lock!\n"); + BUG(); + } + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) { + // mlog(0, "this node is waiting for " + // "lockres to be mastered\n"); + response = DLM_MASTER_RESP_NO; + } else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "node %u is master, but trying to migrate to " + "node %u.\n", tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no owner on lockres, but this " + "node is trying to migrate it to %u?!\n", + tmpmle->new_master); + BUG(); + } else { + /* the real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } + } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { + set_maybe = 0; + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + /* this node will be the owner. 
+ * go back and clean the mles on any + * other nodes */ + dispatch_assert = 1; + dlm_lockres_set_refmap_bit(request->node_idx, res); + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, + request->node_idx); + } else + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "this node is attempting to " + // "master lockres\n"); + response = DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + + spin_unlock(&dlm->master_lock); + spin_unlock(&res->spinlock); + + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* + * lockres doesn't exist on this node + * if there is an MLE_BLOCK, return NO + * if there is an MLE_MASTER, return MAYBE + * otherwise, add an MLE_BLOCK, return NO + */ + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + /* this lockid has never been seen on this node yet */ + // mlog(0, "no mle found\n"); + if (!mle) { + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!mle) { + response = DLM_MASTER_RESP_ERROR; + mlog_errno(-ENOMEM); + goto send_response; + } + goto way_up_top; + } + + // mlog(0, "this is second time thru, already allocated, " + // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); + set_bit(request->node_idx, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "mle was found\n"); + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); + BUG(); + } + if (tmpmle->type == DLM_MLE_BLOCK) + response = DLM_MASTER_RESP_NO; + else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "migration mle was found (%u->%u)\n", + tmpmle->master, tmpmle->new_master); + /* real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } else + response = DLM_MASTER_RESP_MAYBE; + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (found) { + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + } +send_response: + /* + * __dlm_lookup_lockres() grabbed a reference to this lockres. + * The reference is released by dlm_assert_master_worker() under + * the call to dlm_dispatch_assert_master(). If + * dlm_assert_master_worker() isn't called, we drop it here. + */ + if (dispatch_assert) { + if (response != DLM_MASTER_RESP_YES) + mlog(ML_ERROR, "invalid response %d\n", response); + if (!res) { + mlog(ML_ERROR, "bad lockres while trying to assert!\n"); + BUG(); + } + mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", + dlm->node_num, res->lockname.len, res->lockname.name); + ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, + DLM_ASSERT_MASTER_MLE_CLEANUP); + if (ret < 0) { + mlog(ML_ERROR, "failed to dispatch assert master work\n"); + response = DLM_MASTER_RESP_ERROR; + dlm_lockres_put(res); + } + } else { + if (res) + dlm_lockres_put(res); + } + + dlm_put(dlm); + return response; +} + +/* + * DLM_ASSERT_MASTER_MSG + */ + + +/* + * NOTE: this can be used for debugging + * can periodically run all locks owned by this node + * and re-assert across the cluster... 
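+ *
+ * sends DLM_ASSERT_MASTER_MSG to every node in the nodemap; any node
+ * replying with DLM_ASSERT_RESPONSE_REASSERT causes the whole pass to
+ * be retried, and nodes replying with DLM_ASSERT_RESPONSE_MASTERY_REF
+ * get their bit set in the lockres refmap.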
+ */ +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) +{ + struct dlm_assert_master assert; + int to, tmpret; + struct dlm_node_iter iter; + int ret = 0; + int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; + + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + +again: + reassert = 0; + + /* note that if this nodemap is empty, it returns 0 */ + dlm_node_iter_init(nodemap, &iter); + while ((to = dlm_node_iter_next(&iter)) >= 0) { + int r = 0; + struct dlm_master_list_entry *mle = NULL; + + mlog(0, "sending assert master to %d (%.*s)\n", to, + namelen, lockname); + memset(&assert, 0, sizeof(assert)); + assert.node_idx = dlm->node_num; + assert.namelen = namelen; + memcpy(assert.name, lockname, namelen); + assert.flags = cpu_to_be32(flags); + + tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, + &assert, sizeof(assert), to, &r); + if (tmpret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", tmpret, + DLM_ASSERT_MASTER_MSG, dlm->key, to); + if (!dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); + BUG(); + } + /* a node died. finish out the rest of the nodes. */ + mlog(0, "link to %d went down!\n", to); + /* any nonzero status return will do */ + ret = tmpret; + r = 0; + } else if (r < 0) { + /* ok, something horribly messed. kill thyself. */ + mlog(ML_ERROR,"during assert master of %.*s to %u, " + "got %d.\n", namelen, lockname, to, r); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + BUG(); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { + mlog(0, "%.*s: node %u create mles on other " + "nodes and requests a re-assert\n", + namelen, lockname, to); + reassert = 1; + } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(to, res); + spin_unlock(&res->spinlock); + } + } + + if (reassert) + goto again; + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! 
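+ *
+ * validates the assert against any mle and lockres found for this name,
+ * updates the lockres owner, and returns a bitmask of
+ * DLM_ASSERT_RESPONSE_REASSERT / DLM_ASSERT_RESPONSE_MASTERY_REF
+ * (or -EINVAL if the asserting node has to be killed).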
+ */ +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_list_entry *mle = NULL; + struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen, hash; + u32 flags; + int master_request = 0, have_lockres_ref = 0; + int ret = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = assert->name; + namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); + flags = be32_to_cpu(assert->flags); + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + + spin_lock(&dlm->spinlock); + + if (flags) + mlog(0, "assert_master with flags: %u\n", flags); + + /* find the MLE */ + spin_lock(&dlm->master_lock); + if (!dlm_find_mle(dlm, &mle, name, namelen)) { + /* not an error, could be master just re-asserting */ + mlog(0, "just got an assert_master from %u, but no " + "MLE for it! (%.*s)\n", assert->node_idx, + namelen, name); + } else { + int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* not necessarily an error, though less likely. + * could be master just re-asserting. */ + mlog(0, "no bits set in the maybe_map, but %u " + "is asserting! (%.*s)\n", assert->node_idx, + namelen, name); + } else if (bit != assert->node_idx) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "master %u was found, %u should " + "back off\n", assert->node_idx, bit); + } else { + /* with the fix for bug 569, a higher node + * number winning the mastery will respond + * YES to mastery requests, but this node + * had no way of knowing. let it pass. */ + mlog(0, "%u is the lowest node, " + "%u is asserting. (%.*s) %u must " + "have begun after %u won.\n", bit, + assert->node_idx, namelen, name, bit, + assert->node_idx); + } + } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } + } + spin_unlock(&dlm->master_lock); + + /* ok everything checks out with the MLE + * now check to see if there is a lockres */ + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(ML_ERROR, "%u asserting but %.*s is " + "RECOVERING!\n", assert->node_idx, namelen, name); + goto kill; + } + if (!mle) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { + mlog(ML_ERROR, "DIE! Mastery assert from %u, " + "but current owner is %u! (%.*s)\n", + assert->node_idx, res->owner, namelen, + name); + __dlm_print_one_lock_resource(res); + BUG(); + } + } else if (mle->type != DLM_MLE_MIGRATION) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* owner is just re-asserting */ + if (res->owner == assert->node_idx) { + mlog(0, "owner %u re-asserting on " + "lock %.*s\n", assert->node_idx, + namelen, name); + goto ok; + } + mlog(ML_ERROR, "got assert_master from " + "node %u, but %u is the owner! 
" + "(%.*s)\n", assert->node_idx, + res->owner, namelen, name); + goto kill; + } + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "got assert from %u, but lock " + "with no owner should be " + "in-progress! (%.*s)\n", + assert->node_idx, + namelen, name); + goto kill; + } + } else /* mle->type == DLM_MLE_MIGRATION */ { + /* should only be getting an assert from new master */ + if (assert->node_idx != mle->new_master) { + mlog(ML_ERROR, "got assert from %u, but " + "new master is %u, and old master " + "was %u (%.*s)\n", + assert->node_idx, mle->new_master, + mle->master, namelen, name); + goto kill; + } + + } +ok: + spin_unlock(&res->spinlock); + } + + // mlog(0, "woo! got an assert_master from node %u!\n", + // assert->node_idx); + if (mle) { + int extra_ref = 0; + int nn = -1; + int rr, err = 0; + + spin_lock(&mle->spinlock); + if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) + extra_ref = 1; + else { + /* MASTER mle: if any bits set in the response map + * then the calling node needs to re-assert to clear + * up nodes that this node contacted */ + while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, + nn+1)) < O2NM_MAX_NODES) { + if (nn != dlm->node_num && nn != assert->node_idx) + master_request = 1; + } + } + mle->master = assert->node_idx; + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + spin_unlock(&mle->spinlock); + + if (res) { + int wake = 0; + spin_lock(&res->spinlock); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } + spin_unlock(&res->spinlock); + have_lockres_ref = 1; + if (wake) + wake_up(&res->wq); + } + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + if (extra_ref) { + /* the assert master message now balances the extra + * ref given by the master / migration request message. + * if this is the last put, it will be removed + * from the list. */ + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); + } + } + spin_unlock(&dlm->spinlock); + +done: + ret = 0; + if (res) { + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + *ret_data = (void *)res; + } + dlm_put(dlm); + if (master_request) { + mlog(0, "need to tell master to reassert\n"); + /* positive. negative would shoot down the node. 
*/ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); + } + return ret; + +kill: + /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + *ret_data = (void *)res; + dlm_put(dlm); + return -EINVAL; +} + +void dlm_assert_master_post_handler(int status, void *data, void *ret_data) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; + + if (ret_data) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + return; +} + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, u8 request_from, u32 flags) +{ + struct dlm_work_item *item; + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) + return -ENOMEM; + + + /* queue up work for dlm_assert_master_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); + item->u.am.lockres = res; /* already have a ref */ + /* can optionally ignore node numbers higher than this node */ + item->u.am.ignore_higher = ignore_higher; + item->u.am.request_from = request_from; + item->u.am.flags = flags; + + if (ignore_higher) + mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, + res->lockname.name); + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; +} + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm = data; + int ret = 0; + struct dlm_lock_resource *res; + unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int ignore_higher; + int bit; + u8 request_from; + u32 flags; + + dlm = item->dlm; + res = item->u.am.lockres; + ignore_higher = item->u.am.ignore_higher; + request_from = item->u.am.request_from; + flags = item->u.am.flags; + + spin_lock(&dlm->spinlock); + memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + spin_unlock(&dlm->spinlock); + + clear_bit(dlm->node_num, nodemap); + if (ignore_higher) { + /* if is this just to clear up mles for nodes below + * this node, do not send the message to the original + * caller or any node number higher than this */ + clear_bit(request_from, nodemap); + bit = dlm->node_num; + while (1) { + bit = find_next_bit(nodemap, O2NM_MAX_NODES, + bit+1); + if (bit >= O2NM_MAX_NODES) + break; + clear_bit(bit, nodemap); + } + } + + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. 
Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* this call now finishes out the nodemap + * even if one or more nodes die */ + mlog(0, "worker about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, dlm->node_num); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); + if (ret < 0) { + /* no need to restart, we are done */ + if (!dlm_is_host_down(ret)) + mlog_errno(ret); + } + + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: + dlm_lockres_put(res); + + mlog(0, "finished with dlm_assert_master_worker\n"); +} + +/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. + * We cannot wait for node recovery to complete to begin mastering this + * lockres because this lockres is used to kick off recovery! ;-) + * So, do a pre-check on all living nodes to see if any of those nodes + * think that $RECOVERY is currently mastered by a dead node. If so, + * we wait a short time to allow that node to get notified by its own + * heartbeat stack, then check again. All $RECOVERY lock resources + * mastered by dead nodes are purged when the hearbeat callback is + * fired, so we can know for sure that it is safe to continue once + * the node returns a live node or no node. */ +static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, &master); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + /* host is down, so answer for that node would be + * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + ret = 0; + } + + if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* check to see if this master is in the recovery map */ + spin_lock(&dlm->spinlock); + if (test_bit(master, dlm->recovery_map)) { + mlog(ML_NOTICE, "%s: node %u has not seen " + "node %u go down yet, and thinks the " + "dead node is mastering the recovery " + "lock. must wait.\n", dlm->name, + nodenum, master); + ret = -EAGAIN; + } + spin_unlock(&dlm->spinlock); + mlog(0, "%s: reco lock master is %u\n", dlm->name, + master); + break; + } + } + return ret; +} + +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + mlog(0, "%s:%.*s: sending deref to %d\n", + dlm->name, namelen, lockname, res->owner); + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, + res->owner); + else if (r < 0) { + /* BAD. other node says I did not have a ref. 
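+ * the master's refmap accounting disagrees with ours, so dump the
+ * lockres state and BUG.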
*/ + mlog(ML_ERROR,"while dropping ref on %s:%.*s " + "(master=%u) got %d.\n", dlm->name, namelen, + lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + struct dlm_work_item *item; + int cleared = 0; + int dispatch = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_SETREF_INPROG) + dispatch = 1; + else { + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + } + spin_unlock(&res->spinlock); + + if (!dispatch) { + if (cleared) + dlm_lockres_calc_usage(dlm, res); + else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + dlm_print_one_lock_resource(res); + } + ret = 0; + goto done; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + ret = -ENOMEM; + mlog_errno(ret); + goto done; + } + + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); + item->u.dl.deref_res = res; + item->u.dl.deref_node = node; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + + return ret; +} + +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_lock_resource *res; + u8 node; + u8 cleared = 0; + + dlm = item->dlm; + res = item->u.dl.deref_res; + node = item->u.dl.deref_node; + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + spin_unlock(&res->spinlock); + + if (cleared) { + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", + dlm->name, res->lockname.len, res->lockname.name, node); + dlm_lockres_calc_usage(dlm, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + dlm_print_one_lock_resource(res); + } + + dlm_lockres_put(res); +} + +/* + * A migrateable resource is one that is : + * 1. locally mastered, and, + * 2. zero local locks, and, + * 3. one or more non-local locks, or, one or more references + * Returns 1 if yes, 0 if not. 
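+ * Must be called with res->spinlock held.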
+ */ +static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + enum dlm_lockres_list idx; + int nonlocal = 0, node_ref; + struct list_head *queue; + struct dlm_lock *lock; + u64 cookie; + + assert_spin_locked(&res->spinlock); + + if (res->owner != dlm->node_num) + return 0; + + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node != dlm->node_num) { + nonlocal++; + continue; + } + cookie = be64_to_cpu(lock->ml.cookie); + mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + "%s list\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(cookie), + dlm_get_lock_cookie_seq(cookie), + dlm_list_in_text(idx)); + return 0; + } + } + + if (!nonlocal) { + node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (node_ref >= O2NM_MAX_NODES) + return 0; + } + + mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + res->lockname.name); + + return 1; +} + +/* + * DLM_MIGRATE_LOCKRES + */ + + +static int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 target) +{ + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *oldmle = NULL; + struct dlm_migratable_lockres *mres = NULL; + int ret = 0; + const char *name; + unsigned int namelen; + int mle_added = 0; + int wake = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(target == O2NM_MAX_NODES); + + name = res->lockname.name; + namelen = res->lockname.len; + + mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, + target); + + /* preallocate up front. if this fails, abort */ + ret = -ENOMEM; + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); + if (!mres) { + mlog_errno(ret); + goto leave; + } + + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!mle) { + mlog_errno(ret); + goto leave; + } + ret = 0; + + /* + * clear any existing master requests and + * add the migration mle to the list + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, + namelen, target, dlm->node_num); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (ret == -EEXIST) { + mlog(0, "another process is already migrating it\n"); + goto fail; + } + mle_added = 1; + + /* + * set the MIGRATING flag and flush asts + * if we fail after this we need to re-dirty the lockres + */ + if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { + mlog(ML_ERROR, "tried to migrate %.*s to %u, but " + "the target went down.\n", res->lockname.len, + res->lockname.name, target); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + ret = -EINVAL; + } + +fail: + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (ret < 0) { + if (mle_added) { + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + } else if (mle) { + kmem_cache_free(dlm_mle_cache, mle); + mle = NULL; + } + goto leave; + } + + /* + * at this point, we have a migration target, an mle + * in the master list, and the MIGRATING flag set on + * the lockres + */ + + /* now that remote nodes are spinning on the MIGRATING flag, + * ensure that all assert_master work is flushed. */ + flush_workqueue(dlm->dlm_worker); + + /* get an extra reference on the mle. 
+ * otherwise the assert_master from the new + * master will destroy this. + * also, make sure that all callers of dlm_get_mle + * take both dlm->spinlock and dlm->master_lock */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* notify new node and send all lock state */ + /* call send_one_lockres with migration flag. + * this serves as notice to the target node that a + * migration is starting. */ + ret = dlm_send_one_lockres(dlm, res, mres, target, + DLM_MRES_MIGRATION); + + if (ret < 0) { + mlog(0, "migration to node %u failed with %d\n", + target, ret); + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + if (dlm_is_host_down(ret)) + dlm_wait_for_node_death(dlm, target, + DLM_NODE_DEATH_WAIT_MAX); + goto leave; + } + + /* at this point, the target sends a message to all nodes, + * (using dlm_do_migrate_request). this node is skipped since + * we had to put an mle in the list to begin the process. this + * node now waits for target to do an assert master. this node + * will be the last one notified, ensuring that the migration + * is complete everywhere. if the target dies while this is + * going on, some nodes could potentially see the target as the + * master, so it is important that my recovery finds the migration + * mle and sets the master to UNKNOWN. */ + + + /* wait for new node to assert master */ + while (1) { + ret = wait_event_interruptible_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + msecs_to_jiffies(5000)); + + if (ret >= 0) { + if (atomic_read(&mle->woken) == 1 || + res->owner == target) + break; + + mlog(0, "%s:%.*s: timed out during migration\n", + dlm->name, res->lockname.len, res->lockname.name); + /* avoid hang during shutdown when migrating lockres + * to a node which also goes down */ + if (dlm_is_node_dead(dlm, target)) { + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", + dlm->name, res->lockname.len, + res->lockname.name, target); + ret = -EINVAL; + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + goto leave; + } + } else + mlog(0, "%s:%.*s: caught signal during migration\n", + dlm->name, res->lockname.len, res->lockname.name); + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, target); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_remove_nonlocal_locks(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle_inuse(mle); + ret = 0; + + dlm_lockres_calc_usage(dlm, res); + +leave: + /* re-dirty the lockres if we failed */ + if (ret < 0) + dlm_kick_thread(dlm, res); + + /* wake up waiters if the MIGRATING flag got set + * but migration failed */ + if (wake) + wake_up(&res->wq); + + if (mres) + free_page((unsigned long)mres); + + dlm_put(dlm); + + mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, + name, target, ret); + return ret; +} + +#define DLM_MIGRATION_RETRY_MS 100 + +/* + * Should be called only after beginning the domain 
leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped + */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + u8 target = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (dlm_is_lockres_migrateable(dlm, res)) + target = dlm_pick_migration_target(dlm, res); + spin_unlock(&res->spinlock); + + if (target == O2NM_MAX_NODES) + goto leave; + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + ret = dlm_migrate_lockres(dlm, res, target); + if (ret) + mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", + dlm->name, res->lockname.len, res->lockname.name, + target, ret); + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + int ret; + spin_lock(&dlm->ast_lock); + spin_lock(&lock->spinlock); + ret = (list_empty(&lock->bast_list) && !lock->bast_pending); + spin_unlock(&lock->spinlock); + spin_unlock(&dlm->ast_lock); + return ret; +} + +static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 mig_target) +{ + int can_proceed; + spin_lock(&res->spinlock); + can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); + spin_unlock(&res->spinlock); + + /* target has died, so make the caller break out of the + * wait_event, but caller must recheck the domain_map */ + spin_lock(&dlm->spinlock); + if (!test_bit(mig_target, dlm->domain_map)) + can_proceed = 1; + spin_unlock(&dlm->spinlock); + return can_proceed; +} + +static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int ret; + spin_lock(&res->spinlock); + ret = !!(res->state & DLM_LOCK_RES_DIRTY); + spin_unlock(&res->spinlock); + return ret; +} + + +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) +{ + int ret = 0; + + mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", + res->lockname.len, res->lockname.name, dlm->node_num, + target); + /* need to set MIGRATING flag on lockres. this is done by + * ensuring that all asts have been flushed for this lockres. */ + spin_lock(&res->spinlock); + BUG_ON(res->migration_pending); + res->migration_pending = 1; + /* strategy is to reserve an extra ast then release + * it below, letting the release do all of the work */ + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* now flush all the pending asts */ + dlm_kick_thread(dlm, res); + /* before waiting on DIRTY, block processes which may + * try to dirty the lockres before MIGRATING is set */ + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); + /* now wait on any pending asts and the DIRTY state */ + wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); + dlm_lockres_release_ast(dlm, res); + + mlog(0, "about to wait on migration_wq, dirty=%s\n", + res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + /* if the extra ref we just put was the final one, this + * will pass thru immediately. otherwise, we need to wait + * for the last ast to finish. 
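+ * either way, the loop below waits until dlm_migration_can_proceed()
+ * sees the MIGRATING flag set or the target leaves the domain map.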
*/ +again: + ret = wait_event_interruptible_timeout(dlm->migration_wq, + dlm_migration_can_proceed(dlm, res, target), + msecs_to_jiffies(1000)); + if (ret < 0) { + mlog(0, "woken again: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } else { + mlog(0, "all is well: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } + if (!dlm_migration_can_proceed(dlm, res, target)) { + mlog(0, "trying again...\n"); + goto again; + } + + ret = 0; + /* did the target go down or die? */ + spin_lock(&dlm->spinlock); + if (!test_bit(target, dlm->domain_map)) { + mlog(ML_ERROR, "aha. migration target %u just went down\n", + target); + ret = -EHOSTDOWN; + } + spin_unlock(&dlm->spinlock); + + /* + * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for + * another try; otherwise, we are sure the MIGRATING state is there, + * drop the unneded state which blocked threads trying to DIRTY + */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + if (!ret) + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + spin_unlock(&res->spinlock); + + /* + * at this point: + * + * o the DLM_LOCK_RES_MIGRATING flag is set if target not down + * o there are no pending asts on this lockres + * o all processes trying to reserve an ast on this + * lockres must wait for the MIGRATING flag to clear + */ + return ret; +} + +/* last step in the migration process. + * original master calls this to free all of the dlm_lock + * structures that used to be for other nodes. */ +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *queue = &res->granted; + int i, bit; + struct dlm_lock *lock, *next; + + assert_spin_locked(&res->spinlock); + + BUG_ON(res->owner == dlm->node_num); + + for (i=0; i<3; i++) { + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node != dlm->node_num) { + mlog(0, "putting lock for node %u\n", + lock->ml.node); + /* be extra careful */ + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(lock->ml.node, res); + list_del_init(&lock->list); + dlm_lock_put(lock); + /* In a normal unlock, we would have added a + * DLM_UNLOCK_FREE_LOCK action. Force it. */ + dlm_lock_put(lock); + } + } + queue++; + } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(bit, res); + } + bit++; + } +} + +/* + * Pick a node to migrate the lock resource to. This function selects a + * potential target based first on the locks and then on refmap. It skips + * nodes that are in the process of exiting the domain. 
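+ * Returns O2NM_MAX_NODES when no suitable target exists, which
+ * dlm_empty_lockres() treats as "do not migrate".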
+ */ +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + enum dlm_lockres_list idx; + struct list_head *queue = &res->granted; + struct dlm_lock *lock; + int noderef; + u8 nodenum = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* Go through all the locks */ + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node == dlm->node_num) + continue; + if (test_bit(lock->ml.node, dlm->exit_domain_map)) + continue; + nodenum = lock->ml.node; + goto bail; + } + } + + /* Go thru the refmap */ + noderef = -1; + while (1) { + noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, + noderef + 1); + if (noderef >= O2NM_MAX_NODES) + break; + if (noderef == dlm->node_num) + continue; + if (test_bit(noderef, dlm->exit_domain_map)) + continue; + nodenum = noderef; + goto bail; + } + +bail: + return nodenum; +} + +/* this is called by the new master once all lockres + * data has been received */ +static int dlm_do_migrate_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 master, u8 new_master, + struct dlm_node_iter *iter) +{ + struct dlm_migrate_request migrate; + int ret, skip, status = 0; + int nodenum; + + memset(&migrate, 0, sizeof(migrate)); + migrate.namelen = res->lockname.len; + memcpy(migrate.name, res->lockname.name, migrate.namelen); + migrate.new_master = new_master; + migrate.master = master; + + ret = 0; + + /* send message to all nodes, except the master and myself */ + while ((nodenum = dlm_node_iter_next(iter)) >= 0) { + if (nodenum == master || + nodenum == new_master) + continue; + + /* We could race exit domain. If exited, skip. */ + spin_lock(&dlm->spinlock); + skip = (!test_bit(nodenum, dlm->domain_map)); + spin_unlock(&dlm->spinlock); + if (skip) { + clear_bit(nodenum, iter->node_map); + continue; + } + + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, + &migrate, sizeof(migrate), nodenum, + &status); + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, + dlm->key, nodenum); + if (!dlm_is_host_down(ret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", ret); + BUG(); + } + clear_bit(nodenum, iter->node_map); + ret = 0; + } else if (status < 0) { + mlog(0, "migrate request (node %u) returned %d!\n", + nodenum, status); + ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(nodenum, res); + spin_unlock(&res->spinlock); + } + } + + if (ret < 0) + mlog_errno(ret); + + mlog(0, "returning ret=%d\n", ret); + return ret; +} + + +/* if there is an existing mle for this lockres, we now know who the master is. + * (the one who sent us *this* message) we can clear it up right away. + * since the process that put the mle on the list still has a reference to it, + * we can unhash it now, set the master and wake the process. as a result, + * we will have no mle in the list to start with. now we can add an mle for + * the migration and this should be the only one found for those scanning the + * list. 
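+ *
+ * the handler below preallocates the new mle before taking any
+ * spinlocks, refuses the request with -EINVAL if the lockres is
+ * currently marked RECOVERING, and otherwise sets
+ * DLM_LOCK_RES_MIGRATING on the lockres before inserting the
+ * migration mle under the master_lock.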
*/ +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; + const char *name; + unsigned int namelen, hash; + int ret = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = migrate->name; + namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); + + /* preallocate.. if this fails, abort */ + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + + if (!mle) { + ret = -ENOMEM; + goto leave; + } + + /* check for pre-existing lock */ + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + /* if all is working ok, this can only mean that we got + * a migrate request from a node that we now see as + * dead. what can we do here? drop it to the floor? */ + spin_unlock(&res->spinlock); + mlog(ML_ERROR, "Got a migrate request, but the " + "lockres is marked as recovering!"); + kmem_cache_free(dlm_mle_cache, mle); + ret = -EINVAL; /* need a better solution */ + goto unlock; + } + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + } + + spin_lock(&dlm->master_lock); + /* ignore status. only nonzero status would BUG. */ + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, + name, namelen, + migrate->new_master, + migrate->master); + + spin_unlock(&dlm->master_lock); +unlock: + spin_unlock(&dlm->spinlock); + + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (res) + dlm_lockres_put(res); +leave: + dlm_put(dlm); + return ret; +} + +/* must be holding dlm->spinlock and dlm->master_lock + * when adding a migration mle, we can clear any other mles + * in the master list because we know with certainty that + * the master is "master". so we remove any old mle from + * the list after setting it's master field, and then add + * the new migration mle. this way we can hold with the rule + * of having only one mle for a given lock name at all times. */ +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master) +{ + int found; + int ret = 0; + + *oldmle = NULL; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + /* caller is responsible for any ref taken here on oldmle */ + found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); + if (found) { + struct dlm_master_list_entry *tmp = *oldmle; + spin_lock(&tmp->spinlock); + if (tmp->type == DLM_MLE_MIGRATION) { + if (master == dlm->node_num) { + /* ah another process raced me to it */ + mlog(0, "tried to migrate %.*s, but some " + "process beat me to it\n", + namelen, name); + ret = -EEXIST; + } else { + /* bad. 2 NODES are trying to migrate! 
*/ + mlog(ML_ERROR, "migration error mle: " + "master=%u new_master=%u // request: " + "master=%u new_master=%u // " + "lockres=%.*s\n", + tmp->master, tmp->new_master, + master, new_master, + namelen, name); + BUG(); + } + } else { + /* this is essentially what assert_master does */ + tmp->master = master; + atomic_set(&tmp->woken, 1); + wake_up(&tmp->wq); + /* remove it so that only one mle will be found */ + __dlm_unlink_mle(dlm, tmp); + __dlm_mle_detach_hb_events(dlm, tmp); + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref for cleared out mle " + "during migration\n", dlm->name, namelen, name, + master, new_master); + } + spin_unlock(&tmp->spinlock); + } + + /* now add a migration mle to the tail of the list */ + dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); + mle->new_master = new_master; + /* the new master will be sending an assert master for this. + * at that point we will get the refmap reference */ + mle->master = master; + /* do this for consistency with other mle types */ + set_bit(new_master, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + + return ret; +} + +/* + * Sets the owner of the lockres, associated to the mle, to UNKNOWN + */ +static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + struct dlm_lock_resource *res; + + /* Find the lockres associated to the mle and set its owner to UNK */ + res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, + mle->mnamehash); + if (res) { + spin_unlock(&dlm->master_lock); + + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + } + + return res; +} + +static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + __dlm_mle_detach_hb_events(dlm, mle); + + spin_lock(&mle->spinlock); + __dlm_unlink_mle(dlm, mle); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + + wake_up(&mle->wq); +} + +static void dlm_clean_block_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, u8 dead_node) +{ + int bit; + + BUG_ON(mle->type != DLM_MLE_BLOCK); + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would not have been " + "master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* Must drop the refcount by one since the assert_master will + * never arrive. This may result in the mle being unlinked and + * freed, but there may still be a process waiting in the + * dlmlock path which is fine. 
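+ * the path below marks the mle woken, wakes any waiter, detaches the
+ * mle from heartbeat events and drops the reference that the dead
+ * node's assert_master would otherwise have released.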
*/ + mlog(0, "node %u was expected master\n", dead_node); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + + /* Do not need events any longer, so detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } +} + +void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_master_list_entry *mle; + struct dlm_lock_resource *res; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int i; + + mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); +top: + assert_spin_locked(&dlm->spinlock); + + /* clean the master list */ + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + /* MASTER mles are initiated locally. The waiting + * process will notice the node map change shortly. + * Let that happen as normal. */ + if (mle->type == DLM_MLE_MASTER) + continue; + + /* BLOCK mles are initiated by other nodes. Need to + * clean up if the dead node would have been the + * master. */ + if (mle->type == DLM_MLE_BLOCK) { + dlm_clean_block_mle(dlm, mle, dead_node); + continue; + } + + /* Everything else is a MIGRATION mle */ + + /* The rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or the new + * master dies. All UNKNOWN lockres' are sent to + * whichever node becomes the recovery master. The new + * master is responsible for determining if there is + * still a master for this lockres, or if he needs to + * take over mastery. Either way, this node should + * expect another message to resolve this. */ + + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* If we have reached this point, this mle needs to be + * removed from the list and freed. */ + dlm_clean_migration_mle(dlm, mle); + + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, + mle->new_master); + + /* If we find a lockres associated with the mle, we've + * hit this rare case that messes up our lock ordering. + * If so, we need to drop the master lock so that we can + * take the lockres lock, meaning that we will have to + * restart from the head of list. */ + res = dlm_reset_mleres_owner(dlm, mle); + if (res) + /* restart */ + goto top; + + /* This may be the last reference */ + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); +} + +int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 old_master) +{ + struct dlm_node_iter iter; + int ret = 0; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + clear_bit(old_master, iter.node_map); + clear_bit(dlm->node_num, iter.node_map); + spin_unlock(&dlm->spinlock); + + /* ownership of the lockres is changing. 
account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(old_master, res); + spin_unlock(&res->spinlock); + + mlog(0, "now time to do a migrate request to other nodes\n"); + ret = dlm_do_migrate_request(dlm, res, old_master, + dlm->node_num, &iter); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + mlog(0, "doing assert master of %.*s to all except the original node\n", + res->lockname.len, res->lockname.name); + /* this call now finishes out the nodemap + * even if one or more nodes die */ + ret = dlm_do_assert_master(dlm, res, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + /* no longer need to retry. all living nodes contacted. */ + mlog_errno(ret); + ret = 0; + } + + memset(iter.node_map, 0, sizeof(iter.node_map)); + set_bit(old_master, iter.node_map); + mlog(0, "doing assert master of %.*s back to %u\n", + res->lockname.len, res->lockname.name, old_master); + ret = dlm_do_assert_master(dlm, res, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + mlog(0, "assert master to original master failed " + "with %d.\n", ret); + /* the only nonzero status here would be because of + * a dead original node. we're done. */ + ret = 0; + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, dlm->node_num); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + /* re-dirty it on the new master */ + dlm_kick_thread(dlm, res); + wake_up(&res->wq); +leave: + return ret; +} + +/* + * LOCKRES AST REFCOUNT + * this is integral to migration + */ + +/* for future intent to call an ast, reserve one ahead of time. + * this should be called only after waiting on the lockres + * with dlm_wait_on_lockres, and while still holding the + * spinlock after the call. */ +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + __dlm_print_one_lock_resource(res); + } + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + + atomic_inc(&res->asts_reserved); +} + +/* + * used to drop the reserved ast, either because it went unused, + * or because the ast/bast was actually called. + * + * also, if there is a pending migration on this lockres, + * and this was the last pending ast on the lockres, + * atomically set the MIGRATING flag before we drop the lock. + * this is how we ensure that migration can proceed with no + * asts in progress. note that it is ok if the state of the + * queues is such that a lock should be granted in the future + * or that a bast should be fired, because the new master will + * shuffle the lists on this lockres as soon as it is migrated. 
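+ *
+ * the reserve/release pairing used by dlm_mark_lockres_migrating()
+ * above looks roughly like this:
+ *
+ *   spin_lock(&res->spinlock);
+ *   res->migration_pending = 1;
+ *   __dlm_lockres_reserve_ast(res);      <-- holds migration off
+ *   spin_unlock(&res->spinlock);
+ *   dlm_kick_thread(dlm, res);           <-- flush pending asts
+ *   ... wait for DLM_LOCK_RES_DIRTY to clear ...
+ *   dlm_lockres_release_ast(dlm, res);   <-- may set MIGRATING
+ *
+ * once asts_reserved drops to zero here with migration_pending set,
+ * MIGRATING is set under res->spinlock and both res->wq and
+ * dlm->migration_wq are woken.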
+ */ +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) + return; + + if (!res->migration_pending) { + spin_unlock(&res->spinlock); + return; + } + + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + res->migration_pending = 0; + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + wake_up(&dlm->migration_wq); +} + +void dlm_force_free_mles(struct dlm_ctxt *dlm) +{ + int i; + struct hlist_head *bucket; + struct dlm_master_list_entry *mle; + struct hlist_node *tmp, *list; + + /* + * We notified all other nodes that we are exiting the domain and + * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still + * around we force free them and wake any processes that are waiting + * on the mles + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); + BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_safe(list, tmp, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + if (mle->type != DLM_MLE_BLOCK) { + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + } + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c new file mode 100644 index 00000000..7efab6d2 --- /dev/null +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -0,0 +1,2886 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmrecovery.c + * + * recovery stuff + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) +#include "cluster/masklog.h" + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); + +static int dlm_recovery_thread(void *data); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); +static int dlm_do_recovery(struct dlm_ctxt *dlm); + +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_request_all_locks(struct dlm_ctxt *dlm, + u8 request_from, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master); +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks); +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres); +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm); +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, + u8 dead_node, u8 send_to); +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, u8 dead_node); +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master); +static void dlm_reco_ast(void *astdata); +static void dlm_reco_bast(void *astdata, int blocked_type); +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st); +static void dlm_request_all_locks_worker(struct dlm_work_item *item, + void *data); +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master); + +static u64 dlm_get_next_mig_cookie(void); + +static DEFINE_SPINLOCK(dlm_reco_state_lock); +static DEFINE_SPINLOCK(dlm_mig_cookie_lock); +static u64 dlm_mig_cookie = 1; + +static u64 dlm_get_next_mig_cookie(void) +{ + u64 c; + spin_lock(&dlm_mig_cookie_lock); + c = dlm_mig_cookie; + if (dlm_mig_cookie == (~0ULL)) + dlm_mig_cookie = 1; + else + dlm_mig_cookie++; + spin_unlock(&dlm_mig_cookie_lock); + return c; +} + +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void 
__dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + +static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + spin_unlock(&dlm->spinlock); +} + +/* Worker function used during recovery. */ +void dlm_dispatch_work(struct work_struct *work) +{ + struct dlm_ctxt *dlm = + container_of(work, struct dlm_ctxt, dispatched_work); + LIST_HEAD(tmp_list); + struct dlm_work_item *item, *next; + dlm_workfunc_t *workfunc; + int tot=0; + + spin_lock(&dlm->work_lock); + list_splice_init(&dlm->work_list, &tmp_list); + spin_unlock(&dlm->work_lock); + + list_for_each_entry(item, &tmp_list, list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_entry_safe(item, next, &tmp_list, list) { + workfunc = item->func; + list_del_init(&item->list); + + /* already have ref on dlm to avoid having + * it disappear. just double-check. */ + BUG_ON(item->dlm != dlm); + + /* this is allowed to sleep and + * call network stuff */ + workfunc(item, item->data); + + dlm_put(dlm); + kfree(item); + } +} + +/* + * RECOVERY THREAD + */ + +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm) +{ + /* wake the recovery thread + * this will wake the reco thread in one of three places + * 1) sleeping with no recovery happening + * 2) sleeping with recovery mastered elsewhere + * 3) recovery mastered here, waiting on reco data */ + + wake_up(&dlm->dlm_reco_thread_wq); +} + +/* Launch the recovery thread */ +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "starting dlm recovery thread...\n"); + + dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, + "dlm_reco_thread"); + if (IS_ERR(dlm->dlm_reco_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); + dlm->dlm_reco_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_reco_thread_task) { + mlog(0, "waiting for dlm recovery thread to exit\n"); + kthread_stop(dlm->dlm_reco_thread_task); + dlm->dlm_reco_thread_task = NULL; + } +} + + + +/* + * this is lame, but here's how recovery works... + * 1) all recovery threads cluster wide will work on recovering + * ONE node at a time + * 2) negotiate who will take over all the locks for the dead node. + * thats right... ALL the locks. + * 3) once a new master is chosen, everyone scans all locks + * and moves aside those mastered by the dead guy + * 4) each of these locks should be locked until recovery is done + * 5) the new master collects up all of secondary lock queue info + * one lock at a time, forcing each node to communicate back + * before continuing + * 6) each secondary lock queue responds with the full known lock info + * 7) once the new master has run all its locks, it sends a ALLDONE! 
+ * message to everyone + * 8) upon receiving this message, the secondary queue node unlocks + * and responds to the ALLDONE + * 9) once the new master gets responses from everyone, he unlocks + * everything and recovery for this dead node is done + *10) go back to 2) while there are still dead nodes + * + */ + +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} + +#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) + +static int dlm_recovery_thread(void *data) +{ + int status; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + if (dlm_domain_fully_joined(dlm)) { + status = dlm_do_recovery(dlm); + if (status == -EAGAIN) { + /* do not sleep, recheck immediately. 
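+ * -EAGAIN from dlm_do_recovery() means this node just finished
+ * mastering recovery for one dead node and other nodes may still be
+ * sitting in the recovery map, so loop straight back around.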
*/ + continue; + } + if (status < 0) + mlog_errno(status); + } + + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM recovery thread\n"); + return 0; +} + +/* returns true when the recovery master has contacted us */ +static int dlm_reco_master_ready(struct dlm_ctxt *dlm) +{ + int ready; + spin_lock(&dlm->spinlock); + ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); + spin_unlock(&dlm->spinlock); + return ready; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) +{ + int dead; + spin_lock(&dlm->spinlock); + dead = !test_bit(node, dlm->domain_map); + spin_unlock(&dlm->spinlock); + return dead; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(ML_NOTICE, "%s: waiting %dms for notification of " + "death of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(ML_NOTICE, "%s: waiting indefinitely for notification " + "of death of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + +/* callers of the top-level api calls (dlmlock/dlmunlock) should + * block on the dlm->reco.event when recovery is in progress. 
+ * the dlm recovery thread will set this state when it begins + * recovering a dead node (as the new master or not) and clear + * the state and wake as soon as all affected lock resources have + * been marked with the RECOVERY flag */ +static int dlm_in_recovery(struct dlm_ctxt *dlm) +{ + int in_recovery; + spin_lock(&dlm->spinlock); + in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + spin_unlock(&dlm->spinlock); + return in_recovery; +} + + +void dlm_wait_for_recovery(struct dlm_ctxt *dlm) +{ + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } + wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); +} + +static void dlm_begin_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + dlm->reco.state |= DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); +} + +static void dlm_end_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); + dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); + wake_up(&dlm->reco.event); +} + +static int dlm_do_recovery(struct dlm_ctxt *dlm) +{ + int status = 0; + int ret; + + spin_lock(&dlm->spinlock); + + /* check to see if the new master has died */ + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && + test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "new master %u died while recovering %u!\n", + dlm->reco.new_master, dlm->reco.dead_node); + /* unset the new_master, leave dead_node */ + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); + } + + /* select a target to recover */ + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + int bit; + + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES || bit < 0) + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + else + dlm_set_reco_dead_node(dlm, bit); + } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { + /* BUG? */ + mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", + dlm->reco.dead_node); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + } + + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + // mlog(0, "nothing to recover! sleeping now!\n"); + spin_unlock(&dlm->spinlock); + /* return to main thread loop and sleep. */ + return 0; + } + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.dead_node); + spin_unlock(&dlm->spinlock); + + /* take write barrier */ + /* (stops the list reshuffling thread, proxy ast handling) */ + dlm_begin_recovery(dlm); + + if (dlm->reco.new_master == dlm->node_num) + goto master_here; + + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + /* choose a new master, returns 0 if this node + * is the master, -EEXIST if it's another node. + * this does not return until a new master is chosen + * or recovery completes entirely. */ + ret = dlm_pick_recovery_master(dlm); + if (!ret) { + /* already notified everyone. go. 
*/ + goto master_here; + } + mlog(0, "another node will master this recovery session.\n"); + } + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, + dlm->node_num, dlm->reco.dead_node); + + /* it is safe to start everything back up here + * because all of the dead node's lock resources + * have been marked as in-recovery */ + dlm_end_recovery(dlm); + + /* sleep out in main dlm_recovery_thread loop. */ + return 0; + +master_here: + mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " + "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), + dlm->node_num, dlm->reco.dead_node, dlm->name); + + status = dlm_remaster_locks(dlm, dlm->reco.dead_node); + if (status < 0) { + /* we should never hit this anymore */ + mlog(ML_ERROR, "error %d remastering locks for node %u, " + "retrying.\n", status, dlm->reco.dead_node); + /* yield a bit to allow any final network messages + * to get handled on remaining nodes */ + msleep(100); + } else { + /* success! see if any other nodes need recovery */ + mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); + dlm_reset_recovery(dlm); + } + dlm_end_recovery(dlm); + + /* continue and look for another dead node */ + return -EAGAIN; +} + +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) +{ + int status = 0; + struct dlm_reco_node_data *ndata; + int all_nodes_done; + int destroy = 0; + int pass = 0; + + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. */ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); + + /* safe to access the node data list without a lock, since this + * process is the only one to change the list */ + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); + ndata->state = DLM_RECO_NODE_DATA_REQUESTING; + + mlog(0, "requesting lock info from node %u\n", + ndata->node_num); + + if (ndata->node_num == dlm->node_num) { + ndata->state = DLM_RECO_NODE_DATA_DONE; + continue; + } + + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } + } + } while (status != 0); + + spin_lock(&dlm_reco_state_lock); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + case DLM_RECO_NODE_DATA_REQUESTED: + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after requesting " + "recovery info for node %u\n", + ndata->node_num, dead_node); + /* fine. don't need this node's info. + * continue without it. 
*/ + break; + case DLM_RECO_NODE_DATA_REQUESTING: + ndata->state = DLM_RECO_NODE_DATA_REQUESTED; + mlog(0, "now receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + mlog(0, "already receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "already DONE receiving recovery data " + "from node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + } + spin_unlock(&dlm_reco_state_lock); + } + + mlog(0, "done requesting all lock info\n"); + + /* nodes should be sending reco data now + * just need to wait */ + + while (1) { + /* check all the nodes now to see if we are + * done, or if anyone died */ + all_nodes_done = 1; + spin_lock(&dlm_reco_state_lock); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + mlog(0, "checking recovery state of node %u\n", + ndata->node_num); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(ML_ERROR, "bad ndata state for " + "node %u: state=%d\n", + ndata->node_num, ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after " + "requesting recovery info for " + "node %u\n", ndata->node_num, + dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? + "receiving" : "requested"); + all_nodes_done = 0; + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, + all_nodes_done?"yes":"no"); + if (all_nodes_done) { + int ret; + + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state + * just send a finalize message to everyone and + * clean up */ + mlog(0, "all nodes are done! 
send finalize\n"); + ret = dlm_send_finalize_reco_message(dlm); + if (ret < 0) + mlog_errno(ret); + + spin_lock(&dlm->spinlock); + dlm_finish_local_lockres_recovery(dlm, dead_node, + dlm->node_num); + spin_unlock(&dlm->spinlock); + mlog(0, "should be done with recovery!\n"); + + mlog(0, "finishing recovery of %s at %lu, " + "dead=%u, this=%u, new=%u\n", dlm->name, + jiffies, dlm->reco.dead_node, + dlm->node_num, dlm->reco.new_master); + destroy = 1; + status = 0; + /* rescan everything marked dirty along the way */ + dlm_kick_thread(dlm, NULL); + break; + } + /* wait to be signalled, with periodic timeout + * to check for node death */ + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); + + } + + if (destroy) + dlm_destroy_recovery_area(dlm, dead_node); + + return status; +} + +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + int num=0; + struct dlm_reco_node_data *ndata; + + spin_lock(&dlm->spinlock); + memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + /* nodes can only be removed (by dying) after dropping + * this lock, and death will be trapped later, so this should do */ + spin_unlock(&dlm->spinlock); + + while (1) { + num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); + if (num >= O2NM_MAX_NODES) { + break; + } + BUG_ON(num == dead_node); + + ndata = kzalloc(sizeof(*ndata), GFP_NOFS); + if (!ndata) { + dlm_destroy_recovery_area(dlm, dead_node); + return -ENOMEM; + } + ndata->node_num = num; + ndata->state = DLM_RECO_NODE_DATA_INIT; + spin_lock(&dlm_reco_state_lock); + list_add_tail(&ndata->list, &dlm->reco.node_data); + spin_unlock(&dlm_reco_state_lock); + num++; + } + + return 0; +} + +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_reco_node_data *ndata, *next; + LIST_HEAD(tmplist); + + spin_lock(&dlm_reco_state_lock); + list_splice_init(&dlm->reco.node_data, &tmplist); + spin_unlock(&dlm_reco_state_lock); + + list_for_each_entry_safe(ndata, next, &tmplist, list) { + list_del_init(&ndata->list); + kfree(ndata); + } +} + +static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, + u8 dead_node) +{ + struct dlm_lock_request lr; + enum dlm_status ret; + + mlog(0, "\n"); + + + mlog(0, "dlm_request_all_locks: dead node is %u, sending request " + "to %u\n", dead_node, request_from); + + memset(&lr, 0, sizeof(lr)); + lr.node_idx = dlm->node_num; + lr.dead_node = dead_node; + + // send message + ret = DLM_NOLOCKMGR; + ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, + &lr, sizeof(lr), request_from, NULL); + + /* negative status is handled by caller */ + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, + dlm->key, request_from); + + // return from here, then + // sleep until all received or error + return ret; + +} + +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; + char *buf = NULL; + struct dlm_work_item *item = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } + 
BUG_ON(lr->dead_node != dlm->reco.dead_node); + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + dlm_put(dlm); + return -ENOMEM; + } + + /* this will get freed by dlm_request_all_locks_worker */ + buf = (char *) __get_free_page(GFP_NOFS); + if (!buf) { + kfree(item); + dlm_put(dlm); + return -ENOMEM; + } + + /* queue up work for dlm_request_all_locks_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); + item->u.ral.reco_master = lr->node_idx; + item->u.ral.dead_node = lr->dead_node; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + + dlm_put(dlm); + return 0; +} + +static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_migratable_lockres *mres; + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + LIST_HEAD(resources); + int ret; + u8 dead_node, reco_master; + int skip_all_done = 0; + + dlm = item->dlm; + dead_node = item->u.ral.dead_node; + reco_master = item->u.ral.reco_master; + mres = (struct dlm_migratable_lockres *)data; + + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + + if (dead_node != dlm->reco.dead_node || + reco_master != dlm->reco.new_master) { + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. */ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; + } + + /* lock resources should have already been moved to the + * dlm->reco.resources list. now move items from that list + * to a temp list if the dead owner matches. note that the + * whole cluster recovers only one node at a time, so we + * can safely move UNKNOWN lock resources for each recovery + * session. 
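+ *
+ * each lockres on the temporary list is then sent to the recovery
+ * master with dlm_send_one_lockres(..., DLM_MRES_RECOVERY), the list
+ * is spliced back onto dlm->reco.resources under the dlm spinlock,
+ * and the DATA DONE message is only sent if none of the sends failed.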
*/ + dlm_move_reco_locks_to_list(dlm, &resources, dead_node); + + /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ + list_for_each_entry(res, &resources, recovering) { + ret = dlm_send_one_lockres(dlm, res, mres, reco_master, + DLM_MRES_RECOVERY); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } + } + + /* move the resources back to the list */ + spin_lock(&dlm->spinlock); + list_splice_init(&resources, &dlm->reco.resources); + spin_unlock(&dlm->spinlock); + + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: + free_page((unsigned long)data); +} + + +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) +{ + int ret, tmpret; + struct dlm_reco_data_done done_msg; + + memset(&done_msg, 0, sizeof(done_msg)); + done_msg.node_idx = dlm->node_num; + done_msg.dead_node = dead_node; + mlog(0, "sending DATA DONE message to %u, " + "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, + done_msg.dead_node); + + ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, + sizeof(done_msg), send_to, &tmpret); + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, + dlm->key, send_to); + if (!dlm_is_host_down(ret)) { + BUG(); + } + } else + ret = tmpret; + return ret; +} + + +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; + struct dlm_reco_node_data *ndata = NULL; + int ret = -EINVAL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + + spin_lock(&dlm_reco_state_lock); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + if (ndata->node_num != done->node_idx) + continue; + + switch (ndata->state) { + /* should have moved beyond INIT but not to FINALIZE yet */ + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_DEAD: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(ML_ERROR, "bad ndata state for node %u:" + " state=%d\n", ndata->node_num, + ndata->state); + BUG(); + break; + /* these states are possible at this point, anywhere along + * the line of recovery */ + case DLM_RECO_NODE_DATA_DONE: + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(0, "node %u is DONE sending " + "recovery data!\n", + ndata->node_num); + + ndata->state = DLM_RECO_NODE_DATA_DONE; + ret = 0; + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + /* wake the recovery thread, some node is done */ + if (!ret) + dlm_kick_recovery_thread(dlm); + + if (ret < 0) + mlog(ML_ERROR, "failed to find recovery node data for node " + "%u\n", 
done->node_idx); + dlm_put(dlm); + + mlog(0, "leaving reco data done handler, ret=%d\n", ret); + return ret; +} + +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, + u8 dead_node) +{ + struct dlm_lock_resource *res, *next; + struct dlm_lock *lock; + + spin_lock(&dlm->spinlock); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); + continue; + } + + if (res->owner == dead_node) { + mlog(0, "found lockres owned by dead node while " + "doing recovery for node %u. sending it.\n", + dead_node); + list_move_tail(&res->recovering, list); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "found UNKNOWN owner while doing recovery " + "for node %u. sending it.\n", dead_node); + list_move_tail(&res->recovering, list); + } + } + spin_unlock(&dlm->spinlock); +} + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) +{ + int total_locks = 0; + struct list_head *iter, *queue = &res->granted; + int i; + + for (i=0; i<3; i++) { + list_for_each(iter, queue) + total_locks++; + queue++; + } + return total_locks; +} + + +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks) +{ + u64 mig_cookie = be64_to_cpu(mres->mig_cookie); + int mres_total_locks = be32_to_cpu(mres->total_locks); + int sz, ret = 0, status = 0; + u8 orig_flags = mres->flags, + orig_master = mres->master; + + BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); + if (!mres->num_locks) + return 0; + + sz = sizeof(struct dlm_migratable_lockres) + + (mres->num_locks * sizeof(struct dlm_migratable_lock)); + + /* add an all-done flag if we reached the last lock */ + orig_flags = mres->flags; + BUG_ON(total_locks > mres_total_locks); + if (total_locks == mres_total_locks) + mres->flags |= DLM_MRES_ALL_DONE; + + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", + send_to); + + /* send it */ + ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, + sz, send_to, &status); + if (ret < 0) { + /* XXX: negative status is not handled. + * this will end up killing this node. 
*/ + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, + dlm->key, send_to); + } else { + /* might get an -ENOMEM back here */ + ret = status; + if (ret < 0) { + mlog_errno(ret); + + if (ret == -EFAULT) { + mlog(ML_ERROR, "node %u told me to kill " + "myself!\n", send_to); + BUG(); + } + } + } + + /* zero and reinit the message buffer */ + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, mres_total_locks, + mig_cookie, orig_flags, orig_master); + return ret; +} + +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master) +{ + /* mres here is one full page */ + clear_page(mres); + mres->lockname_len = namelen; + memcpy(mres->lockname, lockname, namelen); + mres->num_locks = 0; + mres->total_locks = cpu_to_be32(total_locks); + mres->mig_cookie = cpu_to_be64(cookie); + mres->flags = flags; + mres->master = master; +} + +static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, + int queue) +{ + if (!lock->lksb) + return; + + /* Ignore lvb in all locks in the blocked list */ + if (queue == DLM_BLOCKED_LIST) + return; + + /* Only consider lvbs in locks with granted EX or PR lock levels */ + if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE) + return; + + if (dlm_lvb_is_empty(mres->lvb)) { + memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); + return; + } + + /* Ensure the lvb copied for migration matches in other valid locks */ + if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN)) + return; + + mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, " + "node=%u\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->lockres->lockname.len, lock->lockres->lockname.name, + lock->ml.node); + dlm_print_one_lock_resource(lock->lockres); + BUG(); +} + +/* returns 1 if this lock fills the network structure, + * 0 otherwise */ +static int dlm_add_lock_to_array(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, int queue) +{ + struct dlm_migratable_lock *ml; + int lock_num = mres->num_locks; + + ml = &(mres->ml[lock_num]); + ml->cookie = lock->ml.cookie; + ml->type = lock->ml.type; + ml->convert_type = lock->ml.convert_type; + ml->highest_blocked = lock->ml.highest_blocked; + ml->list = queue; + if (lock->lksb) { + ml->flags = lock->lksb->flags; + dlm_prepare_lvb_for_migration(lock, mres, queue); + } + ml->node = lock->ml.node; + mres->num_locks++; + /* we reached the max, send this network message */ + if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) + return 1; + return 0; +} + +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct 
dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, u8 flags) +{ + struct list_head *queue; + int total_locks, i; + u64 mig_cookie = 0; + struct dlm_lock *lock; + int ret = 0; + + BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + mlog(0, "sending to %u\n", send_to); + + total_locks = dlm_num_locks_in_lockres(res); + if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { + /* rare, but possible */ + mlog(0, "argh. lockres has %d locks. this will " + "require more than one network packet to " + "migrate\n", total_locks); + mig_cookie = dlm_get_next_mig_cookie(); + } + + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, total_locks, + mig_cookie, flags, res->owner); + + total_locks = 0; + for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry(lock, queue, list) { + /* add another lock. */ + total_locks++; + if (!dlm_add_lock_to_array(lock, mres, i)) + continue; + + /* this filled the lock message, + * we must send it immediately. */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, + res, total_locks); + if (ret < 0) + goto error; + } + } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ? "recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } + /* flush any remaining locks */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) + BUG(); + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); + return ret; +} + + + +/* + * this message will contain no more than one page worth of + * recovery data, and it will work on only one lockres. + * there may be many locks in this page, and we may need to wait + * for additional packets to complete all the locks (rare, but + * possible). + */ +/* + * NOTE: the allocation error cases here are scary + * we really cannot afford to fail an alloc in recovery + * do we spin? returning an error only delays the problem really + */ + +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_migratable_lockres *mres = + (struct dlm_migratable_lockres *)msg->buf; + int ret = 0; + u8 real_master; + u8 extra_refs = 0; + char *buf = NULL; + struct dlm_work_item *item = NULL; + struct dlm_lock_resource *res = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + real_master = mres->master; + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* cannot migrate a lockres with no master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + } + + mlog(0, "%s message received from node %u\n", + (mres->flags & DLM_MRES_RECOVERY) ? + "recovery" : "migration", mres->master); + if (mres->flags & DLM_MRES_ALL_DONE) + mlog(0, "all done flag. all lockres data received!\n"); + + ret = -ENOMEM; + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!buf || !item) + goto leave; + + /* lookup the lock to see if we have a secondary queue for this + * already... 
just add the locks in and this will have its owner + * and RECOVERY flag changed when it completes. */ + res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + if (res) { + /* this will get a ref on res */ + /* mark it as recovering/migrating and hash it */ + spin_lock(&res->spinlock); + if (mres->flags & DLM_MRES_RECOVERY) { + res->state |= DLM_LOCK_RES_RECOVERING; + } else { + if (res->state & DLM_LOCK_RES_MIGRATING) { + /* this is at least the second + * lockres message */ + mlog(0, "lock %.*s is already migrating\n", + mres->lockname_len, + mres->lockname); + } else if (res->state & DLM_LOCK_RES_RECOVERING) { + /* caller should BUG */ + mlog(ML_ERROR, "node is attempting to migrate " + "lock %.*s, but marked as recovering!\n", + mres->lockname_len, mres->lockname); + ret = -EFAULT; + spin_unlock(&res->spinlock); + goto leave; + } + res->state |= DLM_LOCK_RES_MIGRATING; + } + spin_unlock(&res->spinlock); + } else { + /* need to allocate, just like if it was + * mastered here normally */ + res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); + if (!res) + goto leave; + + /* to match the ref that we would have gotten if + * dlm_lookup_lockres had succeeded */ + dlm_lockres_get(res); + + /* mark it as recovering/migrating and hash it */ + if (mres->flags & DLM_MRES_RECOVERY) + res->state |= DLM_LOCK_RES_RECOVERING; + else + res->state |= DLM_LOCK_RES_MIGRATING; + + spin_lock(&dlm->spinlock); + __dlm_insert_lockres(dlm, res); + spin_unlock(&dlm->spinlock); + + /* Add an extra ref for this lock-less lockres lest the + * dlm_thread purges it before we get the chance to add + * locks to it */ + dlm_lockres_get(res); + + /* There are three refs that need to be put. + * 1. Taken above. + * 2. kref_init in dlm_new_lockres()->dlm_init_lockres(). + * 3. dlm_lookup_lockres() + * The first one is handled at the end of this function. The + * other two are handled in the worker thread after locks have + * been attached. Yes, we don't wait for purge time to match + * kref_init. The lockres will still have atleast one ref + * added because it is in the hash __dlm_insert_lockres() */ + extra_refs++; + + /* now that the new lockres is inserted, + * make it usable by other processes */ + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + + /* at this point we have allocated everything we need, + * and we have a hashed lockres with an extra ref and + * the proper res->state flags. */ + ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); + if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* migration cannot have an unknown master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + mlog(0, "recovery has passed me a lockres with an " + "unknown owner.. 
will need to requery: " + "%.*s\n", mres->lockname_len, mres->lockname); + } else { + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ + dlm_change_lockres_owner(dlm, res, dlm->node_num); + } + spin_unlock(&res->spinlock); + + /* queue up work for dlm_mig_lockres_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ + dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); + item->u.ml.lockres = res; /* already have a ref */ + item->u.ml.real_master = real_master; + item->u.ml.extra_ref = extra_refs; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + +leave: + /* One extra ref taken needs to be put here */ + if (extra_refs) + dlm_lockres_put(res); + + dlm_put(dlm); + if (ret < 0) { + if (buf) + kfree(buf); + if (item) + kfree(item); + mlog_errno(ret); + } + + return ret; +} + + +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_migratable_lockres *mres; + int ret = 0; + struct dlm_lock_resource *res; + u8 real_master; + u8 extra_ref; + + dlm = item->dlm; + mres = (struct dlm_migratable_lockres *)data; + + res = item->u.ml.lockres; + real_master = item->u.ml.real_master; + extra_ref = item->u.ml.extra_ref; + + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* this case is super-rare. only occurs if + * node death happens during migration. */ +again: + ret = dlm_lockres_master_requery(dlm, res, &real_master); + if (ret < 0) { + mlog(0, "dlm_lockres_master_requery ret=%d\n", + ret); + goto again; + } + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lockres %.*s not claimed. " + "this node will take it.\n", + res->lockname.len, res->lockname.name); + } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + mlog(0, "master needs to respond to sender " + "that node %u still owns %.*s\n", + real_master, res->lockname.len, + res->lockname.name); + /* cannot touch this lockres */ + goto leave; + } + } + + ret = dlm_process_recovery_data(dlm, res, mres); + if (ret < 0) + mlog(0, "dlm_process_recovery_data returned %d\n", ret); + else + mlog(0, "dlm_process_recovery_data succeeded\n"); + + if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == + (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { + ret = dlm_finish_migration(dlm, res, mres->master); + if (ret < 0) + mlog_errno(ret); + } + +leave: + /* See comment in dlm_mig_lockres_handler() */ + if (res) { + if (extra_ref) + dlm_lockres_put(res); + dlm_lockres_put(res); + } + kfree(data); +} + + + +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + + *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; + + /* we only reach here if one of the two nodes in a + * migration died while the migration was in progress. + * at this point we need to requery the master. we + * know that the new_master got as far as creating + * an mle on at least one node, but we do not know + * if any nodes had actually cleared the mle and set + * the master to the new_master. 
the old master + * is supposed to set the owner to UNKNOWN in the + * event of a new_master death, so the only possible + * responses that we can get from nodes here are + * that the master is new_master, or that the master + * is UNKNOWN. + * if all nodes come back with UNKNOWN then we know + * the lock needs remastering here. + * if any node comes back with a valid master, check + * to see if that master is the one that we are + * recovering. if so, then the new_master died and + * we need to remaster this lock. if not, then the + * new_master survived and that node will respond to + * other nodes about the owner. + * if there is an owner, this node needs to dump this + * lockres and alert the sender that this lockres + * was rejected. */ + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, real_master); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + /* host is down, so answer for that node would be + * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + } + if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lock master is %u\n", *real_master); + break; + } + } + return ret; +} + + +int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master) +{ + int ret = -EINVAL; + struct dlm_master_requery req; + int status = DLM_LOCK_RES_OWNER_UNKNOWN; + + memset(&req, 0, sizeof(req)); + req.node_idx = dlm->node_num; + req.namelen = res->lockname.len; + memcpy(req.name, res->lockname.name, res->lockname.len); + + ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, + &req, sizeof(req), nodenum, &status); + /* XXX: negative status not handled properly here. */ + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, + dlm->key, nodenum); + else { + BUG_ON(status < 0); + BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); + *real_master = (u8) (status & 0xff); + mlog(0, "node %u responded to master requery with %u\n", + nodenum, *real_master); + ret = 0; + } + return ret; +} + + +/* this function cannot error, so unless the sending + * or receiving of the message failed, the owner can + * be trusted */ +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; + struct dlm_lock_resource *res = NULL; + unsigned int hash; + int master = DLM_LOCK_RES_OWNER_UNKNOWN; + u32 flags = DLM_ASSERT_MASTER_REQUERY; + + if (!dlm_grab(dlm)) { + /* since the domain has gone away on this + * node, the proper response is UNKNOWN */ + return master; + } + + hash = dlm_lockid_hash(req->name, req->namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); + if (res) { + spin_lock(&res->spinlock); + master = res->owner; + if (master == dlm->node_num) { + int ret = dlm_dispatch_assert_master(dlm, res, + 0, 0, flags); + if (ret < 0) { + mlog_errno(-ENOMEM); + /* retry!? */ + BUG(); + } + } else /* put.. 
incase we are not the master */ + dlm_lockres_put(res); + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + return master; +} + +static inline struct list_head * +dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) +{ + struct list_head *ret; + BUG_ON(list_num < 0); + BUG_ON(list_num > 2); + ret = &(res->granted); + ret += list_num; + return ret; +} +/* TODO: do ast flush business + * TODO: do MIGRATING and RECOVERING spinning + */ + +/* +* NOTE about in-flight requests during migration: +* +* Before attempting the migrate, the master has marked the lockres as +* MIGRATING and then flushed all of its pending ASTS. So any in-flight +* requests either got queued before the MIGRATING flag got set, in which +* case the lock data will reflect the change and a return message is on +* the way, or the request failed to get in before MIGRATING got set. In +* this case, the caller will be told to spin and wait for the MIGRATING +* flag to be dropped, then recheck the master. +* This holds true for the convert, cancel and unlock cases, and since lvb +* updates are tied to these same messages, it applies to lvb updates as +* well. For the lock case, there is no way a lock can be on the master +* queue and not be on the secondary queue since the lock is always added +* locally first. This means that the new target node will never be sent +* a lock that he doesn't already have on the list. +* In total, this means that the local lock is correct and should not be +* updated to match the one sent by the master. Any messages sent back +* from the master before the MIGRATING flag will bring the lock properly +* up-to-date, and the change will be ordered properly for the waiter. +* We will *not* attempt to modify the lock underneath the waiter. +*/ + +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres) +{ + struct dlm_migratable_lock *ml; + struct list_head *queue; + struct list_head *tmpq = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + int ret = 0; + int i, j, bad; + struct dlm_lock *lock = NULL; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; + __be64 c; + + mlog(0, "running %d locks for this lockres\n", mres->num_locks); + for (i=0; i<mres->num_locks; i++) { + ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(from, res); + spin_unlock(&res->spinlock); + added++; + break; + } + BUG_ON(ml->highest_blocked != LKM_IVMODE); + newlock = NULL; + lksb = NULL; + + queue = dlm_list_num_to_pointer(res, ml->list); + tmpq = NULL; + + /* if the lock is for the local node it needs to + * be moved to the proper location within the queue. + * do not allocate a new lock structure. */ + if (ml->node == dlm->node_num) { + /* MIGRATION ONLY! */ + BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + + spin_lock(&res->spinlock); + for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { + tmpq = dlm_list_idx_to_ptr(res, j); + list_for_each_entry(lock, tmpq, list) { + if (lock->ml.cookie != ml->cookie) + lock = NULL; + else + break; + } + if (lock) + break; + } + + /* lock is always created locally first, and + * destroyed locally last. 
it must be on the list */ + if (!lock) { + c = ml->cookie; + mlog(ML_ERROR, "Could not find local lock " + "with cookie %u:%llu, node %u, " + "list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (lock->ml.node != ml->node) { + c = lock->ml.cookie; + mlog(ML_ERROR, "Mismatched node# in lock " + "cookie %u:%llu, name %.*s, node %u\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + res->lockname.len, res->lockname.name, + lock->ml.node); + c = ml->cookie; + mlog(ML_ERROR, "Migrate lock cookie %u:%llu, " + "node %u, list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (tmpq != queue) { + c = ml->cookie; + mlog(0, "Lock cookie %u:%llu was on list %u " + "instead of list %u for %.*s\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + j, ml->list, res->lockname.len, + res->lockname.name); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + continue; + } + + /* see NOTE above about why we do not update + * to match the master here */ + + /* move the lock to its proper place */ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, queue); + spin_unlock(&res->spinlock); + added++; + + mlog(0, "just reordered a local lock!\n"); + continue; + } + + /* lock is for another node. */ + newlock = dlm_new_lock(ml->type, ml->node, + be64_to_cpu(ml->cookie), NULL); + if (!newlock) { + ret = -ENOMEM; + goto leave; + } + lksb = newlock->lksb; + dlm_lock_attach_lockres(newlock, res); + + if (ml->convert_type != LKM_IVMODE) { + BUG_ON(queue != &res->converting); + newlock->ml.convert_type = ml->convert_type; + } + lksb->flags |= (ml->flags & + (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { + if (lksb->flags & DLM_LKSB_PUT_LVB) { + /* other node was trying to update + * lvb when node died. recreate the + * lksb with the updated lvb. */ + memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } else { + /* otherwise, the node is sending its + * most recent valid lvb info */ + BUG_ON(ml->type != LKM_EXMODE && + ml->type != LKM_PRMODE); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); + } + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } + } +skip_lvb: + + /* NOTE: + * wrt lock queue ordering and recovery: + * 1. order of locks on granted queue is + * meaningless. + * 2. order of locks on converting queue is + * LOST with the node death. sorry charlie. + * 3. 
order of locks on the blocked queue is + * also LOST. + * order of locks does not affect integrity, it + * just means that a lock request may get pushed + * back in line as a result of the node death. + * also note that for a given node the lock order + * for its secondary queue locks is preserved + * relative to each other, but clearly *not* + * preserved relative to locks from other nodes. + */ + bad = 0; + spin_lock(&res->spinlock); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + mlog(0, "%s:%.*s: added lock for node %u, " + "setting refmap bit\n", dlm->name, + res->lockname.len, res->lockname.name, ml->node); + dlm_lockres_set_refmap_bit(ml->node, res); + added++; + } + spin_unlock(&res->spinlock); + } + mlog(0, "done running all the locks\n"); + +leave: + /* balance the ref taken when the work was queued */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + + if (ret < 0) { + mlog_errno(ret); + if (newlock) + dlm_lock_put(newlock); + } + + return ret; +} + +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue; + struct dlm_lock *lock, *next; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + res->state |= DLM_LOCK_RES_RECOVERING; + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->recovering); + dlm_lockres_put(res); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); + list_add_tail(&res->recovering, &dlm->reco.resources); + + /* find any pending locks and put them back on proper list */ + for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + dlm_lock_get(lock); + if (lock->convert_pending) { + /* move converting lock back to granted */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with convert pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_revert_pending_convert(res, lock); + lock->convert_pending = 0; + } else if (lock->lock_pending) { + /* remove pending lock requests completely */ + BUG_ON(i != DLM_BLOCKED_LIST); + mlog(0, "node died with lock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + /* lock will be floating until ref in + * dlmlock_remote is freed after the network + * call returns. ok for it to not be on any + * list since no ast can be called + * (the master is dead). 
*/ + dlm_revert_pending_lock(res, lock); + lock->lock_pending = 0; + } else if (lock->unlock_pending) { + /* if an unlock was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master. note that the dlm_unlock + * call is still responsible for calling + * the unlockast. that will happen after + * the network call times out. for now, + * just move lists to prepare the new + * recovery master. */ + BUG_ON(i != DLM_GRANTED_LIST); + mlog(0, "node died with unlock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_unlock(res, lock); + lock->unlock_pending = 0; + } else if (lock->cancel_pending) { + /* if a cancel was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with cancel pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_cancel(res, lock); + lock->cancel_pending = 0; + } + dlm_lock_put(lock); + } + } +} + + + +/* removes all recovered locks from the recovery list. + * sets the res->owner to the new master. + * unsets the RECOVERY flag and wakes waiters. */ +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master) +{ + int i; + struct hlist_node *hash_iter; + struct hlist_head *bucket; + struct dlm_lock_resource *res, *next; + + assert_spin_locked(&dlm->spinlock); + + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { + if (res->owner == dead_node) { + list_del_init(&res->recovering); + spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + } + + /* this will become unnecessary eventually, but + * for now we need to run the whole hash, clear + * the RECOVERING state and set the owner + * if necessary */ + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_lockres_hash(dlm, i); + hlist_for_each_entry(res, hash_iter, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->owner == dead_node) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, but " + "clearing state anyway\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else if (res->owner == dlm->node_num) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, " + "owner is THIS node, clearing\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else + continue; + + if (!list_empty(&res->recovering)) { + mlog(0, "%s:%.*s: lockres was " + "marked RECOVERING, owner=%u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + list_del_init(&res->recovering); + dlm_lockres_put(res); + } + spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } + } +} + +static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) +{ + if (local) { + if 
(lock->ml.type != LKM_EXMODE && + lock->ml.type != LKM_PRMODE) + return 1; + } else if (lock->ml.type == LKM_EXMODE) + return 1; + return 0; +} + +static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *queue; + struct dlm_lock *lock; + int blank_lvb = 0, local = 0; + int i; + u8 search_node; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (res->owner == dlm->node_num) + /* if this node owned the lockres, and if the dead node + * had an EX when he died, blank out the lvb */ + search_node = dead_node; + else { + /* if this is a secondary lockres, and we had no EX or PR + * locks granted, we can no longer trust the lvb */ + search_node = dlm->node_num; + local = 1; /* check local state for valid lvb */ + } + + for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node == search_node) { + if (dlm_lvb_needs_invalidation(lock, local)) { + /* zero the lksb lvb and lockres lvb */ + blank_lvb = 1; + memset(lock->lksb->lvb, 0, DLM_LVB_LEN); + } + } + } + } + + if (blank_lvb) { + mlog(0, "clearing %.*s lvb, dead node %u had EX\n", + res->lockname.len, res->lockname.name, dead_node); + memset(res->lvb, 0, DLM_LVB_LEN); + } +} + +static void dlm_free_dead_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct dlm_lock *lock, *next; + unsigned int freed = 0; + + /* this node is the lockres master: + * 1) remove any stale locks for the dead node + * 2) if the dead node had an EX when he died, blank out the lvb + */ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* We do two dlm_lock_put(). One for removing from list and the other is + * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ + + /* TODO: check pending_asts, pending_basts here */ + list_for_each_entry_safe(lock, next, &res->granted, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + list_for_each_entry_safe(lock, next, &res->converting, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + list_for_each_entry_safe(lock, next, &res->blocked, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + + if (freed) { + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " + "dropping ref from lockres\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + if(!test_bit(dead_node, res->refmap)) { + mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, " + "but ref was not set\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + __dlm_print_one_lock_resource(res); + } + dlm_lockres_clear_refmap_bit(dead_node, res); + } else if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", dlm->name, + res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dead_node, res); + } + + /* do not kick thread yet */ + __dlm_dirty_lockres(dlm, res); +} + +/* if this node is the recovery master, and there are no 
+ * locks for a given lockres owned by this node that are in + * either PR or EX mode, zero out the lvb before requesting. + * + */ + + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct hlist_node *iter; + struct dlm_lock_resource *res; + int i; + struct hlist_head *bucket; + struct dlm_lock *lock; + + + /* purge any stale mles */ + dlm_clean_master_list(dlm, dead_node); + + /* + * now clean up all lock resources. there are two rules: + * + * 1) if the dead node was the master, move the lockres + * to the recovering list. set the RECOVERING flag. + * this lockres needs to be cleaned up before it can + * be used further. + * + * 2) if this node was the master, remove all locks from + * each of the lockres queues that were owned by the + * dead node. once recovery finishes, the dlm thread + * can be kicked again to see if any ASTs or BASTs + * need to be fired as a result. + */ + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_lockres_hash(dlm, i); + hlist_for_each_entry(res, iter, bucket, hash_node) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); + continue; + } + spin_lock(&res->spinlock); + /* zero the lvb if necessary */ + dlm_revalidate_lvb(dlm, res, dead_node); + if (res->owner == dead_node) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "Ignore %.*s for " + "recovery as it is being freed\n", + res->lockname.len, + res->lockname.name); + } else + dlm_move_lockres_to_recovery_list(dlm, + res); + + } else if (res->owner == dlm->node_num) { + dlm_free_dead_locks(dlm, res, dead_node); + __dlm_lockres_calc_usage(dlm, res); + } + spin_unlock(&res->spinlock); + } + } + +} + +static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) +{ + assert_spin_locked(&dlm->spinlock); + + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + + /* Clean up join state on node death. */ + if (dlm->joining_node == idx) { + mlog(0, "Clearing join state for node %u\n", idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + } + + /* check to see if the node is already considered dead */ + if (!test_bit(idx, dlm->live_nodes_map)) { + mlog(0, "for domain %s, node %d is already dead. " + "another node likely did recovery already.\n", + dlm->name, idx); + return; + } + + /* check to see if we do not care about this node */ + if (!test_bit(idx, dlm->domain_map)) { + /* This also catches the case that we get a node down + * but haven't joined the domain yet. 
*/ + mlog(0, "node %u already removed from domain!\n", idx); + return; + } + + clear_bit(idx, dlm->live_nodes_map); + + /* make sure local cleanup occurs before the heartbeat events */ + if (!test_bit(idx, dlm->recovery_map)) + dlm_do_local_recovery_cleanup(dlm, idx); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 0); + + mlog(0, "node %u being removed from domain map!\n", idx); + clear_bit(idx, dlm->domain_map); + clear_bit(idx, dlm->exit_domain_map); + /* wake up migration waiters if a node goes down. + * perhaps later we can genericize this for other waiters. */ + wake_up(&dlm->migration_wq); + + if (test_bit(idx, dlm->recovery_map)) + mlog(0, "domain %s, node %u already added " + "to recovery map!\n", dlm->name, idx); + else + set_bit(idx, dlm->recovery_map); +} + +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + /* + * This will notify any dlm users that a node in our domain + * went away without notifying us first. + */ + if (test_bit(idx, dlm->domain_map)) + dlm_fire_domain_eviction_callbacks(dlm, idx); + + spin_lock(&dlm->spinlock); + __dlm_hb_node_down(dlm, idx); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + set_bit(idx, dlm->live_nodes_map); + /* do NOT notify mle attached to the heartbeat events. + * new nodes are not interesting in mastery until joined. */ + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +static void dlm_reco_ast(void *astdata) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_bast(void *astdata, int blocked_type) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) +{ + mlog(0, "unlockast for recovery lock fired!\n"); +} + +/* + * dlm_pick_recovery_master will continually attempt to use + * dlmlock() on the special "$RECOVERY" lockres with the + * LKM_NOQUEUE flag to get an EX. every thread that enters + * this function on each node racing to become the recovery + * master will not stop attempting this until either: + * a) this node gets the EX (and becomes the recovery master), + * or b) dlm->reco.new_master gets set to some nodenum + * != O2NM_INVALID_NODE_NUM (another node will do the reco). + * so each time a recovery master is needed, the entire cluster + * will sync at this point. if the new master dies, that will + * be detected in dlm_do_recovery */ +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) +{ + enum dlm_status ret; + struct dlm_lockstatus lksb; + int status = -EINVAL; + + mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", + dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); +again: + memset(&lksb, 0, sizeof(lksb)); + + ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, + DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN, + dlm_reco_ast, dlm, dlm_reco_bast); + + mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", + dlm->name, ret, lksb.status); + + if (ret == DLM_NORMAL) { + mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", + dlm->name, dlm->node_num); + + /* got the EX lock. 
check to see if another node + * just became the reco master */ + if (dlm_reco_master_ready(dlm)) { + mlog(0, "%s: got reco EX lock, but %u will " + "do the recovery\n", dlm->name, + dlm->reco.new_master); + status = -EEXIST; + } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + /* this always succeeds */ + BUG_ON(status); + + /* set the new_master to this node */ + spin_lock(&dlm->spinlock); + dlm_set_reco_master(dlm, dlm->node_num); + spin_unlock(&dlm->spinlock); + } + + /* recovery lock is a special case. ast will not get fired, + * so just go ahead and unlock it. */ + ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret == DLM_DENIED) { + mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); + ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); + } + if (ret != DLM_NORMAL) { + /* this would really suck. this could only happen + * if there was a network error during the unlock + * because of node death. this means the unlock + * is actually "done" and the lock structure is + * even freed. we can continue, but only + * because this specific lock name is special. */ + mlog(ML_ERROR, "dlmunlock returned %d\n", ret); + } + } else if (ret == DLM_NOTQUEUED) { + mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", + dlm->name, dlm->node_num); + /* another node is master. 
wait on + * reco.new_master != O2NM_INVALID_NODE_NUM + * for at most one second */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_reco_master_ready(dlm), + msecs_to_jiffies(1000)); + if (!dlm_reco_master_ready(dlm)) { + mlog(0, "%s: reco master taking awhile\n", + dlm->name); + goto again; + } + /* another node has informed this one that it is reco master */ + mlog(0, "%s: reco master %u is ready to recover %u\n", + dlm->name, dlm->reco.new_master, dlm->reco.dead_node); + status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; + } else { + struct dlm_lock_resource *res; + + /* dlmlock returned something other than NOTQUEUED or NORMAL */ + mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " + "lksb.status=%s\n", dlm->name, dlm_errname(ret), + dlm_errname(lksb.status)); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + BUG(); + } + + return status; +} + +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_begin_reco br; + int ret = 0; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + clear_bit(dead_node, iter.node_map); + + memset(&br, 0, sizeof(br)); + br.node_idx = dlm->node_num; + br.dead_node = dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = 0; + if (nodenum == dead_node) { + mlog(0, "not sending begin reco to dead node " + "%u\n", dead_node); + continue; + } + if (nodenum == dlm->node_num) { + mlog(0, "not sending begin reco to self\n"); + continue; + } +retry: + ret = -EINVAL; + mlog(0, "attempting to send begin reco msg to %d\n", + nodenum); + ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, + &br, sizeof(br), nodenum, &status); + /* negative status is handled ok by caller here */ + if (ret >= 0) + ret = status; + if (dlm_is_host_down(ret)) { + /* node is down. not involved in recovery + * so just keep going */ + mlog(ML_NOTICE, "%s: node %u was down when sending " + "begin reco msg (%d)\n", dlm->name, nodenum, ret); + ret = 0; + } + + /* + * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8, + * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN. + * We are handling both for compatibility reasons. + */ + if (ret == -EAGAIN || ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } + if (ret < 0) { + struct dlm_lock_resource *res; + + /* this is now a serious problem, possibly ENOMEM + * in the network stack. 
must retry */ + mlog_errno(ret); + mlog(ML_ERROR, "begin reco of dlm %s to node %u " + "returned %d\n", dlm->name, nodenum, ret); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + /* sleep for a bit in hopes that we can avoid + * another ENOMEM */ + msleep(100); + goto retry; + } + } + + return ret; +} + +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return -EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + + dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); + + spin_lock(&dlm->spinlock); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "%s: new_master %u died, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + } else { + mlog(0, "%s: new_master %u NOT DEAD, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + /* may not have seen the new master as dead yet */ + } + } + if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: dead_node previously set to %u, " + "node %u changing it to %u\n", dlm->name, + dlm->reco.dead_node, br->node_idx, br->dead_node); + } + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); + if (!test_bit(br->dead_node, dlm->recovery_map)) { + mlog(0, "recovery master %u sees %u as dead, but this " + "node has not yet. 
marking %u as dead\n", + br->node_idx, br->dead_node, br->dead_node); + if (!test_bit(br->dead_node, dlm->domain_map) || + !test_bit(br->dead_node, dlm->live_nodes_map)) + mlog(0, "%u not in domain/live_nodes map " + "so setting it in reco map manually\n", + br->dead_node); + /* force the recovery cleanup in __dlm_hb_node_down + * both of these will be cleared in a moment */ + set_bit(br->dead_node, dlm->domain_map); + set_bit(br->dead_node, dlm->live_nodes_map); + __dlm_hb_node_down(dlm, br->dead_node); + } + spin_unlock(&dlm->spinlock); + + dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + + dlm_put(dlm); + return 0; +} + +#define DLM_FINALIZE_STAGE2 0x01 +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) +{ + int ret = 0; + struct dlm_finalize_reco fr; + struct dlm_node_iter iter; + int nodenum; + int status; + int stage = 1; + + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + +stage2: + memset(&fr, 0, sizeof(fr)); + fr.node_idx = dlm->node_num; + fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + if (nodenum == dlm->node_num) + continue; + ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, + &fr, sizeof(fr), nodenum, &status); + if (ret >= 0) + ret = status; + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG, + dlm->key, nodenum); + if (dlm_is_host_down(ret)) { + /* this has no effect on this recovery + * session, so set the status to zero to + * finish out the last recovery */ + mlog(ML_ERROR, "node %u went down after this " + "node finished recovery.\n", nodenum); + ret = 0; + continue; + } + break; + } + } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } + + return ret; +} + +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + + spin_lock(&dlm->spinlock); + + if (dlm->reco.new_master != fr->node_idx) { + mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " + "%u is supposed to be the new master, dead=%u\n", + fr->node_idx, dlm->reco.new_master, fr->dead_node); + BUG(); + } + if (dlm->reco.dead_node != fr->dead_node) { + mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " + "node %u, but node %u is supposed to be dead\n", + fr->node_idx, fr->dead_node, dlm->reco.dead_node); + BUG(); + } + + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + 
dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } + + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); + + dlm_put(dlm); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c new file mode 100644 index 00000000..1d6d1d22 --- /dev/null +++ b/fs/ocfs2/dlm/dlmthread.c @@ -0,0 +1,762 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmthread.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) +#include "cluster/masklog.h" + +static int dlm_thread(void *data); +static void dlm_flush_asts(struct dlm_ctxt *dlm); + +#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) + +/* will exit holding res->spinlock, but may drop in function */ +/* waits until flags are cleared on res->state */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + + assert_spin_locked(&res->spinlock); + + add_wait_queue(&res->wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (res->state & flags) { + spin_unlock(&res->spinlock); + schedule(); + spin_lock(&res->spinlock); + goto repeat; + } + remove_wait_queue(&res->wq, &wait); + __set_current_state(TASK_RUNNING); +} + +int __dlm_lockres_has_locks(struct dlm_lock_resource *res) +{ + if (list_empty(&res->granted) && + list_empty(&res->converting) && + list_empty(&res->blocked)) + return 0; + return 1; +} + +/* "unused": the lockres has no locks, is not on the dirty list, + * has no inflight locks (in the gap between mastery and acquiring + * the first lock), and has no bits in its refmap. + * truly ready to be freed. 
*/ +int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + int bit; + + if (__dlm_lockres_has_locks(res)) + return 0; + + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) + return 0; + + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) + return 0; + + /* + * since the bit for dlm->node_num is not set, inflight_locks better + * be zero + */ + BUG_ON(res->inflight_locks != 0); + return 1; +} + + +/* Call whenever you may have added or deleted something from one of + * the lockres queue's. This will figure out whether it belongs on the + * unused list or not and does the appropriate thing. */ +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (__dlm_lockres_unused(res)){ + if (list_empty(&res->purge)) { + mlog(0, "%s: Adding res %.*s to purge list\n", + dlm->name, res->lockname.len, res->lockname.name); + + res->last_used = jiffies; + dlm_lockres_get(res); + list_add_tail(&res->purge, &dlm->purge_list); + dlm->purge_count++; + } + } else if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purge list\n", + dlm->name, res->lockname.len, res->lockname.name); + + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } +} + +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + + __dlm_lockres_calc_usage(dlm, res); + + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); +} + +static void dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int master; + int ret = 0; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + master = (res->owner == dlm->node_num); + + mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name, + res->lockname.len, res->lockname.name, master); + + if (!master) { + res->state |= DLM_LOCK_RES_DROPPING_REF; + /* drop spinlock... retake below */ + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + /* This ensures that clear refmap is sent after the set */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + spin_unlock(&res->spinlock); + + /* clear our bit from the master's refmap, ignore errors */ + ret = dlm_drop_lockres_ref(dlm, res); + if (ret < 0) { + mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, + res->lockname.len, res->lockname.name, ret); + if (!dlm_is_host_down(ret)) + BUG(); + } + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + } + + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist, master %d\n", + dlm->name, res->lockname.len, res->lockname.name, master); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(res); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. 
*/ + if (!master) { + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } else + spin_unlock(&res->spinlock); +} + +static void dlm_run_purge_list(struct dlm_ctxt *dlm, + int purge_now) +{ + unsigned int run_max, unused; + unsigned long purge_jiffies; + struct dlm_lock_resource *lockres; + + spin_lock(&dlm->spinlock); + run_max = dlm->purge_count; + + while(run_max && !list_empty(&dlm->purge_list)) { + run_max--; + + lockres = list_entry(dlm->purge_list.next, + struct dlm_lock_resource, purge); + + spin_lock(&lockres->spinlock); + + purge_jiffies = lockres->last_used + + msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); + + /* Make sure that we want to be processing this guy at + * this time. */ + if (!purge_now && time_after(purge_jiffies, jiffies)) { + /* Since resources are added to the purge list + * in tail order, we can stop at the first + * unpurgable resource -- anyone added after + * him will have a greater last_used value */ + spin_unlock(&lockres->spinlock); + break; + } + + /* Status of the lockres *might* change so double + * check. If the lockres is unused, holding the dlm + * spinlock will prevent people from getting and more + * refs on it. */ + unused = __dlm_lockres_unused(lockres); + if (!unused || + (lockres->state & DLM_LOCK_RES_MIGRATING)) { + mlog(0, "%s: res %.*s is in use or being remastered, " + "used %d, state %d\n", dlm->name, + lockres->lockname.len, lockres->lockname.name, + !unused, lockres->state); + list_move_tail(&dlm->purge_list, &lockres->purge); + spin_unlock(&lockres->spinlock); + continue; + } + + dlm_lockres_get(lockres); + + dlm_purge_lockres(dlm, lockres); + + dlm_lockres_put(lockres); + + /* Avoid adding any scheduling latencies */ + cond_resched_lock(&dlm->spinlock); + } + + spin_unlock(&dlm->spinlock); +} + +static void dlm_shuffle_lists(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_lock *lock, *target; + struct list_head *iter; + struct list_head *head; + int can_grant = 1; + + /* + * Because this function is called with the lockres + * spinlock, and because we know that it is not migrating/ + * recovering/in-progress, it is fine to reserve asts and + * basts right before queueing them all throughout + */ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&res->spinlock); + BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_IN_PROGRESS))); + +converting: + if (list_empty(&res->converting)) + goto blocked; + mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name, + res->lockname.len, res->lockname.name); + + target = list_entry(res->converting.next, struct dlm_lock, list); + if (target->ml.convert_type == LKM_IVMODE) { + mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n", + dlm->name, res->lockname.len, res->lockname.name); + BUG(); + } + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + /* queue the BAST if not already */ + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + /* update the highest_blocked if needed */ + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if 
(!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + + /* we can convert the lock */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type " + "%d => %d, node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, + target->ml.convert_type, target->ml.node); + + target->ml.type = target->ml.convert_type; + target->ml.convert_type = LKM_IVMODE; + list_move_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + __dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +blocked: + if (list_empty(&res->blocked)) + goto leave; + target = list_entry(res->blocked.next, struct dlm_lock, list); + + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + /* we can grant the blocked lock (only + * possible if converting list empty) */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, target->ml.node); + + /* target->ml.type is already correct */ + list_move_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + __dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +leave: + return; +} + +/* must have NO locks when calling this with res !=NULL * */ +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + if (res) { + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + } + wake_up(&dlm->dlm_thread_wq); +} + +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* don't shuffle secondary queues */ + if ((res->owner == 
dlm->node_num)) { + if (res->state & (DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_BLOCK_DIRTY)) + return; + + if (list_empty(&res->dirty)) { + /* ref for dirty_list */ + dlm_lockres_get(res); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } + } + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); +} + + +/* Launch the NM thread for the mounted volume */ +int dlm_launch_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "Starting dlm_thread...\n"); + + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + if (IS_ERR(dlm->dlm_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_thread_task)); + dlm->dlm_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_thread_task) { + mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n"); + kthread_stop(dlm->dlm_thread_task); + dlm->dlm_thread_task = NULL; + } +} + +static int dlm_dirty_list_empty(struct dlm_ctxt *dlm) +{ + int empty; + + spin_lock(&dlm->spinlock); + empty = list_empty(&dlm->dirty_list); + spin_unlock(&dlm->spinlock); + + return empty; +} + +static void dlm_flush_asts(struct dlm_ctxt *dlm) +{ + int ret; + struct dlm_lock *lock; + struct dlm_lock_resource *res; + u8 hi; + + spin_lock(&dlm->ast_lock); + while (!list_empty(&dlm->pending_asts)) { + lock = list_entry(dlm->pending_asts.next, + struct dlm_lock, ast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.node); + + BUG_ON(!lock->ast_pending); + + /* remove from list (including ref) */ + list_del_init(&lock->ast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_do_remote_ast(dlm, res, lock); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_ast(dlm, res, lock); + + spin_lock(&dlm->ast_lock); + + /* possible that another ast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->ast_list)) { + mlog(0, "%s: res %.*s, AST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); + } else + lock->ast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. 
*/ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + + while (!list_empty(&dlm->pending_basts)) { + lock = list_entry(dlm->pending_basts.next, + struct dlm_lock, bast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + + BUG_ON(!lock->bast_pending); + + /* get the highest blocked lock, and reset */ + spin_lock(&lock->spinlock); + BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE); + hi = lock->ml.highest_blocked; + lock->ml.highest_blocked = LKM_IVMODE; + spin_unlock(&lock->spinlock); + + /* remove from list (including ref) */ + list_del_init(&lock->bast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, " + "blocked %d, node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + hi, lock->ml.node); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_send_proxy_bast(dlm, res, lock, hi); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_bast(dlm, res, lock, hi); + + spin_lock(&dlm->ast_lock); + + /* possible that another bast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->bast_list)) { + mlog(0, "%s: res %.*s, BAST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); + } else + lock->bast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. */ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + wake_up(&dlm->ast_wq); + spin_unlock(&dlm->ast_lock); +} + + +#define DLM_THREAD_TIMEOUT_MS (4 * 1000) +#define DLM_THREAD_MAX_DIRTY 100 +#define DLM_THREAD_MAX_ASTS 10 + +static int dlm_thread(void *data) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + int n = DLM_THREAD_MAX_DIRTY; + + /* dlm_shutting_down is very point-in-time, but that + * doesn't matter as we'll just loop back around if we + * get false on the leading edge of a state + * transition. */ + dlm_run_purge_list(dlm, dlm_shutting_down(dlm)); + + /* We really don't want to hold dlm->spinlock while + * calling dlm_shuffle_lists on each lockres that + * needs to have its queues adjusted and AST/BASTs + * run. So let's pull each entry off the dirty_list + * and drop dlm->spinlock ASAP. Once off the list, + * res->spinlock needs to be taken again to protect + * the queues while calling dlm_shuffle_lists. 
*/ + spin_lock(&dlm->spinlock); + while (!list_empty(&dlm->dirty_list)) { + int delay = 0; + res = list_entry(dlm->dirty_list.next, + struct dlm_lock_resource, dirty); + + /* peel a lockres off, remove it from the list, + * unset the dirty flag and drop the dlm lock */ + BUG_ON(!res); + dlm_lockres_get(res); + + spin_lock(&res->spinlock); + /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */ + list_del_init(&res->dirty); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); + + /* lockres can be re-dirtied/re-added to the + * dirty_list in this gap, but that is ok */ + + spin_lock(&dlm->ast_lock); + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d," + " dirty %d\n", dlm->name, + !!(res->state & DLM_LOCK_RES_IN_PROGRESS), + !!(res->state & DLM_LOCK_RES_MIGRATING), + !!(res->state & DLM_LOCK_RES_RECOVERING), + !!(res->state & DLM_LOCK_RES_DIRTY)); + } + BUG_ON(res->owner != dlm->node_num); + + /* it is now ok to move lockreses in these states + * to the dirty list, assuming that they will only be + * dirty for a short while. */ + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + if (res->state & (DLM_LOCK_RES_IN_PROGRESS | + DLM_LOCK_RES_RECOVERING)) { + /* move it to the tail and keep going */ + res->state &= ~DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); + mlog(0, "%s: res %.*s, inprogress, delay list " + "shuffle, state %d\n", dlm->name, + res->lockname.len, res->lockname.name, + res->state); + delay = 1; + goto in_progress; + } + + /* at this point the lockres is not migrating/ + * recovering/in-progress. we have the lockres + * spinlock and do NOT have the dlm lock. + * safe to reserve/queue asts and run the lists. */ + + /* called while holding lockres lock */ + dlm_shuffle_lists(dlm, res); + res->state &= ~DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); + + dlm_lockres_calc_usage(dlm, res); + +in_progress: + + spin_lock(&dlm->spinlock); + /* if the lock was in-progress, stick + * it on the back of the list */ + if (delay) { + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + } + dlm_lockres_put(res); + + /* unlikely, but we may need to give time to + * other tasks */ + if (!--n) { + mlog(0, "%s: Throttling dlm thread\n", + dlm->name); + break; + } + } + + spin_unlock(&dlm->spinlock); + dlm_flush_asts(dlm); + + /* yield and continue right away if there is more work to do */ + if (!n) { + cond_resched(); + continue; + } + + wait_event_interruptible_timeout(dlm->dlm_thread_wq, + !dlm_dirty_list_empty(dlm) || + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM thread\n"); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c new file mode 100644 index 00000000..850aa7e8 --- /dev/null +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -0,0 +1,692 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmunlock.c + * + * underlying calls for unlocking locks + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +#define DLM_UNLOCK_FREE_LOCK 0x00000001 +#define DLM_UNLOCK_CALL_AST 0x00000002 +#define DLM_UNLOCK_REMOVE_LOCK 0x00000004 +#define DLM_UNLOCK_REGRANT_LOCK 0x00000008 +#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010 + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); + +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner); + + +/* + * according to the spec: + * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf + * + * flags & LKM_CANCEL != 0: must be converting or blocked + * flags & LKM_CANCEL == 0: must be granted + * + * So to unlock a converting lock, you must first cancel the + * convert (passing LKM_CANCEL in flags), then call the unlock + * again (with no LKM_CANCEL in flags). + */ + + +/* + * locking: + * caller needs: none + * taken: res->spinlock and lock->spinlock taken and dropped + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + * all callers should have taken an extra ref on lock coming in + */ +static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast, + int master_node) +{ + enum dlm_status status; + int actions = 0; + int in_use; + u8 owner; + + mlog(0, "master_node = %d, valblk = %d\n", master_node, + flags & LKM_VALBLK); + + if (master_node) + BUG_ON(res->owner != dlm->node_num); + else + BUG_ON(res->owner == dlm->node_num); + + spin_lock(&dlm->ast_lock); + /* We want to be sure that we're not freeing a lock + * that still has AST's pending... 
*/ + in_use = !list_empty(&lock->ast_list); + spin_unlock(&dlm->ast_lock); + if (in_use && !(flags & LKM_CANCEL)) { + mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " + "while waiting for an ast!", res->lockname.len, + res->lockname.name); + return DLM_BADPARAM; + } + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_IN_PROGRESS) { + if (master_node && !(flags & LKM_CANCEL)) { + mlog(ML_ERROR, "lockres in progress!\n"); + spin_unlock(&res->spinlock); + return DLM_FORWARD; + } + /* ok for this to sleep if not in a network handler */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + } + spin_lock(&lock->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) { + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + status = DLM_MIGRATING; + goto leave; + } + + /* see above for what the spec says about + * LKM_CANCEL and the lock queue state */ + if (flags & LKM_CANCEL) + status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions); + else + status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); + + if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node)) + goto leave; + + /* By now this has been masked out of cancel requests. */ + if (flags & LKM_VALBLK) { + /* make the final update to the lvb */ + if (master_node) + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + else + flags |= LKM_PUT_LVB; /* let the send function + * handle it. */ + } + + if (!master_node) { + owner = res->owner; + /* drop locks and send message */ + if (flags & LKM_CANCEL) + lock->cancel_pending = 1; + else + lock->unlock_pending = 1; + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, + flags, owner); + spin_lock(&res->spinlock); + spin_lock(&lock->spinlock); + /* if the master told us the lock was already granted, + * let the ast handle all of these actions */ + if (status == DLM_CANCELGRANT) { + actions &= ~(DLM_UNLOCK_REMOVE_LOCK| + DLM_UNLOCK_REGRANT_LOCK| + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* must clear the actions because this unlock + * is about to be retried. cannot free or do + * any list manipulation. */ + mlog(0, "%s:%.*s: clearing actions, %s\n", + dlm->name, res->lockname.len, + res->lockname.name, + status==DLM_RECOVERING?"recovering": + (status==DLM_MIGRATING?"migrating": + "forward")); + actions = 0; + } + if (flags & LKM_CANCEL) + lock->cancel_pending = 0; + else + lock->unlock_pending = 0; + + } + + /* get an extra ref on lock. if we are just switching + * lists here, we dont want the lock to go away. */ + dlm_lock_get(lock); + + if (actions & DLM_UNLOCK_REMOVE_LOCK) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_REGRANT_LOCK) { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + } + if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) { + mlog(0, "clearing convert_type at %smaster node\n", + master_node ? 
"" : "non-"); + lock->ml.convert_type = LKM_IVMODE; + } + + /* remove the extra ref on lock */ + dlm_lock_put(lock); + +leave: + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + if (!dlm_lock_on_list(&res->converting, lock)) + BUG_ON(lock->ml.convert_type != LKM_IVMODE); + else + BUG_ON(lock->ml.convert_type == LKM_IVMODE); + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* let the caller's final dlm_lock_put handle the actual kfree */ + if (actions & DLM_UNLOCK_FREE_LOCK) { + /* this should always be coupled with list removal */ + BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); + mlog(0, "lock %u:%llu should be gone now! refs=%d\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount)-1); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_CALL_AST) + *call_ast = 1; + + /* if cancel or unlock succeeded, lvb work is done */ + if (status == DLM_NORMAL) + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); + + return status; +} + +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* leave DLM_LKSB_PUT_LVB on the lksb so any final + * update of the lvb will be sent to the new master */ + list_del_init(&lock->list); +} + +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + list_move_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; +} + + +static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1); +} + +static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0); +} + +/* + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + */ +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner) +{ + struct dlm_unlock_lock unlock; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); + + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. 
just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + + memset(&unlock, 0, sizeof(unlock)); + unlock.node_idx = dlm->node_num; + unlock.flags = cpu_to_be32(flags); + unlock.cookie = lock->ml.cookie; + unlock.namelen = res->lockname.len; + memcpy(unlock.name, res->lockname.name, unlock.namelen); + + vec[0].iov_len = sizeof(struct dlm_unlock_lock); + vec[0].iov_base = &unlock; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key, + vec, veclen, owner, &status); + if (tmpret >= 0) { + // successfully sent and received + if (status == DLM_FORWARD) + mlog(0, "master was in-progress. retry\n"); + ret = status; + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner); + if (dlm_is_host_down(tmpret)) { + /* NOTE: this seems strange, but it is what we want. + * when the master goes down during a cancel or + * unlock, the recovery code completes the operation + * as if the master had not died, then passes the + * updated state to the recovery master. this thread + * just needs to finish out the operation and call + * the unlockast. */ + ret = DLM_NORMAL; + } else { + /* something bad. this will BUG in ocfs2 */ + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, + * return value from dlmunlock_master + */ +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + enum dlm_status status = DLM_NORMAL; + int found = 0, i; + struct dlm_lockstatus *lksb = NULL; + int ignore; + u32 flags; + struct list_head *queue; + + flags = be32_to_cpu(unlock->flags); + + if (flags & LKM_GET_LVB) { + mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n"); + return DLM_BADARGS; + } + + if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { + mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " + "request!\n"); + return DLM_BADARGS; + } + + if (unlock->namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); + return DLM_IVBUFLEN; + } + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); + + res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); + if (!res) { + /* We assume here that a no lock resource simply means + * it was migrated away and destroyed before the other + * node could detect it. 
*/ + mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); + status = DLM_FORWARD; + goto not_found; + } + + queue=&res->granted; + found = 0; + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_RECOVERING\n"); + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MIGRATING\n"); + status = DLM_MIGRATING; + goto leave; + } + + if (res->owner != dlm->node_num) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_FORWARD -- not master\n"); + status = DLM_FORWARD; + goto leave; + } + + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == unlock->cookie && + lock->ml.node == unlock->node_idx) { + dlm_lock_get(lock); + found = 1; + break; + } + } + if (found) + break; + /* scan granted -> converting -> blocked queues */ + queue++; + } + spin_unlock(&res->spinlock); + if (!found) { + status = DLM_IVLOCKID; + goto not_found; + } + + /* lock was found on queue */ + lksb = lock->lksb; + if (flags & (LKM_VALBLK|LKM_PUT_LVB) && + lock->ml.type != LKM_EXMODE) + flags &= ~(LKM_VALBLK|LKM_PUT_LVB); + + /* unlockast only called on originating node */ + if (flags & LKM_PUT_LVB) { + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN); + } + + /* if this is in-progress, propagate the DLM_FORWARD + * all the way back out */ + status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore); + if (status == DLM_FORWARD) + mlog(0, "lockres is in progress\n"); + + if (flags & LKM_PUT_LVB) + lksb->flags &= ~DLM_LKSB_PUT_LVB; + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + +not_found: + if (!found) + mlog(ML_ERROR, "failed to find lock to unlock! " + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie))); + else + dlm_lock_put(lock); + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + if (dlm_lock_on_list(&res->blocked, lock)) { + /* cancel this outright */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } else if (dlm_lock_on_list(&res->converting, lock)) { + /* cancel the request, put back on granted */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK | + DLM_UNLOCK_REGRANT_LOCK | + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (dlm_lock_on_list(&res->granted, lock)) { + /* too late, already granted. 
*/ + status = DLM_CANCELGRANT; + *actions = DLM_UNLOCK_CALL_AST; + } else { + mlog(ML_ERROR, "lock to cancel is not on any list!\n"); + status = DLM_IVLOCKID; + *actions = 0; + } + return status; +} + +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + /* unlock request */ + if (!dlm_lock_on_list(&res->granted, lock)) { + status = DLM_DENIED; + dlm_error(status); + *actions = 0; + } else { + /* unlock granted lock */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_FREE_LOCK | + DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } + return status; +} + +/* there seems to be no point in doing this async + * since (even for the remote case) there is really + * no work to queue up... so just do it and fire the + * unlockast by hand when done... */ +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, + int flags, dlm_astunlockfunc_t *unlockast, void *data) +{ + enum dlm_status status; + struct dlm_lock_resource *res; + struct dlm_lock *lock = NULL; + int call_ast, is_master; + + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) { + mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n"); + flags &= ~LKM_VALBLK; + } + + if (!lksb->lockid || !lksb->lockid->lockres) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + lock = lksb->lockid; + BUG_ON(!lock); + dlm_lock_get(lock); + + res = lock->lockres; + BUG_ON(!res); + dlm_lockres_get(res); +retry: + call_ast = 0; + /* need to retry up here because owner may have changed */ + mlog(0, "lock=%p res=%p\n", lock, res); + + spin_lock(&res->spinlock); + is_master = (res->owner == dlm->node_num); + if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE) + flags &= ~LKM_VALBLK; + spin_unlock(&res->spinlock); + + if (is_master) { + status = dlmunlock_master(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_master: returned %d, " + "call_ast is %d\n", status, call_ast); + } else { + status = dlmunlock_remote(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_remote: returned %d, " + "call_ast is %d\n", status, call_ast); + } + + if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* We want to go away for a tiny bit to allow recovery + * / migration to complete on this resource. I don't + * know of any wait queue we could sleep on as this + * may be happening on another node. Perhaps the + * proper solution is to queue up requests on the + * other end? */ + + /* do we want to yield(); ?? */ + msleep(50); + + mlog(0, "retrying unlock due to pending recovery/" + "migration/in-progress\n"); + goto retry; + } + + if (call_ast) { + mlog(0, "calling unlockast(%p, %d)\n", data, status); + if (is_master) { + /* it is possible that there is one last bast + * pending. make sure it is flushed, then + * call the unlockast. + * not an issue if this is a mastered remotely, + * since this lock has been removed from the + * lockres queues and cannot be found. 
*/ + dlm_kick_thread(dlm, NULL); + wait_event(dlm->ast_wq, + dlm_lock_basts_flushed(dlm, lock)); + } + (*unlockast)(data, status); + } + + if (status == DLM_CANCELGRANT) + status = DLM_NORMAL; + + if (status == DLM_NORMAL) { + mlog(0, "kicking the thread\n"); + dlm_kick_thread(dlm, res); + } else + dlm_error(status); + + dlm_lockres_calc_usage(dlm, res); + dlm_lockres_put(res); + dlm_lock_put(lock); + + mlog(0, "returning status=%d!\n", status); + return status; +} +EXPORT_SYMBOL_GPL(dlmunlock); + diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c new file mode 100644 index 00000000..dfc0da4d --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include "dlmver.h" + +#define DLM_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION + +void dlm_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h new file mode 100644 index 00000000..f674aee7 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef DLM_VER_H +#define DLM_VER_H + +void dlm_print_version(void); + +#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile new file mode 100644 index 00000000..f14be89a --- /dev/null +++ b/fs/ocfs2/dlmfs/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o + +ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c new file mode 100644 index 00000000..b4207679 --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -0,0 +1,725 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfs.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. This file handles the virtual file system + * used for communication with userspace. Credit should go to ramfs, + * which was a template for the fs side of this module. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* Simple VFS hooks based on: */ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "stackglue.h" +#include "userdlm.h" +#include "dlmfsver.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + + +static const struct super_operations dlmfs_ops; +static const struct file_operations dlmfs_file_operations; +static const struct inode_operations dlmfs_dir_inode_operations; +static const struct inode_operations dlmfs_root_inode_operations; +static const struct inode_operations dlmfs_file_inode_operations; +static struct kmem_cache *dlmfs_inode_cache; + +struct workqueue_struct *user_dlm_worker; + + + +/* + * These are the ABI capabilities of dlmfs. + * + * Over time, dlmfs has added some features that were not part of the + * initial ABI. Unfortunately, some of these features are not detectable + * via standard usage. For example, Linux's default poll always returns + * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs + * added poll support. Instead, we provide this list of new capabilities. + * + * Capabilities is a read-only attribute. We do it as a module parameter + * so we can discover it whether dlmfs is built in, loaded, or even not + * loaded. + * + * The ABI features are local to this machine's dlmfs mount. This is + * distinct from the locking protocol, which is concerned with inter-node + * interaction. + * + * Capabilities: + * - bast : POLLIN against the file descriptor of a held lock + * signifies a bast fired on the lock. 
+ */ +#define DLMFS_CAPABILITIES "bast stackglue" +static int param_set_dlmfs_capabilities(const char *val, + struct kernel_param *kp) +{ + printk(KERN_ERR "%s: readonly parameter\n", kp->name); + return -EINVAL; +} +static int param_get_dlmfs_capabilities(char *buffer, + struct kernel_param *kp) +{ + return strlcpy(buffer, DLMFS_CAPABILITIES, + strlen(DLMFS_CAPABILITIES) + 1); +} +module_param_call(capabilities, param_set_dlmfs_capabilities, + param_get_dlmfs_capabilities, NULL, 0444); +MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); + + +/* + * decodes a set of open flags into a valid lock level and a set of flags. + * returns < 0 if we have invalid flags + * flags which mean something to us: + * O_RDONLY -> PRMODE level + * O_WRONLY -> EXMODE level + * + * O_NONBLOCK -> NOQUEUE + */ +static int dlmfs_decode_open_flags(int open_flags, + int *level, + int *flags) +{ + if (open_flags & (O_WRONLY|O_RDWR)) + *level = DLM_LOCK_EX; + else + *level = DLM_LOCK_PR; + + *flags = 0; + if (open_flags & O_NONBLOCK) + *flags |= DLM_LKF_NOQUEUE; + + return 0; +} + +static int dlmfs_file_open(struct inode *inode, + struct file *file) +{ + int status, level, flags; + struct dlmfs_filp_private *fp = NULL; + struct dlmfs_inode_private *ip; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + file->f_flags); + + status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); + if (status < 0) + goto bail; + + /* We don't want to honor O_APPEND at read/write time as it + * doesn't make sense for LVB writes. */ + file->f_flags &= ~O_APPEND; + + fp = kmalloc(sizeof(*fp), GFP_NOFS); + if (!fp) { + status = -ENOMEM; + goto bail; + } + fp->fp_lock_level = level; + + ip = DLMFS_I(inode); + + status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); + if (status < 0) { + /* this is a strange error to return here but I want + * to be able userspace to be able to distinguish a + * valid lock request from one that simply couldn't be + * granted. */ + if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN) + status = -ETXTBSY; + kfree(fp); + goto bail; + } + + file->private_data = fp; +bail: + return status; +} + +static int dlmfs_file_release(struct inode *inode, + struct file *file) +{ + int level, status; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + struct dlmfs_filp_private *fp = file->private_data; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "close called on inode %lu\n", inode->i_ino); + + status = 0; + if (fp) { + level = fp->fp_lock_level; + if (level != DLM_LOCK_IV) + user_dlm_cluster_unlock(&ip->ip_lockres, level); + + kfree(fp); + file->private_data = NULL; + } + + return 0; +} + +/* + * We do ->setattr() just to override size changes. Our size is the size + * of the LVB and nothing else. 
+ */ +static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + attr->ia_valid &= ~ATTR_SIZE; + error = inode_change_ok(inode, attr); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +{ + int event = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + + poll_wait(file, &ip->ip_lockres.l_event, wait); + + spin_lock(&ip->ip_lockres.l_lock); + if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) + event = POLLIN | POLLRDNORM; + spin_unlock(&ip->ip_lockres.l_lock); + + return event; +} + +static ssize_t dlmfs_file_read(struct file *filp, + char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t readlen, got; + char *lvb_buf; + struct inode *inode = filp->f_path.dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + /* don't read past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + readlen = i_size_read(inode) - *ppos; + else + readlen = count; + + lvb_buf = kmalloc(readlen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + got = user_dlm_read_lvb(inode, lvb_buf, readlen); + if (got) { + BUG_ON(got != readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + } else + readlen = 0; + + kfree(lvb_buf); + + *ppos = *ppos + readlen; + + mlog(0, "read %zd bytes\n", readlen); + return readlen; +} + +static ssize_t dlmfs_file_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t writelen; + char *lvb_buf; + struct inode *inode = filp->f_path.dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return -ENOSPC; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* don't write past the lvb; mirror the read path above */ + if ((count + *ppos) > i_size_read(inode)) + writelen = i_size_read(inode) - *ppos; + else + writelen = count; + + lvb_buf = kmalloc(writelen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + bytes_left = copy_from_user(lvb_buf, buf, writelen); + writelen -= bytes_left; + if (writelen) + user_dlm_write_lvb(inode, lvb_buf, writelen); + + kfree(lvb_buf); + + *ppos = *ppos + writelen; + mlog(0, "wrote %zd bytes\n", writelen); + return writelen; +} + +static void dlmfs_init_once(void *foo) +{ + struct dlmfs_inode_private *ip = + (struct dlmfs_inode_private *) foo; + + ip->ip_conn = NULL; + ip->ip_parent = NULL; + + inode_init_once(&ip->ip_vfs_inode); +} + +static struct inode *dlmfs_alloc_inode(struct super_block *sb) +{ + struct dlmfs_inode_private *ip; + + ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + if (!ip) + return NULL; + + return &ip->ip_vfs_inode; +} + +static void dlmfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); +} + +static void dlmfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dlmfs_i_callback); +} + +static void dlmfs_evict_inode(struct inode *inode) +{ + int status; + struct dlmfs_inode_private *ip; + + end_writeback(inode); + 
mlog(0, "inode %lu\n", inode->i_ino); + + ip = DLMFS_I(inode); + + if (S_ISREG(inode->i_mode)) { + status = user_dlm_destroy_lock(&ip->ip_lockres); + if (status < 0) + mlog_errno(status); + iput(ip->ip_parent); + goto clear_fields; + } + + mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn); + /* we must be a directory. If required, lets unregister the + * dlm context now. */ + if (ip->ip_conn) + user_dlm_unregister(ip->ip_conn); +clear_fields: + ip->ip_parent = NULL; + ip->ip_conn = NULL; +} + +static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static struct inode *dlmfs_get_root_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + int mode = S_IFDIR | 0755; + struct dlmfs_inode_private *ip; + + if (inode) { + ip = DLMFS_I(inode); + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inc_nlink(inode); + + inode->i_fop = &simple_dir_operations; + inode->i_op = &dlmfs_root_inode_operations; + } + + return inode; +} + +static struct inode *dlmfs_get_inode(struct inode *parent, + struct dentry *dentry, + int mode) +{ + struct super_block *sb = parent->i_sb; + struct inode * inode = new_inode(sb); + struct dlmfs_inode_private *ip; + + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ip = DLMFS_I(inode); + ip->ip_conn = DLMFS_I(parent)->ip_conn; + + switch (mode & S_IFMT) { + default: + /* for now we don't support anything other than + * directories and regular files. */ + BUG(); + break; + case S_IFREG: + inode->i_op = &dlmfs_file_inode_operations; + inode->i_fop = &dlmfs_file_operations; + + i_size_write(inode, DLM_LVB_LEN); + + user_dlm_lock_res_init(&ip->ip_lockres, dentry); + + /* released at clear_inode time, this insures that we + * get to drop the dlm reference on each lock *before* + * we call the unregister code for releasing parent + * directories. */ + ip->ip_parent = igrab(parent); + BUG_ON(!ip->ip_parent); + break; + case S_IFDIR: + inode->i_op = &dlmfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == + * 2 (for "." entry) */ + inc_nlink(inode); + break; + } + + if (parent->i_mode & S_ISGID) { + inode->i_gid = parent->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
+ */ +/* SMP-safe */ +static int dlmfs_mkdir(struct inode * dir, + struct dentry * dentry, + int mode) +{ + int status; + struct inode *inode = NULL; + struct qstr *domain = &dentry->d_name; + struct dlmfs_inode_private *ip; + struct ocfs2_cluster_connection *conn; + + mlog(0, "mkdir %.*s\n", domain->len, domain->name); + + /* verify that we have a proper domain */ + if (domain->len >= GROUP_NAME_MAX) { + status = -EINVAL; + mlog(ML_ERROR, "invalid domain name for directory.\n"); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ip = DLMFS_I(inode); + + conn = user_dlm_register(domain); + if (IS_ERR(conn)) { + status = PTR_ERR(conn); + mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", + status, domain->len, domain->name); + goto bail; + } + ip->ip_conn = conn; + + inc_nlink(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + status = 0; +bail: + if (status < 0) + iput(inode); + return status; +} + +static int dlmfs_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int status = 0; + struct inode *inode; + struct qstr *name = &dentry->d_name; + + mlog(0, "create %.*s\n", name->len, name->name); + + /* verify name is valid and doesn't contain any dlm reserved + * characters */ + if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || + name->name[0] == '$') { + status = -EINVAL; + mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, + name->name); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ +bail: + return status; +} + +static int dlmfs_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + struct inode *inode = dentry->d_inode; + + mlog(0, "unlink inode %lu\n", inode->i_ino); + + /* if there are no current holders, or none that are waiting + * to acquire a lock, this basically destroys our lockres. */ + status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); + if (status < 0) { + mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", + dentry->d_name.len, dentry->d_name.name, status); + goto bail; + } + status = simple_unlink(dir, dentry); +bail: + return status; +} + +static int dlmfs_fill_super(struct super_block * sb, + void * data, + int silent) +{ + struct inode * inode; + struct dentry * root; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = DLMFS_MAGIC; + sb->s_op = &dlmfs_ops; + inode = dlmfs_get_root_inode(sb); + if (!inode) + return -ENOMEM; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static const struct file_operations dlmfs_file_operations = { + .open = dlmfs_file_open, + .release = dlmfs_file_release, + .poll = dlmfs_file_poll, + .read = dlmfs_file_read, + .write = dlmfs_file_write, + .llseek = default_llseek, +}; + +static const struct inode_operations dlmfs_dir_inode_operations = { + .create = dlmfs_create, + .lookup = simple_lookup, + .unlink = dlmfs_unlink, +}; + +/* this way we can restrict mkdir to only the toplevel of the fs. 
*/ +static const struct inode_operations dlmfs_root_inode_operations = { + .lookup = simple_lookup, + .mkdir = dlmfs_mkdir, + .rmdir = simple_rmdir, +}; + +static const struct super_operations dlmfs_ops = { + .statfs = simple_statfs, + .alloc_inode = dlmfs_alloc_inode, + .destroy_inode = dlmfs_destroy_inode, + .evict_inode = dlmfs_evict_inode, + .drop_inode = generic_delete_inode, +}; + +static const struct inode_operations dlmfs_file_inode_operations = { + .getattr = simple_getattr, + .setattr = dlmfs_file_setattr, +}; + +static struct dentry *dlmfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, dlmfs_fill_super); +} + +static struct file_system_type dlmfs_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2_dlmfs", + .mount = dlmfs_mount, + .kill_sb = kill_litter_super, +}; + +static int __init init_dlmfs_fs(void) +{ + int status; + int cleanup_inode = 0, cleanup_worker = 0; + + dlmfs_print_version(); + + status = bdi_init(&dlmfs_backing_dev_info); + if (status) + return status; + + dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", + sizeof(struct dlmfs_inode_private), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + dlmfs_init_once); + if (!dlmfs_inode_cache) { + status = -ENOMEM; + goto bail; + } + cleanup_inode = 1; + + user_dlm_worker = create_singlethread_workqueue("user_dlm"); + if (!user_dlm_worker) { + status = -ENOMEM; + goto bail; + } + cleanup_worker = 1; + + user_dlm_set_locking_protocol(); + status = register_filesystem(&dlmfs_fs_type); +bail: + if (status) { + if (cleanup_inode) + kmem_cache_destroy(dlmfs_inode_cache); + if (cleanup_worker) + destroy_workqueue(user_dlm_worker); + bdi_destroy(&dlmfs_backing_dev_info); + } else + printk("OCFS2 User DLM kernel interface loaded\n"); + return status; +} + +static void __exit exit_dlmfs_fs(void) +{ + unregister_filesystem(&dlmfs_fs_type); + + flush_workqueue(user_dlm_worker); + destroy_workqueue(user_dlm_worker); + + kmem_cache_destroy(dlmfs_inode_cache); + + bdi_destroy(&dlmfs_backing_dev_info); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_dlmfs_fs) +module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c new file mode 100644 index 00000000..a733b332 --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfsver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include + +#include "dlmfsver.h" + +#define DLM_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION + +void dlmfs_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h new file mode 100644 index 00000000..f35eadbe --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfsver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef DLMFS_VER_H +#define DLMFS_VER_H + +void dlmfs_print_version(void); + +#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c new file mode 100644 index 00000000..0499e3fb --- /dev/null +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -0,0 +1,688 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. + * + * Many of the functions here are pared down versions of dlmglue.c + * functions. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include + +#include +#include +#include +#include + +#include "ocfs2_lockingver.h" +#include "stackglue.h" +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + + +static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct user_lock_res, l_lksb); +} + +static inline int user_check_wait_flag(struct user_lock_res *lockres, + int flag) +{ + int ret; + + spin_lock(&lockres->l_lock); + ret = lockres->l_flags & flag; + spin_unlock(&lockres->l_lock); + + return ret; +} + +static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BUSY)); +} + +static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); +} + +/* I heart container_of... */ +static inline struct ocfs2_cluster_connection * +cluster_connection_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return ip->ip_conn; +} + +static struct inode * +user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return &ip->ip_vfs_inode; +} + +static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) +{ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); +} + +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error %d while calling %s on " \ + "resource %.*s\n", _stat, _func, \ + _lockres->l_namelen, _lockres->l_name); \ +} while (0) + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int user_highest_compat_lock_level(int level) +{ + int new_level = DLM_LOCK_EX; + + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; + return new_level; +} + +static void user_ast(struct ocfs2_dlm_lksb *lksb) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + int status; + + mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, + lockres->l_requested); + + spin_lock(&lockres->l_lock); + + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + if (status) { + mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", + status, lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + return; + } + + mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, + "Lockres %.*s, requested ivmode. flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* we're downconverting. 
*/ + if (lockres->l_requested < lockres->l_level) { + if (lockres->l_requested <= + user_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = DLM_LOCK_NL; + lockres->l_flags &= ~USER_LOCK_BLOCKED; + } + } + + lockres->l_level = lockres->l_requested; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_flags |= USER_LOCK_ATTACHED; + lockres->l_flags &= ~USER_LOCK_BUSY; + + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + if (!igrab(inode)) + BUG(); +} + +static void user_dlm_unblock_lock(struct work_struct *work); + +static void __user_dlm_queue_lockres(struct user_lock_res *lockres) +{ + if (!(lockres->l_flags & USER_LOCK_QUEUED)) { + user_dlm_grab_inode_ref(lockres); + + INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); + + queue_work(user_dlm_worker, &lockres->l_work); + lockres->l_flags |= USER_LOCK_QUEUED; + } +} + +static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) +{ + int queue = 0; + + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) + return; + + switch (lockres->l_blocking) { + case DLM_LOCK_EX: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + queue = 1; + break; + case DLM_LOCK_PR: + if (!lockres->l_ex_holders) + queue = 1; + break; + default: + BUG(); + } + + if (queue) + __user_dlm_queue_lockres(lockres); +} + +static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + + mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", + lockres->l_namelen, lockres->l_name, level, lockres->l_level); + + spin_lock(&lockres->l_lock); + lockres->l_flags |= USER_LOCK_BLOCKED; + if (level > lockres->l_blocking) + lockres->l_blocking = level; + + __user_dlm_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + + mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + if (status) + mlog(ML_ERROR, "dlm returns status %d\n", status); + + spin_lock(&lockres->l_lock); + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { + lockres->l_level = DLM_LOCK_IV; + } else if (status == DLM_CANCELGRANT) { + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + lockres->l_requested = DLM_LOCK_IV; /* cancel an + * upconvert + * request. */ + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + /* we want the unblock thread to look at it again + * now. */ + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); + } + + lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +/* + * This is the userdlmfs locking protocol version. 
+ * + * See fs/ocfs2/dlmglue.c for more details on locking versions. + */ +static struct ocfs2_locking_protocol user_dlm_lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = user_ast, + .lp_blocking_ast = user_bast, + .lp_unlock_ast = user_unlock_ast, +}; + +static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + iput(inode); +} + +static void user_dlm_unblock_lock(struct work_struct *work) +{ + int new_level, status; + struct user_lock_res *lockres = + container_of(work, struct user_lock_res, l_work); + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ + lockres->l_flags &= ~USER_LOCK_QUEUED; + + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_BUSY) { + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + lockres->l_flags |= USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (status) + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); + goto drop_ref; + } + + /* If there are still incompat holders, we can exit safely + * without worrying about re-queueing this lock as that will + * happen on the last call to user_cluster_unlock. */ + if ((lockres->l_blocking == DLM_LOCK_EX) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock(&lockres->l_lock); + mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders, lockres->l_ro_holders); + goto drop_ref; + } + + if ((lockres->l_blocking == DLM_LOCK_PR) + && lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders); + goto drop_ref; + } + + /* yay, we can downconvert now. */ + new_level = user_highest_compat_lock_level(lockres->l_blocking); + lockres->l_requested = new_level; + lockres->l_flags |= USER_LOCK_BUSY; + mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); + spin_unlock(&lockres->l_lock); + + /* need lock downconvert request now... 
*/ + status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, + DLM_LKF_CONVERT|DLM_LKF_VALBLK, + lockres->l_name, + lockres->l_namelen); + if (status) { + user_log_dlm_error("ocfs2_dlm_lock", status, lockres); + user_recover_from_dlm_error(lockres); + } + +drop_ref: + user_dlm_drop_inode_ref(lockres); +} + +static inline void user_dlm_inc_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case DLM_LOCK_EX: + lockres->l_ex_holders++; + break; + case DLM_LOCK_PR: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int +user_may_continue_on_blocked_lock(struct user_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + + return wanted <= user_highest_compat_lock_level(lockres->l_blocking); +} + +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags) +{ + int status, local_flags; + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + status = -EINVAL; + goto bail; + } + + mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", + lockres->l_namelen, lockres->l_name, level, lkm_flags); + +again: + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + spin_lock(&lockres->l_lock); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if ((lockres->l_flags & USER_LOCK_BUSY) && + (level > lockres->l_level)) { + /* is someone sitting in dlm_lock? If so, wait on + * them. 
*/ + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + goto again; + } + + if ((lockres->l_flags & USER_LOCK_BLOCKED) && + (!user_may_continue_on_blocked_lock(lockres, level))) { + /* is the lock is currently blocked on behalf of + * another node */ + spin_unlock(&lockres->l_lock); + + user_wait_on_blocked_lock(lockres); + goto again; + } + + if (level > lockres->l_level) { + local_flags = lkm_flags | DLM_LKF_VALBLK; + if (lockres->l_level != DLM_LOCK_IV) + local_flags |= DLM_LKF_CONVERT; + + lockres->l_requested = level; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); + + /* call dlm_lock to upgrade lock now */ + status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, + local_flags, lockres->l_name, + lockres->l_namelen); + if (status) { + if ((lkm_flags & DLM_LKF_NOQUEUE) && + (status != -EAGAIN)) + user_log_dlm_error("ocfs2_dlm_lock", + status, lockres); + user_recover_from_dlm_error(lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + goto again; + } + + user_dlm_inc_holders(lockres, level); + spin_unlock(&lockres->l_lock); + + status = 0; +bail: + return status; +} + +static inline void user_dlm_dec_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case DLM_LOCK_EX: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case DLM_LOCK_PR: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level) +{ + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + return; + } + + spin_lock(&lockres->l_lock); + user_dlm_dec_holders(lockres, level); + __user_dlm_cond_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); +} + +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < DLM_LOCK_EX); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(lvb, val, len); + + spin_unlock(&lockres->l_lock); +} + +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb; + ssize_t ret = len; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < DLM_LOCK_PR); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(val, lvb, len); + } else + ret = 0; + + spin_unlock(&lockres->l_lock); + return ret; +} + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry) +{ + memset(lockres, 0, sizeof(*lockres)); + + spin_lock_init(&lockres->l_lock); + init_waitqueue_head(&lockres->l_event); + lockres->l_level = DLM_LOCK_IV; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_blocking = DLM_LOCK_IV; + + /* should have been checked before getting here. 
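The helpers above (user_dlm_cluster_lock(), user_dlm_write_lvb(), user_dlm_read_lvb() and user_dlm_cluster_unlock()) are the whole per-lock API that dlmfs builds on. A minimal sketch of how a caller is expected to combine them, using the lock resource embedded in the dlmfs inode; the helper name is hypothetical and error handling is abbreviated:

	static int example_update_lvb(struct inode *inode,
				      const char *val, unsigned int len)
	{
		struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
		int status;

		/* Take an exclusive cluster lock; this may sleep while a
		 * busy or blocked lock is resolved. */
		status = user_dlm_cluster_lock(lockres, DLM_LOCK_EX, 0);
		if (status)
			return status;

		/* The LVB may only be written while holding EX. */
		user_dlm_write_lvb(inode, val, len);

		/* Dropping the last holder lets the unblock worker
		 * downconvert on behalf of any blocked node. */
		user_dlm_cluster_unlock(lockres, DLM_LOCK_EX);
		return 0;
	}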
*/ + BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); + + memcpy(lockres->l_name, + dentry->d_name.name, + dentry->d_name.len); + lockres->l_namelen = dentry->d_name.len; +} + +int user_dlm_destroy_lock(struct user_lock_res *lockres) +{ + int status = -EBUSY; + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + + while (lockres->l_flags & USER_LOCK_BUSY) { + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + + spin_lock(&lockres->l_lock); + } + + if (lockres->l_ro_holders || lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + status = 0; + if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + lockres->l_flags &= ~USER_LOCK_ATTACHED; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); + if (status) { + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + + status = 0; +bail: + return status; +} + +static void user_dlm_recovery_handler_noop(int node_num, + void *recovery_data) +{ + /* We ignore recovery events */ + return; +} + +void user_dlm_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); +} + +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) +{ + int rc; + struct ocfs2_cluster_connection *conn; + + rc = ocfs2_cluster_connect_agnostic(name->name, name->len, + &user_dlm_lproto, + user_dlm_recovery_handler_noop, + NULL, &conn); + if (rc) + mlog_errno(rc); + + return rc ? ERR_PTR(rc) : conn; +} + +void user_dlm_unregister(struct ocfs2_cluster_connection *conn) +{ + ocfs2_cluster_disconnect(conn, 0); +} diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h new file mode 100644 index 00000000..3b42d795 --- /dev/null +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -0,0 +1,113 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.h + * + * Userspace dlm defines + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef USERDLM_H +#define USERDLM_H + +#include +#include +#include +#include + +/* user_lock_res->l_flags flags. 
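Together with user_dlm_lock_res_init() above, the registration helpers give dlmfs its setup and teardown sequence. A rough sketch of the expected lifecycle, assuming the caller owns the qstr and dentry (the function name is illustrative only):

	static int example_domain_lifecycle(struct qstr *domain,
					    struct dentry *dentry,
					    struct user_lock_res *lockres)
	{
		struct ocfs2_cluster_connection *conn;
		int status;

		/* Join the fs-agnostic cluster domain named by the qstr. */
		conn = user_dlm_register(domain);
		if (IS_ERR(conn))
			return PTR_ERR(conn);

		/* Name the lock after the dentry; the name must fit in
		 * USER_DLM_LOCK_ID_MAX_LEN. */
		user_dlm_lock_res_init(lockres, dentry);

		/* ... user_dlm_cluster_lock()/unlock() traffic ... */

		/* Returns -EBUSY while holders remain, 0 once torn down. */
		status = user_dlm_destroy_lock(lockres);

		user_dlm_unregister(conn);
		return status;
	}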
*/ +#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define USER_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently + * destroying this + * lock. */ +#define USER_LOCK_QUEUED (0x00000010) /* lock is on the + * workqueue */ +#define USER_LOCK_IN_CANCEL (0x00000020) + +struct user_lock_res { + spinlock_t l_lock; + + int l_flags; + +#define USER_DLM_LOCK_ID_MAX_LEN 32 + char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_namelen; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct ocfs2_dlm_lksb l_lksb; + + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct work_struct l_work; +}; + +extern struct workqueue_struct *user_dlm_worker; + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry); +int user_dlm_destroy_lock(struct user_lock_res *lockres); +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags); +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level); +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len); +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name); +void user_dlm_unregister(struct ocfs2_cluster_connection *conn); +void user_dlm_set_locking_protocol(void); + +struct dlmfs_inode_private { + struct ocfs2_cluster_connection *ip_conn; + + struct user_lock_res ip_lockres; /* unused for directories. */ + struct inode *ip_parent; + + struct inode ip_vfs_inode; +}; + +static inline struct dlmfs_inode_private * +DLMFS_I(struct inode *inode) +{ + return container_of(inode, + struct dlmfs_inode_private, + ip_vfs_inode); +} + +struct dlmfs_filp_private { + int fp_lock_level; +}; + +#define DLMFS_MAGIC 0x76a9f425 + +#endif /* USERDLM_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 00000000..7642d7ca --- /dev/null +++ b/fs/ocfs2/dlmglue.c @@ -0,0 +1,4033 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.c + * + * Code which implements an OCFS2 specific interface to our DLM. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_DLM_GLUE +#include + +#include "ocfs2.h" +#include "ocfs2_lockingver.h" + +#include "alloc.h" +#include "dcache.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "stackglue.h" +#include "slot_map.h" +#include "super.h" +#include "uptodate.h" +#include "quota.h" +#include "refcounttree.h" + +#include "buffer_head_io.h" + +struct ocfs2_mask_waiter { + struct list_head mw_item; + int mw_status; + struct completion mw_complete; + unsigned long mw_mask; + unsigned long mw_goal; +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t mw_lock_start; +#endif +}; + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres); + +/* + * Return value from ->downconvert_worker functions. + * + * These control the precise actions of ocfs2_unblock_lock() + * and ocfs2_process_blocked_lock() + * + */ +enum ocfs2_unblock_action { + UNBLOCK_CONTINUE = 0, /* Continue downconvert */ + UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire + * ->post_unlock callback */ + UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire + * ->post_unlock() callback. */ +}; + +struct ocfs2_unblock_ctl { + int requeue; + enum ocfs2_unblock_action unblock_action; +}; + +/* Lockdep class keys */ +struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) + +/* This aids in debugging situations where a bad LVB might be involved. 
*/ +static void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + mlog(level, "LVB information for %s (called from %s:%u):\n", + lockres->l_name, function, line); + mlog(level, "version: %u, clusters: %u, generation: 0x%x\n", + lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters), + be32_to_cpu(lvb->lvb_igeneration)); + mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n", + (unsigned long long)be64_to_cpu(lvb->lvb_isize), + be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid), + be16_to_cpu(lvb->lvb_imode)); + mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, " + "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink), + (long long)be64_to_cpu(lvb->lvb_iatime_packed), + (long long)be64_to_cpu(lvb->lvb_ictime_packed), + (long long)be64_to_cpu(lvb->lvb_imtime_packed), + be32_to_cpu(lvb->lvb_iattr)); +} + + +/* + * OCFS2 Lock Resource Operations + * + * These fine tune the behavior of the generic dlmglue locking infrastructure. + * + * The most basic of lock types can point ->l_priv to their respective + * struct ocfs2_super and allow the default actions to manage things. + * + * Right now, each lock type also needs to implement an init function, + * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres() + * should be called when the lock is no longer needed (i.e., object + * destruction time). + */ +struct ocfs2_lock_res_ops { + /* + * Translate an ocfs2_lock_res * into an ocfs2_super *. Define + * this callback if ->l_priv is not an ocfs2_super pointer + */ + struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); + + /* + * Optionally called in the downconvert thread after a + * successful downconvert. The lockres will not be referenced + * after this callback is called, so it is safe to free + * memory, etc. + * + * The exact semantics of when this is called are controlled + * by ->downconvert_worker() + */ + void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *); + + /* + * Allow a lock type to add checks to determine whether it is + * safe to downconvert a lock. Return 0 to re-queue the + * downconvert at a later time, nonzero to continue. + * + * For most locks, the default checks that there are no + * incompatible holders are sufficient. + * + * Called with the lockres spinlock held. + */ + int (*check_downconvert)(struct ocfs2_lock_res *, int); + + /* + * Allows a lock type to populate the lock value block. This + * is called on downconvert, and when we drop a lock. + * + * Locks that want to use this should set LOCK_TYPE_USES_LVB + * in the flags field. + * + * Called with the lockres spinlock held. + */ + void (*set_lvb)(struct ocfs2_lock_res *); + + /* + * Called from the downconvert thread when it is determined + * that a lock will be downconverted. This is called without + * any locks held so the function can do work that might + * schedule (syncing out data, etc). + * + * This should return any one of the ocfs2_unblock_action + * values, depending on what it wants the thread to do. + */ + int (*downconvert_worker)(struct ocfs2_lock_res *, int); + + /* + * LOCK_TYPE_* flags which describe the specific requirements + * of a lock type. Descriptions of each individual flag follow. + */ + int flags; +}; + +/* + * Some locks want to "refresh" potentially stale data when a + * meaningful (PRMODE or EXMODE) lock level is first obtained. 
If this + * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the + * individual lockres l_flags member from the ast function. It is + * expected that the locking wrapper will clear the + * OCFS2_LOCK_NEEDS_REFRESH flag when done. + */ +#define LOCK_TYPE_REQUIRES_REFRESH 0x1 + +/* + * Indicate that a lock type makes use of the lock value block. The + * ->set_lvb lock type callback must be defined. + */ +#define LOCK_TYPE_USES_LVB 0x2 + +static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { + .get_osb = ocfs2_get_inode_osb, + .check_downconvert = ocfs2_check_meta_downconvert, + .set_lvb = ocfs2_set_meta_lvb, + .downconvert_worker = ocfs2_data_convert_worker, + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_super_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH, +}; + +static struct ocfs2_lock_res_ops ocfs2_rename_lops = { + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { + .get_osb = ocfs2_get_dentry_osb, + .post_unlock = ocfs2_dentry_post_unlock, + .downconvert_worker = ocfs2_dentry_convert_worker, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_flock_lops = { + .get_osb = ocfs2_get_file_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { + .set_lvb = ocfs2_set_qinfo_lvb, + .get_osb = ocfs2_get_qinfo_osb, + .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { + .check_downconvert = ocfs2_check_refcount_downconvert, + .downconvert_worker = ocfs2_refcount_convert_worker, + .flags = 0, +}; + +static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_META || + lockres->l_type == OCFS2_LOCK_TYPE_RW || + lockres->l_type == OCFS2_LOCK_TYPE_OPEN; +} + +static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct ocfs2_lock_res, l_lksb); +} + +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + return (struct inode *) lockres->l_priv; +} + +static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY); + + return (struct ocfs2_dentry_lock *)lockres->l_priv; +} + +static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); + + return (struct ocfs2_mem_dqinfo *)lockres->l_priv; +} + +static inline struct ocfs2_refcount_tree * +ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res) +{ + return container_of(res, struct ocfs2_refcount_tree, rf_lockres); +} + +static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) +{ + if (lockres->l_ops->get_osb) + return lockres->l_ops->get_osb(lockres); + + return (struct ocfs2_super *)lockres->l_priv; +} + +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 dlm_flags); +static inline 
int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted); +static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, unsigned long caller_ip); +static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert); +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ + else \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \ + _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \ + (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \ +} while (0) +static int ocfs2_downconvert_thread(void *arg); +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_inode_lock_update(struct inode *inode, + struct buffer_head **bh); +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); +static inline int ocfs2_highest_compat_lock_level(int level); +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb, + unsigned int generation); +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + + +static void ocfs2_build_lock_name(enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + char *name) +{ + int len; + + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); + + len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", + ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, + (long long)blkno, generation); + + BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); + + mlog(0, "built lock resource with name: %s\n", name); +} + +static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); + +static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, + struct ocfs2_dlm_debug *dlm_debug) +{ + mlog(0, "Add tracking for lockres %s\n", res->l_name); + + spin_lock(&ocfs2_dlm_tracking_lock); + list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) +{ + spin_lock(&ocfs2_dlm_tracking_lock); + if (!list_empty(&res->l_debug_list)) + list_del_init(&res->l_debug_list); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +#ifdef CONFIG_OCFS2_FS_STATS +static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ + res->l_lock_refresh = 0; + memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); + memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); +} + +static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int 
level, + struct ocfs2_mask_waiter *mw, int ret) +{ + u32 usec; + ktime_t kt; + struct ocfs2_lock_stats *stats; + + if (level == LKM_PRMODE) + stats = &res->l_lock_prmode; + else if (level == LKM_EXMODE) + stats = &res->l_lock_exmode; + else + return; + + kt = ktime_sub(ktime_get(), mw->mw_lock_start); + usec = ktime_to_us(kt); + + stats->ls_gets++; + stats->ls_total += ktime_to_ns(kt); + /* overflow */ + if (unlikely(stats->ls_gets) == 0) { + stats->ls_gets++; + stats->ls_total = ktime_to_ns(kt); + } + + if (stats->ls_max < usec) + stats->ls_max = usec; + + if (ret) + stats->ls_fail++; +} + +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ + lockres->l_lock_refresh++; +} + +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ + mw->mw_lock_start = ktime_get(); +} +#else +static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ +} +static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, + int level, struct ocfs2_mask_waiter *mw, int ret) +{ +} +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ +} +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ +} +#endif + +static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, + struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct ocfs2_lock_res_ops *ops, + void *priv) +{ + res->l_type = type; + res->l_ops = ops; + res->l_priv = priv; + + res->l_level = DLM_LOCK_IV; + res->l_requested = DLM_LOCK_IV; + res->l_blocking = DLM_LOCK_IV; + res->l_action = OCFS2_AST_INVALID; + res->l_unlock_action = OCFS2_UNLOCK_INVALID; + + res->l_flags = OCFS2_LOCK_INITIALIZED; + + ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); + + ocfs2_init_lock_stats(res); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (type != OCFS2_LOCK_TYPE_OPEN) + lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type], + &lockdep_keys[type], 0); + else + res->l_lockdep_map.key = NULL; +#endif +} + +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) +{ + /* This also clears out the lock status block */ + memset(res, 0, sizeof(struct ocfs2_lock_res)); + spin_lock_init(&res->l_lock); + init_waitqueue_head(&res->l_event); + INIT_LIST_HEAD(&res->l_blocked_list); + INIT_LIST_HEAD(&res->l_mask_waiters); +} + +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + unsigned int generation, + struct inode *inode) +{ + struct ocfs2_lock_res_ops *ops; + + switch(type) { + case OCFS2_LOCK_TYPE_RW: + ops = &ocfs2_inode_rw_lops; + break; + case OCFS2_LOCK_TYPE_META: + ops = &ocfs2_inode_inode_lops; + break; + case OCFS2_LOCK_TYPE_OPEN: + ops = &ocfs2_inode_open_lops; + break; + default: + mlog_bug_on_msg(1, "type: %d\n", type); + ops = NULL; /* thanks, gcc */ + break; + }; + + ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, + generation, res->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); +} + +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return OCFS2_SB(inode->i_sb); +} + +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mem_dqinfo *info = lockres->l_priv; + + return OCFS2_SB(info->dqi_gi.dqi_sb); +} + +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_file_private *fp = lockres->l_priv; + + return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb); +} + +static __u64 
ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) +{ + __be64 inode_blkno_be; + + memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + + return be64_to_cpu(inode_blkno_be); +} + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = lockres->l_priv; + + return OCFS2_SB(dl->dl_inode->i_sb); +} + +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode) +{ + int len; + u64 inode_blkno = OCFS2_I(inode)->ip_blkno; + __be64 inode_blkno_be = cpu_to_be64(inode_blkno); + struct ocfs2_lock_res *lockres = &dl->dl_lockres; + + ocfs2_lock_res_init_once(lockres); + + /* + * Unfortunately, the standard lock naming scheme won't work + * here because we have two 16 byte values to use. Instead, + * we'll stuff the inode number as a binary value. We still + * want error prints to show something without garbling the + * display, so drop a null byte in there before the inode + * number. A future version of OCFS2 will likely use all + * binary lock names. The stringified names have been a + * tremendous aid in debugging, but now that the debugfs + * interface exists, we can mangle things there if need be. + * + * NOTE: We also drop the standard "pad" value (the total lock + * name size stays the same though - the last part is all + * zeros due to the memset in ocfs2_lock_res_init_once() + */ + len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START, + "%c%016llx", + ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY), + (long long)parent); + + BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1)); + + memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be, + sizeof(__be64)); + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops, + dl); +} + +static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Superblock lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO, + 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, + &ocfs2_super_lops, osb); +} + +static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Rename lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, + &ocfs2_rename_lops, osb); +} + +static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* nfs_sync lockres doesn't come from a slab so we call init + * once on it manually. 
*/ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC, + &ocfs2_nfs_sync_lops, osb); +} + +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + &ocfs2_orphan_scan_lops, osb); +} + +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp) +{ + struct inode *inode = fp->fp_file->f_mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno, + inode->i_generation, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops, + fp); + lockres->l_flags |= OCFS2_LOCK_NOCACHE; +} + +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, + 0, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres, + OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops, + info); +} + +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno, + generation, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT, + &ocfs2_refcount_block_lops, osb); +} + +void ocfs2_lock_res_free(struct ocfs2_lock_res *res) +{ + if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) + return; + + ocfs2_remove_lockres_tracking(res); + + mlog_bug_on_msg(!list_empty(&res->l_blocked_list), + "Lockres %s is on the blocked list\n", + res->l_name); + mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), + "Lockres %s has mask waiters pending\n", + res->l_name); + mlog_bug_on_msg(spin_is_locked(&res->l_lock), + "Lockres %s is locked\n", + res->l_name); + mlog_bug_on_msg(res->l_ro_holders, + "Lockres %s has %u ro holders\n", + res->l_name, res->l_ro_holders); + mlog_bug_on_msg(res->l_ex_holders, + "Lockres %s has %u ex holders\n", + res->l_name, res->l_ex_holders); + + /* Need to clear out the lock status block for the dlm */ + memset(&res->l_lksb, 0, sizeof(res->l_lksb)); + + res->l_flags = 0UL; +} + +static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, + int level) +{ + BUG_ON(!lockres); + + switch(level) { + case DLM_LOCK_EX: + lockres->l_ex_holders++; + break; + case DLM_LOCK_PR: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, + int level) +{ + BUG_ON(!lockres); + + switch(level) { + case DLM_LOCK_EX: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case DLM_LOCK_PR: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. 
*/ +static inline int ocfs2_highest_compat_lock_level(int level) +{ + int new_level = DLM_LOCK_EX; + + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; + return new_level; +} + +static void lockres_set_flags(struct ocfs2_lock_res *lockres, + unsigned long newflags) +{ + struct ocfs2_mask_waiter *mw, *tmp; + + assert_spin_locked(&lockres->l_lock); + + lockres->l_flags = newflags; + + list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + continue; + + list_del_init(&mw->mw_item); + mw->mw_status = 0; + complete(&mw->mw_complete); + } +} +static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) +{ + lockres_set_flags(lockres, lockres->l_flags | or); +} +static void lockres_clear_flags(struct ocfs2_lock_res *lockres, + unsigned long clear) +{ + lockres_set_flags(lockres, lockres->l_flags & ~clear); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); + + lockres->l_level = lockres->l_requested; + if (lockres->l_level <= + ocfs2_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + } + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + + /* Convert from RO to EX doesn't really need anything as our + * information is already up to data. Convert from NL to + * *anything* however should mark ourselves as needing an + * update */ + if (lockres->l_level == DLM_LOCK_NL && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + + /* + * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing + * the OCFS2_LOCK_BUSY flag to prevent the dc thread from + * downconverting the lock before the upconvert has fully completed. + */ + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + + if (lockres->l_requested > DLM_LOCK_NL && + !(lockres->l_flags & OCFS2_LOCK_LOCAL) && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert = 0; + + assert_spin_locked(&lockres->l_lock); + + if (level > lockres->l_blocking) { + /* only schedule a downconvert if we haven't already scheduled + * one that goes low enough to satisfy the level we're + * blocking. 
this also catches the case where we get + * duplicate BASTs */ + if (ocfs2_highest_compat_lock_level(level) < + ocfs2_highest_compat_lock_level(lockres->l_blocking)) + needs_downconvert = 1; + + lockres->l_blocking = level; + } + + mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n", + lockres->l_name, level, lockres->l_level, lockres->l_blocking, + needs_downconvert); + + if (needs_downconvert) + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + mlog(0, "needs_downconvert = %d\n", needs_downconvert); + return needs_downconvert; +} + +/* + * OCFS2_LOCK_PENDING and l_pending_gen. + * + * Why does OCFS2_LOCK_PENDING exist? To close a race between setting + * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() + * for more details on the race. + * + * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces + * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() + * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear + * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, + * the caller is going to try to clear PENDING again. If nothing else is + * happening, __lockres_clear_pending() sees PENDING is unset and does + * nothing. + * + * But what if another path (eg downconvert thread) has just started a + * new locking action? The other path has re-set PENDING. Our path + * cannot clear PENDING, because that will re-open the original race + * window. + * + * [Example] + * + * ocfs2_meta_lock() + * ocfs2_cluster_lock() + * set BUSY + * set PENDING + * drop l_lock + * ocfs2_dlm_lock() + * ocfs2_locking_ast() ocfs2_downconvert_thread() + * clear PENDING ocfs2_unblock_lock() + * take_l_lock + * !BUSY + * ocfs2_prepare_downconvert() + * set BUSY + * set PENDING + * drop l_lock + * take l_lock + * clear PENDING + * drop l_lock + * + * ocfs2_dlm_lock() + * + * So as you can see, we now have a window where l_lock is not held, + * PENDING is not set, and ocfs2_dlm_lock() has not been called. + * + * The core problem is that ocfs2_cluster_lock() has cleared the PENDING + * set by ocfs2_prepare_downconvert(). That wasn't nice. + * + * To solve this we introduce l_pending_gen. A call to + * lockres_clear_pending() will only do so when it is passed a generation + * number that matches the lockres. lockres_set_pending() will return the + * current generation number. When ocfs2_cluster_lock() goes to clear + * PENDING, it passes the generation it got from set_pending(). In our + * example above, the generation numbers will *not* match. Thus, + * ocfs2_cluster_lock() will not clear the PENDING set by + * ocfs2_prepare_downconvert(). + */ + +/* Unlocked version for ocfs2_locking_ast() */ +static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + assert_spin_locked(&lockres->l_lock); + + /* + * The ast and locking functions can race us here. The winner + * will clear pending, the loser will not. + */ + if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || + (lockres->l_pending_gen != generation)) + return; + + lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); + lockres->l_pending_gen++; + + /* + * The downconvert thread may have skipped us because we + * were PENDING. Wake it up. 
+ */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(osb); +} + +/* Locked version for callers of ocfs2_dlm_lock() */ +static void lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + __lockres_clear_pending(lockres, generation, osb); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + + lockres_or_flags(lockres, OCFS2_LOCK_PENDING); + + return lockres->l_pending_gen; +} + +static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + int needs_downconvert; + unsigned long flags; + + BUG_ON(level <= DLM_LOCK_NL); + + mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, " + "type %s\n", lockres->l_name, level, lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + + /* + * We can skip the bast for locks which don't enable caching - + * they'll be dropped at the earliest possible time anyway. + */ + if (lockres->l_flags & OCFS2_LOCK_NOCACHE) + return; + + spin_lock_irqsave(&lockres->l_lock, flags); + needs_downconvert = ocfs2_generic_handle_bast(lockres, level); + if (needs_downconvert) + ocfs2_schedule_blocked_lock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + ocfs2_wake_downconvert_thread(osb); +} + +static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + int status; + + spin_lock_irqsave(&lockres->l_lock, flags); + + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + + if (status == -EAGAIN) { + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + goto out; + } + + if (status) { + mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", + lockres->l_name, status); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, " + "level %d => %d\n", lockres->l_name, lockres->l_action, + lockres->l_unlock_action, lockres->l_level, lockres->l_requested); + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, " + "flags 0x%lx, unlock: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); + BUG(); + } +out: + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + + /* Did we try to cancel this lock? Clear that state */ + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + + /* + * We may have beaten the locking functions here. We certainly + * know that dlm_lock() has been called :-) + * Because we can't have two lock calls in flight at once, we + * can use lockres->l_pending_gen. 
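The set/clear bracket described above looks the same at every ocfs2_dlm_lock() call site; ocfs2_lock_create() and __ocfs2_cluster_lock() further down follow it exactly. A condensed sketch of the pattern, with the l_action/l_requested bookkeeping omitted and a hypothetical function name:

	static int example_locking_action(struct ocfs2_super *osb,
					  struct ocfs2_lock_res *lockres,
					  int level, u32 dlm_flags)
	{
		unsigned long flags;
		unsigned int gen;
		int ret;

		/* Under l_lock: mark the lock busy and record the
		 * generation of this locking action. */
		spin_lock_irqsave(&lockres->l_lock, flags);
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		gen = lockres_set_pending(lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb,
				     dlm_flags, lockres->l_name,
				     OCFS2_LOCK_ID_MAX_LEN - 1);

		/* Only clears PENDING if no newer locking action (e.g. one
		 * started by the downconvert thread) has bumped
		 * l_pending_gen since we set it above. */
		lockres_clear_pending(lockres, gen, osb);
		return ret;
	}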
+ */ + __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); + + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + unsigned long flags; + + mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", + lockres->l_name, lockres->l_unlock_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + if (error) { + mlog(ML_ERROR, "Dlm passes error %d for lock %s, " + "unlock_action %d\n", error, lockres->l_name, + lockres->l_unlock_action); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_unlock_action) { + case OCFS2_UNLOCK_CANCEL_CONVERT: + mlog(0, "Cancel convert success for %s\n", lockres->l_name); + lockres->l_action = OCFS2_AST_INVALID; + /* Downconvert thread may have requeued this lock, we + * need to wake it. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); + break; + case OCFS2_UNLOCK_DROP_LOCK: + lockres->l_level = DLM_LOCK_IV; + break; + default: + BUG(); + } + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +/* + * This is the filesystem locking protocol. It provides the lock handling + * hooks for the underlying DLM. It has a maximum version number. + * The version number allows interoperability with systems running at + * the same major number and an equal or smaller minor number. + * + * Whenever the filesystem does new things with locks (adds or removes a + * lock, orders them differently, does different things underneath a lock), + * the version must be changed. The protocol is negotiated when joining + * the dlm domain. A node may join the domain if its major version is + * identical to all other nodes and its minor version is greater than + * or equal to all other nodes. When its minor version is greater than + * the other nodes, it will run at the minor version specified by the + * other nodes. + * + * If a locking change is made that will not be compatible with older + * versions, the major number must be increased and the minor version set + * to zero. If a change merely adds a behavior that can be disabled when + * speaking to older versions, the minor version must be increased. If a + * change adds a fully backwards compatible change (eg, LVB changes that + * are just ignored by older versions), the version does not need to be + * updated. 
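To make the negotiation rule above concrete, it boils down to a check along these lines; the helper name is hypothetical, and the version pair type is assumed to be the one stackglue uses for lp_max_version:

	static int example_can_join(struct ocfs2_protocol_version *domain,
				    struct ocfs2_protocol_version *ours)
	{
		/* Major versions must match exactly. */
		if (domain->pv_major != ours->pv_major)
			return 0;

		/* We may join if our maximum minor is at least the domain's;
		 * when ours is higher, we run at the domain's minor. */
		return ours->pv_minor >= domain->pv_minor;
	}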
+ */ +static struct ocfs2_locking_protocol lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = ocfs2_locking_ast, + .lp_blocking_ast = ocfs2_blocking_ast, + .lp_unlock_ast = ocfs2_unlock_ast, +}; + +void ocfs2_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version); +} + +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + if (convert) + lockres->l_action = OCFS2_AST_INVALID; + else + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +/* Note: If we detect another process working on the lock (i.e., + * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller + * to do the right thing in that case. + */ +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 dlm_flags) +{ + int ret = 0; + unsigned long flags; + unsigned int gen; + + mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, + dlm_flags); + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || + (lockres->l_flags & OCFS2_LOCK_BUSY)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + lockres->l_action = OCFS2_AST_ATTACH; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 1); + } + + mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); + +bail: + return ret; +} + +static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, + int flag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = lockres->l_flags & flag; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; +} + +static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); +} + +static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. 
*/ +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); +} + +static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) +{ + INIT_LIST_HEAD(&mw->mw_item); + init_completion(&mw->mw_complete); + ocfs2_init_start_time(mw); +} + +static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) +{ + wait_for_completion(&mw->mw_complete); + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return mw->mw_status; +} + +static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw, + unsigned long mask, + unsigned long goal) +{ + BUG_ON(!list_empty(&mw->mw_item)); + + assert_spin_locked(&lockres->l_lock); + + list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); + mw->mw_mask = mask; + mw->mw_goal = goal; +} + +/* returns 0 if the mw that was removed was already satisfied, -EBUSY + * if the mask still hadn't reached its goal */ +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!list_empty(&mw->mw_item)) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + ret = -EBUSY; + + list_del_init(&mw->mw_item); + init_completion(&mw->mw_complete); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; + +} + +static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = wait_for_completion_interruptible(&mw->mw_complete); + if (ret) + lockres_remove_mask_waiter(lockres, mw); + else + ret = mw->mw_status; + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return ret; +} + +static int __ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 lkm_flags, + int arg_flags, + int l_subclass, + unsigned long caller_ip) +{ + struct ocfs2_mask_waiter mw; + int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); + int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ + unsigned long flags; + unsigned int gen; + int noqueue_attempted = 0; + + ocfs2_init_mask_waiter(&mw); + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= DLM_LKF_VALBLK; + +again: + wait = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + + if (catch_signals && signal_pending(current)) { + ret = -ERESTARTSYS; + goto unlock; + } + + mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, + "Cluster lock called on freeing lockres %s! flags " + "0x%lx\n", lockres->l_name, lockres->l_flags); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if (lockres->l_flags & OCFS2_LOCK_BUSY && + level > lockres->l_level) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + wait = 1; + goto unlock; + } + + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) { + /* + * We've upconverted. If the lock now has a level we can + * work with, we take it. If, however, the lock is not at the + * required level, we go thru the full cycle. 
One way this could + * happen is if a process requesting an upconvert to PR is + * closely followed by another requesting upconvert to an EX. + * If the process requesting EX lands here, we want it to + * continue attempting to upconvert and let the process + * requesting PR take the lock. + * If multiple processes request upconvert to PR, the first one + * here will take the lock. The others will have to go thru the + * OCFS2_LOCK_BLOCKED check to ensure that there is no pending + * downconvert request. + */ + if (level <= lockres->l_level) + goto update_holders; + } + + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && + !ocfs2_may_continue_on_blocked_lock(lockres, level)) { + /* is the lock is currently blocked on behalf of + * another node */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); + wait = 1; + goto unlock; + } + + if (level > lockres->l_level) { + if (noqueue_attempted > 0) { + ret = -EAGAIN; + goto unlock; + } + if (lkm_flags & DLM_LKF_NOQUEUE) + noqueue_attempted = 1; + + if (lockres->l_action != OCFS2_AST_INVALID) + mlog(ML_ERROR, "lockres %s has action %u pending\n", + lockres->l_name, lockres->l_action); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres->l_action = OCFS2_AST_ATTACH; + lkm_flags &= ~DLM_LKF_CONVERT; + } else { + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; + } + + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); + + mlog(ML_BASTS, "lockres %s, convert from %d to %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + lkm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + if (!(lkm_flags & DLM_LKF_NOQUEUE) || + (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", + ret, lockres); + } + ocfs2_recover_from_dlm_error(lockres, 1); + goto out; + } + + mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", + lockres->l_name); + + /* At this point we've gone inside the dlm and need to + * complete our work regardless. */ + catch_signals = 0; + + /* wait for busy to clear and carry on */ + goto again; + } + +update_holders: + /* Ok, if we get here then we're good to go. */ + ocfs2_inc_holders(lockres, level); + + ret = 0; +unlock: + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + + spin_unlock_irqrestore(&lockres->l_lock, flags); +out: + /* + * This is helping work around a lock inversion between the page lock + * and dlm locks. One path holds the page lock while calling aops + * which block acquiring dlm locks. The voting thread holds dlm + * locks while acquiring page locks while down converting data locks. + * This block is helping an aop path notice the inversion and back + * off to unlock its page lock before trying the dlm lock again. 
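+ *
+ * (Cross-reference: ocfs2_inode_lock_with_page(), later in this
+ * file, is the aop-side half of this dance -- it turns the -EAGAIN
+ * generated here into AOP_TRUNCATED_PAGE after unlocking the page.)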
+ */ + if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && + mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { + wait = 0; + if (lockres_remove_mask_waiter(lockres, &mw)) + ret = -EAGAIN; + else + goto again; + } + if (wait) { + ret = ocfs2_wait_for_mask(&mw); + if (ret == 0) + goto again; + mlog_errno(ret); + } + ocfs2_update_lock_stats(lockres, level, &mw, ret); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!ret && lockres->l_lockdep_map.key != NULL) { + if (level == DLM_LOCK_PR) + rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass, + !!(arg_flags & OCFS2_META_LOCK_NOQUEUE), + caller_ip); + else + rwsem_acquire(&lockres->l_lockdep_map, l_subclass, + !!(arg_flags & OCFS2_META_LOCK_NOQUEUE), + caller_ip); + } +#endif + return ret; +} + +static inline int ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 lkm_flags, + int arg_flags) +{ + return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags, + 0, _RET_IP_); +} + + +static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + unsigned long caller_ip) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + ocfs2_dec_holders(lockres, level); + ocfs2_downconvert_on_unlock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (lockres->l_lockdep_map.key != NULL) + rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); +#endif +} + +static int ocfs2_create_new_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int ex, + int local) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned long flags; + u32 lkm_flags = local ? DLM_LKF_LOCAL : 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ocfs2_lock_create(osb, lockres, level, lkm_flags); +} + +/* Grants us an EX lock on the data and metadata resources, skipping + * the normal cluster directory lookup. Use this ONLY on newly created + * inodes which other nodes can't possibly see, and which haven't been + * hashed in the inode hash yet. This can give us a good performance + * increase as it'll skip the network broadcast normally associated + * with creating a new lock resource. */ +int ocfs2_create_new_inode_locks(struct inode *inode) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + BUG_ON(!ocfs2_inode_is_new(inode)); + + mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); + + /* NOTE: That we don't increment any of the holder counts, nor + * do we add anything to a journal handle. Since this is + * supposed to be a new inode which the cluster doesn't know + * about yet, there is no need to. As far as the LVB handling + * is concerned, this is basically like acquiring an EX lock + * on a resource which has an invalid one -- we'll set it + * valid when we release the EX. */ + + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1); + if (ret) { + mlog_errno(ret); + goto bail; + } + + /* + * We don't want to use DLM_LKF_LOCAL on a meta data lock as they + * don't use a generation in their lock names. 
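+ *
+ * (Added for clarity: the rw lock above was created at EX with
+ * DLM_LKF_LOCAL, the inode/metadata lock below is created at EX
+ * without DLM_LKF_LOCAL, and the open lock is created at PR.)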
+ */ + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + +bail: + return ret; +} + +int ocfs2_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, + 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rw_unlock(struct inode *inode, int write) +{ + int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); +} + +/* + * ocfs2_open_lock always get PR mode lock. + */ +int ocfs2_open_lock(struct inode *inode) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu take PRMODE open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_PR, 0, 0); + if (status < 0) + mlog_errno(status); + +out: + return status; +} + +int ocfs2_try_open_lock(struct inode *inode, int write) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu try to take %s open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + /* + * The file system may already holding a PRMODE/EXMODE open lock. + * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on + * other nodes and the -EAGAIN will indicate to the caller that + * this inode is still in use. + */ + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + level, DLM_LKF_NOQUEUE, 0); + +out: + return status; +} + +/* + * ocfs2_open_unlock unlock PR and EX mode open locks. 
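+ *
+ * (Illustrative usage, not part of the original comment: a caller
+ * that wants to know whether some other node still has this inode
+ * open can do roughly
+ *
+ *	if (ocfs2_try_open_lock(inode, 1) == -EAGAIN)
+ *		... another node still holds an open lock ...
+ *
+ * because the DLM_LKF_NOQUEUE request above fails instead of
+ * blocking. ocfs2_open_unlock() below then has to drop whichever of
+ * the PR or EX holds were taken.)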
+ */ +void ocfs2_open_unlock(struct inode *inode) +{ + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + if(lockres->l_ro_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_PR); + if(lockres->l_ex_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_EX); + +out: + return; +} + +static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, + int level) +{ + int ret; + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + +retry_cancel: + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + ret = ocfs2_prepare_cancel_convert(osb, lockres); + if (ret) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto retry_cancel; + } + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_for_mask(&mw); + goto retry_cancel; + } + + ret = -ERESTARTSYS; + /* + * We may still have gotten the lock, in which case there's no + * point to restarting the syscall. + */ + if (lockres->l_level == level) + ret = 0; + + mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret, + lockres->l_flags, lockres->l_level, lockres->l_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + +out: + return ret; +} + +/* + * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of + * flock() calls. The locking approach this requires is sufficiently + * different from all other cluster lock types that we implement a + * separate path to the "low-level" dlm calls. In particular: + * + * - No optimization of lock levels is done - we take at exactly + * what's been requested. + * + * - No lock caching is employed. We immediately downconvert to + * no-lock at unlock time. This also means flock locks never go on + * the blocking list). + * + * - Since userspace can trivially deadlock itself with flock, we make + * sure to allow cancellation of a misbehaving applications flock() + * request. + * + * - Access to any flock lockres doesn't require concurrency, so we + * can simplify the code by requiring the caller to guarantee + * serialization of dlmglue flock calls. + */ +int ocfs2_file_lock(struct file *file, int ex, int trylock) +{ + int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned int lkm_flags = trylock ? 
DLM_LKF_NOQUEUE : 0; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if ((lockres->l_flags & OCFS2_LOCK_BUSY) || + (lockres->l_level > DLM_LOCK_NL)) { + mlog(ML_ERROR, + "File lock \"%s\" has busy or locked state: flags: 0x%lx, " + "level: %u\n", lockres->l_name, lockres->l_flags, + lockres->l_level); + return -EINVAL; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* + * Get the lock at NLMODE to start - that way we + * can cancel the upconvert request if need be. + */ + ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_wait_for_mask(&mw); + if (ret) { + mlog_errno(ret); + goto out; + } + spin_lock_irqsave(&lockres->l_lock, flags); + } + + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, + lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1); + if (ret) { + if (!trylock || (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ret = -EINVAL; + } + + ocfs2_recover_from_dlm_error(lockres, 1); + lockres_remove_mask_waiter(lockres, &mw); + goto out; + } + + ret = ocfs2_wait_for_mask_interruptible(&mw, lockres); + if (ret == -ERESTARTSYS) { + /* + * Userspace can cause deadlock itself with + * flock(). Current behavior locally is to allow the + * deadlock, but abort the system call if a signal is + * received. We follow this example, otherwise a + * poorly written program could sit in kernel until + * reboot. + * + * Handling this is a bit more complicated for Ocfs2 + * though. We can't exit this function with an + * outstanding lock request, so a cancel convert is + * required. We intentionally overwrite 'ret' - if the + * cancel fails and the lock was granted, it's easier + * to just bubble success back up to the user. + */ + ret = ocfs2_flock_handle_signal(lockres, level); + } else if (!ret && (level > lockres->l_level)) { + /* Trylock failed asynchronously */ + BUG_ON(!trylock); + ret = -EAGAIN; + } + +out: + + mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n", + lockres->l_name, ex, trylock, ret); + return ret; +} + +void ocfs2_file_unlock(struct file *file) +{ + int ret; + unsigned int gen; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) + return; + + if (lockres->l_level == DLM_LOCK_NL) + return; + + mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", + lockres->l_name, lockres->l_flags, lockres->l_level, + lockres->l_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + /* + * Fake a blocking ast for the downconvert code. 
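+ *
+ * Setting OCFS2_LOCK_BLOCKED with l_blocking = DLM_LOCK_EX below
+ * makes the normal downconvert path treat this as if a remote node
+ * had asked for EX, so ocfs2_prepare_downconvert() and
+ * ocfs2_downconvert_lock() take us straight down to DLM_LOCK_NL.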
+ */ + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + lockres->l_blocking = DLM_LOCK_EX; + + gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); + if (ret) { + mlog_errno(ret); + return; + } + + ret = ocfs2_wait_for_mask(&mw); + if (ret) + mlog_errno(ret); +} + +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int kick = 0; + + /* If we know that another node is waiting on our lock, kick + * the downconvert thread * pre-emptively when we reach a release + * condition. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { + switch(lockres->l_blocking) { + case DLM_LOCK_EX: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + kick = 1; + break; + case DLM_LOCK_PR: + if (!lockres->l_ex_holders) + kick = 1; + break; + default: + BUG(); + } + } + + if (kick) + ocfs2_wake_downconvert_thread(osb); +} + +#define OCFS2_SEC_BITS 34 +#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) + +/* LVB only has room for 64 bits of time here so we pack it for + * now. */ +static u64 ocfs2_pack_timespec(struct timespec *spec) +{ + u64 res; + u64 sec = spec->tv_sec; + u32 nsec = spec->tv_nsec; + + res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); + + return res; +} + +/* Call this with the lockres locked. I am reasonably sure we don't + * need ip_lock in this function as anyone who would be changing those + * values is supposed to be blocked in ocfs2_inode_lock right now. */ +static void __ocfs2_stuff_meta_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_meta_lvb *lvb; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + /* + * Invalidate the LVB of a deleted inode - this way other + * nodes are forced to go to disk and discover the new inode + * status. + */ + if (oi->ip_flags & OCFS2_INODE_DELETED) { + lvb->lvb_version = 0; + goto out; + } + + lvb->lvb_version = OCFS2_LVB_VERSION; + lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + lvb->lvb_ictime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + lvb->lvb_imtime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); + lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); + lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); + +out: + mlog_meta_lvb(0, lockres); +} + +static void ocfs2_unpack_timespec(struct timespec *spec, + u64 packed_time) +{ + spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; + spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; +} + +static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_meta_lvb(0, lockres); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + /* We're safe here without the lockres lock... 
*/ + spin_lock(&oi->ip_lock); + oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); + i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); + + oi->ip_attr = be32_to_cpu(lvb->lvb_iattr); + oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures); + ocfs2_set_inode_flags(inode); + + /* fast-symlinks are a special case */ + if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + ocfs2_unpack_timespec(&inode->i_atime, + be64_to_cpu(lvb->lvb_iatime_packed)); + ocfs2_unpack_timespec(&inode->i_mtime, + be64_to_cpu(lvb->lvb_imtime_packed)); + ocfs2_unpack_timespec(&inode->i_ctime, + be64_to_cpu(lvb->lvb_ictime_packed)); + spin_unlock(&oi->ip_lock); +} + +static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) + && lvb->lvb_version == OCFS2_LVB_VERSION + && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) + return 1; + return 0; +} + +/* Determine whether a lock resource needs to be refreshed, and + * arbitrate who gets to refresh it. + * + * 0 means no refresh needed. + * + * > 0 means you need to refresh this and you MUST call + * ocfs2_complete_lock_res_refresh afterwards. */ +static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + int status = 0; + +refresh_check: + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_on_refreshing_lock(lockres); + goto refresh_check; + } + + /* Ok, I'll be the one to refresh this lock. */ + lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = 1; +bail: + mlog(0, "status %d\n", status); + return status; +} + +/* If status is non zero, I'll mark it as not being in refresh + * anymroe, but i won't clear the needs refresh flag. */ +static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, + int status) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); + if (!status) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +/* may or may not return a bh if it went to disk. */ +static int ocfs2_inode_lock_update(struct inode *inode, + struct buffer_head **bh) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_dinode *fe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_mount_local(osb)) + goto bail; + + spin_lock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DELETED) { + mlog(0, "Orphaned inode %llu was deleted while we " + "were waiting on a lock. 
ip_flags = 0x%x\n", + (unsigned long long)oi->ip_blkno, oi->ip_flags); + spin_unlock(&oi->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&oi->ip_lock); + + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + + /* This will discard any caching information we might have had + * for the inode metadata. */ + ocfs2_metadata_cache_purge(INODE_CACHE(inode)); + + ocfs2_extent_map_trunc(inode, 0); + + if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { + mlog(0, "Trusting LVB on inode %llu\n", + (unsigned long long)oi->ip_blkno); + ocfs2_refresh_inode_from_lvb(inode); + } else { + /* Boo, we have to go to disk. */ + /* read bh, cast, ocfs2_refresh_inode */ + status = ocfs2_read_inode_block(inode, bh); + if (status < 0) { + mlog_errno(status); + goto bail_refresh; + } + fe = (struct ocfs2_dinode *) (*bh)->b_data; + + /* This is a good chance to make sure we're not + * locking an invalid object. ocfs2_read_inode_block() + * already checked that the inode block is sane. + * + * We bug on a stale inode here because we checked + * above whether it was wiped from disk. The wiping + * node provides a guarantee that we receive that + * message and can mark the inode before dropping any + * locks associated with it. */ + mlog_bug_on_msg(inode->i_generation != + le32_to_cpu(fe->i_generation), + "Invalid dinode %llu disk generation: %u " + "inode->i_generation: %u\n", + (unsigned long long)oi->ip_blkno, + le32_to_cpu(fe->i_generation), + inode->i_generation); + mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), + "Stale dinode %llu dtime: %llu flags: 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + + ocfs2_refresh_inode(inode, fe); + ocfs2_track_lock_refresh(lockres); + } + + status = 0; +bail_refresh: + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + return status; +} + +static int ocfs2_assign_bh(struct inode *inode, + struct buffer_head **ret_bh, + struct buffer_head *passed_bh) +{ + int status; + + if (passed_bh) { + /* Ok, the update went to disk for us, use the + * returned bh. */ + *ret_bh = passed_bh; + get_bh(*ret_bh); + + return 0; + } + + status = ocfs2_read_inode_block(inode, ret_bh); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * returns < 0 error if the callback will never be called, otherwise + * the result of the lock will be communicated via the callback. + */ +int ocfs2_inode_lock_full_nested(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + int arg_flags, + int subclass) +{ + int status, level, acquired; + u32 dlm_flags; + struct ocfs2_lock_res *lockres = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *local_bh = NULL; + + BUG_ON(!inode); + + mlog(0, "inode %llu, take %s META lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + status = 0; + acquired = 0; + /* We'll allow faking a readonly metadata lock for + * rodevices. */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + + if (ocfs2_mount_local(osb)) + goto local; + + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + ocfs2_wait_for_recovery(osb); + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + level = ex ? 
DLM_LOCK_EX : DLM_LOCK_PR; + dlm_flags = 0; + if (arg_flags & OCFS2_META_LOCK_NOQUEUE) + dlm_flags |= DLM_LKF_NOQUEUE; + + status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, + arg_flags, subclass, _RET_IP_); + if (status < 0) { + if (status != -EAGAIN && status != -EIOCBRETRY) + mlog_errno(status); + goto bail; + } + + /* Notify the error cleanup path to drop the cluster lock. */ + acquired = 1; + + /* We wait twice because a node may have died while we were in + * the lower dlm layers. The second time though, we've + * committed to owning this lock so we don't allow signals to + * abort the operation. */ + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + ocfs2_wait_for_recovery(osb); + +local: + /* + * We only see this flag if we're being called from + * ocfs2_read_locked_inode(). It means we're locking an inode + * which hasn't been populated yet, so clear the refresh flag + * and let the caller handle it. + */ + if (inode->i_state & I_NEW) { + status = 0; + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, 0); + goto bail; + } + + /* This is fun. The caller may want a bh back, or it may + * not. ocfs2_inode_lock_update definitely wants one in, but + * may or may not read one, depending on what's in the + * LVB. The result of all of this is that we've *only* gone to + * disk if we have to, so the complexity is worthwhile. */ + status = ocfs2_inode_lock_update(inode, &local_bh); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (ret_bh) { + status = ocfs2_assign_bh(inode, ret_bh, local_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (status < 0) { + if (ret_bh && (*ret_bh)) { + brelse(*ret_bh); + *ret_bh = NULL; + } + if (acquired) + ocfs2_inode_unlock(inode, ex); + } + + if (local_bh) + brelse(local_bh); + + return status; +} + +/* + * This is working around a lock inversion between tasks acquiring DLM + * locks while holding a page lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring page locks. + * + * ** These _with_page variantes are only intended to be called from aop + * methods that hold page locks and return a very specific *positive* error + * code that aop methods pass up to the VFS -- test for errors with != 0. ** + * + * The DLM is called such that it returns -EAGAIN if it would have + * blocked waiting for the downconvert thread. In that case we unlock + * our page so the downconvert thread can make progress. Once we've + * done this we have to return AOP_TRUNCATED_PAGE so the aop method + * that called us can bubble that back up into the VFS who will then + * immediately retry the aop call. + * + * We do a blocking lock and immediate unlock before returning, though, so that + * the lock has a great chance of being cached on this node by the time the VFS + * calls back to retry the aop. This has a potential to livelock as nodes + * ping locks back and forth, but that's a risk we're willing to take to avoid + * the lock inversion simply. 
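+ *
+ * (Illustrative aop-side pattern, not from the original comment:
+ *
+ *	ret = ocfs2_inode_lock_with_page(inode, &di_bh, 0, page);
+ *	if (ret == AOP_TRUNCATED_PAGE)
+ *		return ret;	-- the VFS re-locks the page and retries
+ *	else if (ret)
+ *		-- a real error --
+ *
+ * "di_bh" here is just a hypothetical buffer_head pointer owned by
+ * the caller.)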
+ */ +int ocfs2_inode_lock_with_page(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct page *page) +{ + int ret; + + ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +int ocfs2_inode_lock_atime(struct inode *inode, + struct vfsmount *vfsmnt, + int *level) +{ + int ret; + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* + * If we should update atime, we will get EX lock, + * otherwise we just get PR lock. + */ + if (ocfs2_should_update_atime(inode, vfsmnt)) { + struct buffer_head *bh = NULL; + + ocfs2_inode_unlock(inode, 0); + ret = ocfs2_inode_lock(inode, &bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + *level = 1; + if (ocfs2_should_update_atime(inode, vfsmnt)) + ocfs2_update_inode_atime(inode, bh); + if (bh) + brelse(bh); + } else + *level = 0; + + return ret; +} + +void ocfs2_inode_unlock(struct inode *inode, + int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop %s META lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); +} + +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int status = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &osb->osb_orphan_scan.os_lockres; + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); + if (status < 0) + return status; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION) + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + else + *seqno = osb->osb_orphan_scan.os_seqno + 1; + + return status; +} + +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) { + lockres = &osb->osb_orphan_scan.os_lockres; + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; + lvb->lvb_os_seqno = cpu_to_be32(seqno); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); + } +} + +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex) +{ + int status = 0; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* The super block lock path is really in the best position to + * know when resources covered by the lock need to be + * refreshed, so we do it here. 
Of course, making sense of + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (status) { + status = ocfs2_refresh_slot_info(osb); + + ocfs2_complete_lock_res_refresh(lockres, status); + + if (status < 0) + mlog_errno(status); + ocfs2_track_lock_refresh(lockres); + } +bail: + return status; +} + +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +int ocfs2_rename_lock(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rename_unlock(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, + 0, 0); + if (status < 0) + mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + + return status; +} + +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) +{ + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, + ex ? LKM_EXMODE : LKM_PRMODE); +} + +int ocfs2_dentry_lock(struct dentry *dentry, int ex) +{ + int ret; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + BUG_ON(!dl); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +void ocfs2_dentry_unlock(struct dentry *dentry, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); +} + +/* Reference counting of the dlm debug structure. We want this because + * open references on the debug inodes can live on after a mount, so + * we can't rely on the ocfs2_super to always exist. 
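+ * (ocfs2_get_dlm_debug()/ocfs2_put_dlm_debug() below are thin
+ * kref_get()/kref_put() wrappers, with ocfs2_dlm_debug_free() as the
+ * kref release function, so the structure lives until the last
+ * debugfs reference goes away.)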
*/ +static void ocfs2_dlm_debug_free(struct kref *kref) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); + + kfree(dlm_debug); +} + +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) +{ + if (dlm_debug) + kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); +} + +static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) +{ + kref_get(&debug->d_refcnt); +} + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); + if (!dlm_debug) { + mlog_errno(-ENOMEM); + goto out; + } + + kref_init(&dlm_debug->d_refcnt); + INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); + dlm_debug->d_locking_state = NULL; +out: + return dlm_debug; +} + +/* Access to this is arbitrated for us via seq_file->sem. */ +struct ocfs2_dlm_seq_priv { + struct ocfs2_dlm_debug *p_dlm_debug; + struct ocfs2_lock_res p_iter_res; + struct ocfs2_lock_res p_tmp_res; +}; + +static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, + struct ocfs2_dlm_seq_priv *priv) +{ + struct ocfs2_lock_res *iter, *ret = NULL; + struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; + + assert_spin_locked(&ocfs2_dlm_tracking_lock); + + list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { + /* discover the head of the list */ + if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { + mlog(0, "End of list found, %p\n", ret); + break; + } + + /* We track our "dummy" iteration lockres' by a NULL + * l_ops field. */ + if (iter->l_ops != NULL) { + ret = iter; + break; + } + } + + return ret; +} + +static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); + if (iter) { + /* Since lockres' have the lifetime of their container + * (which can be inodes, ocfs2_supers, etc) we want to + * copy this out to a temporary lockres while still + * under the spinlock. Obviously after this we can't + * trust any pointers on the copy returned, but that's + * ok as the information we want isn't typically held + * in them. 
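+ *
+ * (How the iteration works, for reference: the dummy p_iter_res has
+ * l_ops == NULL, which is how ocfs2_dlm_next_res() above skips it,
+ * and ocfs2_dlm_seq_next() re-links the dummy right after the entry
+ * it just returned so that the next call resumes from there.)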
*/ + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter = v; + struct ocfs2_lock_res *dummy = &priv->p_iter_res; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(iter, priv); + list_del_init(&dummy->l_debug_list); + if (iter) { + list_add(&dummy->l_debug_list, &iter->l_debug_list); + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +/* + * Version is used by debugfs.ocfs2 to determine the format being used + * + * New in version 2 + * - Lock stats printed + * New in version 3 + * - Max time in lock stats is in usecs (instead of nsecs) + */ +#define OCFS2_DLM_DEBUG_STR_VERSION 3 +static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) +{ + int i; + char *lvb; + struct ocfs2_lock_res *lockres = v; + + if (!lockres) + return -EINVAL; + + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); + + if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) + seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1, + lockres->l_name, + (unsigned int)ocfs2_get_dentry_lock_ino(lockres)); + else + seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name); + + seq_printf(m, "%d\t" + "0x%lx\t" + "0x%x\t" + "0x%x\t" + "%u\t" + "%u\t" + "%d\t" + "%d\t", + lockres->l_level, + lockres->l_flags, + lockres->l_action, + lockres->l_unlock_action, + lockres->l_ro_holders, + lockres->l_ex_holders, + lockres->l_requested, + lockres->l_blocking); + + /* Dump the raw LVB */ + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + for(i = 0; i < DLM_LVB_LEN; i++) + seq_printf(m, "0x%x\t", lvb[i]); + +#ifdef CONFIG_OCFS2_FS_STATS +# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets) +# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets) +# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail) +# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail) +# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total) +# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total) +# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) +# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) +# define lock_refresh(_l) ((_l)->l_lock_refresh) +#else +# define lock_num_prmode(_l) (0) +# define lock_num_exmode(_l) (0) +# define lock_num_prmode_failed(_l) (0) +# define lock_num_exmode_failed(_l) (0) +# define lock_total_prmode(_l) (0ULL) +# define lock_total_exmode(_l) (0ULL) +# define lock_max_prmode(_l) (0) +# define lock_max_exmode(_l) (0) +# define lock_refresh(_l) (0) +#endif + /* The following seq_print was added in version 2 of this output */ + seq_printf(m, "%u\t" + "%u\t" + "%u\t" + "%u\t" + "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%u\t", + lock_num_prmode(lockres), + lock_num_exmode(lockres), + lock_num_prmode_failed(lockres), + lock_num_exmode_failed(lockres), + lock_total_prmode(lockres), + lock_total_exmode(lockres), + lock_max_prmode(lockres), + lock_max_exmode(lockres), + lock_refresh(lockres)); + + /* End the line */ + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations ocfs2_dlm_seq_ops = { + .start = ocfs2_dlm_seq_start, + .stop = ocfs2_dlm_seq_stop, + .next = ocfs2_dlm_seq_next, + .show = ocfs2_dlm_seq_show, +}; + +static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) +{ 
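+	/* Undo ocfs2_dlm_debug_open(): pull the dummy iteration lockres
+	 * back off the tracking list and drop our reference on the dlm
+	 * debug structure before the seq_file private data is freed. */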
+ struct seq_file *seq = file->private_data; + struct ocfs2_dlm_seq_priv *priv = seq->private; + struct ocfs2_lock_res *res = &priv->p_iter_res; + + ocfs2_remove_lockres_tracking(res); + ocfs2_put_dlm_debug(priv->p_dlm_debug); + return seq_release_private(inode, file); +} + +static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) +{ + int ret; + struct ocfs2_dlm_seq_priv *priv; + struct seq_file *seq; + struct ocfs2_super *osb; + + priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + osb = inode->i_private; + ocfs2_get_dlm_debug(osb->osb_dlm_debug); + priv->p_dlm_debug = osb->osb_dlm_debug; + INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); + + ret = seq_open(file, &ocfs2_dlm_seq_ops); + if (ret) { + kfree(priv); + mlog_errno(ret); + goto out; + } + + seq = file->private_data; + seq->private = priv; + + ocfs2_add_lockres_tracking(&priv->p_iter_res, + priv->p_dlm_debug); + +out: + return ret; +} + +static const struct file_operations ocfs2_dlm_debug_fops = { + .open = ocfs2_dlm_debug_open, + .release = ocfs2_dlm_debug_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +{ + int ret = 0; + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + dlm_debug->d_locking_state = debugfs_create_file("locking_state", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_dlm_debug_fops); + if (!dlm_debug->d_locking_state) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking state debugfs file.\n"); + goto out; + } + + ocfs2_get_dlm_debug(dlm_debug); +out: + return ret; +} + +static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) +{ + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + if (dlm_debug) { + debugfs_remove(dlm_debug->d_locking_state); + ocfs2_put_dlm_debug(dlm_debug); + } +} + +int ocfs2_dlm_init(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_cluster_connection *conn = NULL; + + if (ocfs2_mount_local(osb)) { + osb->node_num = 0; + goto local; + } + + status = ocfs2_dlm_init_debug(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* launch downconvert thread */ + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + if (IS_ERR(osb->dc_task)) { + status = PTR_ERR(osb->dc_task); + osb->dc_task = NULL; + mlog_errno(status); + goto bail; + } + + /* for now, uuid == domain */ + status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->uuid_str, + strlen(osb->uuid_str), + &lproto, ocfs2_do_node_down, osb, + &conn); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_cluster_this_node(&osb->node_num); + if (status < 0) { + mlog_errno(status); + mlog(ML_ERROR, + "could not find this host's node number\n"); + ocfs2_cluster_disconnect(conn, 0); + goto bail; + } + +local: + ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); + ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); + + osb->cconn = conn; + + status = 0; +bail: + if (status < 0) { + ocfs2_dlm_shutdown_debug(osb); + if (osb->dc_task) + kthread_stop(osb->dc_task); + } + + return status; +} + +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, + int hangup_pending) +{ + ocfs2_drop_osb_locks(osb); + + /* + * Now that we have dropped all locks and ocfs2_dismount_volume() + * has disabled recovery, the DLM won't be talking to us. 
It's + * safe to tear things down before disconnecting the cluster. + */ + + if (osb->dc_task) { + kthread_stop(osb->dc_task); + osb->dc_task = NULL; + } + + ocfs2_lock_res_free(&osb->osb_super_lockres); + ocfs2_lock_res_free(&osb->osb_rename_lockres); + ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); + + ocfs2_cluster_disconnect(osb->cconn, hangup_pending); + osb->cconn = NULL; + + ocfs2_dlm_shutdown_debug(osb); +} + +static int ocfs2_drop_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + unsigned long flags; + u32 lkm_flags = 0; + + /* We didn't get anywhere near actually using this lockres. */ + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) + goto out; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= DLM_LKF_VALBLK; + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), + "lockres %s, flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + + while (lockres->l_flags & OCFS2_LOCK_BUSY) { + mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = " + "%u, unlock_action = %u\n", + lockres->l_name, lockres->l_flags, lockres->l_action, + lockres->l_unlock_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* XXX: Today we just wait on any busy + * locks... Perhaps we need to cancel converts in the + * future? */ + ocfs2_wait_on_busy_lock(lockres); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level == DLM_LOCK_EX && + !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } + + if (lockres->l_flags & OCFS2_LOCK_BUSY) + mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", + lockres->l_name); + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto out; + } + + lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED); + + /* make sure we never get here while waiting for an ast to + * fire. */ + BUG_ON(lockres->l_action != OCFS2_AST_INVALID); + + /* is this necessary? */ + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "lock %s\n", lockres->l_name); + + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); + mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); + ocfs2_dlm_dump_lksb(&lockres->l_lksb); + BUG(); + } + mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", + lockres->l_name); + + ocfs2_wait_on_busy_lock(lockres); +out: + return 0; +} + +/* Mark the lockres as being dropped. It will no longer be + * queued if blocking, but we still may have to wait on it + * being dequeued from the downconvert thread before we can consider + * it safe to drop. + * + * You can *not* attempt to call cluster_lock on this lockres anymore. 
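+ *
+ * (The usual pairing, shown by ocfs2_simple_drop_lockres() below, is
+ * ocfs2_mark_lockres_freeing() followed by ocfs2_drop_lock().)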
*/ +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_mask_waiter mw; + unsigned long flags; + + ocfs2_init_mask_waiter(&mw); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres->l_flags |= OCFS2_LOCK_FREEING; + while (lockres->l_flags & OCFS2_LOCK_QUEUED) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "Waiting on lockres %s\n", lockres->l_name); + + status = ocfs2_wait_for_mask(&mw); + if (status) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ocfs2_mark_lockres_freeing(lockres); + ret = ocfs2_drop_lock(osb, lockres); + if (ret) + mlog_errno(ret); +} + +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) +{ + ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres); +} + +int ocfs2_drop_inode_locks(struct inode *inode) +{ + int status, err; + + /* No need to call ocfs2_mark_lockres_freeing here - + * ocfs2_clear_inode has done it for us. */ + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_open_lockres); + if (err < 0) + mlog_errno(err); + + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_inode_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_rw_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + return status; +} + +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + assert_spin_locked(&lockres->l_lock); + + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); + + if (lockres->l_level <= new_level) { + mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, " + "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, " + "block %d, pgen %d\n", lockres->l_name, lockres->l_level, + new_level, list_empty(&lockres->l_blocked_list), + list_empty(&lockres->l_mask_waiters), lockres->l_type, + lockres->l_flags, lockres->l_ro_holders, + lockres->l_ex_holders, lockres->l_action, + lockres->l_unlock_action, lockres->l_requested, + lockres->l_blocking, lockres->l_pending_gen); + BUG(); + } + + mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n", + lockres->l_name, lockres->l_level, new_level, lockres->l_blocking); + + lockres->l_action = OCFS2_AST_DOWNCONVERT; + lockres->l_requested = new_level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + return lockres_set_pending(lockres); +} + +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb, + unsigned int generation) +{ + int ret; + u32 dlm_flags = DLM_LKF_CONVERT; + + mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, + lockres->l_level, new_level); + + if (lvb) + dlm_flags |= DLM_LKF_VALBLK; + + ret = ocfs2_dlm_lock(osb->cconn, + new_level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, generation, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 1); + goto bail; + } + + ret = 0; 
+bail: + return ret; +} + +/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */ +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + /* If we're already trying to cancel a lock conversion + * then just drop the spinlock and allow the caller to + * requeue this lock. */ + mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name); + return 0; + } + + /* were we in a convert when we got the bast fire? */ + BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && + lockres->l_action != OCFS2_AST_DOWNCONVERT); + /* set things up for the unlockast to know to just + * clear out the ast_action and unset busy, etc. */ + lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), + "lock %s, invalid flags: 0x%lx\n", + lockres->l_name, lockres->l_flags); + + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + + return 1; +} + +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 0); + } + + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + + return ret; +} + +static int ocfs2_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + struct ocfs2_unblock_ctl *ctl) +{ + unsigned long flags; + int blocking; + int new_level; + int level; + int ret = 0; + int set_lvb = 0; + unsigned int gen; + + spin_lock_irqsave(&lockres->l_lock, flags); + +recheck: + /* + * Is it still blocking? If not, we have no more work to do. + */ + if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) { + BUG_ON(lockres->l_blocking != DLM_LOCK_NL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = 0; + goto leave; + } + + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + /* XXX + * This is a *big* race. The OCFS2_LOCK_PENDING flag + * exists entirely for one reason - another thread has set + * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock(). + * + * If we do ocfs2_cancel_convert() before the other thread + * calls dlm_lock(), our cancel will do nothing. We will + * get no ast, and we will have no way of knowing the + * cancel failed. Meanwhile, the other thread will call + * into dlm_lock() and wait...forever. + * + * Why forever? Because another node has asked for the + * lock first; that's why we're here in unblock_lock(). + * + * The solution is OCFS2_LOCK_PENDING. When PENDING is + * set, we just requeue the unblock. Only when the other + * thread has called dlm_lock() and cleared PENDING will + * we then cancel their request. + * + * All callers of dlm_lock() must set OCFS2_DLM_PENDING + * at the same time they set OCFS2_DLM_BUSY. They must + * clear OCFS2_DLM_PENDING after dlm_lock() returns. + */ + if (lockres->l_flags & OCFS2_LOCK_PENDING) { + mlog(ML_BASTS, "lockres %s, ReQ: Pending\n", + lockres->l_name); + goto leave_requeue; + } + + ctl->requeue = 1; + ret = ocfs2_prepare_cancel_convert(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ret) { + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) + mlog_errno(ret); + } + goto leave; + } + + /* + * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is + * set when the ast is received for an upconvert just before the + * OCFS2_LOCK_BUSY flag is cleared. 
Now if the fs received a bast + * on the heels of the ast, we want to delay the downconvert just + * enough to allow the up requestor to do its task. Because this + * lock is in the blocked queue, the lock will be downconverted + * as soon as the requestor is done with the lock. + */ + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) + goto leave_requeue; + + /* + * How can we block and yet be at NL? We were trying to upconvert + * from NL and got canceled. The code comes back here, and now + * we notice and clear BLOCKING. + */ + if (lockres->l_level == DLM_LOCK_NL) { + BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); + mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name); + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto leave; + } + + /* if we're blocking an exclusive and we have *any* holders, + * then requeue. */ + if ((lockres->l_blocking == DLM_LOCK_EX) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n", + lockres->l_name, lockres->l_ex_holders, + lockres->l_ro_holders); + goto leave_requeue; + } + + /* If it's a PR we're blocking, then only + * requeue if we've got any EX holders */ + if (lockres->l_blocking == DLM_LOCK_PR && + lockres->l_ex_holders) { + mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n", + lockres->l_name, lockres->l_ex_holders); + goto leave_requeue; + } + + /* + * Can we get a lock in this state if the holder counts are + * zero? The meta data unblock code used to check this. + */ + if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) { + mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n", + lockres->l_name); + goto leave_requeue; + } + + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + if (lockres->l_ops->check_downconvert + && !lockres->l_ops->check_downconvert(lockres, new_level)) { + mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n", + lockres->l_name); + goto leave_requeue; + } + + /* If we get here, then we know that there are no more + * incompatible holders (and anyone asking for an incompatible + * lock is blocked). We can now downconvert the lock */ + if (!lockres->l_ops->downconvert_worker) + goto downconvert; + + /* Some lockres types want to do a bit of work before + * downconverting a lock. Allow that here. The worker function + * may sleep, so we save off a copy of what we're blocking as + * it may change while we're not holding the spin lock. */ + blocking = lockres->l_blocking; + level = lockres->l_level; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); + + if (ctl->unblock_action == UNBLOCK_STOP_POST) { + mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n", + lockres->l_name); + goto leave; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { + /* If this changed underneath us, then we can't drop + * it just yet. */ + mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, " + "Recheck\n", lockres->l_name, blocking, + lockres->l_blocking, level, lockres->l_level); + goto recheck; + } + +downconvert: + ctl->requeue = 0; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_level == DLM_LOCK_EX) + set_lvb = 1; + + /* + * We only set the lvb if the lock has been fully + * refreshed - otherwise we risk setting stale + * data. 
Otherwise, there's no need to actually clear + * out the lvb here as it's value is still valid. + */ + if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } + + gen = ocfs2_prepare_downconvert(lockres, new_level); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, + gen); + +leave: + if (ret) + mlog_errno(ret); + return ret; + +leave_requeue: + spin_unlock_irqrestore(&lockres->l_lock, flags); + ctl->requeue = 1; + + return 0; +} + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct inode *inode; + struct address_space *mapping; + struct ocfs2_inode_info *oi; + + inode = ocfs2_lock_res_inode(lockres); + mapping = inode->i_mapping; + + if (S_ISDIR(inode->i_mode)) { + oi = OCFS2_I(inode); + oi->ip_dir_lock_gen++; + mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); + goto out; + } + + if (!S_ISREG(inode->i_mode)) + goto out; + + /* + * We need this before the filemap_fdatawrite() so that it can + * transfer the dirty bit from the PTE to the + * page. Unfortunately this means that even for EX->PR + * downconverts, we'll lose our mappings and have to build + * them up again. + */ + unmap_mapping_range(mapping, 0, 0, 0); + + if (filemap_fdatawrite(mapping)) { + mlog(ML_ERROR, "Could not sync inode %llu for downconvert!", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + sync_mapping_buffers(mapping); + if (blocking == DLM_LOCK_EX) { + truncate_inode_pages(mapping, 0); + } else { + /* We only need to wait on the I/O if we're not also + * truncating pages because truncate_inode_pages waits + * for us above. We don't truncate pages if we're + * blocking anything < EXMODE because we want to keep + * them around in that case. */ + filemap_fdatawait(mapping); + } + +out: + return UNBLOCK_CONTINUE; +} + +static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci, + struct ocfs2_lock_res *lockres, + int new_level) +{ + int checkpointed = ocfs2_ci_fully_checkpointed(ci); + + BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); + BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); + + if (checkpointed) + return 1; + + ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci))); + return 0; +} + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level); +} + +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + __ocfs2_stuff_meta_lvb(inode); +} + +/* + * Does the final reference drop on our dentry lock. Right now this + * happens in the downconvert thread, but we could choose to simplify the + * dlmglue API and push these off to the ocfs2_wq in the future. + */ +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + ocfs2_dentry_lock_put(osb, dl); +} + +/* + * d_delete() matching dentries before the lock downconvert. + * + * At this point, any process waiting to destroy the + * dentry_lock due to last ref count is stopped by the + * OCFS2_LOCK_QUEUED flag. + * + * We have two potential problems + * + * 1) If we do the last reference drop on our dentry_lock (via dput) + * we'll wind up in ocfs2_release_dentry_lock(), waiting on + * the downconvert to finish. 
Instead we take an elevated + * reference and push the drop until after we've completed our + * unblock processing. + * + * 2) There might be another process with a final reference, + * waiting on us to finish processing. If this is the case, we + * detect it and exit out - there's no more dentries anyway. + */ +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode); + struct dentry *dentry; + unsigned long flags; + int extra_ref = 0; + + /* + * This node is blocking another node from getting a read + * lock. This happens when we've renamed within a + * directory. We've forced the other nodes to d_delete(), but + * we never actually dropped our lock because it's still + * valid. The downconvert code will retain a PR for this node, + * so there's no further work to do. + */ + if (blocking == DLM_LOCK_PR) + return UNBLOCK_CONTINUE; + + /* + * Mark this inode as potentially orphaned. The code in + * ocfs2_delete_inode() will figure out whether it actually + * needs to be freed or not. + */ + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + + /* + * Yuck. We need to make sure however that the check of + * OCFS2_LOCK_FREEING and the extra reference are atomic with + * respect to a reference decrement or the setting of that + * flag. + */ + spin_lock_irqsave(&lockres->l_lock, flags); + spin_lock(&dentry_attach_lock); + if (!(lockres->l_flags & OCFS2_LOCK_FREEING) + && dl->dl_count) { + dl->dl_count++; + extra_ref = 1; + } + spin_unlock(&dentry_attach_lock); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "extra_ref = %d\n", extra_ref); + + /* + * We have a process waiting on us in ocfs2_dentry_iput(), + * which means we can't have any more outstanding + * aliases. There's no need to do any more work. + */ + if (!extra_ref) + return UNBLOCK_CONTINUE; + + spin_lock(&dentry_attach_lock); + while (1) { + dentry = ocfs2_find_local_alias(dl->dl_inode, + dl->dl_parent_blkno, 1); + if (!dentry) + break; + spin_unlock(&dentry_attach_lock); + + mlog(0, "d_delete(%.*s);\n", dentry->d_name.len, + dentry->d_name.name); + + /* + * The following dcache calls may do an + * iput(). Normally we don't want that from the + * downconverting thread, but in this case it's ok + * because the requesting node already has an + * exclusive lock on the inode, so it can't be queued + * for a downconvert. + */ + d_delete(dentry); + dput(dentry); + + spin_lock(&dentry_attach_lock); + } + spin_unlock(&dentry_attach_lock); + + /* + * If we are the last holder of this dentry lock, there is no + * reason to downconvert so skip straight to the unlock. 
+ */ + if (dl->dl_count == 1) + return UNBLOCK_STOP_POST; + + return UNBLOCK_CONTINUE_POST; +} + +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level); +} + +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + ocfs2_metadata_cache_purge(&tree->rf_ci); + + return UNBLOCK_CONTINUE; +} + +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_qinfo_lvb *lvb; + struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres); + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; + lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); + lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace); + lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms); + lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); + lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); + lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); +} + +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) +{ + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + struct buffer_head *bh = NULL; + struct ocfs2_global_disk_dqinfo *gdinfo; + int status = 0; + + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { + info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); + info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); + oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); + oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks); + oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk); + oinfo->dqi_gi.dqi_free_entry = + be32_to_cpu(lvb->lvb_free_entry); + } else { + status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode, + oinfo->dqi_giblk, &bh); + if (status) { + mlog_errno(status); + goto bail; + } + gdinfo = (struct ocfs2_global_disk_dqinfo *) + (bh->b_data + OCFS2_GLOBAL_INFO_OFF); + info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace); + info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = + le32_to_cpu(gdinfo->dqi_free_entry); + brelse(bh); + ocfs2_track_lock_refresh(lockres); + } + +bail: + return status; +} + +/* Lock quota info, this function expects at least shared lock on the quota file + * so that we can safely refresh quota info from disk. */ +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? 
DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + /* On RO devices, locking really isn't needed... */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + /* OK, we have the lock but we need to refresh the quota info */ + status = ocfs2_refresh_qinfo(oinfo); + if (status) + ocfs2_qinfo_unlock(oinfo, ex); + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + return status; +} + +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int status; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_unblock_ctl ctl = {0, 0,}; + unsigned long flags; + + /* Our reference to the lockres in this function can be + * considered valid until we remove the OCFS2_LOCK_QUEUED + * flag. */ + + BUG_ON(!lockres); + BUG_ON(!lockres->l_ops); + + mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name); + + /* Detect whether a lock has been marked as going away while + * the downconvert thread was processing other things. A lock can + * still be marked with OCFS2_LOCK_FREEING after this check, + * but short circuiting here will still save us some + * performance. */ + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_FREEING) + goto unqueue; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = ocfs2_unblock_lock(osb, lockres, &ctl); + if (status < 0) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); +unqueue: + if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) { + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + } else + ocfs2_schedule_blocked_lock(osb, lockres); + + mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, + ctl.requeue ? "yes" : "no"); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + if (ctl.unblock_action != UNBLOCK_CONTINUE + && lockres->l_ops->post_unlock) + lockres->l_ops->post_unlock(osb, lockres); +} + +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_flags & OCFS2_LOCK_FREEING) { + /* Do not schedule a lock for downconvert when it's on + * the way to destruction - any nodes wanting access + * to the resource will get it soon. 
*/ + mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + return; + } + + lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); + + spin_lock(&osb->dc_task_lock); + if (list_empty(&lockres->l_blocked_list)) { + list_add_tail(&lockres->l_blocked_list, + &osb->blocked_lock_list); + osb->blocked_lock_count++; + } + spin_unlock(&osb->dc_task_lock); +} + +static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + struct ocfs2_lock_res *lockres; + + spin_lock(&osb->dc_task_lock); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->dc_work_sequence = osb->dc_wake_sequence; + + processed = osb->blocked_lock_count; + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock(&osb->dc_task_lock); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock(&osb->dc_task_lock); + } + spin_unlock(&osb->dc_task_lock); +} + +static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + + spin_lock(&osb->dc_task_lock); + if (list_empty(&osb->blocked_lock_list)) + empty = 1; + + spin_unlock(&osb->dc_task_lock); + return empty; +} + +static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + + spin_lock(&osb->dc_task_lock); + if (osb->dc_work_sequence != osb->dc_wake_sequence) + should_wake = 1; + spin_unlock(&osb->dc_task_lock); + + return should_wake; +} + +static int ocfs2_downconvert_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_downconvert_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->dc_event, + ocfs2_downconvert_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "downconvert_thread: awoken\n"); + + ocfs2_downconvert_thread_do_work(osb); + } + + osb->dc_task = NULL; + return status; +} + +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) +{ + spin_lock(&osb->dc_task_lock); + /* make sure the downconvert thread gets a swipe at whatever changes + * the caller may have made to the lock state */ + osb->dc_wake_sequence++; + spin_unlock(&osb->dc_task_lock); + wake_up(&osb->dc_event); +} diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h new file mode 100644 index 00000000..1d596d8c --- /dev/null +++ b/fs/ocfs2/dlmglue.h @@ -0,0 +1,172 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef DLMGLUE_H +#define DLMGLUE_H + +#include "dcache.h" + +#define OCFS2_LVB_VERSION 5 + +struct ocfs2_meta_lvb { + __u8 lvb_version; + __u8 lvb_reserved0; + __be16 lvb_idynfeatures; + __be32 lvb_iclusters; + __be32 lvb_iuid; + __be32 lvb_igid; + __be64 lvb_iatime_packed; + __be64 lvb_ictime_packed; + __be64 lvb_imtime_packed; + __be64 lvb_isize; + __be16 lvb_imode; + __be16 lvb_inlink; + __be32 lvb_iattr; + __be32 lvb_igeneration; + __be32 lvb_reserved2; +}; + +#define OCFS2_QINFO_LVB_VERSION 1 + +struct ocfs2_qinfo_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_bgrace; + __be32 lvb_igrace; + __be32 lvb_syncms; + __be32 lvb_blocks; + __be32 lvb_free_blk; + __be32 lvb_free_entry; +}; + +#define OCFS2_ORPHAN_LVB_VERSION 1 + +struct ocfs2_orphan_scan_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_os_seqno; +}; + +/* ocfs2_inode_lock_full() 'arg_flags' flags */ +/* don't wait on recovery. */ +#define OCFS2_META_LOCK_RECOVERY (0x01) +/* Instruct the dlm not to queue ourselves on the other node. */ +#define OCFS2_META_LOCK_NOQUEUE (0x02) +/* don't block waiting for the downconvert thread, instead return -EAGAIN */ +#define OCFS2_LOCK_NONBLOCK (0x04) + +/* Locking subclasses of inode cluster lock */ +enum { + OI_LS_NORMAL = 0, + OI_LS_PARENT, + OI_LS_RENAME1, + OI_LS_RENAME2, + OI_LS_REFLINK_TARGET, +}; + +int ocfs2_dlm_init(struct ocfs2_super *osb); +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + unsigned int generation, + struct inode *inode); +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode); +struct ocfs2_file_private; +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp); +struct ocfs2_mem_dqinfo; +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info); +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation); +void ocfs2_lock_res_free(struct ocfs2_lock_res *res); +int ocfs2_create_new_inode_locks(struct inode *inode); +int ocfs2_drop_inode_locks(struct inode *inode); +int ocfs2_rw_lock(struct inode *inode, int write); +void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_open_lock(struct inode *inode); +int ocfs2_try_open_lock(struct inode *inode, int write); +void ocfs2_open_unlock(struct inode *inode); +int ocfs2_inode_lock_atime(struct inode *inode, + struct vfsmount *vfsmnt, + int *level); +int ocfs2_inode_lock_full_nested(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + int arg_flags, + int subclass); +int ocfs2_inode_lock_with_page(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct page *page); +/* Variants without special locking class or flags */ +#define ocfs2_inode_lock_full(i, r, e, f)\ + ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) +#define ocfs2_inode_lock_nested(i, b, e, s)\ + ocfs2_inode_lock_full_nested(i, b, e, 0, s) +/* 99% of the time we don't want to supply any additional flags -- + * those are for very specific cases only. 
*/ +#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +void ocfs2_inode_unlock(struct inode *inode, + int ex); +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex); +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex); +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno); +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno); + +int ocfs2_rename_lock(struct ocfs2_super *osb); +void ocfs2_rename_unlock(struct ocfs2_super *osb); +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +int ocfs2_dentry_lock(struct dentry *dentry, int ex); +void ocfs2_dentry_unlock(struct dentry *dentry, int ex); +int ocfs2_file_lock(struct file *file, int ex, int trylock); +void ocfs2_file_unlock(struct file *file); +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); +struct ocfs2_refcount_tree; +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); + + +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +/* for the downconvert thread */ +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); + +/* To set the locking protocol on module initialization */ +void ocfs2_set_locking_protocol(void); +#endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c new file mode 100644 index 00000000..745db425 --- /dev/null +++ b/fs/ocfs2/export.c @@ -0,0 +1,276 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.c + * + * Functions to facilitate NFS exporting + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "dlmglue.h" +#include "dcache.h" +#include "export.h" +#include "inode.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "ocfs2_trace.h" + +struct ocfs2_inode_handle +{ + u64 ih_blkno; + u32 ih_generation; +}; + +static struct dentry *ocfs2_get_dentry(struct super_block *sb, + struct ocfs2_inode_handle *handle) +{ + struct inode *inode; + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 blkno = handle->ih_blkno; + int status, set; + struct dentry *result; + + trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno); + + if (blkno == 0) { + result = ERR_PTR(-ESTALE); + goto bail; + } + + inode = ocfs2_ilookup(sb, blkno); + /* + * If the inode exists in memory, we only need to check it's + * generation number + */ + if (inode) + goto check_gen; + + /* + * This will synchronize us against ocfs2_delete_inode() on + * all nodes + */ + status = ocfs2_nfs_sync_lock(osb, 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + goto check_err; + } + + status = ocfs2_test_inode_bit(osb, blkno, &set); + trace_ocfs2_get_dentry_test_bit(status, set); + if (status < 0) { + if (status == -EINVAL) { + /* + * The blkno NFS gave us doesn't even show up + * as an inode, we return -ESTALE to be + * nice + */ + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + goto unlock_nfs_sync; + } + + /* If the inode allocator bit is clear, this inode must be stale */ + if (!set) { + status = -ESTALE; + goto unlock_nfs_sync; + } + + inode = ocfs2_iget(osb, blkno, 0, 0); + +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(osb, 1); + +check_err: + if (status < 0) { + if (status == -ESTALE) { + trace_ocfs2_get_dentry_stale((unsigned long long)blkno, + handle->ih_generation); + } + result = ERR_PTR(status); + goto bail; + } + + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + result = (void *)inode; + goto bail; + } + +check_gen: + if (handle->ih_generation != inode->i_generation) { + iput(inode); + trace_ocfs2_get_dentry_generation((unsigned long long)blkno, + handle->ih_generation, + inode->i_generation); + result = ERR_PTR(-ESTALE); + goto bail; + } + + result = d_obtain_alias(inode); + if (IS_ERR(result)) + mlog_errno(PTR_ERR(result)); + +bail: + trace_ocfs2_get_dentry_end(result); + return result; +} + +static struct dentry *ocfs2_get_parent(struct dentry *child) +{ + int status; + u64 blkno; + struct dentry *parent; + struct inode *dir = child->d_inode; + + trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno); + + status = ocfs2_inode_lock(dir, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + parent = ERR_PTR(status); + goto bail; + } + + status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); + if (status < 0) { + parent = ERR_PTR(-ENOENT); + goto bail_unlock; + } + + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); + +bail_unlock: + ocfs2_inode_unlock(dir, 0); + +bail: + trace_ocfs2_get_parent_end(parent); + + return parent; +} + +static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type = 1; + u64 blkno; + u32 generation; + __le32 *fh = (__force __le32 *) fh_in; + + trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len, + dentry->d_name.name, + fh, len, connectable); + + if (connectable && 
(len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; + type = 255; + goto bail; + } + + blkno = OCFS2_I(inode)->ip_blkno; + generation = inode->i_generation; + + trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation); + + len = 3; + fh[0] = cpu_to_le32((u32)(blkno >> 32)); + fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[2] = cpu_to_le32(generation); + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + blkno = OCFS2_I(parent)->ip_blkno; + generation = parent->i_generation; + + fh[3] = cpu_to_le32((u32)(blkno >> 32)); + fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[5] = cpu_to_le32(generation); + + spin_unlock(&dentry->d_lock); + + len = 6; + type = 2; + + trace_ocfs2_encode_fh_parent((unsigned long long)blkno, + generation); + } + + *max_len = len; + +bail: + trace_ocfs2_encode_fh_type(type); + return type; +} + +static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct ocfs2_inode_handle handle; + + if (fh_len < 3 || fh_type > 2) + return NULL; + + handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); + handle.ih_generation = le32_to_cpu(fid->raw[2]); + return ocfs2_get_dentry(sb, &handle); +} + +static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct ocfs2_inode_handle parent; + + if (fh_type != 2 || fh_len < 6) + return NULL; + + parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); + parent.ih_generation = le32_to_cpu(fid->raw[5]); + return ocfs2_get_dentry(sb, &parent); +} + +const struct export_operations ocfs2_export_ops = { + .encode_fh = ocfs2_encode_fh, + .fh_to_dentry = ocfs2_fh_to_dentry, + .fh_to_parent = ocfs2_fh_to_parent, + .get_parent = ocfs2_get_parent, +}; diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h new file mode 100644 index 00000000..41a73867 --- /dev/null +++ b/fs/ocfs2/export.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.h + * + * Function prototypes + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_EXPORT_H +#define OCFS2_EXPORT_H + +#include + +extern const struct export_operations ocfs2_export_ops; + +#endif /* OCFS2_EXPORT_H */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 00000000..23457b49 --- /dev/null +++ b/fs/ocfs2/extent_map.c @@ -0,0 +1,902 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.c + * + * Block/Cluster mapping functions + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "super.h" +#include "symlink.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +/* + * The extent caching implementation is intentionally trivial. + * + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. + */ + +void ocfs2_extent_map_init(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); +} + +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) +{ + unsigned int range; + struct ocfs2_extent_map_item *emi; + + *ret_emi = NULL; + + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; + + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); + + *ret_emi = emi; + break; + } + } +} + +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; + } + + spin_unlock(&oi->ip_lock); + + if (emi == NULL) + return -ENOENT; + + return 0; +} + +/* + * Forget about all clusters equal to or greater than cpos. + */ +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) +{ + struct ocfs2_extent_map_item *emi, *n; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. 
*/ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; + } + + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; + } + } + spin_unlock(&oi->ip_lock); + + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { + list_del(&emi->ei_list); + kfree(emi); + } +} + +/* + * Is any part of emi2 contained within emi1 + */ +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) +{ + unsigned int range1, range2; + + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; + + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; + + return 0; +} + +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} + +/* + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. + */ +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) +{ + /* + * Handle contiguousness + */ + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; + } + + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; + } + + /* No merge was possible. */ + return 0; +} + +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. + */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } + } + + /* + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. 
+ */ + + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); + + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; + + goto search; + } + + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } + + spin_unlock(&oi->ip_lock); + +out: + if (new_emi) + kfree(new_emi); +} + +static int ocfs2_last_eb_is_empty(struct inode *inode, + struct ocfs2_dinode *di) +{ + int ret, next_free; + u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + + next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) + ret = 1; + +out: + brelse(eb_bh); + return ret; +} + +/* + * Return the 1st index within el which contains an extent start + * larger than v_cluster. + */ +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) +{ + int i; + struct ocfs2_extent_rec *rec; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; + } + + return i; +} + +/* + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. + * + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. + * + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. + */ +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) +{ + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; + + i = ocfs2_search_for_hole_index(el, v_cluster); + + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Check the next leaf for any extents. + */ + + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; + + ret = ocfs2_read_extent_block(ci, + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; + el = &next_eb->h_list; + i = ocfs2_search_for_hole_index(el, v_cluster); + } + +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { + /* + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. 
+ */ + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; + } + + ret = 0; +out: + brelse(next_eb_bh); + return ret; +} + +static int ocfs2_get_clusters_nocache(struct inode *inode, + struct buffer_head *di_bh, + u32 v_cluster, unsigned int *hole_len, + struct ocfs2_extent_rec *ret_rec, + unsigned int *is_last) +{ + int i, ret, tree_height, len; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *uninitialized_var(eb); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct buffer_head *eb_bh = NULL; + + memset(ret_rec, 0, sizeof(*ret_rec)); + if (is_last) + *is_last = 0; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + tree_height = le16_to_cpu(el->l_tree_depth); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + /* + * Holes can be larger than the maximum size of an + * extent, so we return their lengths in a separate + * field. + */ + if (hole_len) { + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, + v_cluster, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + + *hole_len = len; + } + goto out_hole; + } + + rec = &el->l_recs[i]; + + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *ret_rec = *rec; + + /* + * Checking for last extent is potentially expensive - we + * might have to look at the next leaf over to see if it's + * empty. + * + * The first two checks are to see whether the caller even + * cares for this information, and if the extent is at least + * the last in it's list. 
+ * + * If those hold true, then the extent is last if any of the + * additional conditions hold true: + * - Extent list is in-inode + * - Extent list is right-most + * - Extent list is 2nd to rightmost, with empty right-most + */ + if (is_last) { + if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { + if (tree_height == 0) + *is_last = 1; + else if (eb->h_blkno == di->i_last_eb_blk) + *is_last = 1; + else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { + ret = ocfs2_last_eb_is_empty(inode, di); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (ret == 1) + *is_last = 1; + } + } + } + +out_hole: + ret = 0; +out: + brelse(eb_bh); + return ret; +} + +static void ocfs2_relative_extent_offsets(struct super_block *sb, + u32 v_cluster, + struct ocfs2_extent_rec *rec, + u32 *p_cluster, u32 *num_clusters) + +{ + u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); + + *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + + if (num_clusters) + *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; +} + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } else { + rec = &el->l_recs[i]; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + coff = v_cluster - le32_to_cpu(rec->e_cpos); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) +{ + int ret; + unsigned int uninitialized_var(hole_len), flags = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, + &rec, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. 
+ */ + *p_cluster = 0; + if (num_clusters) { + *num_clusters = hole_len; + } + } else { + ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, + p_cluster, num_clusters); + flags = rec.e_flags; + + ocfs2_extent_map_insert_rec(inode, &rec); + } + + if (extent_flags) + *extent_flags = flags; + +out: + brelse(di_bh); + return ret; +} + +/* + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. + */ +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) +{ + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); + + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } + + *p_blkno = boff; + + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); + } + +out: + return ret; +} + +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ +static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, + struct fiemap_extent_info *fieinfo, + u64 map_start) +{ + int ret; + unsigned int id_count; + struct ocfs2_dinode *di; + u64 phys; + u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + di = (struct ocfs2_dinode *)di_bh->b_data; + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); + + if (map_start < id_count) { + phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); + + ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, + flags); + if (ret < 0) + return ret; + } + + return 0; +} + +#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len) +{ + int ret, is_last; + u32 mapping_end, cpos; + unsigned int hole_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u64 len_bytes, phys_bytes, virt_bytes; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + if (ret) + return ret; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * Handle inline-data and fast symlink separately. 
+ */ + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { + ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); + goto out_unlock; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + mapping_end -= cpos; + is_last = 0; + while (cpos < mapping_end && !is_last) { + u32 fe_flags; + + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + if (rec.e_blkno == 0ULL) { + cpos += hole_size; + continue; + } + + fe_flags = 0; + if (rec.e_flags & OCFS2_EXT_UNWRITTEN) + fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; + if (is_last) + fe_flags |= FIEMAP_EXTENT_LAST; + len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; + phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; + virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; + + ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, + len_bytes, fe_flags); + if (ret) + break; + + cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters); + } + + if (ret > 0) + ret = 0; + +out_unlock: + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + + return ret; +} + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int rc = 0; + u64 p_block, p_count; + int i, count, done = 0; + + trace_ocfs2_read_virt_blocks( + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); + + if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!(flags & OCFS2_BH_READAHEAD)); + goto out; + } + + while (done < nr) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); + rc = ocfs2_extent_map_get_blocks(inode, v_block + done, + &p_block, &p_count, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (rc) { + mlog_errno(rc); + break; + } + + if (!p_block) { + rc = -EIO; + mlog(ML_ERROR, + "Inode #%llu contains a hole at offset %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)(v_block + done) << + inode->i_sb->s_blocksize_bits); + break; + } + + count = nr - done; + if (p_count < count) + count = p_count; + + /* + * If the caller passed us bhs, they should have come + * from a previous readahead call to this function. Thus, + * they should have the right b_blocknr. + */ + for (i = 0; i < count; i++) { + if (!bhs[done + i]) + continue; + BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); + } + + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, + bhs + done, flags, validate); + if (rc) { + mlog_errno(rc); + break; + } + done += count; + } + +out: + return rc; +} + + diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 00000000..e79d41c2 --- /dev/null +++ b/fs/ocfs2/extent_map.h @@ -0,0 +1,90 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.h + * + * In-memory file extent mappings for OCFS2. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _EXTENT_MAP_H +#define _EXTENT_MAP_H + +struct ocfs2_extent_map_item { + unsigned int ei_cpos; + unsigned int ei_phys; + unsigned int ei_clusters; + unsigned int ei_flags; + + struct list_head ei_list; +}; + +#define OCFS2_MAX_EXTENT_MAP_ITEMS 3 +struct ocfs2_extent_map { + unsigned int em_num_items; + struct list_head em_list; +}; + +void ocfs2_extent_map_init(struct inode *inode); +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec); + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, unsigned int *extent_flags); +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags); + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len); + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags); + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters); +static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); + +bail: + return status; +} + + +#endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 00000000..b1e35a39 --- /dev/null +++ b/fs/ocfs2/file.c @@ -0,0 +1,2688 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c + * + * File open, close, extend, truncate + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "sysfile.h" +#include "inode.h" +#include "ioctl.h" +#include "journal.h" +#include "locks.h" +#include "mmap.h" +#include "suballoc.h" +#include "super.h" +#include "xattr.h" +#include "acl.h" +#include "quota.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_init_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp; + + fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->fp_file = file; + mutex_init(&fp->fp_mutex); + ocfs2_file_lock_res_init(&fp->fp_flock, fp); + file->private_data = fp; + + return 0; +} + +static void ocfs2_free_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (fp) { + ocfs2_simple_drop_lockres(osb, &fp->fp_flock); + ocfs2_lock_res_free(&fp->fp_flock); + kfree(fp); + file->private_data = NULL; + } +} + +static int ocfs2_file_open(struct inode *inode, struct file *file) +{ + int status; + int mode = file->f_flags; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + trace_ocfs2_file_open(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, mode); + + if (file->f_mode & FMODE_WRITE) + dquot_initialize(inode); + + spin_lock(&oi->ip_lock); + + /* Check that the inode hasn't been wiped from disk by another + * node. If it hasn't then we're safe as long as we hold the + * spin lock until our increment of open count. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&oi->ip_lock); + + status = -ENOENT; + goto leave; + } + + if (mode & O_DIRECT) + oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; + + oi->ip_open_count++; + spin_unlock(&oi->ip_lock); + + status = ocfs2_init_file_private(inode, file); + if (status) { + /* + * We want to set open count back if we're failing the + * open. 
+ */ + spin_lock(&oi->ip_lock); + oi->ip_open_count--; + spin_unlock(&oi->ip_lock); + } + +leave: + return status; +} + +static int ocfs2_file_release(struct inode *inode, struct file *file) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + spin_lock(&oi->ip_lock); + if (!--oi->ip_open_count) + oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + + trace_ocfs2_file_release(inode, file, file->f_path.dentry, + oi->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + oi->ip_open_count); + spin_unlock(&oi->ip_lock); + + ocfs2_free_file_private(inode, file); + + return 0; +} + +static int ocfs2_dir_open(struct inode *inode, struct file *file) +{ + return ocfs2_init_file_private(inode, file); +} + +static int ocfs2_dir_release(struct inode *inode, struct file *file) +{ + ocfs2_free_file_private(inode, file); + return 0; +} + +static int ocfs2_sync_file(struct file *file, int datasync) +{ + int err = 0; + journal_t *journal; + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_sync_file(inode, file, file->f_path.dentry, + OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned long long)datasync); + + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + goto bail; + } + + journal = osb->journal->j_journal; + err = jbd2_journal_force_commit(journal); + +bail: + if (err) + mlog_errno(err); + + return (err < 0) ? -EIO : 0; +} + +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt) +{ + struct timespec now; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return 0; + + if ((inode->i_flags & S_NOATIME) || + ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + /* + * We can be called with no vfsmnt structure - NFSD will + * sometimes do this. + * + * Note that our action here is different than touch_atime() - + * if we can't tell whether this is a noatime mount, then we + * don't know whether to trust the value of s_atime_quantum. + */ + if (vfsmnt == NULL) + return 0; + + if ((vfsmnt->mnt_flags & MNT_NOATIME) || + ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + if (vfsmnt->mnt_flags & MNT_RELATIME) { + if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + return 1; + + return 0; + } + + now = CURRENT_TIME; + if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + return 0; + else + return 1; +} + +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Don't use ocfs2_mark_inode_dirty() here as we don't always + * have i_mutex to guard against concurrent changes to other + * inode fields. 
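+ * Only the atime fields are updated here, in both the in-core inode
+ * and the on-disk dinode, under the journal handle started above.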
+ */ + inode->i_atime = CURRENT_TIME; + di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_journal_dirty(handle, bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_set_inode_size(handle, inode, di_bh, + new_i_size); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_cow_file_pos(struct inode *inode, + struct buffer_head *fe_bh, + u64 offset) +{ + int status; + u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + /* + * If the new offset is aligned to the range of the cluster, there is + * no space for ocfs2_zero_range_for_truncate to fill, so no need to + * CoW either. + */ + if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) + return 0; + + status = ocfs2_get_clusters(inode, cpos, &phys, + &num_clusters, &ext_flags); + if (status) { + mlog_errno(status); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); + +out: + return status; +} + +static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + handle_t *handle; + struct ocfs2_dinode *di; + u64 cluster_bytes; + + /* + * We need to CoW the cluster contains the offset if it is reflinked + * since we will call ocfs2_zero_range_for_truncate later which will + * write "0" from offset to the end of the cluster. + */ + status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); + if (status) { + mlog_errno(status); + return status; + } + + /* TODO: This needs to actually orphan the inode in this + * transaction. */ + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + /* + * Do this before setting i_size. 
+ */ + cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, + cluster_bytes); + if (status) { + mlog_errno(status); + goto out_commit; + } + + i_size_write(inode, new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di = (struct ocfs2_dinode *) fe_bh->b_data; + di->i_size = cpu_to_le64(new_i_size); + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, fe_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return status; +} + +static int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ + fe = (struct ocfs2_dinode *) di_bh->b_data; + + trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), + "Inode %llu, inode i_size = %lld != di " + "i_size = %llu, i_flags = 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + i_size_read(inode), + (unsigned long long)le64_to_cpu(fe->i_size), + le32_to_cpu(fe->i_flags)); + + if (new_i_size > le64_to_cpu(fe->i_size)) { + trace_ocfs2_truncate_file_error( + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + /* lets handle the simple truncate cases before doing any more + * cluster locking. */ + if (new_i_size == le64_to_cpu(fe->i_size)) + goto bail; + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_resv_discard(&osb->osb_la_resmap, + &OCFS2_I(inode)->ip_la_data_resv); + + /* + * The inode lock forced other nodes to sync and drop their + * pages, which (correctly) happens even if we have a truncate + * without allocation change - ocfs2 cluster sizes can be much + * greater than page size, so we have to truncate them + * anyway. + */ + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + status = ocfs2_truncate_inline(inode, di_bh, new_i_size, + i_size_read(inode), 1); + if (status) + mlog_errno(status); + + goto bail_unlock_sem; + } + + /* alright, we're going to need to do a full blown alloc size + * change. Orphan the inode so that recovery can complete the + * truncate if necessary. This does the task of marking + * i_size. */ + status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_sem; + } + + status = ocfs2_commit_truncate(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_sem; + } + + /* TODO: orphan dir cleanup here. */ +bail_unlock_sem: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + +bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); + + return status; +} + +/* + * extend file allocation only here. + * we'll update all the disk stuff, and oip->alloc_size + * + * expect stuff to be locked, a transaction started and enough data / + * metadata reservations in the contexts. 
+ * + * Will return -EAGAIN, and a reason if a restart is needed. + * If passed in, *reason will always be set, even in error. + */ +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int ret; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); + ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); + + return ret; +} + +static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + int status = 0; + int restart_func = 0; + int credits; + u32 prev_clusters; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe = NULL; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + int did_quota = 0; + + /* + * This function only exists for file systems which don't + * support holes. + */ + BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + fe = (struct ocfs2_dinode *) bh->b_data; + +restart_all: + BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + trace_ocfs2_extend_allocation( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)i_size_read(inode), + le32_to_cpu(fe->i_clusters), clusters_to_add, + why, restart_func); + + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) + goto leave; + did_quota = 1; + + /* reserve a write to the file entry early on - that we if we + * run out of credits in the allocation path, we can still + * update i_size. 
*/ + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = OCFS2_I(inode)->ip_clusters; + + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + ocfs2_journal_dirty(handle, bh); + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + restart_func = 1; + status = 0; + } else { + BUG_ON(why != RESTART_TRANS); + + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + &fe->id2.i_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + + trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_clusters), + (unsigned long long)le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, + (unsigned long long)i_size_read(inode)); + +leave: + if (status < 0 && did_quota) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + brelse(bh); + bh = NULL; + + return status; +} + +/* + * While a write will already be ordering the data, a truncate will not. + * Thus, we need to explicitly order the zeroed pages. + */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + +/* Some parts of this taken from generic_cont_expand, which turned out + * to be too fragile to do exactly what we need without us having to + * worry about recursive locking in ->write_begin() and ->write_end(). 
*/ +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; + handle_t *handle = NULL; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; + + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; + + trace_ocfs2_write_zero_page( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)abs_from, + (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to force + * __block_write_begin and block_commit_write to zero the + * whole block. + */ + ret = __block_write_begin(page, block_start + 1, 0, + ocfs2_get_block); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + if (!handle) { + handle = ocfs2_zero_start_ordered_transaction(inode); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + break; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } + + if (handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. 
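+ *
+ * A typical caller loops the way ocfs2_zero_extend() below does: get the
+ * next range, zero it with ocfs2_zero_extend_range(), then call again with
+ * the returned range_end as the new zero_start, until range_end comes back
+ * as 0 or the end of the region to zero has been reached.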
+ */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) +{ + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + zero_clusters, UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. + */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + trace_ocfs2_zero_extend_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; + + /* + * Very large extends have the potential to lock up + * the cpu for extended periods of time. 
+ */ + cond_resched(); + } + + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + + return ret; +} + +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) +{ + int ret; + u32 clusters_to_add; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); + if (clusters_to_add < oi->ip_clusters) + clusters_to_add = 0; + else + clusters_to_add -= oi->ip_clusters; + + if (clusters_to_add) { + ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Call this even if we don't add any clusters to the tree. We + * still need to zero the area between the old i_size and the + * new i_size. + */ + ret = ocfs2_zero_extend(inode, di_bh, zero_to); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_extend_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + BUG_ON(!di_bh); + + /* setattr sometimes calls us like this. */ + if (new_i_size == 0) + goto out; + + if (i_size_read(inode) == new_i_size) + goto out; + BUG_ON(new_i_size < i_size_read(inode)); + + /* + * The alloc sem blocks people in read/write from reading our + * allocation until we're done changing it. We depend on + * i_mutex to block other extend/truncate calls while we're + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. + */ + down_write(&oi->ip_alloc_sem); + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * We can optimize small extends by keeping the inodes + * inline data. 
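+ * If the new size still fits in the dinode, no allocation is needed
+ * and we only update i_size below.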
+ */ + if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { + up_write(&oi->ip_alloc_sem); + goto out_update_size; + } + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + up_write(&oi->ip_alloc_sem); + mlog_errno(ret); + goto out; + } + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); + + up_write(&oi->ip_alloc_sem); + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out_update_size: + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + int status = 0, size_change; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *bh = NULL; + handle_t *handle = NULL; + struct dquot *transfer_to[MAXQUOTAS] = { }; + int qtype; + + trace_ocfs2_setattr(inode, dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + dentry->d_name.len, dentry->d_name.name, + attr->ia_valid, attr->ia_mode, + attr->ia_uid, attr->ia_gid); + + /* ensuring we don't even attempt to truncate a symlink */ + if (S_ISLNK(inode->i_mode)) + attr->ia_valid &= ~ATTR_SIZE; + +#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ + | ATTR_GID | ATTR_UID | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) + return 0; + + status = inode_change_ok(inode, attr); + if (status) + return status; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; + if (size_change) { + status = ocfs2_rw_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail_unlock_rw; + } + + if (size_change && attr->ia_size != i_size_read(inode)) { + status = inode_newsize_ok(inode, attr->ia_size); + if (status) + goto bail_unlock; + + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + } else + status = ocfs2_extend_file(inode, bh, attr->ia_size); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + status = -ENOSPC; + goto bail_unlock; + } + } + + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + /* + * Gather pointers to quota structures so that allocation / + * freeing of quota structures happens here and not inside + * dquot_transfer() where we have problems with lock ordering + */ + if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, + USRQUOTA); + if (!transfer_to[USRQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, + GRPQUOTA); + if (!transfer_to[GRPQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + + 2 * ocfs2_quota_trans_credits(sb)); + 
if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = __dquot_transfer(inode, transfer_to); + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + } + + /* + * This will intentionally not wind up calling truncate_setsize(), + * since all the work for a size change has been done above. + * Otherwise, we could get into problems with truncate as + * ip_alloc_sem is used there to protect against i_size + * changes. + * + * XXX: this means the conditional below can probably be removed. + */ + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + status = vmtruncate(inode, attr->ia_size); + if (status) { + mlog_errno(status); + goto bail_commit; + } + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode, 1); +bail_unlock_rw: + if (size_change) + ocfs2_rw_unlock(inode, 1); +bail: + brelse(bh); + + /* Release quota pointers in case we acquired them */ + for (qtype = 0; qtype < MAXQUOTAS; qtype++) + dqput(transfer_to[qtype]); + + if (!status && attr->ia_valid & ATTR_MODE) { + status = ocfs2_acl_chmod(inode); + if (status < 0) + mlog_errno(status); + } + + return status; +} + +int ocfs2_getattr(struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_inode->i_sb; + struct ocfs2_super *osb = sb->s_fs_info; + int err; + + err = ocfs2_inode_revalidate(dentry); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + generic_fillattr(inode, stat); + + /* We set the blksize from the cluster size for performance */ + stat->blksize = osb->s_clustersize; + +bail: + return err; +} + +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) +{ + int ret; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + ret = generic_permission(inode, mask, flags, ocfs2_check_acl); + + ocfs2_inode_unlock(inode, 0); +out: + return ret; +} + +static int __ocfs2_write_remove_suid(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + + trace_ocfs2_write_remove_suid( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + inode->i_mode); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_trans; + } + + inode->i_mode &= ~S_ISUID; + if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) + inode->i_mode &= ~S_ISGID; + + di = (struct ocfs2_dinode *) bh->b_data; + di->i_mode = cpu_to_le16(inode->i_mode); + + ocfs2_journal_dirty(handle, bh); + +out_trans: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). 
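+ *
+ * Returns 1 if a hole or unwritten extent is found in the range, 0 if
+ * the whole range is allocated and written, and a negative error code
+ * on failure.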
+ */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = ocfs2_read_inode_block(inode, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_write_remove_suid(inode, bh); +out: + brelse(bh); + return ret; +} + +/* + * Allocate enough extents to cover the region starting at byte offset + * start for len bytes. Existing extents are skipped, any extents + * added are marked as "unwritten". + */ +static int ocfs2_allocate_unwritten_extents(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u32 cpos, phys_cpos, clusters, alloc_size; + u64 end = start + len; + struct buffer_head *di_bh = NULL; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Nothing to do if the requested reservation range + * fits within the inode. + */ + if (ocfs2_size_fits_inline_data(di_bh, end)) + goto out; + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We consider both start and len to be inclusive. + */ + cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); + clusters -= cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hole or existing extent len can be arbitrary, so + * cap it to our own allocation request. + */ + if (alloc_size > clusters) + alloc_size = clusters; + + if (phys_cpos) { + /* + * We already have an allocation at this + * region so we can safely skip it. + */ + goto next; + } + + ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +next: + cpos += alloc_size; + clusters -= alloc_size; + } + + ret = 0; +out: + + brelse(di_bh); + return ret; +} + +/* + * Truncate a byte range, avoiding pages within partial clusters. This + * preserves those pages for the zeroing code to write to. 
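+ *
+ * For example, with a 64KB cluster size, removing bytes 4096-131071
+ * only drops the pages in [65536, 131072); the pages over the partial
+ * first cluster are kept for the zeroing code in
+ * ocfs2_zero_partial_clusters() to write to.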
+ */ +static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, + u64 byte_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t start, end; + struct address_space *mapping = inode->i_mapping; + + start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); + end = byte_start + byte_len; + end = end & ~(osb->s_clustersize - 1); + + if (start < end) { + unmap_mapping_range(mapping, start, end - start, 0); + truncate_inode_pages_range(mapping, start, end - 1); + } +} + +static int ocfs2_zero_partial_clusters(struct inode *inode, + u64 start, u64 len) +{ + int ret = 0; + u64 tmpend, end = start + len; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int csize = osb->s_clustersize; + handle_t *handle; + + /* + * The "start" and "end" values are NOT necessarily part of + * the range whose allocation is being deleted. Rather, this + * is what the user passed in with the request. We must zero + * partial clusters here. There's no need to worry about + * physical allocation - the zeroing code knows to skip holes. + */ + trace_ocfs2_zero_partial_clusters( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)start, (unsigned long long)end); + + /* + * If both edges are on a cluster boundary then there's no + * zeroing required as the region is part of the allocation to + * be truncated. + */ + if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + /* + * We want to get the byte offset of the end of the 1st cluster. + */ + tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; + + trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, + (unsigned long long)tmpend); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); + if (ret) + mlog_errno(ret); + + if (tmpend < end) { + /* + * This may make start and end equal, but the zeroing + * code will skip any work in that case so there's no + * need to catch it up here. + */ + start = end & ~(osb->s_clustersize - 1); + + trace_ocfs2_zero_partial_clusters_range2( + (unsigned long long)start, (unsigned long long)end); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); + if (ret) + mlog_errno(ret); + } + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) +{ + int i; + struct ocfs2_extent_rec *rec = NULL; + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) < pos) + break; + } + + return i; +} + +/* + * Helper to calculate the punching pos and length in one run, we handle the + * following three cases in order: + * + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (hole-punching completed) +*/ +static void ocfs2_calc_trunc_pos(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec, + u32 trunc_start, u32 *trunc_cpos, + u32 *trunc_len, u32 *trunc_end, + u64 *blkno, int *done) +{ + int ret = 0; + u32 coff, range; + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (le32_to_cpu(rec->e_cpos) >= trunc_start) { + /* + * remove an entire extent record. + */ + *trunc_cpos = le32_to_cpu(rec->e_cpos); + /* + * Skip holes if any. 
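+ * (If the record ends below the current trunc_end, the clusters
+ * between its end and trunc_end are a hole, so trunc_end is clamped
+ * down to the record's end.)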
+ */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno); + *trunc_end = le32_to_cpu(rec->e_cpos); + } else if (range > trunc_start) { + /* + * remove a partial extent record, which means we're + * removing the last extent record. + */ + *trunc_cpos = trunc_start; + /* + * skip hole if any. + */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - trunc_start; + coff = trunc_start - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + *trunc_end = trunc_start; + } else { + /* + * It may have two following possibilities: + * + * - last record has been removed + * - trunc_start was within a hole + * + * both two cases mean the completion of hole punching. + */ + ret = 1; + } + + *done = ret; +} + +static int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) +{ + int ret = 0, flags = 0, done = 0, i; + u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; + u32 cluster_in_el; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + trace_ocfs2_remove_inode_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)byte_start, + (unsigned long long)byte_len); + + if (byte_len == 0) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, + byte_start + byte_len, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + /* + * There's no need to get fancy with the page cache + * truncate of an inline-data inode. We're talking + * about less than a page here, which will be cached + * in the dinode buffer anyway. + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + goto out; + } + + /* + * For reflinks, we may need to CoW 2 clusters which might be + * partially zero'd later, if hole's start and end offset were + * within one cluster(means is not exactly aligned to clustersize). + */ + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); + trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; + cluster_in_el = trunc_end; + + ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (trunc_end > trunc_start) { + + ret = ocfs2_find_path(INODE_CACHE(inode), path, + cluster_in_el); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + i = ocfs2_find_rec(el, trunc_end); + /* + * Need to go to previous extent block. 
+ */ + if (i < 0) { + if (path->p_tree_depth == 0) + break; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + path, + &cluster_in_el); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We've reached the leftmost extent block, + * it's safe to leave. + */ + if (cluster_in_el == 0) + break; + + /* + * The 'pos' searched for previous extent block is + * always one cluster less than actual trunc_end. + */ + trunc_end = cluster_in_el + 1; + + ocfs2_reinit_path(path, 1); + + continue; + + } else + rec = &el->l_recs[i]; + + ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, + &trunc_len, &trunc_end, &blkno, &done); + if (done) + break; + + flags = rec->e_flags; + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, + &dealloc, refcount_loc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + cluster_in_el = trunc_end; + + ocfs2_reinit_path(path, 1); + } + + ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * Parts of this function taken from xfs_change_file_space() + */ +static int __ocfs2_change_file_space(struct file *file, struct inode *inode, + loff_t f_pos, unsigned int cmd, + struct ocfs2_space_resv *sr, + int change_size) +{ + int ret; + s64 llen; + loff_t size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + handle_t *handle; + unsigned long long max_off = inode->i_sb->s_maxbytes; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes on other nodes + */ + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_rw_unlock; + } + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + ret = -EPERM; + goto out_inode_unlock; + } + + switch (sr->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + sr->l_start += f_pos; + break; + case 2: /*SEEK_END*/ + sr->l_start += i_size_read(inode); + break; + default: + ret = -EINVAL; + goto out_inode_unlock; + } + sr->l_whence = 0; + + llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; + + if (sr->l_start < 0 + || sr->l_start > max_off + || (sr->l_start + llen) < 0 + || (sr->l_start + llen) > max_off) { + ret = -EINVAL; + goto out_inode_unlock; + } + size = sr->l_start + sr->l_len; + + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (sr->l_len <= 0) { + ret = -EINVAL; + goto out_inode_unlock; + } + } + + if (file && should_remove_suid(file->f_path.dentry)) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + switch (cmd) { + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + /* + * This takes unsigned offsets, but the signed ones we + * pass have been checked against overflow above. 
+ */ + ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, + sr->l_len); + break; + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, + sr->l_len); + break; + default: + ret = -EINVAL; + } + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + + /* + * We update c/mtime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_inode_unlock; + } + + if (change_size && i_size_read(inode) < size) + i_size_write(inode, size); + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && + !ocfs2_writes_unwritten_extents(osb)) + return -ENOTTY; + else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && + !ocfs2_sparse_alloc(osb)) + return -ENOTTY; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); +} + +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_space_resv sr; + int change_size = 1; + int cmd = OCFS2_IOC_RESVSP64; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (!ocfs2_writes_unwritten_extents(osb)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_KEEP_SIZE) + change_size = 0; + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = OCFS2_IOC_UNRESVSP64; + + sr.l_whence = 0; + sr.l_start = (s64)offset; + sr.l_len = (s64)len; + + return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, + change_size); +} + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, + loff_t pos, size_t count, + int *meta_level) +{ + int ret; + struct buffer_head *di_bh = NULL; + u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 clusters = + 
ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + *meta_level = 1; + + ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); + if (ret) + mlog_errno(ret); +out: + brelse(di_bh); + return ret; +} + +static int ocfs2_prepare_inode_for_write(struct file *file, + loff_t *ppos, + size_t count, + int appending, + int *direct_io, + int *has_refcount) +{ + int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + loff_t saved_pos = 0, end; + + /* + * We start with a read level meta lock and only jump to an ex + * if we need to make modifications here. + */ + for(;;) { + ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (ret < 0) { + meta_level = -1; + mlog_errno(ret); + goto out; + } + + /* Clear suid / sgid if necessary. We do this here + * instead of later in the write path because + * remove_suid() calls ->setattr without any hint that + * we may have already done our cluster locking. Since + * ocfs2_setattr() *must* take cluster locks to + * proceeed, this will lead us to recursively lock the + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write. */ + if (should_remove_suid(dentry)) { + if (meta_level == 0) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + ret = ocfs2_write_remove_suid(inode); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + } + + /* work on a copy of ppos until we're sure that we won't have + * to recalculate it due to relocking. */ + if (appending) + saved_pos = i_size_read(inode); + else + saved_pos = *ppos; + + end = saved_pos + count; + + ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + if (ret == 1) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = -1; + + ret = ocfs2_prepare_inode_for_refcount(inode, + file, + saved_pos, + count, + &meta_level); + if (has_refcount) + *has_refcount = 1; + if (direct_io) + *direct_io = 0; + } + + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) + break; + + /* + * There's no sane way to do direct writes to an inode + * with inline data. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + *direct_io = 0; + break; + } + + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode)) { + *direct_io = 0; + break; + } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. 
+ */ + ret = ocfs2_check_range_for_holes(inode, saved_pos, count); + if (ret == 1) { + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); + break; + } + + if (appending) + *ppos = saved_pos; + +out_unlock: + trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, + saved_pos, appending, count, + direct_io, has_refcount); + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); + +out: + return ret; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + int ret, direct_io, appending, rw_level, have_alloc_sem = 0; + int can_do_direct, has_refcount = 0; + ssize_t written = 0; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + loff_t old_size, *ppos = &iocb->ki_pos; + u32 old_clusters; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); + + trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned int)nr_segs); + + if (iocb->ki_left == 0) + return 0; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + appending = file->f_flags & O_APPEND ? 1 : 0; + direct_io = file->f_flags & O_DIRECT ? 1 : 0; + + mutex_lock(&inode->i_mutex); + + ocfs2_iocb_clear_sem_locked(iocb); + +relock: + /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + if (direct_io) { + down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_sem_locked(iocb); + } + + /* + * Concurrent O_DIRECT writes are allowed with + * mount_option "coherency=buffered". + */ + rw_level = (!direct_io || full_coherency); + + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + /* + * O_DIRECT writes with "coherency=full" need to take EX cluster + * inode_lock to guarantee coherency. + */ + if (direct_io && full_coherency) { + /* + * We need to take and drop the inode lock to force + * other nodes to drop their caches. Buffered I/O + * already does this in write_begin(). + */ + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + ocfs2_inode_unlock(inode, 1); + } + + can_do_direct = direct_io; + ret = ocfs2_prepare_inode_for_write(file, ppos, + iocb->ki_left, appending, + &can_do_direct, &has_refcount); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* + * We can't complete the direct I/O as requested, fall back to + * buffered I/O. + */ + if (direct_io && !can_do_direct) { + ocfs2_rw_unlock(inode, rw_level); + up_read(&inode->i_alloc_sem); + + have_alloc_sem = 0; + rw_level = -1; + + direct_io = 0; + goto relock; + } + + /* + * To later detect whether a journal commit for sync writes is + * necessary, we sample i_size, and cluster count here. 
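+ * If either changed by the time the write finishes (or the write hit a
+ * refcounted extent), the sync path below forces a journal commit so
+ * the updated metadata makes it to disk.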
+ */ + old_size = i_size_read(inode); + old_clusters = OCFS2_I(inode)->ip_clusters; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + + ret = generic_segment_checks(iov, &nr_segs, &ocount, + VERIFY_READ); + if (ret) + goto out_dio; + + count = ocount; + ret = generic_write_checks(file, ppos, &count, + S_ISBLK(inode->i_mode)); + if (ret) + goto out_dio; + + if (direct_io) { + written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, + ppos, count, ocount); + if (written < 0) { + ret = written; + goto out_dio; + } + } else { + current->backing_dev_info = file->f_mapping->backing_dev_info; + written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, + ppos, count, 0); + current->backing_dev_info = NULL; + } + +out_dio: + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + + if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || + ((file->f_flags & O_DIRECT) && !direct_io)) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && ((old_size != i_size_read(inode)) || + (old_clusters != OCFS2_I(inode)->ip_clusters) || + has_refcount)) { + ret = jbd2_journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); + } + + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. (it's the clustered equivalent of + * i_alloc_sem; protects truncate from racing with pending ios). + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. 
+ */ + if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + have_alloc_sem = 0; + } + +out: + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + +out_sems: + if (have_alloc_sem) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + + mutex_unlock(&inode->i_mutex); + + if (written) + ret = written; + return ret; +} + +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) +{ + int ret; + + ret = ocfs2_prepare_inode_for_write(out, &sd->pos, + sd->total_len, 0, NULL, NULL); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return splice_from_pipe_feed(pipe, sd, pipe_to_file); +} + +static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + int ret; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + + + trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + out->f_path.dentry->d_name.len, + out->f_path.dentry->d_name.name, len); + + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + int err; + + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; + + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } + + return ret; +} + +static ssize_t ocfs2_file_splice_read(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + int ret = 0, lock_level = 0; + struct inode *inode = in->f_path.dentry->d_inode; + + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, len); + + /* + * See the comment in ocfs2_file_aio_read() + */ + ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = generic_file_splice_read(in, ppos, pipe, len, flags); + +bail: + return ret; +} + +static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + + trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name, nr_segs); + + + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto bail; + } + + ocfs2_iocb_clear_sem_locked(iocb); + + /* + * buffered reads protect themselves in ->readpage(). 
O_DIRECT reads + * need locks to protect pending reads from racing with truncate. + */ + if (filp->f_flags & O_DIRECT) { + down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; + ocfs2_iocb_set_sem_locked(iocb); + + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + rw_level = 0; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + } + + /* + * We're fine letting folks race truncates and extending + * writes with read across the cluster, just like they can + * locally. Hence no rw_lock during read. + * + * Take and drop the meta data lock to update inode fields + * like i_size. This allows the checks down below + * generic_file_aio_read() a chance of actually working. + */ + ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); + trace_generic_file_aio_read_ret(ret); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* see ocfs2_file_aio_write */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +bail: + if (have_alloc_sem) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + + return ret; +} + +const struct inode_operations ocfs2_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; + +const struct inode_operations ocfs2_special_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, +}; + +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ +const struct file_operations ocfs2_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, +}; + +const struct file_operations ocfs2_dops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. 
+ */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, +}; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h new file mode 100644 index 00000000..f5afbbef --- /dev/null +++ b/fs/ocfs2/file.h @@ -0,0 +1,76 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_FILE_H +#define OCFS2_FILE_H + +extern const struct file_operations ocfs2_fops; +extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; +extern const struct inode_operations ocfs2_file_iops; +extern const struct inode_operations ocfs2_special_file_iops; +struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; + +struct ocfs2_file_private { + struct file *fp_file; + struct mutex fp_mutex; + struct ocfs2_lock_res fp_flock; +}; + +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); +int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); + +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt); +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh); + +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr); + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count); +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 00000000..d8208b20 --- /dev/null +++ b/fs/ocfs2/heartbeat.c @@ -0,0 +1,134 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.c + * + * Register ourselves with the heartbaet service, keep our node maps + * up to date, and fire off recovery when needed. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit); +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit); + +/* special case -1 for now + * TODO: should *really* make sure the calling func never passes -1!! 
*/ +static void ocfs2_node_map_init(struct ocfs2_node_map *map) +{ + map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; + memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * + sizeof(unsigned long)); +} + +void ocfs2_init_node_maps(struct ocfs2_super *osb) +{ + spin_lock_init(&osb->node_map_lock); + ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); +} + +void ocfs2_do_node_down(int node_num, void *data) +{ + struct ocfs2_super *osb = data; + + BUG_ON(osb->node_num == node_num); + + trace_ocfs2_do_node_down(node_num); + + if (!osb->cconn) { + /* + * No cluster connection means we're not even ready to + * participate yet. We check the slots after the cluster + * comes up, so we will notice the node death then. We + * can safely ignore it here. + */ + return; + } + + ocfs2_recovery_thread(osb, node_num); +} + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit) +{ + set_bit(bit, map->map); +} + +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_set_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit) +{ + clear_bit(bit, map->map); +} + +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_clear_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + int ret; + if (bit >= map->num_nodes) { + mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); + BUG(); + } + spin_lock(&osb->node_map_lock); + ret = test_bit(bit, map->map); + spin_unlock(&osb->node_map_lock); + return ret; +} + diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 00000000..74b9c5dd --- /dev/null +++ b/fs/ocfs2/heartbeat.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_HEARTBEAT_H +#define OCFS2_HEARTBEAT_H + +void ocfs2_init_node_maps(struct ocfs2_super *osb); + +void ocfs2_do_node_down(int node_num, void *data); + +/* node map functions - used to keep track of mounted and in-recovery + * nodes. 
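The node-map helpers in heartbeat.c above boil down to a fixed-size bitmap guarded by a spinlock, with bit -1 quietly ignored (per the TODO). A self-contained user-space sketch of the same structure, using a pthread mutex and invented limits rather than the kernel primitives:

        #include <limits.h>
        #include <pthread.h>
        #include <string.h>

        #define MAX_NODES     256
        #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

        struct node_map {
                pthread_mutex_t lock;
                int num_nodes;
                unsigned long map[(MAX_NODES + BITS_PER_LONG - 1) / BITS_PER_LONG];
        };

        static void node_map_init(struct node_map *m)
        {
                pthread_mutex_init(&m->lock, NULL);
                m->num_nodes = MAX_NODES;
                memset(m->map, 0, sizeof(m->map));
        }

        /* A real version would also check bit < m->num_nodes, as the kernel
         * code does with BUG_ON(). */
        static void node_map_set_bit(struct node_map *m, int bit)
        {
                if (bit == -1)          /* tolerated for now, as the TODO notes */
                        return;
                pthread_mutex_lock(&m->lock);
                m->map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
                pthread_mutex_unlock(&m->lock);
        }

        static void node_map_clear_bit(struct node_map *m, int bit)
        {
                if (bit == -1)
                        return;
                pthread_mutex_lock(&m->lock);
                m->map[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
                pthread_mutex_unlock(&m->lock);
        }

        static int node_map_test_bit(struct node_map *m, int bit)
        {
                int ret;

                pthread_mutex_lock(&m->lock);
                ret = !!(m->map[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)));
                pthread_mutex_unlock(&m->lock);
                return ret;
        }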
*/ +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); + +#endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 00000000..b4c8bb6b --- /dev/null +++ b/fs/ocfs2/inode.c @@ -0,0 +1,1447 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c + * + * vfs' aops, fops, dops and iops + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +struct ocfs2_find_inode_args +{ + u64 fi_blkno; + unsigned long fi_ino; + unsigned int fi_flags; + unsigned int fi_sysfile_type; +}; + +static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args); +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); +static int ocfs2_find_actor(struct inode *inode, void *opaque); +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh); + +void ocfs2_set_inode_flags(struct inode *inode) +{ + unsigned int flags = OCFS2_I(inode)->ip_attr; + + inode->i_flags &= ~(S_IMMUTABLE | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & OCFS2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + + if (flags & OCFS2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & OCFS2_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & OCFS2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & OCFS2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ +void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) +{ + unsigned int flags = oi->vfs_inode.i_flags; + + oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| + OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); + if (flags & S_SYNC) + oi->ip_attr |= OCFS2_SYNC_FL; + if (flags & S_APPEND) + oi->ip_attr |= OCFS2_APPEND_FL; + if (flags & S_IMMUTABLE) + oi->ip_attr |= OCFS2_IMMUTABLE_FL; + if (flags & S_NOATIME) + oi->ip_attr |= OCFS2_NOATIME_FL; + if (flags & S_DIRSYNC) + 
oi->ip_attr |= OCFS2_DIRSYNC_FL; +} + +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +{ + struct ocfs2_find_inode_args args; + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = 0; + + return ilookup5(sb, blkno, ocfs2_find_actor, &args); +} +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, + int sysfile_type) +{ + struct inode *inode = NULL; + struct super_block *sb = osb->sb; + struct ocfs2_find_inode_args args; + + trace_ocfs2_iget_begin((unsigned long long)blkno, flags, + sysfile_type); + + /* Ok. By now we've either got the offsets passed to us by the + * caller, or we just pulled them off the bh. Lets do some + * sanity checks to make sure they're OK. */ + if (blkno == 0) { + inode = ERR_PTR(-EINVAL); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + + args.fi_blkno = blkno; + args.fi_flags = flags; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = sysfile_type; + + inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, + ocfs2_init_locked_inode, &args); + /* inode was *not* in the inode cache. 2.6.x requires + * us to do our own read_inode call and unlock it + * afterwards. */ + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + trace_ocfs2_iget5_locked(inode->i_state); + if (inode->i_state & I_NEW) { + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + goto bail; + } + +bail: + if (!IS_ERR(inode)) { + trace_ocfs2_iget_end(inode, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return inode; +} + + +/* + * here's how inodes get read from disk: + * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR + * found? : return the in-memory inode + * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE + */ + +static int ocfs2_find_actor(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + args = opaque; + + mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + + trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); + + if (oi->ip_blkno != args->fi_blkno) + goto bail; + + ret = 1; +bail: + return ret; +} + +/* + * initialize the new inode, but don't do anything that would cause + * us to sleep. 
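ocfs2_iget() above hashes inodes by an ino derived from the block number and relies on ocfs2_find_actor() to compare the full 64-bit blkno. A tiny sketch of why both steps exist — assuming, purely for illustration, that the hash key may be narrower than the block number (the actual ino_from_blkno() mapping lives outside this hunk):

        #include <stdint.h>

        struct cached_inode {
                uint64_t      blkno;     /* authoritative identity */
                unsigned long hash_key;  /* what the inode hash is keyed on */
        };

        /* The key is derived from the block number and may lose bits on a
         * 32-bit host, so it is only good enough to find candidates quickly. */
        static unsigned long key_from_blkno(uint64_t blkno)
        {
                return (unsigned long)blkno;
        }

        /* The comparator therefore re-checks the full block number, which is
         * exactly the job ocfs2_find_actor() does above. */
        static int is_the_inode_we_want(const struct cached_inode *c, uint64_t blkno)
        {
                return c->blkno == blkno;
        }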
+ * return 0 on success, 1 on failure + */ +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = opaque; + static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, + ocfs2_file_ip_alloc_sem_key; + + inode->i_ino = args->fi_ino; + OCFS2_I(inode)->ip_blkno = args->fi_blkno; + if (args->fi_sysfile_type != 0) + lockdep_set_class(&inode->i_mutex, + &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) + lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, + &ocfs2_quota_ip_alloc_sem_key); + else + lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, + &ocfs2_file_ip_alloc_sem_key); + + return 0; +} + +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) +{ + struct super_block *sb; + struct ocfs2_super *osb; + int use_plocks = 1; + + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + + /* + * These have all been checked by ocfs2_read_inode_block() or set + * by ocfs2_mknod_locked(), so a failure is a code bug. + */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode + cannot create a superblock + inode today. change if + that is needed. */ + BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); + BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); + + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); + + inode->i_version = 1; + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + + /* Fast symlinks will have i_size but no allocated clusters. 
*/ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_mapping->a_ops = &ocfs2_aops; + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) + mlog(ML_ERROR, + "ip_blkno %llu != i_blkno %llu!\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_blkno)); + + inode->i_nlink = ocfs2_read_links_count(fe); + + trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_flags)); + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + inode->i_flags |= S_NOQUOTA; + } + + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { + inode->i_flags |= S_NOQUOTA; + } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { + /* we can't actually hit this as read_inode can't + * handle superblocks today ;-) */ + BUG(); + } + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; + inode->i_op = &ocfs2_file_iops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFDIR: + inode->i_op = &ocfs2_dir_iops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; + i_size_write(inode, le64_to_cpu(fe->i_size)); + OCFS2_I(inode)->ip_dir_lock_gen = 1; + break; + case S_IFLNK: + if (ocfs2_inode_is_fast_symlink(inode)) + inode->i_op = &ocfs2_fast_symlink_inode_operations; + else + inode->i_op = &ocfs2_symlink_inode_operations; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + default: + inode->i_op = &ocfs2_special_file_iops; + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + break; + } + + if (create_ino) { + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + /* + * If we ever want to create system files from kernel, + * the generation argument to + * ocfs2_inode_lock_res_init() will have to change. 
+ */ + BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, + OCFS2_LOCK_TYPE_META, 0, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, 0, inode); + } + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, + OCFS2_LOCK_TYPE_RW, inode->i_generation, + inode); + + ocfs2_set_inode_flags(inode); + + OCFS2_I(inode)->ip_last_used_slot = 0; + OCFS2_I(inode)->ip_last_used_group = 0; + + if (S_ISDIR(inode->i_mode)) + ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, + OCFS2_RESV_FLAG_DIR); +} + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args) +{ + struct super_block *sb; + struct ocfs2_super *osb; + struct ocfs2_dinode *fe; + struct buffer_head *bh = NULL; + int status, can_lock; + u32 generation = 0; + + status = -EINVAL; + if (inode == NULL || inode->i_sb == NULL) { + mlog(ML_ERROR, "bad inode\n"); + return status; + } + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if (!args) { + mlog(ML_ERROR, "bad inode args\n"); + make_bad_inode(inode); + return status; + } + + /* + * To improve performance of cold-cache inode stats, we take + * the cluster lock here if possible. + * + * Generally, OCFS2 never trusts the contents of an inode + * unless it's holding a cluster lock, so taking it here isn't + * a correctness issue as much as it is a performance + * improvement. + * + * There are three times when taking the lock is not a good idea: + * + * 1) During startup, before we have initialized the DLM. + * + * 2) If we are reading certain system files which never get + * cluster locks (local alloc, truncate log). + * + * 3) If the process doing the iget() is responsible for + * orphan dir recovery. We're holding the orphan dir lock and + * can get into a deadlock with another process on another + * node in ->delete_inode(). + * + * #1 and #2 can be simply solved by never taking the lock + * here for system files (which are the only type we read + * during mount). It's a heavier approach, but our main + * concern is user-accessible files anyway. + * + * #3 works itself out because we'll eventually take the + * cluster lock before trusting anything anyway. + */ + can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) + && !ocfs2_mount_local(osb); + + trace_ocfs2_read_locked_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); + + /* + * To maintain backwards compatibility with older versions of + * ocfs2-tools, we still store the generation value for system + * files. The only ones that actually matter to userspace are + * the journals, but it's easier and inexpensive to just flag + * all system files similarly. 
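The three exceptions listed in the cold-cache locking comment a little further up (system files read during mount, orphan-dir recovery, local mounts) collapse into the single can_lock test. As a small sketch, with the flag values mirroring the OCFS2_FI_FLAG_* definitions in inode.h:

        #include <stdbool.h>

        #define FI_FLAG_SYSFILE          0x1
        #define FI_FLAG_ORPHAN_RECOVERY  0x2

        /* Cluster-lock the inode while reading it only when none of the three
         * exceptions apply. */
        static bool can_cluster_lock_for_read(unsigned int fi_flags, bool local_mount)
        {
                return !(fi_flags & FI_FLAG_SYSFILE) &&
                       !(fi_flags & FI_FLAG_ORPHAN_RECOVERY) &&
                       !local_mount;
        }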
+ */ + if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + generation = osb->fs_generation; + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, + OCFS2_LOCK_TYPE_META, + generation, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, + 0, inode); + + if (can_lock) { + status = ocfs2_open_lock(inode); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } + status = ocfs2_inode_lock(inode, NULL, 0); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } + } + + if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { + status = ocfs2_try_open_lock(inode, 0); + if (status) { + make_bad_inode(inode); + return status; + } + } + + if (can_lock) { + status = ocfs2_read_inode_block_full(inode, &bh, + OCFS2_BH_IGNORE_CACHE); + } else { + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) + status = ocfs2_validate_inode_block(osb->sb, bh); + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EINVAL; + fe = (struct ocfs2_dinode *) bh->b_data; + + /* + * This is a code bug. Right now the caller needs to + * understand whether it is asking for a system file inode or + * not so the proper lock names can be built. + */ + mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != + !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), + "Inode %llu: system file state is ambigous\n", + (unsigned long long)args->fi_blkno); + + if (S_ISCHR(le16_to_cpu(fe->i_mode)) || + S_ISBLK(le16_to_cpu(fe->i_mode))) + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + + ocfs2_populate_inode(inode, fe, 0); + + BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + + status = 0; + +bail: + if (can_lock) + ocfs2_inode_unlock(inode, 0); + + if (status < 0) + make_bad_inode(inode); + + if (args && bh) + brelse(bh); + + return status; +} + +void ocfs2_sync_blockdev(struct super_block *sb) +{ + sync_blockdev(sb->s_bdev); +} + +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh) +{ + int status = 0; + struct ocfs2_dinode *fe; + handle_t *handle = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* + * This check will also skip truncate of inodes with inline + * data and fast symlinks. 
+ */ + if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + + i_size_write(inode, 0); + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_commit_trans(osb, handle); + handle = NULL; + + status = ocfs2_commit_truncate(osb, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + } + +out: + if (handle) + ocfs2_commit_trans(osb, handle); + return status; +} + +static int ocfs2_remove_inode(struct inode *inode, + struct buffer_head *di_bh, + struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh) +{ + int status; + struct inode *inode_alloc_inode = NULL; + struct buffer_head *inode_alloc_bh = NULL; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + le16_to_cpu(di->i_suballoc_slot)); + if (!inode_alloc_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + + ocfs2_quota_trans_credits(inode->i_sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + } + + /* set the inodes dtime */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + ocfs2_journal_dirty(handle, di_bh); + + ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); + dquot_free_inode(inode); + + status = ocfs2_free_dinode(handle, inode_alloc_inode, + inode_alloc_bh, di); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode_alloc_inode, 1); + mutex_unlock(&inode_alloc_inode->i_mutex); + brelse(inode_alloc_bh); +bail: + iput(inode_alloc_inode); + + return status; +} + +/* + * Serialize with orphan dir recovery. If the process doing + * recovery on this orphan dir does an iget() with the dir + * i_mutex held, we'll deadlock here. Instead we detect this + * and exit early - recovery will wipe this inode for us. + */ +static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + + spin_lock(&osb->osb_lock); + if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { + ret = -EDEADLK; + goto out; + } + /* This signals to the orphan recovery process that it should + * wait for us to handle the wipe. 
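The counter bumped just below, together with ocfs2_signal_wipe_completion(), forms a small handshake: a wiper registers itself per slot unless recovery already owns the slot, and recovery waits for the count to drain. A user-space sketch of that pattern — names are invented, and a condition variable stands in for the osb_wipe_event wait queue:

        #include <pthread.h>
        #include <stdbool.h>

        struct orphan_slot {
                pthread_mutex_t lock;
                pthread_cond_t  drained;
                bool            recovering;       /* recovery has claimed this slot */
                int             wipes_in_flight;  /* role of osb_orphan_wipes[slot] */
        };

        /* A wiper backs off if recovery already owns the slot; otherwise it
         * makes itself visible so recovery will wait for it. */
        static int begin_wipe(struct orphan_slot *s)
        {
                int ret = 0;

                pthread_mutex_lock(&s->lock);
                if (s->recovering)
                        ret = -1;               /* let recovery wipe the inode instead */
                else
                        s->wipes_in_flight++;
                pthread_mutex_unlock(&s->lock);
                return ret;
        }

        static void end_wipe(struct orphan_slot *s)
        {
                pthread_mutex_lock(&s->lock);
                if (--s->wipes_in_flight == 0)
                        pthread_cond_broadcast(&s->drained);
                pthread_mutex_unlock(&s->lock);
        }

        /* Recovery claims the slot, then waits until every in-flight wipe is done. */
        static void recovery_wait_for_wipes(struct orphan_slot *s)
        {
                pthread_mutex_lock(&s->lock);
                s->recovering = true;
                while (s->wipes_in_flight > 0)
                        pthread_cond_wait(&s->drained, &s->lock);
                pthread_mutex_unlock(&s->lock);
        }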
*/ + osb->osb_orphan_wipes[slot]++; +out: + spin_unlock(&osb->osb_lock); + trace_ocfs2_check_orphan_recovery_state(slot, ret); + return ret; +} + +static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + osb->osb_orphan_wipes[slot]--; + spin_unlock(&osb->osb_lock); + + wake_up(&osb->osb_wipe_event); +} + +static int ocfs2_wipe_inode(struct inode *inode, + struct buffer_head *di_bh) +{ + int status, orphaned_slot = -1; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); + + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + } + + /* we do this while holding the orphan dir lock because we + * don't want recovery being run from another node to try an + * inode delete underneath us -- this will result in two nodes + * truncating the same file! */ + status = ocfs2_truncate_for_delete(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + /* Remove any dir index tree */ + if (S_ISDIR(inode->i_mode)) { + status = ocfs2_dx_dir_truncate(inode, di_bh); + if (status) { + mlog_errno(status); + goto bail_unlock_dir; + } + } + + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_refcount_tree(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, + orphan_dir_bh); + if (status < 0) + mlog_errno(status); + +bail_unlock_dir: + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) + return status; + + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); +bail: + iput(orphan_dir_inode); + ocfs2_signal_wipe_completion(osb, orphaned_slot); + + return status; +} + +/* There is a series of simple checks that should be done before a + * trylock is even considered. Encapsulate those in this function. */ +static int ocfs2_inode_is_valid_to_delete(struct inode *inode) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, + (unsigned long long)oi->ip_blkno, + oi->ip_flags); + + /* We shouldn't be getting here for the root directory + * inode.. */ + if (inode == osb->root_inode) { + mlog(ML_ERROR, "Skipping delete of root inode.\n"); + goto bail; + } + + /* If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!], so unforuntately we just + * have to skip deleting this guy. 
That's OK though because + * the node who's doing the actual deleting should handle it + * anyway. */ + if (current == osb->dc_task) + goto bail; + + spin_lock(&oi->ip_lock); + /* OCFS2 *never* deletes system files. This should technically + * never get here as system file inodes should always have a + * positive link count. */ + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + mlog(ML_ERROR, "Skipping delete of system file %llu\n", + (unsigned long long)oi->ip_blkno); + goto bail_unlock; + } + + /* If we have allowd wipe of this inode for another node, it + * will be marked here so we can safely skip it. Recovery will + * cleanup any inodes we might inadvertently skip here. */ + if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) + goto bail_unlock; + + ret = 1; +bail_unlock: + spin_unlock(&oi->ip_lock); +bail: + return ret; +} + +/* Query the cluster to determine whether we should wipe an inode from + * disk or not. + * + * Requires the inode to have the cluster lock. */ +static int ocfs2_query_inode_wipe(struct inode *inode, + struct buffer_head *di_bh, + int *wipe) +{ + int status = 0, reason = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di; + + *wipe = 0; + + trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, + inode->i_nlink); + + /* While we were waiting for the cluster lock in + * ocfs2_delete_inode, another node might have asked to delete + * the inode. Recheck our flags to catch this. */ + if (!ocfs2_inode_is_valid_to_delete(inode)) { + reason = 1; + goto bail; + } + + /* Now that we have an up to date inode, we can double check + * the link count. */ + if (inode->i_nlink) + goto bail; + + /* Do some basic inode verification... */ + di = (struct ocfs2_dinode *) di_bh->b_data; + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && + !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + /* + * Inodes in the orphan dir must have ORPHANED_FL. The only + * inodes that come back out of the orphan dir are reflink + * targets. A reflink target may be moved out of the orphan + * dir between the time we scan the directory and the time we + * process it. This would lead to HAS_REFCOUNT_FL being set but + * ORPHANED_FL not. + */ + if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { + reason = 2; + goto bail; + } + + /* for lack of a better error? */ + status = -EEXIST; + mlog(ML_ERROR, + "Inode %llu (on-disk %llu) not orphaned! " + "Disk flags 0x%x, inode flags 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(di->i_blkno), + le32_to_cpu(di->i_flags), oi->ip_flags); + goto bail; + } + + /* has someone already deleted us?! baaad... */ + if (di->i_dtime) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* + * This is how ocfs2 determines whether an inode is still live + * within the cluster. Every node takes a shared read lock on + * the inode open lock in ocfs2_read_locked_inode(). When we + * get to ->delete_inode(), each node tries to convert it's + * lock to an exclusive. Trylocks are serialized by the inode + * meta data lock. If the upconvert succeeds, we know the inode + * is no longer live and can be deleted. + * + * Though we call this with the meta data lock held, the + * trylock keeps us from ABBA deadlock. 
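The comment above describes the cluster-wide liveness check: every node with the inode in memory holds the open lock shared, and a node that wants to wipe it tries to take the lock exclusively; failure means someone still has it open. A loose user-space analogue with a plain rwlock, offered only as an illustration (the real code converts its own existing cluster lock rather than taking a fresh one):

        #include <pthread.h>

        /* Returns 1 (keeping the lock held) when no other holder exists, so
         * the caller may wipe the inode; returns 0 when some node still has it
         * open. Unlike the cluster lock, a thread that still holds the shared
         * lock itself would simply see "busy" here. */
        static int inode_dead_cluster_wide(pthread_rwlock_t *open_lock)
        {
                if (pthread_rwlock_trywrlock(open_lock) == 0)
                        return 1;
                return 0;
        }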
+ */ + status = ocfs2_try_open_lock(inode, 1); + if (status == -EAGAIN) { + status = 0; + reason = 3; + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *wipe = 1; + trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); + +bail: + trace_ocfs2_query_inode_wipe_end(status, reason); + return status; +} + +/* Support function for ocfs2_delete_inode. Will help us keep the + * inode data in a consistent state for clear_inode. Always truncates + * pages, optionally sync's them first. */ +static void ocfs2_cleanup_delete_inode(struct inode *inode, + int sync_data) +{ + trace_ocfs2_cleanup_delete_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); + if (sync_data) + write_inode_now(inode, 1); + truncate_inode_pages(&inode->i_data, 0); +} + +static void ocfs2_delete_inode(struct inode *inode) +{ + int wipe, status; + sigset_t oldset; + struct buffer_head *di_bh = NULL; + + trace_ocfs2_delete_inode(inode->i_ino, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + is_bad_inode(inode)); + + /* When we fail in read_inode() we mark inode as bad. The second test + * catches the case when inode allocation fails before allocating + * a block for inode. */ + if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) + goto bail; + + dquot_initialize(inode); + + if (!ocfs2_inode_is_valid_to_delete(inode)) { + /* It's probably not necessary to truncate_inode_pages + * here but we do it for safety anyway (it will most + * likely be a no-op anyway) */ + ocfs2_cleanup_delete_inode(inode, 0); + goto bail; + } + + /* We want to block signals in delete_inode as the lock and + * messaging paths may return us -ERESTARTSYS. Which would + * cause us to exit early, resulting in inodes being orphaned + * forever. */ + ocfs2_block_signals(&oldset); + + /* + * Synchronize us against ocfs2_get_dentry. We take this in + * shared mode so that all nodes can still concurrently + * process deletes. + */ + status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } + /* Lock down the inode. This gives us an up to date view of + * it's metadata (for verification), and allows us to + * serialize delete_inode on multiple nodes. + * + * Even though we might be doing a truncate, we don't take the + * allocation lock here as it won't be needed - nobody will + * have the file open. + */ + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_nfs_sync; + } + + /* Query the cluster. This will be the final decision made + * before we go ahead and wipe the inode. */ + status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); + if (!wipe || status < 0) { + /* Error and remote inode busy both mean we won't be + * removing the inode, so they take almost the same + * path. */ + if (status < 0) + mlog_errno(status); + + /* Someone in the cluster has disallowed a wipe of + * this inode, or it was never completely + * orphaned. Write out the pages and exit now. */ + ocfs2_cleanup_delete_inode(inode, 1); + goto bail_unlock_inode; + } + + ocfs2_cleanup_delete_inode(inode, 0); + + status = ocfs2_wipe_inode(inode, di_bh); + if (status < 0) { + if (status != -EDEADLK) + mlog_errno(status); + goto bail_unlock_inode; + } + + /* + * Mark the inode as successfully deleted. 
+ * + * This is important for ocfs2_clear_inode() as it will check + * this flag and skip any checkpointing work + * + * ocfs2_stuff_meta_lvb() also uses this flag to invalidate + * the LVB for other nodes. + */ + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail_unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); + +bail_unblock: + ocfs2_unblock_signals(&oldset); +bail: + return; +} + +static void ocfs2_clear_inode(struct inode *inode) +{ + int status; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + end_writeback(inode); + trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink); + + mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + "Inode=%lu\n", inode->i_ino); + + dquot_drop(inode); + + /* To preven remote deletes we hold open lock before, now it + * is time to unlock PR and EX open locks. */ + ocfs2_open_unlock(inode); + + /* Do these before all the other work so that we don't bounce + * the downconvert thread while waiting to destroy the locks. */ + ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + + ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + &oi->ip_la_data_resv); + ocfs2_resv_init_once(&oi->ip_la_data_resv); + + /* We very well may get a clear_inode before all an inodes + * metadata has hit disk. Of course, we can't drop any cluster + * locks until the journal has finished with it. The only + * exception here are successfully wiped inodes - their + * metadata can now be considered to be part of the system + * inodes from which it came. */ + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + ocfs2_checkpoint_inode(inode); + + mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), + "Clear inode of %llu, inode has io markers\n", + (unsigned long long)oi->ip_blkno); + + ocfs2_extent_map_trunc(inode, 0); + + status = ocfs2_drop_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + ocfs2_lock_res_free(&oi->ip_rw_lockres); + ocfs2_lock_res_free(&oi->ip_inode_lockres); + ocfs2_lock_res_free(&oi->ip_open_lockres); + + ocfs2_metadata_cache_exit(INODE_CACHE(inode)); + + mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, + "Clear inode of %llu, inode has %u cache items\n", + (unsigned long long)oi->ip_blkno, + INODE_CACHE(inode)->ci_num_cached); + + mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), + "Clear inode of %llu, inode has a bad flag\n", + (unsigned long long)oi->ip_blkno); + + mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), + "Clear inode of %llu, inode is locked\n", + (unsigned long long)oi->ip_blkno); + + mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), + "Clear inode of %llu, io_mutex is locked\n", + (unsigned long long)oi->ip_blkno); + mutex_unlock(&oi->ip_io_mutex); + + /* + * down_trylock() returns 0, down_write_trylock() returns 1 + * kernel 1, world 0 + */ + mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), + "Clear inode of %llu, alloc_sem is locked\n", + (unsigned long long)oi->ip_blkno); + up_write(&oi->ip_alloc_sem); + + mlog_bug_on_msg(oi->ip_open_count, + "Clear inode of %llu has open count %d\n", + (unsigned long long)oi->ip_blkno, oi->ip_open_count); + + /* Clear all other flags. */ + oi->ip_flags = 0; + oi->ip_dir_start_lookup = 0; + oi->ip_blkno = 0ULL; + + /* + * ip_jinode is used to track txns against this inode. We ensure that + * the journal is flushed before journal shutdown. 
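ocfs2_clear_inode() above asserts that nobody still holds ip_io_mutex or ip_alloc_sem by try-acquiring each lock and releasing it again on success. The same trick in plain user-space form, as a sketch:

        #include <assert.h>
        #include <pthread.h>

        /* "Prove" a mutex is free: a successful trylock means nobody held it,
         * so we give it straight back; a held mutex trips the assertion. */
        static void assert_mutex_unlocked(pthread_mutex_t *m)
        {
                int rc = pthread_mutex_trylock(m);

                assert(rc == 0);        /* rc != 0 means somebody still holds it */
                if (rc == 0)
                        pthread_mutex_unlock(m);
        }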
Thus it is safe to + * have inodes get cleaned up after journal shutdown. + */ + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); +} + +void ocfs2_evict_inode(struct inode *inode) +{ + if (!inode->i_nlink || + (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { + ocfs2_delete_inode(inode); + } else { + truncate_inode_pages(&inode->i_data, 0); + } + ocfs2_clear_inode(inode); +} + +/* Called under inode_lock, with no more references on the + * struct inode, so it's safe here to check the flags field + * and to manipulate i_nlink without any other locks. */ +int ocfs2_drop_inode(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int res; + + trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink, oi->ip_flags); + + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) + res = 1; + else + res = generic_drop_inode(inode); + + return res; +} + +/* + * This is called from our getattr. + */ +int ocfs2_inode_revalidate(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int status = 0; + + trace_ocfs2_inode_revalidate(inode, + inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, + inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0); + + if (!inode) { + status = -ENOENT; + goto bail; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Let ocfs2_inode_lock do the work of updating our struct + * inode for us. */ + status = ocfs2_inode_lock(inode, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + ocfs2_inode_unlock(inode, 0); +bail: + return status; +} + +/* + * Updates a disk inode from a + * struct inode. + * Only takes ip_lock. + */ +int ocfs2_mark_inode_dirty(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + int status; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + + trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + ocfs2_get_inode_flags(OCFS2_I(inode)); + fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); + fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + fe->i_size = cpu_to_le64(i_size_read(inode)); + ocfs2_set_links_count(fe, inode->i_nlink); + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + ocfs2_journal_dirty(handle, bh); +leave: + return status; +} + +/* + * + * Updates a struct inode from a disk inode. + * does no i/o, only takes ip_lock. 
+ */ +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe) +{ + spin_lock(&OCFS2_I(inode)->ip_lock); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); + ocfs2_set_inode_flags(inode); + i_size_write(inode, le64_to_cpu(fe->i_size)); + inode->i_nlink = ocfs2_read_links_count(fe); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_mode = le16_to_cpu(fe->i_mode); + if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + /* + * Errors after here are fatal. + */ + + rc = -EINVAL; + + if (!OCFS2_IS_VALID_DINODE(di)) { + ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); + goto bail; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + goto bail; + } + + rc = 0; + +bail: + return rc; +} + +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, ocfs2_validate_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. 
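ocfs2_validate_inode_block() above is a ladder of cheap sanity checks run in order, with the first failure winning: signature, self-referencing block number, the valid flag, and the filesystem generation. A stripped-down sketch over an invented on-disk layout — field names and the signature string are placeholders, and endianness conversion is assumed to have happened already:

        #include <stdint.h>
        #include <string.h>

        struct disk_inode {
                char     signature[8];     /* placeholder; the real magic differs */
                uint64_t blkno;            /* where this inode says it lives */
                uint32_t flags;
                uint32_t fs_generation;
        };

        #define DI_VALID_FL 0x1u

        static int validate_inode_block(const struct disk_inode *di,
                                        uint64_t expected_blkno,
                                        uint32_t expected_generation)
        {
                if (memcmp(di->signature, "INODE01", 8) != 0)
                        return -1;      /* not an inode block at all */
                if (di->blkno != expected_blkno)
                        return -1;      /* block claims to live somewhere else */
                if (!(di->flags & DI_VALID_FL))
                        return -1;      /* allocated but never initialised, or wiped */
                if (di->fs_generation != expected_generation)
                        return -1;      /* stale block left over from an older mkfs */
                return 0;
        }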
*/ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) +{ + return ocfs2_read_inode_block_full(inode, bh, 0); +} + + +static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->ip_blkno; +} + +static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->vfs_inode.i_sb; +} + +static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_lock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_lock(&oi->ip_io_mutex); +} + +static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_unlock(&oi->ip_io_mutex); +} + +const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { + .co_owner = ocfs2_inode_cache_owner, + .co_get_super = ocfs2_inode_cache_get_super, + .co_cache_lock = ocfs2_inode_cache_lock, + .co_cache_unlock = ocfs2_inode_cache_unlock, + .co_io_lock = ocfs2_inode_cache_io_lock, + .co_io_unlock = ocfs2_inode_cache_io_unlock, +}; + diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h new file mode 100644 index 00000000..1c508b14 --- /dev/null +++ b/fs/ocfs2/inode.h @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_INODE_H +#define OCFS2_INODE_H + +#include "extent_map.h" + +/* OCFS2 Inode Private Data */ +struct ocfs2_inode_info +{ + u64 ip_blkno; + + struct ocfs2_lock_res ip_rw_lockres; + struct ocfs2_lock_res ip_inode_lockres; + struct ocfs2_lock_res ip_open_lockres; + + /* protects allocation changes on this inode. */ + struct rw_semaphore ip_alloc_sem; + + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + + /* These fields are protected by ip_lock */ + spinlock_t ip_lock; + u32 ip_open_count; + struct list_head ip_io_markers; + u32 ip_clusters; + + u16 ip_dyn_features; + struct mutex ip_io_mutex; + u32 ip_flags; /* see below */ + u32 ip_attr; /* inode attributes */ + + /* protected by recovery_lock. 
*/ + struct inode *ip_next_orphan; + + struct ocfs2_caching_info ip_metadata_cache; + struct ocfs2_extent_map ip_extent_map; + struct inode vfs_inode; + struct jbd2_inode ip_jinode; + + u32 ip_dir_start_lookup; + + /* Only valid if the inode is the dir. */ + u32 ip_last_used_slot; + u64 ip_last_used_group; + u32 ip_dir_lock_gen; + + struct ocfs2_alloc_reservation ip_la_data_resv; +}; + +/* + * Flags for the ip_flags field + */ +/* System file inodes */ +#define OCFS2_INODE_SYSTEM_FILE 0x00000001 +#define OCFS2_INODE_JOURNAL 0x00000002 +#define OCFS2_INODE_BITMAP 0x00000004 +/* This inode has been wiped from disk */ +#define OCFS2_INODE_DELETED 0x00000008 +/* Another node is deleting, so our delete is a nop */ +#define OCFS2_INODE_SKIP_DELETE 0x00000010 +/* Has the inode been orphaned on another node? + * + * This hints to ocfs2_drop_inode that it should clear i_nlink before + * continuing. + * + * We *only* set this on unlink vote from another node. If the inode + * was locally orphaned, then we're sure of the state and don't need + * to twiddle i_nlink later - it's either zero or not depending on + * whether our unlink succeeded. Otherwise we got this from a node + * whose intention was to orphan the inode, however he may have + * crashed, failed etc, so we let ocfs2_drop_inode zero the value and + * rely on ocfs2_delete_inode to sort things out under the proper + * cluster locks. + */ +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +/* Does someone have the file open O_DIRECT */ +#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Tell the inode wipe code it's not in orphan dir */ +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 + +static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) +{ + return container_of(inode, struct ocfs2_inode_info, vfs_inode); +} + +#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) +#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) + +extern struct kmem_cache *ocfs2_inode_cache; + +extern const struct address_space_operations ocfs2_aops; +extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; + +static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) +{ + return &OCFS2_I(inode)->ip_metadata_cache; +} + +void ocfs2_evict_inode(struct inode *inode); +int ocfs2_drop_inode(struct inode *inode); + +/* Flags for ocfs2_iget() */ +#define OCFS2_FI_FLAG_SYSFILE 0x1 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, + int sysfile_type); +int ocfs2_inode_init_private(struct inode *inode); +int ocfs2_inode_revalidate(struct dentry *dentry); +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); +void ocfs2_read_inode(struct inode *inode); +void ocfs2_read_inode2(struct inode *inode, void *opaque); +ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, + size_t size, loff_t *offp); +void ocfs2_sync_blockdev(struct super_block *sb); +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe); +int ocfs2_mark_inode_dirty(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada); + +void ocfs2_set_inode_flags(struct inode *inode); +void 
ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); + +static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) +{ + int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; + + return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); +} + +/* Validate that a bh contains a valid inode */ +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +/* + * Read an inode block into *bh. If *bh is NULL, a bh will be allocated. + * This is a cached read. The inode will be validated with + * ocfs2_validate_inode_block(). + */ +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); +/* The same, but can be passed OCFS2_BH_* flags */ +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags); + +static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); +} + +#endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c new file mode 100644 index 00000000..bc91072b --- /dev/null +++ b/fs/ocfs2/ioctl.c @@ -0,0 +1,1031 @@ +/* + * linux/fs/ocfs2/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "journal.h" + +#include "ocfs2_fs.h" +#include "ioctl.h" +#include "resize.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "suballoc.h" +#include "move_extents.h" + +#include + +#define o2info_from_user(a, b) \ + copy_from_user(&(a), (b), sizeof(a)) +#define o2info_to_user(a, b) \ + copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) + +/* + * This call is void because we are already reporting an error that may + * be -EFAULT. The error will be returned from the ioctl(2) call. It's + * just a best-effort to tell userspace that this request caused the error. 
+ */ +static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, + struct ocfs2_info_request __user *req) +{ + kreq->ir_flags |= OCFS2_INFO_FL_ERROR; + (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); +} + +static inline void o2info_set_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags |= OCFS2_INFO_FL_FILLED; +} + +static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags &= ~OCFS2_INFO_FL_FILLED; +} + +static inline int o2info_coherent(struct ocfs2_info_request *req) +{ + return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); +} + +static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) +{ + int status; + + status = ocfs2_inode_lock(inode, NULL, 0); + if (status < 0) { + mlog_errno(status); + return status; + } + ocfs2_get_inode_flags(OCFS2_I(inode)); + *flags = OCFS2_I(inode)->ip_attr; + ocfs2_inode_unlock(inode, 0); + + return status; +} + +static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, + unsigned mask) +{ + struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + struct buffer_head *bh = NULL; + unsigned oldflags; + int status; + + mutex_lock(&inode->i_mutex); + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EACCES; + if (!inode_owner_or_capable(inode)) + goto bail_unlock; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~OCFS2_DIRSYNC_FL; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + oldflags = ocfs2_inode->ip_attr; + flags = flags & mask; + flags |= oldflags & ~mask; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. 
+ */ + status = -EPERM; + if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & + (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto bail_unlock; + } + + ocfs2_inode->ip_attr = flags; + ocfs2_set_inode_flags(inode); + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode, 1); +bail: + mutex_unlock(&inode->i_mutex); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_blocksize oib; + + if (o2info_from_user(oib, req)) + goto bail; + + oib.ib_blocksize = inode->i_sb->s_blocksize; + + o2info_set_request_filled(&oib.ib_req); + + if (o2info_to_user(oib, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oib.ib_req, req); + + return status; +} + +int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_clustersize oic; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oic, req)) + goto bail; + + oic.ic_clustersize = osb->s_clustersize; + + o2info_set_request_filled(&oic.ic_req); + + if (o2info_to_user(oic, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oic.ic_req, req); + + return status; +} + +int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_maxslots oim; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oim, req)) + goto bail; + + oim.im_max_slots = osb->max_slots; + + o2info_set_request_filled(&oim.im_req); + + if (o2info_to_user(oim, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oim.im_req, req); + + return status; +} + +int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_label oil; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oil, req)) + goto bail; + + memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); + + o2info_set_request_filled(&oil.il_req); + + if (o2info_to_user(oil, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oil.il_req, req); + + return status; +} + +int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_uuid oiu; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oiu, req)) + goto bail; + + memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); + + o2info_set_request_filled(&oiu.iu_req); + + if (o2info_to_user(oiu, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiu.iu_req, req); + + return status; +} + +int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_fs_features oif; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oif, req)) + goto bail; + + oif.if_compat_features = osb->s_feature_compat; + oif.if_incompat_features = osb->s_feature_incompat; + oif.if_ro_compat_features = osb->s_feature_ro_compat; + + o2info_set_request_filled(&oif.if_req); + + if (o2info_to_user(oif, req)) + goto bail; + + status = 0; +bail: + if (status) + 
o2info_set_request_error(&oif.if_req, req); + + return status; +} + +int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_journal_size oij; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oij, req)) + goto bail; + + oij.ij_journal_size = osb->journal->j_inode->i_size; + + o2info_set_request_filled(&oij.ij_req); + + if (o2info_to_user(oij, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oij.ij_req, req); + + return status; +} + +int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, u32 slot) +{ + int status = 0, unlock = 0; + + struct buffer_head *bh = NULL; + struct ocfs2_dinode *dinode_alloc = NULL; + + if (inode_alloc) + mutex_lock(&inode_alloc->i_mutex); + + if (o2info_coherent(&fi->ifi_req)) { + status = ocfs2_inode_lock(inode_alloc, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + dinode_alloc = (struct ocfs2_dinode *)bh->b_data; + + fi->ifi_stat[slot].lfi_total = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); + fi->ifi_stat[slot].lfi_free = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - + le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); + +bail: + if (unlock) + ocfs2_inode_unlock(inode_alloc, 0); + + if (inode_alloc) + mutex_unlock(&inode_alloc->i_mutex); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u32 i; + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + struct ocfs2_info_freeinode *oifi = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *inode_alloc = NULL; + + oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); + if (!oifi) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oifi, req)) + goto bail; + + oifi->ifi_slotnum = osb->max_slots; + + for (i = 0; i < oifi->ifi_slotnum; i++) { + if (o2info_coherent(&oifi->ifi_req)) { + inode_alloc = ocfs2_get_system_file_inode(osb, type, i); + if (!inode_alloc) { + mlog(ML_ERROR, "unable to get alloc inode in " + "slot %u\n", i); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); + if (status < 0) + goto bail; + + iput(inode_alloc); + inode_alloc = NULL; + } + + o2info_set_request_filled(&oifi->ifi_req); + + if (o2info_to_user(*oifi, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oifi->ifi_req, req); + + kfree(oifi); + + return status; +} + +static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, + unsigned int chunksize) +{ + int index; + + index = __ilog2_u32(chunksize); + if (index >= OCFS2_INFO_MAX_HIST) + index = OCFS2_INFO_MAX_HIST - 1; + + hist->fc_chunks[index]++; + hist->fc_clusters[index] += chunksize; +} + +static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, + unsigned int chunksize) +{ + if (chunksize > stats->ffs_max) + stats->ffs_max = chunksize; 
+ + if (chunksize < stats->ffs_min) + stats->ffs_min = chunksize; + + stats->ffs_avg += chunksize; + stats->ffs_free_chunks_real++; +} + +void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) +{ + o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); + o2ffg_update_stats(&(ffg->iff_ffs), chunksize); +} + +int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) +{ + int status = 0, used; + u64 blkno; + + struct buffer_head *bh = NULL; + struct ocfs2_group_desc *bg = NULL; + + unsigned int max_bits, num_clusters; + unsigned int offset = 0, cluster, chunk; + unsigned int chunk_free, last_chunksize = 0; + + if (!le32_to_cpu(rec->c_free)) + goto bail; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (bh) { + brelse(bh); + bh = NULL; + } + + if (o2info_coherent(&ffg->iff_req)) + status = ocfs2_read_group_descriptor(gb_inode, + gb_dinode, + blkno, &bh); + else + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + + if (status < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # " + "%llu from device.", (unsigned long long)blkno); + status = -EIO; + goto bail; + } + + bg = (struct ocfs2_group_desc *)bh->b_data; + + if (!le16_to_cpu(bg->bg_free_bits_count)) + continue; + + max_bits = le16_to_cpu(bg->bg_bits); + offset = 0; + + for (chunk = 0; chunk < chunks_in_group; chunk++) { + /* + * last chunk may be not an entire one. + */ + if ((offset + ffg->iff_chunksize) > max_bits) + num_clusters = max_bits - offset; + else + num_clusters = ffg->iff_chunksize; + + chunk_free = 0; + for (cluster = 0; cluster < num_clusters; cluster++) { + used = ocfs2_test_bit(offset, + (unsigned long *)bg->bg_bitmap); + /* + * - chunk_free counts free clusters in #N chunk. + * - last_chunksize records the size(in) clusters + * for the last real free chunk being counted. + */ + if (!used) { + last_chunksize++; + chunk_free++; + } + + if (used && last_chunksize) { + ocfs2_info_update_ffg(ffg, + last_chunksize); + last_chunksize = 0; + } + + offset++; + } + + if (chunk_free == ffg->iff_chunksize) + ffg->iff_ffs.ffs_free_chunks++; + } + + /* + * need to update the info for last free chunk. + */ + if (last_chunksize) + ocfs2_info_update_ffg(ffg, last_chunksize); + + } while (le64_to_cpu(bg->bg_next_group)); + +bail: + brelse(bh); + + return status; +} + +int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) +{ + u32 chunks_in_group; + int status = 0, unlock = 0, i; + + struct buffer_head *bh = NULL; + struct ocfs2_chain_list *cl = NULL; + struct ocfs2_chain_rec *rec = NULL; + struct ocfs2_dinode *gb_dinode = NULL; + + if (gb_inode) + mutex_lock(&gb_inode->i_mutex); + + if (o2info_coherent(&ffg->iff_req)) { + status = ocfs2_inode_lock(gb_inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + gb_dinode = (struct ocfs2_dinode *)bh->b_data; + cl = &(gb_dinode->id2.i_chain); + + /* + * Chunksize(in) clusters from userspace should be + * less than clusters in a group. 
+ */ + if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { + status = -EINVAL; + goto bail; + } + + memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); + + ffg->iff_ffs.ffs_min = ~0U; + ffg->iff_ffs.ffs_clusters = + le32_to_cpu(gb_dinode->id1.bitmap1.i_total); + ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - + le32_to_cpu(gb_dinode->id1.bitmap1.i_used); + + chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + rec = &(cl->cl_recs[i]); + status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, + gb_dinode, + rec, ffg, + chunks_in_group); + if (status) + goto bail; + } + + if (ffg->iff_ffs.ffs_free_chunks_real) + ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / + ffg->iff_ffs.ffs_free_chunks_real); +bail: + if (unlock) + ocfs2_inode_unlock(gb_inode, 0); + + if (gb_inode) + mutex_unlock(&gb_inode->i_mutex); + + if (gb_inode) + iput(gb_inode); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + + struct ocfs2_info_freefrag *oiff; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *gb_inode = NULL; + + oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); + if (!oiff) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oiff, req)) + goto bail; + /* + * chunksize from userspace should be power of 2. + */ + if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || + (!oiff->iff_chunksize)) { + status = -EINVAL; + goto bail; + } + + if (o2info_coherent(&oiff->iff_req)) { + gb_inode = ocfs2_get_system_file_inode(osb, type, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, + OCFS2_INVALID_SLOT); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); + if (status < 0) + goto bail; + + o2info_set_request_filled(&oiff->iff_req); + + if (o2info_to_user(*oiff, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiff->iff_req, req); + + kfree(oiff); + + return status; +} + +int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + o2info_clear_request_filled(&oir); + + if (o2info_to_user(oir, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oir, req); + + return status; +} + +/* + * Validate and distinguish OCFS2_IOC_INFO requests. + * + * - validate the magic number. + * - distinguish different requests. + * - validate size of different requests. 
+ */ +int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + status = -EINVAL; + if (oir.ir_magic != OCFS2_INFO_MAGIC) + goto bail; + + switch (oir.ir_code) { + case OCFS2_INFO_BLOCKSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) + status = ocfs2_info_handle_blocksize(inode, req); + break; + case OCFS2_INFO_CLUSTERSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) + status = ocfs2_info_handle_clustersize(inode, req); + break; + case OCFS2_INFO_MAXSLOTS: + if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) + status = ocfs2_info_handle_maxslots(inode, req); + break; + case OCFS2_INFO_LABEL: + if (oir.ir_size == sizeof(struct ocfs2_info_label)) + status = ocfs2_info_handle_label(inode, req); + break; + case OCFS2_INFO_UUID: + if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) + status = ocfs2_info_handle_uuid(inode, req); + break; + case OCFS2_INFO_FS_FEATURES: + if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) + status = ocfs2_info_handle_fs_features(inode, req); + break; + case OCFS2_INFO_JOURNAL_SIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) + status = ocfs2_info_handle_journal_size(inode, req); + break; + case OCFS2_INFO_FREEINODE: + if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) + status = ocfs2_info_handle_freeinode(inode, req); + break; + case OCFS2_INFO_FREEFRAG: + if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) + status = ocfs2_info_handle_freefrag(inode, req); + break; + default: + status = ocfs2_info_handle_unknown(inode, req); + break; + } + +bail: + return status; +} + +int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) +{ + int status = -EFAULT; + u64 __user *bp = NULL; + + if (compat_flag) { +#ifdef CONFIG_COMPAT + /* + * pointer bp stores the base address of a pointers array, + * which collects all addresses of separate request. + */ + bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); +#else + BUG(); +#endif + } else + bp = (u64 __user *)(unsigned long)(info->oi_requests); + + if (o2info_from_user(*req_addr, bp + idx)) + goto bail; + + status = 0; +bail: + return status; +} + +/* + * OCFS2_IOC_INFO handles an array of requests passed from userspace. + * + * ocfs2_info_handle() recevies a large info aggregation, grab and + * validate the request count from header, then break it into small + * pieces, later specific handlers can handle them one by one. + * + * Idea here is to make each separate request small enough to ensure + * a better backward&forward compatibility, since a small piece of + * request will be less likely to be broken if disk layout get changed. 
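+ *
+ * A minimal, hypothetical userspace sketch of one such request (not part
+ * of this file; it only reuses the structure, flag and ioctl names seen
+ * above, and the fd/ioctl plumbing is assumed):
+ *
+ *	struct ocfs2_info_blocksize oib = {
+ *		.ib_req = {
+ *			.ir_magic = OCFS2_INFO_MAGIC,
+ *			.ir_code  = OCFS2_INFO_BLOCKSIZE,
+ *			.ir_size  = sizeof(oib),
+ *		},
+ *	};
+ *	u64 reqs[1] = { (u64)(unsigned long)&oib };
+ *	struct ocfs2_info info = {
+ *		.oi_requests = (u64)(unsigned long)reqs,
+ *		.oi_count    = 1,
+ *	};
+ *	if (ioctl(fd, OCFS2_IOC_INFO, &info) == 0 &&
+ *	    (oib.ib_req.ir_flags & OCFS2_INFO_FL_FILLED))
+ *		printf("blocksize: %u\n", oib.ib_blocksize);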
+ */ +int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) +{ + int i, status = 0; + u64 req_addr; + struct ocfs2_info_request __user *reqp; + + if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || + (!info->oi_requests)) { + status = -EINVAL; + goto bail; + } + + for (i = 0; i < info->oi_count; i++) { + + status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); + if (status) + break; + + reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; + if (!reqp) { + status = -EINVAL; + goto bail; + } + + status = ocfs2_info_handle_request(inode, reqp); + if (status) + break; + } + +bail: + return status; +} + +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int flags; + int new_clusters; + int status; + struct ocfs2_space_resv sr; + struct ocfs2_new_group_input input; + struct reflink_arguments args; + const char *old_path, *new_path; + bool preserve; + struct ocfs2_info info; + + switch (cmd) { + case OCFS2_IOC_GETFLAGS: + status = ocfs2_get_inode_attr(inode, &flags); + if (status < 0) + return status; + + flags &= OCFS2_FL_VISIBLE; + return put_user(flags, (int __user *) arg); + case OCFS2_IOC_SETFLAGS: + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + status = ocfs2_set_inode_attr(inode, flags, + OCFS2_FL_MODIFIABLE); + mnt_drop_write(filp->f_path.mnt); + return status; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) + return -EFAULT; + + return ocfs2_change_file_space(filp, cmd, &sr); + case OCFS2_IOC_GROUP_EXTEND: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(new_clusters, (int __user *)arg)) + return -EFAULT; + + return ocfs2_group_extend(inode, new_clusters); + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (int __user *) arg, sizeof(input))) + return -EFAULT; + + return ocfs2_group_add(inode, &input); + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + old_path = (const char *)(unsigned long)args.old_path; + new_path = (const char *)(unsigned long)args.new_path; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 0); + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ocfs2_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case OCFS2_IOC_MOVE_EXT: + return ocfs2_ioctl_move_extents(filp, (void __user *)arg); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + bool preserve; + struct reflink_arguments args; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_info info; + + switch (cmd) { + case OCFS2_IOC32_GETFLAGS: + cmd = 
OCFS2_IOC_GETFLAGS; + break; + case OCFS2_IOC32_SETFLAGS: + cmd = OCFS2_IOC_SETFLAGS; + break; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + case OCFS2_IOC_GROUP_EXTEND: + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + case FITRIM: + break; + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), + compat_ptr(args.new_path), preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 1); + case OCFS2_IOC_MOVE_EXT: + break; + default: + return -ENOIOCTLCMD; + } + + return ocfs2_ioctl(file, cmd, arg); +} +#endif diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h new file mode 100644 index 00000000..0cd5323b --- /dev/null +++ b/fs/ocfs2/ioctl.h @@ -0,0 +1,16 @@ +/* + * ioctl.h + * + * Function prototypes + * + * Copyright (C) 2006 Herbert Poetzl + * + */ + +#ifndef OCFS2_IOCTL_PROTO_H +#define OCFS2_IOCTL_PROTO_H + +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); + +#endif /* OCFS2_IOCTL_PROTO_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 00000000..295d5645 --- /dev/null +++ b/fs/ocfs2/journal.c @@ -0,0 +1,2192 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.c + * + * Defines functions of journalling api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "quota.h" + +#include "buffer_head_io.h" +#include "ocfs2_trace.h" + +DEFINE_SPINLOCK(trans_inc_lock); + +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000 + +static int ocfs2_force_read_journal(struct inode *inode); +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num, int slot_num); +static int __ocfs2_recovery_thread(void *arg); +static int ocfs2_commit_cache(struct ocfs2_super *osb); +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota); +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty, int replayed); +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num); +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot); +static int ocfs2_commit_thread(void *arg); +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec); + +static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 0); +} + +static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 1); +} + +/* + * This replay_map is to track online/offline slots, so we could recover + * offline slots during recovery and mount + */ + +enum ocfs2_replay_state { + REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */ + REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */ + REPLAY_DONE /* Replay was already queued */ +}; + +struct ocfs2_replay_map { + unsigned int rm_slots; + enum ocfs2_replay_state rm_state; + unsigned char rm_replay_slots[0]; +}; + +void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +{ + if (!osb->replay_map) + return; + + /* If we've already queued the replay, we don't have any more to do */ + if (osb->replay_map->rm_state == REPLAY_DONE) + return; + + osb->replay_map->rm_state = state; +} + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map; + int i, node_num; + + /* If replay map is already set, we don't do it again */ + if (osb->replay_map) + return 0; + + replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + + (osb->max_slots * sizeof(char)), GFP_KERNEL); + + if (!replay_map) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + spin_lock(&osb->osb_lock); + + replay_map->rm_slots = osb->max_slots; + replay_map->rm_state = REPLAY_UNNEEDED; + + /* set rm_replay_slots for offline slot(s) */ + for (i = 0; i < replay_map->rm_slots; i++) { + if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT) + replay_map->rm_replay_slots[i] = 1; + } + + osb->replay_map = replay_map; + spin_unlock(&osb->osb_lock); + return 0; +} + +void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + int i; + + if (!replay_map) + return; + + if (replay_map->rm_state != REPLAY_NEEDED) + return; + + for (i = 0; i < replay_map->rm_slots; i++) + if (replay_map->rm_replay_slots[i]) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, + NULL, NULL); + replay_map->rm_state = REPLAY_DONE; +} + 
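+
+/*
+ * Rough sketch of the intended replay-map lifecycle, as used later in
+ * this file by the recovery thread and the mount-time recovery path
+ * (for orientation only, not an additional API):
+ *
+ *	ocfs2_compute_replay_slots(osb);		mark offline slots
+ *	...node/journal recovery runs...
+ *	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+ *	ocfs2_queue_replay_slots(osb);			queue completions
+ *	ocfs2_free_replay_slots(osb);
+ */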
+void ocfs2_free_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + + if (!osb->replay_map) + return; + + kfree(replay_map); + osb->replay_map = NULL; +} + +int ocfs2_recovery_init(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + mutex_init(&osb->recovery_lock); + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + init_waitqueue_head(&osb->recovery_event); + + rm = kzalloc(sizeof(struct ocfs2_recovery_map) + + osb->max_slots * sizeof(unsigned int), + GFP_KERNEL); + if (!rm) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + rm->rm_entries = (unsigned int *)((char *)rm + + sizeof(struct ocfs2_recovery_map)); + osb->recovery_map = rm; + + return 0; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + mutex_lock(&osb->recovery_lock); + osb->disable_recovery = 1; + mutex_unlock(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + /* + * Now that recovery is shut down, and the osb is about to be + * freed, the osb_lock is not taken here. + */ + rm = osb->recovery_map; + /* XXX: Should we bug if there are dirty entries? */ + + kfree(rm); +} + +static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + assert_spin_locked(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + return 1; + } + + return 0; +} + +/* Behaves like test-and-set. Returns the previous value */ +static int ocfs2_recovery_map_set(struct ocfs2_super *osb, + unsigned int node_num) +{ + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + return 1; + } + + /* XXX: Can this be exploited? Not from o2dlm... */ + BUG_ON(rm->rm_used >= osb->max_slots); + + rm->rm_entries[rm->rm_used] = node_num; + rm->rm_used++; + spin_unlock(&osb->osb_lock); + + return 0; +} + +static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + break; + } + + if (i < rm->rm_used) { + /* XXX: be careful with the pointer math */ + memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), + (rm->rm_used - i - 1) * sizeof(unsigned int)); + rm->rm_used--; + } + + spin_unlock(&osb->osb_lock); +} + +static int ocfs2_commit_cache(struct ocfs2_super *osb) +{ + int status = 0; + unsigned int flushed; + struct ocfs2_journal *journal = NULL; + + journal = osb->journal; + + /* Flush all pending commits and checkpoint the journal. 
*/ + down_write(&journal->j_trans_barrier); + + flushed = atomic_read(&journal->j_num_trans); + trace_ocfs2_commit_cache_begin(flushed); + if (flushed == 0) { + up_write(&journal->j_trans_barrier); + goto finally; + } + + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + up_write(&journal->j_trans_barrier); + mlog_errno(status); + goto finally; + } + + ocfs2_inc_trans_id(journal); + + flushed = atomic_read(&journal->j_num_trans); + atomic_set(&journal->j_num_trans, 0); + up_write(&journal->j_trans_barrier); + + trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed); + + ocfs2_wake_downconvert_thread(osb); + wake_up(&journal->j_checkpointed); +finally: + return status; +} + +handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) +{ + journal_t *journal = osb->journal->j_journal; + handle_t *handle; + + BUG_ON(!osb || !osb->journal->j_journal); + + if (ocfs2_is_hard_readonly(osb)) + return ERR_PTR(-EROFS); + + BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); + BUG_ON(max_buffs <= 0); + + /* Nested transaction? Just return the handle... */ + if (journal_current_handle()) + return jbd2_journal_start(journal, max_buffs); + + down_read(&osb->journal->j_trans_barrier); + + handle = jbd2_journal_start(journal, max_buffs); + if (IS_ERR(handle)) { + up_read(&osb->journal->j_trans_barrier); + + mlog_errno(PTR_ERR(handle)); + + if (is_journal_aborted(journal)) { + ocfs2_abort(osb->sb, "Detected aborted journal"); + handle = ERR_PTR(-EROFS); + } + } else { + if (!ocfs2_mount_local(osb)) + atomic_inc(&(osb->journal->j_num_trans)); + } + + return handle; +} + +int ocfs2_commit_trans(struct ocfs2_super *osb, + handle_t *handle) +{ + int ret, nested; + struct ocfs2_journal *journal = osb->journal; + + BUG_ON(!handle); + + nested = handle->h_ref > 1; + ret = jbd2_journal_stop(handle); + if (ret < 0) + mlog_errno(ret); + + if (!nested) + up_read(&journal->j_trans_barrier); + + return ret; +} + +/* + * 'nblocks' is what you want to add to the current transaction. + * + * This might call jbd2_journal_restart() which will commit dirty buffers + * and then restart the transaction. Before calling + * ocfs2_extend_trans(), any changed blocks should have been + * dirtied. After calling it, all blocks which need to be changed must + * go through another set of journal_access/journal_dirty calls. + * + * WARNING: This will not release any semaphores or disk locks taken + * during the transaction, so make sure they were taken *before* + * start_trans or we'll have ordering deadlocks. + * + * WARNING2: Note that we do *not* drop j_trans_barrier here. This is + * good because transaction ids haven't yet been recorded on the + * cluster locks associated with this handle. 
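+ *
+ * Rough caller-side sketch (illustrative only; 'extra_credits', 'di_bh'
+ * and 'inode' are placeholder names): since a restart invalidates any
+ * journal access obtained earlier, callers re-request access before
+ * dirtying blocks again:
+ *
+ *	status = ocfs2_extend_trans(handle, extra_credits);
+ *	if (status < 0)
+ *		goto bail;
+ *	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ *					 OCFS2_JOURNAL_ACCESS_WRITE);
+ *	if (status < 0)
+ *		goto bail;
+ *	... modify di_bh->b_data ...
+ *	ocfs2_journal_dirty(handle, di_bh);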
+ */ +int ocfs2_extend_trans(handle_t *handle, int nblocks) +{ + int status, old_nblocks; + + BUG_ON(!handle); + BUG_ON(nblocks < 0); + + if (!nblocks) + return 0; + + old_nblocks = handle->h_buffer_credits; + + trace_ocfs2_extend_trans(old_nblocks, nblocks); + +#ifdef CONFIG_OCFS2_DEBUG_FS + status = 1; +#else + status = jbd2_journal_extend(handle, nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } +#endif + + if (status > 0) { + trace_ocfs2_extend_trans_restart(old_nblocks + nblocks); + status = jbd2_journal_restart(handle, + old_nblocks + nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + return status; +} + +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; +}; + +static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) +{ + return container_of(triggers, struct ocfs2_triggers, ot_triggers); +} + +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, data + ot->ot_offset); +} + +/* + * Quota blocks have their own trigger because the struct ocfs2_block_check + * offset depends on the blocksize. + */ +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &dqt->dq_check); +} + +/* + * Directory blocks also have their own trigger because the + * struct ocfs2_block_check offset depends on the blocksize. + */ +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_dir_block_trailer *trailer = + ocfs2_dir_trailer_from_size(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &trailer->db_check); +} + +static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh) +{ + mlog(ML_ERROR, + "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " + "bh->b_blocknr = %llu\n", + (unsigned long)bh, + (unsigned long long)bh->b_blocknr); + + /* We aren't guaranteed to have the superblock here - but if we + * don't, it'll just crash. 
*/ + ocfs2_error(bh->b_assoc_map->host->i_sb, + "JBD2 has aborted our journal, ocfs2 cannot continue\n"); +} + +static struct ocfs2_triggers di_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dinode, i_check), +}; + +static struct ocfs2_triggers eb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_extent_block, h_check), +}; + +static struct ocfs2_triggers rb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), +}; + +static struct ocfs2_triggers gd_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), +}; + +static struct ocfs2_triggers db_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_db_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers xb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), +}; + +static struct ocfs2_triggers dq_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_dq_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers dr_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), +}; + +static struct ocfs2_triggers dl_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), +}; + +static int __ocfs2_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + struct ocfs2_triggers *triggers, + int type) +{ + int status; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + BUG_ON(!ci || !ci->ci_ops); + BUG_ON(!handle); + BUG_ON(!bh); + + trace_ocfs2_journal_access( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, type, bh->b_size); + + /* we can safely remove this assertion after testing. */ + if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); + mlog(ML_ERROR, "b_blocknr=%llu\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + /* Set the current transaction information on the ci so + * that the locking code knows whether it can drop it's locks + * on this ci or not. We're protected from the commit + * thread updating the current transaction id until + * ocfs2_commit_trans() because ocfs2_start_trans() took + * j_trans_barrier for us. 
*/ + ocfs2_set_ci_lock_trans(osb->journal, ci); + + ocfs2_metadata_cache_io_lock(ci); + switch (type) { + case OCFS2_JOURNAL_ACCESS_CREATE: + case OCFS2_JOURNAL_ACCESS_WRITE: + status = jbd2_journal_get_write_access(handle, bh); + break; + + case OCFS2_JOURNAL_ACCESS_UNDO: + status = jbd2_journal_get_undo_access(handle, bh); + break; + + default: + status = -EINVAL; + mlog(ML_ERROR, "Unknown access type!\n"); + } + if (!status && ocfs2_meta_ecc(osb) && triggers) + jbd2_journal_set_triggers(bh, &triggers->ot_triggers); + ocfs2_metadata_cache_io_unlock(ci); + + if (status < 0) + mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", + status, type); + + return status; +} + +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); +} + +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); +} + +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + type); +} + +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); +} + +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); +} + +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); +} + +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); +} + +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); +} + +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, NULL, type); +} + +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) +{ + int status; + + trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); + + status = jbd2_journal_dirty_metadata(handle, bh); + BUG_ON(status); +} + +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) + +void ocfs2_set_journal_params(struct ocfs2_super *osb) +{ + journal_t *journal = osb->journal->j_journal; + unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + + if (osb->osb_commit_interval) + commit_interval = osb->osb_commit_interval; + + write_lock(&journal->j_state_lock); + journal->j_commit_interval = commit_interval; + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + write_unlock(&journal->j_state_lock); +} + +int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t 
*j_journal = NULL; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_super *osb; + int inode_lock = 0; + + BUG_ON(!journal); + + osb = journal->j_osb; + + /* already have the inode for our journal */ + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + osb->slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto done; + } + + SET_INODE_JOURNAL(inode); + OCFS2_I(inode)->ip_open_count++; + + /* Skip recovery waits here - journal inode metadata never + * changes in a live cluster so it can be considered an + * exception to the rule. */ + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not get lock on journal!\n"); + goto done; + } + + inode_lock = 1; + di = (struct ocfs2_dinode *)bh->b_data; + + if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", + inode->i_size); + status = -EINVAL; + goto done; + } + + trace_ocfs2_journal_init(inode->i_size, + (unsigned long long)inode->i_blocks, + OCFS2_I(inode)->ip_clusters); + + /* call the kernels journal init function now */ + j_journal = jbd2_journal_init_inode(inode); + if (j_journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EINVAL; + goto done; + } + + trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + + *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL); + + journal->j_journal = j_journal; + journal->j_inode = inode; + journal->j_bh = bh; + + ocfs2_set_journal_params(osb); + + journal->j_state = OCFS2_JOURNAL_LOADED; + + status = 0; +done: + if (status < 0) { + if (inode_lock) + ocfs2_inode_unlock(inode, 1); + brelse(bh); + if (inode) { + OCFS2_I(inode)->ip_open_count--; + iput(inode); + } + } + + return status; +} + +static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di) +{ + le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1); +} + +static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di) +{ + return le32_to_cpu(di->id1.journal1.ij_recovery_generation); +} + +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty, int replayed) +{ + int status; + unsigned int flags; + struct ocfs2_journal *journal = osb->journal; + struct buffer_head *bh = journal->j_bh; + struct ocfs2_dinode *fe; + + fe = (struct ocfs2_dinode *)bh->b_data; + + /* The journal bh on the osb always comes from ocfs2_journal_init() + * and was validated there inside ocfs2_inode_lock_full(). It's a + * code bug if we mess it up. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + if (dirty) + flags |= OCFS2_JOURNAL_DIRTY_FL; + else + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + if (replayed) + ocfs2_bump_recovery_generation(fe); + + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); + status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode)); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * If the journal has been kmalloc'd it needs to be freed after this + * call. 
+ */ +void ocfs2_journal_shutdown(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = NULL; + int status = 0; + struct inode *inode = NULL; + int num_running_trans = 0; + + BUG_ON(!osb); + + journal = osb->journal; + if (!journal) + goto done; + + inode = journal->j_inode; + + if (journal->j_state != OCFS2_JOURNAL_LOADED) + goto done; + + /* need to inc inode use count - jbd2_journal_destroy will iput. */ + if (!igrab(inode)) + BUG(); + + num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + trace_ocfs2_journal_shutdown(num_running_trans); + + /* Do a commit_cache here. It will flush our journal, *and* + * release any locks that are still held. + * set the SHUTDOWN flag and release the trans lock. + * the commit thread will take the trans lock for us below. */ + journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; + + /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not + * drop the trans_lock (which we want to hold until we + * completely destroy the journal. */ + if (osb->commit_task) { + /* Wait for the commit thread */ + trace_ocfs2_journal_shutdown_wait(osb->commit_task); + kthread_stop(osb->commit_task); + osb->commit_task = NULL; + } + + BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + + if (ocfs2_mount_local(osb)) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + + if (status == 0) { + /* + * Do not toggle if flush was unsuccessful otherwise + * will leave dirty metadata in a "clean" journal + */ + status = ocfs2_journal_toggle_dirty(osb, 0, 0); + if (status < 0) + mlog_errno(status); + } + + /* Shutdown the kernel journal system */ + jbd2_journal_destroy(journal->j_journal); + journal->j_journal = NULL; + + OCFS2_I(inode)->ip_open_count--; + + /* unlock our journal */ + ocfs2_inode_unlock(inode, 1); + + brelse(journal->j_bh); + journal->j_bh = NULL; + + journal->j_state = OCFS2_JOURNAL_FREE; + +// up_write(&journal->j_trans_barrier); +done: + if (inode) + iput(inode); +} + +static void ocfs2_clear_journal_error(struct super_block *sb, + journal_t *journal, + int slot) +{ + int olderr; + + olderr = jbd2_journal_errno(journal); + if (olderr) { + mlog(ML_ERROR, "File system error %d recorded in " + "journal %u.\n", olderr, slot); + mlog(ML_ERROR, "File system on device %s needs checking.\n", + sb->s_id); + + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); + } +} + +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) +{ + int status = 0; + struct ocfs2_super *osb; + + BUG_ON(!journal); + + osb = journal->j_osb; + + status = jbd2_journal_load(journal->j_journal); + if (status < 0) { + mlog(ML_ERROR, "Failed to load journal!\n"); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Launch the commit thread */ + if (!local) { + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, + "ocfs2cmt"); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, " + "error=%d", status); + goto done; + } + } else + osb->commit_task = NULL; + +done: + return status; +} + + +/* 'full' flag tells us whether we clear out all blocks or if we just + * mark the journal clean */ +int ocfs2_journal_wipe(struct 
ocfs2_journal *journal, int full) +{ + int status; + + BUG_ON(!journal); + + status = jbd2_journal_wipe(journal->j_journal, full); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +static int ocfs2_recovery_completed(struct ocfs2_super *osb) +{ + int empty; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + empty = (rm->rm_used == 0); + spin_unlock(&osb->osb_lock); + + return empty; +} + +void ocfs2_wait_for_recovery(struct ocfs2_super *osb) +{ + wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); +} + +/* + * JBD Might read a cached version of another nodes journal file. We + * don't want this as this file changes often and we get no + * notification on those changes. The only way to be sure that we've + * got the most up to date version of those blocks then is to force + * read them off disk. Just searching through the buffer cache won't + * work as there may be pages backing this file which are still marked + * up to date. We know things can't change on this file underneath us + * as we have the lock by now :) + */ +static int ocfs2_force_read_journal(struct inode *inode) +{ + int status = 0; + int i; + u64 v_blkno, p_blkno, p_blocks, num_blocks; +#define CONCURRENT_JOURNAL_FILL 32ULL + struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; + + memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); + v_blkno = 0; + while (v_blkno < num_blocks) { + status = ocfs2_extent_map_get_blocks(inode, v_blkno, + &p_blkno, &p_blocks, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (p_blocks > CONCURRENT_JOURNAL_FILL) + p_blocks = CONCURRENT_JOURNAL_FILL; + + /* We are reading journal data which should not + * be put in the uptodate cache */ + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = 0; i < p_blocks; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + + v_blkno += p_blocks; + } + +bail: + for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) + brelse(bhs[i]); + return status; +} + +struct ocfs2_la_recovery_item { + struct list_head lri_list; + int lri_slot; + struct ocfs2_dinode *lri_la_dinode; + struct ocfs2_dinode *lri_tl_dinode; + struct ocfs2_quota_recovery *lri_qrec; +}; + +/* Does the second half of the recovery process. By this point, the + * node is marked clean and can actually be considered recovered, + * hence it's no longer in the recovery map, but there's still some + * cleanup we can do which shouldn't happen within the recovery thread + * as locking in that context becomes very difficult if we are to take + * recovering nodes into account. + * + * NOTE: This function can and will sleep on recovery of other nodes + * during cluster locking, just like any other ocfs2 process. 
+ */ +void ocfs2_complete_recovery(struct work_struct *work) +{ + int ret = 0; + struct ocfs2_journal *journal = + container_of(work, struct ocfs2_journal, j_recovery_work); + struct ocfs2_super *osb = journal->j_osb; + struct ocfs2_dinode *la_dinode, *tl_dinode; + struct ocfs2_la_recovery_item *item, *n; + struct ocfs2_quota_recovery *qrec; + LIST_HEAD(tmp_la_list); + + trace_ocfs2_complete_recovery( + (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno); + + spin_lock(&journal->j_lock); + list_splice_init(&journal->j_la_cleanups, &tmp_la_list); + spin_unlock(&journal->j_lock); + + list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { + list_del_init(&item->lri_list); + + ocfs2_wait_on_quotas(osb); + + la_dinode = item->lri_la_dinode; + tl_dinode = item->lri_tl_dinode; + qrec = item->lri_qrec; + + trace_ocfs2_complete_recovery_slot(item->lri_slot, + la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, + tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0, + qrec); + + if (la_dinode) { + ret = ocfs2_complete_local_alloc_recovery(osb, + la_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(la_dinode); + } + + if (tl_dinode) { + ret = ocfs2_complete_truncate_log_recovery(osb, + tl_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(tl_dinode); + } + + ret = ocfs2_recover_orphans(osb, item->lri_slot); + if (ret < 0) + mlog_errno(ret); + + if (qrec) { + ret = ocfs2_finish_quota_recovery(osb, qrec, + item->lri_slot); + if (ret < 0) + mlog_errno(ret); + /* Recovery info is already freed now */ + } + + kfree(item); + } + + trace_ocfs2_complete_recovery_end(ret); +} + +/* NOTE: This function always eats your references to la_dinode and + * tl_dinode, either manually on error, or by passing them to + * ocfs2_complete_recovery */ +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec) +{ + struct ocfs2_la_recovery_item *item; + + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS); + if (!item) { + /* Though we wish to avoid it, we are in fact safe in + * skipping local alloc cleanup as fsck.ocfs2 is more + * than capable of reclaiming unused space. */ + if (la_dinode) + kfree(la_dinode); + + if (tl_dinode) + kfree(tl_dinode); + + if (qrec) + ocfs2_free_quota_recovery(qrec); + + mlog_errno(-ENOMEM); + return; + } + + INIT_LIST_HEAD(&item->lri_list); + item->lri_la_dinode = la_dinode; + item->lri_slot = slot_num; + item->lri_tl_dinode = tl_dinode; + item->lri_qrec = qrec; + + spin_lock(&journal->j_lock); + list_add_tail(&item->lri_list, &journal->j_la_cleanups); + queue_work(ocfs2_wq, &journal->j_recovery_work); + spin_unlock(&journal->j_lock); +} + +/* Called by the mount code to queue recovery the last part of + * recovery for it's own and offline slot(s). 
*/ +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = osb->journal; + + if (ocfs2_is_hard_readonly(osb)) + return; + + /* No need to queue up our truncate_log as regular cleanup will catch + * that */ + ocfs2_queue_recovery_completion(journal, osb->slot_num, + osb->local_alloc_copy, NULL, NULL); + ocfs2_schedule_truncate_log_flush(osb, 0); + + osb->local_alloc_copy = NULL; + osb->dirty = 0; + + /* queue to recover orphan slots for all offline slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + ocfs2_queue_replay_slots(osb); + ocfs2_free_replay_slots(osb); +} + +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) +{ + if (osb->quota_rec) { + ocfs2_queue_recovery_completion(osb->journal, + osb->slot_num, + NULL, + NULL, + osb->quota_rec); + osb->quota_rec = NULL; + } +} + +static int __ocfs2_recovery_thread(void *arg) +{ + int status, node_num, slot_num; + struct ocfs2_super *osb = arg; + struct ocfs2_recovery_map *rm = osb->recovery_map; + int *rm_quota = NULL; + int rm_quota_used = 0, i; + struct ocfs2_quota_recovery *qrec; + + status = ocfs2_wait_on_mount(osb); + if (status < 0) { + goto bail; + } + + rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } +restart: + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + + /* queue recovery for our own slot */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL, NULL); + + spin_lock(&osb->osb_lock); + while (rm->rm_used) { + /* It's always safe to remove entry zero, as we won't + * clear it until ocfs2_recover_node() has succeeded. */ + node_num = rm->rm_entries[0]; + spin_unlock(&osb->osb_lock); + slot_num = ocfs2_node_num_to_slot(osb, node_num); + trace_ocfs2_recovery_thread_node(node_num, slot_num); + if (slot_num == -ENOENT) { + status = 0; + goto skip_recovery; + } + + /* It is a bit subtle with quota recovery. We cannot do it + * immediately because we have to obtain cluster locks from + * quota files and we also don't want to just skip it because + * then quota usage would be out of sync until some node takes + * the slot. So we remember which nodes need quota recovery + * and when everything else is done, we recover quotas. */ + for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + + status = ocfs2_recover_node(osb, node_num, slot_num); +skip_recovery: + if (!status) { + ocfs2_recovery_map_clear(osb, node_num); + } else { + mlog(ML_ERROR, + "Error %d recovering node %d on device (%u,%u)!\n", + status, node_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + mlog(ML_ERROR, "Volume requires unmount.\n"); + } + + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); + trace_ocfs2_recovery_thread_end(status); + + /* Refresh all journal recovery generations from disk */ + status = ocfs2_check_journals_nolocks(osb); + status = (status == -EROFS) ? 0 : status; + if (status < 0) + mlog_errno(status); + + /* Now it is right time to recover quotas... 
We have to do this under + * superblock lock so that no one can start using the slot (and crash) + * before we recover it */ + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], + NULL, NULL, qrec); + } + + ocfs2_super_unlock(osb, 1); + + /* queue recovery for offline slots */ + ocfs2_queue_replay_slots(osb); + +bail: + mutex_lock(&osb->recovery_lock); + if (!status && !ocfs2_recovery_completed(osb)) { + mutex_unlock(&osb->recovery_lock); + goto restart; + } + + ocfs2_free_replay_slots(osb); + osb->recovery_thread_task = NULL; + mb(); /* sync with ocfs2_recovery_thread_running */ + wake_up(&osb->recovery_event); + + mutex_unlock(&osb->recovery_lock); + + if (rm_quota) + kfree(rm_quota); + + /* no one is callint kthread_stop() for us so the kthread() api + * requires that we call do_exit(). And it isn't exported, but + * complete_and_exit() seems to be a minimal wrapper around it. */ + complete_and_exit(NULL, status); + return status; +} + +void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) +{ + mutex_lock(&osb->recovery_lock); + + trace_ocfs2_recovery_thread(node_num, osb->node_num, + osb->disable_recovery, osb->recovery_thread_task, + osb->disable_recovery ? + -1 : ocfs2_recovery_map_set(osb, node_num)); + + if (osb->disable_recovery) + goto out; + + if (osb->recovery_thread_task) + goto out; + + osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, + "ocfs2rec"); + if (IS_ERR(osb->recovery_thread_task)) { + mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); + osb->recovery_thread_task = NULL; + } + +out: + mutex_unlock(&osb->recovery_lock); + wake_up(&osb->recovery_event); +} + +static int ocfs2_read_journal_inode(struct ocfs2_super *osb, + int slot_num, + struct buffer_head **bh, + struct inode **ret_inode) +{ + int status = -EACCES; + struct inode *inode = NULL; + + BUG_ON(slot_num >= osb->max_slots); + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (!inode || is_bad_inode(inode)) { + mlog_errno(status); + goto bail; + } + SET_INODE_JOURNAL(inode); + + status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + +bail: + if (inode) { + if (status || !ret_inode) + iput(inode); + else + *ret_inode = inode; + } + return status; +} + +/* Does the actual journal replay and marks the journal inode as + * clean. Will only replay if the journal inode is marked dirty. */ +static int ocfs2_replay_journal(struct ocfs2_super *osb, + int node_num, + int slot_num) +{ + int status; + int got_lock = 0; + unsigned int flags; + struct inode *inode = NULL; + struct ocfs2_dinode *fe; + journal_t *journal = NULL; + struct buffer_head *bh = NULL; + u32 slot_reco_gen; + + status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode); + if (status) { + mlog_errno(status); + goto done; + } + + fe = (struct ocfs2_dinode *)bh->b_data; + slot_reco_gen = ocfs2_get_recovery_generation(fe); + brelse(bh); + bh = NULL; + + /* + * As the fs recovery is asynchronous, there is a small chance that + * another node mounted (and recovered) the slot before the recovery + * thread could get the lock. To handle that, we dirty read the journal + * inode for that slot to get the recovery generation. If it is + * different than what we expected, the slot has been recovered. 
+ * If not, it needs recovery. + */ + if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { + trace_ocfs2_replay_journal_recovered(slot_num, + osb->slot_recovery_generations[slot_num], slot_reco_gen); + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + status = -EBUSY; + goto done; + } + + /* Continue with recovery as the journal has not yet been recovered */ + + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + trace_ocfs2_replay_journal_lock_err(status); + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not lock journal!\n"); + goto done; + } + got_lock = 1; + + fe = (struct ocfs2_dinode *) bh->b_data; + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + slot_reco_gen = ocfs2_get_recovery_generation(fe); + + if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { + trace_ocfs2_replay_journal_skip(node_num); + /* Refresh recovery generation for the slot */ + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + goto done; + } + + /* we need to run complete recovery for offline orphan slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + + mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", + node_num, slot_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + + status = ocfs2_force_read_journal(inode); + if (status < 0) { + mlog_errno(status); + goto done; + } + + journal = jbd2_journal_init_inode(inode); + if (journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EIO; + goto done; + } + + status = jbd2_journal_load(journal); + if (status < 0) { + mlog_errno(status); + if (!igrab(inode)) + BUG(); + jbd2_journal_destroy(journal); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal, slot_num); + + /* wipe the journal */ + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + if (status < 0) + mlog_errno(status); + + /* This will mark the node clean */ + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + /* Increment recovery generation to indicate successful recovery */ + ocfs2_bump_recovery_generation(fe); + osb->slot_recovery_generations[slot_num] = + ocfs2_get_recovery_generation(fe); + + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) + mlog_errno(status); + + if (!igrab(inode)) + BUG(); + + jbd2_journal_destroy(journal); + +done: + /* drop the lock on this nodes journal */ + if (got_lock) + ocfs2_inode_unlock(inode, 1); + + if (inode) + iput(inode); + + brelse(bh); + + return status; +} + +/* + * Do the most important parts of node recovery: + * - Replay it's journal + * - Stamp a clean local allocator file + * - Stamp a clean truncate log + * - Mark the node clean + * + * If this function completes without error, a node in OCFS2 can be + * said to have been safely recovered. As a result, failure during the + * second part of a nodes recovery process (local alloc recovery) is + * far less concerning. 
+ */ +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num, int slot_num) +{ + int status = 0; + struct ocfs2_dinode *la_copy = NULL; + struct ocfs2_dinode *tl_copy = NULL; + + trace_ocfs2_recover_node(node_num, slot_num, osb->node_num); + + /* Should not ever be called to recover ourselves -- in that + * case we should've called ocfs2_journal_load instead. */ + BUG_ON(osb->node_num == node_num); + + status = ocfs2_replay_journal(osb, node_num, slot_num); + if (status < 0) { + if (status == -EBUSY) { + trace_ocfs2_recover_node_skip(slot_num, node_num); + status = 0; + goto done; + } + mlog_errno(status); + goto done; + } + + /* Stamp a clean local alloc file AFTER recovering the journal... */ + status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* An error from begin_truncate_log_recovery is not + * serious enough to warrant halting the rest of + * recovery. */ + status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); + if (status < 0) + mlog_errno(status); + + /* Likewise, this would be a strange but ultimately not so + * harmful place to get an error... */ + status = ocfs2_clear_slot(osb, slot_num); + if (status < 0) + mlog_errno(status); + + /* This will kfree the memory pointed to by la_copy and tl_copy */ + ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, + tl_copy, NULL); + + status = 0; +done: + + return status; +} + +/* Test node liveness by trylocking his journal. If we get the lock, + * we drop it here. Return 0 if we got the lock, -EAGAIN if node is + * still alive (we couldn't get the lock) and < 0 on error. */ +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num) +{ + int status, flags; + struct inode *inode = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + mlog(ML_ERROR, "access error\n"); + status = -EACCES; + goto bail; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto bail; + } + SET_INODE_JOURNAL(inode); + + flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; + status = ocfs2_inode_lock_full(inode, NULL, 1, flags); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + ocfs2_inode_unlock(inode, 1); +bail: + if (inode) + iput(inode); + + return status; +} + +/* Call this underneath ocfs2_super_lock. It also assumes that the + * slot info struct has been updated from disk. */ +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) +{ + unsigned int node_num; + int status, i; + u32 gen; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + + /* This is called with the super block cluster lock, so we + * know that the slot map can't change underneath us. 
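+ *
+ * The per-slot decision in the loop below boils down to this rough
+ * sketch (an editor's summary of the code that follows, not new logic):
+ *
+ *	refresh slot_recovery_generations[i] from that slot's journal inode
+ *	skip our own slot, unoccupied slots, and nodes already in the
+ *	    recovery map
+ *	otherwise trylock the slot's journal; getting the lock means the
+ *	    owner is dead, so kick off ocfs2_recovery_thread() for it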
*/ + + for (i = 0; i < osb->max_slots; i++) { + /* Read journal inode to get the recovery generation */ + status = ocfs2_read_journal_inode(osb, i, &bh, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *)bh->b_data; + gen = ocfs2_get_recovery_generation(di); + brelse(bh); + bh = NULL; + + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + + trace_ocfs2_mark_dead_nodes(i, + osb->slot_recovery_generations[i]); + + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); + continue; + } + + status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); + continue; + } + + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + continue; + } + spin_unlock(&osb->osb_lock); + + /* Ok, we have a slot occupied by another node which + * is not in the recovery map. We trylock his journal + * file here to test if he's alive. */ + status = ocfs2_trylock_journal(osb, i); + if (!status) { + /* Since we're called from mount, we know that + * the recovery thread can't race us on + * setting / checking the recovery bits. */ + ocfs2_recovery_thread(osb, node_num); + } else if ((status < 0) && (status != -EAGAIN)) { + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + return status; +} + +/* + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some + * randomness to the timeout to minimize multple nodes firing the timer at the + * same time. + */ +static inline unsigned long ocfs2_orphan_scan_timeout(void) +{ + unsigned long time; + + get_random_bytes(&time, sizeof(time)); + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000); + return msecs_to_jiffies(time); +} + +/* + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for + * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This + * is done to catch any orphans that are left over in orphan directories. + * + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT + * seconds. It gets an EX lock on os_lockres and checks sequence number + * stored in LVB. If the sequence number has changed, it means some other + * node has done the scan. This node skips the scan and tracks the + * sequence number. If the sequence number didn't change, it means a scan + * hasn't happened. The node queues a scan and increments the + * sequence number in the LVB. 
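+ *
+ * Roughly, the sequence number dance looks like this (an illustrative
+ * sketch of the function below, not verbatim code):
+ *
+ *	ocfs2_orphan_scan_lock(osb, &seqno);	EX lock, seqno read from LVB
+ *	if (seqno != os->os_seqno)		another node already scanned,
+ *		os->os_seqno = seqno;		so just remember the new value
+ *	else					nobody scanned this interval:
+ *		queue recovery completion for every slot, then seqno++
+ *	ocfs2_orphan_scan_unlock(osb, seqno);	new seqno written to the LVB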
+ */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + int status, i; + u32 seqno = 0; + + os = &osb->osb_orphan_scan; + + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + goto out; + + trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + + status = ocfs2_orphan_scan_lock(osb, &seqno); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto out; + } + + /* Do no queue the tasks if the volume is being umounted */ + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + goto unlock; + + if (os->os_seqno != seqno) { + os->os_seqno = seqno; + goto unlock; + } + + for (i = 0; i < osb->max_slots; i++) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, + NULL); + /* + * We queued a recovery on orphan slots, increment the sequence + * number and update LVB so other node will skip the scan for a while + */ + seqno++; + os->os_count++; + os->os_scantime = CURRENT_TIME; +unlock: + ocfs2_orphan_scan_unlock(osb, seqno); +out: + trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + return; +} + +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ +void ocfs2_orphan_scan_work(struct work_struct *work) +{ + struct ocfs2_orphan_scan *os; + struct ocfs2_super *osb; + + os = container_of(work, struct ocfs2_orphan_scan, + os_orphan_scan_work.work); + osb = os->os_osb; + + mutex_lock(&os->os_lock); + ocfs2_queue_orphan_scan(osb); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + mutex_unlock(&os->os_lock); +} + +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) { + atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); + mutex_lock(&os->os_lock); + cancel_delayed_work(&os->os_orphan_scan_work); + mutex_unlock(&os->os_lock); + } +} + +void ocfs2_orphan_scan_init(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_osb = osb; + os->os_count = 0; + os->os_seqno = 0; + mutex_init(&os->os_lock); + INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} + +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) + atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); + else { + atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + } +} + +struct ocfs2_orphan_filldir_priv { + struct inode *head; + struct ocfs2_super *osb; +}; + +static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, + loff_t pos, u64 ino, unsigned type) +{ + struct ocfs2_orphan_filldir_priv *p = priv; + struct inode *iter; + + if (name_len == 1 && !strncmp(".", name, 1)) + return 0; + if (name_len == 2 && !strncmp("..", name, 2)) + return 0; + + /* Skip bad inodes so that recovery can continue */ + iter = ocfs2_iget(p->osb, ino, + OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); + if (IS_ERR(iter)) + return 0; + + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); + /* No locking is required for the next_orphan queue as there + * is only ever a single process doing orphan recovery. 
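+ *
+ * The resulting queue is just a singly linked list threaded through the
+ * inodes via ip_next_orphan, newest first; e.g. after three entries
+ * (illustrative):
+ *
+ *	p->head -> inode_C -> inode_B -> inode_A -> NULL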
*/ + OCFS2_I(iter)->ip_next_orphan = p->head; + p->head = iter; + + return 0; +} + +static int ocfs2_queue_orphans(struct ocfs2_super *osb, + int slot, + struct inode **head) +{ + int status; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_orphan_filldir_priv priv; + loff_t pos = 0; + + priv.osb = osb; + priv.head = *head; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + return status; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, + ocfs2_orphan_filldir); + if (status) { + mlog_errno(status); + goto out_cluster; + } + + *head = priv.head; + +out_cluster: + ocfs2_inode_unlock(orphan_dir_inode, 0); +out: + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + return status; +} + +static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, + int slot) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = !osb->osb_orphan_wipes[slot]; + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + /* Mark ourselves such that new processes in delete_inode() + * know to quit early. */ + ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); + while (osb->osb_orphan_wipes[slot]) { + /* If any processes are already in the middle of an + * orphan wipe on this dir, then we need to wait for + * them. */ + spin_unlock(&osb->osb_lock); + wait_event_interruptible(osb->osb_wipe_event, + ocfs2_orphan_recovery_can_continue(osb, slot)); + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); +} + +/* + * Orphan recovery. Each mounted node has it's own orphan dir which we + * must run during recovery. Our strategy here is to build a list of + * the inodes in the orphan dir and iget/iput them. The VFS does + * (most) of the rest of the work. + * + * Orphan recovery can happen at any time, not just mount so we have a + * couple of extra considerations. + * + * - We grab as many inodes as we can under the orphan dir lock - + * doing iget() outside the orphan dir risks getting a reference on + * an invalid inode. + * - We must be sure not to deadlock with other processes on the + * system wanting to run delete_inode(). This can happen when they go + * to lock the orphan dir and the orphan recovery process attempts to + * iget() inside the orphan dir lock. This can be avoided by + * advertising our state to ocfs2_delete_inode(). + */ +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *iter; + struct ocfs2_inode_info *oi; + + trace_ocfs2_recover_orphans(slot); + + ocfs2_mark_recovering_orphan_dir(osb, slot); + ret = ocfs2_queue_orphans(osb, slot, &inode); + ocfs2_clear_recovering_orphan_dir(osb, slot); + + /* Error here should be noted, but we want to continue with as + * many queued inodes as we've got. 
*/ + if (ret) + mlog_errno(ret); + + while (inode) { + oi = OCFS2_I(inode); + trace_ocfs2_recover_orphans_iput( + (unsigned long long)oi->ip_blkno); + + iter = oi->ip_next_orphan; + + spin_lock(&oi->ip_lock); + /* The remote delete code may have set these on the + * assumption that the other node would wipe them + * successfully. If they are still in the node's + * orphan dir, we need to reset that state. */ + oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); + + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + + iput(inode); + + inode = iter; + } + + return ret; +} + +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) +{ + /* This check is good because ocfs2 will wait on our recovery + * thread before changing it to something other than MOUNTED + * or DISABLED. */ + wait_event(osb->osb_mount_event, + (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) || + atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS || + atomic_read(&osb->vol_state) == VOLUME_DISABLED); + + /* If there's an error on mount, then we may never get to the + * MOUNTED flag, but this is set right before + * dismount_volume() so we can trust it. */ + if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + trace_ocfs2_wait_on_mount(VOLUME_DISABLED); + mlog(0, "mount error, exiting!\n"); + return -EBUSY; + } + + return 0; +} + +static int ocfs2_commit_thread(void *arg) +{ + int status; + struct ocfs2_super *osb = arg; + struct ocfs2_journal *journal = osb->journal; + + /* we can trust j_num_trans here because _should_stop() is only set in + * shutdown and nobody other than ourselves should be able to start + * transactions. committing on shutdown might take a few iterations + * as final transactions put deleted inodes on the list */ + while (!(kthread_should_stop() && + atomic_read(&journal->j_num_trans) == 0)) { + + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); + + status = ocfs2_commit_cache(osb); + if (status < 0) + mlog_errno(status); + + if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ + mlog(ML_KTHREAD, + "commit_thread: %u transactions pending on " + "shutdown\n", + atomic_read(&journal->j_num_trans)); + } + } + + return 0; +} + +/* Reads all the journal inodes without taking any cluster locks. Used + * for hard readonly access to determine whether any journal requires + * recovery. Also used to refresh the recovery generation numbers after + * a journal has been recovered by another node. 
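+ *
+ * A caller would typically interpret the result like this (illustrative;
+ * see the actual call sites for the exact handling):
+ *
+ *	ret = ocfs2_check_journals_nolocks(osb);
+ *	if (ret == -EROFS)
+ *		at least one journal still has OCFS2_JOURNAL_DIRTY_FL set,
+ *		i.e. recovery is pending; the recovery thread simply ignores
+ *		this and only wants the refreshed generation numbers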
+ */ +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) +{ + int ret = 0; + unsigned int slot; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int journal_dirty = 0; + + for(slot = 0; slot < osb->max_slots; slot++) { + ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + + osb->slot_recovery_generations[slot] = + ocfs2_get_recovery_generation(di); + + if (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL) + journal_dirty = 1; + + brelse(di_bh); + di_bh = NULL; + } + +out: + if (journal_dirty) + ret = -EROFS; + return ret; +} diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 00000000..68cf2f6d --- /dev/null +++ b/fs/ocfs2/journal.h @@ -0,0 +1,619 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.h + * + * Defines journalling api and structures. + * + * Copyright (C) 2003, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_JOURNAL_H +#define OCFS2_JOURNAL_H + +#include +#include + +enum ocfs2_journal_state { + OCFS2_JOURNAL_FREE = 0, + OCFS2_JOURNAL_LOADED, + OCFS2_JOURNAL_IN_SHUTDOWN, +}; + +struct ocfs2_super; +struct ocfs2_dinode; + +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. + */ + +struct ocfs2_recovery_map { + unsigned int rm_used; + unsigned int *rm_entries; +}; + + +struct ocfs2_journal { + enum ocfs2_journal_state j_state; /* Journals current state */ + + journal_t *j_journal; /* The kernels journal type */ + struct inode *j_inode; /* Kernel inode pointing to + * this journal */ + struct ocfs2_super *j_osb; /* pointer to the super + * block for the node + * we're currently + * running on -- not + * necessarily the super + * block from the node + * which we usually run + * from (recovery, + * etc) */ + struct buffer_head *j_bh; /* Journal disk inode block */ + atomic_t j_num_trans; /* Number of transactions + * currently in the system. */ + spinlock_t j_lock; + unsigned long j_trans_id; + struct rw_semaphore j_trans_barrier; + wait_queue_head_t j_checkpointed; + + /* both fields protected by j_lock*/ + struct list_head j_la_cleanups; + struct work_struct j_recovery_work; +}; + +extern spinlock_t trans_inc_lock; + +/* wrap j_trans_id so we never have it equal to zero. 
*/ +static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) +{ + unsigned long old_id; + spin_lock(&trans_inc_lock); + old_id = j->j_trans_id++; + if (unlikely(!j->j_trans_id)) + j->j_trans_id = 1; + spin_unlock(&trans_inc_lock); + return old_id; +} + +static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal, + struct ocfs2_caching_info *ci) +{ + spin_lock(&trans_inc_lock); + ci->ci_last_trans = journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Used to figure out whether it's safe to drop a metadata lock on an + * cached object. Returns true if all the object's changes have been + * checkpointed to disk. You should be holding the spinlock on the + * metadata lock while calling this to be sure that nobody can take + * the lock and put it on another transaction. */ +static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci) +{ + int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + + spin_lock(&trans_inc_lock); + ret = time_after(journal->j_trans_id, ci->ci_last_trans); + spin_unlock(&trans_inc_lock); + return ret; +} + +/* convenience function to check if an object backed by struct + * ocfs2_caching_info is still new (has never hit disk) Will do you a + * favor and set created_trans = 0 when you've + * been checkpointed. returns '1' if the ci is still new. */ +static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci) +{ + int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + + spin_lock(&trans_inc_lock); + ret = !(time_after(journal->j_trans_id, ci->ci_created_trans)); + if (!ret) + ci->ci_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +/* Wrapper for inodes so we can check system files */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ + /* System files are never "new" as they're written out by + * mkfs. This helps us early during mount, before we have the + * journal open and j_trans_id could be junk. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + return 0; + + return ocfs2_ci_is_new(INODE_CACHE(inode)); +} + +static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, + struct ocfs2_caching_info *ci) +{ + spin_lock(&trans_inc_lock); + ci->ci_created_trans = osb->journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Exported only for the journal struct init code in super.c. Do not call. */ +void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); + +void ocfs2_complete_recovery(struct work_struct *work); +void ocfs2_wait_for_recovery(struct ocfs2_super *osb); + +int ocfs2_recovery_init(struct ocfs2_super *osb); +void ocfs2_recovery_exit(struct ocfs2_super *osb); + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +/* + * Journal Control: + * Initialize, Load, Shutdown, Wipe a journal. + * + * ocfs2_journal_init - Initialize journal structures in the OSB. + * ocfs2_journal_load - Load the given journal off disk. Replay it if + * there's transactions still in there. + * ocfs2_journal_shutdown - Shutdown a journal, this will flush all + * uncommitted, uncheckpointed transactions. + * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally + * zero out each block. + * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. 
+ * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat + * event on. + * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. + */ +void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_init(struct ocfs2_journal *journal, + int *dirty); +void ocfs2_journal_shutdown(struct ocfs2_super *osb); +int ocfs2_journal_wipe(struct ocfs2_journal *journal, + int full); +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, + int replayed); +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); +void ocfs2_recovery_thread(struct ocfs2_super *osb, + int node_num); +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); + +static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) +{ + atomic_set(&osb->needs_checkpoint, 1); + wake_up(&osb->checkpoint_event); +} + +static inline void ocfs2_checkpoint_inode(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_mount_local(osb)) + return; + + if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) { + /* WARNING: This only kicks off a single + * checkpoint. If someone races you and adds more + * metadata to the journal, you won't know, and will + * wind up waiting *a lot* longer than necessary. Right + * now we only use this in clear_inode so that's + * OK. */ + ocfs2_start_checkpoint(osb); + + wait_event(osb->journal->j_checkpointed, + ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))); + } +} + +/* + * Transaction Handling: + * Manage the lifetime of a transaction handle. + * + * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of + * the number of blocks that will be changed during + * this handle. + * ocfs2_commit_trans - Complete a handle. It might return -EIO if + * the journal was aborted. The majority of paths don't + * check the return value as an error there comes too + * late to do anything (and will be picked up in a + * later transaction). + * ocfs2_extend_trans - Extend a handle by nblocks credits. This may + * commit the handle to disk in the process, but will + * not release any locks taken during the transaction. + * ocfs2_journal_access* - Notify the handle that we want to journal this + * buffer. Will have to call ocfs2_journal_dirty once + * we've actually dirtied it. Type is one of . or . + * Always call the specific flavor of + * ocfs2_journal_access_*() unless you intend to + * manage the checksum by hand. + * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. + */ + +/* You must always start_trans with a number of buffs > 0, but it's + * perfectly legal to go through an entire transaction without having + * dirtied any buffers. */ +handle_t *ocfs2_start_trans(struct ocfs2_super *osb, + int max_buffs); +int ocfs2_commit_trans(struct ocfs2_super *osb, + handle_t *handle); +int ocfs2_extend_trans(handle_t *handle, int nblocks); + +/* + * Create access is for when we get a newly created buffer and we're + * not gonna read it off disk, but rather fill it ourselves. Right + * now, we don't do anything special with this (it turns into a write + * request), but this is a good placeholder in case we do... + * + * Write access is for when we read a block off disk and are going to + * modify it. 
This way the journalling layer knows it may need to make + * a copy of that block (if it's part of another, uncommitted + * transaction) before we do so. + */ +#define OCFS2_JOURNAL_ACCESS_CREATE 0 +#define OCFS2_JOURNAL_ACCESS_WRITE 1 +#define OCFS2_JOURNAL_ACCESS_UNDO 2 + + +/* ocfs2_inode */ +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_extent_block */ +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_refcount_block */ +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_group_desc */ +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_xattr_block */ +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* quota blocks */ +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* dirblock */ +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_dx_root_block */ +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_dx_leaf */ +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* Anything that has no ecc */ +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); + +/* + * A word about the journal_access/journal_dirty "dance". It is + * entirely legal to journal_access a buffer more than once (as long + * as the access type is the same -- I'm not sure what will happen if + * access type is different but this should never happen anyway) It is + * also legal to journal_dirty a buffer more than once. In fact, you + * can even journal_access a buffer after you've done a + * journal_access/journal_dirty pair. The only thing you cannot do + * however, is journal_dirty a buffer which you haven't yet passed to + * journal_access at least once. + * + * That said, 99% of the time this doesn't matter and this is what the + * path looks like: + * + * + * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); + * + * ocfs2_journal_dirty(handle, bh); + */ +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); + +/* + * Credit Macros: + * Convenience macros to calculate number of credits needed. + * + * For convenience sake, I have a set of macros here which calculate + * the *maximum* number of sectors which will be changed for various + * metadata updates. + */ + +/* simple file updates like chmod, etc. */ +#define OCFS2_INODE_UPDATE_CREDITS 1 + +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + +/* global quotafile inode update, data block */ +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +/* + * The two writes below can accidentally see global info dirty due + * to set_info() quotactl so make them prepared for the writes. 
+ */ +/* quota data block, global info */ +/* Write to local quota file */ +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +/* global quota data block, local quota data block, global quota inode, + * global quota info */ +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +static inline int ocfs2_quota_trans_credits(struct super_block *sb) +{ + int credits = 0; + + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + return credits; +} + +/* group extend. inode update and last group update. */ +#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* group add. inode update and the new group update. */ +#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* get one bit out of a suballocator: dinode + group descriptor + + * prev. group desc. if we relink. */ +#define OCFS2_SUBALLOC_ALLOC (3) + +static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) +{ + return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} + +/* dinode + group descriptor update. We don't relink on free yet. */ +#define OCFS2_SUBALLOC_FREE (2) + +#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS +#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + + OCFS2_TRUNCATE_LOG_UPDATE) + +static inline int ocfs2_remove_extent_credits(struct super_block *sb) +{ + return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} + +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) + +static inline int ocfs2_add_dir_index_credits(struct super_block *sb) +{ + /* 1 block for index, 2 allocs (data, metadata), 1 clusters + * worth of blocks for initial extent. */ + return 1 + 2 * OCFS2_SUBALLOC_ALLOC + + ocfs2_clusters_to_blocks(sb, 1); +} + +/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode + * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr + * blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, + int xattr_credits) +{ + int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS; + + if (is_dir) + dir_credits += ocfs2_add_dir_index_credits(sb); + + return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + + ocfs2_quota_trans_credits(sb); +} + +/* local alloc metadata change + main bitmap updates */ +#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) + +/* used when we don't need an allocation change for a dir extend. One + * for the dinode, one for the new block. */ +#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) + +/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota + * update on dir + index leaf + dx root update for free list */ +static inline int ocfs2_link_credits(struct super_block *sb) +{ + return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + + ocfs2_quota_trans_credits(sb); +} + +/* inode + dir inode (if we unlink a dir), + dir entry block + orphan + * dir inode link + dir inode index leaf + dir index root */ +static inline int ocfs2_unlink_credits(struct super_block *sb) +{ + /* The quota update from ocfs2_link_credits is unused here... 
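+ *
+ * Worked example (editor's illustration, assuming both USRQUOTA and
+ * GRPQUOTA are enabled, so ocfs2_quota_trans_credits() comes to
+ * 2 * OCFS2_QWRITE_CREDITS = 6):
+ *
+ *	ocfs2_link_credits()   = 2*1 + 3 + 6  = 11
+ *	ocfs2_unlink_credits() = 2*1 + 3 + 11 = 16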
*/ + return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); +} + +/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + + * inode alloc group descriptor + orphan dir index root + + * orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) + +/* dinode update, old dir dinode update, new dir dinode update, old + * dir dir entry, new dir dir entry, dir entry update for renaming + * directory + target unlink + 3 x dir index leaves */ +static inline int ocfs2_rename_credits(struct super_block *sb) +{ + return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb); +} + +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* inode update, removal of dx root block from allocator */ +#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_SUBALLOC_FREE) + +static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) +{ + int credits = 1 + OCFS2_SUBALLOC_ALLOC; + + credits += ocfs2_clusters_to_blocks(sb, 1); + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + +/* inode update, new refcount block and its allocation credits. */ +#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_SUBALLOC_ALLOC) + +/* inode and the refcount block update. */ +#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * inode and the refcount block update. + * It doesn't include the credits for sub alloc change. + * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added. + */ +#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* 2 metadata alloc, 2 new blocks and root refcount block */ +#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_calc_extend_credits(struct super_block *sb, + struct ocfs2_extent_list *root_el, + u32 bits_wanted) +{ + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; + + /* bitmap dinode, group desc. + relinked group. */ + bitmap_blocks = OCFS2_SUBALLOC_ALLOC; + + /* we might need to shift tree depth so lets assume an + * absolute worst case of complete fragmentation. Even with + * that, we only need one update for the dinode, and then + * however many metadata chunks needed * a remaining suballoc + * alloc. */ + sysfile_bitmap_blocks = 1 + + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); + + /* this does not include *new* metadata blocks, which are + * accounted for in sysfile_bitmap_blocks. root_el + + * prev. last_eb_blk + blocks along edge of tree. + * calc_symlink_credits passes because we just need 1 + * credit for the dinode there. */ + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); + + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + + ocfs2_quota_trans_credits(sb); +} + +static inline int ocfs2_calc_symlink_credits(struct super_block *sb) +{ + int blocks = ocfs2_mknod_credits(sb, 0, 0); + + /* links can be longer than one block so we may update many + * within our single allocated extent. 
*/ + blocks += ocfs2_clusters_to_blocks(sb, 1); + + return blocks + ocfs2_quota_trans_credits(sb); +} + +static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, + unsigned int cpg) +{ + int blocks; + int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; + /* parent inode update + new block group header + bitmap inode update + + bitmap blocks affected */ + blocks = 1 + 1 + 1 + bitmap_blocks; + return blocks; +} + +/* + * Allocating a discontiguous block group requires the credits from + * ocfs2_calc_group_alloc_credits() as well as enough credits to fill + * the group descriptor's extent list. The caller already has started + * the transaction with ocfs2_calc_group_alloc_credits(). They extend + * it with these credits. + */ +static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) +{ + return ocfs2_extent_recs_per_gd(sb); +} + +static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, + unsigned int clusters_to_del, + struct ocfs2_dinode *fe, + struct ocfs2_extent_list *last_el) +{ + /* for dinode + all headers in this pass + update to next leaf */ + u16 next_free = le16_to_cpu(last_el->l_next_free_rec); + u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); + int credits = 1 + tree_depth + 1; + int i; + + i = next_free - 1; + BUG_ON(i < 0); + + /* We may be deleting metadata blocks, so metadata alloc dinode + + one desc. block for each possible delete. */ + if (tree_depth && next_free == 1 && + ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) + credits += 1 + tree_depth; + + /* update to the truncate log. */ + credits += OCFS2_TRUNCATE_LOG_UPDATE; + + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); +} + +#endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c new file mode 100644 index 00000000..210c3523 --- /dev/null +++ b/fs/ocfs2/localalloc.c @@ -0,0 +1,1307 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.c + * + * Node local data allocation + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv); + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); + +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh); + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh); + +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac); + +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode); + +/* + * ocfs2_la_default_mb() - determine a default size, in megabytes of + * the local alloc. + * + * Generally, we'd like to pick as large a local alloc as + * possible. Performance on large workloads tends to scale + * proportionally to la size. In addition to that, the reservations + * code functions more efficiently as it can reserve more windows for + * write. + * + * Some things work against us when trying to choose a large local alloc: + * + * - We need to ensure our sizing is picked to leave enough space in + * group descriptors for other allocations (such as block groups, + * etc). Picking default sizes which are a multiple of 4 could help + * - block groups are allocated in 2mb and 4mb chunks. + * + * - Likewise, we don't want to starve other nodes of bits on small + * file systems. This can easily be taken care of by limiting our + * default to a reasonable size (256M) on larger cluster sizes. + * + * - Some file systems can't support very large sizes - 4k and 8k in + * particular are limited to less than 128 and 256 megabytes respectively. + * + * The following reference table shows group descriptor and local + * alloc maximums at various cluster sizes (4k blocksize) + * + * csize: 4K group: 126M la: 121M + * csize: 8K group: 252M la: 243M + * csize: 16K group: 504M la: 486M + * csize: 32K group: 1008M la: 972M + * csize: 64K group: 2016M la: 1944M + * csize: 128K group: 4032M la: 3888M + * csize: 256K group: 8064M la: 7776M + * csize: 512K group: 16128M la: 15552M + * csize: 1024K group: 32256M la: 31104M + */ +#define OCFS2_LA_MAX_DEFAULT_MB 256 +#define OCFS2_LA_OLD_DEFAULT 8 +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) +{ + unsigned int la_mb; + unsigned int gd_mb; + unsigned int la_max_mb; + unsigned int megs_per_slot; + struct super_block *sb = osb->sb; + + gd_mb = ocfs2_clusters_to_megabytes(osb->sb, + 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat)); + + /* + * This takes care of files systems with very small group + * descriptors - 512 byte blocksize at cluster sizes lower + * than 16K and also 1k blocksize with 4k cluster size. 
+ */ + if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192) + || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) + return OCFS2_LA_OLD_DEFAULT; + + /* + * Leave enough room for some block groups and make the final + * value we work from a multiple of 4. + */ + gd_mb -= 16; + gd_mb &= 0xFFFFFFFB; + + la_mb = gd_mb; + + /* + * Keep window sizes down to a reasonable default + */ + if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { + /* + * Some clustersize / blocksize combinations will have + * given us a larger than OCFS2_LA_MAX_DEFAULT_MB + * default size, but get poor distribution when + * limited to exactly 256 megabytes. + * + * As an example, 16K clustersize at 4K blocksize + * gives us a cluster group size of 504M. Paring the + * local alloc size down to 256 however, would give us + * only one window and around 200MB left in the + * cluster group. Instead, find the first size below + * 256 which would give us an even distribution. + * + * Larger cluster group sizes actually work out pretty + * well when pared to 256, so we don't have to do this + * for any group that fits more than two + * OCFS2_LA_MAX_DEFAULT_MB windows. + */ + if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB)) + la_mb = 256; + else { + unsigned int gd_mult = gd_mb; + + while (gd_mult > 256) + gd_mult = gd_mult >> 1; + + la_mb = gd_mult; + } + } + + megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots; + megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot); + /* Too many nodes, too few disk clusters. */ + if (megs_per_slot < la_mb) + la_mb = megs_per_slot; + + /* We can't store more bits than we can in a block. */ + la_max_mb = ocfs2_clusters_to_megabytes(osb->sb, + ocfs2_local_alloc_size(sb) * 8); + if (la_mb > la_max_mb) + la_mb = la_max_mb; + + return la_mb; +} + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) +{ + struct super_block *sb = osb->sb; + unsigned int la_default_mb = ocfs2_la_default_mb(osb); + unsigned int la_max_mb; + + la_max_mb = ocfs2_clusters_to_megabytes(sb, + ocfs2_local_alloc_size(sb) * 8); + + trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb); + + if (requested_mb == -1) { + /* No user request - use defaults */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_default_mb); + } else if (requested_mb > la_max_mb) { + /* Request is too big, we give the maximum available */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_max_mb); + } else { + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, requested_mb); + } + + osb->local_alloc_bits = osb->local_alloc_default_bits; +} + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) +{ + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} + +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); +} + +/* + * Tell us whether a given allocation should use the local alloc + * file. 
Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. + */ +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) +{ + int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; + + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + /* la_bits should be at least twice the size (in clusters) of + * a new block group. We want to be sure block group + * allocations go through the local alloc, so allow an + * allocation to take up to half the bitmap. */ + if (bits > (la_bits / 2)) + goto bail; + + ret = 1; +bail: + trace_ocfs2_alloc_should_use_local( + (unsigned long long)bits, osb->local_alloc_state, la_bits, ret); + spin_unlock(&osb->osb_lock); + return ret; +} + +int ocfs2_load_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_dinode *alloc = NULL; + struct buffer_head *alloc_bh = NULL; + u32 num_used; + struct inode *inode = NULL; + struct ocfs2_local_alloc *la; + + if (osb->local_alloc_bits == 0) + goto bail; + + if (osb->local_alloc_bits >= osb->bitmap_cpg) { + mlog(ML_NOTICE, "Requested local alloc window %d is larger " + "than max possible %u. Using defaults.\n", + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + ocfs2_la_default_mb(osb)); + } + + /* read the alloc off disk */ + inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (!(le32_to_cpu(alloc->i_flags) & + (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { + mlog(ML_ERROR, "Invalid local alloc inode, %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = -EINVAL; + goto bail; + } + + if ((la->la_size == 0) || + (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { + mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", + le16_to_cpu(la->la_size)); + status = -EINVAL; + goto bail; + } + + /* do a little verification. */ + num_used = ocfs2_local_alloc_count_bits(alloc); + + /* hopefully the local alloc has always been recovered before + * we load it. */ + if (num_used + || alloc->id1.bitmap1.i_used + || alloc->id1.bitmap1.i_total + || la->la_bm_off) + mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + "found = %u, set = %u, taken = %u, off = %u\n", + num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), + le32_to_cpu(alloc->id1.bitmap1.i_total), + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_state = OCFS2_LA_ENABLED; + +bail: + if (status < 0) + brelse(alloc_bh); + if (inode) + iput(inode); + + trace_ocfs2_load_local_alloc(osb->local_alloc_bits); + + if (status) + mlog_errno(status); + return status; +} + +/* + * return any unused bits to the bitmap and write out a clean + * local_alloc. + * + * local_alloc_bh is optional. If not passed, we will simply use the + * one off osb. 
If you do pass it however, be warned that it *will* be + * returned brelse'd and NULL'd out.*/ +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) +{ + int status; + handle_t *handle; + struct inode *local_alloc_inode = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_dinode *alloc = NULL; + + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + if (osb->local_alloc_state == OCFS2_LA_UNUSED) + goto out; + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto out; + } + + osb->local_alloc_state = OCFS2_LA_DISABLED; + + ocfs2_resmap_uninit(&osb->osb_la_resmap); + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + /* WINDOW_MOVE_CREDITS is a bit heavy... */ + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + handle = NULL; + goto out_unlock; + } + + bh = osb->local_alloc_bh; + alloc = (struct ocfs2_dinode *) bh->b_data; + + alloc_copy = kmalloc(bh->b_size, GFP_NOFS); + if (!alloc_copy) { + status = -ENOMEM; + goto out_commit; + } + memcpy(alloc_copy, alloc, bh->b_size); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + ocfs2_clear_local_alloc(alloc); + ocfs2_journal_dirty(handle, bh); + + brelse(bh); + osb->local_alloc_bh = NULL; + osb->local_alloc_state = OCFS2_LA_UNUSED; + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + if (local_alloc_inode) + iput(local_alloc_inode); + + if (alloc_copy) + kfree(alloc_copy); +} + +/* + * We want to free the bitmap bits outside of any recovery context as + * we'll need a cluster lock to do so, but we must clear the local + * alloc before giving up the recovered nodes journal. 
To solve this, + * we kmalloc a copy of the local alloc before it's change for the + * caller to process with ocfs2_complete_local_alloc_recovery + */ +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **alloc_copy) +{ + int status = 0; + struct buffer_head *alloc_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_dinode *alloc; + + trace_ocfs2_begin_local_alloc_recovery(slot_num); + + *alloc_copy = NULL; + + inode = ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode->i_mutex); + + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); + if (!(*alloc_copy)) { + status = -ENOMEM; + goto bail; + } + memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + ocfs2_clear_local_alloc(alloc); + + ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); + status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode)); + if (status < 0) + mlog_errno(status); + +bail: + if ((status < 0) && (*alloc_copy)) { + kfree(*alloc_copy); + *alloc_copy = NULL; + } + + brelse(alloc_bh); + + if (inode) { + mutex_unlock(&inode->i_mutex); + iput(inode); + } + + if (status) + mlog_errno(status); + return status; +} + +/* + * Step 2: By now, we've completed the journal recovery, we've stamped + * a clean local alloc on disk and dropped the node out of the + * recovery map. Dlm locks will no longer stall, so lets clear out the + * main bitmap. + */ +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) +{ + int status; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto out_unlock; + } + + /* we want the bitmap change to be recorded on disk asap */ + handle->h_sync = 1; + + status = ocfs2_sync_local_to_main(osb, handle, alloc, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + + brelse(main_bm_bh); + + iput(main_bm_inode); + +out: + if (!status) + ocfs2_init_steal_slots(osb); + if (status) + mlog_errno(status); + return status; +} + +/* + * make sure we've got at least bits_wanted contiguous bits in the + * local alloc. You lose them when you drop i_mutex. + * + * We will add ourselves to the transaction passed in, but may start + * our own in order to shift windows. 
+ */ +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context *ac) +{ + int status; + struct ocfs2_dinode *alloc; + struct inode *local_alloc_inode; + unsigned int free_bits; + + BUG_ON(!ac); + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + mutex_lock(&local_alloc_inode->i_mutex); + + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); + status = -ENOSPC; + goto bail; + } + spin_unlock(&osb->osb_lock); + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + +#ifdef CONFIG_OCFS2_DEBUG_FS + if (le32_to_cpu(alloc->id1.bitmap1.i_used) != + ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_error(osb->sb, "local alloc inode %llu says it has " + "%u free bits, but a count shows %u", + (unsigned long long)le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); + status = -EIO; + goto bail; + } +#endif + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) { + /* uhoh, window change time. */ + status = + ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + ac->ac_inode = local_alloc_inode; + /* We should never use localalloc from another slot */ + ac->ac_alloc_slot = osb->slot_num; + ac->ac_which = OCFS2_AC_USE_LOCAL; + get_bh(osb->local_alloc_bh); + ac->ac_bh = osb->local_alloc_bh; + status = 0; +bail: + if (status < 0 && local_alloc_inode) { + mutex_unlock(&local_alloc_inode->i_mutex); + iput(local_alloc_inode); + } + + trace_ocfs2_reserve_local_alloc_bits( + (unsigned long long)ac->ac_max_block, + bits_wanted, osb->slot_num, status); + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 *bit_off, + u32 *num_bits) +{ + int status, start; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, + ac->ac_resv); + if (start == -1) { + /* TODO: Shouldn't we just BUG here? 
*/ + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + bitmap = la->la_bitmap; + *bit_off = le32_to_cpu(la->la_bm_off) + start; + *num_bits = bits_wanted; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + + while(bits_wanted--) + ocfs2_set_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + if (status) + mlog_errno(status); + return status; +} + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +{ + int i; + u8 *buffer; + u32 count = 0; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + buffer = la->la_bitmap; + for (i = 0; i < le16_to_cpu(la->la_size); i++) + count += hweight8(buffer[i]); + + trace_ocfs2_local_alloc_count_bits(count); + return count; +} + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv) +{ + int numfound, bitoff, left, startoff, lastzero; + int local_resv = 0; + struct ocfs2_alloc_reservation r; + void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; + + if (!alloc->id1.bitmap1.i_total) { + bitoff = -1; + goto bail; + } + + if (!resv) { + local_resv = 1; + ocfs2_resv_init_once(&r); + ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP); + resv = &r; + } + + numfound = *numbits; + if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) { + if (numfound < *numbits) + *numbits = numfound; + goto bail; + } + + /* + * Code error. While reservations are enabled, local + * allocation should _always_ go through them. + */ + BUG_ON(osb->osb_resv_level != 0); + + /* + * Reservations are disabled. Handle this the old way. + */ + + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + + numfound = bitoff = startoff = 0; + lastzero = -1; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == *numbits) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound); + + if (numfound == *numbits) + bitoff = startoff - numfound; + else + bitoff = -1; + +bail: + if (local_resv) + ocfs2_resv_discard(resmap, resv); + + trace_ocfs2_local_alloc_find_clear_bits(*numbits, + le32_to_cpu(alloc->id1.bitmap1.i_total), + bitoff, numfound); + + return bitoff; +} + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) +{ + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + int i; + + alloc->id1.bitmap1.i_total = 0; + alloc->id1.bitmap1.i_used = 0; + la->la_bm_off = 0; + for(i = 0; i < le16_to_cpu(la->la_size); i++) + la->la_bitmap[i] = 0; +} + +#if 0 +/* turn this on and uncomment below to aid debugging window shifts. 
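The fallback scan in ocfs2_local_alloc_find_clear_bits() is a first-fit search for a run of clear bits. A compact userspace version of the same idea, using a plain byte array instead of ocfs2_find_next_zero_bit(), may make the startoff/numfound bookkeeping clearer:

#include <stdint.h>
#include <stdio.h>

/* Find the first run of 'want' clear bits in a bitmap of 'nbits' bits.
 * Returns the starting bit, or -1 if no such run exists. */
static int find_clear_run(const uint8_t *bitmap, unsigned int nbits, unsigned int want)
{
        unsigned int run = 0;

        for (unsigned int i = 0; i < nbits; i++) {
                if (bitmap[i / 8] & (1u << (i % 8))) {
                        run = 0;                  /* hit a used bit, restart the run */
                        continue;
                }
                if (++run == want)
                        return i - want + 1;      /* run ends at i, so it started here */
        }
        return -1;
}

int main(void)
{
        uint8_t map[2] = { 0x0f, 0x00 };          /* bits 0-3 used, 4-15 free */

        printf("%d\n", find_clear_run(map, 16, 6));   /* 4 */
        printf("%d\n", find_clear_run(map, 16, 13));  /* -1: only 12 free bits */
        return 0;
}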
*/ +static void ocfs2_verify_zero_bits(unsigned long *bitmap, + unsigned int start, + unsigned int count) +{ + unsigned int tmp = count; + while(tmp--) { + if (ocfs2_test_bit(start + tmp, bitmap)) { + printk("ocfs2_verify_zero_bits: start = %u, count = " + "%u\n", start, count); + printk("ocfs2_verify_zero_bits: bit %u is set!", + start + tmp); + BUG(); + } + } +} +#endif + +/* + * sync the local alloc to main bitmap. + * + * assumes you've already locked the main bitmap -- the bitmap inode + * passed is used for caching. + */ +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh) +{ + int status = 0; + int bit_off, left, count, start; + u64 la_start_blk; + u64 blkno; + void *bitmap; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + trace_ocfs2_sync_local_to_main( + le32_to_cpu(alloc->id1.bitmap1.i_total), + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + if (!alloc->id1.bitmap1.i_total) { + goto bail; + } + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) == + le32_to_cpu(alloc->id1.bitmap1.i_total)) { + goto bail; + } + + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_bm_off)); + bitmap = la->la_bitmap; + start = count = bit_off = 0; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) + != -1) { + if ((bit_off < left) && (bit_off == start)) { + count++; + start++; + continue; + } + if (count) { + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + start - count); + + trace_ocfs2_sync_local_to_main_free( + count, start - count, + (unsigned long long)la_start_blk, + (unsigned long long)blkno); + + status = ocfs2_release_clusters(handle, + main_bm_inode, + main_bm_bh, blkno, + count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) + break; + count = 1; + start = bit_off + 1; + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. 
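ocfs2_sync_local_to_main() above walks the window looking for runs of bits that were never handed out and releases each run back to the global bitmap. The run-length walk on its own, as a small sketch over made-up bitmap contents:

#include <stdint.h>
#include <stdio.h>

/* Report each maximal run of clear (never used) bits as a (start, count)
 * pair, the same walk that becomes ocfs2_release_clusters() calls above. */
static void for_each_unused_run(const uint8_t *bitmap, unsigned int nbits)
{
        unsigned int start = 0, count = 0;

        for (unsigned int i = 0; i <= nbits; i++) {
                int used = (i == nbits) ||          /* treat end of map as used to flush the last run */
                           (bitmap[i / 8] & (1u << (i % 8)));

                if (!used) {
                        if (!count)
                                start = i;
                        count++;
                } else if (count) {
                        printf("free run: start=%u count=%u\n", start, count);
                        count = 0;
                }
        }
}

int main(void)
{
        uint8_t map[2] = { 0x99, 0xe0 };   /* used bits: 0,3,4,7,13,14,15 */

        for_each_unused_run(map, 16);      /* prints runs 1-2, 5-6, 8-12 */
        return 0;
}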
+ */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh) +{ + int status; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *bitmap_inode = (*ac)->ac_inode; + igrab(*bitmap_inode); + *bitmap_bh = (*ac)->ac_bh; + get_bh(*bitmap_bh); + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +/* + * pass it the bitmap lock in lock_bh if you have it. + */ +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac) +{ + int status = 0; + u32 cluster_off, cluster_count; + struct ocfs2_dinode *alloc = NULL; + struct ocfs2_local_alloc *la; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + trace_ocfs2_local_alloc_new_window( + le32_to_cpu(alloc->id1.bitmap1.i_total), + osb->local_alloc_bits); + + /* Instruct the allocation code to try the most recently used + * cluster group. We'll re-record the group used this pass + * below. */ + ac->ac_last_group = osb->la_last_gd; + + /* we used the generic suballoc reserve function, but we set + * everything up nicely, so there's no reason why we can't use + * the more specific cluster api to claim bits. */ + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, + &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. 
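The throttling policy in ocfs2_recalc_la_window() is a small state machine: halve the window while the global bitmap looks tight, disable it once halving would drop to about 1 MB or below, and let the delayed worker re-enable it after the 30 second interval. A standalone sketch of the transitions, with an 8 MB window of 4 KB clusters assumed for the numbers:

#include <stdio.h>

enum la_state { LA_ENABLED, LA_THROTTLED, LA_DISABLED };
enum la_event { LA_EVENT_SLIDE, LA_EVENT_FRAGMENTED, LA_EVENT_ENOSPC };

struct la {
        enum la_state state;
        unsigned int bits;          /* current window, in clusters */
        unsigned int default_bits;  /* full-size window */
        unsigned int one_mb_bits;   /* 1 MB expressed in clusters */
};

/* Shrink (and eventually disable) the window when the global bitmap is
 * tight; restore the default only on a plain slide once throttling ends. */
static void recalc_window(struct la *la, enum la_event ev)
{
        if (la->state == LA_DISABLED)
                return;

        if (ev == LA_EVENT_ENOSPC || ev == LA_EVENT_FRAGMENTED) {
                unsigned int bits = la->bits >> 1;

                if (bits > la->one_mb_bits) {
                        la->state = LA_THROTTLED;
                        la->bits = bits;
                } else {
                        la->state = LA_DISABLED;
                }
                /* the kernel also arms the 30 s re-enable timer here */
                return;
        }

        if (la->state != LA_THROTTLED)
                la->bits = la->default_bits;
}

int main(void)
{
        struct la la = { LA_ENABLED, 2048, 2048, 256 };

        recalc_window(&la, LA_EVENT_ENOSPC);   /* THROTTLED, 1024 bits */
        recalc_window(&la, LA_EVENT_ENOSPC);   /* THROTTLED, 512 bits */
        recalc_window(&la, LA_EVENT_ENOSPC);   /* DISABLED: 256 is not > 256 */
        printf("state=%d bits=%u\n", la.state, la.bits);
        return 0;
}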
--Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + ac->ac_bits_wanted = osb->local_alloc_default_bits; + status = ocfs2_claim_clusters(handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + osb->la_last_gd = ac->ac_last_group; + + la->la_bm_off = cpu_to_le32(cluster_off); + alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); + /* just in case... In the future when we find space ourselves, + * we don't have to get all contiguous -- but we'll have to + * set all previously used bits in bitmap and update + * la_bits_set before setting the bits in the main bitmap. */ + alloc->id1.bitmap1.i_used = 0; + memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, + le16_to_cpu(la->la_size)); + + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, + OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + + trace_ocfs2_local_alloc_new_window_result( + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(alloc->id1.bitmap1.i_total)); + +bail: + if (status) + mlog_errno(status); + return status; +} + +/* Note that we do *NOT* lock the local alloc inode here as + * it's been locked already for us. */ +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode) +{ + int status = 0; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + handle_t *handle = NULL; + struct ocfs2_dinode *alloc; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_alloc_context *ac = NULL; + + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + + /* This will lock the main bitmap for us. */ + status = ocfs2_local_alloc_reserve_for_window(osb, + &ac, + &main_bm_inode, + &main_bm_bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + /* We want to clear the local alloc before doing anything + * else, so that if we error later during this operation, + * local alloc shutdown won't try to double free main bitmap + * bits. Make a copy so the sync function knows which bits to + * free. 
*/ + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_local_alloc_new_window(osb, handle, ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + atomic_inc(&osb->alloc_stats.moves); + +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (alloc_copy) + kfree(alloc_copy); + + if (ac) + ocfs2_free_alloc_context(ac); + + if (status) + mlog_errno(status); + return status; +} + diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 00000000..1be9b586 --- /dev/null +++ b/fs/ocfs2/localalloc.h @@ -0,0 +1,62 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_LOCALALLOC_H +#define OCFS2_LOCALALLOC_H + +int ocfs2_load_local_alloc(struct ocfs2_super *osb); + +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb); +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb); + +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int node_num, + struct ocfs2_dinode **alloc_copy); + +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); + +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, + u64 bits); + +struct ocfs2_alloc_context; +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context *ac); + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 *bit_off, + u32 *num_bits); + +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + +#endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c new file mode 100644 index 00000000..e57c8040 --- /dev/null +++ b/fs/ocfs2/locks.c @@ -0,0 +1,139 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.c + * + * Userspace file locking support + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "locks.h" + +static int ocfs2_do_flock(struct file *file, struct inode *inode, + int cmd, struct file_lock *fl) +{ + int ret = 0, level = 0, trylock = 0; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + + if (fl->fl_type == F_WRLCK) + level = 1; + if (!IS_SETLKW(cmd)) + trylock = 1; + + mutex_lock(&fp->fp_mutex); + + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level > LKM_NLMODE) { + int old_level = 0; + + if (lockres->l_level == LKM_EXMODE) + old_level = 1; + + if (level == old_level) + goto out; + + /* + * Converting an existing lock is not guaranteed to be + * atomic, so we can get away with simply unlocking + * here and allowing the lock code to try at the new + * level. 
+ */ + + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + + ocfs2_file_unlock(file); + } + + ret = ocfs2_file_lock(file, level, trylock); + if (ret) { + if (ret == -EAGAIN && trylock) + ret = -EWOULDBLOCK; + else + mlog_errno(ret); + goto out; + } + + ret = flock_lock_file_wait(file, fl); + +out: + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) +{ + int ret; + struct ocfs2_file_private *fp = file->private_data; + + mutex_lock(&fp->fp_mutex); + ocfs2_file_unlock(file); + ret = flock_lock_file_wait(file, fl); + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +/* + * Overall flow of ocfs2_flock() was influenced by gfs2_flock(). + */ +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb)) + return flock_lock_file_wait(file, fl); + + if (fl->fl_type == F_UNLCK) + return ocfs2_do_funlock(file, cmd, fl); + else + return ocfs2_do_flock(file, inode, cmd, fl); +} + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h new file mode 100644 index 00000000..496d488b --- /dev/null +++ b/fs/ocfs2/locks.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.h + * + * Function prototypes for Userspace file locking support + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCKS_H +#define OCFS2_LOCKS_H + +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); + +#endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 00000000..3e9393ca --- /dev/null +++ b/fs/ocfs2/mmap.c @@ -0,0 +1,197 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mmap.c + * + * Code to deal with the mess that is clustered mmap. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
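From userspace none of the ocfs2_flock() plumbing is visible: a plain flock(2) call on an ocfs2 file takes a cluster-wide lock unless the filesystem is mounted with localflocks or is a local mount, and a non-blocking request that would wait comes back as EWOULDBLOCK. A minimal example (the path is made up):

#include <sys/file.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/ocfs2/shared.dat", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* LOCK_NB maps to the trylock path above; -EAGAIN becomes EWOULDBLOCK. */
        if (flock(fd, LOCK_EX | LOCK_NB) < 0) {
                perror("flock");
                close(fd);
                return 1;
        }

        /* ... exclusive across every node in the cluster ... */

        flock(fd, LOCK_UN);
        close(fd);
        return 0;
}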
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "aops.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "mmap.h" +#include "super.h" +#include "ocfs2_trace.h" + + +static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +{ + sigset_t oldset; + int ret; + + ocfs2_block_signals(&oldset); + ret = filemap_fault(area, vmf); + ocfs2_unblock_signals(&oldset); + + trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, + area, vmf->page, vmf->pgoff); + return ret; +} + +static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, + struct page *page) +{ + int ret; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + loff_t pos = page_offset(page); + unsigned int len = PAGE_CACHE_SIZE; + pgoff_t last_index; + struct page *locked_page = NULL; + void *fsdata; + loff_t size = i_size_read(inode); + + /* + * Another node might have truncated while we were waiting on + * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. + */ + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { + ret = -EINVAL; + goto out; + } + + /* + * The i_size check above doesn't catch the case where nodes + * truncated and then re-extended the file. We'll re-check the + * page mapping after taking the page lock inside of + * ocfs2_write_begin_nolock(). + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + /* + * the page has been umapped in ocfs2_data_downconvert_worker. + * So return 0 here and let VFS retry. + */ + ret = 0; + goto out; + } + + /* + * Call ocfs2_write_begin() and ocfs2_write_end() to take + * advantage of the allocation code there. We pass a write + * length of the whole page (chopped to i_size) to make sure + * the whole thing is allocated. + * + * Since we know the page is up to date, we don't have to + * worry about ocfs2_write_begin() skipping some buffer reads + * because the "write" would invalidate their data. 
+ */ + if (page->index == last_index) + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; + + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, + &fsdata, di_bh, page); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + BUG_ON(ret != len); + ret = 0; +out: + return ret; +} + +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct buffer_head *di_bh = NULL; + sigset_t oldset; + int ret; + + ocfs2_block_signals(&oldset); + + /* + * The cluster locks taken will block a truncate from another + * node. Taking the data lock will also ensure that we don't + * attempt page truncation as part of a downconvert. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* + * The alloc sem should be enough to serialize with + * ocfs2_truncate_file() changing i_size as well as any thread + * modifying the inode btree. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + +out: + ocfs2_unblock_signals(&oldset); + if (ret) + ret = VM_FAULT_SIGBUS; + return ret; +} + +static const struct vm_operations_struct ocfs2_file_vm_ops = { + .fault = ocfs2_fault, + .page_mkwrite = ocfs2_page_mkwrite, +}; + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0, lock_level = 0; + + ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode, + file->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); +out: + vma->vm_ops = &ocfs2_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h new file mode 100644 index 00000000..1274ee0f --- /dev/null +++ b/fs/ocfs2/mmap.h @@ -0,0 +1,6 @@ +#ifndef OCFS2_MMAP_H +#define OCFS2_MMAP_H + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); + +#endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c new file mode 100644 index 00000000..cd942702 --- /dev/null +++ b/fs/ocfs2/move_extents.c @@ -0,0 +1,1152 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.c + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
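The length chopping for the final page is the only arithmetic in __ocfs2_page_mkwrite() that is easy to misread. A tiny sketch of the same computation, assuming a 4 KB page size and a non-zero i_size (the zero case is rejected earlier in the function):

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096u   /* assumed page size for the example */

/* Bytes of the last page that fall inside i_size; for a power-of-two page
 * size, ((size - 1) & ~PAGE_CACHE_MASK) + 1 reduces to this modulo. */
static unsigned int last_page_len(uint64_t size)
{
        return (unsigned int)((size - 1) % PAGE_CACHE_SIZE + 1);
}

int main(void)
{
        printf("%u\n", last_page_len(10000)); /* 10000-byte file: last page holds 1808 */
        printf("%u\n", last_page_len(8192));  /* aligned size: full 4096-byte page */
        return 0;
}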
+ */ +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "ocfs2_ioctl.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "uptodate.h" +#include "super.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "sysfile.h" +#include "suballoc.h" +#include "refcounttree.h" +#include "move_extents.h" + +struct ocfs2_move_extents_context { + struct inode *inode; + struct file *file; + int auto_defrag; + int partial; + int credits; + u32 new_phys_cpos; + u32 clusters_moved; + u64 refcount_loc; + struct ocfs2_move_extents *range; + struct ocfs2_extent_tree et; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; +}; + +static int __ocfs2_move_extent(handle_t *handle, + struct ocfs2_move_extents_context *context, + u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, + int ext_flags) +{ + int ret = 0, index; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec *rec, replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); + u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); + + ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + p_cpos, new_p_cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, + new_p_cpos)); + + path = ocfs2_new_path_from_et(&context->et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + rec = &el->l_recs[index]; + + BUG_ON(ext_flags != rec->e_flags); + /* + * after moving/defraging to new location, the extent is not going + * to be refcounted anymore. + */ + replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + context->et.et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_split_extent(handle, &context->et, path, index, + &replace_rec, context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, context->et.et_root_bh); + + context->new_phys_cpos = new_p_cpos; + + /* + * need I to append truncate log for old clusters? + */ + if (old_blkno) { + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + old_blkno), + len, context->meta_ac, + &context->dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + old_blkno, len); + } + +out: + return ret; +} + +/* + * lock allocators, and reserving appropriate number of bits for + * meta blocks and data clusters. + * + * in some cases, we don't need to reserve clusters, just let data_ac + * be NULL. 
+ */ +static int ocfs2_lock_allocators_move_extents(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_move, + u32 extents_to_split, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int extra_blocks, + int *credits) +{ + int ret, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, + clusters_to_move + 2); + + mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", + extra_blocks, clusters_to_move, *credits); +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +/* + * Using one journal handle to guarantee the data consistency in case + * crash happens anywhere. + * + * XXX: defrag can end up with finishing partial extent as requested, + * due to not enough contiguous clusters can be found in allocator. + */ +static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, partial = context->partial; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 new_phys_cpos, new_len; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + *len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, + &context->meta_ac, + &context->data_ac, + extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * should be using allocation reservation strategy there? 
+ * + * if (context->data_ac) + * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + */ + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock_mutex; + } + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_mutex; + } + + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, + &new_phys_cpos, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * allowing partial extent moving is kind of 'pros and cons', it makes + * whole defragmentation less likely to fail, on the contrary, the bad + * thing is it may make the fs even more fragmented after moving, let + * userspace make a good decision here. + */ + if (new_len != *len) { + mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); + if (!partial) { + context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; + ret = -ENOSPC; + goto out_commit; + } + } + + mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, + phys_cpos, new_phys_cpos); + + ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, + new_phys_cpos, ext_flags); + if (ret) + mlog_errno(ret); + + if (partial && (new_len != *len)) + *len = new_len; + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock_mutex: + mutex_unlock(&tl_inode->i_mutex); + + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * find the victim alloc group, where #blkno fits. + */ +static int ocfs2_find_victim_alloc_group(struct inode *inode, + u64 vict_blkno, + int type, int slot, + int *vict_bit, + struct buffer_head **ret_bh) +{ + int ret, i, bits_per_unit = 0; + u64 blkno; + char namebuf[40]; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ac_bh = NULL, *gd_bh = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *rec; + struct ocfs2_dinode *ac_dinode; + struct ocfs2_group_desc *bg; + + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; + cl = &(ac_dinode->id2.i_chain); + rec = &(cl->cl_recs[0]); + + if (type == GLOBAL_BITMAP_SYSTEM_INODE) + bits_per_unit = osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits; + /* + * 'vict_blkno' was out of the valid range. 
+ */ + if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || + (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + bits_per_unit))) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + + rec = &(cl->cl_recs[i]); + if (!rec) + continue; + + bg = NULL; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (gd_bh) { + brelse(gd_bh); + gd_bh = NULL; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + + le16_to_cpu(bg->bg_bits))) { + + *ret_bh = gd_bh; + *vict_bit = (vict_blkno - blkno) >> + bits_per_unit; + mlog(0, "find the victim group: #%llu, " + "total_bits: %u, vict_bit: %u\n", + blkno, le16_to_cpu(bg->bg_bits), + *vict_bit); + goto out; + } + + } while (le64_to_cpu(bg->bg_next_group)); + } + + ret = -EINVAL; +out: + brelse(ac_bh); + + /* + * caller has to release the gd_bh properly. + */ + return ret; +} + +/* + * XXX: helper to validate and adjust moving goal. + */ +static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, + struct ocfs2_move_extents *range) +{ + int ret, goal_bit = 0; + + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *bg = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int c_to_b = 1 << (osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits); + + /* + * make goal become cluster aligned. + */ + range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, + range->me_goal); + /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* + * validate goal sits within global_bitmap, and return the victim + * group desc + */ + ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) + goto out; + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + /* + * movement is not gonna cross two groups. + */ + if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < + range->me_len) { + ret = -EINVAL; + goto out; + } + /* + * more exact validations/adjustments will be performed later during + * moving operation for each extent range. + */ + mlog(0, "extents get ready to be moved to #%llu block\n", + range->me_goal); + +out: + brelse(gd_bh); + + return ret; +} + +static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, + int *goal_bit, u32 move_len, u32 max_hop, + u32 *phys_cpos) +{ + int i, used, last_free_bits = 0, base_bit = *goal_bit; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + + for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { + + used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); + if (used) { + /* + * we even tried searching the free chunk by jumping + * a 'max_hop' distance, but still failed. 
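The victim-bit math in ocfs2_find_victim_alloc_group() just scales a block offset inside the group into cluster units, since each bit of a global-bitmap group covers one cluster. A worked example, assuming 4 KB blocks and 32 KB clusters:

#include <stdint.h>
#include <stdio.h>

/* Each bit covers 2^(clustersize_bits - blocksize_bits) blocks. */
static unsigned int victim_bit(uint64_t vict_blkno, uint64_t group_blkno,
                               unsigned int clustersize_bits,
                               unsigned int blocksize_bits)
{
        unsigned int bits_per_unit = clustersize_bits - blocksize_bits;

        return (unsigned int)((vict_blkno - group_blkno) >> bits_per_unit);
}

int main(void)
{
        /* 4 KB blocks (2^12), 32 KB clusters (2^15): 8 blocks per bit,
         * so a block 40 blocks past the group start lands in bit 5. */
        printf("%u\n", victim_bit(1000040, 1000000, 15, 12));
        return 0;
}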
+ */ + if ((i - base_bit) > max_hop) { + *phys_cpos = 0; + break; + } + + if (last_free_bits) + last_free_bits = 0; + + continue; + } else + last_free_bits++; + + if (last_free_bits == move_len) { + *goal_bit = i; + *phys_cpos = base_cpos + i; + break; + } + } + + mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); +} + +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = + (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + ocfs2_journal_dirty(handle, di_bh); + +out: + return ret; +} + +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, + num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. 
num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + while (num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + ocfs2_journal_dirty(handle, group_bh); + +bail: + return status; +} + +static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, + u32 len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, goal_bit = 0; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *gb_inode = NULL; + struct buffer_head *gb_bh = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, + context->range->me_threshold); + u64 phys_blkno, new_phys_blkno; + + phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, + &context->meta_ac, + NULL, extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * need to count 2 extra credits for global_bitmap inode and + * group descriptor. + */ + credits += OCFS2_INODE_UPDATE_CREDITS + 1; + + /* + * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() + * logic, while we still need to lock the global_bitmap. + */ + gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + ret = -EIO; + goto out; + } + + mutex_lock(&gb_inode->i_mutex); + + ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock_gb_mutex; + } + + mutex_lock(&tl_inode->i_mutex); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_tl_inode; + } + + new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); + ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * probe the victim cluster group to find a proper + * region to fit wanted movement, it even will perfrom + * a best-effort attempt by compromising to a threshold + * around the goal. 
+ */ + ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, + new_phys_cpos); + if (!new_phys_cpos) { + ret = -ENOSPC; + goto out_commit; + } + + ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, + *new_phys_cpos, ext_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, + le16_to_cpu(gd->bg_chain)); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, + goal_bit, len); + if (ret) + mlog_errno(ret); + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + brelse(gd_bh); + +out_unlock_tl_inode: + mutex_unlock(&tl_inode->i_mutex); + + ocfs2_inode_unlock(gb_inode, 1); +out_unlock_gb_mutex: + mutex_unlock(&gb_inode->i_mutex); + brelse(gb_bh); + iput(gb_inode); + +out: + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * Helper to calculate the defraging length in one run according to threshold. + */ +static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, + u32 threshold, int *skip) +{ + if ((*alloc_size + *len_defraged) < threshold) { + /* + * proceed defragmentation until we meet the thresh + */ + *len_defraged += *alloc_size; + } else if (*len_defraged == 0) { + /* + * XXX: skip a large extent. + */ + *skip = 1; + } else { + /* + * split this extent to coalesce with former pieces as + * to reach the threshold. + * + * we're done here with one cycle of defragmentation + * in a size of 'thresh', resetting 'len_defraged' + * forces a new defragmentation. + */ + *alloc_size = threshold - *len_defraged; + *len_defraged = 0; + } +} + +static int __ocfs2_move_extents_range(struct buffer_head *di_bh, + struct ocfs2_move_extents_context *context) +{ + int ret = 0, flags, do_defrag, skip = 0; + u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; + u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; + + struct inode *inode = context->inode; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_move_extents *range = context->range; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((inode->i_size == 0) || (range->me_len == 0)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + context->refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&context->dealloc); + + /* + * TO-DO XXX: + * + * - xattr extents. + */ + + do_defrag = context->auto_defrag; + + /* + * extents moving happens in unit of clusters, for the sake + * of simplicity, we may ignore two clusters where 'byte_start' + * and 'byte_start + len' were within. 
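ocfs2_calc_extent_defrag_len() decides, per extent, whether to keep coalescing, skip, or split so that one defrag pass ends exactly at the threshold. A standalone walk over a made-up extent list shows all three branches, using a threshold of 256 clusters, i.e. 1 MB at a 4 KB cluster size:

#include <stdio.h>

/* Either keep coalescing, skip an already-large extent, or trim the current
 * one so the coalesced piece ends exactly at the threshold. */
static void calc_defrag_len(unsigned int *alloc_size, unsigned int *len_defraged,
                            unsigned int threshold, int *skip)
{
        if (*alloc_size + *len_defraged < threshold) {
                *len_defraged += *alloc_size;            /* keep going */
        } else if (*len_defraged == 0) {
                *skip = 1;                               /* lone extent is big enough already */
        } else {
                *alloc_size = threshold - *len_defraged; /* finish this chunk exactly */
                *len_defraged = 0;
        }
}

int main(void)
{
        unsigned int extents[] = { 300, 30, 50, 100, 300 };
        unsigned int len_defraged = 0;

        for (int i = 0; i < 5; i++) {
                unsigned int alloc_size = extents[i];
                int skip = 0;

                calc_defrag_len(&alloc_size, &len_defraged, 256, &skip);
                if (skip)
                        printf("extent %u: skipped (already >= threshold)\n", extents[i]);
                else
                        printf("extent %u: move %u clusters, coalesced so far %u\n",
                               extents[i], alloc_size, len_defraged);
        }
        return 0;
}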
+ */ + move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); + len_to_move = (range->me_start + range->me_len) >> + osb->s_clustersize_bits; + if (len_to_move >= move_start) + len_to_move -= move_start; + else + len_to_move = 0; + + if (do_defrag) { + defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; + if (defrag_thresh <= 1) + goto done; + } else + new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + range->me_goal); + + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " + "thresh: %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range->me_start, + (unsigned long long)range->me_len, + move_start, len_to_move, defrag_thresh); + + cpos = move_start; + while (len_to_move) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, + &flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > len_to_move) + alloc_size = len_to_move; + + /* + * XXX: how to deal with a hole: + * + * - skip the hole of course + * - force a new defragmentation + */ + if (!phys_cpos) { + if (do_defrag) + len_defraged = 0; + + goto next; + } + + if (do_defrag) { + ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, + defrag_thresh, &skip); + /* + * skip large extents + */ + if (skip) { + skip = 0; + goto next; + } + + mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " + "alloc_size: %u, len_defraged: %u\n", + cpos, phys_cpos, alloc_size, len_defraged); + + ret = ocfs2_defrag_extent(context, cpos, phys_cpos, + &alloc_size, flags); + } else { + ret = ocfs2_move_extent(context, cpos, phys_cpos, + &new_phys_cpos, alloc_size, + flags); + + new_phys_cpos += alloc_size; + } + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + context->clusters_moved += alloc_size; +next: + cpos += alloc_size; + len_to_move -= alloc_size; + } + +done: + range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; + +out: + range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, + context->clusters_moved); + range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, + context->new_phys_cpos); + + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + + return ret; +} + +static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) +{ + int status; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_dinode *di; + struct buffer_head *di_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!inode) + return -ENOENT; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes from other nodes + */ + status = ocfs2_rw_lock(inode, 1); + if (status) { + mlog_errno(status); + goto out; + } + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status) { + mlog_errno(status); + goto out_rw_unlock; + } + + /* + * rememer ip_xattr_sem also needs to be held if necessary + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + status = __ocfs2_move_extents_range(di_bh, context); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (status) { + mlog_errno(status); + goto out_inode_unlock; + } + + /* + * We update ctime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_inode_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status) { + mlog_errno(status); + goto out_commit; + } + + di = (struct 
ocfs2_dinode *)di_bh->b_data; + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); +out: + mutex_unlock(&inode->i_mutex); + + return status; +} + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) +{ + int status; + + struct inode *inode = filp->f_path.dentry->d_inode; + struct ocfs2_move_extents range; + struct ocfs2_move_extents_context *context = NULL; + + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + + if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) + goto out; + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + status = -EPERM; + goto out; + } + + context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); + if (!context) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + + context->inode = inode; + context->file = filp; + + if (argp) { + if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, + sizeof(range))) { + status = -EFAULT; + goto out; + } + } else { + status = -EINVAL; + goto out; + } + + if (range.me_start > i_size_read(inode)) + goto out; + + if (range.me_start + range.me_len > i_size_read(inode)) + range.me_len = i_size_read(inode) - range.me_start; + + context->range = ⦥ + + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { + context->auto_defrag = 1; + /* + * ok, the default theshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + + if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) + context->partial = 1; + } else { + /* + * first best-effort attempt to validate and adjust the goal + * (physical address in block), while it can't guarantee later + * operation can succeed all the time since global_bitmap may + * change a bit over time. + */ + + status = ocfs2_validate_and_adjust_move_goal(inode, &range); + if (status) + goto out; + } + + status = ocfs2_move_extents(context); + if (status) + mlog_errno(status); +out: + /* + * movement/defragmentation may end up being partially completed, + * that's the reason why we need to return userspace the finished + * length and new_offset even if failure happens somewhere. + */ + if (argp) { + if (copy_to_user((struct ocfs2_move_extents *)argp, &range, + sizeof(range))) + status = -EFAULT; + } + + kfree(context); + + mnt_drop_write(filp->f_path.mnt); + + return status; +} diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h new file mode 100644 index 00000000..4e143e81 --- /dev/null +++ b/fs/ocfs2/move_extents.h @@ -0,0 +1,22 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.h + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef OCFS2_MOVE_EXTENTS_H +#define OCFS2_MOVE_EXTENTS_H + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); + +#endif /* OCFS2_MOVE_EXTENTS_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c new file mode 100644 index 00000000..e5d738cd --- /dev/null +++ b/fs/ocfs2/namei.c @@ -0,0 +1,2501 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.c + * + * Create and rename file, directory, symlinks + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linux Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "acl.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac); + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup); + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + char *name, + struct ocfs2_dir_lookup_result *lookup, + struct inode *orphan_dir_inode); + +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + const char *symname); + +/* An orphan dir name is an 8 byte value, printed as a hex string */ +#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) + +static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int status; + u64 blkno; + struct inode *inode = NULL; + struct dentry *ret; + struct ocfs2_inode_info *oi; + + trace_ocfs2_lookup(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, 0); + + if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { + ret = ERR_PTR(-ENAMETOOLONG); + goto bail; + } + + status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ret = 
ERR_PTR(status); + goto bail; + } + + status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name, + dentry->d_name.len, &blkno); + if (status < 0) + goto bail_add; + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0); + if (IS_ERR(inode)) { + ret = ERR_PTR(-EACCES); + goto bail_unlock; + } + + oi = OCFS2_I(inode); + /* Clear any orphaned state... If we were able to look up the + * inode from a directory, it certainly can't be orphaned. We + * might have the bad state from a node which intended to + * orphan this inode but crashed before it could commit the + * unlink. */ + spin_lock(&oi->ip_lock); + oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + +bail_add: + ret = d_splice_alias(inode, dentry); + + if (inode) { + /* + * If d_splice_alias() finds a DCACHE_DISCONNECTED + * dentry, it will d_move() it on top of ourse. The + * return value will indicate this however, so in + * those cases, we switch them around for the locking + * code. + * + * NOTE: This dentry already has ->d_op set from + * ocfs2_get_parent() and ocfs2_get_dentry() + */ + if (ret) + dentry = ret; + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + ret = ERR_PTR(status); + goto bail_unlock; + } + } else + ocfs2_dentry_attach_gen(dentry); + +bail_unlock: + /* Don't drop the cluster lock until *after* the d_add -- + * unlink on another node will message us to remove that + * dentry under this lock so otherwise we can race this with + * the downconvert thread and have a stale dentry. */ + ocfs2_inode_unlock(dir, 0); + +bail: + + trace_ocfs2_lookup_ret(ret); + + return ret; +} + +static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) { + mlog(ML_ERROR, "new_inode failed!\n"); + return NULL; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + return inode; +} + +static int ocfs2_mknod(struct inode *dir, + struct dentry *dentry, + int mode, + dev_t dev) +{ + int status = 0; + struct buffer_head *parent_fe_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb; + struct ocfs2_dinode *dirfe; + struct buffer_head *new_fe_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + int want_clusters = 0; + int want_meta = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; + + trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long)dev, mode); + + dquot_initialize(dir); + + /* get our super block */ + osb = OCFS2_SB(dir->i_sb); + + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) { + status = -EMLINK; + goto leave; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!ocfs2_read_links_count(dirfe)) { + /* can't make a file in a deleted directory. 
*/ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto leave; + } + } + + /* calculate meta data/clusters for setting security and acl xattr */ + status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, + &si, &want_clusters, + &xattr_credits, &want_meta); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* Reserve a cluster if creating an extent based directory. */ + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { + want_clusters += 1; + + /* Dir indexing requires extra space as well */ + if (ocfs2_supports_indexed_dirs(osb)) + want_meta++; + } + + status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), + xattr_credits)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + + status = dquot_alloc_inode(inode); + if (status) + goto leave; + did_quota_inode = 1; + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, inode, dev, + &new_fe_bh, parent_fe_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(mode)) { + status = ocfs2_fill_new_dir(osb, handle, dir, inode, + new_fe_bh, data_ac, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(dir), + parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + ocfs2_add_links_count(dirfe, 1); + ocfs2_journal_dirty(handle, parent_fe_bh); + inc_nlink(dir); + } + + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + meta_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. 
+ */ + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + insert_inode_hash(inode); + d_instantiate(dentry, inode); + status = 0; +leave: + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); + + brelse(new_fe_bh); + brelse(parent_fe_bh); + kfree(si.name); + kfree(si.value); + + ocfs2_free_dir_lookup_result(&lookup); + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + /* + * We should call iput after the i_mutex of the bitmap been + * unlocked in ocfs2_free_alloc_context, or the + * ocfs2_delete_inode will mutex_lock again. + */ + if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + + if (status) + mlog_errno(status); + + return status; +} + +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u16 feat; + + *new_fe_bh = NULL; + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); + OCFS2_I(inode)->ip_blkno = fe_blkno; + spin_lock(&osb->osb_lock); + inode->i_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + *new_fe_bh = sb_getblk(osb->sb, fe_blkno); + if (!*new_fe_bh) { + status = -EIO; + mlog_errno(status); + goto leave; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; + memset(fe, 0, osb->sb->s_blocksize); + + fe->i_generation = cpu_to_le32(inode->i_generation); + fe->i_fs_generation = cpu_to_le32(osb->fs_generation); + fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); + + ocfs2_set_links_count(fe, inode->i_nlink); + + fe->i_last_eb_blk = 0; + strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); + le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_atime = fe->i_ctime = fe->i_mtime = + cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = + cpu_to_le32(CURRENT_TIME.tv_nsec); + fe->i_dtime = 0; + + /* + * If supported, directories start with inline data. If inline + * isn't supported, but indexing is, we start them as indexed. 
+ */ + feat = le16_to_cpu(fe->i_dyn_features); + if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { + fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); + + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); + } else { + fel = &fe->id2.i_list; + fel->l_tree_depth = 0; + fel->l_next_free_rec = 0; + fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); + } + + ocfs2_journal_dirty(handle, *new_fe_bh); + + ocfs2_populate_inode(inode, fe, 1); + ocfs2_ci_set_new(osb, INODE_CACHE(inode)); + if (!ocfs2_mount_local(osb)) { + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + } + + status = 0; /* error in ocfs2_create_new_inode_locks is not + * critical */ + +leave: + if (status < 0) { + if (*new_fe_bh) { + brelse(*new_fe_bh); + *new_fe_bh = NULL; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + +static int ocfs2_mkdir(struct inode *dir, + struct dentry *dentry, + int mode) +{ + int ret; + + trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, + OCFS2_I(dir)->ip_blkno, mode); + ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int ret; + + trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); + ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + + trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + dquot_initialize(dir); + + err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + return err; + } + + if (!dir->i_nlink) { + err = -ENOENT; + goto out; + } + + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (err) + goto out; + + err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (err < 0) { + mlog_errno(err); + goto out; + } + + err = ocfs2_inode_lock(inode, &fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto out; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + if 
(ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) { + err = -EMLINK; + goto out_unlock_inode; + } + + handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + mlog_errno(err); + goto out_unlock_inode; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (err < 0) { + mlog_errno(err); + goto out_commit; + } + + inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + ocfs2_set_links_count(fe, inode->i_nlink); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(handle, fe_bh); + + err = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, + parent_fe_bh, &lookup); + if (err) { + ocfs2_add_links_count(fe, -1); + drop_nlink(inode); + mlog_errno(err); + goto out_commit; + } + + err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (err) { + mlog_errno(err); + goto out_commit; + } + + ihold(inode); + d_instantiate(dentry, inode); + +out_commit: + ocfs2_commit_trans(osb, handle); + ocfs2_unblock_signals(&oldset); +out_unlock_inode: + ocfs2_inode_unlock(inode, 1); + +out: + ocfs2_inode_unlock(dir, 1); + + brelse(fe_bh); + brelse(parent_fe_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + if (err) + mlog_errno(err); + + return err; +} + +/* + * Takes and drops an exclusive lock on the given dentry. This will + * force other nodes to drop it. + */ +static int ocfs2_remote_dentry_delete(struct dentry *dentry) +{ + int ret; + + ret = ocfs2_dentry_lock(dentry, 1); + if (ret) + mlog_errno(ret); + else + ocfs2_dentry_unlock(dentry, 1); + + return ret; +} + +static inline int inode_is_unlinkable(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink == 2) + return 1; + return 0; + } + + if (inode->i_nlink == 1) + return 1; + return 0; +} + +static int ocfs2_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + int child_locked = 0; + struct inode *inode = dentry->d_inode; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + u64 blkno; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_node_bh = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + + trace_ocfs2_unlink(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + dquot_initialize(dir); + + BUG_ON(dentry->d_parent->d_inode != dir); + + if (inode == osb->root_inode) + return -EPERM; + + status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1, + OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, dir, + &lookup); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (OCFS2_I(inode)->ip_blkno != blkno) { + status = -ENOENT; + + trace_ocfs2_unlink_noent( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, + OCFS2_I(inode)->ip_flags); + goto leave; + } + + status = ocfs2_inode_lock(inode, &fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + 
mlog_errno(status); + goto leave; + } + child_locked = 1; + + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) { + status = -ENOTEMPTY; + goto leave; + } + } + + status = ocfs2_remote_dentry_delete(dentry); + if (status < 0) { + /* This remote delete should succeed under all normal + * circumstances. */ + mlog_errno(status); + goto leave; + } + + if (inode_is_unlinkable(inode)) { + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(inode)->ip_blkno, + orphan_name, &orphan_insert); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (inode_is_unlinkable(inode)) { + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* delete the name from the parent dir */ + status = ocfs2_delete_entry(handle, dir, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) + drop_nlink(inode); + drop_nlink(inode); + ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_journal_dirty(handle, fe_bh); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + drop_nlink(dir); + + status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh); + if (status < 0) { + mlog_errno(status); + if (S_ISDIR(inode->i_mode)) + inc_nlink(dir); + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + brelse(fe_bh); + brelse(parent_node_bh); + + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + + return status; +} + +/* + * The only place this should be used is rename! + * if they have the same id, then the 1st one is the only one locked. + */ +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2) +{ + int status; + struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); + struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); + struct buffer_head **tmpbh; + struct inode *tmpinode; + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* we always want to lock the one with the lower lockid first. 
*/ + if (oi1->ip_blkno != oi2->ip_blkno) { + if (oi1->ip_blkno < oi2->ip_blkno) { + /* switch id1 and id2 around */ + tmpbh = bh2; + bh2 = bh1; + bh1 = tmpbh; + + tmpinode = inode2; + inode2 = inode1; + inode1 = tmpinode; + } + /* lock id2 */ + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_RENAME1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + } + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2); + if (status < 0) { + /* + * An error return must mean that no cluster locks + * were held on function exit. + */ + if (oi1->ip_blkno != oi2->ip_blkno) { + ocfs2_inode_unlock(inode2, 1); + brelse(*bh2); + *bh2 = NULL; + } + + if (status != -ENOENT) + mlog_errno(status); + } + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) +{ + ocfs2_inode_unlock(inode1, 1); + + if (inode1 != inode2) + ocfs2_inode_unlock(inode2, 1); +} + +static int ocfs2_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; + int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct inode *orphan_dir = NULL; + struct ocfs2_dinode *newfe = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *newfe_bh = NULL; + struct buffer_head *old_inode_bh = NULL; + struct ocfs2_super *osb = NULL; + u64 newfe_blkno, old_de_ino; + handle_t *handle = NULL; + struct buffer_head *old_dir_bh = NULL; + struct buffer_head *new_dir_bh = NULL; + nlink_t old_dir_nlink = old_dir->i_nlink; + struct ocfs2_dinode *old_di; + struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; + struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; + struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct ocfs2_dir_lookup_result target_insert = { NULL, }; + + /* At some point it might be nice to break this function up a + * bit. */ + + trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry, + old_dentry->d_name.len, old_dentry->d_name.name, + new_dentry->d_name.len, new_dentry->d_name.name); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + osb = OCFS2_SB(old_dir->i_sb); + + if (new_inode) { + if (!igrab(new_inode)) + BUG(); + } + + /* Assume a directory hierarchy thusly: + * a/b/c + * a/d + * a,b,c, and d are all directories. + * + * from cwd of 'a' on both nodes: + * node1: mv b/c d + * node2: mv d b/c + * + * And that's why, just like the VFS, we need a file system + * rename lock. */ + if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) { + status = ocfs2_rename_lock(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + rename_lock = 1; + } + + /* if old and new are the same, this'll just do one lock. 
*/ + status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, + &new_dir_bh, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + parents_locked = 1; + + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!new_dir_bh) { + if (old_dir_bh) { + new_dir_bh = old_dir_bh; + get_bh(new_dir_bh); + } else { + mlog(ML_ERROR, "no old_dir_bh!\n"); + status = -EIO; + goto bail; + } + } + + /* + * Aside from allowing a meta data update, the locking here + * also ensures that the downconvert thread on other nodes + * won't have to concurrently downconvert the inode and the + * dentry locks. + */ + status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1, + OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + old_child_locked = 1; + + status = ocfs2_remote_dentry_delete(old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(old_inode->i_mode)) { + u64 old_inode_parent; + + update_dot_dot = 1; + status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, + old_inode, + &old_inode_dot_dot_res); + if (status) { + status = -EIO; + goto bail; + } + + if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) { + status = -EIO; + goto bail; + } + + if (!new_inode && new_dir != old_dir && + new_dir->i_nlink >= ocfs2_link_max(osb)) { + status = -EMLINK; + goto bail; + } + } + + status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de_ino); + if (status) { + status = -ENOENT; + goto bail; + } + + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) { + status = -ENOENT; + goto bail; + } + + /* check if the target already exists (in which case we need + * to delete it */ + status = ocfs2_find_files_on_disk(new_dentry->d_name.name, + new_dentry->d_name.len, + &newfe_blkno, new_dir, + &target_lookup_res); + /* The only error we allow here is -ENOENT because the new + * file not existing is perfectly valid. */ + if ((status < 0) && (status != -ENOENT)) { + /* If we cannot find the file specified we should just */ + /* return the error... */ + mlog_errno(status); + goto bail; + } + if (status == 0) + target_exists = 1; + + if (!target_exists && new_inode) { + /* + * Target was unlinked by another node while we were + * waiting to get to ocfs2_rename(). There isn't + * anything we can do here to help the situation, so + * bubble up the appropriate error. + */ + status = -ENOENT; + goto bail; + } + + /* In case we need to overwrite an existing file, we blow it + * away first */ + if (target_exists) { + /* VFS didn't think there existed an inode here, but + * someone else in the cluster must have raced our + * rename to create one. Today we error cleanly, in + * the future we should consider calling iget to build + * a new struct inode for this entry. 
*/ + if (!new_inode) { + status = -EACCES; + + trace_ocfs2_rename_target_exists(new_dentry->d_name.len, + new_dentry->d_name.name); + goto bail; + } + + if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { + status = -EACCES; + + trace_ocfs2_rename_disagree( + (unsigned long long)OCFS2_I(new_inode)->ip_blkno, + (unsigned long long)newfe_blkno, + OCFS2_I(new_inode)->ip_flags); + goto bail; + } + + status = ocfs2_inode_lock(new_inode, &newfe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + new_child_locked = 1; + + status = ocfs2_remote_dentry_delete(new_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + newfe = (struct ocfs2_dinode *) newfe_bh->b_data; + + trace_ocfs2_rename_over_existing( + (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? + (unsigned long long)newfe_bh->b_blocknr : 0ULL); + + if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(new_inode)->ip_blkno, + orphan_name, &orphan_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + } else { + BUG_ON(new_dentry->d_parent->d_inode != new_dir); + + status = ocfs2_check_dir_for_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, + new_dentry->d_name.name, + new_dentry->d_name.len, + &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (target_exists) { + if (S_ISDIR(new_inode->i_mode)) { + if (new_inode->i_nlink != 2 || + !ocfs2_empty_dir(new_inode)) { + status = -ENOTEMPTY; + goto bail; + } + } + status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode), + newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode) || + (ocfs2_read_links_count(newfe) == 1)) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* change the dirent to point to the correct inode */ + status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, + old_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + new_dir->i_version++; + + if (S_ISDIR(new_inode->i_mode)) + ocfs2_set_links_count(newfe, 0); + else + ocfs2_add_links_count(newfe, -1); + ocfs2_journal_dirty(handle, newfe_bh); + } else { + /* if the name was not found in new_dir, add it now */ + status = ocfs2_add_entry(handle, new_dentry, old_inode, + OCFS2_I(old_inode)->ip_blkno, + new_dir_bh, &target_insert); + } + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), + old_inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status >= 0) { + old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; + + old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); + old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(handle, old_inode_bh); + } else + mlog_errno(status); + + /* + * Now that the name has been added to new_dir, remove the old name. + * + * We don't keep any directory entry context around until now + * because the insert might have changed the type of directory + * we're dealing with. 
+ */ + status = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, old_dir, + &old_entry_lookup); + if (status) + goto bail; + + status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + + if (update_dot_dot) { + status = ocfs2_update_entry(old_inode, handle, + &old_inode_dot_dot_res, new_dir); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + inc_nlink(new_dir); + mark_inode_dirty(new_dir); + } + } + mark_inode_dirty(old_dir); + ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh); + if (new_inode) { + mark_inode_dirty(new_inode); + ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh); + } + + if (old_dir != new_dir) { + /* Keep the same times on both directories.*/ + new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + + /* + * This will also pick up the i_nlink change from the + * block above. + */ + ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh); + } + + if (old_dir_nlink != old_dir->i_nlink) { + if (!old_dir_bh) { + mlog(ML_ERROR, "need to change nlink for old dir " + "%llu from %d to %d but bh is NULL!\n", + (unsigned long long)OCFS2_I(old_dir)->ip_blkno, + (int)old_dir_nlink, old_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access_di(handle, + INODE_CACHE(old_dir), + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) old_dir_bh->b_data; + ocfs2_set_links_count(fe, old_dir->i_nlink); + ocfs2_journal_dirty(handle, old_dir_bh); + } + } + ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); + status = 0; +bail: + if (rename_lock) + ocfs2_rename_unlock(osb); + + if (handle) + ocfs2_commit_trans(osb, handle); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if (new_inode) + sync_mapping_buffers(old_inode->i_mapping); + + if (new_inode) + iput(new_inode); + + ocfs2_free_dir_lookup_result(&target_lookup_res); + ocfs2_free_dir_lookup_result(&old_entry_lookup); + ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res); + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&target_insert); + + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + + if (status) + mlog_errno(status); + + return status; +} + +/* + * we expect i_size = strlen(symname). Copy symname into the file + * data, including the null terminator. + */ +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + const char *symname) +{ + struct buffer_head **bhs = NULL; + const char *c; + struct super_block *sb = osb->sb; + u64 p_blkno, p_blocks; + int virtual, blocks, status, i, bytes_left; + + bytes_left = i_size_read(inode) + 1; + /* we can't trust i_blocks because we're actually going to + * write i_size + 1 bytes. */ + blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks, + i_size_read(inode), blocks); + + /* Sanity check -- make sure we're going to fit. 
*/ + if (bytes_left > + ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, + NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* links can never be larger than one cluster so we know this + * is all going to be contiguous, but do a sanity check + * anyway. */ + if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + virtual = 0; + while(bytes_left > 0) { + c = &symname[virtual * sb->s_blocksize]; + + bhs[virtual] = sb_getblk(sb, p_blkno); + if (!bhs[virtual]) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), + bhs[virtual]); + + status = ocfs2_journal_access(handle, INODE_CACHE(inode), + bhs[virtual], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[virtual]->b_data, 0, sb->s_blocksize); + + memcpy(bhs[virtual]->b_data, c, + (bytes_left > sb->s_blocksize) ? sb->s_blocksize : + bytes_left); + + ocfs2_journal_dirty(handle, bhs[virtual]); + + virtual++; + p_blkno++; + bytes_left -= sb->s_blocksize; + } + + status = 0; +bail: + + if (bhs) { + for(i = 0; i < blocks; i++) + brelse(bhs[i]); + kfree(bhs); + } + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int status, l, credits; + u64 newsize; + struct ocfs2_super *osb = NULL; + struct inode *inode = NULL; + struct super_block *sb; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_dinode *dirfe; + handle_t *handle = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota = 0, did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; + + trace_ocfs2_symlink_begin(dir, dentry, symname, + dentry->d_name.len, dentry->d_name.name); + + dquot_initialize(dir); + + sb = dir->i_sb; + osb = OCFS2_SB(sb); + + l = strlen(symname) + 1; + + credits = ocfs2_calc_symlink_credits(sb); + + /* lock the parent directory */ + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!ocfs2_read_links_count(dirfe)) { + /* can't make a file in a deleted directory. 
*/ + status = -ENOENT; + goto bail; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto bail; + } + } + + /* calculate meta data/clusters for setting security xattr */ + if (si.enable) { + status = ocfs2_calc_security_init(dir, &si, &want_clusters, + &xattr_credits, &xattr_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, credits + xattr_credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + + status = dquot_alloc_inode(inode); + if (status) + goto bail; + did_quota_inode = 1; + + trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + inode->i_mode); + + status = ocfs2_mknod_locked(osb, dir, inode, + 0, &new_fe_bh, parent_fe_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; + inode->i_rdev = 0; + newsize = l - 1; + if (l > ocfs2_fast_symlink_chars(sb)) { + u32 offset = 0; + + inode->i_op = &ocfs2_symlink_inode_operations; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) + goto bail; + did_quota = 1; + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); + if (status < 0) { + if (status != -ENOSPC && status != -EINTR) { + mlog(ML_ERROR, + "Failed to extend file to %llu\n", + (unsigned long long)newsize); + mlog_errno(status); + status = -ENOSPC; + } + goto bail; + } + i_size_write(inode, newsize); + inode->i_blocks = ocfs2_inode_sector_count(inode); + } else { + inode->i_op = &ocfs2_fast_symlink_inode_operations; + memcpy((char *) fe->id2.i_symlink, symname, l); + i_size_write(inode, newsize); + inode->i_blocks = 0; + } + + status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!ocfs2_inode_is_fast_symlink(inode)) { + status = ocfs2_create_symlink_data(osb, handle, inode, + symname); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* + * Do this before adding the entry to the directory. 
We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + insert_inode_hash(inode); + d_instantiate(dentry, inode); +bail: + if (status < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); + + brelse(new_fe_bh); + brelse(parent_fe_bh); + kfree(si.name); + kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + + if (status) + mlog_errno(status); + + return status; +} + +static int ocfs2_blkno_stringify(u64 blkno, char *name) +{ + int status, namelen; + + namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", + (long long)blkno); + if (namelen <= 0) { + if (namelen) + status = namelen; + else + status = -EINVAL; + mlog_errno(status); + goto bail; + } + if (namelen != OCFS2_ORPHAN_NAMELEN) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + trace_ocfs2_blkno_stringify(blkno, name, namelen); + + status = 0; +bail: + if (status < 0) + mlog_errno(status); + return status; +} + +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) +{ + struct inode *orphan_dir_inode; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + ret = -ENOENT; + mlog_errno(ret); + return ret; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; + } + + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; + + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. 
+ * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + *ret_orphan_dir = orphan_dir_inode; + +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + } + + if (ret) + mlog_errno(ret); + return ret; +} + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + char *name, + struct ocfs2_dir_lookup_result *lookup, + struct inode *orphan_dir_inode) +{ + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + struct ocfs2_dinode *orphan_fe; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + + trace_ocfs2_orphan_add_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* we're a cluster, and nlink can change on disk from + * underneath us... */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, 1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + ocfs2_journal_dirty(handle, orphan_dir_bh); + + status = __ocfs2_add_entry(handle, orphan_dir_inode, name, + OCFS2_ORPHAN_NAMELEN, inode, + OCFS2_I(inode)->ip_blkno, + orphan_dir_bh, lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + + ocfs2_journal_dirty(handle, fe_bh); + + trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + osb->slot_num); + +leave: + brelse(orphan_dir_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* unlike orphan_add, we expect the orphan dir to already be locked here. 
*/ +int ocfs2_orphan_del(struct ocfs2_super *osb, + handle_t *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh) +{ + char name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dinode *orphan_fe; + int status = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + trace_ocfs2_orphan_del( + (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, + name, OCFS2_ORPHAN_NAMELEN); + + /* find it's spot in the orphan directory */ + status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + &lookup); + if (status) { + mlog_errno(status); + goto leave; + } + + /* remove it from the orphan directory */ + status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* do the i_nlink dance! :) */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + ocfs2_journal_dirty(handle, orphan_dir_bh); + +leave: + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + return status; +} + +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. 
+ */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return 0; +} + +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode) +{ + int status, did_quota_inode = 0; + struct inode *inode = NULL; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *parent_di_bh = NULL; + struct buffer_head *new_di_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = dquot_alloc_inode(inode); + if (status) + goto leave; + did_quota_inode = 1; + + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + inode->i_nlink = 0; + /* do the real work now. 
*/ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + di = (struct ocfs2_dinode *)new_di_bh->b_data; + status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* get open lock so that only nodes can't remove it from orphan dir. */ + status = ocfs2_open_lock(inode); + if (status < 0) + mlog_errno(status); + + insert_inode_hash(inode); +leave: + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if ((status < 0) && inode) { + clear_nlink(inode); + iput(inode); + } + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + brelse(new_di_bh); + + if (!status) + *new_inode = inode; + + ocfs2_free_dir_lookup_result(&orphan_insert); + + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + return status; +} + +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + int status = 0; + struct buffer_head *parent_di_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *dir_di, *di; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry, + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + if (!dir_di->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. 
*/ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto leave; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + goto leave; + } + + status = ocfs2_read_inode_block(inode, &di_bh); + if (status < 0) { + mlog_errno(status); + goto orphan_unlock; + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto orphan_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_orphaned_slot = 0; + inode->i_nlink = 1; + ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_journal_dirty(handle, di_bh); + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_di_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto out_commit; + } + + d_instantiate(dentry, inode); + status = 0; +out_commit: + ocfs2_commit_trans(osb, handle); +orphan_unlock: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); +leave: + + ocfs2_inode_unlock(dir, 1); + + brelse(di_bh); + brelse(parent_di_bh); + brelse(orphan_dir_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + + return status; +} + +const struct inode_operations ocfs2_dir_iops = { + .create = ocfs2_create, + .lookup = ocfs2_lookup, + .link = ocfs2_link, + .unlink = ocfs2_unlink, + .rmdir = ocfs2_unlink, + .symlink = ocfs2_symlink, + .mkdir = ocfs2_mkdir, + .mknod = ocfs2_mknod, + .rename = ocfs2_rename, + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h new file mode 100644 index 00000000..e5d059d4 --- /dev/null +++ b/fs/ocfs2/namei.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_NAMEI_H +#define OCFS2_NAMEI_H + +extern const struct inode_operations ocfs2_dir_iops; + +struct dentry *ocfs2_get_parent(struct dentry *child); + +int ocfs2_orphan_del(struct ocfs2_super *osb, + handle_t *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh); +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode); +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *new_inode, + struct dentry *new_dentry); + +#endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 00000000..dfb313bd --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -0,0 +1,109 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs1_fs_compat.h + * + * OCFS1 volume header definitions. OCFS2 creates valid but unmountable + * OCFS1 volume headers on the first two sectors of an OCFS2 volume. + * This allows an OCFS1 volume to see the partition and cleanly fail to + * mount it. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS1_FS_COMPAT_H +#define _OCFS1_FS_COMPAT_H + +#define OCFS1_MAX_VOL_SIGNATURE_LEN 128 +#define OCFS1_MAX_MOUNT_POINT_LEN 128 +#define OCFS1_MAX_VOL_ID_LENGTH 16 +#define OCFS1_MAX_VOL_LABEL_LEN 64 +#define OCFS1_MAX_CLUSTER_NAME_LEN 64 + +#define OCFS1_MAJOR_VERSION (2) +#define OCFS1_MINOR_VERSION (0) +#define OCFS1_VOLUME_SIGNATURE "OracleCFS" + +/* + * OCFS1 superblock. Lives at sector 0. + */ +struct ocfs1_vol_disk_hdr +{ +/*00*/ __u32 minor_version; + __u32 major_version; +/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; +/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; +/*108*/ __u64 serial_num; +/*110*/ __u64 device_size; + __u64 start_off; +/*120*/ __u64 bitmap_off; + __u64 publ_off; +/*130*/ __u64 vote_off; + __u64 root_bitmap_off; +/*140*/ __u64 data_start_off; + __u64 root_bitmap_size; +/*150*/ __u64 root_off; + __u64 root_size; +/*160*/ __u64 cluster_size; + __u64 num_nodes; +/*170*/ __u64 num_clusters; + __u64 dir_node_size; +/*180*/ __u64 file_node_size; + __u64 internal_off; +/*190*/ __u64 node_cfg_off; + __u64 node_cfg_size; +/*1A0*/ __u64 new_cfg_off; + __u32 prot_bits; + __s32 excl_mount; +/*1B0*/ +}; + + +struct ocfs1_disk_lock +{ +/*00*/ __u32 curr_master; + __u8 file_lock; + __u8 compat_pad[3]; /* Not in original definition. 
Used to + make the already existing alignment + explicit */ + __u64 last_write_time; +/*10*/ __u64 last_read_time; + __u32 writer_node_num; + __u32 reader_node_num; +/*20*/ __u64 oin_node_map; + __u64 dlock_seq_num; +/*30*/ +}; + +/* + * OCFS1 volume label. Lives at sector 1. + */ +struct ocfs1_vol_label +{ +/*00*/ struct ocfs1_disk_lock disk_lock; +/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; +/*70*/ __u16 label_len; +/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; +/*82*/ __u16 vol_id_len; +/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; +/*A4*/ __u16 cluster_name_len; +/*A6*/ +}; + + +#endif /* _OCFS1_FS_COMPAT_H */ + diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h new file mode 100644 index 00000000..40928585 --- /dev/null +++ b/fs/ocfs2/ocfs2.h @@ -0,0 +1,853 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2.h + * + * Defines macros and structures used in OCFS2 + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_H +#define OCFS2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* For union ocfs2_dlm_lksb */ +#include "stackglue.h" + +#include "ocfs2_fs.h" +#include "ocfs2_lockid.h" +#include "ocfs2_ioctl.h" + +/* For struct ocfs2_blockcheck_stats */ +#include "blockcheck.h" + +#include "reservations.h" + +/* Caching of metadata buffers */ + +/* Most user visible OCFS2 inodes will have very few pieces of + * metadata, but larger files (including bitmaps, etc) must be taken + * into account when designing an access scheme. We allow a small + * amount of inlined blocks to be stored on an array and grow the + * structure into a rb tree when necessary. */ +#define OCFS2_CACHE_INFO_MAX_ARRAY 2 + +/* Flags for ocfs2_caching_info */ + +enum ocfs2_caching_info_flags { + /* Indicates that the metadata cache is using the inline array */ + OCFS2_CACHE_FL_INLINE = 1<<1, +}; + +struct ocfs2_caching_operations; +struct ocfs2_caching_info { + /* + * The parent structure provides the locks, but because the + * parent structure can differ, it provides locking operations + * to struct ocfs2_caching_info. + */ + const struct ocfs2_caching_operations *ci_ops; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ci_created_trans; + /* last transaction we were a part of. */ + unsigned long ci_last_trans; + + /* Cache structures */ + unsigned int ci_flags; + unsigned int ci_num_cached; + union { + sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; + struct rb_root ci_tree; + } ci_cache; +}; +/* + * Need this prototype here instead of in uptodate.h because journal.h + * uses it. 
+ */ +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); + +/* this limits us to 256 nodes + * if we need more, we can do a kmalloc for the map */ +#define OCFS2_NODE_MAP_MAX_NODES 256 +struct ocfs2_node_map { + u16 num_nodes; + unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; +}; + +enum ocfs2_ast_action { + OCFS2_AST_INVALID = 0, + OCFS2_AST_ATTACH, + OCFS2_AST_CONVERT, + OCFS2_AST_DOWNCONVERT, +}; + +/* actions for an unlockast function to take. */ +enum ocfs2_unlock_action { + OCFS2_UNLOCK_INVALID = 0, + OCFS2_UNLOCK_CANCEL_CONVERT, + OCFS2_UNLOCK_DROP_LOCK, +}; + +/* ocfs2_lock_res->l_flags flags. */ +#define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ +#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) +#define OCFS2_LOCK_REFRESHING (0x00000020) +#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization + * for shutdown paths */ +#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track + * when to skip queueing + * a lock because it's + * about to be + * dropped. */ +#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ +#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ +#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a + call to dlm_lock. Only + exists with BUSY set. */ +#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread + * from downconverting + * before the upconvert + * has completed */ + +struct ocfs2_lock_res_ops; + +typedef void (*ocfs2_lock_callback)(int status, unsigned long data); + +#ifdef CONFIG_OCFS2_FS_STATS +struct ocfs2_lock_stats { + u64 ls_total; /* Total wait in NSEC */ + u32 ls_gets; /* Num acquires */ + u32 ls_fail; /* Num failed acquires */ + + /* Storing max wait in usecs saves 24 bytes per inode */ + u32 ls_max; /* Max wait in USEC */ +}; +#endif + +struct ocfs2_lock_res { + void *l_priv; + struct ocfs2_lock_res_ops *l_ops; + + + struct list_head l_blocked_list; + struct list_head l_mask_waiters; + + unsigned long l_flags; + char l_name[OCFS2_LOCK_ID_MAX_LEN]; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + signed char l_level; + signed char l_requested; + signed char l_blocking; + + /* Data packed - type enum ocfs2_lock_type */ + unsigned char l_type; + + /* used from AST/BAST funcs. 
*/ + /* Data packed - enum type ocfs2_ast_action */ + unsigned char l_action; + /* Data packed - enum type ocfs2_unlock_action */ + unsigned char l_unlock_action; + unsigned int l_pending_gen; + + spinlock_t l_lock; + + struct ocfs2_dlm_lksb l_lksb; + + wait_queue_head_t l_event; + + struct list_head l_debug_list; + +#ifdef CONFIG_OCFS2_FS_STATS + struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ + u32 l_lock_refresh; /* Disk refreshes */ + struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map l_lockdep_map; +#endif +}; + +enum ocfs2_orphan_scan_state { + ORPHAN_SCAN_ACTIVE, + ORPHAN_SCAN_INACTIVE +}; + +struct ocfs2_orphan_scan { + struct mutex os_lock; + struct ocfs2_super *os_osb; + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ + struct delayed_work os_orphan_scan_work; + struct timespec os_scantime; /* time this node ran the scan */ + u32 os_count; /* tracks node specific scans */ + u32 os_seqno; /* tracks cluster wide scans */ + atomic_t os_state; /* ACTIVE or INACTIVE */ +}; + +struct ocfs2_dlm_debug { + struct kref d_refcnt; + struct dentry *d_locking_state; + struct list_head d_lockres_tracking; +}; + +enum ocfs2_vol_state +{ + VOLUME_INIT = 0, + VOLUME_MOUNTED, + VOLUME_MOUNTED_QUOTAS, + VOLUME_DISMOUNTED, + VOLUME_DISABLED +}; + +struct ocfs2_alloc_stats +{ + atomic_t moves; + atomic_t local_data; + atomic_t bitmap_data; + atomic_t bg_allocs; + atomic_t bg_extends; +}; + +enum ocfs2_local_alloc_state +{ + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. 
*/ +}; + +enum ocfs2_mount_options +{ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ + OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ + OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ + OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ + OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ + OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT + writes */ + OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ +}; + +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 + +struct ocfs2_journal; +struct ocfs2_slot_info; +struct ocfs2_recovery_map; +struct ocfs2_replay_map; +struct ocfs2_quota_recovery; +struct ocfs2_dentry_lock; +struct ocfs2_super +{ + struct task_struct *commit_task; + struct super_block *sb; + struct inode *root_inode; + struct inode *sys_root_inode; + struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; + struct inode **local_system_inodes; + + struct ocfs2_slot_info *slot_info; + + u32 *slot_recovery_generations; + + spinlock_t node_map_lock; + + u64 root_blkno; + u64 system_dir_blkno; + u64 bitmap_blkno; + u32 bitmap_cpg; + u8 *uuid; + char *uuid_str; + u32 uuid_hash; + u8 *vol_label; + u64 first_cluster_group_blkno; + u32 fs_generation; + + u32 s_feature_compat; + u32 s_feature_incompat; + u32 s_feature_ro_compat; + + /* Protects s_next_generation, osb_flags and s_inode_steal_slot. + * Could protect more on osb as it's very short lived. + */ + spinlock_t osb_lock; + u32 s_next_generation; + unsigned long osb_flags; + s16 s_inode_steal_slot; + s16 s_meta_steal_slot; + atomic_t s_num_inodes_stolen; + atomic_t s_num_meta_stolen; + + unsigned long s_mount_opt; + unsigned int s_atime_quantum; + + unsigned int max_slots; + unsigned int node_num; + int slot_num; + int preferred_slot; + int s_sectsize_bits; + int s_clustersize; + int s_clustersize_bits; + unsigned int s_xattr_inline_size; + + atomic_t vol_state; + struct mutex recovery_lock; + struct ocfs2_recovery_map *recovery_map; + struct ocfs2_replay_map *replay_map; + struct task_struct *recovery_thread_task; + int disable_recovery; + wait_queue_head_t checkpoint_event; + atomic_t needs_checkpoint; + struct ocfs2_journal *journal; + unsigned long osb_commit_interval; + + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + /* osb_clusters_at_boot can become stale! Do not trust it to + * be up to date. 
*/ + unsigned int osb_clusters_at_boot; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + + struct buffer_head *local_alloc_bh; + + u64 la_last_gd; + + struct ocfs2_reservation_map osb_la_resmap; + + unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; + + /* Next three fields are for local node slot recovery during + * mount. */ + int dirty; + struct ocfs2_dinode *local_alloc_copy; + struct ocfs2_quota_recovery *quota_rec; + + struct ocfs2_blockcheck_stats osb_ecc_stats; + struct ocfs2_alloc_stats alloc_stats; + char dev_str[20]; /* "major,minor" of the device */ + + u8 osb_stackflags; + + char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + struct ocfs2_cluster_connection *cconn; + struct ocfs2_lock_res osb_super_lockres; + struct ocfs2_lock_res osb_rename_lockres; + struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_dlm_debug *osb_dlm_debug; + + struct dentry *osb_debug_root; + struct dentry *osb_ctxt; + + wait_queue_head_t recovery_event; + + spinlock_t dc_task_lock; + struct task_struct *dc_task; + wait_queue_head_t dc_event; + unsigned long dc_wake_sequence; + unsigned long dc_work_sequence; + + /* + * Any thread can add locks to the list, but the downconvert + * thread is the only one allowed to remove locks. Any change + * to this rule requires updating + * ocfs2_downconvert_thread_do_work(). + */ + struct list_head blocked_lock_list; + unsigned long blocked_lock_count; + + /* List of dentry locks to release. Anyone can add locks to + * the list, ocfs2_wq processes the list */ + struct ocfs2_dentry_lock *dentry_lock_list; + struct work_struct dentry_lock_work; + + wait_queue_head_t osb_mount_event; + + /* Truncate log info */ + struct inode *osb_tl_inode; + struct buffer_head *osb_tl_bh; + struct delayed_work osb_truncate_log_wq; + /* + * How many clusters in our truncate log. + * It must be protected by osb_tl_inode->i_mutex. + */ + unsigned int truncated_clusters; + + struct ocfs2_node_map osb_recovering_orphan_dirs; + unsigned int *osb_orphan_wipes; + wait_queue_head_t osb_wipe_event; + + struct ocfs2_orphan_scan osb_orphan_scan; + + /* used to protect metaecc calculation check of xattr. */ + spinlock_t osb_xattr_lock; + + unsigned int osb_dx_mask; + u32 osb_dx_seed[4]; + + /* the group we used to allocate inodes. */ + u64 osb_inode_alloc_group; + + /* rb tree root for refcount lock. 
*/ + struct rb_root osb_rf_lock_tree; + struct ocfs2_refcount_tree *osb_ref_tree_lru; +}; + +#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) + +/* Useful typedef for passing around journal access functions */ +typedef int (*ocfs2_journal_access_func)(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); + +static inline int ocfs2_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) + return 0; + return 1; +} + +static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) + return 1; + return 0; +} + +static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) +{ + /* + * Support for sparse files is a pre-requisite + */ + if (!ocfs2_sparse_alloc(osb)) + return 0; + + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) + return 1; + return 0; +} + +static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) + return 1; + return 0; +} + +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + +static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) + return 1; + return 0; +} + +static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + return 1; + return 0; +} + +static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + return 1; + return 0; +} + +static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) +{ + if (ocfs2_supports_indexed_dirs(osb)) + return OCFS2_DX_LINK_MAX; + return OCFS2_LINK_MAX; +} + +static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) +{ + u32 nlink = le16_to_cpu(di->i_links_count); + u32 hi = le16_to_cpu(di->i_links_count_hi); + + if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + + return nlink; +} + +static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) +{ + u16 lo, hi; + + lo = nlink; + hi = nlink >> OCFS2_LINKS_HI_SHIFT; + + di->i_links_count = cpu_to_le16(lo); + di->i_links_count_hi = cpu_to_le16(hi); +} + +static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) +{ + u32 links = ocfs2_read_links_count(di); + + links += n; + + ocfs2_set_links_count(di, links); +} + +static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + return 1; + return 0; +} + +/* set / clear functions because cluster events can make these happen + * in parallel so we want the transitions to be atomic. this also + * means that any future flags osb_flags must be protected by spinlock + * too! 
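The ocfs2_read_links_count()/ocfs2_set_links_count() pair above splits a 32-bit link count across the 16-bit i_links_count (low word) and i_links_count_hi (high word) fields. A standalone sketch of the packing, assuming OCFS2_LINKS_HI_SHIFT is 16 and that the high word is honoured (it only is when OCFS2_INDEXED_DIR_FL is set):

#include <stdio.h>
#include <stdint.h>

#define LINKS_HI_SHIFT 16	/* assumed value of OCFS2_LINKS_HI_SHIFT */

int main(void)
{
	uint32_t nlink = 70000;			/* larger than a 16-bit count */
	uint16_t lo = (uint16_t)nlink;		/* goes to i_links_count */
	uint16_t hi = nlink >> LINKS_HI_SHIFT;	/* goes to i_links_count_hi */
	uint32_t back = lo | ((uint32_t)hi << LINKS_HI_SHIFT);

	printf("lo=%u hi=%u reassembled=%u\n", lo, hi, back);	/* 4464 1 70000 */
	return 0;
}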
*/ +static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags |= flag; + spin_unlock(&osb->osb_lock); +} + + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + +static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, + int hard) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); + if (hard) + osb->osb_flags |= OCFS2_OSB_HARD_RO; + else + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); +} + +static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_HARD_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | + OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); +} + +static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) +{ + return ocfs2_o2cb_stack(osb) && + (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); +} + +static inline int ocfs2_mount_local(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); +} + +static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); +} + + +#define OCFS2_IS_VALID_DINODE(ptr) \ + (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) + +#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ + (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) + +#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ + (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) + + +#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ + (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) + +#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ + (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_ROOT(ptr) \ + (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_LEAF(ptr) \ + (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) + +#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ + (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) + +static inline unsigned long ino_from_blkno(struct super_block *sb, + u64 blkno) +{ + return (unsigned long)(blkno & (u64)ULONG_MAX); +} + +static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, + u32 clusters) +{ + int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u64)clusters << c_to_b_bits; +} + +static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits 
= OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u32)(blocks >> b_to_c_bits); +} + +static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + bytes += OCFS2_SB(sb)->s_clustersize - 1; + /* OCFS2 just cannot have enough clusters to overflow this */ + clusters = (unsigned int)(bytes >> cl_bits); + + return clusters; +} + +static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, + u64 bytes) +{ + bytes += sb->s_blocksize - 1; + return bytes >> sb->s_blocksize_bits; +} + +static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, + u32 clusters) +{ + return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; +} + +static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, + u64 blocks) +{ + int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; + unsigned int clusters; + + clusters = ocfs2_blocks_to_clusters(sb, blocks); + return (u64)clusters << bits; +} + +static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = ocfs2_clusters_for_bytes(sb, bytes); + return (u64)clusters << cl_bits; +} + +static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, + u64 bytes) +{ + u64 blocks; + + blocks = ocfs2_blocks_for_bytes(sb, bytes); + return blocks << sb->s_blocksize_bits; +} + +static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) +{ + return (unsigned long)((bytes + 511) >> 9); +} + +static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, + unsigned long pg_index) +{ + u32 clusters = pg_index; + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + + if (unlikely(PAGE_CACHE_SHIFT > cbits)) + clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); + else if (PAGE_CACHE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + + return clusters; +} + +/* + * Find the 1st page index which covers the given clusters. 
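The cluster/block/byte conversion helpers above rely only on the block and cluster sizes being powers of two, so every conversion is a pure shift. A worked sketch with a hypothetical 4 KB block size and 32 KB cluster size (blocksize_bits = 12, clustersize_bits = 15):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int blocksize_bits = 12;		/* hypothetical 4 KB blocks */
	int clustersize_bits = 15;		/* hypothetical 32 KB clusters */
	int c_to_b_bits = clustersize_bits - blocksize_bits;	/* 3 */

	uint32_t clusters = 5;
	uint64_t blocks = (uint64_t)clusters << c_to_b_bits;	 /* 40 blocks */
	uint64_t bytes = (uint64_t)clusters << clustersize_bits; /* 163840 bytes */

	/* Rounding up, as ocfs2_clusters_for_bytes() does. */
	uint64_t b = 100000;
	uint32_t needed = (uint32_t)((b + (1ULL << clustersize_bits) - 1)
				     >> clustersize_bits);	/* 4 clusters */

	printf("%llu blocks, %llu bytes, %u clusters for %llu bytes\n",
	       (unsigned long long)blocks, (unsigned long long)bytes,
	       needed, (unsigned long long)b);
	return 0;
}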
+ */ +static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, + u32 clusters) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + pgoff_t index = clusters; + + if (PAGE_CACHE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); + } else if (PAGE_CACHE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + } + + return index; +} + +static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int pages_per_cluster = 1; + + if (PAGE_CACHE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + + return pages_per_cluster; +} + +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + +static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, + unsigned int clusters) +{ + return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + +static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) +{ + __test_and_set_bit_le(bit, bitmap); +} +#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) + +static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) +{ + __test_and_clear_bit_le(bit, bitmap); +} +#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) + +#define ocfs2_test_bit test_bit_le +#define ocfs2_find_next_zero_bit find_next_zero_bit_le +#define ocfs2_find_next_bit find_next_bit_le +#endif /* OCFS2_H */ + diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 00000000..938387a1 --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h @@ -0,0 +1,1640 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_fs.h + * + * On-disk structures for OCFS2. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_FS_H +#define _OCFS2_FS_H + +/* Version */ +#define OCFS2_MAJOR_REV_LEVEL 0 +#define OCFS2_MINOR_REV_LEVEL 90 + +/* + * An OCFS2 volume starts this way: + * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. + * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. + * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. + * + * All other structures are found from the superblock information. + * + * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a + * blocksize of 2K, it is 4096 bytes into disk. + */ +#define OCFS2_SUPER_BLOCK_BLKNO 2 + +/* + * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could + * grow if needed. 
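With the same hypothetical geometry (4 KB pages, 32 KB clusters), the page and megabyte helpers defined just above also reduce to small shifts; a brief sketch:

#include <stdio.h>

int main(void)
{
	int page_shift = 12;	/* hypothetical PAGE_CACHE_SHIFT, 4 KB pages */
	int cbits = 15;		/* hypothetical clustersize_bits, 32 KB clusters */

	unsigned int pages_per_cluster = 1u << (cbits - page_shift);	      /* 8 */
	unsigned int cluster_of_page_20 = 20u >> (cbits - page_shift);	      /* 2 */
	unsigned long first_page_of_cluster_3 = 3ul << (cbits - page_shift); /* 24 */
	unsigned int clusters_per_megabyte = 1u << (20 - cbits);	      /* 32 */

	printf("%u pages/cluster, page 20 -> cluster %u, "
	       "cluster 3 -> page %lu, %u clusters/MB\n",
	       pages_per_cluster, cluster_of_page_20,
	       first_page_of_cluster_3, clusters_per_megabyte);
	return 0;
}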
+ */ +#define OCFS2_MIN_CLUSTERSIZE 4096 +#define OCFS2_MAX_CLUSTERSIZE 1048576 + +/* + * Blocks cannot be bigger than clusters, so the maximum blocksize is the + * minimum cluster size. + */ +#define OCFS2_MIN_BLOCKSIZE 512 +#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE + +/* Filesystem magic number */ +#define OCFS2_SUPER_MAGIC 0x7461636f + +/* Object signatures */ +#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" +#define OCFS2_INODE_SIGNATURE "INODE01" +#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" +#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" +#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" +#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" +#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" +#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" + +/* Compatibility flags */ +#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_compat & (mask) ) +#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) +#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) +#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat |= (mask) +#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat |= (mask) +#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat |= (mask) +#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat &= ~(mask) +#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) +#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat &= ~(mask) + +#define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \ + | OCFS2_FEATURE_COMPAT_JBD2_SB) +#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ + | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ + | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ + | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR \ + | OCFS2_FEATURE_INCOMPAT_META_ECC \ + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) +#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ + | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + +/* + * Heartbeat-only devices are missing journals and other files. The + * filesystem driver can't load them, but the library can. Never put + * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. + */ +#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 + +/* + * tunefs sets this incompat flag before starting the resize and clears it + * at the end. This flag protects users from inadvertently mounting the fs + * after an aborted run without fsck-ing. + */ +#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 + +/* Used to denote a non-clustered volume */ +#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 + +/* Support for sparse allocation in b-trees */ +#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 + +/* + * Tunefs sets this incompat flag before starting an operation which + * would require cleanup on abort. This is done to protect users from + * inadvertently mounting the fs after an aborted run without + * fsck-ing. + * + * s_tunefs_flags on the super block describes precisely which + * operations were in progress. 
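The feature macros and *_SUPP masks above support the usual compat/ro-compat/incompat convention: an unknown incompat bit makes the volume unmountable, while an unknown ro-compat bit only rules out read-write mounts. A minimal sketch of that policy; check_features() and the two masks below are illustrative placeholders, not part of this header:

#include <stdio.h>
#include <stdint.h>

#define SUPP_INCOMPAT	0x0010u		/* placeholder supported incompat bits */
#define SUPP_RO_COMPAT	0x0001u		/* placeholder supported ro-compat bits */

/* Returns 0 if mountable; *ro_only is set when only a read-only mount is safe. */
static int check_features(uint32_t incompat, uint32_t ro_compat, int *ro_only)
{
	if (incompat & ~SUPP_INCOMPAT)
		return -1;			/* unknown incompat bit: refuse */
	*ro_only = (ro_compat & ~SUPP_RO_COMPAT) ? 1 : 0;
	return 0;
}

int main(void)
{
	int ro;

	printf("%d\n", check_features(0x0010, 0x0004, &ro));	/* 0, ro == 1 */
	printf("%d\n", check_features(0x8000, 0x0000, &ro));	/* -1 */
	return 0;
}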
+ */ +#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020 + +/* Support for data packed into inode blocks */ +#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 + +/* + * Support for alternate, userspace cluster stacks. If set, the superblock + * field s_cluster_info contains a tag for the alternate stack in use as + * well as the name of the cluster being joined. + * mount.ocfs2 must pass in a matching stack name. + * + * If not set, the classic stack will be used. This is compatbile with + * all older versions. + */ +#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 + +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + +/* Support for indexed directores */ +#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 + +/* Metadata checksum and error correction */ +#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 + +/* Refcount tree support */ +#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 + +/* Discontigous block groups */ +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 + +/* + * Incompat bit to indicate useable clusterinfo with stackflags for all + * cluster stacks (userspace adnd o2cb). If this bit is set, + * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. + */ +#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 + +/* + * backup superblock flag is used to indicate that this volume + * has backup superblocks. + */ +#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 + +/* + * The filesystem will correctly handle journal feature bits. + */ +#define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002 + +/* + * Unwritten extents support. + */ +#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 + +/* + * Maintain quota information for this filesystem + */ +#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 +#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + +/* The byte offset of the first backup block will be 1G. + * The following will be 4G, 16G, 64G, 256G and 1T. + */ +#define OCFS2_BACKUP_SB_START 1 << 30 + +/* the max backup superblock nums */ +#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 + +/* + * Flags on ocfs2_super_block.s_tunefs_flags + */ +#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */ + +/* + * Flags on ocfs2_dinode.i_flags + */ +#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ +#define OCFS2_UNUSED2_FL (0x00000002) +#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ +#define OCFS2_UNUSED3_FL (0x00000008) +/* System inode flags */ +#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ +#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ +#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ +#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ +#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ +#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ +#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ +#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ +#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ + +/* + * Flags on ocfs2_dinode.i_dyn_features + * + * These can change much more often than i_flags. When adding flags, + * keep in mind that i_dyn_features is only 16 bits wide. 
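The backup superblock offsets noted above (1G, then 4G, 16G, 64G, 256G and 1T) follow from multiplying OCFS2_BACKUP_SB_START by four for each successive copy, up to OCFS2_MAX_BACKUP_SUPERBLOCKS; a short sketch:

#include <stdio.h>
#include <stdint.h>

#define BACKUP_SB_START		(1ULL << 30)	/* 1 GB, as above */
#define MAX_BACKUP_SUPERBLOCKS	6

int main(void)
{
	uint64_t off = BACKUP_SB_START;
	int i;

	for (i = 0; i < MAX_BACKUP_SUPERBLOCKS; i++, off *= 4)
		printf("backup sb %d at %llu GB\n", i,
		       (unsigned long long)(off >> 30));  /* 1 4 16 64 256 1024 */
	return 0;
}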
+ */ +#define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */ +#define OCFS2_HAS_XATTR_FL (0x0002) +#define OCFS2_INLINE_XATTR_FL (0x0004) +#define OCFS2_INDEXED_DIR_FL (0x0008) +#define OCFS2_HAS_REFCOUNT_FL (0x0010) + +/* Inode attributes, keep in sync with EXT2 */ +#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define OCFS2_DIRTY_FL FS_DIRTY_FL +#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +/* + * Extent record flags (e_node.leaf.flags) + */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ +#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference + * counted in an associated + * refcount tree */ + +/* + * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) + */ +#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ + +/* + * superblock s_state flags + */ +#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ + +/* Limit of space in ocfs2_dir_entry */ +#define OCFS2_MAX_FILENAME_LEN 255 + +/* Maximum slots on an ocfs2 file system */ +#define OCFS2_MAX_SLOTS 255 + +/* Slot map indicator for an empty slot */ +#define OCFS2_INVALID_SLOT -1 + +#define OCFS2_VOL_UUID_LEN 16 +#define OCFS2_MAX_VOL_LABEL_LEN 64 + +/* The cluster stack fields */ +#define OCFS2_STACK_LABEL_LEN 4 +#define OCFS2_CLUSTER_NAME_LEN 16 + +/* Classic (historically speaking) cluster stack */ +#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" + +/* Journal limits (in bytes) */ +#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) + +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. 
+ */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + +/* + * Cluster info flags (ocfs2_cluster_info.ci_stackflags) + */ +#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) + +struct ocfs2_system_inode_info { + char *si_name; + int si_iflags; + int si_mode; +}; + +/* System file index */ +enum { + BAD_BLOCK_SYSTEM_INODE = 0, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, + SLOT_MAP_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE + HEARTBEAT_SYSTEM_INODE, + GLOBAL_BITMAP_SYSTEM_INODE, + USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE +#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE + ORPHAN_DIR_SYSTEM_INODE, + EXTENT_ALLOC_SYSTEM_INODE, + INODE_ALLOC_SYSTEM_INODE, + JOURNAL_SYSTEM_INODE, + LOCAL_ALLOC_SYSTEM_INODE, + TRUNCATE_LOG_SYSTEM_INODE, + LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE + NUM_SYSTEM_INODES +}; +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE +#define NUM_LOCAL_SYSTEM_INODES \ + (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) + +static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { + /* Global system inodes (single copy) */ + /* The first two are only used from userspace mfks/tunefs */ + [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, + [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + + /* These are used by the running filesystem */ + [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, + [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, + [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + + /* Slot-specific system inodes (one copy per slot) */ + [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, + [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, + [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, + [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, +}; + +/* Parameter passed from mount.ocfs2 to module */ +#define OCFS2_HB_NONE "heartbeat=none" +#define OCFS2_HB_LOCAL "heartbeat=local" +#define OCFS2_HB_GLOBAL "heartbeat=global" + +/* + * OCFS2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
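The slot-specific entries in ocfs2_system_inodes[] above carry a "%04d" conversion in their si_name, so a per-slot file name is obtained by formatting the slot number into it, roughly as in the following sketch (the snippet is illustrative only):

#include <stdio.h>

int main(void)
{
	const char *fmt = "orphan_dir:%04d";	/* si_name of ORPHAN_DIR_SYSTEM_INODE */
	char name[32];

	snprintf(name, sizeof(name), fmt, 3);	/* hypothetical slot number 3 */
	printf("%s\n", name);			/* prints orphan_dir:0003 */
	return 0;
}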
+ */ +#define OCFS2_FT_UNKNOWN 0 +#define OCFS2_FT_REG_FILE 1 +#define OCFS2_FT_DIR 2 +#define OCFS2_FT_CHRDEV 3 +#define OCFS2_FT_BLKDEV 4 +#define OCFS2_FT_FIFO 5 +#define OCFS2_FT_SOCK 6 +#define OCFS2_FT_SYMLINK 7 + +#define OCFS2_FT_MAX 8 + +/* + * OCFS2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define OCFS2_DIR_PAD 4 +#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) +#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) +#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ + OCFS2_DIR_ROUND) & \ + ~OCFS2_DIR_ROUND) +#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) + +#define OCFS2_LINK_MAX 32000 +#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) +#define OCFS2_LINKS_HI_SHIFT 16 +#define OCFS2_DX_ENTRIES_MAX (0xffffffffU) + +#define S_SHIFT 12 +static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, + [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, +}; + + +/* + * Convenience casts + */ +#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) + +/* + * Block checking structure. This is used in metadata to validate the + * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all + * zeros. + */ +struct ocfs2_block_check { +/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ + __le16 bc_ecc; /* Single-error-correction parity vector. + This is a simple Hamming code dependent + on the blocksize. OCFS2's maximum + blocksize, 4K, requires 16 parity bits, + so we fit in __le16. */ + __le16 bc_reserved1; +/*08*/ +}; + +/* + * On disk extent record for OCFS2 + * It describes a range of clusters on disk. + * + * Length fields are divided into interior and leaf node versions. + * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. + */ +struct ocfs2_extent_rec { +/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ + union { + __le32 e_int_clusters; /* Clusters covered by all children */ + struct { + __le16 e_leaf_clusters; /* Clusters covered by this + extent */ + __u8 e_reserved1; + __u8 e_flags; /* Extent flags */ + }; + }; + __le64 e_blkno; /* Physical disk offset, in blocks */ +/*10*/ +}; + +struct ocfs2_chain_rec { + __le32 c_free; /* Number of free bits in this chain. */ + __le32 c_total; /* Number of total bits in this chain */ + __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ +}; + +struct ocfs2_truncate_rec { + __le32 t_start; /* 1st cluster in this log */ + __le32 t_clusters; /* Number of total clusters covered */ +}; + +/* + * On disk extent list for OCFS2 (node in the tree). Note that this + * is contained inside ocfs2_dinode or ocfs2_extent_block, so the + * offsets are relative to ocfs2_dinode.id2.i_list or + * ocfs2_extent_block.h_list, respectively. + */ +struct ocfs2_extent_list { +/*00*/ __le16 l_tree_depth; /* Extent tree depth from this + point. 0 means data extents + hang directly off this + header (a leaf) + NOTE: The high 8 bits cannot be + used - tree_depth is never that big. + */ + __le16 l_count; /* Number of extent records */ + __le16 l_next_free_rec; /* Next unused extent slot */ + __le16 l_reserved1; + __le64 l_reserved2; /* Pad to + sizeof(ocfs2_extent_rec) */ +/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +}; + +/* + * On disk allocation chain list for OCFS2. 
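OCFS2_DIR_REC_LEN() above rounds each directory entry up to a 4-byte boundary. A worked sketch, assuming the fixed dirent header before the name (inode, rec_len, name_len, file_type) occupies 12 bytes, which is the value OCFS2_DIR_MEMBER_LEN takes for struct ocfs2_dir_entry defined elsewhere in this header:

#include <stdio.h>

#define DIR_PAD		4
#define DIR_ROUND	(DIR_PAD - 1)
#define DIR_MEMBER_LEN	12	/* assumed offsetof(struct ocfs2_dir_entry, name) */
#define DIR_REC_LEN(name_len) \
	(((name_len) + DIR_MEMBER_LEN + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	/* "a" -> 16 bytes, "hello" -> 20 bytes, a 255-char name -> 268 bytes */
	printf("%d %d %d\n", DIR_REC_LEN(1), DIR_REC_LEN(5), DIR_REC_LEN(255));
	return 0;
}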
Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_chain. + */ +struct ocfs2_chain_list { +/*00*/ __le16 cl_cpg; /* Clusters per Block Group */ + __le16 cl_bpc; /* Bits per cluster */ + __le16 cl_count; /* Total chains in this list */ + __le16 cl_next_free_rec; /* Next unused chain slot */ + __le64 cl_reserved1; +/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +}; + +/* + * On disk deallocation log for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_dealloc. + */ +struct ocfs2_truncate_log { +/*00*/ __le16 tl_count; /* Total records in this log */ + __le16 tl_used; /* Number of records in use */ + __le32 tl_reserved1; +/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +}; + +/* + * On disk extent block (indirect block) for OCFS2 + */ +struct ocfs2_extent_block +{ +/*00*/ __u8 h_signature[8]; /* Signature for verification */ + struct ocfs2_block_check h_check; /* Error checking */ +/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this + extent_header belongs to */ + __le16 h_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 h_fs_generation; /* Must match super block */ + __le64 h_blkno; /* Offset on disk, in blocks */ +/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this + eb belongs to. Only valid + if allocated from a + discontiguous block group */ + __le64 h_next_leaf_blk; /* Offset on disk, in blocks, + of next leaf header pointing + to data */ +/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + +/* + * On disk slot map for OCFS2. This defines the contents of the "slot_map" + * system file. A slot is valid if it contains a node number >= 0. The + * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. + */ +struct ocfs2_slot_map { +/*00*/ __le16 sm_slots[0]; +/* + * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, + * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. + */ +}; + +struct ocfs2_extended_slot { +/*00*/ __u8 es_valid; + __u8 es_reserved1[3]; + __le32 es_node_num; +/*10*/ +}; + +/* + * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP + * is set. It separates out the valid marker from the node number, and + * has room to grow. Unlike the old slot map, this format is defined by + * i_size. + */ +struct ocfs2_slot_map_extended { +/*00*/ struct ocfs2_extended_slot se_slots[0]; +/* + * Actual size is i_size of the slot_map system file. It should + * match s_max_slots * sizeof(struct ocfs2_extended_slot) + */ +}; + +/* + * ci_stackflags is only valid if the incompat bit + * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. + */ +struct ocfs2_cluster_info { +/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; + union { + __le32 ci_reserved; + struct { + __u8 ci_stackflags; + __u8 ci_reserved1; + __u8 ci_reserved2; + __u8 ci_reserved3; + }; + }; +/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; +/*18*/ +}; + +/* + * On disk superblock for OCFS2 + * Note that it is contained inside an ocfs2_dinode, so all offsets + * are relative to the start of ocfs2_dinode.id2. 
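As a worked example of the directory record rounding defined above (a sketch; the 12-byte fixed part follows from the ocfs2_dir_entry layout given later in this header):

/* OCFS2_DIR_MEMBER_LEN is 12 (8-byte inode, 2-byte rec_len, 1-byte
 * name_len, 1-byte file_type), so:
 *   OCFS2_DIR_REC_LEN(3)  = ( 3 + 12 + 3) & ~3 = 18 & ~3 = 16
 *   OCFS2_DIR_REC_LEN(11) = (11 + 12 + 3) & ~3 = 26 & ~3 = 24
 * i.e. every live entry is padded up to the next 4-byte boundary. */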
+ */ +struct ocfs2_super_block { +/*00*/ __le16 s_major_rev_level; + __le16 s_minor_rev_level; + __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le32 s_checkinterval; /* Max time between checks */ +/*10*/ __le64 s_lastcheck; /* Time of last check */ + __le32 s_creator_os; /* OS */ + __le32 s_feature_compat; /* Compatible feature set */ +/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ + __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ + __le64 s_root_blkno; /* Offset, in blocks, of root directory + dinode */ +/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system + directory dinode */ + __le32 s_blocksize_bits; /* Blocksize for this fs */ + __le32 s_clustersize_bits; /* Clustersize for this fs */ +/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts + before tunefs required */ + __le16 s_tunefs_flag; + __le32 s_uuid_hash; /* hash value of uuid */ + __le64 s_first_cluster_group; /* Block offset of 1st cluster + * group header */ +/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ +/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ +/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either + userspace or clusterinfo + INCOMPAT flag set. */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. + * s_uuid_hash serves as seed[3]. */ +/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*140*/ + + /* + * NOTE: As stated above, all offsets are relative to + * ocfs2_dinode.id2, which is at 0xC0 in the inode. + * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within + * our smallest blocksize, which is 512 bytes. To ensure this, + * we reserve the space in s_reserved2. Anything past s_reserved2 + * will not be available on the smallest blocksize. + */ +}; + +/* + * Local allocation bitmap for OCFS2 slots + * Note that it exists inside an ocfs2_dinode, so all offsets are + * relative to the start of ocfs2_dinode.id2. + */ +struct ocfs2_local_alloc +{ +/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ + __le16 la_size; /* Size of included bitmap, in bytes */ + __le16 la_reserved1; + __le64 la_reserved2; +/*10*/ __u8 la_bitmap[0]; +}; + +/* + * Data-in-inode header. This is only used if i_dyn_features has + * OCFS2_INLINE_DATA_FL set. 
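For a sense of the inline-data capacity (a sketch assuming a 4096-byte block and the 0xC0 offset of id2 shown in the dinode layout below):

/* id_data starts 8 bytes into ocfs2_inline_data and id2 starts at byte
 * 0xC0 (192) of the dinode, so with a 4 KB block:
 *   inline data only          : 4096 - (192 + 8) = 3896 bytes
 *   with a 256-byte inline EA : 3896 - 256       = 3640 bytes
 * which is the arithmetic ocfs2_max_inline_data_with_xattr() performs
 * further down in this header. */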
+ */ +struct ocfs2_inline_data +{ +/*00*/ __le16 id_count; /* Number of bytes that can be used + * for data, starting at id_data */ + __le16 id_reserved0; + __le32 id_reserved1; + __u8 id_data[0]; /* Start of user data */ +}; + +/* + * On disk inode for OCFS2 + */ +struct ocfs2_dinode { +/*00*/ __u8 i_signature[8]; /* Signature for validation */ + __le32 i_generation; /* Generation number */ + __le16 i_suballoc_slot; /* Slot suballocator this inode + belongs to */ + __le16 i_suballoc_bit; /* Bit offset in suballocator + block group */ +/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ + __le16 i_xattr_inline_size; + __le32 i_clusters; /* Cluster count */ + __le32 i_uid; /* Owner UID */ + __le32 i_gid; /* Owning GID */ +/*20*/ __le64 i_size; /* Size in bytes */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_flags; /* File flags */ +/*30*/ __le64 i_atime; /* Access time */ + __le64 i_ctime; /* Creation time */ +/*40*/ __le64 i_mtime; /* Modification time */ + __le64 i_dtime; /* Deletion time */ +/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ + __le64 i_last_eb_blk; /* Pointer to last extent + block */ +/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ + __le32 i_atime_nsec; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; +/*70*/ __le32 i_attr; + __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL + was set in i_flags */ + __le16 i_dyn_features; + __le64 i_xattr_loc; +/*80*/ struct ocfs2_block_check i_check; /* Error checking */ +/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ +/*90*/ __le64 i_refcount_loc; + __le64 i_suballoc_loc; /* Suballocator block group this + inode belongs to. Only valid + if allocated from a + discontiguous block group */ +/*A0*/ __le64 i_reserved2[3]; +/*B8*/ union { + __le64 i_pad1; /* Generic way to refer to this + 64bit union */ + struct { + __le64 i_rdev; /* Device number */ + } dev1; + struct { /* Info for bitmap system + inodes */ + __le32 i_used; /* Bits (ie, clusters) used */ + __le32 i_total; /* Total bits (clusters) + available */ + } bitmap1; + struct { /* Info for journal system + inodes */ + __le32 ij_flags; /* Mounted, version, etc. */ + __le32 ij_recovery_generation; /* Incremented when the + journal is recovered + after an unclean + shutdown */ + } journal1; + } id1; /* Inode type dependent 1 */ +/*C0*/ union { + struct ocfs2_super_block i_super; + struct ocfs2_local_alloc i_lab; + struct ocfs2_chain_list i_chain; + struct ocfs2_extent_list i_list; + struct ocfs2_truncate_log i_dealloc; + struct ocfs2_inline_data i_data; + __u8 i_symlink[0]; + } id2; +/* Actual on-disk size is one block */ +}; + +/* + * On-disk directory entry structure for OCFS2 + * + * Packed as this structure could be accessed unaligned on 64-bit platforms + */ +struct ocfs2_dir_entry { +/*00*/ __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; +/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ +/* Actual on-disk length specified by rec_len */ +} __attribute__ ((packed)); + +/* + * Per-block record for the unindexed directory btree. This is carefully + * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are + * mirrored. That way, the directory manipulation code needs a minimal amount + * of update. + * + * NOTE: Keep this structure aligned to a multiple of 4 bytes. + */ +struct ocfs2_dir_block_trailer { +/*00*/ __le64 db_compat_inode; /* Always zero. 
Was inode */ + + __le16 db_compat_rec_len; /* Backwards compatible with + * ocfs2_dir_entry. */ + __u8 db_compat_name_len; /* Always zero. Was name_len */ + __u8 db_reserved0; + __le16 db_reserved1; + __le16 db_free_rec_len; /* Size of largest empty hole + * in this block. (unused) */ +/*10*/ __u8 db_signature[8]; /* Signature for verification */ + __le64 db_reserved2; + __le64 db_free_next; /* Next block in list (unused) */ +/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ + __le64 db_parent_dinode; /* dinode which owns me, in + blocks */ +/*30*/ struct ocfs2_block_check db_check; /* Error checking */ +/*40*/ +}; + + /* + * A directory entry in the indexed tree. We don't store the full name here, + * but instead provide a pointer to the full dirent in the unindexed tree. + * + * We also store name_len here so as to reduce the number of leaf blocks we + * need to search in case of collisions. + */ +struct ocfs2_dx_entry { + __le32 dx_major_hash; /* Used to find logical + * cluster in index */ + __le32 dx_minor_hash; /* Lower bits used to find + * block in cluster */ + __le64 dx_dirent_blk; /* Physical block in unindexed + * tree holding this dirent. */ +}; + +struct ocfs2_dx_entry_list { + __le32 de_reserved; + __le16 de_count; /* Maximum number of entries + * possible in de_entries */ + __le16 de_num_used; /* Current number of + * de_entries entries */ + struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + * in a packed array of + * length de_num_used */ +}; + +#define OCFS2_DX_FLAG_INLINE 0x01 + +/* + * A directory indexing block. Each indexed directory has one of these, + * pointed to by ocfs2_dinode. + * + * This block stores an indexed btree root, and a set of free space + * start-of-list pointers. + */ +struct ocfs2_dx_root_block { + __u8 dr_signature[8]; /* Signature for verification */ + struct ocfs2_block_check dr_check; /* Error checking */ + __le16 dr_suballoc_slot; /* Slot suballocator this + * block belongs to. */ + __le16 dr_suballoc_bit; /* Bit offset in suballocator + * block group */ + __le32 dr_fs_generation; /* Must match super block */ + __le64 dr_blkno; /* Offset on disk, in blocks */ + __le64 dr_last_eb_blk; /* Pointer to last + * extent block */ + __le32 dr_clusters; /* Clusters allocated + * to the indexed tree. */ + __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ + __u8 dr_reserved0; + __le16 dr_reserved1; + __le64 dr_dir_blkno; /* Pointer to parent inode */ + __le32 dr_num_entries; /* Total number of + * names stored in + * this directory.*/ + __le32 dr_reserved2; + __le64 dr_free_blk; /* Pointer to head of free + * unindexed block list. */ + __le64 dr_suballoc_loc; /* Suballocator block group + this root belongs to. + Only valid if allocated + from a discontiguous + block group */ + __le64 dr_reserved3[14]; + union { + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 + * bits for maximum space + * efficiency. */ + struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of + * entries. We grow out + * to extents if this + * gets too big. */ + }; +}; + +/* + * The header of a leaf block in the indexed tree. + */ +struct ocfs2_dx_leaf { + __u8 dl_signature[8];/* Signature for verification */ + struct ocfs2_block_check dl_check; /* Error checking */ + __le64 dl_blkno; /* Offset on disk, in blocks */ + __le32 dl_fs_generation;/* Must match super block */ + __le32 dl_reserved0; + __le64 dl_reserved1; + struct ocfs2_dx_entry_list dl_list; +}; + +/* + * Largest bitmap for a block (suballocator) group in bytes. 
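A quick capacity estimate for the indexed-tree leaf above (a sketch assuming a 4096-byte block; the 48-byte header size is derived from the field sizes shown):

/* dl_list.de_entries starts at byte 48 of the leaf (8 signature + 8 check
 * + 8 blkno + 4 generation + 4 reserved0 + 8 reserved1 + 8 list header)
 * and each ocfs2_dx_entry is 16 bytes, so one 4 KB leaf block holds
 *   (4096 - 48) / 16 = 253 indexed entries,
 * matching what ocfs2_dx_entries_per_leaf() computes later on. */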
This limit + * does not affect cluster groups (global allocator). Cluster group + * bitmaps run to the end of the block. + */ +#define OCFS2_MAX_BG_BITMAP_SIZE 256 + +/* + * On disk allocator group structure for OCFS2 + */ +struct ocfs2_group_desc +{ +/*00*/ __u8 bg_signature[8]; /* Signature for validation */ + __le16 bg_size; /* Size of included bitmap in + bytes. */ + __le16 bg_bits; /* Bits represented by this + group. */ + __le16 bg_free_bits_count; /* Free bits count */ + __le16 bg_chain; /* What chain I am in. */ +/*10*/ __le32 bg_generation; + __le32 bg_reserved1; + __le64 bg_next_group; /* Next group in my list, in + blocks */ +/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in + blocks */ + __le64 bg_blkno; /* Offset on disk, in blocks */ +/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ + __le64 bg_reserved2; +/*40*/ union { + __u8 bg_bitmap[0]; + struct { + /* + * Block groups may be discontiguous when + * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. + * The extents of a discontigous block group are + * stored in bg_list. It is a flat list. + * l_tree_depth must always be zero. A + * discontiguous group is signified by a non-zero + * bg_list->l_next_free_rec. Only block groups + * can be discontiguous; Cluster groups cannot. + * We've never made a block group with more than + * 2048 blocks (256 bytes of bg_bitmap). This + * codifies that limit so that we can fit bg_list. + * bg_size of a discontiguous block group will + * be 256 to match bg_bitmap_filler. + */ + __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; +/*140*/ struct ocfs2_extent_list bg_list; + }; + }; +/* Actual on-disk size is one block */ +}; + +struct ocfs2_refcount_rec { +/*00*/ __le64 r_cpos; /* Physical offset, in clusters */ + __le32 r_clusters; /* Clusters covered by this extent */ + __le32 r_refcount; /* Reference count of this extent */ +/*10*/ +}; +#define OCFS2_32BIT_POS_MASK (0xffffffffULL) + +#define OCFS2_REFCOUNT_LEAF_FL (0x00000001) +#define OCFS2_REFCOUNT_TREE_FL (0x00000002) + +struct ocfs2_refcount_list { +/*00*/ __le16 rl_count; /* Maximum number of entries possible + in rl_records */ + __le16 rl_used; /* Current number of used records */ + __le32 rl_reserved2; + __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ +/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +}; + + +struct ocfs2_refcount_block { +/*00*/ __u8 rf_signature[8]; /* Signature for verification */ + __le16 rf_suballoc_slot; /* Slot suballocator this block + belongs to */ + __le16 rf_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 rf_fs_generation; /* Must match superblock */ +/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ + __le64 rf_parent; /* Parent block, only valid if + OCFS2_REFCOUNT_LEAF_FL is set in + rf_flags */ +/*20*/ struct ocfs2_block_check rf_check; /* Error checking */ + __le64 rf_last_eb_blk; /* Pointer to last extent block */ +/*30*/ __le32 rf_count; /* Number of inodes sharing this + refcount tree */ + __le32 rf_flags; /* See the flags above */ + __le32 rf_clusters; /* clusters covered by refcount tree. */ + __le32 rf_cpos; /* cluster offset in refcount tree.*/ +/*40*/ __le32 rf_generation; /* generation number. all be the same + * for the same refcount tree. */ + __le32 rf_reserved0; + __le64 rf_suballoc_loc; /* Suballocator block group this + refcount block belongs to. 
Only + valid if allocated from a + discontiguous block group */ +/*50*/ __le64 rf_reserved1[6]; +/*80*/ union { + struct ocfs2_refcount_list rf_records; /* List of refcount + records */ + struct ocfs2_extent_list rf_list; /* Extent record list, + only valid if + OCFS2_REFCOUNT_TREE_FL + is set in rf_flags */ + }; +/* Actual on-disk size is one block */ +}; + +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry describes one extended attribute. + * + * Note that it can be stored in an inode, an xattr block, or an xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st entry in the + local xattr storage (inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ + __u8 xe_type; /* the low 7 bits indicate the name prefix + * type and the highest bit indicates whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* Number of xattr buckets + in this extent record, + only valid in the first + bucket. */ + struct ocfs2_block_check xh_check; /* Error checking + (Note, this is only + used for xattr + buckets. A block uses + xb_check and sets + this field to zero.) */ + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * When an xattr's value is large enough, it is stored in an external + * b-tree like file data. The xattr value root points to this structure. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to.
*/ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + struct ocfs2_block_check xb_check; /* Error checking */ +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_suballoc_loc; /* Suballocator block group this + xattr block belongs to. Only + valid if allocated from a + discontiguous block group */ +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + +/* + * On disk structures for global quota file + */ + +/* Magic numbers and known versions for global quota files */ +#define OCFS2_GLOBAL_QMAGICS {\ + 0x0cf52470, /* USRQUOTA */ \ + 0x0cf52471 /* GRPQUOTA */ \ +} + +#define OCFS2_GLOBAL_QVERSIONS {\ + 0, \ + 0, \ +} + + +/* Each block of each quota file has a certain fixed number of bytes reserved + * for OCFS2 internal use at its end. OCFS2 can use it for things like + * checksums, etc. */ +#define OCFS2_QBLK_RESERVED_SPACE 8 + +/* Generic header of all quota files */ +struct ocfs2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* Quota format version */ +}; + +#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of global quota file (immediately follows the generic + * header) */ +struct ocfs2_global_disk_dqinfo { +/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ + __le32 dqi_igrace; /* Grace time for inode softlimit excess */ + __le32 dqi_syncms; /* Time after which we sync local changes to + * global quota file */ + __le32 dqi_blocks; /* Number of blocks in quota file */ +/*10*/ __le32 dqi_free_blk; /* First free block in quota file */ + __le32 dqi_free_entry; /* First block with free dquot entry in quota + * file */ +}; + +/* Structure with global user / group information. We reserve some space + * for future use. 
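A minimal sketch of how the helpers above pack xe_type (the entry is assumed to start zeroed, and prefix_index stands in for one of the name-prefix indices defined elsewhere):

static void demo_init_xattr_entry(struct ocfs2_xattr_entry *xe, int prefix_index)
{
	*xe = (struct ocfs2_xattr_entry){ 0 };	/* start from a zeroed entry */
	ocfs2_xattr_set_type(xe, prefix_index);	/* low 7 bits: name prefix index */
	ocfs2_xattr_set_local(xe, 1);		/* bit 7: value kept in the local
						   storage, not in a value tree */
	/* ocfs2_xattr_get_type(xe) now returns prefix_index and
	 * ocfs2_xattr_is_local(xe) is non-zero. */
}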
*/ +struct ocfs2_global_disk_dqblk { +/*00*/ __le32 dqb_id; /* ID the structure belongs to */ + __le32 dqb_use_count; /* Number of nodes having reference to this structure */ + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ +/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ +/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space */ +/*30*/ __le64 dqb_curspace; /* current space occupied */ + __le64 dqb_btime; /* time limit for excessive disk use */ +/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ + __le64 dqb_pad1; +/*50*/ __le64 dqb_pad2; +}; + +/* + * On-disk structures for local quota file + */ + +/* Magic numbers and known versions for local quota files */ +#define OCFS2_LOCAL_QMAGICS {\ + 0x0cf524c0, /* USRQUOTA */ \ + 0x0cf524c1 /* GRPQUOTA */ \ +} + +#define OCFS2_LOCAL_QVERSIONS {\ + 0, \ + 0, \ +} + +/* Quota flags in dqinfo header */ +#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ + * quota has been cleanly turned off) */ + +#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of local quota file (immediately follows the generic + * header) */ +struct ocfs2_local_disk_dqinfo { + __le32 dqi_flags; /* Flags for quota file */ + __le32 dqi_chunks; /* Number of chunks of quota structures + * with a bitmap */ + __le32 dqi_blocks; /* Number of blocks allocated for quota file */ +}; + +/* Header of one chunk of a quota file */ +struct ocfs2_local_disk_chunk { + __le32 dqc_free; /* Number of free entries in the bitmap */ + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + * chunk of quota file */ +}; + +/* One entry in local quota file */ +struct ocfs2_local_disk_dqblk { +/*00*/ __le64 dqb_id; /* id this quota applies to */ + __le64 dqb_spacemod; /* Change in the amount of used space */ +/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ +}; + + +/* + * The quota trailer lives at the end of each quota block. 
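To make the trailer placement concrete (a sketch; ocfs2_block_dqtrailer() below performs the same arithmetic):

/* With OCFS2_QBLK_RESERVED_SPACE == 8 the trailer occupies the last
 * eight bytes of every quota block:
 *   4096-byte block -> trailer at offset 4088
 *    512-byte block -> trailer at offset  504
 * and everything before it is available for dquot entries. */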
+ */ + +struct ocfs2_disk_dqtrailer { +/*00*/ struct ocfs2_block_check dq_check; /* Error checking */ +/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ +}; + +static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, + void *buf) +{ + char *ptr = buf; + ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; + + return (struct ocfs2_disk_dqtrailer *)ptr; +} + +#ifdef __KERNEL__ +static inline int ocfs2_fast_symlink_chars(struct super_block *sb) +{ + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + +static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline int ocfs2_dx_entries_per_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline u16 ocfs2_local_alloc_size(struct super_block *sb) +{ + u16 size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(struct super_block *sb, + int suballocator, + u32 feature_incompat) +{ + int size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. 
Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} + +static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) +{ + u64 offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset >>= sb->s_blocksize_bits; + return offset; + } + + return 0; + +} + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); + + return size / sizeof(struct ocfs2_refcount_rec); +} + +static inline u32 +ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) +{ + return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; +} +#else +static inline int ocfs2_fast_symlink_chars(int blocksize) +{ + return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_max_inline_data_with_xattr(int blocksize, + struct ocfs2_dinode *di) +{ + if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + di->i_xattr_inline_size; + else + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + +static inline int ocfs2_extent_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline int ocfs2_extent_recs_per_eb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_gd(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_local_alloc_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize, + int suballocator, + uint32_t feature_incompat) +{ + int size = blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum.
Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} + +static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) +{ + uint64_t offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset /= blocksize; + return offset; + } + + return 0; +} + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} +#endif /* __KERNEL__ */ + + +static inline int ocfs2_system_inode_is_global(int type) +{ + return ((type >= 0) && + (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); +} + +static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, + int type, int slot) +{ + int chars; + + /* + * Global system inodes can only have one copy. Everything + * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode + * list has a copy per slot. + */ + if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) + chars = snprintf(buf, len, "%s", + ocfs2_system_inodes[type].si_name); + else + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name, + slot); + + return chars; +} + +static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, + umode_t mode) +{ + de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) +{ + if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + + le16_to_cpu(gd->bg_size)) != + offsetof(struct ocfs2_group_desc, bg_list)) + return 0; + /* + * Only valid to check l_next_free_rec if + * bg_bitmap + bg_size == bg_list. + */ + if (!gd->bg_list.l_next_free_rec) + return 0; + return 1; +} +#endif /* _OCFS2_FS_H */ + diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h new file mode 100644 index 00000000..5b27ff1f --- /dev/null +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -0,0 +1,242 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_ioctl.h + * + * Defines OCFS2 ioctls. + * + * Copyright (C) 2010 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_IOCTL_H +#define OCFS2_IOCTL_H + +/* + * ioctl commands + */ +#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS + +/* + * Space reservation / allocation / free ioctls and argument structure + * are designed to be compatible with XFS. + * + * ALLOCSP* and FREESP* are not and will never be supported, but are + * included here for completeness. 
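A minimal userspace sketch of the attribute ioctls above (error handling trimmed; the descriptor is assumed to be an open file on an ocfs2 mount, and the argument is passed as an int in the usual ext2-attribute style):

#include <sys/ioctl.h>
#include <linux/fs.h>	/* FS_IOC_GETFLAGS, FS_IMMUTABLE_FL */

static int demo_set_immutable(int fd)
{
	int flags;

	if (ioctl(fd, OCFS2_IOC_GETFLAGS, &flags) < 0)
		return -1;
	flags |= FS_IMMUTABLE_FL;	/* example attribute bit */
	return ioctl(fd, OCFS2_IOC_SETFLAGS, &flags);
}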
+ */ +struct ocfs2_space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) +#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) + +/* Used to pass group descriptor data when online resize is done */ +struct ocfs2_new_group_input { + __u64 group; /* Group descriptor's blkno. */ + __u32 clusters; /* Total number of clusters in this group */ + __u32 frees; /* Total free clusters in this group */ + __u16 chain; /* Chain for this group */ + __u16 reserved1; + __u32 reserved2; +}; + +#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int) +#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) +#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) + +/* Used to pass 2 file names to reflink. */ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; + __u64 preserve; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + +/* Following definitions dedicated for ocfs2_info_request ioctls. */ +#define OCFS2_INFO_MAX_REQUEST (50) +#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) + +/* Magic number of all requests */ +#define OCFS2_INFO_MAGIC (0x4F32494E) + +/* + * Always try to separate info request into small pieces to + * guarantee the backward&forward compatibility. 
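A userspace sketch of the reflink ioctl above (the paths and descriptor are hypothetical; the path pointers are cast to __u64 as the structure requires, and preserve selects whether attributes of the original are kept):

#include <stdint.h>
#include <sys/ioctl.h>

static int demo_reflink(int fd)	/* fd: an open file on the same ocfs2 mount */
{
	struct reflink_arguments args = {
		.old_path = (__u64)(uintptr_t)"/mnt/ocfs2/golden.img",
		.new_path = (__u64)(uintptr_t)"/mnt/ocfs2/clone.img",
		.preserve = 1,
	};

	return ioctl(fd, OCFS2_IOC_REFLINK, &args);
}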
+ */ +struct ocfs2_info { + __u64 oi_requests; /* Array of __u64 pointers to requests */ + __u32 oi_count; /* Number of requests in info_requests */ + __u32 oi_pad; +}; + +struct ocfs2_info_request { +/*00*/ __u32 ir_magic; /* Magic number */ + __u32 ir_code; /* Info request code */ + __u32 ir_size; /* Size of request */ + __u32 ir_flags; /* Request flags */ +/*10*/ /* Request specific fields */ +}; + +struct ocfs2_info_clustersize { + struct ocfs2_info_request ic_req; + __u32 ic_clustersize; + __u32 ic_pad; +}; + +struct ocfs2_info_blocksize { + struct ocfs2_info_request ib_req; + __u32 ib_blocksize; + __u32 ib_pad; +}; + +struct ocfs2_info_maxslots { + struct ocfs2_info_request im_req; + __u32 im_max_slots; + __u32 im_pad; +}; + +struct ocfs2_info_label { + struct ocfs2_info_request il_req; + __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; +} __attribute__ ((packed)); + +struct ocfs2_info_uuid { + struct ocfs2_info_request iu_req; + __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; +} __attribute__ ((packed)); + +struct ocfs2_info_fs_features { + struct ocfs2_info_request if_req; + __u32 if_compat_features; + __u32 if_incompat_features; + __u32 if_ro_compat_features; + __u32 if_pad; +}; + +struct ocfs2_info_journal_size { + struct ocfs2_info_request ij_req; + __u64 ij_journal_size; +}; + +struct ocfs2_info_freeinode { + struct ocfs2_info_request ifi_req; + struct ocfs2_info_local_freeinode { + __u64 lfi_total; + __u64 lfi_free; + } ifi_stat[OCFS2_MAX_SLOTS]; + __u32 ifi_slotnum; /* out */ + __u32 ifi_pad; +}; + +#define OCFS2_INFO_MAX_HIST (32) + +struct ocfs2_info_freefrag { + struct ocfs2_info_request iff_req; + struct ocfs2_info_freefrag_stats { /* (out) */ + struct ocfs2_info_free_chunk_list { + __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; + __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; + } ffs_fc_hist; + __u32 ffs_clusters; + __u32 ffs_free_clusters; + __u32 ffs_free_chunks; + __u32 ffs_free_chunks_real; + __u32 ffs_min; /* Minimum free chunksize in clusters */ + __u32 ffs_max; + __u32 ffs_avg; + __u32 ffs_pad; + } iff_ffs; + __u32 iff_chunksize; /* chunksize in clusters(in) */ + __u32 iff_pad; +}; + +/* Codes for ocfs2_info_request */ +enum ocfs2_info_type { + OCFS2_INFO_CLUSTERSIZE = 1, + OCFS2_INFO_BLOCKSIZE, + OCFS2_INFO_MAXSLOTS, + OCFS2_INFO_LABEL, + OCFS2_INFO_UUID, + OCFS2_INFO_FS_FEATURES, + OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_FREEINODE, + OCFS2_INFO_FREEFRAG, + OCFS2_INFO_NUM_TYPES +}; + +/* Flags for struct ocfs2_info_request */ +/* Filled by the caller */ +#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not + required. This is a hint. + It is up to ocfs2 whether + the request can be fulfilled + without locking. */ +/* Filled by ocfs2 */ +#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood + this request and + filled in the answer */ + +#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during + request handling. */ + +#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) + +struct ocfs2_move_extents { +/* All values are in bytes */ + /* in */ + __u64 me_start; /* Virtual start in the file to move */ + __u64 me_len; /* Length of the extents to be moved */ + __u64 me_goal; /* Physical offset of the goal, + it's in block unit */ + __u64 me_threshold; /* Maximum distance from goal or threshold + for auto defragmentation */ + __u64 me_flags; /* Flags for the operation: + * - auto defragmentation. + * - refcount,xattr cases. 
+ */ + /* out */ + __u64 me_moved_len; /* Moved/defragmented length */ + __u64 me_new_offset; /* Resulting physical location */ + __u32 me_reserved[2]; /* Reserved for future use */ +}; + +#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel claims new + clusters itself and + uses them as the goal + for the extents being + moved */ +#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent + moving; this makes the + movement less likely to + fail, but may leave the + fs even more fragmented */ +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation + completed in full. + */ + +#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) + +#endif /* OCFS2_IOCTL_H */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 00000000..d277aabf --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockid.h + * + * Defines OCFS2 lockid bits. + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA.
+ */ + +#ifndef OCFS2_LOCKID_H +#define OCFS2_LOCKID_H + +/* lock ids are made up in the following manner: + * name[0] --> type + * name[1-6] --> 6 pad characters, reserved for now + * name[7-22] --> block number, expressed in hex as 16 chars + * name[23-30] --> i_generation, expressed in hex 8 chars + * name[31] --> '\0' */ +#define OCFS2_LOCK_ID_MAX_LEN 32 +#define OCFS2_LOCK_ID_PAD "000000" + +#define OCFS2_DENTRY_LOCK_INO_START 18 + +enum ocfs2_lock_type { + OCFS2_LOCK_TYPE_META = 0, + OCFS2_LOCK_TYPE_DATA, + OCFS2_LOCK_TYPE_SUPER, + OCFS2_LOCK_TYPE_RENAME, + OCFS2_LOCK_TYPE_RW, + OCFS2_LOCK_TYPE_DENTRY, + OCFS2_LOCK_TYPE_OPEN, + OCFS2_LOCK_TYPE_FLOCK, + OCFS2_LOCK_TYPE_QINFO, + OCFS2_LOCK_TYPE_NFS_SYNC, + OCFS2_LOCK_TYPE_ORPHAN_SCAN, + OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_NUM_LOCK_TYPES +}; + +static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) +{ + char c; + switch (type) { + case OCFS2_LOCK_TYPE_META: + c = 'M'; + break; + case OCFS2_LOCK_TYPE_DATA: + c = 'D'; + break; + case OCFS2_LOCK_TYPE_SUPER: + c = 'S'; + break; + case OCFS2_LOCK_TYPE_RENAME: + c = 'R'; + break; + case OCFS2_LOCK_TYPE_RW: + c = 'W'; + break; + case OCFS2_LOCK_TYPE_DENTRY: + c = 'N'; + break; + case OCFS2_LOCK_TYPE_OPEN: + c = 'O'; + break; + case OCFS2_LOCK_TYPE_FLOCK: + c = 'F'; + break; + case OCFS2_LOCK_TYPE_QINFO: + c = 'Q'; + break; + case OCFS2_LOCK_TYPE_NFS_SYNC: + c = 'Y'; + break; + case OCFS2_LOCK_TYPE_ORPHAN_SCAN: + c = 'P'; + break; + case OCFS2_LOCK_TYPE_REFCOUNT: + c = 'T'; + break; + default: + c = '\0'; + } + + return c; +} + +static char *ocfs2_lock_type_strings[] = { + [OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differntiate from [R]ename.. serializing writes is the + * important job it does, anyway. */ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", + [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", + [OCFS2_LOCK_TYPE_OPEN] = "Open", + [OCFS2_LOCK_TYPE_FLOCK] = "Flock", + [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", + [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", +}; + +static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ +#ifdef __KERNEL__ + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); +#endif + return ocfs2_lock_type_strings[type]; +} + +#endif /* OCFS2_LOCKID_H */ diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h new file mode 100644 index 00000000..2e45c8d2 --- /dev/null +++ b/fs/ocfs2/ocfs2_lockingver.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockingver.h + * + * Defines OCFS2 Locking version values. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_LOCKINGVER_H +#define OCFS2_LOCKINGVER_H + +/* + * The protocol version for ocfs2 cluster locking. See dlmglue.c for + * more details. + * + * 1.0 - Initial locking version from ocfs2 1.4. 
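Putting the lock-name layout documented above together, a name for a metadata lock could be built like this (a kernel-side sketch of what the dlmglue code is expected to do; blkno and generation are hypothetical inputs):

static void demo_build_lock_name(char *name, u64 blkno, u32 generation)
{
	/* 1 type char + 6 pad chars + 16 hex digits of block number +
	 * 8 hex digits of i_generation + '\0' == OCFS2_LOCK_ID_MAX_LEN. */
	snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
		 ocfs2_lock_type_char(OCFS2_LOCK_TYPE_META),
		 OCFS2_LOCK_ID_PAD,
		 (unsigned long long)blkno, generation);
}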
+ */ +#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 +#define OCFS2_LOCKING_PROTOCOL_MINOR 0 + +#endif /* OCFS2_LOCKINGVER_H */ diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h new file mode 100644 index 00000000..3b481f49 --- /dev/null +++ b/fs/ocfs2/ocfs2_trace.h @@ -0,0 +1,2764 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ocfs2 + +#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OCFS2_H + +#include + +DECLARE_EVENT_CLASS(ocfs2__int, + TP_PROTO(int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field(int, num) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%d", __entry->num) +); + +#define DEFINE_OCFS2_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int, name, \ + TP_PROTO(int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__uint, + TP_PROTO(unsigned int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field( unsigned int, num ) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%u", __entry->num) +); + +#define DEFINE_OCFS2_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint, name, \ + TP_PROTO(unsigned int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__ull, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu", __entry->blkno) +); + +#define DEFINE_OCFS2_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull, name, \ + TP_PROTO(unsigned long long num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__pointer, + TP_PROTO(void *pointer), + TP_ARGS(pointer), + TP_STRUCT__entry( + __field(void *, pointer) + ), + TP_fast_assign( + __entry->pointer = pointer; + ), + TP_printk("%p", __entry->pointer) +); + +#define DEFINE_OCFS2_POINTER_EVENT(name) \ +DEFINE_EVENT(ocfs2__pointer, name, \ + TP_PROTO(void *pointer), \ + TP_ARGS(pointer)) + +DECLARE_EVENT_CLASS(ocfs2__string, + TP_PROTO(const char *name), + TP_ARGS(name), + TP_STRUCT__entry( + __string(name,name) + ), + TP_fast_assign( + __assign_str(name, name); + ), + TP_printk("%s", __get_str(name)) +); + +#define DEFINE_OCFS2_STRING_EVENT(name) \ +DEFINE_EVENT(ocfs2__string, name, \ + TP_PROTO(const char *name), \ + TP_ARGS(name)) + +DECLARE_EVENT_CLASS(ocfs2__int_int, + TP_PROTO(int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%d %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int_int, name, \ + TP_PROTO(int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_int, + TP_PROTO(unsigned int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_int, name, \ + TP_PROTO(unsigned int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint, 
name, \ + TP_PROTO(unsigned int val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint, + TP_PROTO(unsigned long long value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint, name, \ + TP_PROTO(unsigned long long val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int, + TP_PROTO(unsigned long long value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int, name, \ + TP_PROTO(unsigned long long val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull, + TP_PROTO(unsigned long long value1, unsigned long long value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %llu", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull, name, \ + TP_PROTO(unsigned long long val1, unsigned long long val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %u", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned long long val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint, + TP_PROTO(unsigned long long value1, + unsigned int value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u", __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned int val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2, + unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned int, value1 ) + __field( unsigned int, value2 ) + __field( unsigned int, value3 ) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3) +); + +#define 
DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint_uint, name, \ + TP_PROTO(unsigned int value1, unsigned int value2, \ + unsigned int value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned long long value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned long long, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %llu", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_ull, name, \ + TP_PROTO(unsigned long long value1, unsigned long long value2, \ + unsigned long long value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int, + TP_PROTO(unsigned long long ull, int value1, int value2, int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned long long, ull ) + __field( int, value1 ) + __field( int, value2 ) + __field( int, value3 ) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %d %d %d", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int_int_int, name, \ + TP_PROTO(unsigned long long ull, int value1, \ + int value2, int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint, + TP_PROTO(unsigned long long ull, unsigned int value1, + unsigned int value2, unsigned int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, ull) + __field(unsigned int, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u %u", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned int value1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint, + TP_PROTO(unsigned long long value1, unsigned long long value2, + unsigned int value3, unsigned int value4), + TP_ARGS(value1, value2, value3, value4), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + __field(unsigned int, value4) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + __entry->value4 = value4; + ), + TP_printk("%llu %llu %u %u", + __entry->value1, __entry->value2, + __entry->value3, __entry->value4) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned long long ull1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, ull1, value2, value3)) + +/* Trace events for fs/ocfs2/alloc.c. 
*/ +DECLARE_EVENT_CLASS(ocfs2__btree_ops, + TP_PROTO(unsigned long long owner,\ + unsigned int value1, unsigned int value2), + TP_ARGS(owner, value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u %u", + __entry->owner, __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_BTREE_EVENT(name) \ +DEFINE_EVENT(ocfs2__btree_ops, name, \ + TP_PROTO(unsigned long long owner, \ + unsigned int value1, unsigned int value2), \ + TP_ARGS(owner, value1, value2)) + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree); + +DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert); + +TRACE_EVENT(ocfs2_grow_tree, + TP_PROTO(unsigned long long owner, int depth), + TP_ARGS(owner, depth), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(int, depth) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->depth = depth; + ), + TP_printk("%llu %d", __entry->owner, __entry->depth) +); + +TRACE_EVENT(ocfs2_rotate_subtree, + TP_PROTO(int subtree_root, unsigned long long blkno, + int depth), + TP_ARGS(subtree_root, blkno, depth), + TP_STRUCT__entry( + __field(int, subtree_root) + __field(unsigned long long, blkno) + __field(int, depth) + ), + TP_fast_assign( + __entry->subtree_root = subtree_root; + __entry->blkno = blkno; + __entry->depth = depth; + ), + TP_printk("%d %llu %d", __entry->subtree_root, + __entry->blkno, __entry->depth) +); + +TRACE_EVENT(ocfs2_insert_extent, + TP_PROTO(unsigned int ins_appending, unsigned int ins_contig, + int ins_contig_index, int free_records, int ins_tree_depth), + TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records, + ins_tree_depth), + TP_STRUCT__entry( + __field(unsigned int, ins_appending) + __field(unsigned int, ins_contig) + __field(int, ins_contig_index) + __field(int, free_records) + __field(int, ins_tree_depth) + ), + TP_fast_assign( + __entry->ins_appending = ins_appending; + __entry->ins_contig = ins_contig; + __entry->ins_contig_index = ins_contig_index; + __entry->free_records = free_records; + __entry->ins_tree_depth = ins_tree_depth; + ), + TP_printk("%u %u %d %d %d", + __entry->ins_appending, __entry->ins_contig, + __entry->ins_contig_index, __entry->free_records, + __entry->ins_tree_depth) +); + +TRACE_EVENT(ocfs2_split_extent, + TP_PROTO(int split_index, unsigned int c_contig_type, + unsigned int c_has_empty_extent, + unsigned int c_split_covers_rec), + TP_ARGS(split_index, c_contig_type, + c_has_empty_extent, c_split_covers_rec), + TP_STRUCT__entry( + __field(int, split_index) + __field(unsigned int, c_contig_type) + __field(unsigned int, c_has_empty_extent) + __field(unsigned int, c_split_covers_rec) + ), + TP_fast_assign( + __entry->split_index = split_index; + __entry->c_contig_type = c_contig_type; + __entry->c_has_empty_extent = c_has_empty_extent; + __entry->c_split_covers_rec = c_split_covers_rec; + ), + TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type, + __entry->c_has_empty_extent, __entry->c_split_covers_rec) +); + +TRACE_EVENT(ocfs2_remove_extent, + TP_PROTO(unsigned long long owner, unsigned int cpos, + 
unsigned int len, int index, + unsigned int e_cpos, unsigned int clusters), + TP_ARGS(owner, cpos, len, index, e_cpos, clusters), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(int, index) + __field(unsigned int, e_cpos) + __field(unsigned int, clusters) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->index = index; + __entry->e_cpos = e_cpos; + __entry->clusters = clusters; + ), + TP_printk("%llu %u %u %d %u %u", + __entry->owner, __entry->cpos, __entry->len, __entry->index, + __entry->e_cpos, __entry->clusters) +); + +TRACE_EVENT(ocfs2_commit_truncate, + TP_PROTO(unsigned long long ino, unsigned int new_cpos, + unsigned int clusters, unsigned int depth), + TP_ARGS(ino, new_cpos, clusters, depth), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, new_cpos) + __field(unsigned int, clusters) + __field(unsigned int, depth) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->new_cpos = new_cpos; + __entry->clusters = clusters; + __entry->depth = depth; + ), + TP_printk("%llu %u %u %u", + __entry->ino, __entry->new_cpos, + __entry->clusters, __entry->depth) +); + +TRACE_EVENT(ocfs2_validate_extent_block, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu ", __entry->blkno) +); + +TRACE_EVENT(ocfs2_rotate_leaf, + TP_PROTO(unsigned int insert_cpos, int insert_index, + int has_empty, int next_free, + unsigned int l_count), + TP_ARGS(insert_cpos, insert_index, has_empty, + next_free, l_count), + TP_STRUCT__entry( + __field(unsigned int, insert_cpos) + __field(int, insert_index) + __field(int, has_empty) + __field(int, next_free) + __field(unsigned int, l_count) + ), + TP_fast_assign( + __entry->insert_cpos = insert_cpos; + __entry->insert_index = insert_index; + __entry->has_empty = has_empty; + __entry->next_free = next_free; + __entry->l_count = l_count; + ), + TP_printk("%u %d %d %d %u", __entry->insert_cpos, + __entry->insert_index, __entry->has_empty, + __entry->next_free, __entry->l_count) +); + +TRACE_EVENT(ocfs2_add_clusters_in_btree_ret, + TP_PROTO(int status, int reason, int err), + TP_ARGS(status, reason, err), + TP_STRUCT__entry( + __field(int, status) + __field(int, reason) + __field(int, err) + ), + TP_fast_assign( + __entry->status = status; + __entry->reason = reason; + __entry->err = err; + ), + TP_printk("%d %d %d", __entry->status, + __entry->reason, __entry->err) +); + +TRACE_EVENT(ocfs2_mark_extent_written, + TP_PROTO(unsigned long long owner, unsigned int cpos, + unsigned int len, unsigned int phys), + TP_ARGS(owner, cpos, len, phys), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, phys) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->phys = phys; + ), + TP_printk("%llu %u %u %u", + __entry->owner, __entry->cpos, + __entry->len, __entry->phys) +); + +DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned int start, unsigned int num), + TP_ARGS(blkno, index, start, num), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned int, start) + __field(unsigned int, num) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + 
__entry->start = start; + __entry->num = num; + ), + TP_printk("%llu %d %u %u", + __entry->blkno, __entry->index, + __entry->start, __entry->num) +); + +#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__truncate_log_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned int start, unsigned int num), \ + TP_ARGS(blkno, index, start, num)) + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append); + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs); + +TRACE_EVENT(ocfs2_cache_block_dealloc, + TP_PROTO(int type, int slot, unsigned long long suballoc, + unsigned long long blkno, unsigned int bit), + TP_ARGS(type, slot, suballoc, blkno, bit), + TP_STRUCT__entry( + __field(int, type) + __field(int, slot) + __field(unsigned long long, suballoc) + __field(unsigned long long, blkno) + __field(unsigned int, bit) + ), + TP_fast_assign( + __entry->type = type; + __entry->slot = slot; + __entry->suballoc = suballoc; + __entry->blkno = blkno; + __entry->bit = bit; + ), + TP_printk("%d %d %llu %llu %u", + __entry->type, __entry->slot, __entry->suballoc, + __entry->blkno, __entry->bit) +); + +TRACE_EVENT(ocfs2_trim_extent, + TP_PROTO(struct super_block *sb, unsigned long long blk, + unsigned long long count), + TP_ARGS(sb, blk, count), + TP_STRUCT__entry( + __field(int, dev_major) + __field(int, dev_minor) + __field(unsigned long long, blk) + __field(__u64, count) + ), + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->blk = blk; + __entry->count = count; + ), + TP_printk("%d %d %llu %llu", + __entry->dev_major, __entry->dev_minor, + __entry->blk, __entry->count) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); + +/* End of trace events for fs/ocfs2/alloc.c. */ + +/* Trace events for fs/ocfs2/localalloc.c. 
*/ + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local); + +DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main); + +TRACE_EVENT(ocfs2_sync_local_to_main_free, + TP_PROTO(int count, int bit, unsigned long long start_blk, + unsigned long long blkno), + TP_ARGS(count, bit, start_blk, blkno), + TP_STRUCT__entry( + __field(int, count) + __field(int, bit) + __field(unsigned long long, start_blk) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->count = count; + __entry->bit = bit; + __entry->start_blk = start_blk; + __entry->blkno = blkno; + ), + TP_printk("%d %d %llu %llu", + __entry->count, __entry->bit, __entry->start_blk, + __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result); + +/* End of trace events for fs/ocfs2/localalloc.c. */ + +/* Trace events for fs/ocfs2/resize.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add); + +/* End of trace events for fs/ocfs2/resize.c. */ + +/* Trace events for fs/ocfs2/suballoc.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits); + +TRACE_EVENT(ocfs2_relink_block_group, + TP_PROTO(unsigned long long i_blkno, unsigned int chain, + unsigned long long bg_blkno, + unsigned long long prev_blkno), + TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno), + TP_STRUCT__entry( + __field(unsigned long long, i_blkno) + __field(unsigned int, chain) + __field(unsigned long long, bg_blkno) + __field(unsigned long long, prev_blkno) + ), + TP_fast_assign( + __entry->i_blkno = i_blkno; + __entry->chain = chain; + __entry->bg_blkno = bg_blkno; + __entry->prev_blkno = prev_blkno; + ), + TP_printk("%llu %u %llu %llu", + __entry->i_blkno, __entry->chain, __entry->bg_blkno, + __entry->prev_blkno) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits); + +TRACE_EVENT(ocfs2_free_suballoc_bits, + TP_PROTO(unsigned long long inode, unsigned long long group, + unsigned int start_bit, unsigned int count), + 
TP_ARGS(inode, group, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, inode) + __field(unsigned long long, group) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->group = group; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->inode, __entry->group, + __entry->start_bit, __entry->count) +); + +TRACE_EVENT(ocfs2_free_clusters, + TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk, + unsigned int start_bit, unsigned int count), + TP_ARGS(bg_blkno, start_blk, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, bg_blkno) + __field(unsigned long long, start_blk) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->bg_blkno = bg_blkno; + __entry->start_blk = start_blk; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk, + __entry->start_bit, __entry->count) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit); + +/* End of trace events for fs/ocfs2/suballoc.c. */ + +/* Trace events for fs/ocfs2/refcounttree.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block); + +DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned long long cpos, + unsigned int clusters, unsigned int refcount), + TP_ARGS(blkno, index, cpos, clusters, refcount), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + ), + TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index, + __entry->cpos, __entry->clusters, __entry->refcount) +); + +#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned long long cpos, \ + unsigned int count, unsigned int refcount), \ + TP_ARGS(blkno, index, cpos, count, refcount)) + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec); + +TRACE_EVENT(ocfs2_split_refcount_rec, + TP_PROTO(unsigned long long cpos, + unsigned int clusters, unsigned int refcount, + unsigned long long split_cpos, + unsigned int split_clusters, unsigned int split_refcount), + TP_ARGS(cpos, clusters, refcount, + split_cpos, split_clusters, split_refcount), + TP_STRUCT__entry( + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + __field(unsigned long long, split_cpos) + __field(unsigned int, split_clusters) + __field(unsigned int, split_refcount) + ), + TP_fast_assign( + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + 
__entry->split_cpos = split_cpos; + __entry->split_clusters = split_clusters; + __entry->split_refcount = split_refcount; + ), + TP_printk("%llu %u %u %llu %u %u", + __entry->cpos, __entry->clusters, __entry->refcount, + __entry->split_cpos, __entry->split_clusters, + __entry->split_refcount) +); + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec); + +TRACE_EVENT(ocfs2_decrease_refcount, + TP_PROTO(unsigned long long owner, + unsigned long long cpos, + unsigned int len, int delete), + TP_ARGS(owner, cpos, len, delete), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned long long, cpos) + __field(unsigned int, len) + __field(int, delete) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->delete = delete; + ), + TP_printk("%llu %llu %u %d", + __entry->owner, __entry->cpos, __entry->len, __entry->delete) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits); + +TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate, + TP_PROTO(int recs_add, unsigned long long cpos, + unsigned int clusters, unsigned long long r_cpos, + unsigned int r_clusters, unsigned int refcount, int index), + TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index), + TP_STRUCT__entry( + __field(int, recs_add) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned long long, r_cpos) + __field(unsigned int, r_clusters) + __field(unsigned int, refcount) + __field(int, index) + ), + TP_fast_assign( + __entry->recs_add = recs_add; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->r_cpos = r_cpos; + __entry->r_clusters = r_clusters; + __entry->refcount = refcount; + __entry->index = index; + ), + TP_printk("%d %llu %u %llu %u %u %d", + __entry->recs_add, __entry->cpos, __entry->clusters, + __entry->r_cpos, __entry->r_clusters, + __entry->refcount, __entry->index) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd); + +TRACE_EVENT(ocfs2_clear_ext_refcount, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int len, unsigned int p_cluster, + unsigned int ext_flags), + TP_ARGS(ino, cpos, len, p_cluster, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, p_cluster) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->len = len; + __entry->p_cluster = p_cluster; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u", + __entry->ino, __entry->cpos, __entry->len, + __entry->p_cluster, __entry->ext_flags) +); + +TRACE_EVENT(ocfs2_replace_clusters, + 
TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int old, unsigned int new, unsigned int len, + unsigned int ext_flags), + TP_ARGS(ino, cpos, old, new, len, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, old) + __field(unsigned int, new) + __field(unsigned int, len) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->old = old; + __entry->new = new; + __entry->len = len; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->old, __entry->new, + __entry->len, __entry->ext_flags) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable); + +TRACE_EVENT(ocfs2_refcount_cow_hunk, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int write_len, unsigned int max_cpos, + unsigned int cow_start, unsigned int cow_len), + TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, write_len) + __field(unsigned int, max_cpos) + __field(unsigned int, cow_start) + __field(unsigned int, cow_len) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->write_len = write_len; + __entry->max_cpos = max_cpos; + __entry->cow_start = cow_start; + __entry->cow_len = cow_len; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->write_len, + __entry->max_cpos, __entry->cow_start, __entry->cow_len) +); + +/* End of trace events for fs/ocfs2/refcounttree.c. */ + +/* Trace events for fs/ocfs2/aops.c. */ + +DECLARE_EVENT_CLASS(ocfs2__get_block, + TP_PROTO(unsigned long long ino, unsigned long long iblock, + void *bh_result, int create), + TP_ARGS(ino, iblock, bh_result, create), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, iblock) + __field(void *, bh_result) + __field(int, create) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->iblock = iblock; + __entry->bh_result = bh_result; + __entry->create = create; + ), + TP_printk("%llu %llu %p %d", + __entry->ino, __entry->iblock, + __entry->bh_result, __entry->create) +); + +#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \ +DEFINE_EVENT(ocfs2__get_block, name, \ + TP_PROTO(unsigned long long ino, unsigned long long iblock, \ + void *bh_result, int create), \ + TP_ARGS(ino, iblock, bh_result, create)) + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block); + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); + +TRACE_EVENT(ocfs2_try_to_write_inline_data, + TP_PROTO(unsigned long long ino, unsigned int len, + unsigned long long pos, unsigned int flags), + TP_ARGS(ino, len, pos, flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, len) + __field(unsigned long long, pos) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->len = len; + __entry->pos = pos; + __entry->flags = flags; + ), + TP_printk("%llu %u %llu 0x%x", + __entry->ino, __entry->len, __entry->pos, __entry->flags) +); + +TRACE_EVENT(ocfs2_write_begin_nolock, + TP_PROTO(unsigned long long ino, + long long i_size, unsigned int i_clusters, + unsigned long long pos, unsigned int len, + unsigned int flags, void *page, + unsigned 
int clusters, unsigned int extents_to_split), + TP_ARGS(ino, i_size, i_clusters, pos, len, flags, + page, clusters, extents_to_split), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(long long, i_size) + __field(unsigned int, i_clusters) + __field(unsigned long long, pos) + __field(unsigned int, len) + __field(unsigned int, flags) + __field(void *, page) + __field(unsigned int, clusters) + __field(unsigned int, extents_to_split) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->i_size = i_size; + __entry->i_clusters = i_clusters; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + __entry->page = page; + __entry->clusters = clusters; + __entry->extents_to_split = extents_to_split; + ), + TP_printk("%llu %lld %u %llu %u %u %p %u %u", + __entry->ino, __entry->i_size, __entry->i_clusters, + __entry->pos, __entry->len, + __entry->flags, __entry->page, __entry->clusters, + __entry->extents_to_split) +); + +TRACE_EVENT(ocfs2_write_end_inline, + TP_PROTO(unsigned long long ino, + unsigned long long pos, unsigned int copied, + unsigned int id_count, unsigned int features), + TP_ARGS(ino, pos, copied, id_count, features), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, pos) + __field(unsigned int, copied) + __field(unsigned int, id_count) + __field(unsigned int, features) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->pos = pos; + __entry->copied = copied; + __entry->id_count = id_count; + __entry->features = features; + ), + TP_printk("%llu %llu %u %u %u", + __entry->ino, __entry->pos, __entry->copied, + __entry->id_count, __entry->features) +); + +/* End of trace events for fs/ocfs2/aops.c. */ + +/* Trace events for fs/ocfs2/mmap.c. */ + +TRACE_EVENT(ocfs2_fault, + TP_PROTO(unsigned long long ino, + void *area, void *page, unsigned long pgoff), + TP_ARGS(ino, area, page, pgoff), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(void *, area) + __field(void *, page) + __field(unsigned long, pgoff) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->area = area; + __entry->page = page; + __entry->pgoff = pgoff; + ), + TP_printk("%llu %p %p %lu", + __entry->ino, __entry->area, __entry->page, __entry->pgoff) +); + +/* End of trace events for fs/ocfs2/mmap.c. */ + +/* Trace events for fs/ocfs2/file.c. 
*/ + +DECLARE_EVENT_CLASS(ocfs2__file_ops, + TP_PROTO(void *inode, void *file, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned long long para), + TP_ARGS(inode, file, dentry, ino, d_len, d_name, para), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, file) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned long long, para) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->file = file; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->para = para; + ), + TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, + __entry->dentry, __entry->ino, __entry->para, + __entry->d_len, __get_str(d_name)) +); + +#define DEFINE_OCFS2_FILE_OPS(name) \ +DEFINE_EVENT(ocfs2__file_ops, name, \ +TP_PROTO(void *inode, void *file, void *dentry, \ + unsigned long long ino, \ + unsigned int d_len, const unsigned char *d_name, \ + unsigned long long mode), \ + TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode)) + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_open); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); + +DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); + +TRACE_EVENT(ocfs2_extend_allocation, + TP_PROTO(unsigned long long ip_blkno, unsigned long long size, + unsigned int clusters, unsigned int clusters_to_add, + int why, int restart_func), + TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func), + TP_STRUCT__entry( + __field(unsigned long long, ip_blkno) + __field(unsigned long long, size) + __field(unsigned int, clusters) + __field(unsigned int, clusters_to_add) + __field(int, why) + __field(int, restart_func) + ), + TP_fast_assign( + __entry->ip_blkno = ip_blkno; + __entry->size = size; + __entry->clusters = clusters; + __entry->clusters_to_add = clusters_to_add; + __entry->why = why; + __entry->restart_func = restart_func; + ), + TP_printk("%llu %llu %u %u %d %d", + __entry->ip_blkno, __entry->size, __entry->clusters, + __entry->clusters_to_add, __entry->why, __entry->restart_func) +); + +TRACE_EVENT(ocfs2_extend_allocation_end, + TP_PROTO(unsigned long long ino, + unsigned int di_clusters, unsigned long long di_size, + unsigned int ip_clusters, unsigned long long i_size), + TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, di_clusters) + __field(unsigned long long, di_size) + __field(unsigned int, ip_clusters) + __field(unsigned long long, i_size) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->di_clusters = di_clusters; + __entry->di_size = di_size; + __entry->ip_clusters = ip_clusters; + __entry->i_size = i_size; + ), + TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters, + __entry->di_size, __entry->ip_clusters, __entry->i_size) +); + +TRACE_EVENT(ocfs2_write_zero_page, + TP_PROTO(unsigned long long ino, + unsigned long long abs_from, unsigned long long abs_to, + unsigned long index, unsigned int zero_from, + unsigned int zero_to), + TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to), + TP_STRUCT__entry( + __field(unsigned long long, ino) + 
__field(unsigned long long, abs_from) + __field(unsigned long long, abs_to) + __field(unsigned long, index) + __field(unsigned int, zero_from) + __field(unsigned int, zero_to) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->abs_from = abs_from; + __entry->abs_to = abs_to; + __entry->index = index; + __entry->zero_from = zero_from; + __entry->zero_to = zero_to; + ), + TP_printk("%llu %llu %llu %lu %u %u", __entry->ino, + __entry->abs_from, __entry->abs_to, + __entry->index, __entry->zero_from, __entry->zero_to) +); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend); + +TRACE_EVENT(ocfs2_setattr, + TP_PROTO(void *inode, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned int ia_valid, unsigned int ia_mode, + unsigned int ia_uid, unsigned int ia_gid), + TP_ARGS(inode, dentry, ino, d_len, d_name, + ia_valid, ia_mode, ia_uid, ia_gid), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned int, ia_valid) + __field(unsigned int, ia_mode) + __field(unsigned int, ia_uid) + __field(unsigned int, ia_gid) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->ia_valid = ia_valid; + __entry->ia_mode = ia_mode; + __entry->ia_uid = ia_uid; + __entry->ia_gid = ia_gid; + ), + TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode, + __entry->dentry, __entry->ino, __entry->d_len, + __get_str(d_name), __entry->ia_valid, __entry->ia_mode, + __entry->ia_uid, __entry->ia_gid) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); + +TRACE_EVENT(ocfs2_prepare_inode_for_write, + TP_PROTO(unsigned long long ino, unsigned long long saved_pos, + int appending, unsigned long count, + int *direct_io, int *has_refcount), + TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, saved_pos) + __field(int, appending) + __field(unsigned long, count) + __field(int, direct_io) + __field(int, has_refcount) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->saved_pos = saved_pos; + __entry->appending = appending; + __entry->count = count; + __entry->direct_io = direct_io ? *direct_io : -1; + __entry->has_refcount = has_refcount ? *has_refcount : -1; + ), + TP_printk("%llu %llu %d %lu %d %d", __entry->ino, + __entry->saved_pos, __entry->appending, __entry->count, + __entry->direct_io, __entry->has_refcount) +); + +DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); + +/* End of trace events for fs/ocfs2/file.c. */ + +/* Trace events for fs/ocfs2/inode.c. 
*/ + +TRACE_EVENT(ocfs2_iget_begin, + TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type), + TP_ARGS(ino, flags, sysfile_type), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, flags) + __field(int, sysfile_type) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->flags = flags; + __entry->sysfile_type = sysfile_type; + ), + TP_printk("%llu %u %d", __entry->ino, + __entry->flags, __entry->sysfile_type) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked); + +TRACE_EVENT(ocfs2_iget_end, + TP_PROTO(void *inode, unsigned long long ino), + TP_ARGS(inode, ino), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + ), + TP_printk("%p %llu", __entry->inode, __entry->ino) +); + +TRACE_EVENT(ocfs2_find_actor, + TP_PROTO(void *inode, unsigned long long ino, + void *args, unsigned long long fi_blkno), + TP_ARGS(inode, ino, args, fi_blkno), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(void *, args) + __field(unsigned long long, fi_blkno) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->args = args; + __entry->fi_blkno = fi_blkno; + ), + TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino, + __entry->args, __entry->fi_blkno) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); + +TRACE_EVENT(ocfs2_inode_is_valid_to_delete, + TP_PROTO(void *task, void *dc_task, unsigned long long ino, + unsigned int flags), + TP_ARGS(task, dc_task, ino, flags), + TP_STRUCT__entry( + __field(void *, task) + __field(void *, dc_task) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->task = task; + __entry->dc_task = dc_task; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task, + __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); + +TRACE_EVENT(ocfs2_inode_revalidate, + TP_PROTO(void *inode, unsigned long long ino, + unsigned int flags), + TP_ARGS(inode, ino, flags), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty); + +/* End of trace events for fs/ocfs2/inode.c. */ + +/* Trace events for fs/ocfs2/extent_map.c. 
*/ + +TRACE_EVENT(ocfs2_read_virt_blocks, + TP_PROTO(void *inode, unsigned long long vblock, int nr, + void *bhs, unsigned int flags, void *validate), + TP_ARGS(inode, vblock, nr, bhs, flags, validate), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, vblock) + __field(int, nr) + __field(void *, bhs) + __field(unsigned int, flags) + __field(void *, validate) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->vblock = vblock; + __entry->nr = nr; + __entry->bhs = bhs; + __entry->flags = flags; + __entry->validate = validate; + ), + TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock, + __entry->nr, __entry->bhs, __entry->flags, __entry->validate) +); + +/* End of trace events for fs/ocfs2/extent_map.c. */ + +/* Trace events for fs/ocfs2/slot_map.c. */ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block); + +DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot); + +/* End of trace events for fs/ocfs2/slot_map.c. */ + +/* Trace events for fs/ocfs2/heartbeat.c. */ + +DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down); + +/* End of trace events for fs/ocfs2/heartbeat.c. */ + +/* Trace events for fs/ocfs2/super.c. */ + +TRACE_EVENT(ocfs2_remount, + TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags), + TP_ARGS(s_flags, osb_flags, flags), + TP_STRUCT__entry( + __field(unsigned long, s_flags) + __field(unsigned long, osb_flags) + __field(int, flags) + ), + TP_fast_assign( + __entry->s_flags = s_flags; + __entry->osb_flags = osb_flags; + __entry->flags = flags; + ), + TP_printk("%lu %lu %d", __entry->s_flags, + __entry->osb_flags, __entry->flags) +); + +TRACE_EVENT(ocfs2_fill_super, + TP_PROTO(void *sb, void *data, int silent), + TP_ARGS(sb, data, silent), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, data) + __field(int, silent) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->data = data; + __entry->silent = silent; + ), + TP_printk("%p %p %d", __entry->sb, + __entry->data, __entry->silent) +); + +TRACE_EVENT(ocfs2_parse_options, + TP_PROTO(int is_remount, char *options), + TP_ARGS(is_remount, options), + TP_STRUCT__entry( + __field(int, is_remount) + __string(options, options) + ), + TP_fast_assign( + __entry->is_remount = is_remount; + __assign_str(options, options); + ), + TP_printk("%d %s", __entry->is_remount, __get_str(options)) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); + +TRACE_EVENT(ocfs2_statfs, + TP_PROTO(void *sb, void *buf), + TP_ARGS(sb, buf), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, buf) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->buf = buf; + ), + TP_printk("%p %p", __entry->sb, __entry->buf) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume); + +TRACE_EVENT(ocfs2_initialize_super, + TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir, + unsigned long long system_dir, int cluster_bits), + TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits), + TP_STRUCT__entry( + __string(label, label) + __string(uuid_str, uuid_str) + __field(unsigned long long, root_dir) + __field(unsigned long long, system_dir) + __field(int, cluster_bits) + ), + TP_fast_assign( + __assign_str(label, label); + __assign_str(uuid_str, uuid_str); + __entry->root_dir = root_dir; + __entry->system_dir = system_dir; + __entry->cluster_bits = cluster_bits; + ), + TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str), + __entry->root_dir, __entry->system_dir, 
__entry->cluster_bits) +); + +/* End of trace events for fs/ocfs2/super.c. */ + +/* Trace events for fs/ocfs2/xattr.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation); + +TRACE_EVENT(ocfs2_init_xattr_set_ctxt, + TP_PROTO(const char *name, int meta, int clusters, int credits), + TP_ARGS(name, meta, clusters, credits), + TP_STRUCT__entry( + __string(name, name) + __field(int, meta) + __field(int, clusters) + __field(int, credits) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->meta = meta; + __entry->clusters = clusters; + __entry->credits = credits; + ), + TP_printk("%s %d %d %d", __get_str(name), __entry->meta, + __entry->clusters, __entry->credits) +); + +DECLARE_EVENT_CLASS(ocfs2__xattr_find, + TP_PROTO(unsigned long long ino, const char *name, int name_index, + unsigned int hash, unsigned long long location, + int xe_index), + TP_ARGS(ino, name, name_index, hash, location, xe_index), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __string(name, name) + __field(int, name_index) + __field(unsigned int, hash) + __field(unsigned long long, location) + __field(int, xe_index) + ), + TP_fast_assign( + __entry->ino = ino; + __assign_str(name, name); + __entry->name_index = name_index; + __entry->hash = hash; + __entry->location = location; + __entry->xe_index = xe_index; + ), + TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name), + __entry->name_index, __entry->hash, __entry->location, + __entry->xe_index) +); + +#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \ +DEFINE_EVENT(ocfs2__xattr_find, name, \ +TP_PROTO(unsigned long long ino, const char *name, int name_index, \ + unsigned int hash, unsigned long long bucket, \ + int xe_index), \ + TP_ARGS(ino, name, name_index, hash, bucket, xe_index)) + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block); + +DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket); + 
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec); + +/* End of trace events for fs/ocfs2/xattr.c. */ + +/* Trace events for fs/ocfs2/reservations.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end); + +TRACE_EVENT(ocfs2_resv_find_window_begin, + TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal, + unsigned int wanted, int empty_root), + TP_ARGS(r_start, r_end, goal, wanted, empty_root), + TP_STRUCT__entry( + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, goal) + __field(unsigned int, wanted) + __field(int, empty_root) + ), + TP_fast_assign( + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->goal = goal; + __entry->wanted = wanted; + __entry->empty_root = empty_root; + ), + TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end, + __entry->goal, __entry->wanted, __entry->empty_root) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin); + +TRACE_EVENT(ocfs2_cannibalize_resv_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_begin, + TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen, + unsigned int r_start, unsigned int r_end, unsigned int r_len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(cstart, cend, clen, r_start, r_end, + r_len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, cstart) + __field(unsigned int, cend) + __field(unsigned int, clen) + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, r_len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->cstart = cstart; + __entry->cend = cend; + __entry->clen = clen; + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->r_len = r_len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u %u %u %u", + __entry->cstart, __entry->cend, __entry->clen, + __entry->r_start, __entry->r_end, __entry->r_len, + __entry->last_start, __entry->last_len) +); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + 
__field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +/* End of trace events for fs/ocfs2/reservations.c. */ + +/* Trace events for fs/ocfs2/quota_local.c. */ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file); + +DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot); + +/* End of trace events for fs/ocfs2/quota_local.c. */ + +/* Trace events for fs/ocfs2/quota_global.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block); + +TRACE_EVENT(ocfs2_sync_dquot, + TP_PROTO(unsigned int dq_id, long long dqb_curspace, + long long spacechange, long long curinodes, + long long inodechange), + TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange), + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(long long, dqb_curspace) + __field(long long, spacechange) + __field(long long, curinodes) + __field(long long, inodechange) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dqb_curspace = dqb_curspace; + __entry->spacechange = spacechange; + __entry->curinodes = curinodes; + __entry->inodechange = inodechange; + ), + TP_printk("%u %lld %lld %lld %lld", __entry->dq_id, + __entry->dqb_curspace, __entry->spacechange, + __entry->curinodes, __entry->inodechange) +); + +TRACE_EVENT(ocfs2_sync_dquot_helper, + TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type, + const char *s_id), + TP_ARGS(dq_id, dq_type, type, s_id), + + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(unsigned int, dq_type) + __field(unsigned long, type) + __string(s_id, s_id) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dq_type = dq_type; + __entry->type = type; + __assign_str(s_id, s_id); + ), + TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, + __entry->type, __get_str(s_id)) +); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); + +/* End of trace events for fs/ocfs2/quota_global.c. */ + +/* Trace events for fs/ocfs2/dir.c. 
*/ +DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el); + +TRACE_EVENT(ocfs2_dx_dir_search, + TP_PROTO(unsigned long long ino, int namelen, const char *name, + unsigned int major_hash, unsigned int minor_hash, + unsigned long long blkno), + TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, namelen) + __string(name, name) + __field(unsigned int, major_hash) + __field(unsigned int,minor_hash) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->blkno = blkno; + ), + TP_printk("%llu %.*s %u %u %llu", __entry->ino, + __entry->namelen, __get_str(name), + __entry->major_hash, __entry->minor_hash, __entry->blkno) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir); + +TRACE_EVENT(ocfs2_find_files_on_disk, + TP_PROTO(int namelen, const char *name, void *blkno, + unsigned long long dir), + TP_ARGS(namelen, name, blkno, dir), + TP_STRUCT__entry( + __field(int, namelen) + __string(name, name) + __field(void *, blkno) + __field(unsigned long long, dir) + ), + TP_fast_assign( + __entry->namelen = namelen; + __assign_str(name, name); + __entry->blkno = blkno; + __entry->dir = dir; + ), + TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name), + __entry->blkno, __entry->dir) +); + +TRACE_EVENT(ocfs2_check_dir_for_entry, + TP_PROTO(unsigned long long dir, int namelen, const char *name), + TP_ARGS(dir, namelen, name), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(int, namelen) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->namelen = namelen; + __assign_str(name, name); + ), + TP_printk("%llu %.*s", __entry->dir, + __entry->namelen, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster); + +TRACE_EVENT(ocfs2_dx_dir_index_root_block, + TP_PROTO(unsigned long long dir, + unsigned int major_hash, unsigned int minor_hash, + int namelen, const char *name, unsigned int num_used), + TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(unsigned int, major_hash) + __field(unsigned int, minor_hash) + __field(int, namelen) + __string(name, name) + __field(unsigned int, num_used) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->num_used = num_used; + ), + TP_printk("%llu %x %x %.*s %u", __entry->dir, + __entry->major_hash, __entry->minor_hash, + __entry->namelen, __get_str(name), __entry->num_used) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert); + +/* End of trace events for fs/ocfs2/dir.c. */ + +/* Trace events for fs/ocfs2/namei.c. 
*/ + +DECLARE_EVENT_CLASS(ocfs2__dentry_ops, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long long extra), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long long, extra) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->extra = extra; + ), + TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->extra) +); + +#define DEFINE_OCFS2_DENTRY_OPS(name) \ +DEFINE_EVENT(ocfs2__dentry_ops, name, \ +TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \ + unsigned long long dir_blkno, unsigned long long extra), \ + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra)) + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret); + +TRACE_EVENT(ocfs2_mknod, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long dev, int mode), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long, dev) + __field(int, mode) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->dev = dev; + __entry->mode = mode; + ), + TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->dev, __entry->mode) +); + +TRACE_EVENT(ocfs2_link, + TP_PROTO(unsigned long long ino, int old_len, const char *old_name, + int name_len, const char *name), + TP_ARGS(ino, old_len, old_name, name_len, name), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, old_len) + __string(old_name, old_name) + __field(int, name_len) + __string(name, name) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->name_len = name_len; + __assign_str(name, name); + ), + TP_printk("%llu %.*s %.*s", __entry->ino, + __entry->old_len, __get_str(old_name), + __entry->name_len, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end); + +TRACE_EVENT(ocfs2_rename, + TP_PROTO(void *old_dir, void *old_dentry, + void *new_dir, void *new_dentry, + int old_len, const char *old_name, + int new_len, const char *new_name), + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, + old_len, old_name, new_len, new_name), + TP_STRUCT__entry( + __field(void *, old_dir) + __field(void *, old_dentry) + __field(void *, new_dir) + __field(void *, new_dentry) + __field(int, old_len) + __string(old_name, old_name) + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->old_dir = old_dir; 
+ __entry->old_dentry = old_dentry; + __entry->new_dir = new_dir; + __entry->new_dentry = new_dentry; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%p %p %p %p %.*s %.*s", + __entry->old_dir, __entry->old_dentry, + __entry->new_dir, __entry->new_dentry, + __entry->old_len, __get_str(old_name), + __entry->new_len, __get_str(new_name)) +); + +TRACE_EVENT(ocfs2_rename_target_exists, + TP_PROTO(int new_len, const char *new_name), + TP_ARGS(new_len, new_name), + TP_STRUCT__entry( + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%.*s", __entry->new_len, __get_str(new_name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree); + +TRACE_EVENT(ocfs2_rename_over_existing, + TP_PROTO(unsigned long long new_blkno, void *new_bh, + unsigned long long newdi_blkno), + TP_ARGS(new_blkno, new_bh, newdi_blkno), + TP_STRUCT__entry( + __field(unsigned long long, new_blkno) + __field(void *, new_bh) + __field(unsigned long long, newdi_blkno) + ), + TP_fast_assign( + __entry->new_blkno = new_blkno; + __entry->new_bh = new_bh; + __entry->newdi_blkno = newdi_blkno; + ), + TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh, + __entry->newdi_blkno) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data); + +TRACE_EVENT(ocfs2_symlink_begin, + TP_PROTO(void *dir, void *dentry, const char *symname, + int len, const char *name), + TP_ARGS(dir, dentry, symname, len, name), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(const char *, symname) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->symname = symname; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, + __entry->symname, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_blkno_stringify, + TP_PROTO(unsigned long long blkno, const char *name, int namelen), + TP_ARGS(blkno, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->blkno = blkno; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->blkno, __get_str(name), + __entry->namelen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end); + +TRACE_EVENT(ocfs2_orphan_del, + TP_PROTO(unsigned long long dir, const char *name, int namelen), + TP_ARGS(dir, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->dir = dir; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->dir, __get_str(name), + __entry->namelen) +); + +/* End of trace events for fs/ocfs2/namei.c. */ + +/* Trace events for fs/ocfs2/dcache.c. 
*/ + +TRACE_EVENT(ocfs2_dentry_revalidate, + TP_PROTO(void *dentry, int len, const char *name), + TP_ARGS(dentry, len, name), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_revalidate_negative, + TP_PROTO(int len, const char *name, unsigned long pgen, + unsigned long gen), + TP_ARGS(len, name, pgen, gen), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long, pgen) + __field(unsigned long, gen) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->pgen = pgen; + __entry->gen = gen; + ), + TP_printk("%.*s %lu %lu", __entry->len, __get_str(name), + __entry->pgen, __entry->gen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata); + +DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret); + +TRACE_EVENT(ocfs2_find_local_alias, + TP_PROTO(int len, const char *name), + TP_ARGS(len, name), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%.*s", __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock, + TP_PROTO(int len, const char *name, + unsigned long long parent, void *fsdata), + TP_ARGS(len, name, parent, fsdata), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long long, parent) + __field(void *, fsdata) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->parent = parent; + __entry->fsdata = fsdata; + ), + TP_printk("%.*s %llu %p", __entry->len, __get_str(name), + __entry->parent, __entry->fsdata) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock_found, + TP_PROTO(const char *name, unsigned long long parent, + unsigned long long ino), + TP_ARGS(name, parent, ino), + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long long, parent) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->parent = parent; + __entry->ino = ino; + ), + TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino) +); +/* End of trace events for fs/ocfs2/dcache.c. */ + +/* Trace events for fs/ocfs2/export.c. 
*/ + +TRACE_EVENT(ocfs2_get_dentry_begin, + TP_PROTO(void *sb, void *handle, unsigned long long blkno), + TP_ARGS(sb, handle, blkno), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, handle) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->handle = handle; + __entry->blkno = blkno; + ), + TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end); + +TRACE_EVENT(ocfs2_get_parent, + TP_PROTO(void *child, int len, const char *name, + unsigned long long ino), + TP_ARGS(child, len, name, ino), + TP_STRUCT__entry( + __field(void *, child) + __field(int, len) + __string(name, name) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->child = child; + __entry->len = len; + __assign_str(name, name); + __entry->ino = ino; + ), + TP_printk("%p %.*s %llu", __entry->child, __entry->len, + __get_str(name), __entry->ino) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end); + +TRACE_EVENT(ocfs2_encode_fh_begin, + TP_PROTO(void *dentry, int name_len, const char *name, + void *fh, int len, int connectable), + TP_ARGS(dentry, name_len, name, fh, len, connectable), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(void *, fh) + __field(int, len) + __field(int, connectable) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->fh = fh; + __entry->len = len; + __entry->connectable = connectable; + ), + TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len, + __get_str(name), __entry->fh, __entry->len, + __entry->connectable) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent); + +DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type); + +/* End of trace events for fs/ocfs2/export.c. */ + +/* Trace events for fs/ocfs2/journal.c. 
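+ *
+ * The DEFINE_OCFS2_*_EVENT() lines below only instantiate event classes
+ * declared earlier in this header, so each generated tracepoint takes the
+ * argument list of its class.  As a rough illustration (the real call
+ * sites live in journal.c, not in this header), a caller looks like:
+ *
+ *	trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);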
*/ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); + +DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen); + +DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end); + +TRACE_EVENT(ocfs2_complete_recovery_slot, + TP_PROTO(int slot, unsigned long long la_ino, + unsigned long long tl_ino, void *qrec), + TP_ARGS(slot, la_ino, tl_ino, qrec), + TP_STRUCT__entry( + __field(int, slot) + __field(unsigned long long, la_ino) + __field(unsigned long long, tl_ino) + __field(void *, qrec) + ), + TP_fast_assign( + __entry->slot = slot; + __entry->la_ino = la_ino; + __entry->tl_ino = tl_ino; + __entry->qrec = qrec; + ), + TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino, + __entry->tl_ino, __entry->qrec) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end); + +TRACE_EVENT(ocfs2_recovery_thread, + TP_PROTO(int node_num, int osb_node_num, int disable, + void *recovery_thread, int map_set), + TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set), + TP_STRUCT__entry( + __field(int, node_num) + __field(int, osb_node_num) + __field(int,disable) + __field(void *, recovery_thread) + __field(int,map_set) + ), + TP_fast_assign( + __entry->node_num = node_num; + __entry->osb_node_num = osb_node_num; + __entry->disable = disable; + __entry->recovery_thread = recovery_thread; + __entry->map_set = map_set; + ), + TP_printk("%d %d %d %p %d", __entry->node_num, + __entry->osb_node_num, __entry->disable, + __entry->recovery_thread, __entry->map_set) +); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput); + +DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount); + +/* End of trace events for fs/ocfs2/journal.c. */ + +/* Trace events for fs/ocfs2/buffer_head_io.c. 
*/ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end); + +TRACE_EVENT(ocfs2_write_block, + TP_PROTO(unsigned long long block, void *ci), + TP_ARGS(block, ci), + TP_STRUCT__entry( + __field(unsigned long long, block) + __field(void *, ci) + ), + TP_fast_assign( + __entry->block = block; + __entry->ci = ci; + ), + TP_printk("%llu %p", __entry->block, __entry->ci) +); + +TRACE_EVENT(ocfs2_read_blocks_begin, + TP_PROTO(void *ci, unsigned long long block, + unsigned int nr, int flags), + TP_ARGS(ci, block, nr, flags), + TP_STRUCT__entry( + __field(void *, ci) + __field(unsigned long long, block) + __field(unsigned int, nr) + __field(int, flags) + ), + TP_fast_assign( + __entry->ci = ci; + __entry->block = block; + __entry->nr = nr; + __entry->flags = flags; + ), + TP_printk("%p %llu %u %d", __entry->ci, __entry->block, + __entry->nr, __entry->flags) +); + +/* End of trace events for fs/ocfs2/buffer_head_io.c. */ + +/* Trace events for fs/ocfs2/uptodate.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin); + +TRACE_EVENT(ocfs2_buffer_cached_end, + TP_PROTO(int index, void *item), + TP_ARGS(index, item), + TP_STRUCT__entry( + __field(int, index) + __field(void *, item) + ), + TP_fast_assign( + __entry->index = index; + __entry->item = item; + ), + TP_printk("%d %p", __entry->index, __entry->item) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache); + +/* End of trace events for fs/ocfs2/uptodate.c. */ +#endif /* _TRACE_OCFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE ocfs2_trace +#include diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h new file mode 100644 index 00000000..d5ab56cb --- /dev/null +++ b/fs/ocfs2/quota.h @@ -0,0 +1,117 @@ +/* + * quota.h for OCFS2 + * + * On disk quota structures for local and global quota file, in-memory + * structures. 
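+ *
+ * The in-memory ocfs2_dquot defined below embeds the generic VFS dquot;
+ * given a struct dquot handed in by the quota core, OCFS2 code reaches its
+ * private fields through the OCFS2_DQUOT() container_of() helper, e.g.
+ * (illustrative snippet, mirroring quota_global.c):
+ *
+ *	s64 delta = dquot->dq_dqb.dqb_curspace -
+ *		    OCFS2_DQUOT(dquot)->dq_origspace;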
+ * + */ + +#ifndef _OCFS2_QUOTA_H +#define _OCFS2_QUOTA_H + +#include +#include +#include +#include +#include + +#include "ocfs2.h" + +/* + * In-memory structures + */ +struct ocfs2_dquot { + struct dquot dq_dquot; /* Generic VFS dquot */ + loff_t dq_local_off; /* Offset in the local quota file */ + u64 dq_local_phys_blk; /* Physical block carrying quota structure */ + struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ + unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ + s64 dq_origspace; /* Last globally synced space usage */ + s64 dq_originodes; /* Last globally synced inode usage */ +}; + +/* Description of one chunk to recover in memory */ +struct ocfs2_recovery_chunk { + struct list_head rc_list; /* List of chunks */ + int rc_chunk; /* Chunk number */ + unsigned long *rc_bitmap; /* Bitmap of entries to recover */ +}; + +struct ocfs2_quota_recovery { + struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ +}; + +/* In-memory structure with quota header information */ +struct ocfs2_mem_dqinfo { + unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_chunks; /* Number of chunks in local quota file */ + unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ + unsigned int dqi_syncms; /* How often should we sync with other nodes */ + struct list_head dqi_chunk; /* List of chunks */ + struct inode *dqi_gqinode; /* Global quota file inode */ + struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ + struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ + int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ + u64 dqi_giblk; /* Number of block with global information header */ + struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ + struct buffer_head *dqi_libh; /* Buffer with local information header */ + struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ + struct delayed_work dqi_sync_work; /* Work for syncing dquots */ + struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery + * information, in case we + * enable quotas on file + * needing it */ +}; + +static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot) +{ + return container_of(dquot, struct ocfs2_dquot, dq_dquot); +} + +struct ocfs2_quota_chunk { + struct list_head qc_chunk; /* List of quotafile chunks */ + int qc_num; /* Number of quota chunk */ + struct buffer_head *qc_headerbh; /* Buffer head with chunk header */ +}; + +extern struct kmem_cache *ocfs2_dquot_cachep; +extern struct kmem_cache *ocfs2_qf_chunk_cachep; + +extern struct qtree_fmt_operations ocfs2_global_ops; + +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, int slot_num); +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num); +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec); +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +int ocfs2_global_read_info(struct super_block *sb, int type); +int ocfs2_global_write_info(struct super_block *sb, int type); +int ocfs2_global_read_dquot(struct dquot *dquot); +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); +static inline int ocfs2_sync_dquot(struct dquot *dquot) +{ + return 
__ocfs2_sync_dquot(dquot, 0); +} +static inline int ocfs2_global_release_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 1); +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh); +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bh); +int ocfs2_create_local_dquot(struct dquot *dquot); +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); +int ocfs2_local_write_dquot(struct dquot *dquot); + +extern const struct dquot_operations ocfs2_quota_operations; +extern struct quota_format_type ocfs2_quota_format; + +#endif /* _OCFS2_QUOTA_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c new file mode 100644 index 00000000..92fcd575 --- /dev/null +++ b/fs/ocfs2/quota_global.c @@ -0,0 +1,922 @@ +/* + * Implementation of operations over global quota file + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "inode.h" +#include "journal.h" +#include "file.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "quota.h" +#include "ocfs2_trace.h" + +/* + * Locking of quotas with OCFS2 is rather complex. Here are rules that + * should be obeyed by all the functions: + * - any write of quota structure (either to local or global file) is protected + * by dqio_mutex or dquot->dq_lock. + * - any modification of global quota file holds inode cluster lock, i_mutex, + * and ip_alloc_sem of the global quota file (achieved by + * ocfs2_lock_global_qf). It also has to hold qinfo_lock. + * - an allocation of new blocks for local quota file is protected by + * its ip_alloc_sem + * + * A rough sketch of locking dependencies (lf = local file, gf = global file): + * Normal filesystem operation: + * start_trans -> dqio_mutex -> write to lf + * Syncing of local and global file: + * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + * -> write to lf + * Acquire dquot for the first time: + * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf + * -> alloc space for gf + * -> start_trans -> qinfo_lock -> write to gf + * -> ip_alloc_sem of lf -> alloc space for lf + * -> write to lf + * Release last reference to dquot: + * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf + * -> write to lf + * Note that all the above operations also hold the inode cluster lock of lf. 
+ * Recovery: + * inode cluster lock of recovered lf + * -> read bitmaps -> ip_alloc_sem of lf + * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + */ + +static void qsync_work_fn(struct work_struct *work); + +static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + /* Update from disk only entries not set by the admin */ + if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) { + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) { + m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit); + m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) + m->dqb_btime = le64_to_cpu(d->dqb_btime); + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) + m->dqb_itime = le64_to_cpu(d->dqb_itime); + OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count); +} + +static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); + d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; +} + +static int ocfs2_global_is_id(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(&oinfo->dqi_gi, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +struct qtree_fmt_operations ocfs2_global_ops = { + .mem2disk_dqblk = ocfs2_global_mem2diskdqb, + .disk2mem_dqblk = ocfs2_global_disk2memdqb, + .is_id = ocfs2_global_is_id, +}; + +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); + + trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); +} + +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bhp) +{ + int rc; + + *bhp = NULL; + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + return rc; +} + +/* Read data from global quotafile - avoid pagecache and such because we cannot + * afford acquiring the locks... We use quota cluster lock to serialize + * operations. Caller is responsible for acquiring it. 
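+ *
+ * In outline (matching the loop below): clamp the request to i_size, map
+ * the logical block off >> s_blocksize_bits to a physical extent with
+ * ocfs2_extent_map_get_blocks(), then copy up to
+ * min(s_blocksize - offset, toread) bytes per block read via
+ * ocfs2_read_quota_phys_block(), reusing the extent mapping while pcount
+ * contiguous blocks remain.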
*/ +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + loff_t i_size = i_size_read(gqinode); + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + struct buffer_head *bh; + size_t toread, tocopy; + u64 pblock = 0, pcount = 0; + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); + if (!pcount) { + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, + &pcount, NULL); + if (err) { + mlog_errno(err); + return err; + } + } else { + pcount--; + pblock++; + } + bh = NULL; + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); + if (err) { + mlog_errno(err); + return err; + } + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0, new = 0, ja_type; + struct buffer_head *bh = NULL; + handle_t *handle = journal_current_handle(); + u64 pblock, pcount; + + if (!handle) { + mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " + "because transaction was not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) { + WARN_ON(1); + len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; + } + + if (gqinode->i_size < off + len) { + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + + /* Space is already allocated in ocfs2_acquire_dquot() */ + err = ocfs2_simple_size_update(gqinode, + oinfo->dqi_gqi_bh, + rounded_end); + if (err < 0) + goto out; + new = 1; + } + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL); + if (err) { + mlog_errno(err); + goto out; + } + /* Not rewriting whole block? 
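+ * If only part of an already allocated block is rewritten, read it first
+ * so the untouched bytes (including the OCFS2_QBLK_RESERVED_SPACE trailer)
+ * survive, and journal it with OCFS2_JOURNAL_ACCESS_WRITE; a freshly
+ * allocated or fully rewritten block is obtained with sb_getblk() and
+ * journalled with OCFS2_JOURNAL_ACCESS_CREATE instead.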
*/ + if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && + !new) { + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_WRITE; + } else { + bh = sb_getblk(sb, pblock); + if (!bh) + err = -ENOMEM; + ja_type = OCFS2_JOURNAL_ACCESS_CREATE; + } + if (err) { + mlog_errno(err); + goto out; + } + lock_buffer(bh); + if (new) + memset(bh->b_data, 0, sb->s_blocksize); + memcpy(bh->b_data + offset, data, len); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); + err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh, + ja_type); + if (err < 0) { + brelse(bh); + goto out; + } + ocfs2_journal_dirty(handle, bh); + brelse(bh); +out: + if (err) { + mlog_errno(err); + return err; + } + gqinode->i_version++; + ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); + return len; +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + int status; + struct buffer_head *bh = NULL; + + status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex); + if (status < 0) + return status; + spin_lock(&dq_data_lock); + if (!oinfo->dqi_gqi_count++) + oinfo->dqi_gqi_bh = bh; + else + WARN_ON(bh != oinfo->dqi_gqi_bh); + spin_unlock(&dq_data_lock); + if (ex) { + mutex_lock(&oinfo->dqi_gqinode->i_mutex); + down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } else { + down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } + return 0; +} + +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + if (ex) { + up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + } else { + up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } + ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); + brelse(oinfo->dqi_gqi_bh); + spin_lock(&dq_data_lock); + if (!--oinfo->dqi_gqi_count) + oinfo->dqi_gqi_bh = NULL; + spin_unlock(&dq_data_lock); +} + +/* Read information header from global quota file */ +int ocfs2_global_read_info(struct super_block *sb, int type) +{ + struct inode *gqinode = NULL; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct ocfs2_global_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + u64 pcount; + int status; + + /* Read global header */ + gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!gqinode) { + mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", + type); + status = -EINVAL; + goto out_err; + } + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + &pcount, NULL); + if (status < 0) + goto out_unlock; + + status = ocfs2_qinfo_lock(oinfo, 0); + if (status < 0) + goto out_unlock; + status = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + ocfs2_qinfo_unlock(oinfo, 0); + ocfs2_unlock_global_qf(oinfo, 0); + if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot read global quota info 
(%d).\n", + status); + if (status >= 0) + status = -EIO; + mlog_errno(status); + goto out_err; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits; + oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize - + OCFS2_QBLK_RESERVED_SPACE; + oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); + INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); + +out_err: + if (status) + mlog_errno(status); + return status; +out_unlock: + ocfs2_unlock_global_qf(oinfo, 0); + mlog_errno(status); + goto out_err; +} + +/* Write information to global quota file. Expects exlusive lock on quota + * file inode and quota info */ +static int __ocfs2_global_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_global_disk_dqinfo dinfo; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + spin_unlock(&dq_data_lock); + dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms); + dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + if (size != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot write global quota info structure\n"); + if (size >= 0) + size = -EIO; + return size; + } + return 0; +} + +int ocfs2_global_write_info(struct super_block *sb, int type) +{ + int err; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + return err; + err = __ocfs2_global_write_info(sb, type); + ocfs2_qinfo_unlock(info, 1); + return err; +} + +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, info block and + * the inode */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1; +} + +/* Sync local information about quota modifications with global quota file. 
+ * Caller must have started the transaction and obtained exclusive lock for + * global quota file inode */ +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) +{ + int err, err2; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_global_disk_dqblk dqblk; + s64 spacechange, inodechange; + time_t olditime, oldbtime; + + err = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct ocfs2_global_disk_dqblk), + dquot->dq_off); + if (err != sizeof(struct ocfs2_global_disk_dqblk)) { + if (err >= 0) { + mlog(ML_ERROR, "Short read from global quota file " + "(%u read)\n", err); + err = -EIO; + } + goto out; + } + + /* Update space and inode usage. Get also other information from + * global quota file so that we don't overwrite any changes there. + * We are */ + spin_lock(&dq_data_lock); + spacechange = dquot->dq_dqb.dqb_curspace - + OCFS2_DQUOT(dquot)->dq_origspace; + inodechange = dquot->dq_dqb.dqb_curinodes - + OCFS2_DQUOT(dquot)->dq_originodes; + olditime = dquot->dq_dqb.dqb_itime; + oldbtime = dquot->dq_dqb.dqb_btime; + ocfs2_global_disk2memdqb(dquot, &dqblk); + trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace, + (long long)spacechange, + dquot->dq_dqb.dqb_curinodes, + (long long)inodechange); + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curspace += spacechange; + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curinodes += inodechange; + /* Set properly space grace time... */ + if (dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) && + oldbtime > 0) { + if (dquot->dq_dqb.dqb_btime > 0) + dquot->dq_dqb.dqb_btime = + min(dquot->dq_dqb.dqb_btime, oldbtime); + else + dquot->dq_dqb.dqb_btime = oldbtime; + } + } else { + dquot->dq_dqb.dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + /* Set properly inode grace time... 
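+ * As with the block grace time above: unless user space explicitly set a
+ * new itime (QIF_ITIME bit), keep the earlier of the deadline already
+ * recorded on disk and the local one, which is what the min() below does.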
*/ + if (dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) && + olditime > 0) { + if (dquot->dq_dqb.dqb_itime > 0) + dquot->dq_dqb.dqb_itime = + min(dquot->dq_dqb.dqb_itime, olditime); + else + dquot->dq_dqb.dqb_itime = olditime; + } + } else { + dquot->dq_dqb.dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + /* All information is properly updated, clear the flags */ + __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + spin_unlock(&dq_data_lock); + err = ocfs2_qinfo_lock(info, freeing); + if (err < 0) { + mlog(ML_ERROR, "Failed to lock quota info, losing quota write" + " (type=%d, id=%u)\n", dquot->dq_type, + (unsigned)dquot->dq_id); + goto out; + } + if (freeing) + OCFS2_DQUOT(dquot)->dq_use_count--; + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { + err = qtree_release_dquot(&info->dqi_gi, dquot); + if (info_dirty(sb_dqinfo(sb, type))) { + err2 = __ocfs2_global_write_info(sb, type); + if (!err) + err = err2; + } + } +out_qlock: + ocfs2_qinfo_unlock(info, freeing); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* + * Functions for periodic syncing of dquots with global file + */ +static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) +{ + handle_t *handle; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + int status = 0; + + trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type, + type, sb->s_id); + if (type != dquot->dq_type) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + if (status < 0) + mlog_errno(status); + /* We have to write local structure as well... 
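+ * The local file records deltas (dqb_spacemod/dqb_inodemod) accumulated
+ * since the last sync with the global file, so after folding them into
+ * the global structure above, the local entry is rewritten as well so its
+ * recorded delta matches the new baseline.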
*/ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) + mlog_errno(status); + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + return status; +} + +static void qsync_work_fn(struct work_struct *work) +{ + struct ocfs2_mem_dqinfo *oinfo = container_of(work, + struct ocfs2_mem_dqinfo, + dqi_sync_work.work); + struct super_block *sb = oinfo->dqi_gqinode->i_sb; + + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); +} + +/* + * Wrappers for generic quota functions + */ + +static int ocfs2_write_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type); + + handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + status = ocfs2_local_write_dquot(dquot); + mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out: + return status; +} + +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_QINFO_WRITE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; +} + +static int ocfs2_release_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type); + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_local_release_dquot(handle, dquot); + /* + * If we fail here, we cannot do much as global structure is + * already released. So just complain... + */ + if (status < 0) + mlog_errno(status); + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_trans: + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mutex_unlock(&dquot->dq_lock); + if (status) + mlog_errno(status); + return status; +} + +/* + * Read global dquot structure from disk or create it if it does + * not exist. Also update use count of the global structure and + * create structure in node-local quota file. 
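+ *
+ * Roughly, the steps below are: take the global quota file lock
+ * exclusively, read the entry with qtree_read_dquot() on first use, bump
+ * dq_use_count, extend the global file with ocfs2_extend_no_holes() when
+ * the dquot has no on-disk entry yet (dq_off == 0), write the entry back
+ * under a transaction with qtree_write_dquot(), and finally create the
+ * matching entry in this node's local quota file.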
+ */ +static int ocfs2_acquire_dquot(struct dquot *dquot) +{ + int status = 0, err; + int ex = 0; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle; + + trace_ocfs2_acquire_dquot(dquot->dq_id, type); + mutex_lock(&dquot->dq_lock); + /* + * We need an exclusive lock, because we're going to update use count + * and instantiate possibly new dquot structure + */ + status = ocfs2_lock_global_qf(info, 1); + if (status < 0) + goto out; + if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; + } + set_bit(DQ_READ_B, &dquot->dq_flags); + + OCFS2_DQUOT(dquot)->dq_use_count++; + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + if (!dquot->dq_off) { /* No real quota entry? */ + ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + status = ocfs2_extend_no_holes(gqinode, NULL, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + if (status < 0) + goto out_dq; + } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto out_dq; + } + status = ocfs2_qinfo_lock(info, ex); + if (status < 0) + goto out_trans; + status = qtree_write_dquot(&info->dqi_gi, dquot); + if (ex && info_dirty(sb_dqinfo(sb, type))) { + err = __ocfs2_global_write_info(sb, type); + if (!status) + status = err; + } + ocfs2_qinfo_unlock(info, ex); +out_trans: + ocfs2_commit_trans(osb, handle); +out_dq: + ocfs2_unlock_global_qf(info, 1); + if (status < 0) + goto out; + + status = ocfs2_create_local_dquot(dquot); + if (status < 0) + goto out; + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out: + mutex_unlock(&dquot->dq_lock); + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_mark_dquot_dirty(struct dquot *dquot) +{ + unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_INODES_B)) | + (1 << (DQ_LASTSET_B + QIF_SPACE_B)) | + (1 << (DQ_LASTSET_B + QIF_BTIME_B)) | + (1 << (DQ_LASTSET_B + QIF_ITIME_B)); + int sync = 0; + int status; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(sb); + + trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type); + + /* In case user set some limits, sync dquot immediately to global + * quota file so that information propagates quicker */ + spin_lock(&dq_data_lock); + if (dquot->dq_flags & mask) + sync = 1; + spin_unlock(&dq_data_lock); + /* This is a slight hack but we can't afford getting global quota + * lock if we already have a transaction started. 
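+ * Per the lock ordering documented at the top of this file,
+ * ocfs2_lock_global_qf() ranks above a transaction start, so with a
+ * handle already running we only update the local file here and let the
+ * periodic qsync_work_fn() sync propagate the change to the global file.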
*/ + if (!sync || journal_current_handle()) { + status = ocfs2_write_dquot(dquot); + goto out; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_dlock; + } + /* Now write updated local dquot structure */ + status = ocfs2_local_write_dquot(dquot); +out_dlock: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status) + mlog_errno(status); + return status; +} + +/* This should happen only after set_dqinfo(). */ +static int ocfs2_write_info(struct super_block *sb, int type) +{ + handle_t *handle; + int status = 0; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_commit_info(sb, type); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status) + mlog_errno(status); + return status; +} + +static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) +{ + struct ocfs2_dquot *dquot = + kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS); + + if (!dquot) + return NULL; + return &dquot->dq_dquot; +} + +static void ocfs2_destroy_dquot(struct dquot *dquot) +{ + kmem_cache_free(ocfs2_dquot_cachep, dquot); +} + +const struct dquot_operations ocfs2_quota_operations = { + /* We never make dquot dirty so .write_dquot is never called */ + .acquire_dquot = ocfs2_acquire_dquot, + .release_dquot = ocfs2_release_dquot, + .mark_dirty = ocfs2_mark_dquot_dirty, + .write_info = ocfs2_write_info, + .alloc_dquot = ocfs2_alloc_dquot, + .destroy_dquot = ocfs2_destroy_dquot, +}; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c new file mode 100644 index 00000000..dc8007fc --- /dev/null +++ b/fs/ocfs2/quota_local.c @@ -0,0 +1,1316 @@ +/* + * Implementation of operations over local quota file + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "file.h" +#include "buffer_head_io.h" +#include "journal.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "quota.h" +#include "uptodate.h" +#include "super.h" +#include "ocfs2_trace.h" + +/* Number of local quota structures per block */ +static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) +{ + return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / + sizeof(struct ocfs2_local_disk_dqblk)); +} + +/* Number of blocks with entries in one chunk */ +static inline unsigned int ol_chunk_blocks(struct super_block *sb) +{ + return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE) << 3) / + ol_quota_entries_per_block(sb); +} + +/* Number of entries in a chunk bitmap */ +static unsigned int ol_chunk_entries(struct super_block *sb) +{ + return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); +} + +/* Offset of the chunk in quota file */ +static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) +{ + /* 1 block for local quota file info, 1 block per chunk for chunk 
info */ + return 1 + (ol_chunk_blocks(sb) + 1) * c; +} + +static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ol_quota_chunk_block(sb, c) + 1 + off / epb; +} + +static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Offset of the dquot structure in the quota file */ +static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) +{ + return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + + ol_dqblk_block_off(sb, c, off); +} + +/* Compute block number from given offset */ +static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) +{ + return off >> sb->s_blocksize_bits; +} + +static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) +{ + return off & ((1 << sb->s_blocksize_bits) - 1); +} + +/* Compute offset in the chunk of a structure with the given offset */ +static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ((off >> sb->s_blocksize_bits) - + ol_quota_chunk_block(sb, c) - 1) * epb + + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / + sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Write bufferhead into the fs */ +static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, + void (*modify)(struct buffer_head *, void *), void *private) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + int status; + + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + return status; + } + status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + lock_buffer(bh); + modify(bh, private); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + return status; + } + return 0; +} + +/* + * Read quota block from a given logical offset. + * + * This function acquires ip_alloc_sem and thus it must not be called with a + * transaction started. + */ +static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. 
*/ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +/* Check whether we understand format of quota files */ +static int ocfs2_local_check_quota_file(struct super_block *sb, int type) +{ + unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct buffer_head *bh = NULL; + struct inode *linode = sb_dqopt(sb)->files[type]; + struct inode *ginode = NULL; + struct ocfs2_disk_dqheader *dqhead; + int status, ret = 0; + + /* First check whether we understand local quota file */ + status = ocfs2_read_quota_block(linode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", + type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { + mlog(ML_ERROR, "quota file magic does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_magic), + lmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { + mlog(ML_ERROR, "quota file version does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_version), + lversions[type], type); + goto out_err; + } + brelse(bh); + bh = NULL; + + /* Next check whether we understand global quota file */ + ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!ginode) { + mlog(ML_ERROR, "cannot get global quota file inode " + "(type=%d)\n", type); + goto out_err; + } + /* Since the header is read only, we don't care about locking */ + status = ocfs2_read_quota_block(ginode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read global quota file header " + "(type=%d)\n", type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { + mlog(ML_ERROR, "global quota file magic does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { + mlog(ML_ERROR, "global quota file version does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_version), gversions[type], + type); + goto out_err; + } + + ret = 1; +out_err: + brelse(bh); + iput(ginode); + return ret; +} + +/* Release given list of quota file chunks */ +static void ocfs2_release_local_quota_bitmaps(struct list_head *head) +{ + struct ocfs2_quota_chunk *pos, *next; + + list_for_each_entry_safe(pos, next, head, qc_chunk) { + list_del(&pos->qc_chunk); + brelse(pos->qc_headerbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, pos); + } +} + +/* Load quota bitmaps into memory */ +static int ocfs2_load_local_quota_bitmaps(struct inode *inode, + struct ocfs2_local_disk_dqinfo *ldinfo, + struct list_head *head) +{ + struct ocfs2_quota_chunk *newchunk; + int i, status; + + INIT_LIST_HEAD(head); + for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { + newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!newchunk) { + ocfs2_release_local_quota_bitmaps(head); + return -ENOMEM; + } + newchunk->qc_num = i; + newchunk->qc_headerbh = NULL; + status = ocfs2_read_quota_block(inode, + ol_quota_chunk_block(inode->i_sb, i), + &newchunk->qc_headerbh); + if (status) { + mlog_errno(status); + 
kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); + ocfs2_release_local_quota_bitmaps(head); + return status; + } + list_add_tail(&newchunk->qc_chunk, head); + } + return 0; +} + +static void olq_update_info(struct buffer_head *bh, void *private) +{ + struct mem_dqinfo *info = private; + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_local_disk_dqinfo *ldinfo; + + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + spin_lock(&dq_data_lock); + ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); + ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); + spin_unlock(&dq_data_lock); +} + +static int ocfs2_add_recovery_chunk(struct super_block *sb, + struct ocfs2_local_disk_chunk *dchunk, + int chunk, + struct list_head *head) +{ + struct ocfs2_recovery_chunk *rc; + + rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); + if (!rc) + return -ENOMEM; + rc->rc_chunk = chunk; + rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!rc->rc_bitmap) { + kfree(rc); + return -ENOMEM; + } + memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, + (ol_chunk_entries(sb) + 7) >> 3); + list_add_tail(&rc->rc_list, head); + return 0; +} + +static void free_recovery_list(struct list_head *head) +{ + struct ocfs2_recovery_chunk *next; + struct ocfs2_recovery_chunk *rchunk; + + list_for_each_entry_safe(rchunk, next, head, rc_list) { + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + } +} + +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + free_recovery_list(&(rec->r_list[type])); + kfree(rec); +} + +/* Load entries in our quota file we have to recover*/ +static int ocfs2_recovery_load_quota(struct inode *lqinode, + struct ocfs2_local_disk_dqinfo *ldinfo, + int type, + struct list_head *head) +{ + struct super_block *sb = lqinode->i_sb; + struct buffer_head *hbh; + struct ocfs2_local_disk_chunk *dchunk; + int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); + int status = 0; + + for (i = 0; i < chunks; i++) { + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, i), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) + status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); + brelse(hbh); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(head); + return status; +} + +static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) +{ + int type; + struct ocfs2_quota_recovery *rec; + + rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); + if (!rec) + return NULL; + for (type = 0; type < MAXQUOTAS; type++) + INIT_LIST_HEAD(&(rec->r_list[type])); + return rec; +} + +/* Load information we need for quota recovery into memory */ +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, + int slot_num) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct inode *lqinode; + struct buffer_head *bh; + int type; + int status = 0; + struct ocfs2_quota_recovery *rec; + + mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); + rec = 
ocfs2_alloc_quota_recovery(); + if (!rec) + return ERR_PTR(-ENOMEM); + /* First init... */ + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + /* At this point, journal of the slot is already replayed so + * we can trust metadata and data of the quota file */ + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + if (status < 0) { + ocfs2_free_quota_recovery(rec); + rec = ERR_PTR(status); + } + return rec; +} + +/* Sync changes in local quota file into global quota file and + * reinitialize local quota file. + * The function expects local quota file to be already locked and + * dqonoff_mutex locked. */ +static int ocfs2_recover_local_quota_file(struct inode *lqinode, + int type, + struct ocfs2_quota_recovery *rec) +{ + struct super_block *sb = lqinode->i_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_local_disk_chunk *dchunk; + struct ocfs2_local_disk_dqblk *dqblk; + struct dquot *dquot; + handle_t *handle; + struct buffer_head *hbh = NULL, *qbh = NULL; + int status = 0; + int bit, chunk; + struct ocfs2_recovery_chunk *rchunk, *next; + qsize_t spacechange, inodechange; + + trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); + + list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { + chunk = rchunk->rc_chunk; + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, chunk), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + qbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_dqblk_block(sb, chunk, bit), + &qbh); + if (status) { + mlog_errno(status); + break; + } + dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + + ol_dqblk_block_off(sb, chunk, bit)); + dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); + if (!dquot) { + status = -EIO; + mlog(ML_ERROR, "Failed to get quota structure " + "for id %u, type %d. Cannot finish quota " + "file recovery.\n", + (unsigned)le64_to_cpu(dqblk->dqb_id), + type); + goto out_put_bh; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) { + mlog_errno(status); + goto out_put_dquot; + } + + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_drop_lock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + spin_lock(&dq_data_lock); + /* Add usage from quota entry into quota changes + * of our node. 
Auxiliary variables are important + * due to signedness */ + spacechange = le64_to_cpu(dqblk->dqb_spacemod); + inodechange = le64_to_cpu(dqblk->dqb_inodemod); + dquot->dq_dqb.dqb_curspace += spacechange; + dquot->dq_dqb.dqb_curinodes += inodechange; + spin_unlock(&dq_data_lock); + /* We want to drop reference held by the crashed + * node. Since we have our own reference we know + * global structure actually won't be freed. */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + /* Release local quota file entry */ + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(lqinode), + qbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + lock_buffer(qbh); + WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit(bit, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(qbh); + ocfs2_journal_dirty(handle, qbh); +out_commit: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_drop_lock: + ocfs2_unlock_global_qf(oinfo, 1); +out_put_dquot: + dqput(dquot); +out_put_bh: + brelse(qbh); + if (status < 0) + break; + } + brelse(hbh); + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(&(rec->r_list[type])); + if (status) + mlog_errno(status); + return status; +} + +/* Recover local quota files for given node different from us */ +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num) +{ + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct buffer_head *bh; + handle_t *handle; + int type; + int status = 0; + struct inode *lqinode; + unsigned int flags; + + mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (type = 0; type < MAXQUOTAS; type++) { + if (list_empty(&(rec->r_list[type]))) + continue; + trace_ocfs2_finish_quota_recovery(slot_num); + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_NOQUEUE); + /* Someone else is holding the lock? Then he must be + * doing the recovery. Just skip the file... */ + if (status == -EAGAIN) { + mlog(ML_NOTICE, "skipping quota recovery for slot %d " + "because quota file is locked.\n", slot_num); + status = 0; + goto out_put; + } else if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + /* Is recovery still needed? */ + flags = le32_to_cpu(ldinfo->dqi_flags); + if (!(flags & OLQF_CLEAN)) + status = ocfs2_recover_local_quota_file(lqinode, + type, + rec); + /* We don't want to mark file as clean when it is actually + * active */ + if (slot_num == osb->slot_num) + goto out_bh; + /* Mark quota file as clean if we are recovering quota file of + * some other node. 
*/ + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_bh; + } + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); +out_trans: + ocfs2_commit_trans(osb, handle); +out_bh: + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + kfree(rec); + return status; +} + +/* Read information header from quota file */ +static int ocfs2_local_read_info(struct super_block *sb, int type) +{ + struct ocfs2_local_disk_dqinfo *ldinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + int status; + struct buffer_head *bh = NULL; + struct ocfs2_quota_recovery *rec; + int locked = 0; + + /* We don't need the lock and we have to acquire quota file locks + * which will later depend on this lock */ + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + info->dqi_maxblimit = 0x7fffffffffffffffLL; + info->dqi_maxilimit = 0x7fffffffffffffffLL; + oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); + if (!oinfo) { + mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" + " info."); + goto out_err; + } + info->dqi_priv = oinfo; + oinfo->dqi_type = type; + INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_rec = NULL; + oinfo->dqi_lqi_bh = NULL; + oinfo->dqi_libh = NULL; + + status = ocfs2_global_read_info(sb, type); + if (status < 0) + goto out_err; + + status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + locked = 1; + + /* Now read local header */ + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(type=%d)\n", type); + goto out_err; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); + oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); + oinfo->dqi_libh = bh; + + /* We crashed when using local quota file? 
*/ + if (!(info->dqi_flags & OLQF_CLEAN)) { + rec = OCFS2_SB(sb)->quota_rec; + if (!rec) { + rec = ocfs2_alloc_quota_recovery(); + if (!rec) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + OCFS2_SB(sb)->quota_rec = rec; + } + + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + } + + status = ocfs2_load_local_quota_bitmaps(lqinode, + ldinfo, + &oinfo->dqi_chunk); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now mark quota file as used */ + info->dqi_flags &= ~OLQF_CLEAN; + status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + return 0; +out_err: + if (oinfo) { + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + brelse(oinfo->dqi_lqi_bh); + if (locked) + ocfs2_inode_unlock(lqinode, 1); + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + kfree(oinfo); + } + brelse(bh); + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + return -1; +} + +/* Write local info to quota file */ +static int ocfs2_local_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) + ->dqi_libh; + int status; + + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + return -1; + } + + return 0; +} + +/* Release info from memory */ +static int ocfs2_local_free_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int mark_clean = 1, len; + int status; + + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + (chunk->qc_headerbh->b_data); + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + /* Not all entries free? Bug! */ + if (le32_to_cpu(dchunk->dqc_free) != len) { + mlog(ML_ERROR, "releasing quota file with used " + "entries (type=%d)\n", type); + mark_clean = 0; + } + } + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + + /* dqonoff_mutex protects us against racing with recovery thread... 
*/ + if (oinfo->dqi_rec) { + ocfs2_free_quota_recovery(oinfo->dqi_rec); + mark_clean = 0; + } + + if (!mark_clean) + goto out; + + /* Mark local file as clean */ + info->dqi_flags |= OLQF_CLEAN; + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], + oinfo->dqi_libh, + olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + goto out; + } + +out: + ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); + brelse(oinfo->dqi_libh); + brelse(oinfo->dqi_lqi_bh); + kfree(oinfo); + return 0; +} + +static void olq_set_dquot(struct buffer_head *bh, void *private) +{ + struct ocfs2_dquot *od = private; + struct ocfs2_local_disk_dqblk *dqblk; + struct super_block *sb = od->dq_dquot.dq_sb; + + dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + + ol_dqblk_block_offset(sb, od->dq_local_off)); + + dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); + spin_lock(&dq_data_lock); + dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - + od->dq_origspace); + dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - + od->dq_originodes); + spin_unlock(&dq_data_lock); + trace_olq_set_dquot( + (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), + (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), + od->dq_dquot.dq_id); +} + +/* Write dquot to local quota file */ +int ocfs2_local_write_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct buffer_head *bh; + struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type]; + int status; + + status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, + &bh); + if (status) { + mlog_errno(status); + goto out; + } + status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + brelse(bh); + return status; +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int found = 0, len; + + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + chunk->qc_headerbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) > 0) { + found = 1; + break; + } + } + if (!found) + return NULL; + + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + + found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); + /* We failed? 
*/ + if (found == len) { + mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" + " entries free (type=%d)\n", chunk->qc_num, + le32_to_cpu(dchunk->dqc_free), type); + return ERR_PTR(-EIO); + } + *offset = found; + return chunk; +} + +/* Add new chunk to the local quota file */ +static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk = NULL; + struct ocfs2_local_disk_chunk *dchunk; + int status; + handle_t *handle; + struct buffer_head *bh = NULL, *dbh = NULL; + u64 p_blkno; + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, NULL, + lqinode->i_size + 2 * sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + 2 * sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!chunk) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + /* Initialize chunk header */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); + memset(dchunk->dqc_bitmap, 0, + sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + /* Initialize new block with structures */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + ocfs2_journal_dirty(handle, dbh); + + /* Update local quotafile info */ + oinfo->dqi_blocks += 2; + oinfo->dqi_chunks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + + list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); + chunk->qc_num = list_entry(chunk->qc_chunk.prev, + struct ocfs2_quota_chunk, + qc_chunk)->qc_num + 1; + chunk->qc_headerbh 
= bh; + *offset = 0; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + brelse(bh); + brelse(dbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); + return ERR_PTR(status); +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_local_disk_chunk *dchunk; + int epb = ol_quota_entries_per_block(sb); + unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; + int status; + handle_t *handle; + + if (list_empty(&oinfo->dqi_chunk)) + return ocfs2_local_quota_add_chunk(sb, type, offset); + /* Is the last chunk full? */ + chunk = list_entry(oinfo->dqi_chunk.prev, + struct ocfs2_quota_chunk, qc_chunk); + chunk_blocks = oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1; + if (ol_chunk_blocks(sb) == chunk_blocks) + return ocfs2_local_quota_add_chunk(sb, type, offset); + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, NULL, + lqinode->i_size + sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Get buffer from the just added block */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + /* Update chunk header */ + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + chunk->qc_headerbh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; + lock_buffer(chunk->qc_headerbh); + le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); + unlock_buffer(chunk->qc_headerbh); + ocfs2_journal_dirty(handle, chunk->qc_headerbh); + + /* Update file header */ + oinfo->dqi_blocks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + *offset = chunk_blocks * epb; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + return ERR_PTR(status); +} + +static void olq_alloc_dquot(struct buffer_head *bh, void *private) +{ + int *offset = private; + struct ocfs2_local_disk_chunk *dchunk; + + dchunk = (struct ocfs2_local_disk_chunk 
*)bh->b_data; + ocfs2_set_bit(*offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, -1); +} + +/* Create dquot in the local file for given id */ +int ocfs2_create_local_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + int offset; + int status; + u64 pcount; + + down_write(&OCFS2_I(lqinode)->ip_alloc_sem); + chunk = ocfs2_find_free_entry(sb, type, &offset); + if (!chunk) { + chunk = ocfs2_extend_local_quota_file(sb, type, &offset); + if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } + } else if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } + od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); + od->dq_chunk = chunk; + status = ocfs2_extent_map_get_blocks(lqinode, + ol_dqblk_block(sb, chunk->qc_num, offset), + &od->dq_local_phys_blk, + &pcount, + NULL); + + /* Initialize dquot structure on disk */ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Mark structure as allocated */ + status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, + &offset); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + up_write(&OCFS2_I(lqinode)->ip_alloc_sem); + return status; +} + +/* + * Release dquot structure from local quota file. ocfs2_release_dquot() has + * already started a transaction and written all changes to global quota file + */ +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) +{ + int status; + int type = dquot->dq_type; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct super_block *sb = dquot->dq_sb; + struct ocfs2_local_disk_chunk *dchunk; + int offset; + + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(sb_dqopt(sb)->files[type]), + od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, + od->dq_local_off); + dchunk = (struct ocfs2_local_disk_chunk *) + (od->dq_chunk->qc_headerbh->b_data); + /* Mark structure as freed */ + lock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_clear_bit(offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + +out: + /* Clear the read bit so that next time someone uses this + * dquot he reads fresh info from disk and allocates local + * dquot structure */ + clear_bit(DQ_READ_B, &dquot->dq_flags); + return status; +} + +static const struct quota_format_ops ocfs2_format_ops = { + .check_quota_file = ocfs2_local_check_quota_file, + .read_file_info = ocfs2_local_read_info, + .write_file_info = ocfs2_global_write_info, + .free_file_info = ocfs2_local_free_info, +}; + +struct quota_format_type ocfs2_quota_format = { + .qf_fmt_id = QFMT_OCFS2, + .qf_ops = &ocfs2_format_ops, + .qf_owner = THIS_MODULE +}; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c new file mode 100644 index 00000000..15d29cce --- /dev/null +++ b/fs/ocfs2/refcounttree.c @@ -0,0 +1,4514 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2009 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" +#include "ocfs2_trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ocfs2_cow_context { + struct inode *inode; + struct file *file; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; + +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)bh->b_data; + + trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. 
 */
+        if (!rc && !*bh)
+                *bh = tmp;
+
+        return rc;
+}
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+        .co_owner = ocfs2_refcount_cache_owner,
+        .co_get_super = ocfs2_refcount_cache_get_super,
+        .co_cache_lock = ocfs2_refcount_cache_lock,
+        .co_cache_unlock = ocfs2_refcount_cache_unlock,
+        .co_io_lock = ocfs2_refcount_cache_io_lock,
+        .co_io_unlock = ocfs2_refcount_cache_io_unlock,
+};
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+        struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+        struct ocfs2_refcount_tree *tree = NULL;
+
+        while (n) {
+                tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+                if (blkno < tree->rf_blkno)
+                        n = n->rb_left;
+                else if (blkno > tree->rf_blkno)
+                        n = n->rb_right;
+                else
+                        return tree;
+        }
+
+        return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+                                       struct ocfs2_refcount_tree *new)
+{
+        u64 rf_blkno = new->rf_blkno;
+        struct rb_node *parent = NULL;
+        struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+        struct ocfs2_refcount_tree *tmp;
+
+        while (*p) {
+                parent = *p;
+
+                tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+                               rf_node);
+
+                if (rf_blkno < tmp->rf_blkno)
+                        p = &(*p)->rb_left;
+                else if (rf_blkno > tmp->rf_blkno)
+                        p = &(*p)->rb_right;
+                else {
+                        /* This should never happen!
*/ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) +{ + ocfs2_metadata_cache_exit(&tree->rf_ci); + ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static inline void +ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) + osb->osb_ref_tree_lru = NULL; +} + +static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) +{ + struct ocfs2_refcount_tree *tree = + container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); + + ocfs2_free_refcount_tree(tree); +} + +static inline void +ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) +{ + kref_get(&tree->rf_getcnt); +} + +static inline void +ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) +{ + kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); +} + +static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, + struct super_block *sb) +{ + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); + mutex_init(&new->rf_io_mutex); + new->rf_sb = sb; + spin_lock_init(&new->rf_lock); +} + +static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new, + u64 rf_blkno, u32 generation) +{ + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, + rf_blkno, generation); +} + +static struct ocfs2_refcount_tree* +ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) +{ + struct ocfs2_refcount_tree *new; + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) + return NULL; + + new->rf_blkno = rf_blkno; + kref_init(&new->rf_getcnt); + ocfs2_init_refcount_tree_ci(new, osb->sb); + + return new; +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *ref_rb; + + spin_lock(&osb->osb_lock); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = ocfs2_allocate_refcount_tree(osb, rf_blkno); + if (!new) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + /* + * We need the generation to create the refcount tree lock and since + * it isn't changed during the tree modification, we are safe here to + * read without protection. + * We also have to purge the cache after we create the lock since the + * refcount block may have the stale data. It can only be trusted when + * we hold the refcount lock. 
+ */ + ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_metadata_cache_exit(&new->rf_ci); + kfree(new); + return ret; + } + + ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + new->rf_generation = le32_to_cpu(ref_rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, + new->rf_generation); + ocfs2_metadata_cache_purge(&new->rf_ci); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + *ret_tree = tree; + + osb->osb_ref_tree_lru = tree; + + spin_unlock(&osb->osb_lock); + + if (new) + ocfs2_free_refcount_tree(new); + + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + *ref_blkno = le64_to_cpu(di->i_refcount_loc); + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + int ret; + + ret = ocfs2_refcount_lock(tree, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + +out: + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really needs it. + * + * If the tree has been re-created by other node, it will free the + * old one and re-create it. + */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret, delete_tree = 0; + struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + +again: + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_refcount_tree_get(tree); + + ret = __ocfs2_lock_refcount_tree(osb, tree, rw); + if (ret) { + mlog_errno(ret); + ocfs2_refcount_tree_put(tree); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + ocfs2_refcount_tree_put(tree); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + /* + * If the refcount block has been freed and re-created, we may need + * to recreate the refcount tree also. + * + * Here we just remove the tree from the rb-tree, and the last + * kref holder will unlock and delete this refcount_tree. + * Then we goto "again" and ocfs2_get_refcount_tree will create + * the new refcount tree for us. + */ + if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { + if (!tree->rf_removed) { + ocfs2_erase_refcount_tree_from_list(osb, tree); + tree->rf_removed = 1; + delete_tree = 1; + } + + ocfs2_unlock_refcount_tree(osb, tree, rw); + /* + * We get an extra reference when we create the refcount + * tree, so another put will destroy it. 
+ */ + if (delete_tree) + ocfs2_refcount_tree_put(tree); + brelse(ref_root_bh); + ref_root_bh = NULL; + goto again; + } + + *ret_tree = tree; + if (ref_bh) { + *ref_bh = ref_root_bh; + ref_root_bh = NULL; + } +out: + brelse(ref_root_bh); + return ret; +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); + + ocfs2_refcount_unlock(tree, rw); + ocfs2_refcount_tree_put(tree); +} + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + trace_ocfs2_purge_refcount_trees( + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(tree); + } +} + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. + */ +static int ocfs2_create_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + trace_ocfs2_create_refcount_tree( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); + if (!new_tree) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. 
*/ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); + spin_lock(&osb->osb_lock); + rb->rf_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); + + /* + * We have to init the tree lock here since it will use + * the generation number to create it. + */ + new_tree->rf_generation = le32_to_cpu(rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, + new_tree->rf_generation); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, first_blkno); + + /* + * We've just created a new refcount tree in this block. If + * we found a refcount tree on the ocfs2_super, it must be + * one we just deleted. We free the old tree before + * inserting the new tree. + */ + BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); + if (tree) + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + ocfs2_insert_refcount_tree(osb, new_tree); + spin_unlock(&osb->osb_lock); + new_tree = NULL; + if (tree) + ocfs2_refcount_tree_put(tree); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (new_tree) { + ocfs2_metadata_cache_exit(&new_tree->rf_ci); + kfree(new_tree); + } + + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_root_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + 
ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret, delete_tree = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. + */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + if (rb->rf_suballoc_loc) + bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + delete_tree = 1; + ocfs2_erase_refcount_tree_from_list(osb, ref_tree); + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_refcount_tree_put(ref_tree); + brelse(blk_bh); + + return ret; +} + +static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index) +{ + int i = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = NULL; + + for (; i < 
le16_to_cpu(rb->rf_records.rl_used); i++) { + rec = &rb->rf_records.rl_recs[i]; + + if (le64_to_cpu(rec->r_cpos) + + le32_to_cpu(rec->r_clusters) <= cpos) + continue; + else if (le64_to_cpu(rec->r_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + if (ret_rec) + *ret_rec = *rec; + goto out; + } + + if (ret_rec) { + /* We meet with a hole here, so fake the rec. */ + ret_rec->r_cpos = cpu_to_le64(cpos); + ret_rec->r_refcount = 0; + if (i < le16_to_cpu(rb->rf_records.rl_used) && + le64_to_cpu(rec->r_cpos) < cpos + len) + ret_rec->r_clusters = + cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); + else + ret_rec->r_clusters = cpu_to_le32(len); + } + +out: + *index = i; +} + +/* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. 
+ */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_refcount = 0. + */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index, + struct buffer_head **ret_bh) +{ + int ret = 0, i, found; + u32 low_cpos, uninitialized_var(cpos_end); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { + ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_root_bh; + get_bh(ref_root_bh); + return 0; + } + + el = &rb->rf_list; + low_cpos = cpos & OCFS2_32BIT_POS_MASK; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= low_cpos) { + found = 1; + break; + } + } + + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_leaf_bh; +out: + brelse(eb_bh); + return ret; +} + +enum 
ocfs2_ref_rec_contig { + REF_CONTIG_NONE = 0, + REF_CONTIG_LEFT, + REF_CONTIG_RIGHT, + REF_CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, + int index) +{ + if ((rb->rf_records.rl_recs[index].r_refcount == + rb->rf_records.rl_recs[index + 1].r_refcount) && + (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == + le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) + return REF_CONTIG_RIGHT; + + return REF_CONTIG_NONE; +} + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) + ret = ocfs2_refcount_rec_adjacent(rb, index); + + if (index > 0) { + enum ocfs2_ref_rec_contig tmp; + + tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); + + if (tmp == REF_CONTIG_RIGHT) { + if (ret == REF_CONTIG_RIGHT) + ret = REF_CONTIG_LEFTRIGHT; + else + ret = REF_CONTIG_LEFT; + } + } + + return ret; +} + +static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, + int index) +{ + BUG_ON(rb->rf_records.rl_recs[index].r_refcount != + rb->rf_records.rl_recs[index+1].r_refcount); + + le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, + le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) + memmove(&rb->rf_records.rl_recs[index + 1], + &rb->rf_records.rl_recs[index + 2], + sizeof(struct ocfs2_refcount_rec) * + (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); + + memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + le16_add_cpu(&rb->rf_records.rl_used, -1); +} + +/* + * Merge the refcount rec if we are contiguous with the adjacent recs. + */ +static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig contig = + ocfs2_refcount_rec_contig(rb, index); + + if (contig == REF_CONTIG_NONE) + return; + + if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { + BUG_ON(index == 0); + index--; + } + + ocfs2_rotate_refcount_rec_left(rb, index); + + if (contig == REF_CONTIG_LEFTRIGHT) + ocfs2_rotate_refcount_rec_left(rb, index); +} + +/* + * Change the refcount indexed by "index" in ref_bh. + * If refcount reaches 0, remove it. 
+ */ +static int ocfs2_change_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + int index, int merge, int change) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_change_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, le32_to_cpu(rec->r_refcount), change); + le32_add_cpu(&rec->r_refcount, change); + + if (!rec->r_refcount) { + if (index != le16_to_cpu(rl->rl_used) - 1) { + memmove(rec, rec + 1, + (le16_to_cpu(rl->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + } + + le16_add_cpu(&rl->rl_used, -1); + } else if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ocfs2_journal_dirty(handle, ref_leaf_bh); +out: + return ret; +} + +static int ocfs2_expand_inline_ref_root(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head **ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Initialize ocfs2_refcount_block. + * It should contain the same information as the old root. + * so just memcpy it and change the corresponding field. + */ + memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); + + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_cpos = cpu_to_le32(0); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + ocfs2_journal_dirty(handle, new_bh); + + /* Now change the root. 
*/ + memset(&root_rb->rf_list, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list)); + root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); + root_rb->rf_clusters = cpu_to_le32(1); + root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); + root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); + + ocfs2_journal_dirty(handle, ref_root_bh); + + trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno, + le16_to_cpu(new_rb->rf_records.rl_used)); + + *ref_leaf_bh = new_bh; + new_bh = NULL; +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, + struct ocfs2_refcount_rec *next) +{ + if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= + ocfs2_get_ref_rec_low_cpos(next)) + return 1; + + return 0; +} + +static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); + u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static int cmp_refcount_rec_by_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u64 l_cpos = le64_to_cpu(l->r_cpos); + u64 r_cpos = le64_to_cpu(r->r_cpos); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static void swap_refcount_rec(void *a, void *b, int size) +{ + struct ocfs2_refcount_rec *l = a, *r = b, tmp; + + tmp = *(struct ocfs2_refcount_rec *)l; + *(struct ocfs2_refcount_rec *)l = + *(struct ocfs2_refcount_rec *)r; + *(struct ocfs2_refcount_rec *)r = tmp; +} + +/* + * The refcount cpos are ordered by their 64bit cpos, + * But we will use the low 32 bit to be the e_cpos in the b-tree. + * So we need to make sure that this pos isn't intersected with others. + * + * Note: The refcount block is already sorted by their low 32 bit cpos, + * So just try the middle pos first, and we will exit when we find + * the good position. 
+ */ +static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, + u32 *split_pos, int *split_index) +{ + int num_used = le16_to_cpu(rl->rl_used); + int delta, middle = num_used / 2; + + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle - delta - 1], + &rl->rl_recs[middle - delta])) { + *split_index = middle - delta; + break; + } + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == num_used) + continue; + + /* Now try delta past middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle + delta], + &rl->rl_recs[middle + delta + 1])) { + *split_index = middle + delta + 1; + break; + } + } + + if (delta >= middle) + return -ENOSPC; + + *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); + return 0; +} + +static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, + struct buffer_head *new_bh, + u32 *split_cpos) +{ + int split_index = 0, num_moved, ret; + u32 cpos = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_block *new_rb = + (struct ocfs2_refcount_block *)new_bh->b_data; + struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; + + trace_ocfs2_divide_leaf_refcount_block( + (unsigned long long)ref_leaf_bh->b_blocknr, + le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); + + /* + * XXX: Improvement later. + * If we know all the high 32 bit cpos is the same, no need to sort. + * + * In order to make the whole process safe, we do: + * 1. sort the entries by their low 32 bit cpos first so that we can + * find the split cpos easily. + * 2. call ocfs2_insert_extent to insert the new refcount block. + * 3. move the refcount rec to the new block. + * 4. sort the entries by their 64 bit cpos. + * 5. dirty the new_rb and rb. + */ + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + + ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); + if (ret) { + mlog_errno(ret); + return ret; + } + + new_rb->rf_cpos = cpu_to_le32(cpos); + + /* move refcount records starting from split_index to the new block. */ + num_moved = le16_to_cpu(rl->rl_used) - split_index; + memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /*ok, remove the entries we just moved over to the other block. */ + memset(&rl->rl_recs[split_index], 0, + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /* change old and new rl_used accordingly. 
*/ + le16_add_cpu(&rl->rl_used, -num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); + + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + *split_cpos = cpos; + return 0; +} + +static int ocfs2_new_leaf_refcount_block(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got, new_cpos; + u64 suballoc_loc, blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_extent_tree ref_et; + + BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Initialize ocfs2_refcount_block. */ + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(new_rb, 0, sb->s_blocksize); + strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + new_rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + new_rb->rf_generation = root_rb->rf_generation; + + ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + ocfs2_journal_dirty(handle, new_bh); + + ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); + + trace_ocfs2_new_leaf_refcount_block( + (unsigned long long)new_bh->b_blocknr, new_cpos); + + /* Insert the new leaf block with the specific offset cpos. */ + ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, + 1, 0, meta_ac); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_expand_refcount_tree(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *expand_bh = NULL; + + if (ref_root_bh == ref_leaf_bh) { + /* + * the old root bh hasn't been expanded to a b-tree, + * so expand it first. 
+ */ + ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, + &expand_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + expand_bh = ref_leaf_bh; + get_bh(expand_bh); + } + + + /* Now add a new refcount block into the tree.*/ + ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, + expand_bh, meta_ac); + if (ret) + mlog_errno(ret); +out: + brelse(expand_bh); + return ret; +} + +/* + * Adjust the extent rec in b-tree representing ref_leaf_bh. + * + * Only called when we have inserted a new refcount rec at index 0 + * which means ocfs2_extent_rec.e_cpos may need some change. + */ +static int ocfs2_adjust_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec) +{ + int ret = 0, i; + u32 new_cpos, old_cpos; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct ocfs2_extent_list *el; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + old_cpos = le32_to_cpu(rb->rf_cpos); + new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; + if (old_cpos <= new_cpos) + goto out; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, path, old_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * 2 more credits, one for the leaf refcount block, one for + * the extent block contains the extent rec. + */ + ret = ocfs2_extend_trans(handle, 2); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* change the leaf extent block first. */ + el = path_leaf_el(path); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) + if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) + break; + + BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); + + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + + /* change the r_cpos in the leaf block. 
*/ + rb->rf_cpos = cpu_to_le32(new_cpos); + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_insert_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + if (rf_list->rl_used == rf_list->rl_count) { + u64 cpos = le64_to_cpu(rec->r_cpos); + u32 len = le32_to_cpu(rec->r_clusters); + + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, NULL, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index < le16_to_cpu(rf_list->rl_used)) + memmove(&rf_list->rl_recs[index + 1], + &rf_list->rl_recs[index], + (le16_to_cpu(rf_list->rl_used) - index) * + sizeof(struct ocfs2_refcount_rec)); + + trace_ocfs2_insert_refcount_rec( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount)); + + rf_list->rl_recs[index] = *rec; + + le16_add_cpu(&rf_list->rl_used, 1); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ocfs2_journal_dirty(handle, ref_leaf_bh); + + if (index == 0) { + ret = ocfs2_adjust_refcount_rec(handle, ci, + ref_root_bh, + ref_leaf_bh, rec); + if (ret) + mlog_errno(ret); + } +out: + brelse(new_bh); + return ret; +} + +/* + * Split the refcount_rec indexed by "index" in ref_leaf_bh. + * This is much simple than our b-tree code. + * split_rec is the new refcount rec we want to insert. + * If split_rec->r_refcount > 0, we are changing the refcount(in case we + * increase refcount or decrease a refcount to non-zero). + * If split_rec->r_refcount == 0, we are punching a hole in current refcount + * rec( in case we decrease a refcount to zero). 
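+ *
+ * A worked example (hypothetical numbers): if orig_rec covers clusters
+ * [100, 110) with r_refcount 2 and split_rec covers [103, 105) with
+ * r_refcount 3, we end up with three records: [100, 103) refcount 2,
+ * [103, 105) refcount 3 and [105, 110) refcount 2, i.e. recs_need == 2.
+ * If split_rec touches the head or the tail of orig_rec only one extra
+ * record is needed, and none at all when we punch a hole at the head
+ * or the tail (split_rec->r_refcount == 0).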
+ */ +static int ocfs2_split_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *split_rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, recs_need; + u32 len; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; + struct ocfs2_refcount_rec *tail_rec = NULL; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos), + le32_to_cpu(orig_rec->r_clusters), + le32_to_cpu(orig_rec->r_refcount), + le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); + + /* + * If we just need to split the header or tail clusters, + * no more recs are needed, just split is OK. + * Otherwise we at least need one new recs. + */ + if (!split_rec->r_refcount && + (split_rec->r_cpos == orig_rec->r_cpos || + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) == + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need = 0; + else + recs_need = 1; + + /* + * We need one more rec if we split in the middle and the new rec have + * some refcount in it. + */ + if (split_rec->r_refcount && + (split_rec->r_cpos != orig_rec->r_cpos && + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) != + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need++; + + /* If the leaf block don't have enough record, expand it. */ + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { + struct ocfs2_refcount_rec tmp_rec; + u64 cpos = le64_to_cpu(orig_rec->r_cpos); + len = le32_to_cpu(orig_rec->r_clusters); + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have to re-get it since now cpos may be moved to + * another leaf block. + */ + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &tmp_rec, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + orig_rec = &rf_list->rl_recs[index]; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have calculated out how many new records we need and store + * in recs_need, so spare enough space first by moving the records + * after "index" to the end. + */ + if (index != le16_to_cpu(rf_list->rl_used) - 1) + memmove(&rf_list->rl_recs[index + 1 + recs_need], + &rf_list->rl_recs[index + 1], + (le16_to_cpu(rf_list->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + + len = (le64_to_cpu(orig_rec->r_cpos) + + le32_to_cpu(orig_rec->r_clusters)) - + (le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we have "len", the we will split in the tail and move it + * to the end of the space we have just spared. 
+ */ + if (len) { + tail_rec = &rf_list->rl_recs[index + recs_need]; + + memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); + le64_add_cpu(&tail_rec->r_cpos, + le32_to_cpu(tail_rec->r_clusters) - len); + tail_rec->r_clusters = cpu_to_le32(len); + } + + /* + * If the split pos isn't the same as the original one, we need to + * split in the head. + * + * Note: We have the chance that split_rec.r_refcount = 0, + * recs_need = 0 and len > 0, which means we just cut the head from + * the orig_rec and in that case we have done some modification in + * orig_rec above, so the check for r_cpos is faked. + */ + if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { + len = le64_to_cpu(split_rec->r_cpos) - + le64_to_cpu(orig_rec->r_cpos); + orig_rec->r_clusters = cpu_to_le32(len); + index++; + } + + le16_add_cpu(&rf_list->rl_used, recs_need); + + if (split_rec->r_refcount) { + rf_list->rl_recs[index] = *split_rec; + trace_ocfs2_split_refcount_rec_insert( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + brelse(new_bh); + return ret; +} + +static int __ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_refcount_rec rec; + unsigned int set_len = 0; + + trace_ocfs2_increase_refcount_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + set_len = le32_to_cpu(rec.r_clusters); + + /* + * Here we may meet with 3 situations: + * + * 1. If we find an already existing record, and the length + * is the same, cool, we just need to increase the r_refcount + * and it is OK. + * 2. If we find a hole, just insert it with r_refcount = 1. + * 3. If we are in the middle of one extent record, split + * it. 
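+ *
+ * For example (hypothetical numbers): raising the refcount of clusters
+ * [200, 210) when a record already covers exactly [200, 210) just bumps
+ * that record's r_refcount (case 1); if no record covers the range we
+ * insert a new record with r_refcount = 1 (case 2); if an existing
+ * record covers [195, 215) we split out [200, 210) and bump only that
+ * piece (case 3).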
+ */ + if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && + set_len <= len) { + trace_ocfs2_increase_refcount_change( + (unsigned long long)cpos, set_len, + le32_to_cpu(rec.r_refcount)); + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, + merge, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (!rec.r_refcount) { + rec.r_refcount = cpu_to_le32(1); + + trace_ocfs2_increase_refcount_insert( + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len); + ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, + &rec, index, + merge, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + set_len = min((u64)(cpos + len), + le64_to_cpu(rec.r_cpos) + set_len) - cpos; + rec.r_cpos = cpu_to_le64(cpos); + rec.r_clusters = cpu_to_le32(set_len); + le32_add_cpu(&rec.r_refcount, 1); + + trace_ocfs2_increase_refcount_split( + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len, le32_to_cpu(rec.r_refcount)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &rec, index, merge, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += set_len; + len -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +static int ocfs2_remove_refcount_extent(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_extent_tree et; + + BUG_ON(rb->rf_records.rl_used); + + trace_ocfs2_remove_refcount_extent( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rb->rf_cpos)); + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), + 1, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(ci, ref_leaf_bh); + + /* + * add the freed block to the dealloc so that it will be freed + * when we run dealloc. + */ + ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_suballoc_loc), + le64_to_cpu(rb->rf_blkno), + le16_to_cpu(rb->rf_suballoc_bit)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + le32_add_cpu(&rb->rf_clusters, -1); + + /* + * check whether we need to restore the root refcount block if + * there is no leaf extent block at atll. 
+ */ + if (!rb->rf_list.l_next_free_rec) { + BUG_ON(rb->rf_clusters); + + trace_ocfs2_restore_refcount_block( + (unsigned long long)ref_root_bh->b_blocknr); + + rb->rf_flags = 0; + rb->rf_parent = 0; + rb->rf_cpos = 0; + memset(&rb->rf_records, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records)); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + } + + ocfs2_journal_dirty(handle, ref_root_bh); + +out: + return ret; +} + +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + return __ocfs2_increase_refcount(handle, ci, ref_root_bh, + cpos, len, 1, + meta_ac, dealloc); +} + +static int ocfs2_decrease_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + int index, u64 cpos, unsigned int len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; + + BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); + BUG_ON(cpos + len > + le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); + + trace_ocfs2_decrease_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + if (cpos == le64_to_cpu(rec->r_cpos) && + len == le32_to_cpu(rec->r_clusters)) + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, 1, -1); + else { + struct ocfs2_refcount_rec split = *rec; + split.r_cpos = cpu_to_le64(cpos); + split.r_clusters = cpu_to_le32(len); + + le32_add_cpu(&split.r_refcount, -1); + + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &split, index, 1, + meta_ac, dealloc); + } + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Remove the leaf refcount block if it contains no refcount record. 
*/ + if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { + ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + } + +out: + return ret; +} + +static int __ocfs2_decrease_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret = 0, index = 0; + struct ocfs2_refcount_rec rec; + unsigned int r_count = 0, r_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *ref_leaf_bh = NULL; + + trace_ocfs2_decrease_refcount( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len, delete); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + r_count = le32_to_cpu(rec.r_refcount); + BUG_ON(r_count == 0); + if (!delete) + BUG_ON(r_count > 1); + + r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + + ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, index, + cpos, r_len, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le32_to_cpu(rec.r_refcount) == 1 && delete) { + ret = ocfs2_cache_cluster_dealloc(dealloc, + ocfs2_clusters_to_blocks(sb, cpos), + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += r_len; + len -= r_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* Caller must hold refcount tree lock. */ +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret; + u64 ref_blkno; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, + cpos, len, meta_ac, dealloc, delete); + if (ret) + mlog_errno(ret); +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Mark the already-existing extent at cpos as refcounted for len clusters. + * This adds the refcount extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. 
+ */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given some contiguous physical clusters, calculate what we need + * for modifying their refcount. + */ +static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 start_cpos, + u32 clusters, + int *meta_add, + int *credits) +{ + int ret = 0, index, ref_blocks = 0, recs_add = 0; + u64 cpos = start_cpos; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; + u32 len; + + while (clusters) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, clusters, &rec, + &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ref_leaf_bh != prev_bh) { + /* + * Now we encounter a new leaf block, so calculate + * whether we need to extend the old leaf. + */ + if (prev_bh) { + rb = (struct ocfs2_refcount_block *) + prev_bh->b_data; + + if (le16_to_cpu(rb->rf_records.rl_used) + + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + } + + recs_add = 0; + *credits += 1; + brelse(prev_bh); + prev_bh = ref_leaf_bh; + get_bh(prev_bh); + } + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + trace_ocfs2_calc_refcount_meta_credits_iterate( + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); + + len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + /* + * We record all the records which will be inserted to the + * same refcount block, so that we can tell exactly whether + * we need a new refcount block or not. + * + * If we will insert a new one, this is easy and only happens + * during adding refcounted flag to the extent, so we don't + * have a chance of spliting. We just need one record. + * + * If the refcount rec already exists, that would be a little + * complicated. we may have to: + * 1) split at the beginning if the start pos isn't aligned. + * we need 1 more record in this case. + * 2) split int the end if the end pos isn't aligned. + * we need 1 more record in this case. + * 3) split in the middle because of file system fragmentation. + * we need 2 more records in this case(we can't detect this + * beforehand, so always think of the worst case). + */ + if (rec.r_refcount) { + recs_add += 2; + /* Check whether we need a split at the beginning. */ + if (cpos == start_cpos && + cpos != le64_to_cpu(rec.r_cpos)) + recs_add++; + + /* Check whether we need a split in the end. 
*/ + if (cpos + clusters < le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) + recs_add++; + } else + recs_add++; + + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + clusters -= len; + cpos += len; + } + + if (prev_bh) { + rb = (struct ocfs2_refcount_block *)prev_bh->b_data; + + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + } + + if (!ref_blocks) + goto out; + + *meta_add += ref_blocks; + *credits += ref_blocks; + + /* + * So we may need ref_blocks to insert into the tree. + * That also means we need to change the b-tree and add that number + * of records since we never merge them. + * We need one more block for expansion since the newly created leaf + * block is also full and needs to be split. + */ + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + *meta_add += ocfs2_extend_meta_needed(et.et_root_el); + *credits += ocfs2_calc_extend_credits(sb, + et.et_root_el, + ref_blocks); + } else { + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + *meta_add += 1; + } + +out: + + trace_ocfs2_calc_refcount_meta_credits( + (unsigned long long)start_cpos, clusters, + *meta_add, *credits); + brelse(ref_leaf_bh); + brelse(prev_bh); + return ret; +} + +/* + * For a refcount tree, we are going to decrease the refcount of some + * contiguous clusters, so just go through the records to see how many + * refcount blocks we are going to touch and whether we need to create + * new blocks. + * + * Normally the refcount blocks storing these refcounts should be + * contiguous as well, so we can get the number easily. + * We will at most add or split 2 refcount records and add 2 more + * refcount blocks, so just check it in a rough way. + * + * Caller must hold refcount tree lock.
+ */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + u64 refcount_loc, + u64 phys_blkno, + u32 clusters, + int *credits, + int *ref_blocks) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + refcount_loc, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + &tree->rf_ci, + ref_root_bh, + start_cpos, clusters, + ref_blocks, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits); + +out: + brelse(ref_root_bh); + return ret; +} + +#define MAX_CONTIG_BYTES 1048576 + +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) +{ + return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); +} + +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) +{ + return ~(ocfs2_cow_contig_clusters(sb) - 1); +} + +/* + * Given an extent that starts at 'start' and an I/O that starts at 'cpos', + * find an offset (start + (n * contig_clusters)) that is closest to cpos + * while still being less than or equal to it. + * + * The goal is to break the extent at a multiple of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, + unsigned int start, + unsigned int cpos) +{ + BUG_ON(start > cpos); + + return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); +} + +/* + * Given a cluster count of len, pad it out so that it is a multiple + * of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, + unsigned int len) +{ + unsigned int padded = + (len + (ocfs2_cow_contig_clusters(sb) - 1)) & + ocfs2_cow_contig_mask(sb); + + /* Did we wrap? */ + if (padded < len) + padded = UINT_MAX; + + return padded; +} + +/* + * Calculate the start and the number of virtual clusters we need to CoW. + * + * cpos is the virtual start cluster position at which we want to CoW in a + * file and write_len is the cluster length. + * max_cpos is the place where we want to stop CoW intentionally. + * + * Normally we will start CoW from the beginning of the extent record + * containing cpos. We try to break up extents on boundaries of + * MAX_CONTIG_BYTES so that we get good I/O from the resulting extent tree.
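+ *
+ * A worked example of the helpers above (assuming a hypothetical 4KB
+ * cluster size, so contig_clusters = 1MB / 4KB = 256):
+ * ocfs2_cow_align_start(sb, 1000, 1700) returns
+ * 1000 + ((1700 - 1000) & ~255) = 1512, the largest offset of the form
+ * start + n * 256 that is still <= cpos, while
+ * ocfs2_cow_align_length(sb, 300) rounds 300 up to 512.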
+ */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + u32 cpos, + u32 write_len, + u32 max_cpos, + u32 *cow_start, + u32 *cow_len) +{ + int ret = 0; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + unsigned int want_clusters, rec_end = 0; + int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); + int leaf_clusters; + + BUG_ON(cpos + write_len > max_cpos); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + /* + * We should find a refcounted record in the + * first pass. + */ + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + } + + /* + * If we encounter a hole, a non-refcounted record or + * pass the max_cpos, stop the search. + */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || + (max_cpos <= le32_to_cpu(rec->e_cpos))) + break; + + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + if (rec_end > max_cpos) { + rec_end = max_cpos; + leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); + } + + /* + * How many clusters do we actually need from + * this extent? First we see how many we actually + * need to complete the write. If that's smaller + * than contig_clusters, we try for contig_clusters. + */ + if (!*cow_len) + want_clusters = write_len; + else + want_clusters = (cpos + write_len) - + (*cow_start + *cow_len); + if (want_clusters < contig_clusters) + want_clusters = contig_clusters; + + /* + * If the write does not cover the whole extent, we + * need to calculate how we're going to split the extent. + * We try to do it on contig_clusters boundaries. + * + * Any extent smaller than contig_clusters will be + * CoWed in its entirety. + */ + if (leaf_clusters <= contig_clusters) + *cow_len += leaf_clusters; + else if (*cow_len || (*cow_start == cpos)) { + /* + * This extent needs to be CoW'd from its + * beginning, so all we have to do is compute + * how many clusters to grab. We align + * want_clusters to the edge of contig_clusters + * to get better I/O. + */ + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + + if (leaf_clusters < want_clusters) + *cow_len += leaf_clusters; + else + *cow_len += want_clusters; + } else if ((*cow_start + contig_clusters) >= + (cpos + write_len)) { + /* + * Breaking off contig_clusters at the front + * of the extent will cover our write. That's + * easy. + */ + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= contig_clusters) { + /* + * Breaking off contig_clusters at the tail of + * this extent will cover cpos. 
+ */ + *cow_start = rec_end - contig_clusters; + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= want_clusters) { + /* + * While we can't fit the entire write in this + * extent, we know that the write goes from cpos + * to the end of the extent. Break that off. + * We try to break it at some multiple of + * contig_clusters from the front of the extent. + * Failing that (ie, cpos is within + * contig_clusters of the front), we'll CoW the + * entire extent. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + *cow_len = rec_end - *cow_start; + } else { + /* + * Ok, the entire write lives in the middle of + * this extent. Let's try to slice the extent up + * nicely. Optimally, our CoW region starts at + * m*contig_clusters from the beginning of the + * extent and goes for n*contig_clusters, + * covering the entire write. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + + want_clusters = (cpos + write_len) - *cow_start; + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + if (*cow_start + want_clusters <= rec_end) + *cow_len = want_clusters; + else + *cow_len = rec_end - *cow_start; + } + + /* Have we covered our entire write yet? */ + if ((*cow_start + *cow_len) >= (cpos + write_len)) + break; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. + */ + if (i + 1 == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = -1; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare meta_ac, data_ac and calculate credits when we want to add some + * num_clusters in data_tree "et" and change the refcount for the old + * clusters(starting form p_cluster) in the refcount tree. + * + * Note: + * 1. since we may split the old tree, so we at most will need num_clusters + 2 + * more new leaf records. + * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so + * just give data_ac = NULL. 
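+ *
+ * 3. *credits is added to rather than overwritten, so any credits the
+ * caller has already accumulated are preserved.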
+ */ +static int ocfs2_lock_refcount_allocators(struct super_block *sb, + u32 p_cluster, u32 num_clusters, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int *credits) +{ + int ret = 0, meta_add = 0; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < num_clusters + 2) + meta_add = + ocfs2_extend_meta_needed(et->et_root_el); + + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, + num_clusters + 2); + + ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, + p_cluster, num_clusters, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_lock_refcount_allocators(meta_add, *credits); + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, + data_ac); + if (ret) + mlog_errno(ret); + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUG_ON(buffer_dirty(bh)); + + clear_buffer_mapped(bh); + + return 0; +} + +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, partial; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct page *page; + pgoff_t page_index; + unsigned int from, to, readahead_pages; + loff_t offset, end, map_end; + struct address_space *mapping = inode->i_mapping; + + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); + + readahead_pages = + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + /* + * We only duplicate pages until we reach the page contains i_size - 1. + * So trim 'end' to i_size. + */ + if (end > i_size_read(inode)) + end = i_size_read(inode); + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + /* from, to is the offset within the page. */ + from = offset & (PAGE_CACHE_SIZE - 1); + to = PAGE_CACHE_SIZE; + if (map_end & (PAGE_CACHE_SIZE - 1)) + to = map_end & (PAGE_CACHE_SIZE - 1); + + page = find_or_create_page(mapping, page_index, GFP_NOFS); + + /* + * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * can't be dirtied before we CoW it out. 
+ */ + if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + BUG_ON(PageDirty(page)); + + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + &file->f_ra, file, + page, page_index, + readahead_pages); + } + + if (!PageUptodate(page)) { + ret = block_read_full_page(page, ocfs2_get_block); + if (ret) { + mlog_errno(ret); + goto unlock; + } + lock_page(page); + } + + if (page_has_buffers(page)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_clear_cow_buffer); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + ocfs2_map_and_dirty_page(inode, handle, from, to, + page, 0, &new_block); + mark_page_accessed(page); +unlock: + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); + int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *old_bh = NULL; + struct buffer_head *new_bh = NULL; + + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); + + for (i = 0; i < blocks; i++, old_block++, new_block++) { + new_bh = sb_getblk(osb->sb, new_block); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_journal_access(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); + ocfs2_journal_dirty(handle, new_bh); + + brelse(new_bh); + brelse(old_bh); + new_bh = NULL; + old_bh = NULL; + } + + brelse(new_bh); + brelse(old_bh); + return ret; +} + +static int ocfs2_clear_ext_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 p_cluster, u32 len, + unsigned int ext_flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + struct ocfs2_extent_rec replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 ino = ocfs2_metadata_cache_owner(et->et_ci); + + trace_ocfs2_clear_ext_refcount((unsigned long long)ino, + cpos, len, p_cluster, ext_flags); + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, + p_cluster)); + replace_rec.e_flags = ext_flags; + replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = 
-EROFS; + goto out; + } + + ret = ocfs2_split_extent(handle, et, path, index, + &replace_rec, meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_replace_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old, + u32 new, u32 len, + unsigned int ext_flags) +{ + int ret; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + u64 ino = ocfs2_metadata_cache_owner(ci); + + trace_ocfs2_replace_clusters((unsigned long long)ino, + cpos, old, new, len, ext_flags); + + /*If the old clusters is unwritten, no need to duplicate. */ + if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = context->cow_duplicate_clusters(handle, context->file, + cpos, old, new, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_clear_ext_refcount(handle, &context->data_et, + cpos, new, len, ext_flags, + context->meta_ac, &context->dealloc); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters) +{ + int ret = 0; + loff_t offset, end, map_end; + pgoff_t page_index; + struct page *page; + + if (ocfs2_should_order_data(inode)) + return 0; + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + + ret = filemap_fdatawrite_range(inode->i_mapping, + offset, end - 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + page = find_or_create_page(inode->i_mapping, + page_index, GFP_NOFS); + BUG_ON(!page); + + wait_on_page_writeback(page); + if (PageError(page)) { + ret = -EIO; + mlog_errno(ret); + } else + mark_page_accessed(page); + + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, + num_clusters, extent_flags); +} + +static int ocfs2_make_clusters_writable(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 p_cluster, + u32 num_clusters, unsigned int e_flags) +{ + int ret, delete, index, credits = 0; + u32 new_bit, new_len, orig_num_clusters; + unsigned int set_len; + struct ocfs2_super *osb = OCFS2_SB(sb); + handle_t *handle; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; + struct ocfs2_refcount_rec rec; + + trace_ocfs2_make_clusters_writable(cpos, p_cluster, + num_clusters, e_flags); + + ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, + &context->data_et, + ref_ci, + context->ref_root_bh, + &context->meta_ac, + &context->data_ac, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (context->post_refcount) + credits += context->post_refcount->credits; + + credits += context->extra_credits; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + orig_num_clusters = num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, + p_cluster, num_clusters, + &rec, &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; 
+ } + + BUG_ON(!rec.r_refcount); + set_len = min((u64)p_cluster + num_clusters, + le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and don't COW. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. + */ + if (le32_to_cpu(rec.r_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, ref_ci, + context->ref_root_bh, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos += set_len; + p_cluster += set_len; + num_clusters -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + + /* handle any post_cow action. */ + if (context->post_refcount && context->post_refcount->func) { + ret = context->post_refcount->func(context->inode, handle, + context->post_refcount->para); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + if (context->get_clusters == ocfs2_di_get_clusters) { + ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, + orig_num_clusters); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + brelse(ref_leaf_bh); + + return ret; +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context) +{ + int ret = 0; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + ocfs2_init_dealloc_ctxt(&context->dealloc); + + while (cow_len) { + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(inode->i_sb, context, + cow_start, p_cluster, + num_clusters, ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + } + + return ret; +} + +static void ocfs2_readahead_for_cow(struct inode *inode, + struct file *file, + u32 start, u32 len) +{ + struct address_space *mapping; + pgoff_t index; + unsigned 
long num_pages; + int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + + if (!file) + return; + + mapping = file->f_mapping; + num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; + if (!num_pages) + num_pages = 1; + + index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; + page_cache_sync_readahead(mapping, &file->f_ra, file, + index, num_pages); +} + +/* + * Starting at cpos, try to CoW write_len clusters. Don't CoW + * past max_cpos. This will stop when it runs into a hole or an + * unrefcounted extent. + */ +static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct file *file, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret; + u32 cow_start = 0, cow_len = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_cow_context *context = NULL; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, + cpos, write_len, max_cpos, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno, + cpos, write_len, max_cpos, + cow_start, cow_len); + + BUG_ON(cow_len == 0); + + ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; + context->get_clusters = ocfs2_di_get_clusters; + context->file = file; + + ocfs2_init_dinode_extent_tree(&context->data_et, + INODE_CACHE(inode), di_bh); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. + */ + ocfs2_extent_map_trunc(inode, cow_start); + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + kfree(context); + return ret; +} + +/* + * CoW any and all clusters between cpos and cpos+write_len. + * Don't CoW past max_cpos. If this returns successfully, all + * clusters between cpos and cpos+write_len are safe to modify. 
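+ *
+ * Only the refcounted hunks inside the range are actually CoWed: the
+ * loop below skips extents without OCFS2_EXT_REFCOUNTED and hands each
+ * refcounted hunk to ocfs2_refcount_cow_hunk().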
+ */ +int ocfs2_refcount_cow(struct inode *inode, + struct file *file, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + while (write_len) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + if (write_len < num_clusters) + num_clusters = write_len; + + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, + num_clusters, max_cpos); + if (ret) { + mlog_errno(ret); + break; + } + } + + write_len -= num_clusters; + cpos += num_clusters; + } + + return ret; +} + +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_object; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Given a xattr value root, calculate the most meta/credits we need for + * refcount tree change if we truncate it to 0. + */ +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits) +{ + int ret = 0, index, ref_blocks = 0; + u32 p_cluster, num_clusters; + u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL; + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, + p_cluster, num_clusters, + &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!rec.r_refcount); + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + /* + * We really don't know whether the other clusters is in + * this refcount block or not, so just take the worst + * case that all the clusters are in this block and each + * one will split a refcount rec, so totally we need + * clusters * 2 new refcount rec. + */ + if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + + if (num_clusters <= le32_to_cpu(rec.r_clusters)) + break; + else + num_clusters -= le32_to_cpu(rec.r_clusters); + p_cluster += num_clusters; + } + } + + *meta_add += ref_blocks; + if (!ref_blocks) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + else { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); + *credits += ocfs2_calc_extend_credits(inode->i_sb, + et.et_root_el, + ref_blocks); + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* + * Do CoW for xattr. 
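+ *
+ * Unlike regular file data, which is duplicated through the page cache
+ * (ocfs2_duplicate_clusters_by_page), xattr value clusters are copied
+ * block by block through the journal, so cow_duplicate_clusters is set
+ * to ocfs2_duplicate_clusters_by_jbd and extra journal credits are
+ * reserved for every block that will be copied.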
+ */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context *context = NULL; + u32 cow_start, cow_len; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, UINT_MAX, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_object = xv; + + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; + /* We need the extra credits for duplicate_clusters by jbd. */ + context->extra_credits = + ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; + context->get_clusters = ocfs2_xattr_value_get_clusters; + context->post_refcount = post; + + ocfs2_init_xattr_value_extent_tree(&context->data_et, + INODE_CACHE(inode), vb); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + +out: + kfree(context); + return ret; +} + +/* + * Insert a new extent into refcount tree and mark a extent rec + * as refcounted in the dinode tree. + */ +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post) +{ + int ret; + handle_t *handle; + int credits = 1, ref_blocks = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + ref_ci, ref_root_bh, + p_cluster, num_clusters, + &ref_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_add_refcount_flag(ref_blocks, credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (post) + credits += post->credits; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, + cpos, num_clusters, p_cluster, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, 0, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_change_ctime(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + 
mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_attach_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, data_changed = 0; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_tree *ref_tree; + unsigned int ext_flags; + loff_t size; + u32 cpos, num_clusters, clusters, p_cluster; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree di_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + ret = ocfs2_create_refcount_tree(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + BUG_ON(!di->i_refcount_loc); + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto attach_xattr; + + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); + + size = i_size_read(inode); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(inode, &di_et, + &ref_tree->rf_ci, + ref_root_bh, cpos, + p_cluster, num_clusters, + &dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + data_changed = 1; + } + cpos += num_clusters; + } + +attach_xattr: + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, + &ref_tree->rf_ci, + ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + if (data_changed) { + ret = ocfs2_change_ctime(inode, di_bh); + if (ret) + mlog_errno(ret); + } + +unlock: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } +out: + /* + * Empty the extent map so that we may get the right extent + * record from the disk. 
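+ * The cached extent records were read before the refcount flag was
+ * set on the on-disk records above, so they are stale at this point.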
+ */ + ocfs2_extent_map_trunc(inode, 0); + + return ret; +} + +static int ocfs2_add_refcounted_extent(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + unsigned int ext_flags, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + handle_t *handle; + int credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_refcount_allocators(inode->i_sb, + p_cluster, num_clusters, + et, ref_ci, + ref_root_bh, &meta_ac, + NULL, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), + num_clusters, ext_flags, meta_ac); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_duplicate_inline_data(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; + + BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; + memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, + le16_to_cpu(s_di->id2.i_data.id_count)); + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_duplicate_extent_list(struct inode *s_inode, + struct inode *t_inode, + struct buffer_head *t_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + unsigned int ext_flags; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster) { + ret = ocfs2_add_refcounted_extent(t_inode, &et, + ref_ci, ref_root_bh, + cpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += num_clusters; + } + +out: + return ret; +} + +/* + * change the new file's attributes to the src. 
+ * + * reflink creates a snapshot of a file, that means the attributes + * must be identical except for three exceptions - nlink, ino, and ctime. + */ +static int ocfs2_complete_reflink(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; + loff_t size = i_size_read(s_inode); + + handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; + OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + t_inode->i_blocks = s_inode->i_blocks; + + di->i_xattr_inline_size = s_di->i_xattr_inline_size; + di->i_clusters = s_di->i_clusters; + di->i_size = s_di->i_size; + di->i_dyn_features = s_di->i_dyn_features; + di->i_attr = s_di->i_attr; + + if (preserve) { + t_inode->i_uid = s_inode->i_uid; + t_inode->i_gid = s_inode->i_gid; + t_inode->i_mode = s_inode->i_mode; + di->i_uid = s_di->i_uid; + di->i_gid = s_di->i_gid; + di->i_mode = s_di->i_mode; + + /* + * update time. + * we want mtime to appear identical to the source and + * update ctime. + */ + t_inode->i_ctime = CURRENT_TIME; + + di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + + t_inode->i_mtime = s_inode->i_mtime; + di->i_mtime = s_di->i_mtime; + di->i_mtime_nsec = s_di->i_mtime_nsec; + } + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); + return ret; +} + +static int ocfs2_create_reflink_node(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_refcount_block *rb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_refcount_tree *ref_tree; + + ocfs2_init_dealloc_ctxt(&dealloc); + + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(di->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, + t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, + &ref_tree->rf_ci, ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +static int 
__ocfs2_reflink(struct dentry *old_dentry, + struct buffer_head *old_bh, + struct inode *new_inode, + bool preserve) +{ + int ret; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *new_bh = NULL; + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + ret = filemap_fdatawrite(inode->i_mapping); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_attach_refcount_tree(inode, old_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, + OI_LS_REFLINK_TARGET); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_create_reflink_node(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_reflink_xattrs(inode, old_bh, + new_inode, new_bh, + preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + } + + ret = ocfs2_complete_reflink(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) + mlog_errno(ret); + +inode_unlock: + ocfs2_inode_unlock(new_inode, 1); + brelse(new_bh); +out_unlock: + mutex_unlock(&new_inode->i_mutex); +out: + if (!ret) { + ret = filemap_fdatawait(inode->i_mapping); + if (ret) + mlog_errno(ret); + } + return ret; +} + +static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + int error; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *old_bh = NULL; + struct inode *new_orphan_inode = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + &new_orphan_inode); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_inode_lock(inode, &old_bh, 1); + if (error) { + mlog_errno(error); + goto out; + } + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + down_write(&OCFS2_I(inode)->ip_alloc_sem); + error = __ocfs2_reflink(old_dentry, old_bh, + new_orphan_inode, preserve); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 1); + brelse(old_bh); + + if (error) { + mlog_errno(error); + goto out; + } + + /* If the security isn't preserved, we need to re-initialize them. */ + if (!preserve) { + error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + &new_dentry->d_name); + if (error) + mlog_errno(error); + } +out: + if (!error) { + error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + new_dentry); + if (error) + mlog_errno(error); + } + + if (new_orphan_inode) { + /* + * We need to open_unlock the inode no matter whether we + * succeed or not, so that other nodes can delete it later. + */ + ocfs2_open_unlock(new_orphan_inode); + if (error) + iput(new_orphan_inode); + } + + return error; +} + +/* + * Below here are the bits used by OCFS2_IOC_REFLINK() to fake + * sys_reflink(). This will go away when vfs_reflink() exists in + * fs/namei.c. + */ + +/* copied from may_create in VFS. */ +static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* copied from user_path_parent. 
*/ +static int ocfs2_user_path_parent(const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = kern_path_parent(s, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/** + * ocfs2_vfs_reflink - Create a reference-counted link + * + * @old_dentry: source dentry + inode + * @dir: directory to create the target + * @new_dentry: target dentry + * @preserve: if true, preserve all file attributes + */ +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = ocfs2_may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A reflink to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + /* Only regular files can be reflinked. */ + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + /* + * If the caller wants to preserve ownership, they require the + * rights to do so. + */ + if (preserve) { + if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) + return -EPERM; + } + + /* + * If the caller is modifying any aspect of the attributes, they + * are not creating a snapshot. They need read permission on the + * file. + */ + if (!preserve) { + error = inode_permission(inode, MAY_READ); + if (error) + return error; + } + + mutex_lock(&inode->i_mutex); + dquot_initialize(dir); + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_create(dir, new_dentry); + return error; +} +/* + * Most codes are copied from sys_linkat. + */ +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int error; + char *to = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_user_path_parent(newname, &nd, &to); + if (error) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out_unlock; + } + + error = mnt_want_write(nd.path.mnt); + if (error) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + nd.path.dentry->d_inode, + new_dentry, preserve); + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h new file mode 100644 index 00000000..7754608c --- /dev/null +++ b/fs/ocfs2/refcounttree.h @@ -0,0 +1,118 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.h + * + * Copyright (C) 2009 Oracle. All rights reserved. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+struct ocfs2_refcount_tree {
+ struct rb_node rf_node;
+ u64 rf_blkno;
+ u32 rf_generation;
+ struct kref rf_getcnt;
+ struct rw_semaphore rf_sem;
+ struct ocfs2_lock_res rf_lockres;
+ int rf_removed;
+
+ /* the following 4 fields are used by caching_info. */
+ spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
+ struct mutex rf_io_mutex;
+ struct super_block *rf_sb;
+};
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **tree,
+ struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree,
+ int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ u64 refcount_loc,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ int *ref_blocks);
+int ocfs2_refcount_cow(struct inode *inode,
+ struct file *filep, struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+ handle_t *handle,
+ void *para);
+/*
+ * Some refcount callers need to do more work after we modify the data b-tree
+ * during a refcount operation (including CoW and setting the refcount flag),
+ * and that work must be part of the same transaction. Such callers hand us
+ * this structure so that we can call back into them before the transaction
+ * is committed.
+ */
+struct ocfs2_post_refcount {
+ int credits; /* journal credits the callback needs. */
+ ocfs2_post_refcount_func *func; /* the callback to run.
*/ + void *para; +}; + +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits); +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post); +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters); +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post); +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve); +#endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c new file mode 100644 index 00000000..41ffd36c --- /dev/null +++ b/fs/ocfs2/reservations.c @@ -0,0 +1,839 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "ocfs2_trace.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct ocfs2_super *osb = resmap->m_osb; + unsigned int bits; + + if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { + /* 8, 16, 32, 64, 128, 256, 512, 1024 */ + bits = 4 << osb->osb_resv_level; + } else { + bits = 4 << osb->osb_dir_resv_level; + } + return bits; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found. 
LRU follows\n", i); + + i = 0; + list_for_each_entry(resv, &resmap->m_lru, r_lru) { + mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" + "last_start: %u\tlast_len: %u\n", i, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + i++; + } +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, + int i, + struct ocfs2_alloc_reservation *resv) +{ + char *disk_bitmap = resmap->m_disk_bitmap; + unsigned int start = resv->r_start; + unsigned int end = ocfs2_resv_end(resv); + + while (start <= end) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(ML_ERROR, + "reservation %d covers an allocated area " + "starting at bit %u!\n", i, start); + return 1; + } + + start++; + } + return 0; +} + +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + if (ocfs2_validate_resmap_bits(resmap, i, resv)) + goto bad; + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); + INIT_LIST_HEAD(&resv->r_lru); +} + +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags) +{ + BUG_ON(flags & ~OCFS2_RESV_TYPES); + + resv->r_flags |= flags; +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. */ + INIT_LIST_HEAD(&resmap->m_lru); + + return 0; +} + +static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + if (!list_empty(&resv->r_lru)) + list_del_init(&resv->r_lru); + + list_add_tail(&resv->r_lru, &resmap->m_lru); +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_start = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { + list_del_init(&resv->r_lru); + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. 
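+ * Clearing them also keeps ocfs2_resv_find_window() from seeding its
+ * search goal from a window that no longer applies.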
+ */ + resv->r_last_len = resv->r_last_start = 0; + + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + } +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + resmap->m_disk_bitmap = disk_bitmap; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + trace_ocfs2_resv_insert(new->r_start, new->r_len); + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) { + p = &(*p)->rb_left; + + /* + * This is a good place to check for + * overlapping reservations. + */ + BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); + } else if (new->r_start > ocfs2_resv_end(tmp)) { + p = &(*p)->rb_right; + } else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_flags |= OCFS2_RESV_FLAG_INUSE; + + ocfs2_resv_mark_lru(resmap, new); + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv_lhs() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. + */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv = NULL; + struct ocfs2_alloc_reservation *prev_resv = NULL; + struct rb_node *node = resmap->m_reservations.rb_node; + + assert_spin_locked(&resv_lock); + + if (!node) + return NULL; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) + break; + + /* Check if we overshot the reservation just before goal? */ + if (resv->r_start > goal) { + resv = prev_resv; + break; + } + + prev_resv = resv; + node = rb_next(node); + } + + return resv; +} + +/* + * We are given a range within the bitmap, which corresponds to a gap + * inside the reservations tree (search_start, search_len). The range + * can be anything from the whole bitmap, to a gap between + * reservations. + * + * The start value of *rstart is insignificant. 
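+ *
+ * (A purely hypothetical illustration: if the tree holds windows at
+ * bits 0..63 and 200..263 of a 1024 bit bitmap, the gaps a caller may
+ * pass in are 64..199 and 264..1023.)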
+ * + * This function searches the bitmap range starting at search_start + * with length search_len for a set of contiguous free bits. We try + * to find up to 'wanted' bits, but can sometimes return less. + * + * Returns the length of allocation, 0 if no free bits are found. + * + * *cstart and *clen will also be populated with the result. + */ +static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, + unsigned int wanted, + unsigned int search_start, + unsigned int search_len, + unsigned int *rstart, + unsigned int *rlen) +{ + void *bitmap = resmap->m_disk_bitmap; + unsigned int best_start, best_len = 0; + int offset, start, found; + + trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len, + wanted, resmap->m_bitmap_len); + + found = best_start = best_len = 0; + + start = search_start; + while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, + start)) != -1) { + /* Search reached end of the region */ + if (offset >= (search_start + search_len)) + break; + + if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_len) { + best_len = found; + best_start = start - found; + } + + if (found >= wanted) + break; + } + + if (best_len == 0) + return 0; + + if (best_len >= wanted) + best_len = wanted; + + *rlen = best_len; + *rstart = best_start; + + trace_ocfs2_resmap_find_free_bits_end(best_start, best_len); + + return *rlen; +} + +static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int goal, unsigned int wanted) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + unsigned int cstart, clen; + unsigned int best_start = 0, best_len = 0; + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv), + goal, wanted, RB_EMPTY_ROOT(root)); + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window of free bits we want. + */ + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + resmap->m_bitmap_len - goal, + &cstart, &clen); + + /* + * This should never happen - the local alloc window + * will always have free bits when we're called. + */ + BUG_ON(goal == 0 && clen == 0); + + if (clen == 0) + return; + + resv->r_start = cstart; + resv->r_len = clen; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv_lhs(resmap, goal); + + if (prev_resv == NULL) { + /* + * A NULL here means that the search code couldn't + * find a window that starts before goal. + * + * However, we can take the first window after goal, + * which is also by definition, the leftmost window in + * the entire tree. If we can find free bits in the + * gap between goal and the LHS window, then the + * reservation can safely be placed there. + * + * Otherwise we fall back to a linear search, checking + * the gaps in between windows for a place to + * allocate. 
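+ * (The linear scan below starts from that leftmost window: prev_resv
+ * is pointed at it before the loop.)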
+ */ + + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * The search should never return such a window. (see + * comment above + */ + if (next_resv->r_start <= goal) { + mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", + goal, next_resv->r_start, next_resv->r_len); + ocfs2_dump_resv(resmap); + BUG(); + } + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + next_resv->r_start - goal, + &cstart, &clen); + if (clen) { + best_len = clen; + best_start = cstart; + if (best_len == wanted) + goto out_insert; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + trace_ocfs2_resv_find_window_prev(prev_resv->r_start, + ocfs2_resv_end(prev_resv)); + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. + */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_len = resmap->m_bitmap_len - gap_start; + gap_end = resmap->m_bitmap_len - 1; + } + + trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1, + next ? ocfs2_resv_end(next_resv) : -1); + /* + * No need to check this gap if we have already found + * a larger region of free bits. + */ + if (gap_len <= best_len) + goto next_resv; + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, + gap_len, &cstart, &clen); + if (clen == wanted) { + best_len = clen; + best_start = cstart; + goto out_insert; + } else if (clen > best_len) { + best_len = clen; + best_start = cstart; + } + +next_resv: + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } + +out_insert: + if (best_len) { + resv->r_start = best_start; + resv->r_len = best_len; + ocfs2_resv_insert(resmap, resv); + } +} + +static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + struct ocfs2_alloc_reservation *lru_resv; + int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); + unsigned int min_bits; + + if (!tmpwindow) + min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; + else + min_bits = wanted; /* We at know the temp window will use all + * of these bits */ + + /* + * Take the first reservation off the LRU as our 'target'. We + * don't try to be smart about it. There might be a case for + * searching based on size but I don't have enough data to be + * sure. --Mark (3/16/2010) + */ + lru_resv = list_first_entry(&resmap->m_lru, + struct ocfs2_alloc_reservation, r_lru); + + trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start, + lru_resv->r_len, + ocfs2_resv_end(lru_resv)); + + /* + * Cannibalize (some or all) of the target reservation and + * feed it to the current window. + */ + if (lru_resv->r_len <= min_bits) { + /* + * Discard completely if size is less than or equal to a + * reasonable threshold - 50% of window bits for non temporary + * windows. 
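+ * (With the default resv level of 2, for instance, a regular file
+ * window is 4 << 2 = 16 bits, so an LRU entry of 8 bits or fewer is
+ * consumed whole.)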
+ */ + resv->r_start = lru_resv->r_start; + resv->r_len = lru_resv->r_len; + + __ocfs2_resv_discard(resmap, lru_resv); + } else { + unsigned int shrink; + if (tmpwindow) + shrink = min_bits; + else + shrink = lru_resv->r_len / 2; + + lru_resv->r_len -= shrink; + + resv->r_start = ocfs2_resv_end(lru_resv) + 1; + resv->r_len = shrink; + } + + trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); + + ocfs2_resv_insert(resmap, resv); +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + unsigned int goal = 0; + + BUG_ON(!ocfs2_resv_empty(resv)); + + /* + * Begin by trying to get a window as close to the previous + * one as possible. Using the most recent allocation as a + * start goal makes sense. + */ + if (resv->r_last_len) { + goal = resv->r_last_start + resv->r_last_len; + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + __ocfs2_resv_find_window(resmap, resv, goal, wanted); + + /* Search from last alloc didn't work, try once more from beginning. */ + if (ocfs2_resv_empty(resv) && goal != 0) + __ocfs2_resv_find_window(resmap, resv, 0, wanted); + + if (ocfs2_resv_empty(resv)) { + /* + * Still empty? Pull oldest one off the LRU, remove it from + * tree, put this one in it's place. + */ + ocfs2_cannibalize_resv(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen) +{ + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + if (ocfs2_resv_empty(resv)) { + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); + + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + /* + * Try to get a window here. If it works, we must fall + * through and test the bitmap . This avoids some + * ping-ponging of windows due to non-reserved space + * being allocation before we initialize a window for + * that inode. + */ + ocfs2_resv_find_window(resmap, resv, wanted); + trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len); + } + + BUG_ON(ocfs2_resv_empty(resv)); + + *cstart = resv->r_start; + *clen = resv->r_len; + + spin_unlock(&resv_lock); + return 0; +} + +static void + ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int start, unsigned int end) +{ + unsigned int rhs = 0; + unsigned int old_end = ocfs2_resv_end(resv); + + BUG_ON(start != resv->r_start || old_end < end); + + /* + * Completely used? We can remove it then. + */ + if (old_end == end) { + __ocfs2_resv_discard(resmap, resv); + return; + } + + rhs = old_end - end; + + /* + * This should have been trapped above. 
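+ * (The old_end == end case already returned through the discard path,
+ * so a zero rhs would mean the bounds checks above are broken.)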
+ */ + BUG_ON(rhs == 0); + + resv->r_start = end + 1; + resv->r_len = old_end - resv->r_start + 1; +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + BUG_ON(cstart != resv->r_start); + + spin_lock(&resv_lock); + + trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, + resv->r_last_len); + + BUG_ON(cstart < resv->r_start); + BUG_ON(cstart > ocfs2_resv_end(resv)); + BUG_ON(cend > ocfs2_resv_end(resv)); + + ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); + resv->r_last_start = cstart; + resv->r_last_len = clen; + + /* + * May have been discarded above from + * ocfs2_adjust_resv_from_alloc(). + */ + if (!ocfs2_resv_empty(resv)) + ocfs2_resv_mark_lru(resmap, resv); + + trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); + + ocfs2_check_resmap(resmap); + + spin_unlock(&resv_lock); +} diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h new file mode 100644 index 00000000..42c2b804 --- /dev/null +++ b/fs/ocfs2/reservations.h @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include + +#define OCFS2_DEFAULT_RESV_LEVEL 2 +#define OCFS2_MAX_RESV_LEVEL 9 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; /* Beginning of current window */ + unsigned int r_len; /* Length of the window */ + + unsigned int r_last_len; /* Length of most recent alloc */ + unsigned int r_last_start; /* Start of most recent alloc */ + struct list_head r_lru; /* LRU list head */ + + unsigned int r_flags; +}; + +#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ +#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be + * destroyed immedately after use */ +#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed + * directory btree */ + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + char *m_disk_bitmap; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ + + struct list_head m_lru; /* LRU of reservations + * structures. 
*/ + +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR) +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags); + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @resmap: struct ocfs2_reservation_map to initialize + * @obj: unused for now + * @ops: unused for now + * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) + * + * Only possible return value other than '0' is -ENOMEM for failure to + * allocation mirror bitmap. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * @disk_bitmap: the disk bitmap this resmap should refer to. + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * This function will call ocfs2_trunc_resv against all existing + * reservations. A future version will recalculate existing + * reservations based on the new bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and resmap->m_disk_bitmap to determine what part (if any) of + * the reservation window is still clear to use. If resv is empty, + * this function will try to allocate a window for it. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. + * + * Returns -ENOSPC if reservations are disabled. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. It also expects that 'cstart' is the same as we passed back + * from ocfs2_resmap_resv_bits(). 
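+ *
+ * A rough calling sequence (an illustrative sketch only, not copied
+ * from any in-tree caller; the local variable names are made up):
+ *
+ *   clen = wanted;
+ *   if (ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen) == 0) {
+ *           <set bits cstart .. cstart + clen - 1 in the disk bitmap>
+ *           ocfs2_resmap_claimed_bits(resmap, resv, cstart, clen);
+ *   }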
+ */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c new file mode 100644 index 00000000..ec55add7 --- /dev/null +++ b/fs/ocfs2/resize.c @@ -0,0 +1,589 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.c + * + * volume resize. + * Inspired by ext3/resize.c. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "resize.h" + +/* + * Check whether there are new backup superblocks exist + * in the last group. If there are some, mark them or clear + * them in the bitmap. + * + * Return how many backups we find in the last group. + */ +static u16 ocfs2_calc_new_backup_super(struct inode *inode, + struct ocfs2_group_desc *gd, + int new_clusters, + u32 first_new_cluster, + u16 cl_cpg, + int set) +{ + int i; + u16 backups = 0; + u32 cluster; + u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + gd_blkno = ocfs2_which_cluster_group(inode, cluster); + if (gd_blkno < lgd_blkno) + continue; + else if (gd_blkno > lgd_blkno) + break; + + if (set) + ocfs2_set_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + else + ocfs2_clear_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + backups++; + } + + return backups; +} + +static int ocfs2_update_last_group_and_inode(handle_t *handle, + struct inode *bm_inode, + struct buffer_head *bm_bh, + struct buffer_head *group_bh, + u32 first_new_cluster, + int new_clusters) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct ocfs2_chain_rec *cr; + struct ocfs2_group_desc *group; + u16 chain, num_bits, backups = 0; + u16 cl_bpc = le16_to_cpu(cl->cl_bpc); + u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + + trace_ocfs2_update_last_group_and_inode(new_clusters, + first_new_cluster); + + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + + /* update the group first. 
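+ * Each new cluster contributes cl_bpc bits to this group's bitmap.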
*/ + num_bits = new_clusters * cl_bpc; + le16_add_cpu(&group->bg_bits, num_bits); + le16_add_cpu(&group->bg_free_bits_count, num_bits); + + /* + * check whether there are some new backup superblocks exist in + * this group and update the group bitmap accordingly. + */ + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_BACKUP_SB)) { + backups = ocfs2_calc_new_backup_super(bm_inode, + group, + new_clusters, + first_new_cluster, + cl_cpg, 1); + le16_add_cpu(&group->bg_free_bits_count, -1 * backups); + } + + ocfs2_journal_dirty(handle, group_bh); + + /* update the inode accordingly. */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_rollback; + } + + chain = le16_to_cpu(group->bg_chain); + cr = (&cl->cl_recs[chain]); + le32_add_cpu(&cr->c_total, num_bits); + le32_add_cpu(&cr->c_free, num_bits); + le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits); + le32_add_cpu(&fe->i_clusters, new_clusters); + + if (backups) { + le32_add_cpu(&cr->c_free, -1 * backups); + le32_add_cpu(&fe->id1.bitmap1.i_used, backups); + } + + spin_lock(&OCFS2_I(bm_inode)->ip_lock); + OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(bm_inode)->ip_lock); + i_size_write(bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_journal_dirty(handle, bm_bh); + +out_rollback: + if (ret < 0) { + ocfs2_calc_new_backup_super(bm_inode, + group, + new_clusters, + first_new_cluster, + cl_cpg, 0); + le16_add_cpu(&group->bg_free_bits_count, backups); + le16_add_cpu(&group->bg_bits, -1 * num_bits); + le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + } +out: + if (ret) + mlog_errno(ret); + return ret; +} + +static int update_backups(struct inode * inode, u32 clusters, char *data) +{ + int i, ret = 0; + u32 cluster; + u64 blkno; + struct buffer_head *backup = NULL; + struct ocfs2_dinode *backup_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* calculate the real backups we need to update. */ + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if (cluster > clusters) + break; + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); + if (ret < 0) { + mlog_errno(ret); + break; + } + + memcpy(backup->b_data, data, inode->i_sb->s_blocksize); + + backup_di = (struct ocfs2_dinode *)backup->b_data; + backup_di->i_blkno = cpu_to_le64(blkno); + + ret = ocfs2_write_super_or_backup(osb, backup); + brelse(backup); + backup = NULL; + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static void ocfs2_update_super_and_backups(struct inode *inode, + int new_clusters) +{ + int ret; + u32 clusters = 0; + struct buffer_head *super_bh = NULL; + struct ocfs2_dinode *super_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* + * update the superblock last. + * It doesn't matter if the write failed. 
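+ * If it does fail we only warn; the printk below asks the
+ * administrator to run fsck.ocfs2, which can fix it up afterwards.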
+ */ + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + super_di = (struct ocfs2_dinode *)super_bh->b_data; + le32_add_cpu(&super_di->i_clusters, new_clusters); + clusters = le32_to_cpu(super_di->i_clusters); + + ret = ocfs2_write_super_or_backup(osb, super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB)) + ret = update_backups(inode, clusters, super_bh->b_data); + +out: + brelse(super_bh); + if (ret) + printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s" + " during fs resize. This condition is not fatal," + " but fsck.ocfs2 should be run to fix it\n", + osb->dev_str); + return; +} + +/* + * Extend the filesystem to the new number of clusters specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. + */ +int ocfs2_group_extend(struct inode * inode, int new_clusters) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct buffer_head *group_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 cl_bpc; + u32 first_new_cluster; + u64 lgd_blkno; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + if (new_clusters < 0) + return -EINVAL; + else if (new_clusters == 0) + return 0; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), + * so any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { + mlog(ML_ERROR, "The disk is too old and small. " + "Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + first_new_cluster = le32_to_cpu(fe->i_clusters); + lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, + first_new_cluster - 1); + + ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, + &group_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + group = (struct ocfs2_group_desc *)group_bh->b_data; + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > + le16_to_cpu(fe->id2.i_chain.cl_cpg)) { + ret = -EINVAL; + goto out_unlock; + } + + + trace_ocfs2_group_extend( + (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + /* update the last group descriptor and inode. 
*/ + ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode, + main_bm_bh, group_bh, + first_new_cluster, + new_clusters); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_update_super_and_backups(main_bm_inode, new_clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + return ret; +} + +static int ocfs2_check_new_group(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + int ret; + struct ocfs2_group_desc *gd = + (struct ocfs2_group_desc *)group_bh->b_data; + u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); + + ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); + if (ret) + goto out; + + ret = -EINVAL; + if (le16_to_cpu(gd->bg_chain) != input->chain) + mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " + "while input has %u set.\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_chain), input->chain); + else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " + "input has %u clusters set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), input->clusters); + else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u " + "but it should have %u set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + input->frees * cl_bpc); + else + ret = 0; + +out: + return ret; +} + +static int ocfs2_verify_group_and_input(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count); + u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); + u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec); + u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group); + u32 total_clusters = le32_to_cpu(di->i_clusters); + int ret = -EINVAL; + + if (cluster < total_clusters) + mlog(ML_ERROR, "add a group which is in the current volume.\n"); + else if (input->chain >= cl_count) + mlog(ML_ERROR, "input chain exceeds the limit.\n"); + else if (next_free != cl_count && next_free != input->chain) + mlog(ML_ERROR, + "the add group should be in chain %u\n", next_free); + else if (total_clusters + input->clusters < total_clusters) + mlog(ML_ERROR, "add group's clusters overflow.\n"); + else if (input->clusters > cl_cpg) + mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n"); + else if (input->frees > input->clusters) + mlog(ML_ERROR, "the free cluster exceeds the total clusters\n"); + else if (total_clusters % cl_cpg != 0) + mlog(ML_ERROR, + "the last group isn't full. Use group extend first.\n"); + else if (input->group != ocfs2_which_cluster_group(inode, cluster)) + mlog(ML_ERROR, "group blkno is invalid\n"); + else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh))) + mlog(ML_ERROR, "group descriptor check failed.\n"); + else + ret = 0; + + return ret; +} + +/* Add a new group descriptor to global_bitmap. 
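+ * The descriptor block itself is expected to have been written to disk
+ * already (typically by the userspace resize tool); here we validate it
+ * and splice it into the requested chain.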
*/ +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *cr; + u16 cl_bpc; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { + mlog(ML_ERROR, "The disk is too old and small." + " Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); + if (ret < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # %llu " + "from the device.", (unsigned long long)input->group); + goto out_unlock; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh); + + ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + trace_ocfs2_group_add((unsigned long long)input->group, + input->chain, input->clusters, input->frees); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + cl = &fe->id2.i_chain; + cr = &cl->cl_recs[input->chain]; + + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + group->bg_next_group = cr->c_blkno; + ocfs2_journal_dirty(handle, group_bh); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) { + le16_add_cpu(&cl->cl_next_free_rec, 1); + memset(cr, 0, sizeof(struct ocfs2_chain_rec)); + } + + cr->c_blkno = cpu_to_le64(input->group); + le32_add_cpu(&cr->c_total, input->clusters * cl_bpc); + le32_add_cpu(&cr->c_free, input->frees * cl_bpc); + + le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc); + le32_add_cpu(&fe->id1.bitmap1.i_used, + (input->clusters - input->frees) * cl_bpc); + le32_add_cpu(&fe->i_clusters, input->clusters); + + ocfs2_journal_dirty(handle, main_bm_bh); + + spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); + OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); + i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_update_super_and_backups(main_bm_inode, input->clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + 
iput(main_bm_inode); + +out: + return ret; +} diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h new file mode 100644 index 00000000..f38841ab --- /dev/null +++ b/fs/ocfs2/resize.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.h + * + * Function prototypes + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_RESIZE_H +#define OCFS2_RESIZE_H + +int ocfs2_group_extend(struct inode * inode, int new_clusters); +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input); + +#endif /* OCFS2_RESIZE_H */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 00000000..26fc0014 --- /dev/null +++ b/fs/ocfs2/slot_map.c @@ -0,0 +1,538 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slot_map.c + * + * + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + + +struct ocfs2_slot { + int sl_valid; + unsigned int sl_node_num; +}; + +struct ocfs2_slot_info { + int si_extended; + int si_slots_per_block; + struct inode *si_inode; + unsigned int si_blocks; + struct buffer_head **si_bh; + unsigned int si_num_slots; + struct ocfs2_slot *si_slots; +}; + + +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num); + +static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + si->si_slots[slot_num].sl_valid = 0; +} + +static void ocfs2_set_slot(struct ocfs2_slot_info *si, + int slot_num, unsigned int node_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + + si->si_slots[slot_num].sl_valid = 1; + si->si_slots[slot_num].sl_node_num = node_num; +} + +/* This version is for the extended slot map */ +static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si) +{ + int b, i, slotno; + struct ocfs2_slot_map_extended *se; + + slotno = 0; + for (b = 0; b < si->si_blocks; b++) { + se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; + for (i = 0; + (i < si->si_slots_per_block) && + (slotno < si->si_num_slots); + i++, slotno++) { + if (se->se_slots[i].es_valid) + ocfs2_set_slot(si, slotno, + le32_to_cpu(se->se_slots[i].es_node_num)); + else + ocfs2_invalidate_slot(si, slotno); + } + } +} + +/* + * Post the slot information on disk into our slot_info struct. + * Must be protected by osb_lock. + */ +static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + + for (i = 0; i < si->si_num_slots; i++) { + if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) + ocfs2_invalidate_slot(si, i); + else + ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); + } +} + +static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +{ + /* + * The slot data will have been refreshed when ocfs2_super_lock + * was taken. + */ + if (si->si_extended) + ocfs2_update_slot_info_extended(si); + else + ocfs2_update_slot_info_old(si); +} + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb) +{ + int ret; + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + BUG_ON(si->si_blocks == 0); + BUG_ON(si->si_bh == NULL); + + trace_ocfs2_refresh_slot_info(si->si_blocks); + + /* + * We pass -1 as blocknr because we expect all of si->si_bh to + * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If + * this is not true, the read of -1 (UINT64_MAX) will fail. + */ + ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks, + si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL); + if (ret == 0) { + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + spin_unlock(&osb->osb_lock); + } + + return ret; +} + +/* post the our slot info stuff into it's destination bh and write it + * out. 
*/ +static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int blkind = slot_num / si->si_slots_per_block; + int slotno = slot_num % si->si_slots_per_block; + struct ocfs2_slot_map_extended *se; + + BUG_ON(blkind >= si->si_blocks); + + se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; + se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; + if (si->si_slots[slot_num].sl_valid) + se->se_slots[slotno].es_node_num = + cpu_to_le32(si->si_slots[slot_num].sl_node_num); + *bh = si->si_bh[blkind]; +} + +static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + for (i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid) + sm->sm_slots[i] = + cpu_to_le16(si->si_slots[i].sl_node_num); + else + sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + } + *bh = si->si_bh[0]; +} + +static int ocfs2_update_disk_slot(struct ocfs2_super *osb, + struct ocfs2_slot_info *si, + int slot_num) +{ + int status; + struct buffer_head *bh; + + spin_lock(&osb->osb_lock); + if (si->si_extended) + ocfs2_update_disk_slot_extended(si, slot_num, &bh); + else + ocfs2_update_disk_slot_old(si, slot_num, &bh); + spin_unlock(&osb->osb_lock); + + status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode)); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * Calculate how many bytes are needed by the slot map. Returns + * an error if the slot map file is too small. + */ +static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, + struct inode *inode, + unsigned long long *bytes) +{ + unsigned long long bytes_needed; + + if (ocfs2_uses_extended_slot_map(osb)) { + bytes_needed = osb->max_slots * + sizeof(struct ocfs2_extended_slot); + } else { + bytes_needed = osb->max_slots * sizeof(__le16); + } + if (bytes_needed > i_size_read(inode)) { + mlog(ML_ERROR, + "Slot map file is too small! (size %llu, needed %llu)\n", + i_size_read(inode), bytes_needed); + return -ENOSPC; + } + + *bytes = bytes_needed; + return 0; +} + +/* try to find global node in the slot info. Returns -ENOENT + * if nothing is found. 
*/ +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num) +{ + int i, ret = -ENOENT; + + for(i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid && + (node_num == si->si_slots[i].sl_node_num)) { + ret = i; + break; + } + } + + return ret; +} + +static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, + int preferred) +{ + int i, ret = -ENOSPC; + + if ((preferred >= 0) && (preferred < si->si_num_slots)) { + if (!si->si_slots[preferred].sl_valid) { + ret = preferred; + goto out; + } + } + + for(i = 0; i < si->si_num_slots; i++) { + if (!si->si_slots[i].sl_valid) { + ret = i; + break; + } + } +out: + return ret; +} + +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) +{ + int slot; + struct ocfs2_slot_info *si = osb->slot_info; + + spin_lock(&osb->osb_lock); + slot = __ocfs2_node_num_to_slot(si, node_num); + spin_unlock(&osb->osb_lock); + + return slot; +} + +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + assert_spin_locked(&osb->osb_lock); + + BUG_ON(slot_num < 0); + BUG_ON(slot_num > osb->max_slots); + + if (!si->si_slots[slot_num].sl_valid) + return -ENOENT; + + *node_num = si->si_slots[slot_num].sl_node_num; + return 0; +} + +static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + unsigned int i; + + if (si == NULL) + return; + + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) { + for (i = 0; i < si->si_blocks; i++) { + if (si->si_bh[i]) { + brelse(si->si_bh[i]); + si->si_bh[i] = NULL; + } + } + kfree(si->si_bh); + } + + kfree(si); +} + +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, slot_num); + spin_unlock(&osb->osb_lock); + + return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); +} + +static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) +{ + int status = 0; + u64 blkno; + unsigned long long blocks, bytes = 0; + unsigned int i; + struct buffer_head *bh; + + status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); + if (status) + goto bail; + + blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); + BUG_ON(blocks > UINT_MAX); + si->si_blocks = blocks; + if (!si->si_blocks) + goto bail; + + if (si->si_extended) + si->si_slots_per_block = + (osb->sb->s_blocksize / + sizeof(struct ocfs2_extended_slot)); + else + si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); + + /* The size checks above should ensure this */ + BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); + + trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); + + si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + GFP_KERNEL); + if (!si->si_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + for (i = 0; i < si->si_blocks; i++) { + status = ocfs2_extent_map_get_blocks(si->si_inode, i, + &blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i); + + bh = NULL; /* Acquire a fresh bh */ + status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, + 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh[i] = bh; + } + +bail: + return status; +} + +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status; + struct 
inode *inode = NULL; + struct ocfs2_slot_info *si; + + si = kzalloc(sizeof(struct ocfs2_slot_info) + + (sizeof(struct ocfs2_slot) * osb->max_slots), + GFP_KERNEL); + if (!si) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + si->si_extended = ocfs2_uses_extended_slot_map(osb); + si->si_num_slots = osb->max_slots; + si->si_slots = (struct ocfs2_slot *)((char *)si + + sizeof(struct ocfs2_slot_info)); + + inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + si->si_inode = inode; + status = ocfs2_map_slot_buffers(osb, si); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + osb->slot_info = (struct ocfs2_slot_info *)si; +bail: + if (status < 0 && si) + __ocfs2_free_slot_info(si); + + return status; +} + +void ocfs2_free_slot_info(struct ocfs2_super *osb) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + osb->slot_info = NULL; + __ocfs2_free_slot_info(si); +} + +int ocfs2_find_slot(struct ocfs2_super *osb) +{ + int status; + int slot; + struct ocfs2_slot_info *si; + + si = osb->slot_info; + + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", + slot); + + ocfs2_set_slot(si, slot, osb->node_num); + osb->slot_num = slot; + spin_unlock(&osb->osb_lock); + + trace_ocfs2_find_slot(osb->slot_num); + + status = ocfs2_update_disk_slot(osb, si, osb->slot_num); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +void ocfs2_put_slot(struct ocfs2_super *osb) +{ + int status, slot_num; + struct ocfs2_slot_info *si = osb->slot_info; + + if (!si) + return; + + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + + slot_num = osb->slot_num; + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + + status = ocfs2_update_disk_slot(osb, si, slot_num); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + ocfs2_free_slot_info(osb); +} + diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 00000000..601c95fd --- /dev/null +++ b/fs/ocfs2/slot_map.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slotmap.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +int ocfs2_init_slot_info(struct ocfs2_super *osb); +void ocfs2_free_slot_info(struct ocfs2_super *osb); + +int ocfs2_find_slot(struct ocfs2_super *osb); +void ocfs2_put_slot(struct ocfs2_super *osb); + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb); + +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num); + +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); + +#endif diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c new file mode 100644 index 00000000..19965b00 --- /dev/null +++ b/fs/ocfs2/stack_o2cb.c @@ -0,0 +1,393 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_o2cb.c + * + * Code which interfaces ocfs2 with the o2cb stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include + +/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ +#include + +#include "cluster/masklog.h" +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" + +#include "stackglue.h" + +struct o2dlm_private { + struct dlm_eviction_cb op_eviction_cb; +}; + +static struct ocfs2_stack_plugin o2cb_stack; + +/* These should be identical */ +#if (DLM_LOCK_IV != LKM_IVMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_NL != LKM_NLMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CR != LKM_CRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CW != LKM_CWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PR != LKM_PRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PW != LKM_PWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_EX != LKM_EXMODE) +# error Lock modes do not match +#endif +static inline int mode_to_o2dlm(int mode) +{ + BUG_ON(mode > LKM_MAXMODE); + + return mode; +} + +#define map_flag(_generic, _o2dlm) \ + if (flags & (_generic)) { \ + flags &= ~(_generic); \ + o2dlm_flags |= (_o2dlm); \ + } +static int flags_to_o2dlm(u32 flags) +{ + int o2dlm_flags = 0; + + map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); + map_flag(DLM_LKF_CANCEL, LKM_CANCEL); + map_flag(DLM_LKF_CONVERT, LKM_CONVERT); + map_flag(DLM_LKF_VALBLK, LKM_VALBLK); + map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); + map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); + map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); + map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); + map_flag(DLM_LKF_LOCAL, LKM_LOCAL); + + /* map_flag() should have cleared every flag passed in */ + BUG_ON(flags != 0); + + return o2dlm_flags; +} +#undef map_flag + +/* + * Map an o2dlm status to standard errno values. + * + * o2dlm only uses a handful of these, and returns even fewer to the + * caller. Still, we try to assign sane values to each error. 
+ * + * The following value pairs have special meanings to dlmglue, thus + * the right hand side needs to stay unique - never duplicate the + * mapping elsewhere in the table! + * + * DLM_NORMAL: 0 + * DLM_NOTQUEUED: -EAGAIN + * DLM_CANCELGRANT: -EBUSY + * DLM_CANCEL: -DLM_ECANCEL + */ +/* Keep in sync with dlmapi.h */ +static int status_map[] = { + [DLM_NORMAL] = 0, /* Success */ + [DLM_GRANTED] = -EINVAL, + [DLM_DENIED] = -EACCES, + [DLM_DENIED_NOLOCKS] = -EACCES, + [DLM_WORKING] = -EACCES, + [DLM_BLOCKED] = -EINVAL, + [DLM_BLOCKED_ORPHAN] = -EINVAL, + [DLM_DENIED_GRACE_PERIOD] = -EACCES, + [DLM_SYSERR] = -ENOMEM, /* It is what it is */ + [DLM_NOSUPPORT] = -EPROTO, + [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ + [DLM_IVLOCKID] = -EINVAL, + [DLM_SYNC] = -EINVAL, + [DLM_BADTYPE] = -EINVAL, + [DLM_BADRESOURCE] = -EINVAL, + [DLM_MAXHANDLES] = -ENOMEM, + [DLM_NOCLINFO] = -EINVAL, + [DLM_NOLOCKMGR] = -EINVAL, + [DLM_NOPURGED] = -EINVAL, + [DLM_BADARGS] = -EINVAL, + [DLM_VOID] = -EINVAL, + [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ + [DLM_IVBUFLEN] = -EINVAL, + [DLM_CVTUNGRANT] = -EPERM, + [DLM_BADPARAM] = -EINVAL, + [DLM_VALNOTVALID] = -EINVAL, + [DLM_REJECTED] = -EPERM, + [DLM_ABORT] = -EINVAL, + [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ + [DLM_IVRESHANDLE] = -EINVAL, + [DLM_DEADLOCK] = -EDEADLK, + [DLM_DENIED_NOASTS] = -EINVAL, + [DLM_FORWARD] = -EINVAL, + [DLM_TIMEOUT] = -ETIMEDOUT, + [DLM_IVGROUPID] = -EINVAL, + [DLM_VERS_CONFLICT] = -EOPNOTSUPP, + [DLM_BAD_DEVICE_PATH] = -ENOENT, + [DLM_NO_DEVICE_PERMISSION] = -EPERM, + [DLM_NO_CONTROL_DEVICE] = -ENOENT, + [DLM_RECOVERING] = -ENOTCONN, + [DLM_MIGRATING] = -ERESTART, + [DLM_MAXSTATS] = -EINVAL, +}; + +static int dlm_status_to_errno(enum dlm_status status) +{ + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); + + return status_map[status]; +} + +static void o2dlm_lock_ast_wrapper(void *astarg) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); +} + +static void o2dlm_blocking_ast_wrapper(void *astarg, int level) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); +} + +static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + int error = dlm_status_to_errno(status); + + /* + * In o2dlm, you can get both the lock_ast() for the lock being + * granted and the unlock_ast() for the CANCEL failing. A + * successful cancel sends DLM_NORMAL here. If the + * lock grant happened before the cancel arrived, you get + * DLM_CANCELGRANT. + * + * There's no need for the double-ast. If we see DLM_CANCELGRANT, + * we just ignore it. We expect the lock_ast() to handle the + * granted lock. 
+ */ + if (status == DLM_CANCELGRANT) + return; + + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error); +} + +static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + enum dlm_status status; + int o2dlm_mode = mode_to_o2dlm(mode); + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, + o2dlm_flags, name, namelen, + o2dlm_lock_ast_wrapper, lksb, + o2dlm_blocking_ast_wrapper); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + enum dlm_status status; + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, + o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return dlm_status_to_errno(lksb->lksb_o2dlm.status); +} + +/* + * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * contents, it will zero out the LVB. Thus the caller can always trust + * the contents. + */ +static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + return 1; +} + +static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + return (void *)(lksb->lksb_o2dlm.lvb); +} + +static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ + dlm_print_one_lock(lksb->lksb_o2dlm.lockid); +} + +/* + * Called from the dlm when it's about to evict a node. This is how the + * classic stack signals node death. + */ +static void o2dlm_eviction_cb(int node_num, void *data) +{ + struct ocfs2_cluster_connection *conn = data; + + mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", + node_num, conn->cc_namelen, conn->cc_name); + + conn->cc_recovery_handler(node_num, conn->cc_recovery_data); +} + +static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + int rc = 0; + u32 dlm_key; + struct dlm_ctxt *dlm; + struct o2dlm_private *priv; + struct dlm_protocol_version fs_version; + + BUG_ON(conn == NULL); + BUG_ON(conn->cc_proto == NULL); + + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (!o2hb_check_local_node_heartbeating()) { + if (o2hb_global_heartbeat_active()) + mlog(ML_ERROR, "Global heartbeat not started\n"); + rc = -EINVAL; + goto out; + } + + priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); + if (!priv) { + rc = -ENOMEM; + goto out_free; + } + + /* This just fills the structure in. It is safe to pass conn. */ + dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, + conn); + + conn->cc_private = priv; + + /* used by the dlm code to make message headers unique, each + * node in this domain must agree on this. 
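+ * Deriving the key as a CRC32 of the domain name lets every node + * compute the same value independently.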
*/ + dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); + fs_version.pv_major = conn->cc_version.pv_major; + fs_version.pv_minor = conn->cc_version.pv_minor; + + dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version); + if (IS_ERR(dlm)) { + rc = PTR_ERR(dlm); + mlog_errno(rc); + goto out_free; + } + + conn->cc_version.pv_major = fs_version.pv_major; + conn->cc_version.pv_minor = fs_version.pv_minor; + conn->cc_lockspace = dlm; + + dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); + +out_free: + if (rc && conn->cc_private) + kfree(conn->cc_private); + +out: + return rc; +} + +static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + struct dlm_ctxt *dlm = conn->cc_lockspace; + struct o2dlm_private *priv = conn->cc_private; + + dlm_unregister_eviction_cb(&priv->op_eviction_cb); + conn->cc_private = NULL; + kfree(priv); + + dlm_unregister_domain(dlm); + conn->cc_lockspace = NULL; + + return 0; +} + +static int o2cb_cluster_this_node(unsigned int *node) +{ + int node_num; + + node_num = o2nm_this_node(); + if (node_num == O2NM_INVALID_NODE_NUM) + return -ENOENT; + + if (node_num >= O2NM_MAX_NODES) + return -EOVERFLOW; + + *node = node_num; + return 0; +} + +static struct ocfs2_stack_operations o2cb_stack_ops = { + .connect = o2cb_cluster_connect, + .disconnect = o2cb_cluster_disconnect, + .this_node = o2cb_cluster_this_node, + .dlm_lock = o2cb_dlm_lock, + .dlm_unlock = o2cb_dlm_unlock, + .lock_status = o2cb_dlm_lock_status, + .lvb_valid = o2cb_dlm_lvb_valid, + .lock_lvb = o2cb_dlm_lvb, + .dump_lksb = o2cb_dump_lksb, +}; + +static struct ocfs2_stack_plugin o2cb_stack = { + .sp_name = "o2cb", + .sp_ops = &o2cb_stack_ops, + .sp_owner = THIS_MODULE, +}; + +static int __init o2cb_stack_init(void) +{ + return ocfs2_stack_glue_register(&o2cb_stack); +} + +static void __exit o2cb_stack_exit(void) +{ + ocfs2_stack_glue_unregister(&o2cb_stack); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); +MODULE_LICENSE("GPL"); +module_init(o2cb_stack_init); +module_exit(o2cb_stack_exit); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c new file mode 100644 index 00000000..a5ebe421 --- /dev/null +++ b/fs/ocfs2/stack_user.c @@ -0,0 +1,908 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_user.c + * + * Code which interfaces ocfs2 with fs/dlm and a userspace stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "stackglue.h" + +#include + +/* + * The control protocol starts with a handshake. Until the handshake + * is complete, the control device will fail all write(2)s. + * + * The handshake is simple. First, the client reads until EOF. Each line + * of output is a supported protocol tag. All protocol tags are a single + * character followed by a two hex digit version number. Currently the + * only things supported is T01, for "Text-base version 0x01". Next, the + * client writes the version they would like to use, including the newline. 
+ * Thus, the protocol tag is 'T01\n'. If the version tag written is + * unknown, -EINVAL is returned. Once the negotiation is complete, the + * client can start sending messages. + * + * The T01 protocol has three messages. First is the "SETN" message. + * It has the following syntax: + * + * SETN<8-char-hex-nodenum> + * + * This is 14 characters. + * + * The "SETN" message must be the first message following the protocol. + * It tells ocfs2_control the local node number. + * + * Next comes the "SETV" message. It has the following syntax: + * + * SETV<2-char-hex-major><2-char-hex-minor> + * + * This is 11 characters. + * + * The "SETV" message sets the filesystem locking protocol version as + * negotiated by the client. The client negotiates based on the maximum + * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major + * number from the "SETV" message must match + * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number + * must be less than or equal to ...sp_max_version.pv_minor. + * + * Once this information has been set, mounts will be allowed. From this + * point on, the "DOWN" message can be sent for node down notification. + * It has the following syntax: + * + * DOWN<32-char-cap-hex-uuid><8-char-hex-nodenum> + * + * eg: + * + * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n + * + * This is 47 characters. + */ + +/* + * Whether or not the client has done the handshake. + * For now, we have just one protocol version. + */ +#define OCFS2_CONTROL_PROTO "T01\n" +#define OCFS2_CONTROL_PROTO_LEN 4 + +/* Handshake states */ +#define OCFS2_CONTROL_HANDSHAKE_INVALID (0) +#define OCFS2_CONTROL_HANDSHAKE_READ (1) +#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2) +#define OCFS2_CONTROL_HANDSHAKE_VALID (3) + +/* Messages */ +#define OCFS2_CONTROL_MESSAGE_OP_LEN 4 +#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN" +#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14 +#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV" +#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11 +#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN" +#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47 +#define OCFS2_TEXT_UUID_LEN 32 +#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 +#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 + +/* + * ocfs2_live_connection is refcounted because the filesystem and + * miscdevice sides can detach in different order. Let's just be safe. 
+ */ +struct ocfs2_live_connection { + struct list_head oc_list; + struct ocfs2_cluster_connection *oc_conn; +}; + +struct ocfs2_control_private { + struct list_head op_list; + int op_state; + int op_this_node; + struct ocfs2_protocol_version op_proto; +}; + +/* SETN<8-char-hex-nodenum> */ +struct ocfs2_control_message_setn { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +/* SETV<2-char-hex-major><2-char-hex-minor> */ +struct ocfs2_control_message_setv { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char space2; + char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char newline; +}; + +/* DOWN<32-char-cap-hex-uuid><8-char-hex-nodenum> */ +struct ocfs2_control_message_down { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char uuid[OCFS2_TEXT_UUID_LEN]; + char space2; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +union ocfs2_control_message { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + struct ocfs2_control_message_setn u_setn; + struct ocfs2_control_message_setv u_setv; + struct ocfs2_control_message_down u_down; +}; + +static struct ocfs2_stack_plugin ocfs2_user_plugin; + +static atomic_t ocfs2_control_opened; +static int ocfs2_control_this_node = -1; +static struct ocfs2_protocol_version running_proto; + +static LIST_HEAD(ocfs2_live_connection_list); +static LIST_HEAD(ocfs2_control_private_list); +static DEFINE_MUTEX(ocfs2_control_lock); + +static inline void ocfs2_control_set_handshake_state(struct file *file, + int state) +{ + struct ocfs2_control_private *p = file->private_data; + p->op_state = state; +} + +static inline int ocfs2_control_get_handshake_state(struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + return p->op_state; +} + +static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) +{ + size_t len = strlen(name); + struct ocfs2_live_connection *c; + + BUG_ON(!mutex_is_locked(&ocfs2_control_lock)); + + list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) { + if ((c->oc_conn->cc_namelen == len) && + !strncmp(c->oc_conn->cc_name, name, len)) + return c; + } + + return NULL; +} + +/* + * ocfs2_live_connection structures are created underneath the ocfs2 + * mount path. Since the VFS prevents multiple calls to + * fill_super(), we can't get dupes here. + */ +static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection **c_ret) +{ + int rc = 0; + struct ocfs2_live_connection *c; + + c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!c) + return -ENOMEM; + + mutex_lock(&ocfs2_control_lock); + c->oc_conn = conn; + + if (atomic_read(&ocfs2_control_opened)) + list_add(&c->oc_list, &ocfs2_live_connection_list); + else { + printk(KERN_ERR + "ocfs2: Userspace control daemon is not present\n"); + rc = -ESRCH; + } + + mutex_unlock(&ocfs2_control_lock); + + if (!rc) + *c_ret = c; + else + kfree(c); + + return rc; +} + +/* + * This function disconnects the cluster connection from ocfs2_control. + * Afterwards, userspace can't affect the cluster connection. 
+ */ +static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c) +{ + mutex_lock(&ocfs2_control_lock); + list_del_init(&c->oc_list); + c->oc_conn = NULL; + mutex_unlock(&ocfs2_control_lock); + + kfree(c); +} + +static int ocfs2_control_cfu(void *target, size_t target_len, + const char __user *buf, size_t count) +{ + /* The T01 expects write(2) calls to have exactly one command */ + if ((count != target_len) || + (count > sizeof(union ocfs2_control_message))) + return -EINVAL; + + if (copy_from_user(target, buf, target_len)) + return -EFAULT; + + return 0; +} + +static ssize_t ocfs2_control_validate_protocol(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + char kbuf[OCFS2_CONTROL_PROTO_LEN]; + + ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN, + buf, count); + if (ret) + return ret; + + if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN)) + return -EINVAL; + + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + return count; +} + +static void ocfs2_control_send_down(const char *uuid, + int nodenum) +{ + struct ocfs2_live_connection *c; + + mutex_lock(&ocfs2_control_lock); + + c = ocfs2_connection_find(uuid); + if (c) { + BUG_ON(c->oc_conn == NULL); + c->oc_conn->cc_recovery_handler(nodenum, + c->oc_conn->cc_recovery_data); + } + + mutex_unlock(&ocfs2_control_lock); +} + +/* + * Called whenever configuration elements are sent to /dev/ocfs2_control. + * If all configuration elements are present, try to set the global + * values. If there is a problem, return an error. Skip any missing + * elements, and only bump ocfs2_control_opened when we have all elements + * and are successful. + */ +static int ocfs2_control_install_private(struct file *file) +{ + int rc = 0; + int set_p = 1; + struct ocfs2_control_private *p = file->private_data; + + BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + mutex_lock(&ocfs2_control_lock); + + if (p->op_this_node < 0) { + set_p = 0; + } else if ((ocfs2_control_this_node >= 0) && + (ocfs2_control_this_node != p->op_this_node)) { + rc = -EINVAL; + goto out_unlock; + } + + if (!p->op_proto.pv_major) { + set_p = 0; + } else if (!list_empty(&ocfs2_live_connection_list) && + ((running_proto.pv_major != p->op_proto.pv_major) || + (running_proto.pv_minor != p->op_proto.pv_minor))) { + rc = -EINVAL; + goto out_unlock; + } + + if (set_p) { + ocfs2_control_this_node = p->op_this_node; + running_proto.pv_major = p->op_proto.pv_major; + running_proto.pv_minor = p->op_proto.pv_minor; + } + +out_unlock: + mutex_unlock(&ocfs2_control_lock); + + if (!rc && set_p) { + /* We set the global values successfully */ + atomic_inc(&ocfs2_control_opened); + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_VALID); + } + + return rc; +} + +static int ocfs2_control_get_this_node(void) +{ + int rc; + + mutex_lock(&ocfs2_control_lock); + if (ocfs2_control_this_node < 0) + rc = -EINVAL; + else + rc = ocfs2_control_this_node; + mutex_unlock(&ocfs2_control_lock); + + return rc; +} + +static int ocfs2_control_do_setnode_msg(struct file *file, + struct ocfs2_control_message_setn *msg) +{ + long nodenum; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space != ' ') || (msg->newline != '\n')) + return -EINVAL; + msg->space = 
msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + p->op_this_node = nodenum; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_setversion_msg(struct file *file, + struct ocfs2_control_message_setv *msg) + { + long major, minor; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + struct ocfs2_protocol_version *max = + &ocfs2_user_plugin.sp_max_proto; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + major = simple_strtol(msg->major, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + minor = simple_strtol(msg->minor, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + /* + * The major must be between 1 and 255, inclusive. The minor + * must be between 0 and 255, inclusive. The version passed in + * must be within the maximum version supported by the filesystem. + */ + if ((major == LONG_MIN) || (major == LONG_MAX) || + (major > (u8)-1) || (major < 1)) + return -ERANGE; + if ((minor == LONG_MIN) || (minor == LONG_MAX) || + (minor > (u8)-1) || (minor < 0)) + return -ERANGE; + if ((major != max->pv_major) || + (minor > max->pv_minor)) + return -EINVAL; + + p->op_proto.pv_major = major; + p->op_proto.pv_minor = minor; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_down_msg(struct file *file, + struct ocfs2_control_message_down *msg) +{ + long nodenum; + char *p = NULL; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &p, 16); + if (!p || *p) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + + ocfs2_control_send_down(msg->uuid, nodenum); + + return 0; +} + +static ssize_t ocfs2_control_message(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + union ocfs2_control_message msg; + + /* Try to catch padding issues */ + WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) != + (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1))); + + memset(&msg, 0, sizeof(union ocfs2_control_message)); + ret = ocfs2_control_cfu(&msg, count, buf, count); + if (ret) + goto out; + + if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn); + else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv); + else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = 
ocfs2_control_do_down_msg(file, &msg.u_down); + else + ret = -EINVAL; + +out: + return ret ? ret : count; +} + +static ssize_t ocfs2_control_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + switch (ocfs2_control_get_handshake_state(file)) { + case OCFS2_CONTROL_HANDSHAKE_INVALID: + ret = -EINVAL; + break; + + case OCFS2_CONTROL_HANDSHAKE_READ: + ret = ocfs2_control_validate_protocol(file, buf, + count); + break; + + case OCFS2_CONTROL_HANDSHAKE_PROTOCOL: + case OCFS2_CONTROL_HANDSHAKE_VALID: + ret = ocfs2_control_message(file, buf, count); + break; + + default: + BUG(); + ret = -EIO; + break; + } + + return ret; +} + +/* + * This is a naive version. If we ever have a new protocol, we'll expand + * it. Probably using seq_file. + */ +static ssize_t ocfs2_control_read(struct file *file, + char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + ret = simple_read_from_buffer(buf, count, ppos, + OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN); + + /* Have we read the whole protocol list? */ + if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN) + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_READ); + + return ret; +} + +static int ocfs2_control_release(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + + mutex_lock(&ocfs2_control_lock); + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + goto out; + + if (atomic_dec_and_test(&ocfs2_control_opened)) { + if (!list_empty(&ocfs2_live_connection_list)) { + /* XXX: Do bad things! */ + printk(KERN_ERR + "ocfs2: Unexpected release of ocfs2_control!\n" + " Loss of cluster connection requires " + "an emergency restart!\n"); + emergency_restart(); + } + /* + * Last valid close clears the node number and resets + * the locking protocol version + */ + ocfs2_control_this_node = -1; + running_proto.pv_major = 0; + running_proto.pv_major = 0; + } + +out: + list_del_init(&p->op_list); + file->private_data = NULL; + + mutex_unlock(&ocfs2_control_lock); + + kfree(p); + + return 0; +} + +static int ocfs2_control_open(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p; + + p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL); + if (!p) + return -ENOMEM; + p->op_this_node = -1; + + mutex_lock(&ocfs2_control_lock); + file->private_data = p; + list_add(&p->op_list, &ocfs2_control_private_list); + mutex_unlock(&ocfs2_control_lock); + + return 0; +} + +static const struct file_operations ocfs2_control_fops = { + .open = ocfs2_control_open, + .release = ocfs2_control_release, + .read = ocfs2_control_read, + .write = ocfs2_control_write, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +static struct miscdevice ocfs2_control_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ocfs2_control", + .fops = &ocfs2_control_fops, +}; + +static int ocfs2_control_init(void) +{ + int rc; + + atomic_set(&ocfs2_control_opened, 0); + + rc = misc_register(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to register ocfs2_control device " + "(errno %d)\n", + -rc); + + return rc; +} + +static void ocfs2_control_exit(void) +{ + int rc; + + rc = misc_deregister(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to deregister ocfs2_control device " + "(errno %d)\n", + -rc); +} + +static void fsdlm_lock_ast_wrapper(void *astarg) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + int status = lksb->lksb_fsdlm.sb_status; + + /* + * For now we're punting 
on the issue of other non-standard errors + * where we can't tell if the unlock_ast or lock_ast should be called. + * The main "other error" that's possible is EINVAL which means the + * function was called with invalid args, which shouldn't be possible + * since the caller here is under our control. Other non-standard + * errors probably fall into the same category, or otherwise are fatal + * which means we can't carry on anyway. + */ + + if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0); + else + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); +} + +static void fsdlm_blocking_ast_wrapper(void *astarg, int level) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); +} + +static int user_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + int ret; + + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + + ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); + return ret; +} + +static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + int ret; + + ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); + return ret; +} + +static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return lksb->lksb_fsdlm.sb_status; +} + +static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; + + return !invalid; +} + +static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + return (void *)(lksb->lksb_fsdlm.sb_lvbptr); +} + +static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ +} + +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. 
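+ * + * For example (illustrative numbers): an existing 1.4 against a requested + * 1.6 is compatible and the request is clamped down to 1.4, while a + * requested 1.2 against an existing 1.4 is rejected.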
+ */ +static int fs_protocol_compare(struct ocfs2_protocol_version *existing, + struct ocfs2_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +static int user_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + dlm_lockspace_t *fsdlm; + struct ocfs2_live_connection *uninitialized_var(control); + int rc = 0; + + BUG_ON(conn == NULL); + + rc = ocfs2_live_connection_new(conn, &control); + if (rc) + goto out; + + /* + * running_proto must have been set before we allowed any mounts + * to proceed. + */ + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { + printk(KERN_ERR + "Unable to mount with fs locking protocol version " + "%u.%u because the userspace control daemon has " + "negotiated %u.%u\n", + conn->cc_version.pv_major, conn->cc_version.pv_minor, + running_proto.pv_major, running_proto.pv_minor); + rc = -EPROTO; + ocfs2_live_connection_drop(control); + goto out; + } + + rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), + &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); + if (rc) { + ocfs2_live_connection_drop(control); + goto out; + } + + conn->cc_private = control; + conn->cc_lockspace = fsdlm; +out: + return rc; +} + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + +static int user_cluster_this_node(unsigned int *this_node) +{ + int rc; + + rc = ocfs2_control_get_this_node(); + if (rc < 0) + return rc; + + *this_node = rc; + return 0; +} + +static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { + .connect = user_cluster_connect, + .disconnect = user_cluster_disconnect, + .this_node = user_cluster_this_node, + .dlm_lock = user_dlm_lock, + .dlm_unlock = user_dlm_unlock, + .lock_status = user_dlm_lock_status, + .lvb_valid = user_dlm_lvb_valid, + .lock_lvb = user_dlm_lvb, + .plock = user_plock, + .dump_lksb = user_dlm_dump_lksb, +}; + +static struct ocfs2_stack_plugin ocfs2_user_plugin = { + .sp_name = "user", + .sp_ops = &ocfs2_user_plugin_ops, + .sp_owner = THIS_MODULE, +}; + + +static int __init ocfs2_user_plugin_init(void) +{ + int rc; + + rc = ocfs2_control_init(); + if (!rc) { + rc = ocfs2_stack_glue_register(&ocfs2_user_plugin); + if (rc) + ocfs2_control_exit(); + } + + return rc; +} + +static void __exit ocfs2_user_plugin_exit(void) +{ + ocfs2_stack_glue_unregister(&ocfs2_user_plugin); + ocfs2_control_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_user_plugin_init); +module_exit(ocfs2_user_plugin_exit); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c new file mode 100644 index 00000000..39abf896 --- /dev/null +++ b/fs/ocfs2/stackglue.c @@ -0,0 +1,726 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.c + * + * Code which implements an OCFS2 specific interface to underlying + * cluster stacks. + * + * Copyright (C) 2007, 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ocfs2_fs.h" + +#include "stackglue.h" + +#define OCFS2_STACK_PLUGIN_O2CB "o2cb" +#define OCFS2_STACK_PLUGIN_USER "user" +#define OCFS2_MAX_HB_CTL_PATH 256 + +static struct ocfs2_protocol_version locking_max_version; +static DEFINE_SPINLOCK(ocfs2_stack_lock); +static LIST_HEAD(ocfs2_stack_list); +static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; + +/* + * The stack currently in use. If not null, active_stack->sp_count > 0, + * the module is pinned, and the locking protocol cannot be changed. + */ +static struct ocfs2_stack_plugin *active_stack; + +static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) +{ + struct ocfs2_stack_plugin *p; + + assert_spin_locked(&ocfs2_stack_lock); + + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + if (!strcmp(p->sp_name, name)) + return p; + } + + return NULL; +} + +static int ocfs2_stack_driver_request(const char *stack_name, + const char *plugin_name) +{ + int rc; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + + /* + * If the stack passed by the filesystem isn't the selected one, + * we can't continue. + */ + if (strcmp(stack_name, cluster_stack_name)) { + rc = -EBUSY; + goto out; + } + + if (active_stack) { + /* + * If the active stack isn't the one we want, it cannot + * be selected right now. + */ + if (!strcmp(active_stack->sp_name, plugin_name)) + rc = 0; + else + rc = -EBUSY; + goto out; + } + + p = ocfs2_stack_lookup(plugin_name); + if (!p || !try_module_get(p->sp_owner)) { + rc = -ENOENT; + goto out; + } + + active_stack = p; + rc = 0; + +out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + + spin_unlock(&ocfs2_stack_lock); + return rc; +} + +/* + * This function looks up the appropriate stack and makes it active. If + * there is no stack, it tries to load it. It will fail if the stack still + * cannot be found. It will also fail if a different stack is in use. + */ +static int ocfs2_stack_driver_get(const char *stack_name) +{ + int rc; + char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; + + /* + * Classic stack does not pass in a stack name. This is + * compatible with older tools as well. 
+ */ + if (!stack_name || !*stack_name) + stack_name = OCFS2_STACK_PLUGIN_O2CB; + + if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { + printk(KERN_ERR + "ocfs2 passed an invalid cluster stack label: \"%s\"\n", + stack_name); + return -EINVAL; + } + + /* Anything that isn't the classic stack is a user stack */ + if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) + plugin_name = OCFS2_STACK_PLUGIN_USER; + + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + if (rc == -ENOENT) { + request_module("ocfs2_stack_%s", plugin_name); + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + } + + if (rc == -ENOENT) { + printk(KERN_ERR + "ocfs2: Cluster stack driver \"%s\" cannot be found\n", + plugin_name); + } else if (rc == -EBUSY) { + printk(KERN_ERR + "ocfs2: A different cluster stack is in use\n"); + } + + return rc; +} + +static void ocfs2_stack_driver_put(void) +{ + spin_lock(&ocfs2_stack_lock); + BUG_ON(active_stack == NULL); + BUG_ON(active_stack->sp_count == 0); + + active_stack->sp_count--; + if (!active_stack->sp_count) { + module_put(active_stack->sp_owner); + active_stack = NULL; + } + spin_unlock(&ocfs2_stack_lock); +} + +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) +{ + int rc; + + spin_lock(&ocfs2_stack_lock); + if (!ocfs2_stack_lookup(plugin->sp_name)) { + plugin->sp_count = 0; + plugin->sp_max_proto = locking_max_version; + list_add(&plugin->sp_list, &ocfs2_stack_list); + printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", + plugin->sp_name); + rc = 0; + } else { + printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", + plugin->sp_name); + rc = -EEXIST; + } + spin_unlock(&ocfs2_stack_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); + +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + p = ocfs2_stack_lookup(plugin->sp_name); + if (p) { + BUG_ON(p != plugin); + BUG_ON(plugin == active_stack); + BUG_ON(plugin->sp_count != 0); + list_del_init(&plugin->sp_list); + printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", + plugin->sp_name); + } else { + printk(KERN_ERR "Stack \"%s\" is not registered\n", + plugin->sp_name); + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); + +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + if (memcmp(max_proto, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + BUG_ON(locking_max_version.pv_major != 0); + + locking_max_version = *max_proto; + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + p->sp_max_proto = locking_max_version; + } + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); + + +/* + * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument + * for the ast and bast functions. They will pass the lksb to the ast + * and bast. The caller can wrap the lksb with their own structure to + * get more information. 
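+ * + * A sketch, with illustrative names only: the caller embeds the lksb in + * its own lock structure and recovers it in the callbacks, e.g. + * + *	struct my_lock_res { + *		struct ocfs2_dlm_lksb	ml_lksb; + *		... + *	}; + * + *	static void my_lock_ast(struct ocfs2_dlm_lksb *lksb) + *	{ + *		struct my_lock_res *res = + *			container_of(lksb, struct my_lock_res, ml_lksb); + *		... + *	}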
+ */ +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + if (!lksb->lksb_conn) + lksb->lksb_conn = conn; + else + BUG_ON(lksb->lksb_conn != conn); + return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, + name, namelen); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); + +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + BUG_ON(lksb->lksb_conn == NULL); + + return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); + +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_status(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); + +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lvb_valid(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); + +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_lvb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); + +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ + active_stack->sp_ops->dump_lksb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); + +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + +int ocfs2_cluster_connect(const char *stack_name, + const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn) +{ + int rc = 0; + struct ocfs2_cluster_connection *new_conn; + + BUG_ON(group == NULL); + BUG_ON(conn == NULL); + BUG_ON(recovery_handler == NULL); + + if (grouplen > GROUP_NAME_MAX) { + rc = -EINVAL; + goto out; + } + + if (memcmp(&lproto->lp_max_version, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + rc = -EINVAL; + goto out; + } + + new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), + GFP_KERNEL); + if (!new_conn) { + rc = -ENOMEM; + goto out; + } + + memcpy(new_conn->cc_name, group, grouplen); + new_conn->cc_namelen = grouplen; + new_conn->cc_recovery_handler = recovery_handler; + new_conn->cc_recovery_data = recovery_data; + + new_conn->cc_proto = lproto; + /* Start the new connection at our maximum compatibility level */ + new_conn->cc_version = lproto->lp_max_version; + + /* This will pin the stack driver if successful */ + rc = ocfs2_stack_driver_get(stack_name); + if (rc) + goto out_free; + + rc = active_stack->sp_ops->connect(new_conn); + if (rc) { + ocfs2_stack_driver_put(); + goto out_free; + } + + *conn = new_conn; + +out_free: + if (rc) + kfree(new_conn); + +out: + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); + +/* The caller will ensure all nodes have the same cluster stack */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection 
**conn) +{ + char *stack_name = NULL; + + if (cluster_stack_name[0]) + stack_name = cluster_stack_name; + return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, + recovery_handler, recovery_data, conn); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); + +/* If hangup_pending is 0, the stack driver will be dropped */ +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending) +{ + int ret; + + BUG_ON(conn == NULL); + + ret = active_stack->sp_ops->disconnect(conn); + + /* XXX Should we free it anyway? */ + if (!ret) { + kfree(conn); + if (!hangup_pending) + ocfs2_stack_driver_put(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); + +/* + * Leave the group for this filesystem. This is executed by a userspace + * program (stored in ocfs2_hb_ctl_path). + */ +static void ocfs2_leave_group(const char *group) +{ + int ret; + char *argv[5], *envp[3]; + + argv[0] = ocfs2_hb_ctl_path; + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = (char *)group; + argv[4] = NULL; + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (ret < 0) { + printk(KERN_ERR + "ocfs2: Error %d running user helper " + "\"%s %s %s %s\"\n", + ret, argv[0], argv[1], argv[2], argv[3]); + } +} + +/* + * Hangup is a required post-umount. ocfs2-tools software expects the + * filesystem to call "ocfs2_hb_ctl" during unmount. This happens + * regardless of whether the DLM got started, so we can't do it + * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does + * the actual work. + */ +void ocfs2_cluster_hangup(const char *group, int grouplen) +{ + BUG_ON(group == NULL); + BUG_ON(group[grouplen] != '\0'); + + ocfs2_leave_group(group); + + /* cluster_disconnect() was called with hangup_pending==1 */ + ocfs2_stack_driver_put(); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); + +int ocfs2_cluster_this_node(unsigned int *node) +{ + return active_stack->sp_ops->this_node(node); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); + + +/* + * Sysfs bits + */ + +static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (locking_max_version.pv_major) + ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", + locking_max_version.pv_major, + locking_max_version.pv_minor); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_max_locking_protocol = + __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + ocfs2_max_locking_protocol_show, NULL); + +static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + ret = snprintf(buf, remain, "%s\n", + p->sp_name); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ocfs2_stack_lock); + + return total; +} + +static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = + __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + ocfs2_loaded_cluster_plugins_show, NULL); + +static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, + struct kobj_attribute *attr, + 
char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + ret = snprintf(buf, PAGE_SIZE, "%s\n", + active_stack->sp_name); + if (ret == PAGE_SIZE) + ret = -E2BIG; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_active_cluster_plugin = + __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + ocfs2_active_cluster_plugin_show, NULL); + +static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret; + spin_lock(&ocfs2_stack_lock); + ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + size_t len = count; + ssize_t ret; + + if (len == 0) + return len; + + if (buf[len - 1] == '\n') + len--; + + if ((len != OCFS2_STACK_LABEL_LEN) || + (strnlen(buf, len) != len)) + return -EINVAL; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + if (!strncmp(buf, cluster_stack_name, len)) + ret = count; + else + ret = -EBUSY; + } else { + memcpy(cluster_stack_name, buf, len); + ret = count; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + + +static struct kobj_attribute ocfs2_attr_cluster_stack = + __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + ocfs2_cluster_stack_show, + ocfs2_cluster_stack_store); + +static struct attribute *ocfs2_attrs[] = { + &ocfs2_attr_max_locking_protocol.attr, + &ocfs2_attr_loaded_cluster_plugins.attr, + &ocfs2_attr_active_cluster_plugin.attr, + &ocfs2_attr_cluster_stack.attr, + NULL, +}; + +static struct attribute_group ocfs2_attr_group = { + .attrs = ocfs2_attrs, +}; + +static struct kset *ocfs2_kset; + +static void ocfs2_sysfs_exit(void) +{ + kset_unregister(ocfs2_kset); +} + +static int ocfs2_sysfs_init(void) +{ + int ret; + + ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); + if (!ocfs2_kset) + return -ENOMEM; + + ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); + if (ret) + goto error; + + return 0; + +error: + kset_unregister(ocfs2_kset); + return ret; +} + +/* + * Sysctl bits + * + * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't + * make as much sense in a multiple cluster stack world, but it's safer + * and easier to preserve the name. + */ + +#define FS_OCFS2_NM 1 + +static ctl_table ocfs2_nm_table[] = { + { + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { } +}; + +static ctl_table ocfs2_mod_table[] = { + { + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { } +}; + +static ctl_table ocfs2_kern_table[] = { + { + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { } +}; + +static ctl_table ocfs2_root_table[] = { + { + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { } +}; + +static struct ctl_table_header *ocfs2_table_header = NULL; + + +/* + * Initialization + */ + +static int __init ocfs2_stack_glue_init(void) +{ + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); + + ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + if (!ocfs2_table_header) { + printk(KERN_ERR + "ocfs2 stack glue: unable to register sysctl\n"); + return -ENOMEM; /* or something. 
*/ + } + + return ocfs2_sysfs_init(); +} + +static void __exit ocfs2_stack_glue_exit(void) +{ + memset(&locking_max_version, 0, + sizeof(struct ocfs2_protocol_version)); + locking_max_version.pv_major = 0; + locking_max_version.pv_minor = 0; + ocfs2_sysfs_exit(); + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_stack_glue_init); +module_exit(ocfs2_stack_glue_exit); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h new file mode 100644 index 00000000..1ec56fdb --- /dev/null +++ b/fs/ocfs2/stackglue.h @@ -0,0 +1,292 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.h + * + * Glue to the underlying cluster stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef STACKGLUE_H +#define STACKGLUE_H + +#include +#include +#include + +#include "dlm/dlmapi.h" +#include + +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + +/* + * dlmconstants.h does not have a LOCAL flag. We hope to remove it + * some day, but right now we need it. Let's fake it. This value is larger + * than any flag in dlmconstants.h. + */ +#define DLM_LKF_LOCAL 0x00100000 + +/* + * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably + * wants to be in a public header. + */ +#define GROUP_NAME_MAX 64 + + +/* + * ocfs2_protocol_version changes when ocfs2 does something different in + * its inter-node behavior. See dlmglue.c for more information. + */ +struct ocfs2_protocol_version { + u8 pv_major; + u8 pv_minor; +}; + +/* + * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only + * has a pointer to separately allocated lvb space. This struct exists only to + * include in the lksb union to make space for a combined dlm_lksb and lvb. + */ +struct fsdlm_lksb_plus_lvb { + struct dlm_lksb lksb; + char lvb[DLM_LVB_LEN]; +}; + +/* + * A union of all lock status structures. We define it here so that the + * size of the union is known. Lock status structures are embedded in + * ocfs2 inodes. + */ +struct ocfs2_cluster_connection; +struct ocfs2_dlm_lksb { + union { + struct dlm_lockstatus lksb_o2dlm; + struct dlm_lksb lksb_fsdlm; + struct fsdlm_lksb_plus_lvb padding; + }; + struct ocfs2_cluster_connection *lksb_conn; +}; + +/* + * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. + */ +struct ocfs2_locking_protocol { + struct ocfs2_protocol_version lp_max_version; + void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb); + void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level); + void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error); +}; + + +/* + * A cluster connection. Mostly opaque to ocfs2, the connection holds + * state for the underlying stack. ocfs2 does use cc_version to determine + * locking compatibility.
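+ *
+ * cc_version starts out as the filesystem's lp_max_version (see
+ * ocfs2_cluster_connect(), which copies it from the locking protocol);
+ * a stack's ->connect() may negotiate it down if the rest of the
+ * cluster only speaks an older protocol.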
+ */ +struct ocfs2_cluster_connection { + char cc_name[GROUP_NAME_MAX]; + int cc_namelen; + struct ocfs2_protocol_version cc_version; + struct ocfs2_locking_protocol *cc_proto; + void (*cc_recovery_handler)(int node_num, void *recovery_data); + void *cc_recovery_data; + void *cc_lockspace; + void *cc_private; +}; + +/* + * Each cluster stack implements the stack operations structure. Not used + * in the ocfs2 code, the stackglue code translates generic cluster calls + * into stack operations. + */ +struct ocfs2_stack_operations { + /* + * The fs code calls ocfs2_cluster_connect() to attach a new + * filesystem to the cluster stack. The ->connect() op is passed + * an ocfs2_cluster_connection with the name and recovery field + * filled in. + * + * The stack must set up any notification mechanisms and create + * the filesystem lockspace in the DLM. The lockspace should be + * stored on cc_lockspace. Any other information can be stored on + * cc_private. + * + * ->connect() must not return until it is guaranteed that + * + * - Node down notifications for the filesystem will be received + * and passed to conn->cc_recovery_handler(). + * - Locking requests for the filesystem will be processed. + */ + int (*connect)(struct ocfs2_cluster_connection *conn); + + /* + * The fs code calls ocfs2_cluster_disconnect() when a filesystem + * no longer needs cluster services. All DLM locks have been + * dropped, and recovery notification is being ignored by the + * fs code. The stack must disengage from the DLM and discontinue + * recovery notification. + * + * Once ->disconnect() has returned, the connection structure will + * be freed. Thus, a stack must not return from ->disconnect() + * until it will no longer reference the conn pointer. + * + * Once this call returns, the stack glue will be dropping this + * connection's reference on the module. + */ + int (*disconnect)(struct ocfs2_cluster_connection *conn); + + /* + * ->this_node() returns the cluster's unique identifier for the + * local node. + */ + int (*this_node)(unsigned int *node); + + /* + * Call the underlying dlm lock function. The ->dlm_lock() + * callback should convert the flags and mode as appropriate. + * + * ast and bast functions are not part of the call because the + * stack will likely want to wrap ast and bast calls before passing + * them to stack->sp_proto. There is no astarg. The lksb will + * be passed back to the ast and bast functions. The caller can + * use this to find their object. + */ + int (*dlm_lock)(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen); + + /* + * Call the underlying dlm unlock function. The ->dlm_unlock() + * function should convert the flags as appropriate. + * + * The unlock ast is not passed, as the stack will want to wrap + * it before calling stack->sp_proto->lp_unlock_ast(). There is + * no astarg. The lksb will be passed back to the unlock ast + * function. The caller can use this to find their object. + */ + int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags); + + /* + * Return the status of the current lock status block. The fs + * code should never dereference the union. The ->lock_status() + * callback pulls out the stack-specific lksb, converts the status + * to a proper errno, and returns it. + */ + int (*lock_status)(struct ocfs2_dlm_lksb *lksb); + + /* + * Return non-zero if the LVB is valid. 
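+ *
+ * (For example, a stack backed by fs/dlm could implement this by
+ * returning zero when DLM_SBF_VALNOTVALID is set in the lksb's
+ * sb_flags.)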
+ */ + int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb); + + /* + * Pull the lvb pointer off of the stack-specific lksb. + */ + void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb); + + /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* + * This is an optoinal debugging hook. If provided, the + * stack can dump debugging information about this lock. + */ + void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); +}; + +/* + * Each stack plugin must describe itself by registering a + * ocfs2_stack_plugin structure. This is only seen by stackglue and the + * stack driver. + */ +struct ocfs2_stack_plugin { + char *sp_name; + struct ocfs2_stack_operations *sp_ops; + struct module *sp_owner; + + /* These are managed by the stackglue code. */ + struct list_head sp_list; + unsigned int sp_count; + struct ocfs2_protocol_version sp_max_proto; +}; + + +/* Used by the filesystem */ +int ocfs2_cluster_connect(const char *stack_name, + const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +/* + * Used by callers that don't store their stack name. They must ensure + * all nodes have the same stack. + */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending); +void ocfs2_cluster_hangup(const char *group, int grouplen); +int ocfs2_cluster_this_node(unsigned int *node); + +struct ocfs2_lock_res; +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen); +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags); + +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb); +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb); +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb); +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb); + +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto); + + +/* Used by stack plugins */ +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); + +#endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 00000000..f169da46 --- /dev/null +++ b/fs/ocfs2/suballoc.c @@ -0,0 +1,2908 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.c + * + * metadata alloc and free + * Inspired by ext3 block groups. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define NOT_ALLOC_NEW_GROUP 0 +#define ALLOC_NEW_GROUP 0x1 +#define ALLOC_GROUPS_FROM_GLOBAL 0x2 + +#define OCFS2_MAX_TO_STEAL 1024 + +struct ocfs2_suballoc_result { + u64 sr_bg_blkno; /* The bg we allocated from. Set + to 0 when a block group is + contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ + u64 sr_blkno; /* The first allocated block */ + unsigned int sr_bit_offset; /* The bit in the bg */ + unsigned int sr_bits; /* How many bits we claimed */ +}; + +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); +static int ocfs2_block_group_fill(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + unsigned int group_clusters, + u16 my_chain, + struct ocfs2_chain_list *cl); +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags); + +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res); +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr); +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); +static int ocfs2_relink_block_group(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain); +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted); +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off); +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off); +static 
int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac); + +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +{ + struct inode *inode = ac->ac_inode; + + if (inode) { + if (ac->ac_which != OCFS2_AC_USE_LOCAL) + ocfs2_inode_unlock(inode, 1); + + mutex_unlock(&inode->i_mutex); + + iput(inode); + ac->ac_inode = NULL; + } + brelse(ac->ac_bh); + ac->ac_bh = NULL; + ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } +} + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + ocfs2_free_ac_resource(ac); + kfree(ac); +} + +static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) +{ + return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); +} + +#define do_error(fmt, ...) \ + do{ \ + if (resize) \ + mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ + else \ + ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + } while (0) + +static int ocfs2_validate_gd_self(struct super_block *sb, + struct buffer_head *bh, + int resize) +{ + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + do_error("Group descriptor #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + gd->bg_signature); + return -EINVAL; + } + + if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { + do_error("Group descriptor #%llu has an invalid bg_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { + do_error("Group descriptor #%llu has an invalid " + "fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(gd->bg_generation)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + do_error("Group descriptor #%llu has bit count %u but " + "claims that %u are free", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + do_error("Group descriptor #%llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_validate_gd_parent(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh, + int resize) +{ + unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + if (di->i_blkno != gd->bg_parent_dinode) { + do_error("Group descriptor #%llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EINVAL; + } + + max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); + if (le16_to_cpu(gd->bg_bits) > max_bits) { + do_error("Group descriptor #%llu has bit count of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits)); + return -EINVAL; + } + + /* In resize, we may meet the case bg_chain == cl_next_free_rec. 
*/ + if ((le16_to_cpu(gd->bg_chain) > + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || + ((le16_to_cpu(gd->bg_chain) == + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { + do_error("Group descriptor #%llu has bad chain %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_chain)); + return -EINVAL; + } + + return 0; +} + +#undef do_error + +/* + * This version only prints errors. It does not fail the filesystem, and + * exists only for resize. + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) { + mlog(ML_ERROR, + "Checksum failed for group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + } else + rc = ocfs2_validate_gd_self(sb, bh, 1); + if (!rc) + rc = ocfs2_validate_gd_parent(sb, di, bh, 1); + + return rc; +} + +static int ocfs2_validate_group_descriptor(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + trace_ocfs2_validate_group_descriptor( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) + return rc; + + /* + * Errors after here are fatal. + */ + + return ocfs2_validate_gd_self(sb, bh, 0); +} + +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, + ocfs2_validate_group_descriptor); + if (rc) + goto out; + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) { + brelse(tmp); + goto out; + } + + /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ + if (!*bh) + *bh = tmp; + +out: + return rc; +} + +static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, + struct ocfs2_group_desc *bg, + struct ocfs2_chain_list *cl, + u64 p_blkno, unsigned int clusters) +{ + struct ocfs2_extent_list *el = &bg->bg_list; + struct ocfs2_extent_rec *rec; + + BUG_ON(!ocfs2_supports_discontig_bg(osb)); + if (!el->l_next_free_rec) + el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; + rec->e_blkno = cpu_to_le64(p_blkno); + rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / + le16_to_cpu(cl->cl_bpc)); + rec->e_leaf_clusters = cpu_to_le16(clusters); + le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&bg->bg_free_bits_count, + clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&el->l_next_free_rec, 1); +} + +static int ocfs2_block_group_fill(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + unsigned int group_clusters, + u16 my_chain, + struct ocfs2_chain_list *cl) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct super_block * sb = alloc_inode->i_sb; + + if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { + ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " + "b_blocknr (%llu)", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bg, 0, sb->s_blocksize); + strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); + bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, + osb->s_feature_incompat)); + bg->bg_chain = cpu_to_le16(my_chain); + bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; + bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); + bg->bg_blkno = cpu_to_le64(group_blkno); + if (group_clusters == le16_to_cpu(cl->cl_cpg)) + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + else + ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, + group_clusters); + + /* set the 1st bit in the bitmap to account for the descriptor block */ + ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); + bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); + + ocfs2_journal_dirty(handle, bg_bh); + + /* There is no need to zero out or otherwise initialize the + * other blocks in a group - All valid FS metadata in a block + * group stores the superblock fs_generation value at + * allocation time. 
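+ * Readers validate that generation, so stale contents from a previous
+ * use of these blocks are never mistaken for live metadata.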
*/ + +bail: + if (status) + mlog_errno(status); + return status; +} + +static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_count)) { + if (le32_to_cpu(cl->cl_recs[best].c_total) > + le32_to_cpu(cl->cl_recs[curr].c_total)) + best = curr; + curr++; + } + return best; +} + +static struct buffer_head * +ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + struct buffer_head *bg_bh; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + + status = ocfs2_claim_clusters(handle, ac, + le16_to_cpu(cl->cl_cpg), &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_contig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + brelse(bg_bh); + mlog_errno(status); + } + +bail: + return status ? ERR_PTR(status) : bg_bh; +} + +static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + unsigned int min_bits, + u32 *bit_off, u32 *num_bits) +{ + int status = 0; + + while (min_bits) { + status = ocfs2_claim_clusters(handle, ac, min_bits, + bit_off, num_bits); + if (status != -ENOSPC) + break; + + min_bits >>= 1; + } + + return status; +} + +static int ocfs2_block_group_grow_discontig(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl, + unsigned int min_bits) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = + (struct ocfs2_group_desc *)bg_bh->b_data; + unsigned int needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + u32 p_cpos, clusters; + u64 p_blkno; + struct ocfs2_extent_list *el = &bg->bg_list; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count))) { + if (min_bits > needed) + min_bits = needed; + status = ocfs2_block_group_claim_bits(osb, handle, ac, + min_bits, &p_cpos, + &clusters); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); + ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, + clusters); + + min_bits = clusters; + needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + } + + if (needed > 0) { + /* + * We have used up all the extent rec but can't fill up + * the cpg. So bail out. 
+ */ + status = -ENOSPC; + goto bail; + } + + ocfs2_journal_dirty(handle, bg_bh); + +bail: + return status; +} + +static void ocfs2_bg_alloc_cleanup(handle_t *handle, + struct ocfs2_alloc_context *cluster_ac, + struct inode *alloc_inode, + struct buffer_head *bg_bh) +{ + int i, ret; + struct ocfs2_group_desc *bg; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + if (!bg_bh) + return; + + bg = (struct ocfs2_group_desc *)bg_bh->b_data; + el = &bg->bg_list; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, + cluster_ac->ac_bh, + le64_to_cpu(rec->e_blkno), + le16_to_cpu(rec->e_leaf_clusters)); + if (ret) + mlog_errno(ret); + /* Try all the clusters to free */ + } + + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); + brelse(bg_bh); +} + +static struct buffer_head * +ocfs2_block_group_alloc_discontig(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; + struct buffer_head *bg_bh = NULL; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + + if (!ocfs2_supports_discontig_bg(osb)) { + status = -ENOSPC; + goto bail; + } + + status = ocfs2_extend_trans(handle, + ocfs2_calc_bg_discontig_credits(osb->sb)); + if (status) { + mlog_errno(status); + goto bail; + } + + /* + * We're going to be grabbing from multiple cluster groups. + * We don't have enough credits to relink them all, and the + * cluster groups will be staying in cache for the duration of + * this operation. + */ + ac->ac_allow_chain_relink = 0; + + /* Claim the first region */ + status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + min_bits = num_bits; + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_discontig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_grow_discontig(handle, alloc_inode, + bg_bh, ac, cl, min_bits); + if (status) + mlog_errno(status); + +bail: + if (status) + ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); + return status ? ERR_PTR(status) : bg_bh; +} + +/* + * We expect the block group allocator to already be locked. 
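+ * (That is, the caller already holds the allocator inode's i_mutex and
+ * cluster lock, as ocfs2_reserve_suballoc_bits() does before calling
+ * down here.)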
+ */ +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags) +{ + int status, credits; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + struct ocfs2_chain_list *cl; + struct ocfs2_alloc_context *ac = NULL; + handle_t *handle = NULL; + u16 alloc_rec; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + + BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); + + cl = &fe->id2.i_chain; + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, flags, &ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg)); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (last_alloc_group && *last_alloc_group != 0) { + trace_ocfs2_block_group_alloc( + (unsigned long long)*last_alloc_group); + ac->ac_last_group = *last_alloc_group; + } + + bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, + ac, cl); + if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + bg_bh = ocfs2_block_group_alloc_discontig(handle, + alloc_inode, + ac, cl); + if (IS_ERR(bg_bh)) { + status = PTR_ERR(bg_bh); + bg_bh = NULL; + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc_rec = le16_to_cpu(bg->bg_chain); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, + le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; + if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) + le16_add_cpu(&cl->cl_next_free_rec, 1); + + le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); + le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); + + ocfs2_journal_dirty(handle, bh); + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + + status = 0; + + /* save the new last alloc group so that the caller can cache it. 
*/ + if (last_alloc_group) + *last_alloc_group = ac->ac_last_group; + +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + + if (ac) + ocfs2_free_alloc_context(ac); + + brelse(bg_bh); + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type, + u32 slot, + u64 *last_alloc_group, + int flags) +{ + int status; + u32 bits_wanted = ac->ac_bits_wanted; + struct inode *alloc_inode; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + u32 free_bits; + + alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); + if (!alloc_inode) { + mlog_errno(-EINVAL); + return -EINVAL; + } + + mutex_lock(&alloc_inode->i_mutex); + + status = ocfs2_inode_lock(alloc_inode, &bh, 1); + if (status < 0) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + + mlog_errno(status); + return status; + } + + ac->ac_inode = alloc_inode; + ac->ac_alloc_slot = slot; + + fe = (struct ocfs2_dinode *) bh->b_data; + + /* The bh was validated by the inode read inside + * ocfs2_inode_lock(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { + ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", + (unsigned long long)le64_to_cpu(fe->i_blkno)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - + le32_to_cpu(fe->id1.bitmap1.i_used); + + if (bits_wanted > free_bits) { + /* cluster bitmap never grows */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, + free_bits); + status = -ENOSPC; + goto bail; + } + + if (!(flags & ALLOC_NEW_GROUP)) { + trace_ocfs2_reserve_suballoc_bits_no_new_group( + slot, bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block, + last_alloc_group, flags); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > + (le32_to_cpu(fe->id1.bitmap1.i_total) + - le32_to_cpu(fe->id1.bitmap1.i_used))); + } + + get_bh(bh); + ac->ac_bh = bh; +bail: + brelse(bh); + + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_inodes_stolen, 0); +} + +static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_meta_stolen, 0); +} + +void ocfs2_init_steal_slots(struct ocfs2_super *osb) +{ + ocfs2_init_inode_steal_slot(osb); + ocfs2_init_meta_steal_slot(osb); +} + +static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) +{ + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + osb->s_inode_steal_slot = slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + osb->s_meta_steal_slot = slot; + spin_unlock(&osb->osb_lock); +} + +static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) +{ + int slot = OCFS2_INVALID_SLOT; + + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + slot = osb->s_inode_steal_slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + slot = osb->s_meta_steal_slot; + 
spin_unlock(&osb->osb_lock); + + return slot; +} + +static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_resource(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type) +{ + int i, status = -ENOSPC; + int slot = __ocfs2_get_steal_slot(osb, type); + + /* Start to steal resource from the first slot after ours. */ + if (slot == OCFS2_INVALID_SLOT) + slot = osb->slot_num + 1; + + for (i = 0; i < osb->max_slots; i++, slot++) { + if (slot == osb->max_slots) + slot = 0; + + if (slot == osb->slot_num) + continue; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + type, + (u32)slot, NULL, + NOT_ALLOC_NEW_GROUP); + if (status >= 0) { + __ocfs2_set_steal_slot(osb, slot, type); + break; + } + + ocfs2_free_ac_resource(ac); + } + + return status; +} + +static int ocfs2_steal_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_meta(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); +} + +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) +{ + int status; + int slot = ocfs2_get_meta_steal_slot(osb); + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = blocks; + (*ac)->ac_which = OCFS2_AC_USE_META; + (*ac)->ac_group_search = ocfs2_block_group_search; + + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) + goto extent_steal; + + atomic_set(&osb->s_num_meta_stolen, 0); + status = ocfs2_reserve_suballoc_bits(osb, (*ac), + EXTENT_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, NULL, + ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); + + + if (status >= 0) { + status = 0; + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_meta_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +extent_steal: + status = ocfs2_steal_meta(osb, *ac); + atomic_inc(&osb->s_num_meta_stolen); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac) +{ + int status; + int slot = ocfs2_get_inode_steal_slot(osb); + u64 alloc_group; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = 1; + (*ac)->ac_which = OCFS2_AC_USE_INODE; + + (*ac)->ac_group_search = ocfs2_block_group_search; + + /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. 
+ */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* + * slot is set when we successfully steal inode from other nodes. + * It is reset in 3 places: + * 1. when we flush the truncate log + * 2. when we complete local alloc recovery. + * 3. when we successfully allocate from our own slot. + * After it is set, we will go on stealing inodes until we find the + * need to check our slots to see whether there is some space for us. + */ + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) + goto inode_steal; + + atomic_set(&osb->s_num_inodes_stolen, 0); + alloc_group = osb->osb_inode_alloc_group; + status = ocfs2_reserve_suballoc_bits(osb, *ac, + INODE_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, + &alloc_group, + ALLOC_NEW_GROUP | + ALLOC_GROUPS_FROM_GLOBAL); + if (status >= 0) { + status = 0; + + spin_lock(&osb->osb_lock); + osb->osb_inode_alloc_group = alloc_group; + spin_unlock(&osb->osb_lock); + trace_ocfs2_reserve_new_inode_new_group( + (unsigned long long)alloc_group); + + /* + * Some inodes must be freed by us, so try to allocate + * from our own next time. + */ + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_inode_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +inode_steal: + status = ocfs2_steal_inode(osb, *ac); + atomic_inc(&osb->s_num_inodes_stolen); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +/* local alloc code has to do the same thing, so rather than do this + * twice.. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + + ac->ac_which = OCFS2_AC_USE_MAIN; + ac->ac_group_search = ocfs2_cluster_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, NULL, + ALLOC_NEW_GROUP); + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +/* Callers don't need to care which bitmap (local alloc or main) to + * use so we figure it out for them, but unfortunately this clutters + * things a bit. 
*/ +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac) +{ + int status; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; + + status = -ENOSPC; + if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && + ocfs2_alloc_should_use_local(osb, bits_wanted)) { + status = ocfs2_reserve_local_alloc_bits(osb, + bits_wanted, + *ac); + if ((status < 0) && (status != -ENOSPC)) { + mlog_errno(status); + goto bail; + } + } + + if (status == -ENOSPC) { + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, + ALLOC_NEW_GROUP, ac); +} + +/* + * More or less lifted from ext3. I'll leave their description below: + * + * "For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes." + * + * Note: OCFS2 already does this differently for metadata vs data + * allocations, as those bitmaps are separate and undo access is never + * called on a metadata group descriptor. + */ +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr) +{ + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + int ret; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) + return 0; + + if (!buffer_jbd(bg_bh)) + return 1; + + jbd_lock_bh_state(bg_bh); + bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + jbd_unlock_bh_state(bg_bh); + + return ret; +} + +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + unsigned int total_bits, + struct ocfs2_suballoc_result *res) +{ + void *bitmap; + u16 best_offset, best_size; + int offset, start, found, status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + /* Callers got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + + found = start = best_offset = best_size = 0; + bitmap = bg->bg_bitmap; + + while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { + if (offset == total_bits) + break; + + if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { + /* We found a zero, but we can't use it as it + * hasn't been put to disk yet! */ + found = 0; + start = offset + 1; + } else if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_size) { + best_size = found; + best_offset = start - found; + } + /* we got everything we needed */ + if (found == bits_wanted) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + if (best_size) { + res->sr_bit_offset = best_offset; + res->sr_bits = best_size; + } else { + status = -ENOSPC; + /* No error log here -- see the comment above + * ocfs2_test_bg_bit_allocatable */ + } + + return status; +} + +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + trace_ocfs2_block_group_set_bits(bit_off, num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + while(num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + ocfs2_journal_dirty(handle, group_bh); + +bail: + if (status) + mlog_errno(status); + return status; +} + +/* find the one with the most empty bits */ +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + BUG_ON(!cl->cl_next_free_rec); + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_next_free_rec)) { + if (le32_to_cpu(cl->cl_recs[curr].c_free) > + le32_to_cpu(cl->cl_recs[best].c_free)) + best = curr; + curr++; + } + + BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); + return best; +} + +static int ocfs2_relink_block_group(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain) +{ + int status; + /* there is a really tiny chance the journal calls could fail, + * but we wouldn't want inconsistent blocks in *any* case. */ + u64 fe_ptr, bg_ptr, prev_bg_ptr; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; + + /* The caller got these descriptors from + * ocfs2_read_group_descriptor(). 
Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); + + trace_ocfs2_relink_block_group( + (unsigned long long)le64_to_cpu(fe->i_blkno), chain, + (unsigned long long)le64_to_cpu(bg->bg_blkno), + (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); + + fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); + bg_ptr = le64_to_cpu(bg->bg_next_group); + prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); + + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + prev_bg->bg_next_group = bg->bg_next_group; + ocfs2_journal_dirty(handle, prev_bg_bh); + + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + ocfs2_journal_dirty(handle, bg_bh); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + ocfs2_journal_dirty(handle, fe_bh); + +out_rollback: + if (status < 0) { + fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); + bg->bg_next_group = cpu_to_le64(bg_ptr); + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + } + + if (status) + mlog_errno(status); + return status; +} + +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted) +{ + return le16_to_cpu(bg->bg_free_bits_count) > wanted; +} + +/* return 0 on success, -ENOSPC to keep searching and any other < 0 + * value on error. */ +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res) +{ + int search = -ENOSPC; + int ret; + u64 blkoff; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int max_bits, gd_cluster_off; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (gd->bg_free_bits_count) { + max_bits = le16_to_cpu(gd->bg_bits); + + /* Tail groups in cluster bitmaps which aren't cpg + * aligned are prone to partial extension by a failed + * fs resize. If the file system resize never got to + * update the dinode cluster count, then we don't want + * to trust any clusters past it, regardless of what + * the group descriptor says. 
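+ * max_bits is clamped to the dinode's ip_clusters below for exactly
+ * that reason.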
*/ + gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + if ((gd_cluster_off + max_bits) > + OCFS2_I(inode)->ip_clusters) { + max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; + trace_ocfs2_cluster_group_search_wrong_max_bits( + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); + } + + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + max_bits, res); + if (ret) + return ret; + + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + res->sr_bit_offset + + res->sr_bits); + trace_ocfs2_cluster_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + + /* ocfs2_block_group_find_clear_bits() might + * return success, but we still want to return + * -ENOSPC unless it found the minimum number + * of bits. */ + if (min_bits <= res->sr_bits) + search = 0; /* success */ + else if (res->sr_bits) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); + } + } + + return search; +} + +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res) +{ + int ret = -ENOSPC; + u64 blkoff; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON(min_bits != 1); + BUG_ON(ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) { + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + le16_to_cpu(bg->bg_bits), + res); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + + res->sr_bit_offset + res->sr_bits; + trace_ocfs2_block_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } + + return ret; +} + +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + ocfs2_journal_dirty(handle, di_bh); + +out: + return ret; +} + +static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, + struct ocfs2_extent_rec *rec, + struct ocfs2_chain_list *cl) +{ + unsigned int bpc = le16_to_cpu(cl->cl_bpc); + unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; + unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; + + if (res->sr_bit_offset < bitoff) + return 0; + if (res->sr_bit_offset >= (bitoff + bitcount)) + return 0; + res->sr_blkno = le64_to_cpu(rec->e_blkno) + + (res->sr_bit_offset - bitoff); + if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) + res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; + return 1; +} + +static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, + struct ocfs2_group_desc *bg, + struct ocfs2_suballoc_result *res) +{ + int i; 
+ u64 bg_blkno = res->sr_bg_blkno; /* Save off */ + struct ocfs2_extent_rec *rec; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { + res->sr_blkno = 0; + return; + } + + res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; + res->sr_bg_blkno = 0; /* Clear it for contig block groups */ + if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || + !bg->bg_list.l_next_free_rec) + return; + + for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { + rec = &bg->bg_list.l_recs[i]; + if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { + res->sr_bg_blkno = bg_blkno; /* Restore */ + break; + } + } +} + +static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res, + u16 *bits_left) +{ + int ret; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct inode *alloc_inode = ac->ac_inode; + + ret = ocfs2_read_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + gd = (struct ocfs2_group_desc *) group_bh->b_data; + ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, + ac->ac_max_block, res); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + if (!ret) + ocfs2_bg_discontig_fix_result(ac, gd, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, + res->sr_bit_offset, res->sr_bits); + if (ret < 0) + mlog_errno(ret); + +out_loc_only: + *bits_left = le16_to_cpu(gd->bg_free_bits_count); + +out: + brelse(group_bh); + + return ret; +} + +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res, + u16 *bits_left) +{ + int status; + u16 chain; + u64 next_group; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *group_bh = NULL; + struct buffer_head *prev_group_bh = NULL; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + struct ocfs2_group_desc *bg; + + chain = ac->ac_chain; + trace_ocfs2_search_chain_begin( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + bits_wanted, chain); + + status = ocfs2_read_group_descriptor(alloc_inode, fe, + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + + status = -ENOSPC; + /* for now, the chain search is a bit simplistic. We just use + * the 1st group with any empty bits. 
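+ *
+ * The loop below walks the chain via bg_next_group, keeping the
+ * previous group's buffer head around so that a group which satisfies
+ * the search can later be relinked to the head of the chain.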
*/ + while ((status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, + res)) == -ENOSPC) { + if (!bg->bg_next_group) + break; + + brelse(prev_group_bh); + prev_group_bh = NULL; + + next_group = le64_to_cpu(bg->bg_next_group); + prev_group_bh = group_bh; + group_bh = NULL; + status = ocfs2_read_group_descriptor(alloc_inode, fe, + next_group, &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + trace_ocfs2_search_chain_succ( + (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); + + res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); + + BUG_ON(res->sr_bits == 0); + if (!status) + ocfs2_bg_discontig_fix_result(ac, bg, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + /* + * Keep track of previous block descriptor read. When + * we find a target, if we have read more than X + * number of descriptors, and the target is reasonably + * empty, relink him to top of his chain. + * + * We've read 0 extra blocks and only send one more to + * the transaction, yet the next guy to search has a + * much easier time. + * + * Do this *after* figuring out how many bits we're taking out + * of our target group. + */ + if (ac->ac_allow_chain_relink && + (prev_group_bh) && + (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { + status = ocfs2_relink_block_group(handle, alloc_inode, + ac->ac_bh, group_bh, + prev_group_bh, chain); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (ac->ac_find_loc_only) + goto out_loc_only; + + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_set_bits(handle, + alloc_inode, + bg, + group_bh, + res->sr_bit_offset, + res->sr_bits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_search_chain_end( + (unsigned long long)le64_to_cpu(fe->i_blkno), + res->sr_bits); + +out_loc_only: + *bits_left = le16_to_cpu(bg->bg_free_bits_count); +bail: + brelse(group_bh); + brelse(prev_group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* will give out up to bits_wanted contiguous bits. */ +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res) +{ + int status; + u16 victim, i; + u16 bits_left = 0; + u64 hint = ac->ac_last_group; + struct ocfs2_chain_list *cl; + struct ocfs2_dinode *fe; + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); + BUG_ON(!ac->ac_bh); + + fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* The bh was validated by the inode read during + * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. 
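+ *
+ * Search order: first retry the hinted group in ac_last_group, then
+ * the chain with the most free bits (relinking allowed), and finally
+ * every remaining chain with relinking disabled, since the journal
+ * was only credited for a single relink.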
*/ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= + le32_to_cpu(fe->id1.bitmap1.i_total)) { + ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used " + "bits but only %u total.", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); + status = -EIO; + goto bail; + } + + res->sr_bg_blkno = hint; + if (res->sr_bg_blkno) { + /* Attempt to short-circuit the usual search mechanism + * by jumping straight to the most recently used + * allocation group. This helps us maintain some + * contiguousness across allocations. */ + status = ocfs2_search_one_group(ac, handle, bits_wanted, + min_bits, res, &bits_left); + if (!status) + goto set_hint; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + + victim = ocfs2_find_victim_chain(cl); + ac->ac_chain = victim; + ac->ac_allow_chain_relink = 1; + + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); + goto set_hint; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_claim_suballoc_bits(victim); + + /* If we didn't pick a good victim, then just default to + * searching each chain in order. Don't allow chain relinking + * because we only calculate enough journal credits for one + * relink per alloc. */ + ac->ac_allow_chain_relink = 0; + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { + if (i == victim) + continue; + if (!cl->cl_recs[i].c_free) + continue; + + ac->ac_chain = i; + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); + break; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + +set_hint: + if (status != -ENOSPC) { + /* If the next search of this group is not likely to + * yield a suitable extent, then we reset the last + * group hint so as to not waste a disk read */ + if (bits_left < min_bits) + ac->ac_last_group = 0; + else + ac->ac_last_group = hint; + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_metadata(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u64 *suballoc_loc, + u16 *suballoc_bit_start, + unsigned int *num_bits, + u64 *blkno_start) +{ + int status; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); + BUG_ON(ac->ac_which != OCFS2_AC_USE_META); + + status = ocfs2_claim_suballoc_bits(ac, + handle, + bits_wanted, + 1, + &res); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit_start = res.sr_bit_offset; + *blkno_start = res.sr_blkno; + ac->ac_bits_given += res.sr_bits; + *num_bits = res.sr_bits; + status = 0; +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_init_inode_ac_group(struct inode *dir, + struct buffer_head *parent_di_bh, + struct ocfs2_alloc_context *ac) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; + /* + * Try to allocate inodes from some specific group. + * + * If the parent dir has recorded the last group used in allocation, + * cool, use it. 
Otherwise if we try to allocate new inode from the + * same slot the parent dir belongs to, use the same chunk. + * + * We are very careful here to avoid the mistake of setting + * ac_last_group to a group descriptor from a different (unlocked) slot. + */ + if (OCFS2_I(dir)->ip_last_used_group && + OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) + ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; + else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { + if (di->i_suballoc_loc) + ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); + else + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(di->i_blkno), + le16_to_cpu(di->i_suballoc_bit)); + } +} + +static inline void ocfs2_save_inode_ac_group(struct inode *dir, + struct ocfs2_alloc_context *ac) +{ + OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; + OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; +} + +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. 
These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, + res->sr_bits); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 *fe_blkno) +{ + int status; + struct ocfs2_suballoc_result res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + status = ocfs2_claim_suballoc_bits(ac, + handle, + 1, + 1, + &res); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res.sr_bits != 1); + + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit = res.sr_bit_offset; + *fe_blkno = res.sr_blkno; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + status = 0; +bail: + if (status) + mlog_errno(status); + return status; +} + +/* translate a group desc. blkno and it's bitmap offset into + * disk cluster offset. */ +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 cluster = 0; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg_blkno != osb->first_cluster_group_blkno) + cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); + cluster += (u32) bg_bit_off; + return cluster; +} + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 group_no; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + group_no = cluster / osb->bitmap_cpg; + if (!group_no) + return osb->first_cluster_group_blkno; + return ocfs2_clusters_to_blocks(inode->i_sb, + group_no * osb->bitmap_cpg); +} + +/* given the block number of a cluster start, calculate which cluster + * group and descriptor bitmap offset that corresponds to. 
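+ * Roughly: data_cluster / bitmap_cpg picks the group, and the
+ * remainder (or the raw cluster number for the first group) becomes
+ * the bit offset.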
*/ +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + *bg_blkno = ocfs2_which_cluster_group(inode, + data_cluster); + + if (*bg_blkno == osb->first_cluster_group_blkno) + *bg_bit_off = (u16) data_cluster; + else + *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, + data_blkno - *bg_blkno); +} + +/* + * min_bits - minimum contiguous chunk from this total allocation we + * can handle. set to what we asked for originally for a full + * contig. allocation, set to '1' to indicate we can deal with extents + * of any size. + */ +int __ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 max_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + int status; + unsigned int bits_wanted = max_clusters; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL + && ac->ac_which != OCFS2_AC_USE_MAIN); + + if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + WARN_ON(min_clusters > 1); + + status = ocfs2_claim_local_alloc_bits(osb, + handle, + ac, + bits_wanted, + cluster_start, + num_clusters); + if (!status) + atomic_inc(&osb->alloc_stats.local_data); + } else { + if (min_clusters > (osb->bitmap_cpg - 1)) { + /* The only paths asking for contiguousness + * should know about this already. */ + mlog(ML_ERROR, "minimum allocation requested %u exceeds " + "group bitmap size %u!\n", min_clusters, + osb->bitmap_cpg); + status = -ENOSPC; + goto bail; + } + /* clamp the current request down to a realistic size. */ + if (bits_wanted > (osb->bitmap_cpg - 1)) + bits_wanted = osb->bitmap_cpg - 1; + + status = ocfs2_claim_suballoc_bits(ac, + handle, + bits_wanted, + min_clusters, + &res); + if (!status) { + BUG_ON(res.sr_blkno); /* cluster alloc can't set */ + *cluster_start = + ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, + res.sr_bg_blkno, + res.sr_bit_offset); + atomic_inc(&osb->alloc_stats.bitmap_data); + *num_clusters = res.sr_bits; + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + ac->ac_bits_given += *num_clusters; + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + + return __ocfs2_claim_clusters(handle, ac, min_clusters, + bits_wanted, cluster_start, num_clusters); +} + +static int ocfs2_block_group_clear_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits, + void (*undo_fn)(unsigned int bit, + unsigned long *bmap)) +{ + int status; + unsigned int tmp; + struct ocfs2_group_desc *undo_bg = NULL; + + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + + trace_ocfs2_block_group_clear_bits(bit_off, num_bits); + + BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + group_bh, + undo_fn ? 
+ OCFS2_JOURNAL_ACCESS_UNDO : + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (undo_fn) { + jbd_lock_bh_state(group_bh); + undo_bg = (struct ocfs2_group_desc *) + bh2jh(group_bh)->b_committed_data; + BUG_ON(!undo_bg); + } + + tmp = num_bits; + while(tmp--) { + ocfs2_clear_bit((bit_off + tmp), + (unsigned long *) bg->bg_bitmap); + if (undo_fn) + undo_fn(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); + } + le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + + if (undo_fn) + jbd_unlock_bh_state(group_bh); + + ocfs2_journal_dirty(handle, group_bh); +bail: + return status; +} + +/* + * expects the suballoc inode to already be locked. + */ +static int _ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) +{ + int status = 0; + u32 tmp_used; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group; + + /* The alloc_bh comes from ocfs2_free_dinode() or + * ocfs2_free_clusters(). The callers have all locked the + * allocator and gotten alloc_bh from the lock call. This + * validates the dinode buffer. Any corruption that has happened + * is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); + + trace_ocfs2_free_suballoc_bits( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + (unsigned long long)bg_blkno, + start_bit, count); + + status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, + &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + group = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + + status = ocfs2_block_group_clear_bits(handle, alloc_inode, + group, group_bh, + start_bit, count, undo_fn); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, + count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + ocfs2_journal_dirty(handle, alloc_bh); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, + start_bit, bg_blkno, count, NULL); +} + +int ocfs2_free_dinode(handle_t *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di) +{ + u64 blk = le64_to_cpu(di->i_blkno); + u16 bit = le16_to_cpu(di->i_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + if (di->i_suballoc_loc) + bg_blkno = 
le64_to_cpu(di->i_suballoc_loc); + return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, + inode_alloc_bh, bit, bg_blkno, 1); +} + +static int _ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) +{ + int status; + u16 bg_start_bit; + u64 bg_blkno; + struct ocfs2_dinode *fe; + + /* You can't ever have a contiguous set of clusters + * bigger than a block group bitmap so we never have to worry + * about looping on them. + * This is expensive. We can safely remove once this stuff has + * gotten tested really well. */ + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + + fe = (struct ocfs2_dinode *) bitmap_bh->b_data; + + ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, + &bg_start_bit); + + trace_ocfs2_free_clusters((unsigned long long)bg_blkno, + (unsigned long long)start_blk, + bg_start_bit, num_clusters); + + status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters, undo_fn); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_set_bit); +} + +/* + * Give never-used clusters back to the global bitmap. We don't need + * to protect these bits in the undo buffer. 
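+ * That is why _ocfs2_clear_bit is passed as the undo_fn here, whereas
+ * ocfs2_free_clusters() passes _ocfs2_set_bit so that freed bits stay
+ * allocated in the journal's committed copy until the transaction
+ * commits.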
+ */ +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_clear_bit); +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) +{ + printk("Block Group:\n"); + printk("bg_signature: %s\n", bg->bg_signature); + printk("bg_size: %u\n", bg->bg_size); + printk("bg_bits: %u\n", bg->bg_bits); + printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); + printk("bg_chain: %u\n", bg->bg_chain); + printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); + printk("bg_next_group: %llu\n", + (unsigned long long)bg->bg_next_group); + printk("bg_parent_dinode: %llu\n", + (unsigned long long)bg->bg_parent_dinode); + printk("bg_blkno: %llu\n", + (unsigned long long)bg->bg_blkno); +} + +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) +{ + int i; + + printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); + printk("i_signature: %s\n", fe->i_signature); + printk("i_size: %llu\n", + (unsigned long long)fe->i_size); + printk("i_clusters: %u\n", fe->i_clusters); + printk("i_generation: %u\n", + le32_to_cpu(fe->i_generation)); + printk("id1.bitmap1.i_used: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_used)); + printk("id1.bitmap1.i_total: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_total)); + printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); + printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); + printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); + printk("id2.i_chain.cl_next_free_rec: %u\n", + fe->id2.i_chain.cl_next_free_rec); + for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { + printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_free); + printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_total); + printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, + (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); + } +} + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. 
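+ *
+ * Concretely, the check below reserves metadata whenever no free
+ * extent records remain, or (on a sparse filesystem) when fewer
+ * remain than the worst case of clusters_to_add plus two records per
+ * split.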
+ */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} + +/* + * Read the inode specified by blkno to get suballoc_slot and + * suballoc_bit. + */ +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + + trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); + + /* dirty read disk */ + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); + if (status < 0) { + mlog(ML_ERROR, "read block %llu failed %d\n", + (unsigned long long)blkno, status); + goto bail; + } + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + mlog(ML_ERROR, "invalid inode %llu requested\n", + (unsigned long long)blkno); + status = -EINVAL; + goto bail; + } + + if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", + (unsigned long long)blkno, + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); + +bail: + brelse(inode_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * test whether bit is SET in allocator bitmap or not. on success, 0 + * is returned and *res is 1 for SET; 0 otherwise. when fails, errno + * is returned and *res is meaningless. Call this after you have + * cluster locked against suballoc, or you may get a result based on + * non-up2date contents + */ +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, + struct inode *suballoc, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, + u16 bit, int *res) +{ + struct ocfs2_dinode *alloc_di; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, + (unsigned int)bit); + + alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { + mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", + (unsigned int)bit, + ocfs2_bits_per_group(&alloc_di->id2.i_chain)); + status = -EINVAL; + goto bail; + } + + bg_blkno = group_blkno ? 
group_blkno : + ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, + &group_bh); + if (status < 0) { + mlog(ML_ERROR, "read group %llu failed %d\n", + (unsigned long long)bg_blkno, status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * Test if the bit representing this inode (blkno) is set in the + * suballocator. + * + * On success, 0 is returned and *res is 1 for SET; 0 otherwise. + * + * In the event of failure, a negative value is returned and *res is + * meaningless. + * + * Callers must make sure to hold nfs_sync_lock to prevent + * ocfs2_delete_inode() on another node from accessing the same + * suballocator concurrently. + */ +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) +{ + int status; + u64 group_blkno = 0; + u16 suballoc_bit = 0, suballoc_slot = 0; + struct inode *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + + trace_ocfs2_test_inode_bit((unsigned long long)blkno); + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &group_blkno, &suballoc_bit); + if (status < 0) { + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); + goto bail; + } + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + /* the error code could be inaccurate, but we are not able to + * get the correct one. */ + status = -EINVAL; + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", + (u32)suballoc_slot); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", + (u32)suballoc_slot, status); + goto bail; + } + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + group_blkno, blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); + + iput(inode_alloc_inode); + brelse(alloc_bh); +bail: + if (status) + mlog_errno(status); + return status; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 00000000..b8afabfe --- /dev/null +++ b/fs/ocfs2/suballoc.h @@ -0,0 +1,221 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.h + * + * Defines sub allocator api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef _CHAINALLOC_H_ +#define _CHAINALLOC_H_ + +struct ocfs2_suballoc_result; +typedef int (group_search_t)(struct inode *, + struct buffer_head *, + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + struct ocfs2_suballoc_result *); + /* found bits */ + +struct ocfs2_alloc_context { + struct inode *ac_inode; /* which bitmap are we allocating from? */ + struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_alloc_slot; /* which slot are we allocating from? */ + u32 ac_bits_wanted; + u32 ac_bits_given; +#define OCFS2_AC_USE_LOCAL 1 +#define OCFS2_AC_USE_MAIN 2 +#define OCFS2_AC_USE_INODE 3 +#define OCFS2_AC_USE_META 4 + u32 ac_which; + + /* these are used by the chain search */ + u16 ac_chain; + int ac_allow_chain_relink; + group_search_t *ac_group_search; + + u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ + + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + + struct ocfs2_alloc_reservation *ac_resv; +}; + +void ocfs2_init_steal_slots(struct ocfs2_super *osb); +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); +static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) +{ + return ac->ac_bits_wanted - ac->ac_bits_given; +} + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac); + +int ocfs2_claim_metadata(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u64 *suballoc_loc, + u16 *suballoc_bit_start, + u32 *num_bits, + u64 *blkno_start); +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 *fe_blkno); +int ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters); +/* + * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * number of clusters smaller than the allocation reserved. 
+ */ +int __ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 max_clusters, + u32 *cluster_start, + u32 *num_clusters); + +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); +int ocfs2_free_dinode(handle_t *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di); +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); + +static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + +static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, + u64 bg_blkno) +{ + /* This should work for all block group descriptors as only + * the 1st group descriptor of the cluster bitmap is + * different. */ + + if (bg_blkno == osb->first_cluster_group_blkno) + return 0; + + /* the rest of the block groups are located at the beginning + * of their 1st cluster, so a direct translation just + * works. */ + return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); +} + +static inline int ocfs2_is_cluster_bitmap(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; +} + +/* This is for local alloc ONLY. Others should use the task-specific + * apis above. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); + +/* + * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it + * finds a problem. A caller that wants to check a group descriptor + * without going readonly should read the block with ocfs2_read_block[s]() + * and then checking it with this function. This is only resize, really. + * Everyone else should be using ocfs2_read_group_descriptor(). + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh); +/* + * Read a group descriptor block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The descriptor will be validated with + * ocfs2_validate_group_descriptor(). + */ +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh); + +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); + +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). 
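+ * ocfs2_find_new_inode_loc() only locates a suitable bit and stashes
+ * the result in ac->ac_find_loc_priv; the bit is actually claimed
+ * later by ocfs2_claim_new_inode_at_loc() inside the caller's
+ * transaction.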
+ */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + +#endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 00000000..56f61027 --- /dev/null +++ b/fs/ocfs2/super.c @@ -0,0 +1,2651 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.c + * + * load/unload driver, mount/dismount volumes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "ocfs2_trace.h" + +#include + +#include "ocfs2.h" + +/* this should be the only file to include a version 1 header */ +#include "ocfs1_fs_compat.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "export.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ver.h" +#include "xattr.h" +#include "quota.h" +#include "refcounttree.h" +#include "suballoc.h" + +#include "buffer_head_io.h" + +static struct kmem_cache *ocfs2_inode_cachep = NULL; +struct kmem_cache *ocfs2_dquot_cachep; +struct kmem_cache *ocfs2_qf_chunk_cachep; + +/* OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. 
*/ +struct workqueue_struct *ocfs2_wq = NULL; + +static struct dentry *ocfs2_debugfs_root = NULL; + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +struct mount_options +{ + unsigned long commit_interval; + unsigned long mount_opt; + unsigned int atime_quantum; + signed short slot; + int localalloc_opt; + unsigned int resv_level; + int dir_resv_level; + char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; +}; + +static int ocfs2_parse_options(struct super_block *sb, char *options, + struct mount_options *mopt, + int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); +static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); +static void ocfs2_put_super(struct super_block *sb); +static int ocfs2_mount_volume(struct super_block *sb); +static int ocfs2_remount(struct super_block *sb, int *flags, char *data); +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); +static int ocfs2_initialize_mem_caches(void); +static void ocfs2_free_mem_caches(void); +static void ocfs2_delete_osb(struct ocfs2_super *osb); + +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); + +static int ocfs2_sync_fs(struct super_block *sb, int wait); + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); +static void ocfs2_release_system_inodes(struct ocfs2_super *osb); +static int ocfs2_check_volume(struct ocfs2_super *osb); +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 sectsize, + struct ocfs2_blockcheck_stats *stats); +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size, + struct ocfs2_blockcheck_stats *stats); +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size); +static struct inode *ocfs2_alloc_inode(struct super_block *sb); +static void ocfs2_destroy_inode(struct inode *inode); +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); +static int ocfs2_enable_quotas(struct ocfs2_super *osb); +static void ocfs2_disable_quotas(struct ocfs2_super *osb); + +static const struct super_operations ocfs2_sops = { + .statfs = ocfs2_statfs, + .alloc_inode = ocfs2_alloc_inode, + .destroy_inode = ocfs2_destroy_inode, + .drop_inode = ocfs2_drop_inode, + .evict_inode = ocfs2_evict_inode, + .sync_fs = ocfs2_sync_fs, + .put_super = ocfs2_put_super, + .remount_fs = ocfs2_remount, + .show_options = ocfs2_show_options, + .quota_read = ocfs2_quota_read, + .quota_write = ocfs2_quota_write, +}; + +enum { + Opt_barrier, + Opt_err_panic, + Opt_err_ro, + Opt_intr, + Opt_nointr, + Opt_hb_none, + Opt_hb_local, + Opt_hb_global, + Opt_data_ordered, + Opt_data_writeback, + Opt_atime_quantum, + Opt_slot, + Opt_commit, + Opt_localalloc, + Opt_localflocks, + Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, + Opt_acl, + Opt_noacl, + Opt_usrquota, + Opt_grpquota, + Opt_coherency_buffered, + Opt_coherency_full, + Opt_resv_level, + Opt_dir_resv_level, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier=%u"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_intr, "intr"}, + {Opt_nointr, "nointr"}, + {Opt_hb_none, OCFS2_HB_NONE}, + {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_hb_global, OCFS2_HB_GLOBAL}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_atime_quantum, "atime_quantum=%u"}, + {Opt_slot, 
"preferred_slot=%u"}, + {Opt_commit, "commit=%u"}, + {Opt_localalloc, "localalloc=%d"}, + {Opt_localflocks, "localflocks"}, + {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_coherency_buffered, "coherency=buffered"}, + {Opt_coherency_full, "coherency=full"}, + {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, + {Opt_err, NULL} +}; + +#ifdef CONFIG_DEBUG_FS +static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) +{ + struct ocfs2_cluster_connection *cconn = osb->cconn; + struct ocfs2_recovery_map *rm = osb->recovery_map; + struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; + int i, out = 0; + + out += snprintf(buf + out, len - out, + "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", + "Device", osb->dev_str, osb->uuid_str, + osb->fs_generation, osb->vol_label); + + out += snprintf(buf + out, len - out, + "%10s => State: %d Flags: 0x%lX\n", "Volume", + atomic_read(&osb->vol_state), osb->osb_flags); + + out += snprintf(buf + out, len - out, + "%10s => Block: %lu Cluster: %d\n", "Sizes", + osb->sb->s_blocksize, osb->s_clustersize); + + out += snprintf(buf + out, len - out, + "%10s => Compat: 0x%X Incompat: 0x%X " + "ROcompat: 0x%X\n", + "Features", osb->s_feature_compat, + osb->s_feature_incompat, osb->s_feature_ro_compat); + + out += snprintf(buf + out, len - out, + "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", + osb->s_mount_opt, osb->s_atime_quantum); + + if (cconn) { + out += snprintf(buf + out, len - out, + "%10s => Stack: %s Name: %*s " + "Version: %d.%d\n", "Cluster", + (*osb->osb_cluster_stack == '\0' ? + "o2cb" : osb->osb_cluster_stack), + cconn->cc_namelen, cconn->cc_name, + cconn->cc_version.pv_major, + cconn->cc_version.pv_minor); + } + + spin_lock(&osb->dc_task_lock); + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Count: %lu WakeSeq: %lu " + "WorkSeq: %lu\n", "DownCnvt", + (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), + osb->blocked_lock_count, osb->dc_wake_sequence, + osb->dc_work_sequence); + spin_unlock(&osb->dc_task_lock); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + "Recovery", + (osb->recovery_thread_task ? + task_pid_nr(osb->recovery_thread_task) : -1)); + if (rm->rm_used == 0) + out += snprintf(buf + out, len - out, " None\n"); + else { + for (i = 0; i < rm->rm_used; i++) + out += snprintf(buf + out, len - out, " %d", + rm->rm_entries[i]); + out += snprintf(buf + out, len - out, "\n"); + } + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + (osb->commit_task ? 
task_pid_nr(osb->commit_task) : -1), + osb->osb_commit_interval, + atomic_read(&osb->needs_checkpoint)); + + out += snprintf(buf + out, len - out, + "%10s => State: %d TxnId: %lu NumTxns: %d\n", + "Journal", osb->journal->j_state, + osb->journal->j_trans_id, + atomic_read(&osb->journal->j_num_trans)); + + out += snprintf(buf + out, len - out, + "%10s => GlobalAllocs: %d LocalAllocs: %d " + "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", + "Stats", + atomic_read(&osb->alloc_stats.bitmap_data), + atomic_read(&osb->alloc_stats.local_data), + atomic_read(&osb->alloc_stats.bg_allocs), + atomic_read(&osb->alloc_stats.moves), + atomic_read(&osb->alloc_stats.bg_extends)); + + out += snprintf(buf + out, len - out, + "%10s => State: %u Descriptor: %llu Size: %u bits " + "Default: %u bits\n", + "LocalAlloc", osb->local_alloc_state, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_bits, osb->local_alloc_default_bits); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, + "%10s => InodeSlot: %d StolenInodes: %d, " + "MetaSlot: %d StolenMeta: %d\n", "Steal", + osb->s_inode_steal_slot, + atomic_read(&osb->s_num_inodes_stolen), + osb->s_meta_steal_slot, + atomic_read(&osb->s_num_meta_stolen)); + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, "OrphanScan => "); + out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + os->os_count, os->os_seqno); + out += snprintf(buf + out, len - out, " Last Scan: "); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + out += snprintf(buf + out, len - out, "Disabled\n"); + else + out += snprintf(buf + out, len - out, "%lu seconds ago\n", + (get_seconds() - os->os_scantime.tv_sec)); + + out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + "Slots", "Num", "RecoGen"); + for (i = 0; i < osb->max_slots; ++i) { + out += snprintf(buf + out, len - out, + "%10s %c %3d %10d\n", + " ", + (i == osb->slot_num ? 
'*' : ' '), + i, osb->slot_recovery_generations[i]); + } + + return out; +} + +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + struct ocfs2_super *osb = inode->i_private; + char *buf = NULL; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct file_operations ocfs2_osb_debug_fops = { + .open = ocfs2_osb_debug_open, + .release = ocfs2_debug_release, + .read = ocfs2_debug_read, + .llseek = generic_file_llseek, +}; + +static int ocfs2_sync_fs(struct super_block *sb, int wait) +{ + int status; + tid_t target; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (wait) { + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + } else { + ocfs2_schedule_truncate_log_flush(osb, 0); + } + + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); + } + return 0; +} + +static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) +{ + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + && (ino == USER_QUOTA_SYSTEM_INODE + || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) + return 0; + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + && (ino == GROUP_QUOTA_SYSTEM_INODE + || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) + return 0; + return 1; +} + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->root_inode = new; + + new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->sys_root_inode = new; + + for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; + i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog_errno(status); + /* FIXME: Should ERROR_RO_FS */ + mlog(ML_ERROR, "Unable to load system inode %d, " + "possibly corrupt fs?", i); + goto bail; + } + // the array now has one ref, so drop this one + iput(new); + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; + i < NUM_SYSTEM_INODES; + i++) { + if 
(!ocfs2_need_system_inode(osb, i)) + continue; + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", + status, i, osb->slot_num); + goto bail; + } + /* the array now has one ref, so drop this one */ + iput(new); + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_release_system_inodes(struct ocfs2_super *osb) +{ + int i; + struct inode *inode; + + for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { + inode = osb->global_system_inodes[i]; + if (inode) { + iput(inode); + osb->global_system_inodes[i] = NULL; + } + } + + inode = osb->sys_root_inode; + if (inode) { + iput(inode); + osb->sys_root_inode = NULL; + } + + inode = osb->root_inode; + if (inode) { + iput(inode); + osb->root_inode = NULL; + } + + if (!osb->local_system_inodes) + return; + + for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { + if (osb->local_system_inodes[i]) { + iput(osb->local_system_inodes[i]); + osb->local_system_inodes[i] = NULL; + } + } + + kfree(osb->local_system_inodes); + osb->local_system_inodes = NULL; +} + +/* We're allocating fs objects, use GFP_NOFS */ +static struct inode *ocfs2_alloc_inode(struct super_block *sb) +{ + struct ocfs2_inode_info *oi; + + oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); + if (!oi) + return NULL; + + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); + return &oi->vfs_inode; +} + +static void ocfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); +} + +static void ocfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ocfs2_i_callback); +} + +static unsigned long long ocfs2_max_file_offset(unsigned int bbits, + unsigned int cbits) +{ + unsigned int bytes = 1 << cbits; + unsigned int trim = bytes; + unsigned int bitshift = 32; + + /* + * i_size and all block offsets in ocfs2 are always 64 bits + * wide. i_clusters is 32 bits, in cluster-sized units. So on + * 64 bit platforms, cluster size will be the limiting factor. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + BUILD_BUG_ON(sizeof(sector_t) != 8); + /* + * We might be limited by page cache size. + */ + if (bytes > PAGE_CACHE_SIZE) { + bytes = PAGE_CACHE_SIZE; + trim = 1; + /* + * Shift by 31 here so that we don't get larger than + * MAX_LFS_FILESIZE + */ + bitshift = 31; + } +# else + /* + * We are limited by the size of sector_t. Use block size, as + * that's what we expose to the VFS. + */ + bytes = 1 << bbits; + trim = 1; + bitshift = 31; +# endif +#endif + + /* + * Trim by a whole cluster when we can actually approach the + * on-disk limits. Otherwise we can overflow i_clusters when + * an extent start is at the max offset. 
+ */ + return (((unsigned long long)bytes) << bitshift) - trim; +} + +static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +{ + int incompat_features; + int ret = 0; + struct mount_options parsed_options; + struct ocfs2_super *osb = OCFS2_SB(sb); + u32 tmp; + + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { + ret = -EINVAL; + goto out; + } + + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != + (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change data mode on remount\n"); + goto out; + } + + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + + /* We're going to/from readonly mode. */ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ + if (*flags & MS_RDONLY) { + ret = ocfs2_susp_quotas(osb, 0); + if (ret < 0) + goto out; + } + /* Lock here so the check of HARD_RO and the potential + * setting of SOFT_RO is atomic. */ + spin_lock(&osb->osb_lock); + if (osb->osb_flags & OCFS2_OSB_HARD_RO) { + mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); + ret = -EROFS; + goto unlock_osb; + } + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + } else { + if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { + mlog(ML_ERROR, "Cannot remount RDWR " + "filesystem due to previous errors.\n"); + ret = -EROFS; + goto unlock_osb; + } + incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); + if (incompat_features) { + mlog(ML_ERROR, "Cannot remount RDWR because " + "of unsupported optional features " + "(%x).\n", incompat_features); + ret = -EINVAL; + goto unlock_osb; + } + sb->s_flags &= ~MS_RDONLY; + osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; + } + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); +unlock_osb: + spin_unlock(&osb->osb_lock); + /* Enable quota accounting after remounting RW */ + if (!ret && !(*flags & MS_RDONLY)) { + if (sb_any_quota_suspended(sb)) + ret = ocfs2_susp_quotas(osb, 1); + else + ret = ocfs2_enable_quotas(osb); + if (ret < 0) { + /* Return back changes... */ + spin_lock(&osb->osb_lock); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + goto out; + } + } + } + + if (!ret) { + /* Only save off the new mount options in case of a successful + * remount. */ + osb->s_mount_opt = parsed_options.mount_opt; + osb->s_atime_quantum = parsed_options.atime_quantum; + osb->preferred_slot = parsed_options.slot; + if (parsed_options.commit_interval) + osb->osb_commit_interval = parsed_options.commit_interval; + + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? 
+ MS_POSIXACL : 0); + } +out: + return ret; +} + +static int ocfs2_sb_probe(struct super_block *sb, + struct buffer_head **bh, + int *sector_size, + struct ocfs2_blockcheck_stats *stats) +{ + int status, tmpstat; + struct ocfs1_vol_disk_hdr *hdr; + struct ocfs2_dinode *di; + int blksize; + + *bh = NULL; + + /* may be > 512 */ + *sector_size = bdev_logical_block_size(sb->s_bdev); + if (*sector_size > OCFS2_MAX_BLOCKSIZE) { + mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", + *sector_size, OCFS2_MAX_BLOCKSIZE); + status = -EINVAL; + goto bail; + } + + /* Can this really happen? */ + if (*sector_size < OCFS2_MIN_BLOCKSIZE) + *sector_size = OCFS2_MIN_BLOCKSIZE; + + /* check block zero for old format */ + status = ocfs2_get_sector(sb, bh, 0, *sector_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; + if (hdr->major_version == OCFS1_MAJOR_VERSION) { + mlog(ML_ERROR, "incompatible version: %u.%u\n", + hdr->major_version, hdr->minor_version); + status = -EINVAL; + } + if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, + strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { + mlog(ML_ERROR, "incompatible volume signature: %8s\n", + hdr->signature); + status = -EINVAL; + } + brelse(*bh); + *bh = NULL; + if (status < 0) { + mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " + "upgraded before mounting with ocfs v2\n"); + goto bail; + } + + /* + * Now check at magic offset for 512, 1024, 2048, 4096 + * blocksizes. 4096 is the maximum blocksize because it is + * the minimum clustersize. + */ + status = -EINVAL; + for (blksize = *sector_size; + blksize <= OCFS2_MAX_BLOCKSIZE; + blksize <<= 1) { + tmpstat = ocfs2_get_sector(sb, bh, + OCFS2_SUPER_BLOCK_BLKNO, + blksize); + if (tmpstat < 0) { + status = tmpstat; + mlog_errno(status); + break; + } + di = (struct ocfs2_dinode *) (*bh)->b_data; + memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); + tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); + if (tmpstat < 0) { + brelse(*bh); + *bh = NULL; + } + if (tmpstat != -EAGAIN) { + status = tmpstat; + break; + } + } + +bail: + return status; +} + +static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) +{ + u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; + + if (osb->s_mount_opt & hb_enabled) { + if (ocfs2_mount_local(osb)) { + mlog(ML_ERROR, "Cannot heartbeat on a locally " + "mounted device.\n"); + return -EINVAL; + } + if (ocfs2_userspace_stack(osb)) { + mlog(ML_ERROR, "Userspace stack expected, but " + "o2cb heartbeat arguments passed to mount\n"); + return -EINVAL; + } + if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && + !ocfs2_cluster_o2cb_global_heartbeat(osb)) || + ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && + ocfs2_cluster_o2cb_global_heartbeat(osb))) { + mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); + return -EINVAL; + } + } + + if (!(osb->s_mount_opt & hb_enabled)) { + if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && + !ocfs2_userspace_stack(osb)) { + mlog(ML_ERROR, "Heartbeat has to be started to mount " + "a read-write clustered device.\n"); + return -EINVAL; + } + } + + return 0; +} + +/* + * If we're using a userspace stack, mount should have passed + * a name that matches the disk. If not, mount should not + * have passed a stack. 
+ */ +static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, + struct mount_options *mopt) +{ + if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { + mlog(ML_ERROR, + "cluster stack passed to mount, but this filesystem " + "does not support it\n"); + return -EINVAL; + } + + if (ocfs2_userspace_stack(osb) && + strncmp(osb->osb_cluster_stack, mopt->cluster_stack, + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "cluster stack passed to mount (\"%s\") does not " + "match the filesystem (\"%s\")\n", + mopt->cluster_stack, + osb->osb_cluster_stack); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) +{ + int type; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + int status = 0; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + if (unsuspend) + status = dquot_resume(sb, type); + else { + struct ocfs2_mem_dqinfo *oinfo; + + /* Cancel periodic syncing before suspending */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + status = dquot_suspend(sb, type); + } + if (status < 0) + break; + } + if (status < 0) + mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " + "remount (error = %d).\n", status); + return status; +} + +static int ocfs2_enable_quotas(struct ocfs2_super *osb) +{ + struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + int status; + int type; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + inode[type] = ocfs2_get_system_file_inode(osb, ino[type], + osb->slot_num); + if (!inode[type]) { + status = -ENOENT; + goto out_quota_off; + } + status = dquot_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); + if (status < 0) + goto out_quota_off; + } + + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + return 0; +out_quota_off: + ocfs2_disable_quotas(osb); + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + mlog_errno(status); + return status; +} + +static void ocfs2_disable_quotas(struct ocfs2_super *osb) +{ + int type; + struct inode *inode; + struct super_block *sb = osb->sb; + struct ocfs2_mem_dqinfo *oinfo; + + /* We mostly ignore errors in this function because there's not much + * we can do when we see them */ + for (type = 0; type < MAXQUOTAS; type++) { + if (!sb_has_quota_loaded(sb, type)) + continue; + /* Cancel periodic syncing before we grab dqonoff_mutex */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + inode = igrab(sb->s_dquot.files[type]); + /* Turn off quotas. 
This will remove all dquot structures from + * memory and so they will be automatically synced to global + * quota files */ + dquot_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (!inode) + continue; + iput(inode); + } +} + +/* Handle quota on quotactl */ +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + return -EINVAL; + + return dquot_enable(sb_dqopt(sb)->files[type], type, + format_id, DQUOT_LIMITS_ENABLED); +} + +/* Handle quota off quotactl */ +static int ocfs2_quota_off(struct super_block *sb, int type) +{ + return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); +} + +static const struct quotactl_ops ocfs2_quotactl_ops = { + .quota_on_meta = ocfs2_quota_on, + .quota_off = ocfs2_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; + +static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct dentry *root; + int status, sector_size; + struct mount_options parsed_options; + struct inode *inode = NULL; + struct ocfs2_super *osb = NULL; + struct buffer_head *bh = NULL; + char nodestr[8]; + struct ocfs2_blockcheck_stats stats; + + trace_ocfs2_fill_super(sb, data, silent); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { + status = -EINVAL; + goto read_super_error; + } + + /* probe for superblock */ + status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); + if (status < 0) { + mlog(ML_ERROR, "superblock probe failed!\n"); + goto read_super_error; + } + + status = ocfs2_initialize_super(sb, bh, sector_size, &stats); + osb = OCFS2_SB(sb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + brelse(bh); + bh = NULL; + + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } + osb->s_mount_opt = parsed_options.mount_opt; + osb->s_atime_quantum = parsed_options.atime_quantum; + osb->preferred_slot = parsed_options.slot; + osb->osb_commit_interval = parsed_options.commit_interval; + + ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); + osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; + + status = ocfs2_verify_userspace_stack(osb, &parsed_options); + if (status) + goto read_super_error; + + sb->s_magic = OCFS2_SUPER_MAGIC; + + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + * heartbeat=none */ + if (bdev_read_only(sb->s_bdev)) { + if (!(sb->s_flags & MS_RDONLY)) { + status = -EACCES; + mlog(ML_ERROR, "Readonly device detected but readonly " + "mount was not specified.\n"); + goto read_super_error; + } + + /* You should not be able to start a local heartbeat + * on a readonly device. 
*/ + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + status = -EROFS; + mlog(ML_ERROR, "Local heartbeat specified on readonly " + "device.\n"); + goto read_super_error; + } + + status = ocfs2_check_journals_nolocks(osb); + if (status < 0) { + if (status == -EROFS) + mlog(ML_ERROR, "Recovery required on readonly " + "file system, but write access is " + "unavailable.\n"); + else + mlog_errno(status); + goto read_super_error; + } + + ocfs2_set_ro_flag(osb, 1); + + printk(KERN_NOTICE "Readonly device detected. No cluster " + "services will be utilized for this mount. Recovery " + "will be skipped.\n"); + } + + if (!ocfs2_is_hard_readonly(osb)) { + if (sb->s_flags & MS_RDONLY) + ocfs2_set_ro_flag(osb, 0); + } + + status = ocfs2_verify_heartbeat(osb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, + ocfs2_debugfs_root); + if (!osb->osb_debug_root) { + status = -EINVAL; + mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); + goto read_super_error; + } + + osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_osb_debug_fops); + if (!osb->osb_ctxt) { + status = -EINVAL; + mlog_errno(status); + goto read_super_error; + } + + if (ocfs2_meta_ecc(osb)) { + status = ocfs2_blockcheck_stats_debugfs_install( + &osb->osb_ecc_stats, + osb->osb_debug_root); + if (status) { + mlog(ML_ERROR, + "Unable to create blockcheck statistics " + "files\n"); + goto read_super_error; + } + } + + status = ocfs2_mount_volume(sb); + if (osb->root_inode) + inode = igrab(osb->root_inode); + + if (status < 0) + goto read_super_error; + + if (!inode) { + status = -EIO; + mlog_errno(status); + goto read_super_error; + } + + root = d_alloc_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); + + printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " + "with %s data mode.\n", + osb->dev_str, nodestr, osb->slot_num, + osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : + "ordered"); + + atomic_set(&osb->vol_state, VOLUME_MOUNTED); + wake_up(&osb->osb_mount_event); + + /* Now we can initialize quotas because we can afford to wait + * for cluster locks recovery now. 
That also means that truncation + * log recovery can happen but that waits for proper quota setup */ + if (!(sb->s_flags & MS_RDONLY)) { + status = ocfs2_enable_quotas(osb); + if (status < 0) { + /* We have to err-out specially here because + * s_root is already set */ + mlog_errno(status); + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + return status; + } + } + + ocfs2_complete_quota_recovery(osb); + + /* Now we wake up again for processes waiting for quotas */ + atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); + wake_up(&osb->osb_mount_event); + + /* Start this when the mount is almost sure of being successful */ + ocfs2_orphan_scan_start(osb); + + return status; + +read_super_error: + brelse(bh); + + if (inode) + iput(inode); + + if (osb) { + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + } + + if (status) + mlog_errno(status); + return status; +} + +static struct dentry *ocfs2_mount(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); +} + +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Failed mount? */ + if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) + goto out; + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); +out: + kill_block_super(sb); +} + +static struct file_system_type ocfs2_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2", + .mount = ocfs2_mount, + .kill_sb = ocfs2_kill_sb, + + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, + .next = NULL +}; + +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + +static int ocfs2_parse_options(struct super_block *sb, + char *options, + struct mount_options *mopt, + int is_remount) +{ + int status, user_stack = 0; + char *p; + u32 tmp; + + trace_ocfs2_parse_options(is_remount, options ? 
options : "(none)"); + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + if (!options) { + status = 1; + goto bail; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token, option; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_hb_local: + mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_hb_none: + mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; + break; + case Opt_hb_global: + mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; + break; + case Opt_barrier: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_nointr: + mopt->mount_opt |= OCFS2_MOUNT_NOINTR; + break; + case Opt_err_panic: + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_data_ordered: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_data_writeback: + mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0) + mopt->atime_quantum = option; + break; + case Opt_slot: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + mopt->slot = (s16)option; + break; + case Opt_commit: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option < 0) + return 0; + if (option == 0) + option = JBD2_DEFAULT_MAX_COMMIT_AGE; + mopt->commit_interval = HZ * option; + break; + case Opt_localalloc: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0) + mopt->localalloc_opt = option; + break; + case Opt_localflocks: + /* + * Changing this during remount could race + * flock() requests, or "unbalance" existing + * ones (e.g., a lock is taken in one mode but + * dropped in the other). If users care enough + * to flip locking modes during remount, we + * could add a "local" flag to individual + * flock structures for proper tracking of + * state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed + * is of the right length and that it is a proper + * string of the right length. + */ + if (((args[0].to - args[0].from) != + OCFS2_STACK_LABEL_LEN) || + (strnlen(args[0].from, + OCFS2_STACK_LABEL_LEN) != + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "Invalid cluster_stack option\n"); + status = 0; + goto bail; + } + memcpy(mopt->cluster_stack, args[0].from, + OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack(). 
+ */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency_buffered: + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_coherency_full: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_acl: + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; + break; + case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " + "or missing value\n", p); + status = 0; + goto bail; + } + } + + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } + } + + status = 1; + +bail: + return status; +} + +static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); + unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; + + if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { + seq_printf(s, ",_netdev"); + if (opts & OCFS2_MOUNT_HB_LOCAL) + seq_printf(s, ",%s", OCFS2_HB_LOCAL); + else + seq_printf(s, ",%s", OCFS2_HB_GLOBAL); + } else + seq_printf(s, ",%s", OCFS2_HB_NONE); + + if (opts & OCFS2_MOUNT_NOINTR) + seq_printf(s, ",nointr"); + + if (opts & OCFS2_MOUNT_DATA_WRITEBACK) + seq_printf(s, ",data=writeback"); + else + seq_printf(s, ",data=ordered"); + + if (opts & OCFS2_MOUNT_BARRIER) + seq_printf(s, ",barrier=1"); + + if (opts & OCFS2_MOUNT_ERRORS_PANIC) + seq_printf(s, ",errors=panic"); + else + seq_printf(s, ",errors=remount-ro"); + + if (osb->preferred_slot != OCFS2_INVALID_SLOT) + seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); + + if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) + seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + + if (osb->osb_commit_interval) + seq_printf(s, ",commit=%u", + (unsigned) (osb->osb_commit_interval / HZ)); + + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != ocfs2_la_default_mb(osb)) + seq_printf(s, ",localalloc=%d", local_alloc_megs); + + if (opts & OCFS2_MOUNT_LOCALFLOCKS) + seq_printf(s, ",localflocks,"); + + if (osb->osb_cluster_stack[0]) + seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, + osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_USRQUOTA) + seq_printf(s, ",usrquota"); + if (opts & OCFS2_MOUNT_GRPQUOTA) + seq_printf(s, ",grpquota"); + + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) + seq_printf(s, ",coherency=buffered"); + else + seq_printf(s, ",coherency=full"); + + if (opts & 
OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + + if (opts & OCFS2_MOUNT_POSIX_ACL) + seq_printf(s, ",acl"); + else + seq_printf(s, ",noacl"); + + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + + return 0; +} + +static int __init ocfs2_init(void) +{ + int status; + + ocfs2_print_version(); + + status = init_ocfs2_uptodate_cache(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_initialize_mem_caches(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!ocfs2_wq) { + status = -ENOMEM; + goto leave; + } + + ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); + if (!ocfs2_debugfs_root) { + status = -EFAULT; + mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + } + + ocfs2_set_locking_protocol(); + + status = register_quota_format(&ocfs2_quota_format); +leave: + if (status < 0) { + ocfs2_free_mem_caches(); + exit_ocfs2_uptodate_cache(); + mlog_errno(status); + } + + if (status >= 0) { + return register_filesystem(&ocfs2_fs_type); + } else + return -1; +} + +static void __exit ocfs2_exit(void) +{ + if (ocfs2_wq) { + flush_workqueue(ocfs2_wq); + destroy_workqueue(ocfs2_wq); + } + + unregister_quota_format(&ocfs2_quota_format); + + debugfs_remove(ocfs2_debugfs_root); + + ocfs2_free_mem_caches(); + + unregister_filesystem(&ocfs2_fs_type); + + exit_ocfs2_uptodate_cache(); +} + +static void ocfs2_put_super(struct super_block *sb) +{ + trace_ocfs2_put_super(sb); + + ocfs2_sync_blockdev(sb); + ocfs2_dismount_volume(sb, 0); +} + +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ocfs2_super *osb; + u32 numbits, freebits; + int status; + struct ocfs2_dinode *bm_lock; + struct buffer_head *bh = NULL; + struct inode *inode = NULL; + + trace_ocfs2_statfs(dentry->d_sb, buf); + + osb = OCFS2_SB(dentry->d_sb); + + inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + mlog(ML_ERROR, "failed to get bitmap inode\n"); + status = -EIO; + goto bail; + } + + status = ocfs2_inode_lock(inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bm_lock = (struct ocfs2_dinode *) bh->b_data; + + numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); + freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); + + buf->f_type = OCFS2_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = OCFS2_MAX_FILENAME_LEN; + buf->f_blocks = ((sector_t) numbits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bfree = ((sector_t) freebits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bavail = buf->f_bfree; + buf->f_files = numbits; + buf->f_ffree = freebits; + buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) + & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, + OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; + + brelse(bh); + + ocfs2_inode_unlock(inode, 0); + status = 0; +bail: + if (inode) + iput(inode); + + if (status) + mlog_errno(status); + + return status; +} + +static void ocfs2_inode_init_once(void *data) +{ + struct ocfs2_inode_info *oi = data; + + oi->ip_flags = 0; + oi->ip_open_count = 0; + 
spin_lock_init(&oi->ip_lock); + ocfs2_extent_map_init(&oi->vfs_inode); + INIT_LIST_HEAD(&oi->ip_io_markers); + oi->ip_dir_start_lookup = 0; + + init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); + mutex_init(&oi->ip_io_mutex); + + oi->ip_blkno = 0ULL; + oi->ip_clusters = 0; + + ocfs2_resv_init_once(&oi->ip_la_data_resv); + + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); + ocfs2_lock_res_init_once(&oi->ip_inode_lockres); + ocfs2_lock_res_init_once(&oi->ip_open_lockres); + + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), + &ocfs2_inode_caching_ops); + + inode_init_once(&oi->vfs_inode); +} + +static int ocfs2_initialize_mem_caches(void) +{ + ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", + sizeof(struct ocfs2_inode_info), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + ocfs2_inode_init_once); + ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", + sizeof(struct ocfs2_dquot), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", + sizeof(struct ocfs2_quota_chunk), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + NULL); + if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || + !ocfs2_qf_chunk_cachep) { + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + return -ENOMEM; + } + + return 0; +} + +static void ocfs2_free_mem_caches(void) +{ + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + ocfs2_inode_cachep = NULL; + + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + ocfs2_dquot_cachep = NULL; + + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + ocfs2_qf_chunk_cachep = NULL; +} + +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size) +{ + if (!sb_set_blocksize(sb, sect_size)) { + mlog(ML_ERROR, "unable to set blocksize\n"); + return -EIO; + } + + *bh = sb_getblk(sb, block); + if (!*bh) { + mlog_errno(-EIO); + return -EIO; + } + lock_buffer(*bh); + if (!buffer_dirty(*bh)) + clear_buffer_uptodate(*bh); + unlock_buffer(*bh); + ll_rw_block(READ, 1, bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + + return 0; +} + +static int ocfs2_mount_volume(struct super_block *sb) +{ + int status = 0; + int unlock_super = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_is_hard_readonly(osb)) + goto leave; + + status = ocfs2_dlm_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + unlock_super = 1; + + /* This will load up the node map and add ourselves to it. 
*/ + status = ocfs2_find_slot(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* load all node-local system inodes */ + status = ocfs2_init_local_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_check_volume(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_truncate_log_init(osb); + if (status < 0) + mlog_errno(status); + +leave: + if (unlock_super) + ocfs2_super_unlock(osb, 1); + + return status; +} + +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) +{ + int tmp, hangup_needed = 0; + struct ocfs2_super *osb = NULL; + char nodestr[8]; + + trace_ocfs2_dismount_volume(sb); + + BUG_ON(!sb); + osb = OCFS2_SB(sb); + BUG_ON(!osb); + + debugfs_remove(osb->osb_ctxt); + + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + + /* Orphan scan should be stopped as early as possible */ + ocfs2_orphan_scan_stop(osb); + + ocfs2_disable_quotas(osb); + + ocfs2_shutdown_local_alloc(osb); + + ocfs2_truncate_log_shutdown(osb); + + /* This will disable recovery and flush any recovery work. */ + ocfs2_recovery_exit(osb); + + ocfs2_journal_shutdown(osb); + + ocfs2_sync_blockdev(sb); + + ocfs2_purge_refcount_trees(osb); + + /* No cluster connection means we've failed during mount, so skip + * all the steps which depended on that to complete. */ + if (osb->cconn) { + tmp = ocfs2_super_lock(osb, 1); + if (tmp < 0) { + mlog_errno(tmp); + return; + } + } + + if (osb->slot_num != OCFS2_INVALID_SLOT) + ocfs2_put_slot(osb); + + if (osb->cconn) + ocfs2_super_unlock(osb, 1); + + ocfs2_release_system_inodes(osb); + + /* + * If we're dismounting due to mount error, mount.ocfs2 will clean + * up heartbeat. If we're a local mount, there is no heartbeat. + * If we failed before we got a uuid_str yet, we can't stop + * heartbeat. Otherwise, do it. + */ + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) + hangup_needed = 1; + + if (osb->cconn) + ocfs2_dlm_shutdown(osb, hangup_needed); + + ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); + debugfs_remove(osb->osb_debug_root); + + if (hangup_needed) + ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); + + atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); + + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); + + printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", + osb->dev_str, nodestr); + + ocfs2_delete_osb(osb); + kfree(osb); + sb->s_dev = 0; + sb->s_fs_info = NULL; +} + +static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, + unsigned uuid_bytes) +{ + int i, ret; + char *ptr; + + BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); + + osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + if (osb->uuid_str == NULL) + return -ENOMEM; + + for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { + /* print with null */ + ret = snprintf(ptr, 3, "%02X", uuid[i]); + if (ret != 2) /* drop super cleans up */ + return -EINVAL; + /* then only advance past the last char */ + ptr += 2; + } + + return 0; +} + +/* Make sure entire volume is addressable by our journal. Requires + osb_clusters_at_boot to be valid and for the journal to have been + initialized by ocfs2_journal_init(). 
*/ +static int ocfs2_journal_addressable(struct ocfs2_super *osb) +{ + int status = 0; + u64 max_block = + ocfs2_clusters_to_blocks(osb->sb, + osb->osb_clusters_at_boot) - 1; + + /* 32-bit block number is always OK. */ + if (max_block <= (u32)~0ULL) + goto out; + + /* Volume is "huge", so see if our journal is new enough to + support it. */ + if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_JBD2_SB) && + jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT))) { + mlog(ML_ERROR, "The journal cannot address the entire volume. " + "Enable the 'block64' journal option with tunefs.ocfs2"); + status = -EFBIG; + goto out; + } + + out: + return status; +} + +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size, + struct ocfs2_blockcheck_stats *stats) +{ + int status; + int i, cbits, bbits; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + struct inode *inode = NULL; + struct ocfs2_journal *journal; + __le32 uuid_net_key; + struct ocfs2_super *osb; + u64 total_blocks; + + osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); + if (!osb) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + sb->s_fs_info = osb; + sb->s_op = &ocfs2_sops; + sb->s_d_op = &ocfs2_dentry_ops; + sb->s_export_op = &ocfs2_export_ops; + sb->s_qcop = &ocfs2_quotactl_ops; + sb->dq_op = &ocfs2_quota_operations; + sb->s_xattr = ocfs2_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_NOATIME; + /* this is needed to support O_LARGEFILE */ + cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); + bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + + osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; + + for (i = 0; i < 3; i++) + osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); + osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); + + osb->sb = sb; + /* Save off for ocfs2_rw_direct */ + osb->s_sectsize_bits = blksize_bits(sector_size); + BUG_ON(!osb->s_sectsize_bits); + + spin_lock_init(&osb->dc_task_lock); + init_waitqueue_head(&osb->dc_event); + osb->dc_work_sequence = 0; + osb->dc_wake_sequence = 0; + INIT_LIST_HEAD(&osb->blocked_lock_list); + osb->blocked_lock_count = 0; + spin_lock_init(&osb->osb_lock); + spin_lock_init(&osb->osb_xattr_lock); + ocfs2_init_steal_slots(osb); + + atomic_set(&osb->alloc_stats.moves, 0); + atomic_set(&osb->alloc_stats.local_data, 0); + atomic_set(&osb->alloc_stats.bitmap_data, 0); + atomic_set(&osb->alloc_stats.bg_allocs, 0); + atomic_set(&osb->alloc_stats.bg_extends, 0); + + /* Copy the blockcheck stats from the superblock probe */ + osb->osb_ecc_stats = *stats; + + ocfs2_init_node_maps(osb); + + snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + + ocfs2_orphan_scan_init(osb); + + status = ocfs2_recovery_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize recovery state\n"); + mlog_errno(status); + goto bail; + } + + init_waitqueue_head(&osb->checkpoint_event); + atomic_set(&osb->needs_checkpoint, 0); + + osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + + osb->slot_num = OCFS2_INVALID_SLOT; + + osb->s_xattr_inline_size = le16_to_cpu( + 
di->id2.i_super.s_xattr_inline_size); + + osb->local_alloc_state = OCFS2_LA_UNUSED; + osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); + + init_waitqueue_head(&osb->osb_mount_event); + + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); + if (!osb->vol_label) { + mlog(ML_ERROR, "unable to alloc vol label\n"); + status = -ENOMEM; + goto bail; + } + + osb->slot_recovery_generations = + kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), + GFP_KERNEL); + if (!osb->slot_recovery_generations) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + init_waitqueue_head(&osb->osb_wipe_event); + osb->osb_orphan_wipes = kcalloc(osb->max_slots, + sizeof(*osb->osb_orphan_wipes), + GFP_KERNEL); + if (!osb->osb_orphan_wipes) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + osb->osb_rf_lock_tree = RB_ROOT; + + osb->s_feature_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); + osb->s_feature_ro_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); + osb->s_feature_incompat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); + + if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount because of unsupported " + "optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + if (!(osb->sb->s_flags & MS_RDONLY) && + (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount RDWR because of " + "unsupported optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + + if (ocfs2_clusterinfo_valid(osb)) { + osb->osb_stackflags = + OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; + memcpy(osb->osb_cluster_stack, + OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, + OCFS2_STACK_LABEL_LEN); + osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, + "couldn't mount because of an invalid " + "cluster stack label (%s) \n", + osb->osb_cluster_stack); + status = -EINVAL; + goto bail; + } + } else { + /* The empty string is identical with classic tools that + * don't know about s_cluster_info. */ + osb->osb_cluster_stack[0] = '\0'; + } + + get_random_bytes(&osb->s_next_generation, sizeof(u32)); + + /* FIXME + * This should be done in ocfs2_journal_init(), but unknown + * ordering issues will cause the filesystem to crash. + * If anyone wants to figure out what part of the code + * refers to osb->journal before ocfs2_journal_init() is run, + * be my guest. 
+ */ + /* initialize our journal structure */ + + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = (unsigned long) 1; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; + + INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); + osb->dentry_lock_list = NULL; + + /* get some pseudo constants for clustersize bits */ + osb->s_clustersize_bits = + le32_to_cpu(di->id2.i_super.s_clustersize_bits); + osb->s_clustersize = 1 << osb->s_clustersize_bits; + + if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || + osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { + mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", + osb->s_clustersize); + status = -EINVAL; + goto bail; + } + + total_blocks = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(di->i_clusters)); + + status = generic_check_addressable(osb->sb->s_blocksize_bits, + total_blocks); + if (status) { + mlog(ML_ERROR, "Volume too large " + "to mount safely on this system"); + status = -EFBIG; + goto bail; + } + + if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid))) { + mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); + status = -ENOMEM; + goto bail; + } + + memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); + + strncpy(osb->vol_label, di->id2.i_super.s_label, 63); + osb->vol_label[63] = '\0'; + osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); + osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); + osb->first_cluster_group_blkno = + le64_to_cpu(di->id2.i_super.s_first_cluster_group); + osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); + trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, + (unsigned long long)osb->root_blkno, + (unsigned long long)osb->system_dir_blkno, + osb->s_clustersize_bits); + + osb->osb_dlm_debug = ocfs2_new_dlm_debug(); + if (!osb->osb_dlm_debug) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + atomic_set(&osb->vol_state, VOLUME_INIT); + + /* load root, system_dir, and all global system inodes */ + status = ocfs2_init_global_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* + * global bitmap + */ + inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; + iput(inode); + + osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, + osb->s_feature_incompat) * 8; + + status = ocfs2_init_slot_info(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + cleancache_init_shared_fs((char *)&uuid_net_key, sb); + +bail: + return status; +} + +/* + * will return: -EAGAIN if it is ok to keep searching for superblocks + * -EINVAL if there is a bad superblock + * 0 on success + */ +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 blksz, + struct ocfs2_blockcheck_stats *stats) +{ + int status = -EAGAIN; + + if 
(memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, + strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + /* We have to do a raw check of the feature here */ + if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & + OCFS2_FEATURE_INCOMPAT_META_ECC) { + status = ocfs2_block_check_validate(bh->b_data, + bh->b_size, + &di->i_check, + stats); + if (status) + goto out; + } + status = -EINVAL; + if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), + blksz); + } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != + OCFS2_MAJOR_REV_LEVEL || + le16_to_cpu(di->id2.i_super.s_minor_rev_level) != + OCFS2_MINOR_REV_LEVEL) { + mlog(ML_ERROR, "found superblock with bad version: " + "found %u.%u, should be %u.%u\n", + le16_to_cpu(di->id2.i_super.s_major_rev_level), + le16_to_cpu(di->id2.i_super.s_minor_rev_level), + OCFS2_MAJOR_REV_LEVEL, + OCFS2_MINOR_REV_LEVEL); + } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { + mlog(ML_ERROR, "bad block number on superblock: " + "found %llu, should be %llu\n", + (unsigned long long)le64_to_cpu(di->i_blkno), + (unsigned long long)bh->b_blocknr); + } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || + le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { + mlog(ML_ERROR, "bad cluster size found: %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { + mlog(ML_ERROR, "bad root_blkno: 0\n"); + } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { + mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); + } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { + mlog(ML_ERROR, + "Superblock slots found greater than file system " + "maximum: found %u, max %u\n", + le16_to_cpu(di->id2.i_super.s_max_slots), + OCFS2_MAX_SLOTS); + } else { + /* found it! */ + status = 0; + } + } + +out: + if (status && status != -EAGAIN) + mlog_errno(status); + return status; +} + +static int ocfs2_check_volume(struct ocfs2_super *osb) +{ + int status; + int dirty; + int local; + struct ocfs2_dinode *local_alloc = NULL; /* only used if we + * recover + * ourselves. */ + + /* Init our journal object. */ + status = ocfs2_journal_init(osb->journal, &dirty); + if (status < 0) { + mlog(ML_ERROR, "Could not initialize journal!\n"); + goto finally; + } + + /* Now that journal has been initialized, check to make sure + entire volume is addressable. */ + status = ocfs2_journal_addressable(osb); + if (status) + goto finally; + + /* If the journal was unmounted cleanly then we don't want to + * recover anything. Otherwise, journal_load will do that + * dirty work for us :) */ + if (!dirty) { + status = ocfs2_journal_wipe(osb->journal, 0); + if (status < 0) { + mlog_errno(status); + goto finally; + } + } else { + mlog(ML_NOTICE, "File system was not unmounted cleanly, " + "recovering volume.\n"); + } + + local = ocfs2_mount_local(osb); + + /* will play back anything left in the journal. */ + status = ocfs2_journal_load(osb->journal, local, dirty); + if (status < 0) { + mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); + goto finally; + } + + if (dirty) { + /* recover my local alloc if we didn't unmount cleanly. 
*/ + status = ocfs2_begin_local_alloc_recovery(osb, + osb->slot_num, + &local_alloc); + if (status < 0) { + mlog_errno(status); + goto finally; + } + /* we complete the recovery process after we've marked + * ourselves as mounted. */ + } + + status = ocfs2_load_local_alloc(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + if (dirty) { + /* Recovery will be completed after we've mounted the + * rest of the volume. */ + osb->dirty = 1; + osb->local_alloc_copy = local_alloc; + local_alloc = NULL; + } + + /* go through each journal, trylock it and if you get the + * lock, and it's marked as dirty, set the bit in the recover + * map and launch a recovery thread for it. */ + status = ocfs2_mark_dead_nodes(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + +finally: + if (local_alloc) + kfree(local_alloc); + + if (status) + mlog_errno(status); + return status; +} + +/* + * The routine gets called from dismount or close whenever a dismount on + * volume is requested and the osb open count becomes 1. + * It will remove the osb from the global list and also free up all the + * initialized resources and fileobject. + */ +static void ocfs2_delete_osb(struct ocfs2_super *osb) +{ + /* This function assumes that the caller has the main osb resource */ + + ocfs2_free_slot_info(osb); + + kfree(osb->osb_orphan_wipes); + kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the start of ocfs2_initialize_osb(), + * we free it here. + */ + kfree(osb->journal); + if (osb->local_alloc_copy) + kfree(osb->local_alloc_copy); + kfree(osb->uuid_str); + ocfs2_put_dlm_debug(osb->osb_dlm_debug); + memset(osb, 0, sizeof(struct ocfs2_super)); +} + +/* Put OCFS2 into a readonly state, or (if the user specifies it), + * panic(). We do not support continue-on-error operation. */ +static void ocfs2_handle_error(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + + ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return; + + printk(KERN_CRIT "File system is now read-only due to the potential " + "of on-disk corruption. Please run fsck.ocfs2 once the file " + "system is unmounted.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); +} + +static char error_buf[1024]; + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + /* Not using mlog here because we want to show the actual + * function the error came from. */ + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + ocfs2_handle_error(sb); +} + +/* Handle critical errors. This is intentionally more drastic than + * ocfs2_handle_error, so we only use for things like journal errors, + * etc. */ +void __ocfs2_abort(struct super_block* sb, + const char *function, + const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + /* We don't have the cluster support yet to go straight to + * hard readonly in here. Until then, we want to keep + * ocfs2_abort() so that we can at least mark critical + * errors. + * + * TODO: This should abort the journal and alert other nodes + * that our slot needs recovery. */ + + /* Force a panic(). This stinks, but it's better than letting + * things continue without having a proper hard readonly + * here. */ + if (!ocfs2_mount_local(OCFS2_SB(sb))) + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + ocfs2_handle_error(sb); +} + +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset) +{ + int rc; + sigset_t blocked; + + sigfillset(&blocked); + rc = sigprocmask(SIG_BLOCK, &blocked, oldset); + BUG_ON(rc); +} + +void ocfs2_unblock_signals(sigset_t *oldset) +{ + int rc = sigprocmask(SIG_SETMASK, oldset, NULL); + BUG_ON(rc); +} + +module_init(ocfs2_init); +module_exit(ocfs2_exit); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h new file mode 100644 index 00000000..40c7de08 --- /dev/null +++ b/fs/ocfs2/super.h @@ -0,0 +1,55 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SUPER_H +#define OCFS2_SUPER_H + +extern struct workqueue_struct *ocfs2_wq; + +int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, + int node_num); + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) + +void __ocfs2_abort(struct super_block *sb, + const char *function, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) + +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. 
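+ *
+ * A minimal usage sketch (illustrative only): callers pair the two
+ * helpers around a region that must not be interrupted by signal
+ * delivery, stashing the old mask on the stack:
+ *
+ *	sigset_t oldset;
+ *
+ *	ocfs2_block_signals(&oldset);
+ *	... code that must not be interrupted ...
+ *	ocfs2_unblock_signals(&oldset);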
+ */ +void ocfs2_block_signals(sigset_t *oldset); +void ocfs2_unblock_signals(sigset_t *oldset); + +#endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 00000000..5d22872e --- /dev/null +++ b/fs/ocfs2/symlink.c @@ -0,0 +1,173 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * linux/cluster/ssi/cfs/symlink.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE + * or NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * Portions Copyright (C) 2001 Compaq Computer Corporation + * + * ocfs2 symlink handling code. + * + * Copyright (C) 2004, 2005 Oracle. + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "symlink.h" +#include "xattr.h" + +#include "buffer_head_io.h" + + +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh) +{ + int status; + char *link = NULL; + struct ocfs2_dinode *fe; + + status = ocfs2_read_inode_block(inode, bh); + if (status < 0) { + mlog_errno(status); + link = ERR_PTR(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) (*bh)->b_data; + link = (char *) fe->id2.i_symlink; +bail: + + return link; +} + +static int ocfs2_readlink(struct dentry *dentry, + char __user *buffer, + int buflen) +{ + int ret; + char *link; + struct buffer_head *bh = NULL; + struct inode *inode = dentry->d_inode; + + link = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(link)) { + ret = PTR_ERR(link); + goto out; + } + + /* + * Without vfsmount we can't update atime now, + * but we will update atime here ultimately. + */ + ret = vfs_readlink(dentry, buffer, buflen, link); + + brelse(bh); +out: + if (ret < 0) + mlog_errno(ret); + return ret; +} + +static void *ocfs2_fast_follow_link(struct dentry *dentry, + struct nameidata *nd) +{ + int status = 0; + int len; + char *target, *link = ERR_PTR(-ENOMEM); + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + + BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); + target = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(target)) { + status = PTR_ERR(target); + mlog_errno(status); + goto bail; + } + + /* Fast symlinks can't be large */ + len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); + link = kzalloc(len + 1, GFP_NOFS); + if (!link) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + memcpy(link, target, len); + +bail: + nd_set_link(nd, status ? 
ERR_PTR(status) : link); + brelse(bh); + + if (status) + mlog_errno(status); + return NULL; +} + +static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); +} + +const struct inode_operations ocfs2_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; +const struct inode_operations ocfs2_fast_symlink_inode_operations = { + .readlink = ocfs2_readlink, + .follow_link = ocfs2_fast_follow_link, + .put_link = ocfs2_fast_put_link, + .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 00000000..65a6c9c6 --- /dev/null +++ b/fs/ocfs2/symlink.h @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYMLINK_H +#define OCFS2_SYMLINK_H + +extern const struct inode_operations ocfs2_symlink_inode_operations; +extern const struct inode_operations ocfs2_fast_symlink_inode_operations; + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) +{ + return (S_ISLNK(inode->i_mode) && + inode->i_blocks == 0); +} + + +#endif /* OCFS2_SYMLINK_H */ diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 00000000..3d635f4b --- /dev/null +++ b/fs/ocfs2/sysfile.c @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.c + * + * Initialize, read, write, etc. system files. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "inode.h" +#include "journal.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; +#endif + +static inline int is_global_system_inode(int type) +{ + return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && + type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; +} + +static struct inode **get_local_system_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + int index; + struct inode **local_system_inodes, **free = NULL; + + BUG_ON(slot == OCFS2_INVALID_SLOT); + BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || + type > OCFS2_LAST_LOCAL_SYSTEM_INODE); + + spin_lock(&osb->osb_lock); + local_system_inodes = osb->local_system_inodes; + spin_unlock(&osb->osb_lock); + + if (unlikely(!local_system_inodes)) { + local_system_inodes = kzalloc(sizeof(struct inode *) * + NUM_LOCAL_SYSTEM_INODES * + osb->max_slots, + GFP_NOFS); + if (!local_system_inodes) { + mlog_errno(-ENOMEM); + /* + * return NULL here so that ocfs2_get_sytem_file_inodes + * will try to create an inode and use it. We will try + * to initialize local_system_inodes next time. + */ + return NULL; + } + + spin_lock(&osb->osb_lock); + if (osb->local_system_inodes) { + /* Someone has initialized it for us. */ + free = local_system_inodes; + local_system_inodes = osb->local_system_inodes; + } else + osb->local_system_inodes = local_system_inodes; + spin_unlock(&osb->osb_lock); + if (unlikely(free)) + kfree(free); + } + + index = (slot * NUM_LOCAL_SYSTEM_INODES) + + (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); + + return &local_system_inodes[index]; +} + +struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + struct inode *inode = NULL; + struct inode **arr = NULL; + + /* avoid the lookup if cached in local system file array */ + if (is_global_system_inode(type)) { + arr = &(osb->global_system_inodes[type]); + } else + arr = get_local_system_inode(osb, type, slot); + + if (arr && ((inode = *arr) != NULL)) { + /* get a ref in addition to the array ref */ + inode = igrab(inode); + BUG_ON(!inode); + + return inode; + } + + /* this gets one ref thru iget */ + inode = _ocfs2_get_system_file_inode(osb, type, slot); + + /* add one more if putting into array for first time */ + if (arr && inode) { + *arr = igrab(inode); + BUG_ON(!*arr); + } + return inode; +} + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + char namebuf[40]; + struct inode *inode = NULL; + u64 blkno; + int status = 0; + + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); + + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (status < 0) { + goto bail; + } + + inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type); + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + inode = NULL; + goto bail; + } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (type == LOCAL_USER_QUOTA_SYSTEM_INODE || + type == LOCAL_GROUP_QUOTA_SYSTEM_INODE || + type == JOURNAL_SYSTEM_INODE) { + /* Ignore inode 
lock on these inodes as the lock does not + * really belong to any process and lockdep cannot handle + * that */ + OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL; + } else { + lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres. + l_lockdep_map, + ocfs2_system_inodes[type].si_name, + &ocfs2_sysfile_cluster_lock_key[type], 0); + } +#endif +bail: + + return inode; +} + diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 00000000..cc9ea661 --- /dev/null +++ b/fs/ocfs2/sysfile.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYSFILE_H +#define OCFS2_SYSFILE_H + +struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#endif /* OCFS2_SYSFILE_H */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 00000000..52eaf33d --- /dev/null +++ b/fs/ocfs2/uptodate.c @@ -0,0 +1,638 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.c + * + * Tracking the up-to-date-ness of a local buffer_head with respect to + * the cluster. + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Standard buffer head caching flags (uptodate, etc) are insufficient + * in a clustered environment - a buffer may be marked up to date on + * our local node but could have been modified by another cluster + * member. As a result an additional (and performant) caching scheme + * is required. A further requirement is that we consume as little + * memory as possible - we never pin buffer_head structures in order + * to cache them. + * + * We track the existence of up to date buffers on the inodes which + * are associated with them. 
Because we don't want to pin + * buffer_heads, this is only a (strong) hint and several other checks + * are made in the I/O path to ensure that we don't use a stale or + * invalid buffer without going to disk: + * - buffer_jbd is used liberally - if a bh is in the journal on + * this node then it *must* be up to date. + * - the standard buffer_uptodate() macro is used to detect buffers + * which may be invalid (even if we have an up to date tracking + * item for them) + * + * For a full understanding of how this code works together, one + * should read the callers in dlmglue.c, the I/O functions in + * buffer_head_io.c and ocfs2_journal_access in journal.c + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "inode.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +struct ocfs2_meta_cache_item { + struct rb_node c_node; + sector_t c_block; +}; + +static struct kmem_cache *ocfs2_uptodate_cachep = NULL; + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_owner(ci); +} + +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_get_super(ci); +} + +static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_lock(ci); +} + +static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_unlock(ci); +} + +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_lock(ci); +} + +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_unlock(ci); +} + + +static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci, + int clear) +{ + ci->ci_flags |= OCFS2_CACHE_FL_INLINE; + ci->ci_num_cached = 0; + + if (clear) { + ci->ci_created_trans = 0; + ci->ci_last_trans = 0; + } +} + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops) +{ + BUG_ON(!ops); + + ci->ci_ops = ops; + ocfs2_metadata_cache_reset(ci, 1); +} + +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci) +{ + ocfs2_metadata_cache_purge(ci); + ocfs2_metadata_cache_reset(ci, 1); +} + + +/* No lock taken here as 'root' is not expected to be visible to other + * processes. */ +static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) +{ + unsigned int purged = 0; + struct rb_node *node; + struct ocfs2_meta_cache_item *item; + + while ((node = rb_last(root)) != NULL) { + item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); + + trace_ocfs2_purge_copied_metadata_tree( + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, root); + kmem_cache_free(ocfs2_uptodate_cachep, item); + + purged++; + } + return purged; +} + +/* Called from locking and called from ocfs2_clear_inode. Dump the + * cache for a given inode. 
+ * + * This function is a few more lines longer than necessary due to some + * accounting done here, but I think it's worth tracking down those + * bugs sooner -- Mark */ +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci) +{ + unsigned int tree, to_purge, purged; + struct rb_root root = RB_ROOT; + + BUG_ON(!ci || !ci->ci_ops); + + ocfs2_metadata_cache_lock(ci); + tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); + to_purge = ci->ci_num_cached; + + trace_ocfs2_metadata_cache_purge( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, tree); + + /* If we're a tree, save off the root so that we can safely + * initialize the cache. We do the work to free tree members + * without the spinlock. */ + if (tree) + root = ci->ci_cache.ci_tree; + + ocfs2_metadata_cache_reset(ci, 0); + ocfs2_metadata_cache_unlock(ci); + + purged = ocfs2_purge_copied_metadata_tree(&root); + /* If possible, track the number wiped so that we can more + * easily detect counting errors. Unfortunately, this is only + * meaningful for trees. */ + if (tree && purged != to_purge) + mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, purged); +} + +/* Returns the index in the cache array, -1 if not found. + * Requires ip_lock. */ +static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, + sector_t item) +{ + int i; + + for (i = 0; i < ci->ci_num_cached; i++) { + if (item == ci->ci_cache.ci_array[i]) + return i; + } + + return -1; +} + +/* Returns the cache item if found, otherwise NULL. + * Requires ip_lock. */ +static struct ocfs2_meta_cache_item * +ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, + sector_t block) +{ + struct rb_node * n = ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *item = NULL; + + while (n) { + item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); + + if (block < item->c_block) + n = n->rb_left; + else if (block > item->c_block) + n = n->rb_right; + else + return item; + } + + return NULL; +} + +static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + int index = -1; + struct ocfs2_meta_cache_item *item = NULL; + + ocfs2_metadata_cache_lock(ci); + + trace_ocfs2_buffer_cached_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) bh->b_blocknr, + !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); + + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) + index = ocfs2_search_cache_array(ci, bh->b_blocknr); + else + item = ocfs2_search_cache_tree(ci, bh->b_blocknr); + + ocfs2_metadata_cache_unlock(ci); + + trace_ocfs2_buffer_cached_end(index, item); + + return (index != -1) || (item != NULL); +} + +/* Warning: even if it returns true, this does *not* guarantee that + * the block is stored in our inode metadata cache. + * + * This can be called under lock_buffer() + */ +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + /* Doesn't matter if the bh is in our cache or not -- if it's + * not marked uptodate then we know it can't have correct + * data. */ + if (!buffer_uptodate(bh)) + return 0; + + /* OCFS2 does not allow multiple nodes to be changing the same + * block at the same time. */ + if (buffer_jbd(bh)) + return 1; + + /* Ok, locally the buffer is marked as up to date, now search + * our cache to see if we can trust that. */ + return ocfs2_buffer_cached(ci, bh); +} + +/* + * Determine whether a buffer is currently out on a read-ahead request. 
+ * ci_io_sem should be held to serialize submitters with the logic here. + */ +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh); +} + +/* Requires ip_lock */ +static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, + sector_t block) +{ + BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); + + trace_ocfs2_append_cache_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); + + ci->ci_cache.ci_array[ci->ci_num_cached] = block; + ci->ci_num_cached++; +} + +/* By now the caller should have checked that the item does *not* + * exist in the tree. + * Requires ip_lock. */ +static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *new) +{ + sector_t block = new->c_block; + struct rb_node *parent = NULL; + struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *tmp; + + trace_ocfs2_insert_cache_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); + + if (block < tmp->c_block) + p = &(*p)->rb_left; + else if (block > tmp->c_block) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate block %llu cached!\n", + (unsigned long long) block); + BUG(); + } + } + + rb_link_node(&new->c_node, parent, p); + rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached++; +} + +/* co_cache_lock() must be held */ +static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci) +{ + return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) && + (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY); +} + +/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the + * pointers in tree after we use them - this allows caller to detect + * when to free in case of error. + * + * The co_cache_lock() must be held. */ +static void ocfs2_expand_cache(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item **tree) +{ + int i; + + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY, + "Owner %llu, num cached = %u, should be %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY); + mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE), + "Owner %llu not marked as inline anymore!\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); + + /* Be careful to initialize the tree members *first* because + * once the ci_tree is used, the array is junk... */ + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) + tree[i]->c_block = ci->ci_cache.ci_array[i]; + + ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE; + ci->ci_cache.ci_tree = RB_ROOT; + /* this will be set again by __ocfs2_insert_cache_tree */ + ci->ci_num_cached = 0; + + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { + __ocfs2_insert_cache_tree(ci, tree[i]); + tree[i] = NULL; + } + + trace_ocfs2_expand_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_flags, ci->ci_num_cached); +} + +/* Slow path function - memory allocation is necessary. See the + * comment above ocfs2_set_buffer_uptodate for more information. 
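+ *
+ * In short: the new tracking item (and, when the inline array must be
+ * promoted to a tree, one item per existing array slot) is allocated
+ * before re-taking the metadata cache lock; if other removals freed up
+ * array space in the meantime, we fall back to the fast array path and
+ * free the unused allocations.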
*/ +static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + sector_t block, + int expand_tree) +{ + int i; + struct ocfs2_meta_cache_item *new = NULL; + struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = + { NULL, }; + + trace_ocfs2_set_buffer_uptodate( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, expand_tree); + + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); + if (!new) { + mlog_errno(-ENOMEM); + return; + } + new->c_block = block; + + if (expand_tree) { + /* Do *not* allocate an array here - the removal code + * has no way of tracking that. */ + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { + tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, + GFP_NOFS); + if (!tree[i]) { + mlog_errno(-ENOMEM); + goto out_free; + } + + /* These are initialized in ocfs2_expand_cache! */ + } + } + + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { + /* Ok, items were removed from the cache in between + * locks. Detect this and revert back to the fast path */ + ocfs2_append_cache_array(ci, block); + ocfs2_metadata_cache_unlock(ci); + goto out_free; + } + + if (expand_tree) + ocfs2_expand_cache(ci, tree); + + __ocfs2_insert_cache_tree(ci, new); + ocfs2_metadata_cache_unlock(ci); + + new = NULL; +out_free: + if (new) + kmem_cache_free(ocfs2_uptodate_cachep, new); + + /* If these were used, then ocfs2_expand_cache re-set them to + * NULL for us. */ + if (tree[0]) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) + if (tree[i]) + kmem_cache_free(ocfs2_uptodate_cachep, + tree[i]); + } +} + +/* Item insertion is guarded by co_io_lock(), so the insertion path takes + * advantage of this by not rechecking for a duplicate insert during + * the slow case. Additionally, if the cache needs to be bumped up to + * a tree, the code will not recheck after acquiring the lock -- + * multiple paths cannot be expanding to a tree at the same time. + * + * The slow path takes into account that items can be removed + * (including the whole tree wiped and reset) when this process it out + * allocating memory. In those cases, it reverts back to the fast + * path. + * + * Note that this function may actually fail to insert the block if + * memory cannot be allocated. This is not fatal however (but may + * result in a performance penalty) + * + * Readahead buffers can be passed in here before the I/O request is + * completed. + */ +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + int expand; + + /* The block may very well exist in our cache already, so avoid + * doing any more work in that case. */ + if (ocfs2_buffer_cached(ci, bh)) + return; + + trace_ocfs2_set_buffer_uptodate_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr); + + /* No need to recheck under spinlock - insertion is guarded by + * co_io_lock() */ + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { + /* Fast case - it's an array and there's a free + * spot. */ + ocfs2_append_cache_array(ci, bh->b_blocknr); + ocfs2_metadata_cache_unlock(ci); + return; + } + + expand = 0; + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { + /* We need to bump things up to a tree. */ + expand = 1; + } + ocfs2_metadata_cache_unlock(ci); + + __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand); +} + +/* Called against a newly allocated buffer. 
Most likely nobody should + * be able to read this sort of metadata while it's still being + * allocated, but this is careful to take co_io_lock() anyway. */ +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + /* This should definitely *not* exist in our cache */ + BUG_ON(ocfs2_buffer_cached(ci, bh)); + + set_buffer_uptodate(bh); + + ocfs2_metadata_cache_io_lock(ci); + ocfs2_set_buffer_uptodate(ci, bh); + ocfs2_metadata_cache_io_unlock(ci); +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, + int index) +{ + sector_t *array = ci->ci_cache.ci_array; + int bytes; + + BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY); + BUG_ON(index >= ci->ci_num_cached); + BUG_ON(!ci->ci_num_cached); + + trace_ocfs2_remove_metadata_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, ci->ci_num_cached); + + ci->ci_num_cached--; + + /* don't need to copy if the array is now empty, or if we + * removed at the tail */ + if (ci->ci_num_cached && index < ci->ci_num_cached) { + bytes = sizeof(sector_t) * (ci->ci_num_cached - index); + memmove(&array[index], &array[index + 1], bytes); + } +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *item) +{ + trace_ocfs2_remove_metadata_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)item->c_block); + + rb_erase(&item->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached--; +} + +static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci, + sector_t block) +{ + int index; + struct ocfs2_meta_cache_item *item = NULL; + + ocfs2_metadata_cache_lock(ci); + trace_ocfs2_remove_block_from_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) block, ci->ci_num_cached, + ci->ci_flags); + + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { + index = ocfs2_search_cache_array(ci, block); + if (index != -1) + ocfs2_remove_metadata_array(ci, index); + } else { + item = ocfs2_search_cache_tree(ci, block); + if (item) + ocfs2_remove_metadata_tree(ci, item); + } + ocfs2_metadata_cache_unlock(ci); + + if (item) + kmem_cache_free(ocfs2_uptodate_cachep, item); +} + +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(ci, block); +} + +/* Called when we remove xattr clusters from an inode. 
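+ *
+ * The range is given in clusters and converted to blocks below; for
+ * example, with 4 KB clusters and 512-byte blocks,
+ * ocfs2_clusters_to_blocks(sb, 1) is 8, so removing c_len clusters
+ * drops 8 * c_len consecutive block numbers, starting at 'block',
+ * from the cache.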
*/ +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, + sector_t block, + u32 c_len) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(ci, block); +} + +int __init init_ocfs2_uptodate_cache(void) +{ + ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", + sizeof(struct ocfs2_meta_cache_item), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ocfs2_uptodate_cachep) + return -ENOMEM; + + return 0; +} + +void exit_ocfs2_uptodate_cache(void) +{ + if (ocfs2_uptodate_cachep) + kmem_cache_destroy(ocfs2_uptodate_cachep); +} diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 00000000..0d826fe2 --- /dev/null +++ b/fs/ocfs2/uptodate.h @@ -0,0 +1,84 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.h + * + * Cluster uptodate tracking + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_UPTODATE_H +#define OCFS2_UPTODATE_H + +/* + * The caching code relies on locking provided by the user of + * struct ocfs2_caching_info. These operations connect that up. + */ +struct ocfs2_caching_operations { + /* + * A u64 representing the owning structure. Usually this + * is the block number (i_blkno or whatnot). This is used so + * that caching log messages can identify the owning structure. + */ + u64 (*co_owner)(struct ocfs2_caching_info *ci); + + /* The superblock is needed during I/O. */ + struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci); + /* + * Lock and unlock the caching data. These will not sleep, and + * should probably be spinlocks. + */ + void (*co_cache_lock)(struct ocfs2_caching_info *ci); + void (*co_cache_unlock)(struct ocfs2_caching_info *ci); + + /* + * Lock and unlock for disk I/O. These will sleep, and should + * be mutexes. 
+ */ + void (*co_io_lock)(struct ocfs2_caching_info *ci); + void (*co_io_unlock)(struct ocfs2_caching_info *ci); +}; + +int __init init_ocfs2_uptodate_cache(void); +void exit_ocfs2_uptodate_cache(void); + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops); +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci); + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci); + +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, + sector_t block, + u32 c_len); +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, + struct buffer_head *bh); + +#endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 00000000..e2488f41 --- /dev/null +++ b/fs/ocfs2/ver.c @@ -0,0 +1,43 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include "ver.h" + +#define OCFS2_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION + +void ocfs2_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 00000000..d7395cb9 --- /dev/null +++ b/fs/ocfs2/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_VER_H +#define OCFS2_VER_H + +void ocfs2_print_version(void); + +#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c new file mode 100644 index 00000000..81ecf9c0 --- /dev/null +++ b/fs/ocfs2/xattr.c @@ -0,0 +1,7388 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/xattr.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" +#include "refcounttree.h" +#include "acl.h" +#include "ocfs2_trace.h" + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + /* The inode these xattrs are associated with */ + struct inode *bu_inode; + + /* The actual buffers that make up the bucket */ + struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + + /* How many blocks make up one bucket for this filesystem */ + int bu_blocks; +}; + +struct ocfs2_xattr_set_ctxt { + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + int set_abort; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_HEADER_GAP 4 +#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ + - sizeof(struct ocfs2_xattr_header) \ + - OCFS2_XATTR_HEADER_GAP) +#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ + - sizeof(struct ocfs2_xattr_block) \ + - sizeof(struct ocfs2_xattr_header) \ + - OCFS2_XATTR_HEADER_GAP) + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +const struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &ocfs2_xattr_acl_access_handler, + &ocfs2_xattr_acl_default_handler, + &ocfs2_xattr_trusted_handler, + &ocfs2_xattr_security_handler, + NULL +}; + +static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] + = &ocfs2_xattr_acl_access_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] + = &ocfs2_xattr_acl_default_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = 
&ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, +}; + +struct ocfs2_xattr_info { + int xi_name_index; + const char *xi_name; + int xi_name_len; + const void *xi_value; + size_t xi_value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket *bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +/* Operations on struct ocfs2_xa_entry */ +struct ocfs2_xa_loc; +struct ocfs2_xa_loc_operations { + /* + * Journal functions + */ + int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc, + int type); + void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc); + + /* + * Return a pointer to the appropriate buffer in loc->xl_storage + * at the given offset from loc->xl_header. + */ + void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset); + + /* Can we reuse the existing entry for the new value? */ + int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* How much space is needed for the new value? */ + int (*xlo_check_space)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* + * Return the offset of the first name+value pair. This is + * the start of our downward-filling free space. + */ + int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc); + + /* + * Remove the name+value at this location. Do whatever is + * appropriate with the remaining name+value pairs. + */ + void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc); + + /* Fill xl_entry with a new entry */ + void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash); + + /* Add name+value storage to an entry */ + void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size); + + /* + * Initialize the value buf's access and bh fields for this entry. + * ocfs2_xa_fill_value_buf() will handle the xv pointer. + */ + void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb); +}; + +/* + * Describes an xattr entry location. This is a memory structure + * tracking the on-disk structure. + */ +struct ocfs2_xa_loc { + /* This xattr belongs to this inode */ + struct inode *xl_inode; + + /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */ + struct ocfs2_xattr_header *xl_header; + + /* Bytes from xl_header to the end of the storage */ + int xl_size; + + /* + * The ocfs2_xattr_entry this location describes. If this is + * NULL, this location describes the on-disk structure where it + * would have been. 
+ */ + struct ocfs2_xattr_entry *xl_entry; + + /* + * Internal housekeeping + */ + + /* Buffer(s) containing this entry */ + void *xl_storage; + + /* Operations on the storage backing this location */ + const struct ocfs2_xa_loc_operations *xl_ops; +}; + +/* + * Convenience functions to calculate how much space is needed for a + * given name+value pair + */ +static int namevalue_size(int name_len, uint64_t value_len) +{ + if (value_len > OCFS2_XATTR_INLINE_SIZE) + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + else + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); +} + +static int namevalue_size_xi(struct ocfs2_xattr_info *xi) +{ + return namevalue_size(xi->xi_name_len, xi->xi_value_len); +} + +static int namevalue_size_xe(struct ocfs2_xattr_entry *xe) +{ + u64 value_len = le64_to_cpu(xe->xe_value_size); + + BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) && + ocfs2_xattr_is_local(xe)); + return namevalue_size(xe->xe_name_len, value_len); +} + + +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); + +typedef int (xattr_tree_rec_func)(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para); +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *root_bh, + xattr_tree_rec_func *rec_func, + void *para); +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para); + +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash); +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_need, + int *credits); +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh); + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) +#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) +#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) + +static struct ocfs2_xattr_bucket 
*ocfs2_xattr_bucket_new(struct inode *inode) +{ + struct ocfs2_xattr_bucket *bucket; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET); + + bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS); + if (bucket) { + bucket->bu_inode = inode; + bucket->bu_blocks = blks; + } + + return bucket; +} + +static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket) +{ + int i; + + for (i = 0; i < bucket->bu_blocks; i++) { + brelse(bucket->bu_bhs[i]); + bucket->bu_bhs[i] = NULL; + } +} + +static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) +{ + if (bucket) { + ocfs2_xattr_bucket_relse(bucket); + bucket->bu_inode = NULL; + kfree(bucket); + } +} + +/* + * A bucket that has never been written to disk doesn't need to be + * read. We just need the buffer_heads. Don't call this for + * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes + * them fully. + */ +static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, + xb_blkno + i); + if (!bucket->bu_bhs[i]) { + rc = -EIO; + mlog_errno(rc); + break; + } + + if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i])) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +/* Read the xattr bucket at xb_blkno */ +static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int rc; + + rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno, + bucket->bu_blocks, bucket->bu_bhs, 0, + NULL); + if (!rc) { + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, + bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + if (rc) + mlog_errno(rc); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +static int ocfs2_xattr_bucket_journal_access(handle_t *handle, + struct ocfs2_xattr_bucket *bucket, + int type) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + rc = ocfs2_journal_access(handle, + INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i], type); + if (rc) { + mlog_errno(rc); + break; + } + } + + return rc; +} + +static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int i; + + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + + for (i = 0; i < bucket->bu_blocks; i++) + ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); +} + +static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest, + struct ocfs2_xattr_bucket *src) +{ + int i; + int blocksize = src->bu_inode->i_sb->s_blocksize; + + BUG_ON(dest->bu_blocks != src->bu_blocks); + BUG_ON(dest->bu_inode != src->bu_inode); + + for (i = 0; i < src->bu_blocks; i++) { + memcpy(bucket_block(dest, i), bucket_block(src, i), + blocksize); + } +} + +static int ocfs2_validate_xattr_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + trace_ocfs2_validate_xattr_block((unsigned 
long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check); + if (rc) + return rc; + + /* + * Errors after here are fatal + */ + + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ocfs2_error(sb, + "Extended attribute block #%llu has bad " + "signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); + return -EINVAL; + } + + if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extended attribute block #%llu has an " + "invalid xb_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extended attribute block #%llu has an invalid " + "xb_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp, + ocfs2_validate_xattr_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) +{ + return namevalue_size(name_len, value_len) + + sizeof(struct ocfs2_xattr_entry); +} + +static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xi(xi) + + sizeof(struct ocfs2_xattr_entry); +} + +static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe) +{ + return namevalue_size_xe(xe) + + sizeof(struct ocfs2_xattr_entry); +} + +int ocfs2_calc_security_init(struct inode *dir, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * So reserve one metadata block for it is ok. 
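+ *
+ * (Breaking that figure down: 256 bytes is the allowance assumed for
+ * the padded name, 80 is OCFS2_XATTR_INLINE_SIZE for an inline value,
+ * and 16 is the size of one struct ocfs2_xattr_entry;
+ * 256 + 80 + 16 = 352.)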
+ */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + s_size > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + /* reserve clusters for xattr value which will be set in B tree*/ + if (si->value_len > OCFS2_XATTR_INLINE_SIZE) { + int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + return ret; +} + +int ocfs2_calc_xattr_init(struct inode *dir, + struct buffer_head *dir_bh, + int mode, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + int *want_meta) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = 0, a_size = 0, acl_len = 0, new_clusters; + + if (si->enable) + s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + "", NULL, 0); + if (acl_len > 0) { + a_size = ocfs2_xattr_entry_real_size(0, acl_len); + if (S_ISDIR(mode)) + a_size <<= 1; + } else if (acl_len != 0 && acl_len != -ENODATA) { + mlog_errno(ret); + return ret; + } + } + + if (!(s_size + a_size)) + return ret; + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * The max space of acl xattr taken inline is + * 80(value) + 16(entry) * 2(if directory) = 192 bytes, + * when blocksize = 512, may reserve one more cluser for + * xattr bucket, otherwise reserve one metadata block + * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || + (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { + *want_meta = *want_meta + 1; + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE && + (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) { + *want_clusters += 1; + *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb); + } + + /* + * reserve credits and clusters for xattrs which has large value + * and have to be set outside + */ + if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) { + new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL && + acl_len > OCFS2_XATTR_INLINE_SIZE) { + /* for directory, it has DEFAULT and ACCESS two types of acls */ + new_clusters = (S_ISDIR(mode) ? 
2 : 1) * + ocfs2_clusters_for_bytes(dir->i_sb, acl_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + + return ret; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int status = 0, credits; + handle_t *handle = ctxt->handle; + enum ocfs2_alloc_restarted why; + u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + while (clusters_to_add) { + trace_ocfs2_xattr_extend_allocation(clusters_to_add); + + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } + + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } + + ocfs2_journal_dirty(handle, vb->vb_bh); + + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. + */ + BUG_ON(why == RESTART_META); + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &vb->vb_xv->xr_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + status = -ENOMEM; + mlog_errno(status); + break; + } + } + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + u32 cpos, u32 phys_cpos, u32 len, + unsigned int ext_flags, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + handle_t *handle = ctxt->handle; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac, + &ctxt->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + le32_add_cpu(&vb->vb_xv->xr_clusters, -len); + ocfs2_journal_dirty(handle, vb->vb_bh); + + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(inode->i_sb, + phys_blkno), + len, ctxt->meta_ac, &ctxt->dealloc, 1); + else + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, + phys_blkno, len); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0; + unsigned int ext_flags; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, + &vb->vb_xv->xr_list, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, vb, cpos, + phys_cpos, alloc_size, + ext_flags, ctxt); + if (ret) { + 
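[Editorial sketch, not part of the patch] ocfs2_xattr_shrink_size() removes old_clusters - new_clusters clusters by repeatedly asking ocfs2_xattr_get_clusters() for the extent covering cpos, clamping the chunk to what is left to remove, and advancing. The sketch below keeps only that partitioning logic; the extent map is hypothetical demo data standing in for the value tree lookup.

#include <stdio.h>

/* Hypothetical stand-in for one allocated extent of an xattr value tree. */
struct demo_extent {
        unsigned int cpos;      /* first logical cluster covered */
        unsigned int clusters;  /* length of the extent */
};

/* Mirror of the loop in ocfs2_xattr_shrink_size(): walk logical clusters
 * from new_clusters upward, carving the truncation range into per-extent
 * chunks and clamping each chunk to what remains. */
static void demo_shrink(const struct demo_extent *map, int nr,
                        unsigned int old_clusters, unsigned int new_clusters)
{
        unsigned int cpos = new_clusters;
        unsigned int trunc_len = old_clusters - new_clusters;
        int i = 0;

        while (trunc_len && i < nr) {
                unsigned int alloc_size;

                /* Skip extents that end before the truncation point. */
                if (map[i].cpos + map[i].clusters <= cpos) {
                        i++;
                        continue;
                }

                alloc_size = map[i].clusters - (cpos - map[i].cpos);
                if (alloc_size > trunc_len)
                        alloc_size = trunc_len;

                printf("remove %u clusters at logical %u\n", alloc_size, cpos);
                cpos += alloc_size;
                trunc_len -= alloc_size;
                i++;
        }
}

int main(void)
{
        const struct demo_extent map[] = { { 0, 4 }, { 4, 8 }, { 12, 4 } };

        demo_shrink(map, 3, 16, 2);     /* shrink a 16-cluster value to 2 */
        return 0;
}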
mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), + block, alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + vb, ctxt); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + vb, ctxt); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_xattr_header *xh; + int i; + + xh = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) + if (!ocfs2_xattr_is_local(&xh->xh_entries[i])) + return 1; + + return 0; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } 
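[Editorial sketch, not part of the patch] ocfs2_xattr_list_entry() above follows the usual listxattr contract: with a zero-sized buffer it only accumulates how many bytes would be needed, and with a real buffer it appends "prefix" + "name" + NUL, failing with -ERANGE if the result would overflow. A compact userspace rendition of that contract:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Same contract as ocfs2_xattr_list_entry(): *result always grows by the
 * space the entry needs; bytes are copied only when size is non-zero, and
 * -ERANGE is returned when the caller's buffer is too small. */
static int demo_list_entry(char *buffer, size_t size, size_t *result,
                           const char *prefix, const char *name)
{
        size_t prefix_len = strlen(prefix);
        size_t name_len = strlen(name);
        char *p = buffer + *result;

        *result += prefix_len + name_len + 1;

        if (!size)              /* size-probe pass: just count */
                return 0;
        if (*result > size)
                return -ERANGE;

        memcpy(p, prefix, prefix_len);
        memcpy(p + prefix_len, name, name_len);
        p[prefix_len + name_len] = '\0';
        return 0;
}

int main(void)
{
        char buf[64];
        size_t used = 0;

        demo_list_entry(buf, sizeof(buf), &used, "user.", "demo");
        printf("listed %zu bytes: %s\n", used, buf);
        return 0;
}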
else + ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh, + buffer, buffer_size); + + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(dentry->d_inode, 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? 
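[Editorial sketch, not part of the patch] ocfs2_xattr_find_entry() scans the entry array comparing, in order, the namespace index, the name length, and finally the name bytes; the first field that differs short-circuits the rest. The sketch below isolates that comparison over a toy array; the struct layout is hypothetical (the on-disk entry stores a name offset, not a pointer) and only the comparison order is taken from the code.

#include <stdio.h>
#include <string.h>

/* Hypothetical flattened entry for the demo. */
struct demo_entry {
        int type;               /* namespace index */
        const char *name;
};

/* Same short-circuit order as ocfs2_xattr_find_entry(): namespace first,
 * then length, then the name bytes themselves. */
static int demo_find_entry(const struct demo_entry *entries, int count,
                           int type, const char *name)
{
        size_t name_len = strlen(name);
        int i;

        for (i = 0; i < count; i++) {
                int cmp = type - entries[i].type;

                if (!cmp)
                        cmp = (int)name_len - (int)strlen(entries[i].name);
                if (!cmp)
                        cmp = memcmp(name, entries[i].name, name_len);
                if (!cmp)
                        return i;
        }
        return -1;              /* the kernel code reports -ENODATA here */
}

int main(void)
{
        const struct demo_entry tbl[] = { { 1, "foo" }, { 1, "demo" } };

        printf("index of (1, demo) = %d\n", demo_find_entry(tbl, 2, 1, "demo"));
        return 0;
}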
blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); + + xs->bucket = ocfs2_xattr_bucket_new(inode); + if (!xs->bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto cleanup; + } + + ret = ocfs2_xattr_block_find(inode, name_index, name, xs); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + if (xs->not_found) { + ret = -ENODATA; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xs->bucket), + i, + &block_off, + &name_offset); + xs->base = bucket_block(xs->bucket, block_off); + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + ocfs2_xattr_bucket_free(xs->bucket); + + brelse(xs->xattr_bh); + xs->xattr_bh = NULL; + return ret; +} + +int ocfs2_xattr_get_nolock(struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + 
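[Editorial usage note, not part of the patch] ocfs2_xattr_ibody_get() and ocfs2_xattr_block_get() return the value size when called without a buffer and -ERANGE when the supplied buffer is too small, which is exactly the calling convention user space relies on. A typical two-pass consumer of that convention, using the generic getxattr(2) interface rather than anything ocfs2-specific (the path and attribute name below are placeholders):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/xattr.h>

/* Standard two-pass getxattr pattern: probe the size with a NULL buffer,
 * allocate, then fetch the value for real. */
int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/tmp/demo";
        const char *name = "user.demo";         /* hypothetical attribute */
        ssize_t len = getxattr(path, name, NULL, 0);
        char *value;

        if (len < 0) {
                perror("getxattr size probe");
                return 1;
        }

        value = malloc(len + 1);
        if (!value)
                return 1;

        len = getxattr(path, name, value, len);
        if (len < 0) {
                perror("getxattr");
                free(value);
                return 1;
        }
        value[len] = '\0';
        printf("%s = %s\n", name, value);
        free(value);
        return 0;
}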
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + ret = -ENODATA; + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA && di->i_xattr_loc) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +static int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct buffer_head *di_bh = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + down_read(&OCFS2_I(inode)->ip_xattr_sem); + ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + name, buffer, buffer_size); + up_read(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_value_buf *vb, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + unsigned int ext_flags; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, + INODE_CACHE(inode), + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ocfs2_journal_dirty(handle, bh); + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? 
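[Editorial sketch, not part of the patch] __ocfs2_xattr_set_value_outside() writes the value one file-system block at a time: each block receives min(blocksize, remaining) payload bytes, and when the payload ends mid-block the tail of that block is zeroed before the buffer is marked dirty in the journal. The copy-and-pad arithmetic in isolation, without the journalling and block I/O:

#include <stdio.h>
#include <string.h>

/* Copy `value_len` bytes into consecutive fixed-size blocks, zero-padding
 * the tail of the last block touched. */
static void demo_fill_blocks(char *blocks, size_t blocksize, size_t nr_blocks,
                             const char *value, size_t value_len)
{
        size_t i;

        for (i = 0; i < nr_blocks && value_len; i++) {
                char *block = blocks + i * blocksize;
                size_t cp_len = value_len > blocksize ? blocksize : value_len;

                memcpy(block, value, cp_len);
                if (cp_len < blocksize)
                        memset(block + cp_len, 0, blocksize - cp_len);

                value += cp_len;
                value_len -= cp_len;
        }
}

int main(void)
{
        char blocks[2 * 8];                     /* two pretend 8-byte blocks */
        const char demo_value[] = "hello xattr";

        demo_fill_blocks(blocks, 8, 2, demo_value, sizeof(demo_value) - 1);
        printf("second block starts with: %.3s\n", blocks + 8);
        return 0;
}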
+ */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xa_check_space_helper(int needed_space, int free_start, + int num_entries) +{ + int free_space; + + if (!needed_space) + return 0; + + free_space = free_start - + sizeof(struct ocfs2_xattr_header) - + (num_entries * sizeof(struct ocfs2_xattr_entry)) - + OCFS2_XATTR_HEADER_GAP; + if (free_space < 0) + return -EIO; + if (free_space < needed_space) + return -ENOSPC; + + return 0; +} + +static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc, + int type) +{ + return loc->xl_ops->xlo_journal_access(handle, loc, type); +} + +static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_journal_dirty(handle, loc); +} + +/* Give a pointer into the storage for the given offset */ +static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset) +{ + BUG_ON(offset >= loc->xl_size); + return loc->xl_ops->xlo_offset_pointer(loc, offset); +} + +/* + * Wipe the name+value pair and allow the storage to reclaim it. This + * must be followed by either removal of the entry or a call to + * ocfs2_xa_add_namevalue(). + */ +static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_wipe_namevalue(loc); +} + +/* + * Find lowest offset to a name+value pair. This is the start of our + * downward-growing free space. + */ +static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc) +{ + return loc->xl_ops->xlo_get_free_start(loc); +} + +/* Can we reuse loc->xl_entry for xi? */ +static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_can_reuse(loc, xi); +} + +/* How much free space is needed to set the new value */ +static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_check_space(loc, xi); +} + +static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + loc->xl_ops->xlo_add_entry(loc, name_hash); + loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); + /* + * We can't leave the new entry's xe_name_offset at zero or + * add_namevalue() will go nuts. We set it to the size of our + * storage so that it can never be less than any other entry. 
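[Editorial sketch, not part of the patch] ocfs2_xa_check_space_helper() models the storage as a header plus an upward-growing entry table and a downward-growing name+value region: the usable gap is free_start minus the header, the entry table, and a small guard gap, and -ENOSPC is returned when the requested bytes do not fit. A standalone version with the structure sizes passed in, since the real header, entry, and gap sizes live in the ocfs2 headers:

#include <errno.h>
#include <stdio.h>

/* Same arithmetic as ocfs2_xa_check_space_helper(), with the header size,
 * per-entry size and guard gap supplied by the caller. */
static int demo_check_space(int needed_space, int free_start, int num_entries,
                            int header_size, int entry_size, int guard_gap)
{
        int free_space;

        if (!needed_space)
                return 0;

        free_space = free_start - header_size -
                     num_entries * entry_size - guard_gap;
        if (free_space < 0)
                return -EIO;            /* layout is corrupted */
        if (free_space < needed_space)
                return -ENOSPC;
        return 0;
}

int main(void)
{
        /* Illustrative numbers only: 64-byte header, 16-byte entries. */
        printf("fits: %d\n", demo_check_space(100, 512, 4, 64, 16, 4));
        printf("too big: %d\n", demo_check_space(400, 512, 4, 64, 16, 4));
        return 0;
}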
+ */ + loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); +} + +static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int size = namevalue_size_xi(xi); + int nameval_offset; + char *nameval_buf; + + loc->xl_ops->xlo_add_namevalue(loc, size); + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + loc->xl_entry->xe_name_len = xi->xi_name_len; + ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index); + ocfs2_xattr_set_local(loc->xl_entry, + xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE); + + nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + memset(nameval_buf, 0, size); + memcpy(nameval_buf, xi->xi_name, xi->xi_name_len); +} + +static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + + /* Value bufs are for value trees */ + BUG_ON(ocfs2_xattr_is_local(loc->xl_entry)); + BUG_ON(namevalue_size_xe(loc->xl_entry) != + (name_size + OCFS2_XATTR_ROOT_SIZE)); + + loc->xl_ops->xlo_fill_value_buf(loc, vb); + vb->vb_xv = + (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc, + nameval_offset + + name_size); +} + +static int ocfs2_xa_block_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct buffer_head *bh = loc->xl_storage; + ocfs2_journal_access_func access; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + access = ocfs2_journal_access_xb; + else + access = ocfs2_journal_access_di; + return access(handle, INODE_CACHE(loc->xl_inode), bh, type); +} + +static void ocfs2_xa_block_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct buffer_head *bh = loc->xl_storage; + + ocfs2_journal_dirty(handle, bh); +} + +static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + return (char *)loc->xl_header + offset; +} + +static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + /* + * Block storage is strict. If the sizes aren't exact, we will + * remove the old one and reinsert the new. + */ + return namevalue_size_xe(loc->xl_entry) == + namevalue_size_xi(xi); +} + +static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int i, count = le16_to_cpu(xh->xh_count); + int offset, free_start = loc->xl_size; + + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset < free_start) + free_start = offset; + } + + return free_start; +} + +static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + + /* + * Block storage will reclaim the original entry before inserting + * the new value, so we only need the difference. If the new + * entry is smaller than the old one, we don't need anything. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! 
*/ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= ocfs2_xe_entry_usage(loc->xl_entry); + } + if (needed_space < 0) + needed_space = 0; + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + +/* + * Block storage for xattrs keeps the name+value pairs compacted. When + * we remove one, we have to shift any that preceded it towards the end. + */ +static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + int i, offset; + int namevalue_offset, first_namevalue_offset, namevalue_size; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + + namevalue_offset = le16_to_cpu(entry->xe_name_offset); + namevalue_size = namevalue_size_xe(entry); + first_namevalue_offset = ocfs2_xa_get_free_start(loc); + + /* Shift the name+value pairs */ + memmove((char *)xh + first_namevalue_offset + namevalue_size, + (char *)xh + first_namevalue_offset, + namevalue_offset - first_namevalue_offset); + memset((char *)xh + first_namevalue_offset, 0, namevalue_size); + + /* Now tell xh->xh_entries about it */ + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset <= namevalue_offset) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, + namevalue_size); + } + + /* + * Note that we don't update xh_free_start or xh_name_value_len + * because they're not used in block-stored xattrs. + */ +} + +static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + loc->xl_entry = &(loc->xl_header->xh_entries[count]); + le16_add_cpu(&loc->xl_header->xh_count, 1); + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + + loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size); +} + +static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct buffer_head *bh = loc->xl_storage; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + vb->vb_access = ocfs2_journal_access_xb; + else + vb->vb_access = ocfs2_journal_access_di; + vb->vb_bh = bh; +} + +/* + * Operations for xattrs stored in blocks. This includes inline inode + * storage and unindexed ocfs2_xattr_blocks. 
+ */ +static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { + .xlo_journal_access = ocfs2_xa_block_journal_access, + .xlo_journal_dirty = ocfs2_xa_block_journal_dirty, + .xlo_offset_pointer = ocfs2_xa_block_offset_pointer, + .xlo_check_space = ocfs2_xa_block_check_space, + .xlo_can_reuse = ocfs2_xa_block_can_reuse, + .xlo_get_free_start = ocfs2_xa_block_get_free_start, + .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_block_add_entry, + .xlo_add_namevalue = ocfs2_xa_block_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf, +}; + +static int ocfs2_xa_bucket_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + return ocfs2_xattr_bucket_journal_access(handle, bucket, type); +} + +static void ocfs2_xa_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); +} + +static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + int block, block_offset; + + /* The header is at the front of the bucket */ + block = offset >> loc->xl_inode->i_sb->s_blocksize_bits; + block_offset = offset % loc->xl_inode->i_sb->s_blocksize; + + return bucket_block(bucket, block) + block_offset; +} + +static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xe(loc->xl_entry) >= + namevalue_size_xi(xi); +} + +static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + return le16_to_cpu(bucket_xh(bucket)->xh_free_start); +} + +static int ocfs2_bucket_align_free_start(struct super_block *sb, + int free_start, int size) +{ + /* + * We need to make sure that the name+value pair fits within + * one block. + */ + if (((free_start - size) >> sb->s_blocksize_bits) != + ((free_start - 1) >> sb->s_blocksize_bits)) + free_start -= free_start % sb->s_blocksize; + + return free_start; +} + +static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int rc; + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + int size = namevalue_size_xi(xi); + struct super_block *sb = loc->xl_inode->i_sb; + + /* + * Bucket storage does not reclaim name+value pairs it cannot + * reuse. They live as holes until the bucket fills, and then + * the bucket is defragmented. However, the bucket can reclaim + * the ocfs2_xattr_entry. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! */ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= sizeof(struct ocfs2_xattr_entry); + } + BUG_ON(needed_space < 0); + + if (free_start < size) { + if (needed_space) + return -ENOSPC; + } else { + /* + * First we check if it would fit in the first place. + * Below, we align the free start to a block. This may + * slide us below the minimum gap. By checking unaligned + * first, we avoid that error. 
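[Editorial sketch, not part of the patch] ocfs2_bucket_align_free_start() guarantees that a name+value pair never straddles a block boundary inside a bucket: if the bytes [free_start - size, free_start) would cross into the previous block, free_start is rounded down to the start of its block. The same rule in isolation, with plain division standing in for the kernel's s_blocksize_bits shifts:

#include <stdio.h>

/* If writing `size` bytes ending at `free_start` would cross a block
 * boundary, round free_start down so the pair stays within one block. */
static int demo_align_free_start(int free_start, int size, int blocksize)
{
        if ((free_start - size) / blocksize != (free_start - 1) / blocksize)
                free_start -= free_start % blocksize;

        return free_start;
}

int main(void)
{
        /* With 512-byte blocks, a 100-byte pair ending at 520 would span
         * two blocks, so the start is pulled back to 512. */
        printf("aligned: %d\n", demo_align_free_start(520, 100, 512));
        printf("already fine: %d\n", demo_align_free_start(512, 100, 512));
        return 0;
}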
+ */ + rc = ocfs2_xa_check_space_helper(needed_space, free_start, + count); + if (rc) + return rc; + free_start = ocfs2_bucket_align_free_start(sb, free_start, + size); + } + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + +static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + le16_add_cpu(&loc->xl_header->xh_name_value_len, + -namevalue_size_xe(loc->xl_entry)); +} + +static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + /* + * We keep buckets sorted by name_hash, so we need to find + * our insert place. + */ + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + if (low != count) + memmove(&xh->xh_entries[low + 1], + &xh->xh_entries[low], + ((count - low) * sizeof(struct ocfs2_xattr_entry))); + + le16_add_cpu(&xh->xh_count, 1); + loc->xl_entry = &xh->xh_entries[low]; + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + struct ocfs2_xattr_header *xh = loc->xl_header; + struct super_block *sb = loc->xl_inode->i_sb; + int nameval_offset; + + free_start = ocfs2_bucket_align_free_start(sb, free_start, size); + nameval_offset = free_start - size; + loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset); + xh->xh_free_start = cpu_to_le16(nameval_offset); + le16_add_cpu(&xh->xh_name_value_len, size); + +} + +static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + struct super_block *sb = loc->xl_inode->i_sb; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int size = namevalue_size_xe(loc->xl_entry); + int block_offset = nameval_offset >> sb->s_blocksize_bits; + + /* Values are not allowed to straddle block boundaries */ + BUG_ON(block_offset != + ((nameval_offset + size - 1) >> sb->s_blocksize_bits)); + /* We expect the bucket to be filled in */ + BUG_ON(!bucket->bu_bhs[block_offset]); + + vb->vb_access = ocfs2_journal_access; + vb->vb_bh = bucket->bu_bhs[block_offset]; +} + +/* Operations for xattrs stored in buckets. 
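[Editorial sketch, not part of the patch] ocfs2_xa_bucket_add_entry() keeps the bucket's entry array sorted by xe_name_hash: a binary search finds the insertion slot and memmove shifts the tail up one slot before the new entry is initialized in place. The same insert-into-sorted-array step on a plain integer array, with hashes standing in for struct ocfs2_xattr_entry:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Binary-search the insertion position, then shift the tail up one slot. */
static int demo_insert_sorted(uint32_t *hashes, int count, uint32_t name_hash)
{
        int low = 0, high = count - 1;

        while (low <= high && count) {
                int mid = (low + high) / 2;

                if (name_hash > hashes[mid])
                        low = mid + 1;
                else if (name_hash < hashes[mid])
                        high = mid - 1;
                else {
                        low = mid;
                        break;
                }
        }

        if (low != count)
                memmove(&hashes[low + 1], &hashes[low],
                        (count - low) * sizeof(*hashes));
        hashes[low] = name_hash;
        return low;
}

int main(void)
{
        uint32_t hashes[8] = { 10, 20, 40, 50 };
        int pos = demo_insert_sorted(hashes, 4, 30);

        printf("inserted at %d -> %u %u %u %u %u\n", pos,
               hashes[0], hashes[1], hashes[2], hashes[3], hashes[4]);
        return 0;
}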
*/ +static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { + .xlo_journal_access = ocfs2_xa_bucket_journal_access, + .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty, + .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, + .xlo_check_space = ocfs2_xa_bucket_check_space, + .xlo_can_reuse = ocfs2_xa_bucket_can_reuse, + .xlo_get_free_start = ocfs2_xa_bucket_get_free_start, + .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_bucket_add_entry, + .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf, +}; + +static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_value_buf vb; + + if (ocfs2_xattr_is_local(loc->xl_entry)) + return 0; + + ocfs2_xa_fill_value_buf(loc, &vb); + return le32_to_cpu(vb.vb_xv->xr_clusters); +} + +static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int trunc_rc, access_rc; + struct ocfs2_xattr_value_buf vb; + + ocfs2_xa_fill_value_buf(loc, &vb); + trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes, + ctxt); + + /* + * The caller of ocfs2_xa_value_truncate() has already called + * ocfs2_xa_journal_access on the loc. However, The truncate code + * calls ocfs2_extend_trans(). This may commit the previous + * transaction and open a new one. If this is a bucket, truncate + * could leave only vb->vb_bh set up for journaling. Meanwhile, + * the caller is expecting to dirty the entire bucket. So we must + * reset the journal work. We do this even if truncate has failed, + * as it could have failed after committing the extend. + */ + access_rc = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + + /* Errors in truncate take precedence */ + return trunc_rc ? trunc_rc : access_rc; +} + +static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) +{ + int index, count; + struct ocfs2_xattr_header *xh = loc->xl_header; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + + ocfs2_xa_wipe_namevalue(loc); + loc->xl_entry = NULL; + + le16_add_cpu(&xh->xh_count, -1); + count = le16_to_cpu(xh->xh_count); + + /* + * Only zero out the entry if there are more remaining. This is + * important for an empty bucket, as it keeps track of the + * bucket's hash value. It doesn't hurt empty block storage. + */ + if (count) { + index = ((char *)entry - (char *)&xh->xh_entries) / + sizeof(struct ocfs2_xattr_entry); + memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1], + (count - index) * sizeof(struct ocfs2_xattr_entry)); + memset(&xh->xh_entries[count], 0, + sizeof(struct ocfs2_xattr_entry)); + } +} + +/* + * If we have a problem adjusting the size of an external value during + * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr + * in an intermediate state. For example, the value may be partially + * truncated. + * + * If the value tree hasn't changed, the extend/truncate went nowhere. + * We have nothing to do. The caller can treat it as a straight error. + * + * If the value tree got partially truncated, we now have a corrupted + * extended attribute. We're going to wipe its entry and leak the + * clusters. Better to leak some storage than leave a corrupt entry. + * + * If the value tree grew, it obviously didn't grow enough for the + * new entry. We're not going to try and reclaim those clusters either. 
+ * If there was already an external value there (orig_clusters != 0), + * the new clusters are attached safely and we can just leave the old + * value in place. If there was no external value there, we remove + * the entry. + * + * This way, the xattr block we store in the journal will be consistent. + * If the size change broke because of the journal, no changes will hit + * disk anyway. + */ +static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc, + const char *what, + unsigned int orig_clusters) +{ + unsigned int new_clusters = ocfs2_xa_value_clusters(loc); + char *nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + + if (new_clusters < orig_clusters) { + mlog(ML_ERROR, + "Partial truncate while %s xattr %.*s. Leaking " + "%u clusters and removing the entry\n", + what, loc->xl_entry->xe_name_len, nameval_buf, + orig_clusters - new_clusters); + ocfs2_xa_remove_entry(loc); + } else if (!orig_clusters) { + mlog(ML_ERROR, + "Unable to allocate an external value for xattr " + "%.*s safely. Leaking %u clusters and removing the " + "entry\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); + ocfs2_xa_remove_entry(loc); + } else if (new_clusters > orig_clusters) + mlog(ML_ERROR, + "Unable to grow xattr %.*s safely. %u new clusters " + "have been added, but the value will not be " + "modified\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); +} + +static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + unsigned int orig_clusters; + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + /* + * Since this is remove, we can return 0 if + * ocfs2_xa_cleanup_value_truncate() is going to + * wipe the entry anyway. So we check the + * cluster count as well. + */ + if (orig_clusters != ocfs2_xa_value_clusters(loc)) + rc = 0; + ocfs2_xa_cleanup_value_truncate(loc, "removing", + orig_clusters); + if (rc) + goto out; + } + } + + ocfs2_xa_remove_entry(loc); + +out: + return rc; +} + +static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc) +{ + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + char *nameval_buf; + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE); +} + +/* + * Take an existing entry and make it ready for the new value. This + * won't allocate space, but it may free space. It should be ready for + * ocfs2_xa_prepare_entry() to finish the work. 
+ */ +static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + unsigned int orig_clusters; + char *nameval_buf; + int xe_local = ocfs2_xattr_is_local(loc->xl_entry); + int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE; + + BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) != + name_size); + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + if (xe_local) { + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - name_size); + if (!xi_local) + ocfs2_xa_install_value_root(loc); + } else { + orig_clusters = ocfs2_xa_value_clusters(loc); + if (xi_local) { + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc < 0) + mlog_errno(rc); + else + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - + name_size); + } else if (le64_to_cpu(loc->xl_entry->xe_value_size) > + xi->xi_value_len) { + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, + ctxt); + if (rc < 0) + mlog_errno(rc); + } + + if (rc) { + ocfs2_xa_cleanup_value_truncate(loc, "reusing", + orig_clusters); + goto out; + } + } + + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + ocfs2_xattr_set_local(loc->xl_entry, xi_local); + +out: + return rc; +} + +/* + * Prepares loc->xl_entry to receive the new xattr. This includes + * properly setting up the name+value pair region. If loc->xl_entry + * already exists, it will take care of modifying it appropriately. + * + * Note that this modifies the data. You did journal_access already, + * right? + */ +static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + u32 name_hash, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + unsigned int orig_clusters; + __le64 orig_value_size = 0; + + rc = ocfs2_xa_check_space(loc, xi); + if (rc) + goto out; + + if (loc->xl_entry) { + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; + } + } + ocfs2_xa_wipe_namevalue(loc); + } else + ocfs2_xa_add_entry(loc, name_hash); + + /* + * If we get here, we have a blank entry. Fill it. We grow our + * name+value pair back from the end. + */ + ocfs2_xa_add_namevalue(loc, xi); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + ocfs2_xa_install_value_root(loc); + +alloc_value: + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); + if (rc < 0) { + ctxt->set_abort = 1; + ocfs2_xa_cleanup_value_truncate(loc, "growing", + orig_clusters); + /* + * If we were growing an existing value, + * ocfs2_xa_cleanup_value_truncate() won't remove + * the entry. We need to restore the original value + * size. + */ + if (loc->xl_entry) { + BUG_ON(!orig_value_size); + loc->xl_entry->xe_value_size = orig_value_size; + } + mlog_errno(rc); + } + } + +out: + return rc; +} + +/* + * Store the value portion of the name+value pair. This will skip + * values that are stored externally. Their tree roots were set up + * by ocfs2_xa_prepare_entry(). 
+ */ +static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + char *nameval_buf; + struct ocfs2_xattr_value_buf vb; + + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + ocfs2_xa_fill_value_buf(loc, &vb); + rc = __ocfs2_xattr_set_value_outside(loc->xl_inode, + ctxt->handle, &vb, + xi->xi_value, + xi->xi_value_len); + } else + memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len); + + return rc; +} + +static int ocfs2_xa_set(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name, + xi->xi_name_len); + + ret = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * From here on out, everything is going to modify the buffer a + * little. Errors are going to leave the xattr header in a + * sane state. Thus, even with errors we dirty the sucker. + */ + + /* Don't worry, we are never called with !xi_value and !xl_entry */ + if (!xi->xi_value) { + ret = ocfs2_xa_remove(loc, ctxt); + goto out_dirty; + } + + ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out_dirty; + } + + ret = ocfs2_xa_store_value(loc, xi, ctxt); + if (ret) + mlog_errno(ret); + +out_dirty: + ocfs2_xa_journal_dirty(ctxt->handle, loc); + +out: + return ret; +} + +static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL)); + + loc->xl_inode = inode; + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_entry = entry; + loc->xl_size = le16_to_cpu(di->i_xattr_inline_size); + loc->xl_header = + (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size - + loc->xl_size); +} + +static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED); + + loc->xl_inode = inode; + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_header = &(xb->xb_attrs.xb_header); + loc->xl_entry = entry; + loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header); +} + +static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_bucket *bucket, + struct ocfs2_xattr_entry *entry) +{ + loc->xl_inode = bucket->bu_inode; + loc->xl_ops = &ocfs2_xa_bucket_loc_ops; + loc->xl_storage = bucket; + loc->xl_header = bucket_xh(bucket); + loc->xl_entry = entry; + loc->xl_size = OCFS2_XATTR_BUCKET_SIZE; +} + +/* + * In xattr remove, if it is stored outside and refcounted, we may have + * the chance to split the refcount tree. So need the allocators. 
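[Editorial sketch, not part of the patch] The three ocfs2_init_*_xa_loc() helpers bind the same ocfs2_xa_loc to different storage (inline dinode area, unindexed xattr block, or bucket) simply by pointing xl_ops at ocfs2_xa_block_loc_ops or ocfs2_xa_bucket_loc_ops, so ocfs2_xa_set() never branches on where the bytes live. A generic, self-contained illustration of that ops-table pattern; the names below are invented for the demo and are not ocfs2 symbols.

#include <stdio.h>

/* Minimal ops-table dispatch in the style of ocfs2_xa_loc_operations. */
struct demo_loc;

struct demo_loc_ops {
        int (*get_free_start)(struct demo_loc *loc);
};

struct demo_loc {
        const struct demo_loc_ops *ops;
        int size;
};

static int demo_block_free_start(struct demo_loc *loc)
{
        return loc->size;               /* pretend the block is empty */
}

static int demo_bucket_free_start(struct demo_loc *loc)
{
        return loc->size / 2;           /* pretend the bucket is half full */
}

static const struct demo_loc_ops demo_block_ops = {
        .get_free_start = demo_block_free_start,
};
static const struct demo_loc_ops demo_bucket_ops = {
        .get_free_start = demo_bucket_free_start,
};

int main(void)
{
        struct demo_loc block = { &demo_block_ops, 4096 };
        struct demo_loc bucket = { &demo_bucket_ops, 4096 };

        /* Callers only ever go through ops, as ocfs2_xa_set() does. */
        printf("block free_start=%d bucket free_start=%d\n",
               block.ops->get_free_start(&block),
               bucket.ops->get_free_start(&bucket));
        return 0;
}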
+ */ +static int ocfs2_lock_xattr_remove_allocators(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + int *ref_credits) +{ + int ret, meta_add = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + *ref_credits = 0; + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci, + ref_root_bh, xv, + &meta_add, ref_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + int ret = 0, i, ref_credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + void *val; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(entry)) + continue; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + vb->vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + + ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv, + ref_ci, ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, ref_credits + + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); + if (ret < 0) { + mlog_errno(ret); + break; + } + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } + } + + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = di_bh, + .vb_access = ocfs2_journal_access_di, + }; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); + + return ret; +} + +struct ocfs2_rm_xattr_bucket_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; +}; + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + struct ocfs2_rm_xattr_bucket_para args = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + }; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 
+ struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); + } else + ret = ocfs2_iterate_xattr_index_block(inode, + blk_bh, + ocfs2_rm_xattr_cluster, + &args); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_xattr_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + if (xb->xb_suballoc_loc) + bg_blkno = le64_to_cpu(xb->xb_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. 
+ */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_refcount_tree *ref_tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_caching_info *ref_ci = NULL; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + ref_ci = &ref_tree->rf_ci; + + } + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh, + ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc), + ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_journal_dirty(handle, di_bh); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. 
+ */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +static int ocfs2_xattr_ibody_init(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, di_bh); + +out: + return ret; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xa_loc loc; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + } + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? 
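[Editorial sketch, not part of the patch] ocfs2_xattr_ibody_init() reserves the inline xattr area by shrinking whatever already shares the inode block with it: id_count for inline data, or the extent list's l_count by xattrsize / sizeof(struct ocfs2_extent_rec) slots (fast symlinks are left untouched), before recording the reservation in i_xattr_inline_size. A tiny arithmetic sketch of that adjustment; the record size is a parameter because the real sizeof(struct ocfs2_extent_rec) comes from the ocfs2 headers.

#include <stdio.h>

/* How many extent-list slots must be surrendered to reserve `xattrsize`
 * bytes of inline xattr space. */
static int demo_extent_recs_to_drop(int xattrsize, int extent_rec_size)
{
        return xattrsize / extent_rec_size;
}

int main(void)
{
        /* Illustrative numbers: a 256-byte inline xattr area and 16-byte
         * extent records would cost 16 list slots. */
        printf("records dropped: %d\n", demo_extent_recs_to_drop(256, 16));
        return 0;
}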
NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + xs->here = loc.xl_entry; + +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +static int ocfs2_create_xattr_block(struct inode *inode, + struct buffer_head *inode_bh, + struct ocfs2_xattr_set_ctxt *ctxt, + int indexed, + struct buffer_head **ret_bh) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk; + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + inode_bh, OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1, + &suballoc_loc, &suballoc_bit_start, + &num_got, &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), + new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + /* Initialize ocfs2_xattr_block */ + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); + xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = + cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + if (indexed) { + struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16( + ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); + } + ocfs2_journal_dirty(ctxt->handle, new_bh); + + /* Add it to the inode */ + di->i_xattr_loc = cpu_to_le64(first_blkno); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + di->i_dyn_features = 
cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, inode_bh); + + *ret_bh = new_bh; + new_bh = NULL; + +end: + brelse(new_bh); + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk = NULL; + int ret; + struct ocfs2_xa_loc loc; + + if (!xs->xattr_bh) { + ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt, + 0, &new_bh); + if (ret) { + mlog_errno(ret); + goto end; + } + + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, + xs->not_found ? NULL : xs->here); + + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) + xs->here = loc.xl_entry; + else if ((ret != -ENOSPC) || ctxt->set_abort) + goto end; + else { + ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); + if (ret) + goto end; + } + } + + if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED) + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); + +end: + return ret; +} + +/* Check whether the new xattr can be inserted into the inode. */ +static int ocfs2_xattr_can_be_in_inode(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *last; + int free, i; + size_t min_offs = xs->end - xs->base; + + if (!xs->header) + return 0; + + last = xs->header->xh_entries; + + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; + if (free < 0) + return 0; + + BUG_ON(!xs->not_found); + + if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi))) + return 1; + + return 0; +} + +static int ocfs2_calc_xattr_set_need(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + int *clusters_need, + int *meta_need, + int *credits_need) +{ + int ret = 0, old_in_xb = 0; + int clusters_add = 0, meta_add = 0, credits = 0; + struct buffer_head *bh = NULL; + struct ocfs2_xattr_block *xb = NULL; + struct ocfs2_xattr_entry *xe = NULL; + struct ocfs2_xattr_value_root *xv = NULL; + char *base = NULL; + int name_offset, name_len = 0; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + xi->xi_value_len); + u64 value_size; + + /* + * Calculate the clusters we need to write. + * No matter whether we replace an old one or add a new one, + * we need this for writing. 
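+ * (Only values larger than OCFS2_XATTR_INLINE_SIZE live outside the + * name/value area, so only those add the per-cluster write credits below.)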
+ */ + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + credits += new_clusters * + ocfs2_clusters_to_blocks(inode->i_sb, 1); + + if (xis->not_found && xbs->not_found) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + clusters_add += new_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + } + + goto meta_guess; + } + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + credits += OCFS2_INODE_UPDATE_CREDITS; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + old_in_xb = 1; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + base = bucket_block(xbs->bucket, block_off); + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } else { + base = xbs->base; + credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS; + } + } + + /* + * delete a xattr doesn't need metadata and cluster allocation. + * so just calculate the credits and return. + * + * The credits for removing the value tree will be extended + * by ocfs2_remove_extent itself. + */ + if (!xi->xi_value) { + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_remove_extent_credits(inode->i_sb); + + goto out; + } + + /* do cluster allocation guess first. */ + value_size = le64_to_cpu(xe->xe_value_size); + + if (old_in_xb) { + /* + * In xattr set, we always try to set the xe in inode first, + * so if it can be inserted into inode successfully, the old + * one will be removed from the xattr block, and this xattr + * will be inserted into inode as a new xattr in inode. + */ + if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) { + clusters_add += new_clusters; + credits += ocfs2_remove_extent_credits(inode->i_sb) + + OCFS2_INODE_UPDATE_CREDITS; + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_calc_extend_credits( + inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + goto out; + } + } + + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + /* the new values will be stored outside. */ + u32 old_clusters = 0; + + if (!ocfs2_xattr_is_local(xe)) { + old_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + value_size); + xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + value_size = OCFS2_XATTR_ROOT_SIZE; + } else + xv = &def_xv.xv; + + if (old_clusters >= new_clusters) { + credits += ocfs2_remove_extent_credits(inode->i_sb); + goto out; + } else { + meta_add += ocfs2_extend_meta_needed(&xv->xr_list); + clusters_add += new_clusters - old_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &xv->xr_list, + new_clusters - + old_clusters); + if (value_size >= OCFS2_XATTR_ROOT_SIZE) + goto out; + } + } else { + /* + * Now the new value will be stored inside. So if the new + * value is smaller than the size of value root or the old + * value, we don't need any allocation, otherwise we have + * to guess metadata allocation. + */ + if ((ocfs2_xattr_is_local(xe) && + (value_size >= xi->xi_value_len)) || + (!ocfs2_xattr_is_local(xe) && + OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len)) + goto out; + } + +meta_guess: + /* calculate metadata allocation. 
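+ * Three cases follow: extend an existing xattr index tree, create the + * index tree for an un-indexed xattr block, or allocate a brand new + * xattr block.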
*/ + if (di->i_xattr_loc) { + if (!xbs->xattr_bh) { + ret = ocfs2_read_xattr_block(inode, + le64_to_cpu(di->i_xattr_loc), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)bh->b_data; + } else + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + + /* + * If there is already an xattr tree, good, we can calculate + * like other b-trees. Otherwise we may have the chance of + * create a tree, the credit calculation is borrowed from + * ocfs2_calc_extend_credits with root_el = NULL. And the + * new tree will be cluster based, so no meta is needed. + */ + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + struct ocfs2_extent_list *el = + &xb->xb_attrs.xb_root.xt_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el, 1); + } else + credits += OCFS2_SUBALLOC_ALLOC + 1; + + /* + * This cluster will be used either for new bucket or for + * new xattr block. + * If the cluster size is the same as the bucket size, one + * more is needed since we may need to extend the bucket + * also. + */ + clusters_add += 1; + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + if (OCFS2_XATTR_BUCKET_SIZE == + OCFS2_SB(inode->i_sb)->s_clustersize) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + clusters_add += 1; + } + } else { + meta_add += 1; + credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } +out: + if (clusters_need) + *clusters_need = clusters_add; + if (meta_need) + *meta_need = meta_add; + if (credits_need) + *credits_need = credits; + brelse(bh); + return ret; +} + +static int ocfs2_init_xattr_set_ctxt(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt, + int extra_meta, + int *credits) +{ + int clusters_add, meta_add, ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt)); + + ocfs2_init_dealloc_ctxt(&ctxt->dealloc); + + ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs, + &clusters_add, &meta_add, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + meta_add += extra_meta; + trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add, + clusters_add, *credits); + + if (meta_add) { + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, + &ctxt->meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (clusters_add) { + ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (ctxt->meta_ac) { + ocfs2_free_alloc_context(ctxt->meta_ac); + ctxt->meta_ac = NULL; + } + + /* + * We cannot have an error and a non null ctxt->data_ac. + */ + } + + return ret; +} + +static int __ocfs2_xattr_set_handle(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0, credits, old_found; + + if (!xi->xi_value) { + /* Remove existing extended attribute */ + if (!xis->not_found) + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + else if (!xbs->not_found) + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + if (!ret && !xbs->not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. 
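+ * (The removal below is simply another ocfs2_xattr_block_set() call with + * xi_value set to NULL, after extending the handle by the extra credits.)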
+ */ + xi->xi_value = NULL; + xi->xi_value_len = 0; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else if ((ret == -ENOSPC) && !ctxt->set_abort) { + if (di->i_xattr_loc && !xbs->xattr_bh) { + ret = ocfs2_xattr_block_find(inode, + xi->xi_name_index, + xi->xi_name, xbs); + if (ret) + goto out; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + if (ret) + goto out; + if (!xis->not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi->xi_value = NULL; + xi->xi_value_len = 0; + xbs->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_ibody_set(inode, xi, + xis, ctxt); + } + } + } + + if (!ret) { + /* Update inode ctime. */ + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); + } +out: + return ret; +} + +/* + * This function only called duing creating inode + * for init security/acl xattrs of the new inode. + * All transanction credits have been reserved in mknod. + */ +int ocfs2_xattr_set_handle(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_dinode *di; + int ret; + + struct ocfs2_xattr_info xi = { + .xi_name_index = name_index, + .xi_name = name, + .xi_name_len = strlen(name), + .xi_value = value, + .xi_value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_set_ctxt ctxt = { + .handle = handle, + .meta_ac = meta_ac, + .data_ac = data_ac, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * In extreme situation, may need xattr bucket when + * block size is too small. And we have already reserved + * the credits for bucket in mknod. 
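+ * ("Too small" means s_blocksize == OCFS2_MIN_BLOCKSIZE; in that case the + * inode body stores no xattrs at all, see ocfs2_xattr_ibody_find() above.)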
+ */ + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) { + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + } + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret, credits, ref_meta = 0, ref_credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_refcount_tree *ref_tree = NULL; + + struct ocfs2_xattr_info xi = { + .xi_name_index = name_index, + .xi_name = name, + .xi_name_len = strlen(name), + .xi_value = value, + .xi_value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * Only xbs will be used on indexed trees. xis doesn't need a + * bucket. + */ + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto cleanup_nolock; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search information. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + /* Check whether the value is refcounted and do some preparation. 
*/ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + (!xis.not_found || !xbs.not_found)) { + ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, + &xis, &xbs, &ref_tree, + &ref_meta, &ref_credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mutex_unlock(&tl_inode->i_mutex); + mlog_errno(ret); + goto cleanup; + } + } + mutex_unlock(&tl_inode->i_mutex); + + ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, + &xbs, &ctxt, ref_meta, &credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + /* we need to update inode's ctime field, so add credit for it. */ + credits += OCFS2_INODE_UPDATE_CREDITS; + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + + if (ctxt.data_ac) + ocfs2_free_alloc_context(ctxt.data_ac); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + +cleanup: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + if (!value && !ret) { + ret = ocfs2_try_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); + } + ocfs2_inode_unlock(inode, 1); +cleanup_nolock: + brelse(di_bh); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. 
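+ * The leaf is scanned from its last record backwards; the first rec whose + * e_cpos <= name_hash is the one that may contain this hash.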
+ */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + + xe_name = bucket_block(bucket, block_off) + new_offset; + if (!memcmp(name, xe_name, name_len)) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. 
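+ * The lookup is a binary search over those buckets, comparing name_hash + * against the first and last xe_name_hash of each bucket probed.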
+ */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + struct ocfs2_xattr_bucket *search; + u32 last_hash; + u64 blkno, lower_blkno = 0; + + search = ocfs2_xattr_bucket_new(inode); + if (!search) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(search, p_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(search); + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + while (low_bucket <= high_bucket) { + ocfs2_xattr_bucket_relse(search); + + bucket = (low_bucket + high_bucket) / 2; + blkno = p_blkno + bucket * blk_per_bucket; + ret = ocfs2_read_xattr_bucket(search, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(search); + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_blkno which may be the insert place. */ + lower_blkno = blkno; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, search, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. 
+ */ + if (!lower_blkno) + lower_blkno = p_blkno; + + /* This should be in cache - we just read it during the search */ + ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + xs->here = &xs->header->xh_entries[index]; + trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)bucket_blkno(xs->bucket), + index); + } else + ret = -ENODATA; + +out: + ocfs2_xattr_bucket_free(search); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)root_bh->b_blocknr, + -1); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno, + name, name_index, first_hash, + (unsigned long long)p_blkno, + num_clusters); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, ret = 0; + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket *bucket; + + bucket = ocfs2_xattr_bucket_new(inode); + if (!bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + trace_ocfs2_iterate_xattr_buckets( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, clusters); + + for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { + ret = ocfs2_read_xattr_bucket(bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. 
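+ * So the loop bound num_buckets is refreshed from xh_num_buckets once + * the first bucket has been read.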
+ */ + if (i == 0) + num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); + + trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno, + le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, bucket, para); + if (ret && ret != -ERANGE) + mlog_errno(ret); + /* Fall through to bucket_relse() */ + } + + ocfs2_xattr_bucket_relse(bucket); + if (ret) + break; + } + + ocfs2_xattr_bucket_free(bucket); + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> sb->s_blocksize_bits; + *new_offset = name_offset % sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *blk_bh, + xattr_tree_rec_func *rec_func, + void *para) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + + if (!el->l_next_free_rec || !rec_func) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + break; + } + + ret = rec_func(inode, blk_bh, p_blkno, e_cpos, + num_clusters, para); + if (ret) { + if (ret != -ERANGE) + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + return ret; + +} + +static int ocfs2_list_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_list_xattr_bucket, para); +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_list_xattr_tree_rec, &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = 
le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * The header goes at the start of the bucket, and the names+values are + * filled from the end. This is why *target starts as the last buffer. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct ocfs2_xattr_bucket *bucket) +{ + int i, blocksize = inode->i_sb->s_blocksize; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u16 count = le16_to_cpu(xb_xh->xh_count); + char *src = xb_bh->b_data; + char *target = bucket_block(bucket, blks - 1); + + trace_ocfs2_cp_xattr_block_to_bucket_begin( + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)bucket_blkno(bucket)); + + for (i = 0; i < blks; i++) + memset(bucket_block(bucket, i), 0, blocksize); + + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = bucket_block(bucket, 0); + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. 
+ */ +static void ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh) +{ + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i; + + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (xs->not_found) + return; + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 bit_off, len; + u64 blkno; + handle_t *handle = ctxt->handle; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + trace_ocfs2_xattr_create_index_block_begin( + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + BUG_ON(!xs->bucket); + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, + 1, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); + + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xb_bh); + +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. 
+ * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket_blkno(bucket); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_entry *xe; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(buf, bucket_block(bucket, i), blocksize); + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + trace_ocfs2_defrag_xattr_bucket( + (unsigned long long)blkno, le16_to_cpu(xh->xh_count), + xh_free_start, le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + len = namevalue_size_xe(xe); + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto out; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(bucket_block(bucket, i), buf, blocksize); + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + +out: + kfree(bucket_buf); + return ret; +} + +/* + * prev_blkno points to the start of an existing extent. new_blkno + * points to a newly allocated extent. Because we know each of our + * clusters contains more than bucket, we can easily split one cluster + * at a bucket boundary. So we take the last cluster of the existing + * extent and split it down the middle. We move the last half of the + * buckets in the last cluster of the existing extent over to the new + * extent. 
+ * + * first_bh is the buffer at prev_blkno so we can update the existing + * extent's bucket count. header_bh is the bucket were we were hoping + * to insert our xattr. If the bucket move places the target in the new + * extent, we'll update first_bh and header_bh after modifying the old + * extent. + * + * first_hash will be set as the 1st xe's name_hash in the new extent. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u64 new_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int ret; + struct super_block *sb = inode->i_sb; + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + int to_move = num_buckets / 2; + u64 src_blkno; + u64 last_cluster_blkno = bucket_blkno(first) + + ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1)); + + BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); + + trace_ocfs2_mv_xattr_bucket_cross_cluster( + (unsigned long long)last_cluster_blkno, + (unsigned long long)new_blkno); + + ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), + last_cluster_blkno, new_blkno, + to_move, first_hash); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This is the first bucket that got moved */ + src_blkno = last_cluster_blkno + (to_move * blks_per_bucket); + + /* + * If the target bucket was part of the moved buckets, we need to + * update first and target. + */ + if (bucket_blkno(target) >= src_blkno) { + /* Find the block for the new target bucket */ + src_blkno = new_blkno + + (bucket_blkno(target) - src_blkno); + + ocfs2_xattr_bucket_relse(first); + ocfs2_xattr_bucket_relse(target); + + /* + * These shouldn't fail - the buffers are in the + * journal from ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_read_xattr_bucket(first, new_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_read_xattr_bucket(target, src_blkno); + if (ret) + mlog_errno(ret); + + } + +out: + return ret; +} + +/* + * Find the suitable pos when we divide a bucket into 2. + * We have to make sure the xattrs with the same hash value exist + * in the same bucket. + * + * If this ocfs2_xattr_header covers more than one hash value, find a + * place where the hash value changes. Try to find the most even split. + * The most common case is that all entries have different hash values, + * and the first check we make will find a place to split. + */ +static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh) +{ + struct ocfs2_xattr_entry *entries = xh->xh_entries; + int count = le16_to_cpu(xh->xh_count); + int delta, middle = count / 2; + + /* + * We start at the middle. Each step gets farther away in both + * directions. We therefore hit the change in hash value + * nearest to the middle. Note that this loop does not execute for + * count < 2. 
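+ * For example, with sorted hashes {1, 1, 2, 2} (count = 4, middle = 2), the + * first pass (delta == 0) compares entries[1] and entries[2], sees the hash + * change, and returns 2, splitting the bucket evenly.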
+ */ + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (cmp_xe(&entries[middle - delta - 1], + &entries[middle - delta])) + return middle - delta; + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == count) + continue; + + /* Now try delta past middle */ + if (cmp_xe(&entries[middle + delta], + &entries[middle + delta + 1])) + return middle + delta + 1; + } + + /* Every entry had the same hash */ + return count; +} + +/* + * Move some xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + * + * Normally half of the xattrs will be moved. But we have to make + * sure that the xattrs with the same hash value are stored in the + * same bucket. If all the xattrs in this bucket have the same hash + * value, the new bucket will be initialized as an empty one and the + * first_hash will be initialized as (hash_value+1). + */ +static int ocfs2_divide_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + int count, start, len, name_value_len = 0, name_offset = 0; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk, + (unsigned long long)new_blk); + + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(s_bucket, blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Even if !new_bucket_head, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the + * same part of ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + new_bucket_head ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(s_bucket); + count = le16_to_cpu(xh->xh_count); + start = ocfs2_xattr_find_divide_pos(xh); + + if (start == count) { + xe = &xh->xh_entries[start-1]; + + /* + * initialized a new empty bucket here. + * The hash value is set as one larger than + * that of the last entry in the previous bucket. + */ + for (i = 0; i < t_bucket->bu_blocks; i++) + memset(bucket_block(t_bucket, i), 0, blocksize); + + xh = bucket_xh(t_bucket); + xh->xh_free_start = cpu_to_le16(blocksize); + xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; + le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); + + goto set_num_buckets; + } + + /* copy the whole bucket to the new first. */ + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + + /* update the new bucket. */ + xh = bucket_xh(t_bucket); + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. 
+ */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + name_value_len += namevalue_size_xe(xe); + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + trace_ocfs2_divide_xattr_bucket_move(len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + +set_num_buckets: + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. If we + * just added a new empty bucket, there is no need to modify + * it. + */ + if (start == count) + goto out; + + xh = bucket_xh(s_bucket); + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_xattr_bucket_journal_dirty(handle, s_bucket); + +out: + ocfs2_xattr_bucket_free(s_bucket); + ocfs2_xattr_bucket_free(t_bucket); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; + + BUG_ON(s_blkno == t_blkno); + + trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno, + (unsigned long long)t_blkno, + t_is_new); + + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno); + if (ret) + goto out; + + /* + * Even if !t_is_new, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + if (ret) + goto out; + + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new + * cluster to fill, we came here from + * ocfs2_mv_xattr_buckets(), and it is really new - + * ACCESS_CREATE is required. 
But we also might have moved data + * out of t_bucket before extending back into it. + * ocfs2_add_new_xattr_bucket() can do this - its call to + * ocfs2_add_new_xattr_cluster() may have created a new extent + * and copied out the end of the old extent. Then it re-extends + * the old extent back to create space for new xattrs. That's + * how we get here, and the bucket isn't really new. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + t_is_new ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); + +out: + ocfs2_xattr_bucket_free(t_bucket); + ocfs2_xattr_bucket_free(s_bucket); + + return ret; +} + +/* + * src_blk points to the start of an existing extent. last_blk points to + * last cluster in that extent. to_blk points to a newly allocated + * extent. We copy the buckets from the cluster at last_blk to the new + * extent. If start_bucket is non-zero, we skip that many buckets before + * we start copying. The new extent's xh_num_buckets gets set to the + * number of buckets we copied. The old extent's xh_num_buckets shrinks + * by the same amount. + */ +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct ocfs2_xattr_bucket *old_first, *new_first; + + trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk, + (unsigned long long)to_blk); + + BUG_ON(start_bucket >= num_buckets); + if (start_bucket) { + num_buckets -= start_bucket; + last_blk += (start_bucket * blks_per_bucket); + } + + /* The first bucket of the original extent */ + old_first = ocfs2_xattr_bucket_new(inode); + /* The first bucket of the new extent */ + new_first = ocfs2_xattr_bucket_new(inode); + if (!old_first || !new_first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(old_first, src_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to update the first bucket of the old extent and all + * the buckets going to the new extent. + */ + credits = ((num_buckets + 1) * blks_per_bucket); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, old_first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + last_blk + (i * blks_per_bucket), + to_blk + (i * blks_per_bucket), + 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Get the new bucket ready before we dirty anything + * (This actually shouldn't fail, because we already dirtied + * it once in ocfs2_cp_xattr_bucket()). 
+ */ + ret = ocfs2_read_xattr_bucket(new_first, to_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_bucket_journal_access(handle, new_first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now update the headers */ + le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, old_first); + + bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, new_first); + + if (first_hash) + *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash); + +out: + ocfs2_xattr_bucket_free(new_first); + ocfs2_xattr_bucket_free(old_first); + return ret; +} + +/* + * Move some xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_divide_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_divide_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. 
+ */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u64 new_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret; + + trace_ocfs2_adjust_xattr_cross_cluster( + (unsigned long long)bucket_blkno(first), + (unsigned long long)new_blk, prev_clusters); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first, target, + new_blk, + prev_clusters, + v_start); + if (ret) + mlog_errno(ret); + } else { + /* The start of the last cluster in the first extent */ + u64 last_blk = bucket_blkno(first) + + ((prev_clusters - 1) * + ocfs2_clusters_to_blocks(inode->i_sb, 1)); + + if (prev_clusters > 1 && bucket_blkno(target) != last_blk) { + ret = ocfs2_mv_xattr_buckets(inode, handle, + bucket_blkno(first), + last_blk, new_blk, 0, + v_start); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_divide_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + if (ret) + mlog_errno(ret); + + if ((bucket_blkno(target) == last_blk) && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. 
+ */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u32 *num_clusters, + u32 prev_cpos, + int *extend, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = ctxt->handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + trace_ocfs2_add_new_xattr_cluster_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)bucket_blkno(first), + prev_cpos, prev_clusters); + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits); + + if (bucket_blkno(first) + (prev_clusters * bpc) == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first, + target, + block, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block, + v_start, num_bits); + ret = ocfs2_insert_extent(handle, &et, v_start, block, + num_bits, 0, ctxt->meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ocfs2_journal_dirty(handle, root_bh); + +leave: + return ret; +} + +/* + * We are given an extent. 'first' is the bucket at the very front of + * the extent. The extent has space for an additional bucket past + * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number + * of the target bucket. We wish to shift every bucket past the target + * down one, filling in that additional space. When we get back to the + * target, we split the target between itself and the now-empty bucket + * at target+1 (aka, target_blkno + blks_per_bucket). 
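The shifting is easy to picture with an ordinary array: walk backwards from the last existing slot to the target, copy each element one slot toward the end, then split the target into the slot that was freed. A minimal userspace sketch, with entry counts instead of real buckets and the split reduced to halving a number:

#include <stdio.h>

#define NBUCKETS 8

/* buckets[0..nr-1] are in use; make room at target + 1 and split into it. */
static void extend_bucket(int *buckets, int nr, int target)
{
        int end;

        /* Shift every bucket past the target one slot toward the end. */
        for (end = nr - 1; end != target; end--)
                buckets[end + 1] = buckets[end];

        /* Hypothetical split: half of the target's entries move to target + 1. */
        buckets[target + 1] = buckets[target] / 2;
        buckets[target]    -= buckets[target + 1];
}

int main(void)
{
        int entries[NBUCKETS] = { 10, 12, 9, 14, 0, 0, 0, 0 };
        int i;

        extend_bucket(entries, 4, 1);   /* extent had 4 buckets, target is bucket 1 */
        for (i = 0; i < 5; i++)
                printf("bucket %d: %d entries\n", i, entries[i]);
        return 0;
}

The backwards walk matters: copying from the end first means no bucket is overwritten before its contents have been moved, which is why the kernel loop below starts at end_blk and works toward target_blk.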
+ */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + u64 target_blk, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 end_blk; + u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); + + trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk, + (unsigned long long)bucket_blkno(first), + num_clusters, new_bucket); + + /* The extent must have room for an additional bucket */ + BUG_ON(new_bucket >= + (num_clusters * ocfs2_xattr_buckets_per_cluster(osb))); + + /* end_blk points to the last existing bucket */ + end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket); + + /* + * end_blk is the start of the last existing bucket. + * Thus, (end_blk - target_blk) covers the target bucket and + * every bucket after it up to, but not including, the last + * existing bucket. Then we add the last existing bucket, the + * new bucket, and the first bucket (3 * blk_per_bucket). + */ + credits = (end_blk - target_blk) + (3 * blk_per_bucket); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + while (end_blk != target_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto out; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in target_blkno to the next bucket. */ + ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk, + target_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1); + ocfs2_xattr_bucket_journal_dirty(handle, first); + +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets + * accordingly. xb_bh is the ocfs2_xattr_block, and target is the + * bucket we want to insert into. + * + * In the easy case, we will move all the buckets after target down by + * one. Half of target's xattrs will be moved to the next bucket. + * + * If current cluster is full, we'll allocate a new one. This may not + * be contiguous. The underlying calls will make sure that there is + * space for the insert, shifting buckets around if necessary. + * 'target' may be moved by those calls. 
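The "current cluster is full" test mentioned above boils down to comparing the bucket count recorded in the first bucket's header with the capacity of the extent. A tiny sketch with illustrative numbers:

#include <stdio.h>

static int need_new_cluster(unsigned buckets_in_use,
                            unsigned num_clusters,
                            unsigned buckets_per_cluster)
{
        return buckets_in_use == num_clusters * buckets_per_cluster;
}

int main(void)
{
        /* 2 clusters of 4 buckets each: full at 8 buckets, not at 7. */
        printf("%d\n", need_new_cluster(7, 2, 4));
        printf("%d\n", need_new_cluster(8, 2, 4));
        return 0;
}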
+ */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct ocfs2_xattr_bucket *target, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u32 name_hash = + le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + /* The bucket at the front of the extent */ + struct ocfs2_xattr_bucket *first; + + trace_ocfs2_add_new_xattr_bucket( + (unsigned long long)bucket_blkno(target)); + + /* The first bucket of the original extent */ + first = ocfs2_xattr_bucket_new(inode); + if (!first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(first, p_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) { + /* + * This can move first+target if the target bucket moves + * to the new extent. + */ + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + first, + target, + &num_clusters, + e_cpos, + &extend, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) { + ret = ocfs2_extend_xattr_bucket(inode, + ctxt->handle, + first, + bucket_blkno(target), + num_clusters); + if (ret) + mlog_errno(ret); + } + +out: + ocfs2_xattr_bucket_free(first); + + return ret; +} + +static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int offs) +{ + int block_off = offs >> inode->i_sb->s_blocksize_bits; + + offs = offs % inode->i_sb->s_blocksize; + return bucket_block(bucket, block_off) + offs; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int xe_off, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret, offset; + u64 value_blk; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + + vb.vb_bh = bucket->bu_bhs[value_blk]; + BUG_ON(!vb.vb_bh); + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (vb.vb_bh->b_data + offset % blocksize); + + /* + * From here on out we have to dirty the bucket. The generic + * value calls only modify one of the bucket's bhs, but we need + * to send the bucket at once. So if they error, they *could* have + * modified something. We have to assume they did, and dirty + * the whole bucket. 
This leaves us in a consistent state. + */ + trace_ocfs2_xattr_bucket_value_truncate( + (unsigned long long)bucket_blkno(bucket), xe_off, len); + ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xe->xe_value_size = cpu_to_le64(len); + + ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); + +out: + return ret; +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_delete_xattr_in_bucket, para); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + trace_ocfs2_rm_xattr_cluster( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, cpos, len); + + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, + len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + ocfs2_journal_dirty(handle, root_bh); + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * check whether the xattr bucket is filled up with the same hash value. + * If we want to insert the xattr with the same hash, return -ENOSPC. + * If we want to insert a xattr with different hash value, go ahead + * and ocfs2_divide_xattr_bucket will handle this. 
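Because entries inside a bucket are kept sorted by name hash, the check only has to compare the first and the last entry: if they carry the same hash, every entry in between does too, and splitting the bucket cannot create room for another entry with that hash. A standalone sketch, with a plain array standing in for xh_entries:

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

/* hashes[] is sorted, as the entries inside a bucket are. */
static int check_collision(const uint32_t *hashes, int count, uint32_t new_hash)
{
        if (new_hash != hashes[0])
                return 0;               /* different hash: a split can separate them */
        if (hashes[count - 1] == hashes[0])
                return -ENOSPC;         /* every entry already shares this hash */
        return 0;
}

int main(void)
{
        uint32_t full[]  = { 7, 7, 7, 7 };
        uint32_t mixed[] = { 7, 7, 9, 11 };

        printf("%d\n", check_collision(full, 4, 7));    /* -ENOSPC */
        printf("%d\n", check_collision(mixed, 4, 7));   /* 0: a divide still helps */
        return 0;
}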
+ */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + const char *name) +{ + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) + return 0; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket_blkno(bucket), + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +/* + * Try to set the entry in the current bucket. If we fail, the caller + * will handle getting us another bucket. + */ +static int ocfs2_xattr_set_entry_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_xa_loc loc; + + trace_ocfs2_xattr_set_entry_bucket(xi->xi_name); + + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Ok, we need space. Let's try defragmenting the bucket. */ + ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, + xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) + mlog_errno(ret); + + +out: + return ret; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + + trace_ocfs2_xattr_set_entry_index_block(xi->xi_name); + + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (!ret) + goto out; + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Ack, need more space. Let's try to get another bucket! */ + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, + xs->bucket, + xi->xi_name); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_add_new_xattr_bucket() will have updated + * xs->bucket if it moved, but it will not have updated + * any of the other search fields. Thus, we drop it and + * re-search. Everything should be cached, so it'll be + * quick. 
+ */ + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->xi_name_index, + xi->xi_name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + + /* Ok, we have a new bucket, let's try again */ + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (ret && (ret != -ENOSPC)) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, ref_credits; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u16 i; + struct ocfs2_xattr_entry *xe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; + int credits = ocfs2_remove_extent_credits(osb->sb) + + ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_xattr_value_root *xv; + struct ocfs2_rm_xattr_bucket_para *args = + (struct ocfs2_rm_xattr_bucket_para *)para; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, + i, &xv, NULL); + + ret = ocfs2_lock_xattr_remove_allocators(inode, xv, + args->ref_ci, + args->ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_value_truncate(inode, bucket, + i, 0, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } + if (ret) { + mlog_errno(ret); + break; + } + } + + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + return ret; +} + +/* + * Whenever we modify a xattr value root in the bucket(e.g, CoW + * or change the extent record flag), we need to recalculate + * the metaecc for the whole bucket. So it is done here. + * + * Note: + * We have to give the extra credits for the caller. + */ +static int ocfs2_xattr_bucket_post_refcount(struct inode *inode, + handle_t *handle, + void *para) +{ + int ret; + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + + return 0; +} + +/* + * Special action we need if the xattr value is refcounted. + * + * 1. If the xattr is refcounted, lock the tree. + * 2. CoW the xattr if we are setting the new value and the value + * will be stored outside. + * 3. In other case, decrease_refcount will work for us, so just + * lock the refcount tree, calculate the meta and credits is OK. + * + * We have to do CoW before ocfs2_init_xattr_set_ctxt since + * currently CoW is a completed transaction, while this function + * will also lock the allocators and let us deadlock. So we will + * CoW the whole xattr value. 
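Reduced to its two branches, the policy reads as below; the helpers are hypothetical stubs, inline_size plays the role of OCFS2_XATTR_INLINE_SIZE, and the sketch leaves out all the locking and credit bookkeeping the real function performs.

#include <stddef.h>
#include <stdio.h>

static void reserve_delete_credits(void) { puts("reserve meta/credits for truncate"); }
static void cow_whole_value(void)        { puts("CoW every cluster of the value"); }

/*
 * Setting a refcounted xattr: if the value is going away or will fit
 * inline, the later truncate path deals with the refcount tree, so only
 * credits are reserved here; otherwise the whole value is CoWed up front,
 * because the allocators cannot be taken again inside the set path.
 */
static void prepare_refcounted_set(const void *new_value, size_t new_len,
                                   size_t inline_size)
{
        if (!new_value || new_len <= inline_size)
                reserve_delete_credits();
        else
                cow_whole_value();
}

int main(void)
{
        prepare_refcounted_set(NULL, 0, 64);            /* delete */
        prepare_refcounted_set("x", 4096, 64);          /* large new value */
        return 0;
}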
+ */ +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_add, + int *credits) +{ + int ret = 0; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_entry *xe; + char *base; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + int name_offset, name_len; + struct ocfs2_xattr_value_buf vb; + struct ocfs2_xattr_bucket *bucket = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_post_refcount refcount; + struct ocfs2_post_refcount *p = NULL; + struct buffer_head *ref_root_bh = NULL; + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + vb.vb_bh = xis->inode_bh; + vb.vb_access = ocfs2_journal_access_di; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + base = bucket_block(xbs->bucket, block_off); + vb.vb_bh = xbs->bucket->bu_bhs[block_off]; + vb.vb_access = ocfs2_journal_access; + + if (ocfs2_meta_ecc(osb)) { + /*create parameters for ocfs2_post_refcount. */ + bucket = xbs->bucket; + refcount.credits = bucket->bu_blocks; + refcount.para = bucket; + refcount.func = + ocfs2_xattr_bucket_post_refcount; + p = &refcount; + } + } else { + base = xbs->base; + vb.vb_bh = xbs->xattr_bh; + vb.vb_access = ocfs2_journal_access_xb; + } + } + + if (ocfs2_xattr_is_local(xe)) + goto out; + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, &vb.vb_xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We just need to check the 1st extent record, since we always + * CoW the whole xattr. So there shouldn't be a xattr with + * some REFCOUNT extent recs after the 1st one. + */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * If we are deleting the xattr or the new size will be stored inside, + * cool, leave it there, the xattr truncate process will remove them + * for us(it still needs the refcount tree lock and the meta, credits). + * And the worse case is that every cluster truncate will split the + * refcount tree, and make the original extent become 3. So we will need + * 2 * cluster more extent recs at most. + */ + if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) { + + ret = ocfs2_refcounted_xattr_delete_need(inode, + &(*ref_tree)->rf_ci, + ref_root_bh, vb.vb_xv, + meta_add, credits); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_cow_xattr(inode, di, &vb, + *ref_tree, ref_root_bh, 0, + le32_to_cpu(vb.vb_xv->xr_clusters), p); + if (ret) + mlog_errno(ret); + +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root. 
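The function below walks the value's extent list one mapped run at a time, skipping runs that are already refcounted; the same cpos loop reappears later in the reflink path. A self-contained sketch of the pattern, with an invented run table instead of ocfs2_xattr_get_clusters():

#include <stdint.h>
#include <stdio.h>

struct run { uint32_t p_cluster, len; unsigned refcounted; };

/* Hypothetical stand-in for ocfs2_xattr_get_clusters(): map cpos to its run.
 * The caller guarantees cpos is below the total cluster count. */
static struct run get_run(const struct run *map, uint32_t cpos)
{
        uint32_t off = 0;

        for (;; map++) {
                if (cpos < off + map->len)
                        return *map;
                off += map->len;
        }
}

static void attach_refcount(const struct run *map, uint32_t clusters)
{
        uint32_t cpos = 0;

        while (cpos < clusters) {
                struct run r = get_run(map, cpos);

                cpos += r.len;
                if (r.refcounted)       /* already shared: nothing to do */
                        continue;
                printf("refcount %u clusters at %u\n", r.len, r.p_cluster);
        }
}

int main(void)
{
        struct run map[] = { { 100, 2, 0 }, { 200, 3, 1 }, { 300, 1, 0 } };

        attach_refcount(map, 6);
        return 0;
}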
+ * The physical clusters will be added to refcount tree. + */ +static int ocfs2_xattr_value_attach_refcount(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *value_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *refcount) +{ + int ret = 0; + u32 clusters = le32_to_cpu(xv->xr_clusters); + u32 cpos, p_cluster, num_clusters; + struct ocfs2_extent_list *el = &xv->xr_list; + unsigned int ext_flags; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, &ext_flags); + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED)) + continue; + + BUG_ON(!p_cluster); + + ret = ocfs2_add_refcount_flag(inode, value_et, + ref_ci, ref_root_bh, + cpos - num_clusters, + p_cluster, num_clusters, + dealloc, refcount); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +/* + * Given a normal ocfs2_xattr_header, refcount all the entries which + * have value stored outside. + * Used for xattrs stored in inode and ocfs2_xattr_block. + */ +static int ocfs2_xattr_attach_refcount_normal(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree et; + int i, ret = 0; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + xe = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + xv = (struct ocfs2_xattr_value_root *)((void *)header + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + vb->vb_xv = xv; + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et, + ref_ci, ref_root_bh, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_xattr_inline_attach_refcount(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *) + (fe_bh->b_data + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + struct ocfs2_xattr_value_buf vb = { + .vb_bh = fe_bh, + .vb_access = ocfs2_journal_access_di, + }; + + return ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, dealloc); +} + +struct ocfs2_xattr_tree_value_refcount_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh) +{ + int ret, block_off, name_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + void *base; + + ret = ocfs2_xattr_bucket_get_name_value(sb, + bucket_xh(bucket), + offset, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + + base = bucket_block(bucket, block_off); + + *xv = (struct ocfs2_xattr_value_root *)(base + name_offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (bh) + *bh = 
bucket->bu_bhs[block_off]; +out: + return ret; +} + +/* + * For a given xattr bucket, refcount all the entries which + * have value stored outside. + */ +static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int i, ret = 0; + struct ocfs2_extent_tree et; + struct ocfs2_xattr_tree_value_refcount_para *ref = + (struct ocfs2_xattr_tree_value_refcount_para *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + struct ocfs2_post_refcount refcount = { + .credits = bucket->bu_blocks, + .para = bucket, + .func = ocfs2_xattr_bucket_post_refcount, + }; + struct ocfs2_post_refcount *p = NULL; + + /* We only need post_refcount if we support metaecc. */ + if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) + p = &refcount; + + trace_ocfs2_xattr_bucket_value_refcount( + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, + &vb.vb_xv, &vb.vb_bh); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_init_xattr_value_extent_tree(&et, + INODE_CACHE(inode), &vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv, + &et, ref->ref_ci, + ref->ref_root_bh, + ref->dealloc, p); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; + +} + +static int ocfs2_refcount_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_xattr_bucket_value_refcount, + para); +} + +static int ocfs2_xattr_block_attach_refcount(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, + dealloc); + } else { + struct ocfs2_xattr_tree_value_refcount_para para = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + .dealloc = dealloc, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_refcount_xattr_tree_rec, + ¶); + } + + return ret; +} + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct buffer_head *blk_bh = NULL; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh, + ref_ci, ref_root_bh, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!di->i_xattr_loc) + goto out; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci, + 
ref_root_bh, dealloc); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); +out: + + return ret; +} + +typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe); +/* + * Store the information we need in xattr reflink. + * old_bh and new_bh are inode bh for the old and new inode. + */ +struct ocfs2_xattr_reflink { + struct inode *old_inode; + struct inode *new_inode; + struct buffer_head *old_bh; + struct buffer_head *new_bh; + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; + should_xattr_reflinked *xattr_reflinked; +}; + +/* + * Given a xattr header and xe offset, + * return the proper xv and the corresponding bh. + * xattr in inode, block and xattr tree have different implementaions. + */ +typedef int (get_xattr_value_root)(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para); + +/* + * Calculate all the xattr value root metadata stored in this xattr header and + * credits we need if we create them from the scratch. + * We use get_xattr_value_root so that all types of xattr container can use it. + */ +static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int *metas, int *credits, + int *num_recs, + get_xattr_value_root *func, + void *para) +{ + int i, ret = 0; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + *metas += le16_to_cpu(xv->xr_list.l_tree_depth) * + le16_to_cpu(xv->xr_list.l_next_free_rec); + + *credits += ocfs2_calc_extend_credits(sb, + &def_xv.xv.xr_list, + le32_to_cpu(xv->xr_clusters)); + + /* + * If the value is a tree with depth > 1, We don't go deep + * to the extent block, so just calculate a maximum record num. + */ + if (!xv->xr_list.l_tree_depth) + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); + else + *num_recs += ocfs2_clusters_for_bytes(sb, + XATTR_SIZE_MAX); + } + + return ret; +} + +/* Used by xattr inode and block to return the right xv and buffer_head. */ +static int ocfs2_get_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + + *xv = (struct ocfs2_xattr_value_root *)((void *)xh + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (ret_bh) + *ret_bh = bh; + + return 0; +} + +/* + * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * It is only used for inline xattr and xattr block. 
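The worst-case estimate made below is easier to see as plain arithmetic. In the sketch, recs_per_block and expand_credits correspond to ocfs2_refcount_recs_per_rb() and OCFS2_EXPAND_REFCOUNT_TREE_CREDITS, but the values passed in main() are invented, and the rounding differs slightly between the two places this estimate appears in the file.

#include <stdio.h>

/*
 * Worst case, every batch of new records forces a refcount block split,
 * and a freshly split block is only half usable, so budget two new blocks
 * per block's worth of records.
 */
static void estimate_refcount_change(int num_recs, int recs_per_block,
                                     int expand_credits,
                                     int *meta_add, int *credits)
{
        int new_blocks = num_recs / recs_per_block * 2;

        *meta_add += new_blocks;
        *credits  += new_blocks + new_blocks * expand_credits;
}

int main(void)
{
        int meta = 0, credits = 0;

        estimate_refcount_change(300, 100, 3, &meta, &credits);
        printf("meta blocks %d, journal credits %d\n", meta, credits);
        return 0;
}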
+ */ +static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, + struct ocfs2_xattr_header *xh, + struct buffer_head *ref_root_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, meta_add = 0, num_recs = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + *credits = 0; + + ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh, + &meta_add, credits, &num_recs, + ocfs2_get_xattr_value_root, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + */ + num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2; + meta_add += num_recs; + *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given a xattr header, reflink all the xattrs in this container. + * It can be used for inode, block and bucket. + * + * NOTE: + * Before we call this function, the caller has memcpy the xattr in + * old_xh to the new_xh. + * + * If args.xattr_reflinked is set, call it to decide whether the xe should + * be reflinked or not. If not, remove it from the new xattr header. + */ +static int ocfs2_reflink_xattr_header(handle_t *handle, + struct ocfs2_xattr_reflink *args, + struct buffer_head *old_bh, + struct ocfs2_xattr_header *xh, + struct buffer_head *new_bh, + struct ocfs2_xattr_header *new_xh, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_alloc_context *meta_ac, + get_xattr_value_root *func, + void *para) +{ + int ret = 0, i, j; + struct super_block *sb = args->old_inode->i_sb; + struct buffer_head *value_bh; + struct ocfs2_xattr_entry *xe, *last; + struct ocfs2_xattr_value_root *xv, *new_xv; + struct ocfs2_extent_tree data_et; + u32 clusters, cpos, p_cluster, num_clusters; + unsigned int ext_flags = 0; + + trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr, + le16_to_cpu(xh->xh_count)); + + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { + xe = &xh->xh_entries[i]; + + if (args->xattr_reflinked && !args->xattr_reflinked(xe)) { + xe = &new_xh->xh_entries[j]; + + le16_add_cpu(&new_xh->xh_count, -1); + if (new_xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } + + /* + * We don't want j to increase in the next round since + * it is already moved ahead. + */ + j--; + continue; + } + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, old_bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * For the xattr which has l_tree_depth = 0, all the extent + * recs have already be copied to the new xh with the + * propriate OCFS2_EXT_REFCOUNTED flag we just need to + * increase the refount count int the refcount tree. 
+ * + * For the xattr which has l_tree_depth > 0, we need + * to initialize it to the empty default value root, + * and then insert the extents one by one. + */ + if (xv->xr_list.l_tree_depth) { + memcpy(new_xv, &def_xv, sizeof(def_xv)); + vb->vb_xv = new_xv; + vb->vb_bh = value_bh; + ocfs2_init_xattr_value_extent_tree(&data_et, + INODE_CACHE(args->new_inode), vb); + } + + clusters = le32_to_cpu(xv->xr_clusters); + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(args->old_inode, + cpos, + &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!p_cluster); + + if (xv->xr_list.l_tree_depth) { + ret = ocfs2_insert_extent(handle, + &data_et, cpos, + ocfs2_clusters_to_blocks( + args->old_inode->i_sb, + p_cluster), + num_clusters, ext_flags, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_increase_refcount(handle, args->ref_ci, + args->ref_root_bh, + p_cluster, num_clusters, + meta_ac, args->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + } + } + +out: + return ret; +} + +static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data; + int inline_size = le16_to_cpu(di->i_xattr_inline_size); + int header_off = osb->sb->s_blocksize - inline_size; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *) + (args->old_bh->b_data + header_off); + struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *) + (args->new_bh->b_data + header_off); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_inode_info *new_oi; + struct ocfs2_dinode *new_di; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = args->new_bh, + .vb_access = ocfs2_journal_access_di, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode), + args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(args->new_bh->b_data + header_off, + args->old_bh->b_data + header_off, inline_size); + + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + new_di->i_xattr_inline_size = cpu_to_le16(inline_size); + + ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh, + args->new_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_oi = OCFS2_I(args->new_inode); + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_create_empty_xattr_block(struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt; + + memset(&ctxt, 0, sizeof(ctxt)); + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, 
&ctxt.meta_ac); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto out; + } + + trace_ocfs2_create_empty_xattr_block( + (unsigned long long)fe_bh->b_blocknr, indexed); + ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, + ret_bh); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, ctxt.handle); +out: + ocfs2_free_alloc_context(ctxt.meta_ac); + return ret; +} + +static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode); + struct ocfs2_dinode *new_di; + struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb); + int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_block *new_xb = + (struct ocfs2_xattr_block *)new_blk_bh->b_data; + struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = new_blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* One more credits in case we need to add xattr flags in new inode. */ + handle = ocfs2_start_trans(osb, credits + 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + ret = ocfs2_journal_access_di(handle, + INODE_CACHE(args->new_inode), + args->new_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode), + new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off, + osb->sb->s_blocksize - header_off); + + ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh, + new_blk_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_journal_dirty(handle, new_blk_bh); + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +struct ocfs2_reflink_xattr_tree_args { + struct ocfs2_xattr_reflink *reflink; + struct buffer_head *old_blk_bh; + struct buffer_head *new_blk_bh; + struct ocfs2_xattr_bucket *old_bucket; + struct ocfs2_xattr_bucket *new_bucket; +}; + +/* + * NOTE: + * We have to handle the case that both old bucket and new bucket + * will call this function to get the right ret_bh. + * So The caller must give us the right bh. 
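In practice that means the callback distinguishes source from destination purely by the buffer head it is handed, as in this hypothetical fragment:

struct bucket { void *first_block; /* first bh of the bucket */ };

/* Pick the bucket whose first block matches the bh the caller passed in. */
struct bucket *bucket_for_bh(void *bh, struct bucket *old_bucket,
                             struct bucket *new_bucket)
{
        return bh == old_bucket->first_block ? old_bucket : new_bucket;
}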
+ */ +static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_xattr_bucket *bucket; + + if (bh == args->old_bucket->bu_bhs[0]) + bucket = args->old_bucket; + else + bucket = args->new_bucket; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +struct ocfs2_value_tree_metas { + int num_metas; + int credits; + int num_recs; +}; + +static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +static int ocfs2_calc_value_tree_metas(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + struct ocfs2_value_tree_metas *metas = + (struct ocfs2_value_tree_metas *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + + /* Add the credits for this bucket first. */ + metas->credits += bucket->bu_blocks; + return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0], + xh, &metas->num_metas, + &metas->credits, &metas->num_recs, + ocfs2_value_tree_metas_in_bucket, + bucket); +} + +/* + * Given a xattr extent rec starting from blkno and having len clusters, + * iterate all the buckets calculate how much metadata we need for reflinking + * all the ocfs2_xattr_value_root and lock the allocators accordingly. + */ +static int ocfs2_lock_reflink_xattr_rec_allocators( + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *xt_et, + u64 blkno, u32 len, int *credits, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac) +{ + int ret, num_free_extents; + struct ocfs2_value_tree_metas metas; + struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb); + struct ocfs2_refcount_block *rb; + + memset(&metas, 0, sizeof(metas)); + + ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len, + ocfs2_calc_value_tree_metas, &metas); + if (ret) { + mlog_errno(ret); + goto out; + } + + *credits = metas.credits; + + /* + * Calculate we need for refcount tree change. + * + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + * In the end, we have to add credits for modifying the already + * existed refcount block. + */ + rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data; + metas.num_recs = + (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) / + ocfs2_refcount_recs_per_rb(osb->sb) * 2; + metas.num_metas += metas.num_recs; + *credits += metas.num_recs + + metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + /* count in the xattr tree change. 
*/ + num_free_extents = ocfs2_num_free_extents(osb, xt_et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < len) + metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); + + *credits += ocfs2_calc_extend_credits(osb->sb, + xt_et->et_root_el, len); + + if (metas.num_metas) { + ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (len) { + ret = ocfs2_reserve_clusters(osb, len, data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_reflink_xattr_bucket(handle_t *handle, + u64 blkno, u64 new_blkno, u32 clusters, + u32 *cpos, int num_buckets, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_reflink_xattr_tree_args *args) +{ + int i, j, ret = 0; + struct super_block *sb = args->reflink->old_inode->i_sb; + int bpb = args->old_bucket->bu_blocks; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) { + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + for (j = 0; j < bpb; j++) + memcpy(bucket_block(args->new_bucket, j), + bucket_block(args->old_bucket, j), + sb->s_blocksize); + + /* + * Record the start cpos so that we can use it to initialize + * our xattr tree we also set the xh_num_bucket for the new + * bucket. + */ + if (i == 0) { + *cpos = le32_to_cpu(bucket_xh(args->new_bucket)-> + xh_entries[0].xe_name_hash); + bucket_xh(args->new_bucket)->xh_num_buckets = + cpu_to_le16(num_buckets); + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ret = ocfs2_reflink_xattr_header(handle, args->reflink, + args->old_bucket->bu_bhs[0], + bucket_xh(args->old_bucket), + args->new_bucket->bu_bhs[0], + bucket_xh(args->new_bucket), + &vb, meta_ac, + ocfs2_get_reflink_xattr_value_root, + args); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * Re-access and dirty the bucket to calculate metaecc. + * Because we may extend the transaction in reflink_xattr_header + * which will let the already accessed block gone. 
+ */ + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + } + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + struct inode *inode, + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + u64 blkno, u32 cpos, u32 len) +{ + int ret, first_inserted = 0; + u32 p_cluster, num_clusters, reflink_cpos = 0; + u64 new_blkno; + unsigned int num_buckets, reflink_buckets; + unsigned int bpc = + ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets); + ocfs2_xattr_bucket_relse(args->old_bucket); + + while (len && num_buckets) { + ret = ocfs2_claim_clusters(handle, data_ac, + 1, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + reflink_buckets = min(num_buckets, bpc * num_clusters); + + ret = ocfs2_reflink_xattr_bucket(handle, blkno, + new_blkno, num_clusters, + &reflink_cpos, reflink_buckets, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * For the 1st allocated cluster, we make it use the same cpos + * so that the xattr tree looks the same as the original one + * in the most case. + */ + if (!first_inserted) { + reflink_cpos = cpos; + first_inserted = 1; + } + ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno, + num_clusters, 0, meta_ac); + if (ret) + mlog_errno(ret); + + trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno, + num_clusters, reflink_cpos); + + len -= num_clusters; + blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + num_buckets -= reflink_buckets; + } +out: + return ret; +} + +/* + * Create the same xattr extent record in the new inode's xattr tree. 
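ocfs2_reflink_xattr_rec below is one of several per-extent-record callbacks in this file (ocfs2_rm_xattr_cluster and ocfs2_refcount_xattr_tree_rec are the others); ocfs2_iterate_xattr_index_block drives them, handing each record's block number, cpos and length plus an opaque para pointer to the callback. A stripped-down sketch of that contract, with an in-memory record table instead of the on-disk tree:

#include <stdint.h>
#include <stdio.h>

struct xattr_rec { uint64_t blkno; uint32_t cpos, len; };

typedef int (xattr_rec_func)(uint64_t blkno, uint32_t cpos, uint32_t len,
                             void *para);

/* Hypothetical iterator: call func once per extent record, stop on error. */
static int iterate_recs(const struct xattr_rec *recs, int nr,
                        xattr_rec_func *func, void *para)
{
        int i, ret = 0;

        for (i = 0; i < nr; i++) {
                ret = func(recs[i].blkno, recs[i].cpos, recs[i].len, para);
                if (ret)
                        break;
        }
        return ret;
}

static int print_rec(uint64_t blkno, uint32_t cpos, uint32_t len, void *para)
{
        printf("%s: rec at cpos %u, %u clusters, block %llu\n",
               (const char *)para, cpos, len, (unsigned long long)blkno);
        return 0;
}

int main(void)
{
        struct xattr_rec recs[] = { { 4096, 0, 2 }, { 8192, 5, 1 } };

        return iterate_recs(recs, 2, print_rec, "reflink");
}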
+ */ +static int ocfs2_reflink_xattr_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret, credits = 0; + handle_t *handle; + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_extent_tree et; + + trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len); + + ocfs2_init_xattr_tree_extent_tree(&et, + INODE_CACHE(args->reflink->new_inode), + args->new_blk_bh); + + ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno, + len, &credits, + &meta_ac, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et, + meta_ac, data_ac, + blkno, cpos, len); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + return ret; +} + +/* + * Create reflinked xattr buckets. + * We will add bucket one by one, and refcount all the xattrs in the bucket + * if they are stored outside. + */ +static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret; + struct ocfs2_reflink_xattr_tree_args para; + + memset(¶, 0, sizeof(para)); + para.reflink = args; + para.old_blk_bh = blk_bh; + para.new_blk_bh = new_blk_bh; + + para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode); + if (!para.old_bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode); + if (!para.new_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh, + ocfs2_reflink_xattr_rec, + ¶); + if (ret) + mlog_errno(ret); + +out: + ocfs2_xattr_bucket_free(para.old_bucket); + ocfs2_xattr_bucket_free(para.new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh) +{ + int ret, indexed = 0; + struct buffer_head *new_blk_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) + indexed = 1; + + ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh, + &new_blk_bh, indexed); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!indexed) + ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); + else + ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_blk_bh); + return ret; +} + +static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe) +{ + int type = ocfs2_xattr_get_type(xe); + + return type != OCFS2_XATTR_INDEX_SECURITY && + type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS && + type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; +} + +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security) +{ + int ret; + struct ocfs2_xattr_reflink args; + struct ocfs2_inode_info *oi = OCFS2_I(old_inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct 
ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh = NULL; + + ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dealloc_ctxt(&dealloc); + + args.old_inode = old_inode; + args.new_inode = new_inode; + args.old_bh = old_bh; + args.new_bh = new_bh; + args.ref_ci = &ref_tree->rf_ci; + args.ref_root_bh = ref_root_bh; + args.dealloc = &dealloc; + if (preserve_security) + args.xattr_reflinked = NULL; + else + args.xattr_reflinked = ocfs2_reflink_xattr_no_security; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_reflink_xattr_inline(&args); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!di->i_xattr_loc) + goto out_unlock; + + ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_reflink_xattr_in_block(&args, blk_bh); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); + +out_unlock: + ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb), + ref_tree, 1); + brelse(ref_root_bh); + + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc); + } + +out: + return ret; +} + +/* + * Initialize security and acl for a already created inode. + * Used for reflink a non-preserve-security file. + * + * It uses common api like ocfs2_xattr_set, so the caller + * must not hold any lock expect i_mutex. + */ +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode, + const struct qstr *qstr) +{ + int ret = 0; + struct buffer_head *dir_bh = NULL; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + + ret = ocfs2_init_security_get(inode, dir, qstr, &si); + if (!ret) { + ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + si.name, si.value, si.value_len, + XATTR_CREATE); + if (ret) { + mlog_errno(ret); + goto leave; + } + } else if (ret != -EOPNOTSUPP) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_inode_lock(dir, &dir_bh, 0); + if (ret) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); + + ocfs2_inode_unlock(dir, 0); + brelse(dir_bh); +leave: + return ret; +} +/* + * 'security' attributes support + */ +static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int ocfs2_init_security_get(struct inode *inode, + 
struct inode *dir, + const struct qstr *qstr, + struct ocfs2_security_xattr_info *si) +{ + /* check whether ocfs2 support feature xattr */ + if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) + return -EOPNOTSUPP; + return security_inode_init_security(inode, dir, qstr, &si->name, + &si->value, &si->value_len); +} + +int ocfs2_init_security_set(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_security_xattr_info *si, + struct ocfs2_alloc_context *xattr_ac, + struct ocfs2_alloc_context *data_ac) +{ + return ocfs2_xattr_set_handle(handle, inode, di_bh, + OCFS2_XATTR_INDEX_SECURITY, + si->name, si->value, si->value_len, 0, + xattr_ac, data_ac); +} + +const struct xattr_handler ocfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ocfs2_xattr_security_list, + .get = ocfs2_xattr_security_get, + .set = ocfs2_xattr_security_set, +}; + +/* + * 'trusted' attributes support + */ +static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + +/* + * 'user' attributes support + */ +static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + name, value, size, flags); +} + +const 
struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h new file mode 100644 index 00000000..d63cfb72 --- /dev/null +++ b/fs/ocfs2/xattr.h @@ -0,0 +1,100 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include +#include + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +struct ocfs2_security_xattr_info { + int enable; + char *name; + void *value; + size_t value_len; +}; + +extern const struct xattr_handler ocfs2_xattr_user_handler; +extern const struct xattr_handler ocfs2_xattr_trusted_handler; +extern const struct xattr_handler ocfs2_xattr_security_handler; +extern const struct xattr_handler ocfs2_xattr_acl_access_handler; +extern const struct xattr_handler ocfs2_xattr_acl_default_handler; +extern const struct xattr_handler *ocfs2_xattr_handlers[]; + +ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, + const char *, void *, size_t); +int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, + int, const char *, const void *, size_t, int, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di); +int ocfs2_xattr_remove(struct inode *, struct buffer_head *); +int ocfs2_init_security_get(struct inode *, struct inode *, + const struct qstr *, + struct ocfs2_security_xattr_info *); +int ocfs2_init_security_set(handle_t *, struct inode *, + struct buffer_head *, + struct ocfs2_security_xattr_info *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_calc_security_init(struct inode *, + struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); +int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, + int, struct ocfs2_security_xattr_info *, + int *, int *, int *); + +/* + * xattrs can live inside an inode, as part of an external xattr block, + * or inside an xattr bucket, which is the leaf of a tree rooted in an + * xattr block. Some of the xattr calls, especially the value setting + * functions, want to treat each of these locations as equal. Let's wrap + * them in a structure that we can pass around instead of raw buffer_heads. 
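+ * vb_bh is whichever buffer actually holds the value root (inode, xattr
+ * block or bucket leaf), and vb_access is the matching
+ * ocfs2_journal_access_* variant used to dirty that buffer under a handle.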
+ */ +struct ocfs2_xattr_value_buf { + struct buffer_head *vb_bh; + ocfs2_journal_access_func vb_access; + struct ocfs2_xattr_value_root *vb_xv; +}; + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security); +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode, + const struct qstr *qstr); +#endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig new file mode 100644 index 00000000..b1b9a0ab --- /dev/null +++ b/fs/omfs/Kconfig @@ -0,0 +1,13 @@ +config OMFS_FS + tristate "SonicBlue Optimized MPEG File System support" + depends on BLOCK + select CRC_ITU_T + help + This is the proprietary file system used by the Rio Karma music + player and ReplayTV DVR. Despite the name, this filesystem is not + more efficient than a standard FS for MPEG files, in fact likely + the opposite is true. Say Y if you have either of these devices + and wish to mount its disk. + + To compile this file system support as a module, choose M here: the + module will be called omfs. If unsure, say N. diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile new file mode 100644 index 00000000..8b82b63f --- /dev/null +++ b/fs/omfs/Makefile @@ -0,0 +1,4 @@ + +obj-$(CONFIG_OMFS_FS) += omfs.o + +omfs-y := bitmap.o dir.o file.o inode.o diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c new file mode 100644 index 00000000..08223458 --- /dev/null +++ b/fs/omfs/bitmap.c @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include "omfs.h" + +unsigned long omfs_count_free(struct super_block *sb) +{ + unsigned int i; + unsigned long sum = 0; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int nbits = sb->s_blocksize * 8; + + for (i = 0; i < sbi->s_imap_size; i++) + sum += nbits - bitmap_weight(sbi->s_imap[i], nbits); + + return sum; +} + +/* + * Counts the run of zero bits starting at bit up to max. + * It handles the case where a run might spill over a buffer. + * Called with bitmap lock. + */ +static int count_run(unsigned long **addr, int nbits, + int addrlen, int bit, int max) +{ + int count = 0; + int x; + + for (; addrlen > 0; addrlen--, addr++) { + x = find_next_bit(*addr, nbits, bit); + count += x - bit; + + if (x < nbits || count > max) + return min(count, max); + + bit = 0; + } + return min(count, max); +} + +/* + * Sets or clears the run of count bits starting with bit. + * Called with bitmap lock. + */ +static int set_run(struct super_block *sb, int map, + int nbits, int bit, int count, int set) +{ + int i; + int err; + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + + err = -ENOMEM; + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + for (i = 0; i < count; i++, bit++) { + if (bit >= nbits) { + bit = 0; + map++; + + mark_buffer_dirty(bh); + brelse(bh); + bh = sb_bread(sb, + clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + } + if (set) { + set_bit(bit, sbi->s_imap[map]); + set_bit(bit, (unsigned long *)bh->b_data); + } else { + clear_bit(bit, sbi->s_imap[map]); + clear_bit(bit, (unsigned long *)bh->b_data); + } + } + mark_buffer_dirty(bh); + brelse(bh); + err = 0; +out: + return err; +} + +/* + * Tries to allocate exactly one block. Returns true if successful. 
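+ * The block number is split into a bitmap-buffer index and a bit offset
+ * within that buffer (bits_per_entry = 8 * s_blocksize); for example, with
+ * a 2048-byte block size, block 20000 falls in buffer 20000 / 16384 = 1 at
+ * bit 20000 % 16384 = 3616.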
+ */ +int omfs_allocate_block(struct super_block *sb, u64 block) +{ + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + unsigned int map, bit; + int ret = 0; + u64 tmp; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + mutex_lock(&sbi->s_bitmap_lock); + if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map])) + goto out; + + if (sbi->s_bitmap_ino > 0) { + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + set_bit(bit, (unsigned long *)bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + } + ret = 1; +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + + +/* + * Tries to allocate a set of blocks. The request size depends on the + * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file + * blocks, we try to allocate sbi->s_clustersize, but can always get away + * with just one block. + */ +int omfs_allocate_range(struct super_block *sb, + int min_request, + int max_request, + u64 *return_block, + int *return_size) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + int ret = 0; + int i, run, bit; + + mutex_lock(&sbi->s_bitmap_lock); + for (i = 0; i < sbi->s_imap_size; i++) { + bit = 0; + while (bit < bits_per_entry) { + bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry, + bit); + + if (bit == bits_per_entry) + break; + + run = count_run(&sbi->s_imap[i], bits_per_entry, + sbi->s_imap_size-i, bit, max_request); + + if (run >= min_request) + goto found; + bit += run; + } + } + ret = -ENOSPC; + goto out; + +found: + *return_block = i * bits_per_entry + bit; + *return_size = run; + ret = set_run(sb, i, bits_per_entry, bit, run, 1); + +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + +/* + * Clears count bits starting at a given block. + */ +int omfs_clear_range(struct super_block *sb, u64 block, int count) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + u64 tmp; + unsigned int map, bit; + int ret; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + if (map >= sbi->s_imap_size) + return 0; + + mutex_lock(&sbi->s_bitmap_lock); + ret = set_run(sb, map, bits_per_entry, bit, count, 0); + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c new file mode 100644 index 00000000..3b8d3979 --- /dev/null +++ b/fs/omfs/dir.c @@ -0,0 +1,475 @@ +/* + * OMFS (as used by RIO Karma) directory operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. + */ + +#include +#include +#include +#include "omfs.h" + +static int omfs_hash(const char *name, int namelen, int mod) +{ + int i, hash = 0; + for (i = 0; i < namelen; i++) + hash ^= tolower(name[i]) << (i % 24); + return hash % mod; +} + +/* + * Finds the bucket for a given name and reads the containing block; + * *ofs is set to the offset of the first list entry. 
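+ * A directory body is an array of (i_size - OMFS_DIR_START) / 8 hash
+ * buckets, each holding a big-endian 64-bit block number that heads a
+ * sibling chain, so a bucket's slot lives at OMFS_DIR_START + bucket * 8.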
+ */ +static struct buffer_head *omfs_get_bucket(struct inode *dir, + const char *name, int namelen, int *ofs) +{ + int nbuckets = (dir->i_size - OMFS_DIR_START)/8; + int bucket = omfs_hash(name, namelen, nbuckets); + + *ofs = OMFS_DIR_START + bucket * 8; + return omfs_bread(dir->i_sb, dir->i_ino); +} + +static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, + const char *name, int namelen, + u64 *prev_block) +{ + struct buffer_head *bh; + struct omfs_inode *oi; + int err = -ENOENT; + *prev_block = ~0; + + while (block != ~0) { + bh = omfs_bread(dir->i_sb, block); + if (!bh) { + err = -EIO; + goto err; + } + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) { + brelse(bh); + goto err; + } + + if (strncmp(oi->i_name, name, namelen) == 0) + return bh; + + *prev_block = block; + block = be64_to_cpu(oi->i_sibling); + brelse(bh); + } +err: + return ERR_PTR(err); +} + +static struct buffer_head *omfs_find_entry(struct inode *dir, + const char *name, int namelen) +{ + struct buffer_head *bh; + int ofs; + u64 block, dummy; + + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + return ERR_PTR(-EIO); + + block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs])); + brelse(bh); + + return omfs_scan_list(dir, block, name, namelen, &dummy); +} + +int omfs_make_empty(struct inode *inode, struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + struct omfs_inode *oi; + + bh = omfs_bread(sb, inode->i_ino); + if (!bh) + return -ENOMEM; + + memset(bh->b_data, 0, sizeof(struct omfs_inode)); + + if (inode->i_mode & S_IFDIR) { + memset(&bh->b_data[OMFS_DIR_START], 0xff, + sbi->s_sys_blocksize - OMFS_DIR_START); + } else + omfs_make_empty_table(bh, OMFS_EXTENT_START); + + oi = (struct omfs_inode *) bh->b_data; + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + oi->i_sibling = ~cpu_to_be64(0ULL); + + mark_buffer_dirty(bh); + brelse(bh); + return 0; +} + +static int omfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh; + u64 block; + __be64 *entry; + int ofs; + + /* just prepend to head of queue in proper bucket */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + *entry = cpu_to_be64(inode->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + /* now set the sibling and parent pointers on the new inode */ + bh = omfs_bread(dir->i_sb, inode->i_ino); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + memcpy(oi->i_name, name, namelen); + memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen); + oi->i_sibling = cpu_to_be64(block); + oi->i_parent = cpu_to_be64(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + dir->i_ctime = CURRENT_TIME_SEC; + + /* mark affected inodes dirty to rebuild checksums */ + mark_inode_dirty(dir); + mark_inode_dirty(inode); + return 0; +out: + return -ENOMEM; +} + +static int omfs_delete_entry(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *dirty; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh, *bh2; + __be64 *entry, next; + u64 block, prev; + int ofs; + int err = -ENOMEM; + + /* delete the proper node in the bucket's linked list */ + bh = omfs_get_bucket(dir, 
name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + + bh2 = omfs_scan_list(dir, block, name, namelen, &prev); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto out_free_bh; + } + + oi = (struct omfs_inode *) bh2->b_data; + next = oi->i_sibling; + brelse(bh2); + + if (prev != ~0) { + /* found in middle of list, get list ptr */ + brelse(bh); + bh = omfs_bread(dir->i_sb, prev); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + entry = &oi->i_sibling; + } + + *entry = next; + mark_buffer_dirty(bh); + + if (prev != ~0) { + dirty = omfs_iget(dir->i_sb, prev); + if (!IS_ERR(dirty)) { + mark_inode_dirty(dirty); + iput(dirty); + } + } + + err = 0; +out_free_bh: + brelse(bh); +out: + return err; +} + +static int omfs_dir_is_empty(struct inode *inode) +{ + int nbuckets = (inode->i_size - OMFS_DIR_START) / 8; + struct buffer_head *bh; + u64 *ptr; + int i; + + bh = omfs_bread(inode->i_sb, inode->i_ino); + + if (!bh) + return 0; + + ptr = (u64 *) &bh->b_data[OMFS_DIR_START]; + + for (i = 0; i < nbuckets; i++, ptr++) + if (*ptr != ~0) + break; + + brelse(bh); + return *ptr != ~0; +} + +static int omfs_remove(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int ret; + + + if (S_ISDIR(inode->i_mode) && + !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; + + ret = omfs_delete_entry(dentry); + if (ret) + return ret; + + clear_nlink(inode); + mark_inode_dirty(inode); + mark_inode_dirty(dir); + return 0; +} + +static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + struct inode *inode = omfs_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = omfs_make_empty(inode, dir->i_sb); + if (err) + goto out_free_inode; + + err = omfs_add_link(dentry, inode); + if (err) + goto out_free_inode; + + d_instantiate(dentry, inode); + return 0; + +out_free_inode: + iput(inode); + return err; +} + +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return omfs_add_node(dir, dentry, mode | S_IFDIR); +} + +static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return omfs_add_node(dir, dentry, mode | S_IFREG); +} + +static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct buffer_head *bh; + struct inode *inode = NULL; + + if (dentry->d_name.len > OMFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (!IS_ERR(bh)) { + struct omfs_inode *oi = (struct omfs_inode *)bh->b_data; + ino_t ino = be64_to_cpu(oi->i_head.h_self); + brelse(bh); + inode = omfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +/* sanity check block's self pointer */ +int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock) +{ + int is_bad; + u64 ino = be64_to_cpu(header->h_self); + is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) || + (ino > sbi->s_num_blocks)); + + if (is_bad) + printk(KERN_WARNING "omfs: bad hash chain detected\n"); + + return is_bad; +} + +static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, + u64 fsblock, int hindex) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + struct omfs_inode *oi; + u64 self; + int res = 0; + unsigned char d_type; + + /* follow chain in this bucket */ + while (fsblock != ~0) { + bh 
= omfs_bread(dir->i_sb, fsblock); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { + brelse(bh); + goto out; + } + + self = fsblock; + fsblock = be64_to_cpu(oi->i_sibling); + + /* skip visited nodes */ + if (hindex) { + hindex--; + brelse(bh); + continue; + } + + d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; + + res = filldir(dirent, oi->i_name, strnlen(oi->i_name, + OMFS_NAMELEN), filp->f_pos, self, d_type); + brelse(bh); + if (res < 0) + break; + filp->f_pos++; + } +out: + return res; +} + +static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + int err; + + if (new_inode) { + /* overwriting existing file/dir */ + err = omfs_remove(new_dir, new_dentry); + if (err) + goto out; + } + + /* since omfs locates files by name, we need to unlink _before_ + * adding the new link or we won't find the old one */ + err = omfs_delete_entry(old_dentry); + if (err) + goto out; + + mark_inode_dirty(old_dir); + err = omfs_add_link(new_dentry, old_inode); + if (err) + goto out; + + old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); +out: + return err; +} + +static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + loff_t offset, res; + unsigned int hchain, hindex; + int nbuckets; + u64 fsblock; + int ret = -EINVAL; + + if (filp->f_pos >> 32) + goto success; + + switch ((unsigned long) filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) + goto success; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, 1, + parent_ino(filp->f_dentry), DT_DIR) < 0) + goto success; + filp->f_pos = 1 << 20; + /* fall through */ + } + + nbuckets = (dir->i_size - OMFS_DIR_START) / 8; + + /* high 12 bits store bucket + 1 and low 20 bits store hash index */ + hchain = (filp->f_pos >> 20) - 1; + hindex = filp->f_pos & 0xfffff; + + bh = omfs_bread(dir->i_sb, dir->i_ino); + if (!bh) + goto out; + + offset = OMFS_DIR_START + hchain * 8; + + for (; hchain < nbuckets; hchain++, offset += 8) { + fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); + + res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); + hindex = 0; + if (res < 0) + break; + + filp->f_pos = (hchain+2) << 20; + } + brelse(bh); +success: + ret = 0; +out: + return ret; +} + +const struct inode_operations omfs_dir_inops = { + .lookup = omfs_lookup, + .mkdir = omfs_mkdir, + .rename = omfs_rename, + .create = omfs_create, + .unlink = omfs_remove, + .rmdir = omfs_remove, +}; + +const struct file_operations omfs_dir_operations = { + .read = generic_read_dir, + .readdir = omfs_readdir, + .llseek = generic_file_llseek, +}; diff --git a/fs/omfs/file.c b/fs/omfs/file.c new file mode 100644 index 00000000..2c6d9525 --- /dev/null +++ b/fs/omfs/file.c @@ -0,0 +1,378 @@ +/* + * OMFS (as used by RIO Karma) file operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. 
+ */ + +#include +#include +#include +#include +#include "omfs.h" + +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ + return (sbi->s_sys_blocksize - offset - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; +} + +void omfs_make_empty_table(struct buffer_head *bh, int offset) +{ + struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; + + oe->e_next = ~cpu_to_be64(0ULL); + oe->e_extent_count = cpu_to_be32(1), + oe->e_fill = cpu_to_be32(0x22), + oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); + oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); +} + +int omfs_shrink_inode(struct inode *inode) +{ + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct buffer_head *bh; + u64 next, last; + u32 extent_count; + u32 max_extents; + int ret; + + /* traverse extent table, freeing each entry that is greater + * than inode->i_size; + */ + next = inode->i_ino; + + /* only support truncate -> 0 for now */ + ret = -EIO; + if (inode->i_size != 0) + goto out; + + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + + if (extent_count > max_extents) + goto out_brelse; + + last = next; + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + /* ignore last entry as it is the terminator */ + for (; extent_count > 1; extent_count--) { + u64 start, count; + start = be64_to_cpu(entry->e_cluster); + count = be64_to_cpu(entry->e_blocks); + + omfs_clear_range(inode->i_sb, start, (int) count); + entry++; + } + omfs_make_empty_table(bh, (char *) oe - bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + + if (last != inode->i_ino) + omfs_clear_range(inode->i_sb, last, sbi->s_mirrors); + + if (next == ~0) + break; + + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + ret = 0; +out: + return ret; +out_brelse: + brelse(bh); + return ret; +} + +static void omfs_truncate(struct inode *inode) +{ + omfs_shrink_inode(inode); + mark_inode_dirty(inode); +} + +/* + * Add new blocks to the current extent, or create new entries/continuations + * as necessary. + */ +static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, + u64 *ret_block) +{ + struct omfs_extent_entry *terminator; + struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + u32 extent_count = be32_to_cpu(oe->e_extent_count); + u64 new_block = 0; + u32 max_count; + int new_count; + int ret = 0; + + /* reached the end of the extent table with no blocks mapped. + * there are three possibilities for adding: grow last extent, + * add a new extent to the current extent table, and add a + * continuation inode. 
in last two cases need an allocator for + * sbi->s_cluster_size + */ + + /* TODO: handle holes */ + + /* should always have a terminator */ + if (extent_count < 1) + return -EIO; + + /* trivially grow current extent, if next block is not taken */ + terminator = entry + extent_count - 1; + if (extent_count > 1) { + entry = terminator-1; + new_block = be64_to_cpu(entry->e_cluster) + + be64_to_cpu(entry->e_blocks); + + if (omfs_allocate_block(inode->i_sb, new_block)) { + entry->e_blocks = + cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1); + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + 1)); + goto out; + } + } + max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); + + /* TODO: add a continuation block here */ + if (be32_to_cpu(oe->e_extent_count) > max_count-1) + return -EIO; + + /* try to allocate a new cluster */ + ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize, + &new_block, &new_count); + if (ret) + goto out_fail; + + /* copy terminator down an entry */ + entry = terminator; + terminator++; + memcpy(terminator, entry, sizeof(struct omfs_extent_entry)); + + entry->e_cluster = cpu_to_be64(new_block); + entry->e_blocks = cpu_to_be64((u64) new_count); + + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); + + /* write in new entry */ + oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); + +out: + *ret_block = new_block; +out_fail: + return ret; +} + +/* + * Scans across the directory table for a given file block number. + * If block not found, return 0. + */ +static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent, + sector_t block, int count, int *left) +{ + /* count > 1 because of terminator */ + sector_t searched = 0; + for (; count > 1; count--) { + int numblocks = clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_blocks)); + + if (block >= searched && + block < searched + numblocks) { + /* + * found it at cluster + (block - searched) + * numblocks - (block - searched) is remainder + */ + *left = numblocks - (block - searched); + return clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_cluster)) + + block - searched; + } + searched += numblocks; + ent++; + } + return 0; +} + +static int omfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct buffer_head *bh; + sector_t next, offset; + int ret; + u64 uninitialized_var(new_block); + u32 max_extents; + int extent_count; + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + int max_blocks = bh_result->b_size >> inode->i_blkbits; + int remain; + + ret = -EIO; + bh = omfs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + next = inode->i_ino; + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + if (extent_count > max_extents) + goto out_brelse; + + offset = find_block(inode, entry, block, extent_count, &remain); + if (offset > 0) { + ret = 0; + map_bh(bh_result, inode->i_sb, offset); + if (remain > max_blocks) + remain = max_blocks; + bh_result->b_size = (remain << inode->i_blkbits); + goto out_brelse; + } + if (next == ~0) + break; + + brelse(bh); + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; 
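+		/* a continuation block has no name or size fields, so its
+		 * extent table sits just past the header at OMFS_EXTENT_CONT
+		 * rather than at OMFS_EXTENT_START */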
+ oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + if (create) { + ret = omfs_grow_extent(inode, oe, &new_block); + if (ret == 0) { + mark_buffer_dirty(bh); + mark_inode_dirty(inode); + map_bh(bh_result, inode->i_sb, + clus_to_blk(sbi, new_block)); + } + } +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, omfs_get_block); +} + +static int omfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); +} + +static int omfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, omfs_get_block, wbc); +} + +static int +omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, omfs_get_block); +} + +static int omfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + omfs_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t omfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, omfs_get_block); +} + +const struct file_operations omfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int omfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations omfs_file_inops = { + .setattr = omfs_setattr, + .truncate = omfs_truncate +}; + +const struct address_space_operations omfs_aops = { + .readpage = omfs_readpage, + .readpages = omfs_readpages, + .writepage = omfs_writepage, + .writepages = omfs_writepages, + .write_begin = omfs_write_begin, + .write_end = generic_write_end, + .bmap = omfs_bmap, +}; + diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c new file mode 100644 index 00000000..e043c4cb --- /dev/null +++ b/fs/omfs/inode.c @@ -0,0 +1,585 @@ +/* + * Optimized MPEG FS - inode and super operations. + * Copyright (C) 2006 Bob Copeland + * Released under GPL v2. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "omfs.h" + +MODULE_AUTHOR("Bob Copeland "); +MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); +MODULE_LICENSE("GPL"); + +struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + if (block >= sbi->s_num_blocks) + return NULL; + + return sb_bread(sb, clus_to_blk(sbi, block)); +} + +struct inode *omfs_new_inode(struct inode *dir, int mode) +{ + struct inode *inode; + u64 new_block; + int err; + int len; + struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, + &new_block, &len); + if (err) + goto fail; + + inode->i_ino = new_block; + inode_init_owner(inode, NULL, mode); + inode->i_mapping->a_ops = &omfs_aops; + + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case S_IFREG: + inode->i_op = &omfs_file_inops; + inode->i_fop = &omfs_file_operations; + inode->i_size = 0; + break; + } + + insert_inode_hash(inode); + mark_inode_dirty(inode); + return inode; +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +/* + * Update the header checksums for a dirty inode based on its contents. + * Caller is expected to hold the buffer head underlying oi and mark it + * dirty. + */ +static void omfs_update_checksums(struct omfs_inode *oi) +{ + int xor, i, ofs = 0, count; + u16 crc = 0; + unsigned char *ptr = (unsigned char *) oi; + + count = be32_to_cpu(oi->i_head.h_body_size); + ofs = sizeof(struct omfs_header); + + crc = crc_itu_t(crc, ptr + ofs, count); + oi->i_head.h_crc = cpu_to_be16(crc); + + xor = ptr[0]; + for (i = 1; i < OMFS_XOR_COUNT; i++) + xor ^= ptr[i]; + + oi->i_head.h_check_xor = xor; +} + +static int __omfs_write_inode(struct inode *inode, int wait) +{ + struct omfs_inode *oi; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct buffer_head *bh, *bh2; + u64 ctime; + int i; + int ret = -EIO; + int sync_failed = 0; + + /* get current inode since we may have written sibling ptrs etc. 
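+ * The header, type, size and ctime are rebuilt from the in-core inode,
+ * both checksums are recomputed, and the result is then copied out to
+ * the other s_mirrors - 1 mirror blocks.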
*/ + bh = omfs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + if (S_ISDIR(inode->i_mode)) + oi->i_type = OMFS_DIR; + else if (S_ISREG(inode->i_mode)) + oi->i_type = OMFS_FILE; + else { + printk(KERN_WARNING "omfs: unknown file type: %d\n", + inode->i_mode); + goto out_brelse; + } + + oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - + sizeof(struct omfs_header)); + oi->i_head.h_version = 1; + oi->i_head.h_type = OMFS_INODE_NORMAL; + oi->i_head.h_magic = OMFS_IMAGIC; + oi->i_size = cpu_to_be64(inode->i_size); + + ctime = inode->i_ctime.tv_sec * 1000LL + + ((inode->i_ctime.tv_nsec + 999)/1000); + oi->i_ctime = cpu_to_be64(ctime); + + omfs_update_checksums(oi); + + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + sync_failed = 1; + } + + /* if mirroring writes, copy to next fsblock */ + for (i = 1; i < sbi->s_mirrors; i++) { + bh2 = omfs_bread(inode->i_sb, inode->i_ino + i); + if (!bh2) + goto out_brelse; + + memcpy(bh2->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(bh2); + if (wait) { + sync_dirty_buffer(bh2); + if (buffer_req(bh2) && !buffer_uptodate(bh2)) + sync_failed = 1; + } + brelse(bh2); + } + ret = (sync_failed) ? -EIO : 0; +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int omfs_sync_inode(struct inode *inode) +{ + return __omfs_write_inode(inode, 1); +} + +/* + * called when an entry is deleted, need to clear the bits in the + * bitmaps. + */ +static void omfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + if (inode->i_nlink) + return; + + if (S_ISREG(inode->i_mode)) { + inode->i_size = 0; + omfs_shrink_inode(inode); + } + + omfs_clear_range(inode->i_sb, inode->i_ino, 2); +} + +struct inode *omfs_iget(struct super_block *sb, ino_t ino) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct omfs_inode *oi; + struct buffer_head *bh; + u64 ctime; + unsigned long nsecs; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + bh = omfs_bread(inode->i_sb, ino); + if (!bh) + goto iget_failed; + + oi = (struct omfs_inode *)bh->b_data; + + /* check self */ + if (ino != be64_to_cpu(oi->i_head.h_self)) + goto fail_bh; + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + + ctime = be64_to_cpu(oi->i_ctime); + nsecs = do_div(ctime, 1000) * 1000L; + + inode->i_atime.tv_sec = ctime; + inode->i_mtime.tv_sec = ctime; + inode->i_ctime.tv_sec = ctime; + inode->i_atime.tv_nsec = nsecs; + inode->i_mtime.tv_nsec = nsecs; + inode->i_ctime.tv_nsec = nsecs; + + inode->i_mapping->a_ops = &omfs_aops; + + switch (oi->i_type) { + case OMFS_DIR: + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case OMFS_FILE: + inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); + inode->i_fop = &omfs_file_operations; + inode->i_size = be64_to_cpu(oi->i_size); + break; + } + brelse(bh); + unlock_new_inode(inode); + return inode; +fail_bh: + brelse(bh); +iget_failed: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void omfs_put_super(struct super_block *sb) +{ + struct 
omfs_sb_info *sbi = OMFS_SB(sb); + kfree(sbi->s_imap); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct omfs_sb_info *sbi = OMFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + + buf->f_type = OMFS_MAGIC; + buf->f_bsize = sbi->s_blocksize; + buf->f_blocks = sbi->s_num_blocks; + buf->f_files = sbi->s_num_blocks; + buf->f_namelen = OMFS_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + buf->f_bfree = buf->f_bavail = buf->f_ffree = + omfs_count_free(s); + + return 0; +} + +static const struct super_operations omfs_sops = { + .write_inode = omfs_write_inode, + .evict_inode = omfs_evict_inode, + .put_super = omfs_put_super, + .statfs = omfs_statfs, + .show_options = generic_show_options, +}; + +/* + * For Rio Karma, there is an on-disk free bitmap whose location is + * stored in the root block. For ReplayTV, there is no such free bitmap + * so we have to walk the tree. Both inodes and file data are allocated + * from the same map. This array can be big (300k) so we allocate + * in units of the blocksize. + */ +static int omfs_get_imap(struct super_block *sb) +{ + int bitmap_size; + int array_size; + int count; + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + unsigned long **ptr; + sector_t block; + + bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); + array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); + + if (sbi->s_bitmap_ino == ~0ULL) + goto out; + + sbi->s_imap_size = array_size; + sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + if (!sbi->s_imap) + goto nomem; + + block = clus_to_blk(sbi, sbi->s_bitmap_ino); + if (block >= sbi->s_num_blocks) + goto nomem; + + ptr = sbi->s_imap; + for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { + bh = sb_bread(sb, block++); + if (!bh) + goto nomem_free; + *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + if (!*ptr) { + brelse(bh); + goto nomem_free; + } + memcpy(*ptr, bh->b_data, sb->s_blocksize); + if (count < sb->s_blocksize) + memset((void *)*ptr + count, 0xff, + sb->s_blocksize - count); + brelse(bh); + ptr++; + } +out: + return 0; + +nomem_free: + for (count = 0; count < array_size; count++) + kfree(sbi->s_imap[count]); + + kfree(sbi->s_imap); +nomem: + sbi->s_imap = NULL; + sbi->s_imap_size = 0; + return -ENOMEM; +} + +enum { + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, +}; + +static int parse_options(char *options, struct omfs_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_gid = option; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = sbi->s_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = option; + break; + default: + return 0; + } + } + return 1; +} + +static int 
omfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh, *bh2; + struct omfs_super_block *omfs_sb; + struct omfs_root_block *omfs_rb; + struct omfs_sb_info *sbi; + struct inode *root; + int ret = -EINVAL; + + save_mount_options(sb, (char *) data); + + sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + sbi->s_uid = current_uid(); + sbi->s_gid = current_gid(); + sbi->s_dmask = sbi->s_fmask = current_umask(); + + if (!parse_options((char *) data, sbi)) + goto end; + + sb->s_maxbytes = 0xffffffff; + + sb_set_blocksize(sb, 0x200); + + bh = sb_bread(sb, 0); + if (!bh) + goto end; + + omfs_sb = (struct omfs_super_block *)bh->b_data; + + if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { + if (!silent) + printk(KERN_ERR "omfs: Invalid superblock (%x)\n", + omfs_sb->s_magic); + goto out_brelse_bh; + } + sb->s_magic = OMFS_MAGIC; + + sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); + sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); + sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); + sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); + sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); + mutex_init(&sbi->s_bitmap_lock); + + if (sbi->s_sys_blocksize > PAGE_SIZE) { + printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", + sbi->s_sys_blocksize); + goto out_brelse_bh; + } + + if (sbi->s_blocksize < sbi->s_sys_blocksize || + sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "omfs: block size (%d) is out of range\n", + sbi->s_blocksize); + goto out_brelse_bh; + } + + /* + * Use sys_blocksize as the fs block since it is smaller than a + * page while the fs blocksize can be larger. + */ + sb_set_blocksize(sb, sbi->s_sys_blocksize); + + /* + * ...and the difference goes into a shift. sys_blocksize is always + * a power of two factor of blocksize. 
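+ * For example, a blocksize of 8192 with a sys_blocksize of 2048 gives a
+ * shift of get_bitmask_order(8192) - get_bitmask_order(2048) = 14 - 12 = 2,
+ * so clus_to_blk() scales cluster numbers by 4 (= 8192 / 2048).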
+ */ + sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - + get_bitmask_order(sbi->s_sys_blocksize); + + bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block)); + if (!bh2) + goto out_brelse_bh; + + omfs_rb = (struct omfs_root_block *)bh2->b_data; + + sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); + sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); + + if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { + printk(KERN_ERR "omfs: block count discrepancy between " + "super and root blocks (%llx, %llx)\n", + (unsigned long long)sbi->s_num_blocks, + (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); + goto out_brelse_bh2; + } + + if (sbi->s_bitmap_ino != ~0ULL && + sbi->s_bitmap_ino > sbi->s_num_blocks) { + printk(KERN_ERR "omfs: free space bitmap location is corrupt " + "(%llx, total blocks %llx)\n", + (unsigned long long) sbi->s_bitmap_ino, + (unsigned long long) sbi->s_num_blocks); + goto out_brelse_bh2; + } + if (sbi->s_clustersize < 1 || + sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) { + printk(KERN_ERR "omfs: cluster size out of range (%d)", + sbi->s_clustersize); + goto out_brelse_bh2; + } + + ret = omfs_get_imap(sb); + if (ret) + goto out_brelse_bh2; + + sb->s_op = &omfs_sops; + + root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_brelse_bh2; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + goto out_brelse_bh2; + } + printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); + + ret = 0; +out_brelse_bh2: + brelse(bh2); +out_brelse_bh: + brelse(bh); +end: + if (ret) + kfree(sbi); + return ret; +} + +static struct dentry *omfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); +} + +static struct file_system_type omfs_fs_type = { + .owner = THIS_MODULE, + .name = "omfs", + .mount = omfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_omfs_fs(void) +{ + return register_filesystem(&omfs_fs_type); +} + +static void __exit exit_omfs_fs(void) +{ + unregister_filesystem(&omfs_fs_type); +} + +module_init(init_omfs_fs); +module_exit(exit_omfs_fs); diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h new file mode 100644 index 00000000..7d414fef --- /dev/null +++ b/fs/omfs/omfs.h @@ -0,0 +1,68 @@ +#ifndef _OMFS_H +#define _OMFS_H + +#include +#include + +#include "omfs_fs.h" + +/* In-memory structures */ +struct omfs_sb_info { + u64 s_num_blocks; + u64 s_bitmap_ino; + u64 s_root_ino; + u32 s_blocksize; + u32 s_mirrors; + u32 s_sys_blocksize; + u32 s_clustersize; + int s_block_shift; + unsigned long **s_imap; + int s_imap_size; + struct mutex s_bitmap_lock; + int s_uid; + int s_gid; + int s_dmask; + int s_fmask; +}; + +/* convert a cluster number to a scaled block number */ +static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block) +{ + return block << sbi->s_block_shift; +} + +static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* bitmap.c */ +extern unsigned long omfs_count_free(struct super_block *sb); +extern int omfs_allocate_block(struct super_block *sb, u64 block); +extern int omfs_allocate_range(struct super_block *sb, int min_request, + int max_request, u64 *return_block, int *return_size); +extern int omfs_clear_range(struct super_block *sb, u64 block, int count); + +/* dir.c */ +extern const struct file_operations omfs_dir_operations; 
+extern const struct inode_operations omfs_dir_inops; +extern int omfs_make_empty(struct inode *inode, struct super_block *sb); +extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock); + +/* file.c */ +extern const struct file_operations omfs_file_operations; +extern const struct inode_operations omfs_file_inops; +extern const struct address_space_operations omfs_aops; +extern void omfs_make_empty_table(struct buffer_head *bh, int offset); +extern int omfs_shrink_inode(struct inode *inode); + +/* inode.c */ +extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block); +extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); +extern struct inode *omfs_new_inode(struct inode *dir, int mode); +extern int omfs_reserve_block(struct super_block *sb, sector_t block); +extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); +extern int omfs_sync_inode(struct inode *inode); + +#endif diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h new file mode 100644 index 00000000..ee5e4327 --- /dev/null +++ b/fs/omfs/omfs_fs.h @@ -0,0 +1,81 @@ +#ifndef _OMFS_FS_H +#define _OMFS_FS_H + +/* OMFS On-disk structures */ + +#define OMFS_MAGIC 0xC2993D87 +#define OMFS_IMAGIC 0xD2 + +#define OMFS_DIR 'D' +#define OMFS_FILE 'F' +#define OMFS_INODE_NORMAL 'e' +#define OMFS_INODE_CONTINUATION 'c' +#define OMFS_INODE_SYSTEM 's' +#define OMFS_NAMELEN 256 +#define OMFS_DIR_START 0x1b8 +#define OMFS_EXTENT_START 0x1d0 +#define OMFS_EXTENT_CONT 0x40 +#define OMFS_XOR_COUNT 19 +#define OMFS_MAX_BLOCK_SIZE 8192 +#define OMFS_MAX_CLUSTER_SIZE 8 + +struct omfs_super_block { + char s_fill1[256]; + __be64 s_root_block; /* block number of omfs_root_block */ + __be64 s_num_blocks; /* total number of FS blocks */ + __be32 s_magic; /* OMFS_MAGIC */ + __be32 s_blocksize; /* size of a block */ + __be32 s_mirrors; /* # of mirrors of system blocks */ + __be32 s_sys_blocksize; /* size of non-data blocks */ +}; + +struct omfs_header { + __be64 h_self; /* FS block where this is located */ + __be32 h_body_size; /* size of useful data after header */ + __be16 h_crc; /* crc-ccitt of body_size bytes */ + char h_fill1[2]; + u8 h_version; /* version, always 1 */ + char h_type; /* OMFS_INODE_X */ + u8 h_magic; /* OMFS_IMAGIC */ + u8 h_check_xor; /* XOR of header bytes before this */ + __be32 h_fill2; +}; + +struct omfs_root_block { + struct omfs_header r_head; /* header */ + __be64 r_fill1; + __be64 r_num_blocks; /* total number of FS blocks */ + __be64 r_root_dir; /* block # of root directory */ + __be64 r_bitmap; /* block # of free space bitmap */ + __be32 r_blocksize; /* size of a block */ + __be32 r_clustersize; /* size allocated for data blocks */ + __be64 r_mirrors; /* # of mirrors of system blocks */ + char r_name[OMFS_NAMELEN]; /* partition label */ +}; + +struct omfs_inode { + struct omfs_header i_head; /* header */ + __be64 i_parent; /* parent containing this inode */ + __be64 i_sibling; /* next inode in hash bucket */ + __be64 i_ctime; /* ctime, in milliseconds */ + char i_fill1[35]; + char i_type; /* OMFS_[DIR,FILE] */ + __be32 i_fill2; + char i_fill3[64]; + char i_name[OMFS_NAMELEN]; /* filename */ + __be64 i_size; /* size of file, in bytes */ +}; + +struct omfs_extent_entry { + __be64 e_cluster; /* start location of a set of blocks */ + __be64 e_blocks; /* number of blocks after e_cluster */ +}; + +struct omfs_extent { + __be64 e_next; /* next extent table location */ + __be32 e_extent_count; /* total # extents in this table */ + __be32 e_fill; 
+ struct omfs_extent_entry e_entry; /* start of extent entries */ +}; + +#endif diff --git a/fs/open.c b/fs/open.c new file mode 100644 index 00000000..b52cf013 --- /dev/null +++ b/fs/open.c @@ -0,0 +1,1161 @@ +/* + * linux/fs/open.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) +{ + int ret; + struct iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ + if (length < 0) + return -EINVAL; + + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | time_attrs; + if (filp) { + newattrs.ia_file = filp; + newattrs.ia_valid |= ATTR_FILE; + } + + /* Remove suid/sgid on truncate too */ + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; + + mutex_lock(&dentry->d_inode->i_mutex); + ret = notify_change(dentry, &newattrs); + mutex_unlock(&dentry->d_inode->i_mutex); + return ret; +} + +static long do_sys_truncate(const char __user *pathname, loff_t length) +{ + struct path path; + struct inode *inode; + int error; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + + error = user_path(pathname, &path); + if (error) + goto out; + inode = path.dentry->d_inode; + + /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ + error = -EISDIR; + if (S_ISDIR(inode->i_mode)) + goto dput_and_out; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode)) + goto dput_and_out; + + error = mnt_want_write(path.mnt); + if (error) + goto dput_and_out; + + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; + + error = -EPERM; + if (IS_APPEND(inode)) + goto mnt_drop_write_and_out; + + error = get_write_access(inode); + if (error) + goto mnt_drop_write_and_out; + + /* + * Make sure that there are no leases. get_write_access() protects + * against the truncate racing with a lease-granting setlease(). 
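+ * (The i_writecount reference taken above is held across the truncate,
+ * which is also what blocks a new read lease from being granted while
+ * the truncate is in flight.)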
+ */ + error = break_lease(inode, O_WRONLY); + if (error) + goto put_write_and_out; + + error = locks_verify_truncate(inode, NULL, length); + if (!error) + error = security_path_truncate(&path); + if (!error) + error = do_truncate(path.dentry, length, 0, NULL); + +put_write_and_out: + put_write_access(inode); +mnt_drop_write_and_out: + mnt_drop_write(path.mnt); +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) +{ + return do_sys_truncate(path, length); +} + +static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct inode * inode; + struct dentry *dentry; + struct file * file; + int error; + + error = -EINVAL; + if (length < 0) + goto out; + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + /* explicitly opened as large or we are on 64-bit box */ + if (file->f_flags & O_LARGEFILE) + small = 0; + + dentry = file->f_path.dentry; + inode = dentry->d_inode; + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + goto out_putf; + + error = -EINVAL; + /* Cannot ftruncate over 2^31 bytes without large file support */ + if (small && length > MAX_NON_LFS) + goto out_putf; + + error = -EPERM; + if (IS_APPEND(inode)) + goto out_putf; + + error = locks_verify_truncate(inode, file, length); + if (!error) + error = security_path_truncate(&file->f_path); + if (!error) + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); +out_putf: + fput(file); +out: + return error; +} + +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +{ + long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(2, ret, fd, length); + return ret; +} + +/* LFS versions of truncate are only needed on 32 bit machines */ +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +{ + return do_sys_truncate(path, length); +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_truncate64(long path, loff_t length) +{ + return SYSC_truncate64((const char __user *) path, length); +} +SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); +#endif + +SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) +{ + long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(2, ret, fd, length); + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_ftruncate64(long fd, loff_t length) +{ + return SYSC_ftruncate64((unsigned int) fd, length); +} +SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); +#endif +#endif /* BITS_PER_LONG == 32 */ + + +int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long ret; + + if (offset < 0 || len <= 0) + return -EINVAL; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + /* It's not possible punch hole on append only file */ + if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + /* + * Revalidate the write permissions, in case security policy has + * changed since the files were opened. 
+ */ + ret = security_file_permission(file, MAY_WRITE); + if (ret) + return ret; + + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + /* + * Let individual file system decide if it supports preallocation + * for directories or not. + */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -ENODEV; + + /* Check for wrap through zero too */ + if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + return -EFBIG; + + if (!file->f_op->fallocate) + return -EOPNOTSUPP; + + return file->f_op->fallocate(file, mode, offset, len); +} + +SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +{ + struct file *file; + int error = -EBADF; + + file = fget(fd); + if (file) { + error = do_fallocate(file, mode, offset, len); + fput(file); + } + + return error; +} + +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) +{ + return SYSC_fallocate((int)fd, (int)mode, offset, len); +} +SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); +#endif + +/* + * access() needs to use the real uid/gid, not the effective uid/gid. + * We do this by temporarily clearing all FS-related capabilities and + * switching the fsuid/fsgid around to the real ones. + */ +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +{ + const struct cred *old_cred; + struct cred *override_cred; + struct path path; + struct inode *inode; + int res; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + override_cred->fsuid = override_cred->uid; + override_cred->fsgid = override_cred->gid; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* Clear the capabilities if we switch to a non-root user */ + if (override_cred->uid) + cap_clear(override_cred->cap_effective); + else + override_cred->cap_effective = + override_cred->cap_permitted; + } + + old_cred = override_creds(override_cred); + + res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + if (res) + goto out; + + inode = path.dentry->d_inode; + + if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { + /* + * MAY_EXEC on regular files is denied if the fs is mounted + * with the "noexec" flag. + */ + res = -EACCES; + if (path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_release; + } + + res = inode_permission(inode, mode | MAY_ACCESS); + /* SuS v2 requires we report a read only fs too */ + if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) + goto out_path_release; + /* + * This is a rare case where using __mnt_is_readonly() + * is OK without a mnt_want/drop_write() pair. Since + * no actual write to the fs is performed here, we do + * not need to telegraph to that to anyone. + * + * By doing this, we accept that this access is + * inherently racy and know that the fs may change + * state before we even see this result. 
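+ * access(2) is advisory anyway; callers must still cope with the
+ * subsequent open() or write() failing, so the unlocked check is enough.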
+ */ + if (__mnt_is_readonly(path.mnt)) + res = -EROFS; + +out_path_release: + path_put(&path); +out: + revert_creds(old_cred); + put_cred(override_cred); + return res; +} + +SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) +{ + return sys_faccessat(AT_FDCWD, filename, mode); +} + +SYSCALL_DEFINE1(chdir, const char __user *, filename) +{ + struct path path; + int error; + + error = user_path_dir(filename, &path); + if (error) + goto out; + + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + if (error) + goto dput_and_out; + + set_fs_pwd(current->fs, &path); + +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE1(fchdir, unsigned int, fd) +{ + struct file *file; + struct inode *inode; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + inode = file->f_path.dentry->d_inode; + + error = -ENOTDIR; + if (!S_ISDIR(inode->i_mode)) + goto out_putf; + + error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + if (!error) + set_fs_pwd(current->fs, &file->f_path); +out_putf: + fput(file); +out: + return error; +} + +SYSCALL_DEFINE1(chroot, const char __user *, filename) +{ + struct path path; + int error; + + error = user_path_dir(filename, &path); + if (error) + goto out; + + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + if (error) + goto dput_and_out; + + error = -EPERM; + if (!capable(CAP_SYS_CHROOT)) + goto dput_and_out; + error = security_path_chroot(&path); + if (error) + goto dput_and_out; + + set_fs_root(current->fs, &path); + error = 0; +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +{ + struct inode * inode; + struct dentry * dentry; + struct file * file; + int err = -EBADF; + struct iattr newattrs; + + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_path.dentry; + inode = dentry->d_inode; + + audit_inode(NULL, dentry); + + err = mnt_want_write_file(file); + if (err) + goto out_putf; + mutex_lock(&inode->i_mutex); + err = security_path_chmod(dentry, file->f_vfsmnt, mode); + if (err) + goto out_unlock; + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); +out_unlock: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(file->f_path.mnt); +out_putf: + fput(file); +out: + return err; +} + +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +{ + struct path path; + struct inode *inode; + int error; + struct iattr newattrs; + + error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + if (error) + goto out; + inode = path.dentry->d_inode; + + error = mnt_want_write(path.mnt); + if (error) + goto dput_and_out; + mutex_lock(&inode->i_mutex); + error = security_path_chmod(path.dentry, path.mnt, mode); + if (error) + goto out_unlock; + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + error = notify_change(path.dentry, &newattrs); +out_unlock: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(path.mnt); +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) +{ + return sys_fchmodat(AT_FDCWD, filename, mode); +} + +static int chown_common(struct path *path, uid_t user, gid_t group) +{ + struct inode *inode = 
path->dentry->d_inode; + int error; + struct iattr newattrs; + + newattrs.ia_valid = ATTR_CTIME; + if (user != (uid_t) -1) { + newattrs.ia_valid |= ATTR_UID; + newattrs.ia_uid = user; + } + if (group != (gid_t) -1) { + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = group; + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= + ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mutex_lock(&inode->i_mutex); + error = security_path_chown(path, user, group); + if (!error) + error = notify_change(path->dentry, &newattrs); + mutex_unlock(&inode->i_mutex); + + return error; +} + +SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) +{ + struct path path; + int error; + + error = user_path(filename, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + struct path path; + int error = -EINVAL; + int lookup_flags; + + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + goto out; + + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +{ + struct path path; + int error; + + error = user_lpath(filename, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +{ + struct file * file; + int error = -EBADF; + struct dentry * dentry; + + file = fget(fd); + if (!file) + goto out; + + error = mnt_want_write_file(file); + if (error) + goto out_fput; + dentry = file->f_path.dentry; + audit_inode(NULL, dentry); + error = chown_common(&file->f_path, user, group); + mnt_drop_write(file->f_path.mnt); +out_fput: + fput(file); +out: + return error; +} + +/* + * You have to be very careful that these write + * counts get cleaned up in error cases and + * upon __fput(). This should probably never + * be called outside of __dentry_open(). + */ +static inline int __get_file_write_access(struct inode *inode, + struct vfsmount *mnt) +{ + int error; + error = get_write_access(inode); + if (error) + return error; + /* + * Do not take mount writer counts on + * special files since no writes to + * the mount itself will occur. 
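+ * (Writes to device nodes, FIFOs and sockets go to the driver or the
+ * pipe buffer, not to the filesystem backing the mount, which is why
+ * mnt_want_write() is skipped for them below.)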
+ */ + if (!special_file(inode->i_mode)) { + /* + * Balanced in __fput() + */ + error = mnt_want_write(mnt); + if (error) + put_write_access(inode); + } + return error; +} + +static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, + struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) +{ + static const struct file_operations empty_fops = {}; + struct inode *inode; + int error; + + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | + FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = __get_file_write_access(inode, mnt); + if (error) + goto cleanup_file; + if (!special_file(inode->i_mode)) + file_take_write(f); + } + + f->f_mapping = inode->i_mapping; + f->f_path.dentry = dentry; + f->f_path.mnt = mnt; + f->f_pos = 0; + file_sb_list_add(f, inode->i_sb); + + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + + error = security_dentry_open(f, cred); + if (error) + goto cleanup_all; + + if (!open && f->f_op) + open = f->f_op->open; + if (open) { + error = open(inode, f); + if (error) + goto cleanup_all; + } + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(inode); + + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); + + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || + ((!f->f_mapping->a_ops->direct_IO) && + (!f->f_mapping->a_ops->get_xip_mem))) { + fput(f); + f = ERR_PTR(-EINVAL); + } + } + + return f; + +cleanup_all: + fops_put(f->f_op); + if (f->f_mode & FMODE_WRITE) { + put_write_access(inode); + if (!special_file(inode->i_mode)) { + /* + * We don't consider this a real + * mnt_want/drop_write() pair + * because it all happenend right + * here, so just reset the state. + */ + file_reset_write(f); + mnt_drop_write(mnt); + } + } + file_sb_list_del(f); + f->f_path.dentry = NULL; + f->f_path.mnt = NULL; +cleanup_file: + put_filp(f); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); +} + +/** + * lookup_instantiate_filp - instantiates the open intent filp + * @nd: pointer to nameidata + * @dentry: pointer to dentry + * @open: open callback + * + * Helper for filesystems that want to use lookup open intents and pass back + * a fully instantiated struct file to the caller. + * This function is meant to be called from within a filesystem's + * lookup method. + * Beware of calling it for non-regular files! Those ->open methods might block + * (e.g. in fifo_open), leaving you with parent locked (and in case of fifo, + * leading to a deadlock, as nobody can open that fifo anymore, because + * another process to open fifo will block on locked parent when doing lookup). + * Note that in case of error, nd->intent.open.file is destroyed, but the + * path information remains valid. + * If the open callback is set to NULL, then the standard f_op->open() + * filesystem callback is substituted. 
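+ *
+ * A minimal usage sketch from a hypothetical filesystem's ->lookup()
+ * (names are illustrative; the open-intent check and all error handling
+ * are elided):
+ *
+ *	static struct dentry *examplefs_lookup(struct inode *dir,
+ *			struct dentry *dentry, struct nameidata *nd)
+ *	{
+ *		struct inode *inode = examplefs_find_inode(dir, dentry);
+ *
+ *		d_add(dentry, inode);
+ *		lookup_instantiate_filp(nd, dentry, NULL);
+ *		return NULL;
+ *	}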
+ */ +struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)) +{ + const struct cred *cred = current_cred(); + + if (IS_ERR(nd->intent.open.file)) + goto out; + if (IS_ERR(dentry)) + goto out_err; + nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), + nd->intent.open.file, + open, cred); +out: + return nd->intent.open.file; +out_err: + release_open_intent(nd); + nd->intent.open.file = (struct file *)dentry; + goto out; +} +EXPORT_SYMBOL_GPL(lookup_instantiate_filp); + +/** + * nameidata_to_filp - convert a nameidata to an open filp. + * @nd: pointer to nameidata + * @flags: open flags + * + * Note that this function destroys the original nameidata + */ +struct file *nameidata_to_filp(struct nameidata *nd) +{ + const struct cred *cred = current_cred(); + struct file *filp; + + /* Pick up the filp from the open intent */ + filp = nd->intent.open.file; + nd->intent.open.file = NULL; + + /* Has the filesystem initialised the file for us? */ + if (filp->f_path.dentry == NULL) { + path_get(&nd->path); + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, + NULL, cred); + } + return filp; +} + +/* + * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an + * error. + */ +struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, + const struct cred *cred) +{ + int error; + struct file *f; + + validate_creds(cred); + + /* We must always pass in a valid mount pointer. */ + BUG_ON(!mnt); + + error = -ENFILE; + f = get_empty_filp(); + if (f == NULL) { + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); +} +EXPORT_SYMBOL(dentry_open); + +static void __put_unused_fd(struct files_struct *files, unsigned int fd) +{ + struct fdtable *fdt = files_fdtable(files); + __FD_CLR(fd, fdt->open_fds); + if (fd < files->next_fd) + files->next_fd = fd; +} + +void put_unused_fd(unsigned int fd) +{ + struct files_struct *files = current->files; + spin_lock(&files->file_lock); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(put_unused_fd); + +/* + * Install a file pointer in the fd array. + * + * The VFS is full of places where we drop the files lock between + * setting the open_fds bitmap and installing the file in the file + * array. At any such point, we are vulnerable to a dup2() race + * installing a file in the array before us. We need to detect this and + * fput() the struct file we are about to overwrite in this case. + * + * It should never happen - if we allow dup2() do it, _really_ bad things + * will follow. + */ + +void fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(fd_install); + +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. 
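+ * In short: after the fixup below a caller that asked for O_SYNC always
+ * has O_DSYNC set as well, so testing O_DSYNC alone is sufficient.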
+ */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + +long do_sys_open(int dfd, const char __user *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + char *tmp = getname(filename); + int fd = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { + fd = get_unused_fd_flags(flags); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op, lookup); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); + } + } + putname(tmp); + } + return fd; +} + +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, filename, flags, mode); + return ret; +} + +SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, + int, mode) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(4, ret, dfd, filename, flags, mode); + return ret; +} + +#ifndef __alpha__ + +/* + * For backward compatibility? Maybe this should be moved + * into arch/i386 instead? + */ +SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode) +{ + return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); +} + +#endif + +/* + * "id" is the POSIX thread ID. We use the + * files pointer for this.. 
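+ * (sys_close() passes current->files as the owner id, which is what
+ * locks_remove_posix() uses below to drop this process's POSIX locks.)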
+ */ +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval = 0; + + if (!file_count(filp)) { + printk(KERN_ERR "VFS: Close: file count is 0\n"); + return 0; + } + + if (filp->f_op && filp->f_op->flush) + retval = filp->f_op->flush(filp, id); + + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } + fput(filp); + return retval; +} + +EXPORT_SYMBOL(filp_close); + +/* + * Careful here! We test whether the file pointer is NULL before + * releasing the fd. This ensures that one clone task can't release + * an fd while another clone is opening it. + */ +SYSCALL_DEFINE1(close, unsigned int, fd) +{ + struct file * filp; + struct files_struct *files = current->files; + struct fdtable *fdt; + int retval; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + goto out_unlock; + filp = fdt->fd[fd]; + if (!filp) + goto out_unlock; + rcu_assign_pointer(fdt->fd[fd], NULL); + FD_CLR(fd, fdt->close_on_exec); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + retval = filp_close(filp, files); + + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; + + return retval; + +out_unlock: + spin_unlock(&files->file_lock); + return -EBADF; +} +EXPORT_SYMBOL(sys_close); + +/* + * This routine simulates a hangup on the tty, to arrange that users + * are given clean terminals at login time. + */ +SYSCALL_DEFINE0(vhangup) +{ + if (capable(CAP_SYS_TTY_CONFIG)) { + tty_vhangup_self(); + return 0; + } + return -EPERM; +} + +/* + * Called when an inode is about to be open. + * We use this to disallow opening large files on 32bit systems if + * the caller didn't specify O_LARGEFILE. On 64bit systems we force + * on this flag in sys_open. + */ +int generic_file_open(struct inode * inode, struct file * filp) +{ + if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; + return 0; +} + +EXPORT_SYMBOL(generic_file_open); + +/* + * This is used by subsystems that don't want seekable + * file descriptors. The function is not supposed to ever fail, the only + * reason it returns an 'int' and not 'void' is so that it can be plugged + * directly into file_operations structure. + */ +int nonseekable_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + return 0; +} + +EXPORT_SYMBOL(nonseekable_open); diff --git a/fs/openpromfs/Makefile b/fs/openpromfs/Makefile new file mode 100644 index 00000000..4563199b --- /dev/null +++ b/fs/openpromfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Sun Openprom filesystem routines. +# + +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs.o + +openpromfs-objs := inode.o diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c new file mode 100644 index 00000000..a2a5bff7 --- /dev/null +++ b/fs/openpromfs/inode.c @@ -0,0 +1,473 @@ +/* inode.c: /proc/openprom handling routines + * + * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. 
Dost (ecd@skynet.be) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static DEFINE_MUTEX(op_mutex); + +#define OPENPROM_ROOT_INO 0 + +enum op_inode_type { + op_inode_node, + op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; + +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static struct inode *openprom_iget(struct super_block *sb, ino_t ino); + +static inline struct op_inode_info *OP_I(struct inode *inode) +{ + return container_of(inode, struct op_inode_info, vfs_inode); +} + +static int is_string(unsigned char *p, int len) +{ + int i; + + for (i = 0; i < len; i++) { + unsigned char val = p[i]; + + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; + + return 0; + } + + return 1; +} + +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; + + len = prop->length; + pval = prop->value; + + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); + + seq_printf(f, "%s", (char *) pval); + + /* Skip over the NULL byte too. */ + pval += n + 1; + len -= n + 1; + + if (len > 0) + seq_printf(f, " + "); + } + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; + } + } else { + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; + } + } + } + seq_printf(f, "\n"); + + return 0; +} + +static void *property_start(struct seq_file *f, loff_t *pos) +{ + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; + } + return ret; +} + +static const struct file_operations openpromfs_prop_ops = { + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int openpromfs_readdir(struct file *, void *, filldir_t); + +static const struct file_operations openprom_operations = { + .read = generic_read_dir, + .readdir = openpromfs_readdir, + .llseek = generic_file_llseek, +}; + +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); + +static const struct inode_operations openprom_inode_operations = { + .lookup = openpromfs_lookup, +}; + +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; + const char *name; + struct inode *inode; + unsigned int ino; + int len; + + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + + name = dentry->d_name.name; + len = 
dentry->d_name.len; + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; + } + child = child->sibling; + } + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; + } + + prop = prop->next; + } + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = openprom_iget(dir->i_sb, ino); + mutex_unlock(&op_mutex); + if (IS_ERR(inode)) + return ERR_CAST(inode); + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + inode->i_nlink = 2; + break; + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + else + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &openpromfs_prop_ops; + inode->i_nlink = 1; + inode->i_size = ent_oi->u.prop->length; + break; + } + + d_add(dentry, inode); + return NULL; +} + +static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; + unsigned int ino; + int i; + + mutex_lock(&op_mutex); + + ino = inode->i_ino; + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall thru */ + case 1: + if (filldir(dirent, "..", 2, i, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall thru */ + default: + i -= 2; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; + i--; + } + while (child) { + if (filldir(dirent, + child->path_component_name, + strlen(child->path_component_name), + filp->f_pos, child->unique_id, DT_DIR) < 0) + goto out; + + filp->f_pos++; + child = child->sibling; + } + + /* Next, the properties as files. 
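+ * Each property is exposed as a regular file whose size is the
+ * property length (set up in openpromfs_lookup above).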
*/ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (filldir(dirent, prop->name, strlen(prop->name), + filp->f_pos, prop->unique_id, DT_REG) < 0) + goto out; + + filp->f_pos++; + prop = prop->next; + } + } +out: + mutex_unlock(&op_mutex); + return 0; +} + +static struct kmem_cache *op_inode_cachep; + +static struct inode *openprom_alloc_inode(struct super_block *sb) +{ + struct op_inode_info *oi; + + oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + return &oi->vfs_inode; +} + +static void openprom_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(op_inode_cachep, OP_I(inode)); +} + +static void openprom_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, openprom_i_callback); +} + +static struct inode *openprom_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + if (inode->i_ino == OPENPROM_ROOT_INO) { + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + } + unlock_new_inode(inode); + } + return inode; +} + +static int openprom_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NOATIME; + return 0; +} + +static const struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, + .statfs = simple_statfs, + .remount_fs = openprom_remount, +}; + +static int openprom_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct op_inode_info *oi; + int ret; + + s->s_flags |= MS_NOATIME; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = OPENPROM_SUPER_MAGIC; + s->s_op = &openprom_sops; + s->s_time_gran = 1; + root_inode = openprom_iget(s, OPENPROM_ROOT_INO); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_no_root; + } + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_no_root_dentry; + return 0; + +out_no_root_dentry: + iput(root_inode); + ret = -ENOMEM; +out_no_root: + printk("openprom_fill_super: get root inode failed\n"); + return ret; +} + +static struct dentry *openprom_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, openprom_fill_super); +} + +static struct file_system_type openprom_fs_type = { + .owner = THIS_MODULE, + .name = "openpromfs", + .mount = openprom_mount, + .kill_sb = kill_anon_super, +}; + +static void op_inode_init_once(void *data) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + inode_init_once(&oi->vfs_inode); +} + +static int __init init_openprom_fs(void) +{ + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; +} + +static void __exit exit_openprom_fs(void) +{ + unregister_filesystem(&openprom_fs_type); + kmem_cache_destroy(op_inode_cachep); +} + 
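+/*
+ * Note the ordering: the inode cache is created before the filesystem is
+ * registered, and on exit the filesystem is unregistered before the cache
+ * is destroyed.
+ */
+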
+module_init(init_openprom_fs) +module_exit(exit_openprom_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig new file mode 100644 index 00000000..cb5f0a3f --- /dev/null +++ b/fs/partitions/Kconfig @@ -0,0 +1,251 @@ +# +# Partition configuration +# +config PARTITION_ADVANCED + bool "Advanced partition selection" + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under an operating system running on a different + architecture than your Linux system. + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about foreign partitioning schemes. + + If unsure, say N. + +config ACORN_PARTITION + bool "Acorn partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + help + Support hard disks partitioned under Acorn operating systems. + +config ACORN_PARTITION_CUMANA + bool "Cumana partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the Cumana interface on Acorn machines. + +config ACORN_PARTITION_EESOX + bool "EESOX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + +config ACORN_PARTITION_ICS + bool "ICS partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the ICS interface on Acorn machines. + +config ACORN_PARTITION_ADFS + bool "Native filecore partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say + `Y' here, Linux will support disk partitions created under ADFS. + +config ACORN_PARTITION_POWERTEC + bool "PowerTec partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Support reading partition tables created on Acorn machines using + the PowerTec SCSI drive. + +config ACORN_PARTITION_RISCIX + bool "RISCiX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Once upon a time, there was a native Unix port for the Acorn series + of machines called RISCiX. If you say 'Y' here, Linux will be able + to read disks partitioned under RISCiX. + +config OSF_PARTITION + bool "Alpha OSF partition support" if PARTITION_ADVANCED + default y if ALPHA + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on an Alpha machine. + +config AMIGA_PARTITION + bool "Amiga partition table support" if PARTITION_ADVANCED + default y if (AMIGA || AFFS_FS=y) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under AmigaOS. + +config ATARI_PARTITION + bool "Atari partition table support" if PARTITION_ADVANCED + default y if ATARI + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under the Atari OS. + +config IBM_PARTITION + bool "IBM disk label and partition support" + depends on PARTITION_ADVANCED && S390 + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM DASD disks operating under CMS. 
+ Otherwise, say N. + +config MAC_PARTITION + bool "Macintosh partition map support" if PARTITION_ADVANCED + default y if (MAC || PPC_PMAC) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on a Macintosh. + +config MSDOS_PARTITION + bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED + default y + help + Say Y here. + +config BSD_DISKLABEL + bool "BSD disklabel (FreeBSD partition tables) support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + FreeBSD uses its own hard disk partition scheme on your PC. It + requires only one entry in the primary partition table of your disk + and manages it similarly to DOS extended partitions, putting in its + first sector a new partition table in BSD disklabel format. Saying Y + here allows you to read these disklabels and further mount FreeBSD + partitions from within Linux if you have also said Y to "UFS + file system support", above. If you don't know what all this is + about, say N. + +config MINIX_SUBPARTITION + bool "Minix subpartition support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Minix 2.0.0/2.0.2 subpartition table support for Linux. + Say Y here if you want to mount and use Minix 2.0.0/2.0.2 + subpartitions. + +config SOLARIS_X86_PARTITION + bool "Solaris (x86) partition table support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Like most systems, Solaris x86 uses its own hard disk partition + table format, incompatible with all others. Saying Y here allows you + to read these partition tables and further mount Solaris x86 + partitions from within Linux if you have also said Y to "UFS + file system support", above. + +config UNIXWARE_DISKLABEL + bool "Unixware slices support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + ---help--- + Like some systems, UnixWare uses its own slice table inside a + partition (VTOC - Virtual Table of Contents). Its format is + incompatible with all other OSes. Saying Y here allows you to read + VTOC and further mount UnixWare partitions read-only from within + Linux if you have also said Y to "UFS file system support" or + "System V and Coherent file system support", above. + + This is mainly used to carry data from a UnixWare box to your + Linux box via a removable medium like magneto-optical, ZIP or + removable IDE drives. Note, however, that a good portable way to + transport files and directories between unixes (and even other + operating systems) is given by the tar program ("man tar" or + preferably "info tar"). + + If you don't know what all this is about, say N. + +config LDM_PARTITION + bool "Windows Logical Disk Manager (Dynamic Disk) support" + depends on PARTITION_ADVANCED + ---help--- + Say Y here if you would like to use hard disks under Linux which + were partitioned using Windows 2000's/XP's or Vista's Logical Disk + Manager. They are also known as "Dynamic Disks". + + Note this driver only supports Dynamic Disks with a protective MBR + label, i.e. DOS partition table. It does not support GPT labelled + Dynamic Disks yet as can be created with Vista. + + Windows 2000 introduced the concept of Dynamic Disks to get around + the limitations of the PC's partitioning scheme. The Logical Disk + Manager allows the user to repartition a disk and create spanned, + mirrored, striped or RAID volumes, all without the need for + rebooting. + + Normal partitions are now called Basic Disks under Windows 2000, XP, + and Vista. + + For a fuller description read . + + If unsure, say N. 
+ +config LDM_DEBUG + bool "Windows LDM extra logging" + depends on LDM_PARTITION + help + Say Y here if you would like LDM to log verbosely. This could be + helpful if the driver doesn't work as expected and you'd like to + report a bug. + + If unsure, say N. + +config SGI_PARTITION + bool "SGI partition support" if PARTITION_ADVANCED + default y if DEFAULT_SGI_PARTITION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by SGI machines. + +config ULTRIX_PARTITION + bool "Ultrix partition table support" if PARTITION_ADVANCED + default y if MACH_DECSTATION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by DEC (now Compaq) Ultrix machines. + Otherwise, say N. + +config SUN_PARTITION + bool "Sun partition tables support" if PARTITION_ADVANCED + default y if (SPARC || SUN3 || SUN3X) + ---help--- + Like most systems, SunOS uses its own hard disk partition table + format, incompatible with all others. Saying Y here allows you to + read these partition tables and further mount SunOS partitions from + within Linux if you have also said Y to "UFS file system support", + above. This is mainly used to carry data from a SPARC under SunOS to + your Linux box via a removable medium like magneto-optical or ZIP + drives; note however that a good portable way to transport files and + directories between unixes (and even other operating systems) is + given by the tar program ("man tar" or preferably "info tar"). If + you don't know what all this is about, say N. + +config KARMA_PARTITION + bool "Karma Partition support" + depends on PARTITION_ADVANCED + help + Say Y here if you would like to mount the Rio Karma MP3 player, as it + uses a proprietary partition table. + +config EFI_PARTITION + bool "EFI GUID Partition support" + depends on PARTITION_ADVANCED + select CRC32 + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using EFI GPT. + +config SYSV68_PARTITION + bool "SYSV68 partition table support" if PARTITION_ADVANCED + default y if VME + help + Say Y here if you would like to be able to read the hard disk + partition table format used by Motorola Delta machines (using + sysv68). + Otherwise, say N. diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile new file mode 100644 index 00000000..03af8eac --- /dev/null +++ b/fs/partitions/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the linux kernel. +# + +obj-$(CONFIG_BLOCK) := check.o + +obj-$(CONFIG_ACORN_PARTITION) += acorn.o +obj-$(CONFIG_AMIGA_PARTITION) += amiga.o +obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_MAC_PARTITION) += mac.o +obj-$(CONFIG_LDM_PARTITION) += ldm.o +obj-$(CONFIG_MSDOS_PARTITION) += msdos.o +obj-$(CONFIG_OSF_PARTITION) += osf.o +obj-$(CONFIG_SGI_PARTITION) += sgi.o +obj-$(CONFIG_SUN_PARTITION) += sun.o +obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o +obj-$(CONFIG_IBM_PARTITION) += ibm.o +obj-$(CONFIG_EFI_PARTITION) += efi.o +obj-$(CONFIG_KARMA_PARTITION) += karma.o +obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c new file mode 100644 index 00000000..fbeb6973 --- /dev/null +++ b/fs/partitions/acorn.c @@ -0,0 +1,556 @@ +/* + * linux/fs/partitions/acorn.c + * + * Copyright (c) 1996-2000 Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Scan ADFS partitions on hard disk drives. Unfortunately, there + * isn't a standard for partitioning drives on Acorn machines, so + * every single manufacturer of SCSI and IDE cards created their own + * method. + */ +#include +#include + +#include "check.h" +#include "acorn.h" + +/* + * Partition types. (Oh for reusability) + */ +#define PARTITION_RISCIX_MFM 1 +#define PARTITION_RISCIX_SCSI 2 +#define PARTITION_LINUX 9 + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static struct adfs_discrecord * +adfs_partition(struct parsed_partitions *state, char *name, char *data, + unsigned long first_sector, int slot) +{ + struct adfs_discrecord *dr; + unsigned int nr_sects; + + if (adfs_checkbblk(data)) + return NULL; + + dr = (struct adfs_discrecord *)(data + 0x1c0); + + if (dr->disc_size == 0 && dr->disc_size_high == 0) + return NULL; + + nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | + (le32_to_cpu(dr->disc_size) >> 9); + + if (name) { + strlcat(state->pp_buf, " [", PAGE_SIZE); + strlcat(state->pp_buf, name, PAGE_SIZE); + strlcat(state->pp_buf, "]", PAGE_SIZE); + } + put_partition(state, slot, first_sector, nr_sects); + return dr; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + +struct riscix_part { + __le32 start; + __le32 length; + __le32 one; + char name[16]; +}; + +struct riscix_record { + __le32 magic; +#define RISCIX_MAGIC cpu_to_le32(0x4a657320) + __le32 date; + struct riscix_part part[8]; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct riscix_record *rr; + + rr = read_part_sector(state, first_sect, §); + if (!rr) + return -1; + + strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); + + + if (rr->magic == RISCIX_MAGIC) { + unsigned long size = nr_sects > 2 ? 2 : nr_sects; + int part; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + for (part = 0; part < 8; part++) { + if (rr->part[part].one && + memcmp(rr->part[part].name, "All\0", 4)) { + put_partition(state, slot++, + le32_to_cpu(rr->part[part].start), + le32_to_cpu(rr->part[part].length)); + strlcat(state->pp_buf, "(", PAGE_SIZE); + strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); + strlcat(state->pp_buf, ")", PAGE_SIZE); + } + } + + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } else { + put_partition(state, slot++, first_sect, nr_sects); + } + + put_dev_sector(sect); + return slot; +} +#endif +#endif + +#define LINUX_NATIVE_MAGIC 0xdeafa1de +#define LINUX_SWAP_MAGIC 0xdeafab1e + +struct linux_part { + __le32 magic; + __le32 start_sect; + __le32 nr_sects; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct linux_part *linuxp; + unsigned long size = nr_sects > 2 ? 
2 : nr_sects; + + strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + + linuxp = read_part_sector(state, first_sect, §); + if (!linuxp) + return -1; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || + linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { + if (slot == state->limit) + break; + put_partition(state, slot++, first_sect + + le32_to_cpu(linuxp->start_sect), + le32_to_cpu(linuxp->nr_sects)); + linuxp ++; + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + + put_dev_sector(sect); + return slot; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_CUMANA +int adfspart_check_CUMANA(struct parsed_partitions *state) +{ + unsigned long first_sector = 0; + unsigned int start_blk = 0; + Sector sect; + unsigned char *data; + char *name = "CUMANA/ADFS"; + int first = 1; + int slot = 1; + + /* + * Try Cumana style partitions - sector 6 contains ADFS boot block + * with pointer to next 'drive'. + * + * There are unknowns in this code - is the 'cylinder number' of the + * next partition relative to the start of this one - I'm assuming + * it is. + * + * Also, which ID did Cumana use? + * + * This is totally unfinished, and will require more work to get it + * going. Hence it is totally untested. + */ + do { + struct adfs_discrecord *dr; + unsigned int nr_sects; + + data = read_part_sector(state, start_blk * 2 + 6, §); + if (!data) + return -1; + + if (slot == state->limit) + break; + + dr = adfs_partition(state, name, data, first_sector, slot++); + if (!dr) + break; + + name = NULL; + + nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * + (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * + dr->secspertrack; + + if (!nr_sects) + break; + + first = 0; + first_sector += nr_sects; + start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); + nr_sects = 0; /* hmm - should be partition size */ + + switch (data[0x1fc] & 15) { + case 0: /* No partition / ADFS? */ + break; + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + /* RISCiX - we don't know how to find the next one. */ + slot = riscix_partition(state, first_sector, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, first_sector, slot, + nr_sects); + break; + } + put_dev_sector(sect); + if (slot == -1) + return -1; + } while (1); + put_dev_sector(sect); + return first ? 0 : 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ADFS +/* + * Purpose: allocate ADFS partitions. + * + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * + * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. + * + * Alloc : hda = whole drive + * hda1 = ADFS partition on first drive. + * hda2 = non-ADFS partition. + */ +int adfspart_check_ADFS(struct parsed_partitions *state) +{ + unsigned long start_sect, nr_sects, sectscyl, heads; + Sector sect; + unsigned char *data; + struct adfs_discrecord *dr; + unsigned char id; + int slot = 1; + + data = read_part_sector(state, 6, §); + if (!data) + return -1; + + dr = adfs_partition(state, "ADFS", data, 0, slot++); + if (!dr) { + put_dev_sector(sect); + return 0; + } + + heads = dr->heads + ((dr->lowsector >> 6) & 1); + sectscyl = dr->secspertrack * heads; + start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; + id = data[0x1fc] & 15; + put_dev_sector(sect); + + /* + * Work out start of non-adfs partition. 
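+ * bd_inode->i_size is the device size in bytes; shifting right by 9 gives
+ * 512-byte sectors, and subtracting the start leaves the remaining length.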
+ */ + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + + if (start_sect) { + switch (id) { +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + case PARTITION_RISCIX_MFM: + slot = riscix_partition(state, start_sect, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, start_sect, slot, + nr_sects); + break; + } + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ICS + +struct ics_part { + __le32 start; + __le32 size; +}; + +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) +{ + Sector sect; + unsigned char *data = read_part_sector(state, block, §); + int result = 0; + + if (data) { + if (memcmp(data, "LinuxPart", 9) == 0) + result = 1; + put_dev_sector(sect); + } + + return result; +} + +/* + * Check for a valid ICS partition using the checksum. + */ +static inline int valid_ics_sector(const unsigned char *data) +{ + unsigned long sum; + int i; + + for (i = 0, sum = 0x50617274; i < 508; i++) + sum += data[i]; + + sum -= le32_to_cpu(*(__le32 *)(&data[508])); + + return sum == 0; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. + * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_ICS(struct parsed_partitions *state) +{ + const unsigned char *data; + const struct ics_part *p; + int slot; + Sector sect; + + /* + * Try ICS style partitions - sector 0 contains partition info. + */ + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ics_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); + + for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { + u32 start = le32_to_cpu(p->start); + s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ + + if (slot == state->limit) + break; + + /* + * Negative sizes tell the RISC OS ICS driver to ignore + * this partition - in effect it says that this does not + * contain an ADFS filesystem. + */ + if (size < 0) { + size = -size; + + /* + * Our own extension - We use the first sector + * of the partition to identify what type this + * partition is. We must not make this visible + * to the filesystem. + */ + if (size > 1 && adfspart_check_ICSLinux(state, start)) { + start += 1; + size -= 1; + } + } + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_POWERTEC +struct ptec_part { + __le32 unused1; + __le32 unused2; + __le32 start; + __le32 size; + __le32 unused5; + char type[8]; +}; + +static inline int valid_ptec_sector(const unsigned char *data) +{ + unsigned char checksum = 0x2a; + int i; + + /* + * If it looks like a PC/BIOS partition, then it + * probably isn't PowerTec. + */ + if (data[510] == 0x55 && data[511] == 0xaa) + return 0; + + for (i = 0; i < 511; i++) + checksum += data[i]; + + return checksum == data[511]; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. 
+ * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_POWERTEC(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + const struct ptec_part *p; + int slot = 1; + int i; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ptec_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); + + for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { + u32 start = le32_to_cpu(p->start); + u32 size = le32_to_cpu(p->size); + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_EESOX +struct eesox_part { + char magic[6]; + char name[10]; + __le32 start; + __le32 unused6; + __le32 unused7; + __le32 unused8; +}; + +/* + * Guess who created this format? + */ +static const char eesox_name[] = { + 'N', 'e', 'i', 'l', ' ', + 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' +}; + +/* + * EESOX SCSI partition format. + * + * This is a goddamned awful partition format. We don't seem to store + * the size of the partition in this table, only the start addresses. + * + * There are two possibilities where the size comes from: + * 1. The individual ADFS boot block entries that are placed on the disk. + * 2. The start address of the next entry. + */ +int adfspart_check_EESOX(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + unsigned char buffer[256]; + struct eesox_part *p; + sector_t start = 0; + int i, slot = 1; + + data = read_part_sector(state, 7, §); + if (!data) + return -1; + + /* + * "Decrypt" the partition table. God knows why... + */ + for (i = 0; i < 256; i++) + buffer[i] = data[i] ^ eesox_name[i & 15]; + + put_dev_sector(sect); + + for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { + sector_t next; + + if (memcmp(p->magic, "Eesox", 6)) + break; + + next = le32_to_cpu(p->start); + if (i) + put_partition(state, slot++, start, next - start); + start = next; + } + + if (i != 0) { + sector_t size; + + size = get_capacity(state->bdev->bd_disk); + put_partition(state, slot++, start, size - start); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + } + + return i ? 1 : 0; +} +#endif diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h new file mode 100644 index 00000000..ede82852 --- /dev/null +++ b/fs/partitions/acorn.h @@ -0,0 +1,14 @@ +/* + * linux/fs/partitions/acorn.h + * + * Copyright (C) 1996-2001 Russell King. + * + * I _hate_ this partitioning mess - why can't we have one defined + * format, and everyone stick to it? 
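+ *
+ * Each checker below returns 1 if it recognised and parsed its format,
+ * 0 if the format is not present, and -1 on a read error.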
+ */ + +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c new file mode 100644 index 00000000..70cbf44a --- /dev/null +++ b/fs/partitions/amiga.c @@ -0,0 +1,139 @@ +/* + * fs/partitions/amiga.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include + +#include "check.h" +#include "amiga.h" + +static __inline__ u32 +checksum_block(__be32 *m, int size) +{ + u32 sum = 0; + + while (size--) + sum += be32_to_cpu(*m++); + return sum; +} + +int amiga_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + struct RigidDiskBlock *rdb; + struct PartitionBlock *pb; + int start_sect, nr_sects, blk, part, res = 0; + int blksize = 1; /* Multiplier for disk block size */ + int slot = 1; + char b[BDEVNAME_SIZE]; + + for (blk = 0; ; blk++, put_dev_sector(sect)) { + if (blk == RDB_ALLOCATION_LIMIT) + goto rdb_done; + data = read_part_sector(state, blk, §); + if (!data) { + if (warn_no_part) + printk("Dev %s: unable to read RDB block %d\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) + continue; + + rdb = (struct RigidDiskBlock *)data; + if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) + break; + /* Try again with 0xdc..0xdf zeroed, Windows might have + * trashed it. + */ + *(__be32 *)(data+0xdc) = 0; + if (checksum_block((__be32 *)data, + be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { + printk("Warning: Trashed word at 0xd0 in block %d " + "ignored in checksum calculation\n",blk); + break; + } + + printk("Dev %s: RDB in block %d has bad checksum\n", + bdevname(state->bdev, b), blk); + } + + /* blksize is blocks per 512 byte standard block */ + blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; + + { + char tmp[7 + 10 + 1 + 1]; + + /* Be more informative */ + snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + blk = be32_to_cpu(rdb->rdb_PartitionList); + put_dev_sector(sect); + for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { + blk *= blksize; /* Read in terms partition table understands */ + data = read_part_sector(state, blk, §); + if (!data) { + if (warn_no_part) + printk("Dev %s: unable to read partition block %d\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + pb = (struct PartitionBlock *)data; + blk = be32_to_cpu(pb->pb_Next); + if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) + continue; + if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) + continue; + + /* Tell Kernel about it */ + + nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - + be32_to_cpu(pb->pb_Environment[9])) * + be32_to_cpu(pb->pb_Environment[3]) * + be32_to_cpu(pb->pb_Environment[5]) * + blksize; + if (!nr_sects) + continue; + start_sect = be32_to_cpu(pb->pb_Environment[9]) * + be32_to_cpu(pb->pb_Environment[3]) * + be32_to_cpu(pb->pb_Environment[5]) * + blksize; + put_partition(state,slot++,start_sect,nr_sects); + { + /* Be even more informative to aid mounting */ + char dostype[4]; + char tmp[42]; + + __be32 *dt = (__be32 *)dostype; + *dt = 
pb->pb_Environment[16]; + if (dostype[3] < ' ') + snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", + dostype[0], dostype[1], + dostype[2], dostype[3] + '@' ); + else + snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", + dostype[0], dostype[1], + dostype[2], dostype[3]); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + snprintf(tmp, sizeof(tmp), "(res %d spb %d)", + be32_to_cpu(pb->pb_Environment[6]), + be32_to_cpu(pb->pb_Environment[4])); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + res = 1; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + +rdb_done: + return res; +} diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h new file mode 100644 index 00000000..d094585c --- /dev/null +++ b/fs/partitions/amiga.h @@ -0,0 +1,6 @@ +/* + * fs/partitions/amiga.h + */ + +int amiga_partition(struct parsed_partitions *state); + diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c new file mode 100644 index 00000000..9875b05e --- /dev/null +++ b/fs/partitions/atari.c @@ -0,0 +1,149 @@ +/* + * fs/partitions/atari.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include "check.h" +#include "atari.h" + +/* ++guenther: this should be settable by the user ("make config")?. + */ +#define ICD_PARTS + +/* check if a partition entry looks valid -- Atari format is assumed if at + least one of the primary entries is ok this way */ +#define VALID_PARTITION(pi,hdsiz) \ + (((pi)->flg & 1) && \ + isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ + be32_to_cpu((pi)->st) <= (hdsiz) && \ + be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) + +static inline int OK_id(char *s) +{ + return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || + memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || + memcmp (s, "RAW", 3) == 0 ; +} + +int atari_partition(struct parsed_partitions *state) +{ + Sector sect; + struct rootsector *rs; + struct partition_info *pi; + u32 extensect; + u32 hd_size; + int slot; +#ifdef ICD_PARTS + int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ +#endif + + rs = read_part_sector(state, 0, §); + if (!rs) + return -1; + + /* Verify this is an Atari rootsector: */ + hd_size = state->bdev->bd_inode->i_size >> 9; + if (!VALID_PARTITION(&rs->part[0], hd_size) && + !VALID_PARTITION(&rs->part[1], hd_size) && + !VALID_PARTITION(&rs->part[2], hd_size) && + !VALID_PARTITION(&rs->part[3], hd_size)) { + /* + * if there's no valid primary partition, assume that no Atari + * format partition table (there's no reliable magic or the like + * :-() + */ + put_dev_sector(sect); + return 0; + } + + pi = &rs->part[0]; + strlcat(state->pp_buf, " AHDI", PAGE_SIZE); + for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { + struct rootsector *xrs; + Sector sect2; + ulong partsect; + + if ( !(pi->flg & 1) ) + continue; + /* active partition */ + if (memcmp (pi->id, "XGM", 3) != 0) { + /* we don't care about other id's */ + put_partition (state, slot, be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + continue; + } + /* extension partition */ +#ifdef ICD_PARTS + part_fmt = 1; +#endif + strlcat(state->pp_buf, " XGM<", PAGE_SIZE); + partsect = extensect = be32_to_cpu(pi->st); + while (1) { + xrs = read_part_sector(state, partsect, §2); + if (!xrs) { + printk (" block %ld read failed\n", partsect); + put_dev_sector(sect); + return -1; + } + + /* ++roman: sanity check: bit 0 of flg field must be set */ + if (!(xrs->part[0].flg & 1)) { + printk( "\nFirst sub-partition in 
extended partition is not valid!\n" ); + put_dev_sector(sect2); + break; + } + + put_partition(state, slot, + partsect + be32_to_cpu(xrs->part[0].st), + be32_to_cpu(xrs->part[0].siz)); + + if (!(xrs->part[1].flg & 1)) { + /* end of linked partition list */ + put_dev_sector(sect2); + break; + } + if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { + printk("\nID of extended partition is not XGM!\n"); + put_dev_sector(sect2); + break; + } + + partsect = be32_to_cpu(xrs->part[1].st) + extensect; + put_dev_sector(sect2); + if (++slot == state->limit) { + printk( "\nMaximum number of partitions reached!\n" ); + break; + } + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } +#ifdef ICD_PARTS + if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ + pi = &rs->icdpart[0]; + /* sanity check: no ICD format if first partition invalid */ + if (OK_id(pi->id)) { + strlcat(state->pp_buf, " ICD<", PAGE_SIZE); + for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { + /* accept only GEM,BGM,RAW,LNX,SWP partitions */ + if (!((pi->flg & 1) && OK_id(pi->id))) + continue; + part_fmt = 2; + put_partition (state, slot, + be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } + } +#endif + put_dev_sector(sect); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h new file mode 100644 index 00000000..fe2d32a8 --- /dev/null +++ b/fs/partitions/atari.h @@ -0,0 +1,34 @@ +/* + * fs/partitions/atari.h + * Moved by Russell King from: + * + * linux/include/linux/atari_rootsec.h + * definitions for Atari Rootsector layout + * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de) + * + * modified for ICD/Supra partitioning scheme restricted to at most 12 + * partitions + * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) + */ + +struct partition_info +{ + u8 flg; /* bit 0: active; bit 7: bootable */ + char id[3]; /* "GEM", "BGM", "XGM", or other */ + __be32 st; /* start of partition */ + __be32 siz; /* length of partition */ +}; + +struct rootsector +{ + char unused[0x156]; /* room for boot code */ + struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ + char unused2[0xc]; + u32 hd_siz; /* size of disk in blocks */ + struct partition_info part[4]; + u32 bsl_st; /* start of bad sector list */ + u32 bsl_cnt; /* length of bad sector list */ + u16 checksum; /* checksum for bootable disks */ +} __attribute__((__packed__)); + +int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c new file mode 100644 index 00000000..811960a5 --- /dev/null +++ b/fs/partitions/check.c @@ -0,0 +1,730 @@ +/* + * fs/partitions/check.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + * + * We now have independent partition support from the + * block drivers, which allows all the partition code to + * be grouped in one location, and it to be mostly self + * contained. 
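The AHDI probe above accepts a root sector only when at least one of the four primary slots passes VALID_PARTITION(): the active flag bit is set, all three id bytes are alphanumeric, and both the start and start+size fit within the device. A minimal user-space sketch of that test, assuming the field layout of struct partition_info and using a plain big-endian helper rather than the kernel's:

#include <stdint.h>
#include <ctype.h>

struct pi {                       /* mirrors struct partition_info: flg, id[3], st, siz */
	uint8_t flg;
	char    id[3];
	uint8_t st[4];
	uint8_t siz[4];
};

static uint32_t be32(const uint8_t *p)
{
	return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 | (uint32_t)p[2] << 8 | p[3];
}

/* Same conditions as the VALID_PARTITION() macro in atari.c. */
static int valid_partition(const struct pi *pi, uint32_t hd_sectors)
{
	return (pi->flg & 1) &&
	       isalnum((unsigned char)pi->id[0]) &&
	       isalnum((unsigned char)pi->id[1]) &&
	       isalnum((unsigned char)pi->id[2]) &&
	       be32(pi->st) <= hd_sectors &&
	       be32(pi->st) + be32(pi->siz) <= hd_sectors;
}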
+ * + * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" + +#include "acorn.h" +#include "amiga.h" +#include "atari.h" +#include "ldm.h" +#include "mac.h" +#include "msdos.h" +#include "osf.h" +#include "sgi.h" +#include "sun.h" +#include "ibm.h" +#include "ultrix.h" +#include "efi.h" +#include "karma.h" +#include "sysv68.h" + +#ifdef CONFIG_BLK_DEV_MD +extern void md_autodetect_dev(dev_t dev); +#endif + +int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ + +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. + */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif + + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. + */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, +#endif + +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +/* + * disk_name() is used by partition check code and the genhd driver. + * It formats the devicename of the indicated disk into + * the supplied buffer (of size at least 32), and returns + * a pointer to that same buffer (for convenience). + */ + +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); +} + +EXPORT_SYMBOL(bdevname); + +/* + * There's very little reason to use this, you should really + * have a struct block_device just about everywhere and use + * bdevname() instead. 
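The naming rule in disk_name() is easy to miss: partition nodes normally get the number appended directly ("sda" becomes "sda1"), but when the disk name already ends in a digit a 'p' separator is inserted ("mmcblk0" becomes "mmcblk0p1") so the result stays unambiguous. A small user-space sketch of the same rule, with illustrative device names only:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static void disk_name_sketch(const char *disk, int partno, char *buf, size_t len)
{
	if (!partno)
		snprintf(buf, len, "%s", disk);
	else if (isdigit((unsigned char)disk[strlen(disk) - 1]))
		snprintf(buf, len, "%sp%d", disk, partno);   /* "mmcblk0" -> "mmcblk0p2" */
	else
		snprintf(buf, len, "%s%d", disk, partno);    /* "sda"     -> "sda2"      */
}

int main(void)
{
	char buf[32];

	disk_name_sketch("mmcblk0", 2, buf, sizeof(buf));
	printf("%s\n", buf);          /* prints mmcblk0p2 */
	disk_name_sketch("sda", 2, buf, sizeof(buf));
	printf("%s\n", buf);          /* prints sda2 */
	return 0;
}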
+ */ +const char *__bdevname(dev_t dev, char *buffer) +{ + scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(dev), MINOR(dev)); + return buffer; +} + +EXPORT_SYMBOL(__bdevname); + +static struct parsed_partitions * +check_partition(struct gendisk *hd, struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + kfree(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if (isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + state->limit = disk_max_parts(hd); + i = res = err = 0; + while (!res && check_part[i]) { + memset(&state->parts, 0, sizeof(state->parts)); + res = check_part[i++](state); + if (res < 0) { + /* We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + if (err) + /* The partition is unrecognized. So report I/O errors if there were any */ + res = err; + if (!res) + strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); + else if (warn_no_part) + strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); + + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + kfree(state); + return ERR_PTR(res); +} + +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + +static ssize_t part_start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); +} + +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); +} + +ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 
1 : 0); +} + +ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + +ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%u\n", p->discard_alignment); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + int cpu; + + cpu = part_stat_lock(); + part_round_stats(cpu, p); + part_stat_unlock(); + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u" + "\n", + part_stat_read(p, ios[READ]), + part_stat_read(p, merges[READ]), + (unsigned long long)part_stat_read(p, sectors[READ]), + jiffies_to_msecs(part_stat_read(p, ticks[READ])), + part_stat_read(p, ios[WRITE]), + part_stat_read(p, merges[WRITE]), + (unsigned long long)part_stat_read(p, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), + part_in_flight(p), + jiffies_to_msecs(part_stat_read(p, io_ticks)), + jiffies_to_msecs(part_stat_read(p, time_in_queue))); +} + +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), + atomic_read(&p->in_flight[1])); +} + +#ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 
0 : 1; + + return count; +} +#endif + +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); +static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, + NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); +#endif + +static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_ro.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, + &dev_attr_stat.attr, + &dev_attr_inflight.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif + NULL +}; + +static struct attribute_group part_attr_group = { + .attrs = part_attrs, +}; + +static const struct attribute_group *part_attr_groups[] = { + &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif + NULL +}; + +static void part_release(struct device *dev) +{ + struct hd_struct *p = dev_to_part(dev); + free_part_stats(p); + free_part_info(p); + kfree(p); +} + +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, "PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + +struct device_type part_type = { + .name = "partition", + .groups = part_attr_groups, + .release = part_release, + .uevent = part_uevent, +}; + +static void delete_partition_rcu_cb(struct rcu_head *head) +{ + struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); +} + +void __delete_partition(struct hd_struct *part) +{ + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + +void delete_partition(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = disk->part_tbl; + struct hd_struct *part; + + if (partno >= ptbl->len) + return; + + part = ptbl->part[partno]; + if (!part) + return; + + blk_free_devt(part_devt(part)); + rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + + hd_struct_put(part); +} + +static ssize_t whole_disk_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} +static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, + whole_disk_show, NULL); + +struct hd_struct *add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) +{ + struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; + int err; + + err = disk_expand_part_tbl(disk, partno); + if (err) + return ERR_PTR(err); + ptbl = disk->part_tbl; + + if (ptbl->part[partno]) + return ERR_PTR(-EBUSY); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-EBUSY); + + if (!init_part_stats(p)) { + err = -ENOMEM; + goto out_free; + } + 
pdev = part_to_dev(p); + + p->start_sect = start; + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); + p->nr_sects = len; + p->partno = partno; + p->policy = get_disk_ro(disk); + + if (info) { + struct partition_meta_info *pinfo = alloc_part_info(disk); + if (!pinfo) + goto out_free_stats; + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + dev_set_name(pdev, "%sp%d", dname, partno); + else + dev_set_name(pdev, "%s%d", dname, partno); + + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; + + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free_info; + pdev->devt = devt; + + /* delay uevent until 'holders' subdir is created */ + dev_set_uevent_suppress(pdev, 1); + err = device_add(pdev); + if (err) + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + dev_set_uevent_suppress(pdev, 0); + if (flags & ADDPART_FLAG_WHOLEDISK) { + err = device_create_file(pdev, &dev_attr_whole_disk); + if (err) + goto out_del; + } + + /* everything is up and running, commence */ + rcu_assign_pointer(ptbl->part[partno], p); + + /* suppress uevent if the disk suppresses it */ + if (!dev_get_uevent_suppress(ddev)) + kobject_uevent(&pdev->kobj, KOBJ_ADD); + + hd_ref_init(p); + return p; + +out_free_info: + free_part_info(p); +out_free_stats: + free_part_stats(p); +out_free: + kfree(p); + return ERR_PTR(err); +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); + blk_free_devt(devt); + return ERR_PTR(err); +} + +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct disk_part_iter piter; + struct hd_struct *part; + int res; + + if (bdev->bd_part_count) + return -EBUSY; + res = invalidate_partition(disk, 0); + if (res) + return res; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + + if (disk->fops->revalidate_disk) + disk->fops->revalidate_disk(disk); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) + return 0; + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. 
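The probe loop in check_partition() earlier in this file has a deliberate shape: each entry of check_part[] is tried in order until one returns a positive result, and negative returns (I/O errors) are remembered but do not stop the scan, so a later format can still claim the disk. A stripped-down user-space sketch of that control flow, with toy probes standing in for the real detectors:

#include <stdio.h>

struct state { int dummy; };
typedef int (*probe_fn)(struct state *);

/* Toy probes: 0 = "not my format", negative = I/O error, positive = recognized. */
static int probe_a(struct state *s) { (void)s; return 0;  }
static int probe_b(struct state *s) { (void)s; return -1; }
static int probe_c(struct state *s) { (void)s; return 1;  }

static probe_fn probes[] = { probe_a, probe_b, probe_c, NULL };

int main(void)
{
	struct state st = { 0 };
	int i = 0, res = 0, err = 0;

	while (!res && probes[i]) {
		res = probes[i++](&st);
		if (res < 0) {            /* remember the error but let later probes run */
			err = res;
			res = 0;
		}
	}
	if (res > 0)
		printf("partition table recognized by probe %d\n", i - 1);
	else if (err)
		printf("unable to read partition table\n");
	else
		printf("unknown partition table\n");
	return 0;
}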
+ */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } + return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * successfully read as we could be missing some partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* Detect the highest partition number and preallocate + * disk->part_tbl. This is an optimization and not strictly + * necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + + disk_expand_part_tbl(disk, highest); + + /* add partitions */ + for (p = 1; p < state->limit; p++) { + sector_t size, from; + struct partition_meta_info *info = NULL; + + size = state->parts[p].size; + if (!size) + continue; + + from = state->parts[p].from; + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; + continue; + } + + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); + + if (disk_unlock_native_capacity(disk)) { + /* free state and restart */ + goto rescan; + } else { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but + * we limit them to the end of the disk to avoid + * creating invalid block devices + */ + size = get_capacity(disk) - from; + } + } + + if (state->parts[p].has_info) + info = &state->parts[p].info; + part = add_partition(disk, p, from, size, + state->parts[p].flags, + &state->parts[p].info); + if (IS_ERR(part)) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + continue; + } +#ifdef CONFIG_BLK_DEV_MD + if (state->parts[p].flags & ADDPART_FLAG_RAID) + md_autodetect_dev(part_to_dev(part)->devt); +#endif + } + kfree(state); + return 0; +} + +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); + if (!IS_ERR(page)) { + if (PageError(page)) + goto fail; + p->v = page; + return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); +fail: + page_cache_release(page); + } + p->v = NULL; + return NULL; +} + +EXPORT_SYMBOL(read_dev_sector); diff --git a/fs/partitions/check.h b/fs/partitions/check.h new file mode 100644 index 00000000..d68bf4dc --- /dev/null +++ b/fs/partitions/check.h @@ -0,0 +1,49 @@ +#include +#include +#include + +/* + * 
add_gd_partition adds a partitions details to the devices partition + * description. + */ +struct parsed_partitions { + struct block_device *bdev; + char name[BDEVNAME_SIZE]; + struct { + sector_t from; + sector_t size; + int flags; + bool has_info; + struct partition_meta_info info; + } parts[DISK_MAX_PARTS]; + int next; + int limit; + bool access_beyond_eod; + char *pp_buf; +}; + +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + return read_dev_sector(state->bdev, n, p); +} + +static inline void +put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) +{ + if (n < p->limit) { + char tmp[1 + BDEVNAME_SIZE + 10 + 1]; + + p->parts[n].from = from; + p->parts[n].size = size; + snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); + strlcat(p->pp_buf, tmp, PAGE_SIZE); + } +} + +extern int warn_no_part; + diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c new file mode 100644 index 00000000..6296b403 --- /dev/null +++ b/fs/partitions/efi.c @@ -0,0 +1,675 @@ +/************************************************************ + * EFI GUID Partition Table handling + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * + * efi.[ch] by Matt Domsch + * Copyright 2000,2001,2002,2004 Dell Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * TODO: + * + * Changelog: + * Mon Nov 09 2004 Matt Domsch + * - test for valid PMBR and valid PGPT before ever reading + * AGPT, allow override with 'gpt' kernel command line option. + * - check for first/last_usable_lba outside of size of disk + * + * Tue Mar 26 2002 Matt Domsch + * - Ported to 2.5.7-pre1 and 2.5.7-dj2 + * - Applied patch to avoid fault in alternate header handling + * - cleaned up find_valid_gpt + * - On-disk structure and copy in memory is *always* LE now - + * swab fields as needed + * - remove print_gpt_header() + * - only use first max_p partition entries, to keep the kernel minor number + * and partition numbers tied. + * + * Mon Feb 04 2002 Matt Domsch + * - Removed __PRIPTR_PREFIX - not being used + * + * Mon Jan 14 2002 Matt Domsch + * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied + * + * Thu Dec 6 2001 Matt Domsch + * - Added compare_gpts(). + * - moved le_efi_guid_to_cpus() back into this file. GPT is the only + * thing that keeps EFI GUIDs on disk. + * - Changed gpt structure names and members to be simpler and more Linux-like. + * + * Wed Oct 17 2001 Matt Domsch + * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck + * + * Wed Oct 10 2001 Matt Domsch + * - Changed function comments to DocBook style per Andreas Dilger suggestion. + * + * Mon Oct 08 2001 Matt Domsch + * - Change read_lba() to use the page cache per Al Viro's work. 
+ * - print u64s properly on all architectures + * - fixed debug_printk(), now Dprintk() + * + * Mon Oct 01 2001 Matt Domsch + * - Style cleanups + * - made most functions static + * - Endianness addition + * - remove test for second alternate header, as it's not per spec, + * and is unnecessary. There's now a method to read/write the last + * sector of an odd-sized disk from user space. No tools have ever + * been released which used this code, so it's effectively dead. + * - Per Asit Mallick of Intel, added a test for a valid PMBR. + * - Added kernel command line option 'gpt' to override valid PMBR test. + * + * Wed Jun 6 2001 Martin Wilck + * - added devfs volume UUID support (/dev/volumes/uuids) for + * mounting file systems by the partition GUID. + * + * Tue Dec 5 2000 Matt Domsch + * - Moved crc32() to linux/lib, added efi_crc32(). + * + * Thu Nov 30 2000 Matt Domsch + * - Replaced Intel's CRC32 function with an equivalent + * non-license-restricted version. + * + * Wed Oct 25 2000 Matt Domsch + * - Fixed the last_lba() call to return the proper last block + * + * Thu Oct 12 2000 Matt Domsch + * - Thanks to Andries Brouwer for his debugging assistance. + * - Code works, detects all the partitions. + * + ************************************************************/ +#include +#include +#include +#include +#include "check.h" +#include "efi.h" + +/* This allows a kernel command line option 'gpt' to override + * the test for invalid PMBR. Not __initdata because reloading + * the partition tables happens after init too. + */ +static int force_gpt; +static int __init +force_gpt_fn(char *str) +{ + force_gpt = 1; + return 1; +} +__setup("gpt", force_gpt_fn); + + +/** + * efi_crc32() - EFI version of crc32 function + * @buf: buffer to calculate crc32 of + * @len - length of buf + * + * Description: Returns EFI-style CRC32 value for @buf + * + * This function uses the little endian Ethernet polynomial + * but seeds the function with ~0, and xor's with ~0 at the end. + * Note, the EFI Specification, v1.02, has a reference to + * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). + */ +static inline u32 +efi_crc32(const void *buf, unsigned long len) +{ + return (crc32(~0L, buf, len) ^ ~0L); +} + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; +} + +static inline int +pmbr_part_valid(struct partition *part) +{ + if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && + le32_to_cpu(part->start_sect) == 1UL) + return 1; + return 0; +} + +/** + * is_pmbr_valid(): test Protective MBR for validity + * @mbr: pointer to a legacy mbr structure + * + * Description: Returns 1 if PMBR is valid, 0 otherwise. 
+ * Validity depends on two things: + * 1) MSDOS signature is in the last two bytes of the MBR + * 2) One partition of type 0xEE is found + */ +static int +is_pmbr_valid(legacy_mbr *mbr) +{ + int i; + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) + return 0; + for (i = 0; i < 4; i++) + if (pmbr_part_valid(&mbr->partition_record[i])) + return 1; + return 0; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @size_t + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) +{ + size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + + if (!buffer || lba > last_lba(bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, n++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount +=copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_read_gpt_entries(): reads partition entries from disk + * @state + * @gpt - GPT header + * + * Description: Returns ptes on success, NULL on error. + * Allocates space for PTEs based on information found in @gpt. + * Notes: remember to free pte when you're done! + */ +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) +{ + size_t count; + gpt_entry *pte; + + if (!gpt) + return NULL; + + count = le32_to_cpu(gpt->num_partition_entries) * + le32_to_cpu(gpt->sizeof_partition_entry); + if (!count) + return NULL; + pte = kzalloc(count, GFP_KERNEL); + if (!pte) + return NULL; + + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), + (u8 *) pte, + count) < count) { + kfree(pte); + pte=NULL; + return NULL; + } + return pte; +} + +/** + * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk + * @state + * @lba is the Logical Block Address of the partition table + * + * Description: returns GPT header on success, NULL on error. Allocates + * and fills a GPT header starting at @ from @state->bdev. + * Note: remember to free gpt when finished with it. + */ +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) +{ + gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(state->bdev); + + gpt = kzalloc(ssz, GFP_KERNEL); + if (!gpt) + return NULL; + + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { + kfree(gpt); + gpt=NULL; + return NULL; + } + + return gpt; +} + +/** + * is_gpt_valid() - tests one GPT header and PTEs for validity + * @state + * @lba is the logical block address of the GPT header to test + * @gpt is a GPT header ptr, filled on return. + * @ptes is a PTEs ptr, filled on return. + * + * Description: returns 1 if valid, 0 on error. + * If valid, returns pointers to newly allocated GPT header and PTEs. 
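To restate the protective-MBR test above in one place: the sector must end with the 0xAA55 MSDOS signature and at least one of the four primary slots must have type 0xEE (EFI GPT) with a starting LBA of 1. A stand-alone sketch over a raw 512-byte sector, using the classic MBR offsets rather than the kernel's struct partition:

#include <stdint.h>

#define MBR_PART_OFS 446          /* offset of the four 16-byte partition records */
#define MBR_SIG_OFS  510          /* offset of the two-byte 0xAA55 signature      */

static uint32_t le32(const uint8_t *p)
{
	return p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

/* Returns 1 if the sector looks like a valid protective MBR, 0 otherwise. */
static int pmbr_valid_sketch(const uint8_t sector[512])
{
	int i;

	if (sector[MBR_SIG_OFS] != 0x55 || sector[MBR_SIG_OFS + 1] != 0xAA)
		return 0;
	for (i = 0; i < 4; i++) {
		const uint8_t *e = sector + MBR_PART_OFS + i * 16;

		/* type byte at entry offset 4, starting LBA (le32) at offset 8 */
		if (e[4] == 0xEE && le32(e + 8) == 1)
			return 1;
	}
	return 0;
}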
+ */ +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) +{ + u32 crc, origcrc; + u64 lastlba; + + if (!ptes) + return 0; + if (!(*gpt = alloc_read_gpt_header(state, lba))) + return 0; + + /* Check the GUID Partition Table signature */ + if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); + goto fail; + } + + /* Check the GUID Partition Table header size */ + if (le32_to_cpu((*gpt)->header_size) > + bdev_logical_block_size(state->bdev)) { + pr_debug("GUID Partition Table Header size is wrong: %u > %u\n", + le32_to_cpu((*gpt)->header_size), + bdev_logical_block_size(state->bdev)); + goto fail; + } + + /* Check the GUID Partition Table CRC */ + origcrc = le32_to_cpu((*gpt)->header_crc32); + (*gpt)->header_crc32 = 0; + crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); + + if (crc != origcrc) { + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); + goto fail; + } + (*gpt)->header_crc32 = cpu_to_le32(origcrc); + + /* Check that the my_lba entry points to the LBA that contains + * the GUID Partition Table */ + if (le64_to_cpu((*gpt)->my_lba) != lba) { + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); + goto fail; + } + + /* Check the first_usable_lba and last_usable_lba are + * within the disk. + */ + lastlba = last_lba(state->bdev); + if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + + /* Check that sizeof_partition_entry has the correct value */ + if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { + pr_debug("GUID Partitition Entry Size check failed.\n"); + goto fail; + } + + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) + goto fail; + + /* Check the GUID Partition Entry Array CRC */ + crc = efi_crc32((const unsigned char *) (*ptes), + le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry)); + + if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { + pr_debug("GUID Partitition Entry Array CRC check failed.\n"); + goto fail_ptes; + } + + /* We're done, all's well */ + return 1; + + fail_ptes: + kfree(*ptes); + *ptes = NULL; + fail: + kfree(*gpt); + *gpt = NULL; + return 0; +} + +/** + * is_pte_valid() - tests one PTE for validity + * @pte is the pte to check + * @lastlba is last lba of the disk + * + * Description: returns 1 if valid, 0 on error. + */ +static inline int +is_pte_valid(const gpt_entry *pte, const u64 lastlba) +{ + if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || + le64_to_cpu(pte->starting_lba) > lastlba || + le64_to_cpu(pte->ending_lba) > lastlba) + return 0; + return 1; +} + +/** + * compare_gpts() - Search disk for valid GPT headers and PTEs + * @pgpt is the primary GPT header + * @agpt is the alternate GPT header + * @lastlba is the last LBA number + * Description: Returns nothing. 
Sanity checks pgpt and agpt fields + * and prints warnings on discrepancies. + * + */ +static void +compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) +{ + int error_found = 0; + if (!pgpt || !agpt) + return; + if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { + printk(KERN_WARNING + "GPT:Primary header LBA != Alt. header alternate_lba\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->my_lba), + (unsigned long long)le64_to_cpu(agpt->alternate_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { + printk(KERN_WARNING + "GPT:Primary header alternate_lba != Alt. header my_lba\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)le64_to_cpu(agpt->my_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->first_usable_lba) != + le64_to_cpu(agpt->first_usable_lba)) { + printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), + (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->last_usable_lba) != + le64_to_cpu(agpt->last_usable_lba)) { + printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), + (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); + error_found++; + } + if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { + printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + error_found++; + } + if (le32_to_cpu(pgpt->num_partition_entries) != + le32_to_cpu(agpt->num_partition_entries)) { + printk(KERN_WARNING "GPT:num_partition_entries don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->num_partition_entries), + le32_to_cpu(agpt->num_partition_entries)); + error_found++; + } + if (le32_to_cpu(pgpt->sizeof_partition_entry) != + le32_to_cpu(agpt->sizeof_partition_entry)) { + printk(KERN_WARNING + "GPT:sizeof_partition_entry values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->sizeof_partition_entry), + le32_to_cpu(agpt->sizeof_partition_entry)); + error_found++; + } + if (le32_to_cpu(pgpt->partition_entry_array_crc32) != + le32_to_cpu(agpt->partition_entry_array_crc32)) { + printk(KERN_WARNING + "GPT:partition_entry_array_crc32 values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->partition_entry_array_crc32), + le32_to_cpu(agpt->partition_entry_array_crc32)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { + printk(KERN_WARNING + "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (le64_to_cpu(agpt->my_lba) != lastlba) { + printk(KERN_WARNING + "GPT:Alternate GPT header not at the end of the disk.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(agpt->my_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (error_found) + printk(KERN_WARNING + "GPT: Use GNU Parted to correct GPT errors.\n"); + return; +} + +/** + * find_valid_gpt() - Search disk for valid GPT headers and PTEs + * @state + * @gpt is a GPT header ptr, filled on return. + * @ptes is a PTEs ptr, filled on return. + * Description: Returns 1 if valid, 0 on error. 
+ * If valid, returns pointers to newly allocated GPT header and PTEs. + * Validity depends on PMBR being valid (or being overridden by the + * 'gpt' kernel command line option) and finding either the Primary + * GPT header and PTEs valid, or the Alternate GPT header and PTEs + * valid. If the Primary GPT header is not valid, the Alternate GPT header + * is not checked unless the 'gpt' kernel command line option is passed. + * This protects against devices which misreport their size, and forces + * the user to decide to use the Alternate GPT. + */ +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) +{ + int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; + gpt_header *pgpt = NULL, *agpt = NULL; + gpt_entry *pptes = NULL, *aptes = NULL; + legacy_mbr *legacymbr; + u64 lastlba; + + if (!ptes) + return 0; + + lastlba = last_lba(state->bdev); + if (!force_gpt) { + /* This will be added to the EFI Spec. per Intel after v1.02. */ + legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); + if (legacymbr) { + read_lba(state, 0, (u8 *) legacymbr, + sizeof (*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr); + kfree(legacymbr); + } + if (!good_pmbr) + goto fail; + } + + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, + &pgpt, &pptes); + if (good_pgpt) + good_agpt = is_gpt_valid(state, + le64_to_cpu(pgpt->alternate_lba), + &agpt, &aptes); + if (!good_agpt && force_gpt) + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + + /* The obviously unsuccessful case */ + if (!good_pgpt && !good_agpt) + goto fail; + + compare_gpts(pgpt, agpt, lastlba); + + /* The good cases */ + if (good_pgpt) { + *gpt = pgpt; + *ptes = pptes; + kfree(agpt); + kfree(aptes); + if (!good_agpt) { + printk(KERN_WARNING + "Alternate GPT is invalid, " + "using primary GPT.\n"); + } + return 1; + } + else if (good_agpt) { + *gpt = agpt; + *ptes = aptes; + kfree(pgpt); + kfree(pptes); + printk(KERN_WARNING + "Primary GPT is invalid, using alternate GPT.\n"); + return 1; + } + + fail: + kfree(pgpt); + kfree(agpt); + kfree(pptes); + kfree(aptes); + *gpt = NULL; + *ptes = NULL; + return 0; +} + +/** + * efi_partition(struct parsed_partitions *state) + * @state + * + * Description: called from check.c, if the disk contains GPT + * partitions, sets up partition entries in the kernel. + * + * If the first block on the disk is a legacy MBR, + * it will get handled by msdos_partition(). + * If it's a Protective MBR, we'll handle it here. + * + * We do not create a Linux partition for GPT, but + * only for the actual data partitions. + * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + * + */ +int efi_partition(struct parsed_partitions *state) +{ + gpt_header *gpt = NULL; + gpt_entry *ptes = NULL; + u32 i; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + u8 unparsed_guid[37]; + + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { + kfree(gpt); + kfree(ptes); + return 0; + } + + pr_debug("GUID Partition Table is valid! 
Yea!\n"); + + for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_count = 0; + unsigned label_max; + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + continue; + + put_partition(state, i+1, start * ssz, size * ssz); + + /* If this is a RAID volume, tell md */ + if (!efi_guidcmp(ptes[i].partition_type_guid, + PARTITION_LINUX_RAID_GUID)) + state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + /* Instead of doing a manual swap to big endian, reuse the + * common ASCII hex format as the interim. + */ + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); + part_pack_uuid(unparsed_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. */ + label_max = min(sizeof(info->volname) - 1, + sizeof(ptes[i].partition_name)); + info->volname[label_max] = 0; + while (label_count < label_max) { + u8 c = ptes[i].partition_name[label_count] & 0xff; + if (c && !isprint(c)) + c = '!'; + info->volname[label_count] = c; + label_count++; + } + state->parts[i + 1].has_info = true; + } + kfree(ptes); + kfree(gpt); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h new file mode 100644 index 00000000..b69ab729 --- /dev/null +++ b/fs/partitions/efi.h @@ -0,0 +1,134 @@ +/************************************************************ + * EFI GUID Partition Table + * Per Intel EFI Specification v1.02 + * http://developer.intel.com/technology/efi/efi.htm + * + * By Matt Domsch Fri Sep 22 22:15:56 CDT 2000 + * Copyright 2000,2001 Dell Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + ************************************************************/ + +#ifndef FS_PART_EFI_H_INCLUDED +#define FS_PART_EFI_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#define MSDOS_MBR_SIGNATURE 0xaa55 +#define EFI_PMBR_OSTYPE_EFI 0xEF +#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE + +#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL +#define GPT_HEADER_REVISION_V1 0x00010000 +#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 + +#define PARTITION_SYSTEM_GUID \ + EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ + 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) +#define LEGACY_MBR_PARTITION_GUID \ + EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ + 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) +#define PARTITION_MSFT_RESERVED_GUID \ + EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ + 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) +#define PARTITION_BASIC_DATA_GUID \ + EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ + 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) +#define PARTITION_LINUX_RAID_GUID \ + EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ + 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) +#define PARTITION_LINUX_SWAP_GUID \ + EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ + 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) +#define PARTITION_LINUX_LVM_GUID \ + EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ + 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) + +typedef struct _gpt_header { + __le64 signature; + __le32 revision; + __le32 header_size; + __le32 header_crc32; + __le32 reserved1; + __le64 my_lba; + __le64 alternate_lba; + __le64 first_usable_lba; + __le64 last_usable_lba; + efi_guid_t disk_guid; + __le64 partition_entry_lba; + __le32 num_partition_entries; + __le32 sizeof_partition_entry; + __le32 partition_entry_array_crc32; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ +} __attribute__ ((packed)) gpt_header; + +typedef struct _gpt_entry_attributes { + u64 required_to_function:1; + u64 reserved:47; + u64 type_guid_specific:16; +} __attribute__ ((packed)) gpt_entry_attributes; + +typedef struct _gpt_entry { + efi_guid_t partition_type_guid; + efi_guid_t unique_partition_guid; + __le64 starting_lba; + __le64 ending_lba; + gpt_entry_attributes attributes; + efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; +} __attribute__ ((packed)) gpt_entry; + +typedef struct _legacy_mbr { + u8 boot_code[440]; + __le32 unique_mbr_signature; + __le16 unknown; + struct partition partition_record[4]; + __le16 signature; +} __attribute__ ((packed)) legacy_mbr; + +/* Functions */ +extern int efi_partition(struct parsed_partitions *state); + +#endif + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * -------------------------------------------------------------------------- + * Local variables: + * c-indent-level: 4 + * c-brace-imaginary-offset: 0 + * c-brace-offset: -4 + * c-argdecl-indent: 4 + * c-label-offset: -4 + * c-continued-statement-offset: 4 + * c-continued-brace-offset: 0 + * indent-tabs-mode: nil + * tab-width: 8 + * End: + */ diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c new file mode 100644 index 00000000..d513a07f --- /dev/null +++ b/fs/partitions/ibm.c @@ -0,0 +1,275 @@ +/* + * File...........: linux/fs/partitions/ibm.c + * Author(s)......: Holger Smolinski + * Volker Sameske + * Bugreports.to..: + * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" +#include "ibm.h" + +/* + * compute the block number from a + * cyl-cyl-head-head structure + */ +static sector_t +cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) { + + sector_t cyl; + __u16 head; + + /*decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors; +} + +/* + * compute the block number from a + * cyl-cyl-head-head-block structure + */ +static sector_t +cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { + + sector_t cyl; + __u16 head; + + /*decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors + + ptr->b; +} + +/* + */ +int ibm_partition(struct parsed_partitions *state) +{ + struct block_device *bdev = state->bdev; + int blocksize, res; + loff_t i_size, offset, size, fmt_size; + dasd_information2_t *info; + struct hd_geometry *geo; + char type[5] = {0,}; + char name[7] = {0,}; + union label_t { + struct vtoc_volume_label_cdl vol; + struct vtoc_volume_label_ldl lnx; + struct vtoc_cms_label cms; + } *label; + unsigned char *data; + Sector sect; + sector_t labelsect; + char tmp[64]; + + res = 0; + blocksize = bdev_logical_block_size(bdev); + if (blocksize <= 0) + goto out_exit; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + goto out_exit; + + info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); + if (info == NULL) + goto out_exit; + geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); + if (geo == NULL) + goto out_nogeo; + label = kmalloc(sizeof(union label_t), GFP_KERNEL); + if (label == NULL) + goto out_nolab; + + if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 || + ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) + goto out_freeall; + + /* + * Special case for FBA disks: label sector does not depend on + * blocksize. + */ + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + labelsect = info->label_block; + else + labelsect = info->label_block * (blocksize >> 9); + + /* + * Get volume label, extract name and type. 
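The two helpers above undo the large-volume encoding used by DASD: the cylinder number is spread over the cc field and the upper 12 bits of hh, so cylinder = ((hh & 0xFFF0) << 12) | cc and head = hh & 0x000F, and the linear block is cylinder * heads * sectors + head * sectors (plus the record number for the cchhb variant). A user-space sketch of the same arithmetic with plain integers instead of the vtoc structures; the geometry used in main() is hypothetical:

#include <stdint.h>
#include <stdio.h>

static uint64_t cchh2blk_sketch(uint16_t cc, uint16_t hh,
				unsigned heads, unsigned sectors)
{
	uint64_t cyl = ((uint64_t)(hh & 0xFFF0) << 12) | cc;   /* large-volume cylinder */
	unsigned head = hh & 0x000F;

	return cyl * heads * sectors + head * sectors;
}

int main(void)
{
	/* Hypothetical geometry: 15 heads, 12 blocks per track. */
	printf("%llu\n", (unsigned long long)cchh2blk_sketch(0x0010, 0x0003, 15, 12));
	return 0;
}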
+ */ + data = read_part_sector(state, labelsect, §); + if (data == NULL) + goto out_readerr; + + memcpy(label, data, sizeof(union label_t)); + put_dev_sector(sect); + + if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) { + strncpy(type, label->vol.vollbl, 4); + strncpy(name, label->vol.volid, 6); + } else { + strncpy(type, label->lnx.vollbl, 4); + strncpy(name, label->lnx.volid, 6); + } + EBCASC(type, 4); + EBCASC(name, 6); + + res = 1; + + /* + * Three different formats: LDL, CDL and unformated disk + * + * identified by info->format + * + * unformated disks we do not have to care about + */ + if (info->format == DASD_FORMAT_LDL) { + if (strncmp(type, "CMS1", 4) == 0) { + /* + * VM style CMS1 labeled disk + */ + blocksize = label->cms.block_size; + if (label->cms.disk_offset != 0) { + snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* disk is reserved minidisk */ + offset = label->cms.disk_offset; + size = (label->cms.block_count - 1) + * (blocksize >> 9); + } else { + snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + offset = (info->label_block + 1); + size = label->cms.block_count + * (blocksize >> 9); + } + put_partition(state, 1, offset*(blocksize >> 9), + size-offset*(blocksize >> 9)); + } else { + if (strncmp(type, "LNX1", 4) == 0) { + snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + if (label->lnx.ldl_version == 0xf2) { + fmt_size = label->lnx.formatted_blocks + * (blocksize >> 9); + } else if (!strcmp(info->type, "ECKD")) { + /* formated w/o large volume support */ + fmt_size = geo->cylinders * geo->heads + * geo->sectors * (blocksize >> 9); + } else { + /* old label and no usable disk geometry + * (e.g. 
DIAG) */ + fmt_size = i_size >> 9; + } + size = i_size >> 9; + if (fmt_size < size) + size = fmt_size; + offset = (info->label_block + 1); + } else { + /* unlabeled disk */ + strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); + size = i_size >> 9; + offset = (info->label_block + 1); + } + put_partition(state, 1, offset*(blocksize >> 9), + size-offset*(blocksize >> 9)); + } + } else if (info->format == DASD_FORMAT_CDL) { + /* + * New style CDL formatted disk + */ + sector_t blk; + int counter; + + /* + * check if VOL1 label is available + * if not, something is wrong, skipping partition detection + */ + if (strncmp(type, "VOL1", 4) == 0) { + snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* + * get block number and read then go through format1 + * labels + */ + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; + counter = 0; + data = read_part_sector(state, blk * (blocksize/512), + §); + while (data != NULL) { + struct vtoc_format1_label f1; + + memcpy(&f1, data, + sizeof(struct vtoc_format1_label)); + put_dev_sector(sect); + + /* skip FMT4 / FMT5 / FMT7 labels */ + if (f1.DS1FMTID == _ascebc['4'] + || f1.DS1FMTID == _ascebc['5'] + || f1.DS1FMTID == _ascebc['7'] + || f1.DS1FMTID == _ascebc['9']) { + blk++; + data = read_part_sector(state, + blk * (blocksize/512), §); + continue; + } + + /* only FMT1 and 8 labels valid at this point */ + if (f1.DS1FMTID != _ascebc['1'] && + f1.DS1FMTID != _ascebc['8']) + break; + + /* OK, we got valid partition data */ + offset = cchh2blk(&f1.DS1EXT1.llimit, geo); + size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - + offset + geo->sectors; + if (counter >= state->limit) + break; + put_partition(state, counter + 1, + offset * (blocksize >> 9), + size * (blocksize >> 9)); + counter++; + blk++; + data = read_part_sector(state, + blk * (blocksize/512), §); + } + + if (!data) + /* Are we not supposed to report this ? */ + goto out_readerr; + } else + printk(KERN_WARNING "Warning, expected Label VOL1 not " + "found, treating as CDL formated Disk"); + + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + goto out_freeall; + + +out_readerr: + res = -1; +out_freeall: + kfree(label); +out_nolab: + kfree(geo); +out_nogeo: + kfree(info); +out_exit: + return res; +} diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h new file mode 100644 index 00000000..08fb0804 --- /dev/null +++ b/fs/partitions/ibm.h @@ -0,0 +1 @@ +int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c new file mode 100644 index 00000000..0ea19312 --- /dev/null +++ b/fs/partitions/karma.c @@ -0,0 +1,57 @@ +/* + * fs/partitions/karma.c + * Rio Karma partition info. 
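One detail of the CDL walk above is easy to overlook: each format-1 label gives an extent as lower and upper cylinder/head limits, the upper limit is inclusive, so the size adds one full track (geo->sectors) on top of the difference, and everything is then scaled from the DASD block size to 512-byte sectors with (blocksize >> 9). A sketch of that arithmetic with hypothetical values; the two limit values stand for results of the cchh2blk_sketch() helper shown earlier:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned sectors   = 12;      /* blocks per track (hypothetical geometry) */
	unsigned blocksize = 4096;    /* DASD block size in bytes                 */
	uint64_t llimit = 1800;       /* cchh2blk() of the extent's lower limit   */
	uint64_t ulimit = 3599;       /* cchh2blk() of the extent's upper limit   */
	uint64_t offset = llimit;
	uint64_t size   = ulimit - llimit + sectors;   /* upper limit is inclusive */

	printf("start %llu size %llu (512-byte sectors)\n",
	       (unsigned long long)(offset * (blocksize >> 9)),
	       (unsigned long long)(size * (blocksize >> 9)));
	return 0;
}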
+ * + * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) + * based on osf.c + */ + +#include "check.h" +#include "karma.h" + +int karma_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + Sector sect; + unsigned char *data; + struct disklabel { + u8 d_reserved[270]; + struct d_partition { + __le32 p_res; + u8 p_fstype; + u8 p_res2[3]; + __le32 p_offset; + __le32 p_size; + } d_partitions[2]; + u8 d_blank[208]; + __le16 d_magic; + } __attribute__((packed)) *label; + struct d_partition *p; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *)data; + if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { + put_dev_sector(sect); + return 0; + } + + p = label->d_partitions; + for (i = 0 ; i < 2; i++, p++) { + if (slot == state->limit) + break; + + if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { + put_partition(state, slot, le32_to_cpu(p->p_offset), + le32_to_cpu(p->p_size)); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} + diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h new file mode 100644 index 00000000..c764b2e9 --- /dev/null +++ b/fs/partitions/karma.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/karma.h + */ + +#define KARMA_LABEL_MAGIC 0xAB56 + +int karma_partition(struct parsed_partitions *state); + diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c new file mode 100644 index 00000000..af9fdf04 --- /dev/null +++ b/fs/partitions/ldm.c @@ -0,0 +1,1568 @@ +/** + * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) + * + * Copyright (C) 2001,2002 Richard Russon + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program (in the main directory of the source in the file COPYING); if + * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "ldm.h" +#include "check.h" +#include "msdos.h" + +/** + * ldm_debug/info/error/crit - Output an error message + * @f: A printf format string containing the message + * @...: Variables to substitute into @f + * + * ldm_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. + */ +#ifndef CONFIG_LDM_DEBUG +#define ldm_debug(...) do {} while (0) +#else +#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) +#endif + +#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) +#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) +#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) + +__attribute__ ((format (printf, 3, 4))) +static void _ldm_printk (const char *level, const char *function, + const char *fmt, ...) 
+{ + static char buf[128]; + va_list args; + + va_start (args, fmt); + vsnprintf (buf, sizeof (buf), fmt, args); + va_end (args); + + printk ("%s%s(): %s\n", level, function, buf); +} + +/** + * ldm_parse_hexbyte - Convert a ASCII hex number to a byte + * @src: Pointer to at least 2 characters to convert. + * + * Convert a two character ASCII hex string to a number. + * + * Return: 0-255 Success, the byte was parsed correctly + * -1 Error, an invalid character was supplied + */ +static int ldm_parse_hexbyte (const u8 *src) +{ + unsigned int x; /* For correct wrapping */ + int h; + + /* high part */ + x = h = hex_to_bin(src[0]); + if (h < 0) + return -1; + + /* low part */ + h = hex_to_bin(src[1]); + if (h < 0) + return -1; + + return (x << 4) + h; +} + +/** + * ldm_parse_guid - Convert GUID from ASCII to binary + * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * @dest: Memory block to hold binary GUID (16 bytes) + * + * N.B. The GUID need not be NULL terminated. + * + * Return: 'true' @dest contains binary GUID + * 'false' @dest contents are undefined + */ +static bool ldm_parse_guid (const u8 *src, u8 *dest) +{ + static const int size[] = { 4, 2, 2, 2, 6 }; + int i, j, v; + + if (src[8] != '-' || src[13] != '-' || + src[18] != '-' || src[23] != '-') + return false; + + for (j = 0; j < 5; j++, src++) + for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) + if ((v = ldm_parse_hexbyte (src)) < 0) + return false; + + return true; +} + +/** + * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure + * @data: Raw database PRIVHEAD structure loaded from the device + * @ph: In-memory privhead structure in which to return parsed information + * + * This parses the LDM database PRIVHEAD structure supplied in @data and + * sets up the in-memory privhead structure @ph with the obtained information. + * + * Return: 'true' @ph contains the PRIVHEAD data + * 'false' @ph contents are undefined + */ +static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) +{ + bool is_vista = false; + + BUG_ON(!data || !ph); + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { + ldm_error("Cannot find PRIVHEAD structure. LDM database is" + " corrupt. Aborting."); + return false; + } + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); + /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ + if (ph->ver_major == 2 && ph->ver_minor == 12) + is_vista = true; + if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { + ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." + " Aborting.", ph->ver_major, ph->ver_minor); + return false; + } + ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, + ph->ver_minor, is_vista ? "Vista" : "2000/XP"); + if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ + /* Warn the user and continue, carefully. 
*/ + ldm_info("Database is normally %u bytes, it claims to " + "be %llu bytes.", LDM_DB_SIZE, + (unsigned long long)ph->config_size); + } + if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + + ph->logical_disk_size > ph->config_start)) { + ldm_error("PRIVHEAD disk size doesn't match real disk size"); + return false; + } + if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { + ldm_error("PRIVHEAD contains an invalid GUID."); + return false; + } + ldm_debug("Parsed PRIVHEAD successfully."); + return true; +} + +/** + * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure + * @data: Raw database TOCBLOCK structure loaded from the device + * @toc: In-memory toc structure in which to return parsed information + * + * This parses the LDM Database TOCBLOCK (table of contents) structure supplied + * in @data and sets up the in-memory tocblock structure @toc with the obtained + * information. + * + * N.B. The *_start and *_size values returned in @toc are not range-checked. + * + * Return: 'true' @toc contains the TOCBLOCK data + * 'false' @toc contents are undefined + */ +static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) +{ + BUG_ON (!data || !toc); + + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { + ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); + return false; + } + strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); + toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); + + if (strncmp (toc->bitmap1_name, TOC_BITMAP1, + sizeof (toc->bitmap1_name)) != 0) { + ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", + TOC_BITMAP1, toc->bitmap1_name); + return false; + } + strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); + toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); + if (strncmp (toc->bitmap2_name, TOC_BITMAP2, + sizeof (toc->bitmap2_name)) != 0) { + ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", + TOC_BITMAP2, toc->bitmap2_name); + return false; + } + ldm_debug ("Parsed TOCBLOCK successfully."); + return true; +} + +/** + * ldm_parse_vmdb - Read the LDM Database VMDB structure + * @data: Raw database VMDB structure loaded from the device + * @vm: In-memory vmdb structure in which to return parsed information + * + * This parses the LDM Database VMDB structure supplied in @data and sets up + * the in-memory vmdb structure @vm with the obtained information. + * + * N.B. The *_start, *_size and *_seq values will be range-checked later. + * + * Return: 'true' @vm contains VMDB info + * 'false' @vm contents are undefined + */ +static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) +{ + BUG_ON (!data || !vm); + + if (MAGIC_VMDB != get_unaligned_be32(data)) { + ldm_crit ("Cannot find the VMDB, database may be corrupt."); + return false; + } + + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); + if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { + ldm_error ("Expected VMDB version %d.%d, got %d.%d. 
" + "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); + return false; + } + + vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); + + ldm_debug ("Parsed VMDB successfully."); + return true; +} + +/** + * ldm_compare_privheads - Compare two privhead objects + * @ph1: First privhead + * @ph2: Second privhead + * + * This compares the two privhead structures @ph1 and @ph2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_privheads (const struct privhead *ph1, + const struct privhead *ph2) +{ + BUG_ON (!ph1 || !ph2); + + return ((ph1->ver_major == ph2->ver_major) && + (ph1->ver_minor == ph2->ver_minor) && + (ph1->logical_disk_start == ph2->logical_disk_start) && + (ph1->logical_disk_size == ph2->logical_disk_size) && + (ph1->config_start == ph2->config_start) && + (ph1->config_size == ph2->config_size) && + !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE)); +} + +/** + * ldm_compare_tocblocks - Compare two tocblock objects + * @toc1: First toc + * @toc2: Second toc + * + * This compares the two tocblock structures @toc1 and @toc2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_tocblocks (const struct tocblock *toc1, + const struct tocblock *toc2) +{ + BUG_ON (!toc1 || !toc2); + + return ((toc1->bitmap1_start == toc2->bitmap1_start) && + (toc1->bitmap1_size == toc2->bitmap1_size) && + (toc1->bitmap2_start == toc2->bitmap2_start) && + (toc1->bitmap2_size == toc2->bitmap2_size) && + !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, + sizeof (toc1->bitmap1_name)) && + !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, + sizeof (toc1->bitmap2_name))); +} + +/** + * ldm_validate_privheads - Compare the primary privhead with its backups + * @state: Partition check state including device holding the LDM Database + * @ph1: Memory struct to fill with ph contents + * + * Read and compare all three privheads from disk. + * + * The privheads on disk show the size and location of the main disk area and + * the configuration area (the database). The values are range-checked against + * @hd, which contains the real size of the disk. 
+ * + * Return: 'true' Success + * 'false' Error + */ +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) +{ + static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; + struct privhead *ph[3] = { ph1 }; + Sector sect; + u8 *data; + bool result = false; + long num_sects; + int i; + + BUG_ON (!state || !ph1); + + ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); + ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); + if (!ph[1] || !ph[2]) { + ldm_crit ("Out of memory."); + goto out; + } + + /* off[1 & 2] are relative to ph[0]->config_start */ + ph[0]->config_start = 0; + + /* Read and parse privheads */ + for (i = 0; i < 3; i++) { + data = read_part_sector(state, ph[0]->config_start + off[i], + §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + result = ldm_parse_privhead (data, ph[i]); + put_dev_sector (sect); + if (!result) { + ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ + if (i < 2) + goto out; /* Already logged */ + else + break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ + } + } + + num_sects = state->bdev->bd_inode->i_size >> 9; + + if ((ph[0]->config_start > num_sects) || + ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { + ldm_crit ("Database extends beyond the end of the disk."); + goto out; + } + + if ((ph[0]->logical_disk_start > ph[0]->config_start) || + ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) + > ph[0]->config_start)) { + ldm_crit ("Disk and database overlap."); + goto out; + } + + if (!ldm_compare_privheads (ph[0], ph[1])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + } + /* FIXME ignore this for now + if (!ldm_compare_privheads (ph[0], ph[2])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + }*/ + ldm_debug ("Validated PRIVHEADs successfully."); + result = true; +out: + kfree (ph[1]); + kfree (ph[2]); + return result; +} + +/** + * ldm_validate_tocblocks - Validate the table of contents and its backups + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * Find and compare the four tables of contents of the LDM Database stored on + * @state->bdev and return the parsed information into @toc1. + * + * The offsets and sizes of the configs are range-checked against a privhead. + * + * Return: 'true' @toc1 contains validated TOCBLOCK info + * 'false' @toc1 contents are undefined + */ +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; + struct tocblock *tb[4]; + struct privhead *ph; + Sector sect; + u8 *data; + int i, nr_tbs; + bool result = false; + + BUG_ON(!state || !ldb); + ph = &ldb->ph; + tb[0] = &ldb->toc; + tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); + if (!tb[1]) { + ldm_crit("Out of memory."); + goto err; + } + tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); + tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); + /* + * Try to read and parse all four TOCBLOCKs. + * + * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so + * skip any that fail as long as we get at least one valid TOCBLOCK. 
+ */ + for (nr_tbs = i = 0; i < 4; i++) { + data = read_part_sector(state, base + off[i], &sect); + if (!data) { + ldm_error("Disk read failed for TOCBLOCK %d.", i); + continue; + } + if (ldm_parse_tocblock(data, tb[nr_tbs])) + nr_tbs++; + put_dev_sector(sect); + } + if (!nr_tbs) { + ldm_crit("Failed to find a valid TOCBLOCK."); + goto err; + } + /* Range check the TOCBLOCK against a privhead. */ + if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || + ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > + ph->config_size)) { + ldm_crit("The bitmaps are out of range. Giving up."); + goto err; + } + /* Compare all loaded TOCBLOCKs. */ + for (i = 1; i < nr_tbs; i++) { + if (!ldm_compare_tocblocks(tb[0], tb[i])) { + ldm_crit("TOCBLOCKs 0 and %d do not match.", i); + goto err; + } + } + ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); + result = true; +err: + kfree(tb[1]); + return result; +} + +/** + * ldm_validate_vmdb - Read the VMDB and validate it + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @bdev, of the database + * @ldb: Cache of the database structures + * + * Find the vmdb of the LDM Database stored on @bdev and return the parsed + * information in @ldb. + * + * Return: 'true' @ldb contains validated VMDB info + * 'false' @ldb contents are undefined + */ +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + Sector sect; + u8 *data; + bool result = false; + struct vmdb *vm; + struct tocblock *toc; + + BUG_ON (!state || !ldb); + + vm = &ldb->vm; + toc = &ldb->toc; + + data = read_part_sector(state, base + OFF_VMDB, &sect); + if (!data) { + ldm_crit ("Disk read failed."); + return false; + } + + if (!ldm_parse_vmdb (data, vm)) + goto out; /* Already logged */ + + /* Are there uncommitted transactions? */ + if (get_unaligned_be16(data + 0x10) != 0x01) { + ldm_crit ("Database is not in a consistent state. Aborting."); + goto out; + } + + if (vm->vblk_offset != 512) + ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); + + /* + * The last_vblk_seq can be before the end of the vmdb, just make sure + * it is not out of bounds. + */ + if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { + ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " + "Database is corrupt. Aborting."); + goto out; + } + + result = true; +out: + put_dev_sector (sect); + return result; +} + + +/** + * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk + * @state: Partition check state including device holding the LDM Database + * + * This function provides a weak test to decide whether the device is a dynamic + * disk or not. It looks for an MS-DOS-style partition table containing at + * least one partition of type 0x42 (formerly SFS, now used by Windows for + * dynamic disks). + * + * N.B. The only possible error can come from the read_part_sector and that is + * only likely to happen if the underlying device is strange. If that IS + * the case we should return zero to let someone else try.
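
The weak test described above only has to find an MS-DOS partition table whose four primary entries include one of type 0x42 (LDM_PARTITION). A stand-alone sketch of the same check against a 512-byte sector image; function and constant names here are made up, and the real code goes through struct partition and SYS_IND() instead of raw byte offsets:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LDM_PARTITION_TYPE 0x42         /* type byte used by Windows dynamic disks */

/* Return true if the MBR in 'sector' carries at least one type-0x42 entry. */
static bool looks_like_dynamic_disk(const uint8_t sector[512])
{
        /* Partition table signature: 0x55 at 0x1FE, 0xAA at 0x1FF. */
        if (sector[0x1FE] != 0x55 || sector[0x1FF] != 0xAA)
                return false;

        /* Four 16-byte entries start at 0x1BE; the type byte sits at offset 4. */
        for (int i = 0; i < 4; i++) {
                const uint8_t *entry = sector + 0x1BE + 16 * i;

                if (entry[4] == LDM_PARTITION_TYPE)
                        return true;
        }
        return false;
}

int main(void)
{
        uint8_t mbr[512] = { 0 };

        mbr[0x1FE] = 0x55;              /* valid signature */
        mbr[0x1FF] = 0xAA;
        mbr[0x1BE + 4] = 0x42;          /* first entry carries the dynamic-disk type */
        printf("%d\n", looks_like_dynamic_disk(mbr));   /* prints 1 */
        return 0;
}
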
+ * + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred + */ +static bool ldm_validate_partition_table(struct parsed_partitions *state) +{ + Sector sect; + u8 *data; + struct partition *p; + int i; + bool result = false; + + BUG_ON(!state); + + data = read_part_sector(state, 0, §); + if (!data) { + ldm_info ("Disk read failed."); + return false; + } + + if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) + goto out; + + p = (struct partition*)(data + 0x01BE); + for (i = 0; i < 4; i++, p++) + if (SYS_IND (p) == LDM_PARTITION) { + result = true; + break; + } + + if (result) + ldm_debug ("Found W2K dynamic disk partition type."); + +out: + put_dev_sector (sect); + return result; +} + +/** + * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id + * @ldb: Cache of the database structures + * + * The LDM Database contains a list of all partitions on all dynamic disks. + * The primary PRIVHEAD, at the beginning of the physical disk, tells us + * the GUID of this disk. This function searches for the GUID in a linked + * list of vblk's. + * + * Return: Pointer, A matching vblk was found + * NULL, No match, or an error + */ +static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) +{ + struct list_head *item; + + BUG_ON (!ldb); + + list_for_each (item, &ldb->v_disk) { + struct vblk *v = list_entry (item, struct vblk, list); + if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE)) + return v; + } + + return NULL; +} + +/** + * ldm_create_data_partitions - Create data partitions for this device + * @pp: List of the partitions parsed so far + * @ldb: Cache of the database structures + * + * The database contains ALL the partitions for ALL disk groups, so we need to + * filter out this specific disk. Using the disk's object id, we can find all + * the partitions in the database that belong to this disk. + * + * Add each partition in our database, to the parsed_partitions structure. + * + * N.B. This function creates the partitions in the order it finds partition + * objects in the linked list. + * + * Return: 'true' Partition created + * 'false' Error, probably a range checking problem + */ +static bool ldm_create_data_partitions (struct parsed_partitions *pp, + const struct ldmdb *ldb) +{ + struct list_head *item; + struct vblk *vb; + struct vblk *disk; + struct vblk_part *part; + int part_num = 1; + + BUG_ON (!pp || !ldb); + + disk = ldm_get_disk_objid (ldb); + if (!disk) { + ldm_crit ("Can't find the ID of this disk in the database."); + return false; + } + + strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); + + /* Create the data partitions */ + list_for_each (item, &ldb->v_part) { + vb = list_entry (item, struct vblk, list); + part = &vb->vblk.part; + + if (part->disk_id != disk->obj_id) + continue; + + put_partition (pp, part_num, ldb->ph.logical_disk_start + + part->start, part->size); + part_num++; + } + + strlcat(pp->pp_buf, "\n", PAGE_SIZE); + return true; +} + + +/** + * ldm_relative - Calculate the next relative offset + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @base: Size of the previous fixed width fields + * @offset: Cumulative size of the previous variable-width fields + * + * Because many of the VBLK fields are variable-width, it's necessary + * to calculate each offset based on the previous one and the length + * of the field it pointed to. 
+ * + * Return: -1 Error, the calculated offset exceeded the size of the buffer + * n OK, a range-checked offset into buffer + */ +static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) +{ + + base += offset; + if (!buffer || offset < 0 || base > buflen) { + if (!buffer) + ldm_error("!buffer"); + if (offset < 0) + ldm_error("offset (%d) < 0", offset); + if (base > buflen) + ldm_error("base (%d) > buflen (%d)", base, buflen); + return -1; + } + if (base + buffer[base] >= buflen) { + ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, + buffer[base], buflen); + return -1; + } + return buffer[base] + offset + 1; +} + +/** + * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order + * @block: Pointer to the variable-width number to convert + * + * Large numbers in the LDM Database are often stored in a packed format. Each + * number is prefixed by a one byte width marker. All numbers in the database + * are stored in big-endian byte order. This function reads one of these + * numbers and returns the result + * + * N.B. This function DOES NOT perform any range checking, though the most + * it will read is eight bytes. + * + * Return: n A number + * 0 Zero, or an error occurred + */ +static u64 ldm_get_vnum (const u8 *block) +{ + u64 tmp = 0; + u8 length; + + BUG_ON (!block); + + length = *block++; + + if (length && length <= 8) + while (length--) + tmp = (tmp << 8) | *block++; + else + ldm_error ("Illegal length %d.", length); + + return tmp; +} + +/** + * ldm_get_vstr - Read a length-prefixed string into a buffer + * @block: Pointer to the length marker + * @buffer: Location to copy string to + * @buflen: Size of the output buffer + * + * Many of the strings in the LDM Database are not NULL terminated. Instead + * they are prefixed by a one byte length marker. This function copies one of + * these strings into a buffer. + * + * N.B. This function DOES NOT perform any range checking on the input. + * If the buffer is too small, the output will be truncated. + * + * Return: 0, Error and @buffer contents are undefined + * n, String length in characters (excluding NULL) + * buflen-1, String was truncated. + */ +static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) +{ + int length; + + BUG_ON (!block || !buffer); + + length = block[0]; + if (length >= buflen) { + ldm_error ("Truncating string %d -> %d.", length, buflen); + length = buflen - 1; + } + memcpy (buffer, block + 1, length); + buffer[length] = 0; + return length; +} + + +/** + * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Component object (version 3) into a vblk structure. 
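
ldm_get_vnum() and ldm_get_vstr() above both decode the same on-disk convention: a one-byte length marker followed by that many bytes, with numbers stored big-endian and strings left unterminated. A small userspace sketch of the two decoders, using hypothetical names and standard C types rather than the kernel's u8/u64:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode a length-prefixed big-endian number (at most 8 value bytes). */
static uint64_t get_vnum(const uint8_t *block)
{
        uint64_t val = 0;
        uint8_t len = *block++;

        if (len == 0 || len > 8)
                return 0;               /* illegal length */
        while (len--)
                val = (val << 8) | *block++;
        return val;
}

/* Copy a length-prefixed string into buf, truncating and NUL-terminating it. */
static int get_vstr(const uint8_t *block, char *buf, int buflen)
{
        int len = block[0];

        if (len >= buflen)
                len = buflen - 1;       /* truncate to fit */
        memcpy(buf, block + 1, len);
        buf[len] = '\0';
        return len;
}

int main(void)
{
        /* 0x03 = three value bytes follow: 0x01 0x02 0x03 -> 0x010203 */
        const uint8_t num[] = { 0x03, 0x01, 0x02, 0x03 };
        const uint8_t str[] = { 0x05, 'h', 'e', 'l', 'l', 'o' };
        char name[16];

        get_vstr(str, name, sizeof(name));
        printf("%llx %s\n", (unsigned long long)get_vnum(num), name);
        return 0;
}

Run as-is it prints "10203 hello": the three value bytes assembled most-significant first, and the five string bytes copied out with a terminating NUL added.
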
+ * + * Return: 'true' @vb contains a Component VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; + struct vblk_comp *comp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); + r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); + r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); + + if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { + r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); + r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); + len = r_cols; + } else { + r_stripe = 0; + r_cols = 0; + len = r_parent; + } + if (len < 0) + return false; + + len += VBLK_SIZE_CMP3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + comp = &vb->vblk.comp; + ldm_get_vstr (buffer + 0x18 + r_name, comp->state, + sizeof (comp->state)); + comp->type = buffer[0x18 + r_vstate]; + comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); + comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); + comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; + + return true; +} + +/** + * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + + if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); + r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_diskid; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, + sizeof (dgrp->disk_id)); + return true; +} + +/** + * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 4) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + char buf[64]; + int r_objid, r_name, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + + if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); + r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_name; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + + ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); + return true; +} + +/** + * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_altname, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); + len = r_altname; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, + sizeof (disk->alt_name)); + if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) + return false; + + return true; +} + +/** + * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 4) into a vblk structure. + * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + len = r_name; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE); + return true; +} + +/** + * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Partition object (version 3) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Partition VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; + struct vblk_part *part; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x34, r_name); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + r_parent = ldm_relative(buffer, buflen, 0x34, r_size); + if (r_parent < 0) { + ldm_error("r_parent %d < 0", r_parent); + return false; + } + r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); + if (r_diskid < 0) { + ldm_error("r_diskid %d < 0", r_diskid); + return false; + } + if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { + r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); + if (r_index < 0) { + ldm_error("r_index %d < 0", r_index); + return false; + } + len = r_index; + } else { + r_index = 0; + len = r_diskid; + } + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_PRT3; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + part = &vb->vblk.part; + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); + part->size = ldm_get_vnum(buffer + 0x34 + r_name); + part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); + part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); + if (vb->flags & VBLK_FLAG_PART_INDEX) + part->partnum = buffer[0x35 + r_diskid]; + else + part->partnum = 0; + return true; +} + +/** + * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Volume object (version 5) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Volume VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; + int r_id1, r_id2, r_size2, r_drive, len; + struct vblk_volu *volu; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); + if (r_vtype < 0) { + ldm_error("r_vtype %d < 0", r_vtype); + return false; + } + r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); + if (r_disable_drive_letter < 0) { + ldm_error("r_disable_drive_letter %d < 0", + r_disable_drive_letter); + return false; + } + r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); + if (r_child < 0) { + ldm_error("r_child %d < 0", r_child); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x3D, r_child); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { + r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); + if (r_id1 < 0) { + ldm_error("r_id1 %d < 0", r_id1); + return false; + } + } else + r_id1 = r_size; + if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { + r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); + if (r_id2 < 0) { + ldm_error("r_id2 %d < 0", r_id2); + return false; + } + } else + r_id2 = r_id1; + if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { + r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); + if (r_size2 < 0) { + ldm_error("r_size2 %d < 0", r_size2); + return false; + } + } else + r_size2 = r_id2; + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); + if (r_drive < 0) { + ldm_error("r_drive %d < 0", r_drive); + return false; + } + } else + r_drive = r_size2; + len = r_drive; + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_VOL5; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + volu = &vb->vblk.volu; + ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, + sizeof(volu->volume_type)); + memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, + sizeof(volu->volume_state)); + volu->size = ldm_get_vnum(buffer + 0x3D + r_child); + volu->partition_type = buffer[0x41 + r_size]; + memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, + sizeof(volu->drive_hint)); + } + return true; +} + +/** + * ldm_parse_vblk - Read a raw VBLK object into a vblk structure + * @buf: Block of data being worked on + * @len: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK object into a vblk structure. This function just reads the + * information common to all VBLK types, then delegates the rest of the work to + * helper functions: ldm_parse_*. 
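
Every record handed to ldm_parse_vblk() starts from the same fixed header: the flag bits at offset 0x12, the type byte at 0x13, a 32-bit big-endian length at 0x14 and the object id as a packed number at 0x18. A sketch of lifting those common fields out of a raw record before dispatching on the type byte; the flat struct and helper names are hypothetical, and only one of the type constants from ldm.h is repeated here:

#include <stdint.h>
#include <stdio.h>

#define VBLK_PRT3 0x33          /* Partition, version 3 (see ldm.h for the others) */

struct vblk_header {            /* hypothetical flat view of the common fields */
        uint8_t  flags;         /* byte 0x12: which optional fields are present */
        uint8_t  type;          /* byte 0x13: record type, e.g. VBLK_PRT3 */
        uint32_t length;        /* big-endian 32-bit value at 0x14 */
};

static uint32_t be32(const uint8_t *p)
{
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static struct vblk_header read_vblk_header(const uint8_t *buf)
{
        struct vblk_header h = {
                .flags  = buf[0x12],
                .type   = buf[0x13],
                .length = be32(buf + 0x14),
        };
        return h;
}

int main(void)
{
        uint8_t rec[0x20] = { 0 };

        rec[0x13] = VBLK_PRT3;          /* pretend this is a partition record */
        rec[0x17] = 0x2C;               /* big-endian length 44 in 0x14..0x17 */

        struct vblk_header h = read_vblk_header(rec);
        printf("type 0x%02x, len %u\n", h.type, (unsigned)h.length);
        return 0;
}

A caller would then switch on the type byte, exactly as ldm_parse_vblk() does, and hand the rest of the record to the matching ldm_parse_* helper.
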
+ * + * Return: 'true' @vb contains a VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) +{ + bool result = false; + int r_objid; + + BUG_ON (!buf || !vb); + + r_objid = ldm_relative (buf, len, 0x18, 0); + if (r_objid < 0) { + ldm_error ("VBLK header is corrupt."); + return false; + } + + vb->flags = buf[0x12]; + vb->type = buf[0x13]; + vb->obj_id = ldm_get_vnum (buf + 0x18); + ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); + + switch (vb->type) { + case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; + case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; + case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; + case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; + case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; + case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; + case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; + } + + if (result) + ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", + (unsigned long long) vb->obj_id, vb->type); + else + ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", + (unsigned long long) vb->obj_id, vb->type); + + return result; +} + + +/** + * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database + * @data: Raw VBLK to add to the database + * @len: Size of the raw VBLK + * @ldb: Cache of the database structures + * + * The VBLKs are sorted into categories. Partitions are also sorted by offset. + * + * N.B. This function does not check the validity of the VBLKs. + * + * Return: 'true' The VBLK was added + * 'false' An error occurred + */ +static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) +{ + struct vblk *vb; + struct list_head *item; + + BUG_ON (!data || !ldb); + + vb = kmalloc (sizeof (*vb), GFP_KERNEL); + if (!vb) { + ldm_crit ("Out of memory."); + return false; + } + + if (!ldm_parse_vblk (data, len, vb)) { + kfree(vb); + return false; /* Already logged */ + } + + /* Put vblk into the correct list. */ + switch (vb->type) { + case VBLK_DGR3: + case VBLK_DGR4: + list_add (&vb->list, &ldb->v_dgrp); + break; + case VBLK_DSK3: + case VBLK_DSK4: + list_add (&vb->list, &ldb->v_disk); + break; + case VBLK_VOL5: + list_add (&vb->list, &ldb->v_volu); + break; + case VBLK_CMP3: + list_add (&vb->list, &ldb->v_comp); + break; + case VBLK_PRT3: + /* Sort by the partition's start sector. */ + list_for_each (item, &ldb->v_part) { + struct vblk *v = list_entry (item, struct vblk, list); + if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && + (v->vblk.part.start > vb->vblk.part.start)) { + list_add_tail (&vb->list, &v->list); + return true; + } + } + list_add_tail (&vb->list, &ldb->v_part); + break; + } + return true; +} + +/** + * ldm_frag_add - Add a VBLK fragment to a list + * @data: Raw fragment to be added to the list + * @size: Size of the raw fragment + * @frags: Linked list of VBLK fragments + * + * Fragmented VBLKs may not be consecutive in the database, so they are placed + * in a list so they can be pieced together later. 
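
The bookkeeping behind this list is a per-group bitmap: it starts out as 0xFF shifted left by the number of expected parts (truncated to a byte), every record that arrives sets its own bit, and ldm_frag_commit() later treats anything other than 0xFF as an incomplete group. A small sketch of just that map handling, with hypothetical names; the real code additionally clears the top bit to poison a group when a duplicate part shows up:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct frag_map {               /* hypothetical; mirrors the 'map' byte in struct frag */
        uint8_t num;            /* expected number of parts (1..4) */
        uint8_t map;            /* bit n set means part n has arrived */
};

static void frag_map_init(struct frag_map *f, uint8_t num)
{
        f->num = num;
        f->map = (uint8_t)(0xFF << num);        /* pre-set the bits that never arrive */
}

/* Record part 'rec'; reject duplicates. */
static bool frag_map_add(struct frag_map *f, uint8_t rec)
{
        if (rec >= f->num || (f->map & (1u << rec)))
                return false;
        f->map |= (uint8_t)(1u << rec);
        return true;
}

static bool frag_map_complete(const struct frag_map *f)
{
        return f->map == 0xFF;
}

int main(void)
{
        struct frag_map f;

        frag_map_init(&f, 3);                   /* expect parts 0, 1 and 2 */
        frag_map_add(&f, 0);
        frag_map_add(&f, 2);
        printf("complete: %d\n", frag_map_complete(&f));        /* 0: part 1 missing */
        frag_map_add(&f, 1);
        printf("complete: %d\n", frag_map_complete(&f));        /* 1 */
        return 0;
}
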
+ * + * Return: 'true' Success, the VBLK was added to the list + * 'false' Error, a problem occurred + */ +static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) +{ + struct frag *f; + struct list_head *item; + int rec, num, group; + + BUG_ON (!data || !frags); + + if (size < 2 * VBLK_SIZE_HEAD) { + ldm_error("Value of size is too small."); + return false; + } + + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); + if ((num < 1) || (num > 4)) { + ldm_error ("A VBLK claims to have %d parts.", num); + return false; + } + if (rec >= num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); + return false; + } + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + if (f->group == group) + goto found; + } + + f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); + if (!f) { + ldm_crit ("Out of memory."); + return false; + } + + f->group = group; + f->num = num; + f->rec = rec; + f->map = 0xFF << num; + + list_add_tail (&f->list, frags); +found: + if (rec >= f->num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); + return false; + } + + if (f->map & (1 << rec)) { + ldm_error ("Duplicate VBLK, part %d.", rec); + f->map &= 0x7F; /* Mark the group as broken */ + return false; + } + + f->map |= (1 << rec); + + data += VBLK_SIZE_HEAD; + size -= VBLK_SIZE_HEAD; + + memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); + + return true; +} + +/** + * ldm_frag_free - Free a linked list of VBLK fragments + * @list: Linked list of fragments + * + * Free a linked list of VBLK fragments + * + * Return: none + */ +static void ldm_frag_free (struct list_head *list) +{ + struct list_head *item, *tmp; + + BUG_ON (!list); + + list_for_each_safe (item, tmp, list) + kfree (list_entry (item, struct frag, list)); +} + +/** + * ldm_frag_commit - Validate fragmented VBLKs and add them to the database + * @frags: Linked list of VBLK fragments + * @ldb: Cache of the database structures + * + * Now that all the fragmented VBLKs have been collected, they must be added to + * the database for later use. + * + * Return: 'true' All the fragments were added successfully + * 'false' One or more of the fragments were invalid + */ +static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) +{ + struct frag *f; + struct list_head *item; + + BUG_ON (!frags || !ldb); + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + + if (f->map != 0xFF) { + ldm_error ("VBLK group %d is incomplete (0x%02x).", + f->group, f->map); + return false; + } + + if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) + return false; /* Already logged */ + } + return true; +} + +/** + * ldm_get_vblks - Read the on-disk database of VBLKs into memory + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * To use the information from the VBLKs, they need to be read from the disk, + * unpacked and validated. We cache them in @ldb according to their type.
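
The read loop that follows derives everything from three VMDB fields: how many VBLKs fit in one 512-byte sector (perbuf), which sector the first VBLK lives in (vblk_offset converted to sectors) and how many sectors cover last_vblk_seq records. A short worked example with made-up but plausible values, kept separate from the driver:

#include <stdio.h>

int main(void)
{
        /* Example values only: a VMDB advertising 128-byte VBLKs that start
         * 512 bytes into the database and run up to sequence number 2048. */
        unsigned vblk_size = 128, vblk_offset = 512, last_vblk_seq = 2048;

        unsigned perbuf = 512 / vblk_size;                      /* VBLKs per sector: 4 */
        unsigned skip   = vblk_offset >> 9;                     /* first sector: 1 */
        unsigned finish = (vblk_size * last_vblk_seq) >> 9;     /* one past the last: 512 */

        printf("%u per sector, sectors %u..%u\n", perbuf, skip, finish - 1);
        return 0;
}
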
+ * + * Return: 'true' All the VBLKs were read successfully + * 'false' An error occurred + */ +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) +{ + int size, perbuf, skip, finish, s, v, recs; + u8 *data = NULL; + Sector sect; + bool result = false; + LIST_HEAD (frags); + + BUG_ON(!state || !ldb); + + size = ldb->vm.vblk_size; + perbuf = 512 / size; + skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ + finish = (size * ldb->vm.last_vblk_seq) >> 9; + + for (s = skip; s < finish; s++) { /* For each sector */ + data = read_part_sector(state, base + OFF_VMDB + s, §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + + for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ + if (MAGIC_VBLK != get_unaligned_be32(data)) { + ldm_error ("Expected to find a VBLK."); + goto out; + } + + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ + if (recs == 1) { + if (!ldm_ldmdb_add (data, size, ldb)) + goto out; /* Already logged */ + } else if (recs > 1) { + if (!ldm_frag_add (data, size, &frags)) + goto out; /* Already logged */ + } + /* else Record is not in use, ignore it. */ + } + put_dev_sector (sect); + data = NULL; + } + + result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ +out: + if (data) + put_dev_sector (sect); + ldm_frag_free (&frags); + + return result; +} + +/** + * ldm_free_vblks - Free a linked list of vblk's + * @lh: Head of a linked list of struct vblk + * + * Free a list of vblk's and free the memory used to maintain the list. + * + * Return: none + */ +static void ldm_free_vblks (struct list_head *lh) +{ + struct list_head *item, *tmp; + + BUG_ON (!lh); + + list_for_each_safe (item, tmp, lh) + kfree (list_entry (item, struct vblk, list)); +} + + +/** + * ldm_partition - Find out whether a device is a dynamic disk and handle it + * @state: Partition check state including device holding the LDM Database + * + * This determines whether the device @bdev is a dynamic disk and if so creates + * the partitions necessary in the gendisk structure pointed to by @hd. + * + * We create a dummy device 1, which contains the LDM database, and then create + * each partition described by the LDM database in sequence as devices 2+. For + * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, + * and so on: the actual data containing partitions. + * + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk + * -1 An error occurred before enough information had been read + * Or @state->bdev is a dynamic disk, but it may be corrupted + */ +int ldm_partition(struct parsed_partitions *state) +{ + struct ldmdb *ldb; + unsigned long base; + int result = -1; + + BUG_ON(!state); + + /* Look for signs of a Dynamic Disk */ + if (!ldm_validate_partition_table(state)) + return 0; + + ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); + if (!ldb) { + ldm_crit ("Out of memory."); + goto out; + } + + /* Parse and check privheads. */ + if (!ldm_validate_privheads(state, &ldb->ph)) + goto out; /* Already logged */ + + /* All further references are relative to base (database start). */ + base = ldb->ph.config_start; + + /* Parse and check tocs and vmdb. 
*/ + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) + goto out; /* Already logged */ + + /* Initialize vblk lists in ldmdb struct */ + INIT_LIST_HEAD (&ldb->v_dgrp); + INIT_LIST_HEAD (&ldb->v_disk); + INIT_LIST_HEAD (&ldb->v_volu); + INIT_LIST_HEAD (&ldb->v_comp); + INIT_LIST_HEAD (&ldb->v_part); + + if (!ldm_get_vblks(state, base, ldb)) { + ldm_crit ("Failed to read the VBLKs from the database."); + goto cleanup; + } + + /* Finally, create the data partition devices. */ + if (ldm_create_data_partitions(state, ldb)) { + ldm_debug ("Parsed LDM database successfully."); + result = 1; + } + /* else Already logged */ + +cleanup: + ldm_free_vblks (&ldb->v_dgrp); + ldm_free_vblks (&ldb->v_disk); + ldm_free_vblks (&ldb->v_volu); + ldm_free_vblks (&ldb->v_comp); + ldm_free_vblks (&ldb->v_part); +out: + kfree (ldb); + return result; +} diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h new file mode 100644 index 00000000..374242c0 --- /dev/null +++ b/fs/partitions/ldm.h @@ -0,0 +1,215 @@ +/** + * ldm - Part of the Linux-NTFS project. + * + * Copyright (C) 2001,2002 Richard Russon + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS source + * in the file COPYING); if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _FS_PT_LDM_H_ +#define _FS_PT_LDM_H_ + +#include +#include +#include +#include +#include +#include + +struct parsed_partitions; + +/* Magic numbers in CPU format. */ +#define MAGIC_VMDB 0x564D4442 /* VMDB */ +#define MAGIC_VBLK 0x56424C4B /* VBLK */ +#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ +#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ + +/* The defined vblk types. 
*/ +#define VBLK_VOL5 0x51 /* Volume, version 5 */ +#define VBLK_CMP3 0x32 /* Component, version 3 */ +#define VBLK_PRT3 0x33 /* Partition, version 3 */ +#define VBLK_DSK3 0x34 /* Disk, version 3 */ +#define VBLK_DSK4 0x44 /* Disk, version 4 */ +#define VBLK_DGR3 0x35 /* Disk Group, version 3 */ +#define VBLK_DGR4 0x45 /* Disk Group, version 4 */ + +/* vblk flags indicating extra information will be present */ +#define VBLK_FLAG_COMP_STRIPE 0x10 +#define VBLK_FLAG_PART_INDEX 0x08 +#define VBLK_FLAG_DGR3_IDS 0x08 +#define VBLK_FLAG_DGR4_IDS 0x08 +#define VBLK_FLAG_VOLU_ID1 0x08 +#define VBLK_FLAG_VOLU_ID2 0x20 +#define VBLK_FLAG_VOLU_SIZE 0x80 +#define VBLK_FLAG_VOLU_DRIVE 0x02 + +/* size of a vblk's static parts */ +#define VBLK_SIZE_HEAD 16 +#define VBLK_SIZE_CMP3 22 /* Name and version */ +#define VBLK_SIZE_DGR3 12 +#define VBLK_SIZE_DGR4 44 +#define VBLK_SIZE_DSK3 12 +#define VBLK_SIZE_DSK4 45 +#define VBLK_SIZE_PRT3 28 +#define VBLK_SIZE_VOL5 58 + +/* component types */ +#define COMP_STRIPE 0x01 /* Stripe-set */ +#define COMP_BASIC 0x02 /* Basic disk */ +#define COMP_RAID 0x03 /* Raid-set */ + +/* Other constants. */ +#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */ + +#define OFF_PRIV1 6 /* Offset of the first privhead + relative to the start of the + device in sectors */ + +/* Offsets to structures within the LDM Database in sectors. */ +#define OFF_PRIV2 1856 /* Backup private headers. */ +#define OFF_PRIV3 2047 + +#define OFF_TOCB1 1 /* Tables of contents. */ +#define OFF_TOCB2 2 +#define OFF_TOCB3 2045 +#define OFF_TOCB4 2046 + +#define OFF_VMDB 17 /* List of partitions. */ + +#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ + +#define TOC_BITMAP1 "config" /* Names of the two defined */ +#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ + +/* Borrowed from msdos.c */ +#define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) + +struct frag { /* VBLK Fragment handling */ + struct list_head list; + u32 group; + u8 num; /* Total number of records */ + u8 rec; /* This is record number n */ + u8 map; /* Which portions are in use */ + u8 data[0]; +}; + +/* In memory LDM database structures. */ + +#define GUID_SIZE 16 + +struct privhead { /* Offsets and sizes are in sectors. */ + u16 ver_major; + u16 ver_minor; + u64 logical_disk_start; + u64 logical_disk_size; + u64 config_start; + u64 config_size; + u8 disk_id[GUID_SIZE]; +}; + +struct tocblock { /* We have exactly two bitmaps. 
*/ + u8 bitmap1_name[16]; + u64 bitmap1_start; + u64 bitmap1_size; + u8 bitmap2_name[16]; + u64 bitmap2_start; + u64 bitmap2_size; +}; + +struct vmdb { /* VMDB: The database header */ + u16 ver_major; + u16 ver_minor; + u32 vblk_size; + u32 vblk_offset; + u32 last_vblk_seq; +}; + +struct vblk_comp { /* VBLK Component */ + u8 state[16]; + u64 parent_id; + u8 type; + u8 children; + u16 chunksize; +}; + +struct vblk_dgrp { /* VBLK Disk Group */ + u8 disk_id[64]; +}; + +struct vblk_disk { /* VBLK Disk */ + u8 disk_id[GUID_SIZE]; + u8 alt_name[128]; +}; + +struct vblk_part { /* VBLK Partition */ + u64 start; + u64 size; /* start, size and vol_off in sectors */ + u64 volume_offset; + u64 parent_id; + u64 disk_id; + u8 partnum; +}; + +struct vblk_volu { /* VBLK Volume */ + u8 volume_type[16]; + u8 volume_state[16]; + u8 guid[16]; + u8 drive_hint[4]; + u64 size; + u8 partition_type; +}; + +struct vblk_head { /* VBLK standard header */ + u32 group; + u16 rec; + u16 nrec; +}; + +struct vblk { /* Generalised VBLK */ + u8 name[64]; + u64 obj_id; + u32 sequence; + u8 flags; + u8 type; + union { + struct vblk_comp comp; + struct vblk_dgrp dgrp; + struct vblk_disk disk; + struct vblk_part part; + struct vblk_volu volu; + } vblk; + struct list_head list; +}; + +struct ldmdb { /* Cache of the database */ + struct privhead ph; + struct tocblock toc; + struct vmdb vm; + struct list_head v_dgrp; + struct list_head v_disk; + struct list_head v_volu; + struct list_head v_comp; + struct list_head v_part; +}; + +int ldm_partition(struct parsed_partitions *state); + +#endif /* _FS_PT_LDM_H_ */ + diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c new file mode 100644 index 00000000..11f688bd --- /dev/null +++ b/fs/partitions/mac.c @@ -0,0 +1,134 @@ +/* + * fs/partitions/mac.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include "check.h" +#include "mac.h" + +#ifdef CONFIG_PPC_PMAC +#include +extern void note_bootable_part(dev_t dev, int part, int goodness); +#endif + +/* + * Code to understand MacOS partition tables. + */ + +static inline void mac_fix_string(char *stg, int len) +{ + int i; + + for (i = len - 1; i >= 0 && stg[i] == ' '; i--) + stg[i] = 0; +} + +int mac_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + int slot, blocks_in_map; + unsigned secsize; +#ifdef CONFIG_PPC_PMAC + int found_root = 0; + int found_root_goodness = 0; +#endif + struct mac_partition *part; + struct mac_driver_desc *md; + + /* Get 0th block and look at the first partition map entry. 
*/ + md = read_part_sector(state, 0, §); + if (!md) + return -1; + if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { + put_dev_sector(sect); + return 0; + } + secsize = be16_to_cpu(md->block_size); + put_dev_sector(sect); + data = read_part_sector(state, secsize/512, §); + if (!data) + return -1; + part = (struct mac_partition *) (data + secsize%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { + put_dev_sector(sect); + return 0; /* not a MacOS disk */ + } + blocks_in_map = be32_to_cpu(part->map_count); + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; + put_dev_sector(sect); + data = read_part_sector(state, pos/512, §); + if (!data) + return -1; + part = (struct mac_partition *) (data + pos%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) + break; + put_partition(state, slot, + be32_to_cpu(part->start_block) * (secsize/512), + be32_to_cpu(part->block_count) * (secsize/512)); + + if (!strnicmp(part->type, "Linux_RAID", 10)) + state->parts[slot].flags = ADDPART_FLAG_RAID; +#ifdef CONFIG_PPC_PMAC + /* + * If this is the first bootable partition, tell the + * setup code, in case it wants to make this the root. + */ + if (machine_is(powermac)) { + int goodness = 0; + + mac_fix_string(part->processor, 16); + mac_fix_string(part->name, 32); + mac_fix_string(part->type, 32); + + if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) + && strcasecmp(part->processor, "powerpc") == 0) + goodness++; + + if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 + || (strnicmp(part->type, "Linux", 5) == 0 + && strcasecmp(part->type, "Linux_swap") != 0)) { + int i, l; + + goodness++; + l = strlen(part->name); + if (strcmp(part->name, "/") == 0) + goodness++; + for (i = 0; i <= l - 4; ++i) { + if (strnicmp(part->name + i, "root", + 4) == 0) { + goodness += 2; + break; + } + } + if (strnicmp(part->name, "swap", 4) == 0) + goodness--; + } + + if (goodness > found_root_goodness) { + found_root = slot; + found_root_goodness = goodness; + } + } +#endif /* CONFIG_PPC_PMAC */ + } +#ifdef CONFIG_PPC_PMAC + if (found_root_goodness) + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); +#endif + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h new file mode 100644 index 00000000..3c7d9843 --- /dev/null +++ b/fs/partitions/mac.h @@ -0,0 +1,44 @@ +/* + * fs/partitions/mac.h + */ + +#define MAC_PARTITION_MAGIC 0x504d + +/* type field value for A/UX or other Unix partitions */ +#define APPLE_AUX_TYPE "Apple_UNIX_SVR2" + +struct mac_partition { + __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ + __be16 res1; + __be32 map_count; /* # blocks in partition map */ + __be32 start_block; /* absolute starting block # of partition */ + __be32 block_count; /* number of blocks in partition */ + char name[32]; /* partition name */ + char type[32]; /* string type description */ + __be32 data_start; /* rel block # of first data block */ + __be32 data_count; /* number of data blocks */ + __be32 status; /* partition status bits */ + __be32 boot_start; + __be32 boot_size; + __be32 boot_load; + __be32 boot_load2; + __be32 boot_entry; + __be32 boot_entry2; + __be32 boot_cksum; + char processor[16]; /* identifies ISA of boot */ + /* there is more stuff after this that we don't need */ +}; + +#define 
MAC_STATUS_BOOTABLE 8 /* partition is bootable */ + +#define MAC_DRIVER_MAGIC 0x4552 + +/* Driver descriptor structure, in block 0 */ +struct mac_driver_desc { + __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ + __be16 block_size; + __be32 block_count; + /* ... more stuff */ +}; + +int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c new file mode 100644 index 00000000..5f79a667 --- /dev/null +++ b/fs/partitions/msdos.c @@ -0,0 +1,552 @@ +/* + * fs/partitions/msdos.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * + * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug + * in the early extended-partition checks and added DM partitions + * + * Support for DiskManager v6.0x added by Mark Lord, + * with information provided by OnTrack. This now works for linux fdisk + * and LILO, as well as loadlin and bootln. Note that disks other than + * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). + * + * More flexible handling of extended partitions - aeb, 950831 + * + * Check partition table on IDE disks for common CHS translations + * + * Re-organised Feb 1998 Russell King + */ +#include + +#include "check.h" +#include "msdos.h" +#include "efi.h" + +/* + * Many architectures don't like unaligned accesses, while + * the nr_sects and start_sect partition table entries are + * at a 2 (mod 4) address. + */ +#include + +#define SYS_IND(p) get_unaligned(&p->sys_ind) + +static inline sector_t nr_sects(struct partition *p) +{ + return (sector_t)get_unaligned_le32(&p->nr_sects); +} + +static inline sector_t start_sect(struct partition *p) +{ + return (sector_t)get_unaligned_le32(&p->start_sect); +} + +static inline int is_extended_partition(struct partition *p) +{ + return (SYS_IND(p) == DOS_EXTENDED_PARTITION || + SYS_IND(p) == WIN98_EXTENDED_PARTITION || + SYS_IND(p) == LINUX_EXTENDED_PARTITION); +} + +#define MSDOS_LABEL_MAGIC1 0x55 +#define MSDOS_LABEL_MAGIC2 0xAA + +static inline int +msdos_magic_present(unsigned char *p) +{ + return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); +} + +/* Value is EBCDIC 'IBMA' */ +#define AIX_LABEL_MAGIC1 0xC9 +#define AIX_LABEL_MAGIC2 0xC2 +#define AIX_LABEL_MAGIC3 0xD4 +#define AIX_LABEL_MAGIC4 0xC1 +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) +{ + struct partition *pt = (struct partition *) (p + 0x1be); + Sector sect; + unsigned char *d; + int slot, ret = 0; + + if (!(p[0] == AIX_LABEL_MAGIC1 && + p[1] == AIX_LABEL_MAGIC2 && + p[2] == AIX_LABEL_MAGIC3 && + p[3] == AIX_LABEL_MAGIC4)) + return 0; + /* Assume the partition table is valid if Linux partitions exists */ + for (slot = 1; slot <= 4; slot++, pt++) { + if (pt->sys_ind == LINUX_SWAP_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) + return 0; + } + d = read_part_sector(state, 7, §); + if (d) { + if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') + ret = 1; + put_dev_sector(sect); + }; + return ret; +} + +/* + * Create devices for each logical partition in an extended partition. + * The logical partitions form a linked list, with each entry being + * a partition table with two entries. The first entry + * is the real data partition (with a start relative to the partition + * table start). 
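+ * For example, a link sector at absolute sector 2048 whose first entry
+ * records a start of 63 sectors yields a data partition registered at
+ * absolute sector 2048 + 63 = 2111.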
The second is a pointer to the next logical partition + * (with a start relative to the entire extended partition). + * We do not create a Linux partition for the partition tables, but + * only for the actual data partitions. + */ + +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size) +{ + struct partition *p; + Sector sect; + unsigned char *data; + sector_t this_sector, this_size; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + int loopct = 0; /* number of links followed + without finding a data partition */ + int i; + + this_sector = first_sector; + this_size = first_size; + + while (1) { + if (++loopct > 100) + return; + if (state->next == state->limit) + return; + data = read_part_sector(state, this_sector, §); + if (!data) + return; + + if (!msdos_magic_present(data + 510)) + goto done; + + p = (struct partition *) (data + 0x1be); + + /* + * Usually, the first entry is the real data partition, + * the 2nd entry is the next extended partition, or empty, + * and the 3rd and 4th entries are unused. + * However, DRDOS sometimes has the extended partition as + * the first entry (when the data partition is empty), + * and OS/2 seems to use all four entries. + */ + + /* + * First process the data partition(s) + */ + for (i=0; i<4; i++, p++) { + sector_t offs, size, next; + if (!nr_sects(p) || is_extended_partition(p)) + continue; + + /* Check the 3rd and 4th entries - + these sometimes contain random garbage */ + offs = start_sect(p)*sector_size; + size = nr_sects(p)*sector_size; + next = this_sector + offs; + if (i >= 2) { + if (offs + size > this_size) + continue; + if (next < first_sector) + continue; + if (next + size > first_sector + first_size) + continue; + } + + put_partition(state, state->next, next, size); + if (SYS_IND(p) == LINUX_RAID_PARTITION) + state->parts[state->next].flags = ADDPART_FLAG_RAID; + loopct = 0; + if (++state->next == state->limit) + goto done; + } + /* + * Next, process the (first) extended partition, if present. + * (So far, there seems to be no reason to make + * parse_extended() recursive and allow a tree + * of extended partitions.) + * It should be a link to the next logical partition. + */ + p -= 4; + for (i=0; i<4; i++, p++) + if (nr_sects(p) && is_extended_partition(p)) + break; + if (i == 4) + goto done; /* nothing left to do */ + + this_sector = first_sector + start_sect(p) * sector_size; + this_size = nr_sects(p) * sector_size; + put_dev_sector(sect); + } +done: + put_dev_sector(sect); +} + +/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also + indicates linux swap. Be careful before believing this is Solaris. 
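+ parse_solaris_x86() below therefore re-reads the sector at offset + 1 and
+ accepts the slice table only if v_sanity matches SOLARIS_X86_VTOC_SANE.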
*/ + +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_SOLARIS_X86_PARTITION + Sector sect; + struct solaris_x86_vtoc *v; + int i; + short max_nparts; + + v = read_part_sector(state, offset + 1, &sect); + if (!v) + return; + if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + if (le32_to_cpu(v->v_version) != 1) { + char tmp[64]; + + snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + return; + } + /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ + max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; + for (i=0; i<max_nparts && state->next<state->limit; i++) { + struct solaris_x86_slice *s = &v->v_slice[i]; + char tmp[3 + 10 + 1 + 1]; + + if (s->s_size == 0) + continue; + snprintf(tmp, sizeof(tmp), " [s%d]", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* solaris partitions are relative to current MS-DOS + * one; must add the offset of the current partition */ + put_partition(state, state->next++, + le32_to_cpu(s->s_start)+offset, + le32_to_cpu(s->s_size)); + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +#if defined(CONFIG_BSD_DISKLABEL) +/* + * Create devices for BSD partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) +{ + Sector sect; + struct bsd_disklabel *l; + struct bsd_partition *p; + char tmp[64]; + + l = read_part_sector(state, offset + 1, &sect); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { + put_dev_sector(sect); + return; + } + + snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + if (le16_to_cpu(l->d_npartitions) < max_partitions) + max_partitions = le16_to_cpu(l->d_npartitions); + for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { + sector_t bsd_start, bsd_size; + + if (state->next == state->limit) + break; + if (p->p_fstype == BSD_FS_UNUSED) + continue; + bsd_start = le32_to_cpu(p->p_offset); + bsd_size = le32_to_cpu(p->p_size); + if (offset == bsd_start && size == bsd_size) + /* full parent partition, we have it already */ + continue; + if (offset > bsd_start || offset+size < bsd_start+bsd_size) { + strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); + continue; + } + put_partition(state, state->next++, bsd_start, bsd_size); + } + put_dev_sector(sect); + if (le16_to_cpu(l->d_npartitions) > max_partitions) { + snprintf(tmp, sizeof(tmp), " (ignored %d more)", + le16_to_cpu(l->d_npartitions) - max_partitions); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +} +#endif + +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); +#endif +} + +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
+#endif +} + +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); +#endif +} + +/* + * Create devices for Unixware partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_UNIXWARE_DISKLABEL + Sector sect; + struct unixware_disklabel *l; + struct unixware_slice *p; + + l = read_part_sector(state, offset + 29, §); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || + le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + p = &l->vtoc.v_slice[1]; + /* I omit the 0th slice as it is the same as whole disk. */ + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { + if (state->next == state->limit) + break; + + if (p->s_label != UNIXWARE_FS_UNUSED) + put_partition(state, state->next++, + le32_to_cpu(p->start_sect), + le32_to_cpu(p->nr_sects)); + p++; + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +/* + * Minix 2.0.0/2.0.2 subpartition support. + * Anand Krishnamurthy + * Rajeev V. Pillai + */ +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_MINIX_SUBPARTITION + Sector sect; + unsigned char *data; + struct partition *p; + int i; + + data = read_part_sector(state, offset, §); + if (!data) + return; + + p = (struct partition *)(data + 0x1be); + + /* The first sector of a Minix partition can have either + * a secondary MBR describing its subpartitions, or + * the normal boot sector. 
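+ * A subpartition table reuses the same layout at offset 0x1be, so the
+ * SYS_IND()/start_sect()/nr_sects() helpers above apply to it unchanged.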
*/ + if (msdos_magic_present (data + 510) && + SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ + char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { + if (state->next == state->limit) + break; + /* add each partition in use */ + if (SYS_IND(p) == MINIX_PARTITION) + put_partition(state, state->next++, + start_sect(p), nr_sects(p)); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } + put_dev_sector(sect); +#endif /* CONFIG_MINIX_SUBPARTITION */ +} + +static struct { + unsigned char id; + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); +} subtypes[] = { + {FREEBSD_PARTITION, parse_freebsd}, + {NETBSD_PARTITION, parse_netbsd}, + {OPENBSD_PARTITION, parse_openbsd}, + {MINIX_PARTITION, parse_minix}, + {UNIXWARE_PARTITION, parse_unixware}, + {SOLARIS_X86_PARTITION, parse_solaris_x86}, + {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, + {0, NULL}, +}; + +int msdos_partition(struct parsed_partitions *state) +{ + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + Sector sect; + unsigned char *data; + struct partition *p; + struct fat_boot_sector *fb; + int slot; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + if (!msdos_magic_present(data + 510)) { + put_dev_sector(sect); + return 0; + } + + if (aix_magic_present(state, data)) { + put_dev_sector(sect); + strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); + return 0; + } + + /* + * Now that the 55aa signature is present, this is probably + * either the boot sector of a FAT filesystem or a DOS-type + * partition table. Reject this in case the boot indicator + * is not 0 or 0x80. + */ + p = (struct partition *) (data + 0x1be); + for (slot = 1; slot <= 4; slot++, p++) { + if (p->boot_ind != 0 && p->boot_ind != 0x80) { + /* + * Even without a valid boot inidicator value + * its still possible this is valid FAT filesystem + * without a partition table. + */ + fb = (struct fat_boot_sector *) data; + if (slot == 1 && fb->reserved && fb->fats + && fat_valid_media(fb->media)) { + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; + } else { + put_dev_sector(sect); + return 0; + } + } + } + +#ifdef CONFIG_EFI_PARTITION + p = (struct partition *) (data + 0x1be); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + /* If this is an EFI GPT disk, msdos should ignore it. */ + if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { + put_dev_sector(sect); + return 0; + } + } +#endif + p = (struct partition *) (data + 0x1be); + + /* + * Look for partitions in two passes: + * First find the primary and DOS-type extended partitions. + * On the second pass look inside *BSD, Unixware and Solaris partitions. + */ + + state->next = 5; + for (slot = 1 ; slot <= 4 ; slot++, p++) { + sector_t start = start_sect(p)*sector_size; + sector_t size = nr_sects(p)*sector_size; + if (!size) + continue; + if (is_extended_partition(p)) { + /* + * prevent someone doing mkfs or mkswap on an + * extended partition, but leave room for LILO + * FIXME: this uses one logical sector for > 512b + * sector, although it may not be enough/proper. 
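+ * (with 512-byte logical sectors the placeholder below is 2 sectors;
+ * with a 4096-byte logical sector size, sector_size is 8 and one full
+ * logical sector is reserved)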
+ */ + sector_t n = 2; + n = min(size, max(sector_size, n)); + put_partition(state, slot, start, n); + + strlcat(state->pp_buf, " <", PAGE_SIZE); + parse_extended(state, start, size); + strlcat(state->pp_buf, " >", PAGE_SIZE); + continue; + } + put_partition(state, slot, start, size); + if (SYS_IND(p) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + if (SYS_IND(p) == DM6_PARTITION) + strlcat(state->pp_buf, "[DM]", PAGE_SIZE); + if (SYS_IND(p) == EZD_PARTITION) + strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + /* second pass - output for each on a separate line */ + p = (struct partition *) (0x1be + data); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + unsigned char id = SYS_IND(p); + int n; + + if (!nr_sects(p)) + continue; + + for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) + ; + + if (!subtypes[n].parse) + continue; + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); + } + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h new file mode 100644 index 00000000..38c781c4 --- /dev/null +++ b/fs/partitions/msdos.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/msdos.h + */ + +#define MSDOS_LABEL_MAGIC 0xAA55 + +int msdos_partition(struct parsed_partitions *state); + diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c new file mode 100644 index 00000000..764b86a0 --- /dev/null +++ b/fs/partitions/osf.c @@ -0,0 +1,86 @@ +/* + * fs/partitions/osf.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" +#include "osf.h" + +#define MAX_OSF_PARTITIONS 18 + +int osf_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + unsigned int npartitions; + Sector sect; + unsigned char *data; + struct disklabel { + __le32 d_magic; + __le16 d_type,d_subtype; + u8 d_typename[16]; + u8 d_packname[16]; + __le32 d_secsize; + __le32 d_nsectors; + __le32 d_ntracks; + __le32 d_ncylinders; + __le32 d_secpercyl; + __le32 d_secprtunit; + __le16 d_sparespertrack; + __le16 d_sparespercyl; + __le32 d_acylinders; + __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; + __le32 d_headswitch, d_trkseek, d_flags; + __le32 d_drivedata[5]; + __le32 d_spare[5]; + __le32 d_magic2; + __le16 d_checksum; + __le16 d_npartitions; + __le32 d_bbsize, d_sbsize; + struct d_partition { + __le32 p_size; + __le32 p_offset; + __le32 p_fsize; + u8 p_fstype; + u8 p_frag; + __le16 p_cpg; + } d_partitions[MAX_OSF_PARTITIONS]; + } * label; + struct d_partition * partition; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *) (data+64); + partition = label->d_partitions; + if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { + if (slot == state->limit) + break; + if (le32_to_cpu(partition->p_size)) + put_partition(state, slot, + le32_to_cpu(partition->p_offset), + le32_to_cpu(partition->p_size)); + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h new file mode 100644 index 00000000..20ed2315 --- 
/dev/null +++ b/fs/partitions/osf.h @@ -0,0 +1,7 @@ +/* + * fs/partitions/osf.h + */ + +#define DISKLABELMAGIC (0x82564557UL) + +int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c new file mode 100644 index 00000000..ea8a86dc --- /dev/null +++ b/fs/partitions/sgi.c @@ -0,0 +1,82 @@ +/* + * fs/partitions/sgi.c + * + * Code extracted from drivers/block/genhd.c + */ + +#include "check.h" +#include "sgi.h" + +struct sgi_disklabel { + __be32 magic_mushroom; /* Big fat spliff... */ + __be16 root_part_num; /* Root partition number */ + __be16 swap_part_num; /* Swap partition number */ + s8 boot_file[16]; /* Name of boot file for ARCS */ + u8 _unused0[48]; /* Device parameter useless crapola.. */ + struct sgi_volume { + s8 name[8]; /* Name of volume */ + __be32 block_num; /* Logical block number */ + __be32 num_bytes; /* How big, in bytes */ + } volume[15]; + struct sgi_partition { + __be32 num_blocks; /* Size in logical blocks */ + __be32 first_block; /* First logical block */ + __be32 type; /* Type of this partition */ + } partitions[16]; + __be32 csum; /* Disk label checksum */ + __be32 _unused1; /* Padding */ +}; + +int sgi_partition(struct parsed_partitions *state) +{ + int i, csum; + __be32 magic; + int slot = 1; + unsigned int start, blocks; + __be32 *ui, cs; + Sector sect; + struct sgi_disklabel *label; + struct sgi_partition *p; + char b[BDEVNAME_SIZE]; + + label = read_part_sector(state, 0, §); + if (!label) + return -1; + p = &label->partitions[0]; + magic = label->magic_mushroom; + if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { + /*printk("Dev %s SGI disklabel: bad magic %08x\n", + bdevname(bdev, b), be32_to_cpu(magic));*/ + put_dev_sector(sect); + return 0; + } + ui = ((__be32 *) (label + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) label);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if(csum) { + printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + /* All SGI disk labels have 16 partitions, disks under Linux only + * have 15 minor's. Luckily there are always a few zero length + * partitions which we don't care about so we never overflow the + * current_minor. 
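+ * (the loop below only calls put_partition() for entries with a non-zero
+ * num_blocks, so the empty slots never claim a minor)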
+ */ + for(i = 0; i < 16; i++, p++) { + blocks = be32_to_cpu(p->num_blocks); + start = be32_to_cpu(p->first_block); + if (blocks) { + put_partition(state, slot, start, blocks); + if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h new file mode 100644 index 00000000..b9553ebd --- /dev/null +++ b/fs/partitions/sgi.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/sgi.h + */ + +extern int sgi_partition(struct parsed_partitions *state); + +#define SGI_LABEL_MAGIC 0x0be5a941 + diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c new file mode 100644 index 00000000..b5b6fcfb --- /dev/null +++ b/fs/partitions/sun.c @@ -0,0 +1,122 @@ +/* + * fs/partitions/sun.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" +#include "sun.h" + +int sun_partition(struct parsed_partitions *state) +{ + int i; + __be16 csum; + int slot = 1; + __be16 *ush; + Sector sect; + struct sun_disklabel { + unsigned char info[128]; /* Informative text string */ + struct sun_vtoc { + __be32 version; /* Layout version */ + char volume[8]; /* Volume name */ + __be16 nparts; /* Number of partitions */ + struct sun_info { /* Partition hdrs, sec 2 */ + __be16 id; + __be16 flags; + } infos[8]; + __be16 padding; /* Alignment padding */ + __be32 bootinfo[3]; /* Info needed by mboot */ + __be32 sanity; /* To verify vtoc sanity */ + __be32 reserved[10]; /* Free space */ + __be32 timestamp[8]; /* Partition timestamp */ + } vtoc; + __be32 write_reinstruct; /* sectors to skip, writes */ + __be32 read_reinstruct; /* sectors to skip, reads */ + unsigned char spare[148]; /* Padding */ + __be16 rspeed; /* Disk rotational speed */ + __be16 pcylcount; /* Physical cylinder count */ + __be16 sparecyl; /* extra sects per cylinder */ + __be16 obs1; /* gap1 */ + __be16 obs2; /* gap2 */ + __be16 ilfact; /* Interleave factor */ + __be16 ncyl; /* Data cylinder count */ + __be16 nacyl; /* Alt. cylinder count */ + __be16 ntrks; /* Tracks per cylinder */ + __be16 nsect; /* Sectors per track */ + __be16 obs3; /* bhead - Label head offset */ + __be16 obs4; /* ppart - Physical Partition */ + struct sun_partition { + __be32 start_cylinder; + __be32 num_sectors; + } partitions[8]; + __be16 magic; /* Magic number */ + __be16 csum; /* Label xor'd checksum */ + } * label; + struct sun_partition *p; + unsigned long spc; + char b[BDEVNAME_SIZE]; + int use_vtoc; + int nparts; + + label = read_part_sector(state, 0, §); + if (!label) + return -1; + + p = label->partitions; + if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { +/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", + bdevname(bdev, b), be16_to_cpu(label->magic)); */ + put_dev_sector(sect); + return 0; + } + /* Look at the checksum */ + ush = ((__be16 *) (label+1)) - 1; + for (csum = 0; ush >= ((__be16 *) label);) + csum ^= *ush--; + if (csum) { + printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + + /* Check to see if we can use the VTOC table */ + use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && + (be32_to_cpu(label->vtoc.version) == 1) && + (be16_to_cpu(label->vtoc.nparts) <= 8)); + + /* Use 8 partition entries if not specified in validated VTOC */ + nparts = (use_vtoc) ? 
be16_to_cpu(label->vtoc.nparts) : 8; + + /* + * So that old Linux-Sun partitions continue to work, + * alow the VTOC to be used under the additional condition ... + */ + use_vtoc = use_vtoc || !(label->vtoc.sanity || + label->vtoc.version || label->vtoc.nparts); + spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); + for (i = 0; i < nparts; i++, p++) { + unsigned long st_sector; + unsigned int num_sectors; + + st_sector = be32_to_cpu(p->start_cylinder) * spc; + num_sectors = be32_to_cpu(p->num_sectors); + if (num_sectors) { + put_partition(state, slot, st_sector, num_sectors); + state->parts[slot].flags = 0; + if (use_vtoc) { + if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) + state->parts[slot].flags |= ADDPART_FLAG_RAID; + else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) + state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; + } + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h new file mode 100644 index 00000000..2424baa8 --- /dev/null +++ b/fs/partitions/sun.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/sun.h + */ + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE + +int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c new file mode 100644 index 00000000..9627ccff --- /dev/null +++ b/fs/partitions/sysv68.c @@ -0,0 +1,95 @@ +/* + * fs/partitions/sysv68.c + * + * Copyright (C) 2007 Philippe De Muyter + */ + +#include "check.h" +#include "sysv68.h" + +/* + * Volume ID structure: on first 256-bytes sector of disk + */ + +struct volumeid { + u8 vid_unused[248]; + u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ +}; + +/* + * config block: second 256-bytes sector on disk + */ + +struct dkconfig { + u8 ios_unused0[128]; + __be32 ios_slcblk; /* Slice table block number */ + __be16 ios_slccnt; /* Number of entries in slice table */ + u8 ios_unused1[122]; +}; + +/* + * combined volumeid and dkconfig block + */ + +struct dkblk0 { + struct volumeid dk_vid; + struct dkconfig dk_ios; +}; + +/* + * Slice Table Structure + */ + +struct slice { + __be32 nblocks; /* slice size (in blocks) */ + __be32 blkoff; /* block offset of slice */ +}; + + +int sysv68_partition(struct parsed_partitions *state) +{ + int i, slices; + int slot = 1; + Sector sect; + unsigned char *data; + struct dkblk0 *b; + struct slice *slice; + char tmp[64]; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + b = (struct dkblk0 *)data; + if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { + put_dev_sector(sect); + return 0; + } + slices = be16_to_cpu(b->dk_ios.ios_slccnt); + i = be32_to_cpu(b->dk_ios.ios_slcblk); + put_dev_sector(sect); + + data = read_part_sector(state, i, §); + if (!data) + return -1; + + slices -= 1; /* last slice is the whole disk */ + snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + slice = (struct slice *)data; + for (i = 0; i < slices; i++, slice++) { + if (slot == state->limit) + break; + if (be32_to_cpu(slice->nblocks)) { + put_partition(state, slot, + be32_to_cpu(slice->blkoff), + be32_to_cpu(slice->nblocks)); + snprintf(tmp, sizeof(tmp), "(s%u)", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h new file mode 100644 index 
00000000..bf2f5ffa --- /dev/null +++ b/fs/partitions/sysv68.h @@ -0,0 +1 @@ +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c new file mode 100644 index 00000000..8dbaf9f7 --- /dev/null +++ b/fs/partitions/ultrix.c @@ -0,0 +1,48 @@ +/* + * fs/partitions/ultrix.c + * + * Code extracted from drivers/block/genhd.c + * + * Re-organised Jul 1999 Russell King + */ + +#include "check.h" +#include "ultrix.h" + +int ultrix_partition(struct parsed_partitions *state) +{ + int i; + Sector sect; + unsigned char *data; + struct ultrix_disklabel { + s32 pt_magic; /* magic no. indicating part. info exits */ + s32 pt_valid; /* set by driver if pt is current */ + struct pt_info { + s32 pi_nblocks; /* no. of sectors */ + u32 pi_blkoff; /* block offset for start */ + } pt_part[8]; + } *label; + +#define PT_MAGIC 0x032957 /* Partition magic number */ +#define PT_VALID 1 /* Indicates if struct is valid */ + + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); + if (!data) + return -1; + + label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); + + if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { + for (i=0; i<8; i++) + if (label->pt_part[i].pi_nblocks) + put_partition(state, i+1, + label->pt_part[i].pi_blkoff, + label->pt_part[i].pi_nblocks); + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; + } else { + put_dev_sector(sect); + return 0; + } +} diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h new file mode 100644 index 00000000..a3cc00b2 --- /dev/null +++ b/fs/partitions/ultrix.h @@ -0,0 +1,5 @@ +/* + * fs/partitions/ultrix.h + */ + +int ultrix_partition(struct parsed_partitions *state); diff --git a/fs/pipe.c b/fs/pipe.c new file mode 100644 index 00000000..0499a962 --- /dev/null +++ b/fs/pipe.c @@ -0,0 +1,1326 @@ +/* + * linux/fs/pipe.c + * + * Copyright (C) 1991, 1992, 1999 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-size + */ +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; + +/* + * We use a start+len construction, which provides full use of the + * allocated memory. + * -- Florian Coosmann (FGC) + * + * Reads with count = 0 should always return 0. + * -- Julian Bradfield 1999-06-07. + * + * FIFOs and Pipes now generate SIGIO for both readers and writers. 
+ * -- Jeremy Elson 2001-08-16 + * + * pipe_read & write cleanup + * -- Manfred Spraul 2002-05-09 + */ + +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, subclass); +} + +void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); + } +} + +/* Drop the inode semaphore and wait for a pipe event, atomically */ +void pipe_wait(struct pipe_inode_info *pipe) +{ + DEFINE_WAIT(wait); + + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->wait, &wait); + pipe_lock(pipe); +} + +static int +pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, + int atomic) +{ + unsigned long copy; + + while (len > 0) { + while (!iov->iov_len) + iov++; + copy = min_t(unsigned long, len, iov->iov_len); + + if (atomic) { + if (__copy_from_user_inatomic(to, iov->iov_base, copy)) + return -EFAULT; + } else { + if (copy_from_user(to, iov->iov_base, copy)) + return -EFAULT; + } + to += copy; + len -= copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + return 0; +} + +static int +pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, + int atomic) +{ + unsigned long copy; + + while (len > 0) { + while (!iov->iov_len) + iov++; + copy = min_t(unsigned long, len, iov->iov_len); + + if (atomic) { + if (__copy_to_user_inatomic(iov->iov_base, from, copy)) + return -EFAULT; + } else { + if (copy_to_user(iov->iov_base, from, copy)) + return -EFAULT; + } + from += copy; + len -= copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + return 0; +} + +/* + * Attempt to pre-fault in the user memory, so we can use atomic copies. + * Returns the number of bytes not faulted in. + */ +static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + if (fault_in_pages_writeable(iov->iov_base, this_len)) + break; + + len -= this_len; + iov++; + } + + return len; +} + +/* + * Pre-fault in the user memory, so we can use atomic copies. + */ +static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + fault_in_pages_readable(iov->iov_base, this_len); + len -= this_len; + iov++; + } +} + +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. 
(Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); +} + +/** + * generic_pipe_buf_map - virtually map a pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be mapped + * @atomic: whether to use an atomic map + * + * Description: + * This function returns a kernel virtual address mapping for the + * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided + * and the caller has to be careful not to fault before calling + * the unmap function. + * + * Note that this function occupies KM_USER0 if @atomic != 0. + */ +void *generic_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + + return kmap(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_map); + +/** + * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be unmapped + * @map_data: the data that the mapping function returned + * + * Description: + * This function undoes the mapping that ->map() provided. + */ +void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_unmap); + +/** + * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to attempt to steal + * + * Description: + * This function attempts to steal the &struct page attached to + * @buf. If successful, this function returns 0 and returns with + * the page locked. The caller may then reuse the page for whatever + * he wishes; the typical use is insertion into a different file + * page cache. + */ +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * A reference of one is golden, that means that the owner of this + * page is the only one holding a reference to it. lock the page + * and return OK. + */ + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(generic_pipe_buf_steal); + +/** + * generic_pipe_buf_get - get a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to get a reference to + * + * Description: + * This function grabs an extra reference to @buf. It's used in + * in the tee() system call, when we duplicate the buffers in one + * pipe into another. + */ +void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_get); + +/** + * generic_pipe_buf_confirm - verify contents of the pipe buffer + * @info: the pipe that the buffer belongs to + * @buf: the buffer to confirm + * + * Description: + * This function does nothing, because the generic pipe code uses + * pages that are always good when inserted into the pipe. 
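+ * Pipe buffer types whose pages may not yet be up to date are expected
+ * to supply their own ->confirm() that waits for the page contents
+ * before the data is copied out.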
+ */ +int generic_pipe_buf_confirm(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + return 0; +} +EXPORT_SYMBOL(generic_pipe_buf_confirm); + +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. + */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_release); + +static const struct pipe_buf_operations anon_pipe_buf_ops = { + .can_merge = 1, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static const struct pipe_buf_operations packet_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t +pipe_read(struct kiocb *iocb, const struct iovec *_iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe; + int do_wakeup; + ssize_t ret; + struct iovec *iov = (struct iovec *)_iov; + size_t total_len; + + total_len = iov_length(iov, nr_segs); + /* Null read succeeds. */ + if (unlikely(total_len == 0)) + return 0; + + do_wakeup = 0; + ret = 0; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + for (;;) { + int bufs = pipe->nrbufs; + if (bufs) { + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; + const struct pipe_buf_operations *ops = buf->ops; + void *addr; + size_t chars = buf->len; + int error, atomic; + + if (chars > total_len) + chars = total_len; + + error = ops->confirm(pipe, buf); + if (error) { + if (!ret) + ret = error; + break; + } + + atomic = !iov_fault_in_pages_write(iov, chars); +redo: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); + ops->unmap(pipe, buf, addr); + if (unlikely(error)) { + /* + * Just retry with the slow path if we failed. + */ + if (atomic) { + atomic = 0; + goto redo; + } + if (!ret) + ret = error; + break; + } + ret += chars; + buf->offset += chars; + buf->len -= chars; + + /* Was it a packet buffer? Clean up and exit */ + if (buf->flags & PIPE_BUF_FLAG_PACKET) { + total_len = chars; + buf->len = 0; + } + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + curbuf = (curbuf + 1) & (pipe->buffers - 1); + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; + do_wakeup = 1; + } + total_len -= chars; + if (!total_len) + break; /* common path: read succeeded */ + } + if (bufs) /* More to do? */ + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + /* syscall merging: Usually we must not sleep + * if O_NONBLOCK is set, or if we got some data. + * But if a writer sleeps in kernel space, then + * we can wait for that data without violating POSIX. 
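+ * In short: with no writer blocked in pipe_write() we return what we
+ * already copied (or -EAGAIN for O_NONBLOCK); otherwise we keep
+ * waiting in pipe_wait() below for that writer's data.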
+ */ + if (ret) + break; + if (filp->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + pipe_wait(pipe); + } + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. */ + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + if (ret > 0) + file_accessed(filp); + return ret; +} + +static inline int is_packetized(struct file *file) +{ + return (file->f_flags & O_DIRECT) != 0; +} + +static ssize_t +pipe_write(struct kiocb *iocb, const struct iovec *_iov, + unsigned long nr_segs, loff_t ppos) +{ + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe; + ssize_t ret; + int do_wakeup; + struct iovec *iov = (struct iovec *)_iov; + size_t total_len; + ssize_t chars; + + total_len = iov_length(iov, nr_segs); + /* Null write succeeds. */ + if (unlikely(total_len == 0)) + return 0; + + do_wakeup = 0; + ret = 0; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + goto out; + } + + /* We try to merge small writes */ + chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; + const struct pipe_buf_operations *ops = buf->ops; + int offset = buf->offset + buf->len; + + if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error, atomic = 1; + void *addr; + + error = ops->confirm(pipe, buf); + if (error) + goto out; + + iov_fault_in_pages_read(iov, chars); +redo1: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_from_user(offset + addr, iov, + chars, atomic); + ops->unmap(pipe, buf, addr); + ret = error; + do_wakeup = 1; + if (error) { + if (atomic) { + atomic = 0; + goto redo1; + } + goto out; + } + buf->len += chars; + total_len -= chars; + ret = chars; + if (!total_len) + goto out; + } + } + + for (;;) { + int bufs; + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + bufs = pipe->nrbufs; + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; + char *src; + int error, atomic = 1; + + if (!page) { + page = alloc_page(GFP_HIGHUSER); + if (unlikely(!page)) { + ret = ret ? : -ENOMEM; + break; + } + pipe->tmp_page = page; + } + /* Always wake up, even if the copy fails. Otherwise + * we lock up (O_NONBLOCK-)readers that sleep due to + * syscall merging. + * FIXME! Is this really true? 
+ */ + do_wakeup = 1; + chars = PAGE_SIZE; + if (chars > total_len) + chars = total_len; + + iov_fault_in_pages_read(iov, chars); +redo2: + if (atomic) + src = kmap_atomic(page, KM_USER0); + else + src = kmap(page); + + error = pipe_iov_copy_from_user(src, iov, chars, + atomic); + if (atomic) + kunmap_atomic(src, KM_USER0); + else + kunmap(page); + + if (unlikely(error)) { + if (atomic) { + atomic = 0; + goto redo2; + } + if (!ret) + ret = error; + break; + } + ret += chars; + + /* Insert it into the buffer array */ + buf->page = page; + buf->ops = &anon_pipe_buf_ops; + buf->offset = 0; + buf->len = chars; + buf->flags = 0; + if (is_packetized(filp)) { + buf->ops = &packet_pipe_buf_ops; + buf->flags = PIPE_BUF_FLAG_PACKET; + } + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; + + total_len -= chars; + if (!total_len) + break; + } + if (bufs < pipe->buffers) + continue; + if (filp->f_flags & O_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } +out: + mutex_unlock(&inode->i_mutex); + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + if (ret > 0) + file_update_time(filp); + return ret; +} + +static ssize_t +bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + return -EBADF; +} + +static ssize_t +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) +{ + return -EBADF; +} + +static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe; + int count, buf, nrbufs; + + switch (cmd) { + case FIONREAD: + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + count = 0; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; + while (--nrbufs >= 0) { + count += pipe->bufs[buf].len; + buf = (buf+1) & (pipe->buffers - 1); + } + mutex_unlock(&inode->i_mutex); + + return put_user(count, (int __user *)arg); + default: + return -EINVAL; + } +} + +/* No kernel lock held - fine */ +static unsigned int +pipe_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask; + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); + + /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; + if (filp->f_mode & FMODE_READ) { + mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; + if (!pipe->writers && filp->f_version != pipe->w_counter) + mask |= POLLHUP; + } + + if (filp->f_mode & FMODE_WRITE) { + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; + /* + * Most Unices do not set POLLERR for FIFOs but on Linux they + * behave exactly like pipes for poll(). 
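+ * (so once the last reader has gone, a polling writer sees POLLERR on a
+ * FIFO just as it would on an anonymous pipe)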
+ */ + if (!pipe->readers) + mask |= POLLERR; + } + + return mask; +} + +static int +pipe_release(struct inode *inode, int decr, int decw) +{ + struct pipe_inode_info *pipe; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { + free_pipe_info(inode); + } else { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + mutex_unlock(&inode->i_mutex); + + return 0; +} + +static int +pipe_read_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); + + return retval; +} + + +static int +pipe_write_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); + + return retval; +} + + +static int +pipe_rdwr_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); + if (retval >= 0) { + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); + if (retval < 0) /* this can happen only if on == T */ + fasync_helper(-1, filp, 0, &pipe->fasync_readers); + } + mutex_unlock(&inode->i_mutex); + return retval; +} + + +static int +pipe_read_release(struct inode *inode, struct file *filp) +{ + return pipe_release(inode, 1, 0); +} + +static int +pipe_write_release(struct inode *inode, struct file *filp) +{ + return pipe_release(inode, 0, 1); +} + +static int +pipe_rdwr_release(struct inode *inode, struct file *filp) +{ + int decr, decw; + + decr = (filp->f_mode & FMODE_READ) != 0; + decw = (filp->f_mode & FMODE_WRITE) != 0; + return pipe_release(inode, decr, decw); +} + +static int +pipe_read_open(struct inode *inode, struct file *filp) +{ + int ret = -ENOENT; + + mutex_lock(&inode->i_mutex); + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->readers++; + } + + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static int +pipe_write_open(struct inode *inode, struct file *filp) +{ + int ret = -ENOENT; + + mutex_lock(&inode->i_mutex); + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->writers++; + } + + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static int +pipe_rdwr_open(struct inode *inode, struct file *filp) +{ + int ret = -ENOENT; + + mutex_lock(&inode->i_mutex); + + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) + inode->i_pipe->readers++; + if (filp->f_mode & FMODE_WRITE) + inode->i_pipe->writers++; + } + + mutex_unlock(&inode->i_mutex); + + return ret; +} + +/* + * The file_operations structs are not static because they + * are also used in linux/fs/fifo.c to do operations on FIFOs. + * + * Pipes reuse fifos' file_operations structs. 
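+ * read_pipefifo_fops, write_pipefifo_fops and rdwr_pipefifo_fops below
+ * cover O_RDONLY, O_WRONLY and O_RDWR opens; get_pipe_inode() installs
+ * the rdwr variant as the inode default, while create_read_pipe() and
+ * create_write_pipe() attach the single-direction variants to the two
+ * ends of an anonymous pipe.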
+ */ +const struct file_operations read_pipefifo_fops = { + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = bad_pipe_w, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .open = pipe_read_open, + .release = pipe_read_release, + .fasync = pipe_read_fasync, +}; + +const struct file_operations write_pipefifo_fops = { + .llseek = no_llseek, + .read = bad_pipe_r, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .open = pipe_write_open, + .release = pipe_write_release, + .fasync = pipe_write_fasync, +}; + +const struct file_operations rdwr_pipefifo_fops = { + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .open = pipe_rdwr_open, + .release = pipe_rdwr_release, + .fasync = pipe_rdwr_fasync, +}; + +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); + } + + return NULL; +} + +void __free_pipe_info(struct pipe_inode_info *pipe) +{ + int i; + + for (i = 0; i < pipe->buffers; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + if (buf->ops) + buf->ops->release(pipe, buf); + } + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe->bufs); + kfree(pipe); +} + +void free_pipe_info(struct inode *inode) +{ + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; +} + +static struct vfsmount *pipe_mnt __read_mostly; + +/* + * pipefs_dname() is called from d_path(). + */ +static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + dentry->d_inode->i_ino); +} + +static const struct dentry_operations pipefs_dentry_operations = { + .d_dname = pipefs_dname, +}; + +static struct inode * get_pipe_inode(void) +{ + struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; + + if (!inode) + goto fail_inode; + + inode->i_ino = get_next_ino(); + + pipe = alloc_pipe_info(inode); + if (!pipe) + goto fail_iput; + inode->i_pipe = pipe; + + pipe->readers = pipe->writers = 1; + inode->i_fop = &rdwr_pipefifo_fops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because "mark_inode_dirty()" will think + * that it already _is_ on the dirty list. 
+ */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + return inode; + +fail_iput: + iput(inode); + +fail_inode: + return NULL; +} + +struct file *create_write_pipe(int flags) +{ + int err; + struct inode *inode; + struct file *f; + struct path path; + struct qstr name = { .name = "" }; + + err = -ENFILE; + inode = get_pipe_inode(); + if (!inode) + goto err; + + err = -ENOMEM; + path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); + if (!path.dentry) + goto err_inode; + path.mnt = mntget(pipe_mnt); + + d_instantiate(path.dentry, inode); + + err = -ENFILE; + f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); + if (!f) + goto err_dentry; + f->f_mapping = inode->i_mapping; + + f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); + f->f_version = 0; + + return f; + + err_dentry: + free_pipe_info(inode); + path_put(&path); + return ERR_PTR(err); + + err_inode: + free_pipe_info(inode); + iput(inode); + err: + return ERR_PTR(err); +} + +void free_write_pipe(struct file *f) +{ + free_pipe_info(f->f_dentry->d_inode); + path_put(&f->f_path); + put_filp(f); +} + +struct file *create_read_pipe(struct file *wrf, int flags) +{ + /* Grab pipe from the writer */ + struct file *f = alloc_file(&wrf->f_path, FMODE_READ, + &read_pipefifo_fops); + if (!f) + return ERR_PTR(-ENFILE); + + path_get(&wrf->f_path); + f->f_flags = O_RDONLY | (flags & O_NONBLOCK); + + return f; +} + +int do_pipe_flags(int *fd, int flags) +{ + struct file *fw, *fr; + int error; + int fdw, fdr; + + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) + return -EINVAL; + + fw = create_write_pipe(flags); + if (IS_ERR(fw)) + return PTR_ERR(fw); + fr = create_read_pipe(fw, flags); + error = PTR_ERR(fr); + if (IS_ERR(fr)) + goto err_write_pipe; + + error = get_unused_fd_flags(flags); + if (error < 0) + goto err_read_pipe; + fdr = error; + + error = get_unused_fd_flags(flags); + if (error < 0) + goto err_fdr; + fdw = error; + + audit_fd_pair(fdr, fdw); + fd_install(fdr, fr); + fd_install(fdw, fw); + fd[0] = fdr; + fd[1] = fdw; + + return 0; + + err_fdr: + put_unused_fd(fdr); + err_read_pipe: + path_put(&fr->f_path); + put_filp(fr); + err_write_pipe: + free_write_pipe(fw); + return error; +} + +/* + * sys_pipe() is the normal C calling standard for creating + * a pipe. It's not the way Unix traditionally does this, though. + */ +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +{ + int fd[2]; + int error; + + error = do_pipe_flags(fd, flags); + if (!error) { + if (copy_to_user(fildes, fd, sizeof(fd))) { + sys_close(fd[0]); + sys_close(fd[1]); + error = -EFAULT; + } + } + return error; +} + +SYSCALL_DEFINE1(pipe, int __user *, fildes) +{ + return sys_pipe2(fildes, 0); +} + +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) +{ + struct pipe_buffer *bufs; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. 
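+ * The occupied buffers are copied to the front of the new array: e.g.
+ * with buffers == 16, curbuf == 14 and nrbufs == 4, slots 14-15 are
+ * copied first, then slots 0-1, and curbuf restarts at 0.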
+ */ + if (nr_pages < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + unsigned int tail; + unsigned int head; + + tail = pipe->curbuf + pipe->nrbufs; + if (tail < pipe->buffers) + tail = 0; + else + tail &= (pipe->buffers - 1); + + head = pipe->nrbufs - tail; + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = nr_pages; + return nr_pages * PAGE_SIZE; +} + +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: { + unsigned int size, nr_pages; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + ret = -EINVAL; + if (!nr_pages) + goto out; + + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { + ret = -EPERM; + goto out; + } + ret = pipe_set_size(pipe, nr_pages); + break; + } + case F_GETPIPE_SZ: + ret = pipe->buffers * PAGE_SIZE; + break; + default: + ret = -EINVAL; + break; + } + +out: + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +static const struct super_operations pipefs_ops = { + .destroy_inode = free_inode_nonrcu, +}; + +/* + * pipefs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pipe: will go nicely and kill the special-casing in procfs. 
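+ * (pipefs_dname() above formats these dentries as "pipe:[<inode>]", which
+ * is what readlink on /proc/<pid>/fd entries for pipes reports)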
+ */ +static struct dentry *pipefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + &pipefs_dentry_operations, PIPEFS_MAGIC); +} + +static struct file_system_type pipe_fs_type = { + .name = "pipefs", + .mount = pipefs_mount, + .kill_sb = kill_anon_super, +}; + +static int __init init_pipe_fs(void) +{ + int err = register_filesystem(&pipe_fs_type); + + if (!err) { + pipe_mnt = kern_mount(&pipe_fs_type); + if (IS_ERR(pipe_mnt)) { + err = PTR_ERR(pipe_mnt); + unregister_filesystem(&pipe_fs_type); + } + } + return err; +} + +static void __exit exit_pipe_fs(void) +{ + unregister_filesystem(&pipe_fs_type); + mntput(pipe_mnt); +} + +fs_initcall(init_pipe_fs); +module_exit(exit_pipe_fs); diff --git a/fs/pnode.c b/fs/pnode.c new file mode 100644 index 00000000..d42514e3 --- /dev/null +++ b/fs/pnode.c @@ -0,0 +1,371 @@ +/* + * linux/fs/pnode.c + * + * (C) Copyright IBM Corporation 2005. + * Released under GPL v2. + * Author : Ram Pai (linuxram@us.ibm.com) + * + */ +#include +#include +#include +#include "internal.h" +#include "pnode.h" + +/* return the next shared peer mount of @p */ +static inline struct vfsmount *next_peer(struct vfsmount *p) +{ + return list_entry(p->mnt_share.next, struct vfsmount, mnt_share); +} + +static inline struct vfsmount *first_slave(struct vfsmount *p) +{ + return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave); +} + +static inline struct vfsmount *next_slave(struct vfsmount *p) +{ + return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); +} + +/* + * Return true if path is reachable from root + * + * namespace_sem is held, and mnt is attached + */ +static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (mnt != root->mnt && mnt->mnt_parent != mnt) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, + struct mnt_namespace *ns, + const struct path *root) +{ + struct vfsmount *m = mnt; + + do { + /* Check the namespace first for optimization */ + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root)) + return m; + + m = next_peer(m); + } while (m != mnt); + + return NULL; +} + +/* + * Get ID of closest dominating peer group having a representative + * under the given root. + * + * Caller must hold namespace_sem + */ +int get_dominating_id(struct vfsmount *mnt, const struct path *root) +{ + struct vfsmount *m; + + for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { + struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root); + if (d) + return d->mnt_group_id; + } + + return 0; +} + +static int do_make_slave(struct vfsmount *mnt) +{ + struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; + struct vfsmount *slave_mnt; + + /* + * slave 'mnt' to a peer mount that has the + * same root dentry. If none is available then + * slave it to anything that is available. 
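+ * This is what eventually runs for "mount --make-slave /mnt": + * change_mnt_propagation() below calls do_make_slave() for every type + * except MS_SHARED, and then drops the slave linkage again unless the + * requested type really was MS_SLAVE.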
+ */ + while ((peer_mnt = next_peer(peer_mnt)) != mnt && + peer_mnt->mnt_root != mnt->mnt_root) ; + + if (peer_mnt == mnt) { + peer_mnt = next_peer(mnt); + if (peer_mnt == mnt) + peer_mnt = NULL; + } + if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + mnt_release_group_id(mnt); + + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + + if (peer_mnt) + master = peer_mnt; + + if (master) { + list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) + slave_mnt->mnt_master = master; + list_move(&mnt->mnt_slave, &master->mnt_slave_list); + list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + } else { + struct list_head *p = &mnt->mnt_slave_list; + while (!list_empty(p)) { + slave_mnt = list_first_entry(p, + struct vfsmount, mnt_slave); + list_del_init(&slave_mnt->mnt_slave); + slave_mnt->mnt_master = NULL; + } + } + mnt->mnt_master = master; + CLEAR_MNT_SHARED(mnt); + return 0; +} + +/* + * vfsmount lock must be held for write + */ +void change_mnt_propagation(struct vfsmount *mnt, int type) +{ + if (type == MS_SHARED) { + set_mnt_shared(mnt); + return; + } + do_make_slave(mnt); + if (type != MS_SLAVE) { + list_del_init(&mnt->mnt_slave); + mnt->mnt_master = NULL; + if (type == MS_UNBINDABLE) + mnt->mnt_flags |= MNT_UNBINDABLE; + else + mnt->mnt_flags &= ~MNT_UNBINDABLE; + } +} + +/* + * get the next mount in the propagation tree. + * @m: the mount seen last + * @origin: the original mount from where the tree walk initiated + * + * Note that peer groups form contiguous segments of slave lists. + * We rely on that in get_source() to be able to find out if + * vfsmount found while iterating with propagation_next() is + * a peer of one we'd found earlier. + */ +static struct vfsmount *propagation_next(struct vfsmount *m, + struct vfsmount *origin) +{ + /* are there any slaves of this mount? */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + + while (1) { + struct vfsmount *next; + struct vfsmount *master = m->mnt_master; + + if (master == origin->mnt_master) { + next = next_peer(m); + return ((next == origin) ? NULL : next); + } else if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + + /* back at master */ + m = master; + } +} + +/* + * return the source mount to be used for cloning + * + * @dest the current destination mount + * @last_dest the last seen destination mount + * @last_src the last seen source mount + * @type return CL_SLAVE if the new mount has to be + * cloned as a slave. + */ +static struct vfsmount *get_source(struct vfsmount *dest, + struct vfsmount *last_dest, + struct vfsmount *last_src, + int *type) +{ + struct vfsmount *p_last_src = NULL; + struct vfsmount *p_last_dest = NULL; + + while (last_dest != dest->mnt_master) { + p_last_dest = last_dest; + p_last_src = last_src; + last_dest = last_dest->mnt_master; + last_src = last_src->mnt_master; + } + + if (p_last_dest) { + do { + p_last_dest = next_peer(p_last_dest); + } while (IS_MNT_NEW(p_last_dest)); + /* is that a peer of the earlier? */ + if (dest == p_last_dest) { + *type = CL_MAKE_SHARED; + return p_last_src; + } + } + /* slave of the earlier, then */ + *type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(dest)) + *type |= CL_MAKE_SHARED; + return last_src; +} + +/* + * mount 'source_mnt' under the destination 'dest_mnt' at + * dentry 'dest_dentry'. And propagate that mount to + * all the peer and slave mounts of 'dest_mnt'. 
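+ * (For instance, when /mnt is shared and a filesystem is later mounted + * on /mnt/foo, a copy of that mount appears under every peer and slave + * of /mnt as well.)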
+ * Link all the new mounts into a propagation tree headed at + * source_mnt. Also link all the new mounts using ->mnt_list + * headed at source_mnt's ->mnt_list + * + * @dest_mnt: destination mount. + * @dest_dentry: destination dentry. + * @source_mnt: source mount. + * @tree_list : list of heads of trees to be attached. + */ +int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, + struct vfsmount *source_mnt, struct list_head *tree_list) +{ + struct vfsmount *m, *child; + int ret = 0; + struct vfsmount *prev_dest_mnt = dest_mnt; + struct vfsmount *prev_src_mnt = source_mnt; + LIST_HEAD(tmp_list); + LIST_HEAD(umount_list); + + for (m = propagation_next(dest_mnt, dest_mnt); m; + m = propagation_next(m, dest_mnt)) { + int type; + struct vfsmount *source; + + if (IS_MNT_NEW(m)) + continue; + + source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + + if (!(child = copy_tree(source, source->mnt_root, type))) { + ret = -ENOMEM; + list_splice(tree_list, tmp_list.prev); + goto out; + } + + if (is_subdir(dest_dentry, m->mnt_root)) { + mnt_set_mountpoint(m, dest_dentry, child); + list_add_tail(&child->mnt_hash, tree_list); + } else { + /* + * This can happen if the parent mount was bind mounted + * on some subdirectory of a shared/slave mount. + */ + list_add_tail(&child->mnt_hash, &tmp_list); + } + prev_dest_mnt = m; + prev_src_mnt = child; + } +out: + br_write_lock(vfsmount_lock); + while (!list_empty(&tmp_list)) { + child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); + umount_tree(child, 0, &umount_list); + } + br_write_unlock(vfsmount_lock); + release_mounts(&umount_list); + return ret; +} + +/* + * return true if the refcount is greater than count + */ +static inline int do_refcount_check(struct vfsmount *mnt, int count) +{ + int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts; + return (mycount > count); +} + +/* + * check if the mount 'mnt' can be unmounted successfully. + * @mnt: the mount to be checked for unmount + * NOTE: unmounting 'mnt' would naturally propagate to all + * other mounts its parent propagates to. + * Check if any of these mounts that **do not have submounts** + * have more references than 'refcnt'. If so return busy. + * + * vfsmount lock must be held for write + */ +int propagate_mount_busy(struct vfsmount *mnt, int refcnt) +{ + struct vfsmount *m, *child; + struct vfsmount *parent = mnt->mnt_parent; + int ret = 0; + + if (mnt == parent) + return do_refcount_check(mnt, refcnt); + + /* + * quickly check if the current mount can be unmounted. + * If not, we don't have to go checking for all other + * mounts + */ + if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) + return 1; + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt(m, mnt->mnt_mountpoint, 0); + if (child && list_empty(&child->mnt_mounts) && + (ret = do_refcount_check(child, 1))) + break; + } + return ret; +} + +/* + * NOTE: unmounting 'mnt' naturally propagates to all other mounts its + * parent propagates to. 
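+ * For example, umounting /mnt/foo under a shared /mnt also tears down + * the corresponding child mounts under every peer and slave of /mnt, + * provided those children carry no submounts of their own.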
+ */ +static void __propagate_umount(struct vfsmount *mnt) +{ + struct vfsmount *parent = mnt->mnt_parent; + struct vfsmount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + + struct vfsmount *child = __lookup_mnt(m, + mnt->mnt_mountpoint, 0); + /* + * umount the child only if the child has no + * other children + */ + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); + } +} + +/* + * collect all mounts that receive propagation from the mount in @list, + * and return these additional mounts in the same list. + * @list: the list of mounts to be unmounted. + * + * vfsmount lock must be held for write + */ +int propagate_umount(struct list_head *list) +{ + struct vfsmount *mnt; + + list_for_each_entry(mnt, list, mnt_hash) + __propagate_umount(mnt); + return 0; +} diff --git a/fs/pnode.h b/fs/pnode.h new file mode 100644 index 00000000..1ea4ae1e --- /dev/null +++ b/fs/pnode.h @@ -0,0 +1,39 @@ +/* + * linux/fs/pnode.h + * + * (C) Copyright IBM Corporation 2005. + * Released under GPL v2. + * + */ +#ifndef _LINUX_PNODE_H +#define _LINUX_PNODE_H + +#include +#include + +#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED) +#define IS_MNT_SLAVE(mnt) (mnt->mnt_master) +#define IS_MNT_NEW(mnt) (!mnt->mnt_ns) +#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED) +#define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE) + +#define CL_EXPIRE 0x01 +#define CL_SLAVE 0x02 +#define CL_COPY_ALL 0x04 +#define CL_MAKE_SHARED 0x08 +#define CL_PRIVATE 0x10 + +static inline void set_mnt_shared(struct vfsmount *mnt) +{ + mnt->mnt_flags &= ~MNT_SHARED_MASK; + mnt->mnt_flags |= MNT_SHARED; +} + +void change_mnt_propagation(struct vfsmount *, int); +int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *, + struct list_head *); +int propagate_umount(struct list_head *); +int propagate_mount_busy(struct vfsmount *, int); +void mnt_release_group_id(struct vfsmount *); +int get_dominating_id(struct vfsmount *mnt, const struct path *root); +#endif /* _LINUX_PNODE_H */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c new file mode 100644 index 00000000..b1cf6bf4 --- /dev/null +++ b/fs/posix_acl.c @@ -0,0 +1,388 @@ +/* + * linux/fs/posix_acl.c + * + * Copyright (C) 2002 by Andreas Gruenbacher + * + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, ). + */ + +/* + * This file contains generic functions for manipulating + * POSIX 1003.1e draft standard 17 ACLs. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(posix_acl_init); +EXPORT_SYMBOL(posix_acl_alloc); +EXPORT_SYMBOL(posix_acl_clone); +EXPORT_SYMBOL(posix_acl_valid); +EXPORT_SYMBOL(posix_acl_equiv_mode); +EXPORT_SYMBOL(posix_acl_from_mode); +EXPORT_SYMBOL(posix_acl_create_masq); +EXPORT_SYMBOL(posix_acl_chmod_masq); +EXPORT_SYMBOL(posix_acl_permission); + +/* + * Init a fresh posix_acl + */ +void +posix_acl_init(struct posix_acl *acl, int count) +{ + atomic_set(&acl->a_refcount, 1); + acl->a_count = count; +} + +/* + * Allocate a new ACL with the specified number of entries. + */ +struct posix_acl * +posix_acl_alloc(int count, gfp_t flags) +{ + const size_t size = sizeof(struct posix_acl) + + count * sizeof(struct posix_acl_entry); + struct posix_acl *acl = kmalloc(size, flags); + if (acl) + posix_acl_init(acl, count); + return acl; +} + +/* + * Clone an ACL. 
+ */ +struct posix_acl * +posix_acl_clone(const struct posix_acl *acl, gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + atomic_set(&clone->a_refcount, 1); + } + return clone; +} + +/* + * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. + */ +int +posix_acl_valid(const struct posix_acl *acl) +{ + const struct posix_acl_entry *pa, *pe; + int state = ACL_USER_OBJ; + unsigned int id = 0; /* keep gcc happy */ + int needs_mask = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) + return -EINVAL; + switch (pa->e_tag) { + case ACL_USER_OBJ: + if (state == ACL_USER_OBJ) { + id = 0; + state = ACL_USER; + break; + } + return -EINVAL; + + case ACL_USER: + if (state != ACL_USER) + return -EINVAL; + if (pa->e_id == ACL_UNDEFINED_ID || + pa->e_id < id) + return -EINVAL; + id = pa->e_id + 1; + needs_mask = 1; + break; + + case ACL_GROUP_OBJ: + if (state == ACL_USER) { + id = 0; + state = ACL_GROUP; + break; + } + return -EINVAL; + + case ACL_GROUP: + if (state != ACL_GROUP) + return -EINVAL; + if (pa->e_id == ACL_UNDEFINED_ID || + pa->e_id < id) + return -EINVAL; + id = pa->e_id + 1; + needs_mask = 1; + break; + + case ACL_MASK: + if (state != ACL_GROUP) + return -EINVAL; + state = ACL_OTHER; + break; + + case ACL_OTHER: + if (state == ACL_OTHER || + (state == ACL_GROUP && !needs_mask)) { + state = 0; + break; + } + return -EINVAL; + + default: + return -EINVAL; + } + } + if (state == 0) + return 0; + return -EINVAL; +} + +/* + * Returns 0 if the acl can be exactly represented in the traditional + * file mode permission bits, or else 1. Returns -E... on error. + */ +int +posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) +{ + const struct posix_acl_entry *pa, *pe; + mode_t mode = 0; + int not_equiv = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + mode |= (pa->e_perm & S_IRWXO) << 6; + break; + case ACL_GROUP_OBJ: + mode |= (pa->e_perm & S_IRWXO) << 3; + break; + case ACL_OTHER: + mode |= pa->e_perm & S_IRWXO; + break; + case ACL_MASK: + mode = (mode & ~S_IRWXG) | + ((pa->e_perm & S_IRWXO) << 3); + not_equiv = 1; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + default: + return -EINVAL; + } + } + if (mode_p) + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +/* + * Create an ACL representing the file mode permission bits of an inode. + */ +struct posix_acl * +posix_acl_from_mode(mode_t mode, gfp_t flags) +{ + struct posix_acl *acl = posix_acl_alloc(3, flags); + if (!acl) + return ERR_PTR(-ENOMEM); + + acl->a_entries[0].e_tag = ACL_USER_OBJ; + acl->a_entries[0].e_id = ACL_UNDEFINED_ID; + acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; + + acl->a_entries[1].e_tag = ACL_GROUP_OBJ; + acl->a_entries[1].e_id = ACL_UNDEFINED_ID; + acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; + + acl->a_entries[2].e_tag = ACL_OTHER; + acl->a_entries[2].e_id = ACL_UNDEFINED_ID; + acl->a_entries[2].e_perm = (mode & S_IRWXO); + return acl; +} + +/* + * Return 0 if current is granted want access to the inode + * by the acl. Returns -E... otherwise. 
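+ * For example, with an ACL of "user::rw- user:joe:rw- group::r-- + * mask::r-- other::---", joe ends up with read access only: his named + * user entry matches, but its permissions are filtered through the + * ACL_MASK entry below.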
+ */ +int +posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) +{ + const struct posix_acl_entry *pa, *pe, *mask_obj; + int found = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ + if (inode->i_uid == current_fsuid()) + goto check_perm; + break; + case ACL_USER: + if (pa->e_id == current_fsuid()) + goto mask; + break; + case ACL_GROUP_OBJ: + if (in_group_p(inode->i_gid)) { + found = 1; + if ((pa->e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (in_group_p(pa->e_id)) { + found = 1; + if ((pa->e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + else + goto check_perm; + default: + return -EIO; + } + } + return -EIO; + +mask: + for (mask_obj = pa+1; mask_obj != pe; mask_obj++) { + if (mask_obj->e_tag == ACL_MASK) { + if ((pa->e_perm & mask_obj->e_perm & want) == want) + return 0; + return -EACCES; + } + } + +check_perm: + if ((pa->e_perm & want) == want) + return 0; + return -EACCES; +} + +/* + * Modify acl when creating a new inode. The caller must ensure the acl is + * only referenced once. + * + * mode_p initially must contain the mode parameter to the open() / creat() + * system calls. All permissions that are not granted by the acl are removed. + * The permissions in the acl are changed to reflect the mode_p parameter. + */ +int +posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + mode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +/* + * Modify the ACL for the chmod syscall. 
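+ * For example, chmod 0750 rewrites the ACL_USER_OBJ entry to rwx, the + * ACL_MASK entry (or, if no mask is present, the ACL_GROUP_OBJ entry) to + * r-x, and the ACL_OTHER entry to ---; named user and group entries are + * left untouched.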
+ */ +int +posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) +{ + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + struct posix_acl_entry *pa, *pe; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm = (mode & S_IRWXU) >> 6; + break; + + case ACL_USER: + case ACL_GROUP: + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_MASK: + mask_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm = (mode & S_IRWXO); + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm = (mode & S_IRWXG) >> 3; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm = (mode & S_IRWXG) >> 3; + } + + return 0; +} diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 00000000..15af6222 --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,69 @@ +config PROC_FS + bool "/proc file system support" if EXPERT + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + +config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EXPERT + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in . Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. 
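+ + For example, "echo 1 > /proc/sys/net/ipv4/ip_forward" enables IP + forwarding at run time, without rebuilding the kernel or rebooting.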
+ +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EXPERT + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/fs/proc/Makefile b/fs/proc/Makefile new file mode 100644 index 00000000..c1c72933 --- /dev/null +++ b/fs/proc/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for the Linux proc filesystem routines. +# + +obj-y += proc.o + +proc-y := nommu.o task_nommu.o +proc-$(CONFIG_MMU) := mmu.o task_mmu.o + +proc-y += inode.o root.o base.o generic.o array.o \ + proc_tty.o +proc-y += cmdline.o +proc-y += consoles.o +proc-y += cpuinfo.o +proc-y += devices.o +proc-y += interrupts.o +proc-y += loadavg.o +proc-y += meminfo.o +proc-y += stat.o +proc-y += uptime.o +proc-y += version.o +proc-y += softirqs.o +proc-y += namespaces.o +proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o +proc-$(CONFIG_NET) += proc_net.o +proc-$(CONFIG_PROC_KCORE) += kcore.o +proc-$(CONFIG_PROC_VMCORE) += vmcore.o +proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o +proc-$(CONFIG_PRINTK) += kmsg.o +proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c new file mode 100644 index 00000000..9b45ee84 --- /dev/null +++ b/fs/proc/array.c @@ -0,0 +1,546 @@ +/* + * linux/fs/proc/array.c + * + * Copyright (C) 1992 by Linus Torvalds + * based on ideas by Darren Senn + * + * Fixes: + * Michael. K. Johnson: stat,statm extensions. + * + * + * Pauline Middelink : Made cmdline,envline only break at '\0's, to + * make sure SET_PROCTITLE works. Also removed + * bad '!' which forced address recalculation for + * EVERY character on the current page. + * + * + * Danny ter Haar : added cpuinfo + * + * + * Alessandro Rubini : profile extension. + * + * + * Jeff Tranter : added BogoMips field to cpuinfo + * + * + * Bruno Haible : remove 4K limit for the maps file + * + * + * Yves Arrouye : remove removal of trailing spaces in get_array. + * + * + * Jerome Forissier : added per-CPU time information to /proc/stat + * and /proc//cpu extension + * + * - Incorporation and non-SMP safe operation + * of forissier patch in 2.1.78 by + * Hans Marcus + * + * aeb@cwi.nl : /proc/partitions + * + * + * Alan Cox : security fixes. + * + * + * Al Viro : safe handling of mm_struct + * + * Gerhard Wichert : added BIGMEM support + * Siemens AG + * + * Al Viro & Jeff Garzik : moved most of the thing into base.c and + * : proc_misc.c. The rest may eventually go into + * : base.c too. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "internal.h" + +static inline void task_name(struct seq_file *m, struct task_struct *p) +{ + int i; + char *buf, *end; + char *name; + char tcomm[sizeof(p->comm)]; + + get_task_comm(tcomm, p); + + seq_puts(m, "Name:\t"); + end = m->buf + m->size; + buf = m->buf + m->count; + name = tcomm; + i = sizeof(tcomm); + while (i && (buf < end)) { + unsigned char c = *name; + name++; + i--; + *buf = c; + if (!c) + break; + if (c == '\\') { + buf++; + if (buf < end) + *buf++ = c; + continue; + } + if (c == '\n') { + *buf++ = '\\'; + if (buf < end) + *buf++ = 'n'; + continue; + } + buf++; + } + m->count = buf - m->buf; + seq_putc(m, '\n'); +} + +/* + * The task state array is a strange "bitmap" of + * reasons to sleep. Thus "running" is zero, and + * you can test for combinations of others with + * simple bit tests. + */ +static const char * const task_state_array[] = { + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ +}; + +static inline const char *get_task_state(struct task_struct *tsk) +{ + unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; + const char * const *p = &task_state_array[0]; + + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + + while (state) { + p++; + state >>= 1; + } + return *p; +} + +static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p) +{ + struct group_info *group_info; + int g; + struct fdtable *fdt = NULL; + const struct cred *cred; + pid_t ppid, tpid; + + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = tracehook_tracer_task(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + } + cred = get_task_cred(p); + seq_printf(m, + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), + task_tgid_nr_ns(p, ns), + pid_nr_ns(pid, ns), + ppid, tpid, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); + seq_printf(m, + "FDSize:\t%d\n" + "Groups:\t", + fdt ? 
fdt->max_fds : 0); + rcu_read_unlock(); + + group_info = cred->group_info; + task_unlock(p); + + for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + seq_printf(m, "%d ", GROUP_AT(group_info, g)); + put_cred(cred); + + seq_putc(m, '\n'); +} + +static void render_sigset_t(struct seq_file *m, const char *header, + sigset_t *set) +{ + int i; + + seq_puts(m, header); + + i = _NSIG; + do { + int x = 0; + + i -= 4; + if (sigismember(set, i+1)) x |= 1; + if (sigismember(set, i+2)) x |= 2; + if (sigismember(set, i+3)) x |= 4; + if (sigismember(set, i+4)) x |= 8; + seq_printf(m, "%x", x); + } while (i >= 4); + + seq_putc(m, '\n'); +} + +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, + sigset_t *catch) +{ + struct k_sigaction *k; + int i; + + k = p->sighand->action; + for (i = 1; i <= _NSIG; ++i, ++k) { + if (k->sa.sa_handler == SIG_IGN) + sigaddset(ign, i); + else if (k->sa.sa_handler != SIG_DFL) + sigaddset(catch, i); + } +} + +static inline void task_sig(struct seq_file *m, struct task_struct *p) +{ + unsigned long flags; + sigset_t pending, shpending, blocked, ignored, caught; + int num_threads = 0; + unsigned long qsize = 0; + unsigned long qlim = 0; + + sigemptyset(&pending); + sigemptyset(&shpending); + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); + + if (lock_task_sighand(p, &flags)) { + pending = p->pending.signal; + shpending = p->signal->shared_pending.signal; + blocked = p->blocked; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = get_nr_threads(p); + rcu_read_lock(); /* FIXME: is this correct? */ + qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); + qlim = task_rlimit(p, RLIMIT_SIGPENDING); + unlock_task_sighand(p, &flags); + } + + seq_printf(m, "Threads:\t%d\n", num_threads); + seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); + + /* render them all */ + render_sigset_t(m, "SigPnd:\t", &pending); + render_sigset_t(m, "ShdPnd:\t", &shpending); + render_sigset_t(m, "SigBlk:\t", &blocked); + render_sigset_t(m, "SigIgn:\t", &ignored); + render_sigset_t(m, "SigCgt:\t", &caught); +} + +static void render_cap_t(struct seq_file *m, const char *header, + kernel_cap_t *a) +{ + unsigned __capi; + + seq_puts(m, header); + CAP_FOR_EACH_U32(__capi) { + seq_printf(m, "%08x", + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); + } + seq_putc(m, '\n'); +} + +static inline void task_cap(struct seq_file *m, struct task_struct *p) +{ + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); +} + +static inline void task_context_switch_counts(struct seq_file *m, + struct task_struct *p) +{ + seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" + "nonvoluntary_ctxt_switches:\t%lu\n", + p->nvcsw, + p->nivcsw); +} + +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "Cpus_allowed:\t"); + seq_cpumask(m, &task->cpus_allowed); + seq_putc(m, '\n'); + seq_puts(m, "Cpus_allowed_list:\t"); + seq_cpumask_list(m, &task->cpus_allowed); + seq_putc(m, '\n'); +} + +int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid 
*pid, struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + + task_name(m, task); + task_state(m, ns, pid, task); + + if (mm) { + task_mem(m, mm); + mmput(mm); + } + task_sig(m, task); + task_cap(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); + task_context_switch_counts(m, task); + return 0; +} + +static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task, int whole) +{ + unsigned long vsize, eip, esp, wchan = ~0UL; + long priority, nice; + int tty_pgrp = -1, tty_nr = 0; + sigset_t sigign, sigcatch; + char state; + pid_t ppid = 0, pgid = -1, sid = -1; + int num_threads = 0; + int permitted; + struct mm_struct *mm; + unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; + unsigned long rsslim = 0; + char tcomm[sizeof(task->comm)]; + unsigned long flags; + + state = *get_task_state(task); + vsize = eip = esp = 0; + permitted = ptrace_may_access(task, PTRACE_MODE_READ); + mm = get_task_mm(task); + if (mm) { + vsize = task_vsize(mm); + if (permitted) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } + } + + get_task_comm(tcomm, task); + + sigemptyset(&sigign); + sigemptyset(&sigcatch); + cutime = cstime = utime = stime = cputime_zero; + cgtime = gtime = cputime_zero; + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + if (sig->tty) { + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); + tty_nr = new_encode_dev(tty_devnum(sig->tty)); + } + + num_threads = get_nr_threads(task); + collect_sigign_sigcatch(task, &sigign, &sigcatch); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime = cputime_add(gtime, t->gtime); + t = next_thread(t); + } while (t != task); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + thread_group_times(task, &utime, &stime); + gtime = cputime_add(gtime, sig->gtime); + } + + sid = task_session_nr_ns(task, ns); + ppid = task_tgid_nr_ns(task->real_parent, ns); + pgid = task_pgrp_nr_ns(task, ns); + + unlock_task_sighand(task, &flags); + } + + if (permitted && (!whole || num_threads < 2)) + wchan = get_wchan(task); + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + task_times(task, &utime, &stime); + gtime = task->gtime; + } + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); + nice = task_nice(task); + + /* Temporary variable needed for gcc-2.96 */ + /* convert timespec -> nsec*/ + start_time = + (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC + + task->real_start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + + seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ +%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", + pid_nr_ns(pid, ns), + tcomm, + state, + ppid, + pgid, + sid, + tty_nr, + tty_pgrp, + task->flags, + min_flt, + cmin_flt, + maj_flt, + cmaj_flt, + 
cputime_to_clock_t(utime), + cputime_to_clock_t(stime), + cputime_to_clock_t(cutime), + cputime_to_clock_t(cstime), + priority, + nice, + num_threads, + start_time, + vsize, + mm ? get_mm_rss(mm) : 0, + rsslim, + mm ? (permitted ? mm->start_code : 1) : 0, + mm ? (permitted ? mm->end_code : 1) : 0, + (permitted && mm) ? mm->start_stack : 0, + esp, + eip, + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + task->pending.signal.sig[0] & 0x7fffffffUL, + task->blocked.sig[0] & 0x7fffffffUL, + sigign .sig[0] & 0x7fffffffUL, + sigcatch .sig[0] & 0x7fffffffUL, + wchan, + 0UL, + 0UL, + task->exit_signal, + task_cpu(task), + task->rt_priority, + task->policy, + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); + if (mm) + mmput(mm); + return 0; +} + +int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 0); +} + +int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 1); +} + +int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; + struct mm_struct *mm = get_task_mm(task); + + if (mm) { + size = task_statm(mm, &shared, &text, &data, &resident); + mmput(mm); + } + seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + size, resident, shared, text, data); + + return 0; +} diff --git a/fs/proc/base.c b/fs/proc/base.c new file mode 100644 index 00000000..bfa13ac6 --- /dev/null +++ b/fs/proc/base.c @@ -0,0 +1,3429 @@ +/* + * linux/fs/proc/base.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc base directory handling functions + * + * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. + * Instead of using magical inumbers to determine the kind of object + * we allocate and fill in-core inodes upon lookup. They don't even + * go into icache. We cache the reference to task_struct upon lookup too. + * Eventually it should become a filesystem in its own. We don't use the + * rest of procfs anymore. + * + * + * Changelog: + * 17-Jan-2005 + * Allan Bezerra + * Bruna Moreira + * Edjard Mota + * Ilias Biris + * Mauricio Lin + * + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * + * A new process specific entry (smaps) included in /proc. It shows the + * size of rss for each memory area. The maps entry lacks information + * about physical memory size (rss) for each mapped file, i.e., + * rss information for executables and library files. + * This additional information is useful for any tools that need to know + * about physical memory consumption for a process specific library. + * + * Changelog: + * 21-Feb-2005 + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * Pud inclusion in the page table walking. + * + * ChangeLog: + * 10-Mar-2005 + * 10LE Instituto Nokia de Tecnologia - INdT: + * A better way to walks through the page table as suggested by Hugh Dickins. + * + * Simo Piiroinen : + * Smaps information related to shared, private, clean and dirty pages. + * + * Paul Mundt : + * Overall revision about smaps. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_HARDWALL +#include +#endif +#include "internal.h" + +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + +struct pid_entry { + char *name; + int len; + mode_t mode; + const struct inode_operations *iop; + const struct file_operations *fop; + union proc_op op; +}; + +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .name = (NAME), \ + .len = sizeof(NAME) - 1, \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ +} + +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define INF(NAME, MODE, read) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_info_file_operations, \ + { .proc_read = read } ) +#define ONE(NAME, MODE, show) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_single_file_operations, \ + { .proc_show = show } ) + +/* ANDROID is for special files in /proc. */ +#define ANDROID(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + &proc_##OTYPE##_inode_operations, \ + &proc_##OTYPE##_operations, {}) + +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. 
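+ * The result is typically used to seed i_nlink for a /proc/<pid> (or + * /proc/<pid>/task/<tid>) directory as 2 plus the number of DIR() + * entries in its table.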
+ */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + +static int get_task_root(struct task_struct *task, struct path *root) +{ + int result = -ENOENT; + + task_lock(task); + if (task->fs) { + get_fs_root(task->fs, root); + result = 0; + } + task_unlock(task); + return result; +} + +static int proc_cwd_link(struct inode *inode, struct path *path) +{ + struct task_struct *task = get_proc_task(inode); + int result = -ENOENT; + + if (task) { + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; + } + task_unlock(task); + put_task_struct(task); + } + return result; +} + +static int proc_root_link(struct inode *inode, struct path *path) +{ + struct task_struct *task = get_proc_task(inode); + int result = -ENOENT; + + if (task) { + result = get_task_root(task, path); + put_task_struct(task); + } + return result; +} + +static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, PTRACE_MODE_READ) && + !capable(CAP_SYS_RESOURCE)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + +struct mm_struct *mm_for_maps(struct task_struct *task) +{ + return mm_access(task, PTRACE_MODE_READ); +} + +static int proc_pid_cmdline(struct task_struct *task, char * buffer) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + // If the nul at the end of args has been overwritten, then + // assume application is using setproctitle(3). + if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > PAGE_SIZE - res) + len = PAGE_SIZE - res; + res += access_process_vm(task, mm->env_start, buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} + +static int proc_pid_auxv(struct task_struct *task, char *buffer) +{ + struct mm_struct *mm = mm_for_maps(task); + int res = PTR_ERR(mm); + if (mm && !IS_ERR(mm)) { + unsigned int nwords = 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + res = nwords * sizeof(mm->saved_auxv[0]); + if (res > PAGE_SIZE) + res = PAGE_SIZE; + memcpy(buffer, mm->saved_auxv, res); + mmput(mm); + } + return res; +} + + +#ifdef CONFIG_KALLSYMS +/* + * Provides a wchan file via kallsyms in a proper one-value-per-file format. + * Returns the resolved symbol. If that fails, simply return the address. 
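+ * For example, /proc/<pid>/wchan for a task blocked in nanosleep might + * show a symbol such as hrtimer_nanosleep rather than a raw kernel + * address.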
+ */ +static int proc_pid_wchan(struct task_struct *task, char *buffer) +{ + unsigned long wchan; + char symname[KSYM_NAME_LEN]; + + wchan = get_wchan(task); + + if (lookup_symbol_name(wchan, symname) < 0) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return 0; + else + return sprintf(buffer, "%lu", wchan); + else + return sprintf(buffer, "%s", symname); +} +#endif /* CONFIG_KALLSYMS */ + +static int lock_trace(struct task_struct *task) +{ + int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + mutex_unlock(&task->signal->cred_guard_mutex); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + mutex_unlock(&task->signal->cred_guard_mutex); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct stack_trace trace; + unsigned long *entries; + int err; + int i; + + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_TRACE_DEPTH; + trace.entries = entries; + trace.skip = 0; + + err = lock_trace(task); + if (!err) { + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%pK>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + unlock_trace(task); + } + kfree(entries); + + return err; +} +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * Provides /proc/PID/schedstat + */ +static int proc_pid_schedstat(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); +} +#endif + +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct inode *inode = m->private; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + seq_puts(m, "Latency Top version : v0.1\n"); + for (i = 0; i < 32; i++) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { + int q; + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long bt = lr->backtrace[q]; + if (!bt) + break; + if (bt == ULONG_MAX) + break; + seq_printf(m, " %ps", (void *)bt); + } + seq_putc(m, '\n'); + } + + } + put_task_struct(task); + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lstats_show_proc, inode); +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + + if (!task) + return -ESRCH; + clear_all_latency_tracing(task); + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +static int proc_oom_score(struct task_struct *task, char *buffer) +{ + unsigned long points = 0; + + read_lock(&tasklist_lock); + if (pid_alive(task)) + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); + read_unlock(&tasklist_lock); + return sprintf(buffer, "%lu\n", points); +} + +struct limit_names { + char *name; + char *unit; +}; + 
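+/* + * Human-readable labels for each rlimit as printed in /proc/<pid>/limits; + * e.g. RLIMIT_NOFILE is reported as "Max open files" with the unit + * "files". + */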
+static const struct limit_names lnames[RLIM_NLIMITS] = { + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, + [RLIMIT_FSIZE] = {"Max file size", "bytes"}, + [RLIMIT_DATA] = {"Max data size", "bytes"}, + [RLIMIT_STACK] = {"Max stack size", "bytes"}, + [RLIMIT_CORE] = {"Max core file size", "bytes"}, + [RLIMIT_RSS] = {"Max resident set", "bytes"}, + [RLIMIT_NPROC] = {"Max processes", "processes"}, + [RLIMIT_NOFILE] = {"Max open files", "files"}, + [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, + [RLIMIT_AS] = {"Max address space", "bytes"}, + [RLIMIT_LOCKS] = {"Max file locks", "locks"}, + [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, + [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, + [RLIMIT_NICE] = {"Max nice priority", NULL}, + [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, + [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, +}; + +/* Display limits for a process */ +static int proc_pid_limits(struct task_struct *task, char *buffer) +{ + unsigned int i; + int count = 0; + unsigned long flags; + char *bufptr = buffer; + + struct rlimit rlim[RLIM_NLIMITS]; + + if (!lock_task_sighand(task, &flags)) + return 0; + memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); + unlock_task_sighand(task, &flags); + + /* + * print the file header + */ + count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + + for (i = 0; i < RLIM_NLIMITS; i++) { + if (rlim[i].rlim_cur == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-25s %-20s ", + lnames[i].name, "unlimited"); + else + count += sprintf(&bufptr[count], "%-25s %-20lu ", + lnames[i].name, rlim[i].rlim_cur); + + if (rlim[i].rlim_max == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + else + count += sprintf(&bufptr[count], "%-20lu ", + rlim[i].rlim_max); + + if (lnames[i].unit) + count += sprintf(&bufptr[count], "%-10s\n", + lnames[i].unit); + else + count += sprintf(&bufptr[count], "\n"); + } + + return count; +} + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct task_struct *task, char *buffer) +{ + long nr; + unsigned long args[6], sp, pc; + int res = lock_trace(task); + if (res) + return res; + + if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + res = sprintf(buffer, "running\n"); + else if (nr < 0) + res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else + res = sprintf(buffer, + "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + nr, + args[0], args[1], args[2], args[3], args[4], args[5], + sp, pc); + unlock_trace(task); + return res; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + +/************************************************************************/ +/* Here the fs part begins */ +/************************************************************************/ + +/* permission checks */ +static int proc_fd_access_allowed(struct inode *inode) +{ + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. 
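+ * In practice this means "ls -l /proc/<pid>/fd" works for your own + * processes, while inspecting another user's fds normally needs + * something like CAP_SYS_PTRACE.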
+ */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_access(task, PTRACE_MODE_READ); + put_task_struct(task); + } + return allowed; +} + +int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + +static int mounts_open_common(struct inode *inode, struct file *file, + const struct seq_operations *op) +{ + struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; + struct mnt_namespace *ns = NULL; + struct path root; + struct proc_mounts *p; + int ret = -EINVAL; + + if (task) { + rcu_read_lock(); + nsp = task_nsproxy(task); + if (nsp) { + ns = nsp->mnt_ns; + if (ns) + get_mnt_ns(ns); + } + rcu_read_unlock(); + if (ns && get_task_root(task, &root) == 0) + ret = 0; + put_task_struct(task); + } + + if (!ns) + goto err; + if (ret) + goto err_put_ns; + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, op); + if (ret) + goto err_free; + + p->m.private = p; + p->ns = ns; + p->root = root; + p->event = ns->event; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: + return ret; +} + +static int mounts_release(struct inode *inode, struct file *file) +{ + struct proc_mounts *p = file->private_data; + path_put(&p->root); + put_mnt_ns(p->ns); + return seq_release(inode, file); +} + +static unsigned mounts_poll(struct file *file, poll_table *wait) +{ + struct proc_mounts *p = file->private_data; + unsigned res = POLLIN | POLLRDNORM; + + poll_wait(file, &p->ns->poll, wait); + if (mnt_had_events(p)) + res |= POLLERR | POLLPRI; + + return res; +} + +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mounts_op); +} + +static const struct file_operations proc_mounts_operations = { + .open = mounts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +static int mountinfo_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mountinfo_op); +} + +static const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mountstats_op); +} + +static const struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; + +#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ + +static ssize_t proc_info_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + unsigned long page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + + if (count > 
PROC_BLOCK_SIZE) + count = PROC_BLOCK_SIZE; + + length = -ENOMEM; + if (!(page = __get_free_page(GFP_TEMPORARY))) + goto out; + + length = PROC_I(inode)->op.proc_read(task, (char*)page); + + if (length >= 0) + length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); + free_page(page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_info_file_operations = { + .read = proc_info_read, + .llseek = generic_file_llseek, +}; + +static int proc_single_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct pid_namespace *ns; + struct pid *pid; + struct task_struct *task; + int ret; + + ns = inode->i_sb->s_fs_info; + pid = proc_pid(inode); + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); + + put_task_struct(task); + return ret; +} + +static int proc_single_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, proc_single_show, inode); +} + +static const struct file_operations proc_single_file_operations = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_open(struct inode* inode, struct file* file) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct mm_struct *mm; + + if (!task) + return -ESRCH; + + mm = mm_access(task, PTRACE_MODE_ATTACH); + put_task_struct(task); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + file->private_data = mm; + + return 0; +} + +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; + char *page; + + if (!mm) + return 0; + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + + while (count > 0) { + int this_len = min_t(int, count, PAGE_SIZE); + + if (write && copy_from_user(page, buf, this_len)) { + copied = -EFAULT; + break; + } + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { + if (!copied) + copied = -EIO; + break; + } + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; + } + *ppos = addr; + + mmput(mm); +free: + free_page((unsigned long) page); + return copied; +} + +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +#define mem_write NULL + +#ifndef mem_write +/* This is a security hazard */ +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} +#endif + +loff_t mem_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) 
+ mmdrop(mm); + return 0; +} + +static const struct file_operations proc_mem_operations = { + .llseek = mem_lseek, + .read = mem_read, + .write = mem_write, + .open = mem_open, + .release = mem_release, +}; + +static ssize_t environ_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char *page; + unsigned long src = *ppos; + int ret = -ESRCH; + struct mm_struct *mm; + + if (!task) + goto out_no_task; + + ret = -ENOMEM; + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + ret = 0; + while (count > 0) { + int this_len, retval, max_len; + + this_len = mm->env_end - (mm->env_start + src); + + if (this_len <= 0) + break; + + max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; + this_len = (this_len > max_len) ? max_len : this_len; + + retval = access_process_vm(task, (mm->env_start + src), + page, this_len, 0); + + if (retval <= 0) { + ret = retval; + break; + } + + if (copy_to_user(buf, page, retval)) { + ret = -EFAULT; + break; + } + + ret += retval; + src += retval; + buf += retval; + count -= retval; + } + *ppos = src; + + mmput(mm); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return ret; +} + +static const struct file_operations proc_environ_operations = { + .read = environ_read, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_adjust_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int oom_adjust = OOM_DISABLE; + unsigned long flags; + + if (!task) + return -ESRCH; + + if (lock_task_sighand(task, &flags)) { + oom_adjust = task->signal->oom_adj; + unlock_task_sighand(task, &flags); + } + + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adjust_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adjust; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adjust); + if (err) + goto out; + if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); + } + + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. 
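As an aside, the legacy oom_adj value accepted here is rescaled onto the oom_score_adj range just below. A minimal userspace sketch of that arithmetic, assuming the conventional constant values from include/linux/oom.h of this era (OOM_DISABLE = -17, OOM_ADJUST_MAX = 15, OOM_SCORE_ADJ_MAX = 1000); this is an editorial illustration, not part of the patch:

#include <stdio.h>

#define OOM_DISABLE        (-17)
#define OOM_ADJUST_MAX       15
#define OOM_SCORE_ADJ_MAX  1000

/* Mirror of the scaling done by oom_adjust_write() below. */
static int adj_to_score_adj(int oom_adj)
{
        if (oom_adj == OOM_ADJUST_MAX)
                return OOM_SCORE_ADJ_MAX;  /* clamp so the maximum stays reachable */
        return oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;
}

int main(void)
{
        int samples[] = { OOM_DISABLE, -8, 0, 8, OOM_ADJUST_MAX };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("oom_adj %3d -> oom_score_adj %5d\n",
                       samples[i], adj_to_score_adj(samples[i]));
        return 0;       /* prints -1000, -470, 0, 470, 1000 */
}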
+ */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " + "please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), + task_pid_nr(task), task_pid_nr(task)); + task->signal->oom_adj = oom_adjust; + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static int oom_adjust_permission(struct inode *inode, int mask, + unsigned int flags) +{ + uid_t uid; + struct task_struct *p; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + p = get_proc_task(inode); + if(p) { + uid = task_uid(p); + put_task_struct(p); + } + + /* + * System Server (uid == 1000) is granted access to oom_adj of all + * android applications (uid > 10000) as and services (uid >= 1000) + */ + if (p && (current_fsuid() == 1000) && (uid >= 1000)) { + if (inode->i_mode >> 6 & mask) { + return 0; + } + } + + /* Fall back to default. */ + return generic_permission(inode, mask, flags, NULL); +} + +static const struct inode_operations proc_oom_adjust_inode_operations = { + .permission = oom_adjust_permission, +}; + +static const struct file_operations proc_oom_adjust_operations = { + .read = oom_adjust_read, + .write = oom_adjust_write, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + int oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); + if (err) + goto out; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_score_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } + task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, 
CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. + */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_AUDITSYSCALL +#define TMPBUFLEN 21 +static ssize_t proc_loginuid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_loginuid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page, *tmp; + ssize_t length; + uid_t loginuid; + + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; + + if (*ppos != 0) { + /* No partial writes. */ + return -EINVAL; + } + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free_page; + + page[count] = '\0'; + loginuid = simple_strtoul(page, &tmp, 10); + if (tmp == page) { + length = -EINVAL; + goto out_free_page; + + } + length = audit_set_loginuid(current, loginuid); + if (likely(length == 0)) + length = count; + +out_free_page: + free_page((unsigned long) page); + return length; +} + +static const struct file_operations proc_loginuid_operations = { + .read = proc_loginuid_read, + .write = proc_loginuid_write, + .llseek = generic_file_llseek, +}; + +static ssize_t proc_sessionid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_sessionid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static const struct file_operations proc_sessionid_operations = { + .read = proc_sessionid_read, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_FAULT_INJECTION +static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int make_it_fail; + + if (!task) + return -ESRCH; + make_it_fail = task->make_it_fail; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t proc_fault_inject_write(struct file * file, + 
const char __user * buf, size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + int make_it_fail; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + make_it_fail = simple_strtol(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; + task->make_it_fail = make_it_fail; + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_fault_inject_operations = { + .read = proc_fault_inject_read, + .write = proc_fault_inject_write, + .llseek = generic_file_llseek, +}; +#endif + + +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + + memset(buffer, 0, 
sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) + set_task_comm(p, buffer); + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + task_lock(p); + seq_printf(m, "%s\n", p->comm); + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, comm_show, inode); +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int proc_exe_link(struct inode *inode, struct path *exe_path) +{ + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ENOENT; + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + +static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int error = -EACCES; + + /* We don't need a base pointer in the /proc filesystem */ + path_put(&nd->path); + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); +out: + return ERR_PTR(error); +} + +static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +{ + char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *pathname; + int len; + + if (!tmp) + return -ENOMEM; + + pathname = d_path(path, tmp, PAGE_SIZE); + len = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + len = tmp + PAGE_SIZE - 1 - pathname; + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, pathname, len)) + len = -EFAULT; + out: + free_page((unsigned long)tmp); + return len; +} + +static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) +{ + int error = -EACCES; + struct inode *inode = dentry->d_inode; + struct path path; + + /* Are we allowed to snoop on the tasks file descriptors? 
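The proc_exe_link()/do_proc_readlink() pair above is what backs readlink(2) on /proc/<pid>/exe (and, with the other link helpers, cwd and root). A minimal userspace sketch, purely illustrative:

#include <stdio.h>
#include <unistd.h>
#include <limits.h>

int main(void)
{
        char buf[PATH_MAX];
        /* Resolved in the kernel by proc_pid_readlink() -> do_proc_readlink(). */
        ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);

        if (len < 0) {
                perror("readlink");
                return 1;
        }
        buf[len] = '\0';        /* readlink() does not NUL-terminate */
        printf("running binary: %s\n", buf);
        return 0;
}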
*/ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(inode, &path); + if (error) + goto out; + + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); +out: + return error; +} + +static const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, +}; + + +/* building an inode */ + +static int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if(dumpable == 1) + return 1; + return 0; +} + +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +{ + struct inode * inode; + struct proc_inode *ei; + const struct cred *cred; + + /* We need a new inode */ + + inode = new_inode(sb); + if (!inode) + goto out; + + /* Common stuff */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &proc_def_inode_operations; + + /* + * grab the reference to task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_unlock; + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } + security_task_to_inode(task, inode); + +out: + return inode; + +out_unlock: + iput(inode); + return NULL; +} + +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + const struct cred *cred; + + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + cred = __task_cred(task); + stat->uid = cred->euid; + stat->gid = cred->egid; + } + } + rcu_read_unlock(); + return 0; +} + +/* dentry stuff */ + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + * + * Rewrite the inode's ownerships here because the owning task may have + * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ +int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct task_struct *task; + const struct cred *cred; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + d_drop(dentry); + return 0; +} + +static int pid_delete_dentry(const struct dentry * dentry) +{ + /* Is the task we represent dead? 
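One observable consequence of pid_revalidate() above: while a task remains dumpable, the ownership reported for its /proc/<pid> directory tracks the task's effective credentials. A hypothetical userspace check (illustration only; a non-dumpable task is reported as owned by root instead):

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;

        if (stat("/proc/self", &st) != 0) {
                perror("stat");
                return 1;
        }
        /* For a dumpable task these two pairs should match. */
        printf("euid=%u egid=%u  /proc/self uid=%u gid=%u\n",
               (unsigned)geteuid(), (unsigned)getegid(),
               (unsigned)st.st_uid, (unsigned)st.st_gid);
        return 0;
}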
+ * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +const struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating the + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr) +{ + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = name; + qname.len = len; + qname.hash = full_name_hash(name, len); + + child = d_lookup(dir, &qname); + if (!child) { + struct dentry *new; + new = d_alloc(dir, &qname); + if (new) { + child = instantiate(dir->d_inode, new, task, ptr); + if (child) + dput(new); + else + child = new; + } + } + if (!child || IS_ERR(child) || !child->d_inode) + goto end_instantiate; + inode = child->d_inode; + if (inode) { + ino = inode->i_ino; + type = inode->i_mode >> 12; + } + dput(child); +end_instantiate: + if (!ino) + ino = find_inode_number(dir, &qname); + if (!ino) + ino = 1; + return filldir(dirent, name, len, filp->f_pos, ino, type); +} + +static unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +#define PROC_FDINFO_MAX 64 + +static int proc_fd_info(struct inode *inode, struct path *path, char *info) +{ + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; + struct file *file; + int fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; +} + +static int proc_fd_link(struct inode *inode, struct path *path) +{ + return proc_fd_info(inode, path, NULL); +} + +static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct task_struct *task; + int fd; + struct files_struct *files; + const struct cred *cred; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + d_drop(dentry); + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = +{ + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static struct dentry *proc_fd_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + unsigned fd = *(const unsigned *)ptr; + struct file *file; + struct files_struct *files; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + ei = PROC_I(inode); + ei->fd = fd; + files = get_files_struct(task); + if (!files) + goto out_iput; + inode->i_mode = S_IFLNK; + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
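The pos/flags pair formatted by proc_fd_info() above is exactly what a read of /proc/<pid>/fdinfo/<fd> returns (the consumer, proc_fdinfo_read(), appears a little further down). A small sketch that dumps the record for fd 0, assuming stdin is open:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/self/fdinfo/0", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* Expect two lines: the file position and the open flags in octal,
         * with O_CLOEXEC reflecting the descriptor's close-on-exec bit. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}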
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + goto out_unlock; + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR | S_IXUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR | S_IXUSR; + spin_unlock(&files->file_lock); + put_files_struct(files); + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + ei->op.proc_get_link = proc_fd_link; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + error = NULL; + + out: + return error; +out_unlock: + spin_unlock(&files->file_lock); + put_files_struct(files); +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, &fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_readfd_common(struct file * filp, void * dirent, + filldir_t filldir, instantiate_t instantiate) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + unsigned int fd, ino; + int retval; + struct files_struct * files; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + for (fd = filp->f_pos-2; + fd < files_fdtable(files)->max_fds; + fd++, filp->f_pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (proc_fill_cache(filp, dirent, filldir, + name, len, instantiate, + p, &fd) < 0) { + rcu_read_lock(); + break; + } + rcu_read_lock(); + } + rcu_read_unlock(); + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); +} + +static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[PROC_FDINFO_MAX]; + int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); + if (!err) + err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); + return err; +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = nonseekable_open, + .read = proc_fdinfo_read, + .llseek = no_llseek, +}; + +static const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .readdir = proc_readfd, + .llseek = default_llseek, +}; + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). 
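Each entry under /proc/<pid>/fd is the symlink built by proc_fd_instantiate() above, with its read/write permission bits mirroring how the file was opened. A hypothetical userspace sketch that lists and resolves a process's own descriptors:

#include <stdio.h>
#include <dirent.h>
#include <unistd.h>
#include <limits.h>

int main(void)
{
        DIR *dir = opendir("/proc/self/fd");
        struct dirent *de;
        char link[PATH_MAX], target[PATH_MAX];
        ssize_t len;

        if (!dir) {
                perror("opendir");
                return 1;
        }
        while ((de = readdir(dir)) != NULL) {
                if (de->d_name[0] == '.')
                        continue;       /* skip "." and ".." */
                snprintf(link, sizeof(link), "/proc/self/fd/%s", de->d_name);
                len = readlink(link, target, sizeof(target) - 1);
                if (len < 0)
                        continue;       /* fd may have been closed meanwhile */
                target[len] = '\0';
                printf("fd %s -> %s\n", de->d_name, target);
        }
        closedir(dir);
        return 0;
}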
+ */ +static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) +{ + int rv = generic_permission(inode, mask, flags, NULL); + if (rv == 0) + return 0; + if (task_pid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static struct dentry *proc_fdinfo_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + unsigned fd = *(unsigned *)ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + ei = PROC_I(inode); + ei->fd = fd; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + error = NULL; + + out: + return error; +} + +static struct dentry *proc_lookupfdinfo(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, + proc_fdinfo_instantiate); +} + +static const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .readdir = proc_readfdinfo, + .llseek = default_llseek, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + + +static struct dentry *proc_pident_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + const struct pid_entry *ents, + unsigned int nents) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc// without very good reasons. 
+ */ + last = &ents[nents - 1]; + for (p = ents; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_pident_instantiate(dir, dentry, task, p); +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_pident_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_pident_instantiate, task, p); +} + +static int proc_pident_readdir(struct file *filp, + void *dirent, filldir_t filldir, + const struct pid_entry *ents, unsigned int nents) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct pid_entry *p, *last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= nents) { + ret = 1; + goto out; + } + p = ents + i; + last = &ents[nents - 1]; + while (p <= last) { + if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) + goto out; + filp->f_pos++; + p++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +#ifdef CONFIG_SECURITY +static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *p = NULL; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + length = security_getprocattr(task, + (char*)file->f_path.dentry->d_name.name, + &p); + put_task_struct(task); + if (length > 0) + length = simple_read_from_buffer(buf, count, ppos, p, length); + kfree(p); + return length; +} + +static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + /* No partial writes. 
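proc_pid_attr_read() above simply hands the request to security_getprocattr(), so the files under /proc/<pid>/attr/ only carry data when an LSM such as SELinux or Smack is active; otherwise a read may fail or come back empty. A hedged userspace sketch reading the caller's own context:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char ctx[256];
        int fd = open("/proc/self/attr/current", O_RDONLY);
        ssize_t n;

        if (fd < 0) {
                perror("open");         /* attr dir exists only with CONFIG_SECURITY */
                return 1;
        }
        n = read(fd, ctx, sizeof(ctx) - 1);
        close(fd);
        if (n <= 0)
                return 1;
        ctx[n] = '\0';
        printf("security context: %s\n", ctx);
        return 0;
}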
*/ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free; + + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (length < 0) + goto out_free; + + length = security_setprocattr(task, + (char*)file->f_path.dentry->d_name.name, + (void*)page, count); + mutex_unlock(&task->signal->cred_guard_mutex); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_pid_attr_operations = { + .read = proc_pid_attr_read, + .write = proc_pid_attr_write, + .llseek = generic_file_llseek, +}; + +static const struct pid_entry attr_dir_stuff[] = { + REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("prev", S_IRUGO, proc_pid_attr_operations), + REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), +}; + +static int proc_attr_dir_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .readdir = proc_attr_dir_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +#endif + +#ifdef CONFIG_ELF_CORE +static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%08lx\n", + ((mm->flags & MMF_DUMP_FILTER_MASK) >> + MMF_DUMP_FILTER_SHIFT)); + mmput(mm); + ret = simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t proc_coredump_filter_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + char buffer[PROC_NUMBUF], *end; + unsigned int val; + int ret; + int i; + unsigned long mask; + + ret = -EFAULT; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + goto out_no_task; + + ret = -EINVAL; + val = (unsigned int)simple_strtoul(buffer, &end, 0); + if (*end == '\n') + end++; + if (end - buffer == 0) + goto out_no_task; + + ret = -ESRCH; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + goto out_no_task; + + ret = end - buffer; + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + + for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { + if (val & mask) + set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + else + clear_bit(i + MMF_DUMP_FILTER_SHIFT, 
&mm->flags); + } + + mmput(mm); + out_no_mm: + put_task_struct(task); + out_no_task: + return ret; +} + +static const struct file_operations proc_coredump_filter_operations = { + .read = proc_coredump_filter_read, + .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, +}; +#endif + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +/* + * proc base + * + * These are the directory entries in the root directory of /proc + * that properly belong to the /proc filesystem, as they describe + * describe something that is process related. + */ +static const struct pid_entry proc_base_stuff[] = { + NOD("self", S_IFLNK|S_IRWXUGO, + &proc_self_inode_operations, NULL, {}), +}; + +static struct dentry *proc_base_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error; + + /* Allocate the inode */ + error = ERR_PTR(-ENOMEM); + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* Initialize the inode */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + /* + * grab the reference to the task. 
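The coredump_filter handlers earlier in this hunk expose the MMF_DUMP_* filter bits as a hex mask; writing a new hex value back into the same file changes which mappings end up in a core dump. A read-only sketch, assuming CONFIG_ELF_CORE is enabled (bit meanings are defined by the kernel headers and not restated here):

#include <stdio.h>

int main(void)
{
        unsigned long mask;
        FILE *f = fopen("/proc/self/coredump_filter", "r");

        if (!f) {
                perror("fopen");        /* present only with CONFIG_ELF_CORE */
                return 1;
        }
        if (fscanf(f, "%lx", &mask) == 1)
                printf("coredump_filter mask: 0x%02lx\n", mask);
        fclose(f);
        return 0;
}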
+ */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_iput; + + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; + if (S_ISLNK(inode->i_mode)) + inode->i_size = 64; + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_add(dentry, inode); + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* Lookup the directory entry */ + last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; + for (p = proc_base_stuff; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_base_instantiate(dir, dentry, task, p); + +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_base_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_base_instantiate, task, p); +} + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + struct task_io_accounting acct = task->ioac; + unsigned long flags; + int result; + + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); + } + result = sprintf(buffer, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); +} + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; +} + +/* + * Thread groups + */ +static const struct file_operations proc_task_operations; +static const struct inode_operations proc_task_inode_operations; + +static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, 
proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), + INF("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUGO, proc_pid_syscall), +#endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUGO, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + REG("cpuset", S_IRUGO, proc_cpuset_operations), +#endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), +#endif + INF("oom_score", S_IRUGO, proc_oom_score), + ANDROID("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_ELF_CORE + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUSR, proc_tgid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +}; + +static int proc_tgid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct file_operations proc_tgid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tgid_base_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, + .setattr = 
proc_setattr, +}; + +static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) +{ + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", tgid); + leader = d_hash_and_lookup(mnt->mnt_root, &name); + if (!leader) + goto out; + + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; +} + +/** + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * @task: task that should be flushed. + * + * When flushing dentries from proc, one needs to flush them from global + * proc (proc_mnt) and from all the namespaces' procs this task was seen + * in. This call is supposed to do all of this job. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present flushes it and all of it'ts children + * from the dcache. + * + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is proved to flush those useless + * dcache entries at process exit time. + * + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time it + * just makes it very unlikely that any will persist. 
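Because pid_revalidate() drops stale dentries and this flush runs at exit, a fresh lookup of /proc/<pid> fails once the process is gone. That makes stat() on the directory a rough liveness probe; the sketch below is purely illustrative and races with pid reuse (kill(pid, 0) is the usual robust check):

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        char path[64];
        struct stat st;
        long pid = argc > 1 ? strtol(argv[1], NULL, 10) : 1;

        snprintf(path, sizeof(path), "/proc/%ld", pid);
        /* Succeeds only while the pid still resolves in this /proc instance. */
        printf("%s %s\n", path,
               stat(path, &st) == 0 ? "exists" : "no such process");
        return 0;
}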
+ */ + +void proc_flush_task(struct task_struct *task) +{ + int i; + struct pid *pid, *tgid; + struct upid *upid; + + pid = task_pid(task); + tgid = task_tgid(task); + + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, + tgid->numbers[i].nr); + } + + upid = &pid->numbers[pid->level]; + if (upid->nr == 1) + pid_ns_release_proc(upid->ns); +} + +static struct dentry *proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tgid_base_inode_operations; + inode->i_fop = &proc_tgid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff)); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result; + struct task_struct *task; + unsigned tgid; + struct pid_namespace *ns; + + result = proc_base_lookup(dir, dentry); + if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) + goto out; + + tgid = name_to_int(dentry); + if (tgid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tgid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + + result = proc_pid_instantiate(dir, dentry, task, NULL); + put_task_struct(task); +out: + return result; +} + +/* + * Find the first task with tgid >= tgid + * + */ +struct tgid_iter { + unsigned int tgid; + struct task_struct *task; +}; +static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) +{ + struct pid *pid; + + if (iter.task) + put_task_struct(iter.task); + rcu_read_lock(); +retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { + iter.tgid = pid_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. Testing for task + * being a thread_group_leader is the obvious thing + * todo but there is a window when it fails, due to + * the pid transfer logic in de_thread. + * + * So we perform the straight forward test of seeing + * if the pid we have found is the pid of a thread + * group leader, and don't worry if the task we have + * found doesn't happen to be a thread group leader. + * As we don't care in the case of readdir. 
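The tgid walk completed just below (next_tgid() feeding proc_pid_readdir()) is what produces the numeric entries a readdir of /proc returns, one per visible thread-group leader. A small userspace counterpart, for illustration:

#include <stdio.h>
#include <ctype.h>
#include <dirent.h>

static int all_digits(const char *s)
{
        if (!*s)
                return 0;
        for (; *s; s++)
                if (!isdigit((unsigned char)*s))
                        return 0;
        return 1;
}

int main(void)
{
        DIR *dir = opendir("/proc");
        struct dirent *de;
        unsigned count = 0;

        if (!dir)
                return 1;
        while ((de = readdir(dir)) != NULL)
                if (all_digits(de->d_name))
                        count++;        /* one entry per visible tgid */
        closedir(dir);
        printf("%u processes visible\n", count);
        return 0;
}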
+ */ + if (!iter.task || !has_group_leader_pid(iter.task)) { + iter.tgid += 1; + goto retry; + } + get_task_struct(iter.task); + } + rcu_read_unlock(); + return iter; +} + +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) + +static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct tgid_iter iter) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", iter.tgid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, iter.task, NULL); +} + +/* for the /proc/ directory itself, after non-process stuff has been done */ +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned int nr; + struct task_struct *reaper; + struct tgid_iter iter; + struct pid_namespace *ns; + + if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); + if (!reaper) + goto out_no_task; + + for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { + const struct pid_entry *p = &proc_base_stuff[nr]; + if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) + goto out; + } + + ns = filp->f_dentry->d_sb->s_fs_info; + iter.task = NULL; + iter.tgid = filp->f_pos - TGID_OFFSET; + for (iter = next_tgid(ns, iter); + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { + filp->f_pos = iter.tgid + TGID_OFFSET; + if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + put_task_struct(iter.task); + goto out; + } + } + filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; +out: + put_task_struct(reaper); +out_no_task: + return 0; +} + +/* + * Tasks + */ +static const struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), + INF("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUGO, proc_pid_syscall), +#endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUGO, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + 
REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + REG("cpuset", S_IRUGO, proc_cpuset_operations), +#endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), +#endif + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUSR, proc_tid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +}; + +static int proc_tid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static const struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tid_base_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static struct dentry *proc_task_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + inode = proc_pid_make_inode(dir->i_sb, task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff)); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result = ERR_PTR(-ENOENT); + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + struct pid_namespace *ns; + + if (!leader) + goto out_no_task; + + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (!same_thread_group(leader, task)) + goto out_drop_task; + + result = proc_task_instantiate(dir, dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return result; +} + +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. 
+ */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr, struct pid_namespace *ns) +{ + struct task_struct *pos; + + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid_ns(tid, ns); + if (pos && (pos->group_leader == leader)) + goto found; + } + + /* If nr exceeds the number of threads there is nothing todo */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; + + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; + } + } +found: + get_task_struct(pos); +out: + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; +} + +static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tid) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_task_instantiate, task, NULL); +} + +/* for the /proc/TGID/task/ directories */ +static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *leader = NULL; + struct task_struct *task; + int retval = -ENOENT; + ino_t ino; + int tid; + struct pid_namespace *ns; + + task = get_proc_task(inode); + if (!task) + goto out_no_task; + rcu_read_lock(); + if (pid_alive(task)) { + leader = task->group_leader; + get_task_struct(leader); + } + rcu_read_unlock(); + put_task_struct(task); + if (!leader) + goto out_no_task; + retval = 0; + + switch ((unsigned long)filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + /* fall through */ + } + + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. 
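The readdir paths above are exactly what a userspace process lister drives: proc_pid_readdir() serves the numeric entries of /proc, and proc_task_readdir() the per-thread entries under /proc/<pid>/task. A minimal userspace sketch of such a consumer follows; it is illustrative only, not part of this patch, and the demo file name is hypothetical.

/* demo_proclist.c -- hypothetical consumer of the readdir paths above:
 * numeric /proc entries are thread group leaders, /proc/self/task/ holds
 * one entry per thread of the caller. Build with: gcc -pthread demo_proclist.c */
#include <ctype.h>
#include <dirent.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void list_numeric(const char *path, const char *what)
{
        DIR *d = opendir(path);
        struct dirent *de;

        if (!d)
                return;
        while ((de = readdir(d)) != NULL)
                if (isdigit((unsigned char)de->d_name[0]))
                        printf("%s %s\n", what, de->d_name);
        closedir(d);
}

static void *idle_thread(void *arg)
{
        sleep(1);               /* keep the extra tid visible while we list it */
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, idle_thread, NULL);
        list_numeric("/proc", "tgid");          /* served by proc_pid_readdir() */
        list_numeric("/proc/self/task", "tid"); /* served by proc_task_readdir() */
        pthread_join(t, NULL);
        return 0;
}

A task can disappear between the readdir() and any later open() of its directory; that is the race the pid_revalidate() dentry hook above is there to close.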
+ */ + ns = filp->f_dentry->d_sb->s_fs_info; + tid = (int)filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); + task; + task = next_tid(task), filp->f_pos++) { + tid = task_pid_nr_ns(task, ns); + if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = (u64)tid; + put_task_struct(task); + break; + } + } +out: + put_task_struct(leader); +out_no_task: + return retval; +} + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + stat->nlink += get_nr_threads(p); + put_task_struct(p); + } + + return 0; +} + +static const struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, +}; + +static const struct file_operations proc_task_operations = { + .read = generic_read_dir, + .readdir = proc_task_readdir, + .llseek = default_llseek, +}; diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c new file mode 100644 index 00000000..82676e3f --- /dev/null +++ b/fs/proc/cmdline.c @@ -0,0 +1,29 @@ +#include +#include +#include +#include + +static int cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", saved_command_line); + return 0; +} + +static int cmdline_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmdline_proc_show, NULL); +} + +static const struct file_operations cmdline_proc_fops = { + .open = cmdline_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_cmdline_init(void) +{ + proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + return 0; +} +module_init(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c new file mode 100644 index 00000000..b701eaa4 --- /dev/null +++ b/fs/proc/consoles.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include +#include +#include +#include +#include + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + int len; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_printf(m, "%s%d%n", con->name, con->index, &len); + len = 21 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 
'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_consoles_init(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +module_init(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c new file mode 100644 index 00000000..5a1e539a --- /dev/null +++ b/fs/proc/cpuinfo.c @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +extern const struct seq_operations cpuinfo_op; +static int cpuinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuinfo_op); +} + +static const struct file_operations proc_cpuinfo_operations = { + .open = cpuinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_cpuinfo_init(void) +{ + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + return 0; +} +module_init(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c new file mode 100644 index 00000000..b1434716 --- /dev/null +++ b/fs/proc/devices.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +static int devinfo_show(struct seq_file *f, void *v) +{ + int i = *(loff_t *) v; + + if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i == 0) + seq_puts(f, "Character devices:\n"); + chrdev_show(f, i); + } +#ifdef CONFIG_BLOCK + else { + i -= CHRDEV_MAJOR_HASH_SIZE; + if (i == 0) + seq_puts(f, "\nBlock devices:\n"); + blkdev_show(f, i); + } +#endif + return 0; +} + +static void *devinfo_start(struct seq_file *f, loff_t *pos) +{ + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; +} + +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return NULL; + return pos; +} + +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show +}; + +static int devinfo_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &devinfo_ops); +} + +static const struct file_operations proc_devinfo_operations = { + .open = devinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_devices_init(void) +{ + proc_create("devices", 0, NULL, &proc_devinfo_operations); + return 0; +} +module_init(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c new file mode 100644 index 00000000..f1637f17 --- /dev/null +++ b/fs/proc/generic.c @@ -0,0 +1,853 @@ +/* + * proc/fs/generic.c --- generic routines for 
the proc-fs + * + * This file contains generic proc-fs routines for handling + * directories and files. + * + * Copyright (C) 1991, 1992 Linus Torvalds. + * Copyright (C) 1997 Theodore Ts'o + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +DEFINE_SPINLOCK(proc_subdir_lock); + +static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +{ + if (de->namelen != len) + return 0; + return !memcmp(name, de->name, len); +} + +/* buffer size is one page but our output routines use some slack for overruns */ +#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) + +static ssize_t +__proc_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page; + ssize_t retval=0; + int eof=0; + ssize_t n, count; + char *start; + struct proc_dir_entry * dp; + unsigned long long pos; + + /* + * Gaah, please just use "seq_file" instead. The legacy /proc + * interfaces cut loff_t down to off_t for reads, and ignore + * the offset entirely for writes.. + */ + pos = *ppos; + if (pos > MAX_NON_LFS) + return 0; + if (nbytes > MAX_NON_LFS - pos) + nbytes = MAX_NON_LFS - pos; + + dp = PDE(inode); + if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) + return -ENOMEM; + + while ((nbytes > 0) && !eof) { + count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); + + start = NULL; + if (dp->read_proc) { + /* + * How to be a proc read function + * ------------------------------ + * Prototype: + * int f(char *buffer, char **start, off_t offset, + * int count, int *peof, void *dat) + * + * Assume that the buffer is "count" bytes in size. + * + * If you know you have supplied all the data you + * have, set *peof. + * + * You have three ways to return data: + * 0) Leave *start = NULL. (This is the default.) + * Put the data of the requested offset at that + * offset within the buffer. Return the number (n) + * of bytes there are from the beginning of the + * buffer up to the last byte of data. If the + * number of supplied bytes (= n - offset) is + * greater than zero and you didn't signal eof + * and the reader is prepared to take more data + * you will be called again with the requested + * offset advanced by the number of bytes + * absorbed. This interface is useful for files + * no larger than the buffer. + * 1) Set *start = an unsigned long value less than + * the buffer address but greater than zero. + * Put the data of the requested offset at the + * beginning of the buffer. Return the number of + * bytes of data placed there. If this number is + * greater than zero and you didn't signal eof + * and the reader is prepared to take more data + * you will be called again with the requested + * offset advanced by *start. This interface is + * useful when you have a large file consisting + * of a series of blocks which you want to count + * and return as wholes. + * (Hack by Paul.Russell@rustcorp.com.au) + * 2) Set *start = an address within the buffer. + * Put the data of the requested offset at *start. + * Return the number of bytes of data placed there. + * If this number is greater than zero and you + * didn't signal eof and the reader is prepared to + * take more data you will be called again with the + * requested offset advanced by the number of bytes + * absorbed. 
+ */ + n = dp->read_proc(page, &start, *ppos, + count, &eof, dp->data); + } else + break; + + if (n == 0) /* end of file */ + break; + if (n < 0) { /* error */ + if (retval == 0) + retval = n; + break; + } + + if (start == NULL) { + if (n > PAGE_SIZE) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE; + } + n -= *ppos; + if (n <= 0) + break; + if (n > count) + n = count; + start = page + *ppos; + } else if (start < page) { + if (n > PAGE_SIZE) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE; + } + if (n > count) { + /* + * Don't reduce n because doing so might + * cut off part of a data block. + */ + printk(KERN_WARNING + "proc_file_read: Read count exceeded\n"); + } + } else /* start >= page */ { + unsigned long startoff = (unsigned long)(start - page); + if (n > (PAGE_SIZE - startoff)) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE - startoff; + } + if (n > count) + n = count; + } + + n -= copy_to_user(buf, start < page ? page : start, n); + if (n == 0) { + if (retval == 0) + retval = -EFAULT; + break; + } + + *ppos += start < page ? (unsigned long)start : n; + nbytes -= n; + buf += n; + retval += n; + } + free_page((unsigned long) page); + return retval; +} + +static ssize_t +proc_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + spin_unlock(&pde->pde_unload_lock); + + rv = __proc_file_read(file, buf, nbytes, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t +proc_file_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + + if (pde->write_proc) { + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + spin_unlock(&pde->pde_unload_lock); + + /* FIXME: does this routine need ppos? probably... 
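The "How to be a proc read function" comment above documents the legacy ->read_proc contract that __proc_file_read() drives. A hedged out-of-tree sketch of "method 0" (data written from the start of the one-page buffer, *start left NULL, *peof raised once everything has been supplied) might look like the following; the demo_* names are invented here and the module is not part of this patch.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/string.h>

static char demo_payload[] = "hello from a legacy read_proc handler\n";

/* "Method 0" from the comment above: the whole output fits in one page,
 * so it is written from the beginning of the buffer, the total length is
 * returned and *peof is set; __proc_file_read() does the offsetting. */
static int demo_read_proc(char *page, char **start, off_t off,
                          int count, int *peof, void *data)
{
        int len = sprintf(page, "%s", (char *)data);

        *peof = 1;
        return len;
}

static int __init demo_legacy_init(void)
{
        struct proc_dir_entry *e;

        e = create_proc_entry("demo_legacy", 0444, NULL);
        if (!e)
                return -ENOMEM;
        e->read_proc = demo_read_proc;
        e->data = demo_payload;
        return 0;
}

static void __exit demo_legacy_exit(void)
{
        remove_proc_entry("demo_legacy", NULL);
}

module_init(demo_legacy_init);
module_exit(demo_legacy_exit);
MODULE_LICENSE("GPL");

Note that the comment itself steers new code toward seq_file instead; the sketches further below use that interface.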
*/ + rv = pde->write_proc(file, buffer, count, pde->data); + pde_users_dec(pde); + } + return rv; +} + + +static loff_t +proc_file_lseek(struct file *file, loff_t offset, int orig) +{ + loff_t retval = -EINVAL; + switch (orig) { + case 1: + offset += file->f_pos; + /* fallthrough */ + case 0: + if (offset < 0 || offset > MAX_NON_LFS) + break; + file->f_pos = retval = offset; + } + return retval; +} + +static const struct file_operations proc_file_operations = { + .llseek = proc_file_lseek, + .read = proc_file_read, + .write = proc_file_write, +}; + +static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct proc_dir_entry *de = PDE(inode); + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, iattr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + de->uid = inode->i_uid; + de->gid = inode->i_gid; + de->mode = inode->i_mode; + return 0; +} + +static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct proc_dir_entry *de = PROC_I(inode)->pde; + if (de && de->nlink) + inode->i_nlink = de->nlink; + + generic_fillattr(inode, stat); + return 0; +} + +static const struct inode_operations proc_file_inode_operations = { + .setattr = proc_notify_change, +}; + +/* + * This function parses a name such as "tty/driver/serial", and + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. + */ +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + const char *cp = name, *next; + struct proc_dir_entry *de; + unsigned int len; + + de = *ret; + if (!de) + de = &proc_root; + + while (1) { + next = strchr(cp, '/'); + if (!next) + break; + + len = next - cp; + for (de = de->subdir; de ; de = de->next) { + if (proc_match(len, cp, de)) + break; + } + if (!de) { + WARN(1, "name '%s'\n", name); + return -ENOENT; + } + cp += len + 1; + } + *residual = cp; + *ret = de; + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); + spin_unlock(&proc_subdir_lock); + return rv; +} + +static DEFINE_IDA(proc_inum_ida); +static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ + +#define PROC_DYNAMIC_FIRST 0xF0000000U + +/* + * Return an inode number between PROC_DYNAMIC_FIRST and + * 0xffffffff, or zero on failure. 
+ */ +static unsigned int get_inode_number(void) +{ + unsigned int i; + int error; + +retry: + if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) + return 0; + + spin_lock(&proc_inum_lock); + error = ida_get_new(&proc_inum_ida, &i); + spin_unlock(&proc_inum_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + return 0; + + if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, i); + spin_unlock(&proc_inum_lock); + return 0; + } + return PROC_DYNAMIC_FIRST + i; +} + +static void release_inode_number(unsigned int inum) +{ + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + spin_unlock(&proc_inum_lock); +} + +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, PDE(dentry->d_inode)->data); + return NULL; +} + +static const struct inode_operations proc_link_inode_operations = { + .readlink = generic_readlink, + .follow_link = proc_follow_link, +}; + +/* + * As some entries in /proc are volatile, we want to + * get rid of unused dentries. This could be made + * smarter: we could keep a "volatile" flag in the + * inode to indicate which ones to keep. + */ +static int proc_delete_dentry(const struct dentry * dentry) +{ + return 1; +} + +static const struct dentry_operations proc_dentry_operations = +{ + .d_delete = proc_delete_dentry, +}; + +/* + * Don't create negative dentries here, return -ENOENT by hand + * instead. + */ +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = NULL; + int error = -ENOENT; + + spin_lock(&proc_subdir_lock); + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, de); + goto out_unlock; + } + } + spin_unlock(&proc_subdir_lock); +out_unlock: + + if (inode) { + d_set_d_op(dentry, &proc_dentry_operations); + d_add(dentry, inode); + return NULL; + } + if (de) + pde_put(de); + return ERR_PTR(error); +} + +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookup_de(PDE(dir), dir, dentry); +} + +/* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should + * continue with the entries.. + * + * Note that the VFS-layer doesn't care about the return + * value of the readdir() call, as long as it's non-negative + * for success.. 
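get_inode_number() above uses the ida_pre_get()/ida_get_new() retry idiom of this kernel generation. The same idiom, lifted out as a hedged standalone sketch for any driver that needs small unique IDs; the demo_* names are invented here and this is not part of the patch.

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_IDA(demo_ida);
static DEFINE_SPINLOCK(demo_ida_lock);  /* protects demo_ida, like proc_inum_lock */

static int demo_alloc_id(void)
{
        int id, err;

retry:
        /* preload outside the lock; may sleep */
        if (!ida_pre_get(&demo_ida, GFP_KERNEL))
                return -ENOMEM;
        spin_lock(&demo_ida_lock);
        err = ida_get_new(&demo_ida, &id);
        spin_unlock(&demo_ida_lock);
        if (err == -EAGAIN)     /* preloaded node taken by a racer: retry */
                goto retry;
        return err ? err : id;
}

static void demo_free_id(int id)
{
        spin_lock(&demo_ida_lock);
        ida_remove(&demo_ida, id);
        spin_unlock(&demo_ida_lock);
}

static int __init demo_ida_init(void)
{
        int id = demo_alloc_id();

        if (id < 0)
                return id;
        pr_info("demo_ida: got id %d\n", id);
        demo_free_id(id);
        return 0;
}

static void __exit demo_ida_exit(void)
{
}

module_init(demo_ida_init);
module_exit(demo_ida_exit);
MODULE_LICENSE("GPL");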
+ */ +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + filldir_t filldir) +{ + unsigned int ino; + int i; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + + ino = inode->i_ino; + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, + parent_ino(filp->f_path.dentry), + DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + spin_lock(&proc_subdir_lock); + de = de->subdir; + i -= 2; + for (;;) { + if (!de) { + ret = 1; + spin_unlock(&proc_subdir_lock); + goto out; + } + if (!i) + break; + de = de->next; + i--; + } + + do { + struct proc_dir_entry *next; + + /* filldir passes info to user space */ + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, filp->f_pos, + de->low_ino, de->mode >> 12) < 0) { + pde_put(de); + goto out; + } + spin_lock(&proc_subdir_lock); + filp->f_pos++; + next = de->next; + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + } + ret = 1; +out: + return ret; +} + +int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return proc_readdir_de(PDE(inode), filp, dirent, filldir); +} + +/* + * These are the generic /proc directory operations. They + * use the in-memory "struct proc_dir_entry" tree to parse + * the /proc directory. + */ +static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = proc_readdir, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, + .getattr = proc_getattr, + .setattr = proc_notify_change, +}; + +static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +{ + unsigned int i; + struct proc_dir_entry *tmp; + + i = get_inode_number(); + if (i == 0) + return -EAGAIN; + dp->low_ino = i; + + if (S_ISDIR(dp->mode)) { + if (dp->proc_iops == NULL) { + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; + } + dir->nlink++; + } else if (S_ISLNK(dp->mode)) { + if (dp->proc_iops == NULL) + dp->proc_iops = &proc_link_inode_operations; + } else if (S_ISREG(dp->mode)) { + if (dp->proc_fops == NULL) + dp->proc_fops = &proc_file_operations; + if (dp->proc_iops == NULL) + dp->proc_iops = &proc_file_inode_operations; + } + + spin_lock(&proc_subdir_lock); + + for (tmp = dir->subdir; tmp; tmp = tmp->next) + if (strcmp(tmp->name, dp->name) == 0) { + WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); + break; + } + + dp->next = dir->subdir; + dp->parent = dir; + dir->subdir = dp; + spin_unlock(&proc_subdir_lock); + + return 0; +} + +static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, + const char *name, + mode_t mode, + nlink_t nlink) +{ + struct proc_dir_entry *ent = NULL; + const char *fn = name; + unsigned int len; + + /* make sure name is valid */ + if (!name || !strlen(name)) goto out; + + if (xlate_proc_name(name, parent, &fn) != 0) + goto out; + + /* At this point there must not be any '/' characters beyond *fn */ + if (strchr(fn, '/')) + goto out; + + len = strlen(fn); + + ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + if (!ent) goto out; + + memset(ent, 0, sizeof(struct proc_dir_entry)); + 
memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); + ent->name = ((char *) ent) + sizeof(*ent); + ent->namelen = len; + ent->mode = mode; + ent->nlink = nlink; + atomic_set(&ent->count, 1); + ent->pde_users = 0; + spin_lock_init(&ent->pde_unload_lock); + ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); + out: + return ent; +} + +struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent, const char *dest) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, + (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); + + if (ent) { + ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + if (ent->data) { + strcpy((char*)ent->data,dest); + if (proc_register(parent, ent) < 0) { + kfree(ent->data); + kfree(ent); + ent = NULL; + } + } else { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_symlink); + +struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); + if (ent) { + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_mkdir_mode); + +struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); + if (ent) { + ent->data = net; + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL_GPL(proc_net_mkdir); + +struct proc_dir_entry *proc_mkdir(const char *name, + struct proc_dir_entry *parent) +{ + return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); +} +EXPORT_SYMBOL(proc_mkdir); + +struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + nlink_t nlink; + + if (S_ISDIR(mode)) { + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO | S_IXUGO; + nlink = 2; + } else { + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + nlink = 1; + } + + ent = __proc_create(&parent, name, mode, nlink); + if (ent) { + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(create_proc_entry); + +struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, + void *data) +{ + struct proc_dir_entry *pde; + nlink_t nlink; + + if (S_ISDIR(mode)) { + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO | S_IXUGO; + nlink = 2; + } else { + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + nlink = 1; + } + + pde = __proc_create(&parent, name, mode, nlink); + if (!pde) + goto out; + pde->proc_fops = proc_fops; + pde->data = data; + if (proc_register(parent, pde) < 0) + goto out_free; + return pde; +out_free: + kfree(pde); +out: + return NULL; +} +EXPORT_SYMBOL(proc_create_data); + +static void free_proc_entry(struct proc_dir_entry *de) +{ + release_inode_number(de->low_ino); + + if (S_ISLNK(de->mode)) + kfree(de->data); + kfree(de); +} + +void pde_put(struct proc_dir_entry *pde) +{ + if (atomic_dec_and_test(&pde->count)) + free_proc_entry(pde); +} + +/* + * Remove a /proc entry and free it if it's not currently in use. 
+ */ +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *de = NULL; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + de = *p; + *p = de->next; + de->next = NULL; + break; + } + } + spin_unlock(&proc_subdir_lock); + if (!de) { + WARN(1, "name '%s'\n", name); + return; + } + + spin_lock(&de->pde_unload_lock); + /* + * Stop accepting new callers into module. If you're + * dynamically allocating ->proc_fops, save a pointer somewhere. + */ + de->proc_fops = NULL; + /* Wait until all existing callers into module are done. */ + if (de->pde_users > 0) { + DECLARE_COMPLETION_ONSTACK(c); + + if (!de->pde_unload_completion) + de->pde_unload_completion = &c; + + spin_unlock(&de->pde_unload_lock); + + wait_for_completion(de->pde_unload_completion); + + spin_lock(&de->pde_unload_lock); + } + + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + list_del(&pdeo->lh); + spin_unlock(&de->pde_unload_lock); + pdeo->release(pdeo->inode, pdeo->file); + kfree(pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); + + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); + pde_put(de); +} +EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/inode.c b/fs/proc/inode.c new file mode 100644 index 00000000..74b48cfa --- /dev/null +++ b/fs/proc/inode.c @@ -0,0 +1,497 @@ +/* + * linux/fs/proc/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +static void proc_evict_inode(struct inode *inode) +{ + struct proc_dir_entry *de; + struct ctl_table_header *head; + const struct proc_ns_operations *ns_ops; + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); + + /* Let go of any associated proc directory entry */ + de = PROC_I(inode)->pde; + if (de) + pde_put(de); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } + /* Release any associated namespace */ + ns_ops = PROC_I(inode)->ns_ops; + if (ns_ops && ns_ops->put) + ns_ops->put(PROC_I(inode)->ns); +} + +static struct kmem_cache * proc_inode_cachep; + +static struct inode *proc_alloc_inode(struct super_block *sb) +{ + struct proc_inode *ei; + struct inode *inode; + + ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->pid = NULL; + ei->fd = 0; + ei->op.proc_get_link = NULL; + ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; + ei->ns = NULL; + ei->ns_ops = NULL; + inode = &ei->vfs_inode; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static void proc_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + 
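The generic.c API added above (proc_mkdir(), proc_create_data(), proc_symlink(), remove_proc_entry()) is typically consumed from a driver roughly as in the sketch below. This is a hedged out-of-tree example, not part of the patch: the demo_* names and the /proc/demo_dir path are hypothetical, and it leans on the seq_file single_open() helper rather than the legacy read_proc path shown earlier.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static struct proc_dir_entry *demo_dir;
static unsigned long demo_counter = 42;         /* illustrative payload */

static int demo_show(struct seq_file *m, void *v)
{
        unsigned long *val = m->private;        /* handed over by single_open() */

        seq_printf(m, "%lu\n", *val);
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        /* PDE(inode)->data is whatever proc_create_data() stored */
        return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init demo_dir_init(void)
{
        demo_dir = proc_mkdir("demo_dir", NULL);
        if (!demo_dir)
                return -ENOMEM;
        if (!proc_create_data("counter", 0444, demo_dir, &demo_fops,
                              &demo_counter)) {
                remove_proc_entry("demo_dir", NULL);
                return -ENOMEM;
        }
        /* proc_symlink() stores its target string in ->data, as seen above;
         * return value ignored here for brevity */
        proc_symlink("counter_link", demo_dir, "counter");
        return 0;
}

static void __exit demo_dir_exit(void)
{
        /* children must go before the directory: remove_proc_entry() above
         * warns about (and leaks) non-empty directories */
        remove_proc_entry("counter_link", demo_dir);
        remove_proc_entry("counter", demo_dir);
        remove_proc_entry("demo_dir", NULL);
}

module_init(demo_dir_init);
module_exit(demo_dir_exit);
MODULE_LICENSE("GPL");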
kmem_cache_free(proc_inode_cachep, PROC_I(inode)); +} + +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + +static void init_once(void *foo) +{ + struct proc_inode *ei = (struct proc_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void __init proc_init_inodecache(void) +{ + proc_inode_cachep = kmem_cache_create("proc_inode_cache", + sizeof(struct proc_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); +} + +static const struct super_operations proc_sops = { + .alloc_inode = proc_alloc_inode, + .destroy_inode = proc_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = proc_evict_inode, + .statfs = simple_statfs, +}; + +static void __pde_users_dec(struct proc_dir_entry *pde) +{ + pde->pde_users--; + if (pde->pde_unload_completion && pde->pde_users == 0) + complete(pde->pde_unload_completion); +} + +void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); +} + +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + loff_t rv = -EINVAL; + loff_t (*llseek)(struct file *, loff_t, int); + + spin_lock(&pde->pde_unload_lock); + /* + * remove_proc_entry() is going to delete PDE (as part of module + * cleanup sequence). No new callers into module allowed. + */ + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + /* + * Bump refcount so that remove_proc_entry will wail for ->llseek to + * complete. + */ + pde->pde_users++; + /* + * Save function pointer under lock, to protect against ->proc_fops + * NULL'ifying right after ->pde_unload_lock is dropped. 
+ */ + llseek = pde->proc_fops->llseek; + spin_unlock(&pde->pde_unload_lock); + + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + read = pde->proc_fops->read; + spin_unlock(&pde->pde_unload_lock); + + if (read) + rv = read(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + write = pde->proc_fops->write; + spin_unlock(&pde->pde_unload_lock); + + if (write) + rv = write(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + unsigned int rv = DEFAULT_POLLMASK; + unsigned int (*poll)(struct file *, struct poll_table_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + poll = pde->proc_fops->poll; + spin_unlock(&pde->pde_unload_lock); + + if (poll) + rv = poll(file, pts); + + pde_users_dec(pde); + return rv; +} + +static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + ioctl = pde->proc_fops->unlocked_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (ioctl) + rv = ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} + +#ifdef CONFIG_COMPAT +static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + compat_ioctl = pde->proc_fops->compat_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} +#endif + +static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + int rv = -EIO; + int (*mmap)(struct file *, struct vm_area_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + mmap = pde->proc_fops->mmap; + spin_unlock(&pde->pde_unload_lock); + + if (mmap) + rv = mmap(file, vma); + + pde_users_dec(pde); + return rv; +} + +static int proc_reg_open(struct 
inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + kfree(pdeo); + return -EINVAL; + } + pde->pde_users++; + open = pde->proc_fops->open; + release = pde->proc_fops->release; + spin_unlock(&pde->pde_unload_lock); + + if (open) + rv = open(inode, file); + + spin_lock(&pde->pde_unload_lock); + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->inode = inode; + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->release = release; + list_add(&pdeo->lh, &pde->pde_openers); + } else + kfree(pdeo); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); + return rv; +} + +static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, + struct inode *inode, struct file *file) +{ + struct pde_opener *pdeo; + + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->inode == inode && pdeo->file == file) + return pdeo; + } + return NULL; +} + +static int proc_reg_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + spin_lock(&pde->pde_unload_lock); + pdeo = find_pde_opener(pde, inode, file); + if (!pde->proc_fops) { + /* + * Can't simply exit, __fput() will think that everything is OK, + * and move on to freeing struct file. remove_proc_entry() will + * find slacker in opener's list and will try to do non-trivial + * things with struct file. Therefore, remove opener from list. + * + * But if opener is removed from list, who will ->release it? 
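The comments above describe the unload problem the pde_users / pde_unload_completion machinery solves: stop new callers at the gate, then wait for the callers already inside to drain. The same protocol restated as a hedged userspace analogy, using pthreads in place of spinlocks and completions; every name here is invented and this is not part of the patch.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int gate_open = 1;       /* analogue of pde->proc_fops != NULL */
static int users;               /* analogue of pde->pde_users */

static int enter(void)
{
        int ok;

        pthread_mutex_lock(&lock);
        ok = gate_open;
        if (ok)
                users++;        /* caller is now "inside the module" */
        pthread_mutex_unlock(&lock);
        return ok;
}

static void leave(void)
{
        pthread_mutex_lock(&lock);
        if (--users == 0)
                pthread_cond_signal(&drained);  /* analogue of complete() */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        if (enter()) {
                usleep(1000);   /* stand-in for ->read()/->write() work */
                leave();
        }
        return NULL;
}

int main(void)
{
        pthread_t t[8];
        int i;

        for (i = 0; i < 8; i++)
                pthread_create(&t[i], NULL, worker, NULL);

        /* analogue of remove_proc_entry(): close the gate, wait for users */
        pthread_mutex_lock(&lock);
        gate_open = 0;
        while (users > 0)
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
        puts("all callers drained, safe to tear down");

        for (i = 0; i < 8; i++)
                pthread_join(t[i], NULL);
        return 0;
}

The kernel version uses a per-entry use count and a completion rather than a condition variable, but the ordering is the same: flip the gate under the lock first, then sleep until the count drains to zero.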
+ */ + if (pdeo) { + list_del(&pdeo->lh); + spin_unlock(&pde->pde_unload_lock); + rv = pdeo->release(inode, file); + kfree(pdeo); + } else + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + release = pde->proc_fops->release; + if (pdeo) { + list_del(&pdeo->lh); + kfree(pdeo); + } + spin_unlock(&pde->pde_unload_lock); + + if (release) + rv = release(inode, file); + + pde_users_dec(pde); + return rv; +} + +static const struct file_operations proc_reg_file_ops = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_reg_compat_ioctl, +#endif + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +#ifdef CONFIG_COMPAT +static const struct file_operations proc_reg_file_ops_no_compat = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; +#endif + +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) +{ + struct inode * inode; + + inode = iget_locked(sb, de->low_ino); + if (!inode) + return NULL; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + PROC_I(inode)->fd = 0; + PROC_I(inode)->pde = de; + + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + inode->i_nlink = de->nlink; + if (de->proc_iops) + inode->i_op = de->proc_iops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_COMPAT + if (!de->proc_fops->compat_ioctl) + inode->i_fop = + &proc_reg_file_ops_no_compat; + else +#endif + inode->i_fop = &proc_reg_file_ops; + } else { + inode->i_fop = de->proc_fops; + } + } + unlock_new_inode(inode); + } else + pde_put(de); + return inode; +} + +int proc_fill_super(struct super_block *s) +{ + struct inode * root_inode; + + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) + goto out_no_root; + root_inode->i_uid = 0; + root_inode->i_gid = 0; + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_no_root; + return 0; + +out_no_root: + printk("proc_read_super: get root inode failed\n"); + iput(root_inode); + pde_put(&proc_root); + return -ENOMEM; +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h new file mode 100644 index 00000000..7838e5cf --- /dev/null +++ b/fs/proc/internal.h @@ -0,0 +1,147 @@ +/* internal.h: internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include + +extern struct proc_dir_entry proc_root; +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +#else +static inline void proc_sys_init(void) { } +#endif +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +extern struct mm_struct *mm_for_maps(struct task_struct *); + +#ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +extern void get_vmalloc_info(struct vmalloc_info *vmi); +#else + +#define VMALLOC_TOTAL 0UL +#define get_vmalloc_info(vmi) \ +do { \ + (vmi)->used = 0; \ + (vmi)->largest_chunk = 0; \ +} while(0) +#endif + +extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); + +extern const struct file_operations proc_maps_operations; +extern const struct file_operations proc_numa_maps_operations; +extern const struct file_operations proc_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +}; + +void proc_init_inodecache(void); + +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, + struct dentry *dentry); +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + filldir_t filldir); + +struct pde_opener { + struct inode *inode; + struct file *file; + int (*release)(struct inode *, struct file *); + struct list_head lh; +}; +void pde_users_dec(struct proc_dir_entry *pde); + +extern spinlock_t proc_subdir_lock; + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, unsigned long *, unsigned long *); +void task_mem(struct seq_file *, struct mm_struct *); + +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + atomic_inc(&pde->count); + return pde; +} +void pde_put(struct proc_dir_entry *pde); + +int proc_fill_super(struct super_block *); +struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/ subdirectories. 
+ */ +int proc_readdir(struct file *, void *, filldir_t); +struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); + + + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr); +int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); +extern const struct dentry_operations pid_dentry_operations; +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int proc_setattr(struct dentry *dentry, struct iattr *attr); + +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c new file mode 100644 index 00000000..05029c0e --- /dev/null +++ b/fs/proc/interrupts.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +/* + * /proc/interrupts + */ +static void *int_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos <= nr_irqs) ? pos : NULL; +} + +static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos > nr_irqs) + return NULL; + return pos; +} + +static void int_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations int_seq_ops = { + .start = int_seq_start, + .next = int_seq_next, + .stop = int_seq_stop, + .show = show_interrupts +}; + +static int interrupts_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &int_seq_ops); +} + +static const struct file_operations proc_interrupts_operations = { + .open = interrupts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_interrupts_init(void) +{ + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + return 0; +} +module_init(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c new file mode 100644 index 00000000..d245cb23 --- /dev/null +++ b/fs/proc/kcore.c @@ -0,0 +1,635 @@ +/* + * fs/proc/kcore.c kernel ELF core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * ELF version written by David Howells + * Modified and incorporated into 2.3.x by Tigran Aivazian + * Support to dump vmalloc'd areas (ELF only), Tigran Aivazian + * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CORE_STR "CORE" + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +static struct proc_dir_entry *proc_root_kcore; + + +#ifndef kc_vaddr_to_offset +#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) +#endif +#ifndef kc_offset_to_vaddr +#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) +#endif + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static LIST_HEAD(kclist_head); +static DEFINE_RWLOCK(kclist_lock); +static int kcore_need_update = 1; + +void +kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +{ + new->addr = (unsigned long)addr; + new->size = size; + new->type = type; + + 
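consoles.c, devices.c and interrupts.c above all follow the same pattern: a seq_operations iterator wired to a /proc entry through proc_create(). A hedged minimal out-of-tree sketch of that pattern, exposing a static table as /proc/demo_items; the demo_* names are invented here and this is not part of the patch.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *demo_items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        return *pos < ARRAY_SIZE(demo_items) ? &demo_items[*pos] : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return *pos < ARRAY_SIZE(demo_items) ? &demo_items[*pos] : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
        /* nothing to unlock in this sketch */
}

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", *(const char **)v);
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start = demo_start,
        .next  = demo_next,
        .stop  = demo_stop,
        .show  = demo_show,
};

static int demo_proc_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_proc_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_proc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static int __init demo_items_init(void)
{
        if (!proc_create("demo_items", 0, NULL, &demo_proc_fops))
                return -ENOMEM;
        return 0;
}

static void __exit demo_items_exit(void)
{
        remove_proc_entry("demo_items", NULL);
}

module_init(demo_items_init);
module_exit(demo_items_exit);
MODULE_LICENSE("GPL");

The start/next callbacks return a cursor into the table and stop undoes whatever start locked (here nothing); /proc/consoles uses exactly this shape with console_lock()/console_unlock() in start/stop.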
write_lock(&kclist_lock); + list_add_tail(&new->list, &kclist_head); + write_unlock(&kclist_lock); +} + +static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +{ + size_t try, size; + struct kcore_list *m; + + *nphdr = 1; /* PT_NOTE */ + size = 0; + + list_for_each_entry(m, &kclist_head, list) { + try = kc_vaddr_to_offset((size_t)m->addr + m->size); + if (try > size) + size = try; + *nphdr = *nphdr + 1; + } + *elf_buflen = sizeof(struct elfhdr) + + (*nphdr + 2)*sizeof(struct elf_phdr) + + 3 * ((sizeof(struct elf_note)) + + roundup(sizeof(CORE_STR), 4)) + + roundup(sizeof(struct elf_prstatus), 4) + + roundup(sizeof(struct elf_prpsinfo), 4) + + roundup(sizeof(struct task_struct), 4); + *elf_buflen = PAGE_ALIGN(*elf_buflen); + return size + *elf_buflen; +} + +static void free_kclist_ents(struct list_head *head) +{ + struct kcore_list *tmp, *pos; + + list_for_each_entry_safe(pos, tmp, head, list) { + list_del(&pos->list); + kfree(pos); + } +} +/* + * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. + */ +static void __kcore_update_ram(struct list_head *list) +{ + int nphdr; + size_t size; + struct kcore_list *tmp, *pos; + LIST_HEAD(garbage); + + write_lock(&kclist_lock); + if (kcore_need_update) { + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM + || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(list, &kclist_head); + } else + list_splice(list, &garbage); + kcore_need_update = 0; + proc_root_kcore->size = get_kcore_size(&nphdr, &size); + write_unlock(&kclist_lock); + + free_kclist_ents(&garbage); +} + + +#ifdef CONFIG_HIGHMEM +/* + * If no highmem, we can assume [0...max_low_pfn) continuous range of memory + * because memory hole is not as big as !HIGHMEM case. + * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) 
+ */ +static int kcore_update_ram(void) +{ + LIST_HEAD(head); + struct kcore_list *ent; + int ret = 0; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va(0); + ent->size = max_low_pfn << PAGE_SHIFT; + ent->type = KCORE_RAM; + list_add(&ent->list, &head); + __kcore_update_ram(&head); + return ret; +} + +#else /* !CONFIG_HIGHMEM */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* calculate vmemmap's address from given system ram pfn and register it */ +int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; + unsigned long nr_pages = ent->size >> PAGE_SHIFT; + unsigned long start, end; + struct kcore_list *vmm, *tmp; + + + start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; + end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; + end = ALIGN(end, PAGE_SIZE); + /* overlap check (because we have to align page */ + list_for_each_entry(tmp, head, list) { + if (tmp->type != KCORE_VMEMMAP) + continue; + if (start < tmp->addr + tmp->size) + if (end > tmp->addr) + end = tmp->addr; + } + if (start < end) { + vmm = kmalloc(sizeof(*vmm), GFP_KERNEL); + if (!vmm) + return 0; + vmm->addr = start; + vmm->size = end - start; + vmm->type = KCORE_VMEMMAP; + list_add_tail(&vmm->list, head); + } + return 1; + +} +#else +int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + return 1; +} + +#endif + +static int +kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + struct list_head *head = (struct list_head *)arg; + struct kcore_list *ent; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->size = nr_pages << PAGE_SHIFT; + + /* Sanity check: Can happen in 32bit arch...maybe */ + if (ent->addr < (unsigned long) __va(0)) + goto free_out; + + /* cut not-mapped area. ....from ppc-32 code. 
 */
+	if (ULONG_MAX - ent->addr < ent->size)
+		ent->size = ULONG_MAX - ent->addr;
+
+	/* cut when vmalloc() area is higher than direct-map area */
+	if (VMALLOC_START > (unsigned long)__va(0)) {
+		if (ent->addr > VMALLOC_START)
+			goto free_out;
+		if (VMALLOC_START - ent->addr < ent->size)
+			ent->size = VMALLOC_START - ent->addr;
+	}
+
+	ent->type = KCORE_RAM;
+	list_add_tail(&ent->list, head);
+
+	if (!get_sparsemem_vmemmap_info(ent, head)) {
+		list_del(&ent->list);
+		goto free_out;
+	}
+
+	return 0;
+free_out:
+	kfree(ent);
+	return 1;
+}
+
+static int kcore_update_ram(void)
+{
+	int nid, ret;
+	unsigned long end_pfn;
+	LIST_HEAD(head);
+
+	/* Not initialized....update now */
+	/* find out "max pfn" */
+	end_pfn = 0;
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long node_end;
+		node_end = NODE_DATA(nid)->node_start_pfn +
+			NODE_DATA(nid)->node_spanned_pages;
+		if (end_pfn < node_end)
+			end_pfn = node_end;
+	}
+	/* scan 0 to max_pfn */
+	ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
+	if (ret) {
+		free_kclist_ents(&head);
+		return -ENOMEM;
+	}
+	__kcore_update_ram(&head);
+	return ret;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/*****************************************************************************/
+/*
+ * determine size of ELF note
+ */
+static int notesize(struct memelfnote *en)
+{
+	int sz;
+
+	sz = sizeof(struct elf_note);
+	sz += roundup((strlen(en->name) + 1), 4);
+	sz += roundup(en->datasz, 4);
+
+	return sz;
+} /* end notesize() */
+
+/*****************************************************************************/
+/*
+ * store a note in the header buffer
+ */
+static char *storenote(struct memelfnote *men, char *bufp)
+{
+	struct elf_note en;
+
+#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
+
+	en.n_namesz = strlen(men->name) + 1;
+	en.n_descsz = men->datasz;
+	en.n_type = men->type;
+
+	DUMP_WRITE(&en, sizeof(en));
+	DUMP_WRITE(men->name, en.n_namesz);
+
+	/* XXX - cast from long long to long to avoid need for libgcc.a */
+	bufp = (char*) roundup((unsigned long)bufp,4);
+	DUMP_WRITE(men->data, men->datasz);
+	bufp = (char*) roundup((unsigned long)bufp,4);
+
+#undef DUMP_WRITE
+
+	return bufp;
+} /* end storenote() */
+
+/*
+ * store an ELF coredump header in the supplied buffer
+ * nphdr is the number of elf_phdr to insert
+ */
+static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
+{
+	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+	struct elf_prpsinfo prpsinfo;	/* NT_PRPSINFO */
+	struct elf_phdr *nhdr, *phdr;
+	struct elfhdr *elf;
+	struct memelfnote notes[3];
+	off_t offset = 0;
+	struct kcore_list *m;
+
+	/* setup ELF header */
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	offset += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS]	= ELF_CLASS;
+	elf->e_ident[EI_DATA]	= ELF_DATA;
+	elf->e_ident[EI_VERSION]= EV_CURRENT;
+	elf->e_ident[EI_OSABI]	= ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type	= ET_CORE;
+	elf->e_machine	= ELF_ARCH;
+	elf->e_version	= EV_CURRENT;
+	elf->e_entry	= 0;
+	elf->e_phoff	= sizeof(struct elfhdr);
+	elf->e_shoff	= 0;
+	elf->e_flags	= ELF_CORE_EFLAGS;
+	elf->e_ehsize	= sizeof(struct elfhdr);
+	elf->e_phentsize= sizeof(struct elf_phdr);
+	elf->e_phnum	= nphdr;
+	elf->e_shentsize= 0;
+	elf->e_shnum	= 0;
+	elf->e_shstrndx	= 0;
+
+	/* setup ELF PT_NOTE program header */
+	nhdr = (struct elf_phdr *) bufp;
+	bufp += sizeof(struct elf_phdr);
+	offset += sizeof(struct elf_phdr);
+	nhdr->p_type	= PT_NOTE;
+	nhdr->p_offset	= 0;
+	nhdr->p_vaddr	= 0;
+	nhdr->p_paddr	= 0;
+	nhdr->p_filesz	= 0;
+	nhdr->p_memsz	= 0;
+	nhdr->p_flags	= 0;
+	nhdr->p_align	= 0;
+
+	/* setup ELF PT_LOAD program header for every area */
+	list_for_each_entry(m, &kclist_head, list) {
+		phdr = (struct elf_phdr *) bufp;
+		bufp += sizeof(struct elf_phdr);
+		offset += sizeof(struct elf_phdr);
+
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= kc_vaddr_to_offset(m->addr) + dataoff;
+		phdr->p_vaddr	= (size_t)m->addr;
+		phdr->p_paddr	= 0;
+		phdr->p_filesz	= phdr->p_memsz	= m->size;
+		phdr->p_align	= PAGE_SIZE;
+	}
+
+	/*
+	 * Set up the notes in similar form to SVR4 core dumps made
+	 * with info from their /proc.
+	 */
+	nhdr->p_offset	= offset;
+
+	/* set up the process status */
+	notes[0].name = CORE_STR;
+	notes[0].type = NT_PRSTATUS;
+	notes[0].datasz = sizeof(struct elf_prstatus);
+	notes[0].data = &prstatus;
+
+	memset(&prstatus, 0, sizeof(struct elf_prstatus));
+
+	nhdr->p_filesz	= notesize(&notes[0]);
+	bufp = storenote(&notes[0], bufp);
+
+	/* set up the process info */
+	notes[1].name	= CORE_STR;
+	notes[1].type	= NT_PRPSINFO;
+	notes[1].datasz	= sizeof(struct elf_prpsinfo);
+	notes[1].data	= &prpsinfo;
+
+	memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo));
+	prpsinfo.pr_state	= 0;
+	prpsinfo.pr_sname	= 'R';
+	prpsinfo.pr_zomb	= 0;
+
+	strcpy(prpsinfo.pr_fname, "vmlinux");
+	strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+
+	nhdr->p_filesz	+= notesize(&notes[1]);
+	bufp = storenote(&notes[1], bufp);
+
+	/* set up the task structure */
+	notes[2].name	= CORE_STR;
+	notes[2].type	= NT_TASKSTRUCT;
+	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].data	= current;
+
+	nhdr->p_filesz	+= notesize(&notes[2]);
+	bufp = storenote(&notes[2], bufp);
+
+} /* end elf_kcore_store_hdr() */
+
+/*****************************************************************************/
+/*
+ * read from the ELF header and then kernel memory
+ */
+static ssize_t
+read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+{
+	ssize_t acc = 0;
+	size_t size, tsz;
+	size_t elf_buflen;
+	int nphdr;
+	unsigned long start;
+
+	read_lock(&kclist_lock);
+	size = get_kcore_size(&nphdr, &elf_buflen);
+
+	if (buflen == 0 || *fpos >= size) {
+		read_unlock(&kclist_lock);
+		return 0;
+	}
+
+	/* trim buflen to not go beyond EOF */
+	if (buflen > size - *fpos)
+		buflen = size - *fpos;
+
+	/* construct an ELF core header if we'll need some of it */
+	if (*fpos < elf_buflen) {
+		char * elf_buf;
+
+		tsz = elf_buflen - *fpos;
+		if (buflen < tsz)
+			tsz = buflen;
+		elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+		if (!elf_buf) {
+			read_unlock(&kclist_lock);
+			return -ENOMEM;
+		}
+		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
+		read_unlock(&kclist_lock);
+		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
+			kfree(elf_buf);
+			return -EFAULT;
+		}
+		kfree(elf_buf);
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	} else
+		read_unlock(&kclist_lock);
+
+	/*
+	 * Check to see if our file offset matches with any of
+	 * the addresses in the elf_phdr on our list.
+	 */
+	start = kc_offset_to_vaddr(*fpos - elf_buflen);
+	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+		tsz = buflen;
+
+	while (buflen) {
+		struct kcore_list *m;
+
+		read_lock(&kclist_lock);
+		list_for_each_entry(m, &kclist_head, list) {
+			if (start >= m->addr && start < (m->addr+m->size))
+				break;
+		}
+		read_unlock(&kclist_lock);
+
+		if (&m->list == &kclist_head) {
+			if (clear_user(buffer, tsz))
+				return -EFAULT;
+		} else if (is_vmalloc_or_module_addr((void *)start)) {
+			char * elf_buf;
+
+			elf_buf = kzalloc(tsz, GFP_KERNEL);
+			if (!elf_buf)
+				return -ENOMEM;
+			vread(elf_buf, (char *)start, tsz);
+			/* we have to zero-fill user buffer even if no read */
+			if (copy_to_user(buffer, elf_buf, tsz)) {
+				kfree(elf_buf);
+				return -EFAULT;
+			}
+			kfree(elf_buf);
+		} else {
+			if (kern_addr_valid(start)) {
+				unsigned long n;
+
+				n = copy_to_user(buffer, (char *)start, tsz);
+				/*
+				 * We cannot distinguish between fault on source
+				 * and fault on destination. When this happens
+				 * we clear too and hope it will trigger the
+				 * EFAULT again.
+				 */
+				if (n) {
+					if (clear_user(buffer + tsz - n,
+								n))
+						return -EFAULT;
+				}
+			} else {
+				if (clear_user(buffer, tsz))
+					return -EFAULT;
+			}
+		}
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+		start += tsz;
+		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
+	}
+
+	return acc;
+}
+
+
+static int open_kcore(struct inode *inode, struct file *filp)
+{
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	if (kcore_need_update)
+		kcore_update_ram();
+	if (i_size_read(inode) != proc_root_kcore->size) {
+		mutex_lock(&inode->i_mutex);
+		i_size_write(inode, proc_root_kcore->size);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+
+static const struct file_operations proc_kcore_operations = {
+	.read		= read_kcore,
+	.open		= open_kcore,
+	.llseek		= default_llseek,
+};
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* just remember that we have to update kcore */
+static int __meminit kcore_callback(struct notifier_block *self,
+				    unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		write_lock(&kclist_lock);
+		kcore_need_update = 1;
+		write_unlock(&kclist_lock);
+	}
+	return NOTIFY_OK;
+}
+#endif
+
+
+static struct kcore_list kcore_vmalloc;
+
+#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
+static struct kcore_list kcore_text;
+/*
+ * If defined, special segment is used for mapping kernel text instead of
+ * direct-map area. We need to create special TEXT section.
+ */
+static void __init proc_kcore_text_init(void)
+{
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
+}
+#else
+static void __init proc_kcore_text_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+/*
+ * MODULES_VADDR has no intersection with VMALLOC_ADDR.
+ */
+struct kcore_list kcore_modules;
+static void __init add_modules_range(void)
+{
+	kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+			MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+}
+#else
+static void __init add_modules_range(void)
+{
+}
+#endif
+
+static int __init proc_kcore_init(void)
+{
+	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
+				      &proc_kcore_operations);
+	if (!proc_root_kcore) {
+		printk(KERN_ERR "couldn't create /proc/kcore\n");
+		return 0;	/* Always returns 0. */
+	}
+	/* Store text area if it's special */
+	proc_kcore_text_init();
+	/* Store vmalloc area */
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+		VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
+	add_modules_range();
+	/* Store direct-map area from physical memory map */
+	kcore_update_ram();
+	hotplug_memory_notifier(kcore_callback, 0);
+
+	return 0;
+}
+module_init(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
new file mode 100644
index 00000000..bd4b5a74
--- /dev/null
+++ b/fs/proc/kmsg.c
@@ -0,0 +1,64 @@
+/*
+ *  linux/fs/proc/kmsg.c
+ *
+ *  Copyright (C) 1992  by Linus Torvalds
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+extern wait_queue_head_t log_wait;
+
+static int kmsg_open(struct inode * inode, struct file * file)
+{
+	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+}
+
+static int kmsg_release(struct inode * inode, struct file * file)
+{
+	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+	return 0;
+}
+
+static ssize_t kmsg_read(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	if ((file->f_flags & O_NONBLOCK) &&
+	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+		return -EAGAIN;
+	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+}
+
+static unsigned int kmsg_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &log_wait, wait);
+	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+
+static const struct file_operations proc_kmsg_operations = {
+	.read		= kmsg_read,
+	.poll		= kmsg_poll,
+	.open		= kmsg_open,
+	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
+};
+
+static int __init proc_kmsg_init(void)
+{
+	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	return 0;
+}
+module_init(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
new file mode 100644
index 00000000..1afa4dd4
--- /dev/null
+++ b/fs/proc/loadavg.c
@@ -0,0 +1,45 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int loadavg_proc_show(struct seq_file *m, void *v)
+{
+	unsigned long avnrun[3];
+
+	get_avenrun(avnrun, FIXED_1/200, 0);
+
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running(), nr_threads,
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+static int loadavg_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, loadavg_proc_show, NULL);
+}
+
+static const struct file_operations loadavg_proc_fops = {
+	.open		= loadavg_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_loadavg_init(void)
+{
+	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	return 0;
+}
+module_init(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
new file mode 100644
index 00000000..a9628278
--- /dev/null
+++ b/fs/proc/meminfo.c
@@ -0,0 +1,194 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static int meminfo_proc_show(struct
seq_file *m, void *v) +{ + struct sysinfo i; + unsigned long committed; + unsigned long allowed; + struct vmalloc_info vmi; + long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; + +/* + * display in kilobytes. + */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + si_meminfo(&i); + si_swapinfo(&i); + committed = percpu_counter_read_positive(&vm_committed_as); + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + + cached = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; + if (cached < 0) + cached = 0; + + get_vmalloc_info(&vmi); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + /* + * Tagged format, for easy grepping and expansion. + */ + seq_printf(m, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#ifdef CONFIG_HIGHMEM + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" +#endif + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Shmem: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" + "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n" +#ifdef CONFIG_MEMORY_FAILURE + "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif + , + K(i.totalram), + K(i.freeram), + K(i.bufferram), + K(cached), + K(total_swapcache_pages), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#ifdef CONFIG_HIGHMEM + K(i.totalhigh), + K(i.freehigh), + K(i.totalram-i.totalhigh), + K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), +#endif + K(i.totalswap), + K(i.freeswap), + K(global_page_state(NR_FILE_DIRTY)), + K(global_page_state(NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(global_page_state(NR_ANON_PAGES) + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR), +#else + K(global_page_state(NR_ANON_PAGES)), +#endif + K(global_page_state(NR_FILE_MAPPED)), + K(global_page_state(NR_SHMEM)), + K(global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_SLAB_RECLAIMABLE)), + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif + K(global_page_state(NR_UNSTABLE_NFS)), + K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), + K(allowed), + K(committed), + (unsigned long)VMALLOC_TOTAL >> 10, + vmi.used >> 
10, + vmi.largest_chunk >> 10 +#ifdef CONFIG_MEMORY_FAILURE + ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif + ); + + hugetlb_report_meminfo(m); + + arch_report_meminfo(m); + + return 0; +#undef K +} + +static int meminfo_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, meminfo_proc_show, NULL); +} + +static const struct file_operations meminfo_proc_fops = { + .open = meminfo_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_meminfo_init(void) +{ + proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + return 0; +} +module_init(proc_meminfo_init); diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c new file mode 100644 index 00000000..8ae221df --- /dev/null +++ b/fs/proc/mmu.c @@ -0,0 +1,60 @@ +/* mmu.c: mmu memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include "internal.h" + +void get_vmalloc_info(struct vmalloc_info *vmi) +{ + struct vm_struct *vma; + unsigned long free_area_size; + unsigned long prev_end; + + vmi->used = 0; + + if (!vmlist) { + vmi->largest_chunk = VMALLOC_TOTAL; + } + else { + vmi->largest_chunk = 0; + + prev_end = VMALLOC_START; + + read_lock(&vmlist_lock); + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long addr = (unsigned long) vma->addr; + + /* + * Some archs keep another range for modules in vmlist + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; + + vmi->used += vma->size; + + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; + + prev_end = vma->size + addr; + } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + + read_unlock(&vmlist_lock); + } +} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 00000000..d6c078ea --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +}; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static struct dentry *proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + void *ns; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + + ei = PROC_I(inode); + inode->i_mode = S_IFREG|S_IRUSR; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the 
process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static int proc_ns_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, + const struct proc_ns_operations *ops) +{ + return proc_fill_cache(filp, dirent, filldir, + ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops); +} + +static int proc_ns_dir_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct proc_ns_operations **entry, **last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = -EPERM; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= ARRAY_SIZE(ns_entries)) { + ret = 1; + goto out; + } + entry = ns_entries + i; + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + if (proc_ns_fill_cache(filp, dirent, filldir, + task, *entry) < 0) + goto out; + filp->f_pos++; + entry++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .readdir = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + error = ERR_PTR(-EPERM); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + for (entry = ns_entries; entry <= last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + error = ERR_PTR(-ENOENT); + if (entry > last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return error; +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c new file mode 100644 index 00000000..b1822dde --- /dev/null +++ b/fs/proc/nommu.c @@ -0,0 +1,136 @@ +/* nommu.c: mmu-less memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * display a single region to a sequenced file + */ +static int nommu_region_show(struct seq_file *m, struct vm_region *region) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = region->vm_flags; + file = region->vm_file; + + if (file) { + struct inode *inode = region->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + region->vm_start, + region->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display a list of all the REGIONs the kernel knows about + * - nommu kernels have a single flat list + */ +static int nommu_region_list_show(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); +} + +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) +{ + struct rb_node *p; + loff_t pos = *_pos; + + down_read(&nommu_region_sem); + + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; +} + +static void nommu_region_list_stop(struct seq_file *m, void *v) +{ + up_read(&nommu_region_sem); +} + +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return rb_next((struct rb_node *) v); +} + +static const struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show +}; + +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &proc_nommu_region_list_seqop); +} + +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_nommu_init(void) +{ + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + return 0; +} + +module_init(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c new file mode 100644 index 00000000..6d8e6a9e --- /dev/null +++ b/fs/proc/page.c @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) + +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. 
+ */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + if (!ppage || PageSlab(ppage)) + pcount = 0; + else + pcount = page_mapcount(ppage); + + if (put_user(pcount, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +u64 stable_page_flags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + + /* + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. 
+ */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); + +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (put_user(stable_page_flags(ppage), out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; + +static int __init proc_page_init(void) +{ + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + return 0; +} +module_init(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c new file mode 100644 index 00000000..927cbd11 --- /dev/null +++ b/fs/proc/proc_devtree.c @@ -0,0 +1,241 @@ +/* + * proc_devtree.c - handles /proc/device-tree + * + * Copyright 1997 Paul Mackerras + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static inline void set_node_proc_entry(struct device_node *np, + struct proc_dir_entry *de) +{ +#ifdef HAVE_ARCH_DEVTREE_FIXUPS + np->pde = de; +#endif +} + +static struct proc_dir_entry *proc_device_tree; + +/* + * Supply data on a read from /proc/device-tree/node/property. 
+ */ +static int property_proc_show(struct seq_file *m, void *v) +{ + struct property *pp = m->private; + + seq_write(m, pp->value, pp->length); + return 0; +} + +static int property_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, property_proc_show, PDE(inode)->data); +} + +static const struct file_operations property_proc_fops = { + .owner = THIS_MODULE, + .open = property_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * For a node with a name like "gc@10", we make symlinks called "gc" + * and "@10" to it. + */ + +/* + * Add a property to a node + */ +static struct proc_dir_entry * +__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, + const char *name) +{ + struct proc_dir_entry *ent; + + /* + * Unfortunately proc_register puts each new entry + * at the beginning of the list. So we rearrange them. + */ + ent = proc_create_data(name, + strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, + de, &property_proc_fops, pp); + if (ent == NULL) + return NULL; + + if (!strncmp(name, "security-", 9)) + ent->size = 0; /* don't leak number of password chars */ + else + ent->size = pp->length; + + return ent; +} + + +void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) +{ + __proc_device_tree_add_prop(pde, prop, prop->name); +} + +void proc_device_tree_remove_prop(struct proc_dir_entry *pde, + struct property *prop) +{ + remove_proc_entry(prop->name, pde); +} + +void proc_device_tree_update_prop(struct proc_dir_entry *pde, + struct property *newprop, + struct property *oldprop) +{ + struct proc_dir_entry *ent; + + for (ent = pde->subdir; ent != NULL; ent = ent->next) + if (ent->data == oldprop) + break; + if (ent == NULL) { + printk(KERN_WARNING "device-tree: property \"%s\" " + " does not exist\n", oldprop->name); + } else { + ent->data = newprop; + ent->size = newprop->length; + } +} + +/* + * Various dodgy firmware might give us nodes and/or properties with + * conflicting names. That's generally ok, except for exporting via /proc, + * so munge names here to ensure they're unique. + */ + +static int duplicate_name(struct proc_dir_entry *de, const char *name) +{ + struct proc_dir_entry *ent; + int found = 0; + + spin_lock(&proc_subdir_lock); + + for (ent = de->subdir; ent != NULL; ent = ent->next) { + if (strcmp(ent->name, name) == 0) { + found = 1; + break; + } + } + + spin_unlock(&proc_subdir_lock); + + return found; +} + +static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, + const char *name) +{ + char *fixed_name; + int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ + int i = 1, size; + +realloc: + fixed_name = kmalloc(fixup_len, GFP_KERNEL); + if (fixed_name == NULL) { + printk(KERN_ERR "device-tree: Out of memory trying to fixup " + "name \"%s\"\n", name); + return name; + } + +retry: + size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); + size++; /* account for NULL */ + + if (size > fixup_len) { + /* We ran out of space, free and reallocate. */ + kfree(fixed_name); + fixup_len = size; + goto realloc; + } + + if (duplicate_name(de, fixed_name)) { + /* Multiple duplicates. Retry with a different offset. */ + i++; + goto retry; + } + + printk(KERN_WARNING "device-tree: Duplicate name in %s, " + "renamed to \"%s\"\n", np->full_name, fixed_name); + + return fixed_name; +} + +/* + * Process a node, adding entries for its children and its properties. 
+ */ +void proc_device_tree_add_node(struct device_node *np, + struct proc_dir_entry *de) +{ + struct property *pp; + struct proc_dir_entry *ent; + struct device_node *child; + const char *p; + + set_node_proc_entry(np, de); + for (child = NULL; (child = of_get_next_child(np, child));) { + /* Use everything after the last slash, or the full name */ + p = strrchr(child->full_name, '/'); + if (!p) + p = child->full_name; + else + ++p; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + + ent = proc_mkdir(p, de); + if (ent == NULL) + break; + proc_device_tree_add_node(child, ent); + } + of_node_put(child); + + for (pp = np->properties; pp != NULL; pp = pp->next) { + p = pp->name; + + if (strchr(p, '/')) + continue; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + + ent = __proc_device_tree_add_prop(de, pp, p); + if (ent == NULL) + break; + } +} + +/* + * Called on initialization to set up the /proc/device-tree subtree + */ +void __init proc_device_tree_init(void) +{ + struct device_node *root; + + proc_device_tree = proc_mkdir("device-tree", NULL); + if (proc_device_tree == NULL) + return; + root = of_find_node_by_path("/"); + if (root == NULL) { + pr_debug("/proc/device-tree: can't find root\n"); + return; + } + proc_device_tree_add_node(root, proc_device_tree); + of_node_put(root); +} diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c new file mode 100644 index 00000000..9020ac15 --- /dev/null +++ b/fs/proc/proc_net.c @@ -0,0 +1,241 @@ +/* + * linux/fs/proc/net.c + * + * Copyright (C) 2007 + * + * Author: Eric Biederman + * + * proc net directory handling functions + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + +int seq_open_net(struct inode *ino, struct file *f, + const struct seq_operations *ops, int size) +{ + struct net *net; + struct seq_net_private *p; + + BUG_ON(size < sizeof(*p)); + + net = get_proc_net(ino); + if (net == NULL) + return -ENXIO; + + p = __seq_open_private(f, ops, size); + if (p == NULL) { + put_net(net); + return -ENOMEM; + } +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return 0; +} +EXPORT_SYMBOL_GPL(seq_open_net); + +int single_open_net(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} +EXPORT_SYMBOL_GPL(single_open_net); + +int seq_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq; + + seq = f->private_data; + + put_net(seq_file_net(seq)); + seq_release_private(ino, f); + return 0; +} +EXPORT_SYMBOL_GPL(seq_release_net); + +int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} +EXPORT_SYMBOL_GPL(single_release_net); + +static struct net *get_proc_task_net(struct inode *dir) +{ + struct task_struct *task; + struct nsproxy *ns; + struct net *net = NULL; + + rcu_read_lock(); + task = pid_task(proc_pid(dir), PIDTYPE_PID); + if (task != NULL) { + ns = task_nsproxy(task); + if (ns != NULL) + net = get_net(ns->net_ns); + } + rcu_read_unlock(); + + return net; +} + +static struct dentry 
*proc_tgid_net_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *de; + struct net *net; + + de = ERR_PTR(-ENOENT); + net = get_proc_task_net(dir); + if (net != NULL) { + de = proc_lookup_de(net->proc_net, dir, dentry); + put_net(net); + } + return de; +} + +static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct net *net; + + net = get_proc_task_net(inode); + + generic_fillattr(inode, stat); + + if (net != NULL) { + stat->nlink = net->proc_net->nlink; + put_net(net); + } + + return 0; +} + +const struct inode_operations proc_net_inode_operations = { + .lookup = proc_tgid_net_lookup, + .getattr = proc_tgid_net_getattr, +}; + +static int proc_tgid_net_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int ret; + struct net *net; + + ret = -EINVAL; + net = get_proc_task_net(filp->f_path.dentry->d_inode); + if (net != NULL) { + ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + put_net(net); + } + return ret; +} + +const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = proc_tgid_net_readdir, +}; + + +struct proc_dir_entry *proc_net_fops_create(struct net *net, + const char *name, mode_t mode, const struct file_operations *fops) +{ + return proc_create(name, mode, net->proc_net, fops); +} +EXPORT_SYMBOL_GPL(proc_net_fops_create); + +void proc_net_remove(struct net *net, const char *name) +{ + remove_proc_entry(name, net->proc_net); +} +EXPORT_SYMBOL_GPL(proc_net_remove); + +static __net_init int proc_net_ns_init(struct net *net) +{ + struct proc_dir_entry *netd, *net_statd; + int err; + + err = -ENOMEM; + netd = kzalloc(sizeof(*netd), GFP_KERNEL); + if (!netd) + goto out; + + netd->data = net; + netd->nlink = 2; + netd->name = "net"; + netd->namelen = 3; + netd->parent = &proc_root; + + err = -EEXIST; + net_statd = proc_net_mkdir(net, "stat", netd); + if (!net_statd) + goto free_net; + + net->proc_net = netd; + net->proc_net_stat = net_statd; + return 0; + +free_net: + kfree(netd); +out: + return err; +} + +static __net_exit void proc_net_ns_exit(struct net *net) +{ + remove_proc_entry("stat", net->proc_net); + kfree(net->proc_net); +} + +static struct pernet_operations __net_initdata proc_net_ns_ops = { + .init = proc_net_ns_init, + .exit = proc_net_ns_exit, +}; + +int __init proc_net_init(void) +{ + proc_symlink("net", NULL, "self/net"); + + return register_pernet_subsys(&proc_net_ns_ops); +} diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c new file mode 100644 index 00000000..d167de36 --- /dev/null +++ b/fs/proc/proc_sysctl.c @@ -0,0 +1,436 @@ +/* + * /proc/sys support + */ +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct dentry_operations proc_sys_dentry_operations; +static const struct file_operations proc_sys_file_operations; +static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; + +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) +{ + struct inode *inode; + struct proc_inode *ei; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_ino = get_next_ino(); + + sysctl_head_get(head); + ei = PROC_I(inode); + ei->sysctl = head; + ei->sysctl_entry = table; + + inode->i_mtime = 
inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = table->mode; + if (!table->child) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_nlink = 0; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + } +out: + return inode; +} + +static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) +{ + int len; + for ( ; p->procname; p++) { + + if (!p->procname) + continue; + + len = strlen(p->procname); + if (len != name->len) + continue; + + if (memcmp(p->procname, name->name, len) != 0) + continue; + + /* I have a match */ + return p; + } + return NULL; +} + +static struct ctl_table_header *grab_header(struct inode *inode) +{ + if (PROC_I(inode)->sysctl) + return sysctl_head_grab(PROC_I(inode)->sysctl); + else + return sysctl_head_next(NULL); +} + +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table *table = PROC_I(dir)->sysctl_entry; + struct ctl_table_header *h = NULL; + struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); + + if (IS_ERR(head)) + return ERR_CAST(head); + + if (table && !table->child) { + WARN_ON(1); + goto out; + } + + table = table ? table->child : head->ctl_table; + + p = find_in_table(table, name); + if (!p) { + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) + continue; + p = find_in_table(h->attached_by, name); + if (p) + break; + } + } + + if (!p) + goto out; + + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (h) + sysctl_head_finish(h); + + if (!inode) + goto out; + + err = NULL; + d_set_d_op(dentry, &proc_sys_dentry_operations); + d_add(dentry, inode); + +out: + sysctl_head_finish(head); + return err; +} + +static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + ssize_t error; + size_t res; + + if (IS_ERR(head)) + return PTR_ERR(head); + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(head->root, table, write ? 
MAY_WRITE : MAY_READ)) + goto out; + + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = table->proc_handler(table, write, buf, &res, ppos); + if (!error) + error = res; +out: + sysctl_head_finish(head); + + return error; +} + +static ssize_t proc_sys_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); +} + +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); +} + + +static int proc_sys_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) +{ + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = table->procname; + qname.len = strlen(table->procname); + qname.hash = full_name_hash(qname.name, qname.len); + + child = d_lookup(dir, &qname); + if (!child) { + child = d_alloc(dir, &qname); + if (child) { + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (!inode) { + dput(child); + return -ENOMEM; + } else { + d_set_d_op(child, &proc_sys_dentry_operations); + d_add(child, inode); + } + } else { + return -ENOMEM; + } + } + inode = child->d_inode; + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); + return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); +} + +static int scan(struct ctl_table_header *head, ctl_table *table, + unsigned long *pos, struct file *file, + void *dirent, filldir_t filldir) +{ + + for (; table->procname; table++, (*pos)++) { + int res; + + /* Can't do anything without a proc name */ + if (!table->procname) + continue; + + if (*pos < file->f_pos) + continue; + + res = proc_sys_fill_cache(file, dirent, filldir, head, table); + if (res) + return res; + + file->f_pos = *pos + 1; + } + return 0; +} + +static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + struct ctl_table_header *h = NULL; + unsigned long pos; + int ret = -EINVAL; + + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table && !table->child) { + WARN_ON(1); + goto out; + } + + table = table ? 
table->child : head->ctl_table; + + ret = 0; + /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, + inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + } + if (filp->f_pos == 1) { + if (filldir(dirent, "..", 2, filp->f_pos, + parent_ino(dentry), DT_DIR) < 0) + goto out; + filp->f_pos++; + } + pos = 2; + + ret = scan(head, table, &pos, filp, dirent, filldir); + if (ret) + goto out; + + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) + continue; + ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); + if (ret) { + sysctl_head_finish(h); + break; + } + } + ret = 1; +out: + sysctl_head_finish(head); + return ret; +} + +static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) +{ + /* + * sysctl entries that are not writeable, + * are _NOT_ writeable, capabilities or not. + */ + struct ctl_table_header *head; + struct ctl_table *table; + int error; + + /* Executable files are not allowed under /proc/sys/ */ + if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) + return -EACCES; + + head = grab_header(inode); + if (IS_ERR(head)) + return PTR_ERR(head); + + table = PROC_I(inode)->sysctl_entry; + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head->root, table, mask); + + sysctl_head_finish(head); + return error; +} + +static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + +static const struct file_operations proc_sys_file_operations = { + .read = proc_sys_read, + .write = proc_sys_write, + .llseek = default_llseek, +}; + +static const struct file_operations proc_sys_dir_file_operations = { + .readdir = proc_sys_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { + .lookup = proc_sys_lookup, + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + return !PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_delete(const struct dentry *dentry) +{ + return !!PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry 
*dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct ctl_table_header *head; + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + /* AV: can it, indeed? */ + if (!inode) + return 1; + if (name->len != len) + return 1; + if (memcmp(name->name, str, len)) + return 1; + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); +} + +static const struct dentry_operations proc_sys_dentry_operations = { + .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, +}; + +int __init proc_sys_init(void) +{ + struct proc_dir_entry *proc_sys_root; + + proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->nlink = 0; + return 0; +} diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c new file mode 100644 index 00000000..cb761f01 --- /dev/null +++ b/fs/proc/proc_tty.c @@ -0,0 +1,189 @@ +/* + * proc_tty.c -- handles /proc/tty + * + * Copyright 1997, Theodore Ts'o + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The /proc/tty directory inodes... + */ +static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; + +/* + * This is the handler for /proc/tty/drivers + */ +static void show_tty_range(struct seq_file *m, struct tty_driver *p, + dev_t from, int num) +{ + seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown"); + seq_printf(m, "/dev/%-8s ", p->name); + if (p->num > 1) { + seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from), + MINOR(from) + num - 1); + } else { + seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from)); + } + switch (p->type) { + case TTY_DRIVER_TYPE_SYSTEM: + seq_puts(m, "system"); + if (p->subtype == SYSTEM_TYPE_TTY) + seq_puts(m, ":/dev/tty"); + else if (p->subtype == SYSTEM_TYPE_SYSCONS) + seq_puts(m, ":console"); + else if (p->subtype == SYSTEM_TYPE_CONSOLE) + seq_puts(m, ":vtmaster"); + break; + case TTY_DRIVER_TYPE_CONSOLE: + seq_puts(m, "console"); + break; + case TTY_DRIVER_TYPE_SERIAL: + seq_puts(m, "serial"); + break; + case TTY_DRIVER_TYPE_PTY: + if (p->subtype == PTY_TYPE_MASTER) + seq_puts(m, "pty:master"); + else if (p->subtype == PTY_TYPE_SLAVE) + seq_puts(m, "pty:slave"); + else + seq_puts(m, "pty"); + break; + default: + seq_printf(m, "type:%d.%d", p->type, p->subtype); + } + seq_putc(m, '\n'); +} + +static int show_tty_driver(struct seq_file *m, void *v) +{ + struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers); + dev_t from = MKDEV(p->major, p->minor_start); + dev_t to = from + p->num; + + if (&p->tty_drivers == tty_drivers.next) { + /* pseudo-drivers first */ + seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); + seq_puts(m, "system:/dev/tty\n"); + seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); + seq_puts(m, "system:console\n"); +#ifdef CONFIG_UNIX98_PTYS + seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); + seq_puts(m, "system\n"); +#endif +#ifdef CONFIG_VT + seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); + seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); + seq_puts(m, "system:vtmaster\n"); +#endif + } + + while (MAJOR(from) < MAJOR(to)) { + dev_t next = MKDEV(MAJOR(from)+1, 0); + show_tty_range(m, p, from, next - 
from); + from = next; + } + if (from != to) + show_tty_range(m, p, from, to - from); + return 0; +} + +/* iterator */ +static void *t_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tty_mutex); + return seq_list_start(&tty_drivers, *pos); +} + +static void *t_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tty_drivers, pos); +} + +static void t_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tty_mutex); +} + +static const struct seq_operations tty_drivers_op = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = show_tty_driver +}; + +static int tty_drivers_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_drivers_op); +} + +static const struct file_operations proc_tty_drivers_operations = { + .open = tty_drivers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * This function is called by tty_register_driver() to handle + * registering the driver's /proc handler into /proc/tty/driver/ + */ +void proc_tty_register_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) + return; + + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + driver->proc_entry = ent; +} + +/* + * This function is called by tty_unregister_driver() + */ +void proc_tty_unregister_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + ent = driver->proc_entry; + if (!ent) + return; + + remove_proc_entry(driver->driver_name, proc_tty_driver); + + driver->proc_entry = NULL; +} + +/* + * Called by proc_root_init() to initialize the /proc/tty subtree + */ +void __init proc_tty_init(void) +{ + if (!proc_mkdir("tty", NULL)) + return; + proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); + /* + * /proc/tty/driver/serial reveals the exact character counts for + * serial links which is just too easy to abuse for inferring + * password lengths and inter-keystroke timings during password + * entry. 
+ */ + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); + proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); +} diff --git a/fs/proc/root.c b/fs/proc/root.c new file mode 100644 index 00000000..d6c3b416 --- /dev/null +++ b/fs/proc/root.c @@ -0,0 +1,213 @@ +/* + * linux/fs/proc/root.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc root directory handling functions + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static int proc_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int proc_set_super(struct super_block *sb, void *data) +{ + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; +} + +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int err; + struct super_block *sb; + struct pid_namespace *ns; + struct proc_inode *ei; + + if (flags & MS_KERNMOUNT) + ns = (struct pid_namespace *)data; + else + ns = current->nsproxy->pid_ns; + + sb = sget(fs_type, proc_test_super, proc_set_super, ns); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!sb->s_root) { + sb->s_flags = flags; + err = proc_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + + return dget(sb->s_root); +} + +static void proc_kill_sb(struct super_block *sb) +{ + struct pid_namespace *ns; + + ns = (struct pid_namespace *)sb->s_fs_info; + kill_anon_super(sb); + put_pid_ns(ns); +} + +static struct file_system_type proc_fs_type = { + .name = "proc", + .mount = proc_mount, + .kill_sb = proc_kill_sb, +}; + +void __init proc_root_init(void) +{ + struct vfsmount *mnt; + int err; + + proc_init_inodecache(); + err = register_filesystem(&proc_fs_type); + if (err) + return; + mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); + if (IS_ERR(mnt)) { + unregister_filesystem(&proc_fs_type); + return; + } + + init_pid_ns.proc_mnt = mnt; + proc_symlink("mounts", NULL, "self/mounts"); + + proc_net_init(); + +#ifdef CONFIG_SYSVIPC + proc_mkdir("sysvipc", NULL); +#endif + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); + proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ +#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) + /* just give it a mountpoint */ + proc_mkdir("openprom", NULL); +#endif + proc_tty_init(); +#ifdef CONFIG_PROC_DEVICETREE + proc_device_tree_init(); +#endif + proc_mkdir("bus", NULL); + proc_sys_init(); +} + +static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +) +{ + generic_fillattr(dentry->d_inode, stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} + +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + if (!proc_lookup(dir, dentry, nd)) { + return NULL; + } + + return proc_pid_lookup(dir, dentry, nd); +} + +static int proc_root_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + unsigned int nr = filp->f_pos; + int ret; + + if (nr < FIRST_PROCESS_ENTRY) { + int error = proc_readdir(filp, 
dirent, filldir); + if (error <= 0) + return error; + filp->f_pos = FIRST_PROCESS_ENTRY; + } + + ret = proc_pid_readdir(filp, dirent, filldir); + return ret; +} + +/* + * The root /proc directory is special, as it has the + * directories. Thus we don't use the generic + * directory handling functions for that.. + */ +static const struct file_operations proc_root_operations = { + .read = generic_read_dir, + .readdir = proc_root_readdir, + .llseek = default_llseek, +}; + +/* + * proc root can do almost nothing.. + */ +static const struct inode_operations proc_root_inode_operations = { + .lookup = proc_root_lookup, + .getattr = proc_root_getattr, +}; + +/* + * This is the root "inode" in the /proc tree.. + */ +struct proc_dir_entry proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .name = "/proc", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .count = ATOMIC_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &proc_root, +}; + +int pid_ns_prepare_proc(struct pid_namespace *ns) +{ + struct vfsmount *mnt; + + mnt = kern_mount_data(&proc_fs_type, ns); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + ns->proc_mnt = mnt; + return 0; +} + +void pid_ns_release_proc(struct pid_namespace *ns) +{ + mntput(ns->proc_mnt); +} diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c new file mode 100644 index 00000000..62604be9 --- /dev/null +++ b/fs/proc/softirqs.c @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +/* + * /proc/softirqs ... display the number of softirqs + */ +static int show_softirqs(struct seq_file *p, void *v) +{ + int i, j; + + seq_puts(p, " "); + for_each_possible_cpu(i) + seq_printf(p, "CPU%-8d", i); + seq_putc(p, '\n'); + + for (i = 0; i < NR_SOFTIRQS; i++) { + seq_printf(p, "%12s:", softirq_to_name[i]); + for_each_possible_cpu(j) + seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_putc(p, '\n'); + } + return 0; +} + +static int softirqs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_softirqs, NULL); +} + +static const struct file_operations proc_softirqs_operations = { + .open = softirqs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_softirqs_init(void) +{ + proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + return 0; +} +module_init(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c new file mode 100644 index 00000000..9758b654 --- /dev/null +++ b/fs/proc/stat.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif +#ifndef arch_idle_time +#define arch_idle_time(cpu) 0 +#endif + +static int show_stat(struct seq_file *p, void *v) +{ + int i, j; + unsigned long jif; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest, guest_nice; + u64 sum = 0; + u64 sum_softirq = 0; + unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; + struct timespec boottime; + + user = nice = system = idle = iowait = + irq = softirq = steal = cputime64_zero; + guest = guest_nice = cputime64_zero; + getboottime(&boottime); + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + user = cputime64_add(user, kstat_cpu(i).cpustat.user); + nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); + system = cputime64_add(system, kstat_cpu(i).cpustat.system); + idle = 
cputime64_add(idle, kstat_cpu(i).cpustat.idle); + idle = cputime64_add(idle, arch_idle_time(i)); + iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); + softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); + steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + guest_nice = cputime64_add(guest_nice, + kstat_cpu(i).cpustat.guest_nice); + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); + + for (j = 0; j < NR_SOFTIRQS; j++) { + unsigned int softirq_stat = kstat_softirqs_cpu(j, i); + + per_softirq_sums[j] += softirq_stat; + sum_softirq += softirq_stat; + } + } + sum += arch_irq_stat(); + + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); + for_each_online_cpu(i) { + + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = kstat_cpu(i).cpustat.user; + nice = kstat_cpu(i).cpustat.nice; + system = kstat_cpu(i).cpustat.system; + idle = kstat_cpu(i).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(i)); + iowait = kstat_cpu(i).cpustat.iowait; + irq = kstat_cpu(i).cpustat.irq; + softirq = kstat_cpu(i).cpustat.softirq; + steal = kstat_cpu(i).cpustat.steal; + guest = kstat_cpu(i).cpustat.guest; + guest_nice = kstat_cpu(i).cpustat.guest_nice; + seq_printf(p, + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); + + /* sum again ? it could be updated? 
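+ * The per-IRQ counts printed next are re-read here, after the "intr"
+ * total above was computed, so the two may not agree exactly if
+ * interrupts keep arriving in between.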
*/ + for_each_irq_nr(j) + seq_printf(p, " %u", kstat_irqs(j)); + + seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)jif, + total_forks, + nr_running(), + nr_iowait()); + + seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + + for (i = 0; i < NR_SOFTIRQS; i++) + seq_printf(p, " %u", per_softirq_sums[i]); + seq_putc(p, '\n'); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + unsigned size = 4096 * (1 + num_possible_cpus() / 32); + char *buf; + struct seq_file *m; + int res; + + /* don't ask for more than the kmalloc() max size */ + if (size > KMALLOC_MAX_SIZE) + size = KMALLOC_MAX_SIZE; + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = single_open(file, show_stat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_stat_operations = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_stat_init(void) +{ + proc_create("stat", 0, NULL, &proc_stat_operations); + return 0; +} +module_init(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c new file mode 100644 index 00000000..55a1f494 --- /dev/null +++ b/fs/proc/task_mmu.c @@ -0,0 +1,1125 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + unsigned long data, text, lib, swap; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. 
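+ * hiwater_vm is reported as VmPeak: and hiwater_rss as VmHWM: below,
+ * with all figures converted to kB via the PAGE_SHIFT-10 shifts.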
+ */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; + + data = mm->total_vm - mm->shared_vm - mm->stack_vm; + text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; + lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); + seq_printf(m, + "VmPeak:\t%8lu kB\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB\n" + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", + hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + mm->locked_vm << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), + data << (PAGE_SHIFT-10), + mm->stack_vm << (PAGE_SHIFT-10), text, lib, + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + return PAGE_SIZE * mm->total_vm; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + *shared = get_mm_counter(mm, MM_FILEPAGES); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = mm->total_vm - mm->shared_vm; + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + return mm->total_vm; +} + +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma && vma != priv->tail_vma) { + struct mm_struct *mm = vma->vm_mm; + up_read(&mm->mmap_sem); + mmput(mm); + } +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma, *tail_vma = NULL; + loff_t l = *pos; + + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the beginning and also after lseek. We will have -1 last_addr + * after the end of the vmas. + */ + + if (last_addr == -1UL) + return NULL; + + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = mm_for_maps(priv->task); + if (!mm || IS_ERR(mm)) + return mm; + down_read(&mm->mmap_sem); + + tail_vma = get_gate_vma(priv->task->mm); + priv->tail_vma = tail_vma; + + /* Start with last addr hint */ + vma = find_vma(mm, last_addr); + if (last_addr && vma) { + vma = vma->vm_next; + goto out; + } + + /* + * Check the vma index is within the range and do + * sequential scan until m_index. + */ + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; + while (l-- && vma) + vma = vma->vm_next; + goto out; + } + + if (l != mm->map_count) + tail_vma = NULL; /* After gate vma */ + +out: + if (vma) + return vma; + + /* End of vmas has been reached */ + m->version = (tail_vma != NULL)? 
0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_vma; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct vm_area_struct *tail_vma = priv->tail_vma; + + (*pos)++; + if (vma && (vma != tail_vma) && vma->vm_next) + return vma->vm_next; + vma_stop(priv, vma); + return (vma != tail_vma)? tail_vma: NULL; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + if (!IS_ERR(vma)) + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static int do_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + vm_flags_t flags = vma->vm_flags; + unsigned long ino = 0; + unsigned long long pgoff = 0; + unsigned long start, end; + dev_t dev = 0; + int len; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; + } + + /* We don't show the stack guard page in /proc/maps */ + start = vma->vm_start; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; + + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + start, + end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino, &len); + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (file) { + pad_len_spaces(m, len); + seq_path(m, &file->f_path, "\n"); + } else { + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + name = "[stack]"; + } + } else { + name = "[vdso]"; + } + } + if (name) { + pad_len_spaces(m, len); + seq_puts(m, name); + } + } + seq_putc(m, '\n'); +} + +static int show_map(struct seq_file *m, void *v) +{ + struct vm_area_struct *vma = v; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + + show_map_vma(m, vma); + + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; + return 0; +} + +static const struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. 
+ * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +#ifdef CONFIG_PROC_PAGE_MONITOR +struct mem_size_stats { + struct vm_area_struct *vma; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + unsigned long anonymous; + unsigned long anonymous_thp; + unsigned long swap; + u64 pss; +}; + + +static void smaps_pte_entry(pte_t ptent, unsigned long addr, + unsigned long ptent_size, struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + struct page *page; + int mapcount; + + if (is_swap_pte(ptent)) { + mss->swap += ptent_size; + return; + } + + if (!pte_present(ptent)) + return; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + return; + + if (PageAnon(page)) + mss->anonymous += ptent_size; + + mss->resident += ptent_size; + /* Accumulate the size in pages that have been accessed. */ + if (pte_young(ptent) || PageReferenced(page)) + mss->referenced += ptent_size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + if (pte_dirty(ptent) || PageDirty(page)) + mss->shared_dirty += ptent_size; + else + mss->shared_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT) / mapcount; + } else { + if (pte_dirty(ptent) || PageDirty(page)) + mss->private_dirty += ptent_size; + else + mss->private_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT); + } +} + +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + pte_t *pte; + spinlock_t *ptl; + + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + smaps_pte_entry(*(pte_t *)pmd, addr, + HPAGE_PMD_SIZE, walk); + spin_unlock(&walk->mm->page_table_lock); + mss->anonymous_thp += HPAGE_PMD_SIZE; + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + + if (pmd_trans_unstable(pmd)) + return 0; + /* + * The mmap_sem held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. 
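+ * The transparent-huge-pmd case above is handled under
+ * mm->page_table_lock before we fall back to walking individual ptes.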
+ */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static int show_smap(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; + + memset(&mss, 0, sizeof mss); + mss.vma = vma; + /* mmap_sem is held in m_start */ + if (vma->vm_mm && !is_vm_hugetlb_page(vma)) + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); + + show_map_vma(m, vma); + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "AnonHugePages: %8lu kB\n" + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10, + mss.anonymous >> 10, + mss.anonymous_thp >> 10, + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; + return 0; +} + +static const struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_smap +}; + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +const struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + if (PageReserved(page)) + continue; + + /* Clear accessed and referenced bits. 
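+ * A subsequent read of smaps then reports only pages referenced
+ * after this write to /proc/pid/clear_refs.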
*/ + ptep_test_and_clear_young(vma, addr, pte); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +#define CLEAR_REFS_ALL 1 +#define CLEAR_REFS_ANON 2 +#define CLEAR_REFS_MAPPED 3 + +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + struct mm_struct *mm; + struct vm_area_struct *vma; + int type; + int rv; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 10, &type); + if (rv < 0) + return rv; + if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + return -EINVAL; + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .mm = mm, + }; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; + if (is_vm_hugetlb_page(vma)) + continue; + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * + * Writing 2 to /proc/pid/clear_refs only affects + * Anonymous pages. + * + * Writing 3 to /proc/pid/clear_refs only affects file + * mapped pages. + */ + if (type == CLEAR_REFS_ANON && vma->vm_file) + continue; + if (type == CLEAR_REFS_MAPPED && !vma->vm_file) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); + } + put_task_struct(task); + + return count; +} + +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, + .llseek = noop_llseek, +}; + +struct pagemapread { + int pos, len; + u64 *buffer; +}; + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_STATUS_BITS 3 +#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) +#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) +#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) +#define PM_PSHIFT_BITS 6 +#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) +#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) +#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) + +#define PM_PRESENT PM_STATUS(4LL) +#define PM_SWAP PM_STATUS(2LL) +#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_END_OF_BUFFER 1 + +static int add_to_pagemap(unsigned long addr, u64 pfn, + struct pagemapread *pm) +{ + pm->buffer[pm->pos++] = pfn; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + unsigned long addr; + int err = 0; + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + if (err) + break; + } + return err; +} + +static u64 swap_pte_to_pagemap_entry(pte_t pte) +{ + swp_entry_t e = pte_to_swp_entry(pte); + return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +} + +static u64 pte_to_pagemap_entry(pte_t pte) +{ + u64 pme = 0; + if (is_swap_pte(pte)) + pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; + else if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; 
+} + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + struct pagemapread *pm = walk->private; + pte_t *pte; + int err = 0; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + /* find the first VMA at or above 'addr' */ + vma = find_vma(walk->mm, addr); + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + + /* check to see if we've left 'vma' behind + * and need a new, higher one */ + if (vma && (addr >= vma->vm_end)) + vma = find_vma(walk->mm, addr); + + /* check that 'vma' actually covers this address, + * and that it isn't a huge page vma */ + if (vma && (vma->vm_start <= addr) && + !is_vm_hugetlb_page(vma)) { + pte = pte_offset_map(pmd, addr); + pfn = pte_to_pagemap_entry(*pte); + /* unmap before userspace copy */ + pte_unmap(pte); + } + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +{ + u64 pme = 0; + if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + int err = 0; + u64 pfn; + + for (; addr != end; addr += PAGE_SIZE) { + int offset = (addr & ~hmask) >> PAGE_SHIFT; + pfn = huge_pte_to_pagemap_entry(*pte, offset); + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} +#endif /* HUGETLB_PAGE */ + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit entry + * consisting of the following: + * + * Bits 0-55 page frame number (PFN) if present + * Bits 0-4 swap type if swapped + * Bits 5-55 swap offset if swapped + * Bits 55-60 page shift (page size = 1<f_path.dentry->d_inode); + struct mm_struct *mm; + struct pagemapread pm; + int ret = -ESRCH; + struct mm_walk pagemap_walk = {}; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; + int copied = 0; + + if (!task) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) + goto out_task; + + ret = 0; + if (!count) + goto out_task; + + pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + ret = -ENOMEM; + if (!pm.buffer) + goto out_task; + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE + pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. 
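+ * Each pass of the loop below refills pm.buffer (at most pm.len bytes
+ * of 64-bit entries) and copies it to userspace before walking the
+ * next chunk of the address range.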
+ */ + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + down_read(&mm->mmap_sem); + ret = walk_page_range(start_vaddr, end, &pagemap_walk); + up_read(&mm->mmap_sem); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_mm; + } + copied += len; + buf += len; + count -= len; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + +out_mm: + mmput(mm); +out_free: + kfree(pm.buffer); +out_task: + put_task_struct(task); +out: + return ret; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, +}; +#endif /* CONFIG_PROC_PAGE_MONITOR */ + +#ifdef CONFIG_NUMA + +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) +{ + int count = page_mapcount(page); + + md->pages += nr_pages; + if (pte_dirty || PageDirty(page)) + md->dirty += nr_pages; + + if (PageSwapCache(page)) + md->swapcache += nr_pages; + + if (PageActive(page) || PageUnevictable(page)) + md->active += nr_pages; + + if (PageWriteback(page)) + md->writeback += nr_pages; + + if (PageAnon(page)) + md->anon += nr_pages; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + return NULL; + + return page; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(md->vma->anon_vma, pmd); + } else { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + + if (pmd_trans_unstable(pmd)) + return 0; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); + if (!page) + continue; + gather_stats(page, md, pte_dirty(*pte), 1); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk 
*walk) +{ + struct numa_maps *md; + struct page *page; + + if (pte_none(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte), 1); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + + if (m->count < m->size) + m->version = (vma != proc_priv->tail_vma) ? 
vma->vm_start : 0; + return 0; +} + +static const struct seq_operations proc_pid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map, +}; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_numa_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +const struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c new file mode 100644 index 00000000..980de547 --- /dev/null +++ b/fs/proc/task_nommu.c @@ -0,0 +1,271 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Logic: we've got two memory sums for each process, "shared", and + * "non-shared". Shared memory may get counted more than once, for + * each process that owns it. Non-shared memory is counted + * accurately. + */ +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } + + if (atomic_read(&mm->mm_count) > 1 || + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; + } else { + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; + } + } + + if (atomic_read(&mm->mm_count) > 1) + sbytes += kobjsize(mm); + else + bytes += kobjsize(mm); + + if (current->fs && current->fs->users > 1) + sbytes += kobjsize(current->fs); + else + bytes += kobjsize(current->fs); + + if (current->files && atomic_read(¤t->files->count) > 1) + sbytes += kobjsize(current->files); + else + bytes += kobjsize(current->files); + + if (current->sighand && atomic_read(¤t->sighand->count) > 1) + sbytes += kobjsize(current->sighand); + else + bytes += kobjsize(current->sighand); + + bytes += kobjsize(current); /* includes kernel stack */ + + seq_printf(m, + "Mem:\t%8lu bytes\n" + "Slack:\t%8lu bytes\n" + "Shared:\t%8lu bytes\n", + bytes, slack, sbytes); + + up_read(&mm->mmap_sem); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct rb_node *p; + unsigned long vsize = 0; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_end - vma->vm_start; + } + up_read(&mm->mmap_sem); + return vsize; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long size = kobjsize(mm); + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += 
kobjsize(region); + size += region->vm_end - region->vm_start; + } + } + + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; + up_read(&mm->mmap_sem); + size >>= PAGE_SHIFT; + size += *text + *data; + *resident = size; + return size; +} + +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + unsigned long long pgoff = 0; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + pad_len_spaces(m, len); + seq_path(m, &file->f_path, ""); + } else if (mm) { + if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + pad_len_spaces(m, len); + seq_puts(m, "[stack]"); + } + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display mapping lines for a particular process's /proc/pid/maps + */ +static int show_map(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct mm_struct *mm; + struct rb_node *p; + loff_t n = *pos; + + /* pin the task and mm whilst we play with them */ + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = mm_for_maps(priv->task); + if (!mm || IS_ERR(mm)) { + put_task_struct(priv->task); + priv->task = NULL; + return mm; + } + down_read(&mm->mmap_sem); + + /* start from the Nth VMA */ + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) + if (n-- == 0) + return p; + return NULL; +} + +static void m_stop(struct seq_file *m, void *_vml) +{ + struct proc_maps_private *priv = m->private; + + if (priv->task) { + struct mm_struct *mm = priv->task->mm; + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + } +} + +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +{ + struct rb_node *p = _p; + + (*pos)++; + return p ? 
rb_next(p) : NULL; +} + +static const struct seq_operations proc_pid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_maps_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c new file mode 100644 index 00000000..29166ecd --- /dev/null +++ b/fs/proc/uptime.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static int uptime_proc_show(struct seq_file *m, void *v) +{ + struct timespec uptime; + struct timespec idle; + cputime64_t idletime; + u64 nsec; + u32 rem; + int i; + + idletime = 0; + for_each_possible_cpu(i) + idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + + do_posix_clock_monotonic_gettime(&uptime); + monotonic_to_bootbased(&uptime); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), + (unsigned long) idle.tv_sec, + (idle.tv_nsec / (NSEC_PER_SEC / 100))); + return 0; +} + +static int uptime_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, uptime_proc_show, NULL); +} + +static const struct file_operations uptime_proc_fops = { + .open = uptime_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_uptime_init(void) +{ + proc_create("uptime", 0, NULL, &uptime_proc_fops); + return 0; +} +module_init(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c new file mode 100644 index 00000000..76817a60 --- /dev/null +++ b/fs/proc/version.c @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include +#include + +static int version_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); + return 0; +} + +static int version_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, version_proc_show, NULL); +} + +static const struct file_operations version_proc_fops = { + .open = version_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_version_init(void) +{ + proc_create("version", 0, NULL, &version_proc_fops); + return 0; +} +module_init(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c new file mode 100644 index 00000000..cd99bf55 --- /dev/null +++ b/fs/proc/vmcore.c @@ -0,0 +1,701 @@ +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. 
All rights reserved + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* List representing chunks of contiguous memory areas and their offsets in + * vmcore file. + */ +static LIST_HEAD(vmcore_list); + +/* Stores the pointer to the buffer containing kernel elf core headers. */ +static char *elfcorebuf; +static size_t elfcorebuf_sz; + +/* Total size of vmcore file. */ +static u64 vmcore_size; + +static struct proc_dir_entry *proc_vmcore = NULL; + +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (oldmem_pfn_is_ram) + return -EBUSY; + oldmem_pfn_is_ram = fn; + return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ + oldmem_pfn_is_ram = NULL; + wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ + int (*fn)(unsigned long pfn); + /* pfn is ram unless fn() checks pagetype */ + int ret = 1; + + /* + * Ask hypervisor if the pfn is really ram. + * A ballooned page contains no data and reading from such a page + * will cause high load in the hypervisor. + */ + fn = oldmem_pfn_is_ram; + if (fn) + ret = fn(pfn); + + return ret; +} + +/* Reads a page from the oldmem device from given offset. */ +static ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf) +{ + unsigned long pfn, offset; + size_t nr_bytes; + ssize_t read = 0, tmp; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + /* If pfn is not ram, return zeros for sparse dump files */ + if (pfn_is_ram(pfn) == 0) + memset(buf, 0, nr_bytes); + else { + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) + return tmp; + } + *ppos += nr_bytes; + count -= nr_bytes; + buf += nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + + return read; +} + +/* Maps vmcore file offset to respective physical address in memroy. */ +static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, + struct vmcore **m_ptr) +{ + struct vmcore *m; + u64 paddr; + + list_for_each_entry(m, vc_list, list) { + u64 start, end; + start = m->offset; + end = m->offset + m->size - 1; + if (offset >= start && offset <= end) { + paddr = m->paddr + offset - start; + *m_ptr = m; + return paddr; + } + } + *m_ptr = NULL; + return 0; +} + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. 
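+ * Reads are satisfied first from the kernel's cached copy of the ELF
+ * headers (elfcorebuf) and then from old memory via read_from_oldmem().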
+ */ +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0, tmp; + size_t tsz; + u64 start, nr_bytes; + struct vmcore *curr_m = NULL; + + if (buflen == 0 || *fpos >= vmcore_size) + return 0; + + /* trim buflen to not go beyond EOF */ + if (buflen > vmcore_size - *fpos) + buflen = vmcore_size - *fpos; + + /* Read ELF core header */ + if (*fpos < elfcorebuf_sz) { + tsz = elfcorebuf_sz - *fpos; + if (buflen < tsz) + tsz = buflen; + if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); + if (!curr_m) + return -EINVAL; + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + + while (buflen) { + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + if (start >= (curr_m->paddr + curr_m->size)) { + if (curr_m->list.next == &vmcore_list) + return acc; /*EOF*/ + curr_m = list_entry(curr_m->list.next, + struct vmcore, list); + start = curr_m->paddr; + } + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + } + return acc; +} + +static const struct file_operations proc_vmcore_operations = { + .read = read_vmcore, + .llseek = default_llseek, +}; + +static struct vmcore* __init get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 __init get_vmcore_size_elf64(char *elfptr) +{ + int i; + u64 size; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +static u64 __init get_vmcore_size_elf32(char *elfptr) +{ + int i; + u64 size; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +/* Merges all the PT_NOTE headers into one. 
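+ * Each PT_NOTE segment is read from old memory, its notes are sized,
+ * and the chunk is queued on vc_list; a single merged PT_NOTE program
+ * header then replaces the originals.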
*/ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr, *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf64_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Merges all the PT_NOTE headers into one. 
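+ * This is the Elf32 counterpart of merge_note_headers_elf64() above.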
*/ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr, *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf32_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int __init process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. 
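+ * so the exported header points at this chunk's location in the
+ * synthesized /proc/vmcore file instead of the offset recorded by
+ * the crashed kernel.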
*/ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +static int __init process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf64(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf64_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf32(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf32_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static int __init parse_crash_elf64_headers(void) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !vmcore_elf64_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. 
*/ + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf32_headers(void) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf_headers(void) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + printk(KERN_WARNING "Warning: Core image elf header" + " not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf64(elfcorebuf); + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf32(elfcorebuf); + } else { + printk(KERN_WARNING "Warning: Core image elf header is not" + " sane\n"); + return -EINVAL; + } + return 0; +} + +/* Init function for vmcore module. */ +static int __init vmcore_init(void) +{ + int rc = 0; + + /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + if (!(is_vmcore_usable())) + return rc; + rc = parse_crash_elf_headers(); + if (rc) { + printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + return rc; + } + + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + if (proc_vmcore) + proc_vmcore->size = vmcore_size; + return 0; +} +module_init(vmcore_init) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig new file mode 100644 index 00000000..8007ae7c --- /dev/null +++ b/fs/pstore/Kconfig @@ -0,0 +1,13 @@ +config PSTORE + bool "Persistent store support" + default n + help + This option enables generic access to platform level + persistent storage via "pstore" filesystem that can + be mounted as /dev/pstore. 
Only useful if you have + a platform level driver that registers with pstore to + provide the data, so you probably should just go say "Y" + (or "M") to a platform specific persistent store driver + (e.g. ACPI_APEI on X86) which will select this for you. + If you don't have a platform persistent store driver, + say N. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile new file mode 100644 index 00000000..760f4bce --- /dev/null +++ b/fs/pstore/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux pstorefs routines. +# + +obj-y += pstore.o + +pstore-objs += inode.o platform.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c new file mode 100644 index 00000000..977ed272 --- /dev/null +++ b/fs/pstore/inode.c @@ -0,0 +1,311 @@ +/* + * Persistent Storage - ramfs parts. + * + * Copyright (C) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define PSTORE_NAMELEN 64 + +struct pstore_private { + u64 id; + int (*erase)(u64); + ssize_t size; + char data[]; +}; + +static int pstore_file_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t pstore_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct pstore_private *ps = file->private_data; + + return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); +} + +static const struct file_operations pstore_file_operations = { + .open = pstore_file_open, + .read = pstore_file_read, + .llseek = default_llseek, +}; + +/* + * When a file is unlinked from our file system we call the + * platform driver to erase the record from persistent store. 
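+ * Deleting, say, a "dmesg-..." entry from the mounted filesystem therefore
+ * also drops that record from the backing store via the record's erase hook.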
+ */ +static int pstore_unlink(struct inode *dir, struct dentry *dentry) +{ + struct pstore_private *p = dentry->d_inode->i_private; + + p->erase(p->id); + + return simple_unlink(dir, dentry); +} + +static void pstore_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + +static const struct inode_operations pstore_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = pstore_unlink, +}; + +static struct inode *pstore_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_uid = inode->i_gid = 0; + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &pstore_file_operations; + break; + case S_IFDIR: + inode->i_op = &pstore_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +enum { + Opt_kmsg_bytes, Opt_err +}; + +static const match_table_t tokens = { + {Opt_kmsg_bytes, "kmsg_bytes=%u"}, + {Opt_err, NULL} +}; + +static void parse_options(char *options) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_kmsg_bytes: + if (!match_int(&args[0], &option)) + pstore_set_kmsg_bytes(option); + break; + } + } +} + +static int pstore_remount(struct super_block *sb, int *flags, char *data) +{ + parse_options(data); + + return 0; +} + +static const struct super_operations pstore_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = pstore_evict_inode, + .remount_fs = pstore_remount, + .show_options = generic_show_options, +}; + +static struct super_block *pstore_sb; + +int pstore_is_mounted(void) +{ + return pstore_sb != NULL; +} + +/* + * Make a regular file in the root directory of our file system. + * Load it up with "size" bytes of data from "buf". + * Set the mtime & ctime to the date that this record was originally stored. 
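+ * The file name encodes the record type, the backend name and the record id
+ * (see the switch on 'type' below); for example, a PSTORE_TYPE_DMESG record
+ * with id 1 from a backend named, say, "erst" appears as "dmesg-erst-1".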
+ */ +int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, + char *data, size_t size, + struct timespec time, int (*erase)(u64)) +{ + struct dentry *root = pstore_sb->s_root; + struct dentry *dentry; + struct inode *inode; + int rc; + char name[PSTORE_NAMELEN]; + struct pstore_private *private; + + rc = -ENOMEM; + inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); + if (!inode) + goto fail; + private = kmalloc(sizeof *private + size, GFP_KERNEL); + if (!private) + goto fail_alloc; + private->id = id; + private->erase = erase; + + switch (type) { + case PSTORE_TYPE_DMESG: + sprintf(name, "dmesg-%s-%lld", psname, id); + break; + case PSTORE_TYPE_MCE: + sprintf(name, "mce-%s-%lld", psname, id); + break; + case PSTORE_TYPE_UNKNOWN: + sprintf(name, "unknown-%s-%lld", psname, id); + break; + default: + sprintf(name, "type%d-%s-%lld", type, psname, id); + break; + } + + mutex_lock(&root->d_inode->i_mutex); + + rc = -ENOSPC; + dentry = d_alloc_name(root, name); + if (IS_ERR(dentry)) + goto fail_lockedalloc; + + memcpy(private->data, data, size); + inode->i_size = private->size = size; + + inode->i_private = private; + + if (time.tv_sec) + inode->i_mtime = inode->i_ctime = time; + + d_add(dentry, inode); + + mutex_unlock(&root->d_inode->i_mutex); + + return 0; + +fail_lockedalloc: + mutex_unlock(&root->d_inode->i_mutex); + kfree(private); +fail_alloc: + iput(inode); + +fail: + return rc; +} + +int pstore_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + pstore_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = PSTOREFS_MAGIC; + sb->s_op = &pstore_ops; + sb->s_time_gran = 1; + + parse_options(data); + + inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } + /* override ramfs "dir" options so we catch unlink(2) */ + inode->i_op = &pstore_dir_inode_operations; + + root = d_alloc_root(inode); + sb->s_root = root; + if (!root) { + err = -ENOMEM; + goto fail; + } + + pstore_get_records(); + + return 0; +fail: + iput(inode); + return err; +} + +static struct dentry *pstore_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, pstore_fill_super); +} + +static void pstore_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + pstore_sb = NULL; +} + +static struct file_system_type pstore_fs_type = { + .name = "pstore", + .mount = pstore_mount, + .kill_sb = pstore_kill_sb, +}; + +static int __init init_pstore_fs(void) +{ + return register_filesystem(&pstore_fs_type); +} +module_init(init_pstore_fs) + +MODULE_AUTHOR("Tony Luck "); +MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h new file mode 100644 index 00000000..8c9f23eb --- /dev/null +++ b/fs/pstore/internal.h @@ -0,0 +1,6 @@ +extern void pstore_set_kmsg_bytes(int); +extern void pstore_get_records(void); +extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, + char *data, size_t size, + struct timespec time, int (*erase)(u64)); +extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c new file mode 100644 index 00000000..f2c3ff20 --- /dev/null +++ b/fs/pstore/platform.c @@ -0,0 +1,207 @@ +/* + * Persistent Storage - platform driver interface parts. 
+ * + * Copyright (C) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * pstore_lock just protects "psinfo" during + * calls to pstore_register() + */ +static DEFINE_SPINLOCK(pstore_lock); +static struct pstore_info *psinfo; + +/* How much of the console log to snapshot */ +static unsigned long kmsg_bytes = 10240; + +void pstore_set_kmsg_bytes(int bytes) +{ + kmsg_bytes = bytes; +} + +/* Tag each group of saved records with a sequence number */ +static int oopscount; + +static char *reason_str[] = { + "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency" +}; + +/* + * callback from kmsg_dump. (s2,l2) has the most recently + * written bytes, older bytes are in (s1,l1). Save as much + * as we can from the end of the buffer. + */ +static void pstore_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + unsigned long s1_start, s2_start; + unsigned long l1_cpy, l2_cpy; + unsigned long size, total = 0; + char *dst, *why; + u64 id; + int hsize, part = 1; + + if (reason < ARRAY_SIZE(reason_str)) + why = reason_str[reason]; + else + why = "Unknown"; + + mutex_lock(&psinfo->buf_mutex); + oopscount++; + while (total < kmsg_bytes) { + dst = psinfo->buf; + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); + size = psinfo->bufsize - hsize; + dst += hsize; + + l2_cpy = min(l2, size); + l1_cpy = min(l1, size - l2_cpy); + + if (l1_cpy + l2_cpy == 0) + break; + + s2_start = l2 - l2_cpy; + s1_start = l1 - l1_cpy; + + memcpy(dst, s1 + s1_start, l1_cpy); + memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); + + id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); + if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, + psinfo->buf, hsize + l1_cpy + l2_cpy, + CURRENT_TIME, psinfo->erase); + l1 -= l1_cpy; + l2 -= l2_cpy; + total += l1_cpy + l2_cpy; + } + mutex_unlock(&psinfo->buf_mutex); +} + +static struct kmsg_dumper pstore_dumper = { + .dump = pstore_dump, +}; + +/* + * platform specific persistent storage driver registers with + * us here. If pstore is already mounted, call the platform + * read function right away to populate the file system. If not + * then the pstore mount code will call us later to fill out + * the file system. + * + * Register with kmsg_dump to save last part of console log on panic. 
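+ * For illustration only, a minimal sketch of what a backend supplies (all
+ * names here are made up; the structure itself is declared in the pstore
+ * header and its callbacks are used by pstore_dump() and
+ * pstore_get_records() in this file):
+ *
+ *	static struct pstore_info my_psi = {
+ *		.owner = THIS_MODULE,
+ *		.name  = "mybackend",
+ *		.open  = my_open,
+ *		.close = my_close,
+ *		.read  = my_read,   (fills id/type/time, returns record size)
+ *		.write = my_write,  (writes psi->buf, returns record id)
+ *		.erase = my_erase,
+ *	};
+ *	my_psi.buf = my_static_buf;
+ *	my_psi.bufsize = sizeof(my_static_buf);
+ *	rc = pstore_register(&my_psi);
+ *
+ * The buf, bufsize and buf_mutex fields are used by pstore_dump() and
+ * pstore_get_records(), so the backend sets them up before registering.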
+ */ +int pstore_register(struct pstore_info *psi) +{ + struct module *owner = psi->owner; + + spin_lock(&pstore_lock); + if (psinfo) { + spin_unlock(&pstore_lock); + return -EBUSY; + } + psinfo = psi; + spin_unlock(&pstore_lock); + + if (owner && !try_module_get(owner)) { + psinfo = NULL; + return -EINVAL; + } + + if (pstore_is_mounted()) + pstore_get_records(); + + kmsg_dump_register(&pstore_dumper); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_register); + +/* + * Read all the records from the persistent store. Create and + * file files in our filesystem. + */ +void pstore_get_records(void) +{ + struct pstore_info *psi = psinfo; + ssize_t size; + u64 id; + enum pstore_type_id type; + struct timespec time; + int failed = 0, rc; + + if (!psi) + return; + + mutex_lock(&psinfo->buf_mutex); + rc = psi->open(psi); + if (rc) + goto out; + + while ((size = psi->read(&id, &type, &time)) > 0) { + if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, + time, psi->erase)) + failed++; + } + psi->close(psi); +out: + mutex_unlock(&psinfo->buf_mutex); + + if (failed) + printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", + failed, psi->name); +} + +/* + * Call platform driver to write a record to the + * persistent store. + */ +int pstore_write(enum pstore_type_id type, char *buf, size_t size) +{ + u64 id; + + if (!psinfo) + return -ENODEV; + + if (size > psinfo->bufsize) + return -EFBIG; + + mutex_lock(&psinfo->buf_mutex); + memcpy(psinfo->buf, buf, size); + id = psinfo->write(type, size); + if (pstore_is_mounted()) + pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, + size, CURRENT_TIME, psinfo->erase); + mutex_unlock(&psinfo->buf_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_write); diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig new file mode 100644 index 00000000..5f608999 --- /dev/null +++ b/fs/qnx4/Kconfig @@ -0,0 +1,14 @@ +config QNX4FS_FS + tristate "QNX4 file system support (read only)" + depends on BLOCK + help + This is the file system used by the real-time operating systems + QNX 4 and QNX 6 (the latter is also called QNX RTP). + Further information is available at . + Say Y if you intend to mount QNX hard disks or floppies. + + To compile this file system support as a module, choose M here: the + module will be called qnx4. + + If you don't know whether you need it, then you don't need it: + answer N. diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile new file mode 100644 index 00000000..4a283b3f --- /dev/null +++ b/fs/qnx4/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux qnx4-filesystem routines. +# + +obj-$(CONFIG_QNX4FS_FS) += qnx4.o + +qnx4-objs := inode.o dir.o namei.o bitmap.o diff --git a/fs/qnx4/README b/fs/qnx4/README new file mode 100644 index 00000000..1f1e320d --- /dev/null +++ b/fs/qnx4/README @@ -0,0 +1,9 @@ + + This is a snapshot of the QNX4 filesystem for Linux. + Please send diffs and remarks to . + +Credits : + +Richard "Scuba" A. Frowijn +Frank "Jedi/Sector One" Denis +Anders Larsen (Maintainer) diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c new file mode 100644 index 00000000..22e0d60e --- /dev/null +++ b/fs/qnx4/bitmap.c @@ -0,0 +1,58 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 28-05-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : basic optimisations. + * 25-06-1998 by Frank Denis : qnx4_is_free, qnx4_set_bitmap, qnx4_bmap . + * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 
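+ * Note: qnx4_count_free_blocks() below walks the bitmap extent one block at
+ * a time; count_bits() counts the zero bits in each byte (8 - hweight8(b)),
+ * i.e. a set bit marks a block that is in use.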
+ */ + +#include +#include +#include "qnx4.h" + +static void count_bits(register const char *bmPart, register int size, + int *const tf) +{ + char b; + int tot = *tf; + + if (size > QNX4_BLOCK_SIZE) { + size = QNX4_BLOCK_SIZE; + } + do { + b = *bmPart++; + tot += 8 - hweight8(b); + size--; + } while (size != 0); + *tf = tot; +} + +unsigned long qnx4_count_free_blocks(struct super_block *sb) +{ + int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1; + int total = 0; + int total_free = 0; + int offset = 0; + int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size); + struct buffer_head *bh; + + while (total < size) { + if ((bh = sb_bread(sb, start + offset)) == NULL) { + printk(KERN_ERR "qnx4: I/O error in counting free blocks\n"); + break; + } + count_bits(bh->b_data, size - total, &total_free); + brelse(bh); + total += QNX4_BLOCK_SIZE; + offset++; + } + + return total_free; +} diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c new file mode 100644 index 00000000..7b032946 --- /dev/null +++ b/fs/qnx4/dir.c @@ -0,0 +1,85 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 28-05-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. + */ + +#include +#include "qnx4.h" + +static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int offset; + struct buffer_head *bh; + struct qnx4_inode_entry *de; + struct qnx4_link_info *le; + unsigned long blknum; + int ix, ino; + int size; + + QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); + QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); + + while (filp->f_pos < inode->i_size) { + blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); + bh = sb_bread(inode->i_sb, blknum); + if(bh==NULL) { + printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); + break; + } + ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; + while (ix < QNX4_INODES_PER_BLOCK) { + offset = ix * QNX4_DIR_ENTRY_SIZE; + de = (struct qnx4_inode_entry *) (bh->b_data + offset); + size = strlen(de->di_fname); + if (size) { + if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) + size = QNX4_SHORT_NAME_MAX; + else if ( size > QNX4_NAME_MAX ) + size = QNX4_NAME_MAX; + + if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); + if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) + ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; + else { + le = (struct qnx4_link_info*)de; + ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + QNX4_INODES_PER_BLOCK + + le->dl_inode_ndx; + } + if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { + brelse(bh); + goto out; + } + } + } + ix++; + filp->f_pos += QNX4_DIR_ENTRY_SIZE; + } + brelse(bh); + } +out: + return 0; +} + +const struct file_operations qnx4_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = qnx4_readdir, + .fsync = generic_file_fsync, +}; + +const struct inode_operations qnx4_dir_inode_operations = +{ + .lookup = qnx4_lookup, +}; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c new file mode 100644 index 00000000..2b064661 --- /dev/null +++ b/fs/qnx4/inode.c @@ -0,0 +1,504 @@ +/* + * QNX4 file system, Linux implementation. 
+ * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 01-06-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : Linux 2.1.99+ support, boot signature, misc. + * 30-06-1998 by Frank Denis : first step to write inodes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qnx4.h" + +#define QNX4_VERSION 4 +#define QNX4_BMNAME ".bitmap" + +static const struct super_operations qnx4_sops; + +static void qnx4_put_super(struct super_block *sb); +static struct inode *qnx4_alloc_inode(struct super_block *sb); +static void qnx4_destroy_inode(struct inode *inode); +static int qnx4_remount(struct super_block *sb, int *flags, char *data); +static int qnx4_statfs(struct dentry *, struct kstatfs *); + +static const struct super_operations qnx4_sops = +{ + .alloc_inode = qnx4_alloc_inode, + .destroy_inode = qnx4_destroy_inode, + .put_super = qnx4_put_super, + .statfs = qnx4_statfs, + .remount_fs = qnx4_remount, +}; + +static int qnx4_remount(struct super_block *sb, int *flags, char *data) +{ + struct qnx4_sb_info *qs; + + qs = qnx4_sb(sb); + qs->Version = QNX4_VERSION; + *flags |= MS_RDONLY; + return 0; +} + +static struct buffer_head *qnx4_getblk(struct inode *inode, int nr, + int create) +{ + struct buffer_head *result = NULL; + + if ( nr >= 0 ) + nr = qnx4_block_map( inode, nr ); + if (nr) { + result = sb_getblk(inode->i_sb, nr); + return result; + } + return NULL; +} + +struct buffer_head *qnx4_bread(struct inode *inode, int block, int create) +{ + struct buffer_head *bh; + + bh = qnx4_getblk(inode, block, create); + if (!bh || buffer_uptodate(bh)) { + return bh; + } + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) { + return bh; + } + brelse(bh); + + return NULL; +} + +static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) +{ + unsigned long phys; + + QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); + + phys = qnx4_block_map( inode, iblock ); + if ( phys ) { + // logical block is before EOF + map_bh(bh, inode->i_sb, phys); + } + return 0; +} + +unsigned long qnx4_block_map( struct inode *inode, long iblock ) +{ + int ix; + long offset, i_xblk; + unsigned long block = 0; + struct buffer_head *bh = NULL; + struct qnx4_xblk *xblk = NULL; + struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode); + u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); + + if ( iblock < le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size) ) { + // iblock is in the first extent. This is easy. + block = le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_blk) + iblock - 1; + } else { + // iblock is beyond first extent. We have to follow the extent chain. + i_xblk = le32_to_cpu(qnx4_inode->di_xblk); + offset = iblock - le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size); + ix = 0; + while ( --nxtnt > 0 ) { + if ( ix == 0 ) { + // read next xtnt block. + bh = sb_bread(inode->i_sb, i_xblk - 1); + if ( !bh ) { + QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); + return -EIO; + } + xblk = (struct qnx4_xblk*)bh->b_data; + if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { + QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); + return -EIO; + } + } + if ( offset < le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size) ) { + // got it! 
+ block = le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_blk) + offset - 1; + break; + } + offset -= le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size); + if ( ++ix >= xblk->xblk_num_xtnts ) { + i_xblk = le32_to_cpu(xblk->xblk_next_xblk); + ix = 0; + brelse( bh ); + bh = NULL; + } + } + if ( bh ) + brelse( bh ); + } + + QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); + return block; +} + +static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; + buf->f_bfree = qnx4_count_free_blocks(sb); + buf->f_bavail = buf->f_bfree; + buf->f_namelen = QNX4_NAME_MAX; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +/* + * Check the root directory of the filesystem to make sure + * it really _is_ a qnx4 filesystem, and to check the size + * of the directory entry. + */ +static const char *qnx4_checkroot(struct super_block *sb) +{ + struct buffer_head *bh; + struct qnx4_inode_entry *rootdir; + int rd, rl; + int i, j; + int found = 0; + + if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { + return "no qnx4 filesystem (no root dir)."; + } else { + QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); + rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + for (j = 0; j < rl; j++) { + bh = sb_bread(sb, rd + j); /* root dir, first block */ + if (bh == NULL) { + return "unable to read root entry."; + } + for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { + rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); + if (rootdir->di_fname != NULL) { + QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); + if (!strcmp(rootdir->di_fname, + QNX4_BMNAME)) { + found = 1; + qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); + if (!qnx4_sb(sb)->BitMap) { + brelse (bh); + return "not enough memory for bitmap inode"; + } + memcpy( qnx4_sb(sb)->BitMap, rootdir, sizeof( struct qnx4_inode_entry ) ); /* keep bitmap inode known */ + break; + } + } + } + brelse(bh); + if (found != 0) { + break; + } + } + if (found == 0) { + return "bitmap file not found."; + } + } + return NULL; +} + +static int qnx4_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh; + struct inode *root; + const char *errmsg; + struct qnx4_sb_info *qs; + int ret = -EINVAL; + + qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); + if (!qs) + return -ENOMEM; + s->s_fs_info = qs; + + sb_set_blocksize(s, QNX4_BLOCK_SIZE); + + /* Check the superblock signature. Since the qnx4 code is + dangerous, we should leave as quickly as possible + if we don't belong here... */ + bh = sb_bread(s, 1); + if (!bh) { + printk(KERN_ERR "qnx4: unable to read the superblock\n"); + goto outnobh; + } + if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { + if (!silent) + printk(KERN_ERR "qnx4: wrong fsid in superblock.\n"); + goto out; + } + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + qnx4_sb(s)->sb_buf = bh; + qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; + + + /* check before allocating dentries, inodes, .. 
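+ * qnx4_checkroot() verifies that the root directory name recorded in the
+ * superblock starts with '/' and that a ".bitmap" entry exists; a copy of
+ * that entry is cached in qnx4_sb(s)->BitMap for qnx4_statfs() and
+ * qnx4_count_free_blocks().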
*/ + errmsg = qnx4_checkroot(s); + if (errmsg != NULL) { + if (!silent) + printk(KERN_ERR "qnx4: %s\n", errmsg); + goto out; + } + + /* does root not have inode number QNX4_ROOT_INO ?? */ + root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); + if (IS_ERR(root)) { + printk(KERN_ERR "qnx4: get inode failed\n"); + ret = PTR_ERR(root); + goto out; + } + + ret = -ENOMEM; + s->s_root = d_alloc_root(root); + if (s->s_root == NULL) + goto outi; + + brelse(bh); + return 0; + + outi: + iput(root); + out: + brelse(bh); + outnobh: + kfree(qs); + s->s_fs_info = NULL; + return ret; +} + +static void qnx4_put_super(struct super_block *sb) +{ + struct qnx4_sb_info *qs = qnx4_sb(sb); + kfree( qs->BitMap ); + kfree( qs ); + sb->s_fs_info = NULL; + return; +} + +static int qnx4_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,qnx4_get_block, wbc); +} + +static int qnx4_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,qnx4_get_block); +} + +static int qnx4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host); + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + qnx4_get_block, + &qnx4_inode->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} +static sector_t qnx4_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,qnx4_get_block); +} +static const struct address_space_operations qnx4_aops = { + .readpage = qnx4_readpage, + .writepage = qnx4_writepage, + .write_begin = qnx4_write_begin, + .write_end = generic_write_end, + .bmap = qnx4_bmap +}; + +struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) +{ + struct buffer_head *bh; + struct qnx4_inode_entry *raw_inode; + int block; + struct qnx4_inode_entry *qnx4_inode; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + qnx4_inode = qnx4_raw_inode(inode); + inode->i_mode = 0; + + QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino)); + if (!ino) { + printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " + "out of range\n", + sb->s_id, ino); + iget_failed(inode); + return ERR_PTR(-EIO); + } + block = ino / QNX4_INODES_PER_BLOCK; + + if (!(bh = sb_bread(sb, block))) { + printk(KERN_ERR "qnx4: major problem: unable to read inode from dev " + "%s\n", sb->s_id); + iget_failed(inode); + return ERR_PTR(-EIO); + } + raw_inode = ((struct qnx4_inode_entry *) bh->b_data) + + (ino % QNX4_INODES_PER_BLOCK); + + inode->i_mode = le16_to_cpu(raw_inode->di_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); + inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); + inode->i_size = le32_to_cpu(raw_inode->di_size); + inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); + + memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); + if (S_ISREG(inode->i_mode)) { + inode->i_fop = 
&generic_ro_fops; + inode->i_mapping->a_ops = &qnx4_aops; + qnx4_i(inode)->mmu_private = inode->i_size; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &qnx4_dir_inode_operations; + inode->i_fop = &qnx4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &qnx4_aops; + qnx4_i(inode)->mmu_private = inode->i_size; + } else { + printk(KERN_ERR "qnx4: bad inode %lu on dev %s\n", + ino, sb->s_id); + iget_failed(inode); + brelse(bh); + return ERR_PTR(-EIO); + } + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +static struct kmem_cache *qnx4_inode_cachep; + +static struct inode *qnx4_alloc_inode(struct super_block *sb) +{ + struct qnx4_inode_info *ei; + ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void qnx4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); +} + +static void qnx4_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx4_i_callback); +} + +static void init_once(void *foo) +{ + struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", + sizeof(struct qnx4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (qnx4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(qnx4_inode_cachep); +} + +static struct dentry *qnx4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); +} + +static struct file_system_type qnx4_fs_type = { + .owner = THIS_MODULE, + .name = "qnx4", + .mount = qnx4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_qnx4_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + return err; + + err = register_filesystem(&qnx4_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n"); + return 0; +} + +static void __exit exit_qnx4_fs(void) +{ + unregister_filesystem(&qnx4_fs_type); + destroy_inodecache(); +} + +module_init(init_qnx4_fs) +module_exit(exit_qnx4_fs) +MODULE_LICENSE("GPL"); + diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c new file mode 100644 index 00000000..275327b5 --- /dev/null +++ b/fs/qnx4/namei.c @@ -0,0 +1,132 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 01-06-1998 by Richard Frowijn : first release. + * 21-06-1998 by Frank Denis : dcache support, fixed error codes. + * 04-07-1998 by Frank Denis : first step for rmdir/unlink. + */ + +#include +#include "qnx4.h" + + +/* + * check if the filename is correct. For some obscure reason, qnx writes a + * new file twice in the directory entry, first with all possible options at 0 + * and for a second time the way it is, they want us not to access the qnx + * filesystem when whe are using linux. 
+ */ +static int qnx4_match(int len, const char *name, + struct buffer_head *bh, unsigned long *offset) +{ + struct qnx4_inode_entry *de; + int namelen, thislen; + + if (bh == NULL) { + printk(KERN_WARNING "qnx4: matching unassigned buffer !\n"); + return 0; + } + de = (struct qnx4_inode_entry *) (bh->b_data + *offset); + *offset += QNX4_DIR_ENTRY_SIZE; + if ((de->di_status & QNX4_FILE_LINK) != 0) { + namelen = QNX4_NAME_MAX; + } else { + namelen = QNX4_SHORT_NAME_MAX; + } + /* "" means "." ---> so paths like "/usr/lib//libc.a" work */ + if (!len && (de->di_fname[0] == '.') && (de->di_fname[1] == '\0')) { + return 1; + } + thislen = strlen( de->di_fname ); + if ( thislen > namelen ) + thislen = namelen; + if (len != thislen) { + return 0; + } + if (strncmp(name, de->di_fname, len) == 0) { + if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) { + return 1; + } + } + return 0; +} + +static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, + const char *name, struct qnx4_inode_entry **res_dir, int *ino) +{ + unsigned long block, offset, blkofs; + struct buffer_head *bh; + + *res_dir = NULL; + if (!dir->i_sb) { + printk(KERN_WARNING "qnx4: no superblock on dir.\n"); + return NULL; + } + bh = NULL; + block = offset = blkofs = 0; + while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) { + if (!bh) { + bh = qnx4_bread(dir, blkofs, 0); + if (!bh) { + blkofs++; + continue; + } + } + *res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset); + if (qnx4_match(len, name, bh, &offset)) { + block = qnx4_block_map( dir, blkofs ); + *ino = block * QNX4_INODES_PER_BLOCK + + (offset / QNX4_DIR_ENTRY_SIZE) - 1; + return bh; + } + if (offset < bh->b_size) { + continue; + } + brelse(bh); + bh = NULL; + offset = 0; + blkofs++; + } + brelse(bh); + *res_dir = NULL; + return NULL; +} + +struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + int ino; + struct qnx4_inode_entry *de; + struct qnx4_link_info *lnk; + struct buffer_head *bh; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *foundinode = NULL; + + if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) + goto out; + /* The entry is linked, let's get the real info */ + if ((de->di_status & QNX4_FILE_LINK) == QNX4_FILE_LINK) { + lnk = (struct qnx4_link_info *) de; + ino = (le32_to_cpu(lnk->dl_inode_blk) - 1) * + QNX4_INODES_PER_BLOCK + + lnk->dl_inode_ndx; + } + brelse(bh); + + foundinode = qnx4_iget(dir->i_sb, ino); + if (IS_ERR(foundinode)) { + QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", + PTR_ERR(foundinode))); + return ERR_CAST(foundinode); + } +out: + d_add(dentry, foundinode); + + return NULL; +} diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h new file mode 100644 index 00000000..33a60858 --- /dev/null +++ b/fs/qnx4/qnx4.h @@ -0,0 +1,49 @@ +#include +#include + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx4_super_block *sb; /* our superblock */ + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern unsigned long qnx4_count_free_blocks(struct 
super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern struct buffer_head *qnx4_bread(struct inode *, int, int); + +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig new file mode 100644 index 00000000..880fd988 --- /dev/null +++ b/fs/quota/Kconfig @@ -0,0 +1,74 @@ +# +# Quota configuration +# + +config QUOTA + bool "Quota support" + select QUOTACTL + help + If you say Y here, you will be able to set per user limits for disk + usage (also called disk quotas). Currently, it works for the + ext2, ext3, and reiserfs file system. ext3 also supports journalled + quotas for which you don't need to run quotacheck(8) after an unclean + shutdown. + For further details, read the Quota mini-HOWTO, available from + , or the documentation provided + with the quota tools. Probably the quota support is only useful for + multi user systems. If unsure, say N. + +config QUOTA_NETLINK_INTERFACE + bool "Report quota messages through netlink interface" + depends on QUOTACTL && NET + help + If you say Y here, quota warnings (about exceeding softlimit, reaching + hardlimit, etc.) will be reported through netlink interface. If unsure, + say Y. + +config PRINT_QUOTA_WARNING + bool "Print quota warnings to console (OBSOLETE)" + depends on QUOTA + default y + help + If you say Y here, quota warnings (about exceeding softlimit, reaching + hardlimit, etc.) will be printed to the process' controlling terminal. + Note that this behavior is currently deprecated and may go away in + future. Please use notification via netlink socket instead. + +config QUOTA_DEBUG + bool "Additional quota sanity checks" + depends on QUOTA + default n + help + If you say Y here, quota subsystem will perform some additional + sanity checks of quota internal structures. If unsure, say N. + +# Generic support for tree structured quota files. Selected when needed. +config QUOTA_TREE + tristate + +config QFMT_V1 + tristate "Old quota format support" + depends on QUOTA + help + This quota format was (is) used by kernels earlier than 2.4.22. If + you have quota working and you don't want to convert to new quota + format say Y here. + +config QFMT_V2 + tristate "Quota format vfsv0 and vfsv1 support" + depends on QUOTA + select QUOTA_TREE + help + This config option enables kernel support for vfsv0 and vfsv1 quota + formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format + also supports 64-bit inode and block quota limits. If you need this + functionality say Y here. 
+ +config QUOTACTL + bool + default n + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile new file mode 100644 index 00000000..5f9e9e27 --- /dev/null +++ b/fs/quota/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_QUOTA) += dquot.o +obj-$(CONFIG_QFMT_V1) += quota_v1.o +obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTA_TREE) += quota_tree.o +obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c new file mode 100644 index 00000000..fb1892fe --- /dev/null +++ b/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include +#include +#include + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, 
&compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c new file mode 100644 index 00000000..5b572c89 --- /dev/null +++ b/fs/quota/dquot.c @@ -0,0 +1,2661 @@ +/* + * Implementation of the diskquota system for the LINUX operating system. QUOTA + * is implemented using the BSD system call interface as the means of + * communication with the user level. This file contains the generic routines + * called by the different filesystems on allocation of an inode or block. + * These routines take care of the administration needed to have a consistent + * diskquota tracking system. The ideas of both user and group quotas are based + * on the Melbourne quota system as used on BSD derived systems. The internal + * implementation is based on one of the several variants of the LINUX + * inode-subsystem with added complexity of the diskquota system. + * + * Author: Marco van Wieringen + * + * Fixes: Dmitry Gorodchanin , 11 Feb 96 + * + * Revised list management to avoid races + * -- Bill Hawes, , 9/98 + * + * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). + * As the consequence the locking was moved from dquot_decr_...(), + * dquot_incr_...() to calling functions. + * invalidate_dquots() now writes modified dquots. + * Serialized quota_off() and quota_on() for mount point. + * Fixed a few bugs in grow_dquots(). + * Fixed deadlock in write_dquot() - we no longer account quotas on + * quota files + * remove_dquot_ref() moved to inode.c - it now traverses through inodes + * add_dquot_ref() restarts after blocking + * Added check for bogus uid and fixed check for group in quotactl. + * Jan Kara, , sponsored by SuSE CR, 10-11/99 + * + * Used struct list_head instead of own list struct + * Invalidation of referenced dquots is no longer possible + * Improved free_dquots list management + * Quota and i_blocks are now updated in one place to avoid races + * Warnings are now delayed so we won't block in critical section + * Write updated not to require dquot lock + * Jan Kara, , 9/2000 + * + * Added dynamic quota structure allocation + * Jan Kara 12/2000 + * + * Rewritten quota interface. Implemented new quota format and + * formats registering. + * Jan Kara, , 2001,2002 + * + * New SMP locking. + * Jan Kara, , 10/2002 + * + * Added journalled quota support, fix lock inversion problems + * Jan Kara, , 2003,2004 + * + * (C) Copyright 1994 - 1997 Marco van Wieringen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../internal.h" /* ugh */ + +#include + +/* + * There are three quota SMP locks. dq_list_lock protects all lists with quotas + * and quota formats. + * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and + * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. + * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly + * in inode_add_bytes() and inode_sub_bytes(). 
dq_state_lock protects + * modifications of quota state (on quotaon and quotaoff) and readers who care + * about latest values take it as well. + * + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * dq_list_lock > dq_state_lock + * + * Note that some things (eg. sb pointer, type, id) doesn't change during + * the life of the dquot structure and so needn't to be protected by a lock + * + * Any operation working on dquots via inode pointers must hold dqptr_sem. If + * operation is just reading pointers from inode (or not using them at all) the + * read lock is enough. If pointers are altered function must hold write lock. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dqptr_sem. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. + * + * Each dquot has its dq_lock mutex. Locked dquots might not be referenced + * from inodes (dquot_alloc_space() and such don't check the dq_lock). + * Currently dquot is locked only when it is being read to memory (or space for + * it is being allocated) on the first dqget() and when it is being released on + * the last dqput(). The allocation and release oparations are serialized by + * the dq_lock and by checking the use count in dquot_release(). Write + * operations on dquots don't hold dq_lock as they copy data under dq_data_lock + * spinlock to internal buffers before writing. + * + * Lock ordering (including related VFS locks) is the following: + * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > + * dqio_mutex + * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > + * dqptr_sem. But filesystem has to count with the fact that functions such as + * dquot_alloc_space() acquire dqptr_sem and they usually have to be called + * from inside a transaction to keep filesystem consistency after a crash. Also + * filesystems usually want to do some IO on dquot from ->mark_dirty which is + * called with dqptr_sem held. + * i_mutex on quota files is special (it's below dqio_mutex) + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); +EXPORT_SYMBOL(dq_data_lock); + +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...) 
+{ + if (printk_ratelimit()) { + va_list args; + struct va_format vaf; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "Quota error (device %s): %s: %pV\n", + sb->s_id, func, &vaf); + + va_end(args); + } +} +EXPORT_SYMBOL(__quota_error); + +#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) +static char *quotatypes[] = INITQFNAMES; +#endif +static struct quota_format_type *quota_formats; /* List of registered formats */ +static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; + +/* SLAB cache for dquot structures */ +static struct kmem_cache *dquot_cachep; + +int register_quota_format(struct quota_format_type *fmt) +{ + spin_lock(&dq_list_lock); + fmt->qf_next = quota_formats; + quota_formats = fmt; + spin_unlock(&dq_list_lock); + return 0; +} +EXPORT_SYMBOL(register_quota_format); + +void unregister_quota_format(struct quota_format_type *fmt) +{ + struct quota_format_type **actqf; + + spin_lock(&dq_list_lock); + for (actqf = "a_formats; *actqf && *actqf != fmt; + actqf = &(*actqf)->qf_next) + ; + if (*actqf) + *actqf = (*actqf)->qf_next; + spin_unlock(&dq_list_lock); +} +EXPORT_SYMBOL(unregister_quota_format); + +static struct quota_format_type *find_quota_format(int id) +{ + struct quota_format_type *actqf; + + spin_lock(&dq_list_lock); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; + actqf = actqf->qf_next) + ; + if (!actqf || !try_module_get(actqf->qf_owner)) { + int qm; + + spin_unlock(&dq_list_lock); + + for (qm = 0; module_names[qm].qm_fmt_id && + module_names[qm].qm_fmt_id != id; qm++) + ; + if (!module_names[qm].qm_fmt_id || + request_module(module_names[qm].qm_mod_name)) + return NULL; + + spin_lock(&dq_list_lock); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; + actqf = actqf->qf_next) + ; + if (actqf && !try_module_get(actqf->qf_owner)) + actqf = NULL; + } + spin_unlock(&dq_list_lock); + return actqf; +} + +static void put_quota_format(struct quota_format_type *fmt) +{ + module_put(fmt->qf_owner); +} + +/* + * Dquot List Management: + * The quota code uses three lists for dquot management: the inuse_list, + * free_dquots, and dquot_hash[] array. A single dquot structure may be + * on all three lists, depending on its current state. + * + * All dquots are placed to the end of inuse_list when first created, and this + * list is used for invalidate operation, which must look at every dquot. + * + * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, + * and this list is searched whenever we need an available dquot. Dquots are + * removed from the list as soon as they are used again, and + * dqstats.free_dquots gives the number of dquots on the list. When + * dquot is invalidated it's completely released from memory. + * + * Dquots with a specific identity (device, type and id) are placed on + * one of the dquot_hash[] hash chains. The provides an efficient search + * mechanism to locate a specific dquot. 
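+ * A lookup therefore computes hashfn(sb, id, type) and has find_dquot()
+ * walk that dquot_hash[] chain comparing dq_sb, dq_id and dq_type, all
+ * under dq_list_lock (see the helpers below).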
+ */ + +static LIST_HEAD(inuse_list); +static LIST_HEAD(free_dquots); +static unsigned int dq_hash_bits, dq_hash_mask; +static struct hlist_head *dquot_hash; + +struct dqstats dqstats; +EXPORT_SYMBOL(dqstats); + +static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); + +static inline unsigned int +hashfn(const struct super_block *sb, unsigned int id, int type) +{ + unsigned long tmp; + + tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); + return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask; +} + +/* + * Following list functions expect dq_list_lock to be held + */ +static inline void insert_dquot_hash(struct dquot *dquot) +{ + struct hlist_head *head; + head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type); + hlist_add_head(&dquot->dq_hash, head); +} + +static inline void remove_dquot_hash(struct dquot *dquot) +{ + hlist_del_init(&dquot->dq_hash); +} + +static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, + unsigned int id, int type) +{ + struct hlist_node *node; + struct dquot *dquot; + + hlist_for_each (node, dquot_hash+hashent) { + dquot = hlist_entry(node, struct dquot, dq_hash); + if (dquot->dq_sb == sb && dquot->dq_id == id && + dquot->dq_type == type) + return dquot; + } + return NULL; +} + +/* Add a dquot to the tail of the free list */ +static inline void put_dquot_last(struct dquot *dquot) +{ + list_add_tail(&dquot->dq_free, &free_dquots); + dqstats_inc(DQST_FREE_DQUOTS); +} + +static inline void remove_free_dquot(struct dquot *dquot) +{ + if (list_empty(&dquot->dq_free)) + return; + list_del_init(&dquot->dq_free); + dqstats_dec(DQST_FREE_DQUOTS); +} + +static inline void put_inuse(struct dquot *dquot) +{ + /* We add to the back of inuse list so we don't have to restart + * when traversing this list and we block */ + list_add_tail(&dquot->dq_inuse, &inuse_list); + dqstats_inc(DQST_ALLOC_DQUOTS); +} + +static inline void remove_inuse(struct dquot *dquot) +{ + dqstats_dec(DQST_ALLOC_DQUOTS); + list_del(&dquot->dq_inuse); +} +/* + * End of list functions needing dq_list_lock + */ + +static void wait_on_dquot(struct dquot *dquot) +{ + mutex_lock(&dquot->dq_lock); + mutex_unlock(&dquot->dq_lock); +} + +static inline int dquot_dirty(struct dquot *dquot) +{ + return test_bit(DQ_MOD_B, &dquot->dq_flags); +} + +static inline int mark_dquot_dirty(struct dquot *dquot) +{ + return dquot->dq_sb->dq_op->mark_dirty(dquot); +} + +/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ +int dquot_mark_dquot_dirty(struct dquot *dquot) +{ + int ret = 1; + + /* If quota is dirty already, we don't have to acquire dq_list_lock */ + if (test_bit(DQ_MOD_B, &dquot->dq_flags)) + return 1; + + spin_lock(&dq_list_lock); + if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { + list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> + info[dquot->dq_type].dqi_dirty_list); + ret = 0; + } + spin_unlock(&dq_list_lock); + return ret; +} +EXPORT_SYMBOL(dquot_mark_dquot_dirty); + +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + 
dqput(dquot[cnt]); +} + +/* This function needs dq_list_lock */ +static inline int clear_dquot_dirty(struct dquot *dquot) +{ + if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) + return 0; + list_del_init(&dquot->dq_dirty); + return 1; +} + +void mark_info_dirty(struct super_block *sb, int type) +{ + set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags); +} +EXPORT_SYMBOL(mark_info_dirty); + +/* + * Read dquot from disk and alloc space for it + */ + +int dquot_acquire(struct dquot *dquot) +{ + int ret = 0, ret2 = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dquot->dq_lock); + mutex_lock(&dqopt->dqio_mutex); + if (!test_bit(DQ_READ_B, &dquot->dq_flags)) + ret = dqopt->ops[dquot->dq_type]->read_dqblk(dquot); + if (ret < 0) + goto out_iolock; + set_bit(DQ_READ_B, &dquot->dq_flags); + /* Instantiate dquot if needed */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { + ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + /* Write the info if needed */ + if (info_dirty(&dqopt->info[dquot->dq_type])) { + ret2 = dqopt->ops[dquot->dq_type]->write_file_info( + dquot->dq_sb, dquot->dq_type); + } + if (ret < 0) + goto out_iolock; + if (ret2 < 0) { + ret = ret2; + goto out_iolock; + } + } + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_iolock: + mutex_unlock(&dqopt->dqio_mutex); + mutex_unlock(&dquot->dq_lock); + return ret; +} +EXPORT_SYMBOL(dquot_acquire); + +/* + * Write dquot to disk + */ +int dquot_commit(struct dquot *dquot) +{ + int ret = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dqopt->dqio_mutex); + spin_lock(&dq_list_lock); + if (!clear_dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + goto out_sem; + } + spin_unlock(&dq_list_lock); + /* Inactive dquot can be only if there was error during read/init + * => we have better not writing it */ + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + else + ret = -EIO; +out_sem: + mutex_unlock(&dqopt->dqio_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_commit); + +/* + * Release dquot + */ +int dquot_release(struct dquot *dquot) +{ + int ret = 0, ret2 = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out_dqlock; + mutex_lock(&dqopt->dqio_mutex); + if (dqopt->ops[dquot->dq_type]->release_dqblk) { + ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot); + /* Write the info */ + if (info_dirty(&dqopt->info[dquot->dq_type])) { + ret2 = dqopt->ops[dquot->dq_type]->write_file_info( + dquot->dq_sb, dquot->dq_type); + } + if (ret >= 0) + ret = ret2; + } + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); + mutex_unlock(&dqopt->dqio_mutex); +out_dqlock: + mutex_unlock(&dquot->dq_lock); + return ret; +} +EXPORT_SYMBOL(dquot_release); + +void dquot_destroy(struct dquot *dquot) +{ + kmem_cache_free(dquot_cachep, dquot); +} +EXPORT_SYMBOL(dquot_destroy); + +static inline void do_destroy_dquot(struct dquot *dquot) +{ + dquot->dq_sb->dq_op->destroy_dquot(dquot); +} + +/* Invalidate all dquots on the list. Note that this function is called after + * quota is disabled and pointers from inodes removed so there cannot be new + * quota users. There can still be some users of quotas due to inodes being + * just deleted or pruned by prune_icache() (those are not attached to any + * list) or parallel quotactl call. We have to wait for such users. 
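+ * The wait takes an extra reference and sleeps on dq_wait_unused; the last
+ * dqput() from such a user wakes us up, and the scan restarts from the top
+ * of inuse_list so the now-unused dquot can finally be freed.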
+ */ +static void invalidate_dquots(struct super_block *sb, int type) +{ + struct dquot *dquot, *tmp; + +restart: + spin_lock(&dq_list_lock); + list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { + if (dquot->dq_sb != sb) + continue; + if (dquot->dq_type != type) + continue; + /* Wait for dquot users */ + if (atomic_read(&dquot->dq_count)) { + DEFINE_WAIT(wait); + + atomic_inc(&dquot->dq_count); + prepare_to_wait(&dquot->dq_wait_unused, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&dq_list_lock); + /* Once dqput() wakes us up, we know it's time to free + * the dquot. + * IMPORTANT: we rely on the fact that there is always + * at most one process waiting for dquot to free. + * Otherwise dq_count would be > 1 and we would never + * wake up. + */ + if (atomic_read(&dquot->dq_count) > 1) + schedule(); + finish_wait(&dquot->dq_wait_unused, &wait); + dqput(dquot); + /* At this moment dquot() need not exist (it could be + * reclaimed by prune_dqcache(). Hence we must + * restart. */ + goto restart; + } + /* + * Quota now has no users and it has been written on last + * dqput() + */ + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); + do_destroy_dquot(dquot); + } + spin_unlock(&dq_list_lock); +} + +/* Call callback for every active dquot on given filesystem */ +int dquot_scan_active(struct super_block *sb, + int (*fn)(struct dquot *dquot, unsigned long priv), + unsigned long priv) +{ + struct dquot *dquot, *old_dquot = NULL; + int ret = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + spin_lock(&dq_list_lock); + list_for_each_entry(dquot, &inuse_list, dq_inuse) { + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + continue; + if (dquot->dq_sb != sb) + continue; + /* Now we have active dquot so we can just increase use count */ + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + dqput(old_dquot); + old_dquot = dquot; + ret = fn(dquot, priv); + if (ret < 0) + goto out; + spin_lock(&dq_list_lock); + /* We are safe to continue now because our dquot could not + * be moved out of the inuse list while we hold the reference */ + } + spin_unlock(&dq_list_lock); +out: + dqput(old_dquot); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_scan_active); + +int dquot_quota_sync(struct super_block *sb, int type, int wait) +{ + struct list_head *dirty; + struct dquot *dquot; + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + + mutex_lock(&dqopt->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + spin_lock(&dq_list_lock); + dirty = &dqopt->info[cnt].dqi_dirty_list; + while (!list_empty(dirty)) { + dquot = list_first_entry(dirty, struct dquot, + dq_dirty); + /* Dirty and inactive can be only bad dquot... 
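+ * (its read or activation failed), so just clear the dirty bit and
+ * skip it.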
*/ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + clear_dquot_dirty(dquot); + continue; + } + /* Now we have active dquot from which someone is + * holding reference so we can safely just increase + * use count */ + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + sb->dq_op->write_dquot(dquot); + dqput(dquot); + spin_lock(&dq_list_lock); + } + spin_unlock(&dq_list_lock); + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) + && info_dirty(&dqopt->info[cnt])) + sb->dq_op->write_info(sb, cnt); + dqstats_inc(DQST_SYNCS); + mutex_unlock(&dqopt->dqonoff_mutex); + + if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + + return 0; +} +EXPORT_SYMBOL(dquot_quota_sync); + +/* Free unused dquots from cache */ +static void prune_dqcache(int count) +{ + struct list_head *head; + struct dquot *dquot; + + head = free_dquots.prev; + while (head != &free_dquots && count) { + dquot = list_entry(head, struct dquot, dq_free); + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); + do_destroy_dquot(dquot); + count--; + head = free_dquots.prev; + } +} + +/* + * This is called from kswapd when we think we need some + * more memory + */ +static int shrink_dqcache_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + + if (nr) { + spin_lock(&dq_list_lock); + prune_dqcache(nr); + spin_unlock(&dq_list_lock); + } + return ((unsigned) + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]) + /100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker dqcache_shrinker = { + .shrink = shrink_dqcache_memory, + .seeks = DEFAULT_SEEKS, +}; + +/* + * Put reference to dquot + * NOTE: If you change this function please check whether dqput_blocks() works right... + */ +void dqput(struct dquot *dquot) +{ + int ret; + + if (!dquot) + return; +#ifdef CONFIG_QUOTA_DEBUG + if (!atomic_read(&dquot->dq_count)) { + quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", + quotatypes[dquot->dq_type], dquot->dq_id); + BUG(); + } +#endif + dqstats_inc(DQST_DROPS); +we_slept: + spin_lock(&dq_list_lock); + if (atomic_read(&dquot->dq_count) > 1) { + /* We have more than one user... nothing to do */ + atomic_dec(&dquot->dq_count); + /* Releasing dquot during quotaoff phase? */ + if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && + atomic_read(&dquot->dq_count) == 1) + wake_up(&dquot->dq_wait_unused); + spin_unlock(&dq_list_lock); + return; + } + /* Need to release dquot? 
*/ + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + /* Commit dquot before releasing */ + ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota structure" + " (error %d). Quota may get out of sync!", + ret); + /* + * We clear dirty bit anyway, so that we avoid + * infinite loop here + */ + spin_lock(&dq_list_lock); + clear_dquot_dirty(dquot); + spin_unlock(&dq_list_lock); + } + goto we_slept; + } + /* Clear flag in case dquot was inactive (something bad happened) */ + clear_dquot_dirty(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + dquot->dq_sb->dq_op->release_dquot(dquot); + goto we_slept; + } + atomic_dec(&dquot->dq_count); +#ifdef CONFIG_QUOTA_DEBUG + /* sanity check */ + BUG_ON(!list_empty(&dquot->dq_free)); +#endif + put_dquot_last(dquot); + spin_unlock(&dq_list_lock); +} +EXPORT_SYMBOL(dqput); + +struct dquot *dquot_alloc(struct super_block *sb, int type) +{ + return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); +} +EXPORT_SYMBOL(dquot_alloc); + +static struct dquot *get_empty_dquot(struct super_block *sb, int type) +{ + struct dquot *dquot; + + dquot = sb->dq_op->alloc_dquot(sb, type); + if(!dquot) + return NULL; + + mutex_init(&dquot->dq_lock); + INIT_LIST_HEAD(&dquot->dq_free); + INIT_LIST_HEAD(&dquot->dq_inuse); + INIT_HLIST_NODE(&dquot->dq_hash); + INIT_LIST_HEAD(&dquot->dq_dirty); + init_waitqueue_head(&dquot->dq_wait_unused); + dquot->dq_sb = sb; + dquot->dq_type = type; + atomic_set(&dquot->dq_count, 1); + + return dquot; +} + +/* + * Get reference to dquot + * + * Locking is slightly tricky here. We are guarded from parallel quotaoff() + * destroying our dquot by: + * a) checking for quota flags under dq_list_lock and + * b) getting a reference to dquot before we release dq_list_lock + */ +struct dquot *dqget(struct super_block *sb, unsigned int id, int type) +{ + unsigned int hashent = hashfn(sb, id, type); + struct dquot *dquot = NULL, *empty = NULL; + + if (!sb_has_quota_active(sb, type)) + return NULL; +we_slept: + spin_lock(&dq_list_lock); + spin_lock(&dq_state_lock); + if (!sb_has_quota_active(sb, type)) { + spin_unlock(&dq_state_lock); + spin_unlock(&dq_list_lock); + goto out; + } + spin_unlock(&dq_state_lock); + + dquot = find_dquot(hashent, sb, id, type); + if (!dquot) { + if (!empty) { + spin_unlock(&dq_list_lock); + empty = get_empty_dquot(sb, type); + if (!empty) + schedule(); /* Try to wait for a moment... */ + goto we_slept; + } + dquot = empty; + empty = NULL; + dquot->dq_id = id; + /* all dquots go on the inuse_list */ + put_inuse(dquot); + /* hash it first so it can be found */ + insert_dquot_hash(dquot); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + } else { + if (!atomic_read(&dquot->dq_count)) + remove_free_dquot(dquot); + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_CACHE_HITS); + dqstats_inc(DQST_LOOKUPS); + } + /* Wait for dq_lock - after this we know that either dquot_release() is + * already finished or it will be canceled due to dq_count > 1 test */ + wait_on_dquot(dquot); + /* Read the dquot / allocate space in quota file */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && + sb->dq_op->acquire_dquot(dquot) < 0) { + dqput(dquot); + dquot = NULL; + goto out; + } +#ifdef CONFIG_QUOTA_DEBUG + BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? 
*/ +#endif +out: + if (empty) + do_destroy_dquot(empty); + + return dquot; +} +EXPORT_SYMBOL(dqget); + +static int dqinit_needed(struct inode *inode, int type) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return 0; + if (type != -1) + return !inode->i_dquot[type]; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (!inode->i_dquot[cnt]) + return 1; + return 0; +} + +/* This routine is guarded by dqonoff_mutex mutex */ +static void add_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode, *old_inode = NULL; +#ifdef CONFIG_QUOTA_DEBUG + int reserved = 0; +#endif + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); + continue; + } +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; +#endif + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + + iput(old_inode); + __dquot_initialize(inode, type); + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + old_inode = inode; + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); + +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + quota_error(sb, "Writes happened before quota was turned on " + "thus quota information is probably inconsistent. " + "Please run quotacheck(8)"); + } +#endif +} + +/* + * Return 0 if dqput() won't block. + * (note that 1 doesn't necessarily mean blocking) + */ +static inline int dqput_blocks(struct dquot *dquot) +{ + if (atomic_read(&dquot->dq_count) <= 1) + return 1; + return 0; +} + +/* + * Remove references to dquots from inode and add dquot to list for freeing + * if we have the last reference to dquot + * We can't race with anybody because we hold dqptr_sem for writing... + */ +static int remove_inode_dquot_ref(struct inode *inode, int type, + struct list_head *tofree_head) +{ + struct dquot *dquot = inode->i_dquot[type]; + + inode->i_dquot[type] = NULL; + if (dquot) { + if (dqput_blocks(dquot)) { +#ifdef CONFIG_QUOTA_DEBUG + if (atomic_read(&dquot->dq_count) != 1) + quota_error(inode->i_sb, "Adding dquot with " + "dq_count %d to dispose list", + atomic_read(&dquot->dq_count)); +#endif + spin_lock(&dq_list_lock); + /* As dquot must have currently users it can't be on + * the free list... */ + list_add(&dquot->dq_free, tofree_head); + spin_unlock(&dq_list_lock); + return 1; + } + else + dqput(dquot); /* We have guaranteed we won't block */ + } + return 0; +} + +/* + * Free list of dquots + * Dquots are removed from inodes and no new references can be got so we are + * the only ones holding reference + */ +static void put_dquot_list(struct list_head *tofree_head) +{ + struct list_head *act_head; + struct dquot *dquot; + + act_head = tofree_head->next; + while (act_head != tofree_head) { + dquot = list_entry(act_head, struct dquot, dq_free); + act_head = act_head->next; + /* Remove dquot from the list so we won't have problems... 
*/ + list_del_init(&dquot->dq_free); + dqput(dquot); + } +} + +static void remove_dquot_ref(struct super_block *sb, int type, + struct list_head *tofree_head) +{ + struct inode *inode; + int reserved = 0; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + /* + * We have to scan also I_NEW inodes because they can already + * have quota pointer initialized. Luckily, we need to touch + * only quota pointers and these have separate locking + * (dqptr_sem). + */ + if (!IS_NOQUOTA(inode)) { + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; + remove_inode_dquot_ref(inode, type, tofree_head); + } + } + spin_unlock(&inode_sb_list_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened after quota" + " was disabled thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } +#endif +} + +/* Gather all references from inodes and drop them */ +static void drop_dquot_ref(struct super_block *sb, int type) +{ + LIST_HEAD(tofree_head); + + if (sb->dq_op) { + down_write(&sb_dqopt(sb)->dqptr_sem); + remove_dquot_ref(sb, type, &tofree_head); + up_write(&sb_dqopt(sb)->dqptr_sem); + put_dquot_list(&tofree_head); + } +} + +static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_curinodes += number; +} + +static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_curspace += number; +} + +static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_rsvspace += number; +} + +/* + * Claim reserved quota space + */ +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } + dquot->dq_dqb.dqb_curspace += number; + dquot->dq_dqb.dqb_rsvspace -= number; +} + +static inline +void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } +} + +static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) +{ + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curinodes >= number) + dquot->dq_dqb.dqb_curinodes -= number; + else + dquot->dq_dqb.dqb_curinodes = 0; + if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + dquot->dq_dqb.dqb_itime = (time_t) 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); +} + +static void dquot_decr_space(struct dquot *dquot, qsize_t number) +{ + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curspace >= number) + dquot->dq_dqb.dqb_curspace -= number; + else + dquot->dq_dqb.dqb_curspace = 0; + if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time_t) 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); +} + +static int warning_issued(struct dquot *dquot, const int warntype) +{ + int flag = (warntype == QUOTA_NL_BHARDWARN || + warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : + ((warntype == QUOTA_NL_IHARDWARN || + warntype == QUOTA_NL_ISOFTLONGWARN) ? 
DQ_INODES_B : 0); + + if (!flag) + return 0; + return test_and_set_bit(flag, &dquot->dq_flags); +} + +#ifdef CONFIG_PRINT_QUOTA_WARNING +static int flag_print_warnings = 1; + +static int need_print_warning(struct dquot *dquot) +{ + if (!flag_print_warnings) + return 0; + + switch (dquot->dq_type) { + case USRQUOTA: + return current_fsuid() == dquot->dq_id; + case GRPQUOTA: + return in_group_p(dquot->dq_id); + } + return 0; +} + +/* Print warning to user which exceeded quota */ +static void print_warning(struct dquot *dquot, const int warntype) +{ + char *msg = NULL; + struct tty_struct *tty; + + if (warntype == QUOTA_NL_IHARDBELOW || + warntype == QUOTA_NL_ISOFTBELOW || + warntype == QUOTA_NL_BHARDBELOW || + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) + return; + + tty = get_current_tty(); + if (!tty) + return; + tty_write_message(tty, dquot->dq_sb->s_id); + if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) + tty_write_message(tty, ": warning, "); + else + tty_write_message(tty, ": write failed, "); + tty_write_message(tty, quotatypes[dquot->dq_type]); + switch (warntype) { + case QUOTA_NL_IHARDWARN: + msg = " file limit reached.\r\n"; + break; + case QUOTA_NL_ISOFTLONGWARN: + msg = " file quota exceeded too long.\r\n"; + break; + case QUOTA_NL_ISOFTWARN: + msg = " file quota exceeded.\r\n"; + break; + case QUOTA_NL_BHARDWARN: + msg = " block limit reached.\r\n"; + break; + case QUOTA_NL_BSOFTLONGWARN: + msg = " block quota exceeded too long.\r\n"; + break; + case QUOTA_NL_BSOFTWARN: + msg = " block quota exceeded.\r\n"; + break; + } + tty_write_message(tty, msg); + tty_kref_put(tty); +} +#endif + +/* + * Write warnings to the console and send warning messages over netlink. + * + * Note that this function can sleep. 
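+ *
+ * Typical call pattern, as used by the allocation paths below
+ * (illustrative only):
+ *
+ *	char warntype[MAXQUOTAS];
+ *	... fill warntype[] via check_idq()/check_bdq() ...
+ *	flush_warnings(inode->i_dquot, warntype);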
+ */ +static void flush_warnings(struct dquot *const *dquots, char *warntype) +{ + struct dquot *dq; + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + dq = dquots[i]; + if (dq && warntype[i] != QUOTA_NL_NOWARN && + !warning_issued(dq, warntype[i])) { +#ifdef CONFIG_PRINT_QUOTA_WARNING + print_warning(dq, warntype[i]); +#endif + quota_send_warning(dq->dq_type, dq->dq_id, + dq->dq_sb->s_dev, warntype[i]); + } + } +} + +static int ignore_hardlimit(struct dquot *dquot) +{ + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & V1_DQF_RSQUASH)); +} + +/* needs dq_data_lock */ +static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) +{ + qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; + + *warntype = QUOTA_NL_NOWARN; + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) + return 0; + + if (dquot->dq_dqb.dqb_ihardlimit && + newinodes > dquot->dq_dqb.dqb_ihardlimit && + !ignore_hardlimit(dquot)) { + *warntype = QUOTA_NL_IHARDWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_isoftlimit && + newinodes > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime && + get_seconds() >= dquot->dq_dqb.dqb_itime && + !ignore_hardlimit(dquot)) { + *warntype = QUOTA_NL_ISOFTLONGWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_isoftlimit && + newinodes > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime == 0) { + *warntype = QUOTA_NL_ISOFTWARN; + dquot->dq_dqb.dqb_itime = get_seconds() + + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; + } + + return 0; +} + +/* needs dq_data_lock */ +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) +{ + qsize_t tspace; + struct super_block *sb = dquot->dq_sb; + + *warntype = QUOTA_NL_NOWARN; + if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) + return 0; + + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + + space; + + if (dquot->dq_dqb.dqb_bhardlimit && + tspace > dquot->dq_dqb.dqb_bhardlimit && + !ignore_hardlimit(dquot)) { + if (!prealloc) + *warntype = QUOTA_NL_BHARDWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime && + get_seconds() >= dquot->dq_dqb.dqb_btime && + !ignore_hardlimit(dquot)) { + if (!prealloc) + *warntype = QUOTA_NL_BSOFTLONGWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime == 0) { + if (!prealloc) { + *warntype = QUOTA_NL_BSOFTWARN; + dquot->dq_dqb.dqb_btime = get_seconds() + + sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; + } + else + /* + * We don't allow preallocation to exceed softlimit so exceeding will + * be always printed + */ + return -EDQUOT; + } + + return 0; +} + +static int info_idq_free(struct dquot *dquot, qsize_t inodes) +{ + qsize_t newinodes; + + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || + !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) + return QUOTA_NL_NOWARN; + + newinodes = dquot->dq_dqb.dqb_curinodes - inodes; + if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_ISOFTBELOW; + if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && + newinodes < dquot->dq_dqb.dqb_ihardlimit) + return QUOTA_NL_IHARDBELOW; + return 
QUOTA_NL_NOWARN; +} + +static int info_bdq_free(struct dquot *dquot, qsize_t space) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_NOWARN; + + if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_BSOFTBELOW; + if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) + return QUOTA_NL_BHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int dquot_active(const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (IS_NOQUOTA(inode)) + return 0; + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); +} + +/* + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. + */ +static void __dquot_initialize(struct inode *inode, int type) +{ + unsigned int id = 0; + int cnt; + struct dquot *got[MAXQUOTAS]; + struct super_block *sb = inode->i_sb; + qsize_t rsv; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) + return; + + /* First get references to structures we might need. */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + got[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + switch (cnt) { + case USRQUOTA: + id = inode->i_uid; + break; + case GRPQUOTA: + id = inode->i_gid; + break; + } + got[cnt] = dqget(sb, id, cnt); + } + + down_write(&sb_dqopt(sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) + goto out_err; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(sb, cnt)) + continue; + /* We could race with quotaon or dqget() could have failed */ + if (!got[cnt]) + continue; + if (!inode->i_dquot[cnt]) { + inode->i_dquot[cnt] = got[cnt]; + got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(inode->i_dquot[cnt], rsv); + } + } +out_err: + up_write(&sb_dqopt(sb)->dqptr_sem); + /* Drop unused references */ + dqput_all(got); +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); +} +EXPORT_SYMBOL(dquot_initialize); + +/* + * Release all quotas referenced by inode + */ +static void __dquot_drop(struct inode *inode) +{ + int cnt; + struct dquot *put[MAXQUOTAS]; + + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + put[cnt] = inode->i_dquot[cnt]; + inode->i_dquot[cnt] = NULL; + } + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + dqput_all(put); +} + +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. 
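+ *
+ * A filesystem typically calls dquot_drop() from its inode eviction
+ * path, once the inode's space and inode usage have been returned via
+ * __dquot_free_space() and dquot_free_inode().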
+ */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); + +/* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_add_rsv_space); + +void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_claim_rsv_space); + +void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_sub_rsv_space); + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + +/* + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. 
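+ *
+ * Most filesystems do not call __dquot_alloc_space() directly but use
+ * the inline wrappers in <linux/quotaops.h> (dquot_alloc_space() and
+ * friends), which pass the appropriate DQUOT_SPACE_* flags.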
+ */ + +/* + * This operation can block, but only after everything is updated + */ +int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) +{ + int cnt, ret = 0; + char warntype[MAXQUOTAS]; + int warn = flags & DQUOT_SPACE_WARN; + int reserve = flags & DQUOT_SPACE_RESERVE; + int nofail = flags & DQUOT_SPACE_NOFAIL; + + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (!dquot_active(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warntype[cnt] = QUOTA_NL_NOWARN; + + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + ret = check_bdq(inode->i_dquot[cnt], number, !warn, + warntype+cnt); + if (ret && !nofail) { + spin_unlock(&dq_data_lock); + goto out_flush_warn; + } + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + if (reserve) + dquot_resv_space(inode->i_dquot[cnt], number); + else + dquot_incr_space(inode->i_dquot[cnt], number); + } + inode_incr_space(inode, number, reserve); + spin_unlock(&dq_data_lock); + + if (reserve) + goto out_flush_warn; + mark_all_dquot_dirty(inode->i_dquot); +out_flush_warn: + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: + return ret; +} +EXPORT_SYMBOL(__dquot_alloc_space); + +/* + * This operation can block, but only after everything is updated + */ +int dquot_alloc_inode(const struct inode *inode) +{ + int cnt, ret = 0; + char warntype[MAXQUOTAS]; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) + return 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warntype[cnt] = QUOTA_NL_NOWARN; + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + if (ret) + goto warn_put_all; + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + dquot_incr_inodes(inode->i_dquot[cnt], 1); + } + +warn_put_all: + spin_unlock(&dq_data_lock); + if (ret == 0) + mark_all_dquot_dirty(inode->i_dquot); + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + return ret; +} +EXPORT_SYMBOL(dquot_alloc_inode); + +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +{ + int cnt; + + if (!dquot_active(inode)) { + inode_claim_rsv_space(inode, number); + return 0; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + dquot_claim_reserved_space(inode->i_dquot[cnt], + number); + } + /* Update inode bytes */ + inode_claim_rsv_space(inode, number); + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(inode->i_dquot); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + return 0; +} +EXPORT_SYMBOL(dquot_claim_space_nodirty); + +/* + * This operation can block, but only after everything is updated + */ +void __dquot_free_space(struct inode *inode, qsize_t number, int flags) +{ + unsigned int cnt; + char warntype[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; + + /* First test 
before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) { + inode_decr_space(inode, number, reserve); + return; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); + } + inode_decr_space(inode, number, reserve); + spin_unlock(&dq_data_lock); + + if (reserve) + goto out_unlock; + mark_all_dquot_dirty(inode->i_dquot); +out_unlock: + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +} +EXPORT_SYMBOL(__dquot_free_space); + +/* + * This operation can block, but only after everything is updated + */ +void dquot_free_inode(const struct inode *inode) +{ + unsigned int cnt; + char warntype[MAXQUOTAS]; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) + return; + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); + } + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(inode->i_dquot); + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +} +EXPORT_SYMBOL(dquot_free_inode); + +/* + * Transfer the number of inode and blocks from one diskquota to an other. + * On success, dquot references in transfer_to are consumed and references + * to original dquots that need to be released are placed there. On failure, + * references are kept untouched. + * + * This operation can block, but only after everything is updated + * A transaction must be started when entering this function. + * + */ +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) +{ + qsize_t space, cur_space; + qsize_t rsv_space = 0; + struct dquot *transfer_from[MAXQUOTAS] = {}; + int cnt, ret = 0; + char is_valid[MAXQUOTAS] = {}; + char warntype_to[MAXQUOTAS]; + char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (IS_NOQUOTA(inode)) + return 0; + /* Initialize the arrays */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warntype_to[cnt] = QUOTA_NL_NOWARN; + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + return 0; + } + spin_lock(&dq_data_lock); + cur_space = inode_get_bytes(inode); + rsv_space = inode_get_rsv_space(inode); + space = cur_space + rsv_space; + /* Build the transfer_from list and check the limits */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + /* + * Skip changes for same uid or gid or for turned off quota-type. 
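+ * Such entries arrive here as NULL pointers: dquot_transfer() below
+ * only fills transfer_to[] for an id that actually changes.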
+ */ + if (!transfer_to[cnt]) + continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(inode->i_sb, cnt)) + continue; + is_valid[cnt] = 1; + transfer_from[cnt] = inode->i_dquot[cnt]; + ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + if (ret) + goto over_quota; + } + + /* + * Finally perform the needed transfer from transfer_from to transfer_to + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!is_valid[cnt]) + continue; + /* Due to IO error we might not have transfer_from[] structure */ + if (transfer_from[cnt]) { + warntype_from_inodes[cnt] = + info_idq_free(transfer_from[cnt], 1); + warntype_from_space[cnt] = + info_bdq_free(transfer_from[cnt], space); + dquot_decr_inodes(transfer_from[cnt], 1); + dquot_decr_space(transfer_from[cnt], cur_space); + dquot_free_reserved_space(transfer_from[cnt], + rsv_space); + } + + dquot_incr_inodes(transfer_to[cnt], 1); + dquot_incr_space(transfer_to[cnt], cur_space); + dquot_resv_space(transfer_to[cnt], rsv_space); + + inode->i_dquot[cnt] = transfer_to[cnt]; + } + spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + flush_warnings(transfer_to, warntype_to); + flush_warnings(transfer_from, warntype_from_inodes); + flush_warnings(transfer_from, warntype_from_space); + /* Pass back references to put */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (is_valid[cnt]) + transfer_to[cnt] = transfer_from[cnt]; + return 0; +over_quota: + spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(transfer_to, warntype_to); + return ret; +} +EXPORT_SYMBOL(__dquot_transfer); + +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ +int dquot_transfer(struct inode *inode, struct iattr *iattr) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct super_block *sb = inode->i_sb; + int ret; + + if (!dquot_active(inode)) + return 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) + transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) + transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA); + + ret = __dquot_transfer(inode, transfer_to); + dqput_all(transfer_to); + return ret; +} +EXPORT_SYMBOL(dquot_transfer); + +/* + * Write info of quota file to disk + */ +int dquot_commit_info(struct super_block *sb, int type) +{ + int ret; + struct quota_info *dqopt = sb_dqopt(sb); + + mutex_lock(&dqopt->dqio_mutex); + ret = dqopt->ops[type]->write_file_info(sb, type); + mutex_unlock(&dqopt->dqio_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_commit_info); + +/* + * Definitions of diskquota operations. + */ +const struct dquot_operations dquot_operations = { + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; +EXPORT_SYMBOL(dquot_operations); + +/* + * Generic helper for ->open on filesystems supporting disk quotas. 
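+ *
+ * A filesystem normally wires this up in its file_operations, for
+ * example (sketch only, "foo" is a placeholder filesystem):
+ *
+ *	const struct file_operations foo_file_operations = {
+ *		.open	= dquot_file_open,
+ *		...
+ *	};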
+ */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + dquot_initialize(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + +/* + * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) + */ +int dquot_disable(struct super_block *sb, int type, unsigned int flags) +{ + int cnt, ret = 0; + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *toputinode[MAXQUOTAS]; + + /* Cannot turn off usage accounting without turning off limits, or + * suspend quotas and simultaneously turn quotas off. */ + if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) + || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | + DQUOT_USAGE_ENABLED))) + return -EINVAL; + + /* We need to serialize quota_off() for device */ + mutex_lock(&dqopt->dqonoff_mutex); + + /* + * Skip everything if there's nothing to do. We have to do this because + * sometimes we are called when fill_super() failed and calling + * sync_fs() in such cases does no good. + */ + if (!sb_any_quota_loaded(sb)) { + mutex_unlock(&dqopt->dqonoff_mutex); + return 0; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + toputinode[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_loaded(sb, cnt)) + continue; + + if (flags & DQUOT_SUSPENDED) { + spin_lock(&dq_state_lock); + dqopt->flags |= + dquot_state_flag(DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); + } else { + spin_lock(&dq_state_lock); + dqopt->flags &= ~dquot_state_flag(flags, cnt); + /* Turning off suspended quotas? */ + if (!sb_has_quota_loaded(sb, cnt) && + sb_has_quota_suspended(sb, cnt)) { + dqopt->flags &= ~dquot_state_flag( + DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); + iput(dqopt->files[cnt]); + dqopt->files[cnt] = NULL; + continue; + } + spin_unlock(&dq_state_lock); + } + + /* We still have to keep quota loaded? */ + if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) + continue; + + /* Note: these are blocking operations */ + drop_dquot_ref(sb, cnt); + invalidate_dquots(sb, cnt); + /* + * Now all dquots should be invalidated, all writes done so we + * should be only users of the info. No locks needed. + */ + if (info_dirty(&dqopt->info[cnt])) + sb->dq_op->write_info(sb, cnt); + if (dqopt->ops[cnt]->free_file_info) + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + + toputinode[cnt] = dqopt->files[cnt]; + if (!sb_has_quota_loaded(sb, cnt)) + dqopt->files[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; + dqopt->info[cnt].dqi_bgrace = 0; + dqopt->ops[cnt] = NULL; + } + mutex_unlock(&dqopt->dqonoff_mutex); + + /* Skip syncing and setting flags if quota files are hidden */ + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + goto put_inodes; + + /* Sync the superblock so that buffers with quota data are written to + * disk (and so userspace sees correct data afterwards). */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + /* Now the quota files are just ordinary files and we can set the + * inode flags back. Moreover we discard the pagecache so that + * userspace sees the writes we did bypassing the pagecache. 
We + * must also discard the blockdev buffers so that we see the + * changes done by userspace on the next quotaon() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { + mutex_lock(&dqopt->dqonoff_mutex); + /* If quota was reenabled in the meantime, we have + * nothing to do */ + if (!sb_has_quota_loaded(sb, cnt)) { + mutex_lock_nested(&toputinode[cnt]->i_mutex, + I_MUTEX_QUOTA); + toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | + S_NOATIME | S_NOQUOTA); + truncate_inode_pages(&toputinode[cnt]->i_data, + 0); + mutex_unlock(&toputinode[cnt]->i_mutex); + mark_inode_dirty_sync(toputinode[cnt]); + } + mutex_unlock(&dqopt->dqonoff_mutex); + } + if (sb->s_bdev) + invalidate_bdev(sb->s_bdev); +put_inodes: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { + /* On remount RO, we keep the inode pointer so that we + * can reenable quota on the subsequent remount RW. We + * have to check 'flags' variable and not use sb_has_ + * function because another quotaon / quotaoff could + * change global state before we got here. We refuse + * to suspend quotas when there is pending delete on + * the quota file... */ + if (!(flags & DQUOT_SUSPENDED)) + iput(toputinode[cnt]); + else if (!toputinode[cnt]->i_nlink) + ret = -EBUSY; + } + return ret; +} +EXPORT_SYMBOL(dquot_disable); + +int dquot_quota_off(struct super_block *sb, int type) +{ + return dquot_disable(sb, type, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); +} +EXPORT_SYMBOL(dquot_quota_off); + +/* + * Turn quotas on on a device + */ + +/* + * Helper function to turn quotas on when we already have the inode of + * quota file and no quota information is loaded. + */ +static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + struct quota_format_type *fmt = find_quota_format(format_id); + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + int error; + int oldflags = -1; + + if (!fmt) + return -ESRCH; + if (!S_ISREG(inode->i_mode)) { + error = -EACCES; + goto out_fmt; + } + if (IS_RDONLY(inode)) { + error = -EROFS; + goto out_fmt; + } + if (!sb->s_op->quota_write || !sb->s_op->quota_read) { + error = -EINVAL; + goto out_fmt; + } + /* Usage always has to be set... */ + if (!(flags & DQUOT_USAGE_ENABLED)) { + error = -EINVAL; + goto out_fmt; + } + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); + invalidate_bdev(sb->s_bdev); + } + mutex_lock(&dqopt->dqonoff_mutex); + if (sb_has_quota_loaded(sb, type)) { + error = -EBUSY; + goto out_lock; + } + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. 
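+ * Setting S_NOQUOTA below is what makes IS_NOQUOTA() true for the
+ * quota file's own inode, so the quota file is never quota-accounted
+ * itself.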
*/ + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | + S_NOQUOTA); + inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + mutex_unlock(&inode->i_mutex); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); + } + + error = -EIO; + dqopt->files[type] = igrab(inode); + if (!dqopt->files[type]) + goto out_lock; + error = -EINVAL; + if (!fmt->qf_ops->check_quota_file(sb, type)) + goto out_file_init; + + dqopt->ops[type] = fmt->qf_ops; + dqopt->info[type].dqi_format = fmt; + dqopt->info[type].dqi_fmt_id = format_id; + INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); + mutex_lock(&dqopt->dqio_mutex); + error = dqopt->ops[type]->read_file_info(sb, type); + if (error < 0) { + mutex_unlock(&dqopt->dqio_mutex); + goto out_file_init; + } + mutex_unlock(&dqopt->dqio_mutex); + spin_lock(&dq_state_lock); + dqopt->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); + + add_dquot_ref(sb, type); + mutex_unlock(&dqopt->dqonoff_mutex); + + return 0; + +out_file_init: + dqopt->files[type] = NULL; + iput(inode); +out_lock: + if (oldflags != -1) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + /* Set the flags back (in the case of accidental quotaon() + * on a wrong file we don't want to mess up the flags) */ + inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); + inode->i_flags |= oldflags; + mutex_unlock(&inode->i_mutex); + } + mutex_unlock(&dqopt->dqonoff_mutex); +out_fmt: + put_quota_format(fmt); + + return error; +} + +/* Reenable quotas on remount RW */ +int dquot_resume(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode; + int ret = 0, cnt; + unsigned int flags; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) { + mutex_unlock(&dqopt->dqonoff_mutex); + continue; + } + inode = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + spin_lock(&dq_state_lock); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, + cnt); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); + spin_unlock(&dq_state_lock); + mutex_unlock(&dqopt->dqonoff_mutex); + + flags = dquot_generic_flag(flags, cnt); + ret = vfs_load_quota_inode(inode, cnt, + dqopt->info[cnt].dqi_fmt_id, flags); + iput(inode); + } + + return ret; +} +EXPORT_SYMBOL(dquot_resume); + +int dquot_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int error = security_quota_on(path->dentry); + if (error) + return error; + /* Quota file not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + error = -EXDEV; + else + error = vfs_load_quota_inode(path->dentry->d_inode, type, + format_id, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + return error; +} +EXPORT_SYMBOL(dquot_quota_on); + +/* + * More powerful function for turning on quotas allowing setting + * of individual quota flags + */ +int dquot_enable(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int ret = 0; + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + /* Just unsuspend quotas? */ + BUG_ON(flags & DQUOT_SUSPENDED); + + if (!flags) + return 0; + /* Just updating flags needed? */ + if (sb_has_quota_loaded(sb, type)) { + mutex_lock(&dqopt->dqonoff_mutex); + /* Now do a reliable test... 
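+ * - the sb_has_quota_loaded() check above ran without dqonoff_mutex
+ * and may have raced with a concurrent quotaoff.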
*/ + if (!sb_has_quota_loaded(sb, type)) { + mutex_unlock(&dqopt->dqonoff_mutex); + goto load_quota; + } + if (flags & DQUOT_USAGE_ENABLED && + sb_has_quota_usage_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + if (flags & DQUOT_LIMITS_ENABLED && + sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + spin_lock(&dq_state_lock); + sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); +out_lock: + mutex_unlock(&dqopt->dqonoff_mutex); + return ret; + } + +load_quota: + return vfs_load_quota_inode(inode, type, format_id, flags); +} +EXPORT_SYMBOL(dquot_enable); + +/* + * This function is used when filesystem needs to initialize quotas + * during mount time. + */ +int dquot_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type) +{ + struct dentry *dentry; + int error; + + mutex_lock(&sb->s_root->d_inode->i_mutex); + dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); + mutex_unlock(&sb->s_root->d_inode->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (!dentry->d_inode) { + error = -ENOENT; + goto out; + } + + error = security_quota_on(dentry); + if (!error) + error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + +out: + dput(dentry); + return error; +} +EXPORT_SYMBOL(dquot_quota_on_mount); + +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +/* Generic routine for getting common part of quota structure */ +static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + + memset(di, 0, sizeof(*di)); + di->d_version = FS_DQUOT_VERSION; + di->d_flags = dquot->dq_type == USRQUOTA ? 
+ FS_USER_QUOTA : FS_GROUP_QUOTA; + di->d_id = dquot->dq_id; + + spin_lock(&dq_data_lock); + di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); + di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); + di->d_ino_hardlimit = dm->dqb_ihardlimit; + di->d_ino_softlimit = dm->dqb_isoftlimit; + di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_icount = dm->dqb_curinodes; + di->d_btimer = dm->dqb_btime; + di->d_itimer = dm->dqb_itime; + spin_unlock(&dq_data_lock); +} + +int dquot_get_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *di) +{ + struct dquot *dquot; + + dquot = dqget(sb, id, type); + if (!dquot) + return -ESRCH; + do_get_dqblk(dquot, di); + dqput(dquot); + + return 0; +} +EXPORT_SYMBOL(dquot_get_dqblk); + +#define VFS_FS_DQ_MASK \ + (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ + FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ + FS_DQ_BTIMER | FS_DQ_ITIMER) + +/* Generic routine for setting common part of quota structure */ +static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + int check_blim = 0, check_ilim = 0; + struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + + if (di->d_fieldmask & ~VFS_FS_DQ_MASK) + return -EINVAL; + + if (((di->d_fieldmask & FS_DQ_BSOFT) && + (di->d_blk_softlimit > dqi->dqi_maxblimit)) || + ((di->d_fieldmask & FS_DQ_BHARD) && + (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || + ((di->d_fieldmask & FS_DQ_ISOFT) && + (di->d_ino_softlimit > dqi->dqi_maxilimit)) || + ((di->d_fieldmask & FS_DQ_IHARD) && + (di->d_ino_hardlimit > dqi->dqi_maxilimit))) + return -ERANGE; + + spin_lock(&dq_data_lock); + if (di->d_fieldmask & FS_DQ_BCOUNT) { + dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_BSOFT) + dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); + if (di->d_fieldmask & FS_DQ_BHARD) + dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); + if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_ICOUNT) { + dm->dqb_curinodes = di->d_icount; + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_ISOFT) + dm->dqb_isoftlimit = di->d_ino_softlimit; + if (di->d_fieldmask & FS_DQ_IHARD) + dm->dqb_ihardlimit = di->d_ino_hardlimit; + if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_BTIMER) { + dm->dqb_btime = di->d_btimer; + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_ITIMER) { + dm->dqb_itime = di->d_itimer; + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + } + + if (check_blim) { + if (!dm->dqb_bsoftlimit || + dm->dqb_curspace < dm->dqb_bsoftlimit) { + dm->dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) + /* Set grace only if user hasn't provided his own... */ + dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; + } + if (check_ilim) { + if (!dm->dqb_isoftlimit || + dm->dqb_curinodes < dm->dqb_isoftlimit) { + dm->dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) + /* Set grace only if user hasn't provided his own... 
*/ + dm->dqb_itime = get_seconds() + dqi->dqi_igrace; + } + if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || + dm->dqb_isoftlimit) + clear_bit(DQ_FAKE_B, &dquot->dq_flags); + else + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + mark_dquot_dirty(dquot); + + return 0; +} + +int dquot_set_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *di) +{ + struct dquot *dquot; + int rc; + + dquot = dqget(sb, id, type); + if (!dquot) { + rc = -ESRCH; + goto out; + } + rc = do_set_dqblk(dquot, di); + dqput(dquot); +out: + return rc; +} +EXPORT_SYMBOL(dquot_set_dqblk); + +/* Generic routine for getting common part of quota file information */ +int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +{ + struct mem_dqinfo *mi; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) { + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return -ESRCH; + } + mi = sb_dqopt(sb)->info + type; + spin_lock(&dq_data_lock); + ii->dqi_bgrace = mi->dqi_bgrace; + ii->dqi_igrace = mi->dqi_igrace; + ii->dqi_flags = mi->dqi_flags & DQF_MASK; + ii->dqi_valid = IIF_ALL; + spin_unlock(&dq_data_lock); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; +} +EXPORT_SYMBOL(dquot_get_dqinfo); + +/* Generic routine for setting common part of quota file information */ +int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +{ + struct mem_dqinfo *mi; + int err = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) { + err = -ESRCH; + goto out; + } + mi = sb_dqopt(sb)->info + type; + spin_lock(&dq_data_lock); + if (ii->dqi_valid & IIF_BGRACE) + mi->dqi_bgrace = ii->dqi_bgrace; + if (ii->dqi_valid & IIF_IGRACE) + mi->dqi_igrace = ii->dqi_igrace; + if (ii->dqi_valid & IIF_FLAGS) + mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | + (ii->dqi_flags & DQF_MASK); + spin_unlock(&dq_data_lock); + mark_info_dirty(sb, type); + /* Force write to disk */ + sb->dq_op->write_info(sb, type); +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return err; +} +EXPORT_SYMBOL(dquot_set_dqinfo); + +const struct quotactl_ops dquot_quotactl_ops = { + .quota_on = dquot_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_ops); + +static int do_proc_dqstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int type = (int *)table->data - dqstats.stat; + + /* Update global table */ + dqstats.stat[type] = + percpu_counter_sum_positive(&dqstats.counter[type]); + return proc_dointvec(table, write, buffer, lenp, ppos); +} + +static ctl_table fs_dqstats_table[] = { + { + .procname = "lookups", + .data = &dqstats.stat[DQST_LOOKUPS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "drops", + .data = &dqstats.stat[DQST_DROPS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "reads", + .data = &dqstats.stat[DQST_READS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "writes", + .data = &dqstats.stat[DQST_WRITES], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "cache_hits", + .data = &dqstats.stat[DQST_CACHE_HITS], + .maxlen = sizeof(int), + .mode = 0444, + 
.proc_handler = do_proc_dqstats, + }, + { + .procname = "allocated_dquots", + .data = &dqstats.stat[DQST_ALLOC_DQUOTS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "free_dquots", + .data = &dqstats.stat[DQST_FREE_DQUOTS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "syncs", + .data = &dqstats.stat[DQST_SYNCS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, +#ifdef CONFIG_PRINT_QUOTA_WARNING + { + .procname = "warnings", + .data = &flag_print_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { }, +}; + +static ctl_table fs_table[] = { + { + .procname = "quota", + .mode = 0555, + .child = fs_dqstats_table, + }, + { }, +}; + +static ctl_table sys_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { }, +}; + +static int __init dquot_init(void) +{ + int i, ret; + unsigned long nr_hash, order; + + printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); + + register_sysctl_table(sys_table); + + dquot_cachep = kmem_cache_create("dquot", + sizeof(struct dquot), sizeof(unsigned long) * 4, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + NULL); + + order = 0; + dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order); + if (!dquot_hash) + panic("Cannot create dquot hash table"); + + for (i = 0; i < _DQST_DQSTAT_LAST; i++) { + ret = percpu_counter_init(&dqstats.counter[i], 0); + if (ret) + panic("Cannot create dquot stat counters"); + } + + /* Find power-of-two hlist_heads which can fit into allocation */ + nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); + dq_hash_bits = 0; + do { + dq_hash_bits++; + } while (nr_hash >> dq_hash_bits); + dq_hash_bits--; + + nr_hash = 1UL << dq_hash_bits; + dq_hash_mask = nr_hash - 1; + for (i = 0; i < nr_hash; i++) + INIT_HLIST_HEAD(dquot_hash + i); + + printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + register_shrinker(&dqcache_shrinker); + + return 0; +} +module_init(dquot_init); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c new file mode 100644 index 00000000..d67908b4 --- /dev/null +++ b/fs/quota/netlink.c @@ -0,0 +1,96 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. 
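+ *
+ * msg_size above covers the four u32 attributes (quota type, warning
+ * type, device major and minor) and the two u64 attributes (the id
+ * that exceeded the limit and the id that caused the allocation) put
+ * into the message below.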
*/ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c new file mode 100644 index 00000000..10b6be3c --- /dev/null +++ b/fs/quota/quota.c @@ -0,0 +1,375 @@ +/* + * Quota code necessary even when VFS quota support is not compiled + * into the kernel. The interesting stuff is over in dquot.c, here + * we have symbols for initial quotactl(2) handling, the sysctl(2) + * variables, etc - things needed even when quota support disabled. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) +{ + switch (cmd) { + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) + break; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return security_quotactl(cmd, type, id, sb); +} + +static void quota_sync_one(struct super_block *sb, void *arg) +{ + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, *(int *)arg, 1); +} + +static int quota_sync_all(int type) +{ + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (!ret) + iterate_supers(quota_sync_one, &type); + return ret; +} + +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + struct path *path) +{ + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + return -ENOSYS; + if (sb->s_qcop->quota_on_meta) + return sb->s_qcop->quota_on_meta(sb, type, id); + if (IS_ERR(path)) + return PTR_ERR(path); + return sb->s_qcop->quota_on(sb, type, id, path); +} + +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; + + down_read(&sb_dqopt(sb)->dqptr_sem); + if (!sb_has_quota_active(sb, type)) { + up_read(&sb_dqopt(sb)->dqptr_sem); + return -ESRCH; + } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + up_read(&sb_dqopt(sb)->dqptr_sem); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} + +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + int ret; + + if (!sb->s_qcop->get_info) + return -ENOSYS; + ret = sb->s_qcop->get_info(sb, type, &info); + if (!ret && copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return ret; +} + +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + if (!sb->s_qcop->set_info) + return -ENOSYS; + return sb->s_qcop->set_info(sb, type, &info); +} + +static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) +{ + dst->dqb_bhardlimit = src->d_blk_hardlimit; + dst->dqb_bsoftlimit = src->d_blk_softlimit; + dst->dqb_curspace = src->d_bcount; + dst->dqb_ihardlimit = src->d_ino_hardlimit; + dst->dqb_isoftlimit = src->d_ino_softlimit; + dst->dqb_curinodes = src->d_icount; + dst->dqb_btime = src->d_btimer; + dst->dqb_itime = src->d_itimer; + dst->dqb_valid = QIF_ALL; +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct if_dqblk idq; + int ret; + + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); + if (ret) + return ret; + copy_to_if_dqblk(&idq, &fdq); + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; +} + +static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) +{ + dst->d_blk_hardlimit = src->dqb_bhardlimit; + dst->d_blk_softlimit = src->dqb_bsoftlimit; + dst->d_bcount = src->dqb_curspace; + dst->d_ino_hardlimit = 
src->dqb_ihardlimit; + dst->d_ino_softlimit = src->dqb_isoftlimit; + dst->d_icount = src->dqb_curinodes; + dst->d_btimer = src->dqb_btime; + dst->d_itimer = src->dqb_itime; + + dst->d_fieldmask = 0; + if (src->dqb_valid & QIF_BLIMITS) + dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; + if (src->dqb_valid & QIF_SPACE) + dst->d_fieldmask |= FS_DQ_BCOUNT; + if (src->dqb_valid & QIF_ILIMITS) + dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; + if (src->dqb_valid & QIF_INODES) + dst->d_fieldmask |= FS_DQ_ICOUNT; + if (src->dqb_valid & QIF_BTIME) + dst->d_fieldmask |= FS_DQ_BTIMER; + if (src->dqb_valid & QIF_ITIME) + dst->d_fieldmask |= FS_DQ_ITIMER; +} + +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + copy_from_if_dqblk(&fdq, &idq); + return sb->s_qcop->set_dqblk(sb, type, id, &fdq); +} + +static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->set_xstate) + return -ENOSYS; + return sb->s_qcop->set_xstate(sb, flags, cmd); +} + +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; + + if (!sb->s_qcop->get_xstate) + return -ENOSYS; + ret = sb->s_qcop->get_xstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} + +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + return sb->s_qcop->set_dqblk(sb, type, id, &fdq); +} + +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + int ret; + + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); + if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; +} + +/* Copy parameters and call proper function */ +static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr, struct path *path) +{ + int ret; + + if (type >= (XQM_COMMAND(cmd) ? 
XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + if (!sb->s_qcop) + return -ENOSYS; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + + switch (cmd) { + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, path); + case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + return sb->s_qcop->quota_off(sb, type); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + return sb->s_qcop->quota_sync(sb, type, 1); + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + return quota_setxstate(sb, cmd, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + /* caller already holds s_umount */ + if (sb->s_flags & MS_RDONLY) + return -EROFS; + writeback_inodes_sb(sb); + return 0; + default: + return -EINVAL; + } +} + +/* + * look up a superblock on which quota ops will be performed + * - use the name of a block device to find the superblock thereon + */ +static struct super_block *quotactl_block(const char __user *special) +{ +#ifdef CONFIG_BLOCK + struct block_device *bdev; + struct super_block *sb; + char *tmp = getname(special); + + if (IS_ERR(tmp)) + return ERR_CAST(tmp); + bdev = lookup_bdev(tmp); + putname(tmp); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + sb = get_super(bdev); + bdput(bdev); + if (!sb) + return ERR_PTR(-ENODEV); + + return sb; +#else + return ERR_PTR(-ENODEV); +#endif +} + +/* + * This is the system call interface. This communicates with + * the user-level programs. Currently this only supports diskquota + * calls. Maybe we need to add the process quotas etc. in the future, + * but we probably should use rlimits for that. + */ +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) +{ + uint cmds, type; + struct super_block *sb = NULL; + struct path path, *pathp = NULL; + int ret; + + cmds = cmd >> SUBCMDSHIFT; + type = cmd & SUBCMDMASK; + + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; + } + + /* + * Path for quotaon has to be resolved before grabbing superblock + * because that gets s_umount sem which is also possibly needed by path + * resolution (think about autofs) and thus deadlocks could arise. 
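For context, this syscall is what the userspace quota tools drive. A minimal sketch of querying a user's limits (not part of this patch; it assumes a glibc <sys/quota.h> exposing the generic Q_GETQUOTA command and a struct dqblk matching the if_dqblk layout converted in quota_getquota() above; the device path and uid are illustrative):

#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>	/* quotactl(), QCMD(), Q_GETQUOTA, USRQUOTA */

int main(void)
{
	struct dqblk dq;

	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sda1", 1000,
		     (caddr_t)&dq) != 0) {
		perror("quotactl");
		return 1;
	}
	printf("space used: %llu bytes, block soft limit: %llu\n",
	       (unsigned long long)dq.dqb_curspace,
	       (unsigned long long)dq.dqb_bsoftlimit);
	return 0;
}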
+ */ + if (cmds == Q_QUOTAON) { + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (ret) + pathp = ERR_PTR(ret); + else + pathp = &path; + } + + sb = quotactl_block(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + ret = do_quotactl(sb, type, cmds, id, addr, pathp); + + drop_super(sb); + if (pathp && !IS_ERR(pathp)) + path_put(pathp); + return ret; +} diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c new file mode 100644 index 00000000..e41c1bec --- /dev/null +++ b/fs/quota/quota_tree.c @@ -0,0 +1,659 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quota_tree.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota trie support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_QT_PARANOIA + +static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) +{ + unsigned int epb = info->dqi_usable_bs >> 2; + + depth = info->dqi_qtree_depth - depth - 1; + while (depth--) + id /= epb; + return id % epb; +} + +/* Number of entries in one blocks */ +static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) +{ + return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) + / info->dqi_entry_size; +} + +static char *getdqbuf(size_t size) +{ + char *buf = kmalloc(size, GFP_NOFS); + if (!buf) + printk(KERN_WARNING + "VFS: Not enough memory for quota buffers.\n"); + return buf; +} + +static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) +{ + struct super_block *sb = info->dqi_sb; + + memset(buf, 0, info->dqi_usable_bs); + return sb->s_op->quota_read(sb, info->dqi_type, buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) +{ + struct super_block *sb = info->dqi_sb; + ssize_t ret; + + ret = sb->s_op->quota_write(sb, info->dqi_type, buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + if (ret != info->dqi_usable_bs) { + quota_error(sb, "dquota write failed"); + if (ret >= 0) + ret = -EIO; + } + return ret; +} + +/* Remove empty block from list and return it */ +static int get_free_dqblk(struct qtree_mem_dqinfo *info) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int ret, blk; + + if (!buf) + return -ENOMEM; + if (info->dqi_free_blk) { + blk = info->dqi_free_blk; + ret = read_blk(info, blk, buf); + if (ret < 0) + goto out_buf; + info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, info->dqi_usable_bs); + /* Assure block allocation... 
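To make the tree addressing in get_index() above concrete: with the v2 defaults of 1024-byte blocks (256 four-byte references per block) and a four-level tree, the id is simply decomposed into base-256 digits, most significant digit at depth 0. A standalone sketch of that walk (not part of this patch):

#include <stdio.h>

/* Userspace mirror of get_index() for the v2 defaults. */
static unsigned int example_index(unsigned int id, int depth, int tree_depth)
{
	unsigned int epb = 1024 >> 2;	/* references per tree block */
	int d = tree_depth - depth - 1;

	while (d--)
		id /= epb;
	return id % epb;
}

int main(void)
{
	int depth;

	/* id 70000 walks the tree through slots 0 / 1 / 17 / 112 */
	for (depth = 0; depth < 4; depth++)
		printf("depth %d -> slot %u\n",
		       depth, example_index(70000, depth, 4));
	return 0;
}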
*/ + ret = write_blk(info, info->dqi_blocks, buf); + if (ret < 0) + goto out_buf; + blk = info->dqi_blocks++; + } + mark_info_dirty(info->dqi_sb, info->dqi_type); + ret = blk; +out_buf: + kfree(buf); + return ret; +} + +/* Insert empty block to the list */ +static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) +{ + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + err = write_blk(info, blk, buf); + if (err < 0) + return err; + info->dqi_free_blk = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +} + +/* Remove given block from the list of blocks with free entries */ +static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, + uint blk) +{ + char *tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free); + uint prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; + + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { + err = read_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + dh->dqdh_prev_free; + err = write_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + } + if (prevblk) { + err = read_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = + dh->dqdh_next_free; + err = write_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + } else { + info->dqi_free_entry = nextblk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + } + kfree(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); + /* No matter whether write succeeds block is out of list */ + if (write_blk(info, blk, buf) < 0) + quota_error(info->dqi_sb, "Can't write block (%u) " + "with free entries", blk); + return 0; +out_buf: + kfree(tmpbuf); + return err; +} + +/* Insert given block to the beginning of list with free entries */ +static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, + uint blk) +{ + char *tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + if (!tmpbuf) + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); + err = write_blk(info, blk, buf); + if (err < 0) + goto out_buf; + if (info->dqi_free_entry) { + err = read_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + cpu_to_le32(blk); + err = write_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + } + kfree(tmpbuf); + info->dqi_free_entry = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +out_buf: + kfree(tmpbuf); + return err; +} + +/* Is the entry in the block free? 
*/ +int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) +{ + int i; + + for (i = 0; i < info->dqi_entry_size; i++) + if (disk[i]) + return 0; + return 1; +} +EXPORT_SYMBOL(qtree_entry_unused); + +/* Find space for dquot */ +static uint find_free_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, int *err) +{ + uint blk, i; + struct qt_disk_dqdbheader *dh; + char *buf = getdqbuf(info->dqi_usable_bs); + char *ddquot; + + *err = 0; + if (!buf) { + *err = -ENOMEM; + return 0; + } + dh = (struct qt_disk_dqdbheader *)buf; + if (info->dqi_free_entry) { + blk = info->dqi_free_entry; + *err = read_blk(info, blk, buf); + if (*err < 0) + goto out_buf; + } else { + blk = get_free_dqblk(info); + if ((int)blk < 0) { + *err = blk; + kfree(buf); + return 0; + } + memset(buf, 0, info->dqi_usable_bs); + /* This is enough as the block is already zeroed and the entry + * list is empty... */ + info->dqi_free_entry = blk; + mark_info_dirty(dquot->dq_sb, dquot->dq_type); + } + /* Block will be full? */ + if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { + *err = remove_free_dqentry(info, buf, blk); + if (*err < 0) { + quota_error(dquot->dq_sb, "Can't remove block (%u) " + "from entry free list", blk); + goto out_buf; + } + } + le16_add_cpu(&dh->dqdh_entries, 1); + /* Find free structure in block */ + ddquot = buf + sizeof(struct qt_disk_dqdbheader); + for (i = 0; i < qtree_dqstr_in_blk(info); i++) { + if (qtree_entry_unused(info, ddquot)) + break; + ddquot += info->dqi_entry_size; + } +#ifdef __QUOTA_QT_PARANOIA + if (i == qtree_dqstr_in_blk(info)) { + quota_error(dquot->dq_sb, "Data block full but it shouldn't"); + *err = -EIO; + goto out_buf; + } +#endif + *err = write_blk(info, blk, buf); + if (*err < 0) { + quota_error(dquot->dq_sb, "Can't write quota data block %u", + blk); + goto out_buf; + } + dquot->dq_off = (blk << info->dqi_blocksize_bits) + + sizeof(struct qt_disk_dqdbheader) + + i * info->dqi_entry_size; + kfree(buf); + return blk; +out_buf: + kfree(buf); + return 0; +} + +/* Insert reference to structure into the trie */ +static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *treeblk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0, newson = 0, newact = 0; + __le32 *ref; + uint newblk; + + if (!buf) + return -ENOMEM; + if (!*treeblk) { + ret = get_free_dqblk(info); + if (ret < 0) + goto out_buf; + *treeblk = ret; + memset(buf, 0, info->dqi_usable_bs); + newact = 1; + } else { + ret = read_blk(info, *treeblk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read tree quota " + "block %u", *treeblk); + goto out_buf; + } + } + ref = (__le32 *)buf; + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!newblk) + newson = 1; + if (depth == info->dqi_qtree_depth - 1) { +#ifdef __QUOTA_QT_PARANOIA + if (newblk) { + quota_error(dquot->dq_sb, "Inserting already present " + "quota entry (block %u)", + le32_to_cpu(ref[get_index(info, + dquot->dq_id, depth)])); + ret = -EIO; + goto out_buf; + } +#endif + newblk = find_free_dqentry(info, dquot, &ret); + } else { + ret = do_insert_tree(info, dquot, &newblk, depth+1); + } + if (newson && ret >= 0) { + ref[get_index(info, dquot->dq_id, depth)] = + cpu_to_le32(newblk); + ret = write_blk(info, *treeblk, buf); + } else if (newact && ret < 0) { + put_free_dqblk(info, buf, *treeblk); + } +out_buf: + kfree(buf); + return ret; +} + +/* Wrapper for inserting quota structure into tree */ +static inline int dq_insert_tree(struct qtree_mem_dqinfo 
*info, + struct dquot *dquot) +{ + int tmp = QT_TREEOFF; + return do_insert_tree(info, dquot, &tmp, 0); +} + +/* + * We don't have to be afraid of deadlocks as we never have quotas on quota + * files... + */ +int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + ssize_t ret; + char *ddquot = getdqbuf(info->dqi_entry_size); + + if (!ddquot) + return -ENOMEM; + + /* dq_off is guarded by dqio_mutex */ + if (!dquot->dq_off) { + ret = dq_insert_tree(info, dquot); + if (ret < 0) { + quota_error(sb, "Error %zd occurred while creating " + "quota", ret); + kfree(ddquot); + return ret; + } + } + spin_lock(&dq_data_lock); + info->dqi_ops->mem2disk_dqblk(ddquot, dquot); + spin_unlock(&dq_data_lock); + ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, + dquot->dq_off); + if (ret != info->dqi_entry_size) { + quota_error(sb, "dquota write failed"); + if (ret >= 0) + ret = -ENOSPC; + } else { + ret = 0; + } + dqstats_inc(DQST_WRITES); + kfree(ddquot); + + return ret; +} +EXPORT_SYMBOL(qtree_write_dquot); + +/* Free dquot entry in data block */ +static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint blk) +{ + struct qt_disk_dqdbheader *dh; + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { + quota_error(dquot->dq_sb, "Quota structure has offset to " + "other block (%u) than it should (%u)", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + goto out_buf; + } + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota data block %u", + blk); + goto out_buf; + } + dh = (struct qt_disk_dqdbheader *)buf; + le16_add_cpu(&dh->dqdh_entries, -1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
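The dq_off arithmetic set up in find_free_dqentry() and checked again here in free_dqentry() is easy to verify by hand. A back-of-the-envelope sketch for the v2r0 on-disk entry (not part of this patch):

#include <stdio.h>

/* 1024-byte data blocks hold a 16-byte qt_disk_dqdbheader followed by
 * 48-byte v2r0 entries, which is where the "exactly 21 quota-entries in
 * a block" figure in quota_tree.h comes from. */
int main(void)
{
	unsigned int usable_bs = 1024;
	unsigned int header = 16;	/* sizeof(struct qt_disk_dqdbheader) */
	unsigned int entry = 48;	/* sizeof(struct v2r0_disk_dqblk) */
	unsigned int blk = 5, i = 3;
	unsigned long long off;

	printf("entries per data block: %u\n", (usable_bs - header) / entry);
	off = ((unsigned long long)blk << 10) + header + i * entry;
	printf("entry %u of block %u sits at byte offset %llu\n", i, blk, off);
	return 0;
}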
*/ + ret = remove_free_dqentry(info, buf, blk); + if (ret >= 0) + ret = put_free_dqblk(info, buf, blk); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't move quota data block " + "(%u) to free list", blk); + goto out_buf; + } + } else { + memset(buf + + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), + 0, info->dqi_entry_size); + if (le16_to_cpu(dh->dqdh_entries) == + qtree_dqstr_in_blk(info) - 1) { + /* Insert will write block itself */ + ret = insert_free_dqentry(info, buf, blk); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't insert quota " + "data block (%u) to free entry list", blk); + goto out_buf; + } + } else { + ret = write_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota " + "data block %u", blk); + goto out_buf; + } + } + } + dquot->dq_off = 0; /* Quota is now unattached */ +out_buf: + kfree(buf); + return ret; +} + +/* Remove reference to dquot from tree */ +static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *blk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + uint newblk; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, *blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota data block %u", + *blk); + goto out_buf; + } + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (depth == info->dqi_qtree_depth - 1) { + ret = free_dqentry(info, dquot, newblk); + newblk = 0; + } else { + ret = remove_tree(info, dquot, &newblk, depth+1); + } + if (ret >= 0 && !newblk) { + int i; + ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); + /* Block got empty? */ + for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) + ; + /* Don't put the root block into the free block list */ + if (i == (info->dqi_usable_bs >> 2) + && *blk != QT_TREEOFF) { + put_free_dqblk(info, buf, *blk); + *blk = 0; + } else { + ret = write_blk(info, *blk, buf); + if (ret < 0) + quota_error(dquot->dq_sb, + "Can't write quota tree block %u", + *blk); + } + } +out_buf: + kfree(buf); + return ret; +} + +/* Delete dquot from tree */ +int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + uint tmp = QT_TREEOFF; + + if (!dquot->dq_off) /* Even not allocated? 
*/ + return 0; + return remove_tree(info, dquot, &tmp, 0); +} +EXPORT_SYMBOL(qtree_delete_dquot); + +/* Find entry in block */ +static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + int i; + char *ddquot; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota tree " + "block %u", blk); + goto out_buf; + } + ddquot = buf + sizeof(struct qt_disk_dqdbheader); + for (i = 0; i < qtree_dqstr_in_blk(info); i++) { + if (info->dqi_ops->is_id(ddquot, dquot)) + break; + ddquot += info->dqi_entry_size; + } + if (i == qtree_dqstr_in_blk(info)) { + quota_error(dquot->dq_sb, "Quota for id %u referenced " + "but not present", dquot->dq_id); + ret = -EIO; + goto out_buf; + } else { + ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + qt_disk_dqdbheader) + i * info->dqi_entry_size; + } +out_buf: + kfree(buf); + return ret; +} + +/* Find entry for given id in the tree */ +static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota tree block %u", + blk); + goto out_buf; + } + ret = 0; + blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!blk) /* No reference? */ + goto out_buf; + if (depth < info->dqi_qtree_depth - 1) + ret = find_tree_dqentry(info, dquot, blk, depth+1); + else + ret = find_block_dqentry(info, dquot, blk); +out_buf: + kfree(buf); + return ret; +} + +/* Find entry for given id in the tree - wrapper function */ +static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); +} + +int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + loff_t offset; + char *ddquot; + int ret = 0; + +#ifdef __QUOTA_QT_PARANOIA + /* Invalidated quota? */ + if (!sb_dqopt(dquot->dq_sb)->files[type]) { + quota_error(sb, "Quota invalidated while reading!"); + return -EIO; + } +#endif + /* Do we know offset of the dquot entry in the quota file? */ + if (!dquot->dq_off) { + offset = find_dqentry(info, dquot); + if (offset <= 0) { /* Entry not present? 
*/ + if (offset < 0) + quota_error(sb, "Can't read quota structure " + "for id %u", dquot->dq_id); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + ret = offset; + goto out; + } + dquot->dq_off = offset; + } + ddquot = getdqbuf(info->dqi_entry_size); + if (!ddquot) + return -ENOMEM; + ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, + dquot->dq_off); + if (ret != info->dqi_entry_size) { + if (ret >= 0) + ret = -EIO; + quota_error(sb, "Error while reading quota structure for id %u", + dquot->dq_id); + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + kfree(ddquot); + goto out; + } + spin_lock(&dq_data_lock); + info->dqi_ops->disk2mem_dqblk(dquot, ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + kfree(ddquot); +out: + dqstats_inc(DQST_READS); + return ret; +} +EXPORT_SYMBOL(qtree_read_dquot); + +/* Check whether dquot should not be deleted. We know we are + * the only one operating on dquot (thanks to dq_lock) */ +int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && + !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) + return qtree_delete_dquot(info, dquot); + return 0; +} +EXPORT_SYMBOL(qtree_release_dquot); diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h new file mode 100644 index 00000000..a1ab8db8 --- /dev/null +++ b/fs/quota/quota_tree.h @@ -0,0 +1,25 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTA_TREE_H +#define _LINUX_QUOTA_TREE_H + +#include +#include + +/* + * Structure of header of block with quota structures. 
It is padded to 16 bytes so + * there will be space for exactly 21 quota-entries in a block + */ +struct qt_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ + __le32 dqdh_prev_free; /* Number of previous block with free entry */ + __le16 dqdh_entries; /* Number of valid entries in block */ + __le16 dqdh_pad1; + __le32 dqdh_pad2; +}; + +#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ + +#endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c new file mode 100644 index 00000000..34b37a67 --- /dev/null +++ b/fs/quota/quota_v1.c @@ -0,0 +1,233 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quotaio_v1.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Old quota format support"); +MODULE_LICENSE("GPL"); + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v1_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v1_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + +static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) +{ + m->dqb_ihardlimit = d->dqb_ihardlimit; + m->dqb_isoftlimit = d->dqb_isoftlimit; + m->dqb_curinodes = d->dqb_curinodes; + m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit); + m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit); + m->dqb_curspace = v1_qbtos(d->dqb_curblocks); + m->dqb_itime = d->dqb_itime; + m->dqb_btime = d->dqb_btime; +} + +static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) +{ + d->dqb_ihardlimit = m->dqb_ihardlimit; + d->dqb_isoftlimit = m->dqb_isoftlimit; + d->dqb_curinodes = m->dqb_curinodes; + d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit); + d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit); + d->dqb_curblocks = v1_stoqb(m->dqb_curspace); + d->dqb_itime = m->dqb_itime; + d->dqb_btime = m->dqb_btime; +} + +static int v1_read_dqblk(struct dquot *dquot) +{ + int type = dquot->dq_type; + struct v1_disk_dqblk dqblk; + + if (!sb_dqopt(dquot->dq_sb)->files[type]) + return -EINVAL; + + /* Set structure to 0s in case read fails/is after end of file */ + memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); + dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + + v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); + if (dquot->dq_dqb.dqb_bhardlimit == 0 && + dquot->dq_dqb.dqb_bsoftlimit == 0 && + dquot->dq_dqb.dqb_ihardlimit == 0 && + dquot->dq_dqb.dqb_isoftlimit == 0) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + dqstats_inc(DQST_READS); + + return 0; +} + +static int v1_commit_dqblk(struct dquot *dquot) +{ + short type = dquot->dq_type; + ssize_t ret; + struct v1_disk_dqblk dqblk; + + v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); + if (dquot->dq_id == 0) { + dqblk.dqb_btime = + sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; + dqblk.dqb_itime = + sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; + } + ret = 0; + if (sb_dqopt(dquot->dq_sb)->files[type]) + ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, + (char *)&dqblk, sizeof(struct v1_disk_dqblk), + v1_dqoff(dquot->dq_id)); + if (ret != sizeof(struct v1_disk_dqblk)) { + quota_error(dquot->dq_sb, "dquota write failed"); + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + +out: + dqstats_inc(DQST_WRITES); + + return ret; +} + +/* Magics of new quota format */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} 
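The old-format code above keeps one fixed-size record per id and counts space in 1 KiB quota blocks. A small sketch of that arithmetic (not part of this patch; note that sizeof(struct v1_disk_dqblk), and therefore v1_dqoff(), depends on the width of time_t):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Local stand-in for struct v1_disk_dqblk from quotaio_v1.h, for
 * illustration only: six 32-bit counters plus two time_t grace stamps. */
struct example_v1_dqblk {
	uint32_t bhard, bsoft, curblocks, ihard, isoft, curinodes;
	time_t btime, itime;
};

int main(void)
{
	unsigned int id = 1000;

	/* v1_dqoff(): the old format is a flat array indexed by uid/gid */
	printf("record for id %u starts at byte %zu\n",
	       id, id * sizeof(struct example_v1_dqblk));

	/* v1_qbtos() / v1_stoqb(): limits are kept in 1 KiB quota blocks */
	printf("17 quota blocks = %u bytes\n", 17u << 10);
	printf("5000 bytes round up to %u quota blocks\n", (5000u + 1023) >> 10);
	return 0;
}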
+ +/* Header of new quota format */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +static int v1_check_quota_file(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ulong blocks; + size_t off; + struct v2_disk_dqheader dqhead; + ssize_t size; + loff_t isize; + static const uint quota_magics[] = V2_INITQMAGICS; + + isize = i_size_read(inode); + if (!isize) + return 0; + blocks = isize >> BLOCK_SIZE_BITS; + off = isize & (BLOCK_SIZE - 1); + if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % + sizeof(struct v1_disk_dqblk)) + return 0; + /* Doublecheck whether we didn't get file with new format - with old + * quotactl() this could happen */ + size = sb->s_op->quota_read(sb, type, (char *)&dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) + return 1; /* Probably not new format */ + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) + return 1; /* Definitely not new format */ + printk(KERN_INFO + "VFS: %s: Refusing to turn on old quota format on given file." + " It probably contains newer quota format.\n", sb->s_id); + return 0; /* Seems like a new format file -> refuse it */ +} + +static int v1_read_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct v1_disk_dqblk dqblk; + int ret; + + ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + /* limits are stored as unsigned 32-bit data */ + dqopt->info[type].dqi_maxblimit = 0xffffffff; + dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_igrace = + dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = + dqblk.dqb_btime ? 
dqblk.dqb_btime : MAX_DQ_TIME; +out: + return ret; +} + +static int v1_write_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct v1_disk_dqblk dqblk; + int ret; + + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; + ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + dqblk.dqb_itime = dqopt->info[type].dqi_igrace; + dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; + ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret == sizeof(struct v1_disk_dqblk)) + ret = 0; + else if (ret > 0) + ret = -EIO; +out: + return ret; +} + +static const struct quota_format_ops v1_format_ops = { + .check_quota_file = v1_check_quota_file, + .read_file_info = v1_read_file_info, + .write_file_info = v1_write_file_info, + .free_file_info = NULL, + .read_dqblk = v1_read_dqblk, + .commit_dqblk = v1_commit_dqblk, +}; + +static struct quota_format_type v1_quota_format = { + .qf_fmt_id = QFMT_VFS_OLD, + .qf_ops = &v1_format_ops, + .qf_owner = THIS_MODULE +}; + +static int __init init_v1_quota_format(void) +{ + return register_quota_format(&v1_quota_format); +} + +static void __exit exit_v1_quota_format(void) +{ + unregister_quota_format(&v1_quota_format); +} + +module_init(init_v1_quota_format); +module_exit(exit_v1_quota_format); + diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c new file mode 100644 index 00000000..f1ab3604 --- /dev/null +++ b/fs/quota/quota_v2.c @@ -0,0 +1,336 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quota_tree.h" +#include "quotaio_v2.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota format v2 support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_V2_PARANOIA + +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r0_is_id(void *dp, struct dquot *dquot); +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r1_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2r0_qtree_ops = { + .mem2disk_dqblk = v2r0_mem2diskdqb, + .disk2mem_dqblk = v2r0_disk2memdqb, + .is_id = v2r0_is_id, +}; + +static struct qtree_fmt_operations v2r1_qtree_ops = { + .mem2disk_dqblk = v2r1_mem2diskdqb, + .disk2mem_dqblk = v2r1_disk2memdqb, + .is_id = v2r1_is_id, +}; + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v2_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v2_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + +static int v2_read_header(struct super_block *sb, int type, + struct v2_disk_dqheader *dqhead) +{ + ssize_t size; + + size = sb->s_op->quota_read(sb, type, (char *)dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) { + quota_error(sb, "Failed header read: expected=%zd got=%zd", + sizeof(struct v2_disk_dqheader), size); + return 0; + } + return 1; +} + +/* Check whether given file is really vfsv0 quotafile */ +static int v2_check_quota_file(struct super_block *sb, int type) +{ + struct v2_disk_dqheader dqhead; + static const uint quota_magics[] = V2_INITQMAGICS; + static const 
uint quota_versions[] = V2_INITQVERSIONS; + + if (!v2_read_header(sb, type, &dqhead)) + return 0; + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || + le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) + return 0; + return 1; +} + +/* Read information header from quota file */ +static int v2_read_file_info(struct super_block *sb, int type) +{ + struct v2_disk_dqinfo dinfo; + struct v2_disk_dqheader dqhead; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo; + ssize_t size; + unsigned int version; + + if (!v2_read_header(sb, type, &dqhead)) + return -1; + version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; + + size = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + quota_error(sb, "Can't read info structure"); + return -1; + } + info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); + if (!info->dqi_priv) { + printk(KERN_WARNING + "Not enough memory for quota information structure.\n"); + return -ENOMEM; + } + qinfo = info->dqi_priv; + if (version == 0) { + /* limits are stored as unsigned 32-bit data */ + info->dqi_maxblimit = 0xffffffff; + info->dqi_maxilimit = 0xffffffff; + } else { + /* used space is stored as unsigned 64-bit value */ + info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffffULL; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); + qinfo->dqi_sb = sb; + qinfo->dqi_type = type; + qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; + qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; + qinfo->dqi_qtree_depth = qtree_depth(qinfo); + if (version == 0) { + qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk); + qinfo->dqi_ops = &v2r0_qtree_ops; + } else { + qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); + qinfo->dqi_ops = &v2r1_qtree_ops; + } + return 0; +} + +/* Write information header to quota file */ +static int v2_write_file_info(struct super_block *sb, int type) +{ + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo = info->dqi_priv; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + spin_unlock(&dq_data_lock); + dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + quota_error(sb, "Can't write info structure"); + return -1; + } + return 0; +} + +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r0_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); + m->dqb_itime = 
le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r0_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r0_is_id(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r1_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r1_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r1_is_id(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +static int v2_read_dquot(struct dquot *dquot) +{ + return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); +} + +static int v2_write_dquot(struct dquot 
*dquot) +{ + return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); +} + +static int v2_release_dquot(struct dquot *dquot) +{ + return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); +} + +static int v2_free_file_info(struct super_block *sb, int type) +{ + kfree(sb_dqinfo(sb, type)->dqi_priv); + return 0; +} + +static const struct quota_format_ops v2_format_ops = { + .check_quota_file = v2_check_quota_file, + .read_file_info = v2_read_file_info, + .write_file_info = v2_write_file_info, + .free_file_info = v2_free_file_info, + .read_dqblk = v2_read_dquot, + .commit_dqblk = v2_write_dquot, + .release_dqblk = v2_release_dquot, +}; + +static struct quota_format_type v2r0_quota_format = { + .qf_fmt_id = QFMT_VFS_V0, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + +static struct quota_format_type v2r1_quota_format = { + .qf_fmt_id = QFMT_VFS_V1, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + +static int __init init_v2_quota_format(void) +{ + int ret; + + ret = register_quota_format(&v2r0_quota_format); + if (ret) + return ret; + return register_quota_format(&v2r1_quota_format); +} + +static void __exit exit_v2_quota_format(void) +{ + unregister_quota_format(&v2r0_quota_format); + unregister_quota_format(&v2r1_quota_format); +} + +module_init(init_v2_quota_format); +module_exit(exit_v2_quota_format); diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h new file mode 100644 index 00000000..746654b5 --- /dev/null +++ b/fs/quota/quotaio_v1.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_QUOTAIO_V1_H +#define _LINUX_QUOTAIO_V1_H + +#include + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. 
+ */ +struct v1_disk_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ +}; + +#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) + +#endif /* _LINUX_QUOTAIO_V1_H */ diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h new file mode 100644 index 00000000..f1966b42 --- /dev/null +++ b/fs/quota/quotaio_v2.h @@ -0,0 +1,73 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTAIO_V2_H +#define _LINUX_QUOTAIO_V2_H + +#include +#include + +/* + * Definitions of magics and versions of current quota files + */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +#define V2_INITQVERSIONS {\ + 1, /* USRQUOTA */\ + 1 /* GRPQUOTA */\ +} + +/* First generic header */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +struct v2r0_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ + __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +struct v2r1_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_pad; + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ + __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +/* Header with type and version specific information */ +struct v2_disk_dqinfo { + __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ + __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ + __le32 dqi_flags; /* Flags for quotafile (DQF_*) */ + __le32 dqi_blocks; /* Number of blocks in file */ + __le32 dqi_free_blk; /* Number of first free block in the list */ + __le32 dqi_free_entry; /* Number of block with at least one free entry */ +}; + +#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ +#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */ + +#endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile new file mode 100644 index 00000000..c71e65dc --- /dev/null +++ b/fs/ramfs/Makefile @@ -0,0 +1,9 @@ 
+# +# Makefile for the linux ramfs routines. +# + +obj-y += ramfs.o + +file-mmu-y := file-nommu.o +file-mmu-$(CONFIG_MMU) := file-mmu.o +ramfs-objs += inode.o $(file-mmu-y) diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c new file mode 100644 index 00000000..4884ac5a --- /dev/null +++ b/fs/ramfs/file-mmu.c @@ -0,0 +1,55 @@ +/* file-mmu.c: ramfs MMU-based file operations + * + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. + * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include + +#include "internal.h" + +const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + +const struct file_operations ramfs_file_operations = { + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations ramfs_file_inode_operations = { + .setattr = simple_setattr, + .getattr = simple_getattr, +}; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c new file mode 100644 index 00000000..fbb0b478 --- /dev/null +++ b/fs/ramfs/file-nommu.c @@ -0,0 +1,266 @@ +/* file-nommu.c: no-MMU version of ramfs + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +static int ramfs_nommu_setattr(struct dentry *, struct iattr *); + +const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + +const struct file_operations ramfs_file_operations = { + .mmap = ramfs_nommu_mmap, + .get_unmapped_area = ramfs_nommu_get_unmapped_area, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations ramfs_file_inode_operations = { + .setattr = ramfs_nommu_setattr, + .getattr = simple_getattr, +}; + +/*****************************************************************************/ +/* + * add a contiguous set of pages into a ramfs inode when it's truncated from + * size 0 on the assumption that it's going to be used for an mmap of shared + * memory + */ +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + unsigned long npages, xpages, loop; + struct page *pages; + unsigned order; + void *data; + int ret; + + /* make various checks */ + order = get_order(newsize); + if (unlikely(order >= MAX_ORDER)) + return -EFBIG; + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + i_size_write(inode, newsize); + + /* allocate enough contiguous pages to be able to satisfy the + * request */ + pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); + if (!pages) + return -ENOMEM; + + /* split the high-order page into an array of single pages */ + xpages = 1UL << order; + npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + + split_page(pages, order); + + /* trim off any pages we don't actually require */ + for (loop = npages; loop < xpages; loop++) + __free_page(pages + loop); + + /* clear the memory we allocated */ + newsize = PAGE_SIZE * npages; + data = page_address(pages); + memset(data, 0, newsize); + + /* attach all the pages to the inode's address space */ + for (loop = 0; loop < npages; loop++) { + struct page *page = pages + loop; + + ret = add_to_page_cache_lru(page, inode->i_mapping, loop, + GFP_KERNEL); + if (ret < 0) + goto add_error; + + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + + unlock_page(page); + put_page(page); + } + + return 0; + +add_error: + while (loop < npages) + __free_page(pages + loop++); + return ret; +} + +/*****************************************************************************/ +/* + * + */ +static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) +{ + int ret; + + /* assume a truncate from zero size is going to be for the purposes of + * shared mmap */ + if (size == 0) { + if (unlikely(newsize >> 32)) + return -EFBIG; + + return ramfs_nommu_expand_for_mapping(inode, newsize); + } + + /* check that a decrease in size doesn't cut off any shared mappings */ + if (newsize < size) { + ret = nommu_shrink_inode_mappings(inode, size, newsize); + if (ret < 0) + return ret; + } + + truncate_setsize(inode, newsize); + return 0; +} + +/*****************************************************************************/ +/* + * handle a change of attributes + * - we're specifically 
interested in a change of size + */ +static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *inode = dentry->d_inode; + unsigned int old_ia_valid = ia->ia_valid; + int ret = 0; + + /* POSIX UID/GID verification for setting inode attributes */ + ret = inode_change_ok(inode, ia); + if (ret) + return ret; + + /* pick out size-changing events */ + if (ia->ia_valid & ATTR_SIZE) { + loff_t size = inode->i_size; + + if (ia->ia_size != size) { + ret = ramfs_nommu_resize(inode, ia->ia_size, size); + if (ret < 0 || ia->ia_valid == ATTR_SIZE) + goto out; + } else { + /* we skipped the truncate but must still update + * timestamps + */ + ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + } + + setattr_copy(inode, ia); + out: + ia->ia_valid = old_ia_valid; + return ret; +} + +/*****************************************************************************/ +/* + * try to determine where a shared mapping can be made + * - we require that: + * - the pages to be mapped must exist + * - the pages be physically contiguous in sequence + */ +unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long maxpages, lpages, nr, loop, ret; + struct inode *inode = file->f_path.dentry->d_inode; + struct page **pages = NULL, **ptr, *page; + loff_t isize; + + if (!(flags & MAP_SHARED)) + return addr; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + + ret = -EINVAL; + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= maxpages) + goto out; + + if (maxpages - pgoff < lpages) + goto out; + + /* gang-find the pages */ + ret = -ENOMEM; + pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_free; + + nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + if (nr != lpages) + goto out_free_pages; /* leave if some pages were missing */ + + /* check the pages for physical adjacency */ + ptr = pages; + page = *ptr++; + page++; + for (loop = lpages; loop > 1; loop--) + if (*ptr++ != page++) + goto out_free_pages; + + /* okay - all conditions fulfilled */ + ret = (unsigned long) page_address(pages[0]); + +out_free_pages: + ptr = pages; + for (loop = nr; loop > 0; loop--) + put_page(*ptr++); +out_free: + kfree(pages); +out: + return ret; +} + +/*****************************************************************************/ +/* + * set up a mapping for shared memory segments + */ +int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!(vma->vm_flags & VM_SHARED)) + return -ENOSYS; + + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c new file mode 100644 index 00000000..eacb166f --- /dev/null +++ b/fs/ramfs/inode.c @@ -0,0 +1,315 @@ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. 
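The contiguous-allocation trick in ramfs_nommu_expand_for_mapping() above rounds the requested size up to a power-of-two number of pages, then splits the high-order page and frees the tail it does not need. A quick sketch of that bookkeeping (not part of this patch, assuming 4 KiB pages):

#include <stdio.h>

/* The allocation is rounded up to a power-of-two order, as get_order()
 * does in the kernel, and the surplus tail pages are freed again. */
int main(void)
{
	unsigned long page_size = 4096;
	unsigned long newsize = 5 * page_size;	/* requested file size */
	unsigned int order = 0;
	unsigned long xpages, npages;

	while ((page_size << order) < newsize)
		order++;			/* order 3 covers 20480 bytes */
	xpages = 1UL << order;			/* pages allocated: 8 */
	npages = (newsize + page_size - 1) / page_size;	/* pages kept: 5 */

	printf("order %u: allocate %lu pages, keep %lu, free %lu\n",
	       order, xpages, npages, xpages - npages);
	return 0;
}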
+ * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define RAMFS_DEFAULT_MODE 0755 + +static const struct super_operations ramfs_ops; +static const struct inode_operations ramfs_dir_inode_operations; + +static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | + BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, +}; + +struct inode *ramfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); + inode->i_mapping->a_ops = &ramfs_aops; + inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &ramfs_file_inode_operations; + inode->i_fop = &ramfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ramfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
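+ * (ramfs_create() and ramfs_mkdir() below are thin wrappers around this:
+ * they simply OR S_IFREG / S_IFDIR into the mode and call ramfs_mknod().)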
+ */ +/* SMP-safe */ +static int +ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); + int error = -ENOSPC; + + if (inode) { + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } + return error; +} + +static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + d_instantiate(dentry, inode); + dget(dentry); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } else + iput(inode); + } + return error; +} + +static const struct inode_operations ramfs_dir_inode_operations = { + .create = ramfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = ramfs_symlink, + .mkdir = ramfs_mkdir, + .rmdir = simple_rmdir, + .mknod = ramfs_mknod, + .rename = simple_rename, +}; + +static const struct super_operations ramfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = generic_show_options, +}; + +struct ramfs_mount_opts { + umode_t mode; +}; + +enum { + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct ramfs_fs_info { + struct ramfs_mount_opts mount_opts; +}; + +static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + char *p; + + opts->mode = RAMFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally ramfs has ignored all mount options, + * and as it is used as a !CONFIG_SHMEM simple substitute + * for tmpfs, better continue to ignore other mount options. 
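+ * For example, mounting with "-o mode=0770,someopt=1" still succeeds:
+ * mode= is honoured (masked with S_IALLUGO) and the unknown option is
+ * silently dropped.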
+ */ + } + } + + return 0; +} + +int ramfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ramfs_fs_info *fsi; + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) { + err = -ENOMEM; + goto fail; + } + + err = ramfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } + + root = d_alloc_root(inode); + sb->s_root = root; + if (!root) { + err = -ENOMEM; + goto fail; + } + + return 0; +fail: + kfree(fsi); + sb->s_fs_info = NULL; + iput(inode); + return err; +} + +struct dentry *ramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, ramfs_fill_super); +} + +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); +} + +static void ramfs_kill_sb(struct super_block *sb) +{ + kfree(sb->s_fs_info); + kill_litter_super(sb); +} + +static struct file_system_type ramfs_fs_type = { + .name = "ramfs", + .mount = ramfs_mount, + .kill_sb = ramfs_kill_sb, +}; +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +static int __init init_ramfs_fs(void) +{ + return register_filesystem(&ramfs_fs_type); +} + +static void __exit exit_ramfs_fs(void) +{ + unregister_filesystem(&ramfs_fs_type); +} + +module_init(init_ramfs_fs) +module_exit(exit_ramfs_fs) + +int __init init_rootfs(void) +{ + int err; + + err = bdi_init(&ramfs_backing_dev_info); + if (err) + return err; + + err = register_filesystem(&rootfs_fs_type); + if (err) + bdi_destroy(&ramfs_backing_dev_info); + + return err; +} + +MODULE_LICENSE("GPL"); diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h new file mode 100644 index 00000000..6b330639 --- /dev/null +++ b/fs/ramfs/internal.h @@ -0,0 +1,14 @@ +/* internal.h: ramfs internal definitions + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +extern const struct address_space_operations ramfs_aops; +extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c new file mode 100644 index 00000000..5520f8ad --- /dev/null +++ b/fs/read_write.c @@ -0,0 +1,948 @@ +/* + * linux/fs/read_write.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "read_write.h" + +#include +#include + +const struct file_operations generic_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, +}; + +EXPORT_SYMBOL(generic_ro_fops); + +static inline int unsigned_offsets(struct file *file) +{ + return file->f_mode & FMODE_UNSIGNED_OFFSET; +} + +/** + * generic_file_llseek_unlocked - lockless generic llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * Updates the file offset to the value specified by @offset and @origin. + * Locking must be provided by the caller. + */ +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + + switch (origin) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; + offset += file->f_pos; + break; + } + + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > inode->i_sb->s_maxbytes) + return -EINVAL; + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + + return offset; +} +EXPORT_SYMBOL(generic_file_llseek_unlocked); + +/** + * generic_file_llseek - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is a generic implemenation of ->llseek useable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. + */ +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t rval; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + rval = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + return rval; +} +EXPORT_SYMBOL(generic_file_llseek); + +/** + * noop_llseek - No Operation Performed llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is an implementation of ->llseek useable for the rare special case when + * userspace expects the seek to succeed but the (device) file is actually not + * able to perform the seek. In this case you use noop_llseek() instead of + * falling back to the default implementation of ->llseek. 
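+ * For example, a driver for a device with no meaningful file position can
+ * set .llseek = noop_llseek in its file_operations; lseek() then appears to
+ * succeed while f_pos is left untouched.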
+ */ +loff_t noop_llseek(struct file *file, loff_t offset, int origin) +{ + return file->f_pos; +} +EXPORT_SYMBOL(noop_llseek); + +loff_t no_llseek(struct file *file, loff_t offset, int origin) +{ + return -ESPIPE; +} +EXPORT_SYMBOL(no_llseek); + +loff_t default_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += i_size_read(file->f_path.dentry->d_inode); + break; + case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto out; + } + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 || unsigned_offsets(file)) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } +out: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return retval; +} +EXPORT_SYMBOL(default_llseek); + +loff_t vfs_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t (*fn)(struct file *, loff_t, int); + + fn = no_llseek; + if (file->f_mode & FMODE_LSEEK) { + if (file->f_op && file->f_op->llseek) + fn = file->f_op->llseek; + } + return fn(file, offset, origin); +} +EXPORT_SYMBOL(vfs_llseek); + +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) +{ + off_t retval; + struct file * file; + int fput_needed; + + retval = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto bad; + + retval = -EINVAL; + if (origin <= SEEK_MAX) { + loff_t res = vfs_llseek(file, offset, origin); + retval = res; + if (res != (loff_t)retval) + retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ + } + fput_light(file, fput_needed); +bad: + return retval; +} + +#ifdef __ARCH_WANT_SYS_LLSEEK +SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, + unsigned long, offset_low, loff_t __user *, result, + unsigned int, origin) +{ + int retval; + struct file * file; + loff_t offset; + int fput_needed; + + retval = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto bad; + + retval = -EINVAL; + if (origin > SEEK_MAX) + goto out_putf; + + offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, + origin); + + retval = (int)offset; + if (offset >= 0) { + retval = -EFAULT; + if (!copy_to_user(result, &offset, sizeof(offset))) + retval = 0; + } +out_putf: + fput_light(file, fput_needed); +bad: + return retval; +} +#endif + + +/* + * rw_verify_area doesn't like huge counts. We limit + * them to something that fits in "int" so that others + * won't have to do range checks all the time. + */ +int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) +{ + struct inode *inode; + loff_t pos; + int retval = -EINVAL; + + inode = file->f_path.dentry->d_inode; + if (unlikely((ssize_t) count < 0)) + return retval; + pos = *ppos; + if (unlikely(pos < 0)) { + if (!unsigned_offsets(file)) + return retval; + if (count >= -pos) /* both values are in 0..LLONG_MAX */ + return -EOVERFLOW; + } else if (unlikely((loff_t) (pos + count) < 0)) { + if (!unsigned_offsets(file)) + return retval; + } + + if (unlikely(inode->i_flock && mandatory_lock(inode))) { + retval = locks_mandatory_area( + read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, + inode, file, pos, count); + if (retval < 0) + return retval; + } + retval = security_file_permission(file, + read_write == READ ? MAY_READ : MAY_WRITE); + if (retval) + return retval; + return count > MAX_RW_COUNT ? 
MAX_RW_COUNT : count; +} + +static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (!kiocbIsKicked(iocb)) + schedule(); + else + kiocbClearKicked(iocb); + __set_current_state(TASK_RUNNING); +} + +ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_left = len; + kiocb.ki_nbytes = len; + + for (;;) { + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(do_sync_read); + +ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret; + + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) + return -EINVAL; + if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) + return -EFAULT; + + ret = rw_verify_area(READ, file, pos, count); + if (ret >= 0) { + count = ret; + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else + ret = do_sync_read(file, buf, count, pos); + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); + } + + return ret; +} + +EXPORT_SYMBOL(vfs_read); + +ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_left = len; + kiocb.ki_nbytes = len; + + for (;;) { + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(do_sync_write); + +ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + if (unlikely(!access_ok(VERIFY_READ, buf, count))) + return -EFAULT; + + ret = rw_verify_area(WRITE, file, pos, count); + if (ret >= 0) { + count = ret; + if (file->f_op->write) + ret = file->f_op->write(file, buf, count, pos); + else + ret = do_sync_write(file, buf, count, pos); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + } + + return ret; +} + +EXPORT_SYMBOL(vfs_write); + +static inline loff_t file_pos_read(struct file *file) +{ + return file->f_pos; +} + +static inline void file_pos_write(struct file *file, loff_t pos) +{ + file->f_pos = pos; +} + +SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_read(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + return ret; +} + +SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, + size_t, count) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = 
file_pos_read(file); + ret = vfs_write(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + return ret; +} + +SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, + size_t count, loff_t pos) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_read(file, buf, count, &pos); + fput_light(file, fput_needed); + } + + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) +{ + return SYSC_pread64((unsigned int) fd, (char __user *) buf, + (size_t) count, pos); +} +SYSCALL_ALIAS(sys_pread64, SyS_pread64); +#endif + +SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, + size_t count, loff_t pos) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_write(file, buf, count, &pos); + fput_light(file, fput_needed); + } + + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) +{ + return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, + (size_t) count, pos); +} +SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); +#endif + +/* + * Reduce an iovec's length in-place. Return the resulting number of segments + */ +unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) +{ + unsigned long seg = 0; + size_t len = 0; + + while (seg < nr_segs) { + seg++; + if (len + iov->iov_len >= to) { + iov->iov_len = to - len; + break; + } + len += iov->iov_len; + iov++; + } + return seg; +} +EXPORT_SYMBOL(iov_shorten); + +ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_left = len; + kiocb.ki_nbytes = len; + + for (;;) { + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +/* Do it by hand, with file-ops */ +ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, io_fn_t fn) +{ + struct iovec *vector = iov; + ssize_t ret = 0; + + while (nr_segs > 0) { + void __user *base; + size_t len; + ssize_t nr; + + base = vector->iov_base; + len = vector->iov_len; + vector++; + nr_segs--; + + nr = fn(filp, base, len, ppos); + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + unsigned long seg; + ssize_t ret; + struct iovec *iov = fast_pointer; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... 
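+ * readv(fd, iov, 0) therefore returns 0 here, without the iovec array even
+ * being copied in from user space.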
+ */ + if (nr_segs == 0) { + ret = 0; + goto out; + } + + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ + if (nr_segs > UIO_MAXIOV) { + ret = -EINVAL; + goto out; + } + if (nr_segs > fast_segs) { + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + ret = -ENOMEM; + goto out; + } + } + if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { + ret = -EFAULT; + goto out; + } + + /* + * According to the Single Unix Specification we should return EINVAL + * if an element length is < 0 when cast to ssize_t or if the + * total length would overflow the ssize_t return value of the + * system call. + * + * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the + * overflow case. + */ + ret = 0; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; + + /* see if we we're about to use an invalid len or if + * it's about to overflow ssize_t */ + if (len < 0) { + ret = -EINVAL; + goto out; + } + if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - ret) { + len = MAX_RW_COUNT - ret; + iov[seg].iov_len = len; + } + ret += len; + } +out: + *ret_pointer = iov; + return ret; +} + +static ssize_t do_readv_writev(int type, struct file *file, + const struct iovec __user * uvector, + unsigned long nr_segs, loff_t *pos) +{ + size_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + if (!file->f_op) { + ret = -EINVAL; + goto out; + } + + ret = rw_copy_check_uvector(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + goto out; + + tot_len = ret; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + return -EINVAL; + + return do_readv_writev(READ, file, vec, vlen, pos); +} + +EXPORT_SYMBOL(vfs_readv); + +ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + return -EINVAL; + + return do_readv_writev(WRITE, file, vec, vlen, pos); +} + +EXPORT_SYMBOL(vfs_writev); + +SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_readv(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, 
vec, + unsigned long, vlen) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_writev(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) +{ + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + loff_t pos; + ssize_t retval; + int fput_needed_in, fput_needed_out, fl; + + /* + * Get input file, and verify that it is ok.. + */ + retval = -EBADF; + in_file = fget_light(in_fd, &fput_needed_in); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -ESPIPE; + if (!ppos) + ppos = &in_file->f_pos; + else + if (!(in_file->f_mode & FMODE_PREAD)) + goto fput_in; + retval = rw_verify_area(READ, in_file, ppos, count); + if (retval < 0) + goto fput_in; + count = retval; + + /* + * Get output file, and verify that it is ok.. + */ + retval = -EBADF; + out_file = fget_light(out_fd, &fput_needed_out); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + in_inode = in_file->f_path.dentry->d_inode; + out_inode = out_file->f_path.dentry->d_inode; + retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); + if (retval < 0) + goto fput_out; + count = retval; + + if (!max) + max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); + + pos = *ppos; + if (unlikely(pos + count > max)) { + retval = -EOVERFLOW; + if (pos >= max) + goto fput_out; + count = max - pos; + } + + fl = 0; +#if 0 + /* + * We need to debate whether we can enable this or not. The + * man page documents EAGAIN return for the output at least, + * and the application is arguably buggy if it doesn't expect + * EAGAIN on a non-blocking file descriptor. 
+ */ + if (in_file->f_flags & O_NONBLOCK) + fl = SPLICE_F_NONBLOCK; +#endif + retval = do_splice_direct(in_file, ppos, out_file, count, fl); + + if (retval > 0) { + add_rchar(current, retval); + add_wchar(current, retval); + } + + inc_syscr(current); + inc_syscw(current); + if (*ppos > max) + retval = -EOVERFLOW; + +fput_out: + fput_light(out_file, fput_needed_out); +fput_in: + fput_light(in_file, fput_needed_in); +out: + return retval; +} + +SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} diff --git a/fs/read_write.h b/fs/read_write.h new file mode 100644 index 00000000..d07b954c --- /dev/null +++ b/fs/read_write.h @@ -0,0 +1,14 @@ +/* + * This file is only for sharing some helpers from read_write.c with compat.c. + * Don't use anywhere else. + */ + + +typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); +ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, io_fn_t fn); diff --git a/fs/readdir.c b/fs/readdir.c new file mode 100644 index 00000000..356f7152 --- /dev/null +++ b/fs/readdir.c @@ -0,0 +1,311 @@ +/* + * linux/fs/readdir.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int vfs_readdir(struct file *file, filldir_t filler, void *buf) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out; + + res = security_file_permission(file, MAY_READ); + if (res) + goto out; + + res = mutex_lock_killable(&inode->i_mutex); + if (res) + goto out; + + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + res = file->f_op->readdir(file, buf, filler); + file_accessed(file); + } + mutex_unlock(&inode->i_mutex); +out: + return res; +} + +EXPORT_SYMBOL(vfs_readdir); + +/* + * Traditional linux readdir() handling.. + * + * "count=1" is a special case, meaning that the buffer is one + * dirent-structure in size and that the code can't handle more + * anyway. Thus the special "fillonedir()" function for that + * case (the low-level handlers don't need to care about this). 
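+ * The effect is that each old_readdir() call returns at most one entry:
+ * fillonedir() bumps buf->result after filling it and rejects any further
+ * entries with -EINVAL.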
+ */ + +#ifdef __ARCH_WANT_OLD_READDIR + +struct old_linux_dirent { + unsigned long d_ino; + unsigned long d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct readdir_callback { + struct old_linux_dirent __user * dirent; + int result; +}; + +static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct readdir_callback * buf = (struct readdir_callback *) __buf; + struct old_linux_dirent __user * dirent; + unsigned long d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct file * file; + struct readdir_callback buf; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.result = 0; + buf.dirent = dirent; + + error = vfs_readdir(file, fillonedir, &buf); + if (buf.result) + error = buf.result; + + fput(file); +out: + return error; +} + +#endif /* __ARCH_WANT_OLD_READDIR */ + +/* + * New, all-improved, singing, dancing, iBCS2-compliant getdents() + * interface. + */ +struct linux_dirent { + unsigned long d_ino; + unsigned long d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct getdents_callback { + struct linux_dirent __user * current_dir; + struct linux_dirent __user * previous; + int count; + int error; +}; + +static int filldir(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct linux_dirent __user * dirent; + struct getdents_callback * buf = (struct getdents_callback *) __buf; + unsigned long d_ino; + int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, + sizeof(long)); + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct linux_dirent __user *, dirent, unsigned int, count) +{ + struct file * file; + struct linux_dirent __user * lastdirent; + struct getdents_callback buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, filldir, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(file->f_pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} + +struct getdents_callback64 { + struct linux_dirent64 __user * current_dir; + struct linux_dirent64 __user * previous; + int count; + int error; +}; + +static int filldir64(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct linux_dirent64 __user *dirent; + struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(ino, &dirent->d_ino)) + goto efault; + if (__put_user(0, &dirent->d_off)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (__put_user(d_type, &dirent->d_type)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + struct file * file; + struct linux_dirent64 __user * lastdirent; + struct getdents_callback64 buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, filldir64, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + typeof(lastdirent->d_off) d_off = file->f_pos; + if (__put_user(d_off, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig new file mode 100644 index 00000000..7cd46666 --- /dev/null +++ b/fs/reiserfs/Kconfig @@ -0,0 +1,88 @@ +config REISERFS_FS + tristate "Reiserfs support" + select CRC32 + help + Stores not just filenames but the files themselves in a balanced + tree. Uses journalling. + + Balanced trees are more efficient than traditional file system + architectural foundations. + + In general, ReiserFS is as fast as ext2, but is very efficient with + large directories and small files. Additional patches are needed + for NFS and quotas, please see + for links. + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based file + systems are. The next version will be so extended, and will support + plugins consistent with our motto ``It takes more than a license to + make source code open.'' + + Read + to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. + +config REISERFS_CHECK + bool "Enable reiserfs debug mode" + depends on REISERFS_FS + help + If you set this to Y, then ReiserFS will perform every check it can + possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say Y and you might get a useful error message. Almost + everyone should say N. 
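+# A hypothetical .config fragment (assuming the usual .config syntax) for a
+# debug build that enables the checks described above:
+#   CONFIG_REISERFS_FS=y
+#   CONFIG_REISERFS_CHECK=y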
+ +config REISERFS_PROC_INFO + bool "Stats in /proc/fs/reiserfs" + depends on REISERFS_FS && PROC_FS + help + Create under /proc/fs/reiserfs a hierarchy of files, displaying + various ReiserFS statistics and internal data at the expense of + making your kernel or module slightly larger (+8 KB). This also + increases the amount of kernel memory required for each mount. + Almost everyone but ReiserFS developers and people fine-tuning + reiserfs or tracing problems should say N. + +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile new file mode 100644 index 00000000..3c3b0016 --- /dev/null +++ b/fs/reiserfs/Makefile @@ -0,0 +1,38 @@ +# +# Makefile for the linux reiser-filesystem routines. +# + +obj-$(CONFIG_REISERFS_FS) += reiserfs.o + +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ + hashes.o tail_conversion.o journal.o resize.o \ + item_ops.o ioctl.o xattr.o lock.o + +ifeq ($(CONFIG_REISERFS_PROC_INFO),y) +reiserfs-objs += procfs.o +endif + +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + +# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline +# functions are used. This causes the compiler to advance the stack +# pointer out of the available stack space, corrupting kernel space, +# and causing a panic. Since this behavior only affects ppc32, this ifeq +# will work around it. If any other architecture displays this behavior, +# add it here. +ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1) + +TAGS: + etags *.c + diff --git a/fs/reiserfs/README b/fs/reiserfs/README new file mode 100644 index 00000000..e2f7a264 --- /dev/null +++ b/fs/reiserfs/README @@ -0,0 +1,161 @@ +[LICENSING] + +ReiserFS is hereby licensed under the GNU General +Public License version 2. + +Source code files that contain the phrase "licensing governed by +reiserfs/README" are "governed files" throughout this file. Governed +files are licensed under the GPL. The portions of them owned by Hans +Reiser, or authorized to be licensed by him, have been in the past, +and likely will be in the future, licensed to other parties under +other licenses. 
If you add your code to governed files, and don't +want it to be owned by Hans Reiser, put your copyright label on that +code so the poor blight and his customers can keep things straight. +All portions of governed files not labeled otherwise are owned by Hans +Reiser, and by adding your code to it, widely distributing it to +others or sending us a patch, and leaving the sentence in stating that +licensing is governed by the statement in this file, you accept this. +It will be a kindness if you identify whether Hans Reiser is allowed +to license code labeled as owned by you on your behalf other than +under the GPL, because he wants to know if it is okay to do so and put +a check in the mail to you (for non-trivial improvements) when he +makes his next sale. He makes no guarantees as to the amount if any, +though he feels motivated to motivate contributors, and you can surely +discuss this with him before or after contributing. You have the +right to decline to allow him to license your code contribution other +than under the GPL. + +Further licensing options are available for commercial and/or other +interests directly from Hans Reiser: hans@reiser.to. If you interpret +the GPL as not allowing those additional licensing options, you read +it wrongly, and Richard Stallman agrees with me, when carefully read +you can see that those restrictions on additional terms do not apply +to the owner of the copyright, and my interpretation of this shall +govern for this license. + +Finally, nothing in this license shall be interpreted to allow you to +fail to fairly credit me, or to remove my credits, without my +permission, unless you are an end user not redistributing to others. +If you have doubts about how to properly do that, or about what is +fair, ask. (Last I spoke with him Richard was contemplating how best +to address the fair crediting issue in the next GPL version.) + +[END LICENSING] + +Reiserfs is a file system based on balanced tree algorithms, which is +described at https://reiser4.wiki.kernel.org/index.php/Main_Page + +Stop reading here. Go there, then return. + +Send bug reports to yura@namesys.botik.ru. + +mkreiserfs and other utilities are in reiserfs/utils, or wherever your +Linux provider put them. There is some disagreement about how useful +it is for users to get their fsck and mkreiserfs out of sync with the +version of reiserfs that is in their kernel, with many important +distributors wanting them out of sync.:-) Please try to remember to +recompile and reinstall fsck and mkreiserfs with every update of +reiserfs, this is a common source of confusion. Note that some of the +utilities cannot be compiled without accessing the balancing code +which is in the kernel code, and relocating the utilities may require +you to specify where that code can be found. + +Yes, if you update your reiserfs kernel module you do have to +recompile your kernel, most of the time. The errors you get will be +quite cryptic if your forget to do so. + +Real users, as opposed to folks who want to hack and then understand +what went wrong, will want REISERFS_CHECK off. + +Hideous Commercial Pitch: Spread your development costs across other OS +vendors. Select from the best in the world, not the best in your +building, by buying from third party OS component suppliers. Leverage +the software component development power of the internet. 
Be the most +aggressive in taking advantage of the commercial possibilities of +decentralized internet development, and add value through your branded +integration that you sell as an operating system. Let your competitors +be the ones to compete against the entire internet by themselves. Be +hip, get with the new economic trend, before your competitors do. Send +email to hans@reiser.to. + +To understand the code, after reading the website, start reading the +code by reading reiserfs_fs.h first. + +Hans Reiser was the project initiator, primary architect, source of all +funding for the first 5.5 years, and one of the programmers. He owns +the copyright. + +Vladimir Saveljev was one of the programmers, and he worked long hours +writing the cleanest code. He always made the effort to be the best he +could be, and to make his code the best that it could be. What resulted +was quite remarkable. I don't think that money can ever motivate someone +to work the way he did, he is one of the most selfless men I know. + +Yura helps with benchmarking, coding hashes, and block pre-allocation +code. + +Anatoly Pinchuk is a former member of our team who worked closely with +Vladimir throughout the project's development. He wrote a quite +substantial portion of the total code. He realized that there was a +space problem with packing tails of files for files larger than a node +that start on a node aligned boundary (there are reasons to want to node +align files), and he invented and implemented indirect items and +unformatted nodes as the solution. + +Konstantin Shvachko, with the help of the Russian version of a VC, +tried to put me in a position where I was forced into giving control +of the project to him. (Fortunately, as the person paying the money +for all salaries from my dayjob I owned all copyrights, and you can't +really force takeovers of sole proprietorships.) This was something +curious, because he never really understood the value of our project, +why we should do what we do, or why innovation was possible in +general, but he was sure that he ought to be controlling it. Every +innovation had to be forced past him while he was with us. He added +two years to the time required to complete reiserfs, and was a net +loss for me. Mikhail Gilula was a brilliant innovator who also left +in a destructive way that erased the value of his contributions, and +that he was shown much generosity just makes it more painful. + +Grigory Zaigralin was an extremely effective system administrator for +our group. + +Igor Krasheninnikov was wonderful at hardware procurement, repair, and +network installation. + +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a +textbook he got the algorithm from in the code. Note that his analysis +of how we could use the hashing code in making 32 bit NFS cookies work +was probably more important than the actual algorithm. Colin Plumb also +contributed to it. + +Chris Mason dived right into our code, and in just a few months produced +the journaling code that dramatically increased the value of ReiserFS. +He is just an amazing programmer. + +Igor Zagorovsky is writing much of the new item handler and extent code +for our next major release. + +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the +resizer, and is hard at work on implementing allocate on flush. SGI +implemented allocate on flush before us for XFS, and generously took +the time to convince me we should do it also. They are great people, +and a great company. 
+ +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. + +Vitaly Fertman is doing fsck. + +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably +the endian safe patches which allow ReiserFS to run on any platform +supported by the Linux kernel. + +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the +Alpha PC Company made it possible for me to not have a day job +anymore, and to dramatically increase our staffing. Ecila funded +hypertext feature development, MP3.com funded journaling, SuSE funded +core development, IntegratedLinux.com funded squid web cache +appliances, bigstorage.com funded HSM, and the alpha PC company funded +the alpha port. Many of these tasks were helped by sponsors other +than the ones just named. SuSE has helped in much more than just +funding.... + diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c new file mode 100644 index 00000000..483442e6 --- /dev/null +++ b/fs/reiserfs/bitmap.c @@ -0,0 +1,1300 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +/* Reiserfs block (de)allocator, bitmap-based. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREALLOCATION_SIZE 9 + +/* different reiserfs block allocator options */ + +#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) + +#define _ALLOC_concentrating_formatted_nodes 0 +#define _ALLOC_displacing_large_files 1 +#define _ALLOC_displacing_new_packing_localities 2 +#define _ALLOC_old_hashed_relocation 3 +#define _ALLOC_new_hashed_relocation 4 +#define _ALLOC_skip_busy 5 +#define _ALLOC_displace_based_on_dirid 6 +#define _ALLOC_hashed_formatted_nodes 7 +#define _ALLOC_old_way 8 +#define _ALLOC_hundredth_slices 9 +#define _ALLOC_dirid_groups 10 +#define _ALLOC_oid_groups 11 +#define _ALLOC_packing_groups 12 + +#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) +#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) +#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) + +#define SET_OPTION(optname) \ + do { \ + reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \ + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ + } while(0) +#define TEST_OPTION(optname, s) \ + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) + +static inline void get_bit_address(struct super_block *s, + b_blocknr_t block, + unsigned int *bmap_nr, + unsigned int *offset) +{ + /* It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. */ + *bmap_nr = block >> (s->s_blocksize_bits + 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block & ((s->s_blocksize << 3) - 1); +} + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) +{ + unsigned int bmap, offset; + unsigned int bmap_count = reiserfs_bmap_count(s); + + if (block == 0 || block >= SB_BLOCK_COUNT(s)) { + reiserfs_error(s, "vs-4010", + "block number is out of range %lu (%u)", + block, SB_BLOCK_COUNT(s)); + return 0; + } + + get_bit_address(s, block, &bmap, &offset); + + /* Old format filesystem? Unlikely, but the bitmaps are all up front so + * we need to account for it. 
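+ * (In the old format the bitmap blocks sit immediately after the super
+ * block, i.e. from s_sbh->b_blocknr + 1 onwards; the range check below
+ * refuses to free or reuse any of them.)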
*/ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &(REISERFS_SB(s)->s_properties)))) { + b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; + if (block >= bmap1 && + block <= bmap1 + bmap_count) { + reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } else { + if (offset == 0) { + reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } + + if (bmap >= bmap_count) { + reiserfs_error(s, "vs-4030", "bitmap for requested block " + "is out of range: block=%lu, bitmap_nr=%u", + block, bmap); + return 0; + } + + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { + reiserfs_error(s, "vs-4050", "this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK(s)); + return 0; + } + + return 1; +} + +/* searches in journal structures for a given block number (bmap, off). If block + is found in reiserfs journal it suggests next free block candidate to test. */ +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, + int off, int *next) +{ + b_blocknr_t tmp; + + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { + if (tmp) { /* hint supplied */ + *next = tmp; + PROC_INFO_INC(s, scan_bitmap.in_journal_hint); + } else { + (*next) = off + 1; /* inc offset to avoid looping. */ + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); + } + PROC_INFO_INC(s, scan_bitmap.retry); + return 1; + } + return 0; +} + +/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap + * block; */ +static int scan_bitmap_block(struct reiserfs_transaction_handle *th, + unsigned int bmap_n, int *beg, int boundary, + int min, int max, int unfm) +{ + struct super_block *s = th->t_super; + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; + struct buffer_head *bh; + int end, next; + int org = *beg; + + BUG_ON(!th->t_trans_id); + + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); + PROC_INFO_INC(s, scan_bitmap.bmap); +/* this is unclear and lacks comments, explain how journal bitmaps + work here for the reader. Convey a sense of the design here. What + is a window? */ +/* - I mean `a window of zero bits' as in description of this function - Zam. 
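+ * (i.e. a run of consecutive clear bits in this bitmap block, at least
+ * `min' and at most `max' bits long, located between *beg and `boundary';
+ * the loop below finds such a run and marks it allocated.)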
*/ + + if (!bi) { + reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " + "for bitmap %d", bmap_n); + return 0; + } + + bh = reiserfs_read_bitmap_block(s, bmap_n); + if (bh == NULL) + return 0; + + while (1) { + cont: + if (bi->free_count < min) { + brelse(bh); + return 0; // No free blocks in this bitmap + } + + /* search for a first zero bit -- beginning of a window */ + *beg = reiserfs_find_next_zero_le_bit + ((unsigned long *)(bh->b_data), boundary, *beg); + + if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size */ + brelse(bh); + return 0; + } + + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) + continue; + /* first zero bit found; we check next bits */ + for (end = *beg + 1;; end++) { + if (end >= *beg + max || end >= boundary + || reiserfs_test_le_bit(end, bh->b_data)) { + next = end; + break; + } + /* finding the other end of zero bit window requires looking into journal structures (in + * case of searching for free blocks for unformatted nodes) */ + if (unfm && is_block_in_journal(s, bmap_n, end, &next)) + break; + } + + /* now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end */ + if (end - *beg >= min) { /* it seems we have found window of proper size */ + int i; + reiserfs_prepare_for_journal(s, bh, 1); + /* try to set all blocks used checking are they still free */ + for (i = *beg; i < end; i++) { + /* It seems that we should not check in journal again. */ + if (reiserfs_test_and_set_le_bit + (i, bh->b_data)) { + /* bit was set by another process + * while we slept in prepare_for_journal() */ + PROC_INFO_INC(s, scan_bitmap.stolen); + if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, + * if length of this set is more or equal to `min' */ + end = i; + break; + } + /* otherwise we clear all bit were set ... */ + while (--i >= *beg) + reiserfs_test_and_clear_le_bit + (i, bh->b_data); + reiserfs_restore_prepared_buffer(s, bh); + *beg = org; + /* ... and search again in current block from beginning */ + goto cont; + } + } + bi->free_count -= (end - *beg); + journal_mark_dirty(th, s, bh); + brelse(bh); + + /* free block count calculation */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + + return end - (*beg); + } else { + *beg = next; + } + } +} + +static int bmap_hash_id(struct super_block *s, u32 id) +{ + char *hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % reiserfs_bmap_count(s); + if (!bm) + bm = 1; + } + /* this can only be true when SB_BMAP_NR = 1 */ + if (bm >= reiserfs_bmap_count(s)) + bm = 0; + return bm; +} + +/* + * hashes the id and then returns > 0 if the block group for the + * corresponding hash is full + */ +static inline int block_group_used(struct super_block *s, u32 id) +{ + int bm = bmap_hash_id(s, id); + struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; + + /* If we don't have cached information on this bitmap block, we're + * going to have to load it later anyway. Loading it here allows us + * to make a better decision. This favors long-term performance gain + * with a better on-disk layout vs. a short term gain of skipping the + * read and potentially having a bad placement. 
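+ * (As a worked example, with 4 KiB blocks the test further down reports the
+ * group as full unless more than 4096 * 8 * 60 / 100 = 19660 of its 32768
+ * blocks are still free.)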
*/ + if (info->free_count == UINT_MAX) { + struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); + brelse(bh); + } + + if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) { + return 0; + } + return 1; +} + +/* + * the packing is returned in disk byte order + */ +__le32 reiserfs_choose_packing(struct inode * dir) +{ + __le32 packing; + if (TEST_OPTION(packing_groups, dir->i_sb)) { + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); + /* + * some versions of reiserfsck expect packing locality 1 to be + * special + */ + if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir)) + packing = INODE_PKEY(dir)->k_objectid; + else + packing = INODE_PKEY(dir)->k_dir_id; + } else + packing = INODE_PKEY(dir)->k_objectid; + return packing; +} + +/* Tries to find contiguous zero bit window (given size) in given region of + * bitmap and place new blocks there. Returns number of allocated blocks. */ +static int scan_bitmap(struct reiserfs_transaction_handle *th, + b_blocknr_t * start, b_blocknr_t finish, + int min, int max, int unfm, sector_t file_block) +{ + int nr_allocated = 0; + struct super_block *s = th->t_super; + /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr + * - Hans, it is not a block number - Zam. */ + + unsigned int bm, off; + unsigned int end_bm, end_off; + unsigned int off_max = s->s_blocksize << 3; + + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(s, scan_bitmap.call); + if (SB_FREE_BLOCKS(s) <= 0) + return 0; // No point in looking for more free blocks + + get_bit_address(s, *start, &bm, &off); + get_bit_address(s, finish, &end_bm, &end_off); + if (bm > reiserfs_bmap_count(s)) + return 0; + if (end_bm > reiserfs_bmap_count(s)) + end_bm = reiserfs_bmap_count(s); + + /* When the bitmap is more than 10% free, anyone can allocate. + * When it's less than 10% free, only files that already use the + * bitmap are allowed. Once we pass 80% full, this restriction + * is lifted. + * + * We do this so that files that grow later still have space close to + * their original allocation. This improves locality, and presumably + * performance as a result. + * + * This is only an allocation policy and does not make up for getting a + * bad hint. Decent hinting must be implemented for this to work well. 
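+ * + * (Concretely, in the code below: the skip_busy policy is applied only while SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20, and a bitmap block is open to any allocation once its free_count exceeds (s->s_blocksize << 3) / 10, i.e. ten percent of the bits it covers; if that restricted scan finds nothing, the unrestricted scan further below is still tried.)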
+ */ + if (TEST_OPTION(skip_busy, s) + && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) { + for (; bm < end_bm; bm++, off = 0) { + if ((off && (!unfm || (file_block != 0))) + || SB_AP_BITMAP(s)[bm].free_count > + (s->s_blocksize << 3) / 10) + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, + min, max, unfm); + if (nr_allocated) + goto ret; + } + /* we know from above that start is a reasonable number */ + get_bit_address(s, *start, &bm, &off); + } + + for (; bm < end_bm; bm++, off = 0) { + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); + if (nr_allocated) + goto ret; + } + + nr_allocated = + scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); + + ret: + *start = bm * off_max + off; + return nr_allocated; + +} + +static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs; + struct buffer_head *sbh, *bmbh; + struct reiserfs_bitmap_info *apbi; + unsigned int nr, offset; + + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(s, free_block); + + rs = SB_DISK_SUPER_BLOCK(s); + sbh = SB_BUFFER_WITH_SB(s); + apbi = SB_AP_BITMAP(s); + + get_bit_address(s, block, &nr, &offset); + + if (nr >= reiserfs_bmap_count(s)) { + reiserfs_error(s, "vs-4075", "block %lu is out of range", + block); + return; + } + + bmbh = reiserfs_read_bitmap_block(s, nr); + if (!bmbh) + return; + + reiserfs_prepare_for_journal(s, bmbh, 1); + + /* clear bit for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { + reiserfs_error(s, "vs-4080", + "block %lu: bit already cleared", block); + } + apbi[nr].free_count++; + journal_mark_dirty(th, s, bmbh); + brelse(bmbh); + + reiserfs_prepare_for_journal(s, sbh, 1); + /* update super block */ + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); + + journal_mark_dirty(th, s, sbh); + if (for_unformatted) + dquot_free_block_nodirty(inode, 1); +} + +void reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + BUG_ON(!th->t_trans_id); + + RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); + if (!is_reusable(s, block, 1)) + return; + + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { + reiserfs_error(th->t_super, "bitmap-4072", + "Trying to free block outside file system " + "boundaries (%lu > %lu)", + block, sb_block_count(REISERFS_SB(s)->s_rs)); + return; + } + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block); + _reiserfs_free_block(th, inode, block, for_unformatted); +} + +/* preallocated blocks don't need to be run through journal_mark_freed */ +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block) +{ + BUG_ON(!th->t_trans_id); + RFALSE(!th->t_super, + "vs-4060: trying to free block on nonexistent device"); + if (!is_reusable(th->t_super, block, 1)) + return; + _reiserfs_free_block(th, inode, block, 1); +} + +static void __discard_prealloc(struct reiserfs_transaction_handle *th, + struct reiserfs_inode_info *ei) +{ + unsigned long save = ei->i_prealloc_block; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; + BUG_ON(!th->t_trans_id); +#ifdef CONFIG_REISERFS_CHECK + if (ei->i_prealloc_count < 0) + reiserfs_error(th->t_super, "zam-4001", + "inode has negative prealloc blocks count."); +#endif + while (ei->i_prealloc_count > 
0) { + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); + ei->i_prealloc_block++; + ei->i_prealloc_count--; + dirty = 1; + } + if (dirty) + reiserfs_update_sd(th, inode); + ei->i_prealloc_block = save; + list_del_init(&(ei->i_prealloc_list)); +} + +/* FIXME: It should be inline function */ +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON(!th->t_trans_id); + if (ei->i_prealloc_count) + __discard_prealloc(th, ei); +} + +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) +{ + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + + BUG_ON(!th->t_trans_id); + + while (!list_empty(plist)) { + struct reiserfs_inode_info *ei; + ei = list_entry(plist->next, struct reiserfs_inode_info, + i_prealloc_list); +#ifdef CONFIG_REISERFS_CHECK + if (!ei->i_prealloc_count) { + reiserfs_error(th->t_super, "zam-4001", + "inode is in prealloc list but has " + "no preallocated blocks."); + } +#endif + __discard_prealloc(th, ei); + } +} + +void reiserfs_init_alloc_options(struct super_block *s) +{ + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); +} + +/* block allocator related options are parsed here */ +int reiserfs_parse_alloc_options(struct super_block *s, char *options) +{ + char *this_char, *value; + + REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ + + while ((this_char = strsep(&options, ":")) != NULL) { + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, "concentrating_formatted_nodes")) { + int temp; + SET_OPTION(concentrating_formatted_nodes); + temp = (value + && *value) ? simple_strtoul(value, &value, + 0) : 10; + if (temp <= 0 || temp > 100) { + REISERFS_SB(s)->s_alloc_options.border = 10; + } else { + REISERFS_SB(s)->s_alloc_options.border = + 100 / temp; + } + continue; + } + if (!strcmp(this_char, "displacing_large_files")) { + SET_OPTION(displacing_large_files); + REISERFS_SB(s)->s_alloc_options.large_file_size = + (value + && *value) ? simple_strtoul(value, &value, 0) : 16; + continue; + } + if (!strcmp(this_char, "displacing_new_packing_localities")) { + SET_OPTION(displacing_new_packing_localities); + continue; + }; + + if (!strcmp(this_char, "old_hashed_relocation")) { + SET_OPTION(old_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "new_hashed_relocation")) { + SET_OPTION(new_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } + if (!strcmp(this_char, "hashed_formatted_nodes")) { + SET_OPTION(hashed_formatted_nodes); + continue; + } + + if (!strcmp(this_char, "skip_busy")) { + SET_OPTION(skip_busy); + continue; + } + + if (!strcmp(this_char, "hundredth_slices")) { + SET_OPTION(hundredth_slices); + continue; + } + + if (!strcmp(this_char, "old_way")) { + SET_OPTION(old_way); + continue; + } + + if (!strcmp(this_char, "displace_based_on_dirid")) { + SET_OPTION(displace_based_on_dirid); + continue; + } + + if (!strcmp(this_char, "preallocmin")) { + REISERFS_SB(s)->s_alloc_options.preallocmin = + (value + && *value) ? 
simple_strtoul(value, &value, 0) : 4; + continue; + } + + if (!strcmp(this_char, "preallocsize")) { + REISERFS_SB(s)->s_alloc_options.preallocsize = + (value + && *value) ? simple_strtoul(value, &value, + 0) : + PREALLOCATION_SIZE; + continue; + } + + reiserfs_warning(s, "zam-4001", "unknown option - %s", + this_char); + return 1; + } + + reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); + return 0; +} + +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + if (hint->formatted_node) { + hash_in = (char *)&hint->key.k_dir_id; + } else { + if (!hint->inode) { + //hint->search_start = hint->beg; + hash_in = (char *)&hint->key.k_dir_id; + } else + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = + (char *)(&INODE_PKEY(hint->inode)->k_objectid); + } + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +/* + * Relocation based on dirid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void dirid_groups(reiserfs_blocknr_hint_t * hint) +{ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; + if (hint->inode) + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize / 2; + hint->search_start = hash; + } +} + +/* + * Relocation based on oid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void oid_groups(reiserfs_blocknr_hint_t * hint) +{ + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* keep the root dir and it's first set of subdirs close to + * the start of the disk + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; + } +} + +/* returns 1 if it finds an indirect item and gets valid hint info + * from it, otherwise 0 + */ +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) +{ + struct treepath *path; + struct buffer_head *bh; + struct item_head *ih; + int pos_in_item; + __le32 *item; + int ret = 0; + + if (!hint->path) /* reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start */ + return 0; + + path = hint->path; + bh = get_last_bh(path); + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); + ih = get_ih(path); + pos_in_item = path->pos_in_item; + item = get_item(path); + + hint->search_start = bh->b_blocknr; + + if (!hint->formatted_node && is_indirect_le_ih(ih)) { + /* for indirect item: go to left and look for the first non-hole entry + in the indirect item */ + if (pos_in_item == I_UNFM_NUM(ih)) + pos_in_item--; +// pos_in_item = I_UNFM_NUM (ih) - 1; + while (pos_in_item >= 0) { + int t = get_block_num(item, pos_in_item); + if (t) { + hint->search_start = t; + ret = 1; + break; + } + pos_in_item--; + } + } + + /* does result value fit into specified region? 
*/ + return ret; +} + +/* should be, if formatted node, then try to put on first part of the device + specified as number of percent with mount option device, else try to put + on last of device. This is not to say it is good code to do so, + but the effect should be measured. */ +static inline void set_border_in_hint(struct super_block *s, + reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border = + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; + + if (hint->formatted_node) + hint->end = border - 1; + else + hint->beg = border; +} + +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) +{ + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), + 4) % (hint->end - hint->beg); + else + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), + 4) % (hint->end - hint->beg); +} + +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + + if (!hint->inode) + hash_in = (char *)&hint->key.k_dir_id; + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +static inline int +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * + hint) +{ + return hint->block == + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; +} + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + + hint->th->displace_new_blocks = 0; + hint->search_start = + hint->beg + keyed_hash((char *)(&key->k_objectid), + 4) % (hint->end - hint->beg); +} +#endif + +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + u32 hash_in; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); + border = + hint->beg + (u32) keyed_hash(((char *)(&hash_in)), + 4) % (hint->end - hint->beg - 1); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline int old_way(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + border = + hint->beg + + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - + hint->beg); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + b_blocknr_t slice_start; + + slice_start = + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); + if (slice_start > hint->search_start + || slice_start + (hint->end / 100) <= hint->search_start) { + hint->search_start = slice_start; + } +} + +static void determine_search_start(reiserfs_blocknr_hint_t * hint, + int amount_needed) +{ + struct super_block *s = hint->th->t_super; + int unfm_hint; + + hint->beg = 0; + hint->end = SB_BLOCK_COUNT(s) - 1; + + /* This is former border algorithm. Now with tunable border offset */ + if (concentrating_formatted_nodes(s)) + set_border_in_hint(s, hint); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* whenever we create a new directory, we displace it. 
At first we will + hash for location, later we might look for a moderately empty place for + it */ + if (displacing_new_packing_localities(s) + && hint->th->displace_new_blocks) { + displace_new_packing_locality(hint); + + /* we do not continue determine_search_start, + * if new packing locality is being displaced */ + return; + } +#endif + + /* all persons should feel encouraged to add more special cases here and + * test them */ + + if (displacing_large_files(s) && !hint->formatted_node + && this_blocknr_allocation_would_make_it_a_large_file(hint)) { + displace_large_file(hint); + return; + } + + /* if none of our special cases is relevant, use the left neighbor in the + tree order of the new node we are allocating for */ + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { + hash_formatted_node(hint); + return; + } + + unfm_hint = get_left_neighbor(hint); + + /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, + new blocks are displaced based on directory ID. Also, if suggested search_start + is less than last preallocated block, we start searching from it, assuming that + HDD dataflow is faster in forward direction */ + if (TEST_OPTION(old_way, s)) { + if (!hint->formatted_node) { + if (!reiserfs_hashed_relocation(s)) + old_way(hint); + else if (!reiserfs_no_unhashed_relocation(s)) + old_hashed_relocation(hint); + + if (hint->inode + && hint->search_start < + REISERFS_I(hint->inode)->i_prealloc_block) + hint->search_start = + REISERFS_I(hint->inode)->i_prealloc_block; + } + return; + } + + /* This is an approach proposed by Hans */ + if (TEST_OPTION(hundredth_slices, s) + && !(displacing_large_files(s) && !hint->formatted_node)) { + hundredth_slices(hint); + return; + } + + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) { + old_hashed_relocation(hint); + } + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) { + new_hashed_relocation(hint); + } + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#endif + + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { + oid_groups(hint); + } + return; +} + +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) +{ + /* make minimum size a mount option and benchmark both ways */ + /* we preallocate blocks only for regular files, specific size */ + /* benchmark preallocating always and see what happens */ + + hint->prealloc_size = 0; + + if (!hint->formatted_node && hint->preallocate) { + if (S_ISREG(hint->inode->i_mode) + && hint->inode->i_size >= + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocmin * hint->inode->i_sb->s_blocksize) + hint->prealloc_size = + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocsize - 1; + } + return CARRY_ON; +} + +/* XXX I know it could be merged with upper-level function; + but may be result function would be too complex. 
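+ (For reference: the function repeatedly scans the bitmap between start and finish, filling new_blocknrs[] with up to amount_needed consecutive block numbers per scan; if the final scan yields more blocks than are still needed, the surplus is recorded as the inode's preallocation window instead of being returned.)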
*/ +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + b_blocknr_t start, + b_blocknr_t finish, int min, + int amount_needed, + int prealloc_size) +{ + int rest = amount_needed; + int nr_allocated; + + while (rest > 0 && start <= finish) { + nr_allocated = scan_bitmap(hint->th, &start, finish, min, + rest + prealloc_size, + !hint->formatted_node, hint->block); + + if (nr_allocated == 0) /* no new blocks allocated, return */ + break; + + /* fill free_blocknrs array first */ + while (rest > 0 && nr_allocated > 0) { + *new_blocknrs++ = start++; + rest--; + nr_allocated--; + } + + /* do we have something to fill prealloc. array also ? */ + if (nr_allocated > 0) { + /* it means prealloc_size was greater that 0 and we do preallocation */ + list_add(&REISERFS_I(hint->inode)->i_prealloc_list, + &SB_JOURNAL(hint->th->t_super)-> + j_prealloc_list); + REISERFS_I(hint->inode)->i_prealloc_block = start; + REISERFS_I(hint->inode)->i_prealloc_count = + nr_allocated; + break; + } + } + + return (amount_needed - rest); +} + +static inline int blocknrs_and_prealloc_arrays_from_search_start + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, + int amount_needed) { + struct super_block *s = hint->th->t_super; + b_blocknr_t start = hint->search_start; + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; + int passno = 0; + int nr_allocated = 0; + + determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating %d blocks id=%u", + amount_needed, hint->inode->i_uid); +#endif + quota_ret = + dquot_alloc_block_nodirty(hint->inode, amount_needed); + if (quota_ret) /* Quota exceeded? */ + return QUOTA_EXCEEDED; + if (hint->preallocate && hint->prealloc_size) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating (prealloc) %d blocks id=%u", + hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = dquot_prealloc_block_nodirty(hint->inode, + hint->prealloc_size); + if (quota_ret) + hint->preallocate = hint->prealloc_size = 0; + } + /* for unformatted nodes, force large allocations */ + } + + do { + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (nospace) %d blocks id=%u", + amount_needed + + hint->prealloc_size - + nr_allocated, + hint->inode->i_uid); +#endif + /* Free not allocated blocks */ + dquot_free_block_nodirty(hint->inode, + amount_needed + hint->prealloc_size - + nr_allocated); + } + while (nr_allocated--) + reiserfs_free_block(hint->th, hint->inode, + new_blocknrs[nr_allocated], + !hint->formatted_node); + + return NO_DISK_SPACE; + } + } while ((nr_allocated += allocate_without_wrapping_disk(hint, + new_blocknrs + + nr_allocated, + start, finish, + 1, + amount_needed - + nr_allocated, + hint-> + prealloc_size)) + < amount_needed); + if (!hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + 
REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (failed prealloc) %d blocks id=%u", + amount_needed + hint->prealloc_size - + nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count, + hint->inode->i_uid); +#endif + dquot_free_block_nodirty(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)-> + i_prealloc_count); + } + + return CARRY_ON; +} + +/* grab new blocknrs from preallocated list */ +/* return amount still needed after using them */ +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + int amount_needed) +{ + struct inode *inode = hint->inode; + + if (REISERFS_I(inode)->i_prealloc_count > 0) { + while (amount_needed) { + + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; + REISERFS_I(inode)->i_prealloc_count--; + + amount_needed--; + + if (REISERFS_I(inode)->i_prealloc_count <= 0) { + list_del(&REISERFS_I(inode)->i_prealloc_list); + break; + } + } + } + /* return amount still needed after using preallocated blocks */ + return amount_needed; +} + +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have + already reserved */ ) +{ + int initial_amount_needed = amount_needed; + int ret; + struct super_block *s = hint->th->t_super; + + /* Check if there is enough space, taking into account reserved space */ + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < + amount_needed - reserved_by_us) + return NO_DISK_SPACE; + /* should this be if !hint->inode && hint->preallocate? */ + /* do you mean hint->formatted_node can be removed ? - Zam */ + /* hint->formatted_node cannot be removed because we try to access + inode information here, and there is often no inode assotiated with + metadata allocations - green */ + + if (!hint->formatted_node && hint->preallocate) { + amount_needed = use_preallocated_list_if_available + (hint, new_blocknrs, amount_needed); + if (amount_needed == 0) /* all blocknrs we need we got from + prealloc. list */ + return CARRY_ON; + new_blocknrs += (initial_amount_needed - amount_needed); + } + + /* find search start and save it in hint structure */ + determine_search_start(hint, amount_needed); + if (hint->search_start >= SB_BLOCK_COUNT(s)) + hint->search_start = SB_BLOCK_COUNT(s) - 1; + + /* allocation itself; fill new_blocknrs and preallocation arrays */ + ret = blocknrs_and_prealloc_arrays_from_search_start + (hint, new_blocknrs, amount_needed); + + /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we + * need to return blocks back to prealloc. list or just free them. 
-- Zam (I chose second + * variant) */ + + if (ret != CARRY_ON) { + while (amount_needed++ < initial_amount_needed) { + reiserfs_free_block(hint->th, hint->inode, + *(--new_blocknrs), 1); + } + } + return ret; +} + +void reiserfs_cache_bitmap_metadata(struct super_block *sb, + struct buffer_head *bh, + struct reiserfs_bitmap_info *info) +{ + unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); + + /* The first bit must ALWAYS be 1 */ + if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)) + reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is " + "corrupted: first bit must be 1", bh->b_blocknr); + + info->free_count = 0; + + while (--cur >= (unsigned long *)bh->b_data) { + int i; + + /* 0 and ~0 are special, we can optimize for them */ + if (*cur == 0) + info->free_count += BITS_PER_LONG; + else if (*cur != ~0L) /* A mix, investigate */ + for (i = BITS_PER_LONG - 1; i >= 0; i--) + if (!reiserfs_test_le_bit(i, cur)) + info->free_count++; + } +} + +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, + unsigned int bitmap) +{ + b_blocknr_t block = (sb->s_blocksize << 3) * bitmap; + struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; + struct buffer_head *bh; + + /* Way old format filesystems had the bitmaps packed up front. + * I doubt there are any of these left, but just in case... */ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &(REISERFS_SB(sb)->s_properties)))) + block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; + else if (bitmap == 0) + block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; + + reiserfs_write_unlock(sb); + bh = sb_bread(sb, block); + reiserfs_write_lock(sb); + if (bh == NULL) + reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " + "reading failed", __func__, block); + else { + if (buffer_locked(bh)) { + PROC_INFO_INC(sb, scan_bitmap.wait); + reiserfs_write_unlock(sb); + __wait_on_buffer(bh); + reiserfs_write_lock(sb); + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(atomic_read(&bh->b_count) == 0); + + if (info->free_count == UINT_MAX) + reiserfs_cache_bitmap_metadata(sb, bh, info); + } + + return bh; +} + +int reiserfs_init_bitmap_cache(struct super_block *sb) +{ + struct reiserfs_bitmap_info *bitmap; + unsigned int bmap_nr = reiserfs_bmap_count(sb); + + /* Avoid lock recursion in fault case */ + reiserfs_write_unlock(sb); + bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + reiserfs_write_lock(sb); + if (bitmap == NULL) + return -ENOMEM; + + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); + + SB_AP_BITMAP(sb) = bitmap; + + return 0; +} + +void reiserfs_free_bitmap_cache(struct super_block *sb) +{ + if (SB_AP_BITMAP(sb)) { + vfree(SB_AP_BITMAP(sb)); + SB_AP_BITMAP(sb) = NULL; + } +} diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c new file mode 100644 index 00000000..198dabf1 --- /dev/null +++ b/fs/reiserfs/dir.c @@ -0,0 +1,310 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern const struct reiserfs_key MIN_KEY; + +static int reiserfs_readdir(struct file *, void *, filldir_t); +static int reiserfs_dir_fsync(struct file *filp, int datasync); + +const struct file_operations reiserfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = reiserfs_readdir, + .fsync = reiserfs_dir_fsync, + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif +}; + +static int reiserfs_dir_fsync(struct file 
*filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (err < 0) + return err; + return 0; +} + +#define store_ih(where,what) copy_item_head (where, what) + +static inline bool is_privroot_deh(struct dentry *dir, + struct reiserfs_de_head *deh) +{ + struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; + return (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); +} + +int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, + filldir_t filldir, loff_t *pos) +{ + struct inode *inode = dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH(path_to_entry); + struct buffer_head *bh; + int item_num, entry_num; + const struct reiserfs_key *rkey; + struct item_head *ih, tmp_ih; + int search_res; + char *local_buf; + loff_t next_pos; + char small_buf[32]; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + int ret = 0; + + reiserfs_write_lock(inode->i_sb); + + reiserfs_check_lock_depth(inode->i_sb, "readdir"); + + /* form key for search the next directory entry using f_pos field of + file structure */ + make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); + next_pos = cpu_key_k_offset(&pos_key); + + path_to_entry.reada = PATH_READA; + while (1) { + research: + /* search the directory item, containing entry with specified key */ + search_res = + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, + &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + ret = -EIO; + goto out; + } + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + RFALSE(item_num > B_NR_ITEMS(bh) - 1, + "vs-9005 item_num == %d, item amount == %d", + item_num, B_NR_ITEMS(bh)); + + /* and entry must be not more than number of entries in the item */ + RFALSE(I_ENTRY_COUNT(ih) < entry_num, + "vs-9010: entry number is too big %d (%d)", + entry_num, I_ENTRY_COUNT(ih)); + + if (search_res == POSITION_FOUND + || entry_num < I_ENTRY_COUNT(ih)) { + /* go through all entries in the directory item beginning from the entry, that has been found */ + struct reiserfs_de_head *deh = + B_I_DEH(bh, ih) + entry_num; + + for (; entry_num < I_ENTRY_COUNT(ih); + entry_num++, deh++) { + int d_reclen; + char *d_name; + off_t d_off; + ino_t d_ino; + + if (!de_visible(deh)) + /* it is hidden entry */ + continue; + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); + + if (d_reclen <= 0 || + d_name + d_reclen > bh->b_data + bh->b_size) { + /* There is corrupted data in entry, + * We'd better stop here */ + pathrelse(&path_to_entry); + ret = -EIO; + goto out; + } + + if (!d_name[d_reclen - 1]) + d_reclen = strlen(d_name); + + if (d_reclen > + REISERFS_MAX_NAME(inode->i_sb-> + s_blocksize)) { + /* too big to send back to VFS */ + continue; + } + + /* Ignore the .reiserfs_priv entry */ + if (is_privroot_deh(dentry, deh)) + continue; + + d_off = deh_offset(deh); + *pos = d_off; + d_ino = deh_objectid(deh); + if (d_reclen <= 32) { + local_buf = small_buf; + } else { + 
local_buf = kmalloc(d_reclen, + GFP_NOFS); + if (!local_buf) { + pathrelse(&path_to_entry); + ret = -ENOMEM; + goto out; + } + if (item_moved(&tmp_ih, &path_to_entry)) { + kfree(local_buf); + goto research; + } + } + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy(local_buf, d_name, d_reclen); + + /* + * Since filldir might sleep, we can release + * the write lock here for other waiters + */ + reiserfs_write_unlock(inode->i_sb); + if (filldir + (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + reiserfs_write_lock(inode->i_sb); + if (local_buf != small_buf) { + kfree(local_buf); + } + goto end; + } + reiserfs_write_lock(inode->i_sb); + if (local_buf != small_buf) { + kfree(local_buf); + } + // next entry should be looked for with such offset + next_pos = deh_offset(deh) + 1; + + if (item_moved(&tmp_ih, &path_to_entry)) { + goto research; + } + } /* for */ + } + + if (item_num != B_NR_ITEMS(bh) - 1) + // end of directory has been reached + goto end; + + /* item we went through is last item of node. Using right + delimiting key check is it directory end */ + rkey = get_rkey(&path_to_entry, inode->i_sb); + if (!comp_le_keys(rkey, &MIN_KEY)) { + /* set pos_key to key, that is the smallest and greater + that key of the last entry in the item */ + set_cpu_key_k_offset(&pos_key, next_pos); + continue; + } + + if (COMP_SHORT_KEYS(rkey, &pos_key)) { + // end of directory has been reached + goto end; + } + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset(&pos_key, + le_key_k_offset(KEY_FORMAT_3_5, rkey)); + + } /* while */ + +end: + *pos = next_pos; + pathrelse(&path_to_entry); + reiserfs_check_path(&path_to_entry); +out: + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = file->f_path.dentry; + return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos); +} + +/* compose directory item containing "." and ".." entries (entries are + not aligned to 4 byte boundary) */ +/* the last four params are LE */ +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *deh; + + memset(body, 0, EMPTY_DIR_SIZE_V1); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset(&(deh[0]), DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen("..")); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy(body + deh_location(&(deh[0])), ".", 1); + memcpy(body + deh_location(&(deh[1])), "..", 2); +} + +/* compose directory item containing "." and ".." 
entries */ +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *deh; + + memset(body, 0, EMPTY_DIR_SIZE); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset(&(deh[0]), DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[1]), + deh_location(&(deh[0])) - ROUND_UP(strlen(".."))); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy(body + deh_location(&(deh[0])), ".", 1); + memcpy(body + deh_location(&(deh[1])), "..", 2); +} diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c new file mode 100644 index 00000000..60c08044 --- /dev/null +++ b/fs/reiserfs/do_balan.c @@ -0,0 +1,2074 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* Now we have all buffers that must be used in balancing of the tree */ +/* Further calculations can not cause schedule(), and thus the buffer */ +/* tree will be stable until the balancing will be finished */ +/* balance the tree according to the analysis made before, */ +/* and using buffers obtained after all above. */ + +/** + ** balance_leaf_when_delete + ** balance_leaf + ** do_balance + ** + **/ + +#include +#include +#include +#include +#include + +static inline void buffer_info_init_left(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->L[0]; + bi->bi_parent = tb->FL[0]; + bi->bi_position = get_left_neighbor_position(tb, 0); +} + +static inline void buffer_info_init_right(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->R[0]; + bi->bi_parent = tb->FR[0]; + bi->bi_position = get_right_neighbor_position(tb, 0); +} + +static inline void buffer_info_init_tbS0(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi->bi_position = PATH_H_POSITION(tb->tb_path, 1); +} + +static inline void buffer_info_init_bh(struct tree_balance *tb, + struct buffer_info *bi, + struct buffer_head *bh) +{ + bi->tb = tb; + bi->bi_bh = bh; + bi->bi_parent = NULL; + bi->bi_position = 0; +} + +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag) +{ + journal_mark_dirty(tb->transaction_handle, + tb->transaction_handle->t_super, bh); +} + +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +/* summary: + if deleting something ( tb->insert_size[0] < 0 ) + return(balance_leaf_when_delete()); (flag d handled here) + else + if lnum is larger than 0 we put items into the left node + if rnum is larger than 0 we put items into the right node + if snum1 is larger than 0 we put items into the new node s1 + if snum2 is larger than 0 we put items into the new node s2 +Note that all *num* count new items being created. 
+ +It would be easier to read balance_leaf() if each of these summary +lines was a separate procedure rather than being inlined. I think +that there are many passages here and in balance_leaf_when_delete() in +which two calls to one procedure can replace two passages, and it +might save cache space and improve software maintenance costs to do so. + +Vladimir made the perceptive comment that we should offload most of +the decision making in this function into fix_nodes/check_balance, and +then create some sort of structure in tb that says what actions should +be performed by do_balance. + +-Hans */ + +/* Balance leaf node in case of delete or cut: insert_size[0] < 0 + * + * lnum, rnum can have values >= -1 + * -1 means that the neighbor must be joined with S + * 0 means that nothing should be done with the neighbor + * >0 means to shift entirely or partly the specified number of items to the neighbor + */ +static int balance_leaf_when_delete(struct tree_balance *tb, int flag) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + int n; + struct item_head *ih; + + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, + "vs- 12000: level: wrong FR %z", tb->FR[0]); + RFALSE(tb->blknum[0] > 1, + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), + "PAP-12010: tree can not be empty"); + + ih = B_N_PITEM_HEAD(tbS0, item_pos); + buffer_info_init_tbS0(tb, &bi); + + /* Delete or truncate the item */ + + switch (flag) { + case M_DELETE: /* delete item in S[0] */ + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if (B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, + 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, + 0), 0); + } + } + + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); + + break; + + case M_CUT:{ /* cut item in S[0] */ + if (is_direntry_le_ih(ih)) { + + /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ + /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) { + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + } + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic length of item"); + } + break; + } + + default: + print_cur_tb("12040"); + reiserfs_panic(tb->tb_sb, "PAP-12040", + "unexpected mode: %s(%d)", + (flag == + M_PASTE) ? "PASTE" : ((flag == + M_INSERT) ? 
"INSERT" : + "UNKNOWN"), flag); + } + + /* the rule is that no shifting occurs unless by shifting a node can be freed */ + n = B_NR_ITEMS(tbS0); + if (tb->lnum[0]) { /* L[0] takes part in balancing */ + if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */ + if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */ + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* all contents of all the 3 buffers will be in L[0] */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 + && 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], + tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, + -1, NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), + -1, NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, + tb->R[0]); + + return 0; + } + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], + tb->R[0], 0); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); + + return -1; + } + + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + + if (tb->rnum[0] == -1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); + return 0; +} + +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */ + const char *body, /* body of inserted item or bytes to paste */ + int flag, /* i - insert, d - delete, c - cut, p - paste + (see comment to do_balance) */ + struct item_head *insert_key, /* in our processing of one level we sometimes determine what + must be inserted into the next higher level. 
This insertion + consists of a key or two keys and their corresponding + pointers */ + struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */ + ) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] + of the affected item */ + struct buffer_info bi; + struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ + int snum[2]; /* number of items that will be placed + into S_new (includes partially shifted + items) */ + int sbytes[2]; /* if an item is partially shifted into S_new then + if it is a directory item + it is the number of entries from the item that are shifted into S_new + else + it is the number of bytes from the item that are shifted into S_new + */ + int n, i; + int ret_val; + int pos_in_item; + int zeros_num; + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + zeros_num = 0; + if (flag == M_INSERT && !body) + zeros_num = ih_item_len(ih); + + pos_in_item = tb->tb_path->pos_in_item; + /* for indirect item pos_in_item is measured in unformatted node + pointers. Recalculate to bytes */ + if (flag != M_INSERT + && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) + pos_in_item *= UNFM_P_SIZE; + + if (tb->lnum[0] > 0) { + /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ + if (item_pos < tb->lnum[0]) { + /* new item or it part falls to L[0], shift it too */ + n = B_NR_ITEMS(tb->L[0]); + + switch (flag) { + case M_INSERT: /* insert item into L[0] */ + + if (item_pos == tb->lnum[0] - 1 + && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len; + int version; + + ret_val = + leaf_shift_left(tb, tb->lnum[0] - 1, + -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = + ih_item_len(ih) - tb->lbytes; + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, + ih_item_len(ih) - + new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", + ih_item_len(ih)); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, + n + item_pos - + ret_val, ih, body, + zeros_num > + ih_item_len(ih) ? + ih_item_len(ih) : + zeros_num); + + version = ih_version(ih); + + /* Calculate key component, item length and body to insert into S[0] */ + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (tb-> + lbytes << + (is_indirect_le_ih + (ih) ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : + 0))); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > zeros_num) { + body += + (tb->lbytes - zeros_num); + zeros_num = 0; + } else + zeros_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", + ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret_val = + leaf_shift_left(tb, tb->lnum[0] - 1, + tb->lbytes); + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, + n + item_pos - + ret_val, ih, body, + zeros_num); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + + case M_PASTE: /* append item in L[0] */ + + if (item_pos == tb->lnum[0] - 1 + && tb->lbytes != -1) { + /* we must shift the part of the appended item */ + if (is_direntry_le_ih + (B_N_PITEM_HEAD(tbS0, item_pos))) { + + RFALSE(zeros_num, + "PAP-12090: invalid parameter in case of a directory"); + /* directory item */ + if (tb->lbytes > pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head + *pasted; + int l_pos_in_item = + pos_in_item; + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ + ret_val = + leaf_shift_left(tb, + tb-> + lnum + [0], + tb-> + lbytes + - + 1); + if (ret_val + && !item_pos) { + pasted = + B_N_PITEM_HEAD + (tb->L[0], + B_NR_ITEMS + (tb-> + L[0]) - + 1); + l_pos_in_item += + I_ENTRY_COUNT + (pasted) - + (tb-> + lbytes - + 1); + } + + /* Append given directory entry to directory item */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer + (&bi, + n + item_pos - + ret_val, + l_pos_in_item, + tb->insert_size[0], + body, zeros_num); + + /* previous string prepared space for pasting new entry, following string pastes this entry */ + + /* when we have merge directory item, pos_in_item has been changed too */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(&bi, + n + + item_pos + - + ret_val, + l_pos_in_item, + 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ + leaf_shift_left(tb, + tb-> + lnum[0], + tb-> + lbytes); + } + /* Calculate new position to append in item body */ + pos_in_item -= tb->lbytes; + } else { + /* regular object */ + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", + tb->lbytes); + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + pos_in_item); + + if (tb->lbytes >= pos_in_item) { + /* appended item will be in L[0] in whole */ + int l_n; + + /* this bytes number must be appended to the last item of L[h] */ + l_n = + tb->lbytes - + pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= + l_n; + + RFALSE(tb-> + insert_size[0] <= + 0, + "PAP-12105: there is nothing to paste into L[0]. 
insert_size=%d", + tb-> + insert_size[0]); + ret_val = + leaf_shift_left(tb, + tb-> + lnum + [0], + ih_item_len + (B_N_PITEM_HEAD + (tbS0, + item_pos))); + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer + (&bi, + n + item_pos - + ret_val, + ih_item_len + (B_N_PITEM_HEAD + (tb->L[0], + n + item_pos - + ret_val)), l_n, + body, + zeros_num > + l_n ? l_n : + zeros_num); + /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */ + { + int version; + int temp_l = + l_n; + + RFALSE + (ih_item_len + (B_N_PITEM_HEAD + (tbS0, + 0)), + "PAP-12106: item length must be 0"); + RFALSE + (comp_short_le_keys + (B_N_PKEY + (tbS0, 0), + B_N_PKEY + (tb->L[0], + n + + item_pos + - + ret_val)), + "PAP-12107: items must be of the same file"); + if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) { + temp_l = + l_n + << + (tb-> + tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT); + } + /* update key of first item in S0 */ + version = + ih_version + (B_N_PITEM_HEAD + (tbS0, 0)); + set_le_key_k_offset + (version, + B_N_PKEY + (tbS0, 0), + le_key_k_offset + (version, + B_N_PKEY + (tbS0, + 0)) + + temp_l); + /* update left delimiting key */ + set_le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb-> + CFL[0], + tb-> + lkey[0]), + le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb-> + CFL[0], + tb-> + lkey[0])) + + temp_l); + } + + /* Calculate new body, position in item and insert_size[0] */ + if (l_n > zeros_num) { + body += + (l_n - + zeros_num); + zeros_num = 0; + } else + zeros_num -= + l_n; + pos_in_item = 0; + + RFALSE + (comp_short_le_keys + (B_N_PKEY(tbS0, 0), + B_N_PKEY(tb->L[0], + B_NR_ITEMS + (tb-> + L[0]) - + 1)) + || + !op_is_left_mergeable + (B_N_PKEY(tbS0, 0), + tbS0->b_size) + || + !op_is_left_mergeable + (B_N_PDELIM_KEY + (tb->CFL[0], + tb->lkey[0]), + tbS0->b_size), + "PAP-12120: item must be merge-able with left neighboring item"); + } else { /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + pos_in_item -= + tb->lbytes; + + RFALSE(pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + pos_in_item); + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ + leaf_shift_left(tb, + tb-> + lnum[0], + tb-> + lbytes); + } + } + } else { /* appended item will be in L[0] in whole */ + + struct item_head *pasted; + + if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */ + /* then increment pos_in_item by the size of the last item in L[0] */ + pasted = + B_N_PITEM_HEAD(tb->L[0], + n - 1); + if (is_direntry_le_ih(pasted)) + pos_in_item += + ih_entry_count + (pasted); + else + pos_in_item += + ih_item_len(pasted); + } + + /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ + ret_val = + leaf_shift_left(tb, tb->lnum[0], + tb->lbytes); + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, + n + item_pos - + ret_val, + pos_in_item, + tb->insert_size[0], + body, zeros_num); + + /* if appended item is directory, paste entry */ + pasted = + B_N_PITEM_HEAD(tb->L[0], + n + item_pos - + ret_val); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, + n + + item_pos - + ret_val, + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + /* if appended item is indirect item, put unformatted node into un list */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, "PAP-12130", + "lnum > 0: unexpected mode: " + " %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) + ? "CUT" + : + "UNKNOWN"), + flag); + } + } else { + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + } + + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); + + if (tb->rnum[0] > 0) { + /* shift rnum[0] items from S[0] to the right neighbor R[0] */ + n = B_NR_ITEMS(tbS0); + switch (flag) { + + case M_INSERT: /* insert item */ + if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */ + if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */ + loff_t old_key_comp, old_len, + r_zeros_number; + const char *r_body; + int version; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, + -1); + + version = ih_version(ih); + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into R[0] */ + offset = + le_ih_k_offset(ih) + + ((old_len - + tb-> + rbytes) << (is_indirect_le_ih(ih) + ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : 0)); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + /* Insert part of the item into R[0] */ + buffer_info_init_right(tb, &bi); + if ((old_len - tb->rbytes) > zeros_num) { + r_zeros_number = 0; + r_body = + body + (old_len - + tb->rbytes) - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - (old_len - + tb->rbytes); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, + r_zeros_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], + tb->R[0], 0); + + /* Calculate key component and item length to insert into S[0] */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, + old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0] */ + ret_val = + leaf_shift_right(tb, + tb->rnum[0] - 1, + tb->rbytes); + /* Insert new item into R[0] */ + buffer_info_init_right(tb, &bi); + leaf_insert_into_buf(&bi, + item_pos - n + + tb->rnum[0] - 1, + ih, body, + zeros_num); + + if (item_pos - n + tb->rnum[0] - 1 == 0) { + replace_key(tb, tb->CFR[0], + tb->rkey[0], + tb->R[0], 0); + + } + zeros_num = tb->insert_size[0] = 0; + } + } else { /* new item or part of it doesn't fall into R[0] */ + + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } + break; + + case M_PASTE: /* append item */ + + if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */ + if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */ + if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */ + int entry_count; + + RFALSE(zeros_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = + I_ENTRY_COUNT(B_N_PITEM_HEAD + (tbS0, + item_pos)); + if (entry_count - tb->rbytes < + pos_in_item) + /* new directory entry falls into R[0] */ + { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= + entry_count + || !tb-> + insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", + tb->rbytes, + entry_count); + /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ + leaf_shift_right(tb, + tb-> + rnum + [0], + tb-> + rbytes + - 1); + /* Paste given directory entry to directory item */ + paste_entry_position = + pos_in_item - + entry_count + + tb->rbytes - 1; + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer + (&bi, 0, + paste_entry_position, + tb->insert_size[0], + body, zeros_num); + /* paste entry */ + leaf_paste_entries(&bi, + 0, + paste_entry_position, + 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + + if (paste_entry_position + == 0) { + /* change delimiting keys */ + replace_key(tb, + tb-> + CFR + [0], + tb-> + rkey + [0], + tb-> + R + [0], + 0); + } + + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into R[0] */ + + leaf_shift_right(tb, + tb-> + rnum + [0], + tb-> + rbytes); + } + } else { /* regular object */ + + int n_shift, n_rem, + r_zeros_number; + const char *r_body; + + /* Calculate number of bytes which must be shifted from appended item */ + if ((n_shift = + tb->rbytes - + tb->insert_size[0]) < 0) + n_shift = 0; + + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + "PAP-12155: invalid position to paste. 
ih_item_len=%d, pos_in_item=%d", + pos_in_item, + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos))); + + leaf_shift_right(tb, + tb->rnum[0], + n_shift); + /* Calculate number of bytes which must remain in body after appending to R[0] */ + if ((n_rem = + tb->insert_size[0] - + tb->rbytes) < 0) + n_rem = 0; + + { + int version; + unsigned long temp_rem = + n_rem; + + version = + ih_version + (B_N_PITEM_HEAD + (tb->R[0], 0)); + if (is_indirect_le_key + (version, + B_N_PKEY(tb->R[0], + 0))) { + temp_rem = + n_rem << + (tb->tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT); + } + set_le_key_k_offset + (version, + B_N_PKEY(tb->R[0], + 0), + le_key_k_offset + (version, + B_N_PKEY(tb->R[0], + 0)) + + temp_rem); + set_le_key_k_offset + (version, + B_N_PDELIM_KEY(tb-> + CFR + [0], + tb-> + rkey + [0]), + le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb->CFR[0], + tb->rkey[0])) + + temp_rem); + } +/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; + k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ + do_balance_mark_internal_dirty + (tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + buffer_info_init_right(tb, &bi); + if (n_rem > zeros_num) { + r_zeros_number = 0; + r_body = + body + n_rem - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - n_rem; + zeros_num -= + r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, + n_shift, + tb-> + insert_size + [0] - + n_rem, + r_body, + r_zeros_number); + + if (is_indirect_le_ih + (B_N_PITEM_HEAD + (tb->R[0], 0))) { +#if 0 + RFALSE(n_rem, + "PAP-12160: paste more than one unformatted node pointer"); +#endif + set_ih_free_space + (B_N_PITEM_HEAD + (tb->R[0], 0), 0); + } + tb->insert_size[0] = n_rem; + if (!n_rem) + pos_in_item++; + } + } else { /* pasted item in whole falls into R[0] */ + + struct item_head *pasted; + + ret_val = + leaf_shift_right(tb, tb->rnum[0], + tb->rbytes); + /* append item in R[0] */ + if (pos_in_item >= 0) { + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, + item_pos - + n + + tb-> + rnum[0], + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + } + + /* paste new entry, if item is directory item */ + pasted = + B_N_PITEM_HEAD(tb->R[0], + item_pos - n + + tb->rnum[0]); + if (is_direntry_le_ih(pasted) + && pos_in_item >= 0) { + leaf_paste_entries(&bi, + item_pos - + n + + tb->rnum[0], + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + if (!pos_in_item) { + + RFALSE(item_pos - n + + tb->rnum[0], + "PAP-12165: directory item must be first item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, + tb->CFR[0], + tb->rkey[0], + tb->R[0], + 0); + } + } + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } else { /* new item doesn't fall into R[0] */ + + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, "PAP-12175", + "rnum > 0: unexpected mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) ? "CUT" + : "UNKNOWN"), + flag); + } + + } + + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", + tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. 
It must be >= 0", + tb->blknum[0]); + + /* if while adding to a node we discover that it is possible to split + it in two, and merge the left part into the left neighbor and the + right part into the right neighbor, eliminating the node */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* if insertion was done before 0-th position in R[0], right + delimiting key of the tb->L[0]'s and left delimiting key are + not set correctly */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); + copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), + B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); + } + + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + /* Fill new nodes that appear in place of S[0] */ + + /* I am told that this copying is because we need an array to enable + the looping code. -Hans */ + snum[0] = tb->s1num, snum[1] = tb->s2num; + sbytes[0] = tb->s1bytes; + sbytes[1] = tb->s2bytes; + for (i = tb->blknum[0] - 2; i >= 0; i--) { + + RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, + snum[i]); + + /* here we shift from S to S_new nodes */ + + S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL); + + n = B_NR_ITEMS(tbS0); + + switch (flag) { + case M_INSERT: /* insert item */ + + if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ + if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ + int old_key_comp, old_len, + r_zeros_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i] - 1, -1, + S_new[i]); + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into S_new[i] */ + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - + sbytes[i]) << + (is_indirect_le_ih + (ih) ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : + 0))); + + put_ih_item_len(ih, sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + buffer_info_init_bh(tb, &bi, S_new[i]); + + if ((old_len - sbytes[i]) > zeros_num) { + r_zeros_number = 0; + r_body = + body + (old_len - + sbytes[i]) - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - (old_len - + sbytes[i]); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, + r_zeros_number); + + /* Calculate key component and item length to insert into S[i] */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, + old_len - sbytes[i]); + tb->insert_size[0] -= sbytes[i]; + } else { /* whole new item falls into S_new[i] */ + + /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i] - 1, sbytes[i], + S_new[i]); + + /* Insert new item into S_new[i] */ + buffer_info_init_bh(tb, &bi, S_new[i]); + leaf_insert_into_buf(&bi, + item_pos - n + + snum[i] - 1, ih, + body, zeros_num); + + zeros_num = tb->insert_size[0] = 0; + } + } + + else { /* new item or it part don't falls into S_new[i] */ + + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i], sbytes[i], S_new[i]); + } + break; + + case M_PASTE: /* append item */ + + if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to S_new[i] */ + if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */ + struct item_head *aux_ih; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); + if (is_direntry_le_ih(aux_ih)) { + /* we append to directory item */ + + int entry_count; + + entry_count = + ih_entry_count(aux_ih); + + if (entry_count - sbytes[i] < + pos_in_item + && pos_in_item <= + entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb-> + insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(sbytes[i] - 1 >= + entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + sbytes[i] - 1, + entry_count); + + /* Shift snum[i]-1 items in whole. 
Shift sbytes[i] directory entries from directory item number snum[i] */ + leaf_move_items + (LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i] - 1, + S_new[i]); + /* Paste given directory entry to directory item */ + buffer_info_init_bh(tb, &bi, S_new[i]); + leaf_paste_in_buffer + (&bi, 0, + pos_in_item - + entry_count + + sbytes[i] - 1, + tb->insert_size[0], + body, zeros_num); + /* paste new directory entry */ + leaf_paste_entries(&bi, + 0, + pos_in_item + - + entry_count + + + sbytes + [i] - + 1, 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items + (LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i], + S_new[i]); + } + } else { /* regular object */ + + int n_shift, n_rem, + r_zeros_number; + const char *r_body; + + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)) + || tb->insert_size[0] <= + 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* Calculate number of bytes which must be shifted from appended item */ + n_shift = + sbytes[i] - + tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items + (LEAF_FROM_S_TO_SNEW, tb, + snum[i], n_shift, + S_new[i]); + + /* Calculate number of bytes which must remain in body after append to S_new[i] */ + n_rem = + tb->insert_size[0] - + sbytes[i]; + if (n_rem < 0) + n_rem = 0; + /* Append part of body into S_new[0] */ + buffer_info_init_bh(tb, &bi, S_new[i]); + if (n_rem > zeros_num) { + r_zeros_number = 0; + r_body = + body + n_rem - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - n_rem; + zeros_num -= + r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, + n_shift, + tb-> + insert_size + [0] - + n_rem, + r_body, + r_zeros_number); + { + struct item_head *tmp; + + tmp = + B_N_PITEM_HEAD(S_new + [i], + 0); + if (is_indirect_le_ih + (tmp)) { + set_ih_free_space + (tmp, 0); + set_le_ih_k_offset + (tmp, + le_ih_k_offset + (tmp) + + (n_rem << + (tb-> + tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT))); + } else { + set_le_ih_k_offset + (tmp, + le_ih_k_offset + (tmp) + + n_rem); + } + } + + tb->insert_size[0] = n_rem; + if (!n_rem) + pos_in_item++; + } + } else + /* item falls wholly into S_new[i] */ + { + int leaf_mi; + struct item_head *pasted; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih_check = + B_N_PITEM_HEAD(tbS0, item_pos); + + if (!is_direntry_le_ih(ih_check) + && (pos_in_item != ih_item_len(ih_check) + || tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235", + "pos_in_item " + "must be equal " + "to ih_item_len"); +#endif /* CONFIG_REISERFS_CHECK */ + + leaf_mi = + leaf_move_items(LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i], + S_new[i]); + + RFALSE(leaf_mi, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + leaf_mi); + + /* paste into item */ + buffer_info_init_bh(tb, &bi, S_new[i]); + leaf_paste_in_buffer(&bi, + item_pos - n + + snum[i], + pos_in_item, + tb->insert_size[0], + body, zeros_num); + + pasted = + B_N_PITEM_HEAD(S_new[i], + item_pos - n + + snum[i]); + if (is_direntry_le_ih(pasted)) { + leaf_paste_entries(&bi, + item_pos - + n + snum[i], + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + } + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } + + else { /* pasted item 
doesn't fall into S_new[i] */ + + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i], sbytes[i], S_new[i]); + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, "PAP-12245", + "blknum > 2: unexpected mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) ? "CUT" + : "UNKNOWN"), + flag); + } + + memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); + insert_ptr[i] = S_new[i]; + + RFALSE(!buffer_journaled(S_new[i]) + || buffer_journal_dirty(S_new[i]) + || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)", + i, S_new[i]); + } + + /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the + affected item which remains in S */ + if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */ + + switch (flag) { + case M_INSERT: /* insert item into S[0] */ + buffer_info_init_tbS0(tb, &bi); + leaf_insert_into_buf(&bi, item_pos, ih, body, + zeros_num); + + /* If we insert the first key change the delimiting key */ + if (item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + + } + break; + + case M_PASTE:{ /* append item in S[0] */ + struct item_head *pasted; + + pasted = B_N_PITEM_HEAD(tbS0, item_pos); + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + if (pos_in_item >= 0 && + pos_in_item <= + ih_entry_count(pasted)) { + + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, + item_pos, + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + + /* paste entry */ + leaf_paste_entries(&bi, + item_pos, + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + if (!item_pos && !pos_in_item) { + RFALSE(!tb->CFL[0] + || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) { + replace_key(tb, + tb-> + CFL + [0], + tb-> + lkey + [0], + tbS0, + 0); + + } + } + tb->insert_size[0] = 0; + } + } else { /* regular object */ + if (pos_in_item == ih_item_len(pasted)) { + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, + item_pos, + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + + if (is_indirect_le_ih(pasted)) { +#if 0 + RFALSE(tb-> + insert_size[0] != + UNFM_P_SIZE, + "PAP-12280: insert_size for indirect item must be %d, not %d", + UNFM_P_SIZE, + tb-> + insert_size[0]); +#endif + set_ih_free_space + (pasted, 0); + } + tb->insert_size[0] = 0; + } +#ifdef CONFIG_REISERFS_CHECK + else { + if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb-> + tb_sb, + "PAP-12285", + "insert_size " + "must be 0 " + "(%d)", + tb->insert_size[0]); + } + } +#endif /* CONFIG_REISERFS_CHECK */ + + } + } /* case M_PASTE: */ + } + } +#ifdef CONFIG_REISERFS_CHECK + if (flag == M_PASTE && tb->insert_size[0]) { + print_cur_tb("12290"); + reiserfs_panic(tb->tb_sb, + "PAP-12290", "insert_size is still not 0 (%d)", + tb->insert_size[0]); + } +#endif /* CONFIG_REISERFS_CHECK */ + return 0; +} /* Leaf level of the tree is balanced (end of balance_leaf) */ + +/* Make empty node */ +void make_empty_node(struct buffer_info *bi) +{ + struct block_head *blkh; + + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); + + blkh = B_BLK_HEAD(bi->bi_bh); + 
set_blkh_nr_item(blkh, 0); + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh)); + + if (bi->bi_parent) + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ +} + +/* Get first empty buffer */ +struct buffer_head *get_FEB(struct tree_balance *tb) +{ + int i; + struct buffer_info bi; + + for (i = 0; i < MAX_FEB_SIZE; i++) + if (tb->FEB[i] != NULL) + break; + + if (i == MAX_FEB_SIZE) + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty"); + + buffer_info_init_bh(tb, &bi, tb->FEB[i]); + make_empty_node(&bi); + set_buffer_uptodate(tb->FEB[i]); + tb->used[i] = tb->FEB[i]; + tb->FEB[i] = NULL; + + return tb->used[i]; +} + +/* This is now used because reiserfs_free_block has to be able to +** schedule. +*/ +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) +{ + int i; + + if (buffer_dirty(bh)) + reiserfs_warning(tb->tb_sb, "reiserfs-12320", + "called with dirty buffer"); + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) + if (!tb->thrown[i]) { + tb->thrown[i] = bh; + get_bh(bh); /* free_thrown puts this */ + return; + } + reiserfs_warning(tb->tb_sb, "reiserfs-12321", + "too many thrown buffers"); +} + +static void free_thrown(struct tree_balance *tb) +{ + int i; + b_blocknr_t blocknr; + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) { + if (tb->thrown[i]) { + blocknr = tb->thrown[i]->b_blocknr; + if (buffer_dirty(tb->thrown[i])) + reiserfs_warning(tb->tb_sb, "reiserfs-12322", + "called with dirty buffer %d", + blocknr); + brelse(tb->thrown[i]); /* incremented in store_thrown */ + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + } +} + +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh) +{ + struct block_head *blkh; + blkh = B_BLK_HEAD(bh); + set_blkh_level(blkh, FREE_LEVEL); + set_blkh_nr_item(blkh, 0); + + clear_buffer_dirty(bh); + store_thrown(tb, bh); +} + +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, + struct buffer_head *src, int n_src) +{ + + RFALSE(dest == NULL || src == NULL, + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", + src, dest); + RFALSE(!B_IS_KEYS_LEVEL(dest), + "vs-12310: invalid level (%z) for destination buffer. 
dest must be leaf", + dest); + RFALSE(n_dest < 0 || n_src < 0, + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); + + if (B_IS_ITEMS_LEVEL(src)) + /* source buffer contains leaf node */ + memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src), + KEY_SIZE); + else + memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src), + KEY_SIZE); + + do_balance_mark_internal_dirty(tb, dest, 0); +} + +int get_left_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL, + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", + h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h)); + + if (Sh_position == 0) + return B_NR_ITEMS(tb->FL[h]); + else + return Sh_position - 1; +} + +int get_right_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL, + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]); + + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h))) + return 0; + else + return Sh_position + 1; +} + +#ifdef CONFIG_REISERFS_CHECK + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +static void check_internal_node(struct super_block *s, struct buffer_head *bh, + char *mes) +{ + struct disk_child *dc; + int i; + + RFALSE(!bh, "PAP-12336: bh == 0"); + + if (!bh || !B_IS_IN_TREE(bh)) + return; + + RFALSE(!buffer_dirty(bh) && + !(buffer_journaled(bh) || buffer_journal_dirty(bh)), + "PAP-12337: buffer (%b) must be dirty", bh); + dc = B_N_CHILD(bh, 0); + + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { + if (!is_reusable(s, dc_block_number(dc), 1)) { + print_cur_tb(mes); + reiserfs_panic(s, "PAP-12338", + "invalid child pointer %y in %b", + dc, bh); + } + } +} + +static int locked_or_not_in_tree(struct tree_balance *tb, + struct buffer_head *bh, char *which) +{ + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || + !B_IS_IN_TREE(bh)) { + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh); + return 1; + } + return 0; +} + +static int check_before_balancing(struct tree_balance *tb) +{ + int retval = 0; + + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " + "occurred based on cur_tb not being null at " + "this point in code. do_balance cannot properly " + "handle concurrent tree accesses on a same " + "mount point."); + } + + /* double check that buffers that we will modify are unlocked. (fix_nodes should already have + prepped all of these for us). 
*/ + if (tb->lnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]"); + check_leaf(tb->L[0]); + } + if (tb->rnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]"); + check_leaf(tb->R[0]); + } + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path), + "S[0]"); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); + + return retval; +} + +static void check_after_balance_leaf(struct tree_balance *tb) +{ + if (tb->lnum[0]) { + if (B_FREE_SPACE(tb->L[0]) != + MAX_CHILD_SIZE(tb->L[0]) - + dc_size(B_N_CHILD + (tb->FL[0], get_left_neighbor_position(tb, 0)))) { + print_cur_tb("12221"); + reiserfs_panic(tb->tb_sb, "PAP-12355", + "shift to left was incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE(tb->R[0]) != + MAX_CHILD_SIZE(tb->R[0]) - + dc_size(B_N_CHILD + (tb->FR[0], get_right_neighbor_position(tb, 0)))) { + print_cur_tb("12222"); + reiserfs_panic(tb->tb_sb, "PAP-12360", + "shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path, 1) && + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) != + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1)))))) { + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)); + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, + 1)))); + print_cur_tb("12223"); + reiserfs_warning(tb->tb_sb, "reiserfs-12363", + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", + left, + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)), + PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1), + dc_size(B_N_CHILD + (PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1))), + right); + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect"); + } +} + +static void check_leaf_level(struct tree_balance *tb) +{ + check_leaf(tb->L[0]); + check_leaf(tb->R[0]); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); +} + +static void check_internal_levels(struct tree_balance *tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h++) { + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h), + "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node(tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node(tb->tb_sb, tb->R[h], "BAD R"); + } + +} + +#endif + +/* Now we have all of the buffers that must be used in balancing of + the tree. We rely on the assumption that schedule() will not occur + while do_balance works. ( Only interrupt handlers are acceptable.) + We balance the tree according to the analysis made before this, + using buffers already obtained. For SMP support it will someday be + necessary to add ordered locking of tb. */ + +/* Some interesting rules of balancing: + + we delete a maximum of two nodes per level per balancing: we never + delete R, when we delete two of three nodes L, S, R then we move + them into R. 
+ + we only delete L if we are deleting two nodes, if we delete only + one node we delete S + + if we shift leaves then we shift as much as we can: this is a + deliberate policy of extremism in node packing which results in + higher average utilization after repeated random balance operations + at the cost of more memory copies and more balancing as a result of + small insertions to full nodes. + + if we shift internal nodes we try to evenly balance the node + utilization, with consequent less balancing at the cost of lower + utilization. + + one could argue that the policy for directories in leaves should be + that of internal nodes, but we will wait until another day to + evaluate this.... It would be nice to someday measure and prove + these assumptions as to what is optimal.... + +*/ + +static inline void do_balance_starts(struct tree_balance *tb) +{ + /* use print_cur_tb() to see initial state of struct + tree_balance */ + + /* store_print_tb (tb); */ + + /* do not delete, just comment it out */ +/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, + "check");*/ + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); +#ifdef CONFIG_REISERFS_CHECK + REISERFS_SB(tb->tb_sb)->cur_tb = tb; +#endif +} + +static inline void do_balance_completed(struct tree_balance *tb) +{ + +#ifdef CONFIG_REISERFS_CHECK + check_leaf_level(tb); + check_internal_levels(tb); + REISERFS_SB(tb->tb_sb)->cur_tb = NULL; +#endif + + /* reiserfs_free_block is no longer schedule safe. So, we need to + ** put the buffers we want freed on the thrown list during do_balance, + ** and then free them now + */ + + REISERFS_SB(tb->tb_sb)->s_do_balance++; + + /* release all nodes hold to perform the balancing */ + unfix_nodes(tb); + + free_thrown(tb); +} + +void do_balance(struct tree_balance *tb, /* tree_balance structure */ + struct item_head *ih, /* item header of inserted item */ + const char *body, /* body of inserted item or bytes to paste */ + int flag) +{ /* i - insert, d - delete + c - cut, p - paste + + Cut means delete part of an item + (includes removing an entry from a + directory). + + Delete means delete whole item. + + Insert means add a new item into the + tree. + + Paste means to append to the end of an + existing file or to insert a directory + entry. */ + int child_pos, /* position of a child node in its parent */ + h; /* level of the tree being processed */ + struct item_head insert_key[2]; /* in our processing of one level + we sometimes determine what + must be inserted into the next + higher level. This insertion + consists of a key or two keys + and their corresponding + pointers */ + struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next + level */ + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has " + "changed"); + } + /* if we have no real work to do */ + if (!tb->insert_size[0]) { + reiserfs_warning(tb->tb_sb, "PAP-12350", + "insert_size == 0, mode == %c", flag); + unfix_nodes(tb); + return; + } + + atomic_inc(&(fs_generation(tb->tb_sb))); + do_balance_starts(tb); + + /* balance leaf returns 0 except if combining L R and S into + one node. see balance_internal() for explanation of this + line of code. */ + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); + +#ifdef CONFIG_REISERFS_CHECK + check_after_balance_leaf(tb); +#endif + + /* Balance internal level of the tree. 
*/ + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) + child_pos = + balance_internal(tb, h, child_pos, insert_key, insert_ptr); + + do_balance_completed(tb); + +} diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c new file mode 100644 index 00000000..91f080cc --- /dev/null +++ b/fs/reiserfs/file.c @@ -0,0 +1,315 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* +** We pack the tails of files on file close, not at the time they are written. +** This implies an unnecessary copy of the tail and an unnecessary indirect item +** insertion/balancing, for files that are written in one write. +** It avoids unnecessary tail packings (balances) for files that are written in +** multiple writes and are small enough to have tails. +** +** file_release is called by the VFS layer when the file is closed. If +** this is the last open file descriptor, and the file +** small enough to have a tail, and the tail is currently in an +** unformatted node, the tail is converted back into a direct item. +** +** We use reiserfs_truncate_file to pack the tail, since it already has +** all the conditions coded. +*/ +static int reiserfs_file_release(struct inode *inode, struct file *filp) +{ + + struct reiserfs_transaction_handle th; + int err; + int jbegin_failure = 0; + + BUG_ON(!S_ISREG(inode->i_mode)); + + if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) + return 0; + + mutex_lock(&(REISERFS_I(inode)->tailpack)); + + if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + return 0; + } + + /* fast out for when nothing needs to be done */ + if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || + !tail_has_to_be_packed(inode)) && + REISERFS_I(inode)->i_prealloc_count <= 0) { + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + return 0; + } + + reiserfs_write_lock(inode->i_sb); + /* freeing preallocation only involves relogging blocks that + * are already in the current transaction. preallocation gets + * freed at the end of each transaction, so it is impossible for + * us to log any additional blocks (including quota blocks) + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + /* uh oh, we can't allow the inode to go away while there + * is still preallocation blocks pending. Try to join the + * aborted transaction + */ + jbegin_failure = err; + err = journal_join_abort(&th, inode->i_sb, 1); + + if (err) { + /* hmpf, our choices here aren't good. We can pin the inode + * which will disallow unmount from every happening, we can + * do nothing, which will corrupt random memory on unmount, + * or we can forcibly remove the file from the preallocation + * list, which will leak blocks on disk. Lets pin the inode + * and let the admin know what is going on. 
+ */ + igrab(inode); + reiserfs_warning(inode->i_sb, "clm-9001", + "pinning inode %lu because the " + "preallocation can't be freed", + inode->i_ino); + goto out; + } + } + reiserfs_update_inode_transaction(inode); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc(&th, inode); +#endif + err = journal_end(&th, inode->i_sb, 1); + + /* copy back the error code from journal_begin */ + if (!err) + err = jbegin_failure; + + if (!err && + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && + tail_has_to_be_packed(inode)) { + + /* if regular file is released by last holder and it has been + appended (we append by unformatted node only) or its direct + item(s) had to be converted, then it may have to be + indirect2direct converted */ + err = reiserfs_truncate_file(inode, 0); + } + out: + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + return err; +} + +static int reiserfs_file_open(struct inode *inode, struct file *file) +{ + int err = dquot_file_open(inode, file); + if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { + /* somebody might be tailpacking on final close; wait for it */ + mutex_lock(&(REISERFS_I(inode)->tailpack)); + atomic_inc(&REISERFS_I(inode)->openers); + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + } + return err; +} + +static void reiserfs_vfs_truncate_file(struct inode *inode) +{ + mutex_lock(&(REISERFS_I(inode)->tailpack)); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&(REISERFS_I(inode)->tailpack)); +} + +/* Sync a reiserfs file. */ + +/* + * FIXME: sync_mapping_buffers() never has anything to sync. Can + * be removed... + */ + +static int reiserfs_sync_file(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + int barrier_done; + + BUG_ON(!S_ISREG(inode->i_mode)); + err = sync_mapping_buffers(inode->i_mapping); + reiserfs_write_lock(inode->i_sb); + barrier_done = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (barrier_done < 0) + return barrier_done; + return (err < 0) ? -EIO : 0; +} + +/* taken fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + int logit = reiserfs_file_data_log(inode); + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + struct reiserfs_transaction_handle th; + int ret = 0; + + th.t_trans_id = 0; + blocksize = 1 << inode->i_blkbits; + + if (logit) { + reiserfs_write_lock(s); + ret = journal_begin(&th, s, bh_per_page + 1); + if (ret) + goto drop_write_lock; + reiserfs_update_inode_transaction(inode); + } + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (logit) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + } else if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* do data=ordered on any page past the end + * of file and any buffer marked BH_New. 
+ */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + if (logit) { + ret = journal_end(&th, s, bh_per_page + 1); + drop_write_lock: + reiserfs_write_unlock(s); + } + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return ret; +} + +/* Write @count bytes at position @ppos in a file indicated by @file + from the buffer @buf. + + generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want + something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was + written for (ext2/3). This is for several reasons: + + * It has no understanding of any filesystem specific optimizations. + + * It enters the filesystem repeatedly for each page that is written. + + * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key + * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time + * to reiserfs which allows for fewer tree traversals. + + * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. + + * Asking the block allocation code for blocks one at a time is slightly less efficient. + + All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to + use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make + things right finally. + + Future Features: providing search_by_key with hints. + +*/ +static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */ + const char __user * buf, /* pointer to user supplied data + (in userspace) */ + size_t count, /* amount of bytes to write */ + loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to + * new current position before returning. */ + ) +{ + struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to. + /* To simplify coding at this time, we store + locked pages in array for now */ + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items + * lying around (most of the disk, in fact). Despite the filesystem + * now being a v3.6 format, the old items still can't support large + * file sizes. Catch this case here, as the rest of the VFS layer is + * oblivious to the different limitations between old and new items. + * reiserfs_setattr catches this for truncates. This chunk is lifted + * from generic_write_checks. 
*/ + if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && + *ppos + count > MAX_NON_LFS) { + if (*ppos >= MAX_NON_LFS) { + return -EFBIG; + } + if (count > MAX_NON_LFS - (unsigned long)*ppos) + count = MAX_NON_LFS - (unsigned long)*ppos; + } + + return do_sync_write(file, buf, count, ppos); +} + +const struct file_operations reiserfs_file_operations = { + .read = do_sync_read, + .write = reiserfs_file_write, + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = reiserfs_file_open, + .release = reiserfs_file_release, + .fsync = reiserfs_sync_file, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations reiserfs_file_inode_operations = { + .truncate = reiserfs_vfs_truncate_file, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c new file mode 100644 index 00000000..1e4250bc --- /dev/null +++ b/fs/reiserfs/fix_node.c @@ -0,0 +1,2593 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/** + ** old_item_num + ** old_entry_num + ** set_entry_sizes + ** create_virtual_node + ** check_left + ** check_right + ** directory_part_size + ** get_num_ver + ** set_parameters + ** is_leaf_removable + ** are_leaves_removable + ** get_empty_nodes + ** get_lfree + ** get_rfree + ** is_left_neighbor_in_cache + ** decrement_key + ** get_far_parent + ** get_parents + ** can_node_be_removed + ** ip_check_balance + ** dc_check_balance_internal + ** dc_check_balance_leaf + ** dc_check_balance + ** check_balance + ** get_direct_parent + ** get_neighbors + ** fix_nodes + ** + ** + **/ + +#include +#include +#include +#include +#include + +/* To make any changes in the tree we find a node, that contains item + to be changed/deleted or position in the node we insert a new item + to. We call this node S. To do balancing we need to decide what we + will shift to left/right neighbor, or to a new node, where new item + will be etc. To make this analysis simpler we build virtual + node. Virtual node is an array of items, that will replace items of + node S. (For instance if we are going to delete an item, virtual + node does not contain it). Virtual node keeps information about + item sizes and types, mergeability of first and last items, sizes + of all entries in directory item. We use this array of items when + calculating what we can shift to neighbors and how many nodes we + have to have if we do not any shiftings, if we shift to left/right + neighbor or to both. 
*/ + +/* taking item number in virtual node, returns number of item, that it has in source buffer */ +static inline int old_item_num(int new_num, int affected_item_num, int mode) +{ + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; + + if (mode == M_INSERT) { + + RFALSE(new_num == 0, + "vs-8005: for INSERT mode and item number of inserted item"); + + return new_num - 1; + } + + RFALSE(mode != M_DELETE, + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", + mode); + /* delete mode */ + return new_num + 1; +} + +static void create_virtual_node(struct tree_balance *tb, int h) +{ + struct item_head *ih; + struct virtual_node *vn = tb->tb_vn; + int new_num; + struct buffer_head *Sh; /* this comes from tb->S[h] */ + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + + /* size of changed node */ + vn->vn_size = + MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h]; + + /* for internal nodes array if virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; + } + + /* number of items in virtual node */ + vn->vn_nr_item = + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - + ((vn->vn_mode == M_DELETE) ? 1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); + + /* first item in the node */ + ih = B_N_PITEM_HEAD(Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ + if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size) + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { + int j; + struct virtual_item *vi = vn->vn_vi + new_num; + int is_affected = + ((new_num != vn->vn_affected_item_num) ? 
0 : 1); + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num(new_num, vn->vn_affected_item_num, + vn->vn_mode); + + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = B_I_PITEM(Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + // FIXME: there is no check, that item operation did not + // consume too much memory + vn->vn_free_ptr += + op_create_vi(vn, vi, is_affected, tb->insert_size[0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic(tb->tb_sb, "vs-8030", + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + } + } + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; + + RFALSE(vn->vn_ins_ih == NULL, + "vs-8040: item header of inserted item is not specified"); + vi->vi_item_len = tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi(vn, vi, 0 /*not pasted or cut */ , + tb->insert_size[0]); + } + + /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + if (tb->CFR[0]) { + struct reiserfs_key *key; + + key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]); + if (op_is_left_mergeable(key, Sh->b_size) + && (vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) + vn->vn_vi[vn->vn_nr_item - 1].vi_type |= + VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable(key, Sh->b_size) && + !(vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { + /* we delete last item and it could be merged with right neighbor's first item */ + if (! + (B_NR_ITEMS(Sh) == 1 + && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0)) + && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { + /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + print_block(Sh, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "vs-8045", + "rdkey %k, affected item==%d " + "(mode==%c) Must be %c", + key, vn->vn_affected_item_num, + vn->vn_mode, M_DELETE); + } + } +#endif + + } +} + +/* using virtual node check, how many items can be shifted to left + neighbor */ +static void check_left(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); + + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8055: parent does not exist or invalid"); + + vi = vn->vn_vi; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8055: invalid mode or balance condition failed"); + + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; + i++, ih_size = IH_SIZE, d_size = 0, vi++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0]++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* check whether L[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { + /* cannot shift even a part of the current item */ + tb->lbytes = -1; + return; + } + cur_free -= ih_size; + + tb->lbytes = op_check_left(vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0]++; + + break; + } + + return; +} + +/* using virtual node check, how many items can be shifted to right + neighbor */ +static void check_right(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8075: parent does not exist or invalid"); + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8080: invalid mode or balance condition failed"); + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; + i--, d_size = 0, ih_size = IH_SIZE, vi--) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0]++; + continue; + } + + /* check whether R[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { /* cannot shift even a part of the current item */ + tb->rbytes = -1; + return; + } + + /* R[0] can hold the header of the item and at least one byte of its body */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right(vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0]++; + + break; + } + + return; +} + +/* + * from - number of items, which are shifted to left neighbor entirely + * to - number of item, which are shifted to right neighbor entirely + * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ +static int get_num_ver(int mode, struct tree_balance *tb, int h, + int from, int from_bytes, + int to, int to_bytes, short *snum012, int flow) +{ + int i; + int cur_free; + // int bytes; + int units; + struct virtual_node *vn = tb->tb_vn; + // struct virtual_item * vi; + + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + int start_item, /* position of item we start filling node from */ + end_item, /* position of item we finish filling node by */ + start_bytes, /* number of first bytes (entries for directory) of start_item-th item + we do not include into node that is being filled */ + end_bytes; /* number of last bytes (entries for directory) of end_item-th item + we do node include into node that is being filled */ + int split_item_positions[2]; /* these are positions in virtual item of + items, that are split between S[0] and + S1new and S1new and S2new */ + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + + /* We only create additional nodes if we are in insert or paste mode + or we are in replace mode at the internal level. If h is 0 and + the mode is M_REPLACE then in fix_nodes we change the mode to + paste or insert before we get here in the code. */ + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), + "vs-8100: insert_size < 0 in overflow"); + + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); + + /* snum012 [0-2] - number of items, that lay + to S[0], first new node and second new node */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); + } + + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + // start from 'from'-th item + start_item = from; + // skip its first 'start_bytes' units + start_bytes = ((from_bytes != -1) ? 
from_bytes : 0); + + // last included item is the 'end_item'-th one + end_item = vn->vn_nr_item - to - 1; + // do not count last 'end_bytes' units of 'end_item'-th item + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* go through all item beginning from the start_item-th item and ending by + the end_item-th item. Do not count first 'start_bytes' units of + 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ + + for (i = start_item; i <= end_item; i++) { + struct virtual_item *vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? end_bytes : 0); + + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* do not take in calculation head part (from_bytes) of from-th item */ + current_item_size -= + op_part_size(vi, 0 /*from start */ , start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= + op_part_size(vi, 1 /*from end */ , skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1]++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + if (current_item_size > max_node_size) { + /* virtual item length is longer, than max size of item in + a node. It is impossible for direct item */ + RFALSE(is_direct_le_ih(vi->vi_ih), + "vs-8110: " + "direct item length is %d. It can not be longer than %d", + current_item_size, max_node_size); + /* we will try to split it */ + flow = 1; + } + + if (!flow) { + /* as we do not split items, take new node and continue */ + needed_nodes++; + i--; + total_node_size = 0; + continue; + } + // calculate number of item units which fit into node being + // filled + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = + op_check_left(vi, free_space, start_bytes, + skip_from_end); + if (units == -1) { + /* nothing fits into current node, take new node and continue */ + needed_nodes++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + //if (snum012[3] != -1 || needed_nodes != 1) + // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); + //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning(tb->tb_sb, "vs-8111", + "split_item_position is out of range"); + snum012[needed_nodes - 1]++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i--; + total_node_size = 0; + } + + // sum012[4] (if it is not -1) contains number of units of which + // are to be in S1new, snum012[3] - to be in S0. They are supposed + // to be S1bytes and S2bytes correspondingly, so recalculate + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = + ((split_item_positions[0] == + split_item_positions[1]) ? 
snum012[3] : 0); + + // s2bytes + snum012[4] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - + bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) + reiserfs_warning(tb->tb_sb, "vs-8115", + "not directory or indirect item"); + } + + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = + ((split_item_positions[0] == split_item_positions[1] + && snum012[4] != -1) ? snum012[4] : 0); + + // s1bytes + snum012[3] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - + bytes_to_r - bytes_to_l - bytes_to_S2new; + } + + return needed_nodes; +} + + +/* Set parameters for balancing. + * Performs write of results of analysis of balancing into structure tb, + * where it will later be used by the functions that actually do the balancing. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * lnum number of items from S[h] that must be shifted to L[h]; + * rnum number of items from S[h] that must be shifted to R[h]; + * blk_num number of blocks that S[h] will be splitted into; + * s012 number of items that fall into splitted nodes. + * lbytes number of bytes which flow to the left neighbor from the item that is not + * not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the item that is not + * not shifted entirely + * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) + */ + +static void set_parameters(struct tree_balance *tb, int h, int lnum, + int rnum, int blk_num, short *s012, int lb, int rb) +{ + + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; + + if (h == 0) { /* only for leaf level */ + if (s012 != NULL) { + tb->s0num = *s012++, + tb->s1num = *s012++, tb->s2num = *s012++; + tb->s1bytes = *s012++; + tb->s2bytes = *s012; + } + tb->lbytes = lb; + tb->rbytes = rb; + } + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); + + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); +} + +/* check, does node disappear if we shift tb->lnum[0] items to left + neighbor and tb->rnum[0] to the right one. */ +static int is_leaf_removable(struct tree_balance *tb) +{ + struct virtual_node *vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* number of items, that will be shifted to left (right) neighbor + entirely */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + if (remain_items < 1) { + /* all content of node can be shifted to neighbors */ + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, + NULL, -1, -1); + return 1; + } + + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + /* S[0] is not removable */ + return 0; + + /* check, whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num(&(vn->vn_vi[to_left])); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, + tb->lbytes, -1); + return 1; + } + + return 0; +} + +/* check whether L, S, R can be joined in one node */ +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) +{ + struct virtual_node *vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item - 1]. + vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head *ih; + + RFALSE(B_NR_ITEMS(S0) != 1, + "vs-8125: item number must be 1: it is %d", + B_NR_ITEMS(S0)); + + ih = B_N_PITEM_HEAD(S0, 0); + if (tb->CFR[0] + && !comp_short_le_keys(&(ih->ih_key), + B_N_PDELIM_KEY(tb->CFR[0], + tb->rkey[0]))) + if (is_direntry_le_ih(ih)) { + /* Directory must be in correct state here: that is + somewhere at the left side should exist first directory + item. But the item being deleted can not be that first + one because its right neighbor is item of the same + directory. (But first item always gets deleted in last + turn). 
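The is_leaf_removable() test above boils down to a few integer comparisons. A minimal standalone sketch of that decision, with plain integers instead of the tree_balance structure (names are illustrative, this is not the kernel function): lnum/rnum are the shift counts computed by check_left()/check_right(), lbytes/rbytes are -1 when no item has to be split, and boundary_units is op_unit_num() of the one item that would remain.

static int leaf_can_disappear(int nr_item, int lnum, int lbytes,
                              int rnum, int rbytes, int boundary_units)
{
        /* items that move to a neighbour in one piece */
        int to_left  = lnum - (lbytes != -1 ? 1 : 0);
        int to_right = rnum - (rbytes != -1 ? 1 : 0);
        int remain   = nr_item - to_left - to_right;

        if (remain < 1)
                return 1;       /* every item shifts out whole */
        if (remain > 1 || lbytes == -1 || rbytes == -1)
                return 0;       /* more than one item left, or no partial flow */
        /* exactly one item remains: it disappears only if the two partial
         * flows together cover all of its units */
        return lbytes + rbytes >= boundary_units;
}

For example, a leaf of 7 items with lnum = 4, lbytes = 120, rnum = 4, rbytes = 200 keeps 7 - 3 - 3 = 1 item, and that item vanishes only if it has at most 320 units.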
So, neighbors of deleted item can be merged, so + we can save ih_size */ + ih_size = IH_SIZE; + + /* we might check that left neighbor exists and is of the + same directory */ + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, + "vs-8130: first directory item can not be removed until directory is not empty"); + } + + } + + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); + PROC_INFO_INC(tb->tb_sb, leaves_removable); + return 1; + } + return 0; + +} + +/* when we do not split item, lnum and rnum are numbers of entire items */ +#define SET_PAR_SHIFT_LEFT \ +if (h)\ +{\ + int to_l;\ + \ + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ + (MAX_NR_KEY(Sh) + 1 - lpar);\ + \ + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (lset==LEFT_SHIFT_FLOW)\ + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ + tb->lbytes, -1);\ + else\ + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ + -1, -1);\ +} + +#define SET_PAR_SHIFT_RIGHT \ +if (h)\ +{\ + int to_r;\ + \ + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ + \ + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (rset==RIGHT_SHIFT_FLOW)\ + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ + -1, tb->rbytes);\ + else\ + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ + -1, -1);\ +} + +static void free_buffers_in_tb(struct tree_balance *tb) +{ + int i; + + pathrelse(tb->tb_path); + + for (i = 0; i < MAX_HEIGHT; i++) { + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } +} + +/* Get new buffers for storing new nodes that are created while balancing. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + * NO_DISK_SPACE - no disk space. + */ +/* The function is NOT SCHEDULE-SAFE! */ +static int get_empty_nodes(struct tree_balance *tb, int h) +{ + struct buffer_head *new_bh, + *Sh = PATH_H_PBUFFER(tb->tb_path, h); + b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; + int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */ + retval = CARRY_ON; + struct super_block *sb = tb->tb_sb; + + /* number_of_freeblk is the number of empty blocks which have been + acquired for use by the balancing algorithm minus the number of + empty blocks used in the previous levels of the analysis, + number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs + after empty blocks are acquired, and the balancing analysis is + then restarted, amount_needed is the number needed by this level + (h) of the balancing analysis. + + Note that for systems with many processes writing, it would be + more layout optimal to calculate the total number needed by all + levels and then to run reiserfs_new_blocks to get all of them at once. */ + + /* Initiate number_of_freeblk to the amount acquired prior to the restart of + the analysis or 0 if not restarted, then subtract the amount needed + by all of the levels of the tree below h. */ + /* blknum includes S[h], so we subtract 1 in this calculation */ + for (counter = 0, number_of_freeblk = tb->cur_blknum; + counter < h; counter++) + number_of_freeblk -= + (tb->blknum[counter]) ? 
(tb->blknum[counter] - + 1) : 0; + + /* Allocate missing empty blocks. */ + /* if Sh == 0 then we are getting a new root */ + amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; + /* Amount_needed = the amount that we need more than the amount that we have. */ + if (amount_needed > number_of_freeblk) + amount_needed -= number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + /* No need to check quota - is not allocated for blocks used for formatted nodes */ + if (reiserfs_new_form_blocknrs(tb, blocknrs, + amount_needed) == NO_DISK_SPACE) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for (blocknr = blocknrs, counter = 0; + counter < amount_needed; blocknr++, counter++) { + + RFALSE(!*blocknr, + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); + + new_bh = sb_getblk(sb, *blocknr); + RFALSE(buffer_dirty(new_bh) || + buffer_journaled(new_bh) || + buffer_journal_dirty(new_bh), + "PAP-8140: journaled or dirty buffer %b for the new block", + new_bh); + + /* Put empty buffers into the array. */ + RFALSE(tb->FEB[tb->cur_blknum], + "PAP-8141: busy slot for new buffer"); + + set_buffer_journal_new(new_bh); + tb->FEB[tb->cur_blknum++] = new_bh; + } + + if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) + retval = REPEAT_SEARCH; + + return retval; +} + +/* Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. */ +static int get_lfree(struct tree_balance *tb, int h) +{ + struct buffer_head *l, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (l = tb->FL[h]) == NULL) + return 0; + + if (f == l) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS(l); + f = l; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); +} + +/* Get free space of the right neighbor, + * which is stored in the parent node of the right neighbor. + */ +static int get_rfree(struct tree_balance *tb, int h) +{ + struct buffer_head *r, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (r = tb->FR[h]) == NULL) + return 0; + + if (f == r) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); + +} + +/* Check whether left neighbor is in memory. */ +static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) +{ + struct buffer_head *father, *left; + struct super_block *sb = tb->tb_sb; + b_blocknr_t left_neighbor_blocknr; + int left_neighbor_position; + + /* Father of the left neighbor does not exist. */ + if (!tb->FL[h]) + return 0; + + /* Calculate father of the node to be balanced. */ + father = PATH_H_PBUFFER(tb->tb_path, h + 1); + + RFALSE(!father || + !B_IS_IN_TREE(father) || + !B_IS_IN_TREE(tb->FL[h]) || + !buffer_uptodate(father) || + !buffer_uptodate(tb->FL[h]), + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", + father, tb->FL[h]); + + /* Get position of the pointer to the left neighbor into the left father. */ + left_neighbor_position = (father == tb->FL[h]) ? + tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); + /* Get left neighbor block number. */ + left_neighbor_blocknr = + B_N_CHILD_NUM(tb->FL[h], left_neighbor_position); + /* Look for the left neighbor in the cache. 
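Note that get_lfree() and get_rfree() above never touch the neighbour's own block: the disk_child entry in the neighbour's parent already records how many bytes of the child are in use, so the free space is the maximal child payload minus dc_size. A sketch on plain integers (block_head_size stands for BLKH_SIZE; the 24 bytes used in the example below is quoted as a typical value, not taken from this file):

static int neighbour_free_space(int block_size, int block_head_size,
                                int dc_size_of_child)
{
        /* MAX_CHILD_SIZE(): payload a node can hold = block minus header */
        int max_child_size = block_size - block_head_size;

        return max_child_size - dc_size_of_child;
}

With 4096-byte blocks and a 24-byte block head, a neighbour whose disk_child reports 3000 bytes in use has 4096 - 24 - 3000 = 1072 bytes free.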
*/ + if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) { + + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), + "vs-8170: left neighbor (%b %z) is not in the tree", + left, left); + put_bh(left); + return 1; + } + + return 0; +} + +#define LEFT_PARENTS 'l' +#define RIGHT_PARENTS 'r' + +static void decrement_key(struct cpu_key *key) +{ + // call item specific function for this key + item_ops[cpu_key_k_type(key)]->decrement_key(key); +} + +/* Calculate far left/right parent of the left/right neighbor of the current node, that + * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. + * Calculate left/right common parent of the current node and L[h]/R[h]. + * Calculate left/right delimiting key position. + * Returns: PATH_INCORRECT - path in the tree is not correct; + SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_far_parent(struct tree_balance *tb, + int h, + struct buffer_head **pfather, + struct buffer_head **pcom_father, char c_lr_par) +{ + struct buffer_head *parent; + INITIALIZE_PATH(s_path_to_neighbor_father); + struct treepath *path = tb->tb_path; + struct cpu_key s_lr_father_key; + int counter, + position = INT_MAX, + first_last_position = 0, + path_offset = PATH_H_PATH_OFFSET(path, h); + + /* Starting from F[h] go upwards in the tree, and look for the common + ancestor of F[h], and its neighbor l/r, that should be obtained. */ + + counter = path_offset; + + RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8180: invalid path length"); + + for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { + /* Check whether parent of the current buffer in the path is really parent in the tree. */ + if (!B_IS_IN_TREE + (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) + return REPEAT_SEARCH; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(path, + counter - 1)) > + B_NR_ITEMS(parent)) + return REPEAT_SEARCH; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) + return REPEAT_SEARCH; + /* Return delimiting key if position in the parent is not equal to first/last one. */ + if (c_lr_par == RIGHT_PARENTS) + first_last_position = B_NR_ITEMS(parent); + if (position != first_last_position) { + *pcom_father = parent; + get_bh(*pcom_father); + /*(*pcom_father = parent)->b_count++; */ + break; + } + } + + /* if we are in the root of the tree, then there is no common father */ + if (counter == FIRST_PATH_ELEMENT_OFFSET) { + /* Check whether first buffer in the path is the root of the tree. */ + if (PATH_OFFSET_PBUFFER + (tb->tb_path, + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK(tb->tb_sb)) { + *pfather = *pcom_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; + } + + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, + "PAP-8185: (%b %z) level too small", + *pcom_father, *pcom_father); + + /* Check whether the common parent is locked. */ + + if (buffer_locked(*pcom_father)) { + + /* Release the write lock while the buffer is busy */ + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(*pcom_father); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(*pcom_father); + return REPEAT_SEARCH; + } + } + + /* So, we got common parent of the current node and its left/right neighbor. + Now we are geting the parent of the left/right neighbor. 
*/ + + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key(&s_lr_father_key, + B_N_PDELIM_KEY(*pcom_father, + (c_lr_par == + LEFT_PARENTS) ? (tb->lkey[h - 1] = + position - + 1) : (tb->rkey[h - + 1] = + position))); + + if (c_lr_par == LEFT_PARENTS) + decrement_key(&s_lr_father_key); + + if (search_by_key + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, + h + 1) == IO_ERROR) + // path is released + return IO_ERROR; + + if (FILESYSTEM_CHANGED_TB(tb)) { + pathrelse(&s_path_to_neighbor_father); + brelse(*pcom_father); + return REPEAT_SEARCH; + } + + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + + RFALSE(B_LEVEL(*pfather) != h + 1, + "PAP-8190: (%b %z) level too small", *pfather, *pfather); + RFALSE(s_path_to_neighbor_father.path_length < + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); + + s_path_to_neighbor_father.path_length--; + pathrelse(&s_path_to_neighbor_father); + return CARRY_ON; +} + +/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of + * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset], + * FR[path_offset], CFL[path_offset], CFR[path_offset]. + * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_parents(struct tree_balance *tb, int h) +{ + struct treepath *path = tb->tb_path; + int position, + ret, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + struct buffer_head *curf, *curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + /* The root can not have parents. + Release nodes which previously were obtained as parents of the current node neighbors. */ + brelse(tb->FL[h]); + brelse(tb->CFL[h]); + brelse(tb->FR[h]); + brelse(tb->CFR[h]); + tb->FL[h] = NULL; + tb->CFL[h] = NULL; + tb->FR[h] = NULL; + tb->CFR[h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[path_offset] of L[path_offset]. */ + position = PATH_OFFSET_POSITION(path, path_offset - 1); + if (position) { + /* Current node is not the first child of its parent. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->lkey[h] = position - 1; + } else { + /* Calculate current parent of L[path_offset], which is the left neighbor of the current node. + Calculate current common parent of L[path_offset] and the current node. Note that + CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset]. + Calculate lkey[path_offset]. */ + if ((ret = get_far_parent(tb, h + 1, &curf, + &curcf, + LEFT_PARENTS)) != CARRY_ON) + return ret; + } + + brelse(tb->FL[h]); + tb->FL[h] = curf; /* New initialization of FL[h]. */ + brelse(tb->CFL[h]); + tb->CFL[h] = curcf; /* New initialization of CFL[h]. */ + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); + +/* Get parent FR[h] of R[h]. */ + +/* Current node is the last child of F[h]. FR[h] != F[h]. */ + if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { +/* Calculate current parent of R[h], which is the right neighbor of F[h]. + Calculate current common parent of R[h] and current node. Note that CFR[h] + not equal FR[path_offset] and CFR[h] not equal F[h]. 
*/ + if ((ret = + get_far_parent(tb, h + 1, &curf, &curcf, + RIGHT_PARENTS)) != CARRY_ON) + return ret; + } else { +/* Current node is not the last child of its parent F[h]. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->rkey[h] = position; + } + + brelse(tb->FR[h]); + /* New initialization of FR[path_offset]. */ + tb->FR[h] = curf; + + brelse(tb->CFR[h]); + /* New initialization of CFR[path_offset]. */ + tb->CFR[h] = curcf; + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); + + return CARRY_ON; +} + +/* it is possible to remove node as result of shiftings to + neighbors even when we insert or paste item. */ +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, + struct tree_balance *tb, int h) +{ + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); + int levbytes = tb->insert_size[h]; + struct item_head *ih; + struct reiserfs_key *r_key = NULL; + + ih = B_N_PITEM_HEAD(Sh, 0); + if (tb->CFR[h]) + r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]); + + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes + /* shifting may merge items which might save space */ + - + ((!h + && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0) + - + ((!h && r_key + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + + ((h) ? KEY_SIZE : 0)) { + /* node can not be removed */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = + B_NR_ITEMS(Sh) + + ((mode == M_INSERT) ? 1 : 0); + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + } + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); + return !NO_BALANCING_NEEDED; +} + +/* Check whether current node S[h] is balanced when increasing its size by + * Inserting or Pasting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +/* ip means Inserting or Pasting */ +static int ip_check_balance(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + int levbytes, /* Number of bytes that must be inserted into (value + is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is + that the attempted change in node space used level + is levbytes bytes. */ + ret; + + int lfree, sfree, rfree /* free space in L, S and R */ ; + + /* nver is short for number of vertixes, and lnver is the number if + we shift to the left, rnver is the number if we shift to the + right, and lrnver is the number if we shift in both directions. + The goal is to minimize first the number of vertixes, and second, + the number of vertixes whose contents are changed by shifting, + and third the number of uncached vertixes whose contents are + changed by shifting and must be read from disk. */ + int nver, lnver, rnver, lrnver; + + /* used at leaf level only, S0 = S[0] is the node being balanced, + sInum [ I = 0,1,2 ] is the number of items that will + remain in node SI after balancing. S1 and S2 are new + nodes that might be created. */ + + /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. 
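The 40-short scratch array declared just below is really eight independent five-short records, one per shifting variant; the *_SHIFT_*FLOW offsets defined further down (0, 5, ..., 35) select the record, and set_parameters() later consumes it as s0num, s1num, s2num, s1bytes, s2bytes. A sketch of that layout with illustrative names only (not a structure used by the kernel code):

/* one record of the snum012 scratch array */
struct split_plan {
        short s0num;    /* items that stay in S[0] */
        short s1num;    /* items that go to the first new node */
        short s2num;    /* items that go to the second new node */
        short s1bytes;  /* units of the item split off into S1 */
        short s2bytes;  /* units of the item split off into S2 */
};

static void read_plan(const short *snum012, int variant_offset,
                      struct split_plan *out)
{
        const short *p = snum012 + variant_offset;      /* 0, 5, 10, ..., 35 */

        out->s0num   = p[0];
        out->s1num   = p[1];
        out->s2num   = p[2];
        out->s1bytes = p[3];
        out->s2bytes = p[4];
}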
+ where 4th parameter is s1bytes and 5th - s2bytes + */ + short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases + 0,1 - do not shift and do not shift but bottle + 2 - shift only whole item to left + 3 - shift to left and bottle as much as possible + 4,5 - shift to right (whole items and as much as possible + 6,7 - shift to both directions (whole items and as much as possible) + */ + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head *Sh; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if (!Sh) { + if (!h) + reiserfs_panic(tb->tb_sb, "vs-8210", + "S[0] can not be 0"); + switch (ret = get_empty_nodes(tb, h)) { + case CARRY_ON: + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return ret; + default: + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " + "return value of get_empty_nodes"); + } + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */ + return ret; + + sfree = B_FREE_SPACE(Sh); + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == + NO_BALANCING_NEEDED) + /* and new item fits into node S[h] without any shifting */ + return NO_BALANCING_NEEDED; + + create_virtual_node(tb, h); + + /* + determine maximal number of items we can shift to the left neighbor (in tb structure) + and the maximal number of bytes that can flow to the left neighbor + from the left most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_left(tb, h, lfree); + + /* + determine maximal number of items we can shift to the right neighbor (in tb structure) + and the maximal number of bytes that can flow to the right neighbor + from the right most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_right(tb, h, rfree); + + /* all contents of internal node S[h] can be moved into its + neighbors, S[h] will be removed after balancing */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* Since we are working on internal nodes, and our internal + nodes have fixed size entries, then we can balance by the + number of items rather than the space they consume. In this + routine we set the left node equal to the right node, + allowing a difference of less than or equal to 1 child + pointer. */ + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + RFALSE(h && + (tb->lnum[h] >= vn->vn_nr_item + 1 || + tb->rnum[h] >= vn->vn_nr_item + 1), + "vs-8220: tree is not balanced on internal level"); + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), + "vs-8225: tree is not balanced on leaf level"); + + /* all contents of S[0] can be moved into its neighbors + S[0] will be removed after balancing. */ + if (!h && is_leaf_removable(tb)) + return CARRY_ON; + + /* why do we perform this check here rather than earlier?? + Answer: we can win 1 node in some cases above. 
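The to_r expression used above for internal nodes is easier to follow as plain arithmetic: L currently holds MAX_NR_KEY(Sh) + 1 - lnum child pointers, R holds MAX_NR_KEY(Sh) + 1 - rnum, S[h] contributes nr_item + 1 more, and to_r is simply how many of the grand total R must take to end up with half. A minimal sketch, not the kernel code, with the macro replaced by a parameter:

static int pointers_to_right(int max_nr_key, int lnum, int rnum, int nr_item)
{
        int left_now  = max_nr_key + 1 - lnum;  /* pointers already in L */
        int right_now = max_nr_key + 1 - rnum;  /* pointers already in R */
        int total     = left_now + right_now + nr_item + 1;

        /* R gets half of the total; L gets the remaining nr_item + 1 - to_r,
         * so the two ends differ by at most one child pointer */
        return total / 2 - right_now;
}

With hypothetical numbers max_nr_key = 10, lnum = 4, rnum = 3, nr_item = 5: left_now = 7, right_now = 8, total = 21, so to_r = 10 - 8 = 2 and the two nodes end up with 11 and 10 pointers.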
Moreover we + checked it above, when we checked, that S[0] is not removable + in principle */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + { + int lpar, rpar, nset, lset, rset, lrset; + /* + * regular overflowing of the node + */ + + /* get_num_ver works in 2 modes (FLOW & NO_FLOW) + lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) + nset, lset, rset, lrset - shows, whether flowing items give better packing + */ +#define FLOW 1 +#define NO_FLOW 0 /* do not any splitting */ + + /* we choose one the following */ +#define NOTHING_SHIFT_NO_FLOW 0 +#define NOTHING_SHIFT_FLOW 5 +#define LEFT_SHIFT_NO_FLOW 10 +#define LEFT_SHIFT_FLOW 15 +#define RIGHT_SHIFT_NO_FLOW 20 +#define RIGHT_SHIFT_FLOW 25 +#define LR_SHIFT_NO_FLOW 30 +#define LR_SHIFT_FLOW 35 + + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + /* calculate number of blocks S[h] must be split into when + nothing is shifted to the neighbors, + as well as number of items in each part of the split node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, h ? vn->vn_nr_item : 0, -1, + snum012, NO_FLOW); + + if (!h) { + int nver1; + + /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + nver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } + + /* calculate number of blocks S[h] must be split into when + l_shift_num first items and l_shift_bytes of the right most + liquid item to be shifted are shifted to the left neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, h ? vn->vn_nr_item : 0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lnver1; + + lnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, 0, -1, + snum012 + LEFT_SHIFT_FLOW, FLOW); + if (lnver > lnver1) + lset = LEFT_SHIFT_FLOW, lnver = lnver1; + } + + /* calculate number of blocks S[h] must be split into when + r_shift_num first items and r_shift_bytes of the left most + liquid item to be shifted are shifted to the right neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + rset = RIGHT_SHIFT_NO_FLOW; + rnver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int rnver1; + + rnver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + (rpar - + ((tb->rbytes != -1) ? 
1 : 0)), + tb->rbytes, + snum012 + RIGHT_SHIFT_FLOW, FLOW); + + if (rnver > rnver1) + rset = RIGHT_SHIFT_FLOW, rnver = rnver1; + } + + /* calculate number of blocks S[h] must be split into when + items are shifted in both directions, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lrset = LR_SHIFT_NO_FLOW; + lrnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lrnver1; + + lrnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, + (rpar - + ((tb->rbytes != -1) ? 1 : 0)), + tb->rbytes, + snum012 + LR_SHIFT_FLOW, FLOW); + if (lrnver > lrnver1) + lrset = LR_SHIFT_FLOW, lrnver = lrnver1; + } + + /* Our general shifting strategy is: + 1) to minimized number of new nodes; + 2) to minimized number of neighbors involved in shifting; + 3) to minimized number of disk reads; */ + + /* we can win TWO or ONE nodes by shifting in both directions */ + if (lrnver < lnver && lrnver < rnver) { + RFALSE(h && + (tb->lnum[h] != 1 || + tb->rnum[h] != 1 || + lrnver != 1 || rnver != 2 || lnver != 2 + || h != 1), "vs-8230: bad h"); + if (lrset == LR_SHIFT_FLOW) + set_parameters(tb, h, tb->lnum[h], tb->rnum[h], + lrnver, snum012 + lrset, + tb->lbytes, tb->rbytes); + else + set_parameters(tb, h, + tb->lnum[h] - + ((tb->lbytes == -1) ? 0 : 1), + tb->rnum[h] - + ((tb->rbytes == -1) ? 0 : 1), + lrnver, snum012 + lrset, -1, -1); + + return CARRY_ON; + } + + /* if shifting doesn't lead to better packing then don't shift */ + if (nver == lrnver) { + set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, + -1); + return CARRY_ON; + } + + /* now we know that for better packing shifting in only one + direction either to the left or to the right is required */ + + /* if shifting to the left is better than shifting to the right */ + if (lnver < rnver) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* if shifting to the right is better than shifting to the left */ + if (lnver > rnver) { + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } + + /* now shifting in either direction gives the same number + of nodes and we can make use of the cached neighbors */ + if (is_left_neighbor_in_cache(tb, h)) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* shift to the right independently on whether the right neighbor in cache or not */ + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } +} + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting for INTERNAL node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + * + * Note: Items of internal nodes have fixed size, so the balance condition for + * the internal part of S+tree is as for the B-trees. + */ +static int dc_check_balance_internal(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + + /* Sh is the node whose balance is currently being checked, + and Fh is its father. 
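The chain of comparisons at the end of ip_check_balance() above implements the three-step preference stated in the comment: fewest new nodes first, then fewest neighbours disturbed, then fewest disk reads. Condensed into one standalone function (names are illustrative and the set_parameters() calls are left out), the decision reads:

enum shift_variant { SHIFT_NONE, SHIFT_LEFT, SHIFT_RIGHT, SHIFT_BOTH };

static enum shift_variant choose_variant(int nver, int lnver, int rnver,
                                         int lrnver, int left_in_cache)
{
        if (lrnver < lnver && lrnver < rnver)
                return SHIFT_BOTH;      /* two-sided shift wins a node or two */
        if (nver == lrnver)
                return SHIFT_NONE;      /* shifting does not pack any better */
        if (lnver < rnver)
                return SHIFT_LEFT;      /* one-sided shift, left is cheaper */
        if (lnver > rnver)
                return SHIFT_RIGHT;
        /* same node count either way: prefer the neighbour already cached */
        return left_in_cache ? SHIFT_LEFT : SHIFT_RIGHT;
}

nver, lnver, rnver and lrnver are the node counts returned by the four pairs of get_num_ver() calls above.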
*/ + struct buffer_head *Sh, *Fh; + int maxsize, ret; + int lfree, rfree /* free space in L and R */ ; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + Fh = PATH_H_PPARENT(tb->tb_path, h); + + maxsize = MAX_CHILD_SIZE(Sh); + +/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ +/* new_nr_item = number of items node would have if operation is */ +/* performed without balancing (new_nr_item); */ + create_virtual_node(tb, h); + + if (!Fh) { /* S[h] is the root. */ + if (vn->vn_nr_item > 0) { + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + } + /* new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. */ + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to L[h]. */ + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, + -1); + return CARRY_ON; + } + + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to R[h]. */ + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, + -1); + return CARRY_ON; + } + } + + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - + tb->rnum[h] + vn->vn_nr_item + 1) / 2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, + 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + /* Current node contain insufficient number of items. Balancing is required. */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache(tb, h) + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == B_NR_ITEMS(Fh)) ? 
0 : (n + 1); + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* For internal nodes try to borrow item from a neighbor */ + RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); + + /* Borrow one or two items from caching neighbor */ + if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) { + int from_l; + + from_l = + (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1); + set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1); + return CARRY_ON; + } + + set_parameters(tb, h, 0, + -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1); + return CARRY_ON; +} + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Truncating for LEAF node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance_leaf(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + + /* Number of bytes that must be deleted from + (value is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is that the + attempted change in node space used level is levbytes bytes. */ + int levbytes; + /* the maximal item size */ + int maxsize, ret; + /* S0 is the node whose balance is currently being checked, + and F0 is its father. */ + struct buffer_head *S0, *F0; + int lfree, rfree /* free space in L and R */ ; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + F0 = PATH_H_PPARENT(tb->tb_path, 0); + + levbytes = tb->insert_size[h]; + + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if (!F0) { /* S[0] is the root now. */ + + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), + "vs-8240: attempt to create empty buffer tree"); + + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + create_virtual_node(tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable(tb, lfree, rfree)) + return CARRY_ON; + + /* determine maximal number of items we can shift to the left/right neighbor + and the maximal number of bytes that can flow to the left/right neighbor + from the left/right most liquid item that cannot be shifted from S[0] entirely + */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 
0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + + RFALSE(!tb->FL[h], + "vs-8245: dc_check_balance_leaf: FL[h] must exist"); + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ + if (is_leaf_removable(tb)) + return CARRY_ON; + + /* Balancing is not required. */ + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; +} + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance(struct tree_balance *tb, int h) +{ + RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)), + "vs-8250: S is not initialized"); + + if (h) + return dc_check_balance_internal(tb, h); + else + return dc_check_balance_leaf(tb, h); +} + +/* Check whether current node S[h] is balanced. + * Calculate parameters for balancing for current level h. + * Parameters: + * + * tb tree_balance structure: + * + * tb is a large structure that must be read about in the header file + * at the same time as this procedure if the reader is to successfully + * understand this procedure + * + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste, d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int check_balance(int mode, + struct tree_balance *tb, + int h, + int inum, + int pos_in_item, + struct item_head *ins_ih, const void *data) +{ + struct virtual_node *vn; + + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); + vn->vn_free_ptr = (char *)(tb->tb_vn + 1); + vn->vn_mode = mode; + vn->vn_affected_item_num = inum; + vn->vn_pos_in_item = pos_in_item; + vn->vn_ins_ih = ins_ih; + vn->vn_data = data; + + RFALSE(mode == M_INSERT && !vn->vn_ins_ih, + "vs-8255: ins_ih can not be 0 in insert mode"); + + if (tb->insert_size[h] > 0) + /* Calculate balance parameters when size of node is increasing. */ + return ip_check_balance(tb, h); + + /* Calculate balance parameters when size of node is decreasing. */ + return dc_check_balance(tb, h); +} + +/* Check whether parent at the path is the really parent of the current node.*/ +static int get_direct_parent(struct tree_balance *tb, int h) +{ + struct buffer_head *bh; + struct treepath *path = tb->tb_path; + int position, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + + /* We are in the root or in the new root. */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, + "PAP-8260: invalid offset in the path"); + + if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { + /* Root is not changed. 
*/ + PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL; + PATH_OFFSET_POSITION(path, path_offset - 1) = 0; + return CARRY_ON; + } + return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + } + + if (!B_IS_IN_TREE + (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) + return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ + + if ((position = + PATH_OFFSET_POSITION(path, + path_offset - 1)) > B_NR_ITEMS(bh)) + return REPEAT_SEARCH; + + if (B_N_CHILD_NUM(bh, position) != + PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) + /* Parent in the path is not parent of the current node in the tree. */ + return REPEAT_SEARCH; + + if (buffer_locked(bh)) { + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(bh); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ +} + +/* Using lnum[h] and rnum[h] we should determine what neighbors + * of S[h] we + * need in order to balance S[h], and get them if necessary. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_neighbors(struct tree_balance *tb, int h) +{ + int child_position, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1); + unsigned long son_number; + struct super_block *sb = tb->tb_sb; + struct buffer_head *bh; + + PROC_INFO_INC(sb, get_neighbors[h]); + + if (tb->lnum[h]) { + /* We need left neighbor to balance S[h]. */ + PROC_INFO_INC(sb, need_l_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FL[h] && + !PATH_OFFSET_POSITION(tb->tb_path, path_offset), + "PAP-8270: invalid position in the parent"); + + child_position = + (bh == + tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> + FL[h]); + son_number = B_N_CHILD_NUM(tb->FL[h], child_position); + reiserfs_write_unlock(sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + + RFALSE(!B_IS_IN_TREE(tb->FL[h]) || + child_position > B_NR_ITEMS(tb->FL[h]) || + B_N_CHILD_NUM(tb->FL[h], child_position) != + bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); + RFALSE(!h && + B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FL[0], child_position)), + "PAP-8290: invalid child size of left neighbor"); + + brelse(tb->L[h]); + tb->L[h] = bh; + } + + /* We need right neighbor to balance S[path_offset]. */ + if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */ + PROC_INFO_INC(sb, need_r_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FR[h] && + PATH_OFFSET_POSITION(tb->tb_path, + path_offset) >= + B_NR_ITEMS(bh), + "PAP-8295: invalid position in the parent"); + + child_position = + (bh == tb->FR[h]) ? 
tb->rkey[h] + 1 : 0; + son_number = B_N_CHILD_NUM(tb->FR[h], child_position); + reiserfs_write_unlock(sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + brelse(tb->R[h]); + tb->R[h] = bh; + + RFALSE(!h + && B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FR[0], child_position)), + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", + B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), + dc_size(B_N_CHILD(tb->FR[0], child_position))); + + } + return CARRY_ON; +} + +static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) +{ + int max_num_of_items; + int max_num_of_entries; + unsigned long blocksize = sb->s_blocksize; + +#define MIN_NAME_LEN 1 + + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / + (DEH_SIZE + MIN_NAME_LEN); + + return sizeof(struct virtual_node) + + max(max_num_of_items * sizeof(struct virtual_item), + sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + + (max_num_of_entries - 1) * sizeof(__u16)); +} + +/* maybe we should fail balancing we are going to perform when kmalloc + fails several times. But now it will loop until kmalloc gets + required memory */ +static int get_mem_for_virtual_node(struct tree_balance *tb) +{ + int check_fs = 0; + int size; + char *buf; + + size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); + + if (size > tb->vn_buf_size) { + /* we have to allocate more memory for virtual node */ + if (tb->vn_buf) { + /* free memory allocated before */ + kfree(tb->vn_buf); + /* this is not needed if kfree is atomic */ + check_fs = 1; + } + + /* virtual node requires now more memory */ + tb->vn_buf_size = size; + + /* get memory for virtual item */ + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!buf) { + /* getting memory with GFP_KERNEL priority may involve + balancing now (due to indirect_to_direct conversion on + dcache shrinking). 
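The kmalloc() above is deliberately the non-sleeping variant: GFP_ATOMIC | __GFP_NOWARN cannot schedule, so the path and neighbour buffers still held stay valid. Only when it fails does the code below release everything and retry with a blocking GFP_NOFS allocation, after which the whole search has to be repeated. The same pattern in isolation, as a sketch with a hypothetical helper name (not a function in this file):

#include <linux/slab.h>
#include <linux/types.h>

static void *grow_scratch_buf(size_t size, bool *must_repeat)
{
        /* first try: may not sleep, so held resources stay consistent */
        void *buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);

        if (!buf) {
                /* fix_node.c calls free_buffers_in_tb() at this point,
                 * dropping the path and collected buffers before sleeping */
                buf = kmalloc(size, GFP_NOFS);
                *must_repeat = true;    /* the blocking attempt may have slept */
        }
        return buf;
}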
So, release path and collected + resources here */ + free_buffers_in_tb(tb); + buf = kmalloc(size, GFP_NOFS); + if (!buf) { + tb->vn_buf_size = 0; + } + tb->vn_buf = buf; + schedule(); + return REPEAT_SEARCH; + } + + tb->vn_buf = buf; + } + + if (check_fs && FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + return CARRY_ON; +} + +#ifdef CONFIG_REISERFS_CHECK +static void tb_buffer_sanity_check(struct super_block *sb, + struct buffer_head *bh, + const char *descr, int level) +{ + if (bh) { + if (atomic_read(&(bh->b_count)) <= 0) + + reiserfs_panic(sb, "jmacd-1", "negative or zero " + "reference counter for buffer %s[%d] " + "(%b)", descr, level, bh); + + if (!buffer_uptodate(bh)) + reiserfs_panic(sb, "jmacd-2", "buffer is not up " + "to date %s[%d] (%b)", + descr, level, bh); + + if (!B_IS_IN_TREE(bh)) + reiserfs_panic(sb, "jmacd-3", "buffer is not " + "in tree %s[%d] (%b)", + descr, level, bh); + + if (bh->b_bdev != sb->s_bdev) + reiserfs_panic(sb, "jmacd-4", "buffer has wrong " + "device %s[%d] (%b)", + descr, level, bh); + + if (bh->b_size != sb->s_blocksize) + reiserfs_panic(sb, "jmacd-5", "buffer has wrong " + "blocksize %s[%d] (%b)", + descr, level, bh); + + if (bh->b_blocknr > SB_BLOCK_COUNT(sb)) + reiserfs_panic(sb, "jmacd-6", "buffer block " + "number too high %s[%d] (%b)", + descr, level, bh); + } +} +#else +static void tb_buffer_sanity_check(struct super_block *sb, + struct buffer_head *bh, + const char *descr, int level) +{; +} +#endif + +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) +{ + return reiserfs_prepare_for_journal(s, bh, 0); +} + +static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) +{ + struct buffer_head *locked; +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + int i; + + do { + + locked = NULL; + + for (i = tb->tb_path->path_length; + !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { + if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { + /* if I understand correctly, we can only be sure the last buffer + ** in the path is in the tree --clm + */ +#ifdef CONFIG_REISERFS_CHECK + if (PATH_PLAST_BUFFER(tb->tb_path) == + PATH_OFFSET_PBUFFER(tb->tb_path, i)) + tb_buffer_sanity_check(tb->tb_sb, + PATH_OFFSET_PBUFFER + (tb->tb_path, + i), "S", + tb->tb_path-> + path_length - i); +#endif + if (!clear_all_dirty_bits(tb->tb_sb, + PATH_OFFSET_PBUFFER + (tb->tb_path, + i))) { + locked = + PATH_OFFSET_PBUFFER(tb->tb_path, + i); + } + } + } + + for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i]; + i++) { + + if (tb->lnum[i]) { + + if (tb->L[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->L[i], + "L", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->L[i])) + locked = tb->L[i]; + } + + if (!locked && tb->FL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FL[i], + "FL", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FL[i])) + locked = tb->FL[i]; + } + + if (!locked && tb->CFL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFL[i], + "CFL", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->CFL[i])) + locked = tb->CFL[i]; + } + + } + + if (!locked && (tb->rnum[i])) { + + if (tb->R[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->R[i], + "R", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->R[i])) + locked = tb->R[i]; + } + + if (!locked && tb->FR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FR[i], + "FR", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FR[i])) + locked = tb->FR[i]; + } + + if (!locked && tb->CFR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFR[i], + "CFR", i); + if (!clear_all_dirty_bits 
+ (tb->tb_sb, tb->CFR[i])) + locked = tb->CFR[i]; + } + } + } + /* as far as I can tell, this is not required. The FEB list seems + ** to be full of newly allocated nodes, which will never be locked, + ** dirty, or anything else. + ** To be safe, I'm putting in the checks and waits in. For the moment, + ** they are needed to keep the code in journal.c from complaining + ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. + ** --clm + */ + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FEB[i])) + locked = tb->FEB[i]; + } + } + + if (locked) { +#ifdef CONFIG_REISERFS_CHECK + repeat_counter++; + if ((repeat_counter % 10000) == 0) { + reiserfs_warning(tb->tb_sb, "reiserfs-8200", + "too many iterations waiting " + "for buffer to unlock " + "(%b)", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return (FILESYSTEM_CHANGED_TB(tb)) ? + REPEAT_SEARCH : CARRY_ON; + } +#endif + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(locked); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + } while (locked); + + return CARRY_ON; +} + +/* Prepare for balancing, that is + * get all necessary parents, and neighbors; + * analyze what and where should be moved; + * get sufficient number of new nodes; + * Balancing will start only after all resources will be collected at a time. + * + * When ported to SMP kernels, only at the last moment after all needed nodes + * are collected in cache, will the resources be locked using the usual + * textbook ordered lock acquisition algorithms. Note that ensuring that + * this code neither write locks what it does not need to write lock nor locks out of order + * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans + * + * fix is meant in the sense of render unchanging + * + * Latency might be improved by first gathering a list of what buffers are needed + * and then getting as many of them in parallel as possible? -Hans + * + * Parameters: + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) + * tb tree_balance structure; + * inum item number in S[h]; + * pos_in_item - comment this if you can + * ins_ih item head of item being inserted + * data inserted item or data to be pasted + * Returns: 1 - schedule occurred while the function worked; + * 0 - schedule didn't occur while the function worked; + * -1 - if no_disk_space + */ + +int fix_nodes(int op_mode, struct tree_balance *tb, + struct item_head *ins_ih, const void *data) +{ + int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item; + + /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared + ** during wait_tb_buffers_run + */ + int wait_tb_buffers_run = 0; + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; + + pos_in_item = tb->tb_path->pos_in_item; + + tb->fs_gen = get_generation(tb->tb_sb); + + /* we prepare and log the super here so it will already be in the + ** transaction when do_balance needs to change it. 
+ ** This way do_balance won't have to schedule when trying to prepare + ** the super for logging + */ + reiserfs_prepare_for_journal(tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb), 1); + journal_mark_dirty(tb->transaction_handle, tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb)); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + /* if it possible in indirect_to_direct conversion */ + if (buffer_locked(tbS0)) { + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(tbS0); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + print_cur_tb("fix_nodes"); + reiserfs_panic(tb->tb_sb, "PAP-8305", + "there is pending do_balance"); + } + + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " + "not uptodate at the beginning of fix_nodes " + "or not in tree (mode %c)", + tbS0, tbS0, op_mode); + + /* Check parameters. */ + switch (op_mode) { + case M_INSERT: + if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " + "item number %d (in S0 - %d) in case " + "of insert", item_num, + B_NR_ITEMS(tbS0)); + break; + case M_PASTE: + case M_DELETE: + case M_CUT: + if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) { + print_block(tbS0, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " + "item number(%d); mode = %c " + "insert_size = %d", + item_num, op_mode, + tb->insert_size[0]); + } + break; + default: + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " + "of operation"); + } +#endif + + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) + // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat + return REPEAT_SEARCH; + + /* Starting from the leaf level; for all levels h of the tree. */ + for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) { + ret = get_direct_parent(tb, h); + if (ret != CARRY_ON) + goto repeat; + + ret = check_balance(op_mode, tb, h, item_num, + pos_in_item, ins_ih, data); + if (ret != CARRY_ON) { + if (ret == NO_BALANCING_NEEDED) { + /* No balancing for higher levels needed. */ + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + if (h != MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + /* ok, analysis and resource gathering are complete */ + break; + } + goto repeat; + } + + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + + /* No disk space, or schedule occurred and analysis may be + * invalid and needs to be redone. */ + ret = get_empty_nodes(tb, h); + if (ret != CARRY_ON) + goto repeat; + + if (!PATH_H_PBUFFER(tb->tb_path, h)) { + /* We have a positive insert size but no nodes exist on this + level, this means that we are creating a new root. */ + + RFALSE(tb->blknum[h] != 1, + "PAP-8350: creating new empty root"); + + if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { + if (tb->blknum[h] > 1) { + /* The tree needs to be grown, so this node S[h] + which is the root node is split into two nodes, + and a new node (S[h+1]) will be created to + become the root node. 
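The insert_size[h + 1] bookkeeping computed a few lines below follows directly from the internal-node layout: every extra block that S[h] splits into costs the level above one key plus one disk_child pointer, and a brand-new root additionally needs the pointer to S[h] itself. A sketch of that arithmetic; KEY_SIZE = 16 and DC_SIZE = 8 in the worked example are quoted as the usual on-disk sizes, not taken from this hunk:

/* bytes the level above must absorb when S[h] ends up as 'blknum' blocks;
 * new_root is non-zero when S[h + 1] does not exist yet */
static int bytes_needed_above(int blknum, int new_root,
                              int key_size, int dc_size)
{
        int extra = (key_size + dc_size) * (blknum - 1);

        if (new_root)
                extra += dc_size;       /* pointer to S[h] in the new root */
        return extra;
}

Splitting the old root into two blocks, for instance, asks the new root for (16 + 8) * 1 + 8 = 32 bytes.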
*/ + + RFALSE(h == MAX_HEIGHT - 1, + "PAP-8355: attempt to create too high of a tree"); + + tb->insert_size[h + 1] = + (DC_SIZE + + KEY_SIZE) * (tb->blknum[h] - 1) + + DC_SIZE; + } else if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else + tb->insert_size[h + 1] = + (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1); + } + + ret = wait_tb_buffers_until_unlocked(tb); + if (ret == CARRY_ON) { + if (FILESYSTEM_CHANGED_TB(tb)) { + wait_tb_buffers_run = 1; + ret = REPEAT_SEARCH; + goto repeat; + } else { + return CARRY_ON; + } + } else { + wait_tb_buffers_run = 1; + goto repeat; + } + + repeat: + // fix_nodes was unable to perform its calculation due to + // filesystem got changed under us, lack of free disk space or i/o + // failure. If the first is the case - the search will be + // repeated. For now - free all resources acquired so far except + // for the new allocated nodes + { + int i; + + /* Release path buffers. */ + if (wait_tb_buffers_run) { + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + } else { + pathrelse(tb->tb_path); + } + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + if (wait_tb_buffers_run) { + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFR[i]); + } + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } + + if (wait_tb_buffers_run) { + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) + reiserfs_restore_prepared_buffer + (tb->tb_sb, tb->FEB[i]); + } + } + return ret; + } + +} + +/* Anatoly will probably forgive me renaming tb to tb. I just + wanted to make lines shorter */ +void unfix_nodes(struct tree_balance *tb) +{ + int i; + + /* Release path buffers. 
*/ + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + } + + /* deal with list of allocated (used and unused) nodes */ + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; + /* de-allocated block which was not used by balancing and + bforget about buffer for it */ + brelse(tb->FEB[i]); + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + if (tb->used[i]) { + /* release used as new nodes including a new root */ + brelse(tb->used[i]); + } + } + + kfree(tb->vn_buf); + +} diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c new file mode 100644 index 00000000..6471c670 --- /dev/null +++ b/fs/reiserfs/hashes.c @@ -0,0 +1,182 @@ + +/* + * Keyed 32-bit hash function using TEA in a Davis-Meyer function + * H0 = Key + * Hi = E Mi(Hi-1) + Hi-1 + * + * (see Applied Cryptography, 2nd edition, p448). + * + * Jeremy Fitzhardinge 1998 + * + * Jeremy has agreed to the contents of reiserfs/README. -Hans + * Yura's function is added (04/07/2000) + */ + +// +// keyed_hash +// yura_hash +// r5_hash +// + +#include +#include +#include + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + +/* a, b, c, d - data; h0, h1 - accumulated hash */ +#define TEACORE(rounds) \ + do { \ + u32 sum = 0; \ + int n = rounds; \ + u32 b0, b1; \ + \ + b0 = h0; \ + b1 = h1; \ + \ + do \ + { \ + sum += DELTA; \ + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ + } while(--n); \ + \ + h0 += b0; \ + h1 += b1; \ + } while(0) + +u32 keyed_hash(const signed char *msg, int len) +{ + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; + + u32 h0 = k[0], h1 = k[1]; + u32 a, b, c, d; + u32 pad; + int i; + + // assert(len >= 0 && len < 256); + + pad = (u32) len | ((u32) len << 8); + pad |= pad << 16; + + while (len >= 16) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + d = (u32) msg[12] | + (u32) msg[13] << 8 | + (u32) msg[14] << 16 | (u32) msg[15] << 24; + + TEACORE(PARTROUNDS); + + len -= 16; + msg += 16; + } + + if (len >= 12) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + + d = pad; + for (i = 12; i < len; i++) { + d <<= 8; + d |= msg[i]; + } + } else if (len >= 8) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + + c = d = pad; + for (i = 8; i < len; i++) { + c <<= 8; + c |= msg[i]; + } + } else if (len >= 4) { + a 
= (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + + b = c = d = pad; + for (i = 4; i < len; i++) { + b <<= 8; + b |= msg[i]; + } + } else { + a = b = c = d = pad; + for (i = 0; i < len; i++) { + a <<= 8; + a |= msg[i]; + } + } + + TEACORE(FULLROUNDS); + +/* return 0;*/ + return h0 ^ h1; +} + +/* What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README */ + +u32 yura_hash(const signed char *msg, int len) +{ + int j, pow; + u32 a, c; + int i; + + for (pow = 1, i = 1; i < len; i++) + pow = pow * 10; + + if (len == 1) + a = msg[0] - 48; + else + a = (msg[0] - 48) * pow; + + for (i = 1; i < len; i++) { + c = msg[i] - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 40; i++) { + c = '0' - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; i++) { + c = i; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; +} + +u32 r5_hash(const signed char *msg, int len) +{ + u32 a = 0; + while (*msg) { + a += *msg << 4; + a += *msg >> 4; + a *= 11; + msg++; + } + return a; +} diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c new file mode 100644 index 00000000..2074fd95 --- /dev/null +++ b/fs/reiserfs/ibalance.c @@ -0,0 +1,1089 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +/* this is one and only function that is used outside (do_balance.c) */ +int balance_internal(struct tree_balance *, + int, int, struct item_head *, struct buffer_head **); + +/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ +#define INTERNAL_SHIFT_FROM_S_TO_L 0 +#define INTERNAL_SHIFT_FROM_R_TO_S 1 +#define INTERNAL_SHIFT_FROM_L_TO_S 2 +#define INTERNAL_SHIFT_FROM_S_TO_R 3 +#define INTERNAL_INSERT_TO_S 4 +#define INTERNAL_INSERT_TO_L 5 +#define INTERNAL_INSERT_TO_R 6 + +static void internal_define_dest_src_infos(int shift_mode, + struct tree_balance *tb, + int h, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *d_key, struct buffer_head **cf) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position(tb, 
h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + break; + + case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + break; + + default: + reiserfs_panic(tb->tb_sb, "ibalance-1", + "shift type is unknown (%d)", + shift_mode); + } +} + +/* Insert count node pointers into buffer cur before position to + 1. + * Insert count items into buffer cur before position to. + * Items and node pointers are specified by inserted and bh respectively. + */ +static void internal_insert_childs(struct buffer_info *cur_bi, + int to, int count, + struct item_head *inserted, + struct buffer_head **bh) +{ + struct buffer_head *cur = cur_bi->bi_bh; + struct block_head *blkh; + int nr; + struct reiserfs_key *ih; + struct disk_child new_dc[2]; + struct disk_child *dc; + int i; + + if (count <= 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + RFALSE(count > 2, "too many children (%d) are to be inserted", count); + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE), + "no enough free space (%d), needed %d bytes", + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE)); + + /* prepare space for count disk_child */ + dc = B_N_CHILD(cur, to + 1); + + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE); + + /* copy to_be_insert disk children */ + for (i = 0; i < count; i++) { + put_dc_size(&(new_dc[i]), + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); + put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr); + } + memcpy(dc, new_dc, DC_SIZE * count); + + /* prepare space for count items */ + ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 
0 : to)); + + memmove(ih + count, ih, + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); + + /* copy item headers (keys) */ + memcpy(ih, inserted, KEY_SIZE); + if (count > 1) + memcpy(ih + 1, inserted + 1, KEY_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - count * (DC_SIZE + + KEY_SIZE)); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* Delete del_num items and node pointers from buffer cur starting from * + * the first_i'th item and first_p'th pointers respectively. */ +static void internal_delete_pointers_items(struct buffer_info *cur_bi, + int first_p, + int first_i, int del_num) +{ + struct buffer_head *cur = cur_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + RFALSE(cur == NULL, "buffer is 0"); + RFALSE(del_num < 0, + "negative number of items (%d) can not be deleted", del_num); + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1 + || first_i < 0, + "first pointer order (%d) < 0 or " + "no so many pointers (%d), only (%d) or " + "first key order %d < 0", first_p, first_p + del_num, + B_NR_ITEMS(cur) + 1, first_i); + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + if (first_p == 0 && del_num == nr + 1) { + RFALSE(first_i != 0, + "1st deleted key must have order 0, not %d", first_i); + make_empty_node(cur_bi); + return; + } + + RFALSE(first_i + del_num > B_NR_ITEMS(cur), + "first_i = %d del_num = %d " + "no so many keys (%d) in the node (%b)(%z)", + first_i, del_num, first_i + del_num, cur, cur); + + /* deleting */ + dc = B_N_CHILD(cur, first_p); + + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); + key = B_N_PDELIM_KEY(cur, first_i); + memmove(key, key + del_num, + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - + del_num) * DC_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + + (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } +} + +/* delete n node pointers and items starting from given position */ +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) +{ + int i_from; + + i_from = (from == 0) ? 
from : from - 1; + + /* delete n pointers starting from `from' position in CUR; + delete n keys starting from 'i_from' position in CUR; + */ + internal_delete_pointers_items(cur_bi, from, i_from, n); +} + +/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest +* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest + */ +static void internal_copy_pointers_items(struct buffer_info *dest_bi, + struct buffer_head *src, + int last_first, int cpy_num) +{ + /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST * + * as delimiting key have already inserted to buffer dest.*/ + struct buffer_head *dest = dest_bi->bi_bh; + int nr_dest, nr_src; + int dest_order, src_order; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + nr_src = B_NR_ITEMS(src); + + RFALSE(dest == NULL || src == NULL, + "src (%p) or dest (%p) buffer is 0", src, dest); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "invalid last_first parameter (%d)", last_first); + RFALSE(nr_src < cpy_num - 1, + "no so many items (%d) in src (%d)", cpy_num, nr_src); + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); + + if (cpy_num == 0) + return; + + /* coping */ + blkh = B_BLK_HEAD(dest); + nr_dest = blkh_nr_item(blkh); + + /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */ + /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */ + (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = + nr_src - cpy_num + 1) : (dest_order = + nr_dest, + src_order = + 0); + + /* prepare space for cpy_num pointers */ + dc = B_N_CHILD(dest, dest_order); + + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); + + /* insert pointers */ + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); + + /* prepare space for cpy_num - 1 item headers */ + key = B_N_PDELIM_KEY(dest, dest_order); + memmove(key + cpy_num - 1, key, + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + + cpy_num)); + + /* insert headers */ + memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. + * Delete cpy_num - del_par items and node pointers from buffer src. + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. 
+ */ +static void internal_move_pointers_items(struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int last_first, int cpy_num, + int del_par) +{ + int first_pointer; + int first_item; + + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first, + cpy_num); + + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ + first_pointer = 0; + first_item = 0; + /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, + for key - with first_item */ + internal_delete_pointers_items(src_bi, first_pointer, + first_item, cpy_num - del_par); + } else { /* shift_right occurs */ + int i, j; + + i = (cpy_num - del_par == + (j = + B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num + + del_par; + + internal_delete_pointers_items(src_bi, + j + 1 - cpy_num + del_par, i, + cpy_num - del_par); + } +} + +/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ +static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */ + struct buffer_head *src, int src_position) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + + RFALSE(dest == NULL || src == NULL, + "source(%p) or dest(%p) buffer is 0", src, dest); + RFALSE(dest_position_before < 0 || src_position < 0, + "source(%d) or dest(%d) key number less than 0", + src_position, dest_position_before); + RFALSE(dest_position_before > B_NR_ITEMS(dest) || + src_position >= B_NR_ITEMS(src), + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", + dest_position_before, B_NR_ITEMS(dest), + src_position, B_NR_ITEMS(src)); + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE, + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest)); + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + + /* prepare space for inserting key */ + key = B_N_PDELIM_KEY(dest, dest_position_before); + memmove(key + 1, key, + (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); + + /* insert key */ + memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); + + /* Change dirt, free space, item number fields. */ + + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfl. + * Delete pointer_amount items and node pointers from buffer src. 
+ */ +/* this can be invoked both to shift from S to L and from R to S */ +static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + /*printk("pointer_amount = %d\n",pointer_amount); */ + + if (pointer_amount) { + /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { + if (src_bi.bi_position /*src->b_item_order */ == 0) + replace_key(tb, cf, d_key_position, + src_bi. + bi_parent /*src->b_parent */ , 0); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + pointer_amount - 1); + } + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 0); + +} + +/* Insert delimiting key to L[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shifts from S[h] to L[h] */ +static void internal_shift1_left(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */ + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 1); + /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */ +} + +/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. + * Copy n node pointers and n - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfr. + * Delete n items and node pointers from buffer src. + */ +static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + int nr; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + nr = B_NR_ITEMS(src_bi.bi_bh); + + if (pointer_amount > 0) { + /* insert delimiting key from common father of dest and src to dest node into position 0 */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { + RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || + dest_bi.bi_bh != tb->R[h], + "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h)); + /* when S[h] disappers replace left delemiting key as well */ + if (tb->CFL[h]) + replace_key(tb, cf, d_key_position, tb->CFL[h], + tb->lkey[h]); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + nr - pointer_amount); + } + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 0); +} + +/* Insert delimiting key to R[h]. 
+ * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shift from S[h] to R[h] */ +static void internal_shift1_right(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */ + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 1); + /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */ +} + +/* Delete insert_num node pointers together with their left items + * and balance current node.*/ +static void balance_internal_when_delete(struct tree_balance *tb, + int h, int child_pos) +{ + int insert_num; + int n; + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + internal_delete_childs(&bi, child_pos, -insert_num); + + RFALSE(tb->blknum[h] > 1, + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); + + n = B_NR_ITEMS(tbSh); + + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) { + if (tb->blknum[h] == 0) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + + RFALSE(n + || B_FREE_SPACE(tbSh) != + MAX_CHILD_SIZE(tbSh) - DC_SIZE, + "buffer must have only 0 keys (%d)", n); + RFALSE(bi.bi_parent, "root has parent (%p)", + bi.bi_parent); + + /* choose a new root */ + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1])) + new_root = tb->R[h - 1]; + else + new_root = tb->L[h - 1]; + /* switch super block's tree root block number to the new value */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); + //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; + PUT_SB_TREE_HEIGHT(tb->tb_sb, + SB_TREE_HEIGHT(tb->tb_sb) - 1); + + do_balance_mark_sb_dirty(tb, + REISERFS_SB(tb->tb_sb)->s_sbh, + 1); + /*&&&&&&&&&&&&&&&&&&&&&& */ + if (h > 1) + /* use check_internal if new root is an internal node */ + check_internal(new_root); + /*&&&&&&&&&&&&&&&&&&&&&& */ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */ + + RFALSE(tb->rnum[h] != 0, + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */ + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); + + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + + if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */ + RFALSE(tb->rnum[h] != 0, + "wrong tb->rnum[%d]==%d when borrow from L[h]", h, + tb->rnum[h]); + /*internal_shift_right 
(tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, + -tb->lnum[h]); + return; + } + + if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */ + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */ + return; + } + + if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */ + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + reiserfs_panic(tb->tb_sb, "ibalance-2", + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} + +/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL, + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", + tb->L[h], tb->CFL[h]); + + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; + + memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); +} + +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL, + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", + tb->R[h], tb->CFR[h]); + RFALSE(B_NR_ITEMS(tb->R[h]) == 0, + "R[h] can not be empty if it exists (item number=%d)", + B_NR_ITEMS(tb->R[h])); + + memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); +} + +int balance_internal(struct tree_balance *tb, /* tree_balance structure */ + int h, /* level of the tree */ + int child_pos, struct item_head *insert_key, /* key for insertion on higher level */ + struct buffer_head **insert_ptr /* node for insertion on higher level */ + ) + /* if inserting/pasting + { + child_pos is the position of the node-pointer in S[h] that * + pointed to S[h-1] before balancing of the h-1 level; * + this means that new pointers and items must be inserted AFTER * + child_pos + } + else + { + it is the position of the leftmost pointer that must be deleted (together with + its corresponding key to the left of the pointer) + as a result of the previous level's balancing. + } + */ +{ + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ + int insert_num, n, k; + struct buffer_head *S_new; + struct item_head new_insert_key; + struct buffer_head *new_insert_ptr = NULL; + struct item_head *new_insert_key_addr = insert_key; + + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); + + PROC_INFO_INC(tb->tb_sb, balance_at[h]); + + order = + (tbSh) ? 
PATH_H_POSITION(tb->tb_path, + h + 1) /*tb->S[h]->b_item_order */ : 0; + + /* Using insert_size[h] calculate the number insert_num of items + that must be inserted to or deleted from S[h]. */ + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper * */ + RFALSE(insert_num < -2 || insert_num > 2, + "incorrect number of items inserted to the internal node (%d)", + insert_num); + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); + + /* Make balance in case insert_num < 0 */ + if (insert_num < 0) { + balance_internal_when_delete(tb, h, child_pos); + return order; + } + + k = 0; + if (tb->lnum[h] > 0) { + /* shift lnum[h] items from S[h] to the left neighbor L[h]. + check how many of new items fall into L[h] or CFL[h] after + shifting */ + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ + if (tb->lnum[h] <= child_pos) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h]); + /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */ + child_pos -= tb->lnum[h]; + } else if (tb->lnum[h] > child_pos + insert_num) { + /* all new items fall into L[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h] - insert_num); + /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, + tb->lnum[h]-insert_num); + */ + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next */ + n + child_pos + 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into L[h] or CFL[h], but some don't fall */ + internal_shift1_left(tb, h, child_pos + 1); + /* calculate number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next, */ + n + child_pos + 1, k, + insert_key, insert_ptr); + + replace_lkey(tb, h, insert_key + k); + + /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + dc = B_N_CHILD(tbSh, 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr[k]) - + B_FREE_SPACE(insert_ptr[k])); + put_dc_block_number(dc, insert_ptr[k]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } + /* tb->lnum[h] > 0 */ + if (tb->rnum[h] > 0) { + /*shift rnum[h] items from S[h] to the right neighbor R[h] */ + /* check how many of new items fall into R or CFR after shifting */ + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + if (n - tb->rnum[h] >= child_pos) + /* new items fall into S[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + else if (n + insert_num - tb->rnum[h] < child_pos) { + /* all new items fall into R[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], + tb->rnum[h] - insert_num); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + 
bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h],tb->S[h-1]->b_next */ + child_pos - n - insert_num + + tb->rnum[h] - 1, + insert_num, insert_key, + insert_ptr); + insert_num = 0; + } else { + struct disk_child *dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb, h, n - child_pos + 1); + /* calculate number of new items that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h], tb->R[h]->b_child, */ + 0, k, insert_key + 1, + insert_ptr + 1); + + replace_rkey(tb, h, insert_key + insert_num - k - 1); + + /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */ + dc = B_N_CHILD(tb->R[h], 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr + [insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1])); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tb->R[h], 0); + + insert_num -= (k + 1); + } + } + + /** Fill new node that appears instead of S[h] **/ + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); + + if (!tb->blknum[h]) { /* node S[h] is empty now */ + RFALSE(!tbSh, "S[h] is equal NULL"); + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return order; + } + + if (!tbSh) { + /* create new root */ + struct disk_child *dc; + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); + struct block_head *blkh; + + if (tb->blknum[h] != 1) + reiserfs_panic(NULL, "ibalance-3", "One new node " + "required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB(tb); + blkh = B_BLK_HEAD(tbSh); + set_blkh_level(blkh, h + 1); + + /* Put the unique node-pointer to S[h] that points to S[h-1]. */ + + dc = B_N_CHILD(tbSh, 0); + put_dc_block_number(dc, tbSh_1->b_blocknr); + put_dc_size(dc, + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); + + tb->insert_size[h] -= DC_SIZE; + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = + tbSh; + + /* Change root in structure super block. 
*/ + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); + } + + if (tb->blknum[h] == 2) { + int snum; + struct buffer_info dest_bi, src_bi; + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + set_blkh_level(B_BLK_HEAD(S_new), h + 1); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = NULL; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1) / 2; + if (n - snum >= child_pos) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, snum, 0); + /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */ + } else if (n + insert_num - snum < child_pos) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + memcpy(&new_insert_key, + B_N_PDELIM_KEY(tbSh, n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + snum - insert_num, 0); + /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */ + + /* insert insert_num keys and node-pointers into S_new */ + internal_insert_childs(&dest_bi, + /*S_new,tb->S[h-1]->b_next, */ + child_pos - n - insert_num + + snum - 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + n - child_pos + 1, 1); + /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */ + /* calculate number of new items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs(&dest_bi, /*S_new, */ 0, k, + insert_key + 1, insert_ptr + 1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key, insert_key + insert_num - k - 1, + KEY_SIZE); + /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + + dc = B_N_CHILD(S_new, 0); + put_dc_size(dc, + (MAX_CHILD_SIZE + (insert_ptr[insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1]))); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, S_new, 0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", + S_new); + + // S_new is released in unfix_nodes + } + + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ + + if (0 <= child_pos && child_pos <= n && insert_num > 0) { + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + internal_insert_childs(&bi, /*tbSh, */ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ + child_pos, insert_num, insert_key, + insert_ptr); + } + + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); + insert_ptr[0] = new_insert_ptr; + + return order; +} diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c new file mode 100644 index 00000000..4fd5bb33 --- /dev/null +++ b/fs/reiserfs/inode.c @@ -0,0 +1,3225 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +void reiserfs_evict_inode(struct inode *inode) +{ + /* We need blocks for transaction + (user+group) quota update (possibly delete) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + int depth; + int err; + + if (!inode->i_nlink && !is_bad_inode(inode)) + dquot_initialize(inode); + + truncate_inode_pages(&inode->i_data, 0); + if (inode->i_nlink) + goto no_delete; + + depth = reiserfs_write_lock_once(inode->i_sb); + + /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + reiserfs_delete_xattrs(inode); + + if (journal_begin(&th, inode->i_sb, jbegin_count)) + goto out; + reiserfs_update_inode_transaction(inode); + + reiserfs_discard_prealloc(&th, inode); + + err = reiserfs_delete_object(&th, inode); + + /* Do quota update inside a transaction for journaled quotas. We must do that + * after delete_object so that quota updates go into the same transaction as + * stat data deletion */ + if (!err) + dquot_free_inode(inode); + + if (journal_end(&th, inode->i_sb, jbegin_count)) + goto out; + + /* check return value from reiserfs_delete_object after + * ending the transaction + */ + if (err) + goto out; + + /* all items of file are deleted, so we can remove "save" link */ + remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything + * about an error here */ + } else { + /* no object items are in the tree */ + ; + } + out: + end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ + dquot_drop(inode); + inode->i_blocks = 0; + reiserfs_write_unlock_once(inode->i_sb, depth); + return; + +no_delete: + end_writeback(inode); + dquot_drop(inode); +} + +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, + __u32 objectid, loff_t offset, int type, int length) +{ + key->version = version; + + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset(key, offset); + set_cpu_key_k_type(key, type); + key->key_length = length; +} + +/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set + offset and type of key */ +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, + int type, int length) +{ + _make_cpu_key(key, get_inode_item_key_version(inode), + le32_to_cpu(INODE_PKEY(inode)->k_dir_id), + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, + length); +} + +// +// when key is 0, do not set version and short key +// +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, + int entry_count /*or 
ih_free_space */ ) +{ + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); + ih->ih_key.k_objectid = + cpu_to_le32(key->on_disk_key.k_objectid); + } + put_ih_version(ih, version); + set_le_ih_k_offset(ih, offset); + set_le_ih_k_type(ih, type); + put_ih_item_len(ih, length); + /* set_ih_free_space (ih, 0); */ + // for directory items it is entry count, for directs and stat + // datas - 0xffff, for indirects - 0 + put_ih_entry_count(ih, entry_count); +} + +// +// FIXME: we might cache recently accessed indirect item + +// Ugh. Not too eager for that.... +// I cut the code until such time as I see a convincing argument (benchmark). +// I don't want a bloated inode struct..., and I don't like code complexity.... + +/* cutting the code is fine, since it really isn't in use yet and is easy +** to add back in. But, Vladimir has a really good idea here. Think +** about what happens for reading a file. For each page, +** The VFS layer calls reiserfs_readpage, who searches the tree to find +** an indirect item. This indirect item has X number of pointers, where +** X is a big number if we've done the block allocation right. But, +** we only use one or two of these pointers during each call to readpage, +** needlessly researching again later on. +** +** The size of the cache could be dynamic based on the size of the file. +** +** I'd also like to see us cache the location the stat data item, since +** we are needlessly researching for that frequently. +** +** --chris +*/ + +/* If this page has a file tail in it, and +** it was read in by get_block_create_0, the page data is valid, +** but tail is still sitting in a direct item, and we can't write to +** it. So, look through this page, and check all the mapped buffers +** to make sure they have valid block numbers. Any that don't need +** to be unmapped, so that __block_write_begin will correctly call +** reiserfs_get_block to convert the tail into an unformatted node +*/ +static inline void fix_tail_page_for_writing(struct page *page) +{ + struct buffer_head *head, *next, *bh; + + if (page && page_has_buffers(page)) { + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } +} + +/* reiserfs_get_block does not need to allocate a block only if it has been + done already or non-hole position has been found in the indirect item */ +static inline int allocation_needed(int retval, b_blocknr_t allocated, + struct item_head *ih, + __le32 * item, int pos_in_item) +{ + if (allocated) + return 0; + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && + get_block_num(item, pos_in_item)) + return 0; + return 1; +} + +static inline int indirect_item_found(int retval, struct item_head *ih) +{ + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); +} + +static inline void set_block_dev_mapped(struct buffer_head *bh, + b_blocknr_t block, struct inode *inode) +{ + map_bh(bh, inode->i_sb, block); +} + +// +// files which were created in the earlier version can not be longer, +// than 2 gb +// +static int file_capable(struct inode *inode, sector_t block) +{ + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. 
+ block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb + return 1; + + return 0; +} + +static int restart_transaction(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct super_block *s = th->t_super; + int len = th->t_blocks_allocated; + int err; + + BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_refcount); + + pathrelse(path); + + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return 0; + } + reiserfs_update_sd(th, inode); + err = journal_end(th, s, len); + if (!err) { + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); + if (!err) + reiserfs_update_inode_transaction(inode); + } + return err; +} + +// it is called by get_block when create == 0. Returns block number +// for 'block'-th logical block of file. When it hits direct item it +// returns 0 (being called from bmap) or read direct item into piece +// of page (bh_result) + +// Please improve the english/clarity in the comment above, as it is +// hard to understand. + +static int _get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int args) +{ + INITIALIZE_PATH(path); + struct cpu_key key; + struct buffer_head *bh; + struct item_head *ih, tmp_ih; + b_blocknr_t blocknr; + char *p = NULL; + int chars; + int ret; + int result; + int done = 0; + unsigned long offset; + + // prepare the key to look for the 'block'-th block of file + make_cpu_key(&key, inode, + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, + 3); + + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) { + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + if (result == IO_ERROR) + return -EIO; + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + return -ENOENT; + } + return 0; + } + // + bh = get_last_bh(&path); + ih = get_ih(&path); + if (is_indirect_le_ih(ih)) { + __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); + + /* FIXME: here we could cache indirect item or part of it in + the inode to avoid search_by_key in case of subsequent + access to file */ + blocknr = get_block_num(ind_item, path.pos_in_item); + ret = 0; + if (blocknr) { + map_bh(bh_result, inode->i_sb, blocknr); + if (path.pos_in_item == + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { + set_buffer_boundary(bh_result); + } + } else + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + ret = -ENOENT; + } + + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return ret; + } + // requested data are in direct item(s) + if (!(args & GET_BLOCK_READ_DIRECT)) { + // we are called by bmap. FIXME: we can not map block of file + // when it is stored in direct item(s) + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return -ENOENT; + } + + /* if we've got a direct item, and the buffer or page was uptodate, + ** we don't want to pull data off disk again. skip to the + ** end, where we map the buffer and return + */ + if (buffer_uptodate(bh_result)) { + goto finished; + } else + /* + ** grab_tail_page can trigger calls to reiserfs_get_block on up to date + ** pages without any buffers. 
If the page is up to date, we don't want + ** read old data off disk. Set the up to date bit on the buffer instead + ** and jump to the end + */ + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { + set_buffer_uptodate(bh_result); + goto finished; + } + // read file tail into part of page + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); + copy_item_head(&tmp_ih, ih); + + /* we only want to kmap if we are reading the tail into the page. + ** this is not the common case, so we don't kmap until we are + ** sure we need to. But, this means the item might move if + ** kmap schedules + */ + if (!p) + p = (char *)kmap(bh_result->b_page); + + p += offset; + memset(p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih(ih)) { + BUG(); + } + /* make sure we don't read more bytes than actually exist in + ** the file. This can happen in odd cases where i_size isn't + ** correct, and when direct item padding results in a few + ** extra bytes at the end of the direct item + */ + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) + break; + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { + chars = + inode->i_size - (le_ih_k_offset(ih) - 1) - + path.pos_in_item; + done = 1; + } else { + chars = ih_item_len(ih) - path.pos_in_item; + } + memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); + + if (done) + break; + + p += chars; + + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) + // we done, if read direct item is not the last item of + // node FIXME: we could try to check right delimiting key + // to see whether direct item continues in the right + // neighbor or rely on i_size + break; + + // update key to look for the next piece + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) + // i/o error most likely + break; + bh = get_last_bh(&path); + ih = get_ih(&path); + } while (1); + + flush_dcache_page(bh_result->b_page); + kunmap(bh_result->b_page); + + finished: + pathrelse(&path); + + if (result == IO_ERROR) + return -EIO; + + /* this buffer has valid data, but isn't valid for io. mapping it to + * block #0 tells the rest of reiserfs it just has a tail in it + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_uptodate(bh_result); + return 0; +} + +// this is called to create file map. So, _get_block_create_0 will not +// read direct item +static int reiserfs_bmap(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (!file_capable(inode, block)) + return -EFBIG; + + reiserfs_write_lock(inode->i_sb); + /* do not read the direct item */ + _get_block_create_0(inode, block, bh_result, 0); + reiserfs_write_unlock(inode->i_sb); + return 0; +} + +/* special version of get_block that is only used by grab_tail_page right +** now. It is sent to __block_write_begin, and when you try to get a +** block past the end of the file (or a block from a hole) it returns +** -ENOENT instead of a valid buffer. __block_write_begin expects to +** be able to do i/o on the buffers returned, unless an error value +** is also returned. +** +** So, this allows __block_write_begin to be used for reading a single block +** in a page. Where it does not produce a valid page for holes, or past the +** end of the file. This turns out to be exactly what we need for reading +** tails for conversion. 
+** +** The point of the wrapper is forcing a certain value for create, even +** though the VFS layer is calling this function with create==1. If you +** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, +** don't use this function. +*/ +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, + int create) +{ + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); +} + +/* This is special helper for reiserfs_get_block in case we are executing + direct_IO request. */ +static int reiserfs_get_blocks_direct_io(struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + int ret; + + bh_result->b_page = NULL; + + /* We set the b_size before reiserfs_get_block call since it is + referenced in convert_tail_for_hole() that may be called from + reiserfs_get_block() */ + bh_result->b_size = (1 << inode->i_blkbits); + + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE); + if (ret) + goto out; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + clear_buffer_mapped(bh_result); + ret = -EINVAL; + } + /* Possible unpacked tail. Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + int err; + + reiserfs_write_lock(inode->i_sb); + + err = reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + reiserfs_write_unlock(inode->i_sb); + + if (err < 0) + ret = err; + } + out: + return ret; +} + +/* +** helper function for when reiserfs_get_block is called for a hole +** but the file tail is still in a direct item +** bh_result is the buffer head for the hole +** tail_offset is the offset of the start of the tail in the file +** +** This calls prepare_write, which will start a new transaction +** you should not be in a transaction, or have any paths held when you +** call this. +*/ +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) +{ + unsigned long index; + unsigned long tail_end; + unsigned long tail_start; + struct page *tail_page; + struct page *hole_page = bh_result->b_page; + int retval = 0; + + if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); + tail_end = (tail_start | (bh_result->b_size - 1)) + 1; + + index = tail_offset >> PAGE_CACHE_SHIFT; + /* hole_page can be zero in case of direct_io, we are sure + that we cannot get here if we write with O_DIRECT into + tail page */ + if (!hole_page || index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index); + retval = -ENOMEM; + if (!tail_page) { + goto out; + } + } else { + tail_page = hole_page; + } + + /* we don't have to make sure the conversion did not happen while + ** we were locking the page because anyone that could convert + ** must first take i_mutex. + ** + ** We must fix the tail page for writing because it might have buffers + ** that are mapped, but have a block number of 0. This indicates tail + ** data that has been read directly into the page, and + ** __block_write_begin won't trigger a get_block in this case. 
+ */ + fix_tail_page_for_writing(tail_page); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); + if (retval) + goto unlock; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page); + + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); + + unlock: + if (tail_page != hole_page) { + unlock_page(tail_page); + page_cache_release(tail_page); + } + out: + return retval; +} + +static inline int _allocate_block(struct reiserfs_transaction_handle *th, + sector_t block, + struct inode *inode, + b_blocknr_t * allocated_block_nr, + struct treepath *path, int flags) +{ + BUG_ON(!th->t_trans_id); + +#ifdef REISERFS_PREALLOCATE + if (!(flags & GET_BLOCK_NO_IMUX)) { + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, + path, block); + } +#endif + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, + block); +} + +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int repeat, retval = 0; + b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head *bh, *unbh = NULL; + struct item_head *ih, tmp_ih; + __le32 *item; + int done; + int fs_gen; + int lock_depth; + struct reiserfs_transaction_handle *th = NULL; + /* space reserved in transaction batch: + . 3 balancings in direct->indirect conversion + . 1 block involved into reiserfs_update_sd() + XXX in practically impossible worst case direct2indirect() + can incur (much) more than 3 balancings. + quota update for user, group */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 1 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + int version; + int dangle = 1; + loff_t new_offset = + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; + + lock_depth = reiserfs_write_lock_once(inode->i_sb); + version = get_inode_item_key_version(inode); + + if (!file_capable(inode, block)) { + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return -EFBIG; + } + + /* if !create, we aren't changing the FS, so we don't need to + ** log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0(inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return ret; + } + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; + + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size < i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size < i_block_size(inode))) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { + start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } + reiserfs_update_inode_transaction(inode); + } + research: + + retval = search_for_position_by_key(inode->i_sb, &key, &path); 
+ if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (allocation_needed + (retval, allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + if (!th) { + pathrelse(&path); + goto start_trans; + } + + repeat = + _allocate_block(th, block, inode, &allocated_block_nr, + &path, create); + + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { + /* restart the transaction to give the journal a chance to free + ** some blocks. releases the path, so we have to go back to + ** research if we succeed on the second try + */ + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + repeat = + _allocate_block(th, block, inode, + &allocated_block_nr, NULL, create); + + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { + goto research; + } + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; + goto failure; + } + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found(retval, ih)) { + b_blocknr_t unfm_ptr; + /* 'block'-th block is in the file already (there is + corresponding cell in some indirect item). But it may be + zero unformatted node pointer (hole) */ + unfm_ptr = get_block_num(item, pos_in_item); + if (unfm_ptr == 0) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + set_buffer_new(bh_result); + if (buffer_dirty(bh_result) + && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); + put_block_num(item, pos_in_item, allocated_block_nr); + unfm_ptr = allocated_block_nr; + journal_mark_dirty(th, inode->i_sb, bh); + reiserfs_update_sd(th, inode); + } + set_block_dev_mapped(bh_result, unfm_ptr, inode); + pathrelse(&path); + retval = 0; + if (!dangle && th) + retval = reiserfs_end_persistent_transaction(th); + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + + /* the item was found, so new blocks were not added to the file + ** there is no need to make sure the inode is updated with this + ** transaction + */ + return retval; + } + + if (!th) { + pathrelse(&path); + goto start_trans; + } + + /* desired position is not found or is in the direct item. We have + to append file with holes up to 'block'-th block converting + direct items to indirect one if necessary */ + done = 0; + do { + if (is_statdata_le_ih(ih)) { + __le32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head(&tmp_ih, &key, version, 1, + TYPE_INDIRECT, UNFM_P_SIZE, + 0 /* free_space */ ); + + if (cpu_key_k_offset(&key) == 1) { + /* we are going to add 'block'-th block to the file. 
Use + allocated block for that */ + unp = cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } + tmp_key = key; // ;) + set_cpu_key_k_offset(&tmp_key, 1); + PATH_LAST_POSITION(&path)++; + + retval = + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, + inode, (char *)&unp); + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST + } + //mark_tail_converted (inode); + } else if (is_direct_le_ih(ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = + ((le_ih_k_offset(ih) - + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + if (tail_offset == cpu_key_k_offset(&key)) { + /* direct item we just found fits into block we have + to map. Convert it into unformatted node: use + bh_result for the conversion */ + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* we have to padd file tail stored in direct item(s) + up to block size and convert it to unformatted + node. FIXME: this should also get into page cache */ + + pathrelse(&path); + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + BUG_ON(!th->t_refcount); + if (th->t_refcount == 1) { + retval = + reiserfs_end_persistent_transaction + (th); + th = NULL; + if (retval) + goto failure; + } + + retval = + convert_tail_for_hole(inode, bh_result, + tail_offset); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, + "clm-6004", + "convert tail failed " + "inode %lu, error %d", + inode->i_ino, + retval); + if (allocated_block_nr) { + /* the bitmap, the super, and the stat data == 3 */ + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb, 3); + if (th) + reiserfs_free_block(th, + inode, + allocated_block_nr, + 1); + } + goto failure; + } + goto research; + } + retval = + direct2indirect(th, inode, &path, unbh, + tail_offset); + if (retval) { + reiserfs_unmap_buffer(unbh); + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + /* it is important the set_buffer_uptodate is done after + ** the direct2indirect. The buffer might contain valid + ** data newer than the data on disk (read by readpage, changed, + ** and then sent here by writepage). direct2indirect needs + ** to know if unbh was already up to date, so it can decide + ** if the data in unbh needs to be replaced with data from + ** the disk + */ + set_buffer_uptodate(unbh); + + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + */ + if (unbh->b_page) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + reiserfs_add_tail_list(inode, unbh); + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ + /* + * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
+ * It's still atomic, but it sets the page dirty too, + * which makes it eligible for writeback at any time by the + * VM (which was also the case with __mark_buffer_dirty()) + */ + mark_buffer_dirty(unbh); + } + } else { + /* append indirect item with holes if needed, when appending + pointer to 'block'-th block use block, which is already + allocated */ + struct cpu_key tmp_key; + unp_t unf_single = 0; // We use this in case we need to allocate only + // one block which is a fastpath + unp_t *un; + __u64 max_to_insert = + MAX_ITEM_LEN(inode->i_sb->s_blocksize) / + UNFM_P_SIZE; + __u64 blocks_needed; + + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, + "vs-804: invalid position for append"); + /* indirect item has to be appended, set up key of that position */ + make_cpu_key(&tmp_key, inode, + le_key_k_offset(version, + &(ih->ih_key)) + + op_bytes_number(ih, + inode->i_sb->s_blocksize), + //pos_in_item * inode->i_sb->s_blocksize, + TYPE_INDIRECT, 3); // key type is unimportant + + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), + "green-805: invalid offset"); + blocks_needed = + 1 + + ((cpu_key_k_offset(&key) - + cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> + s_blocksize_bits); + + if (blocks_needed == 1) { + un = &unf_single; + } else { + un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); + if (!un) { + un = &unf_single; + blocks_needed = 1; + max_to_insert = 0; + } + } + if (blocks_needed <= max_to_insert) { + /* we are going to add target block to the file. Use allocated + block for that */ + un[blocks_needed - 1] = + cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } else { + /* paste hole to the indirect item */ + /* If kmalloc failed, max_to_insert becomes zero and it means we + only have space for one block */ + blocks_needed = + max_to_insert ? max_to_insert : 1; + } + retval = + reiserfs_paste_into_item(th, &path, &tmp_key, inode, + (char *)un, + UNFM_P_SIZE * + blocks_needed); + + if (blocks_needed != 1) + kfree(un); + + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + if (!done) { + /* We need to mark new file size in case this function will be + interrupted/aborted later on. And we may do this only for + holes. */ + inode->i_size += + inode->i_sb->s_blocksize * blocks_needed; + } + } + + if (done == 1) + break; + + /* this loop could log more blocks than we had originally asked + ** for. So, we have to allow the transaction to end if it is + ** too big or too full. Update the inode so things are + ** consistent if we crash before the function returns + ** + ** release the path so that anybody waiting on the path before + ** ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + } + /* + * inserting indirect pointers for a hole can take a + * long time. reschedule if needed and also release the write + * lock for others. 
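+ * The tree is searched again right below, so dropping the write
+ * lock here cannot leave a stale path in use.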
+ */ + if (need_resched()) { + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + schedule(); + lock_depth = reiserfs_write_lock_once(inode->i_sb); + } + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "vs-825", + "%K should not be found", &key); + retval = -EEXIST; + if (allocated_block_nr) + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + pathrelse(&path); + goto failure; + } + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + } while (1); + + retval = 0; + + failure: + if (th && (!dangle || (retval && !th->t_trans_id))) { + int err; + if (th->t_trans_id) + reiserfs_update_sd(th, inode); + err = reiserfs_end_persistent_transaction(th); + if (err) + retval = err; + } + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_check_path(&path); + return retval; +} + +static int +reiserfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); +} + +/* Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size; + + /* End of file is also in full block with indirect reference, so round + ** up to the next block. + ** + ** there is just no way to know if the tail is actually packed + ** on the file, so we have to assume it isn't. When we pack the + ** tail, we add 4 bytes to pretend there really is an unformatted + ** node pointer + */ + bytes = + ((inode->i_size + + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + + sd_size; + return bytes; +} + +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + + (loff_t) (real_space_diff(inode, sd_size)); + } + return ((loff_t) real_space_diff(inode, sd_size)) + + (((loff_t) blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode); + loff_t real_space = real_space_diff(inode, sd_size); + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t) 511; + } + + /* files from before the quota patch might i_blocks such that + ** bytes < real_space. Deal with that here to prevent it from + ** going negative. + */ + if (bytes < real_space) + return 0; + return (bytes - real_space) >> 9; +} + +// +// BAD: new directories have stat data of new type and all other items +// of old type. 
Version stored in the inode says about body items, so +// in update_stat_data we can not rely on inode, but have to check +// item version directly +// + +// called by read_locked_inode +static void init_inode(struct inode *inode, struct treepath *path) +{ + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER(path); + ih = PATH_PITEM_HEAD(path); + + copy_key(INODE_PKEY(inode), &(ih->ih_key)); + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + reiserfs_init_xattr_rwsem(inode); + + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = + (struct stat_data_v1 *)B_I_PITEM(bh, ih); + unsigned long blocks; + + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + inode->i_nlink = sd_v1_nlink(sd); + inode->i_uid = sd_v1_uid(sd); + inode->i_gid = sd_v1_gid(sd); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = sd_v1_blocks(sd); + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + if (inode->i_blocks > blocks) { + // there was a bug in <=3.5.23 when i_blocks could take negative + // values. Starting from 3.5.17 this value could even be stored in + // stat data. For such files we set i_blocks based on file + // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be + // only updated if file's inode will ever change + inode->i_blocks = blocks; + } + + rdev = sd_v1_rdev(sd); + REISERFS_I(inode)->i_first_direct_byte = + sd_v1_first_direct_byte(sd); + /* an early bug in the quota code can give us an odd number for the + ** block count. This is incorrect, fix it here. + */ + if (inode->i_blocks & 1) { + inode->i_blocks++; + } + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); + /* nopack is initially zero for v1 objects. 
For v2 objects, + nopack is initialised from sd_attrs */ + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } else { + // new stat data found, but object may have old items + // (directories and symlinks) + struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + + inode->i_mode = sd_v2_mode(sd); + inode->i_nlink = sd_v2_nlink(sd); + inode->i_uid = sd_v2_uid(sd); + inode->i_size = sd_v2_size(sd); + inode->i_gid = sd_v2_gid(sd); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_blocks = sd_v2_blocks(sd); + rdev = sd_v2_rdev(sd); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_generation = + le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + else + inode->i_generation = sd_v2_generation(sd); + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + REISERFS_I(inode)->i_first_direct_byte = 0; + set_inode_sd_version(inode, STAT_DATA_V2); + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); + /* read persistent inode attributes from sd and initialise + generic inode flags from them */ + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + } +} + +// update new stat data with inode fields +static void inode2sd(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data *sd_v2 = (struct stat_data *)sd; + __u16 flags; + + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); + set_sd_v2_uid(sd_v2, inode->i_uid); + set_sd_v2_size(sd_v2, size); + set_sd_v2_gid(sd_v2, inode->i_gid); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); + else + set_sd_v2_generation(sd_v2, inode->i_generation); + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, &flags); + set_sd_v2_attrs(sd_v2, flags); +} + +// used to copy inode's fields to old stat data +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; + + set_sd_v1_mode(sd_v1, inode->i_mode); + set_sd_v1_uid(sd_v1, inode->i_uid); + set_sd_v1_gid(sd_v1, inode->i_gid); + set_sd_v1_nlink(sd_v1, inode->i_nlink); + set_sd_v1_size(sd_v1, size); + set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); + set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + 
set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); + else + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); + + // Sigh. i_first_direct_byte is back + set_sd_v1_first_direct_byte(sd_v1, + REISERFS_I(inode)->i_first_direct_byte); +} + +/* NOTE, you must prepare the buffer head before sending it here, +** and then log it after the call +*/ +static void update_stat_data(struct treepath *path, struct inode *inode, + loff_t size) +{ + struct buffer_head *bh; + struct item_head *ih; + + bh = PATH_PLAST_BUFFER(path); + ih = PATH_PITEM_HEAD(path); + + if (!is_statdata_le_ih(ih)) + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", + INODE_PKEY(inode), ih); + + if (stat_data_v1(ih)) { + // path points to old stat data + inode2sd_v1(B_I_PITEM(bh, ih), inode, size); + } else { + inode2sd(B_I_PITEM(bh, ih), inode, size); + } + + return; +} + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size) +{ + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh; + int fs_gen; + struct item_head *ih, tmp_ih; + int retval; + + BUG_ON(!th->t_trans_id); + + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant + + for (;;) { + int pos; + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13050", + "i/o failure occurred trying to " + "update %K stat data", &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION(&path); + pathrelse(&path); + if (inode->i_nlink == 0) { + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ + return; + } + reiserfs_warning(inode->i_sb, "vs-13060", + "stat data of object %k (nlink == %d) " + "not found (pos %d)", + INODE_PKEY(inode), inode->i_nlink, + pos); + reiserfs_check_path(&path); + return; + } + + /* sigh, prepare_for_journal might schedule. When it schedules the + ** FS might change. We have to detect that, and loop back to the + ** search if the stat data item has moved + */ + bh = get_last_bh(&path); + ih = get_ih(&path); + copy_item_head(&tmp_ih, ih); + fs_gen = get_generation(inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + continue; /* Stat_data item has been moved after scheduling. */ + } + break; + } + update_stat_data(&path, inode, size); + journal_mark_dirty(th, th->t_super, bh); + pathrelse(&path); + return; +} + +/* reiserfs_read_locked_inode is called to read the inode off disk, and it +** does a make_bad_inode when things go wrong. But, we need to make sure +** and clear the key in the private portion of the inode, otherwise a +** corresponding iput might try to delete whatever object the inode last +** represented. 
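+** Zeroing the key also means a later lookup or delete keyed off this
+** bad inode should not match any real object in the tree.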
+*/ +static void reiserfs_make_bad_inode(struct inode *inode) +{ + memset(INODE_PKEY(inode), 0, KEY_SIZE); + make_bad_inode(inode); +} + +// +// initially this function was derived from minix or ext2's analog and +// evolved as the prototype did +// + +int reiserfs_init_locked_inode(struct inode *inode, void *p) +{ + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); + return 0; +} + +/* looks for stat data in the tree, and fills up the fields of in-core + inode stat data fields */ +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args) +{ + INITIALIZE_PATH(path_to_sd); + struct cpu_key key; + unsigned long dirino; + int retval; + + dirino = args->dirid; + + /* set version 1, version 2 could be used too, because stat data + key is the same in both versions */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.k_offset = 0; + key.on_disk_key.k_type = 0; + + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13070", + "i/o failure occurred trying to find " + "stat data of %K", &key); + reiserfs_make_bad_inode(inode); + return; + } + if (retval != ITEM_FOUND) { + /* a stale NFS handle can trigger this without it being an error */ + pathrelse(&path_to_sd); + reiserfs_make_bad_inode(inode); + inode->i_nlink = 0; + return; + } + + init_inode(inode, &path_to_sd); + + /* It is possible that knfsd is trying to access inode of a file + that is being removed from the disk by some other thread. As we + update sd on unlink all that is required is to check for nlink + here. This bug was first found by Sizif when debugging + SquidNG/Butterfly, forgotten, and found again after Philippe + Gramoulle reproduced it. + + More logical fix would require changes in fs/inode.c:iput() to + remove inode from hash-table _after_ fs cleaned disk stuff up and + in iget() to return NULL if I_FREEING inode is found in + hash-table. */ + /* Currently there is one place where it's ok to meet inode with + nlink==0: processing of open-unlinked and half-truncated files + during mount (fs/reiserfs/super.c:finish_unfinished()). */ + if ((inode->i_nlink == 0) && + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { + reiserfs_warning(inode->i_sb, "vs-13075", + "dead inode read from disk %K. " + "This is likely to be race with knfsd. Ignore", + &key); + reiserfs_make_bad_inode(inode); + } + + reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ + +} + +/** + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). + * + * @inode: inode from hash table to check + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. + * + * This function is called by iget5_locked() to distinguish reiserfs inodes + * having the same inode numbers. Such inodes can only exist due to some + * error condition. One of them should be bad. Inodes with identical + * inode numbers (objectids) are distinguished by parent directory ids. 
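+ * The check below therefore compares both i_ino (the objectid) and the
+ * parent directory id stored in the in-core copy of the on-disk key.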
+ * + */ +int reiserfs_find_actor(struct inode *inode, void *opaque) +{ + struct reiserfs_iget_args *args; + + args = opaque; + /* args is already in CPU order */ + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); +} + +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) +{ + struct inode *inode; + struct reiserfs_iget_args args; + + args.objectid = key->on_disk_key.k_objectid; + args.dirid = key->on_disk_key.k_dir_id; + reiserfs_write_unlock(s); + inode = iget5_locked(s, key->on_disk_key.k_objectid, + reiserfs_find_actor, reiserfs_init_locked_inode, + (void *)(&args)); + reiserfs_write_lock(s); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { + /* either due to i/o error or a stale NFS handle */ + iput(inode); + inode = NULL; + } + return inode; +} + +static struct dentry *reiserfs_get_dentry(struct super_block *sb, + u32 objectid, u32 dir_id, u32 generation) + +{ + struct cpu_key key; + struct inode *inode; + + key.on_disk_key.k_objectid = objectid; + key.on_disk_key.k_dir_id = dir_id; + reiserfs_write_lock(sb); + inode = reiserfs_iget(sb, &key); + if (inode && !IS_ERR(inode) && generation != 0 && + generation != inode->i_generation) { + iput(inode); + inode = NULL; + } + reiserfs_write_unlock(sb); + + return d_obtain_alias(inode); +} + +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + /* fhtype happens to reflect the number of u32s encoded. + * due to a bug in earlier code, fhtype might indicate there + * are more u32s then actually fitted. + * so if fhtype seems to be more than len, reduce fhtype. + * Valid types are: + * 2 - objectid + dir_id - legacy support + * 3 - objectid + dir_id + generation + * 4 - objectid + dir_id + objectid and dirid of parent - legacy + * 5 - objectid + dir_id + generation + objectid and dirid of parent + * 6 - as above plus generation of directory + * 6 does not fit in NFSv2 handles + */ + if (fh_type > fh_len) { + if (fh_type != 6 || fh_len != 5) + reiserfs_warning(sb, "reiserfs-13077", + "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fh_type, fh_len); + fh_type = 5; + } + + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); +} + +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type < 4) + return NULL; + + return reiserfs_get_dentry(sb, + (fh_type >= 5) ? fid->raw[3] : fid->raw[2], + (fh_type >= 5) ? fid->raw[4] : fid->raw[3], + (fh_type == 6) ? fid->raw[5] : 0); +} + +int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, + int need_parent) +{ + struct inode *inode = dentry->d_inode; + int maxlen = *lenp; + + if (need_parent && (maxlen < 5)) { + *lenp = 5; + return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } + + data[0] = inode->i_ino; + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + data[2] = inode->i_generation; + *lenp = 3; + /* no room for directory info? 
return what we've stored so far */ + if (maxlen < 5 || !need_parent) + return 3; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + data[3] = inode->i_ino; + data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + *lenp = 5; + if (maxlen >= 6) { + data[5] = inode->i_generation; + *lenp = 6; + } + spin_unlock(&dentry->d_lock); + return *lenp; +} + +/* looks for stat data, then copies fields to it, marks the buffer + containing stat data as dirty */ +/* reiserfs inodes are never really dirty, since the dirty inode call +** always logs them. This call allows the VFS inode marking routines +** to properly mark inodes for datasync and such, but only actually +** does something when called for a synchronous update. +*/ +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct reiserfs_transaction_handle th; + int jbegin_count = 1; + + if (inode->i_sb->s_flags & MS_RDONLY) + return -EROFS; + /* memory pressure can sometimes initiate write_inode calls with sync == 1, + ** these cases are just when the system needs ram, not when the + ** inode needs to reach disk for safety, and they can safely be + ** ignored because the altered inode has already been logged. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { + reiserfs_write_lock(inode->i_sb); + if (!journal_begin(&th, inode->i_sb, jbegin_count)) { + reiserfs_update_sd(&th, inode); + journal_end_sync(&th, inode->i_sb, jbegin_count); + } + reiserfs_write_unlock(inode->i_sb); + } + return 0; +} + +/* stat data of new object is inserted already, this inserts the item + containing "." and ".." entries */ +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, struct treepath *path, + struct inode *dir) +{ + struct super_block *sb = th->t_super; + char empty_dir[EMPTY_DIR_SIZE]; + char *body = empty_dir; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, + TYPE_DIRENTRY, 3 /*key length */ ); + + /* compose item head for new item. Directories consist of items of + old type (ITEM_VERSION_1). 
Do not set key (second arg is 0), it + is done by reiserfs_new_inode */ + if (old_format_only(sb)) { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } else { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new directory"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13070", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is empty directory item */ + return reiserfs_insert_item(th, path, &key, ih, inode, body); +} + +/* stat data of object has been inserted, this inserts the item + containing the body of symlink */ +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ + struct item_head *ih, + struct treepath *path, const char *symname, + int item_len) +{ + struct super_block *sb = th->t_super; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, + le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3 /*key length */ ); + + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, + 0 /*free_space */ ); + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new symlink"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13080", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is body of symlink */ + return reiserfs_insert_item(th, path, &key, ih, inode, symname); +} + +/* inserts the stat data into the tree, and then calls + reiserfs_new_directory (to insert ".", ".." item if new object is + directory) or reiserfs_new_symlink (to insert symlink body if new + object is symlink) or nothing (if new object is regular file) + + NOTE! uid and gid must already be set in the inode. If we return + non-zero due to an error, we have to drop the quota previously allocated + for the fresh inode. This can only be done outside a transaction, so + if we return non-zero, we also end the transaction. 
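+   In the error paths below dquot_free_inode is therefore issued while the
+   handle is still open, and dquot_drop only after journal_end.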
*/ +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, int mode, const char *symname, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, + strlen (symname) for symlinks) */ + loff_t i_size, struct dentry *dentry, + struct inode *inode, + struct reiserfs_security_handle *security) +{ + struct super_block *sb; + struct reiserfs_iget_args args; + INITIALIZE_PATH(path_to_key); + struct cpu_key key; + struct item_head ih; + struct stat_data sd; + int retval; + int err; + + BUG_ON(!th->t_trans_id); + + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto out_end_trans; + if (!dir->i_nlink) { + err = -EPERM; + goto out_bad_inode; + } + + sb = dir->i_sb; + + /* item head of new item */ + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); + if (!ih.ih_key.k_objectid) { + err = -ENOMEM; + goto out_bad_inode; + } + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); + if (insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args) < 0) { + err = -EINVAL; + goto out_bad_inode; + } + if (old_format_only(sb)) + /* not a perfect generation count, as object ids can be reused, but + ** this is as good as reiserfs can do right now. + ** note that the private part of inode isn't filled in yet, we have + ** to use the directory. + */ + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); + else +#if defined( USE_INODE_GENERATION_COUNTER ) + inode->i_generation = + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); +#else + inode->i_generation = ++event; +#endif + + /* fill stat data */ + inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + + /* uid and gid must already be set by the caller for quota init */ + + /* symlink cannot be immutable or append only, right? */ + if (S_ISLNK(inode->i_mode)) + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_size = i_size; + inode->i_blocks = 0; + inode->i_bytes = 0; + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : + U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_attrs = + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); + reiserfs_init_xattr_rwsem(inode); + + /* key to search for correct place for new stat data */ + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, + TYPE_STAT_DATA, 3 /*key length */ ); + + /* find proper place for inserting of stat data */ + retval = search_item(sb, &key, &path_to_key); + if (retval == IO_ERROR) { + err = -EIO; + goto out_bad_inode; + } + if (retval == ITEM_FOUND) { + pathrelse(&path_to_key); + err = -EEXIST; + goto out_bad_inode; + } + if (old_format_only(sb)) { + if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { + pathrelse(&path_to_key); + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ + err = -EINVAL; + goto out_bad_inode; + } + inode2sd_v1(&sd, inode, inode->i_size); + } else { + inode2sd(&sd, inode, inode->i_size); + } + // store in in-core inode the key of stat data and version all + // object items will have (directory items will have old offset + // format, other new objects will consist of new items) + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + if (old_format_only(sb)) + set_inode_sd_version(inode, STAT_DATA_V1); + else + set_inode_sd_version(inode, STAT_DATA_V2); + + /* insert the stat data into the tree */ +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (REISERFS_I(dir)->new_packing_locality) + th->displace_new_blocks = 1; +#endif + retval = + reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, + (char *)(&sd)); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + goto out_bad_inode; + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (!th->displace_new_blocks) + REISERFS_I(dir)->new_packing_locality = 0; +#endif + if (S_ISDIR(mode)) { + /* insert item with "." and ".." 
*/ + retval = + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only(sb)) + i_size = ROUND_UP(i_size); + retval = + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, + i_size); + } + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + + if (reiserfs_posixacl(inode->i_sb)) { + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning(inode->i_sb, "jdm-13090", + "ACLs aren't enabled in the fs, " + "but vfs thinks they are!"); + } else if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + + if (security->name) { + retval = reiserfs_security_write(th, inode, security); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + retval = journal_end(th, th->t_super, + th->t_blocks_allocated); + if (retval) + err = retval; + goto out_inserted_sd; + } + } + + reiserfs_update_sd(th, inode); + reiserfs_check_path(&path_to_key); + + return 0; + +/* it looks like you can easily compress these two goto targets into + * one. Keeping it like this doesn't actually hurt anything, and they + * are place holders for what the quota code actually needs. + */ + out_bad_inode: + /* Invalidate the object, nothing was inserted yet */ + INODE_PKEY(inode)->k_objectid = 0; + + /* Quota change must be inside a transaction for journaling */ + dquot_free_inode(inode); + + out_end_trans: + journal_end(th, th->t_super, th->t_blocks_allocated); + /* Drop can be outside and it needs more credits so it's better to have it outside */ + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + + out_inserted_sd: + inode->i_nlink = 0; + th->t_trans_id = 0; /* so the caller can't use this handle later */ + unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ + iput(inode); + return err; +} + +/* +** finds the tail page in the page cache, +** reads the last block in. +** +** On success, page_result is set to a locked, pinned page, and bh_result +** is set to an up to date buffer for the last block in the file. returns 0. +** +** tail conversion is not done, so bh_result might not be valid for writing +** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before +** trying to write the block. +** +** on failure, nonzero is returned, page_result and bh_result are untouched. +*/ +static int grab_tail_page(struct inode *inode, + struct page **page_result, + struct buffer_head **bh_result) +{ + + /* we want the page with the last byte in the file, + ** not the page that will hold the next byte for appending + */ + unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long pos = 0; + unsigned long start = 0; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int error; + + /* we know that we are only called with inode->i_size > 0. + ** we also know that a file tail can never be as big as a block + ** If i_size % blocksize == 0, our file is currently block aligned + ** and it won't need converting or zeroing after a truncate. 
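+ ** When the size is block aligned we return -ENOENT below, and the
+ ** caller treats that as 'no tail page to zero'.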
+ */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT; + } + page = grab_cache_page(inode->i_mapping, index); + error = -ENOMEM; + if (!page) { + goto out; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize; + + error = __block_write_begin(page, start, offset - start, + reiserfs_get_block_create_0); + if (error) + goto unlock; + + head = page_buffers(page); + bh = head; + do { + if (pos >= start) { + break; + } + bh = bh->b_this_page; + pos += blocksize; + } while (bh != head); + + if (!buffer_uptodate(bh)) { + /* note, this should never happen, prepare_write should + ** be taking care of this for us. If the buffer isn't up to date, + ** I've screwed up the code to find the buffer, or the code to + ** call prepare_write + */ + reiserfs_error(inode->i_sb, "clm-6000", + "error reading block %lu", bh->b_blocknr); + error = -EIO; + goto unlock; + } + *bh_result = bh; + *page_result = page; + + out: + return error; + + unlock: + unlock_page(page); + page_cache_release(page); + return error; +} + +/* +** vfs version of truncate file. Must NOT be called with +** a transaction already started. +** +** some code taken from block_truncate_page +*/ +int reiserfs_truncate_file(struct inode *inode, int update_timestamps) +{ + struct reiserfs_transaction_handle th; + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned length; + struct page *page = NULL; + int error; + struct buffer_head *bh = NULL; + int err2; + int lock_depth; + + lock_depth = reiserfs_write_lock_once(inode->i_sb); + + if (inode->i_size > 0) { + error = grab_tail_page(inode, &page, &bh); + if (error) { + // -ENOENT means we truncated past the end of the file, + // and get_block_create_0 could not find a block to read in, + // which is ok. + if (error != -ENOENT) + reiserfs_error(inode->i_sb, "clm-6001", + "grab_tail_page failed %d", + error); + page = NULL; + bh = NULL; + } + } + + /* so, if page != NULL, we have a buffer head for the offset at + ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + ** then we have an unformatted node. Otherwise, we have a direct item, + ** and no zeroing is required on disk. We zero after the truncate, + ** because the truncate might pack the item anyway + ** (it will unmap bh if it packs). + */ + /* it is enough to reserve space in transaction for 2 balancings: + one for "save" link adding and another for the first + cut_from_item. 
1 is for update_sd */ + error = journal_begin(&th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + reiserfs_update_inode_transaction(inode); + if (update_timestamps) + /* we are doing real truncate: if the system crashes before the last + transaction of truncating gets committed - on reboot the file + either appears truncated properly or not truncated at all */ + add_save_link(&th, inode, 1); + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); + error = + journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + + /* check reiserfs_do_truncate after ending the transaction */ + if (err2) { + error = err2; + goto out; + } + + if (update_timestamps) { + error = remove_save_link(inode, 1 /* truncate */); + if (error) + goto out; + } + + if (page) { + length = offset & (blocksize - 1); + /* if we are not on a block boundary */ + if (length) { + length = blocksize - length; + zero_user(page, offset, length); + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + mark_buffer_dirty(bh); + } + } + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + + return 0; + out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + + return error; +} + +static int map_block_for_writepage(struct inode *inode, + struct buffer_head *bh_result, + unsigned long block) +{ + struct reiserfs_transaction_handle th; + int fs_gen; + struct item_head tmp_ih; + struct item_head *ih; + struct buffer_head *bh; + __le32 *item; + struct cpu_key key; + INITIALIZE_PATH(path); + int pos_in_item; + int jbegin_count = JOURNAL_PER_BALANCE_CNT; + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; + int retval; + int use_get_block = 0; + int bytes_copied = 0; + int copy_size; + int trans_running = 0; + + /* catch places below that try to log something without starting a trans */ + th.t_trans_id = 0; + + if (!buffer_uptodate(bh_result)) { + return -EIO; + } + + kmap(bh_result->b_page); + start_over: + reiserfs_write_lock(inode->i_sb); + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); + + research: + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out; + } + + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning(inode->i_sb, "clm-6002", + "bytes_copied %d", bytes_copied); + } + if (!get_block_num(item, pos_in_item)) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out; + } + set_block_dev_mapped(bh_result, + get_block_num(item, pos_in_item), inode); + } else if (is_direct_le_ih(ih)) { + char *p; + p = page_address(bh_result->b_page); + p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); + copy_size = ih_item_len(ih) - pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (!trans_running) { + /* vs-3050 is gone, no need to drop the path */ + retval = journal_begin(&th, inode->i_sb, jbegin_count); + if (retval) + goto out; + reiserfs_update_inode_transaction(inode); + trans_running = 1; + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + } + + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + if 
(fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + goto research; + } + + memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, + copy_size); + + journal_mark_dirty(&th, inode->i_sb, bh); + bytes_copied += copy_size; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? */ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + copy_size); + goto research; + } + } else { + reiserfs_warning(inode->i_sb, "clm-6003", + "bad item inode %lu", inode->i_ino); + retval = -EIO; + goto out; + } + retval = 0; + + out: + pathrelse(&path); + if (trans_running) { + int err = journal_end(&th, inode->i_sb, jbegin_count); + if (err) + retval = err; + trans_running = 0; + } + reiserfs_write_unlock(inode->i_sb); + + /* this is where we fill in holes in the file. */ + if (use_get_block) { + retval = reiserfs_get_block(inode, block, bh_result, + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX + | GET_BLOCK_NO_DANGLE); + if (!retval) { + if (!buffer_mapped(bh_result) + || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. */ + use_get_block = 0; + goto start_over; + } + } + } + kunmap(bh_result->b_page); + + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* we've copied data from the page into the direct item, so the + * buffer in the page is now clean, mark it to reflect that. + */ + lock_buffer(bh_result); + clear_buffer_dirty(bh_result); + unlock_buffer(bh_result); + } + return retval; +} + +/* + * mason@suse.com: updated in 2.5.54 to follow the same general io + * start/recovery path as __block_write_full_page, along with special + * code to handle reiserfs tails. + */ +static int reiserfs_write_full_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int error = 0; + unsigned long block; + sector_t last_block; + struct buffer_head *head, *bh; + int partial = 0; + int nr = 0; + int checked = PageChecked(page); + struct reiserfs_transaction_handle th; + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + th.t_trans_id = 0; + + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + /* The page dirty bit is cleared before writepage is called, which + * means we have to tell create_empty_buffers to make dirty buffers + * The page really should be up to date at this point, so tossing + * in the BH_Uptodate is just a sanity check. 
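+ * That is why create_empty_buffers below is passed
+ * (1 << BH_Dirty) | (1 << BH_Uptodate).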
+ */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, s->s_blocksize, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } + head = page_buffers(page); + + /* last page in the file, zero out any contents past the + ** last byte in the file + */ + if (page->index >= end_index) { + unsigned last_offset; + + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + unlock_page(page); + return 0; + } + zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); + } + bh = head; + block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + /* first map all the buffers, logging any direct items we find */ + do { + if (block > last_block) { + /* + * This can happen when the block size is less than + * the page size. The corresponding bytes in the page + * were zero filled above + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((checked || buffer_dirty(bh)) && + (!buffer_mapped(bh) || (buffer_mapped(bh) + && bh->b_blocknr == + 0))) { + /* not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail; + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* + * we start the transaction after map_block_for_writepage, + * because it can create holes in the file (an unbounded operation). + * starting it here, we can make a reliable estimate for how many + * blocks we're going to log + */ + if (checked) { + ClearPageChecked(page); + reiserfs_write_lock(s); + error = journal_begin(&th, s, bh_per_page + 1); + if (error) { + reiserfs_write_unlock(s); + goto fail; + } + reiserfs_update_inode_transaction(inode); + } + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + if (checked) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + continue; + } + /* from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (checked) { + error = journal_end(&th, s, bh_per_page + 1); + reiserfs_write_unlock(s); + if (error) + goto fail; + } + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + /* + * since any buffer might be the only dirty buffer on the page, + * the first submit_bh can bring the page out of writeback. + * be careful with the buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + + error = 0; + done: + if (nr == 0) { + /* + * if this page only had a direct item, it is very possible for + * no io to be required without there being an error. 
Or, + * someone else could have locked them and sent them down the + * pipe without locking the page + */ + bh = head; + do { + if (!buffer_uptodate(bh)) { + partial = 1; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (!partial) + SetPageUptodate(page); + end_page_writeback(page); + } + return error; + + fail: + /* catches various errors, we need to make sure any valid dirty blocks + * get to the media. The page is currently locked and not marked for + * writeback + */ + ClearPageUptodate(page); + bh = head; + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * clear any dirty bits that might have come from getting + * attached to a dirty page + */ + clear_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while (bh != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; +} + +static int reiserfs_readpage(struct file *f, struct page *page) +{ + return block_read_full_page(page, reiserfs_get_block); +} + +static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + reiserfs_wait_on_write_block(inode->i_sb); + return reiserfs_write_full_page(page, wbc); +} + +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + +static int reiserfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode; + struct page *page; + pgoff_t index; + int ret; + int old_ref = 0; + + inode = mapping->host; + *fsdata = 0; + if (flags & AOP_FLAG_CONT_EXPAND && + (pos & (inode->i_sb->s_blocksize - 1)) == 0) { + pos ++; + *fsdata = (void *)(unsigned long)flags; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + reiserfs_wait_on_write_block(inode->i_sb); + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + ret = __block_write_begin(page, pos, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. 
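+ * Either way, t_refcount > old_ref below means there is still a
+ * reference to undo: our own nested one (old_ref != 0) or the
+ * persistent handle reiserfs_get_block started (old_ref == 0).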
+ */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + if (ret) { + unlock_page(page); + page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); + } + return ret; +} + +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) +{ + struct inode *inode = page->mapping->host; + int ret; + int old_ref = 0; + + reiserfs_write_unlock(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = __block_write_begin(page, from, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + return ret; + +} + +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) +{ + return generic_block_bmap(as, block, reiserfs_bmap); +} + +static int reiserfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th; + unsigned start; + int lock_depth = 0; + bool locked = false; + + if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) + pos ++; + + reiserfs_wait_on_write_block(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) + th = current->journal_info; + else + th = NULL; + + start = pos & (PAGE_CACHE_SIZE - 1); + if (unlikely(copied < len)) { + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start + copied, start + len); + } + flush_dcache_page(page); + + reiserfs_commit_page(inode, page, start, start + copied); + + /* generic_commit_write does this for us, but does not update the + ** transaction tracking stuff when the size changes. So, we have + ** to do the i_size updates here. 
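+ ** The journal_begin/journal_end pair below nests into any running
+ ** transaction and logs the new i_size via reiserfs_update_sd.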
+ */ + if (pos + copied > inode->i_size) { + struct reiserfs_transaction_handle myth; + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos + copied; + /* + * this will just nest into our transaction. It's important + * to use mark_inode_dirty so the inode gets pushed around on the + * dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + } + if (th) { + if (!locked) { + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; + } + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + + out: + if (locked) + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + + return ret == 0 ? copied : ret; + + journal_error: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + locked = false; + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + goto out; +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; + + reiserfs_write_unlock(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); + + /* generic_commit_write does this for us, but does not update the + ** transaction tracking stuff when the size changes. So, we have + ** to do the i_size updates here. + */ + if (pos > inode->i_size) { + struct reiserfs_transaction_handle myth; + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos; + /* + * this will just nest into our transaction. 
It's important + * to use mark_inode_dirty so the inode gets pushed around on the + * dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + } + if (th) { + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + + out: + return ret; + + journal_error: + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + + return ret; +} + +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (sd_attrs & REISERFS_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (sd_attrs & REISERFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + if (sd_attrs & REISERFS_NOTAIL_FL) + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } +} + +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (inode->i_flags & S_IMMUTABLE) + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; + if (inode->i_flags & S_SYNC) + *sd_attrs |= REISERFS_SYNC_FL; + else + *sd_attrs &= ~REISERFS_SYNC_FL; + if (inode->i_flags & S_NOATIME) + *sd_attrs |= REISERFS_NOATIME_FL; + else + *sd_attrs &= ~REISERFS_NOATIME_FL; + if (REISERFS_I(inode)->i_flags & i_nopack_mask) + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; + } +} + +/* decide if this buffer needs to stay around for data logging or ordered +** write purposes +*/ +static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) +{ + int ret = 1; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + + lock_buffer(bh); + spin_lock(&j->j_dirty_buffers_lock); + if (!buffer_mapped(bh)) { + goto free_jh; + } + /* the page is locked, and the only places that log a data buffer + * also lock the page. + */ + if (reiserfs_file_data_log(inode)) { + /* + * very conservative, leave the buffer pinned if + * anyone might need it. + */ + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + ret = 0; + } + } else if (buffer_dirty(bh)) { + struct reiserfs_journal_list *jl; + struct reiserfs_jh *jh = bh->b_private; + + /* why is this safe? + * reiserfs_setattr updates i_size in the on disk + * stat data before allowing vmtruncate to be called. + * + * If buffer was put onto the ordered list for this + * transaction, we know for sure either this transaction + * or an older one already has updated i_size on disk, + * and this ordered data won't be referenced in the file + * if we crash. 
+ * + * if the buffer was put onto the ordered list for an older + * transaction, we need to leave it around + */ + if (jh && (jl = jh->jl) + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) + ret = 0; + } + free_jh: + if (ret && bh->b_private) { + reiserfs_free_jh(bh); + } + spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); + return ret; +} + +/* clm -- taken from fs/buffer.c:block_invalidate_page */ +static void reiserfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + struct inode *inode = page->mapping->host; + unsigned int curr_off = 0; + int ret = 1; + + BUG_ON(!PageLocked(page)); + + if (offset == 0) + ClearPageChecked(page); + + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!offset && ret) { + ret = try_to_release_page(page, 0); + /* maybe should BUG_ON(!ret); - neilb */ + } + out: + return; +} + +static int reiserfs_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + if (reiserfs_file_data_log(inode)) { + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); + } + return __set_page_dirty_buffers(page); +} + +/* + * Returns 1 if the page's buffers were dropped. The page is locked. + * + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads + * in the buffers at page_buffers(page). + * + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. + */ +static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + struct inode *inode = page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + struct buffer_head *head; + struct buffer_head *bh; + int ret = 1; + + WARN_ON(PageChecked(page)); + spin_lock(&j->j_dirty_buffers_lock); + head = page_buffers(page); + bh = head; + do { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { + ret = 0; + break; + } + } + bh = bh->b_this_page; + } while (bh != head); + if (ret) + ret = try_to_free_buffers(page); + spin_unlock(&j->j_dirty_buffers_lock); + return ret; +} + +/* We thank Mingming Cao for helping us understand in great detail what + to do in this section of the code. */ +static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + reiserfs_get_blocks_direct_io, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + unsigned int ia_valid; + int depth; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + /* must be turned off for recursive notify_change calls */ + ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); + + depth = reiserfs_write_lock_once(inode->i_sb); + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + + if (attr->ia_valid & ATTR_SIZE) { + /* version 2 items will be caught by the s_maxbytes check + ** done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + error = -EFBIG; + goto out; + } + /* fill in hole pointers in the expanding truncate case. */ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand_simple(inode, attr->ia_size); + if (REISERFS_I(inode)->i_prealloc_count > 0) { + int err; + struct reiserfs_transaction_handle th; + /* we're changing at most 2 bitmaps, inode + super */ + err = journal_begin(&th, inode->i_sb, 4); + if (!err) { + reiserfs_discard_prealloc(&th, inode); + err = journal_end(&th, inode->i_sb, 4); + } + if (err) + error = err; + } + if (error) + goto out; + /* + * file size is changed, ctime and mtime are + * to be updated + */ + attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); + } + } + + if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + (get_inode_sd_version(inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + + 2; + + error = reiserfs_chown_xattrs(inode, attr); + + if (error) + return error; + + /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + error = journal_begin(&th, inode->i_sb, jbegin_count); + if (error) + goto out; + error = dquot_transfer(inode, attr); + if (error) { + journal_end(&th, inode->i_sb, jbegin_count); + goto out; + } + + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + mark_inode_dirty(inode); + error = journal_end(&th, inode->i_sb, jbegin_count); + if (error) + goto out; + } + + /* + * Relax the lock here, as it might truncate the + * inode pages and wait for inode pages locks. 
+ * To release such page lock, the owner needs the + * reiserfs lock + */ + reiserfs_write_unlock_once(inode->i_sb, depth); + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) + error = vmtruncate(inode, attr->ia_size); + + if (!error) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + depth = reiserfs_write_lock_once(inode->i_sb); + + if (!error && reiserfs_posixacl(inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod(inode); + } + + out: + reiserfs_write_unlock_once(inode->i_sb, depth); + + return error; +} + +const struct address_space_operations reiserfs_address_space_operations = { + .writepage = reiserfs_writepage, + .readpage = reiserfs_readpage, + .readpages = reiserfs_readpages, + .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, + .write_begin = reiserfs_write_begin, + .write_end = reiserfs_write_end, + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO, + .set_page_dirty = reiserfs_set_page_dirty, +}; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c new file mode 100644 index 00000000..4e153051 --- /dev/null +++ b/fs/reiserfs/ioctl.c @@ -0,0 +1,226 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * reiserfs_ioctl - handler for ioctl for inode + * supported commands: + * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect + * and prevent packing file (argument arg has to be non-zero) + * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION + * 3) That's all for a while ... + */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int flags; + int err = 0; + + reiserfs_write_lock(inode->i_sb); + + switch (cmd) { + case REISERFS_IOC_UNPACK: + if (S_ISREG(inode->i_mode)) { + if (arg) + err = reiserfs_unpack(inode, filp); + } else + err = -ENOTTY; + break; + /* + * following two cases are taken from fs/ext2/ioctl.c by Remy + * Card (card@masi.ibp.fr) + */ + case REISERFS_IOC_GETFLAGS: + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); + err = put_user(flags, (int __user *)arg); + break; + case REISERFS_IOC_SETFLAGS:{ + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + err = mnt_want_write(filp->f_path.mnt); + if (err) + break; + + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } + /* + * Is it quota file? 
Do not allow user to mess with it + */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + if (((flags ^ REISERFS_I(inode)-> + i_attrs) & (REISERFS_IMMUTABLE_FL | + REISERFS_APPEND_FL)) + && !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } + if ((flags & REISERFS_NOTAIL_FL) && + S_ISREG(inode->i_mode)) { + int result; + + result = reiserfs_unpack(inode, filp); + if (result) { + err = result; + goto setflags_out; + } + } + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write(filp->f_path.mnt); + break; + } + case REISERFS_IOC_GETVERSION: + err = put_user(inode->i_generation, (int __user *)arg); + break; + case REISERFS_IOC_SETVERSION: + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + break; + } + err = mnt_want_write(filp->f_path.mnt); + if (err) + break; + if (get_user(inode->i_generation, (int __user *)arg)) { + err = -EFAULT; + goto setversion_out; + } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + break; + default: + err = -ENOTTY; + } + + reiserfs_write_unlock(inode->i_sb); + + return err; +} + +#ifdef CONFIG_COMPAT +long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case REISERFS_IOC32_UNPACK: + cmd = REISERFS_IOC_UNPACK; + break; + case REISERFS_IOC32_GETFLAGS: + cmd = REISERFS_IOC_GETFLAGS; + break; + case REISERFS_IOC32_SETFLAGS: + cmd = REISERFS_IOC_SETFLAGS; + break; + case REISERFS_IOC32_GETVERSION: + cmd = REISERFS_IOC_GETVERSION; + break; + case REISERFS_IOC32_SETVERSION: + cmd = REISERFS_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); +/* +** reiserfs_unpack +** Function try to convert tail from direct item into indirect. +** It set up nopack attribute in the REISERFS_I(inode)->nopack +*/ +int reiserfs_unpack(struct inode *inode, struct file *filp) +{ + int retval = 0; + int depth; + int index; + struct page *page; + struct address_space *mapping; + unsigned long write_from; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (inode->i_size == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + return 0; + } + /* ioctl already done */ + if (REISERFS_I(inode)->i_flags & i_nopack_mask) { + return 0; + } + + depth = reiserfs_write_lock_once(inode->i_sb); + + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + + write_from = inode->i_size & (blocksize - 1); + /* if we are on a block boundary, we are already unpacked. */ + if (write_from == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + goto out; + } + + /* we unpack by finding the page with the tail, and calling + ** __reiserfs_write_begin on that page. This will force a + ** reiserfs_get_block to unpack the tail for us. 
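The page and offset come from plain arithmetic on i_size: the tail's page index is i_size >> PAGE_CACHE_SHIFT and its offset within the block is i_size & (blocksize - 1), where an offset of zero means the file already ends on a block boundary and there is nothing to unpack. A stand-alone check of that arithmetic, assuming 4 KB pages and a 4 KB block size purely for illustration:

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12		/* 4096-byte pages, an assumption for the demo */
#define EXAMPLE_BLOCKSIZE  4096UL	/* example block size */

int main(void)
{
	unsigned long long sizes[] = { 0, 1000, 4096, 9300 };

	for (int i = 0; i < 4; i++) {
		unsigned long long isize = sizes[i];
		unsigned long write_from = isize & (EXAMPLE_BLOCKSIZE - 1);
		unsigned long long index = isize >> EXAMPLE_PAGE_SHIFT;

		if (write_from == 0)
			printf("i_size=%llu: on a block boundary, already unpacked\n", isize);
		else
			printf("i_size=%llu: tail lives in page %llu at offset %lu\n",
			       isize, index, write_from);
	}
	return 0;
}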
+ */ + index = inode->i_size >> PAGE_CACHE_SHIFT; + mapping = inode->i_mapping; + page = grab_cache_page(mapping, index); + retval = -ENOMEM; + if (!page) { + goto out; + } + retval = __reiserfs_write_begin(page, write_from, 0); + if (retval) + goto out_unlock; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page); + retval = reiserfs_commit_write(NULL, page, write_from, write_from); + REISERFS_I(inode)->i_flags |= i_nopack_mask; + + out_unlock: + unlock_page(page); + page_cache_release(page); + + out: + mutex_unlock(&inode->i_mutex); + reiserfs_write_unlock_once(inode->i_sb, depth); + return retval; +} diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c new file mode 100644 index 00000000..72cb1cc5 --- /dev/null +++ b/fs/reiserfs/item_ops.c @@ -0,0 +1,756 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include + +// this contains item handlers for old item types: sd, direct, +// indirect, directory + +/* and where are the comments? how about saying where we can find an + explanation of each item handler method? -Hans */ + +////////////////////////////////////////////////////////////////////////////// +// stat data functions +// +static int sd_bytes_number(struct item_head *ih, int block_size) +{ + return 0; +} + +static void sd_decrement_key(struct cpu_key *key) +{ + key->on_disk_key.k_objectid--; + set_cpu_key_k_type(key, TYPE_ANY); + set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1)); +} + +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) +{ + return 0; +} + +static char *print_time(time_t t) +{ + static char timebuf[256]; + + sprintf(timebuf, "%ld", t); + return timebuf; +} + +static void sd_print_item(struct item_head *ih, char *item) +{ + printk("\tmode | size | nlinks | first direct | mtime\n"); + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = (struct stat_data_v1 *)item; + + printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + sd_v1_size(sd), sd_v1_nlink(sd), + sd_v1_first_direct_byte(sd), + print_time(sd_v1_mtime(sd))); + } else { + struct stat_data *sd = (struct stat_data *)item; + + printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), + sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + } +} + +static void sd_check_item(struct item_head *ih, char *item) +{ + // FIXME: type something here! +} + +static int sd_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_STAT_DATA; + //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? 
+ return 0; +} + +static int sd_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + BUG_ON(start_skip || end_skip); + return -1; +} + +static int sd_check_right(struct virtual_item *vi, int free) +{ + return -1; +} + +static int sd_part_size(struct virtual_item *vi, int first, int count) +{ + BUG_ON(count); + return 0; +} + +static int sd_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void sd_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16100", + "STATDATA, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations stat_data_ops = { + .bytes_number = sd_bytes_number, + .decrement_key = sd_decrement_key, + .is_left_mergeable = sd_is_left_mergeable, + .print_item = sd_print_item, + .check_item = sd_check_item, + + .create_vi = sd_create_vi, + .check_left = sd_check_left, + .check_right = sd_check_right, + .part_size = sd_part_size, + .unit_num = sd_unit_num, + .print_vi = sd_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// direct item functions +// +static int direct_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih); +} + +// FIXME: this should probably switch to indirect as well +static void direct_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direct_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); +} + +static void direct_print_item(struct item_head *ih, char *item) +{ + int j = 0; + +// return; + printk("\""); + while (j < ih_item_len(ih)) + printk("%c", item[j++]); + printk("\"\n"); +} + +static void direct_check_item(struct item_head *ih, char *item) +{ + // FIXME: type something here! +} + +static int direct_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_DIRECT; + //vi->vi_type |= VI_TYPE_DIRECT; + return 0; +} + +static int direct_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % 8; + return bytes ? 
: -1; +} + +static int direct_check_right(struct virtual_item *vi, int free) +{ + return direct_check_left(vi, free, 0, 0); +} + +static int direct_part_size(struct virtual_item *vi, int first, int count) +{ + return count; +} + +static int direct_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void direct_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16101", + "DIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations direct_ops = { + .bytes_number = direct_bytes_number, + .decrement_key = direct_decrement_key, + .is_left_mergeable = direct_is_left_mergeable, + .print_item = direct_print_item, + .check_item = direct_check_item, + + .create_vi = direct_create_vi, + .check_left = direct_check_left, + .check_right = direct_check_right, + .part_size = direct_part_size, + .unit_num = direct_unit_num, + .print_vi = direct_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// indirect item functions +// + +static int indirect_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); +} + +// decrease offset, if it becomes 0, change type to stat data +static void indirect_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +// if it is not first item of the body, then it is mergeable +static int indirect_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return (le_key_k_offset(version, key) != 1); +} + +// printing of indirect item +static void start_new_sequence(__u32 * start, int *len, __u32 new) +{ + *start = new; + *len = 1; +} + +static int sequence_finished(__u32 start, int *len, __u32 new) +{ + if (start == INT_MAX) + return 1; + + if (start == 0 && new == 0) { + (*len)++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len)++; + return 0; + } + return 1; +} + +static void print_sequence(__u32 start, int len) +{ + if (start == INT_MAX) + return; + + if (len == 1) + printk(" %d", start); + else + printk(" %d(%d)", start, len); +} + +static void indirect_print_item(struct item_head *ih, char *item) +{ + int j; + __le32 *unp; + __u32 prev = INT_MAX; + int num = 0; + + unp = (__le32 *) item; + + if (ih_item_len(ih) % UNFM_P_SIZE) + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); + + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); + for (j = 0; j < I_UNFM_NUM(ih); j++) { + if (sequence_finished(prev, &num, get_block_num(unp, j))) { + print_sequence(prev, num); + start_new_sequence(&prev, &num, get_block_num(unp, j)); + } + } + print_sequence(prev, num); + printk("]\n"); +} + +static void indirect_check_item(struct item_head *ih, char *item) +{ + // FIXME: type something here! +} + +static int indirect_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_INDIRECT; + //vi->vi_type |= VI_TYPE_INDIRECT; + return 0; +} + +static int indirect_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % UNFM_P_SIZE; + return bytes ? : -1; +} + +static int indirect_check_right(struct virtual_item *vi, int free) +{ + return indirect_check_left(vi, free, 0, 0); +} + +// return size in bytes of 'units' units. 
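indirect_print_item above compresses the list of unformatted block numbers into runs: contiguous blocks print as "start(len)", isolated blocks print alone, and zeroes (holes) coalesce too. The same run detection lifted into a small stand-alone program, with made-up block numbers and UINT_MAX standing in for the kernel's INT_MAX sentinel:

#include <limits.h>
#include <stdio.h>

static void start_new_sequence(unsigned *start, int *len, unsigned new_blk)
{
	*start = new_blk;
	*len = 1;
}

static int sequence_finished(unsigned start, int *len, unsigned new_blk)
{
	if (start == UINT_MAX)
		return 1;			/* nothing accumulated yet */
	if (start == 0 && new_blk == 0) {	/* holes coalesce as well */
		(*len)++;
		return 0;
	}
	if (start != 0 && start + *len == new_blk) {	/* contiguous run continues */
		(*len)++;
		return 0;
	}
	return 1;
}

static void print_sequence(unsigned start, int len)
{
	if (start == UINT_MAX)
		return;
	if (len == 1)
		printf(" %u", start);
	else
		printf(" %u(%d)", start, len);
}

int main(void)
{
	/* made-up block numbers: two runs, a two-block hole, one singleton */
	unsigned blocks[] = { 100, 101, 102, 0, 0, 200, 300, 301 };
	unsigned prev = UINT_MAX;
	int num = 0;

	printf("[");
	for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		if (sequence_finished(prev, &num, blocks[i])) {
			print_sequence(prev, num);
			start_new_sequence(&prev, &num, blocks[i]);
		}
	}
	print_sequence(prev, num);
	printf(" ]\n");			/* prints: [ 100(3) 0(2) 200 300(2) ] */
	return 0;
}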
If first == 0 - calculate from the head (left), otherwise - from tail (right) +static int indirect_part_size(struct virtual_item *vi, int first, int units) +{ + // unit of indirect item is byte (yet) + return units; +} + +static int indirect_unit_num(struct virtual_item *vi) +{ + // unit of indirect item is byte (yet) + return vi->vi_item_len - IH_SIZE; +} + +static void indirect_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16103", + "INDIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations indirect_ops = { + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, + .print_item = indirect_print_item, + .check_item = indirect_check_item, + + .create_vi = indirect_create_vi, + .check_left = indirect_check_left, + .check_right = indirect_check_right, + .part_size = indirect_part_size, + .unit_num = indirect_unit_num, + .print_vi = indirect_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// direntry functions +// + +static int direntry_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "vs-16090", + "bytes number is asked for direntry"); + return 0; +} + +static void direntry_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direntry_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; + +} + +static void direntry_print_item(struct item_head *ih, char *item) +{ + int i; + int namelen; + struct reiserfs_de_head *deh; + char *name; + static char namebuf[80]; + + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", + "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct reiserfs_de_head *)item; + + for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + namelen = + (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - + deh_location(deh); + name = item + deh_location(deh); + if (name[namelen - 1] == 0) + namelen = strlen(name); + namebuf[0] = '"'; + if (namelen > sizeof(namebuf) - 3) { + strncpy(namebuf + 1, name, sizeof(namebuf) - 3); + namebuf[sizeof(namebuf) - 2] = '"'; + namebuf[sizeof(namebuf) - 1] = 0; + } else { + memcpy(namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + i, namebuf, + deh_dir_id(deh), deh_objectid(deh), + GET_HASH_VALUE(deh_offset(deh)), + GET_GENERATION_NUMBER((deh_offset(deh))), + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE"); + } +} + +static void direntry_check_item(struct item_head *ih, char *item) +{ + int i; + struct reiserfs_de_head *deh; + + // FIXME: type something here! 
+ deh = (struct reiserfs_de_head *)item; + for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + ; + } +} + +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 + +/* + * function returns old entry number in directory item in real node + * using new entry number in virtual item in virtual node */ +static inline int old_entry_num(int is_affected, int virtual_entry_num, + int pos_in_item, int mode) +{ + if (mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; + + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; + + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + + if (mode == M_CUT) + return virtual_entry_num + 1; + + RFALSE(mode != M_PASTE || virtual_entry_num == 0, + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", + mode); + + return virtual_entry_num - 1; +} + +/* Create an array of sizes of directory entries for virtual + item. Return space used by an item. FIXME: no control over + consuming of space used by this item handler */ +static int direntry_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + int i, j; + int size = sizeof(struct direntry_uarea); + struct reiserfs_de_head *deh; + + vi->vi_index = TYPE_DIRENTRY; + + BUG_ON(!(vi->vi_ih) || !vi->vi_item); + + dir_u->flags = 0; + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count(vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i++) { + j = old_entry_num(is_affected, i, vn->vn_pos_in_item, + vn->vn_mode); + dir_u->entry_sizes[i] = + (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) - + deh_location(&(deh[j])) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof(short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; + +#ifdef CONFIG_REISERFS_CHECK + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected + && (vn->vn_mode == M_PASTE + || vn->vn_mode == M_CUT)) ? insert_size : 0)) { + reiserfs_panic(NULL, "vs-8025", "(mode==%c, " + "insert_size==%d), invalid length of " + "directory item", + vn->vn_mode, insert_size); + } + } +#endif + + return size; + +} + +// +// return number of entries which may fit into specified amount of +// free space, or -1 if free space is not enough even for 1 entry +// +static int direntry_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + + if (entries == dir_u->entry_count) { + reiserfs_panic(NULL, "item_ops-1", + "free space %d, entry_count %d", free, + dir_u->entry_count); + } + + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries < 2) + entries = 0; + + return entries ? 
: -1; +} + +static int direntry_check_right(struct virtual_item *vi, int free) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = dir_u->entry_count - 1; i >= 0; i--) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + BUG_ON(entries == dir_u->entry_count); + + /* "." and ".." can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ? : -1; +} + +/* sum of entry sizes between from-th and to-th entries including both edges */ +static int direntry_part_size(struct virtual_item *vi, int first, int count) +{ + int i, retval; + int from, to; + struct direntry_uarea *dir_u = vi->vi_uarea; + + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; + + for (i = from; i <= to; i++) + retval += dir_u->entry_sizes[i]; + + return retval; +} + +static int direntry_unit_num(struct virtual_item *vi) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + + return dir_u->entry_count; +} + +static void direntry_print_vi(struct virtual_item *vi) +{ + int i; + struct direntry_uarea *dir_u = vi->vi_uarea; + + reiserfs_warning(NULL, "reiserfs-16104", + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i++) + printk("%d ", dir_u->entry_sizes[i]); + printk("\n"); +} + +static struct item_operations direntry_ops = { + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, + .print_item = direntry_print_item, + .check_item = direntry_check_item, + + .create_vi = direntry_create_vi, + .check_left = direntry_check_left, + .check_right = direntry_check_right, + .part_size = direntry_part_size, + .unit_num = direntry_unit_num, + .print_vi = direntry_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// Error catching functions to catch errors caused by incorrect item types. +// +static int errcatch_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "green-16001", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_decrement_key(struct cpu_key *key) +{ + reiserfs_warning(NULL, "green-16002", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + reiserfs_warning(NULL, "green-16003", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16004", + "Invalid item type observed, run fsck ASAP"); +} + +static void errcatch_check_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16005", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + reiserfs_warning(NULL, "green-16006", + "Invalid item type observed, run fsck ASAP"); + return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where + // this operation is called from is of return type void. 
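Each item type supplies the same set of handlers through a struct of function pointers, with the errcatch_* handlers defined here trapping items whose type field is garbage, and dispatch happens uniformly through a table indexed by item type. A reduced sketch of that function-pointer-table pattern; the handlers, slot layout, and field set below are toy illustrations, not the kernel's actual item_operations:

#include <stdio.h>

struct item_operations {
	int (*bytes_number)(int item_len, int block_size);
	const char *name;
};

static int direct_bytes(int item_len, int block_size)   { return item_len; }
static int indirect_bytes(int item_len, int block_size) { return (item_len / 4) * block_size; }
static int errcatch_bytes(int item_len, int block_size)
{
	fprintf(stderr, "invalid item type observed, run fsck ASAP\n");
	return 0;
}

static const struct item_operations direct_ops   = { direct_bytes,   "direct"   };
static const struct item_operations indirect_ops = { indirect_bytes, "indirect" };
static const struct item_operations errcatch_ops = { errcatch_bytes, "errcatch" };

/* index plays the role of the on-disk item type in this toy */
static const struct item_operations *item_ops[4] = {
	&direct_ops, &indirect_ops, &errcatch_ops, &errcatch_ops,
};

int main(void)
{
	int type = 1;		/* pretend we read an indirect item of 16 bytes */
	printf("%s item spans %d bytes\n", item_ops[type]->name,
	       item_ops[type]->bytes_number(16, 4096));
	return 0;
}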
+} + +static int errcatch_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + reiserfs_warning(NULL, "green-16007", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_check_right(struct virtual_item *vi, int free) +{ + reiserfs_warning(NULL, "green-16008", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_part_size(struct virtual_item *vi, int first, int count) +{ + reiserfs_warning(NULL, "green-16009", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static int errcatch_unit_num(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16010", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16011", + "Invalid item type observed, run fsck ASAP"); +} + +static struct item_operations errcatch_ops = { + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, + errcatch_print_item, + errcatch_check_item, + + errcatch_create_vi, + errcatch_check_left, + errcatch_check_right, + errcatch_part_size, + errcatch_unit_num, + errcatch_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// +// +#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) +#error Item types must use disk-format assigned values. +#endif + +struct item_operations *item_ops[TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, + &direntry_ops, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ +}; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c new file mode 100644 index 00000000..c5e82ece --- /dev/null +++ b/fs/reiserfs/journal.c @@ -0,0 +1,4319 @@ +/* +** Write ahead logging implementation copyright Chris Mason 2000 +** +** The background commits make this code very interrelated, and +** overly complex. I need to rethink things a bit....The major players: +** +** journal_begin -- call with the number of blocks you expect to log. +** If the current transaction is too +** old, it will block until the current transaction is +** finished, and then start a new one. +** Usually, your transaction will get joined in with +** previous ones for speed. +** +** journal_join -- same as journal_begin, but won't block on the current +** transaction regardless of age. Don't ever call +** this. Ever. There are only two places it should be +** called from, and they are both inside this file. +** +** journal_mark_dirty -- adds blocks into this transaction. clears any flags +** that might make them get sent to disk +** and then marks them BH_JDirty. Puts the buffer head +** into the current transaction hash. +** +** journal_end -- if the current transaction is batchable, it does nothing +** otherwise, it could do an async/synchronous commit, or +** a full flush of all log and real blocks in the +** transaction. +** +** flush_old_commits -- if the current transaction is too old, it is ended and +** commit blocks are sent to disk. Forces commit blocks +** to disk for all backgrounded commits that have been +** around too long. 
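The calling convention described above is the one already used by the write_end path earlier in this patch: reserve a block count with journal_begin, make the change while marking the inode dirty inside the running handle, then journal_end with the same count, checking errors at both ends. A condensed sketch of that shape only; the helper name and the one-block estimate are illustrative, while the journal calls themselves are the entry points named in this comment and used earlier in the patch:

/* Sketch: a short metadata update under the journal, following the same
 * calling convention as reiserfs_write_end() above. Not standalone code.
 */
static int example_update_inode_size(struct inode *inode, loff_t new_size)
{
	struct reiserfs_transaction_handle th;
	int jblocks = 1;	/* blocks we expect to log (illustrative estimate) */
	int err;

	err = journal_begin(&th, inode->i_sb, jblocks);
	if (err)
		return err;

	inode->i_size = new_size;
	reiserfs_update_inode_transaction(inode);
	mark_inode_dirty(inode);	/* nests into the running transaction */
	reiserfs_update_sd(&th, inode);

	return journal_end(&th, inode->i_sb, jblocks);
}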
+** -- Note, if you call this as an immediate flush from +** from within kupdate, it will ignore the immediate flag +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) + +/* the number of mounted filesystems. This is used to decide when to +** start and kill the commit workqueue +*/ +static int reiserfs_mounted_fs_count; + +static struct workqueue_struct *commit_wq; + +#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit + structs at 4k */ +#define BUFNR 64 /*read ahead */ + +/* cnode stat bits. Move these into reiserfs_fs.h */ + +#define BLOCK_FREED 2 /* this block was freed, and can't be written. */ +#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ + +#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +#define BLOCK_DIRTIED 5 + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ + +/* flags for do_journal_end */ +#define FLUSH_ALL 1 /* flush commit and real blocks */ +#define COMMIT_NOW 2 /* end and commit this transaction */ +#define WAIT 4 /* wait for the log blocks to hit the disk */ + +static int do_journal_end(struct reiserfs_transaction_handle *, + struct super_block *, unsigned long nblocks, + int flags); +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int can_dirty(struct reiserfs_journal_cnode *cn); +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks); +static int release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(struct work_struct *work); +static void queue_log_writer(struct super_block *s); + +/* values for join in do_journal_begin_r */ +enum { + JBEGIN_REG = 0, /* regular journal begin */ + JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ + JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ +}; + +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, + unsigned long nblocks, int join); + +static void init_journal_hash(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + memset(journal->j_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); +} + +/* +** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to +** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for +** more details. 
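The JOURNAL_LIST_ENTRY and JOURNAL_WORK_ENTRY macros above are thin wrappers around list_entry(), the container_of idiom: given a pointer to the embedded list_head, subtract its offset within the containing struct to get back to the journal list. A stand-alone illustration of that pointer arithmetic with a toy struct:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* same idea as the kernel's list_entry()/container_of() */
#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct journal_list_toy {
	unsigned trans_id;
	struct list_head j_list;	/* embedded linkage */
};

int main(void)
{
	struct journal_list_toy jl = { .trans_id = 42 };
	struct list_head *node = &jl.j_list;	/* what a list walk hands us */

	struct journal_list_toy *back =
		list_entry(node, struct journal_list_toy, j_list);
	printf("recovered trans_id = %u\n", back->trans_id);	/* prints 42 */
	return 0;
}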
+*/ +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) +{ + if (bh) { + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + } + return 0; +} + +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block + *sb) +{ + struct reiserfs_bitmap_node *bn; + static int id; + + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); + if (!bn) { + return NULL; + } + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); + if (!bn->data) { + kfree(bn); + return NULL; + } + bn->id = id++; + INIT_LIST_HEAD(&bn->list); + return bn; +} + +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = journal->j_bitmap_nodes.next; + + journal->j_used_bitmap_nodes++; + repeat: + + if (entry != &journal->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list); + list_del(entry); + memset(bn->data, 0, sb->s_blocksize); + journal->j_free_bitmap_nodes--; + return bn; + } + bn = allocate_bitmap_node(sb); + if (!bn) { + yield(); + goto repeat; + } + return bn; +} +static inline void free_bitmap_node(struct super_block *sb, + struct reiserfs_bitmap_node *bn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + journal->j_used_bitmap_nodes--; + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + kfree(bn->data); + kfree(bn); + } else { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } +} + +static void allocate_bitmap_nodes(struct super_block *sb) +{ + int i; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { + bn = allocate_bitmap_node(sb); + if (bn) { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } else { + break; /* this is ok, we'll try again when more are needed */ + } + } +} + +static int set_bit_in_list_bitmap(struct super_block *sb, + b_blocknr_t block, + struct reiserfs_list_bitmap *jb) +{ + unsigned int bmap_nr = block / (sb->s_blocksize << 3); + unsigned int bit_nr = block % (sb->s_blocksize << 3); + + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(sb); + } + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); + return 0; +} + +static void cleanup_bitmap_list(struct super_block *sb, + struct reiserfs_list_bitmap *jb) +{ + int i; + if (jb->bitmaps == NULL) + return; + + for (i = 0; i < reiserfs_bmap_count(sb); i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(sb, jb->bitmaps[i]); + jb->bitmaps[i] = NULL; + } + } +} + +/* +** only call this on FS unmount. +*/ +static int free_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array) +{ + int i; + struct reiserfs_list_bitmap *jb; + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + cleanup_bitmap_list(sb, jb); + vfree(jb->bitmaps); + jb->bitmaps = NULL; + } + return 0; +} + +static int free_bitmap_nodes(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct list_head *next = journal->j_bitmap_nodes.next; + struct reiserfs_bitmap_node *bn; + + while (next != &journal->j_bitmap_nodes) { + bn = list_entry(next, struct reiserfs_bitmap_node, list); + list_del(next); + kfree(bn->data); + kfree(bn); + next = journal->j_bitmap_nodes.next; + journal->j_free_bitmap_nodes--; + } + + return 0; +} + +/* +** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. 
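set_bit_in_list_bitmap above turns a block number into a (bitmap node, bit) pair: each node covers blocksize * 8 blocks, so bmap_nr is the block divided by that span and bit_nr is the remainder. A quick stand-alone check of that arithmetic, assuming a 4 KB block size (so 32768 blocks per bitmap node) for the example:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;			/* bytes per block (example value) */
	unsigned long bits_per_node = blocksize << 3;	/* 32768 blocks per bitmap node */
	unsigned long blocks[] = { 0, 32767, 32768, 100000 };

	for (int i = 0; i < 4; i++) {
		unsigned long block = blocks[i];
		unsigned int bmap_nr = block / bits_per_node;
		unsigned int bit_nr  = block % bits_per_node;
		printf("block %6lu maps to bitmap node %u, bit %u\n",
		       block, bmap_nr, bit_nr);
	}
	return 0;
}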
+** jb_array is the array to be filled in. +*/ +int reiserfs_allocate_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array, + unsigned int bmap_nr) +{ + int i; + int failed = 0; + struct reiserfs_list_bitmap *jb; + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); + + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + jb->bitmaps = vmalloc(mem); + if (!jb->bitmaps) { + reiserfs_warning(sb, "clm-2000", "unable to " + "allocate bitmaps for journal lists"); + failed = 1; + break; + } + memset(jb->bitmaps, 0, mem); + } + if (failed) { + free_list_bitmaps(sb, jb_array); + return -1; + } + return 0; +} + +/* +** find an available list bitmap. If you can't find one, flush a commit list +** and try again +*/ +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, + struct reiserfs_journal_list + *jl) +{ + int i, j; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_list_bitmap *jb = NULL; + + for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { + i = journal->j_list_bitmap_index; + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; + jb = journal->j_list_bitmap + i; + if (journal->j_list_bitmap[i].journal_list) { + flush_commit_list(sb, + journal->j_list_bitmap[i]. + journal_list, 1); + if (!journal->j_list_bitmap[i].journal_list) { + break; + } + } else { + break; + } + } + if (jb->journal_list) { /* double check to make sure if flushed correctly */ + return NULL; + } + jb->journal_list = jl; + return jb; +} + +/* +** allocates a new chunk of X nodes, and links them all together as a list. +** Uses the cnode->next and cnode->prev pointers +** returns NULL on failure +*/ +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) +{ + struct reiserfs_journal_cnode *head; + int i; + if (num_cnodes <= 0) { + return NULL; + } + head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + if (!head) { + return NULL; + } + memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head[0].prev = NULL; + head[0].next = head + 1; + for (i = 1; i < num_cnodes; i++) { + head[i].prev = head + (i - 1); + head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ + } + head[num_cnodes - 1].next = NULL; + return head; +} + +/* +** pulls a cnode off the free list, or returns NULL on failure +*/ +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "get_cnode"); + + if (journal->j_cnode_free <= 0) { + return NULL; + } + journal->j_cnode_used++; + journal->j_cnode_free--; + cn = journal->j_cnode_free_list; + if (!cn) { + return cn; + } + if (cn->next) { + cn->next->prev = NULL; + } + journal->j_cnode_free_list = cn->next; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); + return cn; +} + +/* +** returns a cnode to the free list +*/ +static void free_cnode(struct super_block *sb, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "free_cnode"); + + journal->j_cnode_used--; + journal->j_cnode_free++; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = journal->j_cnode_free_list; + if (journal->j_cnode_free_list) { + journal->j_cnode_free_list->prev = cn; + } + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ + journal->j_cnode_free_list = cn; 
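allocate_cnodes carves one allocated array into a doubly linked free list, and get_cnode/free_cnode then simply pop from and push to its head. The same free-list discipline in miniature, with malloc standing in for vmalloc and no per-superblock journal:

#include <stdlib.h>
#include <stdio.h>

struct cnode { struct cnode *prev, *next; int payload; };

struct pool { struct cnode *free_list; struct cnode *mem; };

static int pool_init(struct pool *p, int n)
{
	p->mem = calloc(n, sizeof(*p->mem));
	if (!p->mem)
		return -1;
	for (int i = 0; i < n; i++) {		/* link the array into one list */
		p->mem[i].prev = i ? &p->mem[i - 1] : NULL;
		p->mem[i].next = (i + 1 < n) ? &p->mem[i + 1] : NULL;
	}
	p->free_list = &p->mem[0];
	return 0;
}

static struct cnode *get_cnode(struct pool *p)		/* pop the head */
{
	struct cnode *cn = p->free_list;
	if (!cn)
		return NULL;
	if (cn->next)
		cn->next->prev = NULL;
	p->free_list = cn->next;
	cn->prev = cn->next = NULL;
	return cn;
}

static void free_cnode(struct pool *p, struct cnode *cn)	/* push onto the head */
{
	cn->next = p->free_list;
	if (p->free_list)
		p->free_list->prev = cn;
	cn->prev = NULL;
	p->free_list = cn;
}

int main(void)
{
	struct pool p;
	if (pool_init(&p, 4))
		return 1;
	struct cnode *a = get_cnode(&p), *b = get_cnode(&p);
	free_cnode(&p, a);
	printf("free-list head is %s the node we just returned\n",
	       p.free_list == a ? "again" : "not");
	free_cnode(&p, b);
	free(p.mem);
	return 0;
}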
+} + +static void clear_prepared_bits(struct buffer_head *bh) +{ + clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); +} + +/* return a cnode with same dev, block number and size in table, or null if not found */ +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct + super_block + *sb, + struct + reiserfs_journal_cnode + **table, + long bl) +{ + struct reiserfs_journal_cnode *cn; + cn = journal_hash(table, sb, bl); + while (cn) { + if (cn->blocknr == bl && cn->sb == sb) + return cn; + cn = cn->hnext; + } + return (struct reiserfs_journal_cnode *)0; +} + +/* +** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated +** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever +** being overwritten by a replay after crashing. +** +** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting +** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make +** sure you never write the block without logging it. +** +** next_zero_bit is a suggestion about the next block to try for find_forward. +** when bl is rejected because it is set in a journal list bitmap, we search +** for the next zero bit in the bitmap that rejected bl. Then, we return that +** through next_zero_bit for find_forward to try. +** +** Just because we return something in next_zero_bit does not mean we won't +** reject it on the next call to reiserfs_in_journal +** +*/ +int reiserfs_in_journal(struct super_block *sb, + unsigned int bmap_nr, int bit_nr, int search_all, + b_blocknr_t * next_zero_bit) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn; + struct reiserfs_list_bitmap *jb; + int i; + unsigned long bl; + + *next_zero_bit = 0; /* always start this at zero. */ + + PROC_INFO_INC(sb, journal.in_journal); + /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. + ** if we crash before the transaction that freed it commits, this transaction won't + ** have committed either, and the block will never be written + */ + if (search_all) { + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + PROC_INFO_INC(sb, journal.in_journal_bitmap); + jb = journal->j_list_bitmap + i; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, + (unsigned long *)jb->bitmaps[bmap_nr]-> + data)) { + *next_zero_bit = + find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]-> + data), + sb->s_blocksize << 3, + bit_nr + 1); + return 1; + } + } + } + + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; + /* is it in any old transactions? */ + if (search_all + && (cn = + get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { + return 1; + } + + /* is it in the current transaction. 
This should never happen */ + if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) { + BUG(); + return 1; + } + + PROC_INFO_INC(sb, journal.in_journal_reusable); + /* safe for reuse */ + return 0; +} + +/* insert cn into table +*/ +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal_cnode *cn_orig; + + cn_orig = journal_hash(table, cn->sb, cn->blocknr); + cn->hnext = cn_orig; + cn->hprev = NULL; + if (cn_orig) { + cn_orig->hprev = cn; + } + journal_hash(table, cn->sb, cn->blocknr) = cn; +} + +/* lock the current transaction */ +static inline void lock_journal(struct super_block *sb) +{ + PROC_INFO_INC(sb, journal.lock_journal); + + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); +} + +/* unlock the current transaction */ +static inline void unlock_journal(struct super_block *sb) +{ + mutex_unlock(&SB_JOURNAL(sb)->j_mutex); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", + jl->j_trans_id, jl->j_refcount); + } + if (--jl->j_refcount == 0) + kfree(jl); +} + +/* +** this used to be much more involved, and I'm keeping it just in case things get ugly again. +** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a +** transaction. +*/ +static void cleanup_freed_for_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; + if (jb) { + cleanup_bitmap_list(sb, jb); + } + jl->j_list_bitmap->journal_list = NULL; + jl->j_list_bitmap = NULL; +} + +static int journal_list_still_alive(struct super_block *s, + unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct list_head *entry = &journal->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +/* + * If page->mapping was null, we failed to truncate this page for + * some reason. Most likely because it was truncated after being + * logged via data=journal. + * + * This does a check to see if the buffer belongs to one of these + * lost pages before doing the final put_bh. If page->mapping was + * null, it tries to free buffers on the page, which should make the + * final page_cache_release drop the page from the lru. 
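get_journal_hash_dev and insert_journal_hash above maintain an open hash table keyed by block number: each bucket is a doubly linked chain threaded through hnext/hprev, new cnodes are pushed at the chain head, and lookups walk the chain comparing blocknr (and the superblock, which the toy below drops). A reduced version of that bucket handling with a tiny fixed-size table:

#include <stdio.h>

#define HASH_SIZE 8			/* tiny table, just for the example */

struct cnode { unsigned long blocknr; struct cnode *hnext, *hprev; };

static struct cnode *table[HASH_SIZE];

static unsigned bucket(unsigned long blocknr) { return blocknr % HASH_SIZE; }

static void insert_hash(struct cnode *cn)
{
	struct cnode *head = table[bucket(cn->blocknr)];
	cn->hnext = head;
	cn->hprev = NULL;
	if (head)
		head->hprev = cn;
	table[bucket(cn->blocknr)] = cn;
}

static struct cnode *lookup_hash(unsigned long blocknr)
{
	for (struct cnode *cn = table[bucket(blocknr)]; cn; cn = cn->hnext)
		if (cn->blocknr == blocknr)
			return cn;
	return NULL;
}

int main(void)
{
	struct cnode a = { .blocknr = 17 }, b = { .blocknr = 25 };	/* collide in bucket 1 */
	insert_hash(&a);
	insert_hash(&b);
	printf("block 17 %sfound, block 99 %sfound\n",
	       lookup_hash(17) ? "" : "not ", lookup_hash(99) ? "" : "not ");
	return 0;
}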
+ */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + if (!page->mapping && trylock_page(page)) { + page_cache_get(page); + put_bh(bh); + if (!page->mapping) + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + } else { + put_bh(bh); + } +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning(NULL, "clm-2084", + "pinned buffer %lu:%s sent to disk", + bh->b_blocknr, bdevname(bh->b_bdev, b)); + } + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + + unlock_buffer(bh); + release_buffer_page(bh); +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); +} + +static void submit_logged_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_buffer_io_sync; + clear_buffer_journal_new(bh); + clear_buffer_dirty(bh); + if (!test_clear_buffer_journal_test(bh)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +static void submit_ordered_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) +{ + int i; + get_fs_excl(); + for (i = 0; i < chunk->nr; i++) { + submit_logged_buffer(chunk->bh[i]); + } + chunk->nr = 0; + put_fs_excl(); +} + +static void write_ordered_chunk(struct buffer_chunk *chunk) +{ + int i; + get_fs_excl(); + for (i = 0; i < chunk->nr; i++) { + submit_ordered_buffer(chunk->bh[i]); + } + chunk->nr = 0; + put_fs_excl(); +} + +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, + spinlock_t * lock, void (fn) (struct buffer_chunk *)) +{ + int ret = 0; + BUG_ON(chunk->nr >= CHUNK_SIZE); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) + spin_unlock(lock); + fn(chunk); + if (lock) + spin_lock(lock); + } + return ret; +} + +static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); +static struct reiserfs_jh *alloc_jh(void) +{ + struct reiserfs_jh *jh; + while (1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); + } +} + +/* + * we want to free the jh when the buffer has been written + * and waited on + */ +void reiserfs_free_jh(struct buffer_head *bh) +{ + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } +} + +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, + int tail) +{ + struct reiserfs_jh *jh; + + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { + no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + BUG_ON(bh->b_private); + jh->bh = bh; + bh->b_private = jh; + } + jh->jl = j->j_current_jl; + if (tail) + 
list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; +} + +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +} +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +} + +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) +static int write_ordered_buffers(spinlock_t * lock, + struct reiserfs_journal *j, + struct reiserfs_journal_list *jl, + struct list_head *list) +{ + struct buffer_head *bh; + struct reiserfs_jh *jh; + int ret = j->j_errno; + struct buffer_chunk chunk; + struct list_head tmp; + INIT_LIST_HEAD(&tmp); + + chunk.nr = 0; + spin_lock(lock); + while (!list_empty(list)) { + jh = JH_ENTRY(list->next); + bh = jh->bh; + get_bh(bh); + if (!trylock_buffer(bh)) { + if (!buffer_dirty(bh)) { + list_move(&jh->list, &tmp); + goto loop_next; + } + spin_unlock(lock); + if (chunk.nr) + write_ordered_chunk(&chunk); + wait_on_buffer(bh); + cond_resched(); + spin_lock(lock); + goto loop_next; + } + /* in theory, dirty non-uptodate buffers should never get here, + * but the upper layer io error paths still have a few quirks. + * Handle them here as gracefully as we can + */ + if (!buffer_uptodate(bh) && buffer_dirty(bh)) { + clear_buffer_dirty(bh); + ret = -EIO; + } + if (buffer_dirty(bh)) { + list_move(&jh->list, &tmp); + add_to_chunk(&chunk, bh, lock, write_ordered_chunk); + } else { + reiserfs_free_jh(bh); + unlock_buffer(bh); + } + loop_next: + put_bh(bh); + cond_resched_lock(lock); + } + if (chunk.nr) { + spin_unlock(lock); + write_ordered_chunk(&chunk); + spin_lock(lock); + } + while (!list_empty(&tmp)) { + jh = JH_ENTRY(tmp.prev); + bh = jh->bh; + get_bh(bh); + reiserfs_free_jh(bh); + + if (buffer_locked(bh)) { + spin_unlock(lock); + wait_on_buffer(bh); + spin_lock(lock); + } + if (!buffer_uptodate(bh)) { + ret = -EIO; + } + /* ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a valid + * journal head from an older transaction. If someone else sets + * our buffer dirty after we write it in the first loop, and + * then someone truncates the page away, nobody will ever write + * the buffer. We're safe if we write the page one last time + * after freeing the journal header. 
+		 */
+		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
+			spin_unlock(lock);
+			ll_rw_block(WRITE, 1, &bh);
+			spin_lock(lock);
+		}
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	spin_unlock(lock);
+	return ret;
+}
+
+static int flush_older_commits(struct super_block *s,
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal_list *first_jl;
+	struct list_head *entry;
+	unsigned int trans_id = jl->j_trans_id;
+	unsigned int other_trans_id;
+	unsigned int first_trans_id;
+
+      find_first:
+	/*
+	 * first we walk backwards to find the oldest uncommitted transaction
+	 */
+	first_jl = jl;
+	entry = jl->j_list.prev;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		if (entry == &journal->j_journal_list ||
+		    atomic_read(&other_jl->j_older_commits_done))
+			break;
+
+		first_jl = other_jl;
+		entry = other_jl->j_list.prev;
+	}
+
+	/* if we didn't find any older uncommitted transactions, return now */
+	if (first_jl == jl) {
+		return 0;
+	}
+
+	first_trans_id = first_jl->j_trans_id;
+
+	entry = &first_jl->j_list;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		other_trans_id = other_jl->j_trans_id;
+
+		if (other_trans_id < trans_id) {
+			if (atomic_read(&other_jl->j_commit_left) != 0) {
+				flush_commit_list(s, other_jl, 0);
+
+				/* list we were called with is gone, return */
+				if (!journal_list_still_alive(s, trans_id))
+					return 1;
+
+				/* the one we just flushed is gone, this means all
+				 * older lists are also gone, so first_jl is no longer
+				 * valid either. Go back to the beginning.
+				 */
+				if (!journal_list_still_alive
+				    (s, other_trans_id)) {
+					goto find_first;
+				}
+			}
+			entry = entry->next;
+			if (entry == &journal->j_journal_list)
+				return 0;
+		} else {
+			return 0;
+		}
+	}
+	return 0;
+}
+
+static int reiserfs_async_progress_wait(struct super_block *s)
+{
+	struct reiserfs_journal *j = SB_JOURNAL(s);
+
+	if (atomic_read(&j->j_async_throttle)) {
+		reiserfs_write_unlock(s);
+		congestion_wait(BLK_RW_ASYNC, HZ / 10);
+		reiserfs_write_lock(s);
+	}
+
+	return 0;
+}
+
+/*
+** if this journal list still has commit blocks unflushed, send them to disk.
+**
+** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
+** Before the commit block can be written, every other log block must be safely on disk
+**
+*/
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall)
+{
+	int i;
+	b_blocknr_t bn;
+	struct buffer_head *tbh = NULL;
+	unsigned int trans_id = jl->j_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int retval = 0;
+	int write_len;
+
+	reiserfs_check_lock_depth(s, "flush_commit_list");
+
+	if (atomic_read(&jl->j_older_commits_done)) {
+		return 0;
+	}
+
+	get_fs_excl();
+
+	/* before we can put our commit blocks on disk, we have to make sure everyone older than
+	 ** us is on disk too
+	 */
+	BUG_ON(jl->j_len <= 0);
+	BUG_ON(trans_id == journal->j_trans_id);
+
+	get_journal_list(jl);
+	if (flushall) {
+		if (flush_older_commits(s, jl) == 1) {
+			/* list disappeared during flush_older_commits.
return */ + goto put_jl; + } + } + + /* make sure nobody is trying to flush this one at the same time */ + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); + + if (!journal_list_still_alive(s, trans_id)) { + mutex_unlock(&jl->j_commit_mutex); + goto put_jl; + } + BUG_ON(jl->j_trans_id == 0); + + /* this commit is done, exit */ + if (atomic_read(&(jl->j_commit_left)) <= 0) { + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1); + } + mutex_unlock(&jl->j_commit_mutex); + goto put_jl; + } + + if (!list_empty(&jl->j_bh_list)) { + int ret; + + /* + * We might sleep in numerous places inside + * write_ordered_buffers. Relax the write lock. + */ + reiserfs_write_unlock(s); + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + if (ret < 0 && retval == 0) + retval = ret; + reiserfs_write_lock(s); + } + BUG_ON(!list_empty(&jl->j_bh_list)); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk. Try to write at least 256 + * log blocks. later on, we will only wait on blocks that correspond + * to this transaction, but while we're unplugging we might as well + * get a chunk of data on there. + */ + atomic_inc(&journal->j_async_throttle); + write_len = jl->j_len + 1; + if (write_len < 256) + write_len = 256; + for (i = 0 ; i < write_len ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % + SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + if (tbh) { + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); + ll_rw_block(WRITE, 1, &tbh); + reiserfs_write_lock(s); + } + put_bh(tbh) ; + } + } + atomic_dec(&journal->j_async_throttle); + + for (i = 0; i < (jl->j_len + 1); i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + + reiserfs_write_unlock(s); + wait_on_buffer(tbh); + reiserfs_write_lock(s); + // since we're using ll_rw_blk above, it might have skipped over + // a locked buffer. Double check here + // + /* redundant, sync_dirty_buffer() checks */ + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); + sync_dirty_buffer(tbh); + reiserfs_write_lock(s); + } + if (unlikely(!buffer_uptodate(tbh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-601", + "buffer write failed"); +#endif + retval = -EIO; + } + put_bh(tbh); /* once for journal_find_get_block */ + put_bh(tbh); /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)); + } + + BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); + + /* If there was a write error in the journal - we can't commit + * this transaction - it will be invalid and, if successful, + * will just end up propagating the write error out to + * the file system. */ + if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + reiserfs_write_unlock(s); + if (reiserfs_barrier_flush(s)) + __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(jl->j_commit_bh); + reiserfs_write_lock(s); + } + + /* If there was a write error in the journal - we can't commit this + * transaction - it will be invalid and, if successful, will just end + * up propagating the write error out to the filesystem. 
*/ + if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-615", "buffer write failed"); +#endif + retval = -EIO; + } + bforget(jl->j_commit_bh); + if (journal->j_last_commit_id != 0 && + (jl->j_trans_id - journal->j_last_commit_id) != 1) { + reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu", + journal->j_last_commit_id, jl->j_trans_id); + } + journal->j_last_commit_id = jl->j_trans_id; + + /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + cleanup_freed_for_journal_list(s, jl); + + retval = retval ? retval : journal->j_errno; + + /* mark the metadata dirty */ + if (!retval) + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)); + + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1); + } + mutex_unlock(&jl->j_commit_mutex); + put_jl: + put_journal_list(s, jl); + + if (retval) + reiserfs_abort(s, retval, "Journal write error in %s", + __func__); + put_fs_excl(); + return retval; +} + +/* +** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or +** returns NULL if it can't find anything +*/ +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct + reiserfs_journal_cnode + *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist; + } + cn = cn->hprev; + } + return NULL; +} + +static int newer_jl_done(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist && + atomic_read(&cn->jlist->j_commit_left) != 0) + return 0; + cn = cn->hprev; + } + return 1; +} + +static void remove_journal_hash(struct super_block *, + struct reiserfs_journal_cnode **, + struct reiserfs_journal_list *, unsigned long, + int); + +/* +** once all the real blocks have been flushed, it is safe to remove them from the +** journal list for this transaction. Aside from freeing the cnode, this also allows the +** block to be reallocated for data blocks if it had been deleted. +*/ +static void remove_all_from_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl, + int debug) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *last; + cn = jl->j_realblock; + + /* which is better, to lock once around the whole loop, or + ** to lock for each call to remove_journal_hash? + */ + while (cn) { + if (cn->blocknr != 0) { + if (debug) { + reiserfs_warning(sb, "reiserfs-2201", + "block %u, bh is %d, state %ld", + cn->blocknr, cn->bh ? 1 : 0, + cn->state); + } + cn->state = 0; + remove_journal_hash(sb, journal->j_list_hash_table, + jl, cn->blocknr, 1); + } + last = cn; + cn = cn->next; + free_cnode(sb, last); + } + jl->j_realblock = NULL; +} + +/* +** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. +** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start +** releasing blocks in this transaction for reuse as data blocks. 
+** called by flush_journal_list, before it calls remove_all_from_journal_list +** +*/ +static int _update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + if (reiserfs_is_journal_aborted(journal)) + return -EIO; + + if (trans_id >= journal->j_last_flush_trans_id) { + if (buffer_locked((journal->j_header_bh))) { + reiserfs_write_unlock(sb); + wait_on_buffer((journal->j_header_bh)); + reiserfs_write_lock(sb); + if (unlikely(!buffer_uptodate(journal->j_header_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(sb, "journal-699", + "buffer write failed"); +#endif + return -EIO; + } + } + journal->j_last_flush_trans_id = trans_id; + journal->j_first_unflushed_offset = offset; + jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> + b_data); + jh->j_last_flush_trans_id = cpu_to_le32(trans_id); + jh->j_first_unflushed_offset = cpu_to_le32(offset); + jh->j_mount_id = cpu_to_le32(journal->j_mount_id); + + set_buffer_dirty(journal->j_header_bh); + reiserfs_write_unlock(sb); + + if (reiserfs_barrier_flush(sb)) + __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(journal->j_header_bh); + + reiserfs_write_lock(sb); + if (!buffer_uptodate(journal->j_header_bh)) { + reiserfs_warning(sb, "journal-837", + "IO error during journal replay"); + return -EIO; + } + } + return 0; +} + +static int update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + return _update_journal_header_block(sb, offset, trans_id); +} + +/* +** flush any and all journal lists older than you are +** can only be called from flush_journal_list +*/ +static int flush_older_journal_lists(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned int trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra race + * protection is required. + */ + restart: + entry = journal->j_journal_list.next; + /* Did we wrap? */ + if (entry == &journal->j_journal_list) + return 0; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + BUG_ON(other_jl->j_refcount <= 0); + /* do not flush all */ + flush_journal_list(sb, other_jl, 0); + + /* other_jl is now deleted from the list */ + goto restart; + } + return 0; +} + +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + journal->j_num_work_lists--; + } +} + +/* flush a journal list, both commit and real blocks +** +** always set flushall to 1, unless you are calling from inside +** flush_journal_list +** +** IMPORTANT. This can only be called while there are no journal writers, +** and the journal is locked. 
That means it can only be called from +** do_journal_end, or by journal_release +*/ +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + struct reiserfs_journal_list *pjl; + struct reiserfs_journal_cnode *cn, *last; + int count; + int was_jwait = 0; + int was_dirty = 0; + struct buffer_head *saved_bh; + unsigned long j_len_saved = jl->j_len; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err = 0; + + BUG_ON(j_len_saved <= 0); + + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_warning(s, "clm-2048", "called with wcount %d", + atomic_read(&journal->j_wcount)); + } + BUG_ON(jl->j_trans_id == 0); + + /* if flushall == 0, the lock is already held */ + if (flushall) { + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); + } else if (mutex_trylock(&journal->j_flush_mutex)) { + BUG(); + } + + count = 0; + if (j_len_saved > journal->j_trans_max) { + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", + j_len_saved, jl->j_trans_id); + return 0; + } + + get_fs_excl(); + + /* if all the work is already done, get out of here */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return; + } + + /* start by putting the commit list on disk. This will also flush + ** the commit lists of any olders transactions + */ + flush_commit_list(s, jl, 1); + + if (!(jl->j_state & LIST_DIRTY) + && !reiserfs_is_journal_aborted(journal)) + BUG(); + + /* are we done now? */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return; + } + + /* loop through each cnode, see if we need to write it, + ** or wait on a more recent transaction, or just ignore it + */ + if (atomic_read(&(journal->j_wcount)) != 0) { + reiserfs_panic(s, "journal-844", "journal list is flushing, " + "wcount is not 0"); + } + cn = jl->j_realblock; + while (cn) { + was_jwait = 0; + was_dirty = 0; + saved_bh = NULL; + /* blocknr of 0 is no longer in the hash, ignore it */ + if (cn->blocknr == 0) { + goto free_cnode; + } + + /* This transaction failed commit. Don't write out to the disk */ + if (!(jl->j_state & LIST_DIRTY)) + goto free_cnode; + + pjl = find_newer_jl_for_cn(cn); + /* the order is important here. We check pjl to make sure we + ** don't clear BH_JDirty_wait if we aren't the one writing this + ** block to disk + */ + if (!pjl && cn->bh) { + saved_bh = cn->bh; + + /* we do this to make sure nobody releases the buffer while + ** we are working with it + */ + get_bh(saved_bh); + + if (buffer_journal_dirty(saved_bh)) { + BUG_ON(!can_dirty(cn)); + was_jwait = 1; + was_dirty = 1; + } else if (can_dirty(cn)) { + /* everything with !pjl && jwait should be writable */ + BUG(); + } + } + + /* if someone has this block in a newer transaction, just make + ** sure they are committed, and don't try writing it to disk + */ + if (pjl) { + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1); + goto free_cnode; + } + + /* bh == NULL when the block got to disk on its own, OR, + ** the block got freed in a future transaction + */ + if (saved_bh == NULL) { + goto free_cnode; + } + + /* this should never happen. kupdate_one_transaction has this list + ** locked while it works, so we should never see a buffer here that + ** is not marked JDirty_wait + */ + if ((!was_jwait) && !buffer_locked(saved_bh)) { + reiserfs_warning(s, "journal-813", + "BAD! 
buffer %llu %cdirty %cjwait, " + "not in a newer tranasction", + (unsigned long long)saved_bh-> + b_blocknr, was_dirty ? ' ' : '!', + was_jwait ? ' ' : '!'); + } + if (was_dirty) { + /* we inc again because saved_bh gets decremented at free_cnode */ + get_bh(saved_bh); + set_bit(BLOCK_NEEDS_FLUSH, &cn->state); + lock_buffer(saved_bh); + BUG_ON(cn->blocknr != saved_bh->b_blocknr); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh); + else + unlock_buffer(saved_bh); + count++; + } else { + reiserfs_warning(s, "clm-2082", + "Unable to flush buffer %llu in %s", + (unsigned long long)saved_bh-> + b_blocknr, __func__); + } + free_cnode: + last = cn; + cn = cn->next; + if (saved_bh) { + /* we incremented this to keep others from taking the buffer head away */ + put_bh(saved_bh); + if (atomic_read(&(saved_bh->b_count)) < 0) { + reiserfs_warning(s, "journal-945", + "saved_bh->b_count < 0"); + } + } + } + if (count > 0) { + cn = jl->j_realblock; + while (cn) { + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + if (!cn->bh) { + reiserfs_panic(s, "journal-1011", + "cn->bh is NULL"); + } + + reiserfs_write_unlock(s); + wait_on_buffer(cn->bh); + reiserfs_write_lock(s); + + if (!cn->bh) { + reiserfs_panic(s, "journal-1012", + "cn->bh is NULL"); + } + if (unlikely(!buffer_uptodate(cn->bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-949", + "buffer write failed"); +#endif + err = -EIO; + } + /* note, we must clear the JDirty_wait bit after the up to date + ** check, otherwise we race against our flushpage routine + */ + BUG_ON(!test_clear_buffer_journal_dirty + (cn->bh)); + + /* drop one ref for us */ + put_bh(cn->bh); + /* drop one ref for journal_mark_dirty */ + release_buffer_page(cn->bh); + } + cn = cn->next; + } + } + + if (err) + reiserfs_abort(s, -EIO, + "Write error while pushing transaction to disk in %s", + __func__); + flush_older_and_return: + + /* before we can update the journal header block, we _must_ flush all + ** real blocks from all older transactions to disk. This is because + ** once the header block is updated, this transaction will not be + ** replayed after a crash + */ + if (flushall) { + flush_older_journal_lists(s, jl); + } + + err = journal->j_errno; + /* before we can remove everything from the hash tables for this + ** transaction, we must make sure it can never be replayed + ** + ** since we are only called from do_journal_end, we know for sure there + ** are no allocations going on while we are flushing journal lists. 
So, + ** we only need to update the journal header block for the last list + ** being flushed + */ + if (!err && flushall) { + err = + update_journal_header_block(s, + (jl->j_start + jl->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(s), + jl->j_trans_id); + if (err) + reiserfs_abort(s, -EIO, + "Write error while updating journal header in %s", + __func__); + } + remove_all_from_journal_list(s, jl, 0); + list_del_init(&jl->j_list); + journal->j_num_lists--; + del_from_work_list(s, jl); + + if (journal->j_last_flush_id != 0 && + (jl->j_trans_id - journal->j_last_flush_id) != 1) { + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", + journal->j_last_flush_id, jl->j_trans_id); + } + journal->j_last_flush_id = jl->j_trans_id; + + /* not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ + jl->j_len = 0; + atomic_set(&(jl->j_nonzerolen), 0); + jl->j_start = 0; + jl->j_realblock = NULL; + jl->j_commit_bh = NULL; + jl->j_trans_id = 0; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + mutex_unlock(&journal->j_flush_mutex); + put_fs_excl(); + return err; +} + +static int test_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) + return 1; + + cn = jl->j_realblock; + while (cn) { + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && !newer_jl_done(cn)) + return 0; + next: + cn = cn->next; + cond_resched(); + } + return 0; +} + +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) +{ + struct reiserfs_journal_cnode *cn; + int ret = 0; + + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; + } + + cn = jl->j_realblock; + while (cn) { + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + buffer_journal_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } + next: + cn = cn->next; + cond_resched(); + } + return ret; +} + +/* used by flush_commit_list */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock; + while (cn) { + /* look for a more recent transaction that logged this + ** buffer. Only the most recent transaction with a buffer in + ** it is allowed to send that buffer to disk + */ + pjl = find_newer_jl_for_cn(cn); + if (!pjl && cn->blocknr && cn->bh + && buffer_journal_dirty(cn->bh)) { + BUG_ON(!can_dirty(cn)); + /* if the buffer is prepared, it will either be logged + * or restored. 
If restored, we need to make sure
+			 * it actually gets marked dirty
+			 */
+			clear_buffer_journal_new(cn->bh);
+			if (buffer_journal_prepared(cn->bh)) {
+				set_buffer_journal_restore_dirty(cn->bh);
+			} else {
+				set_buffer_journal_test(cn->bh);
+				mark_buffer_dirty(cn->bh);
+			}
+		}
+		cn = cn->next;
+	}
+	return ret;
+}
+
+static int kupdate_transactions(struct super_block *s,
+				struct reiserfs_journal_list *jl,
+				struct reiserfs_journal_list **next_jl,
+				unsigned int *next_trans_id,
+				int num_blocks, int num_trans)
+{
+	int ret = 0;
+	int written = 0;
+	int transactions_flushed = 0;
+	unsigned int orig_trans_id = jl->j_trans_id;
+	struct buffer_chunk chunk;
+	struct list_head *entry;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	chunk.nr = 0;
+
+	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
+	if (!journal_list_still_alive(s, orig_trans_id)) {
+		goto done;
+	}
+
+	/* we've got j_flush_mutex held, nobody is going to delete any
+	 * of these lists out from underneath us
+	 */
+	while ((num_trans && transactions_flushed < num_trans) ||
+	       (!num_trans && written < num_blocks)) {
+
+		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+		    atomic_read(&jl->j_commit_left)
+		    || !(jl->j_state & LIST_DIRTY)) {
+			del_from_work_list(s, jl);
+			break;
+		}
+		ret = write_one_transaction(s, jl, &chunk);
+
+		if (ret < 0)
+			goto done;
+		transactions_flushed++;
+		written += ret;
+		entry = jl->j_list.next;
+
+		/* did we wrap? */
+		if (entry == &journal->j_journal_list) {
+			break;
+		}
+		jl = JOURNAL_LIST_ENTRY(entry);
+
+		/* don't bother with older transactions */
+		if (jl->j_trans_id <= orig_trans_id)
+			break;
+	}
+	if (chunk.nr) {
+		write_chunk(&chunk);
+	}
+
+      done:
+	mutex_unlock(&journal->j_flush_mutex);
+	return ret;
+}
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions. These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
+**
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
+*/
+static int flush_used_journal_lists(struct super_block *s,
+				    struct reiserfs_journal_list *jl)
+{
+	unsigned long len = 0;
+	unsigned long cur_len;
+	int ret;
+	int i;
+	int limit = 256;
+	struct reiserfs_journal_list *tjl;
+	struct reiserfs_journal_list *flush_jl;
+	unsigned int trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+
+	flush_jl = tjl = jl;
+
+	/* in data logging mode, try harder to flush a lot of blocks */
+	if (reiserfs_data_log(s))
+		limit = 1024;
+	/* flush for 256 transactions or limit blocks, whichever comes first */
+	for (i = 0; i < 256 && len < limit; i++) {
+		if (atomic_read(&tjl->j_commit_left) ||
+		    tjl->j_trans_id < jl->j_trans_id) {
+			break;
+		}
+		cur_len = atomic_read(&tjl->j_nonzerolen);
+		if (cur_len > 0) {
+			tjl->j_state &= ~LIST_TOUCHED;
+		}
+		len += cur_len;
+		flush_jl = tjl;
+		if (tjl->j_list.next == &journal->j_journal_list)
+			break;
+		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+	}
+	/* try to find a group of blocks we can flush across all the
+	 ** transactions, but only bother if we've actually spanned
+	 ** across multiple lists
+	 */
+	if (flush_jl != jl) {
+		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+	}
+	flush_journal_list(s, flush_jl, 1);
+	return 0;
+}
+
+/*
+** removes any nodes in table that match the given block and dev.
+** only touches the hnext and hprev pointers.
+*/ +void remove_journal_hash(struct super_block *sb, + struct reiserfs_journal_cnode **table, + struct reiserfs_journal_list *jl, + unsigned long block, int remove_freed) +{ + struct reiserfs_journal_cnode *cur; + struct reiserfs_journal_cnode **head; + + head = &(journal_hash(table, sb, block)); + if (!head) { + return; + } + cur = *head; + while (cur) { + if (cur->blocknr == block && cur->sb == sb + && (jl == NULL || jl == cur->jlist) + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext; + } else { + *head = cur->hnext; + } + cur->blocknr = 0; + cur->sb = NULL; + cur->state = 0; + if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ + atomic_dec(&(cur->jlist->j_nonzerolen)); + cur->bh = NULL; + cur->jlist = NULL; + } + cur = cur->hnext; + } +} + +static void free_journal_ram(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + kfree(journal->j_current_jl); + journal->j_num_lists--; + + vfree(journal->j_cnode_free_orig); + free_list_bitmaps(sb, journal->j_list_bitmap); + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ + if (journal->j_header_bh) { + brelse(journal->j_header_bh); + } + /* j_header_bh is on the journal dev, make sure not to release the journal + * dev until we brelse j_header_bh + */ + release_journal_dev(sb, journal); + vfree(journal); +} + +/* +** call on unmount. Only set error to 1 if you haven't made your way out +** of read_super() yet. Any other caller must keep error at 0. +*/ +static int do_journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb, int error) +{ + struct reiserfs_transaction_handle myth; + int flushed = 0; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + /* we only want to flush out transactions if we were called with error == 0 + */ + if (!error && !(sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + BUG_ON(!th->t_trans_id); + do_journal_end(th, sb, 10, FLUSH_ALL); + + /* make sure something gets logged to force our way into the flush code */ + if (!journal_join(&myth, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, sb, + SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, sb, 1, FLUSH_ALL); + flushed = 1; + } + } + + /* this also catches errors during the do_journal_end above */ + if (!error && reiserfs_is_journal_aborted(journal)) { + memset(&myth, 0, sizeof(myth)); + if (!journal_join_abort(&myth, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, sb, + SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, sb, 1, FLUSH_ALL); + } + } + + reiserfs_mounted_fs_count--; + /* wait for all commits to finish */ + cancel_delayed_work(&SB_JOURNAL(sb)->j_work); + + /* + * We must release the write lock here because + * the workqueue job (flush_async_commit) needs this lock + */ + reiserfs_write_unlock(sb); + flush_workqueue(commit_wq); + + if (!reiserfs_mounted_fs_count) { + destroy_workqueue(commit_wq); + commit_wq = NULL; + } + + free_journal_ram(sb); + + reiserfs_write_lock(sb); + + return 0; +} + +/* +** call on unmount. flush all journal trans, release all alloc'd ram +*/ +int journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 0); +} + +/* +** only call from an error condition inside reiserfs_read_super! 
+*/ +int journal_release_error(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 1); +} + +/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ +static int journal_compare_desc_commit(struct super_block *sb, + struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) +{ + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || + get_commit_trans_len(commit) != get_desc_trans_len(desc) || + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || + get_commit_trans_len(commit) <= 0) { + return 1; + } + return 0; +} + +/* returns 0 if it did not find a description block +** returns -1 if it found a corrupt commit block +** returns 1 if both desc and commit were valid +*/ +static int journal_transaction_is_valid(struct super_block *sb, + struct buffer_head *d_bh, + unsigned int *oldest_invalid_trans_id, + unsigned long *newest_mount_id) +{ + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; + unsigned long offset; + + if (!d_bh) + return 0; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (get_desc_trans_len(desc) > 0 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu", + get_desc_trans_id(desc), + *oldest_invalid_trans_id); + return 0; + } + if (newest_mount_id + && *newest_mount_id > get_desc_mount_id(desc)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu", + get_desc_mount_id(desc), + *newest_mount_id); + return -1; + } + if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { + reiserfs_warning(sb, "journal-2018", + "Bad transaction length %d " + "encountered, ignoring transaction", + get_desc_trans_len(desc)); + return -1; + } + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + + /* ok, we have a journal description block, lets see if the transaction was valid */ + c_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((offset + get_desc_trans_len(desc) + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + if (!c_bh) + return 0; + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + if (oldest_invalid_trans_id) { + *oldest_invalid_trans_id = + get_desc_trans_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d", + get_desc_trans_id(desc)); + } + return -1; + } + brelse(c_bh); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1006: found valid " + "transaction start offset %llu, len %d id %d", + d_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_desc_trans_len(desc), + get_desc_trans_id(desc)); + return 1; + } else { + return 0; + } +} + +static void brelse_array(struct buffer_head **heads, int num) +{ + int i; + for (i = 0; i < num; i++) { + brelse(heads[i]); + } +} + +/* 
+** given the start, and values for the oldest acceptable transactions,
+** this either reads in and replays a transaction, or returns because the transaction
+** is invalid, or too old.
+*/
+static int journal_read_transaction(struct super_block *sb,
+				    unsigned long cur_dblock,
+				    unsigned long oldest_start,
+				    unsigned int oldest_trans_id,
+				    unsigned long newest_mount_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	unsigned int trans_id = 0;
+	struct buffer_head *c_bh;
+	struct buffer_head *d_bh;
+	struct buffer_head **log_blocks = NULL;
+	struct buffer_head **real_blocks = NULL;
+	unsigned int trans_offset;
+	int i;
+	int trans_half;
+
+	d_bh = journal_bread(sb, cur_dblock);
+	if (!d_bh)
+		return 1;
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
+		       "journal_read_transaction, offset %llu, len %d mount_id %d",
+		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+		       get_desc_trans_len(desc), get_desc_mount_id(desc));
+	if (get_desc_trans_id(desc) < oldest_trans_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
+			       "journal_read_trans skipping because %lu is too old",
+			       cur_dblock -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
+		brelse(d_bh);
+		return 1;
+	}
+	if (get_desc_mount_id(desc) != newest_mount_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
+			       "journal_read_trans skipping because %d is != "
+			       "newest_mount_id %lu", get_desc_mount_id(desc),
+			       newest_mount_id);
+		brelse(d_bh);
+		return 1;
+	}
+	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			     ((trans_offset + get_desc_trans_len(desc) + 1) %
+			      SB_ONDISK_JOURNAL_SIZE(sb)));
+	if (!c_bh) {
+		brelse(d_bh);
+		return 1;
+	}
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	if (journal_compare_desc_commit(sb, desc, commit)) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal_read_transaction, "
+			       "commit offset %llu had bad time %d or length %d",
+			       c_bh->b_blocknr -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+			       get_commit_trans_id(commit),
+			       get_commit_trans_len(commit));
+		brelse(c_bh);
+		brelse(d_bh);
+		return 1;
+	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
+	trans_id = get_desc_trans_id(desc);
+	/* now we know we've got a good transaction, and it was inside the valid time ranges */
+	log_blocks = kmalloc(get_desc_trans_len(desc) *
+			     sizeof(struct buffer_head *), GFP_NOFS);
+	real_blocks = kmalloc(get_desc_trans_len(desc) *
+			      sizeof(struct buffer_head *), GFP_NOFS);
+	if (!log_blocks || !real_blocks) {
+		brelse(c_bh);
+		brelse(d_bh);
+		kfree(log_blocks);
+		kfree(real_blocks);
+		reiserfs_warning(sb, "journal-1169",
+				 "kmalloc failed, unable to mount FS");
+		return -1;
+	}
+	/* get all the buffer heads */
+	trans_half = journal_trans_half(sb->s_blocksize);
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		log_blocks[i] =
+		    journal_getblk(sb,
+				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				   (trans_offset + 1 +
+				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
+		if (i < trans_half) {
+			real_blocks[i] =
+			    sb_getblk(sb,
+				      le32_to_cpu(desc->j_realblock[i]));
+		} else {
+			real_blocks[i] =
+			    sb_getblk(sb,
+				      le32_to_cpu(commit->
+						  j_realblock[i - trans_half]));
+		}
+		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
+			reiserfs_warning(sb, "journal-1207",
+					 "REPLAY FAILURE fsck
required! " + "Block to replay is outside of " + "filesystem"); + goto abort_replay; + } + /* make sure we don't try to replay onto log or reserved area */ + if (is_block_in_log_or_reserved_area + (sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(sb, "journal-1204", + "REPLAY FAILURE fsck required! " + "Trying to replay onto a log block"); + abort_replay: + brelse_array(log_blocks, i); + brelse_array(real_blocks, i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + for (i = 0; i < get_desc_trans_len(desc); i++) { + + reiserfs_write_unlock(sb); + wait_on_buffer(log_blocks[i]); + reiserfs_write_lock(sb); + + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning(sb, "journal-1212", + "REPLAY FAILURE fsck required! " + "buffer write failed"); + brelse_array(log_blocks + i, + get_desc_trans_len(desc) - i); + brelse_array(real_blocks, get_desc_trans_len(desc)); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, + real_blocks[i]->b_size); + set_buffer_uptodate(real_blocks[i]); + brelse(log_blocks[i]); + } + /* flush out the real blocks */ + for (i = 0; i < get_desc_trans_len(desc); i++) { + set_buffer_dirty(real_blocks[i]); + write_dirty_buffer(real_blocks[i], WRITE); + } + for (i = 0; i < get_desc_trans_len(desc); i++) { + wait_on_buffer(real_blocks[i]); + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning(sb, "journal-1226", + "REPLAY FAILURE, fsck required! " + "buffer write failed"); + brelse_array(real_blocks + i, + get_desc_trans_len(desc) - i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + brelse(real_blocks[i]); + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((trans_offset + get_desc_trans_len(desc) + + 2) % SB_ONDISK_JOURNAL_SIZE(sb)); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1095: setting journal " "start to offset %ld", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + + /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + journal->j_last_flush_trans_id = trans_id; + journal->j_trans_id = trans_id + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return 0; +} + +/* This function reads blocks starting from block and to max_block of bufsize + size (but no more than BUFNR blocks at a time). This proved to improve + mounting speed on self-rebuilding raid5 arrays at least. + Right now it is only used from journal code. But later we might use it + from other places. + Note: Do not use journal_getblk/sb_getblk functions here! 
*/ +static struct buffer_head *reiserfs_breada(struct block_device *dev, + b_blocknr_t block, int bufsize, + b_blocknr_t max_block) +{ + struct buffer_head *bhlist[BUFNR]; + unsigned int blocks = BUFNR; + struct buffer_head *bh; + int i, j; + + bh = __getblk(dev, block, bufsize); + if (buffer_uptodate(bh)) + return (bh); + + if (block + BUFNR > max_block) { + blocks = max_block - block; + } + bhlist[0] = bh; + j = 1; + for (i = 1; i < blocks; i++) { + bh = __getblk(dev, block + i, bufsize); + if (buffer_uptodate(bh)) { + brelse(bh); + break; + } else + bhlist[j++] = bh; + } + ll_rw_block(READ, j, bhlist); + for (i = 1; i < j; i++) + brelse(bhlist[i]); + bh = bhlist[0]; + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* +** read and replay the log +** on a clean unmount, the journal header's next unflushed pointer will be to an invalid +** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. +** +** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. +** +** On exit, it sets things up so the first transaction will work correctly. +*/ +static int journal_read(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_desc *desc; + unsigned int oldest_trans_id = 0; + unsigned int oldest_invalid_trans_id = 0; + time_t start; + unsigned long oldest_start = 0; + unsigned long cur_dblock = 0; + unsigned long newest_mount_id = 9; + struct buffer_head *d_bh; + struct reiserfs_journal_header *jh; + int valid_journal_header = 0; + int replay_count = 0; + int continue_replay = 1; + int ret; + char b[BDEVNAME_SIZE]; + + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_info(sb, "checking transaction log (%s)\n", + bdevname(journal->j_dev_bd, b)); + start = get_seconds(); + + /* step 1, read in the journal header block. Check the transaction it says + ** is the first unflushed, and if that transaction is not valid, + ** replay is done + */ + journal->j_header_bh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + if (!journal->j_header_bh) { + return 1; + } + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); + if (le32_to_cpu(jh->j_first_unflushed_offset) < + SB_ONDISK_JOURNAL_SIZE(sb) + && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { + oldest_start = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + le32_to_cpu(jh->j_first_unflushed_offset); + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + newest_mount_id = le32_to_cpu(jh->j_mount_id); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1153: found in " + "header: first_unflushed_offset %d, last_flushed_trans_id " + "%lu", le32_to_cpu(jh->j_first_unflushed_offset), + le32_to_cpu(jh->j_last_flush_trans_id)); + valid_journal_header = 1; + + /* now, we try to read the first unflushed offset. If it is not valid, + ** there is nothing more we can do, and it makes no sense to read + ** through the whole log. + */ + d_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + le32_to_cpu(jh->j_first_unflushed_offset)); + ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL); + if (!ret) { + continue_replay = 0; + } + brelse(d_bh); + goto start_log_replay; + } + + /* ok, there are transactions that need to be replayed. start with the first log block, find + ** all the valid transactions, and pick out the oldest. 
+ */ + while (continue_replay + && cur_dblock < + (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb))) { + /* Note that it is required for blocksize of primary fs device and journal + device to be the same */ + d_bh = + reiserfs_breada(journal->j_dev_bd, cur_dblock, + sb->s_blocksize, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + ret = + journal_transaction_is_valid(sb, d_bh, + &oldest_invalid_trans_id, + &newest_mount_id); + if (ret == 1) { + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (oldest_start == 0) { /* init all oldest_ values */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1179: Setting " + "oldest_start to offset %llu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (sb), oldest_trans_id); + } else if (oldest_trans_id > get_desc_trans_id(desc)) { + /* one we just read was older */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1180: Resetting " + "oldest_start to offset %lu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (sb), oldest_trans_id); + } + if (newest_mount_id < get_desc_mount_id(desc)) { + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1299: Setting " + "newest_mount_id to %d", + get_desc_mount_id(desc)); + } + cur_dblock += get_desc_trans_len(desc) + 2; + } else { + cur_dblock++; + } + brelse(d_bh); + } + + start_log_replay: + cur_dblock = oldest_start; + if (oldest_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1206: Starting replay " + "from offset %llu, trans_id %lu", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb), + oldest_trans_id); + + } + replay_count = 0; + while (continue_replay && oldest_trans_id > 0) { + ret = + journal_read_transaction(sb, cur_dblock, oldest_start, + oldest_trans_id, newest_mount_id); + if (ret < 0) { + return ret; + } else if (ret != 0) { + break; + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start; + replay_count++; + if (cur_dblock == oldest_start) + break; + } + + if (oldest_trans_id == 0) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1225: No valid " "transactions found"); + } + /* j_start does not get set correctly if we don't replay any transactions. 
+ ** if we had a valid journal_header, set j_start to the first unflushed transaction value, + ** copy the trans_id from the header + */ + if (valid_journal_header && replay_count == 0) { + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); + journal->j_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id) + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + journal->j_last_flush_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id); + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; + } else { + journal->j_mount_id = newest_mount_id + 1; + } + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %lu", journal->j_mount_id); + journal->j_first_unflushed_offset = journal->j_start; + if (replay_count > 0) { + reiserfs_info(sb, + "replayed %d transactions in %lu seconds\n", + replay_count, get_seconds() - start); + } + if (!bdev_read_only(sb->s_bdev) && + _update_journal_header_block(sb, journal->j_start, + journal->j_last_flush_trans_id)) { + /* replay failed, caller must call free_journal_ram and abort + ** the mount + */ + return -1; + } + return 0; +} + +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; + jl = kzalloc(sizeof(struct reiserfs_journal_list), + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); + INIT_LIST_HEAD(&jl->j_tail_bh_list); + INIT_LIST_HEAD(&jl->j_bh_list); + mutex_init(&jl->j_commit_mutex); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; +} + +static void journal_list_init(struct super_block *sb) +{ + SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); +} + +static int release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal) +{ + int result; + + result = 0; + + if (journal->j_dev_bd != NULL) { + result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + journal->j_dev_bd = NULL; + } + + if (result != 0) { + reiserfs_warning(super, "sh-457", + "Cannot release journal device: %i", result); + } + return result; +} + +static int journal_init_dev(struct super_block *super, + struct reiserfs_journal *journal, + const char *jdev_name) +{ + int result; + dev_t jdev; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; + char b[BDEVNAME_SIZE]; + + result = 0; + + journal->j_dev_bd = NULL; + jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? 
+ new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; + + if (bdev_read_only(super->s_bdev)) + blkdev_mode = FMODE_READ; + + /* there is no "jdev" option and journal is on separate device */ + if ((!jdev_name || !jdev_name[0])) { + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); + journal->j_dev_mode = blkdev_mode; + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning(super, "sh-458", + "cannot init journal device '%s': %i", + __bdevname(jdev, b), result); + return result; + } else if (jdev != super->s_dev) + set_blocksize(journal->j_dev_bd, super->s_blocksize); + + return 0; + } + + journal->j_dev_mode = blkdev_mode; + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning(super, + "journal_init_dev: Cannot open '%s': %i", + jdev_name, result); + return result; + } + + set_blocksize(journal->j_dev_bd, super->s_blocksize); + reiserfs_info(super, + "journal_init_dev: journal device: %s\n", + bdevname(journal->j_dev_bd, b)); + return 0; +} + +/** + * When creating/tuning a file system user can assign some + * journal params within boundaries which depend on the ratio + * blocksize/standard_blocksize. + * + * For blocks >= standard_blocksize transaction size should + * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more + * then JOURNAL_TRANS_MAX_DEFAULT. + * + * For blocks < standard_blocksize these boundaries should be + * decreased proportionally. + */ +#define REISERFS_STANDARD_BLKSIZE (4096) + +static int check_advise_trans_params(struct super_block *sb, + struct reiserfs_journal *journal) +{ + if (journal->j_trans_max) { + /* Non-default journal params. + Do sanity check for them. */ + int ratio = 1; + if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) + ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; + + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || + journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || + SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max < + JOURNAL_MIN_RATIO) { + reiserfs_warning(sb, "sh-462", + "bad transaction max size (%u). " + "FSCK?", journal->j_trans_max); + return 1; + } + if (journal->j_max_batch != (journal->j_trans_max) * + JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { + reiserfs_warning(sb, "sh-463", + "bad transaction max batch (%u). " + "FSCK?", journal->j_max_batch); + return 1; + } + } else { + /* Default journal params. + The file system was created by old version + of mkreiserfs, so some fields contain zeros, + and we need to advise proper values for them */ + if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { + reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", + sb->s_blocksize); + return 1; + } + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; + } + return 0; +} + +/* +** must be called once on fs mount. 
calls journal_read for you +*/ +int journal_init(struct super_block *sb, const char *j_dev_name, + int old_format, unsigned int commit_max_age) +{ + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2; + struct buffer_head *bhjh; + struct reiserfs_super_block *rs; + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; + char b[BDEVNAME_SIZE]; + int ret; + + /* + * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS + * dependency inversion warnings. + */ + reiserfs_write_unlock(sb); + journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); + if (!journal) { + reiserfs_warning(sb, "journal-1256", + "unable to get memory for journal structure"); + reiserfs_write_lock(sb); + return 1; + } + memset(journal, 0, sizeof(struct reiserfs_journal)); + INIT_LIST_HEAD(&journal->j_bitmap_nodes); + INIT_LIST_HEAD(&journal->j_prealloc_list); + INIT_LIST_HEAD(&journal->j_working_list); + INIT_LIST_HEAD(&journal->j_journal_list); + journal->j_persistent_trans = 0; + ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb)); + reiserfs_write_lock(sb); + if (ret) + goto free_and_return; + + allocate_bitmap_nodes(sb); + + /* reserved for journal area support */ + SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ? + REISERFS_OLD_DISK_OFFSET_IN_BYTES + / sb->s_blocksize + + reiserfs_bmap_count(sb) + + 1 : + REISERFS_DISK_OFFSET_IN_BYTES / + sb->s_blocksize + 2); + + /* Sanity check to see is the standard journal fitting within first bitmap + (actual for small blocksizes) */ + if (!SB_ONDISK_JOURNAL_DEVICE(sb) && + (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { + reiserfs_warning(sb, "journal-1393", + "journal does not fit for area addressed " + "by first of bitmap blocks. It starts at " + "%u and its size is %u. Block size %ld", + SB_JOURNAL_1st_RESERVED_BLOCK(sb), + SB_ONDISK_JOURNAL_SIZE(sb), + sb->s_blocksize); + goto free_and_return; + } + + /* + * We need to unlock here to avoid creating the following + * dependency: + * reiserfs_lock -> sysfs_mutex + * Because the reiserfs mmap path creates the following dependency: + * mm->mmap -> reiserfs_lock, hence we have + * mm->mmap -> reiserfs_lock ->sysfs_mutex + * This would ends up in a circular dependency with sysfs readdir path + * which does sysfs_mutex -> mm->mmap_sem + * This is fine because the reiserfs lock is useless in mount path, + * at least until we call journal_begin. We keep it for paranoid + * reasons. 
+ */ + reiserfs_write_unlock(sb); + if (journal_init_dev(sb, journal, j_dev_name) != 0) { + reiserfs_write_lock(sb); + reiserfs_warning(sb, "sh-462", + "unable to initialize jornal device"); + goto free_and_return; + } + reiserfs_write_lock(sb); + + rs = SB_DISK_SUPER_BLOCK(sb); + + /* read journal header */ + bhjh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + if (!bhjh) { + reiserfs_warning(sb, "sh-459", + "unable to read journal header"); + goto free_and_return; + } + jh = (struct reiserfs_journal_header *)(bhjh->b_data); + + /* make sure that journal matches to the super block */ + if (is_reiserfs_jr(rs) + && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != + sb_jp_journal_magic(rs))) { + reiserfs_warning(sb, "sh-460", + "journal header magic %x (device %s) does " + "not match to magic found in super block %x", + jh->jh_journal.jp_journal_magic, + bdevname(journal->j_dev_bd, b), + sb_jp_journal_magic(rs)); + brelse(bhjh); + goto free_and_return; + } + + journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); + journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); + journal->j_max_commit_age = + le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + + if (check_advise_trans_params(sb, journal) != 0) + goto free_and_return; + journal->j_default_max_commit_age = journal->j_max_commit_age; + + if (commit_max_age != 0) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } + + reiserfs_info(sb, "journal params: device %s, size %u, " + "journal first block %u, max trans len %u, max batch %u, " + "max commit age %u, max trans age %u\n", + bdevname(journal->j_dev_bd, b), + SB_ONDISK_JOURNAL_SIZE(sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + journal->j_trans_max, + journal->j_max_batch, + journal->j_max_commit_age, journal->j_max_trans_age); + + brelse(bhjh); + + journal->j_list_bitmap_index = 0; + journal_list_init(sb); + + memset(journal->j_list_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); + + INIT_LIST_HEAD(&journal->j_dirty_buffers); + spin_lock_init(&journal->j_dirty_buffers_lock); + + journal->j_start = 0; + journal->j_len = 0; + journal->j_len_alloc = 0; + atomic_set(&(journal->j_wcount), 0); + atomic_set(&(journal->j_async_throttle), 0); + journal->j_bcount = 0; + journal->j_trans_start_time = 0; + journal->j_last = NULL; + journal->j_first = NULL; + init_waitqueue_head(&(journal->j_join_wait)); + mutex_init(&journal->j_mutex); + mutex_init(&journal->j_flush_mutex); + + journal->j_trans_id = 10; + journal->j_mount_id = 10; + journal->j_state = 0; + atomic_set(&(journal->j_jlock), 0); + reiserfs_write_unlock(sb); + journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + reiserfs_write_lock(sb); + journal->j_cnode_free_orig = journal->j_cnode_free_list; + journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; + journal->j_cnode_used = 0; + journal->j_must_wait = 0; + + if (journal->j_cnode_free == 0) { + reiserfs_warning(sb, "journal-2004", "Journal cnode memory " + "allocation failed (%ld bytes). Journal is " + "too large for available memory. 
Usually " + "this is due to a journal that is too large.", + sizeof (struct reiserfs_journal_cnode) * num_cnodes); + goto free_and_return; + } + + init_journal_hash(sb); + jl = journal->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(sb, jl); + if (!jl->j_list_bitmap) { + reiserfs_warning(sb, "journal-2005", + "get_list_bitmap failed for journal list 0"); + goto free_and_return; + } + if (journal_read(sb) < 0) { + reiserfs_warning(sb, "reiserfs-2006", + "Replay Failure, unable to mount"); + goto free_and_return; + } + + reiserfs_mounted_fs_count++; + if (reiserfs_mounted_fs_count <= 1) { + reiserfs_write_unlock(sb); + commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); + reiserfs_write_lock(sb); + } + + INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); + journal->j_work_sb = sb; + return 0; + free_and_return: + free_journal_ram(sb); + return 1; +} + +/* +** test for a polite end of the current transaction. Used by file_write, and should +** be used by delete to make sure they don't write more than can fit inside a single +** transaction +*/ +int journal_transaction_should_end(struct reiserfs_transaction_handle *th, + int new_alloc) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + time_t now = get_seconds(); + /* cannot restart while nested */ + BUG_ON(!th->t_trans_id); + if (th->t_refcount > 1) + return 0; + if (journal->j_must_wait > 0 || + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || + atomic_read(&(journal->j_jlock)) || + (now - journal->j_trans_start_time) > journal->j_max_trans_age || + journal->j_cnode_free < (journal->j_trans_max * 3)) { + return 1; + } + /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; + th->t_blocks_allocated += new_alloc ; + return 0; +} + +/* this must be called inside a transaction, and requires the +** kernel_lock to be held +*/ +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + BUG_ON(!th->t_trans_id); + journal->j_must_wait = 1; + set_bit(J_WRITERS_BLOCKED, &journal->j_state); + return; +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_allow_writes(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + clear_bit(J_WRITERS_BLOCKED, &journal->j_state); + wake_up(&journal->j_join_wait); +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_wait_on_write_block(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + wait_event(journal->j_join_wait, + !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); +} + +static void queue_log_writer(struct super_block *s) +{ + wait_queue_t wait; + struct reiserfs_journal *journal = SB_JOURNAL(s); + set_bit(J_WRITERS_QUEUED, &journal->j_state); + + /* + * we don't want to use wait_event here because + * we only want to wait once. 
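+	 * Instead we add ourselves to j_join_wait, re-check J_WRITERS_QUEUED
+	 * and schedule() at most one time; wait_event() would keep sleeping
+	 * until the bit was actually cleared.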
+ */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&journal->j_join_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { + reiserfs_write_unlock(s); + schedule(); + reiserfs_write_lock(s); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(&journal->j_join_wait, &wait); +} + +static void wake_queued_writers(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) + wake_up(&journal->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned long bcount = journal->j_bcount; + while (1) { + reiserfs_write_unlock(sb); + schedule_timeout_uninterruptible(1); + reiserfs_write_lock(sb); + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; + while ((atomic_read(&journal->j_wcount) > 0 || + atomic_read(&journal->j_jlock)) && + journal->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (journal->j_trans_id != trans_id) + break; + if (bcount == journal->j_bcount) + break; + bcount = journal->j_bcount; + } +} + +/* join == true if you must join an existing transaction. +** join == false if you can deal with waiting for others to finish +** +** this will block until the transaction is joinable. send the number of blocks you +** expect to use in nblocks. +*/ +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int join) +{ + time_t now = get_seconds(); + unsigned int old_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; + int retval; + + reiserfs_check_lock_depth(sb, "journal_begin"); + BUG_ON(nblocks > journal->j_trans_max); + + PROC_INFO_INC(sb, journal.journal_being); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = sb; + + relock: + lock_journal(sb); + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { + unlock_journal(sb); + retval = journal->j_errno; + goto out_fail; + } + journal->j_bcount++; + + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { + unlock_journal(sb); + reiserfs_write_unlock(sb); + reiserfs_wait_on_write_block(sb); + reiserfs_write_lock(sb); + PROC_INFO_INC(sb, journal.journal_relock_writers); + goto relock; + } + now = get_seconds(); + + /* if there is no room in the journal OR + ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning + ** we don't sleep if there aren't other writers + */ + + if ((!join && journal->j_must_wait > 0) || + (!join + && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) + || (!join && atomic_read(&journal->j_wcount) > 0 + && journal->j_trans_start_time > 0 + && (now - journal->j_trans_start_time) > + journal->j_max_trans_age) || (!join + && atomic_read(&journal->j_jlock)) + || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { + + old_trans_id = journal->j_trans_id; + unlock_journal(sb); /* allow others to finish this transaction */ + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + journal->j_max_batch && + ((journal->j_len + nblocks + 2) * 100) < + (journal->j_len_alloc * 75)) { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have to do is + * wait for someone else to do a commit + */ 
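+		/* j_jlock set means the current transaction is already being
+		 * closed for commit; sleep until j_trans_id moves on and then
+		 * retry from relock rather than trying to join it.
+		 */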
+ if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(sb); + } + goto relock; + } + retval = journal_join(&myth, sb, 1); + if (retval) + goto out_fail; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != journal->j_trans_id) { + retval = do_journal_end(&myth, sb, 1, 0); + } else { + retval = do_journal_end(&myth, sb, 1, COMMIT_NOW); + } + + if (retval) + goto out_fail; + + PROC_INFO_INC(sb, journal.journal_relock_wcount); + goto relock; + } + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); + } + atomic_inc(&(journal->j_wcount)); + journal->j_len_alloc += nblocks; + th->t_blocks_logged = 0; + th->t_blocks_allocated = nblocks; + th->t_trans_id = journal->j_trans_id; + unlock_journal(sb); + INIT_LIST_HEAD(&th->t_list); + get_fs_excl(); + return 0; + + out_fail: + memset(th, 0, sizeof(*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = sb; + return retval; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct + super_block + *s, + int nblocks) +{ + int ret; + struct reiserfs_transaction_handle *th; + + /* if we're nesting into an existing transaction. It will be + ** persistent on its own + */ + if (reiserfs_transaction_running(s)) { + th = current->journal_info; + th->t_refcount++; + BUG_ON(th->t_refcount < 2); + + return th; + } + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); + if (!th) + return NULL; + ret = journal_begin(th, s, nblocks); + if (ret) { + kfree(th); + return NULL; + } + + SB_JOURNAL(s)->j_persistent_trans++; + return th; +} + +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + int ret = 0; + if (th->t_trans_id) + ret = journal_end(th, th->t_super, th->t_blocks_allocated); + else + ret = -EIO; + if (th->t_refcount == 0) { + SB_JOURNAL(s)->j_persistent_trans--; + kfree(th); + } + return ret; +} + +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th; + BUG_ON(cur_th && cur_th->t_refcount > 1); + return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN); +} + +int journal_join_abort(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th; + BUG_ON(cur_th && cur_th->t_refcount > 1); + return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT); +} + +int journal_begin(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + int ret; + + th->t_handle_save = NULL; + if (cur_th) { + /* we are nesting into the current transaction */ + if (cur_th->t_super == sb) { + BUG_ON(!cur_th->t_refcount); + cur_th->t_refcount++; + memcpy(th, cur_th, sizeof(*th)); + if (th->t_refcount <= 1) + reiserfs_warning(sb, 
"reiserfs-2005", + "BAD: refcount <= 1, but " + "journal_info != 0"); + return 0; + } else { + /* we've ended up with a handle from a different filesystem. + ** save it and restore on journal_end. This should never + ** really happen... + */ + reiserfs_warning(sb, "clm-2100", + "nesting info a different FS"); + th->t_handle_save = current->journal_info; + current->journal_info = th; + } + } else { + current->journal_info = th; + } + ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); + BUG_ON(current->journal_info != th); + + /* I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since journal_end + * won't be called to do it. */ + if (ret) + current->journal_info = th->t_handle_save; + else + BUG_ON(!th->t_refcount); + + return ret; +} + +/* +** puts bh into the current transaction. If it was already there, reorders removes the +** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). +** +** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the +** transaction is committed. +** +** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. +*/ +int journal_mark_dirty(struct reiserfs_transaction_handle *th, + struct super_block *sb, struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn = NULL; + int count_already_incd = 0; + int prepared = 0; + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(sb, journal.mark_dirty); + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", + th->t_trans_id, journal->j_trans_id); + } + + sb->s_dirt = 1; + + prepared = test_clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); + /* already in this transaction, we are done */ + if (buffer_journaled(bh)) { + PROC_INFO_INC(sb, journal.mark_dirty_already); + return 0; + } + + /* this must be turned into a panic instead of a warning. We can't allow + ** a dirty or journal_dirty or locked buffer to be logged, as some changes + ** could get to disk too early. NOT GOOD. + */ + if (!prepared || buffer_dirty(bh)) { + reiserfs_warning(sb, "journal-1777", + "buffer %llu bad state " + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", + (unsigned long long)bh->b_blocknr, + prepared ? ' ' : '!', + buffer_locked(bh) ? ' ' : '!', + buffer_dirty(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!'); + } + + if (atomic_read(&(journal->j_wcount)) <= 0) { + reiserfs_warning(sb, "journal-1409", + "returning because j_wcount was %d", + atomic_read(&(journal->j_wcount))); + return 1; + } + /* this error means I've screwed up, and we've overflowed the transaction. + ** Nothing can be done here, except make the FS readonly or panic. 
+ */ + if (journal->j_len >= journal->j_trans_max) { + reiserfs_panic(th->t_super, "journal-1413", + "j_len (%lu) is too big", + journal->j_len); + } + + if (buffer_journal_dirty(bh)) { + count_already_incd = 1; + PROC_INFO_INC(sb, journal.mark_dirty_notjournal); + clear_buffer_journal_dirty(bh); + } + + if (journal->j_len > journal->j_len_alloc) { + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT; + } + + set_buffer_journaled(bh); + + /* now put this guy on the end */ + if (!cn) { + cn = get_cnode(sb); + if (!cn) { + reiserfs_panic(sb, "journal-4", "get_cnode failed!"); + } + + if (th->t_blocks_logged == th->t_blocks_allocated) { + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT; + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT; + } + th->t_blocks_logged++; + journal->j_len++; + + cn->bh = bh; + cn->blocknr = bh->b_blocknr; + cn->sb = sb; + cn->jlist = NULL; + insert_journal_hash(journal->j_hash_table, cn); + if (!count_already_incd) { + get_bh(bh); + } + } + cn->next = NULL; + cn->prev = journal->j_last; + cn->bh = bh; + if (journal->j_last) { + journal->j_last->next = cn; + journal->j_last = cn; + } else { + journal->j_first = cn; + journal->j_last = cn; + } + return 0; +} + +int journal_end(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + if (!current->journal_info && th->t_refcount > 1) + reiserfs_warning(sb, "REISER-NESTING", + "th NULL, refcount %d", th->t_refcount); + + if (!th->t_trans_id) { + WARN_ON(1); + return -EIO; + } + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = + current->journal_info; + + /* we aren't allowed to close a nested transaction on a different + ** filesystem from the one in the task struct + */ + BUG_ON(cur_th->t_super != th->t_super); + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return do_journal_end(th, sb, nblocks, 0); + } +} + +/* removes from the current transaction, relsing and descrementing any counters. +** also files the removed buffer directly onto the clean list +** +** called by journal_mark_freed when a block has been deleted +** +** returns 1 if it cleaned and relsed the buffer. 0 otherwise +*/ +static int remove_from_transaction(struct super_block *sb, + b_blocknr_t blocknr, int already_cleaned) +{ + struct buffer_head *bh; + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); + if (!cn || !cn->bh) { + return ret; + } + bh = cn->bh; + if (cn->prev) { + cn->prev->next = cn->next; + } + if (cn->next) { + cn->next->prev = cn->prev; + } + if (cn == journal->j_first) { + journal->j_first = cn->next; + } + if (cn == journal->j_last) { + journal->j_last = cn->prev; + } + if (bh) + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); + clear_buffer_journaled(bh); /* don't log this one */ + + if (!already_cleaned) { + clear_buffer_journal_dirty(bh); + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + put_bh(bh); + if (atomic_read(&(bh->b_count)) < 0) { + reiserfs_warning(sb, "journal-1752", + "b_count < 0"); + } + ret = 1; + } + journal->j_len--; + journal->j_len_alloc--; + free_cnode(sb, cn); + return ret; +} + +/* +** for any cnode in a journal list, it can only be dirtied of all the +** transactions that include it are committed to disk. 
+** this checks through each transaction, and returns 1 if you are allowed to dirty, +** and 0 if you aren't +** +** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log +** blocks for a given transaction on disk +** +*/ +static int can_dirty(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + struct reiserfs_journal_cnode *cur = cn->hprev; + int can_dirty = 1; + + /* first test hprev. These are all newer than cn, so any node here + ** with the same block number and dev means this node can't be sent + ** to disk right now. + */ + while (cur && can_dirty) { + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && + cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hprev; + } + /* then test hnext. These are all older than cn. As long as they + ** are committed to the log, it is safe to write cn to disk + */ + cur = cn->hnext; + while (cur && can_dirty) { + if (cur->jlist && cur->jlist->j_len > 0 && + atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hnext; + } + return can_dirty; +} + +/* syncs the commit blocks, but does not force the real buffers to disk +** will wait until the current transaction is done/committed before returning +*/ +int journal_end_sync(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + BUG_ON(!th->t_trans_id); + /* you can sync while nested, very, very bad */ + BUG_ON(th->t_refcount > 1); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + } + return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT); +} + +/* +** writeback the pending async commits to disk +*/ +static void flush_async_commits(struct work_struct *work) +{ + struct reiserfs_journal *journal = + container_of(work, struct reiserfs_journal, j_work.work); + struct super_block *sb = journal->j_work_sb; + struct reiserfs_journal_list *jl; + struct list_head *entry; + + reiserfs_write_lock(sb); + if (!list_empty(&journal->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = journal->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(sb, jl, 1); + } + reiserfs_write_unlock(sb); +} + +/* +** flushes any old transactions to disk +** ends the current transaction if it is too old +*/ +int reiserfs_flush_old_commits(struct super_block *sb) +{ + time_t now; + struct reiserfs_transaction_handle th; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&journal->j_journal_list)) { + return 0; + } + + /* check the current transaction. 
If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&journal->j_wcount) <= 0 && + journal->j_trans_start_time > 0 && + journal->j_len > 0 && + (now - journal->j_trans_start_time) > journal->j_max_trans_age) { + if (!journal_join(&th, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, sb, + SB_BUFFER_WITH_SB(sb)); + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); + } + } + return sb->s_dirt; +} + +/* +** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit +** +** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all +** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just +** flushes the commit list and returns 0. +** +** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. +** +** Note, we can't allow the journal_end to proceed while there are still writers in the log. +*/ +static int check_journal_end(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int flags) +{ + + time_t now; + int flush = flags & FLUSH_ALL; + int commit_now = flags & COMMIT_NOW; + int wait_on_commit = flags & WAIT; + struct reiserfs_journal_list *jl; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + BUG_ON(!th->t_trans_id); + + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", + th->t_trans_id, journal->j_trans_id); + } + + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); + if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ + atomic_dec(&(journal->j_wcount)); + } + + /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released + ** will be dealt with by next transaction that actually writes something, but should be taken + ** care of in this trans + */ + BUG_ON(journal->j_len == 0); + + /* if wcount > 0, and we are called to with flush or commit_now, + ** we wait on j_join_wait. We will wake up when the last writer has + ** finished the transaction, and started it on its way to the disk. + ** Then, we flush the commit or journal list, and just return 0 + ** because the rest of journal end was already done for this transaction. 
+ */ + if (atomic_read(&(journal->j_wcount)) > 0) { + if (flush || commit_now) { + unsigned trans_id; + + jl = journal->j_current_jl; + trans_id = jl->j_trans_id; + if (wait_on_commit) + jl->j_state |= LIST_COMMIT_PENDING; + atomic_set(&(journal->j_jlock), 1); + if (flush) { + journal->j_next_full_flush = 1; + } + unlock_journal(sb); + + /* sleep while the current transaction is still j_jlocked */ + while (journal->j_trans_id == trans_id) { + if (atomic_read(&journal->j_jlock)) { + queue_log_writer(sb); + } else { + lock_journal(sb); + if (journal->j_trans_id == trans_id) { + atomic_set(&(journal->j_jlock), + 1); + } + unlock_journal(sb); + } + } + BUG_ON(journal->j_trans_id == trans_id); + + if (commit_now + && journal_list_still_alive(sb, trans_id) + && wait_on_commit) { + flush_commit_list(sb, jl, 1); + } + return 0; + } + unlock_journal(sb); + return 0; + } + + /* deal with old transactions where we are the last writers */ + now = get_seconds(); + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { + commit_now = 1; + journal->j_next_async_flush = 1; + } + /* don't batch when someone is waiting on j_join_wait */ + /* don't batch when syncing the commit or flushing the whole trans */ + if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) + && !flush && !commit_now && (journal->j_len < journal->j_max_batch) + && journal->j_len_alloc < journal->j_max_batch + && journal->j_cnode_free > (journal->j_trans_max * 3)) { + journal->j_bcount++; + unlock_journal(sb); + return 0; + } + + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) { + reiserfs_panic(sb, "journal-003", + "j_start (%ld) is too high", + journal->j_start); + } + return 1; +} + +/* +** Does all the work that makes deleting blocks safe. +** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. +** +** otherwise: +** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes +** before this transaction has finished. +** +** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with +** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, +** the block can't be reallocated yet. +** +** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. +*/ +int journal_mark_freed(struct reiserfs_transaction_handle *th, + struct super_block *sb, b_blocknr_t blocknr) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn = NULL; + struct buffer_head *bh = NULL; + struct reiserfs_list_bitmap *jb = NULL; + int cleaned = 0; + BUG_ON(!th->t_trans_id); + + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh; + get_bh(bh); + } + /* if it is journal new, we just remove it from this transaction */ + if (bh && buffer_journal_new(bh)) { + clear_buffer_journal_new(bh); + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + cleaned = remove_from_transaction(sb, blocknr, cleaned); + } else { + /* set the bit for this block in the journal bitmap for this transaction */ + jb = journal->j_current_jl->j_list_bitmap; + if (!jb) { + reiserfs_panic(sb, "journal-1702", + "journal_list_bitmap is NULL"); + } + set_bit_in_list_bitmap(sb, blocknr, jb); + + /* Note, the entire while loop is not allowed to schedule. 
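+		 * (it walks and updates the j_list_hash_table chain for this
+		 * block; sleeping could let other writers or the flush code
+		 * race with us)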
*/ + + if (bh) { + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + } + cleaned = remove_from_transaction(sb, blocknr, cleaned); + + /* find all older transactions with this block, make sure they don't try to write it out */ + cn = get_journal_hash_dev(sb, journal->j_list_hash_table, + blocknr); + while (cn) { + if (sb == cn->sb && blocknr == cn->blocknr) { + set_bit(BLOCK_FREED, &cn->state); + if (cn->bh) { + if (!cleaned) { + /* remove_from_transaction will brelse the buffer if it was + ** in the current trans + */ + clear_buffer_journal_dirty(cn-> + bh); + clear_buffer_dirty(cn->bh); + clear_buffer_journal_test(cn-> + bh); + cleaned = 1; + put_bh(cn->bh); + if (atomic_read + (&(cn->bh->b_count)) < 0) { + reiserfs_warning(sb, + "journal-2138", + "cn->bh->b_count < 0"); + } + } + if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ + atomic_dec(& + (cn->jlist-> + j_nonzerolen)); + } + cn->bh = NULL; + } + } + cn = cn->hnext; + } + } + + if (bh) + release_buffer_page(bh); /* get_hash grabs the buffer */ + return 0; +} + +void reiserfs_update_inode_transaction(struct inode *inode) +{ + struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); + REISERFS_I(inode)->i_jl = journal->j_current_jl; + REISERFS_I(inode)->i_trans_id = journal->j_trans_id; +} + +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th; + struct super_block *sb = inode->i_sb; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + /* is it from the current transaction, or from an unknown transaction? */ + if (id == journal->j_trans_id) { + jl = journal->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (journal->j_trans_id != id) { + goto flush_commit_only; + } + + ret = journal_begin(&th, sb, 1); + if (ret) + return ret; + + /* someone might have ended this transaction while we joined */ + if (journal->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)); + ret = journal_end(&th, sb, 1); + goto flush_commit_only; + } + + ret = journal_end_sync(&th, sb, 1); + if (!ret) + ret = 1; + + } else { + /* this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ + flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1); + if (journal->j_errno) + ret = journal->j_errno; + } + } + /* otherwise the list is gone, and long since committed */ + return ret; +} + +int reiserfs_commit_for_inode(struct inode *inode) +{ + unsigned int id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* for the whole inode, assume unset id means it was + * changed in the current transaction. 
More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode); + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); +} + +void reiserfs_restore_prepared_buffer(struct super_block *sb, + struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + PROC_INFO_INC(sb, journal.restore_prepared); + if (!bh) { + return; + } + if (test_clear_buffer_journal_restore_dirty(bh) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(sb, + journal->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_buffer_journal_test(bh); + mark_buffer_dirty(bh); + } + } + clear_buffer_journal_prepared(bh); +} + +extern struct tree_balance *cur_tb; +/* +** before we can change a metadata block, we have to make sure it won't +** be written to disk while we are altering it. So, we must: +** clean it +** wait on it. +** +*/ +int reiserfs_prepare_for_journal(struct super_block *sb, + struct buffer_head *bh, int wait) +{ + PROC_INFO_INC(sb, journal.prepare); + + if (!trylock_buffer(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_buffer_journal_prepared(bh); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_buffer_journal_test(bh); + set_buffer_journal_restore_dirty(bh); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while (!list_empty(&journal->j_journal_list)) { + entry = journal->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) && + atomic_read(&jl->j_commit_left) == 0 && + test_transaction(s, jl)) { + flush_used_journal_lists(s, jl); + } else { + break; + } + } +} + +/* +** long and ugly. If flush, will not return until all commit +** blocks and all real buffers in the trans are on disk. +** If no_async, won't return until all commit blocks are on disk. +** +** keep reading, there are comments as you go along +** +** If the journal is aborted, we just clean up. Things like flushing +** journal lists, etc just won't happen. +*/ +static int do_journal_end(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int flags) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *next, *jl_cn; + struct reiserfs_journal_cnode *last_cn = NULL; + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; /* commit bh */ + struct buffer_head *d_bh; /* desc bh */ + int cur_write_start = 0; /* start index of current log write */ + int old_start; + int i; + int flush; + int wait_on_commit; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned int commit_trans_id; + int trans_half; + + BUG_ON(th->t_refcount > 1); + BUG_ON(!th->t_trans_id); + + /* protect flush_older_commits from doing mistakes if the + transaction ID counter gets overflowed. 
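+	   When the counter is about to wrap (th->t_trans_id == ~0U) we force
+	   a full synchronous flush below, so nothing older is left in the log
+	   by the time j_trans_id restarts from 10 at the end of this function.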
*/ + if (th->t_trans_id == ~0U) + flags |= FLUSH_ALL | COMMIT_NOW | WAIT; + flush = flags & FLUSH_ALL; + wait_on_commit = flags & WAIT; + + put_fs_excl(); + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth(sb, "journal end"); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + } + + lock_journal(sb); + if (journal->j_next_full_flush) { + flags |= FLUSH_ALL; + flush = 1; + } + if (journal->j_next_async_flush) { + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; + } + + /* check_journal_end locks the journal, and unlocks if it does not return 1 + ** it tells us if we should continue with the journal_end, or just return + */ + if (!check_journal_end(th, sb, nblocks, flags)) { + sb->s_dirt = 1; + wake_queued_writers(sb); + reiserfs_async_progress_wait(sb); + goto out; + } + + /* check_journal_end might set these, check again */ + if (journal->j_next_full_flush) { + flush = 1; + } + + /* + ** j must wait means we have to flush the log blocks, and the real blocks for + ** this transaction + */ + if (journal->j_must_wait > 0) { + flush = 1; + } +#ifdef REISERFS_PREALLOCATE + /* quota ops might need to nest, setup the journal_info pointer for them + * and raise the refcount so that it is > 0. */ + current->journal_info = th; + th->t_refcount++; + reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into + * the transaction */ + th->t_refcount--; + current->journal_info = th->t_handle_save; +#endif + + /* setup description block */ + d_bh = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + journal->j_start); + set_buffer_uptodate(d_bh); + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; + memset(d_bh->b_data, 0, d_bh->b_size); + memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); + set_desc_trans_id(desc, journal->j_trans_id); + + /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ + c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((journal->j_start + journal->j_len + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + memset(c_bh->b_data, 0, c_bh->b_size); + set_commit_trans_id(commit, journal->j_trans_id); + set_buffer_uptodate(c_bh); + + /* init this journal list */ + jl = journal->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0); + jl->j_trans_id = journal->j_trans_id; + jl->j_timestamp = journal->j_trans_start_time; + jl->j_commit_bh = c_bh; + jl->j_start = journal->j_start; + jl->j_len = journal->j_len; + atomic_set(&jl->j_nonzerolen, journal->j_len); + atomic_set(&jl->j_commit_left, journal->j_len + 2); + jl->j_realblock = NULL; + + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. 
+ ** for each real block, add it to the journal list hash, + ** copy into real block index array in the commit or desc block + */ + trans_half = journal_trans_half(sb->s_blocksize); + for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { + if (buffer_journaled(cn->bh)) { + jl_cn = get_cnode(sb); + if (!jl_cn) { + reiserfs_panic(sb, "journal-1676", + "get_cnode returned NULL"); + } + if (i == 0) { + jl->j_realblock = jl_cn; + } + jl_cn->prev = last_cn; + jl_cn->next = NULL; + if (last_cn) { + last_cn->next = jl_cn; + } + last_cn = jl_cn; + /* make sure the block we are trying to log is not a block + of journal or reserved area */ + + if (is_block_in_log_or_reserved_area + (sb, cn->bh->b_blocknr)) { + reiserfs_panic(sb, "journal-2332", + "Trying to log block %lu, " + "which is a log block", + cn->bh->b_blocknr); + } + jl_cn->blocknr = cn->bh->b_blocknr; + jl_cn->state = 0; + jl_cn->sb = sb; + jl_cn->bh = cn->bh; + jl_cn->jlist = jl; + insert_journal_hash(journal->j_list_hash_table, jl_cn); + if (i < trans_half) { + desc->j_realblock[i] = + cpu_to_le32(cn->bh->b_blocknr); + } else { + commit->j_realblock[i - trans_half] = + cpu_to_le32(cn->bh->b_blocknr); + } + } else { + i--; + } + } + set_desc_trans_len(desc, journal->j_len); + set_desc_mount_id(desc, journal->j_mount_id); + set_desc_trans_id(desc, journal->j_trans_id); + set_commit_trans_len(commit, journal->j_len); + + /* special check in case all buffers in the journal were marked for not logging */ + BUG_ON(journal->j_len == 0); + + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + cur_write_start = journal->j_start; + cn = journal->j_first; + jindex = 1; /* start at one so we don't get the desc again */ + while (cn) { + clear_buffer_journal_new(cn->bh); + /* copy all the real blocks into log area. dirty log blocks */ + if (buffer_journaled(cn->bh)) { + struct buffer_head *tmp_bh; + char *addr; + struct page *page; + tmp_bh = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((cur_write_start + + jindex) % + SB_ONDISK_JOURNAL_SIZE(sb))); + set_buffer_uptodate(tmp_bh); + page = cn->bh->b_page; + addr = kmap(page); + memcpy(tmp_bh->b_data, + addr + offset_in_page(cn->bh->b_data), + cn->bh->b_size); + kunmap(page); + mark_buffer_dirty(tmp_bh); + jindex++; + set_buffer_journal_dirty(cn->bh); + clear_buffer_journaled(cn->bh); + } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning(sb, "journal-2048", + "BAD, buffer in journal hash, " + "but not JDirty!"); + brelse(cn->bh); + } + next = cn->next; + free_cnode(sb, cn); + cn = next; + reiserfs_write_unlock(sb); + cond_resched(); + reiserfs_write_lock(sb); + } + + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
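+	 ** (the commit block is what replay uses to decide that a transaction
+	 ** is complete, so it must only reach disk after every log block has.)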
+ */ + + journal->j_current_jl = alloc_journal_list(sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &journal->j_journal_list); + list_add_tail(&jl->j_working_list, &journal->j_working_list); + journal->j_num_work_lists++; + + /* reset journal values for the next transaction */ + old_start = journal->j_start; + journal->j_start = + (journal->j_start + journal->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(sb); + atomic_set(&(journal->j_wcount), 0); + journal->j_bcount = 0; + journal->j_last = NULL; + journal->j_first = NULL; + journal->j_len = 0; + journal->j_trans_start_time = 0; + /* check for trans_id overflow */ + if (++journal->j_trans_id == 0) + journal->j_trans_id = 10; + journal->j_current_jl->j_trans_id = journal->j_trans_id; + journal->j_must_wait = 0; + journal->j_len_alloc = 0; + journal->j_next_full_flush = 0; + journal->j_next_async_flush = 0; + init_journal_hash(sb); + + // make sure reiserfs_add_jh sees the new current_jl before we + // write out the tails + smp_mb(); + + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + reiserfs_write_unlock(sb); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_tail_bh_list); + reiserfs_write_lock(sb); + } + BUG_ON(!list_empty(&jl->j_tail_bh_list)); + mutex_unlock(&jl->j_commit_mutex); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + ** + ** if we don't flush the commit list right now, we put it into + ** the work queue so the people waiting on the async progress work + ** queue don't wait for this proc to flush journal lists and such. + */ + if (flush) { + flush_commit_list(sb, jl, 1); + flush_journal_list(sb, jl, 1); + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) + queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); + + /* if the next transaction has any chance of wrapping, flush + ** transactions that might get overwritten. If any journal lists are very + ** old flush them as well. + */ + first_jl: + list_for_each_safe(entry, safe, &journal->j_journal_list) { + temp_jl = JOURNAL_LIST_ENTRY(entry); + if (journal->j_start <= temp_jl->j_start) { + if ((journal->j_start + journal->j_trans_max + 1) >= + temp_jl->j_start) { + flush_used_journal_lists(sb, temp_jl); + goto first_jl; + } else if ((journal->j_start + + journal->j_trans_max + 1) < + SB_ONDISK_JOURNAL_SIZE(sb)) { + /* if we don't cross into the next transaction and we don't + * wrap, there is no way we can overlap any later transactions + * break now + */ + break; + } + } else if ((journal->j_start + + journal->j_trans_max + 1) > + SB_ONDISK_JOURNAL_SIZE(sb)) { + if (((journal->j_start + journal->j_trans_max + 1) % + SB_ONDISK_JOURNAL_SIZE(sb)) >= + temp_jl->j_start) { + flush_used_journal_lists(sb, temp_jl); + goto first_jl; + } else { + /* we don't overlap anything from out start to the end of the + * log, and our wrapped portion doesn't overlap anything at + * the start of the log. 
We can break + */ + break; + } + } + } + flush_old_journal_lists(sb); + + journal->j_current_jl->j_list_bitmap = + get_list_bitmap(sb, journal->j_current_jl); + + if (!(journal->j_current_jl->j_list_bitmap)) { + reiserfs_panic(sb, "journal-1996", + "could not get a list bitmap"); + } + + atomic_set(&(journal->j_jlock), 0); + unlock_journal(sb); + /* wake up any body waiting to join. */ + clear_bit(J_WRITERS_QUEUED, &journal->j_state); + wake_up(&(journal->j_join_wait)); + + if (!flush && wait_on_commit && + journal_list_still_alive(sb, commit_trans_id)) { + flush_commit_list(sb, jl, 1); + } + out: + reiserfs_check_lock_depth(sb, "journal end2"); + + memset(th, 0, sizeof(*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = sb; + + return journal->j_errno; +} + +/* Send the file system read only and refuse new transactions */ +void reiserfs_abort_journal(struct super_block *sb, int errno) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + if (test_bit(J_ABORTED, &journal->j_state)) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + sb->s_flags |= MS_RDONLY; + set_bit(J_ABORTED, &journal->j_state); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif +} + diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c new file mode 100644 index 00000000..03d85cbf --- /dev/null +++ b/fs/reiserfs/lbalance.c @@ -0,0 +1,1311 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +/* these are used in do_balance.c */ + +/* leaf_move_items + leaf_shift_left + leaf_shift_right + leaf_delete_items + leaf_insert_into_buf + leaf_paste_in_buffer + leaf_cut_from_buffer + leaf_paste_entries + */ + +/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ +static void leaf_copy_dir_entries(struct buffer_info *dest_bi, + struct buffer_head *source, int last_first, + int item_num, int from, int copy_count) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int item_num_in_dest; /* either the number of target item, + or if we must create a new item, + the number of the item we will + create it next to */ + struct item_head *ih; + struct reiserfs_de_head *deh; + int copy_records_len; /* length of all records in item to be copied */ + char *records; + + ih = B_N_PITEM_HEAD(source, item_num); + + RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); + + /* length of all record to be copied and first byte of the last of them */ + deh = B_I_DEH(source, ih); + if (copy_count) { + copy_records_len = (from ? deh_location(&(deh[from - 1])) : + ih_item_len(ih)) - + deh_location(&(deh[from + copy_count - 1])); + records = + source->b_data + ih_location(ih) + + deh_location(&(deh[from + copy_count - 1])); + } else { + copy_records_len = 0; + records = NULL; + } + + /* when copy last to first, dest buffer can contain 0 items */ + item_num_in_dest = + (last_first == + LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 
0 : -1) : (B_NR_ITEMS(dest) + - 1); + + /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + if ((item_num_in_dest == -1) || + (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || + (last_first == LAST_TO_FIRST + && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, + B_N_PKEY(dest, + item_num_in_dest)))) + { + /* create new item in dest */ + struct item_head new_ih; + + /* form item header */ + memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE); + put_ih_version(&new_ih, KEY_FORMAT_3_5); + /* calculate item len */ + put_ih_item_len(&new_ih, + DEH_SIZE * copy_count + copy_records_len); + put_ih_entry_count(&new_ih, 0); + + if (last_first == LAST_TO_FIRST) { + /* form key by the following way */ + if (from < I_ENTRY_COUNT(ih)) { + set_le_ih_k_offset(&new_ih, + deh_offset(&(deh[from]))); + /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */ + } else { + /* no entries will be copied to this item in this function */ + set_le_ih_k_offset(&new_ih, U32_MAX); + /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + } + set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key), + TYPE_DIRENTRY); + } + + /* insert item into dest buffer */ + leaf_insert_into_buf(dest_bi, + (last_first == + LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), + &new_ih, NULL, 0); + } else { + /* prepare space for entries */ + leaf_paste_in_buffer(dest_bi, + (last_first == + FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - + 1) : 0, MAX_US_INT, + DEH_SIZE * copy_count + copy_records_len, + records, 0); + } + + item_num_in_dest = + (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; + + leaf_paste_entries(dest_bi, item_num_in_dest, + (last_first == + FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, + item_num_in_dest)) + : 0, copy_count, deh + from, records, + DEH_SIZE * copy_count + copy_records_len); +} + +/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or + part of it or nothing (see the return 0 below) from SOURCE to the end + (if last_first) or beginning (!last_first) of the DEST */ +/* returns 1 if anything was copied, else 0 */ +static int leaf_copy_boundary_item(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int bytes_or_entries) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + struct item_head *ih; + struct item_head *dih; + + dest_nr_item = B_NR_ITEMS(dest); + + if (last_first == FIRST_TO_LAST) { + /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects + or of different types ) then there is no need to treat this item differently from the other items + that we copy, so we return */ + ih = B_N_PITEM_HEAD(src, 0); + dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1); + if (!dest_nr_item + || (!op_is_left_mergeable(&(ih->ih_key), src->b_size))) + /* there is nothing to merge */ + return 0; + + RFALSE(!ih_item_len(ih), + "vs-10010: item can not have empty length"); + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* copy all entries to dest */ + bytes_or_entries = ih_entry_count(ih); + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0, + bytes_or_entries); + return 1; + } + + /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST + part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item 
header + */ + if (bytes_or_entries == -1) + bytes_or_entries = ih_item_len(ih); + +#ifdef CONFIG_REISERFS_CHECK + else { + if (bytes_or_entries == ih_item_len(ih) + && is_indirect_le_ih(ih)) + if (get_ih_free_space(ih)) + reiserfs_panic(sb_from_bi(dest_bi), + "vs-10020", + "last unformatted node " + "must be filled " + "entirely (%h)", ih); + } +#endif + + /* merge first item (or its part) of src buffer with the last + item of dest buffer. Both are of the same file */ + leaf_paste_in_buffer(dest_bi, + dest_nr_item - 1, ih_item_len(dih), + bytes_or_entries, B_I_PITEM(src, ih), 0); + + if (is_indirect_le_ih(dih)) { + RFALSE(get_ih_free_space(dih), + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", + ih); + if (bytes_or_entries == ih_item_len(ih)) + set_ih_free_space(dih, get_ih_free_space(ih)); + } + + return 1; + } + + /* copy boundary item to right (last_first == LAST_TO_FIRST) */ + + /* ( DEST is empty or last item of SOURCE and first item of DEST + are the items of different object or of different types ) + */ + src_nr_item = B_NR_ITEMS(src); + ih = B_N_PITEM_HEAD(src, src_nr_item - 1); + dih = B_N_PITEM_HEAD(dest, 0); + + if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size)) + return 0; + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* bytes_or_entries = entries number in last item body of SOURCE */ + bytes_or_entries = ih_entry_count(ih); + + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + src_nr_item - 1, + ih_entry_count(ih) - bytes_or_entries, + bytes_or_entries); + return 1; + } + + /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; + part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; + don't create new item header + */ + + RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", + ih); + + if (bytes_or_entries == -1) { + /* bytes_or_entries = length of last item body of SOURCE */ + bytes_or_entries = ih_item_len(ih); + + RFALSE(le_ih_k_offset(dih) != + le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size), + "vs-10050: items %h and %h do not match", ih, dih); + + /* change first item key of the DEST */ + set_le_ih_k_offset(dih, le_ih_k_offset(ih)); + + /* item becomes non-mergeable */ + /* or mergeable if left item was */ + set_le_ih_k_type(dih, le_ih_k_type(ih)); + } else { + /* merge to right only part of item */ + RFALSE(ih_item_len(ih) <= bytes_or_entries, + "vs-10060: no so much bytes %lu (needed %lu)", + (unsigned long)ih_item_len(ih), + (unsigned long)bytes_or_entries); + + /* change first item key of the DEST */ + if (is_direct_le_ih(dih)) { + RFALSE(le_ih_k_offset(dih) <= + (unsigned long)bytes_or_entries, + "vs-10070: dih %h, bytes_or_entries(%d)", dih, + bytes_or_entries); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + bytes_or_entries); + } else { + RFALSE(le_ih_k_offset(dih) <= + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, + "vs-10080: dih %h, bytes_or_entries(%d)", + dih, + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + ((bytes_or_entries / UNFM_P_SIZE) * + dest->b_size)); + } + } + + leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, + B_I_PITEM(src, + ih) + ih_item_len(ih) - bytes_or_entries, + 0); + return 1; +} + +/* copy cpy_mun items from buffer src to buffer dest + * 
last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest + */ +static void leaf_copy_items_entirely(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int first, int cpy_num) +{ + struct buffer_head *dest; + int nr, free_space; + int dest_before; + int last_loc, last_inserted_loc, location; + int i, j; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, + "vs-10090: bad last_first parameter %d", last_first); + RFALSE(B_NR_ITEMS(src) - first < cpy_num, + "vs-10100: too few items in source %d, required %d from %d", + B_NR_ITEMS(src), cpy_num, first); + RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items"); + RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items"); + + dest = dest_bi->bi_bh; + + RFALSE(!dest, "vs-10130: can not copy negative amount of items"); + + if (cpy_num == 0) + return; + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; + + /* location of head of first new item */ + ih = B_N_PITEM_HEAD(dest, dest_before); + + RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, + "vs-10140: not enough free space for headers %d (needed %d)", + B_FREE_SPACE(dest), cpy_num * IH_SIZE); + + /* prepare space for headers */ + memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); + + /* copy item headers */ + memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE); + + free_space -= (IH_SIZE * cpy_num); + set_blkh_free_space(blkh, free_space); + + /* location of unmovable item */ + j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1); + for (i = dest_before; i < nr + cpy_num; i++) { + location -= ih_item_len(ih + i - dest_before); + put_ih_location(ih + i - dest_before, location); + } + + /* prepare space for items */ + last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before])); + last_inserted_loc = ih_location(&(ih[cpy_num - 1])); + + /* check free space */ + RFALSE(free_space < j - last_inserted_loc, + "vs-10150: not enough free space for items %d (needed %d)", + free_space, j - last_inserted_loc); + + memmove(dest->b_data + last_loc, + dest->b_data + last_loc + j - last_inserted_loc, + last_inserted_loc - last_loc); + + /* copy items */ + memcpy(dest->b_data + last_inserted_loc, + B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc); + + /* sizes, item number */ + set_blkh_nr_item(blkh, nr + cpy_num); + set_blkh_free_space(blkh, free_space - (j - last_inserted_loc)); + + do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + RFALSE(dc_block_number(t_dc) != dest->b_blocknr, + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", + (long unsigned)dest->b_blocknr, + (long unsigned)dc_block_number(t_dc)); + put_dc_size(t_dc, + dc_size(t_dc) + (j - last_inserted_loc + + IH_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* This function splits the (liquid) item into two items (useful when + shifting part of an item into another node.) 
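+   cpy_bytes is a byte count for direct and indirect items and an entry
+   count for directory items; passing -1 is not allowed here (see the
+   RFALSE below), whole items are handled by leaf_copy_items_entirely.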
*/ +static void leaf_item_bottle(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int item_num, int cpy_bytes) +{ + struct buffer_head *dest = dest_bi->bi_bh; + struct item_head *ih; + + RFALSE(cpy_bytes == -1, + "vs-10170: bytes == - 1 means: do not split item"); + + if (last_first == FIRST_TO_LAST) { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, + item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST + part defined by 'cpy_bytes'; create new item header; change old item_header (????); + n_ih = new item_header; + */ + memcpy(&n_ih, ih, IH_SIZE); + put_ih_item_len(&n_ih, cpy_bytes); + if (is_indirect_le_ih(ih)) { + RFALSE(cpy_bytes == ih_item_len(ih) + && get_ih_free_space(ih), + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + (long unsigned)get_ih_free_space(ih)); + set_ih_free_space(&n_ih, 0); + } + + RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size), + "vs-10190: bad mergeability of item %h", ih); + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, + B_N_PITEM(src, item_num), 0); + } + } else { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + item_num, + I_ENTRY_COUNT(ih) - cpy_bytes, + cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST + part defined by 'cpy_bytes'; create new item header; + n_ih = new item_header; + */ + memcpy(&n_ih, ih, SHORT_KEY_SIZE); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + if (is_direct_le_ih(ih)) { + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + ih_item_len(ih) - cpy_bytes); + set_le_ih_k_type(&n_ih, TYPE_DIRECT); + set_ih_free_space(&n_ih, MAX_US_INT); + } else { + /* indirect item */ + RFALSE(!cpy_bytes && get_ih_free_space(ih), + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + (ih_item_len(ih) - + cpy_bytes) / UNFM_P_SIZE * + dest->b_size); + set_le_ih_k_type(&n_ih, TYPE_INDIRECT); + set_ih_free_space(&n_ih, get_ih_free_space(ih)); + } + + /* set item length */ + put_ih_item_len(&n_ih, cpy_bytes); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + leaf_insert_into_buf(dest_bi, 0, &n_ih, + B_N_PITEM(src, + item_num) + + ih_item_len(ih) - cpy_bytes, 0); + } + } +} + +/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. + If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. + From last item copy cpy_num bytes for regular item and cpy_num directory entries for + directory item. 
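+   e.g. cpy_num == 3, cpy_bytes == 200: two items are copied whole and
+   only 200 bytes (or, for a directory item, 200 entries) of the
+   remaining item are moved.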
*/ +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, + int last_first, int cpy_num, int cpy_bytes) +{ + struct buffer_head *dest; + int pos, i, src_nr_item, bytes; + + dest = dest_bi->bi_bh; + RFALSE(!dest || !src, "vs-10210: !dest || !src"); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); + RFALSE(B_NR_ITEMS(src) < cpy_num, + "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), + cpy_num); + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); + + if (cpy_num == 0) + return 0; + + if (last_first == FIRST_TO_LAST) { + /* copy items to left */ + pos = 0; + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); + cpy_num -= i; + if (cpy_num == 0) + return i; + pos += i; + if (cpy_bytes == -1) + /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num); + else { + /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num - 1); + + /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, + cpy_num + pos - 1, cpy_bytes); + } + } else { + /* copy items to right */ + src_nr_item = B_NR_ITEMS(src); + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); + + cpy_num -= i; + if (cpy_num == 0) + return i; + + pos = src_nr_item - cpy_num - i; + if (cpy_bytes == -1) { + /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos, cpy_num); + } else { + /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos + 1, cpy_num - 1); + + /* copy part of the item which number is pos to the begin of the DEST */ + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, + cpy_bytes); + } + } + return i; +} + +/* there are types of coping: from S[0] to L[0], from S[0] to R[0], + from R[0] to L[0]. 
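The helper that follows turns each shift mode into a concrete (source, destination, direction) triple. A hypothetical lookup-table rendering of the same information, condensed from the switch below (the real code fills in buffer_info structures rather than strings):

#include <stdio.h>

/* S[0] is PATH_PLAST_BUFFER(tb->tb_path), L[0]/R[0] the left/right
 * neighbors, Snew a freshly allocated leaf. */
static const struct {
	const char *mode, *src, *dest, *direction;
} toy_shift_modes[] = {
	{ "LEAF_FROM_S_TO_L",    "S[0]", "L[0]", "FIRST_TO_LAST" },
	{ "LEAF_FROM_S_TO_R",    "S[0]", "R[0]", "LAST_TO_FIRST" },
	{ "LEAF_FROM_R_TO_L",    "R[0]", "L[0]", "FIRST_TO_LAST" },
	{ "LEAF_FROM_L_TO_R",    "L[0]", "R[0]", "LAST_TO_FIRST" },
	{ "LEAF_FROM_S_TO_SNEW", "S[0]", "Snew", "LAST_TO_FIRST" },
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(toy_shift_modes) / sizeof(toy_shift_modes[0]); i++)
		printf("%-22s %s -> %s (%s)\n", toy_shift_modes[i].mode,
		       toy_shift_modes[i].src, toy_shift_modes[i].dest,
		       toy_shift_modes[i].direction);
	return 0;
}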
for each of these we have to define parent and + positions of destination and source buffers */ +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *first_last, + struct buffer_head *Snew) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */ + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = NULL; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic(sb_from_bi(src_bi), "vs-10250", + "shift type is unknown (%d)", shift_mode); + } + RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); +} + +/* copy mov_num items and mov_bytes of the (mov_num-1)th item to + neighbor. Delete them from source */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct buffer_head *Snew) +{ + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; + + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, + &first_last, Snew); + + ret_value = + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, + mov_bytes); + + leaf_delete_items(&src_bi, first_last, + (first_last == + FIRST_TO_LAST) ? 
0 : (B_NR_ITEMS(src_bi.bi_bh) - + mov_num), mov_num, mov_bytes); + + return ret_value; +} + +/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) + from S[0] to L[0] and replace the delimiting key */ +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); + int i; + + /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); + + if (shift_num) { + if (B_NR_ITEMS(S0) == 0) { /* number of items in S[0] == 0 */ + + RFALSE(shift_bytes != -1, + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", + shift_bytes); +#ifdef CONFIG_REISERFS_CHECK + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb("vs-10275"); + reiserfs_panic(tb->tb_sb, "vs-10275", + "balance condition corrupted " + "(%c)", tb->tb_mode); + } +#endif + + if (PATH_H_POSITION(tb->tb_path, 1) == 0) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); + + RFALSE((shift_bytes != -1 && + !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0)) + && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) && + (!op_is_left_mergeable + (B_N_PKEY(S0, 0), S0->b_size)), + "vs-10280: item must be mergeable"); + } + } + + return i; +} + +/* CLEANING STOPPED HERE */ + +/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int ret_value; + + /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + ret_value = + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); + + /* replace rkey in CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + } + + return ret_value; +} + +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num); +/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. + If not. + If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of + the first item. Part defined by del_bytes. Don't delete first item header + If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of + the last item . Part defined by del_bytes. Don't delete last item header. +*/ +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, + int first, int del_num, int del_bytes) +{ + struct buffer_head *bh; + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); + + RFALSE(!bh, "10155: bh is not defined"); + RFALSE(del_num < 0, "10160: del_num can not be < 0. 
del_num==%d", + del_num); + RFALSE(first < 0 + || first + del_num > item_amount, + "10165: invalid number of first item to be deleted (%d) or " + "no so much items (%d) to delete (only %d)", first, + first + del_num, item_amount); + + if (del_num == 0) + return; + + if (first == 0 && del_num == item_amount && del_bytes == -1) { + make_empty_node(cur_bi); + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); + return; + } + + if (del_bytes == -1) + /* delete del_num items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num); + else { + if (last_first == FIRST_TO_LAST) { + /* delete del_num-1 items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num - 1); + + /* delete the part of the first item of the bh + do not delete item header + */ + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); + } else { + struct item_head *ih; + int len; + + /* delete del_num-1 items beginning from item in position first+1 */ + leaf_delete_items_entirely(cur_bi, first + 1, + del_num - 1); + + ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); + if (is_direntry_le_ih(ih)) + /* the last item is directory */ + /* len = numbers of directory entries in this item */ + len = ih_entry_count(ih); + else + /* len = body len of item */ + len = ih_item_len(ih); + + /* delete the part of the last item of the bh + do not delete item header + */ + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, + len - del_bytes, del_bytes); + } + } +} + +/* insert item into the leaf node in position before */ +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head *inserted_item_ih, + const char *inserted_item_body, int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + char *to; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, + "vs-10170: not enough free space in block %z, new item %h", + bh, inserted_item_ih); + RFALSE(zeros_number > ih_item_len(inserted_item_ih), + "vs-10172: zero number == %d, item length == %d", + zeros_number, ih_item_len(inserted_item_ih)); + + /* get item new item must be inserted before */ + ih = B_N_PITEM_HEAD(bh, before); + + /* prepare space for the body of new item */ + last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size; + unmoved_loc = before ? 
ih_location(ih - 1) : bh->b_size; + + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); + memset(to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove(to, inserted_item_body, + ih_item_len(inserted_item_ih) - zeros_number); + else + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); + + /* insert item header */ + memmove(ih + 1, ih, IH_SIZE * (nr - before)); + memmove(ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i++) { + unmoved_loc -= ih_item_len(&(ih[i - before])); + put_ih_location(&(ih[i - before]), unmoved_loc); + } + + /* sizes, free space, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, + free_space - (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_leaf_dirty(bi->tb, bh, 1); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* paste paste_size bytes to affected_item_num-th item. + When item is a directory, this only prepare space for new entries */ +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, + int pos_in_item, int paste_size, + const char *body, int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < paste_size, + "vs-10175: not enough free space: needed %d, available %d", + paste_size, free_space); + +#ifdef CONFIG_REISERFS_CHECK + if (zeros_number > paste_size) { + struct super_block *sb = NULL; + if (bi && bi->tb) + sb = bi->tb->tb_sb; + print_cur_tb("10177"); + reiserfs_panic(sb, "vs-10177", + "zeros_number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* item to be appended */ + ih = B_N_PITEM_HEAD(bh, affected_item_num); + + last_loc = ih_location(&(ih[nr - affected_item_num - 1])); + unmoved_loc = affected_item_num ? 
ih_location(ih - 1) : bh->b_size; + + /* prepare space */ + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + /* change locations */ + for (i = affected_item_num; i < nr; i++) + put_ih_location(&(ih[i - affected_item_num]), + ih_location(&(ih[i - affected_item_num])) - + paste_size); + + if (body) { + if (!is_direntry_le_ih(ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove(bh->b_data + ih_location(ih) + + paste_size, + bh->b_data + ih_location(ih), + ih_item_len(ih)); + /* paste data in the head of item */ + memset(bh->b_data + ih_location(ih), 0, + zeros_number); + memcpy(bh->b_data + ih_location(ih) + + zeros_number, body, + paste_size - zeros_number); + } else { + memset(bh->b_data + unmoved_loc - paste_size, 0, + zeros_number); + memcpy(bh->b_data + unmoved_loc - paste_size + + zeros_number, body, + paste_size - zeros_number); + } + } + } else + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); + + put_ih_item_len(ih, ih_item_len(ih) + paste_size); + + /* change free space */ + set_blkh_free_space(blkh, free_space - paste_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + paste_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + does not have free space, so it moves DEHs and remaining records as + necessary. Return value is size of removed part of directory item + in bytes. */ +static int leaf_cut_entries(struct buffer_head *bh, + struct item_head *ih, int from, int del_count) +{ + char *item; + struct reiserfs_de_head *deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char *prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + /* make sure, that item is directory and there are enough entries to + remove */ + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); + RFALSE(I_ENTRY_COUNT(ih) < from + del_count, + "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", + I_ENTRY_COUNT(ih), from, del_count); + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* first byte of remaining entries, those are BEFORE cut entries + (prev_record) and length of all removed records (cut_records_len) */ + prev_record_offset = + (from ? 
deh_location(&(deh[from - 1])) : ih_item_len(ih)); + cut_records_len = prev_record_offset /*from_record */ - + deh_location(&(deh[from + del_count - 1])); + prev_record = item + prev_record_offset; + + /* adjust locations of remaining entries */ + for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&(deh[i]), + deh_location(&deh[i]) - + (DEH_SIZE * del_count)); + + for (i = 0; i < from; i++) + put_deh_location(&(deh[i]), + deh_location(&deh[i]) - (DEH_SIZE * del_count + + cut_records_len)); + + put_ih_entry_count(ih, ih_entry_count(ih) - del_count); + + /* shift entry head array and entries those are AFTER removed entries */ + memmove((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove(prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih_item_len(ih) - prev_record); + + return DEH_SIZE * del_count + cut_records_len; +} + +/* when cut item is part of regular file + pos_in_item - first byte that must be cut + cut_size - number of bytes to be cut beginning from pos_in_item + + when cut item is part of directory + pos_in_item - number of first deleted entry + cut_size - count of deleted entries + */ +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size) +{ + int nr; + struct buffer_head *bh = bi->bi_bh; + struct block_head *blkh; + struct item_head *ih; + int last_loc, unmoved_loc; + int i; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + /* item head of truncated item */ + ih = B_N_PITEM_HEAD(bh, cut_item_num); + + if (is_direntry_le_ih(ih)) { + /* first cut entry () */ + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ + RFALSE(cut_item_num, + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", + cut_item_num); + /* change item key by key of first entry in the item */ + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); + /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */ + } + } else { + /* item is direct or indirect */ + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + (long unsigned)pos_in_item, (long unsigned)cut_size, + (long unsigned)ih_item_len(ih)); + + /* shift item body to left if cut is from the head of item */ + if (pos_in_item == 0) { + memmove(bh->b_data + ih_location(ih), + bh->b_data + ih_location(ih) + cut_size, + ih_item_len(ih) - cut_size); + + /* change key of item */ + if (is_direct_le_ih(ih)) + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + cut_size); + else { + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (cut_size / UNFM_P_SIZE) * + bh->b_size); + RFALSE(ih_item_len(ih) == cut_size + && get_ih_free_space(ih), + "10205: invalid ih_free_space (%h)", ih); + } + } + } + + /* location of the last item */ + last_loc = ih_location(&(ih[nr - cut_item_num - 1])); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? 
ih_location(ih - 1) : bh->b_size; + + /* shift */ + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ + put_ih_item_len(ih, ih_item_len(ih) - cut_size); + + if (is_indirect_le_ih(ih)) { + if (pos_in_item) + set_ih_free_space(ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i++) + put_ih_location(&(ih[i - cut_item_num]), + ih_location(&ih[i - cut_item_num]) + cut_size); + + /* size, free space */ + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) - cut_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* delete del_num items from buffer starting from the first'th item */ +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num) +{ + struct buffer_head *bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(bh == NULL, "10210: buffer is 0"); + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); + + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + RFALSE(first < 0 || first + del_num > nr, + "10220: first=%d, number=%d, there is %d items", first, del_num, + nr); + + if (first == 0 && del_num == nr) { + /* this does not work */ + make_empty_node(bi); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + return; + } + + ih = B_N_PITEM_HEAD(bh, first); + + /* location of unmovable item */ + j = (first == 0) ? bh->b_size : ih_location(ih - 1); + + /* delete items */ + last_loc = ih_location(&(ih[nr - 1 - first])); + last_removed_loc = ih_location(&(ih[del_num - 1])); + + memmove(bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i++) + put_ih_location(&(ih[i - first]), + ih_location(&(ih[i - first])) + (j - + last_removed_loc)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + (j - last_removed_loc + + IH_SIZE * del_num)); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (j - last_removed_loc + + IH_SIZE * del_num)); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ +void leaf_paste_entries(struct buffer_info *bi, + int item_num, + int before, + int new_entry_count, + struct reiserfs_de_head *new_dehs, + const char *records, int paste_size) +{ + struct item_head *ih; + char *item; + struct reiserfs_de_head *deh; + char *insert_point; + int i, old_entry_num; + struct buffer_head *bh = bi->bi_bh; + + if (new_entry_count == 0) + return; + + ih = B_N_PITEM_HEAD(bh, item_num); + + /* make sure, that item is directory, and there are enough records in it */ + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); + RFALSE(I_ENTRY_COUNT(ih) < before, + "10230: there are no entry we paste entries before. 
entry_count = %d, before = %d", + I_ENTRY_COUNT(ih), before); + + /* first byte of dest item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* new records will be pasted at this point */ + insert_point = + item + + (before ? deh_location(&(deh[before - 1])) + : (ih_item_len(ih) - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--) + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + + (DEH_SIZE * new_entry_count)); + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i++) + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + paste_size); + + old_entry_num = I_ENTRY_COUNT(ih); + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); + + /* prepare space for pasted records */ + memmove(insert_point + paste_size, insert_point, + item + (ih_item_len(ih) - paste_size) - insert_point); + + /* copy new records */ + memcpy(insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove((char *)(deh + new_entry_count), deh, + insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i++) { + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + + (-deh_location + (&(new_dehs[new_entry_count - 1])) + + insert_point + DEH_SIZE * new_entry_count - + item)); + } + + /* change item key if necessary (when we paste before 0-th entry */ + if (!before) { + set_le_ih_k_offset(ih, deh_offset(new_dehs)); +/* memcpy (&ih->ih_key.k_offset, + &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ + } +#ifdef CONFIG_REISERFS_CHECK + { + int prev, next; + /* check record locations */ + deh = B_I_DEH(bh, ih); + for (i = 0; i < I_ENTRY_COUNT(ih); i++) { + next = + (i < + I_ENTRY_COUNT(ih) - + 1) ? deh_location(&(deh[i + 1])) : 0; + prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; + + if (prev && prev <= deh_location(&(deh[i]))) + reiserfs_error(sb_from_bi(bi), "vs-10240", + "directory item (%h) " + "corrupted (prev %a, " + "cur(%d) %a)", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh_location(&(deh[i]))) + reiserfs_error(sb_from_bi(bi), "vs-10250", + "directory item (%h) " + "corrupted (cur(%d) %a, " + "next %a)", + ih, i, deh + i, deh + i + 1); + } + } +#endif + +} diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c new file mode 100644 index 00000000..7df1ce48 --- /dev/null +++ b/fs/reiserfs/lock.c @@ -0,0 +1,97 @@ +#include +#include + +/* + * The previous reiserfs locking scheme was heavily based on + * the tricky properties of the Bkl: + * + * - it was acquired recursively by a same task + * - the performances relied on the release-while-schedule() property + * + * Now that we replace it by a mutex, we still want to keep the same + * recursive property to avoid big changes in the code structure. + * We use our own lock_owner here because the owner field on a mutex + * is only available in SMP or mutex debugging, also we only need this field + * for this mutex, no need for a system wide mutex facility. + * + * Also this lock is often released before a call that could block because + * reiserfs performances were partially based on the release while schedule() + * property of the Bkl. 
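The pattern described here is an owner-tracked, depth-counted mutex: only a thread that does not already own the lock takes the underlying mutex, and only the release that brings the depth back to its starting value gives it up. A user-space analogue of that idea, using pthreads and a counter that starts at zero instead of the kernel's -1 convention; the toy_* names are illustrative, not the reiserfs ones:

#include <pthread.h>
#include <stdio.h>

struct toy_lock {
	pthread_mutex_t mutex;
	pthread_t owner;
	int owner_valid;
	int depth;
};

static void toy_lock_acquire(struct toy_lock *l)
{
	/* A stale read of owner can only name some other thread (the current
	 * thread is the only writer of its own identity), so a mismatch just
	 * falls through to a normal blocking lock. */
	if (!l->owner_valid || !pthread_equal(l->owner, pthread_self())) {
		pthread_mutex_lock(&l->mutex);
		l->owner = pthread_self();
		l->owner_valid = 1;
	}
	/* only the owning thread touches depth, so no extra protection needed */
	l->depth++;
}

static void toy_lock_release(struct toy_lock *l)
{
	if (--l->depth == 0) {
		l->owner_valid = 0;
		pthread_mutex_unlock(&l->mutex);
	}
}

int main(void)
{
	struct toy_lock l = { .mutex = PTHREAD_MUTEX_INITIALIZER };

	toy_lock_acquire(&l);
	toy_lock_acquire(&l);   /* recursive acquisition: depth == 2 */
	printf("depth after two acquisitions: %d\n", l.depth);
	toy_lock_release(&l);
	toy_lock_release(&l);   /* depth back to 0, mutex released   */
	return 0;
}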
+ */ +void reiserfs_write_lock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + } + + /* No need to protect it, only the current task touches it */ + sb_i->lock_depth++; +} + +void reiserfs_write_unlock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* + * Are we unlocking without even holding the lock? + * Such a situation must raise a BUG() if we don't want + * to corrupt the data. + */ + BUG_ON(sb_i->lock_owner != current); + + if (--sb_i->lock_depth == -1) { + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); + } +} + +/* + * If we already own the lock, just exit and don't increase the depth. + * Useful when we don't want to lock more than once. + * + * We always return the lock_depth we had before calling + * this function. + */ +int reiserfs_write_lock_once(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + return sb_i->lock_depth++; + } + + return sb_i->lock_depth; +} + +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) +{ + if (lock_depth == -1) + reiserfs_write_unlock(s); +} + +/* + * Utility function to force a BUG if it is called without the superblock + * write lock held. caller is the string printed just before calling BUG() + */ +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + if (sb_i->lock_depth < 0) + reiserfs_panic(sb, "%s called without kernel lock held %d", + caller); +} + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c new file mode 100644 index 00000000..11866269 --- /dev/null +++ b/fs/reiserfs/namei.c @@ -0,0 +1,1562 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. + * + * NO WARRANTY + */ + +#include +#include +#include +#include +#include +#include +#include + +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); + +// directory item contains array of entry headers. This performs +// binary search through that array +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) +{ + struct item_head *ih = de->de_ih; + struct reiserfs_de_head *deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = I_ENTRY_COUNT(ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; + j = (rbound + lbound) / 2) { + if (off < deh_offset(deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset(deh + j)) { + lbound = j + 1; + continue; + } + // this is not name found, but matched third key component + de->de_entry_num = j; + return NAME_FOUND; + } + + de->de_entry_num = lbound; + return NAME_NOT_FOUND; +} + +// comment? 
maybe something like set de to point to what the path points to? +static inline void set_de_item_location(struct reiserfs_dir_entry *de, + struct treepath *path) +{ + de->de_bh = get_last_bh(path); + de->de_ih = get_ih(path); + de->de_deh = B_I_DEH(de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION(path); +} + +// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); + de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen(de->de_name); +} + +// what entry points to +static inline void set_de_object_key(struct reiserfs_dir_entry *de) +{ + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); + de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); +} + +static inline void store_de_entry_key(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + /* store key of the found entry */ + de->de_entry_key.version = KEY_FORMAT_3_5; + de->de_entry_key.on_disk_key.k_dir_id = + le32_to_cpu(de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = + le32_to_cpu(de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh)); + set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY); +} + +/* We assign a key to each directory item, and place multiple entries +in a single directory item. A directory item has a key equal to the +key of the first directory entry in it. + +This function first calls search_by_key, then, if item whose first +entry matches is not found it looks for the entry inside directory +item found by search_by_key. Fills the path to the entry, and to the +entry position in the item + +*/ + +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *path, struct reiserfs_dir_entry *de) +{ + int retval; + + retval = search_item(sb, key, path); + switch (retval) { + case ITEM_NOT_FOUND: + if (!PATH_LAST_POSITION(path)) { + reiserfs_error(sb, "vs-7000", "search_by_key " + "returned item position == 0"); + pathrelse(path); + return IO_ERROR; + } + PATH_LAST_POSITION(path)--; + + case ITEM_FOUND: + break; + + case IO_ERROR: + return retval; + + default: + pathrelse(path); + reiserfs_error(sb, "vs-7002", "no path to here"); + return IO_ERROR; + } + + set_de_item_location(de, path); + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direntry_le_ih(de->de_ih) || + COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { + print_block(de->de_bh, 0, -1, -1); + reiserfs_panic(sb, "vs-7005", "found item %h is not directory " + "item or does not belong to the same directory " + "as key %K", de->de_ih, key); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* binary search in directory item by third componen t of the + key. 
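The call that follows resolves the third key component with a conventional binary search over the sorted array of entry offsets; on a miss it still reports the position where the offset would have to be inserted, which is what the caller stores in de->de_entry_num. A standalone sketch of that contract (toy_* names are illustrative):

#include <stdio.h>

#define TOY_FOUND     1
#define TOY_NOT_FOUND 0

/* Binary search over a sorted array of entry offsets.  On a miss, *pos is
 * the index where the offset would be inserted. */
static int toy_search_offsets(const unsigned off[], int count,
			      unsigned wanted, int *pos)
{
	int lo = 0, hi = count - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (wanted < off[mid])
			hi = mid - 1;
		else if (wanted > off[mid])
			lo = mid + 1;
		else {
			*pos = mid;
			return TOY_FOUND;
		}
	}
	*pos = lo;
	return TOY_NOT_FOUND;
}

int main(void)
{
	unsigned off[] = { 1, 128, 300, 4096 };
	int pos;

	printf("300  -> %d at %d\n", toy_search_offsets(off, 4, 300, &pos), pos);
	printf("2000 -> %d at %d\n", toy_search_offsets(off, 4, 2000, &pos), pos);
	return 0;
}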
sets de->de_entry_num of de */ + retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); + path->pos_in_item = de->de_entry_num; + if (retval != NAME_NOT_FOUND) { + // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + set_de_name_and_namelen(de); + set_de_object_key(de); + } + return retval; +} + +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ + +/* The third component is hashed, and you can choose from more than + one hash function. Per directory hashes are not yet implemented + but are thought about. This function should be moved to hashes.c + Jedi, please do so. -Hans */ + +static __u32 get_third_component(struct super_block *s, + const char *name, int len) +{ + __u32 res; + + if (!len || (len == 1 && name[0] == '.')) + return DOT_OFFSET; + if (len == 2 && name[0] == '.' && name[1] == '.') + return DOT_DOT_OFFSET; + + res = REISERFS_SB(s)->s_hash_function(name, len); + + // take bits from 7-th to 30-th including both bounds + res = GET_HASH_VALUE(res); + if (res == 0) + // needed to have no names before "." and ".." those have hash + // value == 0 and generation conters 1 and 2 accordingly + res = 128; + return res + MAX_GENERATION_NUMBER; +} + +static int reiserfs_match(struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + int retval = NAME_NOT_FOUND; + + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = + (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND : + NAME_FOUND_INVISIBLE); + + return retval; +} + +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ + + /* used when hash collisions exist */ + +static int linear_search_in_dir_item(struct cpu_key *key, + struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + struct reiserfs_de_head *deh = de->de_deh; + int retval; + int i; + + i = de->de_entry_num; + + if (i == I_ENTRY_COUNT(de->de_ih) || + GET_HASH_VALUE(deh_offset(deh + i)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + i--; + } + + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih), + "vs-7010: array of entry headers not found"); + + deh += i; + + for (; i >= 0; i--, deh--) { + if (GET_HASH_VALUE(deh_offset(deh)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + // hash value does not match, no need to check whole name + return NAME_NOT_FOUND; + } + + /* mark, that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), + de->de_gen_number_bit_string); + + // calculate pointer to name and namelen + de->de_entry_num = i; + set_de_name_and_namelen(de); + + if ((retval = + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { + // de's de_name, de_namelen, de_recordlen are set. Fill the rest: + + // key of pointed object + set_de_object_key(de); + + store_de_entry_key(de); + + // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + return retval; + } + } + + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) + /* we have reached left most entry in the node. In common we + have to go to the left neighbor, but if generation counter + is 0 already, we know for sure, that there is no name with + the same hash value */ + // FIXME: this work correctly only because hash value can not + // be 0. 
Btw, in case of Yura's hash it is probably possible, + // so, this is a bug + return NAME_NOT_FOUND; + + RFALSE(de->de_item_num, + "vs-7015: two diritems of the same directory in one node?"); + + return GOTO_PREVIOUS_ITEM; +} + +// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND +// FIXME: should add something like IOERROR +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, + struct treepath *path_to_entry, + struct reiserfs_dir_entry *de) +{ + struct cpu_key key_to_search; + int retval; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key(&key_to_search, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + while (1) { + retval = + search_by_entry_key(dir->i_sb, &key_to_search, + path_to_entry, de); + if (retval == IO_ERROR) { + reiserfs_error(dir->i_sb, "zam-7001", "io error"); + return IO_ERROR; + } + + /* compare names for all entries having given hash value */ + retval = + linear_search_in_dir_item(&key_to_search, de, name, + namelen); + if (retval != GOTO_PREVIOUS_ITEM) { + /* there is no need to scan directory anymore. Given entry found or does not exist */ + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* there is left neighboring item of this directory and given entry can be there */ + set_cpu_key_k_offset(&key_to_search, + le_ih_k_offset(de->de_ih) - 1); + pathrelse(path_to_entry); + + } /* while (1) */ +} + +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int retval; + int lock_depth; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) + return ERR_PTR(-ENAMETOOLONG); + + /* + * Might be called with or without the write lock, must be careful + * to not recursively hold it in case we want to release the lock + * before rescheduling. + */ + lock_depth = reiserfs_write_lock_once(dir->i_sb); + + de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval == NAME_FOUND) { + inode = reiserfs_iget(dir->i_sb, + (struct cpu_key *)&(de.de_dir_id)); + if (!inode || IS_ERR(inode)) { + reiserfs_write_unlock_once(dir->i_sb, lock_depth); + return ERR_PTR(-EACCES); + } + + /* Propagate the private flag so we know we're + * in the priv tree */ + if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + } + reiserfs_write_unlock_once(dir->i_sb, lock_depth); + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); + } + + return d_splice_alias(inode, dentry); +} + +/* +** looks up the dentry of the parent directory for child. 
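The lookups above work because a directory entry offset is a packed value: the upper bits carry the name hash (bits 7 through 30, per the comment in get_third_component) and the low bits a per-hash generation counter that disambiguates collisions, which is why linear_search_in_dir_item has to compare every entry sharing one hash value. A hypothetical sketch of that packing; the masks are assumptions derived from the comment, not the kernel macros:

#include <stdio.h>

/* Assumed layout: bits 7..30 hold the hash, bits 0..6 the generation
 * number, so up to 128 names sharing one hash value can coexist. */
#define TOY_GEN_BITS   7u
#define TOY_GEN_MASK   ((1u << TOY_GEN_BITS) - 1)      /* 0x7f       */
#define TOY_HASH_MASK  (0x7fffffffu & ~TOY_GEN_MASK)   /* bits 7..30 */

static unsigned toy_make_offset(unsigned hash, unsigned generation)
{
	return (hash & TOY_HASH_MASK) | (generation & TOY_GEN_MASK);
}

int main(void)
{
	unsigned a = toy_make_offset(0x12345680u, 0);  /* first name with this hash */
	unsigned b = toy_make_offset(0x12345680u, 1);  /* colliding name, next slot */

	printf("offset a = %#x (hash %#x, gen %u)\n",
	       a, a & TOY_HASH_MASK, a & TOY_GEN_MASK);
	printf("offset b = %#x (hash %#x, gen %u)\n",
	       b, b & TOY_HASH_MASK, b & TOY_GEN_MASK);
	return 0;
}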
+** taken from ext2_get_parent +*/ +struct dentry *reiserfs_get_parent(struct dentry *child) +{ + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + struct inode *dir = child->d_inode; + + if (dir->i_nlink == 0) { + return ERR_PTR(-ENOENT); + } + de.de_gen_number_bit_string = NULL; + + reiserfs_write_lock(dir->i_sb); + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); + } + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + reiserfs_write_unlock(dir->i_sb); + + return d_obtain_alias(inode); +} + +/* add entry to the directory (entry can be hidden). + +insert definition of when hidden directories are used here -Hans + + Does not mark dir inode dirty, do it after successesfull call to it */ + +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, + struct inode *dir, const char *name, int namelen, + struct inode *inode, int visible) +{ + struct cpu_key entry_key; + struct reiserfs_de_head *deh; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); + int gen_number; + char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc + if we create file with short name */ + char *buffer; + int buflen, paste_size; + int retval; + + BUG_ON(!th->t_trans_id); + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. compose it */ + make_cpu_key(&entry_key, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP(namelen); + if (buflen > sizeof(small_buf)) { + buffer = kmalloc(buflen, GFP_NOFS); + if (!buffer) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = + (get_inode_sd_version(dir) == + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; + + /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; /* JDM Endian safe if 0 */ + put_deh_offset(deh, cpu_key_k_offset(&entry_key)); + deh->deh_state = 0; /* JDM Endian safe if 0 */ + /* put key (ino analog) to de */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */ + + /* copy name */ + memcpy((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); + + /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + mark_de_without_sd(deh); + visible ? 
mark_de_visible(deh) : mark_de_hidden(deh); + + /* find the proper place for the new entry */ + memset(bit_string, 0, sizeof(bit_string)); + de.de_gen_number_bit_string = bit_string; + retval = reiserfs_find_entry(dir, name, namelen, &path, &de); + if (retval != NAME_NOT_FOUND) { + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + + if (retval == IO_ERROR) { + return -EIO; + } + + if (retval != NAME_FOUND) { + reiserfs_error(dir->i_sb, "zam-7002", + "reiserfs_find_entry() returned " + "unexpected value (%d)", retval); + } + + return -EEXIST; + } + + gen_number = + find_first_zero_bit(bit_string, + MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning(dir->i_sb, "reiserfs-7010", + "Congratulations! we have got hash function " + "screwed up"); + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); + set_cpu_key_k_offset(&entry_key, deh_offset(deh)); + + /* update max-hash-collisions counter in reiserfs_sb_info */ + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); + + if (gen_number != 0) { /* we need to re-search for the insertion point */ + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != + NAME_NOT_FOUND) { + reiserfs_warning(dir->i_sb, "vs-7032", + "entry with this key (%K) already " + "exists", &entry_key); + + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + } + + /* perform the insertion of the entry that we have prepared */ + retval = + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, + paste_size); + if (buffer != small_buf) + kfree(buffer); + if (retval) { + reiserfs_check_path(&path); + return retval; + } + + dir->i_size += paste_size; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (!S_ISDIR(inode->i_mode) && visible) + // reiserfs_mkdir or reiserfs_rename will do that by itself + reiserfs_update_sd(th, dir); + + reiserfs_check_path(&path); + return 0; +} + +/* quota utility function, call if you've had to abort after calling +** new_inode_init, and have not called reiserfs_new_inode yet. +** This should only be called on inodes that do not have stat data +** inserted into the tree yet. +*/ +static int drop_new_inode(struct inode *inode) +{ + dquot_drop(inode); + make_bad_inode(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + return 0; +} + +/* utility function that does setup for reiserfs_new_inode. +** dquot_initialize needs lots of credits so it's better to have it +** outside of a transaction, so we had to pull some bits of +** reiserfs_new_inode out into this func. 
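The generation-number allocation in reiserfs_add_entry() above is a small bitmap exercise: while the name search scans the colliding entries it marks every generation number already in use, and the first clear position afterwards (the kernel uses find_first_zero_bit over a DECLARE_BITMAP) becomes the slot for the new name. A user-space sketch of that allocation step, with a plain byte array standing in for the bitmap; toy_* names are illustrative:

#include <stdio.h>
#include <string.h>

#define TOY_MAX_GEN 127   /* assumed maximum generation number */

/* Return the first free generation number, or -1 if all 128 are taken. */
static int toy_pick_generation(const unsigned char used[TOY_MAX_GEN + 1])
{
	for (int gen = 0; gen <= TOY_MAX_GEN; gen++)
		if (!used[gen])
			return gen;
	return -1;   /* the "hash function screwed up" case */
}

int main(void)
{
	unsigned char used[TOY_MAX_GEN + 1];

	memset(used, 0, sizeof(used));
	used[0] = used[1] = used[3] = 1;   /* generations seen during the name search */

	printf("free generation: %d\n", toy_pick_generation(used));
	return 0;
}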
+*/ +static int new_inode_init(struct inode *inode, struct inode *dir, int mode) +{ + /* Make inode invalid - just in case we are going to drop it before + * the initialization happens */ + INODE_PKEY(inode)->k_objectid = 0; + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + return 0; +} + +static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int retval; + struct inode *inode; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + + dquot_initialize(dir); + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode, &security); + if (retval) + goto out_failed; + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + + out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode, &security); + if (retval) { + goto out_failed; + } + + inode->i_op = &reiserfs_special_inode_operations; + 
init_special_inode(inode, inode->i_mode, rdev); + + //FIXME: needed for block and char devices only + reiserfs_update_sd(&th, inode); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + + out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + int lock_depth; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + + dquot_initialize(dir); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ + REISERFS_I(dir)->new_packing_locality = 1; +#endif + mode = S_IFDIR | mode; + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + lock_depth = reiserfs_write_lock_once(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + /* inc the link count now, so another writer doesn't overflow it while + ** we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); + if (retval) { + DEC_DIR_INODE_NLINK(dir) + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + // note, _this_ add_entry will not update dir's stat data + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink = 0; + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + // the above add_entry did not update dir's stat data + reiserfs_update_sd(&th, dir); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); +out_failed: + reiserfs_write_unlock_once(dir->i_sb, lock_depth); + return retval; +} + +static inline int reiserfs_empty_dir(struct inode *inode) +{ + /* we can cheat because an old format dir cannot have + ** EMPTY_DIR_SIZE, and a new format dir cannot have + ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, + ** regardless of disk format version, the directory is empty. 
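reiserfs_create(), reiserfs_mknod() and reiserfs_mkdir() above all follow one skeleton: reserve journal credits, start a transaction, create the inode, wire up its operations, add the directory entry, and on any failure undo the link and still end the transaction before returning. A schematic user-space sketch of that control flow; every toy_* function is a hypothetical stand-in and error codes are plain ints:

#include <stdio.h>

static int toy_journal_begin(void) { return 0; }
static int toy_new_inode(void)     { return 0; }
static int toy_add_entry(void)     { return -1; }  /* simulate a failure */
static int toy_journal_end(void)   { return 0; }
static void toy_drop_inode(void)   { printf("dropping half-created inode\n"); }

static int toy_create_like(void)
{
	int retval, err;

	retval = toy_journal_begin();
	if (retval)
		return retval;

	retval = toy_new_inode();
	if (retval)
		goto out;                   /* inode never existed: just end the transaction */

	retval = toy_add_entry();
	if (retval) {
		toy_drop_inode();           /* undo the link, release the inode      */
		err = toy_journal_end();    /* the transaction must still be ended   */
		return err ? err : retval;
	}
out:
	err = toy_journal_end();
	return err ? err : retval;
}

int main(void)
{
	printf("create-like path returned %d\n", toy_create_like());
	return 0;
}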
+ */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0; + } + return 1; +} + +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory. + * The quota structure is possibly deleted only on last iput => outside + * of this transaction */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_rmdir; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_rmdir; + } + + inode = dentry->d_inode; + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */ + 0 /*new file size - not used here */ ); + if (retval < 0) + goto end_rmdir; + + if (inode->i_nlink != 2 && inode->i_nlink != 1) + reiserfs_error(inode->i_sb, "reiserfs-7040", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); + + clear_nlink(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + reiserfs_update_sd(&th, dir); + + /* prevent empty directory from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + out_rmdir: + reiserfs_write_unlock(dir->i_sb); + return retval; + + end_rmdir: + /* we must release path, because we did not call + reiserfs_cut_from_item, or reiserfs_cut_from_item does not + release path if operation was not complete */ + pathrelse(&path); + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; +} + +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path); + struct reiserfs_transaction_handle th; + int jbegin_count; + unsigned long savelink; + int depth; + + dquot_initialize(dir); + + inode = dentry->d_inode; + + /* in this transaction we can be doing at max two balancings and update + * two stat datas, we change quotas of the owner of the directory and of + * the owner of the parent directory. 
The quota structure is possibly + * deleted only on iput => outside of this transaction */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + depth = reiserfs_write_lock_once(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_unlink; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_unlink; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + reiserfs_warning(inode->i_sb, "reiserfs-7042", + "deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + + drop_nlink(inode); + + /* + * we schedule before doing the add_save_link call, save the link + * count so we don't race + */ + savelink = inode->i_nlink; + + retval = + reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, + 0); + if (retval < 0) { + inc_nlink(inode); + goto end_unlink; + } + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, dir); + + if (!savelink) + /* prevent file from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + reiserfs_write_unlock_once(dir->i_sb, depth); + return retval; + + end_unlink: + pathrelse(&path); + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + if (err) + retval = err; + out_unlink: + reiserfs_write_unlock_once(dir->i_sb, depth); + return retval; +} + +static int reiserfs_symlink(struct inode *parent_dir, + struct dentry *dentry, const char *symname) +{ + int retval; + struct inode *inode; + char *name; + int item_len; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + int mode = S_IFLNK | S_IRWXUGO; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + + dquot_initialize(parent_dir); + + if (!(inode = new_inode(parent_dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, parent_dir, mode); + + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, + &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + + reiserfs_write_lock(parent_dir->i_sb); + item_len = ROUND_UP(strlen(symname)); + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { + retval = -ENAMETOOLONG; + drop_new_inode(inode); + goto out_failed; + } + + name = kmalloc(item_len, GFP_NOFS); + if (!name) { + drop_new_inode(inode); + retval = -ENOMEM; + goto out_failed; + } + memcpy(name, symname, strlen(symname)); + padd_item(name, item_len, strlen(symname)); + + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + kfree(name); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, parent_dir, mode, name, 
strlen(symname), + dentry, inode, &security); + kfree(name); + if (retval) { /* reiserfs_new_inode iputs for us */ + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(parent_dir); + + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + // must be sure this inode is written with this transaction + // + //reiserfs_update_sd (&th, inode, READ_BLOCKS); + + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, parent_dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, parent_dir->i_sb, jbegin_count); + out_failed: + reiserfs_write_unlock(parent_dir->i_sb); + return retval; +} + +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct inode *inode = old_dentry->d_inode; + struct reiserfs_transaction_handle th; + /* We need blocks for transaction + update of quotas for the owners of the directory */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + if (inode->i_nlink >= REISERFS_LINK_MAX) { + //FIXME: sd_nlink is 32 bit for new files + reiserfs_write_unlock(dir->i_sb); + return -EMLINK; + } + + /* inc before scheduling so reiserfs_unlink knows we are here */ + inc_nlink(inode); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + inode->i_nlink--; + reiserfs_write_unlock(dir->i_sb); + return retval; + } + + /* create new entry */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (retval) { + int err; + inode->i_nlink--; + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; + } + + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + ihold(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +/* de contains information pointing to an entry which */ +static int de_still_valid(const char *name, int len, + struct reiserfs_dir_entry *de) +{ + struct reiserfs_dir_entry tmp = *de; + + // recalculate pointer to name and name length + set_de_name_and_namelen(&tmp); + // FIXME: could check more + if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) + return 0; + return 1; +} + +static int entry_points_to_object(const char *name, int len, + struct reiserfs_dir_entry *de, + struct inode *inode) +{ + if (!de_still_valid(name, len, de)) + return 0; + + if (inode) { + if (!de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(inode->i_sb, "vs-7042", + "entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 
1 : 0; + } + + /* this must be added hidden entry */ + if (de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(NULL, "vs-7043", "entry must be visible"); + + return 1; +} + +/* sets key of objectid the entry has to point to */ +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, + struct reiserfs_key *key) +{ + /* JDM These operations are endian safe - both are le */ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; +} + +/* + * process, that is going to call fix_nodes/do_balance must hold only + * one path. If it holds 2 or more, it can get into endless waiting in + * get_empty_nodes or its clones + */ +static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + INITIALIZE_PATH(old_entry_path); + INITIALIZE_PATH(new_entry_path); + INITIALIZE_PATH(dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode *old_inode, *new_dentry_inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + umode_t old_inode_mode; + unsigned long savelink = 1; + struct timespec ctime; + + /* three balancings: (1) old name removal, (2) new name insertion + and (3) maybe "save" link insertion + stat data updates: (1) old directory, + (2) new directory and (3) maybe old object stat data (when it is + directory) and (4) maybe stat data of object to which new entry + pointed initially and (5) maybe block containing ".." of + renamed directory + quota updates: two parent directories */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 5 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; + + // make sure, that oldname still exists and points to an object we + // are going to rename + old_de.de_gen_number_bit_string = NULL; + reiserfs_write_lock(old_dir->i_sb); + retval = + reiserfs_find_entry(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_entry_path, + &old_de); + pathrelse(&old_entry_path); + if (retval == IO_ERROR) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOENT; + } + + old_inode_mode = old_inode->i_mode; + if (S_ISDIR(old_inode_mode)) { + // make sure, that directory being renamed has correct ".." + // and that its new parent directory has not too many links + // already + + if (new_dentry_inode) { + if (!reiserfs_empty_dir(new_dentry_inode)) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOTEMPTY; + } + } + + /* directory is renamed, its parent directory will be changed, + ** so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + &dot_dot_de); + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + /* inode number of .. 
must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + } + + retval = journal_begin(&th, old_dir->i_sb, jbegin_count); + if (retval) { + reiserfs_write_unlock(old_dir->i_sb); + return retval; + } + + /* add new entry (or find the existing one) */ + retval = + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, old_inode, 0); + if (retval == -EEXIST) { + if (!new_dentry_inode) { + reiserfs_panic(old_dir->i_sb, "vs-7050", + "new entry is found, new inode == 0"); + } + } else if (retval) { + int err = journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return err ? err : retval; + } + + reiserfs_update_inode_transaction(old_dir); + reiserfs_update_inode_transaction(new_dir); + + /* this makes it so an fsync on an open fd for the old name will + ** commit the rename operation + */ + reiserfs_update_inode_transaction(old_inode); + + if (new_dentry_inode) + reiserfs_update_inode_transaction(new_dentry_inode); + + while (1) { + // look for old name using corresponding entry key (found by reiserfs_find_entry) + if ((retval = + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, + &old_entry_path, + &old_de)) != NAME_FOUND) { + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&old_entry_ih, get_ih(&old_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); + + // look for new name by reiserfs_find_entry + new_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, &new_entry_path, + &new_de); + // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from + // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&new_entry_ih, get_ih(&new_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); + + if (S_ISDIR(old_inode->i_mode)) { + if ((retval = + search_by_entry_key(new_dir->i_sb, + &dot_dot_de.de_entry_key, + &dot_dot_entry_path, + &dot_dot_de)) != NAME_FOUND) { + pathrelse(&dot_dot_entry_path); + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + copy_item_head(&dot_dot_ih, + get_ih(&dot_dot_entry_path)); + // node containing ".." gets into transaction + reiserfs_prepare_for_journal(old_inode->i_sb, + dot_dot_de.de_bh, 1); + } + /* we should check seals here, not do + this stuff, yes? Then, having + gathered everything into RAM we + should lock the buffers, yes? -Hans */ + /* probably. our rename needs to hold more + ** than one path at once. The seals would + ** have to be written to deal with multi-path + ** issues -chris + */ + /* sanity checking before doing the rename - avoid races many + ** of the above checks could have scheduled. We have to be + ** sure our items haven't been shifted by another process. 
+ */ + if (item_moved(&new_entry_ih, &new_entry_path) || + !entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_dentry_inode) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object(old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer(old_inode->i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode->i_sb, + old_de.de_bh); + if (S_ISDIR(old_inode_mode)) + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + if (S_ISDIR(old_inode_mode)) { + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || + !entry_points_to_object("..", 2, &dot_dot_de, + old_dir)) { + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + old_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + } + + RFALSE(S_ISDIR(old_inode_mode) && + !buffer_journal_prepared(dot_dot_de.de_bh), ""); + + break; + } + + /* ok, all the changes can be done in one fell swoop when we + have claimed all the buffers needed. */ + + mark_de_visible(new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); + journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh); + + mark_de_hidden(old_de.de_deh + old_de.de_entry_num); + journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh); + ctime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + /* thanks to Alex Adriaanse for patch which adds ctime update of + renamed object */ + old_inode->i_ctime = ctime; + + if (new_dentry_inode) { + // adjust link number of the victim + if (S_ISDIR(new_dentry_inode->i_mode)) { + clear_nlink(new_dentry_inode); + } else { + drop_nlink(new_dentry_inode); + } + new_dentry_inode->i_ctime = ctime; + savelink = new_dentry_inode->i_nlink; + } + + if (S_ISDIR(old_inode_mode)) { + /* adjust ".." of renamed directory */ + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); + journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); + + if (!new_dentry_inode) + /* there (in new_dir) was no directory, so it got new link + (".." of renamed directory) */ + INC_DIR_INODE_NLINK(new_dir); + + /* old directory lost one link - ".. " of renamed directory */ + DEC_DIR_INODE_NLINK(old_dir); + } + // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + pathrelse(&new_entry_path); + pathrelse(&dot_dot_entry_path); + + // FIXME: this reiserfs_cut_from_item's return value may screw up + // anybody, but it will panic if will not be able to find the + // entry. This needs one more clean up + if (reiserfs_cut_from_item + (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, + 0) < 0) + reiserfs_error(old_dir->i_sb, "vs-7060", + "couldn't not cut old name. Fsck later?"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + + reiserfs_update_sd(&th, old_dir); + reiserfs_update_sd(&th, new_dir); + reiserfs_update_sd(&th, old_inode); + + if (new_dentry_inode) { + if (savelink == 0) + add_save_link(&th, new_dentry_inode, + 0 /* not truncate */ ); + reiserfs_update_sd(&th, new_dentry_inode); + } + + retval = journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return retval; +} + +/* + * directories can handle most operations... 
+ */ +const struct inode_operations reiserfs_dir_inode_operations = { + //&reiserfs_dir_operations, /* default_file_ops */ + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; + +/* + * symlink operations.. same as page_symlink_inode_operations, with xattr + * stuff added + */ +const struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; + +/* + * special file operations.. just xattr/acl stuff + */ +const struct inode_operations reiserfs_special_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c new file mode 100644 index 00000000..3a6de810 --- /dev/null +++ b/fs/reiserfs/objectid.c @@ -0,0 +1,203 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +// find where objectid map starts +#define objectid_map(s,rs) (old_format_only (s) ? \ + (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ + (__le32 *)((rs) + 1)) + +#ifdef CONFIG_REISERFS_CHECK + +static void check_objectid_map(struct super_block *s, __le32 * map) +{ + if (le32_to_cpu(map[0]) != 1) + reiserfs_panic(s, "vs-15010", "map corrupted: %lx", + (long unsigned int)le32_to_cpu(map[0])); + + // FIXME: add something else here +} + +#else +static void check_objectid_map(struct super_block *s, __le32 * map) +{; +} +#endif + +/* When we allocate objectids we allocate the first unused objectid. + Each sequence of objectids in use (the odd sequences) is followed + by a sequence of objectids not in use (the even sequences). We + only need to record the last objectid in each of these sequences + (both the odd and even sequences) in order to fully define the + boundaries of the sequences. A consequence of allocating the first + objectid not in use is that under most conditions this scheme is + extremely compact. The exception is immediately after a sequence + of operations which deletes a large number of objects of + non-sequential objectids, and even then it will become compact + again as soon as more objects are created. Note that many + interesting optimizations of layout could result from complicating + objectid assignment, but we have deferred making them for now. 
*/ + +/* get unique object identifier */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + __u32 unused_objectid; + + BUG_ON(!th->t_trans_id); + + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + /* comment needed -Hans */ + unused_objectid = le32_to_cpu(map[1]); + if (unused_objectid == U32_MAX) { + reiserfs_warning(s, "reiserfs-15100", "no more object ids"); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); + return 0; + } + + /* This incrementation allocates the first unused objectid. That + is to say, the first entry on the objectid map is the first + unused objectid, and by incrementing it we use it. See below + where we check to see if we eliminated a sequence of unused + objectids.... */ + map[1] = cpu_to_le32(unused_objectid + 1); + + /* Now we check to see if we eliminated the last remaining member of + the first even sequence (and can eliminate the sequence by + eliminating its last objectid from oids), and can collapse the + first two odd sequences into one sequence. If so, then the net + result is to eliminate a pair of objectids from oids. We do this + by shifting the entire map to the left. */ + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { + memmove(map + 1, map + 3, + (sb_oid_cursize(rs) - 3) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + } + + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + return unused_objectid; +} + +/* makes object identifier unused */ +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + int i = 0; + + BUG_ON(!th->t_trans_id); + //return; + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + + /* start at the beginning of the objectid map (i = 0) and go to + the end of it (i = disk_sb->s_oid_cursize). Linear search is + what we use, though it is possible that binary search would be + more efficient after performing lots of deletions (which is + when oids is large.) We only check even i's. */ + while (i < sb_oid_cursize(rs)) { + if (objectid_to_release == le32_to_cpu(map[i])) { + /* This incrementation unallocates the objectid. */ + //map[i]++; + le32_add_cpu(&map[i], 1); + + /* Did we unallocate the last member of an odd sequence, and can shrink oids? 
*/ + if (map[i] == map[i + 1]) { + /* shrink objectid map */ + memmove(map + i, map + i + 2, + (sb_oid_cursize(rs) - i - + 2) * sizeof(__u32)); + //disk_sb->s_oid_cursize -= 2; + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + + RFALSE(sb_oid_cursize(rs) < 2 || + sb_oid_cursize(rs) > sb_oid_maxsize(rs), + "vs-15005: objectid map corrupted cur_size == %d (max == %d)", + sb_oid_cursize(rs), sb_oid_maxsize(rs)); + } + return; + } + + if (objectid_to_release > le32_to_cpu(map[i]) && + objectid_to_release < le32_to_cpu(map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { + //objectid_map[i+1]--; + le32_add_cpu(&map[i + 1], -1); + return; + } + + /* JDM comparing two little-endian values for equality -- safe */ + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { + /* objectid map must be expanded, but there is no space */ + PROC_INFO_INC(s, leaked_oid); + return; + } + + /* expand the objectid map */ + memmove(map + i + 3, map + i + 1, + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32(objectid_to_release); + map[i + 2] = cpu_to_le32(objectid_to_release + 1); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2); + return; + } + i += 2; + } + + reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)", + (long unsigned)objectid_to_release); +} + +int reiserfs_convert_objectid_map_v1(struct super_block *s) +{ + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s); + int cur_size = sb_oid_cursize(disk_sb); + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; + int old_max = sb_oid_maxsize(disk_sb); + struct reiserfs_super_block_v1 *disk_sb_v1; + __le32 *objectid_map, *new_objectid_map; + int i; + + disk_sb_v1 = + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__le32 *) (disk_sb_v1 + 1); + new_objectid_map = (__le32 *) (disk_sb + 1); + + if (cur_size > new_size) { + /* mark everyone used that was listed as free at the end of the objectid + ** map + */ + objectid_map[new_size - 1] = objectid_map[cur_size - 1]; + set_sb_oid_cursize(disk_sb, new_size); + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1; i >= 0; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i]; + } + + /* set the max size so we don't overflow later */ + set_sb_oid_maxsize(disk_sb, new_size); + + /* Zero out label and generate random UUID */ + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)); + generate_random_uuid(disk_sb->s_uuid); + + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)); + return 0; +} diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c new file mode 100644 index 00000000..45de98b5 --- /dev/null +++ b/fs/reiserfs/prints.c @@ -0,0 +1,768 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +#include + +static char error_buf[1024]; +static char fmt_buf[1024]; +static char off_buf[80]; + +static char *reiserfs_cpu_offset(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf(off_buf, "%Lu(%Lu)", + (unsigned long long) + GET_HASH_VALUE(cpu_key_k_offset(key)), + (unsigned long long) + GET_GENERATION_NUMBER(cpu_key_k_offset(key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)cpu_key_k_offset(key)); + return off_buf; +} + +static char *le_offset(struct reiserfs_key *key) +{ + int version; + + version = 
le_key_version(key); + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + sprintf(off_buf, "%Lu(%Lu)", + (unsigned long long) + GET_HASH_VALUE(le_key_k_offset(version, key)), + (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset(version, key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)le_key_k_offset(version, key)); + return off_buf; +} + +static char *cpu_type(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_STAT_DATA) + return "SD"; + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type(key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type(key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +static char *le_type(struct reiserfs_key *key) +{ + int version; + + version = le_key_version(key); + + if (le_key_k_type(version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type(version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type(version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +/* %k */ +static void sprintf_le_key(char *buf, struct reiserfs_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); + else + sprintf(buf, "[NULL]"); +} + +/* %K */ +static void sprintf_cpu_key(char *buf, struct cpu_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), + cpu_type(key)); + else + sprintf(buf, "[NULL]"); +} + +static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) +{ + if (deh) + sprintf(buf, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), + deh_location(deh), deh_state(deh)); + else + sprintf(buf, "[NULL]"); + +} + +static void sprintf_item_head(char *buf, struct item_head *ih) +{ + if (ih) { + strcpy(buf, + (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); + sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); + sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + } else + sprintf(buf, "[NULL]"); +} + +static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) +{ + char name[20]; + + memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); + name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0; + sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); +} + +static void sprintf_block_head(char *buf, struct buffer_head *bh) +{ + sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); +} + +static void sprintf_buffer_head(char *buf, struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + sprintf(buf, + "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bdevname(bh->b_bdev, b), bh->b_size, + (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? 
"LOCKED" : "UNLOCKED"); +} + +static void sprintf_disk_child(char *buf, struct disk_child *dc) +{ + sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), + dc_size(dc)); +} + +static char *is_there_reiserfs_struct(char *fmt, int *what) +{ + char *k = fmt; + + while ((k = strchr(k, '%')) != NULL) { + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { + *what = k[1]; + break; + } + k++; + } + return k; +} + +/* debugging reiserfs we used to print out a lot of different + variables, like keys, item headers, buffer heads etc. Values of + most fields matter. So it took a long time just to write + appropriative printk. With this reiserfs_warning you can use format + specification for complex structures like you used to do with + printfs for integers, doubles and pointers. For instance, to print + out key structure you have to write just: + reiserfs_warning ("bad key %k", key); + instead of + printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + key->k_offset, key->k_uniqueness); +*/ +static DEFINE_SPINLOCK(error_lock); +static void prepare_error_buf(const char *fmt, va_list args) +{ + char *fmt1 = fmt_buf; + char *k; + char *p = error_buf; + int what; + + spin_lock(&error_lock); + + strcpy(fmt1, fmt); + + while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { + *k = 0; + + p += vsprintf(p, fmt1, args); + + switch (what) { + case 'k': + sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + break; + case 'K': + sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + break; + case 'h': + sprintf_item_head(p, va_arg(args, struct item_head *)); + break; + case 't': + sprintf_direntry(p, + va_arg(args, + struct reiserfs_dir_entry *)); + break; + case 'y': + sprintf_disk_child(p, + va_arg(args, struct disk_child *)); + break; + case 'z': + sprintf_block_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'b': + sprintf_buffer_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'a': + sprintf_de_head(p, + va_arg(args, + struct reiserfs_de_head *)); + break; + } + + p += strlen(p); + fmt1 = k + 2; + } + vsprintf(p, fmt1, args); + spin_unlock(&error_lock); + +} + +/* in addition to usual conversion specifiers this accepts reiserfs + specific conversion specifiers: + %k to print little endian key, + %K to print cpu key, + %h to print item_head, + %t to print directory entry + %z to print block head (arg must be struct buffer_head * + %b to print buffer_head +*/ + +#define do_reiserfs_warning(fmt)\ +{\ + va_list args;\ + va_start( args, fmt );\ + prepare_error_buf( fmt, args );\ + va_end( args );\ +} + +void __reiserfs_warning(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " + "%s\n", sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); +} + +/* No newline.. reiserfs_info calls can be followed by printk's */ +void reiserfs_info(struct super_block *sb, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_NOTICE "REISERFS (device %s): %s", + sb->s_id, error_buf); + else + printk(KERN_NOTICE "REISERFS %s:", error_buf); +} + +/* No newline.. reiserfs_printk calls can be followed by printk's */ +static void reiserfs_printk(const char *fmt, ...) 
+{ + do_reiserfs_warning(fmt); + printk(error_buf); +} + +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) +{ +#ifdef CONFIG_REISERFS_CHECK + do_reiserfs_warning(fmt); + if (s) + printk(KERN_DEBUG "REISERFS debug (device %s): %s\n", + s->s_id, error_buf); + else + printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf); +#endif +} + +/* The format: + + maintainer-errorid: [function-name:] message + + where errorid is unique to the maintainer and function-name is + optional, is recommended, so that anyone can easily find the bug + with a simple grep for the short to type string + maintainer-errorid. Don't bother with reusing errorids, there are + lots of numbers out there. + + Example: + + reiserfs_panic( + p_sb, "reiser-29: reiserfs_new_blocknrs: " + "one of search_start or rn(%d) is equal to MAX_B_NUM," + "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", + rn, bh + ); + + Regular panic()s sometimes clear the screen before the message can + be read, thus the need for the while loop. + + Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it + pointless complexity): + + panics in reiserfs_fs.h have numbers from 1000 to 1999 + super.c 2000 to 2999 + preserve.c (unused) 3000 to 3999 + bitmap.c 4000 to 4999 + stree.c 5000 to 5999 + prints.c 6000 to 6999 + namei.c 7000 to 7999 + fix_nodes.c 8000 to 8999 + dir.c 9000 to 9999 + lbalance.c 10000 to 10999 + ibalance.c 11000 to 11999 not ready + do_balan.c 12000 to 12999 + inode.c 13000 to 13999 + file.c 14000 to 14999 + objectid.c 15000 - 15999 + buffer.c 16000 - 16999 + symlink.c 17000 - 17999 + + . */ + +void __reiserfs_panic(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif + if (sb) + panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); +} + +void __reiserfs_error(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + + BUG_ON(sb == NULL); + + if (reiserfs_error_panic(sb)) + __reiserfs_panic(sb, id, function, error_buf); + + if (id && id[0]) + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", + sb->s_id, id, function, error_buf); + else + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + if (sb->s_flags & MS_RDONLY) + return; + + reiserfs_info(sb, "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, -EIO); +} + +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) 
+{ + do_reiserfs_warning(fmt); + + if (reiserfs_error_panic(sb)) { + panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id, + error_buf); + } + + if (reiserfs_is_journal_aborted(SB_JOURNAL(sb))) + return; + + printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, + error_buf); + + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, errno); +} + +/* this prints internal nodes (4 keys/items in line) (dc_number, + dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + dc_size)...*/ +static int print_internal(struct buffer_head *bh, int first, int last) +{ + struct reiserfs_key *key; + struct disk_child *dc; + int i; + int from, to; + + if (!B_IS_KEYS_LEVEL(bh)) + return 1; + + check_internal(bh); + + if (first == -1) { + from = 0; + to = B_NR_ITEMS(bh); + } else { + from = first; + to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); + } + + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + dc = B_N_CHILD(bh, from); + reiserfs_printk("PTR %d: %y ", from, dc); + + for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to; + i++, key++, dc++) { + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk("\n"); + } + printk("\n"); + return 0; +} + +static int print_leaf(struct buffer_head *bh, int print_mode, int first, + int last) +{ + struct block_head *blkh; + struct item_head *ih; + int i, nr; + int from, to; + + if (!B_IS_ITEMS_LEVEL(bh)) + return 1; + + check_leaf(bh); + + blkh = B_BLK_HEAD(bh); + ih = B_N_PITEM_HEAD(bh, 0); + nr = blkh_nr_item(blkh); + + printk + ("\n===================================================================\n"); + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + nr - 1)->ih_key)); + return 0; + } + + if (first < 0 || first > nr - 1) + from = 0; + else + from = first; + + if (last < 0 || last > nr) + to = nr; + else + to = last; + + ih += from; + printk + ("-------------------------------------------------------------------------------\n"); + printk + ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih++) { + printk + ("-------------------------------------------------------------------------------\n"); + reiserfs_printk("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item(ih, B_I_PITEM(bh, ih)); + } + + printk + ("===================================================================\n"); + + return 0; +} + +char *reiserfs_hashname(int code) +{ + if (code == YURA_HASH) + return "rupasov"; + if (code == TEA_HASH) + return "tea"; + if (code == R5_HASH) + return "r5"; + + return "unknown"; +} + +/* return 1 if this is not super block */ +static int print_super_block(struct buffer_head *bh) +{ + struct reiserfs_super_block *rs = + (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + char *version; + char b[BDEVNAME_SIZE]; + + if (is_reiserfs_3_5(rs)) { + version = "3.5"; + } else if (is_reiserfs_3_6(rs)) { + version = "3.6"; + } else if (is_reiserfs_jr(rs)) { + version = ((sb_version(rs) == REISERFS_VERSION_2) ? 
+ "3.6" : "3.5"); + } else { + return 1; + } + + printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); + printk("Reiserfs version %s\n", version); + printk("Block count %u\n", sb_block_count(rs)); + printk("Blocksize %d\n", sb_blocksize(rs)); + printk("Free blocks %u\n", sb_free_blocks(rs)); + // FIXME: this would be confusing if + // someone stores reiserfs super block in some data block ;) +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + skipped = bh->b_blocknr; + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - + (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); + printk + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : + sb_reserved_for_journal(rs)), data_blocks); + printk("Root block %u\n", sb_root_block(rs)); + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); + printk("Journal dev %d\n", sb_jp_journal_dev(rs)); + printk("Journal orig size %d\n", sb_jp_journal_size(rs)); + printk("FS state %d\n", sb_fs_state(rs)); + printk("Hash function \"%s\"\n", + reiserfs_hashname(sb_hash_function_code(rs))); + + printk("Tree height %d\n", sb_tree_height(rs)); + return 0; +} + +static int print_desc_block(struct buffer_head *bh) +{ + struct reiserfs_journal_desc *desc; + + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) + return 1; + + desc = (struct reiserfs_journal_desc *)(bh->b_data); + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), + get_desc_mount_id(desc), get_desc_trans_len(desc)); + + return 0; +} + +void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last) +{ + va_list args; + int mode, first, last; + + if (!bh) { + printk("print_block: buffer is NULL\n"); + return; + } + + va_start(args, bh); + + mode = va_arg(args, int); + first = va_arg(args, int); + last = va_arg(args, int); + if (print_leaf(bh, mode, first, last)) + if (print_internal(bh, first, last)) + if (print_super_block(bh)) + if (print_desc_block(bh)) + printk + ("Block %llu contains unformatted data\n", + (unsigned long long)bh->b_blocknr); + + va_end(args); +} + +static char print_tb_buf[2048]; + +/* this stores initial state of tree balance in the print_tb_buf */ +void store_print_tb(struct tree_balance *tb) +{ + int h = 0; + int i; + struct buffer_head *tbSh, *tbFh; + + if (!tb) + return; + + sprintf(print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + REISERFS_SB(tb->tb_sb)->s_do_balance, + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item); + + for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) { + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= + tb->tb_path->path_length + && PATH_H_PATH_OFFSET(tb->tb_path, + h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER(tb->tb_path, h); + tbFh = PATH_H_PPARENT(tb->tb_path, h); + } else { + tbSh = NULL; + tbFh = NULL; + } + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", + h, + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), + (tbSh) ? atomic_read(&(tbSh->b_count)) : -1, + (tb->L[h]) ? 
(long long)(tb->L[h]->b_blocknr) : (-1LL), + (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1, + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), + (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1, + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), + (tb->FL[h]) ? (long long)(tb->FL[h]-> + b_blocknr) : (-1LL), + (tb->FR[h]) ? (long long)(tb->FR[h]-> + b_blocknr) : (-1LL), + (tb->CFL[h]) ? (long long)(tb->CFL[h]-> + b_blocknr) : (-1LL), + (tb->CFR[h]) ? (long long)(tb->CFR[h]-> + b_blocknr) : (-1LL)); + } + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], + tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes, + tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], + tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], + tb->blknum[h]); + } while (tb->insert_size[h]); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "FEB list: "); + + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < ARRAY_SIZE(tb->FEB); i++) + sprintf(print_tb_buf + strlen(print_tb_buf), + "%p (%llu %d)%s", tb->FEB[i], + tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> + b_blocknr : 0ULL, + tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0, + (i == ARRAY_SIZE(tb->FEB) - 1) ? 
"\n" : ", "); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "======================== the end ====================================\n"); +} + +void print_cur_tb(char *mes) +{ + printk("%s\n%s", mes, print_tb_buf); +} + +static void check_leaf_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6010", "invalid item number %z", + bh); + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) + reiserfs_panic(NULL, "vs-6020", "invalid free space %z", + bh); + +} + +static void check_internal_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + + blkh = B_BLK_HEAD(bh); + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); + + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); + + if (B_FREE_SPACE(bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - + DC_SIZE * (B_NR_ITEMS(bh) + 1)) + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); + +} + +void check_leaf(struct buffer_head *bh) +{ + int i; + struct item_head *ih; + + if (!bh) + return; + check_leaf_block_head(bh); + for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, B_I_PITEM(bh, ih)); +} + +void check_internal(struct buffer_head *bh) +{ + if (!bh) + return; + check_internal_block_head(bh); +} + +void print_statistics(struct super_block *s) +{ + + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); + */ + +} diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c new file mode 100644 index 00000000..7a998119 --- /dev/null +++ b/fs/reiserfs/procfs.c @@ -0,0 +1,576 @@ +/* -*- linux-c -*- */ + +/* fs/reiserfs/procfs.c */ + +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* proc info support a la one created by Sizif@Botik.RU for PGC */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * + * We rely on new Alexander Viro's super-block locking. + * + */ + +static int show_version(struct seq_file *m, struct super_block *sb) +{ + char *format; + + if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { + format = "3.6"; + } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { + format = "3.5"; + } else { + format = "unknown"; + } + + seq_printf(m, "%s format\twith checks %s\n", format, +#if defined( CONFIG_REISERFS_CHECK ) + "on" +#else + "off" +#endif + ); + return 0; +} + +#define SF( x ) ( r -> x ) +#define SFP( x ) SF( s_proc_info_data.x ) +#define SFPL( x ) SFP( x[ level ] ) +#define SFPF( x ) SFP( scan_bitmap.x ) +#define SFPJ( x ) SFP( journal.x ) + +#define D2C( x ) le16_to_cpu( x ) +#define D4C( x ) le32_to_cpu( x ) +#define DF( x ) D2C( rs -> s_v1.x ) +#define DFL( x ) D4C( rs -> s_v1.x ) + +#define objectid_map( s, rs ) (old_format_only (s) ? 
\ + (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ + (__le32 *)(rs + 1)) +#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) + +#define DJF( x ) le32_to_cpu( rs -> x ) +#define DJV( x ) le32_to_cpu( s_v1 -> x ) +#define DJP( x ) le32_to_cpu( jp -> x ) +#define JF( x ) ( r -> s_journal -> x ) + +static int show_super(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "state: \t%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" + "gen. counter: \t%i\n" + "s_disk_reads: \t%i\n" + "s_disk_writes: \t%i\n" + "s_fix_nodes: \t%i\n" + "s_do_balance: \t%i\n" + "s_unneeded_left_neighbor: \t%i\n" + "s_good_search_by_key_reada: \t%i\n" + "s_bmaps: \t%i\n" + "s_bmaps_without_search: \t%i\n" + "s_direct2indirect: \t%i\n" + "s_indirect2direct: \t%i\n" + "\n" + "max_hash_collisions: \t%i\n" + "breads: \t%lu\n" + "bread_misses: \t%lu\n" + "search_by_key: \t%lu\n" + "search_by_key_fs_changed: \t%lu\n" + "search_by_key_restarted: \t%lu\n" + "insert_item_restarted: \t%lu\n" + "paste_into_item_restarted: \t%lu\n" + "cut_from_item_restarted: \t%lu\n" + "delete_solid_item_restarted: \t%lu\n" + "delete_item_restarted: \t%lu\n" + "leaked_oid: \t%lu\n" + "leaves_removable: \t%lu\n", + SF(s_mount_state) == REISERFS_VALID_FS ? + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", + reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", + reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", + reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", + reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", + reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", + reiserfs_no_unhashed_relocation(sb) ? + "NO_UNHASHED_RELOCATION " : "", + reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", + reiserfs_test4(sb) ? "TEST4 " : "", + have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? + "SMALL_TAILS " : "NO_TAILS ", + replay_only(sb) ? "REPLAY_ONLY " : "", + convert_reiserfs(sb) ? 
"CONV " : "", + atomic_read(&r->s_generation_counter), + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), + SF(s_do_balance), SF(s_unneeded_left_neighbor), + SF(s_good_search_by_key_reada), SF(s_bmaps), + SF(s_bmaps_without_search), SF(s_direct2indirect), + SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), + SFP(bread_miss), SFP(search_by_key), + SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), + SFP(insert_item_restarted), SFP(paste_into_item_restarted), + SFP(cut_from_item_restarted), + SFP(delete_solid_item_restarted), SFP(delete_item_restarted), + SFP(leaked_oid), SFP(leaves_removable)); + + return 0; +} + +static int show_per_level(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + int level; + + seq_printf(m, "level\t" + " balances" + " [sbk: reads" + " fs_changed" + " restarted]" + " free space" + " items" + " can_remove" + " lnum" + " rnum" + " lbytes" + " rbytes" + " get_neig" + " get_neig_res" " need_l_neig" " need_r_neig" "\n"); + + for (level = 0; level < MAX_HEIGHT; ++level) { + seq_printf(m, "%i\t" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12li" + " %12li" + " %12li" + " %12li" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + "\n", + level, + SFPL(balance_at), + SFPL(sbk_read_at), + SFPL(sbk_fs_changed), + SFPL(sbk_restarted), + SFPL(free_at), + SFPL(items_at), + SFPL(can_node_be_removed), + SFPL(lnum), + SFPL(rnum), + SFPL(lbytes), + SFPL(rbytes), + SFPL(get_neighbors), + SFPL(get_neighbors_restart), + SFPL(need_l_neighbor), SFPL(need_r_neighbor) + ); + } + return 0; +} + +static int show_bitmap(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "free_block: %lu\n" + " scan_bitmap:" + " wait" + " bmap" + " retry" + " stolen" + " journal_hint" + "journal_nohint" + "\n" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + "\n", + SFP(free_block), + SFPF(call), + SFPF(wait), + SFPF(bmap), + SFPF(retry), + SFPF(stolen), + SFPF(in_journal_hint), SFPF(in_journal_nohint)); + + return 0; +} + +static int show_on_disk_super(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info->s_rs; + int hash_code = DFL(s_hash_function_code); + __u32 flags = DJF(s_flags); + + seq_printf(m, "block_count: \t%i\n" + "free_blocks: \t%i\n" + "root_block: \t%i\n" + "blocksize: \t%i\n" + "oid_maxsize: \t%i\n" + "oid_cursize: \t%i\n" + "umount_state: \t%i\n" + "magic: \t%10.10s\n" + "fs_state: \t%i\n" + "hash: \t%s\n" + "tree_height: \t%i\n" + "bmap_nr: \t%i\n" + "version: \t%i\n" + "flags: \t%x[%s]\n" + "reserved_for_journal: \t%i\n", + DFL(s_block_count), + DFL(s_free_blocks), + DFL(s_root_block), + DF(s_blocksize), + DF(s_oid_maxsize), + DF(s_oid_cursize), + DF(s_umount_state), + rs->s_v1.s_magic, + DF(s_fs_state), + hash_code == TEA_HASH ? "tea" : + (hash_code == YURA_HASH) ? "rupasov" : + (hash_code == R5_HASH) ? "r5" : + (hash_code == UNSET_HASH) ? "unset" : "unknown", + DF(s_tree_height), + DF(s_bmap_nr), + DF(s_version), flags, (flags & reiserfs_attrs_cleared) + ? 
"attrs_cleared" : "", DF(s_reserved_for_journal)); + + return 0; +} + +static int show_oidmap(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info->s_rs; + unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); + unsigned long total_used = 0; + int i; + + for (i = 0; i < mapsize; ++i) { + __u32 right; + + right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); + seq_printf(m, "%s: [ %x .. %x )\n", + (i & 1) ? "free" : "used", MAP(i), right); + if (!(i & 1)) { + total_used += right - MAP(i); + } + } +#if defined( REISERFS_USE_OIDMAPF ) + if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { + loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size; + total_used += size / sizeof(reiserfs_oidinterval_d_t); + } +#endif + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", + mapsize, + mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); + return 0; +} + +static int show_journal(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + struct reiserfs_super_block *rs = r->s_rs; + struct journal_params *jp = &rs->s_v1.s_journal; + char b[BDEVNAME_SIZE]; + + seq_printf(m, /* on-disk fields */ + "jp_journal_1st_block: \t%i\n" + "jp_journal_dev: \t%s[%x]\n" + "jp_journal_size: \t%i\n" + "jp_journal_trans_max: \t%i\n" + "jp_journal_magic: \t%i\n" + "jp_journal_max_batch: \t%i\n" + "jp_journal_max_commit_age: \t%i\n" + "jp_journal_max_trans_age: \t%i\n" + /* incore fields */ + "j_1st_reserved_block: \t%i\n" + "j_state: \t%li\n" + "j_trans_id: \t%u\n" + "j_mount_id: \t%lu\n" + "j_start: \t%lu\n" + "j_len: \t%lu\n" + "j_len_alloc: \t%lu\n" + "j_wcount: \t%i\n" + "j_bcount: \t%lu\n" + "j_first_unflushed_offset: \t%lu\n" + "j_last_flush_trans_id: \t%u\n" + "j_trans_start_time: \t%li\n" + "j_list_bitmap_index: \t%i\n" + "j_must_wait: \t%i\n" + "j_next_full_flush: \t%i\n" + "j_next_async_flush: \t%i\n" + "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" + /* reiserfs_proc_info_data_t.journal fields */ + "in_journal: \t%12lu\n" + "in_journal_bitmap: \t%12lu\n" + "in_journal_reusable: \t%12lu\n" + "lock_journal: \t%12lu\n" + "lock_journal_wait: \t%12lu\n" + "journal_begin: \t%12lu\n" + "journal_relock_writers: \t%12lu\n" + "journal_relock_wcount: \t%12lu\n" + "mark_dirty: \t%12lu\n" + "mark_dirty_already: \t%12lu\n" + "mark_dirty_notjournal: \t%12lu\n" + "restore_prepared: \t%12lu\n" + "prepare: \t%12lu\n" + "prepare_retry: \t%12lu\n", + DJP(jp_journal_1st_block), + bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + DJP(jp_journal_dev), + DJP(jp_journal_size), + DJP(jp_journal_trans_max), + DJP(jp_journal_magic), + DJP(jp_journal_max_batch), + SB_JOURNAL(sb)->j_max_commit_age, + DJP(jp_journal_max_trans_age), + JF(j_1st_reserved_block), + JF(j_state), + JF(j_trans_id), + JF(j_mount_id), + JF(j_start), + JF(j_len), + JF(j_len_alloc), + atomic_read(&r->s_journal->j_wcount), + JF(j_bcount), + JF(j_first_unflushed_offset), + JF(j_last_flush_trans_id), + JF(j_trans_start_time), + JF(j_list_bitmap_index), + JF(j_must_wait), + JF(j_next_full_flush), + JF(j_next_async_flush), + JF(j_cnode_used), + JF(j_cnode_free), + SFPJ(in_journal), + SFPJ(in_journal_bitmap), + SFPJ(in_journal_reusable), + SFPJ(lock_journal), + SFPJ(lock_journal_wait), + SFPJ(journal_being), + SFPJ(journal_relock_writers), + SFPJ(journal_relock_wcount), + SFPJ(mark_dirty), + SFPJ(mark_dirty_already), + SFPJ(mark_dirty_notjournal), + SFPJ(restore_prepared), SFPJ(prepare), 
SFPJ(prepare_retry) + ); + return 0; +} + +/* iterator */ +static int test_sb(struct super_block *sb, void *data) +{ + return data == sb; +} + +static int set_sb(struct super_block *sb, void *data) +{ + return -ENOENT; +} + +static void *r_start(struct seq_file *m, loff_t * pos) +{ + struct proc_dir_entry *de = m->private; + struct super_block *s = de->parent->data; + loff_t l = *pos; + + if (l) + return NULL; + + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s))) + return NULL; + + up_write(&s->s_umount); + return s; +} + +static void *r_next(struct seq_file *m, void *v, loff_t * pos) +{ + ++*pos; + if (v) + deactivate_super(v); + return NULL; +} + +static void r_stop(struct seq_file *m, void *v) +{ + if (v) + deactivate_super(v); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct proc_dir_entry *de = m->private; + int (*show) (struct seq_file *, struct super_block *) = de->data; + return show(m, v); +} + +static const struct seq_operations r_ops = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int r_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &r_ops); + + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE(inode); + } + return ret; +} + +static const struct file_operations r_file_operations = { + .open = r_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static struct proc_dir_entry *proc_info_root = NULL; +static const char proc_info_root_name[] = "fs/reiserfs"; + +static void add_file(struct super_block *sb, char *name, + int (*func) (struct seq_file *, struct super_block *)) +{ + proc_create_data(name, 0, REISERFS_SB(sb)->procdir, + &r_file_operations, func); +} + +int reiserfs_proc_info_init(struct super_block *sb) +{ + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + + spin_lock_init(&__PINFO(sb).lock); + REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); + if (REISERFS_SB(sb)->procdir) { + REISERFS_SB(sb)->procdir->data = sb; + add_file(sb, "version", show_version); + add_file(sb, "super", show_super); + add_file(sb, "per-level", show_per_level); + add_file(sb, "bitmap", show_bitmap); + add_file(sb, "on-disk-super", show_on_disk_super); + add_file(sb, "oidmap", show_oidmap); + add_file(sb, "journal", show_journal); + return 0; + } + reiserfs_warning(sb, "cannot create /proc/%s/%s", + proc_info_root_name, b); + return 1; +} + +int reiserfs_proc_info_done(struct super_block *sb) +{ + struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + + if (de) { + remove_proc_entry("journal", de); + remove_proc_entry("oidmap", de); + remove_proc_entry("on-disk-super", de); + remove_proc_entry("bitmap", de); + remove_proc_entry("per-level", de); + remove_proc_entry("super", de); + remove_proc_entry("version", de); + } + spin_lock(&__PINFO(sb).lock); + __PINFO(sb).exiting = 1; + spin_unlock(&__PINFO(sb).lock); + if (proc_info_root) { + remove_proc_entry(b, proc_info_root); + REISERFS_SB(sb)->procdir = NULL; + } + return 0; +} + +int reiserfs_proc_info_global_init(void) +{ + if (proc_info_root == NULL) { + proc_info_root = proc_mkdir(proc_info_root_name, NULL); + if (!proc_info_root) { + reiserfs_warning(NULL, "cannot create /proc/%s", + 
proc_info_root_name); + return 1; + } + } + return 0; +} + +int reiserfs_proc_info_global_done(void) +{ + if (proc_info_root != NULL) { + proc_info_root = NULL; + remove_proc_entry(proc_info_root_name, NULL); + } + return 0; +} +/* + * Revision 1.1.8.2 2001/07/15 17:08:42 god + * . use get_super() in procfs.c + * . remove remove_save_link() from reiserfs_do_truncate() + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + * Revision 1.1.8.1 2001/07/11 16:48:50 god + * proc info support + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + */ + +/* + * Make Linus happy. + * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c new file mode 100644 index 00000000..b3a94d20 --- /dev/null +++ b/fs/reiserfs/resize.c @@ -0,0 +1,212 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Alexander Zarochentcev. + * + * The kernel part of the (on-line) reiserfs resizer. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int reiserfs_resize(struct super_block *s, unsigned long block_count_new) +{ + int err = 0; + struct reiserfs_super_block *sb; + struct reiserfs_bitmap_info *bitmap; + struct reiserfs_bitmap_info *info; + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); + struct buffer_head *bh; + struct reiserfs_transaction_handle th; + unsigned int bmap_nr_new, bmap_nr; + unsigned int block_r_new, block_r; + + struct reiserfs_list_bitmap *jb; + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; + + unsigned long int block_count, free_blocks; + int i; + int copy_size; + + sb = SB_DISK_SUPER_BLOCK(s); + + if (SB_BLOCK_COUNT(s) >= block_count_new) { + printk("can\'t shrink filesystem on-line\n"); + return -EINVAL; + } + + /* check the device size */ + bh = sb_bread(s, block_count_new - 1); + if (!bh) { + printk("reiserfs_resize: can\'t read last block\n"); + return -EINVAL; + } + bforget(bh); + + /* old disk layout detection; those partitions can be mounted, but + * cannot be resized */ + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES) { + printk + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + return -ENOTSUPP; + } + + /* count used bits in last bitmap block */ + block_r = SB_BLOCK_COUNT(s) - + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8; + + /* count bitmap blocks in new fs */ + bmap_nr_new = block_count_new / (s->s_blocksize * 8); + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; + if (block_r_new) + bmap_nr_new++; + else + block_r_new = s->s_blocksize * 8; + + /* save old values */ + block_count = SB_BLOCK_COUNT(s); + bmap_nr = reiserfs_bmap_count(s); + + /* resizing of reiserfs bitmaps (journal and real), if needed */ + if (bmap_nr_new > bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + return -ENOMEM; + } + /* the new journal bitmaps are zero filled, now we copy in the bitmap + ** node pointers from the old journal bitmap structs, and then + ** transfer the new data structures into the journal struct. 
+ ** + ** using the copy_size var below allows this code to work for + ** both shrinking and expanding the FS. + */ + copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; + copy_size = + copy_size * sizeof(struct reiserfs_list_bitmap_node *); + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + struct reiserfs_bitmap_node **node_tmp; + jb = SB_JOURNAL(s)->j_list_bitmap + i; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); + + /* just in case vfree schedules on us, copy the new + ** pointer into the journal struct before freeing the + ** old one + */ + node_tmp = jb->bitmaps; + jb->bitmaps = jbitmap[i].bitmaps; + vfree(node_tmp); + } + + /* allocate additional bitmap blocks, reallocate array of bitmap + * block pointers */ + bitmap = + vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + if (!bitmap) { + /* Journal bitmaps are still supersized, but the memory isn't + * leaked, so I guess it's ok */ + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + memset(bitmap, 0, + sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + for (i = 0; i < bmap_nr; i++) + bitmap[i] = old_bitmap[i]; + + /* This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the journal + * transaction begins, and the new bitmaps don't matter if the + * transaction fails. */ + for (i = bmap_nr; i < bmap_nr_new; i++) { + /* don't use read_bitmap_block since it will cache + * the uninitialized bitmap */ + bh = sb_bread(s, i * s->s_blocksize * 8); + if (!bh) { + vfree(bitmap); + return -EIO; + } + memset(bh->b_data, 0, sb_blocksize(sb)); + reiserfs_test_and_set_le_bit(0, bh->b_data); + reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + reiserfs_write_unlock(s); + sync_dirty_buffer(bh); + reiserfs_write_lock(s); + // update bitmap_info stuff + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; + brelse(bh); + } + /* free old bitmap blocks array */ + SB_AP_BITMAP(s) = bitmap; + vfree(old_bitmap); + } + + /* begin transaction, if there was an error, it's fine. Yes, we have + * incorrect bitmaps now, but none of it is ever going to touch the + * disk anyway. 
*/ + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* Extend old last bitmap block - new blocks have been made available */ + info = SB_AP_BITMAP(s) + bmap_nr - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); + if (!bh) { + int jerr = journal_end(&th, s, 10); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r; i < s->s_blocksize * 8; i++) + reiserfs_test_and_clear_le_bit(i, bh->b_data); + info->free_count += s->s_blocksize * 8 - block_r; + + journal_mark_dirty(&th, s, bh); + brelse(bh); + + /* Correct new last bitmap block - It may not be full */ + info = SB_AP_BITMAP(s) + bmap_nr_new - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); + if (!bh) { + int jerr = journal_end(&th, s, 10); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r_new; i < s->s_blocksize * 8; i++) + reiserfs_test_and_set_le_bit(i, bh->b_data); + journal_mark_dirty(&th, s, bh); + brelse(bh); + + info->free_count -= s->s_blocksize * 8 - block_r_new; + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + free_blocks = SB_FREE_BLOCKS(s); + PUT_SB_FREE_BLOCKS(s, + free_blocks + (block_count_new - block_count - + (bmap_nr_new - bmap_nr))); + PUT_SB_BLOCK_COUNT(s, block_count_new); + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); + s->s_dirt = 1; + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + + SB_JOURNAL(s)->j_must_wait = 1; + return journal_end(&th, s, 10); +} diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c new file mode 100644 index 00000000..313d39d6 --- /dev/null +++ b/fs/reiserfs/stree.c @@ -0,0 +1,2120 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Anatoly P. Pinchuk pap@namesys.botik.ru + * Programm System Institute + * Pereslavl-Zalessky Russia + */ + +/* + * This file contains functions dealing with S+tree + * + * B_IS_IN_TREE + * copy_item_head + * comp_short_keys + * comp_keys + * comp_short_le_keys + * le_key2cpu_key + * comp_le_keys + * bin_search + * get_lkey + * get_rkey + * key_in_buffer + * decrement_bcount + * reiserfs_check_path + * pathrelse_and_restore + * pathrelse + * search_by_key_reada + * search_by_key + * search_for_position_by_key + * comp_items + * prepare_for_direct_item + * prepare_for_direntry_item + * prepare_for_delete_or_cut + * calc_deleted_bytes_number + * init_tb_struct + * padd_item + * reiserfs_delete_item + * reiserfs_delete_solid_item + * reiserfs_delete_object + * maybe_indirect_to_direct + * indirect_to_direct_roll_back + * reiserfs_cut_from_item + * truncate_directory + * reiserfs_do_truncate + * reiserfs_paste_into_item + * reiserfs_insert_item + */ + +#include +#include +#include +#include +#include +#include + +/* Does the buffer contain a disk block which is in the tree. */ +inline int B_IS_IN_TREE(const struct buffer_head *bh) +{ + + RFALSE(B_LEVEL(bh) > MAX_HEIGHT, + "PAP-1010: block (%b) has too big level (%z)", bh, bh); + + return (B_LEVEL(bh) != FREE_LEVEL); +} + +// +// to gets item head in le form +// +inline void copy_item_head(struct item_head *to, + const struct item_head *from) +{ + memcpy(to, from, IH_SIZE); +} + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. For key of items of the same + object this returns 0. 
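   For illustration, with made-up key values:
       le_key  = { k_dir_id 5, k_objectid 17, offset 4097, type TYPE_DIRECT    }
       cpu_key = { k_dir_id 5, k_objectid 17, offset 1,    type TYPE_STAT_DATA }
   comp_short_keys() looks only at k_dir_id and k_objectid and so returns 0
   (same object); comp_keys() below would go on to compare the offsets and,
   for 4-component cpu keys, the types as well.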
+ Returns: -1 if key1 < key2 + 0 if key1 == key2 + 1 if key1 > key2 */ +inline int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + __u32 n; + n = le32_to_cpu(le_key->k_dir_id); + if (n < cpu_key->on_disk_key.k_dir_id) + return -1; + if (n > cpu_key->on_disk_key.k_dir_id) + return 1; + n = le32_to_cpu(le_key->k_objectid); + if (n < cpu_key->on_disk_key.k_objectid) + return -1; + if (n > cpu_key->on_disk_key.k_objectid) + return 1; + return 0; +} + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. + Compare keys using all 4 key fields. + Returns: -1 if key1 < key2 0 + if key1 = key2 1 if key1 > key2 */ +static inline int comp_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + int retval; + + retval = comp_short_keys(le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset(le_key_version(le_key), le_key) < + cpu_key_k_offset(cpu_key)) + return -1; + if (le_key_k_offset(le_key_version(le_key), le_key) > + cpu_key_k_offset(cpu_key)) + return 1; + + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type(le_key_version(le_key), le_key) < + cpu_key_k_type(cpu_key)) + return -1; + + if (le_key_k_type(le_key_version(le_key), le_key) > + cpu_key_k_type(cpu_key)) + return 1; + + return 0; +} + +inline int comp_short_le_keys(const struct reiserfs_key *key1, + const struct reiserfs_key *key2) +{ + __u32 *k1_u32, *k2_u32; + int key_length = REISERFS_SHORT_KEY_LEN; + + k1_u32 = (__u32 *) key1; + k2_u32 = (__u32 *) key2; + for (; key_length--; ++k1_u32, ++k2_u32) { + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32)) + return -1; + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32)) + return 1; + } + return 0; +} + +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) +{ + int version; + to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); + + // find out version of the key + version = le_key_version(from); + to->version = version; + to->on_disk_key.k_offset = le_key_k_offset(version, from); + to->on_disk_key.k_type = le_key_k_type(version, from); +} + +// this does not say which one is bigger, it only returns 1 if keys +// are not equal, 0 otherwise +inline int comp_le_keys(const struct reiserfs_key *k1, + const struct reiserfs_key *k2) +{ + return memcmp(k1, k2, sizeof(struct reiserfs_key)); +} + +/************************************************************************** + * Binary search toolkit function * + * Search for an item in the array by the item key * + * Returns: 1 if found, 0 if not found; * + * *pos = number of the searched element if found, else the * + * number of the first element that is larger than key. * + **************************************************************************/ +/* For those not familiar with binary search: lbound is the leftmost item that it + could be, rbound the rightmost item that it could be. We examine the item + halfway between lbound and rbound, and that tells us either that we can increase + lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that + there are no possible items, and we have not found it. With each examination we + cut the number of possible items it could be by one more than half rounded down, + or we find it. */ +static inline int bin_search(const void *key, /* Key to search for. 
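   (Illustrative example of the convention described above, with made-up
   keys: searching for key 25 in an array of item keys { 10, 20, 30, 40 }
   returns ITEM_NOT_FOUND with *pos == 2 -- the slot of the first key
   larger than the one searched for, which is also where an insert would
   keep the array sorted -- while searching for 30 returns ITEM_FOUND with
   *pos == 2.)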
*/ + const void *base, /* First item in the array. */ + int num, /* Number of items in the array. */ + int width, /* Item size in the array. + searched. Lest the reader be + confused, note that this is crafted + as a general function, and when it + is applied specifically to the array + of item headers in a node, width + is actually the item header size not + the item size. */ + int *pos /* Number of the searched for element. */ + ) +{ + int rbound, lbound, j; + + for (j = ((rbound = num - 1) + (lbound = 0)) / 2; + lbound <= rbound; j = (rbound + lbound) / 2) + switch (comp_keys + ((struct reiserfs_key *)((char *)base + j * width), + (struct cpu_key *)key)) { + case -1: + lbound = j + 1; + continue; + case 1: + rbound = j - 1; + continue; + case 0: + *pos = j; + return ITEM_FOUND; /* Key found in the array. */ + } + + /* bin_search did not find given key, it returns position of key, + that is minimal and greater than the given one. */ + *pos = lbound; + return ITEM_NOT_FOUND; +} + + +/* Minimal possible key. It is never in the tree. */ +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; + +/* Maximal possible key. It is never in the tree. */ +static const struct reiserfs_key MAX_KEY = { + __constant_cpu_to_le32(0xffffffff), + __constant_cpu_to_le32(0xffffffff), + {{__constant_cpu_to_le32(0xffffffff), + __constant_cpu_to_le32(0xffffffff)},} +}; + +/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom + of the path, and going upwards. We must check the path's validity at each step. If the key is not in + the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this + case we return a special key, either MIN_KEY or MAX_KEY. */ +static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5010: invalid offset in the path"); + + /* While not higher in path than first element. */ + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5020: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MAX_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MAX_KEY; + /* Return delimiting key if position in the parent is not equal to zero. */ + if (position) + return B_N_PDELIM_KEY(parent, position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MIN_KEY; + return &MAX_KEY; +} + +/* Get delimiting key of the buffer at the path and its right neighbor. 
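   For illustration (a hypothetical parent node): if a parent holds keys
   [ K1, K2 ] and child pointers [ C0, C1, C2 ], then for child C1 the left
   delimiting key is K1 and the right delimiting key is K2; for C0 there is
   no left delimiting key in that parent, so get_lkey() keeps walking up
   the path and falls back to MIN_KEY at the root, just as get_rkey() below
   falls back to MAX_KEY for the rightmost child.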
*/ +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5030: invalid offset in the path"); + + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5040: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MIN_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MIN_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MIN_KEY; + /* Return delimiting key if position in the parent is not the last one. */ + if (position != B_NR_ITEMS(parent)) + return B_N_PDELIM_KEY(parent, position); + } + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MAX_KEY; + return &MIN_KEY; +} + +/* Check whether a key is contained in the tree rooted from a buffer at a path. */ +/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in + the path. These delimiting keys are stored at least one level above that buffer in the tree. If the + buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in + this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ +static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */ + const struct cpu_key *key, /* Key which should be checked. */ + struct super_block *sb + ) +{ + + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET + || chk_path->path_length > MAX_HEIGHT, + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", + key, chk_path->path_length); + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev, + "PAP-5060: device must not be NODEV"); + + if (comp_keys(get_lkey(chk_path, sb), key) == 1) + /* left delimiting key is bigger, that the key we look for */ + return 0; + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */ + if (comp_keys(get_rkey(chk_path, sb), key) != 1) + /* key must be less than right delimitiing key */ + return 0; + return 1; +} + +int reiserfs_check_path(struct treepath *p) +{ + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, + "path not properly relsed"); + return 0; +} + +/* Drop the reference to each buffer in a path and restore + * dirty bits clean when preparing the buffer for the log. 
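   (For context, the usual lifetime of a path -- the same pattern used by
   reiserfs_delete_solid_item() further down -- looks roughly like:

        INITIALIZE_PATH(path);
        retval = search_item(sb, &cpu_key, &path);
        ... use PATH_PITEM_HEAD(&path), PATH_PLAST_BUFFER(&path) ...
        pathrelse(&path);

   every buffer referenced while descending the tree is released again by
   pathrelse().)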
+ * This version should only be called from fix_nodes() */ +void pathrelse_and_restore(struct super_block *sb, + struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "clm-4000: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { + struct buffer_head *bh; + bh = PATH_OFFSET_PBUFFER(search_path, path_offset--); + reiserfs_restore_prepared_buffer(sb, bh); + brelse(bh); + } + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* Drop the reference to each buffer in a path */ +void pathrelse(struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "PAP-5090: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) + brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--)); + + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + struct item_head *ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { + reiserfs_warning(NULL, "reiserfs-5080", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning(NULL, "reiserfs-5081", + "nr_item seems wrong: %z", bh); + return 0; + } + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + if (used_space != blocksize - blkh_free_space(blkh)) { + /* free space does not match to calculated amount of use space */ + reiserfs_warning(NULL, "reiserfs-5082", + "free space seems wrong: %z", bh); + return 0; + } + // FIXME: it is_leaf will hit performance too much - we may have + // return 1 here + + /* check tables of item heads */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i++, ih++) { + if (le_ih_k_type(ih) == TYPE_ANY) { + reiserfs_warning(NULL, "reiserfs-5083", + "wrong item type for item %h", + ih); + return 0; + } + if (ih_location(ih) >= blocksize + || ih_location(ih) < IH_SIZE * nr) { + reiserfs_warning(NULL, "reiserfs-5084", + "item location seems wrong: %h", + ih); + return 0; + } + if (ih_item_len(ih) < 1 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { + reiserfs_warning(NULL, "reiserfs-5085", + "item length seems wrong: %h", + ih); + return 0; + } + if (prev_location - ih_location(ih) != ih_item_len(ih)) { + reiserfs_warning(NULL, "reiserfs-5086", + "item location seems wrong " + "(second one): %h", ih); + return 0; + } + prev_location = ih_location(ih); + } + + // one may imagine much more checks + return 1; +} + +/* returns 1 if buf looks like an internal node, 0 otherwise */ +static int is_internal(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + nr = blkh_level(blkh); + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + reiserfs_warning(NULL, "reiserfs-5087", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + /* for internal which is not root we might check min number of keys */ + 
reiserfs_warning(NULL, "reiserfs-5088", + "number of key seems wrong: %z", bh); + return 0; + } + + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, "reiserfs-5089", + "free space seems wrong: %z", bh); + return 0; + } + // one may imagine much more checks + return 1; +} + +// make sure that bh contains formatted node of reiserfs tree of +// 'level'-th level +static int is_tree_node(struct buffer_head *bh, int level) +{ + if (B_LEVEL(bh) != level) { + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does " + "not match to the expected one %d", + B_LEVEL(bh), level); + return 0; + } + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf(bh->b_data, bh->b_size, bh); + + return is_internal(bh->b_data, bh->b_size, bh); +} + +#define SEARCH_BY_KEY_READA 16 + +/* + * The function is NOT SCHEDULE-SAFE! + * It might unlock the write lock if we needed to wait for a block + * to be read. Note that in this case it won't recover the lock to avoid + * high contention resulting from too much lock requests, especially + * the caller (search_by_key) will perform other schedule-unsafe + * operations just after calling this function. + * + * @return true if we have unlocked + */ +static bool search_by_key_reada(struct super_block *s, + struct buffer_head **bh, + b_blocknr_t *b, int num) +{ + int i, j; + bool unlocked = false; + + for (i = 0; i < num; i++) { + bh[i] = sb_getblk(s, b[i]); + } + /* + * We are going to read some blocks on which we + * have a reference. It's safe, though we might be + * reading blocks concurrently changed if we release + * the lock. But it's still fine because we check later + * if the tree changed + */ + for (j = 0; j < i; j++) { + /* + * note, this needs attention if we are getting rid of the BKL + * you have to make sure the prepared bit isn't set on this buffer + */ + if (!buffer_uptodate(bh[j])) { + if (!unlocked) { + reiserfs_write_unlock(s); + unlocked = true; + } + ll_rw_block(READA, 1, bh + j); + } + brelse(bh[j]); + } + return unlocked; +} + +/************************************************************************** + * Algorithm SearchByKey * + * look for item in the Disk S+Tree by its key * + * Input: sb - super block * + * key - pointer to the key to search * + * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * + * search_path - path from the root to the needed leaf * + **************************************************************************/ + +/* This function fills up the path from the root to the leaf as it + descends the tree looking for the key. It uses reiserfs_bread to + try to find buffers in the cache given their block number. If it + does not find them in the cache it reads them from disk. For each + node search_by_key finds using reiserfs_bread it then uses + bin_search to look through that node. bin_search will find the + position of the block_number of the next node if it is looking + through an internal node. If it is looking through a leaf node + bin_search will find the position of the item which has key either + equal to given key, or which is the maximal key less than the given + key. search_by_key returns a path that must be checked for the + correctness of the top of the path but need not be checked for the + correctness of the bottom of the path */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. 
*/ + struct treepath *search_path,/* This structure was + allocated and initialized + by the calling + function. It is filled up + by this function. */ + int stop_level /* How far down the tree to search. To + stop at leaf level - set to + DISK_LEAF_NODE_LEVEL */ + ) +{ + b_blocknr_t block_number; + int expected_level; + struct buffer_head *bh; + struct path_element *last_element; + int node_level, retval; + int right_neighbor_of_leaf_node; + int fs_gen; + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; + int reada_count = 0; + +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + + PROC_INFO_INC(sb, search_by_key); + + /* As we add each node to a path we increase its count. This means that + we must be careful to release all nodes in a path before we either + discard the path struct or re-use the path struct, as we do here. */ + + pathrelse(search_path); + + right_neighbor_of_leaf_node = 0; + + /* With each iteration of this loop we search through the items in the + current node, and calculate the next current node(next path element) + for the next iteration of this loop.. */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + while (1) { + +#ifdef CONFIG_REISERFS_CHECK + if (!(++repeat_counter % 50000)) + reiserfs_warning(sb, "PAP-5100", + "%s: there were %d iterations of " + "while loop looking for key %K", + current->comm, repeat_counter, + key); +#endif + + /* prep path to have another element added to it. */ + last_element = + PATH_OFFSET_PELEMENT(search_path, + ++search_path->path_length); + fs_gen = get_generation(sb); + + /* Read the next tree node, and set the last element in the path to + have a pointer to it. */ + if ((bh = last_element->pe_buffer = + sb_getblk(sb, block_number))) { + bool unlocked = false; + + if (!buffer_uptodate(bh) && reada_count > 1) + /* may unlock the write lock */ + unlocked = search_by_key_reada(sb, reada_bh, + reada_blocks, reada_count); + /* + * If we haven't already unlocked the write lock, + * then we need to do that here before reading + * the current block + */ + if (!buffer_uptodate(bh) && !unlocked) { + reiserfs_write_unlock(sb); + unlocked = true; + } + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + + if (unlocked) + reiserfs_write_lock(sb); + if (!buffer_uptodate(bh)) + goto io_error; + } else { + io_error: + search_path->path_length--; + pathrelse(search_path); + return IO_ERROR; + } + reada_count = 0; + if (expected_level == -1) + expected_level = SB_TREE_HEIGHT(sb); + expected_level--; + + /* It is possible that schedule occurred. We must check whether the key + to search is still in the tree rooted from the current buffer. If + not then repeat search from the root. */ + if (fs_changed(fs_gen, sb) && + (!B_IS_IN_TREE(bh) || + B_LEVEL(bh) != expected_level || + !key_in_buffer(search_path, key, sb))) { + PROC_INFO_INC(sb, search_by_key_fs_changed); + PROC_INFO_INC(sb, search_by_key_restarted); + PROC_INFO_INC(sb, + sbk_restarted[expected_level - 1]); + pathrelse(search_path); + + /* Get the root block number so that we can repeat the search + starting from the root. */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } + + /* only check that the key is in the buffer if key is not + equal to the MAX_KEY. Latter case is only possible in + "finish_unfinished()" processing during mount. 
*/ + RFALSE(comp_keys(&MAX_KEY, key) && + !key_in_buffer(search_path, key, sb), + "PAP-5130: key is not in the buffer"); +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(sb)->cur_tb) { + print_cur_tb("5140"); + reiserfs_panic(sb, "PAP-5140", + "schedule occurred in do_balance!"); + } +#endif + + // make sure, that the node contents look like a node of + // certain level + if (!is_tree_node(bh, expected_level)) { + reiserfs_error(sb, "vs-5150", + "invalid format found in block %ld. " + "Fsck?", bh->b_blocknr); + pathrelse(search_path); + return IO_ERROR; + } + + /* ok, we have acquired next formatted node in the tree */ + node_level = B_LEVEL(bh); + + PROC_INFO_BH_STAT(sb, bh, node_level - 1); + + RFALSE(node_level < stop_level, + "vs-5152: tree level (%d) is less than stop level (%d)", + node_level, stop_level); + + retval = bin_search(key, B_N_PITEM_HEAD(bh, 0), + B_NR_ITEMS(bh), + (node_level == + DISK_LEAF_NODE_LEVEL) ? IH_SIZE : + KEY_SIZE, + &(last_element->pe_position)); + if (node_level == stop_level) { + return retval; + } + + /* we are not in the stop level */ + if (retval == ITEM_FOUND) + /* item has been found, so we choose the pointer which is to the right of the found one */ + last_element->pe_position++; + + /* if item was not found we choose the position which is to + the left of the found item. This requires no code, + bin_search did it already. */ + + /* So we have chosen a position in the current node which is + an internal node. Now we calculate child block number by + position in the node. */ + block_number = + B_N_CHILD_NUM(bh, last_element->pe_position); + + /* if we are going to read leaf nodes, try for read ahead as well */ + if ((search_path->reada & PATH_READA) && + node_level == DISK_LEAF_NODE_LEVEL + 1) { + int pos = last_element->pe_position; + int limit = B_NR_ITEMS(bh); + struct reiserfs_key *le_key; + + if (search_path->reada & PATH_READA_BACK) + limit = 0; + while (reada_count < SEARCH_BY_KEY_READA) { + if (pos == limit) + break; + reada_blocks[reada_count++] = + B_N_CHILD_NUM(bh, pos); + if (search_path->reada & PATH_READA_BACK) + pos--; + else + pos++; + + /* + * check to make sure we're in the same object + */ + le_key = B_N_PDELIM_KEY(bh, pos); + if (le32_to_cpu(le_key->k_objectid) != + key->on_disk_key.k_objectid) { + break; + } + } + } + } +} + +/* Form the path to an item and position in this item which contains + file byte defined by key. If there is no such item + corresponding to the key, we point the path to the item with + maximal key less than key, and *pos_in_item is set to one + past the last entry/byte in the item. If searching for entry in a + directory item, and it is not found, *pos_in_item is set to one + entry more than the entry with maximal key which is less than the + sought key. + + Note that if there is no entry in this same node which is one more, + then we point to an imaginary entry. for direct items, the + position is in units of bytes, for indirect items the position is + in units of blocknr entries, for directory items the position is in + units of directory entries. */ + +/* The function is NOT SCHEDULE-SAFE! */ +int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */ + const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ + struct treepath *search_path /* Filled up by this function. 
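   (Illustrative example, offsets assumed: with a 4096-byte block size, an
   indirect item whose key offset is 4097 and which holds, say, four
   unformatted-node pointers covers file bytes 4097..20480, one pointer per
   4096 bytes.  Looking up key offset 12289 inside that item yields
   POSITION_FOUND with
        pos_in_item = (12289 - 4097) / 4096 = 2,
   i.e. the third pointer; for a direct item the division is skipped and
   pos_in_item stays in bytes.)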
*/ + ) +{ + struct item_head *p_le_ih; /* pointer to on-disk structure */ + int blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if (is_direntry_cpu_key(p_cpu_key)) + return search_by_entry_key(sb, p_cpu_key, search_path, + &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item(sb, p_cpu_key, search_path); + if (retval == IO_ERROR) + return retval; + if (retval == ITEM_FOUND) { + + RFALSE(!ih_item_len + (B_N_PITEM_HEAD + (PATH_PLAST_BUFFER(search_path), + PATH_LAST_POSITION(search_path))), + "PAP-5165: item length equals zero"); + + pos_in_item(search_path) = 0; + return POSITION_FOUND; + } + + RFALSE(!PATH_LAST_POSITION(search_path), + "PAP-5170: position equals zero"); + + /* Item is not found. Set path to the previous item. */ + p_le_ih = + B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path), + --PATH_LAST_POSITION(search_path)); + blk_size = sb->s_blocksize; + + if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { + return FILE_NOT_FOUND; + } + // FIXME: quite ugly this far + + item_offset = le_ih_k_offset(p_le_ih); + offset = cpu_key_k_offset(p_cpu_key); + + /* Needed byte is contained in the item pointed to by the path. */ + if (item_offset <= offset && + item_offset + op_bytes_number(p_le_ih, blk_size) > offset) { + pos_in_item(search_path) = offset - item_offset; + if (is_indirect_le_ih(p_le_ih)) { + pos_in_item(search_path) /= blk_size; + } + return POSITION_FOUND; + } + + /* Needed byte is not contained in the item pointed to by the + path. Set pos_in_item out of the item. */ + if (is_indirect_le_ih(p_le_ih)) + pos_in_item(search_path) = + ih_item_len(p_le_ih) / UNFM_P_SIZE; + else + pos_in_item(search_path) = ih_item_len(p_le_ih); + + return POSITION_NOT_FOUND; +} + +/* Compare given item and item pointed to by the path. */ +int comp_items(const struct item_head *stored_ih, const struct treepath *path) +{ + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + struct item_head *ih; + + /* Last buffer at the path is not in the tree. */ + if (!B_IS_IN_TREE(bh)) + return 1; + + /* Last path position is invalid. */ + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh)) + return 1; + + /* we need only to know, whether it is the same item */ + ih = get_ih(path); + return memcmp(stored_ih, ih, IH_SIZE); +} + +/* unformatted nodes are not logged anymore, ever. This is safe +** now +*/ +#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) + +// block can not be forgotten as it is in I/O or held by someone +#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) + +// prepare for delete or cut of direct item +static inline int prepare_for_direct_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, int *cut_size) +{ + loff_t round_len; + + if (new_file_length == max_reiserfs_offset(inode)) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + // new file gets truncated + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { + // + round_len = ROUND_UP(new_file_length); + /* this was new_file_length < le_ih ... */ + if (round_len < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. 
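   (Worked example with assumed numbers; this also assumes ROUND_UP() pads
   the new length up to the 8-byte granularity used for v3.6 direct items:
   a direct item with key offset 4097 and ih_item_len 200 holds file bytes
   4097..4296.  Truncating to new_file_length 4196 gives round_len 4200,
   so pos_in_item becomes 4200 - 4096 = 104 and cut_size becomes
   -(200 - 104) = -96, i.e. the last 96 bytes are cut and the first 104
   are kept.)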
*/ + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1); + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. */ + } + + // old file: items may have any length + + if (new_file_length < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + *cut_size = -(ih_item_len(le_ih) - + (pos_in_item(path) = + new_file_length + 1 - le_ih_k_offset(le_ih))); + return M_CUT; /* Cut from this item. */ +} + +static inline int prepare_for_direntry_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, + int *cut_size) +{ + if (le_ih_k_offset(le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset(inode)) { + RFALSE(ih_entry_count(le_ih) != 2, + "PAP-5220: incorrect empty directory item (%h)", le_ih); + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + } + + if (ih_entry_count(le_ih) == 1) { + /* Delete the directory item such as there is one record only + in this item */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = + -(DEH_SIZE + + entry_length(get_last_bh(path), le_ih, pos_in_item(path))); + return M_CUT; +} + +#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) + +/* If the path points to a directory or direct item, calculate mode and the size cut, for balance. + If the path points to an indirect item, remove some number of its unformatted nodes. + In case of file truncate calculate whether this item must be deleted/truncated or last + unformatted node of this item will be converted to a direct item. + This function returns a determination of what balance mode the calling function should employ. */ +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed + from end of the file. */ + int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */ + ) +{ + struct super_block *sb = inode->i_sb; + struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + + BUG_ON(!th->t_trans_id); + + /* Stat_data item. */ + if (is_statdata_le_ih(p_le_ih)) { + + RFALSE(new_file_length != max_reiserfs_offset(inode), + "PAP-5210: mode must be M_DELETE"); + + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + return M_DELETE; + } + + /* Directory item. */ + if (is_direntry_le_ih(p_le_ih)) + return prepare_for_direntry_item(path, p_le_ih, inode, + new_file_length, + cut_size); + + /* Direct item. */ + if (is_direct_le_ih(p_le_ih)) + return prepare_for_direct_item(path, p_le_ih, inode, + new_file_length, cut_size); + + /* Case of an indirect item. 
*/ + { + int blk_size = sb->s_blocksize; + struct item_head s_ih; + int need_re_search; + int delete = 0; + int result = M_CUT; + int pos = 0; + + if ( new_file_length == max_reiserfs_offset (inode) ) { + /* prepare_for_delete_or_cut() is called by + * reiserfs_delete_item() */ + new_file_length = 0; + delete = 1; + } + + do { + need_re_search = 0; + *cut_size = 0; + bh = PATH_PLAST_BUFFER(path); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + pos = I_UNFM_NUM(&s_ih); + + while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { + __le32 *unfm; + __u32 block; + + /* Each unformatted block deletion may involve one additional + * bitmap block into the transaction, thereby the initial + * journal space reservation might not be enough. */ + if (!delete && (*cut_size) != 0 && + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) + break; + + unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1; + block = get_block_num(unfm, 0); + + if (block != 0) { + reiserfs_prepare_for_journal(sb, bh, 1); + put_block_num(unfm, 0, 0); + journal_mark_dirty(th, sb, bh); + reiserfs_free_block(th, inode, block, 1); + } + + reiserfs_write_unlock(sb); + cond_resched(); + reiserfs_write_lock(sb); + + if (item_moved (&s_ih, path)) { + need_re_search = 1; + break; + } + + pos --; + (*removed)++; + (*cut_size) -= UNFM_P_SIZE; + + if (pos == 0) { + (*cut_size) -= IH_SIZE; + result = M_DELETE; + break; + } + } + /* a trick. If the buffer has been logged, this will do nothing. If + ** we've broken the loop without logging it, it will restore the + ** buffer */ + reiserfs_restore_prepared_buffer(sb, bh); + } while (need_re_search && + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); + pos_in_item(path) = pos * UNFM_P_SIZE; + + if (*cut_size == 0) { + /* Nothing were cut. maybe convert last unformatted node to the + * direct item? */ + result = M_CONVERT; + } + return result; + } +} + +/* Calculate number of bytes which will be deleted or cut during balance */ +static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) +{ + int del_size; + struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path); + + if (is_statdata_le_ih(p_le_ih)) + return 0; + + del_size = + (mode == + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; + if (is_direntry_le_ih(p_le_ih)) { + /* return EMPTY_DIR_SIZE; We delete emty directoris only. + * we can't use EMPTY_DIR_SIZE, as old format dirs have a different + * empty size. ick. FIXME, is this right? 
*/ + return del_size; + } + + if (is_indirect_le_ih(p_le_ih)) + del_size = (del_size / UNFM_P_SIZE) * + (PATH_PLAST_BUFFER(tb->tb_path)->b_size); + return del_size; +} + +static void init_tb_struct(struct reiserfs_transaction_handle *th, + struct tree_balance *tb, + struct super_block *sb, + struct treepath *path, int size) +{ + + BUG_ON(!th->t_trans_id); + + memset(tb, '\0', sizeof(struct tree_balance)); + tb->transaction_handle = th; + tb->tb_sb = sb; + tb->tb_path = path; + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + tb->insert_size[0] = size; +} + +void padd_item(char *item, int total_length, int length) +{ + int i; + + for (i = total_length; i > length;) + item[--i] = 0; +} + +#ifdef REISERQUOTA_DEBUG +char key2type(struct reiserfs_key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif + +/* Delete object item. + * th - active transaction handle + * path - path to the deleted item + * item_key - key to search for the deleted item + * indode - used for updating i_blocks and quotas + * un_bh - NULL or unformatted node pointer + */ +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *item_key, + struct inode *inode, struct buffer_head *un_bh) +{ + struct super_block *sb = inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; + int ret_value, del_size, removed; + +#ifdef CONFIG_REISERFS_CHECK + char mode; + int iter = 0; +#endif + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_del_balance, sb, path, + 0 /*size is unknown */ ); + + while (1) { + removed = 0; + +#ifdef CONFIG_REISERFS_CHECK + iter++; + mode = +#endif + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &del_size, + max_reiserfs_offset(inode)); + + RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); + + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + s_del_balance.insert_size[0] = del_size; + + ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, delete_item_restarted); + + // file system changed, repeat search + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == IO_ERROR) + break; + if (ret_value == FILE_NOT_FOUND) { + reiserfs_warning(sb, "vs-5340", + "no items of the file %K found", + item_key); + break; + } + } /* while (1) */ + + if (ret_value != CARRY_ON) { + unfix_nodes(&s_del_balance); + return 0; + } + // reiserfs_delete_item returns item length when success + ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = get_ih(path); + quota_cut_bytes = ih_item_len(q_ih); + + /* hack so the quota code doesn't have to guess if the file + ** has a tail. On tail insert, we allocate quota for 1 unformatted node. 
+ ** We test the offset because the tail might have been + ** split into multiple items, and we only want to decrement for + ** the unfm node once + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } + + if (un_bh) { + int off; + char *data; + + /* We are in direct2indirect conversion, so move tail contents + to the unformatted node */ + /* note, we do the copy before preparing the buffer because we + ** don't care about the contents of the unformatted node yet. + ** the only thing we really care about is the direct item's data + ** is in the unformatted node. + ** + ** Otherwise, we would have to call reiserfs_prepare_for_journal on + ** the unformatted node, which might schedule, meaning we'd have to + ** loop all the way back up to the start of the while loop. + ** + ** The unformatted node must be dirtied later on. We can't be + ** sure here if the entire tail has been deleted yet. + ** + ** un_bh is from the page cache (all unformatted nodes are + ** from the page cache) and might be a highmem page. So, we + ** can't use un_bh->b_data. + ** -clm + */ + + data = kmap_atomic(un_bh->b_page, KM_USER0); + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + memcpy(data + off, + B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih), + ret_value); + kunmap_atomic(data, KM_USER0); + } + /* Perform balancing after all resources have been collected at once. */ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "reiserquota delete_item(): freeing %u, id=%u type=%c", + quota_cut_bytes, inode->i_uid, head2type(&s_ih)); +#endif + dquot_free_space_nodirty(inode, quota_cut_bytes); + + /* Return deleted body length */ + return ret_value; +} + +/* Summary Of Mechanisms For Handling Collisions Between Processes: + + deletion of the body of the object is performed by iput(), with the + result that if multiple processes are operating on a file, the + deletion of the body of the file is deferred until the last process + that has an open inode performs its iput(). + + writes and truncates are protected from collisions by use of + semaphores. + + creates, linking, and mknod are protected from collisions with other + processes by making the reiserfs_add_entry() the last step in the + creation, and then rolling back all changes if there was a collision. + - Hans +*/ + +/* this deletes item which never gets split */ +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key) +{ + struct tree_balance tb; + INITIALIZE_PATH(path); + int item_len = 0; + int tb_init = 0; + struct cpu_key cpu_key; + int retval; + int quota_cut_bytes = 0; + + BUG_ON(!th->t_trans_id); + + le_key2cpu_key(&cpu_key, key); + + while (1) { + retval = search_item(th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_error(th->t_super, "vs-5350", + "i/o failure occurred trying " + "to delete %K", &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse(&path); + // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir + if (! 
+ ((unsigned long long) + GET_HASH_VALUE(le_key_k_offset + (le_key_version(key), key)) == 0 + && (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset + (le_key_version(key), + key)) == 1)) + reiserfs_warning(th->t_super, "vs-5355", + "%k not found", key); + break; + } + if (!tb_init) { + tb_init = 1; + item_len = ih_item_len(PATH_PITEM_HEAD(&path)); + init_tb_struct(th, &tb, th->t_super, &path, + -(IH_SIZE + item_len)); + } + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)); + + retval = fix_nodes(M_DELETE, &tb, NULL, NULL); + if (retval == REPEAT_SEARCH) { + PROC_INFO_INC(th->t_super, delete_solid_item_restarted); + continue; + } + + if (retval == CARRY_ON) { + do_balance(&tb, NULL, NULL, M_DELETE); + if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota delete_solid_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, + key2type(key)); +#endif + dquot_free_space_nodirty(inode, + quota_cut_bytes); + } + break; + } + // IO_ERROR, NO_DISK_SPACE, etc + reiserfs_warning(th->t_super, "vs-5360", + "could not delete %K due to fix_nodes failure", + &cpu_key); + unfix_nodes(&tb); + break; + } + + reiserfs_check_path(&path); +} + +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + int err; + inode->i_size = 0; + BUG_ON(!th->t_trans_id); + + /* for directory this deletes item containing "." and ".." */ + err = + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ ); + if (err) + return err; + +#if defined( USE_INODE_GENERATION_COUNTER ) + if (!old_format_only(th->t_super)) { + __le32 *inode_generation; + + inode_generation = + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; + le32_add_cpu(inode_generation, 1); + } +/* USE_INODE_GENERATION_COUNTER */ +#endif + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + + return err; +} + +static void unmap_buffers(struct page *page, loff_t pos) +{ + struct buffer_head *bh; + struct buffer_head *head; + struct buffer_head *next; + unsigned long tail_index; + unsigned long cur_index; + + if (page) { + if (page_has_buffers(page)) { + tail_index = pos & (PAGE_CACHE_SIZE - 1); + cur_index = 0; + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + + /* we want to unmap the buffers that contain the tail, and + ** all the buffers after it (since the tail must be at the + ** end of the file). We don't want to unmap file data + ** before the tail, since it might be dirty and waiting to + ** reach disk + */ + cur_index += bh->b_size; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } + } +} + +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct page *page, + struct treepath *path, + const struct cpu_key *item_key, + loff_t new_file_size, char *mode) +{ + struct super_block *sb = inode->i_sb; + int block_size = sb->s_blocksize; + int cut_bytes; + BUG_ON(!th->t_trans_id); + BUG_ON(new_file_size != inode->i_size); + + /* the page being sent in could be NULL if there was an i/o error + ** reading in the last block. 
The user will hit problems trying to + ** read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&inode->i_count) > 1 || + !tail_has_to_be_packed(inode) || + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { + /* leave tail in an unformatted node */ + *mode = M_SKIP_BALANCING; + cut_bytes = + block_size - (new_file_size & (block_size - 1)); + pathrelse(path); + return cut_bytes; + } + /* Perform the conversion to a direct_item. */ + /* return indirect_to_direct(inode, path, item_key, + new_file_size, mode); */ + return indirect2direct(th, inode, page, path, item_key, + new_file_size, mode); +} + +/* we did indirect_to_direct conversion. And we have inserted direct + item successesfully, but there were no disk space to cut unfm + pointer being converted. Therefore we have to delete inserted + direct item(s) */ +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct cpu_key tail_key; + int tail_len; + int removed; + BUG_ON(!th->t_trans_id); + + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!! + tail_key.key_length = 4; + + tail_len = + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key(inode->i_sb, &tail_key, path) == + POSITION_NOT_FOUND) + reiserfs_panic(inode->i_sb, "vs-5615", + "found invalid item"); + RFALSE(path->pos_in_item != + ih_item_len(PATH_PITEM_HEAD(path)) - 1, + "vs-5616: appended bytes found"); + PATH_LAST_POSITION(path)--; + + removed = + reiserfs_delete_item(th, path, &tail_key, inode, + NULL /*unbh not needed */ ); + RFALSE(removed <= 0 + || removed > tail_len, + "vs-5617: there was tail %d bytes, removed item length %d bytes", + tail_len, removed); + tail_len -= removed; + set_cpu_key_k_offset(&tail_key, + cpu_key_k_offset(&tail_key) - removed); + } + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " + "conversion has been rolled back due to " + "lack of disk space"); + //mark_file_without_tail (inode); + mark_inode_dirty(inode); +} + +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + struct cpu_key *item_key, + struct inode *inode, + struct page *page, loff_t new_file_size) +{ + struct super_block *sb = inode->i_sb; + /* Every function which is going to call do_balance must first + create a tree_balance structure. Then it must fill up this + structure by using the init_tb_struct and fix_nodes functions. + After that we can make tree balancing. */ + struct tree_balance s_cut_balance; + struct item_head *p_le_ih; + int cut_size = 0, /* Amount to be cut. */ + ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */ + is_inode_locked = 0; + char mode; /* Mode of the balance. */ + int retval2 = -1; + int quota_cut_bytes; + loff_t tail_pos = 0; + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_cut_balance, inode->i_sb, path, + cut_size); + + /* Repeat this loop until we either cut the item without needing + to balance, or we fix_nodes without schedule occurring */ + while (1) { + /* Determine the balance mode, position of the first byte to + be cut, and size to be cut. In case of the indirect item + free unformatted nodes which are pointed to by the cut + pointers. 
*/ + + mode = + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &cut_size, new_file_size); + if (mode == M_CONVERT) { + /* convert last unformatted node to direct item or leave + tail in the unformatted node */ + RFALSE(ret_value != CARRY_ON, + "PAP-5570: can not convert twice"); + + ret_value = + maybe_indirect_to_direct(th, inode, page, + path, item_key, + new_file_size, &mode); + if (mode == M_SKIP_BALANCING) + /* tail has been left in the unformatted node */ + return ret_value; + + is_inode_locked = 1; + + /* removing of last unformatted node will change value we + have to return to truncate. Save it */ + retval2 = ret_value; + /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */ + + /* So, we have performed the first part of the conversion: + inserting the new direct item. Now we are removing the + last unformatted node pointer. Set key to search for + it. */ + set_cpu_key_k_type(item_key, TYPE_INDIRECT); + item_key->key_length = 4; + new_file_size -= + (new_file_size & (sb->s_blocksize - 1)); + tail_pos = new_file_size; + set_cpu_key_k_offset(item_key, new_file_size + 1); + if (search_for_position_by_key + (sb, item_key, + path) == POSITION_NOT_FOUND) { + print_block(PATH_PLAST_BUFFER(path), 3, + PATH_LAST_POSITION(path) - 1, + PATH_LAST_POSITION(path) + 1); + reiserfs_panic(sb, "PAP-5580", "item to " + "convert does not exist (%K)", + item_key); + } + continue; + } + if (cut_size == 0) { + pathrelse(path); + return 0; + } + + s_cut_balance.insert_size[0] = cut_size; + + ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, cut_from_item_restarted); + + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == POSITION_FOUND) + continue; + + reiserfs_warning(sb, "PAP-5610", "item %K not found", + item_key); + unfix_nodes(&s_cut_balance); + return (ret_value == IO_ERROR) ? -EIO : -ENOENT; + } /* while */ + + // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + if (ret_value != CARRY_ON) { + if (is_inode_locked) { + // FIXME: this seems to be not needed: we are always able + // to cut item + indirect_to_direct_roll_back(th, inode, path); + } + if (ret_value == NO_DISK_SPACE) + reiserfs_warning(sb, "reiserfs-5092", + "NO_DISK_SPACE"); + unfix_nodes(&s_cut_balance); + return -EIO; + } + + /* go ahead and perform balancing */ + + RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode"); + + /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = + (mode == + M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance. + insert_size[0]; + if (retval2 == -1) + ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); + else + ret_value = retval2; + + /* For direct items, we only change the quota when deleting the last + ** item. + */ + p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (mode == M_DELETE && + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == + 1) { + // FIXME: this is to keep 3.5 happy + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } +#ifdef CONFIG_REISERFS_CHECK + if (is_inode_locked) { + struct item_head *le_ih = + PATH_PITEM_HEAD(s_cut_balance.tb_path); + /* we are going to complete indirect2direct conversion. 
Make + sure, that we exactly remove last unformatted node pointer + of the item */ + if (!is_indirect_le_ih(le_ih)) + reiserfs_panic(sb, "vs-5652", + "item must be indirect %h", le_ih); + + if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) + reiserfs_panic(sb, "vs-5653", "completing " + "indirect2direct conversion indirect " + "item %h being deleted must be of " + "4 byte long", le_ih); + + if (mode == M_CUT + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic(sb, "vs-5654", "can not complete " + "indirect2direct conversion of %h " + "(CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* it would be useful to make sure, that right neighboring + item is direct item of this file */ + } +#endif + + do_balance(&s_cut_balance, NULL, NULL, mode); + if (is_inode_locked) { + /* we've done an indirect->direct conversion. when the data block + ** was freed, it was removed from the list of blocks that must + ** be flushed before the transaction commits, make sure to + ** unmap and invalidate it + */ + unmap_buffers(page, tail_pos); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + } +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota cut_from_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, '?'); +#endif + dquot_free_space_nodirty(inode, quota_cut_bytes); + return ret_value; +} + +static void truncate_directory(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + BUG_ON(!th->t_trans_id); + if (inode->i_nlink) + reiserfs_error(inode->i_sb, "vs-5655", "link count != 0"); + + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + reiserfs_update_sd(th, inode); + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); +} + +/* Truncate file to the new size. Note, this must be called with a transaction + already started */ +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, /* ->i_size contains new size */ + struct page *page, /* up to date for last block */ + int update_timestamps /* when it is called by + file_release to convert + the tail - no timestamps + should be updated */ + ) +{ + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ + struct item_head *p_le_ih; /* Pointer to an item header. */ + struct cpu_key s_item_key; /* Key to search for a previous file item. */ + loff_t file_size, /* Old file size. */ + new_file_size; /* New file size. */ + int deleted; /* Number of deleted or truncated bytes. */ + int retval; + int err = 0; + + BUG_ON(!th->t_trans_id); + if (! + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + return 0; + + if (S_ISDIR(inode->i_mode)) { + // deletion of directory - no need to update timestamps + truncate_directory(th, inode); + return 0; + } + + /* Get new file size. 
*/ + new_file_size = inode->i_size; + + // FIXME: note, that key type is unimportant here + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), + TYPE_DIRECT, 3); + + retval = + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-5657", + "i/o failure occurred trying to truncate %K", + &s_item_key); + err = -EIO; + goto out; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_error(inode->i_sb, "PAP-5660", + "wrong result %d of search for %K", retval, + &s_item_key); + + err = -EIO; + goto out; + } + + s_search_path.pos_in_item--; + + /* Get real file size (total length of all file items) */ + p_le_ih = PATH_PITEM_HEAD(&s_search_path); + if (is_statdata_le_ih(p_le_ih)) + file_size = 0; + else { + loff_t offset = le_ih_k_offset(p_le_ih); + int bytes = + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); + + /* this may mismatch with real file size: if last direct item + had no padding zeros and last unformatted node had no free + space, this file would have this file size */ + file_size = offset + bytes - 1; + } + /* + * are we doing a full truncate or delete, if so + * kick in the reada code + */ + if (new_file_size == 0) + s_search_path.reada = PATH_READA | PATH_READA_BACK; + + if (file_size == 0 || file_size < new_file_size) { + goto update_and_out; + } + + /* Update key to search for the last file item. */ + set_cpu_key_k_offset(&s_item_key, file_size); + + do { + /* Cut or delete file item. */ + deleted = + reiserfs_cut_from_item(th, &s_search_path, &s_item_key, + inode, page, new_file_size); + if (deleted < 0) { + reiserfs_warning(inode->i_sb, "vs-5665", + "reiserfs_cut_from_item failed"); + reiserfs_check_path(&s_search_path); + return 0; + } + + RFALSE(deleted > file_size, + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", + deleted, file_size, &s_item_key); + + /* Change key to search the last file item. */ + file_size -= deleted; + + set_cpu_key_k_offset(&s_item_key, file_size); + + /* While there are bytes to truncate and previous file item is presented in the tree. */ + + /* + ** This loop could take a really long time, and could log + ** many more blocks than a transaction can hold. 
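The loop below keeps cutting items until the new size is reached, and the surrounding commentary explains why that work is split across several transactions. As a rough illustration only, here is a stand-alone, user-space sketch of that chunked-commit idea; the helpers, the step limit and the 4 KiB "item" size are invented and merely stand in for the real journal_end()/journal_begin() calls:

#include <stdio.h>
#include <stdint.h>

#define TOY_STEP_LIMIT 8		/* max cuts per "transaction" in this toy model */

struct toy_txn {
	int ops;			/* work done in the current transaction */
};

/* stand-in for journal_end() + journal_begin(): record a size the file is
   consistent at, then start counting a fresh transaction */
static void toy_commit(struct toy_txn *txn, uint64_t consistent_size)
{
	printf("commit after %d cuts, size on disk now %llu\n",
	       txn->ops, (unsigned long long)consistent_size);
	txn->ops = 0;
}

/* shrink cur_size toward new_size one 4 KiB "item" at a time */
static void toy_truncate(uint64_t cur_size, uint64_t new_size)
{
	struct toy_txn txn = { 0 };

	while (cur_size > new_size) {
		cur_size -= 4096;		/* cut one item's worth */
		txn.ops++;

		if (txn.ops >= TOY_STEP_LIMIT)	/* polite restart point */
			toy_commit(&txn, cur_size);
	}
	if (txn.ops)
		toy_commit(&txn, cur_size);	/* final commit */
}

int main(void)
{
	toy_truncate(1024 * 1024, 64 * 1024);	/* 1 MiB down to 64 KiB */
	return 0;
}
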
So, we do a polite + ** journal end here, and if the transaction needs ending, we make + ** sure the file is consistent before ending the current trans + ** and starting a new one + */ + if (journal_transaction_should_end(th, 0) || + reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { + int orig_len_alloc = th->t_blocks_allocated; + pathrelse(&s_search_path); + + if (update_timestamps) { + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + + err = journal_end(th, inode->i_sb, orig_len_alloc); + if (err) + goto out; + err = journal_begin(th, inode->i_sb, + JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; + if (err) + goto out; + reiserfs_update_inode_transaction(inode); + } + } while (file_size > ROUND_UP(new_file_size) && + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path) == POSITION_FOUND); + + RFALSE(file_size > ROUND_UP(new_file_size), + "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + new_file_size, file_size, s_item_key.on_disk_key.k_objectid); + + update_and_out: + if (update_timestamps) { + // this is truncate, not file closing + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + + out: + pathrelse(&s_search_path); + return err; +} + +#ifdef CONFIG_REISERFS_CHECK +// this makes sure, that we __append__, not overwrite or add holes +static void check_research_for_paste(struct treepath *path, + const struct cpu_key *key) +{ + struct item_head *found_ih = get_ih(path); + + if (is_direct_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + pos_in_item(path)) + reiserfs_panic(NULL, "PAP-5720", "found direct item " + "%h or position (%d) does not match " + "to key %K", found_ih, + pos_in_item(path), key); + } + if (is_indirect_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || I_UNFM_NUM(found_ih) != pos_in_item(path) + || get_ih_free_space(found_ih) != 0) + reiserfs_panic(NULL, "PAP-5730", "found indirect " + "item (%h) or position (%d) does not " + "match to key (%K)", + found_ih, pos_in_item(path), key); + } +} +#endif /* config reiserfs check */ + +/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */ + const struct cpu_key *key, /* Key to search for the needed item. */ + struct inode *inode, /* Inode item belongs to */ + const char *body, /* Pointer to the bytes to paste. */ + int pasted_size) +{ /* Size of pasted bytes. 
*/ + struct tree_balance s_paste_balance; + int retval; + int fs_gen; + + BUG_ON(!th->t_trans_id); + + fs_gen = get_generation(inode->i_sb); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): allocating %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&(key->on_disk_key))); +#endif + + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { + pathrelse(search_path); + return retval; + } + init_tb_struct(th, &s_paste_balance, th->t_super, search_path, + pasted_size); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_paste_balance.key = key->on_disk_key; +#endif + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_PASTE, &s_paste_balance, NULL, + body)) == REPEAT_SEARCH) { + search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, paste_into_item_restarted); + retval = + search_for_position_by_key(th->t_super, key, + search_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "PAP-5710", + "entry or pasted byte (%K) exists", + key); + retval = -EEXIST; + goto error_out; + } +#ifdef CONFIG_REISERFS_CHECK + check_research_for_paste(search_path, key); +#endif + } + + /* Perform balancing after all resources are collected by fix_nodes, and + accessing them will not risk triggering schedule. */ + if (retval == CARRY_ON) { + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); + return 0; + } + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; + error_out: + /* this also releases the path */ + unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): freeing %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&(key->on_disk_key))); +#endif + dquot_free_space_nodirty(inode, pasted_size); + return retval; +} + +/* Insert new item into the buffer at the path. + * th - active transaction handle + * path - path to the inserted item + * ih - pointer to the item header to insert + * body - pointer to the bytes to insert + */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *key, + struct item_head *ih, struct inode *inode, + const char *body) +{ + struct tree_balance s_ins_balance; + int retval; + int fs_gen = 0; + int quota_bytes = 0; + + BUG_ON(!th->t_trans_id); + + if (inode) { /* Do we count quotas for item? */ + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(ih); + + /* hack so the quota code doesn't have to guess if the file has + ** a tail, links are always tails, so there's no guessing needed + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): allocating %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + /* We can't dirty inode here. It would be immediately written but + * appropriate stat item isn't inserted yet... 
*/ + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { + pathrelse(path); + return retval; + } + } + init_tb_struct(th, &s_ins_balance, th->t_super, path, + IH_SIZE + ih_item_len(ih)); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_ins_balance.key = key->on_disk_key; +#endif + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_INSERT, &s_ins_balance, ih, + body)) == REPEAT_SEARCH) { + search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, insert_item_restarted); + retval = search_item(th->t_super, key, path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == ITEM_FOUND) { + reiserfs_warning(th->t_super, "PAP-5760", + "key %K already exists in the tree", + key); + retval = -EEXIST; + goto error_out; + } + } + + /* make balancing after all resources will be collected at a time */ + if (retval == CARRY_ON) { + do_balance(&s_ins_balance, ih, body, M_INSERT); + return 0; + } + + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; + error_out: + /* also releases the path */ + unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): freeing %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + if (inode) + dquot_free_space_nodirty(inode, quota_bytes); + return retval; +} diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c new file mode 100644 index 00000000..f19dfbf6 --- /dev/null +++ b/fs/reiserfs/super.c @@ -0,0 +1,2271 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to add the LFS fixes + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. 
+ * + * NO WARRANTY + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct file_system_type reiserfs_fs_type; + +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; + +int is_reiserfs_3_5(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string, + strlen(reiserfs_3_5_magic_string)); +} + +int is_reiserfs_3_6(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string, + strlen(reiserfs_3_6_magic_string)); +} + +int is_reiserfs_jr(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string, + strlen(reiserfs_jr_magic_string)); +} + +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) +{ + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) || + is_reiserfs_jr(rs)); +} + +static int reiserfs_remount(struct super_block *s, int *flags, char *data); +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); + +static int reiserfs_sync_fs(struct super_block *s, int wait) +{ + struct reiserfs_transaction_handle th; + + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. + * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); + return 0; +} + +static void reiserfs_write_super(struct super_block *s) +{ + reiserfs_sync_fs(s, 1); +} + +static int reiserfs_freeze(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!(s->s_flags & MS_RDONLY)) { + int err = journal_begin(&th, s, 1); + if (err) { + reiserfs_block_writes(&th); + } else { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + reiserfs_block_writes(&th); + journal_end_sync(&th, s, 1); + } + } + s->s_dirt = 0; + reiserfs_write_unlock(s); + return 0; +} + +static int reiserfs_unfreeze(struct super_block *s) +{ + reiserfs_allow_writes(s); + return 0; +} + +extern const struct in_core_key MAX_IN_CORE_KEY; + +/* this is used to delete "save link" when there are no items of a + file it points to. 
It can either happen if unlink is completed but + "save unlink" removal, or if file has both unlink and truncate + pending and as unlink completes first (because key of "save link" + protecting unlink is bigger that a key lf "save link" which + protects truncate), so there left no items to make truncate + completion on */ +static int remove_save_link_only(struct super_block *s, + struct reiserfs_key *key, int oid_free) +{ + struct reiserfs_transaction_handle th; + int err; + + /* we are going to do one balancing */ + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + reiserfs_delete_solid_item(&th, NULL, key); + if (oid_free) + /* removals are protected by direct items */ + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); + + return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT); +} + +#ifdef CONFIG_QUOTA +static int reiserfs_quota_on_mount(struct super_block *, int); +#endif + +/* look for uncompleted unlinks and truncates and complete them */ +static int finish_unfinished(struct super_block *s) +{ + INITIALIZE_PATH(path); + struct cpu_key max_cpu_key, obj_key; + struct reiserfs_key save_link_key, last_inode_key; + int retval = 0; + struct item_head *ih; + struct buffer_head *bh; + int item_pos; + char *item; + int done; + struct inode *inode; + int truncate; +#ifdef CONFIG_QUOTA + int i; + int ms_active_set; + int quota_enabled[MAXQUOTAS]; +#endif + + /* compose key to look for "save" links */ + max_cpu_key.version = KEY_FORMAT_3_5; + max_cpu_key.on_disk_key.k_dir_id = ~0U; + max_cpu_key.on_disk_key.k_objectid = ~0U; + set_cpu_key_k_offset(&max_cpu_key, ~0U); + max_cpu_key.key_length = 3; + + memset(&last_inode_key, 0, sizeof(last_inode_key)); + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + if (s->s_flags & MS_ACTIVE) { + ms_active_set = 0; + } else { + ms_active_set = 1; + s->s_flags |= MS_ACTIVE; + } + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + quota_enabled[i] = 1; + if (REISERFS_SB(s)->s_qf_names[i]) { + int ret; + + if (sb_has_quota_active(s, i)) { + quota_enabled[i] = 0; + continue; + } + ret = reiserfs_quota_on_mount(s, i); + if (ret < 0) + reiserfs_warning(s, "reiserfs-2500", + "cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + done = 0; + REISERFS_SB(s)->s_is_unlinked_ok = 1; + while (!retval) { + retval = search_item(s, &max_cpu_key, &path); + if (retval != ITEM_NOT_FOUND) { + reiserfs_error(s, "vs-2140", + "search_by_key returned %d", retval); + break; + } + + bh = get_last_bh(&path); + item_pos = get_item_pos(&path); + if (item_pos != B_NR_ITEMS(bh)) { + reiserfs_warning(s, "vs-2060", + "wrong position found"); + break; + } + item_pos--; + ih = B_N_PITEM_HEAD(bh, item_pos); + + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) + /* there are no "save" links anymore */ + break; + + save_link_key = ih->ih_key; + if (is_indirect_le_ih(ih)) + truncate = 1; + else + truncate = 0; + + /* reiserfs_iget needs k_dirid and k_objectid only */ + item = B_I_PITEM(bh, ih); + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); + obj_key.on_disk_key.k_objectid = + le32_to_cpu(ih->ih_key.k_objectid); + obj_key.on_disk_key.k_offset = 0; + obj_key.on_disk_key.k_type = 0; + + pathrelse(&path); + + inode = reiserfs_iget(s, &obj_key); + if (!inode) { + /* the unlink almost completed, it just did not manage to remove + "save" link and release objectid */ + reiserfs_warning(s, "vs-2180", "iget failed for %K", + &obj_key); + 
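The scan above recognizes a pending operation purely from the shape of the save link: its key lives under the reserved MAX_KEY_OBJECTID directory id, and an indirect item means an interrupted truncate while a direct item means an interrupted unlink (add_save_link further down builds the links the same way). A minimal user-space model of that convention follows; the struct, constants and sizes are illustrative only, not the real on-disk key layout:

#include <stdio.h>
#include <stdint.h>

#define SKETCH_MAX_KEY_OBJECTID 0xffffffffU	/* stand-in for MAX_KEY_OBJECTID */

enum sketch_item_type { SKETCH_DIRECT, SKETCH_INDIRECT };

struct sketch_key {
	uint32_t dir_id;	/* MAX_KEY_OBJECTID marks a save link */
	uint32_t objectid;	/* object id of the file being protected */
	uint64_t offset;	/* 1 for truncate links, blocksize + 1 for unlink links */
	enum sketch_item_type type;
};

/* key of a save link protecting an unfinished unlink */
static struct sketch_key unlink_save_link(uint32_t objectid, unsigned int blocksize)
{
	struct sketch_key k = {
		.dir_id = SKETCH_MAX_KEY_OBJECTID,
		.objectid = objectid,
		.offset = 1 + blocksize,
		.type = SKETCH_DIRECT,
	};
	return k;
}

/* key of a save link protecting an unfinished truncate */
static struct sketch_key truncate_save_link(uint32_t objectid)
{
	struct sketch_key k = {
		.dir_id = SKETCH_MAX_KEY_OBJECTID,
		.objectid = objectid,
		.offset = 1,
		.type = SKETCH_INDIRECT,
	};
	return k;
}

/* mirror of the decision the mount-time scan makes: indirect item -> finish
   a truncate, direct item -> finish an unlink */
static const char *pending_work(const struct sketch_key *k)
{
	if (k->dir_id != SKETCH_MAX_KEY_OBJECTID)
		return "not a save link";
	return k->type == SKETCH_INDIRECT ? "complete truncate" : "complete unlink";
}

int main(void)
{
	struct sketch_key u = unlink_save_link(1234, 4096);
	struct sketch_key t = truncate_save_link(1234);

	printf("unlink link:   offset %llu -> %s\n",
	       (unsigned long long)u.offset, pending_work(&u));
	printf("truncate link: offset %llu -> %s\n",
	       (unsigned long long)t.offset, pending_work(&t));
	return 0;
}
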
retval = remove_save_link_only(s, &save_link_key, 1); + continue; + } + + if (!truncate && inode->i_nlink) { + /* file is not unlinked */ + reiserfs_warning(s, "vs-2185", + "file %K is not unlinked", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 0); + continue; + } + dquot_initialize(inode); + + if (truncate && S_ISDIR(inode->i_mode)) { + /* We got a truncate request for a dir which is impossible. + The only imaginable way is to execute unfinished truncate request + then boot into old kernel, remove the file and create dir with + the same key. */ + reiserfs_warning(s, "green-2101", + "impossible truncate on a " + "directory %k. Please report", + INODE_PKEY(inode)); + retval = remove_save_link_only(s, &save_link_key, 0); + truncate = 0; + iput(inode); + continue; + } + + if (truncate) { + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + /* not completed truncate found. New size was committed together + with "save" link */ + reiserfs_info(s, "Truncating %k to %Ld ..", + INODE_PKEY(inode), inode->i_size); + reiserfs_truncate_file(inode, + 0 + /*don't update modification time */ + ); + retval = remove_save_link(inode, truncate); + } else { + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + /* not completed unlink (rmdir) found */ + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode)); + if (memcmp(&last_inode_key, INODE_PKEY(inode), + sizeof(last_inode_key))){ + last_inode_key = *INODE_PKEY(inode); + /* removal gets completed in iput */ + retval = 0; + } else { + reiserfs_warning(s, "super-2189", "Dead loop " + "in finish_unfinished " + "detected, just remove " + "save link\n"); + retval = remove_save_link_only(s, + &save_link_key, 0); + } + } + + iput(inode); + printk("done\n"); + done++; + } + REISERFS_SB(s)->s_is_unlinked_ok = 0; + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(s)->files[i] && quota_enabled[i]) + dquot_quota_off(s, i); + } + if (ms_active_set) + /* Restore the flag back */ + s->s_flags &= ~MS_ACTIVE; +#endif + pathrelse(&path); + if (done) + reiserfs_info(s, "There were %d uncompleted unlinks/truncates. " + "Completed\n", done); + return retval; +} + +/* to protect file being unlinked from getting lost we "safe" link files + being unlinked. This link will be deleted in the same transaction with last + item of file. 
mounting the filesystem we scan all these links and remove + files which almost got lost */ +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate) +{ + INITIALIZE_PATH(path); + int retval; + struct cpu_key key; + struct item_head ih; + __le32 link; + + BUG_ON(!th->t_trans_id); + + /* file can only get one "save link" of each kind */ + RFALSE(truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask), + "saved link already exists for truncated inode %lx", + (long)inode->i_ino); + RFALSE(!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask), + "saved link already exists for unlinked inode %lx", + (long)inode->i_ino); + + /* setup key of "save" link */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; + key.on_disk_key.k_objectid = inode->i_ino; + if (!truncate) { + /* unlink, rmdir, rename */ + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize); + set_cpu_key_k_type(&key, TYPE_DIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, + 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, + 4 /*length */ , 0xffff /*free space */ ); + } else { + /* truncate */ + if (S_ISDIR(inode->i_mode)) + reiserfs_warning(inode->i_sb, "green-2102", + "Adding a truncate savelink for " + "a directory %k! Please report", + INODE_PKEY(inode)); + set_cpu_key_k_offset(&key, 1); + set_cpu_key_k_type(&key, TYPE_INDIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT, + 4 /*length */ , 0 /*free space */ ); + } + key.key_length = 3; + + /* look for its place in the tree */ + retval = search_item(inode->i_sb, &key, &path); + if (retval != ITEM_NOT_FOUND) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2100", + "search_by_key (%K) returned %d", &key, + retval); + pathrelse(&path); + return; + } + + /* body of "save" link */ + link = INODE_PKEY(inode)->k_dir_id; + + /* put "save" link into tree, don't charge quota to anyone */ + retval = + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2120", + "insert_item returned %d", retval); + } else { + if (truncate) + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + else + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + } +} + +/* this opens transaction unlike add_save_link */ +int remove_save_link(struct inode *inode, int truncate) +{ + struct reiserfs_transaction_handle th; + struct reiserfs_key key; + int err; + + /* we are going to do one balancing only */ + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + /* setup key of "save" link */ + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID); + key.k_objectid = INODE_PKEY(inode)->k_objectid; + if (!truncate) { + /* unlink, rmdir, rename */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, + 1 + inode->i_sb->s_blocksize); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT); + } else { + /* truncate */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT); + } + + if ((truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) || + (!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask))) + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item(&th, NULL, &key); + if (!truncate) { + reiserfs_release_objectid(&th, inode->i_ino); + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask; + } else + 
REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; + + return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); +} + +static void reiserfs_kill_sb(struct super_block *s) +{ + if (REISERFS_SB(s)) { + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; + } + + kill_block_super(s); +} + +static void reiserfs_put_super(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + reiserfs_write_lock(s); + + if (s->s_dirt) + reiserfs_write_super(s); + + /* change file system state to current state if it was mounted with read-write permissions */ + if (!(s->s_flags & MS_RDONLY)) { + if (!journal_begin(&th, s, 10)) { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), + REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + } + } + + /* note, journal_release checks for readonly mount, and can decide not + ** to do a journal_end + */ + journal_release(&th, s); + + reiserfs_free_bitmap_cache(s); + + brelse(SB_BUFFER_WITH_SB(s)); + + print_statistics(s); + + if (REISERFS_SB(s)->reserved_blocks != 0) { + reiserfs_warning(s, "green-2005", "reserved blocks left %d", + REISERFS_SB(s)->reserved_blocks); + } + + reiserfs_proc_info_done(s); + + reiserfs_write_unlock(s); + mutex_destroy(&REISERFS_SB(s)->lock); + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +static struct kmem_cache *reiserfs_inode_cachep; + +static struct inode *reiserfs_alloc_inode(struct super_block *sb) +{ + struct reiserfs_inode_info *ei; + ei = (struct reiserfs_inode_info *) + kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + atomic_set(&ei->openers, 0); + mutex_init(&ei->tailpack); + return &ei->vfs_inode; +} + +static void reiserfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); +} + +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} + +static void init_once(void *foo) +{ + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; + + INIT_LIST_HEAD(&ei->i_prealloc_list); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", + sizeof(struct + reiserfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (reiserfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(reiserfs_inode_cachep); +} + +/* we don't mark inodes dirty, we just log them */ +static void reiserfs_dirty_inode(struct inode *inode, int flags) +{ + struct reiserfs_transaction_handle th; + + int err = 0; + int lock_depth; + + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning(inode->i_sb, "clm-6006", + "writing inode %lu on readonly FS", + inode->i_ino); + return; + } + lock_depth 
= reiserfs_write_lock_once(inode->i_sb); + + /* this is really only used for atime updates, so they don't have + ** to be included in O_SYNC or fsync + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) + goto out; + + reiserfs_update_sd(&th, inode); + journal_end(&th, inode->i_sb, 1); + +out: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); +} + +#ifdef CONFIG_QUOTA +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, + size_t, loff_t); +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, + loff_t); +#endif + +static const struct super_operations reiserfs_sops = { + .alloc_inode = reiserfs_alloc_inode, + .destroy_inode = reiserfs_destroy_inode, + .write_inode = reiserfs_write_inode, + .dirty_inode = reiserfs_dirty_inode, + .evict_inode = reiserfs_evict_inode, + .put_super = reiserfs_put_super, + .write_super = reiserfs_write_super, + .sync_fs = reiserfs_sync_fs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, + .statfs = reiserfs_statfs, + .remount_fs = reiserfs_remount, + .show_options = generic_show_options, +#ifdef CONFIG_QUOTA + .quota_read = reiserfs_quota_read, + .quota_write = reiserfs_quota_write, +#endif +}; + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") + +static int reiserfs_write_dquot(struct dquot *); +static int reiserfs_acquire_dquot(struct dquot *); +static int reiserfs_release_dquot(struct dquot *); +static int reiserfs_mark_dquot_dirty(struct dquot *); +static int reiserfs_write_info(struct super_block *, int); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); + +static const struct dquot_operations reiserfs_quota_operations = { + .write_dquot = reiserfs_write_dquot, + .acquire_dquot = reiserfs_acquire_dquot, + .release_dquot = reiserfs_release_dquot, + .mark_dirty = reiserfs_mark_dquot_dirty, + .write_info = reiserfs_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops reiserfs_qctl_operations = { + .quota_on = reiserfs_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#endif + +static const struct export_operations reiserfs_export_ops = { + .encode_fh = reiserfs_encode_fh, + .fh_to_dentry = reiserfs_fh_to_dentry, + .fh_to_parent = reiserfs_fh_to_parent, + .get_parent = reiserfs_get_parent, +}; + +/* this struct is used in reiserfs_getopt () for containing the value for those + mount options that have values rather than being toggles. */ +typedef struct { + char *value; + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. */ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ +} arg_desc_t; + +/* Set this bit in arg_required to allow empty arguments */ +#define REISERFS_OPT_ALLOWEMPTY 31 + +/* this struct is used in reiserfs_getopt() for describing the set of reiserfs + mount options */ +typedef struct { + char *option_name; + int arg_required; /* 0 if argument is not required, not 0 otherwise */ + const arg_desc_t *values; /* list of values accepted by an option */ + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. 
*/ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ +} opt_desc_t; + +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1 << REISERFS_DATA_ORDERED, + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)}, + {"journal", 1 << REISERFS_DATA_LOG, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)}, + {"writeback", 1 << REISERFS_DATA_WRITEBACK, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)}, + {.value = NULL} +}; + +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH}, + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE}, + {.value = NULL} +}; + +/* possible values for "-o block-allocator=" and bits which are to be set in + s_mount_opt of reiserfs specific part of in-core super block */ +static const arg_desc_t balloc[] = { + {"noborder", 1 << REISERFS_NO_BORDER, 0}, + {"border", 0, 1 << REISERFS_NO_BORDER}, + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0}, + {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0}, + {"test4", 1 << REISERFS_TEST4, 0}, + {"notest4", 0, 1 << REISERFS_TEST4}, + {NULL, 0, 0} +}; + +static const arg_desc_t tails[] = { + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL}, + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL}, + {NULL, 0, 0} +}; + +static const arg_desc_t error_actions[] = { + {"panic", 1 << REISERFS_ERROR_PANIC, + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, + {"ro-remount", 1 << REISERFS_ERROR_RO, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG + {"continue", 1 << REISERFS_ERROR_CONTINUE, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, +#endif + {NULL, 0, 0}, +}; + +/* proceed only one option from a list *cur - string containing of mount options + opts - array of options which are accepted + opt_arg - if option is found and requires an argument and if it is specifed + in the input - pointer to the argument is stored here + bit_flags - if option requires to set a certain bit - it is set here + return -1 if unknown option is found, opt->arg_required otherwise */ +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, + char **opt_arg, unsigned long *bit_flags) +{ + char *p; + /* foo=bar, + ^ ^ ^ + | | +-- option_end + | +-- arg_start + +-- option_start + */ + const opt_desc_t *opt; + const arg_desc_t *arg; + + p = *cur; + + /* assume argument cannot contain commas */ + *cur = strchr(p, ','); + if (*cur) { + *(*cur) = '\0'; + (*cur)++; + } + + if (!strncmp(p, "alloc=", 6)) { + /* Ugly special case, probably we should redo options parser so that + it can understand several arguments for some options, also so that + it can fill several bitfields with option values. 
*/ + if (reiserfs_parse_alloc_options(s, p + 6)) { + return -1; + } else { + return 0; + } + } + + /* for every option in the list */ + for (opt = opts; opt->option_name; opt++) { + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) { + if (bit_flags) { + if (opt->clrmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6500", + "%s not supported.\n", + p); + else + *bit_flags &= ~opt->clrmask; + if (opt->setmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6501", + "%s not supported.\n", + p); + else + *bit_flags |= opt->setmask; + } + break; + } + } + if (!opt->option_name) { + reiserfs_warning(s, "super-6502", + "unknown mount option \"%s\"", p); + return -1; + } + + p += strlen(opt->option_name); + switch (*p) { + case '=': + if (!opt->arg_required) { + reiserfs_warning(s, "super-6503", + "the option \"%s\" does not " + "require an argument\n", + opt->option_name); + return -1; + } + break; + + case 0: + if (opt->arg_required) { + reiserfs_warning(s, "super-6504", + "the option \"%s\" requires an " + "argument\n", opt->option_name); + return -1; + } + break; + default: + reiserfs_warning(s, "super-6505", + "head of option \"%s\" is only correct\n", + opt->option_name); + return -1; + } + + /* move to the argument, or to next option if argument is not required */ + p++; + + if (opt->arg_required + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) + && !strlen(p)) { + /* this catches "option=," if not allowed */ + reiserfs_warning(s, "super-6506", + "empty argument for \"%s\"\n", + opt->option_name); + return -1; + } + + if (!opt->values) { + /* *=NULLopt_arg contains pointer to argument */ + *opt_arg = p; + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY); + } + + /* values possible for this option are listed in opt->values */ + for (arg = opt->values; arg->value; arg++) { + if (!strcmp(p, arg->value)) { + if (bit_flags) { + *bit_flags &= ~arg->clrmask; + *bit_flags |= arg->setmask; + } + return opt->arg_required; + } + } + + reiserfs_warning(s, "super-6506", + "bad value \"%s\" for option \"%s\"\n", p, + opt->option_name); + return -1; +} + +/* returns 0 if something is wrong in option string, 1 - otherwise */ +static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */ + unsigned long *mount_options, + /* after the parsing phase, contains the + collection of bitflags defining what + mount options were selected. 
*/ + unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ + char **jdev_name, + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) +{ + int c; + char *arg = NULL; + char *pos; + opt_desc_t opts[] = { + /* Compatibility stuff, so that -o notail for old setups still work */ + {"tails",.arg_required = 't',.values = tails}, + {"notail",.clrmask = + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"conv",.setmask = 1 << REISERFS_CONVERT}, + {"attrs",.setmask = 1 << REISERFS_ATTRS}, + {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, +#ifdef CONFIG_REISERFS_FS_XATTR + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, +#else + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, +#else + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif + {.option_name = "nolog"}, + {"replayonly",.setmask = 1 << REPLAYONLY}, + {"block-allocator",.arg_required = 'a',.values = balloc}, + {"data",.arg_required = 'd',.values = logging_mode}, + {"barrier",.arg_required = 'b',.values = barrier_mode}, + {"resize",.arg_required = 'r',.values = NULL}, + {"jdev",.arg_required = 'j',.values = NULL}, + {"nolargeio",.arg_required = 'w',.values = NULL}, + {"commit",.arg_required = 'c',.values = NULL}, + {"usrquota",.setmask = 1 << REISERFS_QUOTA}, + {"grpquota",.setmask = 1 << REISERFS_QUOTA}, + {"noquota",.clrmask = 1 << REISERFS_QUOTA}, + {"errors",.arg_required = 'e',.values = error_actions}, + {"usrjquota",.arg_required = + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"grpjquota",.arg_required = + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"jqfmt",.arg_required = 'f',.values = NULL}, + {.option_name = NULL} + }; + + *blocks = 0; + if (!options || !*options) + /* use default configuration: create tails, journaling on, no + conversion to newest format */ + return 1; + + for (pos = options; pos;) { + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options); + if (c == -1) + /* wrong option is given */ + return 0; + + if (c == 'r') { + char *p; + + p = NULL; + /* "resize=NNN" or "resize=auto" */ + + if (!strcmp(arg, "auto")) { + /* From JFS code, to auto-get the size. */ + *blocks = + s->s_bdev->bd_inode->i_size >> s-> + s_blocksize_bits; + } else { + *blocks = simple_strtoul(arg, &p, 0); + if (*p != '\0') { + /* NNN does not look like a number */ + reiserfs_warning(s, "super-6507", + "bad value %s for " + "-oresize\n", arg); + return 0; + } + } + } + + if (c == 'c') { + char *p = NULL; + unsigned long val = simple_strtoul(arg, &p, 0); + /* commit=NNN (time in seconds) */ + if (*p != '\0' || val >= (unsigned int)-1) { + reiserfs_warning(s, "super-6508", + "bad value %s for -ocommit\n", + arg); + return 0; + } + *commit_max_age = (unsigned int)val; + } + + if (c == 'w') { + reiserfs_warning(s, "super-6509", "nolargeio option " + "is no longer supported"); + return 0; + } + + if (c == 'j') { + if (arg && *arg && jdev_name) { + if (*jdev_name) { //Hm, already assigned? 
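For orientation, reiserfs_getopt() above works token by token: it splits the option string at the next comma, looks the token up in the option table, and folds the entry's clear mask and then its set mask into the mount bitmask. The following stand-alone sketch shows just that core loop with invented option names and bit values, and deliberately leaves out the "=argument" handling the real parser performs:

#include <stdio.h>
#include <string.h>

struct sketch_opt {
	const char *name;
	unsigned long setmask;
	unsigned long clrmask;
};

/* illustrative table; names and bits are made up */
static const struct sketch_opt sketch_opts[] = {
	{ "tails_on",  0x1, 0x2 },
	{ "tails_off", 0x0, 0x3 },
	{ "acl",       0x4, 0x0 },
	{ "noacl",     0x0, 0x4 },
	{ NULL, 0, 0 }
};

/* parse "opt1,opt2,..." into a bitmask; returns 0 on success, -1 on an
   unknown option, mirroring the -1 convention of the real parser */
static int sketch_parse(char *options, unsigned long *bits)
{
	char *cur = options;

	while (cur && *cur) {
		char *next = strchr(cur, ',');
		const struct sketch_opt *o;

		if (next)
			*next++ = '\0';		/* terminate this token, advance */

		for (o = sketch_opts; o->name; o++)
			if (strcmp(cur, o->name) == 0)
				break;
		if (!o->name) {
			fprintf(stderr, "unknown mount option \"%s\"\n", cur);
			return -1;
		}
		*bits &= ~o->clrmask;		/* clear mask first ... */
		*bits |= o->setmask;		/* ... then set mask */
		cur = next;
	}
	return 0;
}

int main(void)
{
	char opts[] = "acl,tails_off,noacl";
	unsigned long bits = 0;

	if (sketch_parse(opts, &bits) == 0)
		printf("resulting mount bits: 0x%lx\n", bits);
	return 0;
}
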
+ reiserfs_warning(s, "super-6510", + "journal device was " + "already specified to " + "be %s", *jdev_name); + return 0; + } + *jdev_name = arg; + } + } +#ifdef CONFIG_QUOTA + if (c == 'u' || c == 'g') { + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; + + if (sb_any_quota_loaded(s) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { + reiserfs_warning(s, "super-6511", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + if (*arg) { /* Some filename specified? */ + if (REISERFS_SB(s)->s_qf_names[qtype] + && strcmp(REISERFS_SB(s)->s_qf_names[qtype], + arg)) { + reiserfs_warning(s, "super-6512", + "%s quota file " + "already specified.", + QTYPE2NAME(qtype)); + return 0; + } + if (strchr(arg, '/')) { + reiserfs_warning(s, "super-6513", + "quotafile must be " + "on filesystem root."); + return 0; + } + qf_names[qtype] = + kmalloc(strlen(arg) + 1, GFP_KERNEL); + if (!qf_names[qtype]) { + reiserfs_warning(s, "reiserfs-2502", + "not enough memory " + "for storing " + "quotafile name."); + return 0; + } + strcpy(qf_names[qtype], arg); + *mount_options |= 1 << REISERFS_QUOTA; + } else { + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; + } + } + if (c == 'f') { + if (!strcmp(arg, "vfsold")) + *qfmt = QFMT_VFS_OLD; + else if (!strcmp(arg, "vfsv0")) + *qfmt = QFMT_VFS_V0; + else { + reiserfs_warning(s, "super-6514", + "unknown quota format " + "specified."); + return 0; + } + if (sb_any_quota_loaded(s) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, "super-6515", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + } +#else + if (c == 'u' || c == 'g' || c == 'f') { + reiserfs_warning(s, "reiserfs-2503", "journaled " + "quota options not supported."); + return 0; + } +#endif + } + +#ifdef CONFIG_QUOTA + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { + reiserfs_warning(s, "super-6515", + "journaled quota format not specified."); + return 0; + } + /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ + if (!(*mount_options & (1 << REISERFS_QUOTA)) + && sb_any_quota_loaded(s)) { + reiserfs_warning(s, "super-6516", "quota options must " + "be present when quota is turned on."); + return 0; + } +#endif + + return 1; +} + +static void switch_data_mode(struct super_block *s, unsigned long mode) +{ + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + reiserfs_info(s, "switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + reiserfs_info(s, "switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + reiserfs_info(s, "switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) +{ + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 << REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if 
(bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + +static void handle_attrs(struct super_block *s) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + + if (reiserfs_attrs(s)) { + if (old_format_only(s)) { + reiserfs_warning(s, "super-6517", "cannot support " + "attributes on 3.5.x disk format"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + return; + } + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { + reiserfs_warning(s, "super-6518", "cannot support " + "attributes until flag is set in " + "super-block"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + } + } +} + +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) +{ + struct reiserfs_super_block *rs; + struct reiserfs_transaction_handle th; + unsigned long blocks; + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; + unsigned long safe_mask = 0; + unsigned int commit_max_age = (unsigned int)-1; + struct reiserfs_journal *journal = SB_JOURNAL(s); + char *new_opts = kstrdup(arg, GFP_KERNEL); + int err; + char *qf_names[MAXQUOTAS]; + unsigned int qfmt = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + + reiserfs_write_lock(s); + +#ifdef CONFIG_QUOTA + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); +#endif + + rs = SB_DISK_SUPER_BLOCK(s); + + if (!reiserfs_parse_options + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); +#endif + err = -EINVAL; + goto out_err; + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + handle_attrs(s); + + /* Add options that are safe here */ + safe_mask |= 1 << REISERFS_SMALLTAIL; + safe_mask |= 1 << REISERFS_LARGETAIL; + safe_mask |= 1 << REISERFS_NO_BORDER; + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; + safe_mask |= 1 << REISERFS_HASHED_RELOCATION; + safe_mask |= 1 << REISERFS_TEST4; + safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; + safe_mask |= 1 << REISERFS_ERROR_RO; + safe_mask |= 1 << REISERFS_ERROR_CONTINUE; + safe_mask |= 1 << REISERFS_ERROR_PANIC; + safe_mask |= 1 << REISERFS_QUOTA; + + /* Update the bitmask, taking care to keep + * the bits we're not allowed to change here */ + REISERFS_SB(s)->s_mount_opt = + (REISERFS_SB(s)-> + s_mount_opt & ~safe_mask) | (mount_options & safe_mask); + + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } else if (commit_max_age == 0) { + /* 0 means restore defaults. 
*/ + journal->j_max_commit_age = journal->j_default_max_commit_age; + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + } + + if (blocks) { + err = reiserfs_resize(s, blocks); + if (err != 0) + goto out_err; + } + + if (*mount_flags & MS_RDONLY) { + reiserfs_xattr_init(s, *mount_flags); + /* remount read-only */ + if (s->s_flags & MS_RDONLY) + /* it is read-only already */ + goto out_ok; + + err = dquot_suspend(s, -1); + if (err < 0) + goto out_err; + + /* try to remount file system with read-only permissions */ + if (sb_umount_state(rs) == REISERFS_VALID_FS + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { + goto out_ok; + } + + err = journal_begin(&th, s, 10); + if (err) + goto out_err; + + /* Mounting a rw partition read-only. */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + } else { + /* remount read-write */ + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_xattr_init(s, *mount_flags); + goto out_ok; /* We are read-write already */ + } + + if (reiserfs_is_journal_aborted(journal)) { + err = journal->j_errno; + goto out_err; + } + + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ + err = journal_begin(&th, s, 10); + if (err) + goto out_err; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; + set_sb_umount_state(rs, REISERFS_ERROR_FS); + if (!old_format_only(s)) + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1; + err = journal_end(&th, s, 10); + if (err) + goto out_err; + s->s_dirt = 0; + + if (!(*mount_flags & MS_RDONLY)) { + dquot_resume(s, -1); + finish_unfinished(s); + reiserfs_xattr_init(s, *mount_flags); + } + +out_ok: + replace_mount_options(s, new_opts); + reiserfs_write_unlock(s); + return 0; + +out_err: + kfree(new_opts); + reiserfs_write_unlock(s); + return err; +} + +static int read_super_block(struct super_block *s, int offset) +{ + struct buffer_head *bh; + struct reiserfs_super_block *rs; + int fs_blocksize; + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2006", + "bread failed (dev %s, block %lu, size %lu)", + reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_any_reiserfs_magic_string(rs)) { + brelse(bh); + return 1; + } + // + // ok, reiserfs signature (old or new) found in at the given offset + // + fs_blocksize = sb_blocksize(rs); + brelse(bh); + sb_set_blocksize(s, fs_blocksize); + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2007", + "bread failed (dev %s, block %lu, size %lu)", + reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (sb_blocksize(rs) != s->s_blocksize) { + reiserfs_warning(s, "sh-2011", "can't find a reiserfs " + "filesystem on (dev %s, block %Lu, size %lu)", + reiserfs_bdevname(s), + (unsigned long long)bh->b_blocknr, 
+ s->s_blocksize); + brelse(bh); + return 1; + } + + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { + brelse(bh); + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck " + "--rebuild-tree run detected. Please run\n" + "reiserfsck --rebuild-tree and wait for a " + "completion. If that fails\n" + "get newer reiserfsprogs package"); + return 1; + } + + SB_BUFFER_WITH_SB(s) = bh; + SB_DISK_SUPER_BLOCK(s) = rs; + + if (is_reiserfs_jr(rs)) { + /* magic is of non-standard journal filesystem, look at s_version to + find which format is in use */ + if (sb_version(rs) == REISERFS_VERSION_2) + reiserfs_info(s, "found reiserfs format \"3.6\"" + " with non-standard journal\n"); + else if (sb_version(rs) == REISERFS_VERSION_1) + reiserfs_info(s, "found reiserfs format \"3.5\"" + " with non-standard journal\n"); + else { + reiserfs_warning(s, "sh-2012", "found unknown " + "format \"%u\" of reiserfs with " + "non-standard magic", sb_version(rs)); + return 1; + } + } else + /* s_version of standard format may contain incorrect information, + so we just look at the magic string */ + reiserfs_info(s, + "found reiserfs format \"%s\" with standard journal\n", + is_reiserfs_3_5(rs) ? "3.5" : "3.6"); + + s->s_op = &reiserfs_sops; + s->s_export_op = &reiserfs_export_ops; +#ifdef CONFIG_QUOTA + s->s_qcop = &reiserfs_qctl_operations; + s->dq_op = &reiserfs_quota_operations; +#endif + + /* new format is limited by the 32 bit wide i_blocks field, want to + ** be one full block below that. + */ + s->s_maxbytes = (512LL << 32) - s->s_blocksize; + return 0; +} + +/* after journal replay, reread all bitmap and super blocks */ +static int reread_meta_blocks(struct super_block *s) +{ + ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + reiserfs_write_unlock(s); + wait_on_buffer(SB_BUFFER_WITH_SB(s)); + reiserfs_write_lock(s); + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + reiserfs_warning(s, "reiserfs-2504", "error reading the super"); + return 1; + } + + return 0; +} + +///////////////////////////////////////////////////// +// hash detection stuff + +// if root directory is empty - we set default - Yura's - hash and +// warn about it +// FIXME: we look for only one name in a directory. If tea and yura +// bith have the same value - we ask user to send report to the +// mailing list +static __u32 find_hash_out(struct super_block *s) +{ + int retval; + struct inode *inode; + struct cpu_key key; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + __u32 hash = DEFAULT_HASH; + + inode = s->s_root->d_inode; + + do { // Some serious "goto"-hater was there ;) + u32 teahash, r5hash, yurahash; + + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + set_de_name_and_namelen(&de); + if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) { + hash = YURA_HASH; + } + reiserfs_info(s, "FS seems to be empty, autodetect " + "is using the default hash\n"); + break; + } + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + if (((teahash == r5hash) + && + (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) + == r5hash)) || ((teahash == yurahash) + && (yurahash == + GET_HASH_VALUE(deh_offset + (& + (de. + de_deh[de. 
+ de_entry_num]))))) + || ((r5hash == yurahash) + && (yurahash == + GET_HASH_VALUE(deh_offset + (&(de.de_deh[de.de_entry_num])))))) { + reiserfs_warning(s, "reiserfs-2506", "Unable to " + "automatically detect hash function. " + "Please mount with -o " + "hash={tea,rupasov,r5}"); + hash = UNSET_HASH; + break; + } + if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) == + yurahash) + hash = YURA_HASH; + else if (GET_HASH_VALUE + (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash) + hash = TEA_HASH; + else if (GET_HASH_VALUE + (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); + hash = UNSET_HASH; + } + } while (0); + + pathrelse(&path); + return hash; +} + +// finds out which hash names are sorted with +static int what_hash(struct super_block *s) +{ + __u32 code; + + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); + + /* reiserfs_hash_detect() == true if any of the hash mount options + ** were used. We must check them to make sure the user isn't + ** using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out(s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* detection has found the hash, and we must check against the + ** mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + reiserfs_warning(s, "reiserfs-2507", + "Error, %s hash detected, " + "unable to force rupasov hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + reiserfs_warning(s, "reiserfs-2508", + "Error, %s hash detected, " + "unable to force tea hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + reiserfs_warning(s, "reiserfs-2509", + "Error, %s hash detected, " + "unable to force r5 hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } + } else { + /* find_hash_out was not called or could not determine the hash */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH; + } + } + + /* if we are mounted RW, and we have a new valid hash code, update + ** the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); + } + return code; +} + +// return pointer to appropriate function +static hashf_t hash_function(struct super_block *s) +{ + switch (what_hash(s)) { + case TEA_HASH: + reiserfs_info(s, "Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_info(s, "Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_info(s, "Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; +} + +// this is used to set up correct value for old partitions +static int function2code(hashf_t func) +{ + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; + + BUG(); // should never happen + + return 0; +} + +#define SWARN(silent, s, id, ...) 
\ + if (!(silent)) \ + reiserfs_warning(s, id, __VA_ARGS__) + +static int reiserfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct reiserfs_transaction_handle th; + int old_format = 0; + unsigned long blocks; + unsigned int commit_max_age = 0; + int jinit_done = 0; + struct reiserfs_iget_args args; + struct reiserfs_super_block *rs; + char *jdev_name; + struct reiserfs_sb_info *sbi; + int errval = -EINVAL; + char *qf_names[MAXQUOTAS] = {}; + unsigned int qfmt = 0; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + /* Set default values for options: non-aggressive tails, RO on errors */ + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); + /* no preallocation minimum, be smart in + reiserfs_file_write instead */ + REISERFS_SB(s)->s_alloc_options.preallocmin = 0; + /* Preallocate by 16 blocks (17-1) at once */ + REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + /* setup default block allocator options */ + reiserfs_init_alloc_options(s); + + mutex_init(&REISERFS_SB(s)->lock); + REISERFS_SB(s)->lock_depth = -1; + + /* + * This function is called with the bkl, which also was the old + * locking used here. + * do_journal_begin() will soon check if we hold the lock (ie: was the + * bkl). This is likely because do_journal_begin() has several another + * callers because at this time, it doesn't seem to be necessary to + * protect against anything. + * Anyway, let's be conservative and lock for now. + */ + reiserfs_write_lock(s); + + jdev_name = NULL; + if (reiserfs_parse_options + (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, + &commit_max_age, qf_names, &qfmt) == 0) { + goto error; + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + if (blocks) { + SWARN(silent, s, "jmacd-7", "resize option for remount only"); + goto error; + } + + /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) + old_format = 1; + /* try new format (64-th 1k block), which can contain reiserfs super block */ + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", + reiserfs_bdevname(s)); + goto error; + } + + rs = SB_DISK_SUPER_BLOCK(s); + /* Let's do basic sanity check to verify that underlying device is not + smaller than the filesystem. If the check fails then abort and scream, + because bad stuff will happen otherwise. 
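The check that follows compares the size the superblock claims (block count times block size) against the size of the backing device. A tiny stand-alone sketch of the same comparison, with made-up numbers and the arithmetic done explicitly in 64 bits so a large block count cannot overflow:

#include <stdio.h>
#include <stdint.h>

/* does the device have room for everything the superblock describes? */
static int device_big_enough(uint64_t device_bytes,
			     uint32_t fs_blocks, uint32_t block_size)
{
	uint64_t fs_bytes = (uint64_t)fs_blocks * block_size;

	return device_bytes >= fs_bytes;
}

int main(void)
{
	/* e.g. a 4 GiB filesystem (1M blocks of 4 KiB) on a 2 GiB device */
	if (!device_big_enough(2ULL << 30, 1 << 20, 4096))
		fprintf(stderr, "filesystem is bigger than the device, refusing\n");
	return 0;
}
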
*/ + if (s->s_bdev && s->s_bdev->bd_inode + && i_size_read(s->s_bdev->bd_inode) < + sb_block_count(rs) * sb_blocksize(rs)) { + SWARN(silent, s, "", "Filesystem cannot be " + "mounted because it is bigger than the device"); + SWARN(silent, s, "", "You may need to run fsck " + "or increase size of your LVM partition"); + SWARN(silent, s, "", "Or may be you forgot to " + "reboot after fdisk when it told you to"); + goto error; + } + + sbi->s_mount_state = SB_REISERFS_STATE(s); + sbi->s_mount_state = REISERFS_VALID_FS; + + if ((errval = reiserfs_init_bitmap_cache(s))) { + SWARN(silent, s, "jmacd-8", "unable to read bitmap"); + goto error; + } + errval = -EINVAL; +#ifdef CONFIG_REISERFS_CHECK + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); + SWARN(silent, s, "", "- it is slow mode for debugging."); +#endif + + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) { + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + reiserfs_info(s, "using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + reiserfs_info(s, "using ordered data mode\n"); + } else { + reiserfs_info(s, "using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error; + } else { + jinit_done = 1; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + if (reread_meta_blocks(s)) { + SWARN(silent, s, "jmacd-9", + "unable to reread meta blocks after journal init"); + goto error; + } + + if (replay_only(s)) + goto error; + + if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + SWARN(silent, s, "clm-7000", + "Detected readonly device, marking FS readonly"); + s->s_flags |= MS_RDONLY; + } + args.objectid = REISERFS_ROOT_OBJECTID; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID; + root_inode = + iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, + reiserfs_init_locked_inode, (void *)(&args)); + if (!root_inode) { + SWARN(silent, s, "jmacd-10", "get root inode failed"); + goto error; + } + + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) { + iput(root_inode); + goto error; + } + // define and initialize hash function + sbi->s_hash_function = hash_function(s); + if (sbi->s_hash_function == NULL) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if (is_reiserfs_3_5(rs) + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) + set_bit(REISERFS_3_5, &(sbi->s_properties)); + else if (old_format) + set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties)); + else + set_bit(REISERFS_3_6, &(sbi->s_properties)); + + if (!(s->s_flags & MS_RDONLY)) { + + errval = journal_begin(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + + set_sb_umount_state(rs, REISERFS_ERROR_FS); + set_sb_fs_state(rs, 0); + + /* Clear out s_bmap_nr if it would wrap. We can handle this + * case, but older revisions can't. This will cause the + * file system to fail mount on those older implementations, + * avoiding corruption. 
-jeffm */ + if (bmap_would_wrap(reiserfs_bmap_count(s)) && + sb_bmap_nr(rs) != 0) { + reiserfs_warning(s, "super-2030", "This file system " + "claims to use %u bitmap blocks in " + "its super block, but requires %u. " + "Clearing to zero.", sb_bmap_nr(rs), + reiserfs_bmap_count(s)); + + set_sb_bmap_nr(rs, 0); + } + + if (old_format_only(s)) { + /* filesystem of format 3.5 either with standard or non-standard + journal */ + if (convert_reiserfs(s)) { + /* and -o conv is given */ + if (!silent) + reiserfs_info(s, + "converting 3.5 filesystem to the 3.6 format"); + + if (is_reiserfs_3_5(rs)) + /* put magic string of 3.6 format. 2.2 will not be able to + mount this filesystem anymore */ + memcpy(rs->s_v1.s_magic, + reiserfs_3_6_magic_string, + sizeof + (reiserfs_3_6_magic_string)); + + set_sb_version(rs, REISERFS_VERSION_2); + reiserfs_convert_objectid_map_v1(s); + set_bit(REISERFS_3_6, &(sbi->s_properties)); + clear_bit(REISERFS_3_5, &(sbi->s_properties)); + } else if (!silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + } else + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + /* look for files which were to be removed in previous session */ + finish_unfinished(s); + } else { + if (old_format_only(s) && !silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + } + // mark hash in super block: it could be unset. overwrite should be ok + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); + + handle_attrs(s); + + reiserfs_proc_info_init(s); + + init_waitqueue_head(&(sbi->s_wait)); + spin_lock_init(&sbi->bitmap_lock); + + reiserfs_write_unlock(s); + + return (0); + +error: + if (jinit_done) { /* kill the commit thread, free journal ram */ + journal_release_error(NULL, s); + } + + reiserfs_write_unlock(s); + + reiserfs_free_bitmap_cache(s); + if (SB_BUFFER_WITH_SB(s)) + brelse(SB_BUFFER_WITH_SB(s)); +#ifdef CONFIG_QUOTA + { + int j; + for (j = 0; j < MAXQUOTAS; j++) + kfree(qf_names[j]); + } +#endif + kfree(sbi); + + s->s_fs_info = NULL; + return errval; +} + +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); + + buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); + buf->f_bfree = sb_free_blocks(rs); + buf->f_bavail = buf->f_bfree; + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; + buf->f_bsize = dentry->d_sb->s_blocksize; + /* changed to accommodate gcc folks. 
*/ + buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, + sizeof(rs->s_uuid)/2); + + return 0; +} + +#ifdef CONFIG_QUOTA +static int reiserfs_write_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_commit(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_acquire_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_acquire(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_release_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (ret) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + goto out; + } + ret = dquot_release(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return reiserfs_write_dquot(dquot); + } else + return dquot_mark_dquot_dirty(dquot); +} + +static int reiserfs_write_info(struct super_block *sb, int type) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + /* Data block + inode block */ + reiserfs_write_lock(sb); + ret = journal_begin(&th, sb, 2); + if (ret) + goto out; + ret = dquot_commit_info(sb, type); + err = journal_end(&th, sb, 2); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(sb); + return ret; +} + +/* + * Turn on quotas during mount time - we need to find the quota file and such... + */ +static int reiserfs_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + struct inode *inode; + struct reiserfs_transaction_handle th; + + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) { + err = -EXDEV; + goto out; + } + inode = path->dentry->d_inode; + /* We must not pack tails for quota files on reiserfs for quota IO to work */ + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { + err = reiserfs_unpack(inode, NULL); + if (err) { + reiserfs_warning(sb, "super-6520", + "Unpacking tail of quota file failed" + " (%d). 
Cannot turn on quotas.", err); + err = -EINVAL; + goto out; + } + mark_inode_dirty(inode); + } + /* Journaling quota? */ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + reiserfs_warning(sb, "super-6521", + "Quota file not on filesystem root. " + "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + goto out; + err = journal_end_sync(&th, sb, 1); + if (err) + goto out; + } + err = dquot_quota_on(sb, type, format_id, path); +out: + return err; +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + size_t toread; + struct buffer_head tmp_bh, *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = + sb->s_blocksize - offset < + toread ? sb->s_blocksize - offset : toread; + tmp_bh.b_state = 0; + /* Quota files are without tails so we can safely use this function */ + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, 0); + reiserfs_write_unlock(sb); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t reiserfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head tmp_bh, *bh; + + if (!current->journal_info) { + printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" + " cancelled because transaction is not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? 
+ sb->s_blocksize - offset : towrite; + tmp_bh.b_state = 0; + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data + offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + reiserfs_prepare_for_journal(sb, bh, 1); + journal_mark_dirty(current->journal_info, sb, bh); + if (!journal_quota) + reiserfs_add_ordered_list(inode, bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len - towrite) + i_size_write(inode, off + len - towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static struct dentry *get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); +} + +static int __init init_reiserfs_fs(void) +{ + int ret; + + if ((ret = init_inodecache())) { + return ret; + } + + reiserfs_proc_info_global_init(); + + ret = register_filesystem(&reiserfs_fs_type); + + if (ret == 0) { + return 0; + } + + reiserfs_proc_info_global_done(); + destroy_inodecache(); + + return ret; +} + +static void __exit exit_reiserfs_fs(void) +{ + reiserfs_proc_info_global_done(); + unregister_filesystem(&reiserfs_fs_type); + destroy_inodecache(); +} + +struct file_system_type reiserfs_fs_type = { + .owner = THIS_MODULE, + .name = "reiserfs", + .mount = get_super_block, + .kill_sb = reiserfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_DESCRIPTION("ReiserFS journaled filesystem"); +MODULE_AUTHOR("Hans Reiser "); +MODULE_LICENSE("GPL"); + +module_init(init_reiserfs_fs); +module_exit(exit_reiserfs_fs); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c new file mode 100644 index 00000000..d7f6e51b --- /dev/null +++ b/fs/reiserfs/tail_conversion.c @@ -0,0 +1,280 @@ +/* + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details + */ + +#include +#include +#include +#include + +/* access to tail : when one is going to read tail it must make sure, that is not running. + direct2indirect and indirect2direct can not run concurrently */ + +/* Converts direct items to an unformatted node. Panics if file has no + tail. -ENOSPC if no disk space for conversion */ +/* path points to first direct item of the file regarless of how many of + them are there */ +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, + struct treepath *path, struct buffer_head *unbh, + loff_t tail_offset) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *up_to_date_bh; + struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + unsigned long total_tail = 0; + struct cpu_key end_key; /* Key to search for the last byte of the + converted item. */ + struct item_head ind_ih; /* new indirect item to be inserted or + key of unfm pointer to be pasted */ + int blk_size, retval; /* returned value for reiserfs_insert_item and clones */ + unp_t unfm_ptr; /* Handle on an unformatted node + that will be inserted in the + tree. 
*/ + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_direct2indirect++; + + blk_size = sb->s_blocksize; + + /* and key to search for append or insert pointer to the new + unformatted node. */ + copy_item_head(&ind_ih, p_le_ih); + set_le_ih_k_offset(&ind_ih, tail_offset); + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + /* FIXME: we could avoid this */ + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { + reiserfs_error(sb, "PAP-14030", + "pasted or inserted byte exists in " + "the tree %K. Use fsck to repair.", &end_key); + pathrelse(path); + return -EIO; + } + + p_le_ih = PATH_PITEM_HEAD(path); + + unfm_ptr = cpu_to_le32(unbh->b_blocknr); + + if (is_statdata_le_ih(p_le_ih)) { + /* Insert new indirect item. */ + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ + put_ih_item_len(&ind_ih, UNFM_P_SIZE); + PATH_LAST_POSITION(path)++; + retval = + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, + (char *)&unfm_ptr); + } else { + /* Paste into last indirect item of an object. */ + retval = reiserfs_paste_into_item(th, path, &end_key, inode, + (char *)&unfm_ptr, + UNFM_P_SIZE); + } + if (retval) { + return retval; + } + // note: from here there are two keys which have matching first + // three key components. They only differ by the fourth one. + + /* Set the key to search for the direct items of the file */ + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, + 4); + + /* Move bytes from the direct items to the new unformatted node + and delete them. */ + while (1) { + int tail_size; + + /* end_key.k_offset is set so, that we will always have found + last item of the file */ + if (search_for_position_by_key(sb, &end_key, path) == + POSITION_FOUND) + reiserfs_panic(sb, "PAP-14050", + "direct item (%K) not found", &end_key); + p_le_ih = PATH_PITEM_HEAD(path); + RFALSE(!is_direct_le_ih(p_le_ih), + "vs-14055: direct item expected(%K), found %h", + &end_key, p_le_ih); + tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + + ih_item_len(p_le_ih) - 1; + + /* we only send the unbh pointer if the buffer is not up to date. + ** this avoids overwriting good data from writepage() with old data + ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. + */ + if (!unbh->b_page || buffer_uptodate(unbh) + || PageUptodate(unbh->b_page)) { + up_to_date_bh = NULL; + } else { + up_to_date_bh = unbh; + } + retval = reiserfs_delete_item(th, path, &end_key, inode, + up_to_date_bh); + + total_tail += retval; + if (tail_size == retval) + // done: file does not have direct items anymore + break; + + } + /* if we've copied bytes from disk into the page, we need to zero + ** out the unused part of the block (it was not up to date before) + */ + if (up_to_date_bh) { + unsigned pgoff = + (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0); + memset(kaddr + pgoff, 0, blk_size - total_tail); + kunmap_atomic(kaddr, KM_USER0); + } + + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + + return 0; +} + +/* stolen from fs/buffer.c */ +void reiserfs_unmap_buffer(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG(); + } + clear_buffer_dirty(bh); + /* Remove the buffer from whatever list it belongs to. 
We are mostly + interested in removing it from per-sb j_dirty_buffers list, to avoid + BUG() on attempt to write not mapped buffer */ + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { + struct inode *inode = bh->b_page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + spin_lock(&j->j_dirty_buffers_lock); + list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); + spin_unlock(&j->j_dirty_buffers_lock); + } + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + unlock_buffer(bh); +} + +/* this first locks inode (neither reads nor sync are permitted), + reads tail through page cache, insert direct item. When direct item + inserted successfully inode is left locked. Return value is always + what we expect from it (number of cut bytes). But when tail remains + in the unformatted node, we set mode to SKIP_BALANCING and unlock + inode */ +int indirect2direct(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *page, + struct treepath *path, /* path to the indirect item. */ + const struct cpu_key *item_key, /* Key to look for + * unformatted node + * pointer to be cut. */ + loff_t n_new_file_size, /* New file size. */ + char *mode) +{ + struct super_block *sb = inode->i_sb; + struct item_head s_ih; + unsigned long block_size = sb->s_blocksize; + char *tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_indirect2direct++; + + *mode = M_SKIP_BALANCING; + + /* store item head path points to. */ + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + + tail_len = (n_new_file_size & (block_size - 1)); + if (get_inode_sd_version(inode) == STAT_DATA_V2) + round_tail_len = ROUND_UP(tail_len); + else + round_tail_len = tail_len; + + pos = + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + pos1 = pos; + + // we are protected by i_mutex. The tail can not disapper, not + // append can be done either + // we are in truncate or packing tail in file_release + + tail = (char *)kmap(page); /* this can schedule */ + + if (path_changed(&s_ih, path)) { + /* re-search indirect item */ + if (search_for_position_by_key(sb, item_key, path) + == POSITION_NOT_FOUND) + reiserfs_panic(sb, "PAP-5520", + "item to be converted %K does not exist", + item_key); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); +#ifdef CONFIG_REISERFS_CHECK + pos = le_ih_k_offset(&s_ih) - 1 + + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + if (pos != pos1) + reiserfs_panic(sb, "vs-5530", "tail position " + "changed while we were reading it"); +#endif + } + + /* Set direct item header to insert. */ + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), + pos1 + 1, TYPE_DIRECT, round_tail_len, + 0xffff /*ih_free_space */ ); + + /* we want a pointer to the first byte of the tail in the page. + ** the page was locked and this part of the page was up to date when + ** indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); + + PATH_LAST_POSITION(path)++; + + key = *item_key; + set_cpu_key_k_type(&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if (reiserfs_insert_item(th, path, &key, &s_ih, inode, + tail ? tail : NULL) < 0) { + /* No disk memory. So we can not convert last unformatted node + to the direct item. 
In this case we used to adjust + indirect items's ih_free_space. Now ih_free_space is not + used, it would be ideal to write zeros to corresponding + unformatted node. For now i_size is considered as guard for + going out of file size */ + kunmap(page); + return block_size - round_tail_len; + } + kunmap(page); + + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, inode); + + // note: we have now the same as in above direct2indirect + // conversion: there are two keys which have matching first three + // key components. They only differ by the fouhth one. + + /* We have inserted new direct item and must remove last + unformatted node. */ + *mode = M_CUT; + + /* we store position of first direct item in the in-core inode */ + /* mark_file_with_tail (inode, pos1 + 1); */ + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; + + return block_size - round_tail_len; +} diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c new file mode 100644 index 00000000..d7808969 --- /dev/null +++ b/fs/reiserfs/xattr.c @@ -0,0 +1,1051 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in it's own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file. + * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel, in addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + * + * Locking works like so: + * Directory components (xattr root, xattr dir) are protectd by their i_mutex. + * The xattrs themselves are protected by the xattr_sem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + + +/* Helpers for inode ops. We do this so that we don't have all the VFS + * overhead and also for proper i_mutex annotation. + * dir->i_mutex must be held for all of them. */ +#ifdef CONFIG_REISERFS_FS_XATTR +static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->create(dir, dentry, mode, NULL); +} +#endif + +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->mkdir(dir, dentry, mode); +} + +/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr + * mutation ops aren't called during rename or splace, which are the + * only other users of I_MUTEX_CHILD. 
It violates the ordering, but that's + * better than allocating another subclass just for this code. */ +static int xattr_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); + error = dir->i_op->unlink(dir, dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + + if (!error) + d_delete(dentry); + return error; +} + +static int xattr_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); + error = dir->i_op->rmdir(dir, dentry); + if (!error) + dentry->d_inode->i_flags |= S_DEAD; + mutex_unlock(&dentry->d_inode->i_mutex); + if (!error) + d_delete(dentry); + + return error; +} + +#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) + +static struct dentry *open_xa_root(struct super_block *sb, int flags) +{ + struct dentry *privroot = REISERFS_SB(sb)->priv_root; + struct dentry *xaroot; + if (!privroot->d_inode) + return ERR_PTR(-ENODATA); + + mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + + xaroot = dget(REISERFS_SB(sb)->xattr_root); + if (!xaroot) + xaroot = ERR_PTR(-ENODATA); + else if (!xaroot->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(privroot->d_inode, xaroot, 0700); + if (err) { + dput(xaroot); + xaroot = ERR_PTR(err); + } + } + + mutex_unlock(&privroot->d_inode->i_mutex); + return xaroot; +} + +static struct dentry *open_xa_dir(const struct inode *inode, int flags) +{ + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = open_xa_root(inode->i_sb, flags); + if (IS_ERR(xaroot)) + return xaroot; + + snprintf(namebuf, sizeof(namebuf), "%X.%X", + le32_to_cpu(INODE_PKEY(inode)->k_objectid), + inode->i_generation); + + mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (!IS_ERR(xadir) && !xadir->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + if (err) { + dput(xadir); + xadir = ERR_PTR(err); + } + } + + mutex_unlock(&xaroot->d_inode->i_mutex); + dput(xaroot); + return xadir; +} + +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. */ +struct reiserfs_dentry_buf { + struct dentry *xadir; + int count; + struct dentry *dentries[8]; +}; + +static int +fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct reiserfs_dentry_buf *dbuf = buf; + struct dentry *dentry; + WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); + + if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) + return -ENOSPC; + + if (name[0] == '.' && (name[1] == '\0' || + (name[1] == '.' && name[2] == '\0'))) + return 0; + + dentry = lookup_one_len(name, dbuf->xadir, namelen); + if (IS_ERR(dentry)) { + return PTR_ERR(dentry); + } else if (!dentry->d_inode) { + /* A directory entry exists, but no file? 
*/ + reiserfs_error(dentry->d_sb, "xattr-20003", + "Corrupted directory: xattr %s listed but " + "not found for file %s.\n", + dentry->d_name.name, dbuf->xadir->d_name.name); + dput(dentry); + return -EIO; + } + + dbuf->dentries[dbuf->count++] = dentry; + return 0; +} + +static void +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) +{ + int i; + for (i = 0; i < buf->count; i++) + if (buf->dentries[i]) + dput(buf->dentries[i]); +} + +static int reiserfs_for_each_xattr(struct inode *inode, + int (*action)(struct dentry *, void *), + void *data) +{ + struct dentry *dir; + int i, err = 0; + loff_t pos = 0; + struct reiserfs_dentry_buf buf = { + .count = 0, + }; + + /* Skip out, an xattr has no xattrs associated with it */ + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) + return 0; + + reiserfs_write_unlock(inode->i_sb); + dir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + reiserfs_write_lock(inode->i_sb); + goto out; + } else if (!dir->d_inode) { + err = 0; + reiserfs_write_lock(inode->i_sb); + goto out_dir; + } + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + + reiserfs_write_lock(inode->i_sb); + + buf.xadir = dir; + err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); + while ((err == 0 || err == -ENOSPC) && buf.count) { + err = 0; + + for (i = 0; i < buf.count && buf.dentries[i]; i++) { + int lerr = 0; + struct dentry *dentry = buf.dentries[i]; + + if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) + lerr = action(dentry, data); + + dput(dentry); + buf.dentries[i] = NULL; + err = lerr ?: err; + } + buf.count = 0; + if (!err) + err = reiserfs_readdir_dentry(dir, &buf, + fill_with_dentries, &pos); + } + mutex_unlock(&dir->d_inode->i_mutex); + + /* Clean up after a failed readdir */ + cleanup_dentry_buf(&buf); + + if (!err) { + /* We start a transaction here to avoid a ABBA situation + * between the xattr root's i_mutex and the journal lock. + * This doesn't incur much additional overhead since the + * new transaction will just nest inside the + * outer transaction. */ + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + err = journal_begin(&th, inode->i_sb, blocks); + if (!err) { + int jerror; + reiserfs_mutex_lock_nested_safe( + &dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR, inode->i_sb); + err = action(dir, data); + jerror = journal_end(&th, inode->i_sb, blocks); + mutex_unlock(&dir->d_parent->d_inode->i_mutex); + err = jerror ?: err; + } + } +out_dir: + dput(dir); +out: + /* -ENODATA isn't an error */ + if (err == -ENODATA) + err = 0; + return err; +} + +static int delete_one_xattr(struct dentry *dentry, void *data) +{ + struct inode *dir = dentry->d_parent->d_inode; + + /* This is the xattr dir, handle specially. */ + if (S_ISDIR(dentry->d_inode->i_mode)) + return xattr_rmdir(dir, dentry); + + return xattr_unlink(dir, dentry); +} + +static int chown_one_xattr(struct dentry *dentry, void *data) +{ + struct iattr *attrs = data; + return reiserfs_setattr(dentry, attrs); +} + +/* No i_mutex, but the inode is unconnected. 
*/ +int reiserfs_delete_xattrs(struct inode *inode) +{ + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + if (err) + reiserfs_warning(inode->i_sb, "jdm-20004", + "Couldn't delete all xattrs (%d)\n", err); + return err; +} + +/* inode->i_mutex: down */ +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) +{ + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + if (err) + reiserfs_warning(inode->i_sb, "jdm-20007", + "Couldn't chown all xattrs (%d)\n", err); + return err; +} + +#ifdef CONFIG_REISERFS_FS_XATTR +/* Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned. */ +static struct dentry *xattr_lookup(struct inode *inode, const char *name, + int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir(inode, flags); + if (IS_ERR(xadir)) + return ERR_CAST(xadir); + + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + xafile = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(xafile)) { + err = PTR_ERR(xafile); + goto out; + } + + if (xafile->d_inode && (flags & XATTR_CREATE)) + err = -EEXIST; + + if (!xafile->d_inode) { + err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_create(xadir->d_inode, xafile, + 0700|S_IFREG); + } + + if (err) + dput(xafile); +out: + mutex_unlock(&xadir->d_inode->i_mutex); + dput(xadir); + if (err) + return ERR_PTR(err); + return xafile; +} + +/* Internal operations on file data */ +static inline void reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page *reiserfs_get_page(struct inode *dir, size_t n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* We can deadlock if we try to free dentries, + and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ + mapping_set_gfp_mask(mapping, GFP_NOFS); + page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (PageError(page)) + goto fail; + } + return page; + + fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 xattr_hash(const char *msg, int len) +{ + return csum_partial(msg, len, 0); +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +static void update_ctime(struct inode *inode) +{ + struct timespec now = current_fs_time(inode->i_sb); + if (inode_unhashed(inode) || !inode->i_nlink || + timespec_equal(&inode->i_ctime, &now)) + return; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static int lookup_and_delete_xattr(struct inode *inode, const char *name) +{ + int err = 0; + struct dentry *dentry, *xadir; + + xadir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(xadir)) + return PTR_ERR(xadir); + + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + dentry = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_dput; + } + + if (dentry->d_inode) { + reiserfs_write_lock(inode->i_sb); + err = xattr_unlink(xadir->d_inode, dentry); + reiserfs_write_unlock(inode->i_sb); + update_ctime(inode); + } + + dput(dentry); +out_dput: + mutex_unlock(&xadir->d_inode->i_mutex); + dput(xadir); + return err; +} + + +/* Generic extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, + 
struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + int err = 0; + struct dentry *dentry; + struct page *page; + char *data; + size_t file_pos = 0; + size_t buffer_pos = 0; + size_t new_size; + __u32 xahash = 0; + + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_write_unlock(inode->i_sb); + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + reiserfs_write_lock(inode->i_sb); + return err; + } + + dentry = xattr_lookup(inode, name, flags); + if (IS_ERR(dentry)) { + reiserfs_write_lock(inode->i_sb); + return PTR_ERR(dentry); + } + + down_write(&REISERFS_I(inode)->i_xattr_sem); + + reiserfs_write_lock(inode->i_sb); + + xahash = xattr_hash(buffer, buffer_size); + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page(dentry->d_inode, file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32(xahash); + } + + err = __reiserfs_write_begin(page, page_offset, chunk + skip); + if (!err) { + if (buffer) + memcpy(data + skip, buffer + buffer_pos, chunk); + err = reiserfs_commit_write(NULL, page, page_offset, + page_offset + chunk + + skip); + } + unlock_page(page); + reiserfs_put_page(page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + new_size = buffer_size + sizeof(struct reiserfs_xattr_header); + if (!err && new_size < i_size_read(dentry->d_inode)) { + struct iattr newattrs = { + .ia_ctime = current_fs_time(inode->i_sb), + .ia_size = new_size, + .ia_valid = ATTR_SIZE | ATTR_CTIME, + }; + + reiserfs_write_unlock(inode->i_sb); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); + down_write(&dentry->d_inode->i_alloc_sem); + reiserfs_write_lock(inode->i_sb); + + err = reiserfs_setattr(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); + mutex_unlock(&dentry->d_inode->i_mutex); + } else + update_ctime(inode); +out_unlock: + up_write(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + return err; +} + +/* We need to start a transaction to maintain lock ordering */ +int reiserfs_xattr_set(struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + + struct reiserfs_transaction_handle th; + int error, error2; + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + + if (!(flags & XATTR_REPLACE)) + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + if (error) { + reiserfs_write_unlock(inode->i_sb); + return error; + } + + error = reiserfs_xattr_set_handle(&th, inode, name, + buffer, buffer_size, flags); + + error2 = journal_end(&th, inode->i_sb, jbegin_count); + if (error == 0) + error = error2; + reiserfs_write_unlock(inode->i_sb); + + return error; +} + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_get(struct inode *inode, const char *name, 
void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct dentry *dentry; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* We can't have xattrs attached to v1 items since they don't have + * generation numbers */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dentry = xattr_lookup(inode, name, XATTR_REPLACE); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + + down_read(&REISERFS_I(inode)->i_xattr_sem); + + isize = i_size_read(dentry->d_inode); + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof(struct reiserfs_xattr_header); + goto out_unlock; + } + + if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_unlock; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page(dentry->d_inode, file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. */ + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { + unlock_page(page); + reiserfs_put_page(page); + reiserfs_warning(inode->i_sb, "jdm-20001", + "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY(inode)); + err = -EIO; + goto out_unlock; + } + hash = le32_to_cpu(rxh->h_hash); + } + memcpy(buffer + buffer_pos, data + skip, chunk); + unlock_page(page); + reiserfs_put_page(page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof(struct reiserfs_xattr_header); + + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != + hash) { + reiserfs_warning(inode->i_sb, "jdm-20002", + "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY(inode)); + err = -EIO; + } + +out_unlock: + up_read(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + +out: + return err; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. 
+ */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* This is the implementation for the xattr plugin infrastructure */ +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, + const char *name) +{ + const struct xattr_handler *xah; + + if (!handlers) + return NULL; + + for_each_xattr_handler(handlers, xah) { + if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + break; + } + + return xah; +} + + +/* + * Inode operation getxattr() + */ +ssize_t +reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, + size_t size) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Inode operation setxattr() + * + * dentry->d_inode->i_mutex down + */ +int +reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Inode operation removexattr() + * + * dentry->d_inode->i_mutex down + */ +int reiserfs_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); +} + +struct listxattr_buf { + size_t size; + size_t pos; + char *buf; + struct dentry *dentry; +}; + +static int listxattr_filler(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct listxattr_buf *b = (struct listxattr_buf *)buf; + size_t size; + if (name[0] != '.' || + (namelen != 1 && (name[1] != '.' || namelen != 2))) { + const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, + name); + if (!handler) /* Unsupported xattr name */ + return 0; + if (b->buf) { + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); + if (size > b->size) + return -ERANGE; + } else { + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); + } + + b->pos += size; + } + return 0; +} + +/* + * Inode operation listxattr() + * + * We totally ignore the generic listxattr here because it would be stupid + * not to. Since the xattrs are organized in a directory, we can just + * readdir to find them. + */ +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) +{ + struct dentry *dir; + int err = 0; + loff_t pos = 0; + struct listxattr_buf buf = { + .dentry = dentry, + .buf = buffer, + .size = buffer ? 
size : 0, + }; + + if (!dentry->d_inode) + return -EINVAL; + + if (!dentry->d_sb->s_xattr || + get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); + mutex_unlock(&dir->d_inode->i_mutex); + + if (!err) + err = buf.pos; + + dput(dir); +out: + return err; +} + +static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + int error = -EAGAIN; /* do regular unix permission checks by default */ + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + if (!IS_ERR(acl)) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } else if (PTR_ERR(acl) != -ENODATA) + error = PTR_ERR(acl); + } + + return error; +} + +static int create_privroot(struct dentry *dentry) +{ + int err; + struct inode *inode = dentry->d_parent->d_inode; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + + err = xattr_mkdir(inode, dentry, 0700); + if (err || !dentry->d_inode) { + reiserfs_warning(dentry->d_sb, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. " + "Failing mount."); + return -EOPNOTSUPP; + } + + dentry->d_inode->i_flags |= S_PRIVATE; + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); + + return 0; +} + +#else +int __init reiserfs_xattr_register_handlers(void) { return 0; } +void reiserfs_xattr_unregister_handlers(void) {} +static int create_privroot(struct dentry *dentry) { return 0; } +#endif + +/* Actual operations that are exported to VFS-land */ +const struct xattr_handler *reiserfs_xattr_handlers[] = { +#ifdef CONFIG_REISERFS_FS_XATTR + &reiserfs_xattr_user_handler, + &reiserfs_xattr_trusted_handler, +#endif +#ifdef CONFIG_REISERFS_FS_SECURITY + &reiserfs_xattr_security_handler, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + &reiserfs_posix_acl_access_handler, + &reiserfs_posix_acl_default_handler, +#endif + NULL +}; + +static int xattr_mount_check(struct super_block *s) +{ + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (old_format_only(s)) { + if (reiserfs_xattrs_optional(s)) { + /* Old format filesystem, but optional xattrs have + * been enabled. Error out. */ + reiserfs_warning(s, "jdm-2005", + "xattrs/ACLs not supported " + "on pre-v3.6 format filesystems. " + "Failing mount."); + return -EOPNOTSUPP; + } + } + + return 0; +} + +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + +#ifdef CONFIG_REISERFS_FS_XATTR + /* + * Stat data v1 doesn't support ACLs. 
+ */ + if (get_inode_sd_version(inode) != STAT_DATA_V1) + return generic_permission(inode, mask, flags, + reiserfs_check_acl); +#endif + return generic_permission(inode, mask, flags, NULL); +} + +static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + return -EPERM; +} + +static const struct dentry_operations xattr_lookup_poison_ops = { + .d_revalidate = xattr_hide_revalidate, +}; + +int reiserfs_lookup_privroot(struct super_block *s) +{ + struct dentry *dentry; + int err = 0; + + /* If we don't have the privroot located yet - go find it */ + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + REISERFS_SB(s)->priv_root = dentry; + d_set_d_op(dentry, &xattr_lookup_poison_ops); + if (dentry->d_inode) + dentry->d_inode->i_flags |= S_PRIVATE; + } else + err = PTR_ERR(dentry); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + return err; +} + +/* We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. + * mount_flags != mount_options */ +int reiserfs_xattr_init(struct super_block *s, int mount_flags) +{ + int err = 0; + struct dentry *privroot = REISERFS_SB(s)->priv_root; + + err = xattr_mount_check(s); + if (err) + goto error; + + if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + err = create_privroot(REISERFS_SB(s)->priv_root); + mutex_unlock(&s->s_root->d_inode->i_mutex); + } + + if (privroot->d_inode) { + s->s_xattr = reiserfs_xattr_handlers; + reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); + if (!REISERFS_SB(s)->xattr_root) { + struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, + strlen(XAROOT_NAME)); + if (!IS_ERR(dentry)) + REISERFS_SB(s)->xattr_root = dentry; + else + err = PTR_ERR(dentry); + } + mutex_unlock(&privroot->d_inode->i_mutex); + } + +error: + if (err) { + clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + if (reiserfs_posixacl(s)) + s->s_flags |= MS_POSIXACL; + else + s->s_flags &= ~MS_POSIXACL; + + return err; +} diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c new file mode 100644 index 00000000..3dc38f12 --- /dev/null +++ b/fs/reiserfs/xattr_acl.c @@ -0,0 +1,531 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, + struct inode *inode, int type, + struct posix_acl *acl); + +static int +posix_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int error, error2; + struct reiserfs_transaction_handle th; + size_t jcreate_blocks; + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + return PTR_ERR(acl); + } else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + + /* Pessimism: We can't assume that anything from the xattr root up + * has been created. 
*/ + + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, size) * 2; + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jcreate_blocks); + if (error == 0) { + error = reiserfs_set_acl(&th, inode, type, acl); + error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + if (error2) + error = error2; + } + reiserfs_write_unlock(inode->i_sb); + + release_and_out: + posix_acl_release(acl); + return error; +} + +static int +posix_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(dentry->d_sb)) + return -EOPNOTSUPP; + + acl = reiserfs_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *) value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + + fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
+ */ +static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * + sizeof(reiserfs_acl_entry), + GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n = 0; n < acl->a_count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + + fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + */ +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl; + int size; + int retval; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = reiserfs_xattr_get(inode, name, NULL, 0); + if (size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return ERR_PTR(size); + } + + value = kmalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* This shouldn't actually happen as it should have + been caught above.. but just in case */ + acl = NULL; + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = posix_acl_from_disk(value, retval); + } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + */ +static int +reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, + int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); + + /* + * Ensure that the inode gets dirtied if we're only using + * the mode bits and an old ACL didn't exist. We don't need + * to check if the inode is hashed here since we won't get + * called by reiserfs_inherit_default_acl(). 
+ */ + if (error == -ENODATA) { + error = 0; + if (type == ACL_TYPE_ACCESS) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + } + + kfree(value); + + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +/* dir->i_mutex: locked, + * inode is new and not released into the wild yet */ +int +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct posix_acl *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK(inode->i_mode)) + return 0; + + /* ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from */ + if (get_inode_sd_version(dir) == STAT_DATA_V1) + goto apply_umask; + + /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles */ + if (IS_PRIVATE(dir)) { + inode->i_flags |= S_PRIVATE; + goto apply_umask; + } + + acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + struct posix_acl *acl_copy; + mode_t mode = inode->i_mode; + int need_acl; + + /* Copy the default ACL to the default ACL of a new directory */ + if (S_ISDIR(inode->i_mode)) { + err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + acl); + if (err) + goto cleanup; + } + + /* Now we reconcile the new ACL and the mode, + potentially modifying both */ + acl_copy = posix_acl_clone(acl, GFP_NOFS); + if (!acl_copy) { + err = -ENOMEM; + goto cleanup; + } + + need_acl = posix_acl_create_masq(acl_copy, &mode); + if (need_acl >= 0) { + if (mode != inode->i_mode) { + inode->i_mode = mode; + } + + /* If we need an ACL.. */ + if (need_acl > 0) { + err = reiserfs_set_acl(th, inode, + ACL_TYPE_ACCESS, + acl_copy); + if (err) + goto cleanup_copy; + } + } + cleanup_copy: + posix_acl_release(acl_copy); + cleanup: + posix_acl_release(acl); + } else { + apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + } + + return err; +} + +/* This is used to cache the default acl before a new object is created. + * The biggest reason for this is to get an idea of how many blocks will + * actually be required for the create operation if we must inherit an ACL. + * An ACL write can add up to 3 object creations and an additional file write + * so we'd prefer not to reserve that many blocks in the journal if we can. + * It also has the advantage of not loading the ACL with a transaction open, + * this may seem silly, but if the owner of the directory is doing the + * creation, the ACL may not be loaded since the permissions wouldn't require + * it. + * We return the number of blocks required for the transaction. + */ +int reiserfs_cache_default_acl(struct inode *inode) +{ + struct posix_acl *acl; + int nblocks = 0; + + if (IS_PRIVATE(inode)) + return 0; + + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + + if (acl && !IS_ERR(acl)) { + int size = reiserfs_acl_size(acl->a_count); + + /* Other xattrs can be created during inode creation. We don't + * want to claim too many blocks, so we check to see if we + * we need to create the tree to the xattrs, and then we + * just want two files. 
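Seen from user space, the inheritance implemented above is the usual POSIX default-ACL behaviour: a default ACL set on a directory becomes the access ACL of files created inside it, masked against the creation mode. A small sketch using the libacl API, which is an assumption here (sys/acl.h, link with -lacl) rather than anything this patch provides, makes that easy to observe:

#include <stdio.h>
#include <sys/acl.h>

/* Print one ACL of the given type as text, or the error if it can't be read. */
static void dump(const char *path, acl_type_t type, const char *label)
{
    acl_t acl = acl_get_file(path, type);
    char *text;

    if (!acl) {
        perror(label);
        return;
    }
    text = acl_to_text(acl, NULL);
    printf("%s:\n%s\n", label, text ? text : "(none)");
    acl_free(text);
    acl_free(acl);
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <dir> <file-created-in-dir>\n", argv[0]);
        return 1;
    }
    dump(argv[1], ACL_TYPE_DEFAULT, "directory default ACL");
    dump(argv[2], ACL_TYPE_ACCESS, "new file access ACL");
    return 0;
}

On a filesystem mounted with ACL support, the second dump should reflect the first, which is exactly the work reiserfs_inherit_default_acl() does at create time.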
*/ + nblocks = reiserfs_xattr_jcreate_nblocks(inode); + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* We need to account for writes + bitmaps for two files */ + nblocks += reiserfs_xattr_nblocks(inode, size) * 4; + posix_acl_release(acl); + } + + return nblocks; +} + +int reiserfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) { + return 0; + } + + reiserfs_write_unlock(inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_write_lock(inode->i_sb); + if (!acl) + return 0; + if (IS_ERR(acl)) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_NOFS); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + struct reiserfs_transaction_handle th; + size_t size = reiserfs_xattr_nblocks(inode, + reiserfs_acl_size(clone->a_count)); + int depth; + + depth = reiserfs_write_lock_once(inode->i_sb); + error = journal_begin(&th, inode->i_sb, size * 2); + if (!error) { + int error2; + error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, + clone); + error2 = journal_end(&th, inode->i_sb, size * 2); + if (error2) + error = error2; + } + reiserfs_write_unlock_once(inode->i_sb, depth); + } + posix_acl_release(clone); + return error; +} + +static size_t posix_acl_access_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + if (!reiserfs_posixacl(dentry->d_sb)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +const struct xattr_handler reiserfs_posix_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = posix_acl_get, + .set = posix_acl_set, + .list = posix_acl_access_list, +}; + +static size_t posix_acl_default_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + if (!reiserfs_posixacl(dentry->d_sb)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +const struct xattr_handler reiserfs_posix_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = posix_acl_get, + .set = posix_acl_set, + .list = posix_acl_default_list, +}; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c new file mode 100644 index 00000000..ef66c18a --- /dev/null +++ b/fs/reiserfs/xattr_security.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); +} + +static int +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); +} + +static 
size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) +{ + const size_t len = namelen + 1; + + if (IS_PRIVATE(dentry->d_inode)) + return 0; + + if (list && len <= list_len) { + memcpy(list, name, namelen); + list[namelen] = '\0'; + } + + return len; +} + +/* Initializes the security context for a new inode and returns the number + * of blocks needed for the transaction. If successful, reiserfs_security + * must be released using reiserfs_security_free when the caller is done. */ +int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec) +{ + int blocks = 0; + int error; + + sec->name = NULL; + + /* Don't add selinux attributes on xattrs - they'll never get used */ + if (IS_PRIVATE(dir)) + return 0; + + error = security_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); + if (error) { + if (error == -EOPNOTSUPP) + error = 0; + + sec->name = NULL; + sec->value = NULL; + sec->length = 0; + return error; + } + + if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) { + blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, sec->length); + /* We don't want to count the directories twice if we have + * a default ACL. */ + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + } + return blocks; +} + +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + int error; + if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + sec->length, XATTR_CREATE); + if (error == -ENODATA || error == -EOPNOTSUPP) + error = 0; + + return error; +} + +void reiserfs_security_free(struct reiserfs_security_handle *sec) +{ + kfree(sec->name); + kfree(sec->value); + sec->name = NULL; + sec->value = NULL; +} + +const struct xattr_handler reiserfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = security_get, + .set = security_set, + .list = security_list, +}; diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c new file mode 100644 index 00000000..9883736c --- /dev/null +++ b/fs/reiserfs/xattr_trusted.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static int +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); +} + +static int +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); +} + +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + return 0; + + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + +const struct xattr_handler reiserfs_xattr_trusted_handler 
= { + .prefix = XATTR_TRUSTED_PREFIX, + .get = trusted_get, + .set = trusted_set, + .list = trusted_list, +}; diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c new file mode 100644 index 00000000..45ae1a00 --- /dev/null +++ b/fs/reiserfs/xattr_user.c @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include +#include + +static int +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); +} + +static int +user_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); +} + +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return 0; + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + +const struct xattr_handler reiserfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = user_get, + .set = user_set, + .list = user_list, +}; diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig new file mode 100644 index 00000000..ce2d6bcc --- /dev/null +++ b/fs/romfs/Kconfig @@ -0,0 +1,62 @@ +config ROMFS_FS + tristate "ROM file system support" + depends on BLOCK || MTD + ---help--- + This is a very small read-only file system mainly intended for + initial ram disks of installation disks, but it could be used for + other read-only media as well. Read + for details. + + To compile this file system support as a module, choose M here: the + module will be called romfs. Note that the file system of your + root partition (the one containing the directory /) cannot be a + module. + + If you don't know whether you need it, then you don't need it: + answer N. + +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_MTD + bool "MTD-backed ROM file system support" + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. 
+ +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile new file mode 100644 index 00000000..420beb7d --- /dev/null +++ b/fs/romfs/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux RomFS filesystem routines. +# + +obj-$(CONFIG_ROMFS_FS) += romfs.o + +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h new file mode 100644 index 00000000..95217b83 --- /dev/null +++ b/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c new file mode 100644 index 00000000..eed99428 --- /dev/null +++ b/fs/romfs/mmap-nommu.c @@ -0,0 +1,79 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset, maxpages, lpages; + + if (!mtd) + goto cant_map_directly; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) + return (unsigned long) -EINVAL; + + /* we need to call down to the MTD layer to do the actual mapping */ + if (mtd->get_unmapped_area) { + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + +cant_map_directly: + return (unsigned long) -ENOSYS; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, +}; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c new file mode 100644 index 00000000..71e2b4d5 --- /dev/null +++ b/fs/romfs/storage.c @@ -0,0 +1,293 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) + +/* + * read data from an romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ? 
-EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[17]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time, and attempt to grab the + * trailing NUL whilst we're at it */ + buf[0] = 0xff; + + while (size > 0) { + segment = min_t(size_t, size + 1, 17); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + len--; + if (memcmp(buf, str, len) != 0) + return 0; + buf[0] = buf[len]; + size -= len; + pos += len; + str += len; + } + + /* check the trailing NUL was */ + if (buf[0]) + return 0; + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from an romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the string up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buf += segment; + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool matched, terminated = false; + + /* compare string up to a block at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = (memcmp(bh->b_data + offset, str, segment) == 0); + + size -= segment; + pos += segment; + str += segment; + if (matched && size == 0 && offset + segment < ROMBSIZE) { + if (!bh->b_data[offset + segment]) + terminated = true; + else + matched = false; + } + brelse(bh); + if 
(!matched) + return 0; + } + + if (!terminated) { + /* the terminating NUL must be on the first byte of the next + * block */ + BUG_ON((pos & (ROMBSIZE - 1)) != 0); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = !bh->b_data[0]; + brelse(bh); + if (!matched) + return 0; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) + maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, maxlen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, maxlen); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - the string to be compared to, str, may not be NUL-terminated; instead the + * string is of the specified size + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size + 1 > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strcmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strcmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/fs/romfs/super.c b/fs/romfs/super.c new file mode 100644 index 00000000..2305e312 --- /dev/null +++ b/fs/romfs/super.c @@ -0,0 +1,662 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? 
PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *i = filp->f_dentry->d_inode; + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + int stored = 0; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = filp->f_pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... */ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + filp->f_pos = offset; + goto out; + } + filp->f_pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + goto out; + + stored++; + offset = nextfh & ROMFH_MASK; + } + +out: + return stored; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). 
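Both readdir and lookup above walk the same on-image structure: 16-byte big-endian file headers (next, spec, size, checksum) followed by a NUL-padded name, chained through the upper bits of next, with the low three bits of next giving the type. A stand-alone sketch that lists the root directory of a romfs image file the same way (the helper names are invented, error handling is minimal, and the volume label is assumed to fit the buffer):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t be32(const unsigned char *p)
{
    return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
}

static int read_at(FILE *f, long pos, void *buf, size_t n)
{
    return fseek(f, pos, SEEK_SET) == 0 && fread(buf, 1, n, f) == n;
}

int main(int argc, char **argv)
{
    unsigned char hdr[16], name[64];
    FILE *f;
    long pos;

    if (argc != 2 || !(f = fopen(argv[1], "rb")))
        return 1;
    if (!read_at(f, 0, hdr, 16) || memcmp(hdr, "-rom1fs-", 8) != 0)
        return 1;                     /* not a romfs image */
    if (!read_at(f, 16, name, sizeof(name)))
        return 1;                     /* volume label, NUL-padded to 16 bytes */
    /* first file header: 16-byte superblock plus the padded label */
    pos = (16 + (long)strlen((char *)name) + 1 + 15) & ~15L;
    if (!read_at(f, pos, hdr, 16))    /* root directory's own header */
        return 1;
    pos = be32(hdr + 4) & ~15L;       /* spec points at its first entry */
    while (pos) {
        if (!read_at(f, pos, hdr, 16) || !read_at(f, pos + 16, name, sizeof(name)))
            break;
        printf("%-16s type %lu size %lu\n", (char *)name,
               (unsigned long)(be32(hdr) & 7), (unsigned long)be32(hdr + 8));
        pos = be32(hdr) & ~15L;       /* next entry; 0 ends the chain */
    }
    fclose(f);
    return 0;
}

Type 1 is a directory and type 2 a regular file, matching romfs_modemap above; "." and ".." appear as hard links (type 0) whose spec points back at the real header.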
+ */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .readdir = romfs_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh; + int ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + i->i_nlink = 1; /* Hard to decide.. */ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (i->i_sb->s_mtd) + i->i_data.backing_dev_info = + i->i_sb->s_mtd->backing_dev_info; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? 
&inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +static void romfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, romfs_i_callback); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree; + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + printk(KERN_WARNING "VFS:" + " Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ? 
"MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (IS_ERR(root)) + goto error; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto error_i; + + return 0; + +error_i: + iput(root); +error: + return -EINVAL; +error_rsb_inval: + ret = -EINVAL; +error_rsb: + kfree(rsb); + return ret; +} + +/* + * get a superblock for mounting + */ +static struct dentry *romfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + struct dentry *ret = ERR_PTR(-EINVAL); + +#ifdef CONFIG_ROMFS_ON_MTD + ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == ERR_PTR(-EINVAL)) + ret = mount_bdev(fs_type, flags, dev_name, data, + romfs_fill_super); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .mount = romfs_mount, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + printk(KERN_ERR + "ROMFS error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ diff --git a/fs/select.c b/fs/select.c new file mode 100644 index 00000000..d33418fd --- /dev/null +++ b/fs/select.c @@ -0,0 +1,999 @@ +/* + * This file contains the procedures for the handling of select and poll + * + * Created for Linux based loosely upon Mathius Lattner's minix + * patches by Peter MacDonald. Heavily edited by Linus. + * + * 4 February 1994 + * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS + * flag set in its personality we do *not* modify the given timeout + * parameter to reflect time remaining. + * + * 24 January 2000 + * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation + * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). 
+ */ + +#include +#include +#include +#include +#include +#include +#include /* for STICKY_TIMEOUTS */ +#include +#include +#include +#include +#include + +#include + + +/* + * Estimate expected accuracy in ns from a timeval. + * + * After quite a bit of churning around, we've settled on + * a simple thing of taking 0.1% of the timeout as the + * slack, with a cap of 100 msec. + * "nice" tasks get a 0.5% slack instead. + * + * Consider this comment an open invitation to come up with even + * better solutions.. + */ + +#define MAX_SLACK (100 * NSEC_PER_MSEC) + +static long __estimate_accuracy(struct timespec *tv) +{ + long slack; + int divfactor = 1000; + + if (tv->tv_sec < 0) + return 0; + + if (task_nice(current) > 0) + divfactor = divfactor / 5; + + if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) + return MAX_SLACK; + + slack = tv->tv_nsec / divfactor; + slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + + if (slack > MAX_SLACK) + return MAX_SLACK; + + return slack; +} + +long select_estimate_accuracy(struct timespec *tv) +{ + unsigned long ret; + struct timespec now; + + /* + * Realtime tasks get a slack of 0 for obvious reasons. + */ + + if (rt_task(current)) + return 0; + + ktime_get_ts(&now); + now = timespec_sub(*tv, now); + ret = __estimate_accuracy(&now); + if (ret < current->timer_slack_ns) + return current->timer_slack_ns; + return ret; +} + + + +struct poll_table_page { + struct poll_table_page * next; + struct poll_table_entry * entry; + struct poll_table_entry entries[0]; +}; + +#define POLL_TABLE_FULL(table) \ + ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) + +/* + * Ok, Peter made a complicated, but straightforward multiple_wait() function. + * I have rewritten this, taking some shortcuts: This code may not be easy to + * follow, but it should be free of race-conditions, and it's practical. If you + * understand what I'm doing here, then you understand how the linux + * sleep/wakeup mechanism works. + * + * Two very simple procedures, poll_wait() and poll_freewait() make all the + * work. poll_wait() is an inline-function defined in , + * as all select/poll functions have to call it to add an entry to the + * poll table. 
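To put numbers on the slack policy from select_estimate_accuracy() above: with divfactor 1000 (200 for positively niced tasks) a 2 s timeout earns 2 ms of slack, and anything beyond 100 s hits the 100 ms cap. A stand-alone rendering of the same arithmetic, with a plain niced flag standing in for the task_nice() check:

#include <stdio.h>

#define NSEC_PER_SEC  1000000000L
#define NSEC_PER_MSEC 1000000L
#define MAX_SLACK     (100 * NSEC_PER_MSEC)

/* Mirrors __estimate_accuracy(): 0.1% of the timeout (0.5% when niced),
 * capped at 100 ms, zero for negative timeouts. */
static long estimate_slack(long sec, long nsec, int niced)
{
    long divfactor = niced ? 200 : 1000;
    long slack;

    if (sec < 0)
        return 0;
    if (sec > MAX_SLACK / (NSEC_PER_SEC / divfactor))
        return MAX_SLACK;
    slack = nsec / divfactor + sec * (NSEC_PER_SEC / divfactor);
    return slack > MAX_SLACK ? MAX_SLACK : slack;
}

int main(void)
{
    printf("10 ms -> %ld ns\n", estimate_slack(0, 10 * NSEC_PER_MSEC, 0)); /* 10000 ns */
    printf("2 s   -> %ld ns\n", estimate_slack(2, 0, 0));                  /* 2 ms */
    printf("500 s -> %ld ns\n", estimate_slack(500, 0, 0));                /* capped at 100 ms */
    return 0;
}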
+ */ +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p); + +void poll_initwait(struct poll_wqueues *pwq) +{ + init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; + pwq->triggered = 0; + pwq->error = 0; + pwq->table = NULL; + pwq->inline_index = 0; +} +EXPORT_SYMBOL(poll_initwait); + +static void free_poll_entry(struct poll_table_entry *entry) +{ + remove_wait_queue(entry->wait_address, &entry->wait); + fput(entry->filp); +} + +void poll_freewait(struct poll_wqueues *pwq) +{ + struct poll_table_page * p = pwq->table; + int i; + for (i = 0; i < pwq->inline_index; i++) + free_poll_entry(pwq->inline_entries + i); + while (p) { + struct poll_table_entry * entry; + struct poll_table_page *old; + + entry = p->entry; + do { + entry--; + free_poll_entry(entry); + } while (entry > p->entries); + old = p; + p = p->next; + free_page((unsigned long) old); + } +} +EXPORT_SYMBOL(poll_freewait); + +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) +{ + struct poll_table_page *table = p->table; + + if (p->inline_index < N_INLINE_POLL_ENTRIES) + return p->inline_entries + p->inline_index++; + + if (!table || POLL_TABLE_FULL(table)) { + struct poll_table_page *new_table; + + new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); + if (!new_table) { + p->error = -ENOMEM; + return NULL; + } + new_table->entry = new_table->entries; + new_table->next = table; + p->table = new_table; + table = new_table; + } + + return table->entry++; +} + +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + +/* Add a new entry */ +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p) +{ + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); + if (!entry) + return; + get_file(filp); + entry->filp = filp; + entry->wait_address = wait_address; + entry->key = p->key; + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; + add_wait_queue(wait_address, &entry->wait); +} + +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. 
+ * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + +/** + * poll_select_set_timeout - helper function to setup the timeout value + * @to: pointer to timespec variable for the final timeout + * @sec: seconds (from user space) + * @nsec: nanoseconds (from user space) + * + * Note, we do not use a timespec for the user space value here, That + * way we can use the function for timeval and compat interfaces as well. + * + * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. + */ +int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +{ + struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + + if (!timespec_valid(&ts)) + return -EINVAL; + + /* Optimize for the zero timeout value here */ + if (!sec && !nsec) { + to->tv_sec = to->tv_nsec = 0; + } else { + ktime_get_ts(to); + *to = timespec_add_safe(*to, ts); + } + return 0; +} + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec rts; + struct timeval rtv; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&rts); + rts = timespec_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; + + if (timeval) { + if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) + memset(&rtv, 0, sizeof(rtv)); + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + + } else if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. 
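The write-back handled here is directly observable from user space: unless the calling personality has STICKY_TIMEOUTS set, Linux select() rewrites the caller's timeval with whatever time was left when it returned. A quick check, nothing beyond a standard libc assumed:

#include <stdio.h>
#include <sys/select.h>

int main(void)
{
    struct timeval tv = { .tv_sec = 1, .tv_usec = 500000 };
    fd_set rfds;

    FD_ZERO(&rfds);
    FD_SET(0, &rfds);                 /* wait for stdin, or for the timeout */
    select(1, &rfds, NULL, NULL, &tv);
    /* If stdin stayed quiet, tv is now {0, 0}; if input arrived early,
     * tv holds roughly the unused part of the 1.5 s timeout. */
    printf("remaining: %ld.%06ld s\n", (long)tv.tv_sec, (long)tv.tv_usec);
    return 0;
}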
+ */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +#define FDS_IN(fds, n) (fds->in + n) +#define FDS_OUT(fds, n) (fds->out + n) +#define FDS_EX(fds, n) (fds->ex + n) + +#define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) + +static int max_select_fd(unsigned long n, fd_set_bits *fds) +{ + unsigned long *open_fds; + unsigned long set; + int max; + struct fdtable *fdt; + + /* handle last in-complete long-word first */ + set = ~(~0UL << (n & (__NFDBITS-1))); + n /= __NFDBITS; + fdt = files_fdtable(current->files); + open_fds = fdt->open_fds->fds_bits+n; + max = 0; + if (set) { + set &= BITS(fds, n); + if (set) { + if (!(set & ~*open_fds)) + goto get_max; + return -EBADF; + } + } + while (n) { + open_fds--; + n--; + set = BITS(fds, n); + if (!set) + continue; + if (set & ~*open_fds) + return -EBADF; + if (max) + continue; +get_max: + do { + max++; + set >>= 1; + } while (set); + max += n * __NFDBITS; + } + + return max; +} + +#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR) +#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) +#define POLLEX_SET (POLLPRI) + +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + +int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +{ + ktime_t expire, *to = NULL; + struct poll_wqueues table; + poll_table *wait; + int retval, i, timed_out = 0; + unsigned long slack = 0; + + rcu_read_lock(); + retval = max_select_fd(n, fds); + rcu_read_unlock(); + + if (retval < 0) + return retval; + n = retval; + + poll_initwait(&table); + wait = &table.pt; + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + wait = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = select_estimate_accuracy(end_time); + + retval = 0; + for (;;) { + unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + + inp = fds->in; outp = fds->out; exp = fds->ex; + rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; + + for (i = 0; i < n; ++rinp, ++routp, ++rexp) { + unsigned long in, out, ex, all_bits, bit = 1, mask, j; + unsigned long res_in = 0, res_out = 0, res_ex = 0; + const struct file_operations *f_op = NULL; + struct file *file = NULL; + + in = *inp++; out = *outp++; ex = *exp++; + all_bits = in | out | ex; + if (all_bits == 0) { + i += __NFDBITS; + continue; + } + + for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { + int fput_needed; + if (i >= n) + break; + if (!(bit & all_bits)) + continue; + file = fget_light(i, &fput_needed); + if (file) { + f_op = file->f_op; + mask = DEFAULT_POLLMASK; + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } + fput_light(file, fput_needed); + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait = NULL; + } + } + } + if (res_in) + *rinp = res_in; + if (res_out) + *routp = res_out; + if (res_ex) + *rexp = res_ex; + cond_resched(); + } + wait = NULL; + if (retval || timed_out || signal_pending(current)) + break; + if (table.error) { + retval = table.error; + break; + } + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + 
* pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) + timed_out = 1; + } + + poll_freewait(&table); + + return retval; +} + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int ret, max_fds; + unsigned int size; + struct fdtable *fdt; + /* Allocate small arguments on the stack to save memory and be faster */ + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + ret = -EINVAL; + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; + bits = kmalloc(6 * size, GFP_KERNEL); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; + + if ((ret = get_fd_set(n, inp, fds.in)) || + (ret = get_fd_set(n, outp, fds.out)) || + (ret = get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (set_fd_set(n, inp, fds.res_in) || + set_fd_set(n, outp, fds.res_out) || + set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; + +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +#ifdef HAVE_SET_RESTORE_SIGMASK +static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec __user *tsp, + const sigset_t __user *sigmask, size_t sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + /* XXX: Don't preclude handling different sized sigset_t's. 
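What the sigmask handling in do_pselect() buys user space is the race-free wait pselect() is known for: the temporary mask is installed, the wait happens, and the original mask only comes back once any signal that arrived has been delivered. A typical caller keeps SIGINT blocked and opens it only for the duration of the wait; a sketch using standard POSIX calls only (the handler name is made up):

#include <signal.h>
#include <stdio.h>
#include <sys/select.h>

static void on_sigint(int sig)
{
    (void)sig;                        /* a handler exists so SIGINT interrupts, not kills */
}

int main(void)
{
    struct sigaction sa = { 0 };
    sigset_t block_int, waitmask;
    fd_set rfds;

    sa.sa_handler = on_sigint;
    sigaction(SIGINT, &sa, NULL);

    sigemptyset(&block_int);
    sigaddset(&block_int, SIGINT);
    sigprocmask(SIG_BLOCK, &block_int, &waitmask);   /* waitmask = previous mask */
    sigdelset(&waitmask, SIGINT);                    /* ...with SIGINT allowed */

    FD_ZERO(&rfds);
    FD_SET(0, &rfds);
    if (pselect(1, &rfds, NULL, NULL, NULL, &waitmask) < 0)
        perror("pselect");            /* EINTR: SIGINT arrived during the wait */
    return 0;
}

The libc wrapper hides the six-argument pselect6 marshalling; the caller never sees the {sigset_t *, size_t} pair described below.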
*/ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +/* + * Most architectures can't handle 7-argument syscalls. So we provide a + * 6-argument version where the sixth argument is a pointer to a structure + * which has a pointer to the sigset_t itself followed by a size_t containing + * the sigset size. + */ +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) +{ + size_t sigsetsize = 0; + sigset_t __user *up = NULL; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) + || __get_user(up, (sigset_t __user * __user *)sig) + || __get_user(sigsetsize, + (size_t __user *)(sig+sizeof(void *)))) + return -EFAULT; + } + + return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ + +#ifdef __ARCH_WANT_SYS_OLD_SELECT +struct sel_arg_struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; +}; + +SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) +{ + struct sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); +} +#endif + +struct poll_list { + struct poll_list *next; + int len; + struct pollfd entries[0]; +}; + +#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) + +/* + * Fish for pollable events on the pollfd->fd file descriptor. We're only + * interested in events matching the pollfd->events mask, and the result + * matching that mask is both recorded in pollfd->revents and returned. The + * pwait poll_table will be used by the fd-provided poll handler for waiting, + * if non-NULL. + */ +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +{ + unsigned int mask; + int fd; + + mask = 0; + fd = pollfd->fd; + if (fd >= 0) { + int fput_needed; + struct file * file; + + file = fget_light(fd, &fput_needed); + mask = POLLNVAL; + if (file != NULL) { + mask = DEFAULT_POLLMASK; + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; + mask = file->f_op->poll(file, pwait); + } + /* Mask out unneeded events. 
*/ + mask &= pollfd->events | POLLERR | POLLHUP; + fput_light(file, fput_needed); + } + } + pollfd->revents = mask; + + return mask; +} + +static int do_poll(unsigned int nfds, struct poll_list *list, + struct poll_wqueues *wait, struct timespec *end_time) +{ + poll_table* pt = &wait->pt; + ktime_t expire, *to = NULL; + int timed_out = 0, count = 0; + unsigned long slack = 0; + + /* Optimise the no-wait case */ + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + pt = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = select_estimate_accuracy(end_time); + + for (;;) { + struct poll_list *walk; + + for (walk = list; walk != NULL; walk = walk->next) { + struct pollfd * pfd, * pfd_end; + + pfd = walk->entries; + pfd_end = pfd + walk->len; + for (; pfd != pfd_end; pfd++) { + /* + * Fish for events. If we found one, record it + * and kill the poll_table, so we don't + * needlessly register any other waiters after + * this. They'll get immediately deregistered + * when we break out and return. + */ + if (do_pollfd(pfd, pt)) { + count++; + pt = NULL; + } + } + } + /* + * All waiters have already been registered, so don't provide + * a poll_table to them on the next loop iteration. + */ + pt = NULL; + if (!count) { + count = wait->error; + if (signal_pending(current)) + count = -EINTR; + } + if (count || timed_out) + break; + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) + timed_out = 1; + } + return count; +} + +#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ + sizeof(struct pollfd)) + +int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, + struct timespec *end_time) +{ + struct poll_wqueues table; + int err = -EFAULT, fdcount, len, size; + /* Allocate small arguments on the stack to save memory and be + faster - use long to make sure the buffer is aligned properly + on 64 bit archs to avoid unaligned access */ + long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; + struct poll_list *const head = (struct poll_list *)stack_pps; + struct poll_list *walk = head; + unsigned long todo = nfds; + + if (nfds > rlimit(RLIMIT_NOFILE)) + return -EINVAL; + + len = min_t(unsigned int, nfds, N_STACK_PPS); + for (;;) { + walk->next = NULL; + walk->len = len; + if (!len) + break; + + if (copy_from_user(walk->entries, ufds + nfds-todo, + sizeof(struct pollfd) * walk->len)) + goto out_fds; + + todo -= walk->len; + if (!todo) + break; + + len = min(todo, POLLFD_PER_PAGE); + size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; + walk = walk->next = kmalloc(size, GFP_KERNEL); + if (!walk) { + err = -ENOMEM; + goto out_fds; + } + } + + poll_initwait(&table); + fdcount = do_poll(nfds, head, &table, end_time); + poll_freewait(&table); + + for (walk = head; walk; walk = walk->next) { + struct pollfd *fds = walk->entries; + int j; + + for (j = 0; j < walk->len; j++, ufds++) + if (__put_user(fds[j].revents, &ufds->revents)) + goto out_fds; + } + + err = fdcount; +out_fds: + walk = head->next; + while (walk) { + struct poll_list *pos = walk; + walk = walk->next; + kfree(pos); + } + + return err; +} + +static long do_restart_poll(struct restart_block *restart_block) +{ + struct pollfd __user *ufds = restart_block->poll.ufds; + int nfds = restart_block->poll.nfds; + struct timespec *to = NULL, end_time; + 
int ret; + + if (restart_block->poll.has_timeout) { + end_time.tv_sec = restart_block->poll.tv_sec; + end_time.tv_nsec = restart_block->poll.tv_nsec; + to = &end_time; + } + + ret = do_sys_poll(ufds, nfds, to); + + if (ret == -EINTR) { + restart_block->fn = do_restart_poll; + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, + long, timeout_msecs) +{ + struct timespec end_time, *to = NULL; + int ret; + + if (timeout_msecs >= 0) { + to = &end_time; + poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, + NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); + } + + ret = do_sys_poll(ufds, nfds, to); + + if (ret == -EINTR) { + struct restart_block *restart_block; + + restart_block = ¤t_thread_info()->restart_block; + restart_block->fn = do_restart_poll; + restart_block->poll.ufds = ufds; + restart_block->poll.nfds = nfds; + + if (timeout_msecs >= 0) { + restart_block->poll.tv_sec = end_time.tv_sec; + restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.has_timeout = 1; + } else + restart_block->poll.has_timeout = 0; + + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +#ifdef HAVE_SET_RESTORE_SIGMASK +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ diff --git a/fs/seq_file.c b/fs/seq_file.c new file mode 100644 index 00000000..dba43c3e --- /dev/null +++ b/fs/seq_file.c @@ -0,0 +1,821 @@ +/* + * linux/fs/seq_file.c + * + * helper functions for making synthetic files from sequences of records. + * initial implementation -- AV, Oct 2001. + */ + +#include +#include +#include +#include + +#include +#include + +/** + * seq_open - initialize sequential file + * @file: file we initialize + * @op: method table describing the sequence + * + * seq_open() sets @file, associating it with a sequence described + * by @op. @op->start() sets the iterator up and returns the first + * element of sequence. @op->stop() shuts it down. @op->next() + * returns the next element of sequence. @op->show() prints element + * into the buffer. In case of error ->start() and ->next() return + * ERR_PTR(error). In the end of sequence they return %NULL. ->show() + * returns 0 in case of success and negative number in case of error. 
+ * Returning SEQ_SKIP means "discard this element and move on". + */ +int seq_open(struct file *file, const struct seq_operations *op) +{ + struct seq_file *p = file->private_data; + + if (!p) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + file->private_data = p; + } + memset(p, 0, sizeof(*p)); + mutex_init(&p->lock); + p->op = op; + + /* + * Wrappers around seq_open(e.g. swaps_open) need to be + * aware of this. If they set f_version themselves, they + * should call seq_open first and then set f_version. + */ + file->f_version = 0; + + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; + return 0; +} +EXPORT_SYMBOL(seq_open); + +static int traverse(struct seq_file *m, loff_t offset) +{ + loff_t pos = 0, index; + int error = 0; + void *p; + + m->version = 0; + index = 0; + m->count = m->from = 0; + if (!offset) { + m->index = index; + return 0; + } + if (!m->buf) { + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + return -ENOMEM; + } + p = m->op->start(m, &index); + while (p) { + error = PTR_ERR(p); + if (IS_ERR(p)) + break; + error = m->op->show(m, p); + if (error < 0) + break; + if (unlikely(error)) { + error = 0; + m->count = 0; + } + if (m->count == m->size) + goto Eoverflow; + if (pos + m->count > offset) { + m->from = offset - pos; + m->count -= m->from; + m->index = index; + break; + } + pos += m->count; + m->count = 0; + if (pos == offset) { + index++; + m->index = index; + break; + } + p = m->op->next(m, p, &index); + } + m->op->stop(m, p); + m->index = index; + return error; + +Eoverflow: + m->op->stop(m, p); + kfree(m->buf); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + return !m->buf ? -ENOMEM : -EAGAIN; +} + +/** + * seq_read - ->read() method for sequential files. + * @file: the file to read from + * @buf: the buffer to read to + * @size: the maximum number of bytes to read + * @ppos: the current position in the file + * + * Ready-made ->f_op->read() + */ +ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + size_t copied = 0; + loff_t pos; + size_t n; + void *p; + int err = 0; + + mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + + /* + * seq_file->op->..m_start/m_stop/m_next may do special actions + * or optimisations based on the file->f_version, so we want to + * pass the file->f_version to those methods. + * + * seq_file->version is just copy of f_version, and seq_file + * methods can treat it simply as file version. + * It is copied in first and copied out after all operations. + * It is convenient to have it as part of structure to avoid the + * need of passing another argument to all the seq_file methods. 
+ */ + m->version = file->f_version; + /* grab buffer if we didn't have one */ + if (!m->buf) { + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + goto Enomem; + } + /* if not empty - flush it first */ + if (m->count) { + n = min(m->count, size); + err = copy_to_user(buf, m->buf + m->from, n); + if (err) + goto Efault; + m->count -= n; + m->from += n; + size -= n; + buf += n; + copied += n; + if (!m->count) + m->index++; + if (!size) + goto Done; + } + /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); + while (1) { + err = PTR_ERR(p); + if (!p || IS_ERR(p)) + break; + err = m->op->show(m, p); + if (err < 0) + break; + if (unlikely(err)) + m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } + if (m->count < m->size) + goto Fill; + m->op->stop(m, p); + kfree(m->buf); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + if (!m->buf) + goto Enomem; + m->count = 0; + m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); + } + m->op->stop(m, p); + m->count = 0; + goto Done; +Fill: + /* they want more? let's try to get some more */ + while (m->count < size) { + size_t offs = m->count; + loff_t next = pos; + p = m->op->next(m, p, &next); + if (!p || IS_ERR(p)) { + err = PTR_ERR(p); + break; + } + err = m->op->show(m, p); + if (m->count == m->size || err) { + m->count = offs; + if (likely(err <= 0)) + break; + } + pos = next; + } + m->op->stop(m, p); + n = min(m->count, size); + err = copy_to_user(buf, m->buf, n); + if (err) + goto Efault; + copied += n; + m->count -= n; + if (m->count) + m->from = n; + else + pos++; + m->index = pos; +Done: + if (!copied) + copied = err; + else { + *ppos += copied; + m->read_pos += copied; + } + file->f_version = m->version; + mutex_unlock(&m->lock); + return copied; +Enomem: + err = -ENOMEM; + goto Done; +Efault: + err = -EFAULT; + goto Done; +} +EXPORT_SYMBOL(seq_read); + +/** + * seq_lseek - ->llseek() method for sequential files. + * @file: the file in question + * @offset: new position + * @origin: 0 for absolute, 1 for relative position + * + * Ready-made ->f_op->llseek() + */ +loff_t seq_lseek(struct file *file, loff_t offset, int origin) +{ + struct seq_file *m = file->private_data; + loff_t retval = -EINVAL; + + mutex_lock(&m->lock); + m->version = file->f_version; + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset < 0) + break; + retval = offset; + if (offset != m->read_pos) { + while ((retval=traverse(m, offset)) == -EAGAIN) + ; + if (retval) { + /* with extreme prejudice... */ + file->f_pos = 0; + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + } else { + m->read_pos = offset; + retval = file->f_pos = offset; + } + } + } + file->f_version = m->version; + mutex_unlock(&m->lock); + return retval; +} +EXPORT_SYMBOL(seq_lseek); + +/** + * seq_release - free the structures associated with sequential file. + * @file: file in question + * @inode: file->f_path.dentry->d_inode + * + * Frees the structures associated with sequential file; can be used + * as ->f_op->release() if you don't have private data to destroy. 
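+ *
+ * As an illustration only (not taken from this file): a typical read-only
+ * seq_file user combines seq_open(), seq_read(), seq_lseek() and
+ * seq_release() roughly as sketched below; my_ops, my_start, my_next,
+ * my_stop, my_show, my_open and my_fops are hypothetical names.
+ *
+ *	static const struct seq_operations my_ops = {
+ *		.start = my_start,
+ *		.next  = my_next,
+ *		.stop  = my_stop,
+ *		.show  = my_show,
+ *	};
+ *
+ *	static int my_open(struct inode *inode, struct file *file)
+ *	{
+ *		return seq_open(file, &my_ops);
+ *	}
+ *
+ *	static const struct file_operations my_fops = {
+ *		.open    = my_open,
+ *		.read    = seq_read,
+ *		.llseek  = seq_lseek,
+ *		.release = seq_release,
+ *	};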
+ */ +int seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + kfree(m->buf); + kfree(m); + return 0; +} +EXPORT_SYMBOL(seq_release); + +/** + * seq_escape - print string into buffer, escaping some characters + * @m: target buffer + * @s: string + * @esc: set of characters that need escaping + * + * Puts string into buffer, replacing each occurrence of character from + * @esc with usual octal escape. Returns 0 in case of success, -1 - in + * case of overflow. + */ +int seq_escape(struct seq_file *m, const char *s, const char *esc) +{ + char *end = m->buf + m->size; + char *p; + char c; + + for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { + if (!strchr(esc, c)) { + *p++ = c; + continue; + } + if (p + 3 < end) { + *p++ = '\\'; + *p++ = '0' + ((c & 0300) >> 6); + *p++ = '0' + ((c & 070) >> 3); + *p++ = '0' + (c & 07); + continue; + } + m->count = m->size; + return -1; + } + m->count = p - m->buf; + return 0; +} +EXPORT_SYMBOL(seq_escape); + +int seq_printf(struct seq_file *m, const char *f, ...) +{ + va_list args; + int len; + + if (m->count < m->size) { + va_start(args, f); + len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); + va_end(args); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_printf); + +/** + * mangle_path - mangle and copy path to buffer beginning + * @s: buffer start + * @p: beginning of path in above buffer + * @esc: set of characters that need escaping + * + * Copy the path from @p to @s, replacing each occurrence of character from + * @esc with usual octal escape. + * Returns pointer past last written character in @s, or NULL in case of + * failure. + */ +char *mangle_path(char *s, char *p, char *esc) +{ + while (s <= p) { + char c = *p++; + if (!c) { + return s; + } else if (!strchr(esc, c)) { + *s++ = c; + } else if (s + 4 > p) { + break; + } else { + *s++ = '\\'; + *s++ = '0' + ((c & 0300) >> 6); + *s++ = '0' + ((c & 070) >> 3); + *s++ = '0' + (c & 07); + } + } + return NULL; +} +EXPORT_SYMBOL(mangle_path); + +/** + * seq_path - seq_file interface to print a pathname + * @m: the seq_file handle + * @path: the struct path to print + * @esc: set of characters to escape in the output + * + * return the absolute path of 'path', as represented by the + * dentry / mnt pair in the path parameter. + */ +int seq_path(struct seq_file *m, struct path *path, char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = d_path(path, buf, size); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + } + } + seq_commit(m, res); + + return res; +} +EXPORT_SYMBOL(seq_path); + +/* + * Same as seq_path, but relative to supplied root. + */ +int seq_path_root(struct seq_file *m, struct path *path, struct path *root, + char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -ENAMETOOLONG; + + if (size) { + char *p; + + p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; + res = PTR_ERR(p); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + else + res = -ENAMETOOLONG; + } + } + seq_commit(m, res); + + return res < 0 && res != -ENAMETOOLONG ? res : 0; +} + +/* + * returns the path of the 'dentry' from the root of its filesystem. 
+ */ +int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = dentry_path(dentry, buf, size); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + } + } + seq_commit(m, res); + + return res; +} + +int seq_bitmap(struct seq_file *m, const unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap); + +int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnlistprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap_list); + +static void *single_start(struct seq_file *p, loff_t *pos) +{ + return NULL + (*pos == 0); +} + +static void *single_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void single_stop(struct seq_file *p, void *v) +{ +} + +int single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data) +{ + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + int res = -ENOMEM; + + if (op) { + op->start = single_start; + op->next = single_next; + op->stop = single_stop; + op->show = show; + res = seq_open(file, op); + if (!res) + ((struct seq_file *)file->private_data)->private = data; + else + kfree(op); + } + return res; +} +EXPORT_SYMBOL(single_open); + +int single_release(struct inode *inode, struct file *file) +{ + const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; + int res = seq_release(inode, file); + kfree(op); + return res; +} +EXPORT_SYMBOL(single_release); + +int seq_release_private(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + kfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} +EXPORT_SYMBOL(seq_release_private); + +void *__seq_open_private(struct file *f, const struct seq_operations *ops, + int psize) +{ + int rc; + void *private; + struct seq_file *seq; + + private = kzalloc(psize, GFP_KERNEL); + if (private == NULL) + goto out; + + rc = seq_open(f, ops); + if (rc < 0) + goto out_free; + + seq = f->private_data; + seq->private = private; + return private; + +out_free: + kfree(private); +out: + return NULL; +} +EXPORT_SYMBOL(__seq_open_private); + +int seq_open_private(struct file *filp, const struct seq_operations *ops, + int psize) +{ + return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(seq_open_private); + +int seq_putc(struct seq_file *m, char c) +{ + if (m->count < m->size) { + m->buf[m->count++] = c; + return 0; + } + return -1; +} +EXPORT_SYMBOL(seq_putc); + +int seq_puts(struct seq_file *m, const char *s) +{ + int len = strlen(s); + if (m->count + len < m->size) { + memcpy(m->buf + m->count, s, len); + m->count += len; + return 0; + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_puts); + +/** + * seq_write - write arbitrary data to buffer + * @seq: seq_file identifying the buffer to which data should be written + * @data: data address + * @len: number of bytes + * + * Return 0 on success, non-zero otherwise. 
+ */ +int seq_write(struct seq_file *seq, const void *data, size_t len) +{ + if (seq->count + len < seq->size) { + memcpy(seq->buf + seq->count, data, len); + seq->count += len; + return 0; + } + seq->count = seq->size; + return -1; +} +EXPORT_SYMBOL(seq_write); + +struct list_head *seq_list_start(struct list_head *head, loff_t pos) +{ + struct list_head *lh; + + list_for_each(lh, head) + if (pos-- == 0) + return lh; + + return NULL; +} +EXPORT_SYMBOL(seq_list_start); + +struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) +{ + if (!pos) + return head; + + return seq_list_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_list_start_head); + +struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) +{ + struct list_head *lh; + + lh = ((struct list_head *)v)->next; + ++*ppos; + return lh == head ? NULL : lh; +} +EXPORT_SYMBOL(seq_list_next); + +/** + * seq_hlist_start - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) +{ + struct hlist_node *node; + + hlist_for_each(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start); + +/** + * seq_hlist_start_head - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + */ +struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) +{ + if (!pos) + return SEQ_START_TOKEN; + + return seq_hlist_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_hlist_start_head); + +/** + * seq_hlist_next - move to the next position of the hlist + * @v: the current iterator + * @head: the head of the hlist + * @ppos: the current position + * + * Called at seq_file->op->next(). + */ +struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, + loff_t *ppos) +{ + struct hlist_node *node = v; + + ++*ppos; + if (v == SEQ_START_TOKEN) + return head->first; + else + return node->next; +} +EXPORT_SYMBOL(seq_hlist_next); + +/** + * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, + loff_t pos) +{ + struct hlist_node *node; + + __hlist_for_each_rcu(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_rcu); + +/** + * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). 
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					     loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v: the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
diff --git a/fs/signalfd.c b/fs/signalfd.c
new file mode 100644
index 00000000..7ae2a574
--- /dev/null
+++ b/fs/signalfd.c
@@ -0,0 +1,295 @@
+/*
+ * fs/signalfd.c
+ *
+ * Copyright (C) 2003 Linus Torvalds
+ *
+ * Mon Mar 5, 2007: Davide Libenzi
+ * Changed ->read() to return a siginfo structure instead of signal number.
+ * Fixed locking in ->poll().
+ * Added sighand-detach notification.
+ * Added fd re-use in sys_signalfd() syscall.
+ * Now using anonymous inode source.
+ * Thanks to Oleg Nesterov for useful code review and suggestions.
+ * More comments and suggestions from Arnd Bergmann.
+ * Sat May 19, 2007: Davi E. M. Arnaut
+ * Retrieve multiple signals with one read() call
+ * Sun Jul 15, 2007: Davide Libenzi
+ * Attach to the sighand only during read() and poll().
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+void signalfd_cleanup(struct sighand_struct *sighand)
+{
+	wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+	/*
+	 * The lockless check can race with remove_wait_queue() in progress,
+	 * but in this case its caller should run under rcu_read_lock() and
+	 * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+	 */
+	if (likely(!waitqueue_active(wqh)))
+		return;
+
+	/* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+	wake_up_poll(wqh, POLLHUP | POLLFREE);
+}
+
+struct signalfd_ctx {
+	sigset_t sigmask;
+};
+
+static int signalfd_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static unsigned int signalfd_poll(struct file *file, poll_table *wait)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+
+	poll_wait(file, &current->sighand->signalfd_wqh, wait);
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (next_signal(&current->pending, &ctx->sigmask) ||
+	    next_signal(&current->signal->shared_pending,
+			&ctx->sigmask))
+		events |= POLLIN;
+	spin_unlock_irq(&current->sighand->siglock);
+
+	return events;
+}
+
+/*
+ * Copied from copy_siginfo_to_user() in kernel/signal.c
+ */
+static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
+			     siginfo_t const *kinfo)
+{
+	long err;
+
+	BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
+
+	/*
+	 * Unused members should be zero ...
+	 */
+	err = __clear_user(uinfo, sizeof(*uinfo));
+
+	/*
+	 * If you change siginfo_t structure, please be sure
+	 * this code is fixed accordingly.
+ */
+	err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
+	err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
+	err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code);
+	switch (kinfo->si_code & __SI_MASK) {
+	case __SI_KILL:
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		break;
+	case __SI_TIMER:
+		err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
+		err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	case __SI_POLL:
+		err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
+		err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
+		break;
+	case __SI_FAULT:
+		err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
+#ifdef __ARCH_SI_TRAPNO
+		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+		/*
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
+		break;
+	case __SI_CHLD:
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
+		err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
+		err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
+		break;
+	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	default:
+		/*
+		 * This case catches also the signals queued by sigqueue().
+		 */
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	}
+
+	return err ? -EFAULT: sizeof(*uinfo);
+}
+
+static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
+				int nonblock)
+{
+	ssize_t ret;
+	DECLARE_WAITQUEUE(wait, current);
+
+	spin_lock_irq(&current->sighand->siglock);
+	ret = dequeue_signal(current, &ctx->sigmask, info);
+	switch (ret) {
+	case 0:
+		if (!nonblock)
+			break;
+		ret = -EAGAIN;
+	default:
+		spin_unlock_irq(&current->sighand->siglock);
+		return ret;
+	}
+
+	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		ret = dequeue_signal(current, &ctx->sigmask, info);
+		if (ret != 0)
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock_irq(&current->sighand->siglock);
+		schedule();
+		spin_lock_irq(&current->sighand->siglock);
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	__set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
+/*
+ * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct signalfd_siginfo".
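+ *
+ * Purely as an illustration (a hedged sketch, not part of this file): a
+ * userspace consumer normally blocks the signals it cares about with
+ * sigprocmask() and then reads fixed-size records from the descriptor
+ * returned by signalfd(); error handling is omitted here.
+ *
+ *	sigset_t mask;
+ *	struct signalfd_siginfo ssi;
+ *	int sfd;
+ *
+ *	sigemptyset(&mask);
+ *	sigaddset(&mask, SIGINT);
+ *	sigprocmask(SIG_BLOCK, &mask, NULL);
+ *	sfd = signalfd(-1, &mask, 0);
+ *	if (sfd >= 0)
+ *		read(sfd, &ssi, sizeof(ssi));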
+ */
+static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	struct signalfd_siginfo __user *siginfo;
+	int nonblock = file->f_flags & O_NONBLOCK;
+	ssize_t ret, total = 0;
+	siginfo_t info;
+
+	count /= sizeof(struct signalfd_siginfo);
+	if (!count)
+		return -EINVAL;
+
+	siginfo = (struct signalfd_siginfo __user *) buf;
+	do {
+		ret = signalfd_dequeue(ctx, &info, nonblock);
+		if (unlikely(ret <= 0))
+			break;
+		ret = signalfd_copyinfo(siginfo, &info);
+		if (ret < 0)
+			break;
+		siginfo++;
+		total += ret;
+		nonblock = 1;
+	} while (--count);
+
+	return total ? total: ret;
+}
+
+static const struct file_operations signalfd_fops = {
+	.release	= signalfd_release,
+	.poll		= signalfd_poll,
+	.read		= signalfd_read,
+	.llseek		= noop_llseek,
+};
+
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask, int, flags)
+{
+	sigset_t sigmask;
+	struct signalfd_ctx *ctx;
+
+	/* Check the SFD_* constants for consistency. */
+	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+		return -EINVAL;
+
+	if (sizemask != sizeof(sigset_t) ||
+	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
+		return -EINVAL;
+	sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(&sigmask);
+
+	if (ufd == -1) {
+		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		ctx->sigmask = sigmask;
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * anon_inode_getfd() will install the fd.
+		 */
+		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
+				       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
+		if (ufd < 0)
+			kfree(ctx);
+	} else {
+		struct file *file = fget(ufd);
+		if (!file)
+			return -EBADF;
+		ctx = file->private_data;
+		if (file->f_op != &signalfd_fops) {
+			fput(file);
+			return -EINVAL;
+		}
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = sigmask;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
+		fput(file);
+	}
+
+	return ufd;
+}
+
+SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask)
+{
+	return sys_signalfd4(ufd, user_mask, sizemask, 0);
+}
diff --git a/fs/splice.c b/fs/splice.c
new file mode 100644
index 00000000..9d890085
--- /dev/null
+++ b/fs/splice.c
@@ -0,0 +1,2047 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files, network, direct splicing, etc and
+ * fixing lots of bugs.
+ *
+ * Copyright (C) 2005-2006 Jens Axboe
+ * Copyright (C) 2005-2006 Linus Torvalds
+ * Copyright (C) 2006 Ingo Molnar
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Attempt to steal a page from a pipe buffer. This should perhaps go into
+ * a vm helper function, it's already simplified quite a bit by the
+ * addition of remove_mapping().
If success is returned, the caller may + * attempt to reuse this page for another destination. + */ +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + struct address_space *mapping; + + lock_page(page); + + mapping = page_mapping(page); + if (mapping) { + WARN_ON(!PageUptodate(page)); + + /* + * At least for ext2 with nobh option, we need to wait on + * writeback completing on this page, since we'll remove it + * from the pagecache. Otherwise truncate wont wait on the + * page, allowing the disk blocks to be reused by someone else + * before we actually wrote our data to them. fs corruption + * ensues. + */ + wait_on_page_writeback(page); + + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; + + /* + * If we succeeded in removing the mapping, set LRU flag + * and return good. + */ + if (remove_mapping(mapping, page)) { + buf->flags |= PIPE_BUF_FLAG_LRU; + return 0; + } + } + + /* + * Raced with truncate or failed to remove page from current + * address space, unlock and return failure. + */ +out_unlock: + unlock_page(page); + return 1; +} + +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); + buf->flags &= ~PIPE_BUF_FLAG_LRU; +} + +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + int err; + + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * Uh oh, read-error from disk. + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } + + /* + * Page is ok afterall, we are done. + */ + unlock_page(page); + } + + return 0; +error: + unlock_page(page); + return err; +} + +static const struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = page_cache_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_LRU; + return generic_pipe_buf_steal(pipe, buf); +} + +static const struct pipe_buf_operations user_page_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = user_page_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + +/** + * splice_to_pipe - fill passed data into a pipe + * @pipe: pipe to fill + * @spd: data to fill + * + * Description: + * @spd contains a map of pages and len/offset tuples, along with + * the struct pipe_buf_operations associated with these pages. This + * function will link that data to the pipe. 
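+ *
+ * As a rough sketch of the calling convention (mirroring what
+ * __generic_file_splice_read() below sets up; "flags" stands in for the
+ * caller's splice flags), a caller prepares the descriptor like this:
+ *
+ *	struct page *pages[PIPE_DEF_BUFFERS];
+ *	struct partial_page partial[PIPE_DEF_BUFFERS];
+ *	struct splice_pipe_desc spd = {
+ *		.pages		= pages,
+ *		.partial	= partial,
+ *		.flags		= flags,
+ *		.ops		= &page_cache_pipe_buf_ops,
+ *		.spd_release	= spd_release_page,
+ *	};
+ *
+ * then fills spd.pages[], spd.partial[] and spd.nr_pages and calls
+ * splice_to_pipe(pipe, &spd).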
+ * + */ +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + unsigned int spd_pages = spd->nr_pages; + int ret, do_wakeup, page_nr; + + ret = 0; + do_wakeup = 0; + page_nr = 0; + + pipe_lock(pipe); + + for (;;) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + + if (!--spd->nr_pages) + break; + if (pipe->nrbufs < pipe->buffers) + continue; + + break; + } + + if (spd->flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_readers(pipe); + + while (page_nr < spd_pages) + spd->spd_release(spd, page_nr++); + + return ret; +} + +static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +{ + page_cache_release(spd->pages[i]); +} + +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + +static int +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, pipe->buffers); + + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); + index += spd.nr_pages; + + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * readahead/allocate the rest and fill in the holes. 
+ */ + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); + + error = 0; + while (spd.nr_pages < nr_pages) { + /* + * Page could be there, find_get_pages_contig() breaks on + * the first hole. + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * page didn't exist, allocate one. + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL); + if (unlikely(error)) { + page_cache_release(page); + if (error == -EEXIST) + continue; + break; + } + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); + } + + spd.pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. + */ + if (!page->mapping) { + unlock_page(page); + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); + if (unlikely(error)) { + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ + if (error == AOP_TRUNCATED_PAGE) + error = 0; + + break; + } + } +fill_it: + /* + * i_size must be checked after PageUptodate. + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; + + /* + * max good bytes in this page + */ + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + /* + * force quit after adding this page + */ + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + /* + * Release any pages at the end, if we quit early. 'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. 
+ */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + return error; +} + +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @ppos: position in @in + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given file and fill them into a pipe. Can be + * used as long as the address_space operations for the source implements + * a readpage() hook. + * + */ +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + loff_t isize, left; + int ret; + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) { + *ppos += ret; + file_accessed(in); + } + + return ret; +} +EXPORT_SYMBOL(generic_file_splice_read); + +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +static ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_write(file, (const char __user *)buf, count, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { + struct page *page; + + page = alloc_page(GFP_USER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) page_address(page); + vec[i].iov_len = this_len; + spd.pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, 
*ppos); + if (res < 0) { + error = res; + goto err; + } + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + this_len = min_t(size_t, vec[i].iov_len, res); + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; + if (!this_len) { + __free_page(spd.pages[i]); + spd.pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) + __free_page(spd.pages[i]); + + res = error; + goto shrink_ret; +} +EXPORT_SYMBOL(default_file_splice_read); + +/* + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). Return the number of bytes sent. + */ +static int pipe_to_sendpage(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct splice_desc *sd) +{ + struct file *file = sd->u.file; + loff_t pos = sd->pos; + int more; + + if (!likely(file->f_op && file->f_op->sendpage)) + return -EINVAL; + + more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; + if (sd->len < sd->total_len) + more |= MSG_SENDPAGE_NOTLAST; + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); +} + +/* + * This is a little more tricky than the file -> pipe splicing. There are + * basically three cases: + * + * - Destination page already exists in the address space and there + * are users of it. For that case we have no other option that + * copying the data. Tough luck. + * - Destination page already exists in the address space, but there + * are no users of it. Make sure it's uptodate, then drop it. Fall + * through to last case. + * - Destination page does not exist, we can add the pipe page to + * the page cache and avoid the copy. + * + * If asked to move pages to the output file (SPLICE_F_MOVE is set in + * sd->flags), we attempt to migrate pages from the pipe to the output + * file address space page cache. This is possible if no one else has + * the pipe page referenced outside of the pipe and page cache. If + * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create + * a new page in the output file page cache and fill/dirty that. + */ +int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + struct address_space *mapping = file->f_mapping; + unsigned int offset, this_len; + struct page *page; + void *fsdata; + int ret; + + offset = sd->pos & ~PAGE_CACHE_MASK; + + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + + ret = pagecache_write_begin(file, mapping, sd->pos, this_len, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (unlikely(ret)) + goto out; + + if (buf->page != page) { + /* + * Careful, ->map() uses KM_USER0! 
+ */ + char *src = buf->ops->map(pipe, buf, 1); + char *dst = kmap_atomic(page, KM_USER1); + + memcpy(dst + offset, src + buf->offset, this_len); + flush_dcache_page(page); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(pipe, buf, src); + } + ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, + page, fsdata); +out: + return ret; +} +EXPORT_SYMBOL(pipe_to_file); + +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} + +/** + * splice_from_pipe_feed - feed available data from a pipe to a file + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. + * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. + */ +int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; + + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + + buf->offset += ret; + buf->len -= ret; + + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->inode) + sd->need_wakeup = true; + } + + if (!sd->total_len) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_feed); + +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { + if (!pipe->writers) + return 0; + + if (!pipe->waiting_writers && sd->num_spliced) + return 0; + + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; + + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; + } + + pipe_wait(pipe); + } + + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_next); + +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. 
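+ *
+ * The canonical shape of that loop (this is what __splice_from_pipe()
+ * below does) is:
+ *
+ *	splice_from_pipe_begin(sd);
+ *	do {
+ *		ret = splice_from_pipe_next(pipe, sd);
+ *		if (ret > 0)
+ *			ret = splice_from_pipe_feed(pipe, sd, actor);
+ *	} while (ret > 0);
+ *	splice_from_pipe_end(pipe, sd);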
+ */ +void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} +EXPORT_SYMBOL(splice_from_pipe_begin); + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} +EXPORT_SYMBOL(splice_from_pipe_end); + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; +} +EXPORT_SYMBOL(__splice_from_pipe); + +/** + * splice_from_pipe - splice data from a pipe to a file + * @pipe: pipe to splice from + * @out: file to splice to + * @ppos: position in @out + * @len: how many bytes to splice + * @flags: splice modifier flags + * @actor: handler that splices the data + * + * Description: + * See __splice_from_pipe. This function locks the pipe inode, + * otherwise it's identical to __splice_from_pipe(). + * + */ +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) +{ + ssize_t ret; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, actor); + pipe_unlock(pipe); + + return ret; +} + +/** + * generic_file_splice_write - splice data from a pipe to a file + * @pipe: pipe info + * @out: file to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. 
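+ *
+ * When a filesystem uses this as its ->splice_write() hook, it is what
+ * ends up servicing the pipe-to-file half of a userspace splice(2) call;
+ * a hedged sketch (src_fd and dst_fd are hypothetical descriptors, error
+ * handling omitted):
+ *
+ *	int pfd[2];
+ *	ssize_t n;
+ *
+ *	pipe(pfd);
+ *	n = splice(src_fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE);
+ *	if (n > 0)
+ *		n = splice(pfd[0], NULL, dst_fd, NULL, n, SPLICE_F_MOVE);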
+ * + */ +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + ssize_t ret; + + pipe_lock(pipe); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = file_remove_suid(out); + if (!ret) { + file_update_time(out); + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + int err; + + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } + + return ret; +} + +EXPORT_SYMBOL(generic_file_splice_write); + +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int ret; + void *data; + + data = buf->ops->map(pipe, buf, 0); + ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + buf->ops->unmap(pipe, buf, data); + + return ret; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; + + return ret; +} + +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @pipe: pipe to splice from + * @out: socket to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); +} + +EXPORT_SYMBOL(generic_splice_sendpage); + +/* + * Attempt to initiate a splice from pipe to file. + */ +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); + int ret; + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, ppos, len); + if (unlikely(ret < 0)) + return ret; + + if (out->f_op && out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); +} + +/* + * Attempt to initiate a splice from a file to a pipe. 
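+ *
+ * Reached, for instance, when userspace splices from a regular file into
+ * a pipe (the output offset must be NULL for a pipe):
+ *
+ *	splice(file_fd, &off, pipe_fd, NULL, len, 0);
+ *
+ * The real work is delegated to the file's ->splice_read() method, with
+ * default_file_splice_read() as the fallback.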
+ */ +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + int ret; + + if (unlikely(!(in->f_mode & FMODE_READ))) + return -EBADF; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + if (in->f_op && in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); +} + +/** + * splice_direct_to_actor - splices data directly between two non-pipes + * @in: file to splice from + * @sd: actor information on where to splice to + * @actor: handles the data splicing + * + * Description: + * This is a special case helper to splice directly between two + * points, without requiring an explicit pipe. Internally an allocated + * pipe is cached in the process, and reused during the lifetime of + * that process. + * + */ +ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + splice_direct_actor *actor) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + size_t len; + int i, flags; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_path.dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the splice_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + len = sd->total_len; + flags = sd->flags; + + /* + * Don't block on output, we have to drain the direct pipe. + */ + sd->flags &= ~SPLICE_F_NONBLOCK; + + while (len) { + size_t read_len; + loff_t pos = sd->pos, prev_pos = pos; + + ret = do_splice_to(in, &pos, pipe, len, flags); + if (unlikely(ret <= 0)) + goto out_release; + + read_len = ret; + sd->total_len = read_len; + + /* + * NOTE: nonblocking mode only applies to the input. 
We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = actor(pipe, sd); + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; + goto out_release; + } + + bytes += ret; + len -= ret; + sd->pos = pos; + + if (ret < read_len) { + sd->pos = prev_pos + ret; + goto out_release; + } + } + +done: + pipe->nrbufs = pipe->curbuf = 0; + file_accessed(in); + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < pipe->buffers; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + + if (!bytes) + bytes = ret; + + goto done; +} +EXPORT_SYMBOL(splice_direct_to_actor); + +static int direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + sd->flags); +} + +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + */ +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + long ret; + + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + if (ret > 0) + *ppos = sd.pos; + + return ret; +} + +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); + +/* + * Determine where to splice to/from. + */ +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; + loff_t offset, *off; + long ret; + + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... 
*/ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (!(out->f_mode & FMODE_PWRITE)) + return -EINVAL; + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &out->f_pos; + + ret = do_splice_from(ipipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + if (opipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (!(in->f_mode & FMODE_PREAD)) + return -EINVAL; + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &in->f_pos; + + ret = do_splice_to(in, off, opipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + return -EINVAL; +} + +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) +{ + int buffers = 0, error = 0; + + while (nr_vecs) { + unsigned long off, npages; + struct iovec entry; + void __user *base; + size_t len; + int i; + + error = -EFAULT; + if (copy_from_user(&entry, iov, sizeof(entry))) + break; + + base = entry.iov_base; + len = entry.iov_len; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + error = 0; + if (unlikely(!len)) + break; + error = -EFAULT; + if (!access_ok(VERIFY_READ, base, len)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; + + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. 
+ */ + if (error < npages || buffers == pipe_buffers) + break; + + nr_vecs--; + iov++; + } + + if (buffers) + return buffers; + + return error; +} + +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + char *src; + int ret; + + /* + * See if we can use the atomic maps, by prefaulting in the + * pages and doing an atomic copy + */ + if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { + src = buf->ops->map(pipe, buf, 1); + ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, + sd->len); + buf->ops->unmap(pipe, buf, src); + if (!ret) { + ret = sd->len; + goto out; + } + } + + /* + * No dice, use slow non-atomic map and copy + */ + src = buf->ops->map(pipe, buf, 0); + + ret = sd->len; + if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) + ret = -EFAULT; + + buf->ops->unmap(pipe, buf, src); +out: + if (ret > 0) + sd->u.userptr += ret; + return ret; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + ssize_t size; + int error; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + pipe_lock(pipe); + + error = ret = 0; + while (nr_segs) { + void __user *base; + size_t len; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + if (unlikely(!base)) { + error = -EFAULT; + break; + } + + if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { + error = -EFAULT; + break; + } + + sd.len = 0; + sd.total_len = len; + sd.flags = flags; + sd.u.userptr = base; + sd.pos = 0; + + size = __splice_from_pipe(pipe, &sd, pipe_to_user); + if (size < 0) { + if (!ret) + ret = size; + + break; + } + + ret += size; + + if (size < len) + break; + + nr_segs--; + iov++; + } + + pipe_unlock(pipe); + + if (!ret) + ret = error; + + return ret; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + */ +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + .spd_release = spd_release_page, + }; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); + if (spd.nr_pages <= 0) + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + return ret; +} + +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. 
Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) +{ + struct file *file; + long error; + int fput; + + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = vmsplice_to_pipe(file, iov, nr_segs, flags); + else if (file->f_mode & FMODE_READ) + error = vmsplice_to_user(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + +SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) +{ + long error; + struct file *in, *out; + int fput_in, fput_out; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fd_in, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + out = fget_light(fd_out, &fput_out); + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_splice(in, off_in, + out, off_out, + len, flags); + fput_light(out, fput_out); + } + } + + fput_light(in, fput_in); + } + + return error; +} + +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs < pipe->buffers) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (pipe->nrbufs >= pipe->buffers) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Splice contents of ipipe to opipe. 
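+ *
+ * Reached when both file descriptors passed to splice(2) refer to pipes,
+ * in which case the offset arguments must be NULL, e.g.:
+ *
+ *	splice(pipe_a[0], NULL, pipe_b[1], NULL, len, 0);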
+ */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, i = 0, nbuf; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) + break; + + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. 
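+ * (A gifted buffer may have its underlying page stolen by the consumer;
+ * now that the same page backs two pipe buffers, it must not be stolen
+ * through both of them.)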
+ */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); + + /* + * return EAGAIN if we have the potential of some data in the + * future, otherwise just return 0 + */ + if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); + int ret = -EINVAL; + + /* + * Duplicate the contents of ipipe to opipe without actually + * copying the data. + */ + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = ipipe_prep(ipipe, flags); + if (!ret) { + ret = opipe_prep(opipe, flags); + if (!ret) + ret = link_pipe(ipipe, opipe, len, flags); + } + } + + return ret; +} + +SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig new file mode 100644 index 00000000..7797218d --- /dev/null +++ b/fs/squashfs/Kconfig @@ -0,0 +1,89 @@ +config SQUASHFS + tristate "SquashFS 4.0 - Squashed file system support" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for SquashFS 4.0 (a Compressed + Read-Only File System). Squashfs is a highly compressed read-only + filesystem for Linux. It uses zlib, lzo or xz compression to + compress both files, inodes and directories. Inodes in the system + are very small and all blocks are packed to minimise data overhead. + Block sizes greater than 4K are supported up to a maximum of 1 Mbytes + (default block size 128K). SquashFS 4.0 supports 64 bit filesystems + and files (larger than 4GB), full uid/gid information, hard links and + timestamps. + + Squashfs is intended for general read-only filesystem use, for + archival use (i.e. in cases where a .tar.gz file may be used), and in + embedded systems where low overhead is needed. Further information + and tools are available from http://squashfs.sourceforge.net. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read . The module + will be called squashfs. Note that the root file system (the one + containing the directory /) cannot be compiled as a module. + + If unsure, say N. + +config SQUASHFS_XATTR + bool "Squashfs XATTR support" + depends on SQUASHFS + help + Saying Y here includes support for extended attributes (xattrs). 
+ Xattrs are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page). + + If unsure, say N. + +config SQUASHFS_LZO + bool "Include support for LZO compressed file systems" + depends on SQUASHFS + select LZO_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZO compression. LZO compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. + + LZO is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_XZ + bool "Include support for XZ compressed file systems" + depends on SQUASHFS + select XZ_DEC + help + Saying Y here includes support for reading Squashfs file systems + compressed with XZ compression. XZ gives better compression than + the default zlib compression, at the expense of greater CPU and + memory overhead. + + XZ is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + bool "Additional option for memory-constrained systems" + depends on SQUASHFS + help + Saying Y here allows you to specify cache size. + + If unsure, say N. + +config SQUASHFS_FRAGMENT_CACHE_SIZE + int "Number of fragments cached" if SQUASHFS_EMBEDDED + depends on SQUASHFS + default "3" + help + By default SquashFS caches the last 3 fragments read from + the filesystem. Increasing this amount may mean SquashFS + has to re-read fragments less often from disk, at the expense + of extra system memory. Decreasing this amount will mean + SquashFS uses less memory at the expense of extra reads from disk. + + Note there must be at least one cached fragment. Anything + much more than three will probably not make much difference. diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile new file mode 100644 index 00000000..cecf2bea --- /dev/null +++ b/fs/squashfs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the linux squashfs routines. +# + +obj-$(CONFIG_SQUASHFS) += squashfs.o +squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o +squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o +squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o +squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c new file mode 100644 index 00000000..ed0eb2a9 --- /dev/null +++ b/fs/squashfs/block.c @@ -0,0 +1,210 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * block.c + */ + +/* + * This file implements the low-level routines to read and decompress + * datablocks and metadata blocks. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +/* + * Read the metadata block length, this is stored in the first two + * bytes of the metadata block. + */ +static struct buffer_head *get_block_length(struct super_block *sb, + u64 *cur_index, int *offset, int *length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head *bh; + + bh = sb_bread(sb, *cur_index); + if (bh == NULL) + return NULL; + + if (msblk->devblksize - *offset == 1) { + *length = (unsigned char) bh->b_data[*offset]; + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *length |= (unsigned char) bh->b_data[0] << 8; + *offset = 1; + } else { + *length = (unsigned char) bh->b_data[*offset] | + (unsigned char) bh->b_data[*offset + 1] << 8; + *offset += 2; + + if (*offset == msblk->devblksize) { + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *offset = 0; + } + } + + return bh; +} + + +/* + * Read and decompress a metadata block or datablock. Length is non-zero + * if a datablock is being read (the size is stored elsewhere in the + * filesystem), otherwise the length is obtained from the first two bytes of + * the metadata block. A bit in the length field indicates if the block + * is stored uncompressed in the filesystem (usually because compression + * generated a larger block - this does occasionally happen with zlib). + */ +int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, + int length, u64 *next_index, int srclength, int pages) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head **bh; + int offset = index & ((1 << msblk->devblksize_log2) - 1); + u64 cur_index = index >> msblk->devblksize_log2; + int bytes, compressed, b = 0, k = 0, page = 0, avail; + + bh = kcalloc(((srclength + msblk->devblksize - 1) + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); + if (bh == NULL) + return -ENOMEM; + + if (length) { + /* + * Datablock. + */ + bytes = -offset; + compressed = SQUASHFS_COMPRESSED_BLOCK(length); + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + if (next_index) + *next_index = index + length; + + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", + index, compressed ? "" : "un", length, srclength); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto read_failure; + + for (b = 0; bytes < length; b++, cur_index++) { + bh[b] = sb_getblk(sb, cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b, bh); + } else { + /* + * Metadata block. + */ + if ((index + 2) > msblk->bytes_used) + goto read_failure; + + bh[0] = get_block_length(sb, &cur_index, &offset, &length); + if (bh[0] == NULL) + goto read_failure; + b = 1; + + bytes = msblk->devblksize - offset; + compressed = SQUASHFS_COMPRESSED(length); + length = SQUASHFS_COMPRESSED_SIZE(length); + if (next_index) + *next_index = index + length + 2; + + TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + compressed ? 
"" : "un", length); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto block_release; + + for (; bytes < length; b++) { + bh[b] = sb_getblk(sb, ++cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b - 1, bh + 1); + } + + if (compressed) { + length = squashfs_decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); + if (length < 0) + goto read_failure; + } else { + /* + * Block is uncompressed. + */ + int i, in, pg_offset = 0; + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + + for (bytes = length; k < b; k++) { + in = min(bytes, msblk->devblksize - offset); + bytes -= in; + while (in) { + if (pg_offset == PAGE_CACHE_SIZE) { + page++; + pg_offset = 0; + } + avail = min_t(int, in, PAGE_CACHE_SIZE - + pg_offset); + memcpy(buffer[page] + pg_offset, + bh[k]->b_data + offset, avail); + in -= avail; + pg_offset += avail; + offset += avail; + } + offset = 0; + put_bh(bh[k]); + } + } + + kfree(bh); + return length; + +block_release: + for (; k < b; k++) + put_bh(bh[k]); + +read_failure: + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); + kfree(bh); + return -EIO; +} diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c new file mode 100644 index 00000000..f744be98 --- /dev/null +++ b/fs/squashfs/cache.c @@ -0,0 +1,428 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * cache.c + */ + +/* + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing + * recently accessed data Squashfs uses two small metadata and fragment caches. + * + * This file implements a generic cache implementation used for both caches, + * plus functions layered ontop of the generic cache implementation to + * access the metadata and fragment caches. + * + * To avoid out of memory and fragmentation issues with vmalloc the cache + * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. + * + * It should be noted that the cache is not used for file datablocks, these + * are decompressed and cached in the page-cache in the normal way. The + * cache is only used to temporarily cache fragment and metadata blocks + * which have been read as as a result of a metadata (i.e. inode or + * directory) or fragment access. Because metadata and fragments are packed + * together into blocks (to gain greater compression) the read of a particular + * piece of metadata or fragment will retrieve other metadata/fragments which + * have been packed with it, these because of locality-of-reference may be read + * in the near future. 
Temporarily caching them ensures they are available for + * near future access without requiring an additional read and decompress. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Look-up block in cache, and increment usage count. If not in cache, read + * and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, + struct squashfs_cache *cache, u64 block, int length) +{ + int i, n; + struct squashfs_cache_entry *entry; + + spin_lock(&cache->lock); + + while (1) { + for (i = 0; i < cache->entries; i++) + if (cache->entry[i].block == block) + break; + + if (i == cache->entries) { + /* + * Block not in cache, if all cache entries are used + * go to sleep waiting for one to become available. + */ + if (cache->unused == 0) { + cache->num_waiters++; + spin_unlock(&cache->lock); + wait_event(cache->wait_queue, cache->unused); + spin_lock(&cache->lock); + cache->num_waiters--; + continue; + } + + /* + * At least one unused cache entry. A simple + * round-robin strategy is used to choose the entry to + * be evicted from the cache. + */ + i = cache->next_blk; + for (n = 0; n < cache->entries; n++) { + if (cache->entry[i].refcount == 0) + break; + i = (i + 1) % cache->entries; + } + + cache->next_blk = (i + 1) % cache->entries; + entry = &cache->entry[i]; + + /* + * Initialise chosen cache entry, and fill it in from + * disk. + */ + cache->unused--; + entry->block = block; + entry->refcount = 1; + entry->pending = 1; + entry->num_waiters = 0; + entry->error = 0; + spin_unlock(&cache->lock); + + entry->length = squashfs_read_data(sb, entry->data, + block, length, &entry->next_index, + cache->block_size, cache->pages); + + spin_lock(&cache->lock); + + if (entry->length < 0) + entry->error = entry->length; + + entry->pending = 0; + + /* + * While filling this entry one or more other processes + * have looked it up in the cache, and have slept + * waiting for it to become available. + */ + if (entry->num_waiters) { + spin_unlock(&cache->lock); + wake_up_all(&entry->wait_queue); + } else + spin_unlock(&cache->lock); + + goto out; + } + + /* + * Block already in cache. Increment refcount so it doesn't + * get reused until we're finished with it, if it was + * previously unused there's one less cache entry available + * for reuse. + */ + entry = &cache->entry[i]; + if (entry->refcount == 0) + cache->unused--; + entry->refcount++; + + /* + * If the entry is currently being filled in by another process + * go to sleep waiting for it to become available. + */ + if (entry->pending) { + entry->num_waiters++; + spin_unlock(&cache->lock); + wait_event(entry->wait_queue, !entry->pending); + } else + spin_unlock(&cache->lock); + + goto out; + } + +out: + TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", + cache->name, i, entry->block, entry->refcount, entry->error); + + if (entry->error) + ERROR("Unable to read %s cache entry [%llx]\n", cache->name, + block); + return entry; +} + + +/* + * Release cache entry, once usage count is zero it can be reused. + */ +void squashfs_cache_put(struct squashfs_cache_entry *entry) +{ + struct squashfs_cache *cache = entry->cache; + + spin_lock(&cache->lock); + entry->refcount--; + if (entry->refcount == 0) { + cache->unused++; + /* + * If there's any processes waiting for a block to become + * available, wake one up. 
+ */ + if (cache->num_waiters) { + spin_unlock(&cache->lock); + wake_up(&cache->wait_queue); + return; + } + } + spin_unlock(&cache->lock); +} + +/* + * Delete cache reclaiming all kmalloced buffers. + */ +void squashfs_cache_delete(struct squashfs_cache *cache) +{ + int i, j; + + if (cache == NULL) + return; + + for (i = 0; i < cache->entries; i++) { + if (cache->entry[i].data) { + for (j = 0; j < cache->pages; j++) + kfree(cache->entry[i].data[j]); + kfree(cache->entry[i].data); + } + } + + kfree(cache->entry); + kfree(cache); +} + + +/* + * Initialise cache allocating the specified number of entries, each of + * size block_size. To avoid vmalloc fragmentation issues each entry + * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. + */ +struct squashfs_cache *squashfs_cache_init(char *name, int entries, + int block_size) +{ + int i, j; + struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + + if (cache == NULL) { + ERROR("Failed to allocate %s cache\n", name); + return NULL; + } + + cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); + if (cache->entry == NULL) { + ERROR("Failed to allocate %s cache\n", name); + goto cleanup; + } + + cache->next_blk = 0; + cache->unused = entries; + cache->entries = entries; + cache->block_size = block_size; + cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->pages = cache->pages ? cache->pages : 1; + cache->name = name; + cache->num_waiters = 0; + spin_lock_init(&cache->lock); + init_waitqueue_head(&cache->wait_queue); + + for (i = 0; i < entries; i++) { + struct squashfs_cache_entry *entry = &cache->entry[i]; + + init_waitqueue_head(&cache->entry[i].wait_queue); + entry->cache = cache; + entry->block = SQUASHFS_INVALID_BLK; + entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); + if (entry->data == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } + + for (j = 0; j < cache->pages; j++) { + entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (entry->data[j] == NULL) { + ERROR("Failed to allocate %s buffer\n", name); + goto cleanup; + } + } + } + + return cache; + +cleanup: + squashfs_cache_delete(cache); + return NULL; +} + + +/* + * Copy up to length bytes from cache entry to buffer starting at offset bytes + * into the cache entry. If there's not length bytes then copy the number of + * bytes available. In all cases return the number of bytes copied. + */ +int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, + int offset, int length) +{ + int remaining = length; + + if (length == 0) + return 0; + else if (buffer == NULL) + return min(length, entry->length - offset); + + while (offset < entry->length) { + void *buff = entry->data[offset / PAGE_CACHE_SIZE] + + (offset % PAGE_CACHE_SIZE); + int bytes = min_t(int, entry->length - offset, + PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); + + if (bytes >= remaining) { + memcpy(buffer, buff, remaining); + remaining = 0; + break; + } + + memcpy(buffer, buff, bytes); + buffer += bytes; + remaining -= bytes; + offset += bytes; + } + + return length - remaining; +} + + +/* + * Read length bytes from metadata position (block is the + * start of the compressed block on disk, and offset is the offset into + * the block once decompressed). Data is packed into consecutive blocks, + * and length bytes may require reading more than one block. 
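+ *
+ * Typical usage (illustrative): read a directory header at the current
+ * metadata cursor and advance the cursor past it:
+ *
+ *	err = squashfs_read_metadata(sb, &dirh, &block, &offset,
+ *				     sizeof(dirh));
+ *
+ * On return *block/*offset point just past the bytes read, which may lie
+ * in the following compressed metadata block.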
+ */ +int squashfs_read_metadata(struct super_block *sb, void *buffer, + u64 *block, int *offset, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int bytes, copied = length; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); + + while (length) { + entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); + if (entry->error) + return entry->error; + else if (*offset >= entry->length) + return -EIO; + + bytes = squashfs_copy_data(buffer, entry, *offset, length); + if (buffer) + buffer += bytes; + length -= bytes; + *offset += bytes; + + if (*offset == entry->length) { + *block = entry->next_index; + *offset = 0; + } + + squashfs_cache_put(entry); + } + + return copied; +} + + +/* + * Look-up in the fragmment cache the fragment located at in the + * filesystem. If necessary read and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->fragment_cache, start_block, + length); +} + + +/* + * Read and decompress the datablock located at in the + * filesystem. The cache is used here to avoid duplicating locking and + * read/decompress code. + */ +struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->read_page, start_block, length); +} + + +/* + * Read a filesystem table (uncompressed sequence of bytes) from disk + */ +void *squashfs_read_table(struct super_block *sb, u64 block, int length) +{ + int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int i, res; + void *table, *buffer, **data; + + table = buffer = kmalloc(length, GFP_KERNEL); + if (table == NULL) + return ERR_PTR(-ENOMEM); + + data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) { + res = -ENOMEM; + goto failed; + } + + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + data[i] = buffer; + + res = squashfs_read_data(sb, data, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); + + kfree(data); + + if (res < 0) + goto failed; + + return table; + +failed: + kfree(table); + return ERR_PTR(res); +} diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c new file mode 100644 index 00000000..9f1b0bb9 --- /dev/null +++ b/fs/squashfs/decompressor.c @@ -0,0 +1,110 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * decompressor.c + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file (and decompressor.h) implements a decompressor framework for + * Squashfs, allowing multiple decompressors to be easily supported + */ + +static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { + NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 +}; + +#ifndef CONFIG_SQUASHFS_LZO +static const struct squashfs_decompressor squashfs_lzo_comp_ops = { + NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 +}; +#endif + +#ifndef CONFIG_SQUASHFS_XZ +static const struct squashfs_decompressor squashfs_xz_comp_ops = { + NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 +}; +#endif + +static const struct squashfs_decompressor squashfs_unknown_comp_ops = { + NULL, NULL, NULL, 0, "unknown", 0 +}; + +static const struct squashfs_decompressor *decompressor[] = { + &squashfs_zlib_comp_ops, + &squashfs_lzo_comp_ops, + &squashfs_xz_comp_ops, + &squashfs_lzma_unsupported_comp_ops, + &squashfs_unknown_comp_ops +}; + + +const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) +{ + int i; + + for (i = 0; decompressor[i]->id; i++) + if (id == decompressor[i]->id) + break; + + return decompressor[i]; +} + + +void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *strm, *buffer = NULL; + int length = 0; + + /* + * Read decompressor specific options from file system if present + */ + if (SQUASHFS_COMP_OPTS(flags)) { + buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (buffer == NULL) + return ERR_PTR(-ENOMEM); + + length = squashfs_read_data(sb, &buffer, + sizeof(struct squashfs_super_block), 0, NULL, + PAGE_CACHE_SIZE, 1); + + if (length < 0) { + strm = ERR_PTR(length); + goto finished; + } + } + + strm = msblk->decompressor->init(msblk, buffer, length); + +finished: + kfree(buffer); + + return strm; +} diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h new file mode 100644 index 00000000..8ba70cff --- /dev/null +++ b/fs/squashfs/decompressor.h @@ -0,0 +1,59 @@ +#ifndef DECOMPRESSOR_H +#define DECOMPRESSOR_H +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * decompressor.h + */ + +struct squashfs_decompressor { + void *(*init)(struct squashfs_sb_info *, void *, int); + void (*free)(void *); + int (*decompress)(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); + int id; + char *name; + int supported; +}; + +static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, + void *s) +{ + if (msblk->decompressor) + msblk->decompressor->free(s); +} + +static inline int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); +} + +#ifdef CONFIG_SQUASHFS_XZ +extern const struct squashfs_decompressor squashfs_xz_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZO +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; +#endif + +#endif diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c new file mode 100644 index 00000000..9dfe2ce0 --- /dev/null +++ b/fs/squashfs/dir.c @@ -0,0 +1,244 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * dir.c + */ + +/* + * This file implements code to read directories from disk. + * + * See namei.c for a description of directory organisation on disk. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const unsigned char squashfs_filetype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK +}; + +/* + * Lookup offset (f_pos) in the directory index, returning the + * metadata block containing it. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_offset(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, int index_offset, + int i_count, u64 f_pos) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int err, i, index, length = 0; + struct squashfs_dir_index dir_index; + + TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", + i_count, f_pos); + + /* + * Translate from external f_pos to the internal f_pos. This + * is offset by 3 because we invent "." and ".." entries which are + * not actually stored in the directory. + */ + if (f_pos < 3) + return f_pos; + f_pos -= 3; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, &dir_index, &index_start, + &index_offset, sizeof(dir_index)); + if (err < 0) + break; + + index = le32_to_cpu(dir_index.index); + if (index > f_pos) + /* + * Found the index we're looking for. 
+ */ + break; + + err = squashfs_read_metadata(sb, NULL, &index_start, + &index_offset, le32_to_cpu(dir_index.size) + 1); + if (err < 0) + break; + + length = index; + *next_block = le32_to_cpu(dir_index.start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + + /* + * Translate back from internal f_pos to external f_pos. + */ + return length + 3; +} + + +static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + u64 block = squashfs_i(inode)->start + msblk->directory_table; + int offset = squashfs_i(inode)->offset, length = 0, dir_count, size, + type, err; + unsigned int inode_number; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + + TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + goto finish; + } + + /* + * Return "." and ".." entries as the first two filenames in the + * directory. To maximise compression these two entries are not + * stored in the directory, and so we invent them here. + * + * It also means that the external f_pos is offset by 3 from the + * on-disk directory f_pos. + */ + while (file->f_pos < 3) { + char *name; + int i_ino; + + if (file->f_pos == 0) { + name = "."; + size = 1; + i_ino = inode->i_ino; + } else { + name = ".."; + size = 2; + i_ino = squashfs_i(inode)->parent; + } + + TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", + dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]); + + if (filldir(dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos += size; + } + + length = get_dir_index_using_offset(inode->i_sb, &block, &offset, + squashfs_i(inode)->dir_idx_start, + squashfs_i(inode)->dir_idx_offset, + squashfs_i(inode)->dir_idx_cnt, + file->f_pos); + + while (length < i_size_read(inode)) { + /* + * Read directory header + */ + err = squashfs_read_metadata(inode->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto failed_read; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + + /* dir_count should never be larger than 256 */ + if (dir_count > 256) + goto failed_read; + + while (dir_count--) { + /* + * Read directory entry. 
+ */ + err = squashfs_read_metadata(inode->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto failed_read; + + size = le16_to_cpu(dire->size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto failed_read; + + err = squashfs_read_metadata(inode->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto failed_read; + + length += sizeof(*dire) + size; + + if (file->f_pos >= length) + continue; + + dire->name[size] = '\0'; + inode_number = le32_to_cpu(dirh.inode_number) + + ((short) le16_to_cpu(dire->inode_number)); + type = le16_to_cpu(dire->type); + + TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" + "\n", dirent, dire->name, size, + file->f_pos, + le32_to_cpu(dirh.start_block), + le16_to_cpu(dire->offset), + inode_number, + squashfs_filetype_table[type]); + + if (filldir(dirent, dire->name, size, file->f_pos, + inode_number, + squashfs_filetype_table[type]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos = length; + } + } + +finish: + kfree(dire); + return 0; + +failed_read: + ERROR("Unable to read directory block [%llx:%x]\n", block, offset); + kfree(dire); + return 0; +} + + +const struct file_operations squashfs_dir_ops = { + .read = generic_read_dir, + .readdir = squashfs_readdir, + .llseek = default_llseek, +}; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c new file mode 100644 index 00000000..5e1101ff --- /dev/null +++ b/fs/squashfs/export.c @@ -0,0 +1,163 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * export.c + */ + +/* + * This file implements code to make Squashfs filesystems exportable (NFS etc.) + * + * The export code uses an inode lookup table to map inode numbers passed in + * filehandles to an inode location on disk. This table is stored compressed + * into metadata blocks. A second index table is used to locate these. This + * second index table for speed of access (and because it is small) is read at + * mount time and cached in memory. + * + * The inode lookup table is used only by the export code, inode disk + * locations are directly encoded in directories, enabling direct access + * without an intermediate lookup for all operations except the export ops. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up inode number (ino) in table, returning the inode location. 
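+ *
+ * Each lookup table entry is a 64-bit inode location, so with the usual
+ * 8 KiB metadata blocks inode number N is found, roughly, at:
+ *
+ *	blk    = ((N - 1) * 8) / SQUASHFS_METADATA_SIZE;
+ *	offset = ((N - 1) * 8) % SQUASHFS_METADATA_SIZE;
+ *
+ * with msblk->inode_lookup_table[blk] giving the on-disk start of the
+ * metadata block holding the entry.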
+ */ +static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); + int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); + u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + __le64 ino; + int err; + + TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); + if (err < 0) + return err; + + TRACE("squashfs_inode_lookup, inode = 0x%llx\n", + (u64) le64_to_cpu(ino)); + + return le64_to_cpu(ino); +} + + +static struct dentry *squashfs_export_iget(struct super_block *sb, + unsigned int ino_num) +{ + long long ino; + struct dentry *dentry = ERR_PTR(-ENOENT); + + TRACE("Entered squashfs_export_iget\n"); + + ino = squashfs_inode_lookup(sb, ino_num); + if (ino >= 0) + dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); + + return dentry; +} + + +static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) + || fh_len < 2) + return NULL; + + return squashfs_export_iget(sb, fid->i32.ino); +} + + +static struct dentry *squashfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) + return NULL; + + return squashfs_export_iget(sb, fid->i32.parent_ino); +} + + +static struct dentry *squashfs_get_parent(struct dentry *child) +{ + struct inode *inode = child->d_inode; + unsigned int parent_ino = squashfs_i(inode)->parent; + + return squashfs_export_iget(inode->i_sb, parent_ino); +} + + +/* + * Read uncompressed inode lookup table indexes off disk into memory + */ +__le64 *squashfs_read_inode_lookup_table(struct super_block *sb, + u64 lookup_table_start, u64 next_table, unsigned int inodes) +{ + unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + __le64 *table; + + TRACE("In read_inode_lookup_table, length %d\n", length); + + /* Sanity check values */ + + /* there should always be at least one inode */ + if (inodes == 0) + return ERR_PTR(-EINVAL); + + /* length bytes should not extend into the next table - this check + * also traps instances where lookup_table_start is incorrectly larger + * than the next table start + */ + if (lookup_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, lookup_table_start, length); + + /* + * table[0] points to the first inode lookup table metadata block, + * this should be less than lookup_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} + + +const struct export_operations squashfs_export_ops = { + .fh_to_dentry = squashfs_fh_to_dentry, + .fh_to_parent = squashfs_fh_to_parent, + .get_parent = squashfs_get_parent +}; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c new file mode 100644 index 00000000..38bb1c64 --- /dev/null +++ b/fs/squashfs/file.c @@ -0,0 +1,501 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. 
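/*
 * Illustration of how squashfs_inode_lookup() above indexes the inode lookup
 * table: each entry is an 8-byte disk location and metadata blocks are 8 KiB,
 * so entry (ino_num - 1) lives in lookup-table block (ino_num - 1) * 8 / 8192
 * at offset (ino_num - 1) * 8 % 8192 (cf. the SQUASHFS_LOOKUP_* macros in
 * squashfs_fs.h).  Stand-alone user-space sketch with an example value.
 */
#include <stdio.h>

int main(void)
{
        unsigned int ino_num = 1500;            /* hypothetical NFS handle inode */
        unsigned long bytes = (ino_num - 1) * 8UL;

        printf("inode %u -> lookup block %lu, offset %lu\n",
               ino_num, bytes / 8192, bytes % 8192);
        return 0;
}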
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * file.c + */ + +/* + * This file contains code for handling regular files. A regular file + * consists of a sequence of contiguous compressed blocks, and/or a + * compressed fragment block (tail-end packed block). The compressed size + * of each datablock is stored in a block list contained within the + * file inode (itself stored in one or more compressed metadata blocks). + * + * To speed up access to datablocks when reading 'large' files (256 Mbytes or + * larger), the code implements an index cache that caches the mapping from + * block index to datablock location on disk. + * + * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while + * retaining a simple and space-efficient block list on disk. The cache + * is split into slots, caching up to eight 224 GiB files (128 KiB blocks). + * Larger files use multiple slots, with 1.75 TiB files using all 8 slots. + * The index cache is designed to be memory efficient, and by default uses + * 16 KiB. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Locate cache slot in range [offset, index] for specified inode. If + * there's more than one return the slot closest to index. + */ +static struct meta_index *locate_meta_index(struct inode *inode, int offset, + int index) +{ + struct meta_index *meta = NULL; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("locate_meta_index: index %d, offset %d\n", index, offset); + + if (msblk->meta_index == NULL) + goto not_allocated; + + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + if (msblk->meta_index[i].inode_number == inode->i_ino && + msblk->meta_index[i].offset >= offset && + msblk->meta_index[i].offset <= index && + msblk->meta_index[i].locked == 0) { + TRACE("locate_meta_index: entry %d, offset %d\n", i, + msblk->meta_index[i].offset); + meta = &msblk->meta_index[i]; + offset = meta->offset; + } + } + + if (meta) + meta->locked = 1; + +not_allocated: + mutex_unlock(&msblk->meta_index_mutex); + + return meta; +} + + +/* + * Find and initialise an empty cache slot for index offset. + */ +static struct meta_index *empty_meta_index(struct inode *inode, int offset, + int skip) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct meta_index *meta = NULL; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip); + + if (msblk->meta_index == NULL) { + /* + * First time cache index has been used, allocate and + * initialise. The cache index could be allocated at + * mount time but doing it here means it is allocated only + * if a 'large' file is read. 
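/*
 * Worked example backing the "by default uses 16 KiB" figure in the file.c
 * comment above, using the structures from squashfs_fs.h: struct meta_entry
 * is 16 bytes and each of the SQUASHFS_META_SLOTS (8) slots holds a 16-byte
 * header plus SQUASHFS_META_ENTRIES (127) entries.  Sizes assume a typical
 * 64-bit build with no unusual padding; stand-alone sketch only.
 */
#include <stdio.h>

int main(void)
{
        unsigned int entry = 16;                /* sizeof(struct meta_entry) */
        unsigned int slot  = 16 + 127 * entry;  /* per-slot footprint        */

        printf("per slot %u bytes, total %u bytes\n", slot, 8 * slot);
        return 0;
}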
+ */ + msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS, + sizeof(*(msblk->meta_index)), GFP_KERNEL); + if (msblk->meta_index == NULL) { + ERROR("Failed to allocate meta_index\n"); + goto failed; + } + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + msblk->meta_index[i].inode_number = 0; + msblk->meta_index[i].locked = 0; + } + msblk->next_meta_index = 0; + } + + for (i = SQUASHFS_META_SLOTS; i && + msblk->meta_index[msblk->next_meta_index].locked; i--) + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + if (i == 0) { + TRACE("empty_meta_index: failed!\n"); + goto failed; + } + + TRACE("empty_meta_index: returned meta entry %d, %p\n", + msblk->next_meta_index, + &msblk->meta_index[msblk->next_meta_index]); + + meta = &msblk->meta_index[msblk->next_meta_index]; + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + meta->inode_number = inode->i_ino; + meta->offset = offset; + meta->skip = skip; + meta->entries = 0; + meta->locked = 1; + +failed: + mutex_unlock(&msblk->meta_index_mutex); + return meta; +} + + +static void release_meta_index(struct inode *inode, struct meta_index *meta) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + mutex_lock(&msblk->meta_index_mutex); + meta->locked = 0; + mutex_unlock(&msblk->meta_index_mutex); +} + + +/* + * Read the next n blocks from the block list, starting from + * metadata block . + */ +static long long read_indexes(struct super_block *sb, int n, + u64 *start_block, int *offset) +{ + int err, i; + long long block = 0; + __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + + if (blist == NULL) { + ERROR("read_indexes: Failed to allocate block_list\n"); + return -ENOMEM; + } + + while (n) { + int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); + + err = squashfs_read_metadata(sb, blist, start_block, + offset, blocks << 2); + if (err < 0) { + ERROR("read_indexes: reading block [%llx:%x]\n", + *start_block, *offset); + goto failure; + } + + for (i = 0; i < blocks; i++) { + int size = le32_to_cpu(blist[i]); + block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); + } + n -= blocks; + } + + kfree(blist); + return block; + +failure: + kfree(blist); + return err; +} + + +/* + * Each cache index slot has SQUASHFS_META_ENTRIES, each of which + * can cache one index -> datablock/blocklist-block mapping. We wish + * to distribute these over the length of the file, entry[0] maps index x, + * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on. + * The larger the file, the greater the skip factor. The skip factor is + * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure + * the number of metadata blocks that need to be read fits into the cache. + * If the skip factor is limited in this way then the file will use multiple + * slots. + */ +static inline int calculate_skip(int blocks) +{ + int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + * SQUASHFS_META_INDEXES); + return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); +} + + +/* + * Search and grow the index cache for the specified inode, returning the + * on-disk locations of the datablock and block list metadata block + * for index (scaled to nearest cache index). 
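/*
 * Worked example of calculate_skip() above with the constants from
 * squashfs_fs.h (SQUASHFS_META_ENTRIES = 127, SQUASHFS_META_INDEXES =
 * 8192 / 4 = 2048, SQUASHFS_CACHED_BLKS = 8): a 256 MiB file of 128 KiB
 * blocks gets skip 1, while a 1.75 TiB file is capped at skip 7.
 * Stand-alone user-space sketch.
 */
#include <stdio.h>

static int calculate_skip(int blocks)
{
        int skip = blocks / ((127 + 1) * 2048);

        return skip + 1 < 7 ? skip + 1 : 7;     /* min(SQUASHFS_CACHED_BLKS - 1, skip + 1) */
}

int main(void)
{
        printf("256 MiB file (2048 blocks): skip %d\n", calculate_skip(2048));
        printf("1.75 TiB file (~14.7M blocks): skip %d\n", calculate_skip(14680064));
        return 0;
}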
+ */ +static int fill_meta_index(struct inode *inode, int index, + u64 *index_block, int *index_offset, u64 *data_block) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int skip = calculate_skip(i_size_read(inode) >> msblk->block_log); + int offset = 0; + struct meta_index *meta; + struct meta_entry *meta_entry; + u64 cur_index_block = squashfs_i(inode)->block_list_start; + int cur_offset = squashfs_i(inode)->offset; + u64 cur_data_block = squashfs_i(inode)->start; + int err, i; + + /* + * Scale index to cache index (cache slot entry) + */ + index /= SQUASHFS_META_INDEXES * skip; + + while (offset < index) { + meta = locate_meta_index(inode, offset + 1, index); + + if (meta == NULL) { + meta = empty_meta_index(inode, offset + 1, skip); + if (meta == NULL) + goto all_done; + } else { + offset = index < meta->offset + meta->entries ? index : + meta->offset + meta->entries - 1; + meta_entry = &meta->meta_entry[offset - meta->offset]; + cur_index_block = meta_entry->index_block + + msblk->inode_table; + cur_offset = meta_entry->offset; + cur_data_block = meta_entry->data_block; + TRACE("get_meta_index: offset %d, meta->offset %d, " + "meta->entries %d\n", offset, meta->offset, + meta->entries); + TRACE("get_meta_index: index_block 0x%llx, offset 0x%x" + " data_block 0x%llx\n", cur_index_block, + cur_offset, cur_data_block); + } + + /* + * If necessary grow cache slot by reading block list. Cache + * slot is extended up to index or to the end of the slot, in + * which case further slots will be used. + */ + for (i = meta->offset + meta->entries; i <= index && + i < meta->offset + SQUASHFS_META_ENTRIES; i++) { + int blocks = skip * SQUASHFS_META_INDEXES; + long long res = read_indexes(inode->i_sb, blocks, + &cur_index_block, &cur_offset); + + if (res < 0) { + if (meta->entries == 0) + /* + * Don't leave an empty slot on read + * error allocated to this inode... + */ + meta->inode_number = 0; + err = res; + goto failed; + } + + cur_data_block += res; + meta_entry = &meta->meta_entry[i - meta->offset]; + meta_entry->index_block = cur_index_block - + msblk->inode_table; + meta_entry->offset = cur_offset; + meta_entry->data_block = cur_data_block; + meta->entries++; + offset++; + } + + TRACE("get_meta_index: meta->offset %d, meta->entries %d\n", + meta->offset, meta->entries); + + release_meta_index(inode, meta); + } + +all_done: + *index_block = cur_index_block; + *index_offset = cur_offset; + *data_block = cur_data_block; + + /* + * Scale cache index (cache slot entry) to index + */ + return offset * SQUASHFS_META_INDEXES * skip; + +failed: + release_meta_index(inode, meta); + return err; +} + + +/* + * Get the on-disk location and compressed size of the datablock + * specified by index. Fill_meta_index() does most of the work. + */ +static int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + long long blks; + int offset; + __le32 size; + int res = fill_meta_index(inode, index, &start, &offset, block); + + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" + " 0x%x, block 0x%llx\n", res, index, start, offset, + *block); + + if (res < 0) + return res; + + /* + * res contains the index of the mapping returned by fill_meta_index(), + * this will likely be less than the desired index (because the + * meta_index cache works at a higher granularity). Read any + * extra block indexes needed. 
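/*
 * Sketch of how a 32-bit block-list entry summed by read_indexes() above is
 * interpreted (cf. SQUASHFS_COMPRESSED_BIT_BLOCK in squashfs_fs.h): bit 24 is
 * set when the datablock is stored uncompressed, and the remaining bits give
 * its on-disk length.  Stand-alone example with a made-up entry value.
 */
#include <stdio.h>

int main(void)
{
        unsigned int entry = (1u << 24) | 131072;       /* uncompressed 128 KiB block */

        printf("on-disk length %u bytes, %s\n",
               entry & ~(1u << 24),
               (entry & (1u << 24)) ? "uncompressed" : "compressed");
        return 0;
}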
+ */ + if (res < index) { + blks = read_indexes(inode->i_sb, index - res, &start, &offset); + if (blks < 0) + return (int) blks; + *block += blks; + } + + /* + * Read length of block specified by index. + */ + res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + sizeof(size)); + if (res < 0) + return res; + return le32_to_cpu(size); +} + + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes, i, offset = 0, sparse = 0; + struct squashfs_cache_entry *buffer = NULL; + void *pageaddr; + + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int start_index = page->index & ~mask; + int end_index = start_index | mask; + int file_end = i_size_read(inode) >> msblk->block_log; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + /* + * Reading a datablock from disk. Need to read block list + * to get location and block size. + */ + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) { /* hole */ + bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + sparse = 1; + } else { + /* + * Read and decompress datablock. + */ + buffer = squashfs_get_datablock(inode->i_sb, + block, bsize); + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x" + "\n", block, bsize); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = buffer->length; + } + } else { + /* + * Datablock is stored inside a fragment (tail-end packed + * block). + */ + buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = i_size_read(inode) & (msblk->block_size - 1); + offset = squashfs_i(inode)->fragment_offset; + } + + /* + * Loop copying datablock into pages. As the datablock likely covers + * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly + * grab the pages from the page cache, except for the page that we've + * been called to fill. + */ + for (i = start_index; i <= end_index && bytes > 0; i++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + struct page *push_page; + int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + + TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + + push_page = (i == page->index) ? 
page : + grab_cache_page_nowait(page->mapping, i); + + if (!push_page) + continue; + + if (PageUptodate(push_page)) + goto skip_page; + + pageaddr = kmap_atomic(push_page, KM_USER0); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(push_page); + SetPageUptodate(push_page); +skip_page: + unlock_page(push_page); + if (i != page->index) + page_cache_release(push_page); + } + + if (!sparse) + squashfs_cache_put(buffer); + + return 0; + +error_out: + SetPageError(page); +out: + pageaddr = kmap_atomic(page, KM_USER0); + memset(pageaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + + +const struct address_space_operations squashfs_aops = { + .readpage = squashfs_readpage +}; diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c new file mode 100644 index 00000000..0ed6edbc --- /dev/null +++ b/fs/squashfs/fragment.c @@ -0,0 +1,99 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * fragment.c + */ + +/* + * This file implements code to handle compressed fragments (tail-end packed + * datablocks). + * + * Regular files contain a fragment index which is mapped to a fragment + * location on disk and compressed size using a fragment lookup table. + * Like everything in Squashfs this fragment lookup table is itself stored + * compressed into metadata blocks. A second index table is used to locate + * these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Look-up fragment using the fragment index table. 
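/*
 * Illustration of the page/datablock arithmetic used by squashfs_readpage()
 * above, assuming the default 128 KiB block size (block_log = 17) and 4 KiB
 * pages: a single datablock covers 32 pages, all of which are filled in one
 * go.  Stand-alone user-space sketch with an example page index.
 */
#include <stdio.h>

int main(void)
{
        int block_log = 17, page_shift = 12;
        unsigned long page_index = 100;

        int mask = (1 << (block_log - page_shift)) - 1;           /* 31          */
        int block = page_index >> (block_log - page_shift);       /* datablock 3 */
        unsigned long start = page_index & ~(unsigned long)mask;  /* page 96     */
        unsigned long end = start | mask;                         /* page 127    */

        printf("page %lu -> datablock %d, filling pages %lu-%lu\n",
               page_index, block, start, end);
        return 0;
}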
Return the on disk + * location of the fragment and its compressed size + */ +int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, + u64 *fragment_block) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_FRAGMENT_INDEX(fragment); + int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); + u64 start_block = le64_to_cpu(msblk->fragment_index[block]); + struct squashfs_fragment_entry fragment_entry; + int size; + + size = squashfs_read_metadata(sb, &fragment_entry, &start_block, + &offset, sizeof(fragment_entry)); + if (size < 0) + return size; + + *fragment_block = le64_to_cpu(fragment_entry.start_block); + size = le32_to_cpu(fragment_entry.size); + + return size; +} + + +/* + * Read the uncompressed fragment lookup table indexes off disk into memory + */ +__le64 *squashfs_read_fragment_index_table(struct super_block *sb, + u64 fragment_table_start, u64 next_table, unsigned int fragments) +{ + unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); + __le64 *table; + + /* + * Sanity check, length bytes should not extend into the next table - + * this check also traps instances where fragment_table_start is + * incorrectly larger than the next table start + */ + if (fragment_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, fragment_table_start, length); + + /* + * table[0] points to the first fragment table metadata block, this + * should be less than fragment_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c new file mode 100644 index 00000000..d38ea3da --- /dev/null +++ b/fs/squashfs/id.c @@ -0,0 +1,102 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * id.c + */ + +/* + * This file implements code to handle uids and gids. + * + * For space efficiency regular files store uid and gid indexes, which are + * converted to 32-bit uids/gids using an id look up table. This table is + * stored compressed into metadata blocks. A second index table is used to + * locate these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. 
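/*
 * Illustration of the fragment-table indexing used by squashfs_frag_lookup()
 * above: each squashfs_fragment_entry is 16 bytes, so one 8 KiB metadata
 * block describes 512 fragments (cf. SQUASHFS_FRAGMENT_INDEX* in
 * squashfs_fs.h).  Stand-alone sketch with an example fragment number.
 */
#include <stdio.h>

int main(void)
{
        unsigned int fragment = 1000;           /* hypothetical fragment index */
        unsigned long bytes = fragment * 16UL;

        printf("fragment %u -> index block %lu, offset %lu\n",
               fragment, bytes / 8192, bytes % 8192);
        return 0;
}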
+ */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Map uid/gid index into real 32-bit uid/gid using the id look up table + */ +int squashfs_get_id(struct super_block *sb, unsigned int index, + unsigned int *id) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_ID_BLOCK(index); + int offset = SQUASHFS_ID_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->id_table[block]); + __le32 disk_id; + int err; + + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, + sizeof(disk_id)); + if (err < 0) + return err; + + *id = le32_to_cpu(disk_id); + return 0; +} + + +/* + * Read uncompressed id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_id_index_table(struct super_block *sb, + u64 id_table_start, u64 next_table, unsigned short no_ids) +{ + unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + __le64 *table; + + TRACE("In read_id_index_table, length %d\n", length); + + /* Sanity check values */ + + /* there should always be at least one id */ + if (no_ids == 0) + return ERR_PTR(-EINVAL); + + /* + * length bytes should not extend into the next table - this check + * also traps instances where id_table_start is incorrectly larger + * than the next table start + */ + if (id_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, id_table_start, length); + + /* + * table[0] points to the first id lookup table metadata block, this + * should be less than id_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c new file mode 100644 index 00000000..04bebcaa --- /dev/null +++ b/fs/squashfs/inode.c @@ -0,0 +1,425 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * inode.c + */ + +/* + * This file implements code to create and read inodes from disk. + * + * Inodes in Squashfs are identified by a 48-bit inode which encodes the + * location of the compressed metadata block containing the inode, and the byte + * offset into that block where the inode is placed (). + * + * To maximise compression there are different inodes for each file type + * (regular file, directory, device, etc.), the inode contents and length + * varying with the type. + * + * To further maximise compression, two types of regular file inode and + * directory inode are defined: inodes optimised for frequently occurring + * regular files and directories, and extended types where extra + * information has to be stored. 
+ */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Initialise VFS inode with the base inode information common to all + * Squashfs inode types. Sqsh_ino contains the unswapped base inode + * off disk. + */ +static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + struct squashfs_base_inode *sqsh_ino) +{ + int err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); + if (err) + return err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); + if (err) + return err; + + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); + inode->i_atime.tv_sec = inode->i_mtime.tv_sec; + inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode->i_mode = le16_to_cpu(sqsh_ino->mode); + inode->i_size = 0; + + return err; +} + + +struct inode *squashfs_iget(struct super_block *sb, long long ino, + unsigned int ino_number) +{ + struct inode *inode = iget_locked(sb, ino_number); + int err; + + TRACE("Entered squashfs_iget\n"); + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = squashfs_read_inode(inode, ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + + +/* + * Initialise VFS inode by reading inode from inode table (compressed + * metadata). The format and amount of data read depends on type. + */ +int squashfs_read_inode(struct inode *inode, long long ino) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + int err, type, offset = SQUASHFS_INODE_OFFSET(ino); + union squashfs_inode squashfs_ino; + struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + int xattr_id = SQUASHFS_INVALID_XATTR; + + TRACE("Entered squashfs_read_inode\n"); + + /* + * Read inode base common to all inode types. 
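/*
 * Illustration of the 48-bit inode reference described above and consumed by
 * squashfs_read_inode(): the value packs the metadata-block start (upper
 * bits) and the byte offset within the uncompressed block (lower 16 bits),
 * mirroring SQUASHFS_MKINODE()/SQUASHFS_INODE_BLK()/SQUASHFS_INODE_OFFSET()
 * from squashfs_fs.h.  Stand-alone sketch with made-up values.
 */
#include <stdio.h>

int main(void)
{
        long long ino = ((long long)0x1a2b3c << 16) + 0x0123;   /* example reference */

        printf("metadata block 0x%x, offset 0x%x\n",
               (unsigned int)(ino >> 16), (unsigned int)(ino & 0xffff));
        return 0;
}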
+ */ + err = squashfs_read_metadata(sb, sqshb_ino, &block, + &offset, sizeof(*sqshb_ino)); + if (err < 0) + goto failed_read; + + err = squashfs_new_inode(sb, inode, sqshb_ino); + if (err) + goto failed_read; + + block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + offset = SQUASHFS_INODE_OFFSET(ino); + + type = le16_to_cpu(sqshb_ino->inode_type); + switch (type) { + case SQUASHFS_REG_TYPE: { + unsigned int frag_offset, frag; + int frag_size; + u64 frag_blk; + struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + inode->i_nlink = 1; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_LREG_TYPE: { + unsigned int frag_offset, frag; + int frag_size; + u64 frag_blk; + struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_inode_ops; + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - + le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_DIR_TYPE: { + struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le16_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + 
inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_cnt = 0; + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", + SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_LDIR_TYPE: { + struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_start = block; + squashfs_i(inode)->dir_idx_offset = offset; + squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Long directory inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_SYMLINK_TYPE: + case SQUASHFS_LSYMLINK_TYPE: { + struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + inode->i_op = &squashfs_symlink_inode_ops; + inode->i_data.a_ops = &squashfs_symlink_aops; + inode->i_mode |= S_IFLNK; + squashfs_i(inode)->start = block; + squashfs_i(inode)->offset = offset; + + if (type == SQUASHFS_LSYMLINK_TYPE) { + __le32 xattr; + + err = squashfs_read_metadata(sb, NULL, &block, + &offset, inode->i_size); + if (err < 0) + goto failed_read; + err = squashfs_read_metadata(sb, &xattr, &block, + &offset, sizeof(xattr)); + if (err < 0) + goto failed_read; + xattr_id = le32_to_cpu(xattr); + } + + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + block, offset); + break; + } + case SQUASHFS_BLKDEV_TYPE: + case SQUASHFS_CHRDEV_TYPE: { + struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_CHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LCHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + 
rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_FIFO_TYPE: + case SQUASHFS_SOCKET_TYPE: { + struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_FIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LFIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } + default: + ERROR("Unknown inode type %d in squashfs_iget!\n", type); + return -EINVAL; + } + + if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { + err = squashfs_xattr_lookup(sb, xattr_id, + &squashfs_i(inode)->xattr_count, + &squashfs_i(inode)->xattr_size, + &squashfs_i(inode)->xattr); + if (err < 0) + goto failed_read; + inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + + 1; + } else + squashfs_i(inode)->xattr_count = 0; + + return 0; + +failed_read: + ERROR("Unable to read inode 0x%llx\n", ino); + return err; +} + + +const struct inode_operations squashfs_inode_ops = { + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c new file mode 100644 index 00000000..00f4dfc5 --- /dev/null +++ b/fs/squashfs/lzo_wrapper.c @@ -0,0 +1,135 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 LG Electronics + * Chan Jeong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
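/*
 * Small illustration of the i_blocks arithmetic used in the inode cases
 * above: sizes are reported in 512-byte units, rounded up via
 * ((size - 1) >> 9) + 1.  Stand-alone sketch.
 */
#include <stdio.h>

int main(void)
{
        long long sizes[] = { 1, 512, 513, 131072 };
        int i;

        for (i = 0; i < 4; i++)
                printf("%lld bytes -> %lld blocks\n",
                       sizes[i], ((sizes[i] - 1) >> 9) + 1);
        return 0;
}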
+ * + * lzo_wrapper.c + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_lzo { + void *input; + void *output; +}; + +static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed2; + + return stream; + +failed2: + vfree(stream->input); +failed: + ERROR("Failed to allocate lzo workspace\n"); + kfree(stream); + return ERR_PTR(-ENOMEM); +} + + +static void lzo_free(void *strm) +{ + struct squashfs_lzo *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + struct squashfs_lzo *stream = msblk->stream; + void *buff = stream->input; + int avail, i, bytes = length, res; + size_t out_len = srclength; + + mutex_lock(&msblk->read_data_mutex); + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lzo1x_decompress_safe(stream->input, (size_t)length, + stream->output, &out_len); + if (res != LZO_E_OK) + goto failed; + + res = bytes = (int)out_len; + for (i = 0, buff = stream->output; bytes && i < pages; i++) { + avail = min_t(int, bytes, PAGE_CACHE_SIZE); + memcpy(buffer[i], buff, avail); + buff += avail; + bytes -= avail; + } + + mutex_unlock(&msblk->read_data_mutex); + return res; + +block_release: + for (; i < b; i++) + put_bh(bh[i]); + +failed: + mutex_unlock(&msblk->read_data_mutex); + + ERROR("lzo decompression failed, data probably corrupt\n"); + return -EIO; +} + +const struct squashfs_decompressor squashfs_lzo_comp_ops = { + .init = lzo_init, + .free = lzo_free, + .decompress = lzo_uncompress, + .id = LZO_COMPRESSION, + .name = "lzo", + .supported = 1 +}; diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c new file mode 100644 index 00000000..4bc63ac6 --- /dev/null +++ b/fs/squashfs/namei.c @@ -0,0 +1,257 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * namei.c + */ + +/* + * This file implements code to do filename lookup in directories. 
+ * + * Like inodes, directories are packed into compressed metadata blocks, stored + * in a directory table. Directories are accessed using the start address of + * the metablock containing the directory and the offset into the + * decompressed block (). + * + * Directories are organised in a slightly complex way, and are not simply + * a list of file names. The organisation takes advantage of the + * fact that (in most cases) the inodes of the files will be in the same + * compressed metadata block, and therefore, can share the start block. + * Directories are therefore organised in a two level list, a directory + * header containing the shared start block value, and a sequence of directory + * entries, each of which share the shared start block. A new directory header + * is written once/if the inode start block changes. The directory + * header/directory entry list is repeated as many times as necessary. + * + * Directories are sorted, and can contain a directory index to speed up + * file lookup. Directory indexes store one entry per metablock, each entry + * storing the index/filename mapping to the first directory header + * in each metadata block. Directories are sorted in alphabetical order, + * and at lookup the index is scanned linearly looking for the first filename + * alphabetically larger than the filename being looked up. At this point the + * location of the metadata block the filename is in has been found. + * The general idea of the index is ensure only one metadata block needs to be + * decompressed to do a lookup irrespective of the length of the directory. + * This scheme has the advantage that it doesn't require extra memory overhead + * and doesn't require much extra storage on disk. + */ + +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Lookup name in the directory index, returning the location of the metadata + * block containing it, and the directory index this represents. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_name(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, + int index_offset, int i_count, const char *name, + int len) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int i, size, length = 0, err; + struct squashfs_dir_index *index; + char *str; + + TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); + + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + if (index == NULL) { + ERROR("Failed to allocate squashfs_dir_index\n"); + goto out; + } + + str = &index->name[SQUASHFS_NAME_LEN + 1]; + strncpy(str, name, len); + str[len] = '\0'; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, index, &index_start, + &index_offset, sizeof(*index)); + if (err < 0) + break; + + + size = le32_to_cpu(index->size) + 1; + + err = squashfs_read_metadata(sb, index->name, &index_start, + &index_offset, size); + if (err < 0) + break; + + index->name[size] = '\0'; + + if (strcmp(index->name, str) > 0) + break; + + length = le32_to_cpu(index->index); + *next_block = le32_to_cpu(index->start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + kfree(index); + +out: + /* + * Return index (f_pos) of the looked up metadata block. 
Translate + * from internal f_pos to external f_pos which is offset by 3 because + * we invent "." and ".." entries which are not actually stored in the + * directory. + */ + return length + 3; +} + + +static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *inode = NULL; + struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + u64 block = squashfs_i(dir)->start + msblk->directory_table; + int offset = squashfs_i(dir)->offset; + int err, length = 0, dir_count, size; + + TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + return ERR_PTR(-ENOMEM); + } + + if (len > SQUASHFS_NAME_LEN) { + err = -ENAMETOOLONG; + goto failed; + } + + length = get_dir_index_using_name(dir->i_sb, &block, &offset, + squashfs_i(dir)->dir_idx_start, + squashfs_i(dir)->dir_idx_offset, + squashfs_i(dir)->dir_idx_cnt, name, len); + + while (length < i_size_read(dir)) { + /* + * Read directory header. + */ + err = squashfs_read_metadata(dir->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto read_failure; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + + /* dir_count should never be larger than 256 */ + if (dir_count > 256) + goto data_error; + + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(dir->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto read_failure; + + size = le16_to_cpu(dire->size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto data_error; + + err = squashfs_read_metadata(dir->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto read_failure; + + length += sizeof(*dire) + size; + + if (name[0] < dire->name[0]) + goto exit_lookup; + + if (len == size && !strncmp(name, dire->name, len)) { + unsigned int blk, off, ino_num; + long long ino; + blk = le32_to_cpu(dirh.start_block); + off = le16_to_cpu(dire->offset); + ino_num = le32_to_cpu(dirh.inode_number) + + (short) le16_to_cpu(dire->inode_number); + ino = SQUASHFS_MKINODE(blk, off); + + TRACE("calling squashfs_iget for directory " + "entry %s, inode %x:%x, %d\n", name, + blk, off, ino_num); + + inode = squashfs_iget(dir->i_sb, ino, ino_num); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto failed; + } + + goto exit_lookup; + } + } + } + +exit_lookup: + kfree(dire); + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + return ERR_PTR(0); + +data_error: + err = -EIO; + +read_failure: + ERROR("Unable to read directory block [%llx:%x]\n", + squashfs_i(dir)->start + msblk->directory_table, + squashfs_i(dir)->offset); +failed: + kfree(dire); + return ERR_PTR(err); +} + + +const struct inode_operations squashfs_dir_inode_ops = { + .lookup = squashfs_lookup, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h new file mode 100644 index 00000000..e3be6a71 --- /dev/null +++ b/fs/squashfs/squashfs.h @@ -0,0 +1,102 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute 
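/*
 * Illustration of the inode-number encoding seen in squashfs_lookup() above:
 * a directory header carries a 32-bit base inode number and each directory
 * entry stores a signed 16-bit delta from it, so runs of nearby inode
 * numbers compress well.  Stand-alone sketch with made-up values.
 */
#include <stdio.h>

int main(void)
{
        unsigned int base = 1000;       /* dirh.inode_number            */
        unsigned short raw = 0xffff;    /* dire->inode_number, delta -1 */

        printf("entry inode number = %u\n", base + (short)raw);
        return 0;
}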
it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs.h + */ + +#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args) + +#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) + +#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) + +/* block.c */ +extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, + int, int); + +/* cache.c */ +extern struct squashfs_cache *squashfs_cache_init(char *, int, int); +extern void squashfs_cache_delete(struct squashfs_cache *); +extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *, + struct squashfs_cache *, u64, int); +extern void squashfs_cache_put(struct squashfs_cache_entry *); +extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int); +extern int squashfs_read_metadata(struct super_block *, void *, u64 *, + int *, int); +extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, + u64, int); +extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, + u64, int); +extern void *squashfs_read_table(struct super_block *, u64, int); + +/* decompressor.c */ +extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); +extern void *squashfs_decompressor_init(struct super_block *, unsigned short); + +/* export.c */ +extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, + unsigned int); + +/* fragment.c */ +extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); +extern __le64 *squashfs_read_fragment_index_table(struct super_block *, + u64, u64, unsigned int); + +/* id.c */ +extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); +extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, + unsigned short); + +/* inode.c */ +extern struct inode *squashfs_iget(struct super_block *, long long, + unsigned int); +extern int squashfs_read_inode(struct inode *, long long); + +/* xattr.c */ +extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t); + +/* + * Inodes, files, decompressor and xattr operations + */ + +/* dir.c */ +extern const struct file_operations squashfs_dir_ops; + +/* export.c */ +extern const struct export_operations squashfs_export_ops; + +/* file.c */ +extern const struct address_space_operations squashfs_aops; + +/* inode.c */ +extern const struct inode_operations squashfs_inode_ops; + +/* namei.c */ +extern const struct inode_operations squashfs_dir_inode_ops; + +/* symlink.c */ +extern const struct address_space_operations squashfs_symlink_aops; +extern const struct inode_operations squashfs_symlink_inode_ops; + +/* xattr.c */ +extern const struct xattr_handler *squashfs_xattr_handlers[]; + +/* zlib_wrapper.c */ +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h new file mode 100644 index 00000000..b4a4e539 --- 
/dev/null +++ b/fs/squashfs/squashfs_fs.h @@ -0,0 +1,459 @@ +#ifndef SQUASHFS_FS +#define SQUASHFS_FS +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs.h + */ + +#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE +#define SQUASHFS_MAJOR 4 +#define SQUASHFS_MINOR 0 +#define SQUASHFS_START 0 + +/* size of metadata (inode and directory) blocks */ +#define SQUASHFS_METADATA_SIZE 8192 +#define SQUASHFS_METADATA_LOG 13 + +/* default size of data blocks */ +#define SQUASHFS_FILE_SIZE 131072 +#define SQUASHFS_FILE_LOG 17 + +#define SQUASHFS_FILE_MAX_SIZE 1048576 +#define SQUASHFS_FILE_MAX_LOG 20 + +/* Max number of uids and gids */ +#define SQUASHFS_IDS 65536 + +/* Max length of filename (not 255) */ +#define SQUASHFS_NAME_LEN 256 + +#define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_XATTR (0xffffffffU) +#define SQUASHFS_INVALID_BLK (-1LL) + +/* Filesystem flags */ +#define SQUASHFS_NOI 0 +#define SQUASHFS_NOD 1 +#define SQUASHFS_NOF 3 +#define SQUASHFS_NO_FRAG 4 +#define SQUASHFS_ALWAYS_FRAG 5 +#define SQUASHFS_DUPLICATE 6 +#define SQUASHFS_EXPORT 7 +#define SQUASHFS_COMP_OPT 10 + +#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) + +#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOI) + +#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOD) + +#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOF) + +#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NO_FRAG) + +#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_ALWAYS_FRAG) + +#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_DUPLICATE) + +#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_EXPORT) + +#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_COMP_OPT) + +/* Max number of types and file types */ +#define SQUASHFS_DIR_TYPE 1 +#define SQUASHFS_REG_TYPE 2 +#define SQUASHFS_SYMLINK_TYPE 3 +#define SQUASHFS_BLKDEV_TYPE 4 +#define SQUASHFS_CHRDEV_TYPE 5 +#define SQUASHFS_FIFO_TYPE 6 +#define SQUASHFS_SOCKET_TYPE 7 +#define SQUASHFS_LDIR_TYPE 8 +#define SQUASHFS_LREG_TYPE 9 +#define SQUASHFS_LSYMLINK_TYPE 10 +#define SQUASHFS_LBLKDEV_TYPE 11 +#define SQUASHFS_LCHRDEV_TYPE 12 +#define SQUASHFS_LFIFO_TYPE 13 +#define SQUASHFS_LSOCKET_TYPE 14 + +/* Xattr types */ +#define SQUASHFS_XATTR_USER 0 +#define SQUASHFS_XATTR_TRUSTED 1 +#define SQUASHFS_XATTR_SECURITY 2 +#define SQUASHFS_XATTR_VALUE_OOL 256 +#define SQUASHFS_XATTR_PREFIX_MASK 0xff + +/* Flag whether block is compressed or uncompressed, bit is set if block is + * uncompressed */ +#define SQUASHFS_COMPRESSED_BIT (1 << 15) + +#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? 
\ + (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT) + +#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT)) + +#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24) + +#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \ + ~SQUASHFS_COMPRESSED_BIT_BLOCK) + +#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) + +/* + * Inode number ops. Inodes consist of a compressed block number, and an + * uncompressed offset within that block + */ +#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ + << 16) + (B))) + +/* Translate between VFS mode and squashfs mode */ +#define SQUASHFS_MODE(A) ((A) & 0xfff) + +/* fragment and fragment table defines */ +#define SQUASHFS_FRAGMENT_BYTES(A) \ + ((A) * sizeof(struct squashfs_fragment_entry)) + +#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\ + sizeof(u64)) + +/* inode lookup table defines */ +#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64)) + +#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\ + sizeof(u64)) + +/* uid/gid lookup table defines */ +#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int)) + +#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ + sizeof(u64)) +/* xattr id lookup table defines */ +#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) + +#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\ + sizeof(u64)) +#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +/* cached data constants for filesystem */ +#define SQUASHFS_CACHED_BLKS 8 + +#define SQUASHFS_MAX_FILE_SIZE_LOG 64 + +#define SQUASHFS_MAX_FILE_SIZE (1LL << \ + (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) + +/* meta index cache */ +#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) +#define SQUASHFS_META_ENTRIES 127 +#define SQUASHFS_META_SLOTS 8 + +struct meta_entry { + u64 data_block; + unsigned int index_block; + unsigned short offset; + unsigned short pad; +}; + +struct meta_index { + unsigned int inode_number; + unsigned int offset; + unsigned short entries; + unsigned short 
skip; + unsigned short locked; + unsigned short pad; + struct meta_entry meta_entry[SQUASHFS_META_ENTRIES]; +}; + + +/* + * definitions for structures on disk + */ +#define ZLIB_COMPRESSION 1 +#define LZMA_COMPRESSION 2 +#define LZO_COMPRESSION 3 +#define XZ_COMPRESSION 4 + +struct squashfs_super_block { + __le32 s_magic; + __le32 inodes; + __le32 mkfs_time; + __le32 block_size; + __le32 fragments; + __le16 compression; + __le16 block_log; + __le16 flags; + __le16 no_ids; + __le16 s_major; + __le16 s_minor; + __le64 root_inode; + __le64 bytes_used; + __le64 id_table_start; + __le64 xattr_id_table_start; + __le64 inode_table_start; + __le64 directory_table_start; + __le64 fragment_table_start; + __le64 lookup_table_start; +}; + +struct squashfs_dir_index { + __le32 index; + __le32 start_block; + __le32 size; + unsigned char name[0]; +}; + +struct squashfs_base_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; +}; + +struct squashfs_ipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; +}; + +struct squashfs_lipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 xattr; +}; + +struct squashfs_dev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; +}; + +struct squashfs_ldev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; + __le32 xattr; +}; + +struct squashfs_symlink_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 symlink_size; + char symlink[0]; +}; + +struct squashfs_reg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 fragment; + __le32 offset; + __le32 file_size; + __le16 block_list[0]; +}; + +struct squashfs_lreg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le64 start_block; + __le64 file_size; + __le64 sparse; + __le32 nlink; + __le32 fragment; + __le32 offset; + __le32 xattr; + __le16 block_list[0]; +}; + +struct squashfs_dir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 nlink; + __le16 file_size; + __le16 offset; + __le32 parent_inode; +}; + +struct squashfs_ldir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 file_size; + __le32 start_block; + __le32 parent_inode; + __le16 i_count; + __le16 offset; + __le32 xattr; + struct squashfs_dir_index index[0]; +}; + +union squashfs_inode { + struct squashfs_base_inode base; + struct squashfs_dev_inode dev; + struct squashfs_ldev_inode ldev; + struct squashfs_symlink_inode symlink; + struct squashfs_reg_inode reg; + struct squashfs_lreg_inode lreg; + struct squashfs_dir_inode dir; + struct squashfs_ldir_inode ldir; + struct squashfs_ipc_inode ipc; + struct squashfs_lipc_inode lipc; +}; + +struct squashfs_dir_entry { + __le16 offset; + __le16 inode_number; + __le16 type; + __le16 size; + char name[0]; +}; + +struct squashfs_dir_header { + __le32 count; + __le32 start_block; + __le32 inode_number; +}; + 
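/*
 * A minimal worked example of the reference packing described in the
 * "Inode number ops" comment earlier in this header: a metadata
 * reference stores the start of the compressed metadata block in the
 * high bits and the uncompressed offset within that block in the low
 * 16 bits.  Using the macros defined above (the numeric values here
 * are purely illustrative):
 *
 *	long long ref = SQUASHFS_MKINODE(0x1234, 0x56);
 *	unsigned int blk = SQUASHFS_INODE_BLK(ref);	 // 0x1234
 *	unsigned int off = SQUASHFS_INODE_OFFSET(ref);	 // 0x56
 *
 * The xattr reference macros SQUASHFS_XATTR_BLK()/SQUASHFS_XATTR_OFFSET()
 * split a 64-bit xattr reference in the same way.
 */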
+struct squashfs_fragment_entry { + __le64 start_block; + __le32 size; + unsigned int unused; +}; + +struct squashfs_xattr_entry { + __le16 type; + __le16 size; + char data[0]; +}; + +struct squashfs_xattr_val { + __le32 vsize; + char value[0]; +}; + +struct squashfs_xattr_id { + __le64 xattr; + __le32 count; + __le32 size; +}; + +struct squashfs_xattr_id_table { + __le64 xattr_table_start; + __le32 xattr_ids; + __le32 unused; +}; + +#endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h new file mode 100644 index 00000000..73588e77 --- /dev/null +++ b/fs/squashfs/squashfs_fs_i.h @@ -0,0 +1,54 @@ +#ifndef SQUASHFS_FS_I +#define SQUASHFS_FS_I +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_i.h + */ + +struct squashfs_inode_info { + u64 start; + int offset; + u64 xattr; + unsigned int xattr_size; + int xattr_count; + union { + struct { + u64 fragment_block; + int fragment_size; + int fragment_offset; + u64 block_list_start; + }; + struct { + u64 dir_idx_start; + int dir_idx_offset; + int dir_idx_cnt; + int parent; + }; + }; + struct inode vfs_inode; +}; + + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} +#endif diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h new file mode 100644 index 00000000..651f0b31 --- /dev/null +++ b/fs/squashfs/squashfs_fs_sb.h @@ -0,0 +1,79 @@ +#ifndef SQUASHFS_FS_SB +#define SQUASHFS_FS_SB +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * squashfs_fs_sb.h + */ + +#include "squashfs_fs.h" + +struct squashfs_cache { + char *name; + int entries; + int next_blk; + int num_waiters; + int unused; + int block_size; + int pages; + spinlock_t lock; + wait_queue_head_t wait_queue; + struct squashfs_cache_entry *entry; +}; + +struct squashfs_cache_entry { + u64 block; + int length; + int refcount; + u64 next_index; + int pending; + int error; + int num_waiters; + wait_queue_head_t wait_queue; + struct squashfs_cache *cache; + void **data; +}; + +struct squashfs_sb_info { + const struct squashfs_decompressor *decompressor; + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + __le64 *xattr_id_table; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + void *stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + u64 xattr_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; + int xattr_ids; +}; +#endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c new file mode 100644 index 00000000..7438850c --- /dev/null +++ b/fs/squashfs/super.c @@ -0,0 +1,497 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * super.c + */ + +/* + * This file implements code to read the superblock, read and initialise + * in-memory structures at mount time, and all the VFS glue code to register + * the filesystem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" +#include "xattr.h" + +static struct file_system_type squashfs_fs_type; +static const struct super_operations squashfs_super_ops; + +static const struct squashfs_decompressor *supported_squashfs_filesystem(short + major, short minor, short id) +{ + const struct squashfs_decompressor *decompressor; + + if (major < SQUASHFS_MAJOR) { + ERROR("Major/Minor mismatch, older Squashfs %d.%d " + "filesystems are unsupported\n", major, minor); + return NULL; + } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { + ERROR("Major/Minor mismatch, trying to mount newer " + "%d.%d filesystem\n", major, minor); + ERROR("Please update your kernel\n"); + return NULL; + } + + decompressor = squashfs_lookup_decompressor(id); + if (!decompressor->supported) { + ERROR("Filesystem uses \"%s\" compression. 
This is not " + "supported\n", decompressor->name); + return NULL; + } + + return decompressor; +} + + +static int squashfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct squashfs_sb_info *msblk; + struct squashfs_super_block *sblk = NULL; + char b[BDEVNAME_SIZE]; + struct inode *root; + long long root_inode; + unsigned short flags; + unsigned int fragments; + u64 lookup_table_start, xattr_id_table_start, next_table; + int err; + + TRACE("Entered squashfs_fill_superblock\n"); + + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + ERROR("Failed to allocate squashfs_sb_info\n"); + return -ENOMEM; + } + msblk = sb->s_fs_info; + + msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize_log2 = ffz(~msblk->devblksize); + + mutex_init(&msblk->read_data_mutex); + mutex_init(&msblk->meta_index_mutex); + + /* + * msblk->bytes_used is checked in squashfs_read_table to ensure reads + * are not beyond filesystem end. But as we're using + * squashfs_read_table here to read the superblock (including the value + * of bytes_used) we need to set it to an initial sensible dummy value + */ + msblk->bytes_used = sizeof(*sblk); + sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); + + if (IS_ERR(sblk)) { + ERROR("unable to read squashfs_super_block\n"); + err = PTR_ERR(sblk); + sblk = NULL; + goto failed_mount; + } + + err = -EINVAL; + + /* Check it is a SQUASHFS superblock */ + sb->s_magic = le32_to_cpu(sblk->s_magic); + if (sb->s_magic != SQUASHFS_MAGIC) { + if (!silent) + ERROR("Can't find a SQUASHFS superblock on %s\n", + bdevname(sb->s_bdev, b)); + goto failed_mount; + } + + /* Check the MAJOR & MINOR versions and lookup compression type */ + msblk->decompressor = supported_squashfs_filesystem( + le16_to_cpu(sblk->s_major), + le16_to_cpu(sblk->s_minor), + le16_to_cpu(sblk->compression)); + if (msblk->decompressor == NULL) + goto failed_mount; + + /* Check the filesystem does not extend beyond the end of the + block device */ + msblk->bytes_used = le64_to_cpu(sblk->bytes_used); + if (msblk->bytes_used < 0 || msblk->bytes_used > + i_size_read(sb->s_bdev->bd_inode)) + goto failed_mount; + + /* Check block size for sanity */ + msblk->block_size = le32_to_cpu(sblk->block_size); + if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) + goto failed_mount; + + /* + * Check the system page size is not larger than the filesystem + * block size (by default 128K). This is currently not supported. + */ + if (PAGE_CACHE_SIZE > msblk->block_size) { + ERROR("Page size > filesystem block size (%d). This is " + "currently not supported!\n", msblk->block_size); + goto failed_mount; + } + + msblk->block_log = le16_to_cpu(sblk->block_log); + if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) + goto failed_mount; + + /* Check the root inode for sanity */ + root_inode = le64_to_cpu(sblk->root_inode); + if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) + goto failed_mount; + + msblk->inode_table = le64_to_cpu(sblk->inode_table_start); + msblk->directory_table = le64_to_cpu(sblk->directory_table_start); + msblk->inodes = le32_to_cpu(sblk->inodes); + flags = le16_to_cpu(sblk->flags); + + TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) + ? "un" : ""); + TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) + ? 
"un" : ""); + TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); + TRACE("Block size %d\n", msblk->block_size); + TRACE("Number of inodes %d\n", msblk->inodes); + TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); + TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); + TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); + TRACE("sblk->fragment_table_start %llx\n", + (u64) le64_to_cpu(sblk->fragment_table_start)); + TRACE("sblk->id_table_start %llx\n", + (u64) le64_to_cpu(sblk->id_table_start)); + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_flags |= MS_RDONLY; + sb->s_op = &squashfs_super_ops; + + err = -ENOMEM; + + msblk->block_cache = squashfs_cache_init("metadata", + SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); + if (msblk->block_cache == NULL) + goto failed_mount; + + /* Allocate read_page block */ + msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + if (msblk->read_page == NULL) { + ERROR("Failed to allocate read_page block\n"); + goto failed_mount; + } + + msblk->stream = squashfs_decompressor_init(sb, flags); + if (IS_ERR(msblk->stream)) { + err = PTR_ERR(msblk->stream); + msblk->stream = NULL; + goto failed_mount; + } + + /* Handle xattrs */ + sb->s_xattr = squashfs_xattr_handlers; + xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); + if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { + next_table = msblk->bytes_used; + goto allocate_id_index_table; + } + + /* Allocate and read xattr id lookup table */ + msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, + xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); + if (IS_ERR(msblk->xattr_id_table)) { + ERROR("unable to read xattr id index table\n"); + err = PTR_ERR(msblk->xattr_id_table); + msblk->xattr_id_table = NULL; + if (err != -ENOTSUPP) + goto failed_mount; + } + next_table = msblk->xattr_table; + +allocate_id_index_table: + /* Allocate and read id index table */ + msblk->id_table = squashfs_read_id_index_table(sb, + le64_to_cpu(sblk->id_table_start), next_table, + le16_to_cpu(sblk->no_ids)); + if (IS_ERR(msblk->id_table)) { + ERROR("unable to read id index table\n"); + err = PTR_ERR(msblk->id_table); + msblk->id_table = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->id_table[0]); + + /* Handle inode lookup table */ + lookup_table_start = le64_to_cpu(sblk->lookup_table_start); + if (lookup_table_start == SQUASHFS_INVALID_BLK) + goto handle_fragments; + + /* Allocate and read inode lookup table */ + msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, + lookup_table_start, next_table, msblk->inodes); + if (IS_ERR(msblk->inode_lookup_table)) { + ERROR("unable to read inode lookup table\n"); + err = PTR_ERR(msblk->inode_lookup_table); + msblk->inode_lookup_table = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->inode_lookup_table[0]); + + sb->s_export_op = &squashfs_export_ops; + +handle_fragments: + fragments = le32_to_cpu(sblk->fragments); + if (fragments == 0) + goto check_directory_table; + + msblk->fragment_cache = squashfs_cache_init("fragment", + SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); + if (msblk->fragment_cache == NULL) { + err = -ENOMEM; + goto failed_mount; + } + + /* Allocate and read fragment index table */ + msblk->fragment_index = squashfs_read_fragment_index_table(sb, + le64_to_cpu(sblk->fragment_table_start), next_table, fragments); + if (IS_ERR(msblk->fragment_index)) { + ERROR("unable to read fragment 
index table\n"); + err = PTR_ERR(msblk->fragment_index); + msblk->fragment_index = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->fragment_index[0]); + +check_directory_table: + /* Sanity check directory_table */ + if (msblk->directory_table >= next_table) { + err = -EINVAL; + goto failed_mount; + } + + /* Sanity check inode_table */ + if (msblk->inode_table >= msblk->directory_table) { + err = -EINVAL; + goto failed_mount; + } + + /* allocate root */ + root = new_inode(sb); + if (!root) { + err = -ENOMEM; + goto failed_mount; + } + + err = squashfs_read_inode(root, root_inode); + if (err) { + make_bad_inode(root); + iput(root); + goto failed_mount; + } + insert_inode_hash(root); + + sb->s_root = d_alloc_root(root); + if (sb->s_root == NULL) { + ERROR("Root inode create failed\n"); + err = -ENOMEM; + iput(root); + goto failed_mount; + } + + TRACE("Leaving squashfs_fill_super\n"); + kfree(sblk); + return 0; + +failed_mount: + squashfs_cache_delete(msblk->block_cache); + squashfs_cache_delete(msblk->fragment_cache); + squashfs_cache_delete(msblk->read_page); + squashfs_decompressor_free(msblk, msblk->stream); + kfree(msblk->inode_lookup_table); + kfree(msblk->fragment_index); + kfree(msblk->id_table); + kfree(msblk->xattr_id_table); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + kfree(sblk); + return err; +} + + +static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); + + TRACE("Entered squashfs_statfs\n"); + + buf->f_type = SQUASHFS_MAGIC; + buf->f_bsize = msblk->block_size; + buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; + buf->f_bfree = buf->f_bavail = 0; + buf->f_files = msblk->inodes; + buf->f_ffree = 0; + buf->f_namelen = SQUASHFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + + +static int squashfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + + +static void squashfs_put_super(struct super_block *sb) +{ + if (sb->s_fs_info) { + struct squashfs_sb_info *sbi = sb->s_fs_info; + squashfs_cache_delete(sbi->block_cache); + squashfs_cache_delete(sbi->fragment_cache); + squashfs_cache_delete(sbi->read_page); + squashfs_decompressor_free(sbi, sbi->stream); + kfree(sbi->id_table); + kfree(sbi->fragment_index); + kfree(sbi->meta_index); + kfree(sbi->inode_lookup_table); + kfree(sbi->xattr_id_table); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + } +} + + +static struct dentry *squashfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); +} + + +static struct kmem_cache *squashfs_inode_cachep; + + +static void init_once(void *foo) +{ + struct squashfs_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + + +static int __init init_inodecache(void) +{ + squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", + sizeof(struct squashfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + + return squashfs_inode_cachep ? 
0 : -ENOMEM; +} + + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(squashfs_inode_cachep); +} + + +static int __init init_squashfs_fs(void) +{ + int err = init_inodecache(); + + if (err) + return err; + + err = register_filesystem(&squashfs_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " + "Phillip Lougher\n"); + + return 0; +} + + +static void __exit exit_squashfs_fs(void) +{ + unregister_filesystem(&squashfs_fs_type); + destroy_inodecache(); +} + + +static struct inode *squashfs_alloc_inode(struct super_block *sb) +{ + struct squashfs_inode_info *ei = + kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + + return ei ? &ei->vfs_inode : NULL; +} + + +static void squashfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); +} + +static void squashfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, squashfs_i_callback); +} + + +static struct file_system_type squashfs_fs_type = { + .owner = THIS_MODULE, + .name = "squashfs", + .mount = squashfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV +}; + +static const struct super_operations squashfs_super_ops = { + .alloc_inode = squashfs_alloc_inode, + .destroy_inode = squashfs_destroy_inode, + .statfs = squashfs_statfs, + .put_super = squashfs_put_super, + .remount_fs = squashfs_remount +}; + +module_init(init_squashfs_fs); +module_exit(exit_squashfs_fs); +MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); +MODULE_AUTHOR("Phillip Lougher "); +MODULE_LICENSE("GPL"); diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c new file mode 100644 index 00000000..11918172 --- /dev/null +++ b/fs/squashfs/symlink.c @@ -0,0 +1,127 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * symlink.c + */ + +/* + * This file implements code to handle symbolic links. + * + * The data contents of symbolic links are stored inside the symbolic + * link inode within the inode table. This allows the normally small symbolic + * link to be compressed as part of the inode table, achieving much greater + * compression than if the symbolic link was compressed individually. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +static int squashfs_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + int index = page->index << PAGE_CACHE_SHIFT; + u64 block = squashfs_i(inode)->start; + int offset = squashfs_i(inode)->offset; + int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); + int bytes, copied; + void *pageaddr; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " + "%llx, offset %x\n", page->index, block, offset); + + /* + * Skip index bytes into symlink metadata. + */ + if (index) { + bytes = squashfs_read_metadata(sb, NULL, &block, &offset, + index); + if (bytes < 0) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + goto error_out; + } + } + + /* + * Read length bytes from symlink metadata. Squashfs_read_metadata + * is not used here because it can sleep and we want to use + * kmap_atomic to map the page. Instead call the underlying + * squashfs_cache_get routine. As length bytes may overlap metadata + * blocks, we may need to call squashfs_cache_get multiple times. + */ + for (bytes = 0; bytes < length; offset = 0, bytes += copied) { + entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); + if (entry->error) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + squashfs_cache_put(entry); + goto error_out; + } + + pageaddr = kmap_atomic(page, KM_USER0); + copied = squashfs_copy_data(pageaddr + bytes, entry, offset, + length - bytes); + if (copied == length - bytes) + memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); + else + block = entry->next_index; + kunmap_atomic(pageaddr, KM_USER0); + squashfs_cache_put(entry); + } + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +error_out: + SetPageError(page); + unlock_page(page); + return 0; +} + + +const struct address_space_operations squashfs_symlink_aops = { + .readpage = squashfs_symlink_readpage +}; + +const struct inode_operations squashfs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c new file mode 100644 index 00000000..92fcde7b --- /dev/null +++ b/fs/squashfs/xattr.c @@ -0,0 +1,324 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.c + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const struct xattr_handler *squashfs_xattr_handler(int); + +ssize_t squashfs_listxattr(struct dentry *d, char *buffer, + size_t buffer_size) +{ + struct inode *inode = d->d_inode; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + size_t rest = buffer_size; + int err; + + /* check that the file system has xattrs */ + if (msblk->xattr_id_table == NULL) + return -EOPNOTSUPP; + + /* loop reading each xattr name */ + while (count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + const struct xattr_handler *handler; + int name_size, prefix_size = 0; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); + if (handler) + prefix_size = handler->list(d, buffer, rest, NULL, + name_size, handler->flags); + if (prefix_size) { + if (buffer) { + if (prefix_size + name_size + 1 > rest) { + err = -ERANGE; + goto failed; + } + buffer += prefix_size; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, name_size); + if (err < 0) + goto failed; + if (buffer) { + buffer[name_size] = '\0'; + buffer += name_size + 1; + } + rest -= prefix_size + name_size + 1; + } else { + /* no handler or insuffficient privileges, so skip */ + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + } + + + /* skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = buffer_size - rest; + +failed: + return err; +} + + +static int squashfs_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t buffer_size) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + int name_len = strlen(name); + int err, vsize; + char *target = kmalloc(name_len, GFP_KERNEL); + + if (target == NULL) + return -ENOMEM; + + /* loop reading each xattr name */ + for (; count; count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + int type, prefix, name_size; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + type = le16_to_cpu(entry.type); + prefix = type & SQUASHFS_XATTR_PREFIX_MASK; + + if (prefix == name_index && name_size == name_len) + err = squashfs_read_metadata(sb, target, &start, + &offset, name_size); + else + err = squashfs_read_metadata(sb, NULL, &start, + &offset, 
name_size); + if (err < 0) + goto failed; + + if (prefix == name_index && name_size == name_len && + strncmp(target, name, name_size) == 0) { + /* found xattr */ + if (type & SQUASHFS_XATTR_VALUE_OOL) { + __le64 xattr_val; + u64 xattr; + /* val is a reference to the real location */ + err = squashfs_read_metadata(sb, &val, &start, + &offset, sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, &xattr_val, + &start, &offset, sizeof(xattr_val)); + if (err < 0) + goto failed; + xattr = le64_to_cpu(xattr_val); + start = SQUASHFS_XATTR_BLK(xattr) + + msblk->xattr_table; + offset = SQUASHFS_XATTR_OFFSET(xattr); + } + /* read xattr value */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + vsize = le32_to_cpu(val.vsize); + if (buffer) { + if (vsize > buffer_size) { + err = -ERANGE; + goto failed; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, vsize); + if (err < 0) + goto failed; + } + break; + } + + /* no match, skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = count ? vsize : -ENODATA; + +failed: + kfree(target); + return err; +} + + +/* + * User namespace support + */ +static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + if (list && XATTR_USER_PREFIX_LEN <= list_size) + memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + return XATTR_USER_PREFIX_LEN; +} + +static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, + size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = squashfs_user_list, + .get = squashfs_user_get +}; + +/* + * Trusted namespace support + */ +static size_t squashfs_trusted_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size) + memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return XATTR_TRUSTED_PREFIX_LEN; +} + +static int squashfs_trusted_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = squashfs_trusted_list, + .get = squashfs_trusted_get +}; + +/* + * Security namespace support + */ +static size_t squashfs_security_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (list && XATTR_SECURITY_PREFIX_LEN <= list_size) + memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); + return XATTR_SECURITY_PREFIX_LEN; +} + +static int squashfs_security_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_security_handler = { + .prefix = 
XATTR_SECURITY_PREFIX, + .list = squashfs_security_list, + .get = squashfs_security_get +}; + +static const struct xattr_handler *squashfs_xattr_handler(int type) +{ + if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) + /* ignore unrecognised type */ + return NULL; + + switch (type & SQUASHFS_XATTR_PREFIX_MASK) { + case SQUASHFS_XATTR_USER: + return &squashfs_xattr_user_handler; + case SQUASHFS_XATTR_TRUSTED: + return &squashfs_xattr_trusted_handler; + case SQUASHFS_XATTR_SECURITY: + return &squashfs_xattr_security_handler; + default: + /* ignore unrecognised type */ + return NULL; + } +} + +const struct xattr_handler *squashfs_xattr_handlers[] = { + &squashfs_xattr_user_handler, + &squashfs_xattr_trusted_handler, + &squashfs_xattr_security_handler, + NULL +}; + diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h new file mode 100644 index 00000000..c83f5d9e --- /dev/null +++ b/fs/squashfs/xattr.h @@ -0,0 +1,47 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.h + */ + +#ifdef CONFIG_SQUASHFS_XATTR +extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, + u64 *, int *); +extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, + unsigned int *, unsigned long long *); +#else +static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, + u64 start, u64 *xattr_table_start, int *xattr_ids) +{ + ERROR("Xattrs in filesystem, these will be ignored\n"); + *xattr_table_start = start; + return ERR_PTR(-ENOTSUPP); +} + +static inline int squashfs_xattr_lookup(struct super_block *sb, + unsigned int index, int *count, unsigned int *size, + unsigned long long *xattr) +{ + return 0; +} +#define squashfs_listxattr NULL +#define generic_getxattr NULL +#define squashfs_xattr_handlers NULL +#endif diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c new file mode 100644 index 00000000..c89607d6 --- /dev/null +++ b/fs/squashfs/xattr_id.c @@ -0,0 +1,95 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr_id.c + */ + +/* + * This file implements code to map the 32-bit xattr id stored in the inode + * into the on disk location of the xattr data. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Map xattr id using the xattr id look up table + */ +int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, + int *count, unsigned int *size, unsigned long long *xattr) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_XATTR_BLOCK(index); + int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + struct squashfs_xattr_id id; + int err; + + err = squashfs_read_metadata(sb, &id, &start_block, &offset, + sizeof(id)); + if (err < 0) + return err; + + *xattr = le64_to_cpu(id.xattr); + *size = le32_to_cpu(id.size); + *count = le32_to_cpu(id.count); + return 0; +} + + +/* + * Read uncompressed xattr id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, + u64 *xattr_table_start, int *xattr_ids) +{ + unsigned int len; + struct squashfs_xattr_id_table *id_table; + + id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + if (IS_ERR(id_table)) + return (__le64 *) id_table; + + *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); + *xattr_ids = le32_to_cpu(id_table->xattr_ids); + kfree(id_table); + + /* Sanity check values */ + + /* there is always at least one xattr id */ + if (*xattr_ids == 0) + return ERR_PTR(-EINVAL); + + /* xattr_table should be less than start */ + if (*xattr_table_start >= start) + return ERR_PTR(-EINVAL); + + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + + TRACE("In read_xattr_index_table, length %d\n", len); + + return squashfs_read_table(sb, start + sizeof(*id_table), len); +} diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c new file mode 100644 index 00000000..1760b7d1 --- /dev/null +++ b/fs/squashfs/xz_wrapper.c @@ -0,0 +1,180 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * xz_wrapper.c + */ + + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_xz { + struct xz_dec *state; + struct xz_buf buf; +}; + +struct comp_opts { + __le32 dictionary_size; + __le32 flags; +}; + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, + int len) +{ + struct comp_opts *comp_opts = buff; + struct squashfs_xz *stream; + int dict_size = msblk->block_size; + int err, n; + + if (comp_opts) { + /* check compressor options are the expected length */ + if (len < sizeof(*comp_opts)) { + err = -EIO; + goto failed; + } + + dict_size = le32_to_cpu(comp_opts->dictionary_size); + + /* the dictionary size should be 2^n or 2^n+2^(n+1) */ + n = ffs(dict_size) - 1; + if (dict_size != (1 << n) && dict_size != (1 << n) + + (1 << (n + 1))) { + err = -EIO; + goto failed; + } + } + + dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE); + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) { + err = -ENOMEM; + goto failed; + } + + stream->state = xz_dec_init(XZ_PREALLOC, dict_size); + if (stream->state == NULL) { + kfree(stream); + err = -ENOMEM; + goto failed; + } + + return stream; + +failed: + ERROR("Failed to initialise xz decompressor\n"); + return ERR_PTR(err); +} + + +static void squashfs_xz_free(void *strm) +{ + struct squashfs_xz *stream = strm; + + if (stream) { + xz_dec_end(stream->state); + kfree(stream); + } +} + + +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + enum xz_ret xz_err; + int avail, total = 0, k = 0, page = 0; + struct squashfs_xz *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + xz_dec_reset(stream->state); + stream->buf.in_pos = 0; + stream->buf.in_size = 0; + stream->buf.out_pos = 0; + stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out = buffer[page++]; + + do { + if (stream->buf.in_pos == stream->buf.in_size && k < b) { + avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + stream->buf.in = bh[k]->b_data + offset; + stream->buf.in_size = avail; + stream->buf.in_pos = 0; + offset = 0; + } + + if (stream->buf.out_pos == stream->buf.out_size + && page < pages) { + stream->buf.out = buffer[page++]; + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } + + xz_err = xz_dec_run(stream->state, &stream->buf); + + if (stream->buf.in_pos == stream->buf.in_size && k < b) + put_bh(bh[k++]); + } while (xz_err == XZ_OK); + + if (xz_err != XZ_STREAM_END) { + ERROR("xz_dec_run error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("xz_uncompress error, input remaining\n"); + goto release_mutex; + } + + total += stream->buf.out_pos; + mutex_unlock(&msblk->read_data_mutex); + return total; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_xz_comp_ops = { + .init = squashfs_xz_init, + .free = squashfs_xz_free, + .decompress = squashfs_xz_uncompress, + .id = XZ_COMPRESSION, + .name = "xz", + .supported = 1 +}; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c new file mode 100644 index 00000000..55d918fd --- /dev/null +++ b/fs/squashfs/zlib_wrapper.c @@ -0,0 +1,149 @@ +/* + * Squashfs - a compressed read 
only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * zlib_wrapper.c + */ + + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len) +{ + z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->workspace = vmalloc(zlib_inflate_workspacesize()); + if (stream->workspace == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate zlib workspace\n"); + kfree(stream); + return ERR_PTR(-ENOMEM); +} + + +static void zlib_free(void *strm) +{ + z_stream *stream = strm; + + if (stream) + vfree(stream->workspace); + kfree(stream); +} + + +static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + int zlib_err, zlib_init = 0; + int k = 0, page = 0; + z_stream *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + stream->avail_out = 0; + stream->avail_in = 0; + + do { + if (stream->avail_in == 0 && k < b) { + int avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + stream->next_in = bh[k]->b_data + offset; + stream->avail_in = avail; + offset = 0; + } + + if (stream->avail_out == 0 && page < pages) { + stream->next_out = buffer[page++]; + stream->avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned unexpected " + "result 0x%x, srclength %d\n", + zlib_err, srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); + + if (stream->avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("zlib_uncompress error, data remaining\n"); + goto release_mutex; + } + + length = stream->total_out; + mutex_unlock(&msblk->read_data_mutex); + return length; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zlib_comp_ops = { + .init = zlib_init, + .free = zlib_free, + .decompress = zlib_uncompress, + .id = ZLIB_COMPRESSION, + .name = "zlib", + .supported = 1 +}; + diff --git a/fs/stack.c b/fs/stack.c new file mode 100644 
index 00000000..4a6f7f44 --- /dev/null +++ b/fs/stack.c @@ -0,0 +1,79 @@ +#include +#include +#include + +/* does _NOT_ require i_mutex to be held. + * + * This function cannot be inlined since i_size_{read,write} is rather + * heavy-weight on 32-bit systems + */ +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) +{ + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). tmpfs uses other locking, and + * its 32-bit is (just) able to exceed 2TB i_size with the aid of + * holes; but its i_blocks cannot carry into the upper long without + * almost 2TB swap - let's ignore that case. + */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. 
+ */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); +} +EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); + +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) +{ + dest->i_mode = src->i_mode; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; + dest->i_nlink = src->i_nlink; +} +EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/stat.c b/fs/stat.c new file mode 100644 index 00000000..02a60614 --- /dev/null +++ b/fs/stat.c @@ -0,0 +1,473 @@ +/* + * linux/fs/stat.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void generic_fillattr(struct inode *inode, struct kstat *stat) +{ + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + stat->blksize = (1 << inode->i_blkbits); +} + +EXPORT_SYMBOL(generic_fillattr); + +int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + int retval; + + retval = security_inode_getattr(mnt, dentry); + if (retval) + return retval; + + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + + generic_fillattr(inode, stat); + return 0; +} + +EXPORT_SYMBOL(vfs_getattr); + +int vfs_fstat(unsigned int fd, struct kstat *stat) +{ + struct file *f = fget(fd); + int error = -EBADF; + + if (f) { + error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); + fput(f); + } + return error; +} +EXPORT_SYMBOL(vfs_fstat); + +int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, + int flag) +{ + struct path path; + int error = -EINVAL; + int lookup_flags = 0; + + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) + goto out; + + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_NO_AUTOMOUNT) + lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); +out: + return error; +} +EXPORT_SYMBOL(vfs_fstatat); + +int vfs_stat(const char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, 0); +} +EXPORT_SYMBOL(vfs_stat); + +int vfs_lstat(const char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); +} +EXPORT_SYMBOL(vfs_lstat); + + +#ifdef __ARCH_WANT_OLD_STAT + +/* + * For backward compatibility? Maybe this should be moved + * into arch/i386 instead? 
+ */ +static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf) +{ + static int warncount = 5; + struct __old_kernel_stat tmp; + + if (warncount > 0) { + warncount--; + printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", + current->comm); + } else if (warncount < 0) { + /* it's laughable, but... */ + warncount = 0; + } + + memset(&tmp, 0, sizeof(struct __old_kernel_stat)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); + tmp.st_rdev = old_encode_dev(stat->rdev); +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) + return -EOVERFLOW; +#endif + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_ctime = stat->ctime.tv_sec; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +SYSCALL_DEFINE2(stat, const char __user *, filename, + struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + + return cp_old_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(lstat, const char __user *, filename, + struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + + return cp_old_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_old_stat(&stat, statbuf); + + return error; +} + +#endif /* __ARCH_WANT_OLD_STAT */ + +static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) +{ + struct stat tmp; + +#if BITS_PER_LONG == 32 + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; +#else + if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) + return -EOVERFLOW; +#endif + + memset(&tmp, 0, sizeof(tmp)); +#if BITS_PER_LONG == 32 + tmp.st_dev = old_encode_dev(stat->dev); +#else + tmp.st_dev = new_encode_dev(stat->dev); +#endif + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); +#if BITS_PER_LONG == 32 + tmp.st_rdev = old_encode_dev(stat->rdev); +#else + tmp.st_rdev = new_encode_dev(stat->rdev); +#endif +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) + return -EOVERFLOW; +#endif + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_ctime = stat->ctime.tv_sec; +#ifdef STAT_HAVE_NSEC + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; +#endif + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (error) + return error; + return cp_new_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + + return cp_new_stat(&stat, statbuf); +} + +#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) +SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, + struct stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); +} +#endif + +SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_new_stat(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, + char __user *, buf, int, bufsiz) +{ + struct path path; + int error; + int empty = 0; + + if (bufsiz <= 0) + return -EINVAL; + + error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); + if (!error) { + struct inode *inode = path.dentry->d_inode; + + error = empty ? -ENOENT : -EINVAL; + if (inode->i_op->readlink) { + error = security_inode_readlink(path.dentry); + if (!error) { + touch_atime(path.mnt, path.dentry); + error = inode->i_op->readlink(path.dentry, + buf, bufsiz); + } + } + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, + int, bufsiz) +{ + return sys_readlinkat(AT_FDCWD, path, buf, bufsiz); +} + + +/* ---------- LFS-64 ----------- */ +#ifdef __ARCH_WANT_STAT64 + +static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) +{ + struct stat64 tmp; + + memset(&tmp, 0, sizeof(struct stat64)); +#ifdef CONFIG_MIPS + /* mips has weird padding, so we don't get 64 bits there */ + if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) + return -EOVERFLOW; + tmp.st_dev = new_encode_dev(stat->dev); + tmp.st_rdev = new_encode_dev(stat->rdev); +#else + tmp.st_dev = huge_encode_dev(stat->dev); + tmp.st_rdev = huge_encode_dev(stat->rdev); +#endif + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; +#ifdef STAT64_HAS_BROKEN_ST_INO + tmp.__st_ino = stat->ino; +#endif + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + tmp.st_uid = stat->uid; + tmp.st_gid = stat->gid; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_size = stat->size; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE2(stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE2(lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_lstat(filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); +} +#endif /* __ARCH_WANT_STAT64 */ + +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks += bytes >> 9; + bytes &= 511; + inode->i_bytes += bytes; + if (inode->i_bytes >= 512) { + inode->i_blocks++; + inode->i_bytes -= 512; + } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); + spin_unlock(&inode->i_lock); +} + +EXPORT_SYMBOL(inode_add_bytes); + +void inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + inode->i_blocks -= bytes >> 9; + bytes &= 511; + if (inode->i_bytes < bytes) { + inode->i_blocks--; + inode->i_bytes += 512; + } + inode->i_bytes -= bytes; + spin_unlock(&inode->i_lock); +} + +EXPORT_SYMBOL(inode_sub_bytes); + +loff_t inode_get_bytes(struct inode *inode) +{ + loff_t ret; + + spin_lock(&inode->i_lock); + ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + spin_unlock(&inode->i_lock); + return ret; +} + +EXPORT_SYMBOL(inode_get_bytes); + +void inode_set_bytes(struct inode *inode, loff_t bytes) +{ + /* Caller is here responsible for sufficient locking + * (ie. 
inode->i_lock) */ + inode->i_blocks = bytes >> 9; + inode->i_bytes = bytes & 511; +} + +EXPORT_SYMBOL(inode_set_bytes); diff --git a/fs/statfs.c b/fs/statfs.c new file mode 100644 index 00000000..9cf04a11 --- /dev/null +++ b/fs/statfs.c @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int flags_by_mnt(int mnt_flags) +{ + int flags = 0; + + if (mnt_flags & MNT_READONLY) + flags |= ST_RDONLY; + if (mnt_flags & MNT_NOSUID) + flags |= ST_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= ST_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= ST_NOEXEC; + if (mnt_flags & MNT_NOATIME) + flags |= ST_NOATIME; + if (mnt_flags & MNT_NODIRATIME) + flags |= ST_NODIRATIME; + if (mnt_flags & MNT_RELATIME) + flags |= ST_RELATIME; + return flags; +} + +static int flags_by_sb(int s_flags) +{ + int flags = 0; + if (s_flags & MS_SYNCHRONOUS) + flags |= ST_SYNCHRONOUS; + if (s_flags & MS_MANDLOCK) + flags |= ST_MANDLOCK; + return flags; +} + +static int calculate_f_flags(struct vfsmount *mnt) +{ + return ST_VALID | flags_by_mnt(mnt->mnt_flags) | + flags_by_sb(mnt->mnt_sb->s_flags); +} + +int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) +{ + int retval; + + if (!dentry->d_sb->s_op->statfs) + return -ENOSYS; + + memset(buf, 0, sizeof(*buf)); + retval = security_sb_statfs(dentry); + if (retval) + return retval; + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; + return retval; +} + +int vfs_statfs(struct path *path, struct kstatfs *buf) +{ + int error; + + error = statfs_by_dentry(path->dentry, buf); + if (!error) + buf->f_flags = calculate_f_flags(path->mnt); + return error; +} +EXPORT_SYMBOL(vfs_statfs); + +int user_statfs(const char __user *pathname, struct kstatfs *st) +{ + struct path path; + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} + +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} + +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); + else { + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & + 0xffffffff00000000ULL) + return -EOVERFLOW; + /* + * f_files and f_ffree may be -1; it's okay to stuff + * that into 32 bits + */ + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); + } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; + return 0; +} + +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) +{ + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); + else { + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + 
buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); + } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) +{ + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); + return error; +} + +SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) +{ + struct kstatfs st; + int error; + if (sz != sizeof(*buf)) + return -EINVAL; + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); + return error; +} + +SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) +{ + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); + return error; +} + +SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) +{ + struct kstatfs st; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); + return error; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; + int err; + + s = user_get_super(new_decode_dev(dev)); + if (!s) + return -EINVAL; + + err = statfs_by_dentry(s->s_root, &sbuf); + drop_super(s); + if (err) + return err; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; +} diff --git a/fs/super.c b/fs/super.c new file mode 100644 index 00000000..caf4dfa2 --- /dev/null +++ b/fs/super.c @@ -0,0 +1,1061 @@ +/* + * linux/fs/super.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * super.c contains code to handle: - mount structures + * - super-block tables + * - filesystem drivers list + * - mount system call + * - umount system call + * - ustat system call + * + * GK 2/5/95 - Changed to support mounting the root fs via NFS + * + * Added kerneld support: Jacques Gelinas and Bjorn Ekwall + * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Added options to /proc/mounts: + * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. + * Added devfs support: Richard Gooch , 13-JAN-1998 + * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 + */ + +#include +#include +#include +#include +#include +#include +#include /* for the emergency remount stuff */ +#include +#include +#include +#include +#include +#include "internal.h" + + +LIST_HEAD(super_blocks); +DEFINE_SPINLOCK(sb_lock); + +/** + * alloc_super - create new superblock + * @type: filesystem type superblock should belong to + * + * Allocates and initializes a new &struct super_block. alloc_super() + * returns a pointer new superblock or %NULL if allocation had failed. 
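+ *
+ * The new superblock is returned with s_umount held for write and a
+ * single active reference; sget() below, its only caller here, either
+ * publishes it on the super_blocks list or frees it again with
+ * destroy_super().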
+ */
+static struct super_block *alloc_super(struct file_system_type *type)
+{
+ struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
+ static const struct super_operations default_op;
+
+ if (s) {
+ if (security_sb_alloc(s)) {
+ kfree(s);
+ s = NULL;
+ goto out;
+ }
+#ifdef CONFIG_SMP
+ s->s_files = alloc_percpu(struct list_head);
+ if (!s->s_files) {
+ security_sb_free(s);
+ kfree(s);
+ s = NULL;
+ goto out;
+ } else {
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+ }
+#else
+ INIT_LIST_HEAD(&s->s_files);
+#endif
+ s->s_bdi = &default_backing_dev_info;
+ INIT_LIST_HEAD(&s->s_instances);
+ INIT_HLIST_BL_HEAD(&s->s_anon);
+ INIT_LIST_HEAD(&s->s_inodes);
+ INIT_LIST_HEAD(&s->s_dentry_lru);
+ init_rwsem(&s->s_umount);
+ mutex_init(&s->s_lock);
+ lockdep_set_class(&s->s_umount, &type->s_umount_key);
+ /*
+ * The locking rules for s_lock are up to the
+ * filesystem. For example ext3fs has different
+ * lock ordering than usbfs:
+ */
+ lockdep_set_class(&s->s_lock, &type->s_lock_key);
+ /*
+ * sget() can have s_umount recursion.
+ *
+ * When it cannot find a suitable sb, it allocates a new
+ * one (this one), and tries again to find a suitable old
+ * one.
+ *
+ * In case that succeeds, it will acquire the s_umount
+ * lock of the old one. Since these are clearly distinct
+ * locks, and this object isn't exposed yet, there's no
+ * risk of deadlocks.
+ *
+ * Annotate this by putting this lock in a different
+ * subclass.
+ */
+ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+ s->s_count = 1;
+ atomic_set(&s->s_active, 1);
+ mutex_init(&s->s_vfs_rename_mutex);
+ lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+ mutex_init(&s->s_dquot.dqio_mutex);
+ mutex_init(&s->s_dquot.dqonoff_mutex);
+ init_rwsem(&s->s_dquot.dqptr_sem);
+ init_waitqueue_head(&s->s_wait_unfrozen);
+ s->s_maxbytes = MAX_NON_LFS;
+ s->s_op = &default_op;
+ s->s_time_gran = 1000000000;
+ s->cleancache_poolid = -1;
+ }
+out:
+ return s;
+}
+
+/**
+ * destroy_super - frees a superblock
+ * @s: superblock to free
+ *
+ * Frees a superblock.
+ */
+static inline void destroy_super(struct super_block *s)
+{
+#ifdef CONFIG_SMP
+ free_percpu(s->s_files);
+#endif
+ security_sb_free(s);
+ kfree(s->s_subtype);
+ kfree(s->s_options);
+ kfree(s);
+}
+
+/* Superblock refcounting */
+
+/*
+ * Drop a superblock's refcount. The caller must hold sb_lock.
+ */
+void __put_super(struct super_block *sb)
+{
+ if (!--sb->s_count) {
+ list_del_init(&sb->s_list);
+ destroy_super(sb);
+ }
+}
+
+/**
+ * put_super - drop a temporary reference to superblock
+ * @sb: superblock in question
+ *
+ * Drops a temporary reference, frees superblock if there are no
+ * references left.
+ */
+void put_super(struct super_block *sb)
+{
+ spin_lock(&sb_lock);
+ __put_super(sb);
+ spin_unlock(&sb_lock);
+}
+
+
+/**
+ * deactivate_locked_super - drop an active reference to superblock
+ * @s: superblock to deactivate
+ *
+ * Drops an active reference to superblock, converting it into a temporary
+ * one if there are no other active references left. In that case we
+ * tell fs driver to shut it down and drop the temporary reference we
+ * had just acquired.
+ *
+ * Caller holds exclusive lock on superblock; that lock is released.
+ */ +void deactivate_locked_super(struct super_block *s) +{ + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_test(&s->s_active)) { + cleancache_flush_fs(s); + fs->kill_sb(s); + /* + * We need to call rcu_barrier so all the delayed rcu free + * inodes are flushed before we release the fs module. + */ + rcu_barrier(); + put_filesystem(fs); + put_super(s); + } else { + up_write(&s->s_umount); + } +} + +EXPORT_SYMBOL(deactivate_locked_super); + +/** + * deactivate_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Variant of deactivate_locked_super(), except that superblock is *not* + * locked by caller. If we are going to drop the final active reference, + * lock will be acquired prior to that. + */ +void deactivate_super(struct super_block *s) +{ + if (!atomic_add_unless(&s->s_active, -1, 1)) { + down_write(&s->s_umount); + deactivate_locked_super(s); + } +} + +EXPORT_SYMBOL(deactivate_super); + +/** + * grab_super - acquire an active reference + * @s: reference we are trying to make active + * + * Tries to acquire an active reference. grab_super() is used when we + * had just found a superblock in super_blocks or fs_type->fs_supers + * and want to turn it into a full-blown active reference. grab_super() + * is called with sb_lock held and drops it. Returns 1 in case of + * success, 0 if we had failed (superblock contents was already dead or + * dying when grab_super() had been called). + */ +static int grab_super(struct super_block *s) __releases(sb_lock) +{ + if (atomic_inc_not_zero(&s->s_active)) { + spin_unlock(&sb_lock); + return 1; + } + /* it's going away */ + s->s_count++; + spin_unlock(&sb_lock); + /* wait for it to die */ + down_write(&s->s_umount); + up_write(&s->s_umount); + put_super(s); + return 0; +} + +/* + * Superblock locking. We really ought to get rid of these two. + */ +void lock_super(struct super_block * sb) +{ + get_fs_excl(); + mutex_lock(&sb->s_lock); +} + +void unlock_super(struct super_block * sb) +{ + put_fs_excl(); + mutex_unlock(&sb->s_lock); +} + +EXPORT_SYMBOL(lock_super); +EXPORT_SYMBOL(unlock_super); + +/** + * generic_shutdown_super - common helper for ->kill_sb() + * @sb: superblock to kill + * + * generic_shutdown_super() does all fs-independent work on superblock + * shutdown. Typical ->kill_sb() should pick all fs-specific objects + * that need destruction out of superblock, call generic_shutdown_super() + * and release aforementioned objects. Note: dentries and inodes _are_ + * taken care of and do not need specific handling. + * + * Upon calling this function, the filesystem may no longer alter or + * rearrange the set of dentries belonging to this super_block, nor may it + * change the attachments of dentries to inodes. + */ +void generic_shutdown_super(struct super_block *sb) +{ + const struct super_operations *sop = sb->s_op; + + + if (sb->s_root) { + shrink_dcache_for_umount(sb); + sync_filesystem(sb); + get_fs_excl(); + sb->s_flags &= ~MS_ACTIVE; + + fsnotify_unmount_inodes(&sb->s_inodes); + + evict_inodes(sb); + + if (sop->put_super) + sop->put_super(sb); + + if (!list_empty(&sb->s_inodes)) { + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. 
Have a nice day...\n", + sb->s_id); + } + put_fs_excl(); + } + spin_lock(&sb_lock); + /* should be initialized for __put_super_and_need_restart() */ + list_del_init(&sb->s_instances); + spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +EXPORT_SYMBOL(generic_shutdown_super); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + void *data) +{ + struct super_block *s = NULL; + struct super_block *old; + int err; + +retry: + spin_lock(&sb_lock); + if (test) { + list_for_each_entry(old, &type->fs_supers, s_instances) { + if (!test(old, data)) + continue; + if (!grab_super(old)) + goto retry; + if (s) { + up_write(&s->s_umount); + destroy_super(s); + s = NULL; + } + down_write(&old->s_umount); + if (unlikely(!(old->s_flags & MS_BORN))) { + deactivate_locked_super(old); + goto retry; + } + return old; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(type); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + err = set(s, data); + if (err) { + spin_unlock(&sb_lock); + up_write(&s->s_umount); + destroy_super(s); + return ERR_PTR(err); + } + s->s_type = type; + strlcpy(s->s_id, type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + list_add(&s->s_instances, &type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(type); + return s; +} + +EXPORT_SYMBOL(sget); + +void drop_super(struct super_block *sb) +{ + up_read(&sb->s_umount); + put_super(sb); +} + +EXPORT_SYMBOL(drop_super); + +/** + * sync_supers - helper for periodic superblock writeback + * + * Call the write_super method if present on all dirty superblocks in + * the system. This is for the periodic writeback used by most older + * filesystems. For data integrity superblock writeback use + * sync_filesystems() instead. + * + * Note: check the dirty flag before waiting, so we don't + * hold up the sync while mounting a device. (The newly + * mounted device won't need syncing.) + */ +void sync_supers(void) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_op->write_super && sb->s_dirt) { + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root && sb->s_dirt) + sb->s_op->write_super(sb); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. 
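+ *
+ * For example, sync_filesystems() in fs/sync.c below uses this to write
+ * back every writable superblock:
+ *
+ *	iterate_supers(sync_one_sb, &wait);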
+ */ +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ + +struct super_block *get_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + + spin_lock(&sb_lock); +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_bdev == bdev) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + /* still alive? */ + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* nope, got unmounted */ + spin_lock(&sb_lock); + __put_super(sb); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +EXPORT_SYMBOL(get_super); + +/** + * get_active_super - get an active reference to the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. Returns the superblock with an active + * reference or %NULL if none was found. + */ +struct super_block *get_active_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + +restart: + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_bdev == bdev) { + if (grab_super(sb)) /* drops sb_lock */ + return sb; + else + goto restart; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +struct super_block *user_get_super(dev_t dev) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_dev == dev) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + /* still alive? */ + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* nope, got unmounted */ + spin_lock(&sb_lock); + __put_super(sb); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +/** + * do_remount_sb - asks filesystem to change mount options. + * @sb: superblock in question + * @flags: numeric part of options + * @data: the rest of options + * @force: whether or not to force the change + * + * Alters the mount options of a mounted file system. 
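+ *
+ * For example, the emergency remount path below forces each writable,
+ * block-backed filesystem read-only with:
+ *
+ *	do_remount_sb(sb, MS_RDONLY, NULL, 1);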
+ */ +int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +{ + int retval; + int remount_ro; + + if (sb->s_frozen != SB_UNFROZEN) + return -EBUSY; + +#ifdef CONFIG_BLOCK + if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; +#endif + + if (flags & MS_RDONLY) + acct_auto_close(sb); + shrink_dcache_sb(sb); + sync_filesystem(sb); + + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + + /* If we are remounting RDONLY and current sb is read/write, + make sure there are no rw files opened */ + if (remount_ro) { + if (force) + mark_files_ro(sb); + else if (!fs_may_remount_ro(sb)) + return -EBUSY; + } + + if (sb->s_op->remount_fs) { + retval = sb->s_op->remount_fs(sb, &flags, data); + if (retval) + return retval; + } + sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + + /* + * Some filesystems modify their metadata via some other path than the + * bdev buffer cache (eg. use a private mapping, or directories in + * pagecache, etc). Also file data modifications go via their own + * mappings. So If we try to mount readonly then copy the filesystem + * from bdev, we could get stale data, so invalidate it to give a best + * effort at coherency. + */ + if (remount_ro && sb->s_bdev) + invalidate_bdev(sb->s_bdev); + return 0; +} + +static void do_emergency_remount(struct work_struct *work) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_write(&sb->s_umount); + if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { + /* + * What lock protects sb->s_flags?? + */ + do_remount_sb(sb, MS_RDONLY, NULL, 1); + } + up_write(&sb->s_umount); + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); + kfree(work); + printk("Emergency Remount complete\n"); +} + +void emergency_remount(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_emergency_remount); + schedule_work(work); + } +} + +/* + * Unnamed block devices are dummy devices used by virtual + * filesystems which don't use real block-devices. -- jrs + */ + +static DEFINE_IDA(unnamed_dev_ida); +static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ +static int unnamed_dev_start = 0; /* don't bother trying below it */ + +int set_anon_super(struct super_block *s, void *data) +{ + int dev; + int error; + + retry: + if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0) + return -ENOMEM; + spin_lock(&unnamed_dev_lock); + error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev); + if (!error) + unnamed_dev_start = dev + 1; + spin_unlock(&unnamed_dev_lock); + if (error == -EAGAIN) + /* We raced and lost with another CPU. 
*/ + goto retry; + else if (error) + return -EAGAIN; + + if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, dev); + if (unnamed_dev_start > dev) + unnamed_dev_start = dev; + spin_unlock(&unnamed_dev_lock); + return -EMFILE; + } + s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_bdi = &noop_backing_dev_info; + return 0; +} + +EXPORT_SYMBOL(set_anon_super); + +void kill_anon_super(struct super_block *sb) +{ + int slot = MINOR(sb->s_dev); + + generic_shutdown_super(sb); + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, slot); + if (slot < unnamed_dev_start) + unnamed_dev_start = slot; + spin_unlock(&unnamed_dev_lock); +} + +EXPORT_SYMBOL(kill_anon_super); + +void kill_litter_super(struct super_block *sb) +{ + if (sb->s_root) + d_genocide(sb->s_root); + kill_anon_super(sb); +} + +EXPORT_SYMBOL(kill_litter_super); + +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +struct dentry *mount_ns(struct file_system_type *fs_type, int flags, + void *data, int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +EXPORT_SYMBOL(mount_ns); + +#ifdef CONFIG_BLOCK +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; +} + +static int test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +struct dentry *mount_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_bdev; + } + + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
+ */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags | MS_NOSEC; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + goto error; + } + + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + return dget(s->s_root); + +error_s: + error = PTR_ERR(s); +error_bdev: + blkdev_put(bdev, mode); +error: + return ERR_PTR(error); +} +EXPORT_SYMBOL(mount_bdev); + +void kill_block_super(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + fmode_t mode = sb->s_mode; + + bdev->bd_super = NULL; + generic_shutdown_super(sb); + sync_blockdev(bdev); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); +} + +EXPORT_SYMBOL(kill_block_super); +#endif + +struct dentry *mount_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + int error; + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + s->s_flags = flags; + + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_nodev); + +static int compare_single(struct super_block *s, void *p) +{ + return 1; +} + +struct dentry *mount_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *s; + int error; + + s = sget(fs_type, compare_single, set_anon_super, NULL); + if (IS_ERR(s)) + return ERR_CAST(s); + if (!s->s_root) { + s->s_flags = flags; + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); + } + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_single); + +struct dentry * +mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct dentry *root; + struct super_block *sb; + char *secdata = NULL; + int error = -ENOMEM; + + if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { + secdata = alloc_secdata(); + if (!secdata) + goto out; + + error = security_sb_copy_data(data, secdata); + if (error) + goto out_free_secdata; + } + + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; + } + sb = root->d_sb; + BUG_ON(!sb); + WARN_ON(!sb->s_bdi); + WARN_ON(sb->s_bdi == &default_backing_dev_info); + sb->s_flags |= MS_BORN; + + error = security_sb_kern_mount(sb, flags, secdata); + if (error) + goto out_sb; + + /* + * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE + * but s_maxbytes was an unsigned long long for many releases. Throw + * this warning for a little while to try and catch filesystems that + * violate this rule. 
+ */ + WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, sb->s_maxbytes); + + up_write(&sb->s_umount); + free_secdata(secdata); + return root; +out_sb: + dput(root); + deactivate_locked_super(sb); +out_free_secdata: + free_secdata(secdata); +out: + return ERR_PTR(error); +} + +/** + * freeze_super - lock the filesystem and force it into a consistent state + * @sb: the super to lock + * + * Syncs the super to make sure the filesystem is consistent and calls the fs's + * freeze_fs. Subsequent calls to this without first thawing the fs will return + * -EBUSY. + */ +int freeze_super(struct super_block *sb) +{ + int ret; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + if (sb->s_frozen) { + deactivate_locked_super(sb); + return -EBUSY; + } + + if (sb->s_flags & MS_RDONLY) { + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + up_write(&sb->s_umount); + return 0; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + if (sb->s_op->freeze_fs) { + ret = sb->s_op->freeze_fs(sb); + if (ret) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } + } + up_write(&sb->s_umount); + return 0; +} +EXPORT_SYMBOL(freeze_super); + +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ +int thaw_super(struct super_block *sb) +{ + int error; + + down_write(&sb->s_umount); + if (sb->s_frozen == SB_UNFROZEN) { + up_write(&sb->s_umount); + return -EINVAL; + } + + if (sb->s_flags & MS_RDONLY) + goto out; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); + return error; + } + } + +out: + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + + return 0; +} +EXPORT_SYMBOL(thaw_super); diff --git a/fs/sync.c b/fs/sync.c new file mode 100644 index 00000000..c38ec163 --- /dev/null +++ b/fs/sync.c @@ -0,0 +1,402 @@ +/* + * High-level sync()-related operations + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ + SYNC_FILE_RANGE_WAIT_AFTER) + +/* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. 
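+ *
+ * sync_filesystem() below therefore makes two passes over the same
+ * superblock:
+ *
+ *	__sync_filesystem(sb, 0);	(start writeback, don't wait)
+ *	__sync_filesystem(sb, 1);	(wait for everything to hit disk)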
+ */ +static int __sync_filesystem(struct super_block *sb, int wait) +{ + /* + * This should be safe, as we require bdi backing to actually + * write out data in the first place + */ + if (sb->s_bdi == &noop_backing_dev_info) + return 0; + + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, -1, wait); + + if (wait) + sync_inodes_sb(sb); + else + writeback_inodes_sb(sb); + + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int sync_filesystem(struct super_block *sb) +{ + int ret; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + /* + * No point in syncing out anything if the filesystem is read-only. + */ + if (sb->s_flags & MS_RDONLY) + return 0; + + ret = __sync_filesystem(sb, 0); + if (ret < 0) + return ret; + return __sync_filesystem(sb, 1); +} +EXPORT_SYMBOL_GPL(sync_filesystem); + +static void sync_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY)) + __sync_filesystem(sb, *(int *)arg); +} +/* + * Sync all the data for all the filesystems (called by sys_sync() and + * emergency sync) + */ +static void sync_filesystems(int wait) +{ + iterate_supers(sync_one_sb, &wait); +} + +/* + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. + */ +SYSCALL_DEFINE0(sync) +{ + wakeup_flusher_threads(0); + sync_filesystems(0); + sync_filesystems(1); + if (unlikely(laptop_mode)) + laptop_sync_completion(); + return 0; +} + +static void do_sync_work(struct work_struct *work) +{ + /* + * Sync twice to reduce the possibility we skipped some inodes / pages + * because they were temporarily locked + */ + sync_filesystems(0); + sync_filesystems(0); + printk("Emergency Sync complete\n"); + kfree(work); +} + +void emergency_sync(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_sync_work); + schedule_work(work); + } +} + +/* + * sync a single super + */ +SYSCALL_DEFINE1(syncfs, int, fd) +{ + struct file *file; + struct super_block *sb; + int ret; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + sb = file->f_dentry->d_sb; + + down_read(&sb->s_umount); + ret = sync_filesystem(sb); + up_read(&sb->s_umount); + + fput_light(file, fput_needed); + return ret; +} + +/** + * vfs_fsync_range - helper to sync a range of data & metadata to disk + * @file: file to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync + * + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. + */ +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct address_space *mapping = file->f_mapping; + int err, ret; + + if (!file->f_op || !file->f_op->fsync) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(mapping, start, end); + + /* + * We need to protect against concurrent writers, which could cause + * livelocks in fsync_buffers_list(). 
+ */ + mutex_lock(&mapping->host->i_mutex); + err = file->f_op->fsync(file, datasync); + if (!ret) + ret = err; + mutex_unlock(&mapping->host->i_mutex); + +out: + return ret; +} +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + */ +int vfs_fsync(struct file *file, int datasync) +{ + return vfs_fsync_range(file, 0, LLONG_MAX, datasync); +} +EXPORT_SYMBOL(vfs_fsync); + +static int do_fsync(unsigned int fd, int datasync) +{ + struct file *file; + int ret = -EBADF; + + file = fget(fd); + if (file) { + ret = vfs_fsync(file, datasync); + fput(file); + } + return ret; +} + +SYSCALL_DEFINE1(fsync, unsigned int, fd) +{ + return do_fsync(fd, 0); +} + +SYSCALL_DEFINE1(fdatasync, unsigned int, fd) +{ + return do_fsync(fd, 1); +} + +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); +} +EXPORT_SYMBOL(generic_write_sync); + +/* + * sys_sync_file_range() permits finely controlled syncing over a segment of + * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is + * zero then sys_sync_file_range() will operate from offset out to EOF. + * + * The flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range + * before performing the write. + * + * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. + * + * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range + * after performing the write. + * + * Useful combinations of the flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages + * in the range which were dirty on entry to sys_sync_file_range() are placed + * under writeout. This is a start-write-for-data-integrity operation. + * + * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which + * are not presently under writeout. This is an asynchronous flush-to-disk + * operation. Not suitable for data integrity operations. + * + * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for + * completion of writeout of all pages in the range. This will be used after an + * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait + * for that operation to complete and to return the result. + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * a traditional sync() operation. This is a write-for-data-integrity operation + * which will ensure that all pages in the range which were dirty on entry to + * sys_sync_file_range() are committed to disk. 
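+ *
+ * A typical data-integrity call from userspace might therefore look like
+ * (illustrative only):
+ *
+ *	sync_file_range(fd, offset, nbytes,
+ *			SYNC_FILE_RANGE_WAIT_BEFORE |
+ *			SYNC_FILE_RANGE_WRITE |
+ *			SYNC_FILE_RANGE_WAIT_AFTER);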
+ * + * + * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any + * I/O errors or ENOSPC conditions and will return those to the caller, after + * clearing the EIO and ENOSPC flags in the address_space. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. + */ +SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, + unsigned int flags) +{ + int ret; + struct file *file; + struct address_space *mapping; + loff_t endbyte; /* inclusive */ + int fput_needed; + umode_t i_mode; + + ret = -EINVAL; + if (flags & ~VALID_FLAGS) + goto out; + + endbyte = offset + nbytes; + + if ((s64)offset < 0) + goto out; + if ((s64)endbyte < 0) + goto out; + if (endbyte < offset) + goto out; + + if (sizeof(pgoff_t) == 4) { + if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * The range starts outside a 32 bit machine's + * pagecache addressing capabilities. Let it "succeed" + */ + ret = 0; + goto out; + } + if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * Out to EOF + */ + nbytes = 0; + } + } + + if (nbytes == 0) + endbyte = LLONG_MAX; + else + endbyte--; /* inclusive */ + + ret = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto out; + + i_mode = file->f_path.dentry->d_inode->i_mode; + ret = -ESPIPE; + if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && + !S_ISLNK(i_mode)) + goto out_put; + + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + +out_put: + fput_light(file, fput_needed); +out: + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes, + long flags) +{ + return SYSC_sync_file_range((int) fd, offset, nbytes, + (unsigned int) flags); +} +SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range); +#endif + +/* It would be nice if people remember that not all the world's an i386 + when they introduce new system calls */ +SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags, + loff_t offset, loff_t nbytes) +{ + return sys_sync_file_range(fd, offset, nbytes, flags); +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_sync_file_range2(long fd, long flags, + loff_t offset, loff_t nbytes) +{ + return SYSC_sync_file_range2((int) fd, (unsigned int) flags, + offset, nbytes); +} +SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); +#endif diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig new file mode 100644 index 00000000..8c41feac --- /dev/null +++ b/fs/sysfs/Kconfig @@ -0,0 +1,23 @@ +config SYSFS + bool "sysfs file system support" if EXPERT + default y + help + The sysfs filesystem is a virtual filesystem that the kernel uses to + export internal kernel objects, their attributes, and their + relationships to one another. + + Users can use sysfs to ascertain useful information about the running + kernel, such as the devices the kernel has discovered on each bus and + which driver each is bound to. 
sysfs can also be used to tune devices + and other kernel subsystems. + + Some system agents rely on the information in sysfs to operate. + /sbin/hotplug uses device and object attributes in sysfs to assist in + delegating policy decisions, like persistently naming devices. + + sysfs is currently used by the block subsystem to mount the root + partition. If sysfs is disabled you must specify the boot device on + the kernel boot command line via its major and minor numbers. For + example, "root=03:01" for /dev/hda1. + + Designers of embedded systems may wish to say N here to conserve space. diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile new file mode 100644 index 00000000..7a1ceb94 --- /dev/null +++ b/fs/sysfs/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the sysfs virtual filesystem +# + +obj-y := inode.o file.o dir.o symlink.o mount.o bin.o \ + group.o diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c new file mode 100644 index 00000000..a4759833 --- /dev/null +++ b/fs/sysfs/bin.c @@ -0,0 +1,506 @@ +/* + * fs/sysfs/bin.c - sysfs binary file implementation + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Matthew Wilcox + * Copyright (c) 2004 Silicon Graphics, Inc. + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sysfs.h" + +/* + * There's one bin_buffer for each open file. + * + * filp->private_data points to bin_buffer and + * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s + * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock + */ +static DEFINE_MUTEX(sysfs_bin_lock); + +struct bin_buffer { + struct mutex mutex; + void *buffer; + int mmapped; + const struct vm_operations_struct *vm_ops; + struct file *file; + struct hlist_node list; +}; + +static int +fill_read(struct file *file, char *buffer, loff_t off, size_t count) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + int rc; + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + rc = -EIO; + if (attr->read) + rc = attr->read(file, kobj, attr, buffer, off, count); + + sysfs_put_active(attr_sd); + + return rc; +} + +static ssize_t +read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) +{ + struct bin_buffer *bb = file->private_data; + int size = file->f_path.dentry->d_inode->i_size; + loff_t offs = *off; + int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; + + if (!bytes) + return 0; + + if (size) { + if (offs > size) + return 0; + if (offs + count > size) + count = size - offs; + } + + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; + + mutex_lock(&bb->mutex); + + count = fill_read(file, bb->buffer, offs, count); + if (count < 0) { + mutex_unlock(&bb->mutex); + goto out_free; + } + + memcpy(temp, bb->buffer, count); + + mutex_unlock(&bb->mutex); + + if (copy_to_user(userbuf, temp, count)) { + count = -EFAULT; + goto out_free; + } + + pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); + + *off = offs + count; + + out_free: + kfree(temp); + return count; +} + +static int +flush_write(struct file *file, char *buffer, loff_t offset, size_t count) +{ + struct sysfs_dirent 
*attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + int rc; + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + rc = -EIO; + if (attr->write) + rc = attr->write(file, kobj, attr, buffer, offset, count); + + sysfs_put_active(attr_sd); + + return rc; +} + +static ssize_t write(struct file *file, const char __user *userbuf, + size_t bytes, loff_t *off) +{ + struct bin_buffer *bb = file->private_data; + int size = file->f_path.dentry->d_inode->i_size; + loff_t offs = *off; + int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; + + if (!bytes) + return 0; + + if (size) { + if (offs > size) + return 0; + if (offs + count > size) + count = size - offs; + } + + temp = memdup_user(userbuf, count); + if (IS_ERR(temp)) + return PTR_ERR(temp); + + mutex_lock(&bb->mutex); + + memcpy(bb->buffer, temp, count); + + count = flush_write(file, bb->buffer, offs, count); + mutex_unlock(&bb->mutex); + + if (count > 0) + *off = offs + count; + + kfree(temp); + return count; +} + +static void bin_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + + if (!bb->vm_ops) + return; + + if (!sysfs_get_active(attr_sd)) + return; + + if (bb->vm_ops->open) + bb->vm_ops->open(vma); + + sysfs_put_active(attr_sd); +} + +static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(attr_sd)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (bb->vm_ops->fault) + ret = bb->vm_ops->fault(vma, vmf); + + sysfs_put_active(attr_sd); + return ret; +} + +static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(attr_sd)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (bb->vm_ops->page_mkwrite) + ret = bb->vm_ops->page_mkwrite(vma, vmf); + + sysfs_put_active(attr_sd); + return ret; +} + +static int bin_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return -EINVAL; + + if (!sysfs_get_active(attr_sd)) + return -EINVAL; + + ret = -EINVAL; + if (bb->vm_ops->access) + ret = bb->vm_ops->access(vma, addr, buf, len, write); + + sysfs_put_active(attr_sd); + return ret; +} + +#ifdef CONFIG_NUMA +static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return 0; + + if (!sysfs_get_active(attr_sd)) + return -EINVAL; + + ret = 0; + if (bb->vm_ops->set_policy) + ret = bb->vm_ops->set_policy(vma, new); + + sysfs_put_active(attr_sd); + return ret; +} + +static struct mempolicy *bin_get_policy(struct vm_area_struct *vma, + unsigned long 
addr) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct mempolicy *pol; + + if (!bb->vm_ops) + return vma->vm_policy; + + if (!sysfs_get_active(attr_sd)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (bb->vm_ops->get_policy) + pol = bb->vm_ops->get_policy(vma, addr); + + sysfs_put_active(attr_sd); + return pol; +} + +static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, + const nodemask_t *to, unsigned long flags) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return 0; + + if (!sysfs_get_active(attr_sd)) + return 0; + + ret = 0; + if (bb->vm_ops->migrate) + ret = bb->vm_ops->migrate(vma, from, to, flags); + + sysfs_put_active(attr_sd); + return ret; +} +#endif + +static const struct vm_operations_struct bin_vm_ops = { + .open = bin_vma_open, + .fault = bin_fault, + .page_mkwrite = bin_page_mkwrite, + .access = bin_access, +#ifdef CONFIG_NUMA + .set_policy = bin_set_policy, + .get_policy = bin_get_policy, + .migrate = bin_migrate, +#endif +}; + +static int mmap(struct file *file, struct vm_area_struct *vma) +{ + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + int rc; + + mutex_lock(&bb->mutex); + + /* need attr_sd for attr, its parent for kobj */ + rc = -ENODEV; + if (!sysfs_get_active(attr_sd)) + goto out_unlock; + + rc = -EINVAL; + if (!attr->mmap) + goto out_put; + + rc = attr->mmap(file, kobj, attr, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (bb->mmapped && bb->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. 
+ */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + bb->mmapped = 1; + bb->vm_ops = vma->vm_ops; + vma->vm_ops = &bin_vm_ops; +out_put: + sysfs_put_active(attr_sd); +out_unlock: + mutex_unlock(&bb->mutex); + + return rc; +} + +static int open(struct inode * inode, struct file * file) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct bin_buffer *bb = NULL; + int error; + + /* binary file operations requires both @sd and its parent */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + error = -EACCES; + if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) + goto err_out; + if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) + goto err_out; + + error = -ENOMEM; + bb = kzalloc(sizeof(*bb), GFP_KERNEL); + if (!bb) + goto err_out; + + bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->buffer) + goto err_out; + + mutex_init(&bb->mutex); + bb->file = file; + file->private_data = bb; + + mutex_lock(&sysfs_bin_lock); + hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers); + mutex_unlock(&sysfs_bin_lock); + + /* open succeeded, put active references */ + sysfs_put_active(attr_sd); + return 0; + + err_out: + sysfs_put_active(attr_sd); + kfree(bb); + return error; +} + +static int release(struct inode * inode, struct file * file) +{ + struct bin_buffer *bb = file->private_data; + + mutex_lock(&sysfs_bin_lock); + hlist_del(&bb->list); + mutex_unlock(&sysfs_bin_lock); + + kfree(bb->buffer); + kfree(bb); + return 0; +} + +const struct file_operations bin_fops = { + .read = read, + .write = write, + .mmap = mmap, + .llseek = generic_file_llseek, + .open = open, + .release = release, +}; + + +void unmap_bin_file(struct sysfs_dirent *attr_sd) +{ + struct bin_buffer *bb; + struct hlist_node *tmp; + + if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) + return; + + mutex_lock(&sysfs_bin_lock); + + hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) { + struct inode *inode = bb->file->f_path.dentry->d_inode; + + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + + mutex_unlock(&sysfs_bin_lock); +} + +/** + * sysfs_create_bin_file - create binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ + +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); +} + + +/** + * sysfs_remove_bin_file - remove binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ + +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); +} + +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); +EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c new file mode 100644 index 00000000..ea9120a8 --- /dev/null +++ b/fs/sysfs/dir.c @@ -0,0 +1,964 @@ +/* + * fs/sysfs/dir.c - sysfs core and dir operation implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. 
+ */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysfs.h" + +DEFINE_MUTEX(sysfs_mutex); +DEFINE_SPINLOCK(sysfs_assoc_lock); + +static DEFINE_SPINLOCK(sysfs_ino_lock); +static DEFINE_IDA(sysfs_ino_ida); + +/** + * sysfs_link_sibling - link sysfs_dirent into sibling list + * @sd: sysfs_dirent of interest + * + * Link @sd into its sibling list which starts from + * sd->s_parent->s_dir.children. + * + * Locking: + * mutex_lock(sysfs_mutex) + */ +static void sysfs_link_sibling(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *parent_sd = sd->s_parent; + struct sysfs_dirent **pos; + + BUG_ON(sd->s_sibling); + + /* Store directory entries in order by ino. This allows + * readdir to properly restart without having to add a + * cursor into the s_dir.children list. + */ + for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { + if (sd->s_ino < (*pos)->s_ino) + break; + } + sd->s_sibling = *pos; + *pos = sd; +} + +/** + * sysfs_unlink_sibling - unlink sysfs_dirent from sibling list + * @sd: sysfs_dirent of interest + * + * Unlink @sd from its sibling list which starts from + * sd->s_parent->s_dir.children. + * + * Locking: + * mutex_lock(sysfs_mutex) + */ +static void sysfs_unlink_sibling(struct sysfs_dirent *sd) +{ + struct sysfs_dirent **pos; + + for (pos = &sd->s_parent->s_dir.children; *pos; + pos = &(*pos)->s_sibling) { + if (*pos == sd) { + *pos = sd->s_sibling; + sd->s_sibling = NULL; + break; + } + } +} + +/** + * sysfs_get_active - get an active reference to sysfs_dirent + * @sd: sysfs_dirent to get an active reference to + * + * Get an active reference of @sd. This function is noop if @sd + * is NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. + */ +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +{ + if (unlikely(!sd)) + return NULL; + + while (1) { + int v, t; + + v = atomic_read(&sd->s_active); + if (unlikely(v < 0)) + return NULL; + + t = atomic_cmpxchg(&sd->s_active, v, v + 1); + if (likely(t == v)) { + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); + return sd; + } + if (t < 0) + return NULL; + + cpu_relax(); + } +} + +/** + * sysfs_put_active - put an active reference to sysfs_dirent + * @sd: sysfs_dirent to put an active reference to + * + * Put an active reference to @sd. This function is noop if @sd + * is NULL. + */ +void sysfs_put_active(struct sysfs_dirent *sd) +{ + struct completion *cmpl; + int v; + + if (unlikely(!sd)) + return; + + rwsem_release(&sd->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&sd->s_active); + if (likely(v != SD_DEACTIVATED_BIAS)) + return; + + /* atomic_dec_return() is a mb(), we'll always see the updated + * sd->s_sibling. + */ + cmpl = (void *)sd->s_sibling; + complete(cmpl); +} + +/** + * sysfs_deactivate - deactivate sysfs_dirent + * @sd: sysfs_dirent to deactivate + * + * Deny new active references and drain existing ones. + */ +static void sysfs_deactivate(struct sysfs_dirent *sd) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + + if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) + return; + + sd->s_sibling = (void *)&wait; + + rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated sd->s_sibling. 
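+ *
+ * If the returned value differs from SD_DEACTIVATED_BIAS, somebody
+ * still holds an active reference, so wait on the completion that
+ * sysfs_put_active() completes when the last active reference is
+ * dropped.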
+ */ + v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); + + if (v != SD_DEACTIVATED_BIAS) { + lock_contended(&sd->dep_map, _RET_IP_); + wait_for_completion(&wait); + } + + sd->s_sibling = NULL; + + lock_acquired(&sd->dep_map, _RET_IP_); + rwsem_release(&sd->dep_map, 1, _RET_IP_); +} + +static int sysfs_alloc_ino(ino_t *pino) +{ + int ino, rc; + + retry: + spin_lock(&sysfs_ino_lock); + rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); + spin_unlock(&sysfs_ino_lock); + + if (rc == -EAGAIN) { + if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) + goto retry; + rc = -ENOMEM; + } + + *pino = ino; + return rc; +} + +static void sysfs_free_ino(ino_t ino) +{ + spin_lock(&sysfs_ino_lock); + ida_remove(&sysfs_ino_ida, ino); + spin_unlock(&sysfs_ino_lock); +} + +void release_sysfs_dirent(struct sysfs_dirent * sd) +{ + struct sysfs_dirent *parent_sd; + + repeat: + /* Moving/renaming is always done while holding reference. + * sd->s_parent won't change beneath us. + */ + parent_sd = sd->s_parent; + + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) + sysfs_put(sd->s_symlink.target_sd); + if (sysfs_type(sd) & SYSFS_COPY_NAME) + kfree(sd->s_name); + if (sd->s_iattr && sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); + kfree(sd->s_iattr); + sysfs_free_ino(sd->s_ino); + kmem_cache_free(sysfs_dir_cachep, sd); + + sd = parent_sd; + if (sd && atomic_dec_and_test(&sd->s_count)) + goto repeat; +} + +static int sysfs_dentry_delete(const struct dentry *dentry) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + return !!(sd->s_flags & SYSFS_FLAG_REMOVED); +} + +static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct sysfs_dirent *sd; + int is_dir; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + sd = dentry->d_fsdata; + mutex_lock(&sysfs_mutex); + + /* The sysfs dirent has been deleted */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + goto out_bad; + + /* The sysfs dirent has been moved? */ + if (dentry->d_parent->d_fsdata != sd->s_parent) + goto out_bad; + + /* The sysfs dirent has been renamed */ + if (strcmp(dentry->d_name.name, sd->s_name) != 0) + goto out_bad; + + mutex_unlock(&sysfs_mutex); +out_valid: + return 1; +out_bad: + /* Remove the dentry from the dcache hashes. + * If this is a deleted dentry we use d_drop instead of d_delete + * so sysfs doesn't need to cope with negative dentries. + * + * If this is a dentry that has simply been renamed we + * use d_drop to remove it from the dcache lookup on its + * old parent. If this dentry persists later when a lookup + * is performed at its new name the dentry will be readded + * to the dcache hashes. + */ + is_dir = (sysfs_type(sd) == SYSFS_DIR); + mutex_unlock(&sysfs_mutex); + if (is_dir) { + /* If we have submounts we must allow the vfs caches + * to lie about the state of the filesystem to prevent + * leaks and other nasty things. 
+ */ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + return 0; +} + +static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct sysfs_dirent * sd = dentry->d_fsdata; + + sysfs_put(sd); + iput(inode); +} + +static const struct dentry_operations sysfs_dentry_ops = { + .d_revalidate = sysfs_dentry_revalidate, + .d_delete = sysfs_dentry_delete, + .d_iput = sysfs_dentry_iput, +}; + +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) +{ + char *dup_name = NULL; + struct sysfs_dirent *sd; + + if (type & SYSFS_COPY_NAME) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); + if (!sd) + goto err_out1; + + if (sysfs_alloc_ino(&sd->s_ino)) + goto err_out2; + + atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_active, 0); + + sd->s_name = name; + sd->s_mode = mode; + sd->s_flags = type; + + return sd; + + err_out2: + kmem_cache_free(sysfs_dir_cachep, sd); + err_out1: + kfree(dup_name); + return NULL; +} + +/** + * sysfs_addrm_start - prepare for sysfs_dirent add/remove + * @acxt: pointer to sysfs_addrm_cxt to be used + * @parent_sd: parent sysfs_dirent + * + * This function is called when the caller is about to add or + * remove sysfs_dirent under @parent_sd. This function acquires + * sysfs_mutex. @acxt is used to keep and pass context to + * other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). sysfs_mutex is locked on + * return. + */ +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *parent_sd) +{ + memset(acxt, 0, sizeof(*acxt)); + acxt->parent_sd = parent_sd; + + mutex_lock(&sysfs_mutex); +} + +/** + * __sysfs_add_one - add sysfs_dirent to parent without warning + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *ps_iattr; + + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) + return -EEXIST; + + sd->s_parent = sysfs_get(acxt->parent_sd); + + sysfs_link_sibling(sd); + + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + return 0; +} + +/** + * sysfs_pathname - return full path to sysfs dirent + * @sd: sysfs_dirent whose path we want + * @path: caller allocated buffer + * + * Gives the name "/" to the sysfs_root entry; any path returned + * is relative to wherever sysfs is mounted. 
+ * + * XXX: does no error checking on @path size + */ +static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +{ + if (sd->s_parent) { + sysfs_pathname(sd->s_parent, path); + strcat(path, "/"); + } + strcat(path, sd->s_name); + return path; +} + +/** + * sysfs_add_one - add sysfs_dirent to parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + int ret; + + ret = __sysfs_add_one(acxt, sd); + if (ret == -EEXIST) { + char *path = kzalloc(PATH_MAX, GFP_KERNEL); + WARN(1, KERN_WARNING + "sysfs: cannot create duplicate filename '%s'\n", + (path == NULL) ? sd->s_name : + strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"), + sd->s_name)); + kfree(path); + } + + return ret; +} + +/** + * sysfs_remove_one - remove sysfs_dirent from parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be removed + * + * Mark @sd removed and drop nlink of parent inode if @sd is a + * directory. @sd is unlinked from the children list. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + */ +void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *ps_iattr; + + BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); + + sysfs_unlink_sibling(sd); + + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + sd->s_flags |= SYSFS_FLAG_REMOVED; + sd->s_sibling = acxt->removed; + acxt->removed = sd; +} + +/** + * sysfs_addrm_finish - finish up sysfs_dirent add/remove + * @acxt: addrm context to finish up + * + * Finish up sysfs_dirent add/remove. Resources acquired by + * sysfs_addrm_start() are released and removed sysfs_dirents are + * cleaned up. + * + * LOCKING: + * sysfs_mutex is released. + */ +void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) +{ + /* release resources acquired by sysfs_addrm_start() */ + mutex_unlock(&sysfs_mutex); + + /* kill removed sysfs_dirents */ + while (acxt->removed) { + struct sysfs_dirent *sd = acxt->removed; + + acxt->removed = sd->s_sibling; + sd->s_sibling = NULL; + + sysfs_deactivate(sd); + unmap_bin_file(sd); + sysfs_put(sd); + } +} + +/** + * sysfs_find_dirent - find sysfs_dirent with the given name + * @parent_sd: sysfs_dirent to search under + * @name: name to look for + * + * Look for sysfs_dirent with name @name under @parent_sd. + * + * LOCKING: + * mutex_lock(sysfs_mutex) + * + * RETURNS: + * Pointer to sysfs_dirent if found, NULL if not. 
+ */ +struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name) +{ + struct sysfs_dirent *sd; + + for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { + if (ns && sd->s_ns && (sd->s_ns != ns)) + continue; + if (!strcmp(sd->s_name, name)) + return sd; + } + return NULL; +} + +/** + * sysfs_get_dirent - find and get sysfs_dirent with the given name + * @parent_sd: sysfs_dirent to search under + * @name: name to look for + * + * Look for sysfs_dirent with name @name under @parent_sd and get + * it if found. + * + * LOCKING: + * Kernel thread context (may sleep). Grabs sysfs_mutex. + * + * RETURNS: + * Pointer to sysfs_dirent if found, NULL if not. + */ +struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name) +{ + struct sysfs_dirent *sd; + + mutex_lock(&sysfs_mutex); + sd = sysfs_find_dirent(parent_sd, ns, name); + sysfs_get(sd); + mutex_unlock(&sysfs_mutex); + + return sd; +} +EXPORT_SYMBOL_GPL(sysfs_get_dirent); + +static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, + enum kobj_ns_type type, const void *ns, const char *name, + struct sysfs_dirent **p_sd) +{ + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + /* allocate */ + sd = sysfs_new_dirent(name, mode, SYSFS_DIR); + if (!sd) + return -ENOMEM; + + sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); + sd->s_ns = ns; + sd->s_dir.kobj = kobj; + + /* link in */ + sysfs_addrm_start(&acxt, parent_sd); + rc = sysfs_add_one(&acxt, sd); + sysfs_addrm_finish(&acxt); + + if (rc == 0) + *p_sd = sd; + else + sysfs_put(sd); + + return rc; +} + +int sysfs_create_subdir(struct kobject *kobj, const char *name, + struct sysfs_dirent **p_sd) +{ + return create_dir(kobj, kobj->sd, + KOBJ_NS_TYPE_NONE, NULL, name, p_sd); +} + +/** + * sysfs_read_ns_type: return associated ns_type + * @kobj: the kobject being queried + * + * Each kobject can be tagged with exactly one namespace type + * (i.e. network or user). Return the ns_type associated with + * this object if any + */ +static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ops; + enum kobj_ns_type type; + + ops = kobj_child_ns_ops(kobj); + if (!ops) + return KOBJ_NS_TYPE_NONE; + + type = ops->type; + BUG_ON(type <= KOBJ_NS_TYPE_NONE); + BUG_ON(type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(type)); + + return type; +} + +/** + * sysfs_create_dir - create a directory for an object. + * @kobj: object we're creating directory for. 
+ */ +int sysfs_create_dir(struct kobject * kobj) +{ + enum kobj_ns_type type; + struct sysfs_dirent *parent_sd, *sd; + const void *ns = NULL; + int error = 0; + + BUG_ON(!kobj); + + if (kobj->parent) + parent_sd = kobj->parent->sd; + else + parent_sd = &sysfs_root; + + if (sysfs_ns_type(parent_sd)) + ns = kobj->ktype->namespace(kobj); + type = sysfs_read_ns_type(kobj); + + error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); + if (!error) + kobj->sd = sd; + return error; +} + +static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *ret = NULL; + struct dentry *parent = dentry->d_parent; + struct sysfs_dirent *parent_sd = parent->d_fsdata; + struct sysfs_dirent *sd; + struct inode *inode; + enum kobj_ns_type type; + const void *ns; + + mutex_lock(&sysfs_mutex); + + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dir->i_sb)->ns[type]; + + sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); + + /* no such entry */ + if (!sd) { + ret = ERR_PTR(-ENOENT); + goto out_unlock; + } + + /* attach dentry and inode */ + inode = sysfs_get_inode(dir->i_sb, sd); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_find_alias(inode); + if (!ret) { + d_set_d_op(dentry, &sysfs_dentry_ops); + dentry->d_fsdata = sysfs_get(sd); + d_add(dentry, inode); + } else { + d_move(ret, dentry); + iput(inode); + } + + out_unlock: + mutex_unlock(&sysfs_mutex); + return ret; +} + +const struct inode_operations sysfs_dir_inode_operations = { + .lookup = sysfs_lookup, + .permission = sysfs_permission, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .setxattr = sysfs_setxattr, +}; + +static void remove_dir(struct sysfs_dirent *sd) +{ + struct sysfs_addrm_cxt acxt; + + sysfs_addrm_start(&acxt, sd->s_parent); + sysfs_remove_one(&acxt, sd); + sysfs_addrm_finish(&acxt); +} + +void sysfs_remove_subdir(struct sysfs_dirent *sd) +{ + remove_dir(sd); +} + + +static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent **pos; + + if (!dir_sd) + return; + + pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); + sysfs_addrm_start(&acxt, dir_sd); + pos = &dir_sd->s_dir.children; + while (*pos) { + struct sysfs_dirent *sd = *pos; + + if (sysfs_type(sd) != SYSFS_DIR) + sysfs_remove_one(&acxt, sd); + else + pos = &(*pos)->s_sibling; + } + sysfs_addrm_finish(&acxt); + + remove_dir(dir_sd); +} + +/** + * sysfs_remove_dir - remove an object's directory. + * @kobj: object. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. 
+ */ + +void sysfs_remove_dir(struct kobject * kobj) +{ + struct sysfs_dirent *sd = kobj->sd; + + spin_lock(&sysfs_assoc_lock); + kobj->sd = NULL; + spin_unlock(&sysfs_assoc_lock); + + __sysfs_remove_dir(sd); +} + +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const void *new_ns, + const char *new_name) +{ + const char *dup_name = NULL; + int error; + + mutex_lock(&sysfs_mutex); + + error = 0; + if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && + (strcmp(sd->s_name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (sysfs_find_dirent(new_parent_sd, new_ns, new_name)) + goto out; + + /* rename sysfs_dirent */ + if (strcmp(sd->s_name, new_name) != 0) { + error = -ENOMEM; + new_name = dup_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + dup_name = sd->s_name; + sd->s_name = new_name; + } + + /* Remove from old parent's list and insert into new parent's list. */ + if (sd->s_parent != new_parent_sd) { + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); + } + sd->s_ns = new_ns; + + error = 0; + out: + mutex_unlock(&sysfs_mutex); + kfree(dup_name); + return error; +} + +int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +{ + struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + const void *new_ns = NULL; + + if (sysfs_ns_type(parent_sd)) + new_ns = kobj->ktype->namespace(kobj); + + return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); +} + +int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) +{ + struct sysfs_dirent *sd = kobj->sd; + struct sysfs_dirent *new_parent_sd; + const void *new_ns = NULL; + + BUG_ON(!sd->s_parent); + if (sysfs_ns_type(sd->s_parent)) + new_ns = kobj->ktype->namespace(kobj); + new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 
+ new_parent_kobj->sd : &sysfs_root; + + return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name); +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct sysfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int sysfs_dir_release(struct inode *inode, struct file *filp) +{ + sysfs_put(filp->private_data); + return 0; +} + +static struct sysfs_dirent *sysfs_dir_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +{ + if (pos) { + int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && + pos->s_parent == parent_sd && + ino == pos->s_ino; + sysfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (ino > 1) && (ino < INT_MAX)) { + pos = parent_sd->s_dir.children; + while (pos && (ino > pos->s_ino)) + pos = pos->s_sibling; + } + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; + return pos; +} + +static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +{ + pos = sysfs_dir_pos(ns, parent_sd, ino, pos); + if (pos) + pos = pos->s_sibling; + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; + return pos; +} + +static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct sysfs_dirent * parent_sd = dentry->d_fsdata; + struct sysfs_dirent *pos = filp->private_data; + enum kobj_ns_type type; + const void *ns; + ino_t ino; + + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dentry->d_sb)->ns[type]; + + if (filp->f_pos == 0) { + ino = parent_sd->s_ino; + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) + filp->f_pos++; + } + if (filp->f_pos == 1) { + if (parent_sd->s_parent) + ino = parent_sd->s_parent->s_ino; + else + ino = parent_sd->s_ino; + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) + filp->f_pos++; + } + mutex_lock(&sysfs_mutex); + for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); + pos; + pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { + const char * name; + unsigned int type; + int len, ret; + + name = pos->s_name; + len = strlen(name); + ino = pos->s_ino; + type = dt_type(pos); + filp->f_pos = ino; + filp->private_data = sysfs_get(pos); + + mutex_unlock(&sysfs_mutex); + ret = filldir(dirent, name, len, filp->f_pos, ino, type); + mutex_lock(&sysfs_mutex); + if (ret < 0) + break; + } + mutex_unlock(&sysfs_mutex); + if ((filp->f_pos > 1) && !pos) { /* EOF */ + filp->f_pos = INT_MAX; + filp->private_data = NULL; + } + return 0; +} + + +const struct file_operations sysfs_dir_operations = { + .read = generic_read_dir, + .readdir = sysfs_readdir, + .release = sysfs_dir_release, + .llseek = generic_file_llseek, +}; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c new file mode 100644 index 00000000..1ad8c93c --- /dev/null +++ b/fs/sysfs/file.c @@ -0,0 +1,747 @@ +/* + * fs/sysfs/file.c - sysfs regular (text) file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" + +/* + * There's one sysfs_buffer for each open file and one + * sysfs_open_dirent for each sysfs_dirent with one or more open + * files. 
+ * + * filp->private_data points to sysfs_buffer and + * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open + * is protected by sysfs_open_dirent_lock. + */ +static DEFINE_SPINLOCK(sysfs_open_dirent_lock); + +struct sysfs_open_dirent { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head buffers; /* goes through sysfs_buffer.list */ +}; + +struct sysfs_buffer { + size_t count; + loff_t pos; + char * page; + const struct sysfs_ops * ops; + struct mutex mutex; + int needs_read_fill; + int event; + struct list_head list; +}; + +/** + * fill_read_buffer - allocate and fill buffer from object. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * kobject's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read unless an error + * is returned. + */ +static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) +{ + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + const struct sysfs_ops * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + buffer->event = atomic_read(&attr_sd->s_attr.open->event); + count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); + + sysfs_put_active(attr_sd); + + /* + * The code works fine with PAGE_SIZE return but it's likely to + * indicate truncated result or overflow in normal use cases. + */ + if (count >= (ssize_t)PAGE_SIZE) { + print_symbol("fill_read_buffer: %s returned bad count\n", + (unsigned long)ops->show); + /* Try to struggle along */ + count = PAGE_SIZE - 1; + } + if (count >= 0) { + buffer->needs_read_fill = 0; + buffer->count = count; + } else { + ret = count; + } + return ret; +} + +/** + * sysfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target object is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * object's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the object has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct sysfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + mutex_lock(&buffer->mutex); + if (buffer->needs_read_fill || *ppos == 0) { + retval = fill_read_buffer(file->f_path.dentry,buffer); + if (retval) + goto out; + } + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __func__, count, *ppos, buffer->page); + retval = simple_read_from_buffer(buf, count, ppos, buffer->page, + buffer->count); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @buf: data from user. + * @count: number of bytes in @userbuf. 
+ * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. + */ + +static int +fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to kobject. + * @dentry: dentry to the attribute + * @buffer: data buffer for file. + * @count: number of bytes + * + * Get the correct pointers for the kobject and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) +{ + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + const struct sysfs_ops * ops = buffer->ops; + int rc; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); + + sysfs_put_active(attr_sd); + + return rc; +} + + +/** + * sysfs_write_file - write an attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to sysfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the kobject in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * value you're changing, then write the entire buffer back. + */ + +static ssize_t +sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct sysfs_buffer * buffer = file->private_data; + ssize_t len; + + mutex_lock(&buffer->mutex); + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_path.dentry, buffer, len); + if (len > 0) + *ppos += len; + mutex_unlock(&buffer->mutex); + return len; +} + +/** + * sysfs_get_open_dirent - get or create sysfs_open_dirent + * @sd: target sysfs_dirent + * @buffer: sysfs_buffer for this instance of open + * + * If @sd->s_attr.open exists, increment its reference count; + * otherwise, create one. @buffer is chained to the buffers + * list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure.
+ */ +static int sysfs_get_open_dirent(struct sysfs_dirent *sd, + struct sysfs_buffer *buffer) +{ + struct sysfs_open_dirent *od, *new_od = NULL; + + retry: + spin_lock_irq(&sysfs_open_dirent_lock); + + if (!sd->s_attr.open && new_od) { + sd->s_attr.open = new_od; + new_od = NULL; + } + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->refcnt); + list_add_tail(&buffer->list, &od->buffers); + } + + spin_unlock_irq(&sysfs_open_dirent_lock); + + if (od) { + kfree(new_od); + return 0; + } + + /* not there, initialize a new one and retry */ + new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); + if (!new_od) + return -ENOMEM; + + atomic_set(&new_od->refcnt, 0); + atomic_set(&new_od->event, 1); + init_waitqueue_head(&new_od->poll); + INIT_LIST_HEAD(&new_od->buffers); + goto retry; +} + +/** + * sysfs_put_open_dirent - put sysfs_open_dirent + * @sd: target sysfs_dirent + * @buffer: associated sysfs_buffer + * + * Put @sd->s_attr.open and unlink @buffer from the buffers list. + * If reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void sysfs_put_open_dirent(struct sysfs_dirent *sd, + struct sysfs_buffer *buffer) +{ + struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; + + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + + list_del(&buffer->list); + if (atomic_dec_and_test(&od->refcnt)) + sd->s_attr.open = NULL; + else + od = NULL; + + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); + + kfree(od); +} + +static int sysfs_open_file(struct inode *inode, struct file *file) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + struct sysfs_buffer *buffer; + const struct sysfs_ops *ops; + int error = -EACCES; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + /* every kobject with an attribute needs a ktype assigned */ + if (kobj->ktype && kobj->ktype->sysfs_ops) + ops = kobj->ktype->sysfs_ops; + else { + WARN(1, KERN_ERR "missing sysfs attribute operations for " + "kobject: %s\n", kobject_name(kobj)); + goto err_out; + } + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + if (!(inode->i_mode & S_IWUGO) || !ops->store) + goto err_out; + } + + /* File needs read support. + * The inode's perms must say it's ok, and there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show) + goto err_out; + } + + /* No error? Great, allocate a buffer for the file, and store it + * in file->private_data for easy access.
+ */ + error = -ENOMEM; + buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); + if (!buffer) + goto err_out; + + mutex_init(&buffer->mutex); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + + /* make sure we have open dirent struct */ + error = sysfs_get_open_dirent(attr_sd, buffer); + if (error) + goto err_free; + + /* open succeeded, put active references */ + sysfs_put_active(attr_sd); + return 0; + + err_free: + kfree(buffer); + err_out: + sysfs_put_active(attr_sd); + return error; +} + +static int sysfs_release(struct inode *inode, struct file *filp) +{ + struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; + struct sysfs_buffer *buffer = filp->private_data; + + sysfs_put_open_dirent(sd, buffer); + + if (buffer->page) + free_page((unsigned long)buffer->page); + kfree(buffer); + + return 0; +} + +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; + struct sysfs_open_dirent *od = attr_sd->s_attr.open; + + /* need parent for the kobj, grab both */ + if (!sysfs_get_active(attr_sd)) + goto trigger; + + poll_wait(filp, &od->poll, wait); + + sysfs_put_active(attr_sd); + + if (buffer->event != atomic_read(&od->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + buffer->needs_read_fill = 1; + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + unsigned long flags; + + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } + + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); +} +EXPORT_SYMBOL_GPL(sysfs_notify_dirent); + +void sysfs_notify(struct kobject *k, const char *dir, const char *attr) +{ + struct sysfs_dirent *sd = k->sd; + + mutex_lock(&sysfs_mutex); + + if (sd && dir) + /* Only directories are tagged, so no need to pass + * a tag explicitly. 
+ */ + sd = sysfs_find_dirent(sd, NULL, dir); + if (sd && attr) + sd = sysfs_find_dirent(sd, NULL, attr); + if (sd) + sysfs_notify_dirent(sd); + + mutex_unlock(&sysfs_mutex); +} +EXPORT_SYMBOL_GPL(sysfs_notify); + +const struct file_operations sysfs_file_operations = { + .read = sysfs_read_file, + .write = sysfs_write_file, + .llseek = generic_file_llseek, + .open = sysfs_open_file, + .release = sysfs_release, + .poll = sysfs_poll, +}; + +int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, mode_t amode) +{ + umode_t mode = (amode & S_IALLUGO) | S_IFREG; + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + sd = sysfs_new_dirent(attr->name, mode, type); + if (!sd) + return -ENOMEM; + sd->s_attr.attr = (void *)attr; + sysfs_dirent_init_lockdep(sd); + + sysfs_addrm_start(&acxt, dir_sd); + rc = sysfs_add_one(&acxt, sd); + sysfs_addrm_finish(&acxt); + + if (rc) + sysfs_put(sd); + + return rc; +} + + +int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, + int type) +{ + return sysfs_add_file_mode(dir_sd, attr, type, attr->mode); +} + + +/** + * sysfs_create_file - create an attribute file for an object. + * @kobj: object we're creating for. + * @attr: attribute descriptor. + */ + +int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); + +} + +int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) +{ + int err = 0; + int i; + + for (i = 0; ptr[i] && !err; i++) + err = sysfs_create_file(kobj, ptr[i]); + if (err) + while (--i >= 0) + sysfs_remove_file(kobj, ptr[i]); + return err; +} + +/** + * sysfs_add_file_to_group - add an attribute file to a pre-existing group. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @group: group name. + */ +int sysfs_add_file_to_group(struct kobject *kobj, + const struct attribute *attr, const char *group) +{ + struct sysfs_dirent *dir_sd; + int error; + + if (group) + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); + else + dir_sd = sysfs_get(kobj->sd); + + if (!dir_sd) + return -ENOENT; + + error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); + +/** + * sysfs_chmod_file - update the modified mode value on an object attribute. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @mode: file permissions. + * + */ +int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, + mode_t mode) +{ + struct sysfs_dirent *sd; + struct iattr newattrs; + int rc; + + mutex_lock(&sysfs_mutex); + + rc = -ENOENT; + sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + if (!sd) + goto out; + + newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE; + rc = sysfs_sd_setattr(sd, &newattrs); + + out: + mutex_unlock(&sysfs_mutex); + return rc; +} +EXPORT_SYMBOL_GPL(sysfs_chmod_file); + + +/** + * sysfs_remove_file - remove an object attribute. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * + * Hash the attribute name and kill the victim. 
+ */ + +void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) +{ + sysfs_hash_and_remove(kobj->sd, NULL, attr->name); +} + +void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) +{ + int i; + for (i = 0; ptr[i]; i++) + sysfs_remove_file(kobj, ptr[i]); +} + +/** + * sysfs_remove_file_from_group - remove an attribute file from a group. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @group: group name. + */ +void sysfs_remove_file_from_group(struct kobject *kobj, + const struct attribute *attr, const char *group) +{ + struct sysfs_dirent *dir_sd; + + if (group) + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); + else + dir_sd = sysfs_get(kobj->sd); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, NULL, attr->name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); + +struct sysfs_schedule_callback_struct { + struct list_head workq_list; + struct kobject *kobj; + void (*func)(void *); + void *data; + struct module *owner; + struct work_struct work; +}; + +static struct workqueue_struct *sysfs_workqueue; +static DEFINE_MUTEX(sysfs_workq_mutex); +static LIST_HEAD(sysfs_workq); +static void sysfs_schedule_callback_work(struct work_struct *work) +{ + struct sysfs_schedule_callback_struct *ss = container_of(work, + struct sysfs_schedule_callback_struct, work); + + (ss->func)(ss->data); + kobject_put(ss->kobj); + module_put(ss->owner); + mutex_lock(&sysfs_workq_mutex); + list_del(&ss->workq_list); + mutex_unlock(&sysfs_workq_mutex); + kfree(ss); +} + +/** + * sysfs_schedule_callback - helper to schedule a callback for a kobject + * @kobj: object we're acting for. + * @func: callback function to invoke later. + * @data: argument to pass to @func. + * @owner: module owning the callback code + * + * sysfs attribute methods must not unregister themselves or their parent + * kobject (which would amount to the same thing). Attempts to do so will + * deadlock, since unregistration is mutually exclusive with driver + * callbacks. + * + * Instead methods can call this routine, which will attempt to allocate + * and schedule a workqueue request to call back @func with @data as its + * argument in the workqueue's process context. @kobj will be pinned + * until @func returns. + * + * Returns 0 if the request was submitted, -ENOMEM if storage could not + * be allocated, -ENODEV if a reference to @owner isn't available, + * -EAGAIN if a callback has already been scheduled for @kobj. 
+ */ +int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), + void *data, struct module *owner) +{ + struct sysfs_schedule_callback_struct *ss, *tmp; + + if (!try_module_get(owner)) + return -ENODEV; + + mutex_lock(&sysfs_workq_mutex); + list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) + if (ss->kobj == kobj) { + module_put(owner); + mutex_unlock(&sysfs_workq_mutex); + return -EAGAIN; + } + mutex_unlock(&sysfs_workq_mutex); + + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_singlethread_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + + ss = kmalloc(sizeof(*ss), GFP_KERNEL); + if (!ss) { + module_put(owner); + return -ENOMEM; + } + kobject_get(kobj); + ss->kobj = kobj; + ss->func = func; + ss->data = data; + ss->owner = owner; + INIT_WORK(&ss->work, sysfs_schedule_callback_work); + INIT_LIST_HEAD(&ss->workq_list); + mutex_lock(&sysfs_workq_mutex); + list_add_tail(&ss->workq_list, &sysfs_workq); + mutex_unlock(&sysfs_workq_mutex); + queue_work(sysfs_workqueue, &ss->work); + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_schedule_callback); + + +EXPORT_SYMBOL_GPL(sysfs_create_file); +EXPORT_SYMBOL_GPL(sysfs_remove_file); +EXPORT_SYMBOL_GPL(sysfs_remove_files); +EXPORT_SYMBOL_GPL(sysfs_create_files); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c new file mode 100644 index 00000000..194414f8 --- /dev/null +++ b/fs/sysfs/group.c @@ -0,0 +1,207 @@ +/* + * fs/sysfs/group.c - Operations for adding/removing multiple files at once. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * + * This file is released under the GPL v2. + * + */ + +#include +#include +#include +#include +#include +#include "sysfs.h" + + +static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, + const struct attribute_group *grp) +{ + struct attribute *const* attr; + int i; + + for (i = 0, attr = grp->attrs; *attr; i++, attr++) + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); +} + +static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, + const struct attribute_group *grp, int update) +{ + struct attribute *const* attr; + int error = 0, i; + + for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { + mode_t mode = 0; + + /* in update mode, we're changing the permissions or + * visibility.
Do this by first removing then + * re-adding (if required) the file */ + if (update) + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + if (grp->is_visible) { + mode = grp->is_visible(kobj, *attr, i); + if (!mode) + continue; + } + error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR, + (*attr)->mode | mode); + if (unlikely(error)) + break; + } + if (error) + remove_files(dir_sd, kobj, grp); + return error; +} + + +static int internal_create_group(struct kobject *kobj, int update, + const struct attribute_group *grp) +{ + struct sysfs_dirent *sd; + int error; + + BUG_ON(!kobj || (!update && !kobj->sd)); + + /* Updates may happen before the object has been instantiated */ + if (unlikely(update && !kobj->sd)) + return -EINVAL; + + if (grp->name) { + error = sysfs_create_subdir(kobj, grp->name, &sd); + if (error) + return error; + } else + sd = kobj->sd; + sysfs_get(sd); + error = create_files(sd, kobj, grp, update); + if (error) { + if (grp->name) + sysfs_remove_subdir(sd); + } + sysfs_put(sd); + return error; +} + +/** + * sysfs_create_group - given a directory kobject, create an attribute group + * @kobj: The kobject to create the group on + * @grp: The attribute group to create + * + * This function creates a group for the first time. It will explicitly + * warn and error if any of the attribute files being created already exist. + * + * Returns 0 on success or error. + */ +int sysfs_create_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 0, grp); +} + +/** + * sysfs_update_group - given a directory kobject, update an attribute group + * @kobj: The kobject to update the group on + * @grp: The attribute group to update + * + * This function updates an attribute group. Unlike + * sysfs_create_group(), it will explicitly not warn or error if any + * of the attribute files being created already exist. Furthermore, + * if the visibility of the files has changed through the is_visible() + * callback, it will update the permissions and add or remove the + * relevant files. + * + * The primary use for this function is to call it after making a change + * that affects group visibility. + * + * Returns 0 on success or error. + */ +int sysfs_update_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 1, grp); +} + + + +void sysfs_remove_group(struct kobject * kobj, + const struct attribute_group * grp) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + struct sysfs_dirent *sd; + + if (grp->name) { + sd = sysfs_get_dirent(dir_sd, NULL, grp->name); + if (!sd) { + WARN(!sd, KERN_WARNING "sysfs group %p not found for " + "kobject '%s'\n", grp, kobject_name(kobj)); + return; + } + } else + sd = sysfs_get(dir_sd); + + remove_files(sd, kobj, grp); + if (grp->name) + sysfs_remove_subdir(sd); + + sysfs_put(sd); +} + +/** + * sysfs_merge_group - merge files into a pre-existing attribute group. + * @kobj: The kobject containing the group. + * @grp: The files to create and the attribute group they belong to. + * + * This function returns an error if the group doesn't exist or any of the + * files already exist in that group, in which case none of the new files + * are created. 
+ */ +int sysfs_merge_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct sysfs_dirent *dir_sd; + int error = 0; + struct attribute *const *attr; + int i; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); + if (!dir_sd) + return -ENOENT; + + for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) + error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + if (error) { + while (--i >= 0) + sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name); + } + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_merge_group); + +/** + * sysfs_unmerge_group - remove files from a pre-existing attribute group. + * @kobj: The kobject containing the group. + * @grp: The files to remove and the attribute group they belong to. + */ +void sysfs_unmerge_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct sysfs_dirent *dir_sd; + struct attribute *const *attr; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); + if (dir_sd) { + for (attr = grp->attrs; *attr; ++attr) + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_unmerge_group); + + +EXPORT_SYMBOL_GPL(sysfs_create_group); +EXPORT_SYMBOL_GPL(sysfs_update_group); +EXPORT_SYMBOL_GPL(sysfs_remove_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c new file mode 100644 index 00000000..a494413e --- /dev/null +++ b/fs/sysfs/inode.c @@ -0,0 +1,367 @@ +/* + * fs/sysfs/inode.c - basic sysfs inode and dentry operations + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysfs.h" + +extern struct super_block * sysfs_sb; + +static const struct address_space_operations sysfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations sysfs_inode_operations ={ + .permission = sysfs_permission, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .setxattr = sysfs_setxattr, +}; + +int __init sysfs_inode_init(void) +{ + return bdi_init(&sysfs_backing_dev_info); +} + +static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} + +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) +{ + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + sd_attrs = sd->s_iattr; + + if (!sd_attrs) { + /* setting attributes for the first time, allocate now */ + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) + return -ENOMEM; + sd->s_iattr = sd_attrs; + } + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + 
iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = sd->s_mode = mode; + } + return 0; +} + +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct sysfs_dirent *sd = dentry->d_fsdata; + int error; + + if (!sd) + return -EINVAL; + + mutex_lock(&sysfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = sysfs_sd_setattr(sd, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&sysfs_mutex); + return error; +} + +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) +{ + struct sysfs_inode_attrs *iattrs; + void *old_secdata; + size_t old_secdata_len; + + if (!sd->s_iattr) { + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + } + + iattrs = sd->s_iattr; + old_secdata = iattrs->ia_secdata; + old_secdata_len = iattrs->ia_secdata_len; + + iattrs->ia_secdata = *secdata; + iattrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); + mutex_unlock(&sysfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + } else + return -EINVAL; +out: + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static int sysfs_count_nlink(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *child; + int nr = 0; + + for (child = sd->s_dir.children; child; child = child->s_sibling) + if (sysfs_type(child) == SYSFS_DIR) + nr++; + + return nr + 2; +} + +static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct sysfs_inode_attrs *iattrs = sd->s_iattr; + + inode->i_mode = sd->s_mode; + if (iattrs) { + /* sysfs_dirent has non-default attributes + * get them from persistent copy in sysfs_dirent + */ + set_inode_attr(inode, &iattrs->ia_iattr); + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); + } + + if (sysfs_type(sd) == SYSFS_DIR) + inode->i_nlink = sysfs_count_nlink(sd); +} + +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct 
sysfs_dirent *sd = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct bin_attribute *bin_attr; + + inode->i_private = sysfs_get(sd); + inode->i_mapping->a_ops = &sysfs_aops; + inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; + inode->i_op = &sysfs_inode_operations; + + set_default_inode_attr(inode, sd->s_mode); + sysfs_refresh_inode(sd, inode); + + /* initialize inode according to type */ + switch (sysfs_type(sd)) { + case SYSFS_DIR: + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + break; + case SYSFS_KOBJ_ATTR: + inode->i_size = PAGE_SIZE; + inode->i_fop = &sysfs_file_operations; + break; + case SYSFS_KOBJ_BIN_ATTR: + bin_attr = sd->s_bin_attr.bin_attr; + inode->i_size = bin_attr->size; + inode->i_fop = &bin_fops; + break; + case SYSFS_KOBJ_LINK: + inode->i_op = &sysfs_symlink_inode_operations; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * sysfs_get_inode - get inode for sysfs_dirent + * @sb: super block + * @sd: sysfs_dirent to allocate inode for + * + * Get inode for @sd. If such inode doesn't exist, a new inode + * is allocated and basics are initialized. New inode is + * returned locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) +{ + struct inode *inode; + + inode = iget_locked(sb, sd->s_ino); + if (inode && (inode->i_state & I_NEW)) + sysfs_init_inode(sd, inode); + + return inode; +} + +/* + * The sysfs_dirent serves as both an inode and a directory entry for sysfs. + * To prevent the sysfs inode numbers from being freed prematurely we take a + * reference to sysfs_dirent from the sysfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void sysfs_evict_inode(struct inode *inode) +{ + struct sysfs_dirent *sd = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + sysfs_put(sd); +} + +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + + if (!dir_sd) + return -ENOENT; + + sysfs_addrm_start(&acxt, dir_sd); + + sd = sysfs_find_dirent(dir_sd, ns, name); + if (sd && (sd->s_ns != ns)) + sd = NULL; + if (sd) + sysfs_remove_one(&acxt, sd); + + sysfs_addrm_finish(&acxt); + + if (sd) + return 0; + else + return -ENOENT; +} + +int sysfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct sysfs_dirent *sd; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + sd = inode->i_private; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + return generic_permission(inode, mask, flags, NULL); +} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c new file mode 100644 index 00000000..e34f0d99 --- /dev/null +++ b/fs/sysfs/mount.c @@ -0,0 +1,201 @@ +/* + * fs/sysfs/symlink.c - operations for initializing and mounting sysfs + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. 
+ * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" + + +static struct vfsmount *sysfs_mnt; +struct kmem_cache *sysfs_dir_cachep; + +static const struct super_operations sysfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = sysfs_evict_inode, +}; + +struct sysfs_dirent sysfs_root = { + .s_name = "", + .s_count = ATOMIC_INIT(1), + .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), + .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + .s_ino = 1, +}; + +static int sysfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = SYSFS_MAGIC; + sb->s_op = &sysfs_ops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&sysfs_mutex); + inode = sysfs_get_inode(sb, &sysfs_root); + mutex_unlock(&sysfs_mutex); + if (!inode) { + pr_debug("sysfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_alloc_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n",__func__); + iput(inode); + return -ENOMEM; + } + root->d_fsdata = &sysfs_root; + sb->s_root = root; + return 0; +} + +static int sysfs_test_super(struct super_block *sb, void *data) +{ + struct sysfs_super_info *sb_info = sysfs_info(sb); + struct sysfs_super_info *info = data; + enum kobj_ns_type type; + int found = 1; + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (sb_info->ns[type] != info->ns[type]) + found = 0; + } + return found; +} + +static int sysfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +static void free_sysfs_super_info(struct sysfs_super_info *info) +{ + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); + kfree(info); +} + +static struct dentry *sysfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct sysfs_super_info *info; + enum kobj_ns_type type; + struct super_block *sb; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + info->ns[type] = kobj_ns_grab_current(type); + + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + free_sysfs_super_info(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + if (!sb->s_root) { + sb->s_flags = flags; + error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +static void sysfs_kill_sb(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + /* Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing sysfs_super_info. 
+ */ + kill_anon_super(sb); + free_sysfs_super_info(info); +} + +static struct file_system_type sysfs_fs_type = { + .name = "sysfs", + .mount = sysfs_mount, + .kill_sb = sysfs_kill_sb, +}; + +int __init sysfs_init(void) +{ + int err = -ENOMEM; + + sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", + sizeof(struct sysfs_dirent), + 0, 0, NULL); + if (!sysfs_dir_cachep) + goto out; + + err = sysfs_inode_init(); + if (err) + goto out_err; + + err = register_filesystem(&sysfs_fs_type); + if (!err) { + sysfs_mnt = kern_mount(&sysfs_fs_type); + if (IS_ERR(sysfs_mnt)) { + printk(KERN_ERR "sysfs: could not mount!\n"); + err = PTR_ERR(sysfs_mnt); + sysfs_mnt = NULL; + unregister_filesystem(&sysfs_fs_type); + goto out_err; + } + } else + goto out_err; +out: + return err; +out_err: + kmem_cache_destroy(sysfs_dir_cachep); + sysfs_dir_cachep = NULL; + goto out; +} + +#undef sysfs_get +struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +{ + return __sysfs_get(sd); +} +EXPORT_SYMBOL_GPL(sysfs_get); + +#undef sysfs_put +void sysfs_put(struct sysfs_dirent *sd) +{ + __sysfs_put(sd); +} +EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c new file mode 100644 index 00000000..a7ac78f8 --- /dev/null +++ b/fs/sysfs/symlink.c @@ -0,0 +1,307 @@ +/* + * fs/sysfs/symlink.c - sysfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct sysfs_dirent *parent_sd = NULL; + struct sysfs_dirent *target_sd = NULL; + struct sysfs_dirent *sd = NULL; + struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; + int error; + + BUG_ON(!name); + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + error = -EFAULT; + if (!parent_sd) + goto out_put; + + /* target->sd can go away beneath us but is protected with + * sysfs_assoc_lock. Fetch target_sd from it. + */ + spin_lock(&sysfs_assoc_lock); + if (target->sd) + target_sd = sysfs_get(target->sd); + spin_unlock(&sysfs_assoc_lock); + + error = -ENOENT; + if (!target_sd) + goto out_put; + + error = -ENOMEM; + sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + if (!sd) + goto out_put; + + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) + sd->s_ns = target->ktype->namespace(target); + sd->s_symlink.target_sd = target_sd; + target_sd = NULL; /* reference is now owned by the symlink */ + + sysfs_addrm_start(&acxt, parent_sd); + /* Symlinks must be between directories with the same ns_type */ + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } + sysfs_addrm_finish(&acxt); + + if (error) + goto out_put; + + return 0; + + out_put: + sysfs_put(target_sd); + sysfs_put(sd); + return error; +} + +/** + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. 
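/*
 * Illustrative aside (not part of the original patch): sysfs_init() above
 * builds its state in stages (inode cache, filesystem registration, internal
 * mount) and on failure unwinds only the stages that already succeeded.
 * A minimal userspace sketch of that staged-setup/teardown shape, with
 * hypothetical resources standing in for the real ones, is:
 */
#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
	void *cache, *registration;
	int err = -1;

	cache = malloc(64);		/* stage 1: the kmem_cache analogue */
	if (!cache)
		goto out;

	registration = malloc(64);	/* stage 2: register_filesystem analogue */
	if (!registration)
		goto out_cache;

	printf("all stages up\n");	/* both resources kept (leaked here for brevity) */
	return 0;

out_cache:
	free(cache);			/* undo stage 1 only if stage 2 failed */
out:
	return err;
}

int main(void)
{
	return setup() ? 1 : 0;
}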
+ * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 1); +} + +/** + * sysfs_create_link_nowarn - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + * + * This function does the same as sysf_create_link(), but it + * doesn't warn if the link already exists. + */ +int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 0); +} + +/** + * sysfs_delete_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @name: name of the symlink to remove. + * + * Unlike sysfs_remove_link sysfs_delete_link has enough information + * to successfully delete symlinks in tagged directories. + */ +void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, + const char *name) +{ + const void *ns = NULL; + spin_lock(&sysfs_assoc_lock); + if (targ->sd && sysfs_ns_type(kobj->sd)) + ns = targ->sd->s_ns; + spin_unlock(&sysfs_assoc_lock); + sysfs_hash_and_remove(kobj->sd, ns, name); +} + +/** + * sysfs_remove_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @name: name of the symlink to remove. + */ + +void sysfs_remove_link(struct kobject * kobj, const char * name) +{ + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + sysfs_hash_and_remove(parent_sd, NULL, name); +} + +/** + * sysfs_rename_link - rename symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @old: previous name of the symlink. + * @new: new name of the symlink. + * + * A helper function for the common rename symlink idiom. 
+ */ +int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, + const char *old, const char *new) +{ + struct sysfs_dirent *parent_sd, *sd = NULL; + const void *old_ns = NULL, *new_ns = NULL; + int result; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + if (targ->sd) + old_ns = targ->sd->s_ns; + + result = -ENOENT; + sd = sysfs_get_dirent(parent_sd, old_ns, old); + if (!sd) + goto out; + + result = -EINVAL; + if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + goto out; + if (sd->s_symlink.target_sd->s_dir.kobj != targ) + goto out; + + if (sysfs_ns_type(parent_sd)) + new_ns = targ->ktype->namespace(targ); + + result = sysfs_rename(sd, parent_sd, new_ns, new); + +out: + sysfs_put(sd); + return result; +} + +static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, + struct sysfs_dirent *target_sd, char *path) +{ + struct sysfs_dirent *base, *sd; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent_sd; + while (base->s_parent) { + sd = target_sd->s_parent; + while (sd->s_parent && base != sd) + sd = sd->s_parent; + + if (base == sd) + break; + + strcpy(s, "../"); + s += 3; + base = base->s_parent; + } + + /* determine end of target string for reverse fillup */ + sd = target_sd; + while (sd->s_parent && sd != base) { + len += strlen(sd->s_name) + 1; + sd = sd->s_parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + sd = target_sd; + while (sd->s_parent && sd != base) { + int slen = strlen(sd->s_name); + + len -= slen; + strncpy(s + len, sd->s_name, slen); + if (len) + s[--len] = '/'; + + sd = sd->s_parent; + } + + return 0; +} + +static int sysfs_getlink(struct dentry *dentry, char * path) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_dirent *parent_sd = sd->s_parent; + struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; + int error; + + mutex_lock(&sysfs_mutex); + error = sysfs_get_target_path(parent_sd, target_sd, path); + mutex_unlock(&sysfs_mutex); + + return error; +} + +static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = sysfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, + .readlink = generic_readlink, + .follow_link = sysfs_follow_link, + .put_link = sysfs_put_link, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .permission = sysfs_permission, +}; + + +EXPORT_SYMBOL_GPL(sysfs_create_link); +EXPORT_SYMBOL_GPL(sysfs_remove_link); +EXPORT_SYMBOL_GPL(sysfs_rename_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h new file mode 100644 index 00000000..2ed2404f --- /dev/null +++ b/fs/sysfs/sysfs.h @@ -0,0 +1,231 @@ +/* + * fs/sysfs/sysfs.h - sysfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. 
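/*
 * Illustrative aside (not part of the original patch): sysfs_get_target_path()
 * above builds the link body by walking up from the link's parent to the
 * closest common ancestor (emitting one "../" per level) and then filling in
 * the target's component names from that ancestor downwards. A simplified
 * userspace sketch of the same idea on plain path strings (hypothetical
 * helper, no PATH_MAX handling, not necessarily the shortest result) is:
 */
#include <stdio.h>
#include <string.h>

/* number of path components in a '/'-separated string like "a/b/c" */
static int depth(const char *p)
{
	int d = *p ? 1 : 0;

	for (; *p; p++)
		if (*p == '/')
			d++;
	return d;
}

static void make_relative(const char *parent, const char *target, char *out)
{
	size_t common = 0;

	/* longest common prefix that ends on a component boundary */
	while (parent[common] && parent[common] == target[common])
		common++;
	while (common && parent[common - 1] != '/')
		common--;

	out[0] = '\0';
	for (int i = 0; i < depth(parent + common); i++)
		strcat(out, "../");		/* climb to the common ancestor */
	strcat(out, target + common);		/* descend to the target */
}

int main(void)
{
	char buf[256];

	make_relative("devices/pci0000:00", "bus/pci/drivers", buf);
	printf("%s\n", buf);	/* ../../bus/pci/drivers */
	return 0;
}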
+ */ + +#include +#include +#include + +struct sysfs_open_dirent; + +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + struct kobject *kobj; + /* children list starts here and goes through sd->s_sibling */ + struct sysfs_dirent *children; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + struct attribute *attr; + struct sysfs_open_dirent *open; +}; + +struct sysfs_elem_bin_attr { + struct bin_attribute *bin_attr; + struct hlist_head buffers; +}; + +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and + * every sysfs node is represented by single sysfs_dirent. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. + */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif + struct sysfs_dirent *s_parent; + struct sysfs_dirent *s_sibling; + const char *s_name; + + const void *s_ns; /* namespace tag */ + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + struct sysfs_elem_bin_attr s_bin_attr; + }; + + unsigned int s_flags; + unsigned short s_mode; + ino_t s_ino; + struct sysfs_inode_attrs *s_iattr; +}; + +#define SD_DEACTIVATED_BIAS INT_MIN + +#define SYSFS_TYPE_MASK 0x00ff +#define SYSFS_DIR 0x0001 +#define SYSFS_KOBJ_ATTR 0x0002 +#define SYSFS_KOBJ_BIN_ATTR 0x0004 +#define SYSFS_KOBJ_LINK 0x0008 +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) +#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) + +/* identify any namespace tag on sysfs_dirents */ +#define SYSFS_NS_TYPE_MASK 0xff00 +#define SYSFS_NS_TYPE_SHIFT 8 + +#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) +#define SYSFS_FLAG_REMOVED 0x020000 + +static inline unsigned int sysfs_type(struct sysfs_dirent *sd) +{ + return sd->s_flags & SYSFS_TYPE_MASK; +} + +/* + * Return any namespace tags on this dirent. + * enum kobj_ns_type is defined in linux/kobject.h + */ +static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) +{ + return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define sysfs_dirent_init_lockdep(sd) \ +do { \ + struct attribute *attr = sd->s_attr.attr; \ + struct lock_class_key *key = attr->key; \ + if (!key) \ + key = &attr->skey; \ + \ + lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ +} while(0) +#else +#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#endif + +/* + * Context structure to be used while adding/removing nodes. + */ +struct sysfs_addrm_cxt { + struct sysfs_dirent *parent_sd; + struct sysfs_dirent *removed; +}; + +/* + * mount.c + */ + +/* + * Each sb is associated with a set of namespace tags (i.e. + * the network namespace of the task which mounted this sysfs + * instance). 
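/*
 * Illustrative aside (not part of the original patch): in the header above a
 * single s_flags word carries both the dirent type (low byte, SYSFS_TYPE_MASK)
 * and the namespace type (next byte, shifted by SYSFS_NS_TYPE_SHIFT). A
 * self-contained sketch of that mask-and-shift packing, with toy values, is:
 */
#include <stdio.h>

#define TOY_TYPE_MASK     0x00ff
#define TOY_NS_TYPE_MASK  0xff00
#define TOY_NS_TYPE_SHIFT 8

enum toy_type   { TOY_DIR = 0x01, TOY_ATTR = 0x02, TOY_LINK = 0x08 };
enum toy_nstype { TOY_NS_NONE = 0, TOY_NS_NET = 1 };

static unsigned int pack(enum toy_type t, enum toy_nstype ns)
{
	return (unsigned int)t | ((unsigned int)ns << TOY_NS_TYPE_SHIFT);
}

int main(void)
{
	unsigned int flags = pack(TOY_DIR, TOY_NS_NET);

	printf("type 0x%02x, ns type %u\n",
	       flags & TOY_TYPE_MASK,
	       (flags & TOY_NS_TYPE_MASK) >> TOY_NS_TYPE_SHIFT);
	return 0;
}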
+ */ +struct sysfs_super_info { + void *ns[KOBJ_NS_TYPES]; +}; +#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) +extern struct sysfs_dirent sysfs_root; +extern struct kmem_cache *sysfs_dir_cachep; + +/* + * dir.c + */ +extern struct mutex sysfs_mutex; +extern spinlock_t sysfs_assoc_lock; + +extern const struct file_operations sysfs_dir_operations; +extern const struct inode_operations sysfs_dir_inode_operations; + +struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); +void sysfs_put_active(struct sysfs_dirent *sd); +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *parent_sd); +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); + +struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name); +struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name); +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); + +void release_sysfs_dirent(struct sysfs_dirent *sd); + +int sysfs_create_subdir(struct kobject *kobj, const char *name, + struct sysfs_dirent **p_sd); +void sysfs_remove_subdir(struct sysfs_dirent *sd); + +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name); + +static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} +#define sysfs_get(sd) __sysfs_get(sd) + +static inline void __sysfs_put(struct sysfs_dirent *sd) +{ + if (sd && atomic_dec_and_test(&sd->s_count)) + release_sysfs_dirent(sd); +} +#define sysfs_put(sd) __sysfs_put(sd) + +/* + * inode.c + */ +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); +void sysfs_evict_inode(struct inode *inode); +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); +int sysfs_permission(struct inode *inode, int mask, unsigned int flags); +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name); +int sysfs_inode_init(void); + +/* + * file.c + */ +extern const struct file_operations sysfs_file_operations; + +int sysfs_add_file(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type); + +int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, mode_t amode); +/* + * bin.c + */ +extern const struct file_operations bin_fops; +void unmap_bin_file(struct sysfs_dirent *attr_sd); + +/* + * symlink.c + */ +extern const struct inode_operations sysfs_symlink_inode_operations; diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig new file mode 100644 index 00000000..33aeb4b7 --- /dev/null +++ b/fs/sysv/Kconfig @@ -0,0 +1,36 @@ +config SYSV_FS + tristate "System V/Xenix/V7/Coherent file system support" + depends on BLOCK + help + SCO, Xenix and Coherent are commercial Unix systems for Intel + machines, and Version 7 was used on 
the DEC PDP-11. Saying Y + here would allow you to read from their floppies and hard disk + partitions. + + If you have floppies or hard disk partitions like that, it is likely + that they contain binaries from those other Unix systems; in order + to run these binaries, you will want to install linux-abi which is + a set of kernel modules that lets you run SCO, Xenix, Wyse, + UnixWare, Dell Unix and System V programs under Linux. It is + available via FTP (user: ftp) from + ). + NOTE: that will work only for binaries from Intel-based systems; + PDP ones will have to wait until somebody ports Linux to -11 ;-) + + If you only intend to mount files from some other Unix over the + network using NFS, you don't need the System V file system support + (but you need NFS file system support obviously). + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). Note also that this option has + nothing whatsoever to do with the option "System V IPC". Read about + the System V file system in + . + Saying Y here will enlarge your kernel by about 27 KB. + + To compile this as a module, choose M here: the module will be called + sysv. + + If you haven't heard about all of this before, it's safe to say N. diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile new file mode 100644 index 00000000..3591f9d7 --- /dev/null +++ b/fs/sysv/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux SystemV/Coherent filesystem routines. +# + +obj-$(CONFIG_SYSV_FS) += sysv.o + +sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ + namei.o super.o symlink.o diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c new file mode 100644 index 00000000..9a6ad96a --- /dev/null +++ b/fs/sysv/balloc.c @@ -0,0 +1,239 @@ +/* + * linux/fs/sysv/balloc.c + * + * minix/bitmap.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext/freelists.c + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * + * xenix/alloc.c + * Copyright (C) 1992 Doug Evans + * + * coh/alloc.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/balloc.c + * Copyright (C) 1993 Bruno Haible + * + * This file contains code for allocating/freeing blocks. + */ + +#include +#include +#include "sysv.h" + +/* We don't trust the value of + sb->sv_sbd2->s_tfree = *sb->sv_free_blocks + but we nevertheless keep it up to date. */ + +static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh) +{ + char *bh_data = bh->b_data; + + if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4) + return (sysv_zone_t*)(bh_data+4); + else + return (sysv_zone_t*)(bh_data+2); +} + +/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */ + +void sysv_free_block(struct super_block * sb, sysv_zone_t nr) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + sysv_zone_t *blocks = sbi->s_bcache; + unsigned count; + unsigned block = fs32_to_cpu(sbi, nr); + + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only no one + * should call this for an AFS filesystem anyway... 
+ */ + if (sbi->s_type == FSTYPE_AFS) + return; + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("sysv_free_block: trying to free block not in datazone\n"); + return; + } + + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_bcache_count); + + if (count > sbi->s_flc_size) { + printk("sysv_free_block: flc_count > flc_size\n"); + unlock_super(sb); + return; + } + /* If the free list head in super-block is full, it is copied + * into this block being freed, ditto if it's completely empty + * (applies only on Coherent). + */ + if (count == sbi->s_flc_size || count == 0) { + block += sbi->s_block_base; + bh = sb_getblk(sb, block); + if (!bh) { + printk("sysv_free_block: getblk() failed\n"); + unlock_super(sb); + return; + } + memset(bh->b_data, 0, sb->s_blocksize); + *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count); + memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t)); + mark_buffer_dirty(bh); + set_buffer_uptodate(bh); + brelse(bh); + count = 0; + } + sbi->s_bcache[count++] = nr; + + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + fs32_add(sbi, sbi->s_free_blocks, 1); + dirty_sb(sb); + unlock_super(sb); +} + +sysv_zone_t sysv_new_block(struct super_block * sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned int block; + sysv_zone_t nr; + struct buffer_head * bh; + unsigned count; + + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_bcache_count); + + if (count == 0) /* Applies only to Coherent FS */ + goto Enospc; + nr = sbi->s_bcache[--count]; + if (nr == 0) /* Applies only to Xenix FS, SystemV FS */ + goto Enospc; + + block = fs32_to_cpu(sbi, nr); + + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("sysv_new_block: new block %d is not in data zone\n", + block); + goto Enospc; + } + + if (count == 0) { /* the last block continues the free list */ + unsigned count; + + block += sbi->s_block_base; + if (!(bh = sb_bread(sb, block))) { + printk("sysv_new_block: cannot read free-list block\n"); + /* retry this same block next time */ + *sbi->s_bcache_count = cpu_to_fs16(sbi, 1); + goto Enospc; + } + count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); + if (count > sbi->s_flc_size) { + printk("sysv_new_block: free-list block with >flc_size entries\n"); + brelse(bh); + goto Enospc; + } + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + memcpy(sbi->s_bcache, get_chunk(sb, bh), + count * sizeof(sysv_zone_t)); + brelse(bh); + } + /* Now the free list head in the superblock is valid again. */ + fs32_add(sbi, sbi->s_free_blocks, -1); + dirty_sb(sb); + unlock_super(sb); + return nr; + +Enospc: + unlock_super(sb); + return 0; +} + +unsigned long sysv_count_free_blocks(struct super_block * sb) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + int sb_count; + int count; + struct buffer_head * bh = NULL; + sysv_zone_t *blocks; + unsigned block; + int n; + + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only we just + * lie and say it has no free block at all. + */ + if (sbi->s_type == FSTYPE_AFS) + return 0; + + lock_super(sb); + sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); + + if (0) + goto trust_sb; + + /* this causes a lot of disk traffic ... 
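/*
 * Illustrative aside (not part of the original patch): sysv_free_block() and
 * sysv_new_block() above keep a small in-core cache of free block numbers
 * and, when the cache fills up, spill it into the block being freed, which
 * then becomes the next link of an on-disk chain. A compact userspace model
 * of that chained free list (toy "disk", no endianness or locking) is:
 */
#include <stdio.h>

#define NBLOCKS   32
#define FLC_SIZE  4

static unsigned disk[NBLOCKS][FLC_SIZE + 1];	/* [0] = count, rest = list */
static unsigned cache[FLC_SIZE], cache_count;

static void toy_free_block(unsigned nr)
{
	if (cache_count == FLC_SIZE) {		/* spill the cache into block nr */
		disk[nr][0] = cache_count;
		for (unsigned i = 0; i < cache_count; i++)
			disk[nr][i + 1] = cache[i];
		cache_count = 0;
	}
	cache[cache_count++] = nr;
}

static unsigned toy_new_block(void)
{
	unsigned nr;

	if (!cache_count)
		return 0;			/* out of space */
	nr = cache[--cache_count];
	if (!cache_count && disk[nr][0]) {	/* reload the chain stored in nr */
		cache_count = disk[nr][0];
		for (unsigned i = 0; i < cache_count; i++)
			cache[i] = disk[nr][i + 1];
	}
	return nr;
}

int main(void)
{
	for (unsigned nr = 10; nr < 20; nr++)	/* free ten blocks */
		toy_free_block(nr);
	for (int i = 0; i < 12; i++)		/* allocate until exhausted */
		printf("%u ", toy_new_block());
	printf("\n");
	return 0;
}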
*/ + count = 0; + n = fs16_to_cpu(sbi, *sbi->s_bcache_count); + blocks = sbi->s_bcache; + while (1) { + sysv_zone_t zone; + if (n > sbi->s_flc_size) + goto E2big; + zone = 0; + while (n && (zone = blocks[--n]) != 0) + count++; + if (zone == 0) + break; + + block = fs32_to_cpu(sbi, zone); + if (bh) + brelse(bh); + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) + goto Einval; + block += sbi->s_block_base; + bh = sb_bread(sb, block); + if (!bh) + goto Eio; + n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); + blocks = get_chunk(sb, bh); + } + if (bh) + brelse(bh); + if (count != sb_count) + goto Ecount; +done: + unlock_super(sb); + return count; + +Einval: + printk("sysv_count_free_blocks: new block %d is not in data zone\n", + block); + goto trust_sb; +Eio: + printk("sysv_count_free_blocks: cannot read free-list block\n"); + goto trust_sb; +E2big: + printk("sysv_count_free_blocks: >flc_size entries in free-list block\n"); + if (bh) + brelse(bh); +trust_sb: + count = sb_count; + goto done; +Ecount: + printk("sysv_count_free_blocks: free block count was %d, " + "correcting to %d\n", sb_count, count); + if (!(sb->s_flags & MS_RDONLY)) { + *sbi->s_free_blocks = cpu_to_fs32(sbi, count); + dirty_sb(sb); + } + goto done; +} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c new file mode 100644 index 00000000..a77c4215 --- /dev/null +++ b/fs/sysv/dir.c @@ -0,0 +1,377 @@ +/* + * linux/fs/sysv/dir.c + * + * minix/dir.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/dir.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/dir.c + * Copyright (C) 1993 Bruno Haible + * + * SystemV/Coherent directory handling functions + */ + +#include +#include +#include +#include "sysv.h" + +static int sysv_readdir(struct file *, void *, filldir_t); + +const struct file_operations sysv_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = sysv_readdir, + .fsync = generic_file_fsync, +}; + +static inline void dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static struct page * dir_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned long pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + + pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); + if (pos >= inode->i_size) + goto done; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct sysv_dir_entry *de; + struct page *page = dir_get_page(inode, n); + + if (IS_ERR(page)) + continue; + kaddr = (char *)page_address(page); + de = (struct sysv_dir_entry 
*)(kaddr+offset); + limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; + for ( ;(char*)de <= limit; de++) { + char *name = de->name; + int over; + + if (!de->inode) + continue; + + offset = (char *)de - kaddr; + + over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), + ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, + fs16_to_cpu(SYSV_SB(sb), de->inode), + DT_UNKNOWN); + if (over) { + dir_put_page(page); + goto done; + } + } + dir_put_page(page); + } + +done: + filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; + return 0; +} + +/* compare strings: name[0..len-1] (not zero-terminated) and + * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) + */ +static inline int namecompare(int len, int maxlen, + const char * name, const char * buffer) +{ + if (len < maxlen && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +/* + * sysv_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + */ +struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) +{ + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct inode * dir = dentry->d_parent->d_inode; + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct sysv_dir_entry *de; + + *res_page = NULL; + + start = SYSV_I(dir)->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + + do { + char *kaddr; + page = dir_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = (char*)page_address(page); + de = (struct sysv_dir_entry *) kaddr; + kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + for ( ; (char *) de <= kaddr ; de++) { + if (!de->inode) + continue; + if (namecompare(namelen, SYSV_NAMELEN, + name, de->name)) + goto found; + } + dir_put_page(page); + } + + if (++n >= npages) + n = 0; + } while (n != start); + + return NULL; + +found: + SYSV_I(dir)->i_dir_start_lookup = n; + *res_page = page; + return de; +} + +int sysv_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct page *page = NULL; + struct sysv_dir_entry * de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + /* We take care of directory expansion in the same loop */ + for (n = 0; n <= npages; n++) { + page = dir_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + kaddr = (char*)page_address(page); + de = (struct sysv_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + while ((char *)de <= kaddr) { + if (!de->inode) + goto got_it; + err = -EEXIST; + if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) + goto out_page; + de++; + } + dir_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + if (err) + goto out_unlock; + memcpy (de->name, name, namelen); + memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +out_page: + dir_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_page; +} + +int
sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr = (char*)page_address(page); + loff_t pos = page_offset(page) + (char *)de - kaddr; + int err; + + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + BUG_ON(err); + de->inode = 0; + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_put_page(page); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return err; +} + +int sysv_make_empty(struct inode *inode, struct inode *dir) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + struct sysv_dir_entry * de; + char *base; + int err; + + if (!page) + return -ENOMEM; + err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE); + if (err) { + unlock_page(page); + goto fail; + } + kmap(page); + + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct sysv_dir_entry *) base; + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + strcpy(de->name,"."); + de++; + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); + strcpy(de->name,".."); + + kunmap(page); + err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int sysv_empty_dir(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct sysv_dir_entry * de; + page = dir_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = (char *)page_address(page); + de = (struct sysv_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE; + + for ( ;(char *)de <= kaddr; de++) { + if (!de->inode) + continue; + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (!de->name[1]) { + if (de->inode == cpu_to_fs16(SYSV_SB(sb), + inode->i_ino)) + continue; + goto not_empty; + } + if (de->name[1] != '.' 
|| de->name[2]) + goto not_empty; + } + dir_put_page(page); + } + return 1; + +not_empty: + dir_put_page(page); + return 0; +} + +/* Releases the page */ +void sysv_set_link(struct sysv_dir_entry *de, struct page *page, + struct inode *inode) +{ + struct inode *dir = page->mapping->host; + loff_t pos = page_offset(page) + + (char *)de-(char*)page_address(page); + int err; + + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + BUG_ON(err); + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + +struct sysv_dir_entry * sysv_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = dir_get_page(dir, 0); + struct sysv_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = (struct sysv_dir_entry*) page_address(page) + 1; + *p = page; + } + return de; +} + +ino_t sysv_inode_by_name(struct dentry *dentry) +{ + struct page *page; + struct sysv_dir_entry *de = sysv_find_entry (dentry, &page); + ino_t res = 0; + + if (de) { + res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); + dir_put_page(page); + } + return res; +} diff --git a/fs/sysv/file.c b/fs/sysv/file.c new file mode 100644 index 00000000..0a659395 --- /dev/null +++ b/fs/sysv/file.c @@ -0,0 +1,58 @@ +/* + * linux/fs/sysv/file.c + * + * minix/file.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/file.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/file.c + * Copyright (C) 1993 Bruno Haible + * + * SystemV/Coherent regular file handling primitives + */ + +#include "sysv.h" + +/* + * We have mostly NULLs here: the current defaults are OK for + * the coh filesystem. + */ +const struct file_operations sysv_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int sysv_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations sysv_file_inode_operations = { + .truncate = sysv_truncate, + .setattr = sysv_setattr, + .getattr = sysv_getattr, +}; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c new file mode 100644 index 00000000..0c96c98b --- /dev/null +++ b/fs/sysv/ialloc.c @@ -0,0 +1,234 @@ +/* + * linux/fs/sysv/ialloc.c + * + * minix/bitmap.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext/freelists.c + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * + * xenix/alloc.c + * Copyright (C) 1992 Doug Evans + * + * coh/alloc.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/ialloc.c + * Copyright (C) 1993 Bruno Haible + * + * This file contains code for allocating/freeing inodes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "sysv.h" + +/* We don't trust the value of + sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes + but we nevertheless keep it up to date. */ + +/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. 
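/*
 * Illustrative aside (not part of the original patch): a System V directory,
 * as handled by sysv_find_entry() and sysv_empty_dir() above, is just an
 * array of fixed-size slots (a 16-bit inode number followed by a short name),
 * and a directory counts as empty when every live slot is "." or "..". A toy
 * in-memory version of that scan, with hypothetical sizes, is:
 */
#include <stdio.h>
#include <string.h>

struct toy_dirent {
	unsigned short inode;	/* 0 means the slot is unused */
	char name[14];
};

static int toy_empty_dir(const struct toy_dirent *de, int nslots)
{
	for (int i = 0; i < nslots; i++) {
		if (!de[i].inode)
			continue;
		if (strcmp(de[i].name, ".") && strcmp(de[i].name, ".."))
			return 0;	/* found a real entry */
	}
	return 1;
}

int main(void)
{
	struct toy_dirent dir[4] = {
		{ 2, "." }, { 1, ".." }, { 0, "" }, { 5, "notes" },
	};

	printf("empty? %d\n", toy_empty_dir(dir, 4));	/* 0: "notes" exists */
	dir[3].inode = 0;				/* delete it */
	printf("empty? %d\n", toy_empty_dir(dir, 4));	/* 1 */
	return 0;
}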
*/ + +/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ +static inline sysv_ino_t * +sv_sb_fic_inode(struct super_block * sb, unsigned int i) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + if (sbi->s_bh1 == sbi->s_bh2) + return &sbi->s_sb_fic_inodes[i]; + else { + /* 512 byte Xenix FS */ + unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); + if (offset < 512) + return (sysv_ino_t*)(sbi->s_sbd1 + offset); + else + return (sysv_ino_t*)(sbi->s_sbd2 + offset); + } +} + +struct sysv_inode * +sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct sysv_inode *res; + int block = sbi->s_firstinodezone + sbi->s_block_base; + + block += (ino-1) >> sbi->s_inodes_per_block_bits; + *bh = sb_bread(sb, block); + if (!*bh) + return NULL; + res = (struct sysv_inode *)(*bh)->b_data; + return res + ((ino-1) & sbi->s_inodes_per_block_1); +} + +static int refill_free_cache(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + int i = 0, ino; + + ino = SYSV_ROOT_INO+1; + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto out; + while (ino <= sbi->s_ninodes) { + if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { + *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); + if (i == sbi->s_fic_size) + break; + } + if ((ino++ & sbi->s_inodes_per_block_1) == 0) { + brelse(bh); + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto out; + } else + raw_inode++; + } + brelse(bh); +out: + return i; +} + +void sysv_free_inode(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned int ino; + struct buffer_head * bh; + struct sysv_inode * raw_inode; + unsigned count; + + sb = inode->i_sb; + ino = inode->i_ino; + if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { + printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); + return; + } + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("sysv_free_inode: unable to read inode block on device " + "%s\n", inode->i_sb->s_id); + return; + } + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); + if (count < sbi->s_fic_size) { + *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); + *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); + } + fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); + dirty_sb(sb); + memset(raw_inode, 0, sizeof(struct sysv_inode)); + mark_buffer_dirty(bh); + unlock_super(sb); + brelse(bh); +} + +struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct inode *inode; + sysv_ino_t ino; + unsigned count; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE + }; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); + if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { + count = refill_free_cache(sb); + if (count == 0) { + iput(inode); + unlock_super(sb); + return ERR_PTR(-ENOSPC); + } + } + /* Now count > 0. 
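/*
 * Illustrative aside (not part of the original patch): refill_free_cache()
 * above walks the raw on-disk inode table and remembers up to s_fic_size
 * inode numbers whose i_mode and i_nlink are both zero. A stripped-down
 * userspace model of that refill scan (toy inode table, hypothetical sizes)
 * is:
 */
#include <stdio.h>

#define NINODES  16
#define FIC_SIZE 4

struct toy_raw_inode {
	unsigned short mode;
	unsigned short nlink;
};

static struct toy_raw_inode itable[NINODES + 1];	/* 1-based, like ino */
static unsigned short fic[FIC_SIZE];

static int toy_refill(void)
{
	int count = 0;

	/* start past the reserved inodes, as refill_free_cache() does with SYSV_ROOT_INO+1 */
	for (int ino = 3; ino <= NINODES && count < FIC_SIZE; ino++)
		if (itable[ino].mode == 0 && itable[ino].nlink == 0)
			fic[count++] = (unsigned short)ino;
	return count;
}

int main(void)
{
	for (int ino = 1; ino <= NINODES; ino++)	/* mark everything in use */
		itable[ino] = (struct toy_raw_inode){ 0100644, 1 };
	itable[5] = itable[9] = (struct toy_raw_inode){ 0, 0 };	/* two free slots */

	int n = toy_refill();
	for (int i = 0; i < n; i++)
		printf("free ino %u\n", fic[i]);
	return 0;
}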
*/ + ino = *sv_sb_fic_inode(sb,--count); + *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); + fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); + dirty_sb(sb); + inode_init_owner(inode, dir, mode); + inode->i_ino = fs16_to_cpu(sbi, ino); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); + SYSV_I(inode)->i_dir_start_lookup = 0; + insert_inode_hash(inode); + mark_inode_dirty(inode); + + sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ + mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ + /* That's it. */ + unlock_super(sb); + return inode; +} + +unsigned long sysv_count_free_inodes(struct super_block * sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + int ino, count, sb_count; + + lock_super(sb); + + sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); + + if (0) + goto trust_sb; + + /* this causes a lot of disk traffic ... */ + count = 0; + ino = SYSV_ROOT_INO+1; + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto Eio; + while (ino <= sbi->s_ninodes) { + if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) + count++; + if ((ino++ & sbi->s_inodes_per_block_1) == 0) { + brelse(bh); + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto Eio; + } else + raw_inode++; + } + brelse(bh); + if (count != sb_count) + goto Einval; +out: + unlock_super(sb); + return count; + +Einval: + printk("sysv_count_free_inodes: " + "free inode count was %d, correcting to %d\n", + sb_count, count); + if (!(sb->s_flags & MS_RDONLY)) { + *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); + dirty_sb(sb); + } + goto out; + +Eio: + printk("sysv_count_free_inodes: unable to read inode table\n"); +trust_sb: + count = sb_count; + goto out; +} diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c new file mode 100644 index 00000000..0630eb96 --- /dev/null +++ b/fs/sysv/inode.c @@ -0,0 +1,381 @@ +/* + * linux/fs/sysv/inode.c + * + * minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * xenix/inode.c + * Copyright (C) 1992 Doug Evans + * + * coh/inode.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/inode.c + * Copyright (C) 1993 Paul B. Monday + * + * sysv/inode.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + * + * This file contains code for allocating/freeing inodes and for read/writing + * the superblock. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysv.h" + +static int sysv_sync_fs(struct super_block *sb, int wait) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned long time = get_seconds(), old_time; + + lock_super(sb); + + /* + * If we are going to write out the super block, + * then attach current time stamp. + * But if the filesystem was marked clean, keep it clean. 
+ */ + sb->s_dirt = 0; + old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); + if (sbi->s_type == FSTYPE_SYSV4) { + if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) + *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time); + *sbi->s_sb_time = cpu_to_fs32(sbi, time); + mark_buffer_dirty(sbi->s_bh2); + } + + unlock_super(sb); + + return 0; +} + +static void sysv_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + sysv_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static int sysv_remount(struct super_block *sb, int *flags, char *data) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + lock_super(sb); + if (sbi->s_forced_ro) + *flags |= MS_RDONLY; + if (*flags & MS_RDONLY) + sysv_write_super(sb); + unlock_super(sb); + return 0; +} + +static void sysv_put_super(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + if (sb->s_dirt) + sysv_write_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + /* XXX ext2 also updates the state here */ + mark_buffer_dirty(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + mark_buffer_dirty(sbi->s_bh2); + } + + brelse(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + brelse(sbi->s_bh2); + + kfree(sbi); +} + +static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_ndatazones; + buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb); + buf->f_files = sbi->s_ninodes; + buf->f_ffree = sysv_count_free_inodes(sb); + buf->f_namelen = SYSV_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32 + */ +static inline void read3byte(struct sysv_sb_info *sbi, + unsigned char * from, unsigned char * to) +{ + if (sbi->s_bytesex == BYTESEX_PDP) { + to[0] = from[0]; + to[1] = 0; + to[2] = from[1]; + to[3] = from[2]; + } else if (sbi->s_bytesex == BYTESEX_LE) { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + to[3] = 0; + } else { + to[0] = 0; + to[1] = from[0]; + to[2] = from[1]; + to[3] = from[2]; + } +} + +static inline void write3byte(struct sysv_sb_info *sbi, + unsigned char * from, unsigned char * to) +{ + if (sbi->s_bytesex == BYTESEX_PDP) { + to[0] = from[0]; + to[1] = from[2]; + to[2] = from[3]; + } else if (sbi->s_bytesex == BYTESEX_LE) { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + } else { + to[0] = from[1]; + to[1] = from[2]; + to[2] = from[3]; + } +} + +static const struct inode_operations sysv_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = sysv_getattr, +}; + +void sysv_set_inode(struct inode *inode, dev_t rdev) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &sysv_file_inode_operations; + inode->i_fop = &sysv_file_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &sysv_dir_inode_operations; + inode->i_fop = &sysv_dir_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_blocks) { + inode->i_op = &sysv_symlink_inode_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else { + inode->i_op = &sysv_fast_symlink_inode_operations; + nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + sizeof(SYSV_I(inode)->i_data) - 1); + } + } else + 
init_special_inode(inode, inode->i_mode, rdev); +} + +struct inode *sysv_iget(struct super_block *sb, unsigned int ino) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + struct sysv_inode_info * si; + struct inode *inode; + unsigned int block; + + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %d is out of range\n", + sb->s_id, ino); + return ERR_PTR(-EIO); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("Major problem: unable to read inode from dev %s\n", + inode->i_sb->s_id); + goto bad_inode; + } + /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ + inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); + inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); + inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); + inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); + inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); + inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); + inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); + inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime); + inode->i_ctime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_blocks = 0; + + si = SYSV_I(inode); + for (block = 0; block < 10+1+1+1; block++) + read3byte(sbi, &raw_inode->i_data[3*block], + (u8 *)&si->i_data[block]); + brelse(bh); + si->i_dir_start_lookup = 0; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + sysv_set_inode(inode, + old_decode_dev(fs32_to_cpu(sbi, si->i_data[0]))); + else + sysv_set_inode(inode, 0); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static int __sysv_write_inode(struct inode *inode, int wait) +{ + struct super_block * sb = inode->i_sb; + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + struct sysv_inode_info * si; + unsigned int ino, block; + int err = 0; + + ino = inode->i_ino; + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %d is out of range\n", + inode->i_sb->s_id, ino); + return -EIO; + } + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("unable to read i-node block\n"); + return -EIO; + } + + raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); + raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); + raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); + raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); + raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); + raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec); + + si = SYSV_I(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev)); + for (block = 0; block < 10+1+1+1; block++) + write3byte(sbi, (u8 *)&si->i_data[block], + &raw_inode->i_data[3*block]); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing sysv inode [%s:%08x]\n", + sb->s_id, ino); + err = -EIO; + } + } + brelse(bh); + return 0; +} + +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return 
__sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int sysv_sync_inode(struct inode *inode) +{ + return __sysv_write_inode(inode, 1); +} + +static void sysv_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + if (!inode->i_nlink) { + inode->i_size = 0; + sysv_truncate(inode); + } + invalidate_inode_buffers(inode); + end_writeback(inode); + if (!inode->i_nlink) + sysv_free_inode(inode); +} + +static struct kmem_cache *sysv_inode_cachep; + +static struct inode *sysv_alloc_inode(struct super_block *sb) +{ + struct sysv_inode_info *si; + + si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL); + if (!si) + return NULL; + return &si->vfs_inode; +} + +static void sysv_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); +} + +static void sysv_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sysv_i_callback); +} + +static void init_once(void *p) +{ + struct sysv_inode_info *si = (struct sysv_inode_info *)p; + + inode_init_once(&si->vfs_inode); +} + +const struct super_operations sysv_sops = { + .alloc_inode = sysv_alloc_inode, + .destroy_inode = sysv_destroy_inode, + .write_inode = sysv_write_inode, + .evict_inode = sysv_evict_inode, + .put_super = sysv_put_super, + .write_super = sysv_write_super, + .sync_fs = sysv_sync_fs, + .remount_fs = sysv_remount, + .statfs = sysv_statfs, +}; + +int __init sysv_init_icache(void) +{ + sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", + sizeof(struct sysv_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (!sysv_inode_cachep) + return -ENOMEM; + return 0; +} + +void sysv_destroy_icache(void) +{ + kmem_cache_destroy(sysv_inode_cachep); +} diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c new file mode 100644 index 00000000..fa8d43c9 --- /dev/null +++ b/fs/sysv/itree.c @@ -0,0 +1,494 @@ +/* + * linux/fs/sysv/itree.c + * + * Handling of indirect blocks' trees. 
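/*
 * Illustrative aside (not part of the original patch): the read3byte() and
 * write3byte() helpers in sysv/inode.c above widen the 3-byte on-disk block
 * pointers into 4-byte in-core values (and back), padding the missing byte
 * according to the byte order of the variant. A minimal sketch of just the
 * little-endian case is:
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t read3byte_le(const unsigned char *from)
{
	/* bytes 0..2 carry the value, the high byte is implicitly zero */
	return (uint32_t)from[0] | ((uint32_t)from[1] << 8) | ((uint32_t)from[2] << 16);
}

static void write3byte_le(uint32_t val, unsigned char *to)
{
	to[0] = val & 0xff;
	to[1] = (val >> 8) & 0xff;
	to[2] = (val >> 16) & 0xff;	/* anything at or above 2^24 is lost */
}

int main(void)
{
	unsigned char disk[3];

	write3byte_le(0x012345, disk);
	printf("round trip: 0x%06x\n", read3byte_le(disk));
	return 0;
}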
+ * AV, Sep--Dec 2000 + */ + +#include +#include +#include +#include "sysv.h" + +enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ + +static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode) +{ + mark_buffer_dirty_inode(bh, inode); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); +} + +static int block_to_path(struct inode *inode, long block, int offsets[DEPTH]) +{ + struct super_block *sb = inode->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + int ptrs_bits = sbi->s_ind_per_block_bits; + unsigned long indirect_blocks = sbi->s_ind_per_block, + double_blocks = sbi->s_ind_per_block_2; + int n = 0; + + if (block < 0) { + printk("sysv_block_map: block < 0\n"); + } else if (block < DIRECT) { + offsets[n++] = block; + } else if ( (block -= DIRECT) < indirect_blocks) { + offsets[n++] = DIRECT; + offsets[n++] = block; + } else if ((block -= indirect_blocks) < double_blocks) { + offsets[n++] = DIRECT+1; + offsets[n++] = block >> ptrs_bits; + offsets[n++] = block & (indirect_blocks - 1); + } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) { + offsets[n++] = DIRECT+2; + offsets[n++] = block >> (ptrs_bits * 2); + offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1); + offsets[n++] = block & (indirect_blocks - 1); + } else { + /* nothing */; + } + return n; +} + +static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr) +{ + return sbi->s_block_base + fs32_to_cpu(sbi, nr); +} + +typedef struct { + sysv_zone_t *p; + sysv_zone_t key; + struct buffer_head *bh; +} Indirect; + +static DEFINE_RWLOCK(pointers_lock); + +static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +static inline sysv_zone_t *block_end(struct buffer_head *bh) +{ + return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); +} + +/* + * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) + */ +static Indirect *get_branch(struct inode *inode, + int depth, + int offsets[], + Indirect chain[], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + int block = block_to_cpu(SYSV_SB(sb), p->key); + bh = sb_bread(sb, block); + if (!bh) + goto failure; + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +static int alloc_branch(struct inode *inode, + int num, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0; + int i; + + branch[0].key = sysv_new_block(inode->i_sb); + if (branch[0].key) for (n = 1; n < num; n++) { + struct buffer_head *bh; + int parent; + /* Allocate the next block */ + branch[n].key = sysv_new_block(inode->i_sb); + if (!branch[n].key) + break; + /* + * Get buffer_head for parent block, zero it out and set + * the pointer to new one, then send parent to disk. 
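/*
 * Illustrative aside (not part of the original patch): block_to_path() above
 * turns a logical block number into a path of array offsets (direct slot,
 * then single, double or triple indirect) based on how many pointers fit in
 * one block. A standalone version with assumed geometry (10 direct slots,
 * 256 pointers per indirect block) is:
 */
#include <stdio.h>

#define TOY_DIRECT    10
#define TOY_PTRS_BITS 8			/* 256 pointers per indirect block */
#define TOY_PTRS      (1L << TOY_PTRS_BITS)

static int toy_block_to_path(long block, int offsets[4])
{
	int n = 0;

	if (block < 0) {
		return 0;
	} else if (block < TOY_DIRECT) {
		offsets[n++] = (int)block;
	} else if ((block -= TOY_DIRECT) < TOY_PTRS) {
		offsets[n++] = TOY_DIRECT;
		offsets[n++] = (int)block;
	} else if ((block -= TOY_PTRS) < TOY_PTRS * TOY_PTRS) {
		offsets[n++] = TOY_DIRECT + 1;
		offsets[n++] = (int)(block >> TOY_PTRS_BITS);
		offsets[n++] = (int)(block & (TOY_PTRS - 1));
	} else if (((block -= TOY_PTRS * TOY_PTRS) >> (2 * TOY_PTRS_BITS)) < TOY_PTRS) {
		offsets[n++] = TOY_DIRECT + 2;
		offsets[n++] = (int)(block >> (2 * TOY_PTRS_BITS));
		offsets[n++] = (int)((block >> TOY_PTRS_BITS) & (TOY_PTRS - 1));
		offsets[n++] = (int)(block & (TOY_PTRS - 1));
	}
	return n;			/* 0 means out of range */
}

int main(void)
{
	long samples[] = { 3, 10, 300, 70000 };

	for (int i = 0; i < 4; i++) {
		int offsets[4], n = toy_block_to_path(samples[i], offsets);

		printf("block %6ld ->", samples[i]);
		for (int j = 0; j < n; j++)
			printf(" %d", offsets[j]);
		printf("\n");
	}
	return 0;
}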
+ */ + parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); + bh = sb_getblk(inode->i_sb, parent); + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].bh = bh; + branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + set_buffer_uptodate(bh); + unlock_buffer(bh); + dirty_indirect(bh, inode); + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + sysv_free_block(inode->i_sb, branch[i].key); + return -ENOSPC; +} + +static inline int splice_branch(struct inode *inode, + Indirect chain[], + Indirect *where, + int num) +{ + int i; + + /* Verify that place we are splicing to is still there and vacant */ + write_lock(&pointers_lock); + if (!verify_chain(chain, where-1) || *where->p) + goto changed; + *where->p = where->key; + write_unlock(&pointers_lock); + + inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) + dirty_indirect(where->bh, inode); + + if (IS_SYNC(inode)) + sysv_sync_inode(inode); + else + mark_inode_dirty(inode); + return 0; + +changed: + write_unlock(&pointers_lock); + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + sysv_free_block(inode->i_sb, where[i].key); + return -EAGAIN; +} + +static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + struct super_block *sb = inode->i_sb; + Indirect *partial; + int left; + int depth = block_to_path(inode, iblock, offsets); + + if (depth == 0) + goto out; + +reread: + read_lock(&pointers_lock); + partial = get_branch(inode, depth, offsets, chain, &err); + read_unlock(&pointers_lock); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb), + chain[depth-1].key)); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + left = (chain + depth) - partial; + err = alloc_branch(inode, left, offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (splice_branch(inode, chain, partial, left) < 0) + goto changed; + + set_buffer_new(bh_result); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +static Indirect *find_shared(struct inode *inode, + int depth, + int offsets[], + Indirect chain[], + sysv_zone_t *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + + write_lock(&pointers_lock); + partial = get_branch(inode, k, offsets, chain, &err); + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. 
+ */ + if (!partial->key && *partial->p) { + write_unlock(&pointers_lock); + goto no_top; + } + for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&pointers_lock); + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q) +{ + for ( ; p < q ; p++) { + sysv_zone_t nr = *p; + if (nr) { + *p = 0; + sysv_free_block(inode->i_sb, nr); + mark_inode_dirty(inode); + } + } +} + +static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth) +{ + struct buffer_head * bh; + struct super_block *sb = inode->i_sb; + + if (depth--) { + for ( ; p < q ; p++) { + int block; + sysv_zone_t nr = *p; + if (!nr) + continue; + *p = 0; + block = block_to_cpu(SYSV_SB(sb), nr); + bh = sb_bread(sb, block); + if (!bh) + continue; + free_branches(inode, (sysv_zone_t*)bh->b_data, + block_end(bh), depth); + bforget(bh); + sysv_free_block(sb, nr); + mark_inode_dirty(inode); + } + } else + free_data(inode, p, q); +} + +void sysv_truncate (struct inode * inode) +{ + sysv_zone_t *i_data = SYSV_I(inode)->i_data; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + sysv_zone_t nr = 0; + int n; + long iblock; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + + blocksize = inode->i_sb->s_blocksize; + iblock = (inode->i_size + blocksize-1) + >> inode->i_sb->s_blocksize_bits; + + block_truncate_page(inode->i_mapping, inode->i_size, get_block); + + n = block_to_path(inode, iblock, offsets); + if (n == 0) + return; + + if (n == 1) { + free_data(inode, i_data+offsets[0], i_data + DIRECT); + goto do_indirects; + } + + partial = find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + dirty_indirect(partial->bh, inode); + free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + free_branches(inode, partial->p + 1, block_end(partial->bh), + (chain+n-1) - partial); + dirty_indirect(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees (== subtrees deeper than...) 
*/ + while (n < DEPTH) { + nr = i_data[DIRECT + n - 1]; + if (nr) { + i_data[DIRECT + n - 1] = 0; + mark_inode_dirty(inode); + free_branches(inode, &nr, &nr+1, n); + } + n++; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) + sysv_sync_inode (inode); + else + mark_inode_dirty(inode); +} + +static unsigned sysv_nblocks(struct super_block *s, loff_t size) +{ + struct sysv_sb_info *sbi = SYSV_SB(s); + int ptrs_bits = sbi->s_ind_per_block_bits; + unsigned blocks, res, direct = DIRECT, i = DEPTH; + blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits; + res = blocks; + while (--i && blocks > direct) { + blocks = ((blocks - direct - 1) >> ptrs_bits) + 1; + res += blocks; + direct = 1; + } + return blocks; +} + +int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct super_block *s = mnt->mnt_sb; + generic_fillattr(dentry->d_inode, stat); + stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); + stat->blksize = s->s_blocksize; + return 0; +} + +static int sysv_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,get_block,wbc); +} + +static int sysv_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,get_block); +} + +int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, get_block); +} + +static int sysv_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t sysv_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,get_block); +} + +const struct address_space_operations sysv_aops = { + .readpage = sysv_readpage, + .writepage = sysv_writepage, + .write_begin = sysv_write_begin, + .write_end = generic_write_end, + .bmap = sysv_bmap +}; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c new file mode 100644 index 00000000..e474fbcf --- /dev/null +++ b/fs/sysv/namei.c @@ -0,0 +1,301 @@ +/* + * linux/fs/sysv/namei.c + * + * minix/namei.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/namei.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/namei.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + */ + +#include +#include "sysv.h" + +static int add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = sysv_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static int sysv_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + /* Truncate the name in place, avoids having to define a compare + function. 
*/ + if (qstr->len > SYSV_NAMELEN) { + qstr->len = SYSV_NAMELEN; + qstr->hash = full_name_hash(qstr->name, qstr->len); + } + return 0; +} + +const struct dentry_operations sysv_dentry_operations = { + .d_hash = sysv_hash, +}; + +static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > SYSV_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + ino = sysv_inode_by_name(dentry); + + if (ino) { + inode = sysv_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_t rdev) +{ + struct inode * inode; + int err; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = sysv_new_inode(dir, mode); + err = PTR_ERR(inode); + + if (!IS_ERR(inode)) { + sysv_set_inode(inode, rdev); + mark_inode_dirty(inode); + err = add_nondir(dentry, inode); + } + return err; +} + +static int sysv_create(struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +{ + return sysv_mknod(dir, dentry, mode, 0); +} + +static int sysv_symlink(struct inode * dir, struct dentry * dentry, + const char * symname) +{ + int err = -ENAMETOOLONG; + int l = strlen(symname)+1; + struct inode * inode; + + if (l > dir->i_sb->s_blocksize) + goto out; + + inode = sysv_new_inode(dir, S_IFLNK|0777); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + sysv_set_inode(inode, 0); + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + mark_inode_dirty(inode); + err = add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int sysv_link(struct dentry * old_dentry, struct inode * dir, + struct dentry * dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= SYSV_SB(inode->i_sb)->s_link_max) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + return add_nondir(dentry, inode); +} + +static int sysv_mkdir(struct inode * dir, struct dentry *dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= SYSV_SB(dir->i_sb)->s_link_max) + goto out; + inode_inc_link_count(dir); + + inode = sysv_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + sysv_set_inode(inode, 0); + + inode_inc_link_count(inode); + + err = sysv_make_empty(inode, dir); + if (err) + goto out_fail; + + err = sysv_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int sysv_unlink(struct inode * dir, struct dentry * dentry) +{ + struct inode * inode = dentry->d_inode; + struct page * page; + struct sysv_dir_entry * de; + int err = -ENOENT; + + de = sysv_find_entry(dentry, &page); + if (!de) + goto out; + + err = sysv_delete_entry (de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); +out: + return err; +} + +static int sysv_rmdir(struct inode * dir, struct dentry * dentry) +{ + struct inode *inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (sysv_empty_dir(inode)) { + err = sysv_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; 
+} + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry) +{ + struct inode * old_inode = old_dentry->d_inode; + struct inode * new_inode = new_dentry->d_inode; + struct page * dir_page = NULL; + struct sysv_dir_entry * dir_de = NULL; + struct page * old_page; + struct sysv_dir_entry * old_de; + int err = -ENOENT; + + old_de = sysv_find_entry(old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = sysv_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page * new_page; + struct sysv_dir_entry * new_de; + + err = -ENOTEMPTY; + if (dir_de && !sysv_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = sysv_find_entry(new_dentry, &new_page); + if (!new_de) + goto out_dir; + sysv_set_link(new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) + goto out_dir; + } + err = sysv_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + sysv_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + sysv_set_link(dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations sysv_dir_inode_operations = { + .create = sysv_create, + .lookup = sysv_lookup, + .link = sysv_link, + .unlink = sysv_unlink, + .symlink = sysv_symlink, + .mkdir = sysv_mkdir, + .rmdir = sysv_rmdir, + .mknod = sysv_mknod, + .rename = sysv_rename, + .getattr = sysv_getattr, +}; diff --git a/fs/sysv/super.c b/fs/sysv/super.c new file mode 100644 index 00000000..f60c1969 --- /dev/null +++ b/fs/sysv/super.c @@ -0,0 +1,590 @@ +/* + * linux/fs/sysv/inode.c + * + * minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * xenix/inode.c + * Copyright (C) 1992 Doug Evans + * + * coh/inode.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/inode.c + * Copyright (C) 1993 Paul B. Monday + * + * sysv/inode.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + * + * This file contains code for read/parsing the superblock. + */ + +#include +#include +#include +#include +#include "sysv.h" + +/* + * The following functions try to recognize specific filesystems. + * + * We recognize: + * - Xenix FS by its magic number. + * - SystemV FS by its magic number. + * - Coherent FS by its funny fname/fpack field. + * - SCO AFS by s_nfree == 0xffff + * - V7 FS has no distinguishing features. + * + * We discriminate among SystemV4 and SystemV2 FS by the assumption that + * the time stamp is not < 01-01-1980. 
+ */ + +enum { + JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 +}; + +static void detected_xenix(struct sysv_sb_info *sbi) +{ + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + struct xenix_super_block * sbd1; + struct xenix_super_block * sbd2; + + if (bh1 != bh2) + sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; + else { + /* block size = 512, so bh1 != bh2 */ + sbd1 = (struct xenix_super_block *) bh1->b_data; + sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); + } + + sbi->s_link_max = XENIX_LINK_MAX; + sbi->s_fic_size = XENIX_NICINOD; + sbi->s_flc_size = XENIX_NICFREE; + sbi->s_sbd1 = (char *)sbd1; + sbi->s_sbd2 = (char *)sbd2; + sbi->s_sb_fic_count = &sbd1->s_ninode; + sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd2->s_tinode; + sbi->s_bcache_count = &sbd1->s_nfree; + sbi->s_bcache = &sbd1->s_free[0]; + sbi->s_free_blocks = &sbd2->s_tfree; + sbi->s_sb_time = &sbd2->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); +} + +static void detected_sysv4(struct sysv_sb_info *sbi) +{ + struct sysv4_super_block * sbd; + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + + if (bh1 == bh2) + sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); + else + sbd = (struct sysv4_super_block *) bh2->b_data; + + sbi->s_link_max = SYSV_LINK_MAX; + sbi->s_fic_size = SYSV_NICINOD; + sbi->s_flc_size = SYSV_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_sb_state = &sbd->s_state; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_sysv2(struct sysv_sb_info *sbi) +{ + struct sysv2_super_block *sbd; + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + + if (bh1 == bh2) + sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); + else + sbd = (struct sysv2_super_block *) bh2->b_data; + + sbi->s_link_max = SYSV_LINK_MAX; + sbi->s_fic_size = SYSV_NICINOD; + sbi->s_flc_size = SYSV_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_sb_state = &sbd->s_state; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_coherent(struct sysv_sb_info *sbi) +{ + struct coh_super_block * sbd; + struct buffer_head *bh1 = sbi->s_bh1; + + sbd = (struct coh_super_block *) bh1->b_data; + + sbi->s_link_max = COH_LINK_MAX; + sbi->s_fic_size = COH_NICINOD; + sbi->s_flc_size = COH_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + 
sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_v7(struct sysv_sb_info *sbi) +{ + struct buffer_head *bh2 = sbi->s_bh2; + struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; + + sbi->s_link_max = V7_LINK_MAX; + sbi->s_fic_size = V7_NICINOD; + sbi->s_flc_size = V7_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; + if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) + sbi->s_bytesex = BYTESEX_LE; + else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) + sbi->s_bytesex = BYTESEX_BE; + else + return 0; + switch (fs32_to_cpu(sbi, sbd->s_type)) { + case 1: + sbi->s_type = FSTYPE_XENIX; + return 1; + case 2: + sbi->s_type = FSTYPE_XENIX; + return 2; + default: + return 0; + } +} + +static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct super_block *sb = sbi->s_sb; + /* All relevant fields are at the same offsets in R2 and R4 */ + struct sysv4_super_block * sbd; + u32 type; + + sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); + if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) + sbi->s_bytesex = BYTESEX_LE; + else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) + sbi->s_bytesex = BYTESEX_BE; + else + return 0; + + type = fs32_to_cpu(sbi, sbd->s_type); + + if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { + sbi->s_type = FSTYPE_AFS; + sbi->s_forced_ro = 1; + if (!(sb->s_flags & MS_RDONLY)) { + printk("SysV FS: SCO EAFS on %s detected, " + "forcing read-only mode.\n", + sb->s_id); + } + return type; + } + + if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { + /* this is likely to happen on SystemV2 FS */ + if (type > 3 || type < 1) + return 0; + sbi->s_type = FSTYPE_SYSV2; + return type; + } + if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) + return 0; + + /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, + 0x20 or 0x30 indicates that symbolic links and the 14-character + filename limit is gone. Due to lack of information about this + feature read-only mode seems to be a reasonable approach... -KGB */ + + if (type >= 0x10) { + printk("SysV FS: can't handle long file names on %s, " + "forcing read-only mode.\n", sb->s_id); + sbi->s_forced_ro = 1; + } + + sbi->s_type = FSTYPE_SYSV4; + return type >= 0x10 ? type >> 4 : type; +} + +static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct coh_super_block * sbd; + + sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); + if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) + || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) + return 0; + sbi->s_bytesex = BYTESEX_PDP; + sbi->s_type = FSTYPE_COH; + return 1; +} + +static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + int size = detect_sysv(sbi, bh); + + return size>2 ? 
0 : size; +} + +static struct { + int block; + int (*test)(struct sysv_sb_info *, struct buffer_head *); +} flavours[] = { + {1, detect_xenix}, + {0, detect_sysv}, + {0, detect_coherent}, + {9, detect_sysv_odd}, + {15,detect_sysv_odd}, + {18,detect_sysv}, +}; + +static char *flavour_names[] = { + [FSTYPE_XENIX] = "Xenix", + [FSTYPE_SYSV4] = "SystemV", + [FSTYPE_SYSV2] = "SystemV Release 2", + [FSTYPE_COH] = "Coherent", + [FSTYPE_V7] = "V7", + [FSTYPE_AFS] = "AFS", +}; + +static void (*flavour_setup[])(struct sysv_sb_info *) = { + [FSTYPE_XENIX] = detected_xenix, + [FSTYPE_SYSV4] = detected_sysv4, + [FSTYPE_SYSV2] = detected_sysv2, + [FSTYPE_COH] = detected_coherent, + [FSTYPE_V7] = detected_v7, + [FSTYPE_AFS] = detected_sysv4, +}; + +static int complete_read_super(struct super_block *sb, int silent, int size) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct inode *root_inode; + char *found = flavour_names[sbi->s_type]; + u_char n_bits = size+8; + int bsize = 1 << n_bits; + int bsize_4 = bsize >> 2; + + sbi->s_firstinodezone = 2; + + flavour_setup[sbi->s_type](sbi); + + sbi->s_truncate = 1; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; + sbi->s_inodes_per_block = bsize >> 6; + sbi->s_inodes_per_block_1 = (bsize >> 6)-1; + sbi->s_inodes_per_block_bits = n_bits-6; + sbi->s_ind_per_block = bsize_4; + sbi->s_ind_per_block_2 = bsize_4*bsize_4; + sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); + sbi->s_ind_per_block_bits = n_bits-2; + + sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) + << sbi->s_inodes_per_block_bits; + + if (!silent) + printk("VFS: Found a %s FS (block size = %ld) on device %s\n", + found, sb->s_blocksize, sb->s_id); + + sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; + /* set up enough so that it can read an inode */ + sb->s_op = &sysv_sops; + if (sbi->s_forced_ro) + sb->s_flags |= MS_RDONLY; + if (sbi->s_truncate) + sb->s_d_op = &sysv_dentry_operations; + root_inode = sysv_iget(sb, SYSV_ROOT_INO); + if (IS_ERR(root_inode)) { + printk("SysV FS: get root inode failed\n"); + return 0; + } + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + iput(root_inode); + printk("SysV FS: get root dentry failed\n"); + return 0; + } + return 1; +} + +static int sysv_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh1, *bh = NULL; + struct sysv_sb_info *sbi; + unsigned long blocknr; + int size = 0, i; + + BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); + BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); + BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); + BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); + BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); + + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_sb = sb; + sbi->s_block_base = 0; + sb->s_fs_info = sbi; + + sb_set_blocksize(sb, BLOCK_SIZE); + + for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { + brelse(bh); + bh = sb_bread(sb, flavours[i].block); + if (!bh) + continue; + size = flavours[i].test(SYSV_SB(sb), bh); + } + + if (!size) + goto Eunknown; + + switch (size) { + case 1: + blocknr = bh->b_blocknr << 1; + brelse(bh); + sb_set_blocksize(sb, 512); + bh1 = sb_bread(sb, blocknr); + bh = sb_bread(sb, blocknr + 1); + break; + case 2: + bh1 = bh; + break; + case 3: + blocknr = bh->b_blocknr >> 1; + brelse(bh); + sb_set_blocksize(sb, 2048); + bh1 = bh = sb_bread(sb, blocknr); + break; + default: + goto Ebadsize; + } + + if (bh && bh1) { + sbi->s_bh1 = bh1; + 
sbi->s_bh2 = bh; + if (complete_read_super(sb, silent, size)) + return 0; + } + + brelse(bh1); + brelse(bh); + sb_set_blocksize(sb, BLOCK_SIZE); + printk("oldfs: cannot read superblock\n"); +failed: + kfree(sbi); + return -EINVAL; + +Eunknown: + brelse(bh); + if (!silent) + printk("VFS: unable to find oldfs superblock on device %s\n", + sb->s_id); + goto failed; +Ebadsize: + brelse(bh); + if (!silent) + printk("VFS: oldfs: unsupported block size (%dKb)\n", + 1<<(size-2)); + goto failed; +} + +static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) +{ + struct v7_super_block *v7sb; + struct sysv_inode *v7i; + struct buffer_head *bh2; + struct sysv_sb_info *sbi; + + sbi = sb->s_fs_info; + + /* plausibility check on superblock */ + v7sb = (struct v7_super_block *) bh->b_data; + if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || + fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || + fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) + return 0; + + /* plausibility check on root inode: it is a directory, + with a nonzero size that is a multiple of 16 */ + bh2 = sb_bread(sb, 2); + if (bh2 == NULL) + return 0; + + v7i = (struct sysv_inode *)(bh2->b_data + 64); + if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || + (fs32_to_cpu(sbi, v7i->i_size) == 0) || + (fs32_to_cpu(sbi, v7i->i_size) & 017) || + (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * + sizeof(struct sysv_dir_entry))) { + brelse(bh2); + return 0; + } + + brelse(bh2); + return 1; +} + +static int v7_fill_super(struct super_block *sb, void *data, int silent) +{ + struct sysv_sb_info *sbi; + struct buffer_head *bh; + + if (440 != sizeof (struct v7_super_block)) + panic("V7 FS: bad super-block size"); + if (64 != sizeof (struct sysv_inode)) + panic("sysv fs: bad i-node size"); + + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_sb = sb; + sbi->s_block_base = 0; + sbi->s_type = FSTYPE_V7; + sb->s_fs_info = sbi; + + sb_set_blocksize(sb, 512); + + if ((bh = sb_bread(sb, 1)) == NULL) { + if (!silent) + printk("VFS: unable to read V7 FS superblock on " + "device %s.\n", sb->s_id); + goto failed; + } + + /* Try PDP-11 UNIX */ + sbi->s_bytesex = BYTESEX_PDP; + if (v7_sanity_check(sb, bh)) + goto detected; + + /* Try PC/IX, v7/x86 */ + sbi->s_bytesex = BYTESEX_LE; + if (v7_sanity_check(sb, bh)) + goto detected; + + goto failed; + +detected: + sbi->s_bh1 = bh; + sbi->s_bh2 = bh; + if (complete_read_super(sb, silent, 1)) + return 0; + +failed: + printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", + sb->s_id); + brelse(bh); + kfree(sbi); + return -EINVAL; +} + +/* Every kernel module contains stuff like this. 
*/ + +static struct dentry *sysv_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); +} + +static struct dentry *v7_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); +} + +static struct file_system_type sysv_fs_type = { + .owner = THIS_MODULE, + .name = "sysv", + .mount = sysv_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct file_system_type v7_fs_type = { + .owner = THIS_MODULE, + .name = "v7", + .mount = v7_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_sysv_fs(void) +{ + int error; + + error = sysv_init_icache(); + if (error) + goto out; + error = register_filesystem(&sysv_fs_type); + if (error) + goto destroy_icache; + error = register_filesystem(&v7_fs_type); + if (error) + goto unregister; + return 0; + +unregister: + unregister_filesystem(&sysv_fs_type); +destroy_icache: + sysv_destroy_icache(); +out: + return error; +} + +static void __exit exit_sysv_fs(void) +{ + unregister_filesystem(&sysv_fs_type); + unregister_filesystem(&v7_fs_type); + sysv_destroy_icache(); +} + +module_init(init_sysv_fs) +module_exit(exit_sysv_fs) +MODULE_ALIAS("v7"); +MODULE_LICENSE("GPL"); diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c new file mode 100644 index 00000000..00d2f8a4 --- /dev/null +++ b/fs/sysv/symlink.c @@ -0,0 +1,20 @@ +/* + * linux/fs/sysv/symlink.c + * + * Handling of System V filesystem fast symlinks extensions. + * Aug 2001, Christoph Hellwig (hch@infradead.org) + */ + +#include "sysv.h" +#include + +static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data); + return NULL; +} + +const struct inode_operations sysv_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = sysv_follow_link, +}; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h new file mode 100644 index 00000000..bb55cdb3 --- /dev/null +++ b/fs/sysv/sysv.h @@ -0,0 +1,248 @@ +#ifndef _SYSV_H +#define _SYSV_H + +#include + +typedef __u16 __bitwise __fs16; +typedef __u32 __bitwise __fs32; + +#include + +/* + * SystemV/V7/Coherent super-block data in memory + * + * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified + * while the system is running). This is in contrast to the Minix and Berkeley + * filesystems (where the superblock is never modified). This affects the + * sync() operation: we must keep the superblock in a disk buffer and use this + * one as our "working copy". 
+ */ + +struct sysv_sb_info { + struct super_block *s_sb; /* VFS superblock */ + int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */ + char s_bytesex; /* bytesex (le/be/pdp) */ + char s_truncate; /* if 1: names > SYSV_NAMELEN chars are truncated */ + /* if 0: they are disallowed (ENAMETOOLONG) */ + nlink_t s_link_max; /* max number of hard links to a file */ + unsigned int s_inodes_per_block; /* number of inodes per block */ + unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */ + unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */ + unsigned int s_ind_per_block; /* number of indirections per block */ + unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */ + unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */ + unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */ + unsigned int s_block_base; /* physical block number of block 0 */ + unsigned short s_fic_size; /* free inode cache size, NICINOD */ + unsigned short s_flc_size; /* free block list chunk size, NICFREE */ + /* The superblock is kept in one or two disk buffers: */ + struct buffer_head *s_bh1; + struct buffer_head *s_bh2; + /* These are pointers into the disk buffer, to compensate for + different superblock layout. */ + char * s_sbd1; /* entire superblock data, for part 1 */ + char * s_sbd2; /* entire superblock data, for part 2 */ + __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */ + sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */ + __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */ + __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */ + sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */ + __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */ + __fs32 *s_sb_time; /* pointer to s_sbd->s_time */ + __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */ + /* We keep those superblock entities that don't change here; + this saves us an indirection and perhaps a conversion. */ + u32 s_firstinodezone; /* index of first inode zone */ + u32 s_firstdatazone; /* same as s_sbd->s_isize */ + u32 s_ninodes; /* total number of inodes */ + u32 s_ndatazones; /* total number of data zones */ + u32 s_nzones; /* same as s_sbd->s_fsize */ + u16 s_namelen; /* max length of dir entry */ + int s_forced_ro; +}; + +/* + * SystemV/V7/Coherent FS inode data in memory + */ +struct sysv_inode_info { + __fs32 i_data[13]; + u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + + +static inline struct sysv_inode_info *SYSV_I(struct inode *inode) +{ + return list_entry(inode, struct sysv_inode_info, vfs_inode); +} + +static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +/* identify the FS in memory */ +enum { + FSTYPE_NONE = 0, + FSTYPE_XENIX, + FSTYPE_SYSV4, + FSTYPE_SYSV2, + FSTYPE_COH, + FSTYPE_V7, + FSTYPE_AFS, + FSTYPE_END, +}; + +#define SYSV_MAGIC_BASE 0x012FF7B3 + +#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX) +#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4) +#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2) +#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH) + + +/* Admissible values for i_nlink: 0.._LINK_MAX */ +enum { + XENIX_LINK_MAX = 126, /* ?? */ + SYSV_LINK_MAX = 126, /* 127? 251? */ + V7_LINK_MAX = 126, /* ?? 
*/ + COH_LINK_MAX = 10000, +}; + + +static inline void dirty_sb(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + mark_buffer_dirty(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + mark_buffer_dirty(sbi->s_bh2); + sb->s_dirt = 1; +} + + +/* ialloc.c */ +extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, + struct buffer_head **); +extern struct inode * sysv_new_inode(const struct inode *, mode_t); +extern void sysv_free_inode(struct inode *); +extern unsigned long sysv_count_free_inodes(struct super_block *); + +/* balloc.c */ +extern sysv_zone_t sysv_new_block(struct super_block *); +extern void sysv_free_block(struct super_block *, sysv_zone_t); +extern unsigned long sysv_count_free_blocks(struct super_block *); + +/* itree.c */ +extern void sysv_truncate(struct inode *); +extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +/* inode.c */ +extern struct inode *sysv_iget(struct super_block *, unsigned int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); +extern int sysv_sync_inode(struct inode *); +extern void sysv_set_inode(struct inode *, dev_t); +extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int sysv_init_icache(void); +extern void sysv_destroy_icache(void); + + +/* dir.c */ +extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); +extern int sysv_add_link(struct dentry *, struct inode *); +extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); +extern int sysv_make_empty(struct inode *, struct inode *); +extern int sysv_empty_dir(struct inode *); +extern void sysv_set_link(struct sysv_dir_entry *, struct page *, + struct inode *); +extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); +extern ino_t sysv_inode_by_name(struct dentry *); + + +extern const struct inode_operations sysv_file_inode_operations; +extern const struct inode_operations sysv_dir_inode_operations; +extern const struct inode_operations sysv_fast_symlink_inode_operations; +extern const struct file_operations sysv_file_operations; +extern const struct file_operations sysv_dir_operations; +extern const struct address_space_operations sysv_aops; +extern const struct super_operations sysv_sops; +extern const struct dentry_operations sysv_dentry_operations; + + +enum { + BYTESEX_LE, + BYTESEX_PDP, + BYTESEX_BE, +}; + +static inline u32 PDP_swab(u32 x) +{ +#ifdef __LITTLE_ENDIAN + return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16); +#else +#ifdef __BIG_ENDIAN + return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8); +#else +#error BYTESEX +#endif +#endif +} + +static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + return PDP_swab((__force __u32)n); + else if (sbi->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + return (__force __fs32)PDP_swab(n); + else if (sbi->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); + else if (sbi->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, d); + else + be32_add_cpu((__be32 *)n, d); + return *n; +} + +static inline __u16 
fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n) +{ + if (sbi->s_bytesex != BYTESEX_BE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) +{ + if (sbi->s_bytesex != BYTESEX_BE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) +{ + if (sbi->s_bytesex != BYTESEX_BE) + le16_add_cpu((__le16 *)n, d); + else + be16_add_cpu((__be16 *)n, d); + return *n; +} + +#endif /* _SYSV_H */ diff --git a/fs/timerfd.c b/fs/timerfd.c new file mode 100644 index 00000000..dffeb379 --- /dev/null +++ b/fs/timerfd.c @@ -0,0 +1,369 @@ +/* + * fs/timerfd.c + * + * Copyright (C) 2007 Davide Libenzi + * + * + * Thanks to Thomas Gleixner for code reviews and useful comments. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct timerfd_ctx { + struct hrtimer tmr; + ktime_t tintv; + ktime_t moffs; + wait_queue_head_t wqh; + u64 ticks; + int expired; + int clockid; + struct rcu_head rcu; + struct list_head clist; + bool might_cancel; +}; + +static LIST_HEAD(cancel_list); +static DEFINE_SPINLOCK(cancel_lock); + +/* + * This gets called when the timer event triggers. We set the "expired" + * flag, but we do not re-arm the timer (in case it's necessary, + * tintv.tv64 != 0) until the timer is accessed. + */ +static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +{ + struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr); + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + ctx->expired = 1; + ctx->ticks++; + wake_up_locked(&ctx->wqh); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return HRTIMER_NORESTART; +} + +/* + * Called when the clock was set to cancel the timers in the cancel + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). 
+ */ +void timerfd_clock_was_set(void) +{ + ktime_t moffs = ktime_get_monotonic_offset(); + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs.tv64 != moffs.tv64) { + ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; + wake_up_locked(&ctx->wqh); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); +} + +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } +} + +static bool timerfd_canceled(struct timerfd_ctx *ctx) +{ + if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + return false; + ctx->moffs = ktime_get_monotonic_offset(); + return true; +} + +static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) +{ + if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && + (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } else if (ctx->might_cancel) { + timerfd_remove_cancel(ctx); + } +} + +static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) +{ + ktime_t remaining; + + remaining = hrtimer_expires_remaining(&ctx->tmr); + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; +} + +static int timerfd_setup(struct timerfd_ctx *ctx, int flags, + const struct itimerspec *ktmr) +{ + enum hrtimer_mode htmode; + ktime_t texp; + int clockid = ctx->clockid; + + htmode = (flags & TFD_TIMER_ABSTIME) ? + HRTIMER_MODE_ABS: HRTIMER_MODE_REL; + + texp = timespec_to_ktime(ktmr->it_value); + ctx->expired = 0; + ctx->ticks = 0; + ctx->tintv = timespec_to_ktime(ktmr->it_interval); + hrtimer_init(&ctx->tmr, clockid, htmode); + hrtimer_set_expires(&ctx->tmr, texp); + ctx->tmr.function = timerfd_tmrproc; + if (texp.tv64 != 0) { + hrtimer_start(&ctx->tmr, texp, htmode); + if (timerfd_canceled(ctx)) + return -ECANCELED; + } + return 0; +} + +static int timerfd_release(struct inode *inode, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + + timerfd_remove_cancel(ctx); + hrtimer_cancel(&ctx->tmr); + kfree_rcu(ctx, rcu); + return 0; +} + +static unsigned int timerfd_poll(struct file *file, poll_table *wait) +{ + struct timerfd_ctx *ctx = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &ctx->wqh, wait); + + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= POLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return events; +} + +static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct timerfd_ctx *ctx = file->private_data; + ssize_t res; + u64 ticks = 0; + + if (count < sizeof(ticks)) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); + if (file->f_flags & O_NONBLOCK) + res = -EAGAIN; + else + res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); + + /* + * If clock has changed, we do not care about the + * ticks and we do not rearm the timer. Userspace must + * reevaluate anyway. + */ + if (timerfd_canceled(ctx)) { + ctx->ticks = 0; + ctx->expired = 0; + res = -ECANCELED; + } + + if (ctx->ticks) { + ticks = ctx->ticks; + + if (ctx->expired && ctx->tintv.tv64) { + /* + * If tintv.tv64 != 0, this is a periodic timer that + * needs to be re-armed. 
We avoid doing it in the timer + * callback to avoid DoS attacks specifying a very + * short timer period. + */ + ticks += hrtimer_forward_now(&ctx->tmr, + ctx->tintv) - 1; + hrtimer_restart(&ctx->tmr); + } + ctx->expired = 0; + ctx->ticks = 0; + } + spin_unlock_irq(&ctx->wqh.lock); + if (ticks) + res = put_user(ticks, (u64 __user *) buf) ? -EFAULT: sizeof(ticks); + return res; +} + +static const struct file_operations timerfd_fops = { + .release = timerfd_release, + .poll = timerfd_poll, + .read = timerfd_read, + .llseek = noop_llseek, +}; + +static struct file *timerfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &timerfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} + +SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) +{ + int ufd; + struct timerfd_ctx *ctx; + + /* Check the TFD_* constants for consistency. */ + BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); + + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + init_waitqueue_head(&ctx->wqh); + ctx->clockid = clockid; + hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); + + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + if (ufd < 0) + kfree(ctx); + + return ufd; +} + +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec ktmr, kotmr; + int ret; + + if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) + return -EFAULT; + + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || + !timespec_valid(&ktmr.it_interval)) + return -EINVAL; + + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + timerfd_setup_cancel(ctx, flags); + + /* + * We need to stop the existing timer before reprogramming + * it to the new values. + */ + for (;;) { + spin_lock_irq(&ctx->wqh.lock); + if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) + break; + spin_unlock_irq(&ctx->wqh.lock); + cpu_relax(); + } + + /* + * If the timer is expired and it's periodic, we need to advance it + * because the caller may want to know the previous expiration time. + * We do not update "ticks" and "expired" since the timer will be + * re-programmed again in the following timerfd_setup() call. + */ + if (ctx->expired && ctx->tintv.tv64) + hrtimer_forward_now(&ctx->tmr, ctx->tintv); + + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + + /* + * Re-program the timer to the new value ... 
+ */ + ret = timerfd_setup(ctx, flags, &ktmr); + + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec kotmr; + + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + spin_lock_irq(&ctx->wqh.lock); + if (ctx->expired && ctx->tintv.tv64) { + ctx->expired = 0; + ctx->ticks += + hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; + hrtimer_restart(&ctx->tmr); + } + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + + return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; +} + diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig new file mode 100644 index 00000000..f8b0160d --- /dev/null +++ b/fs/ubifs/Kconfig @@ -0,0 +1,60 @@ +config UBIFS_FS + tristate "UBIFS file system support" + select CRC16 + select CRC32 + select CRYPTO if UBIFS_FS_ADVANCED_COMPR + select CRYPTO if UBIFS_FS_LZO + select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO_LZO if UBIFS_FS_LZO + select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + depends on MTD_UBI + help + UBIFS is a file system for flash devices which works on top of UBI. + +config UBIFS_FS_XATTR + bool "Extended attributes support" + depends on UBIFS_FS + help + This option enables support of extended attributes. + +config UBIFS_FS_ADVANCED_COMPR + bool "Advanced compression options" + depends on UBIFS_FS + help + This option allows to explicitly choose which compressions, if any, + are enabled in UBIFS. Removing compressors means inability to read + existing file systems. + + If unsure, say 'N'. + +config UBIFS_FS_LZO + bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + LZO compressor is generally faster than zlib but compresses worse. + Say 'Y' if unsure. + +config UBIFS_FS_ZLIB + bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. + +# Debugging-related stuff +config UBIFS_FS_DEBUG + bool "Enable debugging support" + depends on UBIFS_FS + select DEBUG_FS + select KALLSYMS + help + This option enables UBIFS debugging support. It makes sure various + assertions, self-checks, debugging messages and test modes are compiled + in (this all is compiled out otherwise). Assertions are light-weight + and this option also enables them. Self-checks, debugging messages and + test modes are switched off by default. Thus, it is safe and actually + recommended to have debugging support enabled, and it should not slow + down UBIFS. You can then further enable / disable individual debugging + features using UBIFS module parameters and the corresponding sysfs + interfaces. 
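
Not part of the patch itself, but for context: a minimal userspace consumer of the timerfd interface added in fs/timerfd.c above. This is a sketch that assumes the standard glibc wrappers from <sys/timerfd.h>; it illustrates the semantics implemented by timerfd_read() in the patch, namely that a read() on the descriptor returns a single u64 count of timer expirations since the previous read.

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/timerfd.h>

int main(void)
{
	/* Arm a periodic timer: first expiry after 1s, then every 1s. */
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1 },
		.it_interval = { .tv_sec = 1 },
	};
	uint64_t ticks;
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
		perror("timerfd");
		return 1;
	}
	for (int i = 0; i < 3; i++) {
		/* Blocking read(): the kernel hands back the number of
		 * expirations accumulated since the last read, as one u64. */
		if (read(fd, &ticks, sizeof(ticks)) == sizeof(ticks))
			printf("timer expired %llu time(s)\n",
			       (unsigned long long)ticks);
	}
	close(fd);
	return 0;
}

If several intervals elapse between reads, one read() returns the accumulated count rather than queuing an event per expiration; that matches the hrtimer_forward_now() arithmetic in timerfd_read() above.
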
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile new file mode 100644 index 00000000..80e93c35 --- /dev/null +++ b/fs/ubifs/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_UBIFS_FS) += ubifs.o + +ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o +ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o +ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o + +ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o +ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c new file mode 100644 index 00000000..315de66e --- /dev/null +++ b/fs/ubifs/budget.c @@ -0,0 +1,732 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the budgeting sub-system which is responsible for UBIFS + * space management. + * + * Factors such as compression, wasted space at the ends of LEBs, space in other + * journal heads, the effect of updates on the index, and so on, make it + * impossible to accurately predict the amount of space needed. Consequently + * approximations are used. + */ + +#include "ubifs.h" +#include +#include + +/* + * When pessimistic budget calculations say that there is no enough space, + * UBIFS starts writing back dirty inodes and pages, doing garbage collection, + * or committing. The below constant defines maximum number of times UBIFS + * repeats the operations. + */ +#define MAX_MKSPC_RETRIES 3 + +/* + * The below constant defines amount of dirty pages which should be written + * back at when trying to shrink the liability. + */ +#define NR_TO_WRITE 16 + +/** + * shrink_liability - write-back some dirty pages/inodes. + * @c: UBIFS file-system description object + * @nr_to_write: how many dirty pages to write-back + * + * This function shrinks UBIFS liability by means of writing back some amount + * of dirty inodes and their pages. + * + * Note, this function synchronizes even VFS inodes which are locked + * (@i_mutex) by the caller of the budgeting function, because write-back does + * not touch @i_mutex. + */ +static void shrink_liability(struct ubifs_info *c, int nr_to_write) +{ + down_read(&c->vfs_sb->s_umount); + writeback_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); +} + +/** + * run_gc - run garbage collector. + * @c: UBIFS file-system description object + * + * This function runs garbage collector to make some more free space. Returns + * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a + * negative error code in case of failure. 
+ */ +static int run_gc(struct ubifs_info *c) +{ + int err, lnum; + + /* Make some free space by garbage-collecting dirty space */ + down_read(&c->commit_sem); + lnum = ubifs_garbage_collect(c, 1); + up_read(&c->commit_sem); + if (lnum < 0) + return lnum; + + /* GC freed one LEB, return it to lprops */ + dbg_budg("GC freed LEB %d", lnum); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + return 0; +} + +/** + * get_liability - calculate current liability. + * @c: UBIFS file-system description object + * + * This function calculates and returns current UBIFS liability, i.e. the + * amount of bytes UBIFS has "promised" to write to the media. + */ +static long long get_liability(struct ubifs_info *c) +{ + long long liab; + + spin_lock(&c->space_lock); + liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; + spin_unlock(&c->space_lock); + return liab; +} + +/** + * make_free_space - make more free space on the file-system. + * @c: UBIFS file-system description object + * + * This function is called when an operation cannot be budgeted because there + * is supposedly no free space. But in most cases there is some free space: + * o budgeting is pessimistic, so it always budgets more than it is actually + * needed, so shrinking the liability is one way to make free space - the + * cached data will take less space then it was budgeted for; + * o GC may turn some dark space into free space (budgeting treats dark space + * as not available); + * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. + * + * So this function tries to do the above. Returns %-EAGAIN if some free space + * was presumably made and the caller has to re-try budgeting the operation. + * Returns %-ENOSPC if it couldn't do more free space, and other negative error + * codes on failures. + */ +static int make_free_space(struct ubifs_info *c) +{ + int err, retries = 0; + long long liab1, liab2; + + do { + liab1 = get_liability(c); + /* + * We probably have some dirty pages or inodes (liability), try + * to write them back. + */ + dbg_budg("liability %lld, run write-back", liab1); + shrink_liability(c, NR_TO_WRITE); + + liab2 = get_liability(c); + if (liab2 < liab1) + return -EAGAIN; + + dbg_budg("new liability %lld (not shrunk)", liab2); + + /* Liability did not shrink again, try GC */ + dbg_budg("Run GC"); + err = run_gc(c); + if (!err) + return -EAGAIN; + + if (err != -EAGAIN && err != -ENOSPC) + /* Some real error happened */ + return err; + + dbg_budg("Run commit (retries %d)", retries); + err = ubifs_run_commit(c); + if (err) + return err; + } while (retries++ < MAX_MKSPC_RETRIES); + + return -ENOSPC; +} + +/** + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. + * @c: UBIFS file-system description object + * + * This function calculates and returns the number of LEBs which should be kept + * for index usage. + */ +int ubifs_calc_min_idx_lebs(struct ubifs_info *c) +{ + int idx_lebs; + long long idx_size; + + idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; + /* And make sure we have thrice the index size of space reserved */ + idx_size += idx_size << 1; + /* + * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' + * pair, nor similarly the two variables for the new index size, so we + * have to do this costly 64-bit division on fast-path. + */ + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); + /* + * The index head is not available for the in-the-gaps method, so add an + * extra LEB to compensate. 
+ */ + idx_lebs += 1; + if (idx_lebs < MIN_INDEX_LEBS) + idx_lebs = MIN_INDEX_LEBS; + return idx_lebs; +} + +/** + * ubifs_calc_available - calculate available FS space. + * @c: UBIFS file-system description object + * @min_idx_lebs: minimum number of LEBs reserved for the index + * + * This function calculates and returns amount of FS space available for use. + */ +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) +{ + int subtract_lebs; + long long available; + + available = c->main_bytes - c->lst.total_used; + + /* + * Now 'available' contains theoretically available flash space + * assuming there is no index, so we have to subtract the space which + * is reserved for the index. + */ + subtract_lebs = min_idx_lebs; + + /* Take into account that GC reserves one LEB for its own needs */ + subtract_lebs += 1; + + /* + * The GC journal head LEB is not really accessible. And since + * different write types go to different heads, we may count only on + * one head's space. + */ + subtract_lebs += c->jhead_cnt - 1; + + /* We also reserve one LEB for deletions, which bypass budgeting */ + subtract_lebs += 1; + + available -= (long long)subtract_lebs * c->leb_size; + + /* Subtract the dead space which is not available for use */ + available -= c->lst.total_dead; + + /* + * Subtract dark space, which might or might not be usable - it depends + * on the data which we have on the media and which will be written. If + * this is a lot of uncompressed or not-compressible data, the dark + * space cannot be used. + */ + available -= c->lst.total_dark; + + /* + * However, there is more dark space. The index may be bigger than + * @min_idx_lebs. Those extra LEBs are assumed to be available, but + * their dark space is not included in total_dark, so it is subtracted + * here. + */ + if (c->lst.idx_lebs > min_idx_lebs) { + subtract_lebs = c->lst.idx_lebs - min_idx_lebs; + available -= subtract_lebs * c->dark_wm; + } + + /* The calculations are rough and may end up with a negative number */ + return available > 0 ? available : 0; +} + +/** + * can_use_rp - check whether the user is allowed to use reserved pool. + * @c: UBIFS file-system description object + * + * UBIFS has so-called "reserved pool" which is flash space reserved + * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock. + * This function checks whether current user is allowed to use reserved pool. + * Returns %1 current user is allowed to use reserved pool and %0 otherwise. + */ +static int can_use_rp(struct ubifs_info *c) +{ + if (current_fsuid() == c->rp_uid || capable(CAP_SYS_RESOURCE) || + (c->rp_gid != 0 && in_group_p(c->rp_gid))) + return 1; + return 0; +} + +/** + * do_budget_space - reserve flash space for index and data growth. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. + * + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index + * would take if it was consolidated and written to the flash. This guarantees + * that the "in-the-gaps" commit method always succeeds and UBIFS will always + * be able to commit dirty index. So this function basically adds amount of + * budgeted index space to the size of the current index, multiplies this by 3, + * and makes sure this does not exceed the amount of free LEBs. + * + * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: + * o @c->lst.idx_lebs is the number of LEBs the index currently uses. 
It might + * be large, because UBIFS does not do any index consolidation as long as + * there is free space. IOW, the index may take a lot of LEBs, but the LEBs + * will contain a lot of dirt. + * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. + * + * This function returns zero in case of success, and %-ENOSPC in case of + * failure. + */ +static int do_budget_space(struct ubifs_info *c) +{ + long long outstanding, available; + int lebs, rsvd_idx_lebs, min_idx_lebs; + + /* First budget index space */ + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* Now 'min_idx_lebs' contains number of LEBs to reserve */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + /* + * The number of LEBs that are available to be used by the index is: + * + * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - + * @c->lst.taken_empty_lebs + * + * @c->lst.empty_lebs are available because they are empty. + * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. + * + * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. + */ + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (unlikely(rsvd_idx_lebs > lebs)) { + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " + "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, + rsvd_idx_lebs); + return -ENOSPC; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + + if (unlikely(available < outstanding)) { + dbg_budg("out of data space: available %lld, outstanding %lld", + available, outstanding); + return -ENOSPC; + } + + if (available - outstanding <= c->rp_size && !can_use_rp(c)) + return -ENOSPC; + + c->bi.min_idx_lebs = min_idx_lebs; + return 0; +} + +/** + * calc_idx_growth - calculate approximate index growth from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + * + * For now we assume each new node adds one znode. But this is rather poor + * approximation, though. + */ +static int calc_idx_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int znodes; + + znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + + req->new_dent; + return znodes * c->max_idx_node_sz; +} + +/** + * calc_data_growth - calculate approximate amount of new data from budgeting + * request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_data_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int data_growth; + + data_growth = req->new_ino ? 
c->bi.inode_budget : 0; + if (req->new_page) + data_growth += c->bi.page_budget; + if (req->new_dent) + data_growth += c->bi.dent_budget; + data_growth += req->new_ino_d; + return data_growth; +} + +/** + * calc_dd_growth - calculate approximate amount of data which makes other data + * dirty from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_dd_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int dd_growth; + + dd_growth = req->dirtied_page ? c->bi.page_budget : 0; + + if (req->dirtied_ino) + dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); + if (req->mod_dent) + dd_growth += c->bi.dent_budget; + dd_growth += req->dirtied_ino_d; + return dd_growth; +} + +/** + * ubifs_budget_space - ensure there is enough space to complete an operation. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function allocates budget for an operation. It uses pessimistic + * approximation of how much flash space the operation needs. The goal of this + * function is to make sure UBIFS always has flash space to flush all dirty + * pages, dirty inodes, and dirty znodes (liability). This function may force + * commit, garbage-collection or write-back. Returns zero in case of success, + * %-ENOSPC if there is no free space and other negative error codes in case of + * failures. + */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); + int err, idx_growth, data_growth, dd_growth, retried = 0; + + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + + data_growth = calc_data_growth(c, req); + dd_growth = calc_dd_growth(c, req); + if (!data_growth && !dd_growth) + return 0; + idx_growth = calc_idx_growth(c, req); + +again: + spin_lock(&c->space_lock); + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + + if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { + dbg_budg("no space"); + spin_unlock(&c->space_lock); + return -ENOSPC; + } + + c->bi.idx_growth += idx_growth; + c->bi.data_growth += data_growth; + c->bi.dd_growth += dd_growth; + + err = do_budget_space(c); + if (likely(!err)) { + req->idx_growth = idx_growth; + req->data_growth = data_growth; + req->dd_growth = dd_growth; + spin_unlock(&c->space_lock); + return 0; + } + + /* Restore the old values */ + c->bi.idx_growth -= idx_growth; + c->bi.data_growth -= data_growth; + c->bi.dd_growth -= dd_growth; + spin_unlock(&c->space_lock); + + if (req->fast) { + dbg_budg("no space for fast budgeting"); + return err; + } + + err = make_free_space(c); + cond_resched(); + if (err == -EAGAIN) { + dbg_budg("try again"); + goto again; + } else if (err == -ENOSPC) { + if (!retried) { + retried = 1; + dbg_budg("-ENOSPC, but anyway try once again"); + goto again; + } + dbg_budg("FS is full, -ENOSPC"); + c->bi.nospace = 1; + if (can_use_rp(c) || c->rp_size == 0) + c->bi.nospace_rp = 1; + smp_wmb(); + } else + ubifs_err("cannot budget space, error %d", err); + return err; +} + +/** + * ubifs_release_budget 
- release budgeted free space. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function releases the space budgeted by 'ubifs_budget_space()'. Note, + * since the index changes (which were budgeted for in @req->idx_growth) will + * only be written to the media on commit, this function moves the index budget + * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed + * by the commit operation. + */ +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + if (!req->recalculate) { + ubifs_assert(req->idx_growth >= 0); + ubifs_assert(req->data_growth >= 0); + ubifs_assert(req->dd_growth >= 0); + } + + if (req->recalculate) { + req->data_growth = calc_data_growth(c, req); + req->dd_growth = calc_dd_growth(c, req); + req->idx_growth = calc_idx_growth(c, req); + } + + if (!req->data_growth && !req->dd_growth) + return; + + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + + spin_lock(&c->space_lock); + c->bi.idx_growth -= req->idx_growth; + c->bi.uncommitted_idx += req->idx_growth; + c->bi.data_growth -= req->data_growth; + c->bi.dd_growth -= req->dd_growth; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->bi.idx_growth & 7)); + ubifs_assert(!(c->bi.data_growth & 7)); + ubifs_assert(!(c->bi.dd_growth & 7)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_convert_page_budget - convert budget of a new page. + * @c: UBIFS file-system description object + * + * This function converts budget which was allocated for a new page of data to + * the budget of changing an existing page of data. The latter is smaller than + * the former, so this function only does simple re-calculation and does not + * involve any write-back. + */ +void ubifs_convert_page_budget(struct ubifs_info *c) +{ + spin_lock(&c->space_lock); + /* Release the index growth reservation */ + c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + /* Release the data growth reservation */ + c->bi.data_growth -= c->bi.page_budget; + /* Increase the dirty data growth reservation instead */ + c->bi.dd_growth += c->bi.page_budget; + /* And re-calculate the indexing space reservation */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_release_dirty_inode_budget - release dirty inode budget. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to release the budget for + * + * This function releases budget corresponding to a dirty inode. It is usually + * called when after the inode has been written to the media and marked as + * clean. It also causes the "no space" flags to be cleared. 
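+ *
+ * (Illustrative note, inferred from the code below: the request built here
+ * mirrors what 'calc_dd_growth()' charges for a single dirtied inode, i.e.
+ * the inode budget plus the inode data length aligned to 8 bytes, so
+ * releasing it exactly undoes that charge.)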
+ */ +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui) +{ + struct ubifs_budget_req req; + + memset(&req, 0, sizeof(struct ubifs_budget_req)); + /* The "no space" flags will be cleared because dd_growth is > 0 */ + req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); + ubifs_release_budget(c, &req); +} + +/** + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexing nodes as well. This introduces additional + * overhead, and UBIFS has to report slightly less free space to meet the above + * expectations. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, long long free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reserves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + return div_u64(free, divisor); +} + +/** + * ubifs_get_free_space_nolock - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. 
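+ *
+ * As a rough, illustrative sketch (hypothetical geometry, not measured on
+ * any particular flash): with 4096-byte blocks, a data node header of a few
+ * dozen bytes and roughly 150 bytes of tripled indexing overhead per data
+ * node, 'ubifs_reported_space()' scales free space by about 4096 / 4300,
+ * i.e. the user is shown roughly 95% of the raw figure.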
+ */ +long long ubifs_get_free_space_nolock(struct ubifs_info *c) +{ + int rsvd_idx_lebs, lebs; + long long available, outstanding, free; + + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->bi.data_growth + c->bi.dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); + + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates and returns amount of free space to report to + * user-space. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + long long free; + + spin_lock(&c->space_lock); + free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); + + return free; +} diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c new file mode 100644 index 00000000..87cd0ead --- /dev/null +++ b/fs/ubifs/commit.c @@ -0,0 +1,738 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions that manage the running of the commit process. + * Each affected module has its own functions to accomplish their part in the + * commit and those functions are called here. + * + * The commit is the process whereby all updates to the index and LEB properties + * are written out together and the journal becomes empty. This keeps the + * file system consistent - at all times the state can be recreated by reading + * the index and LEB properties and then replaying the journal. + * + * The commit is split into two parts named "commit start" and "commit end". + * During commit start, the commit process has exclusive access to the journal + * by holding the commit semaphore down for writing. As few I/O operations as + * possible are performed during commit start, instead the nodes that are to be + * written are merely identified. 
During commit end, the commit semaphore is no + * longer held and the journal is again in operation, allowing users to continue + * to use the file system while the bulk of the commit I/O is performed. The + * purpose of this two-step approach is to prevent the commit from causing any + * latency blips. Note that in any case, the commit does not prevent lookups + * (as permitted by the TNC mutex), or access to VFS data structures e.g. page + * cache. + */ + +#include +#include +#include +#include "ubifs.h" + +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + +/** + * do_commit - commit the journal. + * @c: UBIFS file-system description object + * + * This function implements UBIFS commit. It has to be called with commit lock + * locked. Returns zero in case of success and a negative error code in case of + * failure. 
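+ *
+ * In outline (a descriptive summary of the body below): the write-buffers
+ * are synchronized and the various *_start_commit() calls run with
+ * @c->commit_sem held for writing, doing as little I/O as possible; the
+ * semaphore is then dropped and the bulk of the I/O is performed by the
+ * *_end_commit() calls; finally the master node is updated to point at the
+ * new index root, log tail and LPT position, and written out.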
+ */ +static int do_commit(struct ubifs_info *c) +{ + int err, new_ltail_lnum, old_ltail_lnum, i; + struct ubifs_zbranch zroot; + struct ubifs_lp_stats lst; + + dbg_cmt("start"); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (c->ro_error) { + err = -EROFS; + goto out_up; + } + + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + + /* Sync all write buffers (necessary for recovery) */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + goto out_up; + } + + c->cmt_no += 1; + err = ubifs_gc_start_commit(c); + if (err) + goto out_up; + err = dbg_check_lprops(c); + if (err) + goto out_up; + err = ubifs_log_start_commit(c, &new_ltail_lnum); + if (err) + goto out_up; + err = ubifs_tnc_start_commit(c, &zroot); + if (err) + goto out_up; + err = ubifs_lpt_start_commit(c); + if (err) + goto out_up; + err = ubifs_orphan_start_commit(c); + if (err) + goto out_up; + + ubifs_get_lp_stats(c, &lst); + + up_write(&c->commit_sem); + + err = ubifs_tnc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_end_commit(c); + if (err) + goto out; + err = ubifs_orphan_end_commit(c); + if (err) + goto out; + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; + + mutex_lock(&c->mst_mutex); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); + c->mst_node->root_offs = cpu_to_le32(zroot.offs); + c->mst_node->root_len = cpu_to_le32(zroot.len); + c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); + c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); + c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); + c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); + c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); + c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); + c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs); + c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum); + c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs); + c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum); + c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs); + c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum); + c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs); + c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs); + c->mst_node->total_free = cpu_to_le64(lst.total_free); + c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty); + c->mst_node->total_used = cpu_to_le64(lst.total_used); + c->mst_node->total_dead = cpu_to_le64(lst.total_dead); + c->mst_node->total_dark = cpu_to_le64(lst.total_dark); + if (c->no_orphs) + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + err = ubifs_write_master(c); + mutex_unlock(&c->mst_mutex); + if (err) + goto out; + + err = ubifs_log_post_commit(c, old_ltail_lnum); + if (err) + goto out; + err = ubifs_gc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_post_commit(c); + if (err) + goto out; + +out_cancel: + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_RESTING; + wake_up(&c->cmt_wq); + dbg_cmt("commit end"); + spin_unlock(&c->cs_lock); + return 0; + +out_up: + up_write(&c->commit_sem); +out: + ubifs_err("commit failed, error %d", err); + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_BROKEN; + wake_up(&c->cmt_wq); + spin_unlock(&c->cs_lock); + ubifs_ro_mode(c, err); + return err; +} + +/** + * run_bg_commit - 
run background commit if it is needed. + * @c: UBIFS file-system description object + * + * This function runs background commit if it is needed. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int run_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + /* + * Run background commit only if background commit was requested or if + * commit is required. + */ + if (c->cmt_state != COMMIT_BACKGROUND && + c->cmt_state != COMMIT_REQUIRED) + goto out; + spin_unlock(&c->cs_lock); + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_REQUIRED) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + else if (c->cmt_state == COMMIT_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_BACKGROUND; + else + goto out_cmt_unlock; + spin_unlock(&c->cs_lock); + + return do_commit(c); + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return 0; +} + +/** + * ubifs_bg_thread - UBIFS background thread function. + * @info: points to the file-system description object + * + * This function implements various file-system background activities: + * o when a write-buffer timer expires it synchronizes the appropriate + * write-buffer; + * o when the journal is about to be full, it starts in-advance commit. + * + * Note, other stuff like background garbage collection may be added here in + * future. + */ +int ubifs_bg_thread(void *info) +{ + int err; + struct ubifs_info *c = info; + + dbg_msg("background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); + set_freezable(); + + while (1) { + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + /* Check if there is something to do */ + if (!c->need_bgt) { + /* + * Nothing prevents us from going sleep now and + * be never woken up and block the task which + * could wait in 'kthread_stop()' forever. + */ + if (kthread_should_stop()) + break; + schedule(); + continue; + } else + __set_current_state(TASK_RUNNING); + + c->need_bgt = 0; + err = ubifs_bg_wbufs_sync(c); + if (err) + ubifs_ro_mode(c, err); + + run_bg_commit(c); + cond_resched(); + } + + dbg_msg("background thread \"%s\" stops", c->bgt_name); + return 0; +} + +/** + * ubifs_commit_required - set commit state to "required". + * @c: UBIFS file-system description object + * + * This function is called if a commit is required but cannot be done from the + * calling function, so it is just flagged instead. + */ +void ubifs_commit_required(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + switch (c->cmt_state) { + case COMMIT_RESTING: + case COMMIT_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_REQUIRED)); + c->cmt_state = COMMIT_REQUIRED; + break; + case COMMIT_RUNNING_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_RUNNING_REQUIRED)); + c->cmt_state = COMMIT_RUNNING_REQUIRED; + break; + case COMMIT_REQUIRED: + case COMMIT_RUNNING_REQUIRED: + case COMMIT_BROKEN: + break; + } + spin_unlock(&c->cs_lock); +} + +/** + * ubifs_request_bg_commit - notify the background thread to do a commit. + * @c: UBIFS file-system description object + * + * This function is called if the journal is full enough to make a commit + * worthwhile, so background thread is kicked to start it. 
+ */ +void ubifs_request_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_RESTING) { + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_BACKGROUND)); + c->cmt_state = COMMIT_BACKGROUND; + spin_unlock(&c->cs_lock); + ubifs_wake_up_bgt(c); + } else + spin_unlock(&c->cs_lock); +} + +/** + * wait_for_commit - wait for commit. + * @c: UBIFS file-system description object + * + * This function sleeps until the commit operation is no longer running. + */ +static int wait_for_commit(struct ubifs_info *c) +{ + dbg_cmt("pid %d goes sleep", current->pid); + + /* + * The following sleeps if the condition is false, and will be woken + * when the commit ends. It is possible, although very unlikely, that we + * will wake up and see the subsequent commit running, rather than the + * one we were waiting for, and go back to sleep. However, we will be + * woken again, so there is no danger of sleeping forever. + */ + wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND && + c->cmt_state != COMMIT_RUNNING_REQUIRED); + dbg_cmt("commit finished, pid %d woke up", current->pid); + return 0; +} + +/** + * ubifs_run_commit - run or wait for commit. + * @c: UBIFS file-system description object + * + * This function runs commit and returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_run_commit(struct ubifs_info *c) +{ + int err = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + /* + * We set the commit state to 'running required' to indicate + * that we want it to complete as quickly as possible. + */ + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + spin_unlock(&c->cs_lock); + + /* Ok, the commit is indeed needed */ + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + /* + * Since we unlocked 'c->cs_lock', the state may have changed, so + * re-check it. + */ + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out_cmt_unlock; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + up_write(&c->commit_sem); + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + c->cmt_state = COMMIT_RUNNING_REQUIRED; + spin_unlock(&c->cs_lock); + + err = do_commit(c); + return err; + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return err; +} + +/** + * ubifs_gc_should_commit - determine if it is time for GC to run commit. + * @c: UBIFS file-system description object + * + * This function is called by garbage collection to determine if commit should + * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal + * is full enough to start commit, this function returns true. It is not + * absolutely necessary to commit yet, but it feels like this should be better + * then to keep doing GC. This function returns %1 if GC has to initiate commit + * and %0 if not. 
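+ *
+ * (Descriptive note on the code below: when the state is %COMMIT_BACKGROUND
+ * it is promoted to %COMMIT_REQUIRED before returning %1, so the commit
+ * which GC then triggers is recorded as mandatory rather than merely
+ * requested in the background.)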
+ */ +int ubifs_gc_should_commit(struct ubifs_info *c) +{ + int ret = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BACKGROUND) { + dbg_cmt("commit required now"); + c->cmt_state = COMMIT_REQUIRED; + } else + dbg_cmt("commit not requested"); + if (c->cmt_state == COMMIT_REQUIRED) + ret = 1; + spin_unlock(&c->cs_lock); + return ret; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * struct idx_node - hold index nodes during index tree traversal. + * @list: list + * @iip: index in parent (slot number of this indexing node in the parent + * indexing node) + * @upper_key: all keys in this indexing node have to be less or equivalent to + * this key + * @idx: index node (8-byte aligned because all node structures must be 8-byte + * aligned) + */ +struct idx_node { + struct list_head list; + int iip; + union ubifs_key upper_key; + struct ubifs_idx_node idx __attribute__((aligned(8))); +}; + +/** + * dbg_old_index_check_init - get information for the next old index check. + * @c: UBIFS file-system description object + * @zroot: root of the index + * + * This function records information about the index that will be needed for the + * next old index check i.e. 'dbg_check_old_index()'. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + struct ubifs_idx_node *idx; + int lnum, offs, len, err = 0; + struct ubifs_debug_info *d = c->dbg; + + d->old_zroot = *zroot; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out; + + d->old_zroot_level = le16_to_cpu(idx->level); + d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); +out: + kfree(idx); + return err; +} + +/** + * dbg_check_old_index - check the old copy of the index. + * @c: UBIFS file-system description object + * @zroot: root of the new index + * + * In order to be able to recover from an unclean unmount, a complete copy of + * the index must exist on flash. This is the "old" index. The commit process + * must write the "new" index to flash without overwriting or destroying any + * part of the old index. This function is run at commit end in order to check + * that the old index does indeed exist completely intact. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int first = 1, iip; + struct ubifs_debug_info *d = c->dbg; + union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key; + unsigned long long uninitialized_var(last_sqnum); + struct ubifs_idx_node *idx; + struct list_head list; + struct idx_node *i; + size_t sz; + + if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + return 0; + + INIT_LIST_HEAD(&list); + + sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) - + UBIFS_IDX_NODE_SZ; + + /* Start at the old zroot */ + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + iip = 0; + + /* + * Traverse the index tree preorder depth-first i.e. do a node and then + * its subtrees from left to right. 
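+ *
+ * In outline (a description of the loop below, not extra behaviour): the
+ * index nodes on the path from the root to the current node are kept on
+ * 'list'; once a level-zero node has been checked, entries are popped off
+ * the tail of the list until one with an unvisited sibling to the right is
+ * found, and traversal continues from that sibling.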
+ */ + while (1) { + struct ubifs_branch *br; + + /* Get the next index node */ + i = kmalloc(sz, GFP_NOFS); + if (!i) { + err = -ENOMEM; + goto out_free; + } + i->iip = iip; + /* Keep the index nodes on our path in a linked list */ + list_add_tail(&i->list, &list); + /* Read the index node */ + idx = &i->idx; + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out_free; + /* Validate index node */ + child_cnt = le16_to_cpu(idx->child_cnt); + if (child_cnt < 1 || child_cnt > c->fanout) { + err = 1; + goto out_dump; + } + if (first) { + first = 0; + /* Check root level and sqnum */ + if (le16_to_cpu(idx->level) != d->old_zroot_level) { + err = 2; + goto out_dump; + } + if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) { + err = 3; + goto out_dump; + } + /* Set last values as though root had a parent */ + last_level = le16_to_cpu(idx->level) + 1; + last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1; + key_read(c, ubifs_idx_key(c, idx), &lower_key); + highest_ino_key(c, &upper_key, INUM_WATERMARK); + } + key_copy(c, &upper_key, &i->upper_key); + if (le16_to_cpu(idx->level) != last_level - 1) { + err = 3; + goto out_dump; + } + /* + * The index is always written bottom up hence a child's sqnum + * is always less than the parents. + */ + if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) { + err = 4; + goto out_dump; + } + /* Check key range */ + key_read(c, ubifs_idx_key(c, idx), &l_key); + br = ubifs_idx_branch(c, idx, child_cnt - 1); + key_read(c, &br->key, &u_key); + if (keys_cmp(c, &lower_key, &l_key) > 0) { + err = 5; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) < 0) { + err = 6; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) == 0) + if (!is_hash_key(c, &u_key)) { + err = 7; + goto out_dump; + } + /* Go to next index node */ + if (le16_to_cpu(idx->level) == 0) { + /* At the bottom, so go up until can go right */ + while (1) { + /* Drop the bottom of the list */ + list_del(&i->list); + kfree(i); + /* No more list means we are done */ + if (list_empty(&list)) + goto out; + /* Look at the new bottom */ + i = list_entry(list.prev, struct idx_node, + list); + idx = &i->idx; + /* Can we go right */ + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + iip = iip + 1; + break; + } else + /* Nope, so go up again */ + iip = i->iip; + } + } else + /* Go down left */ + iip = 0; + /* + * We have the parent in 'idx' and now we set up for reading the + * child pointed to by slot 'iip'. 
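+ *
+ * The child's expected key range is recorded here as well: its lower bound
+ * is the key in slot 'iip' and its upper bound is the key in slot 'iip' + 1,
+ * or the parent's own upper bound when 'iip' is the last slot. The range is
+ * then verified at the top of the loop on the next iteration.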
+ */ + last_level = le16_to_cpu(idx->level); + last_sqnum = le64_to_cpu(idx->ch.sqnum); + br = ubifs_idx_branch(c, idx, iip); + lnum = le32_to_cpu(br->lnum); + offs = le32_to_cpu(br->offs); + len = le32_to_cpu(br->len); + key_read(c, &br->key, &lower_key); + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + br = ubifs_idx_branch(c, idx, iip + 1); + key_read(c, &br->key, &upper_key); + } else + key_copy(c, &i->upper_key, &upper_key); + } +out: + err = dbg_old_index_check_init(c, zroot); + if (err) + goto out_free; + + return 0; + +out_dump: + dbg_err("dumping index node (iip=%d)", i->iip); + dbg_dump_node(c, idx); + list_del(&i->list); + kfree(i); + if (!list_empty(&list)) { + i = list_entry(list.prev, struct idx_node, list); + dbg_err("dumping parent index node"); + dbg_dump_node(c, &i->idx); + } +out_free: + while (!list_empty(&list)) { + i = list_entry(list.next, struct idx_node, list); + list_del(&i->list); + kfree(i); + } + ubifs_err("failed, error %d", err); + if (err > 0) + err = -EINVAL; + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c new file mode 100644 index 00000000..11e4132f --- /dev/null +++ b/fs/ubifs/compress.c @@ -0,0 +1,251 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + * Zoltan Sogor + */ + +/* + * This file provides a single place to access to compression and + * decompression. + */ + +#include +#include "ubifs.h" + +/* Fake description object for the "none" compressor */ +static struct ubifs_compressor none_compr = { + .compr_type = UBIFS_COMPR_NONE, + .name = "none", + .capi_name = "", +}; + +#ifdef CONFIG_UBIFS_FS_LZO +static DEFINE_MUTEX(lzo_mutex); + +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .comp_mutex = &lzo_mutex, + .name = "lzo", + .capi_name = "lzo", +}; +#else +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .name = "lzo", +}; +#endif + +#ifdef CONFIG_UBIFS_FS_ZLIB +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); + +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .comp_mutex = &deflate_mutex, + .decomp_mutex = &inflate_mutex, + .name = "zlib", + .capi_name = "deflate", +}; +#else +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .name = "zlib", +}; +#endif + +/* All UBIFS compressors */ +struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/** + * ubifs_compress - compress data. 
+ * @in_buf: data to compress + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input buffer @in_buf of length @in_len and stores + * the result in the output buffer @out_buf and the resulting length in + * @out_len. If the input buffer does not compress, it is just copied to the + * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if + * compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + */ +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + if (compr->comp_mutex) + mutex_lock(compr->comp_mutex); + err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->comp_mutex) + mutex_unlock(compr->comp_mutex); + if (unlikely(err)) { + ubifs_warn("cannot compress %d bytes, compressor %s, " + "error %d, leave data uncompressed", + in_len, compr->name, err); + goto no_compr; + } + + /* + * If the data compressed only slightly, it is better to leave it + * uncompressed to improve read speed. + */ + if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) + goto no_compr; + + return; + +no_compr: + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +/** + * ubifs_decompress - decompress data. + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This functions + * returns %0 on success or a negative error code on failure. + */ +int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, + int *out_len, int compr_type) +{ + int err; + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err("invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err("%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + return 0; + } + + if (compr->decomp_mutex) + mutex_lock(compr->decomp_mutex); + err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->decomp_mutex) + mutex_unlock(compr->decomp_mutex); + if (err) + ubifs_err("cannot decompress %d bytes, compressor %s, " + "error %d", in_len, compr->name, err); + + return err; +} + +/** + * compr_init - initialize a compressor. + * @compr: compressor description object + * + * This function initializes the requested compressor and returns zero in case + * of success or a negative error code in case of failure. 
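+ *
+ * (Note: the compressor is looked up through the kernel crypto API by its
+ * 'capi_name'. Compressors which were not compiled in leave 'capi_name'
+ * unset, which is how 'ubifs_decompress()' detects them, while the "none"
+ * compressor is registered directly in 'ubifs_compressors_init()' without
+ * allocating a crypto transform.)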
+ */ +static int __init compr_init(struct ubifs_compressor *compr) +{ + if (compr->capi_name) { + compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + if (IS_ERR(compr->cc)) { + ubifs_err("cannot initialize compressor %s, error %ld", + compr->name, PTR_ERR(compr->cc)); + return PTR_ERR(compr->cc); + } + } + + ubifs_compressors[compr->compr_type] = compr; + return 0; +} + +/** + * compr_exit - de-initialize a compressor. + * @compr: compressor description object + */ +static void compr_exit(struct ubifs_compressor *compr) +{ + if (compr->capi_name) + crypto_free_comp(compr->cc); + return; +} + +/** + * ubifs_compressors_init - initialize UBIFS compressors. + * + * This function initializes the compressor which were compiled in. Returns + * zero in case of success and a negative error code in case of failure. + */ +int __init ubifs_compressors_init(void) +{ + int err; + + err = compr_init(&lzo_compr); + if (err) + return err; + + err = compr_init(&zlib_compr); + if (err) + goto out_lzo; + + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; + return 0; + +out_lzo: + compr_exit(&lzo_compr); + return err; +} + +/** + * ubifs_compressors_exit - de-initialize UBIFS compressors. + */ +void ubifs_compressors_exit(void) +{ + compr_exit(&lzo_compr); + compr_exit(&zlib_compr); +} diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c new file mode 100644 index 00000000..0bb2bcef --- /dev/null +++ b/fs/ubifs/debug.c @@ -0,0 +1,2933 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements most of the debugging stuff which is compiled in only + * when it is enabled. But some debugging check functions are implemented in + * corresponding subsystem, just because they are closely related and utilize + * various local functions of those subsystems. 
+ */ + +#define UBIFS_DBG_PRESERVE_UBI + +#include "ubifs.h" +#include +#include +#include +#include + +#ifdef CONFIG_UBIFS_FS_DEBUG + +DEFINE_SPINLOCK(dbg_lock); + +static char dbg_key_buf0[128]; +static char dbg_key_buf1[128]; + +unsigned int ubifs_chk_flags; +unsigned int ubifs_tst_flags; + +module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); + +MODULE_PARM_DESC(debug_chks, "Debug check flags"); +MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); + +static const char *get_key_fmt(int fmt) +{ + switch (fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return "simple"; + default: + return "unknown/invalid format"; + } +} + +static const char *get_key_hash(int hash) +{ + switch (hash) { + case UBIFS_KEY_HASH_R5: + return "R5"; + case UBIFS_KEY_HASH_TEST: + return "test"; + default: + return "unknown/invalid name hash"; + } +} + +static const char *get_key_type(int type) +{ + switch (type) { + case UBIFS_INO_KEY: + return "inode"; + case UBIFS_DENT_KEY: + return "direntry"; + case UBIFS_XENT_KEY: + return "xentry"; + case UBIFS_DATA_KEY: + return "data"; + case UBIFS_TRUN_KEY: + return "truncate"; + default: + return "unknown/invalid key"; + } +} + +static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, + char *buffer) +{ + char *p = buffer; + int type = key_type(c, key); + + if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { + switch (type) { + case UBIFS_INO_KEY: + sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + sprintf(p, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); + break; + case UBIFS_DATA_KEY: + sprintf(p, "(%lu, %s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); + break; + case UBIFS_TRUN_KEY: + sprintf(p, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + default: + sprintf(p, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); + } + } else + sprintf(p, "bad key format %d", c->key_fmt); +} + +const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf0); + return dbg_key_buf0; +} + +const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf1); + return dbg_key_buf1; +} + +const char *dbg_ntype(int type) +{ + switch (type) { + case UBIFS_PAD_NODE: + return "padding node"; + case UBIFS_SB_NODE: + return "superblock node"; + case UBIFS_MST_NODE: + return "master node"; + case UBIFS_REF_NODE: + return "reference node"; + case UBIFS_INO_NODE: + return "inode node"; + case UBIFS_DENT_NODE: + return "direntry node"; + case UBIFS_XENT_NODE: + return "xentry node"; + case UBIFS_DATA_NODE: + return "data node"; + case UBIFS_TRUN_NODE: + return "truncate node"; + case UBIFS_IDX_NODE: + return "indexing node"; + case UBIFS_CS_NODE: + return "commit start node"; + case UBIFS_ORPH_NODE: + return "orphan node"; + default: + return "unknown node"; + } +} + +static const char *dbg_gtype(int type) +{ + switch (type) { + case UBIFS_NO_NODE_GROUP: + return "no node group"; + case UBIFS_IN_NODE_GROUP: + return "in node group"; + case UBIFS_LAST_OF_NODE_GROUP: + return "last of node group"; + default: + return "unknown"; + } +} + +const char *dbg_cstate(int cmt_state) +{ + switch (cmt_state) { + case 
COMMIT_RESTING: + return "commit resting"; + case COMMIT_BACKGROUND: + return "background commit requested"; + case COMMIT_REQUIRED: + return "commit required"; + case COMMIT_RUNNING_BACKGROUND: + return "BACKGROUND commit running"; + case COMMIT_RUNNING_REQUIRED: + return "commit running and required"; + case COMMIT_BROKEN: + return "broken commit"; + default: + return "unknown commit state"; + } +} + +const char *dbg_jhead(int jhead) +{ + switch (jhead) { + case GCHD: + return "0 (GC)"; + case BASEHD: + return "1 (base)"; + case DATAHD: + return "2 (data)"; + default: + return "unknown journal head"; + } +} + +static void dump_ch(const struct ubifs_ch *ch) +{ + printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); + printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); + printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + dbg_ntype(ch->node_type)); + printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + dbg_gtype(ch->group_type)); + printk(KERN_DEBUG "\tsqnum %llu\n", + (unsigned long long)le64_to_cpu(ch->sqnum)); + printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); +} + +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +{ + const struct ubifs_inode *ui = ubifs_inode(inode); + + printk(KERN_DEBUG "Dump in-memory inode:"); + printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)i_size_read(inode)); + printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "\tatime %u.%u\n", + (unsigned int)inode->i_atime.tv_sec, + (unsigned int)inode->i_atime.tv_nsec); + printk(KERN_DEBUG "\tmtime %u.%u\n", + (unsigned int)inode->i_mtime.tv_sec, + (unsigned int)inode->i_mtime.tv_nsec); + printk(KERN_DEBUG "\tctime %u.%u\n", + (unsigned int)inode->i_ctime.tv_sec, + (unsigned int)inode->i_ctime.tv_nsec); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); + printk(KERN_DEBUG "\txattr %u\n", ui->xattr); + printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); + printk(KERN_DEBUG "\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + printk(KERN_DEBUG "\tui_size %llu\n", + (unsigned long long)ui->ui_size); + printk(KERN_DEBUG "\tflags %d\n", ui->flags); + printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); +} + +void dbg_dump_node(const struct ubifs_info *c, const void *node) +{ + int i, n; + union ubifs_key key; + const struct ubifs_ch *ch = node; + + if (dbg_failure_mode) + return; + + /* If the magic is incorrect, just hexdump the first bytes */ + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { + printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node, UBIFS_CH_SZ, 1); + return; + } + + spin_lock(&dbg_lock); + dump_ch(node); + + switch (ch->node_type) { + case UBIFS_PAD_NODE: + { + const struct ubifs_pad_node *pad = node; + + printk(KERN_DEBUG "\tpad_len %u\n", + le32_to_cpu(pad->pad_len)); + break; + } + case UBIFS_SB_NODE: + { + const struct 
ubifs_sb_node *sup = node; + unsigned int sup_flags = le32_to_cpu(sup->flags); + + printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + (int)sup->key_hash, get_key_hash(sup->key_hash)); + printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); + printk(KERN_DEBUG "\tflags %#x\n", sup_flags); + printk(KERN_DEBUG "\t big_lpt %u\n", + !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\t space_fixup %u\n", + !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); + printk(KERN_DEBUG "\tmin_io_size %u\n", + le32_to_cpu(sup->min_io_size)); + printk(KERN_DEBUG "\tleb_size %u\n", + le32_to_cpu(sup->leb_size)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(sup->leb_cnt)); + printk(KERN_DEBUG "\tmax_leb_cnt %u\n", + le32_to_cpu(sup->max_leb_cnt)); + printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); + printk(KERN_DEBUG "\tlog_lebs %u\n", + le32_to_cpu(sup->log_lebs)); + printk(KERN_DEBUG "\tlpt_lebs %u\n", + le32_to_cpu(sup->lpt_lebs)); + printk(KERN_DEBUG "\torph_lebs %u\n", + le32_to_cpu(sup->orph_lebs)); + printk(KERN_DEBUG "\tjhead_cnt %u\n", + le32_to_cpu(sup->jhead_cnt)); + printk(KERN_DEBUG "\tfanout %u\n", + le32_to_cpu(sup->fanout)); + printk(KERN_DEBUG "\tlsave_cnt %u\n", + le32_to_cpu(sup->lsave_cnt)); + printk(KERN_DEBUG "\tdefault_compr %u\n", + (int)le16_to_cpu(sup->default_compr)); + printk(KERN_DEBUG "\trp_size %llu\n", + (unsigned long long)le64_to_cpu(sup->rp_size)); + printk(KERN_DEBUG "\trp_uid %u\n", + le32_to_cpu(sup->rp_uid)); + printk(KERN_DEBUG "\trp_gid %u\n", + le32_to_cpu(sup->rp_gid)); + printk(KERN_DEBUG "\tfmt_version %u\n", + le32_to_cpu(sup->fmt_version)); + printk(KERN_DEBUG "\ttime_gran %u\n", + le32_to_cpu(sup->time_gran)); + printk(KERN_DEBUG "\tUUID %pUB\n", + sup->uuid); + break; + } + case UBIFS_MST_NODE: + { + const struct ubifs_mst_node *mst = node; + + printk(KERN_DEBUG "\thighest_inum %llu\n", + (unsigned long long)le64_to_cpu(mst->highest_inum)); + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long)le64_to_cpu(mst->cmt_no)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(mst->flags)); + printk(KERN_DEBUG "\tlog_lnum %u\n", + le32_to_cpu(mst->log_lnum)); + printk(KERN_DEBUG "\troot_lnum %u\n", + le32_to_cpu(mst->root_lnum)); + printk(KERN_DEBUG "\troot_offs %u\n", + le32_to_cpu(mst->root_offs)); + printk(KERN_DEBUG "\troot_len %u\n", + le32_to_cpu(mst->root_len)); + printk(KERN_DEBUG "\tgc_lnum %u\n", + le32_to_cpu(mst->gc_lnum)); + printk(KERN_DEBUG "\tihead_lnum %u\n", + le32_to_cpu(mst->ihead_lnum)); + printk(KERN_DEBUG "\tihead_offs %u\n", + le32_to_cpu(mst->ihead_offs)); + printk(KERN_DEBUG "\tindex_size %llu\n", + (unsigned long long)le64_to_cpu(mst->index_size)); + printk(KERN_DEBUG "\tlpt_lnum %u\n", + le32_to_cpu(mst->lpt_lnum)); + printk(KERN_DEBUG "\tlpt_offs %u\n", + le32_to_cpu(mst->lpt_offs)); + printk(KERN_DEBUG "\tnhead_lnum %u\n", + le32_to_cpu(mst->nhead_lnum)); + printk(KERN_DEBUG "\tnhead_offs %u\n", + le32_to_cpu(mst->nhead_offs)); + printk(KERN_DEBUG "\tltab_lnum %u\n", + le32_to_cpu(mst->ltab_lnum)); + printk(KERN_DEBUG "\tltab_offs %u\n", + le32_to_cpu(mst->ltab_offs)); + printk(KERN_DEBUG "\tlsave_lnum %u\n", + le32_to_cpu(mst->lsave_lnum)); + printk(KERN_DEBUG "\tlsave_offs %u\n", + le32_to_cpu(mst->lsave_offs)); + printk(KERN_DEBUG "\tlscan_lnum %u\n", + le32_to_cpu(mst->lscan_lnum)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(mst->leb_cnt)); + printk(KERN_DEBUG "\tempty_lebs %u\n", + le32_to_cpu(mst->empty_lebs)); + 
printk(KERN_DEBUG "\tidx_lebs %u\n", + le32_to_cpu(mst->idx_lebs)); + printk(KERN_DEBUG "\ttotal_free %llu\n", + (unsigned long long)le64_to_cpu(mst->total_free)); + printk(KERN_DEBUG "\ttotal_dirty %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dirty)); + printk(KERN_DEBUG "\ttotal_used %llu\n", + (unsigned long long)le64_to_cpu(mst->total_used)); + printk(KERN_DEBUG "\ttotal_dead %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dead)); + printk(KERN_DEBUG "\ttotal_dark %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dark)); + break; + } + case UBIFS_REF_NODE: + { + const struct ubifs_ref_node *ref = node; + + printk(KERN_DEBUG "\tlnum %u\n", + le32_to_cpu(ref->lnum)); + printk(KERN_DEBUG "\toffs %u\n", + le32_to_cpu(ref->offs)); + printk(KERN_DEBUG "\tjhead %u\n", + le32_to_cpu(ref->jhead)); + break; + } + case UBIFS_INO_NODE: + { + const struct ubifs_ino_node *ino = node; + + key_read(c, &ino->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + (unsigned long long)le64_to_cpu(ino->creat_sqnum)); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)le64_to_cpu(ino->size)); + printk(KERN_DEBUG "\tnlink %u\n", + le32_to_cpu(ino->nlink)); + printk(KERN_DEBUG "\tatime %lld.%u\n", + (long long)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + printk(KERN_DEBUG "\tmtime %lld.%u\n", + (long long)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + printk(KERN_DEBUG "\tctime %lld.%u\n", + (long long)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); + printk(KERN_DEBUG "\tuid %u\n", + le32_to_cpu(ino->uid)); + printk(KERN_DEBUG "\tgid %u\n", + le32_to_cpu(ino->gid)); + printk(KERN_DEBUG "\tmode %u\n", + le32_to_cpu(ino->mode)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(ino->flags)); + printk(KERN_DEBUG "\txattr_cnt %u\n", + le32_to_cpu(ino->xattr_cnt)); + printk(KERN_DEBUG "\txattr_size %u\n", + le32_to_cpu(ino->xattr_size)); + printk(KERN_DEBUG "\txattr_names %u\n", + le32_to_cpu(ino->xattr_names)); + printk(KERN_DEBUG "\tcompr_type %#x\n", + (int)le16_to_cpu(ino->compr_type)); + printk(KERN_DEBUG "\tdata len %u\n", + le32_to_cpu(ino->data_len)); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + const struct ubifs_dent_node *dent = node; + int nlen = le16_to_cpu(dent->nlen); + + key_read(c, &dent->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tinum %llu\n", + (unsigned long long)le64_to_cpu(dent->inum)); + printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); + printk(KERN_DEBUG "\tnlen %d\n", nlen); + printk(KERN_DEBUG "\tname "); + + if (nlen > UBIFS_MAX_NLEN) + printk(KERN_DEBUG "(bad name length, not printing, " + "bad or corrupted node)"); + else { + for (i = 0; i < nlen && dent->name[i]; i++) + printk(KERN_CONT "%c", dent->name[i]); + } + printk(KERN_CONT "\n"); + + break; + } + case UBIFS_DATA_NODE: + { + const struct ubifs_data_node *dn = node; + int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; + + key_read(c, &dn->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tsize %u\n", + le32_to_cpu(dn->size)); + printk(KERN_DEBUG "\tcompr_typ %d\n", + (int)le16_to_cpu(dn->compr_type)); + printk(KERN_DEBUG "\tdata size %d\n", + dlen); + printk(KERN_DEBUG "\tdata:\n"); + print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + (void *)&dn->data, dlen, 0); + break; + } + case UBIFS_TRUN_NODE: + { + const struct ubifs_trun_node *trun = node; + + printk(KERN_DEBUG "\tinum %u\n", + 
le32_to_cpu(trun->inum)); + printk(KERN_DEBUG "\told_size %llu\n", + (unsigned long long)le64_to_cpu(trun->old_size)); + printk(KERN_DEBUG "\tnew_size %llu\n", + (unsigned long long)le64_to_cpu(trun->new_size)); + break; + } + case UBIFS_IDX_NODE: + { + const struct ubifs_idx_node *idx = node; + + n = le16_to_cpu(idx->child_cnt); + printk(KERN_DEBUG "\tchild_cnt %d\n", n); + printk(KERN_DEBUG "\tlevel %d\n", + (int)le16_to_cpu(idx->level)); + printk(KERN_DEBUG "\tBranches:\n"); + + for (i = 0; i < n && i < c->fanout - 1; i++) { + const struct ubifs_branch *br; + + br = ubifs_idx_branch(c, idx, i); + key_read(c, &br->key, &key); + printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), + le32_to_cpu(br->len), DBGKEY(&key)); + } + break; + } + case UBIFS_CS_NODE: + break; + case UBIFS_ORPH_NODE: + { + const struct ubifs_orph_node *orph = node; + + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long) + le64_to_cpu(orph->cmt_no) & LLONG_MAX); + printk(KERN_DEBUG "\tlast node flag %llu\n", + (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); + n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + for (i = 0; i < n; i++) + printk(KERN_DEBUG "\t ino %llu\n", + (unsigned long long)le64_to_cpu(orph->inos[i])); + break; + } + default: + printk(KERN_DEBUG "node type %d was not recognized\n", + (int)ch->node_type); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_budget_req(const struct ubifs_budget_req *req) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + req->new_ino, req->dirtied_ino); + printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + req->new_ino_d, req->dirtied_ino_d); + printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + req->new_page, req->dirtied_page); + printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + req->new_dent, req->mod_dent); + printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); + printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + req->data_growth, req->dd_growth); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " + "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, + lst->total_dirty); + printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " + "total_dead %lld\n", lst->total_used, lst->total_dark, + lst->total_dead); + spin_unlock(&dbg_lock); +} + +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) +{ + int i; + struct rb_node *rb; + struct ubifs_bud *bud; + struct ubifs_gced_idx_leb *idx_gc; + long long available, outstanding, free; + + spin_lock(&c->space_lock); + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " + "total budget sum %lld\n", current->pid, + bi->data_growth + bi->dd_growth, + bi->data_growth + bi->dd_growth + bi->idx_growth); + printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " + "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, + bi->idx_growth); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " + "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, + bi->uncommitted_idx); + printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", + bi->page_budget, bi->inode_budget, 
bi->dent_budget); + printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", + bi->nospace, bi->nospace_rp); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + + if (bi != &c->bi) + /* + * If we are dumping saved budgeting data, do not print + * additional information which is about the current state, not + * the old one which corresponded to the saved budgeting data. + */ + goto out_unlock; + + printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " + "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + atomic_long_read(&c->dirty_zn_cnt), + atomic_long_read(&c->clean_zn_cnt)); + printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", + c->gc_lnum, c->ihead_lnum); + + /* If we are in R/O mode, journal heads do not exist */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) + printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", + dbg_jhead(c->jheads[i].wbuf.jhead), + c->jheads[i].wbuf.lnum); + for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + } + list_for_each_entry(bud, &c->old_buds, list) + printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + list_for_each_entry(idx_gc, &c->idx_gc, list) + printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + idx_gc->lnum, idx_gc->unmap); + printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + + /* Print budgeting predictions */ + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + free = ubifs_get_free_space_nolock(c); + printk(KERN_DEBUG "Budgeting predictions:\n"); + printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", + available, outstanding, free); +out_unlock: + spin_unlock(&dbg_lock); + spin_unlock(&c->space_lock); +} + +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +{ + int i, spc, dark = 0, dead = 0; + struct rb_node *rb; + struct ubifs_bud *bud; + + spc = lp->free + lp->dirty; + if (spc < c->dead_wm) + dead = spc; + else + dark = ubifs_calc_dark(c, spc); + + if (lp->flags & LPROPS_INDEX) + printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + "free + dirty %-8d flags %#x (", lp->lnum, lp->free, + lp->dirty, c->leb_size - spc, spc, lp->flags); + else + printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " + "flags %#-4x (", lp->lnum, lp->free, lp->dirty, + c->leb_size - spc, spc, dark, dead, + (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); + + if (lp->flags & LPROPS_TAKEN) { + if (lp->flags & LPROPS_INDEX) + printk(KERN_CONT "index, taken"); + else + printk(KERN_CONT "taken"); + } else { + const char *s; + + if (lp->flags & LPROPS_INDEX) { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_DIRTY_IDX: + s = "dirty index"; + break; + case LPROPS_FRDI_IDX: + s = "freeable index"; + break; + default: + s = "index"; + } + } else { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_UNCAT: + s = "not categorized"; + break; + case LPROPS_DIRTY: + s = "dirty"; + break; + case LPROPS_FREE: + s = "free"; + break; + case LPROPS_EMPTY: + s = "empty"; + break; + case LPROPS_FREEABLE: + s = "freeable"; + break; + default: + s = NULL; + break; + } + } + printk(KERN_CONT "%s", s); + } + + for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct 
ubifs_bud, rb); + if (bud->lnum == lp->lnum) { + int head = 0; + for (i = 0; i < c->jhead_cnt; i++) { + /* + * Note, if we are in R/O mode or in the middle + * of mounting/re-mounting, the write-buffers do + * not exist. + */ + if (c->jheads && + lp->lnum == c->jheads[i].wbuf.lnum) { + printk(KERN_CONT ", jhead %s", + dbg_jhead(i)); + head = 1; + } + } + if (!head) + printk(KERN_CONT ", bud of jhead %s", + dbg_jhead(bud->jhead)); + } + } + if (lp->lnum == c->gc_lnum) + printk(KERN_CONT ", GC LEB"); + printk(KERN_CONT ")\n"); +} + +void dbg_dump_lprops(struct ubifs_info *c) +{ + int lnum, err; + struct ubifs_lprops lp; + struct ubifs_lp_stats lst; + + printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", + current->pid); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + ubifs_err("cannot read lprops for LEB %d", lnum); + + dbg_dump_lprop(c, &lp); + } + printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", + current->pid); +} + +void dbg_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); + printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); + printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); + printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); + printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", + c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + "cmt %d\n", i + c->lpt_first, c->ltab[i].free, + c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + +void dbg_dump_leb(const struct ubifs_info *c, int lnum) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + void *buf; + + if (dbg_failure_mode) + return; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + return; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ubifs_err("scan error %d", (int)PTR_ERR(sleb)); + goto out; + } + + printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + sleb->nodes_cnt, sleb->endpt); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node 
at LEB %d:%d len %d\n", lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); + ubifs_scan_destroy(sleb); + +out: + vfree(buf); + return; +} + +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) +{ + int n; + const struct ubifs_zbranch *zbr; + + spin_lock(&dbg_lock); + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + + printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" + " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, + zbr->len, znode->parent, znode->iip, znode->level, + znode->child_cnt, znode->flags); + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + spin_unlock(&dbg_lock); + return; + } + + printk(KERN_DEBUG "zbranches:\n"); + for (n = 0; n < znode->child_cnt; n++) { + zbr = &znode->zbranch[n]; + if (znode->level > 0) + printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + else + printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + + printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " + "flags %d\n", i, lprops->lnum, lprops->hpos, + lprops->free, lprops->dirty, lprops->flags); + } + printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); +} + +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); + printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); + printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + pnode->flags, iip, pnode->level, pnode->num); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp = &pnode->lprops[i]; + + printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + i, lp->free, lp->dirty, lp->flags, lp->lnum); + } +} + +void dbg_dump_tnc(struct ubifs_info *c) +{ + struct ubifs_znode *znode; + int level; + + printk(KERN_DEBUG "\n"); + printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + } + dbg_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + } + printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); +} + +static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, + void *priv) +{ + dbg_dump_znode(c, znode); + return 0; +} + +/** + * dbg_dump_index - dump the on-flash index. + * @c: UBIFS file-system description object + * + * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * which dumps only in-memory znodes and does not read znodes which from flash. 
+ */ +void dbg_dump_index(struct ubifs_info *c) +{ + dbg_walk_index(c, NULL, dump_znode, NULL); +} + +/** + * dbg_save_space_info - save information about flash space. + * @c: UBIFS file-system description object + * + * This function saves information about UBIFS free space, dirty space, etc, in + * order to check it later. + */ +void dbg_save_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + int freeable_cnt; + + spin_lock(&c->space_lock); + memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); + d->saved_idx_gc_cnt = c->idx_gc_cnt; + + /* + * We use a dirty hack here and zero out @c->freeable_cnt, because it + * affects the free space calculations, and UBIFS might not know about + * all freeable eraseblocks. Indeed, we know about freeable eraseblocks + * only when we read their lprops, and we do this only lazily, upon the + * need. So at any given point of time @c->freeable_cnt might be not + * exactly accurate. + * + * Just one example about the issue we hit when we did not zero + * @c->freeable_cnt. + * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the + * amount of free space in @d->saved_free + * 2. We re-mount R/W, which makes UBIFS to read the "lsave" + * information from flash, where we cache LEBs from various + * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()' + * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()' + * -> 'ubifs_get_pnode()' -> 'update_cats()' + * -> 'ubifs_add_to_cat()'). + * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt + * becomes %1. + * 4. We calculate the amount of free space when the re-mount is + * finished in 'dbg_check_space_info()' and it does not match + * @d->saved_free. + */ + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + d->saved_free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); +} + +/** + * dbg_check_space_info - check flash space information. + * @c: UBIFS file-system description object + * + * This function compares current flash space information with the information + * which was saved when the 'dbg_save_space_info()' function was called. + * Returns zero if the information has not changed, and %-EINVAL it it has + * changed. + */ +int dbg_check_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + struct ubifs_lp_stats lst; + long long free; + int freeable_cnt; + + spin_lock(&c->space_lock); + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); + + if (free != d->saved_free) { + ubifs_err("free space changed from %lld to %lld", + d->saved_free, free); + goto out; + } + + return 0; + +out: + ubifs_msg("saved lprops statistics dump"); + dbg_dump_lstats(&d->saved_lst); + ubifs_msg("saved budgeting info dump"); + dbg_dump_budg(c, &d->saved_bi); + ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); + ubifs_msg("current lprops statistics dump"); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + ubifs_msg("current budgeting info dump"); + dbg_dump_budg(c, &c->bi); + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_synced_i_size - check synchronized inode size. + * @inode: inode to check + * + * If inode is clean, synchronized inode size has to be equivalent to current + * inode size. This function has to be called only for locked inodes (@i_mutex + * has to be locked). 
Returns %0 if synchronized inode size is correct, and
+ * %-EINVAL if not.
+ */
+int dbg_check_synced_i_size(struct inode *inode)
+{
+	int err = 0;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+
+	mutex_lock(&ui->ui_mutex);
+	spin_lock(&ui->ui_lock);
+	if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
+		ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
+			  "is clean", ui->ui_size, ui->synced_i_size);
+		ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+			  inode->i_mode, i_size_read(inode));
+		dbg_dump_stack();
+		err = -EINVAL;
+	}
+	spin_unlock(&ui->ui_lock);
+	mutex_unlock(&ui->ui_mutex);
+	return err;
+}
+
+/**
+ * dbg_check_dir_size - check directory inode size and link count.
+ * @c: UBIFS file-system description object
+ * @dir: the directory to calculate size for
+ *
+ * This function makes sure that directory size and link count are correct.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, it is a good idea to make sure the @dir->i_mutex is locked before
+ * calling this function.
+ */
+int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
+{
+	unsigned int nlink = 2;
+	union ubifs_key key;
+	struct ubifs_dent_node *dent, *pdent = NULL;
+	struct qstr nm = { .name = NULL };
+	loff_t size = UBIFS_INO_NODE_SZ;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	if (!S_ISDIR(dir->i_mode))
+		return 0;
+
+	lowest_dent_key(c, &key, dir->i_ino);
+	while (1) {
+		int err;
+
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			if (err == -ENOENT)
+				break;
+			return err;
+		}
+
+		nm.name = dent->name;
+		nm.len = le16_to_cpu(dent->nlen);
+		size += CALC_DENT_SIZE(nm.len);
+		if (dent->type == UBIFS_ITYPE_DIR)
+			nlink += 1;
+		kfree(pdent);
+		pdent = dent;
+		key_read(c, &dent->key, &key);
+	}
+	kfree(pdent);
+
+	if (i_size_read(dir) != size) {
+		ubifs_err("directory inode %lu has size %llu, "
+			  "but calculated size is %llu", dir->i_ino,
+			  (unsigned long long)i_size_read(dir),
+			  (unsigned long long)size);
+		dump_stack();
+		return -EINVAL;
+	}
+	if (dir->i_nlink != nlink) {
+		ubifs_err("directory inode %lu has nlink %u, but calculated "
+			  "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
+		dump_stack();
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_check_key_order - make sure that colliding keys are properly ordered.
+ * @c: UBIFS file-system description object
+ * @zbr1: first zbranch
+ * @zbr2: following zbranch
+ *
+ * In UBIFS indexing B-tree colliding keys have to be sorted in binary order of
+ * names of the direntries/xentries which are referred by the keys. This
+ * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
+ * sure the name of direntry/xentry referred by @zbr1 is less than
+ * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
+ * and a negative error code in case of failure.
+ */
+static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
+			       struct ubifs_zbranch *zbr2)
+{
+	int err, nlen1, nlen2, cmp;
+	struct ubifs_dent_node *dent1, *dent2;
+	union ubifs_key key;
+
+	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
+	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent1)
+		return -ENOMEM;
+	dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent2) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	err = ubifs_tnc_read_node(c, zbr1, dent1);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent1);
+	if (err)
+		goto out_free;
+
+	err = ubifs_tnc_read_node(c, zbr2, dent2);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent2);
+	if (err)
+		goto out_free;
+
+	/* Make sure node keys are the same as in zbranch */
+	err = 1;
+	key_read(c, &dent1->key, &key);
+	if (keys_cmp(c, &zbr1->key, &key)) {
+		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
+			zbr1->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr1->key));
+		dbg_dump_node(c, dent1);
+		goto out_free;
+	}
+
+	key_read(c, &dent2->key, &key);
+	if (keys_cmp(c, &zbr2->key, &key)) {
+		dbg_err("2nd entry at %d:%d has key %s", zbr2->lnum,
+			zbr2->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr2->key));
+		dbg_dump_node(c, dent2);
+		goto out_free;
+	}
+
+	nlen1 = le16_to_cpu(dent1->nlen);
+	nlen2 = le16_to_cpu(dent2->nlen);
+
+	cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
+	if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
+		err = 0;
+		goto out_free;
+	}
+	if (cmp == 0 && nlen1 == nlen2)
+		dbg_err("2 xent/dent nodes with the same name");
+	else
+		dbg_err("bad order of colliding key %s",
+			DBGKEY(&key));
+
+	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+	dbg_dump_node(c, dent1);
+	ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+	dbg_dump_node(c, dent2);
+
+out_free:
+	kfree(dent2);
+	kfree(dent1);
+	return err;
+}
+
+/**
+ * dbg_check_znode - check if znode is all right.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch which points to this znode
+ *
+ * This function makes sure that znode referred to by @zbr is all right.
+ * Returns zero if it is, and %-EINVAL if it is not.
+ */
+static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
+{
+	struct ubifs_znode *znode = zbr->znode;
+	struct ubifs_znode *zp = znode->parent;
+	int n, err, cmp;
+
+	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+		err = 1;
+		goto out;
+	}
+	if (znode->level < 0) {
+		err = 2;
+		goto out;
+	}
+	if (znode->iip < 0 || znode->iip >= c->fanout) {
+		err = 3;
+		goto out;
+	}
+
+	if (zbr->len == 0)
+		/* Only dirty zbranch may have no on-flash nodes */
+		if (!ubifs_zn_dirty(znode)) {
+			err = 4;
+			goto out;
+		}
+
+	if (ubifs_zn_dirty(znode)) {
+		/*
+		 * If znode is dirty, its parent has to be dirty as well. The
+		 * order of the operation is important, so we have to have
+		 * memory barriers.
+		 */
+		smp_mb();
+		if (zp && !ubifs_zn_dirty(zp)) {
+			/*
+			 * The dirty flag is atomic and is cleared outside the
+			 * TNC mutex, so znode's dirty flag may now have
+			 * been cleared. The child is always cleared before the
+			 * parent, so we just need to check again.
+ */ + smp_mb(); + if (ubifs_zn_dirty(znode)) { + err = 5; + goto out; + } + } + } + + if (zp) { + const union ubifs_key *min, *max; + + if (znode->level != zp->level - 1) { + err = 6; + goto out; + } + + /* Make sure the 'parent' pointer in our znode is correct */ + err = ubifs_search_zbranch(c, zp, &zbr->key, &n); + if (!err) { + /* This zbranch does not exist in the parent */ + err = 7; + goto out; + } + + if (znode->iip >= zp->child_cnt) { + err = 8; + goto out; + } + + if (znode->iip != n) { + /* This may happen only in case of collisions */ + if (keys_cmp(c, &zp->zbranch[n].key, + &zp->zbranch[znode->iip].key)) { + err = 9; + goto out; + } + n = znode->iip; + } + + /* + * Make sure that the first key in our znode is greater than or + * equal to the key in the pointing zbranch. + */ + min = &zbr->key; + cmp = keys_cmp(c, min, &znode->zbranch[0].key); + if (cmp == 1) { + err = 10; + goto out; + } + + if (n + 1 < zp->child_cnt) { + max = &zp->zbranch[n + 1].key; + + /* + * Make sure the last key in our znode is less or + * equivalent than the key in the zbranch which goes + * after our pointing zbranch. + */ + cmp = keys_cmp(c, max, + &znode->zbranch[znode->child_cnt - 1].key); + if (cmp == -1) { + err = 11; + goto out; + } + } + } else { + /* This may only be root znode */ + if (zbr != &c->zroot) { + err = 12; + goto out; + } + } + + /* + * Make sure that next key is greater or equivalent then the previous + * one. + */ + for (n = 1; n < znode->child_cnt; n++) { + cmp = keys_cmp(c, &znode->zbranch[n - 1].key, + &znode->zbranch[n].key); + if (cmp > 0) { + err = 13; + goto out; + } + if (cmp == 0) { + /* This can only be keys with colliding hash */ + if (!is_hash_key(c, &znode->zbranch[n].key)) { + err = 14; + goto out; + } + + if (znode->level != 0 || c->replaying) + continue; + + /* + * Colliding keys should follow binary order of + * corresponding xentry/dentry names. + */ + err = dbg_check_key_order(c, &znode->zbranch[n - 1], + &znode->zbranch[n]); + if (err < 0) + return err; + if (err) { + err = 15; + goto out; + } + } + } + + for (n = 0; n < znode->child_cnt; n++) { + if (!znode->zbranch[n].znode && + (znode->zbranch[n].lnum == 0 || + znode->zbranch[n].len == 0)) { + err = 16; + goto out; + } + + if (znode->zbranch[n].lnum != 0 && + znode->zbranch[n].len == 0) { + err = 17; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].len != 0) { + err = 18; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].offs != 0) { + err = 19; + goto out; + } + + if (znode->level != 0 && znode->zbranch[n].znode) + if (znode->zbranch[n].znode->parent != znode) { + err = 20; + goto out; + } + } + + return 0; + +out: + ubifs_err("failed, error %d", err); + ubifs_msg("dump of the znode"); + dbg_dump_znode(c, znode); + if (zp) { + ubifs_msg("dump of the parent znode"); + dbg_dump_znode(c, zp); + } + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_tnc - check TNC tree. + * @c: UBIFS file-system description object + * @extra: do extra checks that are possible at start commit + * + * This function traverses whole TNC tree and checks every znode. Returns zero + * if everything is all right and %-EINVAL if something is wrong with TNC. 
+ */ +int dbg_check_tnc(struct ubifs_info *c, int extra) +{ + struct ubifs_znode *znode; + long clean_cnt = 0, dirty_cnt = 0; + int err, last; + + if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + return 0; + + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + if (!c->zroot.znode) + return 0; + + znode = ubifs_tnc_postorder_first(c->zroot.znode); + while (1) { + struct ubifs_znode *prev; + struct ubifs_zbranch *zbr; + + if (!znode->parent) + zbr = &c->zroot; + else + zbr = &znode->parent->zbranch[znode->iip]; + + err = dbg_check_znode(c, zbr); + if (err) + return err; + + if (extra) { + if (ubifs_zn_dirty(znode)) + dirty_cnt += 1; + else + clean_cnt += 1; + } + + prev = znode; + znode = ubifs_tnc_postorder_next(znode); + if (!znode) + break; + + /* + * If the last key of this znode is equivalent to the first key + * of the next znode (collision), then check order of the keys. + */ + last = prev->child_cnt - 1; + if (prev->level == 0 && znode->level == 0 && !c->replaying && + !keys_cmp(c, &prev->zbranch[last].key, + &znode->zbranch[0].key)) { + err = dbg_check_key_order(c, &prev->zbranch[last], + &znode->zbranch[0]); + if (err < 0) + return err; + if (err) { + ubifs_msg("first znode"); + dbg_dump_znode(c, prev); + ubifs_msg("second znode"); + dbg_dump_znode(c, znode); + return -EINVAL; + } + } + } + + if (extra) { + if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) { + ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->clean_zn_cnt), + clean_cnt); + return -EINVAL; + } + if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) { + ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->dirty_zn_cnt), + dirty_cnt); + return -EINVAL; + } + } + + return 0; +} + +/** + * dbg_walk_index - walk the on-flash index. + * @c: UBIFS file-system description object + * @leaf_cb: called for each leaf node + * @znode_cb: called for each indexing node + * @priv: private data which is passed to callbacks + * + * This function walks the UBIFS index and calls the @leaf_cb for each leaf + * node and @znode_cb for each indexing node. Returns zero in case of success + * and a negative error code in case of failure. + * + * It would be better if this function removed every znode it pulled to into + * the TNC, so that the behavior more closely matched the non-debugging + * behavior. + */ +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv) +{ + int err; + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *child; + + mutex_lock(&c->tnc_mutex); + /* If the root indexing node is not in TNC - pull it */ + if (!c->zroot.znode) { + c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(c->zroot.znode)) { + err = PTR_ERR(c->zroot.znode); + c->zroot.znode = NULL; + goto out_unlock; + } + } + + /* + * We are going to traverse the indexing tree in the postorder manner. + * Go down and find the leftmost indexing node where we are going to + * start from. 
+ */ + znode = c->zroot.znode; + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + + znode = child; + } + + /* Iterate over all indexing nodes */ + while (1) { + int idx; + + cond_resched(); + + if (znode_cb) { + err = znode_cb(c, znode, priv); + if (err) { + ubifs_err("znode checking function returned " + "error %d", err); + dbg_dump_znode(c, znode); + goto out_dump; + } + } + if (leaf_cb && znode->level == 0) { + for (idx = 0; idx < znode->child_cnt; idx++) { + zbr = &znode->zbranch[idx]; + err = leaf_cb(c, zbr, priv); + if (err) { + ubifs_err("leaf checking function " + "returned error %d, for leaf " + "at LEB %d:%d", + err, zbr->lnum, zbr->offs); + goto out_dump; + } + } + } + + if (!znode->parent) + break; + + idx = znode->iip + 1; + znode = znode->parent; + if (idx < znode->child_cnt) { + /* Switch to the next index in the parent */ + zbr = &znode->zbranch[idx]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, idx); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } else + /* + * This is the last child, switch to the parent and + * continue. + */ + continue; + + /* Go to the lowest leftmost znode in the new sub-tree */ + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } + } + + mutex_unlock(&c->tnc_mutex); + return 0; + +out_dump: + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_znode(c, znode); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * add_size - add znode size to partially calculated index size. + * @c: UBIFS file-system description object + * @znode: znode to add size for + * @priv: partially calculated index size + * + * This is a helper function for 'dbg_check_idx_size()' which is called for + * every indexing node and adds its size to the 'long long' variable pointed to + * by @priv. + */ +static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) +{ + long long *idx_size = priv; + int add; + + add = ubifs_idx_node_sz(c, znode->child_cnt); + add = ALIGN(add, 8); + *idx_size += add; + return 0; +} + +/** + * dbg_check_idx_size - check index size. + * @c: UBIFS file-system description object + * @idx_size: size to check + * + * This function walks the UBIFS index, calculates its size and checks that the + * size is equivalent to @idx_size. Returns zero in case of success and a + * negative error code in case of failure. + */ +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) +{ + int err; + long long calc = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + return 0; + + err = dbg_walk_index(c, NULL, add_size, &calc); + if (err) { + ubifs_err("error %d while walking the index", err); + return err; + } + + if (calc != idx_size) { + ubifs_err("index size check failed: calculated size is %lld, " + "should be %lld", calc, idx_size); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * struct fsck_inode - information about an inode used when checking the file-system. 
+ * @rb: link in the RB-tree of inodes
+ * @inum: inode number
+ * @mode: inode type, permissions, etc
+ * @nlink: inode link count
+ * @xattr_cnt: count of extended attributes
+ * @references: how many directory/xattr entries refer this inode (calculated
+ *              while walking the index)
+ * @calc_cnt: for directory inode count of child directories
+ * @size: inode size (read from on-flash inode)
+ * @xattr_sz: summary size of all extended attributes (read from on-flash
+ *            inode)
+ * @calc_sz: for directories calculated directory size
+ * @calc_xcnt: calculated count of extended attributes
+ * @calc_xsz: calculated summary size of all extended attributes
+ * @xattr_nms: sum of lengths of all extended attribute names belonging to this
+ *             inode (read from on-flash inode)
+ * @calc_xnms: calculated sum of lengths of all extended attribute names
+ */
+struct fsck_inode {
+	struct rb_node rb;
+	ino_t inum;
+	umode_t mode;
+	unsigned int nlink;
+	unsigned int xattr_cnt;
+	int references;
+	int calc_cnt;
+	long long size;
+	unsigned int xattr_sz;
+	long long calc_sz;
+	long long calc_xcnt;
+	long long calc_xsz;
+	unsigned int xattr_nms;
+	long long calc_xnms;
+};
+
+/**
+ * struct fsck_data - private FS checking information.
+ * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects)
+ */
+struct fsck_data {
+	struct rb_root inodes;
+};
+
+/**
+ * add_inode - add inode information to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @ino: raw UBIFS inode to add
+ *
+ * This is a helper function for 'check_leaf()' which adds information about
+ * inode @ino to the RB-tree of inodes. Returns inode information pointer in
+ * case of success and a negative error code in case of failure.
+ */
+static struct fsck_inode *add_inode(struct ubifs_info *c,
+				    struct fsck_data *fsckd,
+				    struct ubifs_ino_node *ino)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fsck_inode *fscki;
+	ino_t inum = key_inum_flash(c, &ino->key);
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	p = &fsckd->inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		fscki = rb_entry(parent, struct fsck_inode, rb);
+		if (inum < fscki->inum)
+			p = &(*p)->rb_left;
+		else if (inum > fscki->inum)
+			p = &(*p)->rb_right;
+		else
+			return fscki;
+	}
+
+	if (inum > c->highest_inum) {
+		ubifs_err("too high inode number, max. is %lu",
+			  (unsigned long)c->highest_inum);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
+	if (!fscki)
+		return ERR_PTR(-ENOMEM);
+
+	inode = ilookup(c->vfs_sb, inum);
+
+	fscki->inum = inum;
+	/*
+	 * If the inode is present in the VFS inode cache, use it instead of
+	 * the on-flash inode which might be out-of-date. E.g., the size might
+	 * be out-of-date. If we do not do this, the following may happen, for
+	 * example:
+	 * 1. A power cut happens
+	 * 2. We mount the file-system R/O, the replay process fixes up the
+	 *    inode size in the VFS cache, but not on-flash.
+	 * 3. 'check_leaf()' fails because it hits a data node beyond inode
+	 *    size.
+ */ + if (!inode) { + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + } else { + ui = ubifs_inode(inode); + fscki->nlink = inode->i_nlink; + fscki->size = inode->i_size; + fscki->xattr_cnt = ui->xattr_cnt; + fscki->xattr_sz = ui->xattr_size; + fscki->xattr_nms = ui->xattr_names; + fscki->mode = inode->i_mode; + iput(inode); + } + + if (S_ISDIR(fscki->mode)) { + fscki->calc_sz = UBIFS_INO_NODE_SZ; + fscki->calc_cnt = 2; + } + + rb_link_node(&fscki->rb, parent, p); + rb_insert_color(&fscki->rb, &fsckd->inodes); + + return fscki; +} + +/** + * search_inode - search inode in the RB-tree of inodes. + * @fsckd: FS checking information + * @inum: inode number to search + * + * This is a helper function for 'check_leaf()' which searches inode @inum in + * the RB-tree of inodes and returns an inode information pointer or %NULL if + * the inode was not found. + */ +static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum) +{ + struct rb_node *p; + struct fsck_inode *fscki; + + p = fsckd->inodes.rb_node; + while (p) { + fscki = rb_entry(p, struct fsck_inode, rb); + if (inum < fscki->inum) + p = p->rb_left; + else if (inum > fscki->inum) + p = p->rb_right; + else + return fscki; + } + return NULL; +} + +/** + * read_add_inode - read inode node and add it to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @inum: inode number to read + * + * This is a helper function for 'check_leaf()' which finds inode node @inum in + * the index, reads it, and adds it to the RB-tree of inodes. Returns inode + * information pointer in case of success and a negative error code in case of + * failure. + */ +static struct fsck_inode *read_add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, ino_t inum) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + + fscki = search_inode(fsckd, inum); + if (fscki) + return fscki; + + ino_key_init(c, &key, inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", (unsigned long)inum); + return ERR_PTR(-ENOENT); + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)inum); + return ERR_PTR(err); + } + + zbr = &znode->zbranch[n]; + if (zbr->len < UBIFS_INO_NODE_SZ) { + ubifs_err("bad node %lu node length %d", + (unsigned long)inum, zbr->len); + return ERR_PTR(-EINVAL); + } + + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return ERR_PTR(-ENOMEM); + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return ERR_PTR(err); + } + + fscki = add_inode(c, fsckd, ino); + kfree(ino); + if (IS_ERR(fscki)) { + ubifs_err("error %ld while adding inode %lu node", + PTR_ERR(fscki), (unsigned long)inum); + return fscki; + } + + return fscki; +} + +/** + * check_leaf - check leaf node. + * @c: UBIFS file-system description object + * @zbr: zbranch of the leaf node to check + * @priv: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which is called for + * every single leaf node while walking the indexing tree. 
It checks that the + * leaf node referred from the indexing tree exists, has correct CRC, and does + * some other basic validation. This function is also responsible for building + * an RB-tree of inodes - it adds all inodes into the RB-tree. It also + * calculates reference count, size, etc for each inode in order to later + * compare them to the information stored inside the inodes and detect possible + * inconsistencies. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + ino_t inum; + void *node; + struct ubifs_ch *ch; + int err, type = key_type(c, &zbr->key); + struct fsck_inode *fscki; + + if (zbr->len < UBIFS_CH_SZ) { + ubifs_err("bad leaf length %d (LEB %d:%d)", + zbr->len, zbr->lnum, zbr->offs); + return -EINVAL; + } + + node = kmalloc(zbr->len, GFP_NOFS); + if (!node) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) { + ubifs_err("cannot read leaf node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + goto out_free; + } + + /* If this is an inode node, add it to RB-tree of inodes */ + if (type == UBIFS_INO_KEY) { + fscki = add_inode(c, priv, node); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while adding inode node", err); + goto out_dump; + } + goto out; + } + + if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && + type != UBIFS_DATA_KEY) { + ubifs_err("unexpected node type %d at LEB %d:%d", + type, zbr->lnum, zbr->offs); + err = -EINVAL; + goto out_free; + } + + ch = node; + if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { + ubifs_err("too high sequence number, max. is %llu", + c->max_sqnum); + err = -EINVAL; + goto out_dump; + } + + if (type == UBIFS_DATA_KEY) { + long long blk_offs; + struct ubifs_data_node *dn = node; + + /* + * Search the inode node this data node belongs to and insert + * it to the RB-tree of inodes. + */ + inum = key_inum_flash(c, &dn->key); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing data node and " + "trying to find inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + /* Make sure the data node is within inode size */ + blk_offs = key_block_flash(c, &dn->key); + blk_offs <<= UBIFS_BLOCK_SHIFT; + blk_offs += le32_to_cpu(dn->size); + if (blk_offs > fscki->size) { + ubifs_err("data node at LEB %d:%d is not within inode " + "size %lld", zbr->lnum, zbr->offs, + fscki->size); + err = -EINVAL; + goto out_dump; + } + } else { + int nlen; + struct ubifs_dent_node *dent = node; + struct fsck_inode *fscki1; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + /* + * Search the inode node this entry refers to and the parent + * inode node and insert them to the RB-tree of inodes. 
+ */ + inum = le64_to_cpu(dent->inum); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing entry node and " + "trying to find inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + /* Count how many direntries or xentries refers this inode */ + fscki->references += 1; + + inum = key_inum_flash(c, &dent->key); + fscki1 = read_add_inode(c, priv, inum); + if (IS_ERR(fscki1)) { + err = PTR_ERR(fscki1); + ubifs_err("error %d while processing entry node and " + "trying to find parent inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + nlen = le16_to_cpu(dent->nlen); + if (type == UBIFS_XENT_KEY) { + fscki1->calc_xcnt += 1; + fscki1->calc_xsz += CALC_DENT_SIZE(nlen); + fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size); + fscki1->calc_xnms += nlen; + } else { + fscki1->calc_sz += CALC_DENT_SIZE(nlen); + if (dent->type == UBIFS_ITYPE_DIR) + fscki1->calc_cnt += 1; + } + } + +out: + kfree(node); + return 0; + +out_dump: + ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, node); +out_free: + kfree(node); + return err; +} + +/** + * free_inodes - free RB-tree of inodes. + * @fsckd: FS checking information + */ +static void free_inodes(struct fsck_data *fsckd) +{ + struct rb_node *this = fsckd->inodes.rb_node; + struct fsck_inode *fscki; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &fscki->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(fscki); + } + } +} + +/** + * check_inodes - checks all inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which walks the + * RB-tree of inodes after the index scan has been finished, and checks that + * inode nlink, size, etc are correct. Returns zero if inodes are fine, + * %-EINVAL if not, and a negative error code in case of failure. + */ +static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + struct rb_node *this = rb_first(&fsckd->inodes); + + while (this) { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_next(this); + + if (S_ISDIR(fscki->mode)) { + /* + * Directories have to have exactly one reference (they + * cannot have hardlinks), although root inode is an + * exception. 
+ */ + if (fscki->inum != UBIFS_ROOT_INO && + fscki->references != 1) { + ubifs_err("directory inode %lu has %d " + "direntries which refer it, but " + "should be 1", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->inum == UBIFS_ROOT_INO && + fscki->references != 0) { + ubifs_err("root inode %lu has non-zero (%d) " + "direntries which refer it", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->calc_sz != fscki->size) { + ubifs_err("directory inode %lu size is %lld, " + "but calculated size is %lld", + (unsigned long)fscki->inum, + fscki->size, fscki->calc_sz); + goto out_dump; + } + if (fscki->calc_cnt != fscki->nlink) { + ubifs_err("directory inode %lu nlink is %d, " + "but calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->calc_cnt); + goto out_dump; + } + } else { + if (fscki->references != fscki->nlink) { + ubifs_err("inode %lu nlink is %d, but " + "calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->references); + goto out_dump; + } + } + if (fscki->xattr_sz != fscki->calc_xsz) { + ubifs_err("inode %lu has xattr size %u, but " + "calculated size is %lld", + (unsigned long)fscki->inum, fscki->xattr_sz, + fscki->calc_xsz); + goto out_dump; + } + if (fscki->xattr_cnt != fscki->calc_xcnt) { + ubifs_err("inode %lu has %u xattrs, but " + "calculated count is %lld", + (unsigned long)fscki->inum, + fscki->xattr_cnt, fscki->calc_xcnt); + goto out_dump; + } + if (fscki->xattr_nms != fscki->calc_xnms) { + ubifs_err("inode %lu has xattr names' size %u, but " + "calculated names' size is %lld", + (unsigned long)fscki->inum, fscki->xattr_nms, + fscki->calc_xnms); + goto out_dump; + } + } + + return 0; + +out_dump: + /* Read the bad inode and dump it */ + ino_key_init(c, &key, fscki->inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", + (unsigned long)fscki->inum); + return -ENOENT; + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)fscki->inum); + return err; + } + + zbr = &znode->zbranch[n]; + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return err; + } + + ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", + (unsigned long)fscki->inum, zbr->lnum, zbr->offs); + dbg_dump_node(c, ino); + kfree(ino); + return -EINVAL; +} + +/** + * dbg_check_filesystem - check the file-system. + * @c: UBIFS file-system description object + * + * This function checks the file system, namely: + * o makes sure that all leaf nodes exist and their CRCs are correct; + * o makes sure inode nlink, size, xattr size/count are correct (for all + * inodes). + * + * The function reads whole indexing tree and all nodes, so it is pretty + * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if + * not, and a negative error code in case of failure. 
+ */
+int dbg_check_filesystem(struct ubifs_info *c)
+{
+	int err;
+	struct fsck_data fsckd;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_FS))
+		return 0;
+
+	fsckd.inodes = RB_ROOT;
+	err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
+	if (err)
+		goto out_free;
+
+	err = check_inodes(c, &fsckd);
+	if (err)
+		goto out_free;
+
+	free_inodes(&fsckd);
+	return 0;
+
+out_free:
+	ubifs_err("file-system check failed with error %d", err);
+	dump_stack();
+	free_inodes(&fsckd);
+	return err;
+}
+
+/**
+ * dbg_check_data_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t blka, blkb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		blka = key_block(c, &sa->key);
+		blkb = key_block(c, &sb->key);
+
+		if (blka > blkb) {
+			ubifs_err("larger block %u goes before %u", blka, blkb);
+			goto error_dump;
+		}
+		if (blka == blkb) {
+			ubifs_err("two data nodes for the same block");
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	dbg_dump_node(c, sa->node);
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_nondata_nodes_order - check that list of non-data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of non-data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t hasha, hashb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
+		    sb->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			ubifs_err("non-inode node goes before inode node");
+			goto error_dump;
+		}
+
+		if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
+			continue;
+
+		if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			/* Inode nodes are sorted in descending size order */
+			if (sa->len < sb->len) {
+				ubifs_err("smaller inode node goes first");
+				goto error_dump;
+			}
+			continue;
+		}
+
+		/*
+		 * This is either a dentry or xentry, which should be sorted in
+		 * ascending (parent ino, hash) order.
+		 */
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		hasha = key_block(c, &sa->key);
+		hashb = key_block(c, &sb->key);
+
+		if (hasha > hashb) {
+			ubifs_err("larger hash %u goes before %u",
+				  hasha, hashb);
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	ubifs_msg("dumping first node");
+	dbg_dump_node(c, sa->node);
+	ubifs_msg("dumping second node");
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+int dbg_force_in_the_gaps(void)
+{
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	return !(random32() & 7);
+}
+
+/* Failure mode for recovery testing */
+
+#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
+
+struct failure_mode_info {
+	struct list_head list;
+	struct ubifs_info *c;
+};
+
+static LIST_HEAD(fmi_list);
+static DEFINE_SPINLOCK(fmi_lock);
+
+static unsigned int next;
+
+static int simple_rand(void)
+{
+	if (next == 0)
+		next = current->pid;
+	next = next * 1103515245 + 12345;
+	return (next >> 16) & 32767;
+}
+
+static void failure_mode_init(struct ubifs_info *c)
+{
+	struct failure_mode_info *fmi;
+
+	fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
+	if (!fmi) {
+		ubifs_err("Failed to register failure mode - no memory");
+		return;
+	}
+	fmi->c = c;
+	spin_lock(&fmi_lock);
+	list_add_tail(&fmi->list, &fmi_list);
+	spin_unlock(&fmi_lock);
+}
+
+static void failure_mode_exit(struct ubifs_info *c)
+{
+	struct failure_mode_info *fmi, *tmp;
+
+	spin_lock(&fmi_lock);
+	list_for_each_entry_safe(fmi, tmp, &fmi_list, list)
+		if (fmi->c == c) {
+			list_del(&fmi->list);
+			kfree(fmi);
+		}
+	spin_unlock(&fmi_lock);
+}
+
+static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
+{
+	struct failure_mode_info *fmi;
+
+	spin_lock(&fmi_lock);
+	list_for_each_entry(fmi, &fmi_list, list)
+		if (fmi->c->ubi == desc) {
+			struct ubifs_info *c = fmi->c;
+
+			spin_unlock(&fmi_lock);
+			return c;
+		}
+	spin_unlock(&fmi_lock);
+	return NULL;
+}
+
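+/*
+ * The helpers below emulate media failures pseudo-randomly for recovery
+ * testing and are only active when 'dbg_failure_mode' is set. 'simple_rand()'
+ * above is a small linear congruential generator seeded with the current pid
+ * and returning values in the [0, 32767] range, so 'chance(n, d)' evaluates
+ * to true with a probability of roughly n/d. 'do_fail()' uses it to decide
+ * when to start failing: on the first call it may pick a time-based or a
+ * call-count-based delay, and after that delay each I/O is failed with a
+ * per-LEB-type probability. Once a failure has been injected, the
+ * file-system stays in failure mode and further I/O returns %-EROFS.
+ */
+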
+static int in_failure_mode(struct ubi_volume_desc *desc) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (c && dbg_failure_mode) + return c->dbg->failure_mode; + return 0; +} + +static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) +{ + struct ubifs_info *c = dbg_find_info(desc); + struct ubifs_debug_info *d; + + if (!c || !dbg_failure_mode) + return 0; + d = c->dbg; + if (d->failure_mode) + return 1; + if (!d->fail_cnt) { + /* First call - decide delay to failure */ + if (chance(1, 2)) { + unsigned int delay = 1 << (simple_rand() >> 11); + + if (chance(1, 2)) { + d->fail_delay = 1; + d->fail_timeout = jiffies + + msecs_to_jiffies(delay); + dbg_rcvry("failing after %ums", delay); + } else { + d->fail_delay = 2; + d->fail_cnt_max = delay; + dbg_rcvry("failing after %u calls", delay); + } + } + d->fail_cnt += 1; + } + /* Determine if failure delay has expired */ + if (d->fail_delay == 1) { + if (time_before(jiffies, d->fail_timeout)) + return 0; + } else if (d->fail_delay == 2) + if (d->fail_cnt++ < d->fail_cnt_max) + return 0; + if (lnum == UBIFS_SB_LNUM) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in super block LEB %d", lnum); + } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in master LEB %d", lnum); + } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { + if (write) { + if (chance(99, 100)) + return 0; + } else if (chance(399, 400)) + return 0; + dbg_rcvry("failing in log LEB %d", lnum); + } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { + if (write) { + if (chance(7, 8)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in LPT LEB %d", lnum); + } else if (lnum >= c->orph_first && lnum <= c->orph_last) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(9, 10)) + return 0; + dbg_rcvry("failing in orphan LEB %d", lnum); + } else if (lnum == c->ihead_lnum) { + if (chance(99, 100)) + return 0; + dbg_rcvry("failing in index head LEB %d", lnum); + } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { + if (chance(9, 10)) + return 0; + dbg_rcvry("failing in GC head LEB %d", lnum); + } else if (write && !RB_EMPTY_ROOT(&c->buds) && + !ubifs_search_bud(c, lnum)) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in non-bud LEB %d", lnum); + } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || + c->cmt_state == COMMIT_RUNNING_REQUIRED) { + if (chance(999, 1000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit running", lnum); + } else { + if (chance(9999, 10000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit not running", lnum); + } + ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); + d->failure_mode = 1; + dump_stack(); + return 1; +} + +static void cut_data(const void *buf, int len) +{ + int flen, i; + unsigned char *p = (void *)buf; + + flen = (len * (long long)simple_rand()) >> 15; + for (i = flen; i < len; i++) + p[i] = 0xff; +} + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check) +{ + if (in_failure_mode(desc)) + return -EROFS; + return ubi_leb_read(desc, lnum, buf, offset, len, check); +} + +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype) +{ + int err, failing; + + if (in_failure_mode(desc)) + return -EROFS; + failing = do_fail(desc, lnum, 1); + if (failing) + cut_data(buf, len); + err = ubi_leb_write(desc, lnum, buf, 
offset, len, dtype); + if (err) + return err; + if (failing) + return -EROFS; + return 0; +} + +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 1)) + return -EROFS; + err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 1)) + return -EROFS; + return 0; +} + +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EROFS; + err = ubi_leb_erase(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EROFS; + return 0; +} + +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EROFS; + err = ubi_leb_unmap(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EROFS; + return 0; +} + +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) +{ + if (in_failure_mode(desc)) + return -EROFS; + return ubi_is_mapped(desc, lnum); +} + +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EROFS; + err = ubi_leb_map(desc, lnum, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EROFS; + return 0; +} + +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + failure_mode_init(c); + return 0; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + failure_mode_exit(c); + kfree(c->dbg); +} + +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) +{ + dfs_rootdir = debugfs_create_dir("ubifs", NULL); + if (IS_ERR(dfs_rootdir)) { + int err = PTR_ERR(dfs_rootdir); + ubifs_err("cannot create \"ubifs\" debugfs directory, " + "error %d\n", err); + return err; + } + + return 0; +} + +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. 
+ */ +void dbg_debugfs_exit(void) +{ + debugfs_remove(dfs_rootdir); +} + +static int open_debugfs_file(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t write_debugfs_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + + if (file->f_path.dentry == d->dfs_dump_lprops) + dbg_dump_lprops(c); + else if (file->f_path.dentry == d->dfs_dump_budg) + dbg_dump_budg(c, &c->bi); + else if (file->f_path.dentry == d->dfs_dump_tnc) { + mutex_lock(&c->tnc_mutex); + dbg_dump_tnc(c); + mutex_unlock(&c->tnc_mutex); + } else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_fops = { + .open = open_debugfs_file, + .write = write_debugfs_file, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance. + * @c: UBIFS file-system description object + * + * This function creates all debugfs files for this instance of UBIFS. Returns + * zero in case of success and a negative error code in case of failure. + * + * Note, the only reason we have not merged this function with the + * 'ubifs_debugging_init()' function is because it is better to initialize + * debugfs interfaces at the very end of the mount process, and remove them at + * the very beginning of the un-mount process. + */ +int dbg_debugfs_init_fs(struct ubifs_info *c) +{ + int err; + const char *fname; + struct dentry *dent; + struct ubifs_debug_info *d = c->dbg; + + sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + fname = d->dfs_dir_name; + dent = debugfs_create_dir(fname, dfs_rootdir); + if (IS_ERR_OR_NULL(dent)) + goto out; + d->dfs_dir = dent; + + fname = "dump_lprops"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_lprops = dent; + + fname = "dump_budg"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_budg = dent; + + fname = "dump_tnc"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_tnc = dent; + + return 0; + +out_remove: + debugfs_remove_recursive(d->dfs_dir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", + fname, err); + return err; +} + +/** + * dbg_debugfs_exit_fs - remove all debugfs files. + * @c: UBIFS file-system description object + */ +void dbg_debugfs_exit_fs(struct ubifs_info *c) +{ + debugfs_remove_recursive(c->dbg->dfs_dir); +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h new file mode 100644 index 00000000..fd75b635 --- /dev/null +++ b/fs/ubifs/debug.h @@ -0,0 +1,445 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_DEBUG_H__ +#define __UBIFS_DEBUG_H__ + +/* Checking helper functions */ +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); + +#ifdef CONFIG_UBIFS_FS_DEBUG + +#include + +/** + * ubifs_debug_info - per-FS debugging information. + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * @failure_mode: failure mode for recovery testing + * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @fail_timeout: time in jiffies when delay of failure mode expires + * @fail_cnt: current number of calls to failure mode I/O functions + * @fail_cnt_max: number of calls by which to delay failure mode + * @chk_lpt_sz: used by LPT tree size checker + * @chk_lpt_sz2: used by LPT tree size checker + * @chk_lpt_wastage: used by LPT tree size checker + * @chk_lpt_lebs: used by LPT tree size checker + * @new_nhead_offs: used by LPT tree size checker + * @new_ihead_lnum: used by debugging to check @c->ihead_lnum + * @new_ihead_offs: used by debugging to check @c->ihead_offs + * + * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') + * @saved_bi: saved budgeting information + * @saved_free: saved amount of free space + * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt + * + * @dfs_dir_name: name of debugfs directory containing this file-system's files + * @dfs_dir: direntry object of the file-system debugfs directory + * @dfs_dump_lprops: "dump lprops" debugfs knob + * @dfs_dump_budg: "dump budgeting information" debugfs knob + * @dfs_dump_tnc: "dump TNC" debugfs knob + */ +struct ubifs_debug_info { + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + int failure_mode; + int fail_delay; + unsigned long fail_timeout; + unsigned int fail_cnt; + unsigned int fail_cnt_max; + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_offs; + int new_ihead_lnum; + int new_ihead_offs; + + struct ubifs_lp_stats saved_lst; + struct ubifs_budg_info saved_bi; + long long saved_free; + int saved_idx_gc_cnt; + + char dfs_dir_name[100]; + struct dentry *dfs_dir; + struct dentry *dfs_dump_lprops; + struct dentry *dfs_dump_budg; + struct dentry *dfs_dump_tnc; +}; + +#define ubifs_assert(expr) do { \ + if (unlikely(!(expr))) { \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + dbg_dump_stack(); \ + } \ +} while (0) + +#define ubifs_assert_cmt_locked(c) do { \ + if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ + up_write(&(c)->commit_sem); \ + printk(KERN_CRIT "commit lock is not locked!\n"); \ + ubifs_assert(0); \ + } \ +} while (0) + +#define dbg_dump_stack() dump_stack() + +#define dbg_err(fmt, ...) 
do { \ + spin_lock(&dbg_lock); \ + ubifs_err(fmt, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +const char *dbg_key_str0(const struct ubifs_info *c, + const union ubifs_key *key); +const char *dbg_key_str1(const struct ubifs_info *c, + const union ubifs_key *key); + +/* + * TODO: these macros are now broken because there is no locking around them + * and we use a global buffer for the key string. This means that in case of + * concurrent execution we will end up with incorrect and messy key strings. + */ +#define DBGKEY(key) dbg_key_str0(c, (key)) +#define DBGKEY1(key) dbg_key_str1(c, (key)) + +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) + +/* Just debugging messages not related to any specific UBIFS subsystem */ +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) + +/* General messages */ +#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) +/* Additional journal messages */ +#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) +/* Additional TNC messages */ +#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) +/* Additional lprops messages */ +#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) +/* Additional LEB find messages */ +#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) +/* Additional mount messages */ +#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) +/* Additional I/O messages */ +#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) +/* Additional commit messages */ +#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) +/* Additional budgeting messages */ +#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) +/* Additional log messages */ +#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) +/* Additional gc messages */ +#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) +/* Additional scan messages */ +#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) +/* Additional recovery messages */ +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) + +/* + * Debugging check flags. + * + * UBIFS_CHK_GEN: general checks + * UBIFS_CHK_TNC: check TNC + * UBIFS_CHK_IDX_SZ: check index size + * UBIFS_CHK_ORPH: check orphans + * UBIFS_CHK_OLD_IDX: check the old index + * UBIFS_CHK_LPROPS: check lprops + * UBIFS_CHK_FS: check the file-system + */ +enum { + UBIFS_CHK_GEN = 0x1, + UBIFS_CHK_TNC = 0x2, + UBIFS_CHK_IDX_SZ = 0x4, + UBIFS_CHK_ORPH = 0x8, + UBIFS_CHK_OLD_IDX = 0x10, + UBIFS_CHK_LPROPS = 0x20, + UBIFS_CHK_FS = 0x40, +}; + +/* + * Special testing flags.
+ * + * UBIFS_TST_RCVRY: failure mode for recovery testing + */ +enum { + UBIFS_TST_RCVRY = 0x4, +}; + +extern spinlock_t dbg_lock; + +extern unsigned int ubifs_msg_flags; +extern unsigned int ubifs_chk_flags; +extern unsigned int ubifs_tst_flags; + +int ubifs_debugging_init(struct ubifs_info *c); +void ubifs_debugging_exit(struct ubifs_info *c); + +/* Dump functions */ +const char *dbg_ntype(int type); +const char *dbg_cstate(int cmt_state); +const char *dbg_jhead(int jhead); +const char *dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key); +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_node(const struct ubifs_info *c, const void *node); +void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, + int offs); +void dbg_dump_budget_req(const struct ubifs_budget_req *req); +void dbg_dump_lstats(const struct ubifs_lp_stats *lst); +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); +void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_lpt_info(struct ubifs_info *c); +void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void dbg_dump_tnc(struct ubifs_info *c); +void dbg_dump_index(struct ubifs_info *c); +void dbg_dump_lpt_lebs(const struct ubifs_info *c); + +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv); + +/* Checking functions */ +void dbg_save_space_info(struct ubifs_info *c); +int dbg_check_space_info(struct ubifs_info *c); +int dbg_check_lprops(struct ubifs_info *c); +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_cats(struct ubifs_info *c); +int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); +int dbg_check_synced_i_size(struct inode *inode); +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); +int dbg_check_tnc(struct ubifs_info *c, int extra); +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); +int dbg_check_filesystem(struct ubifs_info *c); +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos); +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col); +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size); +int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); +int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); + +/* Force the use of in-the-gaps method for testing */ +static inline int dbg_force_in_the_gaps_enabled(void) +{ + return ubifs_chk_flags & UBIFS_CHK_GEN; +} +int dbg_force_in_the_gaps(void); + +/* Failure mode for recovery testing */ +#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) + +#ifndef UBIFS_DBG_PRESERVE_UBI +#define ubi_leb_read dbg_leb_read +#define ubi_leb_write dbg_leb_write +#define ubi_leb_change dbg_leb_change +#define ubi_leb_erase dbg_leb_erase +#define ubi_leb_unmap dbg_leb_unmap +#define 
ubi_is_mapped dbg_is_mapped +#define ubi_leb_map dbg_leb_map +#endif + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check); +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype); +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype); +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); + +static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, + int offset, int len) +{ + return dbg_leb_read(desc, lnum, buf, offset, len, 0); +} + +static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, + const void *buf, int offset, int len) +{ + return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); +} + +static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, + const void *buf, int len) +{ + return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); +} + +/* Debugfs-related stuff */ +int dbg_debugfs_init(void); +void dbg_debugfs_exit(void); +int dbg_debugfs_init_fs(struct ubifs_info *c); +void dbg_debugfs_exit_fs(struct ubifs_info *c); + +#else /* !CONFIG_UBIFS_FS_DEBUG */ + +/* Use "if (0)" to make compiler check arguments even if debugging is off */ +#define ubifs_assert(expr) do { \ + if (0 && (expr)) \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ +} while (0) + +#define ubifs_dbg_msg(fmt, ...) do { \ + if (0) \ + pr_debug(fmt "\n", ##__VA_ARGS__); \ +} while (0) + +#define dbg_dump_stack() +#define ubifs_assert_cmt_locked(c) + +#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) 
ubifs_dbg_msg(fmt, ##__VA_ARGS__) + +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + +static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } +static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } +static inline const char *dbg_ntype(int type) { return ""; } +static inline const char *dbg_cstate(int cmt_state) { return ""; } +static inline const char *dbg_jhead(int jhead) { return ""; } +static inline const char * +dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key) { return ""; } +static inline void dbg_dump_inode(const struct ubifs_info *c, + const struct inode *inode) { return; } +static inline void dbg_dump_node(const struct ubifs_info *c, + const void *node) { return; } +static inline void dbg_dump_lpt_node(const struct ubifs_info *c, + void *node, int lnum, + int offs) { return; } +static inline void +dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } +static inline void +dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } +static inline void +dbg_dump_budg(struct ubifs_info *c, + const struct ubifs_budg_info *bi) { return; } +static inline void dbg_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp) { return; } +static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } +static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } +static inline void dbg_dump_leb(const struct ubifs_info *c, + int lnum) { return; } +static inline void +dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) { return; } +static inline void dbg_dump_heap(struct ubifs_info *c, + struct ubifs_lpt_heap *heap, + int cat) { return; } +static inline void dbg_dump_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, + int iip) { return; } +static inline void dbg_dump_tnc(struct ubifs_info *c) { return; } +static inline void dbg_dump_index(struct ubifs_info *c) { return; } +static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; } + +static inline int dbg_walk_index(struct ubifs_info *c, + dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, + void *priv) { return 0; } +static inline void dbg_save_space_info(struct ubifs_info *c) { return; } +static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; } +static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; } +static inline int +dbg_old_index_check_init(struct ubifs_info *c, + struct ubifs_zbranch *zroot) { return 0; } +static inline int +dbg_check_old_index(struct ubifs_info *c, + struct ubifs_zbranch *zroot) { return 0; } +static inline int dbg_check_cats(struct ubifs_info *c) { return 0; } +static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } +static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } +static inline int dbg_chk_lpt_sz(struct ubifs_info *c, + int action, int len) { return 0; } +static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; } +static inline int dbg_check_dir_size(struct ubifs_info *c, + const struct inode *dir) { return 0; } +static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } +static inline int dbg_check_idx_size(struct ubifs_info *c, + long long idx_size) { return 0; } +static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; } +static inline void dbg_check_heap(struct ubifs_info *c, + struct ubifs_lpt_heap *heap, + int cat, int add_pos) { return; } +static inline int 
dbg_check_lpt_nodes(struct ubifs_info *c, + struct ubifs_cnode *cnode, int row, int col) { return 0; } +static inline int dbg_check_inode_size(struct ubifs_info *c, + const struct inode *inode, + loff_t size) { return 0; } +static inline int +dbg_check_data_nodes_order(struct ubifs_info *c, + struct list_head *head) { return 0; } +static inline int +dbg_check_nondata_nodes_order(struct ubifs_info *c, + struct list_head *head) { return 0; } + +static inline int dbg_force_in_the_gaps(void) { return 0; } +#define dbg_force_in_the_gaps_enabled() 0 +#define dbg_failure_mode 0 + +static inline int dbg_debugfs_init(void) { return 0; } +static inline void dbg_debugfs_exit(void) { return; } +static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; } +static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; } + +#endif /* !CONFIG_UBIFS_FS_DEBUG */ +#endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c new file mode 100644 index 00000000..ef5abd38 --- /dev/null +++ b/fs/ubifs/dir.c @@ -0,0 +1,1206 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements directory operations. + * + * All FS operations in this file allocate budget before writing anything to the + * media. If they fail to allocate it, the error is returned. The only + * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even + * if they are unable to allocate the budget, because deletion %-ENOSPC failure is + * not what users are usually ready to get. UBIFS budgeting subsystem has some + * space reserved for these purposes. + * + * All operations in this file write all inodes which they change straight + * away, instead of marking them dirty. For example, 'ubifs_link()' changes + * @i_size of the parent inode and writes the parent inode together with the + * target inode. This was done to simplify file-system recovery which would + * otherwise be very difficult to do. The only exception is rename, which marks + * the re-named inode dirty (because its @i_ctime is updated) but does not + * write it straight away. + */ + +#include "ubifs.h" + +/** + * inherit_flags - inherit flags of the parent inode. + * @dir: parent inode + * @mode: new inode mode flags + * + * This is a helper function for 'ubifs_new_inode()' which inherits flags of the + * parent directory inode @dir. UBIFS inodes inherit the following flags: + * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a + * sub-directory basis; + * o %UBIFS_SYNC_FL - useful for the same reasons; + * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. + * + * This function returns the inherited flags.
+ */ +static int inherit_flags(const struct inode *dir, int mode) +{ + int flags; + const struct ubifs_inode *ui = ubifs_inode(dir); + + if (!S_ISDIR(dir->i_mode)) + /* + * The parent is not a directory, which means that an extended + * attribute inode is being created. No flags. + */ + return 0; + + flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); + if (!S_ISDIR(mode)) + /* The "DIRSYNC" flag only applies to directories */ + flags &= ~UBIFS_DIRSYNC_FL; + return flags; +} + +/** + * ubifs_new_inode - allocate new UBIFS inode object. + * @c: UBIFS file-system description object + * @dir: parent directory inode + * @mode: inode mode flags + * + * This function finds an unused inode number, allocates new inode and + * initializes it. Returns new inode in case of success and an error code in + * case of failure. + */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode) +{ + struct inode *inode; + struct ubifs_inode *ui; + + inode = new_inode(c->vfs_sb); + ui = ubifs_inode(inode); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* + * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and + * marking them dirty in the file write path (see 'file_update_time()'). + * UBIFS has to fully control "clean <-> dirty" transitions of inodes + * to make budgeting work. + */ + inode->i_flags |= (S_NOCMTIME); + + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + ubifs_current_time(inode); + inode->i_mapping->nrpages = 0; + /* Disable readahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + break; + case S_IFSOCK: + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + inode->i_op = &ubifs_file_inode_operations; + break; + default: + BUG(); + } + + ui->flags = inherit_flags(dir, mode); + ubifs_set_inode_flags(inode); + if (S_ISREG(mode)) + ui->compr_type = c->default_compr; + else + ui->compr_type = UBIFS_COMPR_NONE; + ui->synced_i_size = 0; + + spin_lock(&c->cnt_lock); + /* Inode number overflow is currently not supported */ + if (c->highest_inum >= INUM_WARN_WATERMARK) { + if (c->highest_inum >= INUM_WATERMARK) { + spin_unlock(&c->cnt_lock); + ubifs_err("out of inode numbers"); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + ubifs_warn("running out of inode numbers (current %lu, max %d)", + (unsigned long)c->highest_inum, INUM_WATERMARK); + } + + inode->i_ino = ++c->highest_inum; + /* + * The creation sequence number remains with this inode for its + * lifetime. All nodes for this inode have a greater sequence number, + * and so it is possible to distinguish obsolete nodes belonging to a + * previous incarnation of the same inode number - for example, for the + * purpose of rebuilding the index.
+ */ + ui->creat_sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + return inode; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +{ + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (le16_to_cpu(dent->nlen) != nm->len) + return -EINVAL; + if (memcmp(dent->name, nm->name, nm->len)) + return -EINVAL; + return 0; +} + +#else + +#define dbg_check_name(dent, nm) 0 + +#endif + +static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int err; + union ubifs_key key; + struct inode *inode = NULL; + struct ubifs_dent_node *dent; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("'%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (dentry->d_name.len > UBIFS_MAX_NLEN) + return ERR_PTR(-ENAMETOOLONG); + + dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent) + return ERR_PTR(-ENOMEM); + + dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + + err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); + if (err) { + if (err == -ENOENT) { + dbg_gen("not found"); + goto done; + } + goto out; + } + + if (dbg_check_name(dent, &dentry->d_name)) { + err = -EINVAL; + goto out; + } + + inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); + if (IS_ERR(inode)) { + /* + * This should not happen. Probably the file-system needs + * checking. + */ + err = PTR_ERR(inode); + ubifs_err("dead directory entry '%.*s', error %d", + dentry->d_name.len, dentry->d_name.name, err); + ubifs_ro_mode(c, err); + goto out; + } + +done: + kfree(dent); + /* + * Note, d_splice_alias() would be required instead if we supported + * NFS. + */ + d_add(dentry, inode); + return NULL; + +out: + kfree(dent); + return ERR_PTR(err); +} + +static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1 }; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + + /* + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + ubifs_err("cannot create regular file, error %d", err); + return err; +} + +/** + * vfs_dent_type - get VFS directory entry type. + * @type: UBIFS directory entry type + * + * This function converts UBIFS directory entry type into VFS directory entry + * type. 
+ */ +static unsigned int vfs_dent_type(uint8_t type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return DT_REG; + case UBIFS_ITYPE_DIR: + return DT_DIR; + case UBIFS_ITYPE_LNK: + return DT_LNK; + case UBIFS_ITYPE_BLK: + return DT_BLK; + case UBIFS_ITYPE_CHR: + return DT_CHR; + case UBIFS_ITYPE_FIFO: + return DT_FIFO; + case UBIFS_ITYPE_SOCK: + return DT_SOCK; + default: + BUG(); + } + return 0; +} + +/* + * The classical Unix view for directory is that it is a linear array of + * (name, inode number) entries. Linux/VFS assumes this model as well. + * Particularly, 'readdir()' call wants us to return a directory entry offset + * which later may be used to continue 'readdir()'ing the directory or to + * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this + * model because directory entries are identified by keys, which may collide. + * + * UBIFS uses directory entry hash value for directory offsets, so + * 'seekdir()'/'telldir()' may not always work because of possible key + * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work + * properly by means of saving full directory entry name in the private field + * of the file description object. + * + * This means that UBIFS cannot support NFS which requires full + * 'seekdir()'/'telldir()' support. + */ +static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int err, over = 0; + struct qstr nm; + union ubifs_key key; + struct ubifs_dent_node *dent; + struct inode *dir = file->f_path.dentry->d_inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + + if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + /* + * The directory was seek'ed to a senseless position or there + * are no more entries. + */ + return 0; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (file->f_pos == 0) { + ubifs_assert(!file->private_data); + over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); + if (over) + return 0; + file->f_pos = 1; + } + + if (file->f_pos == 1) { + ubifs_assert(!file->private_data); + over = filldir(dirent, "..", 2, 1, + parent_ino(file->f_path.dentry), DT_DIR); + if (over) + return 0; + + /* Find the first entry in TNC and save it */ + lowest_dent_key(c, &key, dir->i_ino); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + dent = file->private_data; + if (!dent) { + /* + * The directory was seek'ed to and is now readdir'ed. + * Find the entry corresponding to @file->f_pos or the + * closest one. 
+ */ + dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + while (1) { + dbg_gen("feed '%s', ino %llu, new f_pos %#x", + dent->name, (unsigned long long)le64_to_cpu(dent->inum), + key_hash_flash(c, &dent->key)); + ubifs_assert(le64_to_cpu(dent->ch.sqnum) > + ubifs_inode(dir)->creat_sqnum); + + nm.len = le16_to_cpu(dent->nlen); + over = filldir(dirent, dent->name, nm.len, file->f_pos, + le64_to_cpu(dent->inum), + vfs_dent_type(dent->type)); + if (over) + return 0; + + /* Switch to the next entry */ + key_read(c, &dent->key, &key); + nm.name = dent->name; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + kfree(file->private_data); + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + cond_resched(); + } + +out: + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + kfree(file->private_data); + file->private_data = NULL; + file->f_pos = 2; + return 0; +} + +/* If a directory is seeked, we have to free saved readdir() state */ +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + kfree(file->private_data); + file->private_data = NULL; + return generic_file_llseek(file, offset, origin); +} + +/* Free saved readdir() state when the directory is closed */ +static int ubifs_dir_release(struct inode *dir, struct file *file) +{ + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + /* + * Budget request settings: new direntry, changing the target inode, + * changing the parent inode. 
+ */ + + dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + lock_2_inodes(dir, inode); + inc_nlink(inode); + ihold(inode); + inode->i_ctime = ubifs_current_time(inode); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(inode); + unlock_2_inodes(dir, inode); + ubifs_release_budget(c, &req); + iput(inode); + return err; +} + +static int ubifs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode (+1 for + * @dirtied_ino), changing the parent directory inode. If budgeting + * fails, go ahead anyway because we have extra space reserved for + * deletions. + */ + + dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + drop_nlink(inode); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_dir_empty - check if a directory is empty or not. + * @c: UBIFS file-system description object + * @dir: VFS inode object of the directory to check + * + * This function checks if directory @dir is empty. Returns zero if the + * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes + * in case of of errors. 
+ */ +static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +{ + struct qstr nm = { .name = NULL }; + struct ubifs_dent_node *dent; + union ubifs_key key; + int err; + + lowest_dent_key(c, &key, dir->i_ino); + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + err = 0; + } else { + kfree(dent); + err = -ENOTEMPTY; + } + return err; +} + +static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode and + * changing the parent inode. If budgeting fails, go ahead anyway + * because we have extra space reserved for deletions. + */ + + dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, inode->i_ino, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = check_dir_empty(c, dentry->d_inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + clear_nlink(inode); + drop_nlink(dir); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(dir); + inc_nlink(inode); + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. 
+ */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + insert_inode_hash(inode); + inc_nlink(inode); + inc_nlink(dir); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) { + ubifs_err("cannot create directory, error %d", err); + goto out_cancel; + } + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(dir); + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + union ubifs_dev_desc *dev = NULL; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, devlen = 0; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (S_ISBLK(mode) || S_ISCHR(mode)) { + dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!dev) + return -ENOMEM; + devlen = ubifs_encode_dev(dev, rdev); + } + + err = ubifs_budget_space(c, &req); + if (err) { + kfree(dev); + return err; + } + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + kfree(dev); + err = PTR_ERR(inode); + goto out_budg; + } + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_size = ubifs_inode(inode)->ui_size = devlen; + ui = ubifs_inode(inode); + ui->data = dev; + ui->data_len = devlen; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, len = strlen(symname); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. 
+ */ + + dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, symname, dir->i_ino); + + if (len > UBIFS_MAX_INO_DATA) + return -ENAMETOOLONG; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + ui = ubifs_inode(inode); + ui->data = kmalloc(len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_inode; + } + + memcpy(ui->data, symname, len); + ((char *)ui->data)[len] = '\0'; + /* + * The terminating zero byte is not written to the flash media and it + * is put just to make later in-memory string processing simpler. Thus, + * data length is @len, not @len + %1. + */ + ui->data_len = len; + inode->i_size = ubifs_inode(inode)->ui_size = len; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * lock_3_inodes - a wrapper for locking three UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + * + * This function is used for 'ubifs_rename()' and @inode1 may be the same as + * @inode2 whereas @inode3 may be %NULL. + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + if (inode2 != inode1) + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + if (inode3) + mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); +} + +/** + * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. 
+ * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + */ +static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + if (inode3) + mutex_unlock(&ubifs_inode(inode3)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + int err, release, sync = 0, move = (new_dir != old_dir); + int is_dir = S_ISDIR(old_inode->i_mode); + int unlink = !!new_inode; + int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); + int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 3 }; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct timespec time; + + /* + * Budget request settings: deletion direntry, new direntry, removing + * the old inode, and changing old and new parent directory inodes. + * + * However, this operation also marks the target inode as dirty and + * does not write it, so we allocate budget for the target inode + * separately. + */ + + dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " + "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, + old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); + ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + if (unlink) + ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + + + if (unlink && is_dir) { + err = check_dir_empty(c, new_inode); + if (err) + return err; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + lock_3_inodes(old_dir, new_dir, new_inode); + + /* + * Like most other Unix systems, set the @i_ctime for inodes on a + * rename. + */ + time = ubifs_current_time(old_dir); + old_inode->i_ctime = time; + + /* We must adjust parent link count when renaming directories */ + if (is_dir) { + if (move) { + /* + * @old_dir loses a link because we are moving + * @old_inode to a different directory. + */ + drop_nlink(old_dir); + /* + * @new_dir only gains a link if we are not also + * overwriting an existing directory. + */ + if (!unlink) + inc_nlink(new_dir); + } else { + /* + * @old_inode is not moving to a different directory, + * but @old_dir still loses a link if we are + * overwriting an existing directory. + */ + if (unlink) + drop_nlink(old_dir); + } + } + + old_dir->i_size -= old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + /* + * And finally, if we unlinked a direntry which happened to have the + * same name as the moved direntry, we have to decrement @i_nlink of + * the unlinked inode and change its ctime. + */ + if (unlink) { + /* + * Directories cannot have hard-links, so if this is a + * directory, decrement its @i_nlink twice because an empty + * directory has @i_nlink 2. 
+ */ + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = time; + drop_nlink(new_inode); + } else { + new_dir->i_size += new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + + /* + * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode + * is dirty, because this will be done later on at the end of + * 'ubifs_rename()'. + */ + if (IS_SYNC(old_inode)) { + sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + if (unlink && IS_SYNC(new_inode)) + sync = 1; + } + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + if (err) + goto out_cancel; + + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &req); + + mutex_lock(&old_inode_ui->ui_mutex); + release = old_inode_ui->dirty; + mark_inode_dirty_sync(old_inode); + mutex_unlock(&old_inode_ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &ino_req); + if (IS_SYNC(old_inode)) + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + return err; + +out_cancel: + if (unlink) { + if (is_dir) + inc_nlink(new_inode); + inc_nlink(new_inode); + } else { + new_dir->i_size -= new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + old_dir->i_size += old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + if (is_dir) { + if (move) { + inc_nlink(old_dir); + if (!unlink) + drop_nlink(new_dir); + } else { + if (unlink) + inc_nlink(old_dir); + } + } + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &ino_req); + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + loff_t size; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = UBIFS_BLOCK_SIZE; + stat->size = ui->ui_size; + + /* + * Unfortunately, the 'stat()' system call was designed for block + * device based file systems, and it is not appropriate for UBIFS, + * because UBIFS does not have a notion of a "block". For example, it is + * difficult to tell how many blocks a directory takes - it actually + * takes less than 300 bytes, but we have to round it to the block size, + * which introduces a large error. This makes utilities like 'du' + * report completely senseless numbers. This is the reason why UBIFS + * goes the same way as JFFS2 - it reports zero blocks for everything + * but regular files, which makes more sense than reporting completely + * wrong sizes. + */ + if (S_ISREG(inode->i_mode)) { + size = ui->xattr_size; + size += stat->size; + size = ALIGN(size, UBIFS_BLOCK_SIZE); + /* + * Note, user-space expects a 512-byte block count irrespective + * of what was reported in @stat->size.
+ */ + stat->blocks = size >> 9; + } else + stat->blocks = 0; + mutex_unlock(&ui->ui_mutex); + return 0; +} + +const struct inode_operations ubifs_dir_inode_operations = { + .lookup = ubifs_lookup, + .create = ubifs_create, + .link = ubifs_link, + .symlink = ubifs_symlink, + .unlink = ubifs_unlink, + .mkdir = ubifs_mkdir, + .rmdir = ubifs_rmdir, + .mknod = ubifs_mknod, + .rename = ubifs_rename, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +const struct file_operations ubifs_dir_operations = { + .llseek = ubifs_dir_llseek, + .release = ubifs_dir_release, + .read = generic_read_dir, + .readdir = ubifs_readdir, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c new file mode 100644 index 00000000..5e7fccfc --- /dev/null +++ b/fs/ubifs/file.c @@ -0,0 +1,1593 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements VFS file and inode operations for regular files, device + * nodes and symlinks as well as address space operations. + * + * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if + * the page is dirty and is used for optimization purposes - dirty pages are + * not budgeted so the flag shows that 'ubifs_write_end()' should not release + * the budget for this page. The @PG_checked flag is set if full budgeting is + * required for the page e.g., when it corresponds to a file hole or it is + * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because + * it is OK to fail in this function, and the budget is released in + * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry + * information about how the page was budgeted, to make it possible to release + * the budget properly. + * + * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we + * implement. However, this is not true for 'ubifs_writepage()', which may be + * called with @i_mutex unlocked. For example, when pdflush is doing background + * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal" + * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the + * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()' + * we are only guaranteed that the page is locked. + * + * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the + * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> + * ondemand_readahead -> readpage"). 
In case of readahead, @I_SYNC flag is not + * set as well. However, UBIFS disables readahead. + */ + +#include "ubifs.h" +#include +#include +#include + +static int read_block(struct inode *inode, void *addr, unsigned int block, + struct ubifs_data_node *dn) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err, len, out_len; + union ubifs_key key; + unsigned int dlen; + + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_tnc_lookup(c, &key, dn); + if (err) { + if (err == -ENOENT) + /* Not found, so it must be a hole */ + memset(addr, 0, UBIFS_BLOCK_SIZE); + return err; + } + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto dump; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto dump; + + /* + * Data length can be less than a full block, even for blocks that are + * not the last in the file (e.g., as a result of making a hole and + * appending data). Ensure that the remainder is zeroed out. + */ + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + return 0; + +dump: + ubifs_err("bad data node (block %u, inode %lu)", + block, inode->i_ino); + dbg_dump_node(c, dn); + return -EINVAL; +} + +static int do_readpage(struct page *page) +{ + void *addr; + int err = 0, i; + unsigned int block, beyond; + struct ubifs_data_node *dn; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + ubifs_assert(!PageChecked(page)); + ubifs_assert(!PagePrivate(page)); + + addr = kmap(page); + + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + if (block >= beyond) { + /* Reading beyond inode */ + SetPageChecked(page); + memset(addr, 0, PAGE_CACHE_SIZE); + goto out; + } + + dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); + if (!dn) { + err = -ENOMEM; + goto error; + } + + i = 0; + while (1) { + int ret; + + if (block >= beyond) { + /* Reading beyond inode */ + err = -ENOENT; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else { + ret = read_block(inode, addr, block, dn); + if (ret) { + err = ret; + if (err != -ENOENT) + break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); + } + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += UBIFS_BLOCK_SIZE; + } + if (err) { + if (err == -ENOENT) { + /* Not found, so it must be a hole */ + SetPageChecked(page); + dbg_gen("hole"); + goto out_free; + } + ubifs_err("cannot read page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + goto error; + } + +out_free: + kfree(dn); +out: + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + return 0; + +error: + kfree(dn); + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + return err; +} + +/** + * release_new_page_budget - release budget of a new page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of one new page of data. 
+ */ +static void release_new_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; + + ubifs_release_budget(c, &req); +} + +/** + * release_existing_page_budget - release budget of an existing page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of changing one one page of data which already exists on the flash media. + */ +static void release_existing_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; + + ubifs_release_budget(c, &req); +} + +static int write_begin_slow(struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep, + unsigned flags) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ubifs_budget_req req = { .new_page = 1 }; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + inode->i_ino, pos, len, inode->i_size); + + /* + * At the slow path we have to budget before locking the page, because + * budgeting may force write-back, which would wait on locked pages and + * deadlock if we had the page locked. At this point we do not know + * anything about the page, so assume that this is a new page which is + * written to a hole. This corresponds to largest budget. Later the + * budget will be amended if this is not true. + */ + if (appending) + /* We are appending data, budget for inode change */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) + return err; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (unlikely(!page)) { + ubifs_release_budget(c, &req); + return -ENOMEM; + } + + if (!PageUptodate(page)) { + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + if (PagePrivate(page)) + /* + * The page is dirty, which means it was budgeted twice: + * o first time the budget was allocated by the task which + * made the page dirty and set the PG_private flag; + * o and then we budgeted for it for the second time at the + * very beginning of this function. + * + * So what we have to do is to release the page budget we + * allocated. + */ + release_new_page_budget(c); + else if (!PageChecked(page)) + /* + * We are changing a page which already exists on the media. + * This means that changing the page does not make the amount + * of indexing information larger, and this part of the budget + * which we have already acquired may be released. + */ + ubifs_convert_page_budget(c); + + if (appending) { + struct ubifs_inode *ui = ubifs_inode(inode); + + /* + * 'ubifs_write_end()' is optimized from the fast-path part of + * 'ubifs_write_begin()' and expects the @ui_mutex to be locked + * if data is appended. + */ + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The inode is dirty already, so we may free the + * budget we allocated. + */ + ubifs_release_dirty_inode_budget(c, ui); + } + + *pagep = page; + return 0; +} + +/** + * allocate_budget - allocate budget for 'ubifs_write_begin()'. 
+ * @c: UBIFS file-system description object + * @page: page to allocate budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for 'ubifs_write_begin()' which allocates budget + * for the operation. The budget is allocated differently depending on whether + * this is appending, whether the page is dirty or not, and so on. This + * function leaves the @ui->ui_mutex locked in case of appending. Returns zero + * in case of success and %-ENOSPC in case of failure. + */ +static int allocate_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + struct ubifs_budget_req req = { .fast = 1 }; + + if (PagePrivate(page)) { + if (!appending) + /* + * The page is dirty and we are not appending, which + * means no budget is needed at all. + */ + return 0; + + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The page is dirty and we are appending, so the inode + * has to be marked as dirty. However, it is already + * dirty, so we do not need any budget. We may return, + * but @ui->ui_mutex hast to be left locked because we + * should prevent write-back from flushing the inode + * and freeing the budget. The lock will be released in + * 'ubifs_write_end()'. + */ + return 0; + + /* + * The page is dirty, we are appending, the inode is clean, so + * we need to budget the inode change. + */ + req.dirtied_ino = 1; + } else { + if (PageChecked(page)) + /* + * The page corresponds to a hole and does not + * exist on the media. So changing it makes + * make the amount of indexing information + * larger, and we have to budget for a new + * page. + */ + req.new_page = 1; + else + /* + * Not a hole, the change will not add any new + * indexing information, budget for page + * change. + */ + req.dirtied_page = 1; + + if (appending) { + mutex_lock(&ui->ui_mutex); + if (!ui->dirty) + /* + * The inode is clean but we will have to mark + * it as dirty because we are appending. This + * needs a budget. + */ + req.dirtied_ino = 1; + } + } + + return ubifs_budget_space(c, &req); +} + +/* + * This function is called when a page of data is going to be written. Since + * the page of data will not necessarily go to the flash straight away, UBIFS + * has to reserve space on the media for it, which is done by means of + * budgeting. + * + * This is the hot-path of the file-system and we are trying to optimize it as + * much as possible. For this reasons it is split on 2 parts - slow and fast. + * + * There many budgeting cases: + * o a new page is appended - we have to budget for a new page and for + * changing the inode; however, if the inode is already dirty, there is + * no need to budget for it; + * o an existing clean page is changed - we have budget for it; if the page + * does not exist on the media (a hole), we have to budget for a new + * page; otherwise, we may budget for changing an existing page; the + * difference between these cases is that changing an existing page does + * not introduce anything new to the FS indexing information, so it does + * not grow, and smaller budget is acquired in this case; + * o an existing dirty page is changed - no need to budget at all, because + * the page budget has been acquired by earlier, when the page has been + * marked dirty. + * + * UBIFS budgeting sub-system may force write-back if it thinks there is no + * space to reserve. 
This imposes some locking restrictions and makes it + * impossible to take into account the above cases, and makes it impossible to + * optimize budgeting. + * + * The solution for this is that the fast path of 'ubifs_write_begin()' assumes + * there is a plenty of flash space and the budget will be acquired quickly, + * without forcing write-back. The slow path does not make this assumption. + */ +static int ubifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; + struct page *page; + + ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (unlikely(c->ro_error)) + return -EROFS; + + /* Try out the fast-path part first */ + page = grab_cache_page_write_begin(mapping, index, flags); + if (unlikely(!page)) + return -ENOMEM; + + if (!PageUptodate(page)) { + /* The page is not loaded from the flash */ + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { + /* + * We change whole page so no need to load it. But we + * do not know whether this page exists on the media or + * not, so we assume the latter because it requires + * larger budget. The assumption is that it is better + * to budget a bit more than to read the page from the + * media. Thus, we are setting the @PG_checked flag + * here. + */ + SetPageChecked(page); + skipped_read = 1; + } else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + err = allocate_budget(c, page, ui, appending); + if (unlikely(err)) { + ubifs_assert(err == -ENOSPC); + /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* + * Budgeting failed which means it would have to force + * write-back but didn't, because we set the @fast flag in the + * request. Write-back cannot be done now, while we have the + * page locked, because it would deadlock. Unlock and free + * everything and fall-back to slow-path. + */ + if (appending) { + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + unlock_page(page); + page_cache_release(page); + + return write_begin_slow(mapping, pos, len, pagep, flags); + } + + /* + * Whee, we acquired budgeting quickly - without involving + * garbage-collection, committing or forcing write-back. We return + * with @ui->ui_mutex locked if we are appending pages, and unlocked + * otherwise. This is an optimization (slightly hacky though). + */ + *pagep = page; + return 0; + +} + +/** + * cancel_budget - cancel budget. + * @c: UBIFS file-system description object + * @page: page to cancel budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for a page write operation. It unlocks the + * @ui->ui_mutex in case of appending. 
+ */ +static void cancel_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + if (appending) { + if (!ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + mutex_unlock(&ui->ui_mutex); + } + if (!PagePrivate(page)) { + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + } +} + +static int ubifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + loff_t end_pos = pos + len; + int appending = !!(end_pos > inode->i_size); + + dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + inode->i_ino, pos, page->index, len, copied, inode->i_size); + + if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + /* + * VFS copied less data to the page that it intended and + * declared in its '->write_begin()' call via the @len + * argument. If the page was not up-to-date, and @len was + * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * not load it from the media (for optimization reasons). This + * means that part of the page contains garbage. So read the + * page now. + */ + dbg_gen("copied %d instead of %d, read page and repeat", + copied, len); + cancel_budget(c, page, ui, appending); + ClearPageChecked(page); + + /* + * Return 0 to force VFS to repeat the whole operation, or the + * error code if 'do_readpage()' fails. + */ + copied = do_readpage(page); + goto out; + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (appending) { + i_size_write(inode, end_pos); + ui->ui_size = end_pos; + /* + * Note, we do not set @I_DIRTY_PAGES (which means that the + * inode has dirty pages), this has been done in + * '__set_page_dirty_nobuffers()'. + */ + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + +out: + unlock_page(page); + page_cache_release(page); + return copied; +} + +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read information + * @page1: first page to read + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, + struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + int err, page_idx, page_cnt, ret = 0, n = 0; + int allocate = bu->buf ? 0 : 1; + loff_t isize; + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. 
+ */ + goto out_bu_off; + } + + if (bu->cnt) { + if (allocate) { + /* + * Allocate bulk-read buffer depending on how many data + * nodes we are going to read. + */ + bu->buf_len = bu->zbranch[bu->cnt - 1].offs + + bu->zbranch[bu->cnt - 1].len - + bu->zbranch[0].offs; + ubifs_assert(bu->buf_len > 0); + ubifs_assert(bu->buf_len <= c->leb_size); + bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN); + if (!bu->buf) + goto out_bu_off; + } + + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + if (allocate) + kfree(bu->buf); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; + +out_bu_off: + ui->read_in_a_row = ui->bulk_read = 0; + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + struct bu_info *bu; + int err = 0, allocated = 0; + + ui->last_page_read = index; + if (!c->bulk_read) + return 0; + + /* + * Bulk-read is protected by @ui->ui_mutex, but it is an optimization, + * so don't bother if we cannot lock the mutex. + */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + + /* + * If possible, try to use pre-allocated bulk-read information, which + * is protected by @c->bu_mutex. 
+ */ + if (mutex_trylock(&c->bu_mutex)) + bu = &c->bu; + else { + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN); + if (!bu) + goto out_unlock; + + bu->buf = NULL; + allocated = 1; + } + + bu->buf_len = c->max_bu_buf_len; + data_key_init(c, &bu->key, inode->i_ino, + page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, page); + + if (!allocated) + mutex_unlock(&c->bu_mutex); + else + kfree(bu); + +out_unlock: + mutex_unlock(&ui->ui_mutex); + return err; +} + +static int ubifs_readpage(struct file *file, struct page *page) +{ + if (ubifs_bulk_read(page)) + return 0; + do_readpage(page); + unlock_page(page); + return 0; +} + +static int do_writepage(struct page *page, int len) +{ + int err = 0, i, blen; + unsigned int block; + void *addr; + union ubifs_key key; + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + +#ifdef UBIFS_DEBUG + spin_lock(&ui->ui_lock); + ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + spin_unlock(&ui->ui_lock); +#endif + + /* Update radix tree tags */ + set_page_writeback(page); + + addr = kmap(page); + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + i = 0; + while (len) { + blen = min_t(int, len, UBIFS_BLOCK_SIZE); + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + if (err) + break; + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += blen; + len -= blen; + } + if (err) { + SetPageError(page); + ubifs_err("cannot write page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + ubifs_ro_mode(c, err); + } + + ubifs_assert(PagePrivate(page)); + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); + + kunmap(page); + unlock_page(page); + end_page_writeback(page); + return err; +} + +/* + * When writing-back dirty inodes, VFS first writes-back pages belonging to the + * inode, then the inode itself. For UBIFS this may cause a problem. Consider a + * situation when a we have an inode with size 0, then a megabyte of data is + * appended to the inode, then write-back starts and flushes some amount of the + * dirty pages, the journal becomes full, commit happens and finishes, and then + * an unclean reboot happens. When the file system is mounted next time, the + * inode size would still be 0, but there would be many pages which are beyond + * the inode size, they would be indexed and consume flash space. Because the + * journal has been committed, the replay would not be able to detect this + * situation and correct the inode size. This means UBIFS would have to scan + * whole index and correct all inode sizes, which is long an unacceptable. + * + * To prevent situations like this, UBIFS writes pages back only if they are + * within the last synchronized inode size, i.e. the size which has been + * written to the flash media last time. Otherwise, UBIFS forces inode + * write-back, thus making sure the on-flash inode contains current inode size, + * and then keeps writing pages back. + * + * Some locking issues explanation. 'ubifs_writepage()' first is called with + * the page locked, and it locks @ui_mutex. However, write-back does take inode + * @i_mutex, which means other VFS operations may be run on this inode at the + * same time. 
And the problematic one is truncation to smaller size, from where + * we have to call 'truncate_setsize()', which first changes @inode->i_size, + * then drops the truncated pages. And while dropping the pages, it takes the + * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' + * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. + * This means that @inode->i_size is changed while @ui_mutex is unlocked. + * + * XXX(truncate): with the new truncate sequence this is not true anymore, + * and the calls to truncate_setsize can be move around freely. They should + * be moved to the very end of the truncate sequence. + * + * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond + * inode size. How do we do this if @inode->i_size may became smaller while we + * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the + * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size + * internally and updates it under @ui_mutex. + * + * Q: why we do not worry that if we race with truncation, we may end up with a + * situation when the inode is truncated while we are in the middle of + * 'do_writepage()', so we do write beyond inode size? + * A: If we are in the middle of 'do_writepage()', truncation would be locked + * on the page lock and it would not write the truncated inode node to the + * journal before we have finished. + */ +static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + loff_t i_size = i_size_read(inode), synced_i_size; + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + int err, len = i_size & (PAGE_CACHE_SIZE - 1); + void *kaddr; + + dbg_gen("ino %lu, pg %lu, pg flags %#lx", + inode->i_ino, page->index, page->flags); + ubifs_assert(PagePrivate(page)); + + /* Is the page fully outside @i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !len)) { + err = 0; + goto out_unlock; + } + + spin_lock(&ui->ui_lock); + synced_i_size = ui->synced_i_size; + spin_unlock(&ui->ui_lock); + + /* Is the page fully inside @i_size? */ + if (page->index < end_index) { + if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out_unlock; + /* + * The inode has been written, but the write-buffer has + * not been synchronized, so in case of an unclean + * reboot we may end up with some pages beyond inode + * size, but they would be in the journal (because + * commit flushes write buffers) and recovery would deal + * with this. + */ + } + return do_writepage(page, PAGE_CACHE_SIZE); + } + + /* + * The page straddles @i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (i_size > synced_i_size) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out_unlock; + } + + return do_writepage(page, len); + +out_unlock: + unlock_page(page); + return err; +} + +/** + * do_attr_changes - change inode attributes. 
+ * @inode: inode to change attributes for + * @attr: describes attributes to change + */ +static void do_attr_changes(struct inode *inode, const struct iattr *attr) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} + +/** + * do_truncation - truncate an inode. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call when the inode is truncated + * to a smaller size. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int do_truncation(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err; + struct ubifs_budget_req req; + loff_t old_size = inode->i_size, new_size = attr->ia_size; + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + memset(&req, 0, sizeof(struct ubifs_budget_req)); + + /* + * If this is truncation to a smaller size, and we do not truncate on a + * block boundary, budget for changing one data block, because the last + * block will be re-written. + */ + if (new_size & (UBIFS_BLOCK_SIZE - 1)) + req.dirtied_page = 1; + + req.dirtied_ino = 1; + /* A funny way to budget for truncation node */ + req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; + err = ubifs_budget_space(c, &req); + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } + + truncate_setsize(inode, new_size); + + if (offset) { + pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + if (PageDirty(page)) { + /* + * 'ubifs_jnl_truncate()' will try to truncate + * the last data node, but it contains + * out-of-date data because the page is dirty. + * Write the page now, so that + * 'ubifs_jnl_truncate()' will see an already + * truncated (and up to date) data node. + */ + ubifs_assert(PagePrivate(page)); + + clear_page_dirty_for_io(page); + if (UBIFS_BLOCKS_PER_PAGE_SHIFT) + offset = new_size & + (PAGE_CACHE_SIZE - 1); + err = do_writepage(page, offset); + page_cache_release(page); + if (err) + goto out_budg; + /* + * We could now tell 'ubifs_jnl_truncate()' not + * to read the last block. + */ + } else { + /* + * We could 'kmap()' the page and pass the data + * to 'ubifs_jnl_truncate()' to save it from + * having to read it. 
+ */ + unlock_page(page); + page_cache_release(page); + } + } + } + + mutex_lock(&ui->ui_mutex); + ui->ui_size = inode->i_size; + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* Other attributes may be changed at the same time as well */ + do_attr_changes(inode, attr); + err = ubifs_jnl_truncate(c, inode, old_size, new_size); + mutex_unlock(&ui->ui_mutex); + +out_budg: + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return err; +} + +/** + * do_setattr - change inode attributes. + * @c: UBIFS file-system description object + * @inode: inode to change attributes for + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call for all cases except + * truncations to smaller size. Returns zero in case of success and a negative + * error code in case of failure. + */ +static int do_setattr(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err, release; + loff_t new_size = attr->ia_size; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + dbg_gen("size %lld -> %lld", inode->i_size, new_size); + truncate_setsize(inode, new_size); + } + + mutex_lock(&ui->ui_mutex); + if (attr->ia_valid & ATTR_SIZE) { + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* 'truncate_setsize()' changed @i_size, update @ui_size */ + ui->ui_size = inode->i_size; + } + + do_attr_changes(inode, attr); + + release = ui->dirty; + if (attr->ia_valid & ATTR_SIZE) + /* + * Inode length changed, so we have to make sure + * @I_DIRTY_DATASYNC is set. 
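+ * Otherwise a subsequent 'fdatasync()' would not write the inode back, because
+ * 'ubifs_fsync()' writes the inode on a datasync call only when
+ * @I_DIRTY_DATASYNC is set, and the updated length might then not reach the
+ * media.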
+ */ + __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + else + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = inode->i_sb->s_op->write_inode(inode, NULL); + return err; +} + +int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); + err = inode_change_ok(inode, attr); + if (err) + return err; + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) + /* Truncation to a smaller size */ + err = do_truncation(c, inode, attr); + else + err = do_setattr(c, inode, attr); + + return err; +} + +static void ubifs_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + ubifs_assert(PagePrivate(page)); + if (offset) + /* Partial page remains dirty */ + return; + + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); +} + +static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + + nd_set_link(nd, ui->data); + return NULL; +} + +int ubifs_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + + dbg_gen("syncing inode %lu", inode->i_ino); + + if (c->ro_mount) + /* + * For some really strange reasons VFS does not filter out + * 'fsync()' for R/O mounted file-systems as per 2.6.39. + */ + return 0; + + /* + * VFS has already synchronized dirty pages for this inode. Synchronize + * the inode unless this is a 'datasync()' call. + */ + if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + return err; + } + + /* + * Nodes related to this inode may still sit in a write-buffer. Flush + * them. + */ + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + + return 0; +} + +/** + * mctime_update_needed - check if mtime or ctime update is needed. + * @inode: the inode to do the check for + * @now: current time + * + * This helper function checks if the inode mtime/ctime should be updated or + * not. If current values of the time-stamps are within the UBIFS inode time + * granularity, they are not updated. This is an optimization. + */ +static inline int mctime_update_needed(const struct inode *inode, + const struct timespec *now) +{ + if (!timespec_equal(&inode->i_mtime, now) || + !timespec_equal(&inode->i_ctime, now)) + return 1; + return 0; +} + +/** + * update_ctime - update mtime and ctime of an inode. + * @c: UBIFS file-system description object + * @inode: inode to update + * + * This function updates mtime and ctime of the inode if it is not equivalent to + * current time. Returns zero in case of success and a negative error code in + * case of failure. 
+ */ +static int update_mctime(struct ubifs_info *c, struct inode *inode) +{ + struct timespec now = ubifs_current_time(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + + if (mctime_update_needed(inode, &now)) { + int err, release; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + } + + return 0; +} + +static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + int err; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = update_mctime(c, inode); + if (err) + return err; + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + +static int ubifs_set_page_dirty(struct page *page) +{ + int ret; + + ret = __set_page_dirty_nobuffers(page); + /* + * An attempt to dirty a page without budgeting for it - should not + * happen. + */ + ubifs_assert(ret == 0); + return ret; +} + +static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + /* + * An attempt to release a dirty page without budgeting for it - should + * not happen. + */ + if (PageWriteback(page)) + return 0; + ubifs_assert(PagePrivate(page)); + ubifs_assert(0); + ClearPagePrivate(page); + ClearPageChecked(page); + return 1; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. + * UBIFS must ensure page is budgeted for. + */ +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct timespec now = ubifs_current_time(inode); + struct ubifs_budget_req req = { .new_page = 1 }; + int err, update_time; + + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + i_size_read(inode)); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (unlikely(c->ro_error)) + return VM_FAULT_SIGBUS; /* -EROFS */ + + /* + * We have not locked @page so far so we may budget for changing the + * page. Note, we cannot do this after we locked the page, because + * budgeting may cause write-back which would cause deadlock. + * + * At the moment we do not know whether the page is dirty or not, so we + * assume that it is not and budget for a new page. We could look at + * the @PG_private flag and figure this out, but we may race with write + * back and the page state may change by the time we lock it, so this + * would need additional care. We do not bother with this at the + * moment, although it might be good idea to do. Instead, we allocate + * budget for a new page and amend it later on if the page was in fact + * dirty. + * + * The budgeting-related logic of this function is similar to what we + * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there + * for more comments. + */ + update_time = mctime_update_needed(inode, &now); + if (update_time) + /* + * We have to change inode time stamp which requires extra + * budgeting. 
+ */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) { + if (err == -ENOSPC) + ubifs_warn("out of space for mmapped file " + "(inode number %lu)", inode->i_ino); + return VM_FAULT_SIGBUS; + } + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode))) { + /* Page got truncated out from underneath us */ + err = -EINVAL; + goto out_unlock; + } + + if (PagePrivate(page)) + release_new_page_budget(c); + else { + if (!PageChecked(page)) + ubifs_convert_page_budget(c); + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (update_time) { + int release; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_dirty_inode_budget(c, ui); + } + + unlock_page(page); + return 0; + +out_unlock: + unlock_page(page); + ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; + return err; +} + +static const struct vm_operations_struct ubifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ubifs_vm_page_mkwrite, +}; + +static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + err = generic_file_mmap(file, vma); + if (err) + return err; + vma->vm_ops = &ubifs_file_vm_ops; + return 0; +} + +const struct address_space_operations ubifs_file_address_operations = { + .readpage = ubifs_readpage, + .writepage = ubifs_writepage, + .write_begin = ubifs_write_begin, + .write_end = ubifs_write_end, + .invalidatepage = ubifs_invalidatepage, + .set_page_dirty = ubifs_set_page_dirty, + .releasepage = ubifs_releasepage, +}; + +const struct inode_operations ubifs_file_inode_operations = { + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +const struct inode_operations ubifs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ubifs_follow_link, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +}; + +const struct file_operations ubifs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ubifs_aio_write, + .mmap = ubifs_file_mmap, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c new file mode 100644 index 00000000..2559d174 --- /dev/null +++ b/fs/ubifs/find.c @@ -0,0 +1,977 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains functions for finding LEBs for various purposes e.g. + * garbage collection. In general, lprops category heaps and lists are used + * for fast access, falling back on scanning the LPT as a last resort. + */ + +#include +#include "ubifs.h" + +/** + * struct scan_data - data provided to scan callback functions + * @min_space: minimum number of bytes for which to scan + * @pick_free: whether it is OK to scan for empty LEBs + * @lnum: LEB number found is returned here + * @exclude_index: whether to exclude index LEBs + */ +struct scan_data { + int min_space; + int pick_free; + int lnum; + int exclude_index; +}; + +/** + * valuable - determine whether LEB properties are valuable. + * @c: the UBIFS file-system description object + * @lprops: LEB properties + * + * This function return %1 if the LEB properties should be added to the LEB + * properties tree in memory. Otherwise %0 is returned. + */ +static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) +{ + int n, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (heap->cnt < heap->max_cnt) + return 1; + if (lprops->free + lprops->dirty >= c->dark_wm) + return 1; + return 0; + case LPROPS_EMPTY: + n = c->lst.empty_lebs + c->freeable_cnt - + c->lst.taken_empty_lebs; + if (n < c->lsave_cnt) + return 1; + return 0; + case LPROPS_FREEABLE: + return 1; + case LPROPS_FRDI_IDX: + return 1; + } + return 0; +} + +/** + * scan_for_dirty_cb - dirty space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_dirty_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < data->min_space) + return ret; + /* If specified, exclude index LEBs */ + if (data->exclude_index && lprops->flags & LPROPS_INDEX) + return ret; + /* If specified, exclude empty or freeable LEBs */ + if (lprops->free + lprops->dirty == c->leb_size) { + if (!data->pick_free) + return ret; + /* Exclude LEBs with too little dirty space (unless it is empty) */ + } else if (lprops->dirty < c->dead_wm) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_dirty - find a data LEB with free space. 
+ * @c: the UBIFS file-system description object + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: if it is OK to return a free or freeable LEB + * @exclude_index: whether to exclude index LEBs + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, + int min_space, int pick_free, + int exclude_index) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + /* There may be an LEB with enough dirty space on the free heap */ + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free + lprops->dirty < min_space) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the dirty heap, and ended + * up as uncategorized even though it has enough dirty space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->free + lprops->dirty < min_space) + continue; + if (exclude_index && (lprops->flags & LPROPS_INDEX)) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + data.exclude_index = exclude_index; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_dirty_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= min_space); + ubifs_assert(lprops->dirty >= c->dead_wm || + (pick_free && + lprops->free + lprops->dirty == c->leb_size)); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. + * @c: the UBIFS file-system description object + * @ret_lp: LEB properties are returned here on exit + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: controls whether it is OK to pick empty or index LEBs + * + * This function tries to find a dirty logical eraseblock which has at least + * @min_space free and dirty space. It prefers to take an LEB from the dirty or + * dirty index heap, and it falls-back to LPT scanning if the heaps are empty + * or do not have an LEB which satisfies the @min_space criteria. + * + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. + * + * The additional @pick_free argument controls if this function has to return a + * free or freeable LEB if one is present. 
For example, GC must to set it to %1, + * when called from the journal space reservation function, because the + * appearance of free space may coincide with the loss of enough dirty space + * for GC to succeed anyway. + * + * In contrast, if the Garbage Collector is called from budgeting, it should + * just make free space, not return LEBs which are already free or freeable. + * + * In addition @pick_free is set to %2 by the recovery process in order to + * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". + */ +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free) +{ + int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0; + const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; + struct ubifs_lpt_heap *heap, *idx_heap; + + ubifs_get_lprops(c); + + if (pick_free) { + int lebs, rsvd_idx_lebs = 0; + + spin_lock(&c->space_lock); + lebs = c->lst.empty_lebs + c->idx_gc_cnt; + lebs += c->freeable_cnt - c->lst.taken_empty_lebs; + + /* + * Note, the index may consume more LEBs than have been reserved + * for it. It is OK because it might be consolidated by GC. + * But if the index takes fewer LEBs than it is reserved for it, + * this function must avoid picking those reserved LEBs. + */ + if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + exclude_index = 1; + } + spin_unlock(&c->space_lock); + + /* Check if there are enough free LEBs for the index */ + if (rsvd_idx_lebs < lebs) { + /* OK, try to find an empty LEB */ + lp = ubifs_fast_find_empty(c); + if (lp) + goto found; + + /* Or a freeable LEB */ + lp = ubifs_fast_find_freeable(c); + if (lp) + goto found; + } else + /* + * We cannot pick free/freeable LEBs in the below code. + */ + pick_free = 0; + } else { + spin_lock(&c->space_lock); + exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); + spin_unlock(&c->space_lock); + } + + /* Look on the dirty and dirty index heaps */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + + if (idx_heap->cnt && !exclude_index) { + idx_lp = idx_heap->arr[0]; + sum = idx_lp->free + idx_lp->dirty; + /* + * Since we reserve thrice as much space for the index than it + * actually takes, it does not make sense to pick indexing LEBs + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. 
+ */ + if (sum < min_space || sum < c->half_leb_size) + idx_lp = NULL; + } + + if (heap->cnt) { + lp = heap->arr[0]; + if (lp->dirty + lp->free < min_space) + lp = NULL; + } + + /* Pick the LEB with most space */ + if (idx_lp && lp) { + if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) + lp = idx_lp; + } else if (idx_lp && !lp) + lp = idx_lp; + + if (lp) { + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); + goto found; + } + + /* Did not find a dirty LEB on the dirty heaps, have to scan */ + dbg_find("scanning LPT for a dirty LEB"); + lp = scan_for_dirty(c, min_space, pick_free, exclude_index); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(lp->dirty >= c->dead_wm || + (pick_free && lp->free + lp->dirty == c->leb_size)); + +found: + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lp->lnum, lp->free, lp->dirty, lp->flags); + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_free_cb - free space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_free_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBs */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free < data->min_space) + return ret; + /* If specified, exclude empty LEBs */ + if (!data->pick_free && lprops->free == c->leb_size) + return ret; + /* + * LEBs that have only free and dirty space must not be allocated + * because they may have been unmapped already or they may have data + * that is obsolete only because of nodes that are still sitting in a + * wbuf. + */ + if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * do_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of free space required + * @pick_free: whether it is OK to scan for empty LEBs + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function returns a pointer to the LEB properties found or a negative + * error code. 
+ */ +static +const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, + int min_space, int pick_free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + if (squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + if (pick_free) { + lprops = ubifs_fast_find_empty(c); + if (lprops) + return lprops; + } + if (!squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + /* There may be an LEB with enough free space on the dirty heap */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free >= min_space) + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the free heap, and ended + * up as uncategorized even though it has enough free space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->flags & LPROPS_INDEX) + continue; + if (lprops->free >= min_space) + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_free_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free >= min_space); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of required free space + * @offs: contains offset of where free space starts on exit + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function looks for an LEB with at least @min_space bytes of free space. + * It tries to find an empty LEB if possible. If no empty LEBs are available, + * this function searches for a non-empty data LEB. The returned LEB is marked + * as "taken". + * + * This function returns found LEB number in case of success, %-ENOSPC if it + * failed to find a LEB with @min_space bytes of free space and other a negative + * error codes in case of failure. 
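/*
 * Editor's note: a standalone sketch of the search order used by
 * 'do_find_free_space()' above, pulled out so the precedence between
 * @squeeze and @pick_free is easy to see.  The toy_fast_find_*() stubs stand
 * in for the in-memory fast-find helpers; all names here are invented for
 * the example and are not part of the patch.
 */
#include <stddef.h>

struct toy_free_lprops { int lnum; int free; };

/* Stubs standing in for ubifs_fast_find_free()/ubifs_fast_find_empty() */
static const struct toy_free_lprops *toy_fast_find_free(void)  { return NULL; }
static const struct toy_free_lprops *toy_fast_find_empty(void) { return NULL; }

static const struct toy_free_lprops *
toy_find_free(int min_space, int pick_free, int squeeze)
{
	const struct toy_free_lprops *lp;

	if (squeeze) {		/* try to fill a partly used LEB first */
		lp = toy_fast_find_free();
		if (lp && lp->free >= min_space)
			return lp;
	}
	if (pick_free) {	/* an empty LEB is acceptable */
		lp = toy_fast_find_empty();
		if (lp)
			return lp;
	}
	if (!squeeze) {
		lp = toy_fast_find_free();
		if (lp && lp->free >= min_space)
			return lp;
	}
	/*
	 * The real code then checks the dirty heap, the uncategorized list,
	 * and finally falls back to scanning the LPT on flash.
	 */
	return NULL;
}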
+ */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze) +{ + const struct ubifs_lprops *lprops; + int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags; + + dbg_find("min_space %d", min_space); + ubifs_get_lprops(c); + + /* Check if there are enough empty LEBs for commit */ + spin_lock(&c->space_lock); + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (rsvd_idx_lebs < lebs) + /* + * OK to allocate an empty LEB, but we still don't want to go + * looking for one if there aren't any. + */ + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + pick_free = 1; + /* + * Because we release the space lock, we must account + * for this allocation here. After the LEB properties + * flags have been updated, we subtract one. Note, the + * result of this is that lprops also decreases + * @taken_empty_lebs in 'ubifs_change_lp()', so it is + * off by one for a short period of time which may + * introduce a small disturbance to budgeting + * calculations, but this is harmless because at the + * worst case this would make the budgeting subsystem + * be more pessimistic than needed. + * + * Fundamentally, this is about serialization of the + * budgeting and lprops subsystems. We could make the + * @space_lock a mutex and avoid dropping it before + * calling 'ubifs_change_lp()', but mutex is more + * heavy-weight, and we want budgeting to be as fast as + * possible. + */ + c->lst.taken_empty_lebs += 1; + } + spin_unlock(&c->space_lock); + + lprops = do_find_free_space(c, min_space, pick_free, squeeze); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + lnum = lprops->lnum; + flags = lprops->flags | LPROPS_TAKEN; + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + + *offs = c->leb_size - lprops->free; + ubifs_release_lprops(c); + + if (*offs == 0) { + /* + * Ensure that empty LEBs have been unmapped. They may not have + * been, for example, because of an unclean unmount. Also + * LEBs that were freeable LEBs (free + dirty == leb_size) will + * not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); + return lnum; + +out: + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_idx_cb - callback used by the scan for a free LEB for the index. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). 
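/*
 * Editor's note: a minimal sketch of how an LPT scan callback composes its
 * return value from the LPT_SCAN_* codes, following the pattern of
 * 'scan_for_free_cb()' above (CONTINUE, optionally OR'ed with ADD, or
 * ADD | STOP on a match).  The flag values and the toy structure are
 * placeholders chosen for the example; only the pattern mirrors the patch.
 */
enum {
	TOY_SCAN_CONTINUE = 0x1,	/* keep scanning */
	TOY_SCAN_ADD	  = 0x2,	/* cache these properties in memory */
	TOY_SCAN_STOP	  = 0x4,	/* stop - a match was found */
};

struct toy_cb_lprops { int lnum; int free; int taken; };

static int toy_scan_cb(const struct toy_cb_lprops *lp, int in_tree,
		       int min_space, int *found_lnum)
{
	int ret = TOY_SCAN_CONTINUE;

	if (lp->taken)
		return TOY_SCAN_CONTINUE;	/* LEB is in use, skip it */
	if (!in_tree)
		ret |= TOY_SCAN_ADD;		/* worth keeping in memory */
	if (lp->free < min_space)
		return ret;			/* not a match, keep going */
	*found_lnum = lp->lnum;
	return TOY_SCAN_ADD | TOY_SCAN_STOP;
}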
+ */ +static int scan_for_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBS */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs that cannot be made empty */ + if (lprops->free + lprops->dirty != c->leb_size) + return ret; + /* + * We are allocating for the index so it is safe to allocate LEBs with + * only free and dirty space, because write buffers are sync'd at commit + * start. + */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_leb_for_idx - scan for a free LEB for the index. + * @c: the UBIFS file-system description object + */ +static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct scan_data data; + int err; + + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_idx_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_leb_for_idx - find a free LEB for the index. + * @c: the UBIFS file-system description object + * + * This function looks for a free LEB and returns that LEB number. The returned + * LEB is marked as "taken", "index". + * + * Only empty LEBs are allocated. This is for two reasons. First, the commit + * calculates the number of LEBs to allocate based on the assumption that they + * will be empty. Secondly, free space at the end of an index LEB is not + * guaranteed to be empty because it may have been used by the in-the-gaps + * method prior to an unclean unmount. + * + * If no LEB is found %-ENOSPC is returned. For other failures another negative + * error code is returned. + */ +int ubifs_find_free_leb_for_idx(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + int lnum = -1, err, flags; + + ubifs_get_lprops(c); + + lprops = ubifs_fast_find_empty(c); + if (!lprops) { + lprops = ubifs_fast_find_freeable(c); + if (!lprops) { + ubifs_assert(c->freeable_cnt == 0); + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + lprops = scan_for_leb_for_idx(c); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } + } + } + + if (!lprops) { + err = -ENOSPC; + goto out; + } + + lnum = lprops->lnum; + + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lnum, lprops->free, lprops->dirty, lprops->flags); + + flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX; + lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + ubifs_release_lprops(c); + + /* + * Ensure that empty LEBs have been unmapped. They may not have been, + * for example, because of an unclean unmount. Also LEBs that were + * freeable LEBs (free + dirty == leb_size) will not have been unmapped. 
+ */ + err = ubifs_leb_unmap(c, lnum); + if (err) { + ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN | LPROPS_INDEX, 0); + return err; + } + + return lnum; + +out: + ubifs_release_lprops(c); + return err; +} + +static int cmp_dirty_idx(const struct ubifs_lprops **a, + const struct ubifs_lprops **b) +{ + const struct ubifs_lprops *lpa = *a; + const struct ubifs_lprops *lpb = *b; + + return lpa->dirty + lpa->free - lpb->dirty - lpb->free; +} + +static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b, + int size) +{ + struct ubifs_lprops *t = *a; + + *a = *b; + *b = t; +} + +/** + * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos. + * @c: the UBIFS file-system description object + * + * This function is called each commit to create an array of LEB numbers of + * dirty index LEBs sorted in order of dirty and free space. This is used by + * the in-the-gaps method of TNC commit. + */ +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) +{ + int i; + + ubifs_get_lprops(c); + /* Copy the LPROPS_DIRTY_IDX heap */ + c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt; + memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr, + sizeof(void *) * c->dirty_idx.cnt); + /* Sort it so that the dirtiest is now at the end */ + sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), + (int (*)(const void *, const void *))cmp_dirty_idx, + (void (*)(void *, void *, int))swap_dirty_idx); + dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); + if (c->dirty_idx.cnt) + dbg_find("dirtiest index LEB is %d with dirty %d and free %d", + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free); + /* Replace the lprops pointers with LEB numbers */ + for (i = 0; i < c->dirty_idx.cnt; i++) + c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum; + ubifs_release_lprops(c); + return 0; +} + +/** + * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_dirty_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude non-index LEBs */ + if (!(lprops->flags & LPROPS_INDEX)) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < c->min_idx_node_sz) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * find_dirty_idx_leb - find a dirty index LEB. + * @c: the UBIFS file-system description object + * + * This function returns LEB number upon success and a negative error code upon + * failure. In particular, -ENOSPC is returned if a dirty index LEB is not + * found. 
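/*
 * Editor's note: an illustrative sketch of how the "dirtiest index LEB"
 * array built by 'ubifs_save_dirty_idx_lnums()' above is meant to be
 * consumed by 'find_dirtiest_idx_leb()' further below - the array is sorted
 * so the dirtiest entry is last, and candidates are popped from the end,
 * skipping any that are no longer usable.  Types and the toy_is_usable()
 * check are placeholders for the example.
 */
struct toy_dirty_idx {
	int cnt;		/* number of saved LEB numbers */
	int lnums[64];		/* LEB numbers, dirtiest last */
};

static int toy_is_usable(int lnum)
{
	(void)lnum;		/* real code checks LPROPS_TAKEN/LPROPS_INDEX */
	return 1;
}

/* Pop the dirtiest remaining candidate, or -1 if the array is exhausted */
static int toy_pop_dirtiest(struct toy_dirty_idx *d)
{
	while (d->cnt > 0) {
		int lnum = d->lnums[--d->cnt];

		if (toy_is_usable(lnum))
			return lnum;
	}
	return -1;		/* the real code returns -ENOSPC here */
}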
+ * + * Note that this function scans the entire LPT but it is called very rarely. + */ +static int find_dirty_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i, ret; + + /* Check all structures in memory first */ + data.lnum = -1; + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->uncat_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return -ENOSPC; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + &data); + if (err) + return err; +found: + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + + dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x", + lprops->lnum, lprops->free, lprops->dirty, lprops->flags); + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, + lprops->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + + return lprops->lnum; +} + +/** + * get_idx_gc_leb - try to get a LEB number from trivial GC. + * @c: the UBIFS file-system description object + */ +static int get_idx_gc_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, lnum; + + err = ubifs_get_idx_gc_leb(c); + if (err < 0) + return err; + lnum = err; + /* + * The LEB was due to be unmapped after the commit but + * it is needed now for this commit. + */ + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_INDEX, -1); + if (IS_ERR(lp)) + return PTR_ERR(lp); + dbg_find("LEB %d, dirty %d and free %d flags %#x", + lp->lnum, lp->dirty, lp->free, lp->flags); + return lnum; +} + +/** + * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array. + * @c: the UBIFS file-system description object + */ +static int find_dirtiest_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int lnum; + + while (1) { + if (!c->dirty_idx.cnt) + return -ENOSPC; + /* The lprops pointers were replaced by LEB numbers */ + lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt]; + lp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX)) + continue; + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) + return PTR_ERR(lp); + break; + } + dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, + lp->free, lp->flags); + ubifs_assert(lp->flags | LPROPS_TAKEN); + ubifs_assert(lp->flags | LPROPS_INDEX); + return lnum; +} + +/** + * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit. 
+ * @c: the UBIFS file-system description object + * + * This function attempts to find an untaken index LEB with the most free and + * dirty space that can be used without overwriting index nodes that were in the + * last index committed. + */ +int ubifs_find_dirty_idx_leb(struct ubifs_info *c) +{ + int err; + + ubifs_get_lprops(c); + + /* + * We made an array of the dirtiest index LEB numbers as at the start of + * last commit. Try that array first. + */ + err = find_dirtiest_idx_leb(c); + + /* Next try scanning the entire LPT */ + if (err == -ENOSPC) + err = find_dirty_idx_leb(c); + + /* Finally take any index LEBs awaiting trivial GC */ + if (err == -ENOSPC) + err = get_idx_gc_leb(c); + + ubifs_release_lprops(c); + return err; +} diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c new file mode 100644 index 00000000..ded29f62 --- /dev/null +++ b/fs/ubifs/gc.c @@ -0,0 +1,985 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements garbage collection. The procedure for garbage collection + * is different depending on whether a LEB as an index LEB (contains index + * nodes) or not. For non-index LEBs, garbage collection finds a LEB which + * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete + * nodes to the journal, at which point the garbage-collected LEB is free to be + * reused. For index LEBs, garbage collection marks the non-obsolete index nodes + * dirty in the TNC, and after the next commit, the garbage-collected LEB is + * to be reused. Garbage collection will cause the number of dirty index nodes + * to grow, however sufficient space is reserved for the index to ensure the + * commit will never run out of space. + * + * Notes about dead watermark. At current UBIFS implementation we assume that + * LEBs which have less than @c->dead_wm bytes of free + dirty space are full + * and not worth garbage-collecting. The dead watermark is one min. I/O unit + * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS + * Garbage Collector has to synchronize the GC head's write buffer before + * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can + * actually reclaim even very small pieces of dirty space by garbage collecting + * enough dirty LEBs, but we do not bother doing this at this implementation. + * + * Notes about dark watermark. The results of GC work depends on how big are + * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed, + * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would + * have to waste large pieces of free space at the end of LEB B, because nodes + * from LEB A would not fit. And the worst situation is when all nodes are of + * maximum size. 
So the dark watermark is the amount of free + dirty space in an LEB
+ * which is guaranteed to be reclaimable. If an LEB has less space, the GC
+ * might be unable to reclaim it. So, LEBs with free + dirty greater than the
+ * dark watermark are "good" LEBs from GC's point of view. The other LEBs are
+ * not so good, and GC takes extra care when moving them.
+ */
+
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/list_sort.h>
+#include "ubifs.h"
+
+/*
+ * GC may need to move more than one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ *
+ * This function switches the GC head to the next LEB which is reserved in
+ * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
+ * and other negative error codes in case of failure.
+ */
+static int switch_gc_head(struct ubifs_info *c)
+{
+	int err, gc_lnum = c->gc_lnum;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	ubifs_assert(gc_lnum != -1);
+	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
+	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
+	       c->leb_size - wbuf->offs - wbuf->used);
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	/*
+	 * The GC write-buffer was synchronized, we may safely unmap
+	 * 'c->gc_lnum'.
+	 */
+	err = ubifs_leb_unmap(c, gc_lnum);
+	if (err)
+		return err;
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
+	if (err)
+		return err;
+
+	c->gc_lnum = -1;
+	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+	return err;
+}
+
+/**
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @b: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	if (a == b)
+		return 0;
+
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+	ubifs_assert(sa->type == UBIFS_DATA_NODE);
+	ubifs_assert(sb->type == UBIFS_DATA_NODE);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @b: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and are sorted in descending length order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash value order.
+ */ +static int nondata_nodes_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + if (a == b) + return 0; + + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + + ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY && + key_type(c, &sb->key) != UBIFS_DATA_KEY); + ubifs_assert(sa->type != UBIFS_DATA_NODE && + sb->type != UBIFS_DATA_NODE); + + /* Inodes go before directory entries */ + if (sa->type == UBIFS_INO_NODE) { + if (sb->type == UBIFS_INO_NODE) + return sb->len - sa->len; + return -1; + } + if (sb->type == UBIFS_INO_NODE) + return 1; + + ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY || + key_type(c, &sa->key) == UBIFS_XENT_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY || + key_type(c, &sb->key) == UBIFS_XENT_KEY); + ubifs_assert(sa->type == UBIFS_DENT_NODE || + sa->type == UBIFS_XENT_NODE); + ubifs_assert(sb->type == UBIFS_DENT_NODE || + sb->type == UBIFS_XENT_NODE); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. + * @c: UBIFS file-system description object + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here + * + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; + * + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
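/*
 * Editor's note: a small userspace model of the GC node ordering established
 * by the comparators above and by 'sort_nodes()' below - data nodes sorted by
 * (inode, block), non-data nodes with inode nodes first (longest first) and
 * directory entries after them in (parent inode, name hash) order.  qsort()
 * is used instead of the kernel's list_sort(); the struct and field names are
 * invented for the example.
 */
#include <stdlib.h>
#include <stdint.h>

struct toy_node {
	int is_inode;		/* 1: inode node, 0: directory entry node */
	uint64_t inum;		/* inode number (parent inode for dents) */
	uint32_t block;		/* data block number (data nodes only) */
	uint32_t hash;		/* name hash (dent nodes only) */
	int len;		/* node length */
};

static int toy_data_cmp(const void *pa, const void *pb)
{
	const struct toy_node *a = pa, *b = pb;

	if (a->inum != b->inum)
		return a->inum < b->inum ? -1 : 1;
	if (a->block != b->block)
		return a->block < b->block ? -1 : 1;
	return 0;
}

static int toy_nondata_cmp(const void *pa, const void *pb)
{
	const struct toy_node *a = pa, *b = pb;

	/* Inode nodes go first, sorted by descending length */
	if (a->is_inode && b->is_inode)
		return b->len - a->len;
	if (a->is_inode != b->is_inode)
		return a->is_inode ? -1 : 1;
	/* Then directory entries, by parent inode and ascending name hash */
	if (a->inum != b->inum)
		return a->inum < b->inum ? -1 : 1;
	if (a->hash != b->hash)
		return a->hash < b->hash ? -1 : 1;
	return 0;
}

/* Usage: qsort(nodes, n, sizeof(struct toy_node), toy_data_cmp); */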
+ */ +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) +{ + int err; + struct ubifs_scan_node *snod, *tmp; + + *min = INT_MAX; + + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + ubifs_assert(snod->type == UBIFS_INO_NODE || + snod->type == UBIFS_DATA_NODE || + snod->type == UBIFS_DENT_NODE || + snod->type == UBIFS_XENT_NODE || + snod->type == UBIFS_TRUN_NODE); + + if (snod->type != UBIFS_INO_NODE && + snod->type != UBIFS_DATA_NODE && + snod->type != UBIFS_DENT_NODE && + snod->type != UBIFS_XENT_NODE) { + /* Probably truncation node, zap it */ + list_del(&snod->list); + kfree(snod); + continue; + } + + ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY || + key_type(c, &snod->key) == UBIFS_INO_KEY || + key_type(c, &snod->key) == UBIFS_DENT_KEY || + key_type(c, &snod->key) == UBIFS_XENT_KEY); + + err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, + snod->offs, 0); + if (err < 0) + return err; + + if (!err) { + /* The node is obsolete, remove it from the list */ + list_del(&snod->list); + kfree(snod); + continue; + } + + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); + } + + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + + err = dbg_check_data_nodes_order(c, &sleb->nodes); + if (err) + return err; + err = dbg_check_nondata_nodes_order(c, nondata); + if (err) + return err; + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + if (wbuf->lnum == -1) { + /* + * The GC journal head is not set, because it is the first GC + * invocation since mount. + */ + err = switch_gc_head(c); + if (err) + return err; + } + + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + + /* Write nodes to their new location. 
Use the first-fit strategy */ + while (1) { + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (avail < min) + break; + + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; + continue; + } + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + if (list_empty(&sleb->nodes) && list_empty(&nondata)) + break; + + /* + * Waste the rest of the space in the LEB and switch to the + * next LEB. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + return 0; + +out: + list_splice_tail(&nondata, &sleb->nodes); + return err; +} + +/** + * gc_sync_wbufs - sync write-buffers for GC. + * @c: UBIFS file-system description object + * + * We must guarantee that obsoleting nodes are on flash. Unfortunately they may + * be in a write-buffer instead. That is, a node could be written to a + * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is + * erased before the write-buffer is sync'd and then there is an unclean + * unmount, then an existing node is lost. To avoid this, we sync all + * write-buffers. + * + * This function returns %0 on success or a negative error code on failure. + */ +static int gc_sync_wbufs(struct ubifs_info *c) +{ + int err, i; + + for (i = 0; i < c->jhead_cnt; i++) { + if (i == GCHD) + continue; + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + return 0; +} + +/** + * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock. + * @c: UBIFS file-system description object + * @lp: describes the LEB to garbage collect + * + * This function garbage-collects an LEB and returns one of the @LEB_FREED, + * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of failures. + */ +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int err = 0, lnum = lp->lnum; + + ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 || + c->need_recovery); + ubifs_assert(c->gc_lnum != lnum); + ubifs_assert(wbuf->lnum != lnum); + + if (lp->free + lp->dirty == c->leb_size) { + /* Special case - a free LEB */ + dbg_gc("LEB %d is free, return it", lp->lnum); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + + if (lp->free != c->leb_size) { + /* + * Write buffers must be sync'd before unmapping + * freeable LEBs, because one of them may contain data + * which obsoletes something in 'lp->pnum'. 
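/*
 * Editor's note: a simplified model of one pass of the node-moving policy in
 * 'move_nodes()' above.  Data nodes are moved strictly in order and the pass
 * stops at the first one that does not fit (to keep the bulk-read friendly
 * ordering); non-data nodes may be skipped individually when they do not fit,
 * as long as the smallest remaining node (@min) could still fit.  The real
 * code applies a finer skip-or-stop rule based on the node type; types and
 * the move() callback here are placeholders for the example.
 */
struct toy_gc_node { int len; struct toy_gc_node *next; };

static int toy_move_pass(struct toy_gc_node *data, struct toy_gc_node *nondata,
			 int avail, int min, int (*move)(struct toy_gc_node *))
{
	struct toy_gc_node *n;

	/* Data nodes: move in order, stop at the first that does not fit */
	for (n = data; n; n = n->next) {
		if (n->len > avail)
			break;
		if (move(n))
			return -1;
		avail -= n->len;
	}
	/* Non-data nodes: individual nodes may be skipped if they do not fit */
	for (n = nondata; n; n = n->next) {
		if (avail < min)
			break;		/* nothing left can fit */
		if (n->len > avail)
			continue;	/* try a smaller node */
		if (move(n))
			return -1;
		avail -= n->len;
	}
	return avail;	/* the real code then switches the GC head and repeats */
}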
+ */ + err = gc_sync_wbufs(c); + if (err) + return err; + err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + return err; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + return LEB_RETAINED; + } + + return LEB_FREED; + } + + /* + * We scan the entire LEB even though we only really need to scan up to + * (c->leb_size - lp->free). + */ + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + ubifs_assert(!list_empty(&sleb->nodes)); + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_gced_idx_leb *idx_gc; + + dbg_gc("indexing LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx = snod->node; + int level = le16_to_cpu(idx->level); + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + key_read(c, ubifs_idx_key(c, idx), &snod->key); + err = ubifs_dirty_idx_node(c, &snod->key, level, lnum, + snod->offs); + if (err) + goto out; + } + + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + + idx_gc->lnum = lnum; + idx_gc->unmap = 0; + list_add(&idx_gc->list, &c->idx_gc); + + /* + * Don't release the LEB until after the next commit, because + * it may contain data which is needed for recovery. So + * although we freed this LEB, it will become usable only after + * the commit. + */ + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, + LPROPS_INDEX, 1); + if (err) + goto out; + err = LEB_FREED_IDX; + } else { + dbg_gc("data LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + + err = move_nodes(c, sleb); + if (err) + goto out_inc_seq; + + err = gc_sync_wbufs(c); + if (err) + goto out_inc_seq; + + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); + if (err) + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + err = LEB_RETAINED; + } else { + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out; + + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + + err = LEB_FREED; + } + } + +out: + ubifs_scan_destroy(sleb); + return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; +} + +/** + * ubifs_garbage_collect - UBIFS garbage collector. + * @c: UBIFS file-system description object + * @anyway: do GC even if there are free LEBs + * + * This function does out-of-place garbage collection. The return codes are: + * o positive LEB number if the LEB has been freed and may be used; + * o %-EAGAIN if the caller has to run commit; + * o %-ENOSPC if GC failed to make any progress; + * o other negative error codes in case of other errors. + * + * Garbage collector writes data to the journal when GC'ing data LEBs, and just + * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point + * commit may be required. But commit cannot be run from inside GC, because the + * caller might be holding the commit lock, so %-EAGAIN is returned instead; + * And this error code means that the caller has to run commit, and re-run GC + * if there is still no free space. 
+ * + * There are many reasons why this function may return %-EAGAIN: + * o the log is full and there is no space to write an LEB reference for + * @c->gc_lnum; + * o the journal is too large and exceeds size limitations; + * o GC moved indexing LEBs, but they can be used only after the commit; + * o the shrinker fails to find clean znodes to free and requests the commit; + * o etc. + * + * Note, if the file-system is close to be full, this function may return + * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of + * the function. E.g., this happens if the limits on the journal size are too + * tough and GC writes too much to the journal before an LEB is freed. This + * might also mean that the journal is too large, and the TNC becomes to big, + * so that the shrinker is constantly called, finds not clean znodes to free, + * and requests commit. Well, this may also happen if the journal is all right, + * but another kernel process consumes too much memory. Anyway, infinite + * %-EAGAIN may happen, but in some extreme/misconfiguration cases. + */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway) +{ + int i, err, ret, min_space = c->dead_wm; + struct ubifs_lprops lp; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert_cmt_locked(c); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (ubifs_gc_should_commit(c)) + return -EAGAIN; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_error) { + ret = -EROFS; + goto out_unlock; + } + + /* We expect the write-buffer to be empty on entry */ + ubifs_assert(!wbuf->used); + + for (i = 0; ; i++) { + int space_before = c->leb_size - wbuf->offs - wbuf->used; + int space_after; + + cond_resched(); + + /* Give the commit an opportunity to run */ + if (ubifs_gc_should_commit(c)) { + ret = -EAGAIN; + break; + } + + if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { + /* + * We've done enough iterations. Indexing LEBs were + * moved and will be available after the commit. + */ + dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + break; + } + + if (i > HARD_LEBS_LIMIT) { + /* + * We've moved too many LEBs and have not made + * progress, give up. + */ + dbg_gc("hard limit, -ENOSPC"); + ret = -ENOSPC; + break; + } + + /* + * Empty and freeable LEBs can turn up while we waited for + * the wbuf lock, or while we have been running GC. In that + * case, we should just return one of those instead of + * continuing to GC dirty LEBs. Hence we request + * 'ubifs_find_dirty_leb()' to return an empty LEB if it can. + */ + ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); + if (ret) { + if (ret == -ENOSPC) + dbg_gc("no more dirty LEBs"); + break; + } + + dbg_gc("found LEB %d: free %d, dirty %d, sum %d " + "(min. space %d)", lp.lnum, lp.free, lp.dirty, + lp.free + lp.dirty, min_space); + + space_before = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum == -1) + space_before = 0; + + ret = ubifs_garbage_collect_leb(c, &lp); + if (ret < 0) { + if (ret == -EAGAIN) { + /* + * This is not error, so we have to return the + * LEB to lprops. But if 'ubifs_return_leb()' + * fails, its failure code is propagated to the + * caller instead of the original '-EAGAIN'. 
+ */ + err = ubifs_return_leb(c, lp.lnum); + if (err) + ret = err; + break; + } + goto out; + } + + if (ret == LEB_FREED) { + /* An LEB has been freed and is ready for use */ + dbg_gc("LEB %d freed, return", lp.lnum); + ret = lp.lnum; + break; + } + + if (ret == LEB_FREED_IDX) { + /* + * This was an indexing LEB and it cannot be + * immediately used. And instead of requesting the + * commit straight away, we try to garbage collect some + * more. + */ + dbg_gc("indexing LEB %d freed, continue", lp.lnum); + continue; + } + + ubifs_assert(ret == LEB_RETAINED); + space_after = c->leb_size - wbuf->offs - wbuf->used; + dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, + space_after - space_before); + + if (space_after > space_before) { + /* GC makes progress, keep working */ + min_space >>= 1; + if (min_space < c->dead_wm) + min_space = c->dead_wm; + continue; + } + + dbg_gc("did not make progress"); + + /* + * GC moved an LEB bud have not done any progress. This means + * that the previous GC head LEB contained too few free space + * and the LEB which was GC'ed contained only large nodes which + * did not fit that space. + * + * We can do 2 things: + * 1. pick another LEB in a hope it'll contain a small node + * which will fit the space we have at the end of current GC + * head LEB, but there is no guarantee, so we try this out + * unless we have already been working for too long; + * 2. request an LEB with more dirty space, which will force + * 'ubifs_find_dirty_leb()' to start scanning the lprops + * table, instead of just picking one from the heap + * (previously it already picked the dirtiest LEB). + */ + if (i < SOFT_LEBS_LIMIT) { + dbg_gc("try again"); + continue; + } + + min_space <<= 1; + if (min_space > c->dark_wm) + min_space = c->dark_wm; + dbg_gc("set min. space to %d", min_space); + } + + if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { + dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + if (!err) + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) { + ret = err; + goto out; + } +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return ret; + +out: + ubifs_assert(ret < 0); + ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); + ubifs_wbuf_sync_nolock(wbuf); + ubifs_ro_mode(c, ret); + mutex_unlock(&wbuf->io_mutex); + ubifs_return_leb(c, lp.lnum); + return ret; +} + +/** + * ubifs_gc_start_commit - garbage collection at start of commit. + * @c: UBIFS file-system description object + * + * If a LEB has only dirty and free space, then we may safely unmap it and make + * it free. Note, we cannot do this with indexing LEBs because dirty space may + * correspond index nodes that are required for recovery. In that case, the + * LEB cannot be unmapped until after the next commit. + * + * This function returns %0 upon success and a negative error code upon failure. + */ +int ubifs_gc_start_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + const struct ubifs_lprops *lp; + int err = 0, flags; + + ubifs_get_lprops(c); + + /* + * Unmap (non-index) freeable LEBs. Note that recovery requires that all + * wbufs are sync'd before this, which is done in 'do_commit()'. 
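/*
 * Editor's note: a standalone sketch of the retry policy of
 * 'ubifs_garbage_collect()' above - when a GC pass frees space, the required
 * dirty-space threshold is halved (but kept at or above the dead watermark);
 * when it makes no progress, the threshold is doubled (but capped at the dark
 * watermark), which eventually forces the finder to look for dirtier LEBs.
 * The function and variable names are invented, and the numbers in main() are
 * arbitrary; they are not taken from the patch.
 */
#include <stdio.h>

static int adjust_min_space(int min_space, int made_progress,
			    int dead_wm, int dark_wm)
{
	if (made_progress) {
		min_space >>= 1;
		if (min_space < dead_wm)
			min_space = dead_wm;
	} else {
		min_space <<= 1;
		if (min_space > dark_wm)
			min_space = dark_wm;
	}
	return min_space;
}

int main(void)
{
	int min_space = 2048;	/* e.g. start at the dead watermark */

	min_space = adjust_min_space(min_space, 0, 2048, 8192);
	printf("after a pass with no progress: min_space = %d\n", min_space);
	return 0;
}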
+ */ + while (1) { + lp = ubifs_fast_find_freeable(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + goto out; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + } + + /* Mark GC'd index LEBs OK to unmap after this commit finishes */ + list_for_each_entry(idx_gc, &c->idx_gc, list) + idx_gc->unmap = 1; + + /* Record index freeable LEBs for unmapping after commit */ + while (1) { + lp = ubifs_fast_find_frdi_idx(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(lp->flags & LPROPS_INDEX); + /* Don't release the LEB until after the next commit */ + flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + kfree(idx_gc); + goto out; + } + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + idx_gc->lnum = lp->lnum; + idx_gc->unmap = 1; + list_add(&idx_gc->list, &c->idx_gc); + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_gc_end_commit - garbage collection at end of commit. + * @c: UBIFS file-system description object + * + * This function completes out-of-place garbage collection of index LEBs. + */ +int ubifs_gc_end_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc, *tmp; + struct ubifs_wbuf *wbuf; + int err = 0; + + wbuf = &c->jheads[GCHD].wbuf; + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list) + if (idx_gc->unmap) { + dbg_gc("LEB %d", idx_gc->lnum); + err = ubifs_leb_unmap(c, idx_gc->lnum); + if (err) + goto out; + err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC, + LPROPS_NC, 0, LPROPS_TAKEN, -1); + if (err) + goto out; + list_del(&idx_gc->list); + kfree(idx_gc); + } +out: + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_destroy_idx_gc - destroy idx_gc list. + * @c: UBIFS file-system description object + * + * This function destroys the @c->idx_gc list. It is called when unmounting + * so locks are not needed. Returns zero in case of success and a negative + * error code in case of failure. + */ +void ubifs_destroy_idx_gc(struct ubifs_info *c) +{ + while (!list_empty(&c->idx_gc)) { + struct ubifs_gced_idx_leb *idx_gc; + + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, + list); + c->idx_gc_cnt -= 1; + list_del(&idx_gc->list); + kfree(idx_gc); + } +} + +/** + * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list. + * @c: UBIFS file-system description object + * + * Called during start commit so locks are not needed. 
+ */ +int ubifs_get_idx_gc_leb(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + int lnum; + + if (list_empty(&c->idx_gc)) + return -ENOSPC; + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list); + lnum = idx_gc->lnum; + /* c->idx_gc_cnt is updated by the caller when lprops are updated */ + list_del(&idx_gc->list); + kfree(idx_gc); + return lnum; +} diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c new file mode 100644 index 00000000..3be645e0 --- /dev/null +++ b/fs/ubifs/io.c @@ -0,0 +1,1054 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements UBIFS I/O subsystem which provides various I/O-related + * helper functions (reading/writing/checking/validating nodes) and implements + * write-buffering support. Write buffers help to save space which otherwise + * would have been wasted for padding to the nearest minimal I/O unit boundary. + * Instead, data first goes to the write-buffer and is flushed when the + * buffer is full or when it is not used for some time (by timer). This is + * similar to the mechanism is used by JFFS2. + * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * + * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by + * mutexes defined inside these objects. Since sometimes upper-level code + * has to lock the write-buffer (e.g. 
journal space reservation code), many
+ * functions related to write-buffers have a "nolock" suffix, which means that
+ * the caller has to lock the write-buffer before calling such a function.
+ *
+ * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
+ * aligned, UBIFS starts the next node from the aligned address, and the padded
+ * bytes may contain any rubbish. In other words, UBIFS does not put padding
+ * bytes in those small gaps. Common headers of nodes store real node lengths,
+ * not aligned lengths. Indexing nodes also store real lengths in branches.
+ *
+ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
+ * uses padding nodes or padding bytes, if the padding node does not fit.
+ *
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
+ * they are read from the flash media.
+ */
+
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_ro_mode - switch UBIFS to read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ */
+void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+	if (!c->ro_error) {
+		c->ro_error = 1;
+		c->no_chk_data_crc = 0;
+		c->vfs_sb->s_flags |= MS_RDONLY;
+		ubifs_warn("switched to read-only mode, error %d", err);
+		dbg_dump_stack();
+	}
+}
+
+/**
+ * ubifs_check_node - check node.
+ * @c: UBIFS file-system description object
+ * @buf: node to check
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ * @must_chk_crc: indicates whether to always check the CRC
+ *
+ * This function checks node magic number and CRC checksum. This function also
+ * validates node length to prevent UBIFS from becoming confused when an
+ * attacker feeds it a file-system image with incorrect nodes. For example, a
+ * too large node length in the common header could cause UBIFS to read memory
+ * outside of the allocated buffer when checking the CRC checksum.
+ *
+ * This function may skip data node CRC checking if @c->no_chk_data_crc is
+ * true, which is controlled by the corresponding UBIFS mount option. However,
+ * if @must_chk_crc is true, then @c->no_chk_data_crc is ignored and the CRC is
+ * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
+ * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and the
+ * CRC is checked. This is because during mounting or re-mounting from R/O mode
+ * to R/W mode we may read journal nodes (when replaying the journal or doing
+ * recovery) and the journal nodes may potentially be corrupted, so checking is
+ * required.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
+ */ +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc) +{ + int err = -EINVAL, type, node_len; + uint32_t crc, node_crc, magic; + const struct ubifs_ch *ch = buf; + + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + + magic = le32_to_cpu(ch->magic); + if (magic != UBIFS_NODE_MAGIC) { + if (!quiet) + ubifs_err("bad magic %#08x, expected %#08x", + magic, UBIFS_NODE_MAGIC); + err = -EUCLEAN; + goto out; + } + + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + if (!quiet) + ubifs_err("bad node type %d", type); + goto out; + } + + node_len = le32_to_cpu(ch->len); + if (node_len + offs > c->leb_size) + goto out_len; + + if (c->ranges[type].max_len == 0) { + if (node_len != c->ranges[type].len) + goto out_len; + } else if (node_len < c->ranges[type].min_len || + node_len > c->ranges[type].max_len) + goto out_len; + + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) + return 0; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) { + if (!quiet) + ubifs_err("bad CRC: calculated %#08x, read %#08x", + crc, node_crc); + err = -EUCLEAN; + goto out; + } + + return 0; + +out_len: + if (!quiet) + ubifs_err("bad node length %d", node_len); +out: + if (!quiet) { + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + return err; +} + +/** + * ubifs_pad - pad flash space. + * @c: UBIFS file-system description object + * @buf: buffer to put padding to + * @pad: how many bytes to pad + * + * The flash media obliges us to write only in chunks of %c->min_io_size and + * when we have to write less data we add padding node to the write-buffer and + * pad it to the next minimal I/O unit's boundary. Padding nodes help when the + * media is being scanned. If the amount of wasted space is not enough to fit a + * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes + * pattern (%UBIFS_PADDING_BYTE). + * + * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is + * used. + */ +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) +{ + uint32_t crc; + + ubifs_assert(pad >= 0 && !(pad & 7)); + + if (pad >= UBIFS_PAD_NODE_SZ) { + struct ubifs_ch *ch = buf; + struct ubifs_pad_node *pad_node = buf; + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->node_type = UBIFS_PAD_NODE; + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->padding[0] = ch->padding[1] = 0; + ch->sqnum = 0; + ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); + pad -= UBIFS_PAD_NODE_SZ; + pad_node->pad_len = cpu_to_le32(pad); + crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); + ch->crc = cpu_to_le32(crc); + memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); + } else if (pad > 0) + /* Too little space, padding node won't fit */ + memset(buf, UBIFS_PADDING_BYTE, pad); +} + +/** + * next_sqnum - get next sequence number. 
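/*
 * Editor's note: a self-contained sketch of the integrity check performed by
 * 'ubifs_check_node()' above - verify the magic number in the common header
 * and a CRC computed over the node starting at byte 8 (i.e. skipping the
 * magic and CRC fields themselves), never trusting the length field blindly.
 * The CRC-32 below is a generic bitwise implementation for illustration and
 * is not guaranteed to be bit-for-bit identical to the kernel's crc32()
 * helper; the header is reduced to the fields the check needs, and endian
 * conversion (the on-flash header is little-endian) is omitted.
 */
#include <stdint.h>
#include <string.h>

#define TOY_NODE_MAGIC	0x06101831u	/* UBIFS node magic */
#define TOY_CRC32_INIT	0xFFFFFFFFu

struct toy_ch {			/* reduced common header, native-endian */
	uint32_t magic;
	uint32_t crc;		/* CRC of bytes [8, len) */
	uint32_t len;		/* full node length, including the header */
};

static uint32_t toy_crc32(uint32_t crc, const uint8_t *p, size_t n)
{
	while (n--) {
		int i;

		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
	}
	return crc;
}

/* Return 0 if the node looks valid, -1 on bad length, magic or CRC */
static int toy_check_node(const void *buf, uint32_t buf_len)
{
	struct toy_ch ch;

	if (buf_len < sizeof(ch))
		return -1;
	memcpy(&ch, buf, sizeof(ch));
	if (ch.magic != TOY_NODE_MAGIC)
		return -1;
	if (ch.len < sizeof(ch) || ch.len > buf_len)
		return -1;	/* reject lengths pointing outside the buffer */
	if (toy_crc32(TOY_CRC32_INIT, (const uint8_t *)buf + 8, ch.len - 8) !=
	    ch.crc)
		return -1;
	return 0;
}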
+ * @c: UBIFS file-system description object + */ +static unsigned long long next_sqnum(struct ubifs_info *c) +{ + unsigned long long sqnum; + + spin_lock(&c->cnt_lock); + sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + + if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { + if (sqnum >= SQNUM_WATERMARK) { + ubifs_err("sequence number overflow %llu, end of life", + sqnum); + ubifs_ro_mode(c, -EINVAL); + } + ubifs_warn("running out of sequence numbers, end of life soon"); + } + + return sqnum; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); + + if (pad) { + len = ALIGN(len, 8); + pad = ALIGN(len, c->min_io_size) - len; + ubifs_pad(c, node + len, pad); + } +} + +/** + * ubifs_prep_grp_node - prepare node of a group to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @last: indicates the last node of the group + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC and fills the common header. + */ +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + if (last) + ch->group_type = UBIFS_LAST_OF_NODE_GROUP; + else + ch->group_type = UBIFS_IN_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * wbuf_timer_callback - write-buffer timer callback function. + * @data: timer data (write-buffer descriptor) + * + * This function is called when the write-buffer timer expires. + */ +static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) +{ + struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); + + dbg_io("jhead %s", dbg_jhead(wbuf->jhead)); + wbuf->need_sync = 1; + wbuf->c->need_wbuf_sync = 1; + ubifs_wake_up_bgt(wbuf->c); + return HRTIMER_NORESTART; +} + +/** + * new_wbuf_timer - start new write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + ubifs_assert(!hrtimer_active(&wbuf->timer)); + + if (wbuf->no_timer) + return; + dbg_io("set timer for jhead %s, %llu-%llu millisecs", + dbg_jhead(wbuf->jhead), + div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, + USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + HRTIMER_MODE_REL); +} + +/** + * cancel_wbuf_timer - cancel write-buffer timer. 
+ * @wbuf: write-buffer descriptor + */ +static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + if (wbuf->no_timer) + return; + wbuf->need_sync = 0; + hrtimer_cancel(&wbuf->timer); +} + +/** + * ubifs_wbuf_sync_nolock - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This function synchronizes write-buffer @buf and returns zero in case of + * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. + */ +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) +{ + struct ubifs_info *c = wbuf->c; + int err, dirt, sync_len; + + cancel_wbuf_timer_nolock(wbuf); + if (!wbuf->used || wbuf->lnum == -1) + /* Write-buffer is empty or not seeked */ + return 0; + + dbg_io("LEB %d:%d, %d bytes, jhead %s", + wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); + ubifs_assert(!(wbuf->avail & 7)); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->ro_error) + return -EROFS; + + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + sync_len, wbuf->dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d", + sync_len, wbuf->lnum, wbuf->offs); + dbg_dump_stack(); + return err; + } + + spin_lock(&wbuf->lock); + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + + if (wbuf->sync_callback) + err = wbuf->sync_callback(c, wbuf->lnum, + c->leb_size - wbuf->offs, dirt); + return err; +} + +/** + * ubifs_wbuf_seek_nolock - seek write-buffer. + * @wbuf: write-buffer + * @lnum: logical eraseblock number to seek to + * @offs: logical eraseblock offset to seek to + * @dtype: data type + * + * This function targets the write-buffer to logical eraseblock @lnum:@offs. + * The write-buffer has to be empty. Returns zero in case of success and a + * negative error code in case of failure. 
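The sync path above writes only ALIGN(wbuf->used, c->min_io_size) bytes and pads the difference. A tiny user-space sketch with made-up sizes (not part of the patch) shows how the synchronized length and the padded "dirt" relate:

/* Sketch of the partial-sync length math in ubifs_wbuf_sync_nolock(). */
#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	int min_io_size = 512;			 /* example minimal I/O unit */
	int used = 700;				 /* bytes queued in the write-buffer */
	int sync_len = ALIGN(used, min_io_size); /* 1024: amount actually written */
	int dirt = sync_len - used;		 /* 324: filled by ubifs_pad() */

	printf("sync_len=%d dirt=%d\n", sync_len, dirt);
	return 0;
}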
+ */ +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype) +{ + const struct ubifs_info *c = wbuf->c; + + dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead)); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); + ubifs_assert(offs >= 0 && offs <= c->leb_size); + ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); + ubifs_assert(lnum != wbuf->lnum); + ubifs_assert(wbuf->used == 0); + + spin_lock(&wbuf->lock); + wbuf->lnum = lnum; + wbuf->offs = offs; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; + wbuf->used = 0; + spin_unlock(&wbuf->lock); + wbuf->dtype = dtype; + + return 0; +} + +/** + * ubifs_bg_wbufs_sync - synchronize write-buffers. + * @c: UBIFS file-system description object + * + * This function is called by background thread to synchronize write-buffers. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_bg_wbufs_sync(struct ubifs_info *c) +{ + int err, i; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (!c->need_wbuf_sync) + return 0; + c->need_wbuf_sync = 0; + + if (c->ro_error) { + err = -EROFS; + goto out_timers; + } + + dbg_io("synchronize"); + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + cond_resched(); + + /* + * If the mutex is locked then wbuf is being changed, so + * synchronization is not necessary. + */ + if (mutex_is_locked(&wbuf->io_mutex)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (!wbuf->need_sync) { + mutex_unlock(&wbuf->io_mutex); + continue; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + if (err) { + ubifs_err("cannot sync write-buffer, error %d", err); + ubifs_ro_mode(c, err); + goto out_timers; + } + } + + return 0; + +out_timers: + /* Cancel all timers to prevent repeated errors */ + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + cancel_wbuf_timer_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + } + return err; +} + +/** + * ubifs_wbuf_write_nolock - write data to flash via write-buffer. + * @wbuf: write-buffer + * @buf: node to write + * @len: node length + * + * This function writes data to flash via write-buffer @wbuf. This means that + * the last piece of the node won't reach the flash media immediately if it + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). + * + * This function returns zero in case of success and a negative error code in + * case of failure. If the node cannot be written because there is no more + * space in this logical eraseblock, %-ENOSPC is returned. 
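Both the sync and the seek paths above pick the next write-buffer size with the same three-way rule, whose goal is to keep flash writes aligned to @c->max_write_size. A compact restatement of that rule (illustrative helper, not in the patch):

/* Sketch of the write-buffer size rule shared by ubifs_wbuf_sync_nolock()
 * and ubifs_wbuf_seek_nolock(); max_write_size is a power of two.
 */
static int pick_wbuf_size(int leb_size, int offs, int max_write_size)
{
	if (leb_size - offs < max_write_size)
		return leb_size - offs;		/* LEB tail: buffer only what is left */
	if (offs & (max_write_size - 1))
		/* Unaligned offset: stop at the next max. write unit boundary. */
		return ((offs + max_write_size - 1) & ~(max_write_size - 1)) - offs;
	return max_write_size;			/* aligned: use a full max. write unit */
}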
+ */ +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) +{ + struct ubifs_info *c = wbuf->c; + int err, written, n, aligned_len = ALIGN(len, 8); + + dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used); + ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); + ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); + ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { + err = -ENOSPC; + goto out; + } + + cancel_wbuf_timer_nolock(wbuf); + + if (c->ro_error) + return -EROFS; + + if (aligned_len <= wbuf->avail) { + /* + * The node is not very large and fits entirely within + * write-buffer. + */ + memcpy(wbuf->buf + wbuf->used, buf, len); + + if (aligned_len == wbuf->avail) { + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, + wbuf->offs, wbuf->size, + wbuf->dtype); + if (err) + goto out; + + spin_lock(&wbuf->lock); + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + } else { + spin_lock(&wbuf->lock); + wbuf->avail -= aligned_len; + wbuf->used += aligned_len; + spin_unlock(&wbuf->lock); + } + + goto exit; + } + + written = 0; + + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } + + /* + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. + * We align node length to 8-byte boundary because we anyway flash wbuf + * if the remaining space is less than 8 bytes. 
+ */ + n = aligned_len >> c->max_write_shift; + if (n) { + n <<= c->max_write_shift; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, + wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, + wbuf->offs, n, wbuf->dtype); + if (err) + goto out; + wbuf->offs += n; + aligned_len -= n; + len -= n; + written += n; + } + + spin_lock(&wbuf->lock); + if (aligned_len) + /* + * And now we have what's left and what does not take whole + * max. write unit, so write it to the write-buffer and we are + * done. + */ + memcpy(wbuf->buf, buf + written, len); + + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; + wbuf->used = aligned_len; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + +exit: + if (wbuf->sync_callback) { + int free = c->leb_size - wbuf->offs - wbuf->used; + + err = wbuf->sync_callback(c, wbuf->lnum, free, 0); + if (err) + goto out; + } + + if (wbuf->used) + new_wbuf_timer_nolock(wbuf); + + return 0; + +out: + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + len, wbuf->lnum, wbuf->offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + dbg_dump_leb(c, wbuf->lnum); + return err; +} + +/** + * ubifs_write_node - write node to the media. + * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int dtype) +{ + int err, buf_len = ALIGN(len, c->min_io_size); + + dbg_io("LEB %d:%d, %s, length %d (aligned %d)", + lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, + buf_len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + + if (c->ro_error) + return -EROFS; + + ubifs_prepare_node(c, buf, len, 1); + err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + buf_len, lnum, offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + + return err; +} + +/** + * ubifs_read_node_wbuf - read node from the media or write-buffer. + * @wbuf: wbuf to check for un-written data + * @buf: buffer to read to + * @type: node type + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and stores + * in @buf. If the node partially or fully sits in the write-buffer, this + * function takes data from the buffer, otherwise it reads the flash media. + * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative + * error code in case of failure. 
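When a read overlaps the write-buffer, the request is split into a flash part and an in-memory part. The helper below mirrors the rlen computation in ubifs_read_node_wbuf() (illustrative only, not in the patch):

/* Sketch of the flash/write-buffer split done by ubifs_read_node_wbuf().
 * Bytes [offs, offs + rlen) come from flash, the rest from wbuf->buf.
 */
static void split_read(int offs, int len, int wbuf_offs, int *rlen, int *wlen)
{
	*rlen = wbuf_offs - offs;	/* portion lying below the write-buffer */
	if (*rlen < 0)
		*rlen = 0;		/* node starts inside the write-buffer */
	*wlen = len - *rlen;		/* copied from the in-memory buffer */
}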
+ */ +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + int err, rlen, overlap; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs, + dbg_ntype(type), len, dbg_jhead(wbuf->jhead)); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_read_node(c, buf, type, len, lnum, offs); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) { + /* Read everything that goes before write-buffer */ + err = ubi_read(c->ubi, lnum, buf, offs, rlen); + if (err && err != -EBADMSG) { + ubifs_err("failed to read node %d from LEB %d:%d, " + "error %d", type, lnum, offs, err); + dbg_dump_stack(); + return err; + } + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + rlen = le32_to_cpu(ch->len); + if (rlen != len) { + ubifs_err("bad node length %d, expected %d", rlen, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_read_node - read node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and and length, checks it and + * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched + * and a negative error code in case of failure. + */ +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs) +{ + int err, l; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) { + ubifs_err("cannot read node %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + l = le32_to_cpu(ch->len); + if (l != len) { + ubifs_err("bad node length %d, expected %d", l, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, + ubi_is_mapped(c->ubi, lnum)); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_wbuf_init - initialize write-buffer. 
+ * @c: UBIFS file-system description object + * @wbuf: write-buffer to initialize + * + * This function initializes write-buffer. Returns zero in case of success + * %-ENOMEM in case of failure. + */ +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) +{ + size_t size; + + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); + if (!wbuf->buf) + return -ENOMEM; + + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + wbuf->inodes = kmalloc(size, GFP_KERNEL); + if (!wbuf->inodes) { + kfree(wbuf->buf); + wbuf->buf = NULL; + return -ENOMEM; + } + + wbuf->used = 0; + wbuf->lnum = wbuf->offs = -1; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; + wbuf->dtype = UBI_UNKNOWN; + wbuf->sync_callback = NULL; + mutex_init(&wbuf->io_mutex); + spin_lock_init(&wbuf->lock); + wbuf->c = c; + wbuf->next_ino = 0; + + hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); + wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; + wbuf->delta *= 1000000000ULL; + ubifs_assert(wbuf->delta <= ULONG_MAX); + return 0; +} + +/** + * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. + * @wbuf: the write-buffer where to add + * @inum: the inode number + * + * This function adds an inode number to the inode array of the write-buffer. + */ +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) +{ + if (!wbuf->buf) + /* NOR flash or something similar */ + return; + + spin_lock(&wbuf->lock); + if (wbuf->used) + wbuf->inodes[wbuf->next_ino++] = inum; + spin_unlock(&wbuf->lock); +} + +/** + * wbuf_has_ino - returns if the wbuf contains data from the inode. + * @wbuf: the write-buffer + * @inum: the inode number + * + * This function returns with %1 if the write-buffer contains some data from the + * given inode otherwise it returns with %0. + */ +static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) +{ + int i, ret = 0; + + spin_lock(&wbuf->lock); + for (i = 0; i < wbuf->next_ino; i++) + if (inum == wbuf->inodes[i]) { + ret = 1; + break; + } + spin_unlock(&wbuf->lock); + + return ret; +} + +/** + * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. + * @c: UBIFS file-system description object + * @inode: inode to synchronize + * + * This function synchronizes write-buffers which contain nodes belonging to + * @inode. Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) +{ + int i, err = 0; + + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + if (i == GCHD) + /* + * GC head is special, do not look at it. Even if the + * head contains something related to this inode, it is + * a _copy_ of corresponding on-flash node which sits + * somewhere else. 
+ */ + continue; + + if (!wbuf_has_ino(wbuf, inode->i_ino)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (wbuf_has_ino(wbuf, inode->i_ino)) + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + + if (err) { + ubifs_ro_mode(c, err); + return err; + } + } + return 0; +} diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c new file mode 100644 index 00000000..548acf49 --- /dev/null +++ b/fs/ubifs/ioctl.c @@ -0,0 +1,205 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Zoltan Sogor + * Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements EXT2-compatible extended attribute ioctl() calls */ + +#include +#include +#include "ubifs.h" + +/** + * ubifs_set_inode_flags - set VFS inode flags. + * @inode: VFS inode to set flags for + * + * This function propagates flags from UBIFS inode object to VFS inode object. + */ +void ubifs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = ubifs_inode(inode)->flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + if (flags & UBIFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & UBIFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & UBIFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & UBIFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* + * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. + * @ioctl_flags: flags to convert + * + * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * (@UBIFS_COMPR_FL, etc). + */ +static int ioctl2ubifs(int ioctl_flags) +{ + int ubifs_flags = 0; + + if (ioctl_flags & FS_COMPR_FL) + ubifs_flags |= UBIFS_COMPR_FL; + if (ioctl_flags & FS_SYNC_FL) + ubifs_flags |= UBIFS_SYNC_FL; + if (ioctl_flags & FS_APPEND_FL) + ubifs_flags |= UBIFS_APPEND_FL; + if (ioctl_flags & FS_IMMUTABLE_FL) + ubifs_flags |= UBIFS_IMMUTABLE_FL; + if (ioctl_flags & FS_DIRSYNC_FL) + ubifs_flags |= UBIFS_DIRSYNC_FL; + + return ubifs_flags; +} + +/* + * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. + * @ubifs_flags: flags to convert + * + * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags + * (@FS_COMPR_FL, etc). 
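For context, the flag conversion helpers here back the standard FS_IOC_GETFLAGS/FS_IOC_SETFLAGS interface. A minimal user-space usage sketch (the file path is hypothetical, not part of the patch):

/* User-space view of the flag ioctls implemented in this file. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_*_FL */

int make_append_only(const char *path)
{
	int flags, err = -1;
	int fd = open(path, O_RDONLY);	/* e.g. a file on a UBIFS mount */

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_APPEND_FL;	/* needs CAP_LINUX_IMMUTABLE, see setflags() */
		err = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return err;
}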
+ */ +static int ubifs2ioctl(int ubifs_flags) +{ + int ioctl_flags = 0; + + if (ubifs_flags & UBIFS_COMPR_FL) + ioctl_flags |= FS_COMPR_FL; + if (ubifs_flags & UBIFS_SYNC_FL) + ioctl_flags |= FS_SYNC_FL; + if (ubifs_flags & UBIFS_APPEND_FL) + ioctl_flags |= FS_APPEND_FL; + if (ubifs_flags & UBIFS_IMMUTABLE_FL) + ioctl_flags |= FS_IMMUTABLE_FL; + if (ubifs_flags & UBIFS_DIRSYNC_FL) + ioctl_flags |= FS_DIRSYNC_FL; + + return ioctl_flags; +} + +static int setflags(struct inode *inode, int flags) +{ + int oldflags, err, release; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + mutex_lock(&ui->ui_mutex); + oldflags = ubifs2ioctl(ui->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock; + } + } + + ui->flags = ioctl2ubifs(flags); + ubifs_set_inode_flags(inode); + inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = write_inode_now(inode, 1); + return err; + +out_unlock: + ubifs_err("can't modify inode %lu attributes", inode->i_ino); + mutex_unlock(&ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int flags, err; + struct inode *inode = file->f_path.dentry->d_inode; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; + + /* + * Make sure the file-system is read-write and make sure it + * will not become read-only while we are changing the flags. + */ + err = mnt_want_write(file->f_path.mnt); + if (err) + return err; + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); + err = setflags(inode, flags); + mnt_drop_write(file->f_path.mnt); + return err; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c new file mode 100644 index 00000000..cef0460f --- /dev/null +++ b/fs/ubifs/journal.c @@ -0,0 +1,1466 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS journal. + * + * The journal consists of 2 parts - the log and bud LEBs. The log has fixed + * length and position, while a bud logical eraseblock is any LEB in the main + * area. Buds contain file system data - data nodes, inode nodes, etc. The log + * contains only references to buds and some other stuff like commit + * start node. The idea is that when we commit the journal, we do + * not copy the data, the buds just become indexed. Since after the commit the + * nodes in bud eraseblocks become leaf nodes of the file system index tree, we + * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will + * become leafs in the future. + * + * The journal is multi-headed because we want to write data to the journal as + * optimally as possible. It is nice to have nodes belonging to the same inode + * in one LEB, so we may write data owned by different inodes to different + * journal heads, although at present only one data head is used. + * + * For recovery reasons, the base head contains all inode nodes, all directory + * entry nodes and all truncate nodes. This means that the other heads contain + * only data nodes. + * + * Bud LEBs may be half-indexed. For example, if the bud was not full at the + * time of commit, the bud is retained to continue to be used in the journal, + * even though the "front" of the LEB is now indexed. In that case, the log + * reference contains the offset where the bud starts for the purposes of the + * journal. + * + * The journal size has to be limited, because the larger is the journal, the + * longer it takes to mount UBIFS (scanning the journal) and the more memory it + * takes (indexing in the TNC). + * + * All the journal write operations like 'ubifs_jnl_update()' here, which write + * multiple UBIFS nodes to the journal at one go, are atomic with respect to + * unclean reboots. Should the unclean reboot happen, the recovery code drops + * all the nodes. + */ + +#include "ubifs.h" + +/** + * zero_ino_node_unused - zero out unused fields of an on-flash inode node. + * @ino: the inode to zero out + */ +static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) +{ + memset(ino->padding1, 0, 4); + memset(ino->padding2, 0, 26); +} + +/** + * zero_dent_node_unused - zero out unused fields of an on-flash directory + * entry node. + * @dent: the directory entry to zero out + */ +static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) +{ + dent->padding1 = 0; + memset(dent->padding2, 0, 4); +} + +/** + * zero_data_node_unused - zero out unused fields of an on-flash data node. + * @data: the data node to zero out + */ +static inline void zero_data_node_unused(struct ubifs_data_node *data) +{ + memset(data->padding, 0, 2); +} + +/** + * zero_trun_node_unused - zero out unused fields of an on-flash truncation + * node. 
+ * @trun: the truncation node to zero out + */ +static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) +{ + memset(trun->padding, 0, 12); +} + +/** + * reserve_space - reserve space in the journal. + * @c: UBIFS file-system description object + * @jhead: journal head number + * @len: node length + * + * This function reserves space in journal head @head. If the reservation + * succeeded, the journal head stays locked and later has to be unlocked using + * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock + * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and + * other negative error codes in case of other failures. + */ +static int reserve_space(struct ubifs_info *c, int jhead, int len) +{ + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + /* + * Typically, the base head has smaller nodes written to it, so it is + * better to try to allocate space at the ends of eraseblocks. This is + * what the squeeze parameter does. + */ + ubifs_assert(!c->ro_media && !c->ro_mount); + squeeze = (jhead == BASEHD); +again: + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_error) { + err = -EROFS; + goto out_unlock; + } + + avail = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum != -1 && avail >= len) + return 0; + + /* + * Write buffer wasn't seek'ed or there is no enough space - look for an + * LEB with some empty space. + */ + lnum = ubifs_find_free_space(c, len, &offs, squeeze); + if (lnum >= 0) + goto out; + + err = lnum; + if (err != -ENOSPC) + goto out_unlock; + + /* + * No free space, we have to run garbage collector to make + * some. But the write-buffer mutex has to be unlocked because + * GC also takes it. + */ + dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead)); + mutex_unlock(&wbuf->io_mutex); + + lnum = ubifs_garbage_collect(c, 0); + if (lnum < 0) { + err = lnum; + if (err != -ENOSPC) + return err; + + /* + * GC could not make a free LEB. But someone else may + * have allocated new bud for this journal head, + * because we dropped @wbuf->io_mutex, so try once + * again. + */ + dbg_jnl("GC couldn't make a free LEB for jhead %s", + dbg_jhead(jhead)); + if (retries++ < 2) { + dbg_jnl("retry (%d)", retries); + goto again; + } + + dbg_jnl("return -ENOSPC"); + return err; + } + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead)); + avail = c->leb_size - wbuf->offs - wbuf->used; + + if (wbuf->lnum != -1 && avail >= len) { + /* + * Someone else has switched the journal head and we have + * enough space now. This happens when more than one process is + * trying to write to the same journal head at the same time. + */ + dbg_jnl("return LEB %d back, already have LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs + wbuf->used); + err = ubifs_return_leb(c, lnum); + if (err) + goto out_unlock; + return 0; + } + + offs = 0; + +out: + /* + * Make sure we synchronize the write-buffer before we add the new bud + * to the log. Otherwise we may have a power cut after the log + * reference node for the last bud (@lnum) is written but before the + * write-buffer data are written to the next-to-last bud + * (@wbuf->lnum). And the effect would be that the recovery would see + * that there is corruption in the next-to-last bud. 
+ */ + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out_return; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); + if (err) + goto out_unlock; + + return 0; + +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return err; + +out_return: + /* An error occurred and the LEB has to be returned to lprops */ + ubifs_assert(err < 0); + err1 = ubifs_return_leb(c, lnum); + if (err1 && err == -EAGAIN) + /* + * Return original error code only if it is not %-EAGAIN, + * which is not really an error. Otherwise, return the error + * code of 'ubifs_return_leb()'. + */ + err = err1; + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * write_node - write node to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @node: node to write + * @len: node length + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function writes a node to reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int write_node(struct ubifs_info *c, int jhead, void *node, int len, + int *lnum, int *offs) +{ + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); + ubifs_prepare_node(c, node, len, 0); + + return ubifs_wbuf_write_nolock(wbuf, node, len); +} + +/** + * write_head - write data to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @buf: buffer to write + * @len: length to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * @sync: non-zero if the write-buffer has to by synchronized + * + * This function is the same as 'write_node()' but it does not assume the + * buffer it is writing is a node, so it does not prepare it (which means + * initializing common header and calculating CRC). + */ +static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, + int *lnum, int *offs, int sync) +{ + int err; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); + + err = ubifs_wbuf_write_nolock(wbuf, buf, len); + if (err) + return err; + if (sync) + err = ubifs_wbuf_sync_nolock(wbuf); + return err; +} + +/** + * make_reservation - reserve journal space. + * @c: UBIFS file-system description object + * @jhead: journal head + * @len: how many bytes to reserve + * + * This function makes space reservation in journal head @jhead. The function + * takes the commit lock and locks the journal head, and the caller has to + * unlock the head and finish the reservation with 'finish_reservation()'. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, the journal head may be unlocked as soon as the data is written, while + * the commit lock has to be released after the data has been added to the + * TNC. 
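All the ubifs_jnl_*() writers below follow the locking discipline that the comment above describes. A condensed sketch of the pattern (hypothetical node buffer, error handling trimmed, not part of the patch):

/* Sketch of the reservation pattern shared by the ubifs_jnl_*() functions:
 * make_reservation() takes c->commit_sem (read) plus the journal head's
 * io_mutex, release_head() drops the io_mutex right after the write, and
 * finish_reservation() drops commit_sem only after the TNC was updated.
 */
static int jnl_write_pattern(struct ubifs_info *c, void *node, int len)
{
	int err, lnum, offs;

	err = make_reservation(c, BASEHD, len);
	if (err)
		return err;

	err = write_head(c, BASEHD, node, len, &lnum, &offs, 0);
	release_head(c, BASEHD);		/* journal head unlocked early */
	if (err)
		goto out;

	/* ubifs_tnc_add() and friends happen here, still under commit_sem */
out:
	finish_reservation(c);
	return err;
}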
+ */ +static int make_reservation(struct ubifs_info *c, int jhead, int len) +{ + int err, cmt_retries = 0, nospc_retries = 0; + +again: + down_read(&c->commit_sem); + err = reserve_space(c, jhead, len); + if (!err) + return 0; + up_read(&c->commit_sem); + + if (err == -ENOSPC) { + /* + * GC could not make any progress. We should try to commit + * once because it could make some dirty space and GC would + * make progress, so make the error -EAGAIN so that the below + * will commit and re-try. + */ + if (nospc_retries++ < 2) { + dbg_jnl("no space, retry"); + err = -EAGAIN; + } + + /* + * This means that the budgeting is incorrect. We always have + * to be able to write to the media, because all operations are + * budgeted. Deletions are not budgeted, though, but we reserve + * an extra LEB for them. + */ + } + + if (err != -EAGAIN) + goto out; + + /* + * -EAGAIN means that the journal is full or too large, or the above + * code wants to do one commit. Do this and re-try. + */ + if (cmt_retries > 128) { + /* + * This should not happen unless the journal size limitations + * are too tough. + */ + ubifs_err("stuck in space allocation"); + err = -ENOSPC; + goto out; + } else if (cmt_retries > 32) + ubifs_warn("too many space allocation re-tries (%d)", + cmt_retries); + + dbg_jnl("-EAGAIN, commit and retry (retried %d times)", + cmt_retries); + cmt_retries += 1; + + err = ubifs_run_commit(c); + if (err) + return err; + goto again; + +out: + ubifs_err("cannot reserve %d bytes in jhead %d, error %d", + len, jhead, err); + if (err == -ENOSPC) { + /* This are some budgeting problems, print useful information */ + down_write(&c->commit_sem); + dbg_dump_stack(); + dbg_dump_budg(c, &c->bi); + dbg_dump_lprops(c); + cmt_retries = dbg_check_lprops(c); + up_write(&c->commit_sem); + } + return err; +} + +/** + * release_head - release a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * + * This function releases journal head @jhead which was locked by + * the 'make_reservation()' function. It has to be called after each successful + * 'make_reservation()' invocation. + */ +static inline void release_head(struct ubifs_info *c, int jhead) +{ + mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); +} + +/** + * finish_reservation - finish a reservation. + * @c: UBIFS file-system description object + * + * This function finishes journal space reservation. It must be called after + * 'make_reservation()'. + */ +static void finish_reservation(struct ubifs_info *c) +{ + up_read(&c->commit_sem); +} + +/** + * get_dent_type - translate VFS inode mode to UBIFS directory entry type. + * @mode: inode mode + */ +static int get_dent_type(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return UBIFS_ITYPE_REG; + case S_IFDIR: + return UBIFS_ITYPE_DIR; + case S_IFLNK: + return UBIFS_ITYPE_LNK; + case S_IFBLK: + return UBIFS_ITYPE_BLK; + case S_IFCHR: + return UBIFS_ITYPE_CHR; + case S_IFIFO: + return UBIFS_ITYPE_FIFO; + case S_IFSOCK: + return UBIFS_ITYPE_SOCK; + default: + BUG(); + } + return 0; +} + +/** + * pack_inode - pack an inode node. 
+ * @c: UBIFS file-system description object + * @ino: buffer in which to pack inode node + * @inode: inode to pack + * @last: indicates the last node of the group + */ +static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, + const struct inode *inode, int last) +{ + int data_len = 0, last_reference = !inode->i_nlink; + struct ubifs_inode *ui = ubifs_inode(inode); + + ino->ch.node_type = UBIFS_INO_NODE; + ino_key_init_flash(c, &ino->key, inode->i_ino); + ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); + ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); + ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); + ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); + ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->uid = cpu_to_le32(inode->i_uid); + ino->gid = cpu_to_le32(inode->i_gid); + ino->mode = cpu_to_le32(inode->i_mode); + ino->flags = cpu_to_le32(ui->flags); + ino->size = cpu_to_le64(ui->ui_size); + ino->nlink = cpu_to_le32(inode->i_nlink); + ino->compr_type = cpu_to_le16(ui->compr_type); + ino->data_len = cpu_to_le32(ui->data_len); + ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt); + ino->xattr_size = cpu_to_le32(ui->xattr_size); + ino->xattr_names = cpu_to_le32(ui->xattr_names); + zero_ino_node_unused(ino); + + /* + * Drop the attached data if this is a deletion inode, the data is not + * needed anymore. + */ + if (!last_reference) { + memcpy(ino->data, ui->data, ui->data_len); + data_len = ui->data_len; + } + + ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last); +} + +/** + * mark_inode_clean - mark UBIFS inode as clean. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to mark as clean + * + * This helper function marks UBIFS inode @ui as clean by cleaning the + * @ui->dirty flag and releasing its budget. Note, VFS may still treat the + * inode as dirty and try to write it back, but 'ubifs_write_inode()' would + * just do nothing. + */ +static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) +{ + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + ui->dirty = 0; +} + +/** + * ubifs_jnl_update - update inode. + * @c: UBIFS file-system description object + * @dir: parent inode or host inode in case of extended attributes + * @nm: directory entry name + * @inode: inode to update + * @deletion: indicates a directory entry deletion i.e unlink or rmdir + * @xent: non-zero if the directory entry is an extended attribute entry + * + * This function updates an inode by writing a directory entry (or extended + * attribute entry), the inode itself, and the parent directory inode (or the + * host inode) to the journal. + * + * The function writes the host inode @dir last, which is important in case of + * extended attributes. Indeed, then we guarantee that if the host inode gets + * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed, + * the extended attribute inode gets flushed too. And this is exactly what the + * user expects - synchronizing the host inode synchronizes its extended + * attributes. Similarly, this guarantees that if @dir is synchronized, its + * directory entry corresponding to @nm gets synchronized too. + * + * If the inode (@inode) or the parent directory (@dir) are synchronous, this + * function synchronizes the write-buffer. + * + * This function marks the @dir and @inode inodes as clean and returns zero on + * success. 
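ubifs_jnl_update() packs the directory entry, the target inode, and the parent (or host) inode into one grouped write; the TNC offsets used afterwards come straight from 8-byte alignment. A small restatement of that layout (illustrative helper, not in the patch; plen plays the role of UBIFS_INO_NODE_SZ since the parent inode carries no data):

/* Sketch of the buffer layout produced by ubifs_jnl_update():
 *
 *   | dent node |pad| inode node |pad| parent/host inode node |
 *   0           aligned_dlen     aligned_dlen + aligned_ilen
 */
#define ALIGN8(x)	(((x) + 7) & ~7)

static int update_layout(int dlen, int ilen, int plen,
			 int *ino_offs, int *host_offs)
{
	int aligned_dlen = ALIGN8(dlen);
	int aligned_ilen = ALIGN8(ilen);

	*ino_offs = aligned_dlen;			/* child inode node */
	*host_offs = aligned_dlen + aligned_ilen;	/* parent/host inode node */
	return aligned_dlen + aligned_ilen + plen;	/* total reservation length */
}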
In case of failure, a negative error code is returned. + */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent) +{ + int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); + int last_reference = !!(deletion && inode->i_nlink == 0); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_dent_node *dent; + struct ubifs_ino_node *ino; + union ubifs_key dent_key, ino_key; + + dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + ubifs_assert(dir_ui->data_len == 0); + ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex)); + + dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + ilen = UBIFS_INO_NODE_SZ; + + /* + * If the last reference to the inode is being deleted, then there is + * no need to attach and write inode data, it is being deleted anyway. + * And if the inode is being deleted, no need to synchronize + * write-buffer even if the inode is synchronous. + */ + if (!last_reference) { + ilen += ui->data_len; + sync |= IS_SYNC(inode); + } + + aligned_dlen = ALIGN(dlen, 8); + aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + if (!xent) { + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init(c, &dent_key, dir->i_ino, nm); + } else { + dent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &dent_key, dir->i_ino, nm); + } + + key_write(c, &dent_key, dent->key); + dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); + dent->type = get_dent_type(inode->i_mode); + dent->nlen = cpu_to_le16(nm->len); + memcpy(dent->name, nm->name, nm->len); + dent->name[nm->len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen, 0); + + ino = (void *)dent + aligned_dlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + aligned_ilen; + pack_inode(c, ino, dir, 1); + + if (last_reference) { + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino); + } + release_head(c, BASEHD); + kfree(dent); + + if (deletion) { + err = ubifs_tnc_remove_nm(c, &dent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, dlen); + } else + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + if (err) + goto out_ro; + + /* + * Note, we do not remove the inode from TNC even if the last reference + * to it has just been deleted, because the inode may still be opened. + * Instead, the inode has been added to orphan lists and the orphan + * subsystem will take further care about it. 
+ */ + ino_key_init(c, &ino_key, inode->i_ino); + ino_offs = dent_offs + aligned_dlen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + if (err) + goto out_ro; + + ino_key_init(c, &ino_key, dir->i_ino); + ino_offs += aligned_ilen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + mark_inode_clean(c, dir_ui); + return 0; + +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; + +out_release: + release_head(c, BASEHD); + kfree(dent); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_write_data - write a data node to the journal. + * @c: UBIFS file-system description object + * @inode: inode the data node belongs to + * @key: node key + * @buf: buffer to write + * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) + * + * This function writes a data node to the journal. Returns %0 if the data node + * was successfully written, and a negative error code in case of failure. + */ +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len) +{ + struct ubifs_data_node *data; + int err, lnum, offs, compr_type, out_len; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnl("ino %lu, blk %u, len %d, key %s", + (unsigned long)key_inum(c, key), key_block(c, key), len, + DBGKEY(key)); + ubifs_assert(len <= UBIFS_BLOCK_SIZE); + + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } + + data->ch.node_type = UBIFS_DATA_NODE; + key_write(c, key, &data->key); + data->size = cpu_to_le32(len); + zero_data_node_unused(data); + + if (!(ui->flags & UBIFS_COMPR_FL)) + /* Compression is disabled for this inode */ + compr_type = UBIFS_COMPR_NONE; + else + compr_type = ui->compr_type; + + out_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(buf, len, &data->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + + dlen = UBIFS_DATA_NODE_SZ + out_len; + data->compr_type = cpu_to_le16(compr_type); + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, DATAHD, dlen); + if (err) + goto out_free; + + err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + if (err) + goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); + release_head(c, DATAHD); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen); + if (err) + goto out_ro; + + finish_reservation(c); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); + return 0; + +out_release: + release_head(c, DATAHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); + return err; +} + +/** + * ubifs_jnl_write_inode - flush inode to the journal. 
+ * @c: UBIFS file-system description object + * @inode: inode to flush + * + * This function writes inode @inode to the journal. If the inode is + * synchronous, it also synchronizes the write-buffer. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err, lnum, offs; + struct ubifs_ino_node *ino; + struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; + + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + + /* + * If the inode is being deleted, do not write the attached data. No + * need to synchronize the write-buffer either. + */ + if (!last_reference) { + len += ui->data_len; + sync = IS_SYNC(inode); + } + ino = kmalloc(len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 1); + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + inode->i_ino); + release_head(c, BASEHD); + + if (last_reference) { + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + goto out_ro; + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_add_dirt(c, lnum, len); + } else { + union ubifs_key key; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len); + } + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +/** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
+ */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. + */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); + up_read(&c->commit_sem); + return err; +} + +/** + * ubifs_jnl_rename - rename a directory entry. + * @c: UBIFS file-system description object + * @old_dir: parent inode of directory entry to rename + * @old_dentry: directory entry to rename + * @new_dir: parent inode of directory entry to rename + * @new_dentry: new directory entry (or directory entry to replace) + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function implements the re-name operation which may involve writing up + * to 3 inodes and 2 directory entries. It marks the written inodes as clean + * and returns zero on success. In case of failure, a negative error code is + * returned. + */ +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync) +{ + void *p; + union ubifs_key key; + struct ubifs_dent_node *dent, *dent2; + int err, dlen1, dlen2, ilen, lnum, offs, len; + const struct inode *old_inode = old_dentry->d_inode; + const struct inode *new_inode = new_dentry->d_inode; + int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; + int last_reference = !!(new_inode && new_inode->i_nlink == 0); + int move = (old_dir != new_dir); + struct ubifs_inode *uninitialized_var(new_ui); + + dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu", + old_dentry->d_name.len, old_dentry->d_name.name, + old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(ubifs_inode(old_dir)->data_len == 0); + ubifs_assert(ubifs_inode(new_dir)->data_len == 0); + ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); + ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); + + dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; + dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + if (new_inode) { + new_ui = ubifs_inode(new_inode); + ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); + ilen = UBIFS_INO_NODE_SZ; + if (!last_reference) + ilen += new_ui->data_len; + } else + ilen = 0; + + aligned_dlen1 = ALIGN(dlen1, 8); + aligned_dlen2 = ALIGN(dlen2, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + if (old_dir != new_dir) + len += plen; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + /* Make new dent */ + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent->inum = cpu_to_le64(old_inode->i_ino); + dent->type = get_dent_type(old_inode->i_mode); + dent->nlen = cpu_to_le16(new_dentry->d_name.len); + memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); + 
dent->name[new_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen1, 0); + + /* Make deletion dent */ + dent2 = (void *)dent + aligned_dlen1; + dent2->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, + &old_dentry->d_name); + dent2->inum = 0; + dent2->type = DT_UNKNOWN; + dent2->nlen = cpu_to_le16(old_dentry->d_name.len); + memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); + dent2->name[old_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent2); + ubifs_prep_grp_node(c, dent2, dlen2, 0); + + p = (void *)dent2 + aligned_dlen2; + if (new_inode) { + pack_inode(c, p, new_inode, 0); + p += ALIGN(ilen, 8); + } + + if (!move) + pack_inode(c, p, old_dir, 1); + else { + pack_inode(c, p, old_dir, 0); + p += ALIGN(plen, 8); + pack_inode(c, p, new_dir, 1); + } + + if (last_reference) { + err = ubifs_add_orphan(c, new_inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + new_ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino); + if (new_inode) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + new_inode->i_ino); + } + release_head(c, BASEHD); + + dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, dlen2); + if (err) + goto out_ro; + + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen1 + aligned_dlen2; + if (new_inode) { + ino_key_init(c, &key, new_inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen); + if (err) + goto out_ro; + offs += ALIGN(ilen, 8); + } + + ino_key_init(c, &key, old_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + + if (old_dir != new_dir) { + offs += ALIGN(plen, 8); + ino_key_init(c, &key, new_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + } + + finish_reservation(c); + if (new_inode) { + mark_inode_clean(c, new_ui); + spin_lock(&new_ui->ui_lock); + new_ui->synced_i_size = new_ui->ui_size; + spin_unlock(&new_ui->ui_lock); + } + mark_inode_clean(c, ubifs_inode(old_dir)); + if (move) + mark_inode_clean(c, ubifs_inode(new_dir)); + kfree(dent); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, new_inode->i_ino); +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; +} + +/** + * recomp_data_node - re-compress a truncated data node. + * @dn: data node to re-compress + * @new_len: new length + * + * This function is used when an inode is truncated and the last data node of + * the inode has to be re-compressed and re-written. 
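Truncation rewrites the last data block only when the new size is not block-aligned. The block/offset arithmetic used by ubifs_jnl_truncate() can be followed with concrete numbers in this user-space sketch (values are hypothetical; UBIFS_BLOCK_SIZE is 4096):

/* Sketch of the last-block math in ubifs_jnl_truncate(). */
#include <stdio.h>

#define UBIFS_BLOCK_SIZE	4096
#define UBIFS_BLOCK_SHIFT	12

int main(void)
{
	long long new_size = 10000;			  /* example new file size */
	int dlen = new_size & (UBIFS_BLOCK_SIZE - 1);	  /* 1808 bytes stay in the last block */
	unsigned int blk = new_size >> UBIFS_BLOCK_SHIFT; /* block 2 is the one rewritten */
	unsigned int first_removed = blk + (dlen ? 1 : 0); /* TNC range removal starts at block 3 */

	printf("dlen=%d blk=%u first_removed=%u\n", dlen, blk, first_removed);
	return 0;
}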
+ */ +static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) +{ + void *buf; + int err, len, compr_type, out_len; + + out_len = le32_to_cpu(dn->size); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + compr_type = le16_to_cpu(dn->compr_type); + err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + dn->compr_type = cpu_to_le16(compr_type); + dn->size = cpu_to_le32(*new_len); + *new_len = UBIFS_DATA_NODE_SZ + out_len; +out: + kfree(buf); + return err; +} + +/** + * ubifs_jnl_truncate - update the journal for a truncation. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @old_size: old size + * @new_size: new size + * + * When the size of a file decreases due to truncation, a truncation node is + * written, the journal tree is updated, and the last data block is re-written + * if it has been affected. The inode is also updated in order to synchronize + * the new inode size. + * + * This function marks the inode as clean and returns zero on success. In case + * of failure, a negative error code is returned. + */ +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size) +{ + union ubifs_key key, to_key; + struct ubifs_ino_node *ino; + struct ubifs_trun_node *trun; + struct ubifs_data_node *uninitialized_var(dn); + int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + ino_t inum = inode->i_ino; + unsigned int blk; + + dbg_jnl("ino %lu, size %lld -> %lld", + (unsigned long)inum, old_size, new_size); + ubifs_assert(!ui->data_len); + ubifs_assert(S_ISREG(inode->i_mode)); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + ino = kmalloc(sz, GFP_NOFS); + if (!ino) + return -ENOMEM; + + trun = (void *)ino + UBIFS_INO_NODE_SZ; + trun->ch.node_type = UBIFS_TRUN_NODE; + trun->inum = cpu_to_le32(inum); + trun->old_size = cpu_to_le64(old_size); + trun->new_size = cpu_to_le64(new_size); + zero_trun_node_unused(trun); + + dlen = new_size & (UBIFS_BLOCK_SIZE - 1); + if (dlen) { + /* Get last data block so it can be truncated */ + dn = (void *)trun + UBIFS_TRUN_NODE_SZ; + blk = new_size >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &key, inum, blk); + dbg_jnl("last block key %s", DBGKEY(&key)); + err = ubifs_tnc_lookup(c, &key, dn); + if (err == -ENOENT) + dlen = 0; /* Not found (so it is a hole) */ + else if (err) + goto out_free; + else { + if (le32_to_cpu(dn->size) <= dlen) + dlen = 0; /* Nothing to do */ + else { + int compr_type = le16_to_cpu(dn->compr_type); + + if (compr_type != UBIFS_COMPR_NONE) { + err = recomp_data_node(dn, &dlen); + if (err) + goto out_free; + } else { + dn->size = cpu_to_le32(dlen); + dlen += UBIFS_DATA_NODE_SZ; + } + zero_data_node_unused(dn); + } + } + } + + /* Must make reservation before allocating sequence numbers */ + len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ; + if (dlen) + len += dlen; + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 0); + ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 
0 : 1); + if (dlen) + ubifs_prep_grp_node(c, dn, dlen, 1); + + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum); + release_head(c, BASEHD); + + if (dlen) { + sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ; + err = ubifs_tnc_add(c, &key, lnum, sz, dlen); + if (err) + goto out_ro; + } + + ino_key_init(c, &key, inum); + err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ); + if (err) + goto out_ro; + + bit = new_size & (UBIFS_BLOCK_SIZE - 1); + blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0); + data_key_init(c, &key, inum, blk); + + bit = old_size & (UBIFS_BLOCK_SIZE - 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1); + data_key_init(c, &to_key, inum, blk); + + err = ubifs_tnc_remove_range(c, &key, &to_key); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#ifdef CONFIG_UBIFS_FS_XATTR + +/** + * ubifs_jnl_delete_xattr - delete an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @nm: extended attribute entry name + * + * This function delete an extended attribute which is very similar to + * un-linking regular files - it writes a deletion xentry, a deletion inode and + * updates the target inode. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm) +{ + int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; + struct ubifs_dent_node *xent; + struct ubifs_ino_node *ino; + union ubifs_key xent_key, key1, key2; + int sync = IS_DIRSYNC(host); + struct ubifs_inode *host_ui = ubifs_inode(host); + + dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + host->i_ino, inode->i_ino, nm->name, + ubifs_inode(inode)->data_len); + ubifs_assert(inode->i_nlink == 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + /* + * Since we are deleting the inode, we do not bother to attach any data + * to it and assume its length is %UBIFS_INO_NODE_SZ. 
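For orientation, the reservation length computed just below is plain 8-byte-alignment arithmetic over three nodes packed into one journal group: the deletion xentry, the bare deletion inode, and the updated host inode. A minimal user-space sketch; the node sizes here are placeholders for illustration, not taken from ubifs-media.h:

#include <stdio.h>

/* 8-byte alignment, the same formula the kernel's ALIGN(x, 8) expands to. */
#define ALIGN8(x) (((x) + 7) & ~7)

int main(void)
{
	/* Placeholder node sizes, for illustration only. */
	int dent_sz = 56, ino_sz = 160;

	int name_len = 8;                       /* e.g. the xattr name "user.foo"       */
	int host_data_len = 32;                 /* xattr names cached in the host inode */

	int xlen = dent_sz + name_len + 1;      /* deletion xentry incl. trailing '\0'  */
	int hlen = host_data_len + ino_sz;      /* updated host inode                   */
	int len = ALIGN8(xlen) + ino_sz + ALIGN8(hlen);    /* whole journal group       */

	printf("xlen %d (aligned %d), hlen %d, group length %d\n",
	       xlen, ALIGN8(xlen), hlen, len);
	return 0;
}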
+ */ + xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + aligned_xlen = ALIGN(xlen, 8); + hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; + len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); + + xent = kmalloc(len, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) { + kfree(xent); + return err; + } + + xent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &xent_key, host->i_ino, nm); + key_write(c, &xent_key, xent->key); + xent->inum = 0; + xent->type = get_dent_type(inode->i_mode); + xent->nlen = cpu_to_le16(nm->len); + memcpy(xent->name, nm->name, nm->len); + xent->name[nm->len] = '\0'; + zero_dent_node_unused(xent); + ubifs_prep_grp_node(c, xent, xlen, 0); + + ino = (void *)xent + aligned_xlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + UBIFS_INO_NODE_SZ; + pack_inode(c, ino, host, 1); + + err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); + if (!sync && !err) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino); + release_head(c, BASEHD); + kfree(xent); + if (err) + goto out_ro; + + /* Remove the extended attribute entry from TNC */ + err = ubifs_tnc_remove_nm(c, &xent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, xlen); + if (err) + goto out_ro; + + /* + * Remove all nodes belonging to the extended attribute inode from TNC. + * Well, there actually must be only one node - the inode itself. + */ + lowest_ino_key(c, &key1, inode->i_ino); + highest_ino_key(c, &key2, inode->i_ino); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + /* And update TNC with the new host inode position */ + ino_key_init(c, &key1, host->i_ino); + err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @inode: extended attribute inode + * @host: host inode + * + * This function writes the updated version of an extended attribute inode and + * the host inode to the journal (to the base head). The host inode is written + * after the extended attribute inode in order to guarantee that the extended + * attribute will be flushed when the inode is synchronized by 'fsync()' and + * consequently, the write-buffer is synchronized. This function returns zero + * in case of success and a negative error code in case of failure. 
+ */ +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, + const struct inode *host) +{ + int err, len1, len2, aligned_len, aligned_len1, lnum, offs; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_ino_node *ino; + union ubifs_key key; + int sync = IS_DIRSYNC(host); + + dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + ubifs_assert(host->i_nlink > 0); + ubifs_assert(inode->i_nlink > 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + len1 = UBIFS_INO_NODE_SZ + host_ui->data_len; + len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len; + aligned_len1 = ALIGN(len1, 8); + aligned_len = aligned_len1 + ALIGN(len2, 8); + + ino = kmalloc(aligned_len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, aligned_len); + if (err) + goto out_free; + + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); + + err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); + if (!sync && !err) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + } + release_head(c, BASEHD); + if (err) + goto out_ro; + + ino_key_init(c, &key, host->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len1); + if (err) + goto out_ro; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + kfree(ino); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h new file mode 100644 index 00000000..92a8491a --- /dev/null +++ b/fs/ubifs/key.h @@ -0,0 +1,548 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This header contains various key-related definitions and helper function. + * UBIFS allows several key schemes, so we access key fields only via these + * helpers. At the moment only one key scheme is supported. + * + * Simple key scheme + * ~~~~~~~~~~~~~~~~~ + * + * Keys are 64-bits long. First 32-bits are inode number (parent inode number + * in case of direntry key). Next 3 bits are node type. The last 29 bits are + * 4KiB offset in case of inode node, and direntry hash in case of a direntry + * node. We use "r5" hash borrowed from reiserfs. + */ + +#ifndef __UBIFS_KEY_H__ +#define __UBIFS_KEY_H__ + +/** + * key_mask_hash - mask a valid hash value. 
+ * @val: value to be masked + * + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This + * function makes sure the reserved values are not used. + */ +static inline uint32_t key_mask_hash(uint32_t hash) +{ + hash &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(hash <= 2)) + hash += 3; + return hash; +} + +/** + * key_r5_hash - R5 hash function (borrowed from reiserfs). + * @s: direntry name + * @len: name length + */ +static inline uint32_t key_r5_hash(const char *s, int len) +{ + uint32_t a = 0; + const signed char *str = (const signed char *)s; + + while (*str) { + a += *str << 4; + a += *str >> 4; + a *= 11; + str++; + } + + return key_mask_hash(a); +} + +/** + * key_test_hash - testing hash function. + * @str: direntry name + * @len: name length + */ +static inline uint32_t key_test_hash(const char *str, int len) +{ + uint32_t a = 0; + + len = min_t(uint32_t, len, 4); + memcpy(&a, str, len); + return key_mask_hash(a); +} + +/** + * ino_key_init - initialize inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void ino_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * ino_key_init_flash - initialize on-flash inode key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + */ +static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum) +{ + union ubifs_key *key = k; + + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_ino_key - get the lowest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void lowest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0; +} + +/** + * highest_ino_key - get the highest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0xffffffff; +} + +/** + * dent_key_init - initialize directory entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_hash - initialize directory entry key without re-calculating + * hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @hash: direntry name hash + */ +static inline void dent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_flash - initialize on-flash directory entry key. 
+ * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_dent_key - get the lowest possible directory entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: parent inode number + */ +static inline void lowest_dent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * xent_key_init - initialize extended attribute entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_flash - initialize on-flash extended attribute entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_xent_key - get the lowest possible extended attribute entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: host inode number + */ +static inline void lowest_xent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * data_key_init - initialize data key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + unsigned int block) +{ + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->u32[0] = inum; + key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS); +} + +/** + * highest_data_key - get the highest possible data key for an inode. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_data_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK); +} + +/** + * trun_key_init - initialize truncation node key. 
+ * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * + * Note, UBIFS does not have truncation keys on the media and this function is + * only used for purposes of replay. + */ +static inline void trun_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * invalid_key_init - initialize invalid node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * + * This is a helper function which marks a @key object as invalid. + */ +static inline void invalid_key_init(const struct ubifs_info *c, + union ubifs_key *key) +{ + key->u32[0] = 0xDEADBEAF; + key->u32[1] = UBIFS_INVALID_KEY; +} + +/** + * key_type - get key type. + * @c: UBIFS file-system description object + * @key: key to get type of + */ +static inline int key_type(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type_flash - get type of a on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to get type of + */ +static inline int key_type_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_inum - fetch inode number from key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return key->u32[0]; +} + +/** + * key_inum_flash - fetch inode number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[0]); +} + +/** + * key_hash - get directory entry hash. + * @c: UBIFS file-system description object + * @key: the key to get hash from + */ +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_hash_flash - get directory entry hash from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get hash from + */ +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_block - get data block number. + * @c: UBIFS file-system description object + * @key: the key to get the block number from + */ +static inline unsigned int key_block(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_block_flash - get data block number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get the block number from + */ +static inline unsigned int key_block_flash(const struct ubifs_info *c, + const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_read - transform a key to in-memory format. 
+ * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_read(const struct ubifs_info *c, const void *from, + union ubifs_key *to) +{ + const union ubifs_key *f = from; + + to->u32[0] = le32_to_cpu(f->j32[0]); + to->u32[1] = le32_to_cpu(f->j32[1]); +} + +/** + * key_write - transform a key from in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); + memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * key_write_idx - transform a key from in-memory format for the index. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write_idx(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); +} + +/** + * key_copy - copy a key. + * @c: UBIFS file-system description object + * @from: the key to copy from + * @to: the key to copy to + */ +static inline void key_copy(const struct ubifs_info *c, + const union ubifs_key *from, union ubifs_key *to) +{ + to->u64[0] = from->u64[0]; +} + +/** + * keys_cmp - compare keys. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %-1 if @key1 is less than + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. + */ +static inline int keys_cmp(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] < key2->u32[0]) + return -1; + if (key1->u32[0] > key2->u32[0]) + return 1; + if (key1->u32[1] < key2->u32[1]) + return -1; + if (key1->u32[1] > key2->u32[1]) + return 1; + + return 0; +} + +/** + * keys_eq - determine if keys are equivalent. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** + * is_hash_key - is a key vulnerable to hash collisions. + * @c: UBIFS file-system description object + * @key: key + * + * This function returns %1 if @key is a hashed key or %0 otherwise. + */ +static inline int is_hash_key(const struct ubifs_info *c, + const union ubifs_key *key) +{ + int type = key_type(c, key); + + return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY; +} + +/** + * key_max_inode_size - get maximum file size allowed by current key format. 
+ * @c: UBIFS file-system description object + */ +static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) +{ + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE; + default: + return 0; + } +} + +#endif /* !__UBIFS_KEY_H__ */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c new file mode 100644 index 00000000..affea949 --- /dev/null +++ b/fs/ubifs/log.c @@ -0,0 +1,773 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file is a part of UBIFS journal implementation and contains various + * functions which manipulate the log. The log is a fixed area on the flash + * which does not contain any data but refers to buds. The log is a part of the + * journal. + */ + +#include "ubifs.h" + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_bud_bytes(struct ubifs_info *c); +#else +#define dbg_check_bud_bytes(c) 0 +#endif + +/** + * ubifs_search_bud - search bud LEB. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This function searches bud LEB @lnum. Returns bud description object in case + * of success and %NULL if there is no bud with this LEB number. + */ +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + spin_unlock(&c->buds_lock); + return bud; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This functions returns the wbuf for @lnum or %NULL if there is not one. + */ +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + int jhead; + + if (!c->jheads) + return NULL; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + jhead = bud->jhead; + spin_unlock(&c->buds_lock); + return &c->jheads[jhead].wbuf; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * empty_log_bytes - calculate amount of empty space in the log. 
+ * @c: UBIFS file-system description object + */ +static inline long long empty_log_bytes(const struct ubifs_info *c) +{ + long long h, t; + + h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; + t = (long long)c->ltail_lnum * c->leb_size; + + if (h >= t) + return c->log_bytes - h + t; + else + return t - h; +} + +/** + * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list. + * @c: UBIFS file-system description object + * @bud: the bud to add + */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct rb_node **p, *parent = NULL; + struct ubifs_bud *b; + struct ubifs_jhead *jhead; + + spin_lock(&c->buds_lock); + p = &c->buds.rb_node; + while (*p) { + parent = *p; + b = rb_entry(parent, struct ubifs_bud, rb); + ubifs_assert(bud->lnum != b->lnum); + if (bud->lnum < b->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bud->rb, parent, p); + rb_insert_color(&bud->rb, &c->buds); + if (c->jheads) { + jhead = &c->jheads[bud->jhead]; + list_add_tail(&bud->list, &jhead->buds_list); + } else + ubifs_assert(c->replaying && c->ro_mount); + + /* + * Note, although this is a new bud, we anyway account this space now, + * before any data has been written to it, because this is about to + * guarantee fixed mount time, and this bud will anyway be read and + * scanned. + */ + c->bud_bytes += c->leb_size - bud->start; + + dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum, + bud->start, dbg_jhead(bud->jhead), c->bud_bytes); + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_add_bud_to_log - add a new bud to the log. + * @c: UBIFS file-system description object + * @jhead: journal head the bud belongs to + * @lnum: LEB number of the bud + * @offs: starting offset of the bud + * + * This function writes reference node for the new bud LEB @lnum it to the log, + * and adds it to the buds tress. It also makes sure that log size does not + * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success, + * %-EAGAIN if commit is required, and a negative error codes in case of + * failure. + */ +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) +{ + int err; + struct ubifs_bud *bud; + struct ubifs_ref_node *ref; + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS); + if (!bud) + return -ENOMEM; + ref = kzalloc(c->ref_node_alsz, GFP_NOFS); + if (!ref) { + kfree(bud); + return -ENOMEM; + } + + mutex_lock(&c->log_mutex); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) { + err = -EROFS; + goto out_unlock; + } + + /* Make sure we have enough space in the log */ + if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) { + dbg_log("not enough log space - %lld, required %d", + empty_log_bytes(c), c->min_log_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * Make sure the amount of space in buds will not exceed the + * 'c->max_bud_bytes' limit, because we want to guarantee mount time + * limits. + * + * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes + * because we are holding @c->log_mutex. All @c->bud_bytes take place + * when both @c->log_mutex and @c->bud_bytes are locked. + */ + if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) { + dbg_log("bud bytes %lld (%lld max), require commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * If the journal is full enough - start background commit. 
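For orientation, the space check above leans on empty_log_bytes(), which treats the log as a circular buffer of LEBs between the log tail and the log head. A minimal user-space sketch of that computation, with an illustrative log geometry (the real geometry comes from the superblock) and a stand-in threshold for the c->min_log_bytes check:

#include <stdio.h>

/* Illustrative log geometry: six 128KiB LEBs, numbered 0..5 for simplicity. */
#define LEB_SIZE  (128 * 1024)
#define LOG_LEBS  6
#define LOG_BYTES ((long long)LOG_LEBS * LEB_SIZE)

/* Same idea as empty_log_bytes(): head and tail positions on a circular log. */
static long long log_space_left(int lhead_lnum, int lhead_offs, int ltail_lnum)
{
	long long h = (long long)lhead_lnum * LEB_SIZE + lhead_offs;
	long long t = (long long)ltail_lnum * LEB_SIZE;

	return h >= t ? LOG_BYTES - h + t : t - h;
}

int main(void)
{
	/* Head in LEB 4 at offset 512, tail at LEB 1: free space wraps over LEBs 5 and 0. */
	long long left = log_space_left(4, 512, 1);

	printf("%lld bytes of log left\n", left);
	if (left < 1024)                 /* stand-in for the c->min_log_bytes check */
		printf("commit required before a new bud can be referenced\n");
	return 0;
}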
Note, it is + * OK to read 'c->cmt_state' without spinlock because integer reads + * are atomic in the kernel. + */ + if (c->bud_bytes >= c->bg_bud_bytes && + c->cmt_state == COMMIT_RESTING) { + dbg_log("bud bytes %lld (%lld max), initiate BG commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_request_bg_commit(c); + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(bud->lnum); + ref->offs = cpu_to_le32(bud->start); + ref->jhead = cpu_to_le32(jhead); + + if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next log LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out_unlock; + } + + if (bud->start == 0) { + /* + * Before writing the LEB reference which refers an empty LEB + * to the log, we have to make sure it is mapped, because + * otherwise we'd risk to refer an LEB with garbage in case of + * an unclean reboot, because the target LEB might have been + * unmapped, but not yet physically erased. + */ + err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + if (err) + goto out_unlock; + } + + dbg_log("write ref LEB %d:%d", + c->lhead_lnum, c->lhead_offs); + err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, + c->lhead_offs, UBI_SHORTTERM); + if (err) + goto out_unlock; + + c->lhead_offs += c->ref_node_alsz; + + ubifs_add_bud(c, bud); + + mutex_unlock(&c->log_mutex); + kfree(ref); + return 0; + +out_unlock: + if (err != -EAGAIN) + ubifs_ro_mode(c, err); + mutex_unlock(&c->log_mutex); + kfree(ref); + kfree(bud); + return err; +} + +/** + * remove_buds - remove used buds. + * @c: UBIFS file-system description object + * + * This function removes use buds from the buds tree. It does not remove the + * buds which are pointed to by journal heads. + */ +static void remove_buds(struct ubifs_info *c) +{ + struct rb_node *p; + + ubifs_assert(list_empty(&c->old_buds)); + c->cmt_bud_bytes = 0; + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct rb_node *p1 = p; + struct ubifs_bud *bud; + struct ubifs_wbuf *wbuf; + + p = rb_next(p); + bud = rb_entry(p1, struct ubifs_bud, rb); + wbuf = &c->jheads[bud->jhead].wbuf; + + if (wbuf->lnum == bud->lnum) { + /* + * Do not remove buds which are pointed to by journal + * heads (non-closed buds). + */ + c->cmt_bud_bytes += wbuf->offs - bud->start; + dbg_log("preserve %d:%d, jhead %s, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + dbg_jhead(bud->jhead), wbuf->offs - bud->start, + c->cmt_bud_bytes); + bud->start = wbuf->offs; + } else { + c->cmt_bud_bytes += c->leb_size - bud->start; + dbg_log("remove %d:%d, jhead %s, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + dbg_jhead(bud->jhead), c->leb_size - bud->start, + c->cmt_bud_bytes); + rb_erase(p1, &c->buds); + /* + * If the commit does not finish, the recovery will need + * to replay the journal, in which case the old buds + * must be unchanged. Do not release them until post + * commit i.e. do not allow them to be garbage + * collected. + */ + list_move(&bud->list, &c->old_buds); + } + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_log_start_commit - start commit. 
+ * @c: UBIFS file-system description object + * @ltail_lnum: return new log tail LEB number + * + * The commit operation starts with writing "commit start" node to the log and + * reference nodes for all journal heads which will define new journal after + * the commit has been finished. The commit start and reference nodes are + * written in one go to the nearest empty log LEB (hence, when commit is + * finished UBIFS may safely unmap all the previous log LEBs). This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) +{ + void *buf; + struct ubifs_cs_node *cs; + struct ubifs_ref_node *ref; + int err, i, max_len, len; + + err = dbg_check_bud_bytes(c); + if (err) + return err; + + max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ; + max_len = ALIGN(max_len, c->min_io_size); + buf = cs = kmalloc(max_len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + cs->cmt_no = cpu_to_le64(c->cmt_no); + ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); + + /* + * Note, we do not lock 'c->log_mutex' because this is the commit start + * phase and we are exclusively using the log. And we do not lock + * write-buffer because nobody can write to the file-system at this + * phase. + */ + + len = UBIFS_CS_NODE_SZ; + for (i = 0; i < c->jhead_cnt; i++) { + int lnum = c->jheads[i].wbuf.lnum; + int offs = c->jheads[i].wbuf.offs; + + if (lnum == -1 || offs == c->leb_size) + continue; + + dbg_log("add ref to LEB %d:%d for jhead %s", + lnum, offs, dbg_jhead(i)); + ref = buf + len; + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(lnum); + ref->offs = cpu_to_le32(offs); + ref->jhead = cpu_to_le32(i); + + ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0); + len += UBIFS_REF_NODE_SZ; + } + + ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len); + + /* Switch to the next log LEB */ + if (c->lhead_offs) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; + } + + len = ALIGN(len, c->min_io_size); + dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + if (err) + goto out; + + *ltail_lnum = c->lhead_lnum; + + c->lhead_offs += len; + if (c->lhead_offs == c->leb_size) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + remove_buds(c); + + /* + * We have started the commit and now users may use the rest of the log + * for new writes. + */ + c->min_log_bytes = 0; + +out: + kfree(buf); + return err; +} + +/** + * ubifs_log_end_commit - end commit. + * @c: UBIFS file-system description object + * @ltail_lnum: new log tail LEB number + * + * This function is called on when the commit operation was finished. It + * moves log tail to new position and unmaps LEBs which contain obsolete data. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) +{ + int err; + + /* + * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS + * writes during commit. Its only short "commit" start phase when + * writers are blocked. 
+ */ + mutex_lock(&c->log_mutex); + + dbg_log("old tail was LEB %d:0, new tail is LEB %d:0", + c->ltail_lnum, ltail_lnum); + + c->ltail_lnum = ltail_lnum; + /* + * The commit is finished and from now on it must be guaranteed that + * there is always enough space for the next commit. + */ + c->min_log_bytes = c->leb_size; + + spin_lock(&c->buds_lock); + c->bud_bytes -= c->cmt_bud_bytes; + spin_unlock(&c->buds_lock); + + err = dbg_check_bud_bytes(c); + + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * ubifs_log_post_commit - things to do after commit is completed. + * @c: UBIFS file-system description object + * @old_ltail_lnum: old log tail LEB number + * + * Release buds only after commit is completed, because they must be unchanged + * if recovery is needed. + * + * Unmap log LEBs only after commit is completed, because they may be needed for + * recovery. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) +{ + int lnum, err = 0; + + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + err = ubifs_return_leb(c, bud->lnum); + if (err) + return err; + list_del(&bud->list); + kfree(bud); + } + mutex_lock(&c->log_mutex); + for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; + lnum = ubifs_next_log_lnum(c, lnum)) { + dbg_log("unmap log LEB %d", lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + } +out: + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * struct done_ref - references that have been done. + * @rb: rb-tree node + * @lnum: LEB number + */ +struct done_ref { + struct rb_node rb; + int lnum; +}; + +/** + * done_already - determine if a reference has been done already. + * @done_tree: rb-tree to store references that have been done + * @lnum: LEB number of reference + * + * This function returns %1 if the reference has been done, %0 if not, otherwise + * a negative error code is returned. + */ +static int done_already(struct rb_root *done_tree, int lnum) +{ + struct rb_node **p = &done_tree->rb_node, *parent = NULL; + struct done_ref *dr; + + while (*p) { + parent = *p; + dr = rb_entry(parent, struct done_ref, rb); + if (lnum < dr->lnum) + p = &(*p)->rb_left; + else if (lnum > dr->lnum) + p = &(*p)->rb_right; + else + return 1; + } + + dr = kzalloc(sizeof(struct done_ref), GFP_NOFS); + if (!dr) + return -ENOMEM; + + dr->lnum = lnum; + + rb_link_node(&dr->rb, parent, p); + rb_insert_color(&dr->rb, done_tree); + + return 0; +} + +/** + * destroy_done_tree - destroy the done tree. + * @done_tree: done tree to destroy + */ +static void destroy_done_tree(struct rb_root *done_tree) +{ + struct rb_node *this = done_tree->rb_node; + struct done_ref *dr; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + dr = rb_entry(this, struct done_ref, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &dr->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(dr); + } +} + +/** + * add_node - add a node to the consolidated log. + * @c: UBIFS file-system description object + * @buf: buffer to which to add + * @lnum: LEB number to which to write is passed and returned here + * @offs: offset to where to write is passed and returned here + * @node: node to add + * + * This function returns %0 on success and a negative error code on failure. 
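A simplified user-space rendering of the same flow may help: records are appended 8-byte aligned, and when one does not fit into the remaining space of the current LEB, the buffer is padded up to the minimal I/O unit, written out, and the next LEB is started. The names and geometry below are illustrative, not the kernel's:

#include <stdio.h>
#include <string.h>

/* Illustrative geometry; the real values come from UBI (c->leb_size, c->min_io_size). */
#define LEB_SIZE    (128 * 1024)
#define MIN_IO_SIZE 2048

#define ALIGN_TO(x, a) (((x) + (a) - 1) & ~((a) - 1))

static char buf[LEB_SIZE];
static int cur_lnum = 10, cur_offs;

/* Stand-in for ubifs_leb_change(): just report what would be written. */
static void leb_change(int lnum, int bytes)
{
	printf("write %d bytes to LEB %d\n", bytes, lnum);
}

/* Same flow as add_node(): flush and move on when the record does not fit. */
static void add_record(const void *node, int len)
{
	if (len > LEB_SIZE - cur_offs) {
		int sz = ALIGN_TO(cur_offs, MIN_IO_SIZE);

		memset(buf + cur_offs, 0, sz - cur_offs); /* the kernel writes proper padding here     */
		leb_change(cur_lnum, sz);
		cur_lnum++;                               /* the kernel wraps via ubifs_next_log_lnum() */
		cur_offs = 0;
	}
	memcpy(buf + cur_offs, node, len);
	cur_offs += ALIGN_TO(len, 8);                     /* records stay 8-byte aligned */
}

int main(void)
{
	char node[8000] = { 0 };
	int i;

	for (i = 0; i < 40; i++)                          /* 40 x 8000 bytes does not fit in one LEB */
		add_record(node, sizeof(node));
	printf("consolidated head would be at LEB %d:%d\n", cur_lnum, cur_offs);
	return 0;
}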
+ */ +static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, + void *node) +{ + struct ubifs_ch *ch = node; + int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs; + + if (len > remains) { + int sz = ALIGN(*offs, c->min_io_size), err; + + ubifs_pad(c, buf + *offs, sz - *offs); + err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + if (err) + return err; + *lnum = ubifs_next_log_lnum(c, *lnum); + *offs = 0; + } + memcpy(buf + *offs, node, len); + *offs += ALIGN(len, 8); + return 0; +} + +/** + * ubifs_consolidate_log - consolidate the log. + * @c: UBIFS file-system description object + * + * Repeated failed commits could cause the log to be full, but at least 1 LEB is + * needed for commit. This function rewrites the reference nodes in the log + * omitting duplicates, and failed CS nodes, and leaving no gaps. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_consolidate_log(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct rb_root done_tree = RB_ROOT; + int lnum, err, first = 1, write_lnum, offs = 0; + void *buf; + + dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum, + c->lhead_lnum); + buf = vmalloc(c->leb_size); + if (!buf) + return -ENOMEM; + lnum = c->ltail_lnum; + write_lnum = lnum; + while (1) { + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + goto out_free; + } + list_for_each_entry(snod, &sleb->nodes, list) { + switch (snod->type) { + case UBIFS_REF_NODE: { + struct ubifs_ref_node *ref = snod->node; + int ref_lnum = le32_to_cpu(ref->lnum); + + err = done_already(&done_tree, ref_lnum); + if (err < 0) + goto out_scan; + if (err != 1) { + err = add_node(c, buf, &write_lnum, + &offs, snod->node); + if (err) + goto out_scan; + } + break; + } + case UBIFS_CS_NODE: + if (!first) + break; + err = add_node(c, buf, &write_lnum, &offs, + snod->node); + if (err) + goto out_scan; + first = 0; + break; + } + } + ubifs_scan_destroy(sleb); + if (lnum == c->lhead_lnum) + break; + lnum = ubifs_next_log_lnum(c, lnum); + } + if (offs) { + int sz = ALIGN(offs, c->min_io_size); + + ubifs_pad(c, buf + offs, sz - offs); + err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + if (err) + goto out_free; + offs = ALIGN(offs, c->min_io_size); + } + destroy_done_tree(&done_tree); + vfree(buf); + if (write_lnum == c->lhead_lnum) { + ubifs_err("log is too full"); + return -EINVAL; + } + /* Unmap remaining LEBs */ + lnum = write_lnum; + do { + lnum = ubifs_next_log_lnum(c, lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } while (lnum != c->lhead_lnum); + c->lhead_lnum = write_lnum; + c->lhead_offs = offs; + dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs); + return 0; + +out_scan: + ubifs_scan_destroy(sleb); +out_free: + destroy_done_tree(&done_tree); + vfree(buf); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_bud_bytes - make sure bud bytes calculation are all right. + * @c: UBIFS file-system description object + * + * This function makes sure the amount of flash space used by closed buds + * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in + * case of failure. 
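For orientation, the accounting being verified is simply the sum of the still-writable space of every open bud, i.e. leb_size - bud->start per bud LEB. A minimal user-space sketch with an illustrative LEB size:

#include <stdio.h>

#define LEB_SIZE (128 * 1024)   /* illustrative LEB size */

struct bud { int lnum; int start; };

int main(void)
{
	/* Three open buds; 'start' is where journal data begins inside each bud LEB. */
	struct bud buds[] = { { 20, 0 }, { 21, 4096 }, { 22, 0 } };
	long long bud_bytes = 0;
	unsigned int i;

	for (i = 0; i < sizeof(buds) / sizeof(buds[0]); i++)
		bud_bytes += LEB_SIZE - buds[i].start;

	printf("c->bud_bytes must equal %lld here\n", bud_bytes);
	return 0;
}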
+ */ +static int dbg_check_bud_bytes(struct ubifs_info *c) +{ + int i, err = 0; + struct ubifs_bud *bud; + long long bud_bytes = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + spin_lock(&c->buds_lock); + for (i = 0; i < c->jhead_cnt; i++) + list_for_each_entry(bud, &c->jheads[i].buds_list, list) + bud_bytes += c->leb_size - bud->start; + + if (c->bud_bytes != bud_bytes) { + ubifs_err("bad bud_bytes %lld, calculated %lld", + c->bud_bytes, bud_bytes); + err = -EINVAL; + } + spin_unlock(&c->buds_lock); + + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c new file mode 100644 index 00000000..667884f4 --- /dev/null +++ b/fs/ubifs/lprops.c @@ -0,0 +1,1319 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the functions that access LEB properties and their + * categories. LEBs are categorized based on the needs of UBIFS, and the + * categories are stored as either heaps or lists to provide a fast way of + * finding a LEB in a particular category. For example, UBIFS may need to find + * an empty LEB for the journal, or a very dirty LEB for garbage collection. + */ + +#include "ubifs.h" + +/** + * get_heap_comp_val - get the LEB properties value for heap comparisons. + * @lprops: LEB properties + * @cat: LEB category + */ +static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_FREE: + return lprops->free; + case LPROPS_DIRTY_IDX: + return lprops->free + lprops->dirty; + default: + return lprops->dirty; + } +} + +/** + * move_up_lpt_heap - move a new heap entry up as far as possible. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @cat: LEB category + * + * New entries to a heap are added at the bottom and then moved up until the + * parent's value is greater. In the case of LPT's category heaps, the value + * is either the amount of free space or the amount of dirty space, depending + * on the category. + */ +static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int cat) +{ + int val1, val2, hpos; + + hpos = lprops->hpos; + if (!hpos) + return; /* Already top of the heap */ + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater, move up the heap */ + do { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val2 >= val1) + return; + /* Greater than parent so move up */ + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + } while (hpos); +} + +/** + * adjust_lpt_heap - move a changed heap entry up or down the heap. 
+ * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @hpos: heap position of @lprops + * @cat: LEB category + * + * Changed entries in a heap are moved up or down until the parent's value is + * greater. In the case of LPT's category heaps, the value is either the amount + * of free space or the amount of dirty space, depending on the category. + */ +static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int hpos, int cat) +{ + int val1, val2, val3, cpos; + + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater than parent, move up the heap */ + if (hpos) { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 > val2) { + /* Greater than parent so move up */ + while (1) { + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + if (!hpos) + return; + ppos = (hpos - 1) / 2; + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 <= val2) + return; + /* Still greater than parent so keep going */ + } + } + } + + /* Not greater than parent, so compare to children */ + while (1) { + /* Compare to left child */ + cpos = hpos * 2 + 1; + if (cpos >= heap->cnt) + return; + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val2) { + /* Less than left child, so promote biggest child */ + if (cpos + 1 < heap->cnt) { + val3 = get_heap_comp_val(heap->arr[cpos + 1], + cat); + if (val3 > val2) + cpos += 1; /* Right child is bigger */ + } + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + /* Compare to right child */ + cpos += 1; + if (cpos >= heap->cnt) + return; + val3 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val3) { + /* Less than right child, so promote right child */ + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + return; + } +} + +/** + * add_to_lpt_heap - add LEB properties to a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category + * + * This function returns %1 if @lprops is added to the heap for LEB category + * @cat, otherwise %0 is returned because the heap is full. 
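The heap code above is an ordinary array-backed binary heap keyed on free or dirty space, where the parent of slot hpos is (hpos - 1) / 2. A minimal user-space sketch of the "move up" step, using toy values rather than UBIFS structures, which also shows why keeping these categories as heaps makes "find the best LEB" a single lookup at slot 0:

#include <stdio.h>

/* A toy max-heap of "free space" values using the same index arithmetic:
 * the parent of slot hpos is (hpos - 1) / 2. */
static int heap[16];
static int cnt;

static void sift_up(int hpos)
{
	int ppos, tmp;

	while (hpos) {
		ppos = (hpos - 1) / 2;
		if (heap[ppos] >= heap[hpos])
			break;                  /* parent is bigger: heap property holds    */
		tmp = heap[ppos];               /* otherwise swap with the parent and go up */
		heap[ppos] = heap[hpos];
		heap[hpos] = tmp;
		hpos = ppos;
	}
}

int main(void)
{
	int vals[] = { 4096, 65536, 2048, 126976 };
	int i;

	for (i = 0; i < 4; i++) {
		heap[cnt] = vals[i];            /* new entries start at the bottom ...          */
		sift_up(cnt++);                 /* ... and move up while bigger than the parent */
	}
	/* Slot 0 now holds the entry with the most free space, which is why the
	 * heap categories allow a cheap "find best candidate" operation. */
	printf("best candidate: %d bytes free\n", heap[0]);
	return 0;
}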
+ */ +static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if (heap->cnt >= heap->max_cnt) { + const int b = LPT_HEAP_SZ / 2 - 1; + int cpos, val1, val2; + + /* Compare to some other LEB on the bottom of heap */ + /* Pick a position kind of randomly */ + cpos = (((size_t)lprops >> 4) & b) + b; + ubifs_assert(cpos >= b); + ubifs_assert(cpos < LPT_HEAP_SZ); + ubifs_assert(cpos < heap->cnt); + + val1 = get_heap_comp_val(lprops, cat); + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 > val2) { + struct ubifs_lprops *lp; + + lp = heap->arr[cpos]; + lp->flags &= ~LPROPS_CAT_MASK; + lp->flags |= LPROPS_UNCAT; + list_add(&lp->list, &c->uncat_list); + lprops->hpos = cpos; + heap->arr[cpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } + dbg_check_heap(c, heap, cat, -1); + return 0; /* Not added to heap */ + } else { + lprops->hpos = heap->cnt++; + heap->arr[lprops->hpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } +} + +/** + * remove_from_lpt_heap - remove LEB properties from a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category + */ +static void remove_from_lpt_heap(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + ubifs_assert(hpos >= 0 && hpos < heap->cnt); + ubifs_assert(heap->arr[hpos] == lprops); + heap->cnt -= 1; + if (hpos < heap->cnt) { + heap->arr[hpos] = heap->arr[heap->cnt]; + heap->arr[hpos]->hpos = hpos; + adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); + } + dbg_check_heap(c, heap, cat, -1); +} + +/** + * lpt_heap_replace - replace lprops in a category heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * @cat: LEB category + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * the category heaps to those lprops must be updated to point to the new + * lprops. This function does that. + */ +static void lpt_heap_replace(struct ubifs_info *c, + struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = new_lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + heap->arr[hpos] = new_lprops; +} + +/** + * ubifs_add_to_cat - add LEB properties to a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category to which to add + * + * LEB properties are categorized to enable fast find operations. 
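For orientation, here is a cut-down user-space sketch of the categorization rules that ubifs_categorize_lprops() implements further down. It is restricted to non-index LEBs, uses illustrative geometry and watermarks, and omits the LPROPS_TAKEN and FRDI_IDX/DIRTY_IDX cases:

#include <stdio.h>

/* Illustrative geometry and watermark; the real values are computed at mount time. */
#define LEB_SIZE 131072
#define DEAD_WM  2048

/* Cut-down rules for non-index LEBs only. */
static const char *categorize(int free, int dirty)
{
	if (free == LEB_SIZE)
		return "LPROPS_EMPTY";
	if (free + dirty == LEB_SIZE)
		return "LPROPS_FREEABLE";
	if (dirty >= DEAD_WM && dirty > free)
		return "LPROPS_DIRTY";
	if (free > 0)
		return "LPROPS_FREE";
	return "LPROPS_UNCAT";
}

int main(void)
{
	printf("%s\n", categorize(131072, 0));      /* freshly unmapped LEB                  */
	printf("%s\n", categorize(1024, 130048));   /* nothing valid left, can be made empty */
	printf("%s\n", categorize(512, 100000));    /* mostly garbage: good GC candidate     */
	printf("%s\n", categorize(60000, 1024));    /* plenty of room for new journal data   */
	return 0;
}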
+ */ +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + if (add_to_lpt_heap(c, lprops, cat)) + break; + /* No more room on heap so make it un-categorized */ + cat = LPROPS_UNCAT; + /* Fall through */ + case LPROPS_UNCAT: + list_add(&lprops->list, &c->uncat_list); + break; + case LPROPS_EMPTY: + list_add(&lprops->list, &c->empty_list); + break; + case LPROPS_FREEABLE: + list_add(&lprops->list, &c->freeable_list); + c->freeable_cnt += 1; + break; + case LPROPS_FRDI_IDX: + list_add(&lprops->list, &c->frdi_idx_list); + break; + default: + ubifs_assert(0); + } + lprops->flags &= ~LPROPS_CAT_MASK; + lprops->flags |= cat; +} + +/** + * ubifs_remove_from_cat - remove LEB properties from a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category from which to remove + * + * LEB properties are categorized to enable fast find operations. + */ +static void ubifs_remove_from_cat(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + remove_from_lpt_heap(c, lprops, cat); + break; + case LPROPS_FREEABLE: + c->freeable_cnt -= 1; + ubifs_assert(c->freeable_cnt >= 0); + /* Fall through */ + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FRDI_IDX: + ubifs_assert(!list_empty(&lprops->list)); + list_del(&lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_replace_cat - replace lprops in a category list or heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops) +{ + int cat; + + cat = new_lprops->flags & LPROPS_CAT_MASK; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + lpt_heap_replace(c, old_lprops, new_lprops, cat); + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_replace(&old_lprops->list, &new_lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_ensure_cat - ensure LEB properties are categorized. + * @c: UBIFS file-system description object + * @lprops: LEB properties + * + * A LEB may have fallen off of the bottom of a heap, and ended up as + * un-categorized even though it has enough space for us now. If that is the + * case this function will put the LEB back onto a heap. + */ +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int cat = lprops->flags & LPROPS_CAT_MASK; + + if (cat != LPROPS_UNCAT) + return; + cat = ubifs_categorize_lprops(c, lprops); + if (cat == LPROPS_UNCAT) + return; + ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); + ubifs_add_to_cat(c, lprops, cat); +} + +/** + * ubifs_categorize_lprops - categorize LEB properties. + * @c: UBIFS file-system description object + * @lprops: LEB properties to categorize + * + * LEB properties are categorized to enable fast find operations. This function + * returns the LEB category to which the LEB properties belong. 
Note however + * that if the LEB category is stored as a heap and the heap is full, the + * LEB properties may have their category changed to %LPROPS_UNCAT. + */ +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops) +{ + if (lprops->flags & LPROPS_TAKEN) + return LPROPS_UNCAT; + + if (lprops->free == c->leb_size) { + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return LPROPS_EMPTY; + } + + if (lprops->free + lprops->dirty == c->leb_size) { + if (lprops->flags & LPROPS_INDEX) + return LPROPS_FRDI_IDX; + else + return LPROPS_FREEABLE; + } + + if (lprops->flags & LPROPS_INDEX) { + if (lprops->dirty + lprops->free >= c->min_idx_node_sz) + return LPROPS_DIRTY_IDX; + } else { + if (lprops->dirty >= c->dead_wm && + lprops->dirty > lprops->free) + return LPROPS_DIRTY; + if (lprops->free > 0) + return LPROPS_FREE; + } + + return LPROPS_UNCAT; +} + +/** + * change_category - change LEB properties category. + * @c: UBIFS file-system description object + * @lprops: LEB properties to re-categorize + * + * LEB properties are categorized to enable fast find operations. When the LEB + * properties change they must be re-categorized. + */ +static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int old_cat = lprops->flags & LPROPS_CAT_MASK; + int new_cat = ubifs_categorize_lprops(c, lprops); + + if (old_cat == new_cat) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; + + /* lprops on a heap now must be moved up or down */ + if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) + return; /* Not on a heap */ + heap = &c->lpt_heap[new_cat - 1]; + adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat); + } else { + ubifs_remove_from_cat(c, lprops, old_cat); + ubifs_add_to_cat(c, lprops, new_cat); + } +} + +/** + * ubifs_calc_dark - calculate LEB dark space size. + * @c: the UBIFS file-system description object + * @spc: amount of free and dirty space in the LEB + * + * This function calculates and returns amount of dark space in an LEB which + * has @spc bytes of free and dirty space. + * + * UBIFS is trying to account the space which might not be usable, and this + * space is called "dark space". For example, if an LEB has only %512 free + * bytes, it is dark space, because it cannot fit a large data node. + */ +int ubifs_calc_dark(const struct ubifs_info *c, int spc) +{ + ubifs_assert(!(spc & 7)); + + if (spc < c->dark_wm) + return spc; + + /* + * If we have slightly more space then the dark space watermark, we can + * anyway safely assume it we'll be able to write a node of the + * smallest size there. + */ + if (spc - c->dark_wm < MIN_WRITE_SZ) + return spc - MIN_WRITE_SZ; + + return c->dark_wm; +} + +/** + * is_lprops_dirty - determine if LEB properties are dirty. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to test + */ +static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + struct ubifs_pnode *pnode; + int pos; + + pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1); + pnode = (struct ubifs_pnode *)container_of(lprops - pos, + struct ubifs_pnode, + lprops[0]); + return !test_bit(COW_ZNODE, &pnode->flags) && + test_bit(DIRTY_CNODE, &pnode->flags); +} + +/** + * ubifs_change_lp - change LEB properties. 
+ * @c: the UBIFS file-system description object + * @lp: LEB properties to change + * @free: new free space amount + * @dirty: new dirty space amount + * @flags: new flags + * @idx_gc_cnt: change to the count of @idx_gc list + * + * This function changes LEB properties (@free, @dirty or @flag). However, the + * property which has the %LPROPS_NC value is not changed. Returns a pointer to + * the updated LEB properties on success and a negative error code on failure. + * + * Note, the LEB properties may have had to be copied (due to COW) and + * consequently the pointer returned may not be the same as the pointer + * passed. + */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt) +{ + /* + * This is the only function that is allowed to change lprops, so we + * discard the "const" qualifier. + */ + struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; + + dbg_lp("LEB %d, free %d, dirty %d, flags %d", + lprops->lnum, free, dirty, flags); + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + ubifs_assert(c->freeable_cnt >= 0); + ubifs_assert(c->freeable_cnt <= c->main_lebs); + ubifs_assert(c->lst.taken_empty_lebs >= 0); + ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs); + ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7)); + ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7)); + ubifs_assert(!(c->lst.total_used & 7)); + ubifs_assert(free == LPROPS_NC || free >= 0); + ubifs_assert(dirty == LPROPS_NC || dirty >= 0); + + if (!is_lprops_dirty(c, lprops)) { + lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum); + if (IS_ERR(lprops)) + return lprops; + } else + ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum)); + + ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); + + spin_lock(&c->space_lock); + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs -= 1; + + if (!(lprops->flags & LPROPS_INDEX)) { + int old_spc; + + old_spc = lprops->free + lprops->dirty; + if (old_spc < c->dead_wm) + c->lst.total_dead -= old_spc; + else + c->lst.total_dark -= ubifs_calc_dark(c, old_spc); + + c->lst.total_used -= c->leb_size - old_spc; + } + + if (free != LPROPS_NC) { + free = ALIGN(free, 8); + c->lst.total_free += free - lprops->free; + + /* Increase or decrease empty LEBs counter if needed */ + if (free == c->leb_size) { + if (lprops->free != c->leb_size) + c->lst.empty_lebs += 1; + } else if (lprops->free == c->leb_size) + c->lst.empty_lebs -= 1; + lprops->free = free; + } + + if (dirty != LPROPS_NC) { + dirty = ALIGN(dirty, 8); + c->lst.total_dirty += dirty - lprops->dirty; + lprops->dirty = dirty; + } + + if (flags != LPROPS_NC) { + /* Take care about indexing LEBs counter if needed */ + if ((lprops->flags & LPROPS_INDEX)) { + if (!(flags & LPROPS_INDEX)) + c->lst.idx_lebs -= 1; + } else if (flags & LPROPS_INDEX) + c->lst.idx_lebs += 1; + lprops->flags = flags; + } + + if (!(lprops->flags & LPROPS_INDEX)) { + int new_spc; + + new_spc = lprops->free + lprops->dirty; + if (new_spc < c->dead_wm) + c->lst.total_dead += new_spc; + else + c->lst.total_dark += ubifs_calc_dark(c, new_spc); + + c->lst.total_used += c->leb_size - new_spc; + } + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs += 1; + + change_category(c, lprops); + c->idx_gc_cnt += idx_gc_cnt; + spin_unlock(&c->space_lock); + return 
lprops; +} + +/** + * ubifs_get_lp_stats - get lprops statistics. + * @c: UBIFS file-system description object + * @st: return statistics + */ +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) +{ + spin_lock(&c->space_lock); + memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_change_one_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space + * @flags_set: flags to set + * @flags_clean: flags to clean + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes properties of LEB @lnum. It is a helper wrapper over + * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the + * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and + * a negative error code in case of failure. + */ +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err("cannot change properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_update_one_lp - update LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space to add + * @flags_set: flags to set + * @flags_clean: flags to clean + * + * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to + * current dirty space, not substitutes it. + */ +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err("cannot update properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_read_one_lp - read LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to read properties for + * @lp: where to store read properties + * + * This helper function reads properties of a LEB @lnum and stores them in @lp. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) +{ + int err = 0; + const struct ubifs_lprops *lpp; + + ubifs_get_lprops(c); + + lpp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lpp)) { + err = PTR_ERR(lpp); + ubifs_err("cannot read properties of LEB %d, error %d", + lnum, err); + goto out; + } + + memcpy(lp, lpp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fast_find_free - try to find a LEB with free space quickly. 
+ * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a LEB with free space or %NULL if + * the function is unable to find a LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + heap = &c->lpt_heap[LPROPS_FREE - 1]; + if (heap->cnt == 0) + return NULL; + + lprops = heap->arr[0]; + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_fast_find_empty - try to find an empty LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for an empty LEB or %NULL if the + * function is unable to find an empty LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->empty_list)) + return NULL; + + lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free == c->leb_size); + return lprops; +} + +/** + * ubifs_fast_find_freeable - try to find a freeable LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable LEB or %NULL if the + * function is unable to find a freeable LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->freeable_list)) + return NULL; + + lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(c->freeable_cnt > 0); + return lprops; +} + +/** + * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable index LEB or %NULL if the + * function is unable to find a freeable index LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->frdi_idx_list)) + return NULL; + + lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + return lprops; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_cats - check category heaps and lists. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
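+ *
+ * For instance, it verifies that every LEB on c->empty_list really is empty
+ * (free == leb_size), that c->freeable_cnt matches the length of
+ * c->freeable_list, and that every LPT heap entry records its own position in
+ * 'hpos'.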
+ */ +int dbg_check_cats(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct list_head *pos; + int i, cat; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return 0; + + list_for_each_entry(lprops, &c->empty_list, list) { + if (lprops->free != c->leb_size) { + ubifs_err("non-empty LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + i = 0; + list_for_each_entry(lprops, &c->freeable_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + i += 1; + } + if (i != c->freeable_cnt) { + ubifs_err("freeable list count %d expected %d", i, + c->freeable_cnt); + return -EINVAL; + } + + i = 0; + list_for_each(pos, &c->idx_gc) + i += 1; + if (i != c->idx_gc_cnt) { + ubifs_err("idx_gc list count %d expected %d", i, + c->idx_gc_cnt); + return -EINVAL; + } + + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (!(lprops->flags & LPROPS_INDEX)) { + ubifs_err("non-index LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (!lprops) { + ubifs_err("null ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->hpos != i) { + ubifs_err("bad ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB in LPT heap cat %d", cat); + return -EINVAL; + } + } + } + + return 0; +} + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos) +{ + int i = 0, j, err = 0; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return; + + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + struct ubifs_lprops *lp; + + if (i != add_pos) + if ((lprops->flags & LPROPS_CAT_MASK) != cat) { + err = 1; + goto out; + } + if (lprops->hpos != i) { + err = 2; + goto out; + } + lp = ubifs_lpt_lookup(c, lprops->lnum); + if (IS_ERR(lp)) { + err = 3; + goto out; + } + if (lprops != lp) { + dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); + err = 4; + goto out; + } + for (j = 0; j < i; j++) { + lp = heap->arr[j]; + if (lp == lprops) { + err = 5; + goto out; + } + if (lp->lnum == lprops->lnum) { + err = 6; + goto out; + } + } + } +out: + if (err) { + dbg_msg("failed 
cat %d hpos %d err %d", cat, i, err); + dbg_dump_stack(); + dbg_dump_heap(c, heap, cat); + } +} + +/** + * scan_check_cb - scan callback. + * @c: the UBIFS file-system description object + * @lp: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @lst: lprops statistics to update + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_check_cb(struct ubifs_info *c, + const struct ubifs_lprops *lp, int in_tree, + struct ubifs_lp_stats *lst) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; + + cat = lp->flags & LPROPS_CAT_MASK; + if (cat != LPROPS_UNCAT) { + cat = ubifs_categorize_lprops(c, lp); + if (cat != (lp->flags & LPROPS_CAT_MASK)) { + ubifs_err("bad LEB category %d expected %d", + (lp->flags & LPROPS_CAT_MASK), cat); + return -EINVAL; + } + } + + /* Check lp is on its category list (if it has one) */ + if (in_tree) { + struct list_head *list = NULL; + + switch (cat) { + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + } + if (list) { + struct ubifs_lprops *lprops; + int found = 0; + + list_for_each_entry(lprops, list, list) { + if (lprops == lp) { + found = 1; + break; + } + } + if (!found) { + ubifs_err("bad LPT list (category %d)", cat); + return -EINVAL; + } + } + } + + /* Check lp is on its category heap (if it has one) */ + if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || + lp != heap->arr[lp->hpos]) { + ubifs_err("bad LPT heap (category %d)", cat); + return -EINVAL; + } + } + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage - do not scan them. + */ + if (lp->free == c->leb_size) { + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ret = PTR_ERR(sleb); + if (ret == -EUCLEAN) { + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); + } + goto out; + } + + is_idx = -1; + list_for_each_entry(snod, &sleb->nodes, list) { + int found, level = 0; + + cond_resched(); + + if (is_idx == -1) + is_idx = (snod->type == UBIFS_IDX_NODE) ? 
1 : 0; + + if (is_idx && snod->type != UBIFS_IDX_NODE) { + ubifs_err("indexing node in data LEB %d:%d", + lnum, snod->offs); + goto out_destroy; + } + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_idx_node *idx = snod->node; + + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + } + + found = ubifs_tnc_has_node(c, &snod->key, level, lnum, + snod->offs, is_idx); + if (found) { + if (found < 0) + goto out_destroy; + used += ALIGN(snod->len, 8); + } + } + + free = c->leb_size - sleb->endpt; + dirty = sleb->endpt - used; + + if (free > c->leb_size || free < 0 || dirty > c->leb_size || + dirty < 0) { + ubifs_err("bad calculated accounting for LEB %d: " + "free %d, dirty %d", lnum, free, dirty); + goto out_destroy; + } + + if (lp->free + lp->dirty == c->leb_size && + free + dirty == c->leb_size) + if ((is_idx && !(lp->flags & LPROPS_INDEX)) || + (!is_idx && free == c->leb_size) || + lp->free == c->leb_size) { + /* + * Empty or freeable LEBs could contain index + * nodes from an uncompleted commit due to an + * unclean unmount. Or they could be empty for + * the same reason. Or it may simply not have been + * unmapped. + */ + free = lp->free; + dirty = lp->dirty; + is_idx = 0; + } + + if (is_idx && lp->free + lp->dirty == free + dirty && + lnum != c->ihead_lnum) { + /* + * After an unclean unmount, an index LEB could have a different + * amount of free space than the value recorded by lprops. That + * is because the in-the-gaps method may use free space or + * create free space (as a side-effect of using ubi_leb_change + * and not writing the whole LEB). The incorrect free space + * value is not a problem because the index is only ever + * allocated empty LEBs, so there will never be an attempt to + * write to the free space at the end of an index LEB - except + * by the in-the-gaps method for which it is not a problem. + */ + free = lp->free; + dirty = lp->dirty; + } + + if (lp->free != free || lp->dirty != dirty) + goto out_print; + + if (is_idx && !(lp->flags & LPROPS_INDEX)) { + if (free == c->leb_size) + /* Free but not unmapped LEB, it's fine */ + is_idx = 0; + else { + ubifs_err("indexing node without indexing " + "flag"); + goto out_print; + } + } + + if (!is_idx && (lp->flags & LPROPS_INDEX)) { + ubifs_err("data node with indexing flag"); + goto out_print; + } + + if (free == c->leb_size) + lst->empty_lebs += 1; + + if (is_idx) + lst->idx_lebs += 1; + + if (!(lp->flags & LPROPS_INDEX)) + lst->total_used += c->leb_size - free - dirty; + lst->total_free += free; + lst->total_dirty += dirty; + + if (!(lp->flags & LPROPS_INDEX)) { + int spc = free + dirty; + + if (spc < c->dead_wm) + lst->total_dead += spc; + else + lst->total_dark += ubifs_calc_dark(c, spc); + } + + ubifs_scan_destroy(sleb); + vfree(buf); + return LPT_SCAN_CONTINUE; + +out_print: + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " + "should be free %d, dirty %d", + lnum, lp->free, lp->dirty, lp->flags, free, dirty); + dbg_dump_leb(c, lnum); +out_destroy: + ubifs_scan_destroy(sleb); + ret = -EINVAL; +out: + vfree(buf); + return ret; +} + +/** + * dbg_check_lprops - check all LEB properties. + * @c: UBIFS file-system description object + * + * This function checks all LEB properties and makes sure they are all correct. + * It returns zero if everything is fine, %-EINVAL if there is an inconsistency + * and other negative error codes in case of other errors. 
This function is + * called while the file system is locked (because of commit start), so no + * additional locking is required. Note that locking the LPT mutex would cause + * a circular lock dependency with the TNC mutex. + */ +int dbg_check_lprops(struct ubifs_info *c) +{ + int i, err; + struct ubifs_lp_stats lst; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* + * As we are going to scan the media, the write buffers have to be + * synchronized. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + memset(&lst, 0, sizeof(struct ubifs_lp_stats)); + err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, + (ubifs_lpt_scan_callback)scan_check_cb, + &lst); + if (err && err != -ENOSPC) + goto out; + + if (lst.empty_lebs != c->lst.empty_lebs || + lst.idx_lebs != c->lst.idx_lebs || + lst.total_free != c->lst.total_free || + lst.total_dirty != c->lst.total_dirty || + lst.total_used != c->lst.total_used) { + ubifs_err("bad overall accounting"); + ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + lst.empty_lebs, lst.idx_lebs, lst.total_free, + lst.total_dirty, lst.total_used); + ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, + c->lst.total_dirty, c->lst.total_used); + err = -EINVAL; + goto out; + } + + if (lst.total_dead != c->lst.total_dead || + lst.total_dark != c->lst.total_dark) { + ubifs_err("bad dead/dark space accounting"); + ubifs_err("calculated: total_dead %lld, total_dark %lld", + lst.total_dead, lst.total_dark); + ubifs_err("read from lprops: total_dead %lld, total_dark %lld", + c->lst.total_dead, c->lst.total_dark); + err = -EINVAL; + goto out; + } + + err = dbg_check_cats(c); +out: + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c new file mode 100644 index 00000000..ef5155e1 --- /dev/null +++ b/fs/ubifs/lpt.c @@ -0,0 +1,2277 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the LEB properties tree (LPT) area. The LPT area + * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and + * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits + * between the log and the orphan area. + * + * The LPT area is like a miniature self-contained file system. It is required + * that it never runs out of space, is fast to access and update, and scales + * logarithmically. The LEB properties tree is implemented as a wandering tree + * much like the TNC, and the LPT area has its own garbage collection. 
+ * + * The LPT has two slightly different forms called the "small model" and the + * "big model". The small model is used when the entire LEB properties table + * can be written into a single eraseblock. In that case, garbage collection + * consists of just writing the whole table, which therefore makes all other + * eraseblocks reusable. In the case of the big model, dirty eraseblocks are + * selected for garbage collection, which consists of marking the clean nodes in + * that LEB as dirty, and then only the dirty nodes are written out. Also, in + * the case of the big model, a table of LEB numbers is saved so that the entire + * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first + * mounted. + */ + +#include "ubifs.h" +#include <linux/crc16.h> +#include <linux/math64.h> +#include <linux/slab.h> + +/** + * do_calc_lpt_geom - calculate sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * Calculate the sizes of LPT bit fields, nodes, and tree, based on the + * properties of the flash and whether LPT is "big" (c->big_lpt). + */ +static void do_calc_lpt_geom(struct ubifs_info *c) +{ + int i, n, bits, per_leb_wastage, max_pnode_cnt; + long long sz, tot_wastage; + + n = c->main_lebs + c->max_leb_cnt - c->leb_cnt; + max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + + c->lpt_hght = 1; + n = UBIFS_LPT_FANOUT; + while (n < max_pnode_cnt) { + c->lpt_hght += 1; + n <<= UBIFS_LPT_FANOUT_SHIFT; + } + + c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + + n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT); + c->nnode_cnt = n; + for (i = 1; i < c->lpt_hght; i++) { + n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + c->nnode_cnt += n; + } + + c->space_bits = fls(c->leb_size) - 3; + c->lpt_lnum_bits = fls(c->lpt_lebs); + c->lpt_offs_bits = fls(c->leb_size - 1); + c->lpt_spc_bits = fls(c->leb_size); + + n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT); + c->pcnt_bits = fls(n - 1); + + c->lnum_bits = fls(c->max_leb_cnt - 1); + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT; + c->pnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT; + c->nnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lpt_lebs * c->lpt_spc_bits * 2; + c->ltab_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lnum_bits * c->lsave_cnt; + c->lsave_sz = (bits + 7) / 8; + + /* Calculate the minimum LPT size */ + c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + c->lpt_sz += c->ltab_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; + + /* Add wastage */ + sz = c->lpt_sz; + per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz); + sz += per_leb_wastage; + tot_wastage = per_leb_wastage; + while (sz > c->leb_size) { + sz += per_leb_wastage; + sz -= c->leb_size; + tot_wastage += per_leb_wastage; + } + tot_wastage += ALIGN(sz, c->min_io_size) - sz; + c->lpt_sz += tot_wastage; +} + +/** + * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure.
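+ *
+ * As a worked example of the field sizing done by 'do_calc_lpt_geom()' above:
+ * with a 128KiB LEB, space_bits = fls(131072) - 3 = 15. That is sufficient
+ * because free and dirty space are stored on the media in 8-byte multiples,
+ * so the largest packed value is 131072 / 8 = 16384, which fits in 15 bits.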
+ */ +int ubifs_calc_lpt_geom(struct ubifs_info *c) +{ + int lebs_needed; + long long sz; + + do_calc_lpt_geom(c); + + /* Verify that lpt_lebs is big enough */ + sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); + if (lebs_needed > c->lpt_lebs) { + ubifs_err("too few LPT LEBs"); + return -EINVAL; + } + + /* Verify that ltab fits in a single LEB (since ltab is a single node */ + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + + c->check_lpt_free = c->big_lpt; + return 0; +} + +/** + * calc_dflt_lpt_geom - calculate default LPT geometry. + * @c: the UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @big_lpt: whether the LPT area is "big" is returned here + * + * The size of the LPT area depends on parameters that themselves are dependent + * on the size of the LPT area. This function, successively recalculates the LPT + * area geometry until the parameters and resultant geometry are consistent. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, + int *big_lpt) +{ + int i, lebs_needed; + long long sz; + + /* Start by assuming the minimum number of LPT LEBs */ + c->lpt_lebs = UBIFS_MIN_LPT_LEBS; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + + /* And assume we will use the small LPT model */ + c->big_lpt = 0; + + /* + * Calculate the geometry based on assumptions above and then see if it + * makes sense + */ + do_calc_lpt_geom(c); + + /* Small LPT model must have lpt_sz < leb_size */ + if (c->lpt_sz > c->leb_size) { + /* Nope, so try again using big LPT model */ + c->big_lpt = 1; + do_calc_lpt_geom(c); + } + + /* Now check there are enough LPT LEBs */ + for (i = 0; i < 64 ; i++) { + sz = c->lpt_sz * 4; /* Allow 4 times the size */ + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); + if (lebs_needed > c->lpt_lebs) { + /* Not enough LPT LEBs so try again with more */ + c->lpt_lebs = lebs_needed; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + do_calc_lpt_geom(c); + continue; + } + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + *main_lebs = c->main_lebs; + *big_lpt = c->big_lpt; + return 0; + } + return -EINVAL; +} + +/** + * pack_bits - pack bit fields end-to-end. 
+ * @addr: address at which to pack (passed and next address returned) + * @pos: bit position at which to pack (passed and next position returned) + * @val: value to pack + * @nrbits: number of bits of value to pack (1-32) + */ +static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits) +{ + uint8_t *p = *addr; + int b = *pos; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + ubifs_assert((val >> nrbits) == 0 || nrbits == 32); + if (b) { + *p |= ((uint8_t)val) << b; + nrbits += b; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= (8 - b)); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 32) + *++p = (uint8_t)(val >>= 8); + } + } + } + } else { + *p = (uint8_t)val; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) + *++p = (uint8_t)(val >>= 8); + } + } + } + b = nrbits & 7; + if (b == 0) + p++; + *addr = p; + *pos = b; +} + +/** + * ubifs_unpack_bits - unpack bit fields. + * @addr: address at which to unpack (passed and next address returned) + * @pos: bit position at which to unpack (passed and next position returned) + * @nrbits: number of bits of value to unpack (1-32) + * + * This functions returns the value unpacked. + */ +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) +{ + const int k = 32 - nrbits; + uint8_t *p = *addr; + int b = *pos; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + if (b) { + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } + val <<= (8 - b); + val |= *p >> b; + nrbits += b; + } else { + switch (bytes) { + case 1: + val = p[0]; + break; + case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } + val <<= k; + val >>= k; + b = nrbits & 7; + p += nrbits >> 3; + *addr = p; + *pos = b; + ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); + return val; +} + +/** + * ubifs_pack_pnode - pack all the bit fields of a pnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @pnode: pnode to pack + */ +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, pnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pack_bits(&addr, &pos, pnode->lprops[i].free >> 3, + c->space_bits); + pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3, + c->space_bits); + if (pnode->lprops[i].flags & LPROPS_INDEX) + pack_bits(&addr, &pos, 1, 1); + else + pack_bits(&addr, &pos, 0, 1); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->pnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_nnode - pack all the bit fields of a nnode. 
+ * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @nnode: nnode to pack + */ +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, nnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + + if (lnum == 0) + lnum = c->lpt_last + 1; + pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits); + pack_bits(&addr, &pos, nnode->nbranch[i].offs, + c->lpt_offs_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->nnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_ltab - pack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @ltab: LPT's own lprops table to pack + */ +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lpt_lebs; i++) { + pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits); + pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->ltab_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_lsave - pack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @lsave: LPT's save table to pack + */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lsave_cnt; i++) + pack_bits(&addr, &pos, lsave[i], c->lnum_bits); + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->lsave_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number to which to add dirty space + * @dirty: amount of dirty space to add + */ +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + if (!dirty || !lnum) + return; + dbg_lp("LEB %d add %d to %d", + lnum, dirty, c->ltab[lnum - c->lpt_first].dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * set_ltab - set LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space + */ +static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d %d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty = dirty; +} + +/** + * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties. 
+ * @c: UBIFS file-system description object + * @nnode: nnode for which to add dirt + */ +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *np = nnode->parent; + + if (np) + ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum, + c->nnode_sz); + else { + ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz); + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + } +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * calc_nnode_num - calculate nnode number. + * @row: the row in the tree (root is zero) + * @col: the column in the row (leftmost is zero) + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number for the nnode at @row + * and @col. + */ +static int calc_nnode_num(int row, int col) +{ + int num, bits; + + num = 1; + while (row--) { + bits = (col & (UBIFS_LPT_FANOUT - 1)); + col >>= UBIFS_LPT_FANOUT_SHIFT; + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= bits; + } + return num; +} + +/** + * calc_nnode_num_from_parent - calculate nnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_nnode_num_from_parent(const struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int num, shft; + + if (!parent) + return 1; + shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT; + num = parent->num ^ (1 << shft); + num |= (UBIFS_LPT_FANOUT + iip) << shft; + return num; +} + +/** + * calc_pnode_num_from_parent - calculate pnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The pnode number is a number that uniquely identifies a pnode and can be used + * easily to traverse the tree from the root to that pnode. + * + * This function calculates and returns the pnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_pnode_num_from_parent(const struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; + + for (i = 0; i < n; i++) { + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= pnum & (UBIFS_LPT_FANOUT - 1); + pnum >>= UBIFS_LPT_FANOUT_SHIFT; + } + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= iip; + return num; +} + +/** + * ubifs_create_dflt_lpt - create default LPT. + * @c: UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @lpt_first: LEB number of first LPT LEB + * @lpt_lebs: number of LEBs for LPT is passed and returned here + * @big_lpt: use big LPT model is passed and returned here + * + * This function returns %0 on success and a negative error code on failure. 
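+ *
+ * The default LPT is written out LEB by LEB: first all the pnodes, then the
+ * nnodes one level at a time up to the root nnode, then (for the big model)
+ * the lsave table, and finally the LPT's own lprops table (ltab).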
+ */ +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt) +{ + int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; + int blnum, boffs, bsz, bcnt; + struct ubifs_pnode *pnode = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = NULL, *p; + struct ubifs_lpt_lprops *ltab = NULL; + int *lsave = NULL; + + err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); + if (err) + return err; + *lpt_lebs = c->lpt_lebs; + + /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */ + c->lpt_first = lpt_first; + /* Needed by 'set_ltab()' */ + c->lpt_last = lpt_first + c->lpt_lebs - 1; + /* Needed by 'ubifs_pack_lsave()' */ + c->main_first = c->leb_cnt - *main_lebs; + + lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); + buf = vmalloc(c->leb_size); + ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!pnode || !nnode || !buf || !ltab || !lsave) { + err = -ENOMEM; + goto out; + } + + ubifs_assert(!c->ltab); + c->ltab = ltab; /* Needed by set_ltab */ + + /* Initialize LPT's own lprops */ + for (i = 0; i < c->lpt_lebs; i++) { + ltab[i].free = c->leb_size; + ltab[i].dirty = 0; + ltab[i].tgc = 0; + ltab[i].cmt = 0; + } + + lnum = lpt_first; + p = buf; + /* Number of leaf nodes (pnodes) */ + cnt = c->pnode_cnt; + + /* + * The first pnode contains the LEB properties for the LEBs that contain + * the root inode node and the root index node of the index tree. + */ + node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8); + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[0].free = c->leb_size - iopos; + pnode->lprops[0].dirty = iopos - node_sz; + pnode->lprops[0].flags = LPROPS_INDEX; + + node_sz = UBIFS_INO_NODE_SZ; + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[1].free = c->leb_size - iopos; + pnode->lprops[1].dirty = iopos - node_sz; + + for (i = 2; i < UBIFS_LPT_FANOUT; i++) + pnode->lprops[i].free = c->leb_size; + + /* Add first pnode */ + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len = c->pnode_sz; + pnode->num += 1; + + /* Reset pnode values for remaining pnodes */ + pnode->lprops[0].free = c->leb_size; + pnode->lprops[0].dirty = 0; + pnode->lprops[0].flags = 0; + + pnode->lprops[1].free = c->leb_size; + pnode->lprops[1].dirty = 0; + + /* + * To calculate the internal node branches, we keep information about + * the level below. + */ + blnum = lnum; /* LEB number of level below */ + boffs = 0; /* Offset of level below */ + bcnt = cnt; /* Number of nodes in level below */ + bsz = c->pnode_sz; /* Size of nodes in level below */ + + /* Add all remaining pnodes */ + for (i = 1; i < cnt; i++) { + if (len + c->pnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len += c->pnode_sz; + /* + * pnodes are simply numbered left to right starting at zero, + * which means the pnode number can be used easily to traverse + * down the tree to the corresponding pnode. 
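+ * For example, pnode number N describes the UBIFS_LPT_FANOUT consecutive
+ * main area LEBs starting at c->main_first + N * UBIFS_LPT_FANOUT (see
+ * 'set_pnode_lnum()').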
+ */ + pnode->num += 1; + } + + row = 0; + for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT) + row += 1; + /* Add all nnodes, one level at a time */ + while (1) { + /* Number of internal nodes (nnodes) at next level */ + cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + if (len + c->nnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, + alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) { + c->lpt_lnum = lnum; + c->lpt_offs = len; + } + /* Set branches to the level below */ + for (j = 0; j < UBIFS_LPT_FANOUT; j++) { + if (bcnt) { + if (boffs + bsz > c->leb_size) { + blnum += 1; + boffs = 0; + } + nnode->nbranch[j].lnum = blnum; + nnode->nbranch[j].offs = boffs; + boffs += bsz; + bcnt--; + } else { + nnode->nbranch[j].lnum = 0; + nnode->nbranch[j].offs = 0; + } + } + nnode->num = calc_nnode_num(row, i); + ubifs_pack_nnode(c, p, nnode); + p += c->nnode_sz; + len += c->nnode_sz; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) + break; + /* Update the information about the level below */ + bcnt = cnt; + bsz = c->nnode_sz; + row -= 1; + } + + if (*big_lpt) { + /* Need to add LPT's save table */ + if (len + c->lsave_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->lsave_lnum = lnum; + c->lsave_offs = len; + + for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++) + lsave[i] = c->main_first + i; + for (; i < c->lsave_cnt; i++) + lsave[i] = c->main_first; + + ubifs_pack_lsave(c, p, lsave); + p += c->lsave_sz; + len += c->lsave_sz; + } + + /* Need to add LPT's own LEB properties table */ + if (len + c->ltab_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->ltab_lnum = lnum; + c->ltab_offs = len; + + /* Update ltab before packing it */ + len += c->ltab_sz; + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + + ubifs_pack_ltab(c, p, ltab); + p += c->ltab_sz; + + /* Write remaining buffer */ + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(len, c->min_io_size); + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, 
c->lsave_offs); +out: + c->ltab = NULL; + kfree(lsave); + vfree(ltab); + vfree(buf); + kfree(nnode); + kfree(pnode); + return err; +} + +/** + * update_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * When a pnode is loaded into memory, the LEB properties it contains are added, + * by this function, to the LEB category lists and heaps. + */ +static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK; + int lnum = pnode->lprops[i].lnum; + + if (!lnum) + return; + ubifs_add_to_cat(c, &pnode->lprops[i], cat); + } +} + +/** + * replace_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @old_pnode: pnode copied + * @new_pnode: pnode copy + * + * During commit it is sometimes necessary to copy a pnode + * (see dirty_cow_pnode). When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, + struct ubifs_pnode *new_pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (!new_pnode->lprops[i].lnum) + return; + ubifs_replace_cat(c, &old_pnode->lprops[i], + &new_pnode->lprops[i]); + } +} + +/** + * check_lpt_crc - check LPT node crc is correct. + * @c: UBIFS file-system description object + * @buf: buffer containing node + * @len: length of node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_crc(void *buf, int len) +{ + int pos = 0; + uint8_t *addr = buf; + uint16_t crc, calc_crc; + + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) { + ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, + calc_crc); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * check_lpt_type - check LPT node type is correct. + * @c: UBIFS file-system description object + * @addr: address of type bit field is passed and returned updated here + * @pos: position of type bit field is passed and returned updated here + * @type: expected type + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_type(uint8_t **addr, int *pos, int type) +{ + int node_type; + + node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); + if (node_type != type) { + ubifs_err("invalid type (%d) in LPT node type %d", node_type, + type); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * unpack_pnode - unpack a pnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed pnode to unpack + * @pnode: pnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. 
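+ *
+ * Note that free and dirty space are stored in the packed pnode in 8-byte
+ * units, which is why the unpacked values are shifted left by 3 below; a
+ * packed value of 16384 therefore unpacks to 131072 bytes.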
+ */ +static int unpack_pnode(const struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE); + if (err) + return err; + if (c->big_lpt) + pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->free <<= 3; + lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->dirty <<= 3; + + if (ubifs_unpack_bits(&addr, &pos, 1)) + lprops->flags = LPROPS_INDEX; + else + lprops->flags = 0; + lprops->flags |= ubifs_categorize_lprops(c, lprops); + } + err = check_lpt_crc(buf, c->pnode_sz); + return err; +} + +/** + * ubifs_unpack_nnode - unpack a nnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed nnode to unpack + * @nnode: nnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE); + if (err) + return err; + if (c->big_lpt) + nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum; + + lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) + + c->lpt_first; + if (lnum == c->lpt_last + 1) + lnum = 0; + nnode->nbranch[i].lnum = lnum; + nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, + c->lpt_offs_bits); + } + err = check_lpt_crc(buf, c->nnode_sz); + return err; +} + +/** + * unpack_ltab - unpack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_ltab(const struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB); + if (err) + return err; + for (i = 0; i < c->lpt_lebs; i++) { + int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + + if (free < 0 || free > c->leb_size || dirty < 0 || + dirty > c->leb_size || free + dirty > c->leb_size) + return -EINVAL; + + c->ltab[i].free = free; + c->ltab[i].dirty = dirty; + c->ltab[i].tgc = 0; + c->ltab[i].cmt = 0; + } + err = check_lpt_crc(buf, c->ltab_sz); + return err; +} + +/** + * unpack_lsave - unpack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_lsave(const struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE); + if (err) + return err; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits); + + if (lnum < c->main_first || lnum >= c->leb_cnt) + return -EINVAL; + c->lsave[i] = lnum; + } + err = check_lpt_crc(buf, c->lsave_sz); + return err; +} + +/** + * validate_nnode - validate a nnode. 
+ * @c: UBIFS file-system description object + * @nnode: nnode to validate + * @parent: parent nnode (or NULL for the root nnode) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode, + struct ubifs_nnode *parent, int iip) +{ + int i, lvl, max_offs; + + if (c->big_lpt) { + int num = calc_nnode_num_from_parent(c, parent, iip); + + if (nnode->num != num) + return -EINVAL; + } + lvl = parent ? parent->level - 1 : c->lpt_hght; + if (lvl < 1) + return -EINVAL; + if (lvl == 1) + max_offs = c->leb_size - c->pnode_sz; + else + max_offs = c->leb_size - c->nnode_sz; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + int offs = nnode->nbranch[i].offs; + + if (lnum == 0) { + if (offs != 0) + return -EINVAL; + continue; + } + if (lnum < c->lpt_first || lnum > c->lpt_last) + return -EINVAL; + if (offs < 0 || offs > max_offs) + return -EINVAL; + } + return 0; +} + +/** + * validate_pnode - validate a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to validate + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + if (c->big_lpt) { + int num = calc_pnode_num_from_parent(c, parent, iip); + + if (pnode->num != num) + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int free = pnode->lprops[i].free; + int dirty = pnode->lprops[i].dirty; + + if (free < 0 || free > c->leb_size || free % c->min_io_size || + (free & 7)) + return -EINVAL; + if (dirty < 0 || dirty > c->leb_size || (dirty & 7)) + return -EINVAL; + if (dirty + free > c->leb_size) + return -EINVAL; + } + return 0; +} + +/** + * set_pnode_lnum - set LEB numbers on a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to update + * + * This function calculates the LEB numbers for the LEB properties it contains + * based on the pnode number. + */ +static void set_pnode_lnum(const struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + int i, lnum; + + lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (lnum >= c->leb_cnt) + return; + pnode->lprops[i].lnum = lnum++; + } +} + +/** + * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + if (parent) { + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + } else { + lnum = c->lpt_lnum; + offs = c->lpt_offs; + } + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. 
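+ * The nnode was allocated with kzalloc(), so all of its branch LEB
+ * numbers are already zero, which is exactly how unwritten children
+ * are represented.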
+ */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + if (err) + goto out; + err = ubifs_unpack_nnode(c, buf, nnode); + if (err) + goto out; + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + if (parent) { + branch->nnode = nnode; + nnode->level = parent->level - 1; + } else { + c->nroot = nnode; + nnode->level = c->lpt_hght; + } + nnode->parent = parent; + nnode->iip = iip; + return 0; + +out: + ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + kfree(nnode); + return err; +} + +/** + * read_pnode - read a pnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (!pnode) + return -ENOMEM; + + if (lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + if (err) + goto out; + err = unpack_pnode(c, buf, pnode); + if (err) + goto out; + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + branch->pnode = pnode; + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + c->pnodes_have += 1; + return 0; + +out: + ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); + dbg_dump_pnode(c, pnode, parent, iip); + dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + kfree(pnode); + return err; +} + +/** + * read_ltab - read LPT's own lprops table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_ltab(struct ubifs_info *c) +{ + int err; + void *buf; + + buf = vmalloc(c->ltab_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + if (err) + goto out; + err = unpack_ltab(c, buf); +out: + vfree(buf); + return err; +} + +/** + * read_lsave - read LPT's save table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int read_lsave(struct ubifs_info *c) +{ + int err, i; + void *buf; + + buf = vmalloc(c->lsave_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + if (err) + goto out; + err = unpack_lsave(c, buf); + if (err) + goto out; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = c->lsave[i]; + struct ubifs_lprops *lprops; + + /* + * Due to automatic resizing, the values in the lsave table + * could be beyond the volume size - just ignore them. + */ + if (lnum >= c->leb_cnt) + continue; + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } +out: + vfree(buf); + return err; +} + +/** + * ubifs_get_nnode - get a nnode. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) + return nnode; + err = ubifs_read_nnode(c, parent, iip); + if (err) + return ERR_PTR(err); + return branch->nnode; +} + +/** + * ubifs_get_pnode - get a pnode. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) + return pnode; + err = read_pnode(c, parent, iip); + if (err) + return ERR_PTR(err); + update_cats(c, branch->pnode); + return branch->pnode; +} + +/** + * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + return &pnode->lprops[iip]; +} + +/** + * dirty_cow_nnode - ensure a nnode is not being committed. + * @c: UBIFS file-system description object + * @nnode: nnode to check + * + * Returns dirtied nnode on success or negative error code on failure. 
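/*
 * Illustrative sketch, not part of the patch: the descent in
 * ubifs_lpt_lookup() treats (lnum - main_first) as a base-fanout number and
 * peels one digit per tree level, most significant digit first.  The index
 * arithmetic alone, without the I/O or caching; iips must have room for
 * lpt_hght entries:
 */
static int lpt_path_indices(int lnum, int main_first, int lpt_hght,
			    int fanout_shift, int *iips)
{
	int fanout = 1 << fanout_shift;
	int i = lnum - main_first;
	int shft = lpt_hght * fanout_shift;
	int h, n = 0;

	/* one index per nnode level, most significant digit first */
	for (h = 1; h < lpt_hght; h++) {
		iips[n++] = (i >> shft) & (fanout - 1);
		shft -= fanout_shift;
	}
	/* index of the pnode branch in the lowest nnode */
	iips[n++] = (i >> shft) & (fanout - 1);
	/* returned value: the lprops slot inside that pnode */
	return i & (fanout - 1);
}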
+ */ +static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *n; + int i; + + if (!test_bit(COW_CNODE, &nnode->flags)) { + /* nnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + } + return nnode; + } + + /* nnode is being committed, so copy it */ + n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (unlikely(!n)) + return ERR_PTR(-ENOMEM); + + memcpy(n, nnode, sizeof(struct ubifs_nnode)); + n->cnext = NULL; + __set_bit(DIRTY_CNODE, &n->flags); + __clear_bit(COW_CNODE, &n->flags); + + /* The children now have new parent */ + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_nbranch *branch = &n->nbranch[i]; + + if (branch->cnode) + branch->cnode->parent = n; + } + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags)); + __set_bit(OBSOLETE_CNODE, &nnode->flags); + + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + if (nnode->parent) + nnode->parent->nbranch[n->iip].nnode = n; + else + c->nroot = n; + return n; +} + +/** + * dirty_cow_pnode - ensure a pnode is not being committed. + * @c: UBIFS file-system description object + * @pnode: pnode to check + * + * Returns dirtied pnode on success or negative error code on failure. + */ +static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_pnode *p; + + if (!test_bit(COW_CNODE, &pnode->flags)) { + /* pnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + } + return pnode; + } + + /* pnode is being committed, so copy it */ + p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (unlikely(!p)) + return ERR_PTR(-ENOMEM); + + memcpy(p, pnode, sizeof(struct ubifs_pnode)); + p->cnext = NULL; + __set_bit(DIRTY_CNODE, &p->flags); + __clear_bit(COW_CNODE, &p->flags); + replace_cats(c, pnode, p); + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags)); + __set_bit(OBSOLETE_CNODE, &pnode->flags); + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + pnode->parent->nbranch[p->iip].pnode = p; + return p; +} + +/** + * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. 
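/*
 * Illustrative sketch, not part of the patch: the copy-on-write rule that
 * dirty_cow_nnode() and dirty_cow_pnode() implement, reduced to a toy node
 * type.  "Being committed" corresponds to the COW_CNODE bit; such nodes must
 * not be modified, so a private copy is dirtied instead and the original is
 * left for the commit and marked obsolete.  Names below are hypothetical.
 */
#include <stdlib.h>
#include <string.h>

struct toy_node {
	int cow;       /* node is frozen for the running commit */
	int dirty;     /* node differs from what is on the media */
	int obsolete;  /* a newer copy exists; free after the commit */
};

static struct toy_node *toy_dirty_cow(struct toy_node *node)
{
	struct toy_node *copy;

	if (!node->cow) {
		node->dirty = 1;        /* safe to dirty in place */
		return node;
	}
	copy = malloc(sizeof(*copy));   /* the real code uses GFP_NOFS kmalloc */
	if (!copy)
		return NULL;
	memcpy(copy, node, sizeof(*copy));
	copy->cow = 0;
	copy->dirty = 1;
	node->obsolete = 1;             /* reclaimed by free_obsolete_cnodes() */
	return copy;                    /* caller re-points the parent here */
}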
+ */ +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + pnode = dirty_cow_pnode(c, pnode); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags)); + return &pnode->lprops[iip]; +} + +/** + * lpt_init_rd - initialize the LPT for reading. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_rd(struct ubifs_info *c) +{ + int err, i; + + c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab) + return -ENOMEM; + + i = max_t(int, c->nnode_sz, c->pnode_sz); + c->lpt_nod_buf = kmalloc(i, GFP_KERNEL); + if (!c->lpt_nod_buf) + return -ENOMEM; + + for (i = 0; i < LPROPS_HEAP_CNT; i++) { + c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, + GFP_KERNEL); + if (!c->lpt_heap[i].arr) + return -ENOMEM; + c->lpt_heap[i].cnt = 0; + c->lpt_heap[i].max_cnt = LPT_HEAP_SZ; + } + + c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL); + if (!c->dirty_idx.arr) + return -ENOMEM; + c->dirty_idx.cnt = 0; + c->dirty_idx.max_cnt = LPT_HEAP_SZ; + + err = read_ltab(c); + if (err) + return err; + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; +} + +/** + * lpt_init_wr - initialize the LPT for writing. + * @c: UBIFS file-system description object + * + * 'lpt_init_rd()' must have been called already. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int lpt_init_wr(struct ubifs_info *c) +{ + int err, i; + + c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab_cmt) + return -ENOMEM; + + c->lpt_buf = vmalloc(c->leb_size); + if (!c->lpt_buf) + return -ENOMEM; + + if (c->big_lpt) { + c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS); + if (!c->lsave) + return -ENOMEM; + err = read_lsave(c); + if (err) + return err; + } + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].free == c->leb_size) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + } + + return 0; +} + +/** + * ubifs_lpt_init - initialize the LPT. + * @c: UBIFS file-system description object + * @rd: whether to initialize lpt for reading + * @wr: whether to initialize lpt for writing + * + * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true + * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is + * true. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) +{ + int err; + + if (rd) { + err = lpt_init_rd(c); + if (err) + return err; + } + + if (wr) { + err = lpt_init_wr(c); + if (err) + return err; + } + + return 0; +} + +/** + * struct lpt_scan_node - somewhere to put nodes while we scan LPT. + * @nnode: where to keep a nnode + * @pnode: where to keep a pnode + * @cnode: where to keep a cnode + * @in_tree: is the node in the tree in memory + * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in + * the tree + * @ptr.pnode: ditto for pnode + * @ptr.cnode: ditto for cnode + */ +struct lpt_scan_node { + union { + struct ubifs_nnode nnode; + struct ubifs_pnode pnode; + struct ubifs_cnode cnode; + }; + int in_tree; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + } ptr; +}; + +/** + * scan_get_nnode - for the scan, get a nnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the nnode + * @parent: parent of the nnode + * @iip: index in parent of the nnode + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) { + path->in_tree = 1; + path->ptr.nnode = nnode; + return nnode; + } + nnode = &path->nnode; + path->in_tree = 0; + path->ptr.nnode = nnode; + memset(nnode, 0, sizeof(struct ubifs_nnode)); + if (branch->lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. 
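/*
 * Illustrative sketch, not part of the patch: how the rd/wr arguments of
 * ubifs_lpt_init() map onto the three mount transitions described in its
 * kernel-doc above.  Assumes the declarations from this file and ubifs.h;
 * the surrounding error handling is omitted.
 */
static int example_mount_rw(struct ubifs_info *c)
{
	return ubifs_lpt_init(c, 1, 1);   /* fresh read-write mount */
}

static int example_mount_ro(struct ubifs_info *c)
{
	return ubifs_lpt_init(c, 1, 0);   /* read-only mount: no write state */
}

static int example_remount_rw(struct ubifs_info *c)
{
	return ubifs_lpt_init(c, 0, 1);   /* ro -> rw: read state already set up */
}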
+ */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->nnode_sz); + if (err) + return ERR_PTR(err); + err = ubifs_unpack_nnode(c, buf, nnode); + if (err) + return ERR_PTR(err); + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + nnode->level = parent->level - 1; + nnode->parent = parent; + nnode->iip = iip; + return nnode; +} + +/** + * scan_get_pnode - for the scan, get a pnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the pnode + * @parent: parent of the pnode + * @iip: index in parent of the pnode + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) { + path->in_tree = 1; + path->ptr.pnode = pnode; + return pnode; + } + pnode = &path->pnode; + path->in_tree = 0; + path->ptr.pnode = pnode; + memset(pnode, 0, sizeof(struct ubifs_pnode)); + if (branch->lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + ubifs_assert(branch->lnum >= c->lpt_first && + branch->lnum <= c->lpt_last); + ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->pnode_sz); + if (err) + return ERR_PTR(err); + err = unpack_pnode(c, buf, pnode); + if (err) + return ERR_PTR(err); + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + return pnode; +} + +/** + * ubifs_lpt_scan_nolock - scan the LPT. + * @c: the UBIFS file-system description object + * @start_lnum: LEB number from which to start scanning + * @end_lnum: LEB number at which to stop scanning + * @scan_cb: callback function called for each lprops + * @data: data to be passed to the callback function + * + * This function returns %0 on success and a negative error code on failure. 
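/*
 * Illustrative sketch, not part of the patch: a hypothetical callback for
 * ubifs_lpt_scan_nolock().  The signature is inferred from the call site in
 * the scan loop (c, lprops, in_tree, data); the return value is either a
 * negative error or a combination of the LPT_SCAN_ADD / LPT_SCAN_STOP flags
 * that the loop tests, with no flags meaning "keep scanning".
 */
struct scan_want {
	int min_free;      /* how much free space the caller wants */
	int found_lnum;    /* filled in when a suitable LEB is found */
};

static int want_free_space_cb(struct ubifs_info *c,
			      struct ubifs_lprops *lprops,
			      int in_tree, void *data)
{
	struct scan_want *want = data;

	if (lprops->free < want->min_free)
		return 0;                        /* no flags: keep scanning */
	want->found_lnum = lprops->lnum;
	/* keep this pnode in memory and stop the scan */
	return LPT_SCAN_ADD | LPT_SCAN_STOP;
}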
+ */ +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data) +{ + int err = 0, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct lpt_scan_node *path; + + if (start_lnum == -1) { + start_lnum = end_lnum + 1; + if (start_lnum >= c->leb_cnt) + start_lnum = c->main_first; + } + + ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt); + ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt); + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + + path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1), + GFP_NOFS); + if (!path) + return -ENOMEM; + + path[0].ptr.nnode = c->nroot; + path[0].in_tree = 1; +again: + /* Descend to the pnode containing start_lnum */ + nnode = c->nroot; + i = start_lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = (i & (UBIFS_LPT_FANOUT - 1)); + + /* Loop for each lprops */ + while (1) { + struct ubifs_lprops *lprops = &pnode->lprops[iip]; + int ret, lnum = lprops->lnum; + + ret = scan_cb(c, lprops, path[h].in_tree, data); + if (ret < 0) { + err = ret; + goto out; + } + if (ret & LPT_SCAN_ADD) { + /* Add all the nodes in path to the tree in memory */ + for (h = 1; h < c->lpt_hght; h++) { + const size_t sz = sizeof(struct ubifs_nnode); + struct ubifs_nnode *parent; + + if (path[h].in_tree) + continue; + nnode = kmalloc(sz, GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + memcpy(nnode, &path[h].nnode, sz); + parent = nnode->parent; + parent->nbranch[nnode->iip].nnode = nnode; + path[h].ptr.nnode = nnode; + path[h].in_tree = 1; + path[h + 1].cnode.parent = nnode; + } + if (path[h].in_tree) + ubifs_ensure_cat(c, lprops); + else { + const size_t sz = sizeof(struct ubifs_pnode); + struct ubifs_nnode *parent; + + pnode = kmalloc(sz, GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + memcpy(pnode, &path[h].pnode, sz); + parent = pnode->parent; + parent->nbranch[pnode->iip].pnode = pnode; + path[h].ptr.pnode = pnode; + path[h].in_tree = 1; + update_cats(c, pnode); + c->pnodes_have += 1; + } + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *) + c->nroot, 0, 0); + if (err) + goto out; + err = dbg_check_cats(c); + if (err) + goto out; + } + if (ret & LPT_SCAN_STOP) { + err = 0; + break; + } + /* Get the next lprops */ + if (lnum == end_lnum) { + /* + * We got to the end without finding what we were + * looking for + */ + err = -ENOSPC; + goto out; + } + if (lnum + 1 >= c->leb_cnt) { + /* Wrap-around to the beginning */ + start_lnum = c->main_first; + goto again; + } + if (iip + 1 < UBIFS_LPT_FANOUT) { + /* Next lprops is in the same pnode */ + iip += 1; + continue; + } + /* We need to get the next pnode. 
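/*
 * Illustrative sketch, not part of the patch: the circular iteration over
 * main-area LEBs performed by ubifs_lpt_scan_nolock().  Scanning starts at
 * start_lnum (or just after end_lnum when the caller passes -1), walks
 * upward, wraps back to the first main LEB at the end of the volume, and
 * fails if it comes full circle to end_lnum without a hit.
 */
static int toy_next_scan_lnum(int lnum, int end_lnum, int main_first,
			      int leb_cnt)
{
	if (lnum == end_lnum)
		return -1;                /* full circle: real code returns -ENOSPC */
	if (lnum + 1 >= leb_cnt)
		return main_first;        /* wrap around to the start of the main area */
	return lnum + 1;
}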
Go up until we can go right */ + iip = pnode->iip; + while (1) { + h -= 1; + ubifs_assert(h >= 0); + nnode = path[h].ptr.nnode; + if (iip + 1 < UBIFS_LPT_FANOUT) + break; + iip = nnode->iip; + } + /* Go right */ + iip += 1; + /* Descend to the pnode */ + h += 1; + for (; h < c->lpt_hght; h++) { + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + iip = 0; + } + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = 0; + } +out: + kfree(path); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_chk_pnode - check a pnode. + * @c: the UBIFS file-system description object + * @pnode: pnode to check + * @col: pnode column + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + int col) +{ + int i; + + if (pnode->num != col) { + dbg_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp, *lprops = &pnode->lprops[i]; + int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i + + c->main_first; + int found, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + struct list_head *list = NULL; + + if (lnum >= c->leb_cnt) + continue; + if (lprops->lnum != lnum) { + dbg_err("bad LEB number %d expected %d", + lprops->lnum, lnum); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + if (cat != LPROPS_UNCAT) { + dbg_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); + return -EINVAL; + } + continue; + } + if (lprops->flags & LPROPS_INDEX) { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY_IDX: + case LPROPS_FRDI_IDX: + break; + default: + dbg_err("LEB %d index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } else { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY: + case LPROPS_FREE: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + break; + default: + dbg_err("LEB %d not index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } + switch (cat) { + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + } + found = 0; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (lprops->hpos < heap->cnt && + heap->arr[lprops->hpos] == lprops) + found = 1; + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_for_each_entry(lp, list, list) + if (lprops == lp) { + found = 1; + break; + } + break; + } + if (!found) { + dbg_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); + return -EINVAL; + } + switch (cat) { + case LPROPS_EMPTY: + if (lprops->free != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + if (lprops->free + lprops->dirty != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + } + } + return 0; +} + +/** + * dbg_check_lpt_nodes - check nnodes and pnodes. 
+ * @c: the UBIFS file-system description object + * @cnode: next cnode (nnode or pnode) to check + * @row: row of cnode (root is zero) + * @col: column of cnode (leftmost is zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col) +{ + struct ubifs_nnode *nnode, *nn; + struct ubifs_cnode *cn; + int num, iip = 0, err; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + while (cnode) { + ubifs_assert(row >= 0); + nnode = cnode->parent; + if (cnode->level) { + /* cnode is a nnode */ + num = calc_nnode_num(row, col); + if (cnode->num != num) { + dbg_err("nnode num %d expected %d " + "parent num %d iip %d", cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); + return -EINVAL; + } + nn = (struct ubifs_nnode *)cnode; + while (iip < UBIFS_LPT_FANOUT) { + cn = nn->nbranch[iip].cnode; + if (cn) { + /* Go down */ + row += 1; + col <<= UBIFS_LPT_FANOUT_SHIFT; + col += iip; + iip = 0; + cnode = cn; + break; + } + /* Go right */ + iip += 1; + } + if (iip < UBIFS_LPT_FANOUT) + continue; + } else { + struct ubifs_pnode *pnode; + + /* cnode is a pnode */ + pnode = (struct ubifs_pnode *)cnode; + err = dbg_chk_pnode(c, pnode, col); + if (err) + return err; + } + /* Go up and to the right */ + row -= 1; + col >>= UBIFS_LPT_FANOUT_SHIFT; + iip = cnode->iip + 1; + cnode = (struct ubifs_cnode *)nnode; + } + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c new file mode 100644 index 00000000..dfcb5748 --- /dev/null +++ b/fs/ubifs/lpt_commit.c @@ -0,0 +1,2050 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements commit-related functionality of the LEB properties + * subsystem. + */ + +#include +#include +#include "ubifs.h" + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_populate_lsave(struct ubifs_info *c); +#else +#define dbg_populate_lsave(c) 0 +#endif + +/** + * first_dirty_cnode - find first dirty cnode. + * @c: UBIFS file-system description object + * @nnode: nnode at which to start + * + * This function returns the first dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode) +{ + ubifs_assert(nnode); + while (1) { + int i, cont = 0; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_cnode *cnode; + + cnode = nnode->nbranch[i].cnode; + if (cnode && + test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; + nnode = (struct ubifs_nnode *)cnode; + cont = 1; + break; + } + } + if (!cont) + return (struct ubifs_cnode *)nnode; + } +} + +/** + * next_dirty_cnode - find next dirty cnode. 
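/*
 * Illustrative sketch, not part of the patch: the descent rule of
 * first_dirty_cnode() on a toy tree.  Starting from the root nnode it keeps
 * stepping into the first dirty child; the walk bottoms out either at a
 * dirty pnode (level 0) or at an nnode none of whose children are dirty,
 * which then becomes the first node to commit.  A fanout of 4 is assumed
 * here purely for illustration.
 */
struct toy_tnode {
	int level;                      /* 0 = pnode, >0 = nnode */
	int dirty;
	struct toy_tnode *child[4];     /* fanout assumed to be 4 in this sketch */
};

static struct toy_tnode *toy_first_dirty(struct toy_tnode *node)
{
	int i;

	while (node->level > 0) {
		struct toy_tnode *next = NULL;

		for (i = 0; i < 4; i++) {
			struct toy_tnode *ch = node->child[i];

			if (ch && ch->dirty) {
				next = ch;
				break;
			}
		}
		if (!next)
			return node;    /* no dirty child: commit this nnode first */
		node = next;
	}
	return node;                    /* reached a dirty pnode */
}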
+ * @cnode: cnode from which to begin searching + * + * This function returns the next dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode) +{ + struct ubifs_nnode *nnode; + int i; + + ubifs_assert(cnode); + nnode = cnode->parent; + if (!nnode) + return NULL; + for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) { + cnode = nnode->nbranch[i].cnode; + if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; /* cnode is a pnode */ + /* cnode is a nnode */ + return first_dirty_cnode((struct ubifs_nnode *)cnode); + } + } + return (struct ubifs_cnode *)nnode; +} + +/** + * get_cnodes_to_commit - create list of dirty cnodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of cnodes to commit. + */ +static int get_cnodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + int cnt = 0; + + if (!c->nroot) + return 0; + + if (!test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + c->lpt_cnext = first_dirty_cnode(c->nroot); + cnode = c->lpt_cnext; + if (!cnode) + return 0; + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); + __set_bit(COW_ZNODE, &cnode->flags); + cnext = next_dirty_cnode(cnode); + if (!cnext) { + cnode->cnext = c->lpt_cnext; + break; + } + cnode->cnext = cnext; + cnode = cnext; + cnt += 1; + } + dbg_cmt("committing %d cnodes", cnt); + dbg_lp("committing %d cnodes", cnt); + ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt); + return cnt; +} + +/** + * upd_ltab - update LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space to add + */ +static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d +%d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * alloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function finds the next empty LEB in the ltab starting from @lnum. If a + * an empty LEB is found it is returned in @lnum and the function returns %0. + * Otherwise the function returns -ENOSPC. Note however, that LPT is designed + * never to run out of space. + */ +static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + + for (i = 0; i < n; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + return -ENOSPC; +} + +/** + * layout_cnodes - layout cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
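/*
 * Illustrative sketch, not part of the patch: the wrap-around search used by
 * alloc_lpt_leb() (and mirrored later by realloc_lpt_leb()), reduced to an
 * array of "usable" flags.  The search starts just after the previously
 * returned slot so successive calls hand out distinct LEBs.
 */
static int wraparound_find(const int *usable, int nslots, int prev)
{
	int i, start = prev + 1;

	for (i = start; i < nslots; i++)
		if (usable[i])
			return i;
	for (i = 0; i < start; i++)
		if (usable[i])
			return i;
	return -1;      /* the real code returns -ENOSPC here */
}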
+ */ +static int layout_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, alen, done_lsave, done_ltab, err; + struct ubifs_cnode *cnode; + + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + do { + if (cnode->level) { + len = c->nnode_sz; + c->dirty_nn_cnt -= 1; + } else { + len = c->pnode_sz; + c->dirty_pn_cnt -= 1; + } + while (offs + len > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + continue; + } + if (!done_ltab) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + continue; + } + break; + } + if (cnode->parent) { + cnode->parent->nbranch[cnode->iip].lnum = lnum; + cnode->parent->nbranch[cnode->iip].offs = offs; + } else { + c->lpt_lnum = lnum; + c->lpt_offs = offs; + } + offs += len; + dbg_chk_lpt_sz(c, 1, len); + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; + return 0; + +no_space: + ubifs_err("LPT out of space"); + dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return err; +} + +/** + * realloc_lpt_leb - allocate an LPT LEB that is empty. 
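/*
 * Illustrative sketch, not part of the patch: the space accounting done when
 * layout_cnodes() abandons the tail of a LEB.  The write head can only stop
 * on a min_io_size boundary, so the bytes between the last node and that
 * boundary become padding (dirty space) and only the rest of the LEB stays
 * free.  Assumes min_io_size is a power of two, as the kernel ALIGN() does.
 */
#define SKETCH_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static void account_abandoned_tail(int offs, int min_io_size, int leb_size,
				   int *free, int *dirty)
{
	int alen = SKETCH_ALIGN(offs, min_io_size);

	*dirty = alen - offs;        /* padding up to the I/O boundary */
	*free = leb_size - alen;     /* what upd_ltab() records as free */
}
/* Example: offs = 1000, min_io_size = 512 -> alen = 1024, dirty = 24. */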
+ * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function duplicates exactly the results of the function alloc_lpt_leb. + * It is used during end commit to reallocate the same LEB numbers that were + * allocated by alloc_lpt_leb during start commit. + * + * This function finds the next LEB that was allocated by the alloc_lpt_leb + * function starting from @lnum. If a LEB is found it is returned in @lnum and + * the function returns %0. Otherwise the function returns -ENOSPC. + * Note however, that LPT is designed never to run out of space. + */ +static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + + for (i = 0; i < n; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + return -ENOSPC; +} + +/** + * write_cnodes - write cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave; + struct ubifs_cnode *cnode; + void *buf = c->lpt_buf; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + from = offs; + /* Ensure empty LEB is unmapped */ + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + /* Loop for each cnode */ + do { + if (cnode->level) + len = c->nnode_sz; + else + len = c->pnode_sz; + while (offs + len > c->leb_size) { + wlen = offs - from; + if (wlen) { + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, + alen, UBI_SHORTTERM); + if (err) + return err; + } + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + continue; + } + if (!done_ltab) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + continue; + } + break; + } + if (cnode->level) + ubifs_pack_nnode(c, buf + offs, + (struct ubifs_nnode *)cnode); + else + ubifs_pack_pnode(c, buf + offs, + (struct ubifs_pnode *)cnode); + /* + * The reason for the barriers is the same as in case of TNC. + * See comment in 'write_index()'. 'dirty_cow_nnode()' and + * 'dirty_cow_pnode()' are the functions for which this is + * important. 
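/*
 * Illustrative sketch, not part of the patch: the flush-and-pad step that
 * write_cnodes() performs whenever the LEB-sized buffer can take no more
 * nodes.  Only the bytes accumulated since the last flush are written, and
 * the tail is padded with 0xff (the erased-flash value) up to a min_io_size
 * multiple.  The I/O callback is a stand-in for ubifs_leb_write().
 */
#include <string.h>

static int flush_lpt_buf(char *buf, int from, int offs, int min_io_size,
			 int (*leb_write)(char *data, int offs, int len))
{
	int wlen = offs - from;                                 /* new bytes to write */
	int alen = (wlen + min_io_size - 1) / min_io_size * min_io_size;

	if (!wlen)
		return 0;                                       /* nothing buffered */
	memset(buf + offs, 0xff, alen - wlen);                  /* pad with erased value */
	return leb_write(buf + from, from, alen);
}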
+ */ + clear_bit(DIRTY_CNODE, &cnode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &cnode->flags); + smp_mb__after_clear_bit(); + offs += len; + dbg_chk_lpt_sz(c, 1, len); + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + /* Write remaining data in buffer */ + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + if (err) + return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(offs, c->min_io_size); + + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; + +no_space: + ubifs_err("LPT out of space mismatch"); + dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return err; +} + +/** + * next_pnode_to_dirty - find next pnode to dirty. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * This function returns the next pnode to dirty or %NULL if there are no more + * pnodes. Note that pnodes that have never been written (lnum == 0) are + * skipped. 
+ */ +static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_nnode *nnode; + int iip; + + /* Try to go right */ + nnode = pnode->parent; + for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + return ubifs_get_pnode(c, nnode, iip); + } + + /* Go up while can't go right */ + do { + iip = nnode->iip + 1; + nnode = nnode->parent; + if (!nnode) + return NULL; + for (; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + } while (iip >= UBIFS_LPT_FANOUT); + + /* Go right */ + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + + /* Go down to level 1 */ + while (nnode->level > 1) { + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + if (iip >= UBIFS_LPT_FANOUT) { + /* + * Should not happen, but we need to keep going + * if it does. + */ + iip = 0; + } + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + } + + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) + if (nnode->nbranch[iip].lnum) + break; + if (iip >= UBIFS_LPT_FANOUT) + /* Should not happen, but we need to keep going if it does */ + iip = 0; + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * pnode_lookup - lookup a pnode in the LPT. + * @c: UBIFS file-system description object + * @i: pnode number (0 to main_lebs - 1) + * + * This function returns a pointer to the pnode on success or a negative + * error code on failure. + */ +static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) +{ + int err, h, iip, shft; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + i <<= UBIFS_LPT_FANOUT_SHIFT; + nnode = c->nroot; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * do_make_pnode_dirty - mark a pnode dirty. + * @c: UBIFS file-system description object + * @pnode: pnode to mark dirty + */ +static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + struct ubifs_nnode *nnode; + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + /* Mark parent and ancestors dirty too */ + nnode = pnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } +} + +/** + * make_tree_dirty - mark the entire LEB properties tree dirty. + * @c: UBIFS file-system description object + * + * This function is used by the "small" LPT model to cause the entire LEB + * properties tree to be written. The "small" LPT model does not use LPT + * garbage collection because it is more efficient to write the entire tree + * (because it is small). 
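/*
 * Illustrative sketch, not part of the patch: the "dirty the whole ancestor
 * chain" rule used by do_make_pnode_dirty() (and by make_nnode_dirty()
 * further down).  The walk stops as soon as it meets an ancestor that is
 * already dirty, because that ancestor's own ancestors must already be
 * dirty too.  Toy types only; the real code also adds dirty-space accounting
 * at each step.
 */
struct toy_cnode {
	struct toy_cnode *parent;
	int dirty;
};

static void toy_mark_dirty_upwards(struct toy_cnode *node)
{
	while (node && !node->dirty) {
		node->dirty = 1;
		node = node->parent;
	}
}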
+ * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_tree_dirty(struct ubifs_info *c) +{ + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, 0); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + + while (pnode) { + do_make_pnode_dirty(c, pnode); + pnode = next_pnode_to_dirty(c, pnode); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + } + return 0; +} + +/** + * need_write_all - determine if the LPT area is running out of free space. + * @c: UBIFS file-system description object + * + * This function returns %1 if the LPT area is running out of free space and %0 + * if it is not. + */ +static int need_write_all(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + free += c->leb_size; + } + /* Less than twice the size left */ + if (free <= c->lpt_sz * 2) + return 1; + return 0; +} + +/** + * lpt_tgc_start - start trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called during start commit to mark LPT LEBs for trivial GC. + */ +static void lpt_tgc_start(struct ubifs_info *c) +{ + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + continue; + if (c->ltab[i].dirty > 0 && + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { + c->ltab[i].tgc = 1; + c->ltab[i].free = c->leb_size; + c->ltab[i].dirty = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + } +} + +/** + * lpt_tgc_end - end trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called after the commit is completed (master node has been + * written) and un-maps LPT LEBs that were marked for trivial GC. + */ +static int lpt_tgc_end(struct ubifs_info *c) +{ + int i, err; + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].tgc) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + c->ltab[i].tgc = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + return 0; +} + +/** + * populate_lsave - fill the lsave array with important LEB numbers. + * @c: the UBIFS file-system description object + * + * This function is only called for the "big" model. It records a small number + * of LEB numbers of important LEBs. Important LEBs are ones that are (from + * most important to least important): empty, freeable, freeable index, dirty + * index, dirty or free. Upon mount, we read this list of LEB numbers and bring + * their pnodes into memory. That will stop us from having to scan the LPT + * straight away. For the "small" model we assume that scanning the LPT is no + * big deal. 
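/*
 * Illustrative sketch, not part of the patch: the free-space estimate behind
 * need_write_all().  A LEB counts towards the total if it is the current
 * write head, completely empty, or contains nothing but free and dirty
 * space; if less than two full copies of the LPT would fit into that total,
 * the whole tree is rewritten so old LEBs can be reclaimed.
 */
static int toy_need_write_all(const int *free, const int *dirty, int nlebs,
			      int head, int head_offs, int leb_size,
			      long long lpt_sz)
{
	long long total = 0;
	int i;

	for (i = 0; i < nlebs; i++) {
		if (i == head)
			total += leb_size - head_offs;
		else if (free[i] == leb_size ||
			 free[i] + dirty[i] == leb_size)
			total += leb_size;
	}
	return total <= lpt_sz * 2;     /* less than twice the LPT size left */
}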
+ */ +static void populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i, cnt = 0; + + ubifs_assert(c->big_lpt); + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + + if (dbg_populate_lsave(c)) + return; + + list_for_each_entry(lprops, &c->empty_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->freeable_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + /* Fill it up completely */ + while (cnt < c->lsave_cnt) + c->lsave[cnt++] = c->main_first; +} + +/** + * nnode_lookup - lookup a nnode in the LPT. + * @c: UBIFS file-system description object + * @i: nnode number + * + * This function returns a pointer to the nnode on success or a negative + * error code on failure. + */ +static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) +{ + int err, iip; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + while (1) { + iip = i & (UBIFS_LPT_FANOUT - 1); + i >>= UBIFS_LPT_FANOUT_SHIFT; + if (!i) + break; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return nnode; + } + return nnode; +} + +/** + * make_nnode_dirty - find a nnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: nnode number of nnode to make dirty + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_nnode *nnode; + + nnode = nnode_lookup(c, node_num); + if (IS_ERR(nnode)) + return PTR_ERR(nnode); + if (nnode->parent) { + struct ubifs_nbranch *branch; + + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; /* nnode is obsolete */ + } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) + return 0; /* nnode is obsolete */ + /* Assumes cnext list is empty i.e. 
not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + /* Mark parent and ancestors dirty too */ + nnode = nnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } + return 0; +} + +/** + * make_pnode_dirty - find a pnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: pnode number of pnode to make dirty + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + pnode = pnode_lookup(c, node_num); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; + do_make_pnode_dirty(c, pnode); + return 0; +} + +/** + * make_ltab_dirty - make ltab node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where ltab was written + * @offs: offset where ltab was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 0; /* This ltab node is obsolete */ + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + return 0; +} + +/** + * make_lsave_dirty - make lsave node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where lsave was written + * @offs: offset where lsave was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 0; /* This lsave node is obsolete */ + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + return 0; +} + +/** + * make_node_dirty - make node dirty. 
+ * @c: UBIFS file-system description object + * @node_type: LPT node type + * @node_num: node number + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, + int lnum, int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return make_nnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_PNODE: + return make_pnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_LTAB: + return make_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return make_lsave_dirty(c, lnum, offs); + } + return -EINVAL; +} + +/** + * get_lpt_node_len - return the length of a node based on its type. + * @c: UBIFS file-system description object + * @node_type: LPT node type + */ +static int get_lpt_node_len(const struct ubifs_info *c, int node_type) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return c->nnode_sz; + case UBIFS_LPT_PNODE: + return c->pnode_sz; + case UBIFS_LPT_LTAB: + return c->ltab_sz; + case UBIFS_LPT_LSAVE: + return c->lsave_sz; + } + return 0; +} + +/** + * get_pad_len - return the length of padding in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + */ +static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len) +{ + int offs, pad_len; + + if (c->min_io_size == 1) + return 0; + offs = c->leb_size - len; + pad_len = ALIGN(offs, c->min_io_size) - offs; + return pad_len; +} + +/** + * get_lpt_node_type - return type (and node number) of a node in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @node_num: node number is returned here + */ +static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf, + int *node_num) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + return node_type; +} + +/** + * is_a_node - determine if a buffer contains a node. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + * + * This function returns %1 if the buffer contains a node or %0 if it does not. + */ +static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type, node_len; + uint16_t crc, calc_crc; + + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + if (node_type == UBIFS_LPT_NOT_A_NODE) + return 0; + node_len = get_lpt_node_len(c, node_type); + if (!node_len || node_len > len) + return 0; + pos = 0; + addr = buf; + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + node_len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) + return 0; + return 1; +} + +/** + * lpt_gc_lnum - garbage collect a LPT LEB. 
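/*
 * Illustrative sketch, not part of the patch: how get_pad_len() recovers the
 * amount of padding in front of the next node while scanning a LEB.  "len"
 * is how many bytes are still unscanned, so leb_size - len is the current
 * offset; if that offset is not on a min_io_size boundary, the bytes up to
 * the boundary were written as padding and must be skipped.
 */
static int toy_pad_len(int leb_size, int len, int min_io_size)
{
	int offs = leb_size - len;      /* current offset within the LEB */
	int aligned;

	if (min_io_size == 1)
		return 0;               /* no padding on such media */
	aligned = (offs + min_io_size - 1) / min_io_size * min_io_size;
	return aligned - offs;          /* bytes of padding to skip */
}
/* Example: leb_size 131072, len 130500, min_io_size 2048 -> offs 572, pad 1476. */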
+ * @c: UBIFS file-system description object + * @lnum: LEB number to garbage collect + * + * LPT garbage collection is used only for the "big" LPT model + * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes + * in the LEB being garbage-collected as dirty. The dirty nodes are written + * next commit, after which the LEB is free to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_gc_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->lpt_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + continue; + } + return 0; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + offs = c->leb_size - len; + ubifs_assert(node_len != 0); + mutex_lock(&c->lp_mutex); + err = make_node_dirty(c, node_type, node_num, lnum, offs); + mutex_unlock(&c->lp_mutex); + if (err) + return err; + buf += node_len; + len -= node_len; + } + return 0; +} + +/** + * lpt_gc - LPT garbage collection. + * @c: UBIFS file-system description object + * + * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. + * Returns %0 on success and a negative error code on failure. + */ +static int lpt_gc(struct ubifs_info *c) +{ + int i, lnum = -1, dirty = 0; + + mutex_lock(&c->lp_mutex); + for (i = 0; i < c->lpt_lebs; i++) { + ubifs_assert(!c->ltab[i].tgc); + if (i + c->lpt_first == c->nhead_lnum || + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + continue; + if (c->ltab[i].dirty > dirty) { + dirty = c->ltab[i].dirty; + lnum = i + c->lpt_first; + } + } + mutex_unlock(&c->lp_mutex); + if (lnum == -1) + return -ENOSPC; + return lpt_gc_lnum(c, lnum); +} + +/** + * ubifs_lpt_start_commit - UBIFS commit starts. + * @c: the UBIFS file-system description object + * + * This function has to be called when UBIFS starts the commit operation. + * This function "freezes" all currently dirty LEB properties and does not + * change them anymore. Further changes are saved and tracked separately + * because they are not part of this commit. This function returns zero in case + * of success and a negative error code in case of failure. + */ +int ubifs_lpt_start_commit(struct ubifs_info *c) +{ + int err, cnt; + + dbg_lp(""); + + mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; + err = dbg_check_ltab(c); + if (err) + goto out; + + if (c->check_lpt_free) { + /* + * We ensure there is enough free space in + * ubifs_lpt_post_commit() by marking nodes dirty. That + * information is lost when we unmount, so we also need + * to check free space once after mounting also. 
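+	 * Note that 'lpt_gc()' locks c->lp_mutex itself, which is why the
+	 * mutex is dropped and re-taken around each call in the loop below.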
+ */ + c->check_lpt_free = 0; + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } + } + + lpt_tgc_start(c); + + if (!c->dirty_pn_cnt) { + dbg_cmt("no cnodes to commit"); + err = 0; + goto out; + } + + if (!c->big_lpt && need_write_all(c)) { + /* If needed, write everything */ + err = make_tree_dirty(c); + if (err) + goto out; + lpt_tgc_start(c); + } + + if (c->big_lpt) + populate_lsave(c); + + cnt = get_cnodes_to_commit(c); + ubifs_assert(cnt != 0); + + err = layout_cnodes(c); + if (err) + goto out; + + /* Copy the LPT's own lprops for end commit to write */ + memcpy(c->ltab_cmt, c->ltab, + sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY); + +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * free_obsolete_cnodes - free obsolete cnodes for commit end. + * @c: UBIFS file-system description object + */ +static void free_obsolete_cnodes(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + + cnext = c->lpt_cnext; + if (!cnext) + return; + do { + cnode = cnext; + cnext = cnode->cnext; + if (test_bit(OBSOLETE_CNODE, &cnode->flags)) + kfree(cnode); + else + cnode->cnext = NULL; + } while (cnext != c->lpt_cnext); + c->lpt_cnext = NULL; +} + +/** + * ubifs_lpt_end_commit - finish the commit operation. + * @c: the UBIFS file-system description object + * + * This function has to be called when the commit operation finishes. It + * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to + * the media. Returns zero in case of success and a negative error code in case + * of failure. + */ +int ubifs_lpt_end_commit(struct ubifs_info *c) +{ + int err; + + dbg_lp(""); + + if (!c->lpt_cnext) + return 0; + + err = write_cnodes(c); + if (err) + return err; + + mutex_lock(&c->lp_mutex); + free_obsolete_cnodes(c); + mutex_unlock(&c->lp_mutex); + + return 0; +} + +/** + * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. + * @c: UBIFS file-system description object + * + * LPT trivial GC is completed after a commit. Also LPT GC is done after a + * commit for the "big" LPT model. + */ +int ubifs_lpt_post_commit(struct ubifs_info *c) +{ + int err; + + mutex_lock(&c->lp_mutex); + err = lpt_tgc_end(c); + if (err) + goto out; + if (c->big_lpt) + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * first_nnode - find the first nnode in memory. + * @c: UBIFS file-system description object + * @hght: height of tree where nnode found is returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) +{ + struct ubifs_nnode *nnode; + int h, i, found; + + nnode = c->nroot; + *hght = 0; + if (!nnode) + return NULL; + for (h = 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * next_nnode - find the next nnode in memory. + * @c: UBIFS file-system description object + * @nnode: nnode from which to start. 
+ * @hght: height of tree where nnode is, is passed and returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *next_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode, int *hght) +{ + struct ubifs_nnode *parent; + int iip, h, i, found; + + parent = nnode->parent; + if (!parent) + return NULL; + if (nnode->iip == UBIFS_LPT_FANOUT - 1) { + *hght -= 1; + return parent; + } + for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + nnode = parent->nbranch[iip].nnode; + if (nnode) + break; + } + if (!nnode) { + *hght -= 1; + return parent; + } + for (h = *hght + 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * ubifs_lpt_free - free resources owned by the LPT. + * @c: UBIFS file-system description object + * @wr_only: free only resources used for writing + */ +void ubifs_lpt_free(struct ubifs_info *c, int wr_only) +{ + struct ubifs_nnode *nnode; + int i, hght; + + /* Free write-only things first */ + + free_obsolete_cnodes(c); /* Leftover from a failed commit */ + + vfree(c->ltab_cmt); + c->ltab_cmt = NULL; + vfree(c->lpt_buf); + c->lpt_buf = NULL; + kfree(c->lsave); + c->lsave = NULL; + + if (wr_only) + return; + + /* Now free the rest */ + + nnode = first_nnode(c, &hght); + while (nnode) { + for (i = 0; i < UBIFS_LPT_FANOUT; i++) + kfree(nnode->nbranch[i].nnode); + nnode = next_nnode(c, nnode, &hght); + } + for (i = 0; i < LPROPS_HEAP_CNT; i++) + kfree(c->lpt_heap[i].arr); + kfree(c->dirty_idx.arr); + kfree(c->nroot); + vfree(c->ltab); + kfree(c->lpt_nod_buf); +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. + * @buf: buffer + * @len: buffer length + */ +static int dbg_is_all_ff(uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + return 0; + return 1; +} + +/** + * dbg_is_nnode_dirty - determine if a nnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + */ +static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_nnode *nnode; + int hght; + + /* Entire tree is in memory so first_nnode / next_nnode are OK */ + nnode = first_nnode(c, &hght); + for (; nnode; nnode = next_nnode(c, nnode, &hght)) { + struct ubifs_nbranch *branch; + + cond_resched(); + if (nnode->parent) { + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } else { + if (c->lpt_lnum != lnum || c->lpt_offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } + } + return 1; +} + +/** + * dbg_is_pnode_dirty - determine if a pnode is dirty. 
+ * @c: the UBIFS file-system description object + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + */ +static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + int i, cnt; + + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + cond_resched(); + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &pnode->flags)) + return 1; + return 0; + } + return 1; +} + +/** + * dbg_is_ltab_dirty - determine if a ltab node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where ltab node was written + * @offs: offset where ltab node was written + */ +static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 1; + return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; +} + +/** + * dbg_is_lsave_dirty - determine if a lsave node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where lsave node was written + * @offs: offset where lsave node was written + */ +static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 1; + return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; +} + +/** + * dbg_is_node_dirty - determine if a node is dirty. + * @c: the UBIFS file-system description object + * @node_type: node type + * @lnum: LEB number where node was written + * @offs: offset where node was written + */ +static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, + int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return dbg_is_nnode_dirty(c, lnum, offs); + case UBIFS_LPT_PNODE: + return dbg_is_pnode_dirty(c, lnum, offs); + case UBIFS_LPT_LTAB: + return dbg_is_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return dbg_is_lsave_dirty(c, lnum, offs); + } + return 1; +} + +/** + * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. + * @c: the UBIFS file-system description object + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function returns %0 on success and a negative error code on failure. 
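+ *
+ * The whole LEB is read and parsed node by node: nodes which are dirty
+ * in memory and padding count towards dirty space, the trailing empty
+ * space counts as free, and both totals are compared with the ltab
+ * entry of this LEB.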
+ */ +static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; + int ret; + void *buf, *p; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for ltab checking"); + return 0; + } + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + goto out; + } + while (1) { + if (!is_a_node(c, p, len)) { + int i, pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + p += pad_len; + len -= pad_len; + dirty += pad_len; + continue; + } + if (!dbg_is_all_ff(p, len)) { + dbg_msg("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); + err = -EINVAL; + } + i = lnum - c->lpt_first; + if (len != c->ltab[i].free) { + dbg_msg("invalid free space in LEB %d " + "(free %d, expected %d)", + lnum, len, c->ltab[i].free); + err = -EINVAL; + } + if (dirty != c->ltab[i].dirty) { + dbg_msg("invalid dirty space in LEB %d " + "(dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); + err = -EINVAL; + } + goto out; + } + node_type = get_lpt_node_type(c, p, &node_num); + node_len = get_lpt_node_len(c, node_type); + ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); + if (ret == 1) + dirty += node_len; + p += node_len; + len -= node_len; + } + + err = 0; +out: + vfree(buf); + return err; +} + +/** + * dbg_check_ltab - check the free and dirty space in the ltab. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_ltab(struct ubifs_info *c) +{ + int lnum, err, i, cnt; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* Bring the entire tree into memory */ + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + cond_resched(); + } + + /* Check nodes */ + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); + if (err) + return err; + + /* Check each LEB */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + err = dbg_check_ltab_lnum(c, lnum); + if (err) { + dbg_err("failed at LEB %d", lnum); + return err; + } + } + + dbg_lp("succeeded"); + return 0; +} + +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + dbg_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: what to do + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. 
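+ *
+ * The check accumulates the number of bytes written and wasted during an
+ * LPT commit and, when the commit ends, verifies that the total does not
+ * exceed the maximum LPT size.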
+ * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + struct ubifs_debug_info *d = c->dbg; + long long chk_lpt_sz, lpt_sz; + int err = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + switch (action) { + case 0: + d->chk_lpt_sz = 0; + d->chk_lpt_sz2 = 0; + d->chk_lpt_lebs = 0; + d->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + dbg_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + dbg_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + d->chk_lpt_sz += len; + return 0; + case 2: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + d->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= d->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (d->chk_lpt_sz != chk_lpt_sz) { + dbg_err("LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz > c->lpt_sz) { + dbg_err("LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { + dbg_err("LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { + dbg_err("LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { + dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) { + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + } + d->chk_lpt_sz2 = d->chk_lpt_sz; + d->chk_lpt_sz = 0; + d->chk_lpt_wastage = 0; + d->chk_lpt_lebs = 0; + d->new_nhead_offs = len; + return err; + case 4: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + +/** + * dbg_dump_lpt_leb - dump an LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to dump + * + * This function dumps an LEB from LPT area. Nodes in this area are very + * different to nodes in the main area (e.g., they do not have common headers, + * they do not have 8-byte alignments, etc), so we have a separate function to + * dump LPT area LEBs. Note, LPT has to be locked by the caller. 
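+ *
+ * Each node found is printed with its LEB number, offset and type; for
+ * nnodes the lnum:offs pairs of all the branches are printed as well.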
+ */ +static void dump_lpt_leb(const struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf, *p; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to dump LPT"); + return; + } + + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + goto out; + } + while (1) { + offs = c->leb_size - len; + if (!is_a_node(c, p, len)) { + int pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", + lnum, offs, pad_len); + p += pad_len; + len -= pad_len; + continue; + } + if (len) + printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n", + lnum, offs, len); + break; + } + + node_type = get_lpt_node_type(c, p, &node_num); + switch (node_type) { + case UBIFS_LPT_PNODE: + { + node_len = c->pnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, pnode\n", + lnum, offs); + break; + } + case UBIFS_LPT_NNODE: + { + int i; + struct ubifs_nnode nnode; + + node_len = c->nnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, nnode, ", + lnum, offs); + err = ubifs_unpack_nnode(c, p, &nnode); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, + nnode.nbranch[i].offs); + if (i != UBIFS_LPT_FANOUT - 1) + printk(KERN_CONT ", "); + } + printk(KERN_CONT "\n"); + break; + } + case UBIFS_LPT_LTAB: + node_len = c->ltab_sz; + printk(KERN_DEBUG "LEB %d:%d, ltab\n", + lnum, offs); + break; + case UBIFS_LPT_LSAVE: + node_len = c->lsave_sz; + printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs); + break; + default: + ubifs_err("LPT node type %d not recognized", node_type); + goto out; + } + + p += node_len; + len -= node_len; + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); +out: + vfree(buf); + return; +} + +/** + * dbg_dump_lpt_lebs - dump LPT lebs. + * @c: UBIFS file-system description object + * + * This function dumps all LPT LEBs. The caller has to make sure the LPT is + * locked. + */ +void dbg_dump_lpt_lebs(const struct ubifs_info *c) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n", + current->pid); + for (i = 0; i < c->lpt_lebs; i++) + dump_lpt_leb(c, i + c->lpt_first); + printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n", + current->pid); +} + +/** + * dbg_populate_lsave - debugging version of 'populate_lsave()' + * @c: UBIFS file-system description object + * + * This is a debugging version for 'populate_lsave()' which populates lsave + * with random LEBs instead of useful LEBs, which is good for test coverage. + * Returns zero if lsave has not been populated (this debugging feature is + * disabled) an non-zero if lsave has been populated. 
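+ *
+ * When enabled, one call in four (on average) fills lsave with LEBs
+ * picked at random from the empty, freeable and frdi_idx lists and from
+ * the dirty index, dirty and free heaps.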
+ */ +static int dbg_populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (random32() & 3) + return 0; + + for (i = 0; i < c->lsave_cnt; i++) + c->lsave[i] = c->main_first; + + list_for_each_entry(lprops, &c->empty_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->freeable_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->frdi_idx_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + + return 1; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c new file mode 100644 index 00000000..278c2382 --- /dev/null +++ b/fs/ubifs/master.c @@ -0,0 +1,396 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements reading and writing the master node */ + +#include "ubifs.h" + +/** + * scan_for_master - search the valid master node. + * @c: UBIFS file-system description object + * + * This function scans the master node LEBs and search for the latest master + * node. Returns zero in case of success, %-EUCLEAN if there master area is + * corrupted and requires recovery, and a negative error code in case of + * failure. 
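+ *
+ * The master node is duplicated in two consecutive LEBs; the last copy
+ * found in the first LEB must match the last copy found in the second
+ * one, otherwise the area is considered corrupted and %-EUCLEAN is
+ * returned.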
+ */ +static int scan_for_master(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, offs = 0, nodes_cnt; + + lnum = UBIFS_MST_LNUM; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + nodes_cnt = sleb->nodes_cnt; + if (nodes_cnt > 0) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + if (snod->type != UBIFS_MST_NODE) + goto out_dump; + memcpy(c->mst_node, snod->node, snod->len); + offs = snod->offs; + } + ubifs_scan_destroy(sleb); + + lnum += 1; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + if (sleb->nodes_cnt != nodes_cnt) + goto out; + if (!sleb->nodes_cnt) + goto out; + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); + if (snod->type != UBIFS_MST_NODE) + goto out_dump; + if (snod->offs != offs) + goto out; + if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, + (void *)snod->node + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out; + c->mst_offs = offs; + ubifs_scan_destroy(sleb); + return 0; + +out: + ubifs_scan_destroy(sleb); + return -EUCLEAN; + +out_dump: + ubifs_err("unexpected node type %d master LEB %d:%d", + snod->type, lnum, snod->offs); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * validate_master - validate master node. + * @c: UBIFS file-system description object + * + * This function validates data which was read from master node. Returns zero + * if the data is all right and %-EINVAL if not. + */ +static int validate_master(const struct ubifs_info *c) +{ + long long main_sz; + int err; + + if (c->max_sqnum >= SQNUM_WATERMARK) { + err = 1; + goto out; + } + + if (c->cmt_no >= c->max_sqnum) { + err = 2; + goto out; + } + + if (c->highest_inum >= INUM_WATERMARK) { + err = 3; + goto out; + } + + if (c->lhead_lnum < UBIFS_LOG_LNUM || + c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs || + c->lhead_offs < 0 || c->lhead_offs >= c->leb_size || + c->lhead_offs & (c->min_io_size - 1)) { + err = 4; + goto out; + } + + if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first || + c->zroot.offs >= c->leb_size || c->zroot.offs & 7) { + err = 5; + goto out; + } + + if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len || + c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) { + err = 6; + goto out; + } + + if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) { + err = 7; + goto out; + } + + if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first || + c->ihead_offs % c->min_io_size || c->ihead_offs < 0 || + c->ihead_offs > c->leb_size || c->ihead_offs & 7) { + err = 8; + goto out; + } + + main_sz = (long long)c->main_lebs * c->leb_size; + if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { + err = 9; + goto out; + } + + if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last || + c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) { + err = 10; + goto out; + } + + if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last || + c->nhead_offs < 0 || c->nhead_offs % c->min_io_size || + c->nhead_offs > c->leb_size) { + err = 11; + goto out; + } + + if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last || + c->ltab_offs < 0 || + c->ltab_offs + c->ltab_sz > c->leb_size) { + err = 12; + goto out; + } + + if (c->big_lpt && (c->lsave_lnum < c->lpt_first || + c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 || + c->lsave_offs + c->lsave_sz > c->leb_size)) { + err = 13; + goto out; + } + + if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) { 
+ err = 14; + goto out; + } + + if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) { + err = 15; + goto out; + } + + if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) { + err = 16; + goto out; + } + + if (c->lst.total_free < 0 || c->lst.total_free > main_sz || + c->lst.total_free & 7) { + err = 17; + goto out; + } + + if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) { + err = 18; + goto out; + } + + if (c->lst.total_used < 0 || (c->lst.total_used & 7)) { + err = 19; + goto out; + } + + if (c->lst.total_free + c->lst.total_dirty + + c->lst.total_used > main_sz) { + err = 20; + goto out; + } + + if (c->lst.total_dead + c->lst.total_dark + + c->lst.total_used + c->bi.old_idx_sz > main_sz) { + err = 21; + goto out; + } + + if (c->lst.total_dead < 0 || + c->lst.total_dead > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dead & 7) { + err = 22; + goto out; + } + + if (c->lst.total_dark < 0 || + c->lst.total_dark > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dark & 7) { + err = 23; + goto out; + } + + return 0; + +out: + ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); + dbg_dump_node(c, c->mst_node); + return -EINVAL; +} + +/** + * ubifs_read_master - read master node. + * @c: UBIFS file-system description object + * + * This function finds and reads the master node during file-system mount. If + * the flash is empty, it creates default master node as well. Returns zero in + * case of success and a negative error code in case of failure. + */ +int ubifs_read_master(struct ubifs_info *c) +{ + int err, old_leb_cnt; + + c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!c->mst_node) + return -ENOMEM; + + err = scan_for_master(c); + if (err) { + if (err == -EUCLEAN) + err = ubifs_recover_master_node(c); + if (err) + /* + * Note, we do not free 'c->mst_node' here because the + * unmount routine will take care of this. 
+ */ + return err; + } + + /* Make sure that the recovery flag is clear */ + c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY); + + c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum); + c->highest_inum = le64_to_cpu(c->mst_node->highest_inum); + c->cmt_no = le64_to_cpu(c->mst_node->cmt_no); + c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum); + c->zroot.offs = le32_to_cpu(c->mst_node->root_offs); + c->zroot.len = le32_to_cpu(c->mst_node->root_len); + c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum); + c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); + c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); + c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); + c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); + c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); + c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); + c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs); + c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum); + c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs); + c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum); + c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs); + c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum); + c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs); + c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs); + old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt); + c->lst.total_free = le64_to_cpu(c->mst_node->total_free); + c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty); + c->lst.total_used = le64_to_cpu(c->mst_node->total_used); + c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); + c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + + c->calc_idx_sz = c->bi.old_idx_sz; + + if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) + c->no_orphs = 1; + + if (old_leb_cnt != c->leb_cnt) { + /* The file system has been resized */ + int growth = c->leb_cnt - old_leb_cnt; + + if (c->leb_cnt < old_leb_cnt || + c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("bad leb_cnt on master node"); + dbg_dump_node(c, c->mst_node); + return -EINVAL; + } + + dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); + c->lst.empty_lebs += growth; + c->lst.total_free += growth * (long long)c->leb_size; + c->lst.total_dark += growth * (long long)c->dark_wm; + + /* + * Reflect changes back onto the master node. N.B. the master + * node gets written immediately whenever mounting (or + * remounting) in read-write mode, so we do not need to write it + * here. + */ + c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt); + c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs); + c->mst_node->total_free = cpu_to_le64(c->lst.total_free); + c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark); + } + + err = validate_master(c); + if (err) + return err; + + err = dbg_old_index_check_init(c, &c->zroot); + + return err; +} + +/** + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * + * This function writes the master node. The caller has to take the + * @c->mst_mutex lock before calling this function. Returns zero in case of + * success and a negative error code in case of failure. The master node is + * written twice to enable recovery. 
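+ *
+ * Each new master node is written @c->mst_node_alsz bytes further into
+ * both LEBs; when the LEBs are full they are unmapped and writing
+ * restarts at offset zero.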
+ */ +int ubifs_write_master(struct ubifs_info *c) +{ + int err, lnum, offs, len; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + + lnum = UBIFS_MST_LNUM; + offs = c->mst_offs + c->mst_node_alsz; + len = UBIFS_MST_NODE_SZ; + + if (offs + UBIFS_MST_NODE_SZ > c->leb_size) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + offs = 0; + } + + c->mst_offs = offs; + c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); + + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + if (err) + return err; + + lnum += 1; + + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + + return err; +} diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h new file mode 100644 index 00000000..0b5296a9 --- /dev/null +++ b/fs/ubifs/misc.h @@ -0,0 +1,360 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains miscellaneous helper functions. + */ + +#ifndef __UBIFS_MISC_H__ +#define __UBIFS_MISC_H__ + +/** + * ubifs_zn_dirty - check if znode is dirty. + * @znode: znode to check + * + * This helper function returns %1 if @znode is dirty and %0 otherwise. + */ +static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) +{ + return !!test_bit(DIRTY_ZNODE, &znode->flags); +} + +/** + * ubifs_wake_up_bgt - wake up background thread. + * @c: UBIFS file-system description object + */ +static inline void ubifs_wake_up_bgt(struct ubifs_info *c) +{ + if (c->bgt && !c->need_bgt) { + c->need_bgt = 1; + wake_up_process(c->bgt); + } +} + +/** + * ubifs_tnc_find_child - find next child in znode. + * @znode: znode to search at + * @start: the zbranch index to start at + * + * This helper function looks for znode child starting at index @start. Returns + * the child or %NULL if no children were found. + */ +static inline struct ubifs_znode * +ubifs_tnc_find_child(struct ubifs_znode *znode, int start) +{ + while (start < znode->child_cnt) { + if (znode->zbranch[start].znode) + return znode->zbranch[start].znode; + start += 1; + } + + return NULL; +} + +/** + * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. + * @inode: the VFS 'struct inode' pointer + */ +static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) +{ + return container_of(inode, struct ubifs_inode, vfs_inode); +} + +/** + * ubifs_compr_present - check if compressor was compiled in. + * @compr_type: compressor type to check + * + * This function returns %1 of compressor of type @compr_type is present, and + * %0 if not. 
+ */ +static inline int ubifs_compr_present(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return !!ubifs_compressors[compr_type]->capi_name; +} + +/** + * ubifs_compr_name - get compressor name string by its type. + * @compr_type: compressor type + * + * This function returns compressor type string. + */ +static inline const char *ubifs_compr_name(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return ubifs_compressors[compr_type]->name; +} + +/** + * ubifs_wbuf_sync - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume + * that the write-buffer is already locked. + */ +static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) +{ + int err; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_leb_unmap - unmap an LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to unmap + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_write - write to a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @offs: offset within LEB to write to + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, + const void *buf, int offs, int len, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes at %d:%d, error %d", + len, lnum, offs, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_change - atomic LEB change. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, + const void *buf, int len, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d, error %d", + len, lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_encode_dev - encode device node IDs. + * @dev: UBIFS device node information + * @rdev: device IDs to encode + * + * This is a helper function which encodes major/minor numbers of a device node + * into UBIFS device node description. We use standard Linux "new" and "huge" + * encodings. + */ +static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) +{ + if (new_valid_dev(rdev)) { + dev->new = cpu_to_le32(new_encode_dev(rdev)); + return sizeof(dev->new); + } else { + dev->huge = cpu_to_le64(huge_encode_dev(rdev)); + return sizeof(dev->huge); + } +} + +/** + * ubifs_add_dirt - add dirty space to LEB properties. 
+ * @c: the UBIFS file-system description object + * @lnum: LEB to add dirty space for + * @dirty: dirty space to add + * + * This is a helper function which increased amount of dirty LEB space. Returns + * zero in case of success and a negative error code in case of failure. + */ +static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); +} + +/** + * ubifs_return_leb - return LEB to lprops. + * @c: the UBIFS file-system description object + * @lnum: LEB to return + * + * This helper function cleans the "taken" flag of a logical eraseblock in the + * lprops. Returns zero in case of success and a negative error code in case of + * failure. + */ +static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) +{ + return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); +} + +/** + * ubifs_idx_node_sz - return index node size. + * @c: the UBIFS file-system description object + * @child_cnt: number of children of this index node + */ +static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) +{ + return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; +} + +/** + * ubifs_idx_branch - return pointer to an index branch. + * @c: the UBIFS file-system description object + * @idx: index node + * @bnum: branch number + */ +static inline +struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, + const struct ubifs_idx_node *idx, + int bnum) +{ + return (struct ubifs_branch *)((void *)idx->branches + + (UBIFS_BRANCH_SZ + c->key_len) * bnum); +} + +/** + * ubifs_idx_key - return pointer to an index key. + * @c: the UBIFS file-system description object + * @idx: index node + */ +static inline void *ubifs_idx_key(const struct ubifs_info *c, + const struct ubifs_idx_node *idx) +{ + return (void *)((struct ubifs_branch *)idx->branches)->key; +} + +/** + * ubifs_current_time - round current time to time granularity. + * @inode: inode + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. 
+ */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + +/** + * ubifs_next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + * + * This helper function returns the log LEB number which goes next after LEB + * 'lnum'. + */ +static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + +#endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c new file mode 100644 index 00000000..a5422fff --- /dev/null +++ b/fs/ubifs/orphan.c @@ -0,0 +1,972 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Adrian Hunter + */ + +#include "ubifs.h" + +/* + * An orphan is an inode number whose inode node has been committed to the index + * with a link count of zero. That happens when an open file is deleted + * (unlinked) and then a commit is run. In the normal course of events the inode + * would be deleted when the file is closed. However in the case of an unclean + * unmount, orphans need to be accounted for. After an unclean unmount, the + * orphans' inodes must be deleted which means either scanning the entire index + * looking for them, or keeping a list on flash somewhere. This unit implements + * the latter approach. + * + * The orphan area is a fixed number of LEBs situated between the LPT area and + * the main area. The number of orphan area LEBs is specified when the file + * system is created. The minimum number is 1. The size of the orphan area + * should be so that it can hold the maximum number of orphans that are expected + * to ever exist at one time. + * + * The number of orphans that can fit in a LEB is: + * + * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) + * + * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. + * + * Orphans are accumulated in a rb-tree. When an inode's link count drops to + * zero, the inode number is added to the rb-tree. It is removed from the tree + * when the inode is deleted. Any new orphans that are in the orphan tree when + * the commit is run, are written to the orphan area in 1 or more orphan nodes. + * If the orphan area is full, it is consolidated to make space. There is + * always enough space because validation prevents the user from creating more + * than the maximum number of orphans allowed. + */ + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_orphans(struct ubifs_info *c); +#else +#define dbg_check_orphans(c) 0 +#endif + +/** + * ubifs_add_orphan - add an orphan. 
+ * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + orphan->new = 1; + + spin_lock(&c->orphan_lock); + if (c->tot_orphans >= c->max_orphans) { + spin_unlock(&c->orphan_lock); + kfree(orphan); + return -ENFILE; + } + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + dbg_err("orphaned twice"); + spin_unlock(&c->orphan_lock); + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + c->new_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + list_add_tail(&orphan->new_list, &c->orph_new); + spin_unlock(&c->orphan_lock); + dbg_gen("ino %lu", (unsigned long)inum); + return 0; +} + +/** + * ubifs_delete_orphan - delete an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Delete an orphan. This function is called when an inode is deleted. + */ +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + if (o->dnext) { + spin_unlock(&c->orphan_lock); + dbg_gen("deleted twice ino %lu", + (unsigned long)inum); + return; + } + if (o->cnext) { + o->dnext = c->orph_dnext; + c->orph_dnext = o; + spin_unlock(&c->orphan_lock); + dbg_gen("delete later ino %lu", + (unsigned long)inum); + return; + } + rb_erase(p, &c->orph_tree); + list_del(&o->list); + c->tot_orphans -= 1; + if (o->new) { + list_del(&o->new_list); + c->new_orphans -= 1; + } + spin_unlock(&c->orphan_lock); + kfree(o); + dbg_gen("inum %lu", (unsigned long)inum); + return; + } + } + spin_unlock(&c->orphan_lock); + dbg_err("missing orphan ino %lu", (unsigned long)inum); + dbg_dump_stack(); +} + +/** + * ubifs_orphan_start_commit - start commit of orphans. + * @c: UBIFS file-system description object + * + * Start commit of orphans. + */ +int ubifs_orphan_start_commit(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, **last; + + spin_lock(&c->orphan_lock); + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_new, new_list) { + ubifs_assert(orphan->new); + orphan->new = 0; + *last = orphan; + last = &orphan->cnext; + } + *last = orphan->cnext; + c->cmt_orphans = c->new_orphans; + c->new_orphans = 0; + dbg_cmt("%d orphans to commit", c->cmt_orphans); + INIT_LIST_HEAD(&c->orph_new); + if (c->tot_orphans == 0) + c->no_orphs = 1; + else + c->no_orphs = 0; + spin_unlock(&c->orphan_lock); + return 0; +} + +/** + * avail_orphs - calculate available space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in the + * available space. 
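+ *
+ * Each unused orphan LEB beyond the head holds
+ * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) inode numbers;
+ * the gap left in the head LEB is counted too if at least one more
+ * orphan node fits in it.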
+ */ +static int avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail, gap; + + avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + gap = c->leb_size - c->ohead_offs; + if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) + avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + return avail; +} + +/** + * tot_avail_orphs - calculate total space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in half + * the total space. That leaves half the space for adding new orphans. + */ +static int tot_avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail; + + avail_lebs = c->orph_lebs; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + return avail / 2; +} + +/** + * do_write_orph_node - write a node to the orphan head. + * @c: UBIFS file-system description object + * @len: length of node + * @atomic: write atomically + * + * This function writes a node to the orphan head from the orphan buffer. If + * %atomic is not zero, then the write is done atomically. On success, %0 is + * returned, otherwise a negative error code is returned. + */ +static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) +{ + int err = 0; + + if (atomic) { + ubifs_assert(c->ohead_offs == 0); + ubifs_prepare_node(c, c->orph_buf, len, 1); + len = ALIGN(len, c->min_io_size); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, + UBI_SHORTTERM); + } else { + if (c->ohead_offs == 0) { + /* Ensure LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->ohead_lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, + c->ohead_offs, UBI_SHORTTERM); + } + return err; +} + +/** + * write_orph_node - write an orphan node. + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function builds an orphan node from the cnext list and writes it to the + * orphan head. On success, %0 is returned, otherwise a negative error code + * is returned. + */ +static int write_orph_node(struct ubifs_info *c, int atomic) +{ + struct ubifs_orphan *orphan, *cnext; + struct ubifs_orph_node *orph; + int gap, err, len, cnt, i; + + ubifs_assert(c->cmt_orphans > 0); + gap = c->leb_size - c->ohead_offs; + if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) { + c->ohead_lnum += 1; + c->ohead_offs = 0; + gap = c->leb_size; + if (c->ohead_lnum > c->orph_last) { + /* + * We limit the number of orphans so that this should + * never happen. 
+ */ + ubifs_err("out of space in orphan area"); + return -EINVAL; + } + } + cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + if (cnt > c->cmt_orphans) + cnt = c->cmt_orphans; + len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64); + ubifs_assert(c->orph_buf); + orph = c->orph_buf; + orph->ch.node_type = UBIFS_ORPH_NODE; + spin_lock(&c->orphan_lock); + cnext = c->orph_cnext; + for (i = 0; i < cnt; i++) { + orphan = cnext; + orph->inos[i] = cpu_to_le64(orphan->inum); + cnext = orphan->cnext; + orphan->cnext = NULL; + } + c->orph_cnext = cnext; + c->cmt_orphans -= cnt; + spin_unlock(&c->orphan_lock); + if (c->cmt_orphans) + orph->cmt_no = cpu_to_le64(c->cmt_no); + else + /* Mark the last node of the commit */ + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); + ubifs_assert(c->ohead_offs + len <= c->leb_size); + ubifs_assert(c->ohead_lnum >= c->orph_first); + ubifs_assert(c->ohead_lnum <= c->orph_last); + err = do_write_orph_node(c, len, atomic); + c->ohead_offs += ALIGN(len, c->min_io_size); + c->ohead_offs = ALIGN(c->ohead_offs, 8); + return err; +} + +/** + * write_orph_nodes - write orphan nodes until there are no more to commit. + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function writes orphan nodes for all the orphans to commit. On success, + * %0 is returned, otherwise a negative error code is returned. + */ +static int write_orph_nodes(struct ubifs_info *c, int atomic) +{ + int err; + + while (c->cmt_orphans > 0) { + err = write_orph_node(c, atomic); + if (err) + return err; + } + if (atomic) { + int lnum; + + /* Unmap any unused LEBs after consolidation */ + lnum = c->ohead_lnum + 1; + for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + } + return 0; +} + +/** + * consolidate - consolidate the orphan area. + * @c: UBIFS file-system description object + * + * This function enables consolidation by putting all the orphans into the list + * to commit. The list is in the order that the orphans were added, and the + * LEBs are written atomically in order, so at no time can orphans be lost by + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int consolidate(struct ubifs_info *c) +{ + int tot_avail = tot_avail_orphs(c), err = 0; + + spin_lock(&c->orphan_lock); + dbg_cmt("there is space for %d orphans and there are %d", + tot_avail, c->tot_orphans); + if (c->tot_orphans - c->new_orphans <= tot_avail) { + struct ubifs_orphan *orphan, **last; + int cnt = 0; + + /* Change the cnext list to include all non-new orphans */ + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_list, list) { + if (orphan->new) + continue; + *last = orphan; + last = &orphan->cnext; + cnt += 1; + } + *last = orphan->cnext; + ubifs_assert(cnt == c->tot_orphans - c->new_orphans); + c->cmt_orphans = cnt; + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + } else { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err("out of space in orphan area"); + err = -EINVAL; + } + spin_unlock(&c->orphan_lock); + return err; +} + +/** + * commit_orphans - commit orphans. + * @c: UBIFS file-system description object + * + * This function commits orphans to flash. On success, %0 is returned, + * otherwise a negative error code is returned. 
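+ *
+ * If the orphans awaiting commit do not fit in the space left after the
+ * orphan head, the area is consolidated first and the orphan nodes are
+ * then written using atomic LEB changes.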
+ */ +static int commit_orphans(struct ubifs_info *c) +{ + int avail, atomic = 0, err; + + ubifs_assert(c->cmt_orphans > 0); + avail = avail_orphs(c); + if (avail < c->cmt_orphans) { + /* Not enough space to write new orphans, so consolidate */ + err = consolidate(c); + if (err) + return err; + atomic = 1; + } + err = write_orph_nodes(c, atomic); + return err; +} + +/** + * erase_deleted - erase the orphans marked for deletion. + * @c: UBIFS file-system description object + * + * During commit, the orphans being committed cannot be deleted, so they are + * marked for deletion and deleted by this function. Also, the recovery + * adds killed orphans to the deletion list, and therefore they are deleted + * here too. + */ +static void erase_deleted(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, *dnext; + + spin_lock(&c->orphan_lock); + dnext = c->orph_dnext; + while (dnext) { + orphan = dnext; + dnext = orphan->dnext; + ubifs_assert(!orphan->new); + rb_erase(&orphan->rb, &c->orph_tree); + list_del(&orphan->list); + c->tot_orphans -= 1; + dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); + kfree(orphan); + } + c->orph_dnext = NULL; + spin_unlock(&c->orphan_lock); +} + +/** + * ubifs_orphan_end_commit - end commit of orphans. + * @c: UBIFS file-system description object + * + * End commit of orphans. + */ +int ubifs_orphan_end_commit(struct ubifs_info *c) +{ + int err; + + if (c->cmt_orphans != 0) { + err = commit_orphans(c); + if (err) + return err; + } + erase_deleted(c); + err = dbg_check_orphans(c); + return err; +} + +/** + * ubifs_clear_orphans - erase all LEBs used for orphans. + * @c: UBIFS file-system description object + * + * If recovery is not required, then the orphans from the previous session + * are not needed. This function locates the LEBs used to record + * orphans, and un-maps them. + */ +int ubifs_clear_orphans(struct ubifs_info *c) +{ + int lnum, err; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + return 0; +} + +/** + * insert_dead_orphan - insert an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * This function is a helper to the 'do_kill_orphans()' function. The orphan + * must be kept until the next commit, so it is added to the rb-tree and the + * deletion list. + */ +static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + /* Already added - no problem */ + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + orphan->dnext = c->orph_dnext; + c->orph_dnext = orphan; + dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, + c->new_orphans, c->tot_orphans); + return 0; +} + +/** + * do_kill_orphans - remove orphan inodes from the index. 
+ * @c: UBIFS file-system description object + * @sleb: scanned LEB + * @last_cmt_no: cmt_no of last orphan node read is passed and returned here + * @outofdate: whether the LEB is out of date is returned here + * @last_flagged: whether the end orphan node is encountered + * + * This function is a helper to the 'kill_orphans()' function. It goes through + * every orphan node in a LEB and for every inode number recorded, removes + * all keys for that inode from the TNC. + */ +static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + unsigned long long *last_cmt_no, int *outofdate, + int *last_flagged) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + unsigned long long cmt_no; + ino_t inum; + int i, n, err, first = 1; + + list_for_each_entry(snod, &sleb->nodes, list) { + if (snod->type != UBIFS_ORPH_NODE) { + ubifs_err("invalid node type %d in orphan area at " + "%d:%d", snod->type, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + + orph = snod->node; + + /* Check commit number */ + cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX; + /* + * The commit number on the master node may be less, because + * of a failed commit. If there are several failed commits in a + * row, the commit number written on orphan nodes will continue + * to increase (because the commit number is adjusted here) even + * though the commit number on the master node stays the same + * because the master node has not been re-written. + */ + if (cmt_no > c->cmt_no) + c->cmt_no = cmt_no; + if (cmt_no < *last_cmt_no && *last_flagged) { + /* + * The last orphan node had a higher commit number and + * was flagged as the last written for that commit + * number. That makes this orphan node, out of date. + */ + if (!first) { + ubifs_err("out of order commit number %llu in " + "orphan node at %d:%d", + cmt_no, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + dbg_rcvry("out of date LEB %d", sleb->lnum); + *outofdate = 1; + return 0; + } + + if (first) + first = 0; + + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + dbg_rcvry("deleting orphaned inode %lu", + (unsigned long)inum); + err = ubifs_tnc_remove_ino(c, inum); + if (err) + return err; + err = insert_dead_orphan(c, inum); + if (err) + return err; + } + + *last_cmt_no = cmt_no; + if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) { + dbg_rcvry("last orph node for commit %llu at %d:%d", + cmt_no, sleb->lnum, snod->offs); + *last_flagged = 1; + } else + *last_flagged = 0; + } + + return 0; +} + +/** + * kill_orphans - remove all orphan inodes from the index. + * @c: UBIFS file-system description object + * + * If recovery is required, then orphan inodes recorded during the previous + * session (which ended with an unclean unmount) must be deleted from the index. + * This is done by updating the TNC, but since the index is not updated until + * the next commit, the LEBs where the orphan information is recorded are not + * erased until the next commit. + */ +static int kill_orphans(struct ubifs_info *c) +{ + unsigned long long last_cmt_no = 0; + int lnum, err = 0, outofdate = 0, last_flagged = 0; + + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) { + dbg_rcvry("no orphans"); + return 0; + } + /* + * Orph nodes always start at c->orph_first and are written to each + * successive LEB in turn. 
Generally unused LEBs will have been unmapped + * but may contain out of date orphan nodes if the unmap didn't go + * through. In addition, the last orphan node written for each commit is + * marked (top bit of orph->cmt_no is set to 1). It is possible that + * there are orphan nodes from the next commit (i.e. the commit did not + * complete successfully). In that case, no orphans will have been lost + * due to the way that orphans are written, and any orphans added will + * be valid orphans anyway and so can be deleted. + */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + dbg_rcvry("LEB %d", lnum); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) { + if (PTR_ERR(sleb) == -EUCLEAN) + sleb = ubifs_recover_leb(c, lnum, 0, + c->sbuf, -1); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + } + err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate, + &last_flagged); + if (err || outofdate) { + ubifs_scan_destroy(sleb); + break; + } + if (sleb->endpt) { + c->ohead_lnum = lnum; + c->ohead_offs = sleb->endpt; + } + ubifs_scan_destroy(sleb); + } + return err; +} + +/** + * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. + * @c: UBIFS file-system description object + * @unclean: indicates recovery from unclean unmount + * @read_only: indicates read only mount + * + * This function is called when mounting to erase orphans from the previous + * session. If UBIFS was not unmounted cleanly, then the inodes recorded as + * orphans are deleted. + */ +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) +{ + int err = 0; + + c->max_orphans = tot_avail_orphs(c); + + if (!read_only) { + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + } + + if (unclean) + err = kill_orphans(c); + else if (!read_only) + err = ubifs_clear_orphans(c); + + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +struct check_orphan { + struct rb_node rb; + ino_t inum; +}; + +struct check_info { + unsigned long last_ino; + unsigned long tot_inos; + unsigned long missing; + unsigned long long leaf_cnt; + struct ubifs_ino_node *node; + struct rb_root root; +}; + +static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + spin_unlock(&c->orphan_lock); + return 1; + } + } + spin_unlock(&c->orphan_lock); + return 0; +} + +static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &root->rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct check_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + kfree(orphan); + return 0; + } + } + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, root); + return 0; +} + +static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *o; + struct rb_node *p; + + p = root->rb_node; + while (p) { + o = rb_entry(p, struct check_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else + return 1; + } + return 0; +} + 
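The debug helpers above ('dbg_ins_check_orphan()' and 'dbg_find_check_orphan()') follow the usual pattern for keyed tree insertion: walk down the tree comparing inode numbers, treat an already-present key as success, and link the new node at the point where the walk fell off the tree. The self-contained sketch below shows the same pattern with a plain, unbalanced binary tree standing in for the kernel rb-tree; it is an illustration only, not the kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct node {
        uint64_t inum;
        struct node *left, *right;
};

/* Insert @inum; a duplicate is silently accepted, as in the debug code. */
static int insert(struct node **root, uint64_t inum)
{
        struct node **p = root, *n;

        while (*p) {
                if (inum < (*p)->inum)
                        p = &(*p)->left;
                else if (inum > (*p)->inum)
                        p = &(*p)->right;
                else
                        return 0;
        }
        n = calloc(1, sizeof(*n));
        if (!n)
                return -1;
        n->inum = inum;
        *p = n;                 /* link where the walk stopped */
        return 0;
}

static int find(const struct node *root, uint64_t inum)
{
        while (root) {
                if (inum < root->inum)
                        root = root->left;
                else if (inum > root->inum)
                        root = root->right;
                else
                        return 1;
        }
        return 0;
}

int main(void)
{
        struct node *root = NULL;   /* leaked at exit to keep the sketch short */

        insert(&root, 42);
        insert(&root, 7);
        insert(&root, 42);          /* duplicate - silently accepted */
        printf("7 found: %d, 13 found: %d\n", find(root, 7), find(root, 13));
        return 0;
}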
+static void dbg_free_check_tree(struct rb_root *root) +{ + struct rb_node *this = root->rb_node; + struct check_orphan *o; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + o = rb_entry(this, struct check_orphan, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &o->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(o); + } +} + +static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + struct check_info *ci = priv; + ino_t inum; + int err; + + inum = key_inum(c, &zbr->key); + if (inum != ci->last_ino) { + /* Lowest node type is the inode node, so it comes first */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + ubifs_err("found orphan node ino %lu, type %d", + (unsigned long)inum, key_type(c, &zbr->key)); + ci->last_ino = inum; + ci->tot_inos += 1; + err = ubifs_tnc_read_node(c, zbr, ci->node); + if (err) { + ubifs_err("node read failed, error %d", err); + return err; + } + if (ci->node->nlink == 0) + /* Must be recorded as an orphan */ + if (!dbg_find_check_orphan(&ci->root, inum) && + !dbg_find_orphan(c, inum)) { + ubifs_err("missing orphan, ino %lu", + (unsigned long)inum); + ci->missing += 1; + } + } + ci->leaf_cnt += 1; + return 0; +} + +static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + ino_t inum; + int i, n, err; + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + if (snod->type != UBIFS_ORPH_NODE) + continue; + orph = snod->node; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + err = dbg_ins_check_orphan(&ci->root, inum); + if (err) + return err; + } + } + return 0; +} + +static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) +{ + int lnum, err = 0; + void *buf; + + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) + return 0; + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to check orphans"); + return 0; + } + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + + err = dbg_read_orphans(ci, sleb); + ubifs_scan_destroy(sleb); + if (err) + break; + } + + vfree(buf); + return err; +} + +static int dbg_check_orphans(struct ubifs_info *c) +{ + struct check_info ci; + int err; + + if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + return 0; + + ci.last_ino = 0; + ci.tot_inos = 0; + ci.missing = 0; + ci.leaf_cnt = 0; + ci.root = RB_ROOT; + ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ci.node) { + ubifs_err("out of memory"); + return -ENOMEM; + } + + err = dbg_scan_orphans(c, &ci); + if (err) + goto out; + + err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); + if (err) { + ubifs_err("cannot scan TNC, error %d", err); + goto out; + } + + if (ci.missing) { + ubifs_err("%lu missing orphan(s)", ci.missing); + err = -EINVAL; + goto out; + } + + dbg_cmt("last inode number is %lu", ci.last_ino); + dbg_cmt("total number of inodes is %lu", ci.tot_inos); + dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); + +out: + dbg_free_check_tree(&ci.root); + kfree(ci.node); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c 
b/fs/ubifs/recovery.c new file mode 100644 index 00000000..783d8e0b --- /dev/null +++ b/fs/ubifs/recovery.c @@ -0,0 +1,1567 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions needed to recover from unclean un-mounts. + * When UBIFS is mounted, it checks a flag on the master node to determine if + * an un-mount was completed successfully. If not, the process of mounting + * incorporates additional checking and fixing of on-flash data structures. + * UBIFS always cleans away all remnants of an unclean un-mount, so that + * errors do not accumulate. However UBIFS defers recovery if it is mounted + * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. + */ + +#include +#include +#include "ubifs.h" + +/** + * is_empty - determine whether a buffer is empty (contains all 0xff). + * @buf: buffer to clean + * @len: length of buffer + * + * This function returns %1 if the buffer is empty (contains all 0xff) otherwise + * %0 is returned. + */ +static int is_empty(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * first_non_ff - find offset of the first non-0xff byte. + * @buf: buffer to search in + * @len: length of buffer + * + * This function returns offset of the first non-0xff byte in @buf or %-1 if + * the buffer contains only 0xff bytes. + */ +static int first_non_ff(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return i; + return -1; +} + +/** + * get_master_node - get the last valid master node allowing for corruption. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number + * @pbuf: buffer containing the LEB read, is returned here + * @mst: master node, if found, is returned here + * @cor: corruption, if found, is returned here + * + * This function allocates a buffer, reads the LEB into it, and finds and + * returns the last valid master node allowing for one area of corruption. + * The corrupt area, if there is one, must be consistent with the assumption + * that it is the result of an unclean unmount while the master node was being + * written. Under those circumstances, it is valid to use the previously written + * master node. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, + struct ubifs_mst_node **mst, void **cor) +{ + const int sz = c->mst_node_alsz; + int err, offs, len; + void *sbuf, *buf; + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + if (err && err != -EBADMSG) + goto out_free; + + /* Find the first position that is definitely not a node */ + offs = 0; + buf = sbuf; + len = c->leb_size; + while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) { + struct ubifs_ch *ch = buf; + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + break; + offs += sz; + buf += sz; + len -= sz; + } + /* See if there was a valid master node before that */ + if (offs) { + int ret; + + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE && offs) { + /* Could have been corruption so check one place back */ + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE) + /* + * We accept only one area of corruption because + * we are assuming that it was caused while + * trying to write a master node. + */ + goto out_err; + } + if (ret == SCANNED_A_NODE) { + struct ubifs_ch *ch = buf; + + if (ch->node_type != UBIFS_MST_NODE) + goto out_err; + dbg_rcvry("found a master node at %d:%d", lnum, offs); + *mst = buf; + offs += sz; + buf += sz; + len -= sz; + } + } + /* Check for corruption */ + if (offs < c->leb_size) { + if (!is_empty(buf, min_t(int, len, sz))) { + *cor = buf; + dbg_rcvry("found corruption at %d:%d", lnum, offs); + } + offs += sz; + buf += sz; + len -= sz; + } + /* Check remaining empty space */ + if (offs < c->leb_size) + if (!is_empty(buf, len)) + goto out_err; + *pbuf = sbuf; + return 0; + +out_err: + err = -EINVAL; +out_free: + vfree(sbuf); + *mst = NULL; + *cor = NULL; + return err; +} + +/** + * write_rcvrd_mst_node - write recovered master node. + * @c: UBIFS file-system description object + * @mst: master node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_rcvrd_mst_node(struct ubifs_info *c, + struct ubifs_mst_node *mst) +{ + int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; + __le32 save_flags; + + dbg_rcvry("recovery"); + + save_flags = mst->flags; + mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); + + ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + if (err) + goto out; + err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + if (err) + goto out; +out: + mst->flags = save_flags; + return err; +} + +/** + * ubifs_recover_master_node - recover the master node. 
+ * @c: UBIFS file-system description object + * + * This function recovers the master node from corruption that may occur due to + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_master_node(struct ubifs_info *c) +{ + void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL; + struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst; + const int sz = c->mst_node_alsz; + int err, offs1, offs2; + + dbg_rcvry("recovery"); + + err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1); + if (err) + goto out_free; + + err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2); + if (err) + goto out_free; + + if (mst1) { + offs1 = (void *)mst1 - buf1; + if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && + (offs1 == 0 && !cor1)) { + /* + * mst1 was written by recovery at offset 0 with no + * corruption. + */ + dbg_rcvry("recovery recovery"); + mst = mst1; + } else if (mst2) { + offs2 = (void *)mst2 - buf2; + if (offs1 == offs2) { + /* Same offset, so must be the same */ + if (memcmp((void *)mst1 + UBIFS_CH_SZ, + (void *)mst2 + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out_err; + mst = mst1; + } else if (offs2 + sz == offs1) { + /* 1st LEB was written, 2nd was not */ + if (cor1) + goto out_err; + mst = mst1; + } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + /* 1st LEB was unmapped and written, 2nd not */ + if (cor1) + goto out_err; + mst = mst1; + } else + goto out_err; + } else { + /* + * 2nd LEB was unmapped and about to be written, so + * there must be only one master node in the first LEB + * and no corruption. + */ + if (offs1 != 0 || cor1) + goto out_err; + mst = mst1; + } + } else { + if (!mst2) + goto out_err; + /* + * 1st LEB was unmapped and about to be written, so there must + * be no room left in 2nd LEB. + */ + offs2 = (void *)mst2 - buf2; + if (offs2 + sz + sz <= c->leb_size) + goto out_err; + mst = mst2; + } + + ubifs_msg("recovered master node from LEB %d", + (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); + + memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); + + if (c->ro_mount) { + /* Read-only mode. Keep a copy for switching to rw mode */ + c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); + if (!c->rcvrd_mst_node) { + err = -ENOMEM; + goto out_free; + } + memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + + /* + * We had to recover the master node, which means there was an + * unclean reboot. However, it is possible that the master node + * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set. + * E.g., consider the following chain of events: + * + * 1. UBIFS was cleanly unmounted, so the master node is clean + * 2. UBIFS is being mounted R/W and starts changing the master + * node in the first (%UBIFS_MST_LNUM). A power cut happens, + * so this LEB ends up with some amount of garbage at the + * end. + * 3. UBIFS is being mounted R/O. We reach this place and + * recover the master node from the second LEB + * (%UBIFS_MST_LNUM + 1). But we cannot update the media + * because we are being mounted R/O. We have to defer the + * operation. + * 4. However, this master node (@c->mst_node) is marked as + * clean (since the step 1). And if we just return, the + * mount code will be confused and won't recover the master + * node when it is re-mounter R/W later. + * + * Thus, to force the recovery by marking the master node as + * dirty. 
+ */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + } else { + /* Write the recovered master node */ + c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; + err = write_rcvrd_mst_node(c, c->mst_node); + if (err) + goto out_free; + } + + vfree(buf2); + vfree(buf1); + + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to recover master node"); + if (mst1) { + dbg_err("dumping first master node"); + dbg_dump_node(c, mst1); + } + if (mst2) { + dbg_err("dumping second master node"); + dbg_dump_node(c, mst2); + } + vfree(buf2); + vfree(buf1); + return err; +} + +/** + * ubifs_write_rcvrd_mst_node - write the recovered master node. + * @c: UBIFS file-system description object + * + * This function writes the master node that was recovered during mounting in + * read-only mode and must now be written because we are remounting rw. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) +{ + int err; + + if (!c->rcvrd_mst_node) + return 0; + c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); + if (err) + return err; + kfree(c->rcvrd_mst_node); + c->rcvrd_mst_node = NULL; + return 0; +} + +/** + * is_last_write - determine if an offset was in the last write to a LEB. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @offs: offset to check + * + * This function returns %1 if @offs was in the last write to the LEB whose data + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. + */ +static int is_last_write(const struct ubifs_info *c, void *buf, int offs) +{ + int empty_offs, check_len; + uint8_t *p; + + /* + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. + */ + empty_offs = ALIGN(offs + 1, c->max_write_size); + check_len = c->leb_size - empty_offs; + p = buf + empty_offs - offs; + return is_empty(p, check_len); +} + +/** + * clean_buf - clean the data from an LEB sitting in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to clean + * @lnum: LEB number to clean + * @offs: offset from which to clean + * @len: length of buffer + * + * This function pads up to the next min_io_size boundary (if there is one) and + * sets empty space to all 0xff. @buf, @offs and @len are updated to the next + * @c->min_io_size boundary. + */ +static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, + int *offs, int *len) +{ + int empty_offs, pad_len; + + lnum = lnum; + dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); + + ubifs_assert(!(*offs & 7)); + empty_offs = ALIGN(*offs, c->min_io_size); + pad_len = empty_offs - *offs; + ubifs_pad(c, *buf, pad_len); + *offs += pad_len; + *buf += pad_len; + *len -= pad_len; + memset(*buf, 0xff, c->leb_size - empty_offs); +} + +/** + * no_more_nodes - determine if there are no more nodes in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @len: length of buffer + * @lnum: LEB number of the LEB from which @buf was read + * @offs: offset from which @buf was read + * + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. 
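The check performed by 'is_last_write()' above can be reproduced with a few lines of ordinary C: round the suspect offset up to the next write-unit boundary and require that everything after it is still erased (0xff). The sketch below uses a made-up 8-byte write unit and a 64-byte "LEB"; the real sizes come from @c->max_write_size and @c->leb_size.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of 2 */

/* Return 1 if every byte after the write unit containing @offs is 0xff. */
static int last_write(const uint8_t *leb, int leb_size, int offs, int wr_unit)
{
        int i, empty_offs = ALIGN_UP(offs + 1, wr_unit);

        for (i = empty_offs; i < leb_size; i++)
                if (leb[i] != 0xff)
                        return 0;
        return 1;
}

int main(void)
{
        uint8_t leb[64];

        memset(leb, 0xff, sizeof(leb));
        memset(leb, 0xab, 20);  /* pretend 20 bytes were written before a power cut */

        /* Offset 18 is inside the last write unit, offset 4 is not */
        printf("offs 18: %d, offs 4: %d\n",
               last_write(leb, sizeof(leb), 18, 8),
               last_write(leb, sizeof(leb), 4, 8));
        return 0;
}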
+ */ +static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, + int lnum, int offs) +{ + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); + + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; + } + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->max_write_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; +} + +/** + * fix_unclean_leb - fix an unclean LEB. + * @c: UBIFS file-system description object + * @sleb: scanned LEB information + * @start: offset where scan started + */ +static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int start) +{ + int lnum = sleb->lnum, endpt = start; + + /* Get the end offset of the last node we are keeping */ + if (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + + snod = list_entry(sleb->nodes.prev, + struct ubifs_scan_node, list); + endpt = snod->offs + snod->len; + } + + if (c->ro_mount && !c->remounting_rw) { + /* Add to recovery list */ + struct ubifs_unclean_leb *ucleb; + + dbg_rcvry("need to fix LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS); + if (!ucleb) + return -ENOMEM; + ucleb->lnum = lnum; + ucleb->endpt = endpt; + list_add_tail(&ucleb->list, &c->unclean_leb_list); + } else { + /* Write the fixed LEB back to flash */ + int err; + + dbg_rcvry("fixing LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + if (endpt == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } else { + int len = ALIGN(endpt, c->min_io_size); + + if (start) { + err = ubi_read(c->ubi, lnum, sleb->buf, 0, + start); + if (err) + return err; + } + /* Pad to min_io_size */ + if (len > endpt) { + int pad_len = len - ALIGN(endpt, 8); + + if (pad_len > 0) { + void *buf = sleb->buf + len - pad_len; + + ubifs_pad(c, buf, pad_len); + } + } + err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, + UBI_UNKNOWN); + if (err) + return err; + } + } + return 0; +} + +/** + * drop_last_group - drop the last group of nodes. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * group of nodes of the scanned LEB. + */ +static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) +{ + while (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + struct ubifs_ch *ch; + + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + ch = snod->node; + if (ch->group_type != UBIFS_IN_NODE_GROUP) + break; + + dbg_rcvry("dropping grouped node at %d:%d", + sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * drop_last_node - drop the last node. 
+ * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * @grouped: non-zero if whole group of nodes have to be dropped + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB. + */ +static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) +{ + struct ubifs_scan_node *snod; + + if (!list_empty(&sleb->nodes)) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + + dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * ubifs_recover_leb - scan and recover a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not + * belong to any journal head) + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is + * found, and a negative error code in case of failure. + */ +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int jhead) +{ + int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; + int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped; + struct ubifs_scan_leb *sleb; + void *buf = sbuf + offs; + + dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped); + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + ubifs_assert(len >= 8); + while (len >= 8) { + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + /* + * Scan quietly until there is an error from which we cannot + * recover + */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } else if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + } else if (ret == SCANNED_EMPTY_SPACE || + ret == SCANNED_GARBAGE || + ret == SCANNED_A_BAD_PAD_NODE || + ret == SCANNED_A_CORRUPT_NODE) { + dbg_rcvry("found corruption - %d", ret); + break; + } else { + dbg_err("unexpected return value %d", ret); + err = -EINVAL; + goto error; + } + } + + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { + if (!is_last_write(c, buf, offs)) + goto corrupted_rescan; + } else if (ret == SCANNED_A_CORRUPT_NODE) { + if (!no_more_nodes(c, buf, len, lnum, offs)) + goto corrupted_rescan; + } else if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) { + int corruption = first_non_ff(buf, len); + + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ + ubifs_err("corrupt empty space LEB %d:%d, corruption " + "starts at %d", lnum, offs, corruption); + /* Make sure we dump interesting non-0xFF data */ + offs += corruption; + buf += corruption; + goto corrupted; + } + } + + min_io_unit = round_down(offs, c->min_io_size); + if (grouped) + /* + * If nodes are grouped, always drop the incomplete group at + * the end. 
+ */ + drop_last_group(sleb, &offs); + + if (jhead == GCHD) { + /* + * If this LEB belongs to the GC head then while we are in the + * middle of the same min. I/O unit keep dropping nodes. So + * basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely + * with all the uncorrupted nodes which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the + * corruption starts B, and the previous min. I/O unit A. The + * below code tries to deal with a situation when half of B + * contains valid nodes or the end of a valid node, and the + * second half of B contains corrupted data or garbage. This + * means that UBIFS had been writing to B just before the power + * cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully + * and the other half not, but this is possible in our 'failure + * mode emulation' infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Why + * can't we just clean-up the second half of B by putting a + * padding node there? We can, and this works fine with one + * exception which was reproduced with power cut emulation + * testing and happens extremely rarely. + * + * Imagine the file-system is full, we run GC which starts + * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is + * the current GC head LEB). The @c->gc_lnum is -1, which means + * that GC will retain LEB X and will try to continue. Imagine + * that LEB X is currently the dirtiest LEB, and the amount of + * used space in LEB Y is exactly the same as amount of free + * space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to + * LEB Y. We are here trying to recover LEB Y which is the GC + * head LEB. We find the min. I/O unit B as described above. + * Then we clean-up LEB Y by padding min. I/O unit. And later + * 'ubifs_rcvry_gc_commit()' function fails, because it cannot + * find a dirty LEB which could be GC'd into LEB Y! Even LEB X + * does not match because the amount of valid nodes there does + * not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and + * analysed is that we cannot mount the file-system with + * -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we + * should free min. I/O unit B in LEB Y completely and the last + * used min. I/O unit in LEB Y should be A. This is basically + * what the below code tries to do. + */ + while (offs > min_io_unit) + drop_last_node(sleb, &offs); + } + + buf = sbuf + offs; + len = c->leb_size - offs; + + clean_buf(c, &buf, lnum, &offs, &len); + ubifs_end_scan(c, sleb, lnum, offs); + + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; + + return sleb; + +corrupted_rescan: + /* Re-scan the corrupted data with verbose messages */ + dbg_err("corruptio %d", ret); + ubifs_scan_a_node(c, buf, len, lnum, offs, 1); +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * get_cs_sqnum - get commit start sequence number. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number of commit start node + * @offs: offset of commit start node + * @cs_sqnum: commit start sequence number is returned here + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, + unsigned long long *cs_sqnum) +{ + struct ubifs_cs_node *cs_node = NULL; + int err, ret; + + dbg_rcvry("at %d:%d", lnum, offs); + cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); + if (!cs_node) + return -ENOMEM; + if (c->leb_size - offs < UBIFS_CS_NODE_SZ) + goto out_err; + err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + if (err && err != -EBADMSG) + goto out_free; + ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); + if (ret != SCANNED_A_NODE) { + dbg_err("Not a valid node"); + goto out_err; + } + if (cs_node->ch.node_type != UBIFS_CS_NODE) { + dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + goto out_err; + } + if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { + dbg_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); + goto out_err; + } + *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); + dbg_rcvry("commit start sqnum %llu", *cs_sqnum); + kfree(cs_node); + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to get CS sqnum"); + kfree(cs_node); + return err; +} + +/** + * ubifs_recover_log_leb - scan and recover a log LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by unclean reboots from which we are attempting to recover + * (assume that only the last log LEB can be corrupted by an unclean reboot). + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int next_lnum; + + dbg_rcvry("LEB %d", lnum); + next_lnum = lnum + 1; + if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) + next_lnum = UBIFS_LOG_LNUM; + if (next_lnum != c->ltail_lnum) { + /* + * We can only recover at the end of the log, so check that the + * next log LEB is empty or out of date. + */ + sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0); + if (IS_ERR(sleb)) + return sleb; + if (sleb->nodes_cnt) { + struct ubifs_scan_node *snod; + unsigned long long cs_sqnum = c->cs_sqnum; + + snod = list_entry(sleb->nodes.next, + struct ubifs_scan_node, list); + if (cs_sqnum == 0) { + int err; + + err = get_cs_sqnum(c, lnum, offs, &cs_sqnum); + if (err) { + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + } + } + if (snod->sqnum > cs_sqnum) { + ubifs_err("unrecoverable log corruption " + "in LEB %d", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(-EUCLEAN); + } + } + ubifs_scan_destroy(sleb); + } + return ubifs_recover_leb(c, lnum, offs, sbuf, -1); +} + +/** + * recover_head - recover a head. + * @c: UBIFS file-system description object + * @lnum: LEB number of head to recover + * @offs: offset of head to recover + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at a head location. + * + * This function returns %0 on success and a negative error code on failure. 
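As the doc comment for 'ubifs_recover_log_leb()' above notes, recovery is only attempted at the end of the log, so the function first examines the LEB that follows the head. The log occupies a fixed range of LEBs used as a ring, so "the next LEB" wraps back to the first log LEB; the sketch below shows only that index arithmetic, with arbitrary example numbers rather than UBIFS defaults.

#include <stdio.h>

/* LEB that follows @lnum in a log occupying @log_lebs LEBs from @log_first. */
static int next_log_leb(int lnum, int log_first, int log_lebs)
{
        int next = lnum + 1;

        if (next >= log_first + log_lebs)
                next = log_first;
        return next;
}

int main(void)
{
        int log_first = 3, log_lebs = 4;        /* example: log occupies LEBs 3..6 */
        int lnum;

        for (lnum = log_first; lnum < log_first + log_lebs; lnum++)
                printf("after LEB %d comes LEB %d\n",
                       lnum, next_log_leb(lnum, log_first, log_lebs));
        return 0;
}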
+ */ +static int recover_head(const struct ubifs_info *c, int lnum, int offs, + void *sbuf) +{ + int len = c->max_write_size, err; + + if (offs + len > c->leb_size) + len = c->leb_size - offs; + + if (!len) + return 0; + + /* Read at the head location and check it is empty flash */ + err = ubi_read(c->ubi, lnum, sbuf, offs, len); + if (err || !is_empty(sbuf, len)) { + dbg_rcvry("cleaning head at %d:%d", lnum, offs); + if (offs == 0) + return ubifs_leb_unmap(c, lnum); + err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + if (err) + return err; + return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + } + + return 0; +} + +/** + * ubifs_recover_inl_heads - recover index and LPT heads. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at the index and + * LPT head locations. + * + * This deals with the recovery of a half-completed journal commit. UBIFS is + * careful never to overwrite the last version of the index or the LPT. Because + * the index and LPT are wandering trees, data from a half-completed commit will + * not be referenced anywhere in UBIFS. The data will be either in LEBs that are + * assumed to be empty and will be unmapped anyway before use, or in the index + * and LPT heads. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +{ + int err; + + ubifs_assert(!c->ro_mount || c->remounting_rw); + + dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); + err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); + if (err) + return err; + + dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); + err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); + if (err) + return err; + + return 0; +} + +/** + * clean_an_unclean_leb - read and write a LEB to remove corruption. + * @c: UBIFS file-system description object + * @ucleb: unclean LEB information + * @sbuf: LEB-sized buffer to use + * + * This function reads a LEB up to a point pre-determined by the mount recovery, + * checks the nodes, and writes the result back to the flash, thereby cleaning + * off any following corruption, or non-fatal ECC errors. + * + * This function returns %0 on success and a negative error code on failure. 
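Both 'fix_unclean_leb()' earlier and 'clean_an_unclean_leb()' described above finish by padding the data they keep up to the next min_io_size boundary before atomically re-writing the LEB. The write length is the kept end point rounded up to min_io_size, and the padding starts at the end point rounded up to 8 bytes (node lengths are 8-byte aligned). A sketch of only that arithmetic; 16 is an example I/O unit, not a UBIFS constant.

#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of 2 */

int main(void)
{
        int min_io_size = 16;
        int endpt = 37;                         /* end of the last node being kept */
        int len = ALIGN_UP(endpt, min_io_size); /* bytes written back: 48 */
        int pad_len = len - ALIGN_UP(endpt, 8); /* padding bytes: 48 - 40 = 8 */

        printf("write %d bytes, %d of them padding starting at offset %d\n",
               len, pad_len, ALIGN_UP(endpt, 8));
        return 0;
}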
+ */ +static int clean_an_unclean_leb(const struct ubifs_info *c, + struct ubifs_unclean_leb *ucleb, void *sbuf) +{ + int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; + void *buf = sbuf; + + dbg_rcvry("LEB %d len %d", lnum, len); + + if (len == 0) { + /* Nothing to read, just unmap it */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + return 0; + } + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) + return err; + + while (len >= 8) { + int ret; + + cond_resched(); + + /* Scan quietly until there is an error */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + ubifs_err("unexpected empty space at %d:%d", + lnum, offs); + return -EUCLEAN; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + ubifs_scanned_corruption(c, lnum, offs, buf); + return -EUCLEAN; + } + + /* Pad to min_io_size */ + len = ALIGN(ucleb->endpt, c->min_io_size); + if (len > ucleb->endpt) { + int pad_len = len - ALIGN(ucleb->endpt, 8); + + if (pad_len > 0) { + buf = c->sbuf + len - pad_len; + ubifs_pad(c, buf, pad_len); + } + } + + /* Write back the LEB atomically */ + err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + if (err) + return err; + + dbg_rcvry("cleaned LEB %d", lnum); + + return 0; +} + +/** + * ubifs_clean_lebs - clean LEBs recovered during read-only mount. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function cleans a LEB identified during recovery that needs to be + * written but was not because UBIFS was mounted read-only. This happens when + * remounting to read-write mode. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +{ + dbg_rcvry("recovery"); + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + int err; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + err = clean_an_unclean_leb(c, ucleb, sbuf); + if (err) + return err; + list_del(&ucleb->list); + kfree(ucleb); + } + return 0; +} + +/** + * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. + * @c: UBIFS file-system description object + * + * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty + * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns + * zero in case of success and a negative error code in case of failure. + */ +static int grab_empty_leb(struct ubifs_info *c) +{ + int lnum, err; + + /* + * Note, it is very important to first search for an empty LEB and then + * run the commit, not vice-versa. The reason is that there might be + * only one empty LEB at the moment, the one which has been the + * @c->gc_lnum just before the power cut happened. During the regular + * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no + * one but GC can grab it. But at this moment this single empty LEB is + * not marked as taken, so if we run commit - what happens? Right, the + * commit will grab it and write the index there. 
Remember that the + * index always expands as long as there is free space, and it only + * starts consolidating when we run out of space. + * + * IOW, if we run commit now, we might not be able to find a free LEB + * after this. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); + return lnum; + } + + /* Reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + + c->gc_lnum = lnum; + dbg_rcvry("found empty LEB %d, run commit", lnum); + + return ubifs_run_commit(c); +} + +/** + * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. + * @c: UBIFS file-system description object + * + * Out-of-place garbage collection requires always one empty LEB with which to + * start garbage collection. The LEB number is recorded in c->gc_lnum and is + * written to the master node on unmounting. In the case of an unclean unmount + * the value of gc_lnum recorded in the master node is out of date and cannot + * be used. Instead, recovery must allocate an empty LEB for this purpose. + * However, there may not be enough empty space, in which case it must be + * possible to GC the dirtiest LEB into the GC head LEB. + * + * This function also runs the commit which causes the TNC updates from + * size-recovery and orphans to be written to the flash. That is important to + * ensure correct replay order for subsequent mounts. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_rcvry_gc_commit(struct ubifs_info *c) +{ + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + struct ubifs_lprops lp; + int err; + + dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); + + c->gc_lnum = -1; + if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) + return grab_empty_leb(c); + + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); + if (err) { + if (err != -ENOSPC) + return err; + + dbg_rcvry("could not find a dirty LEB"); + return grab_empty_leb(c); + } + + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + ubifs_assert(lp.free + lp.dirty >= wbuf->offs); + + /* + * We run the commit before garbage collection otherwise subsequent + * mounts will see the GC and orphan deletion in a different order. + */ + dbg_rcvry("committing"); + err = ubifs_run_commit(c); + if (err) + return err; + + dbg_rcvry("GC'ing LEB %d", lp.lnum); + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_garbage_collect_leb(c, &lp); + if (err >= 0) { + int err2 = ubifs_wbuf_sync_nolock(wbuf); + + if (err2) + err = err2; + } + mutex_unlock(&wbuf->io_mutex); + if (err < 0) { + dbg_err("GC failed, error %d", err); + if (err == -EAGAIN) + err = -EINVAL; + return err; + } + + ubifs_assert(err == LEB_RETAINED); + if (err != LEB_RETAINED) + return -EINVAL; + + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + + dbg_rcvry("allocated LEB %d for GC", lp.lnum); + return 0; +} + +/** + * struct size_entry - inode size information for recovery. + * @rb: link in the RB-tree of sizes + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + * @inode: inode if pinned in memory awaiting rw mode to fix it + */ +struct size_entry { + struct rb_node rb; + ino_t inum; + loff_t i_size; + loff_t d_size; + int exists; + struct inode *inode; +}; + +/** + * add_ino - add an entry to the size tree. 
+ * @c: UBIFS file-system description object + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + */ +static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, + loff_t d_size, int exists) +{ + struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; + struct size_entry *e; + + while (*p) { + parent = *p; + e = rb_entry(parent, struct size_entry, rb); + if (inum < e->inum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->inum = inum; + e->i_size = i_size; + e->d_size = d_size; + e->exists = exists; + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, &c->size_tree); + + return 0; +} + +/** + * find_ino - find an entry on the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) +{ + struct rb_node *p = c->size_tree.rb_node; + struct size_entry *e; + + while (p) { + e = rb_entry(p, struct size_entry, rb); + if (inum < e->inum) + p = p->rb_left; + else if (inum > e->inum) + p = p->rb_right; + else + return e; + } + return NULL; +} + +/** + * remove_ino - remove an entry from the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static void remove_ino(struct ubifs_info *c, ino_t inum) +{ + struct size_entry *e = find_ino(c, inum); + + if (!e) + return; + rb_erase(&e->rb, &c->size_tree); + kfree(e); +} + +/** + * ubifs_destroy_size_tree - free resources related to the size tree. + * @c: UBIFS file-system description object + */ +void ubifs_destroy_size_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->size_tree.rb_node; + struct size_entry *e; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + e = rb_entry(this, struct size_entry, rb); + if (e->inode) + iput(e->inode); + this = rb_parent(this); + if (this) { + if (this->rb_left == &e->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(e); + } + c->size_tree = RB_ROOT; +} + +/** + * ubifs_recover_size_accum - accumulate inode sizes for recovery. + * @c: UBIFS file-system description object + * @key: node key + * @deletion: node is for a deletion + * @new_size: inode size + * + * This function has two purposes: + * 1) to ensure there are no data nodes that fall outside the inode size + * 2) to ensure there are no data nodes for inodes that do not exist + * To accomplish those purposes, a rb-tree is constructed containing an entry + * for each inode number in the journal that has not been deleted, and recording + * the size from the inode node, the maximum size of any data node (also altered + * by truncations) and a flag indicating a inode number for which no inode node + * was present in the journal. + * + * Note that there is still the possibility that there are data nodes that have + * been committed that are beyond the inode size, however the only way to find + * them would be to scan the entire index. Alternatively, some provision could + * be made to record the size of inodes at the start of commit, which would seem + * very cumbersome for a scenario that is quite unlikely and the only negative + * consequence of which is wasted space. + * + * This functions returns %0 on success and a negative error code on failure. 
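The accumulation rules described above boil down to one update per key type: an inode node fixes the on-inode size and marks the inode as existing, a data node can only raise the maximum data size, and a truncation node sets it directly (deletions, which drop the entry altogether, are left out here). The struct and the key-type enum in this sketch are invented, simplified stand-ins for the kernel ones.

#include <stdio.h>

enum key_type { INO_KEY, DATA_KEY, TRUN_KEY };

struct size_entry {
        long long i_size;       /* size recorded on the inode node */
        long long d_size;       /* largest size implied by data nodes */
        int exists;             /* an inode node was seen in the journal */
};

static void accum(struct size_entry *e, enum key_type type, long long new_size)
{
        switch (type) {
        case INO_KEY:
                e->i_size = new_size;
                e->exists = 1;
                break;
        case DATA_KEY:
                if (new_size > e->d_size)
                        e->d_size = new_size;
                break;
        case TRUN_KEY:
                e->d_size = new_size;
                break;
        }
}

int main(void)
{
        struct size_entry e = { 0, 0, 0 };

        accum(&e, INO_KEY, 4096);       /* inode node claims 4096 bytes */
        accum(&e, DATA_KEY, 8192);      /* a data node ends at 8192 */
        accum(&e, DATA_KEY, 2048);      /* a smaller one changes nothing */

        printf("exists %d, i_size %lld, d_size %lld%s\n",
               e.exists, e.i_size, e.d_size,
               e.d_size > e.i_size ? " (size needs recovery)" : "");
        return 0;
}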
+ */ +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size) +{ + ino_t inum = key_inum(c, key); + struct size_entry *e; + int err; + + switch (key_type(c, key)) { + case UBIFS_INO_KEY: + if (deletion) + remove_ino(c, inum); + else { + e = find_ino(c, inum); + if (e) { + e->i_size = new_size; + e->exists = 1; + } else { + err = add_ino(c, inum, new_size, 0, 1); + if (err) + return err; + } + } + break; + case UBIFS_DATA_KEY: + e = find_ino(c, inum); + if (e) { + if (new_size > e->d_size) + e->d_size = new_size; + } else { + err = add_ino(c, inum, 0, new_size, 0); + if (err) + return err; + } + break; + case UBIFS_TRUN_KEY: + e = find_ino(c, inum); + if (e) + e->d_size = new_size; + break; + } + return 0; +} + +/** + * fix_size_in_place - fix inode size in place on flash. + * @c: UBIFS file-system description object + * @e: inode size information for recovery + */ +static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) +{ + struct ubifs_ino_node *ino = c->sbuf; + unsigned char *p; + union ubifs_key key; + int err, lnum, offs, len; + loff_t i_size; + uint32_t crc; + + /* Locate the inode node LEB number and offset */ + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs); + if (err) + goto out; + /* + * If the size recorded on the inode node is greater than the size that + * was calculated from nodes in the journal then don't change the inode. + */ + i_size = le64_to_cpu(ino->size); + if (i_size >= e->d_size) + return 0; + /* Read the LEB */ + err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + if (err) + goto out; + /* Change the size field and recalculate the CRC */ + ino = c->sbuf + offs; + ino->size = cpu_to_le64(e->d_size); + len = le32_to_cpu(ino->ch.len); + crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8); + ino->ch.crc = cpu_to_le32(crc); + /* Work out where data in the LEB ends and free space begins */ + p = c->sbuf; + len = c->leb_size - 1; + while (p[len] == 0xff) + len -= 1; + len = ALIGN(len + 1, c->min_io_size); + /* Atomically write the fixed LEB back again */ + err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + if (err) + goto out; + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", + (unsigned long)e->inum, lnum, offs, i_size, e->d_size); + return 0; + +out: + ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", + (unsigned long)e->inum, e->i_size, e->d_size, err); + return err; +} + +/** + * ubifs_recover_size - recover inode size. + * @c: UBIFS file-system description object + * + * This function attempts to fix inode size discrepancies identified by the + * 'ubifs_recover_size_accum()' function. + * + * This functions returns %0 on success and a negative error code on failure. 
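'fix_size_in_place()' above re-writes the whole LEB, so it first works out how much of the buffer actually holds data: it scans backwards over trailing 0xff bytes and rounds the result up to the min_io_size boundary. The same scan in isolation (buffer contents and sizes are made up, and an all-0xff guard is added that the kernel code does not need, since it knows a node is present):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of 2 */

/* Length of the used part of @buf, rounded up to @min_io_size. */
static int used_len(const uint8_t *buf, int size, int min_io_size)
{
        int len = size - 1;

        while (len >= 0 && buf[len] == 0xff)
                len -= 1;
        return ALIGN_UP(len + 1, min_io_size);
}

int main(void)
{
        uint8_t leb[128];

        memset(leb, 0xff, sizeof(leb));
        memset(leb, 0x5a, 70);          /* pretend 70 bytes of nodes were written */

        printf("write back %d bytes\n", used_len(leb, sizeof(leb), 16));
        return 0;
}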
+ */ +int ubifs_recover_size(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->size_tree); + + while (this) { + struct size_entry *e; + int err; + + e = rb_entry(this, struct size_entry, rb); + if (!e->exists) { + union ubifs_key key; + + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_lookup(c, &key, c->sbuf); + if (err && err != -ENOENT) + return err; + if (err == -ENOENT) { + /* Remove data nodes that have no inode */ + dbg_rcvry("removing ino %lu", + (unsigned long)e->inum); + err = ubifs_tnc_remove_ino(c, e->inum); + if (err) + return err; + } else { + struct ubifs_ino_node *ino = c->sbuf; + + e->exists = 1; + e->i_size = le64_to_cpu(ino->size); + } + } + + if (e->exists && e->i_size < e->d_size) { + if (c->ro_mount) { + /* Fix the inode size and pin it in memory */ + struct inode *inode; + struct ubifs_inode *ui; + + ubifs_assert(!e->inode); + + inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ui = ubifs_inode(inode); + if (inode->i_size < e->d_size) { + dbg_rcvry("ino %lu size %lld -> %lld", + (unsigned long)e->inum, + inode->i_size, e->d_size); + inode->i_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; + e->inode = inode; + this = rb_next(this); + continue; + } + iput(inode); + } else { + /* Fix the size in place */ + err = fix_size_in_place(c, e); + if (err) + return err; + if (e->inode) + iput(e->inode); + } + } + + this = rb_next(this); + rb_erase(&e->rb, &c->size_tree); + kfree(e); + } + + return 0; +} diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c new file mode 100644 index 00000000..5e97161c --- /dev/null +++ b/fs/ubifs/replay.c @@ -0,0 +1,1080 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains journal replay code. It runs when the file-system is being + * mounted and requires no locking. + * + * The larger is the journal, the longer it takes to scan it, so the longer it + * takes to mount UBIFS. This is why the journal has limited size which may be + * changed depending on the system requirements. But a larger journal gives + * faster I/O speed because it writes the index less frequently. So this is a + * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the + * larger is the journal, the more memory its index may consume. + */ + +#include "ubifs.h" +#include + +/** + * struct replay_entry - replay list entry. 
+ * @lnum: logical eraseblock number of the node + * @offs: node offset + * @len: node length + * @deletion: non-zero if this entry corresponds to a node deletion + * @sqnum: node sequence number + * @list: links the replay list + * @key: node key + * @nm: directory entry name + * @old_size: truncation old size + * @new_size: truncation new size + * + * The replay process first scans all buds and builds the replay list, then + * sorts the replay list in nodes sequence number order, and then inserts all + * the replay entries to the TNC. + */ +struct replay_entry { + int lnum; + int offs; + int len; + unsigned int deletion:1; + unsigned long long sqnum; + struct list_head list; + union ubifs_key key; + union { + struct qstr nm; + struct { + loff_t old_size; + loff_t new_size; + }; + }; +}; + +/** + * struct bud_entry - entry in the list of buds to replay. + * @list: next bud in the list + * @bud: bud description object + * @sqnum: reference node sequence number + * @free: free bytes in the bud + * @dirty: dirty bytes in the bud + */ +struct bud_entry { + struct list_head list; + struct ubifs_bud *bud; + unsigned long long sqnum; + int free; + int dirty; +}; + +/** + * set_bud_lprops - set free and dirty space used by a bud. + * @c: UBIFS file-system description object + * @b: bud entry which describes the bud + * + * This function makes sure the LEB properties of bud @b are set correctly + * after the replay. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) +{ + const struct ubifs_lprops *lp; + int err = 0, dirty; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + dirty = lp->dirty; + if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + /* + * The LEB was added to the journal with a starting offset of + * zero which means the LEB must have been empty. The LEB + * property values should be @lp->free == @c->leb_size and + * @lp->dirty == 0, but that is not the case. The reason is that + * the LEB had been garbage collected before it became the bud, + * and there was not commit inbetween. The garbage collector + * resets the free and dirty space without recording it + * anywhere except lprops, so if there was no commit then + * lprops does not have that information. + * + * We do not need to adjust free space because the scan has told + * us the exact value which is recorded in the replay entry as + * @b->free. + * + * However we do need to subtract from the dirty space the + * amount of space that the garbage collector reclaimed, which + * is the whole LEB minus the amount of space that was free. + */ + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, + lp->free, lp->dirty); + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, + lp->free, lp->dirty); + dirty -= c->leb_size - lp->free; + /* + * If the replay order was perfect the dirty space would now be + * zero. The order is not perfect because the journal heads + * race with each other. This is not a problem but is does mean + * that the dirty space may temporarily exceed c->leb_size + * during the replay. 
+ */ + if (dirty != 0) + dbg_msg("LEB %d lp: %d free %d dirty " + "replay: %d free %d dirty", b->bud->lnum, + lp->free, lp->dirty, b->free, b->dirty); + } + lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + /* Make sure the journal head points to the latest bud */ + err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, + b->bud->lnum, c->leb_size - b->free, + UBI_SHORTTERM); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * set_buds_lprops - set free and dirty space for all replayed buds. + * @c: UBIFS file-system description object + * + * This function sets LEB properties for all replayed buds. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int set_buds_lprops(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + + list_for_each_entry(b, &c->replay_buds, list) { + err = set_bud_lprops(c, b); + if (err) + return err; + } + + return 0; +} + +/** + * trun_remove_range - apply a replay entry for a truncation to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry of truncation + */ +static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) +{ + unsigned min_blk, max_blk; + union ubifs_key min_key, max_key; + ino_t ino; + + min_blk = r->new_size / UBIFS_BLOCK_SIZE; + if (r->new_size & (UBIFS_BLOCK_SIZE - 1)) + min_blk += 1; + + max_blk = r->old_size / UBIFS_BLOCK_SIZE; + if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0) + max_blk -= 1; + + ino = key_inum(c, &r->key); + + data_key_init(c, &min_key, ino, min_blk); + data_key_init(c, &max_key, ino, max_blk); + + return ubifs_tnc_remove_range(c, &min_key, &max_key); +} + +/** + * apply_replay_entry - apply a replay entry to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry to apply + * + * Apply a replay entry to the TNC. + */ +static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) +{ + int err; + + dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, + r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); + + /* Set c->replay_sqnum to help deal with dangling branches. */ + c->replay_sqnum = r->sqnum; + + if (is_hash_key(c, &r->key)) { + if (r->deletion) + err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); + else + err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, + r->len, &r->nm); + } else { + if (r->deletion) + switch (key_type(c, &r->key)) { + case UBIFS_INO_KEY: + { + ino_t inum = key_inum(c, &r->key); + + err = ubifs_tnc_remove_ino(c, inum); + break; + } + case UBIFS_TRUN_KEY: + err = trun_remove_range(c, r); + break; + default: + err = ubifs_tnc_remove(c, &r->key); + break; + } + else + err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs, + r->len); + if (err) + return err; + + if (c->need_recovery) + err = ubifs_recover_size_accum(c, &r->key, r->deletion, + r->new_size); + } + + return err; +} + +/** + * replay_entries_cmp - compare 2 replay entries. + * @priv: UBIFS file-system description object + * @a: first replay entry + * @b: second replay entry + * + * This is a comparison function for 'list_sort()' which compares 2 replay + * entries @a and @b by comparing their sequence numbers. Returns %1 if @a has + * greater sequence number and %-1 otherwise.
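trun_remove_range() above turns a truncation into a removal of the now-obsolete data-block keys, and the block arithmetic is easy to get wrong at the boundaries. Here is a minimal userspace sketch of just that calculation, assuming the usual 4096-byte UBIFS block size and invented file sizes: truncating from 20000 down to 5000 bytes removes blocks 2..4, while block 1, which still holds the new end of file, is kept.

#include <stdio.h>

#define BLOCK_SIZE 4096	/* UBIFS_BLOCK_SIZE */

/*
 * Inclusive range of data-block numbers whose nodes become obsolete when an
 * inode is truncated from old_size down to new_size, following the
 * arithmetic of trun_remove_range().
 */
static void trun_block_range(long long old_size, long long new_size,
			     unsigned *min_blk, unsigned *max_blk)
{
	*min_blk = new_size / BLOCK_SIZE;
	if (new_size & (BLOCK_SIZE - 1))
		*min_blk += 1;	/* block holding the new EOF is kept */

	*max_blk = old_size / BLOCK_SIZE;
	if ((old_size & (BLOCK_SIZE - 1)) == 0)
		*max_blk -= 1;	/* old EOF sat exactly on a block boundary */
}

int main(void)
{
	unsigned lo, hi;

	trun_block_range(20000, 5000, &lo, &hi);
	printf("remove data blocks %u..%u\n", lo, hi);	/* prints 2..4 */
	return 0;
}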
+ */ +static int replay_entries_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct replay_entry *ra, *rb; + + cond_resched(); + if (a == b) + return 0; + + ra = list_entry(a, struct replay_entry, list); + rb = list_entry(b, struct replay_entry, list); + ubifs_assert(ra->sqnum != rb->sqnum); + if (ra->sqnum > rb->sqnum) + return 1; + return -1; +} + +/** + * apply_replay_list - apply the replay list to the TNC. + * @c: UBIFS file-system description object + * + * Apply all entries in the replay list to the TNC. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int apply_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r; + int err; + + list_sort(c, &c->replay_list, &replay_entries_cmp); + + list_for_each_entry(r, &c->replay_list, list) { + cond_resched(); + + err = apply_replay_entry(c, r); + if (err) + return err; + } + + return 0; +} + +/** + * destroy_replay_list - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay list. + */ +static void destroy_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r, *tmp; + + list_for_each_entry_safe(r, tmp, &c->replay_list, list) { + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + list_del(&r->list); + kfree(r); + } +} + +/** + * insert_node - insert a node to the replay list + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * @old_size: truncation old size + * @new_size: truncation new size + * + * This function inserts a scanned non-direntry node to the replay list. The + * replay list contains @struct replay_entry elements, and we sort this list in + * sequence number order before applying it. The replay list is applied at the + * very end of the replay process. Since the list is sorted in sequence number + * order, the older modifications are applied first. This function returns zero + * in case of success and a negative error code in case of failure. + */ +static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, unsigned long long sqnum, + int deletion, int *used, loff_t old_size, + loff_t new_size) +{ + struct replay_entry *r; + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->deletion = !!deletion; + r->sqnum = sqnum; + key_copy(c, key, &r->key); + r->old_size = old_size; + r->new_size = new_size; + + list_add_tail(&r->list, &c->replay_list); + return 0; +} + +/** + * insert_dent - insert a directory entry node into the replay list. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @name: directory entry name + * @nlen: directory entry name length + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * + * This function inserts a scanned directory entry node or an extended + * attribute entry to the replay list. Returns zero in case of success and a + * negative error code in case of failure. 
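As the insert_node() comment above notes, entries are queued in scan order and only sorted by sequence number immediately before being applied, so that older modifications reach the TNC first. The following compact model shows that ordering step; it uses qsort() over an array purely as a stand-in for the kernel's list_sort() over the linked replay list, and the entry contents are invented.

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for struct replay_entry: only what ordering needs. */
struct entry {
	unsigned long long sqnum;
	const char *what;
};

/* Older nodes (smaller sqnum) must be applied to the index first. */
static int cmp_sqnum(const void *a, const void *b)
{
	const struct entry *ra = a, *rb = b;

	return (ra->sqnum > rb->sqnum) - (ra->sqnum < rb->sqnum);
}

int main(void)
{
	struct entry list[] = {
		{ 17, "dent \"foo\" -> ino 65" },
		{ 12, "ino 65 created"        },
		{ 23, "dent \"foo\" deleted"  },
	};
	int i, n = sizeof(list) / sizeof(list[0]);

	/* The kernel sorts a linked list with list_sort(); qsort models it. */
	qsort(list, n, sizeof(list[0]), cmp_sqnum);
	for (i = 0; i < n; i++)
		printf("apply sqnum %llu: %s\n", list[i].sqnum, list[i].what);
	return 0;
}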
+ */ +static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, const char *name, int nlen, + unsigned long long sqnum, int deletion, int *used) +{ + struct replay_entry *r; + char *nbuf; + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + nbuf = kmalloc(nlen + 1, GFP_KERNEL); + if (!nbuf) { + kfree(r); + return -ENOMEM; + } + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->deletion = !!deletion; + r->sqnum = sqnum; + key_copy(c, key, &r->key); + r->nm.len = nlen; + memcpy(nbuf, name, nlen); + nbuf[nlen] = '\0'; + r->nm.name = nbuf; + + list_add_tail(&r->list, &c->replay_list); + return 0; +} + +/** + * ubifs_validate_entry - validate directory or extended attribute entry node. + * @c: UBIFS file-system description object + * @dent: the node to validate + * + * This function validates directory or extended attribute entry node @dent. + * Returns zero if the node is all right and a %-EINVAL if not. + */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent) +{ + int key_type = key_type_flash(c, dent->key); + int nlen = le16_to_cpu(dent->nlen); + + if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || + dent->type >= UBIFS_ITYPES_CNT || + nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || + strnlen(dent->name, nlen) != nlen || + le64_to_cpu(dent->inum) > MAX_INUM) { + ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? + "directory entry" : "extended attribute entry"); + return -EINVAL; + } + + if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { + ubifs_err("bad key type %d", key_type); + return -EINVAL; + } + + return 0; +} + +/** + * is_last_bud - check if the bud is the last in the journal head. + * @c: UBIFS file-system description object + * @bud: bud description object + * + * This function checks if bud @bud is the last bud in its journal head. This + * information is then used by 'replay_bud()' to decide whether the bud can + * have corruptions or not. Indeed, only last buds can be corrupted by power + * cuts. Returns %1 if this is the last bud, and %0 if not. + */ +static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct ubifs_jhead *jh = &c->jheads[bud->jhead]; + struct ubifs_bud *next; + uint32_t data; + int err; + + if (list_is_last(&bud->list, &jh->buds_list)) + return 1; + + /* + * The following is a quirk to make sure we work correctly with UBIFS + * images used with older UBIFS. + * + * Normally, the last bud will be the last in the journal head's list + * of bud. However, there is one exception if the UBIFS image belongs + * to older UBIFS. This is fairly unlikely: one would need to use old + * UBIFS, then have a power cut exactly at the right point, and then + * try to mount this image with new UBIFS. + * + * The exception is: it is possible to have 2 buds A and B, A goes + * before B, and B is the last, bud B is contains no data, and bud A is + * corrupted at the end. The reason is that in older versions when the + * journal code switched the next bud (from A to B), it first added a + * log reference node for the new bud (B), and only after this it + * synchronized the write-buffer of current bud (A). 
But later this was + * changed and UBIFS started to always synchronize the write-buffer of + * the bud (A) before writing the log reference for the new bud (B). + * + * But because older UBIFS always synchronized A's write-buffer before + * writing to B, we can recognize this exceptional situation but + * checking the contents of bud B - if it is empty, then A can be + * treated as the last and we can recover it. + * + * TODO: remove this piece of code in a couple of years (today it is + * 16.05.2011). + */ + next = list_entry(bud->list.next, struct ubifs_bud, list); + if (!list_is_last(&next->list, &jh->buds_list)) + return 0; + + err = ubi_read(c->ubi, next->lnum, (char *)&data, + next->start, 4); + if (err) + return 0; + + return data == 0xFFFFFFFF; +} + +/** + * replay_bud - replay a bud logical eraseblock. + * @c: UBIFS file-system description object + * @b: bud entry which describes the bud + * + * This function replays bud @bud, recovers it if needed, and adds all nodes + * from this bud to the replay list. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int replay_bud(struct ubifs_info *c, struct bud_entry *b) +{ + int is_last = is_last_bud(c, b->bud); + int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + + dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", + lnum, b->bud->jhead, offs, is_last); + + if (c->need_recovery && is_last) + /* + * Recover only last LEBs in the journal heads, because power + * cuts may cause corruptions only in these LEBs, because only + * these LEBs could possibly be written to at the power cut + * time. + */ + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead); + else + sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + /* + * The bud does not have to start from offset zero - the beginning of + * the 'lnum' LEB may contain previously committed data. One of the + * things we have to do in replay is to correctly update lprops with + * newer information about this LEB. + * + * At this point lprops thinks that this LEB has 'c->leb_size - offs' + * bytes of free space because it only contain information about + * committed data. + * + * But we know that real amount of free space is 'c->leb_size - + * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and + * 'sleb->endpt' is used by bud data. We have to correctly calculate + * how much of these data are dirty and update lprops with this + * information. + * + * The dirt in that LEB region is comprised of padding nodes, deletion + * nodes, truncation nodes and nodes which are obsoleted by subsequent + * nodes in this LEB. So instead of calculating clean space, we + * calculate used space ('used' variable). 
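The comment above describes how lprops are corrected for a bud: within the replayed region, everything that is not occupied by live (non-deletion) nodes is dirt, and everything past the scan end point is free. The loop that follows accumulates the 'used' counter; the toy program below repeats the same bookkeeping with made-up node lengths and sizes so the three quantities can be seen side by side.

#include <stdio.h>

#define ALIGN8(x) (((x) + 7) & ~7)

/* Stand-in for a scanned node: only length and deletion flag matter here. */
struct node { int len; int deletion; };

int main(void)
{
	struct node nodes[] = { { 160, 0 }, { 56, 1 }, { 4096, 0 } };
	int leb_size = 131072, offs = 0, endpt = 8192;	/* invented values */
	int i, used = 0;

	/* Deletion nodes are immediately obsolete, so they count as dirt. */
	for (i = 0; i < 3; i++)
		if (!nodes[i].deletion)
			used += ALIGN8(nodes[i].len);

	printf("used %d, dirty %d, free %d\n",
	       used, endpt - offs - used, leb_size - endpt);
	return 0;
}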
+ */ + + list_for_each_entry(snod, &sleb->nodes, list) { + int deletion = 0; + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_INO_NODE: + { + struct ubifs_ino_node *ino = snod->node; + loff_t new_size = le64_to_cpu(ino->size); + + if (le32_to_cpu(ino->nlink) == 0) + deletion = 1; + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DATA_NODE: + { + struct ubifs_data_node *dn = snod->node; + loff_t new_size = le32_to_cpu(dn->size) + + key_block(c, &snod->key) * + UBIFS_BLOCK_SIZE; + + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + struct ubifs_dent_node *dent = snod->node; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + err = insert_dent(c, lnum, snod->offs, snod->len, + &snod->key, dent->name, + le16_to_cpu(dent->nlen), snod->sqnum, + !le64_to_cpu(dent->inum), &used); + break; + } + case UBIFS_TRUN_NODE: + { + struct ubifs_trun_node *trun = snod->node; + loff_t old_size = le64_to_cpu(trun->old_size); + loff_t new_size = le64_to_cpu(trun->new_size); + union ubifs_key key; + + /* Validate truncation node */ + if (old_size < 0 || old_size > c->max_inode_sz || + new_size < 0 || new_size > c->max_inode_sz || + old_size <= new_size) { + ubifs_err("bad truncation node"); + goto out_dump; + } + + /* + * Create a fake truncation key just to use the same + * functions which expect nodes to have keys. + */ + trun_key_init(c, &key, le32_to_cpu(trun->inum)); + err = insert_node(c, lnum, snod->offs, snod->len, + &key, snod->sqnum, 1, &used, + old_size, new_size); + break; + } + default: + ubifs_err("unexpected node type %d in bud LEB %d:%d", + snod->type, lnum, snod->offs); + err = -EINVAL; + goto out_dump; + } + if (err) + goto out; + } + + ubifs_assert(ubifs_search_bud(c, lnum)); + ubifs_assert(sleb->endpt - offs >= used); + ubifs_assert(sleb->endpt % c->min_io_size == 0); + + b->dirty = sleb->endpt - offs - used; + b->free = c->leb_size - sleb->endpt; + dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); + +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * replay_buds - replay all buds. + * @c: UBIFS file-system description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_buds(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + unsigned long long prev_sqnum = 0; + + list_for_each_entry(b, &c->replay_buds, list) { + err = replay_bud(c, b); + if (err) + return err; + + ubifs_assert(b->sqnum > prev_sqnum); + prev_sqnum = b->sqnum; + } + + return 0; +} + +/** + * destroy_bud_list - destroy the list of buds to replay. + * @c: UBIFS file-system description object + */ +static void destroy_bud_list(struct ubifs_info *c) +{ + struct bud_entry *b; + + while (!list_empty(&c->replay_buds)) { + b = list_entry(c->replay_buds.next, struct bud_entry, list); + list_del(&b->list); + kfree(b); + } +} + +/** + * add_replay_bud - add a bud to the list of buds to replay. 
+ * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @sqnum: reference node sequence number + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + unsigned long long sqnum) +{ + struct ubifs_bud *bud; + struct bud_entry *b; + + dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); + if (!bud) + return -ENOMEM; + + b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); + if (!b) { + kfree(bud); + return -ENOMEM; + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + ubifs_add_bud(c, bud); + + b->bud = bud; + b->sqnum = sqnum; + list_add_tail(&b->list, &c->replay_buds); + + return 0; +} + +/** + * validate_ref - validate a reference node. + * @c: UBIFS file-system description object + * @ref: the reference node to validate + * @ref_lnum: LEB number of the reference node + * @ref_offs: reference node offset + * + * This function returns %1 if a bud reference already exists for the LEB. %0 is + * returned if the reference node is new, otherwise %-EINVAL is returned if + * validation failed. + */ +static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref) +{ + struct ubifs_bud *bud; + int lnum = le32_to_cpu(ref->lnum); + unsigned int offs = le32_to_cpu(ref->offs); + unsigned int jhead = le32_to_cpu(ref->jhead); + + /* + * ref->offs may point to the end of LEB when the journal head points + * to the end of LEB and we write reference node for it during commit. + * So this is why we require 'offs > c->leb_size'. + */ + if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt || + lnum < c->main_first || offs > c->leb_size || + offs & (c->min_io_size - 1)) + return -EINVAL; + + /* Make sure we have not already looked at this bud */ + bud = ubifs_search_bud(c, lnum); + if (bud) { + if (bud->jhead == jhead && bud->start <= offs) + return 1; + ubifs_err("bud at LEB %d:%d was already referred", lnum, offs); + return -EINVAL; + } + + return 0; +} + +/** + * replay_log_leb - replay a log logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: log logical eraseblock to replay + * @offs: offset to start replaying from + * @sbuf: scan buffer + * + * This function replays a log LEB and returns zero in case of success, %1 if + * this is the last LEB in the log, and a negative error code in case of + * failure. + */ +static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) +{ + int err; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + const struct ubifs_cs_node *node; + + dbg_mnt("replay log LEB %d:%d", lnum, offs); + sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery); + if (IS_ERR(sleb)) { + if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) + return PTR_ERR(sleb); + /* + * Note, the below function will recover this log LEB only if + * it is the last, because unclean reboots can possibly corrupt + * only the tail of the log. 
+ */ + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + } + + if (sleb->nodes_cnt == 0) { + err = 1; + goto out; + } + + node = sleb->buf; + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + if (c->cs_sqnum == 0) { + /* + * This is the first log LEB we are looking at, make sure that + * the first node is a commit start node. Also record its + * sequence number so that UBIFS can determine where the log + * ends, because all nodes which were have higher sequence + * numbers. + */ + if (snod->type != UBIFS_CS_NODE) { + dbg_err("first log node at LEB %d:%d is not CS node", + lnum, offs); + goto out_dump; + } + if (le64_to_cpu(node->cmt_no) != c->cmt_no) { + dbg_err("first CS node at LEB %d:%d has wrong " + "commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); + goto out_dump; + } + + c->cs_sqnum = le64_to_cpu(node->ch.sqnum); + dbg_mnt("commit start sqnum %llu", c->cs_sqnum); + } + + if (snod->sqnum < c->cs_sqnum) { + /* + * This means that we reached end of log and now + * look to the older log data, which was already + * committed but the eraseblock was not erased (UBIFS + * only un-maps it). So this basically means we have to + * exit with "end of log" code. + */ + err = 1; + goto out; + } + + /* Make sure the first node sits at offset zero of the LEB */ + if (snod->offs != 0) { + dbg_err("first node is not at zero offset"); + goto out_dump; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum < c->cs_sqnum) { + dbg_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_REF_NODE: { + const struct ubifs_ref_node *ref = snod->node; + + err = validate_ref(c, ref); + if (err == 1) + break; /* Already have this bud */ + if (err) + goto out_dump; + + err = add_replay_bud(c, le32_to_cpu(ref->lnum), + le32_to_cpu(ref->offs), + le32_to_cpu(ref->jhead), + snod->sqnum); + if (err) + goto out; + + break; + } + case UBIFS_CS_NODE: + /* Make sure it sits at the beginning of LEB */ + if (snod->offs != 0) { + ubifs_err("unexpected node in log"); + goto out_dump; + } + break; + default: + ubifs_err("unexpected node in log"); + goto out_dump; + } + } + + if (sleb->endpt || c->lhead_offs >= c->leb_size) { + c->lhead_lnum = lnum; + c->lhead_offs = sleb->endpt; + } + + err = !sleb->endpt; +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("log error detected while replaying the log at LEB %d:%d", + lnum, offs + snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * take_ihead - update the status of the index head in lprops to 'taken'. + * @c: UBIFS file-system description object + * + * This function returns the amount of free space in the index head LEB or a + * negative error code. 
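Two conditions end the log walk in replay_log_leb() above: the very first node of the first log LEB must be a commit start node whose commit number matches, and any node whose sequence number falls below the recorded commit-start sequence number belongs to an older, already-committed log. A minimal sketch of the second condition, with invented sequence numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long cs_sqnum = 1000;	/* from the first CS node */
	unsigned long long ref_sqnums[] = { 1001, 1002, 640 };
	int i;

	for (i = 0; i < 3; i++) {
		if (ref_sqnums[i] < cs_sqnum) {
			/* Older, already-committed log data: stop here. */
			printf("sqnum %llu: end of log\n", ref_sqnums[i]);
			break;
		}
		printf("sqnum %llu: add bud from this reference node\n",
		       ref_sqnums[i]);
	}
	return 0;
}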
+ */ +static int take_ihead(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, free; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + free = lp->free; + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + err = free; +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_replay_journal - replay journal. + * @c: UBIFS file-system description object + * + * This function scans the journal, replays and cleans it up. It makes sure all + * memory data structures related to uncommitted journal are built (dirty TNC + * tree, tree of buds, modified lprops, etc). + */ +int ubifs_replay_journal(struct ubifs_info *c) +{ + int err, i, lnum, offs, free; + + BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); + + /* Update the status of the index head in lprops to 'taken' */ + free = take_ihead(c); + if (free < 0) + return free; /* Error code */ + + if (c->ihead_offs != c->leb_size - free) { + ubifs_err("bad index head LEB %d:%d", c->ihead_lnum, + c->ihead_offs); + return -EINVAL; + } + + dbg_mnt("start replaying the journal"); + c->replaying = 1; + lnum = c->ltail_lnum = c->lhead_lnum; + offs = c->lhead_offs; + + for (i = 0; i < c->log_lebs; i++, lnum++) { + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) { + /* + * The log is logically circular, we reached the last + * LEB, switch to the first one. + */ + lnum = UBIFS_LOG_LNUM; + offs = 0; + } + err = replay_log_leb(c, lnum, offs, c->sbuf); + if (err == 1) + /* We hit the end of the log */ + break; + if (err) + goto out; + offs = 0; + } + + err = replay_buds(c); + if (err) + goto out; + + err = apply_replay_list(c); + if (err) + goto out; + + err = set_buds_lprops(c); + if (err) + goto out; + + /* + * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable + * to roughly estimate index growth. Things like @c->bi.min_idx_lebs + * depend on it. This means we have to initialize it to make sure + * budgeting works properly. + */ + c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->bi.uncommitted_idx *= c->max_idx_node_sz; + + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " + "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, + (unsigned long)c->highest_inum); +out: + destroy_replay_list(c); + destroy_bud_list(c); + c->replaying = 0; + return err; +} diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c new file mode 100644 index 00000000..c606f010 --- /dev/null +++ b/fs/ubifs/sb.c @@ -0,0 +1,803 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS superblock. 
The superblock is stored at the first + * LEB of the volume and is never changed by UBIFS. Only user-space tools may + * change it. The superblock node mostly contains geometry information. + */ + +#include "ubifs.h" +#include +#include +#include + +/* + * Default journal size in logical eraseblocks as a percent of total + * flash size. + */ +#define DEFAULT_JNL_PERCENT 5 + +/* Default maximum journal size in bytes */ +#define DEFAULT_MAX_JNL (32*1024*1024) + +/* Default indexing tree fanout */ +#define DEFAULT_FANOUT 8 + +/* Default number of data journal heads */ +#define DEFAULT_JHEADS_CNT 1 + +/* Default positions of different LEBs in the main area */ +#define DEFAULT_IDX_LEB 0 +#define DEFAULT_DATA_LEB 1 +#define DEFAULT_GC_LEB 2 + +/* Default number of LEB numbers in LPT's save table */ +#define DEFAULT_LSAVE_CNT 256 + +/* Default reserved pool size as a percent of maximum free space */ +#define DEFAULT_RP_PERCENT 5 + +/* The default maximum size of reserved pool in bytes */ +#define DEFAULT_MAX_RP_SIZE (5*1024*1024) + +/* Default time granularity in nanoseconds */ +#define DEFAULT_TIME_GRAN 1000000000 + +/** + * create_default_filesystem - format empty UBI volume. + * @c: UBIFS file-system description object + * + * This function creates default empty file-system. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int create_default_filesystem(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + struct ubifs_mst_node *mst; + struct ubifs_idx_node *idx; + struct ubifs_branch *br; + struct ubifs_ino_node *ino; + struct ubifs_cs_node *cs; + union ubifs_key key; + int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; + int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; + int min_leb_cnt = UBIFS_MIN_LEB_CNT; + long long tmp64, main_bytes; + __le64 tmp_le64; + + /* Some functions called from here depend on the @c->key_len filed */ + c->key_len = UBIFS_SK_LEN; + + /* + * First of all, we have to calculate default file-system geometry - + * log size, journal size, etc. + */ + if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) + /* We can first multiply then divide and have no overflow */ + jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; + else + jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; + + if (jnl_lebs < UBIFS_MIN_JNL_LEBS) + jnl_lebs = UBIFS_MIN_JNL_LEBS; + if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) + jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; + + /* + * The log should be large enough to fit reference nodes for all bud + * LEBs. Because buds do not have to start from the beginning of LEBs + * (half of the LEB may contain committed data), the log should + * generally be larger, make it twice as large. + */ + tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; + log_lebs = tmp / c->leb_size; + /* Plus one LEB reserved for commit */ + log_lebs += 1; + if (c->leb_cnt - min_leb_cnt > 8) { + /* And some extra space to allow writes while committing */ + log_lebs += 1; + min_leb_cnt += 1; + } + + max_buds = jnl_lebs - log_lebs; + if (max_buds < UBIFS_MIN_BUD_LEBS) + max_buds = UBIFS_MIN_BUD_LEBS; + + /* + * Orphan nodes are stored in a separate area. One node can store a lot + * of orphan inode numbers, but when new orphan comes we just add a new + * orphan node. At some point the nodes are consolidated into one + * orphan node. 
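create_default_filesystem() starts from a handful of rules of thumb: the journal gets a fixed percentage of the volume, bounded below by a minimum LEB count and above by a byte cap, and the multiply and divide are ordered so that a very large LEB count cannot overflow a 32-bit int. The sketch below repeats that sizing calculation in userspace; the minimum of 32 journal LEBs is a stand-in, not necessarily the value of UBIFS_MIN_JNL_LEBS.

#include <stdio.h>

#define JNL_PERCENT	5		/* DEFAULT_JNL_PERCENT */
#define MIN_JNL_LEBS	32		/* stand-in for UBIFS_MIN_JNL_LEBS */
#define MAX_JNL_BYTES	(32 << 20)	/* DEFAULT_MAX_JNL */

static int default_jnl_lebs(int leb_cnt, int leb_size)
{
	int jnl_lebs;

	/* Multiply first only when it provably cannot overflow. */
	if (leb_cnt < 0x7FFFFFFF / JNL_PERCENT)
		jnl_lebs = leb_cnt * JNL_PERCENT / 100;
	else
		jnl_lebs = (leb_cnt / 100) * JNL_PERCENT;

	if (jnl_lebs < MIN_JNL_LEBS)
		jnl_lebs = MIN_JNL_LEBS;
	if ((long long)jnl_lebs * leb_size > MAX_JNL_BYTES)
		jnl_lebs = MAX_JNL_BYTES / leb_size;
	return jnl_lebs;
}

int main(void)
{
	/* 2048 LEBs of 128 KiB: 5% gives 102 LEBs, under the 32 MiB cap */
	printf("journal LEBs: %d\n", default_jnl_lebs(2048, 128 * 1024));
	return 0;
}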
+ */ + orph_lebs = UBIFS_MIN_ORPH_LEBS; +#ifdef CONFIG_UBIFS_FS_DEBUG + if (c->leb_cnt - min_leb_cnt > 1) + /* + * For debugging purposes it is better to have at least 2 + * orphan LEBs, because the orphan subsystem would need to do + * consolidations and would be stressed more. + */ + orph_lebs += 1; +#endif + + main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; + main_lebs -= orph_lebs; + + lpt_first = UBIFS_LOG_LNUM + log_lebs; + c->lsave_cnt = DEFAULT_LSAVE_CNT; + c->max_leb_cnt = c->leb_cnt; + err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, + &big_lpt); + if (err) + return err; + + dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, + lpt_first + lpt_lebs - 1); + + main_first = c->leb_cnt - main_lebs; + + /* Create default superblock */ + tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + sup = kzalloc(tmp, GFP_KERNEL); + if (!sup) + return -ENOMEM; + + tmp64 = (long long)max_buds * c->leb_size; + if (big_lpt) + sup_flags |= UBIFS_FLG_BIGLPT; + + sup->ch.node_type = UBIFS_SB_NODE; + sup->key_hash = UBIFS_KEY_HASH_R5; + sup->flags = cpu_to_le32(sup_flags); + sup->min_io_size = cpu_to_le32(c->min_io_size); + sup->leb_size = cpu_to_le32(c->leb_size); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); + sup->max_bud_bytes = cpu_to_le64(tmp64); + sup->log_lebs = cpu_to_le32(log_lebs); + sup->lpt_lebs = cpu_to_le32(lpt_lebs); + sup->orph_lebs = cpu_to_le32(orph_lebs); + sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); + sup->fanout = cpu_to_le32(DEFAULT_FANOUT); + sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); + sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); + sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + if (c->mount_opts.override_compr) + sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); + else + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + + generate_random_uuid(sup->uuid); + + main_bytes = (long long)main_lebs * c->leb_size; + tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100); + if (tmp64 > DEFAULT_MAX_RP_SIZE) + tmp64 = DEFAULT_MAX_RP_SIZE; + sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); + + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + kfree(sup); + if (err) + return err; + + dbg_gen("default superblock created at LEB 0:0"); + + /* Create default master node */ + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!mst) + return -ENOMEM; + + mst->ch.node_type = UBIFS_MST_NODE; + mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); + mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); + mst->cmt_no = 0; + mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->root_offs = 0; + tmp = ubifs_idx_node_sz(c, 1); + mst->root_len = cpu_to_le32(tmp); + mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); + mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); + mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); + mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); + mst->lpt_offs = cpu_to_le32(c->lpt_offs); + mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); + mst->nhead_offs = cpu_to_le32(c->nhead_offs); + mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); + mst->ltab_offs = cpu_to_le32(c->ltab_offs); + mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); + mst->lsave_offs = cpu_to_le32(c->lsave_offs); + mst->lscan_lnum = cpu_to_le32(main_first); + mst->empty_lebs = cpu_to_le32(main_lebs - 2); + mst->idx_lebs = cpu_to_le32(1); + mst->leb_cnt = 
cpu_to_le32(c->leb_cnt); + + /* Calculate lprops statistics */ + tmp64 = main_bytes; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + mst->total_free = cpu_to_le64(tmp64); + + tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - + UBIFS_INO_NODE_SZ; + tmp64 += ino_waste; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); + mst->total_dirty = cpu_to_le64(tmp64); + + /* The indexing LEB does not contribute to dark space */ + tmp64 = (c->main_lebs - 1) * c->dark_wm; + mst->total_dark = cpu_to_le64(tmp64); + + mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); + + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, + UBI_UNKNOWN); + if (err) { + kfree(mst); + return err; + } + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, + UBI_UNKNOWN); + kfree(mst); + if (err) + return err; + + dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); + + /* Create the root indexing node */ + tmp = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + if (!idx) + return -ENOMEM; + + c->key_fmt = UBIFS_SIMPLE_KEY_FMT; + c->key_hash = key_r5_hash; + + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(1); + ino_key_init(c, &key, UBIFS_ROOT_INO); + br = ubifs_idx_branch(c, idx, 0); + key_write_idx(c, &key, &br->key); + br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); + br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, + UBI_UNKNOWN); + kfree(idx); + if (err) + return err; + + dbg_gen("default root indexing node created LEB %d:0", + main_first + DEFAULT_IDX_LEB); + + /* Create default root inode */ + tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + ino = kzalloc(tmp, GFP_KERNEL); + if (!ino) + return -ENOMEM; + + ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); + ino->ch.node_type = UBIFS_INO_NODE; + ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); + ino->nlink = cpu_to_le32(2); + tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp_le64; + ino->ctime_sec = tmp_le64; + ino->mtime_sec = tmp_le64; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; + ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); + + /* Set compression enabled by default */ + ino->flags = cpu_to_le32(UBIFS_COMPR_FL); + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0, + UBI_UNKNOWN); + kfree(ino); + if (err) + return err; + + dbg_gen("root inode created at LEB %d:0", + main_first + DEFAULT_DATA_LEB); + + /* + * The first node in the log has to be the commit start node. This is + * always the case during normal file-system operation. Write a fake + * commit start node to the log. + */ + tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); + cs = kzalloc(tmp, GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, + 0, UBI_UNKNOWN); + kfree(cs); + + ubifs_msg("default file-system created"); + return 0; +} + +/** + * validate_sb - validate superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node + * + * This function validates superblock node @sup. Since most of data was read + * from the superblock and stored in @c, the function validates fields in @c + * instead. 
Returns zero in case of success and %-EINVAL in case of validation + * failure. + */ +static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + long long max_bytes; + int err = 1, min_leb_cnt; + + if (!c->key_hash) { + err = 2; + goto failed; + } + + if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) { + err = 3; + goto failed; + } + + if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { + ubifs_err("min. I/O unit mismatch: %d in superblock, %d real", + le32_to_cpu(sup->min_io_size), c->min_io_size); + goto failed; + } + + if (le32_to_cpu(sup->leb_size) != c->leb_size) { + ubifs_err("LEB size mismatch: %d in superblock, %d real", + le32_to_cpu(sup->leb_size), c->leb_size); + goto failed; + } + + if (c->log_lebs < UBIFS_MIN_LOG_LEBS || + c->lpt_lebs < UBIFS_MIN_LPT_LEBS || + c->orph_lebs < UBIFS_MIN_ORPH_LEBS || + c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 4; + goto failed; + } + + /* + * Calculate minimum allowed amount of main area LEBs. This is very + * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we + * have just read from the superblock. + */ + min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs; + min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; + + if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " + "%d minimum required", c->leb_cnt, c->vi.size, + min_leb_cnt); + goto failed; + } + + if (c->max_leb_cnt < c->leb_cnt) { + ubifs_err("max. LEB count %d less than LEB count %d", + c->max_leb_cnt, c->leb_cnt); + goto failed; + } + + if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 7; + goto failed; + } + + if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || + c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { + err = 8; + goto failed; + } + + if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 || + c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) { + err = 9; + goto failed; + } + + if (c->fanout < UBIFS_MIN_FANOUT || + ubifs_idx_node_sz(c, c->fanout) > c->leb_size) { + err = 10; + goto failed; + } + + if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT && + c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - + c->log_lebs - c->lpt_lebs - c->orph_lebs)) { + err = 11; + goto failed; + } + + if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs + + c->orph_lebs + c->main_lebs != c->leb_cnt) { + err = 12; + goto failed; + } + + if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + err = 13; + goto failed; + } + + max_bytes = c->main_lebs * (long long)c->leb_size; + if (c->rp_size < 0 || max_bytes < c->rp_size) { + err = 14; + goto failed; + } + + if (le32_to_cpu(sup->time_gran) > 1000000000 || + le32_to_cpu(sup->time_gran) < 1) { + err = 15; + goto failed; + } + + return 0; + +failed: + ubifs_err("bad superblock, error %d", err); + dbg_dump_node(c, sup); + return -EINVAL; +} + +/** + * ubifs_read_sb_node - read superblock node. + * @c: UBIFS file-system description object + * + * This function returns a pointer to the superblock node or a negative error + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. 
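Many of the validate_sb() checks above are simple consistency identities over values already derived into @c. One of them, pulled out into standalone form for illustration, is that the superblock, master, log, LPT, orphan and main areas together must account for every LEB of the volume. The sketch assumes the usual one superblock LEB and two master LEBs; the layout numbers are hypothetical.

#include <stdio.h>

#define SB_LEBS		1	/* assumed UBIFS_SB_LEBS */
#define MST_LEBS	2	/* assumed UBIFS_MST_LEBS */

/* Every LEB of the volume must belong to exactly one region. */
static int regions_cover_volume(int log_lebs, int lpt_lebs, int orph_lebs,
				int main_lebs, int leb_cnt)
{
	return SB_LEBS + MST_LEBS + log_lebs + lpt_lebs + orph_lebs +
	       main_lebs == leb_cnt;
}

int main(void)
{
	/* Hypothetical 2048-LEB layout */
	int log = 6, lpt = 5, orph = 2, main_lebs = 2032, total = 2048;

	printf("layout %s\n",
	       regions_cover_volume(log, lpt, orph, main_lebs, total) ?
	       "consistent" : "inconsistent");
	return 0;
}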
+ */ +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + int err; + + sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); + if (!sup) + return ERR_PTR(-ENOMEM); + + err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, + UBIFS_SB_LNUM, 0); + if (err) { + kfree(sup); + return ERR_PTR(err); + } + + return sup; +} + +/** + * ubifs_write_sb_node - write superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node read with 'ubifs_read_sb_node()' + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + + ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); +} + +/** + * ubifs_read_superblock - read superblock. + * @c: UBIFS file-system description object + * + * This function finds, reads and checks the superblock. If an empty UBI volume + * is being mounted, this function creates default superblock. Returns zero in + * case of success, and a negative error code in case of failure. + */ +int ubifs_read_superblock(struct ubifs_info *c) +{ + int err, sup_flags; + struct ubifs_sb_node *sup; + + if (c->empty) { + err = create_default_filesystem(c); + if (err) + return err; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + + /* + * The software supports all previous versions but not future versions, + * due to the unavailability of time-travelling equipment. + */ + if (c->fmt_version > UBIFS_FORMAT_VERSION) { + ubifs_assert(!c->ro_media || c->ro_mount); + if (!c->ro_mount || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. 
+ */ + c->rw_incompat = 1; + } + + if (c->fmt_version < 3) { + ubifs_err("on-flash format version %d is not supported", + c->fmt_version); + err = -EINVAL; + goto out; + } + + switch (sup->key_hash) { + case UBIFS_KEY_HASH_R5: + c->key_hash = key_r5_hash; + c->key_hash_type = UBIFS_KEY_HASH_R5; + break; + + case UBIFS_KEY_HASH_TEST: + c->key_hash = key_test_hash; + c->key_hash_type = UBIFS_KEY_HASH_TEST; + break; + }; + + c->key_fmt = sup->key_fmt; + + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + c->key_len = UBIFS_SK_LEN; + break; + default: + ubifs_err("unsupported key format"); + err = -EINVAL; + goto out; + } + + c->leb_cnt = le32_to_cpu(sup->leb_cnt); + c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); + c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); + c->log_lebs = le32_to_cpu(sup->log_lebs); + c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); + c->orph_lebs = le32_to_cpu(sup->orph_lebs); + c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; + c->fanout = le32_to_cpu(sup->fanout); + c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); + c->rp_size = le64_to_cpu(sup->rp_size); + c->rp_uid = le32_to_cpu(sup->rp_uid); + c->rp_gid = le32_to_cpu(sup->rp_gid); + sup_flags = le32_to_cpu(sup->flags); + if (!c->mount_opts.override_compr) + c->default_compr = le16_to_cpu(sup->default_compr); + + c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); + memcpy(&c->uuid, &sup->uuid, 16); + c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); + + /* Automatically increase file system size to the maximum size */ + c->old_leb_cnt = c->leb_cnt; + if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); + if (c->ro_mount) + dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + else { + dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + c->old_leb_cnt = c->leb_cnt; + } + } + + c->log_bytes = (long long)c->log_lebs * c->leb_size; + c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; + c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; + c->lpt_last = c->lpt_first + c->lpt_lebs - 1; + c->orph_first = c->lpt_last + 1; + c->orph_last = c->orph_first + c->orph_lebs - 1; + c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; + c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; + c->main_first = c->leb_cnt - c->main_lebs; + + err = validate_sb(c, sup); +out: + kfree(sup); + return err; +} + +/** + * fixup_leb - fixup/unmap an LEB containing free space. + * @c: UBIFS file-system description object + * @lnum: the LEB number to fix up + * @len: number of used bytes in LEB (starting at offset 0) + * + * This function reads the contents of the given LEB number @lnum, then fixes + * it up, so that empty min. I/O units in the end of LEB are actually erased on + * flash (rather than being just all-0xff real data). If the LEB is completely + * empty, it is simply unmapped. 
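Once the region sizes are read from the superblock, ubifs_read_superblock() lays the areas out back to back, which is nothing more than the running-sum arithmetic below. The sketch uses invented counts and assumes the log begins at LEB 3, right after the superblock LEB and the two master copies.

#include <stdio.h>

#define LOG_LNUM 3	/* assumed first log LEB (after SB and two MST LEBs) */

int main(void)
{
	int leb_cnt = 2048, log_lebs = 6, lpt_lebs = 5, orph_lebs = 2;
	int log_last, lpt_first, lpt_last, orph_first, orph_last;
	int main_lebs, main_first;

	/* Each area simply follows the previous one. */
	log_last   = LOG_LNUM + log_lebs - 1;
	lpt_first  = LOG_LNUM + log_lebs;
	lpt_last   = lpt_first + lpt_lebs - 1;
	orph_first = lpt_last + 1;
	orph_last  = orph_first + orph_lebs - 1;
	main_lebs  = leb_cnt - 1 - 2 - log_lebs - lpt_lebs - orph_lebs;
	main_first = leb_cnt - main_lebs;

	printf("log 3..%d  lpt %d..%d  orph %d..%d  main %d..%d\n",
	       log_last, lpt_first, lpt_last, orph_first, orph_last,
	       main_first, leb_cnt - 1);
	return 0;
}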
+ */ +static int fixup_leb(struct ubifs_info *c, int lnum, int len) +{ + int err; + + ubifs_assert(len >= 0); + ubifs_assert(len % c->min_io_size == 0); + ubifs_assert(len < c->leb_size); + + if (len == 0) { + dbg_mnt("unmap empty LEB %d", lnum); + return ubi_leb_unmap(c->ubi, lnum); + } + + dbg_mnt("fixup LEB %d, data len %d", lnum, len); + err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); + if (err) + return err; + + return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); +} + +/** + * fixup_free_space - find & remap all LEBs containing free space. + * @c: UBIFS file-system description object + * + * This function walks through all LEBs in the filesystem and fiexes up those + * containing free/empty space. + */ +static int fixup_free_space(struct ubifs_info *c) +{ + int lnum, err = 0; + struct ubifs_lprops *lprops; + + ubifs_get_lprops(c); + + /* Fixup LEBs in the master area */ + for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { + err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); + if (err) + goto out; + } + + /* Unmap unused log LEBs */ + lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + while (lnum != c->ltail_lnum) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } + + /* Fixup the current log head */ + err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); + if (err) + goto out; + + /* Fixup LEBs in the LPT area */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + int free = c->ltab[lnum - c->lpt_first].free; + + if (free > 0) { + err = fixup_leb(c, lnum, c->leb_size - free); + if (err) + goto out; + } + } + + /* Unmap LEBs in the orphans area */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + } + + /* Fixup LEBs in the main area */ + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (lprops->free > 0) { + err = fixup_leb(c, lnum, c->leb_size - lprops->free); + if (err) + goto out; + } + } + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fixup_free_space - find & fix all LEBs with free space. + * @c: UBIFS file-system description object + * + * This function fixes up LEBs containing free space on first mount, if the + * appropriate flag was set when the FS was created. Each LEB with one or more + * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure + * the free space is actually erased. E.g., this is necessary for some NAND + * chips, since the free space may have been programmed like real "0xff" data + * (generating a non-0xff ECC), causing future writes to the not-really-erased + * NAND pages to behave badly. After the space is fixed up, the superblock flag + * is cleared, so that this is skipped for all future mounts. 
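fixup_leb() makes only a binary decision per LEB: one that holds nothing but free space is unmapped outright, while a partially used one is read back and atomically re-written so that only the used prefix is programmed and the remainder becomes genuinely erased (the NAND/ECC rationale is in the ubifs_fixup_free_space() comment above). A trivial model of that decision, with hypothetical sizes:

#include <stdio.h>

/* Mirror of the unmap-or-rewrite choice made by fixup_leb(). */
static void fixup(int lnum, int leb_size, int free)
{
	int used = leb_size - free;

	if (used == 0)
		printf("LEB %d: unmap (completely empty)\n", lnum);
	else
		printf("LEB %d: re-write first %d bytes, erase the rest\n",
		       lnum, used);
}

int main(void)
{
	int leb_size = 131072;	/* hypothetical LEB size */

	fixup(20, leb_size, leb_size);	/* empty main-area LEB */
	fixup(21, leb_size, 90112);	/* partially used LEB */
	return 0;
}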
+ */ +int ubifs_fixup_free_space(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + ubifs_assert(c->space_fixup); + ubifs_assert(!c->ro_mount); + + ubifs_msg("start fixing up free space"); + + err = fixup_free_space(c); + if (err) + return err; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* Free-space fixup is no longer required */ + c->space_fixup = 0; + sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); + + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + return err; + + ubifs_msg("free space fixup complete"); + return err; +} diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c new file mode 100644 index 00000000..36216b46 --- /dev/null +++ b/fs/ubifs/scan.c @@ -0,0 +1,380 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the scan which is a general-purpose function for + * determining what nodes are in an eraseblock. The scan is used to replay the + * journal, to do garbage collection. for the TNC in-the-gaps method, and by + * debugging functions. + */ + +#include "ubifs.h" + +/** + * scan_padding_bytes - scan for padding bytes. + * @buf: buffer to scan + * @len: length of buffer + * + * This function returns the number of padding bytes on success and + * %SCANNED_GARBAGE on failure. + */ +static int scan_padding_bytes(void *buf, int len) +{ + int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len); + uint8_t *p = buf; + + dbg_scan("not a node"); + + while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE) + pad_len += 1; + + if (!pad_len || (pad_len & 7)) + return SCANNED_GARBAGE; + + dbg_scan("%d padding bytes", pad_len); + + return pad_len; +} + +/** + * ubifs_scan_a_node - scan for a node or padding. + * @c: UBIFS file-system description object + * @buf: buffer to scan + * @len: length of buffer + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * + * This function returns a scanning code to indicate what was scanned. 
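The first thing ubifs_scan_a_node() looks at is the little-endian magic word at the scan position, which separates empty space (all 0xff) from real nodes and from padding or garbage. A self-contained sketch of that classification follows; the node magic value is taken from the UBIFS on-flash format, while the padding byte used here is an assumption standing in for UBIFS_PADDING_BYTE.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NODE_MAGIC	0x06101831u	/* UBIFS on-flash node magic */
#define PADDING_BYTE	0xCE		/* assumed UBIFS_PADDING_BYTE */

/* Classify the bytes at the current scan position by their magic word. */
static const char *classify(const uint8_t *p, int len)
{
	uint32_t magic;

	if (len < 4)
		return "garbage (too short)";
	magic = p[0] | p[1] << 8 | p[2] << 16 | (uint32_t)p[3] << 24;
	if (magic == 0xFFFFFFFFu)
		return "empty space";
	if (magic == NODE_MAGIC)
		return "a node (check its CRC next)";
	return "padding bytes or garbage";
}

int main(void)
{
	uint8_t empty[8], node[8] = { 0x31, 0x18, 0x10, 0x06 }, pad[8];

	memset(empty, 0xff, sizeof(empty));
	memset(pad, PADDING_BYTE, sizeof(pad));
	printf("%s / %s / %s\n", classify(empty, 8), classify(node, 8),
	       classify(pad, 8));
	return 0;
}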
+ */ +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet) +{ + struct ubifs_ch *ch = buf; + uint32_t magic; + + magic = le32_to_cpu(ch->magic); + + if (magic == 0xFFFFFFFF) { + dbg_scan("hit empty space"); + return SCANNED_EMPTY_SPACE; + } + + if (magic != UBIFS_NODE_MAGIC) + return scan_padding_bytes(buf, len); + + if (len < UBIFS_CH_SZ) + return SCANNED_GARBAGE; + + dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + return SCANNED_A_CORRUPT_NODE; + + if (ch->node_type == UBIFS_PAD_NODE) { + struct ubifs_pad_node *pad = buf; + int pad_len = le32_to_cpu(pad->pad_len); + int node_len = le32_to_cpu(ch->len); + + /* Validate the padding node */ + if (pad_len < 0 || + offs + node_len + pad_len > c->leb_size) { + if (!quiet) { + ubifs_err("bad pad node at LEB %d:%d", + lnum, offs); + dbg_dump_node(c, pad); + } + return SCANNED_A_BAD_PAD_NODE; + } + + /* Make the node pads to 8-byte boundary */ + if ((node_len + pad_len) & 7) { + if (!quiet) + dbg_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); + return SCANNED_A_BAD_PAD_NODE; + } + + dbg_scan("%d bytes padded, offset now %d", + pad_len, ALIGN(offs + node_len + pad_len, 8)); + + return node_len + pad_len; + } + + return SCANNED_A_NODE; +} + +/** + * ubifs_start_scan - create LEB scanning information at start of scan. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int err; + + dbg_scan("scan LEB %d:%d", lnum, offs); + + sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS); + if (!sleb) + return ERR_PTR(-ENOMEM); + + sleb->lnum = lnum; + INIT_LIST_HEAD(&sleb->nodes); + sleb->buf = sbuf; + + err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + if (err && err != -EBADMSG) { + ubifs_err("cannot read %d bytes from LEB %d:%d," + " error %d", c->leb_size - offs, lnum, offs, err); + kfree(sleb); + return ERR_PTR(err); + } + + if (err == -EBADMSG) + sleb->ecc = 1; + + return sleb; +} + +/** + * ubifs_end_scan - update LEB scanning information at end of scan. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs) +{ + lnum = lnum; + dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); + ubifs_assert(offs % c->min_io_size == 0); + + sleb->endpt = ALIGN(offs, c->min_io_size); +} + +/** + * ubifs_add_snod - add a scanned node to LEB scanning information. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @buf: buffer containing node + * @offs: offset of node on flash + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs) +{ + struct ubifs_ch *ch = buf; + struct ubifs_ino_node *ino = buf; + struct ubifs_scan_node *snod; + + snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + if (!snod) + return -ENOMEM; + + snod->sqnum = le64_to_cpu(ch->sqnum); + snod->type = ch->node_type; + snod->offs = offs; + snod->len = le32_to_cpu(ch->len); + snod->node = buf; + + switch (ch->node_type) { + case UBIFS_INO_NODE: + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + case UBIFS_DATA_NODE: + /* + * The key is in the same place in all keyed + * nodes. + */ + key_read(c, &ino->key, &snod->key); + break; + default: + invalid_key_init(c, &snod->key); + break; + } + list_add_tail(&snod->list, &sleb->nodes); + sleb->nodes_cnt += 1; + return 0; +} + +/** + * ubifs_scanned_corruption - print information after UBIFS scanned corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number of corruption + * @offs: offset of corruption + * @buf: buffer containing corruption + */ +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf) +{ + int len; + + ubifs_err("corruption at LEB %d:%d", lnum, offs); + if (dbg_failure_mode) + return; + len = c->leb_size - offs; + if (len > 8192) + len = 8192; + dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); +} + +/** + * ubifs_scan - scan a logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be of @c->leb_size bytes in size) + * @quiet: print no messages + * + * This function scans LEB number @lnum and returns complete information about + * its contents. Returns the scaned information in case of success and, + * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case + * of failure. + * + * If @quiet is non-zero, this function does not print large and scary + * error messages and flash dumps in case of errors. 
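The scan loop described above (and implemented just below) repeatedly classifies the chunk at the current offset and advances by the 8-byte-aligned length it consumed, stopping when empty space is reached. A toy user-space skeleton of that walk, with a hypothetical 'classify()' standing in for 'ubifs_scan_a_node()':

#include <stdio.h>

#define ALIGN8(x)	(((x) + 7) & ~7)

/* Hypothetical classifier: returns bytes consumed, or 0 at empty space */
static int classify(const unsigned char *buf, int len)
{
	if (len < 8 || buf[0] == 0xFF)
		return 0;
	return ALIGN8(buf[0]);	/* pretend byte 0 encodes the node length */
}

int main(void)
{
	unsigned char leb[64] = { 24, [24] = 16, [40] = 0xFF };
	int offs = 0, len = (int)sizeof(leb);

	while (len >= 8) {
		int used = classify(leb + offs, len);

		if (!used)
			break;	/* empty space - UBIFS verifies it separately */
		printf("node at %d, %d bytes\n", offs, used);
		offs += used;
		len -= used;
	}
	printf("scan stopped at offset %d\n", offs);
	return 0;
}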
+ */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf, int quiet) +{ + void *buf = sbuf + offs; + int err, len = c->leb_size - offs; + struct ubifs_scan_leb *sleb; + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + while (len >= 8) { + struct ubifs_ch *ch = buf; + int node_len, ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) + /* Empty space is checked later */ + break; + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_NODE: + break; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + err = -EINVAL; + goto error; + } + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } + + if (offs % c->min_io_size) { + if (!quiet) + ubifs_err("empty space starts at non-aligned offset %d", + offs); + goto corrupted; + } + + ubifs_end_scan(c, sleb, lnum, offs); + + for (; len > 4; offs += 4, buf = buf + 4, len -= 4) + if (*(uint32_t *)buf != 0xffffffff) + break; + for (; len; offs++, buf++, len--) + if (*(uint8_t *)buf != 0xff) { + if (!quiet) + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + + return sleb; + +corrupted: + if (!quiet) { + ubifs_scanned_corruption(c, lnum, offs, buf); + ubifs_err("LEB %d scanning failed", lnum); + } + err = -EUCLEAN; + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + +error: + ubifs_err("LEB %d scanning failed, error %d", lnum, err); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * ubifs_scan_destroy - destroy LEB scanning information. + * @sleb: scanning information to free + */ +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *node; + struct list_head *head; + + head = &sleb->nodes; + while (!list_empty(head)) { + node = list_entry(head->next, struct ubifs_scan_node, list); + list_del(&node->list); + kfree(node); + } + kfree(sleb); +} diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c new file mode 100644 index 00000000..9e1d0566 --- /dev/null +++ b/fs/ubifs/shrinker.c @@ -0,0 +1,325 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS shrinker which evicts clean znodes from the TNC + * tree when Linux VM needs more RAM. 
+ * + * We do not implement any LRU lists to find oldest znodes to free because it + * would add additional overhead to the file system fast paths. So the shrinker + * just walks the TNC tree when searching for znodes to free. + * + * If the root of a TNC sub-tree is clean and old enough, then the children are + * also clean and old enough. So the shrinker walks the TNC in level order and + * dumps entire sub-trees. + * + * The age of znodes is just the time-stamp when they were last looked at. + * The current shrinker first tries to evict old znodes, then young ones. + * + * Since the shrinker is global, it has to protect against races with FS + * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. + */ + +#include "ubifs.h" + +/* List of all UBIFS file-system instances */ +LIST_HEAD(ubifs_infos); + +/* + * We number each shrinker run and record the number on the ubifs_info structure + * so that we can easily work out which ubifs_info structures have already been + * done by the current run. + */ +static unsigned int shrinker_run_no; + +/* Protects 'ubifs_infos' list */ +DEFINE_SPINLOCK(ubifs_infos_lock); + +/* Global clean znode counter (for all mounted UBIFS instances) */ +atomic_long_t ubifs_clean_zn_cnt; + +/** + * shrink_tnc - shrink TNC tree. + * @c: UBIFS file-system description object + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function traverses TNC tree and frees clean znodes. It does not free + * clean znodes which younger then @age. Returns number of freed znodes. + */ +static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) +{ + int total_freed = 0; + struct ubifs_znode *znode, *zprev; + int time = get_seconds(); + + ubifs_assert(mutex_is_locked(&c->umount_mutex)); + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + + if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) + return 0; + + /* + * Traverse the TNC tree in levelorder manner, so that it is possible + * to destroy large sub-trees. Indeed, if a znode is old, then all its + * children are older or of the same age. + * + * Note, we are holding 'c->tnc_mutex', so we do not have to lock the + * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is + * changed only when the 'c->tnc_mutex' is held. + */ + zprev = NULL; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + while (znode && total_freed < nr && + atomic_long_read(&c->clean_zn_cnt) > 0) { + int freed; + + /* + * If the znode is clean, but it is in the 'c->cnext' list, this + * means that this znode has just been written to flash as a + * part of commit and was marked clean. They will be removed + * from the list at end commit. We cannot change the list, + * because it is not protected by any mutex (design decision to + * make commit really independent and parallel to main I/O). So + * we just skip these znodes. + * + * Note, the 'clean_zn_cnt' counters are not updated until + * after the commit, so the UBIFS shrinker does not report + * the znodes which are in the 'c->cnext' list as freeable. + * + * Also note, if the root of a sub-tree is not in 'c->cnext', + * then the whole sub-tree is not in 'c->cnext' as well, so it + * is safe to dump whole sub-tree. + */ + + if (znode->cnext) { + /* + * Very soon these znodes will be removed from the list + * and become freeable. 
+ */ + *contention = 1; + } else if (!ubifs_zn_dirty(znode) && + abs(time - znode->time) >= age) { + if (znode->parent) + znode->parent->zbranch[znode->iip].znode = NULL; + else + c->zroot.znode = NULL; + + freed = ubifs_destroy_tnc_subtree(znode); + atomic_long_sub(freed, &ubifs_clean_zn_cnt); + atomic_long_sub(freed, &c->clean_zn_cnt); + ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); + total_freed += freed; + znode = zprev; + } + + if (unlikely(!c->zroot.znode)) + break; + + zprev = znode; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + cond_resched(); + } + + return total_freed; +} + +/** + * shrink_tnc_trees - shrink UBIFS TNC trees. + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function walks the list of mounted UBIFS file-systems and frees clean + * znodes which are older than @age, until at least @nr znodes are freed. + * Returns the number of freed znodes. + */ +static int shrink_tnc_trees(int nr, int age, int *contention) +{ + struct ubifs_info *c; + struct list_head *p; + unsigned int run_no; + int freed = 0; + + spin_lock(&ubifs_infos_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + /* Iterate over all mounted UBIFS file-systems and try to shrink them */ + p = ubifs_infos.next; + while (p != &ubifs_infos) { + c = list_entry(p, struct ubifs_info, infos_list); + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (c->shrinker_run_no == run_no) + break; + if (!mutex_trylock(&c->umount_mutex)) { + /* Some un-mount is in progress, try next FS */ + *contention = 1; + p = p->next; + continue; + } + /* + * We're holding 'c->umount_mutex', so the file-system won't go + * away. + */ + if (!mutex_trylock(&c->tnc_mutex)) { + mutex_unlock(&c->umount_mutex); + *contention = 1; + p = p->next; + continue; + } + spin_unlock(&ubifs_infos_lock); + /* + * OK, now we have TNC locked, the file-system cannot go away - + * it is safe to reap the cache. + */ + c->shrinker_run_no = run_no; + freed += shrink_tnc(c, nr, age, contention); + mutex_unlock(&c->tnc_mutex); + spin_lock(&ubifs_infos_lock); + /* Get the next list element before we move this one */ + p = p->next; + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&c->infos_list, &ubifs_infos); + mutex_unlock(&c->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&ubifs_infos_lock); + return freed; +} + +/** + * kick_a_thread - kick a background thread to start commit. + * + * This function kicks a background thread to start background commit. Returns + * %-1 if a thread was kicked or there is another reason to assume the memory + * will soon be freed or become freeable. If there are no dirty znodes, returns + * %0. + */ +static int kick_a_thread(void) +{ + int i; + struct ubifs_info *c; + + /* + * Iterate over all mounted UBIFS file-systems and find out if there is + * already an ongoing commit operation there. If no, then iterate for + * the second time and initiate background commit. + */ + spin_lock(&ubifs_infos_lock); + for (i = 0; i < 2; i++) { + list_for_each_entry(c, &ubifs_infos, infos_list) { + long dirty_zn_cnt; + + if (!mutex_trylock(&c->umount_mutex)) { + /* + * Some un-mount is in progress, it will + * certainly free memory, so just return. 
+ */ + spin_unlock(&ubifs_infos_lock); + return -1; + } + + dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); + + if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || + c->ro_mount || c->ro_error) { + mutex_unlock(&c->umount_mutex); + continue; + } + + if (c->cmt_state != COMMIT_RESTING) { + spin_unlock(&ubifs_infos_lock); + mutex_unlock(&c->umount_mutex); + return -1; + } + + if (i == 1) { + list_move_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + ubifs_request_bg_commit(c); + mutex_unlock(&c->umount_mutex); + return -1; + } + mutex_unlock(&c->umount_mutex); + } + } + spin_unlock(&ubifs_infos_lock); + + return 0; +} + +int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + int freed, contention = 0; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + if (nr == 0) + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; + + if (!clean_zn_cnt) { + /* + * No clean znodes, nothing to reap. All we can do in this case + * is to kick background threads to start commit, which will + * probably make clean znodes which, in turn, will be freeable. + * And we return -1 which means will make VM call us again + * later. + */ + dbg_tnc("no clean znodes, kick a thread"); + return kick_a_thread(); + } + + freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough old znodes, try to free young ones"); + freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough young znodes, free all"); + freed += shrink_tnc_trees(nr - freed, 0, &contention); + + if (!freed && contention) { + dbg_tnc("freed nothing, but contention"); + return -1; + } + +out: + dbg_tnc("%d znodes were freed, requested %d", freed, nr); + return freed; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c new file mode 100644 index 00000000..529be058 --- /dev/null +++ b/fs/ubifs/super.c @@ -0,0 +1,2321 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS initialization and VFS superblock operations. Some + * initialization stuff which is rather large and complex is placed at + * corresponding subsystems, but most of it is here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs.h" + +/* + * Maximum amount of memory we may 'kmalloc()' without worrying that we are + * allocating too much. 
+ */ +#define UBIFS_KMALLOC_OK (128*1024) + +/* Slab cache for UBIFS inodes */ +struct kmem_cache *ubifs_inode_slab; + +/* UBIFS TNC shrinker description */ +static struct shrinker ubifs_shrinker_info = { + .shrink = ubifs_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +/** + * validate_inode - validate inode. + * @c: UBIFS file-system description object + * @inode: the inode to validate + * + * This is a helper function for 'ubifs_iget()' which validates various fields + * of a newly built inode to make sure they contain sane values and prevent + * possible vulnerabilities. Returns zero if the inode is all right and + * a non-zero error code if not. + */ +static int validate_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + const struct ubifs_inode *ui = ubifs_inode(inode); + + if (inode->i_size > c->max_inode_sz) { + ubifs_err("inode is too large (%lld)", + (long long)inode->i_size); + return 1; + } + + if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + ubifs_err("unknown compression type %d", ui->compr_type); + return 2; + } + + if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) + return 3; + + if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) + return 4; + + if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + return 5; + + if (!ubifs_compr_present(ui->compr_type)) { + ubifs_warn("inode %lu uses '%s' compression, but it was not " + "compiled in", inode->i_ino, + ubifs_compr_name(ui->compr_type)); + } + + err = dbg_check_dir_size(c, inode); + return err; +} + +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) +{ + int err; + union ubifs_key key; + struct ubifs_ino_node *ino; + struct ubifs_info *c = sb->s_fs_info; + struct inode *inode; + struct ubifs_inode *ui; + + dbg_gen("inode %lu", inum); + + inode = iget_locked(sb, inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + ui = ubifs_inode(inode); + + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) { + err = -ENOMEM; + goto out; + } + + ino_key_init(c, &key, inode->i_ino); + + err = ubifs_tnc_lookup(c, &key, ino); + if (err) + goto out_ino; + + inode->i_flags |= (S_NOCMTIME | S_NOATIME); + inode->i_nlink = le32_to_cpu(ino->nlink); + inode->i_uid = le32_to_cpu(ino->uid); + inode->i_gid = le32_to_cpu(ino->gid); + inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); + inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); + inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); + inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); + inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode->i_mode = le32_to_cpu(ino->mode); + inode->i_size = le64_to_cpu(ino->size); + + ui->data_len = le32_to_cpu(ino->data_len); + ui->flags = le32_to_cpu(ino->flags); + ui->compr_type = le16_to_cpu(ino->compr_type); + ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); + ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + ui->xattr_size = le32_to_cpu(ino->xattr_size); + ui->xattr_names = le32_to_cpu(ino->xattr_names); + ui->synced_i_size = ui->ui_size = inode->i_size; + + ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 
1 : 0; + + err = validate_inode(c, inode); + if (err) + goto out_invalid; + + /* Disable read-ahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + if (ui->xattr) { + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + } else if (ui->data_len != 0) { + err = 10; + goto out_invalid; + } + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + if (ui->data_len != 0) { + err = 11; + goto out_invalid; + } + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { + err = 12; + goto out_invalid; + } + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + break; + case S_IFBLK: + case S_IFCHR: + { + dev_t rdev; + union ubifs_dev_desc *dev; + + ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + + dev = (union ubifs_dev_desc *)ino->data; + if (ui->data_len == sizeof(dev->new)) + rdev = new_decode_dev(le32_to_cpu(dev->new)); + else if (ui->data_len == sizeof(dev->huge)) + rdev = huge_decode_dev(le64_to_cpu(dev->huge)); + else { + err = 13; + goto out_invalid; + } + memcpy(ui->data, ino->data, ui->data_len); + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, 0); + if (ui->data_len != 0) { + err = 14; + goto out_invalid; + } + break; + default: + err = 15; + goto out_invalid; + } + + kfree(ino); + ubifs_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +out_invalid: + ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); + dbg_dump_node(c, ino); + dbg_dump_inode(c, inode); + err = -EINVAL; +out_ino: + kfree(ino); +out: + ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); + iget_failed(inode); + return ERR_PTR(err); +} + +static struct inode *ubifs_alloc_inode(struct super_block *sb) +{ + struct ubifs_inode *ui; + + ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + if (!ui) + return NULL; + + memset((void *)ui + sizeof(struct inode), 0, + sizeof(struct ubifs_inode) - sizeof(struct inode)); + mutex_init(&ui->ui_mutex); + spin_lock_init(&ui->ui_lock); + return &ui->vfs_inode; +}; + +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ubifs_inode_slab, ui); +} + +static void ubifs_destroy_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + kfree(ui->data); + call_rcu(&inode->i_rcu, ubifs_i_callback); +} + +/* + * Note, Linux write-back code calls this without 'i_mutex'. 
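'ubifs_alloc_inode()' above embeds the VFS inode inside 'struct ubifs_inode' and hands the VFS a pointer to the embedded member; 'ubifs_inode()' later recovers the containing structure via 'container_of()'. A small user-space sketch of that embedding pattern, with illustrative structure names:

#include <stddef.h>
#include <stdio.h>

/* Simplified version of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long ino; };

struct my_inode {
	int private_data;
	struct vfs_inode vfs_inode;	/* member handed out to the "VFS" */
};

static struct my_inode *my_inode(struct vfs_inode *inode)
{
	return container_of(inode, struct my_inode, vfs_inode);
}

int main(void)
{
	struct my_inode mi = { .private_data = 42, .vfs_inode = { .ino = 1 } };
	struct vfs_inode *inode = &mi.vfs_inode;

	printf("%d\n", my_inode(inode)->private_data);	/* prints 42 */
	return 0;
}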
+ */ +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err = 0; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(!ui->xattr); + if (is_bad_inode(inode)) + return 0; + + mutex_lock(&ui->ui_mutex); + /* + * Due to races between write-back forced by budgeting + * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * have already been synchronized, do not do this again. This might + * also happen if it was synchronized in an VFS operation, e.g. + * 'ubifs_link()'. + */ + if (!ui->dirty) { + mutex_unlock(&ui->ui_mutex); + return 0; + } + + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + else + err = dbg_check_inode_size(c, inode, ui->ui_size); + } + + ui->dirty = 0; + mutex_unlock(&ui->ui_mutex); + ubifs_release_dirty_inode_budget(c, ui); + return err; +} + +static void ubifs_evict_inode(struct inode *inode) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (ui->xattr) + /* + * Extended attribute inode deletions are fully handled in + * 'ubifs_removexattr()'. These inodes are special and have + * limited usage, so there is nothing to do here. + */ + goto out; + + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + ubifs_assert(!atomic_read(&inode->i_count)); + + truncate_inode_pages(&inode->i_data, 0); + + if (inode->i_nlink) + goto done; + + if (is_bad_inode(inode)) + goto out; + + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); + if (err) + /* + * Worst case we have a lost orphan inode wasting space, so a + * simple error message is OK here. 
+ */ + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + +out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } +done: + end_writeback(inode); +} + +static void ubifs_dirty_inode(struct inode *inode, int flags) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + if (!ui->dirty) { + ui->dirty = 1; + dbg_gen("inode %lu", inode->i_ino); + } +} + +static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ubifs_info *c = dentry->d_sb->s_fs_info; + unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; + + free = ubifs_get_free_space(c); + dbg_gen("free space %lld bytes (%lld blocks)", + free, free >> UBIFS_BLOCK_SHIFT); + + buf->f_type = UBIFS_SUPER_MAGIC; + buf->f_bsize = UBIFS_BLOCK_SIZE; + buf->f_blocks = c->block_cnt; + buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; + if (free > c->report_rp_size) + buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; + else + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = UBIFS_MAX_NLEN; + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + ubifs_assert(buf->f_bfree <= c->block_cnt); + return 0; +} + +static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + + if (c->mount_opts.unmount_mode == 2) + seq_printf(s, ",fast_unmount"); + else if (c->mount_opts.unmount_mode == 1) + seq_printf(s, ",norm_unmount"); + + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + + if (c->mount_opts.override_compr) { + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); + } + + return 0; +} + +static int ubifs_sync_fs(struct super_block *sb, int wait) +{ + int i, err; + struct ubifs_info *c = sb->s_fs_info; + + /* + * Zero @wait is just an advisory thing to help the file system shove + * lots of data into the queues, and there will be the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (!wait) + return 0; + + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + /* + * Strictly speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. + */ + err = ubifs_run_commit(c); + if (err) + return err; + + return ubi_sync(c->vi.ubi_num); +} + +/** + * init_constants_early - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This function initialize UBIFS constants which do not need the superblock to + * be read. It also checks that the UBI volume satisfies basic UBIFS + * requirements. Returns zero in case of success and a negative error code in + * case of failure. 
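The geometry set-up in 'init_constants_early()' just below derives the I/O shift from a power-of-two minimum I/O size and clamps anything smaller than 8 bytes to 8 so node alignment still holds. A quick sketch of that rule, assuming a GCC-style '__builtin_clz()' for the 'fls()' helper:

#include <stdio.h>

/* Like the kernel's fls(): 1-based position of the most significant bit */
static int fls_(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int sizes[] = { 1, 4, 512, 2048 };

	for (int i = 0; i < 4; i++) {
		unsigned int io = sizes[i];
		int shift = fls_(io) - 1;

		if (io < 8) {		/* same clamping as the function below */
			io = 8;
			shift = 3;
		}
		printf("min_io_size %u -> %u, shift %d\n", sizes[i], io, shift);
	}
	return 0;
}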
+ */ +static int init_constants_early(struct ubifs_info *c) +{ + if (c->vi.corrupted) { + ubifs_warn("UBI volume is corrupted - read-only mode"); + c->ro_media = 1; + } + + if (c->di.ro_mode) { + ubifs_msg("read-only UBI device"); + c->ro_media = 1; + } + + if (c->vi.vol_type == UBI_STATIC_VOLUME) { + ubifs_msg("static UBI volume - read-only mode"); + c->ro_media = 1; + } + + c->leb_cnt = c->vi.size; + c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; + c->half_leb_size = c->leb_size / 2; + c->min_io_size = c->di.min_io_size; + c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; + + if (c->leb_size < UBIFS_MIN_LEB_SZ) { + ubifs_err("too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); + return -EINVAL; + } + + if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); + return -EINVAL; + } + + if (!is_power_of_2(c->min_io_size)) { + ubifs_err("bad min. I/O size %d", c->min_io_size); + return -EINVAL; + } + + /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* + * UBIFS aligns all node to 8-byte boundary, so to make function in + * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is + * less than 8. + */ + if (c->min_io_size < 8) { + c->min_io_size = 8; + c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } + } + + c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); + c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); + + /* + * Initialize node length ranges which are mostly needed for node + * length validation. + */ + c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; + c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; + c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; + c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; + c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; + c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; + + c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; + c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; + c->ranges[UBIFS_ORPH_NODE].min_len = + UBIFS_ORPH_NODE_SZ + sizeof(__le64); + c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; + c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; + c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; + /* + * Minimum indexing node size is amended later when superblock is + * read and the key length is known. + */ + c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; + /* + * Maximum indexing node size is amended later when superblock is + * read and the fanout is known. + */ + c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; + + /* + * Initialize dead and dark LEB space watermarks. See gc.c for comments + * about these values. 
+ */ + c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); + c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + + /* Buffer size for bulk-reads */ + c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->max_bu_buf_len > c->leb_size) + c->max_bu_buf_len = c->leb_size; + return 0; +} + +/** + * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. + * @c: UBIFS file-system description object + * @lnum: LEB the write-buffer was synchronized to + * @free: how many free bytes left in this LEB + * @pad: how many bytes were padded + * + * This is a callback function which is called by the I/O unit when the + * write-buffer is synchronized. We need this to correctly maintain space + * accounting in bud logical eraseblocks. This function returns zero in case of + * success and a negative error code in case of failure. + * + * This function actually belongs to the journal, but we keep it here because + * we want to keep it static. + */ +static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) +{ + return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); +} + +/* + * init_constants_sb - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the superblock has been read. It also checks various UBIFS parameters and + * makes sure they are all right. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int init_constants_sb(struct ubifs_info *c) +{ + int tmp, err; + long long tmp64; + + c->main_bytes = (long long)c->main_lebs * c->leb_size; + c->max_znode_sz = sizeof(struct ubifs_znode) + + c->fanout * sizeof(struct ubifs_zbranch); + + tmp = ubifs_idx_node_sz(c, 1); + c->ranges[UBIFS_IDX_NODE].min_len = tmp; + c->min_idx_node_sz = ALIGN(tmp, 8); + + tmp = ubifs_idx_node_sz(c, c->fanout); + c->ranges[UBIFS_IDX_NODE].max_len = tmp; + c->max_idx_node_sz = ALIGN(tmp, 8); + + /* Make sure LEB size is large enough to fit full commit */ + tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; + tmp = ALIGN(tmp, c->min_io_size); + if (tmp > c->leb_size) { + dbg_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); + return -EINVAL; + } + + /* + * Make sure that the log is large enough to fit reference nodes for + * all buds plus one reserved LEB. + */ + tmp64 = c->max_bud_bytes + c->leb_size - 1; + c->max_bud_cnt = div_u64(tmp64, c->leb_size); + tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); + tmp /= c->leb_size; + tmp += 1; + if (c->log_lebs < tmp) { + dbg_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); + return -EINVAL; + } + + /* + * When budgeting we assume worst-case scenarios when the pages are not + * be compressed and direntries are of the maximum size. + * + * Note, data, which may be stored in inodes is budgeted separately, so + * it is not included into 'c->bi.inode_budget'. + */ + c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->bi.inode_budget = UBIFS_INO_NODE_SZ; + c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; + + /* + * When the amount of flash space used by buds becomes + * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. 
+ * The writers are unblocked when the commit is finished. To avoid + * writers to be blocked UBIFS initiates background commit in advance, + * when number of bud bytes becomes above the limit defined below. + */ + c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; + + /* + * Ensure minimum journal size. All the bytes in the journal heads are + * considered to be used, when calculating the current journal usage. + * Consequently, if the journal is too small, UBIFS will treat it as + * always full. + */ + tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; + if (c->bg_bud_bytes < tmp64) + c->bg_bud_bytes = tmp64; + if (c->max_bud_bytes < tmp64 + c->leb_size) + c->max_bud_bytes = tmp64 + c->leb_size; + + err = ubifs_calc_lpt_geom(c); + if (err) + return err; + + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; + return 0; +} + +/* + * init_constants_master - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the master node has been read. It also checks various UBIFS parameters and + * makes sure they are all right. + */ +static void init_constants_master(struct ubifs_info *c) +{ + long long tmp64; + + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); + + /* + * Calculate total amount of FS blocks. This number is not used + * internally because it does not make much sense for UBIFS, but it is + * necessary to report something for the 'statfs()' call. + * + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, minimum LEBs for the index, and assume only one journal + * head is available. + */ + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 *= (long long)c->leb_size - c->leb_overhead; + tmp64 = ubifs_reported_space(c, tmp64); + c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; +} + +/** + * take_gc_lnum - reserve GC LEB. + * @c: UBIFS file-system description object + * + * This function ensures that the LEB reserved for garbage collection is marked + * as "taken" in lprops. We also have to set free space to LEB size and dirty + * space to zero, because lprops may contain out-of-date information if the + * file-system was un-mounted before it has been committed. This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +static int take_gc_lnum(struct ubifs_info *c) +{ + int err; + + if (c->gc_lnum == -1) { + ubifs_err("no LEB for GC"); + return -EINVAL; + } + + /* And we have to tell lprops that this LEB is taken */ + err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, + LPROPS_TAKEN, 0, 0); + return err; +} + +/** + * alloc_wbufs - allocate write-buffers. + * @c: UBIFS file-system description object + * + * This helper function allocates and initializes UBIFS write-buffers. Returns + * zero in case of success and %-ENOMEM in case of failure. 
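The background-commit threshold computed above, 'bg_bud_bytes = (max_bud_bytes * 13) >> 4', is 13/16 (about 81%) of the hard bud-bytes limit, so the commit starts before writers would have to be blocked. A worked example with a hypothetical 8 MiB limit:

#include <stdio.h>

int main(void)
{
	long long max_bud_bytes = 8LL * 1024 * 1024;	/* hypothetical value */
	long long bg_bud_bytes = (max_bud_bytes * 13) >> 4;

	/* Prints 6815744, i.e. 6.5 MiB: background commit kicks in well
	 * before the 8 MiB point at which writers would be blocked */
	printf("%lld\n", bg_bud_bytes);
	return 0;
}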
+ */ +static int alloc_wbufs(struct ubifs_info *c) +{ + int i, err; + + c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), + GFP_KERNEL); + if (!c->jheads) + return -ENOMEM; + + /* Initialize journal heads */ + for (i = 0; i < c->jhead_cnt; i++) { + INIT_LIST_HEAD(&c->jheads[i].buds_list); + err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); + if (err) + return err; + + c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; + c->jheads[i].wbuf.jhead = i; + c->jheads[i].grouped = 1; + } + + c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; + /* + * Garbage Collector head likely contains long-term data and + * does not need to be synchronized by timer. Also GC head nodes are + * not grouped. + */ + c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; + c->jheads[GCHD].wbuf.no_timer = 1; + c->jheads[GCHD].grouped = 0; + + return 0; +} + +/** + * free_wbufs - free write-buffers. + * @c: UBIFS file-system description object + */ +static void free_wbufs(struct ubifs_info *c) +{ + int i; + + if (c->jheads) { + for (i = 0; i < c->jhead_cnt; i++) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + } + kfree(c->jheads); + c->jheads = NULL; + } +} + +/** + * free_orphans - free orphans. + * @c: UBIFS file-system description object + */ +static void free_orphans(struct ubifs_info *c) +{ + struct ubifs_orphan *orph; + + while (c->orph_dnext) { + orph = c->orph_dnext; + c->orph_dnext = orph->dnext; + list_del(&orph->list); + kfree(orph); + } + + while (!list_empty(&c->orph_list)) { + orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); + list_del(&orph->list); + kfree(orph); + dbg_err("orphan list not empty at unmount"); + } + + vfree(c->orph_buf); + c->orph_buf = NULL; +} + +/** + * free_buds - free per-bud objects. + * @c: UBIFS file-system description object + */ +static void free_buds(struct ubifs_info *c) +{ + struct rb_node *this = c->buds.rb_node; + struct ubifs_bud *bud; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + bud = rb_entry(this, struct ubifs_bud, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &bud->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(bud); + } + } +} + +/** + * check_volume_empty - check if the UBI volume is empty. + * @c: UBIFS file-system description object + * + * This function checks if the UBIFS volume is empty by looking if its LEBs are + * mapped or not. The result of checking is stored in the @c->empty variable. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int check_volume_empty(struct ubifs_info *c) +{ + int lnum, err; + + c->empty = 1; + for (lnum = 0; lnum < c->leb_cnt; lnum++) { + err = ubi_is_mapped(c->ubi, lnum); + if (unlikely(err < 0)) + return err; + if (err == 1) { + c->empty = 0; + break; + } + + cond_resched(); + } + + return 0; +} + +/* + * UBIFS mount options. 
+ * + * Opt_fast_unmount: do not run a journal commit before un-mounting + * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_fast_unmount, + Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_fast_unmount, "fast_unmount"}, + {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +/** + * parse_standard_option - parse a standard mount option. + * @option: the option to parse + * + * Normally, standard mount options like "sync" are passed to file-systems as + * flags. However, when a "rootflags=" kernel boot parameter is used, they may + * be present in the options string. This function tries to deal with this + * situation and parse standard options. Returns 0 if the option was not + * recognized, and the corresponding integer flag if it was. + * + * UBIFS is only interested in the "sync" option, so do not check for anything + * else. + */ +static int parse_standard_option(const char *option) +{ + ubifs_msg("parse %s", option); + if (!strcmp(option, "sync")) + return MS_SYNCHRONOUS; + return 0; +} + +/** + * ubifs_parse_options - parse mount parameters. + * @c: UBIFS file-system description object + * @options: parameters to parse + * @is_remount: non-zero if this is FS re-mount + * + * This function parses UBIFS mount options and returns zero in case success + * and a negative error code in case of failure. + */ +static int ubifs_parse_options(struct ubifs_info *c, char *options, + int is_remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + /* + * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. + * We accept them in order to be backward-compatible. But this + * should be removed at some point. 
+ */ + case Opt_fast_unmount: + c->mount_opts.unmount_mode = 2; + break; + case Opt_norm_unmount: + c->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 1; + c->no_chk_data_crc = 1; + break; + case Opt_override_compr: + { + char *name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr_type = UBIFS_COMPR_NONE; + else if (!strcmp(name, "lzo")) + c->mount_opts.compr_type = UBIFS_COMPR_LZO; + else if (!strcmp(name, "zlib")) + c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else { + ubifs_err("unknown compressor \"%s\"", name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = 1; + c->default_compr = c->mount_opts.compr_type; + break; + } + default: + { + unsigned long flag; + struct super_block *sb = c->vfs_sb; + + flag = parse_standard_option(p); + if (!flag) { + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + sb->s_flags |= flag; + break; + } + } + } + + return 0; +} + +/** + * destroy_journal - destroy journal data structures. + * @c: UBIFS file-system description object + * + * This function destroys journal data structures including those that may have + * been created by recovery functions. + */ +static void destroy_journal(struct ubifs_info *c) +{ + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + list_del(&ucleb->list); + kfree(ucleb); + } + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + list_del(&bud->list); + kfree(bud); + } + ubifs_destroy_idx_gc(c); + ubifs_destroy_size_tree(c); + ubifs_tnc_close(c); + free_buds(c); +} + +/** + * bu_init - initialize bulk-read information. + * @c: UBIFS file-system description object + */ +static void bu_init(struct ubifs_info *c) +{ + ubifs_assert(c->bulk_read == 1); + + if (c->bu.buf) + return; /* Already initialized */ + +again: + c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); + if (!c->bu.buf) { + if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { + c->max_bu_buf_len = UBIFS_KMALLOC_OK; + goto again; + } + + /* Just disable bulk-read */ + ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, " + "disabling it", c->max_bu_buf_len); + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + return; + } +} + +/** + * check_free_space - check if there is enough free space to mount. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free space to be mounted in + * read/write mode. UBIFS must always have some free space to allow deletions. + */ +static int check_free_space(struct ubifs_info *c) +{ + ubifs_assert(c->dark_wm > 0); + if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { + ubifs_err("insufficient free space to mount in R/W mode"); + dbg_dump_budg(c, &c->bi); + dbg_dump_lprops(c); + return -ENOSPC; + } + return 0; +} + +/** + * mount_ubifs - mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * This function mounts UBIFS file system. Returns zero in case of success and + * a negative error code in case of failure. 
+ * + * Note, the function does not de-allocate resources it it fails half way + * through, and the caller has to do this instead. + */ +static int mount_ubifs(struct ubifs_info *c) +{ + int err; + long long x; + size_t sz; + + c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + err = init_constants_early(c); + if (err) + return err; + + err = ubifs_debugging_init(c); + if (err) + return err; + + err = check_volume_empty(c); + if (err) + goto out_free; + + if (c->empty && (c->ro_mount || c->ro_media)) { + /* + * This UBI volume is empty, and read-only, or the file system + * is mounted read-only - we cannot format it. + */ + ubifs_err("can't format empty UBI volume: read-only %s", + c->ro_media ? "UBI volume" : "mount"); + err = -EROFS; + goto out_free; + } + + if (c->ro_media && !c->ro_mount) { + ubifs_err("cannot mount read-write - read-only media"); + err = -EROFS; + goto out_free; + } + + /* + * The requirement for the buffer is that it should fit indexing B-tree + * height amount of integers. We assume the height if the TNC tree will + * never exceed 64. + */ + err = -ENOMEM; + c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); + if (!c->bottom_up_buf) + goto out_free; + + c->sbuf = vmalloc(c->leb_size); + if (!c->sbuf) + goto out_free; + + if (!c->ro_mount) { + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) + goto out_free; + } + + if (c->bulk_read == 1) + bu_init(c); + + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + + c->mounting = 1; + + err = ubifs_read_superblock(c); + if (err) + goto out_free; + + /* + * Make sure the compressor which is set as default in the superblock + * or overridden by mount options is actually compiled in. + */ + if (!ubifs_compr_present(c->default_compr)) { + ubifs_err("'compressor \"%s\" is not compiled in", + ubifs_compr_name(c->default_compr)); + err = -ENOTSUPP; + goto out_free; + } + + err = init_constants_sb(c); + if (err) + goto out_free; + + sz = ALIGN(c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + c->cbuf = kmalloc(sz, GFP_NOFS); + if (!c->cbuf) { + err = -ENOMEM; + goto out_free; + } + + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); + if (!c->ro_mount) { + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out_wbufs; + } + wake_up_process(c->bgt); + } + + err = ubifs_read_master(c); + if (err) + goto out_master; + + init_constants_master(c); + + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { + ubifs_msg("recovery needed"); + c->need_recovery = 1; + } + + if (c->need_recovery && !c->ro_mount) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !c->ro_mount); + if (err) + goto out_master; + + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_master; + } + + if (!c->ro_mount) { + /* + * Set the "dirty" flag so that if we reboot uncleanly we + * will notice this immediately on the next mount. 
+ */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out_lpt; + } + + err = dbg_check_idx_size(c, c->bi.old_idx_sz); + if (err) + goto out_lpt; + + err = ubifs_replay_journal(c); + if (err) + goto out_journal; + + /* Calculate 'min_idx_lebs' after journal replay */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); + if (err) + goto out_orphans; + + if (!c->ro_mount) { + int lnum; + + err = check_free_space(c); + if (err) + goto out_orphans; + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out_orphans; + } + + if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + err = ubifs_rcvry_gc_commit(c); + if (err) + goto out_orphans; + } else { + err = take_gc_lnum(c); + if (err) + goto out_orphans; + + /* + * GC LEB may contain garbage if there was an unclean + * reboot, and it should be un-mapped. + */ + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out_orphans; + } + + err = dbg_check_lprops(c); + if (err) + goto out_orphans; + } else if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + } else { + /* + * Even if we mount read-only, we have to set space in GC LEB + * to proper value because this affects UBIFS free space + * reporting. We do not want to have a situation when + * re-mounting from R/O to R/W changes amount of free space. + */ + err = take_gc_lnum(c); + if (err) + goto out_orphans; + } + + spin_lock(&ubifs_infos_lock); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + if (c->need_recovery) { + if (c->ro_mount) + ubifs_msg("recovery deferred"); + else { + c->need_recovery = 0; + ubifs_msg("recovery completed"); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); + } + } else + ubifs_assert(c->lst.taken_empty_lebs > 0); + + err = dbg_check_filesystem(c); + if (err) + goto out_infos; + + err = dbg_debugfs_init_fs(c); + if (err) + goto out_infos; + + c->mounting = 0; + + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", + c->vi.ubi_num, c->vi.vol_id, c->vi.name); + if (c->ro_mount) + ubifs_msg("mounted read-only"); + x = (long long)c->main_lebs * c->leb_size; + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->main_lebs); + x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + + dbg_msg("compiled on: " __DATE__ " at " __TIME__); + dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("max. 
write size: %d bytes", c->max_write_size); + dbg_msg("LEB size: %d bytes (%d KiB)", + c->leb_size, c->leb_size >> 10); + dbg_msg("data journal heads: %d", + c->jhead_cnt - NONDATA_JHEADS_CNT); + dbg_msg("UUID: %pUB", c->uuid); + dbg_msg("big_lpt %d", c->big_lpt); + dbg_msg("log LEBs: %d (%d - %d)", + c->log_lebs, UBIFS_LOG_LNUM, c->log_last); + dbg_msg("LPT area LEBs: %d (%d - %d)", + c->lpt_lebs, c->lpt_first, c->lpt_last); + dbg_msg("orphan area LEBs: %d (%d - %d)", + c->orph_lebs, c->orph_first, c->orph_last); + dbg_msg("main area LEBs: %d (%d - %d)", + c->main_lebs, c->main_first, c->leb_cnt - 1); + dbg_msg("index LEBs: %d", c->lst.idx_lebs); + dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, + c->bi.old_idx_sz >> 20); + dbg_msg("key hash type: %d", c->key_hash_type); + dbg_msg("tree fanout: %d", c->fanout); + dbg_msg("reserved GC LEB: %d", c->gc_lnum); + dbg_msg("first main LEB: %d", c->main_first); + dbg_msg("max. znode size %d", c->max_znode_sz); + dbg_msg("max. index node size %d", c->max_idx_node_sz); + dbg_msg("node sizes: data %zu, inode %zu, dentry %zu", + UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); + dbg_msg("node sizes: trun %zu, sb %zu, master %zu", + UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); + dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", + UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); + dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); + dbg_msg("dead watermark: %d", c->dead_wm); + dbg_msg("dark watermark: %d", c->dark_wm); + dbg_msg("LEB overhead: %d", c->leb_overhead); + x = (long long)c->main_lebs * c->dark_wm; + dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + x, x >> 10, x >> 20); + dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + c->max_bud_bytes, c->max_bud_bytes >> 10, + c->max_bud_bytes >> 20); + dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + c->bg_bud_bytes, c->bg_bud_bytes >> 10, + c->bg_bud_bytes >> 20); + dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); + dbg_msg("max. seq. number: %llu", c->max_sqnum); + dbg_msg("commit number: %llu", c->cmt_no); + + return 0; + +out_infos: + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); +out_orphans: + free_orphans(c); +out_journal: + destroy_journal(c); +out_lpt: + ubifs_lpt_free(c, 0); +out_master: + kfree(c->mst_node); + kfree(c->rcvrd_mst_node); + if (c->bgt) + kthread_stop(c->bgt); +out_wbufs: + free_wbufs(c); +out_cbuf: + kfree(c->cbuf); +out_free: + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); + return err; +} + +/** + * ubifs_umount - un-mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * Note, this function is called to free allocated resourced when un-mounting, + * as well as free resources when an error occurred while we were half way + * through mounting (error path cleanup function). So it has to make sure the + * resource was actually allocated before freeing it. 
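As the comment above notes, 'ubifs_umount()' doubles as the error-path cleanup, so it must cope with resources that were never allocated; like the 'out_*' labels in 'mount_ubifs()', it leans largely on explicit checks plus the fact that 'kfree()'/'vfree()' (and user-space 'free()') accept NULL. A small sketch of that defensive-cleanup pattern:

#include <stdlib.h>

struct ctx {
	void *buf_a;
	void *buf_b;
};

static void ctx_cleanup(struct ctx *c)
{
	/* free() (like kfree()/vfree()) is a no-op on NULL, so this is safe
	 * even for buffers whose allocation never happened */
	free(c->buf_a);
	free(c->buf_b);
	c->buf_a = c->buf_b = NULL;
}

int main(void)
{
	struct ctx c = { .buf_a = malloc(16), .buf_b = NULL };

	ctx_cleanup(&c);	/* works whether or not buf_b was allocated */
	return 0;
}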
+ */ +static void ubifs_umount(struct ubifs_info *c) +{ + dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + dbg_debugfs_exit_fs(c); + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); + + if (c->bgt) + kthread_stop(c->bgt); + + destroy_journal(c); + free_wbufs(c); + free_orphans(c); + ubifs_lpt_free(c, 0); + + kfree(c->cbuf); + kfree(c->rcvrd_mst_node); + kfree(c->mst_node); + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); +} + +/** + * ubifs_remount_rw - re-mount in read-write mode. + * @c: UBIFS file-system description object + * + * UBIFS avoids allocating many unnecessary resources when mounted in read-only + * mode. This function allocates the needed resources and re-mounts UBIFS in + * read-write mode. + */ +static int ubifs_remount_rw(struct ubifs_info *c) +{ + int err, lnum; + + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + + mutex_lock(&c->umount_mutex); + dbg_save_space_info(c); + c->remounting_rw = 1; + c->ro_mount = 0; + + err = check_free_space(c); + if (err) + goto out; + + if (c->old_leb_cnt != c->leb_cnt) { + struct ubifs_sb_node *sup; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) { + err = PTR_ERR(sup); + goto out; + } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + goto out; + } + + if (c->need_recovery) { + ubifs_msg("completing deferred recovery"); + err = ubifs_write_rcvrd_mst_node(c); + if (err) + goto out; + err = ubifs_recover_size(c); + if (err) + goto out; + err = ubifs_clean_lebs(c, c->sbuf); + if (err) + goto out; + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out; + } else { + /* A readonly mount is not allowed to have orphans */ + ubifs_assert(c->tot_orphans == 0); + err = ubifs_clear_orphans(c); + if (err) + goto out; + } + + if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out; + } + + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) { + err = -ENOMEM; + goto out; + } + + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) + goto out; + + err = ubifs_lpt_init(c, 0, 1); + if (err) + goto out; + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out; + } + wake_up_process(c->bgt); + + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out; + } + + if (c->need_recovery) + err = ubifs_rcvry_gc_commit(c); + else + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out; + + dbg_gen("re-mounted read-write"); + c->remounting_rw = 0; + + if (c->need_recovery) { + c->need_recovery = 0; + ubifs_msg("deferred recovery completed"); + } else { + /* + * Do not run the 
debugging space check if the were doing + * recovery, because when we saved the information we had the + * file-system in a state where the TNC and lprops has been + * modified in memory, but all the I/O operations (including a + * commit) were deferred. So the file-system was in + * "non-committed" state. Now the file-system is in committed + * state, and of course the amount of free space will change + * because, for example, the old index size was imprecise. + */ + err = dbg_check_space_info(c); + } + + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out; + } + + mutex_unlock(&c->umount_mutex); + return err; + +out: + c->ro_mount = 1; + vfree(c->orph_buf); + c->orph_buf = NULL; + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return err; +} + +/** + * ubifs_remount_ro - re-mount in read-only mode. + * @c: UBIFS file-system description object + * + * We assume VFS has stopped writing. Possibly the background thread could be + * running a commit, however kthread_stop will wait in that case. + */ +static void ubifs_remount_ro(struct ubifs_info *c) +{ + int i, err; + + ubifs_assert(!c->need_recovery); + ubifs_assert(!c->ro_mount); + + mutex_lock(&c->umount_mutex); + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + dbg_save_space_info(c); + + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); + + vfree(c->orph_buf); + c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->ro_mount = 1; + err = dbg_check_space_info(c); + if (err) + ubifs_ro_mode(c, err); + mutex_unlock(&c->umount_mutex); +} + +static void ubifs_put_super(struct super_block *sb) +{ + int i; + struct ubifs_info *c = sb->s_fs_info; + + ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + /* + * The following asserts are only valid if there has not been a failure + * of the media. For example, there will be dirty inodes if we failed + * to write them back because of I/O errors. + */ + if (!c->ro_error) { + ubifs_assert(c->bi.idx_growth == 0); + ubifs_assert(c->bi.dd_growth == 0); + ubifs_assert(c->bi.data_growth == 0); + } + + /* + * The 'c->umount_lock' prevents races between UBIFS memory shrinker + * and file system un-mount. Namely, it prevents the shrinker from + * picking this superblock for shrinking - it will be just skipped if + * the mutex is locked. + */ + mutex_lock(&c->umount_mutex); + if (!c->ro_mount) { + /* + * First of all kill the background thread to make sure it does + * not interfere with un-mounting and freeing resources. + */ + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + /* + * On fatal errors c->ro_error is set to 1, in which case we do + * not write the master node. + */ + if (!c->ro_error) { + int err; + + /* Synchronize write-buffers */ + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + + /* + * We are being cleanly unmounted which means the + * orphans were killed - indicate this in the master + * node. 
Also save the reserved GC LEB number. + */ + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + /* + * Recovery will attempt to fix the master area + * next mount, so we just print a message and + * continue to unmount normally. + */ + ubifs_err("failed to write master node, " + "error %d", err); + } else { + for (i = 0; i < c->jhead_cnt; i++) + /* Make sure write-buffer timers are canceled */ + hrtimer_cancel(&c->jheads[i].wbuf.timer); + } + } + + ubifs_umount(c); + bdi_destroy(&c->bdi); + ubi_close_volume(c->ubi); + mutex_unlock(&c->umount_mutex); +} + +static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err; + struct ubifs_info *c = sb->s_fs_info; + + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + + err = ubifs_parse_options(c, data, 1); + if (err) { + ubifs_err("invalid or unknown remount parameter"); + return err; + } + + if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg("cannot re-mount R/W due to prior errors"); + return -EROFS; + } + if (c->ro_media) { + ubifs_msg("cannot re-mount R/W - UBI volume is R/O"); + return -EROFS; + } + err = ubifs_remount_rw(c); + if (err) + return err; + } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg("cannot re-mount R/O due to prior errors"); + return -EROFS; + } + ubifs_remount_ro(c); + } + + if (c->bulk_read == 1) + bu_init(c); + else { + dbg_gen("disable bulk-read"); + kfree(c->bu.buf); + c->bu.buf = NULL; + } + + ubifs_assert(c->lst.taken_empty_lebs > 0); + return 0; +} + +const struct super_operations ubifs_super_operations = { + .alloc_inode = ubifs_alloc_inode, + .destroy_inode = ubifs_destroy_inode, + .put_super = ubifs_put_super, + .write_inode = ubifs_write_inode, + .evict_inode = ubifs_evict_inode, + .statfs = ubifs_statfs, + .dirty_inode = ubifs_dirty_inode, + .remount_fs = ubifs_remount_fs, + .show_options = ubifs_show_options, + .sync_fs = ubifs_sync_fs, +}; + +/** + * open_ubi - parse UBI device name string and open the UBI device. + * @name: UBI volume name + * @mode: UBI volume open mode + * + * The primary method of mounting UBIFS is by specifying the UBI volume + * character device node path. However, UBIFS may also be mounted withoug any + * character device node using one of the following methods: + * + * o ubiX_Y - mount UBI device number X, volume Y; + * o ubiY - mount UBI device number 0, volume Y; + * o ubiX:NAME - mount UBI device X, volume with name NAME; + * o ubi:NAME - mount UBI device 0, volume with name NAME. + * + * Alternative '!' separator may be used instead of ':' (because some shells + * like busybox may interpret ':' as an NFS host name separator). This function + * returns UBI volume description object in case of success and a negative + * error code in case of failure. 
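A minimal user-space sketch that classifies the name forms listed above (device node paths aside); it mirrors the documented syntax only and is not the kernel parser that follows:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void classify(const char *name)
{
	char *end;
	long dev, vol;

	if (strncmp(name, "ubi", 3) != 0) {
		printf("%-12s -> device node path (or invalid)\n", name);
		return;
	}
	/* ubi:NAME or ubi!NAME: device 0, volume by name */
	if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') {
		printf("%-12s -> device 0, volume \"%s\"\n", name, name + 4);
		return;
	}
	if (!isdigit((unsigned char)name[3])) {
		printf("%-12s -> invalid\n", name);
		return;
	}
	dev = strtol(name + 3, &end, 10);
	/* ubiY: device 0, volume Y */
	if (*end == '\0') {
		printf("%-12s -> device 0, volume %ld\n", name, dev);
		return;
	}
	/* ubiX_Y: device X, volume Y */
	if (*end == '_' && isdigit((unsigned char)end[1])) {
		vol = strtol(end + 1, &end, 10);
		if (*end == '\0') {
			printf("%-12s -> device %ld, volume %ld\n", name, dev, vol);
			return;
		}
	}
	/* ubiX:NAME or ubiX!NAME: device X, volume by name */
	if ((*end == ':' || *end == '!') && end[1] != '\0') {
		printf("%-12s -> device %ld, volume \"%s\"\n", name, dev, end + 1);
		return;
	}
	printf("%-12s -> invalid\n", name);
}

int main(void)
{
	classify("ubi0_1");
	classify("ubi1");
	classify("ubi:rootfs");
	classify("ubi2!data");
	return 0;
}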
+ */ +static struct ubi_volume_desc *open_ubi(const char *name, int mode) +{ + struct ubi_volume_desc *ubi; + int dev, vol; + char *endptr; + + /* First, try to open using the device node path method */ + ubi = ubi_open_volume_path(name, mode); + if (!IS_ERR(ubi)) + return ubi; + + /* Try the "nodev" method */ + if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') + return ERR_PTR(-EINVAL); + + /* ubi:NAME method */ + if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') + return ubi_open_volume_nm(0, name + 4, mode); + + if (!isdigit(name[3])) + return ERR_PTR(-EINVAL); + + dev = simple_strtoul(name + 3, &endptr, 0); + + /* ubiY method */ + if (*endptr == '\0') + return ubi_open_volume(0, dev, mode); + + /* ubiX_Y method */ + if (*endptr == '_' && isdigit(endptr[1])) { + vol = simple_strtoul(endptr + 1, &endptr, 0); + if (*endptr != '\0') + return ERR_PTR(-EINVAL); + return ubi_open_volume(dev, vol, mode); + } + + /* ubiX:NAME method */ + if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') + return ubi_open_volume_nm(dev, ++endptr, mode); + + return ERR_PTR(-EINVAL); +} + +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) +{ + struct ubifs_info *c; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} + +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubifs_info *c = sb->s_fs_info; + struct inode *root; + int err; + + c->vfs_sb = sb; + /* Re-open the UBI device in read-write mode */ + c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); + if (IS_ERR(c->ubi)) { + err = PTR_ERR(c->ubi); + goto out; + } + + /* + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For + * UBIFS, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. + * + * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 
+ */ + c->bdi.name = "ubifs", + c->bdi.capabilities = BDI_CAP_MAP_COPY; + err = bdi_init(&c->bdi); + if (err) + goto out_close; + err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", + c->vi.ubi_num, c->vi.vol_id); + if (err) + goto out_bdi; + + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_bdi; + + sb->s_bdi = &c->bdi; + sb->s_fs_info = c; + sb->s_magic = UBIFS_SUPER_MAGIC; + sb->s_blocksize = UBIFS_BLOCK_SIZE; + sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; + sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); + if (c->max_inode_sz > MAX_LFS_FILESIZE) + sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; + sb->s_op = &ubifs_super_operations; + + mutex_lock(&c->umount_mutex); + err = mount_ubifs(c); + if (err) { + ubifs_assert(err < 0); + goto out_unlock; + } + + /* Read the root inode */ + root = ubifs_iget(sb, UBIFS_ROOT_INO); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out_umount; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto out_iput; + + mutex_unlock(&c->umount_mutex); + return 0; + +out_iput: + iput(root); +out_umount: + ubifs_umount(c); +out_unlock: + mutex_unlock(&c->umount_mutex); +out_bdi: + bdi_destroy(&c->bdi); +out_close: + ubi_close_volume(c->ubi); +out: + return err; +} + +static int sb_test(struct super_block *sb, void *data) +{ + struct ubifs_info *c1 = data; + struct ubifs_info *c = sb->s_fs_info; + + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, + const char *name, void *data) +{ + struct ubi_volume_desc *ubi; + struct ubifs_info *c; + struct super_block *sb; + int err; + + dbg_gen("name %s, flags %#x", name, flags); + + /* + * Get UBI device number and volume ID. Mount it read-only so far + * because this might be a new mount point, and UBI allows only one + * read-write user at a time. + */ + ubi = open_ubi(name, UBI_READONLY); + if (IS_ERR(ubi)) { + dbg_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); + return ERR_CAST(ubi); + } + + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + + sb = sget(fs_type, sb_test, sb_set, c); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + kfree(c); + goto out_close; + } + + if (sb->s_root) { + struct ubifs_info *c1 = sb->s_fs_info; + kfree(c); + /* A new mount point for already mounted UBIFS */ + dbg_gen("this ubi volume is already mounted"); + if (!!(flags & MS_RDONLY) != c1->ro_mount) { + err = -EBUSY; + goto out_deact; + } + } else { + sb->s_flags = flags; + err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto out_deact; + /* We do not support atime */ + sb->s_flags |= MS_ACTIVE | MS_NOATIME; + } + + /* 'fill_super()' opens ubi again so we must close it here */ + ubi_close_volume(ubi); + + return dget(sb->s_root); + +out_deact: + deactivate_locked_super(sb); +out_close: + ubi_close_volume(ubi); + return ERR_PTR(err); +} + +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + +static struct file_system_type ubifs_fs_type = { + .name = "ubifs", + .owner = THIS_MODULE, + .mount = ubifs_mount, + .kill_sb = kill_ubifs_super, +}; + +/* + * Inode slab cache constructor. 
+ */ +static void inode_slab_ctor(void *obj) +{ + struct ubifs_inode *ui = obj; + inode_init_once(&ui->vfs_inode); +} + +static int __init ubifs_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); + + /* Make sure node sizes are 8-byte aligned */ + BUILD_BUG_ON(UBIFS_CH_SZ & 7); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); + BUILD_BUG_ON(MIN_WRITE_SZ & 7); + + /* Check min. node size */ + BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); + + /* Defined node sizes */ + BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); + + /* + * We use 2 bit wide bit-fields to store compression type, which should + * be amended if more compressors are added. The bit-fields are: + * @compr_type in 'struct ubifs_inode', @default_compr in + * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. + */ + BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); + + /* + * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 
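The BUILD_BUG_ON() checks in ubifs_init() above freeze the on-media node sizes and their 8-byte alignment at compile time. A standalone sketch of the same idea using C11 _Static_assert with a made-up header layout (the real sizes are the ones checked above):

#include <stdint.h>

/* Hypothetical common header, for illustration only. */
struct demo_ch {
	uint32_t magic;
	uint32_t crc;
	uint64_t sqnum;
	uint32_t len;
	uint8_t  node_type;
	uint8_t  group_type;
	uint8_t  padding[2];
};

/* Same invariants as the BUILD_BUG_ON() calls: fixed size, 8-byte aligned. */
_Static_assert(sizeof(struct demo_ch) == 24, "common header must be 24 bytes");
_Static_assert((sizeof(struct demo_ch) & 7) == 0, "node sizes must be 8-byte multiples");

int main(void)
{
	return 0;
}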
+ */ + if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" + " at least 4096 bytes", + (unsigned int)PAGE_CACHE_SIZE); + return -EINVAL; + } + + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + return err; + } + + err = -ENOMEM; + ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", + sizeof(struct ubifs_inode), 0, + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, + &inode_slab_ctor); + if (!ubifs_inode_slab) + goto out_reg; + + register_shrinker(&ubifs_shrinker_info); + + err = ubifs_compressors_init(); + if (err) + goto out_shrinker; + + err = dbg_debugfs_init(); + if (err) + goto out_compr; + + return 0; + +out_compr: + ubifs_compressors_exit(); +out_shrinker: + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); +out_reg: + unregister_filesystem(&ubifs_fs_type); + return err; +} +/* late_initcall to let compressors initialize first */ +late_initcall(ubifs_init); + +static void __exit ubifs_exit(void) +{ + ubifs_assert(list_empty(&ubifs_infos)); + ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + + dbg_debugfs_exit(); + ubifs_compressors_exit(); + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); + unregister_filesystem(&ubifs_fs_type); +} +module_exit(ubifs_exit); + +MODULE_LICENSE("GPL"); +MODULE_VERSION(__stringify(UBIFS_VERSION)); +MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); +MODULE_DESCRIPTION("UBIFS - UBI File System"); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c new file mode 100644 index 00000000..91b4213d --- /dev/null +++ b/fs/ubifs/tnc.c @@ -0,0 +1,3349 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements TNC (Tree Node Cache) which caches indexing nodes of + * the UBIFS B-tree. + * + * At the moment the locking rules of the TNC tree are quite simple and + * straightforward. We just have a mutex and lock it when we traverse the + * tree. If a znode is not in memory, we read it from flash while still having + * the mutex locked. + */ + +#include +#include +#include "ubifs.h" + +/* + * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. + * @NAME_LESS: name corresponding to the first argument is less than second + * @NAME_MATCHES: names match + * @NAME_GREATER: name corresponding to the second argument is greater than + * first + * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media + * + * These constants were introduce to improve readability. + */ +enum { + NAME_LESS = 0, + NAME_MATCHES = 1, + NAME_GREATER = 2, + NOT_ON_MEDIA = 3, +}; + +/** + * insert_old_idx - record an index node obsoleted since the last commit start. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + * + * For recovery, there must always be a complete intact version of the index on + * flash at all times. That is called the "old index". It is the index as at the + * time of the last successful commit. Many of the index nodes in the old index + * may be dirty, but they must not be erased until the next successful commit + * (at which point that index becomes the old index). + * + * That means that the garbage collection and the in-the-gaps method of + * committing must be able to determine if an index node is in the old index. + * Most of the old index nodes can be found by looking up the TNC using the + * 'lookup_znode()' function. However, some of the old index nodes may have + * been deleted from the current index or may have been changed so much that + * they cannot be easily found. In those cases, an entry is added to an RB-tree. + * That is what this function does. The RB-tree is ordered by LEB number and + * offset because they uniquely identify the old index node. + */ +static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *old_idx, *o; + struct rb_node **p, *parent = NULL; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) + return -ENOMEM; + old_idx->lnum = lnum; + old_idx->offs = offs; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = &(*p)->rb_left; + else if (lnum > o->lnum) + p = &(*p)->rb_right; + else if (offs < o->offs) + p = &(*p)->rb_left; + else if (offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err("old idx added twice!"); + kfree(old_idx); + return 0; + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); + return 0; +} + +/** + * insert_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) +{ + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) + return insert_old_idx(c, zbr->lnum, zbr->offs); + } else + if (c->zroot.len) + return insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + return 0; +} + +/** + * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +static int ins_clr_old_idx_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int err; + + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (err) + return err; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + } + } else + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); + if (err) + return err; + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + } + return 0; +} + +/** + * destroy_old_idx - destroy the old_idx RB-tree. 
+ * @c: UBIFS file-system description object + * + * During start commit, the old_idx RB-tree is used to avoid overwriting index + * nodes that were in the index last commit but have since been deleted. This + * is necessary for recovery i.e. the old index must be kept intact until the + * new index is successfully written. The old-idx RB-tree is used for the + * in-the-gaps method of writing index nodes and is destroyed every commit. + */ +void destroy_old_idx(struct ubifs_info *c) +{ + struct rb_node *this = c->old_idx.rb_node; + struct ubifs_old_idx *old_idx; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + old_idx = rb_entry(this, struct ubifs_old_idx, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &old_idx->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(old_idx); + } + c->old_idx = RB_ROOT; +} + +/** + * copy_znode - copy a dirty znode. + * @c: UBIFS file-system description object + * @znode: znode to copy + * + * A dirty znode being committed may not be changed, so it is copied. + */ +static struct ubifs_znode *copy_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + zn = kmalloc(c->max_znode_sz, GFP_NOFS); + if (unlikely(!zn)) + return ERR_PTR(-ENOMEM); + + memcpy(zn, znode, c->max_znode_sz); + zn->cnext = NULL; + __set_bit(DIRTY_ZNODE, &zn->flags); + __clear_bit(COW_ZNODE, &zn->flags); + + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + __set_bit(OBSOLETE_ZNODE, &znode->flags); + + if (znode->level != 0) { + int i; + const int n = zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *zbr = &zn->zbranch[i]; + + if (zbr->znode) + zbr->znode->parent = zn; + } + } + + atomic_long_inc(&c->dirty_zn_cnt); + return zn; +} + +/** + * add_idx_dirt - add dirt due to a dirty znode. + * @c: UBIFS file-system description object + * @lnum: LEB number of index node + * @dirt: size of index node + * + * This function updates lprops dirty space and the new size of the index. + */ +static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) +{ + c->calc_idx_sz -= ALIGN(dirt, 8); + return ubifs_add_dirt(c, lnum, dirt); +} + +/** + * dirty_cow_znode - ensure a znode is not being committed. + * @c: UBIFS file-system description object + * @zbr: branch of znode to check + * + * Returns dirtied znode on success or negative error code on failure. + */ +static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zn; + int err; + + if (!test_bit(COW_ZNODE, &znode->flags)) { + /* znode is not being committed */ + if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { + atomic_long_inc(&c->dirty_zn_cnt); + atomic_long_dec(&c->clean_zn_cnt); + atomic_long_dec(&ubifs_clean_zn_cnt); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + if (unlikely(err)) + return ERR_PTR(err); + } + return znode; + } + + zn = copy_znode(c, znode); + if (IS_ERR(zn)) + return zn; + + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (unlikely(err)) + return ERR_PTR(err); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + } else + err = 0; + + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + if (unlikely(err)) + return ERR_PTR(err); + return zn; +} + +/** + * lnc_add - add a leaf node to the leaf node cache. 
+ * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * Leaf nodes are non-index nodes directory entry nodes or data nodes. The + * purpose of the leaf node cache is to save re-reading the same leaf node over + * and over again. Most things are cached by VFS, however the file system must + * cache directory entries for readdir and for resolving hash collisions. The + * present implementation of the leaf node cache is extremely simple, and + * allows for error returns that are not used but that may be needed if a more + * complex implementation is created. + * + * Note, this function does not add the @node object to LNC directly, but + * allocates a copy of the object and adds the copy to LNC. The reason for this + * is that @node has been allocated outside of the TNC subsystem and will be + * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC + * may be changed at any time, e.g. freed by the shrinker. + */ +static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const void *node) +{ + int err; + void *lnc_node; + const struct ubifs_dent_node *dent = node; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + ubifs_assert(is_hash_key(c, &zbr->key)); + + err = ubifs_validate_entry(c, dent); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, dent); + return err; + } + + lnc_node = kmalloc(zbr->len, GFP_NOFS); + if (!lnc_node) + /* We don't have to have the cache, so no error */ + return 0; + + memcpy(lnc_node, node, zbr->len); + zbr->leaf = lnc_node; + return 0; +} + + /** + * lnc_add_directly - add a leaf node to the leaf-node-cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * This function is similar to 'lnc_add()', but it does not create a copy of + * @node but inserts @node to TNC directly. + */ +static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + + err = ubifs_validate_entry(c, node); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, node); + return err; + } + + zbr->leaf = node; + return 0; +} + +/** + * lnc_free - remove a leaf node from the leaf node cache. + * @zbr: zbranch of leaf node + * @node: leaf node + */ +static void lnc_free(struct ubifs_zbranch *zbr) +{ + if (!zbr->leaf) + return; + kfree(zbr->leaf); + zbr->leaf = NULL; +} + +/** + * tnc_read_node_nm - read a "hashed" leaf node. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a "hashed" node defined by @zbr from the leaf node cache + * (in it is there) or from the hash media, in which case the node is also + * added to LNC. Returns zero in case of success or a negative negative error + * code in case of failure. + */ +static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(is_hash_key(c, &zbr->key)); + + if (zbr->leaf) { + /* Read from the leaf node cache */ + ubifs_assert(zbr->len != 0); + memcpy(node, zbr->leaf, zbr->len); + return 0; + } + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) + return err; + + /* Add the node to the leaf node cache */ + err = lnc_add(c, zbr, node); + return err; +} + +/** + * try_read_node - read a node if it is a node. 
+ * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: LEB number of node to read + * @offs: offset of node to read + * + * This function tries to read a node of known type and length, checks it and + * stores it in @buf. This function returns %1 if a node is present and %0 if + * a node is not present. A negative error code is returned for I/O errors. + * This function performs that same function as ubifs_read_node except that + * it does not require that there is actually a node present and instead + * the return code indicates if a node was read. + * + * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc + * is true (it is controlled by corresponding mount option). However, if + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. + */ +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs) +{ + int err, node_len; + struct ubifs_ch *ch = buf; + uint32_t crc, node_crc; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err) { + ubifs_err("cannot read node type %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + return 0; + + if (ch->node_type != type) + return 0; + + node_len = le32_to_cpu(ch->len); + if (node_len != len) + return 0; + + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) + return 1; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) + return 0; + + return 1; +} + +/** + * fallible_read_node - try to read a leaf node. + * @c: UBIFS file-system description object + * @key: key of node to read + * @zbr: position of node + * @node: node returned + * + * This function tries to read a node and returns %1 if the node is read, %0 + * if the node is not present, and a negative error code in the case of error. + */ +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node) +{ + int ret; + + dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + + ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, + zbr->offs); + if (ret == 1) { + union ubifs_key node_key; + struct ubifs_dent_node *dent = node; + + /* All nodes have key in the same place */ + key_read(c, &dent->key, &node_key); + if (keys_cmp(c, key, &node_key) != 0) + ret = 0; + } + if (ret == 0 && c->replaying) + dbg_mnt("dangling branch LEB %d:%d len %d, key %s", + zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + return ret; +} + +/** + * matches_name - determine if a direntry or xattr entry matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by + * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case + * of failure, a negative error code is returned. 
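matches_name() below orders a stored directory-entry name against the name being looked up: the common prefix is compared first and, when it is equal, the shorter name sorts first. A standalone sketch of that ordering rule (the constant values follow the enum near the top of this file):

#include <stdio.h>
#include <string.h>

enum { NAME_LESS, NAME_MATCHES, NAME_GREATER };

/* Compare the common prefix, then break ties by length ("abc" < "abcd"). */
static int order(const char *stored, size_t stored_len,
		 const char *wanted, size_t wanted_len)
{
	size_t min = stored_len < wanted_len ? stored_len : wanted_len;
	int cmp = memcmp(stored, wanted, min);

	if (cmp < 0)
		return NAME_LESS;
	if (cmp > 0)
		return NAME_GREATER;
	if (stored_len == wanted_len)
		return NAME_MATCHES;
	return stored_len < wanted_len ? NAME_LESS : NAME_GREATER;
}

int main(void)
{
	printf("%d\n", order("abc", 3, "abd", 3));  /* 0: NAME_LESS */
	printf("%d\n", order("abc", 3, "abc", 3));  /* 1: NAME_MATCHES */
	printf("%d\n", order("abcd", 4, "abc", 3)); /* 2: NAME_GREATER */
	return 0;
}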
+ */ +static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, dent); + if (err) + goto out_free; + + /* Add the node to the leaf node cache */ + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * get_znode - get a TNC znode that may not be loaded yet. + * @c: UBIFS file-system description object + * @znode: parent znode + * @n: znode branch slot number + * + * This function returns the znode or a negative error code. + */ +static struct ubifs_znode *get_znode(struct ubifs_info *c, + struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + + zbr = &znode->zbranch[n]; + if (zbr->znode) + znode = zbr->znode; + else + znode = ubifs_load_znode(c, zbr, znode, n); + return znode; +} + +/** + * tnc_next - find next TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the next TNC entry is found, %-ENOENT if there is + * no next entry, or a negative error code otherwise. + */ +static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + nn += 1; + if (nn < znode->child_cnt) { + *n = nn; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip + 1; + znode = zp; + if (nn < znode->child_cnt) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = 0; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * tnc_prev - find previous TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the previous TNC entry is found, %-ENOENT if + * there is no next entry, or a negative error code otherwise. + */ +static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + if (nn > 0) { + *n = nn - 1; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip - 1; + znode = zp; + if (nn >= 0) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + nn = znode->child_cnt - 1; + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = znode->child_cnt - 1; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * resolve_collision - resolve a collision. 
+ * @c: UBIFS file-system description object + * @key: key of a directory or extended attribute entry + * @zn: znode is returned here + * @n: zbranch number is passed and returned here + * @nm: name of the entry + * + * This function is called for "hashed" keys to make sure that the found key + * really corresponds to the looked up node (directory or extended attribute + * entry). It returns %1 and sets @zn and @n if the collision is resolved. + * %0 is returned if @nm is not found and @zn and @n are set to the previous + * entry, i.e. to the entry after which @nm could follow if it were in TNC. + * This means that @n may be set to %-1 if the leftmost key in @zn is the + * previous one. A negative error code is returned on failures. + */ +static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm) +{ + int err; + + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (unlikely(err < 0)) + return err; + if (err == NAME_MATCHES) + return 1; + + if (err == NAME_GREATER) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + return 0; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* + * We have found the branch after which we would + * like to insert, but inserting in this znode + * may still be wrong. Consider the following 3 + * znodes, in the case where we are resolving a + * collision with Key2. + * + * znode zp + * ---------------------- + * level 1 | Key0 | Key1 | + * ----------------------- + * | | + * znode za | | znode zb + * ------------ ------------ + * level 0 | Key0 | | Key2 | + * ------------ ------------ + * + * The lookup finds Key2 in znode zb. Lets say + * there is no match and the name is greater so + * we look left. When we find Key0, we end up + * here. If we return now, we will insert into + * znode za at slot n = 1. But that is invalid + * according to the parent's keys. Key2 must + * be inserted into znode zb. + * + * Note, this problem is not relevant for the + * case when we go right, because + * 'tnc_insert()' would correct the parent key. + */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + return 0; + } + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_LESS) + return 0; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_GREATER); + } + } else { + int nn = *n; + struct ubifs_znode *znode = *zn; + + /* Look right */ + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + err = matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + return 0; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_LESS); + } + } +} + +/** + * fallible_matches_name - determine if a dent matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This is a "fallible" version of 'matches_name()' function which does not + * panic if the direntry/xentry referred by @zbr does not exist on the media. + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. 
Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr + * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA + * if xentry/direntry referred by @zbr does not exist on the media. A negative + * error code is returned in case of failure. + */ +static int fallible_matches_name(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = fallible_read_node(c, &zbr->key, zbr, dent); + if (err < 0) + goto out_free; + if (err == 0) { + /* The node was not present */ + err = NOT_ON_MEDIA; + goto out_free; + } + ubifs_assert(err == 1); + + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * fallible_resolve_collision - resolve a collision even if nodes are missing. + * @c: UBIFS file-system description object + * @key: key + * @zn: znode is returned here + * @n: branch number is passed and returned here + * @nm: name of directory entry + * @adding: indicates caller is adding a key to the TNC + * + * This is a "fallible" version of the 'resolve_collision()' function which + * does not panic if one of the nodes referred to by TNC does not exist on the + * media. This may happen when replaying the journal if a deleted node was + * Garbage-collected and the commit was not done. A branch that refers to a node + * that is not present is called a dangling branch. The following are the return + * codes for this function: + * o if @nm was found, %1 is returned and @zn and @n are set to the found + * branch; + * o if we are @adding and @nm was not found, %0 is returned; + * o if we are not @adding and @nm was not found, but a dangling branch was + * found, then %1 is returned and @zn and @n are set to the dangling branch; + * o a negative error code is returned in case of failure. + */ +static int fallible_resolve_collision(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm, int adding) +{ + struct ubifs_znode *o_znode = NULL, *znode = *zn; + int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + + cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (unlikely(cmp < 0)) + return cmp; + if (cmp == NAME_MATCHES) + return 1; + if (cmp == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + /* + * We are unlucky and hit a dangling branch straight away. + * Now we do not really know where to go to find the needed + * branch - to the left or to the right. Well, let's try left. 
+ */ + unsure = 1; + } else if (!adding) + unsure = 1; /* Remove a dangling branch wherever it is */ + + if (cmp == NAME_GREATER || unsure) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + break; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* See comments in 'resolve_collision()' */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + break; + } + err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = *zn; + o_n = *n; + continue; + } + if (!adding) + continue; + if (err == NAME_LESS) + break; + else + unsure = 0; + } + } + + if (cmp == NAME_LESS || unsure) { + /* Look right */ + *zn = znode; + *n = nn; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + err = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + break; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + } + } + } + + /* Never match a dangling branch when adding */ + if (adding || !o_znode) + return 0; + + dbg_mnt("dangling match LEB %d:%d len %d %s", + o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, + o_znode->zbranch[o_n].len, DBGKEY(key)); + *zn = o_znode; + *n = o_n; + return 1; +} + +/** + * matches_position - determine if a zbranch matches a given position. + * @zbr: zbranch of dent + * @lnum: LEB number of dent to match + * @offs: offset of dent to match + * + * This function returns %1 if @lnum:@offs matches, and %0 otherwise. + */ +static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) +{ + if (zbr->lnum == lnum && zbr->offs == offs) + return 1; + else + return 0; +} + +/** + * resolve_collision_directly - resolve a collision directly. + * @c: UBIFS file-system description object + * @key: key of directory entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @lnum: LEB number of dent node to match + * @offs: offset of dent node to match + * + * This function is used for "hashed" keys to make sure the found directory or + * extended attribute entry node is what was looked for. It is used when the + * flash address of the right node is known (@lnum:@offs) which makes it much + * easier to resolve collisions (no need to read entries and match full + * names). This function returns %1 and sets @zn and @n if the collision is + * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the + * previous directory entry. Otherwise a negative error code is returned. 
+ */ +static int resolve_collision_directly(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int nn, err; + + znode = *zn; + nn = *n; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + if (matches_position(&znode->zbranch[nn], lnum, offs)) { + *zn = znode; + *n = nn; + return 1; + } + } + + /* Look right */ + znode = *zn; + nn = *n; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + *zn = znode; + *n = nn; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + } +} + +/** + * dirty_cow_bottom_up - dirty a znode and its ancestors. + * @c: UBIFS file-system description object + * @znode: znode to dirty + * + * If we do not have a unique key that resides in a znode, then we cannot + * dirty that znode from the top down (i.e. by using lookup_level0_dirty) + * This function records the path back to the last dirty ancestor, and then + * dirties the znodes on that path. + */ +static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zp; + int *path = c->bottom_up_buf, p = 0; + + ubifs_assert(c->zroot.znode); + ubifs_assert(znode); + if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { + kfree(c->bottom_up_buf); + c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), + GFP_NOFS); + if (!c->bottom_up_buf) + return ERR_PTR(-ENOMEM); + path = c->bottom_up_buf; + } + if (c->zroot.znode->level) { + /* Go up until parent is dirty */ + while (1) { + int n; + + zp = znode->parent; + if (!zp) + break; + n = znode->iip; + ubifs_assert(p < c->zroot.znode->level); + path[p++] = n; + if (!zp->cnext && ubifs_zn_dirty(znode)) + break; + znode = zp; + } + } + + /* Come back down, dirtying as we go */ + while (1) { + struct ubifs_zbranch *zbr; + + zp = znode->parent; + if (zp) { + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < zp->child_cnt); + zbr = &zp->zbranch[path[--p]]; + znode = dirty_cow_znode(c, zbr); + } else { + ubifs_assert(znode == c->zroot.znode); + znode = dirty_cow_znode(c, &c->zroot); + } + if (IS_ERR(znode) || !p) + break; + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < znode->child_cnt); + znode = znode->zbranch[path[p - 1]].znode; + } + + return znode; +} + +/** + * ubifs_lookup_level0 - search for zero-level znode. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain + * @key, then %0 is returned and slot number of the closest branch is stored + * in @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %0 is stored in @n. 
+ * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search key %s", DBGKEY(key)); + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = zbr->znode; + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * Here is a tricky place. We have not found the key and this is a + * "hashed" key, which may collide. The rest of the code deals with + * situations like this: + * + * | 3 | 5 | + * / \ + * | 3 | 5 | | 6 | 7 | (x) + * + * Or more a complex example: + * + * | 1 | 5 | + * / \ + * | 1 | 3 | | 5 | 8 | + * \ / + * | 5 | 5 | | 6 | 7 | (x) + * + * In the examples, if we are looking for key "5", we may reach nodes + * marked with "(x)". In this case what we have do is to look at the + * left and see if there is "5" key there. If there is, we have to + * return it. + * + * Note, this whole situation is possible because we allow to have + * elements which are equivalent to the next key in the parent in the + * children of current znode. For example, this happens if we split a + * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something + * like this: + * | 3 | 5 | + * / \ + * | 3 | 5 | | 5 | 6 | 7 | + * ^ + * And this becomes what is at the first "picture" after key "5" marked + * with "^" is removed. What could be done is we could prohibit + * splitting in the middle of the colliding sequence. Also, when + * removing the leftmost key, we would have to correct the key of the + * parent node, which would introduce additional complications. Namely, + * if we changed the leftmost key of the parent znode, the garbage + * collector would be unable to find it (GC is doing this when GC'ing + * indexing LEBs). Although we already have an additional RB-tree where + * we save such changed znodes (see 'ins_clr_old_idx_znode()') until + * after the commit. But anyway, this does not look easy to implement + * so we did not try this. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * lookup_level0_dirty - search for zero-level znode dirtying. 
+ * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain @key + * then %0 is returned and slot number of the closed branch is stored in + * @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Additionally all znodes in the path from the root to the located zero-level + * znode are marked as dirty. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search and dirty key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode = dirty_cow_znode(c, &c->zroot); + if (IS_ERR(znode)) + return PTR_ERR(znode); + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * See huge comment at 'lookup_level0_dirty()' what is the rest of the + * code. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * maybe_leb_gced - determine if a LEB may have been garbage collected. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number + * + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. 
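maybe_leb_gced() below is the read side of a lockless handshake: the garbage collector publishes the LEB it moved and only then bumps a sequence counter, so a reader that sees an unchanged counter knows the LEB it read from was not collected in between. A user-space model of both sides using C11 atomics (illustrative only; the kernel code relies on smp_rmb()/smp_wmb() pairs instead):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int gced_lnum;	/* last LEB that was garbage collected */
static _Atomic int gc_seq;	/* bumped after every GC pass */

/* Writer (GC): publish the LEB number before bumping the counter. */
static void gc_publish(int lnum)
{
	atomic_store_explicit(&gced_lnum, lnum, memory_order_release);
	atomic_fetch_add_explicit(&gc_seq, 1, memory_order_release);
}

/* Reader: same structure as maybe_leb_gced() below. */
static int maybe_gced(int lnum, int seq_seen)
{
	int lnum_now = atomic_load_explicit(&gced_lnum, memory_order_acquire);
	int seq_now = atomic_load_explicit(&gc_seq, memory_order_acquire);

	if (seq_seen == seq_now)
		return 0;	/* no GC pass since the sequence was sampled */
	if (seq_seen + 1 != seq_now)
		return 1;	/* more than one pass: assume the worst */
	if (lnum_now != atomic_load_explicit(&gced_lnum, memory_order_acquire))
		return 1;	/* raced with yet another pass */
	return lnum_now == lnum;	/* exactly one pass: compare the LEB */
}

int main(void)
{
	int seq = atomic_load(&gc_seq);

	gc_publish(7);
	printf("LEB 7: %d\n", maybe_gced(7, seq));	/* 1: may have been GC'ed */
	printf("LEB 9: %d\n", maybe_gced(9, seq));	/* 0: untouched */
	return 0;
}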
+ */ +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) +{ + int gc_seq2, gced_lnum; + + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here + * + * This function looks up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. + */ +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) +{ + int found, n, err, safely = 0, gc_seq1; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + +again: + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked + */ + err = tnc_read_node_nm(c, zt, node); + goto out; + } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ + zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } + + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. This function returns zero in case of success + * and a negative error code in case of failure. + * + * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function + * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares + * maximum possible amount of nodes for bulk-read. 
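For context, a hedged sketch of how a caller strings the two TNC bulk-read helpers together; 'data_key_init()' and 'c->max_bu_buf_len' are assumed from elsewhere in UBIFS, and the real read-path caller is not shown here:

/* Sketch only: collect consecutive data-node keys, then read them in one go. */
static int bulk_read_sketch(struct ubifs_info *c, struct bu_info *bu,
			    ino_t inum, unsigned int first_block)
{
	int err;

	data_key_init(c, &bu->key, inum, first_block);	/* first data key */
	bu->buf_len = c->max_bu_buf_len;		/* total byte budget */

	err = ubifs_tnc_get_bu_keys(c, bu);		/* collect zbranches */
	if (err || !bu->cnt)
		return err;				/* error or nothing contiguous */

	bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
	if (!bu->buf)
		return 0;				/* fall back to single-node reads */

	err = ubifs_tnc_bulk_read(c, bu);		/* one LEB read plus validation */
	if (err == -EAGAIN)
		err = 0;				/* raced with GC: caller falls back */

	kfree(bu->buf);
	bu->buf = NULL;
	return err;
}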
+ */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. + */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. 
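+ *
+ * If the requested range overlaps the write-buffer, the part of the range
+ * which precedes @wbuf->offs (if any) is read from the flash media and the
+ * rest is copied straight out of @wbuf->buf, because that data has not
+ * reached the media yet.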
+ */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubi_read(c->ubi, lnum, buf, offs, len); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubi_read(c->ubi, lnum, buf, offs, rlen); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(&zbr->key), DBGKEY1(&key1)); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; +out: + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This functions reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. 
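+ *
+ * The nodes described by @bu->zbranch[] are expected to sit consecutively in
+ * one LEB, which is what 'ubifs_tnc_get_bu_keys()' arranges, so the whole
+ * range is read in a single read operation and then validated node by node.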
+ */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dbg_dump_stack(); + dbg_tnc("key %s", DBGKEY(&bu->key)); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** + * do_lookup_nm- look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int found, n, err; + struct ubifs_znode *znode; + + dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out_unlock; + } else if (found < 0) { + err = found; + goto out_unlock; + } + + ubifs_assert(n >= 0); + + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + if (err == 0) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_node_nm(c, &znode->zbranch[n], node); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_nm - look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int err, len; + const struct ubifs_dent_node *dent = node; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. 
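+ * Since 'ubifs_tnc_lookup()' has already matched the key, comparing the name
+ * length and the name itself is enough to detect whether we got the entry we
+ * were asked for.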
+ */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + len = le16_to_cpu(dent->nlen); + if (nm->len == len && !memcmp(dent->name, nm->name, len)) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_nm(c, key, node, nm); +} + +/** + * correct_parent_keys - correct parent znodes' keys. + * @c: UBIFS file-system description object + * @znode: znode to correct parent znodes for + * + * This is a helper function for 'tnc_insert()'. When the key of the leftmost + * zbranch changes, keys of parent znodes have to be corrected. This helper + * function is called in such situations and corrects the keys if needed. + */ +static void correct_parent_keys(const struct ubifs_info *c, + struct ubifs_znode *znode) +{ + union ubifs_key *key, *key1; + + ubifs_assert(znode->parent); + ubifs_assert(znode->iip == 0); + + key = &znode->zbranch[0].key; + key1 = &znode->parent->zbranch[0].key; + + while (keys_cmp(c, key, key1) < 0) { + key_copy(c, key, key1); + znode = znode->parent; + znode->alt = 1; + if (!znode->parent || znode->iip) + break; + key1 = &znode->parent->zbranch[0].key; + } +} + +/** + * insert_zbranch - insert a zbranch into a znode. + * @znode: znode into which to insert + * @zbr: zbranch to insert + * @n: slot number to insert to + * + * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in + * znode's array of zbranches and keeps zbranches consolidated, so when a new + * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th + * slot, zbranches starting from @n have to be moved right. + */ +static void insert_zbranch(struct ubifs_znode *znode, + const struct ubifs_zbranch *zbr, int n) +{ + int i; + + ubifs_assert(ubifs_zn_dirty(znode)); + + if (znode->level) { + for (i = znode->child_cnt; i > n; i--) { + znode->zbranch[i] = znode->zbranch[i - 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + if (zbr->znode) + zbr->znode->iip = n; + } else + for (i = znode->child_cnt; i > n; i--) + znode->zbranch[i] = znode->zbranch[i - 1]; + + znode->zbranch[n] = *zbr; + znode->child_cnt += 1; + + /* + * After inserting at slot zero, the lower bound of the key range of + * this znode may have changed. If this znode is subsequently split + * then the upper bound of the key range may change, and furthermore + * it could change to be lower than the original lower bound. If that + * happens, then it will no longer be possible to find this znode in the + * TNC using the key from the index node on flash. That is bad because + * if it is not found, we will assume it is obsolete and may overwrite + * it. Then if there is an unclean unmount, we will start using the + * old index which will be broken. + * + * So we first mark znodes that have insertions at slot zero, and then + * if they are split we add their lnum/offs to the old_idx tree. + */ + if (n == 0) + znode->alt = 1; +} + +/** + * tnc_insert - insert a node into TNC. + * @c: UBIFS file-system description object + * @znode: znode to insert into + * @zbr: branch to insert + * @n: slot number to insert new zbranch to + * + * This function inserts a new node described by @zbr into znode @znode. If + * znode does not have a free slot for new zbranch, it is split. Parent znodes + * are splat as well if needed. Returns zero in case of success or a negative + * error code in case of failure. 
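+ *
+ * For example, splitting a full znode normally keeps about half of the
+ * zbranches in the original znode and moves the rest into a newly allocated
+ * znode, which is then inserted into the parent. The parent may have to be
+ * split in turn, and if the split propagates all the way up, a new root with
+ * two children is created and the tree grows by one level.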
+ */ +static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, + struct ubifs_zbranch *zbr, int n) +{ + struct ubifs_znode *zn, *zi, *zp; + int i, keep, move, appending = 0; + union ubifs_key *key = &zbr->key, *key1; + + ubifs_assert(n >= 0 && n <= c->fanout); + + /* Implement naive insert for now */ +again: + zp = znode->parent; + if (znode->child_cnt < c->fanout) { + ubifs_assert(n != c->fanout); + dbg_tnc("inserted at %d level %d, key %s", n, znode->level, + DBGKEY(key)); + + insert_zbranch(znode, zbr, n); + + /* Ensure parent's key is correct */ + if (n == 0 && zp && znode->iip == 0) + correct_parent_keys(c, znode); + + return 0; + } + + /* + * Unfortunately, @znode does not have more empty slots and we have to + * split it. + */ + dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + + if (znode->alt) + /* + * We can no longer be sure of finding this znode by key, so we + * record it in the old_idx tree. + */ + ins_clr_old_idx_znode(c, znode); + + zn = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zn) + return -ENOMEM; + zn->parent = zp; + zn->level = znode->level; + + /* Decide where to split */ + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } + } + + if (appending) { + keep = c->fanout; + move = 0; + } else { + keep = (c->fanout + 1) / 2; + move = c->fanout - keep; + } + + /* + * Although we don't at present, we could look at the neighbors and see + * if we can move some zbranches there. 
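+ *
+ * Note, in the appending case the split point chosen above is the very end
+ * of the znode (keep == c->fanout, move == 0): the new data key starts a
+ * znode of its own, and the znodes holding the already-written consecutive
+ * data keys stay completely full.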
+ */ + + if (n < keep) { + /* Insert into existing znode */ + zi = znode; + move += 1; + keep -= 1; + } else { + /* Insert into new znode */ + zi = zn; + n -= keep; + /* Re-parent */ + if (zn->level != 0) + zbr->znode->parent = zn; + } + +do_split: + + __set_bit(DIRTY_ZNODE, &zn->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zn->child_cnt = move; + znode->child_cnt = keep; + + dbg_tnc("moving %d, keeping %d", move, keep); + + /* Move zbranch */ + for (i = 0; i < move; i++) { + zn->zbranch[i] = znode->zbranch[keep + i]; + /* Re-parent */ + if (zn->level != 0) + if (zn->zbranch[i].znode) { + zn->zbranch[i].znode->parent = zn; + zn->zbranch[i].znode->iip = i; + } + } + + /* Insert new key and branch */ + dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + + insert_zbranch(zi, zbr, n); + + /* Insert new znode (produced by spitting) into the parent */ + if (zp) { + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + + /* Locate insertion point */ + n = znode->iip + 1; + + /* Tail recursion */ + zbr->key = zn->zbranch[0].key; + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + znode = zp; + + goto again; + } + + /* We have to split root znode */ + dbg_tnc("creating new zroot at level %d", znode->level + 1); + + zi = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zi) + return -ENOMEM; + + zi->child_cnt = 2; + zi->level = znode->level + 1; + + __set_bit(DIRTY_ZNODE, &zi->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zi->zbranch[0].key = znode->zbranch[0].key; + zi->zbranch[0].znode = znode; + zi->zbranch[0].lnum = c->zroot.lnum; + zi->zbranch[0].offs = c->zroot.offs; + zi->zbranch[0].len = c->zroot.len; + zi->zbranch[1].key = zn->zbranch[0].key; + zi->zbranch[1].znode = zn; + + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + c->zroot.znode = zi; + + zn->parent = zi; + zn->iip = 1; + znode->parent = zi; + znode->iip = 0; + + return 0; +} + +/** + * ubifs_tnc_add - add a node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function adds a node with key @key to TNC. The node may be new or it may + * obsolete some existing one. Returns %0 on success or negative error code on + * failure. + */ +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + } else if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else + err = found; + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + + return err; +} + +/** + * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. + * @c: UBIFS file-system description object + * @key: key to add + * @old_lnum: LEB number of old node + * @old_offs: old node offset + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function replaces a node with key @key in the TNC only if the old node + * is found. 
This function is called by garbage collection when node are moved. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, + old_offs, lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + found = 0; + if (zbr->lnum == old_lnum && zbr->offs == old_offs) { + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + found = 1; + } else if (is_hash_key(c, key)) { + found = resolve_collision_directly(c, key, &znode, &n, + old_lnum, old_offs); + dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", + found, znode, n, old_lnum, old_offs); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + zbr = &znode->zbranch[n]; + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, + zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } + } + } + + if (!found) + err = ubifs_add_dirt(c, lnum, len); + + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_add_nm - add a "hashed" node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * @nm: node name + * + * This is the same as 'ubifs_tnc_add()' but it should be used with keys which + * may have collisions, like directory entry keys. + */ +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, + DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + if (c->replaying) + found = fallible_resolve_collision(c, key, &znode, &n, + nm, 1); + else + found = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); + if (found < 0) { + err = found; + goto out_unlock; + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + goto out_unlock; + } + } + + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + if (err) + goto out_unlock; + if (c->replaying) { + /* + * We did not find it in the index so there may be a + * dangling branch still in the index. 
So we remove it + * by passing 'ubifs_tnc_remove_nm()' the same key but + * an unmatchable name. + */ + struct qstr noname = { .len = 0, .name = "" }; + + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + return ubifs_tnc_remove_nm(c, key, &noname); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * tnc_delete - delete a znode form TNC. + * @c: UBIFS file-system description object + * @znode: znode to delete from + * @n: zbranch slot number to delete + * + * This function deletes a leaf node from @n-th slot of @znode. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *zp; + int i, err; + + /* Delete without merge for now */ + ubifs_assert(znode->level == 0); + ubifs_assert(n >= 0 && n < c->fanout); + dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + + zbr = &znode->zbranch[n]; + lnc_free(zbr); + + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) { + dbg_dump_znode(c, znode); + return err; + } + + /* We do not "gap" zbranch slots */ + for (i = n; i < znode->child_cnt - 1; i++) + znode->zbranch[i] = znode->zbranch[i + 1]; + znode->child_cnt -= 1; + + if (znode->child_cnt > 0) + return 0; + + /* + * This was the last zbranch, we have to delete this znode from the + * parent. + */ + + do { + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_dirty(znode)); + + zp = znode->parent; + n = znode->iip; + + atomic_long_dec(&c->dirty_zn_cnt); + + err = insert_old_idx_znode(c, znode); + if (err) + return err; + + if (znode->cnext) { + __set_bit(OBSOLETE_ZNODE, &znode->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(znode); + znode = zp; + } while (znode->child_cnt == 1); /* while removing last child */ + + /* Remove from znode, entry n - 1 */ + znode->child_cnt -= 1; + ubifs_assert(znode->level != 0); + for (i = n; i < znode->child_cnt; i++) { + znode->zbranch[i] = znode->zbranch[i + 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + + /* + * If this is the root and it has only 1 child then + * collapse the tree. + */ + if (!znode->parent) { + while (znode->child_cnt == 1 && znode->level != 0) { + zp = znode; + zbr = &znode->zbranch[0]; + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode->parent = NULL; + znode->iip = 0; + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + if (err) + return err; + } + c->zroot.lnum = zbr->lnum; + c->zroot.offs = zbr->offs; + c->zroot.len = zbr->len; + c->zroot.znode = znode; + ubifs_assert(!test_bit(OBSOLETE_ZNODE, + &zp->flags)); + ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + atomic_long_dec(&c->dirty_zn_cnt); + + if (zp->cnext) { + __set_bit(OBSOLETE_ZNODE, &zp->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(zp); + } + } + + return 0; +} + +/** + * ubifs_tnc_remove - remove an index entry of a node. + * @c: UBIFS file-system description object + * @key: key of node + * + * Returns %0 on success or negative error code on failure. 
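+ *
+ * If the removed index entry was the last one in its znode, the znode itself
+ * is deleted from its parent, and this may repeat further up the tree; if the
+ * root znode is left with a single child, the tree is collapsed by one level.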
+ */ +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("key %s", DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + if (found == 1) + err = tnc_delete(c, znode, n); + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. + * @c: UBIFS file-system description object + * @key: key of node + * @nm: directory entry name + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm) +{ + int n, err; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + err = lookup_level0_dirty(c, key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (err < 0) + goto out_unlock; + if (err) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + err = tnc_delete(c, znode, n); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * key_in_range - determine if a key falls within a range of keys. + * @c: UBIFS file-system description object + * @key: key to check + * @from_key: lowest key in range + * @to_key: highest key in range + * + * This function returns %1 if the key is in range and %0 otherwise. + */ +static int key_in_range(struct ubifs_info *c, union ubifs_key *key, + union ubifs_key *from_key, union ubifs_key *to_key) +{ + if (keys_cmp(c, key, from_key) < 0) + return 0; + if (keys_cmp(c, key, to_key) > 0) + return 0; + return 1; +} + +/** + * ubifs_tnc_remove_range - remove index entries in range. + * @c: UBIFS file-system description object + * @from_key: lowest key to remove + * @to_key: highest key to remove + * + * This function removes index entries starting at @from_key and ending at + * @to_key. This function returns zero in case of success and a negative error + * code in case of failure. 
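+ *
+ * The removal proceeds level-0 znode by level-0 znode: all in-range keys of a
+ * znode except the first are dropped by shifting the zbranch array, the first
+ * one is then removed with 'tnc_delete()', and the lookup is repeated until
+ * no keys in the range are left.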
+ */ +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key) +{ + int i, n, k, err = 0; + struct ubifs_znode *znode; + union ubifs_key *key; + + mutex_lock(&c->tnc_mutex); + while (1) { + /* Find first level 0 znode that contains keys to remove */ + err = ubifs_lookup_level0(c, from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) + key = from_key; + else { + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, from_key, to_key)) { + err = 0; + goto out_unlock; + } + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + /* Remove all keys in range except the first */ + for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) { + key = &znode->zbranch[i].key; + if (!key_in_range(c, key, from_key, to_key)) + break; + lnc_free(&znode->zbranch[i]); + err = ubifs_add_dirt(c, znode->zbranch[i].lnum, + znode->zbranch[i].len); + if (err) { + dbg_dump_znode(c, znode); + goto out_unlock; + } + dbg_tnc("removing %s", DBGKEY(key)); + } + if (k) { + for (i = n + 1 + k; i < znode->child_cnt; i++) + znode->zbranch[i - k] = znode->zbranch[i]; + znode->child_cnt -= k; + } + + /* Now delete the first */ + err = tnc_delete(c, znode, n); + if (err) + goto out_unlock; + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_ino - remove an inode from TNC. + * @c: UBIFS file-system description object + * @inum: inode number to remove + * + * This function remove inode @inum and all the extended attributes associated + * with the anode from TNC and returns zero in case of success or a negative + * error code in case of failure. + */ +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) +{ + union ubifs_key key1, key2; + struct ubifs_dent_node *xent, *pxent = NULL; + struct qstr nm = { .name = NULL }; + + dbg_tnc("ino %lu", (unsigned long)inum); + + /* + * Walk all extended attribute entries and remove them together with + * corresponding extended attribute inodes. + */ + lowest_xent_key(c, &key1, inum); + while (1) { + ino_t xattr_inum; + int err; + + xent = ubifs_tnc_next_ent(c, &key1, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + if (err == -ENOENT) + break; + return err; + } + + xattr_inum = le64_to_cpu(xent->inum); + dbg_tnc("xent '%s', ino %lu", xent->name, + (unsigned long)xattr_inum); + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + err = ubifs_tnc_remove_nm(c, &key1, &nm); + if (err) { + kfree(xent); + return err; + } + + lowest_ino_key(c, &key1, xattr_inum); + highest_ino_key(c, &key2, xattr_inum); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) { + kfree(xent); + return err; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key1); + } + + kfree(pxent); + lowest_ino_key(c, &key1, inum); + highest_ino_key(c, &key2, inum); + + return ubifs_tnc_remove_range(c, &key1, &key2); +} + +/** + * ubifs_tnc_next_ent - walk directory or extended attribute entries. + * @c: UBIFS file-system description object + * @key: key of last entry + * @nm: name of last entry found or %NULL + * + * This function finds and reads the next directory or extended attribute entry + * after the given key (@key) if there is one. @nm is used to resolve + * collisions. 
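+ * This is how, for example, 'ubifs_tnc_remove_ino()' above walks all extended
+ * attribute entries of an inode, passing back the name of the previously
+ * returned entry on every iteration.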
+ * + * If the name of the current entry is not known and only the key is known, + * @nm->name has to be %NULL. In this case the semantics of this function is a + * little bit different and it returns the entry corresponding to this key, not + * the next one. If the key was not found, the closest "right" entry is + * returned. + * + * If the fist entry has to be found, @key has to contain the lowest possible + * key value for this inode and @name has to be %NULL. + * + * This function returns the found directory or extended attribute entry node + * in case of success, %-ENOENT is returned if no entry was found, and a + * negative error code is returned in case of failure. + */ +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_dent_node *dent; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey; + + dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + ubifs_assert(is_hash_key(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + if (nm->name) { + if (err) { + /* Handle collisions */ + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", + err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + } + + /* Now find next entry */ + err = tnc_next(c, &znode, &n); + if (unlikely(err)) + goto out_unlock; + } else { + /* + * The full name of the entry was not given, in which case the + * behavior of this function is a little different and it + * returns current entry, not the next one. + */ + if (!err) { + /* + * However, the given key does not exist in the TNC + * tree and @znode/@n variables contain the closest + * "preceding" element. Switch to the next one. + */ + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + } + + zbr = &znode->zbranch[n]; + dent = kmalloc(zbr->len, GFP_NOFS); + if (unlikely(!dent)) { + err = -ENOMEM; + goto out_unlock; + } + + /* + * The above 'tnc_next()' call could lead us to the next inode, check + * this. + */ + dkey = &zbr->key; + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_free; + } + + err = tnc_read_node_nm(c, zbr, dent); + if (unlikely(err)) + goto out_free; + + mutex_unlock(&c->tnc_mutex); + return dent; + +out_free: + kfree(dent); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return ERR_PTR(err); +} + +/** + * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. + * @c: UBIFS file-system description object + * + * Destroy left-over obsolete znodes from a failed commit. + */ +static void tnc_destroy_cnext(struct ubifs_info *c) +{ + struct ubifs_znode *cnext; + + if (!c->cnext) + return; + ubifs_assert(c->cmt_state == COMMIT_BROKEN); + cnext = c->cnext; + do { + struct ubifs_znode *znode = cnext; + + cnext = cnext->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + } while (cnext && cnext != c->cnext); +} + +/** + * ubifs_tnc_close - close TNC subsystem and free all related resources. 
+ * @c: UBIFS file-system description object + */ +void ubifs_tnc_close(struct ubifs_info *c) +{ + tnc_destroy_cnext(c); + if (c->zroot.znode) { + long n; + + ubifs_destroy_tnc_subtree(c->zroot.znode); + n = atomic_long_read(&c->clean_zn_cnt); + atomic_long_sub(n, &ubifs_clean_zn_cnt); + } + kfree(c->gap_lebs); + kfree(c->ilebs); + destroy_old_idx(c); +} + +/** + * left_znode - get the znode to the left. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the left of @znode or NULL if + * there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *left_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip - 1; + + /* Go up until we can go left */ + znode = znode->parent; + if (!znode) + return NULL; + if (n >= 0) { + /* Now go down the rightmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + n = znode->child_cnt - 1; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * right_znode - get the znode to the right. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the right of @znode or NULL + * if there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *right_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip + 1; + + /* Go up until we can go right */ + znode = znode->parent; + if (!znode) + return NULL; + if (n < znode->child_cnt) { + /* Now go down the leftmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * lookup_znode - find a particular indexing node from TNC. + * @c: UBIFS file-system description object + * @key: index node key to lookup + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function searches an indexing node by its first key @key and its + * address @lnum:@offs. It looks up the indexing tree by pulling all indexing + * nodes it traverses to TNC. This function is called for indexing nodes which + * were found on the media by scanning, for example when garbage-collecting or + * when doing in-the-gaps commit. This means that the indexing node which is + * looked for does not have to have exactly the same leftmost key @key, because + * the leftmost key may have been changed, in which case TNC will contain a + * dirty znode which still refers the same @lnum:@offs. This function is clever + * enough to recognize such indexing nodes. + * + * Note, if a znode was deleted or changed too much, then this function will + * not find it. For situations like this UBIFS has the old index RB-tree + * (indexed by @lnum:@offs). + * + * This function returns a pointer to the znode found or %NULL if it is not + * found. A negative error code is returned on failure. 
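+ *
+ * The lookup descends to level @level + 1 and checks whether the zbranch at
+ * that position points to @lnum:@offs. For non-unique (hashed) keys the
+ * neighbouring zbranches to the left and to the right are checked as well,
+ * because zbranches with the same key may end up in sibling znodes.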
+ */ +static struct ubifs_znode *lookup_znode(struct ubifs_info *c, + union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode, *zn; + int n, nn; + + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + + /* + * The arguments have probably been read off flash, so don't assume + * they are valid. + */ + if (level < 0) + return ERR_PTR(-EINVAL); + + /* Get the root znode */ + znode = c->zroot.znode; + if (!znode) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return znode; + } + /* Check if it is the one we are looking for */ + if (c->zroot.lnum == lnum && c->zroot.offs == offs) + return znode; + /* Descend to the parent level i.e. (level + 1) */ + if (level >= znode->level) + return NULL; + while (1) { + ubifs_search_zbranch(c, znode, key, &n); + if (n < 0) { + /* + * We reached a znode where the leftmost key is greater + * than the key we are searching for. This is the same + * situation as the one described in a huge comment at + * the end of the 'ubifs_lookup_level0()' function. And + * for exactly the same reasons we have to try to look + * left before giving up. + */ + znode = left_znode(c, znode); + if (!znode) + return NULL; + if (IS_ERR(znode)) + return znode; + ubifs_search_zbranch(c, znode, key, &n); + ubifs_assert(n >= 0); + } + if (znode->level == level + 1) + break; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + /* Check if the child is the one we are looking for */ + if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* If the key is unique, there is nowhere else to look */ + if (!is_hash_key(c, key)) + return NULL; + /* + * The key is not unique and so may be also in the znodes to either + * side. + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + /* Move one branch to the left */ + if (n) + n -= 1; + else { + znode = left_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = znode->child_cnt - 1; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is less than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) < 0) + break; + } + /* Back to the middle */ + znode = zn; + n = nn; + /* Look right */ + while (1) { + /* Move one branch to the right */ + if (++n >= znode->child_cnt) { + znode = right_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = 0; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is greater than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) > 0) + break; + } + return NULL; +} + +/** + * is_idx_node_in_tnc - determine if an index node is in the TNC. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * This function returns %0 if the index node is not referred to in the TNC, %1 + * if the index node is referred to in the TNC and the corresponding znode is + * dirty, %2 if an index node is referred to in the TNC and the corresponding + * znode is clean, and a negative error code in case of failure. + * + * Note, the @key argument has to be the key of the first child. 
Also note, + * this function relies on the fact that 0:0 is never a valid LEB number and + * offset for a main-area node. + */ +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + return 0; + if (IS_ERR(znode)) + return PTR_ERR(znode); + + return ubifs_zn_dirty(znode) ? 1 : 2; +} + +/** + * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @lnum: node LEB number + * @offs: node offset + * + * This function returns %1 if the node is referred to in the TNC, %0 if it is + * not, and a negative error code in case of failure. + * + * Note, this function relies on the fact that 0:0 is never a valid LEB number + * and offset for a main-area node. + */ +static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, + int lnum, int offs) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *zn; + int n, found, err, nn; + const int unique = !is_hash_key(c, key); + + found = ubifs_lookup_level0(c, key, &znode, &n); + if (found < 0) + return found; /* Error code */ + if (!found) + return 0; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + if (unique) + return 0; + /* + * Because the key is not unique, we have to look left + * and right as well + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &n); + if (err == -ENOENT) + break; + if (err) + return err; + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + /* Look right */ + znode = zn; + n = nn; + while (1) { + err = tnc_next(c, &znode, &n); + if (err) { + if (err == -ENOENT) + return 0; + return err; + } + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + return 0; +} + +/** + * ubifs_tnc_has_node - determine whether a node is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @level: index node level (if it is an index node) + * @lnum: node LEB number + * @offs: node offset + * @is_idx: non-zero if the node is an index node + * + * This function returns %1 if the node is in the TNC, %0 if it is not, and a + * negative error code in case of failure. For index nodes, @key has to be the + * key of the first child. An index node is considered to be in the TNC only if + * the corresponding znode is clean or has not been loaded. + */ +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx) +{ + int err; + + mutex_lock(&c->tnc_mutex); + if (is_idx) { + err = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (err < 0) + goto out_unlock; + if (err == 1) + /* The index node was found but it was dirty */ + err = 0; + else if (err == 2) + /* The index node was found and it was clean */ + err = 1; + else + BUG_ON(err != 0); + } else + err = is_leaf_node_in_tnc(c, key, lnum, offs); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_dirty_idx_node - dirty an index node. 
+ * @c: UBIFS file-system description object + * @key: index node key + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function loads and dirties an index node so that it can be garbage + * collected. The @key argument has to be the key of the first child. This + * function relies on the fact that 0:0 is never a valid LEB number and offset + * for a main-area node. Returns %0 on success and a negative error code on + * failure. + */ +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int err = 0; + + mutex_lock(&c->tnc_mutex); + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + goto out_unlock; + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_inode_size - check if inode size is correct. + * @c: UBIFS file-system description object + * @inum: inode number + * @size: inode size + * + * This function makes sure that the inode size (@size) is correct and it does + * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL + * if it has a data page beyond @size, and other negative error code in case of + * other errors. + */ +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size) +{ + int err, n; + union ubifs_key from_key, to_key, *key; + struct ubifs_znode *znode; + unsigned int block; + + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &from_key, inode->i_ino, block); + highest_data_key(c, &to_key, inode->i_ino); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + err = -EINVAL; + key = &from_key; + goto out_dump; + } + + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + + ubifs_assert(err == 0); + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, &from_key, &to_key)) + goto out_unlock; + +out_dump: + block = key_block(c, key); + ubifs_err("inode %lu has size %lld, but there are data at offset %lld " + "(data key %s)", (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + dbg_dump_inode(c, inode); + dbg_dump_stack(); + err = -EINVAL; + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c new file mode 100644 index 00000000..41920f35 --- /dev/null +++ b/fs/ubifs/tnc_commit.c @@ -0,0 +1,1103 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* This file implements TNC functions for committing */ + +#include "ubifs.h" + +/** + * make_idx_node - make an index node for fill-the-gaps method of TNC commit. + * @c: UBIFS file-system description object + * @idx: buffer in which to place new index node + * @znode: znode from which to make new index node + * @lnum: LEB number where new index node will be written + * @offs: offset where new index node will be written + * @len: length of new index node + */ +static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, + struct ubifs_znode *znode, int lnum, int offs, int len) +{ + struct ubifs_znode *zp; + int i, err; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + ubifs_prepare_node(c, idx, len, 0); + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + err = insert_old_idx_znode(c, znode); + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + + zbr = &zp->zbranch[znode->iip]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + atomic_long_dec(&c->dirty_zn_cnt); + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + __clear_bit(DIRTY_ZNODE, &znode->flags); + __clear_bit(COW_ZNODE, &znode->flags); + + return err; +} + +/** + * fill_gap - make index nodes in gaps in dirty index LEBs. + * @c: UBIFS file-system description object + * @lnum: LEB number that gap appears in + * @gap_start: offset of start of gap + * @gap_end: offset of end of gap + * @dirt: adds dirty space to this + * + * This function returns the number of index nodes written into the gap. 
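+ *
+ * Index nodes taken from @c->enext are packed into the gap for as long as
+ * they fit. Whatever is left of the gap is filled with padding; a gap which
+ * ends at the end of the LEB is only padded up to the next @c->min_io_size
+ * boundary.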
+ */ +static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end, + int *dirt) +{ + int len, gap_remains, gap_pos, written, pad_len; + + ubifs_assert((gap_start & 7) == 0); + ubifs_assert((gap_end & 7) == 0); + ubifs_assert(gap_end >= gap_start); + + gap_remains = gap_end - gap_start; + if (!gap_remains) + return 0; + gap_pos = gap_start; + written = 0; + while (c->enext) { + len = ubifs_idx_node_sz(c, c->enext->child_cnt); + if (len < gap_remains) { + struct ubifs_znode *znode = c->enext; + const int alen = ALIGN(len, 8); + int err; + + ubifs_assert(alen <= gap_remains); + err = make_idx_node(c, c->ileb_buf + gap_pos, znode, + lnum, gap_pos, len); + if (err) + return err; + gap_remains -= alen; + gap_pos += alen; + c->enext = znode->cnext; + if (c->enext == c->cnext) + c->enext = NULL; + written += 1; + } else + break; + } + if (gap_end == c->leb_size) { + c->ileb_len = ALIGN(gap_pos, c->min_io_size); + /* Pad to end of min_io_size */ + pad_len = c->ileb_len - gap_pos; + } else + /* Pad to end of gap */ + pad_len = gap_remains; + dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d", + lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len); + ubifs_pad(c, c->ileb_buf + gap_pos, pad_len); + *dirt += pad_len; + return written; +} + +/** + * find_old_idx - find an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %1 if found and %0 otherwise. + */ +static int find_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *o; + struct rb_node *p; + + p = c->old_idx.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = p->rb_left; + else if (lnum > o->lnum) + p = p->rb_right; + else if (offs < o->offs) + p = p->rb_left; + else if (offs > o->offs) + p = p->rb_right; + else + return 1; + } + return 0; +} + +/** + * is_idx_node_in_use - determine if an index node can be overwritten. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * If @key / @lnum / @offs identify an index node that was not part of the old + * index, then this function returns %0 (obsolete). Else if the index node was + * part of the old index but is now dirty %1 is returned, else if it is clean %2 + * is returned. A negative error code is returned on failure. + */ +static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, + int level, int lnum, int offs) +{ + int ret; + + ret = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (ret < 0) + return ret; /* Error code */ + if (ret == 0) + if (find_old_idx(c, lnum, offs)) + return 1; + return ret; +} + +/** + * layout_leb_in_gaps - layout index nodes using in-the-gaps method. + * @c: UBIFS file-system description object + * @p: return LEB number here + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * This function merely puts the next znode into the next gap, making no attempt + * to try to maximise the number of znodes that fit. + * This function returns the number of index nodes written into the gaps, or a + * negative error code on failure. 
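+ *
+ * The chosen LEB is scanned; index nodes which are still in use delimit the
+ * gaps left by obsolete ones, each gap is filled with new index nodes by
+ * 'fill_gap()', and the resulting LEB buffer is then written back with
+ * 'ubifs_leb_change()'.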
+ */ +static int layout_leb_in_gaps(struct ubifs_info *c, int *p) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written; + + tot_written = 0; + /* Get an index LEB with lots of obsolete index nodes */ + lnum = ubifs_find_dirty_idx_leb(c); + if (lnum < 0) + /* + * There also may be dirt in the index head that could be + * filled, however we do not check there at present. + */ + return lnum; /* Error code */ + *p = lnum; + dbg_gc("LEB %d", lnum); + /* + * Scan the index LEB. We use the generic scan for this even though + * it is more comprehensive and less efficient than is needed for this + * purpose. + */ + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0); + c->ileb_len = 0; + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + gap_start = 0; + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx; + int in_use, level; + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + idx = snod->node; + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + /* Determine if the index node is in use (not obsolete) */ + in_use = is_idx_node_in_use(c, &snod->key, level, lnum, + snod->offs); + if (in_use < 0) { + ubifs_scan_destroy(sleb); + return in_use; /* Error code */ + } + if (in_use) { + if (in_use == 1) + dirt += ALIGN(snod->len, 8); + /* + * The obsolete index nodes form gaps that can be + * overwritten. This gap has ended because we have + * found an index node that is still in use + * i.e. not obsolete + */ + gap_end = snod->offs; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) { + ubifs_scan_destroy(sleb); + return written; /* Error code */ + } + tot_written += written; + gap_start = ALIGN(snod->offs + snod->len, 8); + } + } + ubifs_scan_destroy(sleb); + c->ileb_len = c->leb_size; + gap_end = c->leb_size; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) + return written; /* Error code */ + tot_written += written; + if (tot_written == 0) { + struct ubifs_lprops lp; + + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + return err; + if (lp.free == c->leb_size) { + /* + * We must have snatched this LEB from the idx_gc list + * so we need to correct the free and dirty space. + */ + err = ubifs_change_one_lp(c, lnum, + c->leb_size - c->ileb_len, + dirt, 0, 0, 0); + if (err) + return err; + } + return 0; + } + err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt, + 0, 0, 0); + if (err) + return err; + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, + UBI_SHORTTERM); + if (err) + return err; + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + return tot_written; +} + +/** + * get_leb_cnt - calculate the number of empty LEBs needed to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns the number of empty LEBs needed to commit @cnt znodes + * to the current index head. The number is not exact and may be more than + * needed. + */ +static int get_leb_cnt(struct ubifs_info *c, int cnt) +{ + int d; + + /* Assume maximum index node size (i.e. overestimate space needed) */ + cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz; + if (cnt < 0) + cnt = 0; + d = c->leb_size / c->max_idx_node_sz; + return DIV_ROUND_UP(cnt, d); +} + +/** + * layout_in_gaps - in-the-gaps method of committing TNC. 
+ * @c: UBIFS file-system description object + * @cnt: number of dirty znodes to commit. + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_gaps(struct ubifs_info *c, int cnt) +{ + int err, leb_needed_cnt, written, *p; + + dbg_gc("%d znodes to write", cnt); + + c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + if (!c->gap_lebs) + return -ENOMEM; + + p = c->gap_lebs; + do { + ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + written = layout_leb_in_gaps(c, p); + if (written < 0) { + err = written; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; + } + if (dbg_force_in_the_gaps_enabled()) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_warn("out of space"); + dbg_dump_budg(c, &c->bi); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; + } + p++; + cnt -= written; + leb_needed_cnt = get_leb_cnt(c, cnt); + dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, + leb_needed_cnt, c->ileb_cnt); + } while (leb_needed_cnt > c->ileb_cnt); + + *p = -1; + return 0; +} + +/** + * layout_in_empty_space - layout index nodes in empty space. + * @c: UBIFS file-system description object + * + * This function lays out new index nodes for dirty znodes using empty LEBs. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_empty_space(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext, *zp; + int lnum, offs, len, next_len, buf_len, buf_offs, used, avail; + int wlen, blen, err; + + cnext = c->enext; + if (!cnext) + return 0; + + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + buf_len = ubifs_idx_node_sz(c, c->fanout); + buf_len = ALIGN(buf_len, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) + lnum = -1; + + while (1) { + znode = cnext; + + len = ubifs_idx_node_sz(c, znode->child_cnt); + + /* Determine the index node position */ + if (lnum == -1) { + if (c->ileb_nxt >= c->ileb_cnt) { + ubifs_err("out of space"); + return -ENOSPC; + } + lnum = c->ilebs[c->ileb_nxt++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + int i; + + i = znode->iip; + zbr = &zp->zbranch[i]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + /* + * Once lprops is updated, we can decrease the dirty znode count + * but it is easier to just do it here. 
+ */ + atomic_long_dec(&c->dirty_zn_cnt); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + cnext = znode->cnext; + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + buf_offs += ALIGN(len, 8); + if (next_len) { + if (buf_offs + next_len <= c->leb_size) + continue; + err = ubifs_update_one_lp(c, lnum, 0, + c->leb_size - buf_offs, 0, 0); + if (err) + return err; + lnum = -1; + continue; + } + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, 0, 0, 0); + if (err) + return err; + break; + } + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + if (next_len != 0 && + buf_offs + used + next_len <= c->leb_size && + avail > 0) + continue; + + if (avail <= 0 && next_len && + buf_offs + used + next_len <= c->leb_size) + blen = buf_len; + else + blen = ALIGN(wlen, c->min_io_size); + + /* The buffer is full or there are no more znodes to do */ + buf_offs += blen; + if (next_len) { + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, blen - used, + 0, 0); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + continue; + } + err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs, + blen - used, 0, 0); + if (err) + return err; + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + c->dbg->new_ihead_lnum = lnum; + c->dbg->new_ihead_offs = buf_offs; +#endif + + return 0; +} + +/** + * layout_commit - determine positions of index nodes to commit. + * @c: UBIFS file-system description object + * @no_space: indicates that insufficient empty LEBs were allocated + * @cnt: number of znodes to commit + * + * Calculate and update the positions of index nodes to commit. If there were + * an insufficient number of empty LEBs allocated, then index nodes are placed + * into the gaps created by obsolete index nodes in non-empty index LEBs. For + * this purpose, an obsolete index node is one that was not in the index as at + * the end of the last commit. To write "in-the-gaps" requires that those index + * LEBs are updated atomically in-place. + */ +static int layout_commit(struct ubifs_info *c, int no_space, int cnt) +{ + int err; + + if (no_space) { + err = layout_in_gaps(c, cnt); + if (err) + return err; + } + err = layout_in_empty_space(c); + return err; +} + +/** + * find_first_dirty - find first dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) +{ + int i, cont; + + if (!znode) + return NULL; + + while (1) { + if (znode->level == 0) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + cont = 0; + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { + znode = zbr->znode; + cont = 1; + break; + } + } + if (!cont) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + } +} + +/** + * find_next_dirty - find next dirty znode. 
+ * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) +{ + int n = znode->iip + 1; + + znode = znode->parent; + if (!znode) + return NULL; + for (; n < znode->child_cnt; n++) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) + return find_first_dirty(zbr->znode); + } + return znode; +} + +/** + * get_znodes_to_commit - create list of dirty znodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of znodes to commit. + */ +static int get_znodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + int cnt = 0; + + c->cnext = find_first_dirty(c->zroot.znode); + znode = c->enext = c->cnext; + if (!znode) { + dbg_cmt("no znodes to commit"); + return 0; + } + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + __set_bit(COW_ZNODE, &znode->flags); + znode->alt = 0; + cnext = find_next_dirty(znode); + if (!cnext) { + znode->cnext = c->cnext; + break; + } + znode->cnext = cnext; + znode = cnext; + cnt += 1; + } + dbg_cmt("committing %d znodes", cnt); + ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); + return cnt; +} + +/** + * alloc_idx_lebs - allocate empty LEBs to be used to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns %-ENOSPC if it cannot allocate a sufficient number of + * empty LEBs. %0 is returned on success, otherwise a negative error code + * is returned. + */ +static int alloc_idx_lebs(struct ubifs_info *c, int cnt) +{ + int i, leb_cnt, lnum; + + c->ileb_cnt = 0; + c->ileb_nxt = 0; + leb_cnt = get_leb_cnt(c, cnt); + dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); + if (!leb_cnt) + return 0; + c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + if (!c->ilebs) + return -ENOMEM; + for (i = 0; i < leb_cnt; i++) { + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) + return lnum; + c->ilebs[c->ileb_cnt++] = lnum; + dbg_cmt("LEB %d", lnum); + } + if (dbg_force_in_the_gaps()) + return -ENOSPC; + return 0; +} + +/** + * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. + * @c: UBIFS file-system description object + * + * It is possible that we allocate more empty LEBs for the commit than we need. + * This functions frees the surplus. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_unused_idx_lebs(struct ubifs_info *c) +{ + int i, err = 0, lnum, er; + + for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { + lnum = c->ilebs[i]; + dbg_cmt("LEB %d", lnum); + er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX | LPROPS_TAKEN, 0); + if (!err) + err = er; + } + return err; +} + +/** + * free_idx_lebs - free unused LEBs after commit end. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_idx_lebs(struct ubifs_info *c) +{ + int err; + + err = free_unused_idx_lebs(c); + kfree(c->ilebs); + c->ilebs = NULL; + return err; +} + +/** + * ubifs_tnc_start_commit - start TNC commit. + * @c: UBIFS file-system description object + * @zroot: new index root position is returned here + * + * This function prepares the list of indexing nodes to commit and lays out + * their positions on flash. If there is not enough free space it uses the + * in-gap commit method. 
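[Illustrative aside, not part of the patch] One detail worth spelling out: get_znodes_to_commit() links the dirty znodes into a ring through their ->cnext pointers, with the last entry pointing back at the first (c->cnext), so later stages such as write_index() and free_obsolete_znodes() simply walk until they arrive back at the head. The toy model below shows only that traversal pattern, with an invented node type.

#include <stdio.h>

/* Toy stand-in for znodes on the commit list */
struct node {
	int id;
	struct node *cnext;
};

int main(void)
{
	struct node n[3];
	struct node *head = &n[0], *cur;

	/* Build the ring 0 -> 1 -> 2 -> 0, as the commit list is built */
	for (int i = 0; i < 3; i++) {
		n[i].id = i;
		n[i].cnext = &n[(i + 1) % 3];
	}

	/* Walk around the ring exactly once, stopping back at the head */
	cur = head;
	do {
		printf("committing znode %d\n", cur->id);
		cur = cur->cnext;
	} while (cur != head);

	return 0;
}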
Returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int err = 0, cnt; + + mutex_lock(&c->tnc_mutex); + err = dbg_check_tnc(c, 1); + if (err) + goto out; + cnt = get_znodes_to_commit(c); + if (cnt != 0) { + int no_space = 0; + + err = alloc_idx_lebs(c, cnt); + if (err == -ENOSPC) + no_space = 1; + else if (err) + goto out_free; + err = layout_commit(c, no_space, cnt); + if (err) + goto out_free; + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + err = free_unused_idx_lebs(c); + if (err) + goto out; + } + destroy_old_idx(c); + memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); + + err = ubifs_save_dirty_idx_lnums(c); + if (err) + goto out; + + spin_lock(&c->space_lock); + /* + * Although we have not finished committing yet, update size of the + * committed index ('c->bi.old_idx_sz') and zero out the index growth + * budget. It is OK to do this now, because we've reserved all the + * space which is needed to commit the index, and it is save for the + * budgeting subsystem to assume the index is already committed, + * even though it is not. + */ + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->bi.old_idx_sz = c->calc_idx_sz; + c->bi.uncommitted_idx = 0; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); + mutex_unlock(&c->tnc_mutex); + + dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); + dbg_cmt("size of index %llu", c->calc_idx_sz); + return err; + +out_free: + free_idx_lebs(c); +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * write_index - write index nodes. + * @c: UBIFS file-system description object + * + * This function writes the index nodes whose positions were laid out in the + * layout_in_empty_space function. + */ +static int write_index(struct ubifs_info *c) +{ + struct ubifs_idx_node *idx; + struct ubifs_znode *znode, *cnext; + int i, lnum, offs, len, next_len, buf_len, buf_offs, used; + int avail, wlen, err, lnum_pos = 0; + + cnext = c->enext; + if (!cnext) + return 0; + + /* + * Always write index nodes to the index head so that index nodes and + * other types of nodes are never mixed in the same erase block. 
+ */ + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + /* Allocate commit buffer */ + buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, + LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + + while (1) { + cond_resched(); + + znode = cnext; + idx = c->cbuf + used; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + len = ubifs_idx_node_sz(c, znode->child_cnt); + ubifs_prepare_node(c, idx, len, 0); + + /* Determine the index node position */ + if (lnum == -1) { + lnum = c->ilebs[lnum_pos++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != znode->lnum || offs != znode->offs || + len != znode->len) { + ubifs_err("inconsistent znode posn"); + return -EINVAL; + } +#endif + + /* Grab some stuff from znode while we still can */ + cnext = znode->cnext; + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + /* + * It is important that other threads should see %DIRTY_ZNODE + * flag cleared before %COW_ZNODE. Specifically, it matters in + * the 'dirty_cow_znode()' function. This is the reason for the + * first barrier. Also, we want the bit changes to be seen to + * other threads ASAP, to avoid unnecesarry copying, which is + * the reason for the second barrier. 
+ */ + clear_bit(DIRTY_ZNODE, &znode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &znode->flags); + smp_mb__after_clear_bit(); + + /* Do not access znode from this point on */ + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + /* + * Write the prepared index node immediately if there is + * no minimum IO size + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + wlen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += ALIGN(wlen, 8); + if (next_len) { + used = 0; + avail = buf_len; + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + continue; + } + } else { + int blen, nxt_offs = buf_offs + used + next_len; + + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) + continue; + else + blen = buf_len; + } else { + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + /* + * The buffer is full or there are no more znodes + * to do + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + blen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; + } + } + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != c->dbg->new_ihead_lnum || + buf_offs != c->dbg->new_ihead_offs) { + ubifs_err("inconsistent ihead"); + return -EINVAL; + } +#endif + + c->ihead_lnum = lnum; + c->ihead_offs = buf_offs; + + return 0; +} + +/** + * free_obsolete_znodes - free obsolete znodes. + * @c: UBIFS file-system description object + * + * At the end of commit end, obsolete znodes are freed. + */ +static void free_obsolete_znodes(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + + cnext = c->cnext; + do { + znode = cnext; + cnext = znode->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + else { + znode->cnext = NULL; + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } + } while (cnext != c->cnext); +} + +/** + * return_gap_lebs - return LEBs used by the in-gap commit method. + * @c: UBIFS file-system description object + * + * This function clears the "taken" flag for the LEBs which were used by the + * "commit in-the-gaps" method. + */ +static int return_gap_lebs(struct ubifs_info *c) +{ + int *p, err; + + if (!c->gap_lebs) + return 0; + + dbg_cmt(""); + for (p = c->gap_lebs; *p != -1; p++) { + err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); + if (err) + return err; + } + + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return 0; +} + +/** + * ubifs_tnc_end_commit - update the TNC for commit end. + * @c: UBIFS file-system description object + * + * Write the dirty znodes. 
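[Illustrative aside, not part of the patch] The write-buffer bookkeeping in write_index() for the min_io_size > 1 case is easier to follow with concrete numbers (all hypothetical): with a 2048-byte minimum I/O unit and three 700-byte index nodes, the nodes land at 8-byte aligned buffer offsets 0, 704 and 1408, the data length after the last one is 1408 + 700 = 2108, rounded to 2112 for 8-byte alignment, and the flush then writes ALIGN(2112, 2048) = 4096 bytes, the final 1984 of them padding. A stand-alone replay of that arithmetic:

#include <stdio.h>

#define ALIGN8(x)	(((x) + 7) & ~7)
#define ALIGN_TO(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	int min_io_size = 2048, node_len = 700;
	int used = 0, wlen = 0, blen;

	/* Pack three index nodes into the buffer at 8-byte slots */
	for (int i = 0; i < 3; i++) {
		printf("node %d at buffer offset %d\n", i, used);
		wlen = used + node_len;		/* real data written so far */
		used += ALIGN8(node_len);	/* next 8-byte slot         */
	}

	/* No more nodes: align, round up to the min I/O unit, pad the rest */
	wlen = ALIGN8(wlen);
	blen = ALIGN_TO(wlen, min_io_size);
	printf("flush %d bytes (%d data, %d padding)\n",
	       blen, wlen, blen - wlen);	/* 4096, 2112, 1984 */
	return 0;
}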
+ */ +int ubifs_tnc_end_commit(struct ubifs_info *c) +{ + int err; + + if (!c->cnext) + return 0; + + err = return_gap_lebs(c); + if (err) + return err; + + err = write_index(c); + if (err) + return err; + + mutex_lock(&c->tnc_mutex); + + dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); + + free_obsolete_znodes(c); + + c->cnext = NULL; + kfree(c->ilebs); + c->ilebs = NULL; + + mutex_unlock(&c->tnc_mutex); + + return 0; +} diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c new file mode 100644 index 00000000..b48db999 --- /dev/null +++ b/fs/ubifs/tnc_misc.c @@ -0,0 +1,494 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains miscelanious TNC-related functions shared betweend + * different files. This file does not form any logically separate TNC + * sub-system. The file was created because there is a lot of TNC code and + * putting it all in one file would make that file too big and unreadable. + */ + +#include "ubifs.h" + +/** + * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. + * @zr: root of the subtree to traverse + * @znode: previous znode + * + * This function implements levelorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode) +{ + int level, iip, level_search = 0; + struct ubifs_znode *zn; + + ubifs_assert(zr); + + if (unlikely(!znode)) + return zr; + + if (unlikely(znode == zr)) { + if (znode->level == 0) + return NULL; + return ubifs_tnc_find_child(zr, 0); + } + + level = znode->level; + + iip = znode->iip; + while (1) { + ubifs_assert(znode->level <= zr->level); + + /* + * First walk up until there is a znode with next branch to + * look at. + */ + while (znode->parent != zr && iip >= znode->parent->child_cnt) { + znode = znode->parent; + iip = znode->iip; + } + + if (unlikely(znode->parent == zr && + iip >= znode->parent->child_cnt)) { + /* This level is done, switch to the lower one */ + level -= 1; + if (level_search || level < 0) + /* + * We were already looking for znode at lower + * level ('level_search'). As we are here + * again, it just does not exist. Or all levels + * were finished ('level < 0'). 
+ */ + return NULL; + + level_search = 1; + iip = -1; + znode = ubifs_tnc_find_child(zr, 0); + ubifs_assert(znode); + } + + /* Switch to the next index */ + zn = ubifs_tnc_find_child(znode->parent, iip + 1); + if (!zn) { + /* No more children to look at, we have walk up */ + iip = znode->parent->child_cnt; + continue; + } + + /* Walk back down to the level we came from ('level') */ + while (zn->level != level) { + znode = zn; + zn = ubifs_tnc_find_child(zn, 0); + if (!zn) { + /* + * This path is not too deep so it does not + * reach 'level'. Try next path. + */ + iip = znode->iip; + break; + } + } + + if (zn) { + ubifs_assert(zn->level >= 0); + return zn; + } + } +} + +/** + * ubifs_search_zbranch - search znode branch. + * @c: UBIFS file-system description object + * @znode: znode to search in + * @key: key to search for + * @n: znode branch slot number is returned here + * + * This is a helper function which search branch with key @key in @znode using + * binary search. The result of the search may be: + * o exact match, then %1 is returned, and the slot number of the branch is + * stored in @n; + * o no exact match, then %0 is returned and the slot number of the left + * closest branch is returned in @n; the slot if all keys in this znode are + * greater than @key, then %-1 is returned in @n. + */ +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n) +{ + int beg = 0, end = znode->child_cnt, uninitialized_var(mid); + int uninitialized_var(cmp); + const struct ubifs_zbranch *zbr = &znode->zbranch[0]; + + ubifs_assert(end > beg); + + while (end > beg) { + mid = (beg + end) >> 1; + cmp = keys_cmp(c, key, &zbr[mid].key); + if (cmp > 0) + beg = mid + 1; + else if (cmp < 0) + end = mid; + else { + *n = mid; + return 1; + } + } + + *n = end - 1; + + /* The insert point is after *n */ + ubifs_assert(*n >= -1 && *n < znode->child_cnt); + if (*n == -1) + ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); + else + ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); + if (*n + 1 < znode->child_cnt) + ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); + + return 0; +} + +/** + * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. + * @znode: znode to start at (root of the sub-tree to traverse) + * + * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is + * ignored. + */ +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) +{ + if (unlikely(!znode)) + return NULL; + + while (znode->level > 0) { + struct ubifs_znode *child; + + child = ubifs_tnc_find_child(znode, 0); + if (!child) + return znode; + znode = child; + } + + return znode; +} + +/** + * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. + * @znode: previous znode + * + * This function implements postorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + ubifs_assert(znode); + if (unlikely(!znode->parent)) + return NULL; + + /* Switch to the next index in the parent */ + zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); + if (!zn) + /* This is in fact the last child, return parent */ + return znode->parent; + + /* Go to the first znode in this new subtree */ + return ubifs_tnc_postorder_first(zn); +} + +/** + * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. 
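[Illustrative aside, not part of the patch] The return convention of ubifs_search_zbranch(), an exact hit returning 1 with the matching slot in *n, a miss returning 0 with *n set to the closest slot on the left, or to -1 when every key in the znode is greater, can be modelled with an ordinary binary search over integers. The sketch below is only a user-space model of that contract, not the kernel routine itself.

#include <stdio.h>

/* Model of the zbranch search contract over a sorted array of int keys */
static int search(const int *keys, int child_cnt, int key, int *n)
{
	int beg = 0, end = child_cnt, mid;

	while (end > beg) {
		mid = (beg + end) / 2;
		if (key > keys[mid])
			beg = mid + 1;
		else if (key < keys[mid])
			end = mid;
		else {
			*n = mid;
			return 1;	/* exact match */
		}
	}
	*n = end - 1;			/* left-closest slot, or -1 */
	return 0;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 }, n;

	printf("%d %d\n", search(keys, 4, 30, &n), n);	/* 1 2  */
	printf("%d %d\n", search(keys, 4, 25, &n), n);	/* 0 1  */
	printf("%d %d\n", search(keys, 4,  5, &n), n);	/* 0 -1 */
	return 0;
}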
+ * @znode: znode defining subtree to destroy + * + * This function destroys subtree of the TNC tree. Returns number of clean + * znodes in the subtree. + */ +long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); + long clean_freed = 0; + int n; + + ubifs_assert(zn); + while (1) { + for (n = 0; n < zn->child_cnt; n++) { + if (!zn->zbranch[n].znode) + continue; + + if (zn->level > 0 && + !ubifs_zn_dirty(zn->zbranch[n].znode)) + clean_freed += 1; + + cond_resched(); + kfree(zn->zbranch[n].znode); + } + + if (zn == znode) { + if (!ubifs_zn_dirty(zn)) + clean_freed += 1; + kfree(zn); + return clean_freed; + } + + zn = ubifs_tnc_postorder_next(zn); + } +} + +/** + * read_znode - read an indexing node from flash and fill znode. + * @c: UBIFS file-system description object + * @lnum: LEB of the indexing node to read + * @offs: node offset + * @len: node length + * @znode: znode to read to + * + * This function reads an indexing node from the flash media and fills znode + * with the read data. Returns zero in case of success and a negative error + * code in case of failure. The read indexing node is validated and if anything + * is wrong with it, this function prints complaint messages and returns + * %-EINVAL. + */ +static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, + struct ubifs_znode *znode) +{ + int i, err, type, cmp; + struct ubifs_idx_node *idx; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err < 0) { + kfree(idx); + return err; + } + + znode->child_cnt = le16_to_cpu(idx->child_cnt); + znode->level = le16_to_cpu(idx->level); + + dbg_tnc("LEB %d:%d, level %d, %d branch", + lnum, offs, znode->level, znode->child_cnt); + + if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { + dbg_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + dbg_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); + err = 1; + goto out_dump; + } + + for (i = 0; i < znode->child_cnt; i++) { + const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_read(c, &br->key, &zbr->key); + zbr->lnum = le32_to_cpu(br->lnum); + zbr->offs = le32_to_cpu(br->offs); + zbr->len = le32_to_cpu(br->len); + zbr->znode = NULL; + + /* Validate branch */ + + if (zbr->lnum < c->main_first || + zbr->lnum >= c->leb_cnt || zbr->offs < 0 || + zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { + dbg_err("bad branch %d", i); + err = 2; + goto out_dump; + } + + switch (key_type(c, &zbr->key)) { + case UBIFS_INO_KEY: + case UBIFS_DATA_KEY: + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + break; + default: + dbg_msg("bad key type at slot %d: %s", i, + DBGKEY(&zbr->key)); + err = 3; + goto out_dump; + } + + if (znode->level) + continue; + + type = key_type(c, &zbr->key); + if (c->ranges[type].max_len == 0) { + if (zbr->len != c->ranges[type].len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be %d", c->ranges[type].len); + err = 4; + goto out_dump; + } + } else if (zbr->len < c->ranges[type].min_len || + zbr->len > c->ranges[type].max_len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); + err = 5; + goto out_dump; + } + } + + /* + * Ensure that the next key is greater or 
equivalent to the + * previous one. + */ + for (i = 0; i < znode->child_cnt - 1; i++) { + const union ubifs_key *key1, *key2; + + key1 = &znode->zbranch[i].key; + key2 = &znode->zbranch[i + 1].key; + + cmp = keys_cmp(c, key1, key2); + if (cmp > 0) { + dbg_err("bad key order (keys %d and %d)", i, i + 1); + err = 6; + goto out_dump; + } else if (cmp == 0 && !is_hash_key(c, key1)) { + /* These can only be keys with colliding hash */ + dbg_err("keys %d and %d are not hashed but equivalent", + i, i + 1); + err = 7; + goto out_dump; + } + } + + kfree(idx); + return 0; + +out_dump: + ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + dbg_dump_node(c, idx); + kfree(idx); + return -EINVAL; +} + +/** + * ubifs_load_znode - load znode to TNC cache. + * @c: UBIFS file-system description object + * @zbr: znode branch + * @parent: znode's parent + * @iip: index in parent + * + * This function loads znode pointed to by @zbr into the TNC cache and + * returns pointer to it in case of success and a negative error code in case + * of failure. + */ +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip) +{ + int err; + struct ubifs_znode *znode; + + ubifs_assert(!zbr->znode); + /* + * A slab cache is not presently used for znodes because the znode size + * depends on the fanout which is stored in the superblock. + */ + znode = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!znode) + return ERR_PTR(-ENOMEM); + + err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + if (err) + goto out; + + atomic_long_inc(&c->clean_zn_cnt); + + /* + * Increment the global clean znode counter as well. It is OK that + * global and per-FS clean znode counters may be inconsistent for some + * short time (because we might be preempted at this point), the global + * one is only used in shrinker. + */ + atomic_long_inc(&ubifs_clean_zn_cnt); + + zbr->znode = znode; + znode->parent = parent; + znode->time = get_seconds(); + znode->iip = iip; + + return znode; + +out: + kfree(znode); + return ERR_PTR(err); +} + +/** + * ubifs_tnc_read_node - read a leaf node from the flash media. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a node defined by @zbr from the flash media. Returns + * zero in case of success or a negative negative error code in case of + * failure. + */ +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + union ubifs_key key1, *key = &zbr->key; + int err, type = key_type(c, key); + struct ubifs_wbuf *wbuf; + + /* + * 'zbr' has to point to on-flash node. The node may sit in a bud and + * may even be in a write buffer, so we have to take care about this. 
+ */ + wbuf = ubifs_get_wbuf(c, zbr->lnum); + if (wbuf) + err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, + zbr->lnum, zbr->offs); + else + err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, + zbr->offs); + + if (err) { + dbg_tnc("key %s", DBGKEY(key)); + return err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(key), DBGKEY1(&key1)); + dbg_dump_node(c, node); + return -EINVAL; + } + + return 0; +} diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h new file mode 100644 index 00000000..e24380cf --- /dev/null +++ b/fs/ubifs/ubifs-media.h @@ -0,0 +1,784 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file describes UBIFS on-flash format and contains definitions of all the + * relevant data structures and constants. + * + * All UBIFS on-flash objects are stored in the form of nodes. All nodes start + * with the UBIFS node magic number and have the same common header. Nodes + * always sit at 8-byte aligned positions on the media and node header sizes are + * also 8-byte aligned (except for the indexing node and the padding node). + */ + +#ifndef __UBIFS_MEDIA_H__ +#define __UBIFS_MEDIA_H__ + +/* UBIFS node magic number (must not have the padding byte first or last) */ +#define UBIFS_NODE_MAGIC 0x06101831 + +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ +#define UBIFS_FORMAT_VERSION 4 + +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. 
+ */ +#define UBIFS_RO_COMPAT_VERSION 0 + +/* Minimum logical eraseblock size in bytes */ +#define UBIFS_MIN_LEB_SZ (15*1024) + +/* Initial CRC32 value used when calculating CRC checksums */ +#define UBIFS_CRC32_INIT 0xFFFFFFFFU + +/* + * UBIFS does not try to compress data if its length is less than the below + * constant. + */ +#define UBIFS_MIN_COMPR_LEN 128 + +/* + * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes + * shorter than uncompressed data length, UBIFS prefers to leave this data + * node uncompress, because it'll be read faster. + */ +#define UBIFS_MIN_COMPRESS_DIFF 64 + +/* Root inode number */ +#define UBIFS_ROOT_INO 1 + +/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ +#define UBIFS_FIRST_INO 64 + +/* + * Maximum file name and extended attribute length (must be a multiple of 8, + * minus 1). + */ +#define UBIFS_MAX_NLEN 255 + +/* Maximum number of data journal heads */ +#define UBIFS_MAX_JHEADS 1 + +/* + * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, + * which means that it does not treat the underlying media as consisting of + * blocks like in case of hard drives. Do not be confused. UBIFS block is just + * the maximum amount of data which one data node can have or which can be + * attached to an inode node. + */ +#define UBIFS_BLOCK_SIZE 4096 +#define UBIFS_BLOCK_SHIFT 12 + +/* UBIFS padding byte pattern (must not be first or last byte of node magic) */ +#define UBIFS_PADDING_BYTE 0xCE + +/* Maximum possible key length */ +#define UBIFS_MAX_KEY_LEN 16 + +/* Key length ("simple" format) */ +#define UBIFS_SK_LEN 8 + +/* Minimum index tree fanout */ +#define UBIFS_MIN_FANOUT 3 + +/* Maximum number of levels in UBIFS indexing B-tree */ +#define UBIFS_MAX_LEVELS 512 + +/* Maximum amount of data attached to an inode in bytes */ +#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE + +/* LEB Properties Tree fanout (must be power of 2) and fanout shift */ +#define UBIFS_LPT_FANOUT 4 +#define UBIFS_LPT_FANOUT_SHIFT 2 + +/* LEB Properties Tree bit field sizes */ +#define UBIFS_LPT_CRC_BITS 16 +#define UBIFS_LPT_CRC_BYTES 2 +#define UBIFS_LPT_TYPE_BITS 4 + +/* The key is always at the same position in all keyed nodes */ +#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) + +/* Garbage collector journal head number */ +#define UBIFS_GC_HEAD 0 +/* Base journal head number */ +#define UBIFS_BASE_HEAD 1 +/* Data journal head number */ +#define UBIFS_DATA_HEAD 2 + +/* + * LEB Properties Tree node types. + * + * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) + * UBIFS_LPT_NNODE: LPT internal node + * UBIFS_LPT_LTAB: LPT's own lprops table + * UBIFS_LPT_LSAVE: LPT's save table (big model only) + * UBIFS_LPT_NODE_CNT: count of LPT node types + * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type + */ +enum { + UBIFS_LPT_PNODE, + UBIFS_LPT_NNODE, + UBIFS_LPT_LTAB, + UBIFS_LPT_LSAVE, + UBIFS_LPT_NODE_CNT, + UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, +}; + +/* + * UBIFS inode types. 
+ * + * UBIFS_ITYPE_REG: regular file + * UBIFS_ITYPE_DIR: directory + * UBIFS_ITYPE_LNK: soft link + * UBIFS_ITYPE_BLK: block device node + * UBIFS_ITYPE_CHR: character device node + * UBIFS_ITYPE_FIFO: fifo + * UBIFS_ITYPE_SOCK: socket + * UBIFS_ITYPES_CNT: count of supported file types + */ +enum { + UBIFS_ITYPE_REG, + UBIFS_ITYPE_DIR, + UBIFS_ITYPE_LNK, + UBIFS_ITYPE_BLK, + UBIFS_ITYPE_CHR, + UBIFS_ITYPE_FIFO, + UBIFS_ITYPE_SOCK, + UBIFS_ITYPES_CNT, +}; + +/* + * Supported key hash functions. + * + * UBIFS_KEY_HASH_R5: R5 hash + * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name + */ +enum { + UBIFS_KEY_HASH_R5, + UBIFS_KEY_HASH_TEST, +}; + +/* + * Supported key formats. + * + * UBIFS_SIMPLE_KEY_FMT: simple key format + */ +enum { + UBIFS_SIMPLE_KEY_FMT, +}; + +/* + * The simple key format uses 29 bits for storing UBIFS block number and hash + * value. + */ +#define UBIFS_S_KEY_BLOCK_BITS 29 +#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF +#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS +#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK + +/* + * Key types. + * + * UBIFS_INO_KEY: inode node key + * UBIFS_DATA_KEY: data node key + * UBIFS_DENT_KEY: directory entry node key + * UBIFS_XENT_KEY: extended attribute entry key + * UBIFS_KEY_TYPES_CNT: number of supported key types + */ +enum { + UBIFS_INO_KEY, + UBIFS_DATA_KEY, + UBIFS_DENT_KEY, + UBIFS_XENT_KEY, + UBIFS_KEY_TYPES_CNT, +}; + +/* Count of LEBs reserved for the superblock area */ +#define UBIFS_SB_LEBS 1 +/* Count of LEBs reserved for the master area */ +#define UBIFS_MST_LEBS 2 + +/* First LEB of the superblock area */ +#define UBIFS_SB_LNUM 0 +/* First LEB of the master area */ +#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) +/* First LEB of the log area */ +#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) + +/* + * The below constants define the absolute minimum values for various UBIFS + * media areas. Many of them actually depend of flash geometry and the FS + * configuration (number of journal heads, orphan LEBs, etc). This means that + * the smallest volume size which can be used for UBIFS cannot be pre-defined + * by these constants. The file-system that meets the below limitation will not + * necessarily mount. UBIFS does run-time calculations and validates the FS + * size. + */ + +/* Minimum number of logical eraseblocks in the log */ +#define UBIFS_MIN_LOG_LEBS 2 +/* Minimum number of bud logical eraseblocks (one for each head) */ +#define UBIFS_MIN_BUD_LEBS 3 +/* Minimum number of journal logical eraseblocks */ +#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) +/* Minimum number of LPT area logical eraseblocks */ +#define UBIFS_MIN_LPT_LEBS 2 +/* Minimum number of orphan area logical eraseblocks */ +#define UBIFS_MIN_ORPH_LEBS 1 +/* + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 + * for GC, 1 for deletions, and at least 1 for committed data). + */ +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) + +/* Minimum number of logical eraseblocks */ +#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ + UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ + UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) + +/* Node sizes (N.B. 
these are guaranteed to be multiples of 8) */ +#define UBIFS_CH_SZ sizeof(struct ubifs_ch) +#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) +#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) +#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) +#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) +#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) +#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) +#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) +#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) +#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) +#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) +#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) +/* Extended attribute entry nodes are identical to directory entry nodes */ +#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ +/* Only this does not have to be multiple of 8 bytes */ +#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) + +/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) +#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) +#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) +#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ + +/* The largest UBIFS node */ +#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ + +/* + * On-flash inode flags. + * + * UBIFS_COMPR_FL: use compression for this inode + * UBIFS_SYNC_FL: I/O on this inode has to be synchronous + * UBIFS_IMMUTABLE_FL: inode is immutable + * UBIFS_APPEND_FL: writes to the inode may only append data + * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous + * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * + * Note, these are on-flash flags which correspond to ioctl flags + * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not + * have to be the same. + */ +enum { + UBIFS_COMPR_FL = 0x01, + UBIFS_SYNC_FL = 0x02, + UBIFS_IMMUTABLE_FL = 0x04, + UBIFS_APPEND_FL = 0x08, + UBIFS_DIRSYNC_FL = 0x10, + UBIFS_XATTR_FL = 0x20, +}; + +/* Inode flag bits used by UBIFS */ +#define UBIFS_FL_MASK 0x0000001F + +/* + * UBIFS compression algorithms. + * + * UBIFS_COMPR_NONE: no compression + * UBIFS_COMPR_LZO: LZO compression + * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_TYPES_CNT: count of supported compression types + */ +enum { + UBIFS_COMPR_NONE, + UBIFS_COMPR_LZO, + UBIFS_COMPR_ZLIB, + UBIFS_COMPR_TYPES_CNT, +}; + +/* + * UBIFS node types. + * + * UBIFS_INO_NODE: inode node + * UBIFS_DATA_NODE: data node + * UBIFS_DENT_NODE: directory entry node + * UBIFS_XENT_NODE: extended attribute node + * UBIFS_TRUN_NODE: truncation node + * UBIFS_PAD_NODE: padding node + * UBIFS_SB_NODE: superblock node + * UBIFS_MST_NODE: master node + * UBIFS_REF_NODE: LEB reference node + * UBIFS_IDX_NODE: index node + * UBIFS_CS_NODE: commit start node + * UBIFS_ORPH_NODE: orphan node + * UBIFS_NODE_TYPES_CNT: count of supported node types + * + * Note, we index arrays by these numbers, so keep them low and contiguous. + * Node type constants for inodes, direntries and so on have to be the same as + * corresponding key type constants. + */ +enum { + UBIFS_INO_NODE, + UBIFS_DATA_NODE, + UBIFS_DENT_NODE, + UBIFS_XENT_NODE, + UBIFS_TRUN_NODE, + UBIFS_PAD_NODE, + UBIFS_SB_NODE, + UBIFS_MST_NODE, + UBIFS_REF_NODE, + UBIFS_IDX_NODE, + UBIFS_CS_NODE, + UBIFS_ORPH_NODE, + UBIFS_NODE_TYPES_CNT, +}; + +/* + * Master node flags. 
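[Illustrative aside, not part of the patch] To make the UBIFS_S_KEY_* constants a little above concrete: in the simple key format a key is 64 bits, the low word holding the inode number and the high word holding the key type in its top bits with the 29-bit block number (or name hash) below it. The exact packing lives in UBIFS's key helpers, which are outside this hunk, so treat the layout in this stand-alone sketch as an assumption made for illustration.

#include <stdio.h>
#include <stdint.h>

#define S_KEY_BLOCK_BITS	29
#define S_KEY_BLOCK_MASK	0x1FFFFFFFu

enum { INO_KEY, DATA_KEY, DENT_KEY, XENT_KEY };

/* Assumed packing: low word = inode number, high word = type | block */
static uint64_t make_data_key(uint32_t inum, uint32_t block)
{
	uint32_t hi = ((uint32_t)DATA_KEY << S_KEY_BLOCK_BITS) |
		      (block & S_KEY_BLOCK_MASK);

	return ((uint64_t)hi << 32) | inum;
}

int main(void)
{
	uint64_t key = make_data_key(65, 3);	/* inode 65, block 3 */

	printf("key   = 0x%016llx\n", (unsigned long long)key);
	printf("inum  = %u\n", (uint32_t)key);
	printf("type  = %u\n", (uint32_t)(key >> 32) >> S_KEY_BLOCK_BITS);
	printf("block = %u\n", (uint32_t)(key >> 32) & S_KEY_BLOCK_MASK);
	return 0;
}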
+ * + * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty + * UBIFS_MST_NO_ORPHS: no orphan inodes present + * UBIFS_MST_RCVRY: written by recovery + */ +enum { + UBIFS_MST_DIRTY = 1, + UBIFS_MST_NO_ORPHS = 2, + UBIFS_MST_RCVRY = 4, +}; + +/* + * Node group type (used by recovery to recover whole group or none). + * + * UBIFS_NO_NODE_GROUP: this node is not part of a group + * UBIFS_IN_NODE_GROUP: this node is a part of a group + * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group + */ +enum { + UBIFS_NO_NODE_GROUP = 0, + UBIFS_IN_NODE_GROUP, + UBIFS_LAST_OF_NODE_GROUP, +}; + +/* + * Superblock flags. + * + * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed + */ +enum { + UBIFS_FLG_BIGLPT = 0x02, + UBIFS_FLG_SPACE_FIXUP = 0x04, +}; + +/** + * struct ubifs_ch - common header node. + * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) + * @crc: CRC-32 checksum of the node header + * @sqnum: sequence number + * @len: full node length + * @node_type: node type + * @group_type: node group type + * @padding: reserved for future, zeroes + * + * Every UBIFS node starts with this common part. If the node has a key, the + * key always goes next. + */ +struct ubifs_ch { + __le32 magic; + __le32 crc; + __le64 sqnum; + __le32 len; + __u8 node_type; + __u8 group_type; + __u8 padding[2]; +} __packed; + +/** + * union ubifs_dev_desc - device node descriptor. + * @new: new type device descriptor + * @huge: huge type device descriptor + * + * This data structure describes major/minor numbers of a device node. In an + * inode is a device node then its data contains an object of this type. UBIFS + * uses standard Linux "new" and "huge" device node encodings. + */ +union ubifs_dev_desc { + __le32 new; + __le64 huge; +} __packed; + +/** + * struct ubifs_ino_node - inode node. + * @ch: common header + * @key: node key + * @creat_sqnum: sequence number at time of creation + * @size: inode size in bytes (amount of uncompressed data) + * @atime_sec: access time seconds + * @ctime_sec: creation time seconds + * @mtime_sec: modification time seconds + * @atime_nsec: access time nanoseconds + * @ctime_nsec: creation time nanoseconds + * @mtime_nsec: modification time nanoseconds + * @nlink: number of hard links + * @uid: owner ID + * @gid: group ID + * @mode: access flags + * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) + * @data_len: inode data length + * @xattr_cnt: count of extended attributes this inode has + * @xattr_size: summarized size of all extended attributes in bytes + * @padding1: reserved for future, zeroes + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @compr_type: compression type used for this inode + * @padding2: reserved for future, zeroes + * @data: data attached to the inode + * + * Note, even though inode compression type is defined by @compr_type, some + * nodes of this inode may be compressed with different compressor - this + * happens if compression type is changed while the inode already has data + * nodes. But @compr_type will be use for further writes to the inode. + * + * Note, do not forget to amend 'zero_ino_node_unused()' function when changing + * the padding fields. 
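[Illustrative aside, not part of the patch] A quick user-space mirror of struct ubifs_ch shows the fixed 24-byte layout every node starts with, and why UBIFS_KEY_OFFSET resolves to 24 for the keyed nodes defined below. The comment that the CRC protects the node from byte 8 onward (everything after the magic and crc words) reflects how UBIFS prepares nodes elsewhere in the tree and is stated here as an assumption.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* User-space mirror of struct ubifs_ch; packed, little-endian on flash */
struct ch {
	uint32_t magic;		/* 0x06101831                            */
	uint32_t crc;		/* assumed to cover bytes [8, len)       */
	uint64_t sqnum;
	uint32_t len;
	uint8_t  node_type;
	uint8_t  group_type;
	uint8_t  padding[2];
} __attribute__((packed));

int main(void)
{
	printf("common header size = %zu\n", sizeof(struct ch));	/* 24 */
	printf("sqnum at %zu, len at %zu, node_type at %zu\n",
	       offsetof(struct ch, sqnum),	/*  8 */
	       offsetof(struct ch, len),	/* 16 */
	       offsetof(struct ch, node_type));	/* 20 */
	return 0;
}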
+ */ +struct ubifs_ino_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 creat_sqnum; + __le64 size; + __le64 atime_sec; + __le64 ctime_sec; + __le64 mtime_sec; + __le32 atime_nsec; + __le32 ctime_nsec; + __le32 mtime_nsec; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 flags; + __le32 data_len; + __le32 xattr_cnt; + __le32 xattr_size; + __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ + __le32 xattr_names; + __le16 compr_type; + __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ + __u8 data[]; +} __packed; + +/** + * struct ubifs_dent_node - directory entry node. + * @ch: common header + * @key: node key + * @inum: target inode number + * @padding1: reserved for future, zeroes + * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) + * @nlen: name length + * @padding2: reserved for future, zeroes + * @name: zero-terminated name + * + * Note, do not forget to amend 'zero_dent_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_dent_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 inum; + __u8 padding1; + __u8 type; + __le16 nlen; + __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __u8 name[]; +} __packed; + +/** + * struct ubifs_data_node - data node. + * @ch: common header + * @key: node key + * @size: uncompressed data size in bytes + * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) + * @padding: reserved for future, zeroes + * @data: data + * + * Note, do not forget to amend 'zero_data_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_data_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le32 size; + __le16 compr_type; + __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __u8 data[]; +} __packed; + +/** + * struct ubifs_trun_node - truncation node. + * @ch: common header + * @inum: truncated inode number + * @padding: reserved for future, zeroes + * @old_size: size before truncation + * @new_size: size after truncation + * + * This node exists only in the journal and never goes to the main area. Note, + * do not forget to amend 'zero_trun_node_unused()' function when changing the + * padding fields. + */ +struct ubifs_trun_node { + struct ubifs_ch ch; + __le32 inum; + __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ + __le64 old_size; + __le64 new_size; +} __packed; + +/** + * struct ubifs_pad_node - padding node. + * @ch: common header + * @pad_len: how many bytes after this node are unused (because padded) + * @padding: reserved for future, zeroes + */ +struct ubifs_pad_node { + struct ubifs_ch ch; + __le32 pad_len; +} __packed; + +/** + * struct ubifs_sb_node - superblock node. + * @ch: common header + * @padding: reserved for future, zeroes + * @key_hash: type of hash function used in keys + * @key_fmt: format of the key + * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) + * @min_io_size: minimal input/output unit size + * @leb_size: logical eraseblock size in bytes + * @leb_cnt: count of LEBs used by file-system + * @max_leb_cnt: maximum count of LEBs used by file-system + * @max_bud_bytes: maximum amount of data stored in buds + * @log_lebs: log size in logical eraseblocks + * @lpt_lebs: number of LEBs used for lprops table + * @orph_lebs: number of LEBs used for recording orphans + * @jhead_cnt: count of journal heads + * @fanout: tree fanout (max. 
number of links per indexing node) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @fmt_version: UBIFS on-flash format version + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @padding1: reserved for future, zeroes + * @rp_uid: reserve pool UID + * @rp_gid: reserve pool GID + * @rp_size: size of the reserved pool in bytes + * @padding2: reserved for future, zeroes + * @time_gran: time granularity in nanoseconds + * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version + */ +struct ubifs_sb_node { + struct ubifs_ch ch; + __u8 padding[2]; + __u8 key_hash; + __u8 key_fmt; + __le32 flags; + __le32 min_io_size; + __le32 leb_size; + __le32 leb_cnt; + __le32 max_leb_cnt; + __le64 max_bud_bytes; + __le32 log_lebs; + __le32 lpt_lebs; + __le32 orph_lebs; + __le32 jhead_cnt; + __le32 fanout; + __le32 lsave_cnt; + __le32 fmt_version; + __le16 default_compr; + __u8 padding1[2]; + __le32 rp_uid; + __le32 rp_gid; + __le64 rp_size; + __le32 time_gran; + __u8 uuid[16]; + __le32 ro_compat_version; + __u8 padding2[3968]; +} __packed; + +/** + * struct ubifs_mst_node - master node. + * @ch: common header + * @highest_inum: highest inode number in the committed index + * @cmt_no: commit number + * @flags: various flags (%UBIFS_MST_DIRTY, etc) + * @log_lnum: start of the log + * @root_lnum: LEB number of the root indexing node + * @root_offs: offset within @root_lnum + * @root_len: root indexing node length + * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was + * not reserved and should be reserved on mount) + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @index_size: size of index on flash + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * @lpt_lnum: LEB number of LPT root nnode + * @lpt_offs: offset of LPT root nnode + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @lsave_lnum: LEB number of LPT's save table (big model only) + * @lsave_offs: offset of LPT's save table (big model only) + * @lscan_lnum: LEB number of last LPT scan + * @empty_lebs: number of empty logical eraseblocks + * @idx_lebs: number of indexing logical eraseblocks + * @leb_cnt: count of LEBs used by file-system + * @padding: reserved for future, zeroes + */ +struct ubifs_mst_node { + struct ubifs_ch ch; + __le64 highest_inum; + __le64 cmt_no; + __le32 flags; + __le32 log_lnum; + __le32 root_lnum; + __le32 root_offs; + __le32 root_len; + __le32 gc_lnum; + __le32 ihead_lnum; + __le32 ihead_offs; + __le64 index_size; + __le64 total_free; + __le64 total_dirty; + __le64 total_used; + __le64 total_dead; + __le64 total_dark; + __le32 lpt_lnum; + __le32 lpt_offs; + __le32 nhead_lnum; + __le32 nhead_offs; + __le32 ltab_lnum; + __le32 ltab_offs; + __le32 lsave_lnum; + __le32 lsave_offs; + __le32 lscan_lnum; + __le32 empty_lebs; + __le32 idx_lebs; + __le32 leb_cnt; + __u8 padding[344]; +} __packed; + +/** + * struct ubifs_ref_node - logical eraseblock reference node. 
+ * @ch: common header + * @lnum: the referred logical eraseblock number + * @offs: start offset in the referred LEB + * @jhead: journal head number + * @padding: reserved for future, zeroes + */ +struct ubifs_ref_node { + struct ubifs_ch ch; + __le32 lnum; + __le32 offs; + __le32 jhead; + __u8 padding[28]; +} __packed; + +/** + * struct ubifs_branch - key/reference/length branch + * @lnum: LEB number of the target node + * @offs: offset within @lnum + * @len: target node length + * @key: key + */ +struct ubifs_branch { + __le32 lnum; + __le32 offs; + __le32 len; + __u8 key[]; +} __packed; + +/** + * struct ubifs_idx_node - indexing node. + * @ch: common header + * @child_cnt: number of child index nodes + * @level: tree level + * @branches: LEB number / offset / length / key branches + */ +struct ubifs_idx_node { + struct ubifs_ch ch; + __le16 child_cnt; + __le16 level; + __u8 branches[]; +} __packed; + +/** + * struct ubifs_cs_node - commit start node. + * @ch: common header + * @cmt_no: commit number + */ +struct ubifs_cs_node { + struct ubifs_ch ch; + __le64 cmt_no; +} __packed; + +/** + * struct ubifs_orph_node - orphan node. + * @ch: common header + * @cmt_no: commit number (also top bit is set on the last node of the commit) + * @inos: inode numbers of orphans + */ +struct ubifs_orph_node { + struct ubifs_ch ch; + __le64 cmt_no; + __le64 inos[]; +} __packed; + +#endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h new file mode 100644 index 00000000..f79983d6 --- /dev/null +++ b/fs/ubifs/ubifs.h @@ -0,0 +1,1777 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_H__ +#define __UBIFS_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs-media.h" + +/* Version of this UBIFS implementation */ +#define UBIFS_VERSION 1 + +/* Normal UBIFS messages */ +#define ubifs_msg(fmt, ...) \ + printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +/* UBIFS error messages */ +#define ubifs_err(fmt, ...) \ + printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) +/* UBIFS warning messages */ +#define ubifs_warn(fmt, ...) 
\ + printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +/* UBIFS file system VFS magic number */ +#define UBIFS_SUPER_MAGIC 0x24051905 + +/* Number of UBIFS blocks per VFS page */ +#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) + +/* "File system end of life" sequence number watermark */ +#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL +#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL + +/* + * Minimum amount of LEBs reserved for the index. At present the index needs at + * least 2 LEBs: one for the index head and one for in-the-gaps method (which + * currently does not cater for the index head and so excludes it from + * consideration). + */ +#define MIN_INDEX_LEBS 2 + +/* Minimum amount of data UBIFS writes to the flash */ +#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) + +/* + * Currently we do not support inode number overlapping and re-using, so this + * watermark defines dangerous inode number level. This should be fixed later, + * although it is difficult to exceed current limit. Another option is to use + * 64-bit inode numbers, but this means more overhead. + */ +#define INUM_WARN_WATERMARK 0xFFF00000 +#define INUM_WATERMARK 0xFFFFFF00 + +/* Largest key size supported in this implementation */ +#define CUR_MAX_KEY_LEN UBIFS_SK_LEN + +/* Maximum number of entries in each LPT (LEB category) heap */ +#define LPT_HEAP_SZ 256 + +/* + * Background thread name pattern. The numbers are UBI device and volume + * numbers. + */ +#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" + +/* Write-buffer synchronization timeout interval in seconds */ +#define WBUF_TIMEOUT_SOFTLIMIT 3 +#define WBUF_TIMEOUT_HARDLIMIT 5 + +/* Maximum possible inode number (only 32-bit inodes are supported now) */ +#define MAX_INUM 0xFFFFFFFF + +/* Number of non-data journal heads */ +#define NONDATA_JHEADS_CNT 2 + +/* Shorter names for journal head numbers for internal usage */ +#define GCHD UBIFS_GC_HEAD +#define BASEHD UBIFS_BASE_HEAD +#define DATAHD UBIFS_DATA_HEAD + +/* 'No change' value for 'ubifs_change_lp()' */ +#define LPROPS_NC 0x80000001 + +/* + * There is no notion of truncation key because truncation nodes do not exist + * in TNC. However, when replaying, it is handy to introduce fake "truncation" + * keys for truncation nodes because the code becomes simpler. So we define + * %UBIFS_TRUN_KEY type. + * + * But otherwise, out of the journal reply scope, the truncation keys are + * invalid. + */ +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT +#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT + +/* + * How much a directory entry/extended attribute entry adds to the parent/host + * inode. + */ +#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) + +/* How much an extended attribute adds to the host inode */ +#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) + +/* + * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered + * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are + * considered "young". This is used by shrinker when selecting znode to trim + * off. + */ +#define OLD_ZNODE_AGE 20 +#define YOUNG_ZNODE_AGE 5 + +/* + * Some compressors, like LZO, may end up with more data then the input buffer. + * So UBIFS always allocates larger output buffer, to be sure the compressor + * will not corrupt memory in case of worst case compression. 
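[Illustrative aside, not part of the patch] As a concrete instance of the accounting macros above: struct ubifs_dent_node works out to 56 bytes (24-byte common header, 16-byte key, 8-byte inum, then type, nlen and padding), so a directory entry named "hello" charges CALC_DENT_SIZE(5) = ALIGN(56 + 5 + 1, 8) = 64 bytes to the parent directory. The 56-byte figure is derived from the structure layout in ubifs-media.h and restated here as an assumption; the sketch only replays the macro arithmetic.

#include <stdio.h>
#include <string.h>

#define ALIGN8(x)	(((x) + 7) & ~7)
#define DENT_NODE_SZ	56	/* assumed sizeof(struct ubifs_dent_node) */

/* Space one directory entry charges to its parent inode */
#define CALC_DENT_SIZE(name_len)  ALIGN8(DENT_NODE_SZ + (name_len) + 1)

int main(void)
{
	const char *name = "hello";

	printf("dent \"%s\" costs %d bytes\n",
	       name, CALC_DENT_SIZE((int)strlen(name)));	/* 64 */
	return 0;
}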
+ */ +#define WORST_COMPR_FACTOR 2 + +/* + * How much memory is needed for a buffer where we comress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + +/* Maximum expected tree height for use by bottom_up_buf */ +#define BOTTOM_UP_HEIGHT 64 + +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + +/* + * Lockdep classes for UBIFS inode @ui_mutex. + */ +enum { + WB_MUTEX_1 = 0, + WB_MUTEX_2 = 1, + WB_MUTEX_3 = 2, +}; + +/* + * Znode flags (actually, bit numbers which store the flags). + * + * DIRTY_ZNODE: znode is dirty + * COW_ZNODE: znode is being committed and a new instance of this znode has to + * be created before changing this znode + * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is + * still in the commit list and the ongoing commit operation + * will commit it, and delete this znode after it is done + */ +enum { + DIRTY_ZNODE = 0, + COW_ZNODE = 1, + OBSOLETE_ZNODE = 2, +}; + +/* + * Commit states. + * + * COMMIT_RESTING: commit is not wanted + * COMMIT_BACKGROUND: background commit has been requested + * COMMIT_REQUIRED: commit is required + * COMMIT_RUNNING_BACKGROUND: background commit is running + * COMMIT_RUNNING_REQUIRED: commit is running and it is required + * COMMIT_BROKEN: commit failed + */ +enum { + COMMIT_RESTING = 0, + COMMIT_BACKGROUND, + COMMIT_REQUIRED, + COMMIT_RUNNING_BACKGROUND, + COMMIT_RUNNING_REQUIRED, + COMMIT_BROKEN, +}; + +/* + * 'ubifs_scan_a_node()' return values. + * + * SCANNED_GARBAGE: scanned garbage + * SCANNED_EMPTY_SPACE: scanned empty space + * SCANNED_A_NODE: scanned a valid node + * SCANNED_A_CORRUPT_NODE: scanned a corrupted node + * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length + * + * Greater than zero means: 'scanned that number of padding bytes' + */ +enum { + SCANNED_GARBAGE = 0, + SCANNED_EMPTY_SPACE = -1, + SCANNED_A_NODE = -2, + SCANNED_A_CORRUPT_NODE = -3, + SCANNED_A_BAD_PAD_NODE = -4, +}; + +/* + * LPT cnode flag bits. + * + * DIRTY_CNODE: cnode is dirty + * COW_CNODE: cnode is being committed and must be copied before writing + * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), + * so it can (and must) be freed when the commit is finished + */ +enum { + DIRTY_CNODE = 0, + COW_CNODE = 1, + OBSOLETE_CNODE = 2, +}; + +/* + * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. + * + * LTAB_DIRTY: ltab node is dirty + * LSAVE_DIRTY: lsave node is dirty + */ +enum { + LTAB_DIRTY = 1, + LSAVE_DIRTY = 2, +}; + +/* + * Return codes used by the garbage collector. + * @LEB_FREED: the logical eraseblock was freed and is ready to use + * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit + * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes + */ +enum { + LEB_FREED, + LEB_FREED_IDX, + LEB_RETAINED, +}; + +/** + * struct ubifs_old_idx - index node obsoleted since last commit start. + * @rb: rb-tree node + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + */ +struct ubifs_old_idx { + struct rb_node rb; + int lnum; + int offs; +}; + +/* The below union makes it easier to deal with keys */ +union ubifs_key { + uint8_t u8[CUR_MAX_KEY_LEN]; + uint32_t u32[CUR_MAX_KEY_LEN/4]; + uint64_t u64[CUR_MAX_KEY_LEN/8]; + __le32 j32[CUR_MAX_KEY_LEN/4]; +}; + +/** + * struct ubifs_scan_node - UBIFS scanned node information. 
+ * @list: list of scanned nodes + * @key: key of node scanned (if it has one) + * @sqnum: sequence number + * @type: type of node scanned + * @offs: offset with LEB of node scanned + * @len: length of node scanned + * @node: raw node + */ +struct ubifs_scan_node { + struct list_head list; + union ubifs_key key; + unsigned long long sqnum; + int type; + int offs; + int len; + void *node; +}; + +/** + * struct ubifs_scan_leb - UBIFS scanned LEB information. + * @lnum: logical eraseblock number + * @nodes_cnt: number of nodes scanned + * @nodes: list of struct ubifs_scan_node + * @endpt: end point (and therefore the start of empty space) + * @ecc: read returned -EBADMSG + * @buf: buffer containing entire LEB scanned + */ +struct ubifs_scan_leb { + int lnum; + int nodes_cnt; + struct list_head nodes; + int endpt; + int ecc; + void *buf; +}; + +/** + * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. + * @list: list + * @lnum: LEB number + * @unmap: OK to unmap this LEB + * + * This data structure is used to temporary store garbage-collected indexing + * LEBs - they are not released immediately, but only after the next commit. + * This is needed to guarantee recoverability. + */ +struct ubifs_gced_idx_leb { + struct list_head list; + int lnum; + int unmap; +}; + +/** + * struct ubifs_inode - UBIFS in-memory inode description. + * @vfs_inode: VFS inode description object + * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; + * @xattr_size: summarized size of all extended attributes in bytes + * @xattr_cnt: count of extended attributes this inode has + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @dirty: non-zero if the inode is dirty + * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used + * @ui_mutex: serializes inode write-back with the rest of VFS operations, + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size + * @ui_lock: protects @synced_i_size + * @synced_i_size: synchronized size of inode, i.e. the value of inode size + * currently stored on the flash; used only for regular file + * inodes + * @ui_size: inode size used by UBIFS when writing to flash + * @flags: inode flags (@UBIFS_COMPR_FL, etc) + * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) + * @data_len: length of the data attached to the inode + * @data: inode's data + * + * @ui_mutex exists for two main reasons. At first it prevents inodes from + * being written back while UBIFS changing them, being in the middle of an VFS + * operation. This way UBIFS makes sure the inode fields are consistent. For + * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * write-back must not write any of them before we have finished. + * + * The second reason is budgeting - UBIFS has to budget all operations. If an + * operation is going to mark an inode dirty, it has to allocate budget for + * this. It cannot just mark it dirty because there is no guarantee there will + * be enough flash space to write the inode back later. This means UBIFS has + * to have full control over inode "clean <-> dirty" transitions (and pages + * actually). 
But unfortunately, VFS marks inodes dirty in many places, and it + * does not ask the file-system if it is allowed to do so (there is a notifier, + * but it is not enough), i.e., there is no mechanism to synchronize with this. + * So UBIFS has its own inode dirty flag and its own mutex to serialize + * "clean <-> dirty" transitions. + * + * The @synced_i_size field is used to make sure we never write pages which are + * beyond last synchronized inode size. See 'ubifs_writepage()' for more + * information. + * + * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses + * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot + * make sure @inode->i_size is always changed under @ui_mutex, because it + * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would + * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields + * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * could consider to rework locking and base it on "shadow" fields. + */ +struct ubifs_inode { + struct inode vfs_inode; + unsigned long long creat_sqnum; + unsigned long long del_cmtno; + unsigned int xattr_size; + unsigned int xattr_cnt; + unsigned int xattr_names; + unsigned int dirty:1; + unsigned int xattr:1; + unsigned int bulk_read:1; + unsigned int compr_type:2; + struct mutex ui_mutex; + spinlock_t ui_lock; + loff_t synced_i_size; + loff_t ui_size; + int flags; + pgoff_t last_page_read; + pgoff_t read_in_a_row; + int data_len; + void *data; +}; + +/** + * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. + * @list: list + * @lnum: LEB number of recovered LEB + * @endpt: offset where recovery ended + * + * This structure records a LEB identified during recovery that needs to be + * cleaned but was not because UBIFS was mounted read-only. The information + * is used to clean the LEB when remounting to read-write mode. + */ +struct ubifs_unclean_leb { + struct list_head list; + int lnum; + int endpt; +}; + +/* + * LEB properties flags. + * + * LPROPS_UNCAT: not categorized + * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index + * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index + * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index + * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs + * LPROPS_EMPTY: LEB is empty, not taken + * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken + * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken + * LPROPS_CAT_MASK: mask for the LEB categories above + * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) + * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) + */ +enum { + LPROPS_UNCAT = 0, + LPROPS_DIRTY = 1, + LPROPS_DIRTY_IDX = 2, + LPROPS_FREE = 3, + LPROPS_HEAP_CNT = 3, + LPROPS_EMPTY = 4, + LPROPS_FREEABLE = 5, + LPROPS_FRDI_IDX = 6, + LPROPS_CAT_MASK = 15, + LPROPS_TAKEN = 16, + LPROPS_INDEX = 32, +}; + +/** + * struct ubifs_lprops - logical eraseblock properties. 
+ * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @flags: LEB properties flags (see above) + * @lnum: LEB number + * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) + * @hpos: heap position in heap of same-category lprops (other categories) + */ +struct ubifs_lprops { + int free; + int dirty; + int flags; + int lnum; + union { + struct list_head list; + int hpos; + }; +}; + +/** + * struct ubifs_lpt_lprops - LPT logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @tgc: trivial GC flag (1 => unmap after commit end) + * @cmt: commit flag (1 => reserved for commit) + */ +struct ubifs_lpt_lprops { + int free; + int dirty; + unsigned tgc:1; + unsigned cmt:1; +}; + +/** + * struct ubifs_lp_stats - statistics of eraseblocks in the main area. + * @empty_lebs: number of empty LEBs + * @taken_empty_lebs: number of taken LEBs + * @idx_lebs: number of indexing LEBs + * @total_free: total free space in bytes (includes all LEBs) + * @total_dirty: total dirty space in bytes (includes all LEBs) + * @total_used: total used space in bytes (does not include index LEBs) + * @total_dead: total dead space in bytes (does not include index LEBs) + * @total_dark: total dark space in bytes (does not include index LEBs) + * + * The @taken_empty_lebs field counts the LEBs that are in the transient state + * of having been "taken" for use but not yet written to. @taken_empty_lebs is + * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be + * used by itself (in which case 'unused_lebs' would be a better name). In the + * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained + * by GC, but unlike other empty LEBs that are "taken", it may not be written + * straight away (i.e. before the next commit start or unmount), so either + * @gc_lnum must be specially accounted for, or the current approach followed + * i.e. count it under @taken_empty_lebs. + * + * @empty_lebs includes @taken_empty_lebs. + * + * @total_used, @total_dead and @total_dark fields do not account indexing + * LEBs. + */ +struct ubifs_lp_stats { + int empty_lebs; + int taken_empty_lebs; + int idx_lebs; + long long total_free; + long long total_dirty; + long long total_used; + long long total_dead; + long long total_dark; +}; + +struct ubifs_nnode; + +/** + * struct ubifs_cnode - LEB Properties Tree common node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (zero for pnodes, greater than zero for nnodes) + * @num: node number + */ +struct ubifs_cnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; +}; + +/** + * struct ubifs_pnode - LEB Properties Tree leaf node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always zero for pnodes) + * @num: node number + * @lprops: LEB properties array + */ +struct ubifs_pnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_nbranch - LEB Properties Tree internal node branch. 
+ * @lnum: LEB number of child + * @offs: offset of child + * @nnode: nnode child + * @pnode: pnode child + * @cnode: cnode child + */ +struct ubifs_nbranch { + int lnum; + int offs; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + }; +}; + +/** + * struct ubifs_nnode - LEB Properties Tree internal node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always greater than zero for nnodes) + * @num: node number + * @nbranch: branches to child nodes + */ +struct ubifs_nnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_lpt_heap - heap of categorized lprops. + * @arr: heap array + * @cnt: number in heap + * @max_cnt: maximum number allowed in heap + * + * There are %LPROPS_HEAP_CNT heaps. + */ +struct ubifs_lpt_heap { + struct ubifs_lprops **arr; + int cnt; + int max_cnt; +}; + +/* + * Return codes for LPT scan callback function. + * + * LPT_SCAN_CONTINUE: continue scanning + * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory + * LPT_SCAN_STOP: stop scanning + */ +enum { + LPT_SCAN_CONTINUE = 0, + LPT_SCAN_ADD = 1, + LPT_SCAN_STOP = 2, +}; + +struct ubifs_info; + +/* Callback used by the 'ubifs_lpt_scan_nolock()' function */ +typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, + const struct ubifs_lprops *lprops, + int in_tree, void *data); + +/** + * struct ubifs_wbuf - UBIFS write-buffer. + * @c: UBIFS file-system description object + * @buf: write-buffer (of min. flash I/O unit size) + * @lnum: logical eraseblock number the write-buffer points to + * @offs: write-buffer offset in this logical eraseblock + * @avail: number of bytes available in the write-buffer + * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) + * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, + * %UBI_UNKNOWN) + * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep + * up by 'mutex_lock_nested()). + * @sync_callback: write-buffer synchronization callback + * @io_mutex: serializes write-buffer I/O + * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes + * fields + * @softlimit: soft write-buffer timeout interval + * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit + * and @softlimit + @delta) + * @timer: write-buffer timer + * @no_timer: non-zero if this write-buffer does not have a timer + * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing + * @next_ino: points to the next position of the following inode number + * @inodes: stores the inode numbers of the nodes which are in wbuf + * + * The write-buffer synchronization callback is called when the write-buffer is + * synchronized in order to notify how much space was wasted due to + * write-buffer padding and how much free space is left in the LEB. + * + * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under + * spin-lock or mutex because they are written under both mutex and spin-lock. + * @buf is appended to under mutex but overwritten under both mutex and + * spin-lock. Thus the data between @buf and @buf + @used can be read under + * spinlock. 
+ */ +struct ubifs_wbuf { + struct ubifs_info *c; + void *buf; + int lnum; + int offs; + int avail; + int used; + int size; + int dtype; + int jhead; + int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); + struct mutex io_mutex; + spinlock_t lock; + ktime_t softlimit; + unsigned long long delta; + struct hrtimer timer; + unsigned int no_timer:1; + unsigned int need_sync:1; + int next_ino; + ino_t *inodes; +}; + +/** + * struct ubifs_bud - bud logical eraseblock. + * @lnum: logical eraseblock number + * @start: where the (uncommitted) bud data starts + * @jhead: journal head number this bud belongs to + * @list: link in the list buds belonging to the same journal head + * @rb: link in the tree of all buds + */ +struct ubifs_bud { + int lnum; + int start; + int jhead; + struct list_head list; + struct rb_node rb; +}; + +/** + * struct ubifs_jhead - journal head. + * @wbuf: head's write-buffer + * @buds_list: list of bud LEBs belonging to this journal head + * @grouped: non-zero if UBIFS groups nodes when writing to this journal head + * + * Note, the @buds list is protected by the @c->buds_lock. + */ +struct ubifs_jhead { + struct ubifs_wbuf wbuf; + struct list_head buds_list; + unsigned int grouped:1; +}; + +/** + * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. + * @key: key + * @znode: znode address in memory + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum + * @len: target node length + */ +struct ubifs_zbranch { + union ubifs_key key; + union { + struct ubifs_znode *znode; + void *leaf; + }; + int lnum; + int offs; + int len; +}; + +/** + * struct ubifs_znode - in-memory representation of an indexing node. + * @parent: parent znode or NULL if it is the root + * @cnext: next znode to commit + * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) + * @time: last access time (seconds) + * @level: level of the entry in the TNC tree + * @child_cnt: count of child znodes + * @iip: index in parent's zbranch array + * @alt: lower bound of key range has altered i.e. child inserted at slot 0 + * @lnum: LEB number of the corresponding indexing node + * @offs: offset of the corresponding indexing node + * @len: length of the corresponding indexing node + * @zbranch: array of znode branches (@c->fanout elements) + */ +struct ubifs_znode { + struct ubifs_znode *parent; + struct ubifs_znode *cnext; + unsigned long flags; + unsigned long time; + int level; + int child_cnt; + int iip; + int alt; +#ifdef CONFIG_UBIFS_FS_DEBUG + int lnum, offs, len; +#endif + struct ubifs_zbranch zbranch[]; +}; + +/** + * struct bu_info - bulk-read information. + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @oef: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** + * struct ubifs_node_range - node length range description data structure. + * @len: fixed node length + * @min_len: minimum possible node length + * @max_len: maximum possible node length + * + * If @max_len is %0, the node has fixed length @len. 
+ */ +struct ubifs_node_range { + union { + int len; + int min_len; + }; + int max_len; +}; + +/** + * struct ubifs_compressor - UBIFS compressor description structure. + * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) + * @cc: cryptoapi compressor handle + * @comp_mutex: mutex used during compression + * @decomp_mutex: mutex used during decompression + * @name: compressor name + * @capi_name: cryptoapi compressor name + */ +struct ubifs_compressor { + int compr_type; + struct crypto_comp *cc; + struct mutex *comp_mutex; + struct mutex *decomp_mutex; + const char *name; + const char *capi_name; +}; + +/** + * struct ubifs_budget_req - budget requirements of an operation. + * + * @fast: non-zero if the budgeting should try to acquire budget quickly and + * should not try to call write-back + * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields + * have to be re-calculated + * @new_page: non-zero if the operation adds a new page + * @dirtied_page: non-zero if the operation makes a page dirty + * @new_dent: non-zero if the operation adds a new directory entry + * @mod_dent: non-zero if the operation removes or modifies an existing + * directory entry + * @new_ino: non-zero if the operation adds a new inode + * @new_ino_d: now much data newly created inode contains + * @dirtied_ino: how many inodes the operation makes dirty + * @dirtied_ino_d: now much data dirtied inode contains + * @idx_growth: how much the index will supposedly grow + * @data_growth: how much new data the operation will supposedly add + * @dd_growth: how much data that makes other data dirty the operation will + * supposedly add + * + * @idx_growth, @data_growth and @dd_growth are not used in budget request. The + * budgeting subsystem caches index and data growth values there to avoid + * re-calculating them when the budget is released. However, if @idx_growth is + * %-1, it is calculated by the release function using other fields. + * + * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d + * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made + * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. + */ +struct ubifs_budget_req { + unsigned int fast:1; + unsigned int recalculate:1; +#ifndef UBIFS_DEBUG + unsigned int new_page:1; + unsigned int dirtied_page:1; + unsigned int new_dent:1; + unsigned int mod_dent:1; + unsigned int new_ino:1; + unsigned int new_ino_d:13; + unsigned int dirtied_ino:4; + unsigned int dirtied_ino_d:15; +#else + /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; + unsigned int dirtied_ino; + unsigned int dirtied_ino_d; +#endif + int idx_growth; + int data_growth; + int dd_growth; +}; + +/** + * struct ubifs_orphan - stores the inode number of an orphan. 
+ * @rb: rb-tree node of rb-tree of orphans sorted by inode number + * @list: list head of list of orphans in order added + * @new_list: list head of list of orphans added since the last commit + * @cnext: next orphan to commit + * @dnext: next orphan to delete + * @inum: inode number + * @new: %1 => added since the last commit, otherwise %0 + */ +struct ubifs_orphan { + struct rb_node rb; + struct list_head list; + struct list_head new_list; + struct ubifs_orphan *cnext; + struct ubifs_orphan *dnext; + ino_t inum; + int new; +}; + +/** + * struct ubifs_mount_opts - UBIFS-specific mount options information. + * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable) + * @chk_data_crc: enable/disable CRC data checking when reading data nodes + * (%0 default, %1 disabe, %2 enable) + * @override_compr: override default compressor (%0 - do not override and use + * superblock compressor, %1 - override and use compressor + * specified in @compr_type) + * @compr_type: compressor type to override the superblock compressor with + * (%UBIFS_COMPR_NONE, etc) + */ +struct ubifs_mount_opts { + unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; + unsigned int override_compr:1; + unsigned int compr_type:2; +}; + +/** + * struct ubifs_budg_info - UBIFS budgeting information. + * @idx_growth: amount of bytes budgeted for index growth + * @data_growth: amount of bytes budgeted for cached data + * @dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but + * which still have to be taken into account because the index + * has not been committed so far + * @old_idx_sz: size of index on flash + * @min_idx_lebs: minimum number of LEBs required for the index + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * @page_budget: budget for a page (constant, nenver changed after mount) + * @inode_budget: budget for an inode (constant, nenver changed after mount) + * @dent_budget: budget for a directory entry (constant, nenver changed after + * mount) + */ +struct ubifs_budg_info { + long long idx_growth; + long long data_growth; + long long dd_growth; + long long uncommitted_idx; + unsigned long long old_idx_sz; + int min_idx_lebs; + unsigned int nospace:1; + unsigned int nospace_rp:1; + int page_budget; + int inode_budget; + int dent_budget; +}; + +struct ubifs_debug_info; + +/** + * struct ubifs_info - UBIFS file-system description data structure + * (per-superblock). 
+ * @vfs_sb: VFS @struct super_block object + * @bdi: backing device info object to make VFS happy and disable read-ahead + * + * @highest_inum: highest used inode number + * @max_sqnum: current global sequence number + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters + * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version + * @uuid: UUID from super block + * + * @lhead_lnum: log head logical eraseblock number + * @lhead_offs: log head offset + * @ltail_lnum: log tail logical eraseblock number (offset is always 0) + * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and + * @bud_bytes + * @min_log_bytes: minimum required number of bytes in the log + * @cmt_bud_bytes: used during commit to temporarily amount of bytes in + * committed buds + * + * @buds: tree of all buds indexed by bud LEB number + * @bud_bytes: how many bytes of flash is used by buds + * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud + * lists + * @jhead_cnt: count of journal heads + * @jheads: journal heads (head zero is base head) + * @max_bud_bytes: maximum number of bytes allowed in buds + * @bg_bud_bytes: number of bud bytes when background commit is initiated + * @old_buds: buds to be released after commit ends + * @max_bud_cnt: maximum number of buds + * + * @commit_sem: synchronizes committer with other processes + * @cmt_state: commit state + * @cs_lock: commit state lock + * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * + * @big_lpt: flag that LPT is too big to write whole during commit + * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible + * + * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and + * @calc_idx_sz + * @zroot: zbranch which points to the root index node and znode + * @cnext: next znode to commit + * @enext: next znode to commit to empty space + * @gap_lebs: array of LEBs used by the in-gaps commit method + * @cbuf: commit buffer + * @ileb_buf: buffer for commit in-the-gaps method + * @ileb_len: length of data in ileb_buf + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @ilebs: pre-allocated index LEBs + * @ileb_cnt: number of pre-allocated index LEBs + * @ileb_nxt: next pre-allocated index LEBs + * @old_idx: tree of index nodes obsoleted since the last commit start + * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c + * + * @mst_node: master node + * @mst_offs: offset of valid master node + * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * + * @max_bu_buf_len: maximum bulk-read buffer length + * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu + * @bu: pre-allocated bulk-read information + * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * + * @log_lebs: number of logical eraseblocks in the log + * @log_bytes: log size in bytes + * @log_last: last LEB of the log + * @lpt_lebs: number of LEBs used for lprops 
table + * @lpt_first: first LEB of the lprops table area + * @lpt_last: last LEB of the lprops table area + * @orph_lebs: number of LEBs used for the orphan area + * @orph_first: first LEB of the orphan area + * @orph_last: last LEB of the orphan area + * @main_lebs: count of LEBs in the main area + * @main_first: first LEB of the main area + * @main_bytes: main area size in bytes + * + * @key_hash_type: type of the key hash + * @key_hash: direntry key hash function + * @key_fmt: key format + * @key_len: key length + * @fanout: fanout of the index tree (number of links per indexing node) + * + * @min_io_size: minimal input/output unit size + * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one + * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks + * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) + * @leb_cnt: count of logical eraseblocks + * @max_leb_cnt: maximum count of logical eraseblocks + * @old_leb_cnt: count of logical eraseblocks before re-size + * @ro_media: the underlying UBI volume is read-only + * @ro_mount: the file-system was mounted as read-only + * @ro_error: UBIFS switched to R/O mode because an error happened + * + * @dirty_pg_cnt: number of dirty pages (not used) + * @dirty_zn_cnt: number of dirty znodes + * @clean_zn_cnt: number of clean znodes + * + * @space_lock: protects @bi and @lst + * @lst: lprops statistics + * @bi: budgeting information + * @calc_idx_sz: temporary variable which is used to calculate new index size + * (contains accurate new index size at end of TNC commit start) + * + * @ref_node_alsz: size of the LEB reference node aligned to the min. 
flash + * I/O unit + * @mst_node_alsz: master node aligned size + * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary + * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary + * @max_inode_sz: maximum possible inode size in bytes + * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting + * @dead_wm: LEB dead space watermark + * @dark_wm: LEB dark space watermark + * @block_cnt: count of 4KiB blocks on the FS + * + * @ranges: UBIFS node length ranges + * @ubi: UBI volume descriptor + * @di: UBI device information + * @vi: UBI volume information + * + * @orph_tree: rb-tree of orphan inode numbers + * @orph_list: list of orphan inode numbers in order added + * @orph_new: list of orphan inode numbers added since last commit + * @orph_cnext: next orphan to commit + * @orph_dnext: next orphan to delete + * @orphan_lock: lock for orph_tree and orph_new + * @orph_buf: buffer for orphan nodes + * @new_orphans: number of orphans since last commit + * @cmt_orphans: number of orphans being committed + * @tot_orphans: number of orphans in the rb_tree + * @max_orphans: maximum number of orphans allowed + * @ohead_lnum: orphan head LEB number + * @ohead_offs: orphan head offset + * @no_orphs: non-zero if there are no orphans + * + * @bgt: UBIFS background thread + * @bgt_name: background thread name + * @need_bgt: if background thread should run + * @need_wbuf_sync: if write-buffers have to be synchronized + * + * @gc_lnum: LEB number used for garbage collection + * @sbuf: a buffer of LEB size used by GC and replay for scanning + * @idx_gc: list of index LEBs that have been garbage collected + * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected + * + * @infos_list: links all 'ubifs_info' objects + * @umount_mutex: serializes shrinker and un-mount + * @shrinker_run_no: shrinker run number + * + * @space_bits: number of bits needed to record free or dirty space + * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT + * @lpt_offs_bits: number of bits needed to record an offset in the LPT + * @lpt_spc_bits: number of bits needed to space in the LPT + * @pcnt_bits: number of bits needed to record pnode or nnode number + * @lnum_bits: number of bits needed to record LEB number + * @nnode_sz: size of on-flash nnode + * @pnode_sz: size of on-flash pnode + * @ltab_sz: size of on-flash LPT lprops table + * @lsave_sz: size of on-flash LPT save table + * @pnode_cnt: number of pnodes + * @nnode_cnt: number of nnodes + * @lpt_hght: height of the LPT + * @pnodes_have: number of pnodes in memory + * + * @lp_mutex: protects lprops table and all the other lprops-related fields + * @lpt_lnum: LEB number of the root nnode of the LPT + * @lpt_offs: offset of the root nnode of the LPT + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. 
ltab + * @dirty_nn_cnt: number of dirty nnodes + * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed + * @lpt_sz: LPT size + * @lpt_nod_buf: buffer for an on-flash nnode or pnode + * @lpt_buf: buffer of LEB size used by LPT + * @nroot: address in memory of the root nnode of the LPT + * @lpt_cnext: next LPT node to commit + * @lpt_heap: array of heaps of categorized lprops + * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at + * previous commit start + * @uncat_list: list of un-categorized LEBs + * @empty_list: list of empty LEBs + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) + * @freeable_cnt: number of freeable LEBs in @freeable_list + * + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @ltab: LPT's own lprops table + * @ltab_cmt: LPT's own lprops table (commit copy) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @lsave_lnum: LEB number of LPT's save table + * @lsave_offs: offset of LPT's save table + * @lsave: LPT's save table + * @lscan_lnum: LEB number of last LPT scan + * + * @rp_size: size of the reserved pool in bytes + * @report_rp_size: size of the reserved pool reported to user-space + * @rp_uid: reserved pool user ID + * @rp_gid: reserved pool group ID + * + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode + * @replay_list: temporary list used during journal replay + * @replay_buds: list of buds to replay + * @cs_sqnum: sequence number of first node in the log (commit start node) + * @replay_sqnum: sequence number of node currently being replayed + * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W + * mode + * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted + * FS to R/W mode + * @size_tree: inode size information for recovery + * @mount_opts: UBIFS-specific mount options + * + * @dbg: debugging-related information + */ +struct ubifs_info { + struct super_block *vfs_sb; + struct backing_dev_info bdi; + + ino_t highest_inum; + unsigned long long max_sqnum; + unsigned long long cmt_no; + spinlock_t cnt_lock; + int fmt_version; + int ro_compat_version; + unsigned char uuid[16]; + + int lhead_lnum; + int lhead_offs; + int ltail_lnum; + struct mutex log_mutex; + int min_log_bytes; + long long cmt_bud_bytes; + + struct rb_root buds; + long long bud_bytes; + spinlock_t buds_lock; + int jhead_cnt; + struct ubifs_jhead *jheads; + long long max_bud_bytes; + long long bg_bud_bytes; + struct list_head old_buds; + int max_bud_cnt; + + struct rw_semaphore commit_sem; + int cmt_state; + spinlock_t cs_lock; + wait_queue_head_t cmt_wq; + + unsigned int big_lpt:1; + unsigned int space_fixup:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int rw_incompat:1; + + struct mutex tnc_mutex; + struct ubifs_zbranch zroot; + struct ubifs_znode *cnext; + struct ubifs_znode *enext; + int *gap_lebs; + void *cbuf; + void *ileb_buf; + int ileb_len; + int ihead_lnum; + int ihead_offs; + int *ilebs; + int ileb_cnt; + int ileb_nxt; + struct rb_root old_idx; + int *bottom_up_buf; + + struct ubifs_mst_node *mst_node; + int mst_offs; + struct mutex mst_mutex; + + int 
max_bu_buf_len; + struct mutex bu_mutex; + struct bu_info bu; + + struct mutex write_reserve_mutex; + void *write_reserve_buf; + + int log_lebs; + long long log_bytes; + int log_last; + int lpt_lebs; + int lpt_first; + int lpt_last; + int orph_lebs; + int orph_first; + int orph_last; + int main_lebs; + int main_first; + long long main_bytes; + + uint8_t key_hash_type; + uint32_t (*key_hash)(const char *str, int len); + int key_fmt; + int key_len; + int fanout; + + int min_io_size; + int min_io_shift; + int max_write_size; + int max_write_shift; + int leb_size; + int leb_start; + int half_leb_size; + int idx_leb_size; + int leb_cnt; + int max_leb_cnt; + int old_leb_cnt; + unsigned int ro_media:1; + unsigned int ro_mount:1; + unsigned int ro_error:1; + + atomic_long_t dirty_pg_cnt; + atomic_long_t dirty_zn_cnt; + atomic_long_t clean_zn_cnt; + + spinlock_t space_lock; + struct ubifs_lp_stats lst; + struct ubifs_budg_info bi; + unsigned long long calc_idx_sz; + + int ref_node_alsz; + int mst_node_alsz; + int min_idx_node_sz; + int max_idx_node_sz; + long long max_inode_sz; + int max_znode_sz; + + int leb_overhead; + int dead_wm; + int dark_wm; + int block_cnt; + + struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT]; + struct ubi_volume_desc *ubi; + struct ubi_device_info di; + struct ubi_volume_info vi; + + struct rb_root orph_tree; + struct list_head orph_list; + struct list_head orph_new; + struct ubifs_orphan *orph_cnext; + struct ubifs_orphan *orph_dnext; + spinlock_t orphan_lock; + void *orph_buf; + int new_orphans; + int cmt_orphans; + int tot_orphans; + int max_orphans; + int ohead_lnum; + int ohead_offs; + int no_orphs; + + struct task_struct *bgt; + char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; + int need_bgt; + int need_wbuf_sync; + + int gc_lnum; + void *sbuf; + struct list_head idx_gc; + int idx_gc_cnt; + int gc_seq; + int gced_lnum; + + struct list_head infos_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + int space_bits; + int lpt_lnum_bits; + int lpt_offs_bits; + int lpt_spc_bits; + int pcnt_bits; + int lnum_bits; + int nnode_sz; + int pnode_sz; + int ltab_sz; + int lsave_sz; + int pnode_cnt; + int nnode_cnt; + int lpt_hght; + int pnodes_have; + + struct mutex lp_mutex; + int lpt_lnum; + int lpt_offs; + int nhead_lnum; + int nhead_offs; + int lpt_drty_flgs; + int dirty_nn_cnt; + int dirty_pn_cnt; + int check_lpt_free; + long long lpt_sz; + void *lpt_nod_buf; + void *lpt_buf; + struct ubifs_nnode *nroot; + struct ubifs_cnode *lpt_cnext; + struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT]; + struct ubifs_lpt_heap dirty_idx; + struct list_head uncat_list; + struct list_head empty_list; + struct list_head freeable_list; + struct list_head frdi_idx_list; + int freeable_cnt; + + int ltab_lnum; + int ltab_offs; + struct ubifs_lpt_lprops *ltab; + struct ubifs_lpt_lprops *ltab_cmt; + int lsave_cnt; + int lsave_lnum; + int lsave_offs; + int *lsave; + int lscan_lnum; + + long long rp_size; + long long report_rp_size; + uid_t rp_uid; + gid_t rp_gid; + + /* The below fields are used only during mounting and re-mounting */ + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; + struct list_head replay_list; + struct list_head replay_buds; + unsigned long long cs_sqnum; + unsigned long long replay_sqnum; + struct list_head unclean_leb_list; + struct ubifs_mst_node *rcvrd_mst_node; + struct rb_root size_tree; + struct ubifs_mount_opts mount_opts; + +#ifdef CONFIG_UBIFS_FS_DEBUG + struct 
ubifs_debug_info *dbg; +#endif +}; + +extern struct list_head ubifs_infos; +extern spinlock_t ubifs_infos_lock; +extern atomic_long_t ubifs_clean_zn_cnt; +extern struct kmem_cache *ubifs_inode_slab; +extern const struct super_operations ubifs_super_operations; +extern const struct address_space_operations ubifs_file_address_operations; +extern const struct file_operations ubifs_file_operations; +extern const struct inode_operations ubifs_file_inode_operations; +extern const struct file_operations ubifs_dir_operations; +extern const struct inode_operations ubifs_dir_inode_operations; +extern const struct inode_operations ubifs_symlink_inode_operations; +extern struct backing_dev_info ubifs_backing_dev_info; +extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype); +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs); +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs); +int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, + int offs, int dtype); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc); +void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); +int ubifs_io_init(struct ubifs_info *c); +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); +int ubifs_bg_wbufs_sync(struct ubifs_info *c); +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); + +/* scan.c */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf, int quiet); +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet); +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs); +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs); +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf); + +/* log.c */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); +void ubifs_create_buds_lists(struct ubifs_info *c); +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); +int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); +int ubifs_consolidate_log(struct ubifs_info *c); + +/* journal.c */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent); +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode 
*inode, + const union ubifs_key *key, const void *buf, int len); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync); +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size); +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm); +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, + const struct inode *inode2); + +/* budget.c */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui); +int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +long long ubifs_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space_nolock(struct ubifs_info *c); +int ubifs_calc_min_idx_lebs(struct ubifs_info *c); +void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, long long free); +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); + +/* find.c */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze); +int ubifs_find_free_leb_for_idx(struct ubifs_info *c); +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free); +int ubifs_find_dirty_idx_leb(struct ubifs_info *c); +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); + +/* tnc.c */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n); +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm); +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs); +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len); +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len); +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm); +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm); +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key); +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm); +void ubifs_tnc_close(struct ubifs_info *c); +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx); +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +/* Shared by tnc.c for tnc_commit.c 
*/ +void destroy_old_idx(struct ubifs_info *c); +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); + +/* tnc_misc.c */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode); +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n); +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); +long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip); +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node); + +/* tnc_commit.c */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int ubifs_tnc_end_commit(struct ubifs_info *c); + +/* shrinker.c */ +int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc); + +/* commit.c */ +int ubifs_bg_thread(void *info); +void ubifs_commit_required(struct ubifs_info *c); +void ubifs_request_bg_commit(struct ubifs_info *c); +int ubifs_run_commit(struct ubifs_info *c); +void ubifs_recovery_commit(struct ubifs_info *c); +int ubifs_gc_should_commit(struct ubifs_info *c); +void ubifs_wait_for_commit(struct ubifs_info *c); + +/* master.c */ +int ubifs_read_master(struct ubifs_info *c); +int ubifs_write_master(struct ubifs_info *c); + +/* sb.c */ +int ubifs_read_superblock(struct ubifs_info *c); +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); +int ubifs_fixup_free_space(struct ubifs_info *c); + +/* replay.c */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent); +int ubifs_replay_journal(struct ubifs_info *c); + +/* gc.c */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway); +int ubifs_gc_start_commit(struct ubifs_info *c); +int ubifs_gc_end_commit(struct ubifs_info *c); +void ubifs_destroy_idx_gc(struct ubifs_info *c); +int ubifs_get_idx_gc_leb(struct ubifs_info *c); +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); + +/* orphan.c */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); +int ubifs_orphan_start_commit(struct ubifs_info *c); +int ubifs_orphan_end_commit(struct ubifs_info *c); +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); +int ubifs_clear_orphans(struct ubifs_info *c); + +/* lpt.c */ +int ubifs_calc_lpt_geom(struct ubifs_info *c); +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt); +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data); + +/* Shared by lpt.c for lpt_commit.c */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + 
struct ubifs_lpt_lprops *ltab); +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode); +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); +struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); +/* Needed only in debugging code in lpt_commit.c */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); + +/* lpt_commit.c */ +int ubifs_lpt_start_commit(struct ubifs_info *c); +int ubifs_lpt_end_commit(struct ubifs_info *c); +int ubifs_lpt_post_commit(struct ubifs_info *c); +void ubifs_lpt_free(struct ubifs_info *c, int wr_only); + +/* lprops.c */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst); +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat); +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops); +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops); +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt); +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean); +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); +int ubifs_calc_dark(const struct ubifs_info *c, int spc); + +/* file.c */ +int ubifs_fsync(struct file *file, int datasync); +int ubifs_setattr(struct dentry *dentry, struct iattr *attr); + +/* dir.c */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode); +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ubifs_removexattr(struct dentry *dentry, const char *name); + +/* super.c */ +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); + +/* recovery.c */ +int ubifs_recover_master_node(struct ubifs_info *c); +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int jhead); +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf); 
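
[Editorial example, not part of the patch] The LPT scan interface declared above (the ubifs_lpt_scan_callback typedef, the LPT_SCAN_CONTINUE/LPT_SCAN_ADD/LPT_SCAN_STOP return codes, and ubifs_lpt_scan_nolock()) lets callers walk the per-LEB properties without loading the whole LPT into memory. The following minimal sketch shows how such a callback might be written and invoked; the function names first_empty_cb() and find_first_empty_lnum() are hypothetical, and the scan range and locking comment reflect an assumption that the caller already holds @c->lp_mutex, as the "_nolock" suffix suggests.

/*
 * Illustrative sketch only: find the first completely free, untaken
 * main-area LEB by scanning LEB properties.
 */
static int first_empty_cb(struct ubifs_info *c,
			  const struct ubifs_lprops *lprops,
			  int in_tree, void *data)
{
	int *lnum = data;

	/* Skip LEBs that are already reserved for some purpose */
	if (lprops->flags & LPROPS_TAKEN)
		return LPT_SCAN_CONTINUE;

	if (lprops->free == c->leb_size) {
		*lnum = lprops->lnum;
		/*
		 * Ask the scanner to keep these lprops in the in-memory LPT
		 * (only if they are not there yet) and stop scanning.
		 */
		return (in_tree ? 0 : LPT_SCAN_ADD) | LPT_SCAN_STOP;
	}
	return LPT_SCAN_CONTINUE;
}

static int find_first_empty_lnum(struct ubifs_info *c)
{
	int lnum = -1, err;

	/* Assumption: the caller holds c->lp_mutex, hence the "nolock" API */
	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
				    first_empty_cb, &lnum);
	return err ? err : lnum;
}

The callback return codes may be OR-ed together, so a single callback invocation can both request that the scanned lprops be added to the in-memory tree and terminate the scan.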
+int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_rcvry_gc_commit(struct ubifs_info *c); +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size); +int ubifs_recover_size(struct ubifs_info *c); +void ubifs_destroy_size_tree(struct ubifs_info *c); + +/* ioctl.c */ +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void ubifs_set_inode_flags(struct inode *inode); +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#endif + +/* compressor.c */ +int __init ubifs_compressors_init(void); +void ubifs_compressors_exit(void); +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type); +int ubifs_decompress(const void *buf, int len, void *out, int *out_len, + int compr_type); + +#include "debug.h" +#include "misc.h" +#include "key.h" + +#endif /* !__UBIFS_H__ */ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c new file mode 100644 index 00000000..16f19f55 --- /dev/null +++ b/fs/ubifs/xattr.c @@ -0,0 +1,572 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS extended attributes support. + * + * Extended attributes are implemented as regular inodes with attached data, + * which limits extended attribute size to UBIFS block size (4KiB). Names of + * extended attributes are described by extended attribute entries (xentries), + * which are almost identical to directory entries, but have different key type. + * + * In other words, the situation with extended attributes is very similar to + * directories. Indeed, any inode (but of course not xattr inodes) may have a + * number of associated xentries, just like directory inodes have associated + * directory entries. Extended attribute entries store the name of the extended + * attribute, the host inode number, and the extended attribute inode number. + * Similarly, direntries store the name, the parent and the target inode + * numbers. Thus, most of the common UBIFS mechanisms may be re-used for + * extended attributes. + * + * The number of extended attributes is not limited, but there is Linux + * limitation on the maximum possible size of the list of all extended + * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure + * the sum of all extended attribute names of the inode does not exceed that + * limit. + * + * Extended attributes are synchronous, which means they are written to the + * flash media synchronously and there is no write-back for extended attribute + * inodes. The extended attribute values are not stored in compressed form on + * the media. 
+ *
+ * Since extended attributes are represented by regular inodes, they are cached
+ * in the VFS inode cache. The xentries are cached in the LNC cache (see
+ * tnc.c).
+ *
+ * ACL support is not implemented.
+ */
+
+#include "ubifs.h"
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * Limit the number of extended attributes per inode so that the total size
+ * (@xattr_size) is guaranteed to fit in an 'unsigned int'.
+ */
+#define MAX_XATTRS_PER_INODE 65535
+
+/*
+ * Extended attribute type constants.
+ *
+ * USER_XATTR: user extended attribute ("user.*")
+ * TRUSTED_XATTR: trusted extended attribute ("trusted.*")
+ * SECURITY_XATTR: security extended attribute ("security.*")
+ */
+enum {
+	USER_XATTR,
+	TRUSTED_XATTR,
+	SECURITY_XATTR,
+};
+
+static const struct inode_operations empty_iops;
+static const struct file_operations empty_fops;
+
+/**
+ * create_xattr - create an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @nm: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This is a helper function which creates an extended attribute of name @nm
+ * and value @value for inode @host. The host inode is also updated on flash
+ * because the ctime and extended attribute accounting data changes. This
+ * function returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+static int create_xattr(struct ubifs_info *c, struct inode *host,
+			const struct qstr *nm, const void *value, int size)
+{
+	int err;
+	struct inode *inode;
+	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+		return -ENOSPC;
+	/*
+	 * Linux limits the maximum size of the extended attribute names list
+	 * to %XATTR_LIST_MAX. This means we should not allow creating more
+	 * extended attributes if the name list becomes larger. This limitation
+	 * is artificial for UBIFS, though.
+ */ + if (host_ui->xattr_names + host_ui->xattr_cnt + + nm->len + 1 > XATTR_LIST_MAX) + return -ENOSPC; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + /* Re-define all operations to be "nothing" */ + inode->i_mapping->a_ops = &empty_aops; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + + inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; + ui = ubifs_inode(inode); + ui->xattr = 1; + ui->flags |= UBIFS_XATTR_FL; + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + host_ui->xattr_names += nm->len; + + err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + iput(inode); + return 0; + +out_cancel: + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + mutex_unlock(&host_ui->ui_mutex); +out_free: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @value: extended attribute value + * @size: size of extended attribute value + * + * This helper function changes the value of extended attribute @inode with new + * data from @value. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int change_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const void *value, int size) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + err = ubifs_budget_space(c, &req); + if (err) + return err; + + kfree(ui->data); + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + + /* + * It is important to write the host inode after the xattr inode + * because if the host inode gets synchronized (via 'fsync()'), then + * the extended attribute inode gets synchronized, because it goes + * before the host inode in the write-buffer. + */ + err = ubifs_jnl_change_xattr(c, inode, host); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_namespace - check extended attribute name-space. 
+ * @nm: extended attribute name + * + * This function makes sure the extended attribute name belongs to one of the + * supported extended attribute name-spaces. Returns name-space index in case + * of success and a negative error code in case of failure. + */ +static int check_namespace(const struct qstr *nm) +{ + int type; + + if (nm->len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; + + if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + return -EINVAL; + type = TRUSTED_XATTR; + } else if (!strncmp(nm->name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) { + if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') + return -EINVAL; + type = USER_XATTR; + } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + return -EINVAL; + type = SECURITY_XATTR; + } else + return -EOPNOTSUPP; + + return type; +} + +static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) +{ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, inum); + if (IS_ERR(inode)) { + ubifs_err("dead extended attribute entry, error %d", + (int)PTR_ERR(inode)); + return inode; + } + if (ubifs_inode(inode)->xattr) + return inode; + ubifs_err("corrupt extended attribute entry"); + iput(inode); + return ERR_PTR(-EINVAL); +} + +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err, type; + + dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + if (size > UBIFS_MAX_INO_DATA) + return -ERANGE; + + type = check_namespace(&nm); + if (type < 0) + return type; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* + * The extended attribute entries are stored in LNC, so multiple + * look-ups do not involve reading the flash. 
+ */ + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err != -ENOENT) + goto out_free; + + if (flags & XATTR_REPLACE) + /* We are asked not to create the xattr */ + err = -ENODATA; + else + err = create_xattr(c, host, &nm, value, size); + goto out_free; + } + + if (flags & XATTR_CREATE) { + /* We are asked not to replace the xattr */ + err = -EEXIST; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + err = change_xattr(c, host, inode, value, size); + iput(inode); + +out_free: + kfree(xent); + return err; +} + +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_inode *ui; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_unlock; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + ui = ubifs_inode(inode); + ubifs_assert(inode->i_size == ui->data_len); + ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); + + if (buf) { + /* If @buf is %NULL we are supposed to return the length */ + if (ui->data_len > size) { + dbg_err("buffer size %zd, xattr len %d", + size, ui->data_len); + err = -ERANGE; + goto out_iput; + } + + memcpy(buf, ui->data, ui->data_len); + } + err = ui->data_len; + +out_iput: + iput(inode); +out_unlock: + kfree(xent); + return err; +} + +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + union ubifs_key key; + struct inode *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_dent_node *xent, *pxent = NULL; + int err, len, written = 0; + struct qstr nm = { .name = NULL }; + + dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino, + dentry->d_name.len, dentry->d_name.name, size); + + len = host_ui->xattr_names + host_ui->xattr_cnt; + if (!buffer) + /* + * We should return the minimum buffer size which will fit a + * null-terminated list of all the extended attribute names. 
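+		 * For example, an inode carrying "user.a" and "trusted.x" needs
+		 * "user.a\0trusted.x\0": 6 + 9 name bytes plus one terminating
+		 * '\0' per entry, which is exactly what @xattr_names +
+		 * @xattr_cnt (15 + 2 = 17 bytes) evaluates to.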
+ */ + return len; + + if (len > size) + return -ERANGE; + + lowest_xent_key(c, &key, host->i_ino); + while (1) { + int type; + + xent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + break; + } + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + + type = check_namespace(&nm); + if (unlikely(type < 0)) { + err = type; + break; + } + + /* Show trusted namespace only for "power" users */ + if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { + memcpy(buffer + written, nm.name, nm.len + 1); + written += nm.len + 1; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key); + } + + kfree(pxent); + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + ubifs_assert(written <= size); + return written; +} + +static int remove_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const struct qstr *nm) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names -= nm->len; + + err = ubifs_jnl_delete_xattr(c, host, inode, nm); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + ubifs_release_budget(c, &req); + make_bad_inode(inode); + return err; +} + +int ubifs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s')", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + ubifs_assert(inode->i_nlink == 1); + inode->i_nlink = 0; + err = remove_xattr(c, host, inode, &nm); + if (err) + inode->i_nlink = 1; + + /* If @i_nlink is 0, 'iput()' will delete the inode */ + iput(inode); + +out_free: + kfree(xent); + return err; +} diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig new file mode 100644 index 00000000..0e0e99bd --- /dev/null +++ b/fs/udf/Kconfig @@ -0,0 +1,18 @@ +config UDF_FS + tristate "UDF file system support" + select CRC_ITU_T + help + This is the new file system used on some CD-ROMs and DVDs. Say Y if + you intend to mount DVD discs or CDRW's written in packet mode, or + if written to by other UDF utilities, such as DirectCD. + Please read . 
+ + To compile this file system support as a module, choose M here: the + module will be called udf. + + If unsure, say N. + +config UDF_NLS + bool + default y + depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) diff --git a/fs/udf/Makefile b/fs/udf/Makefile new file mode 100644 index 00000000..eb880f66 --- /dev/null +++ b/fs/udf/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the linux udf-filesystem routines. +# + +obj-$(CONFIG_UDF_FS) += udf.o + +udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ + partition.o super.o truncate.o symlink.o \ + directory.o misc.o udftime.o unicode.o diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c new file mode 100644 index 00000000..95518a9f --- /dev/null +++ b/fs/udf/balloc.c @@ -0,0 +1,815 @@ +/* + * balloc.c + * + * PURPOSE + * Block allocation handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 02/24/99 blf Created. + * + */ + +#include "udfdecl.h" + +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +#define udf_clear_bit __test_and_clear_bit_le +#define udf_set_bit __test_and_set_bit_le +#define udf_test_bit test_bit_le +#define udf_find_next_one_bit find_next_bit_le + +static int read_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, unsigned int block, + unsigned long bitmap_nr) +{ + struct buffer_head *bh = NULL; + int retval = 0; + struct kernel_lb_addr loc; + + loc.logicalBlockNum = bitmap->s_extPosition; + loc.partitionReferenceNum = UDF_SB(sb)->s_partition; + + bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); + if (!bh) + retval = -EIO; + + bitmap->s_block_bitmap[bitmap_nr] = bh; + return retval; +} + +static int __load_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, + unsigned int block_group) +{ + int retval = 0; + int nr_groups = bitmap->s_nr_groups; + + if (block_group >= nr_groups) { + udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, + nr_groups); + } + + if (bitmap->s_block_bitmap[block_group]) { + return block_group; + } else { + retval = read_block_bitmap(sb, bitmap, block_group, + block_group); + if (retval < 0) + return retval; + return block_group; + } +} + +static inline int load_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, + unsigned int block_group) +{ + int slot; + + slot = __load_block_bitmap(sb, bitmap, block_group); + + if (slot < 0) + return slot; + + if (!bitmap->s_block_bitmap[slot]) + return -EIO; + + return slot; +} + +static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + + if (!sbi->s_lvid_bh) + return; + + lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; + le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); + udf_updated_lvid(sb); +} + +static void udf_bitmap_free_blocks(struct super_block *sb, + struct inode *inode, + struct udf_bitmap *bitmap, + struct kernel_lb_addr *bloc, + uint32_t offset, + uint32_t count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = NULL; + struct udf_part_map *partmap; + unsigned long block; + unsigned long block_group; + unsigned long bit; + unsigned long i; + int bitmap_nr; + unsigned long overflow; 
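+
+	/*
+	 * A note on the arithmetic below: each bitmap block holds
+	 * sb->s_blocksize * 8 bits, so with a 2048-byte block size one bitmap
+	 * block describes 16384 partition blocks.  The first bits of the
+	 * bitmap are occupied by the space bitmap descriptor header, which is
+	 * why "(sizeof(struct spaceBitmapDesc) << 3)" is added before the
+	 * block number is split into a group index and a bit offset.
+	 */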
+ + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, + count, partmap->s_partition_len); + goto error_return; + } + + block = bloc->logicalBlockNum + offset + + (sizeof(struct spaceBitmapDesc) << 3); + + do { + overflow = 0; + block_group = block >> (sb->s_blocksize_bits + 3); + bit = block % (sb->s_blocksize << 3); + + /* + * Check to see if we are freeing blocks across a group boundary. + */ + if (bit + count > (sb->s_blocksize << 3)) { + overflow = bit + count - (sb->s_blocksize << 3); + count -= overflow; + } + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + + bh = bitmap->s_block_bitmap[bitmap_nr]; + for (i = 0; i < count; i++) { + if (udf_set_bit(bit + i, bh->b_data)) { + udf_debug("bit %ld already set\n", bit + i); + udf_debug("byte=%2x\n", + ((char *)bh->b_data)[(bit + i) >> 3]); + } + } + udf_add_free_space(sb, sbi->s_partition, count); + mark_buffer_dirty(bh); + if (overflow) { + block += count; + count = overflow; + } + } while (overflow); + +error_return: + mutex_unlock(&sbi->s_alloc_mutex); +} + +static int udf_bitmap_prealloc_blocks(struct super_block *sb, + struct inode *inode, + struct udf_bitmap *bitmap, + uint16_t partition, uint32_t first_block, + uint32_t block_count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int alloc_count = 0; + int bit, block, block_group, group_start; + int nr_groups, bitmap_nr; + struct buffer_head *bh; + __u32 part_len; + + mutex_lock(&sbi->s_alloc_mutex); + part_len = sbi->s_partmaps[partition].s_partition_len; + if (first_block >= part_len) + goto out; + + if (first_block + block_count > part_len) + block_count = part_len - first_block; + + do { + nr_groups = udf_compute_nr_groups(sb, partition); + block = first_block + (sizeof(struct spaceBitmapDesc) << 3); + block_group = block >> (sb->s_blocksize_bits + 3); + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto out; + bh = bitmap->s_block_bitmap[bitmap_nr]; + + bit = block % (sb->s_blocksize << 3); + + while (bit < (sb->s_blocksize << 3) && block_count > 0) { + if (!udf_clear_bit(bit, bh->b_data)) + goto out; + block_count--; + alloc_count++; + bit++; + block++; + } + mark_buffer_dirty(bh); + } while (block_count > 0); + +out: + udf_add_free_space(sb, partition, -alloc_count); + mutex_unlock(&sbi->s_alloc_mutex); + return alloc_count; +} + +static int udf_bitmap_new_block(struct super_block *sb, + struct inode *inode, + struct udf_bitmap *bitmap, uint16_t partition, + uint32_t goal, int *err) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int newbit, bit = 0, block, block_group, group_start; + int end_goal, nr_groups, bitmap_nr, i; + struct buffer_head *bh = NULL; + char *ptr; + int newblock = 0; + + *err = -ENOSPC; + mutex_lock(&sbi->s_alloc_mutex); + +repeat: + if (goal >= sbi->s_partmaps[partition].s_partition_len) + goal = 0; + + nr_groups = bitmap->s_nr_groups; + block = goal + (sizeof(struct spaceBitmapDesc) << 3); + block_group = block >> (sb->s_blocksize_bits + 3); + group_start = block_group ? 
0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + bh = bitmap->s_block_bitmap[bitmap_nr]; + ptr = memscan((char *)bh->b_data + group_start, 0xFF, + sb->s_blocksize - group_start); + + if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) { + bit = block % (sb->s_blocksize << 3); + if (udf_test_bit(bit, bh->b_data)) + goto got_block; + + end_goal = (bit + 63) & ~63; + bit = udf_find_next_one_bit(bh->b_data, end_goal, bit); + if (bit < end_goal) + goto got_block; + + ptr = memscan((char *)bh->b_data + (bit >> 3), 0xFF, + sb->s_blocksize - ((bit + 7) >> 3)); + newbit = (ptr - ((char *)bh->b_data)) << 3; + if (newbit < sb->s_blocksize << 3) { + bit = newbit; + goto search_back; + } + + newbit = udf_find_next_one_bit(bh->b_data, + sb->s_blocksize << 3, bit); + if (newbit < sb->s_blocksize << 3) { + bit = newbit; + goto got_block; + } + } + + for (i = 0; i < (nr_groups * 2); i++) { + block_group++; + if (block_group >= nr_groups) + block_group = 0; + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + bh = bitmap->s_block_bitmap[bitmap_nr]; + if (i < nr_groups) { + ptr = memscan((char *)bh->b_data + group_start, 0xFF, + sb->s_blocksize - group_start); + if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) { + bit = (ptr - ((char *)bh->b_data)) << 3; + break; + } + } else { + bit = udf_find_next_one_bit(bh->b_data, + sb->s_blocksize << 3, + group_start << 3); + if (bit < sb->s_blocksize << 3) + break; + } + } + if (i >= (nr_groups * 2)) { + mutex_unlock(&sbi->s_alloc_mutex); + return newblock; + } + if (bit < sb->s_blocksize << 3) + goto search_back; + else + bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3, + group_start << 3); + if (bit >= sb->s_blocksize << 3) { + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } + +search_back: + i = 0; + while (i < 7 && bit > (group_start << 3) && + udf_test_bit(bit - 1, bh->b_data)) { + ++i; + --bit; + } + +got_block: + newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - + (sizeof(struct spaceBitmapDesc) << 3); + + if (!udf_clear_bit(bit, bh->b_data)) { + udf_debug("bit already cleared for block %d\n", bit); + goto repeat; + } + + mark_buffer_dirty(bh); + + udf_add_free_space(sb, partition, -1); + mutex_unlock(&sbi->s_alloc_mutex); + *err = 0; + return newblock; + +error_return: + *err = -EIO; + mutex_unlock(&sbi->s_alloc_mutex); + return 0; +} + +static void udf_table_free_blocks(struct super_block *sb, + struct inode *inode, + struct inode *table, + struct kernel_lb_addr *bloc, + uint32_t offset, + uint32_t count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *partmap; + uint32_t start, end; + uint32_t elen; + struct kernel_lb_addr eloc; + struct extent_position oepos, epos; + int8_t etype; + int i; + struct udf_inode_info *iinfo; + + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + partmap->s_partition_len); + goto error_return; + } + + iinfo = UDF_I(table); + udf_add_free_space(sb, sbi->s_partition, count); + + start = bloc->logicalBlockNum + offset; + end = bloc->logicalBlockNum + offset + count - 1; + + epos.offset = oepos.offset = sizeof(struct 
unallocSpaceEntry); + elen = 0; + epos.block = oepos.block = iinfo->i_location; + epos.bh = oepos.bh = NULL; + + while (count && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + if (((eloc.logicalBlockNum + + (elen >> sb->s_blocksize_bits)) == start)) { + if ((0x3FFFFFFF - elen) < + (count << sb->s_blocksize_bits)) { + uint32_t tmp = ((0x3FFFFFFF - elen) >> + sb->s_blocksize_bits); + count -= tmp; + start += tmp; + elen = (etype << 30) | + (0x40000000 - sb->s_blocksize); + } else { + elen = (etype << 30) | + (elen + + (count << sb->s_blocksize_bits)); + start += count; + count = 0; + } + udf_write_aext(table, &oepos, &eloc, elen, 1); + } else if (eloc.logicalBlockNum == (end + 1)) { + if ((0x3FFFFFFF - elen) < + (count << sb->s_blocksize_bits)) { + uint32_t tmp = ((0x3FFFFFFF - elen) >> + sb->s_blocksize_bits); + count -= tmp; + end -= tmp; + eloc.logicalBlockNum -= tmp; + elen = (etype << 30) | + (0x40000000 - sb->s_blocksize); + } else { + eloc.logicalBlockNum = start; + elen = (etype << 30) | + (elen + + (count << sb->s_blocksize_bits)); + end -= count; + count = 0; + } + udf_write_aext(table, &oepos, &eloc, elen, 1); + } + + if (epos.bh != oepos.bh) { + i = -1; + oepos.block = epos.block; + brelse(oepos.bh); + get_bh(epos.bh); + oepos.bh = epos.bh; + oepos.offset = 0; + } else { + oepos.offset = epos.offset; + } + } + + if (count) { + /* + * NOTE: we CANNOT use udf_add_aext here, as it can try to + * allocate a new block, and since we hold the super block + * lock already very bad things would happen :) + * + * We copy the behavior of udf_add_aext, but instead of + * trying to allocate a new block close to the existing one, + * we just steal a block from the extent we are trying to add. + * + * It would be nice if the blocks were close together, but it + * isn't required. 
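+		 *
+		 * Concretely: if a new allocation extent block is needed while,
+		 * say, blocks 5000..5099 are being freed, block 5000 itself is
+		 * used for the new allocation extent descriptor and only
+		 * 5001..5099 end up recorded as free space.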
+ */ + + int adsize; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; + struct allocExtDesc *aed; + + eloc.logicalBlockNum = start; + elen = EXT_RECORDED_ALLOCATED | + (count << sb->s_blocksize_bits); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else { + brelse(oepos.bh); + brelse(epos.bh); + goto error_return; + } + + if (epos.offset + (2 * adsize) > sb->s_blocksize) { + unsigned char *sptr, *dptr; + int loffset; + + brelse(oepos.bh); + oepos = epos; + + /* Steal a block from the extent being free'd */ + epos.block.logicalBlockNum = eloc.logicalBlockNum; + eloc.logicalBlockNum++; + elen -= sb->s_blocksize; + + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &epos.block, 0)); + if (!epos.bh) { + brelse(oepos.bh); + goto error_return; + } + aed = (struct allocExtDesc *)(epos.bh->b_data); + aed->previousAllocExtLocation = + cpu_to_le32(oepos.block.logicalBlockNum); + if (epos.offset + adsize > sb->s_blocksize) { + loffset = epos.offset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = iinfo->i_ext.i_data + epos.offset + - adsize; + dptr = epos.bh->b_data + + sizeof(struct allocExtDesc); + memcpy(dptr, sptr, adsize); + epos.offset = sizeof(struct allocExtDesc) + + adsize; + } else { + loffset = epos.offset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + if (oepos.bh) { + sptr = oepos.bh->b_data + epos.offset; + aed = (struct allocExtDesc *) + oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, + adsize); + } else { + sptr = iinfo->i_ext.i_data + + epos.offset; + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(table); + } + epos.offset = sizeof(struct allocExtDesc); + } + if (sbi->s_udfrev >= 0x0200) + udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, + 3, 1, epos.block.logicalBlockNum, + sizeof(struct tag)); + else + udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, + 2, 1, epos.block.logicalBlockNum, + sizeof(struct tag)); + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)sptr; + sad->extLength = cpu_to_le32( + EXT_NEXT_EXTENT_ALLOCDECS | + sb->s_blocksize); + sad->extPosition = + cpu_to_le32(epos.block.logicalBlockNum); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)sptr; + lad->extLength = cpu_to_le32( + EXT_NEXT_EXTENT_ALLOCDECS | + sb->s_blocksize); + lad->extLocation = + cpu_to_lelb(epos.block); + break; + } + if (oepos.bh) { + udf_update_tag(oepos.bh->b_data, loffset); + mark_buffer_dirty(oepos.bh); + } else { + mark_inode_dirty(table); + } + } + + /* It's possible that stealing the block emptied the extent */ + if (elen) { + udf_write_aext(table, &epos, &eloc, elen, 1); + + if (!epos.bh) { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(table); + } else { + aed = (struct allocExtDesc *)epos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + udf_update_tag(epos.bh->b_data, epos.offset); + mark_buffer_dirty(epos.bh); + } + } + } + + brelse(epos.bh); + brelse(oepos.bh); + +error_return: + mutex_unlock(&sbi->s_alloc_mutex); + return; +} + +static int udf_table_prealloc_blocks(struct super_block *sb, + struct inode *inode, + struct inode *table, uint16_t partition, + uint32_t first_block, uint32_t block_count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int alloc_count = 0; + uint32_t elen, adsize; + struct kernel_lb_addr eloc; + struct extent_position epos; + int8_t etype = -1; + struct udf_inode_info *iinfo; + + if (first_block >= 
sbi->s_partmaps[partition].s_partition_len) + return 0; + + iinfo = UDF_I(table); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return 0; + + mutex_lock(&sbi->s_alloc_mutex); + epos.offset = sizeof(struct unallocSpaceEntry); + epos.block = iinfo->i_location; + epos.bh = NULL; + eloc.logicalBlockNum = 0xFFFFFFFF; + + while (first_block != eloc.logicalBlockNum && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + udf_debug("eloc=%d, elen=%d, first_block=%d\n", + eloc.logicalBlockNum, elen, first_block); + ; /* empty loop body */ + } + + if (first_block == eloc.logicalBlockNum) { + epos.offset -= adsize; + + alloc_count = (elen >> sb->s_blocksize_bits); + if (alloc_count > block_count) { + alloc_count = block_count; + eloc.logicalBlockNum += alloc_count; + elen -= (alloc_count << sb->s_blocksize_bits); + udf_write_aext(table, &epos, &eloc, + (etype << 30) | elen, 1); + } else + udf_delete_aext(table, epos, eloc, + (etype << 30) | elen); + } else { + alloc_count = 0; + } + + brelse(epos.bh); + + if (alloc_count) + udf_add_free_space(sb, partition, -alloc_count); + mutex_unlock(&sbi->s_alloc_mutex); + return alloc_count; +} + +static int udf_table_new_block(struct super_block *sb, + struct inode *inode, + struct inode *table, uint16_t partition, + uint32_t goal, int *err) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; + uint32_t newblock = 0, adsize; + uint32_t elen, goal_elen = 0; + struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); + struct extent_position epos, goal_epos; + int8_t etype; + struct udf_inode_info *iinfo = UDF_I(table); + + *err = -ENOSPC; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return newblock; + + mutex_lock(&sbi->s_alloc_mutex); + if (goal >= sbi->s_partmaps[partition].s_partition_len) + goal = 0; + + /* We search for the closest matching block to goal. If we find + a exact hit, we stop. Otherwise we keep going till we run out + of extents. We store the buffer_head, bloc, and extoffset + of the current closest match and use that when we are done. + */ + epos.offset = sizeof(struct unallocSpaceEntry); + epos.block = iinfo->i_location; + epos.bh = goal_epos.bh = NULL; + + while (spread && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + if (goal >= eloc.logicalBlockNum) { + if (goal < eloc.logicalBlockNum + + (elen >> sb->s_blocksize_bits)) + nspread = 0; + else + nspread = goal - eloc.logicalBlockNum - + (elen >> sb->s_blocksize_bits); + } else { + nspread = eloc.logicalBlockNum - goal; + } + + if (nspread < spread) { + spread = nspread; + if (goal_epos.bh != epos.bh) { + brelse(goal_epos.bh); + goal_epos.bh = epos.bh; + get_bh(goal_epos.bh); + } + goal_epos.block = epos.block; + goal_epos.offset = epos.offset - adsize; + goal_eloc = eloc; + goal_elen = (etype << 30) | elen; + } + } + + brelse(epos.bh); + + if (spread == 0xFFFFFFFF) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } + + /* Only allocate blocks from the beginning of the extent. + That way, we only delete (empty) extents, never have to insert an + extent because of splitting */ + /* This works, but very poorly.... 
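+	   (For example, with goal block 1000 and candidate free extents
+	   covering blocks 900..949 and 1020..1099, the search loop above
+	   computes nspread = 50 for the first extent and nspread = 20 for the
+	   second; the second extent is chosen, its first block 1020 is
+	   returned, and the extent shrinks to 1021..1099.)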
*/ + + newblock = goal_eloc.logicalBlockNum; + goal_eloc.logicalBlockNum++; + goal_elen -= sb->s_blocksize; + + if (goal_elen) + udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); + else + udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); + brelse(goal_epos.bh); + + udf_add_free_space(sb, partition, -1); + + mutex_unlock(&sbi->s_alloc_mutex); + *err = 0; + return newblock; +} + +void udf_free_blocks(struct super_block *sb, struct inode *inode, + struct kernel_lb_addr *bloc, uint32_t offset, + uint32_t count) +{ + uint16_t partition = bloc->partitionReferenceNum; + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { + udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { + udf_table_free_blocks(sb, inode, map->s_uspace.s_table, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { + udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { + udf_table_free_blocks(sb, inode, map->s_fspace.s_table, + bloc, offset, count); + } +} + +inline int udf_prealloc_blocks(struct super_block *sb, + struct inode *inode, + uint16_t partition, uint32_t first_block, + uint32_t block_count) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + return udf_bitmap_prealloc_blocks(sb, inode, + map->s_uspace.s_bitmap, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + return udf_table_prealloc_blocks(sb, inode, + map->s_uspace.s_table, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + return udf_bitmap_prealloc_blocks(sb, inode, + map->s_fspace.s_bitmap, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + return udf_table_prealloc_blocks(sb, inode, + map->s_fspace.s_table, + partition, first_block, + block_count); + else + return 0; +} + +inline int udf_new_block(struct super_block *sb, + struct inode *inode, + uint16_t partition, uint32_t goal, int *err) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + return udf_bitmap_new_block(sb, inode, + map->s_uspace.s_bitmap, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + return udf_table_new_block(sb, inode, + map->s_uspace.s_table, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + return udf_bitmap_new_block(sb, inode, + map->s_fspace.s_bitmap, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + return udf_table_new_block(sb, inode, + map->s_fspace.s_table, + partition, goal, err); + else { + *err = -EIO; + return 0; + } +} diff --git a/fs/udf/dir.c b/fs/udf/dir.c new file mode 100644 index 00000000..eb8bfe2b --- /dev/null +++ b/fs/udf/dir.c @@ -0,0 +1,210 @@ +/* + * dir.c + * + * PURPOSE + * Directory handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. 
+ * + * (C) 1998-2004 Ben Fennema + * + * HISTORY + * + * 10/05/98 dgb Split directory operations into its own file + * Implemented directory reads via do_udf_readdir + * 10/06/98 Made directory operations work! + * 11/17/98 Rewrote directory to support ICBTAG_FLAG_AD_LONG + * 11/25/98 blf Rewrote directory handling (readdir+lookup) to support reading + * across blocks. + * 12/12/98 Split out the lookup code to namei.c. bulk of directory + * code now in directory.c:udf_fileident_read. + */ + +#include "udfdecl.h" + +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static int do_udf_readdir(struct inode *dir, struct file *filp, + filldir_t filldir, void *dirent) +{ + struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; + struct fileIdentDesc *fi = NULL; + struct fileIdentDesc cfi; + int block, iblock; + loff_t nf_pos = (filp->f_pos - 1) << 2; + int flen; + unsigned char *fname = NULL; + unsigned char *nameptr; + uint16_t liu; + uint8_t lfi; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + struct buffer_head *tmp, *bha[16]; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + int i, num, ret = 0; + unsigned int dt_type; + struct extent_position epos = { NULL, 0, {0, 0} }; + struct udf_inode_info *iinfo; + + if (nf_pos >= size) + goto out; + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) { + ret = -ENOMEM; + goto out; + } + + if (nf_pos == 0) + nf_pos = udf_ext0_offset(dir); + + fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); + iinfo = UDF_I(dir); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) + != (EXT_RECORDED_ALLOCATED >> 30)) { + ret = -ENOENT; + goto out; + } + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == + ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else { + offset = 0; + } + + if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { + ret = -EIO; + goto out; + } + + if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { + i = 16 >> (dir->i_sb->s_blocksize_bits - 9); + if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) + i = (elen >> dir->i_sb->s_blocksize_bits) - offset; + for (num = 0; i > 0; i--) { + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); + tmp = udf_tgetblk(dir->i_sb, block); + if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + ll_rw_block(READA, num, bha); + for (i = 0; i < num; i++) + brelse(bha[i]); + } + } + } + + while (nf_pos < size) { + filp->f_pos = (nf_pos >> 2) + 1; + + fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, + &elen, &offset); + if (!fi) + goto out; + + liu = le16_to_cpu(cfi.lengthOfImpUse); + lfi = cfi.lengthFileIdent; + + if (fibh.sbh == fibh.ebh) { + nameptr = fi->fileIdent + liu; + } else { + int poffset; /* Unpaded ending offset */ + + poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; + + if (poffset >= lfi) { + nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); + } else { + nameptr = fname; + memcpy(nameptr, fi->fileIdent + liu, + lfi - poffset); + memcpy(nameptr + lfi - poffset, + fibh.ebh->b_data, poffset); + } + } + + if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if 
(!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + continue; + } + + if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + continue; + } + + if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { + iblock = parent_ino(filp->f_path.dentry); + flen = 2; + memcpy(fname, "..", flen); + dt_type = DT_DIR; + } else { + struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); + + iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); + flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + dt_type = DT_UNKNOWN; + } + + if (flen && filldir(dirent, fname, flen, filp->f_pos, + iblock, dt_type) < 0) + goto out; + } /* end while */ + + filp->f_pos = (nf_pos >> 2) + 1; + +out: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + kfree(fname); + + return ret; +} + +static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *dir = filp->f_path.dentry->d_inode; + int result; + + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { + return 0; + } + filp->f_pos++; + } + + result = do_udf_readdir(dir, filp, filldir, dirent); + return result; +} + +/* readdir and lookup functions */ +const struct file_operations udf_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = udf_readdir, + .unlocked_ioctl = udf_ioctl, + .fsync = generic_file_fsync, +}; diff --git a/fs/udf/directory.c b/fs/udf/directory.c new file mode 100644 index 00000000..2ffdb673 --- /dev/null +++ b/fs/udf/directory.c @@ -0,0 +1,241 @@ +/* + * directory.c + * + * PURPOSE + * Directory related functions + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + */ + +#include "udfdecl.h" +#include "udf_i.h" + +#include +#include +#include + +struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi, + struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, + sector_t *offset) +{ + struct fileIdentDesc *fi; + int i, num, block; + struct buffer_head *tmp, *bha[16]; + struct udf_inode_info *iinfo = UDF_I(dir); + + fibh->soffset = fibh->eoffset; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + fi = udf_get_fileident(iinfo->i_ext.i_data - + (iinfo->i_efe ? 
+ sizeof(struct extendedFileEntry) : + sizeof(struct fileEntry)), + dir->i_sb->s_blocksize, + &(fibh->eoffset)); + if (!fi) + return NULL; + + *nf_pos += fibh->eoffset - fibh->soffset; + + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + + return fi; + } + + if (fibh->eoffset == dir->i_sb->s_blocksize) { + int lextoffset = epos->offset; + unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; + + if (udf_next_aext(dir, epos, eloc, elen, 1) != + (EXT_RECORDED_ALLOCATED >> 30)) + return NULL; + + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + + (*offset)++; + + if ((*offset << blocksize_bits) >= *elen) + *offset = 0; + else + epos->offset = lextoffset; + + brelse(fibh->sbh); + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) + return NULL; + fibh->soffset = fibh->eoffset = 0; + + if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) { + i = 16 >> (blocksize_bits - 9); + if (i + *offset > (*elen >> blocksize_bits)) + i = (*elen >> blocksize_bits)-*offset; + for (num = 0; i > 0; i--) { + block = udf_get_lb_pblock(dir->i_sb, eloc, + *offset + i); + tmp = udf_tgetblk(dir->i_sb, block); + if (tmp && !buffer_uptodate(tmp) && + !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + ll_rw_block(READA, num, bha); + for (i = 0; i < num; i++) + brelse(bha[i]); + } + } + } else if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize, + &(fibh->eoffset)); + + if (!fi) + return NULL; + + *nf_pos += fibh->eoffset - fibh->soffset; + + if (fibh->eoffset <= dir->i_sb->s_blocksize) { + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + } else if (fibh->eoffset > dir->i_sb->s_blocksize) { + int lextoffset = epos->offset; + + if (udf_next_aext(dir, epos, eloc, elen, 1) != + (EXT_RECORDED_ALLOCATED >> 30)) + return NULL; + + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + + (*offset)++; + + if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen) + *offset = 0; + else + epos->offset = lextoffset; + + fibh->soffset -= dir->i_sb->s_blocksize; + fibh->eoffset -= dir->i_sb->s_blocksize; + + fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->ebh) + return NULL; + + if (sizeof(struct fileIdentDesc) > -fibh->soffset) { + int fi_len; + + memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset); + memcpy((uint8_t *)cfi - fibh->soffset, + fibh->ebh->b_data, + sizeof(struct fileIdentDesc) + fibh->soffset); + + fi_len = (sizeof(struct fileIdentDesc) + + cfi->lengthFileIdent + + le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3; + + *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); + fibh->eoffset = fibh->soffset + fi_len; + } else { + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + } + } + return fi; +} + +struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) +{ + struct fileIdentDesc *fi; + int lengthThisIdent; + uint8_t *ptr; + int padlen; + + if ((!buffer) || (!offset)) { + udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, + offset); + return NULL; + } + + ptr = buffer; + + if ((*offset > 0) && (*offset < bufsize)) + ptr += *offset; + fi = (struct fileIdentDesc *)ptr; + if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { + udf_debug("0x%x != TAG_IDENT_FID\n", + le16_to_cpu(fi->descTag.tagIdent)); + udf_debug("offset: %u sizeof: %lu bufsize: %u\n", + *offset, (unsigned long)sizeof(struct fileIdentDesc), + bufsize); + return NULL; + } + if ((*offset + 
sizeof(struct fileIdentDesc)) > bufsize) + lengthThisIdent = sizeof(struct fileIdentDesc); + else + lengthThisIdent = sizeof(struct fileIdentDesc) + + fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse); + + /* we need to figure padding, too! */ + padlen = lengthThisIdent % UDF_NAME_PAD; + if (padlen) + lengthThisIdent += (UDF_NAME_PAD - padlen); + *offset = *offset + lengthThisIdent; + + return fi; +} + +struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, + int inc) +{ + struct short_ad *sa; + + if ((!ptr) || (!offset)) { + printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); + return NULL; + } + + if ((*offset + sizeof(struct short_ad)) > maxoffset) + return NULL; + else { + sa = (struct short_ad *)ptr; + if (sa->extLength == 0) + return NULL; + } + + if (inc) + *offset += sizeof(struct short_ad); + return sa; +} + +struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) +{ + struct long_ad *la; + + if ((!ptr) || (!offset)) { + printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); + return NULL; + } + + if ((*offset + sizeof(struct long_ad)) > maxoffset) + return NULL; + else { + la = (struct long_ad *)ptr; + if (la->extLength == 0) + return NULL; + } + + if (inc) + *offset += sizeof(struct long_ad); + return la; +} diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h new file mode 100644 index 00000000..4792b771 --- /dev/null +++ b/fs/udf/ecma_167.h @@ -0,0 +1,796 @@ +/* + * ecma_167.h + * + * This file is based on ECMA-167 3rd edition (June 1997) + * http://www.ecma.ch + * + * Copyright (c) 2001-2002 Ben Fennema + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#ifndef _ECMA_167_H +#define _ECMA_167_H 1 + +/* Character set specification (ECMA 167r3 1/7.2.1) */ +struct charspec { + uint8_t charSetType; + uint8_t charSetInfo[63]; +} __attribute__ ((packed)); + +/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ +#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ +#define CHARSPEC_TYPE_CS1 0x01 /* (1/7.2.3) */ +#define CHARSPEC_TYPE_CS2 0x02 /* (1/7.2.4) */ +#define CHARSPEC_TYPE_CS3 0x03 /* (1/7.2.5) */ +#define CHARSPEC_TYPE_CS4 0x04 /* (1/7.2.6) */ +#define CHARSPEC_TYPE_CS5 0x05 /* (1/7.2.7) */ +#define CHARSPEC_TYPE_CS6 0x06 /* (1/7.2.8) */ +#define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */ +#define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */ + +typedef uint8_t dstring; + +/* Timestamp (ECMA 167r3 1/7.3) */ +struct timestamp { + __le16 typeAndTimezone; + __le16 year; + uint8_t month; + uint8_t day; + uint8_t hour; + uint8_t minute; + uint8_t second; + uint8_t centiseconds; + uint8_t hundredsOfMicroseconds; + uint8_t microseconds; +} __attribute__ ((packed)); + +/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ +#define TIMESTAMP_TYPE_MASK 0xF000 +#define TIMESTAMP_TYPE_CUT 0x0000 +#define TIMESTAMP_TYPE_LOCAL 0x1000 +#define TIMESTAMP_TYPE_AGREEMENT 0x2000 +#define TIMESTAMP_TIMEZONE_MASK 0x0FFF + +/* Entity identifier (ECMA 167r3 1/7.4) */ +struct regid { + uint8_t flags; + uint8_t ident[23]; + uint8_t identSuffix[8]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 1/7.4.1) */ +#define ENTITYID_FLAGS_DIRTY 0x00 +#define ENTITYID_FLAGS_PROTECTED 0x01 + +/* Volume Structure Descriptor (ECMA 167r3 2/9.1) */ +#define VSD_STD_ID_LEN 5 +struct volStructDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Standard Identifier (EMCA 167r2 2/9.1.2) */ +#define VSD_STD_ID_NSR02 "NSR02" /* (3/9.1) */ + +/* Standard Identifier (ECMA 167r3 2/9.1.2) */ +#define VSD_STD_ID_BEA01 "BEA01" /* (2/9.2) */ +#define VSD_STD_ID_BOOT2 "BOOT2" /* (2/9.4) */ +#define VSD_STD_ID_CD001 "CD001" /* (ECMA-119) */ +#define VSD_STD_ID_CDW02 "CDW02" /* (ECMA-168) */ +#define VSD_STD_ID_NSR03 "NSR03" /* (3/9.1) */ +#define VSD_STD_ID_TEA01 "TEA01" /* (2/9.3) */ + +/* Beginning Extended Area Descriptor (ECMA 167r3 2/9.2) */ +struct beginningExtendedAreaDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Terminating Extended Area Descriptor (ECMA 167r3 2/9.3) */ +struct terminatingExtendedAreaDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Boot Descriptor (ECMA 167r3 2/9.4) */ +struct bootDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved1; + struct regid archType; + struct regid bootIdent; + __le32 bootExtLocation; + __le32 bootExtLength; + __le64 loadAddress; + __le64 startAddress; + struct timestamp descCreationDateAndTime; + __le16 flags; + uint8_t reserved2[32]; + uint8_t bootUse[1906]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 2/9.4.12) */ +#define BOOT_FLAGS_ERASE 0x01 + +/* Extent Descriptor (ECMA 167r3 3/7.1) */ +struct extent_ad { + __le32 extLength; + __le32 extLocation; +} __attribute__ ((packed)); + +struct kernel_extent_ad { + uint32_t extLength; + uint32_t extLocation; +}; + +/* Descriptor Tag (ECMA 167r3 3/7.2) */ +struct tag { + __le16 tagIdent; + __le16 descVersion; + uint8_t tagChecksum; 
+ uint8_t reserved; + __le16 tagSerialNum; + __le16 descCRC; + __le16 descCRCLength; + __le32 tagLocation; +} __attribute__ ((packed)); + +/* Tag Identifier (ECMA 167r3 3/7.2.1) */ +#define TAG_IDENT_PVD 0x0001 +#define TAG_IDENT_AVDP 0x0002 +#define TAG_IDENT_VDP 0x0003 +#define TAG_IDENT_IUVD 0x0004 +#define TAG_IDENT_PD 0x0005 +#define TAG_IDENT_LVD 0x0006 +#define TAG_IDENT_USD 0x0007 +#define TAG_IDENT_TD 0x0008 +#define TAG_IDENT_LVID 0x0009 + +/* NSR Descriptor (ECMA 167r3 3/9.1) */ +struct NSRDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved; + uint8_t structData[2040]; +} __attribute__ ((packed)); + +/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ +struct primaryVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le32 primaryVolDescNum; + dstring volIdent[32]; + __le16 volSeqNum; + __le16 maxVolSeqNum; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + dstring volSetIdent[128]; + struct charspec descCharSet; + struct charspec explanatoryCharSet; + struct extent_ad volAbstract; + struct extent_ad volCopyright; + struct regid appIdent; + struct timestamp recordingDateAndTime; + struct regid impIdent; + uint8_t impUse[64]; + __le32 predecessorVolDescSeqLocation; + __le16 flags; + uint8_t reserved[22]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 3/10.1.21) */ +#define PVD_FLAGS_VSID_COMMON 0x0001 + +/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ +struct anchorVolDescPtr { + struct tag descTag; + struct extent_ad mainVolDescSeqExt; + struct extent_ad reserveVolDescSeqExt; + uint8_t reserved[480]; +} __attribute__ ((packed)); + +/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ +struct volDescPtr { + struct tag descTag; + __le32 volDescSeqNum; + struct extent_ad nextVolDescSeqExt; + uint8_t reserved[484]; +} __attribute__ ((packed)); + +/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ +struct impUseVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + struct regid impIdent; + uint8_t impUse[460]; +} __attribute__ ((packed)); + +/* Partition Descriptor (ECMA 167r3 3/10.5) */ +struct partitionDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le16 partitionFlags; + __le16 partitionNumber; + struct regid partitionContents; + uint8_t partitionContentsUse[128]; + __le32 accessType; + __le32 partitionStartingLocation; + __le32 partitionLength; + struct regid impIdent; + uint8_t impUse[128]; + uint8_t reserved[156]; +} __attribute__ ((packed)); + +/* Partition Flags (ECMA 167r3 3/10.5.3) */ +#define PD_PARTITION_FLAGS_ALLOC 0x0001 + +/* Partition Contents (ECMA 167r2 3/10.5.3) */ +#define PD_PARTITION_CONTENTS_NSR02 "+NSR02" + +/* Partition Contents (ECMA 167r3 3/10.5.5) */ +#define PD_PARTITION_CONTENTS_FDC01 "+FDC01" +#define PD_PARTITION_CONTENTS_CD001 "+CD001" +#define PD_PARTITION_CONTENTS_CDW02 "+CDW02" +#define PD_PARTITION_CONTENTS_NSR03 "+NSR03" + +/* Access Type (ECMA 167r3 3/10.5.7) */ +#define PD_ACCESS_TYPE_NONE 0x00000000 +#define PD_ACCESS_TYPE_READ_ONLY 0x00000001 +#define PD_ACCESS_TYPE_WRITE_ONCE 0x00000002 +#define PD_ACCESS_TYPE_REWRITABLE 0x00000003 +#define PD_ACCESS_TYPE_OVERWRITABLE 0x00000004 + +/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ +struct logicalVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + struct charspec descCharSet; + dstring logicalVolIdent[128]; + __le32 logicalBlockSize; + struct regid domainIdent; + uint8_t logicalVolContentsUse[16]; + __le32 mapTableLength; + __le32 
numPartitionMaps; + struct regid impIdent; + uint8_t impUse[128]; + struct extent_ad integritySeqExt; + uint8_t partitionMaps[0]; +} __attribute__ ((packed)); + +/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ +struct genericPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t partitionMapping[0]; +} __attribute__ ((packed)); + +/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ +#define GP_PARTITION_MAP_TYPE_UNDEF 0x00 +#define GP_PARTIITON_MAP_TYPE_1 0x01 +#define GP_PARTITION_MAP_TYPE_2 0x02 + +/* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */ +struct genericPartitionMap1 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + __le16 volSeqNum; + __le16 partitionNum; +} __attribute__ ((packed)); + +/* Type 2 Partition Map (ECMA 167r3 3/10.7.3) */ +struct genericPartitionMap2 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t partitionIdent[62]; +} __attribute__ ((packed)); + +/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ +struct unallocSpaceDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le32 numAllocDescs; + struct extent_ad allocDescs[0]; +} __attribute__ ((packed)); + +/* Terminating Descriptor (ECMA 167r3 3/10.9) */ +struct terminatingDesc { + struct tag descTag; + uint8_t reserved[496]; +} __attribute__ ((packed)); + +/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ +struct logicalVolIntegrityDesc { + struct tag descTag; + struct timestamp recordingDateAndTime; + __le32 integrityType; + struct extent_ad nextIntegrityExt; + uint8_t logicalVolContentsUse[32]; + __le32 numOfPartitions; + __le32 lengthOfImpUse; + __le32 freeSpaceTable[0]; + __le32 sizeTable[0]; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Integrity Type (ECMA 167r3 3/10.10.3) */ +#define LVID_INTEGRITY_TYPE_OPEN 0x00000000 +#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 + +/* Recorded Address (ECMA 167r3 4/7.1) */ +struct lb_addr { + __le32 logicalBlockNum; + __le16 partitionReferenceNum; +} __attribute__ ((packed)); + +/* ... 
and its in-core analog */ +struct kernel_lb_addr { + uint32_t logicalBlockNum; + uint16_t partitionReferenceNum; +}; + +/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ +struct short_ad { + __le32 extLength; + __le32 extPosition; +} __attribute__ ((packed)); + +/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ +struct long_ad { + __le32 extLength; + struct lb_addr extLocation; + uint8_t impUse[6]; +} __attribute__ ((packed)); + +struct kernel_long_ad { + uint32_t extLength; + struct kernel_lb_addr extLocation; + uint8_t impUse[6]; +}; + +/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ +struct ext_ad { + __le32 extLength; + __le32 recordedLength; + __le32 informationLength; + struct lb_addr extLocation; +} __attribute__ ((packed)); + +struct kernel_ext_ad { + uint32_t extLength; + uint32_t recordedLength; + uint32_t informationLength; + struct kernel_lb_addr extLocation; +}; + +/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ + +/* Tag Identifier (ECMA 167r3 4/7.2.1) */ +#define TAG_IDENT_FSD 0x0100 +#define TAG_IDENT_FID 0x0101 +#define TAG_IDENT_AED 0x0102 +#define TAG_IDENT_IE 0x0103 +#define TAG_IDENT_TE 0x0104 +#define TAG_IDENT_FE 0x0105 +#define TAG_IDENT_EAHD 0x0106 +#define TAG_IDENT_USE 0x0107 +#define TAG_IDENT_SBD 0x0108 +#define TAG_IDENT_PIE 0x0109 +#define TAG_IDENT_EFE 0x010A + +/* File Set Descriptor (ECMA 167r3 4/14.1) */ +struct fileSetDesc { + struct tag descTag; + struct timestamp recordingDateAndTime; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + __le32 fileSetNum; + __le32 fileSetDescNum; + struct charspec logicalVolIdentCharSet; + dstring logicalVolIdent[128]; + struct charspec fileSetCharSet; + dstring fileSetIdent[32]; + dstring copyrightFileIdent[32]; + dstring abstractFileIdent[32]; + struct long_ad rootDirectoryICB; + struct regid domainIdent; + struct long_ad nextExt; + struct long_ad streamDirectoryICB; + uint8_t reserved[32]; +} __attribute__ ((packed)); + +/* Partition Header Descriptor (ECMA 167r3 4/14.3) */ +struct partitionHeaderDesc { + struct short_ad unallocSpaceTable; + struct short_ad unallocSpaceBitmap; + struct short_ad partitionIntegrityTable; + struct short_ad freedSpaceTable; + struct short_ad freedSpaceBitmap; + uint8_t reserved[88]; +} __attribute__ ((packed)); + +/* File Identifier Descriptor (ECMA 167r3 4/14.4) */ +struct fileIdentDesc { + struct tag descTag; + __le16 fileVersionNum; + uint8_t fileCharacteristics; + uint8_t lengthFileIdent; + struct long_ad icb; + __le16 lengthOfImpUse; + uint8_t impUse[0]; + uint8_t fileIdent[0]; + uint8_t padding[0]; +} __attribute__ ((packed)); + +/* File Characteristics (ECMA 167r3 4/14.4.3) */ +#define FID_FILE_CHAR_HIDDEN 0x01 +#define FID_FILE_CHAR_DIRECTORY 0x02 +#define FID_FILE_CHAR_DELETED 0x04 +#define FID_FILE_CHAR_PARENT 0x08 +#define FID_FILE_CHAR_METADATA 0x10 + +/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ +struct allocExtDesc { + struct tag descTag; + __le32 previousAllocExtLocation; + __le32 lengthAllocDescs; +} __attribute__ ((packed)); + +/* ICB Tag (ECMA 167r3 4/14.6) */ +struct icbtag { + __le32 priorRecordedNumDirectEntries; + __le16 strategyType; + __le16 strategyParameter; + __le16 numEntries; + uint8_t reserved; + uint8_t fileType; + struct lb_addr parentICBLocation; + __le16 flags; +} __attribute__ ((packed)); + +/* Strategy Type (ECMA 167r3 4/14.6.2) */ +#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 +#define ICBTAG_STRATEGY_TYPE_1 0x0001 +#define ICBTAG_STRATEGY_TYPE_2 0x0002 +#define 
ICBTAG_STRATEGY_TYPE_3 0x0003 +#define ICBTAG_STRATEGY_TYPE_4 0x0004 + +/* File Type (ECMA 167r3 4/14.6.6) */ +#define ICBTAG_FILE_TYPE_UNDEF 0x00 +#define ICBTAG_FILE_TYPE_USE 0x01 +#define ICBTAG_FILE_TYPE_PIE 0x02 +#define ICBTAG_FILE_TYPE_IE 0x03 +#define ICBTAG_FILE_TYPE_DIRECTORY 0x04 +#define ICBTAG_FILE_TYPE_REGULAR 0x05 +#define ICBTAG_FILE_TYPE_BLOCK 0x06 +#define ICBTAG_FILE_TYPE_CHAR 0x07 +#define ICBTAG_FILE_TYPE_EA 0x08 +#define ICBTAG_FILE_TYPE_FIFO 0x09 +#define ICBTAG_FILE_TYPE_SOCKET 0x0A +#define ICBTAG_FILE_TYPE_TE 0x0B +#define ICBTAG_FILE_TYPE_SYMLINK 0x0C +#define ICBTAG_FILE_TYPE_STREAMDIR 0x0D + +/* Flags (ECMA 167r3 4/14.6.8) */ +#define ICBTAG_FLAG_AD_MASK 0x0007 +#define ICBTAG_FLAG_AD_SHORT 0x0000 +#define ICBTAG_FLAG_AD_LONG 0x0001 +#define ICBTAG_FLAG_AD_EXTENDED 0x0002 +#define ICBTAG_FLAG_AD_IN_ICB 0x0003 +#define ICBTAG_FLAG_SORTED 0x0008 +#define ICBTAG_FLAG_NONRELOCATABLE 0x0010 +#define ICBTAG_FLAG_ARCHIVE 0x0020 +#define ICBTAG_FLAG_SETUID 0x0040 +#define ICBTAG_FLAG_SETGID 0x0080 +#define ICBTAG_FLAG_STICKY 0x0100 +#define ICBTAG_FLAG_CONTIGUOUS 0x0200 +#define ICBTAG_FLAG_SYSTEM 0x0400 +#define ICBTAG_FLAG_TRANSFORMED 0x0800 +#define ICBTAG_FLAG_MULTIVERSIONS 0x1000 +#define ICBTAG_FLAG_STREAM 0x2000 + +/* Indirect Entry (ECMA 167r3 4/14.7) */ +struct indirectEntry { + struct tag descTag; + struct icbtag icbTag; + struct long_ad indirectICB; +} __attribute__ ((packed)); + +/* Terminal Entry (ECMA 167r3 4/14.8) */ +struct terminalEntry { + struct tag descTag; + struct icbtag icbTag; +} __attribute__ ((packed)); + +/* File Entry (ECMA 167r3 4/14.9) */ +struct fileEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp attrTime; + __le32 checkpoint; + struct long_ad extendedAttrICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +/* Permissions (ECMA 167r3 4/14.9.5) */ +#define FE_PERM_O_EXEC 0x00000001U +#define FE_PERM_O_WRITE 0x00000002U +#define FE_PERM_O_READ 0x00000004U +#define FE_PERM_O_CHATTR 0x00000008U +#define FE_PERM_O_DELETE 0x00000010U +#define FE_PERM_G_EXEC 0x00000020U +#define FE_PERM_G_WRITE 0x00000040U +#define FE_PERM_G_READ 0x00000080U +#define FE_PERM_G_CHATTR 0x00000100U +#define FE_PERM_G_DELETE 0x00000200U +#define FE_PERM_U_EXEC 0x00000400U +#define FE_PERM_U_WRITE 0x00000800U +#define FE_PERM_U_READ 0x00001000U +#define FE_PERM_U_CHATTR 0x00002000U +#define FE_PERM_U_DELETE 0x00004000U + +/* Record Format (ECMA 167r3 4/14.9.7) */ +#define FE_RECORD_FMT_UNDEF 0x00 +#define FE_RECORD_FMT_FIXED_PAD 0x01 +#define FE_RECORD_FMT_FIXED 0x02 +#define FE_RECORD_FMT_VARIABLE8 0x03 +#define FE_RECORD_FMT_VARIABLE16 0x04 +#define FE_RECORD_FMT_VARIABLE16_MSB 0x05 +#define FE_RECORD_FMT_VARIABLE32 0x06 +#define FE_RECORD_FMT_PRINT 0x07 +#define FE_RECORD_FMT_LF 0x08 +#define FE_RECORD_FMT_CR 0x09 +#define FE_RECORD_FMT_CRLF 0x0A +#define FE_RECORD_FMT_LFCR 0x0B + +/* Record Display Attributes (ECMA 167r3 4/14.9.8) */ +#define FE_RECORD_DISPLAY_ATTR_UNDEF 0x00 +#define FE_RECORD_DISPLAY_ATTR_1 0x01 +#define FE_RECORD_DISPLAY_ATTR_2 0x02 +#define FE_RECORD_DISPLAY_ATTR_3 0x03 + +/* Extended Attribute Header Descriptor 
(ECMA 167r3 4/14.10.1) */ +struct extendedAttrHeaderDesc { + struct tag descTag; + __le32 impAttrLocation; + __le32 appAttrLocation; +} __attribute__ ((packed)); + +/* Generic Format (ECMA 167r3 4/14.10.2) */ +struct genericFormat { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + uint8_t attrData[0]; +} __attribute__ ((packed)); + +/* Character Set Information (ECMA 167r3 4/14.10.3) */ +struct charSetInfo { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 escapeSeqLength; + uint8_t charSetType; + uint8_t escapeSeq[0]; +} __attribute__ ((packed)); + +/* Alternate Permissions (ECMA 167r3 4/14.10.4) */ +struct altPerms { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le16 ownerIdent; + __le16 groupIdent; + __le16 permission; +} __attribute__ ((packed)); + +/* File Times Extended Attribute (ECMA 167r3 4/14.10.5) */ +struct fileTimesExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 dataLength; + __le32 fileTimeExistence; + uint8_t fileTimes; +} __attribute__ ((packed)); + +/* FileTimeExistence (ECMA 167r3 4/14.10.5.6) */ +#define FTE_CREATION 0x00000001 +#define FTE_DELETION 0x00000004 +#define FTE_EFFECTIVE 0x00000008 +#define FTE_BACKUP 0x00000002 + +/* Information Times Extended Attribute (ECMA 167r3 4/14.10.6) */ +struct infoTimesExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 dataLength; + __le32 infoTimeExistence; + uint8_t infoTimes[0]; +} __attribute__ ((packed)); + +/* Device Specification (ECMA 167r3 4/14.10.7) */ +struct deviceSpec { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 impUseLength; + __le32 majorDeviceIdent; + __le32 minorDeviceIdent; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ +struct impUseExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 impUseLength; + struct regid impIdent; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ +struct appUseExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 appUseLength; + struct regid appIdent; + uint8_t appUse[0]; +} __attribute__ ((packed)); + +#define EXTATTR_CHAR_SET 1 +#define EXTATTR_ALT_PERMS 3 +#define EXTATTR_FILE_TIMES 5 +#define EXTATTR_INFO_TIMES 6 +#define EXTATTR_DEV_SPEC 12 +#define EXTATTR_IMP_USE 2048 +#define EXTATTR_APP_USE 65536 + +/* Unallocated Space Entry (ECMA 167r3 4/14.11) */ +struct unallocSpaceEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 lengthAllocDescs; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ +struct spaceBitmapDesc { + struct tag descTag; + __le32 numOfBits; + __le32 numOfBytes; + uint8_t bitmap[0]; +} __attribute__ ((packed)); + +/* Partition Integrity Entry (ECMA 167r3 4/14.13) */ +struct partitionIntegrityEntry { + struct tag descTag; + struct icbtag icbTag; + struct timestamp recordingDateAndTime; + uint8_t integrityType; + uint8_t reserved[175]; + struct regid impIdent; + uint8_t impUse[256]; +} __attribute__ ((packed)); + +/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ + +/* Extent Length (ECMA 167r3 4/14.14.1.1) */ +#define EXT_RECORDED_ALLOCATED 0x00000000 +#define 
EXT_NOT_RECORDED_ALLOCATED 0x40000000 +#define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000 +#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000 + +/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ + +/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ + +/* Logical Volume Header Descriptor (ECMA 167r3 4/14.15) */ +struct logicalVolHeaderDesc { + __le64 uniqueID; + uint8_t reserved[24]; +} __attribute__ ((packed)); + +/* Path Component (ECMA 167r3 4/14.16.1) */ +struct pathComponent { + uint8_t componentType; + uint8_t lengthComponentIdent; + __le16 componentFileVersionNum; + dstring componentIdent[0]; +} __attribute__ ((packed)); + +/* File Entry (ECMA 167r3 4/14.17) */ +struct extendedFileEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 objectSize; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp createTime; + struct timestamp attrTime; + __le32 checkpoint; + __le32 reserved; + struct long_ad extendedAttrICB; + struct long_ad streamDirectoryICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +#endif /* _ECMA_167_H */ diff --git a/fs/udf/file.c b/fs/udf/file.c new file mode 100644 index 00000000..3438b000 --- /dev/null +++ b/fs/udf/file.c @@ -0,0 +1,249 @@ +/* + * file.c + * + * PURPOSE + * File handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-1999 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/02/98 dgb Attempt to integrate into udf.o + * 10/07/98 Switched to using generic_readpage, etc., like isofs + * And it works! + * 12/06/98 blf Added udf_file_read. uses generic_file_read for all cases but + * ICBTAG_FLAG_AD_IN_ICB. 
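 *              (ICBTAG_FLAG_AD_IN_ICB marks files whose data is stored inline
 *              in the file entry ICB itself; udf_adinicb_readpage() and
 *              udf_adinicb_writepage() below copy that inline area to and
 *              from the page cache.)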
+ * 04/06/99 64 bit file handling on 32 bit systems taken from ext2 file.c + * 05/12/99 Preliminary file write support + */ + +#include "udfdecl.h" +#include +#include +#include +#include /* memset */ +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static int udf_adinicb_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + + kaddr = kmap(page); + memset(kaddr, 0, PAGE_CACHE_SIZE); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + + return 0; +} + +static int udf_adinicb_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + + kaddr = kmap(page); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); + mark_inode_dirty(inode); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + + return 0; +} + +static int udf_adinicb_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, + kaddr + offset, copied); + kunmap_atomic(kaddr, KM_USER0); + + return simple_write_end(file, mapping, pos, len, copied, page, fsdata); +} + +const struct address_space_operations udf_adinicb_aops = { + .readpage = udf_adinicb_readpage, + .writepage = udf_adinicb_writepage, + .write_begin = simple_write_begin, + .write_end = udf_adinicb_write_end, +}; + +static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t ppos) +{ + ssize_t retval; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + int err, pos; + size_t count = iocb->ki_left; + struct udf_inode_info *iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (file->f_flags & O_APPEND) + pos = inode->i_size; + else + pos = ppos; + + if (inode->i_sb->s_blocksize < + (udf_file_entry_alloc_offset(inode) + + pos + count)) { + err = udf_expand_file_adinicb(inode); + if (err) { + udf_debug("udf_expand_adinicb: err=%d\n", err); + return err; + } + } else { + if (pos + count > inode->i_size) + iinfo->i_lenAlloc = pos + count; + else + iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); + } + } else + up_write(&iinfo->i_data_sem); + + retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); + if (retval > 0) + mark_inode_dirty(inode); + + return retval; +} + +long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + long old_block, new_block; + int result = -EINVAL; + + if (file_permission(filp, MAY_READ) != 0) { + udf_debug("no permission to access inode %lu\n", inode->i_ino); + result = -EPERM; + goto out; + } + + if (!arg) { + udf_debug("invalid argument to udf_ioctl\n"); + result = -EINVAL; + goto out; + } + + switch (cmd) { + case UDF_GETVOLIDENT: + if (copy_to_user((char __user *)arg, + UDF_SB(inode->i_sb)->s_volume_ident, 32)) + 
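/*
 * UDF_GETVOLIDENT hands the 32-byte volume identifier cached in the
 * superblock (s_volume_ident) back to user space. A hypothetical caller
 * might do something like
 *     char ident[32];
 *     ioctl(fd, UDF_GETVOLIDENT, ident);
 * (sketch only; the request codes come from the UDF ioctl headers).
 */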
result = -EFAULT; + else + result = 0; + goto out; + case UDF_RELOCATE_BLOCKS: + if (!capable(CAP_SYS_ADMIN)) { + result = -EACCES; + goto out; + } + if (get_user(old_block, (long __user *)arg)) { + result = -EFAULT; + goto out; + } + result = udf_relocate_blocks(inode->i_sb, + old_block, &new_block); + if (result == 0) + result = put_user(new_block, (long __user *)arg); + goto out; + case UDF_GETEASIZE: + result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); + goto out; + case UDF_GETEABLOCK: + result = copy_to_user((char __user *)arg, + UDF_I(inode)->i_ext.i_data, + UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; + goto out; + } + +out: + return result; +} + +static int udf_release_file(struct inode *inode, struct file *filp) +{ + if (filp->f_mode & FMODE_WRITE) { + down_write(&UDF_I(inode)->i_data_sem); + udf_discard_prealloc(inode); + udf_truncate_tail_extent(inode); + up_write(&UDF_I(inode)->i_data_sem); + } + return 0; +} + +const struct file_operations udf_file_operations = { + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .unlocked_ioctl = udf_ioctl, + .open = generic_file_open, + .mmap = generic_file_mmap, + .write = do_sync_write, + .aio_write = udf_file_aio_write, + .release = udf_release_file, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, +}; + +static int udf_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = udf_setsize(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations udf_file_inode_operations = { + .setattr = udf_setattr, +}; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c new file mode 100644 index 00000000..6fb7e0ad --- /dev/null +++ b/fs/udf/ialloc.c @@ -0,0 +1,132 @@ +/* + * ialloc.c + * + * PURPOSE + * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * + * HISTORY + * + * 02/24/99 blf Created. 
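 *            (udf_new_inode() below reserves a block for the new file entry
 *            with udf_new_block() and updates the numFiles/numDirs counters
 *            kept in the logical volume integrity descriptor.)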
+ * + */ + +#include "udfdecl.h" +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +void udf_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu = + udf_sb_lvidiu(sbi); + if (S_ISDIR(inode->i_mode)) + le32_add_cpu(&lvidiu->numDirs, -1); + else + le32_add_cpu(&lvidiu->numFiles, -1); + udf_updated_lvid(sb); + } + mutex_unlock(&sbi->s_alloc_mutex); + + udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); +} + +struct inode *udf_new_inode(struct inode *dir, int mode, int *err) +{ + struct super_block *sb = dir->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct inode *inode; + int block; + uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; + struct udf_inode_info *iinfo; + struct udf_inode_info *dinfo = UDF_I(dir); + + inode = new_inode(sb); + + if (!inode) { + *err = -ENOMEM; + return NULL; + } + *err = -ENOSPC; + + iinfo = UDF_I(inode); + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { + iinfo->i_efe = 1; + if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) + sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); + } else { + iinfo->i_efe = 0; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); + } + if (!iinfo->i_ext.i_data) { + iput(inode); + *err = -ENOMEM; + return NULL; + } + + block = udf_new_block(dir->i_sb, NULL, + dinfo->i_location.partitionReferenceNum, + start, err); + if (*err) { + iput(inode); + return NULL; + } + + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu; + + iinfo->i_unique = lvid_get_unique_id(sb); + mutex_lock(&sbi->s_alloc_mutex); + lvidiu = udf_sb_lvidiu(sbi); + if (S_ISDIR(mode)) + le32_add_cpu(&lvidiu->numDirs, 1); + else + le32_add_cpu(&lvidiu->numFiles, 1); + udf_updated_lvid(sb); + mutex_unlock(&sbi->s_alloc_mutex); + } + + inode_init_owner(inode, dir, mode); + + iinfo->i_location.logicalBlockNum = block; + iinfo->i_location.partitionReferenceNum = + dinfo->i_location.partitionReferenceNum; + inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); + inode->i_blocks = 0; + iinfo->i_lenEAttr = 0; + iinfo->i_lenAlloc = 0; + iinfo->i_use = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + inode->i_mtime = inode->i_atime = inode->i_ctime = + iinfo->i_crtime = current_fs_time(inode->i_sb); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + *err = 0; + return inode; +} diff --git a/fs/udf/inode.c b/fs/udf/inode.c new file mode 100644 index 00000000..262050f2 --- /dev/null +++ b/fs/udf/inode.c @@ -0,0 +1,2166 @@ +/* + * inode.c + * + * PURPOSE + * Inode handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. 
+ * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/04/98 dgb Added rudimentary directory functions + * 10/07/98 Fully working udf_block_map! It works! + * 11/25/98 bmap altered to better support extents + * 12/06/98 blf partition support in udf_iget, udf_block_map + * and udf_read_inode + * 12/12/98 rewrote udf_block_map to handle next extents and descs across + * block boundaries (which is not actually allowed) + * 12/20/98 added support for strategy 4096 + * 03/07/99 rewrote udf_block_map (again) + * New funcs, inode_bmap, udf_next_aext + * 04/19/99 Support for writing device EA's for major/minor # + */ + +#include "udfdecl.h" +#include +#include +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +MODULE_AUTHOR("Ben Fennema"); +MODULE_DESCRIPTION("Universal Disk Format Filesystem"); +MODULE_LICENSE("GPL"); + +#define EXTENT_MERGE_SIZE 5 + +static mode_t udf_convert_permissions(struct fileEntry *); +static int udf_update_inode(struct inode *, int); +static void udf_fill_inode(struct inode *, struct buffer_head *); +static int udf_sync_inode(struct inode *inode); +static int udf_alloc_i_data(struct inode *inode, size_t size); +static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, + sector_t *, int *); +static int8_t udf_insert_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); +static void udf_split_extents(struct inode *, int *, int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_prealloc_extents(struct inode *, int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_merge_extents(struct inode *, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_update_extents(struct inode *, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int, + struct extent_position *); +static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); + + +void udf_evict_inode(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + want_delete = 1; + udf_setsize(inode, 0); + udf_update_inode(inode, IS_SYNC(inode)); + } else + truncate_inode_pages(&inode->i_data, 0); + invalidate_inode_buffers(inode); + end_writeback(inode); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " + "inode size %llu different from extent length %llu. 
" + "Filesystem need not be standards compliant.\n", + inode->i_sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); + } + kfree(iinfo->i_ext.i_data); + iinfo->i_ext.i_data = NULL; + if (want_delete) { + udf_free_inode(inode); + } +} + +static int udf_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, udf_get_block, wbc); +} + +static int udf_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, udf_get_block); +} + +static int udf_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); + if (unlikely(ret)) { + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = inode->i_size; + + if (pos + len > isize) { + truncate_pagecache(inode, pos + len, isize); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } + } + } + + return ret; +} + +static sector_t udf_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, udf_get_block); +} + +const struct address_space_operations udf_aops = { + .readpage = udf_readpage, + .writepage = udf_writepage, + .write_begin = udf_write_begin, + .write_end = generic_write_end, + .bmap = udf_bmap, +}; + +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ +int udf_expand_file_adinicb(struct inode *inode) +{ + struct page *page; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + int err; + struct writeback_control udf_wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + }; + + if (!iinfo->i_lenAlloc) { + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); + mark_inode_dirty(inode); + return 0; + } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); + + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + if (!page) + return -ENOMEM; + + if (!PageUptodate(page)) { + kaddr = kmap(page); + memset(kaddr + iinfo->i_lenAlloc, 0x00, + PAGE_CACHE_SIZE - iinfo->i_lenAlloc); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, + iinfo->i_lenAlloc); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); + } + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, + iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); + err = inode->i_data.a_ops->writepage(page, &udf_wbc); + if (err) { + /* Restore everything back so that we don't lose data... 
*/ + lock_page(page); + kaddr = kmap(page); + down_write(&iinfo->i_data_sem); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, + inode->i_size); + kunmap(page); + unlock_page(page); + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); + } + page_cache_release(page); + mark_inode_dirty(inode); + + return err; +} + +struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, + int *err) +{ + int newblock; + struct buffer_head *dbh = NULL; + struct kernel_lb_addr eloc; + uint8_t alloctype; + struct extent_position epos; + + struct udf_fileident_bh sfibh, dfibh; + loff_t f_pos = udf_ext0_offset(inode); + int size = udf_ext0_offset(inode) + inode->i_size; + struct fileIdentDesc cfi, *sfi, *dfi; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + alloctype = ICBTAG_FLAG_AD_SHORT; + else + alloctype = ICBTAG_FLAG_AD_LONG; + + if (!inode->i_size) { + iinfo->i_alloc_type = alloctype; + mark_inode_dirty(inode); + return NULL; + } + + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, err); + if (!(*block)) + return NULL; + newblock = udf_get_pblock(inode->i_sb, *block, + iinfo->i_location.partitionReferenceNum, + 0); + if (!newblock) + return NULL; + dbh = udf_tgetblk(inode->i_sb, newblock); + if (!dbh) + return NULL; + lock_buffer(dbh); + memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(dbh); + unlock_buffer(dbh); + mark_buffer_dirty_inode(dbh, inode); + + sfibh.soffset = sfibh.eoffset = + f_pos & (inode->i_sb->s_blocksize - 1); + sfibh.sbh = sfibh.ebh = NULL; + dfibh.soffset = dfibh.eoffset = 0; + dfibh.sbh = dfibh.ebh = dbh; + while (f_pos < size) { + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, + NULL, NULL, NULL); + if (!sfi) { + brelse(dbh); + return NULL; + } + iinfo->i_alloc_type = alloctype; + sfi->descTag.tagLocation = cpu_to_le32(*block); + dfibh.soffset = dfibh.eoffset; + dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); + dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); + if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, + sfi->fileIdent + + le16_to_cpu(sfi->lengthOfImpUse))) { + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + brelse(dbh); + return NULL; + } + } + mark_buffer_dirty_inode(dbh, inode); + + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0, + iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + iinfo->i_lenExtents = inode->i_size; + epos.bh = NULL; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); + /* UniqueID stuff */ + + brelse(epos.bh); + mark_inode_dirty(inode); + return dbh; +} + +static int udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int err, new; + struct buffer_head *bh; + sector_t phys = 0; + struct udf_inode_info *iinfo; + + if (!create) { + phys = udf_block_map(inode, block); + if (phys) + map_bh(bh_result, inode->i_sb, phys); + return 0; + } + + err = -EIO; + new = 0; + bh = NULL; + iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); + if (block == iinfo->i_next_alloc_block + 1) { + iinfo->i_next_alloc_block++; + iinfo->i_next_alloc_goal++; + } + + 
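/*
 * Sequential allocation hint: when this block directly follows the last one
 * we mapped, advance i_next_alloc_block/i_next_alloc_goal so inode_getblk()
 * below tries to keep the file physically contiguous.
 */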
err = 0; + + bh = inode_getblk(inode, block, &err, &phys, &new); + BUG_ON(bh); + if (err) + goto abort; + BUG_ON(!phys); + + if (new) + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, phys); + +abort: + up_write(&iinfo->i_data_sem); + return err; +} + +static struct buffer_head *udf_getblk(struct inode *inode, long block, + int create, int *err) +{ + struct buffer_head *bh; + struct buffer_head dummy; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + *err = udf_get_block(inode, block, &dummy, create); + if (!*err && buffer_mapped(&dummy)) { + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (buffer_new(&dummy)) { + lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + } + return bh; + } + + return NULL; +} + +/* Extend the file by 'blocks' blocks, return the number of extents added */ +static int udf_do_extend_file(struct inode *inode, + struct extent_position *last_pos, + struct kernel_long_ad *last_ext, + sector_t blocks) +{ + sector_t add; + int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); + struct super_block *sb = inode->i_sb; + struct kernel_lb_addr prealloc_loc = {}; + int prealloc_len = 0; + struct udf_inode_info *iinfo; + int err; + + /* The previous extent is fake and we should not extend by anything + * - there's nothing to do... */ + if (!blocks && fake) + return 0; + + iinfo = UDF_I(inode); + /* Round the last extent up to a multiple of block size */ + if (last_ext->extLength & (sb->s_blocksize - 1)) { + last_ext->extLength = + (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | + (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); + iinfo->i_lenExtents = + (iinfo->i_lenExtents + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + } + + /* Last extent are just preallocated blocks? */ + if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == + EXT_NOT_RECORDED_ALLOCATED) { + /* Save the extent so that we can reattach it to the end */ + prealloc_loc = last_ext->extLocation; + prealloc_len = last_ext->extLength; + /* Mark the extent as a hole */ + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); + last_ext->extLocation.logicalBlockNum = 0; + last_ext->extLocation.partitionReferenceNum = 0; + } + + /* Can we merge with the previous extent? */ + if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == + EXT_NOT_RECORDED_NOT_ALLOCATED) { + add = ((1 << 30) - sb->s_blocksize - + (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) >> + sb->s_blocksize_bits; + if (add > blocks) + add = blocks; + blocks -= add; + last_ext->extLength += add << sb->s_blocksize_bits; + } + + if (fake) { + udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + count++; + } else + udf_write_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + + /* Managed to do everything necessary? 
*/ + if (!blocks) + goto out; + + /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ + last_ext->extLocation.logicalBlockNum = 0; + last_ext->extLocation.partitionReferenceNum = 0; + add = (1 << (30-sb->s_blocksize_bits)) - 1; + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (add << sb->s_blocksize_bits); + + /* Create enough extents to cover the whole hole */ + while (blocks > add) { + blocks -= add; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; + count++; + } + if (blocks) { + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (blocks << sb->s_blocksize_bits); + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; + count++; + } + +out: + /* Do we have some preallocated blocks saved? */ + if (prealloc_len) { + err = udf_add_aext(inode, last_pos, &prealloc_loc, + prealloc_len, 1); + if (err) + return err; + last_ext->extLocation = prealloc_loc; + last_ext->extLength = prealloc_len; + count++; + } + + /* last_pos should point to the last written extent... */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + last_pos->offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + last_pos->offset -= sizeof(struct long_ad); + else + return -EIO; + + return count; +} + +static int udf_extend_file(struct inode *inode, loff_t newsize) +{ + + struct extent_position epos; + struct kernel_lb_addr eloc; + uint32_t elen; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = newsize >> sb->s_blocksize_bits, offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + struct kernel_long_ad extent; + int err; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + + /* File has extent covering the new size (could happen when extending + * inside a block)? */ + if (etype != -1) + return 0; + if (newsize & (sb->s_blocksize - 1)) + offset++; + /* Extended file just to the boundary of the last file block? */ + if (offset == 0) + return 0; + + /* Truncate is extending the file by 'offset' blocks */ + if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || + (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { + /* File has no extents at all or has empty last + * indirect extent! Create a fake extent... 
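	 * (a zero-length EXT_NOT_RECORDED_NOT_ALLOCATED descriptor that
	 * udf_do_extend_file() can then grow to cover the requested size)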
*/ + extent.extLocation.logicalBlockNum = 0; + extent.extLocation.partitionReferenceNum = 0; + extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + } else { + epos.offset -= adsize; + etype = udf_next_aext(inode, &epos, &extent.extLocation, + &extent.extLength, 0); + extent.extLength |= etype << 30; + } + err = udf_do_extend_file(inode, &epos, &extent, offset); + if (err < 0) + goto out; + err = 0; + iinfo->i_lenExtents = newsize; +out: + brelse(epos.bh); + return err; +} + +static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, + int *err, sector_t *phys, int *new) +{ + static sector_t last_block; + struct buffer_head *result = NULL; + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; + struct extent_position prev_epos, cur_epos, next_epos; + int count = 0, startnum = 0, endnum = 0; + uint32_t elen = 0, tmpelen; + struct kernel_lb_addr eloc, tmpeloc; + int c = 1; + loff_t lbcount = 0, b_off = 0; + uint32_t newblocknum, newblock; + sector_t offset = 0; + int8_t etype; + struct udf_inode_info *iinfo = UDF_I(inode); + int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; + int lastblock = 0; + + prev_epos.offset = udf_file_entry_alloc_offset(inode); + prev_epos.block = iinfo->i_location; + prev_epos.bh = NULL; + cur_epos = next_epos = prev_epos; + b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; + + /* find the extent which contains the block we are looking for. + alternate between laarr[0] and laarr[1] for locations of the + current extent, and the previous extent */ + do { + if (prev_epos.bh != cur_epos.bh) { + brelse(prev_epos.bh); + get_bh(cur_epos.bh); + prev_epos.bh = cur_epos.bh; + } + if (cur_epos.bh != next_epos.bh) { + brelse(cur_epos.bh); + get_bh(next_epos.bh); + cur_epos.bh = next_epos.bh; + } + + lbcount += elen; + + prev_epos.block = cur_epos.block; + cur_epos.block = next_epos.block; + + prev_epos.offset = cur_epos.offset; + cur_epos.offset = next_epos.offset; + + etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1); + if (etype == -1) + break; + + c = !c; + + laarr[c].extLength = (etype << 30) | elen; + laarr[c].extLocation = eloc; + + if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + pgoal = eloc.logicalBlockNum + + ((elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + + count++; + } while (lbcount + elen <= b_off); + + b_off -= lbcount; + offset = b_off >> inode->i_sb->s_blocksize_bits; + /* + * Move prev_epos and cur_epos into indirect extent if we are at + * the pointer to it + */ + udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0); + udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0); + + /* if the extent is allocated and recorded, return the block + if the extent is not a multiple of the blocksize, round up */ + + if (etype == (EXT_RECORDED_ALLOCATED >> 30)) { + if (elen & (inode->i_sb->s_blocksize - 1)) { + elen = EXT_RECORDED_ALLOCATED | + ((elen + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + udf_write_aext(inode, &cur_epos, &eloc, elen, 1); + } + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + *phys = newblock; + return NULL; + } + + last_block = block; + /* Are we beyond EOF? 
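	   (etype == -1 means the extent walk ran past the last extent, so the
	   hole between the current end of file and the requested block has to
	   be created before a new block can be mapped)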
*/ + if (etype == -1) { + int ret; + + if (count) { + if (c) + laarr[0] = laarr[1]; + startnum = 1; + } else { + /* Create a fake extent when there's not one */ + memset(&laarr[0].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + /* Will udf_do_extend_file() create real extent from + a fake one? */ + startnum = (offset > 0); + } + /* Create extents for the hole between EOF and offset */ + ret = udf_do_extend_file(inode, &prev_epos, laarr, offset); + if (ret < 0) { + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + *err = ret; + return NULL; + } + c = 0; + offset = 0; + count += ret; + /* We are not covered by a preallocated extent? */ + if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != + EXT_NOT_RECORDED_ALLOCATED) { + /* Is there any real extent? - otherwise we overwrite + * the fake one... */ + if (count) + c = !c; + laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + inode->i_sb->s_blocksize; + memset(&laarr[c].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + count++; + endnum++; + } + endnum = c + 1; + lastblock = 1; + } else { + endnum = startnum = ((count > 2) ? 2 : count); + + /* if the current extent is in position 0, + swap it with the previous */ + if (!c && count != 1) { + laarr[2] = laarr[0]; + laarr[0] = laarr[1]; + laarr[1] = laarr[2]; + c = 1; + } + + /* if the current block is located in an extent, + read the next extent */ + etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0); + if (etype != -1) { + laarr[c + 1].extLength = (etype << 30) | elen; + laarr[c + 1].extLocation = eloc; + count++; + startnum++; + endnum++; + } else + lastblock = 1; + } + + /* if the current extent is not recorded but allocated, get the + * block in the extent corresponding to the requested block */ + if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) + newblocknum = laarr[c].extLocation.logicalBlockNum + offset; + else { /* otherwise, allocate a new block */ + if (iinfo->i_next_alloc_block == block) + goal = iinfo->i_next_alloc_goal; + + if (!goal) { + if (!(goal = pgoal)) /* XXX: what was intended here? */ + goal = iinfo->i_location.logicalBlockNum + 1; + } + + newblocknum = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + goal, err); + if (!newblocknum) { + brelse(prev_epos.bh); + *err = -ENOSPC; + return NULL; + } + iinfo->i_lenExtents += inode->i_sb->s_blocksize; + } + + /* if the extent the requsted block is located in contains multiple + * blocks, split the extent into at most three extents. blocks prior + * to requested block, requested block, and blocks after requested + * block */ + udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); + +#ifdef UDF_PREALLOCATE + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. 
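 * (whatever is preallocated here is released again from udf_release_file()
 * via udf_discard_prealloc() when a writer closes the file)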
*/ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); +#endif + + /* merge any continuous blocks in laarr */ + udf_merge_extents(inode, laarr, &endnum); + + /* write back the new extents, inserting new extents if the new number + * of extents is greater than the old number, and deleting extents if + * the new number of extents is less than the old number */ + udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + + brelse(prev_epos.bh); + + newblock = udf_get_pblock(inode->i_sb, newblocknum, + iinfo->i_location.partitionReferenceNum, 0); + if (!newblock) + return NULL; + *phys = newblock; + *err = 0; + *new = 1; + iinfo->i_next_alloc_block = block; + iinfo->i_next_alloc_goal = newblocknum; + inode->i_ctime = current_fs_time(inode->i_sb); + + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + + return result; +} + +static void udf_split_extents(struct inode *inode, int *c, int offset, + int newblocknum, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + + if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || + (laarr[*c].extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { + int curr = *c; + int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits; + int8_t etype = (laarr[curr].extLength >> 30); + + if (blen == 1) + ; + else if (!offset || blen == offset + 1) { + laarr[curr + 2] = laarr[curr + 1]; + laarr[curr + 1] = laarr[curr]; + } else { + laarr[curr + 3] = laarr[curr + 1]; + laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; + } + + if (offset) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, + &laarr[curr].extLocation, + 0, offset); + laarr[curr].extLength = + EXT_NOT_RECORDED_NOT_ALLOCATED | + (offset << blocksize_bits); + laarr[curr].extLocation.logicalBlockNum = 0; + laarr[curr].extLocation. 
+ partitionReferenceNum = 0; + } else + laarr[curr].extLength = (etype << 30) | + (offset << blocksize_bits); + curr++; + (*c)++; + (*endnum)++; + } + + laarr[curr].extLocation.logicalBlockNum = newblocknum; + if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + laarr[curr].extLocation.partitionReferenceNum = + UDF_I(inode)->i_location.partitionReferenceNum; + laarr[curr].extLength = EXT_RECORDED_ALLOCATED | + blocksize; + curr++; + + if (blen != offset + 1) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) + laarr[curr].extLocation.logicalBlockNum += + offset + 1; + laarr[curr].extLength = (etype << 30) | + ((blen - (offset + 1)) << blocksize_bits); + curr++; + (*endnum)++; + } + } +} + +static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + int start, length = 0, currlength = 0, i; + + if (*endnum >= (c + 1)) { + if (!lastblock) + return; + else + start = c; + } else { + if ((laarr[c + 1].extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + start = c + 1; + length = currlength = + (((laarr[c + 1].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + } else + start = c; + } + + for (i = start + 1; i <= *endnum; i++) { + if (i == *endnum) { + if (lastblock) + length += UDF_DEFAULT_PREALLOC_BLOCKS; + } else if ((laarr[i].extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { + length += (((laarr[i].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + } else + break; + } + + if (length) { + int next = laarr[start].extLocation.logicalBlockNum + + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + int numalloc = udf_prealloc_blocks(inode->i_sb, inode, + laarr[start].extLocation.partitionReferenceNum, + next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? + length : UDF_DEFAULT_PREALLOC_BLOCKS) - + currlength); + if (numalloc) { + if (start == (c + 1)) + laarr[start].extLength += + (numalloc << + inode->i_sb->s_blocksize_bits); + else { + memmove(&laarr[c + 2], &laarr[c + 1], + sizeof(struct long_ad) * (*endnum - (c + 1))); + (*endnum)++; + laarr[c + 1].extLocation.logicalBlockNum = next; + laarr[c + 1].extLocation.partitionReferenceNum = + laarr[c].extLocation. 
+ partitionReferenceNum; + laarr[c + 1].extLength = + EXT_NOT_RECORDED_ALLOCATED | + (numalloc << + inode->i_sb->s_blocksize_bits); + start = c + 1; + } + + for (i = start + 1; numalloc && i < *endnum; i++) { + int elen = ((laarr[i].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + + if (elen > numalloc) { + laarr[i].extLength -= + (numalloc << + inode->i_sb->s_blocksize_bits); + numalloc = 0; + } else { + numalloc -= elen; + if (*endnum > (i + 1)) + memmove(&laarr[i], + &laarr[i + 1], + sizeof(struct long_ad) * + (*endnum - (i + 1))); + i--; + (*endnum)--; + } + } + UDF_I(inode)->i_lenExtents += + numalloc << inode->i_sb->s_blocksize_bits; + } + } +} + +static void udf_merge_extents(struct inode *inode, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + int i; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + + for (i = 0; i < (*endnum - 1); i++) { + struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; + struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; + + if (((li->extLength >> 30) == (lip1->extLength >> 30)) && + (((li->extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || + ((lip1->extLocation.logicalBlockNum - + li->extLocation.logicalBlockNum) == + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits)))) { + + if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { + lip1->extLength = (lip1->extLength - + (li->extLength & + UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & + ~(blocksize - 1); + li->extLength = (li->extLength & + UDF_EXTENT_FLAG_MASK) + + (UDF_EXTENT_LENGTH_MASK + 1) - + blocksize; + lip1->extLocation.logicalBlockNum = + li->extLocation.logicalBlockNum + + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) >> + blocksize_bits); + } else { + li->extLength = lip1->extLength + + (((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~(blocksize - 1)); + if (*endnum > (i + 2)) + memmove(&laarr[i + 1], &laarr[i + 2], + sizeof(struct long_ad) * + (*endnum - (i + 2))); + i--; + (*endnum)--; + } + } else if (((li->extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) && + ((lip1->extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { + udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits); + li->extLocation.logicalBlockNum = 0; + li->extLocation.partitionReferenceNum = 0; + + if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { + lip1->extLength = (lip1->extLength - + (li->extLength & + UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & + ~(blocksize - 1); + li->extLength = (li->extLength & + UDF_EXTENT_FLAG_MASK) + + (UDF_EXTENT_LENGTH_MASK + 1) - + blocksize; + } else { + li->extLength = lip1->extLength + + (((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~(blocksize - 1)); + if (*endnum > (i + 2)) + memmove(&laarr[i + 1], &laarr[i + 2], + sizeof(struct long_ad) * + (*endnum - (i + 2))); + i--; + (*endnum)--; + } + } else if ((li->extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, + &li->extLocation, 0, + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits); + li->extLocation.logicalBlockNum = 0; + 
li->extLocation.partitionReferenceNum = 0; + li->extLength = (li->extLength & + UDF_EXTENT_LENGTH_MASK) | + EXT_NOT_RECORDED_NOT_ALLOCATED; + } + } +} + +static void udf_update_extents(struct inode *inode, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int startnum, int endnum, + struct extent_position *epos) +{ + int start = 0, i; + struct kernel_lb_addr tmploc; + uint32_t tmplen; + + if (startnum > endnum) { + for (i = 0; i < (startnum - endnum); i++) + udf_delete_aext(inode, *epos, laarr[i].extLocation, + laarr[i].extLength); + } else if (startnum < endnum) { + for (i = 0; i < (endnum - startnum); i++) { + udf_insert_aext(inode, *epos, laarr[i].extLocation, + laarr[i].extLength); + udf_next_aext(inode, epos, &laarr[i].extLocation, + &laarr[i].extLength, 1); + start++; + } + } + + for (i = start; i < endnum; i++) { + udf_next_aext(inode, epos, &tmploc, &tmplen, 0); + udf_write_aext(inode, epos, &laarr[i].extLocation, + laarr[i].extLength, 1); + } +} + +struct buffer_head *udf_bread(struct inode *inode, int block, + int create, int *err) +{ + struct buffer_head *bh = NULL; + + bh = udf_getblk(inode, block, create, err); + if (!bh) + return NULL; + + if (buffer_uptodate(bh)) + return bh; + + ll_rw_block(READ, 1, &bh); + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + + brelse(bh); + *err = -EIO; + return NULL; +} + +int udf_setsize(struct inode *inode, loff_t newsize) +{ + int err; + struct udf_inode_info *iinfo; + int bsize = 1 << inode->i_blkbits; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + iinfo = UDF_I(inode); + if (newsize > inode->i_size) { + down_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (bsize < + (udf_file_entry_alloc_offset(inode) + newsize)) { + err = udf_expand_file_adinicb(inode); + if (err) + return err; + down_write(&iinfo->i_data_sem); + } else + iinfo->i_lenAlloc = newsize; + } + err = udf_extend_file(inode, newsize); + if (err) { + up_write(&iinfo->i_data_sem); + return err; + } + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + } else { + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + 0x00, bsize - newsize - + udf_file_entry_alloc_offset(inode)); + iinfo->i_lenAlloc = newsize; + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + goto update_time; + } + err = block_truncate_page(inode->i_mapping, newsize, + udf_get_block); + if (err) + return err; + down_write(&iinfo->i_data_sem); + truncate_setsize(inode, newsize); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } +update_time: + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + return 0; +} + +static void __udf_read_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + struct fileEntry *fe; + uint16_t ident; + struct udf_inode_info *iinfo = UDF_I(inode); + + /* + * Set defaults, but the inode is still incomplete! 
+ * Note: get_new_inode() sets the following on a new inode: + * i_sb = sb + * i_no = ino + * i_flags = sb->s_flags + * i_state = 0 + * clean_inode(): zero fills and sets + * i_count = 1 + * i_nlink = 1 + * i_op = NULL; + */ + bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); + if (!bh) { + printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", + inode->i_ino); + make_bad_inode(inode); + return; + } + + if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && + ident != TAG_IDENT_USE) { + printk(KERN_ERR "udf: udf_read_inode(ino %ld) " + "failed ident=%d\n", inode->i_ino, ident); + brelse(bh); + make_bad_inode(inode); + return; + } + + fe = (struct fileEntry *)bh->b_data; + + if (fe->icbTag.strategyType == cpu_to_le16(4096)) { + struct buffer_head *ibh; + + ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, + &ident); + if (ident == TAG_IDENT_IE && ibh) { + struct buffer_head *nbh = NULL; + struct kernel_lb_addr loc; + struct indirectEntry *ie; + + ie = (struct indirectEntry *)ibh->b_data; + loc = lelb_to_cpu(ie->indirectICB.extLocation); + + if (ie->indirectICB.extLength && + (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, + &ident))) { + if (ident == TAG_IDENT_FE || + ident == TAG_IDENT_EFE) { + memcpy(&iinfo->i_location, + &loc, + sizeof(struct kernel_lb_addr)); + brelse(bh); + brelse(ibh); + brelse(nbh); + __udf_read_inode(inode); + return; + } + brelse(nbh); + } + } + brelse(ibh); + } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { + printk(KERN_ERR "udf: unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); + brelse(bh); + make_bad_inode(inode); + return; + } + udf_fill_inode(inode, bh); + + brelse(bh); +} + +static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) +{ + struct fileEntry *fe; + struct extendedFileEntry *efe; + int offset; + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + struct udf_inode_info *iinfo = UDF_I(inode); + + fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; + + if (fe->icbTag.strategyType == cpu_to_le16(4)) + iinfo->i_strat4096 = 0; + else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ + iinfo->i_strat4096 = 1; + + iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & + ICBTAG_FLAG_AD_MASK; + iinfo->i_unique = 0; + iinfo->i_lenEAttr = 0; + iinfo->i_lenExtents = 0; + iinfo->i_lenAlloc = 0; + iinfo->i_next_alloc_block = 0; + iinfo->i_next_alloc_goal = 0; + if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { + iinfo->i_efe = 1; + iinfo->i_use = 0; + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry))) { + make_bad_inode(inode); + return; + } + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct extendedFileEntry), + inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { + iinfo->i_efe = 0; + iinfo->i_use = 0; + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct fileEntry))) { + make_bad_inode(inode); + return; + } + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct fileEntry), + inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { + iinfo->i_efe = 0; + iinfo->i_use = 1; + iinfo->i_lenAlloc = le32_to_cpu( + ((struct unallocSpaceEntry *)bh->b_data)-> + lengthAllocDescs); + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry))) { + make_bad_inode(inode); + return; + } + memcpy(iinfo->i_ext.i_data, + 
bh->b_data + sizeof(struct unallocSpaceEntry), + inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + return; + } + + read_lock(&sbi->s_cred_lock); + inode->i_uid = le32_to_cpu(fe->uid); + if (inode->i_uid == -1 || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) + inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + + inode->i_gid = le32_to_cpu(fe->gid); + if (inode->i_gid == -1 || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) + inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + + if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_fmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_fmode; + else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_dmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_dmode; + else + inode->i_mode = udf_convert_permissions(fe); + inode->i_mode &= ~sbi->s_umask; + read_unlock(&sbi->s_cred_lock); + + inode->i_nlink = le16_to_cpu(fe->fileLinkCount); + if (!inode->i_nlink) + inode->i_nlink = 1; + + inode->i_size = le64_to_cpu(fe->informationLength); + iinfo->i_lenExtents = inode->i_size; + + if (iinfo->i_efe == 0) { + inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime)) + inode->i_atime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_mtime, + fe->modificationTime)) + inode->i_mtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime)) + inode->i_ctime = sbi->s_record_time; + + iinfo->i_unique = le64_to_cpu(fe->uniqueID); + iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); + iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); + offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; + } else { + inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime)) + inode->i_atime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_mtime, + efe->modificationTime)) + inode->i_mtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime)) + iinfo->i_crtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime)) + inode->i_ctime = sbi->s_record_time; + + iinfo->i_unique = le64_to_cpu(efe->uniqueID); + iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); + iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); + offset = sizeof(struct extendedFileEntry) + + iinfo->i_lenEAttr; + } + + switch (fe->icbTag.fileType) { + case ICBTAG_FILE_TYPE_DIRECTORY: + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + inode->i_mode |= S_IFDIR; + inc_nlink(inode); + break; + case ICBTAG_FILE_TYPE_REALTIME: + case ICBTAG_FILE_TYPE_REGULAR: + case ICBTAG_FILE_TYPE_UNDEF: + case ICBTAG_FILE_TYPE_VAT20: + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + inode->i_mode |= S_IFREG; + break; + case ICBTAG_FILE_TYPE_BLOCK: + inode->i_mode |= S_IFBLK; + break; + case ICBTAG_FILE_TYPE_CHAR: + inode->i_mode |= S_IFCHR; + break; + case ICBTAG_FILE_TYPE_FIFO: + init_special_inode(inode, inode->i_mode | S_IFIFO, 0); + break; + case ICBTAG_FILE_TYPE_SOCKET: + init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); + break; + 
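+ /* Symlink targets are stored as pathComponent records in the file body
+  * (built by udf_symlink() in namei.c), so a symlink gets its own
+  * address_space operations much like a regular file.
+  */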
case ICBTAG_FILE_TYPE_SYMLINK: + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &udf_symlink_inode_operations; + inode->i_mode = S_IFLNK | S_IRWXUGO; + break; + case ICBTAG_FILE_TYPE_MAIN: + udf_debug("METADATA FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_MIRROR: + udf_debug("METADATA MIRROR FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_BITMAP: + udf_debug("METADATA BITMAP FILE-----\n"); + break; + default: + printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " + "file type=%d\n", inode->i_ino, + fe->icbTag.fileType); + make_bad_inode(inode); + return; + } + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + struct deviceSpec *dsea = + (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); + if (dsea) { + init_special_inode(inode, inode->i_mode, + MKDEV(le32_to_cpu(dsea->majorDeviceIdent), + le32_to_cpu(dsea->minorDeviceIdent))); + /* Developer ID ??? */ + } else + make_bad_inode(inode); + } +} + +static int udf_alloc_i_data(struct inode *inode, size_t size) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); + + if (!iinfo->i_ext.i_data) { + printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " + "no free memory\n", inode->i_ino); + return -ENOMEM; + } + + return 0; +} + +static mode_t udf_convert_permissions(struct fileEntry *fe) +{ + mode_t mode; + uint32_t permissions; + uint32_t flags; + + permissions = le32_to_cpu(fe->permissions); + flags = le16_to_cpu(fe->icbTag.flags); + + mode = ((permissions) & S_IRWXO) | + ((permissions >> 2) & S_IRWXG) | + ((permissions >> 4) & S_IRWXU) | + ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | + ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | + ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); + + return mode; +} + +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +static int udf_sync_inode(struct inode *inode) +{ + return udf_update_inode(inode, 1); +} + +static int udf_update_inode(struct inode *inode, int do_sync) +{ + struct buffer_head *bh = NULL; + struct fileEntry *fe; + struct extendedFileEntry *efe; + uint32_t udfperms; + uint16_t icbflags; + uint16_t crclen; + int err = 0; + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + struct udf_inode_info *iinfo = UDF_I(inode); + + bh = udf_tgetblk(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); + if (!bh) { + udf_debug("getblk failure\n"); + return -ENOMEM; + } + + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; + + if (iinfo->i_use) { + struct unallocSpaceEntry *use = + (struct unallocSpaceEntry *)bh->b_data; + + use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), + iinfo->i_ext.i_data, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); + use->descTag.tagLocation = + cpu_to_le32(iinfo->i_location.logicalBlockNum); + crclen = sizeof(struct unallocSpaceEntry) + + iinfo->i_lenAlloc - sizeof(struct tag); + use->descTag.descCRCLength = cpu_to_le16(crclen); + use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + + sizeof(struct tag), + crclen)); + use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + + goto out; + } + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) + fe->uid = cpu_to_le32(-1); + else 
+ fe->uid = cpu_to_le32(inode->i_uid); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) + fe->gid = cpu_to_le32(-1); + else + fe->gid = cpu_to_le32(inode->i_gid); + + udfperms = ((inode->i_mode & S_IRWXO)) | + ((inode->i_mode & S_IRWXG) << 2) | + ((inode->i_mode & S_IRWXU) << 4); + + udfperms |= (le32_to_cpu(fe->permissions) & + (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | + FE_PERM_G_DELETE | FE_PERM_G_CHATTR | + FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); + fe->permissions = cpu_to_le32(udfperms); + + if (S_ISDIR(inode->i_mode)) + fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); + else + fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + + fe->informationLength = cpu_to_le64(inode->i_size); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + struct regid *eid; + struct deviceSpec *dsea = + (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); + if (!dsea) { + dsea = (struct deviceSpec *) + udf_add_extendedattr(inode, + sizeof(struct deviceSpec) + + sizeof(struct regid), 12, 0x3); + dsea->attrType = cpu_to_le32(12); + dsea->attrSubtype = 1; + dsea->attrLength = cpu_to_le32( + sizeof(struct deviceSpec) + + sizeof(struct regid)); + dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); + } + eid = (struct regid *)dsea->impUse; + memset(eid, 0, sizeof(struct regid)); + strcpy(eid->ident, UDF_ID_DEVELOPER); + eid->identSuffix[0] = UDF_OS_CLASS_UNIX; + eid->identSuffix[1] = UDF_OS_ID_LINUX; + dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); + dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); + } + + if (iinfo->i_efe == 0) { + memcpy(bh->b_data + sizeof(struct fileEntry), + iinfo->i_ext.i_data, + inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + fe->logicalBlocksRecorded = cpu_to_le64( + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9)); + + udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + memset(&(fe->impIdent), 0, sizeof(struct regid)); + strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); + fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + fe->uniqueID = cpu_to_le64(iinfo->i_unique); + fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); + fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); + crclen = sizeof(struct fileEntry); + } else { + memcpy(bh->b_data + sizeof(struct extendedFileEntry), + iinfo->i_ext.i_data, + inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + efe->objectSize = cpu_to_le64(inode->i_size); + efe->logicalBlocksRecorded = cpu_to_le64( + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9)); + + if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_atime.tv_nsec)) + iinfo->i_crtime = inode->i_atime; + + if (iinfo->i_crtime.tv_sec > inode->i_mtime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_mtime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_mtime.tv_nsec)) + iinfo->i_crtime = inode->i_mtime; + + if (iinfo->i_crtime.tv_sec > inode->i_ctime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_ctime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec)) + iinfo->i_crtime = inode->i_ctime; + + udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + 
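+ /* i_crtime was clamped above, so the creation time written to disk
+  * never post-dates the access, modification or attribute-change times.
+  */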
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); + udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + + memset(&(efe->impIdent), 0, sizeof(struct regid)); + strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); + efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + efe->uniqueID = cpu_to_le64(iinfo->i_unique); + efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); + efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); + crclen = sizeof(struct extendedFileEntry); + } + if (iinfo->i_strat4096) { + fe->icbTag.strategyType = cpu_to_le16(4096); + fe->icbTag.strategyParameter = cpu_to_le16(1); + fe->icbTag.numEntries = cpu_to_le16(2); + } else { + fe->icbTag.strategyType = cpu_to_le16(4); + fe->icbTag.numEntries = cpu_to_le16(1); + } + + if (S_ISDIR(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; + else if (S_ISREG(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; + else if (S_ISLNK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; + else if (S_ISBLK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; + else if (S_ISCHR(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; + else if (S_ISFIFO(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; + else if (S_ISSOCK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; + + icbflags = iinfo->i_alloc_type | + ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | + ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | + ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | + (le16_to_cpu(fe->icbTag.flags) & + ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | + ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); + + fe->icbTag.flags = cpu_to_le16(icbflags); + if (sbi->s_udfrev >= 0x0200) + fe->descTag.descVersion = cpu_to_le16(3); + else + fe->descTag.descVersion = cpu_to_le16(2); + fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); + fe->descTag.tagLocation = cpu_to_le32( + iinfo->i_location.logicalBlockNum); + crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); + fe->descTag.descCRCLength = cpu_to_le16(crclen); + fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), + crclen)); + fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); + +out: + set_buffer_uptodate(bh); + unlock_buffer(bh); + + /* write the data blocks */ + mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + printk(KERN_WARNING "IO error syncing udf inode " + "[%s:%08lx]\n", inode->i_sb->s_id, + inode->i_ino); + err = -EIO; + } + } + brelse(bh); + + return err; +} + +struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) +{ + unsigned long block = udf_get_lb_pblock(sb, ino, 0); + struct inode *inode = iget_locked(sb, block); + + if (!inode) + return NULL; + + if (inode->i_state & I_NEW) { + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); + __udf_read_inode(inode); + unlock_new_inode(inode); + } + + if (is_bad_inode(inode)) + goto out_iput; + + if (ino->logicalBlockNum >= UDF_SB(sb)-> + s_partmaps[ino->partitionReferenceNum].s_partition_len) { + udf_debug("block=%d, partition=%d out of range\n", + ino->logicalBlockNum, ino->partitionReferenceNum); + make_bad_inode(inode); + goto out_iput; + } + + return inode; + + out_iput: + iput(inode); + return NULL; +} + +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct 
kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; + struct allocExtDesc *aed; + uint8_t *ptr; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + else + ptr = epos->bh->b_data + epos->offset; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { + unsigned char *sptr, *dptr; + struct buffer_head *nbh; + int err, loffset; + struct kernel_lb_addr obloc = epos->block; + + epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, + obloc.partitionReferenceNum, + obloc.logicalBlockNum, &err); + if (!epos->block.logicalBlockNum) + return -ENOSPC; + nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, + &epos->block, + 0)); + if (!nbh) + return -EIO; + lock_buffer(nbh); + memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(nbh); + unlock_buffer(nbh); + mark_buffer_dirty_inode(nbh, inode); + + aed = (struct allocExtDesc *)(nbh->b_data); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + aed->previousAllocExtLocation = + cpu_to_le32(obloc.logicalBlockNum); + if (epos->offset + adsize > inode->i_sb->s_blocksize) { + loffset = epos->offset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = ptr - adsize; + dptr = nbh->b_data + sizeof(struct allocExtDesc); + memcpy(dptr, sptr, adsize); + epos->offset = sizeof(struct allocExtDesc) + adsize; + } else { + loffset = epos->offset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + sptr = ptr; + epos->offset = sizeof(struct allocExtDesc); + + if (epos->bh) { + aed = (struct allocExtDesc *)epos->bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + } else { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(inode); + } + } + if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) + udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, + epos->block.logicalBlockNum, sizeof(struct tag)); + else + udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, + epos->block.logicalBlockNum, sizeof(struct tag)); + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)sptr; + sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | + inode->i_sb->s_blocksize); + sad->extPosition = + cpu_to_le32(epos->block.logicalBlockNum); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)sptr; + lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | + inode->i_sb->s_blocksize); + lad->extLocation = cpu_to_lelb(epos->block); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + break; + } + if (epos->bh) { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos->bh->b_data, loffset); + else + udf_update_tag(epos->bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos->bh, inode); + brelse(epos->bh); + } else { + mark_inode_dirty(inode); + } + epos->bh = nbh; + } + + udf_write_aext(inode, epos, eloc, elen, inc); + + if (!epos->bh) { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos->bh->b_data, + 
epos->offset + (inc ? 0 : adsize)); + else + udf_update_tag(epos->bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos->bh, inode); + } + + return 0; +} + +void udf_write_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + uint8_t *ptr; + struct short_ad *sad; + struct long_ad *lad; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + else + ptr = epos->bh->b_data + epos->offset; + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)ptr; + sad->extLength = cpu_to_le32(elen); + sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); + adsize = sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)ptr; + lad->extLength = cpu_to_le32(elen); + lad->extLocation = cpu_to_lelb(*eloc); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + adsize = sizeof(struct long_ad); + break; + default: + return; + } + + if (epos->bh) { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { + struct allocExtDesc *aed = + (struct allocExtDesc *)epos->bh->b_data; + udf_update_tag(epos->bh->b_data, + le32_to_cpu(aed->lengthAllocDescs) + + sizeof(struct allocExtDesc)); + } + mark_buffer_dirty_inode(epos->bh, inode); + } else { + mark_inode_dirty(inode); + } + + if (inc) + epos->offset += adsize; +} + +int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) +{ + int8_t etype; + + while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == + (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + int block; + epos->block = *eloc; + epos->offset = sizeof(struct allocExtDesc); + brelse(epos->bh); + block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); + epos->bh = udf_tread(inode->i_sb, block); + if (!epos->bh) { + udf_debug("reading block %d failed!\n", block); + return -1; + } + } + + return etype; +} + +int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) +{ + int alen; + int8_t etype; + uint8_t *ptr; + struct short_ad *sad; + struct long_ad *lad; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) { + if (!epos->offset) + epos->offset = udf_file_entry_alloc_offset(inode); + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + alen = udf_file_entry_alloc_offset(inode) + + iinfo->i_lenAlloc; + } else { + if (!epos->offset) + epos->offset = sizeof(struct allocExtDesc); + ptr = epos->bh->b_data + epos->offset; + alen = sizeof(struct allocExtDesc) + + le32_to_cpu(((struct allocExtDesc *)epos->bh->b_data)-> + lengthAllocDescs); + } + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); + if (!sad) + return -1; + etype = le32_to_cpu(sad->extLength) >> 30; + eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); + eloc->partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; + case ICBTAG_FLAG_AD_LONG: + lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); + if (!lad) + return -1; + etype = le32_to_cpu(lad->extLength) >> 30; + *eloc = lelb_to_cpu(lad->extLocation); + *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; 
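+ /* For both allocation descriptor forms above, the top two bits of
+  * extLength carry the extent type (0 = recorded and allocated,
+  * 1 = allocated but not recorded, 2 = neither, 3 = pointer to the next
+  * extent of allocation descriptors) and the low 30 bits the length;
+  * e.g. a recorded-and-allocated 2048-byte extent is encoded as 0x00000800.
+  */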
+ default: + udf_debug("alloc_type = %d unsupported\n", + iinfo->i_alloc_type); + return -1; + } + + return etype; +} + +static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr neloc, uint32_t nelen) +{ + struct kernel_lb_addr oeloc; + uint32_t oelen; + int8_t etype; + + if (epos.bh) + get_bh(epos.bh); + + while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { + udf_write_aext(inode, &epos, &neloc, nelen, 1); + neloc = oeloc; + nelen = (etype << 30) | oelen; + } + udf_add_aext(inode, &epos, &neloc, nelen, 1); + brelse(epos.bh); + + return (nelen >> 30); +} + +int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr eloc, uint32_t elen) +{ + struct extent_position oepos; + int adsize; + int8_t etype; + struct allocExtDesc *aed; + struct udf_inode_info *iinfo; + + if (epos.bh) { + get_bh(epos.bh); + get_bh(epos.bh); + } + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + adsize = 0; + + oepos = epos; + if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1) + return -1; + + while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); + if (oepos.bh != epos.bh) { + oepos.block = epos.block; + brelse(oepos.bh); + get_bh(epos.bh); + oepos.bh = epos.bh; + oepos.offset = epos.offset - adsize; + } + } + memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); + elen = 0; + + if (epos.bh != oepos.bh) { + udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + if (!oepos.bh) { + iinfo->i_lenAlloc -= (adsize * 2); + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(oepos.bh->b_data, + oepos.offset - (2 * adsize)); + else + udf_update_tag(oepos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(oepos.bh, inode); + } + } else { + udf_write_aext(inode, &oepos, &eloc, elen, 1); + if (!oepos.bh) { + iinfo->i_lenAlloc -= adsize; + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, -adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(oepos.bh->b_data, + epos.offset - adsize); + else + udf_update_tag(oepos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(oepos.bh, inode); + } + } + + brelse(epos.bh); + brelse(oepos.bh); + + return (elen >> 30); +} + +int8_t inode_bmap(struct inode *inode, sector_t block, + struct extent_position *pos, struct kernel_lb_addr *eloc, + uint32_t *elen, sector_t *offset) +{ + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + loff_t lbcount = 0, bcount = + (loff_t) block << blocksize_bits; + int8_t etype; + struct udf_inode_info *iinfo; + + iinfo = UDF_I(inode); + pos->offset = 0; + pos->block = iinfo->i_location; + pos->bh = NULL; + *elen = 0; + + do { + etype = udf_next_aext(inode, pos, eloc, elen, 1); + if (etype == -1) { + *offset = (bcount - lbcount) >> blocksize_bits; + iinfo->i_lenExtents = lbcount; + return -1; + } + lbcount += *elen; + } while 
(lbcount <= bcount); + + *offset = (bcount + *elen - lbcount) >> blocksize_bits; + + return etype; +} + +long udf_block_map(struct inode *inode, sector_t block) +{ + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + int ret; + + down_read(&UDF_I(inode)->i_data_sem); + + if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == + (EXT_RECORDED_ALLOCATED >> 30)) + ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + else + ret = 0; + + up_read(&UDF_I(inode)->i_data_sem); + brelse(epos.bh); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) + return udf_fixed_to_variable(ret); + else + return ret; +} diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c new file mode 100644 index 00000000..43e24a3b --- /dev/null +++ b/fs/udf/lowlevel.c @@ -0,0 +1,67 @@ +/* + * lowlevel.c + * + * PURPOSE + * Low Level Device Routines for the UDF filesystem + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * + * HISTORY + * + * 03/26/99 blf Created. + */ + +#include "udfdecl.h" + +#include +#include +#include + +#include "udf_sb.h" + +unsigned int udf_get_last_session(struct super_block *sb) +{ + struct cdrom_multisession ms_info; + unsigned int vol_desc_start; + struct block_device *bdev = sb->s_bdev; + int i; + + vol_desc_start = 0; + ms_info.addr_format = CDROM_LBA; + i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + + if (i == 0) { + udf_debug("XA disk: %s, vol_desc_start=%d\n", + (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ + vol_desc_start = ms_info.addr.lba; + } else { + udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); + } + return vol_desc_start; +} + +unsigned long udf_get_last_block(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + unsigned long lblock = 0; + + /* + * ioctl failed or returned obviously bogus value? + * Try using the device size... + */ + if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) || + lblock == 0) + lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + if (lblock) + return lblock - 1; + else + return 0; +} diff --git a/fs/udf/misc.c b/fs/udf/misc.c new file mode 100644 index 00000000..9215700c --- /dev/null +++ b/fs/udf/misc.c @@ -0,0 +1,296 @@ +/* + * misc.c + * + * PURPOSE + * Miscellaneous routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. 
+ * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 04/19/99 blf partial support for reading/writing specific EA's + */ + +#include "udfdecl.h" + +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +struct buffer_head *udf_tgetblk(struct super_block *sb, int block) +{ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) + return sb_getblk(sb, udf_fixed_to_variable(block)); + else + return sb_getblk(sb, block); +} + +struct buffer_head *udf_tread(struct super_block *sb, int block) +{ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) + return sb_bread(sb, udf_fixed_to_variable(block)); + else + return sb_bread(sb, block); +} + +struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, + uint32_t type, uint8_t loc) +{ + uint8_t *ea = NULL, *ad = NULL; + int offset; + uint16_t crclen; + struct udf_inode_info *iinfo = UDF_I(inode); + + ea = iinfo->i_ext.i_data; + if (iinfo->i_lenEAttr) { + ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + } else { + ad = ea; + size += sizeof(struct extendedAttrHeaderDesc); + } + + offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) - + iinfo->i_lenAlloc; + + /* TODO - Check for FreeEASpace */ + + if (loc & 0x01 && offset >= size) { + struct extendedAttrHeaderDesc *eahd; + eahd = (struct extendedAttrHeaderDesc *)ea; + + if (iinfo->i_lenAlloc) + memmove(&ad[size], ad, iinfo->i_lenAlloc); + + if (iinfo->i_lenEAttr) { + /* check checksum/crc */ + if (eahd->descTag.tagIdent != + cpu_to_le16(TAG_IDENT_EAHD) || + le32_to_cpu(eahd->descTag.tagLocation) != + iinfo->i_location.logicalBlockNum) + return NULL; + } else { + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + + size -= sizeof(struct extendedAttrHeaderDesc); + iinfo->i_lenEAttr += + sizeof(struct extendedAttrHeaderDesc); + eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD); + if (sbi->s_udfrev >= 0x0200) + eahd->descTag.descVersion = cpu_to_le16(3); + else + eahd->descTag.descVersion = cpu_to_le16(2); + eahd->descTag.tagSerialNum = + cpu_to_le16(sbi->s_serial_number); + eahd->descTag.tagLocation = cpu_to_le32( + iinfo->i_location.logicalBlockNum); + eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF); + eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF); + } + + offset = iinfo->i_lenEAttr; + if (type < 2048) { + if (le32_to_cpu(eahd->appAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; + eahd->appAttrLocation = + cpu_to_le32(aal + size); + } + if (le32_to_cpu(eahd->impAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t ial = + le32_to_cpu(eahd->impAttrLocation); + memmove(&ea[offset - ial + size], + &ea[ial], offset - ial); + offset -= ial; + eahd->impAttrLocation = + cpu_to_le32(ial + size); + } + } else if (type < 65536) { + if (le32_to_cpu(eahd->appAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; + eahd->appAttrLocation = + cpu_to_le32(aal + size); + } + } + /* rewrite CRC + checksum of eahd */ + crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); + eahd->descTag.descCRCLength = cpu_to_le16(crclen); + eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + + sizeof(struct tag), crclen)); + eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); + iinfo->i_lenEAttr += size; + return (struct genericFormat 
*)&ea[offset]; + } + if (loc & 0x02) + ; + + return NULL; +} + +struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, + uint8_t subtype) +{ + struct genericFormat *gaf; + uint8_t *ea = NULL; + uint32_t offset; + struct udf_inode_info *iinfo = UDF_I(inode); + + ea = iinfo->i_ext.i_data; + + if (iinfo->i_lenEAttr) { + struct extendedAttrHeaderDesc *eahd; + eahd = (struct extendedAttrHeaderDesc *)ea; + + /* check checksum/crc */ + if (eahd->descTag.tagIdent != + cpu_to_le16(TAG_IDENT_EAHD) || + le32_to_cpu(eahd->descTag.tagLocation) != + iinfo->i_location.logicalBlockNum) + return NULL; + + if (type < 2048) + offset = sizeof(struct extendedAttrHeaderDesc); + else if (type < 65536) + offset = le32_to_cpu(eahd->impAttrLocation); + else + offset = le32_to_cpu(eahd->appAttrLocation); + + while (offset < iinfo->i_lenEAttr) { + gaf = (struct genericFormat *)&ea[offset]; + if (le32_to_cpu(gaf->attrType) == type && + gaf->attrSubtype == subtype) + return gaf; + else + offset += le32_to_cpu(gaf->attrLength); + } + } + + return NULL; +} + +/* + * udf_read_tagged + * + * PURPOSE + * Read the first block of a tagged descriptor. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, + uint32_t location, uint16_t *ident) +{ + struct tag *tag_p; + struct buffer_head *bh = NULL; + + /* Read the block */ + if (block == 0xFFFFFFFF) + return NULL; + + bh = udf_tread(sb, block); + if (!bh) { + udf_debug("block=%d, location=%d: read failed\n", + block, location); + return NULL; + } + + tag_p = (struct tag *)(bh->b_data); + + *ident = le16_to_cpu(tag_p->tagIdent); + + if (location != le32_to_cpu(tag_p->tagLocation)) { + udf_debug("location mismatch block %u, tag %u != %u\n", + block, le32_to_cpu(tag_p->tagLocation), location); + goto error_out; + } + + /* Verify the tag checksum */ + if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { + printk(KERN_ERR "udf: tag checksum failed block %d\n", block); + goto error_out; + } + + /* Verify the tag version */ + if (tag_p->descVersion != cpu_to_le16(0x0002U) && + tag_p->descVersion != cpu_to_le16(0x0003U)) { + udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", + le16_to_cpu(tag_p->descVersion), block); + goto error_out; + } + + /* Verify the descriptor CRC */ + if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || + le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, + bh->b_data + sizeof(struct tag), + le16_to_cpu(tag_p->descCRCLength))) + return bh; + + udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, + le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); + +error_out: + brelse(bh); + return NULL; +} + +struct buffer_head *udf_read_ptagged(struct super_block *sb, + struct kernel_lb_addr *loc, + uint32_t offset, uint16_t *ident) +{ + return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), + loc->logicalBlockNum + offset, ident); +} + +void udf_update_tag(char *data, int length) +{ + struct tag *tptr = (struct tag *)data; + length -= sizeof(struct tag); + + tptr->descCRCLength = cpu_to_le16(length); + tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); + tptr->tagChecksum = udf_tag_checksum(tptr); +} + +void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, + uint32_t loc, int length) +{ + struct tag *tptr = (struct tag *)data; + tptr->tagIdent = cpu_to_le16(ident); + tptr->descVersion = cpu_to_le16(version); + 
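+ /* The serial number and location below come from the caller; the CRC,
+  * CRC length and checksum are then filled in by udf_update_tag().
+  */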
tptr->tagSerialNum = cpu_to_le16(snum); + tptr->tagLocation = cpu_to_le32(loc); + udf_update_tag(data, length); +} + +u8 udf_tag_checksum(const struct tag *t) +{ + u8 *data = (u8 *)t; + u8 checksum = 0; + int i; + for (i = 0; i < sizeof(struct tag); ++i) + if (i != 4) /* position of checksum */ + checksum += data[i]; + return checksum; +} diff --git a/fs/udf/namei.c b/fs/udf/namei.c new file mode 100644 index 00000000..f1dce848 --- /dev/null +++ b/fs/udf/namei.c @@ -0,0 +1,1339 @@ +/* + * namei.c + * + * PURPOSE + * Inode name handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 12/12/98 blf Created. Split out the lookup code from dir.c + * 04/19/99 blf link, mknod, symlink support + */ + +#include "udfdecl.h" + +#include "udf_i.h" +#include "udf_sb.h" +#include +#include +#include +#include +#include +#include +#include +#include + +enum { UDF_MAX_LINKS = 0xffff }; + +static inline int udf_match(int len1, const unsigned char *name1, int len2, + const unsigned char *name2) +{ + if (len1 != len2) + return 0; + + return !memcmp(name1, name2, len1); +} + +int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, + struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, + uint8_t *impuse, uint8_t *fileident) +{ + uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); + uint16_t crc; + int offset; + uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); + uint8_t lfi = cfi->lengthFileIdent; + int padlen = fibh->eoffset - fibh->soffset - liu - lfi - + sizeof(struct fileIdentDesc); + int adinicb = 0; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + adinicb = 1; + + offset = fibh->soffset + sizeof(struct fileIdentDesc); + + if (impuse) { + if (adinicb || (offset + liu < 0)) { + memcpy((uint8_t *)sfi->impUse, impuse, liu); + } else if (offset >= 0) { + memcpy(fibh->ebh->b_data + offset, impuse, liu); + } else { + memcpy((uint8_t *)sfi->impUse, impuse, -offset); + memcpy(fibh->ebh->b_data, impuse - offset, + liu + offset); + } + } + + offset += liu; + + if (fileident) { + if (adinicb || (offset + lfi < 0)) { + memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi); + } else if (offset >= 0) { + memcpy(fibh->ebh->b_data + offset, fileident, lfi); + } else { + memcpy((uint8_t *)sfi->fileIdent + liu, fileident, + -offset); + memcpy(fibh->ebh->b_data, fileident - offset, + lfi + offset); + } + } + + offset += lfi; + + if (adinicb || (offset + padlen < 0)) { + memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen); + } else if (offset >= 0) { + memset(fibh->ebh->b_data + offset, 0x00, padlen); + } else { + memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset); + memset(fibh->ebh->b_data, 0x00, padlen + offset); + } + + crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), + sizeof(struct fileIdentDesc) - sizeof(struct tag)); + + if (fibh->sbh == fibh->ebh) { + crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, + crclen + sizeof(struct tag) - + sizeof(struct fileIdentDesc)); + } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { + crc = crc_itu_t(crc, fibh->ebh->b_data + + sizeof(struct fileIdentDesc) + + fibh->soffset, + crclen + sizeof(struct tag) - + sizeof(struct fileIdentDesc)); + } else { + crc = crc_itu_t(crc, 
(uint8_t *)sfi->impUse, + -fibh->soffset - sizeof(struct fileIdentDesc)); + crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); + } + + cfi->descTag.descCRC = cpu_to_le16(crc); + cfi->descTag.descCRCLength = cpu_to_le16(crclen); + cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag); + + if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) { + memcpy((uint8_t *)sfi, (uint8_t *)cfi, + sizeof(struct fileIdentDesc)); + } else { + memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset); + memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset, + sizeof(struct fileIdentDesc) + fibh->soffset); + } + + if (adinicb) { + mark_inode_dirty(inode); + } else { + if (fibh->sbh != fibh->ebh) + mark_buffer_dirty_inode(fibh->ebh, inode); + mark_buffer_dirty_inode(fibh->sbh, inode); + } + return 0; +} + +static struct fileIdentDesc *udf_find_entry(struct inode *dir, + const struct qstr *child, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi) +{ + struct fileIdentDesc *fi = NULL; + loff_t f_pos; + int block, flen; + unsigned char *fname = NULL; + unsigned char *nameptr; + uint8_t lfi; + uint16_t liu; + loff_t size; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo = UDF_I(dir); + int isdotdot = child->len == 2 && + child->name[0] == '.' && child->name[1] == '.'; + + size = udf_ext0_offset(dir) + dir->i_size; + f_pos = udf_ext0_offset(dir); + + fibh->sbh = fibh->ebh = NULL; + fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) + goto out_err; + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) + goto out_err; + } + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) + goto out_err; + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, + &elen, &offset); + if (!fi) + goto out_err; + + liu = le16_to_cpu(cfi->lengthOfImpUse); + lfi = cfi->lengthFileIdent; + + if (fibh->sbh == fibh->ebh) { + nameptr = fi->fileIdent + liu; + } else { + int poffset; /* Unpaded ending offset */ + + poffset = fibh->soffset + sizeof(struct fileIdentDesc) + + liu + lfi; + + if (poffset >= lfi) + nameptr = (uint8_t *)(fibh->ebh->b_data + + poffset - lfi); + else { + nameptr = fname; + memcpy(nameptr, fi->fileIdent + liu, + lfi - poffset); + memcpy(nameptr + lfi - poffset, + fibh->ebh->b_data, poffset); + } + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + continue; + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + continue; + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + isdotdot) + goto out_ok; + + if (!lfi) + continue; + + flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + if (flen && udf_match(flen, fname, child->len, child->name)) + goto out_ok; + } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + 
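+ /* Reached on success (fi points at the matching FID and the fibh
+  * buffers are left for the caller) and via fall-through from out_err
+  * with fi == NULL; either way drop the extent position and name buffer.
+  */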
brelse(epos.bh); + kfree(fname); + + return fi; +} + +static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + if (dentry->d_name.len > UDF_NAME_LEN - 2) + return ERR_PTR(-ENAMETOOLONG); + +#ifdef UDF_RECOVERY + /* temporary shorthand for specifying files by inode number */ + if (!strncmp(dentry->d_name.name, ".B=", 3)) { + struct kernel_lb_addr lb = { + .logicalBlockNum = 0, + .partitionReferenceNum = + simple_strtoul(dentry->d_name.name + 3, + NULL, 0), + }; + inode = udf_iget(dir->i_sb, lb); + if (!inode) { + return ERR_PTR(-EACCES); + } + } else +#endif /* UDF_RECOVERY */ + + if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + struct kernel_lb_addr loc; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + loc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(dir->i_sb, &loc); + if (!inode) { + return ERR_PTR(-EACCES); + } + } + + return d_splice_alias(inode, dentry); +} + +static struct fileIdentDesc *udf_add_entry(struct inode *dir, + struct dentry *dentry, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi, int *err) +{ + struct super_block *sb = dir->i_sb; + struct fileIdentDesc *fi = NULL; + unsigned char *name = NULL; + int namelen; + loff_t f_pos; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + int nfidlen; + uint8_t lfi; + uint16_t liu; + int block; + struct kernel_lb_addr eloc; + uint32_t elen = 0; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo; + + fibh->sbh = fibh->ebh = NULL; + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + *err = -ENOMEM; + goto out_err; + } + + if (dentry) { + if (!dentry->d_name.len) { + *err = -EINVAL; + goto out_err; + } + namelen = udf_put_filename(sb, dentry->d_name.name, name, + dentry->d_name.len); + if (!namelen) { + *err = -ENAMETOOLONG; + goto out_err; + } + } else { + namelen = 0; + } + + nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3; + + f_pos = udf_ext0_offset(dir); + + fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + dinfo = UDF_I(dir); + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, + &dinfo->i_location, 0); + fibh->soffset = fibh->eoffset = sb->s_blocksize; + goto add; + } + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) { + *err = -EIO; + goto out_err; + } + + block = dinfo->i_location.logicalBlockNum; + } + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, + &elen, &offset); + + if (!fi) { + *err = -EIO; + goto out_err; + } + + liu = le16_to_cpu(cfi->lengthOfImpUse); + lfi = cfi->lengthFileIdent; + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (((sizeof(struct fileIdentDesc) + + liu + lfi + 3) & ~3) == nfidlen) { + cfi->descTag.tagSerialNum = cpu_to_le16(1); + cfi->fileVersionNum = cpu_to_le16(1); + cfi->fileCharacteristics = 0; + cfi->lengthFileIdent = namelen; + cfi->lengthOfImpUse = 
cpu_to_le16(0); + if (!udf_write_fi(dir, cfi, fi, fibh, NULL, + name)) + goto out_ok; + else { + *err = -EIO; + goto out_err; + } + } + } + } + +add: + f_pos += nfidlen; + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && + sb->s_blocksize - fibh->eoffset < nfidlen) { + brelse(epos.bh); + epos.bh = NULL; + fibh->soffset -= udf_ext0_offset(dir); + fibh->eoffset -= udf_ext0_offset(dir); + f_pos -= udf_ext0_offset(dir); + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); + fibh->sbh = fibh->ebh = + udf_expand_dir_adinicb(dir, &block, err); + if (!fibh->sbh) + goto out_err; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); + /* Load extent udf_expand_dir_adinicb() has created */ + udf_current_aext(dir, &epos, &eloc, &elen, 1); + } + + /* Entry fits into current block? */ + if (sb->s_blocksize - fibh->eoffset >= nfidlen) { + fibh->soffset = fibh->eoffset; + fibh->eoffset += nfidlen; + if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + block = dinfo->i_location.logicalBlockNum; + fi = (struct fileIdentDesc *) + (dinfo->i_ext.i_data + + fibh->soffset - + udf_ext0_offset(dir) + + dinfo->i_lenEAttr); + } else { + block = eloc.logicalBlockNum + + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + fi = (struct fileIdentDesc *) + (fibh->sbh->b_data + fibh->soffset); + } + } else { + /* Round up last extent in the file */ + elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize + - 1) & ~(sb->s_blocksize - 1); + + fibh->soffset = fibh->eoffset - sb->s_blocksize; + fibh->eoffset += nfidlen - sb->s_blocksize; + if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + block = eloc.logicalBlockNum + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + fibh->ebh = udf_bread(dir, + f_pos >> dir->i_sb->s_blocksize_bits, 1, err); + if (!fibh->ebh) + goto out_err; + /* Extents could have been merged, invalidate our position */ + brelse(epos.bh); + epos.bh = NULL; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); + + if (!fibh->soffset) { + /* Find the freshly allocated block */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + block = eloc.logicalBlockNum + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + fi = (struct fileIdentDesc *)(fibh->sbh->b_data); + } else { + fi = (struct fileIdentDesc *) + (fibh->sbh->b_data + sb->s_blocksize + + fibh->soffset); + } + } + + memset(cfi, 0, sizeof(struct fileIdentDesc)); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, + sizeof(struct tag)); + else + udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, + sizeof(struct tag)); + cfi->fileVersionNum = cpu_to_le16(1); + cfi->lengthFileIdent = namelen; + cfi->lengthOfImpUse = cpu_to_le16(0); + if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { + dir->i_size += nfidlen; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + dinfo->i_lenAlloc += nfidlen; + else { + /* Find the last extent and truncate it to proper size */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + 
(EXT_RECORDED_ALLOCATED >> 30)) + ; + elen -= dinfo->i_lenExtents - dir->i_size; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = dir->i_size; + } + + mark_inode_dirty(dir); + goto out_ok; + } else { + *err = -EIO; + goto out_err; + } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + brelse(epos.bh); + kfree(name); + return fi; +} + +static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi) +{ + cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); + + return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); +} + +static int udf_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct udf_fileident_bh fibh; + struct inode *inode; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *iinfo; + + inode = udf_new_inode(dir, mode, &err); + if (!inode) { + return err; + } + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink--; + mark_inode_dirty(inode); + iput(inode); + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + d_instantiate(dentry, inode); + + return 0; +} + +static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *iinfo; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + err = -EIO; + inode = udf_new_inode(dir, mode, &err); + if (!inode) + goto out; + + iinfo = UDF_I(inode); + init_special_inode(inode, mode, rdev); + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink--; + mark_inode_dirty(inode); + iput(inode); + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + mark_inode_dirty(inode); + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + d_instantiate(dentry, inode); + err = 0; + +out: + return err; +} + +static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *dinfo = UDF_I(dir); + struct udf_inode_info *iinfo; + + 
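+ /* A new directory is populated with two FIDs: a parent ('..') entry
+  * written into the new directory itself, then the named entry in the
+  * parent; the child starts with nlink == 2 and the parent's link count
+  * is bumped for the back-pointer.
+  */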
err = -EMLINK; + if (dir->i_nlink >= UDF_MAX_LINKS) + goto out; + + err = -EIO; + inode = udf_new_inode(dir, S_IFDIR | mode, &err); + if (!inode) + goto out; + + iinfo = UDF_I(inode); + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink--; + mark_inode_dirty(inode); + iput(inode); + goto out; + } + inode->i_nlink = 2; + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); + cfi.fileCharacteristics = + FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; + udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); + brelse(fibh.sbh); + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink = 0; + mark_inode_dirty(inode); + iput(inode); + goto out; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + inc_nlink(dir); + mark_inode_dirty(dir); + d_instantiate(dentry, inode); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + err = 0; + +out: + return err; +} + +static int empty_dir(struct inode *dir) +{ + struct fileIdentDesc *fi, cfi; + struct udf_fileident_bh fibh; + loff_t f_pos; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + int block; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo = UDF_I(dir); + + f_pos = udf_ext0_offset(dir); + fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + fibh.sbh = fibh.ebh = NULL; + else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) == + (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block); + if (!fibh.sbh) { + brelse(epos.bh); + return 0; + } + } else { + brelse(epos.bh); + return 0; + } + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, + &elen, &offset); + if (!fi) { + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + return 0; + } + + if (cfi.lengthFileIdent && + (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) { + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + return 0; + } + } + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + + return 1; +} + +static int udf_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode = dentry->d_inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc *fi, cfi; + struct kernel_lb_addr tloc; + + retval = -ENOENT; + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (!fi) + goto out; + + retval = -EIO; + tloc = 
lelb_to_cpu(cfi.icb.extLocation); + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) + goto end_rmdir; + retval = -ENOTEMPTY; + if (!empty_dir(inode)) + goto end_rmdir; + retval = udf_delete_entry(dir, fi, &fibh, &cfi); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + udf_warning(inode->i_sb, "udf_rmdir", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); + clear_nlink(inode); + inode->i_size = 0; + inode_dec_link_count(dir); + inode->i_ctime = dir->i_ctime = dir->i_mtime = + current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + +end_rmdir: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + +out: + return retval; +} + +static int udf_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode = dentry->d_inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc *fi; + struct fileIdentDesc cfi; + struct kernel_lb_addr tloc; + + retval = -ENOENT; + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (!fi) + goto out; + + retval = -EIO; + tloc = lelb_to_cpu(cfi.icb.extLocation); + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + udf_debug("Deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = udf_delete_entry(dir, fi, &fibh, &cfi); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + inode_dec_link_count(inode); + inode->i_ctime = dir->i_ctime; + retval = 0; + +end_unlink: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + +out: + return retval; +} + +static int udf_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct pathComponent *pc; + const char *compstart; + struct udf_fileident_bh fibh; + struct extent_position epos = {}; + int eoffset, elen = 0; + struct fileIdentDesc *fi; + struct fileIdentDesc cfi; + uint8_t *ea; + int err; + int block; + unsigned char *name = NULL; + int namelen; + struct udf_inode_info *iinfo; + struct super_block *sb = dir->i_sb; + + inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); + if (!inode) + goto out; + + iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto out_no_entry; + } + + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &udf_symlink_inode_operations; + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + struct kernel_lb_addr eloc; + uint32_t bsize; + + block = udf_new_block(sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, &err); + if (!block) + goto out_no_entry; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + epos.bh = NULL; + eloc.logicalBlockNum = block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + bsize = sb->s_blocksize; + iinfo->i_lenExtents = bsize; + udf_add_aext(inode, &epos, &eloc, bsize, 0); + brelse(epos.bh); + + block = udf_get_pblock(sb, block, + iinfo->i_location.partitionReferenceNum, + 0); + epos.bh = udf_tgetblk(sb, block); + lock_buffer(epos.bh); + memset(epos.bh->b_data, 0x00, bsize); + set_buffer_uptodate(epos.bh); + unlock_buffer(epos.bh); + mark_buffer_dirty_inode(epos.bh, inode); + ea = epos.bh->b_data + udf_ext0_offset(inode); + } else + ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + + eoffset = sb->s_blocksize - udf_ext0_offset(inode); + pc = (struct pathComponent *)ea; + 
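+ /* The symlink target is translated into UDF pathComponent records:
+  * type 1 marks an absolute path, type 3 is '..', type 4 is '.', and
+  * type 5 carries an ordinary component name (encoded by
+  * udf_put_filename() below).
+  */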
+ if (*symname == '/') { + do { + symname++; + } while (*symname == '/'); + + pc->componentType = 1; + pc->lengthComponentIdent = 0; + pc->componentFileVersionNum = 0; + elen += sizeof(struct pathComponent); + } + + err = -ENAMETOOLONG; + + while (*symname) { + if (elen + sizeof(struct pathComponent) > eoffset) + goto out_no_entry; + + pc = (struct pathComponent *)(ea + elen); + + compstart = symname; + + do { + symname++; + } while (*symname && *symname != '/'); + + pc->componentType = 5; + pc->lengthComponentIdent = 0; + pc->componentFileVersionNum = 0; + if (compstart[0] == '.') { + if ((symname - compstart) == 1) + pc->componentType = 4; + else if ((symname - compstart) == 2 && + compstart[1] == '.') + pc->componentType = 3; + } + + if (pc->componentType == 5) { + namelen = udf_put_filename(sb, compstart, name, + symname - compstart); + if (!namelen) + goto out_no_entry; + + if (elen + sizeof(struct pathComponent) + namelen > + eoffset) + goto out_no_entry; + else + pc->lengthComponentIdent = namelen; + + memcpy(pc->componentIdent, name, namelen); + } + + elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; + + if (*symname) { + do { + symname++; + } while (*symname == '/'); + } + } + + brelse(epos.bh); + inode->i_size = elen; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + iinfo->i_lenAlloc = inode->i_size; + else + udf_truncate_tail_extent(inode); + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) + goto out_no_entry; + cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(lvid_get_unique_id(sb)); + } + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + up_write(&iinfo->i_data_sem); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + d_instantiate(dentry, inode); + err = 0; + +out: + kfree(name); + return err; + +out_no_entry: + up_write(&iinfo->i_data_sem); + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int udf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + + if (inode->i_nlink >= UDF_MAX_LINKS) + return -EMLINK; + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(lvid_get_unique_id(inode->i_sb)); + } + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + inc_nlink(inode); + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty(inode); + ihold(inode); + d_instantiate(dentry, inode); + + return 0; +} + +/* Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. 
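+ * Note that when a directory is moved to a new parent, the code below also
+ * re-points the directory's ".." file identifier (dir_fi) at the new parent
+ * and adjusts both parents' link counts, after checking that any existing
+ * target is an empty directory and that the new parent can take another link.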
+ */ +static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct udf_fileident_bh ofibh, nfibh; + struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; + struct fileIdentDesc ocfi, ncfi; + struct buffer_head *dir_bh = NULL; + int retval = -ENOENT; + struct kernel_lb_addr tloc; + struct udf_inode_info *old_iinfo = UDF_I(old_inode); + + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); + if (ofi) { + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + brelse(ofibh.sbh); + } + tloc = lelb_to_cpu(ocfi.icb.extLocation); + if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) + != old_inode->i_ino) + goto end_rename; + + nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); + if (nfi) { + if (!new_inode) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + nfi = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + int offset = udf_ext0_offset(old_inode); + + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir(new_inode)) + goto end_rename; + } + retval = -EIO; + if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + dir_fi = udf_get_fileident( + old_iinfo->i_ext.i_data - + (old_iinfo->i_efe ? + sizeof(struct extendedFileEntry) : + sizeof(struct fileEntry)), + old_inode->i_sb->s_blocksize, &offset); + } else { + dir_bh = udf_bread(old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + dir_fi = udf_get_fileident(dir_bh->b_data, + old_inode->i_sb->s_blocksize, &offset); + } + if (!dir_fi) + goto end_rename; + tloc = lelb_to_cpu(dir_fi->icb.extLocation); + if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != + old_dir->i_ino) + goto end_rename; + + retval = -EMLINK; + if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS) + goto end_rename; + } + if (!nfi) { + nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, + &retval); + if (!nfi) + goto end_rename; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. 
+ */ + old_inode->i_ctime = current_fs_time(old_inode->i_sb); + mark_inode_dirty(old_inode); + + /* + * ok, that's it + */ + ncfi.fileVersionNum = ocfi.fileVersionNum; + ncfi.fileCharacteristics = ocfi.fileCharacteristics; + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); + udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); + + /* The old fid may have moved - find it again */ + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); + udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); + + if (new_inode) { + new_inode->i_ctime = current_fs_time(new_inode->i_sb); + inode_dec_link_count(new_inode); + } + old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb); + mark_inode_dirty(old_dir); + + if (dir_fi) { + dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); + udf_update_tag((char *)dir_fi, + (sizeof(struct fileIdentDesc) + + le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3); + if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(old_inode); + else + mark_buffer_dirty_inode(dir_bh, old_inode); + + inode_dec_link_count(old_dir); + if (new_inode) + inode_dec_link_count(new_inode); + else { + inc_nlink(new_dir); + mark_inode_dirty(new_dir); + } + } + + if (ofi) { + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + brelse(ofibh.sbh); + } + + retval = 0; + +end_rename: + brelse(dir_bh); + if (nfi) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + } + + return retval; +} + +static struct dentry *udf_get_parent(struct dentry *child) +{ + struct kernel_lb_addr tloc; + struct inode *inode = NULL; + struct qstr dotdot = {.name = "..", .len = 2}; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + goto out_unlock; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + tloc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(child->d_inode->i_sb, &tloc); + if (!inode) + goto out_unlock; + + return d_obtain_alias(inode); +out_unlock: + return ERR_PTR(-EACCES); +} + + +static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, + u16 partref, __u32 generation) +{ + struct inode *inode; + struct kernel_lb_addr loc; + + if (block == 0) + return ERR_PTR(-ESTALE); + + loc.logicalBlockNum = block; + loc.partitionReferenceNum = partref; + inode = udf_iget(sb, &loc); + + if (inode == NULL) + return ERR_PTR(-ENOMEM); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *udf_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_len != 3 && fh_len != 5) || + (fh_type != FILEID_UDF_WITH_PARENT && + fh_type != FILEID_UDF_WITHOUT_PARENT)) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref, + fid->udf.generation); +} + +static struct dentry *udf_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.parent_block, + fid->udf.parent_partref, + fid->udf.parent_generation); +} +static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, + int connectable) +{ + int len = *lenp; + struct inode *inode = de->d_inode; + struct kernel_lb_addr location = UDF_I(inode)->i_location; + struct fid *fid = (struct fid *)fh; + int type = FILEID_UDF_WITHOUT_PARENT; + + if (connectable && (len < 5)) { + *lenp = 5; + 
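+		/*
+		 * Note: the handle does not fit in the caller's buffer; *lenp
+		 * has been set to the required size (in 32-bit words) and the
+		 * 255 return value reports the overflow.  A plain handle takes
+		 * 3 words (block, partition reference, generation), a
+		 * connectable one 5.
+		 */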
return 255; + } else if (len < 3) { + *lenp = 3; + return 255; + } + + *lenp = 3; + fid->udf.block = location.logicalBlockNum; + fid->udf.partref = location.partitionReferenceNum; + fid->udf.generation = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + spin_lock(&de->d_lock); + inode = de->d_parent->d_inode; + location = UDF_I(inode)->i_location; + fid->udf.parent_block = location.logicalBlockNum; + fid->udf.parent_partref = location.partitionReferenceNum; + fid->udf.parent_generation = inode->i_generation; + spin_unlock(&de->d_lock); + *lenp = 5; + type = FILEID_UDF_WITH_PARENT; + } + + return type; +} + +const struct export_operations udf_export_ops = { + .encode_fh = udf_encode_fh, + .fh_to_dentry = udf_fh_to_dentry, + .fh_to_parent = udf_fh_to_parent, + .get_parent = udf_get_parent, +}; + +const struct inode_operations udf_dir_inode_operations = { + .lookup = udf_lookup, + .create = udf_create, + .link = udf_link, + .unlink = udf_unlink, + .symlink = udf_symlink, + .mkdir = udf_mkdir, + .rmdir = udf_rmdir, + .mknod = udf_mknod, + .rename = udf_rename, +}; +const struct inode_operations udf_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h new file mode 100644 index 00000000..fbff7465 --- /dev/null +++ b/fs/udf/osta_udf.h @@ -0,0 +1,279 @@ +/* + * osta_udf.h + * + * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003) + * http://www.osta.org + * + * Copyright (c) 2001-2004 Ben Fennema + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "ecma_167.h" + +#ifndef _OSTA_UDF_H +#define _OSTA_UDF_H 1 + +/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */ +#define UDF_CHAR_SET_TYPE 0 +#define UDF_CHAR_SET_INFO "OSTA Compressed Unicode" + +/* Entity Identifier (UDF 2.50 2.1.5) */ +/* Identifiers (UDF 2.50 2.1.5.2) */ +#define UDF_ID_DEVELOPER "*Linux UDFFS" +#define UDF_ID_COMPLIANT "*OSTA UDF Compliant" +#define UDF_ID_LV_INFO "*UDF LV Info" +#define UDF_ID_FREE_EA "*UDF FreeEASpace" +#define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace" +#define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info" +#define UDF_ID_OS2_EA "*UDF OS/2 EA" +#define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength" +#define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo" +#define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo" +#define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable" +#define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork" +#define UDF_ID_VIRTUAL "*UDF Virtual Partition" +#define UDF_ID_SPARABLE "*UDF Sparable Partition" +#define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl" +#define UDF_ID_SPARING "*UDF Sparing Table" +#define UDF_ID_METADATA "*UDF Metadata Partition" + +/* Identifier Suffix (UDF 2.50 2.1.5.3) */ +#define IS_DF_HARD_WRITE_PROTECT 0x01 +#define IS_DF_SOFT_WRITE_PROTECT 0x02 + +struct UDFIdentSuffix { + __le16 UDFRevision; + uint8_t OSClass; + uint8_t OSIdentifier; + uint8_t reserved[4]; +} __attribute__ ((packed)); + +struct impIdentSuffix { + uint8_t OSClass; + uint8_t OSIdentifier; + uint8_t reserved[6]; +} __attribute__ ((packed)); + +struct appIdentSuffix { + uint8_t impUse[8]; +} __attribute__ ((packed)); + +/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ +/* Implementation Use (UDF 2.50 2.2.6.4) */ +struct logicalVolIntegrityDescImpUse { + struct regid impIdent; + __le32 numFiles; + __le32 numDirs; + __le16 minUDFReadRev; + __le16 minUDFWriteRev; + __le16 maxUDFWriteRev; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ +/* Implementation Use (UDF 2.50 2.2.7.2) */ +struct impUseVolDescImpUse { + struct charspec LVICharset; + dstring logicalVolIdent[128]; + dstring LVInfo1[36]; + dstring LVInfo2[36]; + dstring LVInfo3[36]; + struct regid impIdent; + uint8_t impUse[128]; +} __attribute__ ((packed)); + +struct udfPartitionMap2 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; +} __attribute__ ((packed)); + +/* Virtual Partition Map (UDF 2.50 2.2.8) */ +struct virtualPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + uint8_t reserved2[24]; +} __attribute__ ((packed)); + +/* Sparable Partition Map (UDF 2.50 2.2.9) */ +struct sparablePartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + __le16 packetLength; + uint8_t numSparingTables; + uint8_t reserved2[1]; + __le32 sizeSparingTable; + __le32 locSparingTable[4]; +} __attribute__ ((packed)); + +/* Metadata Partition Map (UDF 2.4.0 2.2.10) */ +struct metadataPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + __le32 metadataFileLoc; + __le32 metadataMirrorFileLoc; + __le32 metadataBitmapFileLoc; + __le32 allocUnitSize; + __le16 alignUnitSize; + uint8_t flags; + uint8_t reserved2[5]; +} __attribute__ 
((packed)); + +/* Virtual Allocation Table (UDF 1.5 2.2.10) */ +struct virtualAllocationTable15 { + __le32 VirtualSector[0]; + struct regid vatIdent; + __le32 previousVATICBLoc; +} __attribute__ ((packed)); + +#define ICBTAG_FILE_TYPE_VAT15 0x00U + +/* Virtual Allocation Table (UDF 2.50 2.2.11) */ +struct virtualAllocationTable20 { + __le16 lengthHeader; + __le16 lengthImpUse; + dstring logicalVolIdent[128]; + __le32 previousVATICBLoc; + __le32 numFiles; + __le32 numDirs; + __le16 minReadRevision; + __le16 minWriteRevision; + __le16 maxWriteRevision; + __le16 reserved; + uint8_t impUse[0]; + __le32 vatEntry[0]; +} __attribute__ ((packed)); + +#define ICBTAG_FILE_TYPE_VAT20 0xF8U + +/* Sparing Table (UDF 2.50 2.2.12) */ +struct sparingEntry { + __le32 origLocation; + __le32 mappedLocation; +} __attribute__ ((packed)); + +struct sparingTable { + struct tag descTag; + struct regid sparingIdent; + __le16 reallocationTableLen; + __le16 reserved; + __le32 sequenceNum; + struct sparingEntry + mapEntry[0]; +} __attribute__ ((packed)); + +/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ +#define ICBTAG_FILE_TYPE_MAIN 0xFA +#define ICBTAG_FILE_TYPE_MIRROR 0xFB +#define ICBTAG_FILE_TYPE_BITMAP 0xFC + +/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +struct allocDescImpUse { + __le16 flags; + uint8_t impUse[4]; +} __attribute__ ((packed)); + +#define AD_IU_EXT_ERASED 0x0001 + +/* Real-Time Files (UDF 2.50 6.11) */ +#define ICBTAG_FILE_TYPE_REALTIME 0xF9U + +/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */ +/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */ +struct freeEaSpace { + __le16 headerChecksum; + uint8_t freeEASpace[0]; +} __attribute__ ((packed)); + +/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ +struct DVDCopyrightImpUse { + __le16 headerChecksum; + uint8_t CGMSInfo; + uint8_t dataType; + uint8_t protectionSystemInfo[4]; +} __attribute__ ((packed)); + +/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ +/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ +struct freeAppEASpace { + __le16 headerChecksum; + uint8_t freeEASpace[0]; +} __attribute__ ((packed)); + +/* UDF Defined System Stream (UDF 2.50 3.3.7) */ +#define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" +#define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space" +#define UDF_ID_POWER_CAL "*UDF Power Cal Table" +#define UDF_ID_BACKUP "*UDF Backup" + +/* Operating System Identifiers (UDF 2.50 6.3) */ +#define UDF_OS_CLASS_UNDEF 0x00U +#define UDF_OS_CLASS_DOS 0x01U +#define UDF_OS_CLASS_OS2 0x02U +#define UDF_OS_CLASS_MAC 0x03U +#define UDF_OS_CLASS_UNIX 0x04U +#define UDF_OS_CLASS_WIN9X 0x05U +#define UDF_OS_CLASS_WINNT 0x06U +#define UDF_OS_CLASS_OS400 0x07U +#define UDF_OS_CLASS_BEOS 0x08U +#define UDF_OS_CLASS_WINCE 0x09U + +#define UDF_OS_ID_UNDEF 0x00U +#define UDF_OS_ID_DOS 0x00U +#define UDF_OS_ID_OS2 0x00U +#define UDF_OS_ID_MAC 0x00U +#define UDF_OS_ID_MAX_OSX 0x01U +#define UDF_OS_ID_UNIX 0x00U +#define UDF_OS_ID_AIX 0x01U +#define UDF_OS_ID_SOLARIS 0x02U +#define UDF_OS_ID_HPUX 0x03U +#define UDF_OS_ID_IRIX 0x04U +#define UDF_OS_ID_LINUX 0x05U +#define UDF_OS_ID_MKLINUX 0x06U +#define UDF_OS_ID_FREEBSD 0x07U +#define UDF_OS_ID_WIN9X 0x00U +#define UDF_OS_ID_WINNT 0x00U +#define UDF_OS_ID_OS400 0x00U +#define UDF_OS_ID_BEOS 0x00U +#define UDF_OS_ID_WINCE 0x00U + +#endif /* _OSTA_UDF_H */ diff --git a/fs/udf/partition.c b/fs/udf/partition.c new file mode 100644 index 00000000..a71090ea --- /dev/null +++ b/fs/udf/partition.c @@ -0,0 +1,334 @@ +/* + * partition.c + * + * 
PURPOSE + * Partition handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * + * HISTORY + * + * 12/06/98 blf Created file. + * + */ + +#include "udfdecl.h" +#include "udf_sb.h" +#include "udf_i.h" + +#include +#include +#include +#include + +uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + if (partition >= sbi->s_partitions) { + udf_debug("block=%d, partition=%d, offset=%d: " + "invalid partition\n", block, partition, offset); + return 0xFFFFFFFF; + } + map = &sbi->s_partmaps[partition]; + if (map->s_partition_func) + return map->s_partition_func(sb, block, partition, offset); + else + return map->s_partition_root + block + offset; +} + +uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct buffer_head *bh = NULL; + uint32_t newblock; + uint32_t index; + uint32_t loc; + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_virtual_data *vdata; + struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); + + map = &sbi->s_partmaps[partition]; + vdata = &map->s_type_specific.s_virtual; + + if (block > vdata->s_num_entries) { + udf_debug("Trying to access block beyond end of VAT " + "(%d max %d)\n", block, vdata->s_num_entries); + return 0xFFFFFFFF; + } + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data + + vdata->s_start_offset))[block]); + goto translate; + } + index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t); + if (block >= index) { + block -= index; + newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t))); + index = block % (sb->s_blocksize / sizeof(uint32_t)); + } else { + newblock = 0; + index = vdata->s_start_offset / sizeof(uint32_t) + block; + } + + loc = udf_block_map(sbi->s_vat_inode, newblock); + + bh = sb_bread(sb, loc); + if (!bh) { + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%d,%d) VAT: %d[%d]\n", + sb, block, partition, loc, index); + return 0xFFFFFFFF; + } + + loc = le32_to_cpu(((__le32 *)bh->b_data)[index]); + + brelse(bh); + +translate: + if (iinfo->i_location.partitionReferenceNum == partition) { + udf_debug("recursive call to udf_get_pblock!\n"); + return 0xFFFFFFFF; + } + + return udf_get_pblock(sb, loc, + iinfo->i_location.partitionReferenceNum, + offset); +} + +inline uint32_t udf_get_pblock_virt20(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + return udf_get_pblock_virt15(sb, block, partition, offset); +} + +uint32_t udf_get_pblock_spar15(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + int i; + struct sparingTable *st = NULL; + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + uint32_t packet; + struct udf_sparing_data *sdata; + + map = &sbi->s_partmaps[partition]; + sdata = &map->s_type_specific.s_sparing; + packet = (block + offset) & ~(sdata->s_packet_len - 1); + + for (i = 0; i < 4; i++) { + if (sdata->s_spar_map[i] != NULL) { + st = (struct sparingTable *) + sdata->s_spar_map[i]->b_data; + break; + } + } + + if (st) { + for (i = 0; i < le16_to_cpu(st->reallocationTableLen); i++) { + struct 
sparingEntry *entry = &st->mapEntry[i]; + u32 origLoc = le32_to_cpu(entry->origLocation); + if (origLoc >= 0xFFFFFFF0) + break; + else if (origLoc == packet) + return le32_to_cpu(entry->mappedLocation) + + ((block + offset) & + (sdata->s_packet_len - 1)); + else if (origLoc > packet) + break; + } + } + + return map->s_partition_root + block + offset; +} + +int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) +{ + struct udf_sparing_data *sdata; + struct sparingTable *st = NULL; + struct sparingEntry mapEntry; + uint32_t packet; + int i, j, k, l; + struct udf_sb_info *sbi = UDF_SB(sb); + u16 reallocationTableLen; + struct buffer_head *bh; + int ret = 0; + + mutex_lock(&sbi->s_alloc_mutex); + for (i = 0; i < sbi->s_partitions; i++) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + if (old_block > map->s_partition_root && + old_block < map->s_partition_root + map->s_partition_len) { + sdata = &map->s_type_specific.s_sparing; + packet = (old_block - map->s_partition_root) & + ~(sdata->s_packet_len - 1); + + for (j = 0; j < 4; j++) + if (sdata->s_spar_map[j] != NULL) { + st = (struct sparingTable *) + sdata->s_spar_map[j]->b_data; + break; + } + + if (!st) { + ret = 1; + goto out; + } + + reallocationTableLen = + le16_to_cpu(st->reallocationTableLen); + for (k = 0; k < reallocationTableLen; k++) { + struct sparingEntry *entry = &st->mapEntry[k]; + u32 origLoc = le32_to_cpu(entry->origLocation); + + if (origLoc == 0xFFFFFFFF) { + for (; j < 4; j++) { + int len; + bh = sdata->s_spar_map[j]; + if (!bh) + continue; + + st = (struct sparingTable *) + bh->b_data; + entry->origLocation = + cpu_to_le32(packet); + len = + sizeof(struct sparingTable) + + reallocationTableLen * + sizeof(struct sparingEntry); + udf_update_tag((char *)st, len); + mark_buffer_dirty(bh); + } + *new_block = le32_to_cpu( + entry->mappedLocation) + + ((old_block - + map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } else if (origLoc == packet) { + *new_block = le32_to_cpu( + entry->mappedLocation) + + ((old_block - + map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } else if (origLoc > packet) + break; + } + + for (l = k; l < reallocationTableLen; l++) { + struct sparingEntry *entry = &st->mapEntry[l]; + u32 origLoc = le32_to_cpu(entry->origLocation); + + if (origLoc != 0xFFFFFFFF) + continue; + + for (; j < 4; j++) { + bh = sdata->s_spar_map[j]; + if (!bh) + continue; + + st = (struct sparingTable *)bh->b_data; + mapEntry = st->mapEntry[l]; + mapEntry.origLocation = + cpu_to_le32(packet); + memmove(&st->mapEntry[k + 1], + &st->mapEntry[k], + (l - k) * + sizeof(struct sparingEntry)); + st->mapEntry[k] = mapEntry; + udf_update_tag((char *)st, + sizeof(struct sparingTable) + + reallocationTableLen * + sizeof(struct sparingEntry)); + mark_buffer_dirty(bh); + } + *new_block = + le32_to_cpu( + st->mapEntry[k].mappedLocation) + + ((old_block - map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } + + ret = 1; + goto out; + } /* if old_block */ + } + + if (i == sbi->s_partitions) { + /* outside of partitions */ + /* for now, fail =) */ + ret = 1; + } + +out: + mutex_unlock(&sbi->s_alloc_mutex); + return ret; +} + +static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct super_block *sb = inode->i_sb; + struct udf_part_map *map; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t ext_offset; + struct extent_position epos = {}; + uint32_t phyblock; 
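+	/*
+	 * Descriptive note: for a metadata partition the block number is an
+	 * offset into the metadata file, so the extent is looked up with
+	 * inode_bmap() and the resulting location is then translated through
+	 * the underlying sparable/physical partition.  0xFFFFFFFF is returned
+	 * on failure so that udf_get_pblock_meta25() can retry with the
+	 * mirror file.
+	 */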
+ + if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) != + (EXT_RECORDED_ALLOCATED >> 30)) + phyblock = 0xFFFFFFFF; + else { + map = &UDF_SB(sb)->s_partmaps[partition]; + /* map to sparable/physical partition desc */ + phyblock = udf_get_pblock(sb, eloc.logicalBlockNum, + map->s_partition_num, ext_offset + offset); + } + + brelse(epos.bh); + return phyblock; +} + +uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + uint32_t retblk; + struct inode *inode; + + udf_debug("READING from METADATA\n"); + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe; + + /* We shouldn't mount such media... */ + BUG_ON(!inode); + retblk = udf_try_read_meta(inode, block, partition, offset); + if (retblk == 0xFFFFFFFF) { + udf_warning(sb, __func__, "error reading from METADATA, " + "trying to read from MIRROR"); + inode = mdata->s_mirror_fe; + if (!inode) + return 0xFFFFFFFF; + retblk = udf_try_read_meta(inode, block, partition, offset); + } + + return retblk; +} diff --git a/fs/udf/super.c b/fs/udf/super.c new file mode 100644 index 00000000..7f0e18aa --- /dev/null +++ b/fs/udf/super.c @@ -0,0 +1,2324 @@ +/* + * super.c + * + * PURPOSE + * Super block routines for the OSTA-UDF(tm) filesystem. + * + * DESCRIPTION + * OSTA-UDF(tm) = Optical Storage Technology Association + * Universal Disk Format. + * + * This code is based on version 2.00 of the UDF specification, + * and revision 3 of the ECMA 167 standard [equivalent to ISO 13346]. + * http://www.osta.org/ + * http://www.ecma.ch/ + * http://www.iso.org/ + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 2000 Stelias Computing Inc + * + * HISTORY + * + * 09/24/98 dgb changed to allow compiling outside of kernel, and + * added some debugging. + * 10/01/98 dgb updated to allow (some) possibility of compiling w/2.0.34 + * 10/16/98 attempting some multi-session support + * 10/17/98 added freespace count for "df" + * 11/11/98 gr added novrs option + * 11/26/98 dgb added fileset,anchor mount options + * 12/06/98 blf really hosed things royally. vat/sparing support. sequenced + * vol descs. 
rewrote option handling based on isofs + * 12/20/98 find the free space bitmap (if it exists) + */ + +#include "udfdecl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "udf_sb.h" +#include "udf_i.h" + +#include +#include + +#define VDS_POS_PRIMARY_VOL_DESC 0 +#define VDS_POS_UNALLOC_SPACE_DESC 1 +#define VDS_POS_LOGICAL_VOL_DESC 2 +#define VDS_POS_PARTITION_DESC 3 +#define VDS_POS_IMP_USE_VOL_DESC 4 +#define VDS_POS_VOL_DESC_PTR 5 +#define VDS_POS_TERMINATING_DESC 6 +#define VDS_POS_LENGTH 7 + +#define UDF_DEFAULT_BLOCKSIZE 2048 + +static char error_buf[1024]; + +/* These are the "meat" - everything else is stuffing */ +static int udf_fill_super(struct super_block *, void *, int); +static void udf_put_super(struct super_block *); +static int udf_sync_fs(struct super_block *, int); +static int udf_remount_fs(struct super_block *, int *, char *); +static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad); +static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *, + struct kernel_lb_addr *); +static void udf_load_fileset(struct super_block *, struct buffer_head *, + struct kernel_lb_addr *); +static void udf_open_lvid(struct super_block *); +static void udf_close_lvid(struct super_block *); +static unsigned int udf_count_free(struct super_block *); +static int udf_statfs(struct dentry *, struct kstatfs *); +static int udf_show_options(struct seq_file *, struct vfsmount *); +static void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...); + +struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) +{ + struct logicalVolIntegrityDesc *lvid = + (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; + __u32 number_of_partitions = le32_to_cpu(lvid->numOfPartitions); + __u32 offset = number_of_partitions * 2 * + sizeof(uint32_t)/sizeof(uint8_t); + return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); +} + +/* UDF filesystem type */ +static struct dentry *udf_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super); +} + +static struct file_system_type udf_fstype = { + .owner = THIS_MODULE, + .name = "udf", + .mount = udf_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct kmem_cache *udf_inode_cachep; + +static struct inode *udf_alloc_inode(struct super_block *sb) +{ + struct udf_inode_info *ei; + ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + + ei->i_unique = 0; + ei->i_lenExtents = 0; + ei->i_next_alloc_block = 0; + ei->i_next_alloc_goal = 0; + ei->i_strat4096 = 0; + init_rwsem(&ei->i_data_sem); + + return &ei->vfs_inode; +} + +static void udf_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(udf_inode_cachep, UDF_I(inode)); +} + +static void udf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, udf_i_callback); +} + +static void init_once(void *foo) +{ + struct udf_inode_info *ei = (struct udf_inode_info *)foo; + + ei->i_ext.i_data = NULL; + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + udf_inode_cachep = kmem_cache_create("udf_inode_cache", + sizeof(struct udf_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + 
init_once); + if (!udf_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(udf_inode_cachep); +} + +/* Superblock operations */ +static const struct super_operations udf_sb_ops = { + .alloc_inode = udf_alloc_inode, + .destroy_inode = udf_destroy_inode, + .write_inode = udf_write_inode, + .evict_inode = udf_evict_inode, + .put_super = udf_put_super, + .sync_fs = udf_sync_fs, + .statfs = udf_statfs, + .remount_fs = udf_remount_fs, + .show_options = udf_show_options, +}; + +struct udf_options { + unsigned char novrs; + unsigned int blocksize; + unsigned int session; + unsigned int lastblock; + unsigned int anchor; + unsigned int volume; + unsigned short partition; + unsigned int fileset; + unsigned int rootdir; + unsigned int flags; + mode_t umask; + gid_t gid; + uid_t uid; + mode_t fmode; + mode_t dmode; + struct nls_table *nls_map; +}; + +static int __init init_udf_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&udf_fstype); + if (err) + goto out; + + return 0; + +out: + destroy_inodecache(); + +out1: + return err; +} + +static void __exit exit_udf_fs(void) +{ + unregister_filesystem(&udf_fstype); + destroy_inodecache(); +} + +module_init(init_udf_fs) +module_exit(exit_udf_fs) + +static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), + GFP_KERNEL); + if (!sbi->s_partmaps) { + udf_error(sb, __func__, + "Unable to allocate space for %d partition maps", + count); + sbi->s_partitions = 0; + return -ENOMEM; + } + + sbi->s_partitions = count; + return 0; +} + +static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) + seq_puts(seq, ",nostrict"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET)) + seq_printf(seq, ",bs=%lu", sb->s_blocksize); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) + seq_puts(seq, ",unhide"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) + seq_puts(seq, ",undelete"); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB)) + seq_puts(seq, ",noadinicb"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD)) + seq_puts(seq, ",shortad"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) + seq_puts(seq, ",uid=forget"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE)) + seq_puts(seq, ",uid=ignore"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) + seq_puts(seq, ",gid=forget"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) + seq_puts(seq, ",gid=ignore"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) + seq_printf(seq, ",uid=%u", sbi->s_uid); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) + seq_printf(seq, ",gid=%u", sbi->s_gid); + if (sbi->s_umask != 0) + seq_printf(seq, ",umask=%o", sbi->s_umask); + if (sbi->s_fmode != UDF_INVALID_MODE) + seq_printf(seq, ",mode=%o", sbi->s_fmode); + if (sbi->s_dmode != UDF_INVALID_MODE) + seq_printf(seq, ",dmode=%o", sbi->s_dmode); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) + seq_printf(seq, ",session=%u", sbi->s_session); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) + seq_printf(seq, ",lastblock=%u", sbi->s_last_block); + if (sbi->s_anchor != 0) + seq_printf(seq, ",anchor=%u", sbi->s_anchor); + /* + * volume, partition, fileset and rootdir seem to be ignored + * currently + */ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) + seq_puts(seq, ",utf8"); + if (UDF_QUERY_FLAG(sb, 
UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
+		seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset);
+
+	return 0;
+}
+
+/*
+ * udf_parse_options
+ *
+ * PURPOSE
+ *	Parse mount options.
+ *
+ * DESCRIPTION
+ *	The following mount options are supported:
+ *
+ *	gid=		Set the default group.
+ *	umask=		Set the default umask.
+ *	mode=		Set the default file permissions.
+ *	dmode=		Set the default directory permissions.
+ *	uid=		Set the default user.
+ *	bs=		Set the block size.
+ *	unhide		Show otherwise hidden files.
+ *	undelete	Show deleted files in lists.
+ *	adinicb		Embed data in the inode (default)
+ *	noadinicb	Don't embed data in the inode
+ *	shortad		Use short ad's
+ *	longad		Use long ad's (default)
+ *	nostrict	Unset strict conformance
+ *	iocharset=	Set the NLS character set
+ *
+ *	The remaining are for debugging and disaster recovery:
+ *
+ *	novrs		Skip volume sequence recognition
+ *
+ *	The following expect an offset from 0.
+ *
+ *	session=	Set the CDROM session (default= last session)
+ *	anchor=		Override standard anchor location. (default= 256)
+ *	volume=		Override the VolumeDesc location. (unused)
+ *	partition=	Override the PartitionDesc location. (unused)
+ *	lastblock=	Set the last block of the filesystem.
+ *
+ *	The following expect an offset from the partition root.
+ *
+ *	fileset=	Override the fileset block location. (unused)
+ *	rootdir=	Override the root directory location. (unused)
+ *		WARNING: overriding the rootdir to a non-directory may
+ *		yield highly unpredictable results.
+ *
+ * PRE-CONDITIONS
+ *	options		Pointer to mount options string.
+ *	uopts		Pointer to mount options variable.
+ *
+ * POST-CONDITIONS
+ *	1	Mount options parsed okay.
+ *	0	Error parsing mount options.
+ *
+ * HISTORY
+ *	July 1, 1997 - Andrew E. Mileski
+ *	Written, tested, and released.
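+ *
+ *	Example (illustrative only; the device, mount point and option values
+ *	are arbitrary):
+ *
+ *		mount -t udf -o iocharset=utf8,umask=0022,unhide /dev/sr0 /mnt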
+ */ + +enum { + Opt_novrs, Opt_nostrict, Opt_bs, Opt_unhide, Opt_undelete, + Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad, + Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, + Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, + Opt_rootdir, Opt_utf8, Opt_iocharset, + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore, + Opt_fmode, Opt_dmode +}; + +static const match_table_t tokens = { + {Opt_novrs, "novrs"}, + {Opt_nostrict, "nostrict"}, + {Opt_bs, "bs=%u"}, + {Opt_unhide, "unhide"}, + {Opt_undelete, "undelete"}, + {Opt_noadinicb, "noadinicb"}, + {Opt_adinicb, "adinicb"}, + {Opt_shortad, "shortad"}, + {Opt_longad, "longad"}, + {Opt_uforget, "uid=forget"}, + {Opt_uignore, "uid=ignore"}, + {Opt_gforget, "gid=forget"}, + {Opt_gignore, "gid=ignore"}, + {Opt_gid, "gid=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_session, "session=%u"}, + {Opt_lastblock, "lastblock=%u"}, + {Opt_anchor, "anchor=%u"}, + {Opt_volume, "volume=%u"}, + {Opt_partition, "partition=%u"}, + {Opt_fileset, "fileset=%u"}, + {Opt_rootdir, "rootdir=%u"}, + {Opt_utf8, "utf8"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_fmode, "mode=%o"}, + {Opt_dmode, "dmode=%o"}, + {Opt_err, NULL} +}; + +static int udf_parse_options(char *options, struct udf_options *uopt, + bool remount) +{ + char *p; + int option; + + uopt->novrs = 0; + uopt->partition = 0xFFFF; + uopt->session = 0xFFFFFFFF; + uopt->lastblock = 0; + uopt->anchor = 0; + uopt->volume = 0xFFFFFFFF; + uopt->rootdir = 0xFFFFFFFF; + uopt->fileset = 0xFFFFFFFF; + uopt->nls_map = NULL; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_novrs: + uopt->novrs = 1; + break; + case Opt_bs: + if (match_int(&args[0], &option)) + return 0; + uopt->blocksize = option; + uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); + break; + case Opt_unhide: + uopt->flags |= (1 << UDF_FLAG_UNHIDE); + break; + case Opt_undelete: + uopt->flags |= (1 << UDF_FLAG_UNDELETE); + break; + case Opt_noadinicb: + uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB); + break; + case Opt_adinicb: + uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB); + break; + case Opt_shortad: + uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD); + break; + case Opt_longad: + uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD); + break; + case Opt_gid: + if (match_int(args, &option)) + return 0; + uopt->gid = option; + uopt->flags |= (1 << UDF_FLAG_GID_SET); + break; + case Opt_uid: + if (match_int(args, &option)) + return 0; + uopt->uid = option; + uopt->flags |= (1 << UDF_FLAG_UID_SET); + break; + case Opt_umask: + if (match_octal(args, &option)) + return 0; + uopt->umask = option; + break; + case Opt_nostrict: + uopt->flags &= ~(1 << UDF_FLAG_STRICT); + break; + case Opt_session: + if (match_int(args, &option)) + return 0; + uopt->session = option; + if (!remount) + uopt->flags |= (1 << UDF_FLAG_SESSION_SET); + break; + case Opt_lastblock: + if (match_int(args, &option)) + return 0; + uopt->lastblock = option; + if (!remount) + uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET); + break; + case Opt_anchor: + if (match_int(args, &option)) + return 0; + uopt->anchor = option; + break; + case Opt_volume: + if (match_int(args, &option)) + return 0; + uopt->volume = option; + break; + case Opt_partition: + if (match_int(args, &option)) + return 0; + uopt->partition = option; + break; + case Opt_fileset: + if (match_int(args, &option)) + return 0; + 
uopt->fileset = option; + break; + case Opt_rootdir: + if (match_int(args, &option)) + return 0; + uopt->rootdir = option; + break; + case Opt_utf8: + uopt->flags |= (1 << UDF_FLAG_UTF8); + break; +#ifdef CONFIG_UDF_NLS + case Opt_iocharset: + uopt->nls_map = load_nls(args[0].from); + uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + break; +#endif + case Opt_uignore: + uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); + break; + case Opt_uforget: + uopt->flags |= (1 << UDF_FLAG_UID_FORGET); + break; + case Opt_gignore: + uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + break; + case Opt_gforget: + uopt->flags |= (1 << UDF_FLAG_GID_FORGET); + break; + case Opt_fmode: + if (match_octal(args, &option)) + return 0; + uopt->fmode = option & 0777; + break; + case Opt_dmode: + if (match_octal(args, &option)) + return 0; + uopt->dmode = option & 0777; + break; + default: + printk(KERN_ERR "udf: bad mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +static int udf_remount_fs(struct super_block *sb, int *flags, char *options) +{ + struct udf_options uopt; + struct udf_sb_info *sbi = UDF_SB(sb); + int error = 0; + + uopt.flags = sbi->s_flags; + uopt.uid = sbi->s_uid; + uopt.gid = sbi->s_gid; + uopt.umask = sbi->s_umask; + uopt.fmode = sbi->s_fmode; + uopt.dmode = sbi->s_dmode; + + if (!udf_parse_options(options, &uopt, true)) + return -EINVAL; + + write_lock(&sbi->s_cred_lock); + sbi->s_flags = uopt.flags; + sbi->s_uid = uopt.uid; + sbi->s_gid = uopt.gid; + sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; + write_unlock(&sbi->s_cred_lock); + + if (sbi->s_lvid_bh) { + int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); + if (write_rev > UDF_MAX_WRITE_VERSION) + *flags |= MS_RDONLY; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out_unlock; + + if (*flags & MS_RDONLY) + udf_close_lvid(sb); + else + udf_open_lvid(sb); + +out_unlock: + return error; +} + +/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ +/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ +static loff_t udf_check_vsd(struct super_block *sb) +{ + struct volStructDesc *vsd = NULL; + loff_t sector = 32768; + int sectorsize; + struct buffer_head *bh = NULL; + int nsr02 = 0; + int nsr03 = 0; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + if (sb->s_blocksize < sizeof(struct volStructDesc)) + sectorsize = sizeof(struct volStructDesc); + else + sectorsize = sb->s_blocksize; + + sector += (sbi->s_session << sb->s_blocksize_bits); + + udf_debug("Starting at sector %u (%ld byte sectors)\n", + (unsigned int)(sector >> sb->s_blocksize_bits), + sb->s_blocksize); + /* Process the sequence (if applicable) */ + for (; !nsr02 && !nsr03; sector += sectorsize) { + /* Read a block */ + bh = udf_tread(sb, sector >> sb->s_blocksize_bits); + if (!bh) + break; + + /* Look for ISO descriptors */ + vsd = (struct volStructDesc *)(bh->b_data + + (sector & (sb->s_blocksize - 1))); + + if (vsd->stdIdent[0] == 0) { + brelse(bh); + break; + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, + VSD_STD_ID_LEN)) { + switch (vsd->structType) { + case 0: + udf_debug("ISO9660 Boot Record found\n"); + break; + case 1: + udf_debug("ISO9660 Primary Volume Descriptor " + "found\n"); + break; + case 2: + udf_debug("ISO9660 Supplementary Volume " + "Descriptor found\n"); + break; + case 3: + udf_debug("ISO9660 Volume Partition Descriptor " + "found\n"); + break; + case 255: + udf_debug("ISO9660 Volume Descriptor Set " + "Terminator found\n"); + break; + 
default: + udf_debug("ISO9660 VRS (%u) found\n", + vsd->structType); + break; + } + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BEA01, + VSD_STD_ID_LEN)) + ; /* nothing */ + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_TEA01, + VSD_STD_ID_LEN)) { + brelse(bh); + break; + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR02, + VSD_STD_ID_LEN)) + nsr02 = sector; + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03, + VSD_STD_ID_LEN)) + nsr03 = sector; + brelse(bh); + } + + if (nsr03) + return nsr03; + else if (nsr02) + return nsr02; + else if (sector - (sbi->s_session << sb->s_blocksize_bits) == 32768) + return -1; + else + return 0; +} + +static int udf_find_fileset(struct super_block *sb, + struct kernel_lb_addr *fileset, + struct kernel_lb_addr *root) +{ + struct buffer_head *bh = NULL; + long lastblock; + uint16_t ident; + struct udf_sb_info *sbi; + + if (fileset->logicalBlockNum != 0xFFFFFFFF || + fileset->partitionReferenceNum != 0xFFFF) { + bh = udf_read_ptagged(sb, fileset, 0, &ident); + + if (!bh) { + return 1; + } else if (ident != TAG_IDENT_FSD) { + brelse(bh); + return 1; + } + + } + + sbi = UDF_SB(sb); + if (!bh) { + /* Search backwards through the partitions */ + struct kernel_lb_addr newfileset; + +/* --> cvg: FIXME - is it reasonable? */ + return 1; + + for (newfileset.partitionReferenceNum = sbi->s_partitions - 1; + (newfileset.partitionReferenceNum != 0xFFFF && + fileset->logicalBlockNum == 0xFFFFFFFF && + fileset->partitionReferenceNum == 0xFFFF); + newfileset.partitionReferenceNum--) { + lastblock = sbi->s_partmaps + [newfileset.partitionReferenceNum] + .s_partition_len; + newfileset.logicalBlockNum = 0; + + do { + bh = udf_read_ptagged(sb, &newfileset, 0, + &ident); + if (!bh) { + newfileset.logicalBlockNum++; + continue; + } + + switch (ident) { + case TAG_IDENT_SBD: + { + struct spaceBitmapDesc *sp; + sp = (struct spaceBitmapDesc *) + bh->b_data; + newfileset.logicalBlockNum += 1 + + ((le32_to_cpu(sp->numOfBytes) + + sizeof(struct spaceBitmapDesc) + - 1) >> sb->s_blocksize_bits); + brelse(bh); + break; + } + case TAG_IDENT_FSD: + *fileset = newfileset; + break; + default: + newfileset.logicalBlockNum++; + brelse(bh); + bh = NULL; + break; + } + } while (newfileset.logicalBlockNum < lastblock && + fileset->logicalBlockNum == 0xFFFFFFFF && + fileset->partitionReferenceNum == 0xFFFF); + } + } + + if ((fileset->logicalBlockNum != 0xFFFFFFFF || + fileset->partitionReferenceNum != 0xFFFF) && bh) { + udf_debug("Fileset at block=%d, partition=%d\n", + fileset->logicalBlockNum, + fileset->partitionReferenceNum); + + sbi->s_partition = fileset->partitionReferenceNum; + udf_load_fileset(sb, bh, root); + brelse(bh); + return 0; + } + return 1; +} + +static int udf_load_pvoldesc(struct super_block *sb, sector_t block) +{ + struct primaryVolDesc *pvoldesc; + struct ustr *instr, *outstr; + struct buffer_head *bh; + uint16_t ident; + int ret = 1; + + instr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!instr) + return 1; + + outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!outstr) + goto out1; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + goto out2; + + BUG_ON(ident != TAG_IDENT_PVD); + + pvoldesc = (struct primaryVolDesc *)bh->b_data; + + if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, + pvoldesc->recordingDateAndTime)) { +#ifdef UDFFS_DEBUG + struct timestamp *ts = &pvoldesc->recordingDateAndTime; + udf_debug("recording time %04u/%02u/%02u" + " %02u:%02u (%x)\n", + le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, + ts->minute, 
le16_to_cpu(ts->typeAndTimezone)); +#endif + } + + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) + if (udf_CS0toUTF8(outstr, instr)) { + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 31 : outstr->u_len); + udf_debug("volIdent[] = '%s'\n", + UDF_SB(sb)->s_volume_ident); + } + + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) + if (udf_CS0toUTF8(outstr, instr)) + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + + brelse(bh); + ret = 0; +out2: + kfree(outstr); +out1: + kfree(instr); + return ret; +} + +static int udf_load_metadata_files(struct super_block *sb, int partition) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + struct kernel_lb_addr addr; + int fe_error = 0; + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + + /* metadata address */ + addr.logicalBlockNum = mdata->s_meta_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Metadata file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_metadata_fe = udf_iget(sb, &addr); + + if (mdata->s_metadata_fe == NULL) { + udf_warning(sb, __func__, "metadata inode efe not found, " + "will try mirror inode."); + fe_error = 1; + } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != + ICBTAG_FLAG_AD_SHORT) { + udf_warning(sb, __func__, "metadata inode efe does not have " + "short allocation descriptors!"); + fe_error = 1; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + } + + /* mirror file entry */ + addr.logicalBlockNum = mdata->s_mirror_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Mirror metadata file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_mirror_fe = udf_iget(sb, &addr); + + if (mdata->s_mirror_fe == NULL) { + if (fe_error) { + udf_error(sb, __func__, "mirror inode efe not found " + "and metadata inode is missing too, exiting..."); + goto error_exit; + } else + udf_warning(sb, __func__, "mirror inode efe not found," + " but metadata inode is OK"); + } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != + ICBTAG_FLAG_AD_SHORT) { + udf_warning(sb, __func__, "mirror inode efe does not have " + "short allocation descriptors!"); + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + if (fe_error) + goto error_exit; + } + + /* + * bitmap file entry + * Note: + * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102) + */ + if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) { + addr.logicalBlockNum = mdata->s_bitmap_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Bitmap file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_bitmap_fe = udf_iget(sb, &addr); + + if (mdata->s_bitmap_fe == NULL) { + if (sb->s_flags & MS_RDONLY) + udf_warning(sb, __func__, "bitmap inode efe " + "not found but it's ok since the disc" + " is mounted read-only"); + else { + udf_error(sb, __func__, "bitmap inode efe not " + "found and attempted read-write mount"); + goto error_exit; + } + } + } + + udf_debug("udf_load_metadata_files Ok\n"); + + return 0; + +error_exit: + return 1; +} + +static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *root) +{ + struct fileSetDesc *fset; + + fset = (struct fileSetDesc *)bh->b_data; + + *root = lelb_to_cpu(fset->rootDirectoryICB.extLocation); + + UDF_SB(sb)->s_serial_number = 
le16_to_cpu(fset->descTag.tagSerialNum); + + udf_debug("Rootdir at block=%d, partition=%d\n", + root->logicalBlockNum, root->partitionReferenceNum); +} + +int udf_compute_nr_groups(struct super_block *sb, u32 partition) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + return DIV_ROUND_UP(map->s_partition_len + + (sizeof(struct spaceBitmapDesc) << 3), + sb->s_blocksize * 8); +} + +static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) +{ + struct udf_bitmap *bitmap; + int nr_groups; + int size; + + nr_groups = udf_compute_nr_groups(sb, index); + size = sizeof(struct udf_bitmap) + + (sizeof(struct buffer_head *) * nr_groups); + + if (size <= PAGE_SIZE) + bitmap = kzalloc(size, GFP_KERNEL); + else + bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ + + if (bitmap == NULL) { + udf_error(sb, __func__, + "Unable to allocate space for bitmap " + "and %d buffer_head pointers", nr_groups); + return NULL; + } + + bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); + bitmap->s_nr_groups = nr_groups; + return bitmap; +} + +static int udf_fill_partdesc_info(struct super_block *sb, + struct partitionDesc *p, int p_index) +{ + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + struct partitionHeaderDesc *phd; + + map = &sbi->s_partmaps[p_index]; + + map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */ + map->s_partition_root = le32_to_cpu(p->partitionStartingLocation); + + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY)) + map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE)) + map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; + + udf_debug("Partition (%d type %x) starts at physical %d, " + "block length %d\n", p_index, + map->s_partition_type, map->s_partition_root, + map->s_partition_len); + + if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && + strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + return 0; + + phd = (struct partitionHeaderDesc *)p->partitionContentsUse; + if (phd->unallocSpaceTable.extLength) { + struct kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + phd->unallocSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + + map->s_uspace.s_table = udf_iget(sb, &loc); + if (!map->s_uspace.s_table) { + udf_debug("cannot load unallocSpaceTable (part %d)\n", + p_index); + return 1; + } + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; + udf_debug("unallocSpaceTable (part %d) @ %ld\n", + p_index, map->s_uspace.s_table->i_ino); + } + + if (phd->unallocSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return 1; + map->s_uspace.s_bitmap = bitmap; + bitmap->s_extLength = le32_to_cpu( + phd->unallocSpaceBitmap.extLength); + bitmap->s_extPosition = le32_to_cpu( + phd->unallocSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, + bitmap->s_extPosition); + } + + if (phd->partitionIntegrityTable.extLength) + udf_debug("partitionIntegrityTable (part %d)\n", p_index); + + if (phd->freedSpaceTable.extLength) { + struct kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + 
phd->freedSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + + map->s_fspace.s_table = udf_iget(sb, &loc); + if (!map->s_fspace.s_table) { + udf_debug("cannot load freedSpaceTable (part %d)\n", + p_index); + return 1; + } + + map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; + udf_debug("freedSpaceTable (part %d) @ %ld\n", + p_index, map->s_fspace.s_table->i_ino); + } + + if (phd->freedSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return 1; + map->s_fspace.s_bitmap = bitmap; + bitmap->s_extLength = le32_to_cpu( + phd->freedSpaceBitmap.extLength); + bitmap->s_extPosition = le32_to_cpu( + phd->freedSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; + udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, + bitmap->s_extPosition); + } + return 0; +} + +static void udf_find_vat_block(struct super_block *sb, int p_index, + int type1_index, sector_t start_block) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + sector_t vat_block; + struct kernel_lb_addr ino; + + /* + * VAT file entry is in the last recorded block. Some broken disks have + * it a few blocks before so try a bit harder... + */ + ino.partitionReferenceNum = type1_index; + for (vat_block = start_block; + vat_block >= map->s_partition_root && + vat_block >= start_block - 3 && + !sbi->s_vat_inode; vat_block--) { + ino.logicalBlockNum = vat_block - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + struct buffer_head *bh = NULL; + struct udf_inode_info *vati; + uint32_t pos; + struct virtualAllocationTable20 *vat20; + sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); + if (!sbi->s_vat_inode && + sbi->s_last_block != blocks - 1) { + printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" + " last recorded block (%lu), retrying with the last " + "block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); + udf_find_vat_block(sb, p_index, type1_index, blocks - 1); + } + if (!sbi->s_vat_inode) + return 1; + + if (map->s_partition_type == UDF_VIRTUAL_MAP15) { + map->s_type_specific.s_virtual.s_start_offset = 0; + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - 36) >> 2; + } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { + vati = UDF_I(sbi->s_vat_inode); + if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + pos = udf_block_map(sbi->s_vat_inode, 0); + bh = sb_bread(sb, pos); + if (!bh) + return 1; + vat20 = (struct virtualAllocationTable20 *)bh->b_data; + } else { + vat20 = (struct virtualAllocationTable20 *) + vati->i_ext.i_data; + } + + map->s_type_specific.s_virtual.s_start_offset = + le16_to_cpu(vat20->lengthHeader); + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - + map->s_type_specific.s_virtual. 
+ s_start_offset) >> 2; + brelse(bh); + } + return 0; +} + +static int udf_load_partdesc(struct super_block *sb, sector_t block) +{ + struct buffer_head *bh; + struct partitionDesc *p; + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + int i, type1_idx; + uint16_t partitionNumber; + uint16_t ident; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + if (ident != TAG_IDENT_PD) + goto out_bh; + + p = (struct partitionDesc *)bh->b_data; + partitionNumber = le16_to_cpu(p->partitionNumber); + + /* First scan for TYPE1, SPARABLE and METADATA partitions */ + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + udf_debug("Searching map: (%d == %d)\n", + map->s_partition_num, partitionNumber); + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_TYPE1_MAP15 || + map->s_partition_type == UDF_SPARABLE_MAP15)) + break; + } + + if (i >= sbi->s_partitions) { + udf_debug("Partition (%d) not found in partition map\n", + partitionNumber); + goto out_bh; + } + + ret = udf_fill_partdesc_info(sb, p, i); + + /* + * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and + * PHYSICAL partitions are already set up + */ + type1_idx = i; + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_VIRTUAL_MAP15 || + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25)) + break; + } + + if (i >= sbi->s_partitions) + goto out_bh; + + ret = udf_fill_partdesc_info(sb, p, i); + if (ret) + goto out_bh; + + if (map->s_partition_type == UDF_METADATA_MAP25) { + ret = udf_load_metadata_files(sb, i); + if (ret) { + printk(KERN_ERR "UDF-fs: error loading MetaData " + "partition map %d\n", i); + goto out_bh; + } + } else { + ret = udf_load_vat(sb, i, type1_idx); + if (ret) + goto out_bh; + /* + * Mark filesystem read-only if we have a partition with + * virtual map since we don't handle writing to it (we + * overwrite blocks instead of relocating them). 
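+	 * A virtual (VAT) partition typically lives on write-once media
+	 * such as CD-R, where updating a block really means writing a new
+	 * block and remapping it in the VAT; since we do not do that
+	 * remapping, a read-write mount could not be honoured safely.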
+ */ + sb->s_flags |= MS_RDONLY; + printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " + "because writing to pseudooverwrite partition is " + "not implemented.\n"); + } +out_bh: + /* In case loading failed, we handle cleanup in udf_fill_super */ + brelse(bh); + return ret; +} + +static int udf_load_logicalvol(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) +{ + struct logicalVolDesc *lvd; + int i, j, offset; + uint8_t type; + struct udf_sb_info *sbi = UDF_SB(sb); + struct genericPartitionMap *gpm; + uint16_t ident; + struct buffer_head *bh; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; + + i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); + if (i != 0) { + ret = i; + goto out_bh; + } + + for (i = 0, offset = 0; + i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); + i++, offset += gpm->partitionMapLength) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + gpm = (struct genericPartitionMap *) + &(lvd->partitionMaps[offset]); + type = gpm->partitionMapType; + if (type == 1) { + struct genericPartitionMap1 *gpm1 = + (struct genericPartitionMap1 *)gpm; + map->s_partition_type = UDF_TYPE1_MAP15; + map->s_volumeseqnum = le16_to_cpu(gpm1->volSeqNum); + map->s_partition_num = le16_to_cpu(gpm1->partitionNum); + map->s_partition_func = NULL; + } else if (type == 2) { + struct udfPartitionMap2 *upm2 = + (struct udfPartitionMap2 *)gpm; + if (!strncmp(upm2->partIdent.ident, UDF_ID_VIRTUAL, + strlen(UDF_ID_VIRTUAL))) { + u16 suf = + le16_to_cpu(((__le16 *)upm2->partIdent. + identSuffix)[0]); + if (suf < 0x0200) { + map->s_partition_type = + UDF_VIRTUAL_MAP15; + map->s_partition_func = + udf_get_pblock_virt15; + } else { + map->s_partition_type = + UDF_VIRTUAL_MAP20; + map->s_partition_func = + udf_get_pblock_virt20; + } + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_SPARABLE, + strlen(UDF_ID_SPARABLE))) { + uint32_t loc; + struct sparingTable *st; + struct sparablePartitionMap *spm = + (struct sparablePartitionMap *)gpm; + + map->s_partition_type = UDF_SPARABLE_MAP15; + map->s_type_specific.s_sparing.s_packet_len = + le16_to_cpu(spm->packetLength); + for (j = 0; j < spm->numSparingTables; j++) { + struct buffer_head *bh2; + + loc = le32_to_cpu( + spm->locSparingTable[j]); + bh2 = udf_read_tagged(sb, loc, loc, + &ident); + map->s_type_specific.s_sparing. + s_spar_map[j] = bh2; + + if (bh2 == NULL) + continue; + + st = (struct sparingTable *)bh2->b_data; + if (ident != 0 || strncmp( + st->sparingIdent.ident, + UDF_ID_SPARING, + strlen(UDF_ID_SPARING))) { + brelse(bh2); + map->s_type_specific.s_sparing. 
+ s_spar_map[j] = NULL; + } + } + map->s_partition_func = udf_get_pblock_spar15; + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { + struct udf_meta_data *mdata = + &map->s_type_specific.s_metadata; + struct metadataPartitionMap *mdm = + (struct metadataPartitionMap *) + &(lvd->partitionMaps[offset]); + udf_debug("Parsing Logical vol part %d " + "type %d id=%s\n", i, type, + UDF_ID_METADATA); + + map->s_partition_type = UDF_METADATA_MAP25; + map->s_partition_func = udf_get_pblock_meta25; + + mdata->s_meta_file_loc = + le32_to_cpu(mdm->metadataFileLoc); + mdata->s_mirror_file_loc = + le32_to_cpu(mdm->metadataMirrorFileLoc); + mdata->s_bitmap_file_loc = + le32_to_cpu(mdm->metadataBitmapFileLoc); + mdata->s_alloc_unit_size = + le32_to_cpu(mdm->allocUnitSize); + mdata->s_align_unit_size = + le16_to_cpu(mdm->alignUnitSize); + mdata->s_dup_md_flag = + mdm->flags & 0x01; + + udf_debug("Metadata Ident suffix=0x%x\n", + (le16_to_cpu( + ((__le16 *) + mdm->partIdent.identSuffix)[0]))); + udf_debug("Metadata part num=%d\n", + le16_to_cpu(mdm->partitionNum)); + udf_debug("Metadata part alloc unit size=%d\n", + le32_to_cpu(mdm->allocUnitSize)); + udf_debug("Metadata file loc=%d\n", + le32_to_cpu(mdm->metadataFileLoc)); + udf_debug("Mirror file loc=%d\n", + le32_to_cpu(mdm->metadataMirrorFileLoc)); + udf_debug("Bitmap file loc=%d\n", + le32_to_cpu(mdm->metadataBitmapFileLoc)); + udf_debug("Duplicate Flag: %d %d\n", + mdata->s_dup_md_flag, mdm->flags); + } else { + udf_debug("Unknown ident: %s\n", + upm2->partIdent.ident); + continue; + } + map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum); + map->s_partition_num = le16_to_cpu(upm2->partitionNum); + } + udf_debug("Partition (%d:%d) type %d on volume %d\n", + i, map->s_partition_num, type, + map->s_volumeseqnum); + } + + if (fileset) { + struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); + + *fileset = lelb_to_cpu(la->extLocation); + udf_debug("FileSet found in LogicalVolDesc at block=%d, " + "partition=%d\n", fileset->logicalBlockNum, + fileset->partitionReferenceNum); + } + if (lvd->integritySeqExt.extLength) + udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); + +out_bh: + brelse(bh); + return ret; +} + +/* + * udf_load_logicalvolint + * + */ +static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) +{ + struct buffer_head *bh = NULL; + uint16_t ident; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + + while (loc.extLength > 0 && + (bh = udf_read_tagged(sb, loc.extLocation, + loc.extLocation, &ident)) && + ident == TAG_IDENT_LVID) { + sbi->s_lvid_bh = bh; + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + + if (lvid->nextIntegrityExt.extLength) + udf_load_logicalvolint(sb, + leea_to_cpu(lvid->nextIntegrityExt)); + + if (sbi->s_lvid_bh != bh) + brelse(bh); + loc.extLength -= sb->s_blocksize; + loc.extLocation++; + } + if (sbi->s_lvid_bh != bh) + brelse(bh); +} + +/* + * udf_process_sequence + * + * PURPOSE + * Process a main/reserve volume descriptor sequence. + * + * PRE-CONDITIONS + * sb Pointer to _locked_ superblock. + * block First block of first extent of the sequence. + * lastblock Lastblock of first extent of the sequence. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. 
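+ *
+ * NOTES
+ *	The sequence is walked once; for each descriptor type the block of
+ *	the instance with the highest volDescSeqNum is remembered in vds[]
+ *	(for Partition Descriptors the first occurrence is kept). A Volume
+ *	Descriptor Pointer records the location of the next extent, which
+ *	the walk jumps to when it reaches a Terminating Descriptor; a TD
+ *	with no pending pointer ends the scan. The remembered PVD, LVD and
+ *	PDs are then re-read and processed in that order.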
+ */ +static noinline int udf_process_sequence(struct super_block *sb, long block, + long lastblock, struct kernel_lb_addr *fileset) +{ + struct buffer_head *bh = NULL; + struct udf_vds_record vds[VDS_POS_LENGTH]; + struct udf_vds_record *curr; + struct generic_desc *gd; + struct volDescPtr *vdp; + int done = 0; + uint32_t vdsn; + uint16_t ident; + long next_s = 0, next_e = 0; + + memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + + /* + * Read the main descriptor sequence and find which descriptors + * are in it. + */ + for (; (!done && block <= lastblock); block++) { + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) { + printk(KERN_ERR "udf: Block %Lu of volume descriptor " + "sequence is corrupted or we could not read " + "it.\n", (unsigned long long)block); + return 1; + } + + /* Process each descriptor (ISO 13346 3/8.3-8.4) */ + gd = (struct generic_desc *)bh->b_data; + vdsn = le32_to_cpu(gd->volDescSeqNum); + switch (ident) { + case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ + curr = &vds[VDS_POS_PRIMARY_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */ + curr = &vds[VDS_POS_VOL_DESC_PTR]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + + vdp = (struct volDescPtr *)bh->b_data; + next_s = le32_to_cpu( + vdp->nextVolDescSeqExt.extLocation); + next_e = le32_to_cpu( + vdp->nextVolDescSeqExt.extLength); + next_e = next_e >> sb->s_blocksize_bits; + next_e += next_s; + } + break; + case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ + curr = &vds[VDS_POS_IMP_USE_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + curr = &vds[VDS_POS_PARTITION_DESC]; + if (!curr->block) + curr->block = block; + break; + case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ + curr = &vds[VDS_POS_LOGICAL_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ + curr = &vds[VDS_POS_UNALLOC_SPACE_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + vds[VDS_POS_TERMINATING_DESC].block = block; + if (next_e) { + block = next_s; + lastblock = next_e; + next_s = next_e = 0; + } else + done = 1; + break; + } + brelse(bh); + } + /* + * Now read interesting descriptors again and process them + * in a suitable order + */ + if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { + printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + return 1; + } + if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) + return 1; + + if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb, + vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset)) + return 1; + + if (vds[VDS_POS_PARTITION_DESC].block) { + /* + * We rescan the whole descriptor sequence to find + * partition descriptor blocks and process them. 
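+	 * Every block from the first Partition Descriptor found up to, but
+	 * not including, the Terminating Descriptor is handed to
+	 * udf_load_partdesc(), which quietly skips blocks of any other
+	 * descriptor type.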
+ */ + for (block = vds[VDS_POS_PARTITION_DESC].block; + block < vds[VDS_POS_TERMINATING_DESC].block; + block++) + if (udf_load_partdesc(sb, block)) + return 1; + } + + return 0; +} + +static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *fileset) +{ + struct anchorVolDescPtr *anchor; + long main_s, main_e, reserve_s, reserve_e; + + anchor = (struct anchorVolDescPtr *)bh->b_data; + + /* Locate the main sequence */ + main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); + main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); + main_e = main_e >> sb->s_blocksize_bits; + main_e += main_s; + + /* Locate the reserve sequence */ + reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); + reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); + reserve_e = reserve_e >> sb->s_blocksize_bits; + reserve_e += reserve_s; + + /* Process the main & reserve sequences */ + /* responsible for finding the PartitionDesc(s) */ + if (!udf_process_sequence(sb, main_s, main_e, fileset)) + return 1; + return !udf_process_sequence(sb, reserve_s, reserve_e, fileset); +} + +/* + * Check whether there is an anchor block in the given block and + * load Volume Descriptor Sequence if so. + */ +static int udf_check_anchor_block(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) +{ + struct buffer_head *bh; + uint16_t ident; + int ret; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 0; + if (ident != TAG_IDENT_AVDP) { + brelse(bh); + return 0; + } + ret = udf_load_sequence(sb, bh, fileset); + brelse(bh); + return ret; +} + +/* Search for an anchor volume descriptor pointer */ +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, + struct kernel_lb_addr *fileset) +{ + sector_t last[6]; + int i; + struct udf_sb_info *sbi = UDF_SB(sb); + int last_count = 0; + + /* First try user provided anchor */ + if (sbi->s_anchor) { + if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) + return lastblock; + } + /* + * according to spec, anchor is in either: + * block 256 + * lastblock-256 + * lastblock + * however, if the disc isn't closed, it could be 512. + */ + if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) + return lastblock; + /* + * The trouble is which block is the last one. Drives often misreport + * this so we try various possibilities. + */ + last[last_count++] = lastblock; + if (lastblock >= 1) + last[last_count++] = lastblock - 1; + last[last_count++] = lastblock + 1; + if (lastblock >= 2) + last[last_count++] = lastblock - 2; + if (lastblock >= 150) + last[last_count++] = lastblock - 150; + if (lastblock >= 152) + last[last_count++] = lastblock - 152; + + for (i = 0; i < last_count; i++) { + if (last[i] >= sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits) + continue; + if (udf_check_anchor_block(sb, last[i], fileset)) + return last[i]; + if (last[i] < 256) + continue; + if (udf_check_anchor_block(sb, last[i] - 256, fileset)) + return last[i]; + } + + /* Finally try block 512 in case media is open */ + if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) + return last[0]; + return 0; +} + +/* + * Find an anchor volume descriptor and load Volume Descriptor Sequence from + * area specified by it. The function expects sbi->s_lastblock to be the last + * block on the media. 
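+ * udf_scan_anchors() first tries the user-supplied anchor, then block
+ * s_session + 256, then a handful of guesses around the reported last
+ * block (for lastblock == 1000 these are 1000, 999, 1001, 998, 850 and
+ * 848, each probed both as-is and, when large enough, at guess - 256),
+ * and finally s_session + 512 in case the media was left open.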
+ * + * Return 1 if ok, 0 if not found. + * + */ +static int udf_find_anchor(struct super_block *sb, + struct kernel_lb_addr *fileset) +{ + sector_t lastblock; + struct udf_sb_info *sbi = UDF_SB(sb); + + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (lastblock) + goto out; + + /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + /* Firstly, we try to not convert number of the last block */ + lastblock = udf_scan_anchors(sb, + udf_variable_to_fixed(sbi->s_last_block), + fileset); + if (lastblock) + goto out; + + /* Secondly, we try with converted number of the last block */ + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + return 0; + } +out: + sbi->s_last_block = lastblock; + return 1; +} + +/* + * Check Volume Structure Descriptor, find Anchor block and load Volume + * Descriptor Sequence + */ +static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, + int silent, struct kernel_lb_addr *fileset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + loff_t nsr_off; + + if (!sb_set_blocksize(sb, uopt->blocksize)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: Bad block size\n"); + return 0; + } + sbi->s_last_block = uopt->lastblock; + if (!uopt->novrs) { + /* Check that it is NSR02 compliant */ + nsr_off = udf_check_vsd(sb); + if (!nsr_off) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No VRS found\n"); + return 0; + } + if (nsr_off == -1) + udf_debug("Failed to read byte 32768. Assuming open " + "disc. Skipping validity check\n"); + if (!sbi->s_last_block) + sbi->s_last_block = udf_get_last_block(sb); + } else { + udf_debug("Validity check skipped because of novrs option\n"); + } + + /* Look for anchor block and load Volume Descriptor Sequence */ + sbi->s_anchor = uopt->anchor; + if (!udf_find_anchor(sb, fileset)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No anchor found\n"); + return 0; + } + return 1; +} + +static void udf_open_lvid(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + + if (!bh) + return; + + mutex_lock(&sbi->s_alloc_mutex); + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sbi); + + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, + CURRENT_TIME); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); +} + +static void udf_close_lvid(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + + if (!bh) + return; + + mutex_lock(&sbi->s_alloc_mutex); + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sbi); + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); + if (UDF_MAX_WRITE_VERSION > 
le16_to_cpu(lvidiu->maxUDFWriteRev)) + lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) + lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) + lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); +} + +u64 lvid_get_unique_id(struct super_block *sb) +{ + struct buffer_head *bh; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + struct logicalVolHeaderDesc *lvhd; + u64 uniqueID; + u64 ret; + + bh = sbi->s_lvid_bh; + if (!bh) + return 0; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse; + + mutex_lock(&sbi->s_alloc_mutex); + ret = uniqueID = le64_to_cpu(lvhd->uniqueID); + if (!(++uniqueID & 0xFFFFFFFF)) + uniqueID += 16; + lvhd->uniqueID = cpu_to_le64(uniqueID); + mutex_unlock(&sbi->s_alloc_mutex); + mark_buffer_dirty(bh); + + return ret; +} + +static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) +{ + int i; + int nr_groups = bitmap->s_nr_groups; + int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * + nr_groups); + + for (i = 0; i < nr_groups; i++) + if (bitmap->s_block_bitmap[i]) + brelse(bitmap->s_block_bitmap[i]); + + if (size <= PAGE_SIZE) + kfree(bitmap); + else + vfree(bitmap); +} + +static void udf_free_partition(struct udf_part_map *map) +{ + int i; + struct udf_meta_data *mdata; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + iput(map->s_uspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + iput(map->s_fspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + udf_sb_free_bitmap(map->s_uspace.s_bitmap); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + udf_sb_free_bitmap(map->s_fspace.s_bitmap); + if (map->s_partition_type == UDF_SPARABLE_MAP15) + for (i = 0; i < 4; i++) + brelse(map->s_type_specific.s_sparing.s_spar_map[i]); + else if (map->s_partition_type == UDF_METADATA_MAP25) { + mdata = &map->s_type_specific.s_metadata; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + + iput(mdata->s_bitmap_fe); + mdata->s_bitmap_fe = NULL; + } +} + +static int udf_fill_super(struct super_block *sb, void *options, int silent) +{ + int i; + int ret; + struct inode *inode = NULL; + struct udf_options uopt; + struct kernel_lb_addr rootdir, fileset; + struct udf_sb_info *sbi; + + uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); + uopt.uid = -1; + uopt.gid = -1; + uopt.umask = 0; + uopt.fmode = UDF_INVALID_MODE; + uopt.dmode = UDF_INVALID_MODE; + + sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + mutex_init(&sbi->s_alloc_mutex); + + if (!udf_parse_options((char *)options, &uopt, false)) + goto error_out; + + if (uopt.flags & (1 << UDF_FLAG_UTF8) && + uopt.flags & (1 << 
UDF_FLAG_NLS_MAP)) { + udf_error(sb, "udf_read_super", + "utf8 cannot be combined with iocharset\n"); + goto error_out; + } +#ifdef CONFIG_UDF_NLS + if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { + uopt.nls_map = load_nls_default(); + if (!uopt.nls_map) + uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); + else + udf_debug("Using default NLS map\n"); + } +#endif + if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) + uopt.flags |= (1 << UDF_FLAG_UTF8); + + fileset.logicalBlockNum = 0xFFFFFFFF; + fileset.partitionReferenceNum = 0xFFFF; + + sbi->s_flags = uopt.flags; + sbi->s_uid = uopt.uid; + sbi->s_gid = uopt.gid; + sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; + sbi->s_nls_map = uopt.nls_map; + rwlock_init(&sbi->s_cred_lock); + + if (uopt.session == 0xFFFFFFFF) + sbi->s_session = udf_get_last_session(sb); + else + sbi->s_session = uopt.session; + + udf_debug("Multi-session=%d\n", sbi->s_session); + + /* Fill in the rest of the superblock */ + sb->s_op = &udf_sb_ops; + sb->s_export_op = &udf_export_ops; + + sb->s_dirt = 0; + sb->s_magic = UDF_SUPER_MAGIC; + sb->s_time_gran = 1000; + + if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } else { + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (!silent) + printk(KERN_NOTICE + "UDF-fs: Rescanning with blocksize " + "%d\n", UDF_DEFAULT_BLOCKSIZE); + uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } + } + if (!ret) { + printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); + goto error_out; + } + + udf_debug("Lastblock=%d\n", sbi->s_last_block); + + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu = + udf_sb_lvidiu(sbi); + uint16_t minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev); + uint16_t minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev); + /* uint16_t maxUDFWriteRev = + le16_to_cpu(lvidiu->maxUDFWriteRev); */ + + if (minUDFReadRev > UDF_MAX_READ_VERSION) { + printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " + "(max is %x)\n", + le16_to_cpu(lvidiu->minUDFReadRev), + UDF_MAX_READ_VERSION); + goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) + sb->s_flags |= MS_RDONLY; + + sbi->s_udfrev = minUDFWriteRev; + + if (minUDFReadRev >= UDF_VERS_USE_EXTENDED_FE) + UDF_SET_FLAG(sb, UDF_FLAG_USE_EXTENDED_FE); + if (minUDFReadRev >= UDF_VERS_USE_STREAMS) + UDF_SET_FLAG(sb, UDF_FLAG_USE_STREAMS); + } + + if (!sbi->s_partitions) { + printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); + goto error_out; + } + + if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & + UDF_PART_FLAG_READ_ONLY) { + printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " + "forcing readonly mount\n"); + sb->s_flags |= MS_RDONLY; + } + + if (udf_find_fileset(sb, &fileset, &rootdir)) { + printk(KERN_WARNING "UDF-fs: No fileset found\n"); + goto error_out; + } + + if (!silent) { + struct timestamp ts; + udf_time_to_disk_stamp(&ts, sbi->s_record_time); + udf_info("UDF: Mounting volume '%s', " + "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); + } + if (!(sb->s_flags & MS_RDONLY)) + udf_open_lvid(sb); + + /* Assign the root inode */ + /* assign inodes by physical block number */ + /* perhaps it's not extensible enough, but for now ... 
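+	   the root is simply looked up with udf_iget() using the (block,
+	   partition) address that udf_find_fileset() took from the File
+	   Set Descriptor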
*/ + inode = udf_iget(sb, &rootdir); + if (!inode) { + printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " + "partition=%d\n", + rootdir.logicalBlockNum, rootdir.partitionReferenceNum); + goto error_out; + } + + /* Allocate a dentry for the root inode */ + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); + iput(inode); + goto error_out; + } + sb->s_maxbytes = MAX_LFS_FILESIZE; + return 0; + +error_out: + if (sbi->s_vat_inode) + iput(sbi->s_vat_inode); + if (sbi->s_partitions) + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); +#ifdef CONFIG_UDF_NLS + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + unload_nls(sbi->s_nls_map); +#endif + if (!(sb->s_flags & MS_RDONLY)) + udf_close_lvid(sb); + brelse(sbi->s_lvid_bh); + + kfree(sbi->s_partmaps); + kfree(sbi); + sb->s_fs_info = NULL; + + return -EINVAL; +} + +static void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + if (!(sb->s_flags & MS_RDONLY)) { + /* mark sb error */ + sb->s_dirt = 1; + } + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +void udf_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +static void udf_put_super(struct super_block *sb) +{ + int i; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + + if (sbi->s_vat_inode) + iput(sbi->s_vat_inode); + if (sbi->s_partitions) + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); +#ifdef CONFIG_UDF_NLS + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + unload_nls(sbi->s_nls_map); +#endif + if (!(sb->s_flags & MS_RDONLY)) + udf_close_lvid(sb); + brelse(sbi->s_lvid_bh); + kfree(sbi->s_partmaps); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int udf_sync_fs(struct super_block *sb, int wait) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_dirty) { + /* + * Blockdevice will be synced later so we don't have to submit + * the buffer for IO + */ + mark_buffer_dirty(sbi->s_lvid_bh); + sb->s_dirt = 0; + sbi->s_lvid_dirty = 0; + } + mutex_unlock(&sbi->s_alloc_mutex); + + return 0; +} + +static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDescImpUse *lvidiu; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + if (sbi->s_lvid_bh != NULL) + lvidiu = udf_sb_lvidiu(sbi); + else + lvidiu = NULL; + + buf->f_type = UDF_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; + buf->f_bfree = udf_count_free(sb); + buf->f_bavail = buf->f_bfree; + buf->f_files = (lvidiu != NULL ? 
(le32_to_cpu(lvidiu->numFiles) + + le32_to_cpu(lvidiu->numDirs)) : 0) + + buf->f_bfree; + buf->f_ffree = buf->f_bfree; + buf->f_namelen = UDF_NAME_LEN - 2; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static unsigned int udf_count_free_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap) +{ + struct buffer_head *bh = NULL; + unsigned int accum = 0; + int index; + int block = 0, newblock; + struct kernel_lb_addr loc; + uint32_t bytes; + uint8_t *ptr; + uint16_t ident; + struct spaceBitmapDesc *bm; + + loc.logicalBlockNum = bitmap->s_extPosition; + loc.partitionReferenceNum = UDF_SB(sb)->s_partition; + bh = udf_read_ptagged(sb, &loc, 0, &ident); + + if (!bh) { + printk(KERN_ERR "udf: udf_count_free failed\n"); + goto out; + } else if (ident != TAG_IDENT_SBD) { + brelse(bh); + printk(KERN_ERR "udf: udf_count_free failed\n"); + goto out; + } + + bm = (struct spaceBitmapDesc *)bh->b_data; + bytes = le32_to_cpu(bm->numOfBytes); + index = sizeof(struct spaceBitmapDesc); /* offset in first block only */ + ptr = (uint8_t *)bh->b_data; + + while (bytes > 0) { + u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index); + accum += bitmap_weight((const unsigned long *)(ptr + index), + cur_bytes * 8); + bytes -= cur_bytes; + if (bytes) { + brelse(bh); + newblock = udf_get_lb_pblock(sb, &loc, ++block); + bh = udf_tread(sb, newblock); + if (!bh) { + udf_debug("read failed\n"); + goto out; + } + index = 0; + ptr = (uint8_t *)bh->b_data; + } + } + brelse(bh); +out: + return accum; +} + +static unsigned int udf_count_free_table(struct super_block *sb, + struct inode *table) +{ + unsigned int accum = 0; + uint32_t elen; + struct kernel_lb_addr eloc; + int8_t etype; + struct extent_position epos; + + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); + epos.block = UDF_I(table)->i_location; + epos.offset = sizeof(struct unallocSpaceEntry); + epos.bh = NULL; + + while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) + accum += (elen >> table->i_sb->s_blocksize_bits); + + brelse(epos.bh); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); + + return accum; +} + +static unsigned int udf_count_free(struct super_block *sb) +{ + unsigned int accum = 0; + struct udf_sb_info *sbi; + struct udf_part_map *map; + + sbi = UDF_SB(sb); + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDesc *lvid = + (struct logicalVolIntegrityDesc *) + sbi->s_lvid_bh->b_data; + if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) { + accum = le32_to_cpu( + lvid->freeSpaceTable[sbi->s_partition]); + if (accum == 0xFFFFFFFF) + accum = 0; + } + } + + if (accum) + return accum; + + map = &sbi->s_partmaps[sbi->s_partition]; + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { + accum += udf_count_free_bitmap(sb, + map->s_uspace.s_bitmap); + } + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { + accum += udf_count_free_bitmap(sb, + map->s_fspace.s_bitmap); + } + if (accum) + return accum; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { + accum += udf_count_free_table(sb, + map->s_uspace.s_table); + } + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { + accum += udf_count_free_table(sb, + map->s_fspace.s_table); + } + + return accum; +} diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c new file mode 100644 index 00000000..b1d4488b --- /dev/null +++ b/fs/udf/symlink.c @@ -0,0 +1,119 @@ +/* + * symlink.c + * + * PURPOSE + * Symlink handling routines for the OSTA-UDF(tm) filesystem. 
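+ *	Only the read side lives here: udf_symlink_filler() fetches the
+ *	pathComponent sequence (stored in-ICB or in the first data block)
+ *	and udf_pc_to_char() flattens it into an ordinary path string
+ *	(component type 3 becomes "../", type 4 "./", type 5 the decoded
+ *	name, and a zero-length type 1 component resets the result to "/").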
+ * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 04/16/99 blf Created. + * + */ + +#include "udfdecl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "udf_i.h" + +static void udf_pc_to_char(struct super_block *sb, unsigned char *from, + int fromlen, unsigned char *to) +{ + struct pathComponent *pc; + int elen = 0; + unsigned char *p = to; + + while (elen < fromlen) { + pc = (struct pathComponent *)(from + elen); + switch (pc->componentType) { + case 1: + if (pc->lengthComponentIdent == 0) { + p = to; + *p++ = '/'; + } + break; + case 3: + memcpy(p, "../", 3); + p += 3; + break; + case 4: + memcpy(p, "./", 2); + p += 2; + /* that would be . - just ignore */ + break; + case 5: + p += udf_get_filename(sb, pc->componentIdent, p, + pc->lengthComponentIdent); + *p++ = '/'; + break; + } + elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; + } + if (p > to + 1) + p[-1] = '\0'; + else + p[0] = '\0'; +} + +static int udf_symlink_filler(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *bh = NULL; + unsigned char *symlink; + int err = -EIO; + unsigned char *p = kmap(page); + struct udf_inode_info *iinfo; + uint32_t pos; + + iinfo = UDF_I(inode); + pos = udf_block_map(inode, 0); + + down_read(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + } else { + bh = sb_bread(inode->i_sb, pos); + + if (!bh) + goto out; + + symlink = bh->b_data; + } + + udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); + brelse(bh); + + up_read(&iinfo->i_data_sem); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +out: + up_read(&iinfo->i_data_sem); + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +/* + * symlinks can't do much... + */ +const struct address_space_operations udf_symlink_aops = { + .readpage = udf_symlink_filler, +}; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c new file mode 100644 index 00000000..8424308d --- /dev/null +++ b/fs/udf/truncate.c @@ -0,0 +1,289 @@ +/* + * truncate.c + * + * PURPOSE + * Truncate handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2004 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 02/24/99 blf Created. 
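+ *
+ *	udf_truncate_tail_extent() trims the last extent so that it matches
+ *	i_size, udf_discard_prealloc() drops a trailing allocated-but-not-
+ *	recorded preallocation, and udf_truncate_extents() frees everything
+ *	beyond i_size, walking and releasing indirect allocation extents as
+ *	it goes.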
+ * + */ + +#include "udfdecl.h" +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static void extent_trunc(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, + uint32_t nelen) +{ + struct kernel_lb_addr neloc = {}; + int last_block = (elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + + if (nelen) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, eloc, 0, + last_block); + etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); + } else + neloc = *eloc; + nelen = (etype << 30) | nelen; + } + + if (elen != nelen) { + udf_write_aext(inode, epos, &neloc, nelen, 0); + if (last_block - first_block > 0) { + if (etype == (EXT_RECORDED_ALLOCATED >> 30)) + mark_inode_dirty(inode); + + if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + udf_free_blocks(inode->i_sb, inode, eloc, + first_block, + last_block - first_block); + } + } +} + +/* + * Truncate the last extent to match i_size. This function assumes + * that preallocation extent is already truncated. + */ +void udf_truncate_tail_extent(struct inode *inode) +{ + struct extent_position epos = {}; + struct kernel_lb_addr eloc; + uint32_t elen, nelen; + uint64_t lbcount = 0; + int8_t etype = -1, netype; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || + inode->i_size == iinfo->i_lenExtents) + return; + /* Are we going to delete the file anyway? */ + if (inode->i_nlink == 0) + return; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + /* Find the last extent in the file */ + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + etype = netype; + lbcount += elen; + if (lbcount > inode->i_size) { + if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) + printk(KERN_WARNING + "udf_truncate_tail_extent(): Too long " + "extent after EOF in inode %u: i_size: " + "%Ld lbcount: %Ld extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); + nelen = elen - (lbcount - inode->i_size); + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, nelen); + epos.offset += adsize; + if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) + printk(KERN_ERR "udf_truncate_tail_extent(): " + "Extent after EOF in inode %u.\n", + (unsigned)inode->i_ino); + break; + } + } + /* This inode entry is in-memory only and thus we don't have to mark + * the inode dirty */ + iinfo->i_lenExtents = inode->i_size; + brelse(epos.bh); +} + +void udf_discard_prealloc(struct inode *inode) +{ + struct extent_position epos = { NULL, 0, {0, 0} }; + struct kernel_lb_addr eloc; + uint32_t elen; + uint64_t lbcount = 0; + int8_t etype = -1, netype; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || + inode->i_size == iinfo->i_lenExtents) + return; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + adsize = 0; + + epos.block = iinfo->i_location; + + /* Find the last extent in the file */ + while ((netype = 
udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + etype = netype; + lbcount += elen; + } + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + epos.offset -= adsize; + lbcount -= elen; + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + if (!epos.bh) { + iinfo->i_lenAlloc = + epos.offset - + udf_file_entry_alloc_offset(inode); + mark_inode_dirty(inode); + } else { + struct allocExtDesc *aed = + (struct allocExtDesc *)(epos.bh->b_data); + aed->lengthAllocDescs = + cpu_to_le32(epos.offset - + sizeof(struct allocExtDesc)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos.bh->b_data, epos.offset); + else + udf_update_tag(epos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos.bh, inode); + } + } + /* This inode entry is in-memory only and thus we don't have to mark + * the inode dirty */ + iinfo->i_lenExtents = lbcount; + brelse(epos.bh); +} + +static void udf_update_alloc_ext_desc(struct inode *inode, + struct extent_position *epos, + u32 lenalloc) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); + int len = sizeof(struct allocExtDesc); + + aed->lengthAllocDescs = cpu_to_le32(lenalloc); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) + len += lenalloc; + + udf_update_tag(epos->bh->b_data, len); + mark_buffer_dirty_inode(epos->bh, inode); +} + +/* + * Truncate extents of inode to inode->i_size. This function can be used only + * for making file shorter. For making file longer, udf_extend_file() has to + * be used. + */ +void udf_truncate_extents(struct inode *inode) +{ + struct extent_position epos; + struct kernel_lb_addr eloc, neloc = {}; + uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; + loff_t byte_offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + byte_offset = (offset << sb->s_blocksize_bits) + + (inode->i_size & (sb->s_blocksize - 1)); + if (etype == -1) { + /* We should extend the file? 
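+		   i_size already lies beyond the last mapped extent, so the
+		   caller really wanted to grow the file; that is
+		   udf_extend_file()'s job (see the comment above), hence only
+		   warn about a stray partial-block tail and return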
*/ + WARN_ON(byte_offset); + return; + } + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); + epos.offset += adsize; + if (byte_offset) + lenalloc = epos.offset; + else + lenalloc = epos.offset - adsize; + + if (!epos.bh) + lenalloc -= udf_file_entry_alloc_offset(inode); + else + lenalloc -= sizeof(struct allocExtDesc); + + while ((etype = udf_current_aext(inode, &epos, &eloc, + &elen, 0)) != -1) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + udf_write_aext(inode, &epos, &neloc, nelen, 0); + if (indirect_ext_len) { + /* We managed to free all extents in the + * indirect extent - free it too */ + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, + 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, + &epos, lenalloc); + brelse(epos.bh); + epos.offset = sizeof(struct allocExtDesc); + epos.block = eloc; + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &eloc, 0)); + if (elen) + indirect_ext_len = + (elen + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + else + indirect_ext_len = 1; + } else { + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + epos.offset += adsize; + } + } + + if (indirect_ext_len) { + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, &epos, lenalloc); + iinfo->i_lenExtents = inode->i_size; + + brelse(epos.bh); +} diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h new file mode 100644 index 00000000..d1bd31ea --- /dev/null +++ b/fs/udf/udf_i.h @@ -0,0 +1,45 @@ +#ifndef _UDF_I_H +#define _UDF_I_H + +/* + * The i_data_sem and i_mutex serve for protection of allocation information + * of a regular files and symlinks. This includes all extents belonging to + * the file/symlink, a fact whether data are in-inode or in external data + * blocks, preallocation, goal block information... When extents are read, + * i_mutex or i_data_sem must be held (for reading is enough in case of + * i_data_sem). When extents are changed, i_data_sem must be held for writing + * and also i_mutex must be held. + * + * For directories i_mutex is used for all the necessary protection. + */ + +struct udf_inode_info { + struct timespec i_crtime; + /* Physical address of inode */ + struct kernel_lb_addr i_location; + __u64 i_unique; + __u32 i_lenEAttr; + __u32 i_lenAlloc; + __u64 i_lenExtents; + __u32 i_next_alloc_block; + __u32 i_next_alloc_goal; + unsigned i_alloc_type : 3; + unsigned i_efe : 1; /* extendedFileEntry */ + unsigned i_use : 1; /* unallocSpaceEntry */ + unsigned i_strat4096 : 1; + unsigned reserved : 26; + union { + struct short_ad *i_sad; + struct long_ad *i_lad; + __u8 *i_data; + } i_ext; + struct rw_semaphore i_data_sem; + struct inode vfs_inode; +}; + +static inline struct udf_inode_info *UDF_I(struct inode *inode) +{ + return list_entry(inode, struct udf_inode_info, vfs_inode); +} + +#endif /* _UDF_I_H) */ diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h new file mode 100644 index 00000000..4858c191 --- /dev/null +++ b/fs/udf/udf_sb.h @@ -0,0 +1,182 @@ +#ifndef __LINUX_UDF_SB_H +#define __LINUX_UDF_SB_H + +#include +#include + +/* Since UDF 2.01 is ISO 13346 based... 
*/ +#define UDF_SUPER_MAGIC 0x15013346 + +#define UDF_MAX_READ_VERSION 0x0250 +#define UDF_MAX_WRITE_VERSION 0x0201 + +#define UDF_FLAG_USE_EXTENDED_FE 0 +#define UDF_VERS_USE_EXTENDED_FE 0x0200 +#define UDF_FLAG_USE_STREAMS 1 +#define UDF_VERS_USE_STREAMS 0x0200 +#define UDF_FLAG_USE_SHORT_AD 2 +#define UDF_FLAG_USE_AD_IN_ICB 3 +#define UDF_FLAG_USE_FILE_CTIME_EA 4 +#define UDF_FLAG_STRICT 5 +#define UDF_FLAG_UNDELETE 6 +#define UDF_FLAG_UNHIDE 7 +#define UDF_FLAG_VARCONV 8 +#define UDF_FLAG_NLS_MAP 9 +#define UDF_FLAG_UTF8 10 +#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ +#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ +#define UDF_FLAG_GID_FORGET 13 +#define UDF_FLAG_GID_IGNORE 14 +#define UDF_FLAG_UID_SET 15 +#define UDF_FLAG_GID_SET 16 +#define UDF_FLAG_SESSION_SET 17 +#define UDF_FLAG_LASTBLOCK_SET 18 +#define UDF_FLAG_BLOCKSIZE_SET 19 + +#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 +#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 +#define UDF_PART_FLAG_FREED_BITMAP 0x0004 +#define UDF_PART_FLAG_FREED_TABLE 0x0008 +#define UDF_PART_FLAG_READ_ONLY 0x0010 +#define UDF_PART_FLAG_WRITE_ONCE 0x0020 +#define UDF_PART_FLAG_REWRITABLE 0x0040 +#define UDF_PART_FLAG_OVERWRITABLE 0x0080 + +#define UDF_MAX_BLOCK_LOADED 8 + +#define UDF_TYPE1_MAP15 0x1511U +#define UDF_VIRTUAL_MAP15 0x1512U +#define UDF_VIRTUAL_MAP20 0x2012U +#define UDF_SPARABLE_MAP15 0x1522U +#define UDF_METADATA_MAP25 0x2511U + +#define UDF_INVALID_MODE ((mode_t)-1) + +#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ + +struct udf_meta_data { + __u32 s_meta_file_loc; + __u32 s_mirror_file_loc; + __u32 s_bitmap_file_loc; + __u32 s_alloc_unit_size; + __u16 s_align_unit_size; + __u8 s_dup_md_flag; + struct inode *s_metadata_fe; + struct inode *s_mirror_fe; + struct inode *s_bitmap_fe; +}; + +struct udf_sparing_data { + __u16 s_packet_len; + struct buffer_head *s_spar_map[4]; +}; + +struct udf_virtual_data { + __u32 s_num_entries; + __u16 s_start_offset; +}; + +struct udf_bitmap { + __u32 s_extLength; + __u32 s_extPosition; + __u16 s_nr_groups; + struct buffer_head **s_block_bitmap; +}; + +struct udf_part_map { + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_uspace; + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_fspace; + __u32 s_partition_root; + __u32 s_partition_len; + __u16 s_partition_type; + __u16 s_partition_num; + union { + struct udf_sparing_data s_sparing; + struct udf_virtual_data s_virtual; + struct udf_meta_data s_metadata; + } s_type_specific; + __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32); + __u16 s_volumeseqnum; + __u16 s_partition_flags; +}; + +#pragma pack() + +struct udf_sb_info { + struct udf_part_map *s_partmaps; + __u8 s_volume_ident[32]; + + /* Overall info */ + __u16 s_partitions; + __u16 s_partition; + + /* Sector headers */ + __s32 s_session; + __u32 s_anchor; + __u32 s_last_block; + + struct buffer_head *s_lvid_bh; + + /* Default permissions */ + mode_t s_umask; + gid_t s_gid; + uid_t s_uid; + mode_t s_fmode; + mode_t s_dmode; + /* Lock protecting consistency of above permission settings */ + rwlock_t s_cred_lock; + + /* Root Info */ + struct timespec s_record_time; + + /* Fileset Info */ + __u16 s_serial_number; + + /* highest UDF revision we have recorded to this media */ + __u16 s_udfrev; + + /* Miscellaneous flags */ + unsigned long s_flags; + + /* Encoding info */ + struct nls_table *s_nls_map; + + /* VAT inode */ + struct inode *s_vat_inode; + + struct mutex 
s_alloc_mutex; + /* Protected by s_alloc_mutex */ + unsigned int s_lvid_dirty; +}; + +static inline struct udf_sb_info *UDF_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi); + +int udf_compute_nr_groups(struct super_block *sb, u32 partition); + +static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag) +{ + return test_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_SET_FLAG(struct super_block *sb, int flag) +{ + set_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag) +{ + clear_bit(flag, &UDF_SB(sb)->s_flags); +} + +#endif /* __LINUX_UDF_SB_H */ diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h new file mode 100644 index 00000000..dbd52d4b --- /dev/null +++ b/fs/udf/udfdecl.h @@ -0,0 +1,241 @@ +#ifndef __UDF_DECL_H +#define __UDF_DECL_H + +#include "ecma_167.h" +#include "osta_udf.h" + +#include +#include +#include +#include + +#include "udf_sb.h" +#include "udfend.h" +#include "udf_i.h" + +#define UDF_PREALLOCATE +#define UDF_DEFAULT_PREALLOC_BLOCKS 8 + +#undef UDFFS_DEBUG + +#ifdef UDFFS_DEBUG +#define udf_debug(f, a...) \ +do { \ + printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ + __FILE__, __LINE__, __func__); \ + printk(f, ##a); \ +} while (0) +#else +#define udf_debug(f, a...) /**/ +#endif + +#define udf_info(f, a...) \ + printk(KERN_INFO "UDF-fs INFO " f, ##a); + + +#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) +#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) + +#define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF +#define UDF_EXTENT_FLAG_MASK 0xC0000000 + +#define UDF_NAME_PAD 4 +#define UDF_NAME_LEN 256 +#define UDF_PATH_LEN 1023 + +static inline size_t udf_file_entry_alloc_offset(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + if (iinfo->i_use) + return sizeof(struct unallocSpaceEntry); + else if (iinfo->i_efe) + return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; + else + return sizeof(struct fileEntry) + iinfo->i_lenEAttr; +} + +static inline size_t udf_ext0_offset(struct inode *inode) +{ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return udf_file_entry_alloc_offset(inode); + else + return 0; +} + +/* computes tag checksum */ +u8 udf_tag_checksum(const struct tag *t); + +struct dentry; +struct inode; +struct task_struct; +struct buffer_head; +struct super_block; + +extern const struct export_operations udf_export_ops; +extern const struct inode_operations udf_dir_inode_operations; +extern const struct file_operations udf_dir_operations; +extern const struct inode_operations udf_file_inode_operations; +extern const struct file_operations udf_file_operations; +extern const struct inode_operations udf_symlink_inode_operations; +extern const struct address_space_operations udf_aops; +extern const struct address_space_operations udf_adinicb_aops; +extern const struct address_space_operations udf_symlink_aops; + +struct udf_fileident_bh { + struct buffer_head *sbh; + struct buffer_head *ebh; + int soffset; + int eoffset; +}; + +struct udf_vds_record { + uint32_t block; + uint32_t volDescSeqNum; +}; + +struct generic_desc { + struct tag descTag; + __le32 volDescSeqNum; +}; + +struct ustr { + uint8_t u_cmpID; + uint8_t u_name[UDF_NAME_LEN - 2]; + uint8_t u_len; +}; + +struct extent_position { + struct buffer_head *bh; + uint32_t offset; + struct kernel_lb_addr block; +}; + +/* super.c */ + +__attribute__((format(printf, 3, 
4))) +extern void udf_warning(struct super_block *, const char *, const char *, ...); +static inline void udf_updated_lvid(struct super_block *sb) +{ + struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; + + BUG_ON(!bh); + WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) + bh->b_data)->integrityType != + cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); + sb->s_dirt = 1; + UDF_SB(sb)->s_lvid_dirty = 1; +} +extern u64 lvid_get_unique_id(struct super_block *sb); + +/* namei.c */ +extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, + struct fileIdentDesc *, struct udf_fileident_bh *, + uint8_t *, uint8_t *); + +/* file.c */ +extern long udf_ioctl(struct file *, unsigned int, unsigned long); +/* inode.c */ +extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); +extern int udf_expand_file_adinicb(struct inode *); +extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); +extern struct buffer_head *udf_bread(struct inode *, int, int, int *); +extern int udf_setsize(struct inode *, loff_t); +extern void udf_read_inode(struct inode *); +extern void udf_evict_inode(struct inode *); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); +extern long udf_block_map(struct inode *, sector_t); +extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_add_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern void udf_write_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern int8_t udf_delete_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); +extern int8_t udf_next_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, int); +extern int8_t udf_current_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, int); + +/* misc.c */ +extern struct buffer_head *udf_tgetblk(struct super_block *, int); +extern struct buffer_head *udf_tread(struct super_block *, int); +extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, + uint32_t, uint8_t); +extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, + uint8_t); +extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, + uint32_t, uint16_t *); +extern struct buffer_head *udf_read_ptagged(struct super_block *, + struct kernel_lb_addr *, uint32_t, + uint16_t *); +extern void udf_update_tag(char *, int); +extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); + +/* lowlevel.c */ +extern unsigned int udf_get_last_session(struct super_block *); +extern unsigned long udf_get_last_block(struct super_block *); + +/* partition.c */ +extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_virt15(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern int udf_relocate_blocks(struct super_block *, long, long *); + +static inline uint32_t +udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, + uint32_t offset) +{ + return udf_get_pblock(sb, loc->logicalBlockNum, + loc->partitionReferenceNum, offset); 
+} + +/* unicode.c */ +extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); +extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, + int); +extern int udf_build_ustr(struct ustr *, dstring *, int); +extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); + +/* ialloc.c */ +extern void udf_free_inode(struct inode *); +extern struct inode *udf_new_inode(struct inode *, int, int *); + +/* truncate.c */ +extern void udf_truncate_tail_extent(struct inode *); +extern void udf_discard_prealloc(struct inode *); +extern void udf_truncate_extents(struct inode *); + +/* balloc.c */ +extern void udf_free_blocks(struct super_block *, struct inode *, + struct kernel_lb_addr *, uint32_t, uint32_t); +extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, + uint32_t, uint32_t); +extern int udf_new_block(struct super_block *, struct inode *, uint16_t, + uint32_t, int *); + +/* directory.c */ +extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, + struct udf_fileident_bh *, + struct fileIdentDesc *, + struct extent_position *, + struct kernel_lb_addr *, uint32_t *, + sector_t *); +extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, + int *offset); +extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); +extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); + +/* udftime.c */ +extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, + struct timestamp src); +extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); + +#endif /* __UDF_DECL_H */ diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h new file mode 100644 index 00000000..6a9f3a9c --- /dev/null +++ b/fs/udf/udfend.h @@ -0,0 +1,77 @@ +#ifndef __UDF_ENDIAN_H +#define __UDF_ENDIAN_H + +#include +#include + +static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in) +{ + struct kernel_lb_addr out; + + out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); + out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); + + return out; +} + +static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in) +{ + struct lb_addr out; + + out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); + out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); + + return out; +} + +static inline struct short_ad lesa_to_cpu(struct short_ad in) +{ + struct short_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extPosition = le32_to_cpu(in.extPosition); + + return out; +} + +static inline struct short_ad cpu_to_lesa(struct short_ad in) +{ + struct short_ad out; + + out.extLength = cpu_to_le32(in.extLength); + out.extPosition = cpu_to_le32(in.extPosition); + + return out; +} + +static inline struct kernel_long_ad lela_to_cpu(struct long_ad in) +{ + struct kernel_long_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extLocation = lelb_to_cpu(in.extLocation); + + return out; +} + +static inline struct long_ad cpu_to_lela(struct kernel_long_ad in) +{ + struct long_ad out; + + out.extLength = cpu_to_le32(in.extLength); + out.extLocation = cpu_to_lelb(in.extLocation); + + return out; +} + +static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in) +{ + struct kernel_extent_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extLocation = le32_to_cpu(in.extLocation); + + return out; +} + +#endif /* __UDF_ENDIAN_H */ diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c new file mode 100644 index 
00000000..b8c828c4 --- /dev/null +++ b/fs/udf/udftime.c @@ -0,0 +1,171 @@ +/* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Paul Eggert (eggert@twinsun.com). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* + * dgb 10/02/98: ripped this from glibc source to help convert timestamps + * to unix time + * 10/04/98: added new table-based lookup after seeing how ugly + * the gnu code is + * blf 09/27/99: ripped out all the old code and inserted new table from + * John Brockmeyer (without leap second corrections) + * rewrote udf_stamp_to_time and fixed timezone accounting in + * udf_time_to_stamp. + */ + +/* + * We don't take into account leap seconds. This may be correct or incorrect. + * For more NIST information (especially dealing with leap seconds), see: + * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm + */ + +#include +#include +#include "udfdecl.h" + +#define EPOCH_YEAR 1970 + +#ifndef __isleap +/* Nonzero if YEAR is a leap year (every 4 years, + except every 100th isn't, and every 400th is). */ +#define __isleap(year) \ + ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0)) +#endif + +/* How many days come before each month (0-12). */ +static const unsigned short int __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. 
*/ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define MAX_YEAR_SECONDS 69 +#define SPD 0x15180 /*3600*24 */ +#define SPY(y, l, s) (SPD * (365 * y + l) + s) + +static time_t year_seconds[MAX_YEAR_SECONDS] = { +/*1970*/ SPY(0, 0, 0), SPY(1, 0, 0), SPY(2, 0, 0), SPY(3, 1, 0), +/*1974*/ SPY(4, 1, 0), SPY(5, 1, 0), SPY(6, 1, 0), SPY(7, 2, 0), +/*1978*/ SPY(8, 2, 0), SPY(9, 2, 0), SPY(10, 2, 0), SPY(11, 3, 0), +/*1982*/ SPY(12, 3, 0), SPY(13, 3, 0), SPY(14, 3, 0), SPY(15, 4, 0), +/*1986*/ SPY(16, 4, 0), SPY(17, 4, 0), SPY(18, 4, 0), SPY(19, 5, 0), +/*1990*/ SPY(20, 5, 0), SPY(21, 5, 0), SPY(22, 5, 0), SPY(23, 6, 0), +/*1994*/ SPY(24, 6, 0), SPY(25, 6, 0), SPY(26, 6, 0), SPY(27, 7, 0), +/*1998*/ SPY(28, 7, 0), SPY(29, 7, 0), SPY(30, 7, 0), SPY(31, 8, 0), +/*2002*/ SPY(32, 8, 0), SPY(33, 8, 0), SPY(34, 8, 0), SPY(35, 9, 0), +/*2006*/ SPY(36, 9, 0), SPY(37, 9, 0), SPY(38, 9, 0), SPY(39, 10, 0), +/*2010*/ SPY(40, 10, 0), SPY(41, 10, 0), SPY(42, 10, 0), SPY(43, 11, 0), +/*2014*/ SPY(44, 11, 0), SPY(45, 11, 0), SPY(46, 11, 0), SPY(47, 12, 0), +/*2018*/ SPY(48, 12, 0), SPY(49, 12, 0), SPY(50, 12, 0), SPY(51, 13, 0), +/*2022*/ SPY(52, 13, 0), SPY(53, 13, 0), SPY(54, 13, 0), SPY(55, 14, 0), +/*2026*/ SPY(56, 14, 0), SPY(57, 14, 0), SPY(58, 14, 0), SPY(59, 15, 0), +/*2030*/ SPY(60, 15, 0), SPY(61, 15, 0), SPY(62, 15, 0), SPY(63, 16, 0), +/*2034*/ SPY(64, 16, 0), SPY(65, 16, 0), SPY(66, 16, 0), SPY(67, 17, 0), +/*2038*/ SPY(68, 17, 0) +}; + +extern struct timezone sys_tz; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +struct timespec * +udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) +{ + int yday; + u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); + u16 year = le16_to_cpu(src.year); + uint8_t type = typeAndTimezone >> 12; + int16_t offset; + + if (type == 1) { + offset = typeAndTimezone << 4; + /* sign extent offset */ + offset = (offset >> 4); + if (offset == -2047) /* unspecified offset */ + offset = 0; + } else + offset = 0; + + if ((year < EPOCH_YEAR) || + (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) { + return NULL; + } + dest->tv_sec = year_seconds[year - EPOCH_YEAR]; + dest->tv_sec -= offset * 60; + + yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1); + dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second; + dest->tv_nsec = 1000 * (src.centiseconds * 10000 + + src.hundredsOfMicroseconds * 100 + src.microseconds); + return dest; +} + +struct timestamp * +udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) +{ + long int days, rem, y; + const unsigned short int *ip; + int16_t offset; + + offset = -sys_tz.tz_minuteswest; + + if (!dest) + return NULL; + + dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF)); + + ts.tv_sec += offset * 60; + days = ts.tv_sec / SECS_PER_DAY; + rem = ts.tv_sec % SECS_PER_DAY; + dest->hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + dest->minute = rem / 60; + dest->second = rem % 60; + y = 1970; + +#define DIV(a, b) ((a) / (b) - ((a) % (b) < 0)) +#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400)) + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + long int yg = y + days / 365 - (days % 365 < 0); + + /* Adjust DAYS and Y to match the guessed year. 
*/ + days -= ((yg - y) * 365 + + LEAPS_THRU_END_OF(yg - 1) + - LEAPS_THRU_END_OF(y - 1)); + y = yg; + } + dest->year = cpu_to_le16(y); + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < (long int)ip[y]; --y) + continue; + days -= ip[y]; + dest->month = y + 1; + dest->day = days + 1; + + dest->centiseconds = ts.tv_nsec / 10000000; + dest->hundredsOfMicroseconds = (ts.tv_nsec / 1000 - + dest->centiseconds * 10000) / 100; + dest->microseconds = (ts.tv_nsec / 1000 - dest->centiseconds * 10000 - + dest->hundredsOfMicroseconds * 100); + return dest; +} + +/* EOF */ diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c new file mode 100644 index 00000000..d03a90b6 --- /dev/null +++ b/fs/udf/unicode.c @@ -0,0 +1,493 @@ +/* + * unicode.c + * + * PURPOSE + * Routines for converting between UTF-8 and OSTA Compressed Unicode. + * Also handles filename mangling + * + * DESCRIPTION + * OSTA Compressed Unicode is explained in the OSTA UDF specification. + * http://www.osta.org/ + * UTF-8 is explained in the IETF RFC XXXX. + * ftp://ftp.internic.net/rfc/rfcxxxx.txt + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + */ + +#include "udfdecl.h" + +#include +#include /* for memset */ +#include +#include +#include + +#include "udf_sb.h" + +static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); + +static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) +{ + if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2)) + return 0; + + memset(dest, 0, sizeof(struct ustr)); + memcpy(dest->u_name, src, strlen); + dest->u_cmpID = 0x08; + dest->u_len = strlen; + + return strlen; +} + +/* + * udf_build_ustr + */ +int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) +{ + int usesize; + + if (!dest || !ptr || !size) + return -1; + BUG_ON(size < 2); + + usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name)); + usesize = min(usesize, size - 2); + dest->u_cmpID = ptr[0]; + dest->u_len = usesize; + memcpy(dest->u_name, ptr + 1, usesize); + memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize); + + return 0; +} + +/* + * udf_build_ustr_exact + */ +static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) +{ + if ((!dest) || (!ptr) || (!exactsize)) + return -1; + + memset(dest, 0, sizeof(struct ustr)); + dest->u_cmpID = ptr[0]; + dest->u_len = exactsize - 1; + memcpy(dest->u_name, ptr + 1, exactsize - 1); + + return 0; +} + +/* + * udf_ocu_to_utf8 + * + * PURPOSE + * Convert OSTA Compressed Unicode to the UTF-8 equivalent. + * + * PRE-CONDITIONS + * utf Pointer to UTF-8 output buffer. + * ocu Pointer to OSTA Compressed Unicode input buffer + * of size UDF_NAME_LEN bytes. + * both of type "struct ustr *" + * + * POST-CONDITIONS + * Zero on success. + * + * HISTORY + * November 12, 1997 - Andrew E. Mileski + * Written, tested, and released. 
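Illustrative aside, not part of the patch: two details of the timestamp conversion above are easy to miss. The timezone lives in the low 12 bits of typeAndTimezone as a signed minutes value, recovered by shifting left four and arithmetic-shifting right four, and tv_nsec is split across three one-byte fields (centiseconds, hundreds of microseconds, microseconds). A stand-alone sketch with invented demo_* names:

#include <stdint.h>
#include <stdio.h>

/* Recover the signed 12-bit timezone offset (minutes) from typeAndTimezone,
 * using the same shift-left-then-arithmetic-shift-right trick as the patch. */
static int16_t demo_tz_offset(uint16_t type_and_tz)
{
	int16_t offset = (int16_t)(type_and_tz << 4);

	offset >>= 4;          /* sign-extend the low 12 bits */
	if (offset == -2047)   /* -2047 means "offset not specified" */
		offset = 0;
	return offset;
}

/* Split nanoseconds into the three UDF sub-second fields. */
static void demo_split_nsec(long nsec, uint8_t *csec, uint8_t *husec, uint8_t *usec)
{
	*csec = nsec / 10000000;                             /* centiseconds */
	*husec = (nsec / 1000 - *csec * 10000) / 100;        /* hundreds of microseconds */
	*usec = nsec / 1000 - *csec * 10000 - *husec * 100;  /* microseconds */
}

int main(void)
{
	uint16_t tz = 0x1000 | ((-120) & 0x0FFF);  /* type 1, UTC minus 2 hours */
	uint8_t cs, hus, us;

	demo_split_nsec(123456789L, &cs, &hus, &us);
	/* prints: offset=-120 min, cs=12 hus=34 us=56 */
	printf("offset=%d min, cs=%d hus=%d us=%d\n", demo_tz_offset(tz), cs, hus, us);
	return 0;
}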
+ */ +int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) +{ + const uint8_t *ocu; + uint8_t cmp_id, ocu_len; + int i; + + ocu_len = ocu_i->u_len; + if (ocu_len == 0) { + memset(utf_o, 0, sizeof(struct ustr)); + return 0; + } + + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); + printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + cmp_id, ocu_i->u_name); + return 0; + } + + ocu = ocu_i->u_name; + utf_o->u_len = 0; + for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { + + /* Expand OSTA compressed Unicode to Unicode */ + uint32_t c = ocu[i++]; + if (cmp_id == 16) + c = (c << 8) | ocu[i++]; + + /* Compress Unicode to UTF-8 */ + if (c < 0x80U) + utf_o->u_name[utf_o->u_len++] = (uint8_t)c; + else if (c < 0x800U) { + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0xc0 | (c >> 6)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | (c & 0x3f)); + } else { + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0xe0 | (c >> 12)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | + ((c >> 6) & 0x3f)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | (c & 0x3f)); + } + } + utf_o->u_cmpID = 8; + + return utf_o->u_len; +} + +/* + * + * udf_utf8_to_ocu + * + * PURPOSE + * Convert UTF-8 to the OSTA Compressed Unicode equivalent. + * + * DESCRIPTION + * This routine is only called by udf_lookup(). + * + * PRE-CONDITIONS + * ocu Pointer to OSTA Compressed Unicode output + * buffer of size UDF_NAME_LEN bytes. + * utf Pointer to UTF-8 input buffer. + * utf_len Length of UTF-8 input buffer in bytes. + * + * POST-CONDITIONS + * Zero on success. + * + * HISTORY + * November 12, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) +{ + unsigned c, i, max_val, utf_char; + int utf_cnt, u_len; + + memset(ocu, 0, sizeof(dstring) * length); + ocu[0] = 8; + max_val = 0xffU; + +try_again: + u_len = 0U; + utf_char = 0U; + utf_cnt = 0U; + for (i = 0U; i < utf->u_len; i++) { + c = (uint8_t)utf->u_name[i]; + + /* Complete a multi-byte UTF-8 character */ + if (utf_cnt) { + utf_char = (utf_char << 6) | (c & 0x3fU); + if (--utf_cnt) + continue; + } else { + /* Check for a multi-byte UTF-8 character */ + if (c & 0x80U) { + /* Start a multi-byte UTF-8 character */ + if ((c & 0xe0U) == 0xc0U) { + utf_char = c & 0x1fU; + utf_cnt = 1; + } else if ((c & 0xf0U) == 0xe0U) { + utf_char = c & 0x0fU; + utf_cnt = 2; + } else if ((c & 0xf8U) == 0xf0U) { + utf_char = c & 0x07U; + utf_cnt = 3; + } else if ((c & 0xfcU) == 0xf8U) { + utf_char = c & 0x03U; + utf_cnt = 4; + } else if ((c & 0xfeU) == 0xfcU) { + utf_char = c & 0x01U; + utf_cnt = 5; + } else { + goto error_out; + } + continue; + } else { + /* Single byte UTF-8 character (most common) */ + utf_char = c; + } + } + + /* Choose no compression if necessary */ + if (utf_char > max_val) { + if (max_val == 0xffU) { + max_val = 0xffffU; + ocu[0] = (uint8_t)0x10U; + goto try_again; + } + goto error_out; + } + + if (max_val == 0xffffU) + ocu[++u_len] = (uint8_t)(utf_char >> 8); + ocu[++u_len] = (uint8_t)(utf_char & 0xffU); + } + + if (utf_cnt) { +error_out: + ocu[++u_len] = '?'; + printk(KERN_DEBUG "udf: bad UTF-8 character\n"); + } + + ocu[length - 1] = (uint8_t)u_len + 1; + + return u_len + 1; +} + +static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, + const struct ustr *ocu_i) +{ + const uint8_t *ocu; + uint8_t cmp_id, ocu_len; + int i, len; + + + ocu_len = ocu_i->u_len; + if (ocu_len == 0) { + 
memset(utf_o, 0, sizeof(struct ustr)); + return 0; + } + + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); + printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + cmp_id, ocu_i->u_name); + return 0; + } + + ocu = ocu_i->u_name; + utf_o->u_len = 0; + for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { + /* Expand OSTA compressed Unicode to Unicode */ + uint32_t c = ocu[i++]; + if (cmp_id == 16) + c = (c << 8) | ocu[i++]; + + len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], + UDF_NAME_LEN - utf_o->u_len); + /* Valid character? */ + if (len >= 0) + utf_o->u_len += len; + else + utf_o->u_name[utf_o->u_len++] = '?'; + } + utf_o->u_cmpID = 8; + + return utf_o->u_len; +} + +static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, + int length) +{ + int len; + unsigned i, max_val; + uint16_t uni_char; + int u_len; + + memset(ocu, 0, sizeof(dstring) * length); + ocu[0] = 8; + max_val = 0xffU; + +try_again: + u_len = 0U; + for (i = 0U; i < uni->u_len; i++) { + len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); + if (!len) + continue; + /* Invalid character, deal with it */ + if (len < 0) { + len = 1; + uni_char = '?'; + } + + if (uni_char > max_val) { + max_val = 0xffffU; + ocu[0] = (uint8_t)0x10U; + goto try_again; + } + + if (max_val == 0xffffU) + ocu[++u_len] = (uint8_t)(uni_char >> 8); + ocu[++u_len] = (uint8_t)(uni_char & 0xffU); + i += len - 1; + } + + ocu[length - 1] = (uint8_t)u_len + 1; + return u_len + 1; +} + +int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, + int flen) +{ + struct ustr *filename, *unifilename; + int len = 0; + + filename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!filename) + return 0; + + unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!unifilename) + goto out1; + + if (udf_build_ustr_exact(unifilename, sname, flen)) + goto out2; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { + if (!udf_CS0toUTF8(filename, unifilename)) { + udf_debug("Failed in udf_get_filename: sname = %s\n", + sname); + goto out2; + } + } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { + if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename)) { + udf_debug("Failed in udf_get_filename: sname = %s\n", + sname); + goto out2; + } + } else + goto out2; + + len = udf_translate_to_linux(dname, filename->u_name, filename->u_len, + unifilename->u_name, unifilename->u_len); +out2: + kfree(unifilename); +out1: + kfree(filename); + return len; +} + +int udf_put_filename(struct super_block *sb, const uint8_t *sname, + uint8_t *dname, int flen) +{ + struct ustr unifilename; + int namelen; + + if (!udf_char_to_ustr(&unifilename, sname, flen)) + return 0; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { + namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN); + if (!namelen) + return 0; + } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { + namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, + &unifilename, UDF_NAME_LEN); + if (!namelen) + return 0; + } else + return 0; + + return namelen; +} + +#define ILLEGAL_CHAR_MARK '_' +#define EXT_MARK '.' +#define CRC_MARK '#' +#define EXT_SIZE 5 + +static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, + int udfLen, uint8_t *fidName, + int fidNameLen) +{ + int index, newIndex = 0, needsCRC = 0; + int extIndex = 0, newExtIndex = 0, hasExt = 0; + unsigned short valueCRC; + uint8_t curr; + const uint8_t hexChar[] = "0123456789ABCDEF"; + + if (udfName[0] == '.' 
&& + (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { + needsCRC = 1; + newIndex = udfLen; + memcpy(newName, udfName, udfLen); + } else { + for (index = 0; index < udfLen; index++) { + curr = udfName[index]; + if (curr == '/' || curr == 0) { + needsCRC = 1; + curr = ILLEGAL_CHAR_MARK; + while (index + 1 < udfLen && + (udfName[index + 1] == '/' || + udfName[index + 1] == 0)) + index++; + } + if (curr == EXT_MARK && + (udfLen - index - 1) <= EXT_SIZE) { + if (udfLen == index + 1) + hasExt = 0; + else { + hasExt = 1; + extIndex = index; + newExtIndex = newIndex; + } + } + if (newIndex < 256) + newName[newIndex++] = curr; + else + needsCRC = 1; + } + } + if (needsCRC) { + uint8_t ext[EXT_SIZE]; + int localExtIndex = 0; + + if (hasExt) { + int maxFilenameLen; + for (index = 0; + index < EXT_SIZE && extIndex + index + 1 < udfLen; + index++) { + curr = udfName[extIndex + index + 1]; + + if (curr == '/' || curr == 0) { + needsCRC = 1; + curr = ILLEGAL_CHAR_MARK; + while (extIndex + index + 2 < udfLen && + (index + 1 < EXT_SIZE && + (udfName[extIndex + index + 2] == '/' || + udfName[extIndex + index + 2] == 0))) + index++; + } + ext[localExtIndex++] = curr; + } + maxFilenameLen = 250 - localExtIndex; + if (newIndex > maxFilenameLen) + newIndex = maxFilenameLen; + else + newIndex = newExtIndex; + } else if (newIndex > 250) + newIndex = 250; + newName[newIndex++] = CRC_MARK; + valueCRC = crc_itu_t(0, fidName, fidNameLen); + newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; + newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; + newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; + newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; + + if (hasExt) { + newName[newIndex++] = EXT_MARK; + for (index = 0; index < localExtIndex; index++) + newName[newIndex++] = ext[index]; + } + } + + return newIndex; +} diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig new file mode 100644 index 00000000..e4f10a40 --- /dev/null +++ b/fs/ufs/Kconfig @@ -0,0 +1,43 @@ +config UFS_FS + tristate "UFS file system support (read only)" + depends on BLOCK + help + BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, + OpenBSD and NeXTstep) use a file system called UFS. Some System V + Unixes can create and mount hard disk partitions and diskettes using + this file system as well. Saying Y here will allow you to read from + these partitions; if you also want to write to them, say Y to the + experimental "UFS file system write support", below. Please read the + file for more information. + + The recently released UFS2 variant (used in FreeBSD 5.x) is + READ-ONLY supported. + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). + + When accessing NeXTstep files, you may need to convert them from the + NeXT character set to the Latin1 character set; use the program + recode ("info recode") for this purpose. + + To compile the UFS file system support as a module, choose M here: the + module will be called ufs. + + If you haven't heard about all of this before, it's safe to say N. + +config UFS_FS_WRITE + bool "UFS file system write support (DANGEROUS)" + depends on UFS_FS && EXPERIMENTAL + help + Say Y here if you want to try writing to UFS partitions. This is + experimental, so you should back up your UFS partitions beforehand. 
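Illustrative aside, not part of the patch: when udf_translate_to_linux() has to mangle a name (illegal characters or an over-long name), it replaces the offending bytes with '_', truncates, and appends '#' plus four hex digits of a CRC computed over the raw on-disk name, re-attaching a short extension when there is one. The sketch below uses a bitwise CRC with the ITU-T polynomial (0x1021, zero seed), which is my understanding of what crc_itu_t(0, ...) computes; the demo_* helpers and the simplified length handling are only illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC over the buffer, polynomial 0x1021, zero seed (the kernel uses
 * a table-driven routine for the same result). */
static uint16_t demo_crc_itu_t(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;
		for (bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x1021 : crc << 1;
	}
	return crc;
}

/* Toy version of the mangling idea: replace '/' with '_' and append "#xxxx",
 * the hex CRC of the raw name.  The real code also preserves a short
 * extension and enforces the 255/256 byte name limits. */
static void demo_mangle(const char *raw, char *out, size_t outsz)
{
	uint16_t crc = demo_crc_itu_t((const uint8_t *)raw, strlen(raw));
	size_t j = 0;

	for (size_t i = 0; raw[i] && j + 6 < outsz; i++)
		out[j++] = (raw[i] == '/') ? '_' : raw[i];
	snprintf(out + j, outsz - j, "#%04X", crc);
}

int main(void)
{
	char buf[64];

	demo_mangle("bad/name", buf, sizeof(buf));
	printf("%s\n", buf);   /* e.g. bad_name#<4 hex digits> */
	return 0;
}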
+ +config UFS_DEBUG + bool "UFS debugging" + depends on UFS_FS + help + If you are experiencing any problems with the UFS filesystem, say + Y here. This will result in _many_ additional debugging messages to be + written to the system log. diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile new file mode 100644 index 00000000..dd399804 --- /dev/null +++ b/fs/ufs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux ufs filesystem routines. +# + +obj-$(CONFIG_UFS_FS) += ufs.o + +ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ + namei.o super.o symlink.o truncate.o util.o diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c new file mode 100644 index 00000000..42694e11 --- /dev/null +++ b/fs/ufs/balloc.c @@ -0,0 +1,953 @@ +/* + * linux/fs/ufs/balloc.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * UFS2 write support Evgeniy Dushistov , 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +#define INVBLOCK ((u64)-1L) + +static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned, int *); +static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *); +static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *); +static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned); +static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[]; +static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int); + +/* + * Free 'count' fragments from fragment number 'fragment' + */ +void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned cgno, bit, end_bit, bbase, blkmap, i; + u64 blkno; + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); + + if (ufs_fragnum(fragment) + count > uspi->s_fpg) + ufs_error (sb, "ufs_free_fragments", "internal error"); + + lock_super(sb); + + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_free_fragments", "freeing blocks are outside device"); + goto failed; + } + + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + goto failed; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno); + goto failed; + } + + end_bit = bit + count; + bbase = ufs_blknum (bit); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); + ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1); + for (i = bit; i < end_bit; i++) { + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i); + else + ufs_error (sb, "ufs_free_fragments", + "bit already cleared for fragment %u", i); + } + + fs32_add(sb, &ucg->cg_cs.cs_nffree, count); + uspi->cs_total.cs_nffree += count; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); + ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1); + + /* + * Trying to reassemble free fragments into block + */ + blkno = ufs_fragstoblks (bbase); + if 
(ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); + uspi->cs_total.cs_nffree -= uspi->s_fpb; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, 1); + fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno (bbase); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(bbase)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + unlock_super (sb); + UFSD("EXIT\n"); + return; + +failed: + unlock_super (sb); + UFSD("EXIT (FAILED)\n"); + return; +} + +/* + * Free 'count' fragments from fragment number 'fragment' (free whole blocks) + */ +void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned overflow, cgno, bit, end_bit, i; + u64 blkno; + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); + + if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { + ufs_error (sb, "ufs_free_blocks", "internal error, " + "fragment %llu, count %u\n", + (unsigned long long)fragment, count); + goto failed; + } + + lock_super(sb); + +do_more: + overflow = 0; + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); + goto failed_unlock; + } + end_bit = bit + count; + if (end_bit > uspi->s_fpg) { + overflow = bit + count - uspi->s_fpg; + count -= overflow; + end_bit -= overflow; + } + + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + goto failed_unlock; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno); + goto failed_unlock; + } + + for (i = bit; i < end_bit; i += uspi->s_fpb) { + blkno = ufs_fragstoblks(i); + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); + } + ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, 1); + + fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno(i); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(i)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + if (overflow) { + fragment += count; + count = overflow; + goto do_more; + } + + sb->s_dirt = 1; + unlock_super (sb); + UFSD("EXIT\n"); + return; + +failed_unlock: + unlock_super (sb); +failed: + UFSD("EXIT (FAILED)\n"); + return; +} + +/* + * Modify inode page cache in such way: + * have - blocks with b_blocknr 
equal to oldb...oldb+count-1 + * get - blocks with b_blocknr equal to newb...newb+count-1 + * also we suppose that oldb...oldb+count-1 blocks + * situated at the end of file. + * + * We can come here from ufs_writepage or ufs_prepare_write, + * locked_page is argument of these functions, so we already lock it. + */ +static void ufs_change_blocknr(struct inode *inode, sector_t beg, + unsigned int count, sector_t oldb, + sector_t newb, struct page *locked_page) +{ + const unsigned blks_per_page = + 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + const unsigned mask = blks_per_page - 1; + struct address_space * const mapping = inode->i_mapping; + pgoff_t index, cur_index, last_index; + unsigned pos, j, lblock; + sector_t end, i; + struct page *page; + struct buffer_head *head, *bh; + + UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", + inode->i_ino, count, + (unsigned long long)oldb, (unsigned long long)newb); + + BUG_ON(!locked_page); + BUG_ON(!PageLocked(locked_page)); + + cur_index = locked_page->index; + end = count + beg; + last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (i = beg; i < end; i = (i | mask) + 1) { + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (likely(cur_index != index)) { + page = ufs_get_locked_page(mapping, index); + if (!page)/* it was truncated */ + continue; + if (IS_ERR(page)) {/* or EIO */ + ufs_error(inode->i_sb, __func__, + "read of page %llu failed\n", + (unsigned long long)index); + continue; + } + } else + page = locked_page; + + head = page_buffers(page); + bh = head; + pos = i & mask; + for (j = 0; j < pos; ++j) + bh = bh->b_this_page; + + + if (unlikely(index == last_index)) + lblock = end & mask; + else + lblock = blks_per_page; + + do { + if (j >= lblock) + break; + pos = (i - beg) + j; + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, oldb + pos); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ufs_error(inode->i_sb, __func__, + "read of block failed\n"); + break; + } + } + + UFSD(" change from %llu to %llu, pos %u\n", + (unsigned long long)(pos + oldb), + (unsigned long long)(pos + newb), pos); + + bh->b_blocknr = newb + pos; + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + mark_buffer_dirty(bh); + ++j; + bh = bh->b_this_page; + } while (bh != head); + + if (likely(cur_index != index)) + ufs_put_locked_page(page); + } + UFSD("EXIT\n"); +} + +static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, + int sync) +{ + struct buffer_head *bh; + sector_t end = beg + n; + + for (; beg < end; ++beg) { + bh = sb_getblk(inode->i_sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (IS_SYNC(inode) || sync) + sync_dirty_buffer(bh); + brelse(bh); + } +} + +u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, + u64 goal, unsigned count, int *err, + struct page *locked_page) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + unsigned cgno, oldcount, newcount; + u64 tmp, request, result; + + UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)goal, count); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + *err = -ENOSPC; + + lock_super (sb); + tmp = ufs_data_ptr_to_cpu(sb, p); + + if (count + ufs_fragnum(fragment) > uspi->s_fpb) { + ufs_warning(sb, 
"ufs_new_fragments", "internal warning" + " fragment %llu, count %u", + (unsigned long long)fragment, count); + count = uspi->s_fpb - ufs_fragnum(fragment); + } + oldcount = ufs_fragnum (fragment); + newcount = oldcount + count; + + /* + * Somebody else has just allocated our fragments + */ + if (oldcount) { + if (!tmp) { + ufs_error(sb, "ufs_new_fragments", "internal error, " + "fragment %llu, tmp %llu\n", + (unsigned long long)fragment, + (unsigned long long)tmp); + unlock_super(sb); + return INVBLOCK; + } + if (fragment < UFS_I(inode)->i_lastfrag) { + UFSD("EXIT (ALREADY ALLOCATED)\n"); + unlock_super (sb); + return 0; + } + } + else { + if (tmp) { + UFSD("EXIT (ALREADY ALLOCATED)\n"); + unlock_super(sb); + return 0; + } + } + + /* + * There is not enough space for user on the device + */ + if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { + unlock_super (sb); + UFSD("EXIT (FAILED)\n"); + return 0; + } + + if (goal >= uspi->s_size) + goal = 0; + if (goal == 0) + cgno = ufs_inotocg (inode->i_ino); + else + cgno = ufs_dtog(uspi, goal); + + /* + * allocate new fragment + */ + if (oldcount == 0) { + result = ufs_alloc_fragments (inode, cgno, goal, count, err); + if (result) { + ufs_cpu_to_data_ptr(sb, p, result); + *err = 0; + UFS_I(inode)->i_lastfrag = + max(UFS_I(inode)->i_lastfrag, fragment + count); + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); + } + unlock_super(sb); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + /* + * resize block + */ + result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); + if (result) { + *err = 0; + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); + unlock_super(sb); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + /* + * allocate new block and move data + */ + switch (fs32_to_cpu(sb, usb1->fs_optim)) { + case UFS_OPTSPACE: + request = newcount; + if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree + > uspi->s_dsize * uspi->s_minfree / (2 * 100)) + break; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + break; + default: + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + + case UFS_OPTTIME: + request = uspi->s_fpb; + if (uspi->cs_total.cs_nffree < uspi->s_dsize * + (uspi->s_minfree - 2) / 100) + break; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + break; + } + result = ufs_alloc_fragments (inode, cgno, goal, request, err); + if (result) { + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); + ufs_change_blocknr(inode, fragment - oldcount, oldcount, + uspi->s_sbbase + tmp, + uspi->s_sbbase + result, locked_page); + ufs_cpu_to_data_ptr(sb, p, result); + *err = 0; + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); + unlock_super(sb); + if (newcount < request) + ufs_free_fragments (inode, result + newcount, request - newcount); + ufs_free_fragments (inode, tmp, oldcount); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + unlock_super(sb); + UFSD("EXIT (FAILED)\n"); + return 0; +} + +static u64 ufs_add_fragments(struct inode *inode, u64 fragment, + unsigned oldcount, unsigned newcount, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned cgno, fragno, 
fragoff, count, fragsize, i; + + UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", + (unsigned long long)fragment, oldcount, newcount); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first (uspi); + count = newcount - oldcount; + + cgno = ufs_dtog(uspi, fragment); + if (fs32_to_cpu(sb, UFS_SB(sb)->fs_cs(cgno).cs_nffree) < count) + return 0; + if ((ufs_fragnum (fragment) + newcount) > uspi->s_fpb) + return 0; + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + return 0; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_add_fragments", + "internal error, bad magic number on cg %u", cgno); + return 0; + } + + fragno = ufs_dtogd(uspi, fragment); + fragoff = ufs_fragnum (fragno); + for (i = oldcount; i < newcount; i++) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) + return 0; + /* + * Block can be extended + */ + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + for (i = newcount; i < (uspi->s_fpb - fragoff); i++) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) + break; + fragsize = i - oldcount; + if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize])) + ufs_panic (sb, "ufs_add_fragments", + "internal error or corrupted bitmap on cg %u", cgno); + fs32_sub(sb, &ucg->cg_frsum[fragsize], 1); + if (fragsize != count) + fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); + for (i = oldcount; i < newcount; i++) + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); + + fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + uspi->cs_total.cs_nffree -= count; + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); + + return fragment; +} + +#define UFS_TEST_FREE_SPACE_CG \ + ucg = (struct ufs_cylinder_group *) UFS_SB(sb)->s_ucg[cgno]->b_data; \ + if (fs32_to_cpu(sb, ucg->cg_cs.cs_nbfree)) \ + goto cg_found; \ + for (k = count; k < uspi->s_fpb; k++) \ + if (fs32_to_cpu(sb, ucg->cg_frsum[k])) \ + goto cg_found; + +static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, + u64 goal, unsigned count, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned oldcg, i, j, k, allocsize; + u64 result; + + UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", + inode->i_ino, cgno, (unsigned long long)goal, count); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + oldcg = cgno; + + /* + * 1. searching on preferred cylinder group + */ + UFS_TEST_FREE_SPACE_CG + + /* + * 2. quadratic rehash + */ + for (j = 1; j < uspi->s_ncg; j *= 2) { + cgno += j; + if (cgno >= uspi->s_ncg) + cgno -= uspi->s_ncg; + UFS_TEST_FREE_SPACE_CG + } + + /* + * 3. 
brute force search + * We start at i = 2 ( 0 is checked at 1.step, 1 at 2.step ) + */ + cgno = (oldcg + 1) % uspi->s_ncg; + for (j = 2; j < uspi->s_ncg; j++) { + cgno++; + if (cgno >= uspi->s_ncg) + cgno = 0; + UFS_TEST_FREE_SPACE_CG + } + + UFSD("EXIT (FAILED)\n"); + return 0; + +cg_found: + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + return 0; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_alloc_fragments", + "internal error, bad magic number on cg %u", cgno); + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + + if (count == uspi->s_fpb) { + result = ufs_alloccg_block (inode, ucpi, goal, err); + if (result == INVBLOCK) + return 0; + goto succed; + } + + for (allocsize = count; allocsize < uspi->s_fpb; allocsize++) + if (fs32_to_cpu(sb, ucg->cg_frsum[allocsize]) != 0) + break; + + if (allocsize == uspi->s_fpb) { + result = ufs_alloccg_block (inode, ucpi, goal, err); + if (result == INVBLOCK) + return 0; + goal = ufs_dtogd(uspi, result); + for (i = count; i < uspi->s_fpb; i++) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); + i = uspi->s_fpb - count; + + fs32_add(sb, &ucg->cg_cs.cs_nffree, i); + uspi->cs_total.cs_nffree += i; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); + fs32_add(sb, &ucg->cg_frsum[i], 1); + goto succed; + } + + result = ufs_bitmap_search (sb, ucpi, goal, allocsize); + if (result == INVBLOCK) + return 0; + for (i = 0; i < count; i++) + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); + + fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); + uspi->cs_total.cs_nffree -= count; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + fs32_sub(sb, &ucg->cg_frsum[allocsize], 1); + + if (count != allocsize) + fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1); + +succed: + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + result += cgno * uspi->s_fpg; + UFSD("EXIT3, result %llu\n", (unsigned long long)result); + return result; +} + +static u64 ufs_alloccg_block(struct inode *inode, + struct ufs_cg_private_info *ucpi, + u64 goal, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cylinder_group * ucg; + u64 result, blkno; + + UFSD("ENTER, goal %llu\n", (unsigned long long)goal); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (goal == 0) { + goal = ucpi->c_rotor; + goto norot; + } + goal = ufs_blknum (goal); + goal = ufs_dtogd(uspi, goal); + + /* + * If the requested block is available, use it. 
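Illustrative aside, not part of the patch: ufs_alloc_fragments() probes cylinder groups in three phases, the preferred group first, then a quadratic rehash with doubling strides, then a linear sweep starting just past the preferred group. The toy below only reproduces that probe order so it can be read in isolation; the names are invented.

#include <stdio.h>

/* Print the cylinder-group probe order for ncg groups and a preferred group. */
static void demo_cg_probe_order(unsigned ncg, unsigned preferred)
{
	unsigned cgno = preferred, j;

	printf("%u ", cgno);                    /* 1. preferred cylinder group */

	for (j = 1; j < ncg; j *= 2) {          /* 2. quadratic rehash */
		cgno += j;
		if (cgno >= ncg)
			cgno -= ncg;
		printf("%u ", cgno);
	}

	cgno = (preferred + 1) % ncg;           /* 3. brute force sweep */
	for (j = 2; j < ncg; j++) {
		cgno++;
		if (cgno >= ncg)
			cgno = 0;
		printf("%u ", cgno);
	}
	printf("\n");
}

int main(void)
{
	/* prints: 3 4 6 2 (rehash) followed by 5 6 7 0 1 2 (sweep) */
	demo_cg_probe_order(8, 3);
	return 0;
}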
+ */ + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) { + result = goal; + goto gotit; + } + +norot: + result = ufs_bitmap_search (sb, ucpi, goal, uspi->s_fpb); + if (result == INVBLOCK) + return INVBLOCK; + ucpi->c_rotor = result; +gotit: + blkno = ufs_fragstoblks(result); + ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, -1); + + fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree--; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno((unsigned)result); + + fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos((unsigned)result)), 1); + fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + + UFSD("EXIT, result %llu\n", (unsigned long long)result); + + return result; +} + +static unsigned ubh_scanc(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + unsigned begin, unsigned size, + unsigned char *table, unsigned char mask) +{ + unsigned rest, offset; + unsigned char *cp; + + + offset = begin & ~uspi->s_fmask; + begin >>= uspi->s_fshift; + for (;;) { + if ((offset + size) < uspi->s_fsize) + rest = size; + else + rest = uspi->s_fsize - offset; + size -= rest; + cp = ubh->bh[begin]->b_data + offset; + while ((table[*cp++] & mask) == 0 && --rest) + ; + if (rest || !size) + break; + begin++; + offset = 0; + } + return (size + rest); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * @sp: pointer to super block + * @ucpi: pointer to cylinder group info + * @goal: near which block we want find new one + * @count: specified size + */ +static u64 ufs_bitmap_search(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + u64 goal, unsigned count) +{ + /* + * Bit patterns for identifying fragments in the block map + * used as ((map & mask_arr) == want_arr) + */ + static const int mask_arr[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff + }; + static const int want_arr[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe + }; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_cylinder_group *ucg; + unsigned start, length, loc; + unsigned pos, want, blockmap, mask, end; + u64 result; + + UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, + (unsigned long long)goal, count); + + usb1 = ubh_get_usb_first (uspi); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (goal) + start = ufs_dtogd(uspi, goal) >> 3; + else + start = ucpi->c_frotor >> 3; + + length = ((uspi->s_fpg + 7) >> 3) - start; + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length, + (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + length = start + 1; + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length, + (uspi->s_fpb == 8) ? 
ufs_fragtable_8fpb : + ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + ufs_error(sb, "ufs_bitmap_search", + "bitmap corrupted on cg %u, start %u," + " length %u, count %u, freeoff %u\n", + ucpi->c_cgx, start, length, count, + ucpi->c_freeoff); + return INVBLOCK; + } + start = 0; + } + result = (start + length - loc) << 3; + ucpi->c_frotor = result; + + /* + * found the byte in the map + */ + + for (end = result + 8; result < end; result += uspi->s_fpb) { + blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result); + blockmap <<= 1; + mask = mask_arr[count]; + want = want_arr[count]; + for (pos = 0; pos <= uspi->s_fpb - count; pos++) { + if ((blockmap & mask) == want) { + UFSD("EXIT, result %llu\n", + (unsigned long long)result); + return result + pos; + } + mask <<= 1; + want <<= 1; + } + } + + ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n", + ucpi->c_cgx); + UFSD("EXIT (FAILED)\n"); + return INVBLOCK; +} + +static void ufs_clusteracct(struct super_block * sb, + struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt) +{ + struct ufs_sb_private_info * uspi; + int i, start, end, forw, back; + + uspi = UFS_SB(sb)->s_uspi; + if (uspi->s_contigsumsize <= 0) + return; + + if (cnt > 0) + ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); + else + ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); + + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + uspi->s_contigsumsize; + if ( end >= ucpi->c_nclusterblks) + end = ucpi->c_nclusterblks; + i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start); + if (i > end) + i = end; + forw = i - start; + + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - uspi->s_contigsumsize; + if (end < 0 ) + end = -1; + i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end); + if ( i < end) + i = end; + back = start - i; + + /* + * Account for old cluster and the possibly new forward and + * back clusters. 
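Illustrative aside, not part of the patch: the mask_arr/want_arr pair in ufs_bitmap_search() encodes "count free fragments bounded by allocated fragments or the block edge" as a bit pattern, and the initial blockmap <<= 1 supplies the artificial boundary below position 0. A stand-alone version of that inner match for an 8-fragments-per-block map, with set bits meaning free fragments:

#include <stdio.h>

/* Return the position of the first run of exactly `count` free fragments in
 * an 8-bit free map, or -1 if there is none (count must be 1..8 here). */
static int demo_find_frag_run(unsigned char freemap, unsigned count)
{
	static const int mask_arr[9] = {
		0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
	};
	static const int want_arr[9] = {
		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
	};
	unsigned blockmap = (unsigned)freemap << 1;   /* fake boundary below bit 0 */
	int mask, want;
	unsigned pos;

	if (count < 1 || count > 8)
		return -1;
	mask = mask_arr[count];
	want = want_arr[count];
	for (pos = 0; pos <= 8 - count; pos++) {
		if ((blockmap & mask) == want)
			return pos;
		mask <<= 1;
		want <<= 1;
	}
	return -1;
}

int main(void)
{
	/* 0x36 = 0b00110110: free runs at fragments 1-2 and 4-5. */
	printf("run of 2 starts at %d\n", demo_find_frag_run(0x36, 2));  /* 1 */
	printf("run of 3 starts at %d\n", demo_find_frag_run(0x36, 3));  /* -1 */
	return 0;
}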
+ */ + i = back + forw + 1; + if (i > uspi->s_contigsumsize) + i = uspi->s_contigsumsize; + fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt); + if (back > 0) + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt); + if (forw > 0) + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt); +} + + +static unsigned char ufs_fragtable_8fpb[] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x08, 0x09, 0x09, 0x0A, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0A, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0C, + 0x08, 0x09, 0x09, 0x0A, 0x09, 0x09, 0x0A, 0x0C, 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +static unsigned char ufs_fragtable_other[] = { + 0x00, 0x16, 0x16, 0x2A, 0x16, 0x16, 0x26, 0x4E, 0x16, 0x16, 0x16, 0x3E, 0x2A, 0x3E, 0x4E, 0x8A, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x26, 0x36, 0x36, 0x2E, 0x36, 0x36, 0x26, 0x6E, 0x36, 0x36, 0x36, 0x3E, 0x2E, 0x3E, 0x6E, 0xAE, + 0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 0xCE, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE, + 0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA, + 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE, + 0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 
0xCE, + 0x8A, 0x9E, 0x9E, 0xAA, 0x9E, 0x9E, 0xAE, 0xCE, 0x9E, 0x9E, 0x9E, 0xBE, 0xAA, 0xBE, 0xCE, 0x8A, +}; diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c new file mode 100644 index 00000000..b4676322 --- /dev/null +++ b/fs/ufs/cylinder.c @@ -0,0 +1,201 @@ +/* + * linux/fs/ufs/cylinder.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * ext2 - inode (block) bitmap caching inspired + */ + +#include +#include +#include +#include +#include + +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * Read cylinder group into cache. The memory space for ufs_cg_private_info + * structure is already allocated during ufs_read_super. + */ +static void ufs_read_cylinder (struct super_block * sb, + unsigned cgno, unsigned bitmap_nr) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned i, j; + + UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr); + uspi = sbi->s_uspi; + ucpi = sbi->s_ucpi[bitmap_nr]; + ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data; + + UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno); + UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits; + /* + * We have already the first fragment of cylinder group block in buffer + */ + UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno]; + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) + if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i))) + goto failed; + sbi->s_cgno[bitmap_nr] = cgno; + + ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx); + ucpi->c_ncyl = fs16_to_cpu(sb, ucg->cg_ncyl); + ucpi->c_niblk = fs16_to_cpu(sb, ucg->cg_niblk); + ucpi->c_ndblk = fs32_to_cpu(sb, ucg->cg_ndblk); + ucpi->c_rotor = fs32_to_cpu(sb, ucg->cg_rotor); + ucpi->c_frotor = fs32_to_cpu(sb, ucg->cg_frotor); + ucpi->c_irotor = fs32_to_cpu(sb, ucg->cg_irotor); + ucpi->c_btotoff = fs32_to_cpu(sb, ucg->cg_btotoff); + ucpi->c_boff = fs32_to_cpu(sb, ucg->cg_boff); + ucpi->c_iusedoff = fs32_to_cpu(sb, ucg->cg_iusedoff); + ucpi->c_freeoff = fs32_to_cpu(sb, ucg->cg_freeoff); + ucpi->c_nextfreeoff = fs32_to_cpu(sb, ucg->cg_nextfreeoff); + ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff); + ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); + ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); + UFSD("EXIT\n"); + return; + +failed: + for (j = 1; j < i; j++) + brelse (sbi->s_ucg[j]); + sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; + ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno); +} + +/* + * Remove cylinder group from cache, doesn't release memory + * allocated for cylinder group (this is done at ufs_put_super only). 
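Illustrative aside, not part of the patch: as far as I can tell, ufs_fragtable_8fpb[] maps a free-fragment byte to a bitmask of the run lengths it contains (bit k-1 set when a maximal free run of exactly k fragments exists), which is what lets ubh_scanc() test table[byte] & (1 << (count - 1)) while scanning. The entries can be regenerated like this:

#include <stdio.h>

/* Rebuild one ufs_fragtable_8fpb entry from first principles: walk the byte,
 * and for every maximal run of set bits record its length as a bit. */
static unsigned char demo_fragtable_entry(unsigned char freemap)
{
	unsigned char entry = 0;
	int run = 0;
	int bit;

	for (bit = 0; bit < 8; bit++) {
		if (freemap & (1 << bit)) {
			run++;
		} else if (run) {
			entry |= 1 << (run - 1);
			run = 0;
		}
	}
	if (run)
		entry |= 1 << (run - 1);
	return entry;
}

int main(void)
{
	/* Matches the table in the patch: 0x0B -> 0x03, 0x0F -> 0x08, 0xFF -> 0x80. */
	printf("0x0B -> 0x%02X\n", demo_fragtable_entry(0x0B));
	printf("0x0F -> 0x%02X\n", demo_fragtable_entry(0x0F));
	printf("0xFF -> 0x%02X\n", demo_fragtable_entry(0xFF));
	return 0;
}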
+ */ +void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned i; + + UFSD("ENTER, bitmap_nr %u\n", bitmap_nr); + + uspi = sbi->s_uspi; + if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) { + UFSD("EXIT\n"); + return; + } + ucpi = sbi->s_ucpi[bitmap_nr]; + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) { + ufs_panic (sb, "ufs_put_cylinder", "internal error"); + return; + } + /* + * rotor is not so important data, so we put it to disk + * at the end of working with cylinder + */ + ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor); + ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor); + ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) { + brelse (UCPI_UBH(ucpi)->bh[i]); + } + + sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; + UFSD("EXIT\n"); +} + +/* + * Find cylinder group in cache and return it as pointer. + * If cylinder group is not in cache, we will load it from disk. + * + * The cache is managed by LRU algorithm. + */ +struct ufs_cg_private_info * ufs_load_cylinder ( + struct super_block * sb, unsigned cgno) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + unsigned cg, i, j; + + UFSD("ENTER, cgno %u\n", cgno); + + uspi = sbi->s_uspi; + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_load_cylinder", "internal error, high number of cg"); + return NULL; + } + /* + * Cylinder group number cg it in cache and it was last used + */ + if (sbi->s_cgno[0] == cgno) { + UFSD("EXIT\n"); + return sbi->s_ucpi[0]; + } + /* + * Number of cylinder groups is not higher than UFS_MAX_GROUP_LOADED + */ + if (uspi->s_ncg <= UFS_MAX_GROUP_LOADED) { + if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) { + if (sbi->s_cgno[cgno] != cgno) { + ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache"); + UFSD("EXIT (FAILED)\n"); + return NULL; + } + else { + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; + } + } else { + ufs_read_cylinder (sb, cgno, cgno); + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; + } + } + /* + * Cylinder group number cg is in cache but it was not last used, + * we will move to the first position + */ + for (i = 0; i < sbi->s_cg_loaded && sbi->s_cgno[i] != cgno; i++); + if (i < sbi->s_cg_loaded && sbi->s_cgno[i] == cgno) { + cg = sbi->s_cgno[i]; + ucpi = sbi->s_ucpi[i]; + for (j = i; j > 0; j--) { + sbi->s_cgno[j] = sbi->s_cgno[j-1]; + sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; + } + sbi->s_cgno[0] = cg; + sbi->s_ucpi[0] = ucpi; + /* + * Cylinder group number cg is not in cache, we will read it from disk + * and put it to the first position + */ + } else { + if (sbi->s_cg_loaded < UFS_MAX_GROUP_LOADED) + sbi->s_cg_loaded++; + else + ufs_put_cylinder (sb, UFS_MAX_GROUP_LOADED-1); + ucpi = sbi->s_ucpi[sbi->s_cg_loaded - 1]; + for (j = sbi->s_cg_loaded - 1; j > 0; j--) { + sbi->s_cgno[j] = sbi->s_cgno[j-1]; + sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; + } + sbi->s_ucpi[0] = ucpi; + ufs_read_cylinder (sb, cgno, 0); + } + UFSD("EXIT\n"); + return sbi->s_ucpi[0]; +} diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c new file mode 100644 index 00000000..dbc90994 --- /dev/null +++ b/fs/ufs/dir.c @@ -0,0 +1,666 @@ +/* + * linux/fs/ufs/ufs_dir.c + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * 
Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * swab support by Francois-Rene Rideau 19970406 + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov based on ext2 code base. + */ + +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure. + * + * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. + */ +static inline int ufs_match(struct super_block *sb, int len, + const unsigned char *name, struct ufs_dir_entry *de) +{ + if (len != ufs_get_de_namlen(sb, de)) + return 0; + if (!de->d_ino) + return 0; + return !memcmp(name, de->d_name, len); +} + +static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static inline void ufs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long ufs_dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) +{ + ino_t res = 0; + struct ufs_dir_entry *de; + struct page *page; + + de = ufs_find_entry(dir, qstr, &page); + if (de) { + res = fs32_to_cpu(dir->i_sb, de->d_ino); + ufs_put_page(page); + } + return res; +} + + +/* Releases the page */ +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); + int err; + + lock_page(page); + err = ufs_prepare_chunk(page, pos, len); + BUG_ON(err); + + de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); + ufs_set_de_type(dir->i_sb, de, inode->i_mode); + + err = ufs_commit_chunk(page, pos, len); + ufs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + + +static void ufs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; + struct ufs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & chunk_mask) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct ufs_dir_entry *)(kaddr + offs); + rec_len = fs16_to_cpu(sb, p->d_reclen); + + if (rec_len < UFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p))) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~chunk_mask) + goto Espan; + if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * + UFS_SB(sb)->s_uspi->s_ncg)) + goto Einumber; + } + if (offs != limit) + 
goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + ufs_error(sb, "ufs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + rec_len, ufs_get_de_namlen(sb, p)); + goto fail; +Eend: + p = (struct ufs_dir_entry *)(kaddr + offs); + ufs_error(sb, __func__, + "entry in directory #%lu spans the page boundary" + "offset=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *ufs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + ufs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + ufs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +ufs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static inline struct ufs_dir_entry * +ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) +{ + return (struct ufs_dir_entry *)((char *)p + + fs16_to_cpu(sb, p->d_reclen)); +} + +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = ufs_get_page(dir, 0); + struct ufs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = ufs_next_entry(dir->i_sb, + (struct ufs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +/* + * ufs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, + struct page **res_page) +{ + struct super_block *sb = dir->i_sb; + const unsigned char *name = qstr->name; + int namelen = qstr->len; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = ufs_dir_pages(dir); + struct page *page = NULL; + struct ufs_inode_info *ui = UFS_I(dir); + struct ufs_dir_entry *de; + + UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen); + + if (npages == 0 || namelen > UFS_MAXNAMLEN) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ui->i_dir_start_lookup; + + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ufs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct ufs_dir_entry *) kaddr; + kaddr += ufs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __func__, + "zero-length directory entry"); + ufs_put_page(page); + goto out; + } + if (ufs_match(sb, namelen, name, de)) + goto found; + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ui->i_dir_start_lookup = n; + return de; +} + +/* + * Parent is locked.
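+ * + * Editor's note (not part of the original source): ufs_add_link() below scans the directory one page at a time with the page locked, reusing either a free slot (d_ino == 0) or the slack space at the end of an existing entry, and appends a fresh chunk past i_size when it runs off the current end of the directory.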
+ */ +int ufs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct super_block *sb = dir->i_sb; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; + unsigned short rec_len, name_len; + struct page *page = NULL; + struct ufs_dir_entry *de; + unsigned long npages = ufs_dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + UFSD("ENTER, name %s, namelen %u\n", name, namelen); + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ufs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ufs_last_byte(dir, n); + de = (struct ufs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->d_reclen = cpu_to_fs16(sb, chunk_size); + de->d_ino = 0; + goto got_it; + } + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ufs_match(sb, namelen, name, de)) + goto out_unlock; + name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)); + rec_len = fs16_to_cpu(sb, de->d_reclen); + if (!de->d_ino && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct ufs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + ufs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + err = ufs_prepare_chunk(page, pos, rec_len); + if (err) + goto out_unlock; + if (de->d_ino) { + struct ufs_dir_entry *de1 = + (struct ufs_dir_entry *) ((char *) de + name_len); + de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len); + de->d_reclen = cpu_to_fs16(sb, name_len); + + de = de1; + } + + ufs_set_de_namlen(sb, de, namelen); + memcpy(de->d_name, name, namelen + 1); + de->d_ino = cpu_to_fs32(sb, inode->i_ino); + ufs_set_de_type(sb, de, inode->i_mode); + + err = ufs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ufs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +static inline unsigned +ufs_validate_entry(struct super_block *sb, char *base, + unsigned offset, unsigned mask) +{ + struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset); + struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->d_reclen == 0) + break; + p = ufs_next_entry(sb, p); + } + return (char *)p - base; +} + + +/* + * This is blatantly stolen from ext2fs + */ +static int +ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = ufs_dir_pages(inode); + unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + int need_revalidate = filp->f_version != inode->i_version; + unsigned flags = 
UFS_SB(sb)->s_flags; + + UFSD("BEGIN\n"); + + if (pos > inode->i_size - UFS_DIR_REC_LEN(1)) + return 0; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct ufs_dir_entry *de; + + struct page *page = ufs_get_page(inode, n); + + if (IS_ERR(page)) { + ufs_error(sb, __func__, + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return -EIO; + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); + filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + } + filp->f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct ufs_dir_entry *)(kaddr+offset); + limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { + if (de->d_reclen == 0) { + ufs_error(sb, __func__, + "zero-length directory entry"); + ufs_put_page(page); + return -EIO; + } + if (de->d_ino) { + int over; + unsigned char d_type = DT_UNKNOWN; + + offset = (char *)de - kaddr; + + UFSD("filldir(%s,%u)\n", de->d_name, + fs32_to_cpu(sb, de->d_ino)); + UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); + + if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) + d_type = de->d_u.d_44.d_type; + + over = filldir(dirent, de->d_name, + ufs_get_de_namlen(sb, de), + (n<<PAGE_CACHE_SHIFT) | offset, + fs32_to_cpu(sb, de->d_ino), d_type); + if (over) { + ufs_put_page(page); + return 0; + } + } + filp->f_pos += fs16_to_cpu(sb, de->d_reclen); + } + ufs_put_page(page); + } + return 0; +} + + +/* + * ufs_delete_entry deletes a directory entry by merging it with the + * previous entry. + */ +int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, + struct page * page) +{ + struct super_block *sb = inode->i_sb; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + loff_t pos; + struct ufs_dir_entry *pde = NULL; + struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + int err; + + UFSD("ENTER\n"); + + UFSD("ino %u, reclen %u, namlen %u, name %s\n", + fs32_to_cpu(sb, de->d_ino), + fs16_to_cpu(sb, de->d_reclen), + ufs_get_de_namlen(sb, de), de->d_name); + + while ((char*)de < (char*)dir) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = ufs_next_entry(sb, de); + } + if (pde) + from = (char*)pde - (char*)page_address(page); + + pos = page_offset(page) + from; + lock_page(page); + err = ufs_prepare_chunk(page, pos, to - from); + BUG_ON(err); + if (pde) + pde->d_reclen = cpu_to_fs16(sb, to - from); + dir->d_ino = 0; + err = ufs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + ufs_put_page(page); + UFSD("EXIT\n"); + return err; +} + +int ufs_make_empty(struct inode * inode, struct inode *dir) +{ + struct super_block * sb = dir->i_sb; + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; + struct ufs_dir_entry * de; + char *base; + int err; + + if (!page) + return -ENOMEM; + + err = ufs_prepare_chunk(page, 0, chunk_size); + if (err) { + unlock_page(page); + goto fail; + } + + kmap(page); + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct ufs_dir_entry *) base; + + de->d_ino = cpu_to_fs32(sb, inode->i_ino); + ufs_set_de_type(sb, de, inode->i_mode); + ufs_set_de_namlen(sb,
de, 1); + de->d_reclen = cpu_to_fs16(sb, UFS_DIR_REC_LEN(1)); + strcpy (de->d_name, "."); + de = (struct ufs_dir_entry *) + ((char *)de + fs16_to_cpu(sb, de->d_reclen)); + de->d_ino = cpu_to_fs32(sb, dir->i_ino); + ufs_set_de_type(sb, de, dir->i_mode); + de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); + ufs_set_de_namlen(sb, de, 2); + strcpy (de->d_name, ".."); + kunmap(page); + + err = ufs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ufs_empty_dir(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = ufs_dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct ufs_dir_entry *de; + page = ufs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct ufs_dir_entry *)kaddr; + kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __func__, + "zero-length directory entry: " + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->d_ino) { + u16 namelen=ufs_get_de_namlen(sb, de); + /* check for . and .. */ + if (de->d_name[0] != '.') + goto not_empty; + if (namelen > 2) + goto not_empty; + if (namelen < 2) { + if (inode->i_ino != + fs32_to_cpu(sb, de->d_ino)) + goto not_empty; + } else if (de->d_name[1] != '.') + goto not_empty; + } + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + return 1; + +not_empty: + ufs_put_page(page); + return 0; +} + +const struct file_operations ufs_dir_operations = { + .read = generic_read_dir, + .readdir = ufs_readdir, + .fsync = generic_file_fsync, + .llseek = generic_file_llseek, +}; diff --git a/fs/ufs/file.c b/fs/ufs/file.c new file mode 100644 index 00000000..33afa20d --- /dev/null +++ b/fs/ufs/file.c @@ -0,0 +1,46 @@ +/* + * linux/fs/ufs/file.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 fs regular file handling primitives + */ + +#include + +#include "ufs_fs.h" +#include "ufs.h" + +/* + * We have mostly NULL's here: the current defaults are ok for + * the ufs filesystem. 
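+ * + * Editor's note (not part of the original source): every entry in the table below delegates to a generic VFS/page-cache helper; the filesystem-specific work happens through ufs_aops and ufs_getfrag_block() in inode.c.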
+ */ + +const struct file_operations ufs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c new file mode 100644 index 00000000..2eabf04a --- /dev/null +++ b/fs/ufs/ialloc.c @@ -0,0 +1,354 @@ +/* + * linux/fs/ufs/ialloc.c + * + * Copyright (c) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * UFS2 write support added by + * Evgeniy Dushistov , 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. 
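+ * + * Editor's note (not part of the original source): accordingly, ufs_free_inode() below only clears the inode's bit in the cylinder-group bitmap and updates the free-inode (and, for directories, directory) counters; the in-core inode has already been detached and evicted by the VFS.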
+ */ +void ufs_free_inode (struct inode * inode) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + int is_directory; + unsigned ino, cg, bit; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + ino = inode->i_ino; + + lock_super (sb); + + if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { + ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); + unlock_super (sb); + return; + } + + cg = ufs_inotocg (ino); + bit = ufs_inotocgoff (ino); + ucpi = ufs_load_cylinder (sb, cg); + if (!ucpi) { + unlock_super (sb); + return; + } + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); + + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + + is_directory = S_ISDIR(inode->i_mode); + + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); + else { + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); + if (ino < ucpi->c_irotor) + ucpi->c_irotor = ino; + fs32_add(sb, &ucg->cg_cs.cs_nifree, 1); + uspi->cs_total.cs_nifree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1); + + if (is_directory) { + fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1); + uspi->cs_total.cs_ndir--; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + sb->s_dirt = 1; + unlock_super (sb); + UFSD("EXIT\n"); +} + +/* + * Nullify new chunk of inodes, + * BSD people also set ui_gen field of inode + * during nullification, but we not care about + * that because of linux ufs do not support NFS + */ +static void ufs2_init_inodes_chunk(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + struct ufs_cylinder_group *ucg) +{ + struct buffer_head *bh; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + sector_t beg = uspi->s_sbbase + + ufs_inotofsba(ucpi->c_cgx * uspi->s_ipg + + fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk)); + sector_t end = beg + uspi->s_fpb; + + UFSD("ENTER cgno %d\n", ucpi->c_cgx); + + for (; beg < end; ++beg) { + bh = sb_getblk(sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + + fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); + ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + UFSD("EXIT\n"); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
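+ * + * Editor's note (not part of the original source): concretely, the search below first tries the parent's own cylinder group, then probes groups parent+1, parent+3, parent+7, ... (the step doubles on each pass, wrapping modulo the number of groups), and finally falls back to a linear scan before giving up with -ENOSPC.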
+ */ +struct inode * ufs_new_inode(struct inode * dir, int mode) +{ + struct super_block * sb; + struct ufs_sb_info * sbi; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + struct inode * inode; + unsigned cg, bit, i, j, start; + struct ufs_inode_info *ufsi; + int err = -ENOSPC; + + UFSD("ENTER\n"); + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ufsi = UFS_I(inode); + sbi = UFS_SB(sb); + uspi = sbi->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + lock_super (sb); + + /* + * Try to place the inode in its parent directory + */ + i = ufs_inotocg(dir->i_ino); + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + + /* + * Use a quadratic hash to find a group with a free inode + */ + for ( j = 1; j < uspi->s_ncg; j <<= 1 ) { + i += j; + if (i >= uspi->s_ncg) + i -= uspi->s_ncg; + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + } + + /* + * That failed: try linear search for a free inode + */ + i = ufs_inotocg(dir->i_ino) + 1; + for (j = 2; j < uspi->s_ncg; j++) { + i++; + if (i >= uspi->s_ncg) + i = 0; + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + } + + goto failed; + +cg_found: + ucpi = ufs_load_cylinder (sb, cg); + if (!ucpi) { + err = -EIO; + goto failed; + } + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); + + start = ucpi->c_irotor; + bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start); + if (!(bit < uspi->s_ipg)) { + bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start); + if (!(bit < start)) { + ufs_error (sb, "ufs_new_inode", + "cylinder group %u corrupted - error in inode bitmap\n", cg); + err = -EIO; + goto failed; + } + } + UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg); + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); + else { + ufs_panic (sb, "ufs_new_inode", "internal error"); + err = -EIO; + goto failed; + } + + if (uspi->fs_magic == UFS2_MAGIC) { + u32 initediblk = fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk); + + if (bit + uspi->s_inopb > initediblk && + initediblk < fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_niblk)) + ufs2_init_inodes_chunk(sb, ucpi, ucg); + } + + fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); + uspi->cs_total.cs_nifree--; + fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); + + if (S_ISDIR(mode)) { + fs32_add(sb, &ucg->cg_cs.cs_ndir, 1); + uspi->cs_total.cs_ndir++; + fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); + } + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + inode->i_ino = cg * uspi->s_ipg + bit; + inode_init_owner(inode, dir, mode); + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_flags = UFS_I(dir)->i_flags; + ufsi->i_lastfrag = 0; + ufsi->i_shadow = 0; + ufsi->i_osync = 0; + ufsi->i_oeftflag = 0; + ufsi->i_dir_start_lookup = 0; + memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + if (uspi->fs_magic == UFS2_MAGIC) { + struct buffer_head *bh; + struct ufs2_inode *ufs2_inode; + + /* + * setup birth date, we do it 
here because of there is no sense + * to hold it in struct ufs_inode_info, and lose 64 bit + */ + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", + "unable to read inode %lu\n", + inode->i_ino); + err = -EIO; + goto fail_remove_inode; + } + lock_buffer(bh); + ufs2_inode = (struct ufs2_inode *)bh->b_data; + ufs2_inode += ufs_inotofsbo(inode->i_ino); + ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec); + ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + + unlock_super (sb); + + UFSD("allocating inode %lu\n", inode->i_ino); + UFSD("EXIT\n"); + return inode; + +fail_remove_inode: + unlock_super(sb); + inode->i_nlink = 0; + iput(inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); +failed: + unlock_super (sb); + make_bad_inode(inode); + iput (inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); +} diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c new file mode 100644 index 00000000..b4d791a8 --- /dev/null +++ b/fs/ufs/inode.c @@ -0,0 +1,906 @@ +/* + * linux/fs/ufs/inode.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); + +static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +{ + struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; + int ptrs = uspi->s_apb; + int ptrs_bits = uspi->s_apbshift; + const long direct_blocks = UFS_NDADDR, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + + + UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); + if (i_block < direct_blocks) { + offsets[n++] = i_block; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = UFS_IND_BLOCK; + offsets[n++] = i_block; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = UFS_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = UFS_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + } else { + ufs_warning(inode->i_sb, "ufs_block_to_path", "block > big"); + } + return n; +} + +/* + * Returns the location of the fragment from + * the beginning of the filesystem. 
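+ * + * Editor's note (not part of the original source): ufs_block_to_path() above splits the block number into at most four offsets (direct slot, then single-, double- and triple-indirect indices); ufs_frag_map() below reads one indirect block per remaining level and finally adds the fragment's offset within the block (frag & s_fpbmask).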
+ */ + +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; + int shift = uspi->s_apbshift-uspi->s_fpbshift; + sector_t offsets[4], *p; + int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); + u64 ret = 0L; + __fs32 block; + __fs64 u2_block = 0L; + unsigned flags = UFS_SB(sb)->s_flags; + u64 temp = 0L; + + UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); + UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", + uspi->s_fpbshift, uspi->s_apbmask, + (unsigned long long)mask); + + if (depth == 0) + return 0; + + p = offsets; + + if (needs_lock) + lock_ufs(sb); + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + goto ufs2; + + block = ufsi->i_u1.i_data[*p++]; + if (!block) + goto out; + while (--depth) { + struct buffer_head *bh; + sector_t n = *p++; + + bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + if (!bh) + goto out; + block = ((__fs32 *) bh->b_data)[n & mask]; + brelse (bh); + if (!block) + goto out; + } + ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); + goto out; +ufs2: + u2_block = ufsi->i_u1.u2_i_data[*p++]; + if (!u2_block) + goto out; + + + while (--depth) { + struct buffer_head *bh; + sector_t n = *p++; + + + temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); + bh = sb_bread(sb, temp +(u64) (n>>shift)); + if (!bh) + goto out; + u2_block = ((__fs64 *)bh->b_data)[n & mask]; + brelse(bh); + if (!u2_block) + goto out; + } + temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); + ret = temp + (u64) (frag & uspi->s_fpbmask); + +out: + if (needs_lock) + unlock_ufs(sb); + return ret; +} + +/** + * ufs_inode_getfrag() - allocate new fragment(s) + * @inode - pointer to inode + * @fragment - number of `fragment' which hold pointer + * to new allocated fragment(s) + * @new_fragment - number of new allocated fragment(s) + * @required - how many fragment(s) we require + * @err - we set it if something wrong + * @phys - pointer to where we save physical number of new allocated fragments, + * NULL if we allocate not data(indirect blocks for example). 
+ * @new - we set it if we allocate new block + * @locked_page - for ufs_new_fragments() + */ +static struct buffer_head * +ufs_inode_getfrag(struct inode *inode, u64 fragment, + sector_t new_fragment, unsigned int required, int *err, + long *phys, int *new, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * result; + unsigned blockoff, lastblockoff; + u64 tmp, goal, lastfrag, block, lastblock; + void *p, *p2; + + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " + "metadata %d\n", inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, required, !phys); + + /* TODO : to be done for write support + if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + goto ufs2; + */ + + block = ufs_fragstoblks (fragment); + blockoff = ufs_fragnum (fragment); + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + + goal = 0; + +repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); + + lastfrag = ufsi->i_lastfrag; + if (tmp && fragment < lastfrag) { + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) { + UFSD("EXIT, result %llu\n", + (unsigned long long)tmp + blockoff); + return result; + } + brelse (result); + goto repeat; + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + return NULL; + } + } + + lastblock = ufs_fragstoblks (lastfrag); + lastblockoff = ufs_fragnum (lastfrag); + /* + * We will extend file into new block beyond last allocated block + */ + if (lastblock < block) { + /* + * We must reallocate last allocated block + */ + if (lastblockoff) { + p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); + tmp = ufs_new_fragments(inode, p2, lastfrag, + ufs_data_ptr_to_cpu(sb, p2), + uspi->s_fpb - lastblockoff, + err, locked_page); + if (!tmp) { + if (lastfrag != ufsi->i_lastfrag) + goto repeat; + else + return NULL; + } + lastfrag = ufsi->i_lastfrag; + + } + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, + lastblock)); + if (tmp) + goal = tmp + uspi->s_fpb; + tmp = ufs_new_fragments (inode, p, fragment - blockoff, + goal, required + blockoff, + err, + phys != NULL ? locked_page : NULL); + } else if (lastblock == block) { + /* + * We will extend last allocated block + */ + tmp = ufs_new_fragments(inode, p, fragment - + (blockoff - lastblockoff), + ufs_data_ptr_to_cpu(sb, p), + required + (blockoff - lastblockoff), + err, phys != NULL ? locked_page : NULL); + } else /* (lastblock > block) */ { + /* + * We will allocate new block before last allocated block + */ + if (block) { + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); + if (tmp) + goal = tmp + uspi->s_fpb; + } + tmp = ufs_new_fragments(inode, p, fragment - blockoff, + goal, uspi->s_fpb, err, + phys != NULL ? locked_page : NULL); + } + if (!tmp) { + if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || + (blockoff && lastfrag != ufsi->i_lastfrag)) + goto repeat; + *err = -ENOSPC; + return NULL; + } + + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + result = NULL; + *err = 0; + *new = 1; + } + + inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) + ufs_sync_inode (inode); + mark_inode_dirty(inode); + UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); + return result; + + /* This part : To be implemented .... + Required only for writing, not required for READ-ONLY. 
+ufs2: + + u2_block = ufs_fragstoblks(fragment); + u2_blockoff = ufs_fragnum(fragment); + p = ufsi->i_u1.u2_i_data + block; + goal = 0; + +repeat2: + tmp = fs32_to_cpu(sb, *p); + lastfrag = ufsi->i_lastfrag; + + */ +} + +/** + * ufs_inode_getblock() - allocate new block + * @inode - pointer to inode + * @bh - pointer to block which hold "pointer" to new allocated block + * @fragment - number of `fragment' which hold pointer + * to new allocated block + * @new_fragment - number of new allocated fragment + * (block will hold this fragment and also uspi->s_fpb-1) + * @err - see ufs_inode_getfrag() + * @phys - see ufs_inode_getfrag() + * @new - see ufs_inode_getfrag() + * @locked_page - see ufs_inode_getfrag() + */ +static struct buffer_head * +ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, + u64 fragment, sector_t new_fragment, int *err, + long *phys, int *new, struct page *locked_page) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * result; + unsigned blockoff; + u64 tmp, goal, block; + void *p; + + block = ufs_fragstoblks (fragment); + blockoff = ufs_fragnum (fragment); + + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, !phys); + + result = NULL; + if (!bh) + goto out; + if (!buffer_uptodate(bh)) { + ll_rw_block (READ, 1, &bh); + wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + goto out; + } + if (uspi->fs_magic == UFS2_MAGIC) + p = (__fs64 *)bh->b_data + block; + else + p = (__fs32 *)bh->b_data + block; +repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) { + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) + goto out; + brelse (result); + goto repeat; + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + goto out; + } + } + + if (block && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + goal = tmp + uspi->s_fpb; + else + goal = bh->b_blocknr + uspi->s_fpb; + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, + uspi->s_fpb, err, locked_page); + if (!tmp) { + if (ufs_data_ptr_to_cpu(sb, p)) + goto repeat; + goto out; + } + + + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + *new = 1; + } + + mark_buffer_dirty(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + UFSD("result %llu\n", (unsigned long long)tmp + blockoff); +out: + brelse (bh); + UFSD("EXIT\n"); + return result; +} + +/** + * ufs_getfrag_block() - `get_block_t' function, interface between UFS and + * readpage, writepage and so on + */ + +int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +{ + struct super_block * sb = inode->i_sb; + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi = sbi->s_uspi; + struct buffer_head * bh; + int ret, err, new; + unsigned long ptr,phys; + u64 phys64 = 0; + bool needs_lock = (sbi->mutex_owner != current); + + if (!create) { + phys64 = ufs_frag_map(inode, fragment, needs_lock); + UFSD("phys64 = %llu\n", (unsigned long long)phys64); + if (phys64) + map_bh(bh_result, sb, phys64); + return 0; + } + + /* This code entered only while writing ....? 
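+ Editor's note (not part of the original source): only the create != 0 path of ufs_getfrag_block() reaches this point; read-only lookups have already returned above via ufs_frag_map().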
*/ + + err = -EIO; + new = 0; + ret = 0; + bh = NULL; + + if (needs_lock) + lock_ufs(sb); + + UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); + if (fragment > + ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) + << uspi->s_fpbshift)) + goto abort_too_big; + + err = 0; + ptr = fragment; + + /* + * ok, these macros clean the logic up a bit and make + * it much more readable: + */ +#define GET_INODE_DATABLOCK(x) \ + ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ + bh_result->b_page) +#define GET_INODE_PTR(x) \ + ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ + bh_result->b_page) +#define GET_INDIRECT_DATABLOCK(x) \ + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, &phys, &new, bh_result->b_page) +#define GET_INDIRECT_PTR(x) \ + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, NULL, NULL, NULL) + + if (ptr < UFS_NDIR_FRAGMENT) { + bh = GET_INODE_DATABLOCK(ptr); + goto out; + } + ptr -= UFS_NDIR_FRAGMENT; + if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { + bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); + goto get_indirect; + } + ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); + if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { + bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); + goto get_double; + } + ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); + bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); + bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); +get_double: + bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); +get_indirect: + bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); + +#undef GET_INODE_DATABLOCK +#undef GET_INODE_PTR +#undef GET_INDIRECT_DATABLOCK +#undef GET_INDIRECT_PTR + +out: + if (err) + goto abort; + if (new) + set_buffer_new(bh_result); + map_bh(bh_result, sb, phys); +abort: + if (needs_lock) + unlock_ufs(sb); + + return err; + +abort_too_big: + ufs_warning(sb, "ufs_get_block", "block > big"); + goto abort; +} + +static int ufs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,ufs_getfrag_block,wbc); +} + +static int ufs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,ufs_getfrag_block); +} + +int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, ufs_getfrag_block); +} + +static int ufs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + ufs_getfrag_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t ufs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,ufs_getfrag_block); +} + +const struct address_space_operations ufs_aops = { + .readpage = ufs_readpage, + .writepage = ufs_writepage, + .write_begin = ufs_write_begin, + .write_end = generic_write_end, + .bmap = ufs_bmap +}; + +static void ufs_set_inode_ops(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = 
&ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ufs_fast_symlink_inode_operations; + else { + inode->i_op = &ufs_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + } + } else + init_special_inode(inode, inode->i_mode, + ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); +} + +static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + + /* + * Copy data to the in-core inode. + */ + inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); + inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); + if (inode->i_nlink == 0) { + ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); + return -1; + } + + /* + * Linux now has 32-bit uid and gid, so we can support EFT. + */ + inode->i_uid = ufs_get_inode_uid(sb, ufs_inode); + inode->i_gid = ufs_get_inode_gid(sb, ufs_inode); + + inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); + inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); + ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); + ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); + ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); + + + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { + memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, + sizeof(ufs_inode->ui_u2.ui_addr)); + } else { + memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink) - 1); + ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0; + } + return 0; +} + +static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + + UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); + /* + * Copy data to the in-core inode. + */ + inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); + inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); + if (inode->i_nlink == 0) { + ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); + return -1; + } + + /* + * Linux now has 32-bit uid and gid, so we can support EFT. 
+ */ + inode->i_uid = fs32_to_cpu(sb, ufs2_inode->ui_uid); + inode->i_gid = fs32_to_cpu(sb, ufs2_inode->ui_gid); + + inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); + inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); + inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime); + inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); + inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); + inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec); + inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); + inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); + ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); + /* + ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); + ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); + */ + + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { + memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr, + sizeof(ufs2_inode->ui_u2.ui_addr)); + } else { + memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink, + sizeof(ufs2_inode->ui_u2.ui_symlink) - 1); + ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0; + } + return 0; +} + +struct inode *ufs_iget(struct super_block *sb, unsigned long ino) +{ + struct ufs_inode_info *ufsi; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + struct inode *inode; + int err; + + UFSD("ENTER, ino %lu\n", ino); + + if (ino < UFS_ROOTINO || ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n", + ino); + return ERR_PTR(-EIO); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ufsi = UFS_I(inode); + + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + inode->i_ino); + goto bad_inode; + } + if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + err = ufs2_read_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; + + err = ufs1_read_inode(inode, + ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + if (err) + goto bad_inode; + inode->i_version++; + ufsi->i_lastfrag = + (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + ufsi->i_dir_start_lookup = 0; + ufsi->i_osync = 0; + + ufs_set_inode_ops(inode); + + brelse(bh); + + UFSD("EXIT\n"); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_set_inode_uid(sb, ufs_inode, inode->i_uid); + ufs_set_inode_gid(sb, ufs_inode, inode->i_gid); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atime.tv_usec = 0; + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_usec = 0; + ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtime.tv_usec = 0; + ufs_inode->ui_blocks = 
cpu_to_fs32(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if ((UFS_SB(sb)->s_flags & UFS_UID_MASK) == UFS_UID_EFT) { + ufs_inode->ui_u3.ui_sun.ui_shadow = cpu_to_fs32(sb, ufsi->i_shadow); + ufs_inode->ui_u3.ui_sun.ui_oeftflag = cpu_to_fs32(sb, ufsi->i_oeftflag); + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0]; + } else if (inode->i_blocks) { + memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data, + sizeof(ufs_inode->ui_u2.ui_addr)); + } + else { + memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink)); + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs_inode)); +} + +static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + + UFSD("ENTER\n"); + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_inode->ui_uid = cpu_to_fs32(sb, inode->i_uid); + ufs_inode->ui_gid = cpu_to_fs32(sb, inode->i_gid); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec); + ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); + + ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0]; + } else if (inode->i_blocks) { + memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data, + sizeof(ufs_inode->ui_u2.ui_addr)); + } else { + memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink)); + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs2_inode)); + UFSD("EXIT\n"); +} + +static int ufs_update_inode(struct inode * inode, int do_sync) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); + return -1; + } + + bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); + return -1; + } + if (uspi->fs_magic == UFS2_MAGIC) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_update_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *) bh->b_data; + + ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + mark_buffer_dirty(bh); + if (do_sync) + sync_dirty_buffer(bh); + brelse (bh); + + UFSD("EXIT\n"); + return 0; +} + +int ufs_write_inode(struct inode *inode, struct 
writeback_control *wbc) +{ + int ret; + lock_ufs(inode->i_sb); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + unlock_ufs(inode->i_sb); + return ret; +} + +int ufs_sync_inode (struct inode *inode) +{ + return ufs_update_inode (inode, 1); +} + +void ufs_evict_inode(struct inode * inode) +{ + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) + want_delete = 1; + + truncate_inode_pages(&inode->i_data, 0); + if (want_delete) { + loff_t old_i_size; + /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ + lock_ufs(inode->i_sb); + mark_inode_dirty(inode); + ufs_update_inode(inode, IS_SYNC(inode)); + old_i_size = inode->i_size; + inode->i_size = 0; + if (inode->i_blocks && ufs_truncate(inode, old_i_size)) + ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); + unlock_ufs(inode->i_sb); + } + + invalidate_inode_buffers(inode); + end_writeback(inode); + + if (want_delete) { + lock_ufs(inode->i_sb); + ufs_free_inode (inode); + unlock_ufs(inode->i_sb); + } +} diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c new file mode 100644 index 00000000..b57aab9a --- /dev/null +++ b/fs/ufs/namei.c @@ -0,0 +1,360 @@ +/* + * linux/fs/ufs/namei.c + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov based on ext2 code base. + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "util.h" + +static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = ufs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > UFS_MAXNAMLEN) + return ERR_PTR(-ENAMETOOLONG); + + lock_ufs(dir->i_sb); + ino = ufs_inode_by_name(dir, &dentry->d_name); + if (ino) + inode = ufs_iget(dir->i_sb, ino); + unlock_ufs(dir->i_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
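+ * + * Editor's note (not part of the original source): on failure, ufs_add_nondir() above has already dropped the new inode's link count and released it with iput(), so ufs_create(), ufs_mknod() and ufs_symlink() only need to propagate the error code.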
+ */ +static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + int err; + + UFSD("BEGIN\n"); + + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + + if (!IS_ERR(inode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + lock_ufs(dir->i_sb); + err = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + } + UFSD("END: err=%d\n", err); + return err; +} + +static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + int err; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); + ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); + mark_inode_dirty(inode); + lock_ufs(dir->i_sb); + err = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + } + return err; +} + +static int ufs_symlink (struct inode * dir, struct dentry * dentry, + const char * symname) +{ + struct super_block * sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode * inode; + + if (l > sb->s_blocksize) + goto out_notlocked; + + lock_ufs(dir->i_sb); + inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { + /* slow symlink */ + inode->i_op = &ufs_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &ufs_fast_symlink_inode_operations; + memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = ufs_add_nondir(dentry, inode); +out: + unlock_ufs(dir->i_sb); +out_notlocked: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int ufs_link (struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + lock_ufs(dir->i_sb); + if (inode->i_nlink >= UFS_LINK_MAX) { + unlock_ufs(dir->i_sb); + return -EMLINK; + } + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + error = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + return error; +} + +static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= UFS_LINK_MAX) + goto out; + + lock_ufs(dir->i_sb); + inode_inc_link_count(dir); + + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = &ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + + inode_inc_link_count(inode); + + err = ufs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = ufs_add_link(dentry, inode); + if (err) + goto out_fail; + unlock_ufs(dir->i_sb); + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput (inode); +out_dir: + inode_dec_link_count(dir); + unlock_ufs(dir->i_sb); + goto out; +} + +static int ufs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + struct ufs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = 
ufs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto out; + + err = ufs_delete_entry(dir, de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int ufs_rmdir (struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + int err= -ENOTEMPTY; + + lock_ufs(dir->i_sb); + if (ufs_empty_dir (inode)) { + err = ufs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + unlock_ufs(dir->i_sb); + return err; +} + +static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct ufs_dir_entry * dir_de = NULL; + struct page *old_page; + struct ufs_dir_entry *old_de; + int err = -ENOENT; + + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = ufs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct ufs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !ufs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + ufs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= UFS_LINK_MAX) + goto out_dir; + } + err = ufs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + + ufs_delete_entry(old_dir, old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + ufs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations ufs_dir_inode_operations = { + .create = ufs_create, + .lookup = ufs_lookup, + .link = ufs_link, + .unlink = ufs_unlink, + .symlink = ufs_symlink, + .mkdir = ufs_mkdir, + .rmdir = ufs_rmdir, + .mknod = ufs_mknod, + .rename = ufs_rename, +}; diff --git a/fs/ufs/super.c b/fs/ufs/super.c new file mode 100644 index 00000000..3915ade6 --- /dev/null +++ b/fs/ufs/super.c @@ -0,0 +1,1511 @@ +/* + * linux/fs/ufs/super.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +/* Derived from + * + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Inspired by + * + * linux/fs/ufs/super.c + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) + * + * Kernel module support added on 96/04/26 by + * Stefan Reinauer + * + * Module usage counts added on 96/04/29 by + * Gertjan van Wingerde + * + * Clean swab support on 19970406 by + * Francois-Rene Rideau + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * NeXTstep support added on February 5th 1998 by + * Niels Kristian Bech Jensen . + * + * write support Daniel Pirkl 1998 + * + * HP/UX hfs filesystem support added by + * Martin K. Petersen , August 1999 + * + * UFS2 (of FreeBSD 5.x) support added by + * Niraj Kumar , Jan 2004 + * + * UFS2 write support added by + * Evgeniy Dushistov , 2007 + */ + + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +void lock_ufs(struct super_block *sb) +{ +#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) + struct ufs_sb_info *sbi = UFS_SB(sb); + + mutex_lock(&sbi->mutex); + sbi->mutex_owner = current; +#endif +} + +void unlock_ufs(struct super_block *sb) +{ +#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) + struct ufs_sb_info *sbi = UFS_SB(sb); + + sbi->mutex_owner = NULL; + mutex_unlock(&sbi->mutex); +#endif +} + +static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct inode *inode; + + if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg) + return ERR_PTR(-ESTALE); + + inode = ufs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_get_parent(struct dentry *child) +{ + struct qstr dot_dot = { + .name = "..", + .len = 2, + }; + ino_t ino; + + ino = ufs_inode_by_name(child->d_inode, &dot_dot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino)); +} + +static const struct export_operations ufs_export_ops = { + .fh_to_dentry = ufs_fh_to_dentry, + .fh_to_parent = ufs_fh_to_parent, + .get_parent = ufs_get_parent, +}; + +#ifdef CONFIG_UFS_DEBUG +/* + * Print contents of ufs_super_block, useful for debugging + */ +static void ufs_print_super_stuff(struct super_block *sb, + struct ufs_super_block_first *usb1, + struct ufs_super_block_second *usb2, + struct ufs_super_block_third *usb3) +{ + u32 magic = fs32_to_cpu(sb, usb3->fs_magic); + + printk("ufs_print_super_stuff\n"); + printk(" magic: 0x%x\n", magic); + if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { + printk(" fs_size: %llu\n", 
(unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + printk(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + printk(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + printk(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + printk(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + printk(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); + printk(KERN_INFO" fs_maxsymlinklen: %u\n", + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); + } else { + printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + printk(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + printk(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + printk(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + printk(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + printk(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + printk(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + printk(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + printk(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + } + printk("\n"); +} + +/* + * Print contents of ufs_cylinder_group, useful for debugging + */ +static void ufs_print_cylinder_stuff(struct super_block *sb, + struct ufs_cylinder_group *cg) +{ + printk("\nufs_print_cylinder_stuff\n"); + printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); + printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); + printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); + printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); + printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); + printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); + printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); + printk(" cs_ndir: %u\n", fs32_to_cpu(sb, 
cg->cg_cs.cs_ndir)); + printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); + printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); + printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); + printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); + printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); + printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); + printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", + fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), + fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), + fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), + fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); + printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); + printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); + printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); + printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); + printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); + printk(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + printk(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + printk(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + printk("\n"); +} +#else +# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ +# define ufs_print_cylinder_stuff(sb, cg) /**/ +#endif /* CONFIG_UFS_DEBUG */ + +static const struct super_operations ufs_super_ops; + +static char error_buf[1024]; + +void ufs_error (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + va_list args; + + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_clean = UFS_FSBAD; + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + sb->s_dirt = 1; + sb->s_flags |= MS_RDONLY; + } + va_start (args, fmt); + vsnprintf (error_buf, sizeof(error_buf), fmt, args); + va_end (args); + switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { + case UFS_MOUNT_ONERROR_PANIC: + panic ("UFS-fs panic (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + case UFS_MOUNT_ONERROR_LOCK: + case UFS_MOUNT_ONERROR_UMOUNT: + case UFS_MOUNT_ONERROR_REPAIR: + printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + } +} + +void ufs_panic (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + va_list args; + + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_clean = UFS_FSBAD; + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + sb->s_dirt = 1; + } + va_start (args, fmt); + vsnprintf (error_buf, sizeof(error_buf), fmt, args); + va_end (args); + sb->s_flags |= MS_RDONLY; + printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +void ufs_warning (struct super_block * sb, const char * function, + const char * fmt, ...) 
+{ + va_list args; + + va_start (args, fmt); + vsnprintf (error_buf, sizeof(error_buf), fmt, args); + va_end (args); + printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +enum { + Opt_type_old = UFS_MOUNT_UFSTYPE_OLD, + Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86, + Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN, + Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS, + Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD, + Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2, + Opt_type_hp = UFS_MOUNT_UFSTYPE_HP, + Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD, + Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP, + Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP, + Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC, + Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK, + Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT, + Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_type_old, "ufstype=old"}, + {Opt_type_sunx86, "ufstype=sunx86"}, + {Opt_type_sun, "ufstype=sun"}, + {Opt_type_sunos, "ufstype=sunos"}, + {Opt_type_44bsd, "ufstype=44bsd"}, + {Opt_type_ufs2, "ufstype=ufs2"}, + {Opt_type_ufs2, "ufstype=5xbsd"}, + {Opt_type_hp, "ufstype=hp"}, + {Opt_type_nextstepcd, "ufstype=nextstep-cd"}, + {Opt_type_nextstep, "ufstype=nextstep"}, + {Opt_type_openstep, "ufstype=openstep"}, +/*end of possible ufs types */ + {Opt_onerror_panic, "onerror=panic"}, + {Opt_onerror_lock, "onerror=lock"}, + {Opt_onerror_umount, "onerror=umount"}, + {Opt_onerror_repair, "onerror=repair"}, + {Opt_err, NULL} +}; + +static int ufs_parse_options (char * options, unsigned * mount_options) +{ + char * p; + + UFSD("ENTER\n"); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_type_old: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_OLD); + break; + case Opt_type_sunx86: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_SUNx86); + break; + case Opt_type_sun: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_SUN); + break; + case Opt_type_sunos: + ufs_clear_opt(*mount_options, UFSTYPE); + ufs_set_opt(*mount_options, UFSTYPE_SUNOS); + break; + case Opt_type_44bsd: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_44BSD); + break; + case Opt_type_ufs2: + ufs_clear_opt(*mount_options, UFSTYPE); + ufs_set_opt(*mount_options, UFSTYPE_UFS2); + break; + case Opt_type_hp: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_HP); + break; + case Opt_type_nextstepcd: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD); + break; + case Opt_type_nextstep: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP); + break; + case Opt_type_openstep: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP); + break; + case Opt_onerror_panic: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_PANIC); + break; + case Opt_onerror_lock: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_LOCK); + break; + case Opt_onerror_umount: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_UMOUNT); + break; + case Opt_onerror_repair: + printk("UFS-fs: Unable to do repair on error, " + 
"will lock lock instead\n"); + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_REPAIR); + break; + default: + printk("UFS-fs: Invalid option: \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* + * Different types of UFS hold fs_cstotal in different + * places, and use different data structure for it. + * To make things simpler we just copy fs_cstotal to ufs_sb_private_info + */ +static void ufs_setup_cstotal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + + UFSD("ENTER, mtype=%u\n", mtype); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); + uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree); + uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree); + uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree); + } else { + uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir); + uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree); + uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); + uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); + } + UFSD("EXIT\n"); +} + +/* + * Read on-disk structures associated with cylinder groups + */ +static int ufs_read_cylinder_structures(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_buffer_head * ubh; + unsigned char * base, * space; + unsigned size, blks, i; + struct ufs_super_block_third *usb3; + + UFSD("ENTER\n"); + + usb3 = ubh_get_usb_third(uspi); + /* + * Read cs structures from (usually) first data block + * on the device. + */ + size = uspi->s_cssize; + blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + base = space = kmalloc(size, GFP_NOFS); + if (!base) + goto failed; + sbi->s_csp = (struct ufs_csum *)space; + for (i = 0; i < blks; i += uspi->s_fpb) { + size = uspi->s_bsize; + if (i + uspi->s_fpb > blks) + size = (blks - i) * uspi->s_fsize; + + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + + if (!ubh) + goto failed; + + ubh_ubhcpymem (space, ubh, size); + + space += size; + ubh_brelse (ubh); + ubh = NULL; + } + + /* + * Read cylinder group (we read only first fragment from block + * at this time) and prepare internal data structures for cg caching. 
+ */ + if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) + goto failed; + for (i = 0; i < uspi->s_ncg; i++) + sbi->s_ucg[i] = NULL; + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { + sbi->s_ucpi[i] = NULL; + sbi->s_cgno[i] = UFS_CGNO_EMPTY; + } + for (i = 0; i < uspi->s_ncg; i++) { + UFSD("read cg %u\n", i); + if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) + goto failed; + if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) + goto failed; + + ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); + } + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { + if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) + goto failed; + sbi->s_cgno[i] = UFS_CGNO_EMPTY; + } + sbi->s_cg_loaded = 0; + UFSD("EXIT\n"); + return 1; + +failed: + kfree (base); + if (sbi->s_ucg) { + for (i = 0; i < uspi->s_ncg; i++) + if (sbi->s_ucg[i]) + brelse (sbi->s_ucg[i]); + kfree (sbi->s_ucg); + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) + kfree (sbi->s_ucpi[i]); + } + UFSD("EXIT (FAILED)\n"); + return 0; +} + +/* + * Sync our internal copy of fs_cstotal with disk + */ +static void ufs_put_cstotal(struct super_block *sb) +{ + unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + + UFSD("ENTER\n"); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); + } else { + usb1->fs_cstotal.cs_ndir = + cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = + cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = + cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = + cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + } + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + ufs_print_super_stuff(sb, usb1, usb2, usb3); + UFSD("EXIT\n"); +} + +/** + * ufs_put_super_internal() - put on-disk intrenal structures + * @sb: pointer to super_block structure + * Put on-disk structures associated with cylinder groups + * and write them back to disk, also update cs_total on disk + */ +static void ufs_put_super_internal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_buffer_head * ubh; + unsigned char * base, * space; + unsigned blks, size, i; + + + UFSD("ENTER\n"); + + ufs_put_cstotal(sb); + size = uspi->s_cssize; + blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + base = space = (char*) sbi->s_csp; + for (i = 0; i < blks; i += uspi->s_fpb) { + size = uspi->s_bsize; + if (i + uspi->s_fpb > blks) + size = (blks - i) * uspi->s_fsize; + + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + + ubh_memcpyubh (ubh, space, size); + space += size; + ubh_mark_buffer_uptodate (ubh, 1); + ubh_mark_buffer_dirty (ubh); + ubh_brelse (ubh); + } + for (i = 0; i < sbi->s_cg_loaded; i++) { + ufs_put_cylinder (sb, i); + kfree 
(sbi->s_ucpi[i]); + } + for (; i < UFS_MAX_GROUP_LOADED; i++) + kfree (sbi->s_ucpi[i]); + for (i = 0; i < uspi->s_ncg; i++) + brelse (sbi->s_ucg[i]); + kfree (sbi->s_ucg); + kfree (base); + + UFSD("EXIT\n"); +} + +static int ufs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ufs_sb_info * sbi; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_second * usb2; + struct ufs_super_block_third * usb3; + struct ufs_buffer_head * ubh; + struct inode *inode; + unsigned block_size, super_block_size; + unsigned flags; + unsigned super_block_offset; + unsigned maxsymlen; + int ret = -EINVAL; + + uspi = NULL; + ubh = NULL; + flags = 0; + + UFSD("ENTER\n"); + + sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); + if (!sbi) + goto failed_nomem; + sb->s_fs_info = sbi; + + UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); + +#ifndef CONFIG_UFS_FS_WRITE + if (!(sb->s_flags & MS_RDONLY)) { + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + goto failed; + } +#endif + mutex_init(&sbi->mutex); + /* + * Set default mount options + * Parse mount options + */ + sbi->s_mount_opt = 0; + ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); + if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { + printk("wrong mount options\n"); + goto failed; + } + if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { + if (!silent) + printk("You didn't specify the type of your ufs filesystem\n\n" + "mount -t ufs -o ufstype=" + "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" + ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " + "default is ufstype=old\n"); + ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD); + } + + uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); + sbi->s_uspi = uspi; + if (!uspi) + goto failed; + uspi->s_dirblksize = UFS_SECTOR_SIZE; + super_block_offset=UFS_SBLOCK; + + /* Keep 2Gig file limit. 
Some UFS variants need to override + this but as I don't know which I'll let those in the know loosen + the rules */ + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { + case UFS_MOUNT_UFSTYPE_44BSD: + UFSD("ufstype=44bsd\n"); + uspi->s_fsize = block_size = 512; + uspi->s_fmask = ~(512 - 1); + uspi->s_fshift = 9; + uspi->s_sbsize = super_block_size = 1536; + uspi->s_sbbase = 0; + flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + break; + case UFS_MOUNT_UFSTYPE_UFS2: + UFSD("ufstype=ufs2\n"); + super_block_offset=SBLOCK_UFS2; + uspi->s_fsize = block_size = 512; + uspi->s_fmask = ~(512 - 1); + uspi->s_fshift = 9; + uspi->s_sbsize = super_block_size = 1536; + uspi->s_sbbase = 0; + flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + break; + + case UFS_MOUNT_UFSTYPE_SUN: + UFSD("ufstype=sun\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUN | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_SUNOS: + UFSD(("ufstype=sunos\n")) + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = 2048; + super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_SUNOS | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_SUNx86: + UFSD("ufstype=sunx86\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUNx86 | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_OLD: + UFSD("ufstype=old\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=old is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_NEXTSTEP: + UFSD("ufstype=nextstep\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: + UFSD("ufstype=nextstep-cd\n"); + uspi->s_fsize = block_size = 2048; + uspi->s_fmask = ~(2048 - 1); + uspi->s_fshift = 11; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_OPENSTEP: + UFSD("ufstype=openstep\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | 
UFS_CG_44BSD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=openstep is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_HP: + UFSD("ufstype=hp\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=hp is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + default: + if (!silent) + printk("unknown ufstype\n"); + goto failed; + } + +again: + if (!sb_set_blocksize(sb, block_size)) { + printk(KERN_ERR "UFS: failed to set blocksize\n"); + goto failed; + } + + /* + * read ufs super block from device + */ + + ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size); + + if (!ubh) + goto failed; + + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + /* Sort out mod used on SunOS 4.1.3 for fs_state */ + uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); + if (((flags & UFS_ST_MASK) == UFS_ST_SUNOS) && + (uspi->s_postblformat != UFS_42POSTBLFMT)) { + flags &= ~UFS_ST_MASK; + flags |= UFS_ST_SUN; + } + + /* + * Check ufs magic number + */ + sbi->s_bytesex = BYTESEX_LE; + switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { + case UFS_MAGIC: + case UFS_MAGIC_BW: + case UFS2_MAGIC: + case UFS_MAGIC_LFN: + case UFS_MAGIC_FEA: + case UFS_MAGIC_4GB: + goto magic_found; + } + sbi->s_bytesex = BYTESEX_BE; + switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { + case UFS_MAGIC: + case UFS_MAGIC_BW: + case UFS2_MAGIC: + case UFS_MAGIC_LFN: + case UFS_MAGIC_FEA: + case UFS_MAGIC_4GB: + goto magic_found; + } + + if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) + || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) + || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) + && uspi->s_sbbase < 256) { + ubh_brelse_uspi(uspi); + ubh = NULL; + uspi->s_sbbase += 8; + goto again; + } + if (!silent) + printk("ufs_read_super: bad magic number\n"); + goto failed; + +magic_found: + /* + * Check block and fragment sizes + */ + uspi->s_bsize = fs32_to_cpu(sb, usb1->fs_bsize); + uspi->s_fsize = fs32_to_cpu(sb, usb1->fs_fsize); + uspi->s_sbsize = fs32_to_cpu(sb, usb1->fs_sbsize); + uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); + uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); + + if (!is_power_of_2(uspi->s_fsize)) { + printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", + uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize < 512) { + printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", + uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize > 4096) { + printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", + uspi->s_fsize); + goto failed; + } + if (!is_power_of_2(uspi->s_bsize)) { + printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", + uspi->s_bsize); + goto failed; + } + if (uspi->s_bsize < 4096) { + printk(KERN_ERR "ufs_read_super: block size %u is too small\n", + uspi->s_bsize); + goto failed; + } + if (uspi->s_bsize / uspi->s_fsize > 8) { + printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", + uspi->s_bsize / uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize != block_size || uspi->s_sbsize != 
super_block_size) { + ubh_brelse_uspi(uspi); + ubh = NULL; + block_size = uspi->s_fsize; + super_block_size = uspi->s_sbsize; + UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size); + goto again; + } + + sbi->s_flags = flags;/*after that line some functions use s_flags*/ + ufs_print_super_stuff(sb, usb1, usb2, usb3); + + /* + * Check, if file system was correctly unmounted. + * If not, make it read only. + */ + if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) || + ((flags & UFS_ST_MASK) == UFS_ST_OLD) || + (((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) && + (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { + switch(usb1->fs_clean) { + case UFS_FSCLEAN: + UFSD("fs is clean\n"); + break; + case UFS_FSSTABLE: + UFSD("fs is stable\n"); + break; + case UFS_FSLOG: + UFSD("fs is logging fs\n"); + break; + case UFS_FSOSF1: + UFSD("fs is DEC OSF/1\n"); + break; + case UFS_FSACTIVE: + printk("ufs_read_super: fs is active\n"); + sb->s_flags |= MS_RDONLY; + break; + case UFS_FSBAD: + printk("ufs_read_super: fs is bad\n"); + sb->s_flags |= MS_RDONLY; + break; + default: + printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); + sb->s_flags |= MS_RDONLY; + break; + } + } else { + printk("ufs_read_super: fs needs fsck\n"); + sb->s_flags |= MS_RDONLY; + } + + /* + * Read ufs_super_block into internal data structures + */ + sb->s_op = &ufs_super_ops; + sb->s_export_op = &ufs_export_ops; + + sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); + + uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); + uspi->s_cblkno = fs32_to_cpu(sb, usb1->fs_cblkno); + uspi->s_iblkno = fs32_to_cpu(sb, usb1->fs_iblkno); + uspi->s_dblkno = fs32_to_cpu(sb, usb1->fs_dblkno); + uspi->s_cgoffset = fs32_to_cpu(sb, usb1->fs_cgoffset); + uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); + + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { + uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); + uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); + } + + uspi->s_ncg = fs32_to_cpu(sb, usb1->fs_ncg); + /* s_bsize already set */ + /* s_fsize already set */ + uspi->s_fpb = fs32_to_cpu(sb, usb1->fs_frag); + uspi->s_minfree = fs32_to_cpu(sb, usb1->fs_minfree); + uspi->s_bmask = fs32_to_cpu(sb, usb1->fs_bmask); + uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); + uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); + uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); + UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, + uspi->s_fshift); + uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); + uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); + /* s_sbsize already set */ + uspi->s_csmask = fs32_to_cpu(sb, usb1->fs_csmask); + uspi->s_csshift = fs32_to_cpu(sb, usb1->fs_csshift); + uspi->s_nindir = fs32_to_cpu(sb, usb1->fs_nindir); + uspi->s_inopb = fs32_to_cpu(sb, usb1->fs_inopb); + uspi->s_nspf = fs32_to_cpu(sb, usb1->fs_nspf); + uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3); + uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave); + uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew); + + if (uspi->fs_magic == UFS2_MAGIC) + uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr); + else + uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); + + uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize); + uspi->s_cgsize = 
fs32_to_cpu(sb, usb1->fs_cgsize); + uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak); + uspi->s_nsect = fs32_to_cpu(sb, usb1->fs_nsect); + uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); + uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); + uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); + uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc); + uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize); + uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); + uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); + uspi->s_nrpos = fs32_to_cpu(sb, usb3->fs_nrpos); + uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); + uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); + + /* + * Compute another frequently used values + */ + uspi->s_fpbmask = uspi->s_fpb - 1; + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + uspi->s_apbshift = uspi->s_bshift - 3; + else + uspi->s_apbshift = uspi->s_bshift - 2; + + uspi->s_2apbshift = uspi->s_apbshift * 2; + uspi->s_3apbshift = uspi->s_apbshift * 3; + uspi->s_apb = 1 << uspi->s_apbshift; + uspi->s_2apb = 1 << uspi->s_2apbshift; + uspi->s_3apb = 1 << uspi->s_3apbshift; + uspi->s_apbmask = uspi->s_apb - 1; + uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; + uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; + uspi->s_inopf = uspi->s_inopb >> uspi->s_fpbshift; + uspi->s_bpf = uspi->s_fsize << 3; + uspi->s_bpfshift = uspi->s_fshift + 3; + uspi->s_bpfmask = uspi->s_bpf - 1; + if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD || + (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2) + uspi->s_maxsymlinklen = + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); + + if (uspi->fs_magic == UFS2_MAGIC) + maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR); + else + maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR); + if (uspi->s_maxsymlinklen > maxsymlen) { + ufs_warning(sb, __func__, "ufs_read_super: excessive maximum " + "fast symlink size (%u)\n", uspi->s_maxsymlinklen); + uspi->s_maxsymlinklen = maxsymlen; + } + + inode = ufs_iget(sb, UFS_ROOTINO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto failed; + } + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + ret = -ENOMEM; + goto dalloc_failed; + } + + ufs_setup_cstotal(sb); + /* + * Read cylinder group structures + */ + if (!(sb->s_flags & MS_RDONLY)) + if (!ufs_read_cylinder_structures(sb)) + goto failed; + + UFSD("EXIT\n"); + return 0; + +dalloc_failed: + iput(inode); +failed: + if (ubh) + ubh_brelse_uspi (uspi); + kfree (uspi); + kfree(sbi); + sb->s_fs_info = NULL; + UFSD("EXIT (FAILED)\n"); + return ret; + +failed_nomem: + UFSD("EXIT (NOMEM)\n"); + return -ENOMEM; +} + +static int ufs_sync_fs(struct super_block *sb, int wait) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned flags; + + lock_ufs(sb); + lock_super(sb); + + UFSD("ENTER\n"); + + flags = UFS_SB(sb)->s_flags; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); + sb->s_dirt = 0; + + UFSD("EXIT\n"); + unlock_super(sb); + unlock_ufs(sb); + + return 0; +} + +static void ufs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ufs_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static void 
ufs_put_super(struct super_block *sb) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + + UFSD("ENTER\n"); + + if (sb->s_dirt) + ufs_write_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) + ufs_put_super_internal(sb); + + ubh_brelse_uspi (sbi->s_uspi); + kfree (sbi->s_uspi); + kfree (sbi); + sb->s_fs_info = NULL; + UFSD("EXIT\n"); + return; +} + + +static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned new_mount_opt, ufstype; + unsigned flags; + + lock_ufs(sb); + lock_super(sb); + uspi = UFS_SB(sb)->s_uspi; + flags = UFS_SB(sb)->s_flags; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + /* + * Allow the "check" option to be passed as a remount option. + * It is not possible to change ufstype option during remount + */ + ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + new_mount_opt = 0; + ufs_set_opt (new_mount_opt, ONERROR_LOCK); + if (!ufs_parse_options (data, &new_mount_opt)) { + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; + } + if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { + new_mount_opt |= ufstype; + } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { + printk("ufstype can't be changed during remount\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; + } + + if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + UFS_SB(sb)->s_mount_opt = new_mount_opt; + unlock_super(sb); + unlock_ufs(sb); + return 0; + } + + /* + * fs was mouted as rw, remounting ro + */ + if (*mount_flags & MS_RDONLY) { + ufs_put_super_internal(sb); + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN + || (flags & UFS_ST_MASK) == UFS_ST_SUNOS + || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + sb->s_dirt = 0; + sb->s_flags |= MS_RDONLY; + } else { + /* + * fs was mounted as ro, remounting rw + */ +#ifndef CONFIG_UFS_FS_WRITE + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; +#else + if (ufstype != UFS_MOUNT_UFSTYPE_SUN && + ufstype != UFS_MOUNT_UFSTYPE_SUNOS && + ufstype != UFS_MOUNT_UFSTYPE_44BSD && + ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && + ufstype != UFS_MOUNT_UFSTYPE_UFS2) { + printk("this ufstype is read-only supported\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; + } + if (!ufs_read_cylinder_structures(sb)) { + printk("failed during remounting\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EPERM; + } + sb->s_flags &= ~MS_RDONLY; +#endif + } + UFS_SB(sb)->s_mount_opt = new_mount_opt; + unlock_super(sb); + unlock_ufs(sb); + return 0; +} + +static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); + unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + const struct match_token *tp = tokens; + + while (tp->token != Opt_onerror_panic && tp->token != mval) + ++tp; + BUG_ON(tp->token == Opt_onerror_panic); + seq_printf(seq, ",%s", tp->pattern); + + mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR; + while (tp->token != Opt_err && tp->token != mval) + ++tp; + BUG_ON(tp->token == Opt_err); + seq_printf(seq, ",%s", tp->pattern); + + return 0; +} + +static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct 
ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; + unsigned flags = UFS_SB(sb)->s_flags; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + lock_ufs(sb); + + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + buf->f_type = UFS2_MAGIC; + buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { + buf->f_type = UFS_MAGIC; + buf->f_blocks = uspi->s_dsize; + } + buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree; + buf->f_ffree = uspi->cs_total.cs_nifree; + buf->f_bsize = sb->s_blocksize; + buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) + ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; + buf->f_files = uspi->s_ncg * uspi->s_ipg; + buf->f_namelen = UFS_MAXNAMLEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + unlock_ufs(sb); + + return 0; +} + +static struct kmem_cache * ufs_inode_cachep; + +static struct inode *ufs_alloc_inode(struct super_block *sb) +{ + struct ufs_inode_info *ei; + ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ufs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); +} + +static void ufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ufs_i_callback); +} + +static void init_once(void *foo) +{ + struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", + sizeof(struct ufs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ufs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ufs_inode_cachep); +} + +static const struct super_operations ufs_super_ops = { + .alloc_inode = ufs_alloc_inode, + .destroy_inode = ufs_destroy_inode, + .write_inode = ufs_write_inode, + .evict_inode = ufs_evict_inode, + .put_super = ufs_put_super, + .write_super = ufs_write_super, + .sync_fs = ufs_sync_fs, + .statfs = ufs_statfs, + .remount_fs = ufs_remount, + .show_options = ufs_show_options, +}; + +static struct dentry *ufs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); +} + +static struct file_system_type ufs_fs_type = { + .owner = THIS_MODULE, + .name = "ufs", + .mount = ufs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ufs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ufs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_ufs_fs(void) +{ + unregister_filesystem(&ufs_fs_type); + destroy_inodecache(); +} + +module_init(init_ufs_fs) +module_exit(exit_ufs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/ufs/swab.h b/fs/ufs/swab.h new file mode 100644 index 00000000..8d974c4f --- /dev/null +++ b/fs/ufs/swab.h @@ -0,0 +1,115 @@ +/* + * linux/fs/ufs/swab.h + * + * 
Copyright (C) 1997, 1998 Francois-Rene Rideau + * Copyright (C) 1998 Jakub Jelinek + * Copyright (C) 2001 Christoph Hellwig + */ + +#ifndef _UFS_SWAB_H +#define _UFS_SWAB_H + +/* + * Notes: + * HERE WE ASSUME EITHER BIG OR LITTLE ENDIAN UFSes + * in case there are ufs implementations that have strange bytesexes, + * you'll need to modify code here as well as in ufs_super.c and ufs_fs.h + * to support them. + */ + +enum { + BYTESEX_LE, + BYTESEX_BE +}; + +static inline u64 +fs64_to_cpu(struct super_block *sbp, __fs64 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline __fs64 +cpu_to_fs64(struct super_block *sbp, u64 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs64)cpu_to_le64(n); + else + return (__force __fs64)cpu_to_be64(n); +} + +static inline u32 +fs32_to_cpu(struct super_block *sbp, __fs32 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 +cpu_to_fs32(struct super_block *sbp, u32 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline void +fs32_add(struct super_block *sbp, __fs32 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, d); + else + be32_add_cpu((__be32 *)n, d); +} + +static inline void +fs32_sub(struct super_block *sbp, __fs32 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, -d); + else + be32_add_cpu((__be32 *)n, -d); +} + +static inline u16 +fs16_to_cpu(struct super_block *sbp, __fs16 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 +cpu_to_fs16(struct super_block *sbp, u16 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +static inline void +fs16_add(struct super_block *sbp, __fs16 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le16_add_cpu((__le16 *)n, d); + else + be16_add_cpu((__be16 *)n, d); +} + +static inline void +fs16_sub(struct super_block *sbp, __fs16 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le16_add_cpu((__le16 *)n, -d); + else + be16_add_cpu((__be16 *)n, -d); +} + +#endif /* _UFS_SWAB_H */ diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c new file mode 100644 index 00000000..d283628b --- /dev/null +++ b/fs/ufs/symlink.c @@ -0,0 +1,53 @@ +/* + * linux/fs/ufs/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/symlink.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 symlink handling code + */ + +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" + + +static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ufs_inode_info *p = UFS_I(dentry->d_inode); + nd_set_link(nd, (char*)p->i_u1.i_symlink); + return NULL; +} + +const struct inode_operations ufs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ufs_follow_link, + .setattr = ufs_setattr, +}; + +const struct inode_operations ufs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c new file mode 100644 index 00000000..f04f89fb --- /dev/null +++ b/fs/ufs/truncate.c @@ -0,0 +1,523 @@ +/* + * linux/fs/ufs/truncate.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/truncate.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/truncate.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Real random numbers for secure rm added 94/02/18 + * Idea from Pierre del Perugia + */ + +/* + * Adoptation to use page cache and UFS2 write support by + * Evgeniy Dushistov , 2006-2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * Secure deletion currently doesn't work. It interacts very badly + * with buffers shared with memory mappings, and for that reason + * can't be done in the truncate() routines. It should instead be + * done separately in "release()" before calling the truncate routines + * that will release the actual file blocks. + * + * Linus + */ + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + + +static int ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + unsigned frag_to_free, free_count; + unsigned i, tmp; + int retry; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? 
((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + mark_inode_dirty(inode); + frag_to_free = tmp + frag1; + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + ufs_data_ptr_clear(uspi, p); + + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + mark_inode_dirty(inode); + } + + if (free_count > 0) + ufs_free_blocks (inode, frag_to_free, free_count); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + ufs_data_ptr_clear(uspi, p); + + ufs_free_fragments (inode, tmp, frag4); + mark_inode_dirty(inode); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); + return retry; +} + + +static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head * ind_ubh; + void *ind; + u64 tmp, indirect_block, i, frag_to_free; + unsigned free_count; + int retry; + + UFSD("ENTER: ino %lu, offset %llu, p: %p\n", + inode->i_ino, (unsigned long long)offset, p); + + BUG_ON(!p); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return 0; + ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (ind_ubh); + return 1; + } + if (!ind_ubh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + indirect_block = (DIRECT_BLOCK > offset) ? 
(DIRECT_BLOCK - offset) : 0; + for (i = indirect_block; i < uspi->s_apb; i++) { + ind = ubh_get_data_ptr(uspi, ind_ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); + if (!tmp) + continue; + + ufs_data_ptr_clear(uspi, ind); + ubh_mark_buffer_dirty(ind_ubh); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + + mark_inode_dirty(inode); + } + + if (free_count > 0) { + ufs_free_blocks (inode, frag_to_free, free_count); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, ind_ubh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks (inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(ind_ubh); + ind_ubh = NULL; + } + if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) + ubh_sync_block(ind_ubh); + ubh_brelse (ind_ubh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + + return retry; +} + +static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head *dind_bh; + u64 i, tmp, dindirect_block; + void *dind; + int retry = 0; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + dindirect_block = (DIRECT_BLOCK > offset) + ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; + retry = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return 0; + dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (dind_bh); + return 1; + } + if (!dind_bh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + for (i = dindirect_block ; i < uspi->s_apb ; i++) { + dind = ubh_get_data_ptr(uspi, dind_bh, i); + tmp = ufs_data_ptr_to_cpu(sb, dind); + if (!tmp) + continue; + retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ubh_mark_buffer_dirty(dind_bh); + } + + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, dind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(dind_bh); + dind_bh = NULL; + } + if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) + ubh_sync_block(dind_bh); + ubh_brelse (dind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + + return retry; +} + +static int ufs_trunc_tindirect(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_inode_info *ufsi = UFS_I(inode); + struct ufs_buffer_head * tind_bh; + u64 tindirect_block, tmp, i; + void *tind, *p; + int retry; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + retry = 0; + + tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) + ? 
((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; + + p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); + if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) + return 0; + tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (tind_bh); + return 1; + } + if (!tind_bh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + for (i = tindirect_block ; i < uspi->s_apb ; i++) { + tind = ubh_get_data_ptr(uspi, tind_bh, i); + retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + + uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); + ubh_mark_buffer_dirty(tind_bh); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, tind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(tind_bh); + tind_bh = NULL; + } + if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) + ubh_sync_block(tind_bh); + ubh_brelse (tind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + return retry; +} + +static int ufs_alloc_lastblock(struct inode *inode) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +int ufs_truncate(struct inode *inode, loff_t old_i_size) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int retry, err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)i_size_read(inode), + (unsigned long long)old_i_size); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode); + + if (err) { + i_size_write(inode, old_i_size); + goto out; + } + + block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + + while (1) { + retry = ufs_trunc_direct(inode); + retry |= ufs_trunc_indirect(inode, 
UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_IND_BLOCK)); + retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_DIND_BLOCK)); + retry |= ufs_trunc_tindirect (inode); + if (!retry) + break; + if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) + ufs_sync_inode (inode); + yield(); + } + + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + loff_t old_i_size = inode->i_size; + + /* XXX(truncate): truncate_setsize should be called last */ + truncate_setsize(inode, attr->ia_size); + + lock_ufs(inode->i_sb); + error = ufs_truncate(inode, old_i_size); + unlock_ufs(inode->i_sb); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h new file mode 100644 index 00000000..5be2755d --- /dev/null +++ b/fs/ufs/ufs.h @@ -0,0 +1,161 @@ +#ifndef _UFS_UFS_H +#define _UFS_UFS_H 1 + +#define UFS_MAX_GROUP_LOADED 8 +#define UFS_CGNO_EMPTY ((unsigned)-1) + +struct ufs_sb_private_info; +struct ufs_cg_private_info; +struct ufs_csum; + +struct ufs_sb_info { + struct ufs_sb_private_info * s_uspi; + struct ufs_csum * s_csp; + unsigned s_bytesex; + unsigned s_flags; + struct buffer_head ** s_ucg; + struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED]; + unsigned s_cgno[UFS_MAX_GROUP_LOADED]; + unsigned short s_cg_loaded; + unsigned s_mount_opt; + struct mutex mutex; + struct task_struct *mutex_owner; +}; + +struct ufs_inode_info { + union { + __fs32 i_data[15]; + __u8 i_symlink[2 * 4 * 15]; + __fs64 u2_i_data[15]; + } i_u1; + __u32 i_flags; + __u32 i_shadow; + __u32 i_unused1; + __u32 i_unused2; + __u32 i_oeftflag; + __u16 i_osync; + __u64 i_lastfrag; + __u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + +/* mount options */ +#define UFS_MOUNT_ONERROR 0x0000000F +#define UFS_MOUNT_ONERROR_PANIC 0x00000001 +#define UFS_MOUNT_ONERROR_LOCK 0x00000002 +#define UFS_MOUNT_ONERROR_UMOUNT 0x00000004 +#define UFS_MOUNT_ONERROR_REPAIR 0x00000008 + +#define UFS_MOUNT_UFSTYPE 0x0000FFF0 +#define UFS_MOUNT_UFSTYPE_OLD 0x00000010 +#define UFS_MOUNT_UFSTYPE_44BSD 0x00000020 +#define UFS_MOUNT_UFSTYPE_SUN 0x00000040 +#define UFS_MOUNT_UFSTYPE_NEXTSTEP 0x00000080 +#define UFS_MOUNT_UFSTYPE_NEXTSTEP_CD 0x00000100 +#define UFS_MOUNT_UFSTYPE_OPENSTEP 0x00000200 +#define UFS_MOUNT_UFSTYPE_SUNx86 0x00000400 +#define UFS_MOUNT_UFSTYPE_HP 0x00000800 +#define UFS_MOUNT_UFSTYPE_UFS2 0x00001000 +#define UFS_MOUNT_UFSTYPE_SUNOS 0x00002000 + +#define ufs_clear_opt(o,opt) o &= ~UFS_MOUNT_##opt +#define ufs_set_opt(o,opt) o |= UFS_MOUNT_##opt +#define ufs_test_opt(o,opt) ((o) & UFS_MOUNT_##opt) + +/* + * Debug code + */ +#ifdef CONFIG_UFS_DEBUG +# define UFSD(f, a...) { \ + printk ("UFSD (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (f, ## a); \ + } +#else +# define UFSD(f, a...) 
/**/ +#endif + +/* balloc.c */ +extern void ufs_free_fragments (struct inode *, u64, unsigned); +extern void ufs_free_blocks (struct inode *, u64, unsigned); +extern u64 ufs_new_fragments(struct inode *, void *, u64, u64, + unsigned, int *, struct page *); + +/* cylinder.c */ +extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned); +extern void ufs_put_cylinder (struct super_block *, unsigned); + +/* dir.c */ +extern const struct inode_operations ufs_dir_inode_operations; +extern int ufs_add_link (struct dentry *, struct inode *); +extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); +extern int ufs_make_empty(struct inode *, struct inode *); +extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); +extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); +extern int ufs_empty_dir (struct inode *); +extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); +extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode); + +/* file.c */ +extern const struct inode_operations ufs_file_inode_operations; +extern const struct file_operations ufs_file_operations; +extern const struct address_space_operations ufs_aops; + +/* ialloc.c */ +extern void ufs_free_inode (struct inode *inode); +extern struct inode * ufs_new_inode (struct inode *, int); + +/* inode.c */ +extern struct inode *ufs_iget(struct super_block *, unsigned long); +extern int ufs_write_inode (struct inode *, struct writeback_control *); +extern int ufs_sync_inode (struct inode *); +extern void ufs_evict_inode (struct inode *); +extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); + +/* namei.c */ +extern const struct file_operations ufs_dir_operations; + +/* super.c */ +extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); + +/* symlink.c */ +extern const struct inode_operations ufs_fast_symlink_inode_operations; +extern const struct inode_operations ufs_symlink_inode_operations; + +/* truncate.c */ +extern int ufs_truncate (struct inode *, loff_t); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); + +static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct ufs_inode_info *UFS_I(struct inode *inode) +{ + return container_of(inode, struct ufs_inode_info, vfs_inode); +} + +/* + * Give cylinder group number for a file system block. + * Give cylinder group block number for a file system block. 
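+ * (Illustrative example, using an assumed value of s_fpg = 8192 fragments
+ *  per group: block 20000 lands in cylinder group ufs_dtog() == 2, at
+ *  offset ufs_dtogd() == 3616 within that group.)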
+ */ +/* #define ufs_dtog(d) ((d) / uspi->s_fpg) */ +static inline u64 ufs_dtog(struct ufs_sb_private_info * uspi, u64 b) +{ + do_div(b, uspi->s_fpg); + return b; +} +/* #define ufs_dtogd(d) ((d) % uspi->s_fpg) */ +static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) +{ + return do_div(b, uspi->s_fpg); +} + +extern void lock_ufs(struct super_block *sb); +extern void unlock_ufs(struct super_block *sb); + +#endif /* _UFS_UFS_H */ diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h new file mode 100644 index 00000000..8aba544f --- /dev/null +++ b/fs/ufs/ufs_fs.h @@ -0,0 +1,959 @@ +/* + * linux/include/linux/ufs_fs.h + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * Clean swab support by Fare + * just hope no one is using NNUUXXI on __?64 structure elements + * 64-bit clean thanks to Maciej W. Rozycki + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * NeXTstep support added on February 5th 1998 by + * Niels Kristian Bech Jensen . + * + * Write support by Daniel Pirkl + * + * HP/UX hfs filesystem support added by + * Martin K. Petersen , August 1999 + * + * UFS2 (of FreeBSD 5.x) support added by + * Niraj Kumar , Jan 2004 + * + */ + +#ifndef __LINUX_UFS_FS_H +#define __LINUX_UFS_FS_H + +#include +#include +#include +#include + +#include +typedef __u64 __bitwise __fs64; +typedef __u32 __bitwise __fs32; +typedef __u16 __bitwise __fs16; + +#define UFS_BBLOCK 0 +#define UFS_BBSIZE 8192 +#define UFS_SBLOCK 8192 +#define UFS_SBSIZE 8192 + +#define UFS_SECTOR_SIZE 512 +#define UFS_SECTOR_BITS 9 +#define UFS_MAGIC 0x00011954 +#define UFS_MAGIC_BW 0x0f242697 +#define UFS2_MAGIC 0x19540119 +#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ + +/* Copied from FreeBSD */ +/* + * Each disk drive contains some number of filesystems. + * A filesystem consists of a number of cylinder groups. + * Each cylinder group has inodes and data. + * + * A filesystem is described by its super-block, which in turn + * describes the cylinder groups. The super-block is critical + * data and is replicated in each cylinder group to protect against + * catastrophic loss. This is done at `newfs' time and the critical + * super-block data does not change, so the copies need not be + * referenced further unless disaster strikes. + * + * For filesystem fs, the offsets of the various blocks of interest + * are given in the super block as: + * [fs->fs_sblkno] Super-block + * [fs->fs_cblkno] Cylinder group block + * [fs->fs_iblkno] Inode blocks + * [fs->fs_dblkno] Data blocks + * The beginning of cylinder group cg in fs, is given by + * the ``cgbase(fs, cg)'' macro. + * + * Depending on the architecture and the media, the superblock may + * reside in any one of four places. For tiny media where every block + * counts, it is placed at the very front of the partition. Historically, + * UFS1 placed it 8K from the front to leave room for the disk label and + * a small bootstrap. For UFS2 it got moved to 64K from the front to leave + * room for the disk label and a bigger bootstrap, and for really piggy + * systems we check at 256K from the front if the first three fail. In + * all cases the size of the superblock will be SBLOCKSIZE. All values are + * given in byte-offset form, so they do not imply a sector size. 
The + * SBLOCKSEARCH specifies the order in which the locations should be searched. + */ +#define SBLOCK_FLOPPY 0 +#define SBLOCK_UFS1 8192 +#define SBLOCK_UFS2 65536 +#define SBLOCK_PIGGY 262144 +#define SBLOCKSIZE 8192 +#define SBLOCKSEARCH \ + { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } + + +/* HP specific MAGIC values */ + +#define UFS_MAGIC_LFN 0x00095014 /* fs supports filenames > 14 chars */ +#define UFS_CIGAM_LFN 0x14500900 /* srahc 41 < semanelif stroppus sf */ + +#define UFS_MAGIC_SEC 0x00612195 /* B1 security fs */ +#define UFS_CIGAM_SEC 0x95216100 + +#define UFS_MAGIC_FEA 0x00195612 /* fs_featurebits supported */ +#define UFS_CIGAM_FEA 0x12561900 + +#define UFS_MAGIC_4GB 0x05231994 /* fs > 4 GB && fs_featurebits */ +#define UFS_CIGAM_4GB 0x94192305 + +/* Seems somebody at HP goofed here. B1 and lfs are both 0x2 !?! */ +#define UFS_FSF_LFN 0x00000001 /* long file names */ +#define UFS_FSF_B1 0x00000002 /* B1 security */ +#define UFS_FSF_LFS 0x00000002 /* large files */ +#define UFS_FSF_LUID 0x00000004 /* large UIDs */ + +/* End of HP stuff */ + + +#define UFS_BSIZE 8192 +#define UFS_MINBSIZE 4096 +#define UFS_FSIZE 1024 +#define UFS_MAXFRAG (UFS_BSIZE / UFS_FSIZE) + +#define UFS_NDADDR 12 +#define UFS_NINDIR 3 + +#define UFS_IND_BLOCK (UFS_NDADDR + 0) +#define UFS_DIND_BLOCK (UFS_NDADDR + 1) +#define UFS_TIND_BLOCK (UFS_NDADDR + 2) + +#define UFS_NDIR_FRAGMENT (UFS_NDADDR << uspi->s_fpbshift) +#define UFS_IND_FRAGMENT (UFS_IND_BLOCK << uspi->s_fpbshift) +#define UFS_DIND_FRAGMENT (UFS_DIND_BLOCK << uspi->s_fpbshift) +#define UFS_TIND_FRAGMENT (UFS_TIND_BLOCK << uspi->s_fpbshift) + +#define UFS_ROOTINO 2 +#define UFS_FIRST_INO (UFS_ROOTINO + 1) + +#define UFS_USEEFT ((__u16)65535) + +/* fs_clean values */ +#define UFS_FSOK 0x7c269d38 +#define UFS_FSACTIVE ((__s8)0x00) +#define UFS_FSCLEAN ((__s8)0x01) +#define UFS_FSSTABLE ((__s8)0x02) +#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ +#define UFS_FSBAD ((__s8)0xff) + +/* Solaris-specific fs_clean values */ +#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */ +#define UFS_FSLOG ((__s8)0xfd) /* logging fs */ +#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */ + +/* From here to next blank line, s_flags for ufs_sb_info */ +/* directory entry encoding */ +#define UFS_DE_MASK 0x00000010 /* mask for the following */ +#define UFS_DE_OLD 0x00000000 +#define UFS_DE_44BSD 0x00000010 +/* uid encoding */ +#define UFS_UID_MASK 0x00000060 /* mask for the following */ +#define UFS_UID_OLD 0x00000000 +#define UFS_UID_44BSD 0x00000020 +#define UFS_UID_EFT 0x00000040 +/* superblock state encoding */ +#define UFS_ST_MASK 0x00000700 /* mask for the following */ +#define UFS_ST_OLD 0x00000000 +#define UFS_ST_44BSD 0x00000100 +#define UFS_ST_SUN 0x00000200 /* Solaris */ +#define UFS_ST_SUNOS 0x00000300 +#define UFS_ST_SUNx86 0x00000400 /* Solaris x86 */ +/*cylinder group encoding */ +#define UFS_CG_MASK 0x00003000 /* mask for the following */ +#define UFS_CG_OLD 0x00000000 +#define UFS_CG_44BSD 0x00002000 +#define UFS_CG_SUN 0x00001000 +/* filesystem type encoding */ +#define UFS_TYPE_MASK 0x00010000 /* mask for the following */ +#define UFS_TYPE_UFS1 0x00000000 +#define UFS_TYPE_UFS2 0x00010000 + + +/* fs_inodefmt options */ +#define UFS_42INODEFMT -1 +#define UFS_44INODEFMT 2 + +/* + * MINFREE gives the minimum acceptable percentage of file system + * blocks which may be free. If the freelist drops below this level + * only the superuser may continue to allocate blocks. 
This may + * be set to 0 if no reserve of free blocks is deemed necessary, + * however throughput drops by fifty percent if the file system + * is run at between 95% and 100% full; thus the minimum default + * value of fs_minfree is 5%. However, to get good clustering + * performance, 10% is a better choice. hence we use 10% as our + * default value. With 10% free space, fragmentation is not a + * problem, so we choose to optimize for time. + */ +#define UFS_MINFREE 5 +#define UFS_DEFAULTOPT UFS_OPTTIME + +/* + * Turn file system block numbers into disk block addresses. + * This maps file system blocks to device size blocks. + */ +#define ufs_fsbtodb(uspi, b) ((b) << (uspi)->s_fsbtodb) +#define ufs_dbtofsb(uspi, b) ((b) >> (uspi)->s_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. + * They calc file system addresses of cylinder group data structures. + */ +#define ufs_cgbase(c) (uspi->s_fpg * (c)) +#define ufs_cgstart(c) ((uspi)->fs_magic == UFS2_MAGIC ? ufs_cgbase(c) : \ + (ufs_cgbase(c) + uspi->s_cgoffset * ((c) & ~uspi->s_cgmask))) +#define ufs_cgsblock(c) (ufs_cgstart(c) + uspi->s_sblkno) /* super blk */ +#define ufs_cgcmin(c) (ufs_cgstart(c) + uspi->s_cblkno) /* cg block */ +#define ufs_cgimin(c) (ufs_cgstart(c) + uspi->s_iblkno) /* inode blk */ +#define ufs_cgdmin(c) (ufs_cgstart(c) + uspi->s_dblkno) /* 1st data */ + +/* + * Macros for handling inode numbers: + * inode number to file system block offset. + * inode number to cylinder group number. + * inode number to file system block address. + */ +#define ufs_inotocg(x) ((x) / uspi->s_ipg) +#define ufs_inotocgoff(x) ((x) % uspi->s_ipg) +#define ufs_inotofsba(x) (((u64)ufs_cgimin(ufs_inotocg(x))) + ufs_inotocgoff(x) / uspi->s_inopf) +#define ufs_inotofsbo(x) ((x) % uspi->s_inopf) + +/* + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define ufs_cbtocylno(bno) \ + ((bno) * uspi->s_nspf / uspi->s_spc) +#define ufs_cbtorpos(bno) \ + ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \ + (((((bno) * uspi->s_nspf % uspi->s_spc) % \ + uspi->s_nsect) * \ + uspi->s_nrpos) / uspi->s_nsect) \ + : \ + ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ + * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ + % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ + * uspi->s_nrpos) / uspi->s_npsect)) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. 
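+ * (Illustration only, assuming an 8 KiB block / 1 KiB fragment layout,
+ *  i.e. s_bshift = 13, s_fshift = 10, s_fpbshift = 3: ufs_lblkno(20000)
+ *  is 2, ufs_blkoff(20000) is 3616 and ufs_fragstoblks(17) is 2.)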
+ */ +#define ufs_blkoff(loc) ((loc) & uspi->s_qbmask) +#define ufs_fragoff(loc) ((loc) & uspi->s_qfmask) +#define ufs_lblktosize(blk) ((blk) << uspi->s_bshift) +#define ufs_lblkno(loc) ((loc) >> uspi->s_bshift) +#define ufs_numfrags(loc) ((loc) >> uspi->s_fshift) +#define ufs_blkroundup(size) (((size) + uspi->s_qbmask) & uspi->s_bmask) +#define ufs_fragroundup(size) (((size) + uspi->s_qfmask) & uspi->s_fmask) +#define ufs_fragstoblks(frags) ((frags) >> uspi->s_fpbshift) +#define ufs_blkstofrags(blks) ((blks) << uspi->s_fpbshift) +#define ufs_fragnum(fsb) ((fsb) & uspi->s_fpbmask) +#define ufs_blknum(fsb) ((fsb) & ~uspi->s_fpbmask) + +#define UFS_MAXNAMLEN 255 +#define UFS_MAXMNTLEN 512 +#define UFS2_MAXMNTLEN 468 +#define UFS2_MAXVOLLEN 32 +#define UFS_MAXCSBUFS 31 +#define UFS_LINK_MAX 32000 +/* +#define UFS2_NOCSPTRS ((128 / sizeof(void *)) - 4) +*/ +#define UFS2_NOCSPTRS 28 + +/* + * UFS_DIR_PAD defines the directory entries boundaries + * (must be a multiple of 4) + */ +#define UFS_DIR_PAD 4 +#define UFS_DIR_ROUND (UFS_DIR_PAD - 1) +#define UFS_DIR_REC_LEN(name_len) (((name_len) + 1 + 8 + UFS_DIR_ROUND) & ~UFS_DIR_ROUND) + +struct ufs_timeval { + __fs32 tv_sec; + __fs32 tv_usec; +}; + +struct ufs_dir_entry { + __fs32 d_ino; /* inode number of this entry */ + __fs16 d_reclen; /* length of this entry */ + union { + __fs16 d_namlen; /* actual length of d_name */ + struct { + __u8 d_type; /* file type */ + __u8 d_namlen; /* length of string in d_name */ + } d_44; + } d_u; + __u8 d_name[UFS_MAXNAMLEN + 1]; /* file name */ +}; + +struct ufs_csum { + __fs32 cs_ndir; /* number of directories */ + __fs32 cs_nbfree; /* number of free blocks */ + __fs32 cs_nifree; /* number of free inodes */ + __fs32 cs_nffree; /* number of free frags */ +}; +struct ufs2_csum_total { + __fs64 cs_ndir; /* number of directories */ + __fs64 cs_nbfree; /* number of free blocks */ + __fs64 cs_nifree; /* number of free inodes */ + __fs64 cs_nffree; /* number of free frags */ + __fs64 cs_numclusters; /* number of free clusters */ + __fs64 cs_spare[3]; /* future expansion */ +}; + +struct ufs_csum_core { + __u64 cs_ndir; /* number of directories */ + __u64 cs_nbfree; /* number of free blocks */ + __u64 cs_nifree; /* number of free inodes */ + __u64 cs_nffree; /* number of free frags */ + __u64 cs_numclusters; /* number of free clusters */ +}; + +/* + * File system flags + */ +#define UFS_UNCLEAN 0x01 /* file system not clean at mount (unused) */ +#define UFS_DOSOFTDEP 0x02 /* file system using soft dependencies */ +#define UFS_NEEDSFSCK 0x04 /* needs sync fsck (FreeBSD compat, unused) */ +#define UFS_INDEXDIRS 0x08 /* kernel supports indexed directories */ +#define UFS_ACLS 0x10 /* file system has ACLs enabled */ +#define UFS_MULTILABEL 0x20 /* file system is MAC multi-label */ +#define UFS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */ + +#if 0 +/* + * This is the actual superblock, as it is laid out on the disk. + * Do NOT use this structure, because of sizeof(ufs_super_block) > 512 and + * it may occupy several blocks, use + * struct ufs_super_block_(first,second,third) instead. 
+ */ +struct ufs_super_block { + union { + struct { + __fs32 fs_link; /* UNUSED */ + } fs_42; + struct { + __fs32 fs_state; /* file system state flag */ + } fs_sun; + } fs_u0; + __fs32 fs_rlink; /* UNUSED */ + __fs32 fs_sblkno; /* addr of super-block in filesys */ + __fs32 fs_cblkno; /* offset of cyl-block in filesys */ + __fs32 fs_iblkno; /* offset of inode-blocks in filesys */ + __fs32 fs_dblkno; /* offset of first data after cg */ + __fs32 fs_cgoffset; /* cylinder group offset in cylinder */ + __fs32 fs_cgmask; /* used to calc mod fs_ntrak */ + __fs32 fs_time; /* last time written -- time_t */ + __fs32 fs_size; /* number of blocks in fs */ + __fs32 fs_dsize; /* number of data blocks in fs */ + __fs32 fs_ncg; /* number of cylinder groups */ + __fs32 fs_bsize; /* size of basic blocks in fs */ + __fs32 fs_fsize; /* size of frag blocks in fs */ + __fs32 fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + __fs32 fs_minfree; /* minimum percentage of free blocks */ + __fs32 fs_rotdelay; /* num of ms for optimal next block */ + __fs32 fs_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + __fs32 fs_bmask; /* ``blkoff'' calc of blk offsets */ + __fs32 fs_fmask; /* ``fragoff'' calc of frag offsets */ + __fs32 fs_bshift; /* ``lblkno'' calc of logical blkno */ + __fs32 fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are configuration parameters */ + __fs32 fs_maxcontig; /* max number of contiguous blks */ + __fs32 fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + __fs32 fs_fragshift; /* block to frag shift */ + __fs32 fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + __fs32 fs_sbsize; /* actual size of super block */ + __fs32 fs_csmask; /* csum block offset */ + __fs32 fs_csshift; /* csum block number */ + __fs32 fs_nindir; /* value of NINDIR */ + __fs32 fs_inopb; /* value of INOPB */ + __fs32 fs_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + __fs32 fs_optim; /* optimization preference, see below */ +/* these fields are derived from the hardware */ + union { + struct { + __fs32 fs_npsect; /* # sectors/track including spares */ + } fs_sun; + struct { + __fs32 fs_state; /* file system state time stamp */ + } fs_sunx86; + } fs_u1; + __fs32 fs_interleave; /* hardware sector interleave */ + __fs32 fs_trackskew; /* sector 0 skew, per track */ +/* a unique id for this filesystem (currently unused and unmaintained) */ +/* In 4.3 Tahoe this space is used by fs_headswitch and fs_trkseek */ +/* Neither of those fields is used in the Tahoe code right now but */ +/* there could be problems if they are. 
*/ + __fs32 fs_id[2]; /* file system id */ +/* sizes determined by number of cylinder groups and their sizes */ + __fs32 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs32 fs_cssize; /* size of cyl grp summary area */ + __fs32 fs_cgsize; /* cylinder group size */ +/* these fields are derived from the hardware */ + __fs32 fs_ntrak; /* tracks per cylinder */ + __fs32 fs_nsect; /* sectors per track */ + __fs32 fs_spc; /* sectors per cylinder */ +/* this comes from the disk driver partitioning */ + __fs32 fs_ncyl; /* cylinders in file system */ +/* these fields can be computed from the others */ + __fs32 fs_cpg; /* cylinders per group */ + __fs32 fs_ipg; /* inodes per cylinder group */ + __fs32 fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct ufs_csum fs_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + __s8 fs_fmod; /* super block modified flag */ + __s8 fs_clean; /* file system is clean flag */ + __s8 fs_ronly; /* mounted read-only flag */ + __s8 fs_flags; + union { + struct { + __s8 fs_fsmnt[UFS_MAXMNTLEN];/* name mounted on */ + __fs32 fs_cgrotor; /* last cg searched */ + __fs32 fs_csp[UFS_MAXCSBUFS];/*list of fs_cs info buffers */ + __fs32 fs_maxcluster; + __fs32 fs_cpc; /* cyl per cycle in postbl */ + __fs16 fs_opostbl[16][8]; /* old rotation block list head */ + } fs_u1; + struct { + __s8 fs_fsmnt[UFS2_MAXMNTLEN]; /* name mounted on */ + __u8 fs_volname[UFS2_MAXVOLLEN]; /* volume name */ + __fs64 fs_swuid; /* system-wide uid */ + __fs32 fs_pad; /* due to alignment of fs_swuid */ + __fs32 fs_cgrotor; /* last cg searched */ + __fs32 fs_ocsp[UFS2_NOCSPTRS]; /*list of fs_cs info buffers */ + __fs32 fs_contigdirs;/*# of contiguously allocated dirs */ + __fs32 fs_csp; /* cg summary info buffer for fs_cs */ + __fs32 fs_maxcluster; + __fs32 fs_active;/* used by snapshots to track fs */ + __fs32 fs_old_cpc; /* cyl per cycle in postbl */ + __fs32 fs_maxbsize;/*maximum blocking factor permitted */ + __fs64 fs_sparecon64[17];/*old rotation block list head */ + __fs64 fs_sblockloc; /* byte offset of standard superblock */ + struct ufs2_csum_total fs_cstotal;/*cylinder summary information*/ + struct ufs_timeval fs_time; /* last time written */ + __fs64 fs_size; /* number of blocks in fs */ + __fs64 fs_dsize; /* number of data blocks in fs */ + __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs64 fs_pendingblocks;/* blocks in process of being freed */ + __fs32 fs_pendinginodes;/*inodes in process of being freed */ + } fs_u2; + } fs_u11; + union { + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_state; /* file system state time stamp */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sun; + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_npsect; /* # sectors/track including spares */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sunx86; + struct { + __fs32 fs_sparecon[50];/* reserved for future constants */ + __fs32 fs_contigsumsize;/* size of cluster summary array */ + __fs32 fs_maxsymlinklen;/* max length of an internal symlink */ + __fs32 fs_inodefmt; /* format of on-disk inodes */ + __fs32 fs_maxfilesize[2]; /* max representable file size */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + __fs32 fs_state; /* file 
system state time stamp */ + } fs_44; + } fs_u2; + __fs32 fs_postblformat; /* format of positional layout tables */ + __fs32 fs_nrpos; /* number of rotational positions */ + __fs32 fs_postbloff; /* (__s16) rotation block list head */ + __fs32 fs_rotbloff; /* (__u8) blocks for each rotation */ + __fs32 fs_magic; /* magic number */ + __u8 fs_space[1]; /* list of blocks for each rotation */ +}; +#endif/*struct ufs_super_block*/ + +/* + * Preference for optimization. + */ +#define UFS_OPTTIME 0 /* minimize allocation time */ +#define UFS_OPTSPACE 1 /* minimize disk fragmentation */ + +/* + * Rotational layout table format types + */ +#define UFS_42POSTBLFMT -1 /* 4.2BSD rotational table format */ +#define UFS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */ + +/* + * Convert cylinder group to base address of its global summary info. + */ +#define fs_cs(indx) s_csp[(indx)] + +/* + * Cylinder group block for a file system. + * + * Writable fields in the cylinder group are protected by the associated + * super block lock fs->fs_lock. + */ +#define CG_MAGIC 0x090255 +#define ufs_cg_chkmagic(sb, ucg) \ + (fs32_to_cpu((sb), (ucg)->cg_magic) == CG_MAGIC) +/* + * Macros for access to old cylinder group array structures + */ +#define ufs_ocg_blktot(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_btot) +#define ufs_ocg_blks(sb, ucg, cylno) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_b[cylno]) +#define ufs_ocg_inosused(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_iused) +#define ufs_ocg_blksfree(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_free) +#define ufs_ocg_chkmagic(sb, ucg) \ + (fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_magic) == CG_MAGIC) + +/* + * size of this structure is 172 B + */ +struct ufs_cylinder_group { + __fs32 cg_link; /* linked list of cyl groups */ + __fs32 cg_magic; /* magic number */ + __fs32 cg_time; /* time last written */ + __fs32 cg_cgx; /* we are the cgx'th cylinder group */ + __fs16 cg_ncyl; /* number of cyl's this cg */ + __fs16 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_ndblk; /* number of data blocks this cg */ + struct ufs_csum cg_cs; /* cylinder summary information */ + __fs32 cg_rotor; /* position of last used block */ + __fs32 cg_frotor; /* position of last used frag */ + __fs32 cg_irotor; /* position of last used inode */ + __fs32 cg_frsum[UFS_MAXFRAG]; /* counts of available frags */ + __fs32 cg_btotoff; /* (__u32) block totals per cylinder */ + __fs32 cg_boff; /* (short) free block positions */ + __fs32 cg_iusedoff; /* (char) used inode map */ + __fs32 cg_freeoff; /* (u_char) free block map */ + __fs32 cg_nextfreeoff; /* (u_char) next available space */ + union { + struct { + __fs32 cg_clustersumoff; /* (u_int32) counts of avail clusters */ + __fs32 cg_clusteroff; /* (u_int8) free cluster map */ + __fs32 cg_nclusterblks; /* number of clusters this cg */ + __fs32 cg_sparecon[13]; /* reserved for future use */ + } cg_44; + struct { + __fs32 cg_clustersumoff;/* (u_int32) counts of avail clusters */ + __fs32 cg_clusteroff; /* (u_int8) free cluster map */ + __fs32 cg_nclusterblks;/* number of clusters this cg */ + __fs32 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_initediblk; /* last initialized inode */ + __fs32 cg_sparecon32[3];/* reserved for future use */ + __fs64 cg_time; /* time last written */ + __fs64 cg_sparecon[3]; /* reserved for future use */ + } cg_u2; + __fs32 cg_sparecon[16]; /* reserved for future use 
*/ + } cg_u; + __u8 cg_space[1]; /* space for cylinder group maps */ +/* actually longer */ +}; + +/* Historic Cylinder group info */ +struct ufs_old_cylinder_group { + __fs32 cg_link; /* linked list of cyl groups */ + __fs32 cg_rlink; /* for incore cyl groups */ + __fs32 cg_time; /* time last written */ + __fs32 cg_cgx; /* we are the cgx'th cylinder group */ + __fs16 cg_ncyl; /* number of cyl's this cg */ + __fs16 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_ndblk; /* number of data blocks this cg */ + struct ufs_csum cg_cs; /* cylinder summary information */ + __fs32 cg_rotor; /* position of last used block */ + __fs32 cg_frotor; /* position of last used frag */ + __fs32 cg_irotor; /* position of last used inode */ + __fs32 cg_frsum[8]; /* counts of available frags */ + __fs32 cg_btot[32]; /* block totals per cylinder */ + __fs16 cg_b[32][8]; /* positions of free blocks */ + __u8 cg_iused[256]; /* used inode map */ + __fs32 cg_magic; /* magic number */ + __u8 cg_free[1]; /* free block map */ +/* actually longer */ +}; + +/* + * structure of an on-disk inode + */ +struct ufs_inode { + __fs16 ui_mode; /* 0x0 */ + __fs16 ui_nlink; /* 0x2 */ + union { + struct { + __fs16 ui_suid; /* 0x4 */ + __fs16 ui_sgid; /* 0x6 */ + } oldids; + __fs32 ui_inumber; /* 0x4 lsf: inode number */ + __fs32 ui_author; /* 0x4 GNU HURD: author */ + } ui_u1; + __fs64 ui_size; /* 0x8 */ + struct ufs_timeval ui_atime; /* 0x10 access */ + struct ufs_timeval ui_mtime; /* 0x18 modification */ + struct ufs_timeval ui_ctime; /* 0x20 creation */ + union { + struct { + __fs32 ui_db[UFS_NDADDR];/* 0x28 data blocks */ + __fs32 ui_ib[UFS_NINDIR];/* 0x58 indirect blocks */ + } ui_addr; + __u8 ui_symlink[4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */ + } ui_u2; + __fs32 ui_flags; /* 0x64 immutable, append-only... */ + __fs32 ui_blocks; /* 0x68 blocks in use */ + __fs32 ui_gen; /* 0x6c like ext2 i_version, for NFS support */ + union { + struct { + __fs32 ui_shadow; /* 0x70 shadow inode with security data */ + __fs32 ui_uid; /* 0x74 long EFT version of uid */ + __fs32 ui_gid; /* 0x78 long EFT version of gid */ + __fs32 ui_oeftflag; /* 0x7c reserved */ + } ui_sun; + struct { + __fs32 ui_uid; /* 0x70 File owner */ + __fs32 ui_gid; /* 0x74 File group */ + __fs32 ui_spare[2]; /* 0x78 reserved */ + } ui_44; + struct { + __fs32 ui_uid; /* 0x70 */ + __fs32 ui_gid; /* 0x74 */ + __fs16 ui_modeh; /* 0x78 mode high bits */ + __fs16 ui_spare; /* 0x7A unused */ + __fs32 ui_trans; /* 0x7c filesystem translator */ + } ui_hurd; + } ui_u3; +}; + +#define UFS_NXADDR 2 /* External addresses in inode. */ +struct ufs2_inode { + __fs16 ui_mode; /* 0: IFMT, permissions; see below. */ + __fs16 ui_nlink; /* 2: File link count. */ + __fs32 ui_uid; /* 4: File owner. */ + __fs32 ui_gid; /* 8: File group. */ + __fs32 ui_blksize; /* 12: Inode blocksize. */ + __fs64 ui_size; /* 16: File byte count. */ + __fs64 ui_blocks; /* 24: Bytes actually held. */ + __fs64 ui_atime; /* 32: Last access time. */ + __fs64 ui_mtime; /* 40: Last modified time. */ + __fs64 ui_ctime; /* 48: Last inode change time. */ + __fs64 ui_birthtime; /* 56: Inode creation time. */ + __fs32 ui_mtimensec; /* 64: Last modified time. */ + __fs32 ui_atimensec; /* 68: Last access time. */ + __fs32 ui_ctimensec; /* 72: Last inode change time. */ + __fs32 ui_birthnsec; /* 76: Inode creation time. */ + __fs32 ui_gen; /* 80: Generation number. */ + __fs32 ui_kernflags; /* 84: Kernel flags. */ + __fs32 ui_flags; /* 88: Status flags (chflags). 
*/ + __fs32 ui_extsize; /* 92: External attributes block. */ + __fs64 ui_extb[UFS_NXADDR];/* 96: External attributes block. */ + union { + struct { + __fs64 ui_db[UFS_NDADDR]; /* 112: Direct disk blocks. */ + __fs64 ui_ib[UFS_NINDIR];/* 208: Indirect disk blocks.*/ + } ui_addr; + __u8 ui_symlink[2*4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */ + } ui_u2; + __fs64 ui_spare[3]; /* 232: Reserved; currently unused */ +}; + + +/* FreeBSD has these in sys/stat.h */ +/* ui_flags that can be set by a file owner */ +#define UFS_UF_SETTABLE 0x0000ffff +#define UFS_UF_NODUMP 0x00000001 /* do not dump */ +#define UFS_UF_IMMUTABLE 0x00000002 /* immutable (can't "change") */ +#define UFS_UF_APPEND 0x00000004 /* append-only */ +#define UFS_UF_OPAQUE 0x00000008 /* directory is opaque (unionfs) */ +#define UFS_UF_NOUNLINK 0x00000010 /* can't be removed or renamed */ +/* ui_flags that only root can set */ +#define UFS_SF_SETTABLE 0xffff0000 +#define UFS_SF_ARCHIVED 0x00010000 /* archived */ +#define UFS_SF_IMMUTABLE 0x00020000 /* immutable (can't "change") */ +#define UFS_SF_APPEND 0x00040000 /* append-only */ +#define UFS_SF_NOUNLINK 0x00100000 /* can't be removed or renamed */ + +/* + * This structure is used for reading disk structures larger + * than the size of fragment. + */ +struct ufs_buffer_head { + __u64 fragment; /* first fragment */ + __u64 count; /* number of fragments */ + struct buffer_head * bh[UFS_MAXFRAG]; /* buffers */ +}; + +struct ufs_cg_private_info { + struct ufs_buffer_head c_ubh; + __u32 c_cgx; /* number of cylidner group */ + __u16 c_ncyl; /* number of cyl's this cg */ + __u16 c_niblk; /* number of inode blocks this cg */ + __u32 c_ndblk; /* number of data blocks this cg */ + __u32 c_rotor; /* position of last used block */ + __u32 c_frotor; /* position of last used frag */ + __u32 c_irotor; /* position of last used inode */ + __u32 c_btotoff; /* (__u32) block totals per cylinder */ + __u32 c_boff; /* (short) free block positions */ + __u32 c_iusedoff; /* (char) used inode map */ + __u32 c_freeoff; /* (u_char) free block map */ + __u32 c_nextfreeoff; /* (u_char) next available space */ + __u32 c_clustersumoff;/* (u_int32) counts of avail clusters */ + __u32 c_clusteroff; /* (u_int8) free cluster map */ + __u32 c_nclusterblks; /* number of clusters this cg */ +}; + + +struct ufs_sb_private_info { + struct ufs_buffer_head s_ubh; /* buffer containing super block */ + struct ufs_csum_core cs_total; + __u32 s_sblkno; /* offset of super-blocks in filesys */ + __u32 s_cblkno; /* offset of cg-block in filesys */ + __u32 s_iblkno; /* offset of inode-blocks in filesys */ + __u32 s_dblkno; /* offset of first data after cg */ + __u32 s_cgoffset; /* cylinder group offset in cylinder */ + __u32 s_cgmask; /* used to calc mod fs_ntrak */ + __u32 s_size; /* number of blocks (fragments) in fs */ + __u32 s_dsize; /* number of data blocks in fs */ + __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */ + __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */ + __u32 s_ncg; /* number of cylinder groups */ + __u32 s_bsize; /* size of basic blocks */ + __u32 s_fsize; /* size of fragments */ + __u32 s_fpb; /* fragments per block */ + __u32 s_minfree; /* minimum percentage of free blocks */ + __u32 s_bmask; /* `blkoff'' calc of blk offsets */ + __u32 s_fmask; /* s_fsize mask */ + __u32 s_bshift; /* `lblkno'' calc of logical blkno */ + __u32 s_fshift; /* s_fsize shift */ + __u32 s_fpbshift; /* fragments per block shift */ + __u32 s_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + __u32 
s_sbsize; /* actual size of super block */ + __u32 s_csmask; /* csum block offset */ + __u32 s_csshift; /* csum block number */ + __u32 s_nindir; /* value of NINDIR */ + __u32 s_inopb; /* value of INOPB */ + __u32 s_nspf; /* value of NSPF */ + __u32 s_npsect; /* # sectors/track including spares */ + __u32 s_interleave; /* hardware sector interleave */ + __u32 s_trackskew; /* sector 0 skew, per track */ + __u64 s_csaddr; /* blk addr of cyl grp summary area */ + __u32 s_cssize; /* size of cyl grp summary area */ + __u32 s_cgsize; /* cylinder group size */ + __u32 s_ntrak; /* tracks per cylinder */ + __u32 s_nsect; /* sectors per track */ + __u32 s_spc; /* sectors per cylinder */ + __u32 s_ipg; /* inodes per cylinder group */ + __u32 s_fpg; /* fragments per group */ + __u32 s_cpc; /* cyl per cycle in postbl */ + __s32 s_contigsumsize;/* size of cluster summary array, 44bsd */ + __s64 s_qbmask; /* ~usb_bmask */ + __s64 s_qfmask; /* ~usb_fmask */ + __s32 s_postblformat; /* format of positional layout tables */ + __s32 s_nrpos; /* number of rotational positions */ + __s32 s_postbloff; /* (__s16) rotation block list head */ + __s32 s_rotbloff; /* (__u8) blocks for each rotation */ + + __u32 s_fpbmask; /* fragments per block mask */ + __u32 s_apb; /* address per block */ + __u32 s_2apb; /* address per block^2 */ + __u32 s_3apb; /* address per block^3 */ + __u32 s_apbmask; /* address per block mask */ + __u32 s_apbshift; /* address per block shift */ + __u32 s_2apbshift; /* address per block shift * 2 */ + __u32 s_3apbshift; /* address per block shift * 3 */ + __u32 s_nspfshift; /* number of sector per fragment shift */ + __u32 s_nspb; /* number of sector per block */ + __u32 s_inopf; /* inodes per fragment */ + __u32 s_sbbase; /* offset of NeXTstep superblock */ + __u32 s_bpf; /* bits per fragment */ + __u32 s_bpfshift; /* bits per fragment shift*/ + __u32 s_bpfmask; /* bits per fragment mask */ + + __u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */ + __s32 fs_magic; /* filesystem magic */ + unsigned int s_dirblksize; +}; + +/* + * Sizes of this structures are: + * ufs_super_block_first 512 + * ufs_super_block_second 512 + * ufs_super_block_third 356 + */ +struct ufs_super_block_first { + union { + struct { + __fs32 fs_link; /* UNUSED */ + } fs_42; + struct { + __fs32 fs_state; /* file system state flag */ + } fs_sun; + } fs_u0; + __fs32 fs_rlink; + __fs32 fs_sblkno; + __fs32 fs_cblkno; + __fs32 fs_iblkno; + __fs32 fs_dblkno; + __fs32 fs_cgoffset; + __fs32 fs_cgmask; + __fs32 fs_time; + __fs32 fs_size; + __fs32 fs_dsize; + __fs32 fs_ncg; + __fs32 fs_bsize; + __fs32 fs_fsize; + __fs32 fs_frag; + __fs32 fs_minfree; + __fs32 fs_rotdelay; + __fs32 fs_rps; + __fs32 fs_bmask; + __fs32 fs_fmask; + __fs32 fs_bshift; + __fs32 fs_fshift; + __fs32 fs_maxcontig; + __fs32 fs_maxbpg; + __fs32 fs_fragshift; + __fs32 fs_fsbtodb; + __fs32 fs_sbsize; + __fs32 fs_csmask; + __fs32 fs_csshift; + __fs32 fs_nindir; + __fs32 fs_inopb; + __fs32 fs_nspf; + __fs32 fs_optim; + union { + struct { + __fs32 fs_npsect; + } fs_sun; + struct { + __fs32 fs_state; + } fs_sunx86; + } fs_u1; + __fs32 fs_interleave; + __fs32 fs_trackskew; + __fs32 fs_id[2]; + __fs32 fs_csaddr; + __fs32 fs_cssize; + __fs32 fs_cgsize; + __fs32 fs_ntrak; + __fs32 fs_nsect; + __fs32 fs_spc; + __fs32 fs_ncyl; + __fs32 fs_cpg; + __fs32 fs_ipg; + __fs32 fs_fpg; + struct ufs_csum fs_cstotal; + __s8 fs_fmod; + __s8 fs_clean; + __s8 fs_ronly; + __s8 fs_flags; + __s8 fs_fsmnt[UFS_MAXMNTLEN - 212]; + +}; + +struct ufs_super_block_second { + union { + 
struct { + __s8 fs_fsmnt[212]; + __fs32 fs_cgrotor; + __fs32 fs_csp[UFS_MAXCSBUFS]; + __fs32 fs_maxcluster; + __fs32 fs_cpc; + __fs16 fs_opostbl[82]; + } fs_u1; + struct { + __s8 fs_fsmnt[UFS2_MAXMNTLEN - UFS_MAXMNTLEN + 212]; + __u8 fs_volname[UFS2_MAXVOLLEN]; + __fs64 fs_swuid; + __fs32 fs_pad; + __fs32 fs_cgrotor; + __fs32 fs_ocsp[UFS2_NOCSPTRS]; + __fs32 fs_contigdirs; + __fs32 fs_csp; + __fs32 fs_maxcluster; + __fs32 fs_active; + __fs32 fs_old_cpc; + __fs32 fs_maxbsize; + __fs64 fs_sparecon64[17]; + __fs64 fs_sblockloc; + __fs64 cs_ndir; + __fs64 cs_nbfree; + } fs_u2; + } fs_un; +}; + +struct ufs_super_block_third { + union { + struct { + __fs16 fs_opostbl[46]; + } fs_u1; + struct { + __fs64 cs_nifree; /* number of free inodes */ + __fs64 cs_nffree; /* number of free frags */ + __fs64 cs_numclusters; /* number of free clusters */ + __fs64 cs_spare[3]; /* future expansion */ + struct ufs_timeval fs_time; /* last time written */ + __fs64 fs_size; /* number of blocks in fs */ + __fs64 fs_dsize; /* number of data blocks in fs */ + __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs64 fs_pendingblocks;/* blocks in process of being freed */ + __fs32 fs_pendinginodes;/*inodes in process of being freed */ + } __attribute__ ((packed)) fs_u2; + } fs_un1; + union { + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_state; /* file system state time stamp */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sun; + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_npsect; /* # sectors/track including spares */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sunx86; + struct { + __fs32 fs_sparecon[50];/* reserved for future constants */ + __fs32 fs_contigsumsize;/* size of cluster summary array */ + __fs32 fs_maxsymlinklen;/* max length of an internal symlink */ + __fs32 fs_inodefmt; /* format of on-disk inodes */ + __fs32 fs_maxfilesize[2]; /* max representable file size */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + __fs32 fs_state; /* file system state time stamp */ + } fs_44; + } fs_un2; + __fs32 fs_postblformat; + __fs32 fs_nrpos; + __fs32 fs_postbloff; + __fs32 fs_rotbloff; + __fs32 fs_magic; + __u8 fs_space[1]; +}; + +#endif /* __LINUX_UFS_FS_H */ diff --git a/fs/ufs/util.c b/fs/ufs/util.c new file mode 100644 index 00000000..95425b59 --- /dev/null +++ b/fs/ufs/util.c @@ -0,0 +1,283 @@ +/* + * linux/fs/ufs/util.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, + struct super_block *sb, u64 fragment, u64 size) +{ + struct ufs_buffer_head * ubh; + unsigned i, j ; + u64 count = 0; + if (size & ~uspi->s_fmask) + return NULL; + count = size >> uspi->s_fshift; + if (count > UFS_MAXFRAG) + return NULL; + ubh = (struct ufs_buffer_head *) + kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); + if (!ubh) + return NULL; + ubh->fragment = fragment; + ubh->count = count; + for (i = 0; i < count; i++) + if (!(ubh->bh[i] = sb_bread(sb, fragment + i))) + goto failed; + for (; i < UFS_MAXFRAG; i++) + ubh->bh[i] = NULL; + return ubh; +failed: + for (j = 0; j < i; j++) + brelse 
(ubh->bh[j]); + kfree(ubh); + return NULL; +} + +struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi, + struct super_block *sb, u64 fragment, u64 size) +{ + unsigned i, j; + u64 count = 0; + if (size & ~uspi->s_fmask) + return NULL; + count = size >> uspi->s_fshift; + if (count <= 0 || count > UFS_MAXFRAG) + return NULL; + USPI_UBH(uspi)->fragment = fragment; + USPI_UBH(uspi)->count = count; + for (i = 0; i < count; i++) + if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i))) + goto failed; + for (; i < UFS_MAXFRAG; i++) + USPI_UBH(uspi)->bh[i] = NULL; + return USPI_UBH(uspi); +failed: + for (j = 0; j < i; j++) + brelse (USPI_UBH(uspi)->bh[j]); + return NULL; +} + +void ubh_brelse (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for (i = 0; i < ubh->count; i++) + brelse (ubh->bh[i]); + kfree (ubh); +} + +void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) +{ + unsigned i; + if (!USPI_UBH(uspi)) + return; + for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) { + brelse (USPI_UBH(uspi)->bh[i]); + USPI_UBH(uspi)->bh[i] = NULL; + } +} + +void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for ( i = 0; i < ubh->count; i++ ) + mark_buffer_dirty (ubh->bh[i]); +} + +void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) +{ + unsigned i; + if (!ubh) + return; + if (flag) { + for ( i = 0; i < ubh->count; i++ ) + set_buffer_uptodate (ubh->bh[i]); + } else { + for ( i = 0; i < ubh->count; i++ ) + clear_buffer_uptodate (ubh->bh[i]); + } +} + +void ubh_sync_block(struct ufs_buffer_head *ubh) +{ + if (ubh) { + unsigned i; + + for (i = 0; i < ubh->count; i++) + write_dirty_buffer(ubh->bh[i], WRITE); + + for (i = 0; i < ubh->count; i++) + wait_on_buffer(ubh->bh[i]); + } +} + +void ubh_bforget (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for ( i = 0; i < ubh->count; i++ ) if ( ubh->bh[i] ) + bforget (ubh->bh[i]); +} + +int ubh_buffer_dirty (struct ufs_buffer_head * ubh) +{ + unsigned i; + unsigned result = 0; + if (!ubh) + return 0; + for ( i = 0; i < ubh->count; i++ ) + result |= buffer_dirty(ubh->bh[i]); + return result; +} + +void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi, + unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size) +{ + unsigned len, bhno; + if (size > (ubh->count << uspi->s_fshift)) + size = ubh->count << uspi->s_fshift; + bhno = 0; + while (size) { + len = min_t(unsigned int, size, uspi->s_fsize); + memcpy (mem, ubh->bh[bhno]->b_data, len); + mem += uspi->s_fsize; + size -= len; + bhno++; + } +} + +void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size) +{ + unsigned len, bhno; + if (size > (ubh->count << uspi->s_fshift)) + size = ubh->count << uspi->s_fshift; + bhno = 0; + while (size) { + len = min_t(unsigned int, size, uspi->s_fsize); + memcpy (ubh->bh[bhno]->b_data, mem, len); + mem += uspi->s_fsize; + size -= len; + bhno++; + } +} + +dev_t +ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi) +{ + __u32 fs32; + dev_t dev; + + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]); + else + fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]); + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNx86: + case UFS_ST_SUN: + if ((fs32 & 0xffff0000) == 0 || + (fs32 & 0xffff0000) == 0xffff0000) + dev = old_decode_dev(fs32 & 0x7fff); + else + dev = MKDEV(sysv_major(fs32), sysv_minor(fs32)); + break; 
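+	/*
+	 * On the Sun variants a value whose upper 16 bits are all zero or
+	 * all one is an old-style 16-bit dev_t and goes through
+	 * old_decode_dev(); e.g. 0x00000801 would decode as major 8,
+	 * minor 1.  Anything else is taken as a 32-bit SysV encoding.
+	 */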
+ + default: + dev = old_decode_dev(fs32); + break; + } + return dev; +} + +void +ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev) +{ + __u32 fs32; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNx86: + case UFS_ST_SUN: + fs32 = sysv_encode_dev(dev); + if ((fs32 & 0xffff8000) == 0) { + fs32 = old_encode_dev(dev); + } + break; + + default: + fs32 = old_encode_dev(dev); + break; + } + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32); + else + ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); +} + +/** + * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist + * read it from disk. + * @mapping: the address_space to search + * @index: the page index + * + * Locates the desired pagecache page, if not exist we'll read it, + * locks it, increments its reference + * count and returns its address. + * + */ + +struct page *ufs_get_locked_page(struct address_space *mapping, + pgoff_t index) +{ + struct page *page; + + page = find_lock_page(mapping, index); + if (!page) { + page = read_mapping_page(mapping, index, NULL); + + if (IS_ERR(page)) { + printk(KERN_ERR "ufs_change_blocknr: " + "read_mapping_page error: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping == NULL)) { + /* Truncate got there first */ + unlock_page(page); + page_cache_release(page); + page = NULL; + goto out; + } + + if (!PageUptodate(page) || PageError(page)) { + unlock_page(page); + page_cache_release(page); + + printk(KERN_ERR "ufs_change_blocknr: " + "can not read page: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + + page = ERR_PTR(-EIO); + } + } +out: + return page; +} diff --git a/fs/ufs/util.h b/fs/ufs/util.h new file mode 100644 index 00000000..95417592 --- /dev/null +++ b/fs/ufs/util.h @@ -0,0 +1,592 @@ +/* + * linux/fs/ufs/util.h + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +#include +#include +#include "swab.h" + + +/* + * some useful macros + */ +#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) + +/* + * functions used for retyping + */ +static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi) +{ + return &cpi->c_ubh; +} +static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi) +{ + return &spi->s_ubh; +} + + + +/* + * macros used for accessing structures + */ +static inline s32 +ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3) +{ + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) + return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); + /* Fall Through to UFS_ST_SUN */ + case UFS_ST_SUN: + return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); + case UFS_ST_SUNx86: + return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); + case UFS_ST_44BSD: + default: + return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state); + } +} + +static inline void +ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3, s32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) { + usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); + break; + } + /* Fall Through to UFS_ST_SUN */ + case UFS_ST_SUN: + 
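+	/* Plain Sun (and later SunOS) superblocks keep fs_state in the
+	 * third superblock chunk; only the old 4.2-style SunOS layout
+	 * handled above uses the copy in the first chunk. */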
usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); + break; + case UFS_ST_SUNx86: + usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); + break; + case UFS_ST_44BSD: + usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value); + break; + } +} + +static inline u32 +ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3) +{ + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect); + else + return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); +} + +static inline u64 +ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3) +{ + __fs64 tmp; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + case UFS_ST_SUN: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1]; + break; + case UFS_ST_SUNx86: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1]; + break; + case UFS_ST_44BSD: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1]; + break; + } + + return fs64_to_cpu(sb, tmp); +} + +static inline u64 +ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3) +{ + __fs64 tmp; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + case UFS_ST_SUN: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1]; + break; + case UFS_ST_SUNx86: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1]; + break; + case UFS_ST_44BSD: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1]; + break; + } + + return fs64_to_cpu(sb, tmp); +} + +static inline u16 +ufs_get_de_namlen(struct super_block *sb, struct ufs_dir_entry *de) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) + return fs16_to_cpu(sb, de->d_u.d_namlen); + else + return de->d_u.d_44.d_namlen; /* XXX this seems wrong */ +} + +static inline void +ufs_set_de_namlen(struct super_block *sb, struct ufs_dir_entry *de, u16 value) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) + de->d_u.d_namlen = cpu_to_fs16(sb, value); + else + de->d_u.d_44.d_namlen = value; /* XXX this seems wrong */ +} + +static inline void +ufs_set_de_type(struct super_block *sb, struct ufs_dir_entry *de, int mode) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) != UFS_DE_44BSD) + return; + + /* + * TODO turn this into a table lookup + */ + switch (mode & S_IFMT) { + case S_IFSOCK: + de->d_u.d_44.d_type = DT_SOCK; + break; + case S_IFLNK: + de->d_u.d_44.d_type = DT_LNK; + break; + case S_IFREG: + de->d_u.d_44.d_type = DT_REG; + break; + case S_IFBLK: + de->d_u.d_44.d_type = DT_BLK; + break; + case S_IFDIR: + de->d_u.d_44.d_type = DT_DIR; + break; + case S_IFCHR: + de->d_u.d_44.d_type = DT_CHR; + break; + case S_IFIFO: + de->d_u.d_44.d_type = DT_FIFO; + break; + default: + de->d_u.d_44.d_type = DT_UNKNOWN; + } +} + +static inline u32 +ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_uid); + case UFS_UID_EFT: + if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid); + /* Fall through */ + default: + return fs16_to_cpu(sb, 
inode->ui_u1.oldids.ui_suid); + } +} + +static inline void +ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + inode->ui_u3.ui_44.ui_uid = cpu_to_fs32(sb, value); + inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); + break; + case UFS_UID_EFT: + inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); + if (value > 0xFFFF) + value = 0xFFFF; + /* Fall through */ + default: + inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); + break; + } +} + +static inline u32 +ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); + case UFS_UID_EFT: + if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); + /* Fall through */ + default: + return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); + } +} + +static inline void +ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + inode->ui_u3.ui_44.ui_gid = cpu_to_fs32(sb, value); + inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); + break; + case UFS_UID_EFT: + inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); + if (value > 0xFFFF) + value = 0xFFFF; + /* Fall through */ + default: + inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); + break; + } +} + +extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); +extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); +extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +/* + * These functions manipulate ufs buffers + */ +#define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size) +extern struct ufs_buffer_head * _ubh_bread_(struct ufs_sb_private_info *, struct super_block *, u64 , u64); +extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, struct super_block *, u64, u64); +extern void ubh_brelse (struct ufs_buffer_head *); +extern void ubh_brelse_uspi (struct ufs_sb_private_info *); +extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); +extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); +extern void ubh_sync_block(struct ufs_buffer_head *); +extern void ubh_bforget (struct ufs_buffer_head *); +extern int ubh_buffer_dirty (struct ufs_buffer_head *); +#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) +extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned); +#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size) +extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); + +/* This functions works with cache pages*/ +extern struct page *ufs_get_locked_page(struct address_space *mapping, + pgoff_t index); +static inline void ufs_put_locked_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + + +/* + * macros and inline function to get important structures from ufs_sb_private_info + */ + +static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, + unsigned int offset) +{ + unsigned int index; + + index = offset >> uspi->s_fshift; + offset &= ~uspi->s_fmask; + return uspi->s_ubh.bh[index]->b_data + offset; +} + +#define ubh_get_usb_first(uspi) \ + ((struct ufs_super_block_first *)get_usb_offset((uspi), 0)) + 
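+/*
+ * get_usb_offset() maps a byte offset within the on-disk superblock to an
+ * address inside the buffers held by uspi->s_ubh: the fragment index is
+ * offset >> s_fshift, the remainder is the byte offset within that fragment.
+ * For illustration, with 1 KiB fragments UFS_SECTOR_SIZE (512) is still in
+ * bh[0] at byte 512, while 2*UFS_SECTOR_SIZE starts bh[1] at byte 0.
+ */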
+#define ubh_get_usb_second(uspi) \ + ((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE)) + +#define ubh_get_usb_third(uspi) \ + ((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE)) + + +#define ubh_get_ucg(ubh) \ + ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data)) + + +/* + * Extract byte from ufs_buffer_head + * Extract the bits for a block from a map inside ufs_buffer_head + */ +#define ubh_get_addr8(ubh,begin) \ + ((u8*)(ubh)->bh[(begin) >> uspi->s_fshift]->b_data + \ + ((begin) & ~uspi->s_fmask)) + +#define ubh_get_addr16(ubh,begin) \ + (((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \ + ((begin) & ((uspi->fsize>>1) - 1))) + +#define ubh_get_addr32(ubh,begin) \ + (((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>2) - 1))) + +#define ubh_get_addr64(ubh,begin) \ + (((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>3) - 1))) + +#define ubh_get_addr ubh_get_addr8 + +static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + u64 blk) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return ubh_get_addr64(ubh, blk); + else + return ubh_get_addr32(ubh, blk); +} + +#define ubh_blkmap(ubh,begin,bit) \ + ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve. + */ +static inline u64 +ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +{ + return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree - + (uspi->s_dsize * (percentreserved) / 100); +} + +/* + * Macros to access cylinder group array structures + */ +#define ubh_cg_blktot(ucpi,cylno) \ + (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2)))) + +#define ubh_cg_blks(ucpi,cylno,rpos) \ + (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \ + (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) + +/* + * Bitmap operations + * These functions work like classical bitmap operations. + * The difference is that we don't have the whole bitmap + * in one contiguous chunk of memory, but in several buffers. + * The parameters of each function are super_block, ufs_buffer_head and + * position of the beginning of the bitmap. 
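+ *
+ * Illustrative use, assuming the usual UFS convention that a set bit in a
+ * cylinder group's free-fragment map means "free" (the c_freeoff name
+ * below just stands for the byte offset of that map):
+ *
+ *	if (ubh_isclr(UCPI_UBH(ucpi), ucpi->c_freeoff, fragno))
+ *		the fragment is still allocated;
+ *	ubh_setbit(UCPI_UBH(ucpi), ucpi->c_freeoff, fragno)
+ *		marks it free.
+ *
+ * ubh_get_addr() picks the right buffer head for each byte, so callers
+ * never see the split across buffers.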
+ */ +#define ubh_setbit(ubh,begin,bit) \ + (*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) |= (1 << ((bit) & 7))) + +#define ubh_clrbit(ubh,begin,bit) \ + (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) &= ~(1 << ((bit) & 7))) + +#define ubh_isset(ubh,begin,bit) \ + (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) & (1 << ((bit) & 7))) + +#define ubh_isclr(ubh,begin,bit) (!ubh_isset(ubh,begin,bit)) + +#define ubh_find_first_zero_bit(ubh,begin,size) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,0) + +#define ubh_find_next_zero_bit(ubh,begin,size,offset) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,offset) +static inline unsigned _ubh_find_next_zero_bit_( + struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, + unsigned begin, unsigned size, unsigned offset) +{ + unsigned base, count, pos; + + size -= offset; + begin <<= 3; + offset += begin; + base = offset >> uspi->s_bpfshift; + offset &= uspi->s_bpfmask; + for (;;) { + count = min_t(unsigned int, size + offset, uspi->s_bpf); + size -= count - offset; + pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset); + if (pos < count || !size) + break; + base++; + offset = 0; + } + return (base << uspi->s_bpfshift) + pos - begin; +} + +static inline unsigned find_last_zero_bit (unsigned char * bitmap, + unsigned size, unsigned offset) +{ + unsigned bit, i; + unsigned char * mapp; + unsigned char map; + + mapp = bitmap + (size >> 3); + map = *mapp--; + bit = 1 << (size & 7); + for (i = size; i > offset; i--) { + if ((map & bit) == 0) + break; + if ((i & 7) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << 7; + } + } + return i; +} + +#define ubh_find_last_zero_bit(ubh,begin,size,offset) _ubh_find_last_zero_bit_(uspi,ubh,begin,size,offset) +static inline unsigned _ubh_find_last_zero_bit_( + struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, + unsigned begin, unsigned start, unsigned end) +{ + unsigned base, count, pos, size; + + size = start - end; + begin <<= 3; + start += begin; + base = start >> uspi->s_bpfshift; + start &= uspi->s_bpfmask; + for (;;) { + count = min_t(unsigned int, + size + (uspi->s_bpf - start), uspi->s_bpf) + - (uspi->s_bpf - start); + size -= count; + pos = find_last_zero_bit (ubh->bh[base]->b_data, + start, start - count); + if (pos > start - count || !size) + break; + base--; + start = uspi->s_bpf; + } + return (base << uspi->s_bpfshift) + pos - begin; +} + +#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block)) + +#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block) +static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + return (*ubh_get_addr (ubh, begin + block) == 0xff); + case 4: + return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + case 2: + return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + case 1: + return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + } + return 0; +} + +#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block) +static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + *ubh_get_addr (ubh, begin + block) = 0x00; + return; + case 4: + *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2)); + return; + case 2: + *ubh_get_addr (ubh, begin + 
(block >> 2)) &= ~(0x03 << ((block & 0x03) << 1)); + return; + case 1: + *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07))); + return; + } +} + +#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block) +static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + *ubh_get_addr(ubh, begin + block) = 0xff; + return; + case 4: + *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2)); + return; + case 2: + *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1)); + return; + case 1: + *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07))); + return; + } +} + +static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, + __fs32 * fraglist, int cnt) +{ + struct ufs_sb_private_info * uspi; + unsigned fragsize, pos; + + uspi = UFS_SB(sb)->s_uspi; + + fragsize = 0; + for (pos = 0; pos < uspi->s_fpb; pos++) { + if (blockmap & (1 << pos)) { + fragsize++; + } + else if (fragsize > 0) { + fs32_add(sb, &fraglist[fragsize], cnt); + fragsize = 0; + } + } + if (fragsize > 0 && fragsize < uspi->s_fpb) + fs32_add(sb, &fraglist[fragsize], cnt); +} + +static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_inode_info *ufsi, + unsigned blk) +{ + BUG_ON(blk > UFS_TIND_BLOCK); + return uspi->fs_magic == UFS2_MAGIC ? + (void *)&ufsi->i_u1.u2_i_data[blk] : + (void *)&ufsi->i_u1.i_data[blk]; +} + +static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p) +{ + return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ? + fs64_to_cpu(sb, *(__fs64 *)p) : + fs32_to_cpu(sb, *(__fs32 *)p); +} + +static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val) +{ + if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = cpu_to_fs64(sb, val); + else + *(__fs32 *)p = cpu_to_fs32(sb, val); +} + +static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = 0; + else + *(__fs32 *)p = 0; +} + +static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return *(__fs64 *)p == 0; + else + return *(__fs32 *)p == 0; +} diff --git a/fs/utimes.c b/fs/utimes.c new file mode 100644 index 00000000..ba653f3d --- /dev/null +++ b/fs/utimes.c @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __ARCH_WANT_SYS_UTIME + +/* + * sys_utime() can be implemented in user-level using sys_utimes(). + * Is this for backwards compatibility? If so, why not move it + * into the appropriate arch directory (for those architectures that + * need it). + */ + +/* If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) +{ + struct timespec tv[2]; + + if (times) { + if (get_user(tv[0].tv_sec, ×->actime) || + get_user(tv[1].tv_sec, ×->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, times ? 
tv : NULL, 0); +} + +#endif + +static bool nsec_valid(long nsec) +{ + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) + return true; + + return nsec >= 0 && nsec <= 999999999; +} + +static int utimes_common(struct path *path, struct timespec *times) +{ + int error; + struct iattr newattrs; + struct inode *inode = path->dentry->d_inode; + + error = mnt_want_write(path->mnt); + if (error) + goto out; + + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; + + newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; + if (times) { + if (times[0].tv_nsec == UTIME_OMIT) + newattrs.ia_valid &= ~ATTR_ATIME; + else if (times[0].tv_nsec != UTIME_NOW) { + newattrs.ia_atime.tv_sec = times[0].tv_sec; + newattrs.ia_atime.tv_nsec = times[0].tv_nsec; + newattrs.ia_valid |= ATTR_ATIME_SET; + } + + if (times[1].tv_nsec == UTIME_OMIT) + newattrs.ia_valid &= ~ATTR_MTIME; + else if (times[1].tv_nsec != UTIME_NOW) { + newattrs.ia_mtime.tv_sec = times[1].tv_sec; + newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; + newattrs.ia_valid |= ATTR_MTIME_SET; + } + /* + * Tell inode_change_ok(), that this is an explicit time + * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET + * were used. + */ + newattrs.ia_valid |= ATTR_TIMES_SET; + } else { + /* + * If times is NULL (or both times are UTIME_NOW), + * then we need to check permissions, because + * inode_change_ok() won't do it. + */ + error = -EACCES; + if (IS_IMMUTABLE(inode)) + goto mnt_drop_write_and_out; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; + } + } + mutex_lock(&inode->i_mutex); + error = notify_change(path->dentry, &newattrs); + mutex_unlock(&inode->i_mutex); + +mnt_drop_write_and_out: + mnt_drop_write(path->mnt); +out: + return error; +} + +/* + * do_utimes - change times on filename or file descriptor + * @dfd: open file descriptor, -1 or AT_FDCWD + * @filename: path name or NULL + * @times: new times or NULL + * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) + * + * If filename is NULL and dfd refers to an open file, then operate on + * the file. Otherwise look up filename, possibly using dfd as a + * starting point. + * + * If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +long do_utimes(int dfd, const char __user *filename, struct timespec *times, + int flags) +{ + int error = -EINVAL; + + if (times && (!nsec_valid(times[0].tv_nsec) || + !nsec_valid(times[1].tv_nsec))) { + goto out; + } + + if (flags & ~AT_SYMLINK_NOFOLLOW) + goto out; + + if (filename == NULL && dfd != AT_FDCWD) { + struct file *file; + + if (flags & AT_SYMLINK_NOFOLLOW) + goto out; + + file = fget(dfd); + error = -EBADF; + if (!file) + goto out; + + error = utimes_common(&file->f_path, times); + fput(file); + } else { + struct path path; + int lookup_flags = 0; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = utimes_common(&path, times); + path_put(&path); + } + +out: + return error; +} + +SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, + struct timespec __user *, utimes, int, flags) +{ + struct timespec tstimes[2]; + + if (utimes) { + if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) + return -EFAULT; + + /* Nothing to do, we must not even check the path. 
*/ + if (tstimes[0].tv_nsec == UTIME_OMIT && + tstimes[1].tv_nsec == UTIME_OMIT) + return 0; + } + + return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); +} + +SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, + struct timeval __user *, utimes) +{ + struct timeval times[2]; + struct timespec tstimes[2]; + + if (utimes) { + if (copy_from_user(×, utimes, sizeof(times))) + return -EFAULT; + + /* This test is needed to catch all invalid values. If we + would test only in do_utimes we would miss those invalid + values truncated by the multiplication with 1000. Note + that we also catch UTIME_{NOW,OMIT} here which are only + valid for utimensat. */ + if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || + times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) + return -EINVAL; + + tstimes[0].tv_sec = times[0].tv_sec; + tstimes[0].tv_nsec = 1000 * times[0].tv_usec; + tstimes[1].tv_sec = times[1].tv_sec; + tstimes[1].tv_nsec = 1000 * times[1].tv_usec; + } + + return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); +} + +SYSCALL_DEFINE2(utimes, char __user *, filename, + struct timeval __user *, utimes) +{ + return sys_futimesat(AT_FDCWD, filename, utimes); +} diff --git a/fs/xattr.c b/fs/xattr.c new file mode 100644 index 00000000..f060663a --- /dev/null +++ b/fs/xattr.c @@ -0,0 +1,698 @@ +/* + File: fs/xattr.c + + Extended attribute handling. + + Copyright (C) 2001 by Andreas Gruenbacher + Copyright (C) 2001 SGI - Silicon Graphics, Inc + Copyright (c) 2004 Red Hat, Inc., James Morris + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Check permissions for extended attribute access. This is a bit complicated + * because different namespaces have very different rules. + */ +static int +xattr_permission(struct inode *inode, const char *name, int mask) +{ + /* + * We can never set or remove an extended attribute on a read-only + * filesystem or on an immutable / append-only inode. + */ + if (mask & MAY_WRITE) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * No restriction for security.* and system.* from the VFS. Decision + * on these is left to the underlying filesystem / security module. + */ + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return 0; + + /* + * The trusted.* namespace can only be accessed by privileged users. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + if (!capable(CAP_SYS_ADMIN)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + return 0; + } + + /* + * In the user.* namespace, only regular files and directories can have + * extended attributes. For sticky directories, only the owner and + * privileged users can write attributes. + */ + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && + (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) + return -EPERM; + } + + return inode_permission(inode, mask); +} + +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. 
+ * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error = -EOPNOTSUPP; + int issec = !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + + if (issec) + inode->i_flags &= ~S_NOSEC; + if (inode->i_op->setxattr) { + error = inode->i_op->setxattr(dentry, name, value, size, flags); + if (!error) { + fsnotify_xattr(dentry); + security_inode_post_setxattr(dentry, name, value, + size, flags); + } + } else if (issec) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(inode, suffix, value, + size, flags); + if (!error) + fsnotify_xattr(dentry); + } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + +out: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(vfs_setxattr); + +ssize_t +xattr_getsecurity(struct inode *inode, const char *name, void *value, + size_t size) +{ + void *buffer = NULL; + ssize_t len; + + if (!value || !size) { + len = security_inode_getsecurity(inode, name, &buffer, false); + goto out_noalloc; + } + + len = security_inode_getsecurity(inode, name, &buffer, true); + if (len < 0) + return len; + if (size < len) { + len = -ERANGE; + goto out; + } + memcpy(value, buffer, len); +out: + security_release_secctx(buffer, len); +out_noalloc: + return len; +} +EXPORT_SYMBOL_GPL(xattr_getsecurity); + +ssize_t +vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + error = security_inode_getxattr(dentry, name); + if (error) + return error; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + int ret = xattr_getsecurity(inode, suffix, value, size); + /* + * Only overwrite the return value if a security module + * is actually active. 
+ */ + if (ret == -EOPNOTSUPP) + goto nolsm; + return ret; + } +nolsm: + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, size); + else + error = -EOPNOTSUPP; + + return error; +} +EXPORT_SYMBOL_GPL(vfs_getxattr); + +ssize_t +vfs_listxattr(struct dentry *d, char *list, size_t size) +{ + ssize_t error; + + error = security_inode_listxattr(d); + if (error) + return error; + error = -EOPNOTSUPP; + if (d->d_inode->i_op->listxattr) { + error = d->d_inode->i_op->listxattr(d, list, size); + } else { + error = security_inode_listsecurity(d->d_inode, list, size); + if (size && error > size) + error = -ERANGE; + } + return error; +} +EXPORT_SYMBOL_GPL(vfs_listxattr); + +int +vfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (!inode->i_op->removexattr) + return -EOPNOTSUPP; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + error = security_inode_removexattr(dentry, name); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = inode->i_op->removexattr(dentry, name); + mutex_unlock(&inode->i_mutex); + + if (!error) + fsnotify_xattr(dentry); + return error; +} +EXPORT_SYMBOL_GPL(vfs_removexattr); + + +/* + * Extended attribute SET operations + */ +static long +setxattr(struct dentry *d, const char __user *name, const void __user *value, + size_t size, int flags) +{ + int error; + void *kvalue = NULL; + char kname[XATTR_NAME_MAX + 1]; + + if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) + return -EINVAL; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + if (size) { + if (size > XATTR_SIZE_MAX) + return -E2BIG; + kvalue = memdup_user(value, size); + if (IS_ERR(kvalue)) + return PTR_ERR(kvalue); + } + + error = vfs_setxattr(d, kname, kvalue, size, flags); + kfree(kvalue); + return error; +} + +SYSCALL_DEFINE5(setxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + struct path path; + int error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, + const void __user *,value, size_t, size, int, flags) +{ + struct file *f; + struct dentry *dentry; + int error = -EBADF; + + f = fget(fd); + if (!f) + return error; + dentry = f->f_path.dentry; + audit_inode(NULL, dentry); + error = mnt_want_write_file(f); + if (!error) { + error = setxattr(dentry, name, value, size, flags); + mnt_drop_write(f->f_path.mnt); + } + fput(f); + return error; +} + +/* + * Extended attribute GET operations + */ +static ssize_t +getxattr(struct dentry *d, const char __user *name, void __user *value, + size_t size) +{ + ssize_t error; + void *kvalue = NULL; + char kname[XATTR_NAME_MAX + 1]; + + error = strncpy_from_user(kname, 
name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + if (size) { + if (size > XATTR_SIZE_MAX) + size = XATTR_SIZE_MAX; + kvalue = kzalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; + } + + error = vfs_getxattr(d, kname, kvalue, size); + if (error > 0) { + if (size && copy_to_user(value, kvalue, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + /* The file system tried to returned a value bigger + than XATTR_SIZE_MAX bytes. Not possible. */ + error = -E2BIG; + } + kfree(kvalue); + return error; +} + +SYSCALL_DEFINE4(getxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + struct path path; + ssize_t error; + + error = user_path(pathname, &path); + if (error) + return error; + error = getxattr(path.dentry, name, value, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + struct path path; + ssize_t error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = getxattr(path.dentry, name, value, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, + void __user *, value, size_t, size) +{ + struct file *f; + ssize_t error = -EBADF; + + f = fget(fd); + if (!f) + return error; + audit_inode(NULL, f->f_path.dentry); + error = getxattr(f->f_path.dentry, name, value, size); + fput(f); + return error; +} + +/* + * Extended attribute LIST operations + */ +static ssize_t +listxattr(struct dentry *d, char __user *list, size_t size) +{ + ssize_t error; + char *klist = NULL; + + if (size) { + if (size > XATTR_LIST_MAX) + size = XATTR_LIST_MAX; + klist = kmalloc(size, GFP_KERNEL); + if (!klist) + return -ENOMEM; + } + + error = vfs_listxattr(d, klist, size); + if (error > 0) { + if (size && copy_to_user(list, klist, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { + /* The file system tried to returned a list bigger + than XATTR_LIST_MAX bytes. Not possible. 
*/ + error = -E2BIG; + } + kfree(klist); + return error; +} + +SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + struct path path; + ssize_t error; + + error = user_path(pathname, &path); + if (error) + return error; + error = listxattr(path.dentry, list, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + struct path path; + ssize_t error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = listxattr(path.dentry, list, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) +{ + struct file *f; + ssize_t error = -EBADF; + + f = fget(fd); + if (!f) + return error; + audit_inode(NULL, f->f_path.dentry); + error = listxattr(f->f_path.dentry, list, size); + fput(f); + return error; +} + +/* + * Extended attribute REMOVE operations + */ +static long +removexattr(struct dentry *d, const char __user *name) +{ + int error; + char kname[XATTR_NAME_MAX + 1]; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + return vfs_removexattr(d, kname); +} + +SYSCALL_DEFINE2(removexattr, const char __user *, pathname, + const char __user *, name) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, + const char __user *, name) +{ + struct path path; + int error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) +{ + struct file *f; + struct dentry *dentry; + int error = -EBADF; + + f = fget(fd); + if (!f) + return error; + dentry = f->f_path.dentry; + audit_inode(NULL, dentry); + error = mnt_want_write_file(f); + if (!error) { + error = removexattr(dentry, name); + mnt_drop_write(f->f_path.mnt); + } + fput(f); + return error; +} + + +static const char * +strcmp_prefix(const char *a, const char *a_prefix) +{ + while (*a_prefix && *a == *a_prefix) { + a++; + a_prefix++; + } + return *a_prefix ? NULL : a; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. + */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* + * Find the xattr_handler with the matching prefix. 
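+ *
+ * For example, with a handler whose ->prefix is "user." hanging off
+ * sb->s_xattr, resolving the (illustrative) name "user.mime_type"
+ * returns that handler and advances *name to the remaining suffix
+ * "mime_type".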
+ */ +static const struct xattr_handler * +xattr_resolve_name(const struct xattr_handler **handlers, const char **name) +{ + const struct xattr_handler *handler; + + if (!*name) + return NULL; + + for_each_xattr_handler(handlers, handler) { + const char *n = strcmp_prefix(*name, handler->prefix); + if (n) { + *name = n; + break; + } + } + return handler; +} + +/* + * Find the handler for the prefix and dispatch its get() operation. + */ +ssize_t +generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Combine the results of the list() operation from every xattr_handler in the + * list. + */ +ssize_t +generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + unsigned int size = 0; + + if (!buffer) { + for_each_xattr_handler(handlers, handler) { + size += handler->list(dentry, NULL, 0, NULL, 0, + handler->flags); + } + } else { + char *buf = buffer; + + for_each_xattr_handler(handlers, handler) { + size = handler->list(dentry, buf, buffer_size, + NULL, 0, handler->flags); + if (size > buffer_size) + return -ERANGE; + buf += size; + buffer_size -= size; + } + size = buf - buffer; + } + return size; +} + +/* + * Find the handler for the prefix and dispatch its set() operation. + */ +int +generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +{ + const struct xattr_handler *handler; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Find the handler for the prefix and dispatch its set() operation to remove + * any associated extended attribute. + */ +int +generic_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->set(dentry, name, NULL, 0, + XATTR_REPLACE, handler->flags); +} + +EXPORT_SYMBOL(generic_getxattr); +EXPORT_SYMBOL(generic_listxattr); +EXPORT_SYMBOL(generic_setxattr); +EXPORT_SYMBOL(generic_removexattr); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c new file mode 100644 index 00000000..8d5a506c --- /dev/null +++ b/fs/xattr_acl.c @@ -0,0 +1,98 @@ +/* + * linux/fs/xattr_acl.c + * + * Almost all from linux/fs/ext2/acl.c: + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include + + +/* + * Convert from extended attribute to in-memory representation. 
+ */ +struct posix_acl * +posix_acl_from_xattr(const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + acl_e->e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + acl_e->e_id = le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + ext_entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + ext_entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + ext_entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig new file mode 100644 index 00000000..6100ec0f --- /dev/null +++ b/fs/xfs/Kconfig @@ -0,0 +1,82 @@ +config XFS_FS + tristate "XFS filesystem support" + depends on BLOCK + select EXPORTFS + help + XFS is a high performance journaling filesystem which originated + on the SGI IRIX platform. It is completely multi-threaded, can + support large files and large filesystems, extended attributes, + variable block sizes, is extent based, and makes extensive use of + Btrees (directories, extents, free space) to aid both performance + and scalability. + + Refer to the documentation at + for complete details. This implementation is on-disk compatible + with the IRIX version of XFS. + + To compile this file system support as a module, choose M here: the + module will be called xfs. Be aware, however, that if the file + system of your root partition is compiled as a module, you'll need + to use an initial ramdisk (initrd) to boot. + +config XFS_QUOTA + bool "XFS Quota support" + depends on XFS_FS + select QUOTACTL + help + If you say Y here, you will be able to set limits for disk usage on + a per user and/or a per group basis under XFS. XFS considers quota + information as filesystem metadata and uses journaling to provide a + higher level guarantee of consistency. 
The on-disk data format for + quota is also compatible with the IRIX version of XFS, allowing a + filesystem to be migrated between Linux and IRIX without any need + for conversion. + + If unsure, say N. More comprehensive documentation can be found in + README.quota in the xfsprogs package. XFS quota can be used either + with or without the generic quota support enabled (CONFIG_QUOTA) - + they are completely independent subsystems. + +config XFS_POSIX_ACL + bool "XFS POSIX ACL support" + depends on XFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +config XFS_RT + bool "XFS Realtime subvolume support" + depends on XFS_FS + help + If you say Y here you will be able to mount and use XFS filesystems + which contain a realtime subvolume. The realtime subvolume is a + separate area of disk space where only file data is stored. It was + originally designed to provide deterministic data rates suitable + for media streaming applications, but is also useful as a generic + mechanism for ensuring data and metadata/log I/Os are completely + separated. Regular file I/Os are isolated to a separate device + from all other requests, and this can be done quite transparently + to applications via the inherit-realtime directory inode flag. + + See the xfs man page in section 5 for additional information. + + If unsure, say N. + +config XFS_DEBUG + bool "XFS Debugging support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile new file mode 100644 index 00000000..284a7c89 --- /dev/null +++ b/fs/xfs/Makefile @@ -0,0 +1,111 @@ +# +# Copyright (c) 2000-2005 Silicon Graphics, Inc. +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# + +ccflags-y := -I$(src) -I$(src)/linux-2.6 +ccflags-$(CONFIG_XFS_DEBUG) += -g + +XFS_LINUX := linux-2.6 + +obj-$(CONFIG_XFS_FS) += xfs.o + +xfs-y += linux-2.6/xfs_trace.o + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ + xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o) +xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o + +ifeq ($(CONFIG_XFS_QUOTA),y) +xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o +endif + +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o +xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o +xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o + + +xfs-y += xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_bit.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_buf_item.o \ + xfs_da_btree.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_error.o \ + xfs_extfree_item.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_iget.o \ + xfs_inode.o \ + xfs_inode_item.o \ + xfs_iomap.o \ + xfs_itable.o \ + xfs_dfrag.o \ + xfs_log.o \ + xfs_log_cil.o \ + xfs_log_recover.o \ + xfs_mount.o \ + xfs_mru_cache.o \ + xfs_rename.o \ + xfs_trans.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + xfs_utils.o \ + xfs_vnodeops.o \ + xfs_rw.o + +xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o + +# Objects in linux/ +xfs-y += $(addprefix $(XFS_LINUX)/, \ + kmem.o \ + xfs_aops.o \ + xfs_buf.o \ + xfs_discard.o \ + xfs_export.o \ + xfs_file.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_ioctl.o \ + xfs_iops.o \ + xfs_message.o \ + xfs_super.o \ + xfs_sync.o \ + xfs_xattr.o) + +# Objects in support/ +xfs-y += support/uuid.o diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c new file mode 100644 index 00000000..a907de56 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "time.h" +#include "kmem.h" +#include "xfs_message.h" + +/* + * Greedy allocation. May fail and may return vmalloced memory. + * + * Must be freed using kmem_free_large. 
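+ *
+ * Illustrative use (the sizes are made up): ask for up to sixteen pages
+ * but accept as little as one:
+ *
+ *	size_t size;
+ *	void *buf = kmem_zalloc_greedy(&size, PAGE_SIZE, 16 * PAGE_SIZE);
+ *
+ * On success *size holds the amount actually obtained; buf is released
+ * later with kmem_free_large(buf).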
+ */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = kmem_zalloc_large(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} + +void * +kmem_alloc(size_t size, unsigned int __nocast flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmalloc(size, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} + +void * +kmem_zalloc(size_t size, unsigned int __nocast flags) +{ + void *ptr; + + ptr = kmem_alloc(size, flags); + if (ptr) + memset((char *)ptr, 0, (int)size); + return ptr; +} + +void +kmem_free(const void *ptr) +{ + if (!is_vmalloc_addr(ptr)) { + kfree(ptr); + } else { + vfree(ptr); + } +} + +void * +kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, + unsigned int __nocast flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + if (new) + memcpy(new, ptr, + ((oldsize < newsize) ? oldsize : newsize)); + kmem_free(ptr); + } + return new; +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmem_cache_alloc(zone, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} + +void * +kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) +{ + void *ptr; + + ptr = kmem_zone_alloc(zone, flags); + if (ptr) + memset((char *)ptr, 0, kmem_cache_size(zone)); + return ptr; +} diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h new file mode 100644 index 00000000..f7c8f7a9 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include +#include +#include +#include + +/* + * General memory allocation interfaces + */ + +#define KM_SLEEP 0x0001u +#define KM_NOSLEEP 0x0002u +#define KM_NOFS 0x0004u +#define KM_MAYFAIL 0x0008u + +/* + * We use a special process flag to avoid recursive callbacks into + * the filesystem during transactions. We will also issue our own + * warnings, so we explicitly skip any generic ones (silly of us). 
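+ *
+ * In short (see kmem_flags_convert() below): KM_NOSLEEP maps to
+ * GFP_ATOMIC, everything else to GFP_KERNEL; __GFP_NOWARN is always set;
+ * and __GFP_FS is cleared for KM_NOFS allocations or when the task is
+ * already inside a transaction (PF_FSTRANS).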
+ */ +static inline gfp_t +kmem_flags_convert(unsigned int __nocast flags) +{ + gfp_t lflags; + + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + + if (flags & KM_NOSLEEP) { + lflags = GFP_ATOMIC | __GFP_NOWARN; + } else { + lflags = GFP_KERNEL | __GFP_NOWARN; + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + } + return lflags; +} + +extern void *kmem_alloc(size_t, unsigned int __nocast); +extern void *kmem_zalloc(size_t, unsigned int __nocast); +extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void kmem_free(const void *); + +static inline void *kmem_zalloc_large(size_t size) +{ + void *ptr; + + ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; +} +static inline void kmem_free_large(void *ptr) +{ + vfree(ptr); +} + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + +/* + * Zone interfaces + */ + +#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN +#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT +#define KM_ZONE_SPREAD SLAB_MEM_SPREAD + +#define kmem_zone kmem_cache +#define kmem_zone_t struct kmem_cache + +static inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL); +} + +static inline kmem_zone_t * +kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, + void (*construct)(void *)) +{ + return kmem_cache_create(zone_name, size, 0, flags, construct); +} + +static inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +static inline void +kmem_zone_destroy(kmem_zone_t *zone) +{ + if (zone) + kmem_cache_destroy(zone); +} + +extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); +extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); + +static inline int +kmem_shake_allow(gfp_t gfp_mask) +{ + return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h new file mode 100644 index 00000000..ff6a1987 --- /dev/null +++ b/fs/xfs/linux-2.6/mrlock.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include + +typedef struct { + struct rw_semaphore mr_lock; +#ifdef DEBUG + int mr_writer; +#endif +} mrlock_t; + +#ifdef DEBUG +#define mrinit(mrp, name) \ + do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + +#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) +#define mrfree(mrp) do { } while (0) + +static inline void mraccess_nested(mrlock_t *mrp, int subclass) +{ + down_read_nested(&mrp->mr_lock, subclass); +} + +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) +{ + down_write_nested(&mrp->mr_lock, subclass); +#ifdef DEBUG + mrp->mr_writer = 1; +#endif +} + +static inline int mrtryaccess(mrlock_t *mrp) +{ + return down_read_trylock(&mrp->mr_lock); +} + +static inline int mrtryupdate(mrlock_t *mrp) +{ + if (!down_write_trylock(&mrp->mr_lock)) + return 0; +#ifdef DEBUG + mrp->mr_writer = 1; +#endif + return 1; +} + +static inline void mrunlock_excl(mrlock_t *mrp) +{ +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); +} + +static inline void mrunlock_shared(mrlock_t *mrp) +{ + up_read(&mrp->mr_lock); +} + +static inline void mrdemote(mrlock_t *mrp) +{ +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + downgrade_write(&mrp->mr_lock); +} + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h new file mode 100644 index 00000000..387e695a --- /dev/null +++ b/fs/xfs/linux-2.6/time.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_TIME_H__ +#define __XFS_SUPPORT_TIME_H__ + +#include +#include + +typedef struct timespec timespec_t; + +static inline void delay(long ticks) +{ + schedule_timeout_uninterruptible(ticks); +} + +static inline void nanotime(struct timespec *tvp) +{ + *tvp = CURRENT_TIME; +} + +#endif /* __XFS_SUPPORT_TIME_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c new file mode 100644 index 00000000..f86e0348 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" +#include +#include +#include + + +/* + * Locking scheme: + * - all ACL updates are protected by inode->i_mutex, which is taken before + * calling into this file. + */ + +STATIC struct posix_acl * +xfs_acl_from_disk(struct xfs_acl *aclp) +{ + struct posix_acl_entry *acl_e; + struct posix_acl *acl; + struct xfs_acl_entry *ace; + unsigned int count, i; + + count = be32_to_cpu(aclp->acl_cnt); + if (count > XFS_ACL_MAX_ENTRIES) + return ERR_PTR(-EFSCORRUPTED); + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + acl_e = &acl->a_entries[i]; + ace = &aclp->acl_entry[i]; + + /* + * The tag is 32 bits on disk and 16 bits in core. + * + * Because every access to it goes through the core + * format first this is not a problem. + */ + acl_e->e_tag = be32_to_cpu(ace->ae_tag); + acl_e->e_perm = be16_to_cpu(ace->ae_perm); + + switch (acl_e->e_tag) { + case ACL_USER: + case ACL_GROUP: + acl_e->e_id = be32_to_cpu(ace->ae_id); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + acl_e->e_id = ACL_UNDEFINED_ID; + break; + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +STATIC void +xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) +{ + const struct posix_acl_entry *acl_e; + struct xfs_acl_entry *ace; + int i; + + aclp->acl_cnt = cpu_to_be32(acl->a_count); + for (i = 0; i < acl->a_count; i++) { + ace = &aclp->acl_entry[i]; + acl_e = &acl->a_entries[i]; + + ace->ae_tag = cpu_to_be32(acl_e->e_tag); + ace->ae_id = cpu_to_be32(acl_e->e_id); + ace->ae_perm = cpu_to_be16(acl_e->e_perm); + } +} + +struct posix_acl * +xfs_get_acl(struct inode *inode, int type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct posix_acl *acl; + struct xfs_acl *xfs_acl; + int len = sizeof(struct xfs_acl); + unsigned char *ea_name; + int error; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + ea_name = SGI_ACL_DEFAULT; + break; + default: + BUG(); + } + + /* + * If we have a cached ACLs value just return it, not need to + * go out to the disk. + */ + + xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + if (!xfs_acl) + return ERR_PTR(-ENOMEM); + + error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); + if (error) { + /* + * If the attribute doesn't exist make sure we have a negative + * cache entry, for any other error assume it is transient and + * leave the cache entry as ACL_NOT_CACHED. 
+ */ + if (error == -ENOATTR) { + acl = NULL; + goto out_update_cache; + } + goto out; + } + + acl = xfs_acl_from_disk(xfs_acl); + if (IS_ERR(acl)) + goto out; + + out_update_cache: + set_cached_acl(inode, type, acl); + out: + kfree(xfs_acl); + return acl; +} + +STATIC int +xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct xfs_inode *ip = XFS_I(inode); + unsigned char *ea_name; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + ea_name = SGI_ACL_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + struct xfs_acl *xfs_acl; + int len; + + xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + if (!xfs_acl) + return -ENOMEM; + + xfs_acl_to_disk(xfs_acl, acl); + len = sizeof(struct xfs_acl) - + (sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, + len, ATTR_ROOT); + + kfree(xfs_acl); + } else { + /* + * A NULL ACL argument means we want to remove the ACL. + */ + error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; + } + + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +int +xfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct xfs_inode *ip; + struct posix_acl *acl; + int error = -EAGAIN; + + ip = XFS_I(inode); + trace_xfs_check_acl(ip); + + /* + * If there is no attribute fork no ACL exists on this inode and + * we can skip the whole exercise. + */ + if (!XFS_IFORK_Q(ip)) + return -EAGAIN; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +static int +xfs_set_mode(struct inode *inode, mode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); + + error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + } + + return error; +} + +static int +xfs_acl_exists(struct inode *inode, unsigned char *name) +{ + int len = sizeof(struct xfs_acl); + + return (xfs_attr_get(XFS_I(inode), name, NULL, &len, + ATTR_ROOT|ATTR_KERNOVAL) == 0); +} + +int +posix_acl_access_exists(struct inode *inode) +{ + return xfs_acl_exists(inode, SGI_ACL_FILE); +} + +int +posix_acl_default_exists(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode)) + return 0; + return xfs_acl_exists(inode, SGI_ACL_DEFAULT); +} + +/* + * No need for i_mutex because the inode is not yet exposed to the VFS. 
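+ *
+ * The create-time inheritance below follows the usual POSIX ACL pattern:
+ * store the inherited default ACL on a new directory, mask a copy of it
+ * against the new inode's mode (posix_acl_create_masq), write the
+ * adjusted mode back, and store an access ACL only if permissions remain
+ * that the mode bits cannot express.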
+ */ +int +xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl) +{ + struct posix_acl *clone; + mode_t mode; + int error = 0, inherit = 0; + + if (S_ISDIR(inode->i_mode)) { + error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl); + if (error) + return error; + } + + clone = posix_acl_clone(default_acl, GFP_KERNEL); + if (!clone) + return -ENOMEM; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error < 0) + goto out_release_clone; + + /* + * If posix_acl_create_masq returns a positive value we need to + * inherit a permission that can't be represented using the Unix + * mode bits and we actually need to set an ACL. + */ + if (error > 0) + inherit = 1; + + error = xfs_set_mode(inode, mode); + if (error) + goto out_release_clone; + + if (inherit) + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + + out_release_clone: + posix_acl_release(clone); + return error; +} + +int +xfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + + posix_acl_release(clone); + return error; +} + +static int +xfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + acl = xfs_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +xfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int error = 0; + + if (flags & XATTR_CREATE) + return -EINVAL; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (!value) + goto set_acl; + + acl = posix_acl_from_xattr(value, size); + if (!acl) { + /* + * acl_set_file(3) may request that we set default ACLs with + * zero length -- defend (gracefully) against that here. 
+ */ + goto out; + } + if (IS_ERR(acl)) { + error = PTR_ERR(acl); + goto out; + } + + error = posix_acl_valid(acl); + if (error) + goto out_release; + + error = -EINVAL; + if (acl->a_count > XFS_ACL_MAX_ENTRIES) + goto out_release; + + if (type == ACL_TYPE_ACCESS) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + posix_acl_release(acl); + acl = NULL; + + if (error < 0) + return error; + } + + error = xfs_set_mode(inode, mode); + if (error) + goto out_release; + } + + set_acl: + error = xfs_set_acl(inode, type, acl); + out_release: + posix_acl_release(acl); + out: + return error; +} + +const struct xattr_handler xfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; + +const struct xattr_handler xfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c new file mode 100644 index 00000000..79ce38be --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_trans.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_iomap.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include +#include +#include +#include + + +/* + * Prime number of hash buckets since address is used as the key. 
+ */ +#define NVSYNC 37 +#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) +static wait_queue_head_t xfs_ioend_wq[NVSYNC]; + +void __init +xfs_ioend_init(void) +{ + int i; + + for (i = 0; i < NVSYNC; i++) + init_waitqueue_head(&xfs_ioend_wq[i]); +} + +void +xfs_ioend_wait( + xfs_inode_t *ip) +{ + wait_queue_head_t *wq = to_ioend_wq(ip); + + wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); +} + +STATIC void +xfs_ioend_wake( + xfs_inode_t *ip) +{ + if (atomic_dec_and_test(&ip->i_iocount)) + wake_up(to_ioend_wq(ip)); +} + +void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; +} + +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ +STATIC void +xfs_destroy_ioend( + xfs_ioend_t *ioend) +{ + struct buffer_head *bh, *next; + struct xfs_inode *ip = XFS_I(ioend->io_inode); + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, !ioend->io_error); + } + + /* + * Volume managers supporting multiple paths can send back ENODEV + * when the final path disappears. In this case continuing to fill + * the page cache with dirty data which cannot be written out is + * evil, so prevent that. + */ + if (unlikely(ioend->io_error == -ENODEV)) { + xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, + __FILE__, __LINE__); + } + + xfs_ioend_wake(ip); + mempool_free(ioend, xfs_ioend_pool); +} + +/* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + +/* + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. 
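A minimal user-space sketch of the wait-queue hashing used by to_ioend_wq() above: the modulo-37 bucket choice is taken from the patch, while the slab-like array and its 512-byte stride are invented for illustration. Because 37 is prime, objects allocated at a regular stride still land in distinct buckets, whereas a power-of-two bucket count such as 32 would put these all in the same one.

/* toy model of to_ioend_wq(): hash an object's address into one of 37 buckets */
#include <stdio.h>

#define NVSYNC 37   /* prime, as in the patch, so regularly spaced addresses spread out */

static unsigned int bucket(const void *p)
{
    return (unsigned int)((unsigned long)p % NVSYNC);
}

int main(void)
{
    /* pretend these are inodes allocated from the same slab, 512 bytes apart */
    static char slab[8][512];
    int i;

    for (i = 0; i < 8; i++)
        printf("object %d -> wait-queue bucket %u\n", i, bucket(&slab[i]));
    return 0;
}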
+ */ +STATIC int +xfs_setfilesize( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + + if (unlikely(ioend->io_error)) + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; + + isize = xfs_ioend_new_eof(ioend); + if (isize) { + ip->i_d.di_size = isize; + xfs_mark_inode_dirty(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + if (ioend->io_type == IO_UNWRITTEN) + queue_work(xfsconvertd_workqueue, &ioend->io_work); + else + queue_work(xfsdatad_workqueue, &ioend->io_work); + } +} + +/* + * IO write completion. + */ +STATIC void +xfs_end_io( + struct work_struct *work) +{ + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; + + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extents after the data I/O has finished. + */ + if (ioend->io_type == IO_UNWRITTEN && + likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { + + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + if (error) + ioend->io_error = error; + } + + /* + * We might have to update the on-disk file size after extending + * writes. + */ + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); + xfs_destroy_ioend(ioend); + } +} + +/* + * Call IO completion handling in caller context on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend_sync( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) + xfs_end_io(&ioend->io_work); +} + +/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later + * (vs. incore size). + */ +STATIC xfs_ioend_t * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type) +{ + xfs_ioend_t *ioend; + + ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); + + /* + * Set the count to 1 initially, which will prevent the I/O + * completion callback from running before we have started + * all the I/O, i.e. from calling the completion routine too early.
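A single-threaded toy model of this io_remaining scheme (the real code uses atomic_t and atomic_dec_and_test(); the struct and names below are invented): the count starts at 1 so that a bio completing immediately cannot drive it to zero while further bios for the same ioend are still being built.

/* toy model of the io_remaining scheme: hold an extra reference while bios
 * are being built so the ioend cannot complete before submission finishes. */
#include <stdio.h>

struct ioend { int remaining; int done; };

static void finish(struct ioend *io)        /* xfs_finish_ioend() analogue */
{
    if (--io->remaining == 0) {
        io->done = 1;
        printf("ioend completed\n");
    }
}

int main(void)
{
    struct ioend io = { .remaining = 1 };    /* initial hold, as in xfs_alloc_ioend() */
    int i;

    for (i = 0; i < 3; i++) {
        io.remaining++;                      /* one reference per bio submitted */
        finish(&io);                         /* bio completes, possibly immediately */
        printf("after bio %d: remaining=%d done=%d\n", i, io.remaining, io.done);
    }
    finish(&io);    /* drop the initial hold once everything is submitted */
    return 0;
}

Without the initial bias the first completion would reach zero and run the completion handler while later bios for the same ioend were still being assembled.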
+ */ + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_list = NULL; + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; + atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); + ioend->io_offset = 0; + ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; + + INIT_WORK(&ioend->io_work, xfs_end_io); + return ioend; +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + struct xfs_bmbt_irec *imap, + int type, + int nonblocking) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (type == IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -XFS_ERROR(EAGAIN); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_maxioffset); + + if (offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + bmapi_flags, NULL, 0, imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return -XFS_ERROR(error); + + if (type == IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, count, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return -XFS_ERROR(error); + } + +#ifdef DEBUG + if (type == IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; +} + +STATIC int +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; +} + +/* + * BIO completion handler for buffered IO. + */ +STATIC void +xfs_end_bio( + struct bio *bio, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + + /* Toss bio and pass work off to an xfsdatad thread */ + bio->bi_private = NULL; + bio->bi_end_io = NULL; + bio_put(bio); + + xfs_finish_ioend(ioend); +} + +STATIC void +xfs_submit_ioend_bio( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); + + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE, bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + + ASSERT(bio->bi_private == NULL); + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( + struct page *page, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + if (clear_dirty) + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) + end_page_writeback(page); +} + +static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, covering the + * initial writepage page and also any probed pages. + * + * Because we may have multiple ioends spanning a page, we need to start + * writeback on all the buffers before we submit them for I/O. If we mark the + * buffers as we got, then we can end up with a page that only has buffers + * marked async write and I/O complete on can occur before we mark the other + * buffers async write. + * + * The end result of this is that we trip a bug in end_page_writeback() because + * we call it twice for the one page as the code in end_buffer_async_write() + * assumes that all buffers on the page are started at the same time. + * + * The fix is two passes across the ioend list - one to start writeback on the + * buffer_heads, and then submit them for I/O on the second pass. + */ +STATIC void +xfs_submit_ioend( + struct writeback_control *wbc, + xfs_ioend_t *ioend) +{ + xfs_ioend_t *head = ioend; + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + /* Pass 1 - start writeback */ + do { + next = ioend->io_list; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) + xfs_start_buffer_writeback(bh); + } while ((ioend = next) != NULL); + + /* Pass 2 - submit I/O */ + ioend = head; + do { + next = ioend->io_list; + bio = NULL; + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + if (bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this endio. + * Toss the endio too. Only ever called for the initial page + * in a writepage request, so only ever one page. 
+ */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + xfs_ioend_wake(XFS_I(ioend->io_inode)); + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * Return true if we've finished the given ioend. + */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + xfs_off_t offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) +{ + xfs_ioend_t *ioend = *result; + + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; + + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } + + bh->b_private = NULL; + ioend->io_size += bh->b_size; +} + +STATIC void +xfs_map_buffer( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); + + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); + + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); + + bh->b_blocknr = bn; + set_buffer_mapped(bh); +} + +STATIC void +xfs_map_at_offset( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_buffer(inode, bh, imap, offset); + set_buffer_mapped(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); +} + +/* + * Test if a given page is suitable for writing as part of an unwritten + * or delayed allocate extent. + */ +STATIC int +xfs_is_delayed_page( + struct page *page, + unsigned int type) +{ + if (PageWriteback(page)) + return 0; + + if (page->mapping && page_has_buffers(page)) { + struct buffer_head *bh, *head; + int acceptable = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + acceptable = (type == IO_UNWRITTEN); + else if (buffer_delay(bh)) + acceptable = (type == IO_DELALLOC); + else if (buffer_dirty(bh) && buffer_mapped(bh)) + acceptable = (type == IO_OVERWRITE); + else + break; + } while ((bh = bh->b_this_page) != head); + + if (acceptable) + return 1; + } + + return 0; +} + +/* + * Allocate & map buffers for page given the extent map. Write it out. + * except for the original page of a writepage, this is called on + * delalloc/unwritten pages only, for the original page it is possible + * that the page has no mapping at all. 
+ */ +STATIC int +xfs_convert_page( + struct inode *inode, + struct page *page, + loff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + xfs_off_t end_offset; + unsigned long p_offset; + unsigned int type; + int len, page_dirty; + int count = 0, done = 0, uptodate = 1; + xfs_off_t offset = page_offset(page); + + if (page->index != tindex) + goto fail; + if (!trylock_page(page)) + goto fail; + if (PageWriteback(page)) + goto fail_unlock_page; + if (page->mapping != inode->i_mapping) + goto fail_unlock_page; + if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + goto fail_unlock_page; + + /* + * page_dirty is initially a count of buffers on the page before + * EOF and is decremented as we move each into a cleanable state. + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. + */ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + i_size_read(inode)); + + len = 1 << inode->i_blkbits; + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; + page_dirty = p_offset / len; + + bh = head = page_buffers(page); + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh))) { + done = 1; + continue; + } + + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { + if (buffer_unwritten(bh)) + type = IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = IO_DELALLOC; + else + type = IO_OVERWRITE; + + if (!xfs_imap_valid(inode, imap, offset)) { + done = 1; + continue; + } + + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); + + page_dirty--; + count++; + } else { + done = 1; + } + } while (offset += len, (bh = bh->b_this_page) != head); + + if (uptodate && bh == head) + SetPageUptodate(page); + + if (count) { + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) + done = 1; + } + xfs_start_page_writeback(page, !page_dirty, count); + + return done; + fail_unlock_page: + unlock_page(page); + fail: + return 1; +} + +/* + * Convert & write out a cluster of pages in the same extent as defined + * by mp and following the start page. 
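A worked example of the page_dirty derivation above, for an assumed geometry of 4096-byte pages and 1024-byte blocks with i_size = 10000 and page index 2 (all values invented for illustration):

/* worked example of the page_dirty calculation in xfs_convert_page() */
#include <stdio.h>

int main(void)
{
    unsigned long long page_size = 4096, blocksize = 1024;   /* assumed geometry */
    unsigned long long i_size = 10000;                        /* assumed file size */
    unsigned long long index = 2;                             /* last, partial page */
    unsigned long long end_offset, p_offset, page_dirty;

    end_offset = (index + 1) * page_size;
    if (end_offset > i_size)
        end_offset = i_size;                                  /* -> 10000 */

    p_offset = end_offset & (page_size - 1);                  /* 10000 & 4095 = 1808 */
    if (p_offset)
        p_offset = (p_offset + blocksize - 1) / blocksize * blocksize; /* roundup -> 2048 */
    else
        p_offset = page_size;

    page_dirty = p_offset / blocksize;                        /* 2 buffers before EOF */
    printf("end_offset=%llu p_offset=%llu page_dirty=%llu\n",
           end_offset, p_offset, page_dirty);
    return 0;
}

Bytes 8192..9999 of the file fall on this page and cover two 1024-byte buffers, so page_dirty starts at 2 and is decremented as each of those buffers is moved into a cleanable state.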
+ */ +STATIC void +xfs_cluster_write( + struct inode *inode, + pgoff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc, + pgoff_t tlast) +{ + struct pagevec pvec; + int done = 0, i; + + pagevec_init(&pvec, 0); + while (!done && tindex <= tlast) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + done = xfs_convert_page(inode, pvec.pages[i], tindex++, + imap, ioendp, wbc); + if (done) + break; + } + + pagevec_release(&pvec); + cond_resched(); + } +} + +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + + if (!xfs_is_delayed_page(page, IO_DELALLOC)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_alert(ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int error; + xfs_fileoff_t start_fsb; + + if (!buffer_delay(bh)) + goto next_buffer; + + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += 1 << inode->i_blkbits; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. + * + * If we detect that a transaction would be required to flush the page, we + * have to check the process flags first, if we are already in a transaction + * or disk I/O during allocations is off, we need to fail the writepage and + * redirty the page. 
+ */ +STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int delalloc, unwritten; + struct buffer_head *bh, *head; + struct xfs_bmbt_irec imap; + xfs_ioend_t *ioend = NULL, *iohead = NULL; + loff_t offset; + unsigned int type; + __uint64_t end_offset; + pgoff_t end_index, last_index; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; + int count = 0; + int nonblocking = 0; + + trace_xfs_writepage(inode, page, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + goto redirty; + + /* + * We need a transaction if there are delalloc or unwritten buffers + * on the page. + * + * If we need a transaction and the process flags say we are already + * in a transaction, or no IO is allowed then mark the page dirty + * again and leave the page as is. + */ + xfs_count_page_state(page, &delalloc, &unwritten); + if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) + goto redirty; + + /* Is this page beyond the end of the file? */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; + last_index = (offset - 1) >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + if ((page->index >= end_index + 1) || + !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + unlock_page(page); + return 0; + } + } + + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + offset); + len = 1 << inode->i_blkbits; + + bh = head = page_buffers(page); + offset = page_offset(page); + type = IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + nonblocking = 1; + + do { + int new_ioend = 0; + + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + imap_valid = 0; + continue; + } + + if (buffer_unwritten(bh)) { + if (type != IO_UNWRITTEN) { + type = IO_UNWRITTEN; + imap_valid = 0; + } + } else if (buffer_delay(bh)) { + if (type != IO_DELALLOC) { + type = IO_DELALLOC; + imap_valid = 0; + } + } else if (buffer_uptodate(bh)) { + if (type != IO_OVERWRITE) { + type = IO_OVERWRITE; + imap_valid = 0; + } + } else { + if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; + } + continue; + } + + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { + /* + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. 
+ */ + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; + } + + if (!iohead) + iohead = ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + xfs_start_page_writeback(page, 1, count); + + if (ioend && imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, end_index); + } + + if (iohead) + xfs_submit_ioend(wbc, iohead); + + return 0; + +error: + if (iohead) + xfs_cancel_ioend(iohead); + + if (err == -EAGAIN) + goto redirty; + + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + return err; + +redirty: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +STATIC int +xfs_vm_writepages( + struct address_space *mapping, + struct writeback_control *wbc) +{ + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return generic_writepages(mapping, wbc); +} + +/* + * Called to move a page into cleanable state - and from there + * to be released. The page should already be clean. We always + * have buffer heads in this call. + * + * Returns 1 if the page is ok to release, 0 otherwise. + */ +STATIC int +xfs_vm_releasepage( + struct page *page, + gfp_t gfp_mask) +{ + int delalloc, unwritten; + + trace_xfs_releasepage(page->mapping->host, page, 0); + + xfs_count_page_state(page, &delalloc, &unwritten); + + if (WARN_ON(delalloc)) + return 0; + if (WARN_ON(unwritten)) + return 0; + + return try_to_free_buffers(page); +} + +STATIC int +__xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create, + int direct) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + struct xfs_bmbt_irec imap; + int nimaps = 1; + xfs_off_t offset; + ssize_t size; + int new = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + offset = (xfs_off_t)iblock << inode->i_blkbits; + ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + + if (create) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_map_shared(ip); + } + + ASSERT(offset <= mp->m_maxioffset); + if (offset + size > mp->m_maxioffset) + size = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + if (error) + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct) { + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, size, &imap); + } + if (error) + goto out_unlock; + + 
trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + xfs_iunlock(ip, lockmode); + + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { + /* + * For unwritten extents do not report a disk address on + * the read case (treat as if we're reading into a hole). + */ + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { + if (direct) + bh_result->b_private = inode; + set_buffer_unwritten(bh_result); + } + } + + /* + * If this is a realtime file, data may be on a different device. + * to that pointed to from the buffer_head b_bdev currently. + */ + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); + + /* + * If we previously allocated a block out beyond eof and we are now + * coming back to use it then we will need to flag it as new even if it + * has a disk address. + * + * With sub-block writes into unwritten extents we also need to mark + * the buffer as new so that the unwritten parts of the buffer gets + * correctly zeroed. + */ + if (create && + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || + (offset >= i_size_read(inode)) || + (new || ISUNWRITTEN(&imap)))) + set_buffer_new(bh_result); + + if (imap.br_startblock == DELAYSTARTBLOCK) { + BUG_ON(direct); + if (create) { + set_buffer_uptodate(bh_result); + set_buffer_mapped(bh_result); + set_buffer_delay(bh_result); + } + } + + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ + if (direct || size > (1 << inode->i_blkbits)) { + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; + } + + return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return -error; +} + +int +xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); +} + +STATIC int +xfs_get_blocks_direct( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); +} + +/* + * Complete a direct I/O write request. + * + * If the private argument is non-NULL __xfs_get_blocks signals us that we + * need to issue a transaction to convert the range from unwritten to written + * extents. In case this is regular synchronous I/O we just call xfs_end_io + * to do this and we are done. But in case this was a successful AIO + * request this handler is called from interrupt context, from which we + * can't start transactions. In that case offload the I/O completion to + * the workqueues we also use for buffered I/O completion. + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private, + int ret, + bool is_async) +{ + struct xfs_ioend *ioend = iocb->private; + + /* + * blockdev_direct_IO can return an error even after the I/O + * completion handler was called. Thus we need to protect + * against double-freeing. 
+ */ + iocb->private = NULL; + + ioend->io_offset = offset; + ioend->io_size = size; + if (private && size > 0) + ioend->io_type = IO_UNWRITTEN; + + if (is_async) { + /* + * If we are converting an unwritten extent we need to delay + * the AIO completion until after the unwritten extent + * conversion has completed, otherwise do it ASAP. + */ + if (ioend->io_type == IO_UNWRITTEN) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + } else { + aio_complete(iocb, ret, 0); + } + xfs_finish_ioend(ioend); + } else { + xfs_finish_ioend_sync(ioend); + } +} + +STATIC ssize_t +xfs_vm_direct_IO( + int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + ssize_t ret; + + if (rw & WRITE) { + iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); + + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct_write, NULL, 0); + if (ret != -EIOCBQUEUED && iocb->private) + xfs_destroy_ioend(iocb->private); + } else { + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + NULL, NULL, 0); + } + + return ret; +} + +STATIC void +xfs_vm_write_failed( + struct address_space *mapping, + loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + /* + * Punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we just truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. + */ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + truncate_pagecache(inode, to, inode->i_size); + + /* + * Check if there are any blocks that are outside of i_size + * that need to be trimmed back.
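For a concrete feel for the range computed next, assume 4096-byte filesystem blocks and that XFS_B_TO_FSB() rounds a byte count up to whole blocks (an assumption about that helper, not something this hunk spells out): with inode->i_size = 8192 and a failed write that had reserved delalloc space out to to = 20000, start_fsb = XFS_B_TO_FSB(8192) + 1 = 3 and end_fsb = XFS_B_TO_FSB(20000) = 5, so the two file system blocks at offsets 3 and 4 are handed to xfs_bmap_punch_delalloc_range() below; if to had still been inside that first block past EOF, end_fsb would not exceed start_fsb and the function would return without punching anything.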
+ */ + start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; + end_fsb = XFS_B_TO_FSB(ip->i_mount, to); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +STATIC int +xfs_vm_write_begin( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, + pagep, xfs_get_blocks); + if (unlikely(ret)) + xfs_vm_write_failed(mapping, pos + len); + return ret; +} + +STATIC int +xfs_vm_write_end( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned copied, + struct page *page, + void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (unlikely(ret < len)) + xfs_vm_write_failed(mapping, pos + len); + return ret; +} + +STATIC sector_t +xfs_vm_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_vm_bmap(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return generic_block_bmap(mapping, block, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); +} + +const struct address_space_operations xfs_address_space_operations = { + .readpage = xfs_vm_readpage, + .readpages = xfs_vm_readpages, + .writepage = xfs_vm_writepage, + .writepages = xfs_vm_writepages, + .releasepage = xfs_vm_releasepage, + .invalidatepage = xfs_vm_invalidatepage, + .write_begin = xfs_vm_write_begin, + .write_end = xfs_vm_write_end, + .bmap = xfs_vm_bmap, + .direct_IO = xfs_vm_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h new file mode 100644 index 00000000..71f721e1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2005-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_AOPS_H__ +#define __XFS_AOPS_H__ + +extern struct workqueue_struct *xfsdatad_workqueue; +extern struct workqueue_struct *xfsconvertd_workqueue; +extern mempool_t *xfs_ioend_pool; + +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_DIRECT = 0, /* special case for direct I/O ioends */ + IO_DELALLOC, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_OVERWRITE, /* mapping covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { 0, "" }, \ + { IO_DELALLOC, "delalloc" }, \ + { IO_UNWRITTEN, "unwritten" }, \ + { IO_OVERWRITE, "overwrite" } + +/* + * xfs_ioend struct manages large extent writes for XFS. + * It can manage several multi-page bio's at once. + */ +typedef struct xfs_ioend { + struct xfs_ioend *io_list; /* next ioend in chain */ + unsigned int io_type; /* delalloc / unwritten */ + int io_error; /* I/O error code */ + atomic_t io_remaining; /* hold count */ + struct inode *io_inode; /* file being written to */ + struct buffer_head *io_buffer_head;/* buffer linked list head */ + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ + size_t io_size; /* size of the extent */ + xfs_off_t io_offset; /* offset in the file */ + struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; +} xfs_ioend_t; + +extern const struct address_space_operations xfs_address_space_operations; +extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +extern void xfs_ioend_init(void); +extern void xfs_ioend_wait(struct xfs_inode *); + +extern void xfs_count_page_state(struct page *, int *, int *); + +#endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c new file mode 100644 index 00000000..5e68099d --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -0,0 +1,1899 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trace.h" + +static kmem_zone_t *xfs_buf_zone; +STATIC int xfsbufd(void *); +STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); + +static struct workqueue_struct *xfslogd_workqueue; +struct workqueue_struct *xfsdatad_workqueue; +struct workqueue_struct *xfsconvertd_workqueue; + +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) +#else +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) +#endif + +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) + +#define xb_to_km(flags) \ + (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + +#define xfs_buf_allocate(flags) \ + kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) +#define xfs_buf_deallocate(bp) \ + kmem_zone_free(xfs_buf_zone, (bp)); + +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * The XBF_MAPPED flag is set if the buffer should be mapped, but the + * code is clever enough to know it doesn't have to map a single page, + * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + */ + return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + +/* + * xfs_buf_lru_add - add a buffer to the LRU. + * + * The LRU takes a new reference to the buffer so that it will only be freed + * once the shrinker takes the buffer off the LRU. + */ +STATIC void +xfs_buf_lru_add( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (list_empty(&bp->b_lru)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_lru, &btp->bt_lru); + btp->bt_lru_nr++; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * xfs_buf_lru_del - remove a buffer from the LRU + * + * The unlocked check is safe here because it only occurs when there are not + * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there + * to optimise the shrinker removing the buffer from the LRU and calling + * xfs_buf_free(). i.e. it removes an unnecessary round trip on the + * bt_lru_lock. + */ +STATIC void +xfs_buf_lru_del( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + if (list_empty(&bp->b_lru)) + return; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. 
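A toy, single-threaded model of the hold-count rule used by the LRU code above (the real code uses atomic_t, b_lru_ref and bt_lru_lock; the struct and names below are invented): while a buffer sits on the LRU the list owns one hold, so the last user dropping its reference does not free the buffer, only the shrinker's later removal does.

/* toy model of b_hold vs. the LRU reference: the buffer is only freed once
 * both the last user and the LRU/shrinker have dropped their holds. */
#include <stdio.h>

struct buf { int hold; };

static void rele(struct buf *b, const char *who)
{
    if (--b->hold == 0)
        printf("%s dropped the last hold: buffer freed\n", who);
    else
        printf("%s dropped a hold, %d remaining (still cached)\n", who, b->hold);
}

int main(void)
{
    struct buf b = { .hold = 1 };   /* hold owned by the current user */

    b.hold++;                       /* xfs_buf_lru_add(): the LRU takes its own hold */
    rele(&b, "last user");          /* buffer stays allocated, the LRU still references it */
    rele(&b, "shrinker");           /* LRU removal drops the final hold -> freed */
    return 0;
}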
+ * + * This prevents build-up of stale buffers on the LRU. + */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + bp->b_flags |= XBF_STALE; + atomic_set(&(bp)->b_lru_ref, 0); + if (!list_empty(&bp->b_lru)) { + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + atomic_dec(&bp->b_hold); + } + spin_unlock(&btp->bt_lru_lock); + } + ASSERT(atomic_read(&bp->b_hold) >= 1); +} + +STATIC void +_xfs_buf_initialize( + xfs_buf_t *bp, + xfs_buftarg_t *target, + xfs_off_t range_base, + size_t range_length, + xfs_buf_flags_t flags) +{ + /* + * We don't want certain flags to appear in b_flags. + */ + flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + + memset(bp, 0, sizeof(xfs_buf_t)); + atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); + init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); + INIT_LIST_HEAD(&bp->b_list); + RB_CLEAR_NODE(&bp->b_rbnode); + sema_init(&bp->b_sema, 0); /* held, no waiters */ + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_file_offset = range_base; + /* + * Set buffer_length and count_desired to the same value initially. + * I/O routines should use count_desired, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_flags = flags; + bp->b_bn = XFS_BUF_DADDR_NULL; + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + + trace_xfs_buf_init(bp, _RET_IP_); +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_xfs_buf_get_pages( + xfs_buf_t *bp, + int page_count, + xfs_buf_flags_t flags) +{ + /* Make sure that we have a page list */ + if (bp->b_pages == NULL) { + bp->b_offset = xfs_buf_poff(bp->b_file_offset); + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, xb_to_km(flags)); + if (bp->b_pages == NULL) + return -ENOMEM; + } + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +STATIC void +_xfs_buf_free_pages( + xfs_buf_t *bp) +{ + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages); + bp->b_pages = NULL; + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. + * The buffer must not be on any hash - use xfs_buf_rele instead for + * hashed and refcounted buffers. + */ +void +xfs_buf_free( + xfs_buf_t *bp) +{ + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (bp->b_flags & _XBF_PAGES) { + uint i; + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + __free_page(page); + } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); + _xfs_buf_free_pages(bp); + xfs_buf_deallocate(bp); +} + +/* + * Allocates all the pages for the buffer in question and builds its page list.
+ */ +STATIC int +xfs_buf_allocate_memory( + xfs_buf_t *bp, + uint flags) +{ + size_t size = bp->b_count_desired; + size_t nbytes, offset; + gfp_t gfp_mask = xb_to_gfp(flags); + unsigned short page_count, i; + xfs_off_t end; + int error; + + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0. + */ + if (bp->b_buffer_length < PAGE_SIZE) { + bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & + PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + return 0; + } + +use_alloc_page: + end = bp->b_file_offset + bp->b_buffer_length; + page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + error = _xfs_buf_get_pages(bp, page_count, flags); + if (unlikely(error)) + return error; + + offset = bp->b_offset; + bp->b_flags |= _XBF_PAGES; + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page; + uint retries = 0; +retry: + page = alloc_page(gfp_mask); + if (unlikely(page == NULL)) { + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + error = ENOMEM; + goto out_free_pages; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, gfp_mask); + + XFS_STATS_INC(xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + + XFS_STATS_INC(xb_page_found); + + nbytes = min_t(size_t, size, PAGE_SIZE - offset); + size -= nbytes; + bp->b_pages[i] = page; + offset = 0; + } + return 0; + +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); + return error; +} + +/* + * Map buffer into kernel address-space if necessary. + */ +STATIC int +_xfs_buf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + ASSERT(bp->b_flags & _XBF_PAGES); + if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + bp->b_flags |= XBF_MAPPED; + } else if (flags & XBF_MAPPED) { + int retried = 0; + + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + + if (!bp->b_addr) + return -ENOMEM; + bp->b_addr += bp->b_offset; + bp->b_flags |= XBF_MAPPED; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * Look up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. If other overlapping buffers exist, they are + * released before the new buffer is created and locked, + * which may imply that this call will block until those buffers + * are unlocked. No I/O is implied by this call. 
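The single-page fast path in xfs_buf_allocate_memory() above relies on a heap allocation not crossing a page boundary. A minimal user-space version of that check (sysconf() stands in for PAGE_SIZE; the 2048-byte malloc is an arbitrary example, and whether it straddles depends on where the allocator happens to place it):

/* the page-straddle test used by xfs_buf_allocate_memory(): a kmalloc'd buffer
 * is only usable directly if its first and last bytes share one page. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int spans_two_pages(const void *addr, size_t len, unsigned long page_mask)
{
    unsigned long first = (unsigned long)addr & page_mask;
    unsigned long last  = ((unsigned long)addr + len - 1) & page_mask;

    return first != last;
}

int main(void)
{
    unsigned long page_mask = ~((unsigned long)sysconf(_SC_PAGESIZE) - 1);
    char *p = malloc(2048);

    if (!p)
        return 1;
    printf("2048-byte buffer at %p %s a page boundary\n", (void *)p,
           spans_two_pages(p, 2048, page_mask) ? "straddles" : "fits within");
    free(p);
    return 0;
}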
+ */ +xfs_buf_t * +_xfs_buf_find( + xfs_buftarg_t *btp, /* block device target */ + xfs_off_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + xfs_buf_flags_t flags, + xfs_buf_t *new_bp) +{ + xfs_off_t range_base; + size_t range_length; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; + + range_base = (ioff << BBSHIFT); + range_length = (isize << BBSHIFT); + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(range_length < (1 << btp->bt_sshift))); + ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, ioff)); + + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); + + if (range_base < bp->b_file_offset) + rbp = &(*rbp)->rb_left; + else if (range_base > bp->b_file_offset) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block offset match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_buffer_length != range_length) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } + atomic_inc(&bp->b_hold); + goto found; + } + } + + /* No match found */ + if (new_bp) { + _xfs_buf_initialize(new_bp, btp, range_base, + range_length, flags); + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); + } else { + XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + } + return new_bp; + +found: + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + + if (xfs_buf_cond_lock(bp)) { + /* failed, so wait for the lock if requested. */ + if (!(flags & XBF_TRYLOCK)) { + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); + } else { + xfs_buf_rele(bp); + XFS_STATS_INC(xb_busy_locked); + return NULL; + } + } + + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. + */ + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + } + + trace_xfs_buf_find(bp, flags, _RET_IP_); + XFS_STATS_INC(xb_get_locked); + return bp; +} + +/* + * Assembles a buffer covering the specified range. + * Storage in memory for all portions of the buffer will be allocated, + * although backing storage may not be. 
+ */ +xfs_buf_t * +xfs_buf_get( + xfs_buftarg_t *target,/* target for buffer */ + xfs_off_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + xfs_buf_flags_t flags) +{ + xfs_buf_t *bp, *new_bp; + int error = 0; + + new_bp = xfs_buf_allocate(flags); + if (unlikely(!new_bp)) + return NULL; + + bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (bp == new_bp) { + error = xfs_buf_allocate_memory(bp, flags); + if (error) + goto no_buffer; + } else { + xfs_buf_deallocate(new_bp); + if (unlikely(bp == NULL)) + return NULL; + } + + if (!(bp->b_flags & XBF_MAPPED)) { + error = _xfs_buf_map_pages(bp, flags); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages\n", __func__); + goto no_buffer; + } + } + + XFS_STATS_INC(xb_get); + + /* + * Always fill in the block number now, the mapped cases can do + * their own overlay of this later. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + + trace_xfs_buf_get(bp, flags, _RET_IP_); + return bp; + + no_buffer: + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); + return NULL; +} + +STATIC int +_xfs_buf_read( + xfs_buf_t *bp, + xfs_buf_flags_t flags) +{ + int status; + + ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + + status = xfs_buf_iorequest(bp); + if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + return status; + return xfs_buf_iowait(bp); +} + +xfs_buf_t * +xfs_buf_read( + xfs_buftarg_t *target, + xfs_off_t ioff, + size_t isize, + xfs_buf_flags_t flags) +{ + xfs_buf_t *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get(target, ioff, isize, flags); + if (bp) { + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!XFS_BUF_ISDONE(bp)) { + XFS_STATS_INC(xb_get_read); + _xfs_buf_read(bp, flags); + } else if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + goto no_buffer; + } else { + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + } + } + + return bp; + + no_buffer: + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); + return NULL; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +xfs_buf_readahead( + xfs_buftarg_t *target, + xfs_off_t ioff, + size_t isize) +{ + if (bdi_read_congested(target->bt_bdi)) + return; + + xfs_buf_read(target, ioff, isize, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); +} + +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. 
+ */ +struct xfs_buf * +xfs_buf_read_uncached( + struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t length, + int flags) +{ + xfs_buf_t *bp; + int error; + + bp = xfs_buf_get_uncached(target, length, flags); + if (!bp) + return NULL; + + /* set up the buffer for a read IO */ + xfs_buf_lock(bp); + XFS_BUF_SET_ADDR(bp, daddr); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error || bp->b_error) { + xfs_buf_relse(bp); + return NULL; + } + return bp; +} + +xfs_buf_t * +xfs_buf_get_empty( + size_t len, + xfs_buftarg_t *target) +{ + xfs_buf_t *bp; + + bp = xfs_buf_allocate(0); + if (bp) + _xfs_buf_initialize(bp, target, 0, len, 0); + return bp; +} + +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. + */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t len) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_file_offset = 0; + bp->b_buffer_length = bp->b_count_desired = len; + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_flags &= ~XBF_MAPPED; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if ((!is_vmalloc_addr(addr))) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +xfs_buf_associate_memory( + xfs_buf_t *bp, + void *mem, + size_t len) +{ + int rval; + int i = 0; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; + int page_count; + + pageaddr = (unsigned long)mem & PAGE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_ALIGN(len + offset); + page_count = buflen >> PAGE_SHIFT; + + /* Free any previous set of page pointers */ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_addr = mem; + + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + if (rval) + return rval; + + bp->b_offset = offset; + + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_SIZE; + } + + bp->b_count_desired = len; + bp->b_buffer_length = buflen; + bp->b_flags |= XBF_MAPPED; + + return 0; +} + +xfs_buf_t * +xfs_buf_get_uncached( + struct xfs_buftarg *target, + size_t len, + int flags) +{ + unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + int error, i; + xfs_buf_t *bp; + + bp = xfs_buf_allocate(0); + if (unlikely(bp == NULL)) + goto fail; + _xfs_buf_initialize(bp, target, 0, len, 0); + + error = _xfs_buf_get_pages(bp, page_count, 0); + if (error) + goto fail_free_buf; + + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); + if (!bp->b_pages[i]) + goto fail_free_mem; + } + bp->b_flags |= _XBF_PAGES; + + error = _xfs_buf_map_pages(bp, XBF_MAPPED); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages\n", __func__); + goto fail_free_mem; + } + + xfs_buf_unlock(bp); + + trace_xfs_buf_get_uncached(bp, _RET_IP_); + return bp; + + fail_free_mem: + while (--i >= 0) + __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + fail_free_buf: + xfs_buf_deallocate(bp); + fail: + return NULL; +} + +/* + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * Must hold the buffer already to call this function. 
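Uncached buffers skip the per-AG rbtree entirely and are meant for one-off I/O such as probing device geometry. The read side returns the buffer locked on success and cleans up after itself on failure; a sketch, with the disk address and length as assumptions:

/* Illustrative only: read one 512-byte sector, uncached. */
static int example_probe_sector(struct xfs_mount *mp, struct xfs_buftarg *target,
				xfs_daddr_t daddr)
{
	struct xfs_buf	*bp;

	bp = xfs_buf_read_uncached(mp, target, daddr, 1 << BBSHIFT, 0);
	if (!bp)
		return EIO;	/* allocation or I/O error; buffer already released */

	/* contents are at bp->b_addr; the buffer is still locked and held */
	xfs_buf_relse(bp);
	return 0;
}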
+ */ +void +xfs_buf_hold( + xfs_buf_t *bp) +{ + trace_xfs_buf_hold(bp, _RET_IP_); + atomic_inc(&bp->b_hold); +} + +/* + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. + */ +void +xfs_buf_rele( + xfs_buf_t *bp) +{ + struct xfs_perag *pag = bp->b_pag; + + trace_xfs_buf_rele(bp, _RET_IP_); + + if (!pag) { + ASSERT(list_empty(&bp->b_lru)); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + + ASSERT(atomic_read(&bp->b_hold) > 0); + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { + if (!(bp->b_flags & XBF_STALE) && + atomic_read(&bp->b_lru_ref)) { + xfs_buf_lru_add(bp); + spin_unlock(&pag->pag_buf_lock); + } else { + xfs_buf_lru_del(bp); + ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + xfs_buf_free(bp); + } + } +} + + +/* + * Lock a buffer object, if it is not already locked. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. + */ +int +xfs_buf_cond_lock( + xfs_buf_t *bp) +{ + int locked; + + locked = down_trylock(&bp->b_sema) == 0; + if (locked) + XB_SET_OWNER(bp); + else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + + trace_xfs_buf_cond_lock(bp, _RET_IP_); + return locked ? 0 : -EBUSY; +} + +int +xfs_buf_lock_value( + xfs_buf_t *bp) +{ + return bp->b_sema.count; +} + +/* + * Lock a buffer object. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. + */ +void +xfs_buf_lock( + xfs_buf_t *bp) +{ + trace_xfs_buf_lock(bp, _RET_IP_); + + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + down(&bp->b_sema); + XB_SET_OWNER(bp); + + trace_xfs_buf_lock_done(bp, _RET_IP_); +} + +/* + * Releases the lock on the buffer object. + * If the buffer is marked delwri but is not queued, do so before we + * unlock the buffer as we need to set flags correctly. We also need to + * take a reference for the delwri queue because the unlocker is going to + * drop their's and they don't know we just queued it. 
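Callers that must not sleep use the conditional lock and simply retry later when it reports the buffer busy; the log force hidden inside it is what stops xfsaild from spinning on stale, pinned buffers. A sketch of the non-blocking pattern (illustrative, not a caller taken from the patch):

/* Illustrative only: do some work under the buffer lock without blocking. */
static int example_try_work(xfs_buf_t *bp)
{
	if (xfs_buf_cond_lock(bp))
		return 0;		/* -EBUSY: locked elsewhere, try again later */

	/* ... work on the locked buffer ... */
	xfs_buf_unlock(bp);
	return 1;
}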
+ */ +void +xfs_buf_unlock( + xfs_buf_t *bp) +{ + if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_ASYNC; + xfs_buf_delwri_queue(bp, 0); + } + + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + + trace_xfs_buf_unlock(bp, _RET_IP_); +} + +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&bp->b_pin_count) == 0) + return; + + add_wait_queue(&bp->b_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&bp->b_pin_count) == 0) + break; + io_schedule(); + } + remove_wait_queue(&bp->b_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +STATIC void +xfs_buf_iodone_work( + struct work_struct *work) +{ + xfs_buf_t *bp = + container_of(work, xfs_buf_t, b_iodone_work); + + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); +} + +void +xfs_buf_ioend( + xfs_buf_t *bp, + int schedule) +{ + trace_xfs_buf_iodone(bp, _RET_IP_); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (bp->b_error == 0) + bp->b_flags |= XBF_DONE; + + if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (schedule) { + INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); + } else { + xfs_buf_iodone_work(&bp->b_iodone_work); + } + } else { + complete(&bp->b_iowait); + } +} + +void +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) +{ + ASSERT(error >= 0 && error <= 0xffff); + bp->b_error = (unsigned short)error; + trace_xfs_buf_ioerror(bp, error, _RET_IP_); +} + +int +xfs_bwrite( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + int error; + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ); + + xfs_buf_delwri_dequeue(bp); + xfs_bdstrat_cb(bp); + + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + return error; +} + +void +xfs_bdwrite( + void *mp, + struct xfs_buf *bp) +{ + trace_xfs_buf_bdwrite(bp, _RET_IP_); + + bp->b_flags &= ~XBF_READ; + bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); + + xfs_buf_delwri_queue(bp, 1); +} + +/* + * Called when we want to stop a buffer from getting written or read. + * We attach the EIO error, muck with its flags, and call xfs_buf_ioend + * so that the proper iodone callbacks get called. + */ +STATIC int +xfs_bioerror( + xfs_buf_t *bp) +{ +#ifdef XFSERRORDEBUG + ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); +#endif + + /* + * No need to wait until the buffer is unpinned, we aren't flushing it. + */ + XFS_BUF_ERROR(bp, EIO); + + /* + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + + xfs_buf_ioend(bp, 0); + + return EIO; +} + +/* + * Same as xfs_bioerror, except that we are releasing the buffer + * here ourselves, and avoiding the xfs_buf_ioend call. + * This is meant for userdata errors; metadata bufs come with + * iodone functions attached, so that we can track down errors. + */ +STATIC int +xfs_bioerror_relse( + struct xfs_buf *bp) +{ + int64_t fl = XFS_BUF_BFLAGS(bp); + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + * + * chunkhold expects B_DONE to be set, whether + * we actually finish the I/O or not. We don't want to + * change that interface. 
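The two write entry points above differ only in when the I/O happens: xfs_bwrite() pulls the buffer off any delayed-write queue, issues it and waits, while xfs_bdwrite() only parks it on the target's delwri queue for xfsbufd to flush later. Either way the caller gives up its lock on the buffer. A sketch of choosing between them (illustrative):

/* Illustrative only: write back a dirty, locked buffer. */
static int example_write_buf(struct xfs_mount *mp, struct xfs_buf *bp, int sync)
{
	if (sync)
		return xfs_bwrite(mp, bp);	/* issues I/O, waits, releases bp */

	xfs_bdwrite(mp, bp);			/* queues bp; xfsbufd writes it later */
	return 0;
}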
+ */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_CLR_IODONE_FUNC(bp); + if (!(fl & XBF_ASYNC)) { + /* + * Mark b_error and B_ERROR _both_. + * Lot's of chunkcache code assumes that. + * There's no reason to mark error for + * ASYNC buffers. + */ + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_FINISH_IOWAIT(bp); + } else { + xfs_buf_relse(bp); + } + + return EIO; +} + + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb( + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) + return xfs_bioerror_relse(bp); + else + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); + return 0; +} + +/* + * Wrapper around bdstrat so that we can stop data from going to disk in case + * we are shutting down the filesystem. Typically user data goes thru this + * path; one of the exceptions is the superblock. + */ +void +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + return; + } + + xfs_buf_iorequest(bp); +} + +STATIC void +_xfs_buf_ioend( + xfs_buf_t *bp, + int schedule) +{ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp, schedule); +} + +STATIC void +xfs_buf_bio_end_io( + struct bio *bio, + int error) +{ + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + + xfs_buf_ioerror(bp, -error); + + if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + + _xfs_buf_ioend(bp, 1); + bio_put(bio); +} + +STATIC void +_xfs_buf_ioapply( + xfs_buf_t *bp) +{ + int rw, map_i, total_nr_pages, nr_pages; + struct bio *bio; + int offset = bp->b_offset; + int size = bp->b_count_desired; + sector_t sector = bp->b_bn; + + total_nr_pages = bp->b_page_count; + map_i = 0; + + if (bp->b_flags & XBF_ORDERED) { + ASSERT(!(bp->b_flags & XBF_READ)); + rw = WRITE_FLUSH_FUA; + } else if (bp->b_flags & XBF_LOG_BUFFER) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META; + } else { + rw = (bp->b_flags & XBF_WRITE) ? WRITE : + (bp->b_flags & XBF_READ_AHEAD) ? 
READA : READ; + } + + +next_chunk: + atomic_inc(&bp->b_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = bp->b_target->bt_bdev; + bio->bi_sector = sector; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; + + + for (; size && nr_pages; nr_pages--, map_i++) { + int rbytes, nbytes = PAGE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + if (rbytes < nbytes) + break; + + offset = 0; + sector += nbytes >> BBSHIFT; + size -= nbytes; + total_nr_pages--; + } + + if (likely(bio->bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + xfs_buf_ioerror(bp, EIO); + bio_put(bio); + } +} + +int +xfs_buf_iorequest( + xfs_buf_t *bp) +{ + trace_xfs_buf_iorequest(bp, _RET_IP_); + + if (bp->b_flags & XBF_DELWRI) { + xfs_buf_delwri_queue(bp, 1); + return 0; + } + + if (bp->b_flags & XBF_WRITE) { + xfs_buf_wait_unpin(bp); + } + + xfs_buf_hold(bp); + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + _xfs_buf_ioend(bp, 0); + + xfs_buf_rele(bp); + return 0; +} + +/* + * Waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. + * It returns the I/O error code, if any, or 0 if there was no error. + */ +int +xfs_buf_iowait( + xfs_buf_t *bp) +{ + trace_xfs_buf_iowait(bp, _RET_IP_); + + wait_for_completion(&bp->b_iowait); + + trace_xfs_buf_iowait_done(bp, _RET_IP_); + return bp->b_error; +} + +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, + size_t offset) +{ + struct page *page; + + if (bp->b_flags & XBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; + + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); +} + +/* + * Move data into or out of a buffer. + */ +void +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + void *data, /* data address */ + xfs_buf_rw_t mode) /* read/write/zero flag */ +{ + size_t bend, cpoff, csize; + struct page *page; + + bend = boff + bsize; + while (boff < bend) { + page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; + cpoff = xfs_buf_poff(boff + bp->b_offset); + csize = min_t(size_t, + PAGE_SIZE-cpoff, bp->b_count_desired-boff); + + ASSERT(((csize + cpoff) <= PAGE_SIZE)); + + switch (mode) { + case XBRW_ZERO: + memset(page_address(page) + cpoff, 0, csize); + break; + case XBRW_READ: + memcpy(data, page_address(page) + cpoff, csize); + break; + case XBRW_WRITE: + memcpy(page_address(page) + cpoff, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buffer targets (buftargs). + */ + +/* + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. 
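Because a buffer's pages are only virtually contiguous when XBF_MAPPED is set, callers copy data in and out through xfs_buf_iomove(), which walks the page array a page at a time; the xfs_buf_zero() macro in xfs_buf.h is a thin wrapper over the zeroing mode. Two illustrative uses:

/* Illustrative only: zero part of a buffer, then copy another part out. */
static void example_iomove(xfs_buf_t *bp, size_t off, size_t len, void *dst)
{
	xfs_buf_iomove(bp, off, len, NULL, XBRW_ZERO);	/* zero [off, off+len) */
	xfs_buf_iomove(bp, off, len, dst, XBRW_READ);	/* buffer -> dst */
}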
+ */ +void +xfs_wait_buftarg( + struct xfs_buftarg *btp) +{ + struct xfs_buf *bp; + +restart: + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + if (atomic_read(&bp->b_hold) > 1) { + spin_unlock(&btp->bt_lru_lock); + delay(100); + goto restart; + } + /* + * clear the LRU reference count so the bufer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + spin_unlock(&btp->bt_lru_lock); + xfs_buf_rele(bp); + spin_lock(&btp->bt_lru_lock); + } + spin_unlock(&btp->bt_lru_lock); +} + +int +xfs_buftarg_shrink( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + struct xfs_buf *bp; + int nr_to_scan = sc->nr_to_scan; + LIST_HEAD(dispose); + + if (!nr_to_scan) + return btp->bt_lru_nr; + + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + if (nr_to_scan-- <= 0) + break; + + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + list_move_tail(&bp->b_lru, &btp->bt_lru); + continue; + } + + /* + * remove the buffer from the LRU now to avoid needing another + * lock round trip inside xfs_buf_rele(). + */ + list_move(&bp->b_lru, &dispose); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); + + while (!list_empty(&dispose)) { + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return btp->bt_lru_nr; +} + +void +xfs_free_buftarg( + struct xfs_mount *mp, + struct xfs_buftarg *btp) +{ + unregister_shrinker(&btp->bt_shrinker); + + xfs_flush_buftarg(btp, 1); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); + + kthread_stop(btp->bt_task); + kmem_free(btp); +} + +STATIC int +xfs_setsize_buftarg_flags( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize, + int verbose) +{ + btp->bt_bsize = blocksize; + btp->bt_sshift = ffs(sectorsize) - 1; + btp->bt_smask = sectorsize - 1; + + if (set_blocksize(btp->bt_bdev, sectorsize)) { + xfs_warn(btp->bt_mount, + "Cannot set_blocksize to %u on device %s\n", + sectorsize, XFS_BUFTARG_NAME(btp)); + return EINVAL; + } + + return 0; +} + +/* + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used is at this early stage. Play safe. 
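The shift and mask stored in the buftarg are what the alignment asserts at the top of _xfs_buf_find() check against: for 512-byte sectors bt_sshift is 9 and bt_smask is 0x1ff, for 4096-byte sectors they are 12 and 0xfff. A small restatement of that arithmetic (the sector sizes are examples, not values from the patch):

/* Illustrative only: mirrors the bt_sshift/bt_smask computation above. */
static int example_is_sector_aligned(xfs_off_t offset, unsigned int sectorsize)
{
	unsigned int	sshift = ffs(sectorsize) - 1;	/* 512 -> 9, 4096 -> 12 */
	xfs_off_t	smask = sectorsize - 1;		/* 512 -> 0x1ff, 4096 -> 0xfff */

	return sshift > 0 && (offset & smask) == 0;
}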
+ */ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg_flags(btp, + PAGE_SIZE, bdev_logical_block_size(bdev), 0); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize) +{ + return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); +} + +STATIC int +xfs_alloc_delwrite_queue( + xfs_buftarg_t *btp, + const char *fsname) +{ + INIT_LIST_HEAD(&btp->bt_delwrite_queue); + spin_lock_init(&btp->bt_delwrite_lock); + btp->bt_flags = 0; + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); + if (IS_ERR(btp->bt_task)) + return PTR_ERR(btp->bt_task); + return 0; +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct xfs_mount *mp, + struct block_device *bdev, + int external, + const char *fsname) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + + btp->bt_mount = mp; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; + btp->bt_bdi = blk_get_backing_dev_info(bdev); + if (!btp->bt_bdi) + goto error; + + INIT_LIST_HEAD(&btp->bt_lru); + spin_lock_init(&btp->bt_lru_lock); + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + if (xfs_alloc_delwrite_queue(btp, fsname)) + goto error; + btp->bt_shrinker.shrink = xfs_buftarg_shrink; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&btp->bt_shrinker); + return btp; + +error: + kmem_free(btp); + return NULL; +} + + +/* + * Delayed write buffer handling + */ +STATIC void +xfs_buf_delwri_queue( + xfs_buf_t *bp, + int unlock) +{ + struct list_head *dwq = &bp->b_target->bt_delwrite_queue; + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + + trace_xfs_buf_delwri_queue(bp, _RET_IP_); + + ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + + spin_lock(dwlk); + /* If already in the queue, dequeue and place at tail */ + if (!list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + if (unlock) + atomic_dec(&bp->b_hold); + list_del(&bp->b_list); + } + + if (list_empty(dwq)) { + /* start xfsbufd as it is about to have something to do */ + wake_up_process(bp->b_target->bt_task); + } + + bp->b_flags |= _XBF_DELWRI_Q; + list_add_tail(&bp->b_list, dwq); + bp->b_queuetime = jiffies; + spin_unlock(dwlk); + + if (unlock) + xfs_buf_unlock(bp); +} + +void +xfs_buf_delwri_dequeue( + xfs_buf_t *bp) +{ + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + int dequeued = 0; + + spin_lock(dwlk); + if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + list_del_init(&bp->b_list); + dequeued = 1; + } + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + spin_unlock(dwlk); + + if (dequeued) + xfs_buf_rele(bp); + + trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); +} + +/* + * If a delwri buffer needs to be pushed before it has aged out, then promote + * it to the head of the delwri queue so that it will be flushed on the next + * xfsbufd run. We do this by resetting the queuetime of the buffer to be older + * than the age currently needed to flush the buffer. Hence the next time the + * xfsbufd sees it is guaranteed to be considered old enough to flush. + */ +void +xfs_buf_delwri_promote( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; + + ASSERT(bp->b_flags & XBF_DELWRI); + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + /* + * Check the buffer age before locking the delayed write queue as we + * don't need to promote buffers that are already past the flush age. 
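The age used below, like the timeout in xfsbufd, is a tunable expressed in centiseconds; since one centisecond is ten milliseconds, converting it to jiffies is a multiply by msecs_to_jiffies(10). A worked illustration (the 1500-centisecond figure, i.e. 15 seconds, is an assumption about typical tuning, not taken from this patch):

/* Illustrative only: convert a centisecond tunable such as
 * xfs_buf_age_centisecs into jiffies. */
static long example_age_to_jiffies(int age_centisecs)
{
	return age_centisecs * msecs_to_jiffies(10);	/* e.g. 1500 -> 15 seconds of jiffies */
}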
+ */ + if (bp->b_queuetime < jiffies - age) + return; + bp->b_queuetime = jiffies - age; + spin_lock(&btp->bt_delwrite_lock); + list_move(&bp->b_list, &btp->bt_delwrite_queue); + spin_unlock(&btp->bt_delwrite_lock); +} + +STATIC void +xfs_buf_runall_queues( + struct workqueue_struct *queue) +{ + flush_workqueue(queue); +} + +/* + * Move as many buffers as specified to the supplied list + * idicating if we skipped any buffers to prevent deadlocks. + */ +STATIC int +xfs_buf_delwri_split( + xfs_buftarg_t *target, + struct list_head *list, + unsigned long age) +{ + xfs_buf_t *bp, *n; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; + int skipped = 0; + int force; + + force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); + INIT_LIST_HEAD(list); + spin_lock(dwlk); + list_for_each_entry_safe(bp, n, dwq, b_list) { + ASSERT(bp->b_flags & XBF_DELWRI); + + if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { + if (!force && + time_before(jiffies, bp->b_queuetime + age)) { + xfs_buf_unlock(bp); + break; + } + + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| + _XBF_RUN_QUEUES); + bp->b_flags |= XBF_WRITE; + list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } else + skipped++; + } + spin_unlock(dwlk); + + return skipped; + +} + +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_bn - bp->b_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +void +xfs_buf_delwri_sort( + xfs_buftarg_t *target, + struct list_head *list) +{ + list_sort(NULL, list, xfs_buf_cmp); +} + +STATIC int +xfsbufd( + void *data) +{ + xfs_buftarg_t *target = (xfs_buftarg_t *)data; + + current->flags |= PF_MEMALLOC; + + set_freezable(); + + do { + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); + long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); + struct list_head tmp; + struct blk_plug plug; + + if (unlikely(freezing(current))) { + set_bit(XBT_FORCE_SLEEP, &target->bt_flags); + refrigerator(); + } else { + clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); + } + + /* sleep for a long time if there is nothing to do. */ + if (list_empty(&target->bt_delwrite_queue)) + tout = MAX_SCHEDULE_TIMEOUT; + schedule_timeout_interruptible(tout); + + xfs_buf_delwri_split(target, &tmp, age); + list_sort(NULL, &tmp, xfs_buf_cmp); + + blk_start_plug(&plug); + while (!list_empty(&tmp)) { + struct xfs_buf *bp; + bp = list_first_entry(&tmp, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); + } while (!kthread_should_stop()); + + return 0; +} + +/* + * Go through all incore buffers, and release buffers if they belong to + * the given device. This is used in filesystem error handling to + * preserve the consistency of its metadata. 
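Both xfsbufd above and xfs_flush_buftarg() below sort the list they split off by b_bn before submitting it, so the block layer sees roughly ascending disk addresses and can merge adjacent requests. A condensed sketch of that submit loop (plugging and the wait path are left out):

/* Illustrative only: sort a private delwri list and push it to disk. */
static void example_flush_list(struct list_head *list)
{
	struct xfs_buf	*bp;

	list_sort(NULL, list, xfs_buf_cmp);		/* ascending b_bn order */
	while (!list_empty(list)) {
		bp = list_first_entry(list, struct xfs_buf, b_list);
		list_del_init(&bp->b_list);
		xfs_bdstrat_cb(bp);			/* issue the (async) write */
	}
}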
+ */ +int +xfs_flush_buftarg( + xfs_buftarg_t *target, + int wait) +{ + xfs_buf_t *bp; + int pincount = 0; + LIST_HEAD(tmp_list); + LIST_HEAD(wait_list); + struct blk_plug plug; + + xfs_buf_runall_queues(xfsconvertd_workqueue); + xfs_buf_runall_queues(xfsdatad_workqueue); + xfs_buf_runall_queues(xfslogd_workqueue); + + set_bit(XBT_FORCE_FLUSH, &target->bt_flags); + pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + + /* + * Dropped the delayed write list lock, now walk the temporary list. + * All I/O is issued async and then if we need to wait for completion + * we do that after issuing all the IO. + */ + list_sort(NULL, &tmp_list, xfs_buf_cmp); + + blk_start_plug(&plug); + while (!list_empty(&tmp_list)) { + bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); + ASSERT(target == bp->b_target); + list_del_init(&bp->b_list); + if (wait) { + bp->b_flags &= ~XBF_ASYNC; + list_add(&bp->b_list, &wait_list); + } + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); + + if (wait) { + /* Wait for IO to complete. */ + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); + + list_del_init(&bp->b_list); + xfs_buf_iowait(bp); + xfs_buf_relse(bp); + } + } + + return pincount; +} + +int __init +xfs_buf_init(void) +{ + xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", + KM_ZONE_HWALIGN, NULL); + if (!xfs_buf_zone) + goto out; + + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); + if (!xfslogd_workqueue) + goto out_free_buf_zone; + + xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); + if (!xfsdatad_workqueue) + goto out_destroy_xfslogd_workqueue; + + xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", + WQ_MEM_RECLAIM, 1); + if (!xfsconvertd_workqueue) + goto out_destroy_xfsdatad_workqueue; + + return 0; + + out_destroy_xfsdatad_workqueue: + destroy_workqueue(xfsdatad_workqueue); + out_destroy_xfslogd_workqueue: + destroy_workqueue(xfslogd_workqueue); + out_free_buf_zone: + kmem_zone_destroy(xfs_buf_zone); + out: + return -ENOMEM; +} + +void +xfs_buf_terminate(void) +{ + destroy_workqueue(xfsconvertd_workqueue); + destroy_workqueue(xfsdatad_workqueue); + destroy_workqueue(xfslogd_workqueue); + kmem_zone_destroy(xfs_buf_zone); +} + +#ifdef CONFIG_KDB_MODULES +struct list_head * +xfs_get_buftarg_list(void) +{ + return &xfs_buftarg_list; +} +#endif diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h new file mode 100644 index 00000000..36d6ee44 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) +#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ +#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_ORDERED (1 << 11)/* use ordered writes */ +#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ +#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ + +/* flags used only as arguments to access routines */ +#define XBF_LOCK (1 << 14)/* lock requested */ +#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */ +#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ + +/* flags used only internally */ +#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ +#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ +#define _XBF_KMEM (1 << 20)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ + +typedef unsigned int xfs_buf_flags_t; + +#define XFS_BUF_FLAGS \ + { XBF_READ, "READ" }, \ + { XBF_WRITE, "WRITE" }, \ + { XBF_MAPPED, "MAPPED" }, \ + { XBF_ASYNC, "ASYNC" }, \ + { XBF_DONE, "DONE" }, \ + { XBF_DELWRI, "DELWRI" }, \ + { XBF_STALE, "STALE" }, \ + { XBF_ORDERED, "ORDERED" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_LOCK, "LOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ + { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { _XBF_PAGES, "PAGES" }, \ + { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" } + +typedef enum { + XBT_FORCE_SLEEP = 0, + XBT_FORCE_FLUSH = 1, +} xfs_buftarg_flags_t; + +typedef struct xfs_bufhash { + struct list_head bh_list; + spinlock_t bh_lock; +} xfs_bufhash_t; + +typedef struct xfs_buftarg { + dev_t bt_dev; + struct block_device *bt_bdev; + struct backing_dev_info *bt_bdi; + struct xfs_mount *bt_mount; + unsigned int bt_bsize; + unsigned int bt_sshift; + size_t bt_smask; + + /* per device delwri queue */ + struct task_struct *bt_task; + struct list_head bt_delwrite_queue; + spinlock_t bt_delwrite_lock; + unsigned long bt_flags; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_head bt_lru; + spinlock_t bt_lru_lock; + unsigned int bt_lru_nr; +} xfs_buftarg_t; + +struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + +#define XB_PAGES 2 + +typedef struct xfs_buf { + /* + * first cacheline holds all the fields needed for an 
uncontended cache + * hit to be fully processed. The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ + struct rb_node b_rbnode; /* rbtree node */ + xfs_off_t b_file_offset; /* offset in file */ + size_t b_buffer_length;/* size of buffer in bytes */ + atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ + xfs_buf_flags_t b_flags; /* status flags */ + struct semaphore b_sema; /* semaphore for lockables */ + + struct list_head b_lru; /* lru list */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + struct xfs_perag *b_pag; /* contains rbtree root */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + xfs_daddr_t b_bn; /* block number for I/O */ + size_t b_count_desired;/* desired transfer size */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_iodone_work; + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + struct completion b_iowait; /* queue for I/O waiters */ + void *b_fspriv; + void *b_fspriv2; + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ + unsigned long b_queuetime; /* time buffer was queued */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + unsigned short b_error; /* error code on I/O */ +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; +#endif +} xfs_buf_t; + + +/* Finding and Reading Buffers */ +extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t, xfs_buf_t *); +#define xfs_incore(buftarg,blkno,len,lockit) \ + _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) + +extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); +extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); + +extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); +extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); +extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); +extern void xfs_buf_hold(xfs_buf_t *); +extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t length, int flags); + +/* Releasing Buffers */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); + +/* Locking and Unlocking Buffers */ +extern int xfs_buf_cond_lock(xfs_buf_t *); +extern int xfs_buf_lock_value(xfs_buf_t *); +extern void xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); + +/* Buffer Read and Write Routines */ +extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); +extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); + +extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +extern int xfs_bdstrat_cb(struct xfs_buf *); + +extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern int xfs_buf_iorequest(xfs_buf_t *); +extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, + xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) + +static 
inline int xfs_buf_geterror(xfs_buf_t *bp) +{ + return bp ? bp->b_error : ENOMEM; +} + +/* Buffer Utility Routines */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); + +/* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_dequeue(xfs_buf_t *); +extern void xfs_buf_delwri_promote(xfs_buf_t *); + +/* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); + +#define xfs_buf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) + + +#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) +#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ + ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) + +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) +#define XFS_BUF_SUPER_STALE(bp) do { \ + XFS_BUF_STALE(bp); \ + xfs_buf_delwri_dequeue(bp); \ + XFS_BUF_DONE(bp); \ + } while (0) + +#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) +#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) + +#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) +#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) +#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_BUSY(bp) do { } while (0) +#define XFS_BUF_UNBUSY(bp) do { } while (0) +#define XFS_BUF_ISBUSY(bp) (1) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) +#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) +#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) + +#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) +#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) +#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) + +#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) +#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) +#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) +#define XFS_BUF_SET_START(bp) do { } while (0) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) +#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) +#define XFS_BUF_ADDR(bp) ((bp)->b_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) + +static inline void +xfs_buf_set_ref( + 
struct xfs_buf *bp, + int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) +#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) + +#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) + +#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) +#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); + +#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) +#define XFS_BUF_TARGET(bp) ((bp)->b_target) +#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + +/* + * Handling of buftargs. + */ +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *, int, const char *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern int xfs_flush_buftarg(xfs_buftarg_t *, int); + +#ifdef CONFIG_KDB_MODULES +extern struct list_head *xfs_get_buftarg_list(void); +#endif + +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) + +#endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000..572494fa --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. 
This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (!blk_queue_discard(q)) + return -XFS_ERROR(EOPNOTSUPP); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. 
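This handler is reached through the generic FITRIM ioctl, which hands it the fstrim_range copied in above. A minimal userspace sketch of a caller (not part of this patch; the mount point path is a placeholder):

/* Illustrative only: trim every free extent of a mounted XFS filesystem. */
#include <fcntl.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int example_trim_all(const char *mountpoint)
{
	struct fstrim_range range = { .start = 0, .len = ULLONG_MAX, .minlen = 0 };
	int fd = open(mountpoint, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, FITRIM, &range);	/* on success range.len holds bytes trimmed */
	close(fd);
	return ret;
}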
+ */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} + +int +xfs_discard_extents( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_busy_extent *busyp; + int error = 0; + + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0); + if (error && error != EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llu,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + return error; + } + } + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000..344879ae --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,10 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; +struct list_head; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c new file mode 100644 index 00000000..fed3f3c8 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_export.h" +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +/* + * Note that we only accept fileids which are long enough rather than allow + * the parent generation number to default to zero. XFS considers zero a + * valid generation number not an invalid/wildcard value. 
+ */ +static int xfs_fileid_length(int fileid_type) +{ + switch (fileid_type) { + case FILEID_INO32_GEN: + return 2; + case FILEID_INO32_GEN_PARENT: + return 4; + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + return 3; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + return 6; + } + return 255; /* invalid */ +} + +STATIC int +xfs_fs_encode_fh( + struct dentry *dentry, + __u32 *fh, + int *max_len, + int connectable) +{ + struct fid *fid = (struct fid *)fh; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; + struct inode *inode = dentry->d_inode; + int fileid_type; + int len; + + /* Directories don't need their parent encoded, they have ".." */ + if (S_ISDIR(inode->i_mode) || !connectable) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + + /* + * If the the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + fileid_type |= XFS_FILEID_TYPE_64FLAG; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(fileid_type); + if (*max_len < len) { + *max_len = len; + return 255; + } + *max_len = len; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + spin_lock(&dentry->d_lock); + fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; + fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; + spin_unlock(&dentry->d_lock); + /*FALLTHRU*/ + case FILEID_INO32_GEN: + fid->i32.ino = XFS_I(inode)->i_ino; + fid->i32.gen = inode->i_generation; + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + spin_lock(&dentry->d_lock); + fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; + fid64->parent_gen = dentry->d_parent->d_inode->i_generation; + spin_unlock(&dentry->d_lock); + /*FALLTHRU*/ + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + fid64->ino = XFS_I(inode)->i_ino; + fid64->gen = inode->i_generation; + break; + } + + return fileid_type; +} + +STATIC struct inode * +xfs_nfs_get_inode( + struct super_block *sb, + u64 ino, + u32 generation) + { + xfs_mount_t *mp = XFS_M(sb); + xfs_inode_t *ip; + int error; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return ERR_PTR(-ESTALE); + + /* + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. + */ + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error) { + /* + * EINVAL means the inode cluster doesn't exist anymore. + * This implies the filehandle is stale, so we should + * translate it here. + * We don't use ESTALE directly down the chain to not + * confuse applications using bulkstat that expect EINVAL. 
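The lengths returned by xfs_fileid_length() are counted in 32-bit words, the unit the exportfs *max_len argument uses; the largest XFS variant, type 0x82, therefore needs six words, i.e. 24 bytes of fileid data. A small restatement (the byte figures follow directly from the function above):

/* Illustrative only: fileid payload size in bytes for a given handle type. */
static int example_fileid_bytes(int fileid_type)
{
	/* FILEID_INO32_GEN -> 8, 0x81 -> 12, FILEID_INO32_GEN_PARENT -> 16, 0x82 -> 24 */
	return xfs_fileid_length(fileid_type) * sizeof(__u32);
}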
+ */ + if (error == EINVAL) + error = ESTALE; + return ERR_PTR(-error); + } + + if (ip->i_d.di_gen != generation) { + IRELE(ip); + return ERR_PTR(-ENOENT); + } + + return VFS_I(ip); +} + +STATIC struct dentry * +xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + case FILEID_INO32_GEN: + inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, + fid->i32.parent_gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->parent_ino, + fid64->parent_gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_get_parent( + struct dentry *child) +{ + int error; + struct xfs_inode *cip; + + error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); + if (unlikely(error)) + return ERR_PTR(-error); + + return d_obtain_alias(VFS_I(cip)); +} + +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, NULL); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return error; +} + +const struct export_operations xfs_export_operations = { + .encode_fh = xfs_fs_encode_fh, + .fh_to_dentry = xfs_fs_fh_to_dentry, + .fh_to_parent = xfs_fs_fh_to_parent, + .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, +}; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h new file mode 100644 index 00000000..3272b6ae --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXPORT_H__ +#define __XFS_EXPORT_H__ + +/* + * Common defines for code related to exporting XFS filesystems over NFS. + * + * The NFS fileid goes out on the wire as an array of + * 32bit unsigned ints in host order. There are 5 possible + * formats. 
+ * + * (1) fileid_type=0x00 + * (no fileid data; handled by the generic code) + * + * (2) fileid_type=0x01 + * inode-num + * generation + * + * (3) fileid_type=0x02 + * inode-num + * generation + * parent-inode-num + * parent-generation + * + * (4) fileid_type=0x81 + * inode-num-lo32 + * inode-num-hi32 + * generation + * + * (5) fileid_type=0x82 + * inode-num-lo32 + * inode-num-hi32 + * generation + * parent-inode-num-lo32 + * parent-inode-num-hi32 + * parent-generation + * + * Note, the NFS filehandle also includes an fsid portion which + * may have an inode number in it. That number is hardcoded to + * 32bits and there is no way for XFS to intercept it. In + * practice this means when exporting an XFS filesystem with 64bit + * inodes you should either export the mountpoint (rather than + * a subdirectory) or use the "fsid" export option. + */ + +struct xfs_fid64 { + u64 ino; + u32 gen; + u64 parent_ino; + u32 parent_gen; +} __attribute__((packed)); + +/* This flag goes on the wire. Don't play with it. */ +#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ + +#endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c new file mode 100644 index 00000000..b679198d --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -0,0 +1,1114 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_trans.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_vnodeops.h" +#include "xfs_da_btree.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" + +#include +#include + +static const struct vm_operations_struct xfs_file_vm_ops; + +/* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. 
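The xfs_rw_ilock helpers above exist to pin down lock ordering on the read and write paths: when the exclusive iolock is wanted, the VFS i_mutex is taken first and released last. A sketch of the resulting critical-section shape (illustrative; the real callers appear later in this file):

/* Illustrative only: exclusive-I/O critical section using the helpers above. */
static void example_exclusive_io(struct xfs_inode *ip)
{
	xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);	/* i_mutex first, then the iolock */
	/* ... work serialised against other readers and writers ... */
	xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);	/* iolock dropped first, then i_mutex */
}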
If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ +{ + struct page *page; + struct address_space *mapping; + int status; + + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return (-status); +} + +STATIC int +xfs_file_fsync( + struct file *file, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error = 0; + int log_flushed = 0; + + trace_xfs_file_fsync(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + xfs_ioend_wait(ip); + + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache of the device used for file data + * first. This is to ensure newly written file data makes + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + + /* + * We always need to make sure that the required inode state is safe on + * disk. The inode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. + * + * This code relies on the assumption that if the i_update_core field + * of the inode is clear and the inode is unpinned then it is clean + * and no action is required. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + + /* + * First check if the VFS inode is marked dirty. All the dirtying + * of non-transactional updates now goes through mark_inode_dirty*, + * which allows us to distinguish between pure timestamp updates + * and i_size updates which need to be caught for fdatasync. + * After that also check for the dirty state in the XFS inode, which + * might get cleared when the inode gets written out via the AIL + * or xfs_iflush_cluster. + */ + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && + ip->i_update_core) { + /* + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log.
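 + * (For illustration: the transaction built below logs only XFS_ILOG_CORE and + * is marked synchronous, so committing it both captures the in-core inode + * fields and waits for the resulting log I/O to complete.)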
+ */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. + */ + xfs_trans_ijoin(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = _xfs_trans_commit(tp, 0, &log_flushed); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + /* + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. + */ + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(mp, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + } + + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + + return -error; +} + +STATIC ssize_t +xfs_file_aio_read( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + size_t size = 0; + ssize_t ret = 0; + int ioflags = 0; + xfs_fsize_t n; + unsigned long seg; + + XFS_STATS_INC(xs_read_calls); + + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((iocb->ki_pos & target->bt_smask) || + (size & target->bt_smask)) { + if (iocb->ki_pos == ip->i_size) + return 0; + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + if (n <= 0 || size == 0) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. 
We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + + if (inode->i_mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, + (iocb->ki_pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } + } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } + + trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + + ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + unsigned int flags) +{ + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + + if (infilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * couuld cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. 
+ */ +STATIC ssize_t +xfs_file_splice_write( + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t count, + unsigned int flags) +{ + struct inode *inode = outfilp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; + int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); + + if (outfilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); + + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + xfs_inode_t *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp = ip->i_mount; + int nimaps; + int zero_offset; + int zero_len; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. 
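 + * (As implemented below there is no fill argument; holes and unwritten + * extents are simply skipped, so only allocated, written blocks get zeroed.)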
+ */ + +int /* error (positive) */ +xfs_zero_eof( + xfs_inode_t *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, offset, isize); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) { + goto out_lock; + } + + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + return 0; + +out_lock: + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. 
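 + * (Note: if generic_write_checks() fails, the iolock is dropped here as well + * and @iolock is cleared to 0 before returning.)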
+ */ +STATIC ssize_t +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; + int error = 0; + + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } + + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); + + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); + +} + +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } + + if (mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; + } + + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); + + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; + + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* + */ + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; + } + current->backing_dev_info = NULL; + return ret; +} + +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; + + XFS_STATS_INC(xs_write_calls); + + BUG_ON(iocb->ki_pos != pos); + + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; + + if (ocount == 0) + return 0; + + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + + 
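 + /* + * Both the direct and buffered paths come back with whatever iolock they + * still hold recorded in @iolock. Fold any new EOF into the in-core inode + * size first, then deal with the sync-write cases below. + */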
xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); + + if (ret <= 0) + goto out_unlock; + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error, error2; + + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); + + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 0 : 1); + if (error) + ret = error; + else if (error2) + ret = error2; + } + +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; +} + +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; + int attr_flags = XFS_ATTR_NOLOCK; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + if (file->f_flags & O_DSYNC) + attr_flags |= XFS_ATTR_SYNC; + + error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + + +STATIC int +xfs_file_open( + struct inode *inode, + struct file *file) +{ + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EFBIG; + if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + return -EIO; + return 0; +} + +STATIC int +xfs_dir_open( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + int mode; + int error; + + error = xfs_file_open(inode, file); + if (error) + return error; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. + */ + mode = xfs_ilock_map_shared(ip); + if (ip->i_d.di_nextents > 0) + xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_iunlock(ip, mode); + return 0; +} + +STATIC int +xfs_file_release( + struct inode *inode, + struct file *filp) +{ + return -xfs_release(XFS_I(inode)); +} + +STATIC int +xfs_file_readdir( + struct file *filp, + void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + xfs_inode_t *ip = XFS_I(inode); + int error; + size_t bufsize; + + /* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. For now we use the current glibc buffer size. 
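 + * (That is the 32768-byte cap applied below; directories smaller than that + * simply use their on-disk size as the estimate.)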
+ */ + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); + + error = xfs_readdir(ip, dirent, bufsize, + (xfs_off_t *)&filp->f_pos, filldir); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + vma->vm_ops = &xfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + file_accessed(filp); + return 0; +} + +/* + * mmap()d file has taken write protection fault and is being made + * writable. We can set the page state up correctly for a writable + * page, which means we can do correct delalloc accounting (ENOSPC + * checking!) and unwritten extent mapping. + */ +STATIC int +xfs_vm_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + return block_page_mkwrite(vma, vmf, xfs_get_blocks); +} + +const struct file_operations xfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = xfs_file_aio_read, + .aio_write = xfs_file_aio_write, + .splice_read = xfs_file_splice_read, + .splice_write = xfs_file_splice_write, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .mmap = xfs_file_mmap, + .open = xfs_file_open, + .release = xfs_file_release, + .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, +}; + +const struct file_operations xfs_dir_file_operations = { + .open = xfs_dir_open, + .read = generic_read_dir, + .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .fsync = xfs_file_fsync, +}; + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = xfs_vm_page_mkwrite, +}; diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c new file mode 100644 index 00000000..ed88ed16 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_trace.h" + +/* + * note: all filemap functions return negative error codes. These + * need to be inverted before returning to the xfs core functions. 
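 + * (Hence the explicit negations below, e.g. the "return -ret" in + * xfs_flushinval_pages() and the negated filemap_fdatawrite_range() and + * filemap_fdatawait_range() calls.)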
+ */ +void +xfs_tosspages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + /* can't toss partial tail pages, so mask them out */ + last &= ~(PAGE_SIZE - 1); + truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); +} + +int +xfs_flushinval_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + int ret = 0; + + trace_xfs_pagecache_inval(ip, first, last); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = filemap_write_and_wait_range(mapping, first, + last == -1 ? LLONG_MAX : last); + if (!ret) + truncate_inode_pages_range(mapping, first, last); + return -ret; +} + +int +xfs_flush_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last, + uint64_t flags, + int fiopt) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + int ret = 0; + int ret2; + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = -filemap_fdatawrite_range(mapping, first, + last == -1 ? LLONG_MAX : last); + if (flags & XBF_ASYNC) + return ret; + ret2 = xfs_wait_on_pages(ip, first, last); + if (!ret) + ret = ret2; + return ret; +} + +int +xfs_wait_on_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + return -filemap_fdatawait_range(mapping, first, + last == -1 ? ip->i_size - 1 : last); + } + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c new file mode 100644 index 00000000..76e81cff --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sysctl.h" + +/* + * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, + * other XFS code uses these values. Times are measured in centisecs (i.e. + * 100ths of a second). + */ +xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, + .panic_mask = { 0, 0, 255 }, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, + .inherit_sync = { 0, 1, 1 }, + .inherit_nodump = { 0, 1, 1 }, + .inherit_noatim = { 0, 1, 1 }, + .xfs_buf_timer = { 100/2, 1*100, 30*100 }, + .xfs_buf_age = { 1*100, 15*100, 7200*100}, + .inherit_nosym = { 0, 0, 1 }, + .rotorstep = { 1, 1, 255 }, + .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 30*100, 3600*100}, +}; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c new file mode 100644 index 00000000..acca2c5c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -0,0 +1,1556 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ioctl.h" +#include "xfs_rtalloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_dfrag.h" +#include "xfs_fsops.h" +#include "xfs_vnodeops.h" +#include "xfs_discard.h" +#include "xfs_quota.h" +#include "xfs_inode_item.h" +#include "xfs_export.h" +#include "xfs_trace.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct file *file = NULL; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + file = fget(hreq->fd); + if (!file) + return -EBADF; + inode = file->f_path.dentry->d_inode; + } else { + error = user_lpath((const char __user *)hreq->path, &path); + if (error) + return error; + inode = path.dentry->d_inode; + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. 
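 + * (In other words, an fshandle is just the fsid copied above; a full file + * handle additionally carries the xfs_fid_t with inode number and + * generation, as filled in by the else branch.)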
+ */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { + int lock_mode; + + lock_mode = xfs_ilock_map_shared(ip); + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_ino = ip->i_ino; + xfs_iunlock_map_shared(ip, lock_mode); + + hsize = XFS_HSIZE(handle); + } + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fput(file); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + struct xfs_fid64 fid; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + if (handle.ha_fid.fid_len != + sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) + return ERR_PTR(-EINVAL); + + memset(&fid, 0, sizeof(struct fid)); + fid.ino = handle.ha_fid.fid_ino; + fid.gen = handle.ha_fid.fid_gen; + + return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} + +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); +} + +int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + const struct cred *cred = current_cred(); + int error; + int fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = dentry->d_inode; + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + error = -XFS_ERROR(EPERM); + goto out_dput; + } + +#if BITS_PER_LONG != 32 + hreq->oflags |= O_LARGEFILE; +#endif + + /* Put open permission in namei format. */ + permflag = hreq->oflags; + if ((permflag+1) & O_ACCMODE) + permflag++; + if (permflag & O_TRUNC) + permflag |= 2; + + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (permflag & FMODE_WRITE) && IS_APPEND(inode)) { + error = -XFS_ERROR(EPERM); + goto out_dput; + } + + if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + error = -XFS_ERROR(EACCES); + goto out_dput; + } + + /* Can't write directories. 
*/ + if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + error = -XFS_ERROR(EISDIR); + goto out_dput; + } + + fd = get_unused_fd(); + if (fd < 0) { + error = fd; + goto out_dput; + } + + filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), + hreq->oflags, cred); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (inode->i_mode & S_IFREG) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +/* + * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's + * unused first argument. + */ +STATIC int +do_readlink( + char __user *buffer, + int buflen, + const char *link) +{ + int len; + + len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; + out: + return len; +} + + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + void *link; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!S_ISLNK(dentry->d_inode->i_mode)) { + error = -XFS_ERROR(EINVAL); + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -XFS_ERROR(EFAULT); + goto out_dput; + } + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) { + error = -XFS_ERROR(ENOMEM); + goto out_dput; + } + + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (error) + goto out_kfree; + error = do_readlink(hreq->ohandle, olen, link); + if (error) + goto out_kfree; + + out_kfree: + kfree(link); + out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + + out: + dput(dentry); + return error; +} + +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error = -ENOMEM; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. 
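 + * (Roughly: ATTR_ROOT selects the trusted namespace and ATTR_SECURE the + * security namespace; with neither flag set the user namespace is listed.)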
+ */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_dput: + dput(dentry); + return error; +} + +int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = EFAULT; + + if (*len > XATTR_SIZE_MAX) + return EINVAL; + kbuf = kmalloc(*len, GFP_KERNEL); + if (!kbuf) + return ENOMEM; + + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + if (error) + goto out_kfree; + + if (copy_to_user(ubuf, kbuf, *len)) + error = EFAULT; + + out_kfree: + kfree(kbuf); + return error; +} + +int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = EFAULT; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return EPERM; + if (len > XATTR_SIZE_MAX) + return EINVAL; + + kbuf = memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + + return error; +} + +int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags) +{ + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return EPERM; + return xfs_attr_remove(XFS_I(inode), name, flags); +} + +STATIC int +xfs_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + case ATTR_OP_REMOVE: 
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return -error; +} + +int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf) +{ + int attr_flags = 0; + int error; + + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) + return -XFS_ERROR(EPERM); + + if (!(filp->f_mode & FMODE_WRITE)) + return -XFS_ERROR(EBADF); + + if (!S_ISREG(inode->i_mode)) + return -XFS_ERROR(EINVAL); + + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= XFS_ATTR_NONBLOCK; + + if (filp->f_flags & O_DSYNC) + attr_flags |= XFS_ATTR_SYNC; + + if (ioflags & IO_INVIS) + attr_flags |= XFS_ATTR_DMI; + + error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + return -error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -XFS_ERROR(EFAULT); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); + + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. 
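 + * (Because the v1 structure is a prefix of the full xfs_fsop_geom_t, copying + * back only sizeof(xfs_fsop_geom_v1_t) bytes gives the v1 caller exactly the + * layout it expects.)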
+ */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* + * Linux extended inode flags interface. + */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & FS_IMMUTABLE_FL) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & FS_SYNC_FL) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & FS_NOATIME_FL) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & FS_NODUMP_FL) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= FS_SYNC_FL; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= FS_NOATIME_FL; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= FS_NODUMP_FL; + return flags; +} + +STATIC int +xfs_ioc_fsgetxattr( + xfs_inode_t *ip, + int attr, + void __user *arg) +{ + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + fa.fsx_xflags = xfs_ip2xflags(ip); + fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_projid = xfs_get_projid(ip); + + if (attr) { + if (ip->i_afp) { + if (ip->i_afp->if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_afp->if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_anextents; + } else + fa.fsx_nextents = 0; + } else { + if (ip->i_df.if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_df.if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_nextents; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + 
di_flags |= XFS_DIFLAG_EXTSIZE; + } + + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +#define FSX_PROJID 1 +#define FSX_EXTSIZE 2 +#define FSX_XFLAGS 4 +#define FSX_NONBLOCK 8 + +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa, + int mask) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int lock_flags = 0; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + trace_xfs_ioctl_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Disallow 32bit project ids when projid32bit feature is not enabled. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + return XFS_ERROR(EINVAL); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { + code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, &gdqp); + if (code) + return code; + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (code) + goto error_return; + + lock_flags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_flags); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal + * to the file owner ID, except in cases where the + * CAP_FSETID capability is applicable. + */ + if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Do a quota reservation only if projid is actually going to change. + */ + if (mask & FSX_PROJID) { + if (XFS_IS_QUOTA_RUNNING(mp) && + XFS_IS_PQUOTA_ON(mp) && + xfs_get_projid(ip) != fa->fsx_projid) { + ASSERT(tp); + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + if (mask & FSX_EXTSIZE) { + /* + * Can't change extent size if any extents are allocated. + */ + if (ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + fa->fsx_extsize)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * Extent size must be a multiple of the appropriate block + * size, if set at all. It must also be smaller than the + * maximum extent size supported by the filesystem. 
+ * + * Also, for non-realtime files, limit the extent size hint to + * half the size of the AGs in the filesystem so alignment + * doesn't result in extents larger than an AG. + */ + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + if (XFS_IS_REALTIME_INODE(ip) || + ((mask & FSX_XFLAGS) && + (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { + size = mp->m_sb.sb_rextsize << + mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + if (fa->fsx_extsize % size) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + } + + + if (mask & FSX_XFLAGS) { + /* + * Can't change realtime flag if any extents are allocated. + */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + (XFS_IS_REALTIME_INODE(ip)) != + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * If realtime flag is set then must have realtime data. + */ + if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + if ((mp->m_sb.sb_rblocks == 0) || + (mp->m_sb.sb_rextsize == 0) || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if ((ip->i_d.di_flags & + (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || + (fa->fsx_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } + + xfs_trans_ijoin(tp, ip); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & FSX_PROJID) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (xfs_get_projid(ip) != fa->fsx_projid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + olddquot = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + xfs_set_projid(ip, fa->fsx_projid); + + /* + * We may have to rev the inode as well as + * the superblock version number since projids didn't + * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. + */ + if (ip->i_d.di_version == 1) + xfs_bump_ino_vers2(tp, ip); + } + + } + + if (mask & FSX_EXTSIZE) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + if (mask & FSX_XFLAGS) { + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + } + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. 
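 + * (On a wsync mount the commit below is therefore made synchronous, so the + * attribute change is stable in the log before the ioctl returns.)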
+ */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + code = xfs_trans_commit(tp, 0); + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + return code; + + error_return: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_trans_cancel(tp, 0); + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return code; +} + +STATIC int +xfs_ioc_fssetxattr( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int mask; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + + return -xfs_ioctl_setattr(ip, &fa, mask); +} + +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; + + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_setxflags( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int flags; + unsigned int mask; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; + + mask = FSX_XFLAGS; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + + return -xfs_ioctl_setattr(ip, &fa, mask); +} + +STATIC int +xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmap __user *base = *ap; + + /* copy only getbmap portion (not getbmapx) */ + if (copy_to_user(base, bmv, sizeof(struct getbmap))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmap); + return 0; +} + +STATIC int +xfs_ioc_getbmap( + struct xfs_inode *ip, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & IO_INVIS) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + (struct getbmap *)arg+1); + if (error) + return -error; + + /* copy back header - only size of getbmap */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmapx __user *base = *ap; + + if (copy_to_user(base, bmv, sizeof(struct getbmapx))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmapx); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + struct xfs_inode *ip, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + if (bmx.bmv_iflags & (~BMV_IF_VALID)) + return -XFS_ERROR(EINVAL); + + error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, + (struct getbmapx *)arg+1); + if (error) + return -error; + + /* copy back header */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) + return -XFS_ERROR(EFAULT); + + return 0; +} + +/* + * Note: some of the ioctl's return positive numbers as a + * byte count indicating success, such as readlink_by_handle. 
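One note on xfs_ioc_getxflags()/xfs_ioc_setxflags() above: they back the generic chattr-style flag ioctls and accept only the immutable, append, noatime, nodump and sync bits. From user space the interface is FS_IOC_GETFLAGS/FS_IOC_SETFLAGS from <linux/fs.h>; a minimal sketch:

/* Minimal sketch: mark a file append-only through the generic flag
 * ioctls.  XFS copies only 32 bits regardless of the ioctl's nominal
 * 'long' argument, so an unsigned int matches the handler above.
 * Needs CAP_LINUX_IMMUTABLE, as checked in xfs_ioctl_setattr(). */
#include <linux/fs.h>
#include <sys/ioctl.h>

static int make_append_only(int fd)
{
        unsigned int flags;

        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
                return -1;
        flags |= FS_APPEND_FL;          /* one of the bits XFS accepts */
        return ioctl(fd, FS_IOC_SETFLAGS, &flags);
}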
+ * So we don't "sign flip" like most other routines. This means + * true errors need to be returned as a negative value. + */ +long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + trace_xfs_file_ioctl(ip); + + switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { + xfs_flock64_t bf; + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -XFS_ERROR(EFAULT); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(filp, arg); + + case XFS_IOC_READLINK_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(filp, arg); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(filp, arg); + + case XFS_IOC_SWAPEXT: { + struct xfs_swapext sxp; + + if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) + return -XFS_ERROR(EFAULT); + error = xfs_swapext(&sxp); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, 
&out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + return -error; + } + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp, 1); + return -error; + + default: + return -ENOTTY; + } +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h new file mode 100644 index 00000000..d56173b3 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
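Returning to xfs_file_ioctl() for a moment: the XFS_IOC_DIOINFO case reports the memory and I/O alignment required for direct I/O on this inode's device. A hedged user-space sketch, assuming struct dioattr and the ioctl number come from the XFS user headers:

/* Sketch: query direct-I/O constraints before doing O_DIRECT reads or
 * writes.  Assumes struct dioattr and XFS_IOC_DIOINFO from <xfs/xfs_fs.h>. */
#include <xfs/xfs_fs.h>
#include <sys/ioctl.h>
#include <stdio.h>

static void print_dio_geometry(int fd)
{
        struct dioattr da;

        if (ioctl(fd, XFS_IOC_DIOINFO, &da) == 0)
                printf("mem align %u, min io %u, max io %u\n",
                       da.d_mem, da.d_miniosz, da.d_maxiosz);
}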
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL_H__ +#define __XFS_IOCTL_H__ + +extern int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf); + +extern int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags); + +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + +extern long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p); + +extern long +xfs_file_compat_ioctl( + struct file *file, + unsigned int cmd, + unsigned long arg); + +#endif diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c new file mode 100644 index 00000000..54e623bf --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -0,0 +1,672 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
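Neither xfs_file_ioctl() nor the 32-bit compat entry point implemented in this file is called directly; both are plugged into the file operations table. A rough orientation sketch of that wiring follows; the real table, with its many other methods, lives in xfs_file.c.

/* Orientation sketch only: how the two ioctl entry points are hooked up.
 * The structure name here is illustrative, not the real xfs table. */
#include <linux/fs.h>
#include "xfs_ioctl.h"

static const struct file_operations example_xfs_file_operations = {
        .unlocked_ioctl = xfs_file_ioctl,        /* native-width callers */
#ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl, /* 32-bit callers on a 64-bit kernel */
#endif
        /* read/write/mmap/fsync etc. omitted */
};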
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_vnode.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_dfrag.h" +#include "xfs_vnodeops.h" +#include "xfs_fsops.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_ioctl32.h" +#include "xfs_trace.h" + +#define _NATIVE_IOC(cmd, type) \ + _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) + +#ifdef BROKEN_X86_ALIGNMENT +STATIC int +xfs_compat_flock64_copyin( + xfs_flock64_t *bf, + compat_xfs_flock64_t __user *arg32) +{ + if (get_user(bf->l_type, &arg32->l_type) || + get_user(bf->l_whence, &arg32->l_whence) || + get_user(bf->l_start, &arg32->l_start) || + get_user(bf->l_len, &arg32->l_len) || + get_user(bf->l_sysid, &arg32->l_sysid) || + get_user(bf->l_pid, &arg32->l_pid) || + copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_ioc_fsgeometry_v1( + struct xfs_mount *mp, + compat_xfs_fsop_geom_v1_t __user *arg32) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + /* The 32-bit variant simply has some padding at the end */ + if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_data_copyin( + struct xfs_growfs_data *in, + compat_xfs_growfs_data_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->imaxpct, &arg32->imaxpct)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_rt_copyin( + struct xfs_growfs_rt *in, + compat_xfs_growfs_rt_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->extsize, &arg32->extsize)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_inumbers_fmt_compat( + void __user *ubuffer, + const xfs_inogrp_t *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -XFS_ERROR(EFAULT); + } + *written = count * sizeof(*p32); + return 0; +} + +#else +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#endif /* BROKEN_X86_ALIGNMENT */ + +STATIC int +xfs_ioctl32_bstime_copyin( + xfs_bstime_t *bstime, + compat_xfs_bstime_t __user *bstime32) +{ + compat_time_t sec32; /* tv_sec differs on 64 vs. 
32 */ + + if (get_user(sec32, &bstime32->tv_sec) || + get_user(bstime->tv_nsec, &bstime32->tv_nsec)) + return -XFS_ERROR(EFAULT); + bstime->tv_sec = sec32; + return 0; +} + +/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +STATIC int +xfs_ioctl32_bstat_copyin( + xfs_bstat_t *bstat, + compat_xfs_bstat_t __user *bstat32) +{ + if (get_user(bstat->bs_ino, &bstat32->bs_ino) || + get_user(bstat->bs_mode, &bstat32->bs_mode) || + get_user(bstat->bs_nlink, &bstat32->bs_nlink) || + get_user(bstat->bs_uid, &bstat32->bs_uid) || + get_user(bstat->bs_gid, &bstat32->bs_gid) || + get_user(bstat->bs_rdev, &bstat32->bs_rdev) || + get_user(bstat->bs_blksize, &bstat32->bs_blksize) || + get_user(bstat->bs_size, &bstat32->bs_size) || + xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) || + get_user(bstat->bs_blocks, &bstat32->bs_size) || + get_user(bstat->bs_xflags, &bstat32->bs_size) || + get_user(bstat->bs_extsize, &bstat32->bs_extsize) || + get_user(bstat->bs_extents, &bstat32->bs_extents) || + get_user(bstat->bs_gen, &bstat32->bs_gen) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || + get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || + get_user(bstat->bs_aextents, &bstat32->bs_aextents)) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* XFS_IOC_FSBULKSTAT and friends */ + +STATIC int +xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* Return 0 on success or positive error (to xfs_bulkstat()) */ +STATIC int +xfs_bulkstat_one_fmt_compat( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (ubsize < sizeof(*p32)) + return XFS_ERROR(ENOMEM); + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*p32); + return 0; +} + +STATIC int +xfs_bulkstat_one_compat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int 
*ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt_compat, + ubused, stat); +} + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_compat_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + compat_xfs_fsop_bulkreq_t __user *p32) +{ + u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (get_user(addr, &p32->lastip)) + return -XFS_ERROR(EFAULT); + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) + return -XFS_ERROR(EFAULT); + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -XFS_ERROR(EFAULT); + bulkreq.ocount = compat_ptr(addr); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS_32) { + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { + int res; + + error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, + sizeof(compat_xfs_bstat_t), 0, &res); + } else if (cmd == XFS_IOC_FSBULKSTAT_32) { + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); + } else + error = XFS_ERROR(EINVAL); + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_compat_handlereq_copyin( + xfs_fsop_handlereq_t *hreq, + compat_xfs_fsop_handlereq_t __user *arg32) +{ + compat_xfs_fsop_handlereq_t hreq32; + + if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + hreq->fd = hreq32.fd; + hreq->path = compat_ptr(hreq32.path); + hreq->oflags = hreq32.oflags; + hreq->ihandle = compat_ptr(hreq32.ihandle); + hreq->ihandlen = hreq32.ihandlen; + hreq->ohandle = compat_ptr(hreq32.ohandle); + hreq->ohandlen = compat_ptr(hreq32.ohandlen); + + return 0; +} + +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); +} + +STATIC int +xfs_compat_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, + sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. 
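All of the compat copy-in helpers in this file share one idiom: a 32-bit user pointer arrives as a compat_uptr_t, is fetched with get_user(), and is widened with compat_ptr() before being used as an ordinary __user pointer. A stripped-down sketch of that idiom, with hypothetical structure names:

/* Generic shape of the compat copy-in pattern used above.  The structure
 * and function names are hypothetical; the compat_uptr_t/compat_ptr()
 * conversion is the point. */
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct compat_example_req {
        compat_uptr_t   buffer;         /* 32-bit user pointer */
        __u32           buflen;
};

struct example_req {
        void __user     *buffer;        /* native-width user pointer */
        __u32           buflen;
};

static int example_req_copyin(struct example_req *req,
                              struct compat_example_req __user *arg32)
{
        compat_uptr_t   addr;

        if (get_user(addr, &arg32->buffer) ||
            get_user(req->buflen, &arg32->buflen))
                return -EFAULT;
        req->buffer = compat_ptr(addr); /* widen the 32-bit pointer */
        return 0;
}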
+ */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -ENOMEM; + kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + compat_xfs_attr_multiop_t *ops; + compat_xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, + sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = E2BIG; + size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + compat_ptr(ops[i].am_attrname), + MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + compat_ptr(ops[i].am_attrvalue), + ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return -error; +} + +STATIC int +xfs_compat_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + compat_xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, + sizeof(compat_xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if 
(copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + +out: + dput(dentry); + return error; +} + +long +xfs_file_compat_ioctl( + struct file *filp, + unsigned cmd, + unsigned long p) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + trace_xfs_file_compat_ioctl(ip); + + switch (cmd) { + /* No size or alignment issues on any arch */ + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + return xfs_file_ioctl(filp, cmd, p); +#ifndef BROKEN_X86_ALIGNMENT + /* These are handled fine if no alignment issues */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: + return xfs_file_ioctl(filp, cmd, p); +#else + case XFS_IOC_ALLOCSP_32: + case XFS_IOC_FREESP_32: + case XFS_IOC_ALLOCSP64_32: + case XFS_IOC_FREESP64_32: + case XFS_IOC_RESVSP_32: + case XFS_IOC_UNRESVSP_32: + case XFS_IOC_RESVSP64_32: + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { + struct xfs_flock64 bf; + + if (xfs_compat_flock64_copyin(&bf, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct xfs_flock64); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_FSGEOMETRY_V1_32: + return xfs_compat_ioc_fsgeometry_v1(mp, arg); + case XFS_IOC_FSGROWFSDATA_32: { + struct xfs_growfs_data in; + + if (xfs_compat_growfs_data_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = xfs_growfs_data(mp, &in); + return -error; + } + case XFS_IOC_FSGROWFSRT_32: { + struct xfs_growfs_rt in; + + if (xfs_compat_growfs_rt_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = xfs_growfs_rt(mp, &in); + return -error; + } +#endif + /* long changes size, but xfs only copiese out 32 bits */ + case XFS_IOC_GETXFLAGS_32: + case XFS_IOC_SETXFLAGS_32: + case XFS_IOC_GETVERSION_32: + cmd = _NATIVE_IOC(cmd, long); + return xfs_file_ioctl(filp, cmd, p); + case XFS_IOC_SWAPEXT_32: { + struct xfs_swapext sxp; + struct compat_xfs_swapext __user *sxu = arg; + + /* Bulk copy in up to the sx_stat field, then copy bstat */ + if (copy_from_user(&sxp, sxu, + offsetof(struct xfs_swapext, sx_stat)) || + xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) + return -XFS_ERROR(EFAULT); + error = xfs_swapext(&sxp); + return -error; + } + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + return xfs_compat_ioc_bulkstat(mp, cmd, arg); + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct 
xfs_fsop_handlereq); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_READLINK_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE_32: + return xfs_compat_attrlist_by_handle(filp, arg); + case XFS_IOC_ATTRMULTI_BY_HANDLE_32: + return xfs_compat_attrmulti_by_handle(filp, arg); + case XFS_IOC_FSSETDM_BY_HANDLE_32: + return xfs_compat_fssetdm_by_handle(filp, arg); + default: + return -XFS_ERROR(ENOIOCTLCMD); + } +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h new file mode 100644 index 00000000..80f4060e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL32_H__ +#define __XFS_IOCTL32_H__ + +#include + +/* + * on 32-bit arches, ioctl argument structures may have different sizes + * and/or alignment. We define compat structures which match the + * 32-bit sizes/alignments here, and their associated ioctl numbers. + * + * xfs_ioctl32.c contains routines to copy these structures in and out. + */ + +/* stock kernel-level ioctls we support */ +#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS +#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS +#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION + +/* + * On intel, even if sizes match, alignment and/or padding may differ. 
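Concretely, i386 aligns 64-bit members on 4-byte boundaries while x86-64 uses 8, so the same field list can produce a different size and different member offsets on the two ABIs; that is what BROKEN_X86_ALIGNMENT and __compat_packed compensate for. A small stand-alone illustration, not taken from the kernel:

/* Illustration of the alignment difference behind the packed compat
 * structures.  On i386 the layout below is 12 bytes with no padding; on
 * x86-64 the compiler inserts 4 bytes of padding after 'count' and the
 * struct grows to 16 bytes unless it is declared packed. */
#include <stdio.h>
#include <stdint.h>

struct demo_native {
        uint32_t count;
        uint64_t offset;                /* 8-byte aligned on x86-64, 4 on i386 */
};

struct demo_packed {
        uint32_t count;
        uint64_t offset;
} __attribute__((packed));              /* same 12-byte layout everywhere */

int main(void)
{
        printf("native %zu bytes, packed %zu bytes\n",
               sizeof(struct demo_native), sizeof(struct demo_packed));
        return 0;
}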
+ */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#define __compat_packed __attribute__((packed)) +#else +#define __compat_packed +#endif + +typedef struct compat_xfs_bstime { + compat_time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[12]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} __compat_packed compat_xfs_bstat_t; + +typedef struct compat_xfs_fsop_bulkreq { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; + +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +/* The bstat field in the swapext struct needs translation */ +typedef struct compat_xfs_swapext { + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} __compat_packed compat_xfs_swapext_t; + +#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) + +typedef struct compat_xfs_fsop_attrlist_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to 
use */ + __u32 buflen; /* length of buffer supplied */ + compat_uptr_t buffer; /* returned names */ +} __compat_packed compat_xfs_fsop_attrlist_handlereq_t; + +/* Note: actually this is read/write */ +#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \ + _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq) + +/* am_opcodes defined in xfs_fs.h */ +typedef struct compat_xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + compat_uptr_t am_attrname; + compat_uptr_t am_attrvalue; + __u32 am_length; + __u32 am_flags; +} compat_xfs_attr_multiop_t; + +typedef struct compat_xfs_fsop_attrmulti_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + /* ptr to compat_xfs_attr_multiop */ + compat_uptr_t ops; /* attr_multi data */ +} compat_xfs_fsop_attrmulti_handlereq_t; + +#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ + _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) + +typedef struct compat_xfs_fsop_setdm_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle information */ + /* ptr to struct fsdmidata */ + compat_uptr_t data; /* DMAPI data */ +} compat_xfs_fsop_setdm_handlereq_t; + +#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ + _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) + +#ifdef BROKEN_X86_ALIGNMENT +/* on ia32 l_start is on a 32-bit boundary */ +typedef struct compat_xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} compat_xfs_flock64_t; + +#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) +#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) +#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) +#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) + +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR('X', 100, struct compat_xfs_fsop_geom_v1) + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 
xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +/* These growfs input structures have padding on the end, so must translate */ +typedef struct compat_xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} __attribute__((packed)) compat_xfs_growfs_data_t; + +typedef struct compat_xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} __attribute__((packed)) compat_xfs_growfs_rt_t; + +#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data) +#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt) + +#endif /* BROKEN_X86_ALIGNMENT */ + +#endif /* __XFS_IOCTL32_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c new file mode 100644 index 00000000..f5b697bf --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_acl.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. + */ +void +xfs_synchronize_times( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; +} + +/* + * If the linux inode is valid, mark it dirty, else mark the dirty state + * in the XFS inode to make sure we pick it up when reclaiming the inode. 
+ */ +void +xfs_mark_inode_dirty_sync( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) + mark_inode_dirty_sync(inode); + else { + barrier(); + ip->i_update_core = 1; + } +} + +void +xfs_mark_inode_dirty( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) + mark_inode_dirty(inode); + else { + barrier(); + ip->i_update_core = 1; + } + +} + +/* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ +STATIC int +xfs_init_security( + struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + struct xfs_inode *ip = XFS_I(inode); + size_t length; + void *value; + unsigned char *name; + int error; + + error = security_inode_init_security(inode, dir, qstr, (char **)&name, + &value, &length); + if (error) { + if (error == -EOPNOTSUPP) + return 0; + return -error; + } + + error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); + + kfree(name); + kfree(value); + return error; +} + +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; +} + +STATIC void +xfs_cleanup_inode( + struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + struct xfs_name teardown; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * xfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + xfs_dentry_to_name(&teardown, dentry); + + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); + iput(inode); +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + int mode, + dev_t rdev) +{ + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl = NULL; + struct xfs_name name; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. 
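The device-number check that follows packs the dev_t into the 32-bit System V layout XFS stores on disk: 14 bits of major above 18 bits of minor, with XFS further refusing majors that need more than 9 bits. An illustrative restatement of the packing; the real helpers are sysv_encode_dev()/sysv_major()/sysv_minor() in <linux/kdev_t.h>.

/* Illustrative restatement of the on-disk device number packing used by
 * xfs_vn_mknod() and xfs_setup_inode(): 14-bit major in the high bits,
 * 18-bit minor in the low bits.  Not the kernel implementation. */
static inline unsigned int demo_sysv_encode_dev(unsigned int major,
                                                unsigned int minor)
{
        return (minor & 0x3ffff) | (major << 18);
}

static inline unsigned int demo_sysv_major(unsigned int dev)
{
        return (dev >> 18) & 0x3fff;
}

static inline unsigned int demo_sysv_minor(unsigned int dev)
{
        return dev & 0x3ffff;
}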
+ */ + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } + + if (IS_POSIXACL(dir)) { + default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(default_acl)) + return PTR_ERR(default_acl); + + if (!default_acl) + mode &= ~current_umask(); + } + + xfs_dentry_to_name(&name, dentry); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (unlikely(error)) + goto out_free_acl; + + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + if (default_acl) { + error = -xfs_inherit_acl(inode, default_acl); + if (unlikely(error)) + goto out_cleanup_inode; + posix_acl_release(default_acl); + } + + + d_instantiate(dentry, inode); + return -error; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + out_free_acl: + posix_acl_release(default_acl); + return -error; +} + +STATIC int +xfs_vn_create( + struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + return xfs_vn_mknod(dir, dentry, mode, 0); +} + +STATIC int +xfs_vn_mkdir( + struct inode *dir, + struct dentry *dentry, + int mode) +{ + return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +xfs_vn_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct xfs_inode *cip; + struct xfs_name name; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&name, dentry); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... */ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; +} + +STATIC int +xfs_vn_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry); + + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); + if (unlikely(error)) + return -error; + + ihold(inode); + d_instantiate(dentry, inode); + return 0; +} + +STATIC int +xfs_vn_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry); + + error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. 
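One convention visible in xfs_vn_unlink() and its neighbours: the XFS core routines (xfs_remove(), xfs_link(), xfs_create(), ...) report errors as positive errnos, and these VFS entry points negate them on the way out. A tiny self-contained sketch of the pattern:

/* Stand-alone sketch of the sign-flip convention used throughout this
 * file: the XFS core returns positive errnos, the VFS expects negative
 * ones.  Function names are illustrative. */
#include <errno.h>

static int demo_core_remove(int should_fail)
{
        return should_fail ? EPERM : 0; /* core routine: positive errno */
}

static int demo_vfs_remove(int should_fail)
{
        int error = demo_core_remove(should_fail);

        return error ? -error : 0;      /* VFS wants -EPERM, not EPERM */
}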
This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; +} + +STATIC int +xfs_vn_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + mode_t mode; + + mode = S_IFLNK | + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); + xfs_dentry_to_name(&name, dentry); + + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + if (unlikely(error)) + goto out; + + inode = VFS_I(cip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + return 0; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + out: + return -error; +} + +STATIC int +xfs_vn_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry) +{ + struct inode *new_inode = ndentry->d_inode; + struct xfs_name oname; + struct xfs_name nname; + + xfs_dentry_to_name(&oname, odentry); + xfs_dentry_to_name(&nname, ndentry); + + return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + XFS_I(ndir), &nname, new_inode ? + XFS_I(new_inode) : NULL); +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... + */ +STATIC void * +xfs_vn_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + char *link; + int error = -ENOMEM; + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_err; + + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (unlikely(error)) + goto out_kfree; + + nd_set_link(nd, link); + return NULL; + + out_kfree: + kfree(link); + out_err: + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +STATIC void +xfs_vn_put_link( + struct dentry *dentry, + struct nameidata *nd, + void *p) +{ + char *s = nd_get_link(nd); + + if (!IS_ERR(s)) + kfree(s); +} + +STATIC int +xfs_vn_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + trace_xfs_getattr(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + stat->size = XFS_ISIZE(ip); + stat->dev = inode->i_sb->s_dev; + stat->mode = ip->i_d.di_mode; + stat->nlink = ip->i_d.di_nlink; + stat->uid = ip->i_d.di_uid; + stat->gid = ip->i_d.di_gid; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + stat->blksize = BLKDEV_IOSIZE; + stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * If the file blocks are being allocated from a + * realtime volume, then return the inode's realtime + * extent size or the realtime volume's extent size. 
+ */ + stat->blksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + } else + stat->blksize = xfs_preferred_iosize(mp); + stat->rdev = 0; + break; + } + + return 0; +} + +STATIC int +xfs_vn_setattr( + struct dentry *dentry, + struct iattr *iattr) +{ + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); +} + +#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +/* + * Call fiemap helper to fill in user data. + * Returns positive errors to xfs_getbmap. + */ +STATIC int +xfs_fiemap_format( + void **arg, + struct getbmapx *bmv, + int *full) +{ + int error; + struct fiemap_extent_info *fieinfo = *arg; + u32 fiemap_flags = 0; + u64 logical, physical, length; + + /* Do nothing for a hole */ + if (bmv->bmv_block == -1LL) + return 0; + + logical = BBTOB(bmv->bmv_offset); + physical = BBTOB(bmv->bmv_block); + length = BBTOB(bmv->bmv_length); + + if (bmv->bmv_oflags & BMV_OF_PREALLOC) + fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; + else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { + fiemap_flags |= FIEMAP_EXTENT_DELALLOC; + physical = 0; /* no block yet */ + } + if (bmv->bmv_oflags & BMV_OF_LAST) + fiemap_flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, fiemap_flags); + if (error > 0) { + error = 0; + *full = 1; /* user array now full */ + } + + return -error; +} + +STATIC int +xfs_vn_fiemap( + struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, + u64 length) +{ + xfs_inode_t *ip = XFS_I(inode); + struct getbmapx bm; + int error; + + error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); + if (error) + return error; + + /* Set up bmap header for xfs internal routine */ + bm.bmv_offset = BTOBB(start); + /* Special case for whole file */ + if (length == FIEMAP_MAX_OFFSET) + bm.bmv_length = -1LL; + else + bm.bmv_length = BTOBB(length); + + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) + bm.bmv_iflags |= BMV_IF_ATTRFORK; + if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) + bm.bmv_iflags |= BMV_IF_DELALLOC; + + error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); + if (error) + return -error; + + return 0; +} + +static const struct inode_operations xfs_inode_operations = { + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, +}; + +static const struct inode_operations xfs_dir_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. 
+ */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +static const struct inode_operations xfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = xfs_vn_follow_link, + .put_link = xfs_vn_put_link, + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode, set up the operation vectors and + * unlock the inode. + * + * When reading existing inodes from disk this is called directly + * from xfs_iget, when creating a new inode it is called from + * xfs_ialloc after setting up the inode. + * + * We are always called with an uninitialised linux inode here. + * We need to initialise the necessary fields and take a reference + * on it. 
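Before moving on, a user-space view of xfs_vn_fiemap() above, which maps FIEMAP requests onto xfs_getbmap(): the caller issues FS_IOC_FIEMAP with a struct fiemap from <linux/fiemap.h>. A minimal sketch, with error handling trimmed:

/* Minimal FIEMAP consumer; this is the user-space counterpart of what
 * xfs_vn_fiemap()/xfs_fiemap_format() produce. */
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <stdlib.h>

static void dump_extents(int fd)
{
        const unsigned int next = 32;
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   next * sizeof(struct fiemap_extent));
        unsigned int i;

        if (!fm)
                return;
        fm->fm_start = 0;
        fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
        fm->fm_extent_count = next;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
                for (i = 0; i < fm->fm_mapped_extents; i++)
                        printf("logical %llu physical %llu length %llu\n",
                               (unsigned long long)fm->fm_extents[i].fe_logical,
                               (unsigned long long)fm->fm_extents[i].fe_physical,
                               (unsigned long long)fm->fm_extents[i].fe_length);
        free(fm);
}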
+ */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + + inode->i_ino = ip->i_ino; + inode->i_state = I_NEW; + + inode_sb_list_add(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); + + inode->i_mode = ip->i_d.di_mode; + inode->i_nlink = ip->i_d.di_nlink; + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + + unlock_new_inode(inode); +} diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h new file mode 100644 index 00000000..ef41c92c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +struct xfs_inode; + +extern const struct file_operations xfs_file_operations; +extern const struct file_operations xfs_dir_file_operations; + +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); + +extern void xfs_setup_inode(struct xfs_inode *); + +#endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h new file mode 100644 index 00000000..87315168 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include + +/* + * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. + * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. + */ +#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) +# define XFS_BIG_BLKNOS 1 +# define XFS_BIG_INUMS 1 +#else +# define XFS_BIG_BLKNOS 0 +# define XFS_BIG_INUMS 0 +#endif + +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Feature macros (disable/enable) + */ +#ifdef CONFIG_SMP +#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ +#else +#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ +#endif + +#define irix_sgid_inherit xfs_params.sgid_inherit.val +#define irix_symlink_mode xfs_params.symlink_mode.val +#define xfs_panic_mask xfs_params.panic_mask.val +#define xfs_error_level xfs_params.error_level.val +#define xfs_syncd_centisecs xfs_params.syncd_timer.val +#define xfs_stats_clear xfs_params.stats_clear.val +#define xfs_inherit_sync xfs_params.inherit_sync.val +#define xfs_inherit_nodump xfs_params.inherit_nodump.val +#define xfs_inherit_noatime xfs_params.inherit_noatim.val +#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val +#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val +#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val +#define xfs_rotorstep xfs_params.rotorstep.val +#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val + +#define current_cpu() (raw_smp_processor_id()) +#define current_pid() (current->pid) +#define current_test_flags(f) (current->flags & (f)) +#define current_set_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags |= (f)) +#define current_clear_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags &= ~(f)) +#define current_restore_flags_nested(sp, f) \ + (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) + +#define spinlock_destroy(lock) + +#define NBBY 8 /* number of bits per byte */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. 
+ */ +#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT +#define BLKDEV_IOSIZE (1<> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + *(__u64 *)a = c; + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} +#else +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + mod = do_div(*(__u64 *)a, b); + return mod; + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + __u64 c = *(__u64 *)a; + return do_div(c, b); + } + } + + /* NOTREACHED */ + return 0; +} +#endif + +#undef do_div +#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) +#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) + +static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return(x * y); +} + +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack +#endif + +#define ASSERT_ALWAYS(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef DEBUG +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#else /* DEBUG */ + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC noinline +#endif + +#endif /* DEBUG */ + +#endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c new file mode 100644 index 00000000..bd672def --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" + +/* + * XFS logging functions + */ +static void +__xfs_printk( + const char *level, + const struct xfs_mount *mp, + struct va_format *vaf) +{ + if (mp && mp->m_fsname) { + printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + return; + } + printk("%sXFS: %pV\n", level, vaf); +} + +#define define_xfs_printk_level(func, kern_level) \ +void func(const struct xfs_mount *mp, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __xfs_printk(kern_level, mp, &vaf); \ + va_end(args); \ +} \ + +define_xfs_printk_level(xfs_emerg, KERN_EMERG); +define_xfs_printk_level(xfs_alert, KERN_ALERT); +define_xfs_printk_level(xfs_crit, KERN_CRIT); +define_xfs_printk_level(xfs_err, KERN_ERR); +define_xfs_printk_level(xfs_warn, KERN_WARNING); +define_xfs_printk_level(xfs_notice, KERN_NOTICE); +define_xfs_printk_level(xfs_info, KERN_INFO); +#ifdef DEBUG +define_xfs_printk_level(xfs_debug, KERN_DEBUG); +#endif + +void +xfs_alert_tag( + const struct xfs_mount *mp, + int panic_tag, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; + + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + xfs_alert(mp, "Transforming an alert into a BUG."); + do_panic = 1; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); + + BUG_ON(do_panic); +} + +void +assfail(char *expr, char *file, int line) +{ + xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +void +xfs_hex_dump(void *p, int length) +{ + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); +} diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h new file mode 100644 index 00000000..7fb7ea00 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.h @@ -0,0 +1,39 @@ +#ifndef __XFS_MESSAGE_H +#define __XFS_MESSAGE_H 1 + +struct xfs_mount; + +extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); +extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +#ifdef DEBUG +extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 
+ __attribute__ ((format (printf, 2, 3))); +#else +static inline void +__attribute__ ((format (printf, 2, 3))) +xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} +#endif + +extern void assfail(char *expr, char *f, int l); + +extern void xfs_hex_dump(void *p, int length); + +#endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c new file mode 100644 index 00000000..29b9d642 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "quota/xfs_qm.h" +#include + + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +STATIC int +xfs_fs_get_xstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstat(mp, fqs); +} + +STATIC int +xfs_fs_set_xstate( + struct super_block *sb, + unsigned int uflags, + int op) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + if (uflags & FS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & FS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & FS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & FS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) + flags |= XFS_OQUOTA_ENFD; + + switch (op) { + case Q_XQUOTAON: + return -xfs_qm_scall_quotaon(mp, flags); + case Q_XQUOTAOFF: + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_quotaoff(mp, flags); + case Q_XQUOTARM: + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_trunc_qfiles(mp, flags); + } + + return -EINVAL; +} + +STATIC int +xfs_fs_get_dqblk( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); +} + +STATIC int +xfs_fs_set_dqblk( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); +} + 
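The quota handlers above follow the convention of returning XFS's positive error numbers negated (e.g. "return -xfs_qm_scall_getqstat(...)"), which is what the generic VFS quota layer expects before it propagates errno to user space. As a rough illustration of how this surface is reached, the sketch below issues Q_XGETQSTAT through quotactl(2), which the generic quota code dispatches to the ->get_xstate method registered in the table that follows; the device path is hypothetical and the example assumes an XFS filesystem with quota support built in — it is not part of this patch.

	#include <stdio.h>
	#include <sys/quota.h>
	#include <linux/dqblk_xfs.h>

	/*
	 * Minimal user-space sketch: query XFS quota state on a
	 * hypothetical device.  QCMD(Q_XGETQSTAT, USRQUOTA) is routed by
	 * the VFS to sb->s_qcop->get_xstate, i.e. xfs_fs_get_xstate()
	 * above; if quota accounting is not running, the -ENOSYS returned
	 * there shows up here as errno == ENOSYS.
	 */
	int main(void)
	{
		struct fs_quota_stat qstat;

		if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), "/dev/sdb1", 0,
			     (void *)&qstat) != 0) {
			perror("Q_XGETQSTAT");
			return 1;
		}
		printf("qs_version=%d qs_flags=0x%x\n",
		       qstat.qs_version, qstat.qs_flags);
		return 0;
	}
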
+const struct quotactl_ops xfs_quotactl_operations = { + .get_xstate = xfs_fs_get_xstate, + .set_xstate = xfs_fs_set_xstate, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, +}; diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c new file mode 100644 index 00000000..76fdc586 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +static int xfs_stat_proc_show(struct seq_file *m, void *v) +{ + int c, i, j, val; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static const struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, + }; + + /* Loop over all stats groups */ + for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { + seq_printf(m, "%s", xstats[i].desc); + /* inner loop does each group */ + while (j < xstats[i].endpoint) { + val = 0; + /* sum over all cpus */ + for_each_possible_cpu(c) + val += *(((__u32*)&per_cpu(xfsstats, c) + j)); + seq_printf(m, " %u", val); + j++; + } + seq_putc(m, '\n'); + } + /* extra precision counters */ + for_each_possible_cpu(i) { + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + seq_printf(m, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + seq_printf(m, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + return 0; +} + +static int xfs_stat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfs_stat_proc_show, NULL); +} + +static const struct file_operations xfs_stat_proc_fops = { + .owner = THIS_MODULE, + .open = xfs_stat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + goto out; + + if (!proc_create("fs/xfs/stat", 0, NULL, + &xfs_stat_proc_fops)) + goto out_remove_entry; + return 0; + + out_remove_entry: + 
remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; +} + +void +xfs_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h new file mode 100644 index 00000000..736854b1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +#include + +/* + * XFS global statistics + */ +struct xfsstats { +# define XFSSTAT_END_EXTENT_ALLOC 4 + __uint32_t xs_allocx; + __uint32_t xs_allocb; + __uint32_t xs_freex; + __uint32_t xs_freeb; +# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) + __uint32_t xs_abt_lookup; + __uint32_t xs_abt_compare; + __uint32_t xs_abt_insrec; + __uint32_t xs_abt_delrec; +# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) + __uint32_t xs_blk_mapr; + __uint32_t xs_blk_mapw; + __uint32_t xs_blk_unmap; + __uint32_t xs_add_exlist; + __uint32_t xs_del_exlist; + __uint32_t xs_look_exlist; + __uint32_t xs_cmp_exlist; +# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) + __uint32_t xs_bmbt_lookup; + __uint32_t xs_bmbt_compare; + __uint32_t xs_bmbt_insrec; + __uint32_t xs_bmbt_delrec; +# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) + __uint32_t xs_dir_lookup; + __uint32_t xs_dir_create; + __uint32_t xs_dir_remove; + __uint32_t xs_dir_getdents; +# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) + __uint32_t xs_trans_sync; + __uint32_t xs_trans_async; + __uint32_t xs_trans_empty; +# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) + __uint32_t xs_ig_attempts; + __uint32_t xs_ig_found; + __uint32_t xs_ig_frecycle; + __uint32_t xs_ig_missed; + __uint32_t xs_ig_dup; + __uint32_t xs_ig_reclaims; + __uint32_t xs_ig_attrchg; +# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) + __uint32_t xs_log_writes; + __uint32_t xs_log_blocks; + __uint32_t xs_log_noiclogs; + __uint32_t xs_log_force; + __uint32_t xs_log_force_sleep; +# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) + __uint32_t xs_try_logspace; + __uint32_t xs_sleep_logspace; + __uint32_t xs_push_ail; + __uint32_t xs_push_ail_success; + __uint32_t xs_push_ail_pushbuf; + __uint32_t xs_push_ail_pinned; + __uint32_t xs_push_ail_locked; + __uint32_t xs_push_ail_flushing; + __uint32_t xs_push_ail_restarts; + __uint32_t xs_push_ail_flush; +# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) + __uint32_t xs_xstrat_quick; + __uint32_t xs_xstrat_split; +# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) + __uint32_t xs_write_calls; + __uint32_t xs_read_calls; +# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) + __uint32_t xs_attr_get; + 
__uint32_t xs_attr_set; + __uint32_t xs_attr_remove; + __uint32_t xs_attr_list; +# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) + __uint32_t xs_iflush_count; + __uint32_t xs_icluster_flushcnt; + __uint32_t xs_icluster_flushinode; +# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) + __uint32_t vn_active; /* # vnodes not on free lists */ + __uint32_t vn_alloc; /* # times vn_alloc called */ + __uint32_t vn_get; /* # times vn_get called */ + __uint32_t vn_hold; /* # times vn_hold called */ + __uint32_t vn_rele; /* # times vn_rele called */ + __uint32_t vn_reclaim; /* # times vn_reclaim called */ + __uint32_t vn_remove; /* # times vn_remove called */ + __uint32_t vn_free; /* # times vn_free called */ +#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) + __uint32_t xb_get; + __uint32_t xb_create; + __uint32_t xb_get_locked; + __uint32_t xb_get_locked_waited; + __uint32_t xb_busy_locked; + __uint32_t xb_miss_locked; + __uint32_t xb_page_retries; + __uint32_t xb_page_found; + __uint32_t xb_get_read; +/* Version 2 btree counters */ +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) + __uint32_t xs_abtb_2_lookup; + __uint32_t xs_abtb_2_compare; + __uint32_t xs_abtb_2_insrec; + __uint32_t xs_abtb_2_delrec; + __uint32_t xs_abtb_2_newroot; + __uint32_t xs_abtb_2_killroot; + __uint32_t xs_abtb_2_increment; + __uint32_t xs_abtb_2_decrement; + __uint32_t xs_abtb_2_lshift; + __uint32_t xs_abtb_2_rshift; + __uint32_t xs_abtb_2_split; + __uint32_t xs_abtb_2_join; + __uint32_t xs_abtb_2_alloc; + __uint32_t xs_abtb_2_free; + __uint32_t xs_abtb_2_moves; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) + __uint32_t xs_abtc_2_lookup; + __uint32_t xs_abtc_2_compare; + __uint32_t xs_abtc_2_insrec; + __uint32_t xs_abtc_2_delrec; + __uint32_t xs_abtc_2_newroot; + __uint32_t xs_abtc_2_killroot; + __uint32_t xs_abtc_2_increment; + __uint32_t xs_abtc_2_decrement; + __uint32_t xs_abtc_2_lshift; + __uint32_t xs_abtc_2_rshift; + __uint32_t xs_abtc_2_split; + __uint32_t xs_abtc_2_join; + __uint32_t xs_abtc_2_alloc; + __uint32_t xs_abtc_2_free; + __uint32_t xs_abtc_2_moves; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) + __uint32_t xs_bmbt_2_lookup; + __uint32_t xs_bmbt_2_compare; + __uint32_t xs_bmbt_2_insrec; + __uint32_t xs_bmbt_2_delrec; + __uint32_t xs_bmbt_2_newroot; + __uint32_t xs_bmbt_2_killroot; + __uint32_t xs_bmbt_2_increment; + __uint32_t xs_bmbt_2_decrement; + __uint32_t xs_bmbt_2_lshift; + __uint32_t xs_bmbt_2_rshift; + __uint32_t xs_bmbt_2_split; + __uint32_t xs_bmbt_2_join; + __uint32_t xs_bmbt_2_alloc; + __uint32_t xs_bmbt_2_free; + __uint32_t xs_bmbt_2_moves; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) + __uint32_t xs_ibt_2_lookup; + __uint32_t xs_ibt_2_compare; + __uint32_t xs_ibt_2_insrec; + __uint32_t xs_ibt_2_delrec; + __uint32_t xs_ibt_2_newroot; + __uint32_t xs_ibt_2_killroot; + __uint32_t xs_ibt_2_increment; + __uint32_t xs_ibt_2_decrement; + __uint32_t xs_ibt_2_lshift; + __uint32_t xs_ibt_2_rshift; + __uint32_t xs_ibt_2_split; + __uint32_t xs_ibt_2_join; + __uint32_t xs_ibt_2_alloc; + __uint32_t xs_ibt_2_free; + __uint32_t xs_ibt_2_moves; +/* Extra precision counters */ + __uint64_t xs_xstrat_bytes; + __uint64_t xs_write_bytes; + __uint64_t xs_read_bytes; +}; + +DECLARE_PER_CPU(struct xfsstats, xfsstats); + +/* + * We don't disable preempt, not too worried about poking the + * wrong CPU's stat for now (also aggregated before reporting). 
+ */ +#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) +#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) +#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) + +extern int xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); + + +#else /* !CONFIG_PROC_FS */ + +# define XFS_STATS_INC(count) +# define XFS_STATS_DEC(count) +# define XFS_STATS_ADD(count, inc) + +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} + +#endif /* !CONFIG_PROC_FS */ + +#endif /* __XFS_STATS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c new file mode 100644 index 00000000..e6ac98c1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -0,0 +1,1720 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_fsops.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_vnodeops.h" +#include "xfs_log_priv.h" +#include "xfs_trans_priv.h" +#include "xfs_filestream.h" +#include "xfs_da_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" +#include "xfs_sync.h" +#include "xfs_trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_ioend_zone; +mempool_t *xfs_ioend_pool; + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ +#define MNTOPT_NOGRPID "nogrpid" /* group-ID from 
current process */ +#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ +#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ +#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ +#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ +#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes + * in stat(). */ +#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ +#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ +#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ +#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ + +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. + */ +enum { + Opt_barrier, Opt_nobarrier, Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_err, NULL} +}; + + +STATIC unsigned long +suffix_strtoul(char *s, char **endp, unsigned int base) +{ + int last, shift_left_factor = 0; + char *value = s; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + return simple_strtoul((const char *)s, endp, base) << shift_left_factor; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + * + * Note that this function leaks the various device name allocations on + * failure. The caller takes care of them. 
+ */ +STATIC int +xfs_parseargs( + struct xfs_mount *mp, + char *options) +{ + struct super_block *sb = mp->m_super; + char *this_char, *value, *eov; + int dsunit = 0; + int dswidth = 0; + int iosize = 0; + __uint8_t iosizelog = 0; + + /* + * set up the mount name first so all the errors will refer to the + * correct device. + */ + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + /* + * Copy binary VFS mount flags we are interested in. + */ + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (sb->s_flags & MS_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + /* + * Set some default flags that could be cleared by the mount option + * parsing. + */ + mp->m_flags |= XFS_MOUNT_BARRIER; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + mp->m_flags |= XFS_MOUNT_DELAYLOG; + + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; + + if (!options) + goto done; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logbufs = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logbsize = suffix_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_logname) + return ENOMEM; + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_rtname) + return ENOMEM; + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + iosize = simple_strtoul(value, &eov, 10); + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + iosize = suffix_strtoul(value, &eov, 10); + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_GRPID) || + !strcmp(this_char, MNTOPT_BSDGROUPS)) { + mp->m_flags |= XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_NOGRPID) || + !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + mp->m_flags &= ~XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + mp->m_flags |= XFS_MOUNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + mp->m_flags |= XFS_MOUNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + mp->m_flags |= XFS_MOUNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + xfs_warn(mp, "%s option 
requires an argument", + this_char); + return EINVAL; + } + dsunit = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; +#if !XFS_BIG_INUMS + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + mp->m_flags |= XFS_MOUNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + mp->m_flags |= XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + mp->m_flags |= XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + mp->m_flags &= ~XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_OQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_OQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_OQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_OQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; + } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + mp->m_flags |= XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, "ihashsize")) { + xfs_warn(mp, + "ihashsize no longer used, option is deprecated."); + } else if (!strcmp(this_char, "osyncisdsync")) { + xfs_warn(mp, + "osyncisdsync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "osyncisosync")) { + xfs_warn(mp, + "osyncisosync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "irixsgid")) { + xfs_warn(mp, + "irixsgid is now a sysctl(2) variable, option is deprecated."); + } else { + 
xfs_warn(mp, "unknown mount option [%s].", this_char); + return EINVAL; + } + } + + /* + * no recovery flag requires a read-only mount + */ + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_DISCARD) && + !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { + xfs_warn(mp, + "the discard option is incompatible with the nodelaylog option"); + return EINVAL; + } + +#ifndef CONFIG_XFS_QUOTA + if (XFS_IS_QUOTA_RUNNING(mp)) { + xfs_warn(mp, "quota support not available in this kernel."); + return EINVAL; + } +#endif + + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { + xfs_warn(mp, "cannot mount with both project and group quota"); + return EINVAL; + } + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + dswidth, dsunit); + return EINVAL; + } + +done: + if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + if (dsunit) { + mp->m_dalign = dsunit; + mp->m_flags |= XFS_MOUNT_RETERR; + } + + if (dswidth) + mp->m_swidth = dswidth; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return XFS_ERROR(EINVAL); + } + + if (iosizelog) { + if (iosizelog > XFS_MAX_IO_LOG || + iosizelog < XFS_MIN_IO_LOG) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = iosizelog; + mp->m_writeio_log = iosizelog; + } + + return 0; +} + +struct proc_xfs_info { + int flag; + char *str; +}; + +STATIC int +xfs_showargs( + struct xfs_mount *mp, + struct seq_file *m) +{ + static struct proc_xfs_info xfs_info_set[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, + { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, + { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, + { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, + { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, + { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, + { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, + { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, + { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, + { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { 0, NULL } + }; + static struct proc_xfs_info xfs_info_unset[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, + { 
XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { 0, NULL } + }; + struct proc_xfs_info *xfs_infop; + + for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { + if (mp->m_flags & xfs_infop->flag) + seq_puts(m, xfs_infop->str); + } + for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { + if (!(mp->m_flags & xfs_infop->flag)) + seq_puts(m, xfs_infop->str); + } + + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + (int)(1 << mp->m_writeio_log) >> 10); + + if (mp->m_logbufs > 0) + seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + if (mp->m_logbsize > 0) + seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + + if (mp->m_logname) + seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + if (mp->m_rtname) + seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + + if (mp->m_dalign > 0) + seq_printf(m, "," MNTOPT_SUNIT "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); + if (mp->m_swidth > 0) + seq_printf(m, "," MNTOPT_SWIDTH "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); + + if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_USRQUOTA); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, "," MNTOPT_UQUOTANOENF); + + /* Either project or group quotas can be active, not both */ + + if (mp->m_qflags & XFS_PQUOTA_ACCT) { + if (mp->m_qflags & XFS_OQUOTA_ENFD) + seq_puts(m, "," MNTOPT_PRJQUOTA); + else + seq_puts(m, "," MNTOPT_PQUOTANOENF); + } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { + if (mp->m_qflags & XFS_OQUOTA_ENFD) + seq_puts(m, "," MNTOPT_GRPQUOTA); + else + seq_puts(m, "," MNTOPT_GQUOTANOENF); + } + + if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) + seq_puts(m, "," MNTOPT_NOQUOTA); + + return 0; +} +__uint64_t +xfs_max_file_offset( + unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_write_begin does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. 
+ */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + ASSERT(sizeof(sector_t) == 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((__uint64_t)pagefactor) << bitshift) - 1; +} + +STATIC int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); + } + + return -error; +} + +STATIC void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void +xfs_blkdev_issue_flush( + xfs_buftarg_t *buftarg) +{ + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); +} + +STATIC void +xfs_close_devices( + struct xfs_mount *mp) +{ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp, mp->m_ddev_targp); +} + +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. 
+ */ + if (mp->m_logname) { + error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + if (error) + goto out; + } + + if (mp->m_rtname) { + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + xfs_warn(mp, + "Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, + mp->m_fsname); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, + mp->m_fsname); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp, mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp, mp->m_ddev_targp); + out_close_rtdev: + if (rtdev) + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; +} + +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) +{ + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + mp->m_sb.sb_blocksize, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; +} + +/* Catch misguided souls that try to use this interface on XFS */ +STATIC struct inode * +xfs_fs_alloc_inode( + struct super_block *sb) +{ + BUG(); + return NULL; +} + +/* + * Now that the generic code is guaranteed not to be accessing + * the linux inode, we can reclaim the inode. + */ +STATIC void +xfs_fs_destroy_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_destroy_inode(ip); + + XFS_STATS_INC(vn_reclaim); + + /* bad inode, get out here ASAP */ + if (is_bad_inode(inode)) + goto out_reclaim; + + xfs_ioend_wait(ip); + + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. + */ +out_reclaim: + xfs_inode_set_reclaim_tag(ip); +} + +/* + * Slab object creation initialisation for the XFS inode. + * This covers only the idempotent fields in the XFS inode; + * all other fields need to be initialised on allocation + * from the slab. 
This avoids the need to repeatedly initialise + * fields in the xfs inode that left in the initialise state + * when freeing the inode. + */ +STATIC void +xfs_fs_inode_init_once( + void *inode) +{ + struct xfs_inode *ip = inode; + + memset(ip, 0, sizeof(struct xfs_inode)); + + /* vfs inode */ + inode_init_once(VFS_I(ip)); + + /* xfs inode */ + atomic_set(&ip->i_iocount, 0); + atomic_set(&ip->i_pincount, 0); + spin_lock_init(&ip->i_flags_lock); + init_waitqueue_head(&ip->i_ipin_wait); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&ip->i_flush); + complete(&ip->i_flush); + + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); +} + +/* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode, + int flags) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + +STATIC int +xfs_fs_write_inode( + struct inode *inode, + struct writeback_control *wbc) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = EAGAIN; + + trace_xfs_write_inode(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { + /* + * Make sure the inode has made it it into the log. Instead + * of forcing it all the way to stable storage using a + * synchronous transaction we let the log force inside the + * ->sync_fs call do that for thus, which reduces the number + * of synchronous log foces dramatically. + */ + xfs_ioend_wait(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); + if (error) + goto out; + return 0; + } else { + if (!ip->i_update_core) + return 0; + + /* + * We make this non-blocking if the inode is contended, return + * EAGAIN to indicate to the caller that they did not succeed. + * This prevents the flush path from blocking on inodes inside + * another operation right now, they get caught later by + * xfs_sync. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + goto out; + + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; + + /* + * Now we have the flush lock and the inode is not pinned, we + * can check if the inode is really clean as we know that + * there are no pending transaction completions, it is not + * waiting on the delayed write queue and there is no IO in + * progress. + */ + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + error = 0; + goto out_unlock; + } + error = xfs_iflush(ip, SYNC_TRYLOCK); + } + + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + out: + /* + * if we failed to write out the inode then mark + * it dirty again so we'll try again later. + */ + if (error) + xfs_mark_inode_dirty_sync(ip); + return -error; +} + +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + xfs_inode_t *ip = XFS_I(inode); + + trace_xfs_evict_inode(ip); + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + XFS_STATS_DEC(vn_active); + + /* + * The iolock is used by the file system to coordinate reads, + * writes, and block truncates. 
Up to this point the lock + * protected concurrent accesses by users of the inode. But + * from here forward we're doing some final processing of the + * inode because we're done with it, and although we reuse the + * iolock for protection it is really a distinct lock class + * (in the lockdep sense) from before. To keep lockdep happy + * (and basically indicate what we are doing), we explicitly + * re-init the iolock here. + */ + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); + + xfs_inactive(ip); +} + +STATIC void +xfs_free_fsname( + struct xfs_mount *mp) +{ + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); +} + +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + /* + * Unregister the memory shrinker before we tear down the mount + * structure so we don't have memory reclaim racing with us here. + */ + xfs_inode_shrinker_unregister(mp); + xfs_syncd_stop(mp); + + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + + XFS_bflush(mp->m_ddev_targp); + + xfs_unmountfs(mp); + xfs_freesb(mp); + xfs_icsb_destroy_counters(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + +STATIC int +xfs_fs_sync_fs( + struct super_block *sb, + int wait) +{ + struct xfs_mount *mp = XFS_M(sb); + int error; + + /* + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. + */ + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; + + if (laptop_mode) { + /* + * The disk must be active because we're syncing. + * We schedule xfssyncd now (now that the disk is + * active) instead of later (when it might not be). + */ + flush_delayed_work_sync(&mp->m_sync_work); + } + + return 0; +} + +STATIC int +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *statp) +{ + struct xfs_mount *mp = XFS_M(dentry->d_sb); + xfs_sb_t *sbp = &mp->m_sb; + struct xfs_inode *ip = XFS_I(dentry->d_inode); + __uint64_t fakeinos, id; + xfs_extlen_t lsize; + __int64_t ffree; + + statp->f_type = XFS_SB_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + id = huge_encode_dev(mp->m_ddev_targp->bt_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + + spin_lock(&mp->m_sb_lock); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + statp->f_bfree = statp->f_bavail = + sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + fakeinos = statp->f_bfree << sbp->sb_inopblog; + statp->f_files = + MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + + spin_unlock(&mp->m_sb_lock); + + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + xfs_qm_statvfs(ip, statp); + return 0; +} + +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC int +xfs_fs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + struct xfs_mount *mp = XFS_M(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + int error; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + xfs_info(mp, + "mount option \"%s\" not supported for remount\n", p); + return -EINVAL; +#else + break; +#endif + } + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. + */ + if (mp->m_update_flags) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_flags = 0; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * After we have synced the data but before we sync the + * metadata, we need to free up the reserve block pool so that + * the used block count in the superblock on disk is correct at + * the end of the remount. Stash the current reserve pool size + * so that if we get remounted rw, we can return it to the same + * size. 
+ */ + + xfs_quiesce_data(mp); + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; +} + +/* + * Second stage of a freeze. The data is already frozen so we only + * need to take care of the metadata. Once that's done write a dummy + * record to dirty the log in case of a crash while frozen. + */ +STATIC int +xfs_fs_freeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + return -xfs_fs_log_dummy(mp); +} + +STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + return 0; +} + +STATIC int +xfs_fs_show_options( + struct seq_file *m, + struct vfsmount *mnt) +{ + return -xfs_showargs(XFS_M(mnt->mnt_sb), m); +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller than the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (mp->m_logbsize <= 0 && + mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (mp->m_logbsize > 0 && + mp->m_logbsize < mp->m_sb.sb_logsunit) { + xfs_warn(mp, + "logbuf size must be greater than or equal to log stripe size"); + return XFS_ERROR(EINVAL); + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { + xfs_warn(mp, + "logbuf size for version 1 logs must be 16K or 32K"); + return XFS_ERROR(EINVAL); + } + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + xfs_warn(mp, + "cannot mount a read-only filesystem as read-write"); + return XFS_ERROR(EROFS); + } + + return 0; +} + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = ENOMEM; + + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + + mp->m_super = sb; + sb->s_fs_info = mp; + + error = xfs_parseargs(mp, (char *)data); + if (error) + goto out_free_fsname; + + sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; + sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA + sb->s_qcop = &xfs_quotactl_operations; +#endif + sb->s_op = &xfs_super_operations; + + if (silent) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp); + if (error) + goto out_free_fsname; + + error = xfs_icsb_init_counters(mp); + if (error) + goto out_close_devices; + + error = xfs_readsb(mp, flags); + if (error) + goto out_destroy_counters; + + error = xfs_finish_flags(mp); + if (error) + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. 
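xfs_finish_flags() above enforces two simple rules on the log buffer size: on version 2 logs it must be at least the log stripe unit (and defaults to it when unset), while version 1 logs only allow 16K/32K buffers. A compact sketch of that check, where the 32K constant stands in for XLOG_BIG_RECORD_BSIZE and the test inputs are assumptions:

/* sketch of the logbuf-vs-log-stripe validation in xfs_finish_flags */
#include <stdio.h>

#define BIG_RECORD_BSIZE	(32 * 1024)

static int check_logbsize(int v2_log, int logbsize, int logsunit)
{
	if (v2_log) {
		if (logbsize <= 0 && logsunit > BIG_RECORD_BSIZE)
			return logsunit;	/* default to the stripe unit */
		if (logbsize > 0 && logbsize < logsunit)
			return -1;		/* smaller than log stripe: reject */
		return logbsize;
	}
	/* version 1 logs only support 16K/32K buffers */
	if (logbsize > BIG_RECORD_BSIZE)
		return -1;
	return logbsize;
}

int main(void)
{
	printf("%d\n", check_logbsize(1, 0, 64 * 1024));	 /* -> 65536 */
	printf("%d\n", check_logbsize(1, 16 * 1024, 64 * 1024)); /* -> -1    */
	printf("%d\n", check_logbsize(0, 64 * 1024, 0));	 /* -> -1    */
	return 0;
}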
+ * For the same reason we must also initialise the syncd and register + * the inode cache shrinker so that inodes can be reclaimed during + * operations like a quotacheck that iterate all inodes in the + * filesystem. + */ + sb->s_magic = XFS_SB_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + xfs_inode_shrinker_register(mp); + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + error = xfs_syncd_init(mp); + if (error) + goto out_unmount; + + root = igrab(VFS_I(mp->m_rootip)); + if (!root) { + error = ENOENT; + goto out_syncd_stop; + } + if (is_bad_inode(root)) { + error = EINVAL; + goto out_syncd_stop; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + error = ENOMEM; + goto out_iput; + } + + return 0; + + out_filestream_unmount: + xfs_inode_shrinker_unregister(mp); + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_destroy_counters: + xfs_icsb_destroy_counters(mp); + out_close_devices: + xfs_close_devices(mp); + out_free_fsname: + xfs_free_fsname(mp); + kfree(mp); + out: + return -error; + + out_iput: + iput(root); + out_syncd_stop: + xfs_syncd_stop(mp); + out_unmount: + xfs_inode_shrinker_unregister(mp); + + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + + XFS_bflush(mp->m_ddev_targp); + + xfs_unmountfs(mp); + goto out_free_sb; +} + +STATIC struct dentry * +xfs_fs_mount( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, + .write_inode = xfs_fs_write_inode, + .evict_inode = xfs_fs_evict_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .remount_fs = xfs_fs_remount, + .show_options = xfs_fs_show_options, +}; + +static struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .mount = xfs_fs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +STATIC int __init +xfs_init_zones(void) +{ + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if (!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); + if (!xfs_dabuf_zone) + goto out_destroy_da_state_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), 
"xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_dabuf_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / + NBWORD) * sizeof(int))), "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_log_item_desc_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + + return 0; + + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_dabuf_zone: + kmem_zone_destroy(xfs_dabuf_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_dabuf_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + +} + +STATIC int __init +xfs_init_workqueues(void) +{ + /* + * max_active is set to 8 to give enough concurency to allow + * multiple work operations on each CPU to run. This allows multiple + * filesystems to be running sync work concurrently, and scales with + * the number of CPUs in the system. 
+ */ + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + if (!xfs_syncd_wq) + return -ENOMEM; + return 0; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_syncd_wq); +} + +STATIC int __init +init_xfs_fs(void) +{ + int error; + + printk(KERN_INFO XFS_VERSION_STRING " with " + XFS_BUILD_OPTIONS " enabled\n"); + + xfs_ioend_init(); + xfs_dir_startup(); + + error = xfs_init_zones(); + if (error) + goto out; + + error = xfs_init_workqueues(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + + error = xfs_filestream_init(); + if (error) + goto out_mru_cache_uninit; + + error = xfs_buf_init(); + if (error) + goto out_filestream_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; + + vfs_initquota(); + + error = register_filesystem(&xfs_fs_type); + if (error) + goto out_sysctl_unregister; + return 0; + + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: + xfs_buf_terminate(); + out_filestream_uninit: + xfs_filestream_uninit(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); + out_destroy_zones: + xfs_destroy_zones(); + out: + return error; +} + +STATIC void __exit +exit_xfs_fs(void) +{ + vfs_exitquota(); + unregister_filesystem(&xfs_fs_type); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); + xfs_buf_terminate(); + xfs_filestream_uninit(); + xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); + xfs_destroy_zones(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h new file mode 100644 index 00000000..50a3266c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
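init_xfs_fs()/exit_xfs_fs() above follow the usual rule that module exit mirrors module init in strict reverse order, with the same goto-unwind on partial failure. A minimal module skeleton showing just that shape; the demo_* names and stub stages are assumptions, not XFS code:

/*
 * Minimal module skeleton: exit is the strict reverse of init, and a
 * failure part-way through init unwinds only the stages already done.
 * The demo_* names and stub stages are assumptions.
 */
#include <linux/module.h>
#include <linux/init.h>

static int  demo_zones_init(void)    { return 0; }	/* stand-in stage 1 */
static void demo_zones_destroy(void) { }
static int  demo_wq_init(void)       { return 0; }	/* stand-in stage 2 */
static void demo_wq_destroy(void)    { }

static int __init demo_init(void)
{
	int error;

	error = demo_zones_init();
	if (error)
		goto out;

	error = demo_wq_init();
	if (error)
		goto out_destroy_zones;

	return 0;

 out_destroy_zones:
	demo_zones_destroy();
 out:
	return error;
}

static void __exit demo_exit(void)
{
	/* tear down in the reverse order of demo_init() */
	demo_wq_destroy();
	demo_zones_destroy();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("init/exit ordering sketch");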
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPER_H__ +#define __XFS_SUPER_H__ + +#include + +#ifdef CONFIG_XFS_QUOTA +extern void xfs_qm_init(void); +extern void xfs_qm_exit(void); +# define vfs_initquota() xfs_qm_init() +# define vfs_exitquota() xfs_qm_exit() +#else +# define vfs_initquota() do { } while (0) +# define vfs_exitquota() do { } while (0) +#endif + +#ifdef CONFIG_XFS_POSIX_ACL +# define XFS_ACL_STRING "ACLs, " +# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define XFS_ACL_STRING +# define set_posix_acl_flag(sb) do { } while (0) +#endif + +#define XFS_SECURITY_STRING "security attributes, " + +#ifdef CONFIG_XFS_RT +# define XFS_REALTIME_STRING "realtime, " +#else +# define XFS_REALTIME_STRING +#endif + +#if XFS_BIG_BLKNOS +# if XFS_BIG_INUMS +# define XFS_BIGFS_STRING "large block/inode numbers, " +# else +# define XFS_BIGFS_STRING "large block numbers, " +# endif +#else +# define XFS_BIGFS_STRING +#endif + +#ifdef DEBUG +# define XFS_DBG_STRING "debug" +#else +# define XFS_DBG_STRING "no debug" +#endif + +#define XFS_VERSION_STRING "SGI XFS" +#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ + XFS_SECURITY_STRING \ + XFS_REALTIME_STRING \ + XFS_BIGFS_STRING \ + XFS_DBG_STRING /* DBG must be last */ + +struct xfs_inode; +struct xfs_mount; +struct xfs_buftarg; +struct block_device; + +extern __uint64_t xfs_max_file_offset(unsigned int); + +extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); + +extern const struct export_operations xfs_export_operations; +extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct quotactl_ops xfs_quotactl_operations; + +#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) + +#endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c new file mode 100644 index 00000000..2f277a04 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -0,0 +1,1132 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" + +#include +#include + +struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. 
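Two header idioms carry most of xfs_super.h above: config-gated helpers that compile away to do { } while (0) no-ops, and a build-options banner assembled from adjacent string literals. A tiny userspace sketch of both, with invented names and a fake config symbol:

/* sketch of the conditional no-op macro and string-banner idioms; names assumed */
#include <stdio.h>

#define CONFIG_DEMO_ACL 1

#ifdef CONFIG_DEMO_ACL
# define DEMO_ACL_STRING	"ACLs, "
# define set_acl_flag(flags)	((flags) |= 0x1)
#else
# define DEMO_ACL_STRING
# define set_acl_flag(flags)	do { } while (0)	/* compiles away entirely */
#endif

#ifdef DEBUG
# define DEMO_DBG_STRING	"debug"
#else
# define DEMO_DBG_STRING	"no debug"
#endif

/* adjacent literals concatenate at compile time, like XFS_BUILD_OPTIONS */
#define DEMO_BUILD_OPTIONS	DEMO_ACL_STRING DEMO_DBG_STRING

int main(void)
{
	unsigned int flags = 0;

	set_acl_flag(flags);
	printf("demo fs with " DEMO_BUILD_OPTIONS " enabled (flags=%#x)\n", flags);
	return 0;
}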
The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + if (is_bad_inode(inode)) { + IRELE(ip); + return ENOENT; + } + + /* inode is valid */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], pag, flags); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. 
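The walk above pulls inodes out of the per-AG radix tree in small batches: look up a batch under the RCU read lock, take references and advance the cursor, drop the lock, and only then do the real work. A userspace sketch of that shape, where a plain array and a mutex stand in for the radix tree and RCU (all names and sizes are assumptions):

/* sketch of the batched lookup-then-process loop in xfs_inode_ag_walk */
#include <pthread.h>
#include <stdio.h>

#define NITEMS		10
#define LOOKUP_BATCH	4

static int items[NITEMS] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;

static void walk(void (*execute)(int item))
{
	unsigned int first_index = 0;
	int batch[LOOKUP_BATCH];
	int nr_found;

	do {
		int i;

		pthread_mutex_lock(&lookup_lock);	/* rcu_read_lock() in XFS */
		nr_found = 0;
		while (nr_found < LOOKUP_BATCH &&
		       first_index + nr_found < NITEMS) {
			batch[nr_found] = items[first_index + nr_found];
			nr_found++;
		}
		first_index += nr_found;		/* cursor for the next pass */
		pthread_mutex_unlock(&lookup_lock);

		for (i = 0; i < nr_found; i++)		/* do the work unlocked */
			execute(batch[i]);
	} while (nr_found);
}

static void print_item(int item) { printf("%d\n", item); }

int main(void) { walk(print_item); return 0; }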
*/ + if (error == EFSCORRUPTED) + break; + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +STATIC int +xfs_sync_inode_data( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct inode *inode = VFS_I(ip); + struct address_space *mapping = inode->i_mapping; + int error = 0; + + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_wait; + + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (flags & SYNC_TRYLOCK) + goto out_wait; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } + + error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? + 0 : XBF_ASYNC, FI_NONE); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + out_wait: + if (flags & SYNC_WAIT) + xfs_ioend_wait(ip); + return error; +} + +STATIC int +xfs_sync_inode_attr( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_inode_clean(ip)) + goto out_unlock; + if (!xfs_iflock_nowait(ip)) { + if (!(flags & SYNC_WAIT)) + goto out_unlock; + xfs_iflock(ip); + } + + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + goto out_unlock; + } + + error = xfs_iflush(ip, flags); + + /* + * We don't want to try again on non-blocking flushes that can't run + * again immediately. If an inode really must be written, then that's + * what the SYNC_WAIT flag is for. + */ + if (error == EAGAIN) { + ASSERT(!(flags & SYNC_WAIT)); + error = 0; + } + + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Write out pagecache data for the whole filesystem. + */ +STATIC int +xfs_sync_data( + struct xfs_mount *mp, + int flags) +{ + int error; + + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + + error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); + if (error) + return XFS_ERROR(error); + + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); + return 0; +} + +/* + * Write out inode metadata (attributes) for the whole filesystem. + */ +STATIC int +xfs_sync_attr( + struct xfs_mount *mp, + int flags) +{ + ASSERT((flags & ~SYNC_WAIT) == 0); + + return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); +} + +STATIC int +xfs_sync_fsdata( + struct xfs_mount *mp) +{ + struct xfs_buf *bp; + + /* + * If the buffer is pinned then push on the log so we won't get stuck + * waiting in the write for someone, maybe ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have the + * superblock buffer locked at that point so it can become pinned in + * between there and here. 
+ */ + bp = xfs_getsb(mp, 0); + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + + return xfs_bwrite(mp, bp); +} + +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + +/* + * When remounting a filesystem read-only or freezing the filesystem, we have + * two phases to execute. This first phase is syncing the data before we + * quiesce the filesystem, and the second is flushing all the inodes out after + * we've waited for all the transactions created by the first phase to + * complete. The second phase ensures that the inodes are written to their + * location on disk rather than just existing in transactions in the log. This + * means after a quiesce there is no log replay required to write the inodes to + * disk (this is the main difference between a sync and a quiesce). + */ +/* + * First stage of freeze - no writers will make progress now we are here, + * so we flush delwri and delalloc buffers here, then wait for all I/O to + * complete. Data is frozen at that point. Metadata is not frozen, + * transactions can still occur here so don't bother flushing the buftarg + * because it'll just get dirty again. + */ +int +xfs_quiesce_data( + struct xfs_mount *mp) +{ + int error, error2 = 0; + + /* push non-blocking */ + xfs_sync_data(mp, 0); + xfs_qm_sync(mp, SYNC_TRYLOCK); + + /* push and block till complete */ + xfs_sync_data(mp, SYNC_WAIT); + + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + + xfs_qm_sync(mp, SYNC_WAIT); + + /* write superblock and hoover up shutdown errors */ + error = xfs_sync_fsdata(mp); + + /* make sure all delwri buffers are written out */ + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + /* mark the log as covered if needed */ + if (xfs_log_need_covered(mp)) + error2 = xfs_fs_log_dummy(mp); + + /* flush data-only devices */ + if (mp->m_rtdev_targp) + XFS_bflush(mp->m_rtdev_targp); + + return error ? error : error2; +} + +STATIC void +xfs_quiesce_fs( + struct xfs_mount *mp) +{ + int count = 0, pincount; + + xfs_reclaim_inodes(mp, 0); + xfs_flush_buftarg(mp->m_ddev_targp, 0); + + /* + * This loop must run at least twice. The first instance of the loop + * will flush most meta data but that will generate more meta data + * (typically directory updates). Which then must be flushed and + * logged before we can write the unmount record. We also so sync + * reclaim of inodes to catch any that the above delwri flush skipped. + */ + do { + xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_sync_attr(mp, SYNC_WAIT); + pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (!pincount) { + delay(50); + count++; + } + } while (count < 2); +} + +/* + * Second stage of a quiesce. 
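xfs_quiesce_fs() above flushes metadata in a loop because flushing generates more metadata (directory updates), and a pass only counts when nothing was left pinned; two such clean passes are required before the unmount record can be written. A small userspace sketch of that termination rule; flush_metadata() and its countdown are stand-ins for the real flush:

/* sketch of the "flush until it stays clean twice" loop in xfs_quiesce_fs */
#include <stdio.h>
#include <unistd.h>

static int dirty = 3;	/* pretend three rounds of metadata remain (assumed) */

static int flush_metadata(void)	/* returns the number of still-pinned buffers */
{
	if (dirty > 0)
		dirty--;
	return dirty;
}

int main(void)
{
	int count = 0, pincount;

	do {
		pincount = flush_metadata();
		if (!pincount) {
			usleep(50 * 1000);	/* delay(50) in the kernel code */
			count++;
		}
	} while (count < 2);

	printf("quiesced after the metadata stayed clean twice\n");
	return 0;
}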
The data is already synced, now we have to take + * care of the metadata. New transactions are already blocked, so we need to + * wait for any remaining transactions to drain out before proceeding. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* flush inodes and push all remaining buffers out to disk */ + xfs_quiesce_fs(mp); + + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); +} + +static void +xfs_syncd_queue_sync( + struct xfs_mount *mp) +{ + queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items, reclaim inodes and sync + * disk quotas. We might need to cover the log to indicate that the + * filesystem is idle and not frozen. + */ +STATIC void +xfs_sync_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_sync_work); + int error; + + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + } + + /* queue us up again */ + xfs_syncd_queue_sync(mp); +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs syncd work default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_syncd_queue_reclaim( + struct xfs_mount *mp) +{ + + /* + * We can have inodes enter reclaim after we've shut down the syncd + * workqueue during unmount, so don't allow reclaim work to be queued + * during unmount. + */ + if (!(mp->m_super->s_flags & MS_ACTIVE)) + return; + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +STATIC void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_syncd_queue_reclaim(mp); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. 
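The periodic sync above is driven by a delayed work item that re-queues itself at the end of every pass, so the cadence survives without a dedicated kernel thread. A sketch of that hookup against the same workqueue calls the patch uses; struct demo_mount, its names and the 30 second interval are assumptions:

/* sketch of a self-rescheduling delayed work item, as in xfs_sync_worker */
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/errno.h>

struct demo_mount {
	struct workqueue_struct	*wq;
	struct delayed_work	sync_work;
};

static void demo_queue_sync(struct demo_mount *dm)
{
	queue_delayed_work(dm->wq, &dm->sync_work, msecs_to_jiffies(30000));
}

static void demo_sync_worker(struct work_struct *work)
{
	struct demo_mount *dm = container_of(to_delayed_work(work),
					     struct demo_mount, sync_work);

	/* ... push the log, sync quotas, etc. would go here ... */

	demo_queue_sync(dm);		/* queue the next periodic pass */
}

static int demo_syncd_init(struct demo_mount *dm)
{
	dm->wq = alloc_workqueue("demo_syncd", WQ_CPU_INTENSIVE, 8);
	if (!dm->wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&dm->sync_work, demo_sync_worker);
	demo_queue_sync(dm);
	return 0;
}

static void demo_syncd_stop(struct demo_mount *dm)
{
	cancel_delayed_work_sync(&dm->sync_work);
	destroy_workqueue(dm->wq);
}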
At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room. + * + * Queue a new data flush if there isn't one already in progress and + * wait for completion of the flush. This means that we only ever have one + * inode flush in progress no matter how many ENOSPC events are occurring and + * so will prevent the system from bogging down due to every concurrent + * ENOSPC event scanning all the active inodes in the system for writeback. + */ +void +xfs_flush_inodes( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + queue_work(xfs_syncd_wq, &mp->m_flush_work); + flush_work_sync(&mp->m_flush_work); +} + +STATIC void +xfs_flush_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, + struct xfs_mount, m_flush_work); + + xfs_sync_data(mp, SYNC_TRYLOCK); + xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); +} + +int +xfs_syncd_init( + struct xfs_mount *mp) +{ + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); + xfs_syncd_queue_reclaim(mp); + + return 0; +} + +void +xfs_syncd_stop( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); +} + +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_syncd_queue_reclaim(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. 
+ */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * do some unlocked checks first to avoid unnecessary lock traffic. + * The first is a flush lock check, the second is a already in reclaim + * check. Only do these checks if we are not going to block on locks. + */ + if ((flags & SYNC_TRYLOCK) && + (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + return 1; + } + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently, and the return + * value of xfs_iflush is not sufficient to get this right. The following table + * lists the inode states and the reclaim actions necessary for non-blocking + * reclaim: + * + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, delwri ok 0 requeue + * dirty, delwri blocked EAGAIN requeue + * dirty, sync flush 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * As can be seen from the table, the return value of xfs_iflush() is not + * sufficient to correctly decide the reclaim action here. The checks in + * xfs_iflush() might look like duplicates, but they are not. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. The clean inode check needs to be done before flushing + * the inode delwri otherwise we would loop forever requeuing clean inodes as + * we cannot tell apart a successful delwri flush and a clean inode from the + * return value of xfs_iflush(). + * + * Note that because the inode is flushed delayed write by background + * writeback, the flush lock may already be held here and waiting on it can + * result in very long latencies. Hence for sync reclaims, where we wait on the + * flush lock, the caller should push out delayed write inodes first before + * trying to reclaim them to minimise the amount of time spent waiting. For + * background relaim, we just requeue the inode for the next pass. 
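The state table above is easier to follow as straight-line checks in the order the code applies them. A sketch of that decision as a plain function; the flag structure and the action enum are assumptions made only to mirror the ordering, not real XFS types:

/* sketch of the reclaim decision table; types and names are assumed */
#include <stdio.h>

enum action { RECLAIM, REQUEUE, UNPIN_AND_RECLAIM, FLUSH };

struct istate {
	int bad, shutdown, pinned, stale, clean, sync_wait;
};

static enum action reclaim_action(const struct istate *st)
{
	if (st->bad)
		return RECLAIM;
	if (st->shutdown)
		return UNPIN_AND_RECLAIM;
	if (st->pinned && !st->sync_wait)
		return REQUEUE;		/* non-blocking pass: try again later */
	if (st->stale || st->clean)
		return RECLAIM;
	return FLUSH;			/* dirty: flush, then reclaim or requeue */
}

int main(void)
{
	struct istate dirty_background = { .pinned = 1, .sync_wait = 0 };

	printf("%d\n", reclaim_action(&dirty_background));	/* REQUEUE */
	return 0;
}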
+ * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, delwri => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, delwri => flush and requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + + /* + * If we only have a single dirty inode in a cluster there is + * a fair chance that the AIL push may have pushed it into + * the buffer, but xfsbufd won't touch it until 30 seconds + * from now, and thus we will lock up here. + * + * Promote the inode buffer to the front of the delwri list + * and wake up xfsbufd now. + */ + xfs_promote_inode(ip); + xfs_iflock(ip); + } + + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) { + xfs_ifunlock(ip); + goto out; + } + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Now we have an inode that needs flushing. + * + * We do a nonblocking flush here even if we are doing a SYNC_WAIT + * reclaim as we can deadlock with inode cluster removal. + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer will result + * in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, + * just unlock the inode, back off and try again. Hopefully the next + * pass through will see the stale flag set on the inode. + */ + error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); + if (sync_mode & SYNC_WAIT) { + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + xfs_iflock(ip); + goto reclaim; + } + + /* + * When we have to flush an inode but don't have SYNC_WAIT set, we + * flush the inode out using a delwri buffer and wait for the next + * call into reclaim to find it in a clean state instead of waiting for + * it now. We also don't return errors here - if the error is transient + * then the next reclaim pass will flush the inode, and if the error + * is permanent then the next sync reclaim will reclaim the inode and + * pass on the error. + */ + if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_warn(ip->i_mount, + "inode 0x%llx background reclaim flush failed with %d", + (long long)ip->i_ino, error); + } +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. 
+ */ + return 0; + +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. We get + * both the ilock and the iolock because the code may need to drop the + * ilock one but will still hold the iolock. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_inode_free(ip); + return error; + +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. 
*/ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (trylock && skipped && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return XFS_ERROR(last_error); +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Inode cache shrinker. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doiing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. + */ +static int +xfs_reclaim_inode_shrink( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp; + struct xfs_perag *pag; + xfs_agnumber_t ag; + int reclaimable; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); + if (nr_to_scan) { + /* kick background reclaimer and push the AIL */ + xfs_syncd_queue_reclaim(mp); + xfs_ail_push_all(mp->m_ail); + + if (!(gfp_mask & __GFP_FS)) + return -1; + + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, + &nr_to_scan); + /* terminate if we don't exhaust the scan */ + if (nr_to_scan > 0) + return -1; + } + + reclaimable = 0; + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +void +xfs_inode_shrinker_register( + struct xfs_mount *mp) +{ + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); +} + +void +xfs_inode_shrinker_unregister( + struct xfs_mount *mp) +{ + unregister_shrinker(&mp->m_inode_shrink); +} diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h new file mode 100644 index 00000000..ef5b2ce4 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +typedef struct xfs_sync_work { + struct list_head w_list; + struct xfs_mount *w_mount; + void *w_data; /* syncer routine argument */ + void (*w_syncer)(struct xfs_mount *, void *); + struct completion *w_completion; +} xfs_sync_work_t; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + +int xfs_syncd_init(struct xfs_mount *mp); +void xfs_syncd_stop(struct xfs_mount *mp); + +int xfs_quiesce_data(struct xfs_mount *mp); +void xfs_quiesce_attr(struct xfs_mount *mp); + +void xfs_flush_inodes(struct xfs_inode *ip); + +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); +void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, + struct xfs_inode *ip); + +int xfs_sync_inode_grab(struct xfs_inode *ip); +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), + int flags); + +void xfs_inode_shrinker_register(struct xfs_mount *mp); +void xfs_inode_shrinker_unregister(struct xfs_mount *mp); + +#endif diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c new file mode 100644 index 00000000..ee2d2ada --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include "xfs_error.h" + +static struct ctl_table_header *xfs_table_header; + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + + if (!ret && write && *valp) { + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! 
*/ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static ctl_table xfs_table[] = { + { + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .procname = "panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_panic_mask_proc_handler, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + + { + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .procname = "xfsbufd_centisecs", + .data = &xfs_params.xfs_buf_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.xfs_buf_timer.min, + .extra2 = &xfs_params.xfs_buf_timer.max + }, + { + .procname = "age_buffer_centisecs", + .data = &xfs_params.xfs_buf_age.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.xfs_buf_age.min, + .extra2 = &xfs_params.xfs_buf_age.max + }, + { + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + 
.extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .procname = "inherit_nodefrag", + .data = &xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, + { + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + { + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_stats_clear_proc_handler, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, +#endif /* CONFIG_PROC_FS */ + + {} +}; + +static ctl_table xfs_dir_table[] = { + { + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} +}; + +static ctl_table xfs_root_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} +}; + +int +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; +} + +void +xfs_sysctl_unregister(void) +{ + unregister_sysctl_table(xfs_table_header); +} diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h new file mode 100644 index 00000000..b9937d45 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include + +/* + * Tunable xfs parameters + */ + +typedef struct xfs_sysctl_val { + int min; + int val; + int max; +} xfs_sysctl_val_t; + +typedef struct xfs_param { + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is + * not a member of parent dir GID. */ + xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ + xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ + xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ + xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ + xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ + xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ + xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ + xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ + xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ + xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ + xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. 
*/ + xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ + xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ +} xfs_param_t; + +/* + * xfs_error_level: + * + * How much error reporting will be done when internal problems are + * encountered. These problems normally return an EFSCORRUPTED to their + * caller, with no other information reported. + * + * 0 No error reports + * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown + * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any + * additional errors that are known to not cause shutdowns) + * + * xfs_panic_mask bit 0x8 turns the error reports into panics + */ + +enum { + /* XFS_REFCACHE_SIZE = 1 */ + /* XFS_REFCACHE_PURGE = 2 */ + /* XFS_RESTRICT_CHOWN = 3 */ + XFS_SGID_INHERIT = 4, + XFS_SYMLINK_MODE = 5, + XFS_PANIC_MASK = 6, + XFS_ERRLEVEL = 7, + XFS_SYNCD_TIMER = 8, + /* XFS_PROBE_DMAPI = 9 */ + /* XFS_PROBE_IOOPS = 10 */ + /* XFS_PROBE_QUOTA = 11 */ + XFS_STATS_CLEAR = 12, + XFS_INHERIT_SYNC = 13, + XFS_INHERIT_NODUMP = 14, + XFS_INHERIT_NOATIME = 15, + XFS_BUF_TIMER = 16, + XFS_BUF_AGE = 17, + /* XFS_IO_BYPASS = 18 */ + XFS_INHERIT_NOSYM = 19, + XFS_ROTORSTEP = 20, + XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, +}; + +extern xfs_param_t xfs_params; + +#ifdef CONFIG_SYSCTL +extern int xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +# define xfs_sysctl_register() (0) +# define xfs_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +#endif /* __XFS_SYSCTL_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c new file mode 100644 index 00000000..88d25d4a --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_mount.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_quota.h" +#include "xfs_iomap.h" +#include "xfs_aops.h" +#include "quota/xfs_dquot_item.h" +#include "quota/xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. 
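Each tunable in xfs_param_t is an xfs_sysctl_val_t triple whose min and max members are wired into the extra1/extra2 pointers of the table entries above, so proc_dointvec_minmax() rejects out-of-range writes while val holds the live setting. A self-contained sketch of that contract (the limits used here are illustrative, not the kernel's defaults):

    #include <stdio.h>

    typedef struct demo_sysctl_val {
        int min;
        int val;
        int max;
    } demo_sysctl_val_t;

    /* Mirror of the proc_dointvec_minmax contract: accept a write only if
     * the new value lies within [min, max]; otherwise report an error and
     * leave the current value untouched. */
    static int demo_set(demo_sysctl_val_t *v, int new_val)
    {
        if (new_val < v->min || new_val > v->max)
            return -1;      /* the kernel handler returns -EINVAL here */
        v->val = new_val;
        return 0;
    }

    int main(void)
    {
        demo_sysctl_val_t panic_mask = { .min = 0, .val = 0, .max = 255 };

        printf("set 8   -> %d (val=%d)\n", demo_set(&panic_mask, 8), panic_mask.val);
        printf("set 999 -> %d (val=%d)\n", demo_set(&panic_mask, 999), panic_mask.val);
        return 0;
    }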
+ */ +#define CREATE_TRACE_POINTS +#include "xfs_trace.h" diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h new file mode 100644 index 00000000..d48b7a57 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -0,0 +1,1776 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs + +#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_H + +#include + +struct xfs_agf; +struct xfs_alloc_arg; +struct xfs_attr_list_context; +struct xfs_buf_log_item; +struct xfs_da_args; +struct xfs_da_node_entry; +struct xfs_dquot; +struct xlog_ticket; +struct log; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; + +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + +#define DEFINE_ATTR_LIST_EVENT(name) \ +DEFINE_EVENT(xfs_attr_list_class, name, \ + TP_PROTO(struct xfs_attr_list_context *ctx), \ + TP_ARGS(ctx)) +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); + +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount 
= refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ +DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); + +TRACE_EVENT(xfs_attr_list_node_descend, + TP_PROTO(struct xfs_attr_list_context *ctx, + struct xfs_da_node_entry *btree), + TP_ARGS(ctx, btree), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + __field(u32, bt_hashval) + __field(u32, bt_before) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + __entry->bt_hashval = be32_to_cpu(btree->hashval); + __entry->bt_before = be32_to_cpu(btree->before); + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s " + "node hashval %u, node before %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __entry->bt_hashval, + __entry->bt_before) +); + +TRACE_EVENT(xfs_iext_insert, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, + struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), + TP_ARGS(ip, idx, r, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r->br_startoff; + __entry->startblock = r->br_startblock; + __entry->blockcount = r->br_blockcount; + __entry->state = r->br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, 
dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + +#define DEFINE_BMAP_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + unsigned long caller_ip), \ + TP_ARGS(ip, idx, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_remove); +DEFINE_BMAP_EVENT(xfs_bmap_pre_update); +DEFINE_BMAP_EVENT(xfs_bmap_post_update); +DEFINE_BMAP_EVENT(xfs_extlist); + +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) +DEFINE_BUF_EVENT(xfs_buf_init); +DEFINE_BUF_EVENT(xfs_buf_free); +DEFINE_BUF_EVENT(xfs_buf_hold); +DEFINE_BUF_EVENT(xfs_buf_rele); +DEFINE_BUF_EVENT(xfs_buf_iodone); +DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_bawrite); +DEFINE_BUF_EVENT(xfs_buf_bdwrite); +DEFINE_BUF_EVENT(xfs_buf_lock); +DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_cond_lock); +DEFINE_BUF_EVENT(xfs_buf_unlock); +DEFINE_BUF_EVENT(xfs_buf_iowait); +DEFINE_BUF_EVENT(xfs_buf_iowait_done); +DEFINE_BUF_EVENT(xfs_buf_delwri_queue); +DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); +DEFINE_BUF_EVENT(xfs_bdstrat_shut); +DEFINE_BUF_EVENT(xfs_buf_item_relse); +DEFINE_BUF_EVENT(xfs_buf_item_iodone); +DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); 
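The TP_printk() strings above decode flag words with __print_flags(), which turns a bitmask into a "FLAG1|FLAG2" string from a { mask, "name" } table when the trace buffer is read. A user-space approximation of that decoding, with a made-up table standing in for XFS_BUF_FLAGS:

    #include <stdio.h>

    struct flag_name {
        unsigned int mask;
        const char *name;
    };

    /* Stand-in for a table like XFS_BUF_FLAGS: { mask, "NAME" } pairs. */
    static const struct flag_name demo_flags[] = {
        { 0x1, "READ" },
        { 0x2, "WRITE" },
        { 0x4, "ASYNC" },
        { 0 },
    };

    /* Roughly what __print_flags(value, "|", table) produces. */
    static void print_flags(unsigned int value, const char *delim)
    {
        int printed = 0;

        for (const struct flag_name *f = demo_flags; f->mask; f++) {
            if (value & f->mask) {
                printf("%s%s", printed ? delim : "", f->name);
                printed = 1;
            }
        }
        if (!printed)
            printf("none");
        printf("\n");
    }

    int main(void)
    {
        print_flags(0x5, "|");      /* prints READ|ASYNC */
        return 0;
    }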
+DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_trans_read_buf_io); +DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); + +/* not really buffer traces, but the buf provides useful information */ +DEFINE_BUF_EVENT(xfs_btree_corrupt); +DEFINE_BUF_EVENT(xfs_da_btree_corrupt); +DEFINE_BUF_EVENT(xfs_reset_dqcounts); +DEFINE_BUF_EVENT(xfs_inode_item_push); + +/* pass flags explicitly */ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) +DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); + +TRACE_EVENT(xfs_buf_ioerror, + TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_ARGS(bp, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(unsigned, flags) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->error = error; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d error %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __entry->error, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + 
__entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); +DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); + +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_LOCK_EVENT(name) \ +DEFINE_EVENT(xfs_lock_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ + unsigned long caller_ip), \ + TP_ARGS(ip, lock_flags, caller_ip)) +DEFINE_LOCK_EVENT(xfs_ilock); +DEFINE_LOCK_EVENT(xfs_ilock_nowait); +DEFINE_LOCK_EVENT(xfs_ilock_demote); +DEFINE_LOCK_EVENT(xfs_iunlock); + +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino 
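The lock, buffer and iref classes all record a caller_ip so the trace shows who took the lock or reference rather than the wrapper that emitted the event; in the kernel the call sites pass _RET_IP_ for this. The same idea in plain C uses the compiler's return-address builtin; a small sketch:

    #include <stdio.h>

    /* Record who called us, in the spirit of passing _RET_IP_ to a tracepoint. */
    static void trace_lock(const char *what, unsigned long caller_ip)
    {
        printf("%s caller %#lx\n", what, caller_ip);
    }

    static void take_lock(void)
    {
        /* __builtin_return_address(0) is the address we will return to,
         * i.e. our caller, the analogue of _RET_IP_. */
        trace_lock("ilock", (unsigned long)__builtin_return_address(0));
    }

    int main(void)
    {
        take_lock();
        return 0;
    }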
= ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_check_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_write_inode); +DEFINE_INODE_EVENT(xfs_evict_inode); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DECLARE_EVENT_CLASS(xfs_iref_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(int, pincount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + __entry->pincount, + (char *)__entry->caller_ip) +) + +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ + TP_ARGS(ip, caller_ip)) +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(name)) +) + +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = 
src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %s target name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __get_str(src_name), + __get_str(target_name)) +) + +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) + ), \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = be32_to_cpu(dqp->q_core.d_id); + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->id, + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) +) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) +DEFINE_DQUOT_EVENT(xfs_dqadjust); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqattach_found); +DEFINE_DQUOT_EVENT(xfs_dqattach_get); +DEFINE_DQUOT_EVENT(xfs_dqinit); +DEFINE_DQUOT_EVENT(xfs_dqreuse); +DEFINE_DQUOT_EVENT(xfs_dqalloc); +DEFINE_DQUOT_EVENT(xfs_dqtobp_read); +DEFINE_DQUOT_EVENT(xfs_dqread); +DEFINE_DQUOT_EVENT(xfs_dqread_fail); +DEFINE_DQUOT_EVENT(xfs_dqlookup_found); +DEFINE_DQUOT_EVENT(xfs_dqlookup_want); +DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); +DEFINE_DQUOT_EVENT(xfs_dqlookup_done); +DEFINE_DQUOT_EVENT(xfs_dqget_hit); +DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqput); +DEFINE_DQUOT_EVENT(xfs_dqput_wait); +DEFINE_DQUOT_EVENT(xfs_dqput_free); +DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqflush); +DEFINE_DQUOT_EVENT(xfs_dqflush_force); +DEFINE_DQUOT_EVENT(xfs_dqflush_done); + +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(int, reserveq) + __field(int, writeq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + 
__field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserveq = list_empty(&log->l_reserveq); + __entry->writeq = list_empty(&log->l_writeq); + xlog_crack_grant_head(&log->l_grant_reserve_head, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_grant_write_head, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserveq ? "empty" : "active", + __entry->writeq ? "empty" : "active", + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) + +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ + TP_PROTO(struct log *log, struct xlog_ticket *tic), \ + TP_ARGS(log, tic)) +DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); +DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); + +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + 
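The log grant class above pulls the reserve and write grant heads apart with xlog_crack_grant_head(), which unpacks a single 64-bit word into a cycle number and a byte count so both move together under one atomic update. A sketch of that pack/unpack idea; the 32/32 split shown here is an assumption for illustration, not necessarily the log code's exact layout:

    #include <stdio.h>
    #include <stdint.h>

    /* Pack a cycle number and a byte offset into one 64-bit word so both
     * can be read or swapped atomically; split them back out for display. */
    static int64_t pack_grant_head(int cycle, int bytes)
    {
        return ((int64_t)cycle << 32) | (uint32_t)bytes;
    }

    static void crack_grant_head(int64_t head, int *cycle, int *bytes)
    {
        *cycle = (int)(head >> 32);
        *bytes = (int)(head & 0xffffffff);
    }

    int main(void)
    {
        int cycle, bytes;
        int64_t head = pack_grant_head(7, 4096);

        crack_grant_head(head, &cycle, &bytes);
        printf("cycle %d bytes %d\n", cycle, bytes);
        return 0;
    }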
__entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) +) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) +DEFINE_RW_EVENT(xfs_file_read); +DEFINE_RW_EVENT(xfs_file_buffered_write); +DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_splice_read); +DEFINE_RW_EVENT(xfs_file_splice_write); + +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), + TP_ARGS(inode, page, off), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(int, delalloc) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->delalloc = delalloc; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "delalloc %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->delalloc, + __entry->unwritten) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ + TP_ARGS(inode, page, off)) +DEFINE_PAGE_EVENT(xfs_writepage); +DEFINE_PAGE_EVENT(xfs_releasepage); +DEFINE_PAGE_EVENT(xfs_invalidatepage); + +DECLARE_EVENT_CLASS(xfs_imap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, type) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->type = type; + __entry->startoff = irec ? irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? 
irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd type %s " + "startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_imap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); + +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count) +); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) +DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); +DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); + + +TRACE_EVENT(xfs_itruncate_start, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag, + xfs_off_t toss_start, xfs_off_t toss_finish), + TP_ARGS(ip, new_size, flag, toss_start, toss_finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + __field(xfs_off_t, toss_start) + __field(xfs_off_t, toss_finish) + __field(int, flag) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + __entry->toss_start = toss_start; + __entry->toss_finish = toss_finish; + __entry->flag = flag; + ), + TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx " + "toss start 0x%llx toss finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS), + __entry->size, + __entry->new_size, + __entry->toss_start, + __entry->toss_finish) +); + +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + +#define 
DEFINE_ITRUNC_EVENT(name) \ +DEFINE_EVENT(xfs_itrunc_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ + TP_ARGS(ip, new_size)) +DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); + +TRACE_EVENT(xfs_pagecache_inval, + TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), + TP_ARGS(ip, start, finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_off_t, start) + __field(xfs_off_t, finish) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->start = start; + __entry->finish = finish; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->start, + __entry->finish) +); + +TRACE_EVENT(xfs_bunmap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + int flags, unsigned long caller_ip), + TP_ARGS(ip, bno, len, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fileoff_t, bno) + __field(xfs_filblks_t, len) + __field(unsigned long, caller_ip) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->bno = bno; + __entry->len = len; + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + "flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->bno, + __entry->len, + __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), + (void *)__entry->caller_ip) + +); + +DECLARE_EVENT_CLASS(xfs_busy_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_alloc_busy); +DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); +DEFINE_BUSY_EVENT(xfs_alloc_busy_force); +DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); +DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); + +TRACE_EVENT(xfs_alloc_busy_trim, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->tbno = tbno; + __entry->tlen = tlen; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->agno, + __entry->agbno, + __entry->len, + __entry->tbno, + __entry->tlen) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->lsn) +); + +TRACE_EVENT(xfs_agf, + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, + unsigned long caller_ip), + TP_ARGS(mp, agf, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, flags) + __field(__u32, length) + __field(__u32, bno_root) + __field(__u32, cnt_root) + __field(__u32, bno_level) + __field(__u32, cnt_level) + __field(__u32, flfirst) + __field(__u32, fllast) + __field(__u32, flcount) + __field(__u32, freeblks) + __field(__u32, longest) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = be32_to_cpu(agf->agf_seqno), + __entry->flags = flags; + __entry->length = be32_to_cpu(agf->agf_length), + __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), + __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), + __entry->bno_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), + __entry->cnt_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->flfirst = be32_to_cpu(agf->agf_flfirst), + __entry->fllast = be32_to_cpu(agf->agf_fllast), + __entry->flcount = be32_to_cpu(agf->agf_flcount), + __entry->freeblks = be32_to_cpu(agf->agf_freeblks), + __entry->longest = be32_to_cpu(agf->agf_longest); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + "levels b %u c %u flfirst %u fllast %u flcount %u " + "freeblks %u longest %u caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), + __entry->length, + __entry->bno_root, + __entry->cnt_root, + __entry->bno_level, + __entry->cnt_level, + __entry->flfirst, + __entry->fllast, + __entry->flcount, + __entry->freeblks, + __entry->longest, + (void *)__entry->caller_ip) +); + +TRACE_EVENT(xfs_free_extent, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, bool isfl, int haveleft, int haveright), + TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, isfl) + __field(int, haveleft) + __field(int, haveright) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->isfl = isfl; + __entry->haveleft = haveleft; + __entry->haveright = haveright; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->isfl, + __entry->haveleft ? + (__entry->haveright ? "both" : "left") : + (__entry->haveright ? 
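The xfs_agf event converts every field it samples with be32_to_cpu() because XFS metadata is stored big-endian on disk regardless of host byte order. A portable user-space equivalent of that conversion:

    #include <stdio.h>
    #include <stdint.h>

    /* XFS metadata is big-endian on disk; be32_to_cpu() turns an on-disk
     * 32-bit field into host order.  A portable user-space equivalent: */
    static uint32_t demo_be32_to_cpu(const unsigned char b[4])
    {
        return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
               ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
    }

    int main(void)
    {
        /* 0x00000010 stored big-endian, e.g. an agf_length of 16 blocks */
        unsigned char disk[4] = { 0x00, 0x00, 0x00, 0x10 };

        printf("host value: %u\n", demo_be32_to_cpu(disk));
        return 0;
    }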
"right" : "none")) + +); + +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + (unsigned long long)__entry->firstblock) +) + +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) +DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_near_first); +DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); +DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); +DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_size_done); +DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); +DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); +DEFINE_ALLOC_EVENT(xfs_alloc_small_done); +DEFINE_ALLOC_EVENT(xfs_alloc_small_error); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); + +DECLARE_EVENT_CLASS(xfs_dir2_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, 
args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) +) + +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ + TP_PROTO(struct xfs_da_args *args, int idx), \ + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); + +TRACE_EVENT(xfs_dir2_leafn_moveents, + TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), + TP_ARGS(args, src_idx, dst_idx, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, src_idx) + __field(int, dst_idx) + __field(int, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->src_idx = src_idx; + __entry->dst_idx = dst_idx; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s " + 
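The namespace and dir2 classes copy the directory entry name into a __dynamic_array() without a terminating NUL, which is why the dir2 format string bounds it with "%.*s" and the recorded namelen. The same bounded print in ordinary C:

    #include <stdio.h>

    int main(void)
    {
        /* XFS directory names travel as (pointer, length) pairs with no
         * terminating NUL, so a bounded "%.*s" is used to print them. */
        const char raw[] = { 'l', 'o', 's', 't', '+', 'f', 'o', 'u', 'n', 'd' };
        int namelen = (int)sizeof(raw);

        printf("name %.*s namelen %d\n", namelen, raw, namelen);
        return 0;
    }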
"src_idx %d dst_idx %d count %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->src_idx, + __entry->dst_idx, + __entry->count) +); + +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, max_nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->max_nex = ip->i_df.if_ext_max; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "Max in-fork extents %d, broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->max_nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct log *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct log *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = 
log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + +#endif /* _TRACE_XFS_H 
*/ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE xfs_trace +#include <trace/define_trace.h> diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h new file mode 100644 index 00000000..7c220b42 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_VNODE_H__ +#define __XFS_VNODE_H__ + +#include "xfs_fs.h" + +struct file; +struct xfs_inode; +struct xfs_iomap; +struct attrlist_cursor_kern; + +/* + * Return values for xfs_inactive. A return value of + * VN_INACTIVE_NOCACHE implies that the file system behavior + * has disassociated its state and bhv_desc_t from the vnode. + */ +#define VN_INACTIVE_CACHE 0 +#define VN_INACTIVE_NOCACHE 1 + +/* + * Flags for read/write calls - same values as IRIX + */ +#define IO_ISDIRECT 0x00004 /* bypass page cache */ +#define IO_INVIS 0x00020 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { IO_ISDIRECT, "DIRECT" }, \ + { IO_INVIS, "INVIS"} + +/* + * Flush/Invalidate options for vop_toss/flush/flushinval_pages. + */ +#define FI_NONE 0 /* none */ +#define FI_REMAPF 1 /* Do a remapf prior to the operation */ +#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. + Prevent VM access to the pages until + the operation completes. */ + +/* + * Some useful predicates. + */ +#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) +#define VN_CACHED(vp) (vp->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ + PAGECACHE_TAG_DIRTY) + + +#endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c new file mode 100644 index 00000000..87d3e038 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2008 Christoph Hellwig. + * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_acl.h" +#include "xfs_vnodeops.h" + +#include +#include + + +static int +xfs_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int xflags) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + int error, asize = size; + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + value = NULL; + } + + error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + if (error) + return error; + return asize; +} + +static int +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int xflags) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (!value) + return -xfs_attr_remove(ip, (unsigned char *)name, xflags); + return -xfs_attr_set(ip, (unsigned char *)name, + (void *)value, size, xflags); +} + +static const struct xattr_handler xfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +const struct xattr_handler *xfs_xattr_handlers[] = { + &xfs_xattr_user_handler, + &xfs_xattr_trusted_handler, + &xfs_xattr_security_handler, +#ifdef CONFIG_XFS_POSIX_ACL + &xfs_xattr_acl_access_handler, + &xfs_xattr_acl_default_handler, +#endif + NULL +}; + +static unsigned int xfs_xattr_prefix_len(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return sizeof("security"); + else if (flags & XFS_ATTR_ROOT) + return sizeof("trusted"); + else + return sizeof("user"); +} + +static const char *xfs_xattr_prefix(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return xfs_xattr_security_handler.prefix; + else if (flags & XFS_ATTR_ROOT) + return xfs_xattr_trusted_handler.prefix; + else + return xfs_xattr_user_handler.prefix; +} + +static int +xfs_xattr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + unsigned int prefix_len = xfs_xattr_prefix_len(flags); + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + /* + * Only show root namespace entries if we are actually allowed to + * see them. 
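+ * (XFS_ATTR_ROOT maps to the trusted.* prefix in xfs_xattr_prefix() above, so this check effectively hides trusted.* entries from callers without CAP_SYS_ADMIN.)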
+ */ + if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) + return 0; + + arraytop = context->count + prefix_len + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + offset += prefix_len; + strncpy(offset, (char *)name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += prefix_len + namelen + 1; + return 0; +} + +static int +xfs_xattr_put_listent_sizes( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + context->count += xfs_xattr_prefix_len(flags) + namelen + 1; + return 0; +} + +static int +list_one_attr(const char *name, const size_t len, void *data, + size_t size, ssize_t *result) +{ + char *p = data + *result; + + *result += len; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct xfs_attr_list_context context; + struct attrlist_cursor_kern cursor = { 0 }; + struct inode *inode = dentry->d_inode; + int error; + + /* + * First read the regular on-disk attributes. + */ + memset(&context, 0, sizeof(context)); + context.dp = XFS_I(inode); + context.cursor = &cursor; + context.resynch = 1; + context.alist = data; + context.bufsize = size; + context.firstu = context.bufsize; + + if (size) + context.put_listent = xfs_xattr_put_listent; + else + context.put_listent = xfs_xattr_put_listent_sizes; + + xfs_attr_list_int(&context); + if (context.count < 0) + return -ERANGE; + + /* + * Then add the two synthetic ACL attributes. + */ + if (posix_acl_access_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS) + 1, + data, size, &context.count); + if (error) + return error; + } + + if (posix_acl_default_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + data, size, &context.count); + if (error) + return error; + } + + return context.count; +} diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c new file mode 100644 index 00000000..6fa21460 --- /dev/null +++ b/fs/xfs/quota/xfs_dquot.c @@ -0,0 +1,1496 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_trace.h" + + +/* + LOCK ORDER + + inode lock (ilock) + dquot hash-chain lock (hashlock) + xqm dquot freelist lock (freelistlock + mount's dquot list lock (mplistlock) + user dquot lock - lock ordering among dquots is based on the uid or gid + group dquot lock - similar to udquots. Between the two dquots, the udquot + has to be locked first. + pin lock - the dquot lock must be held to take this lock. + flush lock - ditto. +*/ + +#ifdef DEBUG +xfs_buftarg_t *xfs_dqerror_target; +int xfs_do_dqerror; +int xfs_dqreq_num; +int xfs_dqerror_mod = 33; +#endif + +static struct lock_class_key xfs_dquot_other_class; + +/* + * Allocate and initialize a dquot. We don't always allocate fresh memory; + * we try to reclaim a free dquot if the number of incore dquots are above + * a threshold. + * The only field inside the core that gets initialized at this point + * is the d_id field. The idea is to fill in the entire q_core + * when we read in the on disk dquot. + */ +STATIC xfs_dquot_t * +xfs_qm_dqinit( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type) +{ + xfs_dquot_t *dqp; + boolean_t brandnewdquot; + + brandnewdquot = xfs_qm_dqalloc_incore(&dqp); + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + + /* + * No need to re-initialize these if this is a reclaimed dquot. + */ + if (brandnewdquot) { + INIT_LIST_HEAD(&dqp->q_freelist); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + trace_xfs_dqinit(dqp); + } else { + /* + * Only the q_core portion was zeroed in dqreclaim_one(). + * So, we need to reset others. + */ + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + INIT_LIST_HEAD(&dqp->q_mplist); + INIT_LIST_HEAD(&dqp->q_hashlist); + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + atomic_set(&dqp->q_pincount, 0); + dqp->q_hash = NULL; + ASSERT(list_empty(&dqp->q_freelist)); + + trace_xfs_dqreuse(dqp); + } + + /* + * In either case we need to make sure group quotas have a different + * lock class than user quotas, to make sure lockdep knows we can + * locks of one of each at the same time. 
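+ * (xfs_qm_dqput(), for example, takes the group dquot hint's lock while the user dquot is still locked; with a single class lockdep would see that as nested locking of the same lock class.)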
+ */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + + /* + * log item gets initialized later + */ + return (dqp); +} + +/* + * This is called to free all the memory associated with a dquot + */ +void +xfs_qm_dqdestroy( + xfs_dquot_t *dqp) +{ + ASSERT(list_empty(&dqp->q_freelist)); + + mutex_destroy(&dqp->q_qlock); + kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); + + atomic_dec(&xfs_Gqm->qm_totaldquots); +} + +/* + * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. + */ +STATIC void +xfs_qm_dqinit_core( + xfs_dqid_t id, + uint type, + xfs_dqblk_t *d) +{ + /* + * Caller has zero'd the entire dquot 'chunk' already. + */ + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(id); + d->dd_diskdq.d_flags = type; +} + +/* + * If default limits are in force, push them into the dquot now. + * We overwrite the dquot limits only if they are zero and this + * is not the root dquot. + */ +void +xfs_qm_adjust_dqlimits( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + xfs_quotainfo_t *q = mp->m_quotainfo; + + ASSERT(d->d_id); + + if (q->qi_bsoftlimit && !d->d_blk_softlimit) + d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + if (q->qi_bhardlimit && !d->d_blk_hardlimit) + d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + if (q->qi_isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); + if (q->qi_ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); + if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); + if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); +} + +/* + * Check the limits and timers of a dquot and start or reset timers + * if necessary. + * This gets called even when quota enforcement is OFF, which makes our + * life a little less complicated. (We just don't reject any quota + * reservations in that case, when enforcement is off). + * We also return 0 as the values of the timers in Q_GETQUOTA calls, when + * enforcement's off. + * In contrast, warnings are a little different in that they don't + * 'automatically' get started when limits get exceeded. They do + * get reset to zero, however, when we find the count to be under + * the soft limit (they are only ever set non-zero via userspace). 
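+ * In short: a timer is armed (now plus the per-mount time limit) the first time the corresponding count crosses a soft or hard limit, and is cleared again once the count drops back under both limits.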
+ */ +void +xfs_qm_adjust_dqtimers( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + ASSERT(d->d_id); + +#ifdef QUOTADEBUG + if (d->d_blk_hardlimit) + ASSERT(be64_to_cpu(d->d_blk_softlimit) <= + be64_to_cpu(d->d_blk_hardlimit)); + if (d->d_ino_hardlimit) + ASSERT(be64_to_cpu(d->d_ino_softlimit) <= + be64_to_cpu(d->d_ino_hardlimit)); + if (d->d_rtb_hardlimit) + ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= + be64_to_cpu(d->d_rtb_hardlimit)); +#endif + if (!d->d_btimer) { + if ((d->d_blk_softlimit && + (be64_to_cpu(d->d_bcount) >= + be64_to_cpu(d->d_blk_softlimit))) || + (d->d_blk_hardlimit && + (be64_to_cpu(d->d_bcount) >= + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_btimelimit); + } else { + d->d_bwarns = 0; + } + } else { + if ((!d->d_blk_softlimit || + (be64_to_cpu(d->d_bcount) < + be64_to_cpu(d->d_blk_softlimit))) && + (!d->d_blk_hardlimit || + (be64_to_cpu(d->d_bcount) < + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = 0; + } + } + + if (!d->d_itimer) { + if ((d->d_ino_softlimit && + (be64_to_cpu(d->d_icount) >= + be64_to_cpu(d->d_ino_softlimit))) || + (d->d_ino_hardlimit && + (be64_to_cpu(d->d_icount) >= + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_itimelimit); + } else { + d->d_iwarns = 0; + } + } else { + if ((!d->d_ino_softlimit || + (be64_to_cpu(d->d_icount) < + be64_to_cpu(d->d_ino_softlimit))) && + (!d->d_ino_hardlimit || + (be64_to_cpu(d->d_icount) < + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = 0; + } + } + + if (!d->d_rtbtimer) { + if ((d->d_rtb_softlimit && + (be64_to_cpu(d->d_rtbcount) >= + be64_to_cpu(d->d_rtb_softlimit))) || + (d->d_rtb_hardlimit && + (be64_to_cpu(d->d_rtbcount) >= + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_rtbtimelimit); + } else { + d->d_rtbwarns = 0; + } + } else { + if ((!d->d_rtb_softlimit || + (be64_to_cpu(d->d_rtbcount) < + be64_to_cpu(d->d_rtb_softlimit))) && + (!d->d_rtb_hardlimit || + (be64_to_cpu(d->d_rtbcount) < + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = 0; + } + } +} + +/* + * initialize a buffer full of dquots and log the whole thing + */ +STATIC void +xfs_qm_init_dquot_blk( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_buf_t *bp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_dqblk_t *d; + int curid, i; + + ASSERT(tp); + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); + + /* + * ID of the first dquot in the block - id's are zero based. + */ + curid = id - (id % q->qi_dqperchunk); + ASSERT(curid >= 0); + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) + xfs_qm_dqinit_core(curid, type, d); + xfs_trans_dquot_buf(tp, bp, + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); +} + + + +/* + * Allocate a block and fill it with dquots. + * This is called when the bmapi finds a hole. 
+ */ +STATIC int +xfs_qm_dqalloc( + xfs_trans_t **tpp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + xfs_inode_t *quotip, + xfs_fileoff_t offset_fsb, + xfs_buf_t **O_bpp) +{ + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + xfs_bmbt_irec_t map; + int nmaps, error, committed; + xfs_buf_t *bp; + xfs_trans_t *tp = *tpp; + + ASSERT(tp != NULL); + + trace_xfs_dqalloc(dqp); + + /* + * Initialize the bmap freelist prior to calling bmapi code. + */ + xfs_bmap_init(&flist, &firstblock); + xfs_ilock(quotip, XFS_ILOCK_EXCL); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return (ESRCH); + } + + xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + nmaps = 1; + if ((error = xfs_bmapi(tp, quotip, + offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, + XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, + &firstblock, + XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist))) { + goto error0; + } + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + /* + * Keep track of the blkno to save a lookup later + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + /* now we can just get the buffer (there's nothing to read yet) */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0); + if (!bp || (error = XFS_BUF_GETERROR(bp))) + goto error1; + /* + * Make a chunk of dquots out of this buffer and log + * the entire thing. + */ + xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + + /* + * xfs_bmap_finish() may commit the current transaction and + * start a second transaction if the freelist is not empty. + * + * Since we still want to modify this buffer, we need to + * ensure that the buffer is not released on commit of + * the first transaction and ensure the buffer is added to the + * second transaction. + * + * If there is only one transaction then don't stop the buffer + * from being released when it commits later on. + */ + + xfs_trans_bhold(tp, bp); + + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + goto error1; + } + + if (committed) { + tp = *tpp; + xfs_trans_bjoin(tp, bp); + } else { + xfs_trans_bhold_release(tp, bp); + } + + *O_bpp = bp; + return 0; + + error1: + xfs_bmap_cancel(&flist); + error0: + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + + return (error); +} + +/* + * Maps a dquot to the buffer containing its on-disk version. + * This returns a ptr to the buffer containing the on-disk dquot + * in the bpp param, and a ptr to the on-disk dquot within that buffer + */ +STATIC int +xfs_qm_dqtobp( + xfs_trans_t **tpp, + xfs_dquot_t *dqp, + xfs_disk_dquot_t **O_ddpp, + xfs_buf_t **O_bpp, + uint flags) +{ + xfs_bmbt_irec_t map; + int nmaps = 1, error; + xfs_buf_t *bp; + xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); + xfs_mount_t *mp = dqp->q_mount; + xfs_disk_dquot_t *ddq; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + xfs_trans_t *tp = (tpp ? *tpp : NULL); + + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + + xfs_ilock(quotip, XFS_ILOCK_SHARED); + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + /* + * Return if this type of quotas is turned off while we + * didn't have the quota inode lock. 
+ */ + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + return ESRCH; + } + + /* + * Find the block map; no allocations yet + */ + error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + NULL, 0, &map, &nmaps, NULL); + + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + if (error) + return error; + + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); + + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return ENOENT; + + ASSERT(tp); + error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, + dqp->q_fileoffset, &bp); + if (error) + return error; + tp = *tpp; + } else { + trace_xfs_dqtobp_read(dqp); + + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp); + if (error || !bp) + return XFS_ERROR(error); + } + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + /* + * calculate the location of the dquot inside the buffer. + */ + ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + + /* + * A simple sanity check in case we got a corrupted dquot... + */ + error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, + flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), + "dqtobp"); + if (error) { + if (!(flags & XFS_QMOPT_DQREPAIR)) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EIO); + } + XFS_BUF_BUSY(bp); /* We dirtied this */ + } + + *O_bpp = bp; + *O_ddpp = ddq; + + return (0); +} + + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. + * + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqread( + xfs_trans_t **tpp, + xfs_dqid_t id, + xfs_dquot_t *dqp, /* dquot to get filled in */ + uint flags) +{ + xfs_disk_dquot_t *ddqp; + xfs_buf_t *bp; + int error; + xfs_trans_t *tp; + + ASSERT(tpp); + + trace_xfs_dqread(dqp); + + /* + * get a pointer to the on-disk dquot and the buffer containing it + * dqp already knows its own type (GROUP/USER). + */ + if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { + return (error); + } + tp = *tpp; + + /* copy everything from disk dquot to the incore dquot */ + memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); + xfs_qm_dquot_logitem_init(dqp); + + /* + * Reservation counters are defined as reservation plus current usage + * to avoid having to add every time. + */ + dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); + dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); + dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + + /* Mark the buf so that this will stay incore a little longer */ + XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + + /* + * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) + * So we need to release with xfs_trans_brelse(). + * The strategy here is identical to that of inodes; we lock + * the dquot in xfs_qm_dqget() before making it accessible to + * others. This is because dquots, like inodes, need a good level of + * concurrency, and we don't want to take locks on the entire buffers + * for dquot accesses. 
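+ * Instead, the per-dquot q_qlock protects the incore copy.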
+ * Note also that the dquot buffer may even be dirty at this point, if + * this particular dquot was repaired. We still aren't afraid to + * brelse it because we have the changes incore. + */ + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + xfs_trans_brelse(tp, bp); + + return (error); +} + + +/* + * allocate an incore dquot from the kernel heap, + * and fill its core with quota information kept on disk. + * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk + * if it wasn't already allocated. + */ +STATIC int +xfs_qm_idtodq( + xfs_mount_t *mp, + xfs_dqid_t id, /* gid or uid, depending on type */ + uint type, /* UDQUOT or GDQUOT */ + uint flags, /* DQALLOC, DQREPAIR */ + xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ +{ + xfs_dquot_t *dqp; + int error; + xfs_trans_t *tp; + int cancelflags=0; + + dqp = xfs_qm_dqinit(mp, id, type); + tp = NULL; + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + cancelflags = 0; + goto error0; + } + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + + /* + * Read it from disk; xfs_dqread() takes care of + * all the necessary initialization of dquot's fields (locks, etc) + */ + if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error0; + } + if (tp) { + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) + goto error1; + } + + *O_dqpp = dqp; + return (0); + + error0: + ASSERT(error); + if (tp) + xfs_trans_cancel(tp, cancelflags); + error1: + xfs_qm_dqdestroy(dqp); + *O_dqpp = NULL; + return (error); +} + +/* + * Lookup a dquot in the incore dquot hashtable. We keep two separate + * hashtables for user and group dquots; and, these are global tables + * inside the XQM, not per-filesystem tables. + * The hash chain must be locked by caller, and it is left locked + * on return. Returning dquot is locked. + */ +STATIC int +xfs_qm_dqlookup( + xfs_mount_t *mp, + xfs_dqid_t id, + xfs_dqhash_t *qh, + xfs_dquot_t **O_dqpp) +{ + xfs_dquot_t *dqp; + uint flist_locked; + + ASSERT(mutex_is_locked(&qh->qh_lock)); + + flist_locked = B_FALSE; + + /* + * Traverse the hashchain looking for a match + */ + list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { + /* + * We already have the hashlock. We don't need the + * dqlock to look at the id field of the dquot, since the + * id can't be modified without the hashlock anyway. + */ + if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { + trace_xfs_dqlookup_found(dqp); + + /* + * All in core dquots must be on the dqlist of mp + */ + ASSERT(!list_empty(&dqp->q_mplist)); + + xfs_dqlock(dqp); + if (dqp->q_nrefs == 0) { + ASSERT(!list_empty(&dqp->q_freelist)); + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { + trace_xfs_dqlookup_want(dqp); + + /* + * We may have raced with dqreclaim_one() + * (and lost). So, flag that we don't + * want the dquot to be reclaimed. 
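+ * Here XFS_DQ_WANT covers just the window in which we drop and retake the dquot lock to acquire the freelist lock in the correct order; it is cleared again once both locks are held.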
+ */ + dqp->dq_flags |= XFS_DQ_WANT; + xfs_dqunlock(dqp); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + xfs_dqlock(dqp); + dqp->dq_flags &= ~(XFS_DQ_WANT); + } + flist_locked = B_TRUE; + } + + /* + * id couldn't have changed; we had the hashlock all + * along + */ + ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); + + if (flist_locked) { + if (dqp->q_nrefs != 0) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + flist_locked = B_FALSE; + } else { + /* take it off the freelist */ + trace_xfs_dqlookup_freelist(dqp); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + } + } + + XFS_DQHOLD(dqp); + + if (flist_locked) + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + /* + * move the dquot to the front of the hashchain + */ + ASSERT(mutex_is_locked(&qh->qh_lock)); + list_move(&dqp->q_hashlist, &qh->qh_list); + trace_xfs_dqlookup_done(dqp); + *O_dqpp = dqp; + return 0; + } + } + + *O_dqpp = NULL; + ASSERT(mutex_is_locked(&qh->qh_lock)); + return (1); +} + +/* + * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a + * a locked dquot, doing an allocation (if requested) as needed. + * When both an inode and an id are given, the inode's id takes precedence. + * That is, if the id changes while we don't hold the ilock inside this + * function, the new dquot is returned, not necessarily the one requested + * in the id argument. + */ +int +xfs_qm_dqget( + xfs_mount_t *mp, + xfs_inode_t *ip, /* locked inode (optional) */ + xfs_dqid_t id, /* uid/projid/gid depending on type */ + uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ + uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ + xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ +{ + xfs_dquot_t *dqp; + xfs_dqhash_t *h; + uint version; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || + (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { + return (ESRCH); + } + h = XFS_DQ_HASH(mp, id, type); + +#ifdef DEBUG + if (xfs_do_dqerror) { + if ((xfs_dqerror_target == mp->m_ddev_targp) && + (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { + xfs_debug(mp, "Returning error in dqget"); + return (EIO); + } + } +#endif + + again: + +#ifdef DEBUG + ASSERT(type == XFS_DQ_USER || + type == XFS_DQ_PROJ || + type == XFS_DQ_GROUP); + if (ip) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (type == XFS_DQ_USER) + ASSERT(ip->i_udquot == NULL); + else + ASSERT(ip->i_gdquot == NULL); + } +#endif + mutex_lock(&h->qh_lock); + + /* + * Look in the cache (hashtable). + * The chain is kept locked during lookup. + */ + if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { + XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); + /* + * The dquot was found, moved to the front of the chain, + * taken off the freelist if it was on it, and locked + * at this point. Just unlock the hashchain and return. + */ + ASSERT(*O_dqpp); + ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); + mutex_unlock(&h->qh_lock); + trace_xfs_dqget_hit(*O_dqpp); + return (0); /* success */ + } + XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + + /* + * Dquot cache miss. We don't want to keep the inode lock across + * a (potential) disk read. Also we don't want to deal with the lock + * ordering between quotainode and this inode. OTOH, dropping the inode + * lock here means dealing with a chown that can happen before + * we re-acquire the lock. 
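+ * (That race is handled further down: after re-taking the ilock we recheck ip->i_udquot / ip->i_gdquot and, if a dquot got attached in the meantime, destroy the one we just read in and return the attached one instead.)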
+ */ + if (ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * Save the hashchain version stamp, and unlock the chain, so that + * we don't keep the lock across a disk read + */ + version = h->qh_version; + mutex_unlock(&h->qh_lock); + + /* + * Allocate the dquot on the kernel heap, and read the ondisk + * portion off the disk. Also, do all the necessary initialization + * This can return ENOENT if dquot didn't exist on disk and we didn't + * ask it to allocate; ESRCH if quotas got turned off suddenly. + */ + if ((error = xfs_qm_idtodq(mp, id, type, + flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| + XFS_QMOPT_DOWARN), + &dqp))) { + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + return (error); + } + + /* + * See if this is mount code calling to look at the overall quota limits + * which are stored in the id == 0 user or group's dquot. + * Since we may not have done a quotacheck by this point, just return + * the dquot without attaching it to any hashtables, lists, etc, or even + * taking a reference. + * The caller must dqdestroy this once done. + */ + if (flags & XFS_QMOPT_DQSUSER) { + ASSERT(id == 0); + ASSERT(! ip); + goto dqret; + } + + /* + * Dquot lock comes after hashlock in the lock ordering + */ + if (ip) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * A dquot could be attached to this inode by now, since + * we had dropped the ilock. + */ + if (type == XFS_DQ_USER) { + if (!XFS_IS_UQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } + if (ip->i_udquot) { + xfs_qm_dqdestroy(dqp); + dqp = ip->i_udquot; + xfs_dqlock(dqp); + goto dqret; + } + } else { + if (!XFS_IS_OQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } + if (ip->i_gdquot) { + xfs_qm_dqdestroy(dqp); + dqp = ip->i_gdquot; + xfs_dqlock(dqp); + goto dqret; + } + } + } + + /* + * Hashlock comes after ilock in lock order + */ + mutex_lock(&h->qh_lock); + if (version != h->qh_version) { + xfs_dquot_t *tmpdqp; + /* + * Now, see if somebody else put the dquot in the + * hashtable before us. This can happen because we didn't + * keep the hashchain lock. We don't have to worry about + * lock order between the two dquots here since dqp isn't + * on any findable lists yet. + */ + if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { + /* + * Duplicate found. Just throw away the new dquot + * and start over. + */ + xfs_qm_dqput(tmpdqp); + mutex_unlock(&h->qh_lock); + xfs_qm_dqdestroy(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); + goto again; + } + } + + /* + * Put the dquot at the beginning of the hash-chain and mp's list + * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. + */ + ASSERT(mutex_is_locked(&h->qh_lock)); + dqp->q_hash = h; + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + + /* + * Attach this dquot to this filesystem's list of all dquots, + * kept inside the mount structure in m_quotainfo field + */ + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); + + /* + * We return a locked dquot to the caller, with a reference taken + */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); + mp->m_quotainfo->qi_dquots++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + mutex_unlock(&h->qh_lock); + dqret: + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return (0); +} + + +/* + * Release a reference to the dquot (decrement ref-count) + * and unlock it. 
If there is a group quota attached to this + * dquot, carefully release that too without tripping over + * deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + xfs_dquot_t *dqp) +{ + xfs_dquot_t *gdqp; + + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + trace_xfs_dqput(dqp); + + if (dqp->q_nrefs != 1) { + dqp->q_nrefs--; + xfs_dqunlock(dqp); + return; + } + + /* + * drop the dqlock and acquire the freelist and dqlock + * in the right order; but try to get it out-of-order first + */ + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { + trace_xfs_dqput_wait(dqp); + xfs_dqunlock(dqp); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + xfs_dqlock(dqp); + } + + while (1) { + gdqp = NULL; + + /* We can't depend on nrefs being == 1 here */ + if (--dqp->q_nrefs == 0) { + trace_xfs_dqput_free(dqp); + + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; + + /* + * If we just added a udquot to the freelist, then + * we want to release the gdquot reference that + * it (probably) has. Otherwise it'll keep the + * gdquot from getting reclaimed. + */ + if ((gdqp = dqp->q_gdquot)) { + /* + * Avoid a recursive dqput call + */ + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } + } + xfs_dqunlock(dqp); + + /* + * If we had a group quota inside the user quota as a hint, + * release it now. + */ + if (! gdqp) + break; + dqp = gdqp; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); +} + +/* + * Release a dquot. Flush it if dirty, then dqput() it. + * dquot must not be locked. + */ +void +xfs_qm_dqrele( + xfs_dquot_t *dqp) +{ + if (!dqp) + return; + + trace_xfs_dqrele(dqp); + + xfs_dqlock(dqp); + /* + * We don't care to flush it if the dquot is dirty here. + * That will create stutters that we want to avoid. + * Instead we do a delayed write when we try to reclaim + * a dirty dquot. Also xfs_sync will take part of the burden... + */ + xfs_qm_dqput(dqp); +} + +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} + +/* + * Write a modified dquot to disk. + * The dquot must be locked and the flush lock too taken by caller. + * The flush lock will not be unlocked until the dquot reaches the disk, + * but the dquot is free to be unlocked and modified by the caller + * in the interim. Dquot is still locked on return. This behavior is + * identical to that of inodes. 
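+ * The flush lock itself is released from xfs_qm_dqflush_done() once the buffer I/O carrying the dquot completes.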
+ */ +int +xfs_qm_dqflush( + xfs_dquot_t *dqp, + uint flags) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddqp; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + trace_xfs_dqflush(dqp); + + /* + * If not dirty, or it's pinned and we are not supposed to block, nada. + */ + if (!XFS_DQ_IS_DIRTY(dqp) || + (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { + xfs_dqfunlock(dqp); + return 0; + } + xfs_qm_dqunpin_wait(dqp); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this dquot + * to disk, because the log record didn't make it to disk! + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + dqp->dq_flags &= ~XFS_DQ_DIRTY; + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); + } + + /* + * Get the buffer containing the on-disk dquot + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) { + ASSERT(error != ENOENT); + xfs_dqfunlock(dqp); + return error; + } + + /* + * Calculate the location of the dquot inside the buffer. + */ + ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + + /* + * A simple sanity check in case we got a corrupted dquot.. + */ + error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + XFS_QMOPT_DOWARN, "dqflush (incore copy)"); + if (error) { + xfs_buf_relse(bp); + xfs_dqfunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return XFS_ERROR(EIO); + } + + /* This is the only portion of data that needs to persist */ + memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + + /* + * Clear the dirty field and remember the flush lsn for later use. + */ + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, + &dqp->q_logitem.qli_item.li_lsn); + + /* + * Attach an iodone routine so that we can remove this dquot from the + * AIL and release the flush lock once the dquot is synced to disk. + */ + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)) { + trace_xfs_dqflush_force(dqp); + xfs_log_force(mp, 0); + } + + if (flags & SYNC_WAIT) + error = xfs_bwrite(mp, bp); + else + xfs_bdwrite(mp, bp); + + trace_xfs_dqflush_done(dqp); + + /* + * dqp is still locked, but caller is free to unlock it now. + */ + return error; + +} + +int +xfs_qm_dqlock_nowait( + xfs_dquot_t *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +void +xfs_dqlock( + xfs_dquot_t *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +void +xfs_dqunlock( + xfs_dquot_t *dqp) +{ + mutex_unlock(&(dqp->q_qlock)); + if (dqp->q_logitem.qli_dquot == dqp) { + /* Once was dqp->q_mount, but might just have been cleared */ + xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, + (xfs_log_item_t*)&(dqp->q_logitem)); + } +} + + +void +xfs_dqunlock_nonotify( + xfs_dquot_t *dqp) +{ + mutex_unlock(&(dqp->q_qlock)); +} + +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. 
+ */ +void +xfs_dqlock2( + xfs_dquot_t *d1, + xfs_dquot_t *d2) +{ + if (d1 && d2) { + ASSERT(d1 != d2); + if (be32_to_cpu(d1->q_core.d_id) > + be32_to_cpu(d2->q_core.d_id)) { + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); + } else { + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); + } + } else if (d1) { + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); + } +} + + +/* + * Take a dquot out of the mount's dqlist as well as the hashlist. + * This is called via unmount as well as quotaoff, and the purge + * will always succeed unless there are soft (temp) references + * outstanding. + * + * This returns 0 if it was purged, 1 if it wasn't. It's not an error code + * that we're returning! XXXsup - not cool. + */ +/* ARGSUSED */ +int +xfs_qm_dqpurge( + xfs_dquot_t *dqp) +{ + xfs_dqhash_t *qh = dqp->q_hash; + xfs_mount_t *mp = dqp->q_mount; + + ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); + ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); + + xfs_dqlock(dqp); + /* + * We really can't afford to purge a dquot that is + * referenced, because these are hard refs. + * It shouldn't happen in general because we went thru _all_ inodes in + * dqrele_all_inodes before calling this and didn't let the mountlock go. + * However it is possible that we have dquots with temporary + * references that are not attached to an inode. e.g. see xfs_setattr(). + */ + if (dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_hash->qh_lock); + return (1); + } + + ASSERT(!list_empty(&dqp->q_freelist)); + + /* + * If we're turning off quotas, we have to make sure that, for + * example, we don't delete quota disk blocks while dquots are + * in the process of getting written to those disk blocks. + * This dquot might well be on AIL, and we can't leave it there + * if we're turning off quotas. Basically, we need this flush + * lock, and are willing to block on it. + */ + if (!xfs_dqflock_nowait(dqp)) { + /* + * Block on the flush lock after nudging dquot buffer, + * if it is incore. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + + /* + * XXXIf we're turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + int error; + + /* dqflush unlocks dqflock */ + /* + * Given that dqpurge is a very rare occurrence, it is OK + * that we're holding the hashlist and mplist locks + * across the disk write. But, ... XXXsup + * + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, SYNC_WAIT); + if (error) + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + xfs_dqflock(dqp); + } + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + list_del_init(&dqp->q_hashlist); + qh->qh_version++; + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dqreclaims++; + mp->m_quotainfo->qi_dquots--; + /* + * XXX Move this to the front of the freelist, if we can get the + * freelist lock. 
+ */ + ASSERT(!list_empty(&dqp->q_freelist)); + + dqp->q_mount = NULL; + dqp->q_hash = NULL; + dqp->dq_flags = XFS_DQ_INACTIVE; + memset(&dqp->q_core, 0, sizeof(dqp->q_core)); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + mutex_unlock(&qh->qh_lock); + return (0); +} + + +#ifdef QUOTADEBUG +void +xfs_qm_dqprint(xfs_dquot_t *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + + xfs_debug(mp, "-----------KERNEL DQUOT----------------"); + xfs_debug(mp, "---- dquotID = %d", + (int)be32_to_cpu(dqp->q_core.d_id)); + xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); + xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount); + xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno); + xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset); + xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_blk_hardlimit), + (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + xfs_debug(mp, "---- blkslimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_blk_softlimit), + (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); + xfs_debug(mp, "---- inohlimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_ino_hardlimit), + (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); + xfs_debug(mp, "---- inoslimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_ino_softlimit), + (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); + xfs_debug(mp, "---- bcount = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_bcount), + (int)be64_to_cpu(dqp->q_core.d_bcount)); + xfs_debug(mp, "---- icount = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_icount), + (int)be64_to_cpu(dqp->q_core.d_icount)); + xfs_debug(mp, "---- btimer = %d", + (int)be32_to_cpu(dqp->q_core.d_btimer)); + xfs_debug(mp, "---- itimer = %d", + (int)be32_to_cpu(dqp->q_core.d_itimer)); + xfs_debug(mp, "---------------------------"); +} +#endif + +/* + * Give the buffer a little push if it is incore and + * wait on the flush lock. + */ +void +xfs_qm_dqflock_pushbuf_wait( + xfs_dquot_t *dqp) +{ + xfs_mount_t *mp = dqp->q_mount; + xfs_buf_t *bp; + + /* + * Check to see if the dquot has been flushed delayed + * write. If so, grab its buffer and send it + * out immediately. We'll be able to acquire + * the flush lock when the I/O completes. + */ + bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + if (!bp) + goto out_lock; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + xfs_buf_delwri_promote(bp); + wake_up_process(bp->b_target->bt_task); + } + xfs_buf_relse(bp); +out_lock: + xfs_dqflock(dqp); +} diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h new file mode 100644 index 00000000..5da3a23b --- /dev/null +++ b/fs/xfs/quota/xfs_dquot.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_H__ +#define __XFS_DQUOT_H__ + +/* + * Dquots are structures that hold quota information about a user or a group, + * much like inodes are for files. In fact, dquots share many characteristics + * with inodes. However, dquots can also be a centralized resource, relative + * to a collection of inodes. In this respect, dquots share some characteristics + * of the superblock. + * XFS dquots exploit both those in its algorithms. They make every attempt + * to not be a bottleneck when quotas are on and have minimal impact, if any, + * when quotas are off. + */ + +/* + * The hash chain headers (hash buckets) + */ +typedef struct xfs_dqhash { + struct list_head qh_list; + struct mutex qh_lock; + uint qh_version; /* ever increasing version */ + uint qh_nelems; /* number of dquots on the list */ +} xfs_dqhash_t; + +struct xfs_mount; +struct xfs_trans; + +/* + * The incore dquot structure + */ +typedef struct xfs_dquot { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_freelist; /* global free list of dquots */ + struct list_head q_mplist; /* mount's list of dquots */ + struct list_head q_hashlist; /* gloabl hash list of dquots */ + xfs_dqhash_t *q_hash; /* the hashchain header */ + struct xfs_mount*q_mount; /* filesystem this relates to */ + struct xfs_trans*q_transp; /* trans this belongs to currently */ + uint q_nrefs; /* # active refs from inodes */ + xfs_daddr_t q_blkno; /* blkno of dquot buffer */ + int q_bufoffset; /* off of dq in buffer (# dquots) */ + xfs_fileoff_t q_fileoffset; /* offset in quotas file */ + + struct xfs_dquot*q_gdquot; /* group dquot, hint only */ + xfs_disk_dquot_t q_core; /* actual usage & quotas */ + xfs_dq_logitem_t q_logitem; /* dquot log item */ + xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ + xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + struct mutex q_qlock; /* quota lock */ + struct completion q_flush; /* flush completion queue */ + atomic_t q_pincount; /* dquot pin count */ + wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ +} xfs_dquot_t; + +/* + * Lock hierarchy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + +#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) + +/* + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. + */ +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} + +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) +#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) +#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? 
\ + XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ + XFS_DQ_TO_QINF(dqp)->qi_gquotaip) + +#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ + (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ + (XFS_IS_OQUOTA_ON((d)->q_mount)))) + +#ifdef QUOTADEBUG +extern void xfs_qm_dqprint(xfs_dquot_t *); +#else +#define xfs_qm_dqprint(a) +#endif + +extern void xfs_qm_dqdestroy(xfs_dquot_t *); +extern int xfs_qm_dqflush(xfs_dquot_t *, uint); +extern int xfs_qm_dqpurge(xfs_dquot_t *); +extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); +extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); +extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); +extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, + xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, + xfs_disk_dquot_t *); +extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, + xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern void xfs_qm_dqput(xfs_dquot_t *); +extern void xfs_dqlock(xfs_dquot_t *); +extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); +extern void xfs_dqunlock(xfs_dquot_t *); +extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +#endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c new file mode 100644 index 00000000..8126fc2e --- /dev/null +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" + +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + +/* + * returns the number of iovecs needed to log the given dquot item. + */ +STATIC uint +xfs_qm_dquot_logitem_size( + struct xfs_log_item *lip) +{ + /* + * we need only two iovecs, one for the format, one for the real thing + */ + return 2; +} + +/* + * fills in the vector of log iovecs for the given dquot log item. 
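+ * One iovec points at the qli_format header, the other at the dquot's q_core on-disk image, matching the count of two returned by the size routine above.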
+ */ +STATIC void +xfs_qm_dquot_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *logvec) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + + logvec->i_addr = &qlip->qli_format; + logvec->i_len = sizeof(xfs_dq_logformat_t); + logvec->i_type = XLOG_REG_TYPE_QFORMAT; + logvec++; + logvec->i_addr = &qlip->qli_dquot->q_core; + logvec->i_len = sizeof(xfs_disk_dquot_t); + logvec->i_type = XLOG_REG_TYPE_DQUOT; + + ASSERT(2 == lip->li_desc->lid_size); + qlip->qli_format.qlf_size = 2; + +} + +/* + * Increment the pin count of the given dquot. + */ +STATIC void +xfs_qm_dquot_logitem_pin( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + atomic_inc(&dqp->q_pincount); +} + +/* + * Decrement the pin count of the given dquot, and wake up + * anyone in xfs_dqwait_unpin() if the count goes to 0. The + * dquot must have been previously pinned with a call to + * xfs_qm_dquot_logitem_pin(). + */ +STATIC void +xfs_qm_dquot_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(atomic_read(&dqp->q_pincount) > 0); + if (atomic_dec_and_test(&dqp->q_pincount)) + wake_up(&dqp->q_pinwait); +} + +/* + * Given the logitem, this writes the corresponding dquot entry to disk + * asynchronously. This is called with the dquot entry securely locked; + * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot + * at the end. + */ +STATIC void +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + /* + * Since we were able to lock the dquot's flush lock and + * we found it on the AIL, the dquot must be dirty. This + * is because the dquot is removed from the AIL while still + * holding the flush lock in xfs_dqflush_done(). Thus, if + * we found it in the AIL and were able to obtain the flush + * lock without sleeping, then there must not have been + * anyone in the process of flushing the dquot. + */ + error = xfs_qm_dqflush(dqp, 0); + if (error) + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + xfs_dqunlock(dqp); +} + +STATIC xfs_lsn_t +xfs_qm_dquot_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + /* + * We always re-log the entire dquot when it becomes dirty, + * so, the latest copy _is_ the only one that matters. + */ + return lsn; +} + +/* + * This is called to wait for the given dquot to be unpinned. + * Most of these pin/unpin routines are plagiarized from inode code. + */ +void +xfs_qm_dqunpin_wait( + struct xfs_dquot *dqp) +{ + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (atomic_read(&dqp->q_pincount) == 0) + return; + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(dqp->q_mount, 0); + wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); +} + +/* + * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that + * the dquot is locked by us, but the flush lock isn't. So, here we are + * going to see if the relevant dquot buffer is incore, waiting on DELWRI. + * If so, we want to push it out to help us take this item off the AIL as soon + * as possible. + * + * We must not be holding the AIL lock at this point. Calling incore() to + * search the buffer cache can be a time consuming thing, and AIL lock is a + * spinlock. 
+ */ +STATIC bool +xfs_qm_dquot_logitem_pushbuf( + struct xfs_log_item *lip) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; + bool ret = true; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * If flushlock isn't locked anymore, chances are that the + * inode flush completed and the inode was taken off the AIL. + * So, just get out. + */ + if (completion_done(&dqp->q_flush) || + !(lip->li_flags & XFS_LI_IN_AIL)) { + xfs_dqunlock(dqp); + return true; + } + + bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, + dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + xfs_dqunlock(dqp); + if (!bp) + return true; + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; + xfs_buf_relse(bp); + return ret; +} + +/* + * This is called to attempt to lock the dquot associated with this + * dquot log item. Don't sleep on the dquot lock or the flush lock. + * If the flush lock is already held, indicating that the dquot has + * been or is in the process of being flushed, then see if we can + * find the dquot's buffer in the buffer cache without sleeping. If + * we can and it is marked delayed write, then we want to send it out. + * We delay doing so until the push routine, though, to avoid sleeping + * in any device strategy routines. + */ +STATIC uint +xfs_qm_dquot_logitem_trylock( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + if (atomic_read(&dqp->q_pincount) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_qm_dqlock_nowait(dqp)) + return XFS_ITEM_LOCKED; + + if (!xfs_dqflock_nowait(dqp)) { + /* + * dquot has already been flushed to the backing buffer, + * leave it locked, pushbuf routine will unlock it. + */ + return XFS_ITEM_PUSHBUF; + } + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + return XFS_ITEM_SUCCESS; +} + +/* + * Unlock the dquot associated with the log item. + * Clear the fields of the dquot and dquot log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the dquot. + */ +STATIC void +xfs_qm_dquot_logitem_unlock( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * Clear the transaction pointer in the dquot + */ + dqp->q_transp = NULL; + + /* + * dquots are never 'held' from getting unlocked at the end of + * a transaction. Their locking and unlocking is hidden inside the + * transaction layer, within trans_commit. Hence, no LI_HOLD flag + * for the logitem. + */ + xfs_dqunlock(dqp); +} + +/* + * this needs to stamp an lsn into the dquot, I think. + * rpc's that look at user dquot's would then have to + * push on the dependency recorded in the dquot + */ +STATIC void +xfs_qm_dquot_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector for dquots + */ +static struct xfs_item_ops xfs_dquot_item_ops = { + .iop_size = xfs_qm_dquot_logitem_size, + .iop_format = xfs_qm_dquot_logitem_format, + .iop_pin = xfs_qm_dquot_logitem_pin, + .iop_unpin = xfs_qm_dquot_logitem_unpin, + .iop_trylock = xfs_qm_dquot_logitem_trylock, + .iop_unlock = xfs_qm_dquot_logitem_unlock, + .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_push = xfs_qm_dquot_logitem_push, + .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, + .iop_committing = xfs_qm_dquot_logitem_committing +}; + +/* + * Initialize the dquot log item for a newly allocated dquot. 
+ * The dquot isn't locked at this point, but it isn't on any of the lists + * either, so we don't care. + */ +void +xfs_qm_dquot_logitem_init( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *lp = &dqp->q_logitem; + + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); + lp->qli_dquot = dqp; + lp->qli_format.qlf_type = XFS_LI_DQUOT; + lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); + lp->qli_format.qlf_blkno = dqp->q_blkno; + lp->qli_format.qlf_len = 1; + /* + * This is just the offset of this dquot within its buffer + * (which is currently 1 FSB and probably won't change). + * Hence 32 bits for this offset should be just fine. + * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) + * here, and recompute it at recovery time. + */ + lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; +} + +/*------------------ QUOTAOFF LOG ITEMS -------------------*/ + +static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_qoff_logitem, qql_item); +} + + +/* + * This returns the number of iovecs needed to log the given quotaoff item. + * We only need 1 iovec for an quotaoff item. It just logs the + * quotaoff_log_format structure. + */ +STATIC uint +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given quotaoff log item. We use only 1 iovec, and we point that + * at the quotaoff_log_format structure embedded in the quotaoff item. + * It is at this point that we assert that all of the extent + * slots in the quotaoff item have been filled. + */ +STATIC void +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + + ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); + + log_vector->i_addr = &qflip->qql_format; + log_vector->i_len = sizeof(xfs_qoff_logitem_t); + log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; + qflip->qql_format.qf_size = 1; +} + +/* + * Pinning has no meaning for an quotaoff item, so just return. + */ +STATIC void +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an quotaoff item, unpinning does + * not either. + */ +STATIC void +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * Quotaoff items have no locking, so just return success. + */ +STATIC uint +xfs_qm_qoff_logitem_trylock( + struct xfs_log_item *lip) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Quotaoff items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) +{ +} + +/* + * The quotaoff-start-item is logged only once and cannot be moved in the log, + * so simply return the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_qm_qoff_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * There isn't much you can do to push on an quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. 
+ */ +STATIC void +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip) +{ +} + + +STATIC xfs_lsn_t +xfs_qm_qoffend_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); + struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; + struct xfs_ail *ailp = qfs->qql_item.li_ailp; + + /* + * Delete the qoff-start logitem from the AIL. + * xfs_trans_ail_delete() drops the AIL lock. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + + kmem_free(qfs); + kmem_free(qfe); + return (xfs_lsn_t)-1; +} + +/* + * XXX rcc - don't know quite what to do with this. I think we can + * just ignore it. The only time that isn't the case is if we allow + * the client to somehow see that quotas have been turned off in which + * we can't allow that to get back until the quotaoff hits the disk. + * So how would that happen? Also, do we need different routines for + * quotaoff start and quotaoff end? I suspect the answer is yes but + * to be sure, I need to look at the recovery code and see how quota off + * recovery is handled (do we roll forward or back or do something else). + * If we roll forwards or backwards, then we need two separate routines, + * one that does nothing and one that stamps in the lsn that matters + * (truly makes the quotaoff irrevocable). If we do something else, + * then maybe we don't need two. + */ +STATIC void +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * This is the ops vector shared by all quotaoff-start log items. + */ +static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * Allocate and initialize an quotaoff item of the correct quota type(s). + */ +struct xfs_qoff_logitem * +xfs_qm_qoff_logitem_init( + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) +{ + struct xfs_qoff_logitem *qf; + + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); + qf->qql_item.li_mountp = mp; + qf->qql_format.qf_type = XFS_LI_QUOTAOFF; + qf->qql_format.qf_flags = flags; + qf->qql_start_lip = start; + return qf; +} diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h new file mode 100644 index 00000000..5acae2ad --- /dev/null +++ b/fs/xfs/quota/xfs_dquot_item.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_ITEM_H__ +#define __XFS_DQUOT_ITEM_H__ + +struct xfs_dquot; +struct xfs_trans; +struct xfs_mount; +struct xfs_qoff_logitem; + +typedef struct xfs_dq_logitem { + xfs_log_item_t qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + xfs_dq_logformat_t qli_format; /* logged structure */ +} xfs_dq_logitem_t; + +typedef struct xfs_qoff_logitem { + xfs_log_item_t qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ + xfs_qoff_logformat_t qql_format; /* logged structure */ +} xfs_qoff_logitem_t; + + +extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); +extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, + struct xfs_qoff_logitem *, uint); +extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *, uint); +extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *); + +#endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c new file mode 100644 index 00000000..e70c7fc9 --- /dev/null +++ b/fs/xfs/quota/xfs_qm.c @@ -0,0 +1,2462 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_qm.h" +#include "xfs_trace.h" + +/* + * The global quota manager. There is only one of these for the entire + * system, _not_ one per file system. XQM keeps track of the overall + * quota functionality, including maintaining the freelist and hash + * tables of dquots. 
+ */ +struct mutex xfs_Gqm_lock; +struct xfs_qm *xfs_Gqm; +uint ndquot; + +kmem_zone_t *qm_dqzone; +kmem_zone_t *qm_dqtrxzone; + +STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); +STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); + +STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); +STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); + +static struct shrinker xfs_qm_shaker = { + .shrink = xfs_qm_shake, + .seeks = DEFAULT_SEEKS, +}; + +#ifdef DEBUG +extern struct mutex qcheck_lock; +#endif + +#ifdef QUOTADEBUG +static void +xfs_qm_dquot_list_print( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp; + int i = 0; + + list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) { + xfs_debug(mp, " %d. \"%d (%s)\" " + "bcnt = %lld, icnt = %lld, refs = %d", + i++, be32_to_cpu(dqp->q_core.d_id), + DQFLAGTO_TYPESTR(dqp), + (long long)be64_to_cpu(dqp->q_core.d_bcount), + (long long)be64_to_cpu(dqp->q_core.d_icount), + dqp->q_nrefs); + } +} +#else +static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } +#endif + +/* + * Initialize the XQM structure. + * Note that there is not one quota manager per file system. + */ +STATIC struct xfs_qm * +xfs_Gqm_init(void) +{ + xfs_dqhash_t *udqhash, *gdqhash; + xfs_qm_t *xqm; + size_t hsize; + uint i; + + /* + * Initialize the dquot hash tables. + */ + udqhash = kmem_zalloc_greedy(&hsize, + XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), + XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t)); + if (!udqhash) + goto out; + + gdqhash = kmem_zalloc_large(hsize); + if (!gdqhash) + goto out_free_udqhash; + + hsize /= sizeof(xfs_dqhash_t); + ndquot = hsize << 8; + + xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); + xqm->qm_dqhashmask = hsize - 1; + xqm->qm_usr_dqhtable = udqhash; + xqm->qm_grp_dqhtable = gdqhash; + ASSERT(xqm->qm_usr_dqhtable != NULL); + ASSERT(xqm->qm_grp_dqhtable != NULL); + + for (i = 0; i < hsize; i++) { + xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); + xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); + } + + /* + * Freelist of all dquots of all file systems + */ + INIT_LIST_HEAD(&xqm->qm_dqfrlist); + xqm->qm_dqfrlist_cnt = 0; + mutex_init(&xqm->qm_dqfrlist_lock); + + /* + * dquot zone. we register our own low-memory callback. + */ + if (!qm_dqzone) { + xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t), + "xfs_dquots"); + qm_dqzone = xqm->qm_dqzone; + } else + xqm->qm_dqzone = qm_dqzone; + + register_shrinker(&xfs_qm_shaker); + + /* + * The t_dqinfo portion of transactions. + */ + if (!qm_dqtrxzone) { + xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t), + "xfs_dqtrx"); + qm_dqtrxzone = xqm->qm_dqtrxzone; + } else + xqm->qm_dqtrxzone = qm_dqtrxzone; + + atomic_set(&xqm->qm_totaldquots, 0); + xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; + xqm->qm_nrefs = 0; +#ifdef DEBUG + mutex_init(&qcheck_lock); +#endif + return xqm; + + out_free_udqhash: + kmem_free_large(udqhash); + out: + return NULL; +} + +/* + * Destroy the global quota manager when its reference count goes to zero.
+ */ +STATIC void +xfs_qm_destroy( + struct xfs_qm *xqm) +{ + struct xfs_dquot *dqp, *n; + int hsize, i; + + ASSERT(xqm != NULL); + ASSERT(xqm->qm_nrefs == 0); + unregister_shrinker(&xfs_qm_shaker); + hsize = xqm->qm_dqhashmask + 1; + for (i = 0; i < hsize; i++) { + xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); + xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); + } + kmem_free_large(xqm->qm_usr_dqhtable); + kmem_free_large(xqm->qm_grp_dqhtable); + xqm->qm_usr_dqhtable = NULL; + xqm->qm_grp_dqhtable = NULL; + xqm->qm_dqhashmask = 0; + + /* frlist cleanup */ + mutex_lock(&xqm->qm_dqfrlist_lock); + list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); +#ifdef QUOTADEBUG + xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp); +#endif + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } + mutex_unlock(&xqm->qm_dqfrlist_lock); + mutex_destroy(&xqm->qm_dqfrlist_lock); +#ifdef DEBUG + mutex_destroy(&qcheck_lock); +#endif + kmem_free(xqm); +} + +/* + * Called at mount time to let XQM know that another file system is + * starting quotas. This isn't crucial information as the individual mount + * structures are pretty independent, but it helps the XQM keep a + * global view of what's going on. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_hold_quotafs_ref( + struct xfs_mount *mp) +{ + /* + * Need to lock the xfs_Gqm structure for things like this. For example, + * the structure could disappear between the entry to this routine and + * a HOLD operation if not locked. + */ + mutex_lock(&xfs_Gqm_lock); + + if (!xfs_Gqm) { + xfs_Gqm = xfs_Gqm_init(); + if (!xfs_Gqm) { + mutex_unlock(&xfs_Gqm_lock); + return ENOMEM; + } + } + + /* + * We can keep a list of all filesystems with quotas mounted for + * debugging and statistical purposes, but ... + * Just take a reference and get out. + */ + xfs_Gqm->qm_nrefs++; + mutex_unlock(&xfs_Gqm_lock); + + return 0; +} + + +/* + * Release the reference that a filesystem took at mount time, + * so that we know when we need to destroy the entire quota manager. + */ +/* ARGSUSED */ +STATIC void +xfs_qm_rele_quotafs_ref( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp, *n; + + ASSERT(xfs_Gqm); + ASSERT(xfs_Gqm->qm_nrefs > 0); + + /* + * Go thru the freelist and destroy all inactive dquots. + */ + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + + list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(dqp->q_mount == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } else { + xfs_dqunlock(dqp); + } + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + + /* + * Destroy the entire XQM. If somebody mounts with quotaon, this'll + * be restarted. + */ + mutex_lock(&xfs_Gqm_lock); + if (--xfs_Gqm->qm_nrefs == 0) { + xfs_qm_destroy(xfs_Gqm); + xfs_Gqm = NULL; + } + mutex_unlock(&xfs_Gqm_lock); +} + +/* + * Just destroy the quotainfo structure. + */ +void +xfs_qm_unmount( + struct xfs_mount *mp) +{ + if (mp->m_quotainfo) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_destroy_quotainfo(mp); + } +} + + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. 
We are guaranteed that the superblock + * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. + */ +void +xfs_qm_mount_quotas( + xfs_mount_t *mp) +{ + int error = 0; + uint sbf; + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); + mp->m_qflags = 0; + goto write_changes; + } + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + error = xfs_qm_init_quotainfo(mp); + if (error) { + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; + } + } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) + mp->m_qflags &= ~XFS_OQUOTA_CHKD; + + write_changes: + /* + * We actually don't have to acquire the m_sb_lock at all. + * This can only be called from mount, and that's single threaded. XXX + */ + spin_lock(&mp->m_sb_lock); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! + */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_alert(mp, "%s: Superblock update failed!", + __func__); + } + } + + if (error) { + xfs_warn(mp, "Failed to initialize disk quotas."); + return; + } + +#ifdef QUOTADEBUG + if (XFS_IS_QUOTA_ON(mp)) + xfs_qm_internalqcheck(mp); +#endif +} + +/* + * Called from the vfsops layer. + */ +void +xfs_qm_unmount_quotas( + xfs_mount_t *mp) +{ + /* + * Release the dquots that root inode, et al might be holding, + * before we flush quotas and blow away the quotainfo structure. + */ + ASSERT(mp->m_rootip); + xfs_qm_dqdetach(mp->m_rootip); + if (mp->m_rbmip) + xfs_qm_dqdetach(mp->m_rbmip); + if (mp->m_rsumip) + xfs_qm_dqdetach(mp->m_rsumip); + + /* + * Release the quota inodes. + */ + if (mp->m_quotainfo) { + if (mp->m_quotainfo->qi_uquotaip) { + IRELE(mp->m_quotainfo->qi_uquotaip); + mp->m_quotainfo->qi_uquotaip = NULL; + } + if (mp->m_quotainfo->qi_gquotaip) { + IRELE(mp->m_quotainfo->qi_gquotaip); + mp->m_quotainfo->qi_gquotaip = NULL; + } + } +} + +/* + * Flush all dquots of the given file system to disk. The dquots are + * _not_ purged from memory here, just their data written to disk. + */ +STATIC int +xfs_qm_dqflush_all( + struct xfs_mount *mp, + int sync_mode) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl; + struct xfs_dquot *dqp; + int error; + + if (!q) + return 0; +again: + mutex_lock(&q->qi_dqlist_lock); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { + xfs_dqlock(dqp); + if (! 
XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqunlock(dqp); + continue; + } + + /* XXX a sentinel would be better */ + recl = q->qi_dqreclaims; + if (!xfs_dqflock_nowait(dqp)) { + /* + * If we can't grab the flush lock then check + * to see if the dquot has been flushed delayed + * write. If so, grab its buffer and send it + * out immediately. We'll be able to acquire + * the flush lock when the I/O completes. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + /* + * Let go of the mplist lock. We don't want to hold it + * across a disk write. + */ + mutex_unlock(&q->qi_dqlist_lock); + error = xfs_qm_dqflush(dqp, sync_mode); + xfs_dqunlock(dqp); + if (error) + return error; + + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + mutex_unlock(&q->qi_dqlist_lock); + /* XXX restart limit */ + goto again; + } + } + + mutex_unlock(&q->qi_dqlist_lock); + /* return ! busy */ + return 0; +} +/* + * Release the group dquot pointers the user dquots may be + * carrying around as a hint. mplist is locked on entry and exit. + */ +STATIC void +xfs_qm_detach_gdquots( + struct xfs_mount *mp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *gdqp; + int nrecl; + + again: + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { + xfs_dqlock(dqp); + if ((gdqp = dqp->q_gdquot)) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } + xfs_dqunlock(dqp); + + if (gdqp) { + /* + * Can't hold the mplist lock across a dqput. + * XXXmust convert to marker based iterations here. + */ + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); + xfs_qm_dqput(gdqp); + + mutex_lock(&q->qi_dqlist_lock); + if (nrecl != q->qi_dqreclaims) + goto again; + } + } +} + +/* + * Go through all the incore dquots of this file system and take them + * off the mplist and hashlist, if the dquot type matches the dqtype + * parameter. This is used when turning off quota accounting for + * users and/or groups, as well as when the filesystem is unmounting. + */ +STATIC int +xfs_qm_dqpurge_int( + struct xfs_mount *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *n; + uint dqtype; + int nrecl; + int nmisses; + + if (!q) + return 0; + + dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; + dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; + dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; + + mutex_lock(&q->qi_dqlist_lock); + + /* + * In the first pass through all incore dquots of this filesystem, + * we release the group dquot pointers the user dquots may be + * carrying around as a hint. We need to do this irrespective of + * what's being turned off. + */ + xfs_qm_detach_gdquots(mp); + + again: + nmisses = 0; + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + /* + * Try to get rid of all of the unwanted dquots. The idea is to + * get them off mplist and hashlist, but leave them on freelist. + */ + list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { + /* + * It's OK to look at the type without taking dqlock here. + * We're holding the mplist lock here, and that's needed for + * a dqreclaim. + */ + if ((dqp->dq_flags & dqtype) == 0) + continue; + + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); + mutex_lock(&dqp->q_hash->qh_lock); + mutex_lock(&q->qi_dqlist_lock); + + /* + * XXXTheoretically, we can get into a very long + * ping pong game here. + * No one can be adding dquots to the mplist at + * this point, but somebody might be taking things off. 
+ */ + if (nrecl != q->qi_dqreclaims) { + mutex_unlock(&dqp->q_hash->qh_lock); + goto again; + } + } + + /* + * Take the dquot off the mplist and hashlist. It may remain on + * freelist in INACTIVE state. + */ + nmisses += xfs_qm_dqpurge(dqp); + } + mutex_unlock(&q->qi_dqlist_lock); + return nmisses; +} + +int +xfs_qm_dqpurge_all( + xfs_mount_t *mp, + uint flags) +{ + int ndquots; + + /* + * Purge the dquot cache. + * None of the dquots should really be busy at this point. + */ + if (mp->m_quotainfo) { + while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) { + delay(ndquots * 10); + } + } + return 0; +} + +STATIC int +xfs_qm_dqattach_one( + xfs_inode_t *ip, + xfs_dqid_t id, + uint type, + uint doalloc, + xfs_dquot_t *udqhint, /* hint */ + xfs_dquot_t **IO_idqpp) +{ + xfs_dquot_t *dqp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + error = 0; + + /* + * See if we already have it in the inode itself. IO_idqpp is + * &i_udquot or &i_gdquot. This made the code look weird, but + * made the logic a lot simpler. + */ + dqp = *IO_idqpp; + if (dqp) { + trace_xfs_dqattach_found(dqp); + return 0; + } + + /* + * udqhint is the i_udquot field in inode, and is non-NULL only + * when the type arg is group/project. Its purpose is to save a + * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside + * the user dquot. + */ + if (udqhint) { + ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); + xfs_dqlock(udqhint); + + /* + * No need to take dqlock to look at the id. + * + * The ID can't change until it gets reclaimed, and it won't + * be reclaimed as long as we have a ref from inode and we + * hold the ilock. + */ + dqp = udqhint->q_gdquot; + if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { + xfs_dqlock(dqp); + XFS_DQHOLD(dqp); + ASSERT(*IO_idqpp == NULL); + *IO_idqpp = dqp; + + xfs_dqunlock(dqp); + xfs_dqunlock(udqhint); + return 0; + } + + /* + * We can't hold a dquot lock when we call the dqget code. + * We'll deadlock in no time, because of (not conforming to) + * lock ordering - the inodelock comes before any dquot lock, + * and we may drop and reacquire the ilock in xfs_qm_dqget(). + */ + xfs_dqunlock(udqhint); + } + + /* + * Find the dquot from somewhere. This bumps the + * reference count of dquot and returns it locked. + * This can return ENOENT if dquot didn't exist on + * disk and we didn't ask it to allocate; + * ESRCH if quotas got turned off suddenly. + */ + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; + + trace_xfs_dqattach_get(dqp); + + /* + * dqget may have dropped and re-acquired the ilock, but it guarantees + * that the dquot returned is the one that should go in the inode. + */ + *IO_idqpp = dqp; + xfs_dqunlock(dqp); + return 0; +} + + +/* + * Given a udquot and gdquot, attach a ptr to the group dquot in the + * udquot as a hint for future lookups. The idea sounds simple, but the + * execution isn't, because the udquot might have a group dquot attached + * already and getting rid of that gets us into lock ordering constraints. + * The process is complicated more by the fact that the dquots may or may not + * be locked on entry. + */ +STATIC void +xfs_qm_dqattach_grouphint( + xfs_dquot_t *udq, + xfs_dquot_t *gdq) +{ + xfs_dquot_t *tmp; + + xfs_dqlock(udq); + + if ((tmp = udq->q_gdquot)) { + if (tmp == gdq) { + xfs_dqunlock(udq); + return; + } + + udq->q_gdquot = NULL; + /* + * We can't keep any dqlocks when calling dqrele, + * because the freelist lock comes before dqlocks. 
+ */ + xfs_dqunlock(udq); + /* + * we took a hard reference once upon a time in dqget, + * so give it back when the udquot no longer points at it + * dqput() does the unlocking of the dquot. + */ + xfs_qm_dqrele(tmp); + + xfs_dqlock(udq); + xfs_dqlock(gdq); + + } else { + ASSERT(XFS_DQ_IS_LOCKED(udq)); + xfs_dqlock(gdq); + } + + ASSERT(XFS_DQ_IS_LOCKED(udq)); + ASSERT(XFS_DQ_IS_LOCKED(gdq)); + /* + * Somebody could have attached a gdquot here, + * when we dropped the uqlock. If so, just do nothing. + */ + if (udq->q_gdquot == NULL) { + XFS_DQHOLD(gdq); + udq->q_gdquot = gdq; + } + + xfs_dqunlock(gdq); + xfs_dqunlock(udq); +} + + +/* + * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON + * into account. + * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * Inode may get unlocked and relocked in here, and the caller must deal with + * the consequences. + */ +int +xfs_qm_dqattach_locked( + xfs_inode_t *ip, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + uint nquotas = 0; + int error = 0; + + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + !XFS_NOT_DQATTACHED(mp, ip) || + ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, + flags & XFS_QMOPT_DQALLOC, + NULL, &ip->i_udquot); + if (error) + goto done; + nquotas++; + } + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (XFS_IS_OQUOTA_ON(mp)) { + error = XFS_IS_GQUOTA_ON(mp) ? + xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + ip->i_udquot, &ip->i_gdquot) : + xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + flags & XFS_QMOPT_DQALLOC, + ip->i_udquot, &ip->i_gdquot); + /* + * Don't worry about the udquot that we may have + * attached above. It'll get detached, if not already. + */ + if (error) + goto done; + nquotas++; + } + + /* + * Attach this group quota to the user quota as a hint. + * This WON'T, in general, result in a thrash. + */ + if (nquotas == 2) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_udquot); + ASSERT(ip->i_gdquot); + + /* + * We may or may not have the i_udquot locked at this point, + * but this check is OK since we don't depend on the i_gdquot to + * be accurate 100% all the time. It is just a hint, and this + * will succeed in general. + */ + if (ip->i_udquot->q_gdquot == ip->i_gdquot) + goto done; + /* + * Attach i_gdquot to the gdquot hint inside the i_udquot. + */ + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); + } + + done: +#ifdef QUOTADEBUG + if (! error) { + if (XFS_IS_UQUOTA_ON(mp)) + ASSERT(ip->i_udquot); + if (XFS_IS_OQUOTA_ON(mp)) + ASSERT(ip->i_gdquot); + } + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +#endif + return error; +} + +int +xfs_qm_dqattach( + struct xfs_inode *ip, + uint flags) +{ + int error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqattach_locked(ip, flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Release dquots (and their references) if any. + * The inode should be locked EXCL except when this's called by + * xfs_ireclaim. 
+ */ +void +xfs_qm_dqdetach( + xfs_inode_t *ip) +{ + if (!(ip->i_udquot || ip->i_gdquot)) + return; + + trace_xfs_dquot_dqdetach(ip); + + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); + if (ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } +} + +int +xfs_qm_sync( + struct xfs_mount *mp, + int flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl, restarts; + struct xfs_dquot *dqp; + int error; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + restarts = 0; + + again: + mutex_lock(&q->qi_dqlist_lock); + /* + * dqpurge_all() also takes the mplist lock and iterate thru all dquots + * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared + * when we have the mplist lock, we know that dquots will be consistent + * as long as we have it locked. + */ + if (!XFS_IS_QUOTA_ON(mp)) { + mutex_unlock(&q->qi_dqlist_lock); + return 0; + } + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { + /* + * If this is vfs_sync calling, then skip the dquots that + * don't 'seem' to be dirty. ie. don't acquire dqlock. + * This is very similar to what xfs_sync does with inodes. + */ + if (flags & SYNC_TRYLOCK) { + if (!XFS_DQ_IS_DIRTY(dqp)) + continue; + if (!xfs_qm_dqlock_nowait(dqp)) + continue; + } else { + xfs_dqlock(dqp); + } + + /* + * Now, find out for sure if this dquot is dirty or not. + */ + if (! XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqunlock(dqp); + continue; + } + + /* XXX a sentinel would be better */ + recl = q->qi_dqreclaims; + if (!xfs_dqflock_nowait(dqp)) { + if (flags & SYNC_TRYLOCK) { + xfs_dqunlock(dqp); + continue; + } + /* + * If we can't grab the flush lock then if the caller + * really wanted us to give this our best shot, so + * see if we can give a push to the buffer before we wait + * on the flush lock. At this point, we know that + * even though the dquot is being flushed, + * it has (new) dirty data. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + /* + * Let go of the mplist lock. We don't want to hold it + * across a disk write + */ + mutex_unlock(&q->qi_dqlist_lock); + error = xfs_qm_dqflush(dqp, flags); + xfs_dqunlock(dqp); + if (error && XFS_FORCED_SHUTDOWN(mp)) + return 0; /* Need to prevent umount failure */ + else if (error) + return error; + + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) + break; + + mutex_unlock(&q->qi_dqlist_lock); + goto again; + } + } + + mutex_unlock(&q->qi_dqlist_lock); + return 0; +} + +/* + * The hash chains and the mplist use the same xfs_dqhash structure as + * their list head, but we can take the mplist qh_lock and one of the + * hash qh_locks at the same time without any problem as they aren't + * related. + */ +static struct lock_class_key xfs_quota_mplist_class; + +/* + * This initializes all the quota information that's kept in the + * mount structure + */ +STATIC int +xfs_qm_init_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qinf; + int error; + xfs_dquot_t *dqp; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Tell XQM that we exist as soon as possible. + */ + if ((error = xfs_qm_hold_quotafs_ref(mp))) { + return error; + } + + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + + /* + * See if quotainodes are setup, and if not, allocate them, + * and change the superblock accordingly. 
+ */ + if ((error = xfs_qm_init_quotainos(mp))) { + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; + } + + INIT_LIST_HEAD(&qinf->qi_dqlist); + mutex_init(&qinf->qi_dqlist_lock); + lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); + + qinf->qi_dqreclaims = 0; + + /* mutex used to serialize quotaoffs */ + mutex_init(&qinf->qi_quotaofflock); + + /* Precalc some constants */ + qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(qinf->qi_dqchunklen); + qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); + do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + + mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * We look at the USR dquot with id == 0 first, but if user quotas + * are not enabled we goto the GRP dquot with id == 0. + * We don't really care to keep separate default limits for user + * and group quotas, at least not at this point. + */ + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, + &dqp); + if (! error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + qinf->qi_btimelimit = ddqp->d_btimer ? + be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = ddqp->d_itimer ? + be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? + be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = ddqp->d_bwarns ? + be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = ddqp->d_iwarns ? + be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? + be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; + qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + /* + * We sent the XFS_QMOPT_DQSUSER flag to dqget because + * we don't want this dquot cached. We haven't done a + * quotacheck yet, and quotacheck doesn't like incore dquots. + */ + xfs_qm_dqdestroy(dqp); + } else { + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + } + + return 0; +} + + +/* + * Gets called when unmounting a filesystem or when all quotas get + * turned off. + * This purges the quota inodes, destroys locks and frees itself. + */ +void +xfs_qm_destroy_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qi; + + qi = mp->m_quotainfo; + ASSERT(qi != NULL); + ASSERT(xfs_Gqm != NULL); + + /* + * Release the reference that XQM kept, so that we know + * when the XQM structure should be freed. We cannot assume + * that xfs_Gqm is non-null after this point. 
+ */ + xfs_qm_rele_quotafs_ref(mp); + + ASSERT(list_empty(&qi->qi_dqlist)); + mutex_destroy(&qi->qi_dqlist_lock); + + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + mutex_destroy(&qi->qi_quotaofflock); + kmem_free(qi); + mp->m_quotainfo = NULL; +} + + + +/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */ + +/* ARGSUSED */ +STATIC void +xfs_qm_list_init( + xfs_dqlist_t *list, + char *str, + int n) +{ + mutex_init(&list->qh_lock); + INIT_LIST_HEAD(&list->qh_list); + list->qh_version = 0; + list->qh_nelems = 0; +} + +STATIC void +xfs_qm_list_destroy( + xfs_dqlist_t *list) +{ + mutex_destroy(&(list->qh_lock)); +} + +/* + * Create an inode and return with a reference already taken, but unlocked + * This is how we create quota inodes + */ +STATIC int +xfs_qm_qino_alloc( + xfs_mount_t *mp, + xfs_inode_t **ip, + __int64_t sbfields, + uint flags) +{ + xfs_trans_t *tp; + int error; + int committed; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); + if ((error = xfs_trans_reserve(tp, + XFS_QM_QINOCREATE_SPACE_RES(mp), + XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_CREATE_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return error; + } + + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } + + /* + * Make the changes in the superblock, and log those too. + * sbfields arg may contain fields other than *QUOTINO; + * VERSIONNUM for example. + */ + spin_lock(&mp->m_sb_lock); + if (flags & XFS_QMOPT_SBVERSION) { + ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == + (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); + + xfs_sb_version_addquota(&mp->m_sb); + mp->m_sb.sb_uquotino = NULLFSINO; + mp->m_sb.sb_gquotino = NULLFSINO; + + /* qflags will get updated _after_ quotacheck */ + mp->m_sb.sb_qflags = 0; + } + if (flags & XFS_QMOPT_UQUOTA) + mp->m_sb.sb_uquotino = (*ip)->i_ino; + else + mp->m_sb.sb_gquotino = (*ip)->i_ino; + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + xfs_alert(mp, "%s failed (error %d)!", __func__, error); + return error; + } + return 0; +} + + +STATIC void +xfs_qm_reset_dqcounts( + xfs_mount_t *mp, + xfs_buf_t *bp, + xfs_dqid_t id, + uint type) +{ + xfs_disk_dquot_t *ddq; + int j; + + trace_xfs_reset_dqcounts(bp, _RET_IP_); + + /* + * Reset all counters and timers. They'll be + * started afresh by xfs_qm_quotacheck. + */ +#ifdef DEBUG + j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + do_div(j, sizeof(xfs_dqblk_t)); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); +#endif + ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + /* + * Do a sanity check, and if needed, repair the dqblk. Don't + * output any warnings because it's perfectly possible to + * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 
+ */ + (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); + ddq->d_bcount = 0; + ddq->d_icount = 0; + ddq->d_rtbcount = 0; + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + } +} + +STATIC int +xfs_qm_dqiter_bufs( + xfs_mount_t *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags) +{ + xfs_buf_t *bp; + int error; + int type; + + ASSERT(blkcnt > 0); + type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : + (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); + error = 0; + + /* + * Blkcnt arg can be a very big number, and might even be + * larger than the log itself. So, we have to break it up into + * manageable-sized transactions. + * Note that we don't start a permanent transaction here; we might + * not be able to get a log reservation for the whole thing up front, + * and we don't really care to either, because we just discard + * everything if we were to crash in the middle of this loop. + */ + while (blkcnt--) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + break; + + xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_bdwrite(mp, bp); + /* + * goto the next block. + */ + bno++; + firstid += mp->m_quotainfo->qi_dqperchunk; + } + return error; +} + +/* + * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a + * caller supplied function for every chunk of dquots that we find. + */ +STATIC int +xfs_qm_dqiterate( + xfs_mount_t *mp, + xfs_inode_t *qip, + uint flags) +{ + xfs_bmbt_irec_t *map; + int i, nmaps; /* number of map entries */ + int error; /* return value */ + xfs_fileoff_t lblkno; + xfs_filblks_t maxlblkcnt; + xfs_dqid_t firstid; + xfs_fsblock_t rablkno; + xfs_filblks_t rablkcnt; + + error = 0; + /* + * This looks racy, but we can't keep an inode lock across a + * trans_reserve. But, this gets called during quotacheck, and that + * happens only at mount time which is single threaded. + */ + if (qip->i_d.di_nblocks == 0) + return 0; + + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + + lblkno = 0; + maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + do { + nmaps = XFS_DQITER_MAP_SIZE; + /* + * We aren't changing the inode itself. Just changing + * some of its data. No new blocks are added here, and + * the inode is never added to the transaction. + */ + xfs_ilock(qip, XFS_ILOCK_SHARED); + error = xfs_bmapi(NULL, qip, lblkno, + maxlblkcnt - lblkno, + XFS_BMAPI_METADATA, + NULL, + 0, map, &nmaps, NULL); + xfs_iunlock(qip, XFS_ILOCK_SHARED); + if (error) + break; + + ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); + for (i = 0; i < nmaps; i++) { + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + ASSERT(map[i].br_blockcount); + + + lblkno += map[i].br_blockcount; + + if (map[i].br_startblock == HOLESTARTBLOCK) + continue; + + firstid = (xfs_dqid_t) map[i].br_startoff * + mp->m_quotainfo->qi_dqperchunk; + /* + * Do a read-ahead on the next extent. + */ + if ((i+1 < nmaps) && + (map[i+1].br_startblock != HOLESTARTBLOCK)) { + rablkcnt = map[i+1].br_blockcount; + rablkno = map[i+1].br_startblock; + while (rablkcnt--) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + mp->m_quotainfo->qi_dqchunklen); + rablkno++; + } + } + /* + * Iterate thru all the blks in the extent and + * reset the counters of all the dquots inside them. 
+ */ + if ((error = xfs_qm_dqiter_bufs(mp, + firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags))) { + break; + } + } + + if (error) + break; + } while (nmaps > 0); + + kmem_free(map); + + return error; +} + +/* + * Called by dqusage_adjust in doing a quotacheck. + * + * Given the inode, and a dquot id this updates both the incore dqout as well + * as the buffer copy. This is so that once the quotacheck is done, we can + * just log all the buffers, as opposed to logging numerous updates to + * individual dquots. + */ +STATIC int +xfs_qm_quotacheck_dqadjust( + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + xfs_qcnt_t nblks, + xfs_qcnt_t rtblks) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget(mp, ip, id, type, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + if (error) { + /* + * Shouldn't be able to turn off quotas here. + */ + ASSERT(error != ESRCH); + ASSERT(error != ENOENT); + return error; + } + + trace_xfs_dqadjust(dqp); + + /* + * Adjust the inode count and the block count to reflect this inode's + * resource usage. + */ + be64_add_cpu(&dqp->q_core.d_icount, 1); + dqp->q_res_icount++; + if (nblks) { + be64_add_cpu(&dqp->q_core.d_bcount, nblks); + dqp->q_res_bcount += nblks; + } + if (rtblks) { + be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); + dqp->q_res_rtbcount += rtblks; + } + + /* + * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. + */ + if (dqp->q_core.d_id) { + xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_qm_dqput(dqp); + return 0; +} + +STATIC int +xfs_qm_get_rtblks( + xfs_inode_t *ip, + xfs_qcnt_t *O_rtblks) +{ + xfs_filblks_t rtblks; /* total rt blks */ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + int error; + + ASSERT(XFS_IS_REALTIME_INODE(ip)); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) + return error; + } + rtblks = 0; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0; idx < nextents; idx++) + rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); + *O_rtblks = (xfs_qcnt_t)rtblks; + return 0; +} + +/* + * callback routine supplied to bulkstat(). Given an inumber, find its + * dquots and update them to account for resources taken by that inode. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqusage_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* result code value */ +{ + xfs_inode_t *ip; + xfs_qcnt_t nblks, rtblks = 0; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * rootino must have its resources accounted for, not so with the quota + * inodes. + */ + if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + *res = BULKSTAT_RV_NOTHING; + return XFS_ERROR(EINVAL); + } + + /* + * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget + * interface expects the inode to be exclusively locked because that's + * the case in all other instances. It's OK that we do this because + * quotacheck is done only at mount time. 
+ */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + *res = BULKSTAT_RV_NOTHING; + return error; + } + + ASSERT(ip->i_delayed_blks == 0); + + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Walk thru the extent list and count the realtime blocks. + */ + error = xfs_qm_get_rtblks(ip, &rtblks); + if (error) + goto error0; + } + + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + + /* + * Add the (disk blocks and inode) resources occupied by this + * inode to its dquots. We do this adjustment in the incore dquot, + * and also copy the changes to its buffer. + * We don't care about putting these changes in a transaction + * envelope because if we crash in the middle of a 'quotacheck' + * we have to start from the beginning anyway. + * Once we're done, we'll log all the dquot bufs. + * + * The *QUOTA_ON checks below may look pretty racy, but quotachecks + * and quotaoffs don't race. (Quotachecks happen at mount time only). + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, + XFS_DQ_USER, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, + XFS_DQ_GROUP, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), + XFS_DQ_PROJ, nblks, rtblks); + if (error) + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return 0; + +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_GIVEUP; + return error; +} + +/* + * Walk thru all the filesystem inodes and construct a consistent view + * of the disk quota world. If the quotacheck fails, disable quotas. + */ +int +xfs_qm_quotacheck( + xfs_mount_t *mp) +{ + int done, count, error; + xfs_ino_t lastino; + size_t structsz; + xfs_inode_t *uip, *gip; + uint flags; + + count = INT_MAX; + structsz = 1; + lastino = 0; + flags = 0; + + ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * There should be no cached dquots. The (simplistic) quotacheck + * algorithm doesn't like that. + */ + ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); + + xfs_notice(mp, "Quotacheck needed: Please wait."); + + /* + * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset + * their counters to zero. We need a clean slate. + * We don't log our changes till later. + */ + uip = mp->m_quotainfo->qi_uquotaip; + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + if (error) + goto error_return; + flags |= XFS_UQUOTA_CHKD; + } + + gip = mp->m_quotainfo->qi_gquotaip; + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (error) + goto error_return; + flags |= XFS_OQUOTA_CHKD; + } + + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters in core. + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) + break; + + } while (!done); + + /* + * We've made all the changes that we need to make incore. + * Flush them down to disk buffers if everything was updated + * successfully. + */ + if (!error) + error = xfs_qm_dqflush_all(mp, 0); + + /* + * We can get this error if we couldn't do a dquot allocation inside + * xfs_qm_dqusage_adjust (via bulkstat). 
We don't care about the + * dirty dquots that might be cached, we just want to get rid of them + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). + */ + if (error) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; + } + + /* + * We didn't log anything, because if we crashed, we'll have to + * start the quotacheck from scratch anyway. However, we must make + * sure that our dquot changes are secure before we put the + * quotacheck'd stamp on the superblock. So, here we do a synchronous + * flush. + */ + XFS_bflush(mp->m_ddev_targp); + + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); + mp->m_qflags |= flags; + + xfs_qm_dquot_list_print(mp); + + error_return: + if (error) { + xfs_warn(mp, + "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", + error); + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo != NULL); + ASSERT(xfs_Gqm != NULL); + xfs_qm_destroy_quotainfo(mp); + if (xfs_mount_reset_sbqflags(mp)) { + xfs_warn(mp, + "Quotacheck: Failed to reset quota flags."); + } + } else + xfs_notice(mp, "Quotacheck: Done."); + return (error); +} + +/* + * This is called after the superblock has been read in and we're ready to + * iget the quota inodes. + */ +STATIC int +xfs_qm_init_quotainos( + xfs_mount_t *mp) +{ + xfs_inode_t *uip, *gip; + int error; + __int64_t sbflags; + uint flags; + + ASSERT(mp->m_quotainfo); + uip = gip = NULL; + sbflags = 0; + flags = 0; + + /* + * Get the uquota and gquota inodes + */ + if (xfs_sb_version_hasquota(&mp->m_sb)) { + if (XFS_IS_UQUOTA_ON(mp) && + mp->m_sb.sb_uquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_uquotino > 0); + if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip))) + return XFS_ERROR(error); + } + if (XFS_IS_OQUOTA_ON(mp) && + mp->m_sb.sb_gquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_gquotino > 0); + if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip))) { + if (uip) + IRELE(uip); + return XFS_ERROR(error); + } + } + } else { + flags |= XFS_QMOPT_SBVERSION; + sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS); + } + + /* + * Create the two inodes, if they don't exist already. The changes + * made above will get added to a transaction and logged in one of + * the qino_alloc calls below. If the device is readonly, + * temporarily switch to read-write to do this. + */ + if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { + if ((error = xfs_qm_qino_alloc(mp, &uip, + sbflags | XFS_SB_UQUOTINO, + flags | XFS_QMOPT_UQUOTA))) + return XFS_ERROR(error); + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { + flags |= (XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + error = xfs_qm_qino_alloc(mp, &gip, + sbflags | XFS_SB_GQUOTINO, flags); + if (error) { + if (uip) + IRELE(uip); + + return XFS_ERROR(error); + } + } + + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; + + return 0; +} + + + +/* + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. 
+ */ +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) +{ + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; + int restarts; + int startagain; + + restarts = 0; + dqpout = NULL; + + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ +again: + startagain = 0; + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + + list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { + struct xfs_mount *mp = dqp->q_mount; + xfs_dqlock(dqp); + + /* + * We are racing with dqlookup here. Naturally we don't + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. + */ + if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + + trace_xfs_dqreclaim_want(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); + restarts++; + startagain = 1; + goto dqunlock; + } + + /* + * If the dquot is inactive, we are assured that it is + * not on the mplist or the hashlist, and that makes our + * life easier. + */ + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(mp == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); + goto dqunlock; + } + + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); + + /* + * Try to grab the flush lock. If this dquot is in the process + * of getting flushed to disk, we don't want to reclaim it. + */ + if (!xfs_dqflock_nowait(dqp)) + goto dqunlock; + + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* + * We flush it delayed write, so don't bother + * releasing the freelist lock. + */ + error = xfs_qm_dqflush(dqp, 0); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + } + goto dqunlock; + } + + /* + * We're trying to get the hashlock out of order. This races + * with dqlookup; so, we giveup and goto the next dquot if + * we couldn't get the hashlock. This way, we won't starve + * a dqlookup process that holds the hashlock that is + * waiting for the freelist lock. + */ + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { + restarts++; + goto dqfunlock; + } + + /* + * This races with dquot allocation code as well as dqflush_all + * and reclaim code. So, if we failed to grab the mplist lock, + * giveup everything and start over. + */ + if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { + restarts++; + startagain = 1; + goto qhunlock; + } + + ASSERT(dqp->q_nrefs == 0); + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dquots--; + mp->m_quotainfo->qi_dqreclaims++; + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); +qhunlock: + mutex_unlock(&dqp->q_hash->qh_lock); +dqfunlock: + xfs_dqfunlock(dqp); +dqunlock: + xfs_dqunlock(dqp); + if (dqpout) + break; + if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + break; + if (startagain) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + goto again; + } + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqpout; +} + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. 
This operation races with dqlookup(), and attempts to + * favor the lookup function ... + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed = 0; + xfs_dquot_t *dqp; + + if (howmany <= 0) + return 0; + + while (nreclaimed < howmany) { + dqp = xfs_qm_dqreclaim_one(); + if (!dqp) + return nreclaimed; + xfs_qm_dqdestroy(dqp); + nreclaimed++; + } + return nreclaimed; +} + +/* + * The kmem_shake interface is invoked when memory is running low. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_shake( + struct shrinker *shrink, + struct shrink_control *sc) +{ + int ndqused, nfree, n; + gfp_t gfp_mask = sc->gfp_mask; + + if (!kmem_shake_allow(gfp_mask)) + return 0; + if (!xfs_Gqm) + return 0; + + nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ + /* incore dquots in all f/s's */ + ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; + + ASSERT(ndqused >= 0); + + if (nfree <= ndqused && nfree < ndquot) + return 0; + + ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ + n = nfree - ndqused - ndquot; /* # over target */ + + return xfs_qm_shake_freelist(MAX(nfree, n)); +} + + +/*------------------------------------------------------------------*/ + +/* + * Return a new incore dquot. Depending on the number of + * dquots in the system, we either allocate a new one on the kernel heap, + * or reclaim a free one. + * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed + * to reclaim an existing one from the freelist. + */ +boolean_t +xfs_qm_dqalloc_incore( + xfs_dquot_t **O_dqpp) +{ + xfs_dquot_t *dqp; + + /* + * Check against high water mark to see if we want to pop + * a nincompoop dquot off the freelist. + */ + if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { + /* + * Try to recycle a dquot from the freelist. + */ + if ((dqp = xfs_qm_dqreclaim_one())) { + XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + /* + * Just zero the core here. The rest will get + * reinitialized by caller. XXX we shouldn't even + * do this zero ... + */ + memset(&dqp->q_core, 0, sizeof(dqp->q_core)); + *O_dqpp = dqp; + return B_FALSE; + } + XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + } + + /* + * Allocate a brand new dquot on the kernel heap and return it + * to the caller to initialize. + */ + ASSERT(xfs_Gqm->qm_dqzone != NULL); + *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + atomic_inc(&xfs_Gqm->qm_totaldquots); + + return B_TRUE; +} + + +/* + * Start a transaction and write the incore superblock changes to + * disk. flags parameter indicates which fields have changed. + */ +int +xfs_qm_write_sb_changes( + xfs_mount_t *mp, + __int64_t flags) +{ + xfs_trans_t *tp; + int error; + +#ifdef QUOTADEBUG + xfs_notice(mp, "Writing superblock quota changes"); +#endif + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + if ((error = xfs_trans_reserve(tp, 0, + mp->m_sb.sb_sectsize + 128, 0, + 0, + XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, flags); + error = xfs_trans_commit(tp, 0); + + return error; +} + + +/* --------------- utility functions for vnodeops ---------------- */ + + +/* + * Given an inode, a uid, gid and prid make sure that we have + * allocated relevant dquot(s) on disk, and that we won't exceed inode + * quotas by creating this file. + * This also attaches dquot(s) to the given inode after locking it, + * and returns the dquots corresponding to the uid and/or gid. 
+ * + * in : inode (unlocked) + * out : udquot, gdquot with references taken and unlocked + */ +int +xfs_qm_vop_dqalloc( + struct xfs_inode *ip, + uid_t uid, + gid_t gid, + prid_t prid, + uint flags, + struct xfs_dquot **O_udqpp, + struct xfs_dquot **O_gdqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *uq, *gq; + int error; + uint lockflags; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + lockflags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockflags); + + if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) + gid = ip->i_d.di_gid; + + /* + * Attach the dquot(s) to this inode, doing a dquot allocation + * if necessary. The dquot(s) will not be locked. + */ + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + if (error) { + xfs_iunlock(ip, lockflags); + return error; + } + } + + uq = gq = NULL; + if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + if (ip->i_d.di_uid != uid) { + /* + * What we need is the dquot that has this uid, and + * if we send the inode to dqget, the uid of the inode + * takes priority over what's sent in the uid argument. + * We must unlock inode here before calling dqget if + * we're not sending the inode, because otherwise + * we'll deadlock by doing trans_reserve while + * holding ilock. + */ + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + XFS_DQ_USER, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &uq))) { + ASSERT(error != ENOENT); + return error; + } + /* + * Get the ilock in the right order. + */ + xfs_dqunlock(uq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + /* + * Take an extra reference, because we'll return + * this to caller + */ + ASSERT(ip->i_udquot); + uq = ip->i_udquot; + xfs_dqlock(uq); + XFS_DQHOLD(uq); + xfs_dqunlock(uq); + } + } + if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + if (ip->i_d.di_gid != gid) { + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq))) { + if (uq) + xfs_qm_dqrele(uq); + ASSERT(error != ENOENT); + return error; + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = ip->i_gdquot; + xfs_dqlock(gq); + XFS_DQHOLD(gq); + xfs_dqunlock(gq); + } + } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + if (xfs_get_projid(ip) != prid) { + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + XFS_DQ_PROJ, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq))) { + if (uq) + xfs_qm_dqrele(uq); + ASSERT(error != ENOENT); + return (error); + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = ip->i_gdquot; + xfs_dqlock(gq); + XFS_DQHOLD(gq); + xfs_dqunlock(gq); + } + } + if (uq) + trace_xfs_dquot_dqalloc(ip); + + xfs_iunlock(ip, lockflags); + if (O_udqpp) + *O_udqpp = uq; + else if (uq) + xfs_qm_dqrele(uq); + if (O_gdqpp) + *O_gdqpp = gq; + else if (gq) + xfs_qm_dqrele(gq); + return 0; +} + +/* + * Actually transfer ownership, and do dquot modifications. + * These were already reserved. + */ +xfs_dquot_t * +xfs_qm_vop_chown( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t **IO_olddq, + xfs_dquot_t *newdq) +{ + xfs_dquot_t *prevdq; + uint bfield = XFS_IS_REALTIME_INODE(ip) ? 
+ XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + + /* old dquot */ + prevdq = *IO_olddq; + ASSERT(prevdq); + ASSERT(prevdq != newdq); + + xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); + + /* the sparkling new dquot */ + xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + + /* + * Take an extra reference, because the inode + * is going to keep this dquot pointer even + * after the trans_commit. + */ + xfs_dqlock(newdq); + XFS_DQHOLD(newdq); + xfs_dqunlock(newdq); + *IO_olddq = newdq; + + return prevdq; +} + +/* + * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). + */ +int +xfs_qm_vop_chown_reserve( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; + int error; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + delblks = ip->i_delayed_blks; + delblksudq = delblksgdq = unresudq = unresgdq = NULL; + blkflags = XFS_IS_REALTIME_INODE(ip) ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; + + if (XFS_IS_UQUOTA_ON(mp) && udqp && + ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { + delblksudq = udqp; + /* + * If there are delayed allocation blocks, then we have to + * unreserve those from the old dquot, and add them to the + * new dquot. + */ + if (delblks) { + ASSERT(ip->i_udquot); + unresudq = ip->i_udquot; + } + } + if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { + if (XFS_IS_PQUOTA_ON(ip->i_mount) && + xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) + prjflags = XFS_QMOPT_ENOSPC; + + if (prjflags || + (XFS_IS_GQUOTA_ON(ip->i_mount) && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { + delblksgdq = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + unresgdq = ip->i_gdquot; + } + } + } + + if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags))) + return (error); + + /* + * Do the delayed blks reservations/unreservations now. Since, these + * are done without the help of a transaction, if a reservation fails + * its previous reservations won't be automatically undone by trans + * code. So, we have to do it manually here. + */ + if (delblks) { + /* + * Do the reservations first. Unreservation can't fail. + */ + ASSERT(delblksudq || delblksgdq); + ASSERT(unresudq || unresgdq); + if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags))) + return (error); + xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, + blkflags); + } + + return (0); +} + +int +xfs_qm_vop_rename_dqattach( + struct xfs_inode **i_tab) +{ + struct xfs_mount *mp = i_tab[0]->i_mount; + int i; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + for (i = 0; (i < 4 && i_tab[i]); i++) { + struct xfs_inode *ip = i_tab[i]; + int error; + + /* + * Watch out for duplicate entries in the table. 
+ */ + if (i == 0 || ip != i_tab[i-1]) { + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + } + } + } + return 0; +} + +void +xfs_qm_vop_create_dqattach( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (udqp) { + xfs_dqlock(udqp); + XFS_DQHOLD(udqp); + xfs_dqunlock(udqp); + ASSERT(ip->i_udquot == NULL); + ip->i_udquot = udqp; + ASSERT(XFS_IS_UQUOTA_ON(mp)); + ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (gdqp) { + xfs_dqlock(gdqp); + XFS_DQHOLD(gdqp); + xfs_dqunlock(gdqp); + ASSERT(ip->i_gdquot == NULL); + ip->i_gdquot = gdqp; + ASSERT(XFS_IS_OQUOTA_ON(mp)); + ASSERT((XFS_IS_GQUOTA_ON(mp) ? + ip->i_d.di_gid : xfs_get_projid(ip)) == + be32_to_cpu(gdqp->q_core.d_id)); + xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); + } +} + diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h new file mode 100644 index 00000000..567b29b9 --- /dev/null +++ b/fs/xfs/quota/xfs_qm.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_H__ +#define __XFS_QM_H__ + +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_quota_priv.h" +#include "xfs_qm_stats.h" + +struct xfs_qm; +struct xfs_inode; + +extern uint ndquot; +extern struct mutex xfs_Gqm_lock; +extern struct xfs_qm *xfs_Gqm; +extern kmem_zone_t *qm_dqzone; +extern kmem_zone_t *qm_dqtrxzone; + +/* + * Used in xfs_qm_sync called by xfs_sync to count the max times that it can + * iterate over the mountpt's dquot list in one call. + */ +#define XFS_QM_SYNC_MAX_RESTARTS 7 + +/* + * Ditto, for xfs_qm_dqreclaim_one. + */ +#define XFS_QM_RECLAIM_MAX_RESTARTS 4 + +/* + * Ideal ratio of free to in use dquots. Quota manager makes an attempt + * to keep this balance. + */ +#define XFS_QM_DQFREE_RATIO 2 + +/* + * Dquot hashtable constants/threshold values. + */ +#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t)) +#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t)) + +/* + * This defines the unit of allocation of dquots. + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * XXXsup However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +typedef xfs_dqhash_t xfs_dqlist_t; + +/* + * Quota Manager (global) structure. Lives only in core. 
+ */ +typedef struct xfs_qm { + xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ + xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ + uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ + struct list_head qm_dqfrlist; /* freelist of dquots */ + struct mutex qm_dqfrlist_lock; + int qm_dqfrlist_cnt; + atomic_t qm_totaldquots; /* total incore dquots */ + uint qm_nrefs; /* file systems with quota on */ + int qm_dqfree_ratio;/* ratio of free to inuse dquots */ + kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ + kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ +} xfs_qm_t; + +/* + * Various quota information for individual filesystems. + * The mount structure keeps a pointer to this. + */ +typedef struct xfs_quotainfo { + xfs_inode_t *qi_uquotaip; /* user quota inode */ + xfs_inode_t *qi_gquotaip; /* group quota inode */ + struct list_head qi_dqlist; /* all dquots in filesys */ + struct mutex qi_dqlist_lock; + int qi_dquots; + int qi_dqreclaims; /* a change here indicates + a removal in the dqlist */ + time_t qi_btimelimit; /* limit for blks timer */ + time_t qi_itimelimit; /* limit for inodes timer */ + time_t qi_rtbtimelimit;/* limit for rt blks timer */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ + xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ +} xfs_quotainfo_t; + + +extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, + xfs_dquot_t *, xfs_dquot_t *, long, long, uint); +extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); +extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); + +/* + * We keep the usr and grp dquots separately so that locking will be easier + * to do at commit time. All transactions that we know of at this point + * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. + */ +#define XFS_QM_TRANS_MAXDQS 2 +typedef struct xfs_dquot_acct { + xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; + xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; +} xfs_dquot_acct_t; + +/* + * Users are allowed to have a usage exceeding their softlimit for + * a period this long. 
+ */ +#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ + +#define XFS_QM_BWARNLIMIT 5 +#define XFS_QM_IWARNLIMIT 5 +#define XFS_QM_RTBWARNLIMIT 5 + +extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); +extern int xfs_qm_quotacheck(xfs_mount_t *); +extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); + +/* dquot stuff */ +extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); +extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); +extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); + +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); +extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); +extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); +extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); + +#ifdef DEBUG +extern int xfs_qm_internalqcheck(xfs_mount_t *); +#else +#define xfs_qm_internalqcheck(mp) (0) +#endif + +#endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c new file mode 100644 index 00000000..a0a829ad --- /dev/null +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_qm.h" + + +STATIC void +xfs_fill_statvfs_from_dquot( + struct kstatfs *statp, + xfs_disk_dquot_t *dp) +{ + __uint64_t limit; + + limit = dp->d_blk_softlimit ? + be64_to_cpu(dp->d_blk_softlimit) : + be64_to_cpu(dp->d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; + statp->f_bfree = statp->f_bavail = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; + } + + limit = dp->d_ino_softlimit ? + be64_to_cpu(dp->d_ino_softlimit) : + be64_to_cpu(dp->d_ino_hardlimit); + if (limit && statp->f_files > limit) { + statp->f_files = limit; + statp->f_ffree = + (statp->f_files > be64_to_cpu(dp->d_icount)) ? + (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; + } +} + + +/* + * Directory tree accounting is implemented using project quotas, where + * the project identifier is inherited from parent directories. + * A statvfs (df, etc.) 
of a directory that is using project quota should + * return a statvfs of the project, not the entire filesystem. + * This makes such trees appear as if they are filesystems in themselves. + */ +void +xfs_qm_statvfs( + xfs_inode_t *ip, + struct kstatfs *statp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_dquot_t *dqp; + + if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); + xfs_qm_dqput(dqp); + } +} + +int +xfs_qm_newmount( + xfs_mount_t *mp, + uint *needquotamount, + uint *quotaflags) +{ + uint quotaondisk; + uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; + + quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); + + if (quotaondisk) { + uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; + gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; + } + + /* + * If the device itself is read-only, we can't allow + * the user to change the state of quota on the mount - + * this would generate a transaction on the ro device, + * which would lead to an I/O error and shutdown + */ + + if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || + (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) || + (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || + (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && + xfs_dev_is_read_only(mp, "changing quota state")) { + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (pquotaondisk ? " prjquota" : ""), + (gquotaondisk ? " grpquota" : "")); + return XFS_ERROR(EPERM); + } + + if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { + /* + * Call mount_quotas at this point only if we won't have to do + * a quotacheck. + */ + if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { + /* + * If an error occurred, qm_mount_quotas code + * has already disabled quotas. So, just finish + * mounting, and get on with the boring life + * without disk quotas. + */ + xfs_qm_mount_quotas(mp); + } else { + /* + * Clear the quota flags, but remember them. This + * is so that the quota code doesn't get invoked + * before we're ready. This can happen when an + * inode goes inactive and wants to free blocks, + * or via xfs_log_mount_finish. + */ + *needquotamount = B_TRUE; + *quotaflags = mp->m_qflags; + mp->m_qflags = 0; + } + } + + return 0; +} + +void __init +xfs_qm_init(void) +{ + printk(KERN_INFO "SGI XFS Quota Management subsystem\n"); + mutex_init(&xfs_Gqm_lock); + xfs_qm_init_procfs(); +} + +void __exit +xfs_qm_exit(void) +{ + xfs_qm_cleanup_procfs(); + if (qm_dqzone) + kmem_zone_destroy(qm_dqzone); + if (qm_dqtrxzone) + kmem_zone_destroy(qm_dqtrxzone); +} diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c new file mode 100644 index 00000000..8671a0b3 --- /dev/null +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_qm.h" + +struct xqmstats xqmstats; + +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + ndquot, + xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, + xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, + xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + /* quota performance statistics */ + seq_printf(m, "qm %u %u %u %u %u %u %u %u\n", + xqmstats.xs_qm_dqreclaims, + xqmstats.xs_qm_dqreclaim_misses, + xqmstats.xs_qm_dquot_dups, + xqmstats.xs_qm_dqcachemisses, + xqmstats.xs_qm_dqcachehits, + xqmstats.xs_qm_dqwants, + xqmstats.xs_qm_dqshake_reclaims, + xqmstats.xs_qm_dqinact_reclaims); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void +xfs_qm_init_procfs(void) +{ + proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops); + proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops); +} + +void +xfs_qm_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +} diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h new file mode 100644 index 00000000..5b964fc0 --- /dev/null +++ b/fs/xfs/quota/xfs_qm_stats.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2002 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_STATS_H__ +#define __XFS_QM_STATS_H__ + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +/* + * XQM global statistics + */ +struct xqmstats { + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; + __uint32_t xs_qm_dqshake_reclaims; + __uint32_t xs_qm_dqinact_reclaims; +}; + +extern struct xqmstats xqmstats; + +# define XQM_STATS_INC(count) ( (count)++ ) + +extern void xfs_qm_init_procfs(void); +extern void xfs_qm_cleanup_procfs(void); + +#else + +# define XQM_STATS_INC(count) do { } while (0) + +static inline void xfs_qm_init_procfs(void) { }; +static inline void xfs_qm_cleanup_procfs(void) { }; + +#endif + +#endif /* __XFS_QM_STATS_H__ */ diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c new file mode 100644 index 00000000..2dadb15d --- /dev/null +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -0,0 +1,1259 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_qm.h" +#include "xfs_trace.h" + +STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); +STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, + uint); +STATIC uint xfs_qm_export_flags(uint); +STATIC uint xfs_qm_export_qtype_flags(uint); +STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, + fs_disk_quota_t *); + + +/* + * Turn off quota accounting and/or enforcement for all udquots and/or + * gdquots. Called only at unmount time. + * + * This assumes that there are no dquots of this file system cached + * incore, and modifies the ondisk dquot directly. Therefore, for example, + * it is an error to call this twice, without purging the cache. + */ +int +xfs_qm_scall_quotaoff( + xfs_mount_t *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + uint dqtype; + int error; + uint inactivate_flags; + xfs_qoff_logitem_t *qoffstart; + int nculprits; + + /* + * No file system can have quotas enabled on disk but not in core. + * Note that quota utilities (like quotaoff) _expect_ + * errno == EEXIST here. 
+ */ + if ((mp->m_qflags & flags) == 0) + return XFS_ERROR(EEXIST); + error = 0; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + + /* + * We don't want to deal with two quotaoffs messing up each other, + * so we're going to serialize it. quotaoff isn't exactly a performance + * critical thing. + * If quotaoff, then we must be dealing with the root filesystem. + */ + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); + + /* + * If we're just turning off quota enforcement, change mp and go. + */ + if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { + mp->m_qflags &= ~(flags); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&q->qi_quotaofflock); + + /* XXX what to do if error ? Revert back to old vals incore ? */ + error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); + return (error); + } + + dqtype = 0; + inactivate_flags = 0; + /* + * If accounting is off, we must turn enforcement off, clear the + * quota 'CHKD' certificate to make it known that we have to + * do a quotacheck the next time this quota is turned on. + */ + if (flags & XFS_UQUOTA_ACCT) { + dqtype |= XFS_QMOPT_UQUOTA; + flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); + inactivate_flags |= XFS_UQUOTA_ACTIVE; + } + if (flags & XFS_GQUOTA_ACCT) { + dqtype |= XFS_QMOPT_GQUOTA; + flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + inactivate_flags |= XFS_GQUOTA_ACTIVE; + } else if (flags & XFS_PQUOTA_ACCT) { + dqtype |= XFS_QMOPT_PQUOTA; + flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + inactivate_flags |= XFS_PQUOTA_ACTIVE; + } + + /* + * Nothing to do? Don't complain. This happens when we're just + * turning off quota enforcement. + */ + if ((mp->m_qflags & flags) == 0) + goto out_unlock; + + /* + * Write the LI_QUOTAOFF log record, and do SB changes atomically, + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. + */ + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_unlock; + + /* + * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct + * to take care of the race between dqget and quotaoff. We don't take + * any special locks to reset these bits. All processes need to check + * these bits *after* taking inode lock(s) to see if the particular + * quota type is in the process of being turned off. If *ACTIVE, it is + * guaranteed that all dquot structures and all quotainode ptrs will all + * stay valid as long as that inode is kept locked. + * + * There is no turning back after this. + */ + mp->m_qflags &= ~inactivate_flags; + + /* + * Give back all the dquot reference(s) held by inodes. + * Here we go thru every single incore inode in this file system, and + * do a dqrele on the i_udquot/i_gdquot that it may have. + * Essentially, as long as somebody has an inode locked, this guarantees + * that quotas will not be turned off. This is handy because in a + * transaction once we lock the inode(s) and check for quotaon, we can + * depend on the quota inodes (and other things) being valid as long as + * we keep the lock(s). + */ + xfs_qm_dqrele_all_inodes(mp, flags); + + /* + * Next we make the changes in the quota flag in the mount struct. + * This isn't protected by a particular lock directly, because we + * don't want to take a mrlock every time we depend on quotas being on. + */ + mp->m_qflags &= ~(flags); + + /* + * Go through all the dquots of this file system and purge them, + * according to what was turned off. 
We may not be able to get rid + * of all dquots, because dquots can have temporary references that + * are not attached to inodes. eg. xfs_setattr, xfs_create. + * So, if we couldn't purge all the dquots from the filesystem, + * we can't get rid of the incore data structures. + */ + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) + delay(10 * nculprits); + + /* + * Transactions that had started before ACTIVE state bit was cleared + * could have logged many dquots, so they'd have higher LSNs than + * the first QUOTAOFF log record does. If we happen to crash when + * the tail of the log has gone past the QUOTAOFF record, but + * before the last dquot modification, those dquots __will__ + * recover, and that's not good. + * + * So, we have QUOTAOFF start and end logitems; the start + * logitem won't get overwritten until the end logitem appears... + */ + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_unlock; + } + + /* + * If quotas is completely disabled, close shop. + */ + if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || + ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { + mutex_unlock(&q->qi_quotaofflock); + xfs_qm_destroy_quotainfo(mp); + return (0); + } + + /* + * Release our quotainode references if we don't need them anymore. + */ + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; + } + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; + } + +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + +int +xfs_qm_scall_trunc_qfiles( + xfs_mount_t *mp, + uint flags) +{ + int error = 0, error2 = 0; + + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x\n", + __func__, flags, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + if (flags & XFS_DQ_USER) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) + error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + + return error ? error : error2; +} + +/* + * Switch on (a given) quota enforcement for a filesystem. This takes + * effect immediately. + * (Switching on quota accounting must be done at mount time.) 
+ */ +int +xfs_qm_scall_quotaon( + xfs_mount_t *mp, + uint flags) +{ + int error; + uint qf; + __int64_t sbflags; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + /* + * Switching on quota accounting must be done at mount time. + */ + flags &= ~(XFS_ALL_QUOTA_ACCT); + + sbflags = 0; + + if (flags == 0) { + xfs_debug(mp, "%s: zero flags, m_qflags=%x\n", + __func__, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + /* No fs can turn on quotas with a delayed effect */ + ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0); + + /* + * Can't enforce without accounting. We check the superblock + * qflags here instead of m_qflags because rootfs can have + * quota acct on ondisk without m_qflags' knowing. + */ + if (((flags & XFS_UQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) + || + ((flags & XFS_PQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_OQUOTA_ENFD))) { + xfs_debug(mp, + "%s: Can't enforce without acct, flags=%x sbflags=%x\n", + __func__, flags, mp->m_sb.sb_qflags); + return XFS_ERROR(EINVAL); + } + /* + * If everything's up to-date incore, then don't waste time. + */ + if ((mp->m_qflags & flags) == flags) + return XFS_ERROR(EEXIST); + + /* + * Change sb_qflags on disk but not incore mp->qflags + * if this is the root filesystem. + */ + spin_lock(&mp->m_sb_lock); + qf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = qf | flags; + spin_unlock(&mp->m_sb_lock); + + /* + * There's nothing to change if it's the same. + */ + if ((qf & flags) == flags && sbflags == 0) + return XFS_ERROR(EEXIST); + sbflags |= XFS_SB_QFLAGS; + + if ((error = xfs_qm_write_sb_changes(mp, sbflags))) + return (error); + /* + * If we aren't trying to switch on quota enforcement, we are done. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != + (mp->m_qflags & XFS_UQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != + (mp->m_qflags & XFS_PQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != + (mp->m_qflags & XFS_GQUOTA_ACCT)) || + (flags & XFS_ALL_QUOTA_ENFD) == 0) + return (0); + + if (! XFS_IS_QUOTA_RUNNING(mp)) + return XFS_ERROR(ESRCH); + + /* + * Switch on quota enforcement in core. + */ + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); + + return (0); +} + + +/* + * Return quota status information, such as uquota-off, enforcements, etc. 
+ */ +int +xfs_qm_scall_getqstat( + struct xfs_mount *mp, + struct fs_quota_stat *out) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip, *gip; + boolean_t tempuqip, tempgqip; + + uip = gip = NULL; + tempuqip = tempgqip = B_FALSE; + memset(out, 0, sizeof(fs_quota_stat_t)); + + out->qs_version = FS_QSTAT_VERSION; + if (!xfs_sb_version_hasquota(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + return (0); + } + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + out->qs_pad = 0; + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip) == 0) + tempuqip = B_TRUE; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip) == 0) + tempgqip = B_TRUE; + } + if (uip) { + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + IRELE(uip); + } + if (gip) { + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + IRELE(gip); + } + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + } + return 0; +} + +#define XFS_DQ_MASK \ + (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) + +/* + * Adjust quota limits, and start/stop timers accordingly. + */ +int +xfs_qm_scall_setqlim( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *newlim) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_disk_dquot_t *ddq; + xfs_dquot_t *dqp; + xfs_trans_t *tp; + int error; + xfs_qcnt_t hard, soft; + + if (newlim->d_fieldmask & ~XFS_DQ_MASK) + return EINVAL; + if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, + 0, 0, XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + /* + * We don't want to race with a quotaoff so take the quotaoff lock. + * (We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening). (XXXThis doesn't currently happen + * because we take the vfslock before calling xfs_qm_sysent). + */ + mutex_lock(&q->qi_quotaofflock); + + /* + * Get the dquot (locked), and join it to the transaction. + * Allocate the dquot if this doesn't exist. + */ + if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + ASSERT(error != ENOENT); + goto out_unlock; + } + xfs_trans_dqjoin(tp, dqp); + ddq = &dqp->q_core; + + /* + * Make sure that hardlimits are >= soft limits before changing. + */ + hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + be64_to_cpu(ddq->d_blk_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? 
+ (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + be64_to_cpu(ddq->d_blk_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_blk_hardlimit = cpu_to_be64(hard); + ddq->d_blk_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; + } + } else { + xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft); + } + hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + be64_to_cpu(ddq->d_rtb_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + be64_to_cpu(ddq->d_rtb_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_rtb_hardlimit = cpu_to_be64(hard); + ddq->d_rtb_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; + } + } else { + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft); + } + + hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + be64_to_cpu(ddq->d_ino_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + (xfs_qcnt_t) newlim->d_ino_softlimit : + be64_to_cpu(ddq->d_ino_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_ino_hardlimit = cpu_to_be64(hard); + ddq->d_ino_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; + } + } else { + xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft); + } + + /* + * Update warnings counter(s) if requested + */ + if (newlim->d_fieldmask & FS_DQ_BWARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); + if (newlim->d_fieldmask & FS_DQ_IWARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); + + if (id == 0) { + /* + * Timelimits for the super user set the relative time + * the other users can be over quota for this file system. + * If it is zero a default is used. Ditto for the default + * soft and hard limit values (already done, above), and + * for warnings. + */ + if (newlim->d_fieldmask & FS_DQ_BTIMER) { + q->qi_btimelimit = newlim->d_btimer; + ddq->d_btimer = cpu_to_be32(newlim->d_btimer); + } + if (newlim->d_fieldmask & FS_DQ_ITIMER) { + q->qi_itimelimit = newlim->d_itimer; + ddq->d_itimer = cpu_to_be32(newlim->d_itimer); + } + if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { + q->qi_rtbtimelimit = newlim->d_rtbtimer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); + } + if (newlim->d_fieldmask & FS_DQ_BWARNS) + q->qi_bwarnlimit = newlim->d_bwarns; + if (newlim->d_fieldmask & FS_DQ_IWARNS) + q->qi_iwarnlimit = newlim->d_iwarns; + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + q->qi_rtbwarnlimit = newlim->d_rtbwarns; + } else { + /* + * If the user is now over quota, start the timelimit. + * The user will not be 'warned'. + * Note that we keep the timers ticking, whether enforcement + * is on or off. We don't really want to bother with iterating + * over all ondisk dquots and turning the timers on/off. + */ + xfs_qm_adjust_dqtimers(mp, ddq); + } + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_trans_log_dquot(tp, dqp); + + error = xfs_trans_commit(tp, 0); + xfs_qm_dqprint(dqp); + xfs_qm_dqrele(dqp); + + out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +int +xfs_qm_scall_getquota( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *out) +{ + xfs_dquot_t *dqp; + int error; + + /* + * Try to get the dquot. 
We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) { + return (error); + } + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_qm_dqput(dqp); + return XFS_ERROR(ENOENT); + } + /* xfs_qm_dqprint(dqp); */ + /* + * Convert the disk dquot to the exportable format + */ + xfs_qm_export_dquot(mp, &dqp->q_core, out); + xfs_qm_dqput(dqp); + return (error ? XFS_ERROR(EFAULT) : 0); +} + + +STATIC int +xfs_qm_log_quotaoff_end( + xfs_mount_t *mp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); + + if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, + 0, 0, XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return (error); +} + + +STATIC int +xfs_qm_log_quotaoff( + xfs_mount_t *mp, + xfs_qoff_logitem_t **qoffstartp, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi=NULL; + uint oldsbqflag=0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); + if ((error = xfs_trans_reserve(tp, 0, + sizeof(xfs_qoff_logitem_t) * 2 + + mp->m_sb.sb_sectsize + 128, + 0, + 0, + XFS_DEFAULT_LOG_COUNT))) { + goto error0; + } + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + oldsbqflag = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_mod_sb(tp, XFS_SB_QFLAGS); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +error0: + if (error) { + xfs_trans_cancel(tp, 0); + /* + * No one else is modifying sb_qflags, so this is OK. + * We still hold the quotaofflock. + */ + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = oldsbqflag; + spin_unlock(&mp->m_sb_lock); + } + *qoffstartp = qoffi; + return (error); +} + + +/* + * Translate an internal style on-disk-dquot to the exportable format. + * The main differences are that the counters/limits are all in Basic + * Blocks (BBs) instead of the internal FSBs, and all on-disk data has + * to be converted to the native endianness. 
+ */ +STATIC void +xfs_qm_export_dquot( + xfs_mount_t *mp, + xfs_disk_dquot_t *src, + struct fs_disk_quota *dst) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ + dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); + dst->d_id = be32_to_cpu(src->d_id); + dst->d_blk_hardlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); + dst->d_blk_softlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); + dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); + dst->d_icount = be64_to_cpu(src->d_icount); + dst->d_btimer = be32_to_cpu(src->d_btimer); + dst->d_itimer = be32_to_cpu(src->d_itimer); + dst->d_iwarns = be16_to_cpu(src->d_iwarns); + dst->d_bwarns = be16_to_cpu(src->d_bwarns); + dst->d_rtb_hardlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); + dst->d_rtb_softlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); + dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); + dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); + dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); + + /* + * Internally, we don't reset all the timers when quota enforcement + * gets turned off. No need to confuse the user level code, + * so return zeroes in that case. + */ + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || + (!XFS_IS_OQUOTA_ENFORCED(mp) && + (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + dst->d_btimer = 0; + dst->d_itimer = 0; + dst->d_rtbtimer = 0; + } + +#ifdef DEBUG + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || + (XFS_IS_OQUOTA_ENFORCED(mp) && + (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && + dst->d_id != 0) { + if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + (dst->d_blk_softlimit > 0)) { + ASSERT(dst->d_btimer != 0); + } + if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + (dst->d_ino_softlimit > 0)) { + ASSERT(dst->d_itimer != 0); + } + } +#endif +} + +STATIC uint +xfs_qm_export_qtype_flags( + uint flags) +{ + /* + * Can't be more than one, or none. + */ + ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != + (FS_PROJ_QUOTA | FS_USER_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != + (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != + (FS_USER_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); + + return (flags & XFS_DQ_USER) ? + FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? + FS_PROJ_QUOTA : FS_GROUP_QUOTA; +} + +STATIC uint +xfs_qm_export_flags( + uint flags) +{ + uint uflags; + + uflags = 0; + if (flags & XFS_UQUOTA_ACCT) + uflags |= FS_QUOTA_UDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= FS_QUOTA_PDQ_ACCT; + if (flags & XFS_GQUOTA_ACCT) + uflags |= FS_QUOTA_GDQ_ACCT; + if (flags & XFS_UQUOTA_ENFD) + uflags |= FS_QUOTA_UDQ_ENFD; + if (flags & (XFS_OQUOTA_ENFD)) { + uflags |= (flags & XFS_GQUOTA_ACCT) ? 
+ FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; + } + return (uflags); +} + + +STATIC int +xfs_dqrele_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + /* skip quota inodes */ + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + return 0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. + */ +void +xfs_qm_dqrele_all_inodes( + struct xfs_mount *mp, + uint flags) +{ + ASSERT(mp->m_quotainfo); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); +} + +/*------------------------------------------------------------------------*/ +#ifdef DEBUG +/* + * This contains all the test functions for XFS disk quotas. + * Currently it does a quota accounting check. ie. it walks through + * all inodes in the file system, calculating the dquot accounting fields, + * and prints out any inconsistencies. + */ +xfs_dqhash_t *qmtest_udqtab; +xfs_dqhash_t *qmtest_gdqtab; +int qmtest_hashmask; +int qmtest_nfails; +struct mutex qcheck_lock; + +#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ + (__psunsigned_t)(id)) & \ + (qmtest_hashmask - 1)) + +#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \ + (qmtest_udqtab + \ + DQTEST_HASHVAL(mp, id)) : \ + (qmtest_gdqtab + \ + DQTEST_HASHVAL(mp, id))) + +#define DQTEST_LIST_PRINT(l, NXT, title) \ +{ \ + xfs_dqtest_t *dqp; int i = 0;\ + xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \ + for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ + dqp = (xfs_dqtest_t *)dqp->NXT) { \ + xfs_debug(dqp->q_mount, \ + " %d. 
\"%d (%s)\" bcnt = %d, icnt = %d", \ + ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ + dqp->d_bcount, dqp->d_icount); } \ +} + +typedef struct dqtest { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_hashlist; + xfs_dqhash_t *q_hash; /* the hashchain header */ + xfs_mount_t *q_mount; /* filesystem this relates to */ + xfs_dqid_t d_id; /* user id or group id */ + xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */ + xfs_qcnt_t d_icount; /* # inodes owned by the user */ +} xfs_dqtest_t; + +STATIC void +xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) +{ + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + h->qh_nelems++; +} +STATIC void +xfs_qm_dqtest_print( + struct xfs_mount *mp, + struct dqtest *d) +{ + xfs_debug(mp, "-----------DQTEST DQUOT----------------"); + xfs_debug(mp, "---- dquot ID = %d", d->d_id); + xfs_debug(mp, "---- fs = 0x%p", d->q_mount); + xfs_debug(mp, "---- bcount = %Lu (0x%x)", + d->d_bcount, (int)d->d_bcount); + xfs_debug(mp, "---- icount = %Lu (0x%x)", + d->d_icount, (int)d->d_icount); + xfs_debug(mp, "---------------------------"); +} + +STATIC void +xfs_qm_dqtest_failed( + xfs_dqtest_t *d, + xfs_dquot_t *dqp, + char *reason, + xfs_qcnt_t a, + xfs_qcnt_t b, + int error) +{ + qmtest_nfails++; + if (error) + xfs_debug(dqp->q_mount, + "quotacheck failed id=%d, err=%d\nreason: %s", + d->d_id, error, reason); + else + xfs_debug(dqp->q_mount, + "quotacheck failed id=%d (%s) [%d != %d]", + d->d_id, reason, (int)a, (int)b); + xfs_qm_dqtest_print(dqp->q_mount, d); + if (dqp) + xfs_qm_dqprint(dqp); +} + +STATIC int +xfs_dqtest_cmp2( + xfs_dqtest_t *d, + xfs_dquot_t *dqp) +{ + int err = 0; + if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) { + xfs_qm_dqtest_failed(d, dqp, "icount mismatch", + be64_to_cpu(dqp->q_core.d_icount), + d->d_icount, 0); + err++; + } + if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) { + xfs_qm_dqtest_failed(d, dqp, "bcount mismatch", + be64_to_cpu(dqp->q_core.d_bcount), + d->d_bcount, 0); + err++; + } + if (dqp->q_core.d_blk_softlimit && + be64_to_cpu(dqp->q_core.d_bcount) >= + be64_to_cpu(dqp->q_core.d_blk_softlimit)) { + if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { + xfs_debug(dqp->q_mount, + "%d [%s] BLK TIMER NOT STARTED", + d->d_id, DQFLAGTO_TYPESTR(d)); + err++; + } + } + if (dqp->q_core.d_ino_softlimit && + be64_to_cpu(dqp->q_core.d_icount) >= + be64_to_cpu(dqp->q_core.d_ino_softlimit)) { + if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { + xfs_debug(dqp->q_mount, + "%d [%s] INO TIMER NOT STARTED", + d->d_id, DQFLAGTO_TYPESTR(d)); + err++; + } + } +#ifdef QUOTADEBUG + if (!err) { + xfs_debug(dqp->q_mount, "%d [%s] qchecked", + d->d_id, DQFLAGTO_TYPESTR(d)); + } +#endif + return (err); +} + +STATIC void +xfs_dqtest_cmp( + xfs_dqtest_t *d) +{ + xfs_dquot_t *dqp; + int error; + + /* xfs_qm_dqtest_print(d); */ + if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0, + &dqp))) { + xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error); + return; + } + xfs_dqtest_cmp2(d, dqp); + xfs_qm_dqput(dqp); +} + +STATIC int +xfs_qm_internalqcheck_dqget( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_dqtest_t **O_dq) +{ + xfs_dqtest_t *d; + xfs_dqhash_t *h; + + h = DQTEST_HASH(mp, id, type); + list_for_each_entry(d, &h->qh_list, q_hashlist) { + if (d->d_id == id && mp == d->q_mount) { + *O_dq = d; + return (0); + } + } + d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP); + d->dq_flags = type; + d->d_id = id; + d->q_mount = mp; + d->q_hash = h; + INIT_LIST_HEAD(&d->q_hashlist); + 
xfs_qm_hashinsert(h, d); + *O_dq = d; + return (0); +} + +STATIC void +xfs_qm_internalqcheck_get_dquots( + xfs_mount_t *mp, + xfs_dqid_t uid, + xfs_dqid_t projid, + xfs_dqid_t gid, + xfs_dqtest_t **ud, + xfs_dqtest_t **gd) +{ + if (XFS_IS_UQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud); + if (XFS_IS_GQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd); + else if (XFS_IS_PQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd); +} + + +STATIC void +xfs_qm_internalqcheck_dqadjust( + xfs_inode_t *ip, + xfs_dqtest_t *d) +{ + d->d_icount++; + d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks; +} + +STATIC int +xfs_qm_internalqcheck_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* bulkstat result code */ +{ + xfs_inode_t *ip; + xfs_dqtest_t *ud, *gd; + uint lock_flags; + boolean_t ipreleased; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + *res = BULKSTAT_RV_NOTHING; + xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n", + __func__, (unsigned long long) ino, + (unsigned long long) mp->m_sb.sb_uquotino, + (unsigned long long) mp->m_sb.sb_gquotino); + return XFS_ERROR(EINVAL); + } + ipreleased = B_FALSE; + again: + lock_flags = XFS_ILOCK_SHARED; + if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) { + *res = BULKSTAT_RV_NOTHING; + return (error); + } + + /* + * This inode can have blocks after eof which can get released + * when we send it to inactive. Since we don't check the dquot + * until the after all our calculations are done, we must get rid + * of those now. + */ + if (! ipreleased) { + xfs_iunlock(ip, lock_flags); + IRELE(ip); + ipreleased = B_TRUE; + goto again; + } + xfs_qm_internalqcheck_get_dquots(mp, + (xfs_dqid_t) ip->i_d.di_uid, + (xfs_dqid_t) xfs_get_projid(ip), + (xfs_dqid_t) ip->i_d.di_gid, + &ud, &gd); + if (XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ud); + xfs_qm_internalqcheck_dqadjust(ip, ud); + } + if (XFS_IS_OQUOTA_ON(mp)) { + ASSERT(gd); + xfs_qm_internalqcheck_dqadjust(ip, gd); + } + xfs_iunlock(ip, lock_flags); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return (0); +} + + +/* PRIVATE, debugging */ +int +xfs_qm_internalqcheck( + xfs_mount_t *mp) +{ + xfs_ino_t lastino; + int done, count; + int i; + int error; + + lastino = 0; + qmtest_hashmask = 32; + count = 5; + done = 0; + qmtest_nfails = 0; + + if (! XFS_IS_QUOTA_ON(mp)) + return XFS_ERROR(ESRCH); + + xfs_log_force(mp, XFS_LOG_SYNC); + XFS_bflush(mp->m_ddev_targp); + xfs_log_force(mp, XFS_LOG_SYNC); + XFS_bflush(mp->m_ddev_targp); + + mutex_lock(&qcheck_lock); + /* There should be absolutely no quota activity while this + is going on. 
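[Annotation] The debug quotacheck above works in two passes: bulkstat every inode, charging one inode and di_nblocks blocks to the owning user and group/project test dquots, then compare the accumulated totals against the on-disk dquots. A simplified, hypothetical mirror of the charging step:

#include <stdint.h>

/* Simplified stand-in for xfs_qm_internalqcheck_dqadjust():
 * charge one inode and its block count to the accumulator for a quota id. */
struct qcheck_acct {
	uint64_t icount;	/* inodes owned by this id */
	uint64_t bcount;	/* blocks owned by this id */
};

static void qcheck_charge(struct qcheck_acct *acct, uint64_t di_nblocks)
{
	acct->icount++;
	acct->bcount += di_nblocks;
}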
*/ + qmtest_udqtab = kmem_zalloc(qmtest_hashmask * + sizeof(xfs_dqhash_t), KM_SLEEP); + qmtest_gdqtab = kmem_zalloc(qmtest_hashmask * + sizeof(xfs_dqhash_t), KM_SLEEP); + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_internalqcheck_adjust, + 0, NULL, &done); + if (error) { + xfs_debug(mp, "Bulkstat returned error 0x%x", error); + break; + } + } while (!done); + + xfs_debug(mp, "Checking results against system dquots"); + for (i = 0; i < qmtest_hashmask; i++) { + xfs_dqtest_t *d, *n; + xfs_dqhash_t *h; + + h = &qmtest_udqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { + xfs_dqtest_cmp(d); + kmem_free(d); + } + h = &qmtest_gdqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { + xfs_dqtest_cmp(d); + kmem_free(d); + } + } + + if (qmtest_nfails) { + xfs_debug(mp, "******** quotacheck failed ********"); + xfs_debug(mp, "failures = %d", qmtest_nfails); + } else { + xfs_debug(mp, "******** quotacheck successful! ********"); + } + kmem_free(qmtest_udqtab); + kmem_free(qmtest_gdqtab); + mutex_unlock(&qcheck_lock); + return (qmtest_nfails); +} + +#endif /* DEBUG */ diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h new file mode 100644 index 00000000..94a3d927 --- /dev/null +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_PRIV_H__ +#define __XFS_QUOTA_PRIV_H__ + +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +/* + * Hash into a bucket in the dquot hash table, based on . + */ +#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ + (__psunsigned_t)(id)) & \ + (xfs_Gqm->qm_dqhashmask - 1)) +#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \ + (xfs_Gqm->qm_usr_dqhtable + \ + XFS_DQ_HASHVAL(mp, id)) : \ + (xfs_Gqm->qm_grp_dqhtable + \ + XFS_DQ_HASHVAL(mp, id))) +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ + (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ + (((d)->dq_flags & XFS_DQ_PROJ) ? 
"PRJ":"???"))) + +#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c new file mode 100644 index 00000000..2a364873 --- /dev/null +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -0,0 +1,895 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" + +STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); + +/* + * Add the locked dquot to the transaction. + * The dquot must be locked, and it cannot be associated with any + * transaction. + */ +void +xfs_trans_dqjoin( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp != tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_logitem.qli_dquot == dqp); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); + + /* + * Initialize i_transp so we can later determine if this dquot is + * associated with this transaction. + */ + dqp->q_transp = tp; +} + + +/* + * This is called to mark the dquot as needing + * to be logged when the transaction is committed. The dquot must + * already be associated with the given transaction. + * Note that it marks the entire transaction as dirty. In the ordinary + * case, this gets called via xfs_trans_commit, after the transaction + * is already dirty. However, there's nothing stop this from getting + * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY + * flag. + */ +void +xfs_trans_log_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp == tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + tp->t_flags |= XFS_TRANS_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +/* + * Carry forward whatever is left of the quota blk reservation to + * the spanky new transaction + */ +void +xfs_trans_dup_dqinfo( + xfs_trans_t *otp, + xfs_trans_t *ntp) +{ + xfs_dqtrx_t *oq, *nq; + int i,j; + xfs_dqtrx_t *oqa, *nqa; + + if (!otp->t_dqinfo) + return; + + xfs_trans_alloc_dqinfo(ntp); + oqa = otp->t_dqinfo->dqa_usrdquots; + nqa = ntp->t_dqinfo->dqa_usrdquots; + + /* + * Because the quota blk reservation is carried forward, + * it is also necessary to carry forward the DQ_DIRTY flag. 
+ */ + if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + ntp->t_flags |= XFS_TRANS_DQ_DIRTY; + + for (j = 0; j < 2; j++) { + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (oqa[i].qt_dquot == NULL) + break; + oq = &oqa[i]; + nq = &nqa[i]; + + nq->qt_dquot = oq->qt_dquot; + nq->qt_bcount_delta = nq->qt_icount_delta = 0; + nq->qt_rtbcount_delta = 0; + + /* + * Transfer whatever is left of the reservations. + */ + nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; + oq->qt_blk_res = oq->qt_blk_res_used; + + nq->qt_rtblk_res = oq->qt_rtblk_res - + oq->qt_rtblk_res_used; + oq->qt_rtblk_res = oq->qt_rtblk_res_used; + + nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used; + oq->qt_ino_res = oq->qt_ino_res_used; + + } + oqa = otp->t_dqinfo->dqa_grpdquots; + nqa = ntp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * Wrap around mod_dquot to account for both user and group quotas. + */ +void +xfs_trans_mod_dquot_byino( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint field, + long delta) +{ + xfs_mount_t *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) + (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); + if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); +} + +STATIC xfs_dqtrx_t * +xfs_trans_get_dqtrx( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + int i; + xfs_dqtrx_t *qa; + + qa = XFS_QM_ISUDQ(dqp) ? + tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (qa[i].qt_dquot == NULL || + qa[i].qt_dquot == dqp) + return &qa[i]; + } + + return NULL; +} + +/* + * Make the changes in the transaction structure. + * The moral equivalent to xfs_trans_mod_sb(). + * We don't touch any fields in the dquot, so we don't care + * if it's locked or not (most of the time it won't be). + */ +void +xfs_trans_mod_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp, + uint field, + long delta) +{ + xfs_dqtrx_t *qtrx; + + ASSERT(tp); + ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + qtrx = NULL; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + /* + * Find either the first free slot or the slot that belongs + * to this dquot. + */ + qtrx = xfs_trans_get_dqtrx(tp, dqp); + ASSERT(qtrx); + if (qtrx->qt_dquot == NULL) + qtrx->qt_dquot = dqp; + + switch (field) { + + /* + * regular disk blk reservation + */ + case XFS_TRANS_DQ_RES_BLKS: + qtrx->qt_blk_res += (ulong)delta; + break; + + /* + * inode reservation + */ + case XFS_TRANS_DQ_RES_INOS: + qtrx->qt_ino_res += (ulong)delta; + break; + + /* + * disk blocks used. 
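[Annotation] xfs_trans_get_dqtrx() above relies on the per-transaction dquot array being filled densely: it returns either the slot already bound to this dquot or the first empty slot. A standalone sketch of that find-or-claim pattern, with hypothetical names:

#include <stddef.h>

#define MAXDQS 2	/* stands in for XFS_QM_TRANS_MAXDQS */

struct slot { void *dquot; long bcount_delta; };

/* Return the slot already owned by dqp, or the first free slot, or NULL. */
static struct slot *find_or_claim(struct slot *qa, void *dqp)
{
	int i;

	for (i = 0; i < MAXDQS; i++) {
		if (qa[i].dquot == NULL || qa[i].dquot == dqp)
			return &qa[i];
	}
	return NULL;
}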
+ */ + case XFS_TRANS_DQ_BCOUNT: + if (qtrx->qt_blk_res && delta > 0) { + qtrx->qt_blk_res_used += (ulong)delta; + ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); + } + qtrx->qt_bcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELBCOUNT: + qtrx->qt_delbcnt_delta += delta; + break; + + /* + * Inode Count + */ + case XFS_TRANS_DQ_ICOUNT: + if (qtrx->qt_ino_res && delta > 0) { + qtrx->qt_ino_res_used += (ulong)delta; + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + } + qtrx->qt_icount_delta += delta; + break; + + /* + * rtblk reservation + */ + case XFS_TRANS_DQ_RES_RTBLKS: + qtrx->qt_rtblk_res += (ulong)delta; + break; + + /* + * rtblk count + */ + case XFS_TRANS_DQ_RTBCOUNT: + if (qtrx->qt_rtblk_res && delta > 0) { + qtrx->qt_rtblk_res_used += (ulong)delta; + ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); + } + qtrx->qt_rtbcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELRTBCOUNT: + qtrx->qt_delrtb_delta += delta; + break; + + default: + ASSERT(0); + } + tp->t_flags |= XFS_TRANS_DQ_DIRTY; +} + + +/* + * Given an array of dqtrx structures, lock all the dquots associated + * and join them to the transaction, provided they have been modified. + * We know that the highest number of dquots (of one type - usr OR grp), + * involved in a transaction is 2 and that both usr and grp combined - 3. + * So, we don't attempt to make this very generic. + */ +STATIC void +xfs_trans_dqlockedjoin( + xfs_trans_t *tp, + xfs_dqtrx_t *q) +{ + ASSERT(q[0].qt_dquot != NULL); + if (q[1].qt_dquot == NULL) { + xfs_dqlock(q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + } else { + ASSERT(XFS_QM_TRANS_MAXDQS == 2); + xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[1].qt_dquot); + } +} + + +/* + * Called by xfs_trans_commit() and similar in spirit to + * xfs_trans_apply_sb_deltas(). + * Go thru all the dquots belonging to this transaction and modify the + * INCORE dquot to reflect the actual usages. + * Unreserve just the reservations done by this transaction. + * dquot is still left locked at exit. + */ +void +xfs_trans_apply_dquot_deltas( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + xfs_disk_dquot_t *d; + long totalbdelta; + long totalrtbdelta; + + if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + ASSERT(tp->t_dqinfo); + qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < 2; j++) { + if (qa[0].qt_dquot == NULL) { + qa = tp->t_dqinfo->dqa_grpdquots; + continue; + } + + /* + * Lock all of the dquots and join them to the transaction. + */ + xfs_trans_dqlockedjoin(tp, qa); + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * The array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_transp == tp); + + /* + * adjust the actual number of blocks used + */ + d = &dqp->q_core; + + /* + * The issue here is - sometimes we don't make a blkquota + * reservation intentionally to be fair to users + * (when the amount is small). On the other hand, + * delayed allocs do make reservations, but that's + * outside of a transaction, so we have no + * idea how much was really reserved. + * So, here we've accumulated delayed allocation blks and + * non-delay blks. The assumption is that the + * delayed ones are always reserved (outside of a + * transaction), and the others may or may not have + * quota reservations. 
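[Annotation] The deltas gathered by xfs_trans_mod_dquot() are only folded into the incore copy of the on-disk dquot at commit time, with the ordinary and delayed-allocation block deltas summed first. A simplified host-endian sketch of that folding (the real code below uses be64_add_cpu() on the big-endian fields):

#include <stdint.h>

struct dq_counts { uint64_t bcount; uint64_t icount; };

/* Fold per-transaction deltas into the dquot counters at commit. */
static void apply_deltas(struct dq_counts *d,
			 long bcount_delta, long delbcnt_delta, long icount_delta)
{
	long totalbdelta = bcount_delta + delbcnt_delta;	/* e.g. 8 + 4 = 12 */

	d->bcount += totalbdelta;	/* negative deltas shrink the count */
	d->icount += icount_delta;
}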
+ */ + totalbdelta = qtrx->qt_bcount_delta + + qtrx->qt_delbcnt_delta; + totalrtbdelta = qtrx->qt_rtbcount_delta + + qtrx->qt_delrtb_delta; +#ifdef QUOTADEBUG + if (totalbdelta < 0) + ASSERT(be64_to_cpu(d->d_bcount) >= + (xfs_qcnt_t) -totalbdelta); + + if (totalrtbdelta < 0) + ASSERT(be64_to_cpu(d->d_rtbcount) >= + (xfs_qcnt_t) -totalrtbdelta); + + if (qtrx->qt_icount_delta < 0) + ASSERT(be64_to_cpu(d->d_icount) >= + (xfs_qcnt_t) -qtrx->qt_icount_delta); +#endif + if (totalbdelta) + be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + + if (qtrx->qt_icount_delta) + be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + + if (totalrtbdelta) + be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + + /* + * Get any default limits in use. + * Start/reset the timer(s) if needed. + */ + if (d->d_id) { + xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqtimers(tp->t_mountp, d); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + /* + * add this to the list of items to get logged + */ + xfs_trans_log_dquot(tp, dqp); + /* + * Take off what's left of the original reservation. + * In case of delayed allocations, there's no + * reservation that a transaction structure knows of. + */ + if (qtrx->qt_blk_res != 0) { + if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_blk_res > + qtrx->qt_blk_res_used) + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res - + qtrx->qt_blk_res_used); + else + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res_used - + qtrx->qt_blk_res); + } + } else { + /* + * These blks were never reserved, either inside + * a transaction or outside one (in a delayed + * allocation). Also, this isn't always a + * negative number since we sometimes + * deliberately skip quota reservations. + */ + if (qtrx->qt_bcount_delta) { + dqp->q_res_bcount += + (xfs_qcnt_t)qtrx->qt_bcount_delta; + } + } + /* + * Adjust the RT reservation. + */ + if (qtrx->qt_rtblk_res != 0) { + if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { + if (qtrx->qt_rtblk_res > + qtrx->qt_rtblk_res_used) + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res - + qtrx->qt_rtblk_res_used); + else + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res_used - + qtrx->qt_rtblk_res); + } + } else { + if (qtrx->qt_rtbcount_delta) + dqp->q_res_rtbcount += + (xfs_qcnt_t)qtrx->qt_rtbcount_delta; + } + + /* + * Adjust the inode reservation. + */ + if (qtrx->qt_ino_res != 0) { + ASSERT(qtrx->qt_ino_res >= + qtrx->qt_ino_res_used); + if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) + dqp->q_res_icount -= (xfs_qcnt_t) + (qtrx->qt_ino_res - + qtrx->qt_ino_res_used); + } else { + if (qtrx->qt_icount_delta) + dqp->q_res_icount += + (xfs_qcnt_t)qtrx->qt_icount_delta; + } + + ASSERT(dqp->q_res_bcount >= + be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_icount >= + be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_res_rtbcount >= + be64_to_cpu(dqp->q_core.d_rtbcount)); + } + /* + * Do the group quotas next + */ + qa = tp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * Release the reservations, and adjust the dquots accordingly. + * This is called only when the transaction is being aborted. If by + * any chance we have done dquot modifications incore (ie. deltas) already, + * we simply throw those away, since that's the expected behavior + * when a transaction is curtailed without a commit. 
+ */ +void +xfs_trans_unreserve_and_mod_dquots( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + boolean_t locked; + + if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + qa = tp->t_dqinfo->dqa_usrdquots; + + for (j = 0; j < 2; j++) { + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * We assume that the array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + /* + * Unreserve the original reservation. We don't care + * about the number of blocks used field, or deltas. + * Also we don't bother to zero the fields. + */ + locked = B_FALSE; + if (qtrx->qt_blk_res) { + xfs_dqlock(dqp); + locked = B_TRUE; + dqp->q_res_bcount -= + (xfs_qcnt_t)qtrx->qt_blk_res; + } + if (qtrx->qt_ino_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = B_TRUE; + } + dqp->q_res_icount -= + (xfs_qcnt_t)qtrx->qt_ino_res; + } + + if (qtrx->qt_rtblk_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = B_TRUE; + } + dqp->q_res_rtbcount -= + (xfs_qcnt_t)qtrx->qt_rtblk_res; + } + if (locked) + xfs_dqunlock(dqp); + + } + qa = tp->t_dqinfo->dqa_grpdquots; + } +} + +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) +{ + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, + type); +} + +/* + * This reserves disk blocks and inodes against a dquot. + * Flags indicate if the dquot is to be locked here and also + * if the blk reservation is for RT or regular blocks. + * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. + */ +STATIC int +xfs_trans_dqresv( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + long nblks, + long ninos, + uint flags) +{ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t count; + xfs_qcnt_t *resbcountp; + xfs_quotainfo_t *q = mp->m_quotainfo; + + + xfs_dqlock(dqp); + + if (flags & XFS_TRANS_DQ_RES_BLKS) { + hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (!hardlimit) + hardlimit = q->qi_bhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!softlimit) + softlimit = q->qi_bsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_btimer); + warns = be16_to_cpu(dqp->q_core.d_bwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; + resbcountp = &dqp->q_res_bcount; + } else { + ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); + hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); + if (!hardlimit) + hardlimit = q->qi_rtbhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); + if (!softlimit) + softlimit = q->qi_rtbsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; + resbcountp = &dqp->q_res_rtbcount; + } + + if ((flags & XFS_QMOPT_FORCE_RES) == 0 && + dqp->q_core.d_id && + ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || + (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && + (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { +#ifdef QUOTADEBUG + xfs_debug(mp, + "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?", + nblks, *resbcountp, hardlimit); +#endif + if (nblks > 0) { + /* + * dquot is locked already. See if we'd go over the + * hardlimit or exceed the timelimit if we allocate + * nblks. 
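[Annotation] The enforcement rule that the block/inode checks below implement: an allocation that would reach the hard limit is always refused; one that only reaches the soft limit is refused only once the grace timer has expired or the warning budget is exhausted, and otherwise merely triggers a warning. A condensed, hypothetical restatement of that decision (limits of 0 mean "no limit", as in the code):

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static bool quota_would_deny(uint64_t resv, int64_t nblks,
			     uint64_t hard, uint64_t soft,
			     time_t timer, unsigned warns, unsigned warnlimit,
			     time_t now)
{
	if (nblks <= 0)
		return false;
	if (hard && resv + (uint64_t)nblks >= hard)
		return true;				/* hard limit: always deny */
	if (soft && resv + (uint64_t)nblks >= soft)
		return (timer && now > timer) ||	/* grace period expired    */
		       (warns && warns >= warnlimit);	/* out of warnings         */
	return false;					/* under soft limit        */
}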
+ */ + if (hardlimit > 0ULL && + hardlimit <= nblks + *resbcountp) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); + goto error_return; + } + if (softlimit > 0ULL && + softlimit <= nblks + *resbcountp) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); + goto error_return; + } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); + } + } + if (ninos > 0) { + count = be64_to_cpu(dqp->q_core.d_icount); + timer = be32_to_cpu(dqp->q_core.d_itimer); + warns = be16_to_cpu(dqp->q_core.d_iwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; + hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (!hardlimit) + hardlimit = q->qi_ihardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + if (!softlimit) + softlimit = q->qi_isoftlimit; + + if (hardlimit > 0ULL && count >= hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); + goto error_return; + } + if (softlimit > 0ULL && count >= softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); + goto error_return; + } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); + } + } + } + + /* + * Change the reservation, but not the actual usage. + * Note that q_res_bcount = q_core.d_bcount + resv + */ + (*resbcountp) += (xfs_qcnt_t)nblks; + if (ninos != 0) + dqp->q_res_icount += (xfs_qcnt_t)ninos; + + /* + * note the reservation amt in the trans struct too, + * so that the transaction knows how much was reserved by + * it against this particular dquot. + * We don't do this when we are reserving for a delayed allocation, + * because we don't have the luxury of a transaction envelope then. + */ + if (tp) { + ASSERT(tp->t_dqinfo); + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + if (nblks != 0) + xfs_trans_mod_dquot(tp, dqp, + flags & XFS_QMOPT_RESBLK_MASK, + nblks); + if (ninos != 0) + xfs_trans_mod_dquot(tp, dqp, + XFS_TRANS_DQ_RES_INOS, + ninos); + } + ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); + ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + + xfs_dqunlock(dqp); + return 0; + +error_return: + xfs_dqunlock(dqp); + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; +} + + +/* + * Given dquot(s), make disk block and/or inode reservations against them. + * The fact that this does the reservation against both the usr and + * grp/prj quotas is important, because this follows a both-or-nothing + * approach. + * + * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. + * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks + * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks + * dquots are unlocked on return, if they were not locked by caller. 
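[Annotation] The function below reserves against the user dquot first and only then against the group/project dquot; if the second reservation fails, the first is undone by reserving a negative amount with XFS_QMOPT_FORCE_RES so the back-out itself cannot fail. A generic sketch of that both-or-nothing pattern, with hypothetical names:

static int reserve_pair(int (*resv)(void *dq, long nblks, unsigned flags),
			void *udq, void *gdq, long nblks, unsigned flags,
			unsigned force_flag)
{
	int error;

	if (udq) {
		error = resv(udq, nblks, flags);
		if (error)
			return error;
	}
	if (gdq) {
		error = resv(gdq, nblks, flags);
		if (error) {
			if (udq)	/* back out the first reservation */
				resv(udq, -nblks, flags | force_flag);
			return error;
		}
	}
	return 0;	/* both reserved, or nothing was */
}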
+ */ +int +xfs_trans_reserve_quota_bydquots( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp, + long nblks, + long ninos, + uint flags) +{ + int resvd = 0, error; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + if (tp && tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + + if (udqp) { + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); + if (error) + return error; + resvd = 1; + } + + if (gdqp) { + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + if (error) { + /* + * can't do it, so backout previous reservation + */ + if (resvd) { + flags |= XFS_QMOPT_FORCE_RES; + xfs_trans_dqresv(tp, mp, udqp, + -nblks, -ninos, flags); + } + return error; + } + } + + /* + * Didn't change anything critical, so, no need to log + */ + return 0; +} + + +/* + * Lock the dquot and change the reservation if we can. + * This doesn't change the actual usage, just the reservation. + * The inode sent in is locked. + */ +int +xfs_trans_reserve_quota_nblks( + struct xfs_trans *tp, + struct xfs_inode *ip, + long nblks, + long ninos, + uint flags) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + if (XFS_IS_PQUOTA_ON(mp)) + flags |= XFS_QMOPT_ENOSPC; + + ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); + ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_BLKS); + + /* + * Reserve nblks against these dquots, with trans as the mediator. + */ + return xfs_trans_reserve_quota_bydquots(tp, mp, + ip->i_udquot, ip->i_gdquot, + nblks, ninos, flags); +} + +/* + * This routine is called to allocate a quotaoff log item. + */ +xfs_qoff_logitem_t * +xfs_trans_get_qoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_qoff_logitem_t *q; + + ASSERT(tp != NULL); + + q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); + ASSERT(q != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &q->qql_item); + return q; +} + + +/* + * This is called to mark the quotaoff logitem as needing + * to be logged when the transaction is committed. The logitem must + * already be associated with the given transaction. + */ +void +xfs_trans_log_quotaoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *qlp) +{ + tp->t_flags |= XFS_TRANS_DIRTY; + qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +STATIC void +xfs_trans_alloc_dqinfo( + xfs_trans_t *tp) +{ + tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); +} + +void +xfs_trans_free_dqinfo( + xfs_trans_t *tp) +{ + if (!tp->t_dqinfo) + return; + kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo); + tp->t_dqinfo = NULL; +} diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c new file mode 100644 index 00000000..b83f76b6 --- /dev/null +++ b/fs/xfs/support/uuid.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +/* IRIX interpretation of an uuid_t */ +typedef struct { + __be32 uu_timelow; + __be16 uu_timemid; + __be16 uu_timehi; + __be16 uu_clockseq; + __be16 uu_node[3]; +} xfs_uu_t; + +/* + * uuid_getnodeuniq - obtain the node unique fields of a UUID. + * + * This is not in any way a standard or condoned UUID function; + * it just something that's needed for user-level file handles. + */ +void +uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) +{ + xfs_uu_t *uup = (xfs_uu_t *)uuid; + + fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | + be16_to_cpu(uup->uu_timemid); + fsid[1] = be32_to_cpu(uup->uu_timelow); +} + +int +uuid_is_nil(uuid_t *uuid) +{ + int i; + char *cp = (char *)uuid; + + if (uuid == NULL) + return 0; + /* implied check of version number here... */ + for (i = 0; i < sizeof *uuid; i++) + if (*cp++) return 0; /* not nil */ + return 1; /* is nil */ +} + +int +uuid_equal(uuid_t *uuid1, uuid_t *uuid2) +{ + return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; +} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h new file mode 100644 index 00000000..4732d712 --- /dev/null +++ b/fs/xfs/support/uuid.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_UUID_H__ +#define __XFS_SUPPORT_UUID_H__ + +typedef struct { + unsigned char __u_bits[16]; +} uuid_t; + +extern int uuid_is_nil(uuid_t *uuid); +extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); +extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); + +#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h new file mode 100644 index 00000000..5ad8ad3a --- /dev/null +++ b/fs/xfs/xfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
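[Annotation] The three helpers in support/uuid.c above are small enough to use directly: uuid_is_nil() and uuid_equal() return 1 for "nil" and "equal" respectively, and uuid_getnodeuniq() derives a two-word fsid from the time fields of the UUID. A usage sketch with made-up byte values:

static void uuid_demo(void)
{
	uuid_t nil = { { 0 } };
	uuid_t fsuuid = { { 0x12, 0x34, 0x56, 0x78 } };	/* remaining bytes zero */
	int fsid[2];

	if (!uuid_is_nil(&fsuuid) && !uuid_equal(&nil, &fsuuid))
		uuid_getnodeuniq(&fsuuid, fsid);	/* fsid[] built from time fields */
}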
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_H__ +#define __XFS_H__ + +#ifdef CONFIG_XFS_DEBUG +#define STATIC +#define DEBUG 1 +#define XFS_BUF_LOCK_TRACKING 1 +/* #define QUOTADEBUG 1 */ +#endif + +#include +#endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h new file mode 100644 index 00000000..11dd7207 --- /dev/null +++ b/fs/xfs/xfs_acl.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ACL_H__ +#define __XFS_ACL_H__ + +struct inode; +struct posix_acl; +struct xfs_inode; + +#define XFS_ACL_MAX_ENTRIES 25 +#define XFS_ACL_NOT_PRESENT (-1) + +/* On-disk XFS access control list structure */ +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + } acl_entry[XFS_ACL_MAX_ENTRIES]; +}; + +/* On-disk XFS extended attribute names */ +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) + +#ifdef CONFIG_XFS_POSIX_ACL +extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); +extern int xfs_acl_chmod(struct inode *inode); +extern int posix_acl_access_exists(struct inode *inode); +extern int posix_acl_default_exists(struct inode *inode); + +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; +#else +# define xfs_check_acl NULL +# define xfs_get_acl(inode, type) NULL +# define xfs_inherit_acl(inode, default_acl) 0 +# define xfs_acl_chmod(inode) 0 +# define posix_acl_access_exists(inode) 0 +# define posix_acl_default_exists(inode) 0 +#endif /* CONFIG_XFS_POSIX_ACL */ +#endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h new file mode 100644 index 00000000..6530769a --- /dev/null +++ b/fs/xfs/xfs_ag.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_AG_H__ +#define __XFS_AG_H__ + +/* + * Allocation group header + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ + +struct xfs_buf; +struct xfs_mount; +struct xfs_trans; + +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 + +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf { + /* + * Common allocation group header information + */ + __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ + __be32 agf_seqno; /* sequence # starting from 0 */ + __be32 agf_length; /* size in blocks of a.g. */ + /* + * Freespace information + */ + __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __be32 agf_spare0; /* spare field */ + __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ + __be32 agf_fllast; /* last freelist block's index */ + __be32 agf_flcount; /* count of blocks in freelist */ + __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ +} xfs_agf_t; + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_NUM_BITS 12 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) + +extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + +/* + * Size of the unlinked inode hash table in the agi. 
+ */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi { + /* + * Common allocation group header information + */ + __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ + __be32 agi_seqno; /* sequence # starting from 0 */ + __be32 agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + __be32 agi_count; /* count of allocated inodes */ + __be32 agi_root; /* root of inode btree */ + __be32 agi_level; /* levels in inode btree */ + __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ + __be32 agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; +} xfs_agi_t; + +#define XFS_AGI_MAGICNUM 0x00000001 +#define XFS_AGI_VERSIONNUM 0x00000002 +#define XFS_AGI_SEQNO 0x00000004 +#define XFS_AGI_LENGTH 0x00000008 +#define XFS_AGI_COUNT 0x00000010 +#define XFS_AGI_ROOT 0x00000020 +#define XFS_AGI_LEVEL 0x00000040 +#define XFS_AGI_FREECOUNT 0x00000080 +#define XFS_AGI_NEWINO 0x00000100 +#define XFS_AGI_DIRINO 0x00000200 +#define XFS_AGI_UNLINKED 0x00000400 +#define XFS_AGI_NUM_BITS 11 +#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) + +extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) + +typedef struct xfs_agfl { + __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. + */ +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +/* + * Per-ag incore structure, copies of information in agf and agi, + * to improve the performance of allocation group selection. 
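[Annotation] The first four sectors of every allocation group hold, in order, a superblock copy, the AGF, the AGI and the AGFL; the *_DADDR macros above simply scale the index 0..3 by the sector size expressed in 512-byte basic blocks. With 512-byte sectors (m_sectbb_log == 0) that is daddrs 0, 1, 2 and 3; with 4096-byte sectors (m_sectbb_log == 3) it is 0, 8, 16 and 24. A sketch of the same arithmetic with the log value supplied directly:

/* idx: 0 = SB copy, 1 = AGF, 2 = AGI, 3 = AGFL (relative to the AG start) */
static inline unsigned long ag_hdr_daddr(unsigned int idx, unsigned int sectbb_log)
{
	return (unsigned long)idx << sectbb_log;
}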
+ */ +#define XFS_PAGB_NUM_SLOTS 128 + +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; +#ifdef __KERNEL__ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; +#endif + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + +/* + * tags for inode radix tree + */ +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ + +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) +#define XFS_MIN_FREELIST(a,mp) \ + (XFS_MIN_FREELIST_RAW( \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + (XFS_MIN_FREELIST_RAW( \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? 
\ + ASSERT((d) == XFS_SB_DADDR || \ + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) + +#endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c new file mode 100644 index 00000000..95862bbf --- /dev/null +++ b/fs/xfs/xfs_alloc.c @@ -0,0 +1,3047 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) + +#define XFSA_FIXUP_BNO_OK 1 +#define XFSA_FIXUP_CNT_OK 2 + +STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, + xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len]. 
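[Annotation] The three lookup wrappers above differ only in the mode passed to xfs_btree_lookup(): XFS_LOOKUP_EQ finds exactly [bno, len], XFS_LOOKUP_GE the first record at or after it, and XFS_LOOKUP_LE the last record at or before it, with *stat reporting whether a record was found. A usage fragment, assuming a by-bno cursor 'cur' has already been built (e.g. via xfs_allocbt_init_cursor(), not shown):

	int		stat = 0;
	xfs_agblock_t	fbno;
	xfs_extlen_t	flen;
	int		error;

	/* Find the last free extent starting at or before 'target'. */
	error = xfs_alloc_lookup_le(cur, target, 0, &stat);
	if (!error && stat)
		error = xfs_alloc_get_rec(cur, &fbno, &flen, &stat);	/* read it back */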
+ * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + union xfs_btree_rec rec; + + rec.alloc.ar_startblock = cpu_to_be32(bno); + rec.alloc.ar_blockcount = cpu_to_be32(len); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + *bno = be32_to_cpu(rec->alloc.ar_startblock); + *len = be32_to_cpu(rec->alloc.ar_blockcount); + } + return error; +} + +/* + * Compute aligned version of the found extent. + * Takes alignment and min length into account. + */ +STATIC void +xfs_alloc_compute_aligned( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_agblock_t foundbno, /* starting block in found extent */ + xfs_extlen_t foundlen, /* length in found extent */ + xfs_agblock_t *resbno, /* result block number */ + xfs_extlen_t *reslen) /* result length */ +{ + xfs_agblock_t bno; + xfs_extlen_t len; + + /* Trim busy sections out of found extent */ + xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + + if (args->alignment > 1 && len >= args->minlen) { + xfs_agblock_t aligned_bno = roundup(bno, args->alignment); + xfs_extlen_t diff = aligned_bno - bno; + + *resbno = aligned_bno; + *reslen = diff >= len ? 0 : len - diff; + } else { + *resbno = bno; + *reslen = len; + } +} + +/* + * Compute best start block and diff for "near" allocations. + * freelen >= wantlen already checked by caller. 
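[Annotation] xfs_alloc_compute_aligned() above first trims busy ranges out of the candidate extent and then, if an alignment was requested, rounds the start up and shortens the usable length by the blocks skipped; if the round-up consumes the whole extent the result length is 0 and the caller discards it. Illustrative numbers:

/* Found extent at block 1001, length 16, requested alignment 8. */
unsigned int bno = 1001, len = 16, align = 8;
unsigned int aligned_bno = ((bno + align - 1) / align) * align;	/* 1008 */
unsigned int diff = aligned_bno - bno;				/* 7    */
unsigned int reslen = diff >= len ? 0 : len - diff;		/* 9    */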
+ */ +STATIC xfs_extlen_t /* difference value (absolute) */ +xfs_alloc_compute_diff( + xfs_agblock_t wantbno, /* target starting block */ + xfs_extlen_t wantlen, /* target length */ + xfs_extlen_t alignment, /* target alignment */ + xfs_agblock_t freebno, /* freespace's starting block */ + xfs_extlen_t freelen, /* freespace's length */ + xfs_agblock_t *newbnop) /* result: best start block from free */ +{ + xfs_agblock_t freeend; /* end of freespace extent */ + xfs_agblock_t newbno1; /* return block number */ + xfs_agblock_t newbno2; /* other new block number */ + xfs_extlen_t newlen1=0; /* length with newbno1 */ + xfs_extlen_t newlen2=0; /* length with newbno2 */ + xfs_agblock_t wantend; /* end of target extent */ + + ASSERT(freelen >= wantlen); + freeend = freebno + freelen; + wantend = wantbno + wantlen; + if (freebno >= wantbno) { + if ((newbno1 = roundup(freebno, alignment)) >= freeend) + newbno1 = NULLAGBLOCK; + } else if (freeend >= wantend && alignment > 1) { + newbno1 = roundup(wantbno, alignment); + newbno2 = newbno1 - alignment; + if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + else + newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); + if (newbno2 < freebno) + newbno2 = NULLAGBLOCK; + else + newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); + if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { + if (newlen1 < newlen2 || + (newlen1 == newlen2 && + XFS_ABSDIFF(newbno1, wantbno) > + XFS_ABSDIFF(newbno2, wantbno))) + newbno1 = newbno2; + } else if (newbno2 != NULLAGBLOCK) + newbno1 = newbno2; + } else if (freeend >= wantend) { + newbno1 = wantbno; + } else if (alignment > 1) { + newbno1 = roundup(freeend - wantlen, alignment); + if (newbno1 > freeend - wantlen && + newbno1 - alignment >= freebno) + newbno1 -= alignment; + else if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + } else + newbno1 = freeend - wantlen; + *newbnop = newbno1; + return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); +} + +/* + * Fix up the length, based on mod and prod. + * len should be k * prod + mod for some k. + * If len is too small it is returned unchanged. + * If len hits maxlen it is left alone. + */ +STATIC void +xfs_alloc_fix_len( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_extlen_t k; + xfs_extlen_t rlen; + + ASSERT(args->mod < args->prod); + rlen = args->len; + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || + (args->mod == 0 && rlen < args->prod)) + return; + k = rlen % args->prod; + if (k == args->mod) + return; + if (k > args->mod) { + if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) + return; + } else { + if ((int)(rlen = rlen - args->prod - (args->mod - k)) < + (int)args->minlen) + return; + } + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + args->len = rlen; +} + +/* + * Fix up length if there is too little space left in the a.g. + * Return 1 if ok, 0 if too little, should give up. + */ +STATIC int +xfs_alloc_fix_minleft( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agf_t *agf; /* a.g. 
freelist header */ + int diff; /* free space difference */ + + if (args->minleft == 0) + return 1; + agf = XFS_BUF_TO_AGF(args->agbp); + diff = be32_to_cpu(agf->agf_freeblks) + - args->len - args->minleft; + if (diff >= 0) + return 1; + args->len += diff; /* shrink the allocated space */ + if (args->len >= args->minlen) + return 1; + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Update the two btrees, logically removing from freespace the extent + * starting at rbno, rlen blocks. The extent is contained within the + * actual (current) free extent fbno for flen blocks. + * Flags are passed in indicating whether the cursors are set to the + * relevant records. + */ +STATIC int /* error code */ +xfs_alloc_fixup_trees( + xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ + xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + xfs_agblock_t fbno, /* starting block of free extent */ + xfs_extlen_t flen, /* length of free extent */ + xfs_agblock_t rbno, /* starting block of returned extent */ + xfs_extlen_t rlen, /* length of returned extent */ + int flags) /* flags, XFSA_FIXUP_... */ +{ + int error; /* error code */ + int i; /* operation results */ + xfs_agblock_t nfbno1; /* first new free startblock */ + xfs_agblock_t nfbno2; /* second new free startblock */ + xfs_extlen_t nflen1=0; /* first new free length */ + xfs_extlen_t nflen2=0; /* second new free length */ + + /* + * Look up the record in the by-size tree if necessary. + */ + if (flags & XFSA_FIXUP_CNT_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Look up the record in the by-block tree if necessary. + */ + if (flags & XFSA_FIXUP_BNO_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + +#ifdef DEBUG + if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { + struct xfs_btree_block *bnoblock; + struct xfs_btree_block *cntblock; + + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + + XFS_WANT_CORRUPTED_RETURN( + bnoblock->bb_numrecs == cntblock->bb_numrecs); + } +#endif + + /* + * Deal with all four cases: the allocated record is contained + * within the freespace record, so we can have new freespace + * at either (or both) end, or no freespace remaining. + */ + if (rbno == fbno && rlen == flen) + nfbno1 = nfbno2 = NULLAGBLOCK; + else if (rbno == fbno) { + nfbno1 = rbno + rlen; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else if (rbno + rlen == fbno + flen) { + nfbno1 = fbno; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else { + nfbno1 = fbno; + nflen1 = rbno - fbno; + nfbno2 = rbno + rlen; + nflen2 = (fbno + flen) - nfbno2; + } + /* + * Delete the entry from the by-size btree. + */ + if ((error = xfs_btree_delete(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + /* + * Add new by-size btree entry(s). 
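+ * The lookups below are expected to find nothing (i == 0), since the
+ * remainder records cannot exist yet; xfs_btree_insert() then adds them.
+ * For illustration (hypothetical values): carving rbno = 103, rlen = 4
+ * out of the free record fbno = 100, flen = 10 leaves nfbno1 = 100,
+ * nflen1 = 3 and nfbno2 = 107, nflen2 = 3, so both inserts are taken.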
+ */ + if (nfbno1 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + if (nfbno2 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Fix up the by-block btree entry(s). + */ + if (nfbno1 == NULLAGBLOCK) { + /* + * No remaining freespace, just delete the by-block tree entry. + */ + if ((error = xfs_btree_delete(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } else { + /* + * Update the by-block entry to start later|be shorter. + */ + if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) + return error; + } + if (nfbno2 != NULLAGBLOCK) { + /* + * 2 resulting free entries, need to add one. + */ + if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_btree_insert(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + return 0; +} + +/* + * Read in the allocation group free block array. + */ +STATIC int /* error */ +xfs_alloc_read_agfl( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_buf_t **bpp) /* buffer for the ag free block array */ +{ + xfs_buf_t *bp; /* return value */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + *bpp = bp; + return 0; +} + +STATIC int +xfs_alloc_update_counters( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agbp, + long len) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + + pag->pagf_freeblks += len; + be32_add_cpu(&agf->agf_freeblks, len); + + xfs_trans_agblocks_delta(tp, len); + if (unlikely(be32_to_cpu(agf->agf_freeblks) > + be32_to_cpu(agf->agf_length))) + return EFSCORRUPTED; + + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); + return 0; +} + +/* + * Allocation group level functions. + */ + +/* + * Allocate a variable extent in the allocation group agno. + * Type and bno are used to determine where in the allocation group the + * extent will start. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent( + xfs_alloc_arg_t *args) /* argument structure for allocation */ +{ + int error=0; + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->mod < args->prod); + ASSERT(args->alignment > 0); + /* + * Branch to correct routine based on the type. 
+ */ + args->wasfromfl = 0; + switch (args->type) { + case XFS_ALLOCTYPE_THIS_AG: + error = xfs_alloc_ag_vextent_size(args); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_alloc_ag_vextent_near(args); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_alloc_ag_vextent_exact(args); + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + + if (error || args->agbno == NULLAGBLOCK) + return error; + + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(args->agbno % args->alignment == 0); + + if (!args->wasfromfl) { + error = xfs_alloc_update_counters(args->tp, args->pag, + args->agbp, + -((long)(args->len))); + if (error) + return error; + + ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)); + } + + if (!args->isfl) { + xfs_trans_mod_sb(args->tp, args->wasdel ? + XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS, + -((long)(args->len))); + } + + XFS_STATS_INC(xs_allocx); + XFS_STATS_ADD(xs_allocb, args->len); + return error; +} + +/* + * Allocate a variable extent at exactly agno/bno. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_exact( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ + xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + int error; + xfs_agblock_t fbno; /* start block of found extent */ + xfs_extlen_t flen; /* length of found extent */ + xfs_agblock_t tbno; /* start block of trimmed extent */ + xfs_extlen_t tlen; /* length of trimmed extent */ + xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t end; /* end of allocated extent */ + int i; /* success/failure of operation */ + xfs_extlen_t rlen; /* length of returned extent */ + + ASSERT(args->alignment == 1); + + /* + * Allocate/initialize a cursor for the by-number freespace btree. + */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + + /* + * Lookup bno and minlen in the btree (minlen is irrelevant, really). + * Look for the closest free block <= bno, it must contain bno + * if any free block does. + */ + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) + goto error0; + if (!i) + goto not_found; + + /* + * Grab the freespace record. + */ + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ASSERT(fbno <= args->agbno); + + /* + * Check for overlapping busy extents. + */ + xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + + /* + * Give up if the start of the extent is busy, or the freespace isn't + * long enough for the minimum request. + */ + if (tbno > args->agbno) + goto not_found; + if (tlen < args->minlen) + goto not_found; + tend = tbno + tlen; + if (tend < args->agbno + args->minlen) + goto not_found; + + /* + * End of extent will be smaller of the freespace end and the + * maximal requested end. + * + * Fix the length according to mod and prod if given. 
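+ *
+ * For illustration (hypothetical numbers): with agbno = 50, maxlen = 16
+ * and a trimmed free extent [48, 60), tend = 60 is smaller than
+ * 50 + 16 = 66, so end = 60 and args->len starts out as 10 before
+ * xfs_alloc_fix_len() adjusts it for any requested prod/mod pattern.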
+ */ + end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); + args->len = end - args->agbno; + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + + rlen = args->len; + ASSERT(args->agbno + rlen <= tend); + end = args->agbno + rlen; + + /* + * We are allocating agbno for rlen [agbno .. end] + * Allocate/initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + ASSERT(args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + goto error0; + } + + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + + args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); + return 0; + +error0: + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + trace_xfs_alloc_exact_error(args); + return error; +} + +/* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. + */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, /* extent length */ + xfs_agblock_t *sbnoa, /* aligned extent found by search */ + xfs_extlen_t *slena, /* aligned extent length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (*sbnoa >= args->agbno + gdiff) + goto out_use_good; + } else { + if (*sbnoa <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, *sbnoa, + *slena, &new); + + /* + * Choose closer size and invalidate other cursor. + */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ + return error; +} + +/* + * Allocate a variable extent near bno in the allocation group agno. 
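+ * Two strategies are tried, as described in the comments inside the
+ * function: if the by-size lookup lands in the last leaf block, that
+ * block is scanned for the best-fitting record; otherwise the by-bno
+ * tree is walked left and right from bno and the closer usable extent
+ * wins.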
+ * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_near( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ + xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ + xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ + xfs_agblock_t gtbno; /* start bno of right side entry */ + xfs_agblock_t gtbnoa; /* aligned ... */ + xfs_extlen_t gtdiff; /* difference to right side entry */ + xfs_extlen_t gtlen; /* length of right side entry */ + xfs_extlen_t gtlena; /* aligned ... */ + xfs_agblock_t gtnew; /* useful start bno of right side */ + int error; /* error code */ + int i; /* result code, temporary */ + int j; /* result code, temporary */ + xfs_agblock_t ltbno; /* start bno of left side entry */ + xfs_agblock_t ltbnoa; /* aligned ... */ + xfs_extlen_t ltdiff; /* difference to left side entry */ + xfs_extlen_t ltlen; /* length of left side entry */ + xfs_extlen_t ltlena; /* aligned ... */ + xfs_agblock_t ltnew; /* useful start bno of left side */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Randomly don't execute the first algorithm. + */ + int dofirst; /* set to do first algorithm */ + + dofirst = random32() & 1; +#endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + + /* + * Get a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + + /* + * See if there are any free extents as big as maxlen. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, + <len, &i))) + goto error0; + if (i == 0 || ltlen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); + return 0; + } + ASSERT(i == 1); + } + args->wasfromfl = 0; + + /* + * First algorithm. + * If the requested extent is large wrt the freespaces available + * in this a.g., then the cursor will be pointing to a btree entry + * near the right edge of the tree. If it's in the last btree leaf + * block, then we just examine all the entries in that block + * that are big enough, and pick the best one. + * This is written as a while loop so we can break out of it, + * but we never loop back to the top. + */ + while (xfs_btree_islastblock(cnt_cur, 0)) { + xfs_extlen_t bdiff; + int besti=0; + xfs_extlen_t blen=0; + xfs_agblock_t bnew=0; + +#if defined(DEBUG) && defined(__KERNEL__) + if (!dofirst) + break; +#endif + /* + * Start from the entry that lookup found, sequence through + * all larger free blocks. If we're actually pointing at a + * record smaller than maxlen, go to the start of this block, + * and skip all those smaller than minlen. 
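+ * (Rewinding bc_ptrs[0] to the first slot of this last leaf lets the
+ * loop below weigh every candidate that is at least minlen blocks long
+ * and keep the one with the smallest diff, instead of only looking at
+ * the record the lookup happened to land on.)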
+ */ + if (ltlen || args->alignment > 1) { + cnt_cur->bc_ptrs[0] = 1; + do { + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (ltlen >= args->minlen) + break; + if ((error = xfs_btree_increment(cnt_cur, 0, &i))) + goto error0; + } while (i); + ASSERT(ltlen >= args->minlen); + if (!i) + break; + } + i = cnt_cur->bc_ptrs[0]; + for (j = 1, blen = 0, bdiff = 0; + !error && j && (blen < args->maxlen || bdiff > 0); + error = xfs_btree_increment(cnt_cur, 0, &j)) { + /* + * For each entry, decide if it's better than + * the previous best entry. + */ + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena < args->minlen) + continue; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < blen) + continue; + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, ltbnoa, ltlena, <new); + if (ltnew != NULLAGBLOCK && + (args->len > blen || ltdiff < bdiff)) { + bdiff = ltdiff; + bnew = ltnew; + blen = args->len; + besti = cnt_cur->bc_ptrs[0]; + } + } + /* + * It didn't work. We COULD be in a case where + * there's a good record somewhere, so try again. + */ + if (blen == 0) + break; + /* + * Point at the best entry, and retrieve it again. + */ + cnt_cur->bc_ptrs[0] = besti; + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->len = blen; + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_nominleft(args); + return 0; + } + blen = args->len; + /* + * We are allocating starting at bnew for blen blocks. + */ + args->agbno = bnew; + ASSERT(bnew >= ltbno); + ASSERT(bnew + blen <= ltbno + ltlen); + /* + * Set up a cursor for the by-bno tree. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + /* + * Fix up the btree entries. + */ + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, + ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + + trace_xfs_alloc_near_first(args); + return 0; + } + /* + * Second algorithm. + * Search in the by-bno tree to the left and to the right + * simultaneously, until in each case we find a space big enough, + * or run into the edge of the tree. When we run into the edge, + * we deallocate that cursor. + * If both searches succeed, we compare the two spaces and pick + * the better one. + * With alignment, it's possible for both to fail; the upper + * level algorithm that picks allocation groups for allocations + * is not supposed to do this. + */ + /* + * Allocate and initialize the cursor for the leftward search. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + /* + * Lookup <= bno to find the leftward search's starting point. + */ + if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find anything; use this cursor for the rightward + * search. + */ + bno_cur_gt = bno_cur_lt; + bno_cur_lt = NULL; + } + /* + * Found something. 
Duplicate the cursor for the rightward search. + */ + else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) + goto error0; + /* + * Increment the cursor, so we will point at the entry just right + * of the leftward entry if any, or to the leftmost entry. + */ + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + /* + * It failed, there are no rightward entries. + */ + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Loop going left with the leftward cursor, right with the + * rightward cursor, until either both directions give up or + * we find an entry at least as big as minlen. + */ + do { + if (bno_cur_lt) { + if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena >= args->minlen) + break; + if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + if (bno_cur_gt) { + if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena); + if (gtlena >= args->minlen) + break; + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + } while (bno_cur_lt || bno_cur_gt); + + /* + * Got both cursors still active, need to find better entry. + */ + if (bno_cur_lt && bno_cur_gt) { + if (ltlena >= args->minlen) { + /* + * Left side is good, look for a right side entry. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, ltbnoa, ltlena, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, + >bnoa, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + + /* + * Right side is good, look for a left side entry. + */ + args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); + xfs_alloc_fix_len(args); + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, gtbnoa, gtlena, >new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, + <bnoa, <lena, + 1 /* search left */); + } + + if (error) + goto error0; + } + + /* + * If we couldn't get anything, give up. + */ + if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + + trace_xfs_alloc_size_neither(args); + args->agbno = NULLAGBLOCK; + return 0; + } + + /* + * At this point we have selected a freespace entry, either to the + * left or to the right. If it's on the right, copy all the + * useful variables to the "left" set so we only have one + * copy of this code. + */ + if (bno_cur_gt) { + bno_cur_lt = bno_cur_gt; + bno_cur_gt = NULL; + ltbno = gtbno; + ltbnoa = gtbnoa; + ltlen = gtlen; + ltlena = gtlena; + j = 1; + } else + j = 0; + + /* + * Fix up the length and compute the useful address. 
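+ * Only the lt* variables are used from here on; if the right-hand
+ * extent won, its values were copied into them just above, and j only
+ * records which side was chosen for the trace point below.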
+ */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + trace_xfs_alloc_near_nominleft(args); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + ltbnoa, ltlena, <new); + ASSERT(ltnew >= ltbno); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); + ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->agbno = ltnew; + + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, + ltnew, rlen, XFSA_FIXUP_BNO_OK))) + goto error0; + + if (j) + trace_xfs_alloc_near_greater(args); + else + trace_xfs_alloc_near_lesser(args); + + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + return 0; + + error0: + trace_xfs_alloc_near_error(args); + if (cnt_cur != NULL) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur_lt != NULL) + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); + if (bno_cur_gt != NULL) + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); + return error; +} + +/* + * Allocate a variable extent anywhere in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_size( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + int error; /* error result */ + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + int i; /* temp status variable */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; + +restart: + /* + * Allocate and initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + bno_cur = NULL; + + /* + * Look for an entry >= maxlen+alignment-1 blocks. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, + args->maxlen + args->alignment - 1, &i))) + goto error0; + + /* + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. + */ + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) + goto error0; + if (i == 0 || flen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_noentry(args); + return 0; + } + ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. + * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. 
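+ * (forced counts the restarts taken from this function; the log is
+ * forced only on the first one, and once forced is greater than 1 the
+ * check above stops insisting on a non-busy maxlen extent and falls
+ * back to xfs_alloc_ag_vextent_small().)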
+ */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } + } + + /* + * In the first case above, we got the last entry in the + * by-size btree. Now we check to see if the space hits maxlen + * once aligned; if not, we search left for something better. + * This can't happen in the second case above. + */ + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (rlen < args->maxlen) { + xfs_agblock_t bestfbno; + xfs_extlen_t bestflen; + xfs_agblock_t bestrbno; + xfs_extlen_t bestrlen; + + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + for (;;) { + if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) + goto error0; + if (i == 0) + break; + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (flen < bestrlen) + break; + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), + error0); + if (rlen > bestrlen) { + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + if (rlen == args->maxlen) + break; + } + } + if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + rlen = bestrlen; + rbno = bestrbno; + flen = bestflen; + fbno = bestfbno; + } + args->wasfromfl = 0; + /* + * Fix up the length. + */ + args->len = rlen; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; + } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; + rlen = args->len; + XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + /* + * Allocate and initialize a cursor for the by-block tree. 
+ */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + rbno, rlen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + cnt_cur = bno_cur = NULL; + args->len = rlen; + args->agbno = rbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + trace_xfs_alloc_size_done(args); + return 0; + +error0: + trace_xfs_alloc_size_error(args); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Deal with the case where only small freespaces remain. + * Either return the contents of the last freespace record, + * or allocate space from the freelist if there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_btree_cur_t *ccur, /* by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int i; + + if ((error = xfs_btree_decrement(ccur, 0, &i))) + goto error0; + if (i) { + if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + /* + * Nothing in the btree, try the freelist. Make sure + * to respect minleft even when pulling from the + * freelist. + */ + else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) + > args->minleft)) { + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) + goto error0; + if (fbno != NULLAGBLOCK) { + xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + + if (args->userdata) { + xfs_buf_t *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); + xfs_trans_binval(args->tp, bp); + } + args->len = 1; + args->agbno = fbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + args->wasfromfl = 1; + trace_xfs_alloc_small_freelist(args); + *stat = 0; + return 0; + } + /* + * Nothing in the freelist. + */ + else + flen = 0; + } + /* + * Can't allocate from the freelist for some reason. + */ + else { + fbno = NULLAGBLOCK; + flen = 0; + } + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_small_notenough(args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + trace_xfs_alloc_small_done(args); + return 0; + +error0: + trace_xfs_alloc_small_error(args); + return error; +} + +/* + * Free the extent starting at agno/bno for length. + */ +STATIC int /* error */ +xfs_free_ag_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. 
freelist header */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t bno, /* starting block number */ + xfs_extlen_t len, /* length of extent */ + int isfl) /* set if is freelist blocks - no sb acctg */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ + int error; /* error return value */ + xfs_agblock_t gtbno; /* start of right neighbor block */ + xfs_extlen_t gtlen; /* length of right neighbor block */ + int haveleft; /* have a left neighbor block */ + int haveright; /* have a right neighbor block */ + int i; /* temp, result code */ + xfs_agblock_t ltbno; /* start of left neighbor block */ + xfs_extlen_t ltlen; /* length of left neighbor block */ + xfs_mount_t *mp; /* mount point struct for filesystem */ + xfs_agblock_t nbno; /* new starting block of freespace */ + xfs_extlen_t nlen; /* new length of freespace */ + xfs_perag_t *pag; /* per allocation group data */ + + mp = tp->t_mountp; + /* + * Allocate and initialize a cursor for the by-block btree. + */ + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + cnt_cur = NULL; + /* + * Look for a neighboring block on the left (lower block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) + goto error0; + if (haveleft) { + /* + * There is a block to our left. + */ + if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (ltbno + ltlen < bno) + haveleft = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + } + } + /* + * Look for a neighboring block on the right (higher block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) + goto error0; + if (haveright) { + /* + * There is a block to our right. + */ + if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (bno + len < gtbno) + haveright = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + } + } + /* + * Now allocate and initialize a cursor for the by-size tree. + */ + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + /* + * Have both left and right contiguous neighbors. + * Merge all three into a single free block. + */ + if (haveleft && haveright) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-block entry for the right block. 
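+ * For illustration (hypothetical blocks): freeing [120, 130) when the
+ * left neighbor is [100, 120) and the right neighbor is [130, 145)
+ * collapses all three into the single record nbno = 100, nlen = 45
+ * set up below.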
+ */ + if ((error = xfs_btree_delete(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Move the by-block cursor back to the left neighbor. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); +#ifdef DEBUG + /* + * Check that this is the right record: delete didn't + * mangle the cursor. + */ + { + xfs_agblock_t xxbno; + xfs_extlen_t xxlen; + + if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO( + i == 1 && xxbno == ltbno && xxlen == ltlen, + error0); + } +#endif + /* + * Update remaining by-block entry to the new, joined block. + */ + nbno = ltbno; + nlen = len + ltlen + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a left contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveleft) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Back up the by-block cursor to the left neighbor, and + * update its length. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + nbno = ltbno; + nlen = len + ltlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a right contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveright) { + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Update the starting block and length of the right + * neighbor in the by-block tree. + */ + nbno = bno; + nlen = len + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * No contiguous neighbors. + * Insert the new freespace into the by-block tree. + */ + else { + nbno = bno; + nlen = len; + if ((error = xfs_btree_insert(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + bno_cur = NULL; + /* + * In all cases we need to insert the new freespace in the by-size tree. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + cnt_cur = NULL; + + /* + * Update the freespace totals in the ag and superblock. + */ + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_perag_put(pag); + if (error) + goto error0; + + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xs_freex); + XFS_STATS_ADD(xs_freeb, len); + + trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + + return 0; + + error0: + trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Visible (exported) allocation/free functions. 
+ * Some of these are used just by xfs_alloc_btree.c and this file. + */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_ag_maxlevels = level; +} + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + */ +STATIC int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *agbp; /* agf buffer pointer */ + xfs_agf_t *agf; /* a.g. freespace structure pointer */ + xfs_buf_t *agflbp;/* agfl buffer pointer */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t delta; /* new blocks needed in freelist */ + int error; /* error result code */ + xfs_extlen_t longest;/* longest extent in allocation group */ + xfs_mount_t *mp; /* file system mount point structure */ + xfs_extlen_t need; /* total blocks needed in freelist */ + xfs_perag_t *pag; /* per-ag information structure */ + xfs_alloc_arg_t targs; /* local allocation arguments */ + xfs_trans_t *tp; /* transaction pointer */ + + mp = args->mp; + + pag = args->pag; + tp = args->tp; + if (!pag->pagf_init) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (!pag->pagf_init) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } else + agbp = NULL; + + /* + * If this is a metadata preferred pag and we are user data + * then try somewhere else if we are not being asked to + * try harder at this point + */ + if (pag->pagf_metadata && args->userdata && + (flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + + /* + * Get the a.g. freespace buffer. + * Can fail if we're not blocking on locks, and it's held. 
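+ * (With XFS_ALLOC_FLAG_TRYLOCK the AGF read below uses a trylock, so a
+ * NULL agbp just means the AGF is busy and this AG is skipped for now,
+ * not that an error occurred.)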
+ */ + if (agbp == NULL) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (agbp == NULL) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } + /* + * Figure out how many blocks we should have in the freelist. + */ + agf = XFS_BUF_TO_AGF(agbp); + need = XFS_MIN_FREELIST(agf, mp); + /* + * If there isn't enough total or single-extent, reject it. + */ + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + delta = need > be32_to_cpu(agf->agf_flcount) ? + (need - be32_to_cpu(agf->agf_flcount)) : 0; + longest = be32_to_cpu(agf->agf_longest); + longest = (longest > delta) ? (longest - delta) : + (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(be32_to_cpu(agf->agf_freeblks) + + be32_to_cpu(agf->agf_flcount) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + /* + * Make the freelist shorter if it's too long. + */ + while (be32_to_cpu(agf->agf_flcount) > need) { + xfs_buf_t *bp; + + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) + return error; + if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) + return error; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + xfs_trans_binval(tp, bp); + } + /* + * Initialize the args structure. + */ + targs.tp = tp; + targs.mp = mp; + targs.agbp = agbp; + targs.agno = args->agno; + targs.mod = targs.minleft = targs.wasdel = targs.userdata = + targs.minalignslop = 0; + targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.type = XFS_ALLOCTYPE_THIS_AG; + targs.pag = pag; + if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) + return error; + /* + * Make the freelist longer if it's too short. + */ + while (be32_to_cpu(agf->agf_flcount) < need) { + targs.agbno = 0; + targs.maxlen = need - be32_to_cpu(agf->agf_flcount); + /* + * Allocate as many blocks as possible at once. + */ + if ((error = xfs_alloc_ag_vextent(&targs))) { + xfs_trans_brelse(tp, agflbp); + return error; + } + /* + * Stop if we run out. Won't happen if callers are obeying + * the restrictions correctly. Can happen for free calls + * on a completely full ag. + */ + if (targs.agbno == NULLAGBLOCK) { + if (flags & XFS_ALLOC_FLAG_FREEING) + break; + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; + } + /* + * Put each allocated block on the list. + */ + for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) + return error; + } + } + xfs_trans_brelse(tp, agflbp); + args->agbp = agbp; + return 0; +} + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. freelist structure */ + xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ + xfs_agblock_t bno; /* block number returned */ + int error; + int logflags; + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + /* + * Freelist is empty, give up. 
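+ * (The freelist is a circular array indexed by agf_flfirst/agf_fllast;
+ * taking a block below advances agf_flfirst and wraps it back to 0 once
+ * it reaches XFS_AGFL_SIZE(mp).)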
+ */ + if (!agf->agf_flcount) { + *bnop = NULLAGBLOCK; + return 0; + } + /* + * Read the array of free blocks. + */ + mp = tp->t_mountp; + if ((error = xfs_alloc_read_agfl(mp, tp, + be32_to_cpu(agf->agf_seqno), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + /* + * Get the block number and update the data structures. + */ + bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + be32_add_cpu(&agf->agf_flfirst, 1); + xfs_trans_brelse(tp, agflbp); + if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) + agf->agf_flfirst = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, -1); + xfs_trans_agflist_delta(tp, -1); + pag->pagf_flcount--; + xfs_perag_put(pag); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + xfs_alloc_log_agf(tp, agbp, logflags); + *bnop = bno; + + return 0; +} + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer for a.g. freelist header */ + int fields) /* mask of fields to be logged (XFS_AGF_...) */ +{ + int first; /* first byte offset */ + int last; /* last byte offset */ + static const short offsets[] = { + offsetof(xfs_agf_t, agf_magicnum), + offsetof(xfs_agf_t, agf_versionnum), + offsetof(xfs_agf_t, agf_seqno), + offsetof(xfs_agf_t, agf_length), + offsetof(xfs_agf_t, agf_roots[0]), + offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_flfirst), + offsetof(xfs_agf_t, agf_fllast), + offsetof(xfs_agf_t, agf_flcount), + offsetof(xfs_agf_t, agf_freeblks), + offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), + sizeof(xfs_agf_t) + }; + + trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); +} + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags) /* XFS_ALLOC_FLAGS_... */ +{ + xfs_buf_t *bp; + int error; + + if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_buf_t *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. 
free block array */ + __be32 *blockp;/* pointer to array entry */ + int error; + int logflags; + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + mp = tp->t_mountp; + + if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, + be32_to_cpu(agf->agf_seqno), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + be32_add_cpu(&agf->agf_fllast, 1); + if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) + agf->agf_fllast = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, 1); + xfs_trans_agflist_delta(tp, 1); + pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + xfs_perag_put(pag); + + xfs_alloc_log_agf(tp, agbp, logflags); + + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); + blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; + *blockp = cpu_to_be32(bno); + xfs_alloc_log_agf(tp, agbp, logflags); + xfs_trans_log_buf(tp, agflbp, + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + + sizeof(xfs_agblock_t) - 1)); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_BUF_ */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + int agf_ok; /* set if agf is consistent */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, bpp); + if (error) + return error; + if (!*bpp) + return 0; + + ASSERT(!XFS_BUF_GETERROR(*bpp)); + agf = XFS_BUF_TO_AGF(*bpp); + + /* + * Validate the magic number of the agf block. + */ + agf_ok = + be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_seqno) == agno; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", + XFS_ERRLEVEL_LOW, mp, agf); + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EFSCORRUPTED); + } + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + ASSERT(agno != NULLAGNUMBER); + + error = xfs_read_agf(mp, tp, agno, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, + bpp); + if (error) + return error; + if (!*bpp) + return 0; + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + agf = XFS_BUF_TO_AGF(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagf_init) { + pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); + pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); + pag->pagf_longest = be32_to_cpu(agf->agf_longest); + pag->pagf_levels[XFS_BTNUM_BNOi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); + pag->pagf_levels[XFS_BTNUM_CNTi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + spin_lock_init(&pag->pagb_lock); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + pag->pagf_init = 1; + } +#ifdef DEBUG + else if (!XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); + ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); + ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); + ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); + ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); + ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + } +#endif + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate an extent (variable-size). + * Depending on the allocation type, we either look in a single allocation + * group or loop over the allocation groups to find the result. + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agblock_t agsize; /* allocation group size */ + int error; + int flags; /* XFS_ALLOC_FLAG_... locking flags */ + xfs_extlen_t minleft;/* minimum left value, temp copy */ + xfs_mount_t *mp; /* mount structure pointer */ + xfs_agnumber_t sagno; /* starting allocation group number */ + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + int no_min = 0; + xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure + * that out (xfs_bmap_alloc). + */ + agsize = mp->m_sb.sb_agblocks; + if (args->maxlen > agsize) + args->maxlen = agsize; + if (args->alignment == 0) + args->alignment = 1; + ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->minlen <= agsize); + ASSERT(args->mod < args->prod); + if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + args->minlen > args->maxlen || args->minlen > agsize || + args->mod >= args->prod) { + args->fsbno = NULLFSBLOCK; + trace_xfs_alloc_vextent_badargs(args); + return 0; + } + minleft = args->minleft; + + switch (type) { + case XFS_ALLOCTYPE_THIS_AG: + case XFS_ALLOCTYPE_NEAR_BNO: + case XFS_ALLOCTYPE_THIS_BNO: + /* + * These three force us into a single a.g. 
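+ * The AG is taken from args->fsbno and no other AG is tried: if
+ * xfs_alloc_fix_freelist() refuses the group (args->agbp == NULL), the
+ * allocation simply fails with a NULLAGBLOCK/NULLFSBLOCK result.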
+ */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->pag = xfs_perag_get(mp, args->agno); + args->minleft = 0; + error = xfs_alloc_fix_freelist(args, 0); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + if (!args->agbp) { + trace_xfs_alloc_vextent_noagbp(args); + break; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + case XFS_ALLOCTYPE_START_BNO: + /* + * Try near allocation first, then anywhere-in-ag after + * the first a.g. fails. + */ + if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + (mp->m_flags & XFS_MOUNT_32BITINODES)) { + args->fsbno = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + /* FALLTHROUGH */ + case XFS_ALLOCTYPE_ANY_AG: + case XFS_ALLOCTYPE_START_AG: + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. + */ + if (type == XFS_ALLOCTYPE_ANY_AG) { + /* + * Start with the last place we left off. + */ + args->agno = sagno = (mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount; + args->type = XFS_ALLOCTYPE_THIS_AG; + flags = XFS_ALLOC_FLAG_TRYLOCK; + } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* + * Start with allocation group given by bno. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; + sagno = 0; + flags = 0; + } else { + if (type == XFS_ALLOCTYPE_START_AG) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * Start with the given allocation group. + */ + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. + */ + for (;;) { + args->pag = xfs_perag_get(mp, args->agno); + if (no_min) args->minleft = 0; + error = xfs_alloc_fix_freelist(args, flags); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + /* + * If we get a buffer back then the allocation will fly. + */ + if (args->agbp) { + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + } + + trace_xfs_alloc_vextent_loopfailed(args); + + /* + * Didn't work, figure out the next iteration. + */ + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * For the first allocation, we can try any AG to get + * space. However, if we already have allocated a + * block, we don't want to try AGs whose number is below + * sagno. Otherwise, we may end up with out-of-order + * locking of AGF, which might cause deadlock. + */ + if (++(args->agno) == mp->m_sb.sb_agcount) { + if (args->firstblock != NULLFSBLOCK) + args->agno = sagno; + else + args->agno = 0; + } + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. 
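+ * (There are at most three sweeps over the allocation groups: first
+ * with trylock set, then blocking (flags == 0), and finally with
+ * no_min == 1 so that minleft is ignored; only after that does the
+ * allocation give up.)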
+ */ + if (args->agno == sagno) { + if (no_min == 1) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_vextent_allfailed(args); + break; + } + if (flags == 0) { + no_min = 1; + } else { + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + } + } + } + xfs_perag_put(args->pag); + } + if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (args->agno == sagno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (args->agbno == NULLAGBLOCK) + args->fsbno = NULLFSBLOCK; + else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); +#ifdef DEBUG + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), + args->len); +#endif + } + xfs_perag_put(args->pag); + return 0; +error0: + xfs_perag_put(args->pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_alloc_arg_t args; + int error; + + ASSERT(len != 0); + memset(&args, 0, sizeof(xfs_alloc_arg_t)); + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + if (args.agno >= args.mp->m_sb.sb_agcount) + return EFSCORRUPTED; + + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + if (args.agbno >= args.mp->m_sb.sb_agblocks) + return EFSCORRUPTED; + + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; + + /* validate the extent size is legal now we have the agf locked */ + if (args.agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { + error = EFSCORRUPTED; + goto error0; + } + + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); +error0: + xfs_perag_put(args.pag); + return error; +} + +void +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. 
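+		 * (A synchronous commit closes the window in which another
+		 * transaction could reallocate and overwrite the freed blocks
+		 * before this free is permanently in the log.)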
+ */ + trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_alloc_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. 
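+	 * (Drop pagb_lock, back off briefly, and return false so the
+	 * caller restarts its search from the tree root.)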
+ */ + if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. 
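+ * Walk the AG's busy tree and, for each busy extent overlapping the
+ * range, let xfs_alloc_busy_update_extent() either trim/remove the
+ * busy record or force the log; restart from the tree root whenever
+ * it reports that the tree or lock state may have changed.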
+ */ +void +xfs_alloc_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +STATIC void +xfs_alloc_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { + if (!xfs_alloc_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. 
+ */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +static void +xfs_alloc_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp) +{ + if (busyp->length) { + trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. 
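+ * pagb_lock is retaken whenever the AG changes, so callers can keep
+ * that cheap by sorting the list by AG with xfs_alloc_busy_sort()
+ * first.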
+ */ +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_busy_extent *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_ALLOC_BUSY_DISCARDED; + else + xfs_alloc_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_busy_extent_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_busy_extent, list)->agno - + container_of(b, struct xfs_busy_extent, list)->agno; +} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h new file mode 100644 index 00000000..2f52b924 --- /dev/null +++ b/fs/xfs/xfs_alloc.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_H__ +#define __XFS_ALLOC_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xfs_perag; +struct xfs_trans; +struct xfs_busy_extent; + +/* + * Freespace allocation types. Argument to xfs_alloc_[v]extent. + */ +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; + +#define XFS_ALLOC_TYPES \ + { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ + { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ + { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ + { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ + { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ + { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ + { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } + +/* + * Flags for xfs_alloc_fix_freelist. 
+ */ +#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ + +/* + * In order to avoid ENOSPC-related deadlock caused by + * out-of-order locking of AGF buffer (PV 947395), we place + * constraints on the relationship among actual allocations for + * data blocks, freelist blocks, and potential file data bmap + * btree blocks. However, these restrictions may result in no + * actual space allocated for a delayed extent, for example, a data + * block in a certain AG is allocated but there is no additional + * block for the additional bmap btree block due to a split of the + * bmap btree of the file. The result of this may lead to an + * infinite loop in xfssyncd when the file gets flushed to disk and + * all delayed extents need to be actually allocated. To get around + * this, we explicitly set aside a few blocks which will not be + * reserved in delayed allocation. Considering the minimum number of + * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap + * btree requires 1 fsb, so we set the number of set-aside blocks + * to 4 + 4*agcount. + */ +#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* + * Argument structure for xfs_alloc routines. + * This is turned into a structure to avoid having 20 arguments passed + * down several levels of the stack. + */ +typedef struct xfs_alloc_arg { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for a.g. freelist header */ + struct xfs_perag *pag; /* per-ag struct for this agno */ + xfs_fsblock_t fsbno; /* file system block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agblock_t agbno; /* allocation group-relative block # */ + xfs_extlen_t minlen; /* minimum size of extent */ + xfs_extlen_t maxlen; /* maximum size of extent */ + xfs_extlen_t mod; /* mod value for extent size */ + xfs_extlen_t prod; /* prod value for extent size */ + xfs_extlen_t minleft; /* min blocks must be left after us */ + xfs_extlen_t total; /* total blocks needed in xaction */ + xfs_extlen_t alignment; /* align answer to multiple of this */ + xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_extlen_t len; /* output: actual size of extent */ + xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... 
*/ + xfs_alloctype_t otype; /* original allocation type */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ + char isfl; /* set if is freelist blocks - !acctg */ + char userdata; /* set if this is user data */ + xfs_fsblock_t firstblock; /* io first block allocated */ +} xfs_alloc_arg_t; + +/* + * Defines for userdata + */ +#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); + +#ifdef __KERNEL__ +void +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +int +xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_alloc_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_busy_extent_ag_cmp); +} + +#endif /* __KERNEL__ */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk); /* destination is a AGF btree */ + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* buffer for a.g. freelist header */ + int fields);/* mask of fields to be logged (XFS_AGF_...) */ + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags); /* XFS_ALLOC_FLAGS_... */ + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for a.g. freelist header */ + struct xfs_buf *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was a AGF btree */ + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp); /* buffer for the ag freelist header */ + +/* + * Allocate an extent (variable-size). + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args); /* allocation argument structure */ + +/* + * Free an extent. 
+ */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len); /* length of extent */ + +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + +#endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c new file mode 100644 index 00000000..2b351882 --- /dev/null +++ b/fs/xfs/xfs_alloc_btree.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +STATIC struct xfs_btree_cur * +xfs_allocbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} + +STATIC void +xfs_allocbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_allocbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. 
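+	 * A NULLAGBLOCK result means the AGFL is empty; report *stat = 0 so
+	 * the generic btree code knows that no block was allocated.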
*/ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_allocbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + return 0; +} + +/* + * Update the longest extent in the AGF + */ +STATIC void +xfs_allocbt_update_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, + int reason) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; + __be32 len; + int numrecs; + + ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + + switch (reason) { + case LASTREC_UPDATE: + /* + * If this is the last leaf block and it's the last record, + * then update the size of the longest extent in the AG. + */ + if (ptr != xfs_btree_get_numrecs(block)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_INSREC: + if (be32_to_cpu(rec->alloc.ar_blockcount) <= + be32_to_cpu(agf->agf_longest)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_DELREC: + numrecs = xfs_btree_get_numrecs(block); + if (ptr <= numrecs) + return; + ASSERT(ptr == numrecs + 1); + + if (numrecs) { + xfs_alloc_rec_t *rrp; + + rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); + len = rrp->ar_blockcount; + } else { + len = 0; + } + + break; + default: + ASSERT(0); + return; + } + + agf->agf_longest = len; + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); +} + +STATIC int +xfs_allocbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mnr[level != 0]; +} + +STATIC int +xfs_allocbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mxr[level != 0]; +} + +STATIC void +xfs_allocbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(rec->alloc.ar_startblock != 0); + + key->alloc.ar_startblock = rec->alloc.ar_startblock; + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; +} + +STATIC void +xfs_allocbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->alloc.ar_startblock != 0); + + rec->alloc.ar_startblock = key->alloc.ar_startblock; + rec->alloc.ar_blockcount = key->alloc.ar_blockcount; +} + +STATIC void +xfs_allocbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + ASSERT(cur->bc_rec.a.ar_startblock != 0); + + rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); + rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); +} + +STATIC void +xfs_allocbt_init_ptr_from_cur( 
+ struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} + +STATIC __int64_t +xfs_allocbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return (__int64_t)be32_to_cpu(kp->ar_startblock) - + rec->ar_startblock; + } + + diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + if (diff) + return diff; + + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} + +#ifdef DEBUG +STATIC int +xfs_allocbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); + } else { + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); + } +} + +STATIC int +xfs_allocbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); + } else { + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); + } +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_allocbt_trace_buf; + +STATIC void +xfs_allocbt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type, + (void *)func, (void *)s, NULL, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_allocbt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + *s0 = cur->bc_private.a.agno; + *l0 = cur->bc_rec.a.ar_startblock; + *l1 = cur->bc_rec.a.ar_blockcount; +} + +STATIC void +xfs_allocbt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be32_to_cpu(key->alloc.ar_startblock); + *l1 = be32_to_cpu(key->alloc.ar_blockcount); +} + +STATIC void +xfs_allocbt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + *l0 = be32_to_cpu(rec->alloc.ar_startblock); + *l1 = be32_to_cpu(rec->alloc.ar_blockcount); + *l2 = 0; +} +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_allocbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = 
xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_rec_from_key = xfs_allocbt_init_rec_from_key, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_allocbt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_allocbt_keys_inorder, + .recs_inorder = xfs_allocbt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_allocbt_trace_enter, + .trace_cursor = xfs_allocbt_trace_cursor, + .trace_key = xfs_allocbt_trace_key, + .trace_record = xfs_allocbt_trace_record, +#endif +}; + +/* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]); + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_CNT) + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an alloc btree block. + */ +int +xfs_allocbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_ALLOC_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +} diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h new file mode 100644 index 00000000..a6caa002 --- /dev/null +++ b/fs/xfs/xfs_alloc_btree.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_BTREE_H__ +#define __XFS_ALLOC_BTREE_H__ + +/* + * Freespace on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. 
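+ * (The by-bno tree is used for near-block allocation and for
+ * coalescing adjacent extents on free; the by-cnt tree is used for
+ * by-size allocation. Both index the same set of free extents.)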
+ */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + +/* + * Btree block header size depends on a superblock flag. + * + * (not quite yet, but soon) + */ +#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_ALLOC_REC_ADDR(mp, block, index) \ + ((xfs_alloc_rec_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_alloc_rec_t)))) + +#define XFS_ALLOC_KEY_ADDR(mp, block, index) \ + ((xfs_alloc_key_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_alloc_key_t))) + +#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_alloc_ptr_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_alloc_key_t) + \ + ((index) - 1) * sizeof(xfs_alloc_ptr_t))) + +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, + xfs_agnumber_t, xfs_btnum_t); +extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h new file mode 100644 index 00000000..09022493 --- /dev/null +++ b/fs/xfs/xfs_arch.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ARCH_H__ +#define __XFS_ARCH_H__ + +#ifndef XFS_BIG_INUMS +# error XFS_BIG_INUMS must be defined true or false +#endif + +#ifdef __KERNEL__ + +#include + +#ifdef __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + +#else /* __KERNEL__ */ + +#if __BYTE_ORDER == __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + +#ifdef XFS_NATIVE_HOST +#define cpu_to_be16(val) ((__force __be16)(__u16)(val)) +#define cpu_to_be32(val) ((__force __be32)(__u32)(val)) +#define cpu_to_be64(val) ((__force __be64)(__u64)(val)) +#define be16_to_cpu(val) ((__force __u16)(__be16)(val)) +#define be32_to_cpu(val) ((__force __u32)(__be32)(val)) +#define be64_to_cpu(val) ((__force __u64)(__be64)(val)) +#else +#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val))) +#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val))) +#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val))) +#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val))) +#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val))) +#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val))) +#endif + +static inline void be16_add_cpu(__be16 *a, __s16 b) +{ + *a = cpu_to_be16(be16_to_cpu(*a) + b); +} + +static inline void be32_add_cpu(__be32 *a, __s32 b) +{ + *a = cpu_to_be32(be32_to_cpu(*a) + b); +} + +static inline void be64_add_cpu(__be64 *a, __s64 b) +{ + *a = cpu_to_be64(be64_to_cpu(*a) + b); +} + +#endif /* __KERNEL__ */ + +/* + * get and set integers from potentially unaligned locations + */ + +#define INT_GET_UNALIGNED_16_BE(pointer) \ + ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1]))) +#define INT_SET_UNALIGNED_16_BE(pointer,value) \ + { \ + ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \ + ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ + } + +/* + * In directories inode numbers are stored as unaligned arrays of unsigned + * 8bit integers on disk. 
+ * + * For v1 directories or v2 directories that contain inode numbers that + * do not fit into 32bit the array has eight members, but the first member + * is always zero: + * + * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7| + * + * For v2 directories that only contain entries with inode numbers that fit + * into 32bits a four-member array is used: + * + * |24-31|16-23| 8-15| 0- 7| + */ + +#define XFS_GET_DIR_INO4(di) \ + (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + +#define XFS_PUT_DIR_INO4(from, di) \ +do { \ + (di).i[0] = (((from) & 0xff000000ULL) >> 24); \ + (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \ + (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \ + (di).i[3] = ((from) & 0x000000ffULL); \ +} while (0) + +#define XFS_DI_HI(di) \ + (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) +#define XFS_DI_LO(di) \ + (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) + +#define XFS_GET_DIR_INO8(di) \ + (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ + ((xfs_ino_t)XFS_DI_HI(di) << 32)) + +#define XFS_PUT_DIR_INO8(from, di) \ +do { \ + (di).i[0] = 0; \ + (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \ + (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \ + (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \ + (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \ + (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \ + (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \ + (di).i[7] = ((from) & 0x00000000000000ffULL); \ +} while (0) + +#endif /* __XFS_ARCH_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c new file mode 100644 index 00000000..99d40116 --- /dev/null +++ b/fs/xfs/xfs_attr.c @@ -0,0 +1,2230 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_rw.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +/* + * xfs_attr.c + * + * Provide the external interfaces to manage attribute lists. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute list fits inside the inode. 
+ */ +STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is one block. + */ +STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); + +/* + * Internal routines when attribute list is more than one block. + */ +STATIC int xfs_attr_node_get(xfs_da_args_t *args); +STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); +STATIC int xfs_attr_fillstate(xfs_da_state_t *state); +STATIC int xfs_attr_refillstate(xfs_da_state_t *state); + +/* + * Routines to manipulate out-of-line attribute values. + */ +STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); +STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +STATIC int +xfs_attr_name_to_xname( + struct xfs_name *xname, + const unsigned char *aname) +{ + if (!aname) + return EINVAL; + xname->name = aname; + xname->len = strlen((char *)aname); + if (xname->len >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + + return 0; +} + +STATIC int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +STATIC int +xfs_attr_get_int( + struct xfs_inode *ip, + struct xfs_name *name, + unsigned char *value, + int *valuelenp, + int flags) +{ + xfs_da_args_t args; + int error; + + if (!xfs_inode_hasattr(ip)) + return ENOATTR; + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name->name; + args.namelen = name->len; + args.value = value; + args.valuelen = *valuelenp; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = ip; + args.whichfork = XFS_ATTR_FORK; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_getvalue(&args); + } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_get(&args); + } else { + error = xfs_attr_node_get(&args); + } + + /* + * Return the number of bytes in the value to the caller. 
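+	 * (The lookup routines signal "attribute found" with EEXIST, which
+	 * is mapped back to success just below.)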
+ */ + *valuelenp = args.valuelen; + + if (error == EEXIST) + error = 0; + return(error); +} + +int +xfs_attr_get( + xfs_inode_t *ip, + const unsigned char *name, + unsigned char *value, + int *valuelenp, + int flags) +{ + int error; + struct xfs_name xname; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return(EIO); + + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return(error); +} + +/* + * Calculate how many blocks we need for the new attribute, + */ +STATIC int +xfs_attr_calc_size( + struct xfs_inode *ip, + int namelen, + int valuelen, + int *local) +{ + struct xfs_mount *mp = ip->i_mount; + int size; + int nblks; + + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(namelen, valuelen, + mp->m_sb.sb_blocksize, local); + + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (mp->m_sb.sb_blocksize >> 1)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. + */ + uint dblocks = XFS_B_TO_FSB(mp, valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + return nblks; +} + +STATIC int +xfs_attr_set_int( + struct xfs_inode *dp, + struct xfs_name *name, + unsigned char *value, + int valuelen, + int flags) +{ + xfs_da_args_t args; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error, err2, committed; + xfs_mount_t *mp = dp->i_mount; + int rsvd = (flags & ATTR_ROOT) != 0; + int local; + + /* + * Attach the dquots to the inode. + */ + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); + + if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) + return(error); + } + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name->name; + args.namelen = name->len; + args.value = value; + args.valuelen = valuelen; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.whichfork = XFS_ATTR_FORK; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + /* Size is now blocks for attribute data */ + args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. 
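+	 * (Re-logging the inode in each chained transaction moves its log
+	 * item forward, so this long-running chain never pins the log tail.)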
+ */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (rsvd) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSET_LOG_RES(mp, args.total), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + return (error); + } + + xfs_trans_ijoin(args.trans, dp); + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && + (dp->i_d.di_anextents == 0))) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(&args); + + /* + * Try to add the attr to the attribute list in + * the inode. + */ + error = xfs_attr_shortform_addname(&args); + if (error != ENOSPC) { + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + ASSERT(args.trans != NULL); + + /* + * If this is a synchronous mount, make sure that + * the transaction goes to disk before returning + * to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_trans_ichgtime(args.trans, dp, + XFS_ICHGTIME_CHG); + } + err2 = xfs_trans_commit(args.trans, + XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error == 0 ? err2 : error); + } + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + xfs_bmap_init(args.flist, args.firstblock); + error = xfs_attr_shortform_to_leaf(&args); + if (!error) { + error = xfs_bmap_finish(&args.trans, args.flist, + &committed); + } + if (error) { + ASSERT(committed); + args.trans = NULL; + xfs_bmap_cancel(&flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args.trans, dp); + + /* + * Commit the leaf transformation. We'll need another (linked) + * transaction to add the new attribute to the leaf. + */ + + error = xfs_trans_roll(&args.trans, dp); + if (error) + goto out; + + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_addname(&args); + } else { + error = xfs_attr_node_addname(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. 
+ */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +int +xfs_attr_set( + xfs_inode_t *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) +{ + int error; + struct xfs_name xname; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return (EIO); + + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + return xfs_attr_set_int(dp, &xname, value, valuelen, flags); +} + +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ +STATIC int +xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) +{ + xfs_da_args_t args; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error; + xfs_mount_t *mp = dp->i_mount; + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name->name; + args.namelen = name->len; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.total = 0; + args.whichfork = XFS_ATTR_FORK; + + /* + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. + */ + args.op_flags = XFS_DA_OP_OKNOENT; + + /* + * Attach the dquots to the inode. + */ + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (flags & ATTR_ROOT) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, + XFS_ATTRRM_SPACE_RES(mp), + XFS_ATTRRM_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRRM_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + /* + * No need to make quota reservations here. We expect to release some + * blocks not allocate in the common case. + */ + xfs_trans_ijoin(args.trans, dp); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) { + error = XFS_ERROR(ENOATTR); + goto out; + } + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(&args); + if (error) { + goto out; + } + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(&args); + } else { + error = xfs_attr_node_removename(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. 
+ */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +int +xfs_attr_remove( + xfs_inode_t *dp, + const unsigned char *name, + int flags) +{ + int error; + struct xfs_name xname; + + XFS_STATS_INC(xs_attr_remove); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return (EIO); + + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp)) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return XFS_ERROR(ENOATTR); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + return xfs_attr_remove_int(dp, &xname, flags); +} + +int +xfs_attr_list_int(xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +/*ARGSUSED*/ +STATIC int +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. 
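+ * The on-disk ROOT and SECURE flags of the entry must match the flags
+ * the caller passed in; entries with neither bit set belong to the user
+ * namespace and are only reported when neither ATTR_ROOT nor ATTR_SECURE
+ * was requested.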
+ */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); + return 0; +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) +{ + xfs_attr_list_context_t context; + struct attrlist *alist; + int error; + + /* + * Validate the cursor. + */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return XFS_ERROR(EINVAL); + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return XFS_ERROR(EFAULT); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + memset(&context, 0, sizeof(context)); + context.dp = dp; + context.cursor = cursor; + context.resynch = 1; + context.flags = flags; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error >= 0); + return error; +} + +int /* error */ +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return 0; + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ATTRINVAL_LOG_COUNT))) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp); + + /* + * Decide on what work routines to call based on the inode size. 
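+ * A missing or shortform (inline) attribute fork lives entirely inside
+ * the inode and has no blocks to invalidate; only leaf and node forms
+ * need xfs_attr_root_inactive() to walk and invalidate their blocks
+ * before the fork is truncated away.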
+ */ + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = 0; + goto out; + } + error = xfs_attr_root_inactive(&trans, dp); + if (error) + goto out; + + error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, 0); + if (error) + goto out; + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + + + +/*======================================================================== + * External routines when attribute list is inside the inode + *========================================================================*/ + +/* + * Add a name to the shortform attribute list structure + * This is the external routine. + */ +STATIC int +xfs_attr_shortform_addname(xfs_da_args_t *args) +{ + int newsize, forkoff, retval; + + retval = xfs_attr_shortform_lookup(args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + return(retval); + retval = xfs_attr_shortform_remove(args); + ASSERT(retval == 0); + } + + if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || + args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return(XFS_ERROR(ENOSPC)); + + newsize = XFS_ATTR_SF_TOTSIZE(args->dp); + newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + + forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); + if (!forkoff) + return(XFS_ERROR(ENOSPC)); + + xfs_attr_shortform_add(args, forkoff); + return(0); +} + + +/*======================================================================== + * External routines when attribute list is one block + *========================================================================*/ + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_addname(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval, error, committed, forkoff; + + /* + * Read the (only) block in the attribute list in. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + /* + * Look up the given attribute in the leaf block. Figure out if + * the given flags produce an error or call for an atomic rename. + */ + retval = xfs_attr_leaf_lookup_int(bp, args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + xfs_da_brelse(args->trans, bp); + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) { /* pure create op */ + xfs_da_brelse(args->trans, bp); + return(retval); + } + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + } + + /* + * Add the attribute to the leaf block, transitioning to a Btree + * if required. 
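+ * xfs_attr_leaf_add() returns ENOSPC when the entry will not fit in the
+ * single leaf block; in that case the leaf is converted to node (Btree)
+ * format below and the add is retried through xfs_attr_node_addname().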
+ */ + retval = xfs_attr_leaf_add(bp, args); + xfs_da_buf_done(bp); + if (retval == ENOSPC) { + /* + * Promote the attribute list to the Btree format, then + * Commit that transaction so that the node_addname() call + * can manage its own transactions. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + /* + * Commit the current trans (including the inode) and start + * a new one. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + + /* + * Fob the whole rest of the problem off on the Btree code. + */ + error = xfs_attr_node_addname(args); + return(error); + } + + /* + * Commit the transaction that added the attr name so that + * later routines can manage their own transactions. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return(error); + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr_leaf_flipflags(args); + if (error) + return(error); + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return(error); + } + + /* + * Read in the block containing the "old" attr, then + * remove the "old" attr from that block (neat, huh!) + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + (void)xfs_attr_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else + xfs_da_buf_done(bp); + + /* + * Commit the remove and start the next trans in series. 
+ */ + error = xfs_trans_roll(&args->trans, dp); + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr_leaf_clearflag(args); + } + return(error); +} + +/* + * Remove a name from the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_removename(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int error, committed, forkoff; + + /* + * Remove the attribute. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + + ASSERT(bp != NULL); + error = xfs_attr_leaf_lookup_int(bp, args); + if (error == ENOATTR) { + xfs_da_brelse(args->trans, bp); + return(error); + } + + (void)xfs_attr_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else + xfs_da_buf_done(bp); + return(0); +} + +/* + * Look up a name in a leaf attribute list structure. + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_get(xfs_da_args_t *args) +{ + xfs_dabuf_t *bp; + int error; + + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + error = xfs_attr_leaf_lookup_int(bp, args); + if (error != EEXIST) { + xfs_da_brelse(args->trans, bp); + return(error); + } + error = xfs_attr_leaf_getvalue(bp, args); + xfs_da_brelse(args->trans, bp); + if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { + error = xfs_attr_rmtval_get(args); + } + return(error); +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. + */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + xfs_attr_leafblock_t *leaf; + int error; + xfs_dabuf_t *bp; + + context->cursor->blkno = 0; + error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return XFS_ERROR(error); + ASSERT(bp != NULL); + leaf = bp->data; + if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, + context->dp->i_mount, leaf); + xfs_da_brelse(NULL, bp); + return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_attr_leaf_list_int(bp, context); + xfs_da_brelse(NULL, bp); + return XFS_ERROR(error); +} + + +/*======================================================================== + * External routines when attribute list size > XFS_LBSIZE(mp). + *========================================================================*/ + +/* + * Add a name to a Btree-format attribute list. 
+ * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. + */ +STATIC int +xfs_attr_node_addname(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_mount_t *mp; + int committed, retval, error; + + /* + * Fill in bucket of arguments/results/context to carry around. + */ + dp = args->dp; + mp = dp->i_mount; +restart: + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name already exists, and get back a pointer + * to where it should go. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) + goto out; + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + goto out; + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + goto out; + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + } + + retval = xfs_attr_leaf_add(blk->bp, state->args); + if (retval == ENOSPC) { + if (state->path.active == 1) { + /* + * Its really a single leaf node, but it had + * out-of-line values so it looked like it *might* + * have been a b-tree. + */ + xfs_da_state_free(state); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + /* + * Commit the node conversion and start the next + * trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + goto restart; + } + + /* + * Split as many Btree elements as required. + * This code tracks the new and old attr's location + * in the index/blkno/rmtblkno/rmtblkcnt fields and + * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da_split(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else { + /* + * Addition succeeded, update Btree hashvals. + */ + xfs_da_fixhashpath(state, &state->path); + } + + /* + * Kill the state structure, we're done with it and need to + * allow the buffers to come back later. + */ + xfs_da_state_free(state); + state = NULL; + + /* + * Commit the leaf addition or btree split and start the next + * trans in the chain. 
+ */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return(error); + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr_leaf_flipflags(args); + if (error) + goto out; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return(error); + } + + /* + * Re-find the "old" attribute entry after any split ops. + * The INCOMPLETE flag means that we will find the "old" + * attr, not the "new" one. + */ + args->flags |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + state->inleaf = 0; + error = xfs_da_node_lookup_int(state, &retval); + if (error) + goto out; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr_leaf_remove(blk->bp, args); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } + + /* + * Commit and start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr_leaf_clearflag(args); + if (error) + goto out; + } + retval = error = 0; + +out: + if (state) + xfs_da_state_free(state); + if (error) + return(error); + return(retval); +} + +/* + * Remove a name from a B-tree attribute list. + * + * This will involve walking down the Btree, and may involve joining + * leaf nodes and even joining intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_attr_node_removename(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval, error, committed, forkoff; + + /* + * Tie a string around our finger to remind us where we are. 
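+ * The removal proceeds in stages: any remote value blocks are freed
+ * first (with the entry marked INCOMPLETE), then the name itself is
+ * removed, the Btree is joined if it became too empty, and finally the
+ * fork is collapsed back to a single leaf or shortform if it now fits.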
+ */ + dp = args->dp; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error || (retval != EEXIST)) { + if (error == 0) + error = retval; + goto out; + } + + /* + * If there is an out-of-line value, de-allocate the blocks. + * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a deadlock. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if (args->rmtblkno > 0) { + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + goto out; + + /* + * Mark the attribute as INCOMPLETE, then bunmapi() the + * remote value. + */ + error = xfs_attr_leaf_setflag(args); + if (error) + goto out; + error = xfs_attr_rmtval_remove(args); + if (error) + goto out; + + /* + * Refill the state structure with buffers, the prior calls + * released our buffers. + */ + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr_leaf_remove(blk->bp, args); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + /* + * Commit the Btree join operation and start a new trans. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + } + + /* + * If the result is small enough, push it all into the inode. + */ + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + xfs_da_buf_done(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(be16_to_cpu(((xfs_attr_leafblock_t *) + bp->data)->hdr.info.magic) + == XFS_ATTR_LEAF_MAGIC); + + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. 
+ */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else + xfs_da_brelse(args->trans, bp); + } + error = 0; + +out: + xfs_da_state_free(state); + return(error); +} + +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +STATIC int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_da_blkno(blk->bp); + xfs_da_buf_done(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_da_blkno(blk->bp); + xfs_da_buf_done(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return(0); +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +STATIC int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da_read_buf(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return(error); + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da_read_buf(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return(error); + } else { + blk->bp = NULL; + } + } + + return(0); +} + +/* + * Look up a filename in a node attribute list. + * + * This routine gets called for any attribute fork that has more than one + * block, ie: both true Btree attr lists and for single-leaf-blocks with + * "remote" values taking up more blocks. 
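+ * The lookup walks the da-btree down to the leaf that should hold the
+ * name, copies the value out (from the leaf itself or from its remote
+ * blocks), and releases all path buffers before returning.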
+ */ +STATIC int +xfs_attr_node_get(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int error, retval; + int i; + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) { + retval = error; + } else if (retval == EEXIST) { + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Get the value, local or "remote" + */ + retval = xfs_attr_leaf_getvalue(blk->bp, args); + if (!retval && (args->rmtblkno > 0) + && !(args->flags & ATTR_KERNOVAL)) { + retval = xfs_attr_rmtval_get(args); + } + } + + /* + * If not in a transaction, we have to release all the buffers. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return(retval); +} + +STATIC int /* error */ +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int error, i; + xfs_dabuf_t *bp; + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. + */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) { + node = bp->data; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + leaf = bp->data; + if (cursor->hashval > be32_to_cpu(leaf->entries[ + be16_to_cpu(leaf->hdr.count)-1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= + be32_to_cpu(leaf->entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. 
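+ * The descent starts at block 0 and, at each node, follows the first
+ * entry whose hashval is >= the cursor's hashval, until a leaf block is
+ * reached.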
+ */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + error = xfs_da_read_buf(NULL, context->dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (unlikely(bp == NULL)) { + XFS_ERROR_REPORT("xfs_attr_node_list(2)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount); + return(XFS_ERROR(EFSCORRUPTED)); + } + node = bp->data; + if (be16_to_cpu(node->hdr.info.magic) + == XFS_ATTR_LEAF_MAGIC) + break; + if (unlikely(be16_to_cpu(node->hdr.info.magic) + != XFS_DA_NODE_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + btree = node->btree; + for (i = 0; i < be16_to_cpu(node->hdr.count); + btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == be16_to_cpu(node->hdr.count)) { + xfs_da_brelse(NULL, bp); + return(0); + } + xfs_da_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->data; + if (unlikely(be16_to_cpu(leaf->hdr.info.magic) + != XFS_ATTR_LEAF_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, leaf); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + error = xfs_attr_leaf_list_int(bp, context); + if (error) { + xfs_da_brelse(NULL, bp); + return error; + } + if (context->seen_enough || leaf->hdr.info.forw == 0) + break; + cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); + xfs_da_brelse(NULL, bp); + error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + if (unlikely((bp == NULL))) { + XFS_ERROR_REPORT("xfs_attr_node_list(5)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount); + return(XFS_ERROR(EFSCORRUPTED)); + } + } + xfs_da_brelse(NULL, bp); + return(0); +} + + +/*======================================================================== + * External routines for manipulating out-of-line attribute values. + *========================================================================*/ + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get(xfs_da_args_t *args) +{ + xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; + xfs_mount_t *mp; + xfs_daddr_t dblkno; + void *dst; + xfs_buf_t *bp; + int nmap, error, tmp, valuelen, blkcnt, i; + xfs_dablk_t lblkno; + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + + mp = args->dp->i_mount; + dst = args->value; + valuelen = args->valuelen; + lblkno = args->rmtblkno; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, map, &nmap, NULL); + if (error) + return(error); + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, + blkcnt, XBF_LOCK | XBF_DONT_BLOCK, + &bp); + if (error) + return(error); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) + ? 
valuelen : XFS_BUF_SIZE(bp); + xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); + xfs_buf_relse(bp); + dst += tmp; + valuelen -= tmp; + + lblkno += map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +STATIC int +xfs_attr_rmtval_set(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_fileoff_t lfileoff; + xfs_inode_t *dp; + xfs_bmbt_irec_t map; + xfs_daddr_t dblkno; + void *src; + xfs_buf_t *bp; + xfs_dablk_t lblkno; + int blkcnt, valuelen, nmap, error, tmp, committed; + + dp = args->dp; + mp = dp->i_mount; + src = args->value; + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. + */ + blkcnt = XFS_B_TO_FSB(mp, args->valuelen); + lfileoff = 0; + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) { + return(error); + } + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | + XFS_BMAPI_WRITE, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + valuelen = args->valuelen; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, + NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, + XBF_LOCK | XBF_DONT_BLOCK); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) ? 
valuelen : + XFS_BUF_SIZE(bp); + xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); + if (tmp < XFS_BUF_SIZE(bp)) + xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ + return (error); + } + src += tmp; + valuelen -= tmp; + + lblkno += map.br_blockcount; + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +STATIC int +xfs_attr_rmtval_remove(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_bmbt_irec_t map; + xfs_buf_t *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + int valuelen, blkcnt, nmap, error, done, committed; + + mp = args->dp->i_mount; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + lblkno = args->rmtblkno; + valuelen = args->rmtblkcnt; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, + args->flist); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + if (bp) { + XFS_BUF_STALE(bp); + XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + valuelen -= map.br_blockcount; + + lblkno += map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h new file mode 100644 index 00000000..e920d68e --- /dev/null +++ b/fs/xfs/xfs_attr.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + +/* + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * External interfaces + *========================================================================*/ + + +#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ +#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ + +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ + +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" } + +/* + * The maximum size (into the kernel or returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an ERANGE return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Kernel-internal version of the attrlist cursor. 
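+ * The pad fields must be zero and "initted" says whether hashval, blkno
+ * and offset carry a valid resume position; xfs_attr_list() rejects a
+ * cursor that violates either rule with EINVAL.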
+ */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Structure used to pass context around among the routines. + *========================================================================*/ + + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + unsigned char *, int, int, unsigned char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Overall external interface routines. + */ +int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_list_int(struct xfs_attr_list_context *); + +#endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c new file mode 100644 index 00000000..f49ecf2e --- /dev/null +++ b/fs/xfs/xfs_attr_leaf.c @@ -0,0 +1,2979 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * xfs_attr_leaf.c + * + * Routines to implement leaf blocks of attributes as Btrees of hashed names. 
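+ * Each leaf block carries a header and a hash-ordered table of entries
+ * at the front, with the name/value bytes packed in from the end of the
+ * block; values too large to sit in the leaf are kept in separate
+ * "remote" blocks and only referenced from the entry.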
+ */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, + xfs_dabuf_t **bpp); +STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, + int freemap_index); +STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); +STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + xfs_da_state_blk_t *leaf_blk_2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); + +/* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dabuf_t *bp, int level); +STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dabuf_t *bp); +STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dablk_t blkno, int blkcnt); + +/* + * Utility routines. + */ +STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, + int src_start, + xfs_attr_leafblock_t *dst_leaf, + int dst_start, int move_count, + xfs_mount_t *mp); +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); + +/*======================================================================== + * Namespace helper routines + *========================================================================*/ + +/* + * If namespace bits don't match return 0. + * If all match then return 1. + */ +STATIC int +xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +{ + return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); +} + + +/*======================================================================== + * External routines when attribute fork size < XFS_LITINO(mp). + *========================================================================*/ + +/* + * Query whether the requested number of additional bytes of extended + * attribute space will be able to fit inline. + * + * Returns zero if not, else the di_forkoff fork offset to be used in the + * literal area for attribute data once the new bytes have been added. + * + * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value; + * special case for dev/uuid inodes, they have fixed size data forks. + */ +int +xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) +{ + int offset; + int minforkoff; /* lower limit on valid forkoff locations */ + int maxforkoff; /* upper limit on valid forkoff locations */ + int dsize; + xfs_mount_t *mp = dp->i_mount; + + offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + return (offset >= minforkoff) ? minforkoff : 0; + case XFS_DINODE_FMT_UUID: + minforkoff = roundup(sizeof(uuid_t), 8) >> 3; + return (offset >= minforkoff) ? minforkoff : 0; + } + + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. 
In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) + return 0; + + dsize = dp->i_df.if_bytes; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* + * If there is no attr fork and the data fork is extents, + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for + * the btree root. + */ + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + xfs_default_attroffset(dp)) + dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + break; + case XFS_DINODE_FMT_BTREE: + /* + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have + * plenty of room for attrs + */ + if (dp->i_d.di_forkoff) { + if (offset < dp->i_d.di_forkoff) + return 0; + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + break; + } + + /* + * A data fork btree root must have space for at least + * MINDBTPTRS key/ptr pairs if the data fork is small or empty. + */ + minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = roundup(minforkoff, 8) >> 3; + + /* attr fork btree root can have at least this many key/ptr pairs */ + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = maxforkoff >> 3; /* rounded down */ + + if (offset >= maxforkoff) + return maxforkoff; + if (offset >= minforkoff) + return offset; + return 0; +} + +/* + * Switch on the ATTR2 superblock bit (implies also FEATURES2) + */ +STATIC void +xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) +{ + if ((mp->m_flags & XFS_MOUNT_ATTR2) && + !(xfs_sb_version_hasattr2(&mp->m_sb))) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr2(&mp->m_sb)) { + xfs_sb_version_addattr2(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } else + spin_unlock(&mp->m_sb_lock); + } +} + +/* + * Create the initial contents of a shortform attribute list. + */ +void +xfs_attr_shortform_create(xfs_da_args_t *args) +{ + xfs_attr_sf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + ASSERT(dp != NULL); + ifp = dp->i_afp; + ASSERT(ifp != NULL); + ASSERT(ifp->if_bytes == 0); + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_flags |= XFS_IFINLINE; + } else { + ASSERT(ifp->if_flags & XFS_IFINLINE); + } + xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); + hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; + hdr->count = 0; + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); +} + +/* + * Add a name/value pair to the shortform attribute list. + * Overflow from the inode has already been checked for. 
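+ * The caller (xfs_attr_shortform_addname) has already verified the new
+ * entry fits via xfs_attr_shortform_bytesfit(), so this routine only has
+ * to grow the inline fork and append the entry after the existing ones.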
+ */ +void +xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i, offset, size; + xfs_mount_t *mp; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + mp = dp->i_mount; + dp->i_d.di_forkoff = forkoff; + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + + ifp = dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { +#ifdef DEBUG + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + ASSERT(0); +#endif + } + + offset = (char *)sfe - (char *)sf; + size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + + sfe->namelen = args->namelen; + sfe->valuelen = args->valuelen; + sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + memcpy(sfe->nameval, args->name, args->namelen); + memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); + sf->hdr.count++; + be16_add_cpu(&sf->hdr.totsize, size); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + xfs_sbversion_add_attr2(mp, args->trans); +} + +/* + * After the last attribute is removed revert to original inode format, + * making all literal area available to the data fork once more. + */ +STATIC void +xfs_attr_fork_reset( + struct xfs_inode *ip, + struct xfs_trans *tp) +{ + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ip->i_d.di_forkoff = 0; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_afp == NULL); + + ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * Remove an attribute from the shortform attribute list structure. 
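+ * Returns ENOATTR if the name is not present. Otherwise the hole left
+ * by the entry is closed up and, when the last attribute is removed on
+ * an attr2 filesystem, the attribute fork itself may be reset to an
+ * empty extents fork.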
+ */ +int +xfs_attr_shortform_remove(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int base, size=0, end, totsize, i; + xfs_mount_t *mp; + xfs_inode_t *dp; + + dp = args->dp; + mp = dp->i_mount; + base = sizeof(xfs_attr_sf_hdr_t); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + end = sf->hdr.count; + for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + base += size, i++) { + size = XFS_ATTR_SF_ENTSIZE(sfe); + if (sfe->namelen != args->namelen) + continue; + if (memcmp(sfe->nameval, args->name, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + break; + } + if (i == end) + return(XFS_ERROR(ENOATTR)); + + /* + * Fix up the attribute fork data, covering the hole + */ + end = base + size; + totsize = be16_to_cpu(sf->hdr.totsize); + if (end != totsize) + memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); + sf->hdr.count--; + be16_add_cpu(&sf->hdr.totsize, -size); + + /* + * Fix up the start offset of the attribute fork + */ + totsize -= size; + if (totsize == sizeof(xfs_attr_sf_hdr_t) && + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & XFS_DA_OP_ADDNAME)) { + xfs_attr_fork_reset(dp, args->trans); + } else { + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); + ASSERT(dp->i_d.di_forkoff); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + (args->op_flags & XFS_DA_OP_ADDNAME) || + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + xfs_trans_log_inode(args->trans, dp, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + } + + xfs_sbversion_add_attr2(mp, args->trans); + + return(0); +} + +/* + * Look up a name in a shortform attribute list structure. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_lookup(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + xfs_ifork_t *ifp; + + ifp = args->dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Look up a name in a shortform attribute list structure. 
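The shortform add, remove and lookup routines above all walk the same packed array of variable-length entries by advancing with XFS_ATTR_SF_NEXTENTRY(). A minimal user-space sketch of that traversal pattern, using a simplified stand-in struct (this is not the on-disk xfs_attr_sf_entry_t layout):

#include <stddef.h>

struct sf_entry {
	unsigned char namelen;
	unsigned char valuelen;
	unsigned char flags;
	unsigned char nameval[];	/* namelen name bytes, then valuelen value bytes */
};

/* total bytes consumed by one packed entry */
static size_t sf_entsize(const struct sf_entry *e)
{
	return sizeof(*e) + e->namelen + e->valuelen;
}

/* step to the entry that immediately follows e in the packed array */
static struct sf_entry *sf_next(struct sf_entry *e)
{
	return (struct sf_entry *)((char *)e + sf_entsize(e));
}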
+ */ +/*ARGSUSED*/ +int +xfs_attr_shortform_getvalue(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + + ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = sfe->valuelen; + return(XFS_ERROR(EEXIST)); + } + if (args->valuelen < sfe->valuelen) { + args->valuelen = sfe->valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = sfe->valuelen; + memcpy(args->value, &sfe->nameval[args->namelen], + args->valuelen); + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Convert from using the shortform to the leaf. + */ +int +xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_da_args_t nargs; + char *tmpbuffer; + int error, i, size; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + xfs_ifork_t *ifp; + + dp = args->dp; + ifp = dp->i_afp; + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + size = be16_to_cpu(sf->hdr.totsize); + tmpbuffer = kmem_alloc(size, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, ifp->if_u1.if_data, size); + sf = (xfs_attr_shortform_t *)tmpbuffer; + + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + bp = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) { + /* + * If we hit an IO error middle of the transaction inside + * grow_inode(), we may have inconsistent data. Bail out. + */ + if (error == EIO) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + ASSERT(blkno == 0); + error = xfs_attr_leaf_create(args, blkno, &bp); + if (error) { + error = xfs_da_shrink_inode(args, 0, bp); + bp = NULL; + if (error) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.op_flags = XFS_DA_OP_OKNOENT; + + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; i++) { + nargs.name = sfe->nameval; + nargs.namelen = sfe->namelen; + nargs.value = &sfe->nameval[nargs.namelen]; + nargs.valuelen = sfe->valuelen; + nargs.hashval = xfs_da_hashname(sfe->nameval, + sfe->namelen); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == ENOATTR); + error = xfs_attr_leaf_add(bp, &nargs); + ASSERT(error != ENOSPC); + if (error) + goto out; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + error = 0; + +out: + if(bp) + xfs_da_buf_done(bp); + kmem_free(tmpbuffer); + return(error); +} + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (sa->hash < sb->hash) { + return(-1); + } else if (sa->hash > sb->hash) { + return(1); + } else { + return(sa->entno - sb->entno); + } +} + + +#define XFS_ISRESET_CURSOR(cursor) \ + 
(!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + int error; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return(0); + cursor = context->cursor; + ASSERT(cursor != NULL); + + trace_xfs_attr_list_sf(context); + + /* + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. + * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. + */ + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + error = context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + &sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + trace_xfs_attr_list_sf_all(context); + return(0); + } + + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return XFS_ERROR(EFSCORRUPTED); + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. 
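A stand-alone sketch of the sort rule fed to xfs_sort() above: the primary key is the name hash, with ties broken by the original entry number so equal-hash entries keep a stable order across calls. Struct and function names here are illustrative, not the kernel's:

#include <stdlib.h>

struct sf_sort {
	unsigned int hash;	/* xfs_da_hashname() of the attribute name */
	int entno;		/* position in the unsorted shortform list */
};

static int sf_sort_cmp(const void *a, const void *b)
{
	const struct sf_sort *sa = a, *sb = b;

	if (sa->hash < sb->hash)
		return -1;
	if (sa->hash > sb->hash)
		return 1;
	return sa->entno - sb->entno;
}

/* usage, mirroring the call above: qsort(sbuf, nsbuf, sizeof(*sbuf), sf_sort_cmp); */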
+ */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return(0); +} + +/* + * Check a leaf attribute block to see if all the entries would fit into + * a shortform attribute list. + */ +int +xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + int bytes, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + + entry = &leaf->entries[0]; + bytes = sizeof(struct xfs_attr_sf_hdr); + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!(entry->flags & XFS_ATTR_LOCAL)) + return(0); + name_loc = xfs_attr_leaf_name_local(leaf, i); + if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + bytes += sizeof(struct xfs_attr_sf_entry)-1 + + name_loc->namelen + + be16_to_cpu(name_loc->valuelen); + } + if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (bytes == sizeof(struct xfs_attr_sf_hdr))) + return(-1); + return(xfs_attr_shortform_bytesfit(dp, bytes)); +} + +/* + * Convert a leaf attribute list to shortform attribute list + */ +int +xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_da_args_t nargs; + xfs_inode_t *dp; + char *tmpbuffer; + int error, i; + + dp = args->dp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + + ASSERT(bp != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + + /* + * Clean out the prior contents of the attribute list. 
+ */ + error = xfs_da_shrink_inode(args, 0, bp); + if (error) + goto out; + + if (forkoff == -1) { + ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_reset(dp, args->trans); + goto out; + } + + xfs_attr_shortform_create(args); + + /* + * Copy the attributes + */ + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.op_flags = XFS_DA_OP_OKNOENT; + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!entry->nameidx) + continue; + ASSERT(entry->flags & XFS_ATTR_LOCAL); + name_loc = xfs_attr_leaf_name_local(leaf, i); + nargs.name = name_loc->nameval; + nargs.namelen = name_loc->namelen; + nargs.value = &name_loc->nameval[nargs.namelen]; + nargs.valuelen = be16_to_cpu(name_loc->valuelen); + nargs.hashval = be32_to_cpu(entry->hashval); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + xfs_attr_shortform_add(&nargs, forkoff); + } + error = 0; + +out: + kmem_free(tmpbuffer); + return(error); +} + +/* + * Convert from using a single leaf to a root node and a leaf. + */ +int +xfs_attr_leaf_to_node(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_inode_t *dp; + xfs_dabuf_t *bp1, *bp2; + xfs_dablk_t blkno; + int error; + + dp = args->dp; + bp1 = bp2 = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) + goto out; + error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(bp1 != NULL); + bp2 = NULL; + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(bp2 != NULL); + memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); + xfs_da_buf_done(bp1); + bp1 = NULL; + xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + if (error) + goto out; + node = bp1->data; + leaf = bp2->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + /* both on-disk, don't endian-flip twice */ + node->btree[0].hashval = + leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; + node->btree[0].before = cpu_to_be32(blkno); + node->hdr.count = cpu_to_be16(1); + xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + error = 0; +out: + if (bp1) + xfs_da_buf_done(bp1); + if (bp2) + xfs_da_buf_done(bp2); + return(error); +} + + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of a leaf attribute list + * or a leaf in a node attribute list. 
+ */ +STATIC int +xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int error; + + dp = args->dp; + ASSERT(dp != NULL); + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + leaf = bp->data; + memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); + hdr = &leaf->hdr; + hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); + if (!hdr->firstused) { + hdr->firstused = cpu_to_be16( + XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); + } + + hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); + hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - + sizeof(xfs_attr_leaf_hdr_t)); + + xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + + *bpp = bp; + return(0); +} + +/* + * Split the leaf node, rebalance, then add the new entry. + */ +int +xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_dablk_t blkno; + int error; + + /* + * Allocate space for a new leaf node. + */ + ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); + error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + if (error) + return(error); + newblk->blkno = blkno; + newblk->magic = XFS_ATTR_LEAF_MAGIC; + + /* + * Rebalance the entries across the two leaves. + * NOTE: rebalance() currently depends on the 2nd block being empty. + */ + xfs_attr_leaf_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + + /* + * Save info on "old" attribute for "atomic rename" ops, leaf_add() + * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the + * "new" attrs info. Will need the "old" info to remove it later. + * + * Insert the "new" entry in the correct block. + */ + if (state->inleaf) + error = xfs_attr_leaf_add(oldblk->bp, state->args); + else + error = xfs_attr_leaf_add(newblk->bp, state->args); + + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); + return(error); +} + +/* + * Add a name to the leaf attribute list structure. + */ +int +xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_map_t *map; + int tablesize, entsize, sum, tmp, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT((args->index >= 0) + && (args->index <= be16_to_cpu(leaf->hdr.count))); + hdr = &leaf->hdr; + entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + args->trans->t_mountp->m_sb.sb_blocksize, NULL); + + /* + * Search through freemap for first-fit on new name length. 
+ * (may need to figure in size of entry struct too) + */ + tablesize = (be16_to_cpu(hdr->count) + 1) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { + if (tablesize > be16_to_cpu(hdr->firstused)) { + sum += be16_to_cpu(map->size); + continue; + } + if (!map->size) + continue; /* no space in this map */ + tmp = entsize; + if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + tmp += sizeof(xfs_attr_leaf_entry_t); + if (be16_to_cpu(map->size) >= tmp) { + tmp = xfs_attr_leaf_add_work(bp, args, i); + return(tmp); + } + sum += be16_to_cpu(map->size); + } + + /* + * If there are no holes in the address space of the block, + * and we don't have enough freespace, then compaction will do us + * no good and we should just give up. + */ + if (!hdr->holes && (sum < entsize)) + return(XFS_ERROR(ENOSPC)); + + /* + * Compact the entries to coalesce free space. + * This may change the hdr->count via dropping INCOMPLETE entries. + */ + xfs_attr_leaf_compact(args->trans, bp); + + /* + * After compaction, the block is guaranteed to have only one + * free region, in freemap[0]. If it is not big enough, give up. + */ + if (be16_to_cpu(hdr->freemap[0].size) + < (entsize + sizeof(xfs_attr_leaf_entry_t))) + return(XFS_ERROR(ENOSPC)); + + return(xfs_attr_leaf_add_work(bp, args, 0)); +} + +/* + * Add a name to a leaf attribute list structure. + */ +STATIC int +xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_attr_leaf_map_t *map; + xfs_mount_t *mp; + int tmp, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + hdr = &leaf->hdr; + ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); + ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + + /* + * Force open some space in the entry array and fill it in. + */ + entry = &leaf->entries[args->index]; + if (args->index < be16_to_cpu(hdr->count)) { + tmp = be16_to_cpu(hdr->count) - args->index; + tmp *= sizeof(xfs_attr_leaf_entry_t); + memmove((char *)(entry+1), (char *)entry, tmp); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + } + be16_add_cpu(&hdr->count, 1); + + /* + * Allocate space for the new string (at the end of the run). + */ + map = &hdr->freemap[mapindex]; + mp = args->trans->t_mountp; + ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); + ASSERT((be16_to_cpu(map->base) & 0x3) == 0); + ASSERT(be16_to_cpu(map->size) >= + xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, NULL)); + ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); + ASSERT((be16_to_cpu(map->size) & 0x3) == 0); + be16_add_cpu(&map->size, + -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, &tmp)); + entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + + be16_to_cpu(map->size)); + entry->hashval = cpu_to_be32(args->hashval); + entry->flags = tmp ? 
XFS_ATTR_LOCAL : 0; + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + if (args->op_flags & XFS_DA_OP_RENAME) { + entry->flags |= XFS_ATTR_INCOMPLETE; + if ((args->blkno2 == args->blkno) && + (args->index2 <= args->index)) { + args->index2++; + } + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + ASSERT((args->index == 0) || + (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); + ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); + + /* + * Copy the attribute name and value into the new space. + * + * For "remote" attribute values, simply note that we need to + * allocate space for the "remote" value. We can't actually + * allocate the extents in this transaction, and we can't decide + * which blocks they should be as we might allocate more blocks + * as part of this transaction (a split operation for example). + */ + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc->namelen = args->namelen; + name_loc->valuelen = cpu_to_be16(args->valuelen); + memcpy((char *)name_loc->nameval, args->name, args->namelen); + memcpy((char *)&name_loc->nameval[args->namelen], args->value, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt->namelen = args->namelen; + memcpy((char *)name_rmt->name, args->name, args->namelen); + entry->flags |= XFS_ATTR_INCOMPLETE; + /* just in case */ + name_rmt->valuelen = 0; + name_rmt->valueblk = 0; + args->rmtblkno = 1; + args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + xfs_attr_leaf_entsize(leaf, args->index))); + + /* + * Update the control info for this leaf node + */ + if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { + /* both on-disk, don't endian-flip twice */ + hdr->firstused = entry->nameidx; + } + ASSERT(be16_to_cpu(hdr->firstused) >= + ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); + tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[0]; + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + if (be16_to_cpu(map->base) == tmp) { + be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); + be16_add_cpu(&map->size, + -((int)sizeof(xfs_attr_leaf_entry_t))); + } + } + be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + return(0); +} + +/* + * Garbage collect a leaf attribute list block by copying it to a new buffer. 
+ */ +STATIC void +xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +{ + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + xfs_mount_t *mp; + char *tmpbuffer; + + mp = trans->t_mountp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); + memset(bp->data, 0, XFS_LBSIZE(mp)); + + /* + * Copy basic information + */ + leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_d = bp->data; + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + hdr_d->info = hdr_s->info; /* struct copy */ + hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); + /* handle truncation gracefully */ + if (!hdr_d->firstused) { + hdr_d->firstused = cpu_to_be16( + XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); + } + hdr_d->usedbytes = 0; + hdr_d->count = 0; + hdr_d->holes = 0; + hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); + hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - + sizeof(xfs_attr_leaf_hdr_t)); + + /* + * Copy all entry's in the same (sorted) order, + * but allocate name/value pairs packed and in sequence. + */ + xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, + be16_to_cpu(hdr_s->count), mp); + xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + + kmem_free(tmpbuffer); +} + +/* + * Redistribute the attribute list entries between two leaf nodes, + * taking into account the size of the new entry. + * + * NOTE: if new block is empty, then it will get the upper half of the + * old block. At present, all (one) callers pass in an empty second block. + * + * This code adjusts the args->index/blkno and args->index2/blkno2 fields + * to match what it is doing in splitting the attribute leaf block. Those + * values are used in "atomic rename" operations on attributes. Note that + * the "new" and "old" values can end up in different blocks. + */ +STATIC void +xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_args_t *args; + xfs_da_state_blk_t *tmp_blk; + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_hdr_t *hdr1, *hdr2; + int count, totallen, max, space, swap; + + /* + * Set up environment. + */ + ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + args = state->args; + + /* + * Check ordering of blocks, reverse if it makes things simpler. + * + * NOTE: Given that all (current) callers pass in an empty + * second block, this code should never set "swap". + */ + swap = 0; + if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + tmp_blk = blk1; + blk1 = blk2; + blk2 = tmp_blk; + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + swap = 1; + } + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. Then get + * the direction to copy and the number of elements to move. + * + * "inleaf" is true if the new entry should be inserted into blk1. + * If "swap" is also true, then reverse the sense of "inleaf". 
+ */ + state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, + &count, &totallen); + if (swap) + state->inleaf = !state->inleaf; + + /* + * Move any entries required from leaf to leaf: + */ + if (count < be16_to_cpu(hdr1->count)) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count = be16_to_cpu(hdr1->count) - count; + space = be16_to_cpu(hdr1->usedbytes) - totallen; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf2 is the destination, compact it if it looks tight. + */ + max = be16_to_cpu(hdr2->firstused) + - sizeof(xfs_attr_leaf_hdr_t); + max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); + if (space > max) { + xfs_attr_leaf_compact(args->trans, blk2->bp); + } + + /* + * Move high entries from leaf1 to low end of leaf2. + */ + xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, + leaf2, 0, count, state->mp); + + xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + } else if (count > be16_to_cpu(hdr1->count)) { + /* + * I assert that since all callers pass in an empty + * second buffer, this code should never execute. + */ + + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count -= be16_to_cpu(hdr1->count); + space = totallen - be16_to_cpu(hdr1->usedbytes); + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf1 is the destination, compact it if it looks tight. + */ + max = be16_to_cpu(hdr1->firstused) + - sizeof(xfs_attr_leaf_hdr_t); + max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); + if (space > max) { + xfs_attr_leaf_compact(args->trans, blk1->bp); + } + + /* + * Move low entries from leaf2 to high end of leaf1. + */ + xfs_attr_leaf_moveents(leaf2, 0, leaf1, + be16_to_cpu(hdr1->count), count, state->mp); + + xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + } + + /* + * Copy out last hashval in each block for B-tree code. + */ + blk1->hashval = be32_to_cpu( + leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); + blk2->hashval = be32_to_cpu( + leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + + /* + * Adjust the expected index for insertion. + * NOTE: this code depends on the (current) situation that the + * second block was originally empty. + * + * If the insertion point moved to the 2nd block, we must adjust + * the index. We must also track the entry just following the + * new entry for use in an "atomic rename" operation, that entry + * is always the "old" entry and the "new" entry is what we are + * inserting. The index/blkno fields refer to the "old" entry, + * while the index2/blkno2 fields refer to the "new" entry. 
+ */ + if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + ASSERT(state->inleaf == 0); + blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + if (state->inleaf) { + args->index = blk1->index; + args->blkno = blk1->blkno; + args->index2 = 0; + args->blkno2 = blk2->blkno; + } else { + blk2->index = blk1->index + - be16_to_cpu(leaf1->hdr.count); + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } + } else { + ASSERT(state->inleaf == 1); + args->index = args->index2 = blk1->index; + args->blkno = args->blkno2 = blk1->blkno; + } +} + +/* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + * GROT: Is this really necessary? With other than a 512 byte blocksize, + * GROT: there will always be enough room in either block for a new entry. + * GROT: Do a double-split for this case? + */ +STATIC int +xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2, + int *countarg, int *usedbytesarg) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_hdr_t *hdr1, *hdr2; + xfs_attr_leaf_entry_t *entry; + int count, max, index, totallen, half; + int lastdelta, foundit, tmp; + + /* + * Set up environment. + */ + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + foundit = 0; + totallen = 0; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + */ + max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); + half = (max+1) * sizeof(*entry); + half += be16_to_cpu(hdr1->usedbytes) + + be16_to_cpu(hdr2->usedbytes) + + xfs_attr_leaf_newentsize( + state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); + half /= 2; + lastdelta = state->blocksize; + entry = &leaf1->entries[0]; + for (count = index = 0; count < max; entry++, index++, count++) { + +#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) + /* + * The new entry is in the first block, account for it. + */ + if (count == blk1->index) { + tmp = totallen + sizeof(*entry) + + xfs_attr_leaf_newentsize( + state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; + foundit = 1; + } + + /* + * Wrap around into the second block if necessary. + */ + if (count == be16_to_cpu(hdr1->count)) { + leaf1 = leaf2; + entry = &leaf1->entries[0]; + index = 0; + } + + /* + * Figure out if next leaf entry would be too much. + */ + tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1, + index); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; +#undef XFS_ATTR_ABS + } + + /* + * Calculate the number of usedbytes that will end up in lower block. + * If new entry not in lower block, fix up the count. + */ + totallen -= count * sizeof(*entry); + if (foundit) { + totallen -= sizeof(*entry) + + xfs_attr_leaf_newentsize( + state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); + } + + *countarg = count; + *usedbytesarg = totallen; + return(foundit); +} + +/*======================================================================== + * Routines used for shrinking the Btree. 
+ *========================================================================*/ + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + * + * GROT: allow for INCOMPLETE entries in calculation. + */ +int +xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_attr_leafblock_t *leaf; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, bytes, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); + leaf = (xfs_attr_leafblock_t *)info; + count = be16_to_cpu(leaf->hdr.count); + bytes = sizeof(xfs_attr_leaf_hdr_t) + + count * sizeof(xfs_attr_leaf_entry_t) + + be16_to_cpu(leaf->hdr.usedbytes); + if (bytes > (state->blocksize >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink an attribute list over time. + */ + /* start with smaller blk num */ + forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = be32_to_cpu(info->forw); + else + blkno = be32_to_cpu(info->back); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + leaf = (xfs_attr_leafblock_t *)info; + count = be16_to_cpu(leaf->hdr.count); + bytes = state->blocksize - (state->blocksize>>2); + bytes -= be16_to_cpu(leaf->hdr.usedbytes); + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + count += be16_to_cpu(leaf->hdr.count); + bytes -= be16_to_cpu(leaf->hdr.usedbytes); + bytes -= count * sizeof(xfs_attr_leaf_entry_t); + bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_da_brelse(state->args->trans, bp); + if (bytes >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. 
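The sibling scan in xfs_attr_leaf_toosmall() above reduces to one budget test: merge two leaves only if their combined entry table and name/value bytes fit in 75% of a single block, i.e. with at least 25% left to spare. A hedged stand-alone restatement of that test (function and parameter names are illustrative):

static int leaves_can_merge(int blocksize, int hdr_bytes, int entry_bytes,
			    int total_entries, int used1, int used2)
{
	int budget = blocksize - (blocksize >> 2);	/* 75% of one block */

	budget -= hdr_bytes;			/* one leaf header remains */
	budget -= total_entries * entry_bytes;	/* combined entry table */
	budget -= used1 + used2;		/* combined name/value bytes */
	return budget >= 0;
}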
+ */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 1; + } + return(0); +} + +/* + * Remove a name from the leaf attribute list structure. + * + * Return 1 if leaf is less than 37% full, 0 if >= 37% full. + * If two leaves are 37% full, when combined they will leave 25% free. + */ +int +xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_map_t *map; + xfs_attr_leaf_entry_t *entry; + int before, after, smallest, entsize; + int tablesize, tmp, i; + xfs_mount_t *mp; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + hdr = &leaf->hdr; + mp = args->trans->t_mountp; + ASSERT((be16_to_cpu(hdr->count) > 0) + && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); + ASSERT((args->index >= 0) + && (args->index < be16_to_cpu(hdr->count))); + ASSERT(be16_to_cpu(hdr->firstused) >= + ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); + entry = &leaf->entries[args->index]; + ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); + ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + + /* + * Scan through free region table: + * check for adjacency of free'd entry with an existing one, + * find smallest free region in case we need to replace it, + * adjust any map that borders the entry table, + */ + tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[0]; + tmp = be16_to_cpu(map->size); + before = after = -1; + smallest = XFS_ATTR_LEAF_MAPSIZE - 1; + entsize = xfs_attr_leaf_entsize(leaf, args->index); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); + ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); + if (be16_to_cpu(map->base) == tablesize) { + be16_add_cpu(&map->base, + -((int)sizeof(xfs_attr_leaf_entry_t))); + be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); + } + + if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) + == be16_to_cpu(entry->nameidx)) { + before = i; + } else if (be16_to_cpu(map->base) + == (be16_to_cpu(entry->nameidx) + entsize)) { + after = i; + } else if (be16_to_cpu(map->size) < tmp) { + tmp = be16_to_cpu(map->size); + smallest = i; + } + } + + /* + * Coalesce adjacent freemap regions, + * or replace the smallest region. + */ + if ((before >= 0) || (after >= 0)) { + if ((before >= 0) && (after >= 0)) { + map = &hdr->freemap[before]; + be16_add_cpu(&map->size, entsize); + be16_add_cpu(&map->size, + be16_to_cpu(hdr->freemap[after].size)); + hdr->freemap[after].base = 0; + hdr->freemap[after].size = 0; + } else if (before >= 0) { + map = &hdr->freemap[before]; + be16_add_cpu(&map->size, entsize); + } else { + map = &hdr->freemap[after]; + /* both on-disk, don't endian flip twice */ + map->base = entry->nameidx; + be16_add_cpu(&map->size, entsize); + } + } else { + /* + * Replace smallest region (if it is smaller than free'd entry) + */ + map = &hdr->freemap[smallest]; + if (be16_to_cpu(map->size) < entsize) { + map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); + map->size = cpu_to_be16(entsize); + } + } + + /* + * Did we remove the first entry? 
+ */ + if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + smallest = 1; + else + smallest = 0; + + /* + * Compress the remaining entries and zero out the removed stuff. + */ + memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); + be16_add_cpu(&hdr->usedbytes, -entsize); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + entsize)); + + tmp = (be16_to_cpu(hdr->count) - args->index) + * sizeof(xfs_attr_leaf_entry_t); + memmove((char *)entry, (char *)(entry+1), tmp); + be16_add_cpu(&hdr->count, -1); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + entry = &leaf->entries[be16_to_cpu(hdr->count)]; + memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + + /* + * If we removed the first entry, re-find the first used byte + * in the name area. Note that if the entry was the "firstused", + * then we don't have a "hole" in our block resulting from + * removing the name. + */ + if (smallest) { + tmp = XFS_LBSIZE(mp); + entry = &leaf->entries[0]; + for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= + be16_to_cpu(hdr->firstused)); + ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + + if (be16_to_cpu(entry->nameidx) < tmp) + tmp = be16_to_cpu(entry->nameidx); + } + hdr->firstused = cpu_to_be16(tmp); + if (!hdr->firstused) { + hdr->firstused = cpu_to_be16( + tmp - XFS_ATTR_LEAF_NAME_ALIGN); + } + } else { + hdr->holes = 1; /* mark as needing compaction */ + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + + /* + * Check if leaf is less than 50% full, caller may want to + * "join" the leaf with a sibling if so. + */ + tmp = sizeof(xfs_attr_leaf_hdr_t); + tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); + tmp += be16_to_cpu(leaf->hdr.usedbytes); + return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ +} + +/* + * Move all the attribute list entries from drop_leaf into save_leaf. + */ +void +xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; + xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; + xfs_mount_t *mp; + char *tmpbuffer; + + /* + * Set up environment. + */ + mp = state->mp; + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + drop_hdr = &drop_leaf->hdr; + save_hdr = &save_leaf->hdr; + + /* + * Save last hashval from dying block for later Btree fixup. + */ + drop_blk->hashval = be32_to_cpu( + drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + + /* + * Check if we need a temp buffer, or can we do it in place. + * Note that we don't check "leaf" for holes because we will + * always be dropping it, toosmall() decided that for us already. + */ + if (save_hdr->holes == 0) { + /* + * dest leaf has no holes, so we add there. May need + * to make some room in the entry array. 
+ */ + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, + be16_to_cpu(drop_hdr->count), mp); + } else { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, + be16_to_cpu(save_hdr->count), + be16_to_cpu(drop_hdr->count), mp); + } + } else { + /* + * Destination has holes, so we make a temporary copy + * of the leaf and add them both to that. + */ + tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memset(tmpbuffer, 0, state->blocksize); + tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; + tmp_hdr = &tmp_leaf->hdr; + tmp_hdr->info = save_hdr->info; /* struct copy */ + tmp_hdr->count = 0; + tmp_hdr->firstused = cpu_to_be16(state->blocksize); + if (!tmp_hdr->firstused) { + tmp_hdr->firstused = cpu_to_be16( + state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); + } + tmp_hdr->usedbytes = 0; + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, + be16_to_cpu(drop_hdr->count), mp); + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, + be16_to_cpu(tmp_leaf->hdr.count), + be16_to_cpu(save_hdr->count), mp); + } else { + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, + be16_to_cpu(save_hdr->count), mp); + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, + be16_to_cpu(tmp_leaf->hdr.count), + be16_to_cpu(drop_hdr->count), mp); + } + memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); + kmem_free(tmpbuffer); + } + + xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + state->blocksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + save_blk->hashval = be32_to_cpu( + save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Look up a name in a leaf attribute list structure. + * This is the internal routine, it uses the caller's buffer. + * + * Note that duplicate keys are allowed, but only check within the + * current leaf node. The Btree code must check in adjacent leaf nodes. + * + * Return in args->index the index into the entry[] array of either + * the found entry, or where the entry should have been (insert before + * that entry). + * + * Don't change the args->value unless we find the attribute. + */ +int +xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int probe, span; + xfs_dahash_t hashval; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf->hdr.count) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + + /* + * Binary search. (note: small blocks will skip this loop) + */ + hashval = args->hashval; + probe = span = be16_to_cpu(leaf->hdr.count) / 2; + for (entry = &leaf->entries[probe]; span > 4; + entry = &leaf->entries[probe]) { + span /= 2; + if (be32_to_cpu(entry->hashval) < hashval) + probe += span; + else if (be32_to_cpu(entry->hashval) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && + (!leaf->hdr.count + || (probe < be16_to_cpu(leaf->hdr.count)))); + ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first matching + * hashval in the leaf. 
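The lookup here narrows in with a coarse binary search, and the loops that follow then step to the first entry carrying the wanted hash, since duplicate hashvals are allowed within a leaf. A minimal sketch of that "find the first duplicate" pattern over a plain sorted array (an assumed helper, not the kernel routine):

/* return the index of the first element equal to want, or -1 if absent */
static int find_first_hash(const unsigned int *hash, int count,
			   unsigned int want)
{
	int lo = 0, hi = count - 1, first = -1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (hash[mid] < want)
			lo = mid + 1;
		else if (hash[mid] > want)
			hi = mid - 1;
		else {
			first = mid;	/* remember it, keep searching left */
			hi = mid - 1;
		}
	}
	return first;
}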
+ */ + while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + entry--; + probe--; + } + while ((probe < be16_to_cpu(leaf->hdr.count)) && + (be32_to_cpu(entry->hashval) < hashval)) { + entry++; + probe++; + } + if ((probe == be16_to_cpu(leaf->hdr.count)) || + (be32_to_cpu(entry->hashval) != hashval)) { + args->index = probe; + return(XFS_ERROR(ENOATTR)); + } + + /* + * Duplicate keys may be present, so search all of them for a match. + */ + for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && + (be32_to_cpu(entry->hashval) == hashval); + entry++, probe++) { +/* + * GROT: Add code to remove incomplete entries. + */ + /* + * If we are looking for INCOMPLETE entries, show only those. + * If we are looking for complete entries, show only those. + */ + if ((args->flags & XFS_ATTR_INCOMPLETE) != + (entry->flags & XFS_ATTR_INCOMPLETE)) { + continue; + } + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, probe); + if (name_loc->namelen != args->namelen) + continue; + if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, entry->flags)) + continue; + args->index = probe; + return(XFS_ERROR(EEXIST)); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, probe); + if (name_rmt->namelen != args->namelen) + continue; + if (memcmp(args->name, (char *)name_rmt->name, + args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, entry->flags)) + continue; + args->index = probe; + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + return(XFS_ERROR(EEXIST)); + } + } + args->index = probe; + return(XFS_ERROR(ENOATTR)); +} + +/* + * Get the value associated with an attribute name from a leaf attribute + * list structure. + */ +int +xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + int valuelen; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf->hdr.count) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + + entry = &leaf->entries[args->index]; + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, args->index); + ASSERT(name_loc->namelen == args->namelen); + ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); + valuelen = be16_to_cpu(name_loc->valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + } + return(0); +} + +/*======================================================================== + * Utility routines. 
+ *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. + */ +/*ARGSUSED*/ +STATIC void +xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, + xfs_attr_leafblock_t *leaf_d, int start_d, + int count, xfs_mount_t *mp) +{ + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + xfs_attr_leaf_entry_t *entry_s, *entry_d; + int desti, tmp, i; + + /* + * Check for nothing to do. + */ + if (count == 0) + return; + + /* + * Set up environment. + */ + ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + ASSERT((be16_to_cpu(hdr_s->count) > 0) && + (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); + ASSERT(be16_to_cpu(hdr_s->firstused) >= + ((be16_to_cpu(hdr_s->count) + * sizeof(*entry_s))+sizeof(*hdr_s))); + ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); + ASSERT(be16_to_cpu(hdr_d->firstused) >= + ((be16_to_cpu(hdr_d->count) + * sizeof(*entry_d))+sizeof(*hdr_d))); + + ASSERT(start_s < be16_to_cpu(hdr_s->count)); + ASSERT(start_d <= be16_to_cpu(hdr_d->count)); + ASSERT(count <= be16_to_cpu(hdr_s->count)); + + /* + * Move the entries in the destination leaf up to make a hole? + */ + if (start_d < be16_to_cpu(hdr_d->count)) { + tmp = be16_to_cpu(hdr_d->count) - start_d; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_d->entries[start_d]; + entry_d = &leaf_d->entries[start_d + count]; + memmove((char *)entry_d, (char *)entry_s, tmp); + } + + /* + * Copy all entry's in the same (sorted) order, + * but allocate attribute info packed and in sequence. + */ + entry_s = &leaf_s->entries[start_s]; + entry_d = &leaf_d->entries[start_d]; + desti = start_d; + for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { + ASSERT(be16_to_cpu(entry_s->nameidx) + >= be16_to_cpu(hdr_s->firstused)); + tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); +#ifdef GROT + /* + * Code to drop INCOMPLETE entries. Difficult to use as we + * may also need to change the insertion index. Code turned + * off for 6.2, should be revisited later. + */ + if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ + memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); + be16_add_cpu(&hdr_s->usedbytes, -tmp); + be16_add_cpu(&hdr_s->count, -1); + entry_d--; /* to compensate for ++ in loop hdr */ + desti--; + if ((start_s + i) < offset) + result++; /* insertion index adjustment */ + } else { +#endif /* GROT */ + be16_add_cpu(&hdr_d->firstused, -tmp); + /* both on-disk, don't endian flip twice */ + entry_d->hashval = entry_s->hashval; + /* both on-disk, don't endian flip twice */ + entry_d->nameidx = hdr_d->firstused; + entry_d->flags = entry_s->flags; + ASSERT(be16_to_cpu(entry_d->nameidx) + tmp + <= XFS_LBSIZE(mp)); + memmove(xfs_attr_leaf_name(leaf_d, desti), + xfs_attr_leaf_name(leaf_s, start_s + i), tmp); + ASSERT(be16_to_cpu(entry_s->nameidx) + tmp + <= XFS_LBSIZE(mp)); + memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); + be16_add_cpu(&hdr_s->usedbytes, -tmp); + be16_add_cpu(&hdr_d->usedbytes, tmp); + be16_add_cpu(&hdr_s->count, -1); + be16_add_cpu(&hdr_d->count, 1); + tmp = be16_to_cpu(hdr_d->count) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); +#ifdef GROT + } +#endif /* GROT */ + } + + /* + * Zero out the entries we just copied. 
+ */ + if (start_s == be16_to_cpu(hdr_s->count)) { + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[start_s]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + XFS_LBSIZE(mp))); + memset((char *)entry_s, 0, tmp); + } else { + /* + * Move the remaining entries down to fill the hole, + * then zero the entries at the top. + */ + tmp = be16_to_cpu(hdr_s->count) - count; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[start_s + count]; + entry_d = &leaf_s->entries[start_s]; + memmove((char *)entry_d, (char *)entry_s, tmp); + + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + XFS_LBSIZE(mp))); + memset((char *)entry_s, 0, tmp); + } + + /* + * Fill in the freemap information + */ + hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); + be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * + sizeof(xfs_attr_leaf_entry_t)); + hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) + - be16_to_cpu(hdr_d->freemap[0].base)); + hdr_d->freemap[1].base = 0; + hdr_d->freemap[2].base = 0; + hdr_d->freemap[1].size = 0; + hdr_d->freemap[2].size = 0; + hdr_s->holes = 1; /* leaf may not be compact */ +} + +/* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +int +xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC) && + (be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC)); + if ((be16_to_cpu(leaf1->hdr.count) > 0) && + (be16_to_cpu(leaf2->hdr.count) > 0) && + ((be32_to_cpu(leaf2->entries[0].hashval) < + be32_to_cpu(leaf1->entries[0].hashval)) || + (be32_to_cpu(leaf2->entries[ + be16_to_cpu(leaf2->hdr.count)-1].hashval) < + be32_to_cpu(leaf1->entries[ + be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from a leaf block. + */ +xfs_dahash_t +xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_attr_leafblock_t *leaf; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + if (count) + *count = be16_to_cpu(leaf->hdr.count); + if (!leaf->hdr.count) + return(0); + return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); +} + +/* + * Calculate the number of bytes used to store the indicated attribute + * (whether local or remote only calculate bytes in this block). + */ +STATIC int +xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) +{ + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int size; + + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, index); + size = xfs_attr_leaf_entsize_local(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, index); + size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); + } + return(size); +} + +/* + * Calculate the number of bytes that would be required to store the new + * attribute (whether local or remote only calculate bytes in this block). + * This routine decides as a side effect whether the attribute will be + * a "local" or a "remote" attribute. 
+ */ +int +xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) +{ + int size; + + size = xfs_attr_leaf_entsize_local(namelen, valuelen); + if (size < xfs_attr_leaf_entsize_local_max(blocksize)) { + if (local) { + *local = 1; + } + } else { + size = xfs_attr_leaf_entsize_remote(namelen); + if (local) { + *local = 0; + } + } + return(size); +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. + */ +int +xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + int retval, i; + + ASSERT(bp != NULL); + leaf = bp->data; + cursor = context->cursor; + cursor->initted = 1; + + trace_xfs_attr_list_leaf(context); + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (be32_to_cpu(entry->hashval) == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (be32_to_cpu(entry->hashval) > + cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == be16_to_cpu(leaf->hdr.count)) { + trace_xfs_attr_list_notfound(context); + return(0); + } + } else { + entry = &leaf->entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { + if (be32_to_cpu(entry->hashval) != cursor->hashval) { + cursor->hashval = be32_to_cpu(entry->hashval); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + + if (entry->flags & XFS_ATTR_LOCAL) { + xfs_attr_leaf_name_local_t *name_loc = + xfs_attr_leaf_name_local(leaf, i); + + retval = context->put_listent(context, + entry->flags, + name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + &name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; + } else { + xfs_attr_leaf_name_remote_t *name_rmt = + xfs_attr_leaf_name_remote(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); + kmem_free(args.value); + } else { + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; + } + if (context->seen_enough) + break; + cursor->offset++; + } + trace_xfs_attr_list_leaf_end(context); + return(retval); +} + + +/*======================================================================== + * Manage the INCOMPLETE flag in a leaf entry + *========================================================================*/ + +/* + * Clear the INCOMPLETE flag on an entry in a leaf block. 
+ */ +int +xfs_attr_leaf_clearflag(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp; + int error; +#ifdef DEBUG + xfs_attr_leaf_name_local_t *name_loc; + int namelen; + char *name; +#endif /* DEBUG */ + + /* + * Set up the operation. + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp != NULL); + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + ASSERT(args->index >= 0); + entry = &leaf->entries[ args->index ]; + ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); + +#ifdef DEBUG + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, args->index); + namelen = name_loc->namelen; + name = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + namelen = name_rmt->namelen; + name = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry->hashval) == args->hashval); + ASSERT(namelen == args->namelen); + ASSERT(memcmp(name, args->name, namelen) == 0); +#endif /* DEBUG */ + + entry->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + + if (args->rmtblkno) { + ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->valuelen); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp); + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * Set the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr_leaf_setflag(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp; + int error; + + /* + * Set up the operation. + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp != NULL); + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + ASSERT(args->index >= 0); + entry = &leaf->entries[ args->index ]; + + ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); + entry->flags |= XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + if ((entry->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp); + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * In a single transaction, clear the INCOMPLETE flag on the leaf entry + * given by args->blkno/index and set the INCOMPLETE flag on the leaf + * entry given by args->blkno2/index2. + * + * Note that they could be in different blocks, or in the same block. 
+ */ +int +xfs_attr_leaf_flipflags(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_entry_t *entry1, *entry2; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp1, *bp2; + int error; +#ifdef DEBUG + xfs_attr_leaf_name_local_t *name_loc; + int namelen1, namelen2; + char *name1, *name2; +#endif /* DEBUG */ + + /* + * Read the block containing the "old" attr + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp1 != NULL); + + /* + * Read the block containing the "new" attr, if it is different + */ + if (args->blkno2 != args->blkno) { + error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, + -1, &bp2, XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp2 != NULL); + } else { + bp2 = bp1; + } + + leaf1 = bp1->data; + ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); + ASSERT(args->index >= 0); + entry1 = &leaf1->entries[ args->index ]; + + leaf2 = bp2->data; + ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); + ASSERT(args->index2 >= 0); + entry2 = &leaf2->entries[ args->index2 ]; + +#ifdef DEBUG + if (entry1->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf1, args->index); + namelen1 = name_loc->namelen; + name1 = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + namelen1 = name_rmt->namelen; + name1 = (char *)name_rmt->name; + } + if (entry2->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf2, args->index2); + namelen2 = name_loc->namelen; + name2 = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + namelen2 = name_rmt->namelen; + name2 = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval)); + ASSERT(namelen1 == namelen2); + ASSERT(memcmp(name1, name2, namelen1) == 0); +#endif /* DEBUG */ + + ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE); + ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); + + entry1->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); + if (args->rmtblkno) { + ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->valuelen); + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); + } + + entry2->flags |= XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); + if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_da_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp1); + if (bp1 != bp2) + xfs_da_buf_done(bp2); + + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_trans_roll(&args->trans, args->dp); + + return(error); +} + +/*======================================================================== + * Indiscriminately delete the entire attribute fork + *========================================================================*/ + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. 
+ * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) +{ + xfs_da_blkinfo_t *info; + xfs_daddr_t blkno; + xfs_dabuf_t *bp; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + blkno = xfs_da_blkno(bp); + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->data; + if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { + error = xfs_attr_node_inactive(trans, dp, bp, 1); + } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { + error = xfs_attr_leaf_inactive(trans, dp, bp); + } else { + error = XFS_ERROR(EIO); + xfs_da_brelse(*trans, bp); + } + if (error) + return(error); + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return(error); + xfs_da_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_trans_roll(trans, dp); + + return (error); +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, count, i; + xfs_dabuf_t *child_bp; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_da_brelse(*trans, bp); /* no locks for later trans */ + return(XFS_ERROR(EIO)); + } + + node = bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ + count = be16_to_cpu(node->hdr.count); + if (!count) { + xfs_da_brelse(*trans, bp); + return(0); + } + child_fsb = be32_to_cpu(node->btree[0].before); + xfs_da_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (child_bp) { + /* save for re-read later */ + child_blkno = xfs_da_blkno(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->data; + if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { + error = xfs_attr_node_inactive(trans, dp, + child_bp, level+1); + } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { + error = xfs_attr_leaf_inactive(trans, dp, + child_bp); + } else { + error = XFS_ERROR(EIO); + xfs_da_brelse(*trans, child_bp); + } + if (error) + return(error); + + /* + * Remove the subsidiary block from the cache + * and from the log. 
+ */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return(error); + xfs_da_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if ((i+1) < count) { + error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + child_fsb = be32_to_cpu(node->btree[i+1].before); + xfs_da_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + return(0); +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_attr_inactive_list_t *list, *lp; + int error, count, size, tmp, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_da_brelse(*trans, bp); + return(0); + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free((xfs_caddr_t)list); + return(error); +} + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dablk_t blkno, int blkcnt) +{ + xfs_bmbt_irec_t map; + xfs_dablk_t tblkno; + int tblkcnt, dblkcnt, nmap, error; + xfs_daddr_t dblkno; + xfs_buf_t *bp; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. 
+ */ + nmap = 1; + error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, &map, &nmap, NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, XBF_LOCK); + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return(0); +} diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h new file mode 100644 index 00000000..9c7d22fd --- /dev/null +++ b/fs/xfs/xfs_attr_leaf.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_LEAF_H__ +#define __XFS_ATTR_LEAF_H__ + +/* + * Attribute storage layout, internal structure, access macros, etc. + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + */ + +struct attrlist; +struct attrlist_cursor_kern; +struct xfs_attr_list_context; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/*======================================================================== + * Attribute structure when equal to XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This is the structure of the leaf nodes in the Btree. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. 
If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store an "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __be16 base; /* base of free region */ + __be16 size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active leaf_entry's */ + __be16 usedbytes; /* num bytes of names/values stored */ + __be16 firstused; /* first used byte in name area */ + __u8 holes; /* != 0 if blk needs compaction */ + __u8 pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + __be32 hashval; /* hash value of name */ + __be16 nameidx; /* index into buffer of name/value */ + __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __u8 pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __be16 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + __be32 valueblk; /* block number of value bytes */ + __be32 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + +typedef struct xfs_attr_leafblock { + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. + */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. 
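+ *
+ * For example (illustrative only), a request carrying ATTR_SECURE maps
+ * to the on-disk XFS_ATTR_SECURE bit, and the reverse conversion
+ * recovers the argument flag:
+ *
+ *	XFS_ATTR_NSP_ARGS_TO_ONDISK(ATTR_SECURE) == XFS_ATTR_SECURE
+ *	XFS_ATTR_NSP_ONDISK_TO_ARGS(XFS_ATTR_ROOT) == ATTR_ROOT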
+ */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +static inline xfs_attr_leaf_name_remote_t * +xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_remote_t *) + &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; +} + +static inline xfs_attr_leaf_name_local_t * +xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_local_t *) + &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; +} + +static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +{ + return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; +} + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +static inline int xfs_attr_leaf_entsize_remote(int nlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local_max(int bsize) +{ + return (((bsize) >> 1) + ((bsize) >> 2)); +} + +/* + * Used to keep a list of "remote value" extents when unlinking an inode. + */ +typedef struct xfs_attr_inactive_list { + xfs_dablk_t valueblk; /* block number of value bytes */ + int valuelen; /* number of bytes in value */ +} xfs_attr_inactive_list_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute fork size < XFS_LITINO(mp). + */ +void xfs_attr_shortform_create(struct xfs_da_args *args); +void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); +int xfs_attr_shortform_lookup(struct xfs_da_args *args); +int xfs_attr_shortform_getvalue(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_shortform_list(struct xfs_attr_list_context *context); +int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); + + +/* + * Internal routines when attribute fork size == XFS_LBSIZE(mp). 
+ */ +int xfs_attr_leaf_to_node(struct xfs_da_args *args); +int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, + struct xfs_da_args *args, int forkoff); +int xfs_attr_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr_leaf_setflag(struct xfs_da_args *args); +int xfs_attr_leaf_flipflags(xfs_da_args_t *args); + +/* + * Routines used for growing the Btree. + */ +int xfs_attr_leaf_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, + struct xfs_da_args *args); +int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); +int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, + struct xfs_attr_list_context *context); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr_leaf_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); + +/* + * Utility routines. + */ +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, + int *local); +#endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h new file mode 100644 index 00000000..919756e3 --- /dev/null +++ b/fs/xfs/xfs_attr_sf.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_SF_H__ +#define __XFS_ATTR_SF_H__ + +/* + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as + * to fit into the literal area of the inode. + */ + +/* + * Entries are packed toward the top as tight as possible. + */ +typedef struct xfs_attr_shortform { + struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + } hdr; + struct xfs_attr_sf_entry { + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t valuelen; /* actual length of value (no NULL) */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + __uint8_t nameval[1]; /* name & value bytes concatenated */ + } list[1]; /* variable sized array */ +} xfs_attr_shortform_t; +typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; +typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; + +/* + * We generate this then sort it, attr_list() must return things in hash-order. 
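+ *
+ * Illustrative only, with made-up hash values: if "user.a" hashed to
+ * 0x2000 and "user.b" to 0x1000, attr_list() must return "user.b"
+ * first even if "user.a" was created first; the sort on the hash field
+ * of these records is what provides that ordering.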
+ */ +typedef struct xfs_attr_sf_sort { + __uint8_t entno; /* entry number in original list */ + __uint8_t namelen; /* length of name value (no null) */ + __uint8_t valuelen; /* length of value */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + xfs_dahash_t hash; /* this entry's hash value */ + unsigned char *name; /* name value, pointer into buffer */ +} xfs_attr_sf_sort_t; + +#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ + (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) +#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ + ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1) +#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ + ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) +#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ + ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) +#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ + (be16_to_cpu(((xfs_attr_shortform_t *) \ + ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +#endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c new file mode 100644 index 00000000..48228848 --- /dev/null +++ b/fs/xfs/xfs_bit.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + +/* + * XFS bit manipulation routines, used in non-realtime code. + */ + +/* + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. + */ +int +xfs_bitmap_empty(uint *map, uint size) +{ + uint i; + uint ret = 0; + + for (i = 0; i < size; i++) { + ret |= map[i]; + } + + return (ret == 0); +} + +/* + * Count the number of contiguous bits set in the bitmap starting with bit + * start_bit. Size is the size of the bitmap in words. + */ +int +xfs_contig_bits(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = 0; + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + ASSERT(start_bit < size); + size -= start_bit & ~(NBWORD - 1); + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to one first offset bits prior to start */ + tmp |= (~0U >> (NBWORD-start_bit)); + if (tmp != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return result - start_bit; +found: + return result + ffz(tmp) - start_bit; +} + +/* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. 
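+ *
+ * For example (illustrative only), with a single-word map whose value
+ * is 0x9 (bits 0 and 3 set):
+ *
+ *	xfs_contig_bits(map, 1, 0) returns 1   (only bit 0 is contiguous)
+ *	xfs_next_bit(map, 1, 1)    returns 3
+ *	xfs_next_bit(map, 1, 4)    returns -1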
+ * + * Size is the number of words, not bytes, in the bitmap. + */ +int xfs_next_bit(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = start_bit & ~(NBWORD - 1); + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + if (start_bit >= size) + return -1; + size -= result; + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to zero first offset bits prior to start */ + tmp &= (~0U << start_bit); + if (tmp != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return -1; +found: + return result + ffs(tmp) - 1; +} diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h new file mode 100644 index 00000000..f1e3c907 --- /dev/null +++ b/fs/xfs/xfs_bit.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BIT_H__ +#define __XFS_BIT_H__ + +/* + * XFS bit manipulation routines. + */ + +/* + * masks with n high/low bits set, 64-bit values + */ +static inline __uint64_t xfs_mask64hi(int n) +{ + return (__uint64_t)-1 << (64 - (n)); +} +static inline __uint32_t xfs_mask32lo(int n) +{ + return ((__uint32_t)1 << (n)) - 1; +} +static inline __uint64_t xfs_mask64lo(int n) +{ + return ((__uint64_t)1 << (n)) - 1; +} + +/* Get high bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_highbit32(__uint32_t v) +{ + return fls(v) - 1; +} + +/* Get high bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_highbit64(__uint64_t v) +{ + return fls64(v) - 1; +} + +/* Get low bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_lowbit32(__uint32_t v) +{ + return ffs(v) - 1; +} + +/* Get low bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_lowbit64(__uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; + + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w && (n = ffs(w))) + n += 32; + } + return n - 1; +} + +/* Return whether bitmap is empty (1 == empty) */ +extern int xfs_bitmap_empty(uint *map, uint size); + +/* Count continuous one bits in map starting with start_bit */ +extern int xfs_contig_bits(uint *map, uint size, uint start_bit); + +/* Find next set bit in map */ +extern int xfs_next_bit(uint *map, uint size, uint start_bit); + +#endif /* __XFS_BIT_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c new file mode 100644 index 00000000..a175933a --- /dev/null +++ b/fs/xfs/xfs_bmap.c @@ -0,0 +1,6139 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_mount.h" +#include "xfs_itable.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_inode_item.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr_leaf.h" +#include "xfs_rw.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_buf_item.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + + +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); +#endif + +kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * Prototypes for internal bmap routines. + */ + + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp); /* inode logging flags */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp); /* inode logging flags */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp); /* inode logging flags */ + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int /* error */ +xfs_bmap_alloc( + xfs_bmalloca_t *ap); /* bmap alloc argument struct */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free); /* list item to be freed */ + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Convert a local file to an extents file. + * This code is sort of bogus, since the file data needs to get + * logged so it won't be lost. The bmap-level manipulations are ok, though. + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. 
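+ *
+ * For example (illustrative only), if the fork maps file blocks
+ * [0, 10) and [20, 25): a search for bno 12 falls in the hole, so
+ * *gotp describes the extent starting at 20 and *eofp is clear; a
+ * search for bno 40 is past EOF, so *eofp is set and *prevp holds
+ * the [20, 25) entry.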
+ */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int whichfork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + */ +STATIC int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + char *aeof); /* return value */ + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len); /* delayed extent length */ + +#ifdef DEBUG +/* + * Perform various validation checks on the values being returned + * from xfs_bmapi(). + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap); +#else +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +STATIC int +xfs_bmap_count_tree( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ifork_t *ifp, + xfs_fsblock_t blockno, + int levelin, + int *count); + +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count); + +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count); + +/* + * Bmap internal routines. + */ + +STATIC int /* error */ +xfs_bmbt_lookup_eq( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +STATIC int /* error */ +xfs_bmbt_lookup_ge( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* +* Update the record referred to by cur to the value given + * by [off, bno, len, state]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_bmbt_update( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + xfs_exntst_t state) +{ + union xfs_btree_rec rec; + + xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + return xfs_btree_update(cur, &rec); +} + +/* + * Called from xfs_bmap_add_attrfork to handle btree format files. 
+ */ +STATIC int /* error */ +xfs_bmap_add_attrfork_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int error; /* error return value */ + xfs_mount_t *mp; /* file system mount struct */ + int stat; /* newroot status */ + + mp = ip->i_mount; + if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + *flags |= XFS_ILOG_DBROOT; + else { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.flist = flist; + cur->bc_private.b.firstblock = *firstblock; + if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + goto error0; + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + if ((error = xfs_btree_new_iroot(cur, flags, &stat))) + goto error0; + if (stat == 0) { + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return XFS_ERROR(ENOSPC); + } + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + } + return 0; +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + int error; /* error return value */ + + if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + return 0; + cur = NULL; + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + flags, XFS_DATA_FORK); + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_da_args_t dargs; /* args for dir/attr code */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount structure pointer */ + + if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + return 0; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + mp = ip->i_mount; + memset(&dargs, 0, sizeof(dargs)); + dargs.dp = ip; + dargs.firstblock = firstblock; + dargs.flist = flist; + dargs.total = mp->m_dirblkfsbs; + dargs.whichfork = XFS_DATA_FORK; + dargs.trans = tp; + error = xfs_dir2_sf_to_block(&dargs); + } else + error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK); + return error; +} + +/* + * Called by xfs_bmapi to update file extent records and the btree + * after allocating space (or doing a delayed allocation). 
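+ *
+ * Roughly (illustrative summary of the cases handled below): the first
+ * extent added to an empty fork is simply inserted; a new delayed
+ * allocation goes through xfs_bmap_add_extent_hole_delay(); a real
+ * allocation past the last extent goes through
+ * xfs_bmap_add_extent_hole_real(); and a real allocation overlapping an
+ * existing delayed or unwritten record is handled by
+ * xfs_bmap_add_extent_delay_real() or
+ * xfs_bmap_add_extent_unwritten_real() respectively.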
+ */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_btree_cur_t *cur; /* btree cursor or null */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork ptr */ + int logflags; /* returned value */ + xfs_extnum_t nextents; /* number of extents in file now */ + + XFS_STATS_INC(xs_add_exlist); + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, whichfork); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + da_old = da_new = 0; + error = 0; + + ASSERT(*idx >= 0); + ASSERT(*idx <= nextents); + + /* + * This is the first extent added to a new/empty file. + * Special case this one, so other routines get to assume there are + * already extents in the list. + */ + if (nextents == 0) { + xfs_iext_insert(ip, *idx, 1, new, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + + ASSERT(cur == NULL); + + if (!isnullstartblock(new->br_startblock)) { + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else + logflags = 0; + } + /* + * Any kind of new delayed allocation goes here. + */ + else if (isnullstartblock(new->br_startblock)) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + error = xfs_bmap_add_extent_hole_delay(ip, idx, new, + &logflags); + } + /* + * Real allocation off the end of the file. + */ + else if (*idx == nextents) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork); + } else { + xfs_bmbt_irec_t prev; /* old extent at offset idx */ + + /* + * Get the record referred to by idx. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); + /* + * If it's a real allocation record, and the new allocation ends + * after the start of the referred to record, then we're filling + * in a delayed or unwritten allocation with a real one, or + * converting real back to unwritten. + */ + if (!isnullstartblock(new->br_startblock) && + new->br_startoff + new->br_blockcount > prev.br_startoff) { + if (prev.br_state != XFS_EXT_UNWRITTEN && + isnullstartblock(prev.br_startblock)) { + da_old = startblockval(prev.br_startblock); + if (cur) + ASSERT(cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL); + error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, + first, flist, &logflags); + } else { + ASSERT(new->br_state == XFS_EXT_NORM || + new->br_state == XFS_EXT_UNWRITTEN); + + error = xfs_bmap_add_extent_unwritten_real(ip, + idx, &cur, new, &logflags); + if (error) + goto done; + } + } + /* + * Otherwise we're filling in a hole with an allocation. + */ + else { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork); + } + } + + if (error) + goto done; + ASSERT(*curp == cur || *curp == NULL); + + /* + * Convert to a btree if necessary. 
+ */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first, + flist, &cur, da_old > 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto done; + } + /* + * Adjust for changes in reserved delayed indirect blocks. + * Nothing to do for disk quotas here. + */ + if (da_old || da_new) { + xfs_filblks_t nblks; + + nblks = da_new; + if (cur) + nblks += cur->bc_private.b.allocated; + ASSERT(nblks <= da_old); + if (nblks < da_old) + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - nblks), 0); + } + /* + * Clear out the allocated field, done with it now in any case. + */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } +done: +#ifdef DEBUG + if (!error) + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); +#endif + *logflagsp = logflags; + return error; +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int diff; /* temp value */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + xfs_filblks_t temp=0; /* value for dnew calculations */ + xfs_filblks_t temp2=0;/* value for dnew calculations */ + int tmp_rval; /* partial logging flags */ + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + + /* + * Set up a bunch of variables to make the tests simpler. + */ + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &PREV); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + /* + * Set flags determining what part of the previous delayed allocation + * extent is being replaced by a real allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. 
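+	 *
+	 * Illustrative only: if the delayed extent (PREV) covers file
+	 * blocks [100, 108) and the new real allocation covers exactly
+	 * that range, both BMAP_LEFT_FILLING and BMAP_RIGHT_FILLING are
+	 * set; if the extent to the left is real, has the same state,
+	 * ends at offset 100, is block-contiguous with new->br_startblock
+	 * and the combined length fits in MAXEXTLEN, BMAP_LEFT_CONTIG is
+	 * set as well and the switch below merges LEFT and PREV into a
+	 * single extent.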
+ */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left and right neighbors are both contiguous with new. + */ + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left neighbor is contiguous, the right is not. 
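+ * PREV is merged into the left neighbor and its in-core record is
+ * removed; only the left extent's btree record needs rewriting.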
+ */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The right neighbor is contiguous, the left is not. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + new->br_startblock, + PREV.br_blockcount + + RIGHT.br_blockcount, PREV.br_state))) + goto done; + } + + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Filling in all of a previously delayed allocation extent. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is contiguous. 
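+ * The left neighbor absorbs the newly allocated blocks; the rest of
+ * PREV stays delayed, so its start offset moves up and its startblock
+ * is reset to a recalculated worst-case indirect reservation.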
+ */ + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + new->br_blockcount, + LEFT.br_state))) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + *dnew = temp; + break; + + case BMAP_LEFT_FILLING: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, new_endoff); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, *idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + + *dnew = temp; + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is contiguous with the new allocation. 
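+ * The new blocks are merged into the right neighbor; the shrunken
+ * delayed extent in front of them keeps a recalculated indirect
+ * reservation.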
+ */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + RIGHT.br_state); + trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + RIGHT.br_blockcount, + RIGHT.br_state))) + goto done; + } + + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock)); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + *dnew = temp; + break; + + case BMAP_RIGHT_FILLING: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is not contiguous. + */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(ip, *idx + 1, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + *dnew = temp; + break; + + case 0: + /* + * Filling in the middle part of a previous delayed allocation. + * Contiguity is impossible here. + * This case is avoided almost all the time. 
+ * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 + */ + temp = new->br_startoff - PREV.br_startoff; + temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; + trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = xfs_bmap_worst_indlen(ip, temp); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + if (diff > 0 && + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0)) { + /* + * Ick gross gag me with a spoon. + */ + ASSERT(0); /* want to see if this ever happens! */ + while (diff > 0) { + if (temp) { + temp--; + diff--; + if (!diff || + !xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0)) + break; + } + if (temp2) { + temp2--; + diff--; + if (!diff || + !xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0)) + break; + } + } + } + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + nullstartblock((int)temp2)); + trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + + ++*idx; + *dnew = temp + temp2; + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. 
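+ * The new extent must lie entirely within the extent at *idx and must
+ * carry the opposite br_state; both the in-core extent list and, when
+ * present, the extent btree are updated to match.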
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_exntst_t newext; /* new extent state */ + xfs_exntst_t oldext; /* old extent state */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + /* + * Set up a bunch of variables to make the tests simpler. + */ + error = 0; + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &PREV); + newext = new->br_state; + oldext = (newext == XFS_EXT_UNWRITTEN) ? + XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + ASSERT(PREV.br_state == oldext); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the FILLING and CONTIG state bits. 
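+ * Only the FILLING and CONTIG bits select a case; the VALID and DELAY
+ * bits were already folded into the contiguity decisions above.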
+ */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); + ip->i_d.di_nextents -= 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount, + LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. 
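+ * PREV absorbs the right neighbor: one in-core record is removed,
+ * di_nextents drops by one and the right record is deleted from the
+ * btree before the merged extent is rewritten.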
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + if (xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + new->br_blockcount, + LEFT.br_state)) + goto done; + } + break; + + case BMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. 
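+ * PREV is trimmed from the front and a new record for the converted
+ * range is inserted ahead of it at *idx.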
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); + xfs_bmbt_set_startoff(ep, new_endoff); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + cur->bc_rec.b = *new; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. 
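+ * PREV is cut back to the part in front of the new range, then the
+ * converted middle and the remaining oldext tail are inserted as two
+ * new records after it.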
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + new->br_startoff - PREV.br_startoff); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + r[0] = *new; + r[1].br_startoff = new_endoff; + r[1].br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_startblock = new->br_startblock + new->br_blockcount; + r[1].br_state = oldext; + + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + + ip->i_d.di_nextents += 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* new right extent - oldext */ + if ((error = xfs_bmbt_update(cur, r[1].br_startoff, + r[1].br_startblock, r[1].br_blockcount, + r[1].br_state))) + goto done; + /* new left extent - oldext */ + cur->bc_rec.b = PREV; + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + /* new middle extent - newext */ + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +/*ARGSUSED*/ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp) /* inode logging flags */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + xfs_filblks_t temp=0; /* temp for indirect calculations */ + + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + state = 0; + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. 
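+ * Only neighbors that are themselves delayed allocations are merged
+ * with the new extent below.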
+ */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, *idx, 1, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(oldlen - newlen), 0); + /* + * Nothing to do for disk quota accounting here. 
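+ * Merging can only shrink the worst-case indirect reservation, so
+ * the surplus blocks are returned to the in-core free space counter
+ * above.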
+ */ + } + *logflagsp = 0; + return 0; +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int rval=0; /* return value (logging flags) */ + int state; /* state bits, accessed thru macros */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + /* + * Check and set flags if this segment has a left neighbor. + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if this segment has a current value. + * Not true if we're inserting into the "hole" at eof. + */ + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * We're inserting a real allocation between "left" and "right". + * Set the contiguity flags. Don't let extents get too large. + */ + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Select which case we're in here, and implement it. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with real allocations on the + * left and on the right. + * Merge all three into a single extent record. 
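+ * The left and right records collapse into one covering all three
+ * ranges, so the fork's extent count drops by one and the right
+ * record is deleted from the btree.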
+ */ + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + left.br_blockcount + new->br_blockcount + + right.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, + right.br_startoff, + right.br_startblock, + right.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount + + right.br_blockcount, + left.br_state))) + goto done; + } + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + left.br_blockcount + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, + left.br_startoff, + left.br_startblock, + left.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount, + left.br_state))) + goto done; + } + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, + right.br_startoff, + right.br_startblock, + right.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + right.br_blockcount, + right.br_state))) + goto done; + } + break; + + case 0: + /* + * New allocation is not contiguous with another + * real allocation. + * Insert a new entry. + */ + xfs_iext_insert(ip, *idx, 1, new, state); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, + new->br_startoff, + new->br_startblock, + new->br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + } +done: + *logflagsp = rval; + return error; +} + +/* + * Adjust the size of the new extent based on di_extsize and rt extsize. 
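+ * On return *offp and *lenp describe a request aligned to extsz that
+ * still covers the caller's range; EINVAL is returned if a realtime
+ * request cannot be aligned without losing that coverage.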
+ */ +STATIC int +xfs_bmap_extsize_align( + xfs_mount_t *mp, + xfs_bmbt_irec_t *gotp, /* next extent pointer */ + xfs_bmbt_irec_t *prevp, /* previous extent pointer */ + xfs_extlen_t extsz, /* align to this extent size */ + int rt, /* is this a realtime inode? */ + int eof, /* is extent at end-of-file? */ + int delay, /* creating delalloc extent? */ + int convert, /* overwriting unwritten extent? */ + xfs_fileoff_t *offp, /* in/out: aligned offset */ + xfs_extlen_t *lenp) /* in/out: aligned length */ +{ + xfs_fileoff_t orig_off; /* original offset */ + xfs_extlen_t orig_alen; /* original length */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_fileoff_t align_off; /* temp for offset */ + xfs_extlen_t align_alen; /* temp for length */ + xfs_extlen_t temp; /* temp for calculations */ + + if (convert) + return 0; + + orig_off = align_off = *offp; + orig_alen = align_alen = *lenp; + orig_end = orig_off + orig_alen; + + /* + * If this request overlaps an existing extent, then don't + * attempt to perform any additional alignment. + */ + if (!delay && !eof && + (orig_off >= gotp->br_startoff) && + (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { + return 0; + } + + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment, or if a truncate shot us in the + * foot. + */ + temp = do_mod(orig_off, extsz); + if (temp) { + align_alen += temp; + align_off -= temp; + } + /* + * Same adjustment for the end of the requested area. + */ + if ((temp = (align_alen % extsz))) { + align_alen += extsz - temp; + } + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + if (prevp->br_startoff != NULLFILEOFF) { + if (prevp->br_startblock == HOLESTARTBLOCK) + prevo = prevp->br_startoff; + else + prevo = prevp->br_startoff + prevp->br_blockcount; + } else + prevo = 0; + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + if (!eof && gotp->br_startoff != NULLFILEOFF) { + if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || + (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) + nexto = gotp->br_startoff + gotp->br_blockcount; + else + nexto = gotp->br_startoff; + } else + nexto = NULLFILEOFF; + if (!eof && + align_off + align_alen != orig_end && + align_off + align_alen > nexto) + align_off = nexto > align_alen ? nexto - align_alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + if (align_off + align_alen != orig_end && + align_off + align_alen > nexto && + nexto != NULLFILEOFF) { + ASSERT(nexto > prevo); + align_alen = nexto - align_off; + } + + /* + * If realtime, and the result isn't a multiple of the realtime + * extent size we need to remove blocks until it is. 
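+ * The trim is tried at the front first, then at the tail, then by
+ * pinning the start back to the original offset; if the result still
+ * fails to cover the request the allocation is rejected.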
+ */ + if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. + */ + if (orig_off < align_off || + orig_end > align_off + align_alen || + align_alen - temp < orig_alen) + return XFS_ERROR(EINVAL); + /* + * Try to fix it by moving the start up. + */ + if (align_off + temp <= orig_off) { + align_alen -= temp; + align_off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (align_off + align_alen - temp >= orig_end) + align_alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + align_alen -= orig_off - align_off; + align_off = orig_off; + align_alen -= align_alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < align_off || orig_end > align_off + align_alen) + return XFS_ERROR(EINVAL); + } else { + ASSERT(orig_off >= align_off); + ASSERT(orig_end <= align_off + align_alen); + } + +#ifdef DEBUG + if (!eof && gotp->br_startoff != NULLFILEOFF) + ASSERT(align_off + align_alen <= gotp->br_startoff); + if (prevp->br_startoff != NULLFILEOFF) + ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); +#endif + + *lenp = align_alen; + *offp = align_off; + return 0; +} + +#define XFS_ALLOC_GAP_UNITS 4 + +STATIC void +xfs_bmap_adjacent( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_fsblock_t adjust; /* adjustment to block numbers */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_mount_t *mp; /* mount point structure */ + int nullfb; /* true if ap->firstblock isn't set */ + int rt; /* true if inode is realtime */ + +#define ISVALID(x,y) \ + (rt ? \ + (x) < mp->m_sb.sb_rblocks : \ + XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) + + mp = ap->ip->i_mount; + nullfb = ap->firstblock == NULLFSBLOCK; + rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + /* + * If allocating at eof, and there's a previous real block, + * try to use its last block as our starting point. + */ + if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prevp->br_startblock) && + ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, + ap->prevp->br_startblock)) { + ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + /* + * Adjust for the gap between prevp and us. + */ + adjust = ap->off - + (ap->prevp->br_startoff + ap->prevp->br_blockcount); + if (adjust && + ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) + ap->rval += adjust; + } + /* + * If not at eof, then compare the two neighbor blocks. + * Figure out whether either one gives us a good starting point, + * and pick the better one. + */ + else if (!ap->eof) { + xfs_fsblock_t gotbno; /* right side block number */ + xfs_fsblock_t gotdiff=0; /* right side difference */ + xfs_fsblock_t prevbno; /* left side block number */ + xfs_fsblock_t prevdiff=0; /* left side difference */ + + /* + * If there's a previous (left) block, select a requested + * start block based on it. + */ + if (ap->prevp->br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prevp->br_startblock) && + (prevbno = ap->prevp->br_startblock + + ap->prevp->br_blockcount) && + ISVALID(prevbno, ap->prevp->br_startblock)) { + /* + * Calculate gap to end of previous block. 
+ */ + adjust = prevdiff = ap->off - + (ap->prevp->br_startoff + + ap->prevp->br_blockcount); + /* + * Figure the startblock based on the previous block's + * end and the gap size. + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the end of the previous block. + */ + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISVALID(prevbno + prevdiff, + ap->prevp->br_startblock)) + prevbno += adjust; + else + prevdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) + prevbno = NULLFSBLOCK; + } + /* + * No previous block or can't follow it, just default. + */ + else + prevbno = NULLFSBLOCK; + /* + * If there's a following (right) block, select a requested + * start block based on it. + */ + if (!isnullstartblock(ap->gotp->br_startblock)) { + /* + * Calculate gap to start of next block. + */ + adjust = gotdiff = ap->gotp->br_startoff - ap->off; + /* + * Figure the startblock based on the next block's + * start and the gap size. + */ + gotbno = ap->gotp->br_startblock; + /* + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the start of the next block + * offset by our length. + */ + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISVALID(gotbno - gotdiff, gotbno)) + gotbno -= adjust; + else if (ISVALID(gotbno - ap->alen, gotbno)) { + gotbno -= ap->alen; + gotdiff += adjust - ap->alen; + } else + gotdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) + gotbno = NULLFSBLOCK; + } + /* + * No next block, just default. + */ + else + gotbno = NULLFSBLOCK; + /* + * If both valid, pick the better one, else the only good + * one, else ap->rval is already set (to 0 or the inode block). + */ + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + else if (prevbno != NULLFSBLOCK) + ap->rval = prevbno; + else if (gotbno != NULLFSBLOCK) + ap->rval = gotbno; + } +#undef ISVALID +} + +STATIC int +xfs_bmap_rtalloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount point structure */ + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtb; + + mp = ap->ip->i_mount; + align = xfs_get_extsz_hint(ap->ip); + prod = align / mp->m_sb.sb_rextsize; + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, 1, ap->eof, 0, + ap->conv, &ap->off, &ap->alen); + if (error) + return error; + ASSERT(ap->alen); + ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->off, align) || ap->alen % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->alen / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. 
+ * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + + /* + * Lock out other modifications to the RT bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->off == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->rval = rtx * mp->m_sb.sb_rextsize; + } else { + ap->rval = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->rval, mp->m_sb.sb_rextsize); + rtb = ap->rval; + ap->alen = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, + ap->alen, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->rval = rtb; + if (ap->rval != NULLFSBLOCK) { + ap->rval *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->alen = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + } else { + ap->alen = 0; + } + return 0; +} + +STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_perag *pag; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args->type = XFS_ALLOCTYPE_NEAR_BNO; + else + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + /* + * Search for an allocation group with a single extent large enough + * for the request. If one isn't found, then adjust the minimum + * allocation size to the largest space found. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + pag = xfs_perag_get(mp, ag); + while (*blen < args->maxlen) { + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, args->tp, ag, + XFS_ALLOC_FLAG_TRYLOCK); + if (error) { + xfs_perag_put(pag); + return error; + } + } + + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + } else + notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (*blen >= args->maxlen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. 
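+ * startag is set to NULLAGNUMBER once xfs_filestream_new_ag() has
+ * been asked for an AG, so a second visit to this branch breaks out
+ * instead of searching again.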
+ */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + xfs_perag_put(pag); + if (error) + return error; + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + pag = xfs_perag_get(mp, ag); + continue; + } + } + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + xfs_perag_put(pag); + pag = xfs_perag_get(mp, ag); + } + xfs_perag_put(pag); + + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || *blen < ap->minlen) + args->minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (*blen < args->maxlen) + args->minlen = *blen; + /* + * Otherwise we've seen an extent as big as maxlen, + * use that as the minimum. + */ + else + args->minlen = args->maxlen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + + return 0; +} + +STATIC int +xfs_bmap_btalloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_mount_t *mp; /* mount point structure */ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_alloc_arg_t args; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + + mp = ap->ip->i_mount; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (unlikely(align)) { + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, 0, ap->eof, 0, ap->conv, + &ap->off, &ap->alen); + ASSERT(!error); + ASSERT(ap->alen); + } + nullfb = ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else + ap->rval = ap->firstblock; + + xfs_bmap_adjacent(ap); + + /* + * If allowed, use ap->rval; otherwise must use firstblock since + * it's in the right allocation group. + */ + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + ; + else + ap->rval = ap->firstblock; + /* + * Normal allocation, done through xfs_alloc_vextent. + */ + tryagain = isaligned = 0; + args.tp = ap->tp; + args.mp = mp; + args.fsbno = ap->rval; + + /* Trim the allocation back to the maximum an AG can fit. 
*/ + args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = ap->firstblock; + blen = 0; + if (nullfb) { + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; + } else if (ap->low) { + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; + args.total = args.minlen = ap->minlen; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.total = ap->total; + args.minlen = ap->minlen; + } + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; + if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { + args.prod = 1; + args.mod = 0; + } else { + args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } + /* + * If we are not low on available data blocks, and the + * underlying logical volume manager is a stripe, and + * the file offset is zero then try to allocate data + * blocks on stripe unit boundary. + * NOTE: ap->aeof is only set if the allocation length + * is >= the stripe unit and the allocation offset is + * at the end of file. + */ + if (!ap->low && ap->aeof) { + if (!ap->off) { + args.alignment = mp->m_dalign; + atype = args.type; + isaligned = 1; + /* + * Adjust for alignment + */ + if (blen > args.alignment && blen <= args.maxlen) + args.minlen = blen - args.alignment; + args.minalignslop = 0; + } else { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.alignment = 1; + /* + * Compute the minlen+alignment for the + * next case. Set slop so that the value + * of minlen+alignment+slop doesn't go up + * between the calls. + */ + if (blen > mp->m_dalign && blen <= args.maxlen) + nextminlen = blen - mp->m_dalign; + else + nextminlen = args.minlen; + if (nextminlen + mp->m_dalign > args.minlen + 1) + args.minalignslop = + nextminlen + mp->m_dalign - + args.minlen - 1; + else + args.minalignslop = 0; + } + } else { + args.alignment = 1; + args.minalignslop = 0; + } + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.isfl = 0; + args.userdata = ap->userdata; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { + /* + * Exact allocation failed. Now try with alignment + * turned on. + */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = mp->m_dalign; + args.minlen = nextminlen; + args.minalignslop = 0; + isaligned = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (isaligned && args.fsbno == NULLFSBLOCK) { + /* + * allocation failed, so turn off alignment and + * try again. 
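+ * The remaining fallbacks below retry with the caller's minimum
+ * length and finally with an XFS_ALLOCTYPE_FIRST_AG scan, setting
+ * ap->low once space is that tight.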
+ */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb && + args.minlen > ap->minlen) { + args.minlen = ap->minlen; + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = ap->rval; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb) { + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = ap->minlen; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + ap->low = 1; + } + if (args.fsbno != NULLFSBLOCK) { + ap->firstblock = ap->rval = args.fsbno; + ASSERT(nullfb || fb_agno == args.agno || + (ap->low && fb_agno < args.agno)); + ap->alen = args.len; + ap->ip->i_d.di_nblocks += args.len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args.len; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT, + (long) args.len); + } else { + ap->rval = NULLFSBLOCK; + ap->alen = 0; + } + return 0; +} + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int +xfs_bmap_alloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) + return xfs_bmap_rtalloc(ap); + return xfs_bmap_btalloc(ap); +} + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. 
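+ * The freed leaf block is queued on the transaction's free list via
+ * xfs_bmap_add_free(), xfs_iroot_realloc(ip, -1, whichfork) gives back
+ * the incore root, and the fork format is then set to
+ * XFS_DINODE_FMT_EXTENTS.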
+ */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, + XFS_BMAP_BTREE_REF))) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). 
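+ * The range to remove is passed in *del; depending on how it lines up
+ * with the existing record, that record is deleted outright, trimmed
+ * at the front or the back, or split into two records (see the
+ * contiguity switch below).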
+ */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + if ((error = xfs_rtfree_extent(ip->i_transp, bno, + (xfs_extlen_t)len))) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. 
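+	 * The cases below therefore decode as:
+	 *
+	 *	3 (left|right): del matches the whole extent - delete the record
+	 *	2 (left only):  del starts where the extent starts - trim the front
+	 *	1 (right only): del ends where the extent ends - trim the back
+	 *	0 (neither):    del is in the middle - split into two records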
+ */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. 
+ */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), 0); + } +done: + *logflagsp = flags; + return error; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. 
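+ * This is the reverse of xfs_bmap_btree_to_extents() above: every
+ * incore extent record is copied into one newly allocated child block,
+ * and the inode fork root becomes a level-1 btree root holding a
+ * single key/pointer that refers to that block.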
+ */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + + /* + * Fill in the root. + */ + block = ifp->if_broot; + block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + block->bb_level = cpu_to_be16(1); + block->bb_numrecs = cpu_to_be16(1); + block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + mp = ip->i_mount; + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.total = args.minleft = args.alignment = args.mod = args.isfl = + args.minalignslop = 0; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. 
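+	 * Only real extents are copied; delayed-allocation records have no
+	 * on-disk block yet (isnullstartblock()), so they are skipped and
+	 * the child's record count is taken from how many records were
+	 * actually copied.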
+ */ + ablock = XFS_BUF_TO_BLOCK(abp); + ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + ablock->bb_level = 0; + ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); + + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} + +/* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp)); + return offset; +} + +/* + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. + */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_mount_t *mp, + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) { + ip->i_d.di_forkoff = dfl_forkoff; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); + } + } +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. 
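+	 * (The conversion below is otherwise generic: if the fork holds
+	 * inline data, one block is allocated, the bytes are copied into it
+	 * and logged, and the fork ends up as a one-extent, extents-format
+	 * file; an empty local fork just has its format changed.)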
+ */ + ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && + whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + flags = 0; + error = 0; + if (ifp->if_bytes) { + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.mod = args.minleft = args.alignment = args.wasdel = + args.isfl = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + if ((error = xfs_alloc_vextent(&args))) + goto done; + /* + * Can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, + ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_bmap_forkoff_reset(args.mp, ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + } else { + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + } + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + flags |= XFS_ILOG_CORE; +done: + *logflagsp = flags; + return error; +} + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. 
+ */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. 
+ */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + + ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) + goto error0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + if (XFS_IFORK_Q(ip)) + goto error1; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto error1; + } + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto error2; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + __int64_t sbfields = 0; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + sbfields |= XFS_SB_VERSIONNUM; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + sbfields |= 
(XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } + if (sbfields) { + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + } else + spin_unlock(&mp->m_sb_lock); + } + if ((error = xfs_bmap_finish(&tp, &flist, &committed))) + goto error2; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; +error2: + xfs_bmap_cancel(&flist); +error1: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; +} + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +/* ARGSUSED */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ + + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. 
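+ *
+ * As a rough illustration (the per-block record limits depend on the
+ * filesystem block size; the figures here are purely hypothetical):
+ * with maxleafents = 2^31 - 1, minleafrecs = minnoderecs = 125 and
+ * maxrootrecs = 9, the loop below needs ceil((2^31 - 1) / 125) =
+ * 17,179,870 leaf blocks, then 137,439 -> 1,100 -> 9 node blocks at
+ * successive levels; those 9 fit under the root, for a maximum of 5
+ * btree levels.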
+ */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + unsigned int logres; /* new log reservation */ + unsigned int logcount; /* new log count */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + logres = ntp->t_log_res; + logcount = ntp->t_log_count; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, + logcount))) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +/* + * Free up any items left in the list. 
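+ * This is the error-path counterpart of xfs_bmap_finish(): see the
+ * error2: unwind in xfs_bmap_add_attrfork() above, where the queued
+ * extents are dropped rather than freed because the transaction is
+ * about to be cancelled.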
+ */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. 
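+ * For example, with a single extent covering file blocks 90..99, a
+ * last_block of 200 comes back as 100 (one past the extent), while a
+ * last_block of 95, which already falls inside the extent, is left
+ * unchanged.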
+ */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_offset( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (!nextents) { + *last_block = 0; + return 0; + } + ep = xfs_iext_get_ext(ifp, nextents - 1); + *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) { + return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? 
+ (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + } +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC || + be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + return 1; +} + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. 
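+	 * Having descended to the leftmost leaf above, follow the
+	 * bb_rightsib chain across the leaf level, issuing read-ahead for
+	 * the next leaf and copying each block's records into the incore
+	 * extent list as we go.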
+ */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, 0), + error0); + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1); + /* + * Copy records into the extent records. + */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); +} + +#ifdef DEBUG +/* + * Add bmap trace insert entries for all the contents of the extent records. + */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the callers original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extent beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. 
+ */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} +#endif /* DEBUG */ + + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + xfs_bmbt_irec_t *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist) /* i/o: list extents to free */ +{ + xfs_fsblock_t abno; /* allocated block number */ + xfs_extlen_t alen; /* allocated extent length */ + xfs_fileoff_t aoff; /* allocated file offset */ + xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* we've hit the end of extents */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return */ + xfs_bmbt_irec_t got; /* current file extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extlen_t indlen; /* indirect blocks length */ + xfs_extnum_t lastx; /* last useful extent number */ + int logflags; /* flags for transaction logging */ + xfs_extlen_t minleft; /* min blocks left after allocation */ + xfs_extlen_t minlen; /* min allocation size */ + xfs_mount_t *mp; /* xfs mount structure */ + int n; /* current extent index */ + int nallocs; /* number of extents alloc'd */ + xfs_extnum_t nextents; /* number of extents in file */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_bmbt_irec_t prev; /* previous file extent record */ + int tmp_logflags; /* temp flags holder */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + char wr; /* this is a write request */ + char rt; /* this is a realtime file */ +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + xfs_bmbt_irec_t *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) + XFS_STATS_INC(xs_blk_mapw); + else + XFS_STATS_INC(xs_blk_mapr); + /* + * IGSTATE flag is used to combine extents which + * differ only due to the state of the extents. + * This technique is used from xfs_getbmap() + * when the caller does not wish to see the + * separation (which is the default). + * + * This technique is also used when writing a + * buffer which has been partially written, + * (usually by being flushed during a chunkread), + * to ensure one write takes place. This also + * prevents a change in the xfs inode extents at + * this time, intentionally. 
This change occurs + * on completion of the write operation, in + * xfs_strat_comp(), where the xfs_bmapi() call + * is transactioned, and the extents combined. + */ + if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ + wr = 0; /* no allocations are allowed */ + ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); + logflags = 0; + nallocs = 0; + cur = NULL; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + ASSERT(wr && tp); + if ((error = xfs_bmap_local_to_extents(tp, ip, + firstblock, total, &logflags, whichfork))) + goto error0; + } + if (wr && *firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + else + minleft = 1; + } else + minleft = 0; + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + goto error0; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + n = 0; + end = bno + len; + obno = bno; + bma.ip = NULL; + + while (bno < end && n < *nmap) { + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof && !wr) + got.br_startoff = end; + inhole = eof || got.br_startoff > bno; + wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && + isnullstartblock(got.br_startblock); + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (wr && (inhole || wasdelay)) { + /* + * For the wasdelay case, we could also just + * allocate the stuff asked for in this bmap call + * but that wouldn't be as good. + */ + if (wasdelay) { + alen = (xfs_extlen_t)got.br_blockcount; + aoff = got.br_startoff; + if (lastx != NULLEXTNUM && lastx) { + ep = xfs_iext_get_ext(ifp, lastx - 1); + xfs_bmbt_get_all(ep, &prev); + } + } else { + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(alen, + got.br_startoff - bno); + aoff = bno; + } + minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; + if (flags & XFS_BMAPI_DELAY) { + xfs_extlen_t extsz; + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * make sure we don't exceed a single + * extent length when we align the + * extent by reducing length we are + * going to allocate by the maximum + * amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, + MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, + &got, &prev, extsz, + rt, eof, + flags&XFS_BMAPI_DELAY, + flags&XFS_BMAPI_CONVERT, + &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for + * delayed allocation blocks. This number gets + * adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks( + NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS); + if (error) { + if (n == 0) { + *nmap = 0; + ASSERT(cur == NULL); + return error; + } + break; + } + + /* + * Split changing sb for alen and indlen since + * they could be coming from different places. 
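+			 * For a realtime file the data blocks (alen) come out
+			 * of the free rt-extent counter while the worst-case
+			 * indirect btree blocks (indlen) always come out of
+			 * the ordinary free-block counter, so the two updates
+			 * below are made (and, on failure, undone) separately.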
+ */ + indlen = (xfs_extlen_t) + xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, + XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, + XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + if (!error) { + error = xfs_icsb_modify_counters(mp, + XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error && rt) + xfs_mod_incore_sb(mp, + XFS_SBS_FREXTENTS, + (int64_t)extsz, 0); + else if (error) + xfs_icsb_modify_counters(mp, + XFS_SBS_FDBLOCKS, + (int64_t)alen, 0); + } + + if (error) { + if (XFS_IS_QUOTA_ON(mp)) + /* unreserve the blocks now */ + (void) + xfs_trans_unreserve_quota_nblks( + NULL, ip, + (long)alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS); + break; + } + + ip->i_delayed_blks += alen; + abno = nullstartblock(indlen); + } else { + /* + * If first time, allocate and fill in + * once-only bma fields. + */ + if (bma.ip == NULL) { + bma.tp = tp; + bma.ip = ip; + bma.prevp = &prev; + bma.gotp = &got; + bma.total = total; + bma.userdata = 0; + } + /* Indicate if this is the first user data + * in the file, or just any user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma.userdata = (aoff == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : + XFS_ALLOC_USERDATA; + } + /* + * Fill in changeable bma fields. + */ + bma.eof = eof; + bma.firstblock = *firstblock; + bma.alen = alen; + bma.off = aoff; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.minlen = minlen; + bma.low = flist->xbf_low; + bma.minleft = minleft; + /* + * Only want to do the alignment at the + * eof if it is userdata and allocation length + * is larger than a stripe unit. + */ + if (mp->m_dalign && alen >= mp->m_dalign && + (!(flags & XFS_BMAPI_METADATA)) && + (whichfork == XFS_DATA_FORK)) { + if ((error = xfs_bmap_isaeof(ip, aoff, + whichfork, &bma.aeof))) + goto error0; + } else + bma.aeof = 0; + /* + * Call allocator. + */ + if ((error = xfs_bmap_alloc(&bma))) + goto error0; + /* + * Copy out result fields. + */ + abno = bma.rval; + if ((flist->xbf_low = bma.low)) + minleft = 0; + alen = bma.alen; + aoff = bma.off; + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, bma.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, bma.firstblock))); + *firstblock = bma.firstblock; + if (cur) + cur->bc_private.b.firstblock = + *firstblock; + if (abno == NULLFSBLOCK) + break; + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_bmbt_init_cursor(mp, tp, + ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + nallocs++; + } + if (cur) + cur->bc_private.b.flags = + wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; + got.br_startoff = aoff; + got.br_startblock = abno; + got.br_blockcount = alen; + got.br_state = XFS_EXT_NORM; /* assume normal */ + /* + * Determine state of extent, and the filesystem. + * A wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. 
+ */ + if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { + if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) + got.br_state = XFS_EXT_UNWRITTEN; + } + error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, + firstblock, flist, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + ep = xfs_iext_get_ext(ifp, lastx); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= aoff); + ASSERT(got.br_startoff + got.br_blockcount >= + aoff + alen); +#ifdef DEBUG + if (flags & XFS_BMAPI_DELAY) { + ASSERT(isnullstartblock(got.br_startblock)); + ASSERT(startblockval(got.br_startblock) > 0); + } + ASSERT(got.br_state == XFS_EXT_NORM || + got.br_state == XFS_EXT_UNWRITTEN); +#endif + /* + * Fall down into the found allocated space case. + */ + } else if (inhole) { + /* + * Reading in a hole. + */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + /* + * Then deal with the allocated space we found. + */ + ASSERT(ep != NULL); + if (!(flags & XFS_BMAPI_ENTIRE) && + (got.br_startoff + got.br_blockcount > obno)) { + if (obno > bno) + bno = obno; + ASSERT((bno >= obno) || (n == 0)); + ASSERT(bno < end); + mval->br_startoff = bno; + if (isnullstartblock(got.br_startblock)) { + ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); + mval->br_startblock = DELAYSTARTBLOCK; + } else + mval->br_startblock = + got.br_startblock + + (bno - got.br_startoff); + /* + * Return the minimum of what we got and what we + * asked for for the length. We can use the len + * variable here because it is modified below + * and we could have been there before coming + * here if the first part of the allocation + * didn't overlap what was asked for. + */ + mval->br_blockcount = + XFS_FILBLKS_MIN(end - bno, got.br_blockcount - + (bno - got.br_startoff)); + mval->br_state = got.br_state; + ASSERT(mval->br_blockcount <= len); + } else { + *mval = got; + if (isnullstartblock(mval->br_startblock)) { + ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); + mval->br_startblock = DELAYSTARTBLOCK; + } + } + + /* + * Check if writing previously allocated but + * unwritten extents. + */ + if (wr && + ((mval->br_state == XFS_EXT_UNWRITTEN && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || + (mval->br_state == XFS_EXT_NORM && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == + (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_bmbt_init_cursor(mp, + tp, ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM + : XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, + firstblock, flist, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + ep = xfs_iext_get_ext(ifp, lastx); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + /* + * We may have combined previously unwritten + * space with written space, so generate + * another request. 
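[Illustrative aside, not part of this patch] The mapping handed back to the caller is simply the found extent trimmed to the requested range: skip the blocks of the extent that lie before bno, then cap the length at what was asked for. A self-contained sketch of the same arithmetic with ordinary integers; the function name, parameter names and the worked numbers are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Trim a found extent (gotoff/gotblock/gotlen) to the requested range
 * [bno, end), mirroring the br_startblock/br_blockcount math above.
 * Assumes bno falls inside the extent, i.e. gotoff <= bno < gotoff + gotlen.
 */
static void trim_mapping(uint64_t gotoff, uint64_t gotblock, uint64_t gotlen,
			 uint64_t bno, uint64_t end,
			 uint64_t *mapblock, uint64_t *maplen)
{
	uint64_t skip = bno - gotoff;		/* blocks of "got" before bno */

	*mapblock = gotblock + skip;
	*maplen = (end - bno < gotlen - skip) ?	/* XFS_FILBLKS_MIN() */
		  (end - bno) : (gotlen - skip);
}

int main(void)
{
	uint64_t block, len;

	/*
	 * Extent covering file blocks 100..199 at disk block 5000;
	 * the caller asked for file blocks 120..149.
	 */
	trim_mapping(100, 5000, 100, 120, 150, &block, &len);
	printf("maps to disk block %llu for %llu blocks\n",
	       (unsigned long long)block, (unsigned long long)len);
	/* prints: maps to disk block 5020 for 30 blocks */
	return 0;
}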
+ */ + if (mval->br_blockcount < len) + continue; + } + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || + (mval->br_blockcount <= len) || + (mval->br_startoff < obno)); + bno = mval->br_startoff + mval->br_blockcount; + len = end - bno; + if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == + mval[-1].br_startblock + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + n++; + } + /* + * If we're done, stop now. Stop when we've allocated + * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || nallocs >= *nmap) + break; + /* + * Else go on to the next record. + */ + prev = got; + if (++lastx < nextents) { + ep = xfs_iext_get_ext(ifp, lastx); + xfs_bmbt_get_all(ep, &got); + } else { + eof = 1; + } + } + *nmap = n; + /* + * Transform from btree to extents, give it cur. + */ + if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(wr && cur); + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. 
+ */ + if (logflags) { + ASSERT(tp && wr); + xfs_trans_log_inode(tp, ip, logflags); + } + if (cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, + cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, + cur->bc_private.b.firstblock))); + *firstblock = cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* + * Map file blocks to filesystem blocks, simple version. + * One block (extent) only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno) /* starting file offs. mapped */ +{ + int eof; /* we've hit the end of extents */ + int error; /* error return */ + xfs_bmbt_irec_t got; /* current file extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last useful extent number */ + xfs_bmbt_irec_t prev; /* previous file extent record */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return XFS_ERROR(EIO); + XFS_STATS_INC(xs_blk_mapr); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof || got.br_startoff > bno) { + *fsb = NULLFSBLOCK; + return 0; + } + ASSERT(!isnullstartblock(got.br_startblock)); + ASSERT(bno < got.br_startoff + got.br_blockcount); + *fsb = got.br_startblock + (bno - got.br_startoff); + return 0; +} + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + xfs_trans_t *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* misc flags */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done) /* set if not done yet */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_irec_t del; /* extent being deleted */ + int eof; /* is deleting at eof */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t extno; /* extent number in list */ + xfs_bmbt_irec_t got; /* current extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int isrt; /* freeing in rt area */ + xfs_extnum_t lastx; /* last extent index used */ + int logflags; /* transaction logging flags */ + xfs_extlen_t mod; /* rt extent offset */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_irec_t prev; /* previous extent record */ + xfs_fileoff_t start; /* first file offset deleted */ + int tmp_logflags; /* partial logging flags */ + int wasdel; /* was a delayed alloc extent */ + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + ASSERT(len > 0); + ASSERT(nexts >= 0); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *done = 1; + return 0; + } + XFS_STATS_INC(xs_blk_unmap); + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + start = bno; + bno = start + len - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + + /* + * Check to see if the given block number is past the end of the + * file, back up to the last block if so... + */ + if (eof) { + ep = xfs_iext_get_ext(ifp, --lastx); + xfs_bmbt_get_all(ep, &got); + bno = got.br_startoff + got.br_blockcount - 1; + } + logflags = 0; + if (ifp->if_flags & XFS_IFBROOT) { + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else + cur = NULL; + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + (nexts == 0 || extno < nexts)) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. + */ + if (got.br_startoff > bno) { + if (--lastx < 0) + break; + ep = xfs_iext_get_ext(ifp, lastx); + xfs_bmbt_get_all(ep, &got); + } + /* + * Is the last block of this extent before the range + * we're supposed to delete? If so, we're done. + */ + bno = XFS_FILEOFF_MIN(bno, + got.br_startoff + got.br_blockcount - 1); + if (bno < start) + break; + /* + * Then deal with the (possibly delayed) allocated space + * we found. 
+ */ + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; + if (!wasdel) + del.br_startblock += start - got.br_startoff; + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent not lined up at the end. + * The extent could have been split into written + * and unwritten pieces, or we could just be + * unmapping part of it. But we can't really + * get rid of part of a realtime extent. + */ + if (del.br_state == XFS_EXT_UNWRITTEN || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * This piece is unwritten, or we're not + * using unwritten extents. Skip over it. + */ + ASSERT(bno >= mod); + bno -= mod > del.br_blockcount ? + del.br_blockcount : mod; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(xfs_iext_get_ext( + ifp, lastx), &got); + } + continue; + } + /* + * It's written, turn it unwritten. + * This is better than zeroing it. + */ + ASSERT(del.br_state == XFS_EXT_NORM); + ASSERT(xfs_trans_get_block_res(tp) > 0); + /* + * If this spans a realtime extent boundary, + * chop it back to the start of the one we end at. + */ + if (del.br_blockcount > mod) { + del.br_startoff += del.br_blockcount - mod; + del.br_startblock += del.br_blockcount - mod; + del.br_blockcount = mod; + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, + firstblock, flist, &logflags, + XFS_DATA_FORK); + if (error) + goto error0; + goto nodelete; + } + if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent is lined up at the end but not + * at the front. We'll get rid of full extents if + * we can. + */ + mod = mp->m_sb.sb_rextsize - mod; + if (del.br_blockcount > mod) { + del.br_blockcount -= mod; + del.br_startoff += mod; + del.br_startblock += mod; + } else if ((del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + xfs_trans_get_block_res(tp) == 0)) || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * Can't make it unwritten. There isn't + * a full extent here so just skip it. + */ + ASSERT(bno >= del.br_blockcount); + bno -= del.br_blockcount; + if (got.br_startoff > bno) { + if (--lastx >= 0) { + ep = xfs_iext_get_ext(ifp, + lastx); + xfs_bmbt_get_all(ep, &got); + } + } + continue; + } else if (del.br_state == XFS_EXT_UNWRITTEN) { + /* + * This one is already unwritten. + * It must have a written left neighbor. + * Unwrite the killed part of that one and + * try again. 
+ */ + ASSERT(lastx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + lastx - 1), &prev); + ASSERT(prev.br_state == XFS_EXT_NORM); + ASSERT(!isnullstartblock(prev.br_startblock)); + ASSERT(del.br_startblock == + prev.br_startblock + prev.br_blockcount); + if (prev.br_startoff < start) { + mod = start - prev.br_startoff; + prev.br_blockcount -= mod; + prev.br_startblock += mod; + prev.br_startoff = start; + } + prev.br_state = XFS_EXT_UNWRITTEN; + lastx--; + error = xfs_bmap_add_extent(ip, &lastx, &cur, + &prev, firstblock, flist, &logflags, + XFS_DATA_FORK); + if (error) + goto error0; + goto nodelete; + } else { + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, &lastx, &cur, + &del, firstblock, flist, &logflags, + XFS_DATA_FORK); + if (error) + goto error0; + goto nodelete; + } + } + if (wasdel) { + ASSERT(startblockval(del.br_startblock) > 0); + /* Update realtime/data freespace, unreserve quota */ + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + (int64_t)rtexts, 0); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)del.br_blockcount, 0); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_REGBLKS); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. + * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. + */ + if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + bno = del.br_startoff - 1; +nodelete: + /* + * If not done go on to the next (previous) record. + */ + if (bno != (xfs_fileoff_t)-1 && bno >= start) { + if (lastx >= 0) { + ep = xfs_iext_get_ext(ifp, lastx); + if (xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, + lastx); + } + xfs_bmbt_get_all(ep, &got); + } + extno++; + } + } + *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Convert to a btree if necessary. 
+ */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from btree to extents, give it cur + */ + else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(cur != NULL); + error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from extents to local? + */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log inode even in the error case, if the transaction + * is dirty we'll need to shut down the filesystem. + */ + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + if (!error) { + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * returns 1 for success, 0 if we failed to map the extent. + */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmapx *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = xfs_fsb_to_db(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; + } + + return 1; +} + +/* + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. 
+ */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ +{ + __int64_t bmvend; /* last block requested */ + int error = 0; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmapx *out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int iflags; /* interface flags */ + int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; + + mp = ip->i_mount; + iflags = bmv->bmv_iflags; + whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + + prealloced = 0; + fixlen = 1LL << 32; + } else { + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + + if (xfs_get_extsz_hint(ip) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ + prealloced = 1; + fixlen = XFS_MAXIOFFSET(mp); + } else { + prealloced = 0; + fixlen = ip->i_size; + } + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); + } + + nex = bmv->bmv_count - 1; + if (nex <= 0) + return XFS_ERROR(EINVAL); + bmvend = bmv->bmv_offset + bmv->bmv_length; + + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); + if (!out) + return XFS_ERROR(ENOMEM); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + if (error) + goto out_unlock_iolock; + } + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ + } + + lock = xfs_ilock_map_shared(ip); + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. 
+ */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + error = ENOMEM; + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); + if (!map) + goto out_unlock_ilock; + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? subnex : nexleft; + error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + bmapi_flags, NULL, 0, map, &nmap, + NULL); + if (error) + goto out_free_map; + ASSERT(nmap <= subnex); + + for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + out[cur_ext].bmv_oflags = 0; + if (map[i].br_state == XFS_EXT_UNWRITTEN) + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; + else if (map[i].br_startblock == DELAYSTARTBLOCK) + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; + ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || + (map[i].br_startblock != DELAYSTARTBLOCK)); + if (map[i].br_startblock == HOLESTARTBLOCK && + whichfork == XFS_ATTR_FORK) { + /* came to the end of attribute fork */ + out[cur_ext].bmv_oflags |= BMV_OF_LAST; + goto out_free_map; + } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) + goto out_free_map; + + bmv->bmv_offset = + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; + bmv->bmv_entries++; + cur_ext++; + } + } while (nmap && nexleft && bmv->bmv_length); + + out_free_map: + kmem_free(map); + out_unlock_ilock: + xfs_iunlock_map_shared(ip, lock); + out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + + kmem_free(out); + return error; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. 
+ */ +STATIC int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + char *aeof) /* return value */ +{ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_irec_t s; /* expanded extent record */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *aeof = 1; + return 0; + } + /* + * Go to the last extent + */ + lastrec = xfs_iext_get_ext(ifp, nextents - 1); + xfs_bmbt_get_all(lastrec, &s); + /* + * Check we are allocating in the last extent (for delayed allocations) + * or past the last extent for non-delayed allocations. + */ + *aeof = (off >= s.br_startoff && + off < s.br_startoff + s.br_blockcount && + isnullstartblock(s.br_startblock)) || + off >= s.br_startoff + s.br_blockcount; + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. + */ +int /* error */ +xfs_bmap_eof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t endoff, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + int *eof) /* result value */ +{ + xfs_fsblock_t blockcount; /* extent block count */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff; /* extent starting file offset */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *eof = 1; + return 0; + } + /* + * Go to the last extent + */ + lastrec = xfs_iext_get_ext(ifp, nextents - 1); + startoff = xfs_bmbt_get_startoff(lastrec); + blockcount = xfs_bmbt_get_blockcount(lastrec); + *eof = endoff >= startoff + blockcount; + return 0; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + 
be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. 
+ * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} +#endif + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks is use. 
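[Illustrative aside, not part of this patch] From a caller's side, xfs_bmap_count_blocks() is a single call plus an error check; note that for a btree-format fork the returned count also includes the bmbt blocks themselves, since xfs_bmap_count_tree() below bumps the count once per btree block visited. A minimal, hedged fragment; the surrounding locking and the tp/ip setup are assumed to exist around it:

	int	count = 0;
	int	error;

	/* count the fsblocks referenced by the data fork of "ip" */
	error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK, &count);
	if (error)
		return error;
	/* for a btree-format fork, "count" includes the bmbt blocks too */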
+ */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + if ((error = xfs_btree_read_bufl(mp, tp, nextbno, + 0, &nbp, XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will alays punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. 
Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, start_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h new file mode 100644 index 00000000..c62234bd --- /dev/null +++ b/fs/xfs/xfs_bmap.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_H__ +#define __XFS_BMAP_H__ + +struct getbmap; +struct xfs_bmbt_irec; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +typedef struct xfs_bmap_free_item +{ + xfs_fsblock_t xbfi_startblock;/* starting fs block number */ + xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ + struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ +} xfs_bmap_free_item_t; + +/* + * Header for free extent list. + * + * xbf_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. 
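[Illustrative aside, not part of this patch] The free list described here is always driven through the same lifecycle: initialise it together with firstblock, thread both through the xfs_bmapi()/xfs_bunmapi() calls made under the transaction, then finish the list, or cancel it on error. A hedged sketch of that pattern; the transaction setup ("tp") and the error unwinding are assumed to exist around it, and the variable names are chosen for illustration:

	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		firstfsb;
	int			committed;
	int			error;

	xfs_bmap_init(&free_list, &firstfsb);

	/*
	 * One or more xfs_bmapi()/xfs_bunmapi() calls go here, each passing
	 * &firstfsb and &free_list so they stay in the same allocation group
	 * and queue up the extents to be freed at transaction end.
	 */

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);	/* drop the queued extents */
		/* ...then back out the transaction (error path assumed) */
	}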
+ */ +typedef struct xfs_bmap_free +{ + xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + int xbf_count; /* count of items on list */ + int xbf_low; /* alloc in low mode */ +} xfs_bmap_free_t; + +#define XFS_BMAP_MAX_NMAP 4 + +/* + * Flags for xfs_bmapi + */ +#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ +#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ +#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ + /* combine contig. space */ +#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +/* + * unwritten extent conversion - this needs write cache flushing and no additional + * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts + * from written to unwritten, otherwise convert from unwritten to written. + */ +#define XFS_BMAPI_CONVERT 0x200 + +#define XFS_BMAPI_FLAGS \ + { XFS_BMAPI_WRITE, "WRITE" }, \ + { XFS_BMAPI_DELAY, "DELAY" }, \ + { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ + { XFS_BMAPI_METADATA, "METADATA" }, \ + { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ + { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ + { XFS_BMAPI_CONTIG, "CONTIG" }, \ + { XFS_BMAPI_CONVERT, "CONVERT" } + + +static inline int xfs_bmapi_aflag(int w) +{ + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); +} + +/* + * Special values for xfs_bmbt_irec_t br_startblock field. + */ +#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) +#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) + +static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) +{ + ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ + (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); +} + +/* + * Argument structure for xfs_bmap_alloc. + */ +typedef struct xfs_bmalloca { + xfs_fsblock_t firstblock; /* i/o first block allocated */ + xfs_fsblock_t rval; /* starting block of new extent */ + xfs_fileoff_t off; /* offset in file filling in */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec *prevp; /* extent before the new one */ + struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ + xfs_extlen_t alen; /* i/o length asked/allocated */ + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + char eof; /* set if allocating past last extent */ + char wasdel; /* replacing a delayed allocation */ + char userdata;/* set if is user data */ + char low; /* low on space, using seq'l ags */ + char aeof; /* allocated space at eof */ + char conv; /* overwriting unwritten extents */ +} xfs_bmalloca_t; + +/* + * Flags for xfs_bmap_add_extent*. 
+ */ +#define BMAP_LEFT_CONTIG (1 << 0) +#define BMAP_RIGHT_CONTIG (1 << 1) +#define BMAP_LEFT_FILLING (1 << 2) +#define BMAP_RIGHT_FILLING (1 << 3) +#define BMAP_LEFT_DELAY (1 << 4) +#define BMAP_RIGHT_DELAY (1 << 5) +#define BMAP_LEFT_VALID (1 << 6) +#define BMAP_RIGHT_VALID (1 << 7) +#define BMAP_ATTRFORK (1 << 8) + +#define XFS_BMAP_EXT_FLAGS \ + { BMAP_LEFT_CONTIG, "LC" }, \ + { BMAP_RIGHT_CONTIG, "RC" }, \ + { BMAP_LEFT_FILLING, "LF" }, \ + { BMAP_RIGHT_FILLING, "RF" }, \ + { BMAP_ATTRFORK, "ATTR" } + +/* + * Add bmap trace insert entries for all the contents of the extent list. + * + * Quite excessive tracing. Only do this for debug builds. + */ +#if defined(__KERNEL) && defined(DEBUG) +void +xfs_bmap_trace_exlist( + struct xfs_inode *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in list */ + int whichfork, + unsigned long caller_ip); /* data or attr fork */ +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ + xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) +#else +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) +#endif + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + struct xfs_inode *ip, /* incore inode pointer */ + int size, /* space needed for new attribute */ + int rsvd); /* flag for reserved block allocation */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + struct xfs_mount *mp); /* mount point structure */ + +/* + * Routine to clean up the free list data structure when + * an error occurs during a transaction. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist); /* free list to clean up */ + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + struct xfs_mount *mp, /* file system mount structure */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the first unused block in the file. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + */ +int /* error */ +xfs_bmap_first_unused( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *unused, /* unused block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_before( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. 
+ */ +int /* error */ +xfs_bmap_last_offset( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *unused, /* last block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int +xfs_bmap_one_block( + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Read in the extents to iu_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. + */ +int /* error */ +xfs_bmap_read_extents( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist); /* i/o: list extents to free */ + +/* + * Map file blocks to filesystem blocks, simple version. + * One block only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno); /* starting file offs. mapped */ + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* XFS_BMAPI_... */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done); /* set if not done yet */ + +/* + * Check an extent list, which has just been read, for + * any bit in the extent flag field. 
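[Illustrative aside, not part of this patch] For a pure lookup, xfs_bmapi() is called with no transaction, no XFS_BMAPI_WRITE and no free list, exactly as xfs_bmap_punch_delalloc_range() does in xfs_bmap.c above. A hedged caller sketch; "ip" and "offset_fsb" are assumed to be set up and locked elsewhere:

	xfs_bmbt_irec_t		imap;
	int			nimaps = 1;
	int			error;

	/* read-only lookup: no transaction, no allocation, no free list */
	error = xfs_bmapi(NULL, ip, offset_fsb, 1, XFS_BMAPI_ENTIRE,
			  NULL, 0, &imap, &nimaps, NULL);
	if (error)
		return error;

	if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
		/* hole: nothing mapped at offset_fsb */
	} else if (imap.br_startblock == DELAYSTARTBLOCK) {
		/* delayed allocation: space reserved but not yet on disk */
	} else {
		/* real extent: imap.br_startblock is the filesystem block */
	}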
+ */ +int +xfs_check_nostate_extents( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + xfs_extnum_t num); + +uint +xfs_default_attroffset( + struct xfs_inode *ip); + +#ifdef __KERNEL__ + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. + * + * Return 1 if the given transaction was committed and a new one allocated, + * and 0 otherwise. + */ +int /* error */ +xfs_bmap_finish( + struct xfs_trans **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed); /* xact committed or not */ + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); + +/* + * Get inode's extents as described in bmv, and format for output. + */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg); /* formatter arg */ + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof); + +/* + * Count fsblocks of the given fork. + */ +int +xfs_bmap_count_blocks( + xfs_trans_t *tp, + struct xfs_inode *ip, + int whichfork, + int *count); + +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length); +#endif /* __KERNEL__ */ + +#endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c new file mode 100644 index 00000000..87d3c10b --- /dev/null +++ b/fs/xfs/xfs_bmap_btree.c @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" + +/* + * Determine the extent state. + */ +/* ARGSUSED */ +STATIC xfs_exntst_t +xfs_extent_state( + xfs_filblks_t blks, + int extent_flag) +{ + if (extent_flag) { + ASSERT(blks != 0); /* saved for DMIG */ + return XFS_EXT_UNWRITTEN; + } + return XFS_EXT_NORM; +} + +/* + * Convert on-disk form of btree root to in-memory form. 
+ */ +void +xfs_bmdr_to_bmbt( + struct xfs_mount *mp, + xfs_bmdr_block_t *dblock, + int dblocklen, + struct xfs_btree_block *rblock, + int rblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + rblock->bb_level = dblock->bb_level; + ASSERT(be16_to_cpu(rblock->bb_level) > 0); + rblock->bb_numrecs = dblock->bb_numrecs; + rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + fkp = XFS_BMDR_KEY_ADDR(dblock, 1); + tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Convert a compressed bmap extent record to an uncompressed form. + * This code must be in sync with the routines xfs_bmbt_get_startoff, + * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. + */ +STATIC void +__xfs_bmbt_get_all( + __uint64_t l0, + __uint64_t l1, + xfs_bmbt_irec_t *s) +{ + int ext_flag; + xfs_exntst_t st; + + ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); + s->br_startoff = ((xfs_fileoff_t)l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +#if XFS_BIG_BLKNOS + s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)l1) >> 21); +#else +#ifdef DEBUG + { + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | + (((xfs_dfsbno_t)l1) >> 21); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); + s->br_startblock = (xfs_fsblock_t)b; + } +#else /* !DEBUG */ + s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ + s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); + /* This is xfs_extent_state() in-line */ + if (ext_flag) { + ASSERT(s->br_blockcount != 0); /* saved for DMIG */ + st = XFS_EXT_UNWRITTEN; + } else + st = XFS_EXT_NORM; + s->br_state = st; +} + +void +xfs_bmbt_get_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(r->l0, r->l1, s); +} + +/* + * Extract the blockcount field from an in memory bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_host_t *r) +{ + return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); +} + +/* + * Extract the startblock field from an in memory bmap extent record. + */ +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_host_t *r) +{ +#if XFS_BIG_BLKNOS + return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)r->l1) >> 21); +#else +#ifdef DEBUG + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | + (((xfs_dfsbno_t)r->l1) >> 21); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); + return (xfs_fsblock_t)b; +#else /* !DEBUG */ + return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Extract the startoff field from an in memory bmap extent record. 
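The extraction helpers above and the xfs_bmbt_set_allf/xfs_bmbt_disk_set_allf routines below all work on the same 128-bit packing: bit 63 of l0 is the unwritten flag, bits 9..62 of l0 hold the 54-bit startoff, the low 9 bits of l0 plus the top 43 bits of l1 form the 52-bit startblock, and the low 21 bits of l1 are the blockcount. A user-space round-trip of that packing (big-block variant only, host byte order, no kernel types):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MASK64LO(n) ((1ULL << (n)) - 1)         /* low n bits, n < 64 */

/* flag:1 | startoff:54 | startblock top 9 bits -> l0;
 * startblock low 43 bits | blockcount:21 -> l1 */
static void pack(uint64_t *l0, uint64_t *l1, uint64_t startoff,
                 uint64_t startblock, uint64_t blockcount, int unwritten)
{
    *l0 = ((uint64_t)unwritten << 63) | (startoff << 9) | (startblock >> 43);
    *l1 = (startblock << 21) | (blockcount & MASK64LO(21));
}

static void unpack(uint64_t l0, uint64_t l1, uint64_t *startoff,
                   uint64_t *startblock, uint64_t *blockcount, int *unwritten)
{
    *unwritten = (int)(l0 >> 63);
    *startoff = (l0 & MASK64LO(63)) >> 9;
    *startblock = ((l0 & MASK64LO(9)) << 43) | (l1 >> 21);
    *blockcount = l1 & MASK64LO(21);
}

int main(void)
{
    uint64_t l0, l1, off, blk, len;
    int unwr;

    pack(&l0, &l1, 123456, 0x123456789abULL, 2000, 1);
    unpack(l0, l1, &off, &blk, &len, &unwr);
    assert(off == 123456 && blk == 0x123456789abULL && len == 2000 && unwr == 1);
    printf("l0 = %016llx  l1 = %016llx\n",
           (unsigned long long)l0, (unsigned long long)l1);
    return 0;
}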
+ */ +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_host_t *r) +{ + return ((xfs_fileoff_t)r->l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + +xfs_exntst_t +xfs_bmbt_get_state( + xfs_bmbt_rec_host_t *r) +{ + int ext_flag; + + ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); + return xfs_extent_state(xfs_bmbt_get_blockcount(r), + ext_flag); +} + +/* + * Extract the blockcount field from an on disk bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_disk_get_blockcount( + xfs_bmbt_rec_t *r) +{ + return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); +} + +/* + * Extract the startoff field from a disk format bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_disk_get_startoff( + xfs_bmbt_rec_t *r) +{ + return ((xfs_fileoff_t)be64_to_cpu(r->l0) & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + + +/* + * Set all the fields in a bmap extent record from the arguments. + */ +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + +#if XFS_BIG_BLKNOS + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); +#else /* !XFS_BIG_BLKNOS */ + if (isnullstartblock(startblock)) { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } else { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_set_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + + +/* + * Set all the fields in a disk format bmap extent record from the arguments. + */ +void +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 
0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + +#if XFS_BIG_BLKNOS + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); +#else /* !XFS_BIG_BLKNOS */ + if (isnullstartblock(startblock)) { + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); + r->l1 = cpu_to_be64(xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); + } else { + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +STATIC void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + +/* + * Set the blockcount field in a bmap extent record. + */ +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_host_t *r, + xfs_filblks_t v) +{ + ASSERT((v & xfs_mask64hi(43)) == 0); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | + (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); +} + +/* + * Set the startblock field in a bmap extent record. + */ +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_host_t *r, + xfs_fsblock_t v) +{ +#if XFS_BIG_BLKNOS + ASSERT((v & xfs_mask64hi(12)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | + (xfs_bmbt_rec_base_t)(v >> 43); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | + (xfs_bmbt_rec_base_t)(v << 21); +#else /* !XFS_BIG_BLKNOS */ + if (isnullstartblock(v)) { + r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)v << 21) | + (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } else { + r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = ((xfs_bmbt_rec_base_t)v << 21) | + (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set the startoff field in a bmap extent record. + */ +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t v) +{ + ASSERT((v & xfs_mask64hi(9)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | + ((xfs_bmbt_rec_base_t)v << 9) | + (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); +} + +/* + * Set the extent state field in a bmap extent record. + */ +void +xfs_bmbt_set_state( + xfs_bmbt_rec_host_t *r, + xfs_exntst_t v) +{ + ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); + if (v == XFS_EXT_NORM) + r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); + else + r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); +} + +/* + * Convert in-memory form of btree root to on-disk form. 
+ */ +void +xfs_bmbt_to_bmdr( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + xfs_bmdr_block_t *dblock, + int dblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); + ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO); + ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO); + ASSERT(be16_to_cpu(rblock->bb_level) > 0); + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + tkp = XFS_BMDR_KEY_ADDR(dblock, 1); + fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Check extent records, which have just been read, for + * any bit in the extent flag field. ASSERT on debug + * kernels, as this condition should not occur. + * Return an error condition (1) if any flags found, + * otherwise return 0. + */ + +int +xfs_check_nostate_extents( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + xfs_extnum_t num) +{ + for (; num > 0; num--, idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + if ((ep->l0 >> + (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { + ASSERT(0); + return 1; + } + } + return 0; +} + + +STATIC struct xfs_btree_cur * +xfs_bmbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *new; + + new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.b.ip, cur->bc_private.b.whichfork); + + /* + * Copy the firstblock, flist, and flags values, + * since init cursor doesn't get them. + */ + new->bc_private.b.firstblock = cur->bc_private.b.firstblock; + new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.flags = cur->bc_private.b.flags; + + return new; +} + +STATIC void +xfs_bmbt_update_cursor( + struct xfs_btree_cur *src, + struct xfs_btree_cur *dst) +{ + ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || + (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + + dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_private.b.firstblock = src->bc_private.b.firstblock; + + src->bc_private.b.allocated = 0; +} + +STATIC int +xfs_bmbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = cur->bc_private.b.firstblock; + args.firstblock = args.fsbno; + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); + args.type = XFS_ALLOCTYPE_START_BNO; + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. 
+ */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + cur->bc_private.b.flist->xbf_low = 1; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + + new->l = cpu_to_be64(args.fsbno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + + error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_bmbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + + xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, bp); + return 0; +} + +STATIC int +xfs_bmbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0) / 2; + } + + return cur->bc_mp->m_bmap_dmnr[level != 0]; +} + +int +xfs_bmbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0); + } + + return cur->bc_mp->m_bmap_dmxr[level != 0]; + +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_bmbt_get_maxrecs for the root node, too. 
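The level checks in xfs_bmbt_get_minrecs/xfs_bmbt_get_maxrecs above exist because the root block does not live in a filesystem block at all: it sits in the inode fork, so its capacity comes from the fork size (get_dmaxrecs uses bc_private.b.forksize), and the usual B-tree "at least half full" bound becomes maxrecs / 2. A rough user-space illustration, assuming a hypothetical 256-byte data-fork root area and the 4-byte on-disk root header (see xfs_bmdr_block in xfs_bmap_btree.h below):

#include <stdint.h>
#include <stdio.h>

/* On-disk root header is just level + numrecs. */
struct bmdr_block {
    uint16_t bb_level;
    uint16_t bb_numrecs;
};

static int bmdr_maxrecs(int blocklen, int leaf)
{
    blocklen -= sizeof(struct bmdr_block);
    if (leaf)
        return blocklen / 16;           /* one 16-byte extent record */
    return blocklen / (8 + 8);          /* one key plus one pointer */
}

int main(void)
{
    int forksize = 256;                 /* hypothetical data-fork root area */
    int maxrecs = bmdr_maxrecs(forksize, 0);

    printf("root leaf capacity = %d records\n", bmdr_maxrecs(forksize, 1));
    printf("root node capacity = %d entries\n", maxrecs);
    printf("half-full minimum  = %d entries\n", maxrecs / 2);
    return 0;
}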
+ */ +STATIC int +xfs_bmbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_bmap_dmxr[level != 0]; + return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, + level == 0); +} + +STATIC void +xfs_bmbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); +} + +STATIC void +xfs_bmbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->bmbt.br_startoff != 0); + + xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), + 0, 0, XFS_EXT_NORM); +} + +STATIC void +xfs_bmbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); +} + +STATIC void +xfs_bmbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC __int64_t +xfs_bmbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + cur->bc_rec.b.br_startoff; +} + +#ifdef DEBUG +STATIC int +xfs_bmbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be64_to_cpu(k1->bmbt.br_startoff) < + be64_to_cpu(k2->bmbt.br_startoff); +} + +STATIC int +xfs_bmbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return xfs_bmbt_disk_get_startoff(&r1->bmbt) + + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= + xfs_bmbt_disk_get_startoff(&r2->bmbt); +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_bmbt_trace_buf; + +STATIC void +xfs_bmbt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_private.b.whichfork; + + ktrace_enter(xfs_bmbt_trace_buf, + (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), + (void *)func, (void *)s, (void *)ip, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_bmbt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + struct xfs_bmbt_rec_host r; + + xfs_bmbt_set_all(&r, &cur->bc_rec.b); + + *s0 = (cur->bc_nlevels << 24) | + (cur->bc_private.b.flags << 16) | + cur->bc_private.b.allocated; + *l0 = r.l0; + *l1 = r.l1; +} + +STATIC void +xfs_bmbt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be64_to_cpu(key->bmbt.br_startoff); + *l1 = 0; +} + +/* Endian flipping versions of the bmbt extraction functions */ +STATIC void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), + get_unaligned_be64(&r->l1), s); +} + +STATIC void +xfs_bmbt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + struct xfs_bmbt_irec irec; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + *l0 = irec.br_startoff; + *l1 = irec.br_startblock; + *l2 = irec.br_blockcount; +} +#endif /* 
XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_bmbt_ops = { + .rec_len = sizeof(xfs_bmbt_rec_t), + .key_len = sizeof(xfs_bmbt_key_t), + + .dup_cursor = xfs_bmbt_dup_cursor, + .update_cursor = xfs_bmbt_update_cursor, + .alloc_block = xfs_bmbt_alloc_block, + .free_block = xfs_bmbt_free_block, + .get_maxrecs = xfs_bmbt_get_maxrecs, + .get_minrecs = xfs_bmbt_get_minrecs, + .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, + .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_rec_from_key = xfs_bmbt_init_rec_from_key, + .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, + .key_diff = xfs_bmbt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_bmbt_keys_inorder, + .recs_inorder = xfs_bmbt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_bmbt_trace_enter, + .trace_cursor = xfs_bmbt_trace_cursor, + .trace_key = xfs_bmbt_trace_key, + .trace_record = xfs_bmbt_trace_record, +#endif +}; + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * /* new bmap btree cursor */ +xfs_bmbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* inode owning the btree */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_btnum = XFS_BTNUM_BMAP; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_bmbt_ops; + cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + + return cur; +} + +/* + * Calculate number of records in a bmap btree block. + */ +int +xfs_bmbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_BMBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Calculate number of records in a bmap btree inode root. + */ +int +xfs_bmdr_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= sizeof(xfs_bmdr_block_t); + + if (leaf) + return blocklen / sizeof(xfs_bmdr_rec_t); + return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h new file mode 100644 index 00000000..0e66c4ea --- /dev/null +++ b/fs/xfs/xfs_bmap_btree.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_BTREE_H__ +#define __XFS_BMAP_BTREE_H__ + +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ + +struct xfs_btree_cur; +struct xfs_btree_block; +struct xfs_mount; +struct xfs_inode; +struct xfs_trans; + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) +#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#define DSTARTBLOCKMASK \ + (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline int isnulldstartblock(xfs_dfsbno_t x) +{ + return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Extent state and extent format macros. + */ +#define XFS_EXTFMT_INODE(x) \ + (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ + XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) +#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + +/* + * Btree block header size depends on a superblock flag. 
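A delayed-allocation extent has no real startblock yet; the nullstartblock()/startblockval() helpers above encode "not allocated, but k worst-case indirect blocks are reserved" by setting every bit of STARTBLOCKMASK and keeping the reservation count in the low STARTBLOCKVALBITS bits. A stand-alone check of that encoding, assuming the XFS_BIG_BLKNOS == 1 value of STARTBLOCKMASKBITS:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define STARTBLOCKVALBITS  17
#define STARTBLOCKMASKBITS (15 + 20)    /* assuming XFS_BIG_BLKNOS == 1 */
#define STARTBLOCKMASK \
    ((((uint64_t)1 << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)

static int isnullstartblock(uint64_t x)
{
    return (x & STARTBLOCKMASK) == STARTBLOCKMASK;
}

static uint64_t nullstartblock(int k)
{
    assert(k < (1 << STARTBLOCKVALBITS));
    return STARTBLOCKMASK | (uint64_t)k;
}

static uint64_t startblockval(uint64_t x)
{
    return x & ~STARTBLOCKMASK;
}

int main(void)
{
    /* A delayed-allocation extent remembering 5 reserved indirect blocks. */
    uint64_t sb = nullstartblock(5);

    assert(isnullstartblock(sb));
    assert(startblockval(sb) == 5);
    /* An ordinary block number never has all of the mask bits set. */
    assert(!isnullstartblock(0x12345678ULL));
    printf("delalloc startblock = %#llx, indlen = %llu\n",
           (unsigned long long)sb, (unsigned long long)startblockval(sb));
    return 0;
}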
+ * + * (not quite yet, but soon) + */ +#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN + +#define XFS_BMBT_REC_ADDR(mp, block, index) \ + ((xfs_bmbt_rec_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_rec_t))) + +#define XFS_BMBT_KEY_ADDR(mp, block, index) \ + ((xfs_bmbt_key_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_key_t))) + +#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_bmbt_ptr_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_bmbt_key_t) + \ + ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) + +#define XFS_BMDR_REC_ADDR(block, index) \ + ((xfs_bmdr_rec_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_rec_t))) + +#define XFS_BMDR_KEY_ADDR(block, index) \ + ((xfs_bmdr_key_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_key_t))) + +#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ + ((xfs_bmdr_ptr_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + (maxrecs) * sizeof(xfs_bmdr_key_t) + \ + ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) + +/* + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ + XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) + +#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ + (int)(XFS_BTREE_LBLOCK_LEN + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) + +#define XFS_BMAP_BROOT_SPACE(bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMDR_SPACE_CALC(nrecs) \ + (int)(sizeof(xfs_bmdr_block_t) + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) + +/* + * Maximum number of bmap btree levels. + */ +#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) + +/* + * Prototypes for xfs_bmap.c to call. 
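The XFS_BMBT_*_ADDR macros above lay a block out as header, then records (leaf) or keys followed by pointers (node), with 1-based indices. How many entries fit, and where the pointer area starts, falls straight out of the sizes; the sketch below assumes a 4096-byte block and the 24-byte pre-CRC long-format header (records are two __be64 words, keys and pointers one each):

#include <stdio.h>

#define LBLOCK_HDR_LEN  24      /* magic(4)+level(2)+numrecs(2)+2 x 8-byte sibs */
#define BMBT_REC_LEN    16      /* xfs_bmbt_rec_t: two __be64 words */
#define BMBT_KEY_LEN    8       /* xfs_bmbt_key_t: one __be64 startoff */
#define BMBT_PTR_LEN    8       /* xfs_bmbt_ptr_t: one __be64 block number */

int main(void)
{
    int blocklen = 4096;        /* assumed filesystem block size */
    int space = blocklen - LBLOCK_HDR_LEN;
    int node_maxrecs = space / (BMBT_KEY_LEN + BMBT_PTR_LEN);

    /* Same arithmetic as xfs_bmbt_maxrecs(). */
    printf("leaf maxrecs = %d\n", space / BMBT_REC_LEN);        /* 254 */
    printf("node maxrecs = %d\n", node_maxrecs);                /* 254 */

    /* XFS_BMBT_PTR_ADDR: pointers start after all maxrecs keys. */
    printf("ptr[1] starts at byte %d\n",
           LBLOCK_HDR_LEN + node_maxrecs * BMBT_KEY_LEN);       /* 2056 */
    return 0;
}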
+ */ +extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, + struct xfs_btree_block *, int); +extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); +extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); +extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); +extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); + +extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); +extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); + +extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, + xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); +extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); +extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); +extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); +extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); + +extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, + xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); + +extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, + xfs_bmdr_block_t *, int); + +extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); +extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); + +extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_inode *, int); + + +#endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c new file mode 100644 index 00000000..2f9e97c1 --- /dev/null +++ b/fs/xfs/xfs_btree.c @@ -0,0 +1,3679 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Cursor allocation zone. + */ +kmem_zone_t *xfs_btree_cur_zone; + +/* + * Btree magic numbers. 
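xfs_magics[] just below pairs each cursor btree number with the on-disk magic that xfs_btree_check_lblock/xfs_btree_check_sblock compare against bb_magic. The magics are ASCII tags; XFS_BMAP_MAGIC from xfs_bmap_btree.h above decodes like this:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t magic = 0x424d4150;        /* XFS_BMAP_MAGIC */
    char s[5];

    s[0] = (char)(magic >> 24);         /* 'B' */
    s[1] = (char)(magic >> 16);         /* 'M' */
    s[2] = (char)(magic >> 8);          /* 'A' */
    s[3] = (char)magic;                 /* 'P' */
    s[4] = '\0';
    printf("0x%08x spells \"%s\"\n", magic, s);
    return 0;
}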
+ */ +const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { + XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +}; + + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer for block, if any */ +{ + int lblock_ok; /* block passes checks */ + struct xfs_mount *mp; /* file system mount point */ + + mp = cur->bc_mp; + lblock_ok = + be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + block->bb_u.l.bb_leftsib && + (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block */ +{ + struct xfs_buf *agbp; /* buffer for ag. freespace struct */ + struct xfs_agf *agf; /* ag. freespace structure */ + xfs_agblock_t agflen; /* native ag. freespace length */ + int sblock_ok; /* block passes checks */ + + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + agflen = be32_to_cpu(agf->agf_length); + sblock_ok = + be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK || + be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && + block->bb_u.s.bb_leftsib && + (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK || + be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && + block->bb_u.s.bb_rightsib; + if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + XFS_ERRTAG_BTREE_CHECK_SBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Debug routine: check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block, if any */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_check_lblock(cur, block, level, bp); + else + return xfs_btree_check_sblock(cur, block, level, bp); +} + +/* + * Check that (long) pointer is ok. 
+ */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_dfsbno_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + bno != NULLDFSBNO && + XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); + return 0; +} + +#ifdef DEBUG +/* + * Check that (short) pointer is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; + + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + bno != NULLAGBLOCK && + bno != 0 && + bno < agblocks); + return 0; +} + +/* + * Check that block ptr is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_ptr( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_ptr *ptr, /* btree block disk address */ + int index, /* offset from ptr to check */ + int level) /* btree block level */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + return xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level); + } else { + return xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level); + } +} +#endif + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error) /* del because of error */ +{ + int i; /* btree level */ + + /* + * Clear the buffer pointers, and release the buffers. + * If we're doing this in the face of an error, we + * need to make sure to inspect all of the entries + * in the bc_bufs array for buffers to be unlocked. + * This is because some of the btree code works from + * level n down to 0, and if we get an error along + * the way we won't have initialized all the entries + * down to 0. + */ + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + else if (!error) + break; + } + /* + * Can't free a bmap cursor without having dealt with the + * allocated indirect blocks' accounting. + */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_private.b.allocated == 0); + /* + * Free the cursor. + */ + kmem_zone_free(xfs_btree_cur_zone, cur); +} + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur) /* output cursor */ +{ + xfs_buf_t *bp; /* btree block's buffer pointer */ + int error; /* error return value */ + int i; /* level number of btree block */ + xfs_mount_t *mp; /* mount structure for filesystem */ + xfs_btree_cur_t *new; /* new cursor value */ + xfs_trans_t *tp; /* transaction pointer, can be NULL */ + + tp = cur->bc_tp; + mp = cur->bc_mp; + + /* + * Allocate a new cursor like the old one. + */ + new = cur->bc_ops->dup_cursor(cur); + + /* + * Copy the record currently in the cursor. + */ + new->bc_rec = cur->bc_rec; + + /* + * For each level current, re-get the buffer and copy the ptr value. 
+ */ + for (i = 0; i < new->bc_nlevels; i++) { + new->bc_ptrs[i] = cur->bc_ptrs[i]; + new->bc_ra[i] = cur->bc_ra[i]; + if ((bp = cur->bc_bufs[i])) { + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + xfs_btree_del_cursor(new, error); + *ncur = NULL; + return error; + } + new->bc_bufs[i] = bp; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + } else + new->bc_bufs[i] = NULL; + } + *ncur = new; + return 0; +} + +/* + * XFS btree block layout and addressing: + * + * There are two types of blocks in the btree: leaf and non-leaf blocks. + * + * The leaf record start with a header then followed by records containing + * the values. A non-leaf block also starts with the same header, and + * then first contains lookup keys followed by an equal number of pointers + * to the btree blocks at the previous level. + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * The header is called struct xfs_btree_block for reasons better left unknown + * and comes in different versions for short (32bit) and long (64bit) block + * pointers. The record and key structures are defined by the btree instances + * and opaque to the btree core. The block pointers are simple disk endian + * integers, available in a short (32bit) and long (64bit) variant. + * + * The helpers below calculate the offset of a given record, key or pointer + * into a btree block (xfs_btree_*_offset) or return a pointer to the given + * record, key or pointer (xfs_btree_*_addr). Note that all addressing + * inside the btree block is done using indices starting at one, not zero! + */ + +/* + * Return size of the btree block header for this btree instance. + */ +static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + XFS_BTREE_LBLOCK_LEN : + XFS_BTREE_SBLOCK_LEN; +} + +/* + * Return size of btree block pointers for this btree instance. + */ +static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + sizeof(__be64) : sizeof(__be32); +} + +/* + * Calculate offset of the n-th record in a btree block. + */ +STATIC size_t +xfs_btree_rec_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->rec_len; +} + +/* + * Calculate offset of the n-th key in a btree block. + */ +STATIC size_t +xfs_btree_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len; +} + +/* + * Calculate offset of the n-th block pointer in a btree block. + */ +STATIC size_t +xfs_btree_ptr_offset( + struct xfs_btree_cur *cur, + int n, + int level) +{ + return xfs_btree_block_len(cur) + + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + + (n - 1) * xfs_btree_ptr_len(cur); +} + +/* + * Return a pointer to the n-th record in the btree block. + */ +STATIC union xfs_btree_rec * +xfs_btree_rec_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_rec *) + ((char *)block + xfs_btree_rec_offset(cur, n)); +} + +/* + * Return a pointer to the n-th key in the btree block. 
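The layout comment above is the key to the generic addressing helpers: every block is header, then keys (or records), then pointers, and only the header length and pointer width differ between the short-form (per-AG, 32-bit pointers) and long-form (bmap, 64-bit pointers) flavours, which is what xfs_btree_block_len() and xfs_btree_ptr_len() select. A stand-alone version of xfs_btree_ptr_offset() under the assumption of the pre-CRC header sizes (16 bytes short form, 24 bytes long form) and made-up key sizes and record counts:

#include <stdio.h>

#define SBLOCK_HDR_LEN  16      /* magic(4)+level(2)+numrecs(2)+2 x 4-byte sibs */
#define LBLOCK_HDR_LEN  24      /* magic(4)+level(2)+numrecs(2)+2 x 8-byte sibs */

/* Same shape as xfs_btree_ptr_offset(): header, then maxrecs keys, then the
 * pointers; indices are 1-based. */
static int ptr_offset(int long_ptrs, int n, int maxrecs, int key_len)
{
    int hdr = long_ptrs ? LBLOCK_HDR_LEN : SBLOCK_HDR_LEN;
    int ptr_len = long_ptrs ? 8 : 4;

    return hdr + maxrecs * key_len + (n - 1) * ptr_len;
}

int main(void)
{
    /* Illustrative numbers only: a 100-entry node with 4-byte keys in the
     * short-form tree and 8-byte keys in the long-form tree. */
    printf("short form: ptr[1] at byte %d\n", ptr_offset(0, 1, 100, 4)); /* 416 */
    printf("long  form: ptr[1] at byte %d\n", ptr_offset(1, 1, 100, 8)); /* 824 */
    return 0;
}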
+ */ +STATIC union xfs_btree_key * +xfs_btree_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_key_offset(cur, n)); +} + +/* + * Return a pointer to the n-th block pointer in the btree block. + */ +STATIC union xfs_btree_ptr * +xfs_btree_ptr_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + int level = xfs_btree_get_level(block); + + ASSERT(block->bb_level != 0); + + return (union xfs_btree_ptr *) + ((char *)block + xfs_btree_ptr_offset(cur, n, level)); +} + +/* + * Get a the root block which is stored in the inode. + * + * For now this btree implementation assumes the btree root is always + * stored in the if_broot field of an inode fork. + */ +STATIC struct xfs_btree_block * +xfs_btree_get_iroot( + struct xfs_btree_cur *cur) +{ + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; +} + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be an inode btree root or from a buffer. + */ +STATIC struct xfs_btree_block * /* generic btree block pointer */ +xfs_btree_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp) /* buffer containing the block */ +{ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *bpp = NULL; + return xfs_btree_get_iroot(cur); + } + + *bpp = cur->bc_bufs[level]; + return XFS_BUF_TO_BLOCK(*bpp); +} + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +xfs_buf_t * /* buffer for fsbno */ +xfs_btree_get_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +xfs_buf_t * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Check for the cursor referring to the last block at the given level. 
+ */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; + else + return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; +} + +/* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to numrecs, that's the last record/key. + */ + cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); + return 1; +} + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets, /* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last) /* output: last byte offset */ +{ + int i; /* current bit number */ + __int64_t imask; /* mask for current bit number */ + + ASSERT(fields != 0); + /* + * Find the lowest bit, so the first byte offset. + */ + for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + if (imask & fields) { + *first = offsets[i]; + break; + } + } + /* + * Find the highest bit, so the last byte offset. + */ + for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + if (imask & fields) { + *last = offsets[i + 1] - 1; + break; + } + } +} + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. 
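xfs_btree_offsets() above converts a bitmask of dirty fields into a single contiguous byte range that can be handed to xfs_trans_log_buf(): the lowest set bit picks the first byte, and the highest set bit picks the last byte via the start of the next field. A self-contained copy of that logic over a toy structure; the structure and its offset table are hypothetical, but the table carries one extra entry (the total size) so "start of the next field" is defined even for the last bit, as the algorithm requires:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* A toy on-disk header and its per-field byte offsets, in bit order. */
struct toy {
    uint32_t magic;         /* field bit 0 */
    uint16_t level;         /* field bit 1 */
    uint16_t numrecs;       /* field bit 2 */
    uint32_t leftsib;       /* field bit 3 */
    uint32_t rightsib;      /* field bit 4 */
};

static const short toy_offsets[] = {
    offsetof(struct toy, magic),
    offsetof(struct toy, level),
    offsetof(struct toy, numrecs),
    offsetof(struct toy, leftsib),
    offsetof(struct toy, rightsib),
    sizeof(struct toy),
};

static void field_range(long long fields, const short *offsets, int nbits,
                        int *first, int *last)
{
    long long imask;
    int i;

    /* lowest set bit -> first byte of the range */
    for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
        if (imask & fields) {
            *first = offsets[i];
            break;
        }
    }
    /* highest set bit -> last byte (start of the next field minus one) */
    for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
        if (imask & fields) {
            *last = offsets[i + 1] - 1;
            break;
        }
    }
}

int main(void)
{
    int first, last;

    /* Only numrecs (bit 2) and rightsib (bit 4) changed. */
    field_range((1 << 2) | (1 << 4), toy_offsets, 5, &first, &last);
    printf("log bytes %d..%d of the header\n", first, last);    /* 6..15 */
    return 0;
}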
+ */ +int /* error */ +xfs_btree_read_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + xfs_buf_t **bpp, /* buffer for fsbno */ + int refval) /* ref count value for buffer */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp))) { + return error; + } + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (bp) + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + *bpp = bp; + return 0; +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); +} + +STATIC int +xfs_btree_readahead_lblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, left, 1); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, right, 1); + rval++; + } + + return rval; +} + +STATIC int +xfs_btree_readahead_sblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); + xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); + + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + left, 1); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + right, 1); + rval++; + } + + return rval; +} + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +STATIC int +xfs_btree_readahead( + struct xfs_btree_cur *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + struct xfs_btree_block *block; + + /* + * No readahead needed if we are at the root level and the + * btree root is stored in the inode. 
+ */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (lev == cur->bc_nlevels - 1)) + return 0; + + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; + + cur->bc_ra[lev] |= lr; + block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_readahead_lblock(cur, lr, block); + return xfs_btree_readahead_sblock(cur, lr, block); +} + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +STATIC void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + xfs_buf_t *bp) /* new buffer to set */ +{ + struct xfs_btree_block *b; /* btree block */ + + if (cur->bc_bufs[lev]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); + cur->bc_bufs[lev] = bp; + cur->bc_ra[lev] = 0; + + b = XFS_BUF_TO_BLOCK(bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } else { + if (be32_to_cpu(b->bb_u.s.bb_leftsib) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (be32_to_cpu(b->bb_u.s.bb_rightsib) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } +} + +STATIC int +xfs_btree_ptr_is_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return be64_to_cpu(ptr->l) == NULLDFSBNO; + else + return be32_to_cpu(ptr->s) == NULLAGBLOCK; +} + +STATIC void +xfs_btree_set_ptr_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(NULLDFSBNO); + else + ptr->s = cpu_to_be32(NULLAGBLOCK); +} + +/* + * Get/set/init sibling pointers + */ +STATIC void +xfs_btree_get_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + ptr->l = block->bb_u.l.bb_rightsib; + else + ptr->l = block->bb_u.l.bb_leftsib; + } else { + if (lr == XFS_BB_RIGHTSIB) + ptr->s = block->bb_u.s.bb_rightsib; + else + ptr->s = block->bb_u.s.bb_leftsib; + } +} + +STATIC void +xfs_btree_set_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.l.bb_rightsib = ptr->l; + else + block->bb_u.l.bb_leftsib = ptr->l; + } else { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.s.bb_rightsib = ptr->s; + else + block->bb_u.s.bb_leftsib = ptr->s; + } +} + +STATIC void +xfs_btree_init_block( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_btree_block *new) /* new block */ +{ + new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + new->bb_level = cpu_to_be16(level); + new->bb_numrecs = cpu_to_be16(numrecs); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + } else { + new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + } +} + +/* + * Return true if ptr is the last record in the btree and + * we need to track updateѕ to this record. The decision + * will be further refined in the update_lastrec method. 
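xfs_btree_readahead() above keeps one bit per direction in bc_ra[level] so it never issues the same left or right readahead twice for a block; the early return relies on the fact that OR-ing in an already-present bit cannot change the word. The same idiom in isolation (the flag names are stand-ins for XFS_BTCUR_LEFTRA/XFS_BTCUR_RIGHTRA):

#include <stdio.h>

#define LEFTRA  (1 << 0)        /* stand-in for XFS_BTCUR_LEFTRA */
#define RIGHTRA (1 << 1)        /* stand-in for XFS_BTCUR_RIGHTRA */

static int ra_done;             /* plays the role of one bc_ra[level] word */

static int readahead(int lr)
{
    /* Every requested bit already set?  Then OR-ing lr in changes nothing
     * and there is no new readahead to issue. */
    if ((ra_done | lr) == ra_done)
        return 0;
    ra_done |= lr;
    return 1;                   /* pretend the readahead was issued */
}

int main(void)
{
    printf("%d\n", readahead(LEFTRA));           /* 1: issued */
    printf("%d\n", readahead(LEFTRA));           /* 0: already done */
    printf("%d\n", readahead(LEFTRA | RIGHTRA)); /* 1: right side is new */
    printf("%d\n", readahead(RIGHTRA));          /* 0: already done */
    return 0;
}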
+ */ +STATIC int +xfs_btree_is_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level) +{ + union xfs_btree_ptr ptr; + + if (level > 0) + return 0; + if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + return 0; + + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &ptr)) + return 0; + return 1; +} + +STATIC void +xfs_btree_buf_to_ptr( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + XFS_BUF_ADDR(bp))); + else { + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, + XFS_BUF_ADDR(bp))); + } +} + +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +STATIC void +xfs_btree_set_refs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + break; + case XFS_BTNUM_INO: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + break; + case XFS_BTNUM_BMAP: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + break; + default: + ASSERT(0); + } +} + +STATIC int +xfs_btree_get_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags); + + ASSERT(*bpp); + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Read in the buffer at the given ptr and return the buffer and + * the block pointer within the buffer. + */ +STATIC int +xfs_btree_read_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int level, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + int error; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags, bpp); + if (error) + return error; + + ASSERT(*bpp != NULL); + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + xfs_btree_set_refs(cur, *bpp); + *block = XFS_BUF_TO_BLOCK(*bpp); + + error = xfs_btree_check_block(cur, *block, level, *bpp); + if (error) + xfs_trans_brelse(cur->bc_tp, *bpp); + return error; +} + +/* + * Copy keys from one btree block to another. + */ +STATIC void +xfs_btree_copy_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + union xfs_btree_key *src_key, + int numkeys) +{ + ASSERT(numkeys >= 0); + memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); +} + +/* + * Copy records from one btree block to another. 
+ */ +STATIC void +xfs_btree_copy_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *dst_rec, + union xfs_btree_rec *src_rec, + int numrecs) +{ + ASSERT(numrecs >= 0); + memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Copy block pointers from one btree block to another. + */ +STATIC void +xfs_btree_copy_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + union xfs_btree_ptr *src_ptr, + int numptrs) +{ + ASSERT(numptrs >= 0); + memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Shift keys one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + int dir, + int numkeys) +{ + char *dst_key; + + ASSERT(numkeys >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_key = (char *)key + (dir * cur->bc_ops->key_len); + memmove(dst_key, key, numkeys * cur->bc_ops->key_len); +} + +/* + * Shift records one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int dir, + int numrecs) +{ + char *dst_rec; + + ASSERT(numrecs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); + memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Shift block pointers one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int dir, + int numptrs) +{ + char *dst_ptr; + + ASSERT(numptrs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); + memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_btree_log_keys( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_key_offset(cur, first), + xfs_btree_key_offset(cur, last + 1) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_btree_log_recs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_rec_offset(cur, first), + xfs_btree_rec_offset(cur, last + 1) - 1); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log block pointer fields from a btree block (nonleaf). 
+ */ +STATIC void +xfs_btree_log_ptrs( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int first, /* index of first pointer to log */ + int last) /* index of last pointer to log */ +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + int level = xfs_btree_get_level(block); + + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_ptr_offset(cur, first, level), + xfs_btree_ptr_offset(cur, last + 1, level) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log fields from a btree block header. + */ +void +xfs_btree_log_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short soffsets[] = { /* table of offsets (short) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), + XFS_BTREE_SBLOCK_LEN + }; + static const short loffsets[] = { /* table of offsets (long) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), + XFS_BTREE_LBLOCK_LEN + }; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBI(cur, bp, fields); + + if (bp) { + xfs_btree_offsets(fields, + (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + loffsets : soffsets, + XFS_BB_NUM_BITS, &first, &last); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_increment( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + int error; /* error return value */ + int lev; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the right at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* We're done if we remain in the block after the increment. */ + if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + goto out1; + + /* Fail if we just went off the right edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, increment); + + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. 
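+ *
+ * As a rough worked example (record counts are hypothetical): with
+ * bc_nlevels == 3 and bc_ptrs == { 4, 2, 1 } over a leaf holding
+ * only 4 records, bumping the leaf index to 5 runs off the block,
+ * so this loop bumps level 1 from 2 to 3; if that block has at
+ * least 3 entries we stop there, and the walk back down below
+ * reads the new child and resets bc_ptrs[0] to 1.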
+ */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + block = xfs_btree_get_block(cur, lev, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, lev, bp); + if (error) + goto error0; +#endif + + if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + break; + + /* Read-ahead the right block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + + /* + * If we went off the root then we are either seriously + * confused or have the tree root in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + error = xfs_btree_read_buf_block(cur, ptrp, --lev, + 0, &block, &bp); + if (error) + goto error0; + + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = 1; + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_decrement( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + xfs_buf_t *bp; + int error; /* error return value */ + int lev; + union xfs_btree_ptr ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the left at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + + /* We're done if we remain in the block after the decrement. */ + if (--cur->bc_ptrs[level] > 0) + goto out1; + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we just went off the left edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, decrement); + + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* Read-ahead the left block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + + /* + * If we went off the root then we are seriously confused. + * or the root of the tree is in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. 
+ */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + error = xfs_btree_read_buf_block(cur, ptrp, --lev, + 0, &block, &bp); + if (error) + goto error0; + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_btree_lookup_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ +{ + struct xfs_buf *bp; /* buffer pointer for btree block */ + int error = 0; + + /* special case the root block if in an inode */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *blkp = xfs_btree_get_iroot(cur); + return 0; + } + + /* + * If the old buffer at this level for the disk address we are + * looking for re-use it. + * + * Otherwise throw it away and get a new one. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { + *blkp = XFS_BUF_TO_BLOCK(bp); + return 0; + } + + error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); + if (error) + return error; + + xfs_btree_setbuf(cur, level, bp); + return 0; +} + +/* + * Get current search key. For level 0 we don't actually have a key + * structure so we make one up from the record. For all other levels + * we just return the right key. + */ +STATIC union xfs_btree_key * +xfs_lookup_get_search_key( + struct xfs_btree_cur *cur, + int level, + int keyno, + struct xfs_btree_block *block, + union xfs_btree_key *kp) +{ + if (level == 0) { + cur->bc_ops->init_key_from_rec(kp, + xfs_btree_rec_addr(cur, keyno, block)); + return kp; + } + + return xfs_btree_key_addr(cur, keyno, block); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * Return 0 if can't find any such record, 1 for success. + */ +int /* error */ +xfs_btree_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* current btree block */ + __int64_t diff; /* difference for the current key */ + int error; /* error return value */ + int keyno; /* current key number */ + int level; /* level in the btree */ + union xfs_btree_ptr *pp; /* ptr to btree block */ + union xfs_btree_ptr ptr; /* ptr to btree block */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, dir); + + XFS_BTREE_STATS_INC(cur, lookup); + + block = NULL; + keyno = 0; + + /* initialise start pointer from cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + pp = &ptr; + + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + /* Get the block we need to do the lookup on. */ + error = xfs_btree_lookup_get_block(cur, level, pp, &block); + if (error) + goto error0; + + if (diff == 0) { + /* + * If we already had a key match at a higher level, we + * know we need to use the first entry in this block. 
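+ * An exact key match at an interior level means the lookup key
+ * equals the smallest key of that child's subtree, so every block
+ * we descend into from here is entered at its first entry.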
+ */ + keyno = 1; + } else { + /* Otherwise search this block. Do a binary search. */ + + int high; /* high entry number */ + int low; /* low entry number */ + + /* Set low and high entry numbers, 1-based. */ + low = 1; + high = xfs_btree_get_numrecs(block); + if (!high) { + /* Block is empty, must be an empty leaf. */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Binary search the block. */ + while (low <= high) { + union xfs_btree_key key; + union xfs_btree_key *kp; + + XFS_BTREE_STATS_INC(cur, compare); + + /* keyno is average of low and high. */ + keyno = (low + high) >> 1; + + /* Get current search key */ + kp = xfs_lookup_get_search_key(cur, level, + keyno, block, &key); + + /* + * Compute difference to get next direction: + * - less than, move right + * - greater than, move left + * - equal, we're done + */ + diff = cur->bc_ops->key_diff(cur, kp); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = xfs_btree_ptr_addr(cur, keyno, block); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, pp, 0, level); + if (error) + goto error0; +#endif + cur->bc_ptrs[level] = keyno; + } + } + + /* Done with the search. See if we need to adjust the results. */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (dir == XFS_LOOKUP_GE && + keyno > xfs_btree_get_numrecs(block) && + !xfs_btree_ptr_is_null(cur, &ptr)) { + int i; + + cur->bc_ptrs[0] = keyno; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + + /* Return if we succeeded or not. */ + if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) + *stat = 0; + else if (dir != XFS_LOOKUP_EQ || diff == 0) + *stat = 1; + else + *stat = 0; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int +xfs_btree_updkey( + struct xfs_btree_cur *cur, + union xfs_btree_key *keyp, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_key *kp; + int ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIK(cur, level, keyp); + + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. 
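+ * Only a block's first key is duplicated in its parent, so once the
+ * cursor sits past entry 1 at some level nothing above that level
+ * can change; that is also why xfs_btree_update() and
+ * xfs_btree_insrec() below only call this for ptr == 1.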
+ */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = xfs_btree_key_addr(cur, ptr, block); + xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Update the record referred to by cur to the value in the + * given record. This either works (return 0) or gets an + * EFSCORRUPTED error. + */ +int +xfs_btree_update( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + int error; + int ptr; + union xfs_btree_rec *rp; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGR(cur, rec); + + /* Pick up the current block. */ + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + goto error0; +#endif + /* Get the address of the rec to be updated. */ + ptr = cur->bc_ptrs[0]; + rp = xfs_btree_rec_addr(cur, ptr, block); + + /* Fill in the new contents and log them. */ + xfs_btree_copy_recs(cur, rp, rec, 1); + xfs_btree_log_recs(cur, bp, ptr, ptr); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, 0)) { + cur->bc_ops->update_lastrec(cur, block, rec, + ptr, LASTREC_UPDATE); + } + + /* Updating first rec in leaf. Pass new key value up to our parent. */ + if (ptr == 1) { + union xfs_btree_key key; + + cur->bc_ops->init_key_from_rec(&key, rec); + error = xfs_btree_updkey(cur, &key, 1); + if (error) + goto error0; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_lshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs; /* left record count */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + int rrecs; /* right record count */ + union xfs_btree_ptr lptr; /* left btree pointer */ + union xfs_btree_key *rkp = NULL; /* right btree key */ + union xfs_btree_ptr *rpp = NULL; /* right address pointer */ + union xfs_btree_rec *rrp = NULL; /* right record pointer */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) + goto out0; + + /* Set up variables for this block as "right". */ + right = xfs_btree_get_block(cur, level, &rbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, right, level, rbp); + if (error) + goto error0; +#endif + + /* If we've got no left sibling then we can't shift an entry left. */ + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &lptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. 
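+ * That is, when bc_ptrs[level] is 1 the entry under the cursor is
+ * the one that would migrate into the left sibling and the cursor
+ * would have to follow it into another block; callers such as
+ * xfs_btree_make_block_unfull() simply fall back to splitting
+ * instead.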
+ */ + if (cur->bc_ptrs[level] <= 1) + goto out0; + + /* Set up the left neighbor as "left". */ + error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + lrecs = xfs_btree_get_numrecs(left); + if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + rrecs = xfs_btree_get_numrecs(right); + + /* + * We add one entry to the left side and remove one for the right side. + * Account for it here, the changes will be updated on disk and logged + * later. + */ + lrecs++; + rrecs--; + + XFS_BTREE_STATS_INC(cur, lshift); + XFS_BTREE_STATS_ADD(cur, moves, 1); + + /* + * If non-leaf, copy a key and a ptr to the left block. + * Log the changes to the left block. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, rpp, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); + xfs_btree_copy_ptrs(cur, lpp, rpp, 1); + + xfs_btree_log_keys(cur, lbp, lrecs, lrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->keys_inorder(cur, + xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, 1); + xfs_btree_log_recs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->recs_inorder(cur, + xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); + } + + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Slide the contents of right down one entry. + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ +#ifdef DEBUG + int i; /* loop index */ + + for (i = 0; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + if (error) + goto error0; + } +#endif + xfs_btree_shift_keys(cur, + xfs_btree_key_addr(cur, 2, right), + -1, rrecs); + xfs_btree_shift_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, right), + -1, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + } else { + /* It's a leaf. operate on records */ + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, 2, right), + -1, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, right)); + rkp = &key; + } + + /* Update the parent key values of right. */ + error = xfs_btree_updkey(cur, rkp, level + 1); + if (error) + goto error0; + + /* Slide the cursor value left one. */ + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. 
+ */ +STATIC int /* error */ +xfs_btree_rshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + union xfs_btree_ptr rptr; /* right block pointer */ + union xfs_btree_key *rkp; /* right btree key */ + int rrecs; /* right record count */ + int lrecs; /* left record count */ + int error; /* error return value */ + int i; /* loop counter */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) + goto out0; + + /* Set up variables for this block as "left". */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + /* If we've got no right sibling then we can't shift an entry right. */ + xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + lrecs = xfs_btree_get_numrecs(left); + if (cur->bc_ptrs[level] >= lrecs) + goto out0; + + /* Set up the right neighbor as "right". */ + error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + rrecs = xfs_btree_get_numrecs(right); + if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, rshift); + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + union xfs_btree_ptr *rpp; + + lkp = xfs_btree_key_addr(cur, lrecs, left); + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = rrecs - 1; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_shift_keys(cur, rkp, 1, rrecs); + xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, lpp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, and log it. */ + xfs_btree_copy_keys(cur, rkp, lkp, 1); + xfs_btree_copy_ptrs(cur, rpp, lpp, 1); + + xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); + + ASSERT(cur->bc_ops->keys_inorder(cur, rkp, + xfs_btree_key_addr(cur, 2, right))); + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *lrp; + union xfs_btree_rec *rrp; + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_shift_recs(cur, rrp, 1, rrecs); + + /* Now put the new data in, and log it. 
*/ + xfs_btree_copy_recs(cur, rrp, lrp, 1); + xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); + + cur->bc_ops->init_key_from_rec(&key, rrp); + rkp = &key; + + ASSERT(cur->bc_ops->recs_inorder(cur, rrp, + xfs_btree_rec_addr(cur, 2, right))); + } + + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + xfs_btree_set_numrecs(left, --lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, ++rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error1; + + error = xfs_btree_updkey(tcur, rkp, level + 1); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and the key to its first + * record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rrptr; /* right-right sibling ptr */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + int lrecs; + int rrecs; + int src_index; + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); + + XFS_BTREE_STATS_INC(cur, split); + + /* Set up left block (current one). */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block as "right". */ + error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* Fill in the btree header for the new right block. */ + xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + + /* + * Split the entries between the old and the new block evenly. + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. 
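+ *
+ * For example (hypothetical counts): with lrecs == 7, rrecs starts
+ * at 3; if the cursor points into the first half
+ * (bc_ptrs[level] <= 4) rrecs is bumped to 4, leaving only 3
+ * records in the block that is about to receive the new entry, so
+ * both blocks hold 4 records once the pending insert completes.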
+ */ + lrecs = xfs_btree_get_numrecs(left); + rrecs = lrecs / 2; + if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + rrecs++; + src_index = (lrecs - rrecs + 1); + + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Copy btree block entries from the left block over to the + * new block, the right. Update the right block and log the + * changes. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, src_index, left); + lpp = xfs_btree_ptr_addr(cur, src_index, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = src_index; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_copy_keys(cur, rkp, lkp, rrecs); + xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + + /* Grab the keys to the entries moved to the right block */ + xfs_btree_copy_keys(cur, key, rkp, 1); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, src_index, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, rrp, lrp, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + cur->bc_ops->init_key_from_rec(key, + xfs_btree_rec_addr(cur, 1, right)); + } + + + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + + xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (!xfs_btree_ptr_is_null(cur, &rrptr)) { + error = xfs_btree_read_buf_block(cur, &rrptr, level, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > lrecs + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= lrecs; + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. 
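+ * The caller still has to insert the new block's key at the parent
+ * level; it does so through this duplicated cursor, whose parent
+ * slot is bumped by one below so that it points just past the
+ * original block's entry.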
+ */ + if (level + 1 < cur->bc_nlevels) { + error = xfs_btree_dup_cursor(cur, curp); + if (error) + goto error0; + (*curp)->bc_ptrs[level + 1]++; + } + *ptrp = rptr; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Copy the old inode root contents into a real block and make the + * broot point to it. + */ +int /* error */ +xfs_btree_new_iroot( + struct xfs_btree_cur *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + struct xfs_buf *cbp; /* buffer for cblock */ + struct xfs_btree_block *block; /* btree block */ + struct xfs_btree_block *cblock; /* child btree block */ + union xfs_btree_key *ckp; /* child key pointer */ + union xfs_btree_ptr *cpp; /* child ptr pointer */ + union xfs_btree_key *kp; /* pointer to btree key */ + union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr nptr; /* new block addr */ + int level; /* btree level */ + int error; /* error return code */ +#ifdef DEBUG + int i; /* loop counter */ +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + level = cur->bc_nlevels - 1; + + block = xfs_btree_get_iroot(cur); + pp = xfs_btree_ptr_addr(cur, 1, block); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + } + XFS_BTREE_STATS_INC(cur, alloc); + + /* Copy the root into a real block. */ + error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); + if (error) + goto error0; + + memcpy(cblock, block, xfs_btree_block_len(cur)); + + be16_add_cpu(&block->bb_level, 1); + xfs_btree_set_numrecs(block, 1); + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); + + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, &nptr, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_ptrs(cur, pp, &nptr, 1); + + xfs_iroot_realloc(cur->bc_private.b.ip, + 1 - xfs_btree_get_numrecs(cblock), + cur->bc_private.b.whichfork); + + xfs_btree_setbuf(cur, level, cbp); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + + *logflags |= + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + *stat = 1; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Allocate a new root block, fill it in. 
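+ * This is the external-root counterpart of xfs_btree_new_iroot()
+ * above: a freshly allocated block becomes the new root, and the
+ * two halves of the just-split old root become its only children.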
+ */ +STATIC int /* error */ +xfs_btree_new_root( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* one half of the old root block */ + struct xfs_buf *bp; /* buffer containing block */ + int error; /* error return value */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *nbp; /* new (root) buffer */ + struct xfs_btree_block *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rptr; + union xfs_btree_ptr lptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + /* initialise our start point from the cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &rptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block. */ + error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + if (error) + goto error0; + + /* Set the root in the holding structure increasing the level by 1. */ + cur->bc_ops->set_root(cur, &lptr, 1); + + /* + * At the previous root level there are now two blocks: the old root, + * and the new block generated when it was split. We don't know which + * one the cursor is pointing at, so we set up variables "left" and + * "right" for each case. + */ + block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); + if (error) + goto error0; +#endif + + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* Our block is left, pick up the right block. */ + lbp = bp; + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + left = block; + error = xfs_btree_read_buf_block(cur, &rptr, + cur->bc_nlevels - 1, 0, &right, &rbp); + if (error) + goto error0; + bp = rbp; + nptr = 1; + } else { + /* Our block is right, pick up the left block. */ + rbp = bp; + xfs_btree_buf_to_ptr(cur, rbp, &rptr); + right = block; + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + error = xfs_btree_read_buf_block(cur, &lptr, + cur->bc_nlevels - 1, 0, &left, &lbp); + if (error) + goto error0; + bp = lbp; + nptr = 2; + } + /* Fill in the new block's btree header and log it. */ + xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); + ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && + !xfs_btree_ptr_is_null(cur, &rptr)); + + /* Fill in the key data in the new root. */ + if (xfs_btree_get_level(left) > 0) { + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 1, new), + xfs_btree_key_addr(cur, 1, left), 1); + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 2, new), + xfs_btree_key_addr(cur, 1, right), 1); + } else { + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 1, new), + xfs_btree_rec_addr(cur, 1, left)); + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 2, new), + xfs_btree_rec_addr(cur, 1, right)); + } + xfs_btree_log_keys(cur, nbp, 1, 2); + + /* Fill in the pointer data in the new root. 
*/ + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); + xfs_btree_log_ptrs(cur, nbp, 1, 2); + + /* Fix up the cursor. */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; +} + +STATIC int +xfs_btree_make_block_unfull( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* btree level */ + int numrecs,/* # of recs in block */ + int *oindex,/* old tree index */ + int *index, /* new tree index */ + union xfs_btree_ptr *nptr, /* new btree ptr */ + struct xfs_btree_cur **ncur, /* new btree cursor */ + union xfs_btree_rec *nrec, /* new record */ + int *stat) +{ + union xfs_btree_key key; /* new btree key value */ + int error = 0; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_inode *ip = cur->bc_private.b.ip; + + if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { + /* A root block that can be made bigger. */ + + xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + } else { + /* A root block that needs replacing */ + int logflags = 0; + + error = xfs_btree_new_iroot(cur, &logflags, stat); + if (error || *stat == 0) + return error; + + xfs_trans_log_inode(cur->bc_tp, ip, logflags); + } + + return 0; + } + + /* First, try shifting an entry to the right neighbor. */ + error = xfs_btree_rshift(cur, level, stat); + if (error || *stat) + return error; + + /* Next, try shifting an entry to the left neighbor. */ + error = xfs_btree_lshift(cur, level, stat); + if (error) + return error; + + if (*stat) { + *oindex = *index = cur->bc_ptrs[level]; + return 0; + } + + /* + * Next, try splitting the current block in half. + * + * If this works we have to re-set our variables because we + * could be in a different block now. + */ + error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + if (error || *stat == 0) + return error; + + + *index = cur->bc_ptrs[level]; + cur->bc_ops->init_rec_from_key(&key, nrec); + return 0; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int +xfs_btree_insrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level to insert record at */ + union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ + union xfs_btree_rec *recp, /* i/o: record data inserted */ + struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_key key; /* btree key */ + union xfs_btree_ptr nptr; /* new block ptr */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + union xfs_btree_rec nrec; /* new record count */ + int optr; /* old key/record index */ + int ptr; /* key/record index */ + int numrecs;/* number of records */ + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + + ncur = NULL; + + /* + * If we have an external root pointer, and we've made it to the + * root level, allocate a new root block and we're done. 
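+ * (level can only reach bc_nlevels here when a split of the old
+ * top level handed xfs_btree_insert() a new block pointer to push
+ * up; inode-rooted trees never take this path, they grow through
+ * xfs_btree_new_iroot() from xfs_btree_make_block_unfull() instead.)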
+ */ + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level >= cur->bc_nlevels)) { + error = xfs_btree_new_root(cur, stat); + xfs_btree_set_ptr_null(cur, ptrp); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return error; + } + + /* If we're off the left edge, return failure. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Make a key out of the record data to be inserted, and save it. */ + cur->bc_ops->init_key_from_rec(&key, recp); + + optr = ptr; + + XFS_BTREE_STATS_INC(cur, insrec); + + /* Get pointers to the btree buffer and block. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; + + /* Check that the new entry is being inserted in the right place. */ + if (ptr <= numrecs) { + if (level == 0) { + ASSERT(cur->bc_ops->recs_inorder(cur, recp, + xfs_btree_rec_addr(cur, ptr, block))); + } else { + ASSERT(cur->bc_ops->keys_inorder(cur, &key, + xfs_btree_key_addr(cur, ptr, block))); + } + } +#endif + + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + xfs_btree_set_ptr_null(cur, &nptr); + if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { + error = xfs_btree_make_block_unfull(cur, level, numrecs, + &optr, &ptr, &nptr, &ncur, &nrec, stat); + if (error || *stat == 0) + goto error0; + } + + /* + * The current block may have changed if the block was + * previously full and we have just made space in it. + */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + return error; +#endif + + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); + + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + + kp = xfs_btree_key_addr(cur, ptr, block); + pp = xfs_btree_ptr_addr(cur, ptr, block); + +#ifdef DEBUG + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + return error; + } +#endif + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); + xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, ptrp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_ptrs(cur, pp, ptrp, 1); + numrecs++; + xfs_btree_set_numrecs(block, numrecs); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs); + xfs_btree_log_keys(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->keys_inorder(cur, kp, + xfs_btree_key_addr(cur, ptr + 1, block))); + } +#endif + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *rp; + + rp = xfs_btree_rec_addr(cur, ptr, block); + + xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_set_numrecs(block, ++numrecs); + xfs_btree_log_recs(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->recs_inorder(cur, rp, + xfs_btree_rec_addr(cur, ptr + 1, block))); + } +#endif + } + + /* Log the new number of records in the btree header. 
*/ + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* If we inserted at the start of a block, update the parents' keys. */ + if (optr == 1) { + error = xfs_btree_updkey(cur, &key, level + 1); + if (error) + goto error0; + } + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, recp, + ptr, LASTREC_INSREC); + } + + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *ptrp = nptr; + if (!xfs_btree_ptr_is_null(cur, &nptr)) { + *recp = nrec; + *curp = ncur; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Insert the record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. + */ +int +xfs_btree_insert( + struct xfs_btree_cur *cur, + int *stat) +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + union xfs_btree_ptr nptr; /* new block number (split result) */ + struct xfs_btree_cur *ncur; /* new cursor (split result) */ + struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_rec rec; /* record to insert */ + + level = 0; + ncur = NULL; + pcur = cur; + + xfs_btree_set_ptr_null(cur, &nptr); + cur->bc_ops->init_rec_from_cur(cur, &rec); + + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nptr into this level of the tree. + * Note if we fail, nptr will be null. + */ + error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + if (error) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + goto error0; + } + + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + level++; + + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && + (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { + /* Save the state from the cursor before we trash it */ + if (cur->bc_ops->update_cursor) + cur->bc_ops->update_cursor(pcur, cur); + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* If we got a new cursor, switch to it. */ + if (ncur) { + pcur = ncur; + ncur = NULL; + } + } while (!xfs_btree_ptr_is_null(cur, &nptr)); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Try to merge a non-leaf block back into the inode root. + * + * Note: the killroot names comes from the fact that we're effectively + * killing the old root block. But because we can't just delete the + * inode we have to copy the single block it was pointing to into the + * inode. 
+ */ +STATIC int +xfs_btree_kill_iroot( + struct xfs_btree_cur *cur) +{ + int whichfork = cur->bc_private.b.whichfork; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_block *block; + struct xfs_btree_block *cblock; + union xfs_btree_key *kp; + union xfs_btree_key *ckp; + union xfs_btree_ptr *pp; + union xfs_btree_ptr *cpp; + struct xfs_buf *cbp; + int level; + int index; + int numrecs; +#ifdef DEBUG + union xfs_btree_ptr ptr; + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_nlevels > 1); + + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + level = cur->bc_nlevels - 1; + if (level == 1) + goto out0; + + /* + * Give up if the root has multiple children. + */ + block = xfs_btree_get_iroot(cur); + if (xfs_btree_get_numrecs(block) != 1) + goto out0; + + cblock = xfs_btree_get_block(cur, level - 1, &cbp); + numrecs = xfs_btree_get_numrecs(cblock); + + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, killroot); + +#ifdef DEBUG + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); +#endif + + index = numrecs - cur->bc_ops->get_maxrecs(cur, level); + if (index) { + xfs_iroot_realloc(cur->bc_private.b.ip, index, + cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + + be16_add_cpu(&block->bb_numrecs, index); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < numrecs; i++) { + int error; + + error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + } +#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + cur->bc_ops->free_block(cur, cbp); + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level - 1] = NULL; + be16_add_cpu(&block->bb_level, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Kill the current root node, and replace it with it's only child node. + */ +STATIC int +xfs_btree_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) +{ + int error; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); + + /* + * Update the root pointer, decreasing the level by 1 and then + * free the old root. 
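+ * (The -1 passed to set_root below is the change in tree height,
+ * mirroring the +1 used when xfs_btree_new_root() grows the tree.)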
+ */ + cur->bc_ops->set_root(cur, newroot, -1); + + error = cur->bc_ops->free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level] = NULL; + cur->bc_ra[level] = 0; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +STATIC int +xfs_btree_dec_cursor( + struct xfs_btree_cur *cur, + int level, + int *stat) +{ + int error; + int i; + + if (level > 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + return error; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +/* + * Single level of the btree record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_btree_delrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + struct xfs_btree_block *block; /* btree block */ + union xfs_btree_ptr cptr; /* current block ptr */ + struct xfs_buf *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop counter */ + union xfs_btree_key key; /* storage for keyp */ + union xfs_btree_key *keyp = &key; /* passed to the next level */ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs = 0; /* left record count */ + int ptr; /* key/record index */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + int rrecs = 0; /* right record count */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + tcur = NULL; + + /* Get the index of the entry being deleted, check for nothing there. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Get the buffer & block containing the record or key/ptr. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we're off the end of the block. */ + if (ptr > numrecs) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + XFS_BTREE_STATS_INC(cur, delrec); + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); + + /* Excise the entries being deleted. */ + if (level > 0) { + /* It's a nonleaf. 
operate on keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + + lkp = xfs_btree_key_addr(cur, ptr + 1, block); + lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); + +#ifdef DEBUG + for (i = 0; i < numrecs - ptr; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + if (ptr < numrecs) { + xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); + xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); + xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need to pass a + * key up to the next level (updkey). + */ + if (ptr == 1) + keyp = xfs_btree_key_addr(cur, 1, block); + } else { + /* It's a leaf. operate on records */ + if (ptr < numrecs) { + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, ptr + 1, block), + -1, numrecs - ptr); + xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, block)); + keyp = &key; + } + } + + /* + * Decrement and log the number of entries in the block. + */ + xfs_btree_set_numrecs(block, --numrecs); + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, NULL, + ptr, LASTREC_DELREC); + } + + /* + * We're at the root level. First, shrink the root block in-memory. + * Try to get rid of the next level down. If we can't then there's + * nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } + + /* + * If this is the root level, and there's only one entry left, + * and it's NOT the leaf level, then we can get rid of this + * level. + */ + if (numrecs == 1 && level > 0) { + union xfs_btree_ptr *pp; + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + pp = xfs_btree_ptr_addr(cur, 1, block); + error = xfs_btree_kill_root(cur, bp, level, pp); + if (error) + goto error0; + } else if (level > 0) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + } + *stat = 1; + return 0; + } + + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1) { + error = xfs_btree_updkey(cur, keyp, level + 1); + if (error) + goto error0; + } + + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. 
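+	 * A sibling that holds more than the minimum record count can donate
+	 * a single entry via lshift/rshift below; only if neither sibling can
+	 * spare one do we fall back to joining two blocks into one.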
+ */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. + */ + if (xfs_btree_ptr_is_null(cur, &rptr) && + xfs_btree_ptr_is_null(cur, &lptr) && + level == cur->bc_nlevels - 2) { + error = xfs_btree_kill_iroot(cur); + if (!error) + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || + !xfs_btree_ptr_is_null(cur, &lptr)); + + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + right = xfs_btree_get_block(tcur, level, &rbp); +#ifdef DEBUG + error = xfs_btree_check_block(tcur, right, level, rbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); + + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (xfs_btree_get_numrecs(right) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_lshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = xfs_btree_get_numrecs(right); + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + left = xfs_btree_get_block(tcur, level, &lbp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. 
*/ + xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); + + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (xfs_btree_get_numrecs(left) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_rshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = xfs_btree_get_numrecs(left); + } + + /* Delete the temp cursor, we're done with it. */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + /* If here, we need to do a join to keep the tree balanced. */ + ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); + + if (!xfs_btree_ptr_is_null(cur, &lptr) && + lrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rptr = cptr; + right = block; + rbp = bp; + error = xfs_btree_read_buf_block(cur, &lptr, level, + 0, &left, &lbp); + if (error) + goto error0; + + /* + * If that won't work, see if we can join with the right neighbor block. + */ + } else if (!xfs_btree_ptr_is_null(cur, &rptr) && + rrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lptr = cptr; + left = block; + lbp = bp; + error = xfs_btree_read_buf_block(cur, &rptr, level, + 0, &right, &rbp); + if (error) + goto error0; + + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + } else { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + rrecs = xfs_btree_get_numrecs(right); + lrecs = xfs_btree_get_numrecs(left); + + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs + 1, left); + lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + for (i = 1; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); + xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); + + xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* It's a leaf. Move records. 
*/ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, rrecs); + xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + + XFS_BTREE_STATS_INC(cur, join); + + /* + * Fix up the number of records and right block pointer in the + * surviving block, and log it. + */ + xfs_btree_set_numrecs(left, lrecs + rrecs); + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* If there is a right sibling, point it to the remaining block. */ + xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &cptr)) { + error = xfs_btree_read_buf_block(cur, &cptr, level, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + + /* Free the deleted block. */ + error = cur->bc_ops->free_block(cur, rbp); + if (error) + goto error0; + XFS_BTREE_STATS_INC(cur, free); + + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + (level + 1 < cur->bc_nlevels)) { + error = xfs_btree_increment(cur, level + 1, &i); + if (error) + goto error0; + } + + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + /* Return value means the next level up has something to do. */ + *stat = 2; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_btree_delete( + struct xfs_btree_cur *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int level; + int i; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* + * Go up the tree, starting at leaf level. + * + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + error = xfs_btree_delrec(cur, level, &i); + if (error) + goto error0; + } + + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + goto error0; + break; + } + } + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Get the data from the pointed-to record. 
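+ * The cursor must refer to a leaf entry (level 0); if bc_ptrs[0] is off
+ * either end of the block, *stat is set to 0 and no record is returned.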
+ */ +int /* error */ +xfs_btree_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_rec **recp, /* output: btree record */ + int *stat) /* output: success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer pointer */ + int ptr; /* record number */ +#ifdef DEBUG + int error; /* error return value */ +#endif + + ptr = cur->bc_ptrs[0]; + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + return error; +#endif + + /* + * Off the right end or left end, return failure. + */ + if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { + *stat = 0; + return 0; + } + + /* + * Point to the record and extract its data. + */ + *recp = xfs_btree_rec_addr(cur, ptr, block); + *stat = 1; + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h new file mode 100644 index 00000000..82fafc66 --- /dev/null +++ b/fs/xfs/xfs_btree.h @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_H__ +#define __XFS_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_btree_cur_zone; + +/* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) + +/* + * Generic btree header. + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but + * the pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use + * the size macros below. Never use sizeof(xfs_btree_block). + */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + + +/* + * Generic key, ptr and record wrapper structures. + * + * These are disk format structures, and are converted where necessary + * by the btree specific code that needs to interpret them. 
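+ * They are stored big-endian on disk, so users go through the endian
+ * conversion helpers rather than reading the raw fields directly.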
+ */ +union xfs_btree_ptr { + __be32 s; /* short form ptr */ + __be64 l; /* long form ptr */ +}; + +union xfs_btree_key { + xfs_bmbt_key_t bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + xfs_inobt_key_t inobt; +}; + +union xfs_btree_rec { + xfs_bmbt_rec_t bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + xfs_alloc_rec_t alloc; + xfs_inobt_rec_t inobt; +}; + +/* + * For logging record fields. + */ +#define XFS_BB_MAGIC 0x01 +#define XFS_BB_LEVEL 0x02 +#define XFS_BB_NUMRECS 0x04 +#define XFS_BB_LEFTSIB 0x08 +#define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_NUM_BITS 5 +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) + +/* + * Magic numbers for btree blocks. + */ +extern const __uint32_t xfs_magics[]; + +/* + * Generic stats interface + */ +#define __XFS_BTREE_STATS_INC(type, stat) \ + XFS_STATS_INC(xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define __XFS_BTREE_STATS_ADD(type, stat, val) \ + XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ + +struct xfs_btree_ops { + /* size of the key and record structures */ + size_t key_len; + size_t rec_len; + + /* cursor operations */ + struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); + void (*update_cursor)(struct xfs_btree_cur *src, + struct xfs_btree_cur *dst); + + /* update btree root pointer */ + void (*set_root)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, int level_change); + + /* block allocation / freeing */ + int (*alloc_block)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int length, int *stat); + int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); + + /* update last record information */ + void (*update_lastrec)(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, int reason); + + /* records in block/level */ + int (*get_minrecs)(struct xfs_btree_cur *cur, int level); + int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); + + /* records on disk. Matter for the root in inode case. 
*/ + int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); + + /* init values of btree structures */ + void (*init_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_key)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec); + void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); + + /* difference between key value and cursor value */ + __int64_t (*key_diff)(struct xfs_btree_cur *cur, + union xfs_btree_key *key); + +#ifdef DEBUG + /* check that k1 is lower than k2 */ + int (*keys_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2); + + /* check that r1 is lower than r2 */ + int (*recs_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2); +#endif + + /* btree tracing */ +#ifdef XFS_BTREE_TRACE + void (*trace_enter)(struct xfs_btree_cur *, const char *, + char *, int, int, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t); + void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *, + __uint64_t *, __uint64_t *); + void (*trace_key)(struct xfs_btree_cur *, + union xfs_btree_key *, __uint64_t *, + __uint64_t *); + void (*trace_record)(struct xfs_btree_cur *, + union xfs_btree_rec *, __uint64_t *, + __uint64_t *, __uint64_t *); +#endif +}; + +/* + * Reasons for the update_lastrec method to be called. + */ +#define LASTREC_UPDATE 0 +#define LASTREC_INSREC 1 +#define LASTREC_DELREC 2 + + +/* + * Btree cursor structure. + * This collects all information needed by the btree code in one place. + */ +typedef struct xfs_btree_cur +{ + struct xfs_trans *bc_tp; /* transaction we're in, if any */ + struct xfs_mount *bc_mp; /* file system mount struct */ + const struct xfs_btree_ops *bc_ops; + uint bc_flags; /* btree features - below */ + union { + xfs_alloc_rec_incore_t a; + xfs_bmbt_irec_t b; + xfs_inobt_rec_incore_t i; + } bc_rec; /* current insert/search record value */ + struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ + int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ + __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ +#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ + __uint8_t bc_nlevels; /* number of levels in the tree */ + __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ + union { + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } a; + struct { /* needed for BMAP */ + struct xfs_inode *ip; /* pointer to our inode */ + struct xfs_bmap_free *flist; /* list to free after */ + xfs_fsblock_t firstblock; /* 1st blk allocated */ + int allocated; /* count of alloced */ + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ + } b; + } bc_private; /* per-btree type data */ +} xfs_btree_cur_t; + +/* cursor flags */ +#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ +#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ +#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ + + +#define XFS_BTREE_NOERROR 0 +#define 
XFS_BTREE_ERROR 1 + +/* + * Convert from buffer to btree block header. + */ +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) + + +/* + * Check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_dfsbno_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error); /* del because of error */ + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur);/* output cursor */ + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +struct xfs_buf * /* buffer for fsbno */ +xfs_btree_get_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +struct xfs_buf * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to check */ + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets,/* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last); /* output: last byte offset */ + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval);/* ref count value for buffer */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +void /* error */ +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. 
+ */ +void /* error */ +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + + +/* + * Common btree core entry points. + */ +int xfs_btree_increment(struct xfs_btree_cur *, int, int *); +int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); +int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); +int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); +int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); +int xfs_btree_insert(struct xfs_btree_cur *, int *); +int xfs_btree_delete(struct xfs_btree_cur *, int *); +int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); + +/* + * Internal btree helpers also used by xfs_bmap.c. + */ +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); + +/* + * Helpers. + */ +static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_numrecs); +} + +static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, + __uint16_t numrecs) +{ + block->bb_numrecs = cpu_to_be16(numrecs); +} + +static inline int xfs_btree_get_level(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_level); +} + + +/* + * Min and max functions for extlen, agblock, fileoff, and filblks types. + */ +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) + +#define XFS_FSB_SANITY_CHECK(mp,fsb) \ + (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) + +#endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c new file mode 100644 index 00000000..44ff942a --- /dev/null +++ b/fs/xfs/xfs_btree_trace.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" + +STATIC void +xfs_btree_trace_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr ptr, + __psunsigned_t *high, + __psunsigned_t *low) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + __u64 val = be64_to_cpu(ptr.l); + *high = val >> 32; + *low = (int)val; + } else { + *high = 0; + *low = be32_to_cpu(ptr.s); + } +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. + */ +void +xfs_btree_trace_argbi( + const char *func, + struct xfs_btree_cur *cur, + struct xfs_buf *b, + int i, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI, + line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 2 integer args. + */ +void +xfs_btree_trace_argbii( + const char *func, + struct xfs_btree_cur *cur, + struct xfs_buf *b, + int i0, + int i1, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII, + line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for 3 block-length args + * and an integer arg. + */ +void +xfs_btree_trace_argfffi( + const char *func, + struct xfs_btree_cur *cur, + xfs_dfiloff_t o, + xfs_dfsbno_t b, + xfs_dfilblks_t i, + int j, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI, + line, + o >> 32, (int)o, + b >> 32, (int)b, + i >> 32, (int)i, + (int)j, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for one integer arg. + */ +void +xfs_btree_trace_argi( + const char *func, + struct xfs_btree_cur *cur, + int i, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI, + line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, key. + */ +void +xfs_btree_trace_argipk( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_ptr ptr, + union xfs_btree_key *key, + int line) +{ + __psunsigned_t high, low; + __uint64_t l0, l1; + + xfs_btree_trace_ptr(cur, ptr, &high, &low); + cur->bc_ops->trace_key(cur, key, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK, + line, i, high, low, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, rec. + */ +void +xfs_btree_trace_argipr( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_ptr ptr, + union xfs_btree_rec *rec, + int line) +{ + __psunsigned_t high, low; + __uint64_t l0, l1, l2; + + xfs_btree_trace_ptr(cur, ptr, &high, &low); + cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR, + line, i, + high, low, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + l2 >> 32, (int)l2, + 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, key. 
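+ * The key is flattened into two 64-bit words by the per-btree trace_key
+ * callback before being packed into the trace entry.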
+ */ +void +xfs_btree_trace_argik( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_key *key, + int line) +{ + __uint64_t l0, l1; + + cur->bc_ops->trace_key(cur, key, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK, + line, i, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + 0, 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for record. + */ +void +xfs_btree_trace_argr( + const char *func, + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int line) +{ + __uint64_t l0, l1, l2; + + cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR, + line, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + l2 >> 32, (int)l2, + 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for the cursor/operation. + */ +void +xfs_btree_trace_cursor( + const char *func, + struct xfs_btree_cur *cur, + int type, + int line) +{ + __uint32_t s0; + __uint64_t l0, l1; + char *s; + + switch (type) { + case XBT_ARGS: + s = "args"; + break; + case XBT_ENTRY: + s = "entry"; + break; + case XBT_ERROR: + s = "error"; + break; + case XBT_EXIT: + s = "exit"; + break; + default: + s = "unknown"; + break; + } + + cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line, + s0, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + (__psunsigned_t)cur->bc_bufs[0], + (__psunsigned_t)cur->bc_bufs[1], + (__psunsigned_t)cur->bc_bufs[2], + (__psunsigned_t)cur->bc_bufs[3], + (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], + (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); +} diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h new file mode 100644 index 00000000..2d8a3098 --- /dev/null +++ b/fs/xfs/xfs_btree_trace.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_TRACE_H__ +#define __XFS_BTREE_TRACE_H__ + +struct xfs_btree_cur; +struct xfs_buf; + + +/* + * Trace hooks. + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ + +#ifdef XFS_BTREE_TRACE + +/* + * Trace buffer entry types. + */ +#define XFS_BTREE_KTRACE_ARGBI 1 +#define XFS_BTREE_KTRACE_ARGBII 2 +#define XFS_BTREE_KTRACE_ARGFFFI 3 +#define XFS_BTREE_KTRACE_ARGI 4 +#define XFS_BTREE_KTRACE_ARGIPK 5 +#define XFS_BTREE_KTRACE_ARGIPR 6 +#define XFS_BTREE_KTRACE_ARGIK 7 +#define XFS_BTREE_KTRACE_ARGR 8 +#define XFS_BTREE_KTRACE_CUR 9 + +/* + * Sub-types for cursor traces. 
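+ * These select the label ("args", "entry", "error", "exit") recorded by
+ * xfs_btree_trace_cursor().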
+ */ +#define XBT_ARGS 0 +#define XBT_ENTRY 1 +#define XBT_ERROR 2 +#define XBT_EXIT 3 + +void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *, + struct xfs_buf *, int, int); +void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, + struct xfs_buf *, int, int, int); +void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); +void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, + union xfs_btree_ptr, union xfs_btree_key *, int); +void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int, + union xfs_btree_ptr, union xfs_btree_rec *, int); +void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int, + union xfs_btree_key *, int); +void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *, + union xfs_btree_rec *, int); +void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); + +#define XFS_BTREE_TRACE_ARGBI(c, b, i) \ + xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ + xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) +#define XFS_BTREE_TRACE_ARGI(c, i) \ + xfs_btree_trace_argi(__func__, c, i, __LINE__) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ + xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \ + xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) \ + xfs_btree_trace_argik(__func__, c, i, k, __LINE__) +#define XFS_BTREE_TRACE_ARGR(c, r) \ + xfs_btree_trace_argr(__func__, c, r, __LINE__) +#define XFS_BTREE_TRACE_CURSOR(c, t) \ + xfs_btree_trace_cursor(__func__, c, t, __LINE__) +#else +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) +#endif /* XFS_BTREE_TRACE */ + +#endif /* __XFS_BTREE_TRACE_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c new file mode 100644 index 00000000..7888a756 --- /dev/null +++ b/fs/xfs/xfs_buf_item.c @@ -0,0 +1,1064 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_buf_item_zone; + +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_buf_log_item, bli_item); +} + + +#ifdef XFS_TRANS_DEBUG +/* + * This function uses an alternate strategy for tracking the bytes + * that the user requests to be logged. This can then be used + * in conjunction with the bli_orig array in the buf log item to + * catch bugs in our callers' code. + * + * We also double check the bits set in xfs_buf_item_log using a + * simple algorithm to check that every byte is accounted for. + */ +STATIC void +xfs_buf_item_log_debug( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint x; + uint byte; + uint nbytes; + uint chunk_num; + uint word_num; + uint bit_num; + uint bit_set; + uint *wordp; + + ASSERT(bip->bli_logged != NULL); + byte = first; + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); + for (x = 0; x < nbytes; x++) { + chunk_num = byte >> XFS_BLF_SHIFT; + word_num = chunk_num >> BIT_TO_WORD_SHIFT; + bit_num = chunk_num & (NBWORD - 1); + wordp = &(bip->bli_format.blf_data_map[word_num]); + bit_set = *wordp & (1 << bit_num); + ASSERT(bit_set); + byte++; + } +} + +/* + * This function is called when we flush something into a buffer without + * logging it. This happens for things like inodes which are logged + * separately from the buffer. + */ +void +xfs_buf_item_flush_log_debug( + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + uint nbytes; + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { + return; + } + + ASSERT(bip->bli_logged != NULL); + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); +} + +/* + * This function is called to verify that our callers have logged + * all the bytes that they changed. + * + * It does this by comparing the original copy of the buffer stored in + * the buf log item's bli_orig array to the current copy of the buffer + * and ensuring that all bytes which mismatch are set in the bli_logged + * array of the buf log item. + */ +STATIC void +xfs_buf_item_log_check( + xfs_buf_log_item_t *bip) +{ + char *orig; + char *buffer; + int x; + xfs_buf_t *bp; + + ASSERT(bip->bli_orig != NULL); + ASSERT(bip->bli_logged != NULL); + + bp = bip->bli_buf; + ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(XFS_BUF_PTR(bp) != NULL); + orig = bip->bli_orig; + buffer = XFS_BUF_PTR(bp); + for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { + xfs_emerg(bp->b_mount, + "%s: bip %x buffer %x orig %x index %d", + __func__, bip, bp, orig, x); + ASSERT(0); + } + } +} +#else +#define xfs_buf_item_log_debug(x,y,z) +#define xfs_buf_item_log_check(x) +#endif + +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); + +/* + * This returns the number of log iovecs needed to log the + * given buf log item. 
+ * + * It calculates this as 1 iovec for the buf log format structure + * and 1 for each stretch of non-contiguous chunks to be logged. + * Contiguous chunks are logged in a single iovec. + * + * If the XFS_BLI_STALE flag has been set, then log nothing. + */ +STATIC uint +xfs_buf_item_size( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint nvecs; + int next_bit; + int last_bit; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + return 1; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + nvecs = 1; + last_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(last_bit != -1); + nvecs++; + while (last_bit != -1) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + last_bit + 1); + /* + * If we run out of bits, leave the loop, + * else if we find a new set of bits bump the number of vecs, + * else keep scanning the current set of bits. + */ + if (next_bit == -1) { + last_bit = -1; + } else if (next_bit != last_bit + 1) { + last_bit = next_bit; + nvecs++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { + last_bit = next_bit; + nvecs++; + } else { + last_bit++; + } + } + + trace_xfs_buf_item_size(bip); + return nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint base_size; + uint nvecs; + int first_bit; + int last_bit; + int next_bit; + uint nbits; + uint buffer_offset; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * The size of the base structure is the size of the + * declared structure plus the space for the extra words + * of the bitmap. We subtract one from the map size, because + * the first element of the bitmap is accounted for in the + * size of the base structure. + */ + base_size = + (uint)(sizeof(xfs_buf_log_format_t) + + ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); + vecp->i_addr = &bip->bli_format; + vecp->i_len = base_size; + vecp->i_type = XLOG_REG_TYPE_BFORMAT; + vecp++; + nvecs = 1; + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. 
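+	 * xfs_log_item_in_current_chkpt() below is what tells us whether the
+	 * allocation is still sitting in the running checkpoint; only once it
+	 * is past that point do we set XFS_BLF_INODE_BUF in the format flags.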
+ */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_format_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + bip->bli_format.blf_size = nvecs; + return; + } + + /* + * Fill in an iovec for each set of contiguous chunks. + */ + first_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(first_bit != -1); + last_bit = first_bit; + nbits = 1; + for (;;) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + (uint)last_bit + 1); + /* + * If we run out of bits fill in the last iovec and get + * out of the loop. + * Else if we start a new set of bits then fill in the + * iovec for the series we were looking at and start + * counting the bits in the new one. + * Else we're still in the same set of bits so just + * keep counting and scanning. + */ + if (next_bit == -1) { + buffer_offset = first_bit * XFS_BLF_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; + nvecs++; + break; + } else if (next_bit != last_bit + 1) { + buffer_offset = first_bit * XFS_BLF_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; + nvecs++; + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; +/* You would think we need to bump the nvecs here too, but we do not + * this number is used by recovery, and it gets confused by the boundary + * split here + * nvecs++; + */ + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else { + last_bit++; + nbits++; + } + } + bip->bli_format.blf_size = nvecs; + + /* + * Check to make sure everything is consistent. + */ + trace_xfs_buf_item_format(bip); + xfs_buf_item_log_check(bip); +} + +/* + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. 
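+ * The buffer's own b_pin_count is bumped alongside the bli reference; the
+ * matching decrement and waiter wake-up happen in xfs_buf_item_unpin().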
+ */ +STATIC void +xfs_buf_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + trace_xfs_buf_item_pin(bip); + + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} + +/* + * This is called to unpin the buffer associated with the buf log + * item which was previously pinned with a call to xfs_buf_item_pin(). + * + * Also drop the reference to the buf item for the current transaction. + * If the XFS_BLI_STALE flag is set and we are the last reference, + * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. + */ +STATIC void +xfs_buf_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; + int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; + + ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_buf_item_unpin(bip); + + freed = atomic_dec_and_test(&bip->bli_refcount); + + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + + if (freed && stale) { + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + + trace_xfs_buf_item_unpin_stale(bip); + + if (remove) { + /* + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the + * buffer which we no longer have a hold on. + */ + if (lip->li_desc) + xfs_trans_del_item(lip); + + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + } + + /* + * If we get called here because of an IO error, we may + * or may not have the item on the AIL. xfs_trans_ail_delete() + * will take care of that situation. + * xfs_trans_ail_delete() drops the AIL lock. + */ + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_do_callbacks(bp); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + } else { + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_buf_item_relse(bp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + } + xfs_buf_relse(bp); + } +} + +/* + * This is called to attempt to lock the buffer associated with this + * buf log item. Don't sleep on the buffer lock. If we can't get + * the lock right away, return 0. If we can get the lock, take a + * reference to the buffer. If this is a delayed write buffer that + * needs AIL help to be written back, invoke the pushbuf routine + * rather than the normal success path. 
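+ * The return value is one of XFS_ITEM_PINNED, XFS_ITEM_LOCKED,
+ * XFS_ITEM_PUSHBUF or XFS_ITEM_SUCCESS.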
+ */ +STATIC uint +xfs_buf_item_trylock( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + + if (XFS_BUF_ISPINNED(bp)) + return XFS_ITEM_PINNED; + if (!XFS_BUF_CPSEMA(bp)) + return XFS_ITEM_LOCKED; + + /* take a reference to the buffer. */ + XFS_BUF_HOLD(bp); + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + trace_xfs_buf_item_trylock(bip); + if (XFS_BUF_ISDELAYWRITE(bp)) + return XFS_ITEM_PUSHBUF; + return XFS_ITEM_SUCCESS; +} + +/* + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. + * + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. + * + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. + */ +STATIC void +xfs_buf_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + int aborted; + uint hold; + + /* Clear the buffer's association with this transaction. */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. + */ + aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; + + /* + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. + */ + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + + /* + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } + } + + trace_xfs_buf_item_unlock(bip); + + /* + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. + */ + if (xfs_bitmap_empty(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size)) + xfs_buf_item_relse(bp); + else + atomic_dec(&bip->bli_refcount); + + if (!hold) + xfs_buf_relse(bp); +} + +/* + * This is called to find out where the oldest active copy of the + * buf log item in the on disk log resides now that the last log + * write of it completed at the given lsn. + * We always re-log all the dirty data in a buffer, so usually the + * latest copy in the on disk log is the only one that matters. For + * those cases we simply return the given lsn. + * + * The one exception to this is for buffers full of newly allocated + * inodes. 
These buffers are only relogged with the XFS_BLI_INODE_BUF + * flag set, indicating that only the di_next_unlinked fields from the + * inodes in the buffers will be replayed during recovery. If the + * original newly allocated inode images have not yet been flushed + * when the buffer is so relogged, then we need to make sure that we + * keep the old images in the 'active' portion of the log. We do this + * by returning the original lsn of that transaction here rather than + * the current one. + */ +STATIC xfs_lsn_t +xfs_buf_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + trace_xfs_buf_item_committed(bip); + + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) + return lip->li_lsn; + return lsn; +} + +/* + * The buffer is locked, but is not a delayed write buffer. This happens + * if we race with IO completion and hence we don't want to try to write it + * again. Just release the buffer. + */ +STATIC void +xfs_buf_item_push( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + + trace_xfs_buf_item_push(bip); + + xfs_buf_relse(bp); +} + +/* + * The buffer is locked and is a delayed write buffer. Promote the buffer + * in the delayed write queue as the caller knows that they must invoke + * the xfsbufd to get this buffer written. We have to unlock the buffer + * to allow the xfsbufd to write it, too. + */ +STATIC bool +xfs_buf_item_pushbuf( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(XFS_BUF_ISDELAYWRITE(bp)); + + trace_xfs_buf_item_pushbuf(bip); + + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); + return true; +} + +STATIC void +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_trylock = xfs_buf_item_trylock, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_pushbuf = xfs_buf_item_pushbuf, + .iop_committing = xfs_buf_item_committing +}; + + +/* + * Allocate a new buf log item to go with the given buffer. + * Set the buffer's b_fsprivate field to point to the new + * buf log item. If there are other item's attached to the + * buffer (see xfs_buf_attach_iodone() below), then put the + * buf log item at the front. + */ +void +xfs_buf_item_init( + xfs_buf_t *bp, + xfs_mount_t *mp) +{ + xfs_log_item_t *lip; + xfs_buf_log_item_t *bip; + int chunks; + int map_size; + + /* + * Check to see if there is already a buf log item for + * this buffer. If there is, it is guaranteed to be + * the first. If we do already have one, there is + * nothing to do here so return. + */ + ASSERT(bp->b_target->bt_mount == mp); + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + return; + } + } + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces + * the buffer can be divided into. Make sure not to + * truncate any pieces. map_size is the size of the + * bitmap needed to describe the chunks of the buffer. 
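+	 * Rounding up by XFS_BLF_CHUNK - 1 before the shift below is what
+	 * keeps a partial trailing chunk from being dropped from the bitmap.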
+ */ + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); + map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); + + bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, + KM_SLEEP); + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); + bip->bli_buf = bp; + xfs_buf_hold(bp); + bip->bli_format.blf_type = XFS_LI_BUF; + bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); + bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_map_size = map_size; + +#ifdef XFS_TRANS_DEBUG + /* + * Allocate the arrays for tracking what needs to be logged + * and what our callers request to be logged. bli_orig + * holds a copy of the original, clean buffer for comparison + * against, and bli_logged keeps a 1 bit flag per byte in + * the buffer to indicate which bytes the callers have asked + * to have logged. + */ + bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); + memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); + bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); +#endif + + /* + * Put the buf item into the list of items attached to the + * buffer at the front. + */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + bip->bli_item.li_bio_list = + XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + } + XFS_BUF_SET_FSPRIVATE(bp, bip); +} + + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint first_bit; + uint last_bit; + uint bits_to_set; + uint bits_set; + uint word_num; + uint *wordp; + uint bit; + uint end_bit; + uint mask; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * Convert byte offsets to bit numbers. + */ + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; + + /* + * Calculate the total number of bits to be set. + */ + bits_to_set = last_bit - first_bit + 1; + + /* + * Get a pointer to the first word in the bitmap + * to set a bit in. + */ + word_num = first_bit >> BIT_TO_WORD_SHIFT; + wordp = &(bip->bli_format.blf_data_map[word_num]); + + /* + * Calculate the starting bit in the first word. + */ + bit = first_bit & (uint)(NBWORD - 1); + + /* + * First set any bits in the first word of our range. + * If it starts at bit 0 of the word, it will be + * set below rather than here. That is what the variable + * bit tells us. The variable bits_set tracks the number + * of bits that have been set so far. End_bit is the number + * of the last bit to be set in this word plus one. + */ + if (bit) { + end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + mask = ((1 << (end_bit - bit)) - 1) << bit; + *wordp |= mask; + wordp++; + bits_set = end_bit - bit; + } else { + bits_set = 0; + } + + /* + * Now set bits a whole word at a time that are between + * first_bit and last_bit. + */ + while ((bits_to_set - bits_set) >= NBWORD) { + *wordp |= 0xffffffff; + bits_set += NBWORD; + wordp++; + } + + /* + * Finally, set any bits left to be set in one last partial word. + */ + end_bit = bits_to_set - bits_set; + if (end_bit) { + mask = (1 << end_bit) - 1; + *wordp |= mask; + } + + xfs_buf_item_log_debug(bip, first, last); +} + + +/* + * Return 1 if the buffer has some data that has been logged (at any + * point, not just the current transaction) and 0 if not. 
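+ *
+ * Illustrative aside (not part of the original comment): a worked
+ * example of the byte-to-bit mapping in xfs_buf_item_log() above.
+ * Logging bytes 130..600 of a buffer gives
+ *
+ *	first_bit = 130 >> XFS_BLF_SHIFT = 1
+ *	last_bit  = 600 >> XFS_BLF_SHIFT = 4
+ *
+ * so bits 1 through 4 of word 0 are set, i.e. the four 128 byte
+ * chunks covering bytes 128..639 are marked dirty.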
+ */ +uint +xfs_buf_item_dirty( + xfs_buf_log_item_t *bip) +{ + return (bip->bli_flags & XFS_BLI_DIRTY); +} + +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + + kmem_zone_free(xfs_buf_item_zone, bip); +} + +/* + * This is called when the buf log item is no longer needed. It should + * free the buf log item associated with the given buffer and clear + * the buffer's pointer to the buf log item. If there are no more + * items in the list, clear the b_iodone field of the buffer (see + * xfs_buf_attach_iodone() below). + */ +void +xfs_buf_item_relse( + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); + if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && + (XFS_BUF_IODONE_FUNC(bp) != NULL)) { + XFS_BUF_CLR_IODONE_FUNC(bp); + } + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + + +/* + * Add the given log item with its callback to the list of callbacks + * to be called when the buffer's I/O completes. If it is not set + * already, set the buffer's b_iodone() routine to be + * xfs_buf_iodone_callbacks() and link the log item into the list of + * items rooted at b_fsprivate. Items are always added as the second + * entry in the list if there is a first, because the buf item code + * assumes that the buf log item is first. + */ +void +xfs_buf_attach_iodone( + xfs_buf_t *bp, + void (*cb)(xfs_buf_t *, xfs_log_item_t *), + xfs_log_item_t *lip) +{ + xfs_log_item_t *head_lip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + lip->li_cb = cb; + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + lip->li_bio_list = head_lip->li_bio_list; + head_lip->li_bio_list = lip; + } else { + XFS_BUF_SET_FSPRIVATE(bp, lip); + } + + ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || + (XFS_BUF_IODONE_FUNC(bp) == NULL)); + XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); +} + +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer. + */ +STATIC void +xfs_buf_do_callbacks( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { + XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); + ASSERT(lip->li_cb != NULL); + /* + * Clear the next pointer so we don't have any + * confusion if the item is added to another buf. + * Don't touch the log item after calling its + * callback, because it could have freed itself. + */ + lip->li_bio_list = NULL; + lip->li_cb(bp, lip); + } +} + +/* + * This is the iodone() function for buffers which have had callbacks + * attached to them by xfs_buf_attach_iodone(). It should remove each + * log item from the buffer's list and call the callback of each in turn. 
+ * When done, the buffer's fsprivate field is set to NULL and the buffer + * is unlocked with a call to iodone(). + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; + + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; + + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } + + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_alert(mp, "Device %s: metadata write error block 0x%llx", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp)); + } + lasttarg = XFS_BUF_TARGET(bp); + + /* + * If the write was asynchronous then no one will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_SET_START(bp); + } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); + return; + } + + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ + XFS_BUF_STALE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_UNDELAYWRITE(bp); + + trace_xfs_buf_error_relse(bp, _RET_IP_); + +do_callbacks: + xfs_buf_do_callbacks(bp); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_buf_ioend(bp, 0); +} + +/* + * This is the iodone() function for buffers which have been + * logged. It is called when they are eventually flushed out. + * It should remove the buf item from the AIL, and free the buf item. + * It is called by xfs_buf_iodone_callbacks() above which will take + * care of cleaning up the buffer itself. + */ +void +xfs_buf_iodone( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(BUF_ITEM(lip)->bli_buf == bp); + + xfs_buf_rele(bp); + + /* + * If we are forcibly shutting down, this may well be + * off the AIL already. That's because we simulate the + * log-committed callbacks to unpin these buffers. Or we may never + * have put this item on AIL because of the transaction was + * aborted forcibly. xfs_trans_ail_delete() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, lip); + xfs_buf_item_free(BUF_ITEM(lip)); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h new file mode 100644 index 00000000..b6ecd206 --- /dev/null +++ b/fs/xfs/xfs_buf_item.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_ITEM_H__ +#define __XFS_BUF_ITEM_H__ + +extern kmem_zone_t *xfs_buf_item_zone; + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. + */ +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* size of data bitmap in words */ + unsigned int blf_data_map[1];/* variable size bitmap of */ + /* regions of buffer in this item */ +} xfs_buf_log_format_t; + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLF_INODE_BUF 0x1 +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL 0x2 +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 + +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * buf log item flags + */ +#define XFS_BLI_HOLD 0x01 +#define XFS_BLI_DIRTY 0x02 +#define XFS_BLI_STALE 0x04 +#define XFS_BLI_LOGGED 0x08 +#define XFS_BLI_INODE_ALLOC_BUF 0x10 +#define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 + +#define XFS_BLI_FLAGS \ + { XFS_BLI_HOLD, "HOLD" }, \ + { XFS_BLI_DIRTY, "DIRTY" }, \ + { XFS_BLI_STALE, "STALE" }, \ + { XFS_BLI_LOGGED, "LOGGED" }, \ + { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } + + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_mount; +struct xfs_buf_log_item; + +/* + * This is the in core log item structure used to track information + * needed to log buffers. It tracks how many times the lock has been + * locked, and which 128 byte chunks of the buffer are dirty. 
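+ *
+ * Illustrative aside (not from the original comment): bli_format below
+ * embeds the on-disk xfs_buf_log_format header declared above, whose
+ * one-element blf_data_map[] is the usual C idiom for a variable-length
+ * trailer; blf_map_size records how many bitmap words are actually
+ * valid (two words for the 4 KB buffer example used earlier).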
+ */ +typedef struct xfs_buf_log_item { + xfs_log_item_t bli_item; /* common item structure */ + struct xfs_buf *bli_buf; /* real buffer pointer */ + unsigned int bli_flags; /* misc flags */ + unsigned int bli_recur; /* lock recursion count */ + atomic_t bli_refcount; /* cnt of tp refs */ +#ifdef XFS_TRANS_DEBUG + char *bli_orig; /* original buffer copy */ + char *bli_logged; /* bytes logged (bitmap) */ +#endif + xfs_buf_log_format_t bli_format; /* in-log header */ +} xfs_buf_log_item_t; + +void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_relse(struct xfs_buf *); +void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); +void xfs_buf_iodone_callbacks(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); + +#ifdef XFS_TRANS_DEBUG +void +xfs_buf_item_flush_log_debug( + struct xfs_buf *bp, + uint first, + uint last); +#else +#define xfs_buf_item_flush_log_debug(bp, first, last) +#endif + +#endif /* __KERNEL__ */ + +#endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c new file mode 100644 index 00000000..6102ac6d --- /dev/null +++ b/fs/xfs/xfs_da_btree.c @@ -0,0 +1,2463 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * xfs_da_btree.c + * + * Routines to implement directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. 
+ */ +STATIC int xfs_da_root_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_root, + xfs_da_state_blk_t *new_child); +STATIC int xfs_da_node_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_blk, + xfs_da_state_blk_t *split_blk, + xfs_da_state_blk_t *blk_to_add, + int treelevel, + int *result); +STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *node_blk_1, + xfs_da_state_blk_t *node_blk_2); +STATIC void xfs_da_node_add(xfs_da_state_t *state, + xfs_da_state_blk_t *old_node_blk, + xfs_da_state_blk_t *new_node_blk); + +/* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_da_root_join(xfs_da_state_t *state, + xfs_da_state_blk_t *root_blk); +STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da_node_remove(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk); +STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, + xfs_da_state_blk_t *src_node_blk, + xfs_da_state_blk_t *dst_node_blk); + +/* + * Utility routines. + */ +STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); +STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); +STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra); +STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk); +STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of an intermediate node. + */ +int +xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, + xfs_dabuf_t **bpp, int whichfork) +{ + xfs_da_intnode_t *node; + xfs_dabuf_t *bp; + int error; + xfs_trans_t *tp; + + tp = args->trans; + error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + node->hdr.info.forw = 0; + node->hdr.info.back = 0; + node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); + node->hdr.info.pad = 0; + node->hdr.count = 0; + node->hdr.level = cpu_to_be16(level); + + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + *bpp = bp; + return(0); +} + +/* + * Split a leaf node, rebalance, then possibly split + * intermediate nodes, rebalance, etc. + */ +int /* error */ +xfs_da_split(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *oldblk, *newblk, *addblk; + xfs_da_intnode_t *node; + xfs_dabuf_t *bp; + int max, action, error, i; + + /* + * Walk back up the tree splitting/inserting/adjusting as necessary. + * If we need to insert and there isn't room, split the node, then + * decide which fragment to insert the new block from below into. + * Note that we may split the root this way, but we need more fixup. + */ + max = state->path.active - 1; + ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH)); + ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + addblk = &state->path.blk[max]; /* initial dummy value */ + for (i = max; (i >= 0) && addblk; state->path.active--, i--) { + oldblk = &state->path.blk[i]; + newblk = &state->altpath.blk[i]; + + /* + * If a leaf node then + * Allocate a new leaf node, then rebalance across them. + * else if an intermediate node then + * We split on the last layer, must we split the node? 
+ */ + switch (oldblk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr_leaf_split(state, oldblk, newblk); + if ((error != 0) && (error != ENOSPC)) { + return(error); /* GROT: attr is inconsistent */ + } + if (!error) { + addblk = newblk; + break; + } + /* + * Entry wouldn't fit, split the leaf again. + */ + state->extravalid = 1; + if (state->inleaf) { + state->extraafter = 0; /* before newblk */ + error = xfs_attr_leaf_split(state, oldblk, + &state->extrablk); + } else { + state->extraafter = 1; /* after newblk */ + error = xfs_attr_leaf_split(state, newblk, + &state->extrablk); + } + if (error) + return(error); /* GROT: attr inconsistent */ + addblk = newblk; + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_split(state, oldblk, newblk); + if (error) + return error; + addblk = newblk; + break; + case XFS_DA_NODE_MAGIC: + error = xfs_da_node_split(state, oldblk, newblk, addblk, + max - i, &action); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + if (error) + return(error); /* GROT: dir is inconsistent */ + /* + * Record the newly split block for the next time thru? + */ + if (action) + addblk = newblk; + else + addblk = NULL; + break; + } + + /* + * Update the btree to show the new hashval for this child. + */ + xfs_da_fixhashpath(state, &state->path); + /* + * If we won't need this block again, it's getting dropped + * from the active path by the loop control, so we need + * to mark it done now. + */ + if (i > 0 || !addblk) + xfs_da_buf_done(oldblk->bp); + } + if (!addblk) + return(0); + + /* + * Split the root node. + */ + ASSERT(state->path.active == 0); + oldblk = &state->path.blk[0]; + error = xfs_da_root_split(state, oldblk, addblk); + if (error) { + xfs_da_buf_done(oldblk->bp); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + return(error); /* GROT: dir is inconsistent */ + } + + /* + * Update pointers to the node which used to be block 0 and + * just got bumped because of the addition of a new root node. + * There might be three blocks involved if a double split occurred, + * and the original block 0 could be at any position in the list. + */ + + node = oldblk->bp->data; + if (node->hdr.info.forw) { + if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->data; + node->hdr.info.back = cpu_to_be32(oldblk->blkno); + xfs_da_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + node = oldblk->bp->data; + if (node->hdr.info.back) { + if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->data; + node->hdr.info.forw = cpu_to_be32(oldblk->blkno); + xfs_da_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + xfs_da_buf_done(oldblk->bp); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + return(0); +} + +/* + * Split the root. We have to create a new root and point to the two + * parts (the split old root) that we just created. Copy block zero to + * the EOF, extending the inode in process. 
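+ *
+ * Illustrative aside (not from the original comment), with hypothetical
+ * block labels: if the old root lives in block 0 and the split produced
+ * a sibling B, the routine below copies the old root's contents into a
+ * freshly grown block A' at the end of the da btree space, then rewrites
+ * block 0 as a new two-entry root pointing at A' and B, so the tree
+ * grows one level taller while the root stays at block 0.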
+ */ +STATIC int /* error */ +xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node, *oldroot; + xfs_da_args_t *args; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + int error, size; + xfs_inode_t *dp; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_dir2_leaf_t *leaf; + + /* + * Copy the existing (incorrect) block from the root node position + * to a free space somewhere. + */ + args = state->args; + ASSERT(args != NULL); + error = xfs_da_grow_inode(args, &blkno); + if (error) + return(error); + dp = args->dp; + tp = args->trans; + mp = state->mp; + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + oldroot = blk1->bp->data; + if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC) { + size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - + (char *)oldroot); + } else { + ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)oldroot; + size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - + (char *)leaf); + } + memcpy(node, oldroot, size); + xfs_da_log_buf(tp, bp, 0, size - 1); + xfs_da_buf_done(blk1->bp); + blk1->bp = bp; + blk1->blkno = blkno; + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, + (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, + be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + if (error) + return(error); + node = bp->data; + node->btree[0].hashval = cpu_to_be32(blk1->hashval); + node->btree[0].before = cpu_to_be32(blk1->blkno); + node->btree[1].hashval = cpu_to_be32(blk2->hashval); + node->btree[1].before = cpu_to_be32(blk2->blkno); + node->hdr.count = cpu_to_be16(2); + +#ifdef DEBUG + if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC) { + ASSERT(blk1->blkno >= mp->m_dirleafblk && + blk1->blkno < mp->m_dirfreeblk); + ASSERT(blk2->blkno >= mp->m_dirleafblk && + blk2->blkno < mp->m_dirfreeblk); + } +#endif + + /* Header is already logged by xfs_da_node_create */ + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, node->btree, + sizeof(xfs_da_node_entry_t) * 2)); + xfs_da_buf_done(bp); + + return(0); +} + +/* + * Split the node, rebalance, then add the new entry. + */ +STATIC int /* error */ +xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk, + xfs_da_state_blk_t *addblk, + int treelevel, int *result) +{ + xfs_da_intnode_t *node; + xfs_dablk_t blkno; + int newcount, error; + int useextra; + + node = oldblk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + + /* + * With V2 dirs the extra block is data or freespace. + */ + useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK; + newcount = 1 + useextra; + /* + * Do we have to split the node? + */ + if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + /* + * Allocate a new node, add to the doubly linked chain of + * nodes, then move some of our excess entries into it. 
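+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * numbers: if state->node_ents is 8 and this node already holds 8
+ * entries, then count + newcount exceeds the limit, so a sibling node
+ * is created at the same tree level and xfs_da_node_rebalance() shifts
+ * roughly half of the entries into it before the new child is inserted.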
+ */ + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); /* GROT: dir is inconsistent */ + + error = xfs_da_node_create(state->args, blkno, treelevel, + &newblk->bp, state->args->whichfork); + if (error) + return(error); /* GROT: dir is inconsistent */ + newblk->blkno = blkno; + newblk->magic = XFS_DA_NODE_MAGIC; + xfs_da_node_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + *result = 1; + } else { + *result = 0; + } + + /* + * Insert the new entry(s) into the correct block + * (updating last hashval in the process). + * + * xfs_da_node_add() inserts BEFORE the given index, + * and as a result of using node_lookup_int() we always + * point to a valid entry (not after one), but a split + * operation always results in a new block whose hashvals + * FOLLOW the current block. + * + * If we had double-split op below us, then add the extra block too. + */ + node = oldblk->bp->data; + if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + oldblk->index++; + xfs_da_node_add(state, oldblk, addblk); + if (useextra) { + if (state->extraafter) + oldblk->index++; + xfs_da_node_add(state, oldblk, &state->extrablk); + state->extravalid = 0; + } + } else { + newblk->index++; + xfs_da_node_add(state, newblk, addblk); + if (useextra) { + if (state->extraafter) + newblk->index++; + xfs_da_node_add(state, newblk, &state->extrablk); + state->extravalid = 0; + } + } + + return(0); +} + +/* + * Balance the btree elements between two intermediate nodes, + * usually one full and one empty. + * + * NOTE: if blk2 is empty, then it will get the upper half of blk1. + */ +STATIC void +xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node1, *node2, *tmpnode; + xfs_da_node_entry_t *btree_s, *btree_d; + int count, tmp; + xfs_trans_t *tp; + + node1 = blk1->bp->data; + node2 = blk2->bp->data; + /* + * Figure out how many entries need to move, and in which direction. + * Swap the nodes around if that makes it simpler. + */ + if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && + ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || + (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < + be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + tmpnode = node1; + node1 = node2; + node2 = tmpnode; + } + ASSERT(be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC); + count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + if (count == 0) + return; + tp = state->args->trans; + /* + * Two cases: high-to-low and low-to-high. + */ + if (count > 0) { + /* + * Move elements in node2 up to make a hole. + */ + if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node2->btree[count]; + memmove(btree_d, btree_s, tmp); + } + + /* + * Move the req'd B-tree elements from high in node1 to + * low in node2. + */ + be16_add_cpu(&node2->hdr.count, count); + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; + btree_d = &node2->btree[0]; + memcpy(btree_d, btree_s, tmp); + be16_add_cpu(&node1->hdr.count, -count); + } else { + /* + * Move the req'd B-tree elements from low in node2 to + * high in node1. 
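+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * counts: if node1 holds 2 entries and node2 holds 10, then
+ * count = (2 - 10) / 2 = -4, so this branch copies the first 4
+ * entries of node2 onto the end of node1 and slides node2's remaining
+ * 6 entries down to the front, leaving both nodes with 6 entries.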
+ */ + count = -count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + memcpy(btree_d, btree_s, tmp); + be16_add_cpu(&node1->hdr.count, count); + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, btree_d, tmp)); + + /* + * Move elements in node2 down to fill the hole. + */ + tmp = be16_to_cpu(node2->hdr.count) - count; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[count]; + btree_d = &node2->btree[0]; + memmove(btree_d, btree_s, tmp); + be16_add_cpu(&node2->hdr.count, -count); + } + + /* + * Log header of node 1 and all current bits of node 2. + */ + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + xfs_da_log_buf(tp, blk2->bp, + XFS_DA_LOGRANGE(node2, &node2->hdr, + sizeof(node2->hdr) + + sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + + /* + * Record the last hashval from each block for upward propagation. + * (note: don't use the swapped node pointers) + */ + node1 = blk1->bp->data; + node2 = blk2->bp->data; + blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); + blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + + /* + * Adjust the expected index for insertion. + */ + if (blk1->index >= be16_to_cpu(node1->hdr.count)) { + blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); + blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + } +} + +/* + * Add a new entry to an intermediate node. + */ +STATIC void +xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + + node = oldblk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + ASSERT(newblk->blkno != 0); + if (state->args->whichfork == XFS_DATA_FORK) + ASSERT(newblk->blkno >= state->mp->m_dirleafblk && + newblk->blkno < state->mp->m_dirfreeblk); + + /* + * We may need to make some room before we insert the new node. + */ + tmp = 0; + btree = &node->btree[ oldblk->index ]; + if (oldblk->index < be16_to_cpu(node->hdr.count)) { + tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); + memmove(btree + 1, btree, tmp); + } + btree->hashval = cpu_to_be32(newblk->hashval); + btree->before = cpu_to_be32(newblk->blkno); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); + be16_add_cpu(&node->hdr.count, 1); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the oldblk to propagate upwards. + */ + oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Deallocate an empty leaf node, remove it from its parent, + * possibly deallocating that block, etc... 
+ */ +int +xfs_da_join(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *drop_blk, *save_blk; + int action, error; + + action = 0; + drop_blk = &state->path.blk[ state->path.active-1 ]; + save_blk = &state->altpath.blk[ state->path.active-1 ]; + ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC || + drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* + * Walk back up the tree joining/deallocating as necessary. + * When we stop dropping blocks, break out. + */ + for ( ; state->path.active >= 2; drop_blk--, save_blk--, + state->path.active--) { + /* + * See if we can combine the block with a neighbor. + * (action == 0) => no options, just leave + * (action == 1) => coalesce, then unlink + * (action == 2) => block empty, unlink it + */ + switch (drop_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr_leaf_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return(0); + xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_dir2_leafn_unbalance(state, drop_blk, save_blk); + break; + case XFS_DA_NODE_MAGIC: + /* + * Remove the offending node, fixup hashvals, + * check for a toosmall neighbor. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_node_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return 0; + xfs_da_node_unbalance(state, drop_blk, save_blk); + break; + } + xfs_da_fixhashpath(state, &state->altpath); + error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da_state_kill_altpath(state); + if (error) + return(error); + error = xfs_da_shrink_inode(state->args, drop_blk->blkno, + drop_blk->bp); + drop_blk->bp = NULL; + if (error) + return(error); + } + /* + * We joined all the way to the top. If it turns out that + * we only have one entry in the root, make the child block + * the new root. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_root_join(state, &state->path.blk[0]); + return(error); +} + +/* + * We have only one entry in the root. Copy the only remaining child of + * the old root to block 0 as the new root node. + */ +STATIC int +xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +{ + xfs_da_intnode_t *oldroot; + /* REFERENCED */ + xfs_da_blkinfo_t *blkinfo; + xfs_da_args_t *args; + xfs_dablk_t child; + xfs_dabuf_t *bp; + int error; + + args = state->args; + ASSERT(args != NULL); + ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + oldroot = root_blk->bp->data; + ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(!oldroot->hdr.info.forw); + ASSERT(!oldroot->hdr.info.back); + + /* + * If the root has more than one child, then don't do anything. + */ + if (be16_to_cpu(oldroot->hdr.count) > 1) + return(0); + + /* + * Read in the (only) child block, then copy those bytes into + * the root block's buffer and free the original child block. 
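+ *
+ * Illustrative aside (not from the original comment), with a
+ * hypothetical block number: if the root's single entry points at
+ * block 7, that block is read in, its contents are copied over the
+ * root buffer at block 0, and block 7 is then released via
+ * xfs_da_shrink_inode(), shrinking the tree by one level.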
+ */ + child = be32_to_cpu(oldroot->btree[0].before); + ASSERT(child != 0); + error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + blkinfo = bp->data; + if (be16_to_cpu(oldroot->hdr.level) == 1) { + ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC || + be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC); + } else { + ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC); + } + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); + memcpy(root_blk->bp->data, bp->data, state->blocksize); + xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + error = xfs_da_shrink_inode(args, child, bp); + return(error); +} + +/* + * Check a node block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +STATIC int +xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_da_intnode_t *node; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC); + node = (xfs_da_intnode_t *)info; + count = be16_to_cpu(node->hdr.count); + if (count > (state->node_ents >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); /* blk over 50%, don't try to join */ + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. 
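+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * numbers: if state->node_ents is 64, the budget below is
+ * 64 - (64 >> 2) = 48 entries; a 20-entry block may merge with a
+ * 25-entry sibling (48 - 20 - 25 = 3 >= 0) but not with a 30-entry
+ * one, which would leave less than 25% of the node free.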
+ */ + /* start with smaller blk num */ + forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = be32_to_cpu(info->forw); + else + blkno = be32_to_cpu(info->back); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + + node = (xfs_da_intnode_t *)info; + count = state->node_ents; + count -= state->node_ents >> 2; + count -= be16_to_cpu(node->hdr.count); + node = bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + count -= be16_to_cpu(node->hdr.count); + xfs_da_brelse(state->args->trans, bp); + if (count >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } + *action = 1; + return(0); +} + +/* + * Walk back up the tree adjusting hash values as necessary, + * when we stop making changes, return. + */ +void +xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +{ + xfs_da_state_blk_t *blk; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dahash_t lasthash=0; + int level, count; + + level = path->active-1; + blk = &path->blk[ level ]; + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + lasthash = xfs_attr_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DIR2_LEAFN_MAGIC: + lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DA_NODE_MAGIC: + lasthash = xfs_da_node_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + } + for (blk--, level--; level >= 0; blk--, level--) { + node = blk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + btree = &node->btree[ blk->index ]; + if (be32_to_cpu(btree->hashval) == lasthash) + break; + blk->hashval = lasthash; + btree->hashval = cpu_to_be32(lasthash); + xfs_da_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + + lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + } +} + +/* + * Remove an entry from an intermediate node. + */ +STATIC void +xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + + node = drop_blk->bp->data; + ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + ASSERT(drop_blk->index >= 0); + + /* + * Copy over the offending entry, or just zero it out. 
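+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * numbers: with hdr.count == 5 and drop index 2, entries 3 and 4 are
+ * shifted down one slot, the now-duplicate last slot is zeroed, and
+ * the count drops to 4.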
+ */ + btree = &node->btree[drop_blk->index]; + if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { + tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + memmove(btree, btree + 1, tmp); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, tmp)); + btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; + } + memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + be16_add_cpu(&node->hdr.count, -1); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the block to propagate upwards. + */ + btree--; + drop_blk->hashval = be32_to_cpu(btree->hashval); +} + +/* + * Unbalance the btree elements between two intermediate nodes, + * move all Btree elements from one node into another. + */ +STATIC void +xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_intnode_t *drop_node, *save_node; + xfs_da_node_entry_t *btree; + int tmp; + xfs_trans_t *tp; + + drop_node = drop_blk->bp->data; + save_node = save_blk->bp->data; + ASSERT(be16_to_cpu(drop_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(be16_to_cpu(save_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + tp = state->args->trans; + + /* + * If the dying block has lower hashvals, then move all the + * elements in the remaining block up to make a hole. + */ + if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || + (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < + be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) + { + btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; + tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); + memmove(btree, &save_node->btree[0], tmp); + btree = &save_node->btree[0]; + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * + sizeof(xfs_da_node_entry_t))); + } else { + btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + be16_to_cpu(drop_node->hdr.count) * + sizeof(xfs_da_node_entry_t))); + } + + /* + * Move all the B-tree elements from drop_blk to save_blk. + */ + tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); + memcpy(btree, &drop_node->btree[0], tmp); + be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_node->hdr, + sizeof(save_node->hdr))); + + /* + * Save the last hashval in the remaining block for upward propagation. + */ + save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Walk down the Btree looking for a particular filename, filling + * in the state structure as we go. + * + * We will set the state structure to point to each of the elements + * in each of the nodes where either the hashval is or should be. 
+ * + * We support duplicate hashval's so for each entry in the current + * node that could contain the desired hashval, descend. This is a + * pruned depth-first tree search. + */ +int /* error */ +xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *curr; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dablk_t blkno; + int probe, span, max, error, retval; + xfs_dahash_t hashval, btreehashval; + xfs_da_args_t *args; + + args = state->args; + + /* + * Descend thru the B-tree searching each level for the right + * node to use, until the right hashval is found. + */ + blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0; + for (blk = &state->path.blk[0], state->path.active = 1; + state->path.active <= XFS_DA_NODE_MAXDEPTH; + blk++, state->path.active++) { + /* + * Read the next node down in the tree. + */ + blk->blkno = blkno; + error = xfs_da_read_buf(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); + if (error) { + blk->blkno = 0; + state->path.active--; + return(error); + } + curr = blk->bp->data; + blk->magic = be16_to_cpu(curr->magic); + ASSERT(blk->magic == XFS_DA_NODE_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Search an intermediate node for a match. + */ + if (blk->magic == XFS_DA_NODE_MAGIC) { + node = blk->bp->data; + max = be16_to_cpu(node->hdr.count); + blk->hashval = be32_to_cpu(node->btree[max-1].hashval); + + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + for (btree = &node->btree[probe]; span > 4; + btree = &node->btree[probe]) { + span /= 2; + btreehashval = be32_to_cpu(btree->hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { + btree--; + probe--; + } + while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { + btree++; + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max-1; + blkno = be32_to_cpu(node->btree[max-1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree->before); + } + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + } + + /* + * A leaf block that ends in the hashval that we are interested in + * (final hashval == search hashval) means that the next block may + * contain more entries with the same hashval, shift upward to the + * next leaf and keep searching. 
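+ *
+ * Illustrative aside (not from the original comment): if the search
+ * hashval is H, the current leaf's last entry also hashes to H, and the
+ * leaf lookup below reports ENOENT/ENOATTR, the loop shifts to the next
+ * leaf with xfs_da_path_shift() and retries, since names colliding on H
+ * may have spilled into the following block.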
+ */ + for (;;) { + if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + retval = xfs_dir2_leafn_lookup_int(blk->bp, args, + &blk->index, state); + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr_leaf_lookup_int(blk->bp, args); + blk->index = args->index; + args->blkno = blk->blkno; + } else { + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + if (((retval == ENOENT) || (retval == ENOATTR)) && + (blk->hashval == args->hashval)) { + error = xfs_da_path_shift(state, &state->path, 1, 1, + &retval); + if (error) + return(error); + if (retval == 0) { + continue; + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + /* path_shift() gives ENOENT */ + retval = XFS_ERROR(ENOATTR); + } + } + break; + } + *result = retval; + return(0); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Link a new block into a doubly linked list of blocks (of whatever type). + */ +int /* error */ +xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk) +{ + xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; + xfs_da_args_t *args; + int before=0, error; + xfs_dabuf_t *bp; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + old_info = old_blk->bp->data; + new_info = new_blk->bp->data; + ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || + old_blk->magic == XFS_DIR2_LEAFN_MAGIC || + old_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); + ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); + ASSERT(old_blk->magic == new_blk->magic); + + switch (old_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); + break; + case XFS_DIR2_LEAFN_MAGIC: + before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + break; + case XFS_DA_NODE_MAGIC: + before = xfs_da_node_order(old_blk->bp, new_blk->bp); + break; + } + + /* + * Link blocks in appropriate order. + */ + if (before) { + /* + * Link new block in before existing block. + */ + new_info->forw = cpu_to_be32(old_blk->blkno); + new_info->back = old_info->back; + if (old_info->back) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(old_info->back), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); + tmp_info->forw = cpu_to_be32(new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + old_info->back = cpu_to_be32(new_blk->blkno); + } else { + /* + * Link new block in after existing block. 
+ */ + new_info->forw = old_info->forw; + new_info->back = cpu_to_be32(old_blk->blkno); + if (old_info->forw) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(old_info->forw), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(tmp_info->magic == old_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); + tmp_info->back = cpu_to_be32(new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + old_info->forw = cpu_to_be32(new_blk->blkno); + } + + xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + return(0); +} + +/* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +{ + xfs_da_intnode_t *node1, *node2; + + node1 = node1_bp->data; + node2 = node2_bp->data; + ASSERT((be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC) && + (be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC)); + if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && + ((be32_to_cpu(node2->btree[0].hashval) < + be32_to_cpu(node1->btree[0].hashval)) || + (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < + be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_da_intnode_t *node; + + node = bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + if (count) + *count = be16_to_cpu(node->hdr.count); + if (!node->hdr.count) + return(0); + return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); +} + +/* + * Unlink a block from a doubly linked list of blocks. + */ +STATIC int /* error */ +xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; + xfs_da_args_t *args; + xfs_dabuf_t *bp; + int error; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + save_info = save_blk->bp->data; + drop_info = drop_blk->bp->data; + ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || + save_blk->magic == XFS_DIR2_LEAFN_MAGIC || + save_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); + ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); + ASSERT(save_blk->magic == drop_blk->magic); + ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || + (be32_to_cpu(save_info->back) == drop_blk->blkno)); + ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) || + (be32_to_cpu(drop_info->back) == save_blk->blkno)); + + /* + * Unlink the leaf block from the doubly linked chain of leaves. 
+ */ + if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + save_info->back = drop_info->back; + if (drop_info->back) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(drop_info->back), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); + tmp_info->forw = cpu_to_be32(save_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + xfs_da_buf_done(bp); + } + } else { + save_info->forw = drop_info->forw; + if (drop_info->forw) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(drop_info->forw), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); + tmp_info->back = cpu_to_be32(save_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + xfs_da_buf_done(bp); + } + } + + xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + return(0); +} + +/* + * Move a path "forward" or "!forward" one block at the current level. + * + * This routine will adjust a "path" to point to the next block + * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the + * Btree, including updating pointers to the intermediate nodes between + * the new bottom and the root. + */ +int /* error */ +xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_da_args_t *args; + xfs_dablk_t blkno=0; + int level, error; + + /* + * Roll up the Btree looking for the first block where our + * current index is not at the edge of the block. Note that + * we skip the bottom layer because we want the sibling block. + */ + args = state->args; + ASSERT(args != NULL); + ASSERT(path != NULL); + ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + level = (path->active-1) - 1; /* skip bottom layer in path */ + for (blk = &path->blk[level]; level >= 0; blk--, level--) { + ASSERT(blk->bp != NULL); + node = blk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + blk->index++; + blkno = be32_to_cpu(node->btree[blk->index].before); + break; + } else if (!forward && (blk->index > 0)) { + blk->index--; + blkno = be32_to_cpu(node->btree[blk->index].before); + break; + } + } + if (level < 0) { + *result = XFS_ERROR(ENOENT); /* we're out of our tree */ + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + return(0); + } + + /* + * Roll down the edge of the subtree until we reach the + * same depth we were at originally. + */ + for (blk++, level++; level < path->active; blk++, level++) { + /* + * Release the old block. + * (if it's dirty, trans won't actually let go) + */ + if (release) + xfs_da_brelse(args->trans, blk->bp); + + /* + * Read the next child block. 
+ */ + blk->blkno = blkno; + error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); + if (error) + return(error); + ASSERT(blk->bp != NULL); + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC || + be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC || + be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); + blk->magic = be16_to_cpu(info->magic); + if (blk->magic == XFS_DA_NODE_MAGIC) { + node = (xfs_da_intnode_t *)info; + blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + if (forward) + blk->index = 0; + else + blk->index = be16_to_cpu(node->hdr.count)-1; + blkno = be32_to_cpu(node->btree[blk->index].before); + } else { + ASSERT(level == path->active-1); + blk->index = 0; + switch(blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, + NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, + NULL); + break; + default: + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC); + break; + } + } + } + *result = 0; + return(0); +} + + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Implement a simple hash on a character string. + * Rotate the hash value by 7 bits, then XOR each character in. + * This is implemented with some source-level loop unrolling. + */ +xfs_dahash_t +xfs_da_hashname(const __uint8_t *name, int namelen) +{ + xfs_dahash_t hash; + + /* + * Do four characters at a time as long as we can. + */ + for (hash = 0; namelen >= 4; namelen -= 4, name += 4) + hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^ + (name[3] << 0) ^ rol32(hash, 7 * 4); + + /* + * Now do the rest of the characters. + */ + switch (namelen) { + case 3: + return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^ + rol32(hash, 7 * 3); + case 2: + return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2); + case 1: + return (name[0] << 0) ^ rol32(hash, 7 * 1); + default: /* case 0: */ + return hash; + } +} + +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno, b; + xfs_bmbt_irec_t map; + xfs_bmbt_irec_t *mapp; + xfs_inode_t *dp; + int nmap, error, w, count, c, got, i, mapi; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_drfsbno_t nblks; + + dp = args->dp; + mp = dp->i_mount; + w = args->whichfork; + tp = args->trans; + nblks = dp->i_d.di_nblocks; + + /* + * For new directories adjust the file offset and block count. + */ + if (w == XFS_DATA_FORK) { + bno = mp->m_dirleafblk; + count = mp->m_dirblkfsbs; + } else { + bno = 0; + count = 1; + } + /* + * Find a spot in the file space to put the new block. 
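+ * For the data fork, bno was primed with m_dirleafblk above so the + * new block is placed in the directory's leaf/node address range + * (the ASSERT below checks it stays under m_dirfreeblk); for the + * attribute fork the search simply starts at offset zero for a + * single block.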
+ */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) + return error; + if (w == XFS_DATA_FORK) + ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk); + /* + * Try mapping it in one filesystem block. + */ + nmap = 1; + ASSERT(args->firstblock != NULL); + if ((error = xfs_bmapi(tp, dp, bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| + XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) { + return error; + } + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ + else if (nmap == 0 && count > 1) { + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + for (b = bno, mapi = 0; b < bno + count; ) { + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| + XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp); + return error; + } + if (nmap < 1) + break; + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } else { + mapi = 0; + mapp = NULL; + } + /* + * Count the blocks we got, make sure it matches the total. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp); + return XFS_ERROR(ENOSPC); + } + if (mapp != &map) + kmem_free(mapp); + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + *new_blkno = (xfs_dablk_t)bno; + return 0; +} + +/* + * Ick. We need to always be able to remove a btree block, even + * if there's no space reservation because the filesystem is full. + * This is called if xfs_bunmapi on a btree block fails due to ENOSPC. + * It swaps the target block with the last block in the file. The + * last block in the file can always be removed since it can't cause + * a bmap btree split to do that. + */ +STATIC int +xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, + xfs_dabuf_t **dead_bufp) +{ + xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; + xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; + xfs_fileoff_t lastoff; + xfs_inode_t *ip; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error, w, entno, level, dead_level; + xfs_da_blkinfo_t *dead_info, *sib_info; + xfs_da_intnode_t *par_node, *dead_node; + xfs_dir2_leaf_t *dead_leaf2; + xfs_dahash_t dead_hash; + + dead_buf = *dead_bufp; + dead_blkno = *dead_blknop; + tp = args->trans; + ip = args->dp; + w = args->whichfork; + ASSERT(w == XFS_DATA_FORK); + mp = ip->i_mount; + lastoff = mp->m_dirfreeblk; + error = xfs_bmap_last_before(tp, ip, &lastoff, w); + if (error) + return error; + if (unlikely(lastoff == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + /* + * Read the last block in the btree space. + */ + last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; + if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + return error; + /* + * Copy the last block into the dead buffer and log it. + */ + memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); + xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + dead_info = dead_buf->data; + /* + * Get values from the moved block. 
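+ * A DIR2_LEAFN block sits at level 0 and contributes the hashval of + * its final leaf entry; a DA_NODE block contributes its level and the + * hashval of its last btree entry. Both values are used below to + * locate the parent entry that still points at last_blkno.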
+ */ + if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) { + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + dead_level = 0; + dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + } else { + ASSERT(be16_to_cpu(dead_info->magic) == XFS_DA_NODE_MAGIC); + dead_node = (xfs_da_intnode_t *)dead_info; + dead_level = be16_to_cpu(dead_node->hdr.level); + dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + } + sib_buf = par_buf = NULL; + /* + * If the moved block has a left sibling, fix up the pointers. + */ + if ((sib_blkno = be32_to_cpu(dead_info->back))) { + if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + goto done; + sib_info = sib_buf->data; + if (unlikely( + be32_to_cpu(sib_info->forw) != last_blkno || + sib_info->magic != dead_info->magic)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + sib_info->forw = cpu_to_be32(dead_blkno); + xfs_da_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->forw, + sizeof(sib_info->forw))); + xfs_da_buf_done(sib_buf); + sib_buf = NULL; + } + /* + * If the moved block has a right sibling, fix up the pointers. + */ + if ((sib_blkno = be32_to_cpu(dead_info->forw))) { + if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + goto done; + sib_info = sib_buf->data; + if (unlikely( + be32_to_cpu(sib_info->back) != last_blkno || + sib_info->magic != dead_info->magic)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + sib_info->back = cpu_to_be32(dead_blkno); + xfs_da_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->back, + sizeof(sib_info->back))); + xfs_da_buf_done(sib_buf); + sib_buf = NULL; + } + par_blkno = mp->m_dirleafblk; + level = -1; + /* + * Walk down the tree looking for the parent of the moved block. + */ + for (;;) { + if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + goto done; + par_node = par_buf->data; + if (unlikely( + be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC || + (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + level = be16_to_cpu(par_node->hdr.level); + for (entno = 0; + entno < be16_to_cpu(par_node->hdr.count) && + be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno++) + continue; + if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + par_blkno = be32_to_cpu(par_node->btree[entno].before); + if (level == dead_level + 1) + break; + xfs_da_brelse(tp, par_buf); + par_buf = NULL; + } + /* + * We're in the right parent block. + * Look for the right entry. 
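+ * Scan the entries for the one whose 'before' pointer still names + * last_blkno; if this node runs out of entries, hop to its forw + * sibling (a missing sibling here means on-disk corruption) and + * restart the scan from entry zero.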
+ */ + for (;;) { + for (; + entno < be16_to_cpu(par_node->hdr.count) && + be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno++) + continue; + if (entno < be16_to_cpu(par_node->hdr.count)) + break; + par_blkno = be32_to_cpu(par_node->hdr.info.forw); + xfs_da_brelse(tp, par_buf); + par_buf = NULL; + if (unlikely(par_blkno == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + goto done; + par_node = par_buf->data; + if (unlikely( + be16_to_cpu(par_node->hdr.level) != level || + be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + entno = 0; + } + /* + * Update the parent entry pointing to the moved block. + */ + par_node->btree[entno].before = cpu_to_be32(dead_blkno); + xfs_da_log_buf(tp, par_buf, + XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, + sizeof(par_node->btree[entno].before))); + xfs_da_buf_done(par_buf); + xfs_da_buf_done(dead_buf); + *dead_blknop = last_blkno; + *dead_bufp = last_buf; + return 0; +done: + if (par_buf) + xfs_da_brelse(tp, par_buf); + if (sib_buf) + xfs_da_brelse(tp, sib_buf); + xfs_da_brelse(tp, last_buf); + return error; +} + +/* + * Remove a btree block from a directory or attribute. + */ +int +xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + xfs_dabuf_t *dead_buf) +{ + xfs_inode_t *dp; + int done, error, w, count; + xfs_trans_t *tp; + xfs_mount_t *mp; + + dp = args->dp; + w = args->whichfork; + tp = args->trans; + mp = dp->i_mount; + if (w == XFS_DATA_FORK) + count = mp->m_dirblkfsbs; + else + count = 1; + for (;;) { + /* + * Remove extents. If we get ENOSPC for a dir we have to move + * the last block to the place we want to kill. + */ + if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, + &done)) == ENOSPC) { + if (w != XFS_DATA_FORK) + break; + if ((error = xfs_da_swap_lastblock(args, &dead_blkno, + &dead_buf))) + break; + } else { + break; + } + } + xfs_da_binval(tp, dead_buf); + return error; +} + +/* + * See if the mapping(s) for this btree block are valid, i.e. + * don't contain holes, are logically contiguous, and cover the whole range. + */ +STATIC int +xfs_da_map_covers_blocks( + int nmap, + xfs_bmbt_irec_t *mapp, + xfs_dablk_t bno, + int count) +{ + int i; + xfs_fileoff_t off; + + for (i = 0, off = bno; i < nmap; i++) { + if (mapp[i].br_startblock == HOLESTARTBLOCK || + mapp[i].br_startblock == DELAYSTARTBLOCK) { + return 0; + } + if (off != mapp[i].br_startoff) { + return 0; + } + off += mapp[i].br_blockcount; + } + return off == bno + count; +} + +/* + * Make a dabuf. + * Used for get_buf, read_buf, read_bufr, and reada_buf. + */ +STATIC int +xfs_da_do_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t *mappedbnop, + xfs_dabuf_t **bpp, + int whichfork, + int caller, + inst_t *ra) +{ + xfs_buf_t *bp = NULL; + xfs_buf_t **bplist; + int error=0; + int i; + xfs_bmbt_irec_t map; + xfs_bmbt_irec_t *mapp; + xfs_daddr_t mappedbno; + xfs_mount_t *mp; + int nbplist=0; + int nfsb; + int nmap; + xfs_dabuf_t *rbp; + + mp = dp->i_mount; + nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; + mappedbno = *mappedbnop; + /* + * Caller doesn't have a mapping. -2 means don't complain + * if we land in a hole. 
+ */ + if (mappedbno == -1 || mappedbno == -2) { + /* + * Optimize the one-block case. + */ + if (nfsb == 1) { + xfs_fsblock_t fsb; + + if ((error = + xfs_bmapi_single(trans, dp, whichfork, &fsb, + (xfs_fileoff_t)bno))) { + return error; + } + mapp = ↦ + if (fsb == NULLFSBLOCK) { + nmap = 0; + } else { + map.br_startblock = fsb; + map.br_startoff = (xfs_fileoff_t)bno; + map.br_blockcount = 1; + nmap = 1; + } + } else { + mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); + nmap = nfsb; + if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, + nfsb, + XFS_BMAPI_METADATA | + xfs_bmapi_aflag(whichfork), + NULL, 0, mapp, &nmap, NULL))) + goto exit0; + } + } else { + map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + map.br_startoff = (xfs_fileoff_t)bno; + map.br_blockcount = nfsb; + mapp = ↦ + nmap = 1; + } + if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { + error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + if (unlikely(error == EFSCORRUPTED)) { + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "%s: bno %lld dir: inode %lld", + __func__, (long long)bno, + (long long)dp->i_ino); + for (i = 0; i < nmap; i++) { + xfs_alert(mp, +"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", + i, + (long long)mapp[i].br_startoff, + (long long)mapp[i].br_startblock, + (long long)mapp[i].br_blockcount, + mapp[i].br_state); + } + } + XFS_ERROR_REPORT("xfs_da_do_buf(1)", + XFS_ERRLEVEL_LOW, mp); + } + goto exit0; + } + if (caller != 3 && nmap > 1) { + bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); + nbplist = 0; + } else + bplist = NULL; + /* + * Turn the mapping(s) into buffer(s). + */ + for (i = 0; i < nmap; i++) { + int nmapped; + + mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); + if (i == 0) + *mappedbnop = mappedbno; + nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); + switch (caller) { + case 0: + bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, + mappedbno, nmapped, 0); + error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); + break; + case 1: + case 2: + bp = NULL; + error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, + mappedbno, nmapped, 0, &bp); + break; + case 3: + xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); + error = 0; + bp = NULL; + break; + } + if (error) { + if (bp) + xfs_trans_brelse(trans, bp); + goto exit1; + } + if (!bp) + continue; + if (caller == 1) { + if (whichfork == XFS_ATTR_FORK) { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, + XFS_ATTR_BTREE_REF); + } else { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, + XFS_DIR_BTREE_REF); + } + } + if (bplist) { + bplist[nbplist++] = bp; + } + } + /* + * Build a dabuf structure. + */ + if (bplist) { + rbp = xfs_da_buf_make(nbplist, bplist, ra); + } else if (bp) + rbp = xfs_da_buf_make(1, &bp, ra); + else + rbp = NULL; + /* + * For read_buf, check the magic number. 
+ */ + if (caller == 1) { + xfs_dir2_data_t *data; + xfs_dir2_free_t *free; + xfs_da_blkinfo_t *info; + uint magic, magic1; + + info = rbp->data; + data = rbp->data; + free = rbp->data; + magic = be16_to_cpu(info->magic); + magic1 = be32_to_cpu(data->hdr.magic); + if (unlikely( + XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && + (magic != XFS_ATTR_LEAF_MAGIC) && + (magic != XFS_DIR2_LEAF1_MAGIC) && + (magic != XFS_DIR2_LEAFN_MAGIC) && + (magic1 != XFS_DIR2_BLOCK_MAGIC) && + (magic1 != XFS_DIR2_DATA_MAGIC) && + (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), + mp, XFS_ERRTAG_DA_READ_BUF, + XFS_RANDOM_DA_READ_BUF))) { + trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", + XFS_ERRLEVEL_LOW, mp, info); + error = XFS_ERROR(EFSCORRUPTED); + xfs_da_brelse(trans, rbp); + nbplist = 0; + goto exit1; + } + } + if (bplist) { + kmem_free(bplist); + } + if (mapp != &map) { + kmem_free(mapp); + } + if (bpp) + *bpp = rbp; + return 0; +exit1: + if (bplist) { + for (i = 0; i < nbplist; i++) + xfs_trans_brelse(trans, bplist[i]); + kmem_free(bplist); + } +exit0: + if (mapp != &map) + kmem_free(mapp); + if (bpp) + *bpp = NULL; + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0, + (inst_t *)__return_address); +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1, + (inst_t *)__return_address); +} + +/* + * Readahead the dir/attr block. + */ +xfs_daddr_t +xfs_da_reada_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + int whichfork) +{ + xfs_daddr_t rval; + + rval = -1; + if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3, + (inst_t *)__return_address)) + return -1; + else + return rval; +} + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ +kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) { + if (state->altpath.blk[i].bp) { + if (state->altpath.blk[i].bp != state->path.blk[i].bp) + xfs_da_buf_done(state->altpath.blk[i].bp); + state->altpath.blk[i].bp = NULL; + } + } + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + int i; + + xfs_da_state_kill_altpath(state); + for (i = 0; i < state->path.active; i++) { + if (state->path.blk[i].bp) + xfs_da_buf_done(state->path.blk[i].bp); + } + if (state->extravalid && state->extrablk.bp) + xfs_da_buf_done(state->extrablk.bp); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +#ifdef XFS_DABUF_DEBUG +xfs_dabuf_t *xfs_dabuf_global_list; +static DEFINE_SPINLOCK(xfs_dabuf_global_lock); +#endif + +/* + * Create a dabuf. 
+ */ +/* ARGSUSED */ +STATIC xfs_dabuf_t * +xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) +{ + xfs_buf_t *bp; + xfs_dabuf_t *dabuf; + int i; + int off; + + if (nbuf == 1) + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); + else + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); + dabuf->dirty = 0; +#ifdef XFS_DABUF_DEBUG + dabuf->ra = ra; + dabuf->target = XFS_BUF_TARGET(bps[0]); + dabuf->blkno = XFS_BUF_ADDR(bps[0]); +#endif + if (nbuf == 1) { + dabuf->nbuf = 1; + bp = bps[0]; + dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->data = XFS_BUF_PTR(bp); + dabuf->bps[0] = bp; + } else { + dabuf->nbuf = nbuf; + for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { + dabuf->bps[i] = bp = bps[i]; + dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + } + dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); + for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + bp = bps[i]; + memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), + XFS_BUF_COUNT(bp)); + } + } +#ifdef XFS_DABUF_DEBUG + { + xfs_dabuf_t *p; + + spin_lock(&xfs_dabuf_global_lock); + for (p = xfs_dabuf_global_list; p; p = p->next) { + ASSERT(p->blkno != dabuf->blkno || + p->target != dabuf->target); + } + dabuf->prev = NULL; + if (xfs_dabuf_global_list) + xfs_dabuf_global_list->prev = dabuf; + dabuf->next = xfs_dabuf_global_list; + xfs_dabuf_global_list = dabuf; + spin_unlock(&xfs_dabuf_global_lock); + } +#endif + return dabuf; +} + +/* + * Un-dirty a dabuf. + */ +STATIC void +xfs_da_buf_clean(xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + int i; + int off; + + if (dabuf->dirty) { + ASSERT(dabuf->nbuf > 1); + dabuf->dirty = 0; + for (i = off = 0; i < dabuf->nbuf; + i++, off += XFS_BUF_COUNT(bp)) { + bp = dabuf->bps[i]; + memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, + XFS_BUF_COUNT(bp)); + } + } +} + +/* + * Release a dabuf. + */ +void +xfs_da_buf_done(xfs_dabuf_t *dabuf) +{ + ASSERT(dabuf); + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if (dabuf->dirty) + xfs_da_buf_clean(dabuf); + if (dabuf->nbuf > 1) + kmem_free(dabuf->data); +#ifdef XFS_DABUF_DEBUG + { + spin_lock(&xfs_dabuf_global_lock); + if (dabuf->prev) + dabuf->prev->next = dabuf->next; + else + xfs_dabuf_global_list = dabuf->next; + if (dabuf->next) + dabuf->next->prev = dabuf->prev; + spin_unlock(&xfs_dabuf_global_lock); + } + memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf)); +#endif + if (dabuf->nbuf == 1) + kmem_zone_free(xfs_dabuf_zone, dabuf); + else + kmem_free(dabuf); +} + +/* + * Log transaction from a dabuf. + */ +void +xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) +{ + xfs_buf_t *bp; + uint f; + int i; + uint l; + int off; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if (dabuf->nbuf == 1) { + ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); + xfs_trans_log_buf(tp, dabuf->bps[0], first, last); + return; + } + dabuf->dirty = 1; + ASSERT(first <= last); + for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + bp = dabuf->bps[i]; + f = off; + l = f + XFS_BUF_COUNT(bp) - 1; + if (f < first) + f = first; + if (l > last) + l = last; + if (f <= l) + xfs_trans_log_buf(tp, bp, f - off, l - off); + /* + * B_DONE is set by xfs_trans_log buf. + * If we don't set it on a new buffer (get not read) + * then if we don't put anything in the buffer it won't + * be set, and at commit it it released into the cache, + * and then a read will fail. 
+ */ + else if (!(XFS_BUF_ISDONE(bp))) + XFS_BUF_DONE(bp); + } + ASSERT(last < off); +} + +/* + * Release dabuf from a transaction. + * Have to free up the dabuf before the buffers are released, + * since the synchronization on the dabuf is really the lock on the buffer. + */ +void +xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + xfs_buf_t **bplist; + int i; + int nbuf; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if ((nbuf = dabuf->nbuf) == 1) { + bplist = &bp; + bp = dabuf->bps[0]; + } else { + bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); + memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + } + xfs_da_buf_done(dabuf); + for (i = 0; i < nbuf; i++) + xfs_trans_brelse(tp, bplist[i]); + if (bplist != &bp) + kmem_free(bplist); +} + +/* + * Invalidate dabuf from a transaction. + */ +void +xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + xfs_buf_t **bplist; + int i; + int nbuf; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if ((nbuf = dabuf->nbuf) == 1) { + bplist = &bp; + bp = dabuf->bps[0]; + } else { + bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); + memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + } + xfs_da_buf_done(dabuf); + for (i = 0; i < nbuf; i++) + xfs_trans_binval(tp, bplist[i]); + if (bplist != &bp) + kmem_free(bplist); +} + +/* + * Get the first daddr from a dabuf. + */ +xfs_daddr_t +xfs_da_blkno(xfs_dabuf_t *dabuf) +{ + ASSERT(dabuf->nbuf); + ASSERT(dabuf->data); + return XFS_BUF_ADDR(dabuf->bps[0]); +} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h new file mode 100644 index 00000000..fe9f5a8c --- /dev/null +++ b/fs/xfs/xfs_da_btree.h @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_BTREE_H__ +#define __XFS_DA_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; +struct zone; + +/*======================================================================== + * Directory Structure when greater than XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * Is is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. 
+ */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* previous block in list */ + __be32 back; /* following block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active entries */ + __be16 level; /* level above leaves (leaf == 0) */ + } hdr; + struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ + } btree[1]; /* variable sized array of keys */ +} xfs_da_intnode_t; +typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; +typedef struct xfs_da_node_entry xfs_da_node_entry_t; + +#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize + +/*======================================================================== + * Btree searching and modification structure definitions. + *========================================================================*/ + +/* + * Search comparison results + */ +enum xfs_dacmp { + XFS_CMP_DIFFERENT, /* names are completely different */ + XFS_CMP_EXACT, /* names are exactly the same */ + XFS_CMP_CASE /* names are same but differ in case */ +}; + +/* + * Structure to ease passing around component names. 
+ */ +typedef struct xfs_da_args { + const __uint8_t *name; /* string (maybe not NULL terminated) */ + int namelen; /* length of string (maybe no NULL) */ + __uint8_t *value; /* set of bytes (maybe contain NULLs) */ + int valuelen; /* length of value */ + int flags; /* argument flags (eg: ATTR_NOCREATE) */ + xfs_dahash_t hashval; /* hash value of name */ + xfs_ino_t inumber; /* input/output inode number */ + struct xfs_inode *dp; /* directory inode to manipulate */ + xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */ + struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */ + struct xfs_trans *trans; /* current trans (changes over time) */ + xfs_extlen_t total; /* total blocks needed, for 1st bmap */ + int whichfork; /* data or attribute fork */ + xfs_dablk_t blkno; /* blkno of attr leaf of interest */ + int index; /* index of attr of interest in blk */ + xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ + int rmtblkcnt; /* remote attr value block count */ + xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ + int index2; /* index of 2nd attr in blk */ + xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ + int rmtblkcnt2; /* remote attr value block count */ + int op_flags; /* operation flags */ + enum xfs_dacmp cmpresult; /* name compare result for lookups */ +} xfs_da_args_t; + +/* + * Operation flags: + */ +#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ +#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ +#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + +#define XFS_DA_OP_FLAGS \ + { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ + { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ + { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ + { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + +/* + * Structure to describe buffer(s) for a block. + * This is needed in the directory version 2 format case, when + * multiple non-contiguous fsblocks might be needed to cover one + * logical directory block. + * If the buffer count is 1 then the data pointer points to the + * same place as the b_addr field for the buffer, else to kmem_alloced memory. + */ +typedef struct xfs_dabuf { + int nbuf; /* number of buffer pointers present */ + short dirty; /* data needs to be copied back */ + short bbcount; /* how large is data in bbs */ + void *data; /* pointer for buffers' data */ +#ifdef XFS_DABUF_DEBUG + inst_t *ra; /* return address of caller to make */ + struct xfs_dabuf *next; /* next in global chain */ + struct xfs_dabuf *prev; /* previous in global chain */ + struct xfs_buftarg *target; /* device for buffer */ + xfs_daddr_t blkno; /* daddr first in bps[0] */ +#endif + struct xfs_buf *bps[1]; /* actually nbuf of these */ +} xfs_dabuf_t; +#define XFS_DA_BUF_SIZE(n) \ + (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) + +#ifdef XFS_DABUF_DEBUG +extern xfs_dabuf_t *xfs_dabuf_global_list; +#endif + +/* + * Storage for holding state during Btree searches and split/join ops. + * + * Only need space for 5 intermediate nodes. With a minimum of 62-way + * fanout to the Btree, we can support over 900 million directory blocks, + * which is slightly more than enough. 
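+ * (62^5 = 916,132,832, hence the "over 900 million" figure for a + * five-level tree at minimum fanout.)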
+ */ +typedef struct xfs_da_state_blk { + xfs_dabuf_t *bp; /* buffer containing block */ + xfs_dablk_t blkno; /* filesystem blkno of buffer */ + xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ + int index; /* relevant index into block */ + xfs_dahash_t hashval; /* last hash value in block */ + int magic; /* blk's magic number, ie: blk type */ +} xfs_da_state_blk_t; + +typedef struct xfs_da_state_path { + int active; /* number of active levels */ + xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH]; +} xfs_da_state_path_t; + +typedef struct xfs_da_state { + xfs_da_args_t *args; /* filename arguments */ + struct xfs_mount *mp; /* filesystem mount point */ + unsigned int blocksize; /* logical block size */ + unsigned int node_ents; /* how many entries in danode */ + xfs_da_state_path_t path; /* search/split paths */ + xfs_da_state_path_t altpath; /* alternate path for join */ + unsigned char inleaf; /* insert into 1->lf, 0->splf */ + unsigned char extravalid; /* T/F: extrablk is in use */ + unsigned char extraafter; /* T/F: extrablk is after new */ + xfs_da_state_blk_t extrablk; /* for double-splits on leaves */ + /* for dirv2 extrablk is data */ +} xfs_da_state_t; + +/* + * Utility macros to aid in logging changed structure fields. + */ +#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) +#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) + +/* + * Name ops for directory and/or attr name operations + */ +struct xfs_nameops { + xfs_dahash_t (*hashname)(struct xfs_name *); + enum xfs_dacmp (*compname)(struct xfs_da_args *, + const unsigned char *, int); +}; + + +/*======================================================================== + * Function prototypes. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, + xfs_dabuf_t **bpp, int whichfork); +int xfs_da_split(xfs_da_state_t *state); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_da_join(xfs_da_state_t *state); +void xfs_da_fixhashpath(xfs_da_state_t *state, + xfs_da_state_path_t *path_to_to_fix); + +/* + * Routines used for finding things in the Btree. + */ +int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result); +/* + * Utility routines. + */ +int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk); + +/* + * Utility routines. 
+ */ +int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); +int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + xfs_dabuf_t **bp, int whichfork); +int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, int whichfork); +xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, int whichfork); +int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + xfs_dabuf_t *dead_buf); + +uint xfs_da_hashname(const __uint8_t *name_string, int name_length); +enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, + const unsigned char *name, int len); + + +xfs_da_state_t *xfs_da_state_alloc(void); +void xfs_da_state_free(xfs_da_state_t *state); + +void xfs_da_buf_done(xfs_dabuf_t *dabuf); +void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, + uint last); +void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); +void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); +xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); + +extern struct kmem_zone *xfs_da_state_zone; +extern struct kmem_zone *xfs_dabuf_zone; +extern const struct xfs_nameops xfs_default_nameops; + +#endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c new file mode 100644 index 00000000..9a84a85c --- /dev/null +++ b/fs/xfs/xfs_dfrag.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_itable.h" +#include "xfs_dfrag.h" +#include "xfs_error.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + + +static int xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp); + +/* + * ioctl interface for swapext + */ +int +xfs_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct file *file, *tmp_file; + int error = 0; + + /* Pull information for the target fd */ + file = fget((int)sxp->sx_fdtarget); + if (!file) { + error = XFS_ERROR(EINVAL); + goto out; + } + + if (!(file->f_mode & FMODE_WRITE) || + !(file->f_mode & FMODE_READ) || + (file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_file; + } + + tmp_file = fget((int)sxp->sx_fdtmp); + if (!tmp_file) { + error = XFS_ERROR(EINVAL); + goto out_put_file; + } + + if (!(tmp_file->f_mode & FMODE_WRITE) || + !(tmp_file->f_mode & FMODE_READ) || + (tmp_file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file->f_path.dentry->d_inode) || + IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + ip = XFS_I(file->f_path.dentry->d_inode); + tip = XFS_I(tmp_file->f_path.dentry->d_inode); + + if (ip->i_mount != tip->i_mount) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = XFS_ERROR(EIO); + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fput(tmp_file); + out_put_file: + fput(file); + out: + return error; +} + +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. 
+ */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + return EINVAL; + + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) + return EINVAL; + + /* Reciprocal target->temp btree format checks */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) + return EINVAL; + + return 0; +} + +static int +xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_bstat_t *sbp = &sxp->sx_stat; + xfs_ifork_t *tempifp, *ifp, *tifp; + int ilf_fields, tilf_fields; + int error = 0; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); + if (!tempifp) { + error = XFS_ERROR(ENOMEM); + goto out; + } + + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. 
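+ * (Both IOLOCKs are taken in one call and both ILOCKs in a second; + * the ILOCKs are dropped and re-taken around the transaction + * allocation further down.)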
+ */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + if (VN_CACHED(VFS_I(tip)) != 0) { + error = xfs_flushinval_pages(tip, 0, -1, + FI_REMAPF_LOCKED); + if (error) + goto out_unlock; + } + + /* Verify O_DIRECT for ftmp */ + if (VN_CACHED(VFS_I(tip)) != 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = XFS_ERROR(EFAULT); + goto out_unlock; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_unlock; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + /* We need to fail if the file is memory mapped. Once we have tossed + * all existing pages, the page fault will have no option + * but to go to the filesystem for pages. By making the page fault call + * vop_read (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(VFS_I(ip))) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. 
+ */ + + xfs_tosspages(ip, 0, -1, FI_REMAPF); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + if ((error = xfs_trans_reserve(tp, 0, + XFS_ICHANGE_LOG_RES(mp), 0, + 0, 0))) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + goto out; + } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. + */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + ilf_fields = XFS_ILOG_CORE; + + switch(ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + ilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ilf_fields |= XFS_ILOG_DBROOT; + break; + } + + tilf_fields = XFS_ILOG_CORE; + + switch(tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. 
+ */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + tilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + tilf_fields |= XFS_ILOG_DBROOT; + break; + } + + + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_trans_log_inode(tp, ip, ilf_fields); + xfs_trans_log_inode(tp, tip, tilf_fields); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; +} diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h new file mode 100644 index 00000000..20bdd935 --- /dev/null +++ b/fs/xfs/xfs_dfrag.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DFRAG_H__ +#define __XFS_DFRAG_H__ + +/* + * Structure passed to xfs_swapext + */ + +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; + +/* + * Version flag + */ +#define XFS_SX_VERSION 0 + +#ifdef __KERNEL__ +/* + * Prototypes for visible xfs_dfrag.c routines. + */ + +/* + * Syscall interface for xfs_swapext + */ +int xfs_swapext(struct xfs_swapext *sx); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h new file mode 100644 index 00000000..dffba9ba --- /dev/null +++ b/fs/xfs/xfs_dinode.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DINODE_H__ +#define __XFS_DINODE_H__ + +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) + +typedef struct xfs_timestamp { + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + */ +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ +} __attribute__((packed)) xfs_dinode_t; + +#define DI_MAX_FLUSH 0xffff + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. 
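+ * (MIN_LOG 8 and MAX_LOG 11 correspond to 256-byte and 2048-byte + * inodes respectively.)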
+ */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. + */ +#define XFS_LITINO(mp) \ + ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) + +#define XFS_BROOT_SIZE_ADJ \ + (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) + +/* + * Inode data & attribute fork sizes, per inode. + */ +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) + +#define XFS_DFORK_DSIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp)) +#define XFS_DFORK_ASIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) + +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)(dip) + sizeof(struct xfs_dinode)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#define XFS_DFORK_FORMAT(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. 
+ */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ +#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) +#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) + +#endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c new file mode 100644 index 00000000..dba7a71c --- /dev/null +++ b/fs/xfs/xfs_dir2.c @@ -0,0 +1,764 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; + +void +xfs_dir_mount( + xfs_mount_t *mp) +{ + ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); + ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= + XFS_MAX_BLOCKSIZE); + mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); + mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; + mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); + mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); + mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); + mp->m_attr_node_ents = + (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_node_ents = + (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; +} + +/* + * Return 1 if directory contains only "." and "..". + */ +int +xfs_dir_isempty( + xfs_inode_t *dp) +{ + xfs_dir2_sf_t *sfp; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if (dp->i_d.di_size == 0) /* might happen during shutdown. */ + return 1; + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return 0; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + return !sfp->hdr.count; +} + +/* + * Validate a given inode number. 
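+ * The number is split into AG, AG block and block offset; it is rejected
+ * as corrupt unless every piece is in range for this mount and the pieces
+ * reassemble to the original value.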
+ */ +int +xfs_dir_ino_validate( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + xfs_agblock_t agblkno; + xfs_agino_t agino; + xfs_agnumber_t agno; + int ino_ok; + int ioff; + + agno = XFS_INO_TO_AGNO(mp, ino); + agblkno = XFS_INO_TO_AGBNO(mp, ino); + ioff = XFS_INO_TO_OFFSET(mp, ino); + agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); + ino_ok = + agno < mp->m_sb.sb_agcount && + agblkno < mp->m_sb.sb_agblocks && + agblkno != 0 && + ioff < (1 << mp->m_sb.sb_inopblog) && + XFS_AGINO_TO_INO(mp, agno, agino) == ino; + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, + XFS_RANDOM_DIR_INO_VALIDATE))) { + xfs_warn(mp, "Invalid inode number 0x%Lx", + (unsigned long long) ino); + XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +int +xfs_dir_init( + xfs_trans_t *tp, + xfs_inode_t *dp, + xfs_inode_t *pdp) +{ + xfs_da_args_t args; + int error; + + memset((char *)&args, 0, sizeof(args)); + args.dp = dp; + args.trans = tp; + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + return error; + return xfs_dir2_sf_create(&args, pdp->i_ino); +} + +/* + Enter a name in a directory. + */ +int +xfs_dir_createname( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t inum, /* new entry inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + return rval; + XFS_STATS_INC(xs_dir_create); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.inumber = inum; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_addname(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_addname(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_addname(&args); + else + rval = xfs_dir2_node_addname(&args); + return rval; +} + +/* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. + */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return EEXIST; + + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + if (!args->value) + return ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return EEXIST; +} + +/* + * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. 
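+ * On a case-insensitive match ci_name->name points at a buffer allocated by
+ * xfs_dir_cilookup_result(); the caller is expected to free it.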
+ */ + +int +xfs_dir_lookup( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_lookup); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.dp = dp; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args.op_flags |= XFS_DA_OP_CILOOKUP; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_lookup(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_lookup(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_lookup(&args); + else + rval = xfs_dir2_node_lookup(&args); + if (rval == EEXIST) + rval = 0; + if (!rval) { + *inum = args.inumber; + if (ci_name) { + ci_name->name = args.value; + ci_name->len = args.valuelen; + } + } + return rval; +} + +/* + * Remove an entry from a directory. + */ +int +xfs_dir_removename( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t ino, + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_remove); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.inumber = ino; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_removename(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_removename(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_removename(&args); + else + rval = xfs_dir2_node_removename(&args); + return rval; +} + +/* + * Read a directory. + */ +int +xfs_readdir( + xfs_inode_t *dp, + void *dirent, + size_t bufsize, + xfs_off_t *offset, + filldir_t filldir) +{ + int rval; /* return value */ + int v; /* type-checking value */ + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_getdents); + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); + else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); + else + rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, + filldir); + return rval; +} + +/* + * Replace the inode number of a directory entry. 
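+ * The name and its hash stay as they are; only the inode number stored in
+ * the entry changes, and the new number is validated first.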
+ */ +int +xfs_dir_replace( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, /* name of entry to replace */ + xfs_ino_t inum, /* new inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + return rval; + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.inumber = inum; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_replace(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_replace(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_replace(&args); + else + rval = xfs_dir2_node_replace(&args); + return rval; +} + +/* + * See if this entry can be added to the directory without allocating space. + * First checks that the caller couldn't reserve enough space (resblks = 0). + */ +int +xfs_dir_canenter( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, /* name of entry to add */ + uint resblks) +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + if (resblks) + return 0; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.dp = dp; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_addname(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_addname(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_addname(&args); + else + rval = xfs_dir2_node_addname(&args); + return rval; +} + +/* + * Utility routines. + */ + +/* + * Add a block to the directory. + * This routine is for data and free blocks, not leaf/node blocks + * which are handled by xfs_da_grow_inode. + */ +int +xfs_dir2_grow_inode( + xfs_da_args_t *args, + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ +{ + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + xfs_inode_t *dp; /* incore directory inode */ + int error; + int got; /* blocks actually mapped */ + int i; + xfs_bmbt_irec_t map; /* single structure for bmap */ + int mapi; /* mapping index */ + xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */ + xfs_mount_t *mp; + int nmap; /* number of bmap entries */ + xfs_trans_t *tp; + xfs_drfsbno_t nblks; + + trace_xfs_dir2_grow_inode(args, space); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + nblks = dp->i_d.di_nblocks; + /* + * Set lowest possible block in the space requested. + */ + bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); + count = mp->m_dirblkfsbs; + /* + * Find the first hole for our block. 
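+ * The search starts at the XFS_DIR2_*_SPACE offset set up above, so data,
+ * leaf and freeindex blocks end up in well-separated ranges of the file.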
+ */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) + return error; + nmap = 1; + ASSERT(args->firstblock != NULL); + /* + * Try mapping the new block contiguously (one extent). + */ + if ((error = xfs_bmapi(tp, dp, bno, count, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) + return error; + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * Didn't work and this is a multiple-fsb directory block. + * Try again with contiguous flag turned on. + */ + else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; /* current file offset */ + + /* + * Space for maximum number of mappings. + */ + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + /* + * Iterate until we get to the end of our block. + */ + for (b = bno, mapi = 0; b < bno + count; ) { + int c; /* current fsb count */ + + /* + * Can't map more than MAX_NMAP at once. + */ + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp); + return error; + } + if (nmap < 1) + break; + /* + * Add this bunch into our table, go to the next offset. + */ + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } + /* + * Didn't work. + */ + else { + mapi = 0; + mapp = NULL; + } + /* + * See how many fsb's we got. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + /* + * Didn't get enough fsb's, or the first/last block's are wrong. + */ + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp); + return XFS_ERROR(ENOSPC); + } + /* + * Done with the temporary mapping table. + */ + if (mapp != &map) + kmem_free(mapp); + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + + /* + * Update file's size if this is the data space and it grew. + */ + if (space == XFS_DIR2_DATA_SPACE) { + xfs_fsize_t size; /* directory file (data) size */ + + size = XFS_FSB_TO_B(mp, bno + count); + if (size > dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * See if the directory is a single-block form directory. + */ +int +xfs_dir2_isblock( + xfs_trans_t *tp, + xfs_inode_t *dp, + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + xfs_mount_t *mp; + int rval; + + mp = dp->i_mount; + if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + return rval; + rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; + ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + *vp = rval; + return 0; +} + +/* + * See if the directory is a single-leaf form directory. + */ +int +xfs_dir2_isleaf( + xfs_trans_t *tp, + xfs_inode_t *dp, + int *vp) /* out: 1 is leaf, 0 is not leaf */ +{ + xfs_fileoff_t last; /* last file offset */ + xfs_mount_t *mp; + int rval; + + mp = dp->i_mount; + if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + return rval; + *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + return 0; +} + +/* + * Remove the given block from the directory. + * This routine is used for data and free blocks, leaf/node are done + * by xfs_da_shrink_inode. 
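+ * Once the block is unmapped and its buffer invalidated, the directory size
+ * is trimmed back if the block removed was the last data block in the file.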
+ */ +int +xfs_dir2_shrink_inode( + xfs_da_args_t *args, + xfs_dir2_db_t db, + xfs_dabuf_t *bp) +{ + xfs_fileoff_t bno; /* directory file offset */ + xfs_dablk_t da; /* directory file offset */ + int done; /* bunmap is finished */ + xfs_inode_t *dp; + int error; + xfs_mount_t *mp; + xfs_trans_t *tp; + + trace_xfs_dir2_shrink_inode(args, db); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + da = xfs_dir2_db_to_da(mp, db); + /* + * Unmap the fsblock(s). + */ + if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, + &done))) { + /* + * ENOSPC actually can happen if we're in a removename with + * no space reservation, and the resulting block removal + * would cause a bmap btree split or conversion from extents + * to btree. This can only happen for un-fragmented + * directory blocks, since you need to be punching out + * the middle of an extent. + * In this case we need to leave the block in the file, + * and not binval it. + * So the block has to be in a consistent empty state + * and appropriately logged. + * We don't free up the buffer, the caller can tell it + * hasn't happened since it got an error back. + */ + return error; + } + ASSERT(done); + /* + * Invalidate the buffer from the transaction. + */ + xfs_da_binval(tp, bp); + /* + * If it's not a data block, we're done. + */ + if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + return 0; + /* + * If the block isn't the last one in the directory, we're done. + */ + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) + return 0; + bno = da; + if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { + /* + * This can't really happen unless there's kernel corruption. + */ + return error; + } + if (db == mp->m_dirdatablk) + ASSERT(bno == 0); + else + ASSERT(bno > 0); + /* + * Set the size to the new last block. + */ + dp->i_d.di_size = XFS_FSB_TO_B(mp, bno); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h new file mode 100644 index 00000000..74a3b105 --- /dev/null +++ b/fs/xfs/xfs_dir2.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_H__ +#define __XFS_DIR2_H__ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_put_args; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Directory version 2. + * There are 4 possible formats: + * shortform + * single block - data with embedded leaf at the end + * multiple data blocks, single leaf+freeindex block + * data blocks, node&leaf blocks (btree), freeindex blocks + * + * The shortform format is in xfs_dir2_sf.h. + * The single block format is in xfs_dir2_block.h. + * The data block format is in xfs_dir2_data.h. + * The leaf and freeindex block formats are in xfs_dir2_leaf.h. 
+ * Node blocks are the same as the other version, in xfs_da_btree.h. + */ + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +extern struct xfs_name xfs_name_dotdot; + +/* + * Generic directory interface routines + */ +extern void xfs_dir_startup(void); +extern void xfs_dir_mount(struct xfs_mount *mp); +extern int xfs_dir_isempty(struct xfs_inode *dp); +extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_inode *pdp); +extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t *inum, + struct xfs_name *ci_name); +extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t ino, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, uint resblks); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + +/* + * Utility routines for v2 directories. + */ +extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, + int *vp); +extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, + int *vp); +extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_dabuf *bp); + +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); + +#endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c new file mode 100644 index 00000000..580d99ce --- /dev/null +++ b/fs/xfs/xfs_dir2_block.c @@ -0,0 +1,1233 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Local function prototypes. + */ +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, + int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, + int *entno); +static int xfs_dir2_block_sort(const void *a, const void *b); + +static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +/* + * One-time startup routine called from xfs_init(). + */ +void +xfs_dir_startup(void) +{ + xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); +} + +/* + * Add an entry to a block directory. + */ +int /* error */ +xfs_dir2_block_addname( + xfs_da_args_t *args) /* directory op arguments */ +{ + xfs_dir2_data_free_t *bf; /* bestfree table in block */ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + int compact; /* need to compact leaf ents */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* directory inode */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + int error; /* error return value */ + xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */ + xfs_dahash_t hash; /* hash value of found entry */ + int high; /* high index for binary srch */ + int highstale; /* high stale index */ + int lfloghigh=0; /* last final leaf to log */ + int lfloglow=0; /* first final leaf to log */ + int len; /* length of the new entry */ + int low; /* low index for binary srch */ + int lowstale; /* low stale index */ + int mid=0; /* midpoint for binary srch */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log header */ + int needscan; /* need to rescan freespace */ + __be16 *tagp; /* pointer to tag value */ + xfs_trans_t *tp; /* transaction structure */ + + trace_xfs_dir2_block_addname(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the (one and only) directory block into dabuf bp. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + block = bp->data; + /* + * Check the magic number, corrupted if wrong. + */ + if (unlikely(be32_to_cpu(block->hdr.magic) != XFS_DIR2_BLOCK_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", + XFS_ERRLEVEL_LOW, mp, block); + xfs_da_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + len = xfs_dir2_data_entsize(args->namelen); + /* + * Set up pointers to parts of the block. + */ + bf = block->hdr.bestfree; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * No stale entries? 
Need space for entry and new leaf. + */ + if (!btp->stale) { + /* + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + /* + * Data object just before the first leaf entry. + */ + enddup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) + dup = enddup = NULL; + /* + * Check out the biggest freespace and see if it's the same one. + */ + else { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + be16_to_cpu(bf[0].offset)); + if (dup == enddup) { + /* + * It is the biggest freespace, is it too small + * to hold the new leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, we use the second-largest + * entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)block + + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } else { + /* + * Not the same free entry, + * just check its length. + */ + if (be16_to_cpu(dup->length) < len) { + dup = NULL; + } + } + } + compact = 0; + } + /* + * If there are stale entries we'll use one for the leaf. + * Is the biggest entry enough to avoid compaction? + */ + else if (be16_to_cpu(bf[0].length) >= len) { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + be16_to_cpu(bf[0].offset)); + compact = 0; + } + /* + * Will need to compact to make this work. + */ + else { + /* + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + /* + * Data object just before the first leaf entry. + */ + dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + compact = 1; + } + /* + * If this isn't a real add, we're done with the buffer. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + xfs_da_brelse(tp, bp); + /* + * If we don't have space for the new entry & leaf ... + */ + if (!dup) { + /* + * Not trying to actually do anything, or don't have + * a space reservation: return no-space. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return XFS_ERROR(ENOSPC); + /* + * Convert to the next larger format. + * Then add the new entry in that format. + */ + error = xfs_dir2_block_to_leaf(args, bp); + xfs_da_buf_done(bp); + if (error) + return error; + return xfs_dir2_leaf_addname(args); + } + /* + * Just checking, and it would work, so say so. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + needlog = needscan = 0; + /* + * If need to compact the leaf entries, do it now. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. 
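+ * One stale entry is kept because the insertion below reuses a stale slot
+ * for the new leaf entry instead of growing the leaf table again.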
+ */ + if (compact) { + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + + for (fromidx = toidx = be32_to_cpu(btp->count) - 1, + highstale = lfloghigh = -1; + fromidx >= 0; + fromidx--) { + if (be32_to_cpu(blp[fromidx].address) == XFS_DIR2_NULL_DATAPTR) { + if (highstale == -1) + highstale = toidx; + else { + if (lfloghigh == -1) + lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + &needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + needscan = 0; + } + } + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ + else if (btp->stale) { + lfloglow = be32_to_cpu(btp->count); + lfloghigh = -1; + } + /* + * Find the slot that's first lower than our hash value, -1 if none. + */ + for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + } + while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) { + mid--; + } + /* + * No stale entries, will use enddup space to hold new leaf. + */ + if (!btp->stale) { + /* + * Mark the space needed for the new leaf entry, now in use. + */ + xfs_dir2_data_use_free(tp, bp, enddup, + (xfs_dir2_data_aoff_t) + ((char *)enddup - (char *)block + be16_to_cpu(enddup->length) - + sizeof(*blp)), + (xfs_dir2_data_aoff_t)sizeof(*blp), + &needlog, &needscan); + /* + * Update the tail (entry count). + */ + be32_add_cpu(&btp->count, 1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, + &needlog); + needscan = 0; + } + /* + * Adjust pointer to the first leaf entry, we're about to move + * the table up one to open up space for the new leaf entry. + * Then adjust our index to match. + */ + blp--; + mid++; + if (mid) + memmove(blp, &blp[1], mid * sizeof(*blp)); + lfloglow = 0; + lfloghigh = mid; + } + /* + * Use a stale leaf for our new entry. + */ + else { + for (lowstale = mid; + lowstale >= 0 && + be32_to_cpu(blp[lowstale].address) != XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + for (highstale = mid + 1; + highstale < be32_to_cpu(btp->count) && + be32_to_cpu(blp[highstale].address) != XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || mid - lowstale > highstale - mid); + highstale++) + continue; + /* + * Move entries toward the low-numbered stale entry. + */ + if (lowstale >= 0 && + (highstale == be32_to_cpu(btp->count) || + mid - lowstale <= highstale - mid)) { + if (mid - lowstale) + memmove(&blp[lowstale], &blp[lowstale + 1], + (mid - lowstale) * sizeof(*blp)); + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(mid, lfloghigh); + } + /* + * Move entries toward the high-numbered stale entry. 
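+ * (Whichever stale slot sits closer to the insertion point is used, so the
+ * memmove shifts as few leaf entries as possible.)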
+ */ + else { + ASSERT(highstale < be32_to_cpu(btp->count)); + mid++; + if (highstale - mid) + memmove(&blp[mid + 1], &blp[mid], + (highstale - mid) * sizeof(*blp)); + lfloglow = MIN(mid, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be32_add_cpu(&btp->stale, -1); + } + /* + * Point to the new data entry. + */ + dep = (xfs_dir2_data_entry_t *)dup; + /* + * Fill in the leaf entry. + */ + blp[mid].hashval = cpu_to_be32(args->hashval); + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); + /* + * Mark space for the data entry used. + */ + xfs_dir2_data_use_free(tp, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + /* + * Create the new data entry. + */ + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, args->namelen); + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + /* + * Clean up the bestfree array and log the header, tail, and entry. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} + +/* + * Readdir for block directories. + */ +int /* error */ +xfs_dir2_block_getdents( + xfs_inode_t *dp, /* incore inode */ + void *dirent, + xfs_off_t *offset, + filldir_t filldir) +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dabuf_t *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + + mp = dp->i_mount; + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + return 0; + } + /* + * Can't read the block, give up, else get dabuf in bp. + */ + error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, + &bp, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(bp != NULL); + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(mp, *offset); + block = bp->data; + xfs_dir2_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = xfs_dir2_block_tail_p(mp, block); + ptr = (char *)block->u; + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += xfs_dir2_data_entsize(dep->namelen); + /* + * The entry is before the desired starting point, skip it. 
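+ * The caller's offset is a dataptr cookie, so a restart resumes at the
+ * first entry at or beyond the saved block offset.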
+ */ + if ((char *)dep - (char *)block < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + (char *)dep - (char *)block); + + /* + * If it didn't fit, set the final offset to here & return. + */ + if (filldir(dirent, (char *)dep->name, dep->namelen, + cook & 0x7fffffff, be64_to_cpu(dep->inumber), + DT_UNKNOWN)) { + *offset = cook & 0x7fffffff; + xfs_da_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. + */ + *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + xfs_da_brelse(NULL, bp); + return 0; +} + +/* + * Log leaf entries from the block. + */ +static void +xfs_dir2_block_log_leaf( + xfs_trans_t *tp, /* transaction structure */ + xfs_dabuf_t *bp, /* block buffer */ + int first, /* index of first logged leaf */ + int last) /* index of last logged leaf */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), + (uint)((char *)&blp[last + 1] - (char *)block - 1)); +} + +/* + * Log the block tail. + */ +static void +xfs_dir2_block_log_tail( + xfs_trans_t *tp, /* transaction structure */ + xfs_dabuf_t *bp) /* block buffer */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), + (uint)((char *)(btp + 1) - (char *)block - 1)); +} + +/* + * Look up an entry in the block. This is the external routine, + * xfs_dir2_block_lookup_int does the real work. + */ +int /* error */ +xfs_dir2_block_lookup( + xfs_da_args_t *args) /* dir lookup arguments */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + trace_xfs_dir2_block_lookup(args); + + /* + * Get the buffer, look up the entry. + * If not found (ENOENT) then return, have no buffer. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) + return error; + dp = args->dp; + mp = dp->i_mount; + block = bp->data; + xfs_dir2_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Get the offset from the leaf entry, to point to the data. + */ + dep = (xfs_dir2_data_entry_t *)((char *)block + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + /* + * Fill in inode number, CI name if appropriate, release the block. + */ + args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_da_brelse(args->trans, bp); + return XFS_ERROR(error); +} + +/* + * Internal block lookup routine. 
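+ * On success the block buffer is handed back in *bpp and *entno is the index
+ * of the matching leaf entry; on ENOENT the buffer is released before return.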
+ */ +static int /* error */ +xfs_dir2_block_lookup_int( + xfs_da_args_t *args, /* dir lookup arguments */ + xfs_dabuf_t **bpp, /* returned block buffer */ + int *entno) /* returned entry number */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int error; /* error return value */ + xfs_dahash_t hash; /* found hash value */ + int high; /* binary search high index */ + int low; /* binary search low index */ + int mid; /* binary search current idx */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the buffer, return error if we can't get it. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + block = bp->data; + xfs_dir2_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Loop doing a binary search for our hash value. + * Find our entry, ENOENT if it's not there. + */ + for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) { + ASSERT(low <= high); + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + if (low > high) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + xfs_da_brelse(tp, bp); + return XFS_ERROR(ENOENT); + } + } + /* + * Back up to the first one with the right hash value. + */ + while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) { + mid--; + } + /* + * Now loop forward through all the entries with the + * right hash value looking for our name. + */ + do { + if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get pointer to the entry from the leaf. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *bpp = bp; + *entno = mid; + if (cmp == XFS_CMP_EXACT) + return 0; + } + } while (++mid < be32_to_cpu(btp->count) && + be32_to_cpu(blp[mid].hashval) == hash); + + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was found earlier, return success. + */ + if (args->cmpresult == XFS_CMP_CASE) + return 0; + /* + * No match, release the buffer and return ENOENT. + */ + xfs_da_brelse(tp, bp); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a block format directory. + * If that makes the block small enough to fit in shortform, transform it. 
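+ * "Small enough" is judged by xfs_dir2_block_sfsize() against the space
+ * available in the inode's data fork (XFS_IFORK_DSIZE).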
+ */ +int /* error */ +xfs_dir2_block_removename( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* block leaf entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to fixup bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* shortform size */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_block_removename(args); + + /* + * Look up the entry in the block. Gets the buffer and entry index. + * It will always be there, the vnodeops level does a lookup first. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Point to the data entry using the leaf entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + /* + * Mark the data entry's space free. + */ + needlog = needscan = 0; + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + /* + * Fix up the block tail. + */ + be32_add_cpu(&btp->stale, 1); + xfs_dir2_block_log_tail(tp, bp); + /* + * Remove the leaf entry by marking it stale. + */ + blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir2_block_log_leaf(tp, bp, ent, ent); + /* + * Fix up bestfree, log the header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_check(dp, bp); + /* + * See if the size as a shortform is good enough. + */ + if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > + XFS_IFORK_DSIZE(dp)) { + xfs_da_buf_done(bp); + return 0; + } + /* + * If it works, do the conversion. + */ + return xfs_dir2_block_to_sf(args, bp, size, &sfh); +} + +/* + * Replace an entry in a V2 block directory. + * Change the inode number to the new value. + */ +int /* error */ +xfs_dir2_block_replace( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* leaf entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + trace_xfs_dir2_block_replace(args); + + /* + * Lookup the entry in the directory. Get buffer and entry index. + * This will always succeed since the caller has already done a lookup. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + mp = dp->i_mount; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Point to the data entry we need to change. 
+ */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + ASSERT(be64_to_cpu(dep->inumber) != args->inumber); + /* + * Change the inode number to the new value. + */ + dep->inumber = cpu_to_be64(args->inumber); + xfs_dir2_data_log_entry(args->trans, bp, dep); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} + +/* + * Qsort comparison routine for the block leaf entries. + */ +static int /* sort order */ +xfs_dir2_block_sort( + const void *a, /* first leaf entry */ + const void *b) /* second leaf entry */ +{ + const xfs_dir2_leaf_entry_t *la; /* first leaf entry */ + const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */ + + la = a; + lb = b; + return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 : + (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0); +} + +/* + * Convert a V2 leaf directory to a V2 block directory if possible. + */ +int /* error */ +xfs_dir2_leaf_to_block( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp, /* leaf buffer */ + xfs_dabuf_t *dbp) /* data buffer */ +{ + __be16 *bestsp; /* leaf bests table */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + int error; /* error return value */ + int from; /* leaf from index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* file system mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to scan for bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* bytes used */ + __be16 *tagp; /* end of entry (tag) */ + int to; /* block/leaf to index */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_to_block(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + /* + * If there are data blocks other than the first one, take this + * opportunity to remove trailing empty data blocks that may have + * been left behind during no-space-reservation operations. + * These will show up in the leaf bests table. + */ + while (dp->i_d.di_size > mp->m_dirblksize) { + bestsp = xfs_dir2_leaf_bests_p(ltp); + if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == + mp->m_dirblksize - (uint)sizeof(block->hdr)) { + if ((error = + xfs_dir2_leaf_trim_data(args, lbp, + (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) + goto out; + } else { + error = 0; + goto out; + } + } + /* + * Read the data block if we don't already have it, give up if it fails. + */ + if (dbp == NULL && + (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, + XFS_DATA_FORK))) { + goto out; + } + block = dbp->data; + ASSERT(be32_to_cpu(block->hdr.magic) == XFS_DIR2_DATA_MAGIC); + /* + * Size of the "leaf" area in the block. + */ + size = (uint)sizeof(block->tail) + + (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + /* + * Look at the last data entry. + */ + tagp = (__be16 *)((char *)block + mp->m_dirblksize) - 1; + dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + /* + * If it's not free or is too short we can't do it. 
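+ * Bailing out here is not an error: the directory simply stays in leaf form
+ * and the caller carries on.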
+ */ + if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || + be16_to_cpu(dup->length) < size) { + error = 0; + goto out; + } + /* + * Start converting it to block form. + */ + block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + needlog = 1; + needscan = 0; + /* + * Use up the space at the end of the block (blp/btp). + */ + xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size, + &needlog, &needscan); + /* + * Initialize the block tail. + */ + btp = xfs_dir2_block_tail_p(mp, block); + btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp->stale = 0; + xfs_dir2_block_log_tail(tp, dbp); + /* + * Initialize the block leaf area. We compact out stale entries. + */ + lep = xfs_dir2_block_leaf_p(btp); + for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + continue; + lep[to++] = leaf->ents[from]; + } + ASSERT(to == be32_to_cpu(btp->count)); + xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); + /* + * Scan the bestfree if we need it and log the data block header. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * Pitch the old leaf block. + */ + error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); + lbp = NULL; + if (error) { + goto out; + } + /* + * Now see if the resulting block can be shrunken to shortform. + */ + if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > + XFS_IFORK_DSIZE(dp)) { + error = 0; + goto out; + } + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); +out: + if (lbp) + xfs_da_buf_done(lbp); + if (dbp) + xfs_da_buf_done(dbp); + return error; +} + +/* + * Convert the shortform directory to block form. + */ +int /* error */ +xfs_dir2_sf_to_block( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_db_t blkno; /* dir-relative block # (0) */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + char *buf; /* sf buffer */ + int buf_len; + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + int dummy; /* trash */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int endoffset; /* end of data objects */ + int error; /* error return value */ + int i; /* index */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to scan block freespc */ + int newoffset; /* offset from current entry */ + int offset; /* target block offset */ + xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_name name; + + trace_xfs_dir2_sf_to_block(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bomb out if the shortform directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Copy the directory into the stack buffer. 
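+ * (Despite the name, the copy below is made into a kmem_alloc()ed buffer
+ * rather than onto the stack.)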
+ * Then pitch the incore inode data so we can make extents. + */ + + buf_len = dp->i_df.if_bytes; + buf = kmem_alloc(buf_len, KM_SLEEP); + + memcpy(buf, sfp, buf_len); + xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK); + dp->i_d.di_size = 0; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + /* + * Reset pointer - old sfp is gone. + */ + sfp = (xfs_dir2_sf_t *)buf; + /* + * Add block 0 to the inode. + */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); + if (error) { + kmem_free(buf); + return error; + } + /* + * Initialize the data block. + */ + error = xfs_dir2_data_init(args, blkno, &bp); + if (error) { + kmem_free(buf); + return error; + } + block = bp->data; + block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + /* + * Compute size of block "tail" area. + */ + i = (uint)sizeof(*btp) + + (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); + /* + * The whole thing is initialized to free by the init routine. + * Say we're using the leaf and tail area. + */ + dup = (xfs_dir2_data_unused_t *)block->u; + needlog = needscan = 0; + xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, + &needscan); + ASSERT(needscan == 0); + /* + * Fill in the tail. + */ + btp = xfs_dir2_block_tail_p(mp, block); + btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ + btp->stale = 0; + blp = xfs_dir2_block_leaf_p(btp); + endoffset = (uint)((char *)blp - (char *)block); + /* + * Remove the freespace, we'll manage it. + */ + xfs_dir2_data_use_free(tp, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + be16_to_cpu(dup->length), &needlog, &needscan); + /* + * Create entry for . + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATA_DOT_OFFSET); + dep->inumber = cpu_to_be64(dp->i_ino); + dep->namelen = 1; + dep->name[0] = '.'; + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + xfs_dir2_data_log_entry(tp, bp, dep); + blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + /* + * Create entry for .. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); + dep->namelen = 2; + dep->name[0] = dep->name[1] = '.'; + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + xfs_dir2_data_log_entry(tp, bp, dep); + blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + offset = XFS_DIR2_DATA_FIRST_OFFSET; + /* + * Loop over existing entries, stuff them in. + */ + if ((i = 0) == sfp->hdr.count) + sfep = NULL; + else + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Need to preserve the existing offset values in the sf directory. + * Insert holes (unused entries) where necessary. + */ + while (offset < endoffset) { + /* + * sfep is null when we reach the end of the list. + */ + if (sfep == NULL) + newoffset = endoffset; + else + newoffset = xfs_dir2_sf_get_offset(sfep); + /* + * There should be a hole here, make one. 
+ */ + if (offset < newoffset) { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + offset); + dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + dup->length = cpu_to_be16(newoffset - offset); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( + ((char *)dup - (char *)block)); + xfs_dir2_data_log_unused(tp, bp, dup); + (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, + dup, &dummy); + offset += be16_to_cpu(dup->length); + continue; + } + /* + * Copy a real entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep))); + dep->namelen = sfep->namelen; + memcpy(dep->name, sfep->name, dep->namelen); + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + xfs_dir2_data_log_entry(tp, bp, dep); + name.name = sfep->name; + name.len = sfep->namelen; + blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> + hashname(&name)); + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + offset = (int)((char *)(tagp + 1) - (char *)block); + if (++i == sfp->hdr.count) + sfep = NULL; + else + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + /* Done with the temporary buffer */ + kmem_free(buf); + /* + * Sort the leaf entries by hash value. + */ + xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort); + /* + * Log the leaf entry area and tail. + * Already logged the header in data_init, ignore needlog. + */ + ASSERT(needscan == 0); + xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h new file mode 100644 index 00000000..10e68967 --- /dev/null +++ b/fs/xfs/xfs_dir2_block.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_BLOCK_H__ +#define __XFS_DIR2_BLOCK_H__ + +/* + * xfs_dir2_block.h + * Directory version 2, single block format structures + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_data_hdr; +struct xfs_dir2_leaf_entry; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * The single block format is as follows: + * xfs_dir2_data_hdr_t structure + * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures + * xfs_dir2_leaf_entry_t structures + * xfs_dir2_block_tail_t structure + */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Generic single-block structure, for xfs_db. 
+ */ +typedef struct xfs_dir2_block { + xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_union_t u[1]; + xfs_dir2_leaf_entry_t leaf[1]; + xfs_dir2_block_tail_t tail; +} xfs_dir2_block_t; + +/* + * Pointer to the leaf header embedded in a data block (1-block format) + */ +static inline xfs_dir2_block_tail_t * +xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) +{ + return (((xfs_dir2_block_tail_t *) + ((char *)(block) + (mp)->m_dirblksize)) - 1); +} + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + +/* + * Function declarations. + */ +extern int xfs_dir2_block_addname(struct xfs_da_args *args); +extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_block_lookup(struct xfs_da_args *args); +extern int xfs_dir2_block_removename(struct xfs_da_args *args); +extern int xfs_dir2_block_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, + struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); +extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_BLOCK_H__ */ diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c new file mode 100644 index 00000000..921595b8 --- /dev/null +++ b/fs/xfs/xfs_dir2_data.c @@ -0,0 +1,833 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_error.h" + +#ifdef DEBUG +/* + * Check the consistency of the data block. + * The input can also be a block-format directory. + * Pop an assert if we find anything bad. 
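The two inline helpers above carry the whole trick of the single-block format: the tail is found by backing up one tail-sized structure from the end of the directory block, and the leaf array by backing up btp->count entries from the tail. A minimal userspace sketch of that pointer arithmetic, using simplified host-endian stand-ins (blk_tail and leaf_ent are invented names, not the on-disk types, and the 4k block size is only illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented, host-endian stand-ins for the on-disk tail and leaf entry. */
struct blk_tail { uint32_t count; uint32_t stale; };
struct leaf_ent { uint32_t hashval; uint32_t address; };

int main(void)
{
	size_t dirblksize = 4096;		/* one directory block, illustrative */
	char *block = calloc(1, dirblksize);

	if (!block)
		return 1;
	/* The tail is the last structure in the block... */
	struct blk_tail *btp = (struct blk_tail *)(block + dirblksize) - 1;
	btp->count = 5;

	/* ...and the leaf array ends where the tail begins, count entries long. */
	struct leaf_ent *lep = (struct leaf_ent *)btp - btp->count;

	printf("tail at offset %ld, leaf array at offset %ld\n",
	       (long)((char *)btp - block), (long)((char *)lep - block));
	free(block);
	return 0;
}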
+ */ +void +xfs_dir2_data_check( + xfs_inode_t *dp, /* incore inode pointer */ + xfs_dabuf_t *bp) /* data block's buffer */ +{ + xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ + xfs_dir2_data_free_t *bf; /* bestfree table */ + xfs_dir2_block_tail_t *btp=NULL; /* block tail */ + int count; /* count of entries found */ + xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + char *endp; /* end of useful data */ + int freeseen; /* mask of bestfrees seen */ + xfs_dahash_t hash; /* hash of current name */ + int i; /* leaf index */ + int lastfree; /* last entry was unused */ + xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ + xfs_mount_t *mp; /* filesystem mount point */ + char *p; /* current data position */ + int stale; /* count of stale leaves */ + struct xfs_name name; + + mp = dp->i_mount; + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + bf = d->hdr.bestfree; + p = (char *)d->u; + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + lep = xfs_dir2_block_leaf_p(btp); + endp = (char *)lep; + } else + endp = (char *)d + mp->m_dirblksize; + count = lastfree = freeseen = 0; + /* + * Account for zero bestfree entries. + */ + if (!bf[0].length) { + ASSERT(!bf[0].offset); + freeseen |= 1 << 0; + } + if (!bf[1].length) { + ASSERT(!bf[1].offset); + freeseen |= 1 << 1; + } + if (!bf[2].length) { + ASSERT(!bf[2].offset); + freeseen |= 1 << 2; + } + ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); + ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + /* + * Loop over the data/unused entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's unused, look for the space in the bestfree table. + * If we find it, account for that, else make sure it + * doesn't need to be there. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT(lastfree == 0); + ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)d); + dfp = xfs_dir2_data_freefind(d, dup); + if (dfp) { + i = (int)(dfp - bf); + ASSERT((freeseen & (1 << i)) == 0); + freeseen |= 1 << i; + } else { + ASSERT(be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); + } + p += be16_to_cpu(dup->length); + lastfree = 1; + continue; + } + /* + * It's a real entry. Validate the fields. + * If this is a block directory then make sure it's + * in the leaf section of the block. + * The linear search is crude but this is DEBUG code. + */ + dep = (xfs_dir2_data_entry_t *)p; + ASSERT(dep->namelen != 0); + ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); + ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)d); + count++; + lastfree = 0; + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)d)); + name.name = dep->name; + name.len = dep->namelen; + hash = mp->m_dirnameops->hashname(&name); + for (i = 0; i < be32_to_cpu(btp->count); i++) { + if (be32_to_cpu(lep[i].address) == addr && + be32_to_cpu(lep[i].hashval) == hash) + break; + } + ASSERT(i < be32_to_cpu(btp->count)); + } + p += xfs_dir2_data_entsize(dep->namelen); + } + /* + * Need to have seen all the entries and all the bestfree slots. 
+ */ + ASSERT(freeseen == 7); + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { + if (be32_to_cpu(lep[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + if (i > 0) + ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + } + ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + ASSERT(stale == be32_to_cpu(btp->stale)); + } +} +#endif + +/* + * Given a data block and an unused entry from that block, + * return the bestfree entry if any that corresponds to it. + */ +xfs_dir2_data_free_t * +xfs_dir2_data_freefind( + xfs_dir2_data_t *d, /* data block */ + xfs_dir2_data_unused_t *dup) /* data unused entry */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_aoff_t off; /* offset value needed */ +#if defined(DEBUG) && defined(__KERNEL__) + int matched; /* matched the value */ + int seenzero; /* saw a 0 bestfree entry */ +#endif + + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d); +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Validate some consistency in the bestfree table. + * Check order, non-overlapping entries, and if we find the + * one we're looking for it has to be exact. + */ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0; + dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) { + ASSERT(!dfp->length); + seenzero = 1; + continue; + } + ASSERT(seenzero == 0); + if (be16_to_cpu(dfp->offset) == off) { + matched = 1; + ASSERT(dfp->length == dup->length); + } else if (off < be16_to_cpu(dfp->offset)) + ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset)); + else + ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); + ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); + if (dfp > &d->hdr.bestfree[0]) + ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); + } +#endif + /* + * If this is smaller than the smallest bestfree entry, + * it can't be there since they're sorted. + */ + if (be16_to_cpu(dup->length) < + be16_to_cpu(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return NULL; + /* + * Look at the three bestfree entries for our guy. + */ + for (dfp = &d->hdr.bestfree[0]; + dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) + return NULL; + if (be16_to_cpu(dfp->offset) == off) + return dfp; + } + /* + * Didn't find it. This only happens if there are duplicate lengths. + */ + return NULL; +} + +/* + * Insert an unused-space entry into the bestfree table. + */ +xfs_dir2_data_free_t * /* entry inserted */ +xfs_dir2_data_freeinsert( + xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_unused_t *dup, /* unused space */ + int *loghead) /* log the data header (out) */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ + xfs_dir2_data_free_t new; /* new bestfree entry */ + +#ifdef __KERNEL__ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); +#endif + dfp = d->hdr.bestfree; + new.length = dup->length; + new.offset = cpu_to_be16((char *)dup - (char *)d); + /* + * Insert at position 0, 1, or 2; or not at all. 
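The insert routine that follows keeps only the three largest free spans, sorted by descending length, and silently drops anything smaller than the current third slot. A small standalone sketch of that policy (host-endian values and an invented struct name, not the kernel routine itself):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified bestfree slot: offset and length of one free span. */
struct bestfree { uint16_t offset; uint16_t length; };

/* Keep the three largest free spans, sorted by descending length. */
static struct bestfree *bestfree_insert(struct bestfree bf[3], struct bestfree nw)
{
	int i;

	for (i = 0; i < 3; i++) {
		if (nw.length > bf[i].length) {
			/* Slide the smaller slots down, dropping the last one. */
			memmove(&bf[i + 1], &bf[i], (2 - i) * sizeof(bf[0]));
			bf[i] = nw;
			return &bf[i];
		}
	}
	return NULL;	/* too small to be tracked */
}

int main(void)
{
	struct bestfree bf[3] = { {16, 200}, {400, 120}, {0, 0} };
	struct bestfree nw = { 900, 160 };

	bestfree_insert(bf, nw);
	for (int i = 0; i < 3; i++)
		printf("slot %d: off %u len %u\n", i, bf[i].offset, bf[i].length);
	return 0;
}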
+ */ + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) { + dfp[2] = dfp[1]; + dfp[1] = dfp[0]; + dfp[0] = new; + *loghead = 1; + return &dfp[0]; + } + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) { + dfp[2] = dfp[1]; + dfp[1] = new; + *loghead = 1; + return &dfp[1]; + } + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) { + dfp[2] = new; + *loghead = 1; + return &dfp[2]; + } + return NULL; +} + +/* + * Remove a bestfree entry from the table. + */ +STATIC void +xfs_dir2_data_freeremove( + xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ + int *loghead) /* out: log data header */ +{ +#ifdef __KERNEL__ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); +#endif + /* + * It's the first entry, slide the next 2 up. + */ + if (dfp == &d->hdr.bestfree[0]) { + d->hdr.bestfree[0] = d->hdr.bestfree[1]; + d->hdr.bestfree[1] = d->hdr.bestfree[2]; + } + /* + * It's the second entry, slide the 3rd entry up. + */ + else if (dfp == &d->hdr.bestfree[1]) + d->hdr.bestfree[1] = d->hdr.bestfree[2]; + /* + * Must be the last entry. + */ + else + ASSERT(dfp == &d->hdr.bestfree[2]); + /* + * Clear the 3rd entry, must be zero now. + */ + d->hdr.bestfree[2].length = 0; + d->hdr.bestfree[2].offset = 0; + *loghead = 1; +} + +/* + * Given a data block, reconstruct its bestfree map. + */ +void +xfs_dir2_data_freescan( + xfs_mount_t *mp, /* filesystem mount point */ + xfs_dir2_data_t *d, /* data block pointer */ + int *loghead) /* out: log data header */ +{ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* active data entry */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + char *endp; /* end of block's data */ + char *p; /* current entry pointer */ + +#ifdef __KERNEL__ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); +#endif + /* + * Start by clearing the table. + */ + memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree)); + *loghead = 1; + /* + * Set up pointers. + */ + p = (char *)d->u; + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endp = (char *)xfs_dir2_block_leaf_p(btp); + } else + endp = (char *)d + mp->m_dirblksize; + /* + * Loop over the block's entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's a free entry, insert it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT((char *)dup - (char *)d == + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + xfs_dir2_data_freeinsert(d, dup, loghead); + p += be16_to_cpu(dup->length); + } + /* + * For active entries, check their tags and skip them. + */ + else { + dep = (xfs_dir2_data_entry_t *)p; + ASSERT((char *)dep - (char *)d == + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); + p += xfs_dir2_data_entsize(dep->namelen); + } + } +} + +/* + * Initialize a data block at the given block number in the directory. + * Give back the buffer for the created block. 
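As background for the init routine below: a freshly initialized data block is just the header followed by one unused span covering the rest of the block, recorded both in bestfree[0] and in the span's own trailing tag word. A tiny sketch of that arithmetic, assuming an illustrative 4k block and a 16-byte header (not taken from the patch):

#include <stdio.h>

int main(void)
{
	/* Illustrative sizes: 4k directory block, 16-byte header
	 * (magic plus three offset/length bestfree slots). */
	unsigned dirblksize = 4096;
	unsigned hdrsize    = 4 + 3 * 4;

	/* A new block is one unused span right after the header... */
	unsigned free_off = hdrsize;
	unsigned free_len = dirblksize - hdrsize;

	/* ...and the span's tag, in its last two bytes, points back at its start. */
	unsigned tag_off = free_off + free_len - 2;

	printf("bestfree[0] = {off %u, len %u}, tag at %u holds %u\n",
	       free_off, free_len, tag_off, free_off);
	return 0;
}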
+ */ +int /* error */ +xfs_dir2_data_init( + xfs_da_args_t *args, /* directory operation args */ + xfs_dir2_db_t blkno, /* logical dir block number */ + xfs_dabuf_t **bpp) /* output block buffer */ +{ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_data_t *d; /* pointer to block */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int error; /* error return value */ + int i; /* bestfree index */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + int t; /* temp */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Get the buffer set up for the block. + */ + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(bp != NULL); + /* + * Initialize the header. + */ + d = bp->data; + d->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + d->hdr.bestfree[0].offset = cpu_to_be16(sizeof(d->hdr)); + for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { + d->hdr.bestfree[i].length = 0; + d->hdr.bestfree[i].offset = 0; + } + /* + * Set up an unused entry for the block's body. + */ + dup = &d->u[0].unused; + dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + + t=mp->m_dirblksize - (uint)sizeof(d->hdr); + d->hdr.bestfree[0].length = cpu_to_be16(t); + dup->length = cpu_to_be16(t); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d); + /* + * Log it and return it. + */ + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_unused(tp, bp, dup); + *bpp = bp; + return 0; +} + +/* + * Log an active data entry from the block. + */ +void +xfs_dir2_data_log_entry( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_entry_t *dep) /* data entry pointer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), + (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - + (char *)d - 1)); +} + +/* + * Log a data block header. + */ +void +xfs_dir2_data_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* block buffer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d), + (uint)(sizeof(d->hdr) - 1)); +} + +/* + * Log a data unused entry. + */ +void +xfs_dir2_data_log_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_unused_t *dup) /* data unused pointer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + /* + * Log the first part of the unused entry. + */ + xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d), + (uint)((char *)&dup->length + sizeof(dup->length) - + 1 - (char *)d)); + /* + * Log the end (tag) of the unused entry. + */ + xfs_da_log_buf(tp, bp, + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d + + sizeof(xfs_dir2_data_off_t) - 1)); +} + +/* + * Make a byte range in the data block unused. + * Its current contents are unimportant. 
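The logging helpers above depend on the same invariant the next function exploits: every record, live or free, ends in a two-byte tag holding its own starting offset, so the record preceding any offset can be found by reading the two bytes just before that offset. A self-contained sketch of that lookup on a toy host-endian block (the constants and layout here are made up for illustration, not the on-disk encoding):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FREE_TAG 0xffff

int main(void)
{
	/* Toy block: a free span at offset 16 of length 48, followed by a
	 * range at offset 64 that we are about to free. */
	unsigned char blk[4096] = {0};
	uint16_t freetag = FREE_TAG, length = 48, tag = 16;

	memcpy(blk + 16, &freetag, 2);		/* unused-entry marker */
	memcpy(blk + 18, &length, 2);		/* span length */
	memcpy(blk + 16 + 48 - 2, &tag, 2);	/* tag = span's own offset */

	unsigned offset = 64;			/* start of range being freed */

	/* The two bytes just before 'offset' are the previous record's tag,
	 * so a single load tells us where that record starts. */
	uint16_t prev_tag, prev_marker;
	memcpy(&prev_tag, blk + offset - 2, 2);
	memcpy(&prev_marker, blk + prev_tag, 2);

	printf("previous record starts at %u, %s\n", prev_tag,
	       prev_marker == FREE_TAG ? "free: merge with it" : "in use");
	return 0;
}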
+ */ +void +xfs_dir2_data_make_free( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_aoff_t offset, /* starting byte offset */ + xfs_dir2_data_aoff_t len, /* length in bytes */ + int *needlogp, /* out: log header */ + int *needscanp) /* out: regen bestfree */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + char *endptr; /* end of data area */ + xfs_mount_t *mp; /* filesystem mount point */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *postdup; /* unused entry after us */ + xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + + mp = tp->t_mountp; + d = bp->data; + /* + * Figure out where the end of the data area is. + */ + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC) + endptr = (char *)d + mp->m_dirblksize; + else { + xfs_dir2_block_tail_t *btp; /* block tail */ + + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + } + /* + * If this isn't the start of the block, then back up to + * the previous entry and see if it's free. + */ + if (offset > sizeof(d->hdr)) { + __be16 *tagp; /* tag just before us */ + + tagp = (__be16 *)((char *)d + offset) - 1; + prevdup = (xfs_dir2_data_unused_t *)((char *)d + be16_to_cpu(*tagp)); + if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG) + prevdup = NULL; + } else + prevdup = NULL; + /* + * If this isn't the end of the block, see if the entry after + * us is free. + */ + if ((char *)d + offset + len < endptr) { + postdup = + (xfs_dir2_data_unused_t *)((char *)d + offset + len); + if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) + postdup = NULL; + } else + postdup = NULL; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * Previous and following entries are both free, + * merge everything into a single free entry. + */ + if (prevdup && postdup) { + xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ + + /* + * See if prevdup and/or postdup are in bestfree table. + */ + dfp = xfs_dir2_data_freefind(d, prevdup); + dfp2 = xfs_dir2_data_freefind(d, postdup); + /* + * We need a rescan unless there are exactly 2 free entries + * namely our two. Then we know what's happening, otherwise + * since the third bestfree is there, there might be more + * entries. + */ + needscan = (d->hdr.bestfree[2].length != 0); + /* + * Fix up the new big freespace. + */ + be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, prevdup); + if (!needscan) { + /* + * Has to be the case that entries 0 and 1 are + * dfp and dfp2 (don't know which is which), and + * entry 2 is empty. + * Remove entry 1 first then entry 0. + */ + ASSERT(dfp && dfp2); + if (dfp == &d->hdr.bestfree[1]) { + dfp = &d->hdr.bestfree[0]; + ASSERT(dfp2 == dfp); + dfp2 = &d->hdr.bestfree[1]; + } + xfs_dir2_data_freeremove(d, dfp2, needlogp); + xfs_dir2_data_freeremove(d, dfp, needlogp); + /* + * Now insert the new entry. + */ + dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp); + ASSERT(dfp == &d->hdr.bestfree[0]); + ASSERT(dfp->length == prevdup->length); + ASSERT(!dfp[1].length); + ASSERT(!dfp[2].length); + } + } + /* + * The entry before us is free, merge with it. 
+ */ + else if (prevdup) { + dfp = xfs_dir2_data_freefind(d, prevdup); + be16_add_cpu(&prevdup->length, len); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, prevdup); + /* + * If the previous entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else { + needscan = be16_to_cpu(prevdup->length) > + be16_to_cpu(d->hdr.bestfree[2].length); + } + } + /* + * The following entry is free, merge with it. + */ + else if (postdup) { + dfp = xfs_dir2_data_freefind(d, postdup); + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If the following entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else { + needscan = be16_to_cpu(newdup->length) > + be16_to_cpu(d->hdr.bestfree[2].length); + } + } + /* + * Neither neighbor is free. Make a new entry. + */ + else { + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(len); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + } + *needscanp = needscan; +} + +/* + * Take a byte range out of an existing unused space and make it un-free. + */ +void +xfs_dir2_data_use_free( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* data block buffer */ + xfs_dir2_data_unused_t *dup, /* unused entry */ + xfs_dir2_data_aoff_t offset, /* starting offset to use */ + xfs_dir2_data_aoff_t len, /* length to use */ + int *needlogp, /* out: need to log header */ + int *needscanp) /* out: need regen bestfree */ +{ + xfs_dir2_data_t *d; /* data block */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + int matchback; /* matches end of freespace */ + int matchfront; /* matches start of freespace */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ + int oldlen; /* old unused entry's length */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); + ASSERT(offset >= (char *)dup - (char *)d); + ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); + ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + /* + * Look up the entry in the bestfree table. + */ + dfp = xfs_dir2_data_freefind(d, dup); + oldlen = be16_to_cpu(dup->length); + ASSERT(dfp || oldlen <= be16_to_cpu(d->hdr.bestfree[2].length)); + /* + * Check for alignment with front and back of the entry. 
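The code that follows carves the requested range out of an existing free span, which always reduces to one of four cases depending on whether the carved range touches the front and/or the back of the span. A hedged standalone sketch of that classification (invented helper name, plain integers instead of on-disk offsets):

#include <stdio.h>

/* Classify carving [astart, astart+alen) out of the free span
 * [fstart, fstart+flen). Illustrative only. */
static const char *carve_case(unsigned fstart, unsigned flen,
			      unsigned astart, unsigned alen)
{
	int matchfront = (astart == fstart);
	int matchback  = (astart + alen == fstart + flen);

	if (matchfront && matchback)
		return "exact: drop the free span entirely";
	if (matchfront)
		return "front: shrink the span from the front";
	if (matchback)
		return "back: trim the allocation off the tail";
	return "middle: split into two smaller free spans";
}

int main(void)
{
	printf("%s\n", carve_case(64, 128, 64, 128));
	printf("%s\n", carve_case(64, 128, 64, 32));
	printf("%s\n", carve_case(64, 128, 160, 32));
	printf("%s\n", carve_case(64, 128, 96, 32));
	return 0;
}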
+ */ + matchfront = (char *)dup - (char *)d == offset; + matchback = (char *)dup + oldlen - (char *)d == offset + len; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * If we matched it exactly we just need to get rid of it from + * the bestfree table. + */ + if (matchfront && matchback) { + if (dfp) { + needscan = (d->hdr.bestfree[2].offset != 0); + if (!needscan) + xfs_dir2_data_freeremove(d, dfp, needlogp); + } + } + /* + * We match the first part of the entry. + * Make a new entry with the remaining freespace. + */ + else if (matchfront) { + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(oldlen - len); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + ASSERT(dfp != NULL); + ASSERT(dfp->length == newdup->length); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &d->hdr.bestfree[2]; + } + } + /* + * We match the last part of the entry. + * Trim the allocated space off the tail of the entry. + */ + else if (matchback) { + newdup = dup; + newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + ASSERT(dfp != NULL); + ASSERT(dfp->length == newdup->length); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &d->hdr.bestfree[2]; + } + } + /* + * Poking out the middle of an entry. + * Make two new entries. + */ + else { + newdup = dup; + newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); + *xfs_dir2_data_unused_tag_p(newdup2) = + cpu_to_be16((char *)newdup2 - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup2); + /* + * If the old entry was in the table, we need to scan + * if the 3rd entry was valid, since these entries + * are smaller than the old one. + * If we don't need to scan that means there were 1 or 2 + * entries in the table, and removing the old and adding + * the 2 new will work. 
+ */ + if (dfp) { + needscan = (d->hdr.bestfree[2].length != 0); + if (!needscan) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup, + needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup2, + needlogp); + } + } + } + *needscanp = needscan; +} diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h new file mode 100644 index 00000000..efbc290c --- /dev/null +++ b/fs/xfs/xfs_dir2_data.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_DATA_H__ +#define __XFS_DIR2_DATA_H__ + +/* + * Directory format 2, data block structures. + */ + +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_inode; +struct xfs_trans; + +/* + * Constants. + */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */ +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_DATA_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) + +/* + * Offsets of . and .. in data space (always block 0) + */ +#define XFS_DIR2_DATA_DOT_OFFSET \ + ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) +#define XFS_DIR2_DATA_DOTDOT_OFFSET \ + (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) +#define XFS_DIR2_DATA_FIRST_OFFSET \ + (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) + +/* + * Structures. + */ + +/* + * Describe a free area in the data block. + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * Always at the beginning of a directory-sized block. + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC */ + /* or XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * Active entry in a data block. Aligned to 8 bytes. + * Tag appears as the last 2 bytes. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[1]; /* name bytes, no null */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. Aligned to 8 bytes. + * Tag appears as the last 2 bytes. 
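The entsize helper defined a little further down rounds every record, name included, up to the 8-byte alignment the format requires, with the tag occupying the last two bytes of the rounded size. A quick standalone illustration of that rounding (the 8 + 1 header size mirrors the inumber and namelen fields above; the function name is made up):

#include <stdio.h>

/* On-disk size of an entry with an n-byte name: 8-byte inode number,
 * 1-byte name length, the name, a 2-byte tag, rounded up to 8 bytes. */
static int entsize(int namelen)
{
	int raw = 8 + 1 + namelen + 2;

	return (raw + 7) & ~7;
}

int main(void)
{
	for (int n = 1; n <= 16; n += 5)
		printf("namelen %2d -> entry size %d\n", n, entsize(n));
	return 0;
}

For a one-byte name this gives 16 bytes, which is why the "." entry occupies offsets 16 through 31 of block 0 and ".." starts at 32.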
+ */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +typedef union { + xfs_dir2_data_entry_t entry; + xfs_dir2_data_unused_t unused; +} xfs_dir2_data_union_t; + +/* + * Generic data block structure, for xfs_db. + */ +typedef struct xfs_dir2_data { + xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */ + xfs_dir2_data_union_t u[1]; +} xfs_dir2_data_t; + +/* + * Macros. + */ + +/* + * Size of a data entry. + */ +static inline int xfs_dir2_data_entsize(int n) +{ + return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ + (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); +} + +/* + * Pointer to an entry's tag word. + */ +static inline __be16 * +xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Function declarations. + */ +#ifdef DEBUG +extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +#else +#define xfs_dir2_data_check(dp,bp) +#endif +extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d, + xfs_dir2_data_unused_t *dup); +extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d, + xfs_dir2_data_unused_t *dup, int *loghead); +extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, + int *loghead); +extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_dabuf **bpp); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_entry_t *dep); +extern void xfs_dir2_data_log_header(struct xfs_trans *tp, + struct xfs_dabuf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_unused_t *dup); +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, + int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_unused_t *dup, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, + int *needscanp); + +#endif /* __XFS_DIR2_DATA_H__ */ diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c new file mode 100644 index 00000000..ae891223 --- /dev/null +++ b/fs/xfs/xfs_dir2_leaf.c @@ -0,0 +1,1881 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Local function declarations. + */ +#ifdef DEBUG +static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#else +#define xfs_dir2_leaf_check(dp, bp) +#endif +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, + int *indexp, xfs_dabuf_t **dbpp); +static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); +static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); + + +/* + * Convert a block form directory to a leaf form directory. + */ +int /* error */ +xfs_dir2_block_to_leaf( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *dbp) /* input block's buffer */ +{ + __be16 *bestsp; /* leaf's bestsp entries */ + xfs_dablk_t blkno; /* leaf block's bno */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */ + xfs_dir2_block_tail_t *btp; /* block's tail */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dabuf_t *lbp; /* leaf block's buffer */ + xfs_dir2_db_t ldb; /* leaf block's bno */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to rescan bestfree */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_block_to_leaf(args); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Add the leaf block to the inode. + * This interface will only put blocks in the leaf/node range. + * Since that's empty now, we'll get the root (block 0 in range). + */ + if ((error = xfs_da_grow_inode(args, &blkno))) { + return error; + } + ldb = xfs_dir2_da_to_db(mp, blkno); + ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + /* + * Initialize the leaf block, get a buffer for it. + */ + if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + return error; + } + ASSERT(lbp != NULL); + leaf = lbp->data; + block = dbp->data; + xfs_dir2_data_check(dp, dbp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Set the counts in the leaf header. + */ + leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); + leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + /* + * Could compact these but I think we always do the conversion + * after squeezing out stale entries. + */ + memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + needscan = 0; + needlog = 1; + /* + * Make the space formerly occupied by the leaf entries and block + * tail be free. 
+ */ + xfs_dir2_data_make_free(tp, dbp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize - + (char *)blp), + &needlog, &needscan); + /* + * Fix up the block header, make it a data block. + */ + block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + /* + * Set up leaf tail and bests table. + */ + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = cpu_to_be32(1); + bestsp = xfs_dir2_leaf_bests_p(ltp); + bestsp[0] = block->hdr.bestfree[0].length; + /* + * Log the data header and leaf bests table. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_leaf_check(dp, lbp); + xfs_dir2_data_check(dp, dbp); + xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_da_buf_done(lbp); + return 0; +} + +/* + * Add an entry to a leaf form directory. + */ +int /* error */ +xfs_dir2_leaf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* freespace table in leaf */ + int compact; /* need to compact leaves */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry */ + int error; /* error return value */ + int grown; /* allocated new data block */ + int highstale; /* index of next stale leaf */ + int i; /* temporary, index */ + int index; /* leaf table position */ + xfs_dabuf_t *lbp; /* leaf's buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + int lfloglow; /* low leaf logging index */ + int lfloghigh; /* high leaf logging index */ + int lowstale; /* index of prev stale leaf */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + xfs_mount_t *mp; /* filesystem mount point */ + int needbytes; /* leaf block bytes needed */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data free */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t use_block; /* data block number */ + + trace_xfs_dir2_leaf_addname(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the leaf block. + */ + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(lbp != NULL); + /* + * Look up the entry by hash value and name. + * We know it's not there, our caller has already done a lookup. + * So the index is of the entry to insert in front of. + * But if there are dup hash values the index is of the first of those. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + leaf = lbp->data; + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = xfs_dir2_data_entsize(args->namelen); + /* + * See if there are any entries with the same hash value + * and space in their block for the new entry. + * This is good because it puts multiple same-hash value entries + * in a data block, improving the lookup of those entries. 
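The search that follows works off the bests array kept in the leaf tail: one 16-bit value per data block recording that block's longest free span, with a sentinel marking a hole where a data block has been freed. A rough sketch of the selection policy, assuming a plain array and an invented NULLOFF sentinel (not the kernel's constant):

#include <stdio.h>

#define NULLOFF 0xffffu	/* stand-in for "no data block here" */

/* Pick a data block for an entry of 'length' bytes: take the first block
 * whose longest free span is big enough, remembering a hole as a fallback. */
static int choose_block(const unsigned short *bests, int nblocks, int length)
{
	int use_block = -1;

	for (int i = 0; i < nblocks; i++) {
		if (bests[i] == NULLOFF && use_block == -1)
			use_block = i;		/* hole: could allocate here */
		else if (bests[i] != NULLOFF && bests[i] >= length)
			return i;		/* enough contiguous free space */
	}
	return use_block;			/* -1 means grow the directory */
}

int main(void)
{
	unsigned short bests[] = { 24, NULLOFF, 120, 56 };

	printf("entry of 64 bytes -> block %d\n", choose_block(bests, 4, 64));
	printf("entry of 200 bytes -> block %d\n", choose_block(bests, 4, 200));
	return 0;
}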
+ */ + for (use_block = -1, lep = &leaf->ents[index]; + index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + index++, lep++) { + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + ASSERT(i < be32_to_cpu(ltp->bestcount)); + ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); + if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + /* + * Didn't find a block yet, linear search all the data blocks. + */ + if (use_block == -1) { + for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) { + /* + * Remember a block we see that's missing. + */ + if (be16_to_cpu(bestsp[i]) == NULLDATAOFF && use_block == -1) + use_block = i; + else if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + } + /* + * How many bytes do we need in the leaf block? + */ + needbytes = + (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) + + (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0])); + /* + * Now kill use_block if it refers to a missing block, so we + * can use it as an indication of allocation needed. + */ + if (use_block != -1 && be16_to_cpu(bestsp[use_block]) == NULLDATAOFF) + use_block = -1; + /* + * If we don't have enough free bytes but we can make enough + * by compacting out stale entries, we'll do that. + */ + if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < + needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { + compact = 1; + } + /* + * Otherwise if we don't have enough free bytes we need to + * convert to node form. + */ + else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( + leaf->hdr.count)] < needbytes) { + /* + * Just checking or no space reservation, give up. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOSPC); + } + /* + * Convert to node form. + */ + error = xfs_dir2_leaf_to_node(args, lbp); + xfs_da_buf_done(lbp); + if (error) + return error; + /* + * Then add the new entry. + */ + return xfs_dir2_node_addname(args); + } + /* + * Otherwise it will fit without compaction. + */ + else + compact = 0; + /* + * If just checking, then it will fit unless we needed to allocate + * a new data block. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_da_brelse(tp, lbp); + return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; + } + /* + * If no allocations are allowed, return now before we've + * changed anything. + */ + if (args->total == 0 && use_block == -1) { + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOSPC); + } + /* + * Need to compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and we'll shift that one to our insertion + * point later. + */ + if (compact) { + xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, + &lfloglow, &lfloghigh); + } + /* + * There are stale entries, so we'll need log-low and log-high + * impossibly bad values later. + */ + else if (be16_to_cpu(leaf->hdr.stale)) { + lfloglow = be16_to_cpu(leaf->hdr.count); + lfloghigh = -1; + } + /* + * If there was no data block space found, we need to allocate + * a new one. + */ + if (use_block == -1) { + /* + * Add the new data block. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, + &use_block))) { + xfs_da_brelse(tp, lbp); + return error; + } + /* + * Initialize the block. 
+ */
+ if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * If we're adding a new data block on the end we need to
+ * extend the bests table. Copy it up one entry.
+ */
+ if (use_block >= be32_to_cpu(ltp->bestcount)) {
+ bestsp--;
+ memmove(&bestsp[0], &bestsp[1],
+ be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+ be32_add_cpu(&ltp->bestcount, 1);
+ xfs_dir2_leaf_log_tail(tp, lbp);
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ }
+ /*
+ * If we're filling in a previously empty block just log it.
+ */
+ else
+ xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ data = dbp->data;
+ bestsp[use_block] = data->hdr.bestfree[0].length;
+ grown = 1;
+ }
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ else {
+ if ((error =
+ xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
+ -1, &dbp, XFS_DATA_FORK))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ data = dbp->data;
+ grown = 0;
+ }
+ xfs_dir2_data_check(dp, dbp);
+ /*
+ * Point to the biggest freespace in our data block.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset));
+ ASSERT(be16_to_cpu(dup->length) >= length);
+ needscan = needlog = 0;
+ /*
+ * Mark the initial part of our freespace in use for the new entry.
+ */
+ xfs_dir2_data_use_free(tp, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+ &needlog, &needscan);
+ /*
+ * Initialize our new entry (at last).
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)data);
+ /*
+ * Need to scan fix up the bestfree table.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, data, &needlog);
+ /*
+ * Need to log the data block's header.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_entry(tp, dbp, dep);
+ /*
+ * If the bests table needs to be changed, do it.
+ * Log the change unless we've already done that.
+ */
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(data->hdr.bestfree[0].length)) {
+ bestsp[use_block] = data->hdr.bestfree[0].length;
+ if (!grown)
+ xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ }
+ /*
+ * Now we need to make room to insert the leaf entry.
+ * If there are no stale entries, we just insert a hole at index.
+ */
+ if (!leaf->hdr.stale) {
+ /*
+ * lep is still good as the index leaf entry.
+ */
+ if (index < be16_to_cpu(leaf->hdr.count))
+ memmove(lep + 1, lep,
+ (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep));
+ /*
+ * Record low and high logging indices for the leaf.
+ */
+ lfloglow = index;
+ lfloghigh = be16_to_cpu(leaf->hdr.count);
+ be16_add_cpu(&leaf->hdr.count, 1);
+ }
+ /*
+ * There are stale entries.
+ * We will use one of them for the new entry.
+ * It's probably not at the right location, so we'll have to
+ * shift some up or down first.
+ */
+ else {
+ /*
+ * If we didn't compact before, we need to find the nearest
+ * stale entries before and after our insertion point.
+ */
+ if (compact == 0) {
+ /*
+ * Find the first stale entry before the insertion
+ * point, if any.
+ */
+ for (lowstale = index - 1;
+ lowstale >= 0 &&
+ be32_to_cpu(leaf->ents[lowstale].address) !=
+ XFS_DIR2_NULL_DATAPTR;
+ lowstale--)
+ continue;
+ /*
+ * Find the next stale entry at or after the insertion
+ * point, if any.
Stop if we go so far that the + * lowstale entry would be better. + */ + for (highstale = index; + highstale < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(leaf->ents[highstale].address) != + XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || + index - lowstale - 1 >= highstale - index); + highstale++) + continue; + } + /* + * If the low one is better, use it. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale - 1 < highstale - index)) { + ASSERT(index - lowstale - 1 >= 0); + ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == + XFS_DIR2_NULL_DATAPTR); + /* + * Copy entries up to cover the stale entry + * and make room for the new entry. + */ + if (index - lowstale - 1 > 0) + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * sizeof(*lep)); + lep = &leaf->ents[index - 1]; + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(index - 1, lfloghigh); + } + /* + * The high one is better, so use that one. + */ + else { + ASSERT(highstale - index >= 0); + ASSERT(be32_to_cpu(leaf->ents[highstale].address) == + XFS_DIR2_NULL_DATAPTR); + /* + * Copy entries down to cover the stale entry + * and make room for the new entry. + */ + if (highstale - index > 0) + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(*lep)); + lep = &leaf->ents[index]; + lfloglow = MIN(index, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be16_add_cpu(&leaf->hdr.stale, -1); + } + /* + * Fill in the new leaf entry. + */ + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, + be16_to_cpu(*tagp))); + /* + * Log the leaf fields and give up the buffers. + */ + xfs_dir2_leaf_log_header(tp, lbp); + xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); + xfs_dir2_leaf_check(dp, lbp); + xfs_da_buf_done(lbp); + xfs_dir2_data_check(dp, dbp); + xfs_da_buf_done(dbp); + return 0; +} + +#ifdef DEBUG +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ +STATIC void +xfs_dir2_leaf_check( + xfs_inode_t *dp, /* incore directory inode */ + xfs_dabuf_t *bp) /* leaf's buffer */ +{ + int i; /* leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + xfs_mount_t *mp; /* filesystem mount point */ + int stale; /* count of stale leaves */ + + leaf = bp->data; + mp = dp->i_mount; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + /* + * This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + /* + * Leaves and bests don't overlap. + */ + ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= + (char *)xfs_dir2_leaf_bests_p(ltp)); + /* + * Check hash value order, count stale entries. + */ + for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { + if (i + 1 < be16_to_cpu(leaf->hdr.count)) + ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= + be32_to_cpu(leaf->ents[i + 1].hashval)); + if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); +} +#endif /* DEBUG */ + +/* + * Compact out any stale entries in the leaf. + * Log the header and changed leaf entries, if any. 
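Both compaction routines below walk the leaf array once, copying live entries down over stale ones so that hash order is preserved and only the count changes. A compact standalone version of that in-place squeeze (invented struct name and sentinel value, host-endian, not the kernel code):

#include <stdio.h>

#define NULL_DATAPTR 0u	/* stand-in for a stale (deleted) leaf entry */

struct lent { unsigned hashval; unsigned address; };

/* Squeeze out stale entries in place, preserving hash order; return the
 * new count. */
static int compact(struct lent *ents, int count)
{
	int from, to;

	for (from = to = 0; from < count; from++) {
		if (ents[from].address == NULL_DATAPTR)
			continue;
		if (from > to)
			ents[to] = ents[from];
		to++;
	}
	return to;
}

int main(void)
{
	struct lent ents[] = {
		{10, 3}, {20, NULL_DATAPTR}, {30, 7}, {40, NULL_DATAPTR}, {50, 9},
	};
	int n = compact(ents, 5);

	for (int i = 0; i < n; i++)
		printf("%u -> %u\n", ents[i].hashval, ents[i].address);
	return 0;
}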
+ */ +void +xfs_dir2_leaf_compact( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + int from; /* source leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int loglow; /* first leaf entry to log */ + int to; /* target leaf index */ + + leaf = bp->data; + if (!leaf->hdr.stale) { + return; + } + /* + * Compress out the stale entries in place. + */ + for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { + if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Only actually copy the entries that are different. + */ + if (from > to) { + if (loglow == -1) + loglow = to; + leaf->ents[to] = leaf->ents[from]; + } + to++; + } + /* + * Update and log the header, log the leaf entries. + */ + ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); + be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); + leaf->hdr.stale = 0; + xfs_dir2_leaf_log_header(args->trans, bp); + if (loglow != -1) + xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); +} + +/* + * Compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and the caller will shift that one to our insertion + * point later. + * Return new insertion index, where the remaining stale entry is, + * and leaf logging indices. + */ +void +xfs_dir2_leaf_compact_x1( + xfs_dabuf_t *bp, /* leaf buffer */ + int *indexp, /* insertion index */ + int *lowstalep, /* out: stale entry before us */ + int *highstalep, /* out: stale entry after us */ + int *lowlogp, /* out: low log index */ + int *highlogp) /* out: high log index */ +{ + int from; /* source copy index */ + int highstale; /* stale entry at/after index */ + int index; /* insertion index */ + int keepstale; /* source index of kept stale */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int lowstale; /* stale entry before index */ + int newindex=0; /* new insertion index */ + int to; /* destination copy index */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + index = *indexp; + /* + * Find the first stale entry before our index, if any. + */ + for (lowstale = index - 1; + lowstale >= 0 && + be32_to_cpu(leaf->ents[lowstale].address) != XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find the first stale entry at or after our index, if any. + * Stop if the answer would be worse than lowstale. + */ + for (highstale = index; + highstale < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(leaf->ents[highstale].address) != XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || index - lowstale > highstale - index); + highstale++) + continue; + /* + * Pick the better of lowstale and highstale. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale <= highstale - index)) + keepstale = lowstale; + else + keepstale = highstale; + /* + * Copy the entries in place, removing all the stale entries + * except keepstale. + */ + for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + /* + * Notice the new value of index. + */ + if (index == from) + newindex = to; + if (from != keepstale && + be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) { + if (from == to) + *lowlogp = to; + continue; + } + /* + * Record the new keepstale value for the insertion. + */ + if (from == keepstale) + lowstale = highstale = to; + /* + * Copy only the entries that have moved. 
+ */ + if (from > to) + leaf->ents[to] = leaf->ents[from]; + to++; + } + ASSERT(from > to); + /* + * If the insertion point was past the last entry, + * set the new insertion point accordingly. + */ + if (index == from) + newindex = to; + *indexp = newindex; + /* + * Adjust the leaf header values. + */ + be16_add_cpu(&leaf->hdr.count, -(from - to)); + leaf->hdr.stale = cpu_to_be16(1); + /* + * Remember the low/high stale value only in the "right" + * direction. + */ + if (lowstale >= newindex) + lowstale = -1; + else + highstale = be16_to_cpu(leaf->hdr.count); + *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + *lowstalep = lowstale; + *highstalep = highstale; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +int /* error */ +xfs_dir2_leaf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + void *dirent, + size_t bufsize, + xfs_off_t *offset, + filldir_t filldir) +{ + xfs_dabuf_t *bp; /* data block buffer */ + int byteoff; /* offset in current block */ + xfs_dir2_db_t curdb; /* db for current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int i; /* temporary loop index */ + int j; /* temporary loop index */ + int length; /* temporary length value */ + xfs_bmbt_irec_t *map; /* map vector for blocks */ + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + int nmap; /* mappings to ask xfs_bmapi */ + char *ptr = NULL; /* pointer to current data */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (*offset >= XFS_DIR2_MAX_DATAPTR) + return 0; + + mp = dp->i_mount; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize); + map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); + map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; + bp = NULL; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(mp, *offset); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + if (bp) { + xfs_da_brelse(NULL, bp); + bp = NULL; + map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. 
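+ * That is, consume one directory block's worth of filesystem blocks
+ * (mp->m_dirblkfsbs) from the front of the mapping table, shrinking the
+ * first extent and dropping it from the table once it is fully used up.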
+ */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = MIN((int)map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * + map_valid); + i -= j; + } + } + /* + * Recalculate the readahead blocks wanted. + */ + ra_want = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + ra_want > map_blocks && + map_off < + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + nmap = map_size - map_valid; + error = xfs_bmapi(NULL, dp, + map_off, + xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET) - map_off, + XFS_BMAPI_METADATA, NULL, 0, + &map[map_valid], &nmap, NULL); + /* + * Don't know if we should ignore this or + * try to return an error. + * The trouble with returning errors + * is that readdir will just stop without + * actually passing the error through. + */ + if (error) + break; /* XXX */ + /* + * If we got all the mappings we asked for, + * set the final map offset based on the + * last bmap value received. + * Otherwise, we've reached the end. + */ + if (nmap == map_size - map_valid) + map_off = + map[map_valid + nmap - 1].br_startoff + + map[map_valid + nmap - 1].br_blockcount; + else + map_off = + xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET); + /* + * Look for holes in the mapping, and + * eliminate them. Count up the valid blocks. + */ + for (i = map_valid; i < map_valid + nmap; ) { + if (map[i].br_startblock == + HOLESTARTBLOCK) { + nmap--; + length = map_valid + nmap - i; + if (length) + memmove(&map[i], + &map[i + 1], + sizeof(map[i]) * + length); + } else { + map_blocks += + map[i].br_blockcount; + i++; + } + } + map_valid += nmap; + } + /* + * No valid mappings, so no more data blocks. + */ + if (!map_valid) { + curoff = xfs_dir2_da_to_byte(mp, map_off); + break; + } + /* + * Read the directory block starting at the first + * mapping. + */ + curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_da_read_buf(NULL, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : + -1, + &bp, XFS_DATA_FORK); + /* + * Should just skip over the data block instead + * of giving up. + */ + if (error) + break; /* XXX */ + /* + * Adjust the current amount of read-ahead: we just + * read a block that was previously ra. + */ + if (ra_current) + ra_current -= mp->m_dirblkfsbs; + /* + * Do we need more readahead? + */ + for (ra_index = ra_offset = i = 0; + ra_want > ra_current && i < map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(ra_index < map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > ra_current && + map[ra_index].br_blockcount >= + mp->m_dirblkfsbs) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, + map[ra_index].br_startblock + + ra_offset), + (int)BTOBB(mp->m_dirblksize)); + ra_current = i; + } + /* + * Read-ahead a non-contiguous directory block. + * This doesn't use our mapping, but this + * is a very rare case. + */ + else if (i > ra_current) { + (void)xfs_da_reada_buf(NULL, dp, + map[ra_index].br_startoff + + ra_offset, XFS_DATA_FORK); + ra_current = i; + } + /* + * Advance offset through the mapping table. 
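+ * In other words, step ra_index/ra_offset forward by roughly one
+ * directory block's worth of filesystem blocks, moving on to the next
+ * mapping whenever the current extent is used up.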
+ */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not + * more than a dir block. + */ + length = MIN(mp->m_dirblkfsbs, + (int)(map[ra_index].br_blockcount - + ra_offset)); + j += length; + ra_offset += length; + /* + * Advance to the next mapping if + * this one is used up. + */ + if (ra_offset == + map[ra_index].br_blockcount) { + ra_offset = 0; + ra_index++; + } + } + } + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(mp, curoff) == + curdb); + data = bp->data; + xfs_dir2_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)&data->u; + byteoff = xfs_dir2_byte_to_off(mp, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += (uint)sizeof(data->hdr); + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)data < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + xfs_dir2_data_entsize(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(mp, + xfs_dir2_byte_to_db(mp, curoff), + (char *)ptr - (char *)data); + if (ptr >= (char *)data + mp->m_dirblksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = xfs_dir2_data_entsize(dep->namelen); + + if (filldir(dirent, (char *)dep->name, dep->namelen, + xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, + be64_to_cpu(dep->inumber), DT_UNKNOWN)) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) + *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + kmem_free(map); + if (bp) + xfs_da_brelse(NULL, bp); + return error; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +int +xfs_dir2_leaf_init( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_db_t bno, /* directory block number */ + xfs_dabuf_t **bpp, /* out: leaf buffer */ + int magic) /* magic number for block */ +{ + xfs_dabuf_t *bp; /* leaf buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + ASSERT(dp != NULL); + tp = args->trans; + mp = dp->i_mount; + ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && + bno < XFS_DIR2_FREE_FIRSTDB(mp)); + /* + * Get the buffer for the block. 
+ */ + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(bp != NULL); + leaf = bp->data; + /* + * Initialize the header. + */ + leaf->hdr.info.magic = cpu_to_be16(magic); + leaf->hdr.info.forw = 0; + leaf->hdr.info.back = 0; + leaf->hdr.count = 0; + leaf->hdr.stale = 0; + xfs_dir2_leaf_log_header(tp, bp); + /* + * If it's a leaf-format directory initialize the tail. + * In this case our caller has the real bests table to copy into + * the block. + */ + if (magic == XFS_DIR2_LEAF1_MAGIC) { + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = 0; + xfs_dir2_leaf_log_tail(tp, bp); + } + *bpp = bp; + return 0; +} + +/* + * Log the bests entries indicated from a leaf1 block. + */ +static void +xfs_dir2_leaf_log_bests( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + __be16 *firstb; /* pointer to first entry */ + __be16 *lastb; /* pointer to last entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; + xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); +} + +/* + * Log the leaf entries indicated from a leaf1 or leafn block. + */ +void +xfs_dir2_leaf_log_ents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ + xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || + be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + firstlep = &leaf->ents[first]; + lastlep = &leaf->ents[last]; + xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); +} + +/* + * Log the header of the leaf1 or leafn block. + */ +void +xfs_dir2_leaf_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || + be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), + (uint)(sizeof(leaf->hdr) - 1)); +} + +/* + * Log the tail of the leaf1 block. + */ +STATIC void +xfs_dir2_leaf_log_tail( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(mp->m_dirblksize - 1)); +} + +/* + * Look up the entry referred to by args in the leaf format directory. + * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which + * is also used by the node-format code. 
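+ * The flow inside xfs_dir2_leaf_lookup_int is: binary-search the leaf for
+ * the first entry with the wanted hash value, then walk forward over the
+ * entries sharing that hash (skipping stale ones), reading each referenced
+ * data block and comparing names until an exact match is found; a
+ * case-insensitive match is remembered as a fallback.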
+ */ +int +xfs_dir2_leaf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* found entry index */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_lookup(args); + + /* + * Look up name in the leaf block, returning both buffers and index. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + tp = args->trans; + dp = args->dp; + xfs_dir2_leaf_check(dp, lbp); + leaf = lbp->data; + /* + * Get to the leaf entry and contained data entry address. + */ + lep = &leaf->ents[index]; + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + /* + * Return the found inode number & CI name if appropriate + */ + args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_da_brelse(tp, dbp); + xfs_da_brelse(tp, lbp); + return XFS_ERROR(error); +} + +/* + * Look up name/hash in the leaf block. + * Fill in indexp with the found index, and dbpp with the data buffer. + * If not found dbpp will be NULL, and ENOENT comes back. + * lbpp will always be filled in with the leaf buffer unless there's an error. + */ +static int /* error */ +xfs_dir2_leaf_lookup_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t **lbpp, /* out: leaf buffer */ + int *indexp, /* out: index in leaf block */ + xfs_dabuf_t **dbpp) /* out: data buffer */ +{ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dabuf_t *dbp = NULL; /* data buffer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index in leaf block */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. */ + enum xfs_dacmp cmp; /* name compare result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the leaf block into the buffer. + */ + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) + return error; + *lbpp = lbp; + leaf = lbp->data; + xfs_dir2_leaf_check(dp, lbp); + /* + * Look for the first leaf entry with our hash value. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + /* + * Loop over all the entries with the right hash value + * looking to match the name. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip over stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get the new data block number. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * If it's not the same as the old data block number, + * need to pitch the old one and read the new one. 
+ */ + if (newdb != curdb) { + if (dbp) + xfs_da_brelse(tp, dbp); + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp, XFS_DATA_FORK); + if (error) { + xfs_da_brelse(tp, lbp); + return error; + } + xfs_dir2_data_check(dp, dbp); + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *indexp = index; + /* case exact match: return the current buffer. */ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_da_brelse(tp, dbp); + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp, XFS_DATA_FORK); + if (error) { + xfs_da_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } + /* + * No match found, return ENOENT. + */ + ASSERT(cidb == -1); + if (dbp) + xfs_da_brelse(tp, dbp); + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a leaf format directory. + */ +int /* error */ +xfs_dir2_leaf_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* leaf block best freespace */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t db; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry structure */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_db_t i; /* temporary data block # */ + int index; /* index into leaf entries */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_dir2_data_off_t oldbest; /* old value of best free */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_removename(args); + + /* + * Lookup the leaf entry, get the leaf and data blocks read in. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->data; + data = dbp->data; + xfs_dir2_data_check(dp, dbp); + /* + * Point to the leaf entry, use that to point to the data entry. + */ + lep = &leaf->ents[index]; + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *) + ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + needscan = needlog = 0; + oldbest = be16_to_cpu(data->hdr.bestfree[0].length); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + ASSERT(be16_to_cpu(bestsp[db]) == oldbest); + /* + * Mark the former data entry unused. 
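+ * Removal never shuffles the live leaf entries around: the data entry's
+ * space is returned to the data block's free list, the leaf entry is just
+ * marked stale by storing the null dataptr in it, and the bests entry for
+ * the block is updated only if its longest free span changed.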
+	 */
+	xfs_dir2_data_make_free(tp, dbp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
+		xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * We just mark the leaf entry stale by putting a null in it.
+	 */
+	be16_add_cpu(&leaf->hdr.stale, 1);
+	xfs_dir2_leaf_log_header(tp, lbp);
+	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+	/*
+	 * Scan the freespace in the data block again if necessary,
+	 * log the data block header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * If the longest freespace in the data block has changed,
+	 * put the new value in the bests table and log that.
+	 */
+	if (be16_to_cpu(data->hdr.bestfree[0].length) != oldbest) {
+		bestsp[db] = data->hdr.bestfree[0].length;
+		xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+	}
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * If the data block is now empty then get rid of the data block.
+	 */
+	if (be16_to_cpu(data->hdr.bestfree[0].length) ==
+	    mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+		ASSERT(db != mp->m_dirdatablk);
+		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+			/*
+			 * Nope, can't get rid of it because it caused
+			 * allocation of a bmap btree block to do so.
+			 * Just go on, returning success, leaving the
+			 * empty block in place.
+			 */
+			if (error == ENOSPC && args->total == 0) {
+				xfs_da_buf_done(dbp);
+				error = 0;
+			}
+			xfs_dir2_leaf_check(dp, lbp);
+			xfs_da_buf_done(lbp);
+			return error;
+		}
+		dbp = NULL;
+		/*
+		 * If this is the last data block then compact the
+		 * bests table by getting rid of entries.
+		 */
+		if (db == be32_to_cpu(ltp->bestcount) - 1) {
+			/*
+			 * Look for the last active entry (i).
+			 */
+			for (i = db - 1; i > 0; i--) {
+				if (be16_to_cpu(bestsp[i]) != NULLDATAOFF)
+					break;
+			}
+			/*
+			 * Copy the table down so inactive entries at the
+			 * end are removed.
+			 */
+			memmove(&bestsp[db - i], bestsp,
+				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+			be32_add_cpu(&ltp->bestcount, -(db - i));
+			xfs_dir2_leaf_log_tail(tp, lbp);
+			xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+		} else
+			bestsp[db] = cpu_to_be16(NULLDATAOFF);
+	}
+	/*
+	 * If the data block was not the first one, drop it.
+	 */
+	else if (db != mp->m_dirdatablk && dbp != NULL) {
+		xfs_da_buf_done(dbp);
+		dbp = NULL;
+	}
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * See if we can convert to block form.
+	 */
+	return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int						/* error */
+xfs_dir2_leaf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index of leaf entry */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	trace_xfs_dir2_leaf_replace(args);
+
+	/*
+	 * Look up the entry.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->data;
+	/*
+	 * Point to the leaf entry, get data address from it.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Point to the data entry.
+ */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + ASSERT(args->inumber != be64_to_cpu(dep->inumber)); + /* + * Put the new inode number in, log it. + */ + dep->inumber = cpu_to_be64(args->inumber); + tp = args->trans; + xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_da_buf_done(dbp); + xfs_dir2_leaf_check(dp, lbp); + xfs_da_brelse(tp, lbp); + return 0; +} + +/* + * Return index in the leaf block (lbp) which is either the first + * one with this hash value, or if there are none, the insert point + * for that hash value. + */ +int /* index value */ +xfs_dir2_leaf_search_hash( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp) /* leaf buffer */ +{ + xfs_dahash_t hash=0; /* hash from this entry */ + xfs_dahash_t hashwant; /* hash value looking for */ + int high; /* high leaf index */ + int low; /* low leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int mid=0; /* current leaf index */ + + leaf = lbp->data; +#ifndef __KERNEL__ + if (!leaf->hdr.count) + return 0; +#endif + /* + * Note, the table cannot be empty, so we have to go through the loop. + * Binary search the leaf entries looking for our hash value. + */ + for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + hashwant = args->hashval; + low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant) + break; + if (hash < hashwant) + low = mid + 1; + else + high = mid - 1; + } + /* + * Found one, back up through all the equal hash values. + */ + if (hash == hashwant) { + while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) { + mid--; + } + } + /* + * Need to point to an entry higher than ours. + */ + else if (hash < hashwant) + mid++; + return mid; +} + +/* + * Trim off a trailing data block. We know it's empty since the leaf + * freespace table says so. + */ +int /* error */ +xfs_dir2_leaf_trim_data( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp, /* leaf buffer */ + xfs_dir2_db_t db) /* data block number */ +{ + __be16 *bestsp; /* leaf bests table */ +#ifdef DEBUG + xfs_dir2_data_t *data; /* data block structure */ +#endif + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Read the offending data block. We need its buffer. + */ + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, + XFS_DATA_FORK))) { + return error; + } +#ifdef DEBUG + data = dbp->data; + ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); +#endif + /* this seems to be an error + * data is only valid if DEBUG is defined? + * RMC 09/08/1999 + */ + + leaf = lbp->data; + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == + mp->m_dirblksize - (uint)sizeof(data->hdr)); + ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); + /* + * Get rid of the data block. + */ + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + ASSERT(error != ENOSPC); + xfs_da_brelse(tp, dbp); + return error; + } + /* + * Eliminate the last bests entry from the table. 
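+ * The bests array is addressed backward from the leaf tail, so shrinking
+ * bestcount moves the start of the array up by one slot; the memmove below
+ * slides the surviving entries up into that new position.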
+	 */
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	be32_add_cpu(&ltp->bestcount, -1);
+	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+	xfs_dir2_leaf_log_tail(tp, lbp);
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	return 0;
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int						/* error */
+xfs_dir2_node_to_leaf(
+	xfs_da_state_t		*state)		/* directory operation state */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dabuf_t		*fbp;		/* buffer for freespace block */
+	xfs_fileoff_t		fo;		/* freespace file offset */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_dabuf_t		*lbp;		/* buffer for leaf block */
+	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			rval;		/* successful free trim? */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	/*
+	 * There's more than a leaf level in the btree, so there must
+	 * be multiple leafn blocks. Give up.
+	 */
+	if (state->path.active > 1)
+		return 0;
+	args = state->args;
+
+	trace_xfs_dir2_node_to_leaf(args);
+
+	mp = state->mp;
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Get the last offset in the file.
+	 */
+	if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	fo -= mp->m_dirblkfsbs;
+	/*
+	 * If there are freespace blocks other than the first one,
+	 * take this opportunity to remove trailing empty freespace blocks
+	 * that may have been left behind during no-space-reservation
+	 * operations.
+	 */
+	while (fo > mp->m_dirfreeblk) {
+		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+			return error;
+		}
+		if (rval)
+			fo -= mp->m_dirblkfsbs;
+		else
+			return 0;
+	}
+	/*
+	 * Now find the block just before the freespace block.
+	 */
+	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	/*
+	 * If it's not the single leaf block, give up.
+	 */
+	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+		return 0;
+	lbp = state->path.blk[0].bp;
+	leaf = lbp->data;
+	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Read the freespace block.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	free = fbp->data;
+	ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+	ASSERT(!free->hdr.firstdb);
+	/*
+	 * Now see if the leafn and free data will fit in a leaf1.
+	 * If not, release the buffer and give up.
+	 */
+	if ((uint)sizeof(leaf->hdr) +
+	    (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)) * (uint)sizeof(leaf->ents[0]) +
+	    be32_to_cpu(free->hdr.nvalid) * (uint)sizeof(leaf->bests[0]) +
+	    (uint)sizeof(leaf->tail) >
+	    mp->m_dirblksize) {
+		xfs_da_brelse(tp, fbp);
+		return 0;
+	}
+	/*
+	 * If the leaf has any stale entries in it, compress them out.
+	 * The compact routine will log the header.
+	 */
+	if (be16_to_cpu(leaf->hdr.stale))
+		xfs_dir2_leaf_compact(args, lbp);
+	else
+		xfs_dir2_leaf_log_header(tp, lbp);
+	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+	/*
+	 * Set up the leaf tail from the freespace block.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+	ltp->bestcount = free->hdr.nvalid;
+	/*
+	 * Set up the leaf bests table.
+ */ + memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, + be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); + xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir2_leaf_log_tail(tp, lbp); + xfs_dir2_leaf_check(dp, lbp); + /* + * Get rid of the freespace block. + */ + error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + if (error) { + /* + * This can't fail here because it can only happen when + * punching out the middle of an extent, and this is an + * isolated block. + */ + ASSERT(error != ENOSPC); + return error; + } + fbp = NULL; + /* + * Now see if we can convert the single-leaf directory + * down to a block form directory. + * This routine always kills the dabuf for the leaf, so + * eliminate it from the path. + */ + error = xfs_dir2_leaf_to_block(args, lbp, NULL); + state->path.blk[0].bp = NULL; + return error; +} diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h new file mode 100644 index 00000000..6c9539f0 --- /dev/null +++ b/fs/xfs/xfs_dir2_leaf.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_LEAF_H__ +#define __XFS_DIR2_LEAF_H__ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_LEAF_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + __be32 hashval; /* hash value of name */ + __be32 address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + * bests and tail are at the end of the block for single-leaf only + * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC). + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t ents[1]; /* entries */ + /* ... */ + xfs_dir2_data_off_t bests[1]; /* best free counts */ + xfs_dir2_leaf_tail_t tail; /* leaf tail */ +} xfs_dir2_leaf_t; + +/* + * DB blocks here are logical directory block numbers, not filesystem blocks. 
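+ * A dataptr is the byte offset of an entry within the directory's data
+ * space, shifted down by the usual 8-byte data alignment
+ * (XFS_DIR2_DATA_ALIGN_LOG); equivalently it packs a DB block number with
+ * an aligned in-block offset.  For example, assuming 4096-byte filesystem
+ * blocks and sb_dirblklog == 0, directory byte offset 8208 is DB block 2,
+ * in-block offset 16, dataptr 8208 >> 3 == 1026.  The helpers below are
+ * just these shifts and masks; a usage sketch of the round trip
+ * (hypothetical helper, never compiled) is:
+ */
+#if 0
+static void xfs_dir2_dataptr_roundtrip_sketch(struct xfs_mount *mp)
+{
+	xfs_dir2_dataptr_t	dp;
+
+	/* pack directory block 2, byte offset 16 into a dataptr ... */
+	dp = xfs_dir2_db_off_to_dataptr(mp, 2, 16);
+	/* ... and get the same pieces back out */
+	ASSERT(xfs_dir2_dataptr_to_db(mp, dp) == 2);
+	ASSERT(xfs_dir2_dataptr_to_off(mp, dp) == 16);
+}
+#endif
+/*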
+ */ + +static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +{ + return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / + (uint)sizeof(xfs_dir2_leaf_entry_t)); +} + +/* + * Get address of the bestcount field in the single-leaf block. + */ +static inline xfs_dir2_leaf_tail_t * +xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) +{ + return (xfs_dir2_leaf_tail_t *) + ((char *)(lp) + (mp)->m_dirblksize - + (uint)sizeof(xfs_dir2_leaf_tail_t)); +} + +/* + * Get address of the bests array in the single-leaf block. + */ +static inline __be16 * +xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)((by) >> \ + ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)((by) & \ + ((1 << ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)(db) << \ + ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o); +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); +} + +/* + * Function declarations. 
+ */ +extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, + struct xfs_dabuf *dbp); +extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); +extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, + struct xfs_dabuf *bp); +extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + int *lowstalep, int *highstalep, + int *lowlogp, int *highlogp); +extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, + size_t bufsize, xfs_off_t *offset, + filldir_t filldir); +extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_dabuf **bpp, int magic); +extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); +extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, + struct xfs_dabuf *bp); +extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); +extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, + struct xfs_dabuf *lbp, xfs_dir2_db_t db); +extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +#endif /* __XFS_DIR2_LEAF_H__ */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c new file mode 100644 index 00000000..a0aab7d3 --- /dev/null +++ b/fs/xfs/xfs_dir2_node.c @@ -0,0 +1,2076 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Function declarations. 
+ */ +static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); +#ifdef DEBUG +static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#else +#define xfs_dir2_leafn_check(dp, bp) +#endif +static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, + int start_s, xfs_dabuf_t *bp_d, int start_d, + int count); +static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, + int index, xfs_da_state_blk_t *dblk, + int *rval); +static int xfs_dir2_node_addname_int(xfs_da_args_t *args, + xfs_da_state_blk_t *fblk); + +/* + * Log entries from a freespace block. + */ +STATIC void +xfs_dir2_free_log_bests( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* freespace buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + xfs_da_log_buf(tp, bp, + (uint)((char *)&free->bests[first] - (char *)free), + (uint)((char *)&free->bests[last] - (char *)free + + sizeof(free->bests[0]) - 1)); +} + +/* + * Log header from a freespace block. + */ +static void +xfs_dir2_free_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* freespace buffer */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), + (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); +} + +/* + * Convert a leaf-format directory to a node-format directory. + * We need to change the magic number of the leaf block, and copy + * the freespace table out of the leaf block into its own block. + */ +int /* error */ +xfs_dir2_leaf_to_node( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp) /* leaf buffer */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dabuf_t *fbp; /* freespace buffer */ + xfs_dir2_db_t fdb; /* freespace block number */ + xfs_dir2_free_t *free; /* freespace structure */ + __be16 *from; /* pointer to freespace entry */ + int i; /* leaf freespace index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int n; /* count of live freespc ents */ + xfs_dir2_data_off_t off; /* freespace entry value */ + __be16 *to; /* pointer to freespace entry */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_to_node(args); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Add a freespace block to the directory. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { + return error; + } + ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + /* + * Get the buffer for the new freespace block. + */ + if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK))) { + return error; + } + ASSERT(fbp != NULL); + free = fbp->data; + leaf = lbp->data; + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + /* + * Initialize the freespace block header. 
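+ * firstdb is the first data block this free block describes (0 here, since
+ * it is the first free block), nvalid is the number of bests entries
+ * carried over from the leaf tail, and nused (set below) counts the ones
+ * that are not NULLDATAOFF.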
+ */ + free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); + free->hdr.firstdb = 0; + ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); + free->hdr.nvalid = ltp->bestcount; + /* + * Copy freespace entries from the leaf block to the new block. + * Count active entries. + */ + for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; + i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + if ((off = be16_to_cpu(*from)) != NULLDATAOFF) + n++; + *to = cpu_to_be16(off); + } + free->hdr.nused = cpu_to_be32(n); + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* + * Log everything. + */ + xfs_dir2_leaf_log_header(tp, lbp); + xfs_dir2_free_log_header(tp, fbp); + xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); + xfs_da_buf_done(fbp); + xfs_dir2_leafn_check(dp, lbp); + return 0; +} + +/* + * Add a leaf entry to a leaf block in a node-form directory. + * The other work necessary is done from the caller. + */ +static int /* error */ +xfs_dir2_leafn_add( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int index) /* insertion pt for new entry */ +{ + int compact; /* compacting stale leaves */ + xfs_inode_t *dp; /* incore directory inode */ + int highstale; /* next stale entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int lfloghigh; /* high leaf entry logging */ + int lfloglow; /* low leaf entry logging */ + int lowstale; /* previous stale entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leafn_add(args, index); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + leaf = bp->data; + + /* + * Quick check just to make sure we are not going to index + * into other peoples memory + */ + if (index < 0) + return XFS_ERROR(EFSCORRUPTED); + + /* + * If there are already the maximum number of leaf entries in + * the block, if there are no stale entries it won't fit. + * Caller will do a split. If there are stale entries we'll do + * a compact. + */ + + if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { + if (!leaf->hdr.stale) + return XFS_ERROR(ENOSPC); + compact = be16_to_cpu(leaf->hdr.stale) > 1; + } else + compact = 0; + ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); + ASSERT(index == be16_to_cpu(leaf->hdr.count) || + be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + + /* + * Compact out all but one stale leaf entry. Leaves behind + * the entry closest to index. + */ + if (compact) { + xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, + &lfloglow, &lfloghigh); + } + /* + * Set impossible logging indices for this case. + */ + else if (leaf->hdr.stale) { + lfloglow = be16_to_cpu(leaf->hdr.count); + lfloghigh = -1; + } + /* + * No stale entries, just insert a space for the new entry. + */ + if (!leaf->hdr.stale) { + lep = &leaf->ents[index]; + if (index < be16_to_cpu(leaf->hdr.count)) + memmove(lep + 1, lep, + (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); + lfloglow = index; + lfloghigh = be16_to_cpu(leaf->hdr.count); + be16_add_cpu(&leaf->hdr.count, 1); + } + /* + * There are stale entries. We'll use one for the new entry. + */ + else { + /* + * If we didn't do a compact then we need to figure out + * which stale entry will be used. + */ + if (compact == 0) { + /* + * Find first stale entry before our insertion point. 
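+ * (A matching scan for the first stale entry at or after the insertion
+ * point follows; whichever stale slot needs fewer live entries shifted,
+ * index - lowstale - 1 below versus highstale - index above, is the one
+ * that gets reused for the new entry.)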
+ */ + for (lowstale = index - 1; + lowstale >= 0 && + be32_to_cpu(leaf->ents[lowstale].address) != + XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find next stale entry after insertion point. + * Stop looking if the answer would be worse than + * lowstale already found. + */ + for (highstale = index; + highstale < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(leaf->ents[highstale].address) != + XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || + index - lowstale - 1 >= highstale - index); + highstale++) + continue; + } + /* + * Using the low stale entry. + * Shift entries up toward the stale slot. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale - 1 < highstale - index)) { + ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == + XFS_DIR2_NULL_DATAPTR); + ASSERT(index - lowstale - 1 >= 0); + if (index - lowstale - 1 > 0) + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * sizeof(*lep)); + lep = &leaf->ents[index - 1]; + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(index - 1, lfloghigh); + } + /* + * Using the high stale entry. + * Shift entries down toward the stale slot. + */ + else { + ASSERT(be32_to_cpu(leaf->ents[highstale].address) == + XFS_DIR2_NULL_DATAPTR); + ASSERT(highstale - index >= 0); + if (highstale - index > 0) + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(*lep)); + lep = &leaf->ents[index]; + lfloglow = MIN(index, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be16_add_cpu(&leaf->hdr.stale, -1); + } + /* + * Insert the new entry, log everything. + */ + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, + args->blkno, args->index)); + xfs_dir2_leaf_log_header(tp, bp); + xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); + xfs_dir2_leafn_check(dp, bp); + return 0; +} + +#ifdef DEBUG +/* + * Check internal consistency of a leafn block. + */ +void +xfs_dir2_leafn_check( + xfs_inode_t *dp, /* incore directory inode */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + int i; /* leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int stale; /* count of stale leaves */ + + leaf = bp->data; + mp = dp->i_mount; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); + for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { + if (i + 1 < be16_to_cpu(leaf->hdr.count)) { + ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= + be32_to_cpu(leaf->ents[i + 1].hashval)); + } + if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); +} +#endif /* DEBUG */ + +/* + * Return the last hash value in the leaf. + * Stale entries are ok. + */ +xfs_dahash_t /* hash value */ +xfs_dir2_leafn_lasthash( + xfs_dabuf_t *bp, /* leaf buffer */ + int *count) /* count of entries in leaf */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + if (count) + *count = be16_to_cpu(leaf->hdr.count); + if (!leaf->hdr.count) + return 0; + return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); +} + +/* + * Look up a leaf entry for space to add a name in a node-format leaf block. + * The extrablk in state is a freespace block. 
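+ * For each leaf entry with the right hash value, the candidate data block
+ * is mapped to the free block that describes it and that free block's
+ * bests entry is checked against the space the new name needs; the first
+ * data block with enough room wins.  Either way the free block buffer is
+ * handed back through state->extrablk, and ENOENT is the normal "not
+ * found, insert here" return with *indexp set to the insertion point.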
+ */ +STATIC int +xfs_dir2_leafn_lookup_for_addname( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_db_t curfdb = -1; /* current free block number */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int fi; /* free entry index */ + xfs_dir2_free_t *free = NULL; /* free block structure */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new data entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_dir2_db_t newfdb; /* new free block number */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(be16_to_cpu(leaf->hdr.count) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + /* If so, it's a free block buffer, get the block number. */ + curbp = state->extrablk.bp; + curfdb = state->extrablk.blkno; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + } + length = xfs_dir2_data_entsize(args->namelen); + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * For addname, we're looking for a place to put the new entry. + * We want to use a data block with an entry of equal + * hash value to ours if there is one with room. + * + * If this block isn't the data block we already have + * in hand, take a look at it. + */ + if (newdb != curdb) { + curdb = newdb; + /* + * Convert the data block to the free block + * holding its freespace information. + */ + newfdb = xfs_dir2_db_to_fdb(mp, newdb); + /* + * If it's not the one we have in hand, read it in. + */ + if (newfdb != curfdb) { + /* + * If we had one before, drop it. + */ + if (curbp) + xfs_da_brelse(tp, curbp); + /* + * Read the free block. + */ + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newfdb), + -1, &curbp, XFS_DATA_FORK); + if (error) + return error; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == + XFS_DIR2_FREE_MAGIC); + ASSERT((be32_to_cpu(free->hdr.firstdb) % + XFS_DIR2_MAX_FREE_BESTS(mp)) == 0); + ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); + ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + + be32_to_cpu(free->hdr.nvalid)); + } + /* + * Get the index for our entry. + */ + fi = xfs_dir2_db_to_fdindex(mp, curdb); + /* + * If it has room, return it. 
+ */ + if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + if (curfdb != newfdb) + xfs_da_brelse(tp, curbp); + return XFS_ERROR(EFSCORRUPTED); + } + curfdb = newfdb; + if (be16_to_cpu(free->bests[fi]) >= length) + goto out; + } + } + /* Didn't find any space */ + fi = -1; +out: + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (curbp) { + /* Giving back a free block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = fi; + state->extrablk.blkno = curfdb; + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } else { + state->extravalid = 0; + } + /* + * Return the index, that will be the insertion point. + */ + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * The extrablk in state a data block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_entry( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(be16_to_cpu(leaf->hdr.count) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + curbp = state->extrablk.bp; + curdb = state->extrablk.blkno; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * Not adding a new entry, so we really want to find + * the name given to us. + * + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before that we aren't saving + * for a CI name, drop it + */ + if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || + curdb != state->extrablk.blkno)) + xfs_da_brelse(tp, curbp); + /* + * If needing the block that is saved with a CI match, + * use it otherwise read in the new data block. + */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + newdb == state->extrablk.blkno) { + ASSERT(state->extravalid); + curbp = state->extrablk.bp; + } else { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &curbp, XFS_DATA_FORK); + if (error) + return error; + } + xfs_dir2_data_check(dp, curbp); + curdb = newdb; + } + /* + * Point to the data entry. 
+ */ + dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + /* + * Compare the entry and if it's an exact match, return + * EEXIST immediately. If it's the first case-insensitive + * match, store the block & inode number and continue looking. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + /* If there is a CI match block, drop it */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + curdb != state->extrablk.blkno) + xfs_da_brelse(tp, state->extrablk.bp); + args->cmpresult = cmp; + args->inumber = be64_to_cpu(dep->inumber); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.blkno = curdb; + state->extrablk.index = (int)((char *)dep - + (char *)curbp->data); + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + } + } + ASSERT(index == be16_to_cpu(leaf->hdr.count) || + (args->op_flags & XFS_DA_OP_OKNOENT)); + if (curbp) { + if (args->cmpresult == XFS_CMP_DIFFERENT) { + /* Giving back last used data block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = -1; + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + } else { + /* If the curbp is not the CI match block, drop it */ + if (state->extrablk.bp != curbp) + xfs_da_brelse(tp, curbp); + } + } else { + state->extravalid = 0; + } + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* + * Move count leaf entries from source to destination leaf. + * Log entries and headers. Stale entries are preserved. + */ +static void +xfs_dir2_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp_s, /* source leaf buffer */ + int start_s, /* source leaf index */ + xfs_dabuf_t *bp_d, /* destination leaf buffer */ + int start_d, /* destination leaf index */ + int count) /* count of leaves to copy */ +{ + xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ + xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ + int stale; /* count stale leaves copied */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); + + /* + * Silently return if nothing to do. + */ + if (count == 0) { + return; + } + tp = args->trans; + leaf_s = bp_s->data; + leaf_d = bp_d->data; + /* + * If the destination index is not the end of the current + * destination leaf entries, open up a hole in the destination + * to hold the new entries. 
+ */ + if (start_d < be16_to_cpu(leaf_d->hdr.count)) { + memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], + (be16_to_cpu(leaf_d->hdr.count) - start_d) * + sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, + count + be16_to_cpu(leaf_d->hdr.count) - 1); + } + /* + * If the source has stale leaves, count the ones in the copy range + * so we can update the header correctly. + */ + if (leaf_s->hdr.stale) { + int i; /* temp leaf index */ + + for (i = start_s, stale = 0; i < start_s + count; i++) { + if (be32_to_cpu(leaf_s->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + } else + stale = 0; + /* + * Copy the leaf entries from source to destination. + */ + memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + /* + * If there are source entries after the ones we copied, + * delete the ones we copied by sliding the next ones down. + */ + if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { + memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + } + /* + * Update the headers and log them. + */ + be16_add_cpu(&leaf_s->hdr.count, -(count)); + be16_add_cpu(&leaf_s->hdr.stale, -(stale)); + be16_add_cpu(&leaf_d->hdr.count, count); + be16_add_cpu(&leaf_d->hdr.stale, stale); + xfs_dir2_leaf_log_header(tp, bp_s); + xfs_dir2_leaf_log_header(tp, bp_d); + xfs_dir2_leafn_check(args->dp, bp_s); + xfs_dir2_leafn_check(args->dp, bp_d); +} + +/* + * Determine the sort order of two leaf blocks. + * Returns 1 if both are valid and leaf2 should be before leaf1, else 0. + */ +int /* sort order */ +xfs_dir2_leafn_order( + xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ + xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ +{ + xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ + xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + if (be16_to_cpu(leaf1->hdr.count) > 0 && + be16_to_cpu(leaf2->hdr.count) > 0 && + (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || + be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < + be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + return 1; + return 0; +} + +/* + * Rebalance leaf entries between two leaf blocks. + * This is actually only called when the second block is new, + * though the code deals with the general case. + * A new entry will be inserted in one of the blocks, and that + * entry is taken into account when balancing. + */ +static void +xfs_dir2_leafn_rebalance( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *blk1, /* first btree block */ + xfs_da_state_blk_t *blk2) /* second btree block */ +{ + xfs_da_args_t *args; /* operation arguments */ + int count; /* count (& direction) leaves */ + int isleft; /* new goes in left leaf */ + xfs_dir2_leaf_t *leaf1; /* first leaf structure */ + xfs_dir2_leaf_t *leaf2; /* second leaf structure */ + int mid; /* midpoint leaf index */ +#ifdef DEBUG + int oldstale; /* old count of stale leaves */ +#endif + int oldsum; /* old total leaf count */ + int swap; /* swapped leaf blocks */ + + args = state->args; + /* + * If the block order is wrong, swap the arguments. 
+ */ + if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) { + xfs_da_state_blk_t *tmp; /* temp for block swap */ + + tmp = blk1; + blk1 = blk2; + blk2 = tmp; + } + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); +#ifdef DEBUG + oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); +#endif + mid = oldsum >> 1; + /* + * If the old leaf count was odd then the new one will be even, + * so we need to divide the new count evenly. + */ + if (oldsum & 1) { + xfs_dahash_t midhash; /* middle entry hash value */ + + if (mid >= be16_to_cpu(leaf1->hdr.count)) + midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + else + midhash = be32_to_cpu(leaf1->ents[mid].hashval); + isleft = args->hashval <= midhash; + } + /* + * If the old count is even then the new count is odd, so there's + * no preferred side for the new entry. + * Pick the left one. + */ + else + isleft = 1; + /* + * Calculate moved entry count. Positive means left-to-right, + * negative means right-to-left. Then move the entries. + */ + count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + if (count > 0) + xfs_dir2_leafn_moveents(args, blk1->bp, + be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + else if (count < 0) + xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, + be16_to_cpu(leaf1->hdr.count), count); + ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); + ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + /* + * Mark whether we're inserting into the old or new leaf. + */ + if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + state->inleaf = swap; + else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + state->inleaf = !swap; + else + state->inleaf = + swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + /* + * Adjust the expected index for insertion. + */ + if (!state->inleaf) + blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + + /* + * Finally sanity check just to make sure we are not returning a + * negative index + */ + if(blk2->index < 0) { + state->inleaf = 1; + blk2->index = 0; + xfs_alert(args->dp->i_mount, + "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n", + __func__, blk1->index); + } +} + +/* + * Remove an entry from a node directory. + * This removes the leaf entry and the data entry, + * and updates the free block if necessary. 
+ */ +static int /* error */ +xfs_dir2_leafn_remove( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp, /* leaf buffer */ + int index, /* leaf entry index */ + xfs_da_state_blk_t *dblk, /* data block */ + int *rval) /* resulting block needs join */ +{ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t db; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int longest; /* longest data free entry */ + int off; /* data block entry offset */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leafn_remove(args, index); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + /* + * Point to the entry we're removing. + */ + lep = &leaf->ents[index]; + /* + * Extract the data block and offset from the entry. + */ + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + ASSERT(dblk->blkno == db); + off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); + ASSERT(dblk->index == off); + /* + * Kill the leaf entry by marking it stale. + * Log the leaf block changes. + */ + be16_add_cpu(&leaf->hdr.stale, 1); + xfs_dir2_leaf_log_header(tp, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir2_leaf_log_ents(tp, bp, index, index); + /* + * Make the data entry free. Keep track of the longest freespace + * in the data block in case it changes. + */ + dbp = dblk->bp; + data = dbp->data; + dep = (xfs_dir2_data_entry_t *)((char *)data + off); + longest = be16_to_cpu(data->hdr.bestfree[0].length); + needlog = needscan = 0; + xfs_dir2_data_make_free(tp, dbp, off, + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + /* + * Rescan the data block freespaces for bestfree. + * Log the data block header if needed. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_check(dp, dbp); + /* + * If the longest data block freespace changes, need to update + * the corresponding freeblock entry. + */ + if (longest < be16_to_cpu(data->hdr.bestfree[0].length)) { + int error; /* error return value */ + xfs_dabuf_t *fbp; /* freeblock buffer */ + xfs_dir2_db_t fdb; /* freeblock block number */ + int findex; /* index in freeblock entries */ + xfs_dir2_free_t *free; /* freeblock structure */ + int logfree; /* need to log free entry */ + + /* + * Convert the data block number to a free block, + * read in the free block. + */ + fdb = xfs_dir2_db_to_fdb(mp, db); + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), + -1, &fbp, XFS_DATA_FORK))) { + return error; + } + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(be32_to_cpu(free->hdr.firstdb) == + XFS_DIR2_MAX_FREE_BESTS(mp) * + (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + /* + * Calculate which entry we need to fix. + */ + findex = xfs_dir2_db_to_fdindex(mp, db); + longest = be16_to_cpu(data->hdr.bestfree[0].length); + /* + * If the data block is now empty we can get rid of it + * (usually). + */ + if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) { + /* + * Try to punch out the data block. 
+ */ + error = xfs_dir2_shrink_inode(args, db, dbp); + if (error == 0) { + dblk->bp = NULL; + data = NULL; + } + /* + * We can get ENOSPC if there's no space reservation. + * In this case just drop the buffer and some one else + * will eventually get rid of the empty block. + */ + else if (error == ENOSPC && args->total == 0) + xfs_da_buf_done(dbp); + else + return error; + } + /* + * If we got rid of the data block, we can eliminate that entry + * in the free block. + */ + if (data == NULL) { + /* + * One less used entry in the free table. + */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + /* + * If this was the last entry in the table, we can + * trim the table size back. There might be other + * entries at the end referring to non-existent + * data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; + i >= 0 && be16_to_cpu(free->bests[i]) == NULLDATAOFF; + i--) + continue; + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } + /* + * Not the last entry, just punch it out. + */ + else { + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } + /* + * Data block is not empty, just set the free entry to + * the new value. + */ + else { + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + /* + * Log the free entry that changed, unless we got rid of it. + */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + /* + * Drop the buffer if we still have it. + */ + if (fbp) + xfs_da_buf_done(fbp); + } + xfs_dir2_leafn_check(dp, bp); + /* + * Return indication of whether this leaf block is empty enough + * to justify trying to join it with a neighbor. + */ + *rval = + ((uint)sizeof(leaf->hdr) + + (uint)sizeof(leaf->ents[0]) * + (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < + mp->m_dir_magicpct; + return 0; +} + +/* + * Split the leaf entries in the old block into old and new blocks. + */ +int /* error */ +xfs_dir2_leafn_split( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *oldblk, /* original block */ + xfs_da_state_blk_t *newblk) /* newly created block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dablk_t blkno; /* new leaf block number */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + /* + * Allocate space for a new leaf node. + */ + args = state->args; + mp = args->dp->i_mount; + ASSERT(args != NULL); + ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); + error = xfs_da_grow_inode(args, &blkno); + if (error) { + return error; + } + /* + * Initialize the new leaf block. + */ + error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) { + return error; + } + newblk->blkno = blkno; + newblk->magic = XFS_DIR2_LEAFN_MAGIC; + /* + * Rebalance the entries across the two leaves, link the new + * block into the leaves. 
+ */ + xfs_dir2_leafn_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) { + return error; + } + /* + * Insert the new entry in the correct block. + */ + if (state->inleaf) + error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index); + else + error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index); + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); + xfs_dir2_leafn_check(args->dp, oldblk->bp); + xfs_dir2_leafn_check(args->dp, newblk->bp); + return error; +} + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +int /* error */ +xfs_dir2_leafn_toosmall( + xfs_da_state_t *state, /* btree cursor */ + int *action) /* resulting action to take */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dablk_t blkno; /* leaf block number */ + xfs_dabuf_t *bp; /* leaf buffer */ + int bytes; /* bytes in use */ + int count; /* leaf live entry count */ + int error; /* error return value */ + int forward; /* sibling block direction */ + int i; /* sibling counter */ + xfs_da_blkinfo_t *info; /* leaf block header */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int rval; /* result from path_shift */ + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[state->path.active - 1]; + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)info; + count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); + if (bytes > (state->blocksize >> 1)) { + /* + * Blk over 50%, don't try to join. + */ + *action = 0; + return 0; + } + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, 0, + &rval); + if (error) + return error; + *action = rval ? 2 : 0; + return 0; + } + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { + blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + if (blkno == 0) + continue; + /* + * Read the sibling leaf block. 
+ */ + if ((error = + xfs_da_read_buf(state->args->trans, state->args->dp, blkno, + -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + /* + * Count bytes in the two blocks combined. + */ + leaf = (xfs_dir2_leaf_t *)info; + count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + bytes = state->blocksize - (state->blocksize >> 2); + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + bytes -= count * (uint)sizeof(leaf->ents[0]); + /* + * Fits with at least 25% to spare. + */ + if (bytes >= 0) + break; + xfs_da_brelse(state->args->trans, bp); + } + /* + * Didn't like either block, give up. + */ + if (i >= 2) { + *action = 0; + return 0; + } + /* + * Done with the sibling leaf block here, drop the dabuf + * so path_shift can get it. + */ + xfs_da_buf_done(bp); + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) + error = xfs_da_path_shift(state, &state->altpath, forward, 0, + &rval); + else + error = xfs_da_path_shift(state, &state->path, forward, 0, + &rval); + if (error) { + return error; + } + *action = rval ? 0 : 1; + return 0; +} + +/* + * Move all the leaf entries from drop_blk to save_blk. + * This is done as part of a join operation. + */ +void +xfs_dir2_leafn_unbalance( + xfs_da_state_t *state, /* cursor */ + xfs_da_state_blk_t *drop_blk, /* dead block */ + xfs_da_state_blk_t *save_blk) /* surviving block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ + xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + + args = state->args; + ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + /* + * If there are any stale leaf entries, take this opportunity + * to purge them. + */ + if (drop_leaf->hdr.stale) + xfs_dir2_leaf_compact(args, drop_blk->bp); + if (save_leaf->hdr.stale) + xfs_dir2_leaf_compact(args, save_blk->bp); + /* + * Move the entries from drop to the appropriate end of save. + */ + drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); + if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) + xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, + be16_to_cpu(drop_leaf->hdr.count)); + else + xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, + be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); + save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); + xfs_dir2_leafn_check(args->dp, save_blk->bp); +} + +/* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + + /* + * Allocate and initialize the state (btree cursor). 
+ */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. + */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int /* error */ +xfs_dir2_node_addname_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_da_state_blk_t *fblk) /* optional freespace block */ +{ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t dbno; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ + int error; /* error return value */ + xfs_dir2_db_t fbno; /* freespace block number */ + xfs_dabuf_t *fbp; /* freespace buffer */ + int findex; /* freespace entry index */ + xfs_dir2_free_t *free=NULL; /* freespace block structure */ + xfs_dir2_db_t ifbno; /* initial freespace block no */ + xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ + int length; /* length of the new entry */ + int logfree; /* need to log free entry */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + length = xfs_dir2_data_entsize(args->namelen); + /* + * If we came in with a freespace block that means that lookup + * found an entry with our hash value. This is the freespace + * block for that data entry. + */ + if (fblk) { + fbp = fblk->bp; + /* + * Remember initial freespace block number. + */ + ifbno = fblk->blkno; + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + findex = fblk->index; + /* + * This means the free entry showed that the data block had + * space for our entry, so we remembered it. + * Use that data block. + */ + if (findex >= 0) { + ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); + ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(free->bests[findex]) >= length); + dbno = be32_to_cpu(free->hdr.firstdb) + findex; + } + /* + * The data block looked at didn't have enough room. 
+ * We'll start at the beginning of the freespace entries. + */ + else { + dbno = -1; + findex = 0; + } + } + /* + * Didn't come in with a freespace block, so don't have a data block. + */ + else { + ifbno = dbno = -1; + fbp = NULL; + findex = 0; + } + /* + * If we don't have a data block yet, we're going to scan the + * freespace blocks looking for one. Figure out what the + * highest freespace block number is. + */ + if (dbno == -1) { + xfs_fileoff_t fo; /* freespace block number */ + + if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + return error; + lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); + fbno = ifbno; + } + /* + * While we haven't identified a data block, search the freeblock + * data for a good data block. If we find a null freeblock entry, + * indicating a hole in the data blocks, remember that. + */ + while (dbno == -1) { + /* + * If we don't have a freeblock in hand, get the next one. + */ + if (fbp == NULL) { + /* + * Happens the first time through unless lookup gave + * us a freespace block to start with. + */ + if (++fbno == 0) + fbno = XFS_DIR2_FREE_FIRSTDB(mp); + /* + * If it's ifbno we already looked at it. + */ + if (fbno == ifbno) + fbno++; + /* + * If it's off the end we're done. + */ + if (fbno >= lastfbno) + break; + /* + * Read the block. There can be holes in the + * freespace blocks, so this might not succeed. + * This should be really rare, so there's no reason + * to avoid it. + */ + if ((error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, + XFS_DATA_FORK))) { + return error; + } + if (unlikely(fbp == NULL)) { + continue; + } + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + findex = 0; + } + /* + * Look at the current free entry. Is it good enough? + */ + if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && + be16_to_cpu(free->bests[findex]) >= length) + dbno = be32_to_cpu(free->hdr.firstdb) + findex; + else { + /* + * Are we done with the freeblock? + */ + if (++findex == be32_to_cpu(free->hdr.nvalid)) { + /* + * Drop the block. + */ + xfs_da_brelse(tp, fbp); + fbp = NULL; + if (fblk && fblk->bp) + fblk->bp = NULL; + } + } + } + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (unlikely(dbno == -1)) { + /* + * Not allowed to allocate, return failure. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + /* + * Drop the freespace buffer unless it came from our + * caller. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return XFS_ERROR(ENOSPC); + } + /* + * Allocate and initialize the new data block. + */ + if (unlikely((error = xfs_dir2_grow_inode(args, + XFS_DIR2_DATA_SPACE, + &dbno)) || + (error = xfs_dir2_data_init(args, dbno, &dbp)))) { + /* + * Drop the freespace buffer unless it came from our + * caller. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return error; + } + /* + * If (somehow) we have a freespace block, get rid of it. + */ + if (fbp) + xfs_da_brelse(tp, fbp); + if (fblk && fblk->bp) + fblk->bp = NULL; + + /* + * Get the freespace block corresponding to the data block + * that was just allocated. + */ + fbno = xfs_dir2_db_to_fdb(mp, dbno); + if (unlikely(error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, + XFS_DATA_FORK))) { + xfs_da_buf_done(dbp); + return error; + } + /* + * If there wasn't a freespace block, the read will + * return a NULL fbp. 
Allocate and initialize a new one. + */ + if( fbp == NULL ) { + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno))) { + return error; + } + + if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { + xfs_alert(mp, + "%s: dir ino " "%llu needed freesp block %lld for\n" + " data block %lld, got %lld ifbno %llu lastfbno %d", + __func__, (unsigned long long)dp->i_ino, + (long long)xfs_dir2_db_to_fdb(mp, dbno), + (long long)dbno, (long long)fbno, + (unsigned long long)ifbno, lastfbno); + if (fblk) { + xfs_alert(mp, + " fblk 0x%p blkno %llu index %d magic 0x%x", + fblk, + (unsigned long long)fblk->blkno, + fblk->index, + fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + XFS_ERROR_REPORT("xfs_dir2_node_addname_int", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Get a buffer for the new block. + */ + if ((error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK))) { + return error; + } + ASSERT(fbp != NULL); + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + free = fbp->data; + free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); + free->hdr.firstdb = cpu_to_be32( + (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * + XFS_DIR2_MAX_FREE_BESTS(mp)); + free->hdr.nvalid = 0; + free->hdr.nused = 0; + } else { + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + } + + /* + * Set the freespace block index from the data block number. + */ + findex = xfs_dir2_db_to_fdindex(mp, dbno); + /* + * If it's after the end of the current entries in the + * freespace block, extend that table. + */ + if (findex >= be32_to_cpu(free->hdr.nvalid)) { + ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp)); + free->hdr.nvalid = cpu_to_be32(findex + 1); + /* + * Tag new entry so nused will go up. + */ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + } + /* + * If this entry was for an empty data block + * (this should always be true) then update the header. + */ + if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) { + be32_add_cpu(&free->hdr.nused, 1); + xfs_dir2_free_log_header(tp, fbp); + } + /* + * Update the real value in the table. + * We haven't allocated the data entry yet so this will + * change again. + */ + data = dbp->data; + free->bests[findex] = data->hdr.bestfree[0].length; + logfree = 1; + } + /* + * We had a data block so we don't have to make a new one. + */ + else { + /* + * If just checking, we succeeded. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return 0; + } + /* + * Read the data block in. + */ + if (unlikely( + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp, XFS_DATA_FORK))) { + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return error; + } + data = dbp->data; + logfree = 0; + } + ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) >= length); + /* + * Point to the existing unused space. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset)); + needscan = needlog = 0; + /* + * Mark the first part of the unused space, inuse for us. + */ + xfs_dir2_data_use_free(tp, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + &needlog, &needscan); + /* + * Fill in the new entry and log it. 
+ */ + dep = (xfs_dir2_data_entry_t *)dup; + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)data); + xfs_dir2_data_log_entry(tp, dbp, dep); + /* + * Rescan the block for bestfree if needed. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog); + /* + * Log the data block header if needed. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * If the freespace entry is now wrong, update it. + */ + if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(data->hdr.bestfree[0].length)) { + free->bests[findex] = data->hdr.bestfree[0].length; + logfree = 1; + } + /* + * Log the freespace entry if needed. + */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + /* + * If the caller didn't hand us the freespace block, drop it. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + /* + * Return the data block and offset in args, then drop the data block. + */ + args->blkno = (xfs_dablk_t)dbno; + args->index = be16_to_cpu(*tagp); + xfs_da_buf_done(dbp); + return 0; +} + +/* + * Lookup an entry in a node-format directory. + * All the real work happens in xfs_da_node_lookup_int. + * The only real output is the inode number of the entry. + */ +int /* error */ +xfs_dir2_node_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + int error; /* error return value */ + int i; /* btree level */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_lookup(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Fill in the path to the entry in the cursor. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> + data + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } + /* + * Release the btree blocks and leaf block. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + /* + * Release the data block if we have it. + */ + if (state->extravalid && state->extrablk.bp) { + xfs_da_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Remove an entry from a node-format directory. + */ +int /* error */ +xfs_dir2_node_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + int error; /* error return value */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_removename(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Look up the entry we're deleting, set up the cursor. 
+ */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + /* + * Didn't find it, upper layer screwed up. + */ + if (rval != EEXIST) { + xfs_da_state_free(state); + return rval; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(state->extravalid); + /* + * Remove the leaf and data entries. + * Extrablk refers to the data block. + */ + error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, + &state->extrablk, &rval); + if (error) + return error; + /* + * Fix the hash values up the btree. + */ + xfs_da_fixhashpath(state, &state->path); + /* + * If we need to join leaf blocks, do it. + */ + if (rval && state->path.active > 1) + error = xfs_da_join(state); + /* + * If no errors so far, try conversion to leaf format. + */ + if (!error) + error = xfs_dir2_node_to_leaf(state); + xfs_da_state_free(state); + return error; +} + +/* + * Replace an entry's inode number in a node-format directory. + */ +int /* error */ +xfs_dir2_node_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_entry_t *dep; /* data entry changed */ + int error; /* error return value */ + int i; /* btree level */ + xfs_ino_t inum; /* new inode number */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ + int rval; /* internal return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_replace(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + inum = args->inumber; + /* + * Lookup the entry to change in the btree. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) { + rval = error; + } + /* + * It should be found, since the vnodeops layer has looked it up + * and locked it. But paranoia is good. + */ + if (rval == EEXIST) { + /* + * Find the leaf entry. + */ + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + leaf = blk->bp->data; + lep = &leaf->ents[blk->index]; + ASSERT(state->extravalid); + /* + * Point to the data entry. + */ + data = state->extrablk.bp->data; + ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); + dep = (xfs_dir2_data_entry_t *) + ((char *)data + + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); + ASSERT(inum != be64_to_cpu(dep->inumber)); + /* + * Fill in the new inode number and log the entry. + */ + dep->inumber = cpu_to_be64(inum); + xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); + rval = 0; + } + /* + * Didn't find it, and we're holding a data block. Drop it. + */ + else if (state->extravalid) { + xfs_da_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + /* + * Release all the buffers in the cursor. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Trim off a trailing empty freespace block. + * Return (in rvalp) 1 if we did it, 0 if not. 
+ */ +int /* error */ +xfs_dir2_node_trim_free( + xfs_da_args_t *args, /* operation arguments */ + xfs_fileoff_t fo, /* free block number */ + int *rvalp) /* out: did something */ +{ + xfs_dabuf_t *bp; /* freespace buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Read the freespace block. + */ + if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, + XFS_DATA_FORK))) { + return error; + } + + /* + * There can be holes in freespace. If fo is a hole, there's + * nothing to do. + */ + if (bp == NULL) { + return 0; + } + free = bp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + /* + * If there are used entries, there's nothing to do. + */ + if (be32_to_cpu(free->hdr.nused) > 0) { + xfs_da_brelse(tp, bp); + *rvalp = 0; + return 0; + } + /* + * Blow the block away. + */ + if ((error = + xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), + bp))) { + /* + * Can't fail with ENOSPC since that only happens with no + * space reservation, when breaking up an extent into two + * pieces. This is the last block of an extent. + */ + ASSERT(error != ENOSPC); + xfs_da_brelse(tp, bp); + return error; + } + /* + * Return that we succeeded. + */ + *rvalp = 1; + return 0; +} diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h new file mode 100644 index 00000000..82dfe714 --- /dev/null +++ b/fs/xfs/xfs_dir2_node.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_NODE_H__ +#define __XFS_DIR2_NODE_H__ + +/* + * Directory version 2, btree node format structures + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_FREE_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) + +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[1]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +#define XFS_DIR2_MAX_FREE_BESTS(mp) \ + (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \ + (uint)sizeof(xfs_dir2_data_off_t)) + +/* + * Convert data space db to the corresponding free db. 
+ */ +static inline xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static inline int +xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); +} + +extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); +extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +extern int xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); +extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +extern int xfs_dir2_node_addname(struct xfs_da_args *args); +extern int xfs_dir2_node_lookup(struct xfs_da_args *args); +extern int xfs_dir2_node_removename(struct xfs_da_args *args); +extern int xfs_dir2_node_replace(struct xfs_da_args *args); +extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); + +#endif /* __XFS_DIR2_NODE_H__ */ diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c new file mode 100644 index 00000000..b1bae6b1 --- /dev/null +++ b/fs/xfs/xfs_dir2_sf.c @@ -0,0 +1,1267 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_trace.h" + +/* + * Prototypes for internal functions. 
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+				     xfs_dir2_sf_entry_t *sfep,
+				     xfs_dir2_data_aoff_t offset,
+				     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+				     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+				    xfs_dir2_sf_entry_t **sfepp,
+				    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define	xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_INUMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_INUMS */
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit in the
+ * space currently present in the inode.  If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int						/* size for sf form */
+xfs_dir2_block_sfsize(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dir2_block_t	*block,		/* block directory data */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
+	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
+	int			count;		/* shortform entry count */
+	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
+	int			i;		/* block entry index */
+	int			i8count;	/* count of big-inode entries */
+	int			isdot;		/* entry is "." */
+	int			isdotdot;	/* entry is ".." */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+	int			namelen;	/* total name bytes */
+	xfs_ino_t		parent = 0;	/* parent inode number */
+	int			size=0;		/* total computed size */
+
+	mp = dp->i_mount;
+
+	count = i8count = namelen = 0;
+	btp = xfs_dir2_block_tail_p(mp, block);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Iterate over the block's data entries by using the leaf pointers.
+	 */
+	for (i = 0; i < be32_to_cpu(btp->count); i++) {
+		if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Calculate the pointer to the entry at hand.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
+		/*
+		 * Detect . and .., so we can special-case them.
+		 * . is not included in sf directories.
+		 * .. is included by just the parent inode number.
+		 */
+		isdot = dep->namelen == 1 && dep->name[0] == '.';
+		isdotdot =
+			dep->namelen == 2 &&
+			dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_INUMS
+		if (!isdot)
+			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+		if (!isdot && !isdotdot) {
+			count++;
+			namelen += dep->namelen;
+		} else if (isdotdot)
+			parent = be64_to_cpu(dep->inumber);
+		/*
+		 * Calculate the new size, see if we should give up yet.
+		 */
+		size = xfs_dir2_sf_hdr_size(i8count) +		/* header */
+		       count +					/* namelen */
+		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+		       namelen +				/* name */
+		       (i8count ?				/* inumber */
+				(uint)sizeof(xfs_dir2_ino8_t) * count :
+				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		if (size > XFS_IFORK_DSIZE(dp))
+			return size;		/* size value is a failure */
+	}
+	/*
+	 * Create the output header, if it worked.
+	 */
+	sfhp->count = count;
+	sfhp->i8count = i8count;
+	xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
+	return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */ +int /* error */ +xfs_dir2_block_to_sf( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp, /* block buffer */ + int size, /* shortform directory size */ + xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data pointer */ + char *endptr; /* end of data entries */ + int error; /* error return value */ + int logflags; /* inode logging flags */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data pointer */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_ino_t temp; + + trace_xfs_dir2_block_to_sf(args); + + dp = args->dp; + mp = dp->i_mount; + + /* + * Make a copy of the block data, so we can shrink the inode + * and add local data. + */ + block = kmem_alloc(mp->m_dirblksize, KM_SLEEP); + memcpy(block, bp->data, mp->m_dirblksize); + logflags = XFS_ILOG_CORE; + if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { + ASSERT(error != ENOSPC); + goto out; + } + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + logflags |= XFS_ILOG_DDATA; + /* + * Copy the header into the newly allocate local space. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); + dp->i_d.di_size = size; + /* + * Set up to loop over the block's entries. + */ + btp = xfs_dir2_block_tail_p(mp, block); + ptr = (char *)block->u; + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Loop over the active and unused entries. + * Stop when we reach the leaf/tail portion of the block. + */ + while (ptr < endptr) { + /* + * If it's unused, just skip over it. + */ + dup = (xfs_dir2_data_unused_t *)ptr; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + /* + * Skip . + */ + if (dep->namelen == 1 && dep->name[0] == '.') + ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino); + /* + * Skip .., but make sure the inode number is right. + */ + else if (dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.') + ASSERT(be64_to_cpu(dep->inumber) == + xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); + /* + * Normal entry, copy it into shortform. + */ + else { + sfep->namelen = dep->namelen; + xfs_dir2_sf_put_offset(sfep, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)block)); + memcpy(sfep->name, dep->name, dep->namelen); + temp = be64_to_cpu(dep->inumber); + xfs_dir2_sf_put_inumber(sfp, &temp, + xfs_dir2_sf_inumberp(sfep)); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + ptr += xfs_dir2_data_entsize(dep->namelen); + } + ASSERT((char *)sfep - (char *)sfp == size); + xfs_dir2_sf_check(args); +out: + xfs_trans_log_inode(args->trans, dp, logflags); + kmem_free(block); + return error; +} + +/* + * Add a name to a shortform directory. + * There are two algorithms, "easy" and "hard" which we decide on + * before changing anything. 
+ * Convert to block form if necessary, if the new entry won't fit. + */ +int /* error */ +xfs_dir2_sf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + int add_entsize; /* size of the new entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int incr_isize; /* total change in size */ + int new_isize; /* di_size after adding name */ + int objchange; /* changing to 8-byte inodes */ + xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ + int old_isize; /* di_size before adding name */ + int pick; /* which algorithm to use */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ + + trace_xfs_dir2_sf_addname(args); + + ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Make sure the shortform value has some of its header. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Compute entry (and change in) size. + */ + add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); + incr_isize = add_entsize; + objchange = 0; +#if XFS_BIG_INUMS + /* + * Do we have to change to 8 byte inodes? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + /* + * Yes, adjust the entry size and the total size. + */ + add_entsize += + (uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t); + incr_isize += + (sfp->hdr.count + 2) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + objchange = 1; + } +#endif + old_isize = (int)dp->i_d.di_size; + new_isize = old_isize + incr_isize; + /* + * Won't fit as shortform any more (due to size), + * or the pick routine says it won't (due to offset values). + */ + if (new_isize > XFS_IFORK_DSIZE(dp) || + (pick = + xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { + /* + * Just checking or no space reservation, it doesn't fit. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return XFS_ERROR(ENOSPC); + /* + * Convert to block form then add the name. + */ + error = xfs_dir2_sf_to_block(args); + if (error) + return error; + return xfs_dir2_block_addname(args); + } + /* + * Just checking, it fits. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + /* + * Do it the easy way - just add it at the end. + */ + if (pick == 1) + xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize); + /* + * Do it the hard way - look for a place to insert the new entry. + * Convert to 8 byte inode numbers first if necessary. + */ + else { + ASSERT(pick == 2); +#if XFS_BIG_INUMS + if (objchange) + xfs_dir2_sf_toino8(args); +#endif + xfs_dir2_sf_addname_hard(args, objchange, new_isize); + } + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Add the new entry the "easy" way. + * This is copying the old directory and adding the new entry at the end. + * Since it's sorted by "offset" we need room after the last offset + * that's already there, and then room to convert to a block directory. + * This is already checked by the pick routine. 
+ */ +static void +xfs_dir2_sf_addname_easy( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */ + xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ + int new_isize) /* new directory size */ +{ + int byteoff; /* byte offset in sf dir */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + byteoff = (int)((char *)sfep - (char *)sfp); + /* + * Grow the in-inode space. + */ + xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen), + XFS_DATA_FORK); + /* + * Need to set up again due to realloc of the inode data. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); + /* + * Fill in the new entry. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); + /* + * Update the header and inode. + */ + sfp->hdr.count++; +#if XFS_BIG_INUMS + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) + sfp->hdr.i8count++; +#endif + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Add the new entry the "hard" way. + * The caller has already converted to 8 byte inode numbers if necessary, + * in which case we need to leave the i8count at 1. + * Find a hole that the new entry will fit into, and copy + * the first part of the entries, the new entry, and the last part of + * the entries. + */ +/* ARGSUSED */ +static void +xfs_dir2_sf_addname_hard( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* changing inode number size */ + int new_isize) /* new directory size */ +{ + int add_datasize; /* data size need for new ent */ + char *buf; /* buffer for old */ + xfs_inode_t *dp; /* incore directory inode */ + int eof; /* reached end of old dir */ + int nbytes; /* temp for byte copies */ + xfs_dir2_data_aoff_t new_offset; /* next offset value */ + xfs_dir2_data_aoff_t offset; /* current offset value */ + int old_isize; /* previous di_size */ + xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ + xfs_dir2_sf_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ + xfs_dir2_sf_t *sfp; /* new shortform dir */ + + /* + * Copy the old directory to the stack buffer. + */ + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + old_isize = (int)dp->i_d.di_size; + buf = kmem_alloc(old_isize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)buf; + memcpy(oldsfp, sfp, old_isize); + /* + * Loop over the old directory finding the place we're going + * to insert the new entry. + * If it's going to end up at the end then oldsfep will point there. + */ + for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = xfs_dir2_data_entsize(args->namelen), + eof = (char *)oldsfep == &buf[old_isize]; + !eof; + offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), + eof = (char *)oldsfep == &buf[old_isize]) { + new_offset = xfs_dir2_sf_get_offset(oldsfep); + if (offset + add_datasize <= new_offset) + break; + } + /* + * Get rid of the old directory, then allocate space for + * the new one. We do this so xfs_idata_realloc won't copy + * the data. 
+ */ + xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); + xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* + * Reset the pointer since the buffer was reallocated. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Copy the first part of the directory, including the header. + */ + nbytes = (int)((char *)oldsfep - (char *)oldsfp); + memcpy(sfp, oldsfp, nbytes); + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes); + /* + * Fill in the new entry, and update the header counts. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); + sfp->hdr.count++; +#if XFS_BIG_INUMS + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) + sfp->hdr.i8count++; +#endif + /* + * If there's more left to copy, do that. + */ + if (!eof) { + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + memcpy(sfep, oldsfep, old_isize - nbytes); + } + kmem_free(buf); + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Decide if the new entry will fit at all. + * If it will fit, pick between adding the new entry to the end (easy) + * or somewhere else (hard). + * Return 0 (won't fit), 1 (easy), 2 (hard). + */ +/*ARGSUSED*/ +static int /* pick result */ +xfs_dir2_sf_addname_pick( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* inode # size changes */ + xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ + xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int holefit; /* found hole it will fit in */ + int i; /* entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_aoff_t offset; /* data block offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + int size; /* entry's data size */ + int used; /* data bytes used */ + + dp = args->dp; + mp = dp->i_mount; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + size = xfs_dir2_data_entsize(args->namelen); + offset = XFS_DIR2_DATA_FIRST_OFFSET; + sfep = xfs_dir2_sf_firstentry(sfp); + holefit = 0; + /* + * Loop over sf entries. + * Keep track of data offset and whether we've seen a place + * to insert the new entry. + */ + for (i = 0; i < sfp->hdr.count; i++) { + if (!holefit) + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + /* + * Calculate data bytes used excluding the new entry, if this + * was a data block (block form directory). + */ + used = offset + + (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t); + /* + * If it won't fit in a block form then we can't insert it, + * we'll go back, convert to block, then try the insert and convert + * to leaf. + */ + if (used + (holefit ? 0 : size) > mp->m_dirblksize) + return 0; + /* + * If changing the inode number size, do it the hard way. + */ +#if XFS_BIG_INUMS + if (objchange) { + return 2; + } +#else + ASSERT(objchange == 0); +#endif + /* + * If it won't fit at the end then do it the hard way (use the hole). + */ + if (used + size > mp->m_dirblksize) + return 2; + /* + * Do it the easy way. + */ + *sfepp = sfep; + *offsetp = offset; + return 1; +} + +#ifdef DEBUG +/* + * Check consistency of shortform directory, assert if bad. 
+ */ +static void +xfs_dir2_sf_check( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry number */ + int i8count; /* number of big inode#s */ + xfs_ino_t ino; /* entry inode number */ + int offset; /* data offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + offset = XFS_DIR2_DATA_FIRST_OFFSET; + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + offset = + xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); + } + ASSERT(i8count == sfp->hdr.i8count); + ASSERT(XFS_BIG_INUMS || i8count == 0); + ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT(offset + + (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= + dp->i_mount->m_dirblksize); +} +#endif /* DEBUG */ + +/* + * Create a new (shortform) directory. + */ +int /* error, always 0 */ +xfs_dir2_sf_create( + xfs_da_args_t *args, /* operation arguments */ + xfs_ino_t pino) /* parent inode number */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i8count; /* parent inode is an 8-byte number */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + int size; /* directory size */ + + trace_xfs_dir2_sf_create(args); + + dp = args->dp; + + ASSERT(dp != NULL); + ASSERT(dp->i_d.di_size == 0); + /* + * If it's currently a zero-length extent file, + * convert it to local format. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + dp->i_df.if_flags |= XFS_IFINLINE; + } + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_bytes == 0); + i8count = pino > XFS_DIR2_MAX_SHORT_INUM; + size = xfs_dir2_sf_hdr_size(i8count); + /* + * Make a buffer for the data. + */ + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + /* + * Fill in the header, + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp->hdr.i8count = i8count; + /* + * Now can put in the inode number, since i8count is set. + */ + xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent); + sfp->hdr.count = 0; + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +int /* error */ +xfs_dir2_sf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + void *dirent, + xfs_off_t *offset, + filldir_t filldir) +{ + int i; /* shortform entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + + mp = dp->i_mount; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. 
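xfs_dir2_sf_create() leaves the directory holding nothing but the three-field header, so an empty directory costs 6 bytes, or 10 when the parent needs an 8-byte inode number. A standalone sketch of that size calculation; the parent inode numbers below are invented.

/*
 * Sketch of the size of a freshly created shortform directory:
 * just the header - an entry count, an i8count, and the parent
 * inode number (4 or 8 bytes).  Illustration only.
 */
#include <stdio.h>
#include <stdint.h>

static int sf_hdr_size(int i8count)
{
	/* count + i8count + parent (8 bytes, minus 4 if all inums fit in 32 bits) */
	return 1 + 1 + 8 - (i8count == 0 ? 4 : 0);
}

int main(void)
{
	uint64_t parent_small = 128;		/* fits in 32 bits */
	uint64_t parent_big = 1ULL << 40;	/* needs 8-byte storage */

	printf("empty dir, small parent inum: di_size = %d bytes\n",
	       sf_hdr_size(parent_small > 0xffffffffULL));
	printf("empty dir, large parent inum: di_size = %d bytes\n",
	       sf_hdr_size(parent_big > 0xffffffffULL));
	return 0;
}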
+ */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * mp->m_dirdatablk. + */ + dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOT_OFFSET); + dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOTDOT_OFFSET); + + /* + * Put . entry unless we're starting past it. + */ + if (*offset <= dot_offset) { + if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { + *offset = dot_offset & 0x7fffffff; + return 0; + } + } + + /* + * Put .. entry unless we're starting past it. + */ + if (*offset <= dotdot_offset) { + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { + *offset = dotdot_offset & 0x7fffffff; + return 0; + } + } + + /* + * Loop while there are more entries and put'ing works. + */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->hdr.count; i++) { + off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + xfs_dir2_sf_get_offset(sfep)); + + if (*offset > off) { + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + continue; + } + + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); + if (filldir(dirent, (char *)sfep->name, sfep->namelen, + off & 0x7fffffff, ino, DT_UNKNOWN)) { + *offset = off & 0x7fffffff; + return 0; + } + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + + *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Lookup an entry in a shortform directory. + * Returns EEXIST if found, ENOENT if not found. + */ +int /* error */ +xfs_dir2_sf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int error; + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ + + trace_xfs_dir2_sf_lookup(args); + + xfs_dir2_sf_check(args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Special case for . + */ + if (args->namelen == 1 && args->name[0] == '.') { + args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; + return XFS_ERROR(EEXIST); + } + /* + * Special case for .. + */ + if (args->namelen == 2 && + args->name[0] == '.' 
&& args->name[1] == '.') { + args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + args->cmpresult = XFS_CMP_EXACT; + return XFS_ERROR(EEXIST); + } + /* + * Loop over all the entries trying to match ours. + */ + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + ci_sfep = sfep; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return ENOENT. + */ + if (!ci_sfep) + return XFS_ERROR(ENOENT); + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return XFS_ERROR(error); +} + +/* + * Remove an entry from a shortform directory. + */ +int /* error */ +xfs_dir2_sf_removename( + xfs_da_args_t *args) +{ + int byteoff; /* offset of removed entry */ + xfs_inode_t *dp; /* incore directory inode */ + int entsize; /* this entry's size */ + int i; /* shortform entry index */ + int newsize; /* new inode size */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_removename(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + oldsize = (int)dp->i_d.di_size; + /* + * Bail out if the directory is way too short. + */ + if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == oldsize); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Loop over the old directory entries. + * Find the one we're deleting. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ASSERT(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)) == + args->inumber); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->hdr.count) + return XFS_ERROR(ENOENT); + /* + * Calculate sizes. + */ + byteoff = (int)((char *)sfep - (char *)sfp); + entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); + newsize = oldsize - entsize; + /* + * Copy the part if any after the removed entry, sliding it down. + */ + if (byteoff + entsize < oldsize) + memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize, + oldsize - (byteoff + entsize)); + /* + * Fix up the header and file size. + */ + sfp->hdr.count--; + dp->i_d.di_size = newsize; + /* + * Reallocate, making it smaller. + */ + xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; +#if XFS_BIG_INUMS + /* + * Are we changing inode number size? 
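The lookup loop above implements a simple precedence rule: an exact match returns immediately, while the first case-insensitive match is remembered in ci_sfep and used only if no exact match exists. A generic standalone sketch of that policy; plain libc string functions stand in for the m_dirnameops comparison and the names are invented.

/*
 * Sketch of "exact match wins, first case-insensitive match is the
 * fallback".  Standalone example, not the kernel code.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>	/* strcasecmp (POSIX) */

static const char *names[] = { "Makefile", "readme", "README" };

static int lookup(const char *want)
{
	int ci_hit = -1;
	for (int i = 0; i < 3; i++) {
		if (strcmp(names[i], want) == 0)
			return i;		/* exact match wins immediately */
		if (ci_hit < 0 && strcasecmp(names[i], want) == 0)
			ci_hit = i;		/* remember first CI match */
	}
	return ci_hit;				/* -1 means "not found" */
}

int main(void)
{
	printf("lookup(\"README\")  -> %d\n", lookup("README"));	/* exact: 2 */
	printf("lookup(\"ReadMe\")  -> %d\n", lookup("ReadMe"));	/* CI:    1 */
	printf("lookup(\"missing\") -> %d\n", lookup("missing"));	/* none: -1 */
	return 0;
}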
+ */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + if (sfp->hdr.i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->hdr.i8count--; + } +#endif + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Replace the inode number of an entry in a shortform directory. + */ +int /* error */ +xfs_dir2_sf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ +#if XFS_BIG_INUMS || defined(DEBUG) + xfs_ino_t ino=0; /* entry old inode number */ +#endif +#if XFS_BIG_INUMS + int i8elevated; /* sf_toino8 set i8count=1 */ +#endif + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_replace(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the shortform directory is way too small. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); +#if XFS_BIG_INUMS + /* + * New inode number is large, and need to convert to 8-byte inodes. + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + int error; /* error return value */ + int newsize; /* new inode size */ + + newsize = + dp->i_df.if_bytes + + (sfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + /* + * Won't fit as shortform, convert to block then do replace. + */ + if (newsize > XFS_IFORK_DSIZE(dp)) { + error = xfs_dir2_sf_to_block(args); + if (error) { + return error; + } + return xfs_dir2_block_replace(args); + } + /* + * Still fits, convert to 8-byte now. + */ + xfs_dir2_sf_toino8(args); + i8elevated = 1; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + } else + i8elevated = 0; +#endif + ASSERT(args->namelen != 1 || args->name[0] != '.'); + /* + * Replace ..'s entry. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { +#if XFS_BIG_INUMS || defined(DEBUG) + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + ASSERT(args->inumber != ino); +#endif + xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent); + } + /* + * Normal entry, look for the name. + */ + else { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { +#if XFS_BIG_INUMS || defined(DEBUG) + ino = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); + ASSERT(args->inumber != ino); +#endif + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->hdr.count) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); +#if XFS_BIG_INUMS + if (i8elevated) + xfs_dir2_sf_toino4(args); +#endif + return XFS_ERROR(ENOENT); + } + } +#if XFS_BIG_INUMS + /* + * See if the old number was large, the new number is small. + */ + if (ino > XFS_DIR2_MAX_SHORT_INUM && + args->inumber <= XFS_DIR2_MAX_SHORT_INUM) { + /* + * And the old count was one, so need to convert to small. 
+ */ + if (sfp->hdr.i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->hdr.i8count--; + } + /* + * See if the old number was small, the new number is large. + */ + if (ino <= XFS_DIR2_MAX_SHORT_INUM && + args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + /* + * add to the i8count unless we just converted to 8-byte + * inodes (which does an implied i8count = 1) + */ + ASSERT(sfp->hdr.i8count != 0); + if (!i8elevated) + sfp->hdr.i8count++; + } +#endif + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return 0; +} + +#if XFS_BIG_INUMS +/* + * Convert from 8-byte inode numbers to 4-byte inode numbers. + * The last 8-byte inode number is gone, but the count is still 1. + */ +static void +xfs_dir2_sf_toino4( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino; /* entry inode number */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino4(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->hdr.i8count == 1); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize - + (oldsfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_t *)buf; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->hdr.count = oldsfp->hdr.count; + sfp->hdr.i8count = 0; + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} + +/* + * Convert from 4-byte inode numbers to 8-byte inode numbers. + * The new 8-byte inode number is not there yet, we leave with the + * count 1 but no corresponding entry. 
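Both width conversions only change the size of the stored inode numbers, so the delta is four bytes per entry plus four for the parent pointer held in the header, i.e. (count + 1) * 4. Worked arithmetic with invented sizes:

/*
 * Illustrative arithmetic for xfs_dir2_sf_toino4()/toino8().
 * The entry count and byte counts are made up.
 */
#include <stdio.h>

int main(void)
{
	int count = 5;		/* entries in the shortform directory */
	int oldsize = 60;	/* current if_bytes with 4-byte inums */
	int delta = (count + 1) * (8 - 4);	/* +1 for the parent */

	printf("toino8: %d -> %d bytes\n", oldsize, oldsize + delta);
	printf("toino4: %d -> %d bytes\n", oldsize + delta, oldsize);
	return 0;
}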
+ */ +static void +xfs_dir2_sf_toino8( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino; /* entry inode number */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino8(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->hdr.i8count == 0); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize + + (oldsfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_t *)buf; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->hdr.count = oldsfp->hdr.count; + sfp->hdr.i8count = 1; + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} +#endif /* XFS_BIG_INUMS */ diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h new file mode 100644 index 00000000..6ac44b55 --- /dev/null +++ b/fs/xfs/xfs_dir2_sf.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_SF_H__ +#define __XFS_DIR2_SF_H__ + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to + * fit into the literal area of the inode. 
+ */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_block; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * The parent directory has a dedicated field, and the self-pointer must + * be calculated on the fly. + * + * Entries are packed toward the top as tightly as possible. The header + * and the elements must be memcpy'd out into a work area to get correct + * alignment for the inode number fields. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __uint8_t namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __uint8_t name[1]; /* name, variable size */ + xfs_dir2_inou_t inumber; /* inode number, var. offset */ +} __arch_pack xfs_dir2_sf_entry_t; + +typedef struct xfs_dir2_sf { + xfs_dir2_sf_hdr_t hdr; /* shortform header */ + xfs_dir2_sf_entry_t list[1]; /* shortform entries */ +} xfs_dir2_sf_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ + ((i8count) == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); +} + +static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) +{ + return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; +} + +static inline xfs_intino_t +xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) +{ + return ((sfp)->hdr.i8count == 0 ? 
\ + (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \ + (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); +} + +static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, + xfs_dir2_inou_t *to) +{ + if ((sfp)->hdr.i8count == 0) + XFS_PUT_DIR_INO4(*(from), (to)->i4); + else + XFS_PUT_DIR_INO8(*(from), (to)->i8); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); +} + +static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) +{ + return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ + ((sfp)->hdr.i8count == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); +} + +static inline int +xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) +{ + return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \ + ((sfp)->hdr.i8count == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); +} + +static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) +{ + return ((xfs_dir2_sf_entry_t *) \ + ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count))); +} + +static inline xfs_dir2_sf_entry_t * +xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) +{ + return ((xfs_dir2_sf_entry_t *) \ + ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep))); +} + +/* + * Functions. + */ +extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_block *block, + xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_sf_addname(struct xfs_da_args *args); +extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); +extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_sf_removename(struct xfs_da_args *args); +extern int xfs_dir2_sf_replace(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_SF_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c new file mode 100644 index 00000000..39f06336 --- /dev/null +++ b/fs/xfs/xfs_error.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
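Because shortform entries are packed byte-for-byte, the offset and inode-number fields can land at any alignment, which is why they are declared as byte arrays and read through the get/put helpers above rather than with plain loads. A standalone sketch of the byte-at-a-time big-endian access those helpers perform; the function names here are illustrative, not the kernel macros.

/*
 * Unaligned big-endian 16-bit get/put, the style of access behind
 * xfs_dir2_sf_get_offset()/put_offset().  Standalone sketch.
 */
#include <stdio.h>
#include <stdint.h>

static uint16_t get16_be(const uint8_t *p)
{
	return (uint16_t)((p[0] << 8) | p[1]);	/* safe at any alignment */
}

static void put16_be(uint8_t *p, uint16_t v)
{
	p[0] = (uint8_t)(v >> 8);
	p[1] = (uint8_t)v;
}

int main(void)
{
	uint8_t buf[5] = { 0 };		/* entry-like blob, deliberately odd-sized */
	put16_be(&buf[1], 0x0630);	/* store an offset at an odd address */
	printf("offset read back: 0x%04x\n", get16_be(&buf[1]));
	return 0;
}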
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_utils.h" +#include "xfs_error.h" + +#ifdef DEBUG + +int xfs_etrap[XFS_ERROR_NTRAP] = { + 0, +}; + +int +xfs_error_trap(int e) +{ + int i; + + if (!e) + return 0; + for (i = 0; i < XFS_ERROR_NTRAP; i++) { + if (xfs_etrap[i] == 0) + break; + if (e != xfs_etrap[i]) + continue; + xfs_notice(NULL, "%s: error %d", __func__, e); + BUG(); + break; + } + return e; +} + +int xfs_etest[XFS_NUM_INJECT_ERROR]; +int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; +char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; + +int +xfs_error_test(int error_tag, int *fsidp, char *expression, + int line, char *file, unsigned long randfactor) +{ + int i; + int64_t fsid; + + if (random32() % randfactor) + return 0; + + memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { + xfs_warn(NULL, + "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, xfs_etest_fsname[i]); + return 1; + } + } + + return 0; +} + +int +xfs_errortag_add(int error_tag, xfs_mount_t *mp) +{ + int i; + int len; + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { + xfs_warn(mp, "error tag #%d on", error_tag); + return 0; + } + } + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == 0) { + xfs_warn(mp, "Turned on XFS error tag #%d", + error_tag); + xfs_etest[i] = error_tag; + xfs_etest_fsid[i] = fsid; + len = strlen(mp->m_fsname); + xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); + strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; + return 0; + } + } + + xfs_warn(mp, "error tag overflow, too many turned on"); + + return 1; +} + +int +xfs_errortag_clearall(xfs_mount_t *mp, int loud) +{ + int64_t fsid; + int cleared = 0; + int i; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && + xfs_etest[i] != 0) { + cleared = 1; + xfs_warn(mp, "Clearing XFS error tag #%d", + xfs_etest[i]); + xfs_etest[i] = 0; + xfs_etest_fsid[i] = 0LL; + kmem_free(xfs_etest_fsname[i]); + xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; + } + } + + if (loud || cleared) + xfs_warn(mp, "Cleared all XFS error tags for filesystem"); + + return 0; +} +#endif /* DEBUG */ + +void +xfs_error_report( + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) { + xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, + "Internal error %s at line %d of file %s. 
Caller 0x%p\n", + tag, linenum, filename, ra); + + xfs_stack_trace(); + } +} + +void +xfs_corruption_error( + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) + xfs_hex_dump(p, 16); + xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); +} diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h new file mode 100644 index 00000000..079a367f --- /dev/null +++ b/fs/xfs/xfs_error.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ERROR_H__ +#define __XFS_ERROR_H__ + +#ifdef DEBUG +#define XFS_ERROR_NTRAP 10 +extern int xfs_etrap[XFS_ERROR_NTRAP]; +extern int xfs_error_trap(int); +#define XFS_ERROR(e) xfs_error_trap(e) +#else +#define XFS_ERROR(e) (e) +#endif + +struct xfs_mount; + +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); + +#define XFS_ERROR_REPORT(e, lvl, mp) \ + xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) +#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \ + xfs_corruption_error(e, lvl, mp, mem, \ + __FILE__, __LINE__, __return_address) + +#define XFS_ERRLEVEL_OFF 0 +#define XFS_ERRLEVEL_LOW 1 +#define XFS_ERRLEVEL_HIGH 5 + +/* + * Macros to set EFSCORRUPTED & return/branch. 
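In DEBUG builds every error return is funnelled through XFS_ERROR(), so xfs_error_trap() can BUG() the first time a chosen errno is produced. A standalone sketch of that trap-on-errno pattern; the trap list and abort() stand in for xfs_etrap[] and BUG().

/*
 * Sketch of the XFS_ERROR()/xfs_error_trap() idea: route every error
 * return through one wrapper so a debug build can stop at the exact
 * point a chosen errno is generated.
 */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int trap_list[] = { EIO, 0 };	/* 0 terminates the list */

static int error_trap(int e)
{
	for (int i = 0; e && trap_list[i]; i++)
		if (e == trap_list[i]) {
			fprintf(stderr, "trapped error %d\n", e);
			abort();	/* BUG() equivalent */
		}
	return e;
}

#define ERR(e) error_trap(e)

int main(void)
{
	printf("returning %d\n", ERR(ENOENT));	/* passes through */
	printf("returning %d\n", ERR(EIO));	/* aborts here */
	return 0;
}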
+ */ +#define XFS_WANT_CORRUPTED_GOTO(x,l) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ + XFS_ERRLEVEL_LOW, NULL); \ + error = XFS_ERROR(EFSCORRUPTED); \ + goto l; \ + } \ + } + +#define XFS_WANT_CORRUPTED_RETURN(x) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ + XFS_ERRLEVEL_LOW, NULL); \ + return XFS_ERROR(EFSCORRUPTED); \ + } \ + } + +/* + * error injection tags - the labels can be anything you want + * but each tag should have its own unique number + */ + +#define XFS_ERRTAG_NOERROR 0 +#define XFS_ERRTAG_IFLUSH_1 1 +#define XFS_ERRTAG_IFLUSH_2 2 +#define XFS_ERRTAG_IFLUSH_3 3 +#define XFS_ERRTAG_IFLUSH_4 4 +#define XFS_ERRTAG_IFLUSH_5 5 +#define XFS_ERRTAG_IFLUSH_6 6 +#define XFS_ERRTAG_DA_READ_BUF 7 +#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 +#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 +#define XFS_ERRTAG_ALLOC_READ_AGF 10 +#define XFS_ERRTAG_IALLOC_READ_AGI 11 +#define XFS_ERRTAG_ITOBP_INOTOBP 12 +#define XFS_ERRTAG_IUNLINK 13 +#define XFS_ERRTAG_IUNLINK_REMOVE 14 +#define XFS_ERRTAG_DIR_INO_VALIDATE 15 +#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 +#define XFS_ERRTAG_IODONE_IOERR 17 +#define XFS_ERRTAG_STRATREAD_IOERR 18 +#define XFS_ERRTAG_STRATCMPL_IOERR 19 +#define XFS_ERRTAG_DIOWRITE_IOERR 20 +#define XFS_ERRTAG_BMAPIFORMAT 21 +#define XFS_ERRTAG_MAX 22 + +/* + * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. + */ +#define XFS_RANDOM_DEFAULT 100 +#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) +#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT + +#ifdef DEBUG +extern int xfs_error_test_active; +extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); + +#define XFS_NUM_INJECT_ERROR 10 +#define XFS_TEST_ERROR(expr, mp, tag, rf) \ + ((expr) || (xfs_error_test_active && \ + xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ + (rf)))) + +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); +#else +#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) +#define xfs_errortag_add(tag, mp) (ENOSYS) +#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#endif /* DEBUG */ + +/* + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into + * a panic by setting xfs_panic_mask in a sysctl. 
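XFS_TEST_ERROR() turns an ordinary corruption check into an injection point: when the tag is armed for the filesystem, the check also fails roughly once every "randfactor" evaluations. A standalone sketch of that behaviour; the tag numbers and the way the tag is armed are invented for illustration.

/*
 * Probabilistic error-injection sketch in the spirit of
 * XFS_TEST_ERROR().  Standalone code, not the kernel macros.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define TAG_IFLUSH	1
#define RANDFACTOR	100	/* fire ~1% of the time, like XFS_RANDOM_DEFAULT */

static int armed_tag = TAG_IFLUSH;	/* armed via an ioctl in the real thing */

static int inject(int tag, unsigned randfactor)
{
	if (tag != armed_tag)
		return 0;
	return rand() % randfactor == 0;
}

#define TEST_ERROR(expr, tag) ((expr) || inject((tag), RANDFACTOR))

int main(void)
{
	srand((unsigned)time(NULL));
	int fired = 0;
	for (int i = 0; i < 10000; i++)
		if (TEST_ERROR(0 /* the real corruption check */, TAG_IFLUSH))
			fired++;
	printf("injected %d failures in 10000 checks (~100 expected)\n", fired);
	return 0;
}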
+ */ +#define XFS_NO_PTAG 0 +#define XFS_PTAG_IFLUSH 0x00000001 +#define XFS_PTAG_LOGRES 0x00000002 +#define XFS_PTAG_AILDELETE 0x00000004 +#define XFS_PTAG_ERROR_REPORT 0x00000008 +#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 +#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 +#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 + +#endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c new file mode 100644 index 00000000..d22e6262 --- /dev/null +++ b/fs/xfs/xfs_extfree_item.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + + +kmem_zone_t *xfs_efi_zone; +kmem_zone_t *xfs_efd_zone; + +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} + +void +xfs_efi_item_free( + struct xfs_efi_log_item *efip) +{ + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) + kmem_free(efip); + else + kmem_zone_free(xfs_efi_zone, efip); +} + +/* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the + * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees + * the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_efi_item_free(efip); + } +} + +/* + * This returns the number of iovecs needed to log the given efi item. + * We only need 1 iovec for an efi item. It just logs the efi_log_format + * structure. + */ +STATIC uint +xfs_efi_item_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efi log item. We use only 1 iovec, and we point that + * at the efi_log_format structure embedded in the efi item. + * It is at this point that we assert that all of the extent + * slots in the efi item have been filled. 
+ */ +STATIC void +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + uint size; + + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); + + efip->efi_format.efi_type = XFS_LI_EFI; + + size = sizeof(xfs_efi_log_format_t); + size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); + efip->efi_format.efi_size = 1; + + log_vector->i_addr = &efip->efi_format; + log_vector->i_len = size; + log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; + ASSERT(size >= sizeof(xfs_efi_log_format_t)); +} + + +/* + * Pinning has no meaning for an efi item, so just return. + */ +STATIC void +xfs_efi_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() (via + * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + */ +STATIC void +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + if (lip->li_desc) + xfs_trans_del_item(lip); + xfs_efi_item_free(efip); + return; + } + __xfs_efi_release(efip); +} + +/* + * Efi items have no locking or pushing. However, since EFIs are + * pulled from the AIL when their corresponding EFDs are committed + * to disk, their situation is very similar to being pinned. Return + * XFS_ITEM_PINNED so that the caller will eventually flush the log. + * This should help in getting the EFI out of the AIL. + */ +STATIC uint +xfs_efi_item_trylock( + struct xfs_log_item *lip) +{ + return XFS_ITEM_PINNED; +} + +/* + * Efi items have no locking, so just return. + */ +STATIC void +xfs_efi_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); +} + +/* + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. For bulk transaction committed + * processing, the EFI may be processed but not yet unpinned prior to the EFD + * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected + * when processing the EFD. + */ +STATIC xfs_lsn_t +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); + return lsn; +} + +/* + * There isn't much you can do to push on an efi item. It is simply + * stuck waiting for all of its corresponding efd items to be + * committed to disk. + */ +STATIC void +xfs_efi_item_push( + struct xfs_log_item *lip) +{ +} + +/* + * The EFI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efi log items. 
+ */ +static struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_trylock = xfs_efi_item_trylock, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing +}; + + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_efi_log_item *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efi_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, KM_SLEEP); + } else { + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); + + return efip; +} + +/* + * Copy an EFI format buffer from the given buf, and into the destination + * EFI format structure. + * The given buffer can be in 32 bit or 64 bit form (which has different padding), + * one of which will be the native format for this kernel. + * It will handle the conversion of formats if necessary. + */ +int +xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) +{ + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; + uint i; + uint len = sizeof(xfs_efi_log_format_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + uint len32 = sizeof(xfs_efi_log_format_32_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + uint len64 = sizeof(xfs_efi_log_format_64_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + + if (buf->i_len == len) { + memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + return 0; + } else if (buf->i_len == len32) { + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_32->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_32->efi_extents[i].ext_len; + } + return 0; + } else if (buf->i_len == len64) { + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_64->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_64->efi_extents[i].ext_len; + } + return 0; + } + return EFSCORRUPTED; +} + +/* + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. 
+ */ +void +xfs_efi_release(xfs_efi_log_item_t *efip, + uint nextents) +{ + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + __xfs_efi_release(efip); +} + +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efd_log_item, efd_item); +} + +STATIC void +xfs_efd_item_free(struct xfs_efd_log_item *efdp) +{ + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) + kmem_free(efdp); + else + kmem_zone_free(xfs_efd_zone, efdp); +} + +/* + * This returns the number of iovecs needed to log the given efd item. + * We only need 1 iovec for an efd item. It just logs the efd_log_format + * structure. + */ +STATIC uint +xfs_efd_item_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efd log item. We use only 1 iovec, and we point that + * at the efd_log_format structure embedded in the efd item. + * It is at this point that we assert that all of the extent + * slots in the efd item have been filled. + */ +STATIC void +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + uint size; + + ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + + efdp->efd_format.efd_type = XFS_LI_EFD; + + size = sizeof(xfs_efd_log_format_t); + size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); + efdp->efd_format.efd_size = 1; + + log_vector->i_addr = &efdp->efd_format; + log_vector->i_len = size; + log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; + ASSERT(size >= sizeof(xfs_efd_log_format_t)); +} + +/* + * Pinning has no meaning for an efd item, so just return. + */ +STATIC void +xfs_efd_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an efd item, unpinning does + * not either. + */ +STATIC void +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * Efd items have no locking, so just return success. + */ +STATIC uint +xfs_efd_item_trylock( + struct xfs_log_item *lip) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Efd items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_efd_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); +} + +/* + * When the efd item is committed to disk, all we need to do + * is delete our reference to our partner efi item and then + * free ourselves. Since we're freeing ourselves we must + * return -1 to keep the transaction code from further referencing + * this item. + */ +STATIC xfs_lsn_t +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + /* + * If we got a log I/O error, it's always the case that the LR with the + * EFI got unpinned and freed before the EFD got aborted. + */ + if (!(lip->li_flags & XFS_LI_ABORTED)) + xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + + xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; +} + +/* + * There isn't much you can do to push on an efd item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +STATIC void +xfs_efd_item_push( + struct xfs_log_item *lip) +{ +} + +/* + * The EFD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. 
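xfs_efi_release() is a countdown: each EFD subtracts the number of extents it covered, and whichever caller drives efi_next_extent to zero frees the EFI. A standalone sketch of the same pattern using C11 atomics in place of atomic_sub_and_test(); the extent counts are invented.

/*
 * Reference-countdown sketch: the caller that brings the count to
 * zero is the one that frees the object.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct efi {
	atomic_int remaining;	/* extents still to be accounted for */
};

static void efi_release(struct efi *efip, int nextents)
{
	/* fetch_sub returns the old value; old == nextents means we hit zero */
	if (atomic_fetch_sub(&efip->remaining, nextents) == nextents) {
		printf("last reference dropped, freeing EFI\n");
		free(efip);
	}
}

int main(void)
{
	struct efi *efip = malloc(sizeof(*efip));

	if (!efip)
		return 1;
	atomic_init(&efip->remaining, 5);	/* EFI logged 5 extents */

	efi_release(efip, 2);	/* first EFD freed 2 extents */
	efi_release(efip, 3);	/* second EFD frees the rest and the EFI */
	return 0;
}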
The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efd log items. + */ +static struct xfs_item_ops xfs_efd_item_ops = { + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_trylock = xfs_efd_item_trylock, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing +}; + +/* + * Allocate and initialize an efd item with the given number of extents. + */ +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) + +{ + struct xfs_efd_log_item *efdp; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efd_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efdp = kmem_zalloc(size, KM_SLEEP); + } else { + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = nextents; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return efdp; +} diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h new file mode 100644 index 00000000..375f68e4 --- /dev/null +++ b/fs/xfs/xfs_extfree_item.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTFREE_ITEM_H__ +#define __XFS_EXTFREE_ITEM_H__ + +struct xfs_mount; +struct kmem_zone; + +typedef struct xfs_extent { + xfs_dfsbno_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ + +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. 
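The _32/_64 extent variants exist because a {64-bit, 32-bit} pair occupies 12 bytes in the packed 32-bit layout but 16 bytes with natural 64-bit alignment, and log recovery may see records written by either flavour of kernel. A standalone demonstration of that size difference; the struct names are illustrative mirrors of the typedefs above.

/*
 * Why both xfs_extent_32_t and xfs_extent_64_t are needed: the same
 * two fields have two different sizes depending on alignment rules.
 */
#include <stdio.h>
#include <stdint.h>

struct extent_packed {		/* mirrors xfs_extent_32_t */
	uint64_t start;
	uint32_t len;
} __attribute__((packed));

struct extent_padded {		/* mirrors xfs_extent_64_t */
	uint64_t start;
	uint32_t len;
	uint32_t pad;
};

int main(void)
{
	printf("packed layout: %zu bytes\n", sizeof(struct extent_packed));
	printf("padded layout: %zu bytes\n", sizeof(struct extent_padded));
	/* Log records written by one kernel flavour may be replayed by the
	 * other, so recovery converts both sizes explicitly. */
	return 0;
}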
+ */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + + +#ifdef __KERNEL__ + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFI_MAX_FAST_EXTENTS 16 + +/* + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_EFI_RECOVERED 1 +#define XFS_EFI_COMMITTED 2 + +/* + * This is the "extent free intention" log item. It is used + * to log the fact that some extents need to be free. It is + * used in conjunction with the "extent free done" log item + * described below. + */ +typedef struct xfs_efi_log_item { + xfs_log_item_t efi_item; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ + xfs_efi_log_format_t efi_format; +} xfs_efi_log_item_t; + +/* + * This is the "extent free done" log item. It is used to log + * the fact that some extents earlier mentioned in an efi item + * have been freed. + */ +typedef struct xfs_efd_log_item { + xfs_log_item_t efd_item; + xfs_efi_log_item_t *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; +} xfs_efd_log_item_t; + +/* + * Max number of extents in fast allocation path. 
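The one-element efi_extents/efd_extents arrays above are the usual trick for variable-length log records: the item is over-allocated so the trailing array can hold the full efi_nextents/efd_nextents entries. A minimal sketch of the resulting size rule, mirroring the sizing logic in xfs_efd_init() and xfs_efd_item_format() shown earlier in the patch; the helper name is hypothetical and not part of the patch:

/*
 * Illustrative only: bytes needed for an in-core EFD item carrying
 * "nextents" extents.  The format structure already embeds one
 * xfs_extent_t, hence the "- 1".  Items that fit the fast path
 * (nextents <= XFS_EFD_MAX_FAST_EXTENTS) come from xfs_efd_zone,
 * larger ones are allocated from the heap at this size.
 */
static inline uint
example_efd_item_bytes(uint nextents)
{
	return sizeof(xfs_efd_log_item_t) +
	       (nextents - 1) * sizeof(xfs_extent_t);
}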
+ */ +#define XFS_EFD_MAX_FAST_EXTENTS 16 + +extern struct kmem_zone *xfs_efi_zone; +extern struct kmem_zone *xfs_efd_zone; + +xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); +xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, + uint); +int xfs_efi_copy_format(xfs_log_iovec_t *buf, + xfs_efi_log_format_t *dst_efi_fmt); +void xfs_efi_item_free(xfs_efi_log_item_t *); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c new file mode 100644 index 00000000..9124425b --- /dev/null +++ b/fs/xfs/xfs_filestream.c @@ -0,0 +1,824 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bmap_btree.h" +#include "xfs_inum.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ag.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_utils.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" +#include "xfs_trace.h" + +#ifdef XFS_FILESTREAMS_TRACE + +ktrace_t *xfs_filestreams_trace_buf; + +STATIC void +xfs_filestreams_trace( + xfs_mount_t *mp, /* mount point */ + int type, /* type of trace */ + const char *func, /* source function */ + int line, /* source line number */ + __psunsigned_t arg0, + __psunsigned_t arg1, + __psunsigned_t arg2, + __psunsigned_t arg3, + __psunsigned_t arg4, + __psunsigned_t arg5) +{ + ktrace_enter(xfs_filestreams_trace_buf, + (void *)(__psint_t)(type | (line << 16)), + (void *)func, + (void *)(__psunsigned_t)current_pid(), + (void *)mp, + (void *)(__psunsigned_t)arg0, + (void *)(__psunsigned_t)arg1, + (void *)(__psunsigned_t)arg2, + (void *)(__psunsigned_t)arg3, + (void *)(__psunsigned_t)arg4, + (void *)(__psunsigned_t)arg5, + NULL, NULL, NULL, NULL, NULL, NULL); +} + +#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) +#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) +#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) +#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) +#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) +#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) +#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ + xfs_filestreams_trace(mp, t, __func__, __LINE__, \ + (__psunsigned_t)a0, (__psunsigned_t)a1, \ + (__psunsigned_t)a2, (__psunsigned_t)a3, \ + (__psunsigned_t)a4, (__psunsigned_t)a5) + +#define TRACE_AG_SCAN(mp, ag, ag2) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); +#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ + TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ + cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ + TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, 
ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ + TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) \ + TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); + + +#else +#define TRACE_AG_SCAN(mp, ag, ag2) +#define TRACE_AG_PICK1(mp, max_ag, maxfree) +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) +#endif + +static kmem_zone_t *item_zone; + +/* + * Structure for associating a file or a directory with an allocation group. + * The parent directory pointer is only needed for files, but since there will + * generally be vastly more files than directories in the cache, using the same + * data structure simplifies the code with very little memory overhead. + */ +typedef struct fstrm_item +{ + xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ + xfs_inode_t *ip; /* inode self-pointer. */ + xfs_inode_t *pip; /* Parent directory inode pointer. */ +} fstrm_item_t; + +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. 
+ * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +static int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static void +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_put(pag); +} + +/* + * Scan the AGs starting at startag looking for an AG that isn't in use and has + * at least minlen blocks free. + */ +static int +_xfs_filestream_pick_ag( + xfs_mount_t *mp, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) +{ + int streams, max_streams; + int err, trylock, nscan; + xfs_extlen_t longest, free, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + struct xfs_perag *pag; + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + ag = startag; + *agp = NULLAGNUMBER; + + /* For the first pass, don't sleep trying to init the per-AG. */ + trylock = XFS_ALLOC_FLAG_TRYLOCK; + + for (nscan = 0; 1; nscan++) { + pag = xfs_perag_get(mp, ag); + TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); + + if (!pag->pagf_init) { + err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + if (err && !trylock) { + xfs_perag_put(pag); + return err; + } + } + + /* Might fail sometimes during the 1st pass with trylock set. */ + if (!pag->pagf_init) + goto next_ag; + + /* Keep track of the AG with the most free blocks. */ + if (pag->pagf_freeblks > maxfree) { + maxfree = pag->pagf_freeblks; + max_streams = atomic_read(&pag->pagf_fstrms); + max_ag = ag; + } + + /* + * The AG reference count does two things: it enforces mutual + * exclusion when examining the suitability of an AG in this + * loop, and it guards against two filestreams being established + * in the same AG as each other. + */ + if (xfs_filestream_get_ag(mp, ag) > 1) { + xfs_filestream_put_ag(mp, ag); + goto next_ag; + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (((minlen && longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + streams = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + *agp = ag; + break; + } + + /* Drop the reference on this AG, it's not usable. */ + xfs_filestream_put_ag(mp, ag); +next_ag: + xfs_perag_put(pag); + /* Move to the next AG, wrapping to AG 0 if necessary. */ + if (++ag >= mp->m_sb.sb_agcount) + ag = 0; + + /* If a full pass of the AGs hasn't been done yet, continue. */ + if (ag != startag) + continue; + + /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + if (trylock != 0) { + trylock = 0; + continue; + } + + /* Finally, if lowspace wasn't set, set it for the 3rd pass. 
*/ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + continue; + } + + /* + * Take the AG with the most free space, regardless of whether + * it's already in use by another filestream. + */ + if (max_ag != NULLAGNUMBER) { + xfs_filestream_get_ag(mp, max_ag); + TRACE_AG_PICK1(mp, max_ag, maxfree); + streams = max_streams; + free = maxfree; + *agp = max_ag; + break; + } + + /* take AG 0 if none matched */ + TRACE_AG_PICK1(mp, max_ag, maxfree); + *agp = 0; + return 0; + } + + TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); + + return 0; +} + +/* + * Set the allocation group number for a file or a directory, updating inode + * references and per-AG references as appropriate. + */ +static int +_xfs_filestream_update_ag( + xfs_inode_t *ip, + xfs_inode_t *pip, + xfs_agnumber_t ag) +{ + int err = 0; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t old_ag; + xfs_inode_t *old_pip; + + /* + * Either ip is a regular file and pip is a directory, or ip is a + * directory and pip is NULL. + */ + ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && + (pip->i_d.di_mode & S_IFDIR)) || + ((ip->i_d.di_mode & S_IFDIR) && !pip))); + + mp = ip->i_mount; + cache = mp->m_filestream; + + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (item) { + ASSERT(item->ip == ip); + old_ag = item->ag; + item->ag = ag; + old_pip = item->pip; + item->pip = pip; + xfs_mru_cache_done(cache); + + /* + * If the AG has changed, drop the old ref and take a new one, + * effectively transferring the reference from old to new AG. + */ + if (ag != old_ag) { + xfs_filestream_put_ag(mp, old_ag); + xfs_filestream_get_ag(mp, ag); + } + + /* + * If ip is a file and its pip has changed, drop the old ref and + * take a new one. + */ + if (pip && pip != old_pip) { + IRELE(old_pip); + IHOLD(pip); + } + + TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), + ag, xfs_filestream_peek_ag(mp, ag)); + return 0; + } + + item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + if (!item) + return ENOMEM; + + item->ag = ag; + item->ip = ip; + item->pip = pip; + + err = xfs_mru_cache_insert(cache, ip->i_ino, item); + if (err) { + kmem_zone_free(item_zone, item); + return err; + } + + /* Take a reference on the AG. */ + xfs_filestream_get_ag(mp, ag); + + /* + * Take a reference on the inode itself regardless of whether it's a + * regular file or a directory. + */ + IHOLD(ip); + + /* + * In the case of a regular file, take a reference on the parent inode + * as well to ensure it remains in-core. + */ + if (pip) + IHOLD(pip); + + TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), + ag, xfs_filestream_peek_ag(mp, ag)); + + return 0; +} + +/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ +STATIC void +xfs_fstrm_free_func( + unsigned long ino, + void *data) +{ + fstrm_item_t *item = (fstrm_item_t *)data; + xfs_inode_t *ip = item->ip; + + ASSERT(ip->i_ino == ino); + + xfs_iflags_clear(ip, XFS_IFILESTREAM); + + /* Drop the reference taken on the AG when the item was added. */ + xfs_filestream_put_ag(ip->i_mount, item->ag); + + TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, + xfs_filestream_peek_ag(ip->i_mount, item->ag)); + + /* + * _xfs_filestream_update_ag() always takes a reference on the inode + * itself, whether it's a file or a directory. Release it here. + * This can result in the inode being freed and so we must + * not hold any inode locks when freeing filesstreams objects + * otherwise we can deadlock here. 
+ */ + IRELE(ip); + + /* + * In the case of a regular file, _xfs_filestream_update_ag() also + * takes a ref on the parent inode to keep it in-core. Release that + * too. + */ + if (item->pip) + IRELE(item->pip); + + /* Finally, free the memory allocated for the item. */ + kmem_zone_free(item_zone, item); +} + +/* + * xfs_filestream_init() is called at xfs initialisation time to set up the + * memory zone that will be used for filestream data structure allocation. + */ +int +xfs_filestream_init(void) +{ + item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); + if (!item_zone) + return -ENOMEM; + + return 0; +} + +/* + * xfs_filestream_uninit() is called at xfs termination time to destroy the + * memory zone that was used for filestream data structure allocation. + */ +void +xfs_filestream_uninit(void) +{ + kmem_zone_destroy(item_zone); +} + +/* + * xfs_filestream_mount() is called when a file system is mounted with the + * filestream option. It is responsible for allocating the data structures + * needed to track the new file system's file streams. + */ +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + int err; + unsigned int lifetime, grp_count; + + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + lifetime = xfs_fstrm_centisecs * 10; + grp_count = 10; + + err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, + xfs_fstrm_free_func); + + return err; +} + +/* + * xfs_filestream_unmount() is called when a file system that was mounted with + * the filestream option is unmounted. It drains the data structures created + * to track the file system's file streams and frees all the memory that was + * allocated. + */ +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); +} + +/* + * Return the AG of the filestream the file or directory belongs to, or + * NULLAGNUMBER otherwise. + */ +xfs_agnumber_t +xfs_filestream_lookup_ag( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag; + int ref; + + if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + ASSERT(0); + return NULLAGNUMBER; + } + + cache = ip->i_mount->m_filestream; + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (!item) { + TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); + return NULLAGNUMBER; + } + + ASSERT(ip == item->ip); + ag = item->ag; + ref = xfs_filestream_peek_ag(ip->i_mount, ag); + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); + return ag; +} + +/* + * xfs_filestream_associate() should only be called to associate a regular file + * with its parent directory. Calling it with a child directory isn't + * appropriate because filestreams don't apply to entire directory hierarchies. + * Creating a file in a child directory of an existing filestream directory + * starts a new filestream with its own allocation group association. + * + * Returns < 0 on error, 0 if successful association occurred, > 0 if + * we failed to get an association because of locking issues. 
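For orientation, a hypothetical caller would handle the three documented return ranges like this. Sketch only, not part of the patch: the helper name is invented, and the use of xfs_iflags_set() with XFS_IFILESTREAM is an assumption based on the flag-clearing done in xfs_fstrm_free_func() above:

/*
 * Illustrative caller only: < 0 is a hard error, 0 means the
 * association was made, > 0 means we lost the lock race and simply
 * fall back to normal allocation.
 */
STATIC int
example_start_filestream(xfs_inode_t *pip, xfs_inode_t *ip)
{
	int error = xfs_filestream_associate(pip, ip);

	if (error < 0)
		return -error;		/* hand the error back */
	if (error == 0)
		xfs_iflags_set(ip, XFS_IFILESTREAM);	/* remember it */
	/* error > 0: lock contention, quietly do without a filestream */
	return 0;
}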
+ */ +int +xfs_filestream_associate( + xfs_inode_t *pip, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag, rotorstep, startag; + int err = 0; + + ASSERT(pip->i_d.di_mode & S_IFDIR); + ASSERT(ip->i_d.di_mode & S_IFREG); + if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + return -EINVAL; + + mp = pip->i_mount; + cache = mp->m_filestream; + + /* + * We have a problem, Houston. + * + * Taking the iolock here violates inode locking order - we already + * hold the ilock. Hence if we block getting this lock we may never + * wake. Unfortunately, that means if we can't get the lock, we're + * screwed in terms of getting a stream association - we can't spin + * waiting for the lock because someone else is waiting on the lock we + * hold and we cannot drop that as we are in a transaction here. + * + * Lucky for us, this inversion is not a problem because it's a + * directory inode that we are trying to lock here. + * + * So, if we can't get the iolock without sleeping then just give up + */ + if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) + return 1; + + /* If the parent directory is already in the cache, use its AG. */ + item = xfs_mru_cache_lookup(cache, pip->i_ino); + if (item) { + ASSERT(item->ip == pip); + ag = item->ag; + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + err = _xfs_filestream_update_ag(ip, pip, ag); + + goto exit; + } + + /* + * Set the starting AG using the rotor for inode32, otherwise + * use the directory inode's AG. + */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + rotorstep = xfs_rotorstep; + startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } else + startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + + /* Pick a new AG for the parent inode starting at startag. */ + err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); + if (err || ag == NULLAGNUMBER) + goto exit_did_pick; + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, ag); + if (err) + goto exit_did_pick; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, ag); + if (err) + goto exit_did_pick; + + TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + +exit_did_pick: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. + */ + if (ag != NULLAGNUMBER) + xfs_filestream_put_ag(mp, ag); + +exit: + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + return -err; +} + +/* + * Pick a new allocation group for the current file and its file stream. This + * function is called by xfs_bmap_filestreams() with the mount point's per-ag + * lock held. + */ +int +xfs_filestream_new_ag( + xfs_bmalloca_t *ap, + xfs_agnumber_t *agp) +{ + int flags, err; + xfs_inode_t *ip, *pip = NULL; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + xfs_extlen_t minlen; + fstrm_item_t *dir, *file; + xfs_agnumber_t ag = NULLAGNUMBER; + + ip = ap->ip; + mp = ip->i_mount; + cache = mp->m_filestream; + minlen = ap->alen; + *agp = NULLAGNUMBER; + + /* + * Look for the file in the cache, removing it if it's found. Doing + * this allows it to be held across the dir lookup that follows. 
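The MRU cache calls used throughout this file follow a strict pairing: a successful xfs_mru_cache_lookup() returns with the cache's internal lock held, and the caller must release it with xfs_mru_cache_done() once it has finished examining the item. A compressed sketch of the pattern, mirroring xfs_filestream_lookup_ag() above; the helper name is hypothetical:

/*
 * Illustrative lookup pattern only, not part of the patch.
 */
STATIC xfs_agnumber_t
example_cached_ag(xfs_mru_cache_t *cache, xfs_inode_t *ip)
{
	fstrm_item_t	*item;
	xfs_agnumber_t	ag = NULLAGNUMBER;

	item = xfs_mru_cache_lookup(cache, ip->i_ino);
	if (item) {
		ag = item->ag;			/* safe: cache lock is held */
		xfs_mru_cache_done(cache);	/* always pair with lookup */
	}
	return ag;
}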
+ */ + file = xfs_mru_cache_remove(cache, ip->i_ino); + if (file) { + ASSERT(ip == file->ip); + + /* Save the file's parent inode and old AG number for later. */ + pip = file->pip; + ag = file->ag; + + /* Look for the file's directory in the cache. */ + dir = xfs_mru_cache_lookup(cache, pip->i_ino); + if (dir) { + ASSERT(pip == dir->ip); + + /* + * If the directory has already moved on to a new AG, + * use that AG as the new AG for the file. Don't + * forget to twiddle the AG refcounts to match the + * movement. + */ + if (dir->ag != file->ag) { + xfs_filestream_put_ag(mp, file->ag); + xfs_filestream_get_ag(mp, dir->ag); + *agp = file->ag = dir->ag; + } + + xfs_mru_cache_done(cache); + } + + /* + * Put the file back in the cache. If this fails, the free + * function needs to be called to tidy up in the same way as if + * the item had simply expired from the cache. + */ + err = xfs_mru_cache_insert(cache, ip->i_ino, file); + if (err) { + xfs_fstrm_free_func(ip->i_ino, file); + return err; + } + + /* + * If the file's AG was moved to the directory's new AG, there's + * nothing more to be done. + */ + if (*agp != NULLAGNUMBER) { + TRACE_MOVEAG(mp, ip, pip, + ag, xfs_filestream_peek_ag(mp, ag), + *agp, xfs_filestream_peek_ag(mp, *agp)); + return 0; + } + } + + /* + * If the file's parent directory is known, take its iolock in exclusive + * mode to prevent two sibling files from racing each other to migrate + * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. + */ + if (pip) + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + + /* + * A new AG needs to be found for the file. If the file's parent + * directory is also known, it will be moved to the new AG as well to + * ensure that files created inside it in future use the new AG. + */ + ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; + flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | + (ap->low ? XFS_PICK_LOWSPACE : 0); + + err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); + if (err || *agp == NULLAGNUMBER) + goto exit; + + /* + * If the file wasn't found in the file cache, then its parent directory + * inode isn't known. For this to have happened, the file must either + * be pre-existing, or it was created long enough ago that its cache + * entry has expired. This isn't the sort of usage that the filestreams + * allocator is trying to optimise, so there's no point trying to track + * its new AG somehow in the filestream data structures. + */ + if (!pip) { + TRACE_ORPHAN(mp, ip, *agp); + goto exit; + } + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, *agp); + if (err) + goto exit; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, *agp); + if (err) + goto exit; + + TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, + *agp, xfs_filestream_peek_ag(mp, *agp)); + +exit: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. 
+ */ + if (*agp != NULLAGNUMBER) + xfs_filestream_put_ag(mp, *agp); + else + *agp = 0; + + if (pip) + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + + return err; +} + +/* + * Remove an association between an inode and a filestream object. + * Typically this is done on last close of an unlinked file. + */ +void +xfs_filestream_deassociate( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + + xfs_mru_cache_delete(cache, ip->i_ino); +} diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h new file mode 100644 index 00000000..09dd9af4 --- /dev/null +++ b/fs/xfs/xfs_filestream.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FILESTREAM_H__ +#define __XFS_FILESTREAM_H__ + +#ifdef __KERNEL__ + +struct xfs_mount; +struct xfs_inode; +struct xfs_perag; +struct xfs_bmalloca; + +#ifdef XFS_FILESTREAMS_TRACE +#define XFS_FSTRM_KTRACE_INFO 1 +#define XFS_FSTRM_KTRACE_AGSCAN 2 +#define XFS_FSTRM_KTRACE_AGPICK1 3 +#define XFS_FSTRM_KTRACE_AGPICK2 4 +#define XFS_FSTRM_KTRACE_UPDATE 5 +#define XFS_FSTRM_KTRACE_FREE 6 +#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 +#define XFS_FSTRM_KTRACE_ASSOCIATE 8 +#define XFS_FSTRM_KTRACE_MOVEAG 9 +#define XFS_FSTRM_KTRACE_ORPHAN 10 + +#define XFS_FSTRM_KTRACE_SIZE 16384 +extern ktrace_t *xfs_filestreams_trace_buf; + +#endif + +/* allocation selection flags */ +typedef enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +} xfs_fstrm_alloc_t; + +/* prototypes for filestream.c */ +int xfs_filestream_init(void); +void xfs_filestream_uninit(void); +int xfs_filestream_mount(struct xfs_mount *mp); +void xfs_filestream_unmount(struct xfs_mount *mp); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); +int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); +void xfs_filestream_deassociate(struct xfs_inode *ip); +int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); + + +/* filestreams for the inode? */ +static inline int +xfs_inode_is_filestream( + struct xfs_inode *ip) +{ + return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + xfs_iflags_test(ip, XFS_IFILESTREAM) || + (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); +} + +#endif /* __KERNEL__ */ + +#endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h new file mode 100644 index 00000000..8f6fc1a9 --- /dev/null +++ b/fs/xfs/xfs_fs.h @@ -0,0 +1,501 @@ +/* + * Copyright (c) 1995-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FS_H__ +#define __XFS_FS_H__ + +/* + * SGI's XFS filesystem's major stuff (constants, structures) + */ + +/* + * Direct I/O attribute record used with XFS_IOC_DIOINFO + * d_miniosz is the min xfer size, xfer size multiple and file seek offset + * alignment. + */ +#ifndef HAVE_DIOATTR +struct dioattr { + __u32 d_mem; /* data buffer memory alignment */ + __u32 d_miniosz; /* min xfer size */ + __u32 d_maxiosz; /* max xfer size */ +}; +#endif + +/* + * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. + */ +#ifndef HAVE_FSXATTR +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#endif + +/* + * Flags for the bs_xflags/fsx_xflags field + * There should be a one-to-one correspondence between these flags and the + * XFS_DIFLAG_s. + */ +#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + +/* + * Structure for XFS_IOC_GETBMAP. + * On input, fill in bmv_offset and bmv_length of the first structure + * to indicate the area of interest in the file, and bmv_entries with + * the number of array elements given back. The first structure is + * updated on return to give the offset and length for the next call. + */ +#ifndef HAVE_GETBMAP +struct getbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output) */ +}; +#endif + +/* + * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries + * are used exactly as in the getbmap structure. The getbmapx structure + * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field + * is only used for the first structure. It contains input flags + * specifying XFS_IOC_GETBMAPX actions. 
The bmv_oflags field is filled + * in by the XFS_IOC_GETBMAPX command for each returned structure after + * the first. + */ +#ifndef HAVE_GETBMAPX +struct getbmapx { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output). */ + __s32 bmv_iflags; /* input flags (1st structure) */ + __s32 bmv_oflags; /* output flags (after 1st structure)*/ + __s32 bmv_unused1; /* future use */ + __s32 bmv_unused2; /* future use */ +}; +#endif + +/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */ +#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ +#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_VALID \ + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + +/* bmv_oflags values - returned for each non-header segment */ +#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ +#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ +#define BMV_OF_LAST 0x4 /* segment is the last in the file */ + +/* + * Structure for XFS_IOC_FSSETDM. + * For use by backup and restore programs to set the XFS on-disk inode + * fields di_dmevmask and di_dmstate. These must be set to exactly and + * only values previously obtained via xfs_bulkstat! (Specifically the + * xfs_bstat_t fields bs_dmevmask and bs_dmstate.) + */ +#ifndef HAVE_FSDMIDATA +struct fsdmidata { + __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ + __u16 fsd_padding; + __u16 fsd_dmstate; /* corresponds to di_dmstate */ +}; +#endif + +/* + * File segment locking set data type for 64 bit access. + * Also used for all the RESV/FREE interfaces. 
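To make the RESV/FREE calling convention concrete, a user-space preallocation sketch is shown below. It uses the xfs_flock64 structure defined just after this comment and the XFS_IOC_RESVSP64 ioctl defined later in this header; the install location of the header and the helper name are assumptions, not part of the patch:

/* Illustrative user-space sketch only. */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed install location of this header */

/* Reserve (preallocate) len bytes at offset off without writing data. */
static int example_prealloc(int fd, long long off, long long len)
{
	xfs_flock64_t fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_whence = SEEK_SET;	/* l_start is an absolute file offset */
	fl.l_start  = off;
	fl.l_len    = len;	/* l_len == 0 would mean "to end of file" */

	return ioctl(fd, XFS_IOC_RESVSP64, &fl);
}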
+ */ +typedef struct xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} xfs_flock64_t; + +/* + * Output for XFS_IOC_FSGEOMETRY_V1 + */ +typedef struct xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} xfs_fsop_geom_v1_t; + +/* + * Output for XFS_IOC_FSGEOMETRY + */ +typedef struct xfs_fsop_geom { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +} xfs_fsop_geom_t; + +/* Output for XFS_FS_COUNTS */ +typedef struct xfs_fsop_counts { + __u64 freedata; /* free data section blocks */ + __u64 freertx; /* free rt extents */ + __u64 freeino; /* free inodes */ + __u64 allocino; /* total allocated inodes */ +} xfs_fsop_counts_t; + +/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */ +typedef struct xfs_fsop_resblks { + __u64 resblks; + __u64 resblks_avail; +} xfs_fsop_resblks_t; + +#define XFS_FSOP_GEOM_VERSION 0 + +#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR 
0x0200 /* sector sizes >1BB */ +#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ + + +/* + * Minimum and maximum sizes need for growth checks + */ +#define XFS_MIN_AG_BLOCKS 64 +#define XFS_MIN_LOG_BLOCKS 512ULL +#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) +#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) + +/* keep the maximum size under 2^31 by a small amount */ +#define XFS_MAX_LOG_BYTES \ + ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) + +/* + * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT + */ +typedef struct xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} xfs_growfs_data_t; + +typedef struct xfs_growfs_log { + __u32 newblocks; /* new log size, fsblocks */ + __u32 isint; /* 1 if new log is internal */ +} xfs_growfs_log_t; + +typedef struct xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} xfs_growfs_rt_t; + + +/* + * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE + */ +typedef struct xfs_bstime { + time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} xfs_bstime_t; + +typedef struct xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + xfs_bstime_t bs_atime; /* access time */ + xfs_bstime_t bs_mtime; /* modify time */ + xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} xfs_bstat_t; + +/* + * The user-level BulkStat Request interface structure. + */ +typedef struct xfs_fsop_bulkreq { + __u64 __user *lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + void __user *ubuffer;/* user buffer for inode desc. */ + __s32 __user *ocount; /* output count pointer */ +} xfs_fsop_bulkreq_t; + + +/* + * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS). + */ +typedef struct xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} xfs_inogrp_t; + + +/* + * Error injection. + */ +typedef struct xfs_error_injection { + __s32 fd; + __s32 errtag; +} xfs_error_injection_t; + + +/* + * The user-level Handle Request interface structure. 
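The handle interfaces let backup and restore tools refer to inodes without holding paths open: XFS_IOC_PATH_TO_HANDLE converts a pathname into an opaque handle, and XFS_IOC_OPEN_BY_HANDLE later turns such a handle back into an open descriptor. A minimal sketch of the first step, using the xfs_fsop_handlereq structure defined just below; the header location, the zero-initialisation, and the convention of issuing the ioctl on a descriptor opened somewhere on the target filesystem are assumptions:

/* Illustrative user-space sketch only, not part of the patch. */
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed install location of this header */

static int example_path_to_handle(int mntfd, const char *path,
				  void *hbuf, __u32 *hlen)
{
	struct xfs_fsop_handlereq hreq;

	memset(&hreq, 0, sizeof(hreq));
	hreq.path     = (void *)path;	/* user pathname in */
	hreq.ohandle  = hbuf;		/* opaque handle out */
	hreq.ohandlen = hlen;		/* handle length out */

	return ioctl(mntfd, XFS_IOC_PATH_TO_HANDLE, &hreq);
}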
+ */ +typedef struct xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + void __user *path; /* user pathname */ + __u32 oflags; /* open flags */ + void __user *ihandle;/* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + void __user *ohandle;/* user buffer for handle */ + __u32 __user *ohandlen;/* user buffer length */ +} xfs_fsop_handlereq_t; + +/* + * Compound structures for passing args through Handle Request interfaces + * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and + * XFS_IOC_ATTRMULTI_BY_HANDLE + */ + +typedef struct xfs_fsop_setdm_handlereq { + struct xfs_fsop_handlereq hreq; /* handle information */ + struct fsdmidata __user *data; /* DMAPI data */ +} xfs_fsop_setdm_handlereq_t; + +typedef struct xfs_attrlist_cursor { + __u32 opaque[4]; +} xfs_attrlist_cursor_t; + +typedef struct xfs_fsop_attrlist_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + void __user *buffer; /* returned names */ +} xfs_fsop_attrlist_handlereq_t; + +typedef struct xfs_attr_multiop { + __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ + __s32 am_error; + void __user *am_attrname; + void __user *am_attrvalue; + __u32 am_length; + __u32 am_flags; +} xfs_attr_multiop_t; + +typedef struct xfs_fsop_attrmulti_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + struct xfs_attr_multiop __user *ops; /* attr_multi data */ +} xfs_fsop_attrmulti_handlereq_t; + +/* + * per machine unique filesystem identifier types. + */ +typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ + +typedef struct xfs_fid { + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* 64 bits inode number */ +} xfs_fid_t; + +typedef struct xfs_handle { + union { + __s64 align; /* force alignment of ha_fid */ + xfs_fsid_t _ha_fsid; /* unique file system identifier */ + } ha_u; + xfs_fid_t ha_fid; /* file system specific file ID */ +} xfs_handle_t; +#define ha_fsid ha_u._ha_fsid + +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ + - (char *) &(handle)) \ + + (handle).ha_fid.fid_len) + +/* + * Flags for going down operation + */ +#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * ioctl commands that are used by Linux filesystems + */ +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS +#define XFS_IOC_GETVERSION FS_IOC_GETVERSION + +/* + * ioctl commands that replace IRIX fcntl()'s + * For 'documentation' purposed more than anything else, + * the "cmd #" field reflects the IRIX fcntl number. 
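Among the commands defined just below, XFS_IOC_GETBMAPX is the one whose calling convention takes the most explaining, so here is a user-space sketch. It uses struct getbmapx and the BMV_IF_* flags defined earlier in this header: slot 0 of the array is the request header (offset, length, total entry count), and on return bmv_entries says how many extent records follow it. The header location, buffer sizing and helper name are assumptions:

/* Illustrative user-space sketch only, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed install location of this header */

#define EX_NEXTENTS	32

static int example_dump_extents(int fd)
{
	struct getbmapx map[EX_NEXTENTS + 1];	/* slot 0 is the header */
	int i;

	memset(map, 0, sizeof(map));
	map[0].bmv_offset = 0;			/* start of file */
	map[0].bmv_length = -1;			/* map through end of file */
	map[0].bmv_count  = EX_NEXTENTS + 1;	/* entries incl. header */
	map[0].bmv_iflags = BMV_IF_PREALLOC;	/* flag unwritten extents */

	if (ioctl(fd, XFS_IOC_GETBMAPX, map) < 0)
		return -1;

	for (i = 1; i <= map[0].bmv_entries; i++)
		printf("%lld blocks at file block %lld -> disk block %lld\n",
		       (long long)map[i].bmv_length,
		       (long long)map[i].bmv_offset,
		       (long long)map[i].bmv_block);
	return 0;
}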
+ */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) +#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) +#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) +#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) +#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64) +#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap) +#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr) +/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ +/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ +#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) +#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) + +/* + * ioctl commands that replace IRIX syssgi()'s + */ +#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1) +#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq) +#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq) +#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext) +#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data) +#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log) +#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt) +#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts) +#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks) +#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks) +#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) +#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) +/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ +/* XFS_IOC_FREEZE -- FIFREEZE 119 */ +/* XFS_IOC_THAW -- FITHAW 120 */ +#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) +#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +/* XFS_IOC_GETFSUUID ---------- deprecated 140 */ + + +#ifndef HAVE_BBMACROS +/* + * Block I/O parameterization. A basic block (BB) is the lowest size of + * filesystem allocation, and must equal 512. Length units given to bio + * routines are in BB's. 
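Since the basic-block macros that follow are a frequent source of off-by-512 confusion, the arithmetic is spelled out here. A basic block (BB) is fixed at 512 bytes, so with BBSHIFT = 9 every conversion is a pure shift: one direction rounds a byte count up or truncates it to whole basic blocks, the other multiplies back to bytes. A self-contained restatement, not the kernel macros themselves; the round-up form is the conventional definition and an assumption here:

/* Illustrative restatement of the BB conversions, not part of the patch. */
#include <stdint.h>

#define EX_BBSHIFT	9
#define EX_BBSIZE	(1 << EX_BBSHIFT)	/* 512 bytes per basic block */

static inline uint64_t ex_btobb(uint64_t bytes)	/* round up, like BTOBB */
{
	return (bytes + EX_BBSIZE - 1) >> EX_BBSHIFT;
}

static inline uint64_t ex_btobbt(uint64_t bytes)	/* truncate, like BTOBBT */
{
	return bytes >> EX_BBSHIFT;
}

static inline uint64_t ex_bbtob(uint64_t bbs)	/* back to bytes, like BBTOB */
{
	return bbs << EX_BBSHIFT;
}

/* Example: 1300 bytes span 3 basic blocks rounded up, 2 when truncated. */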
+ */ +#define BBSHIFT 9 +#define BBSIZE (1<> BBSHIFT) +#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT) +#define BBTOB(bbs) ((bbs) << BBSHIFT) +#endif + +#endif /* __XFS_FS_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c new file mode 100644 index 00000000..9153d2c7 --- /dev/null +++ b/fs/xfs/xfs_fsops.c @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_fsops.h" +#include "xfs_itable.h" +#include "xfs_trans_space.h" +#include "xfs_rtalloc.h" +#include "xfs_rw.h" +#include "xfs_filestream.h" +#include "xfs_trace.h" + +/* + * File system operations + */ + +int +xfs_fs_geometry( + xfs_mount_t *mp, + xfs_fsop_geom_t *geo, + int new_version) +{ + + memset(geo, 0, sizeof(*geo)); + + geo->blocksize = mp->m_sb.sb_blocksize; + geo->rtextsize = mp->m_sb.sb_rextsize; + geo->agblocks = mp->m_sb.sb_agblocks; + geo->agcount = mp->m_sb.sb_agcount; + geo->logblocks = mp->m_sb.sb_logblocks; + geo->sectsize = mp->m_sb.sb_sectsize; + geo->inodesize = mp->m_sb.sb_inodesize; + geo->imaxpct = mp->m_sb.sb_imax_pct; + geo->datablocks = mp->m_sb.sb_dblocks; + geo->rtblocks = mp->m_sb.sb_rblocks; + geo->rtextents = mp->m_sb.sb_rextents; + geo->logstart = mp->m_sb.sb_logstart; + ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid)); + memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid)); + if (new_version >= 2) { + geo->sunit = mp->m_sb.sb_unit; + geo->swidth = mp->m_sb.sb_width; + } + if (new_version >= 3) { + geo->version = XFS_FSOP_GEOM_VERSION; + geo->flags = + (xfs_sb_version_hasattr(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR : 0) | + (xfs_sb_version_hasnlink(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_NLINK : 0) | + (xfs_sb_version_hasquota(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | + (xfs_sb_version_hasalign(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | + (xfs_sb_version_hasdalign(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | + (xfs_sb_version_hasshared(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SHARED : 0) | + (xfs_sb_version_hasextflgbit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | + (xfs_sb_version_hasdirv2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | + (xfs_sb_version_hassector(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | + (xfs_sb_version_hasasciici(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | + (xfs_sb_version_haslazysbcount(&mp->m_sb) ? 
+ XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | + (xfs_sb_version_hasattr2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? + mp->m_sb.sb_logsectsize : BBSIZE; + geo->rtsectsize = mp->m_sb.sb_blocksize; + geo->dirblocksize = mp->m_dirblksize; + } + if (new_version >= 4) { + geo->flags |= + (xfs_sb_version_haslogv2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); + geo->logsunit = mp->m_sb.sb_logsunit; + } + return 0; +} + +static int +xfs_growfs_data_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_data_t *in) /* growfs data input struct */ +{ + xfs_agf_t *agf; + xfs_agi_t *agi; + xfs_agnumber_t agno; + xfs_extlen_t agsize; + xfs_extlen_t tmpsize; + xfs_alloc_rec_t *arec; + struct xfs_btree_block *block; + xfs_buf_t *bp; + int bucket; + int dpct; + int error; + xfs_agnumber_t nagcount; + xfs_agnumber_t nagimax = 0; + xfs_rfsblock_t nb, nb_mod; + xfs_rfsblock_t new; + xfs_rfsblock_t nfree; + xfs_agnumber_t oagcount; + int pct; + xfs_trans_t *tp; + + nb = in->newblocks; + pct = in->imaxpct; + if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + return error; + dpct = pct - mp->m_sb.sb_imax_pct; + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), + BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + if (!bp) + return EIO; + xfs_buf_relse(bp); + + new = nb; /* use new as a temporary here */ + nb_mod = do_div(new, mp->m_sb.sb_agblocks); + nagcount = new + (nb_mod != 0); + if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { + nagcount--; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; + if (nb < mp->m_sb.sb_dblocks) + return XFS_ERROR(EINVAL); + } + new = nb - mp->m_sb.sb_dblocks; + oagcount = mp->m_sb.sb_agcount; + + /* allocate the new per-ag structures */ + if (nagcount > oagcount) { + error = xfs_initialize_perag(mp, nagcount, &nagimax); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), + XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { + xfs_trans_cancel(tp, 0); + return error; + } + + /* + * Write new AG headers to disk. Non-transactional, but written + * synchronously so they are completed prior to the growfs transaction + * being logged. 
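The nb/nb_mod/nagcount arithmetic at the top of xfs_growfs_data_private() above is easier to see with numbers: the requested size is divided by the fixed AG size, and a partial AG at the end is kept only if it is at least XFS_MIN_AG_BLOCKS (64 blocks) long, otherwise the grow is trimmed back to whole AGs. An illustrative restatement with worked examples; the helper is not part of the patch:

/*
 * Illustrative only: the effective rule for the new AG count.
 *
 * Example: agblocks = 1000000, newblocks = 2500032
 *	-> 2 whole AGs plus a 500032-block tail (>= 64), so 3 AGs.
 * Example: agblocks = 1000000, newblocks = 2000010
 *	-> the 10-block tail is below XFS_MIN_AG_BLOCKS, so 2 AGs and
 *	   the grow request is trimmed to 2000000 blocks.
 */
#include <stdint.h>

static uint64_t example_new_ag_count(uint64_t newblocks, uint64_t agblocks)
{
	uint64_t nag  = newblocks / agblocks;
	uint64_t tail = newblocks % agblocks;

	if (tail >= 64)			/* XFS_MIN_AG_BLOCKS */
		nag++;
	return nag;
}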
+ */ + nfree = 0; + for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + /* + * AG freelist header block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + agf = XFS_BUF_TO_AGF(bp); + memset(agf, 0, mp->m_sb.sb_sectsize); + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(agno); + if (agno == nagcount - 1) + agsize = + nb - + (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + else + agsize = mp->m_sb.sb_agblocks; + agf->agf_length = cpu_to_be32(agsize); + agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); + agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + agf->agf_flfirst = 0; + agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flcount = 0; + tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); + agf->agf_freeblks = cpu_to_be32(tmpsize); + agf->agf_longest = cpu_to_be32(tmpsize); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * AG inode header block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + agi = XFS_BUF_TO_AGI(bp); + memset(agi, 0, mp->m_sb.sb_sectsize); + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(agno); + agi->agi_length = cpu_to_be32(agsize); + agi->agi_count = 0; + agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); + agi->agi_level = cpu_to_be32(1); + agi->agi_freecount = 0; + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * BNO btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); + block = XFS_BUF_TO_BLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); + block->bb_level = 0; + block->bb_numrecs = cpu_to_be16(1); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_blockcount = cpu_to_be32( + agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * CNT btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); + block = XFS_BUF_TO_BLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); + block->bb_level = 0; + block->bb_numrecs = cpu_to_be16(1); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_blockcount = cpu_to_be32( + agsize - be32_to_cpu(arec->ar_startblock)); + nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * INO btree root block + */ + bp = 
xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); + block = XFS_BUF_TO_BLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); + block->bb_level = 0; + block->bb_numrecs = 0; + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + } + xfs_trans_agblocks_delta(tp, nfree); + /* + * There are new blocks in the old last a.g. + */ + if (new) { + /* + * Change the agi length. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agi = XFS_BUF_TO_AGI(bp); + be32_add_cpu(&agi->agi_length, new); + ASSERT(nagcount == oagcount || + be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agf = XFS_BUF_TO_AGF(bp); + be32_add_cpu(&agf->agf_length, new); + ASSERT(be32_to_cpu(agf->agf_length) == + be32_to_cpu(agi->agi_length)); + + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + /* + * Free the new space. + */ + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, + be32_to_cpu(agf->agf_length) - new), new); + if (error) { + goto error0; + } + } + + /* + * Update changed superblock fields transactionally. These are not + * seen by the rest of the world until the transaction commit applies + * them atomically to the superblock. + */ + if (nagcount > oagcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); + if (nb > mp->m_sb.sb_dblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, + nb - mp->m_sb.sb_dblocks); + if (nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); + if (dpct) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + error = xfs_trans_commit(tp, 0); + if (error) + return error; + + /* New allocation groups fully initialized, so update mount struct */ + if (nagimax) + mp->m_maxagi = nagimax; + if (mp->m_sb.sb_imax_pct) { + __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); + + /* update secondary superblocks. */ + for (agno = 1; agno < nagcount; agno++) { + error = xfs_read_buf(mp, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) { + xfs_warn(mp, + "error %d reading secondary superblock for ag %d", + error, agno); + break; + } + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* + * If we get an error writing out the alternate superblocks, + * just issue a warning and continue. The real work is + * already done and committed. 
+ */ + if (!(error = xfs_bwrite(mp, bp))) { + continue; + } else { + xfs_warn(mp, + "write error %d updating secondary superblock for ag %d", + error, agno); + break; /* no point in continuing */ + } + } + return 0; + + error0: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +static int +xfs_growfs_log_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_log_t *in) /* growfs log input struct */ +{ + xfs_extlen_t nb; + + nb = in->newblocks; + if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) + return XFS_ERROR(EINVAL); + if (nb == mp->m_sb.sb_logblocks && + in->isint == (mp->m_sb.sb_logstart != 0)) + return XFS_ERROR(EINVAL); + /* + * Moving the log is hard, need new interfaces to sync + * the log first, hold off all activity while moving it. + * Can have shorter or longer log in the same space, + * or transform internal to external log or vice versa. + */ + return XFS_ERROR(ENOSYS); +} + +/* + * protected versions of growfs function acquire and release locks on the mount + * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, + * XFS_IOC_FSGROWFSRT + */ + + +int +xfs_growfs_data( + xfs_mount_t *mp, + xfs_growfs_data_t *in) +{ + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (!mutex_trylock(&mp->m_growlock)) + return XFS_ERROR(EWOULDBLOCK); + error = xfs_growfs_data_private(mp, in); + mutex_unlock(&mp->m_growlock); + return error; +} + +int +xfs_growfs_log( + xfs_mount_t *mp, + xfs_growfs_log_t *in) +{ + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (!mutex_trylock(&mp->m_growlock)) + return XFS_ERROR(EWOULDBLOCK); + error = xfs_growfs_log_private(mp, in); + mutex_unlock(&mp->m_growlock); + return error; +} + +/* + * exported through ioctl XFS_IOC_FSCOUNTS + */ + +int +xfs_fs_counts( + xfs_mount_t *mp, + xfs_fsop_counts_t *cnt) +{ + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + spin_lock(&mp->m_sb_lock); + cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + cnt->freertx = mp->m_sb.sb_frextents; + cnt->freeino = mp->m_sb.sb_ifree; + cnt->allocino = mp->m_sb.sb_icount; + spin_unlock(&mp->m_sb_lock); + return 0; +} + +/* + * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS + * + * xfs_reserve_blocks is called to set m_resblks + * in the in-core mount table. The number of unused reserved blocks + * is kept in m_resblks_avail. + * + * Reserve the requested number of blocks if available. Otherwise return + * as many as possible to satisfy the request. The actual number + * reserved are returned in outval + * + * A null inval pointer indicates that only the current reserved blocks + * available should be returned no settings are changed. + */ + +int +xfs_reserve_blocks( + xfs_mount_t *mp, + __uint64_t *inval, + xfs_fsop_resblks_t *outval) +{ + __int64_t lcounter, delta, fdblks_delta; + __uint64_t request; + + /* If inval is null, report current values and return */ + if (inval == (__uint64_t *)NULL) { + if (!outval) + return EINVAL; + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + return 0; + } + + request = *inval; + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the m_sb_lock so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. 
This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + * + * We also use the xfs_mod_incore_sb() interface so that we + * don't have to care about whether per cpu counter are + * enabled, disabled or even compiled in.... + */ +retry: + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, 0); + + /* + * If our previous reservation was larger than the current value, + * then move any unused blocks back to the free pool. + */ + fdblks_delta = 0; + if (mp->m_resblks > request) { + lcounter = mp->m_resblks_avail - request; + if (lcounter > 0) { /* release unused blocks */ + fdblks_delta = lcounter; + mp->m_resblks_avail -= lcounter; + } + mp->m_resblks = request; + } else { + __int64_t free; + + free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + + delta = request - mp->m_resblks; + lcounter = free - delta; + if (lcounter < 0) { + /* We can't satisfy the request, just get what we can */ + mp->m_resblks += free; + mp->m_resblks_avail += free; + fdblks_delta = -free; + } else { + fdblks_delta = -delta; + mp->m_resblks = request; + mp->m_resblks_avail += delta; + } + } +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + spin_unlock(&mp->m_sb_lock); + + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at its max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... + */ + int error; + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + fdblks_delta, 0); + if (error == ENOSPC) + goto retry; + } + return 0; +} + +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. 
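+ *
+ * Illustrative caller sketch (an assumption for illustration, not part
+ * of this file): the periodic sync code is expected to pair this with a
+ * "does the log need covering" check, roughly
+ *
+ *     if (xfs_log_need_covered(mp))
+ *             error = xfs_fs_log_dummy(mp);
+ *
+ * where xfs_log_need_covered() is assumed here purely as an example.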
+ */ +int +xfs_fs_log_dummy( + xfs_mount_t *mp) +{ + xfs_trans_t *tp; + int error; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + /* log the UUID because it is an unchanging field */ + xfs_mod_sb(tp, XFS_SB_UUID); + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); +} + +int +xfs_fs_goingdown( + xfs_mount_t *mp, + __uint32_t inflags) +{ + switch (inflags) { + case XFS_FSOP_GOING_FLAGS_DEFAULT: { + struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); + + if (sb && !IS_ERR(sb)) { + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + thaw_bdev(sb->s_bdev, sb); + } + + break; + } + case XFS_FSOP_GOING_FLAGS_LOGFLUSH: + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + break; + case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: + xfs_force_shutdown(mp, + SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); + break; + default: + return XFS_ERROR(EINVAL); + } + + return 0; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h new file mode 100644 index 00000000..1b6a98b6 --- /dev/null +++ b/fs/xfs/xfs_fsops.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FSOPS_H__ +#define __XFS_FSOPS_H__ + +extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); +extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); +extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); +extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); +extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, + xfs_fsop_resblks_t *outval); +extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); + +#endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c new file mode 100644 index 00000000..84ebeec1 --- /dev/null +++ b/fs/xfs/xfs_ialloc.c @@ -0,0 +1,1563 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" + + +/* + * Allocation group level functions. + */ +static inline int +xfs_ialloc_cluster_alignment( + xfs_alloc_arg_t *args) +{ + if (xfs_sb_version_hasalign(&args->mp->m_sb) && + args->mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + return args->mp->m_sb.sb_inoalignmt; + return 1; +} + +/* + * Lookup a record by ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + xfs_lookup_t dir, /* <=, >=, == */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); +} + +/* + * Update the record referred to by cur to the value given. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec) /* btree record */ +{ + union xfs_btree_rec rec; + + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec, /* btree record */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + } + return error; +} + +/* + * Verify that the number of free inodes in the AGI is correct. + */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + +/* + * Initialise a new set of inodes. 
+ */ +STATIC void +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int blks_per_cluster, nbufs, ninodes; + int version; + int i, j; + xfs_daddr_t d; + + /* + * Loop over the new block(s), filling in the inodes. + * For small block sizes, manipulate the inodes in buffers + * which are multiples of the blocks size. + */ + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + nbufs = length; + ninodes = mp->m_sb.sb_inopblock; + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + nbufs = length / blks_per_cluster; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + } + + /* + * Figure out what version number to use in the inodes we create. + * If the superblock version has caught up to the one that supports + * the new inode format, then use the new inode version. Otherwise + * use the old version so that old kernels will continue to be + * able to use the file system. + */ + if (xfs_sb_version_hasnlink(&mp->m_sb)) + version = 2; + else + version = 1; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XBF_LOCK); + ASSERT(fbuf); + ASSERT(!XFS_BUF_GETERROR(fbuf)); + + /* + * Initialize all inodes in this buffer and then log them. + * + * XXX: It would be much better if we had just one transaction + * to log a whole cluster of inodes instead of all the + * individual transactions causing a lot of log traffic. + */ + xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + for (i = 0; i < ninodes; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = sizeof(struct xfs_dinode); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + } + xfs_trans_inode_alloc_buf(tp, fbuf); + } +} + +/* + * Allocate new inodes in the allocation group specified by agbp. + * Return 0 for success, else error code. + */ +STATIC int /* error code or 0 */ +xfs_ialloc_ag_alloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* alloc group buffer */ + int *alloc) +{ + xfs_agi_t *agi; /* allocation group header */ + xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + xfs_agnumber_t agno; + int error; + int i; + xfs_agino_t newino; /* new first inode's number */ + xfs_agino_t newlen; /* new number of inodes */ + xfs_agino_t thisino; /* current inode number, for loop */ + int isaligned = 0; /* inode allocation at stripe unit */ + /* boundary */ + struct xfs_perag *pag; + + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * Locking will ensure that we don't have two callers in here + * at one time. + */ + newlen = XFS_IALLOC_INODES(args.mp); + if (args.mp->m_maxicount && + args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + return XFS_ERROR(ENOSPC); + args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + /* + * First try to allocate inodes contiguous with the last-allocated + * chunk of inodes. If the filesystem is striped, this will fill + * an entire stripe unit with inodes. 
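+ *
+ * Descriptive note: the target computed below is the block immediately
+ * following the previously allocated chunk,
+ *
+ *     args.agbno = XFS_AGINO_TO_AGBNO(mp, newino) + XFS_IALLOC_BLOCKS(mp)
+ *
+ * and it is attempted as an exact XFS_ALLOCTYPE_THIS_BNO allocation, so
+ * a miss here simply falls through to the XFS_ALLOCTYPE_NEAR_BNO path
+ * further down.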
+ */ + agi = XFS_BUF_TO_AGI(agbp); + newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.mod = args.total = args.wasdel = args.isfl = + args.userdata = args.minalignslop = 0; + args.prod = 1; + + /* + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every oher + * attempt allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. + */ + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + + /* Allow space for the inode btree to split. */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } else + args.fsbno = NULLFSBLOCK; + + if (unlikely(args.fsbno == NULLFSBLOCK)) { + /* + * Set the alignment for the allocation. + * If stripe alignment is turned on then align at stripe unit + * boundary. + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size + * pieces, so don't need alignment anyway. + */ + isaligned = 0; + if (args.mp->m_sinoalign) { + ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + args.alignment = args.mp->m_dalign; + isaligned = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(&args); + /* + * Need to figure out where to allocate the inode blocks. + * Ideally they should be spaced out through the a.g. + * For now, just allocate blocks up front. + */ + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + /* + * Allocate a fixed-size extent of inodes. + */ + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.mod = args.total = args.wasdel = args.isfl = + args.userdata = args.minalignslop = 0; + args.prod = 1; + /* + * Allow space for the inode btree to split. + */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + /* + * If stripe alignment is turned on, then try again with cluster + * alignment. + */ + if (isaligned && args.fsbno == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = xfs_ialloc_cluster_alignment(&args); + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + if (args.fsbno == NULLFSBLOCK) { + *alloc = 0; + return 0; + } + ASSERT(args.len == args.minlen); + + /* + * Stamp and write the inode buffers. + * + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, + random32()); + + /* + * Convert the results. 
+ */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + be32_add_cpu(&agi->agi_count, newlen); + be32_add_cpu(&agi->agi_freecount, newlen); + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); + agi->agi_newino = cpu_to_be32(newino); + + /* + * Insert records describing the new inode chunk into the btree. + */ + cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + cur->bc_rec.i.ir_startino = thisino; + cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; + cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; + error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + error = xfs_btree_insert(cur, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + /* + * Log allocation group header fields + */ + xfs_ialloc_log_agi(tp, agbp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); + /* + * Modify/log superblock values for inode count and inode free count. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); + *alloc = 1; + return 0; +} + +STATIC xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor == mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +/* + * Select an allocation group to look for a free inode in, based on the parent + * inode and then mode. Return the allocation group buffer. + */ +STATIC xfs_buf_t * /* allocation group buffer */ +xfs_ialloc_ag_select( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent directory inode number */ + mode_t mode, /* bits set to indicate file type */ + int okalloc) /* ok to allocate more space */ +{ + xfs_buf_t *agbp; /* allocation group header buffer */ + xfs_agnumber_t agcount; /* number of ag's in the filesystem */ + xfs_agnumber_t agno; /* current ag number */ + int flags; /* alloc buffer locking flags */ + xfs_extlen_t ineed; /* blocks needed for inode allocation */ + xfs_extlen_t longest = 0; /* longest extent available */ + xfs_mount_t *mp; /* mount point structure */ + int needspace; /* file mode implies space allocated */ + xfs_perag_t *pag; /* per allocation group data */ + xfs_agnumber_t pagno; /* parent (starting) ag number */ + + /* + * Files of these types need at least one block if length > 0 + * (and they won't fit in the inode, but that's hard to figure out). + */ + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + mp = tp->t_mountp; + agcount = mp->m_maxagi; + if (S_ISDIR(mode)) + pagno = xfs_ialloc_next_ag(mp); + else { + pagno = XFS_INO_TO_AGNO(mp, parent); + if (pagno >= agcount) + pagno = 0; + } + ASSERT(pagno < agcount); + /* + * Loop through allocation groups, looking for one with a little + * free space in it. Note we don't look for free inodes, exactly. + * Instead, we include whether there is a need to allocate inodes + * to mean that blocks must be allocated for them, + * if none are currently free. 
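+ *
+ * Descriptive note: the scan below makes at most two passes.  The first
+ * pass uses XFS_ALLOC_FLAG_TRYLOCK so that a busy AGF is skipped rather
+ * than waited on; if the scan wraps back around to the starting AG,
+ * flags is cleared and a second, blocking pass is made before giving up.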
+ */ + agno = pagno; + flags = XFS_ALLOC_FLAG_TRYLOCK; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_init) { + if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + } else + agbp = NULL; + + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto unlock_nextag; + } + + /* + * Is there enough free space for the file plus a block + * of inodes (if we need to allocate some)? + */ + ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); + if (ineed && !pag->pagf_init) { + if (agbp == NULL && + xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + (void)xfs_alloc_pagf_init(mp, tp, agno, flags); + } + if (!ineed || pag->pagf_init) { + if (ineed && !(longest = pag->pagf_longest)) + longest = pag->pagf_flcount > 0; + if (!ineed || + (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed && + okalloc)) { + if (agbp == NULL && + xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + xfs_perag_put(pag); + return agbp; + } + } +unlock_nextag: + if (agbp) + xfs_trans_brelse(tp, agbp); +nextag: + xfs_perag_put(pag); + /* + * No point in iterating over the rest, if we're shutting + * down. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + return NULL; + agno++; + if (agno >= agcount) + agno = 0; + if (agno == pagno) { + if (flags == 0) + return NULL; + flags = 0; + } + } +} + +/* + * Try to retrieve the next record to the left/right from the current one. + */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); + + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + +/* + * Visible inode allocation functions. + */ + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * The arguments IO_agbp and alloc_done are defined to work within + * the constraint of one allocation per transaction. + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. On the first call, + * IO_agbp should be set to NULL. If an inode is available, + * i.e., xfs_dialloc() did not need to do an allocation, an inode + * number is returned. In this case, IO_agbp would be set to the + * current ag_buf and alloc_done set to false. + * If an allocation needed to be done, xfs_dialloc would return + * the current ag_buf in IO_agbp and set alloc_done to true. + * The caller should then commit the current transaction, allocate a new + * transaction, and call xfs_dialloc() again, passing in the previous + * value of IO_agbp. IO_agbp should be held across the transactions. + * Since the agbp is locked across the two calls, the second call is + * guaranteed to have a free inode available. 
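+ *
+ * Illustrative caller sketch (hedged pseudo-code, not a real caller in
+ * this file; error handling omitted, "done" stands for alloc_done):
+ *
+ *     agbp = NULL;
+ *     error = xfs_dialloc(tp, parent, mode, okalloc, &agbp, &done, &ino);
+ *     if (!error && done) {
+ *             ... commit tp, allocate a new transaction ntp ...
+ *             error = xfs_dialloc(ntp, parent, mode, okalloc, &agbp,
+ *                                 &done, &ino);
+ *     }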
+ * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + mode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ + boolean_t *alloc_done, /* true if we needed to replenish + inode freelist */ + xfs_ino_t *inop) /* inode number allocated */ +{ + xfs_agnumber_t agcount; /* number of allocation groups */ + xfs_buf_t *agbp; /* allocation group header's buffer */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agi_t *agi; /* allocation group header structure */ + xfs_btree_cur_t *cur; /* inode allocation btree cursor */ + int error; /* error return value */ + int i; /* result code */ + int ialloced; /* inode allocation status */ + int noroom = 0; /* no space for inode blk allocation */ + xfs_ino_t ino; /* fs-relative inode to be returned */ + /* REFERENCED */ + int j; /* result code */ + xfs_mount_t *mp; /* file system mount structure */ + int offset; /* index of inode in chunk */ + xfs_agino_t pagino; /* parent's AG relative inode # */ + xfs_agnumber_t pagno; /* parent's AG number */ + xfs_inobt_rec_incore_t rec; /* inode allocation record */ + xfs_agnumber_t tagno; /* testing allocation group number */ + xfs_btree_cur_t *tcur; /* temp cursor */ + xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ + struct xfs_perag *pag; + + + if (*IO_agbp == NULL) { + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + /* + * Couldn't find an allocation group satisfying the + * criteria, give up. + */ + if (!agbp) { + *inop = NULLFSINO; + return 0; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + } else { + /* + * Continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + } + mp = tp->t_mountp; + agcount = mp->m_sb.sb_agcount; + agno = be32_to_cpu(agi->agi_seqno); + tagno = agno; + pagno = XFS_INO_TO_AGNO(mp, parent); + pagino = XFS_INO_TO_AGINO(mp, parent); + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + + if (mp->m_maxicount && + mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + *alloc_done = B_FALSE; + while (!agi->agi_freecount) { + /* + * Don't do anything if we're not supposed to allocate + * any blocks, just go on to the next ag. + */ + if (okalloc) { + /* + * Try to allocate some new inodes in the allocation + * group. 
+ */ + if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { + xfs_trans_brelse(tp, agbp); + if (error == ENOSPC) { + *inop = NULLFSINO; + return 0; + } else + return error; + } + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + *alloc_done = B_TRUE; + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + } + /* + * If it failed, give up on this ag. + */ + xfs_trans_brelse(tp, agbp); + /* + * Go on to the next ag: get its ag header. + */ +nextag: + if (++tagno == agcount) + tagno = 0; + if (tagno == agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } + pag = xfs_perag_get(mp, tagno); + if (pag->pagi_inodeok == 0) { + xfs_perag_put(pag); + goto nextag; + } + error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); + xfs_perag_put(pag); + if (error) + goto nextag; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + } + /* + * Here with an allocation group that has a free inode. + * Reset agno since we may have chosen a new ag in the + * loop above. + */ + agno = tagno; + *IO_agbp = NULL; + pag = xfs_perag_get(mp, agno); + + restart_pagno: + cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + /* + * If in the same AG as the parent, try to get near the parent. + */ + if (pagno == agno) { + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (rec.ir_freecount > 0) { + /* + * Found a free inode in the same chunk + * as the parent, done. + */ + goto alloc_inode; + } + + + /* + * In the same AG as parent, but parent's chunk is full. + */ + + /* duplicate the cursor, search left & right simultaneously */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * Skip to last blocks looked up if same parent inode. + */ + if (pagino != NULLAGINO && + pag->pagl_pagino == pagino && + pag->pagl_leftrec != NULLAGINO && + pag->pagl_rightrec != NULLAGINO) { + error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, + &trec, &doneleft, 1); + if (error) + goto error1; + + error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, + &rec, &doneright, 0); + if (error) + goto error1; + } else { + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) + goto error1; + + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + } + + /* + * Loop until we find an inode chunk with a free inode. 
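+ *
+ * Descriptive note: cur walks right from the parent's chunk while the
+ * duplicated cursor tcur walks left; whichever side yields the closer
+ * chunk with free inodes wins.  After searchdistance steps without a
+ * hit, the current positions are cached in pagl_leftrec/pagl_rightrec
+ * and the code falls back to the newino/whole-AG search below.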
+ */ + while (!doneleft || !doneright) { + int useleft; /* using left inode chunk this time */ + + if (!--searchdistance) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto newino; + } + + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - + (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + } else { + useleft = !doneleft; + } + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { + rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* free inodes to the right? */ + if (!useleft && rec.ir_freecount) { + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* get next record to check */ + if (useleft) { + error = xfs_ialloc_next_rec(tcur, &trec, + &doneleft, 1); + } else { + error = xfs_ialloc_next_rec(cur, &rec, + &doneright, 0); + } + if (error) + goto error1; + } + + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } + + /* + * In a different AG from the parent. + * See if the most recently allocated block has any free. + */ +newino: + if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); + if (error) + goto error0; + + if (i == 1) { + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + + if (j == 1 && rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group + * still has a free inode. 
+ */ + goto alloc_inode; + } + } + } + + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + +alloc_inode: + offset = xfs_ialloc_find_free(&rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + error = xfs_inobt_update(cur, &rec); + if (error) + goto error0; + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + xfs_perag_put(pag); + *inop = ino; + return 0; +error1: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + xfs_bmap_free_t *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino) /* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + xfs_buf_t *agbp; /* buffer containing allocation group header */ + xfs_agino_t agino; /* inode number relative to allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agi_t *agi; /* allocation group header */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + int error; /* error return value */ + int i; /* result code */ + int ilen; /* inodes in an inode cluster */ + xfs_mount_t *mp; /* mount structure for filesystem */ + int off; /* offset of inode in inode chunk */ + xfs_inobt_rec_incore_t rec; /* btree record */ + struct xfs_perag *pag; + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. 
+ */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agbno < be32_to_cpu(agi->agi_length)); + /* + * Initialize the cursor. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + /* + * Look for the entry describing this inode. + */ + if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { + xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", + __func__, error); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", + __func__, error); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Get the offset in the inode chunk. + */ + off = agino - rec.ir_startino; + ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); + ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); + /* + * Mark the inode free & increment the count. + */ + rec.ir_free |= XFS_INOBT_MASK(off); + rec.ir_freecount++; + + /* + * When an inode cluster is free, it becomes eligible for removal + */ + if (!(mp->m_flags & XFS_MOUNT_IKEEP) && + (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + + *delete = 1; + *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + + /* + * Remove the inode cluster from the AGI B+Tree, adjust the + * AGI and Superblock inode counts, and mark the disk space + * to be freed when the transaction is committed. + */ + ilen = XFS_IALLOC_INODES(mp); + be32_add_cpu(&agi->agi_count, -ilen); + be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount -= ilen - 1; + xfs_perag_put(pag); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); + + if ((error = xfs_btree_delete(cur, &i))) { + xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", + __func__, error); + goto error0; + } + + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, + agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), + XFS_IALLOC_BLOCKS(mp), flist, mp); + } else { + *delete = 0; + + error = xfs_inobt_update(cur, &rec); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", + __func__, error); + goto error0; + } + + /* + * Change the inode free counts and log the ag/sb changes. 
+ */ + be32_add_cpu(&agi->agi_freecount, 1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount++; + xfs_perag_put(pag); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); + } + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +STATIC int +xfs_imap_lookup( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_agblock_t *chunk_agbno, + xfs_agblock_t *offset_agbno, + int flags) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + int error; + int i; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_alert(mp, + "%s: xfs_ialloc_read_agi() returned error %d, agno %d", + __func__, error, agno); + return error; + } + + /* + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); + if (!error) { + if (i) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (!error && i == 0) + error = EINVAL; + } + + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + + /* for untrusted inodes check it is allocated first */ + if ((flags & XFS_IGET_UNTRUSTED) && + (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) + return EINVAL; + + *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); + *offset_agbno = agbno - *chunk_agbno; + return 0; +} + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ +{ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agnumber_t agno; /* allocation group number */ + int blks_per_cluster; /* num blocks per inode cluster */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + int error; /* error code */ + int offset; /* index of inode in its buffer */ + int offset_agbno; /* blks from chunk start to inode */ + + ASSERT(ino != NULLFSINO); + + /* + * Split up the inode number into its parts. + */ + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { +#ifdef DEBUG + /* + * Don't output diagnostic information for untrusted inodes + * as they can be invalid without implying corruption. 
+ */ + if (flags & XFS_IGET_UNTRUSTED) + return XFS_ERROR(EINVAL); + if (agno >= mp->m_sb.sb_agcount) { + xfs_alert(mp, + "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", + __func__, agno, mp->m_sb.sb_agcount); + } + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_alert(mp, + "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", + __func__, (unsigned long long)agbno, + (unsigned long)mp->m_sb.sb_agblocks); + } + if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_alert(mp, + "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + __func__, ino, + XFS_AGINO_TO_INO(mp, agno, agino)); + } + xfs_stack_trace(); +#endif /* DEBUG */ + return XFS_ERROR(EINVAL); + } + + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + + /* + * For bulkstat and handle lookups, we have an untrusted inode number + * that we have to verify is valid. We cannot do this just by reading + * the inode buffer as it may have been unlinked and removed leaving + * inodes in stale state on disk. Hence we have to do a btree lookup + * in all cases where an untrusted inode number is passed. + */ + if (flags & XFS_IGET_UNTRUSTED) { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + goto out_map; + } + + /* + * If the inode cluster size is the same as the blocksize or + * smaller we get to the buffer by simple arithmetics. + */ + if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + offset = XFS_INO_TO_OFFSET(mp, ino); + ASSERT(offset < mp->m_sb.sb_inopblock); + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_len = XFS_FSB_TO_BB(mp, 1); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + return 0; + } + + /* + * If the inode chunks are aligned then use simple maths to + * find the location. Otherwise we have to do a btree + * lookup to find the location. + */ + if (mp->m_inoalign_mask) { + offset_agbno = agbno & mp->m_inoalign_mask; + chunk_agbno = agbno - offset_agbno; + } else { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + } + +out_map: + ASSERT(agbno >= chunk_agbno); + cluster_agbno = chunk_agbno + + ((offset_agbno / blks_per_cluster) * blks_per_cluster); + offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + + XFS_INO_TO_OFFSET(mp, ino); + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_alert(mp, + "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", + __func__, (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return XFS_ERROR(EINVAL); + } + return 0; +} + +/* + * Compute and fill in value of m_in_maxlevels. 
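+ *
+ * Worked example (hypothetical values, for illustration only): with
+ * maxleafents = 4096, minleafrecs = 16 and minnoderecs = 16 the code
+ * below computes
+ *
+ *     maxblocks = ceil(4096 / 16) = 256   (leaf level)
+ *     256 -> ceil(256 / 16) = 16          (level 2)
+ *     16  -> ceil(16 / 16)  = 1           (level 3)
+ *
+ * so the loop exits with m_in_maxlevels = 3.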
+ */ +void +xfs_ialloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> + XFS_INODES_PER_CHUNK_LOG; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_in_maxlevels = level; +} + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* allocation group header buffer */ + int fields) /* bitmask of fields to log */ +{ + int first; /* first byte number */ + int last; /* last byte number */ + static const short offsets[] = { /* field starting offsets */ + /* keep in sync with bit definitions */ + offsetof(xfs_agi_t, agi_magicnum), + offsetof(xfs_agi_t, agi_versionnum), + offsetof(xfs_agi_t, agi_seqno), + offsetof(xfs_agi_t, agi_length), + offsetof(xfs_agi_t, agi_count), + offsetof(xfs_agi_t, agi_root), + offsetof(xfs_agi_t, agi_level), + offsetof(xfs_agi_t, agi_freecount), + offsetof(xfs_agi_t, agi_newino), + offsetof(xfs_agi_t, agi_dirino), + offsetof(xfs_agi_t, agi_unlinked), + sizeof(xfs_agi_t) + }; +#ifdef DEBUG + xfs_agi_t *agi; /* allocation group header */ + + agi = XFS_BUF_TO_AGI(bp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); +#endif + /* + * Compute byte offsets for the first and last fields. + */ + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + /* + * Log the allocation group inode header buffer. + */ + xfs_trans_log_buf(tp, bp, first, last); +} + +#ifdef DEBUG +STATIC void +xfs_check_agi_unlinked( + struct xfs_agi *agi) +{ + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); +} +#else +#define xfs_check_agi_unlinked(agi) +#endif + +/* + * Read in the allocation group header (inode allocation section) + */ +int +xfs_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + int agi_ok; /* agi is consistent */ + int error; + + ASSERT(agno != NULLAGNUMBER); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, bpp); + if (error) + return error; + + ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); + agi = XFS_BUF_TO_AGI(*bpp); + + /* + * Validate the magic number of the agi block. 
+ */ + agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && + be32_to_cpu(agi->agi_seqno) == agno; + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, + mp, agi); + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EFSCORRUPTED); + } + + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + + xfs_check_agi_unlinked(agi); + return 0; +} + +int +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + error = xfs_read_agi(mp, tp, agno, bpp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_init) { + pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); + pag->pagi_init = 1; + } + + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || + XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); + return 0; +} + +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h new file mode 100644 index 00000000..bb538547 --- /dev/null +++ b/fs/xfs/xfs_ialloc.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_H__ +#define __XFS_IALLOC_H__ + +struct xfs_buf; +struct xfs_dinode; +struct xfs_imap; +struct xfs_mount; +struct xfs_trans; + +/* + * Allocation parameters for inode allocation. + */ +#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos +#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks + +/* + * Move inodes in clusters of this size. + */ +#define XFS_INODE_BIG_CLUSTER_SIZE 8192 +#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* + * Make an inode pointer out of the buffer/offset. + */ +static inline struct xfs_dinode * +xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) +{ + return (xfs_dinode_t *) + (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); +} + +/* + * Find a free (set) bit in the inode bitmask. 
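+ *
+ * Descriptive note: xfs_lowbit64() yields the index of the lowest set
+ * bit, so e.g. a free mask of 0x8 maps to inode offset 3 within the
+ * chunk; an all-zero mask would yield -1, which callers only avoid by
+ * using this on chunks known to have free inodes (xfs_dialloc() asserts
+ * offset >= 0).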
+ */ +static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) +{ + return xfs_lowbit64(*fp); +} + + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * To work within the constraint of one allocation per transaction, + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. If an inode is + * available without an allocation, agbp would be set to the current + * agbp and alloc_done set to false. + * If an allocation needed to be done, agbp would be set to the + * inode header of the allocation group and alloc_done set to true. + * The caller should then commit the current transaction and allocate a new + * transaction. xfs_dialloc() should then be called again with + * the agbp value returned from the previous call. + * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + * + * *agbp should be set to NULL on the first call, *alloc_done set to FALSE. + */ +int /* error */ +xfs_dialloc( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + mode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + struct xfs_buf **agbp, /* buf for a.g. inode header */ + boolean_t *alloc_done, /* an allocation was done to replenish + the free inodes */ + xfs_ino_t *inop); /* inode number allocated */ + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int /* error */ +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino); /* first inode in deleted cluster */ + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags); /* flags for inode btree lookup */ + +/* + * Compute and fill in value of m_in_maxlevels. 
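+ *
+ * A hypothetical worked example of the computation done in
+ * xfs_ialloc_compute_maxlevels() (the numbers are illustrative only,
+ * not real geometry): with 1,000,000 possible leaf records and a
+ * minimum of 100 records per leaf or node block, the tree needs
+ * 1,000,000 / 100 = 10,000 leaf blocks, then 10,000 / 100 = 100 node
+ * blocks, then 100 / 100 = 1 root block, so m_in_maxlevels ends up 3.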
+ */ +void +xfs_ialloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* allocation group header buffer */ + int fields); /* bitmask of fields to log */ + +/* + * Read in the allocation group header (inode allocation section) + */ +int /* error */ +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp); /* allocation group hdr buf */ + +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + +/* + * Lookup a record by ino in the btree given by cur. + */ +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); + +/* + * Get the data from the pointed-to record. + */ +extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); + +#endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c new file mode 100644 index 00000000..16921f55 --- /dev/null +++ b/fs/xfs/xfs_ialloc_btree.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" + + +STATIC int +xfs_inobt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mnr[level != 0]; +} + +STATIC struct xfs_btree_cur * +xfs_inobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno); +} + +STATIC void +xfs_inobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_root = nptr->s; + be32_add_cpu(&agi->agi_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); +} + +STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + xfs_agblock_t sbno = be32_to_cpu(start->s); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + + error = xfs_alloc_vextent(&args); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + + new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); + *stat = 1; + return 0; +} + +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + xfs_fsblock_t fsbno; + int error; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_free_extent(cur->bc_tp, fsbno, 1); + if (error) + return error; + + xfs_trans_binval(cur->bc_tp, bp); + return error; +} + +STATIC int +xfs_inobt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mxr[level != 0]; +} + +STATIC void +xfs_inobt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->inobt.ir_startino = rec->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = key->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); +} + +/* + * initial value of ptr for lookup + */ +STATIC void +xfs_inobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + 
union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + + ptr->s = agi->agi_root; +} + +STATIC __int64_t +xfs_inobt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + cur->bc_rec.i.ir_startino; +} + +#ifdef DEBUG +STATIC int +xfs_inobt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->inobt.ir_startino) < + be32_to_cpu(k2->inobt.ir_startino); +} + +STATIC int +xfs_inobt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->inobt.ir_startino); +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_inobt_trace_buf; + +STATIC void +xfs_inobt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type, + (void *)func, (void *)s, NULL, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_inobt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + *s0 = cur->bc_private.a.agno; + *l0 = cur->bc_rec.i.ir_startino; + *l1 = cur->bc_rec.i.ir_free; +} + +STATIC void +xfs_inobt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be32_to_cpu(key->inobt.ir_startino); + *l1 = 0; +} + +STATIC void +xfs_inobt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + *l0 = be32_to_cpu(rec->inobt.ir_startino); + *l1 = be32_to_cpu(rec->inobt.ir_freecount); + *l2 = be64_to_cpu(rec->inobt.ir_free); +} +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_inobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_inobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_inobt_trace_enter, + .trace_cursor = xfs_inobt_trace_cursor, + .trace_key = xfs_inobt_trace_key, + .trace_record = xfs_inobt_trace_record, +#endif +}; + +/* + * Allocate a new inode btree cursor. 
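+ *
+ * Typical usage, sketched here for illustration only (the real call
+ * sites live elsewhere in the tree): read the AGI buffer, build the
+ * cursor, do the btree work, then release the cursor, e.g.
+ *
+ *	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ *	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ *	... xfs_inobt_lookup() / xfs_inobt_get_rec() ...
+ *	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);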
+ */ +struct xfs_btree_cur * /* new inode btree cursor */ +xfs_inobt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agi structure */ + xfs_agnumber_t agno) /* allocation group number */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_btnum = XFS_BTNUM_INO; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_inobt_ops; + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an inobt btree block. + */ +int +xfs_inobt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_INOBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +} diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h new file mode 100644 index 00000000..f782ad0c --- /dev/null +++ b/fs/xfs/xfs_ialloc_btree.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_BTREE_H__ +#define __XFS_IALLOC_BTREE_H__ + +/* + * Inode map on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * Btree block header size depends on a superblock flag. 
+ * + * (not quite yet, but soon) + */ +#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_INOBT_REC_ADDR(mp, block, index) \ + ((xfs_inobt_rec_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_inobt_rec_t)))) + +#define XFS_INOBT_KEY_ADDR(mp, block, index) \ + ((xfs_inobt_key_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_inobt_key_t))) + +#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_inobt_ptr_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_inobt_key_t) + \ + ((index) - 1) * sizeof(xfs_inobt_ptr_t))) + +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); +extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c new file mode 100644 index 00000000..ca752f05 --- /dev/null +++ b/fs/xfs/xfs_iget.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_acl.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_btree_trace.h" +#include "xfs_trace.h" + + +/* + * Define xfs inode iolock lockdep classes. We need to ensure that all active + * inodes are considered the same for lockdep purposes, including inodes that + * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to + * guarantee the locks are considered the same when there are multiple lock + * initialisation siteѕ. Also, define a reclaimable inode class so it is + * obvious in lockdep reports which class the report is against. + */ +static struct lock_class_key xfs_iolock_active; +struct lock_class_key xfs_iolock_reclaimable; + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. 
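+ *
+ * (With KM_SLEEP the zone allocator is expected to block and retry
+ * rather than fail, so the NULL check below is effectively defensive
+ * until KM_MAYFAIL is actually used.)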
+ */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_update_core = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + ip->i_size = 0; + ip->i_new_size = 0; + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_zone_free(xfs_inode_zone, ip); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. 
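+ *
+ * (The freeing side in xfs_inode_free() above zeroes ip->i_ino and
+ * sets XFS_IRECLAIM under the same ip->i_flags_lock before queueing
+ * the RCU callback, so taking that lock and re-checking the inode
+ * number here is what makes the stale-entry detection reliable.)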
+ */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. 
*/ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. + */ + if (radix_tree_preload(GFP_KERNEL)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, XFS_INEW); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. 
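+ *
+ * A minimal illustrative call (hypothetical, with no transaction and a
+ * shared inode lock; drop the lock and the reference when finished):
+ *
+ *	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
+ *	if (!error) {
+ *		... examine ip ...
+ *		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ *		IRELE(ip);
+ *	}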
+ */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + +/* + * This is a wrapper routine around the xfs_ilock() routine + * used to centralize some grungy code. It is used in places + * that wish to lock the inode solely for reading the extents. + * The reason these places can't just call xfs_ilock(SHARED) + * is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode + * is in b-tree format, then we need to lock the inode exclusively + * until the extents are read in. Locking it exclusively all + * the time would limit our parallelism unnecessarily, though. + * What we do instead is check to see if the extents have been + * read in yet, and only lock the inode exclusively if they + * have not. + * + * The function returns a value which should be given to the + * corresponding xfs_iunlock_map_shared(). This value is + * the mode in which the lock was actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. 
It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + + trace_xfs_ilock(ip, lock_flags, _RET_IP_); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. 
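+ *
+ * For example (illustrative only), a caller that took both locks
+ * exclusively drops them with the same flags:
+ *
+ *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ *	... modify the inode ...
+ *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);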
+ * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY | + XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && + !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { + /* + * Let the AIL know that this item has been unlocked in case + * it is in the AIL and anyone is waiting on it. Don't do + * this if the caller has asked us not to. + */ + xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, + (xfs_log_item_t*)(ip->i_itemp)); + } + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c new file mode 100644 index 00000000..57152799 --- /dev/null +++ b/fs/xfs/xfs_inode.c @@ -0,0 +1,4145 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_utils.h" +#include "xfs_quota.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_ifork_zone; +kmem_zone_t *xfs_inode_zone; + +/* + * Used in xfs_itruncate(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +STATIC void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) +#endif /* DEBUG */ + +/* + * Check that none of the inode's in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { + xfs_alert(mp, + "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", + bp); + ASSERT(dip->di_next_unlinked); + } + } +} +#endif + +/* + * Find the buffer associated with the given inode map + * We do basic validation checks on the buffer once it has been + * retrieved from disk. + */ +STATIC int +xfs_imap_to_bp( + xfs_mount_t *mp, + xfs_trans_t *tp, + struct xfs_imap *imap, + xfs_buf_t **bpp, + uint buf_flags, + uint iget_flags) +{ + int error; + int i; + int ni; + xfs_buf_t *bp; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp); + if (error) { + if (error != EAGAIN) { + xfs_warn(mp, + "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + } else { + ASSERT(buf_flags & XBF_TRYLOCK); + } + return error; + } + + /* + * Validate the magic number and version of every inode in the buffer + * (if DEBUG kernel) or the first inode in the buffer, otherwise. 
+ */ +#ifdef DEBUG + ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; +#else /* usual case */ + ni = 1; +#endif + + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (iget_flags & XFS_IGET_UNTRUSTED) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } + XFS_CORRUPTION_ERROR("xfs_imap_to_bp", + XFS_ERRLEVEL_HIGH, mp, dip); +#ifdef DEBUG + xfs_emerg(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)imap->im_blkno, i, + be16_to_cpu(dip->di_magic)); + ASSERT(0); +#endif + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + } + + xfs_inobp_check(mp, bp); + + /* + * Mark the buffer as an inode buffer now that it looks good + */ + XFS_BUF_SET_VTYPE(bp, B_FS_INO); + + *bpp = bp; + return 0; +} + +/* + * This routine is called to map an inode number within a file + * system to the buffer containing the on-disk version of the + * inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dip parameter + * it returns a pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and + * dipp are undefined. + * + * Use xfs_imap() to determine the size and location of the + * buffer to read from disk. + */ +int +xfs_inotobp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + xfs_dinode_t **dipp, + xfs_buf_t **bpp, + int *offset, + uint imap_flags) +{ + struct xfs_imap imap; + xfs_buf_t *bp; + int error; + + imap.im_blkno = 0; + error = xfs_imap(mp, tp, ino, &imap, imap_flags); + if (error) + return error; + + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + if (error) + return error; + + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + *bpp = bp; + *offset = imap.im_boffset; + return 0; +} + + +/* + * This routine is called to map an inode to the buffer containing + * the on-disk version of the inode. It returns a pointer to the + * buffer containing the on-disk inode in the bpp parameter, and in + * the dip parameter it returns a pointer to the on-disk inode within + * that buffer. + * + * If a non-zero error is returned, then the contents of bpp and + * dipp are undefined. + * + * The inode is expected to already been mapped to its buffer and read + * in once, thus we can use the mapping information stored in the inode + * rather than calling xfs_imap(). This allows us to avoid the overhead + * of looking at the inode btree for small block file systems + * (see xfs_imap()). + */ +int +xfs_itobp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dinode_t **dipp, + xfs_buf_t **bpp, + uint buf_flags) +{ + xfs_buf_t *bp; + int error; + + ASSERT(ip->i_imap.im_blkno != 0); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); + if (error) + return error; + + if (!bp) { + ASSERT(buf_flags & XBF_TRYLOCK); + ASSERT(tp == NULL); + *bpp = NULL; + return EAGAIN; + } + + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + *bpp = bp; + return 0; +} + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. 
For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +STATIC int +xfs_iformat( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error; + xfs_fsize_t di_size; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + error = 0; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ip->i_d.di_size = 0; + ip->i_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + ip->i_afp->if_ext_max = + 
XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = XFS_ERROR(EFSCORRUPTED); + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. 
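+ *
+ * (For scale: each on-disk extent record is 16 bytes -- two __be64
+ * words -- so e.g. a hypothetical fork holding 3 extents needs
+ * size = 48 bytes, which must fit within XFS_DFORK_SIZE() below.)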
+ */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); + for (i = 0; i < nex; i++, dp++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max + || XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) + || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. 
+ */ + xfs_bmdr_to_bmbt(ip->i_mount, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +STATIC void +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_t *from) +{ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); + to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); +} + +void +xfs_dinode_to_disk( + xfs_dinode_t *to, + xfs_icdinode_t *from) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = cpu_to_be16(from->di_flushiter); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); +} + +STATIC uint +_xfs_dic2xflags( + __uint16_t di_flags) +{ + uint flags = 0; + + if (di_flags & XFS_DIFLAG_ANY) { + if (di_flags & XFS_DIFLAG_REALTIME) + flags |= XFS_XFLAG_REALTIME; + if (di_flags & XFS_DIFLAG_PREALLOC) + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; + if 
(di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= XFS_XFLAG_SYNC; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= XFS_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= XFS_XFLAG_NODUMP; + if (di_flags & XFS_DIFLAG_RTINHERIT) + flags |= XFS_XFLAG_RTINHERIT; + if (di_flags & XFS_DIFLAG_PROJINHERIT) + flags |= XFS_XFLAG_PROJINHERIT; + if (di_flags & XFS_DIFLAG_NOSYMLINKS) + flags |= XFS_XFLAG_NOSYMLINKS; + if (di_flags & XFS_DIFLAG_EXTSIZE) + flags |= XFS_XFLAG_EXTSIZE; + if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + flags |= XFS_XFLAG_EXTSZINHERIT; + if (di_flags & XFS_DIFLAG_NODEFRAG) + flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; + } + + return flags; +} + +uint +xfs_ip2xflags( + xfs_inode_t *ip) +{ + xfs_icdinode_t *dic = &ip->i_d; + + return _xfs_dic2xflags(dic->di_flags) | + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); +} + +uint +xfs_dic2xflags( + xfs_dinode_t *dip) +{ + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); +} + +/* + * Read the disk inode attributes into the in-core inode structure. + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + uint iget_flags) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + int error; + + /* + * Fill in the location information in the in-core inode. + */ + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) + return error; + + /* + * Get pointers to the on-disk inode and the buffer containing it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, + XBF_LOCK, iget_flags); + if (error) + return error; + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + /* + * If we got something that isn't an inode it means someone + * (nfs or dmi) has a stale handle. + */ + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { +#ifdef DEBUG + xfs_alert(mp, + "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", + __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); +#endif /* DEBUG */ + error = XFS_ERROR(EINVAL); + goto out_brelse; + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { + xfs_dinode_from_disk(&ip->i_d, dip); + error = xfs_iformat(ip, dip); + if (error) { +#ifdef DEBUG + xfs_alert(mp, "%s: xfs_iformat() returned error %d", + __func__, error); +#endif /* DEBUG */ + goto out_brelse; + } + } else { + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + /* + * Initialize the per-fork minima and maxima for a new + * inode here. xfs_iformat will do it for old inodes. + */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + } + + /* + * The inode format changed when we moved the link count and + * made it 32 bits long. If this is an old format inode, + * convert it in memory to look like a new one. 
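+ * (A version 1 inode only carries the 16 bit di_onlink count; the
+ * version 2 layout widened it to the 32 bit di_nlink and added the
+ * project id fields, which is what the conversion below fills in.)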
If it gets + * flushed to disk we will convert back before flushing or + * logging it. We zero out the new projid field and the old link + * count field. We'll handle clearing the pad field (the remains + * of the old uuid field) when we actually convert the inode to + * the new format. We don't change the version number so that we + * can distinguish this from a real new format inode. + */ + if (ip->i_d.di_version == 1) { + ip->i_d.di_nlink = ip->i_d.di_onlink; + ip->i_d.di_onlink = 0; + xfs_set_projid(ip, 0); + } + + ip->i_delayed_blks = 0; + ip->i_size = ip->i_d.di_size; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + xfs_buf_set_ref(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the + * on-disk inode, because it was acquired with xfs_trans_read_buf() + * in xfs_itobp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be + * locking the new in-core inode before putting it in the hash + * table where other processes can find it. Thus we don't have + * to worry about the inode being changed just because we released + * the buffer. + */ + out_brelse: + xfs_trans_brelse(tp, bp); + return error; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. + */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + xfs_extnum_t nextents; + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, whichfork); + + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + return 0; +} + +/* + * Allocate an inode on disk and return a copy of its in-core version. + * The in-core inode is locked exclusively. Set mode, nlink, and rdev + * appropriately within the inode. The uid and gid for the inode are + * set according to the contents of the given cred structure. + * + * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() + * has a free inode available, call xfs_iget() + * to obtain the in-core version of the allocated inode. Finally, + * fill in the inode and log its initial contents. In this case, + * ialloc_context would be set to NULL and call_again set to false. + * + * If xfs_dialloc() does not have an available inode, + * it will replenish its supply by doing an allocation. Since we can + * only do one allocation within a transaction without deadlocks, we + * must commit the current transaction before returning the inode itself. + * In this case, therefore, we will set call_again to true and return. 
+ * The caller should then commit the current transaction, start a new + * transaction, and call xfs_ialloc() again to actually get the inode. + * + * To ensure that some other process does not grab the inode that + * was allocated during the first call to xfs_ialloc(), this routine + * also returns the [locked] bp pointing to the head of the freelist + * as ialloc_context. The caller should hold this buffer across + * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. + */ +int +xfs_ialloc( + xfs_trans_t *tp, + xfs_inode_t *pip, + mode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, + int okalloc, + xfs_buf_t **ialloc_context, + boolean_t *call_again, + xfs_inode_t **ipp) +{ + xfs_ino_t ino; + xfs_inode_t *ip; + uint flags; + int error; + timespec_t tv; + int filestreams = 0; + + /* + * Call the space management code to pick + * the on-disk inode to be allocated. + */ + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + ialloc_context, call_again, &ino); + if (error) + return error; + if (*call_again || ino == NULLFSINO) { + *ipp = NULL; + return 0; + } + ASSERT(*ialloc_context == NULL); + + /* + * Get the in-core inode with the lock held exclusively. + * This is because we're setting fields here we need + * to prevent others from looking at until we're done. + */ + error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + XFS_ILOCK_EXCL, &ip); + if (error) + return error; + ASSERT(ip != NULL); + + ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_onlink = 0; + ip->i_d.di_nlink = nlink; + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = current_fsuid(); + ip->i_d.di_gid = current_fsgid(); + xfs_set_projid(ip, prid); + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + + /* + * If the superblock version is up to where we support new format + * inodes and this is currently an old format inode, then change + * the inode version number now. This way we only do the conversion + * here rather than here and in the flush/logging code. + */ + if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && + ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + /* + * We've already zeroed the old link count, the projid field, + * and the pad field. + */ + } + + /* + * Project ids won't be stored on disk if we are using a version 1 inode. + */ + if ((prid != 0) && (ip->i_d.di_version == 1)) + xfs_bump_ino_vers2(tp, ip); + + if (pip && XFS_INHERIT_GID(pip)) { + ip->i_d.di_gid = pip->i_d.di_gid; + if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + ip->i_d.di_mode |= S_ISGID; + } + } + + /* + * If the group ID of the new file does not match the effective group + * ID or one of the supplementary group IDs, the S_ISGID bit is cleared + * (and only if the irix_sgid_inherit compatibility variable is set). 
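+ * (in_group_p() below checks the new group against the caller's fsgid
+ * and supplementary groups; irix_sgid_inherit is the XFS sysctl that
+ * enables this behaviour.)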
+ */ + if ((irix_sgid_inherit) && + (ip->i_d.di_mode & S_ISGID) && + (!in_group_p((gid_t)ip->i_d.di_gid))) { + ip->i_d.di_mode &= ~S_ISGID; + } + + ip->i_d.di_size = 0; + ip->i_size = 0; + ip->i_d.di_nextents = 0; + ASSERT(ip->i_d.di_nblocks == 0); + + nanotime(&tv); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + + /* + * di_gen will have been taken care of in xfs_iread. + */ + ip->i_d.di_extsize = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_u2.if_rdev = rdev; + ip->i_df.if_flags = 0; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + /* + * we can't set up filestreams until after the VFS inode + * is set up properly. + */ + if (pip && xfs_inode_is_filestream(pip)) + filestreams = 1; + /* fall through */ + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + uint di_flags = 0; + + if ((mode & S_IFMT) == S_IFDIR) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } else if ((mode & S_IFMT) == S_IFREG) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + ip->i_d.di_flags |= di_flags; + } + /* FALLTHROUGH */ + case S_IFLNK: + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_flags = XFS_IFEXTENTS; + ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; + ip->i_df.if_u1.if_extents = NULL; + break; + default: + ASSERT(0); + } + /* + * Attribute fork settings for new inode. + */ + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_anextents = 0; + + /* + * Log the new values stuffed into the inode. + */ + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, flags); + + /* now that we have an i_mode we can setup inode ops and unlock */ + xfs_setup_inode(ip); + + /* now we have set up the vfs inode we can associate the filestream */ + if (filestreams) { + error = xfs_filestream_associate(pip, ip); + if (error < 0) + return -error; + if (!error) + xfs_iflags_set(ip, XFS_IFILESTREAM); + } + + *ipp = ip; + return 0; +} + +/* + * Check to make sure that there are no blocks allocated to the + * file beyond the size of the file. We don't check this for + * files with fixed size extents or real time extents, but we + * at least do it for regular files. 
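+ * This is a DEBUG-only sanity check: it maps the range beyond the new
+ * size with xfs_bmapi() and asserts that it comes back as a single
+ * hole.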
+ */ +#ifdef DEBUG +void +xfs_isize_check( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_fsize_t isize) +{ + xfs_fileoff_t map_first; + int nimaps; + xfs_bmbt_irec_t imaps[2]; + + if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + return; + + if (XFS_IS_REALTIME_INODE(ip)) + return; + + if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + return; + + nimaps = 2; + map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + /* + * The filesystem could be shutting down, so bmapi may return + * an error. + */ + if (xfs_bmapi(NULL, ip, map_first, + (XFS_B_TO_FSB(mp, + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - + map_first), + XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, + NULL)) + return; + ASSERT(nimaps == 1); + ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); +} +#endif /* DEBUG */ + +/* + * Calculate the last possible buffered byte in a file. This must + * include data that was buffered beyond the EOF by the write code. + * This also needs to deal with overflowing the xfs_fsize_t type + * which can happen for sizes near the limit. + * + * We also need to take into account any blocks beyond the EOF. It + * may be the case that they were buffered by a write which failed. + * In that case the pages will still be in memory, but the inode size + * will never have been updated. + */ +STATIC xfs_fsize_t +xfs_file_last_byte( + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_fsize_t last_byte; + xfs_fileoff_t last_block; + xfs_fileoff_t size_last_block; + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); + + mp = ip->i_mount; + /* + * Only check for blocks beyond the EOF if the extents have + * been read in. This eliminates the need for the inode lock, + * and it also saves us from looking when it really isn't + * necessary. + */ + if (ip->i_df.if_flags & XFS_IFEXTENTS) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) { + last_block = 0; + } + } else { + last_block = 0; + } + size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); + last_block = XFS_FILEOFF_MAX(last_block, size_last_block); + + last_byte = XFS_FSB_TO_B(mp, last_block); + if (last_byte < 0) { + return XFS_MAXIOFFSET(mp); + } + last_byte += (1 << mp->m_writeio_log); + if (last_byte < 0) { + return XFS_MAXIOFFSET(mp); + } + return last_byte; +} + +/* + * Start the truncation of the file to new_size. The new size + * must be smaller than the current size. This routine will + * clear the buffer and page caches of file data in the removed + * range, and xfs_itruncate_finish() will remove the underlying + * disk blocks. + * + * The inode must have its I/O lock locked EXCLUSIVELY, and it + * must NOT have the inode lock held at all. This is because we're + * calling into the buffer/page cache code and we can't hold the + * inode lock when we do so. + * + * We need to wait for any direct I/Os in flight to complete before we + * proceed with the truncate. This is needed to prevent the extents + * being read or written by the direct I/Os from being removed while the + * I/O is in flight as there is no other method of synchronising + * direct I/O with the truncate operation. Also, because we hold + * the IOLOCK in exclusive mode, we prevent new direct I/Os from being + * started until the truncate completes and drops the lock. Essentially, + * the xfs_ioend_wait() call forms an I/O barrier that provides strict + * ordering between direct I/Os and the truncate operation. 
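+ *
+ * A typical caller therefore looks roughly like this (a sketch only,
+ * with locking and transaction setup elided, not a verbatim caller):
+ *
+ *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ *	error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, new_size);
+ *	... reserve a permanent transaction, take XFS_ILOCK_EXCL ...
+ *	error = xfs_itruncate_finish(&tp, ip, new_size, XFS_DATA_FORK, sync);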
+ * + * The flags parameter can have either the value XFS_ITRUNC_DEFINITE + * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used + * in the case that the caller is locking things out of order and + * may not be able to call xfs_itruncate_finish() with the inode lock + * held without dropping the I/O lock. If the caller must drop the + * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() + * must be called again with all the same restrictions as the initial + * call. + */ +int +xfs_itruncate_start( + xfs_inode_t *ip, + uint flags, + xfs_fsize_t new_size) +{ + xfs_fsize_t last_byte; + xfs_off_t toss_start; + xfs_mount_t *mp; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); + ASSERT((flags == XFS_ITRUNC_DEFINITE) || + (flags == XFS_ITRUNC_MAYBE)); + + mp = ip->i_mount; + + /* wait for the completion of any pending DIOs */ + if (new_size == 0 || new_size < ip->i_size) + xfs_ioend_wait(ip); + + /* + * Call toss_pages or flushinval_pages to get rid of pages + * overlapping the region being removed. We have to use + * the less efficient flushinval_pages in the case that the + * caller may not be able to finish the truncate without + * dropping the inode's I/O lock. Make sure + * to catch any pages brought in by buffers overlapping + * the EOF by searching out beyond the isize by our + * block size. We round new_size up to a block boundary + * so that we don't toss things on the same block as + * new_size but before it. + * + * Before calling toss_page or flushinval_pages, make sure to + * call remapf() over the same region if the file is mapped. + * This frees up mapped file references to the pages in the + * given range and for the flushinval_pages case it ensures + * that we get the latest mapped changes flushed out. + */ + toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + toss_start = XFS_FSB_TO_B(mp, toss_start); + if (toss_start < 0) { + /* + * The place to start tossing is beyond our maximum + * file size, so there is no way that the data extended + * out there. + */ + return 0; + } + last_byte = xfs_file_last_byte(ip); + trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); + if (last_byte > toss_start) { + if (flags & XFS_ITRUNC_DEFINITE) { + xfs_tosspages(ip, toss_start, + -1, FI_REMAPF_LOCKED); + } else { + error = xfs_flushinval_pages(ip, toss_start, + -1, FI_REMAPF_LOCKED); + } + } + +#ifdef DEBUG + if (new_size == 0) { + ASSERT(VN_CACHED(VFS_I(ip)) == 0); + } +#endif + return error; +} + +/* + * Shrink the file to the given new_size. The new size must be smaller than + * the current size. This will free up the underlying blocks in the removed + * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). + * + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. 
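+ *
+ * Internally this routine rolls the transaction: each pass through the
+ * loop below calls xfs_bunmapi()/xfs_bmap_finish(), then duplicates the
+ * transaction with xfs_trans_dup(), commits the old one and re-reserves
+ * log space with xfs_trans_reserve(), so the caller always gets a live
+ * transaction back in *tp.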
+ * + * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it + * indicates the fork which is to be truncated. For the attribute fork we only + * support truncation to size 0. + * + * We use the sync parameter to indicate whether or not the first transaction + * we perform might have to be synchronous. For the attr fork, it needs to be + * so if the unlink of the inode is not yet known to be permanent in the log. + * This keeps us from freeing and reusing the blocks of the attribute fork + * before the unlink of the inode becomes permanent. + * + * For the data fork, we normally have to run synchronously if we're being + * called out of the inactive path or we're being called out of the create path + * where we're truncating an existing file. Either way, the truncate needs to + * be sync so blocks don't reappear in the file with altered data in case of a + * crash. wsync filesystems can run the first case async because anything that + * shrinks the inode has to run sync so by the time we're called here from + * inactive, the inode size is permanently set to 0. + * + * Calls from the truncate path always need to be sync unless we're in a wsync + * filesystem and the file has already been unlinked. + * + * The caller is responsible for correctly setting the sync parameter. It gets + * too hard for us to guess here which path we're being called out of just + * based on inode state. + * + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. + */ +int +xfs_itruncate_finish( + xfs_trans_t **tp, + xfs_inode_t *ip, + xfs_fsize_t new_size, + int fork, + int sync) +{ + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len=0; + xfs_mount_t *mp; + xfs_trans_t *ntp; + int done; + int committed; + xfs_bmap_free_t free_list; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); + ASSERT(*tp != NULL); + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(ip->i_transp == *tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(ip->i_itemp->ili_lock_flags == 0); + + + ntp = *tp; + mp = (ntp)->t_mountp; + ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); + + /* + * We only support truncating the entire attribute fork. + */ + if (fork == XFS_ATTR_FORK) { + new_size = 0LL; + } + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + trace_xfs_itruncate_finish_start(ip, new_size); + + /* + * The first thing we do is set the size to new_size permanently + * on disk. This way we don't have to worry about anyone ever + * being able to look at the data being freed even in the face + * of a crash. What we're getting around here is the case where + * we free a block, it is allocated to another file, it is written + * to, and then we crash. If the new data gets written to the + * file but the log buffers containing the free and reallocation + * don't, then we'd end up with garbage in the blocks being freed. + * As long as we make the new_size permanent before actually + * freeing any blocks it doesn't matter if they get written to. + * + * The callers must signal into us whether or not the size + * setting here must be synchronous. 
There are a few cases + * where it doesn't have to be synchronous. Those cases + * occur if the file is unlinked and we know the unlink is + * permanent or if the blocks being truncated are guaranteed + * to be beyond the inode eof (regardless of the link count) + * and the eof value is permanent. Both of these cases occur + * only on wsync-mounted filesystems. In those cases, we're + * guaranteed that no user will ever see the data in the blocks + * that are being truncated so the truncate can run async. + * In the free beyond eof case, the file may wind up with + * more blocks allocated to it than it needs if we crash + * and that won't get fixed until the next time the file + * is re-opened and closed but that's ok as that shouldn't + * be too many blocks. + * + * However, we can't just make all wsync xactions run async + * because there's one call out of the create path that needs + * to run sync where it's truncating an existing file to size + * 0 whose size is > 0. + * + * It's probably possible to come up with a test in this + * routine that would correctly distinguish all the above + * cases from the values of the function parameters and the + * inode state but for sanity's sake, I've decided to let the + * layers above just tell us. It's simpler to correctly figure + * out in the layer above exactly under what conditions we + * can run async and I think it's easier for others read and + * follow the logic in case something has to be changed. + * cscope is your friend -- rcc. + * + * The attribute fork is much simpler. + * + * For the attribute fork we allow the caller to tell us whether + * the unlink of the inode that led to this call is yet permanent + * in the on disk log. If it is not and we will be freeing extents + * in this inode then we make the first transaction synchronous + * to make sure that the unlink is permanent by the time we free + * the blocks. + */ + if (fork == XFS_DATA_FORK) { + if (ip->i_d.di_nextents > 0) { + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } + } + } + + /* + * Since it is possible for space to become allocated beyond + * the end of the file (in a crash where the space is allocated + * but the inode size is not yet updated), simply remove any + * blocks which show up between the new EOF and the maximum + * possible file size. If the first block to be removed is + * beyond the maximum file size (ie it is the same as last_block), + * then there is nothing to do. + */ + last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + ASSERT(first_unmap_block <= last_block); + done = 0; + if (last_block == first_unmap_block) { + done = 1; + } else { + unmap_len = last_block - first_unmap_block + 1; + } + while (!done) { + /* + * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() + * will tell us whether it freed the entire range or + * not. If this is a synchronous mount (wsync), + * then we can tell bunmapi to keep all the + * transactions asynchronous since the unlink + * transaction that made this inode inactive has + * already hit the disk. 
There's no danger of + * the freed blocks being reused, there being a + * crash, and the reused blocks suddenly reappearing + * in this file with garbage in them once recovery + * runs. + */ + xfs_bmap_init(&free_list, &first_block); + error = xfs_bunmapi(ntp, ip, + first_unmap_block, unmap_len, + xfs_bmapi_aflag(fork), + XFS_ITRUNC_MAX_EXTENTS, + &first_block, &free_list, + &done); + if (error) { + /* + * If the bunmapi call encounters an error, + * return to the caller where the transaction + * can be properly aborted. We just need to + * make sure we're not holding any resources + * that we were not when we came in. + */ + xfs_bmap_cancel(&free_list); + return error; + } + + /* + * Duplicate the transaction that has the permanent + * reservation and commit the old transaction. + */ + error = xfs_bmap_finish(tp, &free_list, &committed); + ntp = *tp; + if (committed) + xfs_trans_ijoin(ntp, ip); + + if (error) { + /* + * If the bmap finish call encounters an error, return + * to the caller where the transaction can be properly + * aborted. We just need to make sure we're not + * holding any resources that we were not when we came + * in. + * + * Aborting from this point might lose some blocks in + * the file system, but oh well. + */ + xfs_bmap_cancel(&free_list); + return error; + } + + if (committed) { + /* + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. + */ + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } + + ntp = xfs_trans_dup(ntp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + + xfs_trans_ijoin(ntp, ip); + + if (error) + return error; + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + error = xfs_trans_reserve(ntp, 0, + XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) + return error; + } + /* + * Only update the size in the case of the data fork, but + * always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + if (fork == XFS_DATA_FORK) { + xfs_isize_check(mp, ip, new_size); + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } + } + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + ASSERT((new_size != 0) || + (fork == XFS_ATTR_FORK) || + (ip->i_delayed_blks == 0)); + ASSERT((new_size != 0) || + (fork == XFS_ATTR_FORK) || + (ip->i_d.di_nextents == 0)); + trace_xfs_itruncate_finish_end(ip, new_size); + return 0; +} + +/* + * This is called when the inode's link count goes to 0. + * We place the on-disk inode on a list in the AGI. It + * will be pulled from this list when the inode is freed. + */ +int +xfs_iunlink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agino_t agino; + short bucket_index; + int offset; + int error; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + ASSERT(ip->i_transp == tp); + + mp = tp->t_mountp; + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. 
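+ * The AGI carries an array of XFS_AGI_UNLINKED_BUCKETS list heads; the
+ * bucket is chosen as agino % XFS_AGI_UNLINKED_BUCKETS below, and the
+ * chain itself runs through each inode's on-disk di_next_unlinked
+ * field.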
+ */ + error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + if (error) + return error; + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index]); + ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { + /* + * There is already another inode in the bucket we need + * to add ourselves to. Add us at the front of the list. + * Here we put the head pointer into our next pointer, + * and then we fall through to point the head at us. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) + return error; + + ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); + /* both on-disk, don't endian flip twice */ + dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } + + /* + * Point the bucket head pointer at the inode being inserted. + */ + ASSERT(agino != 0); + agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + return 0; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_ino_t next_ino; + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agnumber_t agno; + xfs_agino_t agino; + xfs_agino_t next_agino; + xfs_buf_t *last_ibp; + xfs_dinode_t *last_dip = NULL; + short bucket_index; + int offset, last_offset = 0; + int error; + + mp = tp->t_mountp; + agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); + ASSERT(agi->agi_unlinked[bucket_index]); + + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { + /* + * We're at the head of the list. Get the inode's + * on-disk buffer to see if there is anyone after us + * on the list. Only modify our next pointer if it + * is not already NULLAGINO. This saves us the overhead + * of dealing with the buffer when there is no need to + * change it. 
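+ * (The list is singly linked on disk, so when we are not at the head
+ * the else branch below has to walk the chain with xfs_inotobp() to
+ * find our predecessor.)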
+ */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) { + xfs_warn(mp, "%s: xfs_itobp() returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the bucket head pointer at the next inode. + */ + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + } else { + /* + * We need to search the list for the inode being freed. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + last_ibp = NULL; + while (next_agino != agino) { + /* + * If the last inode wasn't the one pointing to + * us, then release its buffer since we're not + * going to do anything with it. + */ + if (last_ibp != NULL) { + xfs_trans_brelse(tp, last_ibp); + } + next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + error = xfs_inotobp(mp, tp, next_ino, &last_dip, + &last_ibp, &last_offset, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_inotobp() returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(last_dip->di_next_unlinked); + ASSERT(next_agino != NULLAGINO); + ASSERT(next_agino != 0); + } + /* + * Now last_ibp points to the buffer previous to us on + * the unlinked list. Pull us from the list. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) { + xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the previous inode on the list to the next inode. + */ + last_dip->di_next_unlinked = cpu_to_be32(next_agino); + ASSERT(next_agino != 0); + offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, last_ibp); + xfs_trans_log_buf(tp, last_ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, last_ibp); + } + return 0; +} + +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. 
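+ * Otherwise a later flush of an overlooked incore inode could write its
+ * now-stale contents over the freed (and possibly already reallocated)
+ * cluster on disk.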
+ */ +STATIC void +xfs_ifree_cluster( + xfs_inode_t *free_ip, + xfs_trans_t *tp, + xfs_ino_t inum) +{ + xfs_mount_t *mp = free_ip->i_mount; + int blks_per_cluster; + int nbufs; + int ninodes; + int i, j; + xfs_daddr_t blkno; + xfs_buf_t *bp; + xfs_inode_t *ip; + xfs_inode_log_item_t *iip; + xfs_log_item_t *lip; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + ninodes = mp->m_sb.sb_inopblock; + nbufs = XFS_IALLOC_BLOCKS(mp); + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; + } + + for (j = 0; j < nbufs; j++, inum += ninodes) { + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), + XFS_INO_TO_AGBNO(mp, inum)); + + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_LOCK); + + /* + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. + */ + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); + } + lip = lip->li_bio_list; + } + + + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. + * + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. + */ + for (i = 0; i < ninodes; i++) { +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, (inum + i))); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + continue; + } + + /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. 
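+ * (We cannot simply block in xfs_ilock() here because we already hold
+ * the cluster buffer locked, so we drop the RCU read lock, delay(1)
+ * and retry the lookup instead.)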
+ */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ + iip = ip->i_itemp; + if (!iip || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + ip->i_update_core = 0; + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); + + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_binval(tp, bp); + } + + xfs_perag_put(pag); +} + +/* + * This is called to return an inode to the inode free list. + * The inode should already be truncated to 0 length and have + * no pages associated with it. This routine also assumes that + * the inode is already a part of the transaction. + * + * The on-disk copy of the inode will have been added to the list + * of unlinked inodes in the AGI. We need to remove the inode from + * that list atomically with respect to freeing it here. + */ +int +xfs_ifree( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_bmap_free_t *flist) +{ + int error; + int delete; + xfs_ino_t first_ino; + xfs_dinode_t *dip; + xfs_buf_t *ibp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || + ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + ASSERT(ip->i_d.di_nblocks == 0); + + /* + * Pull the on-disk inode from the AGI unlinked list. + */ + error = xfs_iunlink_remove(tp, ip); + if (error != 0) { + return error; + } + + error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + if (error != 0) { + return error; + } + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + ip->i_d.di_gen++; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) + return error; + + /* + * Clear the on-disk di_mode. This is to prevent xfs_bulkstat + * from picking up this inode when it is reclaimed (its incore state + * initialzed but not flushed to disk yet). The in-core di_mode is + * already cleared and a corresponding transaction logged. + * The hack here just synchronizes the in-core to on-disk + * di_mode value in advance before the actual inode sync to disk. + * This is OK because the inode is already unlinked and would never + * change its di_mode again for this inode generation. + * This is a temporary hack that would require a proper fix + * in the future. + */ + dip->di_mode = 0; + + if (delete) { + xfs_ifree_cluster(ip, tp, first_ino); + } + + return 0; +} + +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. 
Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we adding records one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + int cur_max; + xfs_ifork_t *ifp; + struct xfs_btree_block *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, + (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + KM_SLEEP | KM_NOFS); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(ifp->if_broot_bytes <= + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. + */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. 
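+ * (The pointer array floats with the size of the root block, which is
+ * why XFS_BMAP_BROOT_PTR_ADDR() is given the old and the new byte
+ * counts below.)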
+ */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + } + kmem_free(ifp->if_broot); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + ASSERT(ifp->if_broot_bytes <= + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + return; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. + */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP | KM_NOFS); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. 
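+ * (Data still held in the inline if_inline_data/if_inline_ext buffers
+ * was never allocated separately, so only heap copies are freed here.)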
+ */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + ((ifp->if_flags & XFS_IFEXTIREC) || + ((ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { + ASSERT(ifp->if_real_bytes != 0); + xfs_iext_destroy(ifp); + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. + */ +static void +xfs_iunpin_nowait( + struct xfs_inode *ip) +{ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + + /* Give the log a push to start the unpinning I/O */ + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); + +} + +void +xfs_iunpin_wait( + struct xfs_inode *ip) +{ + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } +} + +/* + * xfs_iextents_copy() + * + * This is called to copy the REAL extents (as opposed to the delayed + * allocation extents) from the inode into the given buffer. It + * returns the number of bytes copied into the buffer. + * + * If there are no delayed allocation extents, then we can just + * memcpy() the extents into the buffer. Otherwise, we need to + * examine each extent in turn and skip those which are delayed. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *dp, + int whichfork) +{ + int copied; + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + copied = 0; + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + start_block = xfs_bmbt_get_startblock(ep); + if (isnullstartblock(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + continue; + } + + /* Translate to on disk format */ + put_unaligned(cpu_to_be64(ep->l0), &dp->l0); + put_unaligned(cpu_to_be64(ep->l1), &dp->l1); + dp++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. 
+ * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +/*ARGSUSED*/ +STATIC void +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork, + xfs_buf_t *bp) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; +#ifdef XFS_TRANS_DEBUG + int first; +#endif + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) + return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); + return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_format.ilf_fields & extflag[whichfork])); + if ((iip->ili_format.ilf_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes <= + (XFS_IFORK_SIZE(ip, whichfork) + + XFS_BROOT_SIZE_ADJ)); + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } +} + +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + unsigned long inodes_per_cluster; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); + if (!ilist) + goto out_put; + + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the 
lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + +out_free: + rcu_read_unlock(); + kmem_free(ilist); +out_put: + xfs_perag_put(pag); + return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + rcu_read_unlock(); + /* + * Clean up the buffer. If it was B_DELWRI, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (XFS_BUF_IODONE_FUNC(bp)) { + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_ERROR(bp,EIO); + xfs_buf_ioend(bp, 0); + } else { + XFS_BUF_STALE(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq); + kmem_free(ilist); + xfs_perag_put(pag); + return XFS_ERROR(EFSCORRUPTED); +} + +/* + * xfs_iflush() will write a modified inode's changes out to the + * inode's on disk home. The caller must have the inode lock held + * in at least shared mode and the inode flush completion must be + * active as well. The inode lock will still be held upon return from + * the call and the caller is free to unlock it. + * The inode flush will be completed when the inode reaches the disk. + * The flags indicate how the inode's buffer should be written out. + */ +int +xfs_iflush( + xfs_inode_t *ip, + uint flags) +{ + xfs_inode_log_item_t *iip; + xfs_buf_t *bp; + xfs_dinode_t *dip; + xfs_mount_t *mp; + int error; + + XFS_STATS_INC(xs_iflush_count); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > ip->i_df.if_ext_max); + + iip = ip->i_itemp; + mp = ip->i_mount; + + /* + * We can't flush the inode until it is unpinned, so wait for it if we + * are allowed to block. 
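The clustering loop in xfs_iflush_cluster() above is deliberately opportunistic: every neighbouring inode is taken with trylock only, and anything busy, pinned or clean is skipped rather than waited on, so piggy-backing extra inodes onto the write can never stall it. A userspace analogue of that pattern, with a single pthread mutex standing in for the separate inode and flush locks, might look like the sketch below; the lock handling is simplified relative to the kernel's.

#include <pthread.h>
#include <stdbool.h>

/* Minimal stand-in for a cached object that may be flushed opportunistically. */
struct cached_inode {
	pthread_mutex_t	lock;
	bool		dirty;
	int		pin_count;
};

/*
 * Try to piggy-back neighbours onto a write already being issued: take each
 * lock only with trylock, skip anything busy, pinned or clean, and never
 * block.  Returns how many neighbours were flushed.
 */
static int flush_neighbours(struct cached_inode **list, int n,
			    int (*flush)(struct cached_inode *))
{
	int i, flushed = 0;

	for (i = 0; i < n; i++) {
		struct cached_inode *iq = list[i];

		if (pthread_mutex_trylock(&iq->lock))
			continue;		/* busy: skip, do not wait */
		if (iq->dirty && iq->pin_count == 0 && flush(iq) == 0)
			flushed++;
		pthread_mutex_unlock(&iq->lock);
	}
	return flushed;
}

The primary inode is already committed to being written when this runs; the neighbours are strictly best-effort, which is why failing to lock one is never an error.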
We know no one new can pin it, because we are + * holding the inode lock shared and you need to hold it exclusively to + * pin the inode. + * + * If we are not allowed to block, force the log out asynchronously so + * that when we come back the inode will be unpinned. If other inodes + * in the same cluster are dirty, they will probably write the inode + * out for us if they occur after the log force completes. + */ + if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + xfs_ifunlock(ip); + return EAGAIN; + } + xfs_iunpin_wait(ip); + + /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_itobp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. + */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk! + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + ip->i_update_core = 0; + if (iip) + iip->ili_format.ilf_fields = 0; + xfs_ifunlock(ip); + return XFS_ERROR(EIO); + } + + /* + * Get the buffer containing the on-disk inode. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &bp, + (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + if (error || !bp) { + xfs_ifunlock(ip); + return error; + } + + /* + * First flush out the inode that xfs_iflush was called with. + */ + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; + + /* + * If the buffer is pinned then push on the log now so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; + + if (flags & SYNC_WAIT) + error = xfs_bwrite(mp, bp); + else + xfs_bdwrite(mp, bp); + return error; + +corrupt_out: + xfs_buf_relse(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +cluster_corrupt_out: + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(ip); + return XFS_ERROR(EFSCORRUPTED); +} + + +STATIC int +xfs_iflush_int( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_inode_log_item_t *iip; + xfs_dinode_t *dip; + xfs_mount_t *mp; +#ifdef XFS_TRANS_DEBUG + int first; +#endif + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > ip->i_df.if_ext_max); + + iip = ip->i_itemp; + mp = ip->i_mount; + + /* set *dip = inode's place in the buffer */ + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + /* + * Clear i_update_core before copying out the data. + * This is for coordination with our timestamp updates + * that don't hold the inode lock. They will always + * update the timestamps BEFORE setting i_update_core, + * so if we clear i_update_core after they set it we + * are guaranteed to see their updates to the timestamps. + * I believe that this depends on strongly ordered memory + * semantics, but we have that. We use the SYNCHRONIZE + * macro to make sure that the compiler does not reorder + * the i_update_core access below the data copy below. 
+ */ + ip->i_update_core = 0; + SYNCHRONIZE(); + + /* + * Make sure to get the latest timestamps from the Linux inode. + */ + xfs_synchronize_times(ip); + + if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", + __func__, ip->i_ino, ip, ip->i_d.di_magic); + goto corrupt_out; + } + if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad regular inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad directory inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } + if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, + XFS_RANDOM_IFLUSH_5)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: detected corrupt incore inode %Lu, " + "total extents = %d, nblocks = %Ld, ptr 0x%p", + __func__, ip->i_ino, + ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_d.di_nblocks, ip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, + mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + __func__, ip->i_ino, ip->i_d.di_forkoff, ip); + goto corrupt_out; + } + /* + * bump the flush iteration count, used to detect flushes which + * postdate a log record during recovery. + */ + + ip->i_d.di_flushiter++; + + /* + * Copy the dirty parts of the inode into the on-disk + * inode. We always copy out the core of the inode, + * because if the inode is dirty at all the core must + * be. + */ + xfs_dinode_to_disk(dip, &ip->i_d); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + + /* + * If this is really an old format inode and the superblock version + * has not been updated to support only new format inodes, then + * convert back to the old inode format. If the superblock version + * has been updated, then make the conversion permanent. + */ + ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); + if (ip->i_d.di_version == 1) { + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. 
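The i_update_core handling near the top of xfs_iflush_int() above depends on store ordering: the flag is cleared before the incore data is copied out, while lock-free updaters store their timestamps before setting the flag, so a racing update either lands in this copy or leaves the flag set for a later flush. The sketch below is a rough userspace model of that discipline using C11 atomics, with an explicit fence playing the role the SYNCHRONIZE() barrier has above; it illustrates the ordering argument, not the kernel's actual barrier implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct shadow_inode {
	_Atomic uint64_t mtime_ns;	/* payload updated outside the lock */
	atomic_bool	 update_core;	/* "payload is dirty" flag */
};

/* Updater: publish the new timestamp first, then raise the dirty flag. */
static void touch_mtime(struct shadow_inode *ip, uint64_t now_ns)
{
	atomic_store_explicit(&ip->mtime_ns, now_ns, memory_order_relaxed);
	atomic_store_explicit(&ip->update_core, true, memory_order_release);
}

/* Flusher: clear the flag, fence, then snapshot the payload. */
static uint64_t flush_core(struct shadow_inode *ip)
{
	atomic_store_explicit(&ip->update_core, false, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ SYNCHRONIZE() */
	return atomic_load_explicit(&ip->mtime_ns, memory_order_relaxed);
}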
+ */ + ip->i_d.di_version = 2; + dip->di_version = 2; + ip->i_d.di_onlink = 0; + dip->di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + memset(&(dip->di_pad[0]), 0, + sizeof(dip->di_pad)); + ASSERT(xfs_get_projid(ip) == 0); + } + } + + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + xfs_inobp_check(mp, bp); + + /* + * We've recorded everything logged in the inode, so we'd + * like to clear the ilf_fields bits so we don't log and + * flush things unnecessarily. However, we can't stop + * logging all this information until the data we've copied + * into the disk buffer is written to disk. If we did we might + * overwrite the copy of the inode in the log with all the + * data after re-logging only part of it, and in the face of + * a crash we wouldn't have all the data we need to recover. + * + * What we do is move the bits to the ili_last_fields field. + * When logging the inode, these bits are moved back to the + * ilf_fields field. In the xfs_iflush_done() routine we + * clear ili_last_fields, since we know that the information + * those bits represent is permanently on disk. As long as + * the flush completes before the inode is logged again, then + * both ilf_fields and ili_last_fields will be cleared. + * + * We can play with the ilf_fields bits here, because the inode + * lock must be held exclusively in order to set bits there + * and the flush lock protects the ili_last_fields bits. + * Set ili_logged so the flush done + * routine can tell whether or not to look in the AIL. + * Also, store the current LSN of the inode so that we can tell + * whether the item has moved in the AIL from xfs_iflush_done(). + * In order to read the lsn we need the AIL lock, because + * it is a 64 bit value that cannot be read atomically. + */ + if (iip != NULL && iip->ili_format.ilf_fields != 0) { + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); + } else { + /* + * We're flushing an inode which is not in the AIL and has + * not been logged but has i_update_core set. For this + * case we can use a B_DELWRI flush and immediately drop + * the inode flush lock because we can avoid the whole + * AIL state thing. It's OK to drop the flush lock now, + * because we've already locked the buffer and to do anything + * you really need both. 
+ */ + if (iip != NULL) { + ASSERT(iip->ili_logged == 0); + ASSERT(iip->ili_last_fields == 0); + ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); + } + xfs_ifunlock(ip); + } + + return 0; + +corrupt_out: + return XFS_ERROR(EFSCORRUPTED); +} + +void +xfs_promote_inode( + struct xfs_inode *ip) +{ + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, + ip->i_imap.im_len, XBF_TRYLOCK); + if (!bp) + return; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_buf_delwri_promote(bp); + wake_up_process(ip->i_mount->m_ddev_targp->bt_task); + } + + xfs_buf_relse(bp); +} + +/* + * Return a pointer to the extent record at file index idx. + */ +xfs_bmbt_rec_host_t * +xfs_iext_get_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx) /* index of target extent */ +{ + ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { + return ifp->if_u1.if_ext_irec->er_extbuf; + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_ext_irec_t *erp; /* irec pointer */ + int erp_idx = 0; /* irec index */ + xfs_extnum_t page_idx = idx; /* ext index in target list */ + + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + return &erp->er_extbuf[page_idx]; + } else if (ifp->if_bytes) { + return &ifp->if_u1.if_extents[idx]; + } else { + return NULL; + } +} + +/* + * Insert new item(s) into the extent records for incore inode + * fork 'ifp'. 'count' new items are inserted at index 'idx'. + */ +void +xfs_iext_insert( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t i; /* extent record index */ + + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_add(ifp, idx, count); + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be increased. The ext_diff parameter stores the + * number of new extents being added and the idx parameter contains + * the extent index where the new extents will be added. If the new + * extents are being appended, then we just need to (re)allocate and + * initialize the space. Otherwise, if the new extents are being + * inserted into the middle of the existing entries, a bit more work + * is required to make room for the new extents to be inserted. The + * caller is responsible for filling in the new extent entries upon + * return. + */ +void +xfs_iext_add( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin adding exts */ + int ext_diff) /* number of extents to add */ +{ + int byte_diff; /* new bytes being added */ + int new_size; /* size of extents after adding */ + xfs_extnum_t nextents; /* number of extents in file */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT((idx >= 0) && (idx <= nextents)); + byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); + new_size = ifp->if_bytes + byte_diff; + /* + * If the new number of extents (nextents + ext_diff) + * fits inside the inode, then continue to use the inline + * extent buffer. 
+ */ + if (nextents + ext_diff <= XFS_INLINE_EXTS) { + if (idx < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], + &ifp->if_u2.if_inline_ext[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; + } + /* + * Otherwise use a linear (direct) extent list. + * If the extents are currently inside the inode, + * xfs_iext_realloc_direct will switch us from + * inline to direct extent allocation mode. + */ + else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, new_size); + if (idx < nextents) { + memmove(&ifp->if_u1.if_extents[idx + ext_diff], + &ifp->if_u1.if_extents[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); + } + } + /* Indirection array */ + else { + xfs_ext_irec_t *erp; + int erp_idx = 0; + int page_idx = idx; + + ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); + if (ifp->if_flags & XFS_IFEXTIREC) { + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); + } else { + xfs_iext_irec_init(ifp); + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = ifp->if_u1.if_ext_irec; + } + /* Extents fit in target extent page */ + if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { + if (page_idx < erp->er_extcount) { + memmove(&erp->er_extbuf[page_idx + ext_diff], + &erp->er_extbuf[page_idx], + (erp->er_extcount - page_idx) * + sizeof(xfs_bmbt_rec_t)); + memset(&erp->er_extbuf[page_idx], 0, byte_diff); + } + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + } + /* Insert a new extent page */ + else if (erp) { + xfs_iext_add_indirect_multi(ifp, + erp_idx, page_idx, ext_diff); + } + /* + * If extent(s) are being appended to the last page in + * the indirection array and the new extent(s) don't fit + * in the page, then erp is NULL and erp_idx is set to + * the next index needed in the indirection array. + */ + else { + int count = ext_diff; + + while (count) { + erp = xfs_iext_irec_new(ifp, erp_idx); + erp->er_extcount = count; + count -= MIN(count, (int)XFS_LINEAR_EXTS); + if (count) { + erp_idx++; + } + } + } + } + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being added to the indirection + * array and the new extents do not fit in the target extent list. The + * erp_idx parameter contains the irec index for the target extent list + * in the indirection array, and the idx parameter contains the extent + * index within the list. The number of extents being added is stored + * in the count parameter. 
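For the inline and direct cases, xfs_iext_add() above boils down to opening a zeroed hole in an array: shift the tail up with memmove() when the insertion point is not at the end, then clear the gap for the caller to fill. That step in isolation, over a simplified fixed-size record:

#include <string.h>
#include <stdint.h>

struct rec { uint64_t l0, l1; };

/*
 * Open a hole of 'count' records at index 'idx' in an array currently
 * holding 'nrecs' records; the backing capacity is assumed to be at least
 * nrecs + count.  The hole is zeroed and filled in by the caller.
 */
static void open_hole(struct rec *recs, int nrecs, int idx, int count)
{
	if (idx < nrecs)
		memmove(&recs[idx + count], &recs[idx],
			(nrecs - idx) * sizeof(struct rec));
	memset(&recs[idx], 0, count * sizeof(struct rec));
}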
+ * + * |-------| |-------| + * | | | | idx - number of extents before idx + * | idx | | count | + * | | | | count - number of extents being inserted at idx + * |-------| |-------| + * | count | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_add_indirect_multi( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* target extent irec index */ + xfs_extnum_t idx, /* index within target list */ + int count) /* new extents being added */ +{ + int byte_diff; /* new bytes being added */ + xfs_ext_irec_t *erp; /* pointer to irec entry */ + xfs_extnum_t ext_diff; /* number of extents to add */ + xfs_extnum_t ext_cnt; /* new extents still needed */ + xfs_extnum_t nex2; /* extents after idx + count */ + xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ + int nlists; /* number of irec's (lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex2 = erp->er_extcount - idx; + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* + * Save second part of target extent list + * (all extents past */ + if (nex2) { + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); + memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); + erp->er_extcount -= nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); + memset(&erp->er_extbuf[idx], 0, byte_diff); + } + + /* + * Add the new extents to the end of the target + * list, then allocate new irec record(s) and + * extent buffer(s) as needed to store the rest + * of the new extents. + */ + ext_cnt = count; + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); + if (ext_diff) { + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + while (ext_cnt) { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); + erp->er_extcount = ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + + /* Add nex2 extents back to indirection array */ + if (nex2) { + xfs_extnum_t ext_avail; + int i; + + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; + i = 0; + /* + * If nex2 extents fit in the current page, append + * nex2_ep after the new extents. + */ + if (nex2 <= ext_avail) { + i = erp->er_extcount; + } + /* + * Otherwise, check if space is available in the + * next page. + */ + else if ((erp_idx < nlists - 1) && + (nex2 <= (ext_avail = XFS_LINEAR_EXTS - + ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { + erp_idx++; + erp++; + /* Create a hole for nex2 extents */ + memmove(&erp->er_extbuf[nex2], erp->er_extbuf, + erp->er_extcount * sizeof(xfs_bmbt_rec_t)); + } + /* + * Final choice, create a new extent page for + * nex2 extents. + */ + else { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + } + memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); + kmem_free(nex2_ep); + erp->er_extcount += nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); + } +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be decreased. The ext_diff parameter stores the + * number of extents to be removed and the idx parameter contains + * the extent index where the extents will be removed from. + * + * If the amount of space needed has decreased below the linear + * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous + * extent array. 
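When new records overflow the current page, both the tail of xfs_iext_add() and xfs_iext_add_indirect_multi() above hand the remainder out to fresh pages one capacity-sized chunk at a time. Stripped of the allocation and er_extoff bookkeeping, the chunking loop is just the following, where EXTS_PER_PAGE is an assumed stand-in for XFS_LINEAR_EXTS:

#define EXTS_PER_PAGE	256	/* stand-in for XFS_LINEAR_EXTS */

/* Returns the number of new pages needed to absorb 'count' extra records. */
static int pages_for_overflow(int count)
{
	int pages = 0;

	while (count > 0) {
		int chunk = count < EXTS_PER_PAGE ? count : EXTS_PER_PAGE;

		/*
		 * A real implementation would allocate a page here and
		 * record 'chunk' as its er_extcount.
		 */
		count -= chunk;
		pages++;
	}
	return pages;
}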
Otherwise, use kmem_realloc() to adjust the + * size to what is needed. + */ +void +xfs_iext_remove( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + + ASSERT(ext_diff > 0); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_iext_remove_indirect(ifp, idx, ext_diff); + } else if (ifp->if_real_bytes) { + xfs_iext_remove_direct(ifp, idx, ext_diff); + } else { + xfs_iext_remove_inline(ifp, idx, ext_diff); + } + ifp->if_bytes = new_size; +} + +/* + * This removes ext_diff extents from the inline buffer, beginning + * at extent index idx. + */ +void +xfs_iext_remove_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + int nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + ASSERT(idx < XFS_INLINE_EXTS); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(((nextents - ext_diff) > 0) && + (nextents - ext_diff) < XFS_INLINE_EXTS); + + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx], + &ifp->if_u2.if_inline_ext[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + } else { + memset(&ifp->if_u2.if_inline_ext[idx], 0, + ext_diff * sizeof(xfs_bmbt_rec_t)); + } +} + +/* + * This removes ext_diff extents from a linear (direct) extent list, + * beginning at extent index idx. If the extents are being removed + * from the end of the list (ie. truncate) then we just need to re- + * allocate the list to remove the extra space. Otherwise, if the + * extents are being removed from the middle of the existing extent + * entries, then we first need to move the extent records beginning + * at idx + ext_diff up in the list to overwrite the records being + * removed, then remove the extra space via kmem_realloc. + */ +void +xfs_iext_remove_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + new_size = ifp->if_bytes - + (ext_diff * sizeof(xfs_bmbt_rec_t)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + return; + } + /* Move extents up in the list (if needed) */ + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u1.if_extents[idx], + &ifp->if_u1.if_extents[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + } + memset(&ifp->if_u1.if_extents[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + /* + * Reallocate the direct extent list. If the extents + * will fit inside the inode then xfs_iext_realloc_direct + * will switch from direct to inline extent allocation + * mode for us. 
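xfs_iext_remove_inline() and xfs_iext_remove_direct() above close the gap left by removed records by copying the tail down and zeroing the slots that are no longer used. The same operation over a plain array:

#include <string.h>
#include <stdint.h>

struct rec { uint64_t l0, l1; };

/*
 * Remove 'count' records starting at 'idx' from an array holding 'nrecs'
 * records (idx + count <= nrecs assumed).  Returns the new record count.
 */
static int remove_range(struct rec *recs, int nrecs, int idx, int count)
{
	if (idx + count < nrecs)
		memmove(&recs[idx], &recs[idx + count],
			(nrecs - (idx + count)) * sizeof(struct rec));
	memset(&recs[nrecs - count], 0, count * sizeof(struct rec));
	return nrecs - count;
}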
+ */ + xfs_iext_realloc_direct(ifp, new_size); + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being removed from the + * indirection array and the extents being removed span multiple extent + * buffers. The idx parameter contains the file extent index where we + * want to begin removing extents, and the count parameter contains + * how many extents need to be removed. + * + * |-------| |-------| + * | nex1 | | | nex1 - number of extents before idx + * |-------| | count | + * | | | | count - number of extents being removed at idx + * | count | |-------| + * | | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_remove_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing extents */ + int count) /* number of extents to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int erp_idx = 0; /* indirection array index */ + xfs_extnum_t ext_cnt; /* extents left to remove */ + xfs_extnum_t ext_diff; /* extents to remove in current list */ + xfs_extnum_t nex1; /* number of extents before idx */ + xfs_extnum_t nex2; /* extents after idx + count */ + int page_idx = idx; /* index in target extent list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + ASSERT(erp != NULL); + nex1 = page_idx; + ext_cnt = count; + while (ext_cnt) { + nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); + ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); + /* + * Check for deletion of entire list; + * xfs_iext_irec_remove() updates extent offsets. + */ + if (ext_diff == erp->er_extcount) { + xfs_iext_irec_remove(ifp, erp_idx); + ext_cnt -= ext_diff; + nex1 = 0; + if (ext_cnt) { + ASSERT(erp_idx < ifp->if_real_bytes / + XFS_IEXT_BUFSZ); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex1 = 0; + continue; + } else { + break; + } + } + /* Move extents up (if needed) */ + if (nex2) { + memmove(&erp->er_extbuf[nex1], + &erp->er_extbuf[nex1 + ext_diff], + nex2 * sizeof(xfs_bmbt_rec_t)); + } + /* Zero out rest of page */ + memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - + ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); + /* Update remaining counters */ + erp->er_extcount -= ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); + ext_cnt -= ext_diff; + nex1 = 0; + erp_idx++; + erp++; + } + ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); + xfs_iext_irec_compact(ifp); +} + +/* + * Create, destroy, or resize a linear (direct) block of extents. 
+ */ +void +xfs_iext_realloc_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new size of extents */ +{ + int rnew_size; /* real new size of extents */ + + rnew_size = new_size; + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || + ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && + (new_size != ifp->if_real_bytes))); + + /* Free extent records */ + if (new_size == 0) { + xfs_iext_destroy(ifp); + } + /* Resize direct extent list and zero any new bytes */ + else if (ifp->if_real_bytes) { + /* Check if extents will fit inside the inode */ + if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { + xfs_iext_direct_to_inline(ifp, new_size / + (uint)sizeof(xfs_bmbt_rec_t)); + ifp->if_bytes = new_size; + return; + } + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); + } + if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, KM_NOFS); + } + if (rnew_size > ifp->if_real_bytes) { + memset(&ifp->if_u1.if_extents[ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)], 0, + rnew_size - ifp->if_real_bytes); + } + } + /* + * Switch from the inline extent buffer to a direct + * extent list. Be sure to include the inline extent + * bytes in new_size. + */ + else { + new_size += ifp->if_bytes; + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); + } + xfs_iext_inline_to_direct(ifp, rnew_size); + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + +/* + * Switch from linear (direct) extent records to inline buffer. + */ +void +xfs_iext_direct_to_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t nextents) /* number of extents in file */ +{ + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(nextents <= XFS_INLINE_EXTS); + /* + * The inline buffer was zeroed when we switched + * from inline to direct extent allocation mode, + * so we don't need to clear it here. + */ + memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, + nextents * sizeof(xfs_bmbt_rec_t)); + kmem_free(ifp->if_u1.if_extents); + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; +} + +/* + * Switch from inline buffer to linear (direct) extent records. + * new_size should already be rounded up to the next power of 2 + * by the caller (when appropriate), so use new_size as it is. + * However, since new_size may be rounded up, we can't update + * if_bytes here. It is the caller's responsibility to update + * if_bytes upon return. + */ +void +xfs_iext_inline_to_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* number of extents in file */ +{ + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); + memset(ifp->if_u1.if_extents, 0, new_size); + if (ifp->if_bytes) { + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + ifp->if_bytes); + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_real_bytes = new_size; +} + +/* + * Resize an extent indirection array to new_size bytes. 
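xfs_iext_realloc_direct() above keeps the direct list's backing allocation at a power-of-two byte size, so repeated growth reallocates geometrically rather than once per record. The sizing policy on its own, with local helpers standing in for the kernel's is_power_of_2()/roundup_pow_of_two():

#include <stddef.h>

static int is_pow2(size_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static size_t roundup_pow2(size_t n)
{
	size_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/*
 * Pick the real allocation size backing 'new_size' payload bytes.
 * A zero new_size never reaches this point; that case frees the list.
 */
static size_t direct_alloc_size(size_t new_size)
{
	return is_pow2(new_size) ? new_size : roundup_pow2(new_size);
}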
+ */ +STATIC void +xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ +{ + int nlists; /* number of irec's (ex lists) */ + int size; /* current indirection array size */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); + ASSERT((new_size >= 0) && (new_size != size)); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { + ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) + kmem_realloc(ifp->if_u1.if_ext_irec, + new_size, size, KM_NOFS); + } +} + +/* + * Switch from indirection array to linear (direct) extent allocations. + */ +STATIC void +xfs_iext_indirect_to_direct( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + int size; /* size of file extents */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + size = nextents * sizeof(xfs_bmbt_rec_t); + + xfs_iext_irec_compact_pages(ifp); + ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); + + ep = ifp->if_u1.if_ext_irec->er_extbuf; + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; + ifp->if_u1.if_extents = ep; + ifp->if_bytes = size; + if (nextents < XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, size); + } +} + +/* + * Free incore file extents. + */ +void +xfs_iext_destroy( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + if (ifp->if_flags & XFS_IFEXTIREC) { + int erp_idx; + int nlists; + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { + xfs_iext_irec_remove(ifp, erp_idx); + } + ifp->if_flags &= ~XFS_IFEXTIREC; + } else if (ifp->if_real_bytes) { + kmem_free(ifp->if_u1.if_extents); + } else if (ifp->if_bytes) { + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; +} + +/* + * Return a pointer to the extent record for file system block bno. 
+ */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ +xfs_iext_bno_to_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + xfs_extnum_t *idxp) /* index of target extent */ +{ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ + xfs_filblks_t blockcount = 0; /* number of blocks in extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + int high; /* upper boundary in search */ + xfs_extnum_t idx = 0; /* index of target extent */ + int low; /* lower boundary in search */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff = 0; /* start offset of extent */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *idxp = 0; + return NULL; + } + low = 0; + if (ifp->if_flags & XFS_IFEXTIREC) { + /* Find target extent list */ + int erp_idx = 0; + erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); + base = erp->er_extbuf; + high = erp->er_extcount - 1; + } else { + base = ifp->if_u1.if_extents; + high = nextents - 1; + } + /* Binary search extent records */ + while (low <= high) { + idx = (low + high) >> 1; + ep = base + idx; + startoff = xfs_bmbt_get_startoff(ep); + blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < startoff) { + high = idx - 1; + } else if (bno >= startoff + blockcount) { + low = idx + 1; + } else { + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + *idxp = idx; + return ep; + } + } + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + if (bno >= startoff + blockcount) { + if (++idx == nextents) { + ep = NULL; + } else { + ep = xfs_iext_get_ext(ifp, idx); + } + } + *idxp = idx; + return ep; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record for filesystem block bno. Store the index of the + * target irec in *erp_idxp. + */ +xfs_ext_irec_t * /* pointer to found extent record */ +xfs_iext_bno_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + int *erp_idxp) /* irec index of target ext list */ +{ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + xfs_ext_irec_t *erp_next; /* next indirection array entry */ + int erp_idx; /* indirection array index */ + int nlists; /* number of extent irec's (lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; + if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { + high = erp_idx - 1; + } else if (erp_next && bno >= + xfs_bmbt_get_startoff(erp_next->er_extbuf)) { + low = erp_idx + 1; + } else { + break; + } + } + *erp_idxp = erp_idx; + return erp; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record at file extent index *idxp. Store the index of the + * target irec in *erp_idxp and store the page index of the target + * extent record in *idxp. 
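The lookup in xfs_iext_bno_to_ext() above is a binary search over records ordered by start offset, each treated as the half-open block range [startoff, startoff + blockcount). A simplified version of the search follows; the original additionally reports the index of the following extent when bno falls in a hole, a detail omitted here.

#include <stdint.h>
#include <stddef.h>

struct ext {
	uint64_t startoff;	/* first block covered */
	uint64_t blockcount;	/* number of blocks covered */
};

/*
 * Find the extent containing block 'bno', or NULL if it falls in a hole
 * or beyond the last extent.  '*idxp' is set to the last index probed.
 */
static struct ext *ext_for_bno(struct ext *exts, int nexts,
			       uint64_t bno, int *idxp)
{
	int low = 0, high = nexts - 1, idx = 0;

	while (low <= high) {
		idx = (low + high) / 2;
		if (bno < exts[idx].startoff)
			high = idx - 1;
		else if (bno >= exts[idx].startoff + exts[idx].blockcount)
			low = idx + 1;
		else {
			*idxp = idx;
			return &exts[idx];
		}
	}
	*idxp = idx;
	return NULL;
}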
+ */ +xfs_ext_irec_t * +xfs_iext_idx_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t *idxp, /* extent index (file -> page) */ + int *erp_idxp, /* pointer to target irec */ + int realloc) /* new bytes were just added */ +{ + xfs_ext_irec_t *prev; /* pointer to previous irec */ + xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ + int erp_idx; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + xfs_extnum_t page_idx = *idxp; /* extent index in target list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + + /* Binary search extent irec's */ + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + prev = erp_idx > 0 ? erp - 1 : NULL; + if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && + realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { + high = erp_idx - 1; + } else if (page_idx > erp->er_extoff + erp->er_extcount || + (page_idx == erp->er_extoff + erp->er_extcount && + !realloc)) { + low = erp_idx + 1; + } else if (page_idx == erp->er_extoff + erp->er_extcount && + erp->er_extcount == XFS_LINEAR_EXTS) { + ASSERT(realloc); + page_idx = 0; + erp_idx++; + erp = erp_idx < nlists ? erp + 1 : NULL; + break; + } else { + page_idx -= erp->er_extoff; + break; + } + } + *idxp = page_idx; + *erp_idxp = erp_idx; + return(erp); +} + +/* + * Allocate and initialize an indirection array once the space needed + * for incore extents increases above XFS_IEXT_BUFSZ. + */ +void +xfs_iext_irec_init( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); + + if (nextents == 0) { + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + } else if (!ifp->if_real_bytes) { + xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); + } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { + xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); + } + erp->er_extbuf = ifp->if_u1.if_extents; + erp->er_extcount = nextents; + erp->er_extoff = 0; + + ifp->if_flags |= XFS_IFEXTIREC; + ifp->if_real_bytes = XFS_IEXT_BUFSZ; + ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); + ifp->if_u1.if_ext_irec = erp; + + return; +} + +/* + * Allocate and initialize a new entry in the indirection array. + */ +xfs_ext_irec_t * +xfs_iext_irec_new( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* index for new irec */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* Resize indirection array */ + xfs_iext_realloc_indirect(ifp, ++nlists * + sizeof(xfs_ext_irec_t)); + /* + * Move records down in the array so the + * new page can use erp_idx. 
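xfs_iext_idx_to_irec() above finds the page holding file extent index idx by binary-searching the pages' er_extoff starting offsets and then rebasing idx into that page. Ignoring the realloc-time boundary cases it handles, the mapping reduces to:

struct irec {
	int er_extoff;		/* index of the page's first extent */
	int er_extcount;	/* extents currently held in the page */
};

/*
 * Map a file-wide extent index onto (page, index-within-page).
 * Assumes npages >= 1 and that 'idx' lies within the populated range.
 */
static int idx_to_page(const struct irec *pages, int npages,
		       int idx, int *page_idxp)
{
	int low = 0, high = npages - 1, p = 0;

	while (low <= high) {
		p = (low + high) / 2;
		if (idx < pages[p].er_extoff)
			high = p - 1;
		else if (idx >= pages[p].er_extoff + pages[p].er_extcount)
			low = p + 1;
		else
			break;
	}
	*page_idxp = p;
	return idx - pages[p].er_extoff;
}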
+ */ + erp = ifp->if_u1.if_ext_irec; + for (i = nlists - 1; i > erp_idx; i--) { + memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); + } + ASSERT(i == erp_idx); + + /* Initialize new extent record */ + erp = ifp->if_u1.if_ext_irec; + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; + memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); + erp[erp_idx].er_extcount = 0; + erp[erp_idx].er_extoff = erp_idx > 0 ? + erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; + return (&erp[erp_idx]); +} + +/* + * Remove a record from the indirection array. + */ +void +xfs_iext_irec_remove( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* irec index to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + if (erp->er_extbuf) { + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, + -erp->er_extcount); + kmem_free(erp->er_extbuf); + } + /* Compact extent records */ + erp = ifp->if_u1.if_ext_irec; + for (i = erp_idx; i < nlists - 1; i++) { + memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); + } + /* + * Manually free the last extent record from the indirection + * array. A call to xfs_iext_realloc_indirect() with a size + * of zero would result in a call to xfs_iext_destroy() which + * would in turn call this function again, creating a nasty + * infinite loop. + */ + if (--nlists) { + xfs_iext_realloc_indirect(ifp, + nlists * sizeof(xfs_ext_irec_t)); + } else { + kmem_free(ifp->if_u1.if_ext_irec); + } + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; +} + +/* + * This is called to clean up large amounts of unused memory allocated + * by the indirection array. Before compacting anything though, verify + * that the indirection array is still needed and switch back to the + * linear extent list (or even the inline buffer) if possible. The + * compaction policy is as follows: + * + * Full Compaction: Extents fit into a single page (or inline buffer) + * Partial Compaction: Extents occupy less than 50% of allocated space + * No Compaction: Extents occupy at least 50% of allocated space + */ +void +xfs_iext_irec_compact( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (nextents == 0) { + xfs_iext_destroy(ifp); + } else if (nextents <= XFS_INLINE_EXTS) { + xfs_iext_indirect_to_direct(ifp); + xfs_iext_direct_to_inline(ifp, nextents); + } else if (nextents <= XFS_LINEAR_EXTS) { + xfs_iext_indirect_to_direct(ifp); + } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { + xfs_iext_irec_compact_pages(ifp); + } +} + +/* + * Combine extents from neighboring extent pages. 
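The policy in xfs_iext_irec_compact() above is a pure function of the extent count and the allocated capacity: collapse to the inline buffer, collapse to a single direct list, merge pages when occupancy drops under half, or leave things alone. As a small classifier, with assumed capacity constants standing in for XFS_INLINE_EXTS and XFS_LINEAR_EXTS:

enum compact_action {
	COMPACT_DESTROY,	/* no extents left at all */
	COMPACT_TO_INLINE,	/* fits in the inode's inline buffer */
	COMPACT_TO_DIRECT,	/* fits in one direct list */
	COMPACT_MERGE_PAGES,	/* under 50% occupancy: merge neighbours */
	COMPACT_NONE
};

#define INLINE_EXTS	2	/* stand-in for XFS_INLINE_EXTS */
#define EXTS_PER_PAGE	256	/* stand-in for XFS_LINEAR_EXTS */

static enum compact_action pick_compaction(int nextents, int npages)
{
	if (nextents == 0)
		return COMPACT_DESTROY;
	if (nextents <= INLINE_EXTS)
		return COMPACT_TO_INLINE;
	if (nextents <= EXTS_PER_PAGE)
		return COMPACT_TO_DIRECT;
	if (nextents < (npages * EXTS_PER_PAGE) / 2)
		return COMPACT_MERGE_PAGES;
	return COMPACT_NONE;
}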
+ */ +void +xfs_iext_irec_compact_pages( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ + int erp_idx = 0; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + while (erp_idx < nlists - 1) { + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp + 1; + if (erp_next->er_extcount <= + (XFS_LINEAR_EXTS - erp->er_extcount)) { + memcpy(&erp->er_extbuf[erp->er_extcount], + erp_next->er_extbuf, erp_next->er_extcount * + sizeof(xfs_bmbt_rec_t)); + erp->er_extcount += erp_next->er_extcount; + /* + * Free page before removing extent record + * so er_extoffs don't get modified in + * xfs_iext_irec_remove. + */ + kmem_free(erp_next->er_extbuf); + erp_next->er_extbuf = NULL; + xfs_iext_irec_remove(ifp, erp_idx + 1); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + } else { + erp_idx++; + } + } +} + +/* + * This is called to update the er_extoff field in the indirection + * array when extents have been added or removed from one of the + * extent lists. erp_idx contains the irec index to begin updating + * at and ext_diff contains the number of extents that were added + * or removed. + */ +void +xfs_iext_irec_update_extoffs( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* irec index to update */ + int ext_diff) /* number of new extents */ +{ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = erp_idx; i < nlists; i++) { + ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h new file mode 100644 index 00000000..28b35964 --- /dev/null +++ b/fs/xfs/xfs_inode.h @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_H__ +#define __XFS_INODE_H__ + +struct posix_acl; +struct xfs_dinode; +struct xfs_inode; + +/* + * Fork identifiers. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. 
+ * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + unsigned char if_ext_max; /* max # of extent records */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +/* + * This is the xfs in-core inode structure. + * Most of the on-disk inode is embedded in the i_d field. + * + * The extent pointers/inline file space, however, are managed + * separately. The memory for this information is pointed to by + * the if_u1 unions depending on the type of the data. + * This is used to linearize the array of extents for fast in-core + * access. This is used until the file's number of extents + * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers + * are accessed through the buffer cache. + * + * Other state kept in the in-core inode is used for identification, + * locking, transactional updating, etc of the inode. + * + * Generally, we do not want to hold the i_rlock while holding the + * i_ilock. Hierarchy is i_iolock followed by i_rlock. + * + * xfs_iptr_t contains all the inode fields up to and including the + * i_mnext and i_mprev fields, it is used as a marker in the inode + * chain off the mount structure by xfs_sync calls. + */ + +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * in xfs_dinode.h except for the endianness annotations. 
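The if_u1/if_u2 unions in xfs_ifork_t above give one fork three possible homes for its extent records: the tiny inline buffer inside the inode, a single heap-allocated direct list, or the indirection array of 4k pages. A much-reduced model of that layout, and of the lookup rule applied earlier by xfs_iext_get_ext(), just to make the three modes concrete (the names below are simplified, not the real fields):

#include <stdint.h>

#define INLINE_EXTS	2

struct rec { uint64_t l0, l1; };
struct page_ref { struct rec *buf; int extoff; int extcount; };

/* Simplified model of the three storage modes behind if_u1/if_u2. */
struct ext_fork {
	int		bytes;		/* payload bytes (if_bytes) */
	int		real_bytes;	/* heap bytes backing it (if_real_bytes) */
	unsigned char	uses_irec;	/* indirection array in use? */
	union {
		struct rec	*extents;	/* direct list */
		struct page_ref	*irec;		/* indirection array */
	} u1;
	struct rec	inline_ext[INLINE_EXTS]; /* if_u2 inline storage */
};

/* Where do the records actually live right now? */
static struct rec *fork_records(struct ext_fork *f)
{
	if (f->uses_irec)
		return f->u1.irec[0].buf;	/* first page only, for brevity */
	if (f->real_bytes)
		return f->u1.extents;		/* direct heap list */
	return f->inline_ext;			/* inline-in-inode buffer */
}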
+ */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ +} xfs_icdinode_t; + +/* + * Flags for xfs_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) + + + +#ifdef __KERNEL__ + +struct bhv_desc; +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_bmbt_irec; +struct xfs_inode_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot; + +typedef struct dm_attrs_s { + __uint32_t da_dmevmask; /* DMIG event mask */ + __uint16_t da_dmstate; /* DMIG state info */ + __uint16_t da_pad; /* DMIG extra padding */ +} dm_attrs_t; + +typedef struct xfs_inode { + /* Inode linking and identification information. 
*/ + struct xfs_mount *i_mount; /* fs mount struct ptr */ + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + + /* Inode location stuff */ + xfs_ino_t i_ino; /* inode number (agno/agino)*/ + struct xfs_imap i_imap; /* location for xfs_imap() */ + + /* Extent information. */ + xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t i_df; /* data fork */ + + /* Transaction and locking information. */ + struct xfs_trans *i_transp; /* ptr to owning transaction*/ + struct xfs_inode_log_item *i_itemp; /* logging information */ + mrlock_t i_lock; /* inode lock */ + mrlock_t i_iolock; /* inode IO lock */ + struct completion i_flush; /* inode flush completion q */ + atomic_t i_pincount; /* inode pin count */ + wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ + spinlock_t i_flags_lock; /* inode i_flags lock */ + /* Miscellaneous state. */ + unsigned short i_flags; /* see defined flags below */ + unsigned char i_update_core; /* timestamps/size is dirty */ + unsigned int i_delayed_blks; /* count of delay alloc blks */ + + xfs_icdinode_t i_d; /* most of ondisk inode */ + + xfs_fsize_t i_size; /* in-memory size */ + xfs_fsize_t i_new_size; /* size when write completes */ + atomic_t i_iocount; /* outstanding I/O count */ + + /* VFS inode */ + struct inode i_vnode; /* embedded VFS inode */ +} xfs_inode_t; + +#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ + (ip)->i_size : (ip)->i_d.di_size; + +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return container_of(inode, struct xfs_inode, i_vnode); +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + +/* + * i_flags helper functions + */ +static inline void +__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + ip->i_flags |= flags; +} + +static inline void +xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, flags); + spin_unlock(&ip->i_flags_lock); +} + +static inline void +xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); +} + +static inline int +__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + return (ip->i_flags & flags); +} + +static inline int +xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + spin_lock(&ip->i_flags_lock); + ret = __xfs_iflags_test(ip, flags); + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (ret) + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). + */ +static inline prid_t +xfs_get_projid(struct xfs_inode *ip) +{ + return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; +} + +static inline void +xfs_set_projid(struct xfs_inode *ip, + prid_t projid) +{ + ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); +} + +/* + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. 
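xfs_get_projid()/xfs_set_projid() above keep the 32-bit project ID in two 16-bit on-disk fields purely for compatibility with old filesystems; the packing is a shift and a mask in each direction. Standalone helpers with a round-trip check:

#include <stdint.h>
#include <assert.h>

static void projid_split(uint32_t projid, uint16_t *hi, uint16_t *lo)
{
	*hi = (uint16_t)(projid >> 16);
	*lo = (uint16_t)(projid & 0xffff);
}

static uint32_t projid_join(uint16_t hi, uint16_t lo)
{
	return ((uint32_t)hi << 16) | lo;
}

int main(void)
{
	uint16_t hi, lo;

	projid_split(0x12345678u, &hi, &lo);
	assert(hi == 0x1234 && lo == 0x5678);
	assert(projid_join(hi, lo) == 0x12345678u);
	return 0;
}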
+ */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+	wait_for_completion(&ip->i_flush);
+}
+
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+	return try_wait_for_completion(&ip->i_flush);
+}
+
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+	complete(&ip->i_flush);
+}
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IRECLAIM		0x0001  /* started reclaiming this inode */
+#define XFS_ISTALE		0x0002  /* inode has been staled */
+#define XFS_IRECLAIMABLE	0x0004  /* inode can be reclaimed */
+#define XFS_INEW		0x0008  /* inode has just been allocated */
+#define XFS_IFILESTREAM		0x0010  /* inode is in a filestream directory */
+#define XFS_ITRUNCATED		0x0020  /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE	0x0040  /* dirty release already seen */
+
+/*
+ * Per-lifetime flags need to be reset when re-using a reclaimable inode during
+ * inode lookup. This prevents unintended behaviour on the new inode from
+ * occurring.
+ */
+#define XFS_IRECLAIM_RESET_FLAGS	\
+	(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
+	 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
+	 XFS_IFILESTREAM)
+
+/*
+ * Flags for inode locking.
+ * Bit ranges:	1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
+ *		1<<16 - 1<<32-1 -- lockdep annotation (integers)
+ */
+#define	XFS_IOLOCK_EXCL		(1<<0)
+#define	XFS_IOLOCK_SHARED	(1<<1)
+#define	XFS_ILOCK_EXCL		(1<<2)
+#define	XFS_ILOCK_SHARED	(1<<3)
+#define	XFS_IUNLOCK_NONOTIFY	(1<<4)
+
+#define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
+				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+
+#define XFS_LOCK_FLAGS \
+	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
+	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
+	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
+	{ XFS_IUNLOCK_NONOTIFY,	"IUNLOCK_NONOTIFY" }
+
+
+/*
+ * Flags for lockdep annotations.
+ *
+ * XFS_LOCK_PARENT - for directory operations that require locking a
+ * parent directory inode and a child entry inode. The parent gets locked
+ * with this flag so it gets a lockdep subclass of 1 and the child entry
+ * lock will have a lockdep subclass of 0.
+ *
+ * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
+ * inodes do not participate in the normal lock order, and thus have their
+ * own subclasses.
+ *
+ * XFS_LOCK_INUMORDER - for locking several inodes at the same time
+ * with xfs_lock_inodes(). This flag is used as the starting subclass
+ * and each subsequent lock acquired will increment the subclass by one.
+ * So the first lock acquired will have a lockdep subclass of 4, the
+ * second lock will have a lockdep subclass of 5, and so on. It is
+ * the responsibility of the class builder to shift this to the correct
+ * portion of the lock_mode lockdep mask.
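The i_flush completion above behaves like a lock that is acquired by consuming the completed state (xfs_iflock/xfs_iflock_nowait) and released by completing it again (xfs_ifunlock). A rough user-space analogy with pthreads, purely illustrative and not kernel code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* "Locking" consumes the completed state, "unlocking" re-completes it. */
struct flush_lock {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            done;	/* true when the inode is not being flushed */
};

static void iflock(struct flush_lock *fl)		/* cf. xfs_iflock */
{
	pthread_mutex_lock(&fl->lock);
	while (!fl->done)
		pthread_cond_wait(&fl->cond, &fl->lock);
	fl->done = false;
	pthread_mutex_unlock(&fl->lock);
}

static bool iflock_nowait(struct flush_lock *fl)	/* cf. xfs_iflock_nowait */
{
	bool got;

	pthread_mutex_lock(&fl->lock);
	got = fl->done;
	fl->done = false;
	pthread_mutex_unlock(&fl->lock);
	return got;
}

static void ifunlock(struct flush_lock *fl)		/* cf. xfs_ifunlock */
{
	pthread_mutex_lock(&fl->lock);
	fl->done = true;
	pthread_cond_signal(&fl->cond);
	pthread_mutex_unlock(&fl->lock);
}

int main(void)
{
	struct flush_lock fl = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, true
	};

	iflock(&fl);					/* start a "flush" */
	printf("second try succeeds? %d\n", iflock_nowait(&fl));	/* 0 */
	ifunlock(&fl);					/* flush finished */
	printf("after unlock?       %d\n", iflock_nowait(&fl));	/* 1 */
	return 0;
}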
+ */ +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_RTBITMAP 2 +#define XFS_LOCK_RTSUM 3 +#define XFS_LOCK_INUMORDER 4 + +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) + +extern struct lock_class_key xfs_iolock_reclaimable; + +/* + * Flags for xfs_itruncate_start(). + */ +#define XFS_ITRUNC_DEFINITE 0x1 +#define XFS_ITRUNC_MAYBE 0x2 + +#define XFS_ITRUNC_FLAGS \ + { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \ + { XFS_ITRUNC_MAYBE, "MAYBE" } + +/* + * For multiple groups support: if S_ISGID bit is set in the parent + * directory, group of new file is set to that of the parent, and + * new subdirectory gets S_ISGID bit from parent. + */ +#define XFS_INHERIT_GID(pip) \ + (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ + ((pip)->i_d.di_mode & S_ISGID)) + +/* + * xfs_iget.c prototypes. + */ +int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, + uint, uint, xfs_inode_t **); +void xfs_ilock(xfs_inode_t *, uint); +int xfs_ilock_nowait(xfs_inode_t *, uint); +void xfs_iunlock(xfs_inode_t *, uint); +void xfs_ilock_demote(xfs_inode_t *, uint); +int xfs_isilocked(xfs_inode_t *, uint); +uint xfs_ilock_map_shared(xfs_inode_t *); +void xfs_iunlock_map_shared(xfs_inode_t *, uint); +void xfs_inode_free(struct xfs_inode *ip); + +/* + * xfs_inode.c prototypes. 
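A small stand-alone sketch of how the shift/mask macros above pack per-lock lockdep subclasses into the upper bytes of the lock flags word; the constants are copied from the definitions above, but the program itself is user-space illustration only.

#include <stdio.h>

#define XFS_LOCK_PARENT		1

#define XFS_IOLOCK_SHIFT	16
#define XFS_ILOCK_SHIFT		24

#define XFS_IOLOCK_DEP_MASK	0x00ff0000
#define XFS_ILOCK_DEP_MASK	0xff000000

#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)

int main(void)
{
	/* Lock a parent directory: the low bits carry the lock modes, the
	 * high bytes carry independent lockdep subclasses for iolock/ilock. */
	unsigned int flags = (1 << 0) /* IOLOCK_EXCL */ | (1 << 2) /* ILOCK_EXCL */
			   | (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
			   | (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT);

	printf("iolock subclass = %u, ilock subclass = %u\n",
	       XFS_IOLOCK_DEP(flags), XFS_ILOCK_DEP(flags));	/* 1 and 1 */
	return 0;
}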
+ */ +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_buf **, boolean_t *, xfs_inode_t **); + +uint xfs_ip2xflags(struct xfs_inode *); +uint xfs_dic2xflags(struct xfs_dinode *); +int xfs_ifree(struct xfs_trans *, xfs_inode_t *, + struct xfs_bmap_free *); +int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); +int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, + xfs_fsize_t, int, int); +int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); + +void xfs_iext_realloc(xfs_inode_t *, int, int); +void xfs_iunpin_wait(xfs_inode_t *); +int xfs_iflush(xfs_inode_t *, uint); +void xfs_promote_inode(struct xfs_inode *); +void xfs_lock_inodes(xfs_inode_t **, int, uint); +void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); + +void xfs_synchronize_times(xfs_inode_t *); +void xfs_mark_inode_dirty(xfs_inode_t *); +void xfs_mark_inode_dirty_sync(xfs_inode_t *); + +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + ihold(VFS_I(ip)); \ + trace_xfs_ihold(ip, _THIS_IP_); \ +} while (0) + +#define IRELE(ip) \ +do { \ + trace_xfs_irele(ip, _THIS_IP_); \ + iput(VFS_I(ip)); \ +} while (0) + +#endif /* __KERNEL__ */ + +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 + +int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, + xfs_ino_t, struct xfs_dinode **, + struct xfs_buf **, int *, uint); +int xfs_itobp(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, struct xfs_dinode **, + struct xfs_buf **, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_to_disk(struct xfs_dinode *, + struct xfs_icdinode *); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); + +xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); +void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t, + xfs_bmbt_irec_t *, int); +void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); +void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(xfs_ifork_t *, int); +void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); +void xfs_iext_inline_to_direct(xfs_ifork_t *, int); +void xfs_iext_destroy(xfs_ifork_t *); +xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); +xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); +xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); +void xfs_iext_irec_init(xfs_ifork_t *); +xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int); +void xfs_iext_irec_remove(xfs_ifork_t *, int); +void xfs_iext_irec_compact(xfs_ifork_t *); +void xfs_iext_irec_compact_pages(xfs_ifork_t *); +void xfs_iext_irec_compact_full(xfs_ifork_t *); +void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); + +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) + +#ifdef DEBUG +void xfs_isize_check(struct xfs_mount *, struct xfs_inode *, + xfs_fsize_t); +#else /* DEBUG */ 
+#define xfs_isize_check(mp, ip, isize) +#endif /* DEBUG */ + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +extern struct kmem_zone *xfs_ifork_zone; +extern struct kmem_zone *xfs_inode_zone; +extern struct kmem_zone *xfs_ili_zone; + +#endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c new file mode 100644 index 00000000..391044c6 --- /dev/null +++ b/fs/xfs/xfs_inode_item.c @@ -0,0 +1,1060 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_ili_zone; /* inode log item zone */ + +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. + */ +STATIC uint +xfs_inode_item_size( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs = 2; + + /* + * Only log the data/extents/b-tree root if there is something + * left to log. 
+ */ + iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && + (ip->i_d.di_nextents > 0) && + (ip->i_df.if_bytes > 0)) { + ASSERT(ip->i_df.if_u1.if_extents != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && + (ip->i_df.if_broot_bytes > 0)) { + ASSERT(ip->i_df.if_broot != NULL); + nvecs++; + } else { + ASSERT(!(iip->ili_format.ilf_fields & + XFS_ILOG_DBROOT)); +#ifdef XFS_TRANS_DEBUG + if (iip->ili_root_size > 0) { + ASSERT(iip->ili_root_size == + ip->i_df.if_broot_bytes); + ASSERT(memcmp(iip->ili_orig_root, + ip->i_df.if_broot, + iip->ili_root_size) == 0); + } else { + ASSERT(ip->i_df.if_broot_bytes == 0); + } +#endif + iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; + } + break; + + case XFS_DINODE_FMT_LOCAL: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && + (ip->i_df.if_bytes > 0)) { + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; + } + break; + + case XFS_DINODE_FMT_DEV: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + break; + + case XFS_DINODE_FMT_UUID: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + break; + + default: + ASSERT(0); + break; + } + + /* + * If there are no attributes associated with this file, + * then there cannot be anything more to log. + * Clear all attribute-related log flags. + */ + if (!XFS_IFORK_Q(ip)) { + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + return nvecs; + } + + /* + * Log any necessary attribute data. 
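The counting in xfs_inode_item_size() boils down to: one iovec for the log format header, one for the inode core, plus one per fork that still has payload to log after the flag trimming above. A simplified stand-alone sketch of that accounting (not the kernel structures):

#include <stdio.h>
#include <stdbool.h>

/* Simplified model: header + core, plus one vector per fork with payload. */
static unsigned int inode_item_nvecs(bool log_data_fork, bool log_attr_fork)
{
	unsigned int nvecs = 2;		/* ilf header + inode core */

	if (log_data_fork)
		nvecs++;		/* data fork data/extents/b-tree root */
	if (log_attr_fork)
		nvecs++;		/* attr fork data/extents/b-tree root */
	return nvecs;
}

int main(void)
{
	printf("core only:  %u\n", inode_item_nvecs(false, false));	/* 2 */
	printf("core+data:  %u\n", inode_item_nvecs(true, false));	/* 3 */
	printf("everything: %u\n", inode_item_nvecs(true, true));	/* 4 */
	return 0;
}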
+ */ + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && + (ip->i_d.di_anextents > 0) && + (ip->i_afp->if_bytes > 0)) { + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; + } + break; + + case XFS_DINODE_FMT_BTREE: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && + (ip->i_afp->if_broot_bytes > 0)) { + ASSERT(ip->i_afp->if_broot != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; + } + break; + + case XFS_DINODE_FMT_LOCAL: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && + (ip->i_afp->if_bytes > 0)) { + ASSERT(ip->i_afp->if_u1.if_data != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; + } + break; + + default: + ASSERT(0); + break; + } + + return nvecs; +} + +/* + * xfs_inode_item_format_extents - convert in-core extents to on-disk form + * + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. In this case, we + * need to do this conversion before we write the extents into the log. Because + * we don't have the disk inode to write into here, we allocate a buffer and + * format the extents into it via xfs_iextents_copy(). We free the buffer in + * the unlock routine after the copy for the log has been made. + * + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only log on-disk extents + * here, so always use the physical fork size to determine the size of the + * buffer we need to allocate. + */ +STATIC void +xfs_inode_item_format_extents( + struct xfs_inode *ip, + struct xfs_log_iovec *vecp, + int whichfork, + int type) +{ + xfs_bmbt_rec_t *ext_buffer; + + ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); + if (whichfork == XFS_DATA_FORK) + ip->i_itemp->ili_extents_buf = ext_buffer; + else + ip->i_itemp->ili_aextents_buf = ext_buffer; + + vecp->i_addr = ext_buffer; + vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); + vecp->i_type = type; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode log item. It fills the first item with an inode + * log format structure, the second with the on-disk inode structure, + * and a possible third and/or fourth with the inode data/extents/b-tree + * root and inode attributes data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs; + size_t data_bytes; + xfs_mount_t *mp; + + vecp->i_addr = &iip->ili_format; + vecp->i_len = sizeof(xfs_inode_log_format_t); + vecp->i_type = XLOG_REG_TYPE_IFORMAT; + vecp++; + nvecs = 1; + + /* + * Clear i_update_core if the timestamps (or any other + * non-transactional modification) need flushing/logging + * and we're about to log them with the rest of the core. + * + * This is the same logic as xfs_iflush() but this code can't + * run at the same time as xfs_iflush because we're in commit + * processing here and so we have the inode lock held in + * exclusive mode. 
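A user-space sketch of the endian conversion that xfs_inode_item_format_extents()/xfs_iextents_copy() perform before the extents reach the log; the two-word record layout here is a stand-in and does not reproduce the real xfs_bmbt_rec packing of startoff/startblock/blockcount/state.

#define _DEFAULT_SOURCE		/* for htobe64() on glibc */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* In-core record: two 64-bit words in host byte order
 * (stand-in for xfs_bmbt_rec_host_t). */
struct rec_host { uint64_t l0, l1; };

/* Log/on-disk form: the same two words, big-endian. */
struct rec_disk { uint64_t l0, l1; };

/* Copy extents into a freshly allocated buffer in big-endian order,
 * loosely mirroring what the log vector gets from xfs_iextents_copy(). */
static struct rec_disk *copy_extents_for_log(const struct rec_host *in, int nrecs)
{
	struct rec_disk *out = malloc(nrecs * sizeof(*out));
	int i;

	if (!out)
		return NULL;
	for (i = 0; i < nrecs; i++) {
		out[i].l0 = htobe64(in[i].l0);
		out[i].l1 = htobe64(in[i].l1);
	}
	return out;
}

int main(void)
{
	struct rec_host ext[2] = { { 0x1122334455667788ULL, 0x1ULL },
				   { 0x2ULL, 0xaabbccddeeff0011ULL } };
	struct rec_disk *buf = copy_extents_for_log(ext, 2);

	if (buf) {
		/* Stored representation (byte-swapped on little-endian hosts). */
		printf("first logged word: %016llx\n",
		       (unsigned long long)buf[0].l0);
		free(buf);
	}
	return 0;
}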
Although it doesn't really matter + * for the timestamps if both routines were to grab the + * timestamps or not. That would be ok. + * + * We clear i_update_core before copying out the data. + * This is for coordination with our timestamp updates + * that don't hold the inode lock. They will always + * update the timestamps BEFORE setting i_update_core, + * so if we clear i_update_core after they set it we + * are guaranteed to see their updates to the timestamps + * either here. Likewise, if they set it after we clear it + * here, we'll see it either on the next commit of this + * inode or the next time the inode gets flushed via + * xfs_iflush(). This depends on strongly ordered memory + * semantics, but we have that. We use the SYNCHRONIZE + * macro to make sure that the compiler does not reorder + * the i_update_core access below the data copy below. + */ + if (ip->i_update_core) { + ip->i_update_core = 0; + SYNCHRONIZE(); + } + + /* + * Make sure to get the latest timestamps from the Linux inode. + */ + xfs_synchronize_times(ip); + + vecp->i_addr = &ip->i_d; + vecp->i_len = sizeof(struct xfs_icdinode); + vecp->i_type = XLOG_REG_TYPE_ICORE; + vecp++; + nvecs++; + iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + + /* + * If this is really an old format inode, then we need to + * log it as such. This means that we have to copy the link + * count from the new field to the old. We don't have to worry + * about the new fields, because nothing trusts them as long as + * the old inode version number is there. If the superblock already + * has a new version number, then we don't bother converting back. + */ + mp = ip->i_mount; + ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); + if (ip->i_d.di_version == 1) { + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + ip->i_d.di_onlink = ip->i_d.di_nlink; + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + } + } + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { + ASSERT(ip->i_df.if_bytes > 0); + ASSERT(ip->i_df.if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_nextents > 0); + ASSERT(iip->ili_extents_buf == NULL); + ASSERT((ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) > 0); +#ifdef XFS_NATIVE_HOST + if (ip->i_d.di_nextents == ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) { + /* + * There are no delayed allocation + * extents, so just point to the + * real extents array. 
+ */ + vecp->i_addr = ip->i_df.if_u1.if_extents; + vecp->i_len = ip->i_df.if_bytes; + vecp->i_type = XLOG_REG_TYPE_IEXT; + } else +#endif + { + xfs_inode_item_format_extents(ip, vecp, + XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); + } + ASSERT(vecp->i_len <= ip->i_df.if_bytes); + iip->ili_format.ilf_dsize = vecp->i_len; + vecp++; + nvecs++; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { + ASSERT(ip->i_df.if_broot_bytes > 0); + ASSERT(ip->i_df.if_broot != NULL); + vecp->i_addr = ip->i_df.if_broot; + vecp->i_len = ip->i_df.if_broot_bytes; + vecp->i_type = XLOG_REG_TYPE_IBROOT; + vecp++; + nvecs++; + iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + } + break; + + case XFS_DINODE_FMT_LOCAL: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { + ASSERT(ip->i_df.if_bytes > 0); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + + vecp->i_addr = ip->i_df.if_u1.if_data; + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_df.if_bytes, 4); + ASSERT((ip->i_df.if_real_bytes == 0) || + (ip->i_df.if_real_bytes == data_bytes)); + vecp->i_len = (int)data_bytes; + vecp->i_type = XLOG_REG_TYPE_ILOCAL; + vecp++; + nvecs++; + iip->ili_format.ilf_dsize = (unsigned)data_bytes; + } + break; + + case XFS_DINODE_FMT_DEV: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DDATA | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + iip->ili_format.ilf_u.ilfu_rdev = + ip->i_df.if_u2.if_rdev; + } + break; + + case XFS_DINODE_FMT_UUID: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DDATA | XFS_ILOG_DEV))); + if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + iip->ili_format.ilf_u.ilfu_uuid = + ip->i_df.if_u2.if_uuid; + } + break; + + default: + ASSERT(0); + break; + } + + /* + * If there are no attributes associated with the file, + * then we're done. + * Assert that no attribute-related log flags are set. + */ + if (!XFS_IFORK_Q(ip)) { + ASSERT(nvecs == lip->li_desc->lid_size); + iip->ili_format.ilf_size = nvecs; + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); + return; + } + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { +#ifdef DEBUG + int nrecs = ip->i_afp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nrecs > 0); + ASSERT(nrecs == ip->i_d.di_anextents); + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_anextents > 0); +#endif +#ifdef XFS_NATIVE_HOST + /* + * There are not delayed allocation extents + * for attributes, so just point at the array. 
+ */ + vecp->i_addr = ip->i_afp->if_u1.if_extents; + vecp->i_len = ip->i_afp->if_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; +#else + ASSERT(iip->ili_aextents_buf == NULL); + xfs_inode_item_format_extents(ip, vecp, + XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); +#endif + iip->ili_format.ilf_asize = vecp->i_len; + vecp++; + nvecs++; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { + ASSERT(ip->i_afp->if_broot_bytes > 0); + ASSERT(ip->i_afp->if_broot != NULL); + vecp->i_addr = ip->i_afp->if_broot; + vecp->i_len = ip->i_afp->if_broot_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; + vecp++; + nvecs++; + iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + } + break; + + case XFS_DINODE_FMT_LOCAL: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + + vecp->i_addr = ip->i_afp->if_u1.if_data; + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_afp->if_bytes, 4); + ASSERT((ip->i_afp->if_real_bytes == 0) || + (ip->i_afp->if_real_bytes == data_bytes)); + vecp->i_len = (int)data_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; + vecp++; + nvecs++; + iip->ili_format.ilf_asize = (unsigned)data_bytes; + } + break; + + default: + ASSERT(0); + break; + } + + ASSERT(nvecs == lip->li_desc->lid_size); + iip->ili_format.ilf_size = nvecs; +} + + +/* + * This is called to pin the inode associated with the inode log + * item in memory so it cannot be written out. + */ +STATIC void +xfs_inode_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); +} + + +/* + * This is called to unpin the inode associated with the inode log + * item which was previously pinned with a call to xfs_inode_item_pin(). + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + */ +STATIC void +xfs_inode_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up(&ip->i_ipin_wait); +} + +/* + * This is called to attempt to lock the inode associated with this + * inode log item, in preparation for the push routine which does the actual + * iflush. Don't sleep on the inode lock or the flush lock. + * + * If the flush lock is already held, indicating that the inode has + * been or is in the process of being flushed, then (ideally) we'd like to + * see if the inode's buffer is still incore, and if so give it a nudge. + * We delay doing so until the pushbuf routine, though, to avoid holding + * the AIL lock across a call to the blackhole which is the buffer cache. + * Also we don't want to sleep in any device strategy routines, which can happen + * if we do the subsequent bawrite in here. 
+ */ +STATIC uint +xfs_inode_item_trylock( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + + if (!xfs_iflock_nowait(ip)) { + /* + * inode has already been flushed to the backing buffer, + * leave it locked in shared mode, pushbuf routine will + * unlock it. + */ + return XFS_ITEM_PUSHBUF; + } + + /* Stale items should force out the iclog */ + if (ip->i_flags & XFS_ISTALE) { + xfs_ifunlock(ip); + /* + * we hold the AIL lock - notify the unlock routine of this + * so it doesn't try to get the lock again. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); + return XFS_ITEM_PINNED; + } + +#ifdef DEBUG + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ASSERT(iip->ili_format.ilf_fields != 0); + ASSERT(iip->ili_logged == 0); + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + } +#endif + return XFS_ITEM_SUCCESS; +} + +/* + * Unlock the inode associated with the inode log item. + * Clear the fields of the inode and inode log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the inode. + */ +STATIC void +xfs_inode_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; + + ASSERT(iip->ili_inode->i_itemp != NULL); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + + /* + * Clear the transaction pointer in the inode. + */ + ip->i_transp = NULL; + + /* + * If the inode needed a separate buffer with which to log + * its extents, then free it now. + */ + if (iip->ili_extents_buf != NULL) { + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); + ASSERT(ip->i_d.di_nextents > 0); + ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); + ASSERT(ip->i_df.if_bytes > 0); + kmem_free(iip->ili_extents_buf); + iip->ili_extents_buf = NULL; + } + if (iip->ili_aextents_buf != NULL) { + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(ip->i_d.di_anextents > 0); + ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); + ASSERT(ip->i_afp->if_bytes > 0); + kmem_free(iip->ili_aextents_buf); + iip->ili_aextents_buf = NULL; + } + + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) { + xfs_iunlock(iip->ili_inode, lock_flags); + IRELE(iip->ili_inode); + } +} + +/* + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completes before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. 
It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. + */ +STATIC xfs_lsn_t +xfs_inode_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; + } + return lsn; +} + +/* + * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK + * failed to get the inode flush lock but did get the inode locked SHARED. + * Here we're trying to see if the inode buffer is incore, and if so whether it's + * marked delayed write. If that's the case, we'll promote it and that will + * allow the caller to write the buffer by triggering the xfsbufd to run. + */ +STATIC bool +xfs_inode_item_pushbuf( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp; + bool ret = true; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); + + /* + * If a flush is not in progress anymore, chances are that the + * inode was taken off the AIL. So, just get out. + */ + if (completion_done(&ip->i_flush) || + !(lip->li_flags & XFS_LI_IN_AIL)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return true; + } + + bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, + iip->ili_format.ilf_len, XBF_TRYLOCK); + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!bp) + return true; + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; + xfs_buf_relse(bp); + return ret; +} + +/* + * This is called to asynchronously write the inode associated with this + * inode log item out to disk. The inode will already have been locked by + * a successful call to xfs_inode_item_trylock(). + */ +STATIC void +xfs_inode_item_push( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); + + /* + * Since we were able to lock the inode's flush lock and + * we found it on the AIL, the inode must be dirty. This + * is because the inode is removed from the AIL while still + * holding the flush lock in xfs_iflush_done(). Thus, if + * we found it in the AIL and were able to obtain the flush + * lock without sleeping, then there must not have been + * anyone in the process of flushing the inode. + */ + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || + iip->ili_format.ilf_fields != 0); + + /* + * Push the inode to it's backing buffer. This will not remove the + * inode from the AIL - a further push will be required to trigger a + * buffer push. However, this allows all the dirty inodes to be pushed + * to the buffer before it is pushed to disk. The buffer IO completion + * will pull the inode from the AIL, mark it clean and unlock the flush + * lock. + */ + (void) xfs_iflush(ip, SYNC_TRYLOCK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); +} + +/* + * XXX rcc - this one really has to do something. Probably needs + * to stamp in a new field in the incore inode. 
+ */ +STATIC void +xfs_inode_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + INODE_ITEM(lip)->ili_last_lsn = lsn; +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_inode_item_ops = { + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_trylock = xfs_inode_item_trylock, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_pushbuf = xfs_inode_item_pushbuf, + .iop_committing = xfs_inode_item_committing +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + */ +void +xfs_inode_item_init( + struct xfs_inode *ip, + struct xfs_mount *mp) +{ + struct xfs_inode_log_item *iip; + + ASSERT(ip->i_itemp == NULL); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + + iip->ili_inode = ip; + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); + iip->ili_format.ilf_type = XFS_LI_INODE; + iip->ili_format.ilf_ino = ip->i_ino; + iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; + iip->ili_format.ilf_len = ip->i_imap.im_len; + iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; +} + +/* + * Free the inode log item and any memory hanging off of it. + */ +void +xfs_inode_item_destroy( + xfs_inode_t *ip) +{ +#ifdef XFS_TRANS_DEBUG + if (ip->i_itemp->ili_root_size != 0) { + kmem_free(ip->i_itemp->ili_orig_root); + } +#endif + kmem_zone_free(xfs_ili_zone, ip->i_itemp); +} + + +/* + * This is the inode flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the inode is + * flushed to disk. It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. + */ +void +xfs_iflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; + struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + prev = NULL; + while (blip != NULL) { + if (lip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + XFS_BUF_SET_FSPRIVATE(bp, next); + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + /* + * We only want to pull the item from the AIL if it is + * actually there and its location in the log has not + * changed since we started the flush. 
Thus, we only bother + * if the ili_logged flag is set and the inode's lsn has not + * changed. First we check the lsn outside + * the lock since it's cheaper, and then we recheck while + * holding the lock before removing the inode from the AIL. + */ + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); + } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i); + } + + + /* + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. + */ + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; + + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } +} + +/* + * This is the inode flushing abort routine. It is called + * from xfs_iflush when the filesystem is shutting down to clean + * up the inode state. + * It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + */ +void +xfs_iflush_abort( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip = ip->i_itemp; + + if (iip) { + struct xfs_ail *ailp = iip->ili_item.li_ailp; + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); + } else + spin_unlock(&ailp->xa_lock); + } + iip->ili_logged = 0; + /* + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. + */ + iip->ili_last_fields = 0; + /* + * Clear the inode logging fields so no more flushes are + * attempted. + */ + iip->ili_format.ilf_fields = 0; + } + /* + * Release the inode's flush lock since we're done with it. 
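The gather-then-batch shape of xfs_iflush_done() above (scan the buffer's item list without the AIL lock, then take the lock once for a bulk delete) can be illustrated with a plain singly linked list; nothing below is kernel API.

#include <stdio.h>
#include <stdbool.h>

/* Generic sketch of "scan once, then do one locked bulk operation". */
struct item {
	struct item *next;
	bool needs_removal;
	int id;
};

int main(void)
{
	struct item c = { NULL,  true, 3 };
	struct item b = { &c,   false, 2 };
	struct item a = { &b,    true, 1 };
	struct item *head = &a, *it;
	struct item *batch[8];
	int n = 0, i;

	/* Unlocked scan: decide which items need the expensive operation. */
	for (it = head; it; it = it->next)
		if (it->needs_removal)
			batch[n++] = it;

	/* One "locked" pass over just the gathered items (stand-in for
	 * taking the AIL lock once and calling the bulk-delete helper). */
	printf("bulk-removing %d items:", n);
	for (i = 0; i < n; i++)
		printf(" %d", batch[i]->id);
	printf("\n");
	return 0;
}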
+ */ + xfs_ifunlock(ip); +} + +void +xfs_istale_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); +} + +/* + * convert an xfs_inode_log_format struct from either 32 or 64 bit versions + * (which can have different field alignments) to the native version + */ +int +xfs_inode_item_format_convert( + xfs_log_iovec_t *buf, + xfs_inode_log_format_t *in_f) +{ + if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f32->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; + } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; + + in_f->ilf_type = in_f64->ilf_type; + in_f->ilf_size = in_f64->ilf_size; + in_f->ilf_fields = in_f64->ilf_fields; + in_f->ilf_asize = in_f64->ilf_asize; + in_f->ilf_dsize = in_f64->ilf_dsize; + in_f->ilf_ino = in_f64->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f64->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f64->ilf_blkno; + in_f->ilf_len = in_f64->ilf_len; + in_f->ilf_boffset = in_f64->ilf_boffset; + return 0; + } + return EFSCORRUPTED; +} diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h new file mode 100644 index 00000000..d3dee61e --- /dev/null +++ b/fs/xfs/xfs_inode_item.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_ITEM_H__ +#define __XFS_INODE_ITEM_H__ + +/* + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. 
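A stand-alone sketch of selecting the decoder by iovec length, as xfs_inode_item_format_convert() above does for the 32- and 64-bit layouts defined just below; the structs here are simplified stand-ins whose only point is that packing changes sizeof.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-ins: same logical fields, different logged sizes. */
struct fmt_packed {			/* 32-bit era layout: no padding */
	uint16_t type;
	uint64_t ino;
} __attribute__((packed));

struct fmt_native {			/* natural alignment: padding before ino */
	uint16_t type;
	uint64_t ino;
};

/* Pick the decoder purely by the length of the logged region and
 * return 0 on success, as the conversion routine above does. */
static int convert(const void *buf, size_t len, struct fmt_native *out)
{
	if (len == sizeof(struct fmt_packed)) {
		const struct fmt_packed *in = buf;

		out->type = in->type;
		out->ino = in->ino;
		return 0;
	}
	if (len == sizeof(struct fmt_native)) {
		memcpy(out, buf, sizeof(*out));
		return 0;
	}
	return -1;			/* unexpected size: treat as corrupt */
}

int main(void)
{
	struct fmt_packed old = { 0x123, 42 };
	struct fmt_native native;

	printf("sizeof packed=%zu native=%zu\n",
	       sizeof(struct fmt_packed), sizeof(struct fmt_native));
	if (convert(&old, sizeof(old), &native) == 0)
		printf("decoded ino=%llu\n", (unsigned long long)native.ino);
	return 0;
}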
+ */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? 
XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_bmbt_rec; +struct xfs_inode; +struct xfs_mount; + + +typedef struct xfs_inode_log_item { + xfs_log_item_t ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + unsigned short ili_lock_flags; /* lock flags */ + unsigned short ili_logged; /* flushed logged data */ + unsigned int ili_last_fields; /* fields when flushed */ + struct xfs_bmbt_rec *ili_extents_buf; /* array of logged + data exts */ + struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged + attr exts */ +#ifdef XFS_TRANS_DEBUG + int ili_root_size; + char *ili_orig_root; +#endif + xfs_inode_log_format_t ili_format; /* logged structure */ +} xfs_inode_log_item_t; + + +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return (!ip->i_itemp || + !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && + !ip->i_update_core; +} + +extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); +extern void xfs_inode_item_destroy(struct xfs_inode *); +extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_abort(struct xfs_inode *); +extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, + xfs_inode_log_format_t *); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h new file mode 100644 index 00000000..b8e4ee4e --- /dev/null +++ b/fs/xfs/xfs_inum.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INUM_H__ +#define __XFS_INUM_H__ + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ + +typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ + +/* + * Useful inode bits for this kernel. + * Used in some places where having 64-bits in the 32-bit kernels + * costs too much. 
+ */ +#if XFS_BIG_INUMS +typedef xfs_ino_t xfs_intino_t; +#else +typedef __uint32_t xfs_intino_t; +#endif + +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + +struct xfs_mount; + +#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog +#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog +#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log +#define XFS_INO_BITS(mp) \ + XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) + +#if XFS_BIG_INUMS +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#else +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) +#endif +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +#endif /* __XFS_INUM_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c new file mode 100644 index 00000000..091d82b9 --- /dev/null +++ b/fs/xfs/xfs_iomap.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
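A small sketch of the agno/agbno/offset packing implemented by the XFS_INO_* macros above; the log2 geometry values are made up for the example rather than read from a superblock.

#include <stdint.h>
#include <stdio.h>

/* Example geometry (made up, would normally come from the superblock):
 *   inopblog = 2  -> 4 inodes per block
 *   agblklog = 16 -> up to 65536 blocks per allocation group
 * so an AG-relative inode number (agino) uses 18 bits and the AG number
 * occupies the bits above that. */
#define INOPBLOG	2
#define AGBLKLOG	16
#define AGINO_BITS	(INOPBLOG + AGBLKLOG)

#define INO_MASK(k)	((1ULL << (k)) - 1)

static uint64_t agino_to_ino(uint32_t agno, uint32_t agino)
{
	return ((uint64_t)agno << AGINO_BITS) | agino;	/* cf. XFS_AGINO_TO_INO */
}

static void ino_decode(uint64_t ino)
{
	uint32_t agno  = ino >> AGINO_BITS;		/* cf. XFS_INO_TO_AGNO */
	uint32_t agino = ino & INO_MASK(AGINO_BITS);	/* cf. XFS_INO_TO_AGINO */
	uint32_t agbno = agino >> INOPBLOG;		/* cf. XFS_INO_TO_AGBNO */
	uint32_t off   = agino & INO_MASK(INOPBLOG);	/* cf. XFS_INO_TO_OFFSET */

	printf("ino %llu -> agno %u, agbno %u, offset %u\n",
	       (unsigned long long)ino, agno, agbno, off);
}

int main(void)
{
	/* Inode 3 in block 100 of allocation group 5. */
	uint64_t ino = agino_to_ino(5, (100u << INOPBLOG) | 3);

	ino_decode(ino);
	return 0;
}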
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" + + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP + +STATIC int +xfs_iomap_eof_align_last_fsb( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_fileoff_t new_last_fsb = 0; + xfs_extlen_t align; + int eof, error; + + if (XFS_IS_REALTIME_INODE(ip)) + ; + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. + */ + else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && + (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) + new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) + new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + + /* + * Always round up the allocation request to an extent boundary + * (when file on a real-time subvolume or has di_extsize hint). + */ + if (extsize) { + if (new_last_fsb) + align = roundup_64(new_last_fsb, extsize); + else + align = extsize; + new_last_fsb = roundup_64(*last_fsb, align); + } + + if (new_last_fsb) { + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) + return error; + if (eof) + *last_fsb = new_last_fsb; + } + return 0; +} + +STATIC int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return EFSCORRUPTED; +} + +int +xfs_iomap_write_direct( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nmaps) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t count_fsb, resaligned; + xfs_fsblock_t firstfsb; + xfs_extlen_t extsz, temp; + int nimaps; + int bmapi_flag; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. 
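A stand-alone sketch of the round-up logic in xfs_iomap_eof_align_last_fsb() above: align the requested end first to the stripe width, then keep it a multiple of the extent size hint. The block counts are example values only.

#include <stdint.h>
#include <stdio.h>

/* Round 'x' up to the next multiple of 'align' (cf. roundup_64()). */
static uint64_t roundup64(uint64_t x, uint64_t align)
{
	return ((x + align - 1) / align) * align;
}

int main(void)
{
	uint64_t last_fsb = 1000;	/* requested allocation end, in blocks */
	uint64_t swidth   = 128;	/* example stripe width, in blocks */
	uint64_t extsize  = 16;		/* example extent size hint, in blocks */
	uint64_t new_last_fsb, align;

	new_last_fsb = roundup64(last_fsb, swidth);	/* stripe alignment */
	align = roundup64(new_last_fsb, extsize);	/* keep an extsize multiple */
	new_last_fsb = roundup64(last_fsb, align);

	printf("last_fsb %llu -> %llu\n",
	       (unsigned long long)last_fsb, (unsigned long long)new_last_fsb);
	return 0;
}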
+ */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return XFS_ERROR(error); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > ip->i_size) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + goto error_out; + } else { + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + last_fsb = MIN(last_fsb, (xfs_fileoff_t) + imap->br_blockcount + + imap->br_startoff); + } + count_fsb = last_fsb - offset_fsb; + ASSERT(count_fsb > 0); + + resaligned = count_fsb; + if (unlikely(extsz)) { + if ((temp = do_mod(offset_fsb, extsz))) + resaligned += temp; + if ((temp = do_mod(resaligned, extsz))) + resaligned += extsz - temp; + } + + if (unlikely(rt)) { + resrtextents = qblocks = resaligned; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), resrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + /* + * Check for running out of space, note: need lock to return + */ + if (error) + xfs_trans_cancel(tp, 0); + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_out; + + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip); + + bmapi_flag = XFS_BMAPI_WRITE; + if (offset < ip->i_size || extsz) + bmapi_flag |= XFS_BMAPI_PREALLOC; + + /* + * Issue the xfs_bmapi() call to allocate the blocks. + * + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); + if (error) + goto error0; + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error0; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_out; + + /* + * Copy any maps to caller's array and return any error. + */ + if (nimaps == 0) { + error = ENOSPC; + goto error_out; + } + + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + error = xfs_alert_fsblock_zero(ip, imap); + goto error_out; + } + + return 0; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + +error_out: + return XFS_ERROR(error); +} + +/* + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it it not needed. 
+ */ +STATIC int +xfs_iomap_eof_want_preallocate( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nimaps, + int *prealloc) +{ + xfs_fileoff_t start_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstblock; + int n, error, imaps; + int found_delalloc = 0; + + *prealloc = 0; + if ((offset + count) <= ip->i_size) + return 0; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation. + */ + start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + while (count_fsb > 0) { + imaps = nimaps; + firstblock = NULLFSBLOCK; + error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, + &firstblock, 0, imap, &imaps, NULL); + if (error) + return error; + for (n = 0; n < imaps; n++) { + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) + return 0; + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; + } + } + if (!found_delalloc) + *prealloc = 1; + return 0; +} + +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum prealocation. + */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + xfs_fsblock_t alloc_blocks = 0; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + int shift = 0; + int64_t freesp; + + /* + * rounddown_pow_of_two() returns an undefined result + * if we pass in alloc_blocks = 0. Hence the "+ 1" to + * ensure we always pass in a non-zero value. + */ + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, + rounddown_pow_of_two(alloc_blocks)); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + if (shift) + alloc_blocks >>= shift; + } + + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + return alloc_blocks; +} + +int +xfs_iomap_write_delay( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *ret_imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_off_t aligned_offset; + xfs_fileoff_t ioalign; + xfs_fsblock_t firstblock; + xfs_extlen_t extsz; + int nimaps; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; + int prealloc, flushed = 0; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. 
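A simplified, user-space rendering of the xfs_iomap_prealloc_size() heuristic above: preallocate roughly the current file size (rounded down to a power of two), then halve it once per low-space threshold crossed. The 5%..1% thresholds stand in for the mp->m_low_space[] values and are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Largest power of two <= x (cf. rounddown_pow_of_two()); x must be non-zero. */
static uint64_t rounddown_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p <= x / 2)
		p <<= 1;
	return p;
}

/* Example-only sizing: start from the file size in blocks, then shrink
 * as free space drops below successive thresholds. */
static uint64_t prealloc_blocks(uint64_t size_blocks, uint64_t free_blocks,
				uint64_t total_blocks)
{
	uint64_t alloc = rounddown_pow2(size_blocks + 1);	/* "+ 1" avoids zero */
	int shift = 0;
	int pct;

	/* Hypothetical thresholds at 5%, 4%, 3%, 2% and 1% free space. */
	for (pct = 5; pct >= 1; pct--)
		if (free_blocks * 100 < (uint64_t)pct * total_blocks)
			shift++;
	if (shift)
		shift++;	/* crossing the first threshold costs an extra halving */
	return alloc >> shift;
}

int main(void)
{
	printf("plenty of space: %llu blocks\n",
	       (unsigned long long)prealloc_blocks(8192, 500000, 1000000));
	printf("2.5%% free:       %llu blocks\n",
	       (unsigned long long)prealloc_blocks(8192, 25000, 1000000));
	return 0;
}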
+ */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return XFS_ERROR(error); + + extsz = xfs_get_extsz_hint(ip); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, + imap, XFS_WRITE_IMAPS, &prealloc); + if (error) + return error; + +retry: + if (prealloc) { + xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); + ioalign = XFS_B_TO_FSBT(mp, aligned_offset); + last_fsb = ioalign + alloc_blocks; + } else { + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + } + + if (prealloc || extsz) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + return error; + } + + nimaps = XFS_WRITE_IMAPS; + firstblock = NULLFSBLOCK; + error = xfs_bmapi(NULL, ip, offset_fsb, + (xfs_filblks_t)(last_fsb - offset_fsb), + XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | + XFS_BMAPI_ENTIRE, &firstblock, 1, imap, + &nimaps, NULL); + switch (error) { + case 0: + case ENOSPC: + case EDQUOT: + break; + default: + return XFS_ERROR(error); + } + + /* + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For + * ENOSPC, * flush all other inodes with delalloc blocks to free up + * some of the excess reserved metadata space. For both cases, retry + * without EOF preallocation. + */ + if (nimaps == 0) { + trace_xfs_delalloc_enospc(ip, offset, count); + if (flushed) + return XFS_ERROR(error ? error : ENOSPC); + + if (error == ENOSPC) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + flushed = 1; + error = 0; + prealloc = 0; + goto retry; + } + + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap[0]); + + *ret_imap = imap[0]; + return 0; +} + +/* + * Pass in a delayed allocate extent, convert it to real extents; + * return to the caller the extent we create which maps on top of + * the originating callers request. + * + * Called without a lock on the inode. + * + * We no longer bother to look at the incoming map - all we have to + * guarantee is that whatever we allocate fills the required range. + */ +int +xfs_iomap_write_allocate( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, last_block; + xfs_fileoff_t end_fsb, map_start_fsb; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + xfs_trans_t *tp; + int nimaps, committed; + int error = 0; + int nres; + + /* + * Make sure that the dquots are there. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return XFS_ERROR(error); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; + + XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. 
+ */ + + nimaps = 0; + while (nimaps == 0) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; + nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_reserve(tp, nres, + XFS_WRITE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + xfs_bmap_init(&free_list, &first_block); + + /* + * it is possible that the extents have changed since + * we did the read call as we dropped the ilock for a + * while. We have to be careful about truncates or hole + * punchs here - we are not allowed to allocate + * non-delalloc blocks here. + * + * The only protection against truncation is the pages + * for the range we are being asked to convert are + * locked and hence a truncate will block on them + * first. + * + * As a result, if we go beyond the range we really + * need and hit an delalloc extent boundary followed by + * a hole while we have excess blocks in the map, we + * will fill the hole incorrectly and overrun the + * transaction reservation. + * + * Using a single map prevents this as we are forced to + * check each map we look for overlap with the desired + * range and abort as soon as we find it. Also, given + * that we only return a single map, having one beyond + * what we can return is probably a bit silly. + * + * We also need to check that we don't go beyond EOF; + * this is a truncate optimisation as a truncate sets + * the new file size before block on the pages we + * currently have locked under writeback. Because they + * are about to be tossed, we don't need to write them + * back.... + */ + nimaps = 1; + end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; + + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + error = EAGAIN; + goto trans_cancel; + } + } + + /* + * Go get the actual blocks. + * + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ + error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, + XFS_BMAPI_WRITE, &first_block, 1, + imap, &nimaps, &free_list); + if (error) + goto trans_cancel; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the callers request + */ + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { + XFS_STATS_INC(xs_xstrat_quick); + return 0; + } + + /* + * So far we have not mapped the requested part of the + * file, just surrounding data, try again. 
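[Editorial aside, not part of the patch] The allocation loop above clamps the range it converts so it never runs past the larger of the in-core EOF and the last offset that already has extents, and asks the caller to retry (EAGAIN) if the clamp leaves nothing to do. A small sketch of just that clamping step, with illustrative names and a plain error return in place of the XFS error codes:

#include <stdint.h>

static int clamp_to_eof(uint64_t map_start_fsb, uint64_t *count_fsb,
			uint64_t end_fsb,	/* in-core EOF, in blocks */
			uint64_t last_mapped)	/* last offset with extents */
{
	uint64_t last_block = last_mapped > end_fsb ? last_mapped : end_fsb;

	if (map_start_fsb + *count_fsb > last_block) {
		*count_fsb = last_block - map_start_fsb;
		if (*count_fsb == 0)
			return -1;	/* nothing left: caller should retry */
	}
	return 0;
}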
+ */ + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; + } + +trans_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} + +int +xfs_iomap_write_unwritten( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_filblks_t count_fsb; + xfs_filblks_t numblks_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + xfs_bmap_free_t free_list; + uint resblks; + int committed; + int error; + + trace_xfs_unwritten_convert(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + + do { + /* + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk to recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock. + */ + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + /* + * Modify the unwritten extent state of the buffer. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + 1, &imap, &nimaps, &free_list); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_bmap_finish(&(tp), &(free_list), &committed); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return XFS_ERROR(error); + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap); + + if ((numblks_fsb = imap.br_blockcount) == 0) { + /* + * The numblks_fsb value should always get + * smaller, otherwise the loop is stuck. + */ + ASSERT(imap.br_blockcount); + break; + } + offset_fsb += numblks_fsb; + count_fsb -= numblks_fsb; + } while (count_fsb > 0); + + return 0; + +error_on_bmapi_transaction: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h new file mode 100644 index 00000000..80615760 --- /dev/null +++ b/fs/xfs/xfs_iomap.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2003-2005 Silicon Graphics, Inc. 
+ * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOMAP_H__ +#define __XFS_IOMAP_H__ + +struct xfs_inode; +struct xfs_bmbt_irec; + +extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); + +#endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c new file mode 100644 index 00000000..751e94fe --- /dev/null +++ b/fs/xfs/xfs_itable.c @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_trace.h" + +STATIC int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); +} + +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... 
*/ +{ + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct inode *inode; + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ + + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return XFS_ERROR(EINVAL); + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return XFS_ERROR(ENOMEM); + + error = xfs_iget(mp, NULL, ino, + XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); + if (error) { + *stat = BULKSTAT_RV_NOTHING; + goto out_free; + } + + ASSERT(ip != NULL); + ASSERT(ip->i_imap.im_blkno != 0); + + dic = &ip->i_d; + inode = VFS_I(ip); + + /* xfs_iget returns the following without needing + * further change. + */ + buf->bs_nlink = dic->di_nlink; + buf->bs_projid_lo = dic->di_projid_lo; + buf->bs_projid_hi = dic->di_projid_hi; + buf->bs_ino = ino; + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; + buf->bs_size = dic->di_size; + + /* + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. + */ + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + + buf->bs_xflags = xfs_ip2xflags(ip); + buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; + buf->bs_extents = dic->di_nextents; + buf->bs_gen = dic->di_gen; + memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + buf->bs_dmevmask = dic->di_dmevmask; + buf->bs_dmstate = dic->di_dmstate; + buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); + + switch (dic->di_format) { + case XFS_DINODE_FMT_DEV: + buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_blksize = BLKDEV_IOSIZE; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; + break; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); + + error = formatter(buffer, ubsize, ubused, buf); + + if (!error) + *stat = BULKSTAT_RV_DIDONE; + + out_free: + kmem_free(buf); + return error; +} + +/* Return 0 on success or positive error */ +STATIC int +xfs_bulkstat_one_fmt( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + if (ubsize < sizeof(*buffer)) + return XFS_ERROR(ENOMEM); + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*buffer); + return 0; +} + +int +xfs_bulkstat_one( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt, ubused, stat); +} + +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + +/* + * Return stat information in bulk (by-inode) for the filesystem. 
+ */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* last inode returned */ + int *ubcountp, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size, /* sizeof struct filling */ + char __user *ubuffer, /* buffer with inode stats */ + int *done) /* 1 if there are more stats to get */ +{ + xfs_agblock_t agbno=0;/* allocation group block number */ + xfs_buf_t *agbp; /* agi header buffer */ + xfs_agi_t *agi; /* agi header data */ + xfs_agino_t agino; /* inode # in allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + int chunkidx; /* current index into inode chunk */ + int clustidx; /* current index into inode cluster */ + xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ + int end_of_ag; /* set if we've seen the ag end */ + int error; /* error code */ + int fmterror;/* bulkstat formatter result */ + int i; /* loop index */ + int icount; /* count of inodes good in irbuf */ + size_t irbsize; /* size of irec buffer in bytes */ + xfs_ino_t ino; /* inode number (filesystem) */ + xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ + xfs_ino_t lastino; /* last inode number returned */ + int nbcluster; /* # of blocks in a cluster */ + int nicluster; /* # of inodes in a cluster */ + int nimask; /* mask for inode clusters */ + int nirbuf; /* size of irbuf */ + int rval; /* return value error code */ + int tmp; /* result value from btree calls */ + int ubcount; /* size of user's buffer */ + int ubleft; /* bytes left in user's buffer */ + char __user *ubufp; /* pointer into user's buffer */ + int ubelem; /* spaces used in user's buffer */ + int ubused; /* bytes used by formatter */ + xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ + + /* + * Get the last inode value, see if there's nothing to do. + */ + ino = (xfs_ino_t)*lastinop; + lastino = ino; + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + if (agno >= mp->m_sb.sb_agcount || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + *done = 1; + *ubcountp = 0; + return 0; + } + if (!ubcountp || *ubcountp <= 0) { + return EINVAL; + } + ubcount = *ubcountp; /* statstruct's */ + ubleft = ubcount * statstruct_size; /* bytes */ + *ubcountp = ubelem = 0; + *done = 0; + fmterror = 0; + ubufp = ubuffer; + nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? + mp->m_sb.sb_inopblock : + (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); + nimask = ~(nicluster - 1); + nbcluster = nicluster >> mp->m_sb.sb_inopblog; + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return ENOMEM; + + nirbuf = irbsize / sizeof(*irbuf); + + /* + * Loop over the allocation groups, starting from the last + * inode returned; 0 means start of the allocation group. + */ + rval = 0; + while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + cond_resched(); + bp = NULL; + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) { + /* + * Skip this allocation group and go to the next one. + */ + agno++; + agino = 0; + continue; + } + agi = XFS_BUF_TO_AGI(agbp); + /* + * Allocate and initialize a btree cursor for ialloc btree. 
+ */ + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + irbp = irbuf; + irbufend = irbuf + nirbuf; + end_of_ag = 0; + /* + * If we're returning in the middle of an allocation group, + * we need to get the remainder of the chunk we're in. + */ + if (agino > 0) { + xfs_inobt_rec_incore_t r; + + /* + * Lookup the inode chunk that this inode lives in. + */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, + &tmp); + if (!error && /* no I/O error */ + tmp && /* lookup succeeded */ + /* got the record, should always work */ + !(error = xfs_inobt_get_rec(cur, &r, &i)) && + i == 1 && + /* this is the right chunk */ + agino < r.ir_startino + XFS_INODES_PER_CHUNK && + /* lastino was not last in chunk */ + (chunkidx = agino - r.ir_startino + 1) < + XFS_INODES_PER_CHUNK && + /* there are some left allocated */ + xfs_inobt_maskn(chunkidx, + XFS_INODES_PER_CHUNK - chunkidx) & + ~r.ir_free) { + /* + * Grab the chunk record. Mark all the + * uninteresting inodes (because they're + * before our start point) free. + */ + for (i = 0; i < chunkidx; i++) { + if (XFS_INOBT_MASK(i) & ~r.ir_free) + r.ir_freecount++; + } + r.ir_free |= xfs_inobt_maskn(0, chunkidx); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + icount = XFS_INODES_PER_CHUNK - r.ir_freecount; + } else { + /* + * If any of those tests failed, bump the + * inode number (just in case). + */ + agino++; + icount = 0; + } + /* + * In any case, increment to the next record. + */ + if (!error) + error = xfs_btree_increment(cur, 0, &tmp); + } else { + /* + * Start of ag. Lookup the first inode chunk. + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); + icount = 0; + } + /* + * Loop through inode btree records in this ag, + * until we run out of inodes or space in the buffer. + */ + while (irbp < irbufend && icount < ubcount) { + xfs_inobt_rec_incore_t r; + + /* + * Loop as long as we're unable to read the + * inode btree. + */ + while (error) { + agino += XFS_INODES_PER_CHUNK; + if (XFS_AGINO_TO_AGBNO(mp, agino) >= + be32_to_cpu(agi->agi_length)) + break; + error = xfs_inobt_lookup(cur, agino, + XFS_LOOKUP_GE, &tmp); + cond_resched(); + } + /* + * If ran off the end of the ag either with an error, + * or the normal way, set end and stop collecting. + */ + if (error) { + end_of_ag = 1; + break; + } + + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + end_of_ag = 1; + break; + } + + /* + * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. + */ + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + /* + * Loop over all clusters in the next chunk. + * Do a readahead if there are any allocated + * inodes in that cluster. + */ + agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); + for (chunkidx = 0; + chunkidx < XFS_INODES_PER_CHUNK; + chunkidx += nicluster, + agbno += nbcluster) { + if (xfs_inobt_maskn(chunkidx, nicluster) + & ~r.ir_free) + xfs_btree_reada_bufs(mp, agno, + agbno, nbcluster); + } + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + } + /* + * Set agino to after this chunk and bump the cursor. + */ + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + error = xfs_btree_increment(cur, 0, &tmp); + cond_resched(); + } + /* + * Drop the btree buffers and the agi buffer. + * We can't hold any of the locks these represent + * when calling iget. 
+ */ + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + /* + * Now format all the good inodes into the user's buffer. + */ + irbufend = irbp; + for (irbp = irbuf; + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { + /* + * Now process this chunk of inodes. + */ + for (agino = irbp->ir_startino, chunkidx = clustidx = 0; + XFS_BULKSTAT_UBLEFT(ubleft) && + irbp->ir_freecount < XFS_INODES_PER_CHUNK; + chunkidx++, clustidx++, agino++) { + ASSERT(chunkidx < XFS_INODES_PER_CHUNK); + /* + * Recompute agbno if this is the + * first inode of the cluster. + * + * Careful with clustidx. There can be + * multiple clusters per chunk, a single + * cluster per chunk or a cluster that has + * inodes represented from several different + * chunks (if blocksize is large). + * + * Because of this, the starting clustidx is + * initialized to zero in this loop but must + * later be reset after reading in the cluster + * buffer. + */ + if ((chunkidx & (nicluster - 1)) == 0) { + agbno = XFS_AGINO_TO_AGBNO(mp, + irbp->ir_startino) + + ((chunkidx & nimask) >> + mp->m_sb.sb_inopblog); + } + ino = XFS_AGINO_TO_INO(mp, agno, agino); + /* + * Skip if this inode is free. + */ + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { + lastino = ino; + continue; + } + /* + * Count used inodes as free so we can tell + * when the chunk is used up. + */ + irbp->ir_freecount++; + + /* + * Get the inode and fill in a single buffer. + */ + ubused = statstruct_size; + error = formatter(mp, ino, ubufp, ubleft, + &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_NOTHING) { + if (error && error != ENOENT && + error != EINVAL) { + ubleft = 0; + rval = error; + break; + } + lastino = ino; + continue; + } + if (fmterror == BULKSTAT_RV_GIVEUP) { + ubleft = 0; + ASSERT(error); + rval = error; + break; + } + if (ubufp) + ubufp += ubused; + ubleft -= ubused; + ubelem++; + lastino = ino; + } + + cond_resched(); + } + + if (bp) + xfs_buf_relse(bp); + + /* + * Set up for the next loop iteration. + */ + if (XFS_BULKSTAT_UBLEFT(ubleft)) { + if (end_of_ag) { + agno++; + agino = 0; + } else + agino = XFS_INO_TO_AGINO(mp, lastino); + } else + break; + } + /* + * Done, we're either out of filesystem or space to put the data. + */ + kmem_free_large(irbuf); + *ubcountp = ubelem; + /* + * Found some inodes, return them now and return the error next time. + */ + if (ubelem) + rval = 0; + if (agno >= mp->m_sb.sb_agcount) { + /* + * If we ran out of filesystem, mark lastino as off + * the end of the filesystem, so the next call + * will return immediately. + */ + *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); + *done = 1; + } else + *lastinop = (xfs_ino_t)lastino; + + return rval; +} + +/* + * Return stat information in bulk (by-inode) for the filesystem. + * Special case for non-sequential one inode bulkstat. + */ +int /* error status */ +xfs_bulkstat_single( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* inode to return */ + char __user *buffer, /* buffer with inode stats */ + int *done) /* 1 if there are more stats to get */ +{ + int count; /* count value for bulkstat call */ + int error; /* return value */ + xfs_ino_t ino; /* filesystem inode number */ + int res; /* result from bs1 */ + + /* + * note that requesting valid inode numbers which are not allocated + * to inodes will most likely cause xfs_itobp to generate warning + * messages about bad magic numbers. This is ok. The fact that + * the inode isn't actually an inode is handled by the + * error check below. 
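[Editorial aside, not part of the patch] xfs_bulkstat() above is a cursor-driven iterator: *lastinop carries the last inode returned, *ubcountp returns how many stat structures were filled in, and *done is set once the whole filesystem has been walked. The sketch below shows how a hypothetical in-kernel caller would drive it, assuming the declarations from this file; the buffer handling is deliberately simplified and the function name is invented.

static int walk_all_inodes(struct xfs_mount *mp, char __user *ubuffer,
			   int ubcount)
{
	xfs_ino_t	lastino = 0;	/* 0 = start from the first inode */
	int		done = 0;
	int		error = 0;

	while (!done) {
		int count = ubcount;	/* slots available this pass */

		error = xfs_bulkstat(mp, &lastino, &count,
				     xfs_bulkstat_one, sizeof(xfs_bstat_t),
				     ubuffer, &done);
		if (error || count == 0)
			break;
		/*
		 * "count" entries of sizeof(xfs_bstat_t) were written to
		 * ubuffer; a real caller would consume them here before
		 * looping with the updated lastino cursor.
		 */
	}
	return error;
}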
Done this way to make the usual case faster + * at the expense of the error case. + */ + + ino = (xfs_ino_t)*lastinop; + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); + if (error) { + /* + * Special case way failed, do it the "long" way + * to see if that works. + */ + (*lastinop)--; + count = 1; + if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), buffer, done)) + return error; + if (count == 0 || (xfs_ino_t)*lastinop != ino) + return error == EFSCORRUPTED ? + XFS_ERROR(EINVAL) : error; + else + return 0; + } + *done = 0; + return 0; +} + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + +/* + * Return inode number table for the filesystem. + */ +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) +{ + xfs_buf_t *agbp; + xfs_agino_t agino; + xfs_agnumber_t agno; + int bcount; + xfs_inogrp_t *buffer; + int bufidx; + xfs_btree_cur_t *cur; + int error; + xfs_inobt_rec_incore_t r; + int i; + xfs_ino_t ino; + int left; + int tmp; + + ino = (xfs_ino_t)*lastino; + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + left = *count; + *count = 0; + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); + buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + error = bufidx = 0; + cur = NULL; + agbp = NULL; + while (left > 0 && agno < mp->m_sb.sb_agcount) { + if (agbp == NULL) { + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) { + /* + * If we can't read the AGI of this ag, + * then just skip to the next one. + */ + ASSERT(cur == NULL); + agbp = NULL; + agno++; + agino = 0; + continue; + } + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &tmp); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + /* + * Move up the last inode in the current + * chunk. The lookup_ge will always get + * us the first inode in the next chunk. + */ + agino += XFS_INODES_PER_CHUNK - 1; + continue; + } + } + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + xfs_buf_relse(agbp); + agbp = NULL; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = NULL; + agno++; + agino = 0; + continue; + } + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; + bufidx++; + left--; + if (bufidx == bcount) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) { + error = XFS_ERROR(EFAULT); + break; + } + ubuffer += written; + *count += bufidx; + bufidx = 0; + } + if (left) { + error = xfs_btree_increment(cur, 0, &tmp); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + /* + * The agino value has already been bumped. + * Just try to skip up to it. 
+ */ + agino += XFS_INODES_PER_CHUNK; + continue; + } + } + } + if (!error) { + if (bufidx) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) + error = XFS_ERROR(EFAULT); + else + *count += bufidx; + } + *lastino = XFS_AGINO_TO_INO(mp, agno, agino); + } + kmem_free(buffer); + if (cur) + xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR)); + if (agbp) + xfs_buf_relse(agbp); + return error; +} diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h new file mode 100644 index 00000000..97295d91 --- /dev/null +++ b/fs/xfs/xfs_itable.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ITABLE_H__ +#define __XFS_ITABLE_H__ + +/* + * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat + * structures (by the dmi library). This is a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c + */ +typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +/* + * Values for stat return value. + */ +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 + +/* + * Return stat information in bulk (by-inode) for the filesystem. 
+ */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size,/* sizeof struct that we're filling */ + char __user *ubuffer,/* buffer with inode stats */ + int *done); /* 1 if there are more stats to get */ + +int +xfs_bulkstat_single( + xfs_mount_t *mp, + xfs_ino_t *lastinop, + char __user *buffer, + int *done); + +typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ + void __user *ubuffer, /* buffer to write to */ + int ubsize, /* remaining user buffer sz */ + int *ubused, /* bytes used by formatter */ + const xfs_bstat_t *buffer); /* buffer to read from */ + +int +xfs_bulkstat_one_int( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + bulkstat_one_fmt_pf formatter, + int *ubused, + int *stat); + +int +xfs_bulkstat_one( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *last, /* last inode returned */ + int *count, /* size of buffer/count returned */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); + +#endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c new file mode 100644 index 00000000..41d5b8f2 --- /dev/null +++ b/fs/xfs/xfs_log.c @@ -0,0 +1,3767 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_log_recover.h" +#include "xfs_trans_priv.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_rw.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_log_ticket_zone; + +/* Local miscellaneous function prototypes */ +STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, + xlog_in_core_t **, xfs_lsn_t *); +STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks); +STATIC int xlog_space_left(struct log *log, atomic64_t *head); +STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void xlog_dealloc_log(xlog_t *log); + +/* local state machine functions */ +STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); +STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); +STATIC int xlog_state_get_iclog_space(xlog_t *log, + int len, + xlog_in_core_t **iclog, + xlog_ticket_t *ticket, + int *continued_write, + int *logoffsetp); +STATIC int xlog_state_release_iclog(xlog_t *log, + xlog_in_core_t *iclog); +STATIC void xlog_state_switch_iclogs(xlog_t *log, + xlog_in_core_t *iclog, + int eventual_size); +STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); + +/* local functions to manipulate grant head */ +STATIC int xlog_grant_log_space(xlog_t *log, + xlog_ticket_t *xtic); +STATIC void xlog_grant_push_ail(struct log *log, + int need_bytes); +STATIC void xlog_regrant_reserve_log_space(xlog_t *log, + xlog_ticket_t *ticket); +STATIC int xlog_regrant_write_log_space(xlog_t *log, + xlog_ticket_t *ticket); +STATIC void xlog_ungrant_log_space(xlog_t *log, + xlog_ticket_t *ticket); + +#if defined(DEBUG) +STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); +STATIC void xlog_verify_grant_tail(struct log *log); +STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, + int count, boolean_t syncing); +STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, + xfs_lsn_t tail_lsn); +#else +#define xlog_verify_dest_ptr(a,b) +#define xlog_verify_grant_tail(a) +#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_tail_lsn(a,b,c) +#endif + +STATIC int xlog_iclogs_empty(xlog_t *log); + +static void +xlog_grant_sub_space( + struct log *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +static void +xlog_grant_add_space( + struct log *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int tmp; + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + tmp = log->l_logsize - space; + 
if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +static void +xlog_tic_reset_res(xlog_ticket_t *tic) +{ + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + tic->t_res_num_ophdrs = 0; +} + +static void +xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) +{ + if (tic->t_res_num == XLOG_TIC_LEN_MAX) { + /* add to overflow and start again */ + tic->t_res_o_flow += tic->t_res_arr_sum; + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + } + + tic->t_res_arr[tic->t_res_num].r_len = len; + tic->t_res_arr[tic->t_res_num].r_type = type; + tic->t_res_arr_sum += len; + tic->t_res_num++; +} + +/* + * NOTES: + * + * 1. currblock field gets updated at startup and after in-core logs + * marked as with WANT_SYNC. + */ + +/* + * This routine is called when a user of a log manager ticket is done with + * the reservation. If the ticket was ever used, then a commit record for + * the associated transaction is written out as a log operation header with + * no data. The flag XLOG_TIC_INITED is set when the first write occurs with + * a given ticket. If the ticket was one with a permanent reservation, then + * a few operations are done differently. Permanent reservation tickets by + * default don't release the reservation. They just commit the current + * transaction with the belief that the reservation is still needed. A flag + * must be passed in before permanent reservations are actually released. + * When these type of tickets are not released, they need to be set into + * the inited state again. By doing this, a start record will be written + * out when the next write occurs. + */ +xfs_lsn_t +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) +{ + struct log *log = mp->m_log; + xfs_lsn_t lsn = 0; + + if (XLOG_FORCED_SHUTDOWN(log) || + /* + * If nothing was ever written, don't write out commit record. + * If we get an error, just continue and give back the log ticket. + */ + (((ticket->t_flags & XLOG_TIC_INITED) == 0) && + (xlog_commit_record(log, ticket, iclog, &lsn)))) { + lsn = (xfs_lsn_t) -1; + if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { + flags |= XFS_LOG_REL_PERM_RESERV; + } + } + + + if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || + (flags & XFS_LOG_REL_PERM_RESERV)) { + trace_xfs_log_done_nonperm(log, ticket); + + /* + * Release ticket if not permanent reservation or a specific + * request has been made to release a permanent reservation. + */ + xlog_ungrant_log_space(log, ticket); + xfs_log_ticket_put(ticket); + } else { + trace_xfs_log_done_perm(log, ticket); + + xlog_regrant_reserve_log_space(log, ticket); + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ + ticket->t_flags |= XLOG_TIC_INITED; + } + + return lsn; +} + +/* + * Attaches a new iclog I/O completion callback routine during + * transaction commit. If the log is in error state, a non-zero + * return code is handed back and the caller is responsible for + * executing the callback at an appropriate time. 
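[Editorial aside, not part of the patch] xlog_grant_sub_space() and xlog_grant_add_space() above keep the grant head as a single 64-bit value packing a cycle count and a byte offset, and update it lock-free with an atomic compare-and-swap retry loop. The standalone sketch below shows the same pattern using C11 atomics; the plain shift/mask packing stands in for the kernel's xlog_crack_grant_head_val()/xlog_assign_grant_head_val() helpers, and the names are illustrative.

#include <stdatomic.h>
#include <stdint.h>

/* head holds <cycle:32 | bytes:32>; add "bytes", wrapping at logsize */
static void grant_add_space(_Atomic int64_t *head, int logsize, int bytes)
{
	int64_t old = atomic_load(head);
	int64_t new;

	do {
		int cycle = (int)(old >> 32);
		int space = (int)(old & 0xffffffff);

		space += bytes;
		if (space >= logsize) {		/* wrapped past the end */
			space -= logsize;
			cycle++;
		}
		new = ((int64_t)cycle << 32) | (uint32_t)space;
		/* retry if another CPU moved the head underneath us */
	} while (!atomic_compare_exchange_weak(head, &old, new));
}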
+ */ +int +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) +{ + int abortflg; + + spin_lock(&iclog->ic_callback_lock); + abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); + if (!abortflg) { + ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || + (iclog->ic_state == XLOG_STATE_WANT_SYNC)); + cb->cb_next = NULL; + *(iclog->ic_callback_tail) = cb; + iclog->ic_callback_tail = &(cb->cb_next); + } + spin_unlock(&iclog->ic_callback_lock); + return abortflg; +} + +int +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) +{ + if (xlog_state_release_iclog(mp->m_log, iclog)) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return EIO; + } + + return 0; +} + +/* + * 1. Reserve an amount of on-disk log space and return a ticket corresponding + * to the reservation. + * 2. Potentially, push buffers at tail of log to disk. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticket, + __uint8_t client, + uint flags, + uint t_type) +{ + struct log *log = mp->m_log; + struct xlog_ticket *internal_ticket; + int retval = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + + if (*ticket != NULL) { + ASSERT(flags & XFS_LOG_PERM_RESERV); + internal_ticket = *ticket; + + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; + + trace_xfs_log_reserve(log, internal_ticket); + + xlog_grant_push_ail(log, internal_ticket->t_unit_res); + retval = xlog_regrant_write_log_space(log, internal_ticket); + } else { + /* may sleep if need to allocate more tickets */ + internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, + client, flags, + KM_SLEEP|KM_MAYFAIL); + if (!internal_ticket) + return XFS_ERROR(ENOMEM); + internal_ticket->t_trans_type = t_type; + *ticket = internal_ticket; + + trace_xfs_log_reserve(log, internal_ticket); + + xlog_grant_push_ail(log, + (internal_ticket->t_unit_res * + internal_ticket->t_cnt)); + retval = xlog_grant_log_space(log, internal_ticket); + } + + return retval; +} /* xfs_log_reserve */ + + +/* + * Mount a log filesystem + * + * mp - ubiquitous xfs mount point structure + * log_target - buftarg of on-disk log device + * blk_offset - Start block # where block size is 512 bytes (BBSIZE) + * num_bblocks - Number of BBSIZE blocks in on-disk log + * + * Return error or zero. + */ +int +xfs_log_mount( + xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + int error; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + xfs_notice(mp, "Mounting Filesystem"); + else { + xfs_notice(mp, +"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(mp->m_log)) { + error = -PTR_ERR(mp->m_log); + goto out; + } + + /* + * Initialize the AIL now we have a log. 
+ */ + error = xfs_trans_ail_init(mp); + if (error) { + xfs_warn(mp, "AIL initialisation failed: error %d", error); + goto out_free_log; + } + mp->m_log->l_ailp = mp->m_ail; + + /* + * skip log recovery on a norecovery mount. pretend it all + * just worked. + */ + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + + if (readonly) + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + error = xlog_recover(mp->m_log); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (error) { + xfs_warn(mp, "log mount/recovery failed: error %d", + error); + goto out_destroy_ail; + } + } + + /* Normal transactions can now occur */ + mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + + return 0; + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); +out: + return error; +} + +/* + * Finish the recovery of the file system. This is separate from + * the xfs_log_mount() call, because it depends on the code in + * xfs_mountfs() to read in the root and real-time bitmap inodes + * between calling xfs_log_mount() and here. + * + * mp - ubiquitous xfs mount point structure + */ +int +xfs_log_mount_finish(xfs_mount_t *mp) +{ + int error; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + error = xlog_recover_finish(mp->m_log); + else { + error = 0; + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + return error; +} + +/* + * Final log writes as part of unmount. + * + * Mark the filesystem clean as unmount happens. Note that during relocation + * this routine needs to be executed as part of source-bag while the + * deallocation must not be done until source-end. + */ + +/* + * Unmount record used to have a string "Unmount filesystem--" in the + * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). + * We just write the magic number now since that particular field isn't + * currently architecture converted and "nUmount" is a bit foo. + * As far as I know, there weren't any dependencies on the old behaviour. + */ + +int +xfs_log_unmount_write(xfs_mount_t *mp) +{ + xlog_t *log = mp->m_log; + xlog_in_core_t *iclog; +#ifdef DEBUG + xlog_in_core_t *first_iclog; +#endif + xlog_ticket_t *tic = NULL; + xfs_lsn_t lsn; + int error; + + /* + * Don't write out unmount record on read-only mounts. + * Or, if we are doing a forced umount (typically because of IO errors). + */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + +#ifdef DEBUG + first_iclog = iclog = log->l_iclog; + do { + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } + iclog = iclog->ic_next; + } while (iclog != first_iclog); +#endif + if (! 
(XLOG_FORCED_SHUTDOWN(log))) { + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = &magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + /* remove inited flag */ + tic->t_flags = 0; + error = xlog_write(log, &vec, tic, &lsn, + NULL, XLOG_UNMOUNT_TRANS); + /* + * At this point, we're umounting anyway, + * so there's no point in transitioning log state + * to IOERROR. Just continue... + */ + } + + if (error) + xfs_alert(mp, "%s: unmount record failed", __func__); + + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + if (!(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY)) { + if (!XLOG_FORCED_SHUTDOWN(log)) { + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } else { + spin_unlock(&log->l_icloglock); + } + if (tic) { + trace_xfs_log_umount_write(log, tic); + xlog_ungrant_log_space(log, tic); + xfs_log_ticket_put(tic); + } + } else { + /* + * We're already in forced_shutdown mode, couldn't + * even attempt to write out the unmount transaction. + * + * Go through the motions of sync'ing and releasing + * the iclog, even though no I/O will actually happen, + * we need to wait for other log I/Os that may already + * be in progress. Do this as a separate section of + * code so we'll know if we ever get stuck here that + * we're in this odd situation of trying to unmount + * a file system that went into forced_shutdown as + * the result of an unmount.. + */ + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + + if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE + || iclog->ic_state == XLOG_STATE_DIRTY + || iclog->ic_state == XLOG_STATE_IOERROR) ) { + + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } + + return error; +} /* xfs_log_unmount_write */ + +/* + * Deallocate log structures for unmount/relocation. + * + * We need to stop the aild from running before we destroy + * and deallocate the log as the aild references the log. + */ +void +xfs_log_unmount(xfs_mount_t *mp) +{ + xfs_trans_ail_destroy(mp); + xlog_dealloc_log(mp->m_log); +} + +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + +/* + * Write region vectors to log. The write happens using the space reservation + * of the ticket (tic). It is not a requirement that all writes for a given + * transaction occur with one call to xfs_log_write(). 
However, it is important + * to note that the transaction reservation code makes an assumption about the + * number of log headers a transaction requires that may be violated if you + * don't pass all the transaction vectors in one call.... + */ +int +xfs_log_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn) +{ + struct log *log = mp->m_log; + int error; + struct xfs_log_vec vec = { + .lv_niovecs = nentries, + .lv_iovecp = reg, + }; + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +void +xfs_log_move_tail(xfs_mount_t *mp, + xfs_lsn_t tail_lsn) +{ + xlog_ticket_t *tic; + xlog_t *log = mp->m_log; + int need_bytes, free_bytes; + + if (XLOG_FORCED_SHUTDOWN(log)) + return; + + if (tail_lsn == 0) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + + /* tail_lsn == 1 implies that we weren't passed a valid value. */ + if (tail_lsn != 1) + atomic64_set(&log->l_tail_lsn, tail_lsn); + + if (!list_empty_careful(&log->l_writeq)) { +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("Recovery problem"); +#endif + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(tic, &log->l_writeq, t_queue) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + + if (free_bytes < tic->t_unit_res && tail_lsn != 1) + break; + tail_lsn = 0; + free_bytes -= tic->t_unit_res; + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_write_lock); + } + + if (!list_empty_careful(&log->l_reserveq)) { +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("Recovery problem"); +#endif + spin_lock(&log->l_grant_reserve_lock); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + list_for_each_entry(tic, &log->l_reserveq, t_queue) { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + need_bytes = tic->t_unit_res*tic->t_cnt; + else + need_bytes = tic->t_unit_res; + if (free_bytes < need_bytes && tail_lsn != 1) + break; + tail_lsn = 0; + free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_reserve_lock); + } +} + +/* + * Determine if we have a transaction that has gone to disk + * that needs to be covered. To begin the transition to the idle state + * firstly the log needs to be idle (no AIL and nothing in the iclogs). + * If we are then in a state where covering is needed, the caller is informed + * that dummy transactions are required to move the log into the idle state. + * + * Because this is called as part of the sync process, we should also indicate + * that dummy transactions should be issued in anything but the covered or + * idle states. This ensures that the log tail is accurately reflected in + * the log at the end of the sync, hence if a crash occurrs avoids replay + * of transactions where the metadata is already on disk. 
+ */ +int +xfs_log_need_covered(xfs_mount_t *mp) +{ + int needed = 0; + xlog_t *log = mp->m_log; + + if (!xfs_fs_writable(mp)) + return 0; + + spin_lock(&log->l_icloglock); + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (!xfs_ail_min_lsn(log->l_ailp) && + xlog_iclogs_empty(log)) { + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + } + /* FALLTHRU */ + default: + needed = 1; + break; + } + spin_unlock(&log->l_icloglock); + return needed; +} + +/****************************************************************************** + * + * local routines + * + ****************************************************************************** + */ + +/* xfs_trans_tail_ail returns 0 when there is nothing in the list. + * The log manager must keep track of the last LR which was committed + * to disk. The lsn of this LR will become the new tail_lsn whenever + * xfs_trans_tail_ail returns 0. If we don't do this, we run into + * the situation where stuff could be written into the log but nothing + * was ever in the AIL when asked. Eventually, we panic since the + * tail hits the head. + * + * We may be holding the log iclog lock upon entering this routine. + */ +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + struct log *log = mp->m_log; + + tail_lsn = xfs_ail_min_lsn(mp->m_ail); + if (!tail_lsn) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + + atomic64_set(&log->l_tail_lsn, tail_lsn); + return tail_lsn; +} + +/* + * Return the space in the log between the tail and the head. The head + * is passed in the cycle/bytes formal parms. In the special case where + * the reserve head has wrapped passed the tail, this calculation is no + * longer valid. In this case, just return 0 which means there is no space + * in the log. This works for all places where this function is called + * with the reserve head. Of course, if the write head were to ever + * wrap the tail, we should blow up. Rather than catch this case here, + * we depend on other ASSERTions in other parts of the code. XXXmiken + * + * This code also handles the case where the reservation head is behind + * the tail. The details of this case are described below, but the end + * result is that we return the size of the log as the amount of space left. + */ +STATIC int +xlog_space_left( + struct log *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) + return 0; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; + } else { + /* + * The reservation head is behind the tail. + * In this case we just want to return the size of the + * log as the amount of space left. 
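[Editorial aside, not part of the patch] xlog_space_left() above computes the free space between the log tail and a grant head, where both positions are (cycle, bytes) pairs and the head is at most one cycle ahead of the tail in a healthy log. A compact standalone restatement of the case analysis, with illustrative names:

static int log_space_left(int logsize,
			  int tail_cycle, int tail_bytes,
			  int head_cycle, int head_bytes)
{
	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
		return logsize - (head_bytes - tail_bytes); /* same pass */
	if (tail_cycle + 1 < head_cycle)
		return 0;			/* head lapped the tail: log full */
	if (tail_cycle < head_cycle)		/* head exactly one cycle ahead */
		return tail_bytes - head_bytes;
	/* head behind tail: treated as an error, report the whole log free */
	return logsize;
}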
+ */ + xfs_alert(log->l_mp, + "xlog_space_left: head behind tail\n" + " tail_cycle = %d, tail_bytes = %d\n" + " GH cycle = %d, GH bytes = %d", + tail_cycle, tail_bytes, head_cycle, head_bytes); + ASSERT(0); + free_bytes = log->l_logsize; + } + return free_bytes; +} + + +/* + * Log function which is called when an io completes. + * + * The log manager needs its own routine, in order to control what + * happens with the buffer after the write completes. + */ +void +xlog_iodone(xfs_buf_t *bp) +{ + xlog_in_core_t *iclog; + xlog_t *l; + int aborted; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + aborted = 0; + l = iclog->ic_log; + + /* + * Race to shutdown the filesystem if we see an error. + */ + if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); + XFS_BUF_STALE(bp); + xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); + /* + * This flag will be propagated to the trans-committed + * callback routines to let them know that the log-commit + * didn't succeed. + */ + aborted = XFS_LI_ABORTED; + } else if (iclog->ic_state & XLOG_STATE_IOERROR) { + aborted = XFS_LI_ABORTED; + } + + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); + xlog_state_done_syncing(iclog, aborted); + /* + * do not reference the buffer (bp) here as we could race + * with it being freed after writing the unmount record to the + * log. + */ + +} /* xlog_iodone */ + +/* + * Return size of each in-core log record buffer. + * + * All machines get 8 x 32kB buffers by default, unless tuned otherwise. + * + * If the filesystem blocksize is too large, we may need to choose a + * larger size since the directory code currently logs entire blocks. + */ + +STATIC void +xlog_get_iclog_buffer_size(xfs_mount_t *mp, + xlog_t *log) +{ + int size; + int xhdrs; + + if (mp->m_logbufs <= 0) + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + else + log->l_iclog_bufs = mp->m_logbufs; + + /* + * Buffer size passed in from mount system call. + */ + if (mp->m_logbsize > 0) { + size = log->l_iclog_size = mp->m_logbsize; + log->l_iclog_size_log = 0; + while (size != 1) { + log->l_iclog_size_log++; + size >>= 1; + } + + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + /* # headers = size / 32k + * one header holds cycles from 32k of data + */ + + xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; + if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + log->l_iclog_hsize = xhdrs << BBSHIFT; + log->l_iclog_heads = xhdrs; + } else { + ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + } + goto done; + } + + /* All machines use 32kB buffers by default. */ + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; + + /* the default log size is 16k or 32k which is one header sector */ + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + +done: + /* are we being asked to make the sizes selected above visible? */ + if (mp->m_logbufs == 0) + mp->m_logbufs = log->l_iclog_bufs; + if (mp->m_logbsize == 0) + mp->m_logbsize = log->l_iclog_size; +} /* xlog_get_iclog_buffer_size */ + + +/* + * This routine initializes some of the log structure for a given mount point. + * Its primary purpose is to fill in enough, so recovery can occur. However, + * some other stuff may be filled in too. 
+ */ +STATIC xlog_t * +xlog_alloc_log(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + xlog_t *log; + xlog_rec_header_t *head; + xlog_in_core_t **iclogp; + xlog_in_core_t *iclog, *prev_iclog=NULL; + xfs_buf_t *bp; + int i; + int error = ENOMEM; + uint log2_size = 0; + + log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + if (!log) { + xfs_warn(mp, "Log allocation failed: No memory!"); + goto out; + } + + log->l_mp = mp; + log->l_targ = log_target; + log->l_logsize = BBTOB(num_bblks); + log->l_logBBstart = blk_offset; + log->l_logBBsize = num_bblks; + log->l_covered_state = XLOG_STATE_COVER_IDLE; + log->l_flags |= XLOG_ACTIVE_RECOVERY; + + log->l_prev_block = -1; + /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); + log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); + xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); + INIT_LIST_HEAD(&log->l_reserveq); + INIT_LIST_HEAD(&log->l_writeq); + spin_lock_init(&log->l_grant_reserve_lock); + spin_lock_init(&log->l_grant_write_lock); + + error = EFSCORRUPTED; + if (xfs_sb_version_hassector(&mp->m_sb)) { + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", + log2_size, BBSHIFT); + goto out_free_log; + } + + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", + log2_size, mp->m_sectbb_log); + goto out_free_log; + } + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + xfs_warn(mp, + "log sector size (0x%x) invalid for configuration.", + log2_size); + goto out_free_log; + } + } + log->l_sectBBsize = 1 << log2_size; + + xlog_get_iclog_buffer_size(mp, log); + + error = ENOMEM; + bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + if (!bp) + goto out_free_log; + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + log->l_xbuf = bp; + + spin_lock_init(&log->l_icloglock); + init_waitqueue_head(&log->l_flush_wait); + + /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ + ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); + + iclogp = &log->l_iclog; + /* + * The amount of memory to allocate for the iclog structure is + * rather funky due to the way the structure is defined. It is + * done this way so that we can use different sizes for machines + * with different amounts of memory. See the definition of + * xlog_in_core_t in xfs_log_priv.h for details. 
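+ * + * (For illustration: with the default of 8 iclog buffers the loop + * below allocates iclogs 0..7, chains each ic_next to the following + * iclog and finally points the last ic_next back at the first, so + * the iclogs form a fixed ring that is reused in order.)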
+ */ + ASSERT(log->l_iclog_size >= 4096); + for (i=0; i < log->l_iclog_bufs; i++) { + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + + iclog = *iclogp; + iclog->ic_prev = prev_iclog; + prev_iclog = iclog; + + bp = xfs_buf_get_uncached(mp->m_logdev_targp, + log->l_iclog_size, 0); + if (!bp) + goto out_free_iclog; + if (!XFS_BUF_CPSEMA(bp)) + ASSERT(0); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + iclog->ic_bp = bp; + iclog->ic_data = bp->b_addr; +#ifdef DEBUG + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); +#endif + head = &iclog->ic_header; + memset(head, 0, sizeof(xlog_rec_header_t)); + head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + head->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + head->h_size = cpu_to_be32(log->l_iclog_size); + /* new fields */ + head->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + + iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + + ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); + ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); + + iclogp = &iclog->ic_next; + } + *iclogp = log->l_iclog; /* complete ring */ + log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; + return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) + xfs_buf_free(iclog->ic_bp); + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); +out: + return ERR_PTR(-error); +} /* xlog_alloc_log */ + + +/* + * Write out the commit record of a transaction associated with the given + * ticket. Return the lsn of the commit record. + */ +STATIC int +xlog_commit_record( + struct log *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) +{ + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = &reg, + }; + + ASSERT_ALWAYS(iclog); + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +/* + * Push on the buffer cache code if we ever use more than 75% of the on-disk + * log space. This code pushes on the lsn which would supposedly free up + * the 25% which we want to leave free. We may need to adopt a policy which + * pushes on an lsn which is further along in the log once we reach the high + * water mark. In this manner, we would be creating a low water mark.
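+ * + * (Worked example, illustrative numbers: for a 128MB log, l_logBBsize + * is 262144 basic blocks, so the threshold below works out to + * max(BTOBB(need_bytes), 262144 >> 2, 256) = 65536 blocks; once less + * than a quarter of the log is free we push the AIL to the current + * tail lsn plus 65536 blocks.)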
+ */ +STATIC void +xlog_grant_push_ail( + struct log *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; + if (threshold_block >= log->l_logBBsize) { + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; + } + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; + + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. + */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_ail_push(log->l_ailp, threshold_lsn); +} + +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want any more new transactions to be + * started or completed afterwards. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_STALE(bp); + xfs_buf_ioend(bp, 0); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. + */ + return 0; + } + + bp->b_flags |= _XBF_RUN_QUEUES; + xfs_buf_iorequest(bp); + return 0; +} + +/* + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * fashion. Previously, we should have moved the current iclog + * ptr in the log to point to the next available iclog. This allows further + * writes to continue while this code syncs out an iclog ready to go. + * Before an in-core log can be written out, the data section must be scanned + * to save away the 1st word of each BBSIZE block into the header. We replace + * it with the current cycle count. Each BBSIZE block is tagged with the + * cycle count because there is an implicit assumption that drives will + * guarantee that entire 512 byte blocks get written at once. In other words, + * we can't have part of a 512 byte block written and part not written. By + * tagging each block, we will know which blocks are valid when recovering + * after an unclean shutdown. + * + * This routine is single threaded on the iclog.
No other thread can be in + * this routine with the same iclog. Changing contents of iclog can there- + * fore be done without grabbing the state machine lock. Updating the global + * log will require grabbing the lock though. + * + * The entire log manager uses a logical block numbering scheme. Only + * log_sync (and then only bwrite()) know about the fact that the log may + * not start with block zero on a given device. The log block start offset + * is added immediately before calling bwrite(). + */ + +STATIC int +xlog_sync(xlog_t *log, + xlog_in_core_t *iclog) +{ + xfs_caddr_t dptr; /* pointer to byte sized element */ + xfs_buf_t *bp; + int i; + uint count; /* byte count of bwrite */ + uint count_init; /* initial count before roundup */ + int roundoff; /* roundoff to BB or stripe */ + int split = 0; /* split write into two regions */ + int error; + int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + + XFS_STATS_INC(xs_log_writes); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + roundoff = count - count_init; + ASSERT(roundoff >= 0); + ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && + roundoff < log->l_mp->m_sb.sb_logsunit) + || + (log->l_mp->m_sb.sb_logsunit <= 1 && + roundoff < BBTOB(1))); + + /* move grant heads by roundoff in sync */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); + xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); + + /* put cycle number in every block */ + xlog_pack_data(log, iclog, roundoff); + + /* real byte length */ + if (v2) { + iclog->ic_header.h_len = + cpu_to_be32(iclog->ic_offset + roundoff); + } else { + iclog->ic_header.h_len = + cpu_to_be32(iclog->ic_offset); + } + + bp = iclog->ic_bp; + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); + + XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + + /* Do we need to split this write into 2 parts? */ + if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); + count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); + iclog->ic_bwritecnt = 2; /* split into 2 writes */ + } else { + iclog->ic_bwritecnt = 1; + } + XFS_BUF_SET_COUNT(bp, count); + XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an external log device, flush the data device + * before flushing the log to make sure all meta data + * written back from the AIL actually made it to disk + * before writing out the new log tail LSN in the log buffer. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + XFS_BUF_ORDERED(bp); + } + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + xlog_verify_iclog(log, iclog, count, B_TRUE); + + /* account for log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* + * Don't call xfs_bwrite here. 
We do log-syncs even when the filesystem + * is shutting down. + */ + XFS_BUF_WRITE(bp); + + if ((error = xlog_bdstrat(bp))) { + xfs_ioerror_alert("xlog_sync", log->l_mp, bp, + XFS_BUF_ADDR(bp)); + return error; + } + if (split) { + bp = iclog->ic_log->l_xbuf; + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == + (unsigned long)1); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); + XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ + XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ + (__psint_t)count), split); + XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + XFS_BUF_ORDERED(bp); + dptr = XFS_BUF_PTR(bp); + /* + * Bump the cycle numbers at the start of each block + * since this part of the buffer is at the start of + * a new cycle. Watch out for the header magic number + * case, though. + */ + for (i = 0; i < split; i += BBSIZE) { + be32_add_cpu((__be32 *)dptr, 1); + if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) + be32_add_cpu((__be32 *)dptr, 1); + dptr += BBSIZE; + } + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + /* account for internal log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + XFS_BUF_WRITE(bp); + if ((error = xlog_bdstrat(bp))) { + xfs_ioerror_alert("xlog_sync (split)", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; + } + } + return 0; +} /* xlog_sync */ + + +/* + * Deallocate a log structure + */ +STATIC void +xlog_dealloc_log(xlog_t *log) +{ + xlog_in_core_t *iclog, *next_iclog; + int i; + + xlog_cil_destroy(log); + + /* + * always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. + */ + xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_free(log->l_xbuf); + + iclog = log->l_iclog; + for (i=0; i < log->l_iclog_bufs; i++) { + xfs_buf_free(iclog->ic_bp); + next_iclog = iclog->ic_next; + kmem_free(iclog); + iclog = next_iclog; + } + spinlock_destroy(&log->l_icloglock); + + log->l_mp->m_log = NULL; + kmem_free(log); +} /* xlog_dealloc_log */ + +/* + * Update counters atomically now that memcpy is done.
+ */ +/* ARGSUSED */ +static inline void +xlog_state_finish_copy(xlog_t *log, + xlog_in_core_t *iclog, + int record_cnt, + int copy_bytes) +{ + spin_lock(&log->l_icloglock); + + be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + iclog->ic_offset += copy_bytes; + + spin_unlock(&log->l_icloglock); +} /* xlog_state_finish_copy */ + + + + +/* + * print out info relating to regions written which consume + * the reservation + */ +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) +{ + uint i; + uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); + + /* match with XLOG_REG_TYPE_* in xfs_log.h */ + static char *res_type_str[XLOG_REG_TYPE_MAX] = { + "bformat", + "bchunk", + "efi_format", + "efd_format", + "iformat", + "icore", + "iext", + "ibroot", + "ilocal", + "iattr_ext", + "iattr_broot", + "iattr_local", + "qformat", + "dquot", + "quotaoff", + "LR header", + "unmount", + "commit", + "trans header" + }; + static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { + "SETATTR_NOT_SIZE", + "SETATTR_SIZE", + "INACTIVE", + "CREATE", + "CREATE_TRUNC", + "TRUNCATE_FILE", + "REMOVE", + "LINK", + "RENAME", + "MKDIR", + "RMDIR", + "SYMLINK", + "SET_DMATTRS", + "GROWFS", + "STRAT_WRITE", + "DIOSTRAT", + "WRITE_SYNC", + "WRITEID", + "ADDAFORK", + "ATTRINVAL", + "ATRUNCATE", + "ATTR_SET", + "ATTR_RM", + "ATTR_FLAG", + "CLEAR_AGI_BUCKET", + "QM_SBCHANGE", + "DUMMY1", + "DUMMY2", + "QM_QUOTAOFF", + "QM_DQALLOC", + "QM_SETQLIM", + "QM_DQCLUSTER", + "QM_QINOCREATE", + "QM_QUOTAOFF_END", + "SB_UNIT", + "FSYNC_TS", + "GROWFSRT_ALLOC", + "GROWFSRT_ZERO", + "GROWFSRT_FREE", + "SWAPEXT" + }; + + xfs_warn(mp, + "xfs_log_write: reservation summary:\n" + " trans type = %s (%u)\n" + " unit res = %d bytes\n" + " current res = %d bytes\n" + " total reg = %u bytes (o/flow = %u bytes)\n" + " ophdrs = %u (ophdr space = %u bytes)\n" + " ophdr + reg = %u bytes\n" + " num regions = %u\n", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + ticket->t_trans_type, + ticket->t_unit_res, + ticket->t_curr_res, + ticket->t_res_arr_sum, ticket->t_res_o_flow, + ticket->t_res_num_ophdrs, ophdr_spc, + ticket->t_res_arr_sum + + ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_num); + + for (i = 0; i < ticket->t_res_num; i++) { + uint r_type = ticket->t_res_arr[i].r_type; + xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, + ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? + "bad-rtype" : res_type_str[r_type-1]), + ticket->t_res_arr[i].r_len); + } + + xfs_alert_tag(mp, XFS_PTAG_LOGRES, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. 
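+ * + * (Illustrative example: with XLOG_TIC_INITED still set and a single + * log vector carrying two regions of 28 and 100 bytes, the routine + * below counts three op headers (the start record plus one per + * region), so it returns 128 + 3 * sizeof(struct xlog_op_header) + * and bumps t_res_num_ophdrs by 3.)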
+ */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record. We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct log *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_warn(log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * be written in an obvious, self documenting manner.
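+ * + * (Illustrative example: a 6000 byte region with only 2048 bytes left + * in the current iclog is copied as a 2048 byte chunk flagged + * XLOG_CONTINUE_TRANS, the ticket is charged one extra op header for + * the continuation, and the remaining 3952 bytes go into the next + * iclog flagged XLOG_END_TRANS|XLOG_WAS_CONT_TRANS.)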
+ */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct log *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; +} + +/* + * Write some region out to in-core log + * + * This will be called when writing externally provided regions or when + * writing out a commit record for a given transaction. + * + * General algorithm: + * 1. Find total length of this write. This may include adding to the + * lengths passed in. + * 2. Check whether we violate the tickets reservation. + * 3. While writing to this iclog + * A. Reserve as much space in this iclog as can get + * B. If this is first write, save away start lsn + * C. While writing this region: + * 1. If first write of transaction, write start record + * 2. Write log operation header (header per region) + * 3. Find out if we can fit entire region into this iclog + * 4. Potentially, verify destination memcpy ptr + * 5. Memcpy (partial) region + * 6. If partial copy, release iclog; otherwise, continue + * copying more regions into current iclog + * 4. Mark want sync bit (in simulation mode) + * 5. Release iclog for potential flush to on-disk log. + * + * ERRORS: + * 1. Panic if reservation is overrun. This should never happen since + * reservation amounts are generated internal to the filesystem. + * NOTES: + * 1. Tickets are single threaded data structures. + * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the + * syncing routine. 
When a single log_write region needs to span + * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set + * on all log operation writes which don't contain the end of the + * region. The XLOG_END_TRANS bit is used for the in-core log + * operation which contains the end of the continued log_write region. + * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, + * we don't really know exactly how much space will be used. As a result, + * we don't update ic_offset until the end when we know exactly how many + * bytes have been written out. + */ +int +xlog_write( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) +{ + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; + + *start_lsn = 0; + + len = xlog_write_calc_vec_length(ticket, log_vector); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && index < lv->lv_niovecs) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; + + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; + + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && index < lv->lv_niovecs) { + struct xfs_log_iovec *reg = &vecp[index]; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } + + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); + + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + &copy_off, &copy_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ?
copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; + + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more in this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; + + if (++index == lv->lv_niovecs) { + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0) { + if (!lv) + return 0; + break; + } + } + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + return 0; +} + + +/***************************************************************************** + * + * State Machine functions + * + ***************************************************************************** + */ + +/* Clean iclogs starting from the head. This ordering must be + * maintained, so an iclog doesn't become ACTIVE beyond one that + * is SYNCING. This is also required to maintain the notion that we use + * an ordered wait queue to hold off would-be writers to the log when every + * iclog is trying to sync to disk. + * + * State Change: DIRTY -> ACTIVE + */ +STATIC void +xlog_state_clean_log(xlog_t *log) +{ + xlog_in_core_t *iclog; + int changed = 0; + + iclog = log->l_iclog; + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) { + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + ASSERT(iclog->ic_callback == NULL); + /* + * If the number of ops in this iclog indicate it just + * contains the dummy transaction, we can + * change state into IDLE (the second time around). + * Otherwise we should change the state into + * NEED a dummy. + * We don't need to cover the dummy. + */ + if (!changed && + (be32_to_cpu(iclog->ic_header.h_num_logops) == + XLOG_COVER_OPS)) { + changed = 1; + } else { + /* + * We have two dirty iclogs so start over + * This could also be num of ops indicates + * this is not the dummy going out. + */ + changed = 2; + } + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) + /* do nothing */; + else + break; /* stop cleaning */ + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + + /* log is locked when we are called */ + /* + * Change state for the dummy log recording. + * We usually go to NEED. But we go to NEED2 if the changed indicates + * we are done writing the dummy record. + * If we are done with the second dummy record (DONE2), then + * we go to IDLE.
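+ * + * (Illustrative walk-through: an otherwise idle log therefore moves + * NEED -> DONE (in xfs_log_need_covered) -> NEED2 -> DONE2 -> IDLE, + * with one dummy transaction written for each NEED state.)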
+ */ + if (changed) { + switch (log->l_covered_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_NEED2; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE2: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_IDLE; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + default: + ASSERT(0); + } + } +} /* xlog_state_clean_log */ + +STATIC xfs_lsn_t +xlog_get_lowest_lsn( + xlog_t *log) +{ + xlog_in_core_t *lsn_log; + xfs_lsn_t lowest_lsn, lsn; + + lsn_log = log->l_iclog; + lowest_lsn = 0; + do { + if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { + lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); + if ((lsn && !lowest_lsn) || + (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + lowest_lsn = lsn; + } + } + lsn_log = lsn_log->ic_next; + } while (lsn_log != log->l_iclog); + return lowest_lsn; +} + + +STATIC void +xlog_state_do_callback( + xlog_t *log, + int aborted, + xlog_in_core_t *ciclog) +{ + xlog_in_core_t *iclog; + xlog_in_core_t *first_iclog; /* used to know when we've + * processed all iclogs once */ + xfs_log_callback_t *cb, *cb_next; + int flushcnt = 0; + xfs_lsn_t lowest_lsn; + int ioerrors; /* counter: iclogs with errors */ + int loopdidcallbacks; /* flag: inner loop did callbacks*/ + int funcdidcallbacks; /* flag: function did callbacks */ + int repeats; /* for issuing console warnings if + * looping too many times */ + int wake = 0; + + spin_lock(&log->l_icloglock); + first_iclog = iclog = log->l_iclog; + ioerrors = 0; + funcdidcallbacks = 0; + repeats = 0; + + do { + /* + * Scan all iclogs starting with the one pointed to by the + * log. Reset this starting point each time the log is + * unlocked (during callbacks). + * + * Keep looping through iclogs until one full pass is made + * without running any callbacks. + */ + first_iclog = log->l_iclog; + iclog = log->l_iclog; + loopdidcallbacks = 0; + repeats++; + + do { + + /* skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & + (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + iclog = iclog->ic_next; + continue; + } + + /* + * Between marking a filesystem SHUTDOWN and stopping + * the log, we do flush all iclogs to disk (if there + * wasn't a log I/O error). So, we do want things to + * go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Can only perform callbacks in order. Since + * this iclog is not in the DONE_SYNC/ + * DO_CALLBACK state, we skip the rest and + * just try to clean up. If we set our iclog + * to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the + * CALLBACK and the state cannot change since + * we are holding the l_icloglock. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | + XLOG_STATE_DO_CALLBACK))) { + if (ciclog && (ciclog->ic_state == + XLOG_STATE_DONE_SYNC)) { + ciclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + break; + } + /* + * We now have an iclog that is in either the + * DO_CALLBACK or DONE_SYNC states. The other + * states (WANT_SYNC, SYNCING, or CALLBACK were + * caught by the above if and are going to + * clean (i.e. we aren't doing their callbacks) + * see the above if. + */ + + /* + * We will do one more check here to see if we + * have chased our tail around. 
+ */ + + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && + XFS_LSN_CMP(lowest_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + iclog = iclog->ic_next; + continue; /* Leave this iclog for + * another thread */ + } + + iclog->ic_state = XLOG_STATE_CALLBACK; + + + /* + * update the last_sync_lsn before we drop the + * icloglock to ensure we are the only one that + * can update it. + */ + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); + + } else + ioerrors++; + + spin_unlock(&log->l_icloglock); + + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. + */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + while (cb) { + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_callback = NULL; + spin_unlock(&iclog->ic_callback_lock); + + /* perform callbacks in the order given */ + for (; cb; cb = cb_next) { + cb_next = cb->cb_next; + cb->cb_func(cb->cb_arg, aborted); + } + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + } + + loopdidcallbacks++; + funcdidcallbacks++; + + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) + iclog->ic_state = XLOG_STATE_DIRTY; + + /* + * Transition from DIRTY to ACTIVE if applicable. + * NOP if STATE_IOERROR. + */ + xlog_state_clean_log(log); + + /* wake up threads waiting in xfs_log_force() */ + wake_up_all(&iclog->ic_force_wait); + + iclog = iclog->ic_next; + } while (first_iclog != iclog); + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; + xfs_warn(log->l_mp, + "%s: possible infinite loop (%d iterations)", + __func__, flushcnt); + } + } while (!ioerrors && loopdidcallbacks); + + /* + * make one last gasp attempt to see if iclogs are being left in + * limbo.. + */ +#ifdef DEBUG + if (funcdidcallbacks) { + first_iclog = iclog = log->l_iclog; + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. + * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * l_icloglock + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); + } +#endif + + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; + spin_unlock(&log->l_icloglock); + + if (wake) + wake_up_all(&log->l_flush_wait); +} + + +/* + * Finish transitioning this iclog to the dirty state. + * + * Make sure that we completely execute this routine only when this is + * the last call to the iclog. There is a good chance that iclog flushes, + * when we reach the end of the physical log, get turned into 2 separate + * calls to bwrite. Hence, one iclog flush could generate two calls to this + * routine. By using the reference count bwritecnt, we guarantee that only + * the second completion goes through. 
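+ * + * (For example: a log write that wraps the physical end of the log is + * issued as two buffer writes with ic_bwritecnt set to 2; the first + * completion below merely drops ic_bwritecnt to 1 and returns, and + * only the second marks the iclog DONE_SYNC and runs the callback + * processing.)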
+ * + * Callbacks could take time, so they are done outside the scope of the + * global state machine log lock. + */ +STATIC void +xlog_state_done_syncing( + xlog_in_core_t *iclog, + int aborted) +{ + xlog_t *log = iclog->ic_log; + + spin_lock(&log->l_icloglock); + + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_IOERROR); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); + + + /* + * If we got an error, either on the first buffer, or in the case of + * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, + * and none should ever be attempted to be written to disk + * again. + */ + if (iclog->ic_state != XLOG_STATE_IOERROR) { + if (--iclog->ic_bwritecnt == 1) { + spin_unlock(&log->l_icloglock); + return; + } + iclog->ic_state = XLOG_STATE_DONE_SYNC; + } + + /* + * Someone could be sleeping prior to writing out the next + * iclog buffer, we wake them all, one will get to do the + * I/O, the others get to wait for the result. + */ + wake_up_all(&iclog->ic_write_wait); + spin_unlock(&log->l_icloglock); + xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ +} /* xlog_state_done_syncing */ + + +/* + * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. + * + * The in-core logs are used in a circular fashion. They are not used + * out-of-order even when an iclog past the head is free. + * + * return: + * * log_offset where xlog_write() can start writing into the in-core + * log's data space. + * * in-core log pointer to which xlog_write() should write. + * * boolean indicating this is a continued write to an in-core log. + * If this is the last write, then the in-core log's offset field + * needs to be incremented, depending on the amount of data which + * is copied. + */ +STATIC int +xlog_state_get_iclog_space(xlog_t *log, + int len, + xlog_in_core_t **iclogp, + xlog_ticket_t *ticket, + int *continued_write, + int *logoffsetp) +{ + int log_offset; + xlog_rec_header_t *head; + xlog_in_core_t *iclog; + int error; + +restart: + spin_lock(&log->l_icloglock); + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + + iclog = log->l_iclog; + if (iclog->ic_state != XLOG_STATE_ACTIVE) { + XFS_STATS_INC(xs_log_noiclogs); + + /* Wait for log writes to have flushed */ + xlog_wait(&log->l_flush_wait, &log->l_icloglock); + goto restart; + } + + head = &iclog->ic_header; + + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ + log_offset = iclog->ic_offset; + + /* On the 1st write to an iclog, figure out lsn. This works + * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are + * committing to. If the offset is set, that's how many blocks + * must be written. + */ + if (log_offset == 0) { + ticket->t_curr_res -= log->l_iclog_hsize; + xlog_tic_add_region(ticket, + log->l_iclog_hsize, + XLOG_REG_TYPE_LRHEADER); + head->h_cycle = cpu_to_be32(log->l_curr_cycle); + head->h_lsn = cpu_to_be64( + xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); + ASSERT(log->l_curr_block >= 0); + } + + /* If there is enough room to write everything, then do it. Otherwise, + * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC + * bit is on, so this will get flushed out. 
Don't update ic_offset + * until you know exactly how many bytes get copied. Therefore, wait + * until later to update ic_offset. + * + * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * can fit into remaining data section. + */ + if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + if (error) + return error; + } else { + spin_unlock(&log->l_icloglock); + } + goto restart; + } + + /* Do we have enough room to write the full amount in the remainder + * of this iclog? Or must we continue a write on the next iclog and + * mark this iclog as completely taken? In the case where we switch + * iclogs (to mark it taken), this particular iclog will release/sync + * to disk in xlog_write(). + */ + if (len <= iclog->ic_size - iclog->ic_offset) { + *continued_write = 0; + iclog->ic_offset += len; + } else { + *continued_write = 1; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + } + *iclogp = iclog; + + ASSERT(iclog->ic_offset <= iclog->ic_size); + spin_unlock(&log->l_icloglock); + + *logoffsetp = log_offset; + return 0; +} /* xlog_state_get_iclog_space */ + +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto the reserveq, it will only return after + * the needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off the reserveq under the + * l_grant_reserve_lock, we only need to take that lock if we are going + * to add the ticket to the queue and sleep. We can avoid taking the lock if the + * ticket was never added to the reserveq because the t_queue list head will be + * empty and we hold the only reference to it so it can safely be checked + * unlocked. + */ +STATIC int +xlog_grant_log_space(xlog_t *log, + xlog_ticket_t *tic) +{ + int free_bytes; + int need_bytes; + +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("grant Recovery problem"); +#endif + + trace_xfs_log_grant_enter(log, tic); + + need_bytes = tic->t_unit_res; + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes *= tic->t_ocnt; + + /* something is already sleeping; insert new transaction at end */ + if (!list_empty_careful(&log->l_reserveq)) { + spin_lock(&log->l_grant_reserve_lock); + /* recheck the queue now we are locked */ + if (list_empty(&log->l_reserveq)) { + spin_unlock(&log->l_grant_reserve_lock); + goto redo; + } + list_add_tail(&tic->t_queue, &log->l_reserveq); + + trace_xfs_log_grant_sleep1(log, tic); + + /* + * Gotta check this before going to sleep, while we're + * holding the grant lock. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + + /* + * If we got an error, and the filesystem is shutting down, + * we'll catch it down below. 
So just continue... + */ + trace_xfs_log_grant_wake1(log, tic); + } + +redo: + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return_unlocked; + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + if (free_bytes < need_bytes) { + spin_lock(&log->l_grant_reserve_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_reserveq); + + trace_xfs_log_grant_sleep2(log, tic); + + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + + trace_xfs_log_grant_wake2(log, tic); + goto redo; + } + + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_reserve_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); + } + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); + trace_xfs_log_grant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +error_return_unlocked: + spin_lock(&log->l_grant_reserve_lock); +error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); + trace_xfs_log_grant_error(log, tic); + + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back when + * the ticket/transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return XFS_ERROR(EIO); +} /* xlog_grant_log_space */ + + +/* + * Replenish the byte reservation required by moving the grant write head. + * + * Similar to xlog_grant_log_space, the function is structured to have a lock + * free fast path. + */ +STATIC int +xlog_regrant_write_log_space(xlog_t *log, + xlog_ticket_t *tic) +{ + int free_bytes, need_bytes; + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("regrant Recovery problem"); +#endif + + trace_xfs_log_regrant_write_enter(log, tic); + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return_unlocked; + + /* If there are other waiters on the queue then give them a + * chance at logspace before us. Wake up the first waiters, + * if we do not wake up all the waiters then go to sleep waiting + * for more free space, otherwise try to get some space for + * this transaction. 
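+ * + * (Illustrative: with 20k of write grant space free and three queued + * tickets that each need 8k, only the first two are woken; because the + * walk stopped before the whole queue was satisfied, this ticket is + * added to the tail of l_writeq and sleeps as well.)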
+ */ + need_bytes = tic->t_unit_res; + if (!list_empty_careful(&log->l_writeq)) { + struct xlog_ticket *ntic; + + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(ntic, &log->l_writeq, t_queue) { + ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); + + if (free_bytes < ntic->t_unit_res) + break; + free_bytes -= ntic->t_unit_res; + wake_up(&ntic->t_wait); + } + + if (ntic != list_first_entry(&log->l_writeq, + struct xlog_ticket, t_queue)) { + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); + trace_xfs_log_regrant_write_sleep1(log, tic); + + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake1(log, tic); + } else + spin_unlock(&log->l_grant_write_lock); + } + +redo: + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return_unlocked; + + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + if (free_bytes < need_bytes) { + spin_lock(&log->l_grant_write_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); + + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep2(log, tic); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + + trace_xfs_log_regrant_write_wake2(log, tic); + goto redo; + } + + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_write_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); + } + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); + trace_xfs_log_regrant_write_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + + + error_return_unlocked: + spin_lock(&log->l_grant_write_lock); + error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); + trace_xfs_log_regrant_write_error(log, tic); + + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back when + * the ticket/transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return XFS_ERROR(EIO); +} /* xlog_regrant_write_log_space */ + + +/* The first cnt-1 times through here we don't need to + * move the grant write head because the permanent + * reservation has reserved cnt times the unit amount. + * Release part of current permanent unit reservation and + * reset current reservation to be one units worth. Also + * move grant reservation head forward. + */ +STATIC void +xlog_regrant_reserve_log_space(xlog_t *log, + xlog_ticket_t *ticket) +{ + trace_xfs_log_regrant_reserve_enter(log, ticket); + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + xlog_grant_sub_space(log, &log->l_grant_reserve_head, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_write_head, + ticket->t_curr_res); + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + + trace_xfs_log_regrant_reserve_sub(log, ticket); + + /* just return if we still have some of the pre-reserved space */ + if (ticket->t_cnt > 0) + return; + + xlog_grant_add_space(log, &log->l_grant_reserve_head, + ticket->t_unit_res); + + trace_xfs_log_regrant_reserve_exit(log, ticket); + + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); +} /* xlog_regrant_reserve_log_space */ + + +/* + * Give back the space left from a reservation. 
+ * + * All the information we need to make a correct determination of space left + * is present. For non-permanent reservations, things are quite easy. The + * count should have been decremented to zero. We only need to deal with the + * space remaining in the current reservation part of the ticket. If the + * ticket contains a permanent reservation, there may be left over space which + * needs to be released. A count of N means that N-1 refills of the current + * reservation can be done before we need to ask for more space. The first + * one goes to fill up the first current reservation. Once we run out of + * space, the count will stay at zero and the only space remaining will be + * in the current reservation field. + */ +STATIC void +xlog_ungrant_log_space(xlog_t *log, + xlog_ticket_t *ticket) +{ + int bytes; + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + trace_xfs_log_ungrant_enter(log, ticket); + trace_xfs_log_ungrant_sub(log, ticket); + + /* + * If this is a permanent reservation ticket, we may be able to free + * up more space based on the remaining count. + */ + bytes = ticket->t_curr_res; + if (ticket->t_cnt > 0) { + ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); + bytes += ticket->t_unit_res*ticket->t_cnt; + } + + xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); + xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + + trace_xfs_log_ungrant_exit(log, ticket); + + xfs_log_move_tail(log->l_mp, 1); +} /* xlog_ungrant_log_space */ + + +/* + * Flush iclog to disk if this is the last reference to the given iclog and + * the WANT_SYNC bit is set. + * + * When this function is entered, the iclog is not necessarily in the + * WANT_SYNC state. It may be sitting around waiting to get filled. + * + * + */ +STATIC int +xlog_state_release_iclog( + xlog_t *log, + xlog_in_core_t *iclog) +{ + int sync = 0; /* do we sync? */ + + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; + + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_WANT_SYNC); + + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + sync++; + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + } + spin_unlock(&log->l_icloglock); + + /* + * We let the log lock go, so it's possible that we hit a log I/O + * error or some other SHUTDOWN condition that marks the iclog + * as XLOG_STATE_IOERROR before the bwrite. However, we know that + * this iclog has consistent data, so we ignore IOERROR + * flags after this point. + */ + if (sync) + return xlog_sync(log, iclog); + return 0; +} /* xlog_state_release_iclog */ + + +/* + * This routine will mark the current iclog in the ring as WANT_SYNC + * and move the current iclog pointer to the next iclog in the ring. + * When this routine is called from xlog_state_get_iclog_space(), the + * exact size of the iclog has not yet been determined. All we know is + * that we have run out of space in this log record.
+ */ +STATIC void +xlog_state_switch_iclogs(xlog_t *log, + xlog_in_core_t *iclog, + int eventual_size) +{ + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + if (!eventual_size) + eventual_size = iclog->ic_offset; + iclog->ic_state = XLOG_STATE_WANT_SYNC; + iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + log->l_prev_block = log->l_curr_block; + log->l_prev_cycle = log->l_curr_cycle; + + /* roll log?: ic_offset changed later */ + log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); + + /* Round up to next log-sunit */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + } + + if (log->l_curr_block >= log->l_logBBsize) { + log->l_curr_cycle++; + if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) + log->l_curr_cycle++; + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + } + ASSERT(iclog == log->l_iclog); + log->l_iclog = iclog->ic_next; +} /* xlog_state_switch_iclogs */ + +/* + * Write out all data in the in-core log as of this exact moment in time. + * + * Data may be written to the in-core log during this call. However, + * we don't guarantee this data will be written out. A change from past + * implementation means this routine will *not* write out zero length LRs. + * + * Basically, we try and perform an intelligent scan of the in-core logs. + * If we determine there is no flushable data, we just return. There is no + * flushable data if: + * + * 1. the current iclog is active and has no data; the previous iclog + * is in the active or dirty state. + * 2. the current iclog is drity, and the previous iclog is in the + * active or dirty state. + * + * We may sleep if: + * + * 1. the current iclog is not in the active nor dirty state. + * 2. the current iclog dirty, and the previous iclog is not in the + * active nor dirty state. + * 3. the current iclog is active, and there is another thread writing + * to this particular iclog. + * 4. a) the current iclog is active and has no other writers + * b) when we return from flushing out this iclog, it is still + * not in the active nor dirty state. + */ +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) +{ + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); + + if (log->l_cilp) + xlog_cil_force(log); + + spin_lock(&log->l_icloglock); + + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + + /* If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) { + /* + * If the head is dirty or (active and empty), then + * we need to look at the previous iclog. If the previous + * iclog is active or dirty we are done. There is nothing + * to sync out. Otherwise, we attach ourselves to the + * previous iclog and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_DIRTY || + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto no_sleep; + else + goto maybe_sleep; + } else { + if (atomic_read(&iclog->ic_refcnt) == 0) { + /* We are the only one with access to this + * iclog. Flush it out now. 
There should + * be a roundoff of zero to show that someone + * has already taken care of the roundoff from + * the previous sync. + */ + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + + if (xlog_state_release_iclog(log, iclog)) + return XFS_ERROR(EIO); + + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && + iclog->ic_state != XLOG_STATE_DIRTY) + goto maybe_sleep; + else + goto no_sleep; + } else { + /* Someone else is writing to this iclog. + * Use its call to flush out the data. However, + * the other thread may not force out this LR, + * so we mark it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); + goto maybe_sleep; + } + } + } + + /* By the time we come around again, the iclog could've been filled + * which would give it another lsn. If we have a new lsn, just + * return because the relevant data has been flushed. + */ +maybe_sleep: + if (flags & XFS_LOG_SYNC) { + /* + * We must check if we're shutting down here, before + * we wait, while we're holding the l_icloglock. + * Then we check again after waking up, in case our + * sleep was disturbed by a bad news. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + if (log_flushed) + *log_flushed = 1; + } else { + +no_sleep: + spin_unlock(&log->l_icloglock); + } + return 0; +} + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + error = _xfs_log_force(mp, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Force the in-core log to disk for a specific LSN. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. 
+ */ +int +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) +{ + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; + + ASSERT(lsn != 0); + + XFS_STATS_INC(xs_log_force); + + if (log->l_cilp) { + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + } + +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } + atomic_inc(&iclog->ic_refcnt); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + if (xlog_state_release_iclog(log, iclog)) + return XFS_ERROR(EIO); + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + } + + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); + } + + return 0; + } while (iclog != log->l_iclog); + + spin_unlock(&log->l_icloglock); + return 0; +} + +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Called when we want to mark the current iclog as being ready to sync to + * disk. 
+ */ +STATIC void +xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +{ + assert_spin_locked(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + xlog_state_switch_iclogs(log, iclog, 0); + } else { + ASSERT(iclog->ic_state & + (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); + } +} + + +/***************************************************************************** + * + * TICKET functions + * + ***************************************************************************** + */ + +/* + * Free a used ticket when its refcount falls to zero. + */ +void +xfs_log_ticket_put( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + if (atomic_dec_and_test(&ticket->t_ref)) + kmem_zone_free(xfs_log_ticket_zone, ticket); +} + +xlog_ticket_t * +xfs_log_ticket_get( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + atomic_inc(&ticket->t_ref); + return ticket; +} + +/* + * Allocate and initialise a new log ticket. + */ +xlog_ticket_t * +xlog_ticket_alloc( + struct log *log, + int unit_bytes, + int cnt, + char client, + uint xflags, + int alloc_flags) +{ + struct xlog_ticket *tic; + uint num_headers; + int iclog_space; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + /* + * Permanent reservations have up to 'cnt'-1 active log operations + * in the log. A unit in this case is the amount of space for one + * of these log operations. Normal reservations have a cnt of 1 + * and their unit amount is the total amount of space required. + * + * The following lines of code account for non-transaction data + * which occupy space in the on-disk log. + * + * Normal form of a transaction is: + * ... + * and then there are LR hdrs, split-recs and roundoff at end of syncs. + * + * We need to account for all the leadup data and trailer data + * around the transaction data. + * And then we need to account for the worst case in terms of using + * more space. + * The worst case will happen if: + * - the placement of the transaction happens to be such that the + * roundoff is at its maximum + * - the transaction data is synced before the commit record is synced + * i.e. | + * Therefore the commit record is in its own Log Record. + * This can happen as the commit record is called with its + * own region to xlog_write(). + * This then means that in the worst case, roundoff can happen for + * the commit-rec as well. + * The commit-rec is smaller than padding in this scenario and so it is + * not added separately. + */ + + /* for trans header */ + unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(xfs_trans_header_t); + + /* for start-rec */ + unit_bytes += sizeof(xlog_op_header_t); + + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. 
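A worked, user-space version of the overhead accounting the code below performs, using stand-in sizes (12-byte op headers, a 16-byte transaction header, 32 KB iclogs with 512-byte headers, v1 roundoff); the real code takes these from sizeof() on the on-disk structures and from the superblock.

#include <stdio.h>

#define OPHDR_SZ        12
#define TRANSHDR_SZ     16
#define ICLOG_SIZE      (32 * 1024)
#define ICLOG_HSIZE     512
#define BBSIZE          512

#define howmany(x, y)   (((x) + ((y) - 1)) / (y))

int main(void)
{
        int unit_bytes = 100 * 1024;            /* hypothetical transaction payload */
        int iclog_space = ICLOG_SIZE - ICLOG_HSIZE;
        int num_headers;

        unit_bytes += OPHDR_SZ + TRANSHDR_SZ;   /* transaction header + its op header */
        unit_bytes += OPHDR_SZ;                 /* start record */

        num_headers = howmany(unit_bytes, iclog_space);
        unit_bytes += OPHDR_SZ * num_headers;   /* op headers for split regions */

        /* adding op headers may spill us into yet another log record */
        while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) {
                unit_bytes += OPHDR_SZ;
                num_headers++;
        }
        unit_bytes += ICLOG_HSIZE * num_headers;        /* one LR header per record */
        unit_bytes += ICLOG_HSIZE;                      /* commit record's own LR header */
        unit_bytes += 2 * BBSIZE;                       /* worst-case roundoff, v1 logs */

        printf("reserve %d bytes for a %d byte transaction\n", unit_bytes, 100 * 1024);
        return 0;
}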
+ */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } + unit_bytes += log->l_iclog_hsize * num_headers; + + /* for commit-rec LR header - note: padding will subsume the ophdr */ + unit_bytes += log->l_iclog_hsize; + + /* for roundoff padding for transaction data and one for commit record */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + /* log su roundoff */ + unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + } else { + /* BB roundoff */ + unit_bytes += 2*BBSIZE; + } + + atomic_set(&tic->t_ref, 1); + INIT_LIST_HEAD(&tic->t_queue); + tic->t_unit_res = unit_bytes; + tic->t_curr_res = unit_bytes; + tic->t_cnt = cnt; + tic->t_ocnt = cnt; + tic->t_tid = random32(); + tic->t_clientid = client; + tic->t_flags = XLOG_TIC_INITED; + tic->t_trans_type = 0; + if (xflags & XFS_LOG_PERM_RESERV) + tic->t_flags |= XLOG_TIC_PERM_RESERV; + init_waitqueue_head(&tic->t_wait); + + xlog_tic_reset_res(tic); + + return tic; +} + + +/****************************************************************************** + * + * Log debug routines + * + ****************************************************************************** + */ +#if defined(DEBUG) +/* + * Make sure that the destination ptr is within the valid data region of + * one of the iclogs. This uses backup pointers stored in a different + * part of the log in case we trash the log structure. + */ +void +xlog_verify_dest_ptr( + struct log *log, + char *ptr) +{ + int i; + int good_ptr = 0; + + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) + good_ptr++; + } + + if (!good_ptr) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); +} + +/* + * Check to make sure the grant write head didn't just over lap the tail. If + * the cycles are the same, we can't be overlapping. Otherwise, make sure that + * the cycles differ by exactly one and check the byte count. + * + * This check is run unlocked, so can give false positives. Rather than assert + * on failures, use a warn-once flag and a panic tag to allow the admin to + * determine if they want to panic the machine when such an error occurs. For + * debug kernels this will have the same effect as using an assert but, unlinke + * an assert, it can be turned off at runtime. 
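A toy version of that unlocked overlap check, with invented head and tail values; BBTOB() converts 512-byte basic blocks to bytes, and the warn-once flag handling is omitted.

#include <stdio.h>

#define BBTOB(bb)       ((bb) << 9)

int main(void)
{
        int head_cycle = 12, head_bytes = 110592;       /* grant write head */
        int tail_cycle = 11, tail_blocks = 200;         /* log tail */

        if (head_cycle != tail_cycle) {
                if (head_cycle - 1 != tail_cycle)
                        printf("warn: head and tail cycles differ by more than one\n");
                if (head_bytes > BBTOB(tail_blocks))
                        printf("warn: grant write head has overlapped the tail\n");
        }
        return 0;
}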
+ */ +STATIC void +xlog_verify_grant_tail( + struct log *log) +{ + int tail_cycle, tail_blocks; + int cycle, space; + + xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + if (cycle - 1 != tail_cycle && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: cycle - 1 != tail_cycle", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + + if (space > BBTOB(tail_blocks) && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: space > BBTOB(tail_blocks)", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + } +} + +/* check if it will fit */ +STATIC void +xlog_verify_tail_lsn(xlog_t *log, + xlog_in_core_t *iclog, + xfs_lsn_t tail_lsn) +{ + int blocks; + + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = + log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } else { + ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); + + blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; + if (blocks < BTOBB(iclog->ic_offset) + 1) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } +} /* xlog_verify_tail_lsn */ + +/* + * Perform a number of checks on the iclog before writing to disk. + * + * 1. Make sure the iclogs are still circular + * 2. Make sure we have a good magic number + * 3. Make sure we don't have magic numbers in the data + * 4. Check fields of each log operation header for: + * A. Valid client identifier + * B. tid ptr value falls in valid ptr space (user space code) + * C. Length in log record header is correct according to the + * individual operation headers within record. + * 5. When a bwrite will occur within 5 blocks of the front of the physical + * log, check the preceding blocks of the physical log to make sure all + * the cycle numbers agree with the current cycle number. 
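A user-space sketch of check 3 from the list above: scan each 512-byte block of a stand-in buffer for the log header magic number. The byte-order conversion the real check performs is omitted, and the buffer contents are fabricated.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BBSIZE                  512
#define XLOG_HEADER_MAGIC_NUM   0xFEEDbabe

int main(void)
{
        uint8_t buf[4 * BBSIZE];
        uint32_t magic = XLOG_HEADER_MAGIC_NUM;
        size_t off;

        memset(buf, 0, sizeof(buf));
        memcpy(&buf[2 * BBSIZE], &magic, sizeof(magic));        /* plant a bad block */

        for (off = BBSIZE; off < sizeof(buf); off += BBSIZE) {
                uint32_t word;

                memcpy(&word, &buf[off], sizeof(word));
                if (word == XLOG_HEADER_MAGIC_NUM)
                        printf("unexpected magic num in block %zu\n", off / BBSIZE);
        }
        return 0;
}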
+ */ +STATIC void +xlog_verify_iclog(xlog_t *log, + xlog_in_core_t *iclog, + int count, + boolean_t syncing) +{ + xlog_op_header_t *ophead; + xlog_in_core_t *icptr; + xlog_in_core_2_t *xhdr; + xfs_caddr_t ptr; + xfs_caddr_t base_ptr; + __psint_t field_offset; + __uint8_t clientid; + int len, i, j, k, op_len; + int idx; + + /* check validity of iclog pointers */ + spin_lock(&log->l_icloglock); + icptr = log->l_iclog; + for (i=0; i < log->l_iclog_bufs; i++) { + if (icptr == NULL) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); + icptr = icptr->ic_next; + } + if (icptr != log->l_iclog) + xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); + spin_unlock(&log->l_icloglock); + + /* check log magic numbers */ + if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) + xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); + + ptr = (xfs_caddr_t) &iclog->ic_header; + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; + ptr += BBSIZE) { + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + xfs_emerg(log->l_mp, "%s: unexpected magic num", + __func__); + } + + /* check fields */ + len = be32_to_cpu(iclog->ic_header.h_num_logops); + ptr = iclog->ic_datap; + base_ptr = ptr; + ophead = (xlog_op_header_t *)ptr; + xhdr = iclog->ic_data; + for (i = 0; i < len; i++) { + ophead = (xlog_op_header_t *)ptr; + + /* clientid is only 1 byte */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + if (syncing == B_FALSE || (field_offset & 0x1ff)) { + clientid = ophead->oh_clientid; + } else { + idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + clientid = xlog_get_client_id( + xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + clientid = xlog_get_client_id( + iclog->ic_header.h_cycle_data[idx]); + } + } + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + xfs_warn(log->l_mp, + "%s: invalid clientid %d op 0x%p offset 0x%lx", + __func__, clientid, ophead, + (unsigned long)field_offset); + + /* check length */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + if (syncing == B_FALSE || (field_offset & 0x1ff)) { + op_len = be32_to_cpu(ophead->oh_len); + } else { + idx = BTOBBT((__psint_t)&ophead->oh_len - + (__psint_t)iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); + } + } + ptr += sizeof(xlog_op_header_t) + op_len; + } +} /* xlog_verify_iclog */ +#endif + +/* + * Mark all iclogs IOERROR. l_icloglock is held by the caller. + */ +STATIC int +xlog_state_ioerror( + xlog_t *log) +{ + xlog_in_core_t *iclog, *ic; + + iclog = log->l_iclog; + if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Mark all the incore logs IOERROR. + * From now on, no log flushes will result. + */ + ic = iclog; + do { + ic->ic_state = XLOG_STATE_IOERROR; + ic = ic->ic_next; + } while (ic != iclog); + return 0; + } + /* + * Return non-zero, if state transition has already happened. + */ + return 1; +} + +/* + * This is called from xfs_force_shutdown, when we're forcibly + * shutting down the filesystem, typically because of an IO error. + * Our main objectives here are to make sure that: + * a. 
the filesystem gets marked 'SHUTDOWN' for all interested + * parties to find out, 'atomically'. + * b. those who're sleeping on log reservations, pinned objects and + * other resources get woken up, and be told the bad news. + * c. nothing new gets queued up after (a) and (b) are done. + * d. if !logerror, flush the iclogs to disk, then seal them off + * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. + */ +int +xfs_log_force_umount( + struct xfs_mount *mp, + int logerror) +{ + xlog_ticket_t *tic; + xlog_t *log; + int retval; + + log = mp->m_log; + + /* + * If this happens during log recovery, don't worry about + * locking; the log isn't open for business yet. + */ + if (!log || + log->l_flags & XLOG_ACTIVE_RECOVERY) { + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + return 0; + } + + /* + * Somebody could've already done the hard work for us. + * No need to get locks for this. + */ + if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + return 1; + } + retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_force(log); + + /* + * mark the filesystem and the as in a shutdown state and wake + * everybody up to tell them the bad news. + */ + spin_lock(&log->l_icloglock); + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + + /* + * This flag is sort of redundant because of the mount flag, but + * it's good to maintain the separation between the log and the rest + * of XFS. + */ + log->l_flags |= XLOG_IO_ERROR; + + /* + * If we hit a log error, we want to mark all the iclogs IOERROR + * while we're still holding the loglock. + */ + if (logerror) + retval = xlog_state_ioerror(log); + spin_unlock(&log->l_icloglock); + + /* + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. + */ + spin_lock(&log->l_grant_reserve_lock); + list_for_each_entry(tic, &log->l_reserveq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_reserve_lock); + + spin_lock(&log->l_grant_write_lock); + list_for_each_entry(tic, &log->l_writeq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_write_lock); + + if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(!logerror); + /* + * Force the incore logs to disk before shutting the + * log down completely. + */ + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + + spin_lock(&log->l_icloglock); + retval = xlog_state_ioerror(log); + spin_unlock(&log->l_icloglock); + } + /* + * Wake up everybody waiting on xfs_log_force. + * Callback all log item committed functions as if the + * log writes were completed. 
+ */ + xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + +#ifdef XFSERRORDEBUG + { + xlog_in_core_t *iclog; + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + ASSERT(iclog->ic_callback == 0); + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + spin_unlock(&log->l_icloglock); + } +#endif + /* return non-zero if log IOERROR transition had already happened */ + return retval; +} + +STATIC int +xlog_iclogs_empty(xlog_t *log) +{ + xlog_in_core_t *iclog; + + iclog = log->l_iclog; + do { + /* endianness does not matter here, zero is zero in + * any language. + */ + if (iclog->ic_header.h_num_logops) + return 0; + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + return 1; +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h new file mode 100644 index 00000000..78c90399 --- /dev/null +++ b/fs/xfs/xfs_log.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_H__ +#define __XFS_LOG_H__ + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +#ifdef __KERNEL__ +/* + * By comparing each component, we don't have to worry about extra + * endian issues in treating two 32 bit numbers as one 64 bit number + */ +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +{ + if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) + return (CYCLE_LSN(lsn1)l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. 
Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. 
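The flattening step described above can be sketched in user space as copying every region into a single allocation and re-pointing the iovec array at the copies; the types and region contents below are invented for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct region { void *addr; int len; };

int main(void)
{
        char a[] = "inode core", b[] = "data fork";
        struct region vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        int i, len = 0;
        char *buf, *ptr;

        for (i = 0; i < 2; i++)                 /* total flat buffer length */
                len += vec[i].len;

        buf = malloc(len);
        if (!buf)
                return 1;

        ptr = buf;
        for (i = 0; i < 2; i++) {               /* copy, then re-point each region */
                memcpy(ptr, vec[i].addr, vec[i].len);
                vec[i].addr = ptr;
                ptr += vec[i].len;
        }

        printf("flattened %d bytes; region 1 now at offset %td\n",
               len, (char *)vec[1].addr - buf);
        free(buf);
        return 0;
}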
+ */ +static void +xlog_cil_format_items( + struct log *log, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) { + void *ptr; + int index; + int len = 0; + + /* build the vector array and calculate it's length */ + IOP_FORMAT(lv->lv_item, lv->lv_iovecp); + for (index = 0; index < lv->lv_niovecs; index++) + len += lv->lv_iovecp[index].i_len; + + lv->lv_buf_len = len; + lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + ptr = lv->lv_buf; + + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + } +} + +/* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct log *log, + struct xfs_log_vec *lv, + int *len, + int *diff_iovecs) +{ + struct xfs_log_vec *old = lv->lv_item->li_lv; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&lv->lv_item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + *len += lv->lv_buf_len - old->lv_buf_len; + *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&lv->lv_item->li_cil)); + + *len += lv->lv_buf_len; + *diff_iovecs += lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* + * Insert the log items into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + */ +static void +xlog_cil_insert_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + struct xfs_log_vec *lv; + int len = 0; + int diff_iovecs = 0; + int iclog_space; + + ASSERT(log_vector); + + /* + * Do all the accounting aggregation and switching of log vectors + * around in a separate loop to the insertion of items into the CIL. + * Then we can do a separate loop to update the CIL within a single + * lock/unlock pair. This reduces the number of round trips on the CIL + * lock from O(nr_logvectors) to O(1) and greatly reduces the overall + * hold time for the transaction commit. + * + * If this is the first time the item is being placed into the CIL in + * this context, pin it so it can't be written to disk until the CIL is + * flushed to the iclog and the iclog written to disk. + * + * We can do this safely because the context can't checkpoint until we + * are done so it doesn't matter exactly how we update the CIL. 
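A back-of-the-envelope version of the "do we need more log record headers?" test made below: when a commit pushes the checkpoint across an iclog boundary, one LR header plus one op header is charged per record crossed. The sizes are stand-ins for the real iclog geometry.

#include <stdio.h>

#define ICLOG_SIZE      (32 * 1024)
#define ICLOG_HSIZE     512
#define OPHDR_SZ        12

int main(void)
{
        int iclog_space = ICLOG_SIZE - ICLOG_HSIZE;
        int space_used = 30000;         /* bytes already in this checkpoint */
        int len = 5000;                 /* bytes this commit adds */
        int hdrs = 0;

        if (len > 0 &&
            space_used / iclog_space != (space_used + len) / iclog_space) {
                hdrs = (len + iclog_space - 1) / iclog_space;
                hdrs *= ICLOG_HSIZE + OPHDR_SZ;
        }
        printf("charge %d extra header bytes to the checkpoint ticket\n", hdrs);
        return 0;
}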
+ */ + for (lv = log_vector; lv; lv = lv->lv_next) + xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); + + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); + + spin_lock(&cil->xc_cil_lock); + + /* move the items to the tail of the CIL */ + for (lv = log_vector; lv; lv = lv->lv_next) + list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); + + ctx->nvecs += diff_iovecs; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv->lv_buf); + kmem_free(lv); + lv = next; + } +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); + + xfs_alloc_busy_sort(&ctx->busy_extents); + xfs_alloc_busy_clear(mp, &ctx->busy_extents, + (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + + if (!list_empty(&ctx->busy_extents)) { + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + xfs_discard_extents(mp, &ctx->busy_extents); + xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + } + + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. 
If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. + */ +STATIC int +xlog_cil_push( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* + * Lock out transaction commit, but don't block for background pushes + * unless we are well over the CIL space limit. See the definition of + * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic + * used here. + */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_seq && + cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* check for a previously pushed seqeunce */ + if (push_seq && push_seq < cil->xc_ctx->sequence) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * mirror the new sequence into the cil structure so that we can do + * unlocked checks against the current sequence in log forces without + * risking deferencing a freed context pointer. + */ + cil->xc_current_sequence = new_ctx->sequence; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. 
due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = &thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort_free_ticket; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + /* xfs_log_done always frees the ticket on error. */ + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort_free_ticket: + xfs_log_ticket_put(tic); +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Commit a transaction with the given vector to the Committed Item List. 
+ * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +void +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + /* + * do all the hard work of formatting items (including memory + * allocation) outside the CIL context lock. This prevents stalling CIL + * pushes when we are low on memory and a transaction commit spends a + * lot of time in memory reclaim. + */ + xlog_cil_format_items(log, log_vector); + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + if (commit_lsn) + *commit_lsn = log->l_cilp->xc_ctx->sequence; + + xlog_cil_insert_items(log, log_vector, tp->t_ticket); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. 
+ * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct log *log, + xfs_lsn_t sequence) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ + if (sequence == cil->xc_current_sequence) + xlog_cil_push(log, sequence); + + /* + * See if we can find a previous sequence still committing. + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > sequence) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + goto restart; + } + if (ctx->sequence != sequence) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h new file mode 100644 index 00000000..2d3b6a49 --- /dev/null +++ b/fs/xfs/xfs_log_priv.h @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct log; +struct xlog_ticket; +struct xfs_mount; + +/* + * Macros, structures, prototypes for internal log manager use. 
+ */ + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + +#ifdef __KERNEL__ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. + */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ +#endif /* __KERNEL__ */ + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. 
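A quick user-space demonstration of the bit-packing helpers defined earlier in this header, xlog_assign_lsn() and xlog_get_client_id(): the cycle lives in the high 32 bits of an LSN and the block in the low 32, and the client id sits in the top byte of the packed op-header word. Endianness handling is omitted and the client id value here is made up.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
{
        return ((uint64_t)cycle << 32) | block;         /* same shape as xlog_assign_lsn() */
}

int main(void)
{
        uint64_t lsn = assign_lsn(7, 4242);
        uint32_t packed_ophdr = 0x69000000u;            /* hypothetical client id 0x69 in the top byte */

        printf("lsn=0x%" PRIx64 " cycle=%" PRIu32 " block=%" PRIu32 " clientid=0x%" PRIx32 "\n",
               lsn, (uint32_t)(lsn >> 32), (uint32_t)lsn, packed_ophdr >> 24);
        return 0;
}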
+ * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + +#ifdef __KERNEL__ +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +#endif /* __KERNEL__ */ + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* + * Flags for log structure + */ +#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +typedef __uint32_t xlog_tid_t; + +#ifdef __KERNEL__ +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). 
+ * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. + */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + wait_queue_head_t t_wait; /* ticket wait queue */ + struct list_head t_queue; /* reserve/write queue */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + uint t_trans_type; /* transaction type : 4 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +#endif + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} 
xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +#ifdef __KERNEL__ + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct log *ic_log; + int ic_size; + int ic_offset; + int ic_bwritecnt; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + xfs_log_callback_t *ic_callback; + xfs_log_callback_t **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. 
*/ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct log *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + struct xfs_cil_ctx *xc_ctx; + struct rw_semaphore xc_ctx_lock; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; +}; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. 
+ * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. + */ +typedef struct log { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + /* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. 
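To put numbers on the two limits defined just above: XLOG_CIL_SPACE_LIMIT is one eighth of the log and XLOG_CIL_HARD_SPACE_LIMIT is three sixteenths, so a hypothetical 128 MB log would start background CIL pushes at roughly 16 MB of aggregated checkpoint space and enforce a push at roughly 24 MB. A minimal user-space sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
        long l_logsize   = 128L * 1024 * 1024;      /* assume a 128 MB log */
        long background  = l_logsize >> 3;          /* XLOG_CIL_SPACE_LIMIT */
        long hard        = 3 * (l_logsize >> 4);    /* XLOG_CIL_HARD_SPACE_LIMIT */

        printf("background push at %ld MB, hard limit at %ld MB\n",
               background >> 20, hard >> 20);       /* 16 MB and 24 MB */
        return 0;
}
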
+ */ + spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; + struct list_head l_reserveq; + atomic64_t l_grant_reserve_head; + + spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; + struct list_head l_writeq; + atomic64_t l_grant_write_head; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif + +} xlog_t; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +extern int xlog_recover(xlog_t *log); +extern int xlog_recover_finish(xlog_t *log); +extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +/* + * CIL force routines + */ +xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct log *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. 
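xlog_write_adv_cnt() above keeps a (pointer, remaining length, offset) triple in lock step as regions are copied into an iclog. The stand-alone sketch below reproduces the same bookkeeping against a local buffer; the kernel version relies on the GCC extension of arithmetic on void pointers, which is spelled out explicitly here.

#include <stdio.h>
#include <stddef.h>

static void adv_cnt(void **ptr, int *len, int *off, size_t bytes)
{
        *ptr = (char *)*ptr + bytes;    /* void-pointer arithmetic made explicit */
        *len -= bytes;
        *off += bytes;
}

int main(void)
{
        char buf[64];
        void *p  = buf;
        int len  = sizeof(buf);
        int off  = 0;

        adv_cnt(&p, &len, &off, 16);    /* account for a 16-byte region copy */
        printf("remaining=%d offset=%d\n", len, off);   /* remaining=48 offset=16 */
        return 0;
}
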
+ */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} +#endif /* __KERNEL__ */ + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c new file mode 100644 index 00000000..b75fd67c --- /dev/null +++ b/fs/xfs/xfs_log_recover.c @@ -0,0 +1,3843 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_log_recover.h" +#include "xfs_extfree_item.h" +#include "xfs_trans_priv.h" +#include "xfs_quota.h" +#include "xfs_rw.h" +#include "xfs_utils.h" +#include "xfs_trace.h" + +STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); +STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +#if defined(DEBUG) +STATIC void xlog_recover_check_summary(xlog_t *); +#else +#define xlog_recover_check_summary(log) +#endif + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* + * Sector aligned buffer routines for buffer create/read/write/access + */ + +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ + +static inline int +xlog_buf_bbcount_valid( + xlog_t *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. 
+ */ +STATIC xfs_buf_t * +xlog_get_bp( + xlog_t *log, + int nbblks) +{ + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; + } + + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to accommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accommodate this possibility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + + return xfs_buf_get_uncached(log->l_mp->m_logdev_targp, + BBTOB(nbblks), 0); +} + +STATIC void +xlog_put_bp( + xfs_buf_t *bp) +{ + xfs_buf_free(bp); +} + +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ +STATIC xfs_caddr_t +xlog_align( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); + + ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + return XFS_BUF_PTR(bp) + BBTOB(offset); +} + + +/* + * nbblks should be uint, but oh well. Just want to catch that 32-bit length. + */ +STATIC int +xlog_bread_noalign( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + + xfsbdstrat(log->l_mp, bp); + error = xfs_buf_iowait(bp); + if (error) + xfs_ioerror_alert("xlog_bread", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; +} + +STATIC int +xlog_bread( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + +/* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. 
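The sizing logic in xlog_get_bp() above pads any multi-block request by one extra log sector, to absorb a non-sector-aligned starting block, and then rounds the total up to whole sectors. The sketch below reproduces just that arithmetic in user space; round_up() here is a simple stand-in for the kernel helper, and the example assumes a 4 KB log sector (8 basic blocks).

#include <stdio.h>

/* simple stand-in for the kernel's round_up() */
#define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

static int bp_size_bblks(int nbblks, int sect_bbsize)
{
        if (nbblks > 1 && sect_bbsize > 1)
                nbblks += sect_bbsize;          /* room for an unaligned start */
        return round_up(nbblks, sect_bbsize);
}

int main(void)
{
        /* a 5-block read on a log with 4 KB sectors (8 basic blocks each) */
        printf("%d\n", bp_size_bblks(5, 8));    /* prints 16 */
        return 0;
}
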
+ */ +STATIC int +xlog_bread_offset( + xlog_t *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + xfs_buf_t *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + int orig_len = bp->b_buffer_length; + int error, error2; + + error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* + * Write out the buffer at the given block for the given number of blocks. + * The buffer is kept locked across the write and is returned locked. + * This can only be used for synchronous log writes. + */ +STATIC int +xlog_bwrite( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_HOLD(bp); + XFS_BUF_PSEMA(bp, PRIBIO); + XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + + if ((error = xfs_bwrite(log->l_mp, bp))) + xfs_ioerror_alert("xlog_bwrite", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; +} + +#ifdef DEBUG +/* + * dump debug superblock and log record information + */ +STATIC void +xlog_header_check_dump( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); +} +#else +#define xlog_header_check_dump(mp, head) +#endif + +/* + * check log record header for recovery + */ +STATIC int +xlog_header_check_recover( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + + /* + * IRIX doesn't write the h_fmt field and leaves it zeroed + * (XLOG_FMT_UNKNOWN). This stops us from trying to recover + * a dirty log created in IRIX. + */ + if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(1)", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(2)", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * read the head block of the log and check the header + */ +STATIC int +xlog_header_check_mount( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + + if (uuid_is_nil(&head->h_fs_uuid)) { + /* + * IRIX doesn't write the h_fs_uuid or h_fmt fields. If + * h_fs_uuid is nil, we assume this log was last mounted + * by IRIX and continue. 
+ */ + xfs_warn(mp, "nil uuid in log - IRIX style log"); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, "log has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_mount", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +STATIC void +xlog_recover_iodone( + struct xfs_buf *bp) +{ + if (XFS_BUF_GETERROR(bp)) { + /* + * We're not going to bother about retrying + * this during recovery. One strike! + */ + xfs_ioerror_alert("xlog_recover_iodone", + bp->b_target->bt_mount, bp, + XFS_BUF_ADDR(bp)); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_buf_ioend(bp, 0); +} + +/* + * This routine finds (to an approximation) the first block in the physical + * log which contains the given cycle. It uses a binary search algorithm. + * Note that the algorithm can not be perfect because the disk will not + * necessarily be perfect. + */ +STATIC int +xlog_find_cycle_start( + xlog_t *log, + xfs_buf_t *bp, + xfs_daddr_t first_blk, + xfs_daddr_t *last_blk, + uint cycle) +{ + xfs_caddr_t offset; + xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; + uint mid_cycle; + int error; + + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) + return error; + mid_cycle = xlog_get_cycle(offset); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); + } + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; + + return 0; +} + +/* + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. + */ +STATIC int +xlog_find_verify_cycle( + xlog_t *log, + xfs_daddr_t start_blk, + int nbblks, + uint stop_on_cycle_no, + xfs_daddr_t *new_blk) +{ + xfs_daddr_t i, j; + uint cycle; + xfs_buf_t *bp; + xfs_daddr_t bufblks; + xfs_caddr_t buf = NULL; + int error = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ + bufblks = 1 << ffs(nbblks); + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < log->l_sectBBsize) + return ENOMEM; + } + + for (i = start_blk; i < start_blk + nbblks; i += bufblks) { + int bcount; + + bcount = min(bufblks, (start_blk + nbblks - i)); + + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) + goto out; + + for (j = 0; j < bcount; j++) { + cycle = xlog_get_cycle(buf); + if (cycle == stop_on_cycle_no) { + *new_blk = i+j; + goto out; + } + + buf += BBSIZE; + } + } + + *new_blk = -1; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Potentially backup over partial log record write. + * + * In the typical case, last_blk is the number of the block directly after + * a good log record. 
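xlog_find_cycle_start() above narrows [first_blk, last_blk] until the two are adjacent, leaving last_blk at the first block stamped with the target cycle. A user-space model of the same bisection, with an array of cycle numbers standing in for single-block log reads (the array contents are made up for the example):

#include <stdio.h>

/* model of the log: one cycle number per basic block */
static const unsigned cycles[] = { 2, 2, 2, 2, 1, 1, 1, 1 };

static int find_cycle_start(int first, int last, unsigned cycle)
{
        while (last - first > 1) {
                int mid = (first + last) >> 1;  /* BLK_AVG() */

                if (cycles[mid] == cycle)
                        last = mid;     /* target cycle starts at or before mid */
                else
                        first = mid;    /* still inside the newer cycle */
        }
        return last;
}

int main(void)
{
        /* find the first block stamped with the older cycle 1 */
        printf("%d\n", find_cycle_start(0, 7, 1));      /* prints 4 */
        return 0;
}
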
Therefore, we subtract one to get the block number + * of the last block in the given buffer. extra_bblks contains the number + * of blocks we would have read on a previous read. This happens when the + * last log record is split over the end of the physical log. + * + * extra_bblks is the number of blocks potentially verified on a previous + * call to this routine. + */ +STATIC int +xlog_find_verify_log_record( + xlog_t *log, + xfs_daddr_t start_blk, + xfs_daddr_t *last_blk, + int extra_bblks) +{ + xfs_daddr_t i; + xfs_buf_t *bp; + xfs_caddr_t offset = NULL; + xlog_rec_header_t *head = NULL; + int error = 0; + int smallmem = 0; + int num_blks = *last_blk - start_blk; + int xhdrs; + + ASSERT(start_blk != 0 || *last_blk != start_blk); + + if (!(bp = xlog_get_bp(log, num_blks))) { + if (!(bp = xlog_get_bp(log, 1))) + return ENOMEM; + smallmem = 1; + } else { + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) + goto out; + offset += ((num_blks - 1) << BBSHIFT); + } + + for (i = (*last_blk) - 1; i >= 0; i--) { + if (i < start_blk) { + /* valid log record not found */ + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); + ASSERT(0); + error = XFS_ERROR(EIO); + goto out; + } + + if (smallmem) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out; + } + + head = (xlog_rec_header_t *)offset; + + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) + break; + + if (!smallmem) + offset -= BBSIZE; + } + + /* + * We hit the beginning of the physical log & still no header. Return + * to caller. If caller can handle a return of -1, then this routine + * will be called again for the end of the physical log. + */ + if (i == -1) { + error = -1; + goto out; + } + + /* + * We have the final block of the good log (the first block + * of the log record _before_ the head. So we check the uuid. + */ + if ((error = xlog_header_check_mount(log->l_mp, head))) + goto out; + + /* + * We may have found a log record header before we expected one. + * last_blk will be the 1st block # with a given cycle #. We may end + * up reading an entire log record. In this case, we don't want to + * reset last_blk. Only when last_blk points in the middle of a log + * record do we update last_blk. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + uint h_size = be32_to_cpu(head->h_size); + + xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + } else { + xhdrs = 1; + } + + if (*last_blk - i + extra_bblks != + BTOBB(be32_to_cpu(head->h_len)) + xhdrs) + *last_blk = i; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Head is defined to be the point of the log where the next log write + * write could go. This means that incomplete LR writes at the end are + * eliminated when calculating the head. We aren't guaranteed that previous + * LR have complete transactions. We only know that a cycle number of + * current cycle number -1 won't be present in the log if we start writing + * from our current block number. + * + * last_blk contains the block number of the first block with a given + * cycle number. + * + * Return: zero if normal, non-zero if error. + */ +STATIC int +xlog_find_head( + xlog_t *log, + xfs_daddr_t *return_head_blk) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; + int num_scan_bblks; + uint first_half_cycle, last_half_cycle; + uint stop_on_cycle; + int error, log_bbnum = log->l_logBBsize; + + /* Is the end of the log device zeroed? 
*/ + if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { + *return_head_blk = first_blk; + + /* Is the whole lot zeroed? */ + if (!first_blk) { + /* Linux XFS shouldn't generate totally zeroed logs - + * mkfs etc write a dummy unmount record to a fresh + * log so we can store the uuid in there + */ + xfs_warn(log->l_mp, "totally zeroed log"); + } + + return 0; + } else if (error) { + xfs_warn(log->l_mp, "empty log check failed"); + return error; + } + + first_blk = 0; /* get cycle # of 1st block */ + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_half_cycle = xlog_get_cycle(offset); + + last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) + goto bp_err; + + last_half_cycle = xlog_get_cycle(offset); + ASSERT(last_half_cycle != 0); + + /* + * If the 1st half cycle number is equal to the last half cycle number, + * then the entire log is stamped with the same cycle number. In this + * case, head_blk can't be set to zero (which makes sense). The below + * math doesn't work out properly with head_blk equal to zero. Instead, + * we set it to log_bbnum which is an invalid block number, but this + * value makes the math correct. If head_blk doesn't changed through + * all the tests below, *head_blk is set to zero at the very end rather + * than log_bbnum. In a sense, log_bbnum and zero are the same block + * in a circular file. + */ + if (first_half_cycle == last_half_cycle) { + /* + * In this case we believe that the entire log should have + * cycle number last_half_cycle. We need to scan backwards + * from the end verifying that there are no holes still + * containing last_half_cycle - 1. If we find such a hole, + * then the start of that hole will be the new head. The + * simple case looks like + * x | x ... | x - 1 | x + * Another case that fits this picture would be + * x | x + 1 | x ... | x + * In this case the head really is somewhere at the end of the + * log, as one of the latest writes at the beginning was + * incomplete. + * One more case is + * x | x + 1 | x ... | x - 1 | x + * This is really the combination of the above two cases, and + * the head has to end up at the start of the x-1 hole at the + * end of the log. + * + * In the 256k log case, we will read from the beginning to the + * end of the log and search for cycle numbers equal to x-1. + * We don't worry about the x+1 blocks that we encounter, + * because we know that they cannot be the head since the log + * started with x. + */ + head_blk = log_bbnum; + stop_on_cycle = last_half_cycle - 1; + } else { + /* + * In this case we want to find the first block with cycle + * number matching last_half_cycle. We expect the log to be + * some variation on + * x + 1 ... | x ... | x + * The first block with cycle number x (last_half_cycle) will + * be where the new head belongs. First we do a binary search + * for the first occurrence of last_half_cycle. The binary + * search may not be totally accurate, so then we scan back + * from there looking for occurrences of last_half_cycle before + * us. If that backwards scan wraps around the beginning of + * the log, then we look for occurrences of last_half_cycle - 1 + * at the end of the log. The cases we're looking for look + * like + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot + * or + * <---------> less than scan distance + * x + 1 ... | x ... 
| x - 1 | x + * ^ we want to locate this spot + */ + stop_on_cycle = last_half_cycle; + if ((error = xlog_find_cycle_start(log, bp, first_blk, + &head_blk, last_half_cycle))) + goto bp_err; + } + + /* + * Now validate the answer. Scan back some number of maximum possible + * blocks and make sure each one has the expected cycle number. The + * maximum is determined by the total possible amount of buffering + * in the in-core log. The following number can be made tighter if + * we actually look at the block size of the filesystem. + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + /* + * We are guaranteed that the entire check can be performed + * in one buffer. + */ + start_blk = head_blk - num_scan_bblks; + if ((error = xlog_find_verify_cycle(log, + start_blk, num_scan_bblks, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } else { /* need to read 2 parts of log */ + /* + * We are going to scan backwards in the log in two parts. + * First we scan the physical end of the log. In this part + * of the log, we are looking for blocks with cycle number + * last_half_cycle - 1. + * If we find one, then we know that the log starts there, as + * we've found a hole that didn't get written in going around + * the end of the physical log. The simple case for this is + * x + 1 ... | x ... | x - 1 | x + * <---------> less than scan distance + * If all of the blocks at the end of the log have cycle number + * last_half_cycle, then we check the blocks at the start of + * the log looking for occurrences of last_half_cycle. If we + * find one, then our current estimate for the location of the + * first occurrence of last_half_cycle is wrong and we move + * back to the hole we've found. This case looks like + * x + 1 ... | x | x + 1 | x ... + * ^ binary search stopped here + * Another case we need to handle that only occurs in 256k + * logs is + * x + 1 ... | x ... | x+1 | x ... + * ^ binary search stops here + * In a 256k log, the scan at the end of the log will see the + * x + 1 blocks. We need to skip past those since that is + * certainly not the head of the log. By searching for + * last_half_cycle-1 we accomplish that. + */ + ASSERT(head_blk <= INT_MAX && + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); + if ((error = xlog_find_verify_cycle(log, start_blk, + num_scan_bblks - (int)head_blk, + (stop_on_cycle - 1), &new_blk))) + goto bp_err; + if (new_blk != -1) { + head_blk = new_blk; + goto validate_head; + } + + /* + * Scan beginning of log now. The last part of the physical + * log is good. This scan needs to verify that it doesn't find + * the last_half_cycle. + */ + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_cycle(log, + start_blk, (int)head_blk, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } + +validate_head: + /* + * Now we need to make sure head_blk is not pointing to a block in + * the middle of a log record. 
+ */ + num_scan_bblks = XLOG_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ + + /* start ptr at last block ptr before head_blk */ + if ((error = xlog_find_verify_log_record(log, start_blk, + &head_blk, 0)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + } else { + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_log_record(log, start_blk, + &head_blk, 0)) == -1) { + /* We hit the beginning of the log during our search */ + start_blk = log_bbnum - (num_scan_bblks - head_blk); + new_blk = log_bbnum; + ASSERT(start_blk <= INT_MAX && + (xfs_daddr_t) log_bbnum-start_blk >= 0); + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_log_record(log, + start_blk, &new_blk, + (int)head_blk)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + if (new_blk != log_bbnum) + head_blk = new_blk; + } else if (error) + goto bp_err; + } + + xlog_put_bp(bp); + if (head_blk == log_bbnum) + *return_head_blk = 0; + else + *return_head_blk = head_blk; + /* + * When returning here, we have a good block number. Bad block + * means that during a previous crash, we didn't have a clean break + * from cycle number N to cycle number N-1. In this case, we need + * to find the first block with cycle number N-1. + */ + return 0; + + bp_err: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to find log head"); + return error; +} + +/* + * Find the sync block number or the tail of the log. + * + * This will be the block number of the last record to have its + * associated buffers synced to disk. Every log record header has + * a sync lsn embedded in it. LSNs hold block numbers, so it is easy + * to get a sync block number. The only concern is to figure out which + * log record header to believe. + * + * The following algorithm uses the log record header with the largest + * lsn. The entire log record does not need to be valid. We only care + * that the header is valid. + * + * We could speed up search by using current head_blk buffer, but it is not + * available. + */ +STATIC int +xlog_find_tail( + xlog_t *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk) +{ + xlog_rec_header_t *rhead; + xlog_op_header_t *op_head; + xfs_caddr_t offset = NULL; + xfs_buf_t *bp; + int error, i, found; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + xfs_lsn_t tail_lsn; + int hblks; + + found = 0; + + /* + * Find previous log record + */ + if ((error = xlog_find_head(log, head_blk))) + return error; + + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + if (*head_blk == 0) { /* special case */ + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto done; + + if (xlog_get_cycle(offset) == 0) { + *tail_blk = 0; + /* leave all other log inited values alone */ + goto done; + } + } + + /* + * Search backwards looking for log record header block + */ + ASSERT(*head_blk < INT_MAX); + for (i = (int)(*head_blk) - 1; i >= 0; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { + found = 1; + break; + } + } + /* + * If we haven't found the log record header block, start looking + * again from the end of the physical log. XXXmiken: There should be + * a check here to make sure we didn't search more than N blocks in + * the previous code. 
+ */ + if (!found) { + for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (XLOG_HEADER_MAGIC_NUM == + be32_to_cpu(*(__be32 *)offset)) { + found = 2; + break; + } + } + } + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } + + /* find blk_no of tail of log */ + rhead = (xlog_rec_header_t *)offset; + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = i; + log->l_curr_block = (int)*head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (found == 2) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + + /* + * Look for unmount record. If we find it, then we know there + * was a clean unmount. Since 'i' could be the last block in + * the physical log, we convert to a log block before comparing + * to the head_blk. + * + * Save the current tail lsn to use to pass to + * xlog_clear_stale_blocks() below. We won't want to clear the + * unmount record if there is one, so we pass the lsn of the + * unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = (i + hblks + (int) + BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + tail_lsn = atomic64_read(&log->l_tail_lsn); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = (i + hblks) % log->l_logBBsize; + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + goto done; + + op_head = (xlog_op_header_t *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written + * log records will point recovery to after the + * current unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + } + } + + /* + * Make sure that there are no blocks in front of the head + * with the same cycle number as the head. 
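The hblks computation above determines how many 512-byte header sectors a log record carries: a version-2 log whose iclog size (h_size) exceeds XLOG_HEADER_CYCLE_SIZE needs one header sector per 32 KB of cycle data, rounded up. A stand-alone check of that arithmetic for a hypothetical 256 KB iclog:

#include <stdio.h>

#define XLOG_HEADER_CYCLE_SIZE  (32 * 1024)

static int rec_header_blocks(int v2, int h_size)
{
        int hblks = 1;

        if (v2 && h_size > XLOG_HEADER_CYCLE_SIZE) {
                hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
                if (h_size % XLOG_HEADER_CYCLE_SIZE)
                        hblks++;
        }
        return hblks;
}

int main(void)
{
        /* a 256 KB iclog on a v2 log carries 8 header sectors of cycle data */
        printf("%d\n", rec_header_blocks(1, 256 * 1024));       /* prints 8 */
        return 0;
}
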
This can happen + * because we allow multiple outstanding log writes concurrently, + * and the later writes might make it out before earlier ones. + * + * We use the lsn from before modifying it so that we'll never + * overwrite the unmount record after a clean unmount. + * + * Do this only if we are going to recover the filesystem + * + * NOTE: This used to say "if (!readonly)" + * However on Linux, we can & do recover a read-only filesystem. + * We only skip recovery if NORECOVERY is specified on mount, + * in which case we would not be here. + * + * But... if the -device- itself is readonly, just skip this. + * We can't recover this device anyway, so it won't matter. + */ + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) + error = xlog_clear_stale_blocks(log, tail_lsn); + +done: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to locate log tail"); + return error; +} + +/* + * Is the log zeroed at all? + * + * The last binary search should be changed to perform an X block read + * once X becomes small enough. You can then search linearly through + * the X blocks. This will cut down on the number of reads we need to do. + * + * If the log is partially zeroed, this routine will pass back the blkno + * of the first block with cycle number 0. It won't have a complete LR + * preceding it. + * + * Return: + * 0 => the log is completely written to + * -1 => use *blk_no as the first block of the log + * >0 => error has occurred + */ +STATIC int +xlog_find_zeroed( + xlog_t *log, + xfs_daddr_t *blk_no) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + uint first_cycle, last_cycle; + xfs_daddr_t new_blk, last_blk, start_blk; + xfs_daddr_t num_scan_bblks; + int error, log_bbnum = log->l_logBBsize; + + *blk_no = 0; + + /* check totally zeroed log */ + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_cycle = xlog_get_cycle(offset); + if (first_cycle == 0) { /* completely zeroed log */ + *blk_no = 0; + xlog_put_bp(bp); + return -1; + } + + /* check partially zeroed log */ + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) + goto bp_err; + + last_cycle = xlog_get_cycle(offset); + if (last_cycle != 0) { /* log completely written to */ + xlog_put_bp(bp); + return 0; + } else if (first_cycle != 1) { + /* + * If the cycle of the last block is zero, the cycle of + * the first block must be 1. If it's not, maybe we're + * not looking at a log... Bail out. + */ + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + return XFS_ERROR(EINVAL); + } + + /* we have a partially zeroed log */ + last_blk = log_bbnum-1; + if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) + goto bp_err; + + /* + * Validate the answer. Because there is no way to guarantee that + * the entire log is made up of log records which are the same size, + * we scan over the defined maximum blocks. At this point, the maximum + * is not chosen to mean anything special. XXXmiken + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + ASSERT(num_scan_bblks <= INT_MAX); + + if (last_blk < num_scan_bblks) + num_scan_bblks = last_blk; + start_blk = last_blk - num_scan_bblks; + + /* + * We search for any instances of cycle number 0 that occur before + * our current estimate of the head. What we're trying to detect is + * 1 ... | 0 | 1 | 0... 
+ * ^ binary search ends here + */ + if ((error = xlog_find_verify_cycle(log, start_blk, + (int)num_scan_bblks, 0, &new_blk))) + goto bp_err; + if (new_blk != -1) + last_blk = new_blk; + + /* + * Potentially backup over partial log record write. We don't need + * to search the end of the log because we know it is zero. + */ + if ((error = xlog_find_verify_log_record(log, start_blk, + &last_blk, 0)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + + *blk_no = last_blk; +bp_err: + xlog_put_bp(bp); + if (error) + return error; + return -1; +} + +/* + * These are simple subroutines used by xlog_clear_stale_blocks() below + * to initialize a buffer full of empty log record headers and write + * them into the log. + */ +STATIC void +xlog_add_record( + xlog_t *log, + xfs_caddr_t buf, + int cycle, + int block, + int tail_cycle, + int tail_block) +{ + xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + + memset(buf, 0, BBSIZE); + recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + recp->h_cycle = cpu_to_be32(cycle); + recp->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); + recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); + recp->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); +} + +STATIC int +xlog_write_log_records( + xlog_t *log, + int cycle, + int start_block, + int blocks, + int tail_cycle, + int tail_block) +{ + xfs_caddr_t offset; + xfs_buf_t *bp; + int balign, ealign; + int sectbb = log->l_sectBBsize; + int end_block = start_block + blocks; + int bufblks; + int error = 0; + int i, j = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ + bufblks = 1 << ffs(blocks); + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < sectbb) + return ENOMEM; + } + + /* We may need to do a read at the start to fill in part of + * the buffer in the starting sector not covered by the first + * write below. + */ + balign = round_down(start_block, sectbb); + if (balign != start_block) { + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + + j = start_block - balign; + } + + for (i = start_block; i < end_block; i += bufblks) { + int bcount, endcount; + + bcount = min(bufblks, end_block - start_block); + endcount = bcount - j; + + /* We may need to do a read at the end to fill in part of + * the buffer in the final sector not covered by the write. + * If this is the same sector as the above read, skip it. + */ + ealign = round_down(end_block, sectbb); + if (j == 0 && (start_block + endcount > ealign)) { + offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); + if (error) + break; + + } + + offset = xlog_align(log, start_block, endcount, bp); + for (; j < endcount; j++) { + xlog_add_record(log, offset, cycle, i+j, + tail_cycle, tail_block); + offset += BBSIZE; + } + error = xlog_bwrite(log, start_block, endcount, bp); + if (error) + break; + start_block += endcount; + j = 0; + } + + out_put_bp: + xlog_put_bp(bp); + return error; +} + +/* + * This routine is called to blow away any incomplete log writes out + * in front of the log head. 
We do this so that we won't become confused + * if we come up, write only a little bit more, and then crash again. + * If we leave the partial log records out there, this situation could + * cause us to think those partial writes are valid blocks since they + * have the current cycle number. We get rid of them by overwriting them + * with empty log records with the old cycle number rather than the + * current one. + * + * The tail lsn is passed in rather than taken from + * the log so that we will not write over the unmount record after a + * clean unmount in a 512 block log. Doing so would leave the log without + * any valid log records in it until a new one was written. If we crashed + * during that time we would not be able to recover. + */ +STATIC int +xlog_clear_stale_blocks( + xlog_t *log, + xfs_lsn_t tail_lsn) +{ + int tail_cycle, head_cycle; + int tail_block, head_block; + int tail_distance, max_distance; + int distance; + int error; + + tail_cycle = CYCLE_LSN(tail_lsn); + tail_block = BLOCK_LSN(tail_lsn); + head_cycle = log->l_curr_cycle; + head_block = log->l_curr_block; + + /* + * Figure out the distance between the new head of the log + * and the tail. We want to write over any blocks beyond the + * head that we may have written just before the crash, but + * we don't want to overwrite the tail of the log. + */ + if (head_cycle == tail_cycle) { + /* + * The tail is behind the head in the physical log, + * so the distance from the head to the tail is the + * distance from the head to the end of the log plus + * the distance from the beginning of the log to the + * tail. + */ + if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { + XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + tail_distance = tail_block + (log->l_logBBsize - head_block); + } else { + /* + * The head is behind the tail in the physical log, + * so the distance from the head to the tail is just + * the tail block minus the head block. + */ + if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ + XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + tail_distance = tail_block - head_block; + } + + /* + * If the head is right up against the tail, we can't clear + * anything. + */ + if (tail_distance <= 0) { + ASSERT(tail_distance == 0); + return 0; + } + + max_distance = XLOG_TOTAL_REC_SHIFT(log); + /* + * Take the smaller of the maximum amount of outstanding I/O + * we could have and the distance to the tail to clear out. + * We take the smaller so that we don't overwrite the tail and + * we don't waste all day writing from the head to the tail + * for no reason. + */ + max_distance = MIN(max_distance, tail_distance); + + if ((head_block + max_distance) <= log->l_logBBsize) { + /* + * We can stomp all the blocks we need to without + * wrapping around the end of the log. Just do it + * in a single write. Use the cycle number of the + * current cycle minus one so that the log will look like: + * n ... | n - 1 ... + */ + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, max_distance, tail_cycle, + tail_block); + if (error) + return error; + } else { + /* + * We need to wrap around the end of the physical log in + * order to clear all the blocks. Do it in two separate + * I/Os. The first write should be from the head to the + * end of the physical log, and it should use the current + * cycle number minus one just like above. 
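The distance calculation in xlog_clear_stale_blocks() above takes one of two shapes: when head and tail share a cycle number the tail is physically behind the head, so the clearable span wraps from the head through the end of the log and on to the tail; otherwise it is simply tail_block - head_block. The sketch below models just that arithmetic with made-up block numbers and omits the corruption checks:

#include <stdio.h>

static int tail_distance(int log_bbsize, int head_cycle, int head_block,
                         int tail_cycle, int tail_block)
{
        if (head_cycle == tail_cycle)
                /* tail physically behind the head: wrap through the log end */
                return tail_block + (log_bbsize - head_block);

        /* head_cycle == tail_cycle + 1: head physically behind the tail */
        return tail_block - head_block;
}

int main(void)
{
        /* 1000-block log, head at block 900, tail at block 100, same cycle */
        printf("%d\n", tail_distance(1000, 5, 900, 5, 100));    /* prints 200 */
        return 0;
}
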
+ */ + distance = log->l_logBBsize - head_block; + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, distance, tail_cycle, + tail_block); + + if (error) + return error; + + /* + * Now write the blocks at the start of the physical log. + * This writes the remainder of the blocks we want to clear. + * It uses the current cycle number since we're now on the + * same cycle as the head so that we get: + * n ... n ... | n - 1 ... + * ^^^^^ blocks we're writing + */ + distance = max_distance - (log->l_logBBsize - head_block); + error = xlog_write_log_records(log, head_cycle, 0, distance, + tail_cycle, tail_block); + if (error) + return error; + } + + return 0; +} + +/****************************************************************************** + * + * Log recover routines + * + ****************************************************************************** + */ + +STATIC xlog_recover_t * +xlog_recover_find_tid( + struct hlist_head *head, + xlog_tid_t tid) +{ + xlog_recover_t *trans; + struct hlist_node *n; + + hlist_for_each_entry(trans, n, head, r_list) { + if (trans->r_log_tid == tid) + return trans; + } + return NULL; +} + +STATIC void +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) +{ + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); +} + +STATIC void +xlog_recover_add_item( + struct list_head *head) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); +} + +STATIC int +xlog_recover_add_to_cont_trans( + struct log *log, + xlog_recover_t *trans, + xfs_caddr_t dp, + int len) +{ + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + if (list_empty(&trans->r_itemq)) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); /* d, s, l */ + return 0; + } + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + memcpy(&ptr[old_len], dp, len); /* d, s, l */ + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. 
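+ *
+ * (For an inode log item, for instance, those first 32 bits are the
+ * ilf_type and ilf_size fields of the xfs_inode_log_format structure;
+ * ilf_size is what gets copied into ri_total below.)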
+ */ +STATIC int +xlog_recover_add_to_trans( + struct log *log, + xlog_recover_t *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); /* d, s, l */ + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ + xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); + } + + if (item->ri_total == 0) { /* first region to be added */ + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); + return 0; +} + +/* + * Sort the log items in the transaction. Cancelled buffers need + * to be put first so they are processed before any items that might + * modify the buffers. If they are cancelled, then the modifications + * don't need to be replayed. + */ +STATIC int +xlog_recover_reorder_trans( + struct log *log, + xlog_recover_t *trans, + int pass) +{ + xlog_recover_item_t *item, *n; + LIST_HEAD(sort_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &trans->r_itemq); + break; + } + case XFS_LI_INODE: + case XFS_LI_DQUOT: + case XFS_LI_QUOTAOFF: + case XFS_LI_EFD: + case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &trans->r_itemq); + break; + default: + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } + } + ASSERT(list_empty(&sort_list)); + return 0; +} + +/* + * Build up the table of buf cancel records so that we don't replay + * cancelled data in the second pass. For buffer records that are + * not cancel records, there is nothing to do here so we just return. + * + * If we get a cancel record which is already in the table, this indicates + * that the buffer was cancelled multiple times. In order to ensure + * that during pass 2 we keep the record in the table until we reach its + * last occurrence in the log, we keep a reference count in the cancel + * record in the table to tell us how many times we expect to see this + * record during the second pass. 
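+ *
+ * For example, if the same buffer is cancelled twice in the log,
+ * pass 1 leaves one record for it with bc_refcount == 2, and pass 2
+ * only removes the record after seeing the second cancel item.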
+ */ +STATIC int +xlog_recover_buffer_pass1( + struct log *log, + xlog_recover_item_t *item) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + /* + * If this isn't a cancel buffer item, then just return. + */ + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); + return 0; + } + + /* + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. + */ + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); + return 0; + } + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; + bcp->bc_refcount = 1; + list_add_tail(&bcp->bc_list, bucket); + + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; +} + +/* + * Check to see whether the buffer being recovered has a corresponding + * entry in the buffer cancel record table. If it does then return 1 + * so that it will be cancelled, otherwise return 0. If the buffer is + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement + * the refcount on the entry in the table and remove it from the table + * if this is the last reference. + * + * We remove the cancel record from the table when we encounter its + * last occurrence in the log so that if the same buffer is re-used + * again after its last cancellation we actually replay the changes + * made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct log *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (log->l_buf_cancel_table == NULL) { + /* + * There is nothing in the table built in pass one, + * so this buffer must not be cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return 0; + } + + /* + * Search for an entry in the cancel table that matches our buffer. + */ + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + goto found; + } + + /* + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return 0; + +found: + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. 
In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + + inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) + return 0; + + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) + continue; + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; + if (unlikely(*logged_nextp == 0)) { + xfs_alert(mp, + "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Trying to replay bad (0) inode di_next_unlinked field.", + item, bp); + XFS_ERROR_REPORT("xlog_recover_do_inode_buf", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, + next_unlinked_offset); + *buffer_nextp = *logged_nextp; + } + + return 0; +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. 
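+ *
+ * (Each map bit covers one XFS_BLF_CHUNK - normally 128 bytes - of
+ * the buffer, so, for illustration, bits 2 and 3 set in blf_data_map
+ * describe a 256 byte logged region starting at byte offset 256.)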
+ */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int bit; + int nbits; + int error; + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(XFS_BUF_COUNT(bp) >= + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXXThis is + * probably a good thing to do for other buf types also. + */ + error = 0; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, + -1, 0, XFS_QMOPT_DOWARN, + "dquot_buf_recover"); + if (error) + goto next; + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<<XFS_BLF_SHIFT); /* length */ + next: + i++; + bit += nbits; + } + + /* Shouldn't be any more regions */ + ASSERT(i == item->ri_total); +} +
+/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_qm_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks.
+ */ + if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) >= + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) >= + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) >= + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + return errs; +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + */ +STATIC void +xlog_recover_do_dquot_buffer( + xfs_mount_t *mp, + xlog_t *log, + xlog_recover_item_t *item, + xfs_buf_t *bp, + xfs_buf_log_format_t *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) { + return; + } + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQ_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. 
Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_buffer_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + int error; + uint buf_flags; + + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = XBF_LOCK; + if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) + buf_flags |= XBF_MAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags); + if (XFS_BUF_ISERROR(bp)) { + xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, + bp, buf_f->blf_blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } + + error = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + } + if (error) + return XFS_ERROR(error); + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) + * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. 
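+ *
+ * (For illustration: with a 4k block size and an 8k inode cluster,
+ * only 8k inode buffers may stay cached; a 4k inode buffer replayed
+ * from an older log is written out synchronously and marked stale
+ * below rather than being delay-written.)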
+ */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + XFS_BUF_STALE(bp); + error = xfs_bwrite(mp, bp); + } else { + ASSERT(bp->b_target->bt_mount == mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + } + + return (error); +} + +STATIC int +xlog_recover_inode_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_inode_log_format_t *in_f; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + xfs_dinode_t *dip; + int len; + xfs_caddr_t src; + xfs_caddr_t dest; + int error; + int attr_index; + uint fields; + xfs_icdinode_t *dicp; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. + */ + if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, + in_f->ilf_len, 0)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + XBF_LOCK); + if (XFS_BUF_ISERROR(bp)) { + xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, + bp, in_f->ilf_blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + goto error; + } + error = 0; + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! 
+ */ + if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", + XFS_ERRLEVEL_LOW, mp); + error = EFSCORRUPTED; + goto error; + } + dicp = item->ri_buf[1].i_addr; + if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", + XFS_ERRLEVEL_LOW, mp); + error = EFSCORRUPTED; + goto error; + } + + /* Skip replay when the on disk inode is newer than the log one */ + if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto error; + } + } + /* Take the opportunity to reset the flush iteration count */ + dicp->di_flushiter = 0; + + if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = EFSCORRUPTED; + goto error; + } + } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE) && + (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = EFSCORRUPTED; + goto error; + } + } + if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + dicp->di_nextents + dicp->di_anextents, + dicp->di_nblocks); + error = EFSCORRUPTED; + goto error; + } + if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + error = EFSCORRUPTED; + goto error; + } + if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); + error = EFSCORRUPTED; + goto error; + } + + /* The core is in in-core format */ + xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { + 
memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), + item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), + item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + } + + fields = in_f->ilf_fields; + switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { + case XFS_ILOG_DEV: + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + break; + case XFS_ILOG_UUID: + memcpy(XFS_DFORK_DPTR(dip), + &in_f->ilf_u.ilfu_uuid, + sizeof(uuid_t)); + break; + } + + if (in_f->ilf_size == 2) + goto write_inode_buffer; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (xfs_bmdr_block_t*)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + xfs_buf_relse(bp); + error = EIO; + goto error; + } + } + +write_inode_buffer: + ASSERT(bp->b_target->bt_mount == mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); +error: + if (need_free) + kmem_free(in_f); + return XFS_ERROR(error); +} + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. + */ +STATIC int +xlog_recover_quotaoff_pass1( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return (0); +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + struct xfs_disk_dquot *ddq, *recddq; + int error; + xfs_dq_logformat_t *dq_f; + uint type; + + + /* + * Filesystems are required to send in quota flags at mount time. 
+ */ + if (mp->m_qflags == 0) + return (0); + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return XFS_ERROR(EIO); + } + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return XFS_ERROR(EIO); + } + + /* + * This type of quotas was turned off, so ignore this record. + */ + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return (0); + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) + return XFS_ERROR(EIO); + ASSERT(dq_f->qlf_len == 1); + + error = xfs_read_buf(mp, mp->m_ddev_targp, + dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), + 0, &bp); + if (error) { + xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, + bp, dq_f->qlf_blkno); + return error; + } + ASSERT(bp); + ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * At least the magic num portion should be on disk because this + * was among a chunk of dquots created earlier, and we did some + * minimal initialization then. + */ + error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2"); + if (error) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_target->bt_mount == mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + + return (0); +} + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_efi_pass2( + xlog_t *log, + xlog_recover_item_t *item, + xfs_lsn_t lsn) +{ + int error; + xfs_mount_t *mp = log->l_mp; + xfs_efi_log_item_t *efip; + xfs_efi_log_format_t *efi_formatp; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), + &(efip->efi_format)))) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + return 0; +} + + +/* + * This routine is called when an efd format structure is found in + * a committed transaction in the log. It's purpose is to cancel + * the corresponding efi if it was still in the log. To do this + * it searches the AIL for the efi with an id equal to that in the + * efd format structure. If we find it, we remove the efi from the + * AIL and free it. 
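+ *
+ * (In other words, an efi whose matching efd made it into the log
+ * before the crash is dropped from the AIL here, while an efi with
+ * no efd survives and is finished later by
+ * xlog_recover_process_efis().)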
+ */ +STATIC int +xlog_recover_efd_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_efd_log_format_t *efd_formatp; + xfs_efi_log_item_t *efip = NULL; + xfs_log_item_t *lip; + __uint64_t efi_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + efi_id = efd_formatp->efd_efi_id; + + /* + * Search for the efi with the id in the efd format structure + * in the AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_EFI) { + efip = (xfs_efi_log_item_t *)lip; + if (efip->efi_format.efi_id == efi_id) { + /* + * xfs_trans_ail_delete() drops the + * AIL lock. + */ + xfs_trans_ail_delete(ailp, lip); + xfs_efi_item_free(efip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * Free up any resources allocated by the transaction + * + * Remember that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + struct xlog_recover *trans) +{ + xlog_recover_item_t *item, *n; + int i; + + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); + /* Free the item itself */ + kmem_free(item->ri_buf); + kmem_free(item); + } + /* Free the transaction recover structure */ + kmem_free(trans); +} + +STATIC int +xlog_recover_commit_pass1( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, item); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, item); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. 
+ */ +STATIC int +xlog_recover_commit_trans( + struct log *log, + struct xlog_recover *trans, + int pass) +{ + int error = 0; + xlog_recover_item_t *item; + + hlist_del(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) + return error; + + list_for_each_entry(item, &trans->r_itemq, ri_list) { + if (pass == XLOG_RECOVER_PASS1) + error = xlog_recover_commit_pass1(log, trans, item); + else + error = xlog_recover_commit_pass2(log, trans, item); + if (error) + return error; + } + + xlog_recover_free_trans(trans); + return 0; +} + +STATIC int +xlog_recover_unmount_trans( + struct log *log, + xlog_recover_t *trans) +{ + /* Do nothing now */ + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + return 0; +} + +/* + * There are two valid states of the r_state field. 0 indicates that the + * transaction structure is in a normal state. We have either seen the + * start of the transaction or the last operation we added was not a partial + * operation. If the last operation we added to the transaction was a + * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. + * + * NOTE: skip LRs with 0 data length. + */ +STATIC int +xlog_recover_process_data( + xlog_t *log, + struct hlist_head rhash[], + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + int pass) +{ + xfs_caddr_t lp; + int num_logops; + xlog_op_header_t *ohead; + xlog_recover_t *trans; + xlog_tid_t tid; + int error; + unsigned long hash; + uint flags; + + lp = dp + be32_to_cpu(rhead->h_len); + num_logops = be32_to_cpu(rhead->h_num_logops); + + /* check the log format matches our own - else we can't recover */ + if (xlog_header_check_recover(log->l_mp, rhead)) + return (XFS_ERROR(EIO)); + + while ((dp < lp) && num_logops) { + ASSERT(dp + sizeof(xlog_op_header_t) <= lp); + ohead = (xlog_op_header_t *)dp; + dp += sizeof(xlog_op_header_t); + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); + ASSERT(0); + return (XFS_ERROR(EIO)); + } + tid = be32_to_cpu(ohead->oh_tid); + hash = XLOG_RHASH(tid); + trans = xlog_recover_find_tid(&rhash[hash], tid); + if (trans == NULL) { /* not found; add new tid */ + if (ohead->oh_flags & XLOG_START_TRANS) + xlog_recover_new_tid(&rhash[hash], tid, + be64_to_cpu(rhead->h_lsn)); + } else { + if (dp + be32_to_cpu(ohead->oh_len) > lp) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", + __func__, be32_to_cpu(ohead->oh_len)); + WARN_ON(1); + return (XFS_ERROR(EIO)); + } + flags = ohead->oh_flags & ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + switch (flags) { + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, + trans, pass); + break; + case XLOG_UNMOUNT_TRANS: + error = xlog_recover_unmount_trans(log, trans); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); + break; + case XLOG_START_TRANS: + xfs_warn(log->l_mp, "%s: bad transaction", + __func__); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(log, trans, + dp, be32_to_cpu(ohead->oh_len)); + break; + default: + xfs_warn(log->l_mp, "%s: bad flag 0x%x", + __func__, flags); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + } + if (error) + return error; + } + dp += be32_to_cpu(ohead->oh_len); + num_logops--; + } + return 0; +} + +/* + * Process an extent free intent item that was recovered from + * the log. 
We need to free the extents that it describes. + */ +STATIC int +xlog_recover_process_efi( + xfs_mount_t *mp, + xfs_efi_log_item_t *efip) +{ + xfs_efd_log_item_t *efdp; + xfs_trans_t *tp; + int i; + int error = 0; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + + /* + * First check the validity of the extents described by the + * EFI. If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if ((startblock_fsb == 0) || + (extp->ext_len == 0) || + (startblock_fsb >= mp->m_sb.sb_dblocks) || + (extp->ext_len >= mp->m_sb.sb_agblocks)) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + xfs_efi_release(efip, efip->efi_format.efi_nextents); + return XFS_ERROR(EIO); + } + } + + tp = xfs_trans_alloc(mp, 0); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + if (error) + goto abort_error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; + xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, + extp->ext_len); + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp, 0); + return error; + +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +/* + * When this is called, all of the EFIs which did not have + * corresponding EFDs should be in the AIL. What we do now + * is free the extents associated with each one. + * + * Since we process the EFIs in normal transactions, they + * will be removed at some point after the commit. This prevents + * us from just walking down the list processing each one. + * We'll use a flag in the EFI to skip those that we've already + * processed and use the AIL iteration mechanism's generation + * count to try to speed this up at least a bit. + * + * When we start, we know that the EFIs are the only things in + * the AIL. As we process them, however, other items are added + * to the AIL. Since everything added to the AIL must come after + * everything already in the AIL, we stop processing as soon as + * we see something other than an EFI in the AIL. + */ +STATIC int +xlog_recover_process_efis( + xlog_t *log) +{ + xfs_log_item_t *lip; + xfs_efi_log_item_t *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + /* + * Skip EFIs that we've already processed. 
+ */ + efip = (xfs_efi_log_item_t *)lip; + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { + lip = xfs_trans_ail_cursor_next(ailp, &cur); + continue; + } + + spin_unlock(&ailp->xa_lock); + error = xlog_recover_process_efi(log->l_mp, efip); + spin_lock(&ailp->xa_lock); + if (error) + goto out; + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } +out: + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); + return error; +} + +/* + * This routine performs a transaction to null out a bad inode pointer + * in an agi unlinked inode hash bucket. + */ +STATIC void +xlog_recover_clear_agi_bucket( + xfs_mount_t *mp, + xfs_agnumber_t agno, + int bucket) +{ + xfs_trans_t *tp; + xfs_agi_t *agi; + xfs_buf_t *agibp; + int offset; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); + error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), + 0, 0, 0); + if (error) + goto out_abort; + + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + goto out_abort; + + agi = XFS_BUF_TO_AGI(agibp); + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + + error = xfs_trans_commit(tp, 0); + if (error) + goto out_error; + return; + +out_abort: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); +out_error: + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + return; +} + +STATIC xfs_agino_t +xlog_recover_process_one_iunlink( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino, + int bucket) +{ + struct xfs_buf *ibp; + struct xfs_dinode *dip; + struct xfs_inode *ip; + xfs_ino_t ino; + int error; + + ino = XFS_AGINO_TO_INO(mp, agno, agino); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + goto fail; + + /* + * Get the on disk inode to find the next inode in the bucket. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + if (error) + goto fail_iput; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + /* setup for the next pass */ + agino = be32_to_cpu(dip->di_next_unlinked); + xfs_buf_relse(ibp); + + /* + * Prevent any DMAPI event from being sent when the reference on + * the inode is dropped. + */ + ip->i_d.di_dmevmask = 0; + + IRELE(ip); + return agino; + + fail_iput: + IRELE(ip); + fail: + /* + * We can't read in the inode this bucket points to, or this inode + * is messed up. Just ditch this bucket of inodes. We will lose + * some inodes and space, but at least we won't hang. + * + * Call xlog_recover_clear_agi_bucket() to perform a transaction to + * clear the inode pointer in the bucket. + */ + xlog_recover_clear_agi_bucket(mp, agno, bucket); + return NULLAGINO; +} + +/* + * xlog_iunlink_recover + * + * This is called during recovery to process any inodes which + * we unlinked but not freed when the system crashed. These + * inodes will be on the lists in the AGI blocks. What we do + * here is scan all the AGIs and fully truncate and free any + * inodes found on the lists. Each inode is removed from the + * lists when it has been fully truncated and is freed. The + * freeing of the inode and its removal from the list must be + * atomic. 
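+ *
+ * (Each AGI carries XFS_AGI_UNLINKED_BUCKETS list heads; every bucket
+ * is a singly linked list of unlinked inodes chained through the
+ * on-disk di_next_unlinked field, which is why the loop below has to
+ * read each inode's buffer to find the next entry.)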
+ */ +STATIC void +xlog_recover_process_iunlinks( + xlog_t *log) +{ + xfs_mount_t *mp; + xfs_agnumber_t agno; + xfs_agi_t *agi; + xfs_buf_t *agibp; + xfs_agino_t agino; + int bucket; + int error; + uint mp_dmevmask; + + mp = log->l_mp; + + /* + * Prevent any DMAPI event from being sent while in this function. + */ + mp_dmevmask = mp->m_dmevmask; + mp->m_dmevmask = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + /* + * Find the agi for this ag. + */ + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt + * after we've recovered all the ag's we can.... + */ + continue; + } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ + agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + agino = xlog_recover_process_one_iunlink(mp, + agno, agino, bucket); + } + } + xfs_buf_rele(agibp); + } + + mp->m_dmevmask = mp_dmevmask; +} + + +#ifdef DEBUG +STATIC void +xlog_pack_data_checksum( + xlog_t *log, + xlog_in_core_t *iclog, + int size) +{ + int i; + __be32 *up; + uint chksum = 0; + + up = (__be32 *)iclog->ic_datap; + /* divide length by 4 to get # words */ + for (i = 0; i < (size >> 2); i++) { + chksum ^= be32_to_cpu(*up); + up++; + } + iclog->ic_header.h_chksum = cpu_to_be32(chksum); +} +#else +#define xlog_pack_data_checksum(log, iclog, size) +#endif + +/* + * Stamp cycle number in every block + */ +void +xlog_pack_data( + xlog_t *log, + xlog_in_core_t *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + xlog_pack_data_checksum(log, iclog, size); + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) { + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } + } +} + +STATIC void +xlog_unpack_data( + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + xlog_t *log) +{ + int i, j, k; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } +} + +STATIC int +xlog_valid_rec_header( + xlog_t *log, + 
xlog_rec_header_t *rhead, + xfs_daddr_t blkno) +{ + int hlen; + + if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { + XFS_ERROR_REPORT("xlog_valid_rec_header(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely( + (!rhead->h_version || + (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { + xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", + __func__, be32_to_cpu(rhead->h_version)); + return XFS_ERROR(EIO); + } + + /* LR body must have data or it wouldn't have been written */ + hlen = be32_to_cpu(rhead->h_len); + if (unlikely( hlen <= 0 || hlen > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(3)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Read the log from tail to head and process the log records found. + * Handle the two cases where the tail and head are in the same cycle + * and where the active portion of the log wraps around the end of + * the physical log separately. The pass parameter is passed through + * to the routines called to process the data and is not looked at + * here. + */ +STATIC int +xlog_do_recovery_pass( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int pass) +{ + xlog_rec_header_t *rhead; + xfs_daddr_t blk_no; + xfs_caddr_t offset; + xfs_buf_t *hbp, *dbp; + int error = 0, h_size; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; + struct hlist_head rhash[XLOG_RHASH_SIZE]; + + ASSERT(head_blk != tail_blk); + + /* + * Read the header of the tail block and get the iclog buffer size from + * h_size. Use this to tell how many sectors make up the log header. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + /* + * When using variable length iclogs, read first sector of + * iclog header and extract the header size from it. Get a + * new hbp that is the correct size. 
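+ *
+ * (For example, with a v2 log and an h_size of 64k, the record
+ * header spans two basic blocks (hblks = 2), so the single block
+ * buffer read first is released and a two block one allocated in
+ * its place.)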
+ */ + hbp = xlog_get_bp(log, 1); + if (!hbp) + return ENOMEM; + + error = xlog_bread(log, tail_blk, 1, hbp, &offset); + if (error) + goto bread_err1; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, tail_blk); + if (error) + goto bread_err1; + h_size = be32_to_cpu(rhead->h_size); + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + xlog_put_bp(hbp); + hbp = xlog_get_bp(log, hblks); + } else { + hblks = 1; + } + } else { + ASSERT(log->l_sectBBsize == 1); + hblks = 1; + hbp = xlog_get_bp(log, 1); + h_size = XLOG_BIG_RECORD_BSIZE; + } + + if (!hbp) + return ENOMEM; + dbp = xlog_get_bp(log, BTOBB(h_size)); + if (!dbp) { + xlog_put_bp(hbp); + return ENOMEM; + } + + memset(rhash, 0, sizeof(rhash)); + if (tail_blk <= head_blk) { + for (blk_no = tail_blk; blk_no < head_blk; ) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + + /* blocks in data section */ + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no + hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; + + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, + rhash, rhead, offset, pass))) + goto bread_err2; + blk_no += bblks + hblks; + } + } else { + /* + * Perform recovery around the end of the physical log. + * When the head is not on the same cycle number as the tail, + * we can't do a sequential recovery as above. + */ + blk_no = tail_blk; + while (blk_no < log->l_logBBsize) { + /* + * Check for header wrapping around physical end-of-log + */ + offset = XFS_BUF_PTR(hbp); + split_hblks = 0; + wrapped_hblks = 0; + if (blk_no + hblks <= log->l_logBBsize) { + /* Read header in one read */ + error = xlog_bread(log, blk_no, hblks, hbp, + &offset); + if (error) + goto bread_err2; + } else { + /* This LR is split across physical log end */ + if (blk_no != log->l_logBBsize) { + /* some data before physical log end */ + ASSERT(blk_no <= INT_MAX); + split_hblks = log->l_logBBsize - (int)blk_no; + ASSERT(split_hblks > 0); + error = xlog_bread(log, blk_no, + split_hblks, hbp, + &offset); + if (error) + goto bread_err2; + } + + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + wrapped_hblks = hblks - split_hblks; + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); + if (error) + goto bread_err2; + } + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, + split_hblks ? 
blk_no : 0); + if (error) + goto bread_err2; + + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + blk_no += hblks; + + /* Read in data for log record */ + if (blk_no + bblks <= log->l_logBBsize) { + error = xlog_bread(log, blk_no, bblks, dbp, + &offset); + if (error) + goto bread_err2; + } else { + /* This log record is split across the + * physical end of log */ + offset = XFS_BUF_PTR(dbp); + split_bblks = 0; + if (blk_no != log->l_logBBsize) { + /* some data is before the physical + * end of log */ + ASSERT(!wrapped_hblks); + ASSERT(blk_no <= INT_MAX); + split_bblks = + log->l_logBBsize - (int)blk_no; + ASSERT(split_bblks > 0); + error = xlog_bread(log, blk_no, + split_bblks, dbp, + &offset); + if (error) + goto bread_err2; + } + + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + error = xlog_bread_offset(log, 0, + bblks - split_bblks, hbp, + offset + BBTOB(split_bblks)); + if (error) + goto bread_err2; + } + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, + rhead, offset, pass))) + goto bread_err2; + blk_no += bblks; + } + + ASSERT(blk_no >= log->l_logBBsize); + blk_no -= log->l_logBBsize; + + /* read first part of physical log */ + while (blk_no < head_blk) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; + + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, + rhead, offset, pass))) + goto bread_err2; + blk_no += bblks + hblks; + } + } + + bread_err2: + xlog_put_bp(dbp); + bread_err1: + xlog_put_bp(hbp); + return error; +} + +/* + * Do the recovery of the log. We actually do this in two phases. + * The two passes are necessary in order to implement the function + * of cancelling a record written into the log. The first pass + * determines those things which have been cancelled, and the + * second pass replays log items normally except for those which + * have been cancelled. The handling of the replay and cancellations + * takes place in the log item type specific routines. + * + * The table of items which have cancel records in the log is allocated + * and freed at this level, since only here do we know when all of + * the log recovery has been completed. + */ +STATIC int +xlog_do_log_recovery( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error, i; + + ASSERT(head_blk != tail_blk); + + /* + * First do a pass to find all of the cancelled buf log items. + * Store them in the buf_cancel_table for use in the second pass. 
+ */ + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), + KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS1); + if (error != 0) { + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; + return error; + } + /* + * Then do a second pass to actually recover the items in the log. + * When it is complete free the table of buf cancel items. + */ + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS2); +#ifdef DEBUG + if (!error) { + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); + } +#endif /* DEBUG */ + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; + + return error; +} + +/* + * Do the actual recovery + */ +STATIC int +xlog_do_recover( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error; + xfs_buf_t *bp; + xfs_sb_t *sbp; + + /* + * First replay the images in the log. + */ + error = xlog_do_log_recovery(log, head_blk, tail_blk); + if (error) { + return error; + } + + XFS_bflush(log->l_mp->m_ddev_targp); + + /* + * If IO errors happened during recovery, bail out. + */ + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + return (EIO); + } + + /* + * We now update the tail_lsn since much of the recovery has completed + * and there may be space available to use. If there were no extent + * or iunlinks, we can free up the entire log and set the tail_lsn to + * be the last_sync_lsn. This was set in xlog_find_tail to be the + * lsn of the last known good LR on disk. If there are extent frees + * or iunlinks they will have some entries in the AIL; so we look at + * the AIL to determine how to set the tail_lsn. + */ + xlog_assign_tail_lsn(log->l_mp); + + /* + * Now that we've finished replaying all buffer and inode + * updates, re-read in the superblock. + */ + bp = xfs_getsb(log->l_mp, 0); + XFS_BUF_UNDONE(bp); + ASSERT(!(XFS_BUF_ISWRITE(bp))); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + XFS_BUF_READ(bp); + XFS_BUF_UNASYNC(bp); + xfsbdstrat(log->l_mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xlog_do_recover", + log->l_mp, bp, XFS_BUF_ADDR(bp)); + ASSERT(0); + xfs_buf_relse(bp); + return error; + } + + /* Convert superblock from on-disk format */ + sbp = &log->l_mp->m_sb; + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); + ASSERT(xfs_sb_good_version(sbp)); + xfs_buf_relse(bp); + + /* We've re-read the superblock so re-initialize per-cpu counters */ + xfs_icsb_reinit_counters(log->l_mp); + + xlog_recover_check_summary(log); + + /* Normal transactions can now occur */ + log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + return 0; +} + +/* + * Perform recovery and re-initialize some log variables in xlog_find_tail. + * + * Return error or zero. + */ +int +xlog_recover( + xlog_t *log) +{ + xfs_daddr_t head_blk, tail_blk; + int error; + + /* find the tail of the log */ + if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) + return error; + + if (tail_blk != head_blk) { + /* There used to be a comment here: + * + * disallow recovery on read-only mounts. note -- mount + * checks for ENOSPC and turns it into an intelligent + * error message. + * ...but this is no longer true. Now, unless you specify + * NORECOVERY (in which case this function would never be + * called), we just go ahead and recover. 
We do this all + * under the vfs layer, so we can get away with it unless + * the device itself is read-only, in which case we fail. + */ + if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { + return error; + } + + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); + + error = xlog_do_recover(log, head_blk, tail_blk); + log->l_flags |= XLOG_RECOVERY_NEEDED; + } + return error; +} + +/* + * In the first part of recovery we replay inodes and buffers and build + * up the list of extent free items which need to be processed. Here + * we process the extent free items and clean up the on disk unlinked + * inode lists. This is separated from the first part of recovery so + * that the root and real-time bitmap inodes can be read in from disk in + * between the two stages. This is necessary so that we can free space + * in the real-time portion of the file system. + */ +int +xlog_recover_finish( + xlog_t *log) +{ + /* + * Now we're ready to do the transactions needed for the + * rest of recovery. Start with completing all the extent + * free intent records and then process the unlinked inode + * lists. At this point, we essentially run in normal mode + * except that we're still performing recovery actions + * rather than accepting new requests. + */ + if (log->l_flags & XLOG_RECOVERY_NEEDED) { + int error; + error = xlog_recover_process_efis(log); + if (error) { + xfs_alert(log->l_mp, "Failed to recover EFIs"); + return error; + } + /* + * Sync the log to get all the EFIs out of the AIL. + * This isn't absolutely necessary, but it helps in + * case the unlink transactions would have problems + * pushing the EFIs out of the way. + */ + xfs_log_force(log->l_mp, XFS_LOG_SYNC); + + xlog_recover_process_iunlinks(log); + + xlog_recover_check_summary(log); + + xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); + log->l_flags &= ~XLOG_RECOVERY_NEEDED; + } else { + xfs_info(log->l_mp, "Ending clean mount"); + } + return 0; +} + + +#if defined(DEBUG) +/* + * Read all of the agf and agi counters and check that they + * are consistent with the superblock counters. + */ +void +xlog_recover_check_summary( + xlog_t *log) +{ + xfs_mount_t *mp; + xfs_agf_t *agfp; + xfs_buf_t *agfbp; + xfs_buf_t *agibp; + xfs_agnumber_t agno; + __uint64_t freeblks; + __uint64_t itotal; + __uint64_t ifree; + int error; + + mp = log->l_mp; + + freeblks = 0LL; + itotal = 0LL; + ifree = 0LL; + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + if (error) { + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); + } else { + agfp = XFS_BUF_TO_AGF(agfbp); + freeblks += be32_to_cpu(agfp->agf_freeblks) + + be32_to_cpu(agfp->agf_flcount); + xfs_buf_relse(agfbp); + } + + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + + itotal += be32_to_cpu(agi->agi_count); + ifree += be32_to_cpu(agi->agi_freecount); + xfs_buf_relse(agibp); + } + } +} +#endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h new file mode 100644 index 00000000..1c55ccbb --- /dev/null +++ b/fs/xfs/xfs_log_recover.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_RECOVER_H__ +#define __XFS_LOG_RECOVER_H__ + +/* + * Macros, structures, prototypes for internal log manager use. + */ + +#define XLOG_RHASH_BITS 4 +#define XLOG_RHASH_SIZE 16 +#define XLOG_RHASH_SHIFT 2 +#define XLOG_RHASH(tid) \ + ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) + + +/* + * item headers are in ri_buf[0]. Additional buffers follow. + */ +typedef struct xlog_recover_item { + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ +} xlog_recover_item_t; + +struct xlog_tid; +typedef struct xlog_recover { + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ +} xlog_recover_t; + +#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) + +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_RECOVER_PASS1 1 +#define XLOG_RECOVER_PASS2 2 + +#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c new file mode 100644 index 00000000..9afdd497 --- /dev/null +++ b/fs/xfs/xfs_mount.c @@ -0,0 +1,2585 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_quota.h" +#include "xfs_fsops.h" +#include "xfs_utils.h" +#include "xfs_trace.h" + + +#ifdef HAVE_PERCPU_SB +STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); +#else + +#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) +#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) +#endif + +static const struct { + short offset; + short type; /* 0 = integer + * 1 = binary / string (no translation) + */ +} xfs_sb_info[] = { + { offsetof(xfs_sb_t, sb_magicnum), 0 }, + { offsetof(xfs_sb_t, sb_blocksize), 0 }, + { offsetof(xfs_sb_t, sb_dblocks), 0 }, + { offsetof(xfs_sb_t, sb_rblocks), 0 }, + { offsetof(xfs_sb_t, sb_rextents), 0 }, + { offsetof(xfs_sb_t, sb_uuid), 1 }, + { offsetof(xfs_sb_t, sb_logstart), 0 }, + { offsetof(xfs_sb_t, sb_rootino), 0 }, + { offsetof(xfs_sb_t, sb_rbmino), 0 }, + { offsetof(xfs_sb_t, sb_rsumino), 0 }, + { offsetof(xfs_sb_t, sb_rextsize), 0 }, + { offsetof(xfs_sb_t, sb_agblocks), 0 }, + { offsetof(xfs_sb_t, sb_agcount), 0 }, + { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, + { offsetof(xfs_sb_t, sb_logblocks), 0 }, + { offsetof(xfs_sb_t, sb_versionnum), 0 }, + { offsetof(xfs_sb_t, sb_sectsize), 0 }, + { offsetof(xfs_sb_t, sb_inodesize), 0 }, + { offsetof(xfs_sb_t, sb_inopblock), 0 }, + { offsetof(xfs_sb_t, sb_fname[0]), 1 }, + { offsetof(xfs_sb_t, sb_blocklog), 0 }, + { offsetof(xfs_sb_t, sb_sectlog), 0 }, + { offsetof(xfs_sb_t, sb_inodelog), 0 }, + { offsetof(xfs_sb_t, sb_inopblog), 0 }, + { offsetof(xfs_sb_t, sb_agblklog), 0 }, + { offsetof(xfs_sb_t, sb_rextslog), 0 }, + { offsetof(xfs_sb_t, sb_inprogress), 0 }, + { offsetof(xfs_sb_t, sb_imax_pct), 0 }, + { offsetof(xfs_sb_t, sb_icount), 0 }, + { offsetof(xfs_sb_t, sb_ifree), 0 }, + { offsetof(xfs_sb_t, sb_fdblocks), 0 }, + { offsetof(xfs_sb_t, sb_frextents), 0 }, + { offsetof(xfs_sb_t, sb_uquotino), 0 }, + { offsetof(xfs_sb_t, sb_gquotino), 0 }, + { offsetof(xfs_sb_t, sb_qflags), 0 }, + { offsetof(xfs_sb_t, sb_flags), 0 }, + { offsetof(xfs_sb_t, sb_shared_vn), 0 }, + { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, + { offsetof(xfs_sb_t, sb_unit), 0 }, + { offsetof(xfs_sb_t, sb_width), 0 }, + { offsetof(xfs_sb_t, sb_dirblklog), 0 }, + { offsetof(xfs_sb_t, sb_logsectlog), 0 }, + { offsetof(xfs_sb_t, sb_logsectsize),0 }, + { offsetof(xfs_sb_t, sb_logsunit), 0 }, + { offsetof(xfs_sb_t, sb_features2), 0 }, + { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { sizeof(xfs_sb_t), 0 } +}; + +static DEFINE_MUTEX(xfs_uuid_table_mutex); +static int xfs_uuid_table_size; +static uuid_t *xfs_uuid_table; + +/* + * See if the UUID is unique among mounted XFS filesystems. 
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted. + */ +STATIC int +xfs_uuid_mount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int hole, i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return 0; + + if (uuid_is_nil(uuid)) { + xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + return XFS_ERROR(EINVAL); + } + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &xfs_uuid_table[i])) + goto out_duplicate; + } + + if (hole < 0) { + xfs_uuid_table = kmem_realloc(xfs_uuid_table, + (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), + xfs_uuid_table_size * sizeof(*xfs_uuid_table), + KM_SLEEP); + hole = xfs_uuid_table_size++; + } + xfs_uuid_table[hole] = *uuid; + mutex_unlock(&xfs_uuid_table_mutex); + + return 0; + + out_duplicate: + mutex_unlock(&xfs_uuid_table_mutex); + xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); + return XFS_ERROR(EINVAL); +} + +STATIC void +xfs_uuid_unmount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return; + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) + continue; + if (!uuid_equal(uuid, &xfs_uuid_table[i])) + continue; + memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); + break; + } + ASSERT(i < xfs_uuid_table_size); + mutex_unlock(&xfs_uuid_table_mutex); +} + + +/* + * Reference counting access wrappers to the perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put(struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. 
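A typical caller of the reference-counted per-AG lookup above follows the get/put shape sketched here. It is modelled on loops that appear later in this file (for example xfs_initialize_perag_data); mp is assumed to be the struct xfs_mount in question and the loop body is left as a placeholder.

	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);		/* takes a reference */
		if (!pag)
			continue;			/* AG not initialised yet */
		/* ... read pag->pagf_freeblks, pag->pagi_count, ... */
		xfs_perag_put(pag);			/* drops the reference */
	}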
+ */ +STATIC void +xfs_free_perag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); + +#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return EFBIG; +#else /* Limited by UINT_MAX of sectors */ + if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) + return EFBIG; +#endif + return 0; +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + int flags) +{ + int loud = !(flags & XFS_MFSI_QUIET); + + /* + * If the log device and data device have the + * same device number, the log is internal. + * Consequently, the sb_logstart should be non-zero. If + * we have a zero sb_logstart in this case, we may be trying to mount + * a volume filesystem in a non-volume manner. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "bad magic number"); + return XFS_ERROR(EWRONGFS); + } + + if (!xfs_sb_good_version(sbp)) { + if (loud) + xfs_warn(mp, "bad version"); + return XFS_ERROR(EWRONGFS); + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + if (loud) + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + if (loud) + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + /* + * More sanity checking. These were stolen directly from + * xfs_repair. 
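The size/log pairs that the sanity checks below insist on are plain power-of-two identities. A worked example with common mkfs defaults (the numbers are assumed, not taken from this patch):

	sb_blocksize = 4096  ->  sb_blocklog = 12   (4096 == 1 << 12)
	sb_inodesize = 256   ->  sb_inodelog = 8    (256  == 1 << 8)
	sb_sectsize  = 512   ->  sb_sectlog  = 9    (512  == 1 << 9)
	sb_inopblock = 4096 / 256 = 16, and sb_inopblog = sb_blocklog - sb_inodelog = 4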
+ */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { + if (loud) + xfs_warn(mp, "SB sanity check 1 failed"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Sanity check AG count, size fields against data size field + */ + if (unlikely( + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > + (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || + sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * + sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { + if (loud) + xfs_warn(mp, "SB sanity check 2 failed"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + if (loud) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + } + return XFS_ERROR(ENOSYS); + } + + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + if (loud) + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return XFS_ERROR(ENOSYS); + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + if (loud) + xfs_warn(mp, + "file system too large to be mounted on this system."); + return XFS_ERROR(EFBIG); + } + + if (unlikely(sbp->sb_inprogress)) { + if (loud) + xfs_warn(mp, "file system busy"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Version 1 directory format has never worked on Linux. + */ + if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { + if (loud) + xfs_warn(mp, + "file system using version 1 directory format"); + return XFS_ERROR(ENOSYS); + } + + return 0; +} + +int +xfs_initialize_perag( + xfs_mount_t *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index, max_metadata; + xfs_agnumber_t first_initialised = 0; + xfs_perag_t *pag; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_sb_t *sbp = &mp->m_sb; + int error = -ENOMEM; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. 
+ */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + if (!first_initialised) + first_initialised = index; + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + spin_lock_init(&pag->pag_ici_lock); + mutex_init(&pag->pag_ici_reclaim_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; + + if (radix_tree_preload(GFP_NOFS)) + goto out_unwind; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. + */ + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) + mp->m_flags |= XFS_MOUNT_32BITINODES; + else + mp->m_flags &= ~XFS_MOUNT_32BITINODES; + + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + /* + * Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = agcount; + } + + for (index = 0; index < agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + if (ino > XFS_MAXINUMBER_32) { + index++; + break; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + } else { + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + xfs_perag_put(pag); + } + } + + if (maxagi) + *maxagi = index; + return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; +} + +void +xfs_sb_from_disk( + xfs_sb_t *to, + xfs_dsb_t *from) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = 
from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); +} + +/* + * Copy in core superblock to ondisk one. + * + * The fields argument is mask of superblock fields to copy. + */ +void +xfs_sb_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t fields) +{ + xfs_caddr_t to_ptr = (xfs_caddr_t)to; + xfs_caddr_t from_ptr = (xfs_caddr_t)from; + xfs_sb_field_t f; + int first; + int size; + + ASSERT(fields); + if (!fields) + return; + + while (fields) { + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + first = xfs_sb_info[f].offset; + size = xfs_sb_info[f + 1].offset - first; + + ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + + if (size == 1 || xfs_sb_info[f].type == 1) { + memcpy(to_ptr + first, from_ptr + first, size); + } else { + switch (size) { + case 2: + *(__be16 *)(to_ptr + first) = + cpu_to_be16(*(__u16 *)(from_ptr + first)); + break; + case 4: + *(__be32 *)(to_ptr + first) = + cpu_to_be32(*(__u32 *)(from_ptr + first)); + break; + case 8: + *(__be64 *)(to_ptr + first) = + cpu_to_be64(*(__u64 *)(from_ptr + first)); + break; + default: + ASSERT(0); + } + } + + fields &= ~(1LL << f); + } +} + +/* + * xfs_readsb + * + * Does the initial read of the superblock. + */ +int +xfs_readsb(xfs_mount_t *mp, int flags) +{ + unsigned int sector_size; + xfs_buf_t *bp; + int error; + int loud = !(flags & XFS_MFSI_QUIET); + + ASSERT(mp->m_sb_bp == NULL); + ASSERT(mp->m_ddev_targp != NULL); + + /* + * Allocate a (locked) buffer to hold the superblock. + * This will be kept around at all times to optimize + * access to the superblock. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + +reread: + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + XFS_SB_DADDR, sector_size, 0); + if (!bp) { + if (loud) + xfs_warn(mp, "SB buffer read failed"); + return EIO; + } + + /* + * Initialize the mount structure from the superblock. + * But first do some basic consistency checking. + */ + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); + if (error) { + if (loud) + xfs_warn(mp, "SB validate failed"); + goto release_buf; + } + + /* + * We must be able to do sector-sized and sector-aligned IO. 
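The field-mask driven conversion in xfs_sb_to_disk() above takes each field's size from the offset of the next xfs_sb_info[] entry, which is why that table ends with a { sizeof(xfs_sb_t), 0 } sentinel. A standalone sketch of the technique, with an invented structure and offset table:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct sb { uint64_t blocks; uint32_t magic; uint16_t version; uint16_t flags; };

static const size_t field_off[] = {
	offsetof(struct sb, blocks),	/* field 0 */
	offsetof(struct sb, magic),	/* field 1 */
	offsetof(struct sb, version),	/* field 2 */
	offsetof(struct sb, flags),	/* field 3 */
	sizeof(struct sb),		/* sentinel: bounds the last field */
};

/* copy one field between two images; its size is the next offset minus this one */
static void copy_field(struct sb *to, const struct sb *from, int f)
{
	size_t first = field_off[f];
	size_t size  = field_off[f + 1] - first;

	memcpy((char *)to + first, (const char *)from + first, size);
}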
+ */ + if (sector_size > mp->m_sb.sb_sectsize) { + if (loud) + xfs_warn(mp, "device supports %u byte sectors (not %u)", + sector_size, mp->m_sb.sb_sectsize); + error = ENOSYS; + goto release_buf; + } + + /* + * If device sector size is smaller than the superblock size, + * re-read the superblock so the buffer is correctly sized. + */ + if (sector_size < mp->m_sb.sb_sectsize) { + xfs_buf_relse(bp); + sector_size = mp->m_sb.sb_sectsize; + goto reread; + } + + /* Initialize per-cpu counters */ + xfs_icsb_reinit_counters(mp); + + mp->m_sb_bp = bp; + xfs_buf_unlock(bp); + return 0; + +release_buf: + xfs_buf_relse(bp); + return error; +} + + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +STATIC void +xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +STATIC int +xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. 
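The shift fields set up in xfs_mount_common() above are just log2 differences. A worked example, assuming 4096-byte filesystem blocks and 512-byte basic blocks (BBSHIFT of 9) and taking XFS_NBBYLOG as the log2 of bits per byte (3):

	m_blkbb_log  = sb_blocklog - BBSHIFT     = 12 - 9 = 3, so one fs block is 8 basic blocks
	m_bsize      = XFS_FSB_TO_BB(mp, 1)      = 1 << 3 = 8
	m_blkbit_log = sb_blocklog + XFS_NBBYLOG = 12 + 3 = 15, i.e. 32768 bits per block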
+ */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + /* + * Overwrite incore superblock counters with just-read data + */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + +/* + * Update alignment values based on mount options and sb values + */ +STATIC int +xfs_update_alignment(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + + if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + if (mp->m_flags & XFS_MOUNT_RETERR) { + xfs_warn(mp, "alignment check 1 failed"); + return XFS_ERROR(EINVAL); + } + mp->m_dalign = mp->m_swidth = 0; + } else { + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { + if (mp->m_flags & XFS_MOUNT_RETERR) { + return XFS_ERROR(EINVAL); + } + xfs_warn(mp, + "stripe alignment turned off: sunit(%d)/swidth(%d) " + "incompatible with agsize(%d)", + mp->m_dalign, mp->m_swidth, + sbp->sb_agblocks); + + mp->m_dalign = 0; + mp->m_swidth = 0; + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + } else { + if (mp->m_flags & XFS_MOUNT_RETERR) { + xfs_warn(mp, + "stripe alignment turned off: sunit(%d) less than bsize(%d)", + mp->m_dalign, + mp->m_blockmask +1); + return XFS_ERROR(EINVAL); + } + mp->m_swidth = 0; + } + } + + /* + * Update superblock with new values + * and log changes + */ + if (xfs_sb_version_hasdalign(sbp)) { + if (sbp->sb_unit != mp->m_dalign) { + sbp->sb_unit = mp->m_dalign; + mp->m_update_flags |= XFS_SB_UNIT; + } + if (sbp->sb_width != mp->m_swidth) { + sbp->sb_width = mp->m_swidth; + mp->m_update_flags |= XFS_SB_WIDTH; + } + } + } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && + xfs_sb_version_hasdalign(&mp->m_sb)) { + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; + } + + return 0; +} + +/* + * Set the maximum inode count for this filesystem + */ +STATIC void +xfs_set_maxicount(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + __uint64_t icount; + + if (sbp->sb_imax_pct) { + /* + * Make sure the maximum inode count is a multiple + * of the units we allocate inodes in. + */ + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + do_div(icount, mp->m_ialloc_blks); + mp->m_maxicount = (icount * mp->m_ialloc_blks) << + sbp->sb_inopblog; + } else { + mp->m_maxicount = 0; + } +} + +/* + * Set the default minimum read and write sizes unless + * already specified in a mount option. + * We use smaller I/O sizes when the file system + * is being used for NFS service (wsync mount option). 
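The default sizes described above come out of a max() of two log2 values in the code that follows. A worked example, assuming a 4k-block filesystem (sb_blocklog of 12) and the common value of 16 for XFS_READIO_LOG_LARGE:

	m_readio_log    = max(sb_blocklog, XFS_READIO_LOG_LARGE) = max(12, 16) = 16
	m_readio_blocks = 1 << (m_readio_log - sb_blocklog)      = 1 << 4      = 16 blocks, i.e. 64k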
+ */ +STATIC void +xfs_set_rw_sizes(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + int readio_log, writeio_log; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + if (mp->m_flags & XFS_MOUNT_WSYNC) { + readio_log = XFS_WSYNC_READIO_LOG; + writeio_log = XFS_WSYNC_WRITEIO_LOG; + } else { + readio_log = XFS_READIO_LOG_LARGE; + writeio_log = XFS_WRITEIO_LOG_LARGE; + } + } else { + readio_log = mp->m_readio_log; + writeio_log = mp->m_writeio_log; + } + + if (sbp->sb_blocklog > readio_log) { + mp->m_readio_log = sbp->sb_blocklog; + } else { + mp->m_readio_log = readio_log; + } + mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); + if (sbp->sb_blocklog > writeio_log) { + mp->m_writeio_log = sbp->sb_blocklog; + } else { + mp->m_writeio_log = writeio_log; + } + mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); +} + +/* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* + * Set whether we're using inode alignment. + */ +STATIC void +xfs_set_inoalignment(xfs_mount_t *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; + /* + * If we are using stripe alignment, check whether + * the stripe unit is a multiple of the inode alignment + */ + if (mp->m_dalign && mp->m_inoalign_mask && + !(mp->m_dalign & mp->m_inoalign_mask)) + mp->m_sinoalign = mp->m_dalign; + else + mp->m_sinoalign = 0; +} + +/* + * Check that the data (and log if separate) are an ok size. + */ +STATIC int +xfs_check_sizes(xfs_mount_t *mp) +{ + xfs_buf_t *bp; + xfs_daddr_t d; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { + xfs_warn(mp, "filesystem size mismatch detected"); + return XFS_ERROR(EFBIG); + } + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + if (!bp) { + xfs_warn(mp, "last sector read failed"); + return EIO; + } + xfs_buf_relse(bp); + + if (mp->m_logdev_targp != mp->m_ddev_targp) { + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return XFS_ERROR(EFBIG); + } + bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) { + xfs_warn(mp, "log device read failed"); + return EIO; + } + xfs_buf_relse(bp); + } + return 0; +} + +/* + * Clear the quotaflags in memory and in the superblock. + */ +int +xfs_mount_reset_sbqflags( + struct xfs_mount *mp) +{ + int error; + struct xfs_trans *tp; + + mp->m_qflags = 0; + + /* + * It is OK to look at sb_qflags here in mount path, + * without m_sb_lock. 
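The low-space threshold loop in xfs_set_low_space_thresholds() above just precomputes one value per percent of the data blocks. For an assumed sb_dblocks of 1,000,000, space works out to 10,000 blocks per percent, so m_low_space[] holds 10,000, 20,000, 30,000 and so on, one entry per level up to XFS_LOWSP_MAX.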
+ */ + if (mp->m_sb.sb_qflags == 0) + return 0; + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = 0; + spin_unlock(&mp->m_sb_lock); + + /* + * If the fs is readonly, let the incore superblock run + * with quotas off but don't flush the update out to disk + */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + +#ifdef QUOTADEBUG + xfs_notice(mp, "Writing superblock quota changes"); +#endif + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_alert(mp, "%s: Superblock update failed!", __func__); + return error; + } + + xfs_mod_sb(tp, XFS_SB_QFLAGS); + return xfs_trans_commit(tp, 0); +} + +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 8192); + return resblks; +} + +/* + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; + + xfs_mount_common(mp, sbp); + + /* + * Check for a mismatched features2 values. Older kernels + * read & wrote into the wrong sb offset for sb_features2 + * on some platforms due to xfs_sb_t not being 64bit size aligned + * when sb_features2 was added, which made older superblock + * reading/writing routines swap it as a 64-bit value. + * + * For backwards compatibility, we make both slots equal. + * + * If we detect a mismatched field, we OR the set bits into the + * existing features2 field in case it has already been modified; we + * don't want to lose any features. We then update the bad location + * with the ORed value so that older kernels will see any features2 + * flags, and mark the two fields as needing updates once the + * transaction subsystem is online. + */ + if (xfs_sb_has_mismatched_features2(sbp)) { + xfs_warn(mp, "correcting sb_features alignment problem"); + sbp->sb_features2 |= sbp->sb_bad_features2; + sbp->sb_bad_features2 = sbp->sb_features2; + mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; + + /* + * Re-check for ATTR2 in case it was found in bad_features2 + * slot. 
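The reserve pool default computed in xfs_default_resblks() above is min(sb_dblocks / 20, 8192), i.e. 5% of the data blocks capped at 8192 blocks. Two assumed examples: with sb_dblocks = 100,000 the pool is 5,000 blocks; with sb_dblocks = 1,000,000 the 50,000-block 5% figure is capped to 8192.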
+ */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + mp->m_update_flags |= XFS_SB_FEATURES2; + + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } + + /* + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + xfs_set_maxicount(mp); + + mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); + + error = xfs_uuid_mount(mp); + if (error) + goto out; + + /* + * Set the minimum read and write sizes + */ + xfs_set_rw_sizes(mp); + + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + + /* + * Set the inode cluster size. + * This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + */ + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + + /* + * Set inode alignment fields + */ + xfs_set_inoalignment(mp); + + /* + * Check that the data (and log if separate) are an ok size. + */ + error = xfs_check_sizes(mp); + if (error) + goto out_remove_uuid; + + /* + * Initialize realtime fields in the mount structure + */ + error = xfs_rtmount_init(mp); + if (error) { + xfs_warn(mp, "RT mount failed"); + goto out_remove_uuid; + } + + /* + * Copies the low order bits of the timestamp and the randomly + * set "sequence" number out of a UUID. + */ + uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + + mp->m_dmevmask = 0; /* not persistent; set after each mount */ + + xfs_dir_mount(mp); + + /* + * Initialize the attribute manager's entries. + */ + mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + + /* + * Initialize the precomputed transaction reservations values. + */ + xfs_trans_init(mp); + + /* + * Allocate and initialize the per-ag data. + */ + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed per-ag init: %d", error); + goto out_remove_uuid; + } + + if (!sbp->sb_logblocks) { + xfs_warn(mp, "no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_free_perag; + } + + /* + * log's mount-time initialization. Perform 1st part recovery if needed + */ + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + xfs_warn(mp, "log mount failed"); + goto out_free_perag; + } + + /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. 
These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) + goto out_free_perag; + } + + /* + * Get and sanity-check the root inode. + * Save the pointer to it in the mount structure. + */ + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); + if (error) { + xfs_warn(mp, "failed to read root inode"); + goto out_log_dealloc; + } + + ASSERT(rip != NULL); + + if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + xfs_warn(mp, "corrupted root inode %llu: not a directory", + (unsigned long long)rip->i_ino); + xfs_iunlock(rip, XFS_ILOCK_EXCL); + XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, + mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_rele_rip; + } + mp->m_rootip = rip; /* save it */ + + xfs_iunlock(rip, XFS_ILOCK_EXCL); + + /* + * Initialize realtime inode pointers in the mount structure + */ + error = xfs_rtmount_inodes(mp); + if (error) { + /* + * Free up the root inode. + */ + xfs_warn(mp, "failed to read RT inodes"); + goto out_rele_rip; + } + + /* + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. + */ + if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + goto out_rtunmount; + } + } + + /* + * Initialise the XFS quota management subsystem for this mount + */ + if (XFS_IS_QUOTA_RUNNING(mp)) { + error = xfs_qm_newmount(mp, "amount, "aflags); + if (error) + goto out_rtunmount; + } else { + ASSERT(!XFS_IS_QUOTA_ON(mp)); + + /* + * If a file system had quotas running earlier, but decided to + * mount without -o uquota/pquota/gquota options, revoke the + * quotachecked license. + */ + if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { + xfs_notice(mp, "resetting quota flags"); + error = xfs_mount_reset_sbqflags(mp); + if (error) + return error; + } + } + + /* + * Finish recovering the file system. This part needed to be + * delayed until after the root and real-time bitmap inodes + * were consistently read in. + */ + error = xfs_log_mount_finish(mp); + if (error) { + xfs_warn(mp, "log mount finish failed"); + goto out_rtunmount; + } + + /* + * Complete the quota initialisation, post-log-replay component. + */ + if (quotamount) { + ASSERT(mp->m_qflags == 0); + mp->m_qflags = quotaflags; + + xfs_qm_mount_quotas(mp); + } + + /* + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. 
+ */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, + "Unable to allocate reserve blocks. Continuing without reserve pool."); + } + + return 0; + + out_rtunmount: + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); + out_log_dealloc: + xfs_log_unmount(mp); + out_free_perag: + xfs_free_perag(mp); + out_remove_uuid: + xfs_uuid_unmount(mp); + out: + return error; +} + +/* + * This flushes out the inodes,dquots and the superblock, unmounts the + * log and makes sure that incore structures are freed. + */ +void +xfs_unmountfs( + struct xfs_mount *mp) +{ + __uint64_t resblks; + int error; + + xfs_qm_unmount_quotas(mp); + xfs_rtunmount_inodes(mp); + IRELE(mp->m_rootip); + + /* + * We can potentially deadlock here if we have an inode cluster + * that has been freed has its buffer still pinned in memory because + * the transaction is still sitting in a iclog. The stale inodes + * on that buffer will have their flush locks held until the + * transaction hits the disk and the callbacks run. the inode + * flush takes the flush lock unconditionally and with nothing to + * push out the iclog we will never get that unlocked. hence we + * need to force the log first. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Do a delwri reclaim pass first so that as many dirty inodes are + * queued up for IO as possible. Then flush the buffers before making + * a synchronous path to catch all the remaining inodes are reclaimed. + * This makes the reclaim process as quick as possible by avoiding + * synchronous writeout and blocking on inodes already in the delwri + * state as much as possible. + */ + xfs_reclaim_inodes(mp, 0); + XFS_bflush(mp->m_ddev_targp); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + xfs_qm_unmount(mp); + + /* + * Flush out the log synchronously so that we know for sure + * that nothing is pinned. This is important because bflush() + * will skip pinned buffers. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Unreserve any blocks we have so that when we unmount we don't account + * the reserved free space as used. This is really only necessary for + * lazy superblock counting because it trusts the incore superblock + * counters to be absolutely correct on clean unmount. + * + * We don't bother correcting this elsewhere for lazy superblock + * counting because on mount of an unclean filesystem we reconstruct the + * correct counter value and this is irrelevant. + * + * For non-lazy counter filesystems, this doesn't matter at all because + * we only every apply deltas to the superblock and hence the incore + * value does not matter.... + */ + resblks = 0; + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, "Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); + + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_warn(mp, "Unable to update superblock counters. " + "Freespace may not be correct on next mount."); + xfs_unmountfs_writesb(mp); + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. 
+ */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + + xfs_log_unmount_write(mp); + xfs_log_unmount(mp); + xfs_uuid_unmount(mp); + +#if defined(DEBUG) + xfs_errortag_clearall(mp, 0); +#endif + xfs_free_perag(mp); +} + +int +xfs_fs_writable(xfs_mount_t *mp) +{ + return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + (mp->m_flags & XFS_MOUNT_RDONLY)); +} + +/* + * xfs_log_sbcount + * + * Called either periodically to keep the on disk superblock values + * roughly up to date or from unmount to make sure the values are + * correct on a clean unmount. + * + * Note this code can be called during the process of freezing, so + * we may need to use the transaction allocator which does not not + * block when the transaction subsystem is in its frozen state. + */ +int +xfs_log_sbcount( + xfs_mount_t *mp, + uint sync) +{ + xfs_trans_t *tp; + int error; + + if (!xfs_fs_writable(mp)) + return 0; + + xfs_icsb_sync_counters(mp, 0); + + /* + * we don't need to do this if we are updating the superblock + * counters on every modification. + */ + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); + if (sync) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return error; +} + +int +xfs_unmountfs_writesb(xfs_mount_t *mp) +{ + xfs_buf_t *sbp; + int error = 0; + + /* + * skip superblock write if fs is read-only, or + * if we are doing a forced umount. + */ + if (!((mp->m_flags & XFS_MOUNT_RDONLY) || + XFS_FORCED_SHUTDOWN(mp))) { + + sbp = xfs_getsb(mp, 0); + + XFS_BUF_UNDONE(sbp); + XFS_BUF_UNREAD(sbp); + XFS_BUF_UNDELAYWRITE(sbp); + XFS_BUF_WRITE(sbp); + XFS_BUF_UNASYNC(sbp); + ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); + xfsbdstrat(mp, sbp); + error = xfs_buf_iowait(sbp); + if (error) + xfs_ioerror_alert("xfs_unmountfs_writesb", + mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_relse(sbp); + } + return error; +} + +/* + * xfs_mod_sb() can be used to copy arbitrary changes to the + * in-core superblock into the superblock buffer to be logged. + * It does not provide the higher level of locking that is + * needed to protect the in-core superblock from concurrent + * access. + */ +void +xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +{ + xfs_buf_t *bp; + int first; + int last; + xfs_mount_t *mp; + xfs_sb_field_t f; + + ASSERT(fields); + if (!fields) + return; + mp = tp->t_mountp; + bp = xfs_trans_getsb(tp, mp, 0); + first = sizeof(xfs_sb_t); + last = 0; + + /* translate/copy */ + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); + + /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + + xfs_trans_log_buf(tp, bp, first, last); +} + + +/* + * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply + * a delta to a specified field in the in-core superblock. Simply + * switch on the field indicated and apply the delta to that field. 
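xfs_mod_sb() above logs only the byte range actually dirtied: the lowest set field bit gives the first byte, and the table entry after the highest set field bit bounds the last byte. A standalone sketch of that range computation, using an invented four-field offset table and gcc-style bit helpers in place of xfs_lowbit64/xfs_highbit64 (the mask is assumed non-zero):

#include <stddef.h>
#include <stdint.h>

static const size_t field_off[] = { 0, 8, 12, 14, 16 };	/* field starts plus total size */

static void dirty_range(uint64_t fields, size_t *first, size_t *last)
{
	int lo = __builtin_ctzll(fields);		/* lowest dirty field  */
	int hi = 63 - __builtin_clzll(fields);		/* highest dirty field */

	*first = field_off[lo];
	*last  = field_off[hi + 1] - 1;			/* last byte of that field */
}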
+ * Fields are not allowed to dip below zero, so if the delta would + * do this do not apply it and return EINVAL. + * + * The m_sb_lock must be held when this routine is called. + */ +STATIC int +xfs_mod_incore_sb_unlocked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + int scounter; /* short counter for 32 bit fields */ + long long lcounter; /* long counter for 64 bit fields */ + long long res_used, rem; + + /* + * With the in-core superblock spin lock held, switch + * on the indicated field. Apply the delta to the + * proper field. If the fields value would dip below + * 0, then do not apply the delta and return EINVAL. + */ + switch (field) { + case XFS_SBS_ICOUNT: + lcounter = (long long)mp->m_sb.sb_icount; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_icount = lcounter; + return 0; + case XFS_SBS_IFREE: + lcounter = (long long)mp->m_sb.sb_ifree; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_ifree = lcounter; + return 0; + case XFS_SBS_FDBLOCKS: + lcounter = (long long) + mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (delta > 0) { /* Putting blocks back */ + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + rem = delta - res_used; + mp->m_resblks_avail = mp->m_resblks; + lcounter += rem; + } + } else { /* Taking blocks away */ + lcounter += delta; + if (lcounter >= 0) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } + + /* + * We are out of blocks, use any available reserved + * blocks if were allowed to. + */ + if (!rsvd) + return XFS_ERROR(ENOSPC); + + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; + } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! 
" + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); + } + + mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); + return 0; + case XFS_SBS_FREXTENTS: + lcounter = (long long)mp->m_sb.sb_frextents; + lcounter += delta; + if (lcounter < 0) { + return XFS_ERROR(ENOSPC); + } + mp->m_sb.sb_frextents = lcounter; + return 0; + case XFS_SBS_DBLOCKS: + lcounter = (long long)mp->m_sb.sb_dblocks; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_dblocks = lcounter; + return 0; + case XFS_SBS_AGCOUNT: + scounter = mp->m_sb.sb_agcount; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_agcount = scounter; + return 0; + case XFS_SBS_IMAX_PCT: + scounter = mp->m_sb.sb_imax_pct; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_imax_pct = scounter; + return 0; + case XFS_SBS_REXTSIZE: + scounter = mp->m_sb.sb_rextsize; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rextsize = scounter; + return 0; + case XFS_SBS_RBMBLOCKS: + scounter = mp->m_sb.sb_rbmblocks; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rbmblocks = scounter; + return 0; + case XFS_SBS_RBLOCKS: + lcounter = (long long)mp->m_sb.sb_rblocks; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rblocks = lcounter; + return 0; + case XFS_SBS_REXTENTS: + lcounter = (long long)mp->m_sb.sb_rextents; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rextents = lcounter; + return 0; + case XFS_SBS_REXTSLOG: + scounter = mp->m_sb.sb_rextslog; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rextslog = scounter; + return 0; + default: + ASSERT(0); + return XFS_ERROR(EINVAL); + } +} + +/* + * xfs_mod_incore_sb() is used to change a field in the in-core + * superblock structure by the specified delta. This modification + * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() + * routine to do the work. + */ +int +xfs_mod_incore_sb( + struct xfs_mount *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + int status; + +#ifdef HAVE_PERCPU_SB + ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); +#endif + spin_lock(&mp->m_sb_lock); + status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); + + return status; +} + +/* + * Change more than one field in the in-core superblock structure at a time. + * + * The fields and changes to those fields are specified in the array of + * xfs_mod_sb structures passed in. Either all of the specified deltas + * will be applied or none of them will. If any modified field dips below 0, + * then all modifications will be backed out and EINVAL will be returned. + * + * Note that this function may not be used for the superblock values that + * are tracked with the in-memory per-cpu counters - a direct call to + * xfs_icsb_modify_counters is required for these. + */ +int +xfs_mod_incore_sb_batch( + struct xfs_mount *mp, + xfs_mod_sb_t *msb, + uint nmsb, + int rsvd) +{ + xfs_mod_sb_t *msbp; + int error = 0; + + /* + * Loop through the array of mod structures and apply each individually. + * If any fail, then back out all those which have already been applied. 
+ * Do all of this within the scope of the m_sb_lock so that all of the + * changes will be atomic. + */ + spin_lock(&mp->m_sb_lock); + for (msbp = msb; msbp < (msb + nmsb); msbp++) { + ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || + msbp->msb_field > XFS_SBS_FDBLOCKS); + + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + msbp->msb_delta, rsvd); + if (error) + goto unwind; + } + spin_unlock(&mp->m_sb_lock); + return 0; + +unwind: + while (--msbp >= msb) { + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + -msbp->msb_delta, rsvd); + ASSERT(error == 0); + } + spin_unlock(&mp->m_sb_lock); + return error; +} + +/* + * xfs_getsb() is called to obtain the buffer for the superblock. + * The buffer is returned locked and read in from disk. + * The buffer should be released with a call to xfs_brelse(). + * + * If the flags parameter is BUF_TRYLOCK, then we'll only return + * the superblock buffer if it can be locked without sleeping. + * If it can't then we'll return NULL. + */ +xfs_buf_t * +xfs_getsb( + xfs_mount_t *mp, + int flags) +{ + xfs_buf_t *bp; + + ASSERT(mp->m_sb_bp != NULL); + bp = mp->m_sb_bp; + if (flags & XBF_TRYLOCK) { + if (!XFS_BUF_CPSEMA(bp)) { + return NULL; + } + } else { + XFS_BUF_PSEMA(bp, PRIBIO); + } + XFS_BUF_HOLD(bp); + ASSERT(XFS_BUF_ISDONE(bp)); + return bp; +} + +/* + * Used to free the superblock along various error paths. + */ +void +xfs_freesb( + struct xfs_mount *mp) +{ + struct xfs_buf *bp = mp->m_sb_bp; + + xfs_buf_lock(bp); + mp->m_sb_bp = NULL; + xfs_buf_relse(bp); +} + +/* + * Used to log changes to the superblock unit and width fields which could + * be altered by the mount options, as well as any potential sb_features2 + * fixup. Only the first superblock is updated. + */ +int +xfs_mount_log_sb( + xfs_mount_t *mp, + __int64_t fields) +{ + xfs_trans_t *tp; + int error; + + ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | + XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | + XFS_SB_VERSIONNUM)); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_mod_sb(tp, fields); + error = xfs_trans_commit(tp, 0); + return error; +} + +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + xfs_notice(mp, "%s required on read-only device.", message); + xfs_notice(mp, "write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} + +#ifdef HAVE_PERCPU_SB +/* + * Per-cpu incore superblock counters + * + * Simple concept, difficult implementation + * + * Basically, replace the incore superblock counters with a distributed per cpu + * counter for contended fields (e.g. free block count). + * + * Difficulties arise in that the incore sb is used for ENOSPC checking, and + * hence needs to be accurately read when we are running low on space. Hence + * there is a method to enable and disable the per-cpu counters based on how + * much "stuff" is available in them. + * + * Basically, a counter is enabled if there is enough free resource to justify + * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. 
a local + * ENOSPC), then we disable the counters to synchronise all callers and + * re-distribute the available resources. + * + * If, once we redistributed the available resources, we still get a failure, + * we disable the per-cpu counter and go through the slow path. + * + * The slow path is the current xfs_mod_incore_sb() function. This means that + * when we disable a per-cpu counter, we need to drain its resources back to + * the global superblock. We do this after disabling the counter to prevent + * more threads from queueing up on the counter. + * + * Essentially, this means that we still need a lock in the fast path to enable + * synchronisation between the global counters and the per-cpu counters. This + * is not a problem because the lock will be local to a CPU almost all the time + * and have little contention except when we get to ENOSPC conditions. + * + * Basically, this lock becomes a barrier that enables us to lock out the fast + * path while we do things like enabling and disabling counters and + * synchronising the counters. + * + * Locking rules: + * + * 1. m_sb_lock before picking up per-cpu locks + * 2. per-cpu locks always picked up via for_each_online_cpu() order + * 3. accurate counter sync requires m_sb_lock + per cpu locks + * 4. modifying per-cpu counters requires holding per-cpu lock + * 5. modifying global counters requires holding m_sb_lock + * 6. enabling or disabling a counter requires holding the m_sb_lock + * and _none_ of the per-cpu locks. + * + * Disabled counters are only ever re-enabled by a balance operation + * that results in more free resources per CPU than a given threshold. + * To ensure counters don't remain disabled, they are rebalanced when + * the global resource goes above a higher threshold (i.e. some hysteresis + * is present to prevent thrashing). + */ + +#ifdef CONFIG_HOTPLUG_CPU +/* + * hot-plug CPU notifier support. + * + * We need a notifier per filesystem as we need to be able to identify + * the filesystem to balance the counters out. This is achieved by + * having a notifier block embedded in the xfs_mount_t and doing pointer + * magic to get the mount pointer from the notifier block address. + */ +STATIC int +xfs_icsb_cpu_notify( + struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + xfs_icsb_cnts_t *cntp; + xfs_mount_t *mp; + + mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); + cntp = (xfs_icsb_cnts_t *) + per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + /* Easy Case - initialize the area and locks, and + * then rebalance when online does everything else for us. */ + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + xfs_icsb_lock(mp); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* Disable all the counters, then fold the dead cpu's + * count into the total on the global superblock and + * re-enable the counters. 
*/ + xfs_icsb_lock(mp); + spin_lock(&mp->m_sb_lock); + xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); + xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); + xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); + + mp->m_sb.sb_icount += cntp->icsb_icount; + mp->m_sb.sb_ifree += cntp->icsb_ifree; + mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; + + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + + xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); + spin_unlock(&mp->m_sb_lock); + xfs_icsb_unlock(mp); + break; + } + + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int +xfs_icsb_init_counters( + xfs_mount_t *mp) +{ + xfs_icsb_cnts_t *cntp; + int i; + + mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); + if (mp->m_sb_cnts == NULL) + return -ENOMEM; + +#ifdef CONFIG_HOTPLUG_CPU + mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; + mp->m_icsb_notifier.priority = 0; + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + } + + mutex_init(&mp->m_icsb_mutex); + + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + return 0; +} + +void +xfs_icsb_reinit_counters( + xfs_mount_t *mp) +{ + xfs_icsb_lock(mp); + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); +} + +void +xfs_icsb_destroy_counters( + xfs_mount_t *mp) +{ + if (mp->m_sb_cnts) { + unregister_hotcpu_notifier(&mp->m_icsb_notifier); + free_percpu(mp->m_sb_cnts); + } + mutex_destroy(&mp->m_icsb_mutex); +} + +STATIC void +xfs_icsb_lock_cntr( + xfs_icsb_cnts_t *icsbp) +{ + while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { + ndelay(1000); + } +} + +STATIC void +xfs_icsb_unlock_cntr( + xfs_icsb_cnts_t *icsbp) +{ + clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); +} + + +STATIC void +xfs_icsb_lock_all_counters( + xfs_mount_t *mp) +{ + xfs_icsb_cnts_t *cntp; + int i; + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + xfs_icsb_lock_cntr(cntp); + } +} + +STATIC void +xfs_icsb_unlock_all_counters( + xfs_mount_t *mp) +{ + xfs_icsb_cnts_t *cntp; + int i; + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + xfs_icsb_unlock_cntr(cntp); + } +} + +STATIC void +xfs_icsb_count( + xfs_mount_t *mp, + xfs_icsb_cnts_t *cnt, + int flags) +{ + xfs_icsb_cnts_t *cntp; + int i; + + memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); + + if (!(flags & XFS_ICSB_LAZY_COUNT)) + xfs_icsb_lock_all_counters(mp); + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + cnt->icsb_icount += cntp->icsb_icount; + cnt->icsb_ifree += cntp->icsb_ifree; + cnt->icsb_fdblocks += cntp->icsb_fdblocks; + } + + if (!(flags & XFS_ICSB_LAZY_COUNT)) + xfs_icsb_unlock_all_counters(mp); +} + +STATIC int +xfs_icsb_counter_disabled( + xfs_mount_t *mp, + xfs_sb_field_t field) +{ + ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + return test_bit(field, &mp->m_icsb_counters); +} + +STATIC void +xfs_icsb_disable_counter( + xfs_mount_t *mp, + xfs_sb_field_t field) +{ + 
xfs_icsb_cnts_t cnt; + + ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + + /* + * If we are already disabled, then there is nothing to do + * here. We check before locking all the counters to avoid + * the expensive lock operation when being called in the + * slow path and the counter is already disabled. This is + * safe because the only time we set or clear this state is under + * the m_icsb_mutex. + */ + if (xfs_icsb_counter_disabled(mp, field)) + return; + + xfs_icsb_lock_all_counters(mp); + if (!test_and_set_bit(field, &mp->m_icsb_counters)) { + /* drain back to superblock */ + + xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); + switch(field) { + case XFS_SBS_ICOUNT: + mp->m_sb.sb_icount = cnt.icsb_icount; + break; + case XFS_SBS_IFREE: + mp->m_sb.sb_ifree = cnt.icsb_ifree; + break; + case XFS_SBS_FDBLOCKS: + mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; + break; + default: + BUG(); + } + } + + xfs_icsb_unlock_all_counters(mp); +} + +STATIC void +xfs_icsb_enable_counter( + xfs_mount_t *mp, + xfs_sb_field_t field, + uint64_t count, + uint64_t resid) +{ + xfs_icsb_cnts_t *cntp; + int i; + + ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + + xfs_icsb_lock_all_counters(mp); + for_each_online_cpu(i) { + cntp = per_cpu_ptr(mp->m_sb_cnts, i); + switch (field) { + case XFS_SBS_ICOUNT: + cntp->icsb_icount = count + resid; + break; + case XFS_SBS_IFREE: + cntp->icsb_ifree = count + resid; + break; + case XFS_SBS_FDBLOCKS: + cntp->icsb_fdblocks = count + resid; + break; + default: + BUG(); + break; + } + resid = 0; + } + clear_bit(field, &mp->m_icsb_counters); + xfs_icsb_unlock_all_counters(mp); +} + +void +xfs_icsb_sync_counters_locked( + xfs_mount_t *mp, + int flags) +{ + xfs_icsb_cnts_t cnt; + + xfs_icsb_count(mp, &cnt, flags); + + if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) + mp->m_sb.sb_icount = cnt.icsb_icount; + if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) + mp->m_sb.sb_ifree = cnt.icsb_ifree; + if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) + mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; +} + +/* + * Accurate update of per-cpu counters to incore superblock + */ +void +xfs_icsb_sync_counters( + xfs_mount_t *mp, + int flags) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, flags); + spin_unlock(&mp->m_sb_lock); +} + +/* + * Balance and enable/disable counters as necessary. + * + * Thresholds for re-enabling counters are somewhat magic. inode counts are + * chosen to be the same number as single on disk allocation chunk per CPU, and + * free blocks is something far enough zero that we aren't going thrash when we + * get near ENOSPC. We also need to supply a minimum we require per cpu to + * prevent looping endlessly when xfs_alloc_space asks for more than will + * be distributed to a single CPU but each CPU has enough blocks to be + * reenabled. + * + * Note that we can be called when counters are already disabled. + * xfs_icsb_disable_counter() optimises the counter locking in this case to + * prevent locking every per-cpu counter needlessly. 
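+ *
+ * As a worked example (for illustration only): with four CPUs online and
+ * sb_icount = 1030, xfs_icsb_balance_counter_locked() computes
+ *
+ *	count = 1030 / 4 = 257,  resid = 1030 % 4 = 2
+ *
+ * 257 is above the re-enable threshold of max(min_per_cpu, 64), so the
+ * counter is re-enabled with the first online CPU holding 259 and the
+ * other CPUs 257 each. Had sb_icount been only 200, the per-CPU share of
+ * 50 would fall below that threshold and the counter would stay disabled,
+ * i.e. modifications would keep serialising on the global superblock.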
+ */ + +#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 +#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ + (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) +STATIC void +xfs_icsb_balance_counter_locked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int min_per_cpu) +{ + uint64_t count, resid; + int weight = num_online_cpus(); + uint64_t min = (uint64_t)min_per_cpu; + + /* disable counter and sync counter */ + xfs_icsb_disable_counter(mp, field); + + /* update counters - first CPU gets residual*/ + switch (field) { + case XFS_SBS_ICOUNT: + count = mp->m_sb.sb_icount; + resid = do_div(count, weight); + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; + break; + case XFS_SBS_IFREE: + count = mp->m_sb.sb_ifree; + resid = do_div(count, weight); + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; + break; + case XFS_SBS_FDBLOCKS: + count = mp->m_sb.sb_fdblocks; + resid = do_div(count, weight); + if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) + return; + break; + default: + BUG(); + count = resid = 0; /* quiet, gcc */ + break; + } + + xfs_icsb_enable_counter(mp, field, count, resid); +} + +STATIC void +xfs_icsb_balance_counter( + xfs_mount_t *mp, + xfs_sb_field_t fields, + int min_per_cpu) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); + spin_unlock(&mp->m_sb_lock); +} + +int +xfs_icsb_modify_counters( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + xfs_icsb_cnts_t *icsbp; + long long lcounter; /* long counter for 64 bit fields */ + int ret = 0; + + might_sleep(); +again: + preempt_disable(); + icsbp = this_cpu_ptr(mp->m_sb_cnts); + + /* + * if the counter is disabled, go to slow path + */ + if (unlikely(xfs_icsb_counter_disabled(mp, field))) + goto slow_path; + xfs_icsb_lock_cntr(icsbp); + if (unlikely(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock_cntr(icsbp); + goto slow_path; + } + + switch (field) { + case XFS_SBS_ICOUNT: + lcounter = icsbp->icsb_icount; + lcounter += delta; + if (unlikely(lcounter < 0)) + goto balance_counter; + icsbp->icsb_icount = lcounter; + break; + + case XFS_SBS_IFREE: + lcounter = icsbp->icsb_ifree; + lcounter += delta; + if (unlikely(lcounter < 0)) + goto balance_counter; + icsbp->icsb_ifree = lcounter; + break; + + case XFS_SBS_FDBLOCKS: + BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); + + lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + lcounter += delta; + if (unlikely(lcounter < 0)) + goto balance_counter; + icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); + break; + default: + BUG(); + break; + } + xfs_icsb_unlock_cntr(icsbp); + preempt_enable(); + return 0; + +slow_path: + preempt_enable(); + + /* + * serialise with a mutex so we don't burn lots of cpu on + * the superblock lock. We still need to hold the superblock + * lock, however, when we modify the global structures. + */ + xfs_icsb_lock(mp); + + /* + * Now running atomically. + * + * If the counter is enabled, someone has beaten us to rebalancing. + * Drop the lock and try again in the fast path.... + */ + if (!(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock(mp); + goto again; + } + + /* + * The counter is currently disabled. Because we are + * running atomically here, we know a rebalance cannot + * be in progress. Hence we can go straight to operating + * on the global superblock. We do not call xfs_mod_incore_sb() + * here even though we need to get the m_sb_lock. Doing so + * will cause us to re-enter this function and deadlock. 
+ * Hence we get the m_sb_lock ourselves and then call + * xfs_mod_incore_sb_unlocked() as the unlocked path operates + * directly on the global counters. + */ + spin_lock(&mp->m_sb_lock); + ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); + + /* + * Now that we've modified the global superblock, we + * may be able to re-enable the distributed counters + * (e.g. lots of space just got freed). After that + * we are done. + */ + if (ret != ENOSPC) + xfs_icsb_balance_counter(mp, field, 0); + xfs_icsb_unlock(mp); + return ret; + +balance_counter: + xfs_icsb_unlock_cntr(icsbp); + preempt_enable(); + + /* + * We may have multiple threads here if multiple per-cpu + * counters run dry at the same time. This will mean we can + * do more balances than strictly necessary but it is not + * the common slowpath case. + */ + xfs_icsb_lock(mp); + + /* + * running atomically. + * + * This will leave the counter in the correct state for future + * accesses. After the rebalance, we simply try again and our retry + * will either succeed through the fast path or slow path without + * another balance operation being required. + */ + xfs_icsb_balance_counter(mp, field, delta); + xfs_icsb_unlock(mp); + goto again; +} + +#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h new file mode 100644 index 00000000..3d68bb26 --- /dev/null +++ b/fs/xfs/xfs_mount.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MOUNT_H__ +#define __XFS_MOUNT_H__ + +typedef struct xfs_trans_reservations { + uint tr_write; /* extent alloc trans */ + uint tr_itruncate; /* truncate trans */ + uint tr_rename; /* rename trans */ + uint tr_link; /* link trans */ + uint tr_remove; /* unlink trans */ + uint tr_symlink; /* symlink trans */ + uint tr_create; /* create trans */ + uint tr_mkdir; /* mkdir trans */ + uint tr_ifree; /* inode free trans */ + uint tr_ichange; /* inode update trans */ + uint tr_growdata; /* fs data section grow trans */ + uint tr_swrite; /* sync write inode trans */ + uint tr_addafork; /* cvt inode to attributed trans */ + uint tr_writeid; /* write setuid/setgid file */ + uint tr_attrinval; /* attr fork buffer invalidation */ + uint tr_attrset; /* set/create an attribute */ + uint tr_attrrm; /* remove an attribute */ + uint tr_clearagi; /* clear bad agi unlinked ino bucket */ + uint tr_growrtalloc; /* grow realtime allocations */ + uint tr_growrtzero; /* grow realtime zeroing */ + uint tr_growrtfree; /* grow realtime freeing */ +} xfs_trans_reservations_t; + +#ifndef __KERNEL__ + +#define xfs_daddr_to_agno(mp,d) \ + ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) +#define xfs_daddr_to_agbno(mp,d) \ + ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) + +#else /* __KERNEL__ */ + +#include "xfs_sync.h" + +struct log; +struct xfs_mount_args; +struct xfs_inode; +struct xfs_bmbt_irec; +struct xfs_bmap_free; +struct xfs_extdelta; +struct xfs_swapext; +struct xfs_mru_cache; +struct xfs_nameops; +struct xfs_ail; +struct xfs_quotainfo; + +#ifdef HAVE_PERCPU_SB + +/* + * Valid per-cpu incore superblock counters. Note that if you add new counters, + * you may need to define new counter disabled bit field descriptors as there + * are more possible fields in the superblock that can fit in a bitfield on a + * 32 bit platform. The XFS_SBS_* values for the current current counters just + * fit. 
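+ *
+ * The enabled/disabled state of each counter is kept as one bit per
+ * XFS_SBS_* field in mp->m_icsb_counters, so the helpers in xfs_mount.c
+ * boil down to the generic bit operations (sketch):
+ *
+ *	disabled = test_bit(field, &mp->m_icsb_counters);
+ *	test_and_set_bit(field, &mp->m_icsb_counters);	(disable)
+ *	clear_bit(field, &mp->m_icsb_counters);		(re-enable)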
+ */ +typedef struct xfs_icsb_cnts { + uint64_t icsb_fdblocks; + uint64_t icsb_ifree; + uint64_t icsb_icount; + unsigned long icsb_flags; +} xfs_icsb_cnts_t; + +#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ + +#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ + +extern int xfs_icsb_init_counters(struct xfs_mount *); +extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_destroy_counters(struct xfs_mount *); +extern void xfs_icsb_sync_counters(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); +extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, + int64_t, int); + +#else +#define xfs_icsb_init_counters(mp) (0) +#define xfs_icsb_destroy_counters(mp) do { } while (0) +#define xfs_icsb_reinit_counters(mp) do { } while (0) +#define xfs_icsb_sync_counters(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) +#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ + xfs_mod_incore_sb(mp, field, delta, rsvd) +#endif + +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + +typedef struct xfs_mount { + struct super_block *m_super; + xfs_tid_t m_tid; /* next unused tid for fs */ + struct xfs_ail *m_ail; /* fs active log item list */ + xfs_sb_t m_sb; /* copy of fs superblock */ + spinlock_t m_sb_lock; /* sb counter lock */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ + char *m_fsname; /* filesystem name */ + int m_fsname_len; /* strlen of fs name */ + char *m_rtname; /* realtime device name */ + char *m_logname; /* external log device name */ + int m_bsize; /* fs logical block size */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + spinlock_t m_agirotor_lock;/* .. 
and lock protecting it */ + xfs_agnumber_t m_maxagi; /* highest inode alloc group */ + uint m_readio_log; /* min read size log bytes */ + uint m_readio_blocks; /* min read size blocks */ + uint m_writeio_log; /* min write size log bytes */ + uint m_writeio_blocks; /* min write size blocks */ + struct log *m_log; /* log specific stuff */ + int m_logbufs; /* number of log buffers */ + int m_logbsize; /* size of each log buffer */ + uint m_rsumlevels; /* rt summary levels */ + uint m_rsumsize; /* size of rt summary, bytes */ + struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ + struct xfs_inode *m_rsumip; /* pointer to summary inode */ + struct xfs_inode *m_rootip; /* pointer to root directory */ + struct xfs_quotainfo *m_quotainfo; /* disk quota information */ + xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ + xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ + xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + __uint8_t m_blkbit_log; /* blocklog + NBBY */ + __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ + __uint8_t m_agno_log; /* log #ag's */ + __uint8_t m_agino_log; /* #bits for agino in inum */ + __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_blockmask; /* sb_blocksize-1 */ + uint m_blockwsize; /* sb_blocksize in words */ + uint m_blockwmask; /* blockwsize-1 */ + uint m_alloc_mxr[2]; /* max alloc btree records */ + uint m_alloc_mnr[2]; /* min alloc btree records */ + uint m_bmap_dmxr[2]; /* max bmap btree records */ + uint m_bmap_dmnr[2]; /* min bmap btree records */ + uint m_inobt_mxr[2]; /* max inobt btree records */ + uint m_inobt_mnr[2]; /* min inobt btree records */ + uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ + uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ + uint m_in_maxlevels; /* max inobt btree levels. 
*/ + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ + struct mutex m_growlock; /* growfs mutex */ + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint m_dmevmask; /* DMI events for this FS */ + __uint64_t m_flags; /* global mount flags */ + uint m_dir_node_ents; /* #entries in a dir danode */ + uint m_attr_node_ents; /* #entries in attr danode */ + int m_ialloc_inos; /* inodes in inode allocation */ + int m_ialloc_blks; /* blocks in inode allocation */ + int m_inoalign_mask;/* mask sb_inoalignmt if used */ + uint m_qflags; /* quota status flags */ + xfs_trans_reservations_t m_reservations;/* precomputed res values */ + __uint64_t m_maxicount; /* maximum inode count */ + __uint64_t m_maxioffset; /* maximum inode offset */ + __uint64_t m_resblks; /* total reserved blocks */ + __uint64_t m_resblks_avail;/* available reserved blocks */ + __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + int m_sinoalign; /* stripe unit inode alignment */ + int m_attr_magicpct;/* 37% of the blocksize */ + int m_dir_magicpct; /* 37% of the dir blocksize */ + __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ + int m_dirblksize; /* directory block sz--bytes */ + int m_dirblkfsbs; /* directory block sz--fsbs */ + xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ + xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ + xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ + uint m_chsize; /* size of next field */ + struct xfs_chash *m_chash; /* fs private inode per-cluster + * hash table */ + atomic_t m_active_trans; /* number trans frozen */ +#ifdef HAVE_PERCPU_SB + xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ + unsigned long m_icsb_counters; /* disabled per-cpu counters */ + struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ + struct mutex m_icsb_mutex; /* balancer sync lock */ +#endif + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct delayed_work m_sync_work; /* background sync work */ + struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct work_struct m_flush_work; /* background inode flush */ + __int64_t m_update_flags; /* sb flags we need to update + on the next remount,rw */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ +} xfs_mount_t; + +/* + * Flags for m_flags. 
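+ * These are simple bit flags in the 64 bit m_flags field, tested and set
+ * with plain bitwise operations, e.g. (illustrative):
+ *
+ *	if (mp->m_flags & XFS_MOUNT_RDONLY)
+ *		return XFS_ERROR(EROFS);
+ *	mp->m_flags |= XFS_MOUNT_ATTR2;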
+ */ +#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops + must be synchronous except + for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ +#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) +#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem + operations, typically for + disk errors in metadata */ +#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ +#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to + user */ +#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment + allocations */ +#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ +#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ +#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ +#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ +#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above + * 32 bits in size */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ +#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ +#define XFS_MOUNT_BARRIER (1ULL << 17) +#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ +#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width + * allocation */ +#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ +#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ +#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred + * I/O size in stat() */ +#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams + allocator */ +#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ + + +/* + * Default minimum read and write sizes. + */ +#define XFS_READIO_LOG_LARGE 16 +#define XFS_WRITEIO_LOG_LARGE 16 + +/* + * Max and min values for mount-option defined I/O + * preallocation sizes. + */ +#define XFS_MAX_IO_LOG 30 /* 1G */ +#define XFS_MIN_IO_LOG PAGE_SHIFT + +/* + * Synchronous read and write sizes. This should be + * better for NFSv2 wsync filesystems. + */ +#define XFS_WSYNC_READIO_LOG 15 /* 32k */ +#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ + +/* + * Allow large block sizes to be reported to userspace programs if the + * "largeio" mount option is used. + * + * If compatibility mode is specified, simply return the basic unit of caching + * so that we don't get inefficient read/modify/write I/O from user apps. + * Otherwise.... + * + * If the underlying volume is a stripe, then return the stripe width in bytes + * as the recommended I/O size. It is not a stripe and we've set a default + * buffered I/O size, return that, otherwise return the compat default. + */ +static inline unsigned long +xfs_preferred_iosize(xfs_mount_t *mp) +{ + if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE) + return PAGE_CACHE_SIZE; + return (mp->m_swidth ? + (mp->m_swidth << mp->m_sb.sb_blocklog) : + ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? 
+ (1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) : + PAGE_CACHE_SIZE)); +} + +#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) + +#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ + ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) +#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, + int lnnum); +#define xfs_force_shutdown(m,f) \ + xfs_do_force_shutdown(m, f, __FILE__, __LINE__) + +#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ +#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ + +#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) +#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) + +/* + * Flags for xfs_mountfs + */ +#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ + +static inline xfs_agnumber_t +xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) +{ + xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + do_div(ld, mp->m_sb.sb_agblocks); + return (xfs_agnumber_t) ld; +} + +static inline xfs_agblock_t +xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) +{ + xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); +} + +/* + * perag get/put wrappers for ref counting + */ +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + int tag); +void xfs_perag_put(struct xfs_perag *pag); + +/* + * Per-cpu superblock locking functions + */ +#ifdef HAVE_PERCPU_SB +static inline void +xfs_icsb_lock(xfs_mount_t *mp) +{ + mutex_lock(&mp->m_icsb_mutex); +} + +static inline void +xfs_icsb_unlock(xfs_mount_t *mp) +{ + mutex_unlock(&mp->m_icsb_mutex); +} +#else +#define xfs_icsb_lock(mp) +#define xfs_icsb_unlock(mp) +#endif + +/* + * This structure is for use by the xfs_mod_incore_sb_batch() routine. 
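+ * For illustration, a caller wanting to apply several deltas atomically
+ * might build (sketch only; the deltas and locals are example names):
+ *
+ *	xfs_mod_sb_t	msb[2];
+ *
+ *	msb[0].msb_field = XFS_SBS_DBLOCKS;
+ *	msb[0].msb_delta = new_dblocks;
+ *	msb[1].msb_field = XFS_SBS_AGCOUNT;
+ *	msb[1].msb_delta = new_ags;
+ *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
+ *
+ * Either both deltas are applied under m_sb_lock or, on error, both are
+ * backed out.
+ *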
+ * xfs_growfs can specify a few fields which are more than int limit + */ +typedef struct xfs_mod_sb { + xfs_sb_field_t msb_field; /* Field to modify, see below */ + int64_t msb_delta; /* Change to make to specified field */ +} xfs_mod_sb_t; + +extern int xfs_log_sbcount(xfs_mount_t *, uint); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern int xfs_mountfs(xfs_mount_t *mp); + +extern void xfs_unmountfs(xfs_mount_t *); +extern int xfs_unmountfs_writesb(xfs_mount_t *); +extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); +extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, + uint, int); +extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); +extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); +extern int xfs_readsb(xfs_mount_t *, int); +extern void xfs_freesb(xfs_mount_t *); +extern int xfs_fs_writable(xfs_mount_t *); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + +#endif /* __KERNEL__ */ + +extern void xfs_mod_sb(struct xfs_trans *, __int64_t); +extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, + xfs_agnumber_t *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); + +#endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c new file mode 100644 index 00000000..4aff5639 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. 
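+ *
+ * Concretely, with the ten second / five group example above, the group
+ * index for an insert is computed as in _xfs_mru_cache_list_insert():
+ *
+ *	grp = (now - time_zero) / grp_time;
+ *	grp = (lru_grp + grp) % grp_count;
+ *
+ * so, with the LRU group still at list zero, an element added five seconds
+ * after time zero lands in the third list (grp = 2).
+ *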
+ * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. To keep the list maintenance portion of these operations + * O(1) also, list tails need to be accessible without walking the entire list. + * This is the reason why doubly linked list heads are used. + */ + +/* + * An MRU Cache is a dynamic data structure that stores its elements in a way + * that allows efficient lookups, but also groups them into discrete time + * intervals based on insertion time. This allows elements to be efficiently + * and automatically reaped after a fixed period of inactivity. + * + * When a client data pointer is stored in the MRU Cache it needs to be added to + * both the data store and to one of the lists. It must also be possible to + * access each of these entries via the other, i.e. to: + * + * a) Walk a list, removing the corresponding data store entry for each item. + * b) Look up a data store entry, then access its list entry directly. + * + * To achieve both of these goals, each entry must contain both a list entry and + * a key, in addition to the user's data pointer. 
Note that it's not a good + * idea to have the client embed one of these structures at the top of their own + * data structure, because inserting the same item more than once would most + * likely result in a loop in one of the lists. That's a sure-fire recipe for + * an infinite loop in the code. + */ +typedef struct xfs_mru_cache_elem +{ + struct list_head list_node; + unsigned long key; + void *value; +} xfs_mru_cache_elem_t; + +static kmem_zone_t *xfs_mru_elem_zone; +static struct workqueue_struct *xfs_mru_reap_wq; + +/* + * When inserting, destroying or reaping, it's first necessary to update the + * lists relative to a particular time. In the case of destroying, that time + * will be well in the future to ensure that all items are moved to the reap + * list. In all other cases though, the time will be the current time. + * + * This function enters a loop, moving the contents of the LRU list to the reap + * list again and again until either a) the lists are all empty, or b) time zero + * has been advanced sufficiently to be within the immediate element lifetime. + * + * Case a) above is detected by counting how many groups are migrated and + * stopping when they've all been moved. Case b) is detected by monitoring the + * time_zero field, which is updated as each group is migrated. + * + * The return value is the earliest time that more migration could be needed, or + * zero if there's no need to schedule more work because the lists are empty. + */ +STATIC unsigned long +_xfs_mru_cache_migrate( + xfs_mru_cache_t *mru, + unsigned long now) +{ + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; + + /* Nothing to do if the data store is empty. */ + if (!mru->time_zero) + return 0; + + /* While time zero is older than the time spanned by all the lists. */ + while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { + + /* + * If the LRU list isn't empty, migrate its elements to the tail + * of the reap list. + */ + lru_list = mru->lists + mru->lru_grp; + if (!list_empty(lru_list)) + list_splice_init(lru_list, mru->reap_list.prev); + + /* + * Advance the LRU group number, freeing the old LRU list to + * become the new MRU list; advance time zero accordingly. + */ + mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; + mru->time_zero += mru->grp_time; + + /* + * If reaping is so far behind that all the elements on all the + * lists have been migrated to the reap list, it's now empty. + */ + if (++migrated == mru->grp_count) { + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; + } + } + + /* Find the first non-empty list from the LRU end. */ + for (grp = 0; grp < mru->grp_count; grp++) { + + /* Check the grp'th list from the LRU end. */ + lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); + if (!list_empty(lru_list)) + return mru->time_zero + + (mru->grp_count + grp) * mru->grp_time; + } + + /* All the lists must be empty. */ + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; +} + +/* + * When inserting or doing a lookup, an element needs to be inserted into the + * MRU list. The lists must be migrated first to ensure that they're + * up-to-date, otherwise the new element could be given a shorter lifetime in + * the cache than it should. + */ +STATIC void +_xfs_mru_cache_list_insert( + xfs_mru_cache_t *mru, + xfs_mru_cache_elem_t *elem) +{ + unsigned int grp = 0; + unsigned long now = jiffies; + + /* + * If the data store is empty, initialise time zero, leave grp set to + * zero and start the work queue timer if necessary. 
Otherwise, set grp + * to the number of group times that have elapsed since time zero. + */ + if (!_xfs_mru_cache_migrate(mru, now)) { + mru->time_zero = now; + if (!mru->queued) { + mru->queued = 1; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, + mru->grp_count * mru->grp_time); + } + } else { + grp = (now - mru->time_zero) / mru->grp_time; + grp = (mru->lru_grp + grp) % mru->grp_count; + } + + /* Insert the element at the tail of the corresponding list. */ + list_add_tail(&elem->list_node, mru->lists + grp); +} + +/* + * When destroying or reaping, all the elements that were migrated to the reap + * list need to be deleted. For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + * + * We get called holding the mru->lock, which we drop and then reacquire. + * Sparse need special help with this to tell it we know what we are doing. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) + +{ + xfs_mru_cache_elem_t *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + spin_unlock(&mru->lock); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + + /* Remove the element from the reap list. */ + list_del_init(&elem->list_node); + + /* Call the client's free function with the key and value pointer. */ + mru->free_func(elem->key, elem->value); + + /* Free the element structure. */ + kmem_zone_free(xfs_mru_elem_zone, elem); + } + + spin_lock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. + */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + unsigned long now, next; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + next = _xfs_mru_cache_migrate(mru, jiffies); + _xfs_mru_cache_clear_reap_list(mru); + + mru->queued = next; + if ((mru->queued > 0)) { + now = jiffies; + if (next <= now) + next = 0; + else + next -= now; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, next); + } + + spin_unlock(&mru->lock); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), + "xfs_mru_cache_elem"); + if (!xfs_mru_elem_zone) + goto out; + + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); + if (!xfs_mru_reap_wq) + goto out_destroy_mru_elem_zone; + + return 0; + + out_destroy_mru_elem_zone: + kmem_zone_destroy(xfs_mru_elem_zone); + out: + return -ENOMEM; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); + kmem_zone_destroy(xfs_mru_elem_zone); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. 
This function + * returns 0 if the initialisation was successful. + */ +int +xfs_mru_cache_create( + xfs_mru_cache_t **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + xfs_mru_cache_t *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early. */ + mru->grp_count = grp_count + 1; + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spin_lock_init(&mru->lock); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists); + if (err && mru) + kmem_free(mru); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing and the reaper is not running. + */ +static void +xfs_mru_cache_flush( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + if (mru->queued) { + spin_unlock(&mru->lock); + cancel_delayed_work_sync(&mru->work); + spin_lock(&mru->lock); + } + + _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time); + _xfs_mru_cache_clear_reap_list(mru); + + spin_unlock(&mru->lock); +} + +void +xfs_mru_cache_destroy( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + xfs_mru_cache_flush(mru); + + kmem_free(mru->lists); + kmem_free(mru); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated. + */ +int +xfs_mru_cache_insert( + xfs_mru_cache_t *mru, + unsigned long key, + void *value) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return EINVAL; + + elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); + if (!elem) + return ENOMEM; + + if (radix_tree_preload(GFP_KERNEL)) { + kmem_zone_free(xfs_mru_elem_zone, elem); + return ENOMEM; + } + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + elem->value = value; + + spin_lock(&mru->lock); + + radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + _xfs_mru_cache_list_insert(mru, elem); + + spin_unlock(&mru->lock); + + return 0; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the client data pointer for the removed element is returned, otherwise this + * function will return a NULL pointer. 
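+ *
+ * For context, a minimal end-to-end use of this cache, including the
+ * remove/delete pair, looks roughly like the sketch below (example_free
+ * and the local variables are illustrative names, not taken from this code):
+ *
+ *	STATIC void example_free(unsigned long key, void *value)
+ *	{
+ *		kmem_free(value);
+ *	}
+ *
+ *	error = xfs_mru_cache_create(&mru, 10000, 5, example_free);
+ *	error = xfs_mru_cache_insert(mru, key, value);
+ *	value = xfs_mru_cache_lookup(mru, key);
+ *	if (value)
+ *		xfs_mru_cache_done(mru);	(lookup returns with the lock held)
+ *	xfs_mru_cache_delete(mru, key);		(or _remove() to free the value yourself)
+ *	xfs_mru_cache_destroy(mru);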
+ */ +void * +xfs_mru_cache_remove( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + void *value = NULL; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) { + value = elem->value; + list_del(&elem->list_node); + } + + spin_unlock(&mru->lock); + + if (elem) + kmem_zone_free(xfs_mru_elem_zone, elem); + + return value; +} + +/* + * To remove and element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + xfs_mru_cache_t *mru, + unsigned long key) +{ + void *value = xfs_mru_cache_remove(mru, key); + + if (value) + mru->free_func(key, value); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim. + * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + * + * Because sparse isn't smart enough to know about conditional lock return + * status, we need to help it get it right by annotating the path that does + * not release the lock. + */ +void * +xfs_mru_cache_lookup( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + __release(mru_lock); /* help sparse not be stupid */ + } else + spin_unlock(&mru->lock); + + return elem ? elem->value : NULL; +} + +/* + * To release the internal data structure spinlock after having performed an + * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() + * with the data store pointer. + */ +void +xfs_mru_cache_done( + xfs_mru_cache_t *mru) __releases(mru->lock) +{ + spin_unlock(&mru->lock); +} diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h new file mode 100644 index 00000000..36dd3ec8 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + + +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); + +typedef struct xfs_mru_cache +{ + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +} xfs_mru_cache_t; + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + void *value); +void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h new file mode 100644 index 00000000..a595f295 --- /dev/null +++ b/fs/xfs/xfs_quota.h @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_H__ +#define __XFS_QUOTA_H__ + +struct xfs_trans; + +/* + * The ondisk form of a dquot structure. + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __uint32_t xfs_dqid_t; + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. 
This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. + */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[32]; /* filling for posterity */ +} xfs_dqblk_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ +#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_WANT, "WANT" }, \ + { XFS_DQ_INACTIVE, "INACTIVE" } + +/* + * In the worst case, when both user and group quotas are on, + * we can have a max of three dquots changing in a single transaction. + */ +#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) + + +/* + * These are the structures used to lay out dquots and quotaoff + * records on the log. Quite similar to those of inodes. + */ + +/* + * log format struct for dquots. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. 
+ */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Quota Accounting/Enforcement flags + */ +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will be not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie. their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. 
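+ * These bits sit above the XFS_QMOPT_* option flags defined earlier, so the two sets can be carried together in one flags word without colliding.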
+ */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +#ifdef __KERNEL__ +/* + * This check is done typically without holding the inode lock; + * that may seem racy, but it is harmless in the context that it is used. + * The inode cannot go inactive as long a reference is kept, and + * therefore if dquot(s) were attached, they'll stay consistent. + * If, for example, the ownership of the inode changes while + * we didn't have the inode locked, the appropriate dquot(s) will be + * attached atomically. + */ +#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ + (ip)->i_udquot == NULL) || \ + (XFS_IS_OQUOTA_ON(mp) && \ + (ip)->i_gdquot == NULL)) + +#define XFS_QM_NEED_QUOTACHECK(mp) \ + ((XFS_IS_UQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (XFS_IS_PQUOTA_ON(mp) && \ + ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) + +#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ + XFS_GQUOTA_ACCT) + + +/* + * The structure kept inside the xfs_trans_t keep track of dquot changes + * within a transaction and apply them later. 
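+ * One xfs_dqtrx is kept per dquot touched by the transaction; the accumulated deltas are applied at commit time by xfs_trans_apply_dquot_deltas().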
+ */ +typedef struct xfs_dqtrx { + struct xfs_dquot *qt_dquot; /* the dquot this refers to */ + ulong qt_blk_res; /* blks reserved on a dquot */ + ulong qt_blk_res_used; /* blks used from the reservation */ + ulong qt_ino_res; /* inode reserved on a dquot */ + ulong qt_ino_res_used; /* inodes used from the reservation */ + long qt_bcount_delta; /* dquot blk count changes */ + long qt_delbcnt_delta; /* delayed dquot blk count changes */ + long qt_icount_delta; /* dquot inode count changes */ + ulong qt_rtblk_res; /* # blks reserved on a dquot */ + ulong qt_rtblk_res_used;/* # blks used from reservation */ + long qt_rtbcount_delta;/* dquot realtime blk changes */ + long qt_delrtb_delta; /* delayed RT blk count changes */ +} xfs_dqtrx_t; + +#ifdef CONFIG_XFS_QUOTA +extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); +extern void xfs_trans_free_dqinfo(struct xfs_trans *); +extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, + uint, long); +extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); +extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, + struct xfs_inode *, long, long, uint); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, long, long, uint); + +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, + struct xfs_dquot **, struct xfs_dquot **); +extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *); +extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); +extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, + struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); +extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *, uint); +extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern void xfs_qm_dqdetach(struct xfs_inode *); +extern void xfs_qm_dqrele(struct xfs_dquot *); +extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); +extern int xfs_qm_sync(struct xfs_mount *, int); +extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +extern void xfs_qm_mount_quotas(struct xfs_mount *); +extern void xfs_qm_unmount(struct xfs_mount *); +extern void xfs_qm_unmount_quotas(struct xfs_mount *); + +#else +static inline int +xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, + uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) +{ + *udqp = NULL; + *gdqp = NULL; + return 0; +} +#define xfs_trans_dup_dqinfo(tp, tp2) +#define xfs_trans_free_dqinfo(tp) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_apply_dquot_deltas(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} +#define xfs_qm_vop_create_dqattach(tp, ip, u, g) +#define xfs_qm_vop_rename_dqattach(it) (0) +#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) +#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) +#define xfs_qm_dqattach(ip, fl) (0) +#define 
xfs_qm_dqattach_locked(ip, fl) (0) +#define xfs_qm_dqdetach(ip) +#define xfs_qm_dqrele(d) +#define xfs_qm_statvfs(ip, s) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} +#define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_mount_quotas(mp) +#define xfs_qm_unmount(mp) +#define xfs_qm_unmount_quotas(mp) +#endif /* CONFIG_XFS_QUOTA */ + +#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ + xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) +#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ + xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ + f | XFS_QMOPT_RES_REGBLKS) + +extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, + xfs_dqid_t, uint, uint, char *); +extern int xfs_mount_reset_sbqflags(struct xfs_mount *); + +#endif /* __KERNEL__ */ +#endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c new file mode 100644 index 00000000..77a59891 --- /dev/null +++ b/fs/xfs/xfs_rename.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + + +/* + * Enter all inodes for a rename transaction into a sorted array. + */ +STATIC void +xfs_sort_for_rename( + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + xfs_inode_t *ip2, /* in: inode of new entry, if it + already exists, NULL otherwise. */ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ +{ + xfs_inode_t *temp; + int i, j; + + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i_tab[0] = dp1; + i_tab[1] = dp2; + i_tab[2] = ip1; + if (ip2) { + *num_inodes = 4; + i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; + } + + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 4 elements to sort, so this is adequate.) 
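+ * Sorting by inode number gives xfs_lock_inodes() a single, consistent lock ordering, which avoids ABBA deadlocks between concurrent renames.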
+ */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } + } +} + +/* + * xfs_rename + */ +int +xfs_rename( + xfs_inode_t *src_dp, + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name, + xfs_inode_t *target_ip) +{ + xfs_trans_t *tp = NULL; + xfs_mount_t *mp = src_dp->i_mount; + int new_parent; /* moving to a new dir */ + int src_is_directory; /* src_name is a directory */ + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + xfs_inode_t *inodes[4]; + int spaceres; + int num_inodes; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + new_parent = (src_dp != target_dp); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + + if (src_is_directory) { + /* + * Check for link count overflow on target_dp + */ + if (target_ip == NULL && new_parent && + target_dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto std_return; + } + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); + + xfs_bmap_init(&free_list, &first_block); + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); + if (error == ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); + } + if (error) { + xfs_trans_cancel(tp, 0); + goto std_return; + } + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) { + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. + */ + xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) + goto error_return; + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. 
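+ * (The actual link count bump happens further down via xfs_bumplink(), once the new entry has been created.)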
+ */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == ENOSPC) + goto error_return; + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto abort_return; + } + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = XFS_ERROR(EEXIST); + goto error_return; + } + } + + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != EEXIST); + if (error) + goto abort_return; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + goto abort_return; + } + + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * rename transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + goto std_return; + } + + /* + * trans_commit will unlock src_ip, target_ip & decrement + * the vnode references. + */ + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c new file mode 100644 index 00000000..8f76fdff --- /dev/null +++ b/fs/xfs/xfs_rtalloc.c @@ -0,0 +1,2325 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_fsops.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_inode_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_trace.h" +#include "xfs_buf.h" + + +/* + * Prototypes for internal functions. + */ + + +STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *); +STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int, + xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *); +STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, int, xfs_rtblock_t *, int *); +STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_rtblock_t, xfs_rtblock_t *); +STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_rtblock_t, xfs_rtblock_t *); +STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int, + xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *); +STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, int); +STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, + xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *); + +/* + * Internal functions. + */ + +/* + * Allocate space to the bitmap or summary file, and zero it, for growfs. 
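+ * Space is added one bmap extent at a time; each new extent is then zeroed one block per transaction in the loop below.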
+ */ +STATIC int /* error */ +xfs_growfs_rt_alloc( + xfs_mount_t *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ +{ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + + /* + * Allocate space to the file, as necessary. + */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + /* + * Reserve space & log for one extent added to the file. + */ + if ((error = xfs_trans_reserve(tp, resblks, + XFS_GROWRTALLOC_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_DEFAULT_PERM_LOG_COUNT))) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + /* + * Lock the inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&flist, &firstblock); + /* + * Allocate blocks to the bitmap file. + */ + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = XFS_ERROR(ENOSPC); + if (error) + goto error_cancel; + /* + * Free any blocks freed up in the transaction, then commit. + */ + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; + /* + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. + */ + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); + /* + * Reserve log for one block zeroing. + */ + if ((error = xfs_trans_reserve(tp, 0, + XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) + goto error_cancel; + /* + * Lock the bitmap inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + /* + * Get a buffer for the block. + */ + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = XFS_ERROR(EIO); +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; + } + memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + /* + * Commit the transaction. + */ + error = xfs_trans_commit(tp, 0); + if (error) + goto error; + } + /* + * Go on to the next extent, if any. + */ + oblocks = map.br_startoff + map.br_blockcount; + } + return 0; +error: + return error; +} + +/* + * Attempt to allocate an extent minlen<=len<=maxlen starting from + * bitmap block bbno. If we don't get maxlen then use prod to trim + * the length, if given. Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. 
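+ * If nothing suitable is found, no error is returned; *rtblock is set to NULLRTBLOCK and *nextp to the next block worth trying.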
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_block( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_rtblock_t *nextp, /* out: next block to try */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_rtblock_t besti; /* best rtblock found so far */ + xfs_rtblock_t bestlen; /* best length found so far */ + xfs_rtblock_t end; /* last rtblock in chunk */ + int error; /* error value */ + xfs_rtblock_t i; /* current rtblock trying */ + xfs_rtblock_t next; /* next rtblock to try */ + int stat; /* status from internal calls */ + + /* + * Loop over all the extents starting in this bitmap block, + * looking for one that's long enough. + */ + for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, + end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + i <= end; + i++) { + /* + * See if there's a free extent of maxlen starting at i. + * If it's not so then next will contain the first non-free. + */ + error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); + if (error) { + return error; + } + if (stat) { + /* + * i for maxlen is all free, allocate and return that. + */ + error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, + rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = i; + return 0; + } + /* + * In the case where we have a variable-sized allocation + * request, figure out how big this free piece is, + * and if it's big enough for the minimum, and the best + * so far, remember it. + */ + if (minlen < maxlen) { + xfs_rtblock_t thislen; /* this extent size */ + + thislen = next - i; + if (thislen >= minlen && thislen > bestlen) { + besti = i; + bestlen = thislen; + } + } + /* + * If not done yet, find the start of the next free space. + */ + if (next < end) { + error = xfs_rtfind_forw(mp, tp, next, end, &i); + if (error) { + return error; + } + } else + break; + } + /* + * Searched the whole thing & didn't find a maxlen free extent. + */ + if (minlen < maxlen && besti != -1) { + xfs_extlen_t p; /* amount to trim length by */ + + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1 && (p = do_mod(bestlen, prod))) + bestlen -= p; + /* + * Allocate besti for bestlen & return that. + */ + error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); + if (error) { + return error; + } + *len = bestlen; + *rtblock = besti; + return 0; + } + /* + * Allocation failed. Set *nextp to the next block to try. + */ + *nextp = next; + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting at block + * bno. If we don't get maxlen then use prod to trim the length, if given. + * Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. 
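+ * Only an extent starting exactly at bno is considered; on failure *rtblock is set to NULLRTBLOCK and no error is returned.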
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_exact( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + xfs_extlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtblock_t next; /* next block to try (dummy) */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * Check if the range in question (for maxlen) is free. + */ + error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); + if (error) { + return error; + } + if (isfree) { + /* + * If it is, allocate it and return success. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; + } + /* + * If not, allocate what there is, if it's at least minlen. + */ + maxlen = next - bno; + if (maxlen < minlen) { + /* + * Failed, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + /* + * Trim off tail of extent, if prod is specified. + */ + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) { + /* + * Now we can't do it, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + } + /* + * Allocate what we can and return it. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting as near + * to bno as possible. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_near( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int any; /* any useful extents from summary */ + xfs_rtblock_t bbno; /* bitmap block number */ + int error; /* error value */ + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtblock_t n; /* next block to try */ + xfs_rtblock_t r; /* result block */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * If the block number given is off the end, silently set it to + * the last block. + */ + if (bno >= mp->m_sb.sb_rextents) + bno = mp->m_sb.sb_rextents - 1; + /* + * Try the exact allocation first. + */ + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, + rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If the exact allocation worked, return that. 
+ */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + bbno = XFS_BITTOBLOCK(mp, bno); + i = 0; + ASSERT(minlen != 0); + log2len = xfs_highbit32(minlen); + /* + * Loop over all bitmap blocks (bbno + i is current block). + */ + for (;;) { + /* + * Get summary information of extents of all useful levels + * starting in this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, + bbno + i, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there are any useful extents starting here, try + * allocating one. + */ + if (any) { + /* + * On the positive side of the starting location. + */ + if (i >= 0) { + /* + * Try to allocate an extent starting in + * this block. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return it. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * On the negative side of the starting location. + */ + else { /* i < 0 */ + /* + * Loop backwards through the bitmap blocks from + * the starting point-1 up to where we are now. + * There should be an extent which ends in this + * bitmap block and is long enough. + */ + for (j = -1; j > i; j--) { + /* + * Grab the summary information for + * this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, + log2len, mp->m_rsumlevels - 1, + bbno + j, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there's no extent given in the + * summary that means the extent we + * found must carry over from an + * earlier block. If there is an + * extent given, we've already tried + * that allocation, don't do it again. + */ + if (any) + continue; + error = xfs_rtallocate_extent_block(mp, + tp, bbno + j, minlen, maxlen, + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * There weren't intervening bitmap blocks + * with a long enough extent, or the + * allocation didn't work for some reason + * (i.e. it's a little * too short). + * Try to allocate from the summary block + * that we found. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + } + /* + * Loop control. If we were on the positive side, and there's + * still more blocks on the negative side, go there. + */ + if (i > 0 && (int)bbno - i >= 0) + i = -i; + /* + * If positive, and no more negative, but there are more + * positive, go there. + */ + else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1) + i++; + /* + * If negative or 0 (just started), and there are positive + * blocks to go, go there. The 0 case moves to block 1. + */ + else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1) + i = 1 - i; + /* + * If negative or 0 and there are more negative blocks, + * go there. + */ + else if (i <= 0 && (int)bbno + i > 0) + i--; + /* + * Must be done. Return failure. + */ + else + break; + } + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, with no position + * specified. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. 
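+ * Two passes are made over the summary: the first accepts only extents of at least maxlen, the second (if needed) accepts anything down to minlen.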
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_size( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + int i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtblock_t n; /* next block to be tried */ + xfs_rtblock_t r; /* result block number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + + /* + * Loop over all the levels starting with maxlen. + * At each level, look at all the bitmap blocks, to see if there + * are extents starting there that are long enough (>= maxlen). + * Note, only on the initial level can the allocation fail if + * the summary says there's an extent. + */ + for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { + /* + * Loop over all the bitmap blocks. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, + maxlen, len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Didn't find any maxlen blocks. Try smaller ones, unless + * we're asking for a fixed size extent. + */ + if (minlen > --maxlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + + /* + * Loop over sizes, from maxlen down to minlen. + * This time, when we do the allocations, allow smaller ones + * to succeed. + */ + for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { + /* + * Loop over all the bitmap blocks, try an allocation + * starting in that block. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary information for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * If nothing there, go on to next. + */ + if (!sum) + continue; + /* + * Try the allocation. Make sure the specified + * minlen/maxlen are in the possible range for + * this summary level. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, + XFS_RTMAX(minlen, 1 << l), + XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Got nothing, return failure. 
+ */ + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtallocate_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block allocated > end */ + xfs_rtblock_t preblock; /* first block allocated < start */ + + end = start + len - 1; + /* + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); + return error; +} + +/* + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. + */ +STATIC int /* error */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ +{ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* + * Loop over logs of extent sizes. Order is irrelevant. + */ + for (log = low; log <= high; log++) { + /* + * Get one summary datum. + */ + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } + /* + * If there are any, return success. + */ + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. 
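+ * Callers either release it again with xfs_trans_brelse() or log it as part of their transaction.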
+ */ +STATIC int /* error */ +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_daddr_t d; /* disk addr of block */ + int error; /* error value */ + xfs_fsblock_t fsb; /* fs block number for block */ + xfs_inode_t *ip; /* bitmap or summary inode */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + /* + * Map from the file offset (block) and inode number to the + * file system block. + */ + error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); + if (error) { + return error; + } + ASSERT(fsb != NULLFSBLOCK); + /* + * Convert to disk address for buffer cache. + */ + d = XFS_FSB_TO_DADDR(mp, fsb); + /* + * Read the buffer. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, 0, &bp); + if (error) { + return error; + } + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + *bpp = bp; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* out: 1 for allocated, 0 for not */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + + return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat); +} +#endif + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +STATIC int /* error */ +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. 
+ */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +/* + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. + */ +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); + if (error) + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); + if (error) + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); + } + } + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. 
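+ * The allocate and free paths use this to find the start of the surrounding extent so the summary counts can be updated for the right extent sizes.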
+ */ +STATIC int /* error */ +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. 
+ */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +STATIC int /* error */ +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. 
+ */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. 
+ */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +STATIC int /* error */ +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information & copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sum = *sp; + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +STATIC int /* error */ +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number.
+ */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Read and modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. 
+ */ +STATIC int /* error */ +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information, modify and log it. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sp += delta; + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), + (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); + return 0; +} + +/* + * Visible (exported) functions. + */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_rt_t *in) /* growfs rt input struct */ +{ + xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_buf_t *bp; /* temporary buffer */ + int error; /* error return value */ + xfs_mount_t *nmp; /* new (fake) mount structure */ + xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ + xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ + xfs_drtbno_t nrextents; /* new number of realtime extents */ + uint8_t nrextslog; /* new log2 of sb_rextents */ + xfs_extlen_t nrsumblocks; /* new number of summary blocks */ + uint nrsumlevels; /* new rt summary levels */ + uint nrsumsize; /* new size of rt summary, bytes */ + xfs_sb_t *nsbp; /* new superblock */ + xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ + xfs_extlen_t rsumblocks; /* current number of rt summary blks */ + xfs_sb_t *sbp; /* old superblock */ + xfs_fsblock_t sumbno; /* summary block number */ + + sbp = &mp->m_sb; + /* + * Initial error checking. + */ + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || + (nrblocks = in->newblocks) <= sbp->sb_rblocks || + (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) + return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; + /* + * Read in the last block of the device, make sure it exists. + */ + bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) + return EIO; + xfs_buf_relse(bp); + + /* + * Calculate new parameters. These are the final values to be reached. 
+ */ + nrextents = nrblocks; + do_div(nrextents, in->extsize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); + nrextslog = xfs_highbit32(nrextents); + nrsumlevels = nrextslog + 1; + nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * New summary size can't be more than half the size of + * the log. This prevents us from getting a log overflow, + * since we'll log basically the whole summary file at once. + */ + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) + return XFS_ERROR(EINVAL); + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. + */ + rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size); + rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size); + /* + * Allocate space to the bitmap and summary files, as necessary. + */ + error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); + if (error) + return error; + error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); + if (error) + return error; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + /* + * Loop over the bitmap blocks. + * We will do everything one bitmap block at a time. + * Skip the current block if it is exactly full. + * This also deals with the case where there were no rtextents before. + */ + for (bmbno = sbp->sb_rbmblocks - + ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); + bmbno < nrbmblocks; + bmbno++) { + xfs_trans_t *tp; + int cancelflags = 0; + + *nmp = *mp; + nsbp = &nmp->m_sb; + /* + * Calculate new sb and mount fields for this round. + */ + nsbp->sb_rextsize = in->extsize; + nsbp->sb_rbmblocks = bmbno + 1; + nsbp->sb_rblocks = + XFS_RTMIN(nrblocks, + nsbp->sb_rbmblocks * NBBY * + nsbp->sb_blocksize * nsbp->sb_rextsize); + nsbp->sb_rextents = nsbp->sb_rblocks; + do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); + nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; + nrsumsize = + (uint)sizeof(xfs_suminfo_t) * nrsumlevels * + nsbp->sb_rbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * Start a transaction, get the log reservation. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); + if ((error = xfs_trans_reserve(tp, 0, + XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) + goto error_cancel; + /* + * Lock out other callers by grabbing the bitmap inode lock. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + /* + * Update the bitmap inode's size. + */ + mp->m_rbmip->i_d.di_size = + nsbp->sb_rbmblocks * nsbp->sb_blocksize; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + cancelflags |= XFS_TRANS_ABORT; + /* + * Get the summary inode into the transaction. + */ + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + /* + * Update the summary inode's size. + */ + mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); + /* + * Copy summary data from old to new sizes. + * Do this when the real size (not block-aligned) changes. 
+ */ + if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || + mp->m_rsumlevels != nmp->m_rsumlevels) { + error = xfs_rtcopy_summary(mp, nmp, tp); + if (error) + goto error_cancel; + } + /* + * Update superblock fields. + */ + if (nsbp->sb_rextsize != sbp->sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nsbp->sb_rextsize - sbp->sb_rextsize); + if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nsbp->sb_rbmblocks - sbp->sb_rbmblocks); + if (nsbp->sb_rblocks != sbp->sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nsbp->sb_rblocks - sbp->sb_rblocks); + if (nsbp->sb_rextents != sbp->sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + if (nsbp->sb_rextslog != sbp->sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nsbp->sb_rextslog - sbp->sb_rextslog); + /* + * Free new extent. + */ + bp = NULL; + error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + if (error) { +error_cancel: + xfs_trans_cancel(tp, cancelflags); + break; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + /* + * Update mp values into the real mp structure. + */ + mp->m_rsumlevels = nrsumlevels; + mp->m_rsumsize = nrsumsize; + + error = xfs_trans_commit(tp, 0); + if (error) + break; + } + + /* + * Free the fake mp structure. + */ + kmem_free(nmp); + + return error; +} + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_mount_t *mp = tp->t_mountp; + int error; /* error value */ + xfs_rtblock_t r; /* result allocated block */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + ASSERT(minlen > 0 && minlen <= maxlen); + + /* + * If prod is set then figure out what to do to minlen and maxlen. + */ + if (prod > 1) { + xfs_extlen_t i; + + if ((i = maxlen % prod)) + maxlen -= i; + if ((i = minlen % prod)) + minlen += prod - i; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + } + + sumbp = NULL; + /* + * Allocate by size, or near another block, or exactly at some block. + */ + switch (type) { + case XFS_ALLOCTYPE_ANY_AG: + error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, + &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + default: + error = EIO; + ASSERT(0); + } + if (error) + return error; + + /* + * If it worked, update the superblock. 
+ */ + if (r != NULLRTBLOCK) { + long slen = (long)*len; + + ASSERT(*len >= minlen && *len <= maxlen); + if (wasdel) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); + else + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } + *rtblock = r; + return 0; +} + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + mp = tp->t_mountp; + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + +#if defined(__KERNEL__) && defined(DEBUG) + /* + * Check to see that this whole range is currently allocated. + */ + { + int stat; /* result from checking range */ + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat); + if (error) { + return error; + } + ASSERT(stat); + } +#endif + sumbp = NULL; + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + xfs_buf_t *bp; /* buffer for last block of subvolume */ + xfs_daddr_t d; /* address of last block of subvolume */ + xfs_sb_t *sbp; /* filesystem superblock copy in mount */ + + sbp = &mp->m_sb; + if (sbp->sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); + return XFS_ERROR(ENODEV); + } + mp->m_rsumlevels = sbp->sb_rextslog + 1; + mp->m_rsumsize = + (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * + sbp->sb_rbmblocks; + mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + mp->m_rbmip = mp->m_rsumip = NULL; + /* + * Check that the realtime section is an ok size. + */ + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { + xfs_warn(mp, "realtime mount -- %llu != %llu", + (unsigned long long) XFS_BB_TO_FSB(mp, d), + (unsigned long long) mp->m_sb.sb_rblocks); + return XFS_ERROR(EFBIG); + } + bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) { + xfs_warn(mp, "realtime device size check failed"); + return EIO; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. 
+ */ +int /* error */ +xfs_rtmount_inodes( + xfs_mount_t *mp) /* file system mount structure */ +{ + int error; /* error return value */ + xfs_sb_t *sbp; + + sbp = &mp->m_sb; + if (sbp->sb_rbmino == NULLFSINO) + return 0; + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); + if (error) + return error; + ASSERT(mp->m_rbmip != NULL); + ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); + if (error) { + IRELE(mp->m_rbmip); + return error; + } + ASSERT(mp->m_rsumip != NULL); + return 0; +} + +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ +{ + xfs_rtblock_t b; /* result block */ + int log2; /* log of sequence number */ + __uint64_t resid; /* residual after log removed */ + __uint64_t seq; /* sequence number of file creation */ + __uint64_t *seqp; /* pointer to seqno in inode */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *seqp = 0; + } + seq = *seqp; + if ((log2 = xfs_highbit64(seq)) == -1) + b = 0; + else { + resid = seq - (1ULL << log2); + b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> + (log2 + 1); + if (b >= mp->m_sb.sb_rextents) + b = do_mod(b, mp->m_sb.sb_rextents); + if (b + len > mp->m_sb.sb_rextents) + b = mp->m_sb.sb_rextents - len; + } + *seqp = seq + 1; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + *pick = b; + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h new file mode 100644 index 00000000..09e1f4f3 --- /dev/null +++ b/fs/xfs/xfs_rtalloc.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RTALLOC_H__ +#define __XFS_RTALLOC_H__ + +struct xfs_mount; +struct xfs_trans; + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +/* + * Constants for bit manipulations. 
+ */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#if XFS_BIG_BLKNOS +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) +#else +#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) +#endif + + +#ifdef __KERNEL__ + +#ifdef CONFIG_XFS_RT +/* + * Function prototypes for exported functions. + */ + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock); /* out: start block allocated */ + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len); /* length of extent freed */ + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick); /* result rt extent */ + +/* + * Grow the realtime area of the filesystem. 
+ */ +int +xfs_growfs_rt( + struct xfs_mount *mp, /* file system mount structure */ + xfs_growfs_rt_t *in); /* user supplied growfs struct */ + +#else +# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtfree_extent(t,b,l) (ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) +# define xfs_growfs_rt(mp,in) (ENOSYS) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + xfs_warn(mp, "Not built with CONFIG_XFS_RT"); + return ENOSYS; +} +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) +#endif /* CONFIG_XFS_RT */ + +#endif /* __KERNEL__ */ + +#endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c new file mode 100644 index 00000000..d6d6fdfe --- /dev/null +++ b/fs/xfs/xfs_rw.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_rw.h" + +/* + * Force a shutdown of the filesystem instantly while keeping + * the filesystem consistent. We don't do an unmount here; just shutdown + * the shop, make sure that absolutely nothing persistent happens to + * this filesystem after this point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. 
Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} + +/* + * Prints out an ALERT message about I/O error. + */ +void +xfs_ioerror_alert( + char *func, + struct xfs_mount *mp, + xfs_buf_t *bp, + xfs_daddr_t blkno) +{ + xfs_alert(mp, + "I/O error occurred: meta-data dev %s block 0x%llx" + " (\"%s\") error %d buf count %zd", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)blkno, func, + XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); +} + +/* + * This isn't an absolute requirement, but it is + * just a good idea to call xfs_read_buf instead of + * directly doing a read_buf call. For one, we shouldn't + * be doing this disk read if we are in SHUTDOWN state anyway, + * so this stops that from happening. Secondly, this does all + * the error checking stuff and the brelse if appropriate for + * the caller, so the code can be a little leaner. + */ + +int +xfs_read_buf( + struct xfs_mount *mp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + xfs_buf_t **bpp) +{ + xfs_buf_t *bp; + int error; + + if (!flags) + flags = XBF_LOCK | XBF_MAPPED; + + bp = xfs_buf_read(target, blkno, len, flags); + if (!bp) + return XFS_ERROR(EIO); + error = XFS_BUF_GETERROR(bp); + if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { + *bpp = bp; + } else { + *bpp = NULL; + if (error) { + xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + } else { + error = XFS_ERROR(EIO); + } + if (bp) { + XFS_BUF_UNDONE(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_STALE(bp); + /* + * brelse clears B_ERROR and b_error + */ + xfs_buf_relse(bp); + } + } + return (error); +} + +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h new file mode 100644 index 00000000..11c41ec6 --- /dev/null +++ b/fs/xfs/xfs_rw.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RW_H__ +#define __XFS_RW_H__ + +struct xfs_buf; +struct xfs_inode; +struct xfs_mount; + +/* + * Convert the given file system block to a disk block. + * We have to treat it differently based on whether the + * file is a real time file or not, because the bmap code + * does. + */ +static inline xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? 
\ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} + +/* + * Prototypes for functions in xfs_rw.c. + */ +extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, + xfs_daddr_t blkno, int len, uint flags, + struct xfs_buf **bpp); +extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, + xfs_buf_t *bp, xfs_daddr_t blkno); +extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + +#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h new file mode 100644 index 00000000..1eb2ba58 --- /dev/null +++ b/fs/xfs/xfs_sb.h @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SB_H__ +#define __XFS_SB_H__ + +/* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ + +struct xfs_buf; +struct xfs_mount; + +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_SASHFBITS 0xf000 +#define XFS_SB_VERSION_REALFBITS 0x0ff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 +#define XFS_SB_VERSION_OKSASHFBITS \ + (XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_BORGBIT) +#define XFS_SB_VERSION_OKREALFBITS \ + (XFS_SB_VERSION_ATTRBIT | \ + XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_QUOTABIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_DALIGNBIT | \ + XFS_SB_VERSION_SHAREDBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_SECTORBIT | \ + XFS_SB_VERSION_MOREBITSBIT) +#define XFS_SB_VERSION_OKREALBITS \ + (XFS_SB_VERSION_NUMBITS | \ + XFS_SB_VERSION_OKREALFBITS | \ + XFS_SB_VERSION_OKSASHFBITS) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. 
+ */ +#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ +#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 +#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ + +#define XFS_SB_VERSION2_OKREALFBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT) +#define XFS_SB_VERSION2_OKSASHFBITS \ + (0) +#define XFS_SB_VERSION2_OKREALBITS \ + (XFS_SB_VERSION2_OKREALFBITS | \ + XFS_SB_VERSION2_OKSASHFBITS ) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_drfsbno_t sb_dblocks; /* number of data blocks */ + xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */ + xfs_drtbno_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_dfsbno_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. 
flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __uint32_t sb_bad_features2; + + /* must be padded to 64 bit alignment */ +} xfs_sb_t; + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. 
flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + +/* + * Sequence number values for the fields. + */ +typedef enum { + XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, + XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, + XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, + XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, + XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, + XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, + XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, + XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, + XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, + XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, + XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, + XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, + XFS_SBS_FIELDCOUNT +} xfs_sb_field_t; + +/* + * Mask values, defined based on the xfs_sb_field_t values. + * Only define the ones we're using. + */ +#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) +#define XFS_SB_UUID XFS_SB_MVAL(UUID) +#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) +#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) +#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) +#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) +#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) +#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) +#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) +#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) +#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) +#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) +#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) +#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) +#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) +#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) +#define XFS_SB_MOD_BITS \ + (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ + XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ + XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ + XFS_SB_BAD_FEATURES2) + + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. 
shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) + +static inline int xfs_sb_good_version(xfs_sb_t *sbp) +{ + /* We always support version 1-3 */ + if (sbp->sb_versionnum >= XFS_SB_VERSION_1 && + sbp->sb_versionnum <= XFS_SB_VERSION_3) + return 1; + + /* We support version 4 if all feature bits are supported */ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) { + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) + return 0; + +#ifdef __KERNEL__ + if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) + return 0; +#else + if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) && + sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) + return 0; +#endif + + return 1; + } + + return 0; +} + +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) +{ + return (sbp->sb_bad_features2 != sbp->sb_features2); +} + +static inline unsigned xfs_sb_version_tonew(unsigned v) +{ + if (v == XFS_SB_VERSION_1) + return XFS_SB_VERSION_4; + + if (v == XFS_SB_VERSION_2) + return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + + return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT | + XFS_SB_VERSION_NLINKBIT; +} + +static inline unsigned xfs_sb_version_toold(unsigned v) +{ + if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) + return 0; + if (v & XFS_SB_VERSION_NLINKBIT) + return XFS_SB_VERSION_3; + if (v & XFS_SB_VERSION_ATTRBIT) + return XFS_SB_VERSION_2; + return XFS_SB_VERSION_1; +} + +static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) +{ + return sbp->sb_versionnum == XFS_SB_VERSION_2 || + sbp->sb_versionnum == XFS_SB_VERSION_3 || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); +} + +static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) +{ + if (sbp->sb_versionnum == XFS_SB_VERSION_1) + sbp->sb_versionnum = XFS_SB_VERSION_2; + else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; + else + sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; +} + +static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) +{ + return sbp->sb_versionnum == XFS_SB_VERSION_3 || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); +} + +static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) +{ + if (sbp->sb_versionnum <= XFS_SB_VERSION_2) + sbp->sb_versionnum = XFS_SB_VERSION_3; + else + sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT; +} + +static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); +} + +static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; + else + sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | + XFS_SB_VERSION_QUOTABIT; +} + +static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); +} + +static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); +} + 
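/*
 * Aside (illustration only, not part of this patch): every predicate in
 * this header reduces to the same pattern -- require a v4 superblock,
 * test a bit in sb_versionnum, and consult sb_features2 only after
 * XFS_SB_VERSION_MOREBITSBIT vouches that the second feature word is
 * valid.  The sketch below is a self-contained, hypothetical stand-in:
 * the struct, function names, and main() are invented for illustration,
 * and only the bit values mirror the definitions in this header.
 */
#include <stdint.h>
#include <stdio.h>

#define SB_VERSION_NUMBITS	0x000f
#define SB_VERSION_4		4
#define SB_VERSION_MOREBITSBIT	0x8000
#define SB_VERSION2_ATTR2BIT	0x00000008

struct sb_bits {
	uint16_t	versionnum;	/* primary feature word */
	uint32_t	features2;	/* secondary feature word */
};

/* v4 superblock that has opted in to the second feature word? */
static int sb_bits_hasmorebits(const struct sb_bits *sb)
{
	return (sb->versionnum & SB_VERSION_NUMBITS) == SB_VERSION_4 &&
	       (sb->versionnum & SB_VERSION_MOREBITSBIT);
}

/* sb_features2 bits are meaningful only when MOREBITSBIT is set. */
static int sb_bits_hasattr2(const struct sb_bits *sb)
{
	return sb_bits_hasmorebits(sb) &&
	       (sb->features2 & SB_VERSION2_ATTR2BIT);
}

int main(void)
{
	struct sb_bits sb = {
		.versionnum = SB_VERSION_4 | SB_VERSION_MOREBITSBIT,
		.features2  = SB_VERSION2_ATTR2BIT,
	};

	/* prints "attr2: 1"; clearing MOREBITSBIT would make it 0 */
	printf("attr2: %d\n", sb_bits_hasattr2(&sb));
	return 0;
}
/*
 * Gating sb_features2 behind MOREBITSBIT keeps older superblocks, which
 * never wrote the second word, from being misread as advertising newer
 * features.
 */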
+static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); +} + +static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); +} + +static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); +} + +static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); +} + +static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); +} + +static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + +static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); +} + +/* + * sb_features2 bit version macros. + * + * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: + * + * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) + * ((xfs_sb_version_hasmorebits(sbp) && + * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) + */ + +static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); +} + +static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); +} + +static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; +} + +static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + +static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); +} + +/* + * end of superblock version macros + */ + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) + +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) +#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) +#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ + XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) +#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1)) + +/* + * File system block to byte conversions. 
+ */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +#endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c new file mode 100644 index 00000000..efc147f0 --- /dev/null +++ b/fs/xfs/xfs_trans.c @@ -0,0 +1,2026 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_trans_priv.h" +#include "xfs_trans_space.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_trans_zone; +kmem_zone_t *xfs_log_item_desc_zone; + + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. 
This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_write_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 2))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +} + +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + + 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_rename_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((4 * mp->m_sb.sb_inodesize + + 2 * XFS_DIROP_LOG_RES(mp) + + 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + 3 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 3) + + 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); +} + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block 
size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_link_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_remove_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +} + +/* + * For symlink we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: 1 block + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the blocks for the symlink: 1 kB + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 1024 + + 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the 
allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * Making a new directory is the same as creating a new file. + */ +STATIC uint +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp); +} + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_ifree_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), + XFS_INODE_CLUSTER_SIZE(mp)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ +STATIC uint +xfs_calc_ichange_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + 512; + +} + +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ +STATIC uint +xfs_calc_growdata_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_sectsize * 3 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +STATIC uint +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) +{ + return 2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + mp->m_sb.sb_inodesize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +STATIC uint +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_blocksize + 128; +} + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. 
+ * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +STATIC uint +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_inodesize + + mp->m_sb.sb_blocksize + + mp->m_rsumsize + + 128 * 5; +} + +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ +STATIC uint +xfs_calc_swrite_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_inodesize + 128; +} + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +STATIC uint +xfs_calc_writeid_reservation(xfs_mount_t *mp) +{ + return mp->m_sb.sb_inodesize + 128; +} + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +STATIC uint +xfs_calc_addafork_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize * 2 + + mp->m_dirblksize + + XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) +{ + return MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); +} + +/* + * Setting an attribute. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime. + */ +STATIC uint +xfs_calc_attrset_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + 128 * (2 + XFS_DA_NODE_MAXDEPTH); +} + +/* + * Removing an attribute. 
+ * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_DA_NODE_MAXDEPTH + + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_sectsize + 128; +} + +/* + * Initialize the precomputed transaction reservation values + * in the mount structure. + */ +void +xfs_trans_init( + struct xfs_mount *mp) +{ + struct xfs_trans_reservations *resp = &mp->m_reservations; + + resp->tr_write = xfs_calc_write_reservation(mp); + resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); + resp->tr_rename = xfs_calc_rename_reservation(mp); + resp->tr_link = xfs_calc_link_reservation(mp); + resp->tr_remove = xfs_calc_remove_reservation(mp); + resp->tr_symlink = xfs_calc_symlink_reservation(mp); + resp->tr_create = xfs_calc_create_reservation(mp); + resp->tr_mkdir = xfs_calc_mkdir_reservation(mp); + resp->tr_ifree = xfs_calc_ifree_reservation(mp); + resp->tr_ichange = xfs_calc_ichange_reservation(mp); + resp->tr_growdata = xfs_calc_growdata_reservation(mp); + resp->tr_swrite = xfs_calc_swrite_reservation(mp); + resp->tr_writeid = xfs_calc_writeid_reservation(mp); + resp->tr_addafork = xfs_calc_addafork_reservation(mp); + resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); + resp->tr_attrset = xfs_calc_attrset_reservation(mp); + resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); + resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); + resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); + resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); +} + +/* + * This routine is called to allocate a transaction structure. + * The type parameter indicates the type of the transaction. These + * are enumerated in xfs_trans.h. + * + * Dynamically allocate the transaction structure from the transaction + * zone, initialize it, and return it to the caller. + */ +xfs_trans_t * +xfs_trans_alloc( + xfs_mount_t *mp, + uint type) +{ + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + return _xfs_trans_alloc(mp, type, KM_SLEEP); +} + +xfs_trans_t * +_xfs_trans_alloc( + xfs_mount_t *mp, + uint type, + uint memflags) +{ + xfs_trans_t *tp; + + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, memflags); + tp->t_magic = XFS_TRANS_MAGIC; + tp->t_type = type; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + return tp; +} + +/* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. 
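 * Note that this only releases the transaction structure itself (its busy
 * extent list, dquot info and the m_active_trans count); unlocking and
 * unpinning the attached log items is the caller's job, via
 * xfs_trans_free_items() or one of the committed paths below.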
+ */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + xfs_alloc_busy_sort(&tp->t_busy); + xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + + atomic_dec(&tp->t_mountp->m_active_trans); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* + * This is called to create a new transaction which will share the + * permanent log reservation of the given transaction. The remaining + * unused block and rt extent reservations are also inherited. This + * implies that the original transaction is no longer allowed to allocate + * blocks. Locks and log items, however, are no inherited. They must + * be added to the new transaction explicitly. + */ +xfs_trans_t * +xfs_trans_dup( + xfs_trans_t *tp) +{ + xfs_trans_t *ntp; + + ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + + /* + * Initialize the new transaction structure. + */ + ntp->t_magic = XFS_TRANS_MAGIC; + ntp->t_type = tp->t_type; + ntp->t_mountp = tp->t_mountp; + INIT_LIST_HEAD(&ntp->t_items); + INIT_LIST_HEAD(&ntp->t_busy); + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(tp->t_ticket != NULL); + + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); + ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; + tp->t_blk_res = tp->t_blk_res_used; + ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; + tp->t_rtx_res = tp->t_rtx_res_used; + ntp->t_pflags = tp->t_pflags; + + xfs_trans_dup_dqinfo(tp, ntp); + + atomic_inc(&tp->t_mountp->m_active_trans); + return ntp; +} + +/* + * This is called to reserve free disk blocks and log space for the + * given transaction. This must be done before allocating any resources + * within the transaction. + * + * This will return ENOSPC if there are not enough blocks available. + * It will sleep waiting for available log space. + * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which + * is used by long running transactions. If any one of the reservations + * fails then they will all be backed out. + * + * This does not do quota reservations. That typically is done by the + * caller afterwards. + */ +int +xfs_trans_reserve( + xfs_trans_t *tp, + uint blocks, + uint logspace, + uint rtextents, + uint flags, + uint logcount) +{ + int log_flags; + int error = 0; + int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* Mark this thread as being in a transaction */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* + * Attempt to reserve the needed disk blocks by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (blocks > 0) { + error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, + -((int64_t)blocks), rsvd); + if (error != 0) { + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return (XFS_ERROR(ENOSPC)); + } + tp->t_blk_res += blocks; + } + + /* + * Reserve the log space needed for this transaction. 
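	 * A permanent reservation (XFS_TRANS_PERM_LOG_RES maps to
	 * XFS_LOG_PERM_RESERV below) can be re-used by the following
	 * transactions of a rolling sequence (see xfs_trans_dup() and
	 * xfs_trans_roll()); an ordinary reservation is returned to the
	 * log when this transaction commits.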
+ */ + if (logspace > 0) { + ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); + ASSERT((tp->t_log_count == 0) || + (tp->t_log_count == logcount)); + if (flags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_PERM_RESERV; + tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + } else { + ASSERT(tp->t_ticket == NULL); + ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + log_flags = 0; + } + + error = xfs_log_reserve(tp->t_mountp, logspace, logcount, + &tp->t_ticket, + XFS_TRANSACTION, log_flags, tp->t_type); + if (error) { + goto undo_blocks; + } + tp->t_log_res = logspace; + tp->t_log_count = logcount; + } + + /* + * Attempt to reserve the needed realtime extents by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (rtextents > 0) { + error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, + -((int64_t)rtextents), rsvd); + if (error) { + error = XFS_ERROR(ENOSPC); + goto undo_log; + } + tp->t_rtx_res += rtextents; + } + + return 0; + + /* + * Error cases jump to one of these labels to undo any + * reservations which have already been performed. + */ +undo_log: + if (logspace > 0) { + if (flags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + tp->t_ticket = NULL; + tp->t_log_res = 0; + tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; + } + +undo_blocks: + if (blocks > 0) { + xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, + (int64_t)blocks, rsvd); + tp->t_blk_res = 0; + } + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + return error; +} + +/* + * Record the indicated change to the given field for application + * to the file system's superblock when the transaction commits. + * For now, just store the change in the transaction structure. + * + * Mark the transaction structure to indicate that the superblock + * needs to be updated before committing. + * + * Because we may not be keeping track of allocated/free inodes and + * used filesystem blocks in the superblock, we do not mark the + * superblock dirty in this transaction if we modify these fields. + * We still need to update the transaction deltas so that they get + * applied to the incore superblock, but we don't want them to + * cause the superblock to get locked and logged if these are the + * only fields in the superblock that the transaction modifies. + */ +void +xfs_trans_mod_sb( + xfs_trans_t *tp, + uint field, + int64_t delta) +{ + uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY); + xfs_mount_t *mp = tp->t_mountp; + + switch (field) { + case XFS_TRANS_SB_ICOUNT: + tp->t_icount_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_IFREE: + tp->t_ifree_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_FDBLOCKS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_blk_res_used += (uint)-delta; + ASSERT(tp->t_blk_res_used <= tp->t_blk_res); + } + tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_RES_FDBLOCKS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. 
+ */ + ASSERT(delta < 0); + tp->t_res_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_FREXTENTS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_rtx_res_used += (uint)-delta; + ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res); + } + tp->t_frextents_delta += delta; + break; + case XFS_TRANS_SB_RES_FREXTENTS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. + */ + ASSERT(delta < 0); + tp->t_res_frextents_delta += delta; + break; + case XFS_TRANS_SB_DBLOCKS: + ASSERT(delta > 0); + tp->t_dblocks_delta += delta; + break; + case XFS_TRANS_SB_AGCOUNT: + ASSERT(delta > 0); + tp->t_agcount_delta += delta; + break; + case XFS_TRANS_SB_IMAXPCT: + tp->t_imaxpct_delta += delta; + break; + case XFS_TRANS_SB_REXTSIZE: + tp->t_rextsize_delta += delta; + break; + case XFS_TRANS_SB_RBMBLOCKS: + tp->t_rbmblocks_delta += delta; + break; + case XFS_TRANS_SB_RBLOCKS: + tp->t_rblocks_delta += delta; + break; + case XFS_TRANS_SB_REXTENTS: + tp->t_rextents_delta += delta; + break; + case XFS_TRANS_SB_REXTSLOG: + tp->t_rextslog_delta += delta; + break; + default: + ASSERT(0); + return; + } + + tp->t_flags |= flags; +} + +/* + * xfs_trans_apply_sb_deltas() is called from the commit code + * to bring the superblock buffer into the current transaction + * and modify it as requested by earlier calls to xfs_trans_mod_sb(). + * + * For now we just look at each field allowed to change and change + * it if necessary. + */ +STATIC void +xfs_trans_apply_sb_deltas( + xfs_trans_t *tp) +{ + xfs_dsb_t *sbp; + xfs_buf_t *bp; + int whole = 0; + + bp = xfs_trans_getsb(tp, tp->t_mountp, 0); + sbp = XFS_BUF_TO_SBP(bp); + + /* + * Check that superblock mods match the mods made to AGF counters. 
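	 * That is, every block this transaction added to or removed from
	 * the superblock free-block count (t_fdblocks_delta plus
	 * t_res_fdblocks_delta) must be accounted for by the per-AG
	 * freelist, free-space and btree deltas recorded while the AGF
	 * buffers were being modified.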
+ */ + ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == + (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + + tp->t_ag_btree_delta)); + + /* + * Only update the superblock counters if we are logging them + */ + if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (tp->t_icount_delta) + be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); + if (tp->t_ifree_delta) + be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta); + if (tp->t_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta); + if (tp->t_res_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); + } + + if (tp->t_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); + if (tp->t_res_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + + if (tp->t_dblocks_delta) { + be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + whole = 1; + } + if (tp->t_agcount_delta) { + be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta); + whole = 1; + } + if (tp->t_imaxpct_delta) { + sbp->sb_imax_pct += tp->t_imaxpct_delta; + whole = 1; + } + if (tp->t_rextsize_delta) { + be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); + whole = 1; + } + if (tp->t_rbmblocks_delta) { + be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta); + whole = 1; + } + if (tp->t_rblocks_delta) { + be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + whole = 1; + } + if (tp->t_rextents_delta) { + be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta); + whole = 1; + } + if (tp->t_rextslog_delta) { + sbp->sb_rextslog += tp->t_rextslog_delta; + whole = 1; + } + + if (whole) + /* + * Log the whole thing, the fields are noncontiguous. + */ + xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1); + else + /* + * Since all the modifiable fields are contiguous, we + * can get away with this. + */ + xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), + offsetof(xfs_dsb_t, sb_frextents) + + sizeof(sbp->sb_frextents) - 1); +} + +/* + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations + * and apply superblock counter changes to the in-core superblock. The + * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT + * applied to the in-core superblock. The idea is that that has already been + * done. + * + * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). + * However, we have to ensure that we only modify each superblock field only + * once because the application of the delta values may not be atomic. That can + * lead to ENOSPC races occurring if we have two separate modifcations of the + * free space counter to put back the entire reservation and then take away + * what we used. + * + * If we are not logging superblock counters, then the inode allocated/free and + * used block counts are not updated in the on disk superblock. In this case, + * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we + * still need to update the incore superblock with the changes. 
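 * As a worked example: a transaction that reserved 16 blocks
 * (t_blk_res == 16) and actually allocated 10 of them carries
 * t_fdblocks_delta == -10, so the code below applies a single change of
 * 16 + (-10) == +6 to the free-block counter, rather than giving back 16
 * and then taking 10 away, which is exactly the transient ENOSPC window
 * described above. (Numbers are illustrative only.)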
+ */ +void +xfs_trans_unreserve_and_mod_sb( + xfs_trans_t *tp) +{ + xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ + xfs_mod_sb_t *msbp; + xfs_mount_t *mp = tp->t_mountp; + /* REFERENCED */ + int error; + int rsvd; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + + msbp = msb; + rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* calculate deltas */ + if (tp->t_blk_res > 0) + blkdelta = tp->t_blk_res; + if ((tp->t_fdblocks_delta != 0) && + (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY))) + blkdelta += tp->t_fdblocks_delta; + + if (tp->t_rtx_res > 0) + rtxdelta = tp->t_rtx_res; + if ((tp->t_frextents_delta != 0) && + (tp->t_flags & XFS_TRANS_SB_DIRTY)) + rtxdelta += tp->t_frextents_delta; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + idelta = tp->t_icount_delta; + ifreedelta = tp->t_ifree_delta; + } + + /* apply the per-cpu counters */ + if (blkdelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + blkdelta, rsvd); + if (error) + goto out; + } + + if (idelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, + idelta, rsvd); + if (error) + goto out_undo_fdblocks; + } + + if (ifreedelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, + ifreedelta, rsvd); + if (error) + goto out_undo_icount; + } + + /* apply remaining deltas */ + if (rtxdelta != 0) { + msbp->msb_field = XFS_SBS_FREXTENTS; + msbp->msb_delta = rtxdelta; + msbp++; + } + + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (tp->t_dblocks_delta != 0) { + msbp->msb_field = XFS_SBS_DBLOCKS; + msbp->msb_delta = tp->t_dblocks_delta; + msbp++; + } + if (tp->t_agcount_delta != 0) { + msbp->msb_field = XFS_SBS_AGCOUNT; + msbp->msb_delta = tp->t_agcount_delta; + msbp++; + } + if (tp->t_imaxpct_delta != 0) { + msbp->msb_field = XFS_SBS_IMAX_PCT; + msbp->msb_delta = tp->t_imaxpct_delta; + msbp++; + } + if (tp->t_rextsize_delta != 0) { + msbp->msb_field = XFS_SBS_REXTSIZE; + msbp->msb_delta = tp->t_rextsize_delta; + msbp++; + } + if (tp->t_rbmblocks_delta != 0) { + msbp->msb_field = XFS_SBS_RBMBLOCKS; + msbp->msb_delta = tp->t_rbmblocks_delta; + msbp++; + } + if (tp->t_rblocks_delta != 0) { + msbp->msb_field = XFS_SBS_RBLOCKS; + msbp->msb_delta = tp->t_rblocks_delta; + msbp++; + } + if (tp->t_rextents_delta != 0) { + msbp->msb_field = XFS_SBS_REXTENTS; + msbp->msb_delta = tp->t_rextents_delta; + msbp++; + } + if (tp->t_rextslog_delta != 0) { + msbp->msb_field = XFS_SBS_REXTSLOG; + msbp->msb_delta = tp->t_rextslog_delta; + msbp++; + } + } + + /* + * If we need to change anything, do it. + */ + if (msbp > msb) { + error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, + (uint)(msbp - msb), rsvd); + if (error) + goto out_undo_ifreecount; + } + + return; + +out_undo_ifreecount: + if (ifreedelta) + xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); +out_undo_icount: + if (idelta) + xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); +out_undo_fdblocks: + if (blkdelta) + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); +out: + ASSERT(error == 0); + return; +} + +/* + * Add the given log item to the transaction's list of log items. + * + * The log item will now point to its new descriptor with its li_desc field. 
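 * The usual callers are the join helpers, e.g. xfs_trans_ijoin() and
 * xfs_trans_bjoin(), which attach an inode or buffer log item to the
 * transaction.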
+ */ +void +xfs_trans_add_item( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_log_item_desc *lidp; + + ASSERT(lip->li_mountp = tp->t_mountp); + ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + + lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); + + lidp->lid_item = lip; + lidp->lid_flags = 0; + lidp->lid_size = 0; + list_add_tail(&lidp->lid_trans, &tp->t_items); + + lip->li_desc = lidp; +} + +STATIC void +xfs_trans_free_item_desc( + struct xfs_log_item_desc *lidp) +{ + list_del_init(&lidp->lid_trans); + kmem_zone_free(xfs_log_item_desc_zone, lidp); +} + +/* + * Unlink and free the given descriptor. + */ +void +xfs_trans_del_item( + struct xfs_log_item *lip) +{ + xfs_trans_free_item_desc(lip->li_desc); + lip->li_desc = NULL; +} + +/* + * Unlock all of the items of a transaction and free all the descriptors + * of that transaction. + */ +void +xfs_trans_free_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn, + int flags) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + if (flags & XFS_TRANS_ABORT) + lip->li_flags |= XFS_LI_ABORTED; + IOP_UNLOCK(lip); + + xfs_trans_free_item_desc(lidp); + } +} + +/* + * Unlock the items associated with a transaction. + * + * Items which were not logged should be freed. Those which were logged must + * still be tracked so they can be unpinned when the transaction commits. + */ +STATIC void +xfs_trans_unlock_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + IOP_UNLOCK(lip); + + /* + * Free the descriptor if the item is not dirty + * within this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + xfs_trans_free_item_desc(lidp); + } +} + +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +static uint +xfs_trans_count_vecs( + struct xfs_trans *tp) +{ + int nvecs; + struct xfs_log_item_desc *lidp; + + nvecs = 1; + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return 0; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + } + + return nvecs; +} + +/* + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). + * + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. 
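 * As an illustration, if a transaction dirtied two items reporting sizes
 * of 2 and 3 from IOP_SIZE(), xfs_trans_count_vecs() above returns
 * 1 + 2 + 3 == 6 and the layout built here is:
 *	log_vector[0]		the transaction header
 *	log_vector[1..2]	the first item's regions
 *	log_vector[3..5]	the second item's regions
 * (The item sizes are made up; real values come from each item's
 * IOP_SIZE() implementation.)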
+ */ +static void +xfs_trans_fill_vecs( + struct xfs_trans *tp, + struct xfs_log_iovec *log_vector) +{ + struct xfs_log_item_desc *lidp; + struct xfs_log_iovec *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; + + nitems = 0; + ASSERT(!list_empty(&tp->t_items)); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* + * The item may be marked dirty but not log anything. This can + * be used to get called when a transaction is committed. + */ + if (lidp->lid_size) + nitems++; + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; + IOP_PIN(lidp->lid_item); + } + + /* + * Now that we've counted the number of items in this transaction, fill + * in the transaction header. Note that the transaction header does not + * have a log item. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; +} + +/* + * The committed item processing consists of calling the committed routine of + * each logged item, updating the item's position in the AIL if necessary, and + * unpinning each item. If the committed routine returns -1, then do nothing + * further with the item because it may have been freed. + * + * Since items are unlocked when they are copied to the incore log, it is + * possible for two transactions to be completing and manipulating the same + * item simultaneously. The AIL lock will protect the lsn field of each item. + * The value of this field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because otherwise + * they could be immediately flushed and we'd have to race with the flusher + * trying to pull the item from the AIL as we add it. + */ +static void +xfs_trans_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, + int aborted) +{ + xfs_lsn_t item_lsn; + struct xfs_ail *ailp; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + return; + + /* + * If the returned lsn is greater than what it contained before, update + * the location of the item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it is possible that a + * later transaction completing simultaneously with an earlier one + * using the same item could complete first with a higher lsn. This + * would cause the earlier transaction to fail the test below. + */ + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn and update the + * position of the item in the AIL. + * + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(ailp, lip, item_lsn); + } else { + spin_unlock(&ailp->xa_lock); + } + + /* + * Now that we've repositioned the item in the AIL, unpin it so it can + * be flushed. Pass information about buffer stale state down from the + * log item flags, if anyone else stales the buffer we do not want to + * pay any attention to it. 
+ */ + IOP_UNPIN(lip, 0); +} + +/* + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + */ +STATIC void +xfs_trans_committed( + void *arg, + int abortflag) +{ + struct xfs_trans *tp = arg; + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); + xfs_trans_free_item_desc(lidp); + } + + xfs_trans_free(tp); +} + +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) + IOP_UNPIN(log_items[i], 0); +} + +/* + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is IOP_COMMITTED processing, followed by an + * IOP_UNPIN(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. + */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; + int i = 0; + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + IOP_UNPIN(lip, 1); + continue; + } + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. 
+ */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + IOP_UNPIN(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, &cur, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); +} + +/* + * Called from the trans_commit code when we notice that the filesystem is in + * the middle of a forced shutdown. + * + * When we are called here, we have already pinned all the items in the + * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called + * so we can simply walk the items in the transaction, unpin them with an abort + * flag and then free the items. Note that unpinning the items can result in + * them being freed immediately, so we need to use a safe list traversal method + * here. + */ +STATIC void +xfs_trans_uncommit( + struct xfs_trans *tp, + uint flags) +{ + struct xfs_log_item_desc *lidp, *n; + + list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN(lidp->lid_item, 1); + } + + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Format the transaction direct to the iclog. This isolates the physical + * transaction commit operation from the logical operation and hence allows + * other methods to be introduced without affecting the existing commit path. + */ +static int +xfs_trans_commit_iclog( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + int shutdown; + int error; + int log_flags = 0; + struct xlog_in_core *commit_iclog; +#define XFS_TRANS_LOGVEC_COUNT 16 + struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; + struct xfs_log_iovec *log_vector; + uint nvec; + + + /* + * Ask each log item how many log_vector entries it will + * need so we can figure out how many to allocate. + * Try to avoid the kmem_alloc() call in the common case + * by using a vector from the stack when it fits. + */ + nvec = xfs_trans_count_vecs(tp); + if (nvec == 0) { + return ENOMEM; /* triggers a shutdown! */ + } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { + log_vector = log_vector_fast; + } else { + log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * + sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + + /* + * Fill in the log_vector and pin the logged items, and + * then write the transaction to the log. + */ + xfs_trans_fill_vecs(tp, log_vector); + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); + + /* + * The transaction is committed incore here, and can go out to disk + * at any time after this call. However, all the items associated + * with the transaction are still locked and pinned in memory. + */ + *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); + + tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + + if (nvec > XFS_TRANS_LOGVEC_COUNT) + kmem_free(log_vector); + + /* + * If we got a log write error. 
Unpin the logitems that we + * had pinned, clean up, free trans structure, and return error. + */ + if (error || *commit_lsn == -1) { + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); + return XFS_ERROR(EIO); + } + + /* + * Once the transaction has committed, unused + * reservations need to be released and changes to + * the superblock need to be reflected in the in-core + * version. Do that now. + */ + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Tell the LM to call the transaction completion routine + * when the log write with LSN commit_lsn completes (e.g. + * when the transaction commit really hits the on-disk log). + * After this call we cannot reference tp, because the call + * can happen at any time and the call will free the transaction + * structure pointed to by tp. The only case where we call + * the completion routine (xfs_trans_committed) directly is + * if the log is turned off on a debug kernel or we're + * running in simulation mode (the log is explicitly turned + * off). + */ + tp->t_logcb.cb_func = xfs_trans_committed; + tp->t_logcb.cb_arg = tp; + + /* + * We need to pass the iclog buffer which was used for the + * transaction commit record into this function, and attach + * the callback to it. The callback must be attached before + * the items are unlocked to avoid racing with other threads + * waiting for an item to unlock. + */ + shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); + + /* + * Mark this thread as no longer being in a transaction + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* + * Once all the items of the transaction have been copied + * to the in core log and the callback is attached, the + * items can be unlocked. + * + * This will free descriptors pointing to items which were + * not logged since there is nothing more to do with them. + * For items which were logged, we will keep pointers to them + * so they can be unpinned after the transaction commits to disk. + * This will also stamp each modified meta-data item with + * the commit lsn of this transaction for dependency tracking + * purposes. + */ + xfs_trans_unlock_items(tp, *commit_lsn); + + /* + * If we detected a log error earlier, finish committing + * the transaction now (unpin log items, etc). + * + * Order is critical here, to avoid using the transaction + * pointer after its been freed (by xfs_trans_committed + * either here now, or as a callback). We cannot do this + * step inside xfs_log_notify as was done earlier because + * of this issue. + */ + if (shutdown) + xfs_trans_committed(tp, XFS_LI_ABORTED); + + /* + * Now that the xfs_trans_committed callback has been attached, + * and the items are released we can finally allow the iclog to + * go to disk. + */ + return xfs_log_release_iclog(mp, commit_iclog); +} + +/* + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. + */ +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( + xfs_trans_t *tp) +{ + struct xfs_log_item_desc *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; + + + /* Bail out if we didn't find a log item. 
*/ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return NULL; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* Skip items that do not have any vectors for writing */ + lidp->lid_size = IOP_SIZE(lidp->lid_item); + if (!lidp->lid_size) + continue; + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; + } + + return ret_lv; +} + +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xfs_log_vec *log_vector; + + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; + + xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free(tp); + return 0; +} + +/* + * xfs_trans_commit + * + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. + */ +int +_xfs_trans_commit( + struct xfs_trans *tp, + uint flags, + int *log_flushed) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; + + /* + * Determine whether this commit is releasing a permanent + * log reservation or not. + */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } + + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + ASSERT(tp->t_ticket != NULL); + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + + if (error == ENOMEM) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. 
+ */ + if (sync) { + if (!error) { + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); + } + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); + } + + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. + */ + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); + + XFS_STATS_INC(xs_trans_empty); + return error; +} + +/* + * Unlock all of the transaction's items and free the transaction. + * The transaction must not have modified any of its items, because + * there is no way to restore them to their previous state. + * + * If the transaction has made a log reservation, make sure to release + * it as well. + */ +void +xfs_trans_cancel( + xfs_trans_t *tp, + int flags) +{ + int log_flags; + xfs_mount_t *mp = tp->t_mountp; + + /* + * See if the caller is being too lazy to figure out if + * the transaction really needs an abort. + */ + if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) + flags &= ~XFS_TRANS_ABORT; + /* + * See if the caller is relying on us to shut down the + * filesystem. This happens in paths where we detect + * corruption and decide to give up. + */ + if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } +#ifdef DEBUG + if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) + ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); + } +#endif + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + if (tp->t_ticket) { + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + } + + /* mark this thread as no longer being in a transaction */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Roll from one trans in the sequence of PERMANENT transactions to + * the next: permanent transactions are only flushed out when + * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * as possible to let chunks of it go to the log. So we commit the + * chunk we've been working on and get a new transaction to continue. + */ +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + unsigned int logres, count; + int error; + + /* + * Ensure that the inode is always logged. + */ + trans = *tpp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + + /* + * Copy the critical parameters from one trans to the next. + */ + logres = trans->t_log_res; + count = trans->t_log_count; + *tpp = xfs_trans_dup(trans); + + /* + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. 
That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. + */ + error = xfs_trans_commit(trans, 0); + if (error) + return (error); + + trans = *tpp; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(trans->t_ticket); + + + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + error = xfs_trans_reserve(trans, 0, logres, 0, + XFS_TRANS_PERM_LOG_RES, count); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (error) + return error; + + xfs_trans_ijoin(trans, dp); + return 0; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h new file mode 100644 index 00000000..53597f4d --- /dev/null +++ b/fs/xfs/xfs_trans.h @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_H__ +#define __XFS_TRANS_H__ + +struct xfs_log_item; + +/* + * This is the structure written in the log at the head of + * every transaction. It identifies the type and id of the + * transaction, and contains the number of items logged by + * the transaction so we know how many to expect during recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } + +/* + * Transaction types. Used to distinguish types of buffers. 
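+ *
+ * For example (illustrative only), the type is chosen when the
+ * transaction is allocated:
+ *
+ *	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+ *
+ * and is recorded in tp->t_type, so that tools such as xfs_logprint(8)
+ * (referenced below) can put a name to the operation that produced a
+ * given set of log items.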
+ */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. 
It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + ushort lid_size; + unsigned char lid_flags; + struct list_head lid_trans; +}; + +#define XFS_LID_DIRTY 0x1 + +#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ + +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. 
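+ *
+ * Rough worked example for the per-extent XFS_ALLOCFREE_LOG_RES() macro
+ * above (illustrative only; it assumes a 4096 byte filesystem block and
+ * XFS_AG_MAXLEVELS(mp) == 3, neither of which is implied here):
+ *
+ *	XFS_ALLOCFREE_LOG_RES(mp, 1)   = 2 * ((2 * 3 - 1) * 4096) = 40960
+ *	XFS_ALLOCFREE_LOG_COUNT(mp, 1) = 2 * (2 * 3 - 1)          = 10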
+ */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + + +#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) +#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) +#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) +#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) +#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) +#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) +#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) +#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) +#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) +#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) +#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) +#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) +#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) +#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) +#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +/* + * Logging the inode timestamps on an fsync -- same as SWRITE + * as long as SWRITE logs the entire inode core + */ +#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) +#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) +#define XFS_ATTRSET_LOG_RES(mp, ext) \ + ((mp)->m_reservations.tr_attrset + \ + (ext * (mp)->m_sb.sb_sectsize) + \ + (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ + (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) +#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) +#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) + + +/* + * Various log count values. + */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +/* + * Here we centralize the specification of XFS meta-data buffer + * reference count values. This determine how hard the buffer + * cache tries to hold onto the buffer. 
+ */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_buftarg; +struct xfs_efd_log_item; +struct xfs_efi_log_item; +struct xfs_inode; +struct xfs_item_ops; +struct xfs_log_iovec; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot_acct; +struct xfs_busy_extent; + +typedef struct xfs_log_item { + struct list_head li_ail; /* AIL pointers */ + xfs_lsn_t li_lsn; /* last on-disk lsn */ + struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ + struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xfs_ail *li_ailp; /* ptr to AIL */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); + /* buffer item iodone */ + /* callback func */ + struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ +} xfs_log_item_t; + +#define XFS_LI_IN_AIL 0x1 +#define XFS_LI_ABORTED 0x2 + +#define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ + { XFS_LI_ABORTED, "ABORTED" } + +typedef struct xfs_item_ops { + uint (*iop_size)(xfs_log_item_t *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_pin)(xfs_log_item_t *); + void (*iop_unpin)(xfs_log_item_t *, int remove); + uint (*iop_trylock)(xfs_log_item_t *); + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_push)(xfs_log_item_t *); + bool (*iop_pushbuf)(xfs_log_item_t *); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); +} xfs_item_ops_t; + +#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) +#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) +#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) +#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) +#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) +#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) +#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) +#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) +#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) + +/* + * Return values for the IOP_TRYLOCK() routines. + */ +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_PUSHBUF 3 + +/* + * This is the type of function which can be given to xfs_trans_callback() + * to be called upon the transaction's commit to disk. + */ +typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); + +/* + * This is the structure maintained for every active transaction. 
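+ *
+ * For example (illustrative only, "blocks_used" is just a stand-in
+ * name), the t_*_delta fields below accumulate superblock changes
+ * staged with
+ *
+ *	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -((int64_t)blocks_used));
+ *
+ * which records a free-block count change in t_fdblocks_delta; the
+ * commit code above then applies the accumulated deltas to the
+ * superblock.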
+ */ +typedef struct xfs_trans { + unsigned int t_magic; /* magic number */ + xfs_log_callback_t t_logcb; /* log callback struct */ + unsigned int t_type; /* transaction type */ + unsigned int t_log_res; /* amt of log space resvd */ + unsigned int t_log_count; /* count for perm log res */ + unsigned int t_blk_res; /* # of blocks resvd */ + unsigned int t_blk_res_used; /* # of resvd blocks used */ + unsigned int t_rtx_res; /* # of rt extents resvd */ + unsigned int t_rtx_res_used; /* # of resvd rt extents used */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ + xfs_lsn_t t_lsn; /* log seq num of start of + * transaction. */ + xfs_lsn_t t_commit_lsn; /* log seq num of end of + * transaction. */ + struct xfs_mount *t_mountp; /* ptr to fs mount struct */ + struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + unsigned int t_flags; /* misc flags */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ +#ifdef DEBUG + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ +#endif + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ + struct list_head t_items; /* log item descriptors */ + xfs_trans_header_t t_header; /* header for in-log trans */ + struct list_head t_busy; /* list of busy extents */ + unsigned long t_pflags; /* saved process flags state */ +} xfs_trans_t; + +/* + * XFS transaction mechanism exported interfaces that are + * actually macros. + */ +#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) +#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) +#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) +#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) + +#ifdef DEBUG +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) +#else +#define xfs_trans_agblocks_delta(tp, d) +#define xfs_trans_agflist_delta(tp, d) +#define xfs_trans_agbtree_delta(tp, d) +#endif + +/* + * XFS transaction mechanism exported interfaces. 
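+ *
+ * A minimal sketch of how these interfaces are typically strung together
+ * (illustrative only: error handling is trimmed, "ip" stands for some
+ * locked inode, and the remove-style reservation is just an example):
+ *
+ *	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ *	error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+ *				  XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+ *	if (error) {
+ *		xfs_trans_cancel(tp, 0);
+ *		return error;
+ *	}
+ *	xfs_trans_ijoin(tp, ip);
+ *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ *	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);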
+ */ +xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); +xfs_trans_t *xfs_trans_dup(xfs_trans_t *); +int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, + uint, uint); +void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); +struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, + int, uint); +int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, + struct xfs_buftarg *, xfs_daddr_t, int, uint, + struct xfs_buf **); +struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); + +void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); +void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); +void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); +struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); +void xfs_efi_release(struct xfs_efi_log_item *, uint); +void xfs_trans_log_efi_extent(xfs_trans_t *, + struct xfs_efi_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, + struct xfs_efi_log_item *, + uint); +void xfs_trans_log_efd_extent(xfs_trans_t *, + struct xfs_efd_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +int _xfs_trans_commit(xfs_trans_t *, + uint flags, + int *); +#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +void xfs_trans_cancel(xfs_trans_t *, int); +int xfs_trans_ail_init(struct xfs_mount *); +void xfs_trans_ail_destroy(struct xfs_mount *); + +extern kmem_zone_t *xfs_trans_zone; +extern kmem_zone_t *xfs_log_item_desc_zone; + +#endif /* __KERNEL__ */ + +void xfs_trans_init(struct xfs_mount *); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); + +#endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c new file mode 100644 index 00000000..a4c281bf --- /dev/null +++ b/fs/xfs/xfs_trans_ail.c @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" + +#ifdef DEBUG +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *prev_lip; + + if (list_empty(&ailp->xa_ail)) + return; + + /* + * Check the next and previous entries are valid. + */ + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +#ifdef XFS_TRANS_DEBUG + /* + * Walk the list checking lsn ordering, and that every entry has the + * XFS_LI_IN_AIL flag set. This is really expensive, so only do it + * when specifically debugging the transaction subsystem. + */ + prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = lip; + } +#endif /* XFS_TRANS_DEBUG */ +} +#else /* !DEBUG */ +#define xfs_ail_check(a,l) +#endif /* DEBUG */ + +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_min( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); +} + + /* + * Return a pointer to the last item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_max( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); +} + +/* + * Return a pointer to the item which follows the given item in the AIL. If + * the given item is the last item in the list, then return NULL. + */ +static xfs_log_item_t * +xfs_ail_next( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + if (lip->li_ail.next == &ailp->xa_ail) + return NULL; + + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); +} + +/* + * This is called by the log manager code to determine the LSN of the tail of + * the log. This is exactly the LSN of the first item in the AIL. If the AIL + * is empty, then this function returns 0. + * + * We need the AIL lock in order to get a coherent read of the lsn of the last + * item in the AIL. + */ +xfs_lsn_t +xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; + + spin_lock(&ailp->xa_lock); + lip = xfs_ail_min(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; +} + +/* + * Return the maximum lsn held in the AIL, or zero if the AIL is empty. 
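+ *
+ * (For illustration: xfs_ail_push_all() further down is effectively
+ * xfs_ail_push(ailp, xfs_ail_max_lsn(ailp)), i.e. "push everything
+ * currently in the AIL", while xfs_ail_min_lsn() above is what the log
+ * code uses to find the current tail of the log.)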
+ */
+static xfs_lsn_t
+xfs_ail_max_lsn(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t lsn = 0;
+ xfs_log_item_t *lip;
+
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ lsn = lip->li_lsn;
+ spin_unlock(&ailp->xa_lock);
+
+ return lsn;
+}
+
+/*
+ * AIL traversal cursor initialisation.
+ *
+ * The cursor keeps track of where our current traversal is up
+ * to by tracking the next item in the list for us. However, for
+ * this to be safe, removing an object from the AIL needs to invalidate
+ * any cursor that points to it. Hence the traversal cursor needs to
+ * be linked to the struct xfs_ail so that deletion can search all the
+ * active cursors for invalidation.
+ *
+ * We don't link the push cursor because it is embedded in the struct
+ * xfs_ail and hence easily findable.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ cur->item = NULL;
+ if (cur == &ailp->xa_cursors)
+ return;
+
+ cur->next = ailp->xa_cursors.next;
+ ailp->xa_cursors.next = cur;
+}
+
+/*
+ * Set the cursor to the next item, because when we look
+ * up the cursor the current item may have been freed.
+ */
+STATIC void
+xfs_trans_ail_cursor_set(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item *lip)
+{
+ if (lip)
+ cur->item = xfs_ail_next(ailp, lip);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.
+ * If the cursor was invalidated (indicated by a lip of 1),
+ * restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ struct xfs_log_item *lip = cur->item;
+
+ if ((__psint_t)lip & 1)
+ lip = xfs_ail_min(ailp);
+ xfs_trans_ail_cursor_set(ailp, cur, lip);
+ return lip;
+}
+
+/*
+ * Now that the traversal is complete, we need to remove the cursor
+ * from the list of traversing cursors. Avoid removing the embedded
+ * push cursor, but use the fact it is always present to make the
+ * list deletion simple.
+ */
+void
+xfs_trans_ail_cursor_done(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *done)
+{
+ struct xfs_ail_cursor *prev = NULL;
+ struct xfs_ail_cursor *cur;
+
+ done->item = NULL;
+ if (done == &ailp->xa_cursors)
+ return;
+ prev = &ailp->xa_cursors;
+ for (cur = prev->next; cur; prev = cur, cur = prev->next) {
+ if (cur == done) {
+ prev->next = cur->next;
+ break;
+ }
+ }
+ ASSERT(cur);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is
+ * called when an item is removed from the AIL. Any cursor pointing
+ * to this object is now invalid and the traversal needs to be
+ * terminated so it doesn't reference a freed object. We set the
+ * cursor item to a value of 1 so we can distinguish between an
+ * invalidation and the end of the list when getting the next item
+ * from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_ail_cursor *cur;
+
+ /* need to search all cursors */
+ for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
+ if (cur->item == lip)
+ cur->item = (struct xfs_log_item *)
+ ((__psint_t)cur->item | 1);
+ }
+}
+
+/*
+ * Initialise the cursor to the first item in the AIL with the given @lsn.
+ * This searches the list from lowest LSN to highest. Pass a @lsn of zero
+ * to initialise the cursor to the first item in the AIL.
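+ *
+ * A typical traversal then looks roughly like this (sketch only, "cur"
+ * being a struct xfs_ail_cursor owned by the caller):
+ *
+ *	lip = xfs_trans_ail_cursor_first(ailp, &cur, lsn);
+ *	while (lip != NULL) {
+ *		... process lip ...
+ *		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ *	}
+ *	xfs_trans_ail_cursor_done(ailp, &cur);
+ *
+ * with ailp->xa_lock held around the cursor calls, as xfsaild_push()
+ * below does.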
+ */ +xfs_log_item_t * +xfs_trans_ail_cursor_first( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + xfs_trans_ail_cursor_init(ailp, cur); + lip = xfs_ail_min(ailp); + if (lsn == 0) + goto out; + + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + goto out; + } + lip = NULL; +out: + xfs_trans_ail_cursor_set(ailp, cur, lip); + return lip; +} + +/* + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. If there is no item with + * the value of @lsn, then it sets the cursor to the last item with an LSN lower + * than @lsn. + */ +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. + */ +static void +xfs_ail_splice( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) +{ + struct xfs_log_item *lip = cur ? cur->item : NULL; + struct xfs_log_item *next_lip; + + /* + * Get a new cursor if we don't have a placeholder or the existing one + * has been invalidated. + */ + if (!lip || (__psint_t)lip & 1) { + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + if (!lip) { + /* The list is empty, so just splice and return. */ + if (cur) + cur->item = NULL; + list_splice(list, &ailp->xa_ail); + return; + } + } + + /* + * Our cursor points to the item we want to insert _after_, so we have + * to update the cursor to point to the end of the list we are splicing + * in so that it points to the correct location for the next splice. + * i.e. before the splice + * + * lsn -> lsn -> lsn + x -> lsn + x ... + * ^ + * | cursor points here + * + * After the splice we have: + * + * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... + * ^ ^ + * | cursor points here | needs to move here + * + * So we set the cursor to the last item in the list to be spliced + * before we execute the splice, resulting in the cursor pointing to + * the correct item after the splice occurs. + */ + if (cur) { + next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); + cur->item = next_lip; + } + list_splice(list, &lip->li_ail); +} + +/* + * Delete the given item from the AIL. Return a pointer to the item. 
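+ *
+ * Deleting an item also invalidates, via xfs_trans_ail_cursor_clear(),
+ * any traversal cursor that still points at it, so a traversal that
+ * later resumes through such a cursor restarts from the head of the AIL
+ * instead of dereferencing a freed entry.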
+ */ +static void +xfs_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); + xfs_trans_ail_cursor_clear(ailp, lip); +} + +static long +xfsaild_push( + struct xfs_ail *ailp) +{ + xfs_mount_t *mp = ailp->xa_mount; + struct xfs_ail_cursor *cur = &ailp->xa_cursors; + xfs_log_item_t *lip; + xfs_lsn_t lsn; + xfs_lsn_t target; + long tout = 10; + int flush_log = 0; + int stuck = 0; + int count = 0; + int push_xfsbufd = 0; + + spin_lock(&ailp->xa_lock); + target = ailp->xa_target; + xfs_trans_ail_cursor_init(ailp, cur); + lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn); + if (!lip || XFS_FORCED_SHUTDOWN(mp)) { + /* + * AIL is empty or our push has reached the end. + */ + xfs_trans_ail_cursor_done(ailp, cur); + spin_unlock(&ailp->xa_lock); + goto out_done; + } + + XFS_STATS_INC(xs_push_ail); + + /* + * While the item we are looking at is below the given threshold + * try to flush it out. We'd like not to stop until we've at least + * tried to push on everything in the AIL with an LSN less than + * the given threshold. + * + * However, we will stop after a certain number of pushes and wait + * for a reduced timeout to fire before pushing further. This + * prevents use from spinning when we can't do anything or there is + * lots of contention on the AIL lists. + */ + lsn = lip->li_lsn; + while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { + int lock_result; + /* + * If we can lock the item without sleeping, unlock the AIL + * lock and flush the item. Then re-grab the AIL lock so we + * can look for the next item on the AIL. List changes are + * handled by the AIL lookup functions internally + * + * If we can't lock the item, either its holder will flush it + * or it is already being flushed or it is being relogged. In + * any of these case it is being taken care of and we can just + * skip to the next item in the list. + */ + lock_result = IOP_TRYLOCK(lip); + spin_unlock(&ailp->xa_lock); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + XFS_STATS_INC(xs_push_ail_success); + IOP_PUSH(lip); + ailp->xa_last_pushed_lsn = lsn; + break; + + case XFS_ITEM_PUSHBUF: + XFS_STATS_INC(xs_push_ail_pushbuf); + + if (!IOP_PUSHBUF(lip)) { + stuck++; + flush_log = 1; + } else { + ailp->xa_last_pushed_lsn = lsn; + } + push_xfsbufd = 1; + break; + + case XFS_ITEM_PINNED: + XFS_STATS_INC(xs_push_ail_pinned); + stuck++; + flush_log = 1; + break; + + case XFS_ITEM_LOCKED: + XFS_STATS_INC(xs_push_ail_locked); + stuck++; + break; + + default: + ASSERT(0); + break; + } + + spin_lock(&ailp->xa_lock); + /* should we bother continuing? */ + if (XFS_FORCED_SHUTDOWN(mp)) + break; + ASSERT(mp->m_log); + + count++; + + /* + * Are there too many items we can't do anything with? + * If we we are skipping too many items because we can't flush + * them or they are already being flushed, we back off and + * given them time to complete whatever operation is being + * done. i.e. remove pressure from the AIL while we can't make + * progress so traversals don't slow down further inserts and + * removals to/from the AIL. + * + * The value of 100 is an arbitrary magic number based on + * observation. 
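+ *
+ * (In other words: if more than 100 of the items visited in this pass
+ * could not be pushed, give up on the pass; the timeout handling after
+ * the loop then sleeps longer, 20ms instead of the default 10ms, when
+ * over 90% of what was visited was stuck.)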
+ */ + if (stuck > 100) + break; + + lip = xfs_trans_ail_cursor_next(ailp, cur); + if (lip == NULL) + break; + lsn = lip->li_lsn; + } + xfs_trans_ail_cursor_done(ailp, cur); + spin_unlock(&ailp->xa_lock); + + if (flush_log) { + /* + * If something we need to push out was pinned, then + * push out the log so it will become unpinned and + * move forward in the AIL. + */ + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, 0); + } + + if (push_xfsbufd) { + /* we've got delayed write buffers to flush */ + wake_up_process(mp->m_ddev_targp->bt_task); + } + + /* assume we have more work to do in a short while */ +out_done: + if (!count) { + /* We're past our target or empty, so idle */ + ailp->xa_last_pushed_lsn = 0; + + tout = 50; + } else if (XFS_LSN_CMP(lsn, target) >= 0) { + /* + * We reached the target so wait a bit longer for I/O to + * complete and remove pushed items from the AIL before we + * start the next scan from the start of the AIL. + */ + tout = 50; + ailp->xa_last_pushed_lsn = 0; + } else if ((stuck * 100) / count > 90) { + /* + * Either there is a lot of contention on the AIL or we + * are stuck due to operations in progress. "Stuck" in this + * case is defined as >90% of the items we tried to push + * were stuck. + * + * Backoff a bit more to allow some I/O to complete before + * continuing from where we were. + */ + tout = 20; + } + + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + try_to_freeze(); + + tout = xfsaild_push(ailp); + } + + return 0; +} + +/* + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn. + * + * The push is run asynchronously in a workqueue, which means the caller needs + * to handle waiting on the async flush for space to become available. + * We don't want to interrupt any push that is in progress, hence we only queue + * work if we set the pushing bit approriately. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. + */ +void +xfs_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + return; + + /* + * Ensure that the new target is noticed in push code before it clears + * the XFS_AIL_PUSHING_BIT. + */ + smp_wmb(); + xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + smp_wmb(); + + wake_up_process(ailp->xa_task); +} + +/* + * Push out all items in the AIL immediately + */ +void +xfs_ail_push_all( + struct xfs_ail *ailp) +{ + xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); + + if (threshold_lsn) + xfs_ail_push(ailp, threshold_lsn); +} + +/* + * This is to be called when an item is unlocked that may have + * been in the AIL. It will wake up the first member of the AIL + * wait list if this item's unlocking might allow it to progress. 
+ * If the item is in the AIL, then we need to get the AIL lock + * while doing our checking so we don't race with someone going + * to sleep waiting for this event in xfs_trans_push_ail(). + */ +void +xfs_trans_unlocked_item( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *min_lip; + + /* + * If we're forcibly shutting down, we may have + * unlocked log items arbitrarily. The last thing + * we want to do is to move the tail of the log + * over some potentially valid data. + */ + if (!(lip->li_flags & XFS_LI_IN_AIL) || + XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { + return; + } + + /* + * This is the one case where we can call into xfs_ail_min() + * without holding the AIL lock because we only care about the + * case where we are at the tail of the AIL. If the object isn't + * at the tail, it doesn't matter what result we get back. This + * is slightly racy because since we were just unlocked, we could + * go to sleep between the call to xfs_ail_min and the call to + * xfs_log_move_tail, have someone else lock us, commit to us disk, + * move us out of the tail of the AIL, and then we wake up. However, + * the call to xfs_log_move_tail() doesn't do anything if there's + * not enough free space to wake people up so we're safe calling it. + */ + min_lip = xfs_ail_min(ailp); + + if (min_lip == lip) + xfs_log_move_tail(ailp->xa_mount, 1); +} /* xfs_trans_unlocked_item */ + +/* + * xfs_trans_ail_update - bulk AIL insertion operation. + * + * @xfs_trans_ail_update takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. + * + * This function takes the AIL lock once to execute the update operations on + * all the items in the array, and as such should not be called with the AIL + * lock held. As a result, once we have the AIL lock, we need to check each log + * item LSN to confirm it needs to be moved forward in the AIL. + * + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); + + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); + } + + xfs_ail_splice(ailp, cur, &tmp, lsn); + + if (!mlip_changed) { + spin_unlock(&ailp->xa_lock); + return; + } + + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. 
This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip->li_lsn; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} + +/* + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done + * all the checks necessary to ensure the items passed in via @log_items are + * ready for deletion. This includes checking that the items are in the AIL. + * + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. + * + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; + + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + return; + } + + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; + } + + if (!mlip_changed) { + spin_unlock(&ailp->xa_lock); + return; + } + + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. It is possible we've emptied the + * AIL here, so if that is the case, pass an LSN of 0 to the tail move. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip ? mlip->li_lsn : 0; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} + +/* + * The active item list (AIL) is a doubly linked list of log + * items sorted by ascending lsn. The base of the list is + * a forw/back pointer pair embedded in the xfs mount structure. + * The base is initialized with both pointers pointing to the + * base. This case always needs to be distinguished, because + * the base has no lsn to look at. We almost always insert + * at the end of the list, so on inserts we search from the + * end of the list to find where the new item belongs. + */ + +/* + * Initialize the doubly linked list to point only to itself. 
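+ *
+ * (Beyond the historical list initialisation described above, the
+ * function below also allocates the per-mount struct xfs_ail, sets up
+ * its list head and lock, and starts the xfsaild thread;
+ * xfs_trans_ail_destroy() stops the thread and frees the structure.)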
+ */ +int +xfs_trans_ail_init( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp; + + ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + if (!ailp) + return ENOMEM; + + ailp->xa_mount = mp; + INIT_LIST_HEAD(&ailp->xa_ail); + spin_lock_init(&ailp->xa_lock); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + + mp->m_ail = ailp; + return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; +} + +void +xfs_trans_ail_destroy( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp = mp->m_ail; + + kthread_stop(ailp->xa_task); + kmem_free(ailp); +} diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c new file mode 100644 index 00000000..03b3b7f8 --- /dev/null +++ b/fs/xfs/xfs_trans_buf.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_trace.h" + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int len) +{ + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; + + len = BBTOB(len); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; + } + + return NULL; +} + +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). 
+ */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &bip->bli_item); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it is already locked + * within the transaction, just increment its lock recursion count + * and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * get_buf() call. + */ +xfs_buf_t * +xfs_trans_get_buf(xfs_trans_t *tp, + xfs_buftarg_t *target_dev, + xfs_daddr_t blkno, + int len, + uint flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + if (flags == 0) + flags = XBF_LOCK | XBF_MAPPED; + + /* + * Default to a normal get_buf() call if the tp is NULL. + */ + if (tp == NULL) + return xfs_buf_get(target_dev, blkno, len, + flags | XBF_DONT_BLOCK); + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); + if (bp != NULL) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) + XFS_BUF_SUPER_STALE(bp); + + /* + * If the buffer is stale then it was binval'ed + * since last read. This doesn't matter since the + * caller isn't allowed to use the data anyway. + */ + else if (XFS_BUF_ISSTALE(bp)) + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_get_buf_recur(bip); + return (bp); + } + + /* + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer + * which might cause another transaction to take place (if the + * buffer was delayed alloc). Such recursive transactions can + * easily deadlock with our current transaction as well as cause + * us to run out of stack space. + */ + bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); + if (bp == NULL) { + return NULL; + } + + ASSERT(!XFS_BUF_GETERROR(bp)); + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); + return (bp); +} + +/* + * Get and lock the superblock buffer of this file system for the + * given transaction. + * + * We don't need to use incore_match() here, because the superblock + * buffer is a private buffer which we keep a pointer to in the + * mount structure. + */ +xfs_buf_t * +xfs_trans_getsb(xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + /* + * Default to just trying to lock the superblock buffer + * if tp is NULL. 
+ */ + if (tp == NULL) { + return (xfs_getsb(mp, flags)); + } + + /* + * If the superblock buffer already has this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = mp->m_sb_bp; + if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) { + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); + return (bp); + } + + bp = xfs_getsb(mp, flags); + if (bp == NULL) + return NULL; + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); + return (bp); +} + +#ifdef DEBUG +xfs_buftarg_t *xfs_error_target; +int xfs_do_error; +int xfs_req_num; +int xfs_error_mod = 33; +#endif + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it has not yet been + * read in, read it from disk. If it is already locked + * within the transaction and already read in, just increment its + * lock recursion count and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * read_buf() call. + */ +int +xfs_trans_read_buf( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + xfs_buf_t **bpp) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + int error; + + if (flags == 0) + flags = XBF_LOCK | XBF_MAPPED; + + /* + * Default to a normal get_buf() call if the tp is NULL. + */ + if (tp == NULL) { + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + if (!bp) + return (flags & XBF_TRYLOCK) ? + EAGAIN : XFS_ERROR(ENOMEM); + + if (XFS_BUF_GETERROR(bp) != 0) { + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } +#ifdef DEBUG + if (xfs_do_error) { + if (xfs_error_target == target) { + if (((xfs_req_num++) % xfs_error_mod) == 0) { + xfs_buf_relse(bp); + xfs_debug(mp, "Returning error!"); + return XFS_ERROR(EIO); + } + } + } +#endif + if (XFS_FORCED_SHUTDOWN(mp)) + goto shutdown_abort; + *bpp = bp; + return 0; + } + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. If it is already read in we just increment + * the lock recursion count and return the buffer to the caller. + * If the buffer is not yet read in, then we read it in, increment + * the lock recursion count, and return it to the caller. + */ + bp = xfs_trans_buf_item_match(tp, target, blkno, len); + if (bp != NULL) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT((XFS_BUF_ISERROR(bp)) == 0); + if (!(XFS_BUF_ISDONE(bp))) { + trace_xfs_trans_read_buf_io(bp, _RET_IP_); + ASSERT(!XFS_BUF_ISASYNC(bp)); + XFS_BUF_READ(bp); + xfsbdstrat(tp->t_mountp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + xfs_buf_relse(bp); + /* + * We can gracefully recover from most read + * errors. Ones we can't are those that happen + * after the transaction's already dirty. + */ + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + return error; + } + } + /* + * We never locked this buf ourselves, so we shouldn't + * brelse it either. Just get out. 
+ */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + *bpp = NULL; + return XFS_ERROR(EIO); + } + + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + bip->bli_recur++; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_trans_read_buf_recur(bip); + *bpp = bp; + return 0; + } + + /* + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer + * which might cause another transaction to take place (if the + * buffer was delayed alloc). Such recursive transactions can + * easily deadlock with our current transaction as well as cause + * us to run out of stack space. + */ + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + if (bp == NULL) { + *bpp = NULL; + return (flags & XBF_TRYLOCK) ? + 0 : XFS_ERROR(ENOMEM); + } + if (XFS_BUF_GETERROR(bp) != 0) { + XFS_BUF_SUPER_STALE(bp); + error = XFS_BUF_GETERROR(bp); + + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + return error; + } +#ifdef DEBUG + if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { + if (xfs_error_target == target) { + if (((xfs_req_num++) % xfs_error_mod) == 0) { + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + xfs_debug(mp, "Returning trans error!"); + return XFS_ERROR(EIO); + } + } + } +#endif + if (XFS_FORCED_SHUTDOWN(mp)) + goto shutdown_abort; + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); + + *bpp = bp; + return 0; + +shutdown_abort: + /* + * the theory here is that buffer is good but we're + * bailing out because the filesystem is being forcibly + * shut down. So we should leave the b_flags alone since + * the buffer's not staled and just get out. + */ +#if defined(DEBUG) + if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) + xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); +#endif + ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + (XBF_STALE|XBF_DELWRI)); + + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + xfs_buf_relse(bp); + *bpp = NULL; + return XFS_ERROR(EIO); +} + + +/* + * Release the buffer bp which was previously acquired with one of the + * xfs_trans_... buffer allocation routines if the buffer has not + * been modified within this transaction. If the buffer is modified + * within this transaction, do decrement the recursion count but do + * not release the buffer even if the count goes to 0. If the buffer is not + * modified within the transaction, decrement the recursion count and + * release the buffer if the recursion count goes to 0. + * + * If the buffer is to be released and it was not modified before + * this transaction began, then free the buf_log_item associated with it. + * + * If the transaction pointer is NULL, make this just a normal + * brelse() call. + */ +void +xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + xfs_log_item_t *lip; + + /* + * Default to a normal brelse() call if the tp is NULL. + */ + if (tp == NULL) { + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + /* + * If there's a buf log item attached to the buffer, + * then let the AIL know that the buffer is being + * unlocked. 
+ */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); + xfs_trans_unlocked_item(bip->bli_item.li_ailp, + lip); + } + } + xfs_buf_relse(bp); + return; + } + + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_brelse(bip); + + /* + * If the release is just for a recursive lock, + * then decrement the count and return. + */ + if (bip->bli_recur > 0) { + bip->bli_recur--; + return; + } + + /* + * If the buffer is dirty within this transaction, we can't + * release it until we commit. + */ + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + return; + + /* + * If the buffer has been invalidated, then we can't release + * it until the transaction commits to disk unless it is re-dirtied + * as part of this transaction. This prevents us from pulling + * the item from the AIL before we should. + */ + if (bip->bli_flags & XFS_BLI_STALE) + return; + + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + + /* + * Free up the log item descriptor tracking the released item. + */ + xfs_trans_del_item(&bip->bli_item); + + /* + * Clear the hold flag in the buf log item if it is set. + * We wouldn't want the next user of the buffer to + * get confused. + */ + if (bip->bli_flags & XFS_BLI_HOLD) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Drop our reference to the buf log item. + */ + atomic_dec(&bip->bli_refcount); + + /* + * If the buf item is not tracking data in the log, then + * we must free it before releasing the buffer back to the + * free pool. Before releasing the buffer to the free pool, + * clear the transaction pointer in b_fsprivate2 to dissolve + * its relation to this transaction. + */ + if (!xfs_buf_item_dirty(bip)) { +/*** + ASSERT(bp->b_pincount == 0); +***/ + ASSERT(atomic_read(&bip->bli_refcount) == 0); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); + xfs_buf_item_relse(bp); + bip = NULL; + } + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If we've still got a buf log item on the buffer, then + * tell the AIL that the buffer is being unlocked. + */ + if (bip != NULL) { + xfs_trans_unlocked_item(bip->bli_item.li_ailp, + (xfs_log_item_t*)bip); + } + + xfs_buf_relse(bp); + return; +} + +/* + * Mark the buffer as not needing to be unlocked when the buf item's + * IOP_UNLOCK() routine is called. The buffer must already be locked + * and associated with the given transaction. + */ +/* ARGSUSED */ +void +xfs_trans_bhold(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_flags |= XFS_BLI_HOLD; + trace_xfs_trans_bhold(bip); +} + +/* + * Cancel the previous buffer hold request made on this buffer + * for this transaction. 
+ */ +void +xfs_trans_bhold_release(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT(bip->bli_flags & XFS_BLI_HOLD); + bip->bli_flags &= ~XFS_BLI_HOLD; + + trace_xfs_trans_bhold_release(bip); +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf(xfs_trans_t *tp, + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); + ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) || + (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks)); + + /* + * Mark the buffer as needing to be written out eventually, + * and set its iodone function to remove the buffer's buf log + * item from the AIL and free it when the buffer is flushed + * to disk. See xfs_buf_attach_iodone() for more details + * on li_cb and xfs_buf_iodone_callbacks(). + * If we end up aborting this transaction, we trap this buffer + * inside the b_bdstrat callback so that this won't get written to + * disk. + */ + XFS_BUF_DELAYWRITE(bp); + XFS_BUF_DONE(bp); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + bip->bli_item.li_cb = xfs_buf_iodone; + + trace_xfs_trans_log_buf(bip); + + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer + * again. There are no races with the code in xfs_buf_item_unpin(), + * because we have a reference to the buffer this entire time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + bip->bli_flags &= ~XFS_BLI_STALE; + ASSERT(XFS_BUF_ISSTALE(bp)); + XFS_BUF_UNSTALE(bp); + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; + } + + tp->t_flags |= XFS_TRANS_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + bip->bli_flags |= XFS_BLI_LOGGED; + xfs_buf_item_log(bip, first, last); +} + + +/* + * This called to invalidate a buffer that is being used within + * a transaction. Typically this is because the blocks in the + * buffer are being freed, so we need to prevent it from being + * written out when we're done. Allowing it to be written again + * might overwrite data in the free blocks if they are reallocated + * to a file. + * + * We prevent the buffer from being written out by clearing the + * B_DELWRI flag. We can't always + * get rid of the buf log item at this point, though, because + * the buffer may still be pinned by another transaction. If that + * is the case, then we'll wait until the buffer is committed to + * disk for the last time (we can tell by the ref count) and + * free it in xfs_buf_item_unpin(). 
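xfs_trans_log_buf() above records a byte range, and the buf log item turns it into per-chunk dirty bits so only the touched regions of the buffer end up in the log. The stand-alone sketch below shows that translation; the 128-byte chunk size echoes XFS_BLF_CHUNK, but toy_log_range and the single-word map are simplifications invented here.

#include <stdint.h>
#include <stdio.h>

#define CHUNK    128                  /* bytes tracked per dirty bit (cf. XFS_BLF_CHUNK) */
#define BUF_SIZE 4096                 /* toy buffer: 32 chunks, one 32-bit map word */

static uint32_t dirty_map;

/* Mark bytes first..last (inclusive, buffer-relative) as needing to be logged. */
static void toy_log_range(unsigned first, unsigned last)
{
    unsigned first_bit = first / CHUNK;
    unsigned last_bit  = last / CHUNK;
    unsigned i;

    for (i = first_bit; i <= last_bit; i++)
        dirty_map |= 1u << i;
}

int main(void)
{
    toy_log_range(0, 3);        /* touches only chunk 0 */
    toy_log_range(500, 700);    /* spans chunks 3..5 */
    printf("dirty map: 0x%08x\n", dirty_map);
    return 0;
}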
Until it is cleaned up we + * will keep the buffer locked so that the buffer and buf log item + * are not reused. + */ +void +xfs_trans_binval( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_binval(bip); + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * If the buffer is already invalidated, then + * just return. + */ + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(tp->t_flags & XFS_TRANS_DIRTY); + return; + } + + /* + * Clear the dirty bit in the buffer and set the STALE flag + * in the buf log item. The STALE flag will be used in + * xfs_buf_item_unpin() to determine if it should clean up + * when the last reference to the buf item is given up. + * We set the XFS_BLF_CANCEL flag in the buf log format structure + * and log the buf item. This will be used at recovery time + * to determine that copies of the buffer in the log before + * this should not be replayed. + * We mark the item descriptor and the transaction dirty so + * that we'll hold the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state + * about which parts of the buffer have been logged. We also + * clear the flag indicating that this is an inode buffer since + * the data in the buffer will no longer be valid. + * + * We set the stale bit in the buffer as well since we're getting + * rid of it. + */ + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_STALE(bp); + bip->bli_flags |= XFS_BLI_STALE; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; + memset((char *)(bip->bli_format.blf_data_map), 0, + (bip->bli_format.blf_map_size * sizeof(uint))); + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY; +} + +/* + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. + * + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. + */ +void +xfs_trans_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_BUF; +} + +/* + * This call is used to indicate that the buffer is going to + * be staled and was an inode buffer. This means it gets + * special processing during unpin - where any inodes + * associated with the buffer should be removed from ail. 
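The XFS_BLF_CANCEL flag set by xfs_trans_binval() above tells log recovery not to replay earlier copies of a freed buffer. Real recovery does this in separate passes; the stand-alone sketch below only illustrates the idea of a cancel record suppressing replay, using an invented record layout.

#include <stdio.h>

/* Toy log record: either "write block N" or "block N was cancelled (freed)". */
struct rec {
    int blkno;
    int cancel;    /* plays the role of XFS_BLF_CANCEL */
};

int main(void)
{
    /* Records in log order: block 7 is written, then invalidated. */
    struct rec log[] = {
        { .blkno = 7, .cancel = 0 },
        { .blkno = 9, .cancel = 0 },
        { .blkno = 7, .cancel = 1 },
    };
    int n = sizeof(log) / sizeof(log[0]);
    int i, j;

    /* Replay pass: skip any buffer record that is followed by a cancel
     * for the same block, since those blocks may have been reused. */
    for (i = 0; i < n; i++) {
        int cancelled = 0;

        if (log[i].cancel)
            continue;
        for (j = i + 1; j < n; j++)
            if (log[j].blkno == log[i].blkno && log[j].cancel)
                cancelled = 1;
        printf("block %d: %s\n", log[i].blkno,
               cancelled ? "skipped (cancelled)" : "replayed");
    }
    return 0;
}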
+ * There is also special processing during recovery, + * any replay of the inodes in the buffer needs to be + * prevented as the buffer may have been reused. + */ +void +xfs_trans_stale_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_STALE_INODE; + bip->bli_item.li_cb = xfs_buf_iodone; +} + +/* + * Mark the buffer as being one which contains newly allocated + * inodes. We need to make sure that even if this buffer is + * relogged as an 'inode buf' we still recover all of the inode + * images in the face of a crash. This works in coordination with + * xfs_buf_item_committed() to ensure that the buffer remains in the + * AIL at its original location even after it has been relogged. + */ +/* ARGSUSED */ +void +xfs_trans_inode_alloc_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; +} + + +/* + * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of + * dquots. However, unlike in inode buffer recovery, dquot buffers get + * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). + * The only thing that makes dquot buffers different from regular + * buffers is that we must not replay dquot bufs when recovering + * if a _corresponding_ quotaoff has happened. We also have to distinguish + * between usr dquot bufs and grp dquot bufs, because usr and grp quotas + * can be turned off independently. + */ +/* ARGSUSED */ +void +xfs_trans_dquot_buf( + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_format.blf_flags |= type; +} diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c new file mode 100644 index 00000000..f7590f5b --- /dev/null +++ b/fs/xfs/xfs_trans_extfree.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + +/* + * This routine is called to allocate an "extent free intention" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efi_log_item_t * +xfs_trans_get_efi(xfs_trans_t *tp, + uint nextents) +{ + xfs_efi_log_item_t *efip; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efip = xfs_efi_init(tp->t_mountp, nextents); + ASSERT(efip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efip->efi_item); + return efip; +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as needing to be freed. It should + * be called once for each extent to be freed. + */ +void +xfs_trans_log_efi_extent(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + uint next_extent; + xfs_extent_t *extp; + + tp->t_flags |= XFS_TRANS_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &(efip->efi_format.efi_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; +} + + +/* + * This routine is called to allocate an "extent free done" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efd_log_item_t * +xfs_trans_get_efd(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + uint nextents) +{ + xfs_efd_log_item_t *efdp; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efdp = xfs_efd_init(tp->t_mountp, efip, nextents); + ASSERT(efdp != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as having been freed. It should + * be called once for each extent freed. + */ +void +xfs_trans_log_efd_extent(xfs_trans_t *tp, + xfs_efd_log_item_t *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + uint next_extent; + xfs_extent_t *extp; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; + efdp->efd_next_extent++; +} diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c new file mode 100644 index 00000000..048b0c68 --- /dev/null +++ b/fs/xfs/xfs_trans_inode.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +#ifdef XFS_TRANS_DEBUG +STATIC void +xfs_trans_inode_broot_debug( + xfs_inode_t *ip); +#else +#define xfs_trans_inode_broot_debug(ip) +#endif + +/* + * Add a locked inode to the transaction. + * + * The inode must be locked, and it cannot be associated with any transaction. + */ +void +xfs_trans_ijoin( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_transp == NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (ip->i_itemp == NULL) + xfs_inode_item_init(ip, ip->i_mount); + iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &iip->ili_item); + + xfs_trans_inode_broot_debug(ip); + + /* + * Initialize i_transp so we can find it with xfs_inode_incore() + * in xfs_trans_iget() above. + */ + ip->i_transp = tp; +} + +/* + * Add a locked inode to the transaction. + * + * + * Grabs a reference to the inode which will be dropped when the transaction + * is committed. The inode will also be unlocked at that point. The inode + * must be locked, and it cannot be associated with any transaction. + */ +void +xfs_trans_ijoin_ref( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint lock_flags) +{ + xfs_trans_ijoin(tp, ip); + IHOLD(ip); + ip->i_itemp->ili_lock_flags = lock_flags; +} + +/* + * Transactional inode timestamp update. Requires the inode to be locked and + * joined to the transaction supplied. Relies on the transaction subsystem to + * track dirty state and update/writeback the inode accordingly. + */ +void +xfs_trans_ichgtime( + struct xfs_trans *tp, + struct xfs_inode *ip, + int flags) +{ + struct inode *inode = VFS_I(ip); + timespec_t tv; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_transp == tp); + + tv = current_fs_time(inode->i_sb); + + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { + inode->i_mtime = tv; + } + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { + inode->i_ctime = tv; + } +} + +/* + * This is called to mark the fields indicated in fieldmask as needing + * to be logged when the transaction is committed. The inode must + * already be associated with the given transaction. + * + * The values for fieldmask are defined in xfs_inode_item.h. 
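xfs_trans_ichgtime() above only assigns a new mtime/ctime when the value actually differs, so an unchanged timestamp does not dirty the inode. A small stand-alone version of that check, with an invented update_time() helper:

#include <stdio.h>
#include <time.h>

/* Update *field to *now only if it actually changed, and report whether
 * anything was modified, mirroring the timespec_equal() checks above. */
static int update_time(struct timespec *field, const struct timespec *now)
{
    if (field->tv_sec == now->tv_sec && field->tv_nsec == now->tv_nsec)
        return 0;
    *field = *now;
    return 1;
}

int main(void)
{
    struct timespec mtime = { 0, 0 }, now;

    clock_gettime(CLOCK_REALTIME, &now);
    printf("first update changed mtime:  %d\n", update_time(&mtime, &now));
    printf("second update changed mtime: %d\n", update_time(&mtime, &now));
    return 0;
}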
We always + * log all of the core inode if any of it has changed, and we always log + * all of the inline data/extents/b-tree root if any of them has changed. + */ +void +xfs_trans_log_inode( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint flags) +{ + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + tp->t_flags |= XFS_TRANS_DIRTY; + ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * Always OR in the bits from the ili_last_fields field. + * This is to coordinate with the xfs_iflush() and xfs_iflush_done() + * routines in the eventual clearing of the ilf_fields bits. + * See the big comment in xfs_iflush() for an explanation of + * this coordination mechanism. + */ + flags |= ip->i_itemp->ili_last_fields; + ip->i_itemp->ili_format.ilf_fields |= flags; +} + +#ifdef XFS_TRANS_DEBUG +/* + * Keep track of the state of the inode btree root to make sure we + * log it properly. + */ +STATIC void +xfs_trans_inode_broot_debug( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_itemp != NULL); + iip = ip->i_itemp; + if (iip->ili_root_size != 0) { + ASSERT(iip->ili_orig_root != NULL); + kmem_free(iip->ili_orig_root); + iip->ili_root_size = 0; + iip->ili_orig_root = NULL; + } + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ASSERT((ip->i_df.if_broot != NULL) && + (ip->i_df.if_broot_bytes > 0)); + iip->ili_root_size = ip->i_df.if_broot_bytes; + iip->ili_orig_root = + (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); + memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), + iip->ili_root_size); + } +} +#endif diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h new file mode 100644 index 00000000..fe2e3cbc --- /dev/null +++ b/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. 
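The AIL cursor scheme described above invalidates any registered cursor that points at an item being removed, forcing that traversal to restart instead of following a stale pointer. A stand-alone sketch of the same idea, with invented cursor_register() and cursor_clear_item() helpers:

#include <stddef.h>
#include <stdio.h>

/* Toy traversal cursor: remembers the next item to visit.  When that item
 * is deleted, the cursor is cleared so the scan restarts from the head. */
struct cursor {
    struct cursor *next;        /* list of active cursors */
    int *item;                  /* position in the scanned data, or NULL */
};

static struct cursor *cursors;  /* all cursors currently registered */

static void cursor_register(struct cursor *cur, int *item)
{
    cur->item = item;
    cur->next = cursors;
    cursors = cur;
}

/* Called whenever an item is removed from the scanned structure. */
static void cursor_clear_item(int *item)
{
    struct cursor *cur;

    for (cur = cursors; cur != NULL; cur = cur->next)
        if (cur->item == item)
            cur->item = NULL;   /* traversal must restart */
}

int main(void)
{
    int items[3] = { 10, 20, 30 };
    struct cursor cur = { NULL, NULL };

    cursor_register(&cur, &items[1]);   /* paused at the middle item */
    cursor_clear_item(&items[1]);       /* that item gets deleted */
    printf("cursor %s\n", cur.item ? "still valid" : "invalidated, restart scan");
    return 0;
}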
If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct xfs_ail_cursor *next; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *xa_mount; + struct task_struct *xa_task; + struct list_head xa_ail; + xfs_lsn_t xa_target; + struct xfs_ail_cursor xa_cursors; + spinlock_t xa_lock; + xfs_lsn_t xa_last_pushed_lsn; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +void xfs_trans_unlocked_item(struct xfs_ail *, + xfs_log_item_t *); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h new file mode 100644 index 00000000..7d2c920d --- /dev/null +++ b/fs/xfs/xfs_trans_space.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_SPACE_H__ +#define __XFS_TRANS_SPACE_H__ + +/* + * Components of space reservations. + */ +#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ + (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) +#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) +#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_DBS(mp,w) \ + (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) +#define XFS_DAENTER_BLOCKS(mp,w) \ + (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w)) +#define XFS_DAENTER_BMAP1B(mp,w) \ + XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w) +#define XFS_DAENTER_BMAPS(mp,w) \ + (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w)) +#define XFS_DAENTER_SPACE_RES(mp,w) \ + (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w)) +#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w) +#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1 +#define XFS_DIRENTER_SPACE_RES(mp,nl) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \ + XFS_DIRENTER_MAX_SPLIT(mp,nl)) +#define XFS_DIRREMOVE_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) +#define XFS_IALLOC_SPACE_RES(mp) \ + (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + +/* + * Space reservation values for various transactions. + */ +#define XFS_ADDAFORK_SPACE_RES(mp) \ + ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) +#define XFS_ATTRRM_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) +/* This macro is not used - see inline code in xfs_attr_set */ +#define XFS_ATTRSET_SPACE_RES(mp, v) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) +#define XFS_CREATE_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_DIOSTRAT_SPACE_RES(mp, v) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) +#define XFS_GROWFS_SPACE_RES(mp) \ + (2 * XFS_AG_MAXLEVELS(mp)) +#define XFS_GROWFSRT_SPACE_RES(mp,b) \ + ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) +#define XFS_LINK_SPACE_RES(mp,nl) \ + XFS_DIRENTER_SPACE_RES(mp,nl) +#define XFS_MKDIR_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_QM_DQALLOC_SPACE_RES(mp) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ + XFS_DQUOT_CLUSTER_SIZE_FSB) +#define XFS_QM_QINOCREATE_SPACE_RES(mp) \ + XFS_IALLOC_SPACE_RES(mp) +#define XFS_REMOVE_SPACE_RES(mp) \ + XFS_DIRREMOVE_SPACE_RES(mp) +#define XFS_RENAME_SPACE_RES(mp,nl) \ + (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) + +#endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h new file mode 100644 index 00000000..65584b55 --- /dev/null +++ b/fs/xfs/xfs_types.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
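Most of the reservation macros above reduce to round-up division: the number of extents being added is divided, rounding up, into groups that can share one worst-case btree split. The sketch below walks that arithmetic with invented geometry values (the mxr/mnr and level numbers are placeholders, not real filesystem parameters):

#include <stdio.h>

/* Generic round-up division: how many groups of size n cover b items. */
static unsigned ceil_div(unsigned b, unsigned n)
{
    return (b + n - 1) / n;
}

int main(void)
{
    /* Invented stand-ins for (mp)->m_alloc_mxr[0] and (mp)->m_alloc_mnr[0]. */
    unsigned alloc_mxr0 = 340, alloc_mnr0 = 170;
    unsigned max_contig = alloc_mxr0 - alloc_mnr0;  /* extents sharing one block */
    unsigned extentadd  = 8;      /* stand-in for XFS_BM_MAXLEVELS(mp,w) - 1 */
    unsigned nextents   = 1000;   /* extents this operation may add */

    /* Mirrors the XFS_NEXTENTADD_SPACE_RES shape: blocks reserved for btree growth. */
    unsigned res = ceil_div(nextents, max_contig) * extentadd;

    printf("%u new extents -> reserve %u blocks\n", nextents, res);
    return 0;
}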
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TYPES_H__ +#define __XFS_TYPES_H__ + +#ifdef __KERNEL__ + +/* + * Additional type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef enum { B_FALSE,B_TRUE } boolean_t; +typedef __uint32_t prid_t; /* project ID */ +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* type */ +typedef unsigned long long xfs_ino_t; /* type */ +typedef __s64 xfs_daddr_t; /* type */ +typedef char * xfs_caddr_t; /* type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + +#endif /* __KERNEL__ */ + +typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef __uint32_t xfs_agnumber_t; /* allocation group number */ +typedef __int32_t xfs_extnum_t; /* # of extents in a file */ +typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef __int64_t xfs_fsize_t; /* bytes in a file */ +typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ + +typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ + +typedef __int64_t xfs_lsn_t; /* log sequence number */ +typedef __int32_t xfs_tid_t; /* transaction identifier */ + +typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ + +/* + * These types are 64 bits on disk but are either 32 or 64 bits in memory. + * Disk based types: + */ +typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */ +typedef __uint64_t xfs_dfiloff_t; /* block number in a file */ +typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */ + +/* + * Memory based types are conditional. 
+ */ +#if XFS_BIG_BLKNOS +typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +#else +typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +#endif +typedef __uint64_t xfs_fileoff_t; /* block number in a file */ +typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ +typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + +/* + * Null values for the types. + */ +#define NULLDFSBNO ((xfs_dfsbno_t)-1) +#define NULLDRFSBNO ((xfs_drfsbno_t)-1) +#define NULLDRTBNO ((xfs_drtbno_t)-1) +#define NULLDFILOFF ((xfs_dfiloff_t)-1) + +#define NULLFSBLOCK ((xfs_fsblock_t)-1) +#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) +#define NULLRTBLOCK ((xfs_rtblock_t)-1) +#define NULLFILEOFF ((xfs_fileoff_t)-1) + +#define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLEXTNUM ((xfs_extnum_t)-1) + +#define NULLCOMMITLSN ((xfs_lsn_t)-1) + +/* + * Max values for extlen, extnum, aextnum. + */ +#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ +#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ +#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ + +/* + * Min numbers of data/attr fork btree root pointers. + */ +#define MINDBTPTRS 3 +#define MINABTPTRS 2 + +/* + * MAXNAMELEN is the length (including the terminating null) of + * the longest permissible file (component) name. + */ +#define MAXNAMELEN 256 + +typedef enum { + XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi +} xfs_lookup_t; + +typedef enum { + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, + XFS_BTNUM_MAX +} xfs_btnum_t; + +struct xfs_name { + const unsigned char *name; + int len; +}; + +#endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c new file mode 100644 index 00000000..8b32d1a4 --- /dev/null +++ b/fs/xfs/xfs_utils.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_itable.h" +#include "xfs_utils.h" + + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + mode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) + +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + boolean_t call_again = B_FALSE; + int code; + uint log_res; + uint log_count; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &call_again, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!call_again && (ip == NULL)) { + *ipp = NULL; + return XFS_ERROR(ENOSPC); + } + + /* + * If call_again is set, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (call_again) { + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. 
+ */ + log_res = xfs_trans_get_log_res(tp); + log_count = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + code = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &call_again, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT ((!call_again) && (ip != NULL)); + + } else { + if (committed != NULL) { + *committed = 0; + } + } + + *ipp = ip; + *tpp = tp; + + return 0; +} + +/* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * This gets called when the inode's version needs to be changed from 1 to 2. + * Currently this happens when the nlink field overflows the old 16-bit value + * or when chproj is called to change the project for the first time. + * As a side effect the superblock version will also get rev'd + * to contain the NLINK bit. 
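xfs_dir_ialloc() above may be told by xfs_ialloc() to commit and call again once the inode freelist has been replenished, holding the freelist buffer across the commit so nobody else can take the new inodes. The stand-alone sketch below shows only the call-again control flow, with an invented toy_alloc() in place of the real allocator:

#include <stdio.h>

/* Toy allocator that sometimes has to refill a free list first.  The first
 * call may set need_retry, in which case the caller "commits" and calls
 * again; the second call then succeeds from the refilled list. */
static int freelist = 0;

static void toy_alloc(int *out, int *need_retry)
{
    if (freelist == 0) {
        freelist = 4;        /* refill; caller must retry in a fresh step */
        *need_retry = 1;
        return;
    }
    freelist--;
    *out = 42;               /* the "inode" we handed out */
    *need_retry = 0;
}

int main(void)
{
    int ino = -1, need_retry = 0;

    toy_alloc(&ino, &need_retry);
    if (need_retry) {
        printf("freelist was refilled; committing and retrying\n");
        toy_alloc(&ino, &need_retry);
    }
    printf("allocated %d (retry needed again: %d)\n", ino, need_retry);
    return 0;
}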
+ */ +void +xfs_bump_ino_vers2( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_d.di_version == 1); + + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + mp = tp->t_mountp; + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + xfs_sb_version_addnlink(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM); + } else { + spin_unlock(&mp->m_sb_lock); + } + } + /* Caller must log the inode */ +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + if (ip->i_d.di_nlink >= XFS_MAXLINK) + return XFS_ERROR(EMLINK); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_nlink > 0); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + if ((ip->i_d.di_version == 1) && + (ip->i_d.di_nlink > XFS_MAXLINK_1)) { + /* + * The inode has increased its number of links beyond + * what can fit in an old format inode. It now needs + * to be converted to a version 2 inode with a 32 bit + * link count. If this is the first inode in the file + * system to do this, then we need to bump the superblock + * version number as well. + */ + xfs_bump_ino_vers2(tp, ip); + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h new file mode 100644 index 00000000..456fca31 --- /dev/null +++ b/fs/xfs/xfs_utils.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_UTILS_H__ +#define __XFS_UTILS_H__ + +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, + xfs_dev_t, prid_t, int, xfs_inode_t **, int *); +extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); +extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); +extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); + +#endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c new file mode 100644 index 00000000..59509ae0 --- /dev/null +++ b/fs/xfs/xfs_vnodeops.c @@ -0,0 +1,2852 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_itable.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_rw.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_rtalloc.h" +#include "xfs_trans_space.h" +#include "xfs_log_priv.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +int +xfs_setattr( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + xfs_trans_t *tp; + int code; + uint lock_flags; + uint commit_flags=0; + uid_t uid=0, iuid=0; + gid_t gid=0, igid=0; + struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; + int need_iolock = 1; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + code = -inode_change_ok(inode, iattr); + if (code) + return code; + + olddquot1 = olddquot2 = NULL; + udqp = gdqp = NULL; + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { + uint qflags = 0; + + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = ip->i_d.di_uid; + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = ip->i_d.di_gid; + } + + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), + qflags, &udqp, &gdqp); + if (code) + return code; + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = NULL; + lock_flags = XFS_ILOCK_EXCL; + if (flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (!(mask & ATTR_SIZE)) { + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + commit_flags = 0; + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), + 0, 0, 0); + if (code) { + lock_flags = 0; + goto error_return; + } + } else { + if (need_iolock) + lock_flags |= XFS_IOLOCK_EXCL; + } + + xfs_ilock(ip, lock_flags); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * These IDs could have changed since we last looked at them. 
+ * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = ip->i_d.di_uid; + igid = ip->i_d.di_gid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; + + /* + * Do a quota reservation only if uid/gid is actually + * going to change. + */ + if (XFS_IS_QUOTA_RUNNING(mp) && + ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || + (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { + ASSERT(tp); + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + /* + * Truncate file. Must have write permission and not be a directory. + */ + if (mask & ATTR_SIZE) { + /* Short circuit the truncate case for zero length files */ + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; + if (mask & ATTR_CTIME) { + inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + xfs_mark_inode_dirty_sync(ip); + } + code = 0; + goto error_return; + } + + if (S_ISDIR(ip->i_d.di_mode)) { + code = XFS_ERROR(EISDIR); + goto error_return; + } else if (!S_ISREG(ip->i_d.di_mode)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + /* + * Make sure that the dquots are attached to the inode. + */ + code = xfs_qm_dqattach_locked(ip, 0); + if (code) + goto error_return; + + /* + * Now we can make the changes. Before we join the inode + * to the transaction, if ATTR_SIZE is set then take care of + * the part of the truncation that must be done without the + * inode lock. This needs to be done before joining the inode + * to the transaction, because the inode cannot be unlocked + * once it is a part of the transaction. + */ + if (iattr->ia_size > ip->i_size) { + /* + * Do the first part of growing a file: zero any data + * in the last block that is beyond the old EOF. We + * need to do this before the inode is joined to the + * transaction to modify the i_size. + */ + code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + if (code) + goto error_return; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; + + /* + * We are going to log the inode size change in this + * transaction so any previous writes that are beyond the on + * disk EOF and the new EOF that have not been written out need + * to be written here. If we do not write the data out, we + * expose ourselves to the null files problem. + * + * Only flush from the on disk size to the smaller of the in + * memory file size or the new size as that's the range we + * really care about here and prevents waiting for other data + * not within the range we care about here. 
+ */ + if (ip->i_size != ip->i_d.di_size && + iattr->ia_size > ip->i_d.di_size) { + code = xfs_flush_pages(ip, + ip->i_d.di_size, iattr->ia_size, + XBF_ASYNC, FI_NONE); + if (code) + goto error_return; + } + + /* wait for all I/O to complete */ + xfs_ioend_wait(ip); + + code = -block_truncate_page(inode->i_mapping, iattr->ia_size, + xfs_get_blocks); + if (code) + goto error_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (code) + goto error_return; + + truncate_setsize(inode, iattr->ia_size); + + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + lock_flags |= XFS_ILOCK_EXCL; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip); + + /* + * Only change the c/mtime if we are changing the size + * or we are explicitly asked to change it. This handles + * the semantic difference between truncate() and ftruncate() + * as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME + * is a special case where we need to update the times despite + * not having these flags set. For all other operations the + * VFS set these flags explicitly if it wants a timestamp + * update. + */ + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + mask |= ATTR_CTIME | ATTR_MTIME; + } + + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { + /* + * signal a sync transaction unless + * we're truncating an already unlinked + * file on a wsync filesystem + */ + code = xfs_itruncate_finish(&tp, ip, iattr->ia_size, + XFS_DATA_FORK, + ((ip->i_d.di_nlink != 0 || + !(mp->m_flags & XFS_MOUNT_WSYNC)) + ? 1 : 0)); + if (code) + goto abort_return; + /* + * Truncated "down", so we're removing references + * to old data here - if we now delay flushing for + * a long time, we expose ourselves unduly to the + * notorious NULL files problem. So, we mark this + * vnode and flush it when the file is closed, and + * do not wait the usual (long) time for writeout. + */ + xfs_iflags_set(ip, XFS_ITRUNCATED); + } + } else if (tp) { + xfs_trans_ijoin(tp, ip); + } + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) { + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + } + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (iuid != uid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & ATTR_UID); + ASSERT(udqp); + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = uid; + inode->i_uid = uid; + } + if (igid != gid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(mask & ATTR_GID); + ASSERT(gdqp); + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = gid; + inode->i_gid = gid; + } + } + + /* + * Change file access modes. 
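The chown handling above clears the setuid and setgid bits unless the caller has CAP_FSETID. The same mode fix-up as a small stand-alone program (chown_fixup_mode is an invented helper name):

#include <stdio.h>
#include <sys/stat.h>

/* Strip the setuid/setgid bits on ownership change unless the caller is
 * allowed to keep them (the CAP_FSETID case above). */
static mode_t chown_fixup_mode(mode_t mode, int has_fsetid)
{
    if (!has_fsetid)
        mode &= ~(S_ISUID | S_ISGID);
    return mode;
}

int main(void)
{
    mode_t mode = S_IFREG | S_ISUID | 0755;

    printf("without CAP_FSETID: %o\n", (unsigned)chown_fixup_mode(mode, 0));
    printf("with CAP_FSETID:    %o\n", (unsigned)chown_fixup_mode(mode, 1));
    return 0;
}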
+ */ + if (mask & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; + } + + /* + * Change file access or modified times. + */ + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; + } + + /* + * And finally, log the inode core if any attribute in it + * has been changed. + */ + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| + ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. + */ + code = 0; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + code = xfs_trans_commit(tp, commit_flags); + + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot1); + xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (code) + return code; + + /* + * XXX(hch): Updating the ACL entries is not atomic vs the i_mode + * update. We could avoid this with linked transactions + * and passing down the transaction pointer all the way + * to attr_set. No previous user of the generic + * Posix ACL code seems to care about this issue either. + */ + if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { + code = -xfs_acl_chmod(inode); + if (code) + return XFS_ERROR(code); + } + + return 0; + + abort_return: + commit_flags |= XFS_TRANS_ABORT; + error_return: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + if (tp) { + xfs_trans_cancel(tp, commit_flags); + } + if (lock_flags != 0) { + xfs_iunlock(ip, lock_flags); + } + return code; +} + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 2 extents back from + * bmapi. 
+ */ +#define SYMLINK_MAPS 2 + +STATIC int +xfs_readlink_bmap( + xfs_inode_t *ip, + char *link) +{ + xfs_mount_t *mp = ip->i_mount; + int pathlen = ip->i_d.di_size; + int nmaps = SYMLINK_MAPS; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + xfs_daddr_t d; + int byte_cnt; + int n; + xfs_buf_t *bp; + int error = 0; + + error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, + mval, &nmaps, NULL); + if (error) + goto out; + + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); + error = XFS_BUF_GETERROR(bp); + if (error) { + xfs_ioerror_alert("xfs_readlink", + ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_relse(bp); + goto out; + } + if (pathlen < byte_cnt) + byte_cnt = pathlen; + pathlen -= byte_cnt; + + memcpy(link, XFS_BUF_PTR(bp), byte_cnt); + xfs_buf_relse(bp); + } + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + xfs_inode_t *ip, + char *link) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = XFS_ERROR(EFSCORRUPTED); + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Flags for xfs_free_eofblocks + */ +#define XFS_FREE_EOF_TRYLOCK (1<<0) + +/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. + */ +STATIC int +xfs_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip, + int flags) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); + last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + if (last_fsb <= end_fsb) + return 0; + map_len = last_fsb - end_fsb; + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, + NULL, 0, &imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { + /* + * Attach the dquots to the inode up front. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * There are blocks after the end of file. + * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * Do the xfs_itruncate_start() call before + * reserving any log space because + * itruncate_start will call into the buffer + * cache and we can't + * do that within a transaction. 
+ */ + if (flags & XFS_FREE_EOF_TRYLOCK) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return 0; + } + } else { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + } + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, + ip->i_size); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + error = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, + ip->i_size, + XFS_DATA_FORK, + 0); + /* + * If we get an error at this point we + * simply don't bother truncating the file. + */ + if (error) { + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); + } + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + int nmaps; + xfs_trans_t *ntp; + int size; + xfs_trans_t *tp; + + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + *tpp = NULL; + return error; + } + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_ijoin(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), + XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, + &free_list))) + goto error0; + /* + * Invalidate the block(s). + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done))) + goto error1; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) + goto error1; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. 
+ */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Get a new, empty transaction to return to our caller. + */ + ntp = xfs_trans_dup(tp); + /* + * Commit the transaction containing extent freeing and EFDs. + * If we get an error on the commit here or on the reserve below, + * we need to unlock the inode since the new transaction doesn't + * have the inode attached. + */ + error = xfs_trans_commit(tp, 0); + tp = ntp; + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + /* + * Put an itruncate log reservation in the new transaction + * for our caller. + */ + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * Return with the inode locked but not joined to the transaction. + */ + *tpp = tp; + return 0; + + error1: + xfs_bmap_cancel(&free_list); + error0: + /* + * Have to come here with the inode locked and either + * (held and in the transaction) or (not in the transaction). + * If the inode isn't held then cancel would iput it, but + * that's wrong since this is inactive and the vnode ref + * count is 0 already. + * Cancel won't do anything to the inode if held, but it still + * needs to be locked until the cancel is done, if it was + * joined to the transaction. + */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + *tpp = NULL; + return error; + +} + +STATIC int +xfs_inactive_symlink_local( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + int error; + + ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink which fit into + * the inode. Just free the memory used + * to hold the old symlink. + */ + error = xfs_trans_reserve(*tpp, 0, + XFS_ITRUNCATE_LOG_RES(ip->i_mount), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + + if (error) { + xfs_trans_cancel(*tpp, 0); + *tpp = NULL; + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. 
+ */ + if (ip->i_df.if_bytes > 0) { + xfs_idata_realloc(ip, + -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + } + return 0; +} + +STATIC int +xfs_inactive_attrs( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_trans_t *tp; + int error; + xfs_mount_t *mp; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_forkoff != 0); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_unlock; + + error = xfs_attr_inactive(ip); + if (error) + goto error_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) + goto error_cancel; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + + *tpp = tp; + return 0; + +error_cancel: + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); +error_unlock: + *tpp = NULL; + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we are using filestreams, and we have an unlinked + * file that we are processing the last close on, then nothing + * will be able to reopen and write to this file. Purge this + * inode from the filestreams cache so that it doesn't delay + * teardown of the inode. + */ + if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) + xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. 
Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +int +xfs_inactive( + xfs_inode_t *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error; + int truncate; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return VN_INACTIVE_CACHE; + } + + /* + * Only do a truncate if it's a regular file with + * some actual space in it. It's OK to look at the + * inode's fields without the lock because we're the + * only one with a reference to the inode. + */ + truncate = ((ip->i_d.di_nlink == 0) && + ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && + ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + + mp = ip->i_mount; + + error = 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + goto out; + + if (ip->i_d.di_nlink != 0) { + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0)))) { + error = xfs_free_eofblocks(mp, ip, 0); + if (error) + return VN_INACTIVE_CACHE; + } + goto out; + } + + ASSERT(ip->i_d.di_nlink == 0); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return VN_INACTIVE_CACHE; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + if (truncate) { + /* + * Do the xfs_itruncate_start() call before + * reserving any log space because itruncate_start + * will call into the buffer cache and we can't + * do that within a transaction. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return VN_INACTIVE_CACHE; + } + + error = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + /* Don't call itruncate_cleanup */ + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return VN_INACTIVE_CACHE; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + /* + * normally, we have to run xfs_itruncate_finish sync. + * But if filesystem is wsync and we're in the inactive + * path, then we know that nlink == 0, and that the + * xaction that made nlink == 0 is permanently committed + * since xfs_remove runs as a synchronous transaction. 
+ */ + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, + (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0)); + + if (error) { + xfs_trans_cancel(tp, + XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + return VN_INACTIVE_CACHE; + } + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + + /* + * If we get an error while cleaning up a + * symlink we bail out. + */ + error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ? + xfs_inactive_symlink_rmt(ip, &tp) : + xfs_inactive_symlink_local(ip, &tp); + + if (error) { + ASSERT(tp == NULL); + return VN_INACTIVE_CACHE; + } + + xfs_trans_ijoin(tp, ip); + } else { + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return VN_INACTIVE_CACHE; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip); + } + + /* + * If there are attributes associated with the file + * then blow them away now. The code calls a routine + * that recursively deconstructs the attribute fork. + * We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + error = xfs_inactive_attrs(ip, &tp); + /* + * If we got an error, the transaction is already + * cancelled, and the inode is unlocked. Just get out. + */ + if (error) + return VN_INACTIVE_CACHE; + } else if (ip->i_afp) { + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + } + + /* + * Free the inode. + */ + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } else { + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + } + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + + out: + return VN_INACTIVE_CACHE; +} + +/* + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. 
+ */ +int +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) +{ + xfs_ino_t inum; + int error; + uint lock_mode; + + trace_xfs_lookup(dp, name); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + lock_mode = xfs_ilock_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock_map_shared(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; + + return 0; + +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; + return error; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + mode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + boolean_t unlock_dp_on_error = B_FALSE; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + uint resblks; + uint log_res; + uint log_count; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + log_res = XFS_MKDIR_LOG_RES(mp); + log_count = XFS_MKDIR_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + log_res = XFS_CREATE_LOG_RES(mp); + log_count = XFS_CREATE_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } + + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(dp); + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + } + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; + + /* + * Check for directory link count overflow. + */ + if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto out_trans_cancel; + } + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_canenter(tp, dp, name, resblks); + if (error) + goto out_trans_cancel; + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 
2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = B_FALSE; + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != ENOSPC); + goto out_trans_abort; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +#ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + +/* + * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value + */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) +{ + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + + return lock_mode; +} + +/* + * The following routine will lock n inodes in exclusive mode. + * We assume the caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock + * is in the AIL and we start waiting for another inode that is locked + * by a thread in a long running transaction (such as truncate). This can + * result in deadlock since the long running trans might need to wait + * for the inode we just locked in order to push the tail and free space + * in the log. 
+ */ +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) +{ + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; + + ASSERT(ips && (inodes >= 2)); /* we need at least two */ + + try_lock = 0; + i = 0; + +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + + if (i && (ips[i] == ips[i-1])) /* Already locked */ + continue; + + /* + * If try_lock is not set yet, make sure all locked inodes + * are not in the AIL. + * If any are, set try_lock to be used later. + */ + + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + try_lock++; + } + } + } + + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ + + if (try_lock) { + /* try_lock must be 0 if i is 0. */ + /* + * try_lock means we have an inode locked + * that is in the AIL. + */ + ASSERT(i != 0); + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { + attempts++; + + /* + * Unlock all previous guys and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ + + for(j = i - 1; j >= 0; j--) { + + /* + * Check to see if we've already + * unlocked this one. + * Not the first one going back, + * and the inode ptr is the same. + */ + if ((j != (i - 1)) && ips[j] == + ips[j+1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; + } + } else { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + } + } + +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif +} + +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. 
+ */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } +} + +int +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) +{ + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int link_zero; + uint resblks; + uint log_count; + + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; + + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; + } else { + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, log_count); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, log_count); + } + if (error) { + ASSERT(error != ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + } + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != ENOENT); + goto out_bmap_cancel; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (is_dir) { + /* + * Drop the link from ip's "..". + */ + error = xfs_droplink(tp, dp); + if (error) + goto out_bmap_cancel; + + /* + * Drop the "." link from ip to self. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + + /* + * Drop the link from dp to ip. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + + /* + * Determine if this is the last link while + * we are in the transaction. + */ + link_zero = (ip->i_d.di_nlink == 0); + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; + + /* + * If we are using filestreams, kill the stream association. + * If the file is still open it may get a new one but that + * will get killed on last close in xfs_close() so we don't + * have to worry about that. + */ + if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If the source has too many links, we can't make any more to it. + */ + if (sip->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto error_return; + } + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) + goto error_return; + + xfs_bmap_init(&free_list, &first_block); + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +int +xfs_symlink( + xfs_inode_t *dp, + struct xfs_name *link_name, + const char *target_path, + mode_t mode, + xfs_inode_t **ipp) +{ + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp; + xfs_inode_t *ip; + int error; + int pathlen; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + boolean_t unlock_dp_on_error = B_FALSE; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp, *gdqp; + uint resblks; + + *ipp = NULL; + error = 0; + ip = NULL; + tp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + + udqp = gdqp = NULL; + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp)) + fs_blocks = 0; + else + fs_blocks = XFS_B_TO_FSB(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. 
+ */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = B_FALSE; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + first_fsb = 0; + nmaps = SYMLINK_MAPS; + + error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, + &first_block, resblks, mval, &nmaps, + &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + if (pathlen < byte_cnt) { + byte_cnt = pathlen; + } + pathlen -= byte_cnt; + + memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); + cur_chunk += byte_cnt; + + xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); + } + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + +/* + * xfs_alloc_file_space() + * This routine allocates disk space for the given file. + * + * If alloc_type == 0, this request is for an ALLOCSP type + * request which will change the file size. In this case, no + * DMAPI event will be generated by the call. A TRUNCATE event + * will be generated later by xfs_setattr. + * + * If alloc_type != 0, this request is for a RESVSP type + * request, and a DMAPI DM_EVENT_WRITE will be generated if the + * lower block boundary byte address is less than the file's + * length. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_alloc_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type, + int attr_flags) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int bmapi_flag; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return XFS_ERROR(EINVAL); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + bmapi_flag = XFS_BMAPI_WRITE | alloc_type; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. 
+ */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), resrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip); + + /* + * Issue the xfs_bmapi() call to allocate the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi(tp, ip, startoffset_fsb, + allocatesize_fsb, bmapi_flag, + &firstfsb, 0, imapp, &nimaps, + &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = XFS_ERROR(ENOSPC); + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. 
The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + + bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); + if (!bp) + return XFS_ERROR(ENOMEM); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, + NULL, 0, &imap, &nimap, NULL); + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + XFS_BUF_UNDONE(bp); + XFS_BUF_UNWRITE(bp); + XFS_BUF_READ(bp); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", + mp, bp, XFS_BUF_ADDR(bp)); + break; + } + memset(XFS_BUF_PTR(bp) + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + XFS_BUF_UNDONE(bp); + XFS_BUF_UNREAD(bp); + XFS_BUF_WRITE(bp); + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", + mp, bp, XFS_BUF_ADDR(bp)); + break; + } + } + xfs_buf_free(bp); + return error; +} + +/* + * xfs_free_file_space() + * This routine frees disk space for the given file. + * + * This routine is only called by xfs_change_file_space + * for an UNRESVSP type call. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_free_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + uint rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + int need_iolock = 1; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + if (attr_flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + /* wait for the completion of any pending DIOs */ + xfs_ioend_wait(ip); + } + + rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = offset & ~(rounding - 1); + + if (VN_CACHED(VFS_I(ip)) != 0) { + error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_iolock; + } + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. 
+ */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi(NULL, ip, startoffset_fsb, + 1, 0, NULL, 0, &imap, &nimap, NULL); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, + 1, 0, NULL, 0, &imap, &nimap, NULL); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, + resblks, + XFS_WRITE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out_unlock_iolock: + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : + XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_change_file_space() + * This routine allocates or frees disk space for the given file. + * The user specified parameters are checked for alignment and size + * limitations. 
+ * + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_change_file_space( + xfs_inode_t *ip, + int cmd, + xfs_flock64_t *bf, + xfs_off_t offset, + int attr_flags) +{ + xfs_mount_t *mp = ip->i_mount; + int clrprealloc; + int error; + xfs_fsize_t fsize; + int setprealloc; + xfs_off_t startoffset; + xfs_off_t llen; + xfs_trans_t *tp; + struct iattr iattr; + int prealloc_type; + + if (!S_ISREG(ip->i_d.di_mode)) + return XFS_ERROR(EINVAL); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += offset; + break; + case 2: /*SEEK_END*/ + bf->l_start += ip->i_size; + break; + default: + return XFS_ERROR(EINVAL); + } + + llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + + if ( (bf->l_start < 0) + || (bf->l_start > XFS_MAXIOFFSET(mp)) + || (bf->l_start + llen < 0) + || (bf->l_start + llen > XFS_MAXIOFFSET(mp))) + return XFS_ERROR(EINVAL); + + bf->l_whence = 0; + + startoffset = bf->l_start; + fsize = ip->i_size; + + /* + * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve + * file space. + * These calls do NOT zero the data space allocated to the file, + * nor do they change the file size. + * + * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file + * space. + * These calls cause the new file data to be zeroed and the file + * size to be changed. + */ + setprealloc = clrprealloc = 0; + prealloc_type = XFS_BMAPI_PREALLOC; + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + prealloc_type |= XFS_BMAPI_CONVERT; + xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); + /* FALLTHRU */ + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + error = xfs_alloc_file_space(ip, startoffset, bf->l_len, + prealloc_type, attr_flags); + if (error) + return error; + setprealloc = 1; + break; + + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if ((error = xfs_free_file_space(ip, startoffset, bf->l_len, + attr_flags))) + return error; + break; + + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + if (startoffset > fsize) { + error = xfs_alloc_file_space(ip, fsize, + startoffset - fsize, 0, attr_flags); + if (error) + break; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = startoffset; + + error = xfs_setattr(ip, &iattr, attr_flags); + + if (error) + return error; + + clrprealloc = 1; + break; + + default: + ASSERT(0); + return XFS_ERROR(EINVAL); + } + + /* + * update the inode timestamp, mode, and prealloc flag bits + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + + if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), + 0, 0, 0))) { + /* ASSERT(0); */ + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip); + + if ((attr_flags & XFS_ATTR_DMI) == 0) { + ip->i_d.di_mode &= ~S_ISUID; + + /* + * Note that we don't have to worry about mandatory + * file locking being disabled here because we only + * clear the S_ISGID bit if the Group execute bit is + * on, but if it was on then mandatory locking wouldn't + * have been enabled. 
+ */ + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + if (setprealloc) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + else if (clrprealloc) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (attr_flags & XFS_ATTR_SYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h new file mode 100644 index 00000000..3bcd2335 --- /dev/null +++ b/fs/xfs/xfs_vnodeops.h @@ -0,0 +1,63 @@ +#ifndef _XFS_VNODEOPS_H +#define _XFS_VNODEOPS_H 1 + +struct attrlist_cursor_kern; +struct file; +struct iattr; +struct inode; +struct iovec; +struct kiocb; +struct pipe_inode_info; +struct uio; +struct xfs_inode; +struct xfs_iomap; + + +int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); +#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ +#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ +#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ +#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ +#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ + +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_release(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, + xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, + xfs_off_t *offset, filldir_t filldir); +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, mode_t mode, struct xfs_inode **ipp); +int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); +int xfs_change_file_space(struct xfs_inode *ip, int cmd, + xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, struct xfs_inode *target_ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); +int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int flags, struct xfs_iomap *iomapp, int *niomaps); +void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, + xfs_off_t last, int fiopt); +int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, + xfs_off_t last, int fiopt); +int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, + xfs_off_t last, uint64_t flags, int fiopt); +int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); + +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); + +#endif /* _XFS_VNODEOPS_H */ diff --git a/fs/yaffs2/Kconfig b/fs/yaffs2/Kconfig new file mode 100644 index 
00000000..63541405 --- /dev/null +++ b/fs/yaffs2/Kconfig @@ -0,0 +1,161 @@ +# +# YAFFS file system configurations +# + +config YAFFS_FS + tristate "YAFFS2 file system support" + default n + depends on MTD_BLOCK + select YAFFS_YAFFS1 + select YAFFS_YAFFS2 + help + YAFFS2, or Yet Another Flash Filing System, is a filing system + optimised for NAND Flash chips. + + To compile the YAFFS2 file system support as a module, choose M + here: the module will be called yaffs2. + + If unsure, say N. + + Further information on YAFFS2 is available at + . + +config YAFFS_YAFFS1 + bool "512 byte / page devices" + depends on YAFFS_FS + default y + help + Enable YAFFS1 support -- yaffs for 512 byte / page devices + + Not needed for 2K-page devices. + + If unsure, say Y. + +config YAFFS_9BYTE_TAGS + bool "Use older-style on-NAND data format with pageStatus byte" + depends on YAFFS_YAFFS1 + default n + help + + Older-style on-NAND data format has a "pageStatus" byte to record + chunk/page state. This byte is zero when the page is discarded. + Choose this option if you have existing on-NAND data using this + format that you need to continue to support. New data written + also uses the older-style format. Note: Use of this option + generally requires that MTD's oob layout be adjusted to use the + older-style format. See notes on tags formats and MTD versions + in yaffs_mtdif1.c. + + If unsure, say N. + +config YAFFS_DOES_ECC + bool "Lets Yaffs do its own ECC" + depends on YAFFS_FS && YAFFS_YAFFS1 && !YAFFS_9BYTE_TAGS + default n + help + This enables Yaffs to use its own ECC functions instead of using + the ones from the generic MTD-NAND driver. + + If unsure, say N. + +config YAFFS_ECC_WRONG_ORDER + bool "Use the same ecc byte order as Steven Hill's nand_ecc.c" + depends on YAFFS_FS && YAFFS_DOES_ECC && !YAFFS_9BYTE_TAGS + default n + help + This makes yaffs_ecc.c use the same ecc byte order as Steven + Hill's nand_ecc.c. If not set, then you get the same ecc byte + order as SmartMedia. + + If unsure, say N. + +config YAFFS_YAFFS2 + bool "2048 byte (or larger) / page devices" + depends on YAFFS_FS + default y + help + Enable YAFFS2 support -- yaffs for >= 2K bytes per page devices + + If unsure, say Y. + +config YAFFS_AUTO_YAFFS2 + bool "Autoselect yaffs2 format" + depends on YAFFS_YAFFS2 + default y + help + Without this, you need to explicitely use yaffs2 as the file + system type. With this, you can say "yaffs" and yaffs or yaffs2 + will be used depending on the device page size (yaffs on + 512-byte page devices, yaffs2 on 2K page devices). + + If unsure, say Y. + +config YAFFS_DISABLE_TAGS_ECC + bool "Disable YAFFS from doing ECC on tags by default" + depends on YAFFS_FS && YAFFS_YAFFS2 + default n + help + This defaults Yaffs to using its own ECC calculations on tags instead of + just relying on the MTD. + This behavior can also be overridden with tags_ecc_on and + tags_ecc_off mount options. + + If unsure, say N. + +config YAFFS_ALWAYS_CHECK_CHUNK_ERASED + bool "Force chunk erase check" + depends on YAFFS_FS + default n + help + Normally YAFFS only checks chunks before writing until an erased + chunk is found. This helps to detect any partially written + chunks that might have happened due to power loss. + + Enabling this forces on the test that chunks are erased in flash + before writing to them. This takes more time but is potentially + a bit more secure. + + Suggest setting Y during development and ironing out driver + issues etc. Suggest setting to N if you want faster writing. + + If unsure, say Y. 
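As an illustration only (not part of the patch), a typical selection of the options above for a board with 2 KiB-or-larger page NAND, assuming CONFIG_MTD_BLOCK is already enabled, might appear in a kernel .config roughly as follows; the exact values are assumptions based on the defaults stated in the help texts:

CONFIG_MTD_BLOCK=y
CONFIG_YAFFS_FS=y
CONFIG_YAFFS_YAFFS1=y
# CONFIG_YAFFS_9BYTE_TAGS is not set
# CONFIG_YAFFS_DOES_ECC is not set
CONFIG_YAFFS_YAFFS2=y
CONFIG_YAFFS_AUTO_YAFFS2=y
# CONFIG_YAFFS_DISABLE_TAGS_ECC is not set
# CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED is not set

With CONFIG_YAFFS_AUTO_YAFFS2 enabled as above, mounting with the generic "yaffs" filesystem type selects yaffs or yaffs2 according to the device page size, as described in that option's help text.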
+ +config YAFFS_EMPTY_LOST_AND_FOUND + bool "Empty lost and found on boot" + depends on YAFFS_FS + default n + help + If this is enabled then the contents of lost and found is + automatically dumped at mount. + + If unsure, say N. + +config YAFFS_DISABLE_BLOCK_REFRESHING + bool "Disable yaffs2 block refreshing" + depends on YAFFS_FS + default n + help + If this is set, then block refreshing is disabled. + Block refreshing infrequently refreshes the oldest block in + a yaffs2 file system. This mechanism helps to refresh flash to + mitigate against data loss. This is particularly useful for MLC. + + If unsure, say N. + +config YAFFS_DISABLE_BACKGROUND + bool "Disable yaffs2 background processing" + depends on YAFFS_FS + default n + help + If this is set, then background processing is disabled. + Background processing makes many foreground activities faster. + + If unsure, say N. + +config YAFFS_XATTR + bool "Enable yaffs2 xattr support" + depends on YAFFS_FS + default y + help + If this is set then yaffs2 will provide xattr support. + If unsure, say Y. diff --git a/fs/yaffs2/Makefile b/fs/yaffs2/Makefile new file mode 100644 index 00000000..e63a28aa --- /dev/null +++ b/fs/yaffs2/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the linux YAFFS filesystem routines. +# + +obj-$(CONFIG_YAFFS_FS) += yaffs.o + +yaffs-y := yaffs_ecc.o yaffs_vfs.o yaffs_guts.o yaffs_checkptrw.o +yaffs-y += yaffs_packedtags1.o yaffs_packedtags2.o yaffs_nand.o +yaffs-y += yaffs_tagscompat.o yaffs_tagsvalidity.o +yaffs-y += yaffs_mtdif.o yaffs_mtdif1.o yaffs_mtdif2.o +yaffs-y += yaffs_nameval.o yaffs_attribs.o +yaffs-y += yaffs_allocator.o +yaffs-y += yaffs_yaffs1.o +yaffs-y += yaffs_yaffs2.o +yaffs-y += yaffs_bitmap.o +yaffs-y += yaffs_verify.o + diff --git a/fs/yaffs2/yaffs_allocator.c b/fs/yaffs2/yaffs_allocator.c new file mode 100644 index 00000000..f9cd5bec --- /dev/null +++ b/fs/yaffs2/yaffs_allocator.c @@ -0,0 +1,396 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_allocator.h" +#include "yaffs_guts.h" +#include "yaffs_trace.h" +#include "yportenv.h" + +#ifdef CONFIG_YAFFS_KMALLOC_ALLOCATOR + +void yaffs_deinit_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +void yaffs_init_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +struct yaffs_tnode *yaffs_alloc_raw_tnode(struct yaffs_dev *dev) +{ + return (struct yaffs_tnode *)kmalloc(dev->tnode_size, GFP_NOFS); +} + +void yaffs_free_raw_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn) +{ + dev = dev; + kfree(tn); +} + +void yaffs_init_raw_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +void yaffs_deinit_raw_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +struct yaffs_obj *yaffs_alloc_raw_obj(struct yaffs_dev *dev) +{ + dev = dev; + return (struct yaffs_obj *)kmalloc(sizeof(struct yaffs_obj)); +} + +void yaffs_free_raw_obj(struct yaffs_dev *dev, struct yaffs_obj *obj) +{ + + dev = dev; + kfree(obj); +} + +#else + +struct yaffs_tnode_list { + struct yaffs_tnode_list *next; + struct yaffs_tnode *tnodes; +}; + +struct yaffs_obj_list { + struct yaffs_obj_list *next; + struct yaffs_obj *objects; +}; + +struct yaffs_allocator { + int n_tnodes_created; + struct yaffs_tnode *free_tnodes; + int n_free_tnodes; + struct yaffs_tnode_list *alloc_tnode_list; + + int n_obj_created; + struct yaffs_obj *free_objs; + int n_free_objects; + + struct yaffs_obj_list *allocated_obj_list; +}; + +static void yaffs_deinit_raw_tnodes(struct yaffs_dev *dev) +{ + + struct yaffs_allocator *allocator = + (struct yaffs_allocator *)dev->allocator; + + struct yaffs_tnode_list *tmp; + + if (!allocator) { + YBUG(); + return; + } + + while (allocator->alloc_tnode_list) { + tmp = allocator->alloc_tnode_list->next; + + kfree(allocator->alloc_tnode_list->tnodes); + kfree(allocator->alloc_tnode_list); + allocator->alloc_tnode_list = tmp; + + } + + allocator->free_tnodes = NULL; + allocator->n_free_tnodes = 0; + allocator->n_tnodes_created = 0; +} + +static void yaffs_init_raw_tnodes(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = dev->allocator; + + if (allocator) { + allocator->alloc_tnode_list = NULL; + allocator->free_tnodes = NULL; + allocator->n_free_tnodes = 0; + allocator->n_tnodes_created = 0; + } else { + YBUG(); + } +} + +static int yaffs_create_tnodes(struct yaffs_dev *dev, int n_tnodes) +{ + struct yaffs_allocator *allocator = + (struct yaffs_allocator *)dev->allocator; + int i; + struct yaffs_tnode *new_tnodes; + u8 *mem; + struct yaffs_tnode *curr; + struct yaffs_tnode *next; + struct yaffs_tnode_list *tnl; + + if (!allocator) { + YBUG(); + return YAFFS_FAIL; + } + + if (n_tnodes < 1) + return YAFFS_OK; + + /* make these things */ + + new_tnodes = kmalloc(n_tnodes * dev->tnode_size, GFP_NOFS); + mem = (u8 *) new_tnodes; + + if (!new_tnodes) { + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs: Could not allocate Tnodes"); + return YAFFS_FAIL; + } + + /* New hookup for wide tnodes */ + for (i = 0; i < n_tnodes - 1; i++) { + curr = (struct yaffs_tnode *)&mem[i * dev->tnode_size]; + next = (struct yaffs_tnode *)&mem[(i + 1) * dev->tnode_size]; + curr->internal[0] = next; + } + + curr = (struct yaffs_tnode *)&mem[(n_tnodes - 1) * dev->tnode_size]; + curr->internal[0] = allocator->free_tnodes; + allocator->free_tnodes = (struct yaffs_tnode *)mem; + + allocator->n_free_tnodes += n_tnodes; + allocator->n_tnodes_created += n_tnodes; + + /* Now add this bunch of tnodes to a list for freeing up. 
+ * NB If we can't add this to the management list it isn't fatal + * but it just means we can't free this bunch of tnodes later. + */ + + tnl = kmalloc(sizeof(struct yaffs_tnode_list), GFP_NOFS); + if (!tnl) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Could not add tnodes to management list"); + return YAFFS_FAIL; + } else { + tnl->tnodes = new_tnodes; + tnl->next = allocator->alloc_tnode_list; + allocator->alloc_tnode_list = tnl; + } + + yaffs_trace(YAFFS_TRACE_ALLOCATE,"Tnodes added"); + + return YAFFS_OK; +} + +struct yaffs_tnode *yaffs_alloc_raw_tnode(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = + (struct yaffs_allocator *)dev->allocator; + struct yaffs_tnode *tn = NULL; + + if (!allocator) { + YBUG(); + return NULL; + } + + /* If there are none left make more */ + if (!allocator->free_tnodes) + yaffs_create_tnodes(dev, YAFFS_ALLOCATION_NTNODES); + + if (allocator->free_tnodes) { + tn = allocator->free_tnodes; + allocator->free_tnodes = allocator->free_tnodes->internal[0]; + allocator->n_free_tnodes--; + } + + return tn; +} + +/* FreeTnode frees up a tnode and puts it back on the free list */ +void yaffs_free_raw_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn) +{ + struct yaffs_allocator *allocator = dev->allocator; + + if (!allocator) { + YBUG(); + return; + } + + if (tn) { + tn->internal[0] = allocator->free_tnodes; + allocator->free_tnodes = tn; + allocator->n_free_tnodes++; + } + dev->checkpoint_blocks_required = 0; /* force recalculation */ +} + +static void yaffs_init_raw_objs(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = dev->allocator; + + if (allocator) { + allocator->allocated_obj_list = NULL; + allocator->free_objs = NULL; + allocator->n_free_objects = 0; + } else { + YBUG(); + } +} + +static void yaffs_deinit_raw_objs(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = dev->allocator; + struct yaffs_obj_list *tmp; + + if (!allocator) { + YBUG(); + return; + } + + while (allocator->allocated_obj_list) { + tmp = allocator->allocated_obj_list->next; + kfree(allocator->allocated_obj_list->objects); + kfree(allocator->allocated_obj_list); + + allocator->allocated_obj_list = tmp; + } + + allocator->free_objs = NULL; + allocator->n_free_objects = 0; + allocator->n_obj_created = 0; +} + +static int yaffs_create_free_objs(struct yaffs_dev *dev, int n_obj) +{ + struct yaffs_allocator *allocator = dev->allocator; + + int i; + struct yaffs_obj *new_objs; + struct yaffs_obj_list *list; + + if (!allocator) { + YBUG(); + return YAFFS_FAIL; + } + + if (n_obj < 1) + return YAFFS_OK; + + /* make these things */ + new_objs = kmalloc(n_obj * sizeof(struct yaffs_obj), GFP_NOFS); + list = kmalloc(sizeof(struct yaffs_obj_list), GFP_NOFS); + + if (!new_objs || !list) { + if (new_objs) { + kfree(new_objs); + new_objs = NULL; + } + if (list) { + kfree(list); + list = NULL; + } + yaffs_trace(YAFFS_TRACE_ALLOCATE, + "Could not allocate more objects"); + return YAFFS_FAIL; + } + + /* Hook them into the free list */ + for (i = 0; i < n_obj - 1; i++) { + new_objs[i].siblings.next = + (struct list_head *)(&new_objs[i + 1]); + } + + new_objs[n_obj - 1].siblings.next = (void *)allocator->free_objs; + allocator->free_objs = new_objs; + allocator->n_free_objects += n_obj; + allocator->n_obj_created += n_obj; + + /* Now add this bunch of Objects to a list for freeing up. 
*/ + + list->objects = new_objs; + list->next = allocator->allocated_obj_list; + allocator->allocated_obj_list = list; + + return YAFFS_OK; +} + +struct yaffs_obj *yaffs_alloc_raw_obj(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj = NULL; + struct yaffs_allocator *allocator = dev->allocator; + + if (!allocator) { + YBUG(); + return obj; + } + + /* If there are none left make more */ + if (!allocator->free_objs) + yaffs_create_free_objs(dev, YAFFS_ALLOCATION_NOBJECTS); + + if (allocator->free_objs) { + obj = allocator->free_objs; + allocator->free_objs = + (struct yaffs_obj *)(allocator->free_objs->siblings.next); + allocator->n_free_objects--; + } + + return obj; +} + +void yaffs_free_raw_obj(struct yaffs_dev *dev, struct yaffs_obj *obj) +{ + + struct yaffs_allocator *allocator = dev->allocator; + + if (!allocator) + YBUG(); + else { + /* Link into the free list. */ + obj->siblings.next = (struct list_head *)(allocator->free_objs); + allocator->free_objs = obj; + allocator->n_free_objects++; + } +} + +void yaffs_deinit_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + if (dev->allocator) { + yaffs_deinit_raw_tnodes(dev); + yaffs_deinit_raw_objs(dev); + + kfree(dev->allocator); + dev->allocator = NULL; + } else { + YBUG(); + } +} + +void yaffs_init_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator; + + if (!dev->allocator) { + allocator = kmalloc(sizeof(struct yaffs_allocator), GFP_NOFS); + if (allocator) { + dev->allocator = allocator; + yaffs_init_raw_tnodes(dev); + yaffs_init_raw_objs(dev); + } + } else { + YBUG(); + } +} + +#endif diff --git a/fs/yaffs2/yaffs_allocator.h b/fs/yaffs2/yaffs_allocator.h new file mode 100644 index 00000000..4d5f2aec --- /dev/null +++ b/fs/yaffs2/yaffs_allocator.h @@ -0,0 +1,30 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_ALLOCATOR_H__ +#define __YAFFS_ALLOCATOR_H__ + +#include "yaffs_guts.h" + +void yaffs_init_raw_tnodes_and_objs(struct yaffs_dev *dev); +void yaffs_deinit_raw_tnodes_and_objs(struct yaffs_dev *dev); + +struct yaffs_tnode *yaffs_alloc_raw_tnode(struct yaffs_dev *dev); +void yaffs_free_raw_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn); + +struct yaffs_obj *yaffs_alloc_raw_obj(struct yaffs_dev *dev); +void yaffs_free_raw_obj(struct yaffs_dev *dev, struct yaffs_obj *obj); + +#endif diff --git a/fs/yaffs2/yaffs_attribs.c b/fs/yaffs2/yaffs_attribs.c new file mode 100644 index 00000000..9b47d376 --- /dev/null +++ b/fs/yaffs2/yaffs_attribs.c @@ -0,0 +1,124 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_guts.h" +#include "yaffs_attribs.h" + +void yaffs_load_attribs(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh) +{ + obj->yst_uid = oh->yst_uid; + obj->yst_gid = oh->yst_gid; + obj->yst_atime = oh->yst_atime; + obj->yst_mtime = oh->yst_mtime; + obj->yst_ctime = oh->yst_ctime; + obj->yst_rdev = oh->yst_rdev; +} + +void yaffs_load_attribs_oh(struct yaffs_obj_hdr *oh, struct yaffs_obj *obj) +{ + oh->yst_uid = obj->yst_uid; + oh->yst_gid = obj->yst_gid; + oh->yst_atime = obj->yst_atime; + oh->yst_mtime = obj->yst_mtime; + oh->yst_ctime = obj->yst_ctime; + oh->yst_rdev = obj->yst_rdev; + +} + +void yaffs_load_current_time(struct yaffs_obj *obj, int do_a, int do_c) +{ + obj->yst_mtime = Y_CURRENT_TIME; + if (do_a) + obj->yst_atime = obj->yst_mtime; + if (do_c) + obj->yst_ctime = obj->yst_mtime; +} + +void yaffs_attribs_init(struct yaffs_obj *obj, u32 gid, u32 uid, u32 rdev) +{ + yaffs_load_current_time(obj, 1, 1); + obj->yst_rdev = rdev; + obj->yst_uid = uid; + obj->yst_gid = gid; +} + +loff_t yaffs_get_file_size(struct yaffs_obj *obj) +{ + YCHAR *alias = NULL; + obj = yaffs_get_equivalent_obj(obj); + + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + return obj->variant.file_variant.file_size; + case YAFFS_OBJECT_TYPE_SYMLINK: + alias = obj->variant.symlink_variant.alias; + if (!alias) + return 0; + return strnlen(alias, YAFFS_MAX_ALIAS_LENGTH); + default: + return 0; + } +} + +int yaffs_set_attribs(struct yaffs_obj *obj, struct iattr *attr) +{ + unsigned int valid = attr->ia_valid; + + if (valid & ATTR_MODE) + obj->yst_mode = attr->ia_mode; + if (valid & ATTR_UID) + obj->yst_uid = attr->ia_uid; + if (valid & ATTR_GID) + obj->yst_gid = attr->ia_gid; + + if (valid & ATTR_ATIME) + obj->yst_atime = Y_TIME_CONVERT(attr->ia_atime); + if (valid & ATTR_CTIME) + obj->yst_ctime = Y_TIME_CONVERT(attr->ia_ctime); + if (valid & ATTR_MTIME) + obj->yst_mtime = Y_TIME_CONVERT(attr->ia_mtime); + + if (valid & ATTR_SIZE) + yaffs_resize_file(obj, attr->ia_size); + + yaffs_update_oh(obj, NULL, 1, 0, 0, NULL); + + return YAFFS_OK; + +} + +int yaffs_get_attribs(struct yaffs_obj *obj, struct iattr *attr) +{ + unsigned int valid = 0; + + attr->ia_mode = obj->yst_mode; + valid |= ATTR_MODE; + attr->ia_uid = obj->yst_uid; + valid |= ATTR_UID; + attr->ia_gid = obj->yst_gid; + valid |= ATTR_GID; + + Y_TIME_CONVERT(attr->ia_atime) = obj->yst_atime; + valid |= ATTR_ATIME; + Y_TIME_CONVERT(attr->ia_ctime) = obj->yst_ctime; + valid |= ATTR_CTIME; + Y_TIME_CONVERT(attr->ia_mtime) = obj->yst_mtime; + valid |= ATTR_MTIME; + + attr->ia_size = yaffs_get_file_size(obj); + valid |= ATTR_SIZE; + + attr->ia_valid = valid; + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_attribs.h b/fs/yaffs2/yaffs_attribs.h new file mode 100644 index 00000000..33d541d6 --- /dev/null +++ b/fs/yaffs2/yaffs_attribs.h @@ -0,0 +1,28 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. 
+ */ + +#ifndef __YAFFS_ATTRIBS_H__ +#define __YAFFS_ATTRIBS_H__ + +#include "yaffs_guts.h" + +void yaffs_load_attribs(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh); +void yaffs_load_attribs_oh(struct yaffs_obj_hdr *oh, struct yaffs_obj *obj); +void yaffs_attribs_init(struct yaffs_obj *obj, u32 gid, u32 uid, u32 rdev); +void yaffs_load_current_time(struct yaffs_obj *obj, int do_a, int do_c); +int yaffs_set_attribs(struct yaffs_obj *obj, struct iattr *attr); +int yaffs_get_attribs(struct yaffs_obj *obj, struct iattr *attr); + +#endif diff --git a/fs/yaffs2/yaffs_bitmap.c b/fs/yaffs2/yaffs_bitmap.c new file mode 100644 index 00000000..7df42cd0 --- /dev/null +++ b/fs/yaffs2/yaffs_bitmap.c @@ -0,0 +1,98 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_bitmap.h" +#include "yaffs_trace.h" +/* + * Chunk bitmap manipulations + */ + +static inline u8 *yaffs_block_bits(struct yaffs_dev *dev, int blk) +{ + if (blk < dev->internal_start_block || blk > dev->internal_end_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "BlockBits block %d is not valid", + blk); + YBUG(); + } + return dev->chunk_bits + + (dev->chunk_bit_stride * (blk - dev->internal_start_block)); +} + +void yaffs_verify_chunk_bit_id(struct yaffs_dev *dev, int blk, int chunk) +{ + if (blk < dev->internal_start_block || blk > dev->internal_end_block || + chunk < 0 || chunk >= dev->param.chunks_per_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Chunk Id (%d:%d) invalid", + blk, chunk); + YBUG(); + } +} + +void yaffs_clear_chunk_bits(struct yaffs_dev *dev, int blk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + + memset(blk_bits, 0, dev->chunk_bit_stride); +} + +void yaffs_clear_chunk_bit(struct yaffs_dev *dev, int blk, int chunk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + + yaffs_verify_chunk_bit_id(dev, blk, chunk); + + blk_bits[chunk / 8] &= ~(1 << (chunk & 7)); +} + +void yaffs_set_chunk_bit(struct yaffs_dev *dev, int blk, int chunk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + + yaffs_verify_chunk_bit_id(dev, blk, chunk); + + blk_bits[chunk / 8] |= (1 << (chunk & 7)); +} + +int yaffs_check_chunk_bit(struct yaffs_dev *dev, int blk, int chunk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + yaffs_verify_chunk_bit_id(dev, blk, chunk); + + return (blk_bits[chunk / 8] & (1 << (chunk & 7))) ? 1 : 0; +} + +int yaffs_still_some_chunks(struct yaffs_dev *dev, int blk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + int i; + for (i = 0; i < dev->chunk_bit_stride; i++) { + if (*blk_bits) + return 1; + blk_bits++; + } + return 0; +} + +int yaffs_count_chunk_bits(struct yaffs_dev *dev, int blk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + int i; + int n = 0; + + for (i = 0; i < dev->chunk_bit_stride; i++, blk_bits++) + n += hweight8(*blk_bits); + + return n; +} diff --git a/fs/yaffs2/yaffs_bitmap.h b/fs/yaffs2/yaffs_bitmap.h new file mode 100644 index 00000000..cf9ea58d --- /dev/null +++ b/fs/yaffs2/yaffs_bitmap.h @@ -0,0 +1,33 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* + * Chunk bitmap manipulations + */ + +#ifndef __YAFFS_BITMAP_H__ +#define __YAFFS_BITMAP_H__ + +#include "yaffs_guts.h" + +void yaffs_verify_chunk_bit_id(struct yaffs_dev *dev, int blk, int chunk); +void yaffs_clear_chunk_bits(struct yaffs_dev *dev, int blk); +void yaffs_clear_chunk_bit(struct yaffs_dev *dev, int blk, int chunk); +void yaffs_set_chunk_bit(struct yaffs_dev *dev, int blk, int chunk); +int yaffs_check_chunk_bit(struct yaffs_dev *dev, int blk, int chunk); +int yaffs_still_some_chunks(struct yaffs_dev *dev, int blk); +int yaffs_count_chunk_bits(struct yaffs_dev *dev, int blk); + +#endif diff --git a/fs/yaffs2/yaffs_checkptrw.c b/fs/yaffs2/yaffs_checkptrw.c new file mode 100644 index 00000000..4e40f437 --- /dev/null +++ b/fs/yaffs2/yaffs_checkptrw.c @@ -0,0 +1,415 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_checkptrw.h" +#include "yaffs_getblockinfo.h" + +static int yaffs2_checkpt_space_ok(struct yaffs_dev *dev) +{ + int blocks_avail = dev->n_erased_blocks - dev->param.n_reserved_blocks; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checkpt blocks_avail = %d", blocks_avail); + + return (blocks_avail <= 0) ? 0 : 1; +} + +static int yaffs_checkpt_erase(struct yaffs_dev *dev) +{ + int i; + + if (!dev->param.erase_fn) + return 0; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checking blocks %d to %d", + dev->internal_start_block, dev->internal_end_block); + + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + struct yaffs_block_info *bi = yaffs_get_block_info(dev, i); + if (bi->block_state == YAFFS_BLOCK_STATE_CHECKPOINT) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "erasing checkpt block %d", i); + + dev->n_erasures++; + + if (dev->param. 
+ erase_fn(dev, + i - dev->block_offset /* realign */ )) { + bi->block_state = YAFFS_BLOCK_STATE_EMPTY; + dev->n_erased_blocks++; + dev->n_free_chunks += + dev->param.chunks_per_block; + } else { + dev->param.bad_block_fn(dev, i); + bi->block_state = YAFFS_BLOCK_STATE_DEAD; + } + } + } + + dev->blocks_in_checkpt = 0; + + return 1; +} + +static void yaffs2_checkpt_find_erased_block(struct yaffs_dev *dev) +{ + int i; + int blocks_avail = dev->n_erased_blocks - dev->param.n_reserved_blocks; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "allocating checkpt block: erased %d reserved %d avail %d next %d ", + dev->n_erased_blocks, dev->param.n_reserved_blocks, + blocks_avail, dev->checkpt_next_block); + + if (dev->checkpt_next_block >= 0 && + dev->checkpt_next_block <= dev->internal_end_block && + blocks_avail > 0) { + + for (i = dev->checkpt_next_block; i <= dev->internal_end_block; + i++) { + struct yaffs_block_info *bi = + yaffs_get_block_info(dev, i); + if (bi->block_state == YAFFS_BLOCK_STATE_EMPTY) { + dev->checkpt_next_block = i + 1; + dev->checkpt_cur_block = i; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "allocating checkpt block %d", i); + return; + } + } + } + yaffs_trace(YAFFS_TRACE_CHECKPOINT, "out of checkpt blocks"); + + dev->checkpt_next_block = -1; + dev->checkpt_cur_block = -1; +} + +static void yaffs2_checkpt_find_block(struct yaffs_dev *dev) +{ + int i; + struct yaffs_ext_tags tags; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "find next checkpt block: start: blocks %d next %d", + dev->blocks_in_checkpt, dev->checkpt_next_block); + + if (dev->blocks_in_checkpt < dev->checkpt_max_blocks) + for (i = dev->checkpt_next_block; i <= dev->internal_end_block; + i++) { + int chunk = i * dev->param.chunks_per_block; + int realigned_chunk = chunk - dev->chunk_offset; + + dev->param.read_chunk_tags_fn(dev, realigned_chunk, + NULL, &tags); + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "find next checkpt block: search: block %d oid %d seq %d eccr %d", + i, tags.obj_id, tags.seq_number, + tags.ecc_result); + + if (tags.seq_number == YAFFS_SEQUENCE_CHECKPOINT_DATA) { + /* Right kind of block */ + dev->checkpt_next_block = tags.obj_id; + dev->checkpt_cur_block = i; + dev->checkpt_block_list[dev-> + blocks_in_checkpt] = i; + dev->blocks_in_checkpt++; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "found checkpt block %d", i); + return; + } + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, "found no more checkpt blocks"); + + dev->checkpt_next_block = -1; + dev->checkpt_cur_block = -1; +} + +int yaffs2_checkpt_open(struct yaffs_dev *dev, int writing) +{ + + dev->checkpt_open_write = writing; + + /* Got the functions we need? 
*/ + if (!dev->param.write_chunk_tags_fn || + !dev->param.read_chunk_tags_fn || + !dev->param.erase_fn || !dev->param.bad_block_fn) + return 0; + + if (writing && !yaffs2_checkpt_space_ok(dev)) + return 0; + + if (!dev->checkpt_buffer) + dev->checkpt_buffer = + kmalloc(dev->param.total_bytes_per_chunk, GFP_NOFS); + if (!dev->checkpt_buffer) + return 0; + + dev->checkpt_page_seq = 0; + dev->checkpt_byte_count = 0; + dev->checkpt_sum = 0; + dev->checkpt_xor = 0; + dev->checkpt_cur_block = -1; + dev->checkpt_cur_chunk = -1; + dev->checkpt_next_block = dev->internal_start_block; + + /* Erase all the blocks in the checkpoint area */ + if (writing) { + memset(dev->checkpt_buffer, 0, dev->data_bytes_per_chunk); + dev->checkpt_byte_offs = 0; + return yaffs_checkpt_erase(dev); + } else { + int i; + /* Set to a value that will kick off a read */ + dev->checkpt_byte_offs = dev->data_bytes_per_chunk; + /* A checkpoint block list of 1 checkpoint block per 16 block is (hopefully) + * going to be way more than we need */ + dev->blocks_in_checkpt = 0; + dev->checkpt_max_blocks = + (dev->internal_end_block - dev->internal_start_block) / 16 + + 2; + dev->checkpt_block_list = + kmalloc(sizeof(int) * dev->checkpt_max_blocks, GFP_NOFS); + if (!dev->checkpt_block_list) + return 0; + + for (i = 0; i < dev->checkpt_max_blocks; i++) + dev->checkpt_block_list[i] = -1; + } + + return 1; +} + +int yaffs2_get_checkpt_sum(struct yaffs_dev *dev, u32 * sum) +{ + u32 composite_sum; + composite_sum = (dev->checkpt_sum << 8) | (dev->checkpt_xor & 0xFF); + *sum = composite_sum; + return 1; +} + +static int yaffs2_checkpt_flush_buffer(struct yaffs_dev *dev) +{ + int chunk; + int realigned_chunk; + + struct yaffs_ext_tags tags; + + if (dev->checkpt_cur_block < 0) { + yaffs2_checkpt_find_erased_block(dev); + dev->checkpt_cur_chunk = 0; + } + + if (dev->checkpt_cur_block < 0) + return 0; + + tags.is_deleted = 0; + tags.obj_id = dev->checkpt_next_block; /* Hint to next place to look */ + tags.chunk_id = dev->checkpt_page_seq + 1; + tags.seq_number = YAFFS_SEQUENCE_CHECKPOINT_DATA; + tags.n_bytes = dev->data_bytes_per_chunk; + if (dev->checkpt_cur_chunk == 0) { + /* First chunk we write for the block? 
Set block state to + checkpoint */ + struct yaffs_block_info *bi = + yaffs_get_block_info(dev, dev->checkpt_cur_block); + bi->block_state = YAFFS_BLOCK_STATE_CHECKPOINT; + dev->blocks_in_checkpt++; + } + + chunk = + dev->checkpt_cur_block * dev->param.chunks_per_block + + dev->checkpt_cur_chunk; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checkpoint wite buffer nand %d(%d:%d) objid %d chId %d", + chunk, dev->checkpt_cur_block, dev->checkpt_cur_chunk, + tags.obj_id, tags.chunk_id); + + realigned_chunk = chunk - dev->chunk_offset; + + dev->n_page_writes++; + + dev->param.write_chunk_tags_fn(dev, realigned_chunk, + dev->checkpt_buffer, &tags); + dev->checkpt_byte_offs = 0; + dev->checkpt_page_seq++; + dev->checkpt_cur_chunk++; + if (dev->checkpt_cur_chunk >= dev->param.chunks_per_block) { + dev->checkpt_cur_chunk = 0; + dev->checkpt_cur_block = -1; + } + memset(dev->checkpt_buffer, 0, dev->data_bytes_per_chunk); + + return 1; +} + +int yaffs2_checkpt_wr(struct yaffs_dev *dev, const void *data, int n_bytes) +{ + int i = 0; + int ok = 1; + + u8 *data_bytes = (u8 *) data; + + if (!dev->checkpt_buffer) + return 0; + + if (!dev->checkpt_open_write) + return -1; + + while (i < n_bytes && ok) { + dev->checkpt_buffer[dev->checkpt_byte_offs] = *data_bytes; + dev->checkpt_sum += *data_bytes; + dev->checkpt_xor ^= *data_bytes; + + dev->checkpt_byte_offs++; + i++; + data_bytes++; + dev->checkpt_byte_count++; + + if (dev->checkpt_byte_offs < 0 || + dev->checkpt_byte_offs >= dev->data_bytes_per_chunk) + ok = yaffs2_checkpt_flush_buffer(dev); + } + + return i; +} + +int yaffs2_checkpt_rd(struct yaffs_dev *dev, void *data, int n_bytes) +{ + int i = 0; + int ok = 1; + struct yaffs_ext_tags tags; + + int chunk; + int realigned_chunk; + + u8 *data_bytes = (u8 *) data; + + if (!dev->checkpt_buffer) + return 0; + + if (dev->checkpt_open_write) + return -1; + + while (i < n_bytes && ok) { + + if (dev->checkpt_byte_offs < 0 || + dev->checkpt_byte_offs >= dev->data_bytes_per_chunk) { + + if (dev->checkpt_cur_block < 0) { + yaffs2_checkpt_find_block(dev); + dev->checkpt_cur_chunk = 0; + } + + if (dev->checkpt_cur_block < 0) + ok = 0; + else { + chunk = dev->checkpt_cur_block * + dev->param.chunks_per_block + + dev->checkpt_cur_chunk; + + realigned_chunk = chunk - dev->chunk_offset; + + dev->n_page_reads++; + + /* read in the next chunk */ + dev->param.read_chunk_tags_fn(dev, + realigned_chunk, + dev-> + checkpt_buffer, + &tags); + + if (tags.chunk_id != (dev->checkpt_page_seq + 1) + || tags.ecc_result > YAFFS_ECC_RESULT_FIXED + || tags.seq_number != + YAFFS_SEQUENCE_CHECKPOINT_DATA) + ok = 0; + + dev->checkpt_byte_offs = 0; + dev->checkpt_page_seq++; + dev->checkpt_cur_chunk++; + + if (dev->checkpt_cur_chunk >= + dev->param.chunks_per_block) + dev->checkpt_cur_block = -1; + } + } + + if (ok) { + *data_bytes = + dev->checkpt_buffer[dev->checkpt_byte_offs]; + dev->checkpt_sum += *data_bytes; + dev->checkpt_xor ^= *data_bytes; + dev->checkpt_byte_offs++; + i++; + data_bytes++; + dev->checkpt_byte_count++; + } + } + + return i; +} + +int yaffs_checkpt_close(struct yaffs_dev *dev) +{ + + if (dev->checkpt_open_write) { + if (dev->checkpt_byte_offs != 0) + yaffs2_checkpt_flush_buffer(dev); + } else if (dev->checkpt_block_list) { + int i; + for (i = 0; + i < dev->blocks_in_checkpt + && dev->checkpt_block_list[i] >= 0; i++) { + int blk = dev->checkpt_block_list[i]; + struct yaffs_block_info *bi = NULL; + if (dev->internal_start_block <= blk + && blk <= dev->internal_end_block) + bi = yaffs_get_block_info(dev, blk); + if (bi && 
bi->block_state == YAFFS_BLOCK_STATE_EMPTY) + bi->block_state = YAFFS_BLOCK_STATE_CHECKPOINT; + else { + /* Todo this looks odd... */ + } + } + kfree(dev->checkpt_block_list); + dev->checkpt_block_list = NULL; + } + + dev->n_free_chunks -= + dev->blocks_in_checkpt * dev->param.chunks_per_block; + dev->n_erased_blocks -= dev->blocks_in_checkpt; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT,"checkpoint byte count %d", + dev->checkpt_byte_count); + + if (dev->checkpt_buffer) { + /* free the buffer */ + kfree(dev->checkpt_buffer); + dev->checkpt_buffer = NULL; + return 1; + } else { + return 0; + } +} + +int yaffs2_checkpt_invalidate_stream(struct yaffs_dev *dev) +{ + /* Erase the checkpoint data */ + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checkpoint invalidate of %d blocks", + dev->blocks_in_checkpt); + + return yaffs_checkpt_erase(dev); +} diff --git a/fs/yaffs2/yaffs_checkptrw.h b/fs/yaffs2/yaffs_checkptrw.h new file mode 100644 index 00000000..361c6067 --- /dev/null +++ b/fs/yaffs2/yaffs_checkptrw.h @@ -0,0 +1,33 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_CHECKPTRW_H__ +#define __YAFFS_CHECKPTRW_H__ + +#include "yaffs_guts.h" + +int yaffs2_checkpt_open(struct yaffs_dev *dev, int writing); + +int yaffs2_checkpt_wr(struct yaffs_dev *dev, const void *data, int n_bytes); + +int yaffs2_checkpt_rd(struct yaffs_dev *dev, void *data, int n_bytes); + +int yaffs2_get_checkpt_sum(struct yaffs_dev *dev, u32 * sum); + +int yaffs_checkpt_close(struct yaffs_dev *dev); + +int yaffs2_checkpt_invalidate_stream(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_ecc.c b/fs/yaffs2/yaffs_ecc.c new file mode 100644 index 00000000..e95a8069 --- /dev/null +++ b/fs/yaffs2/yaffs_ecc.c @@ -0,0 +1,298 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This code implements the ECC algorithm used in SmartMedia. + * + * The ECC comprises 22 bits of parity information and is stuffed into 3 bytes. + * The two unused bit are set to 1. + * The ECC can correct single bit errors in a 256-byte page of data. Thus, two such ECC + * blocks are used on a 512-byte NAND page. + * + */ + +/* Table generated by gen-ecc.c + * Using a table means we do not have to calculate p1..p4 and p1'..p4' + * for each byte of data. These are instead provided in a table in bits7..2. + * Bit 0 of each entry indicates whether the entry has an odd or even parity, and therefore + * this bytes influence on the line parity. 
+ */ + +#include "yportenv.h" + +#include "yaffs_ecc.h" + +static const unsigned char column_parity_table[] = { + 0x00, 0x55, 0x59, 0x0c, 0x65, 0x30, 0x3c, 0x69, + 0x69, 0x3c, 0x30, 0x65, 0x0c, 0x59, 0x55, 0x00, + 0x95, 0xc0, 0xcc, 0x99, 0xf0, 0xa5, 0xa9, 0xfc, + 0xfc, 0xa9, 0xa5, 0xf0, 0x99, 0xcc, 0xc0, 0x95, + 0x99, 0xcc, 0xc0, 0x95, 0xfc, 0xa9, 0xa5, 0xf0, + 0xf0, 0xa5, 0xa9, 0xfc, 0x95, 0xc0, 0xcc, 0x99, + 0x0c, 0x59, 0x55, 0x00, 0x69, 0x3c, 0x30, 0x65, + 0x65, 0x30, 0x3c, 0x69, 0x00, 0x55, 0x59, 0x0c, + 0xa5, 0xf0, 0xfc, 0xa9, 0xc0, 0x95, 0x99, 0xcc, + 0xcc, 0x99, 0x95, 0xc0, 0xa9, 0xfc, 0xf0, 0xa5, + 0x30, 0x65, 0x69, 0x3c, 0x55, 0x00, 0x0c, 0x59, + 0x59, 0x0c, 0x00, 0x55, 0x3c, 0x69, 0x65, 0x30, + 0x3c, 0x69, 0x65, 0x30, 0x59, 0x0c, 0x00, 0x55, + 0x55, 0x00, 0x0c, 0x59, 0x30, 0x65, 0x69, 0x3c, + 0xa9, 0xfc, 0xf0, 0xa5, 0xcc, 0x99, 0x95, 0xc0, + 0xc0, 0x95, 0x99, 0xcc, 0xa5, 0xf0, 0xfc, 0xa9, + 0xa9, 0xfc, 0xf0, 0xa5, 0xcc, 0x99, 0x95, 0xc0, + 0xc0, 0x95, 0x99, 0xcc, 0xa5, 0xf0, 0xfc, 0xa9, + 0x3c, 0x69, 0x65, 0x30, 0x59, 0x0c, 0x00, 0x55, + 0x55, 0x00, 0x0c, 0x59, 0x30, 0x65, 0x69, 0x3c, + 0x30, 0x65, 0x69, 0x3c, 0x55, 0x00, 0x0c, 0x59, + 0x59, 0x0c, 0x00, 0x55, 0x3c, 0x69, 0x65, 0x30, + 0xa5, 0xf0, 0xfc, 0xa9, 0xc0, 0x95, 0x99, 0xcc, + 0xcc, 0x99, 0x95, 0xc0, 0xa9, 0xfc, 0xf0, 0xa5, + 0x0c, 0x59, 0x55, 0x00, 0x69, 0x3c, 0x30, 0x65, + 0x65, 0x30, 0x3c, 0x69, 0x00, 0x55, 0x59, 0x0c, + 0x99, 0xcc, 0xc0, 0x95, 0xfc, 0xa9, 0xa5, 0xf0, + 0xf0, 0xa5, 0xa9, 0xfc, 0x95, 0xc0, 0xcc, 0x99, + 0x95, 0xc0, 0xcc, 0x99, 0xf0, 0xa5, 0xa9, 0xfc, + 0xfc, 0xa9, 0xa5, 0xf0, 0x99, 0xcc, 0xc0, 0x95, + 0x00, 0x55, 0x59, 0x0c, 0x65, 0x30, 0x3c, 0x69, + 0x69, 0x3c, 0x30, 0x65, 0x0c, 0x59, 0x55, 0x00, +}; + + +/* Calculate the ECC for a 256-byte block of data */ +void yaffs_ecc_cacl(const unsigned char *data, unsigned char *ecc) +{ + unsigned int i; + + unsigned char col_parity = 0; + unsigned char line_parity = 0; + unsigned char line_parity_prime = 0; + unsigned char t; + unsigned char b; + + for (i = 0; i < 256; i++) { + b = column_parity_table[*data++]; + col_parity ^= b; + + if (b & 0x01) { /* odd number of bits in the byte */ + line_parity ^= i; + line_parity_prime ^= ~i; + } + } + + ecc[2] = (~col_parity) | 0x03; + + t = 0; + if (line_parity & 0x80) + t |= 0x80; + if (line_parity_prime & 0x80) + t |= 0x40; + if (line_parity & 0x40) + t |= 0x20; + if (line_parity_prime & 0x40) + t |= 0x10; + if (line_parity & 0x20) + t |= 0x08; + if (line_parity_prime & 0x20) + t |= 0x04; + if (line_parity & 0x10) + t |= 0x02; + if (line_parity_prime & 0x10) + t |= 0x01; + ecc[1] = ~t; + + t = 0; + if (line_parity & 0x08) + t |= 0x80; + if (line_parity_prime & 0x08) + t |= 0x40; + if (line_parity & 0x04) + t |= 0x20; + if (line_parity_prime & 0x04) + t |= 0x10; + if (line_parity & 0x02) + t |= 0x08; + if (line_parity_prime & 0x02) + t |= 0x04; + if (line_parity & 0x01) + t |= 0x02; + if (line_parity_prime & 0x01) + t |= 0x01; + ecc[0] = ~t; + +#ifdef CONFIG_YAFFS_ECC_WRONG_ORDER + /* Swap the bytes into the wrong order */ + t = ecc[0]; + ecc[0] = ecc[1]; + ecc[1] = t; +#endif +} + +/* Correct the ECC on a 256 byte block of data */ + +int yaffs_ecc_correct(unsigned char *data, unsigned char *read_ecc, + const unsigned char *test_ecc) +{ + unsigned char d0, d1, d2; /* deltas */ + + d0 = read_ecc[0] ^ test_ecc[0]; + d1 = read_ecc[1] ^ test_ecc[1]; + d2 = read_ecc[2] ^ test_ecc[2]; + + if ((d0 | d1 | d2) == 0) + return 0; /* no error */ + + if (((d0 ^ (d0 >> 1)) & 0x55) == 0x55 && + ((d1 ^ (d1 >> 1)) & 0x55) == 0x55 && + 
((d2 ^ (d2 >> 1)) & 0x54) == 0x54) { + /* Single bit (recoverable) error in data */ + + unsigned byte; + unsigned bit; + +#ifdef CONFIG_YAFFS_ECC_WRONG_ORDER + /* swap the bytes to correct for the wrong order */ + unsigned char t; + + t = d0; + d0 = d1; + d1 = t; +#endif + + bit = byte = 0; + + if (d1 & 0x80) + byte |= 0x80; + if (d1 & 0x20) + byte |= 0x40; + if (d1 & 0x08) + byte |= 0x20; + if (d1 & 0x02) + byte |= 0x10; + if (d0 & 0x80) + byte |= 0x08; + if (d0 & 0x20) + byte |= 0x04; + if (d0 & 0x08) + byte |= 0x02; + if (d0 & 0x02) + byte |= 0x01; + + if (d2 & 0x80) + bit |= 0x04; + if (d2 & 0x20) + bit |= 0x02; + if (d2 & 0x08) + bit |= 0x01; + + data[byte] ^= (1 << bit); + + return 1; /* Corrected the error */ + } + + if ((hweight8(d0) + hweight8(d1) + hweight8(d2)) == 1) { + /* Reccoverable error in ecc */ + + read_ecc[0] = test_ecc[0]; + read_ecc[1] = test_ecc[1]; + read_ecc[2] = test_ecc[2]; + + return 1; /* Corrected the error */ + } + + /* Unrecoverable error */ + + return -1; + +} + +/* + * ECCxxxOther does ECC calcs on arbitrary n bytes of data + */ +void yaffs_ecc_calc_other(const unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *ecc_other) +{ + unsigned int i; + + unsigned char col_parity = 0; + unsigned line_parity = 0; + unsigned line_parity_prime = 0; + unsigned char b; + + for (i = 0; i < n_bytes; i++) { + b = column_parity_table[*data++]; + col_parity ^= b; + + if (b & 0x01) { + /* odd number of bits in the byte */ + line_parity ^= i; + line_parity_prime ^= ~i; + } + + } + + ecc_other->col_parity = (col_parity >> 2) & 0x3f; + ecc_other->line_parity = line_parity; + ecc_other->line_parity_prime = line_parity_prime; +} + +int yaffs_ecc_correct_other(unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *read_ecc, + const struct yaffs_ecc_other *test_ecc) +{ + unsigned char delta_col; /* column parity delta */ + unsigned delta_line; /* line parity delta */ + unsigned delta_line_prime; /* line parity delta */ + unsigned bit; + + delta_col = read_ecc->col_parity ^ test_ecc->col_parity; + delta_line = read_ecc->line_parity ^ test_ecc->line_parity; + delta_line_prime = + read_ecc->line_parity_prime ^ test_ecc->line_parity_prime; + + if ((delta_col | delta_line | delta_line_prime) == 0) + return 0; /* no error */ + + if (delta_line == ~delta_line_prime && + (((delta_col ^ (delta_col >> 1)) & 0x15) == 0x15)) { + /* Single bit (recoverable) error in data */ + + bit = 0; + + if (delta_col & 0x20) + bit |= 0x04; + if (delta_col & 0x08) + bit |= 0x02; + if (delta_col & 0x02) + bit |= 0x01; + + if (delta_line >= n_bytes) + return -1; + + data[delta_line] ^= (1 << bit); + + return 1; /* corrected */ + } + + if ((hweight32(delta_line) + + hweight32(delta_line_prime) + + hweight8(delta_col)) == 1) { + /* Reccoverable error in ecc */ + + *read_ecc = *test_ecc; + return 1; /* corrected */ + } + + /* Unrecoverable error */ + + return -1; +} diff --git a/fs/yaffs2/yaffs_ecc.h b/fs/yaffs2/yaffs_ecc.h new file mode 100644 index 00000000..b0c461d6 --- /dev/null +++ b/fs/yaffs2/yaffs_ecc.h @@ -0,0 +1,44 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. 
+ * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* + * This code implements the ECC algorithm used in SmartMedia. + * + * The ECC comprises 22 bits of parity information and is stuffed into 3 bytes. + * The two unused bit are set to 1. + * The ECC can correct single bit errors in a 256-byte page of data. Thus, two such ECC + * blocks are used on a 512-byte NAND page. + * + */ + +#ifndef __YAFFS_ECC_H__ +#define __YAFFS_ECC_H__ + +struct yaffs_ecc_other { + unsigned char col_parity; + unsigned line_parity; + unsigned line_parity_prime; +}; + +void yaffs_ecc_cacl(const unsigned char *data, unsigned char *ecc); +int yaffs_ecc_correct(unsigned char *data, unsigned char *read_ecc, + const unsigned char *test_ecc); + +void yaffs_ecc_calc_other(const unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *ecc); +int yaffs_ecc_correct_other(unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *read_ecc, + const struct yaffs_ecc_other *test_ecc); +#endif diff --git a/fs/yaffs2/yaffs_getblockinfo.h b/fs/yaffs2/yaffs_getblockinfo.h new file mode 100644 index 00000000..d87acbde --- /dev/null +++ b/fs/yaffs2/yaffs_getblockinfo.h @@ -0,0 +1,35 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_GETBLOCKINFO_H__ +#define __YAFFS_GETBLOCKINFO_H__ + +#include "yaffs_guts.h" +#include "yaffs_trace.h" + +/* Function to manipulate block info */ +static inline struct yaffs_block_info *yaffs_get_block_info(struct yaffs_dev + *dev, int blk) +{ + if (blk < dev->internal_start_block || blk > dev->internal_end_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs: get_block_info block %d is not valid", + blk); + YBUG(); + } + return &dev->block_info[blk - dev->internal_start_block]; +} + +#endif diff --git a/fs/yaffs2/yaffs_guts.c b/fs/yaffs2/yaffs_guts.c new file mode 100644 index 00000000..f4ae9dee --- /dev/null +++ b/fs/yaffs2/yaffs_guts.c @@ -0,0 +1,5164 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yportenv.h" +#include "yaffs_trace.h" + +#include "yaffs_guts.h" +#include "yaffs_tagsvalidity.h" +#include "yaffs_getblockinfo.h" + +#include "yaffs_tagscompat.h" + +#include "yaffs_nand.h" + +#include "yaffs_yaffs1.h" +#include "yaffs_yaffs2.h" +#include "yaffs_bitmap.h" +#include "yaffs_verify.h" + +#include "yaffs_nand.h" +#include "yaffs_packedtags2.h" + +#include "yaffs_nameval.h" +#include "yaffs_allocator.h" + +#include "yaffs_attribs.h" + +/* Note YAFFS_GC_GOOD_ENOUGH must be <= YAFFS_GC_PASSIVE_THRESHOLD */ +#define YAFFS_GC_GOOD_ENOUGH 2 +#define YAFFS_GC_PASSIVE_THRESHOLD 4 + +#include "yaffs_ecc.h" + +/* Forward declarations */ + +static int yaffs_wr_data_obj(struct yaffs_obj *in, int inode_chunk, + const u8 * buffer, int n_bytes, int use_reserve); + + + +/* Function to calculate chunk and offset */ + +static void yaffs_addr_to_chunk(struct yaffs_dev *dev, loff_t addr, + int *chunk_out, u32 * offset_out) +{ + int chunk; + u32 offset; + + chunk = (u32) (addr >> dev->chunk_shift); + + if (dev->chunk_div == 1) { + /* easy power of 2 case */ + offset = (u32) (addr & dev->chunk_mask); + } else { + /* Non power-of-2 case */ + + loff_t chunk_base; + + chunk /= dev->chunk_div; + + chunk_base = ((loff_t) chunk) * dev->data_bytes_per_chunk; + offset = (u32) (addr - chunk_base); + } + + *chunk_out = chunk; + *offset_out = offset; +} + +/* Function to return the number of shifts for a power of 2 greater than or + * equal to the given number + * Note we don't try to cater for all possible numbers and this does not have to + * be hellishly efficient. + */ + +static u32 calc_shifts_ceiling(u32 x) +{ + int extra_bits; + int shifts; + + shifts = extra_bits = 0; + + while (x > 1) { + if (x & 1) + extra_bits++; + x >>= 1; + shifts++; + } + + if (extra_bits) + shifts++; + + return shifts; +} + +/* Function to return the number of shifts to get a 1 in bit 0 + */ + +static u32 calc_shifts(u32 x) +{ + u32 shifts; + + shifts = 0; + + if (!x) + return 0; + + while (!(x & 1)) { + x >>= 1; + shifts++; + } + + return shifts; +} + +/* + * Temporary buffer manipulations. + */ + +static int yaffs_init_tmp_buffers(struct yaffs_dev *dev) +{ + int i; + u8 *buf = (u8 *) 1; + + memset(dev->temp_buffer, 0, sizeof(dev->temp_buffer)); + + for (i = 0; buf && i < YAFFS_N_TEMP_BUFFERS; i++) { + dev->temp_buffer[i].line = 0; /* not in use */ + dev->temp_buffer[i].buffer = buf = + kmalloc(dev->param.total_bytes_per_chunk, GFP_NOFS); + } + + return buf ? YAFFS_OK : YAFFS_FAIL; +} + +u8 *yaffs_get_temp_buffer(struct yaffs_dev * dev, int line_no) +{ + int i, j; + + dev->temp_in_use++; + if (dev->temp_in_use > dev->max_temp) + dev->max_temp = dev->temp_in_use; + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) { + if (dev->temp_buffer[i].line == 0) { + dev->temp_buffer[i].line = line_no; + if ((i + 1) > dev->max_temp) { + dev->max_temp = i + 1; + for (j = 0; j <= i; j++) + dev->temp_buffer[j].max_line = + dev->temp_buffer[j].line; + } + + return dev->temp_buffer[i].buffer; + } + } + + yaffs_trace(YAFFS_TRACE_BUFFERS, + "Out of temp buffers at line %d, other held by lines:", + line_no); + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) + yaffs_trace(YAFFS_TRACE_BUFFERS," %d", dev->temp_buffer[i].line); + + /* + * If we got here then we have to allocate an unmanaged one + * This is not good. 
+ */ + + dev->unmanaged_buffer_allocs++; + return kmalloc(dev->data_bytes_per_chunk, GFP_NOFS); + +} + +void yaffs_release_temp_buffer(struct yaffs_dev *dev, u8 * buffer, int line_no) +{ + int i; + + dev->temp_in_use--; + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) { + if (dev->temp_buffer[i].buffer == buffer) { + dev->temp_buffer[i].line = 0; + return; + } + } + + if (buffer) { + /* assume it is an unmanaged one. */ + yaffs_trace(YAFFS_TRACE_BUFFERS, + "Releasing unmanaged temp buffer in line %d", + line_no); + kfree(buffer); + dev->unmanaged_buffer_deallocs++; + } + +} + +/* + * Determine if we have a managed buffer. + */ +int yaffs_is_managed_tmp_buffer(struct yaffs_dev *dev, const u8 * buffer) +{ + int i; + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) { + if (dev->temp_buffer[i].buffer == buffer) + return 1; + } + + for (i = 0; i < dev->param.n_caches; i++) { + if (dev->cache[i].data == buffer) + return 1; + } + + if (buffer == dev->checkpt_buffer) + return 1; + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: unmaged buffer detected."); + return 0; +} + +/* + * Functions for robustisizing TODO + * + */ + +static void yaffs_handle_chunk_wr_ok(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags) +{ + dev = dev; + nand_chunk = nand_chunk; + data = data; + tags = tags; +} + +static void yaffs_handle_chunk_update(struct yaffs_dev *dev, int nand_chunk, + const struct yaffs_ext_tags *tags) +{ + dev = dev; + nand_chunk = nand_chunk; + tags = tags; +} + +void yaffs_handle_chunk_error(struct yaffs_dev *dev, + struct yaffs_block_info *bi) +{ + if (!bi->gc_prioritise) { + bi->gc_prioritise = 1; + dev->has_pending_prioritised_gc = 1; + bi->chunk_error_strikes++; + + if (bi->chunk_error_strikes > 3) { + bi->needs_retiring = 1; /* Too many stikes, so retire this */ + yaffs_trace(YAFFS_TRACE_ALWAYS, "yaffs: Block struck out"); + + } + } +} + +static void yaffs_handle_chunk_wr_error(struct yaffs_dev *dev, int nand_chunk, + int erased_ok) +{ + int flash_block = nand_chunk / dev->param.chunks_per_block; + struct yaffs_block_info *bi = yaffs_get_block_info(dev, flash_block); + + yaffs_handle_chunk_error(dev, bi); + + if (erased_ok) { + /* Was an actual write failure, so mark the block for retirement */ + bi->needs_retiring = 1; + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>> Block %d needs retiring", flash_block); + } + + /* Delete the chunk */ + yaffs_chunk_del(dev, nand_chunk, 1, __LINE__); + yaffs_skip_rest_of_block(dev); +} + +/* + * Verification code + */ + +/* + * Simple hash function. Needs to have a reasonable spread + */ + +static inline int yaffs_hash_fn(int n) +{ + n = abs(n); + return n % YAFFS_NOBJECT_BUCKETS; +} + +/* + * Access functions to useful fake objects. + * Note that root might have a presence in NAND if permissions are set. 
+ */ + +struct yaffs_obj *yaffs_root(struct yaffs_dev *dev) +{ + return dev->root_dir; +} + +struct yaffs_obj *yaffs_lost_n_found(struct yaffs_dev *dev) +{ + return dev->lost_n_found; +} + +/* + * Erased NAND checking functions + */ + +int yaffs_check_ff(u8 * buffer, int n_bytes) +{ + /* Horrible, slow implementation */ + while (n_bytes--) { + if (*buffer != 0xFF) + return 0; + buffer++; + } + return 1; +} + +static int yaffs_check_chunk_erased(struct yaffs_dev *dev, int nand_chunk) +{ + int retval = YAFFS_OK; + u8 *data = yaffs_get_temp_buffer(dev, __LINE__); + struct yaffs_ext_tags tags; + int result; + + result = yaffs_rd_chunk_tags_nand(dev, nand_chunk, data, &tags); + + if (tags.ecc_result > YAFFS_ECC_RESULT_NO_ERROR) + retval = YAFFS_FAIL; + + if (!yaffs_check_ff(data, dev->data_bytes_per_chunk) || + tags.chunk_used) { + yaffs_trace(YAFFS_TRACE_NANDACCESS, "Chunk %d not erased", nand_chunk); + retval = YAFFS_FAIL; + } + + yaffs_release_temp_buffer(dev, data, __LINE__); + + return retval; + +} + +static int yaffs_verify_chunk_written(struct yaffs_dev *dev, + int nand_chunk, + const u8 * data, + struct yaffs_ext_tags *tags) +{ + int retval = YAFFS_OK; + struct yaffs_ext_tags temp_tags; + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + int result; + + result = yaffs_rd_chunk_tags_nand(dev, nand_chunk, buffer, &temp_tags); + if (memcmp(buffer, data, dev->data_bytes_per_chunk) || + temp_tags.obj_id != tags->obj_id || + temp_tags.chunk_id != tags->chunk_id || + temp_tags.n_bytes != tags->n_bytes) + retval = YAFFS_FAIL; + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + + return retval; +} + + +int yaffs_check_alloc_available(struct yaffs_dev *dev, int n_chunks) +{ + int reserved_chunks; + int reserved_blocks = dev->param.n_reserved_blocks; + int checkpt_blocks; + + checkpt_blocks = yaffs_calc_checkpt_blocks_required(dev); + + reserved_chunks = + ((reserved_blocks + checkpt_blocks) * dev->param.chunks_per_block); + + return (dev->n_free_chunks > (reserved_chunks + n_chunks)); +} + +static int yaffs_find_alloc_block(struct yaffs_dev *dev) +{ + int i; + + struct yaffs_block_info *bi; + + if (dev->n_erased_blocks < 1) { + /* Hoosterman we've got a problem. + * Can't get space to gc + */ + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: no more erased blocks" ); + + return -1; + } + + /* Find an empty block. 
*/ + + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + dev->alloc_block_finder++; + if (dev->alloc_block_finder < dev->internal_start_block + || dev->alloc_block_finder > dev->internal_end_block) { + dev->alloc_block_finder = dev->internal_start_block; + } + + bi = yaffs_get_block_info(dev, dev->alloc_block_finder); + + if (bi->block_state == YAFFS_BLOCK_STATE_EMPTY) { + bi->block_state = YAFFS_BLOCK_STATE_ALLOCATING; + dev->seq_number++; + bi->seq_number = dev->seq_number; + dev->n_erased_blocks--; + yaffs_trace(YAFFS_TRACE_ALLOCATE, + "Allocated block %d, seq %d, %d left" , + dev->alloc_block_finder, dev->seq_number, + dev->n_erased_blocks); + return dev->alloc_block_finder; + } + } + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs tragedy: no more erased blocks, but there should have been %d", + dev->n_erased_blocks); + + return -1; +} + +static int yaffs_alloc_chunk(struct yaffs_dev *dev, int use_reserver, + struct yaffs_block_info **block_ptr) +{ + int ret_val; + struct yaffs_block_info *bi; + + if (dev->alloc_block < 0) { + /* Get next block to allocate off */ + dev->alloc_block = yaffs_find_alloc_block(dev); + dev->alloc_page = 0; + } + + if (!use_reserver && !yaffs_check_alloc_available(dev, 1)) { + /* Not enough space to allocate unless we're allowed to use the reserve. */ + return -1; + } + + if (dev->n_erased_blocks < dev->param.n_reserved_blocks + && dev->alloc_page == 0) + yaffs_trace(YAFFS_TRACE_ALLOCATE, "Allocating reserve"); + + /* Next page please.... */ + if (dev->alloc_block >= 0) { + bi = yaffs_get_block_info(dev, dev->alloc_block); + + ret_val = (dev->alloc_block * dev->param.chunks_per_block) + + dev->alloc_page; + bi->pages_in_use++; + yaffs_set_chunk_bit(dev, dev->alloc_block, dev->alloc_page); + + dev->alloc_page++; + + dev->n_free_chunks--; + + /* If the block is full set the state to full */ + if (dev->alloc_page >= dev->param.chunks_per_block) { + bi->block_state = YAFFS_BLOCK_STATE_FULL; + dev->alloc_block = -1; + } + + if (block_ptr) + *block_ptr = bi; + + return ret_val; + } + + yaffs_trace(YAFFS_TRACE_ERROR, "!!!!!!!!! Allocator out !!!!!!!!!!!!!!!!!" ); + + return -1; +} + +static int yaffs_get_erased_chunks(struct yaffs_dev *dev) +{ + int n; + + n = dev->n_erased_blocks * dev->param.chunks_per_block; + + if (dev->alloc_block > 0) + n += (dev->param.chunks_per_block - dev->alloc_page); + + return n; + +} + +/* + * yaffs_skip_rest_of_block() skips over the rest of the allocation block + * if we don't want to write to it. + */ +void yaffs_skip_rest_of_block(struct yaffs_dev *dev) +{ + if (dev->alloc_block > 0) { + struct yaffs_block_info *bi = + yaffs_get_block_info(dev, dev->alloc_block); + if (bi->block_state == YAFFS_BLOCK_STATE_ALLOCATING) { + bi->block_state = YAFFS_BLOCK_STATE_FULL; + dev->alloc_block = -1; + } + } +} + +static int yaffs_write_new_chunk(struct yaffs_dev *dev, + const u8 * data, + struct yaffs_ext_tags *tags, int use_reserver) +{ + int attempts = 0; + int write_ok = 0; + int chunk; + + yaffs2_checkpt_invalidate(dev); + + do { + struct yaffs_block_info *bi = 0; + int erased_ok = 0; + + chunk = yaffs_alloc_chunk(dev, use_reserver, &bi); + if (chunk < 0) { + /* no space */ + break; + } + + /* First check this chunk is erased, if it needs + * checking. The checking policy (unless forced + * always on) is as follows: + * + * Check the first page we try to write in a block. + * If the check passes then we don't need to check any + * more. If the check fails, we check again... 
+ * If the block has been erased, we don't need to check. + * + * However, if the block has been prioritised for gc, + * then we think there might be something odd about + * this block and stop using it. + * + * Rationale: We should only ever see chunks that have + * not been erased if there was a partially written + * chunk due to power loss. This checking policy should + * catch that case with very few checks and thus save a + * lot of checks that are most likely not needed. + * + * Mods to the above + * If an erase check fails or the write fails we skip the + * rest of the block. + */ + + /* let's give it a try */ + attempts++; + + if (dev->param.always_check_erased) + bi->skip_erased_check = 0; + + if (!bi->skip_erased_check) { + erased_ok = yaffs_check_chunk_erased(dev, chunk); + if (erased_ok != YAFFS_OK) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs chunk %d was not erased", + chunk); + + /* If not erased, delete this one, + * skip rest of block and + * try another chunk */ + yaffs_chunk_del(dev, chunk, 1, __LINE__); + yaffs_skip_rest_of_block(dev); + continue; + } + } + + write_ok = yaffs_wr_chunk_tags_nand(dev, chunk, data, tags); + + if (!bi->skip_erased_check) + write_ok = + yaffs_verify_chunk_written(dev, chunk, data, tags); + + if (write_ok != YAFFS_OK) { + /* Clean up aborted write, skip to next block and + * try another chunk */ + yaffs_handle_chunk_wr_error(dev, chunk, erased_ok); + continue; + } + + bi->skip_erased_check = 1; + + /* Copy the data into the robustification buffer */ + yaffs_handle_chunk_wr_ok(dev, chunk, data, tags); + + } while (write_ok != YAFFS_OK && + (yaffs_wr_attempts <= 0 || attempts <= yaffs_wr_attempts)); + + if (!write_ok) + chunk = -1; + + if (attempts > 1) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs write required %d attempts", + attempts); + dev->n_retired_writes += (attempts - 1); + } + + return chunk; +} + +/* + * Block retiring for handling a broken block. 
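+ *
+ * The sequence below first tries yaffs_mark_bad(); if that fails it falls
+ * back to erasing the block and writing a chunk tagged with
+ * YAFFS_SEQUENCE_BAD_BLOCK as an in-band bad block marker. Either way the
+ * block ends up in the DEAD state and is never allocated again.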
+ */ + +static void yaffs_retire_block(struct yaffs_dev *dev, int flash_block) +{ + struct yaffs_block_info *bi = yaffs_get_block_info(dev, flash_block); + + yaffs2_checkpt_invalidate(dev); + + yaffs2_clear_oldest_dirty_seq(dev, bi); + + if (yaffs_mark_bad(dev, flash_block) != YAFFS_OK) { + if (yaffs_erase_block(dev, flash_block) != YAFFS_OK) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: Failed to mark bad and erase block %d", + flash_block); + } else { + struct yaffs_ext_tags tags; + int chunk_id = + flash_block * dev->param.chunks_per_block; + + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + + memset(buffer, 0xff, dev->data_bytes_per_chunk); + yaffs_init_tags(&tags); + tags.seq_number = YAFFS_SEQUENCE_BAD_BLOCK; + if (dev->param.write_chunk_tags_fn(dev, chunk_id - + dev->chunk_offset, + buffer, + &tags) != YAFFS_OK) + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: Failed to write bad block marker to block %d", + flash_block); + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + } + } + + bi->block_state = YAFFS_BLOCK_STATE_DEAD; + bi->gc_prioritise = 0; + bi->needs_retiring = 0; + + dev->n_retired_blocks++; +} + +/*---------------- Name handling functions ------------*/ + +static u16 yaffs_calc_name_sum(const YCHAR * name) +{ + u16 sum = 0; + u16 i = 1; + + const YUCHAR *bname = (const YUCHAR *)name; + if (bname) { + while ((*bname) && (i < (YAFFS_MAX_NAME_LENGTH / 2))) { + + /* 0x1f mask is case insensitive */ + sum += ((*bname) & 0x1f) * i; + i++; + bname++; + } + } + return sum; +} + +void yaffs_set_obj_name(struct yaffs_obj *obj, const YCHAR * name) +{ +#ifndef CONFIG_YAFFS_NO_SHORT_NAMES + memset(obj->short_name, 0, sizeof(obj->short_name)); + if (name && + strnlen(name, YAFFS_SHORT_NAME_LENGTH + 1) <= + YAFFS_SHORT_NAME_LENGTH) + strcpy(obj->short_name, name); + else + obj->short_name[0] = _Y('\0'); +#endif + obj->sum = yaffs_calc_name_sum(name); +} + +void yaffs_set_obj_name_from_oh(struct yaffs_obj *obj, + const struct yaffs_obj_hdr *oh) +{ +#ifdef CONFIG_YAFFS_AUTO_UNICODE + YCHAR tmp_name[YAFFS_MAX_NAME_LENGTH + 1]; + memset(tmp_name, 0, sizeof(tmp_name)); + yaffs_load_name_from_oh(obj->my_dev, tmp_name, oh->name, + YAFFS_MAX_NAME_LENGTH + 1); + yaffs_set_obj_name(obj, tmp_name); +#else + yaffs_set_obj_name(obj, oh->name); +#endif +} + +/*-------------------- TNODES ------------------- + + * List of spare tnodes + * The list is hooked together using the first pointer + * in the tnode. 
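+ *
+ * Level 0 tnodes pack their entries as tnode_width-bit fields in a u32 array
+ * (see yaffs_load_tnode_0() and yaffs_get_group_base() below). As a worked
+ * example with a hypothetical 18-bit tnode_width, entry 5 starts at bit 90,
+ * i.e. bit 26 of word 2, and its top 12 bits spill over into word 3.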
+ */ + +struct yaffs_tnode *yaffs_get_tnode(struct yaffs_dev *dev) +{ + struct yaffs_tnode *tn = yaffs_alloc_raw_tnode(dev); + if (tn) { + memset(tn, 0, dev->tnode_size); + dev->n_tnodes++; + } + + dev->checkpoint_blocks_required = 0; /* force recalculation */ + + return tn; +} + +/* FreeTnode frees up a tnode and puts it back on the free list */ +static void yaffs_free_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn) +{ + yaffs_free_raw_tnode(dev, tn); + dev->n_tnodes--; + dev->checkpoint_blocks_required = 0; /* force recalculation */ +} + +static void yaffs_deinit_tnodes_and_objs(struct yaffs_dev *dev) +{ + yaffs_deinit_raw_tnodes_and_objs(dev); + dev->n_obj = 0; + dev->n_tnodes = 0; +} + +void yaffs_load_tnode_0(struct yaffs_dev *dev, struct yaffs_tnode *tn, + unsigned pos, unsigned val) +{ + u32 *map = (u32 *) tn; + u32 bit_in_map; + u32 bit_in_word; + u32 word_in_map; + u32 mask; + + pos &= YAFFS_TNODES_LEVEL0_MASK; + val >>= dev->chunk_grp_bits; + + bit_in_map = pos * dev->tnode_width; + word_in_map = bit_in_map / 32; + bit_in_word = bit_in_map & (32 - 1); + + mask = dev->tnode_mask << bit_in_word; + + map[word_in_map] &= ~mask; + map[word_in_map] |= (mask & (val << bit_in_word)); + + if (dev->tnode_width > (32 - bit_in_word)) { + bit_in_word = (32 - bit_in_word); + word_in_map++;; + mask = + dev->tnode_mask >> ( /*dev->tnode_width - */ bit_in_word); + map[word_in_map] &= ~mask; + map[word_in_map] |= (mask & (val >> bit_in_word)); + } +} + +u32 yaffs_get_group_base(struct yaffs_dev *dev, struct yaffs_tnode *tn, + unsigned pos) +{ + u32 *map = (u32 *) tn; + u32 bit_in_map; + u32 bit_in_word; + u32 word_in_map; + u32 val; + + pos &= YAFFS_TNODES_LEVEL0_MASK; + + bit_in_map = pos * dev->tnode_width; + word_in_map = bit_in_map / 32; + bit_in_word = bit_in_map & (32 - 1); + + val = map[word_in_map] >> bit_in_word; + + if (dev->tnode_width > (32 - bit_in_word)) { + bit_in_word = (32 - bit_in_word); + word_in_map++;; + val |= (map[word_in_map] << bit_in_word); + } + + val &= dev->tnode_mask; + val <<= dev->chunk_grp_bits; + + return val; +} + +/* ------------------- End of individual tnode manipulation -----------------*/ + +/* ---------Functions to manipulate the look-up tree (made up of tnodes) ------ + * The look up tree is represented by the top tnode and the number of top_level + * in the tree. 0 means only the level 0 tnode is in the tree. + */ + +/* FindLevel0Tnode finds the level 0 tnode, if one exists. */ +struct yaffs_tnode *yaffs_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id) +{ + struct yaffs_tnode *tn = file_struct->top; + u32 i; + int required_depth; + int level = file_struct->top_level; + + dev = dev; + + /* Check sane level and chunk Id */ + if (level < 0 || level > YAFFS_TNODES_MAX_LEVEL) + return NULL; + + if (chunk_id > YAFFS_MAX_CHUNK_ID) + return NULL; + + /* First check we're tall enough (ie enough top_level) */ + + i = chunk_id >> YAFFS_TNODES_LEVEL0_BITS; + required_depth = 0; + while (i) { + i >>= YAFFS_TNODES_INTERNAL_BITS; + required_depth++; + } + + if (required_depth > file_struct->top_level) + return NULL; /* Not tall enough, so we can't find it */ + + /* Traverse down to level 0 */ + while (level > 0 && tn) { + tn = tn->internal[(chunk_id >> + (YAFFS_TNODES_LEVEL0_BITS + + (level - 1) * + YAFFS_TNODES_INTERNAL_BITS)) & + YAFFS_TNODES_INTERNAL_MASK]; + level--; + } + + return tn; +} + +/* AddOrFindLevel0Tnode finds the level 0 tnode if it exists, otherwise first expands the tree. 
+ * This happens in two steps: + * 1. If the tree isn't tall enough, then make it taller. + * 2. Scan down the tree towards the level 0 tnode adding tnodes if required. + * + * Used when modifying the tree. + * + * If the tn argument is NULL, then a fresh tnode will be added otherwise the specified tn will + * be plugged into the ttree. + */ + +struct yaffs_tnode *yaffs_add_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id, + struct yaffs_tnode *passed_tn) +{ + int required_depth; + int i; + int l; + struct yaffs_tnode *tn; + + u32 x; + + /* Check sane level and page Id */ + if (file_struct->top_level < 0 + || file_struct->top_level > YAFFS_TNODES_MAX_LEVEL) + return NULL; + + if (chunk_id > YAFFS_MAX_CHUNK_ID) + return NULL; + + /* First check we're tall enough (ie enough top_level) */ + + x = chunk_id >> YAFFS_TNODES_LEVEL0_BITS; + required_depth = 0; + while (x) { + x >>= YAFFS_TNODES_INTERNAL_BITS; + required_depth++; + } + + if (required_depth > file_struct->top_level) { + /* Not tall enough, gotta make the tree taller */ + for (i = file_struct->top_level; i < required_depth; i++) { + + tn = yaffs_get_tnode(dev); + + if (tn) { + tn->internal[0] = file_struct->top; + file_struct->top = tn; + file_struct->top_level++; + } else { + yaffs_trace(YAFFS_TRACE_ERROR, "yaffs: no more tnodes"); + return NULL; + } + } + } + + /* Traverse down to level 0, adding anything we need */ + + l = file_struct->top_level; + tn = file_struct->top; + + if (l > 0) { + while (l > 0 && tn) { + x = (chunk_id >> + (YAFFS_TNODES_LEVEL0_BITS + + (l - 1) * YAFFS_TNODES_INTERNAL_BITS)) & + YAFFS_TNODES_INTERNAL_MASK; + + if ((l > 1) && !tn->internal[x]) { + /* Add missing non-level-zero tnode */ + tn->internal[x] = yaffs_get_tnode(dev); + if (!tn->internal[x]) + return NULL; + } else if (l == 1) { + /* Looking from level 1 at level 0 */ + if (passed_tn) { + /* If we already have one, then release it. */ + if (tn->internal[x]) + yaffs_free_tnode(dev, + tn-> + internal[x]); + tn->internal[x] = passed_tn; + + } else if (!tn->internal[x]) { + /* Don't have one, none passed in */ + tn->internal[x] = yaffs_get_tnode(dev); + if (!tn->internal[x]) + return NULL; + } + } + + tn = tn->internal[x]; + l--; + } + } else { + /* top is level 0 */ + if (passed_tn) { + memcpy(tn, passed_tn, + (dev->tnode_width * YAFFS_NTNODES_LEVEL0) / 8); + yaffs_free_tnode(dev, passed_tn); + } + } + + return tn; +} + +static int yaffs_tags_match(const struct yaffs_ext_tags *tags, int obj_id, + int chunk_obj) +{ + return (tags->chunk_id == chunk_obj && + tags->obj_id == obj_id && !tags->is_deleted) ? 
1 : 0; + +} + +static int yaffs_find_chunk_in_group(struct yaffs_dev *dev, int the_chunk, + struct yaffs_ext_tags *tags, int obj_id, + int inode_chunk) +{ + int j; + + for (j = 0; the_chunk && j < dev->chunk_grp_size; j++) { + if (yaffs_check_chunk_bit + (dev, the_chunk / dev->param.chunks_per_block, + the_chunk % dev->param.chunks_per_block)) { + + if (dev->chunk_grp_size == 1) + return the_chunk; + else { + yaffs_rd_chunk_tags_nand(dev, the_chunk, NULL, + tags); + if (yaffs_tags_match(tags, obj_id, inode_chunk)) { + /* found it; */ + return the_chunk; + } + } + } + the_chunk++; + } + return -1; +} + +static int yaffs_find_chunk_in_file(struct yaffs_obj *in, int inode_chunk, + struct yaffs_ext_tags *tags) +{ + /*Get the Tnode, then get the level 0 offset chunk offset */ + struct yaffs_tnode *tn; + int the_chunk = -1; + struct yaffs_ext_tags local_tags; + int ret_val = -1; + + struct yaffs_dev *dev = in->my_dev; + + if (!tags) { + /* Passed a NULL, so use our own tags space */ + tags = &local_tags; + } + + tn = yaffs_find_tnode_0(dev, &in->variant.file_variant, inode_chunk); + + if (tn) { + the_chunk = yaffs_get_group_base(dev, tn, inode_chunk); + + ret_val = + yaffs_find_chunk_in_group(dev, the_chunk, tags, in->obj_id, + inode_chunk); + } + return ret_val; +} + +static int yaffs_find_del_file_chunk(struct yaffs_obj *in, int inode_chunk, + struct yaffs_ext_tags *tags) +{ + /* Get the Tnode, then get the level 0 offset chunk offset */ + struct yaffs_tnode *tn; + int the_chunk = -1; + struct yaffs_ext_tags local_tags; + + struct yaffs_dev *dev = in->my_dev; + int ret_val = -1; + + if (!tags) { + /* Passed a NULL, so use our own tags space */ + tags = &local_tags; + } + + tn = yaffs_find_tnode_0(dev, &in->variant.file_variant, inode_chunk); + + if (tn) { + + the_chunk = yaffs_get_group_base(dev, tn, inode_chunk); + + ret_val = + yaffs_find_chunk_in_group(dev, the_chunk, tags, in->obj_id, + inode_chunk); + + /* Delete the entry in the filestructure (if found) */ + if (ret_val != -1) + yaffs_load_tnode_0(dev, tn, inode_chunk, 0); + } + + return ret_val; +} + +int yaffs_put_chunk_in_file(struct yaffs_obj *in, int inode_chunk, + int nand_chunk, int in_scan) +{ + /* NB in_scan is zero unless scanning. + * For forward scanning, in_scan is > 0; + * for backward scanning in_scan is < 0 + * + * nand_chunk = 0 is a dummy insert to make sure the tnodes are there. + */ + + struct yaffs_tnode *tn; + struct yaffs_dev *dev = in->my_dev; + int existing_cunk; + struct yaffs_ext_tags existing_tags; + struct yaffs_ext_tags new_tags; + unsigned existing_serial, new_serial; + + if (in->variant_type != YAFFS_OBJECT_TYPE_FILE) { + /* Just ignore an attempt at putting a chunk into a non-file during scanning + * If it is not during Scanning then something went wrong! + */ + if (!in_scan) { + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy:attempt to put data chunk into a non-file" + ); + YBUG(); + } + + yaffs_chunk_del(dev, nand_chunk, 1, __LINE__); + return YAFFS_OK; + } + + tn = yaffs_add_find_tnode_0(dev, + &in->variant.file_variant, + inode_chunk, NULL); + if (!tn) + return YAFFS_FAIL; + + if (!nand_chunk) + /* Dummy insert, bail now */ + return YAFFS_OK; + + existing_cunk = yaffs_get_group_base(dev, tn, inode_chunk); + + if (in_scan != 0) { + /* If we're scanning then we need to test for duplicates + * NB This does not need to be efficient since it should only ever + * happen when the power fails during a write, then only one + * chunk should ever be affected. 
+ * + * Correction for YAFFS2: This could happen quite a lot and we need to think about efficiency! TODO + * Update: For backward scanning we don't need to re-read tags so this is quite cheap. + */ + + if (existing_cunk > 0) { + /* NB Right now existing chunk will not be real chunk_id if the chunk group size > 1 + * thus we have to do a FindChunkInFile to get the real chunk id. + * + * We have a duplicate now we need to decide which one to use: + * + * Backwards scanning YAFFS2: The old one is what we use, dump the new one. + * Forward scanning YAFFS2: The new one is what we use, dump the old one. + * YAFFS1: Get both sets of tags and compare serial numbers. + */ + + if (in_scan > 0) { + /* Only do this for forward scanning */ + yaffs_rd_chunk_tags_nand(dev, + nand_chunk, + NULL, &new_tags); + + /* Do a proper find */ + existing_cunk = + yaffs_find_chunk_in_file(in, inode_chunk, + &existing_tags); + } + + if (existing_cunk <= 0) { + /*Hoosterman - how did this happen? */ + + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: existing chunk < 0 in scan" + ); + + } + + /* NB The deleted flags should be false, otherwise the chunks will + * not be loaded during a scan + */ + + if (in_scan > 0) { + new_serial = new_tags.serial_number; + existing_serial = existing_tags.serial_number; + } + + if ((in_scan > 0) && + (existing_cunk <= 0 || + ((existing_serial + 1) & 3) == new_serial)) { + /* Forward scanning. + * Use new + * Delete the old one and drop through to update the tnode + */ + yaffs_chunk_del(dev, existing_cunk, 1, + __LINE__); + } else { + /* Backward scanning or we want to use the existing one + * Use existing. + * Delete the new one and return early so that the tnode isn't changed + */ + yaffs_chunk_del(dev, nand_chunk, 1, __LINE__); + return YAFFS_OK; + } + } + + } + + if (existing_cunk == 0) + in->n_data_chunks++; + + yaffs_load_tnode_0(dev, tn, inode_chunk, nand_chunk); + + return YAFFS_OK; +} + +static void yaffs_soft_del_chunk(struct yaffs_dev *dev, int chunk) +{ + struct yaffs_block_info *the_block; + unsigned block_no; + + yaffs_trace(YAFFS_TRACE_DELETION, "soft delete chunk %d", chunk); + + block_no = chunk / dev->param.chunks_per_block; + the_block = yaffs_get_block_info(dev, block_no); + if (the_block) { + the_block->soft_del_pages++; + dev->n_free_chunks++; + yaffs2_update_oldest_dirty_seq(dev, block_no, the_block); + } +} + +/* SoftDeleteWorker scans backwards through the tnode tree and soft deletes all the chunks in the file. + * All soft deleting does is increment the block's softdelete count and pulls the chunk out + * of the tnode. + * Thus, essentially this is the same as DeleteWorker except that the chunks are soft deleted. + */ + +static int yaffs_soft_del_worker(struct yaffs_obj *in, struct yaffs_tnode *tn, + u32 level, int chunk_offset) +{ + int i; + int the_chunk; + int all_done = 1; + struct yaffs_dev *dev = in->my_dev; + + if (tn) { + if (level > 0) { + + for (i = YAFFS_NTNODES_INTERNAL - 1; all_done && i >= 0; + i--) { + if (tn->internal[i]) { + all_done = + yaffs_soft_del_worker(in, + tn->internal + [i], + level - 1, + (chunk_offset + << + YAFFS_TNODES_INTERNAL_BITS) + + i); + if (all_done) { + yaffs_free_tnode(dev, + tn->internal + [i]); + tn->internal[i] = NULL; + } else { + /* Hoosterman... how could this happen? */ + } + } + } + return (all_done) ? 
1 : 0; + } else if (level == 0) { + + for (i = YAFFS_NTNODES_LEVEL0 - 1; i >= 0; i--) { + the_chunk = yaffs_get_group_base(dev, tn, i); + if (the_chunk) { + /* Note this does not find the real chunk, only the chunk group. + * We make an assumption that a chunk group is not larger than + * a block. + */ + yaffs_soft_del_chunk(dev, the_chunk); + yaffs_load_tnode_0(dev, tn, i, 0); + } + + } + return 1; + + } + + } + + return 1; + +} + +static void yaffs_remove_obj_from_dir(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev = obj->my_dev; + struct yaffs_obj *parent; + + yaffs_verify_obj_in_dir(obj); + parent = obj->parent; + + yaffs_verify_dir(parent); + + if (dev && dev->param.remove_obj_fn) + dev->param.remove_obj_fn(obj); + + list_del_init(&obj->siblings); + obj->parent = NULL; + + yaffs_verify_dir(parent); +} + +void yaffs_add_obj_to_dir(struct yaffs_obj *directory, struct yaffs_obj *obj) +{ + if (!directory) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: Trying to add an object to a null pointer directory" + ); + YBUG(); + return; + } + if (directory->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: Trying to add an object to a non-directory" + ); + YBUG(); + } + + if (obj->siblings.prev == NULL) { + /* Not initialised */ + YBUG(); + } + + yaffs_verify_dir(directory); + + yaffs_remove_obj_from_dir(obj); + + /* Now add it */ + list_add(&obj->siblings, &directory->variant.dir_variant.children); + obj->parent = directory; + + if (directory == obj->my_dev->unlinked_dir + || directory == obj->my_dev->del_dir) { + obj->unlinked = 1; + obj->my_dev->n_unlinked_files++; + obj->rename_allowed = 0; + } + + yaffs_verify_dir(directory); + yaffs_verify_obj_in_dir(obj); +} + +static int yaffs_change_obj_name(struct yaffs_obj *obj, + struct yaffs_obj *new_dir, + const YCHAR * new_name, int force, int shadows) +{ + int unlink_op; + int del_op; + + struct yaffs_obj *existing_target; + + if (new_dir == NULL) + new_dir = obj->parent; /* use the old directory */ + + if (new_dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: yaffs_change_obj_name: new_dir is not a directory" + ); + YBUG(); + } + + /* TODO: Do we need this different handling for YAFFS2 and YAFFS1?? */ + if (obj->my_dev->param.is_yaffs2) + unlink_op = (new_dir == obj->my_dev->unlinked_dir); + else + unlink_op = (new_dir == obj->my_dev->unlinked_dir + && obj->variant_type == YAFFS_OBJECT_TYPE_FILE); + + del_op = (new_dir == obj->my_dev->del_dir); + + existing_target = yaffs_find_by_name(new_dir, new_name); + + /* If the object is a file going into the unlinked directory, + * then it is OK to just stuff it in since duplicate names are allowed. + * else only proceed if the new name does not exist and if we're putting + * it into a directory. + */ + if ((unlink_op || + del_op || + force || + (shadows > 0) || + !existing_target) && + new_dir->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_set_obj_name(obj, new_name); + obj->dirty = 1; + + yaffs_add_obj_to_dir(new_dir, obj); + + if (unlink_op) + obj->unlinked = 1; + + /* If it is a deletion then we mark it as a shrink for gc purposes. 
*/ + if (yaffs_update_oh(obj, new_name, 0, del_op, shadows, NULL) >= + 0) + return YAFFS_OK; + } + + return YAFFS_FAIL; +} + +/*------------------------ Short Operations Cache ---------------------------------------- + * In many situations where there is no high level buffering a lot of + * reads might be short sequential reads, and a lot of writes may be short + * sequential writes. eg. scanning/writing a jpeg file. + * In these cases, a short read/write cache can provide a huge perfomance + * benefit with dumb-as-a-rock code. + * In Linux, the page cache provides read buffering and the short op cache + * provides write buffering. + * + * There are a limited number (~10) of cache chunks per device so that we don't + * need a very intelligent search. + */ + +static int yaffs_obj_cache_dirty(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev = obj->my_dev; + int i; + struct yaffs_cache *cache; + int n_caches = obj->my_dev->param.n_caches; + + for (i = 0; i < n_caches; i++) { + cache = &dev->cache[i]; + if (cache->object == obj && cache->dirty) + return 1; + } + + return 0; +} + +static void yaffs_flush_file_cache(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev = obj->my_dev; + int lowest = -99; /* Stop compiler whining. */ + int i; + struct yaffs_cache *cache; + int chunk_written = 0; + int n_caches = obj->my_dev->param.n_caches; + + if (n_caches > 0) { + do { + cache = NULL; + + /* Find the dirty cache for this object with the lowest chunk id. */ + for (i = 0; i < n_caches; i++) { + if (dev->cache[i].object == obj && + dev->cache[i].dirty) { + if (!cache + || dev->cache[i].chunk_id < + lowest) { + cache = &dev->cache[i]; + lowest = cache->chunk_id; + } + } + } + + if (cache && !cache->locked) { + /* Write it out and free it up */ + + chunk_written = + yaffs_wr_data_obj(cache->object, + cache->chunk_id, + cache->data, + cache->n_bytes, 1); + cache->dirty = 0; + cache->object = NULL; + } + + } while (cache && chunk_written > 0); + + if (cache) + /* Hoosterman, disk full while writing cache out. */ + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: no space during cache write"); + + } + +} + +/*yaffs_flush_whole_cache(dev) + * + * + */ + +void yaffs_flush_whole_cache(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + int n_caches = dev->param.n_caches; + int i; + + /* Find a dirty object in the cache and flush it... + * until there are no further dirty objects. + */ + do { + obj = NULL; + for (i = 0; i < n_caches && !obj; i++) { + if (dev->cache[i].object && dev->cache[i].dirty) + obj = dev->cache[i].object; + + } + if (obj) + yaffs_flush_file_cache(obj); + + } while (obj); + +} + +/* Grab us a cache chunk for use. + * First look for an empty one. + * Then look for the least recently used non-dirty one. + * Then look for the least recently used dirty one...., flush and look again. + */ +static struct yaffs_cache *yaffs_grab_chunk_worker(struct yaffs_dev *dev) +{ + int i; + + if (dev->param.n_caches > 0) { + for (i = 0; i < dev->param.n_caches; i++) { + if (!dev->cache[i].object) + return &dev->cache[i]; + } + } + + return NULL; +} + +static struct yaffs_cache *yaffs_grab_chunk_cache(struct yaffs_dev *dev) +{ + struct yaffs_cache *cache; + struct yaffs_obj *the_obj; + int usage; + int i; + int pushout; + + if (dev->param.n_caches > 0) { + /* Try find a non-dirty one... */ + + cache = yaffs_grab_chunk_worker(dev); + + if (!cache) { + /* They were all dirty, find the last recently used object and flush + * its cache, then find again. 
+			 * NB what's here is not very accurate: we actually flush the object
+			 * that owns the least recently used page.
+			 */
+
+			/* With locking we can't assume we can use entry zero */
+
+			the_obj = NULL;
+			usage = -1;
+			cache = NULL;
+			pushout = -1;
+
+			for (i = 0; i < dev->param.n_caches; i++) {
+				if (dev->cache[i].object &&
+				    !dev->cache[i].locked &&
+				    (dev->cache[i].last_use < usage
+				     || !cache)) {
+					usage = dev->cache[i].last_use;
+					the_obj = dev->cache[i].object;
+					cache = &dev->cache[i];
+					pushout = i;
+				}
+			}
+
+			if (!cache || cache->dirty) {
+				/* Flush and try again */
+				yaffs_flush_file_cache(the_obj);
+				cache = yaffs_grab_chunk_worker(dev);
+			}
+
+		}
+		return cache;
+	} else {
+		return NULL;
+	}
+}
+
+/* Find a cached chunk */
+static struct yaffs_cache *yaffs_find_chunk_cache(const struct yaffs_obj *obj,
+						  int chunk_id)
+{
+	struct yaffs_dev *dev = obj->my_dev;
+	int i;
+	if (dev->param.n_caches > 0) {
+		for (i = 0; i < dev->param.n_caches; i++) {
+			if (dev->cache[i].object == obj &&
+			    dev->cache[i].chunk_id == chunk_id) {
+				dev->cache_hits++;
+
+				return &dev->cache[i];
+			}
+		}
+	}
+	return NULL;
+}
+
+/* Mark the chunk for the least recently used algorithm */
+static void yaffs_use_cache(struct yaffs_dev *dev, struct yaffs_cache *cache,
+			    int is_write)
+{
+
+	if (dev->param.n_caches > 0) {
+		if (dev->cache_last_use < 0 || dev->cache_last_use > 100000000) {
+			/* Reset the cache usages */
+			int i;
+			for (i = 1; i < dev->param.n_caches; i++)
+				dev->cache[i].last_use = 0;
+
+			dev->cache_last_use = 0;
+		}
+
+		dev->cache_last_use++;
+
+		cache->last_use = dev->cache_last_use;
+
+		if (is_write)
+			cache->dirty = 1;
+	}
+}
+
+/* Invalidate a single cache page.
+ * Do this when a whole page gets written,
+ * ie the short cache for this page is no longer valid.
+ */
+static void yaffs_invalidate_chunk_cache(struct yaffs_obj *object, int chunk_id)
+{
+	if (object->my_dev->param.n_caches > 0) {
+		struct yaffs_cache *cache =
+		    yaffs_find_chunk_cache(object, chunk_id);
+
+		if (cache)
+			cache->object = NULL;
+	}
+}
+
+/* Invalidate all the cache pages associated with this object
+ * Do this whenever the file is deleted or resized.
+ */
+static void yaffs_invalidate_whole_cache(struct yaffs_obj *in)
+{
+	int i;
+	struct yaffs_dev *dev = in->my_dev;
+
+	if (dev->param.n_caches > 0) {
+		/* Invalidate it. */
+		for (i = 0; i < dev->param.n_caches; i++) {
+			if (dev->cache[i].object == in)
+				dev->cache[i].object = NULL;
+		}
+	}
+}
+
+static void yaffs_unhash_obj(struct yaffs_obj *obj)
+{
+	int bucket;
+	struct yaffs_dev *dev = obj->my_dev;
+
+	/* If it is still linked into the bucket list, free from the list */
+	if (!list_empty(&obj->hash_link)) {
+		list_del_init(&obj->hash_link);
+		bucket = yaffs_hash_fn(obj->obj_id);
+		dev->obj_bucket[bucket].count--;
+	}
+}
+
+/* FreeObject frees up an Object and puts it back on the free list */
+static void yaffs_free_obj(struct yaffs_obj *obj)
+{
+	struct yaffs_dev *dev = obj->my_dev;
+
+	yaffs_trace(YAFFS_TRACE_OS, "FreeObject %p inode %p",
+		obj, obj->my_inode);
+
+	if (!obj)
+		YBUG();
+	if (obj->parent)
+		YBUG();
+	if (!list_empty(&obj->siblings))
+		YBUG();
+
+	if (obj->my_inode) {
+		/* We're still hooked up to a cached inode.
+ * Don't delete now, but mark for later deletion + */ + obj->defered_free = 1; + return; + } + + yaffs_unhash_obj(obj); + + yaffs_free_raw_obj(dev, obj); + dev->n_obj--; + dev->checkpoint_blocks_required = 0; /* force recalculation */ +} + +void yaffs_handle_defered_free(struct yaffs_obj *obj) +{ + if (obj->defered_free) + yaffs_free_obj(obj); +} + +static int yaffs_generic_obj_del(struct yaffs_obj *in) +{ + + /* First off, invalidate the file's data in the cache, without flushing. */ + yaffs_invalidate_whole_cache(in); + + if (in->my_dev->param.is_yaffs2 && (in->parent != in->my_dev->del_dir)) { + /* Move to the unlinked directory so we have a record that it was deleted. */ + yaffs_change_obj_name(in, in->my_dev->del_dir, _Y("deleted"), 0, + 0); + + } + + yaffs_remove_obj_from_dir(in); + yaffs_chunk_del(in->my_dev, in->hdr_chunk, 1, __LINE__); + in->hdr_chunk = 0; + + yaffs_free_obj(in); + return YAFFS_OK; + +} + +static void yaffs_soft_del_file(struct yaffs_obj *obj) +{ + if (obj->deleted && + obj->variant_type == YAFFS_OBJECT_TYPE_FILE && !obj->soft_del) { + if (obj->n_data_chunks <= 0) { + /* Empty file with no duplicate object headers, + * just delete it immediately */ + yaffs_free_tnode(obj->my_dev, + obj->variant.file_variant.top); + obj->variant.file_variant.top = NULL; + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: Deleting empty file %d", + obj->obj_id); + yaffs_generic_obj_del(obj); + } else { + yaffs_soft_del_worker(obj, + obj->variant.file_variant.top, + obj->variant. + file_variant.top_level, 0); + obj->soft_del = 1; + } + } +} + +/* Pruning removes any part of the file structure tree that is beyond the + * bounds of the file (ie that does not point to chunks). + * + * A file should only get pruned when its size is reduced. + * + * Before pruning, the chunks must be pulled from the tree and the + * level 0 tnode entries must be zeroed out. + * Could also use this for file deletion, but that's probably better handled + * by a special case. + * + * This function is recursive. For levels > 0 the function is called again on + * any sub-tree. For level == 0 we just check if the sub-tree has data. + * If there is no data in a subtree then it is pruned. + */ + +static struct yaffs_tnode *yaffs_prune_worker(struct yaffs_dev *dev, + struct yaffs_tnode *tn, u32 level, + int del0) +{ + int i; + int has_data; + + if (tn) { + has_data = 0; + + if (level > 0) { + for (i = 0; i < YAFFS_NTNODES_INTERNAL; i++) { + if (tn->internal[i]) { + tn->internal[i] = + yaffs_prune_worker(dev, + tn->internal[i], + level - 1, + (i == + 0) ? del0 : 1); + } + + if (tn->internal[i]) + has_data++; + } + } else { + int tnode_size_u32 = dev->tnode_size / sizeof(u32); + u32 *map = (u32 *) tn; + + for (i = 0; !has_data && i < tnode_size_u32; i++) { + if (map[i]) + has_data++; + } + } + + if (has_data == 0 && del0) { + /* Free and return NULL */ + + yaffs_free_tnode(dev, tn); + tn = NULL; + } + + } + + return tn; + +} + +static int yaffs_prune_tree(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct) +{ + int i; + int has_data; + int done = 0; + struct yaffs_tnode *tn; + + if (file_struct->top_level > 0) { + file_struct->top = + yaffs_prune_worker(dev, file_struct->top, + file_struct->top_level, 0); + + /* Now we have a tree with all the non-zero branches NULL but the height + * is the same as it was. + * Let's see if we can trim internal tnodes to shorten the tree. 
+ * We can do this if only the 0th element in the tnode is in use + * (ie all the non-zero are NULL) + */ + + while (file_struct->top_level && !done) { + tn = file_struct->top; + + has_data = 0; + for (i = 1; i < YAFFS_NTNODES_INTERNAL; i++) { + if (tn->internal[i]) + has_data++; + } + + if (!has_data) { + file_struct->top = tn->internal[0]; + file_struct->top_level--; + yaffs_free_tnode(dev, tn); + } else { + done = 1; + } + } + } + + return YAFFS_OK; +} + +/*-------------------- End of File Structure functions.-------------------*/ + +/* AllocateEmptyObject gets us a clean Object. Tries to make allocate more if we run out */ +static struct yaffs_obj *yaffs_alloc_empty_obj(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj = yaffs_alloc_raw_obj(dev); + + if (obj) { + dev->n_obj++; + + /* Now sweeten it up... */ + + memset(obj, 0, sizeof(struct yaffs_obj)); + obj->being_created = 1; + + obj->my_dev = dev; + obj->hdr_chunk = 0; + obj->variant_type = YAFFS_OBJECT_TYPE_UNKNOWN; + INIT_LIST_HEAD(&(obj->hard_links)); + INIT_LIST_HEAD(&(obj->hash_link)); + INIT_LIST_HEAD(&obj->siblings); + + /* Now make the directory sane */ + if (dev->root_dir) { + obj->parent = dev->root_dir; + list_add(&(obj->siblings), + &dev->root_dir->variant.dir_variant.children); + } + + /* Add it to the lost and found directory. + * NB Can't put root or lost-n-found in lost-n-found so + * check if lost-n-found exists first + */ + if (dev->lost_n_found) + yaffs_add_obj_to_dir(dev->lost_n_found, obj); + + obj->being_created = 0; + } + + dev->checkpoint_blocks_required = 0; /* force recalculation */ + + return obj; +} + +static int yaffs_find_nice_bucket(struct yaffs_dev *dev) +{ + int i; + int l = 999; + int lowest = 999999; + + /* Search for the shortest list or one that + * isn't too long. + */ + + for (i = 0; i < 10 && lowest > 4; i++) { + dev->bucket_finder++; + dev->bucket_finder %= YAFFS_NOBJECT_BUCKETS; + if (dev->obj_bucket[dev->bucket_finder].count < lowest) { + lowest = dev->obj_bucket[dev->bucket_finder].count; + l = dev->bucket_finder; + } + + } + + return l; +} + +static int yaffs_new_obj_id(struct yaffs_dev *dev) +{ + int bucket = yaffs_find_nice_bucket(dev); + + /* Now find an object value that has not already been taken + * by scanning the list. 
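+	 * Candidate ids are bucket + k * YAFFS_NOBJECT_BUCKETS (k >= 1), so every
+	 * candidate hashes back to the same bucket and only that one bucket list
+	 * needs to be scanned for collisions.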
+ */ + + int found = 0; + struct list_head *i; + + u32 n = (u32) bucket; + + /* yaffs_check_obj_hash_sane(); */ + + while (!found) { + found = 1; + n += YAFFS_NOBJECT_BUCKETS; + if (1 || dev->obj_bucket[bucket].count > 0) { + list_for_each(i, &dev->obj_bucket[bucket].list) { + /* If there is already one in the list */ + if (i && list_entry(i, struct yaffs_obj, + hash_link)->obj_id == n) { + found = 0; + } + } + } + } + + return n; +} + +static void yaffs_hash_obj(struct yaffs_obj *in) +{ + int bucket = yaffs_hash_fn(in->obj_id); + struct yaffs_dev *dev = in->my_dev; + + list_add(&in->hash_link, &dev->obj_bucket[bucket].list); + dev->obj_bucket[bucket].count++; +} + +struct yaffs_obj *yaffs_find_by_number(struct yaffs_dev *dev, u32 number) +{ + int bucket = yaffs_hash_fn(number); + struct list_head *i; + struct yaffs_obj *in; + + list_for_each(i, &dev->obj_bucket[bucket].list) { + /* Look if it is in the list */ + if (i) { + in = list_entry(i, struct yaffs_obj, hash_link); + if (in->obj_id == number) { + + /* Don't tell the VFS about this one if it is defered free */ + if (in->defered_free) + return NULL; + + return in; + } + } + } + + return NULL; +} + +struct yaffs_obj *yaffs_new_obj(struct yaffs_dev *dev, int number, + enum yaffs_obj_type type) +{ + struct yaffs_obj *the_obj = NULL; + struct yaffs_tnode *tn = NULL; + + if (number < 0) + number = yaffs_new_obj_id(dev); + + if (type == YAFFS_OBJECT_TYPE_FILE) { + tn = yaffs_get_tnode(dev); + if (!tn) + return NULL; + } + + the_obj = yaffs_alloc_empty_obj(dev); + if (!the_obj) { + if (tn) + yaffs_free_tnode(dev, tn); + return NULL; + } + + if (the_obj) { + the_obj->fake = 0; + the_obj->rename_allowed = 1; + the_obj->unlink_allowed = 1; + the_obj->obj_id = number; + yaffs_hash_obj(the_obj); + the_obj->variant_type = type; + yaffs_load_current_time(the_obj, 1, 1); + + switch (type) { + case YAFFS_OBJECT_TYPE_FILE: + the_obj->variant.file_variant.file_size = 0; + the_obj->variant.file_variant.scanned_size = 0; + the_obj->variant.file_variant.shrink_size = ~0; /* max */ + the_obj->variant.file_variant.top_level = 0; + the_obj->variant.file_variant.top = tn; + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + INIT_LIST_HEAD(&the_obj->variant.dir_variant.children); + INIT_LIST_HEAD(&the_obj->variant.dir_variant.dirty); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + case YAFFS_OBJECT_TYPE_HARDLINK: + case YAFFS_OBJECT_TYPE_SPECIAL: + /* No action required */ + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* todo this should not happen */ + break; + } + } + + return the_obj; +} + +static struct yaffs_obj *yaffs_create_fake_dir(struct yaffs_dev *dev, + int number, u32 mode) +{ + + struct yaffs_obj *obj = + yaffs_new_obj(dev, number, YAFFS_OBJECT_TYPE_DIRECTORY); + if (obj) { + obj->fake = 1; /* it is fake so it might have no NAND presence... */ + obj->rename_allowed = 0; /* ... and we're not allowed to rename it... */ + obj->unlink_allowed = 0; /* ... or unlink it */ + obj->deleted = 0; + obj->unlinked = 0; + obj->yst_mode = mode; + obj->my_dev = dev; + obj->hdr_chunk = 0; /* Not a valid chunk. 
*/ + } + + return obj; + +} + + +static void yaffs_init_tnodes_and_objs(struct yaffs_dev *dev) +{ + int i; + + dev->n_obj = 0; + dev->n_tnodes = 0; + + yaffs_init_raw_tnodes_and_objs(dev); + + for (i = 0; i < YAFFS_NOBJECT_BUCKETS; i++) { + INIT_LIST_HEAD(&dev->obj_bucket[i].list); + dev->obj_bucket[i].count = 0; + } +} + +struct yaffs_obj *yaffs_find_or_create_by_number(struct yaffs_dev *dev, + int number, + enum yaffs_obj_type type) +{ + struct yaffs_obj *the_obj = NULL; + + if (number > 0) + the_obj = yaffs_find_by_number(dev, number); + + if (!the_obj) + the_obj = yaffs_new_obj(dev, number, type); + + return the_obj; + +} + +YCHAR *yaffs_clone_str(const YCHAR * str) +{ + YCHAR *new_str = NULL; + int len; + + if (!str) + str = _Y(""); + + len = strnlen(str, YAFFS_MAX_ALIAS_LENGTH); + new_str = kmalloc((len + 1) * sizeof(YCHAR), GFP_NOFS); + if (new_str) { + strncpy(new_str, str, len); + new_str[len] = 0; + } + return new_str; + +} +/* + *yaffs_update_parent() handles fixing a directories mtime and ctime when a new + * link (ie. name) is created or deleted in the directory. + * + * ie. + * create dir/a : update dir's mtime/ctime + * rm dir/a: update dir's mtime/ctime + * modify dir/a: don't update dir's mtimme/ctime + * + * This can be handled immediately or defered. Defering helps reduce the number + * of updates when many files in a directory are changed within a brief period. + * + * If the directory updating is defered then yaffs_update_dirty_dirs must be + * called periodically. + */ + +static void yaffs_update_parent(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev; + if (!obj) + return; + dev = obj->my_dev; + obj->dirty = 1; + yaffs_load_current_time(obj, 0, 1); + if (dev->param.defered_dir_update) { + struct list_head *link = &obj->variant.dir_variant.dirty; + + if (list_empty(link)) { + list_add(link, &dev->dirty_dirs); + yaffs_trace(YAFFS_TRACE_BACKGROUND, + "Added object %d to dirty directories", + obj->obj_id); + } + + } else { + yaffs_update_oh(obj, NULL, 0, 0, 0, NULL); + } +} + +void yaffs_update_dirty_dirs(struct yaffs_dev *dev) +{ + struct list_head *link; + struct yaffs_obj *obj; + struct yaffs_dir_var *d_s; + union yaffs_obj_var *o_v; + + yaffs_trace(YAFFS_TRACE_BACKGROUND, "Update dirty directories"); + + while (!list_empty(&dev->dirty_dirs)) { + link = dev->dirty_dirs.next; + list_del_init(link); + + d_s = list_entry(link, struct yaffs_dir_var, dirty); + o_v = list_entry(d_s, union yaffs_obj_var, dir_variant); + obj = list_entry(o_v, struct yaffs_obj, variant); + + yaffs_trace(YAFFS_TRACE_BACKGROUND, "Update directory %d", + obj->obj_id); + + if (obj->dirty) + yaffs_update_oh(obj, NULL, 0, 0, 0, NULL); + } +} + +/* + * Mknod (create) a new object. + * equiv_obj only has meaning for a hard link; + * alias_str only has meaning for a symlink. + * rdev only has meaning for devices (a subset of special objects) + */ + +static struct yaffs_obj *yaffs_create_obj(enum yaffs_obj_type type, + struct yaffs_obj *parent, + const YCHAR * name, + u32 mode, + u32 uid, + u32 gid, + struct yaffs_obj *equiv_obj, + const YCHAR * alias_str, u32 rdev) +{ + struct yaffs_obj *in; + YCHAR *str = NULL; + + struct yaffs_dev *dev = parent->my_dev; + + /* Check if the entry exists. If it does then fail the call since we don't want a dup. 
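+	 * Duplicate names are only tolerated in the unlinked/deleted fake
+	 * directories (see yaffs_change_obj_name()); in an ordinary directory the
+	 * name must be unique, hence the yaffs_find_by_name() check below.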
*/ + if (yaffs_find_by_name(parent, name)) + return NULL; + + if (type == YAFFS_OBJECT_TYPE_SYMLINK) { + str = yaffs_clone_str(alias_str); + if (!str) + return NULL; + } + + in = yaffs_new_obj(dev, -1, type); + + if (!in) { + if (str) + kfree(str); + return NULL; + } + + if (in) { + in->hdr_chunk = 0; + in->valid = 1; + in->variant_type = type; + + in->yst_mode = mode; + + yaffs_attribs_init(in, gid, uid, rdev); + + in->n_data_chunks = 0; + + yaffs_set_obj_name(in, name); + in->dirty = 1; + + yaffs_add_obj_to_dir(parent, in); + + in->my_dev = parent->my_dev; + + switch (type) { + case YAFFS_OBJECT_TYPE_SYMLINK: + in->variant.symlink_variant.alias = str; + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + in->variant.hardlink_variant.equiv_obj = equiv_obj; + in->variant.hardlink_variant.equiv_id = + equiv_obj->obj_id; + list_add(&in->hard_links, &equiv_obj->hard_links); + break; + case YAFFS_OBJECT_TYPE_FILE: + case YAFFS_OBJECT_TYPE_DIRECTORY: + case YAFFS_OBJECT_TYPE_SPECIAL: + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* do nothing */ + break; + } + + if (yaffs_update_oh(in, name, 0, 0, 0, NULL) < 0) { + /* Could not create the object header, fail the creation */ + yaffs_del_obj(in); + in = NULL; + } + + yaffs_update_parent(parent); + } + + return in; +} + +struct yaffs_obj *yaffs_create_file(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_FILE, parent, name, mode, + uid, gid, NULL, NULL, 0); +} + +struct yaffs_obj *yaffs_create_dir(struct yaffs_obj *parent, const YCHAR * name, + u32 mode, u32 uid, u32 gid) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_DIRECTORY, parent, name, + mode, uid, gid, NULL, NULL, 0); +} + +struct yaffs_obj *yaffs_create_special(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, u32 rdev) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_SPECIAL, parent, name, mode, + uid, gid, NULL, NULL, rdev); +} + +struct yaffs_obj *yaffs_create_symlink(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, const YCHAR * alias) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_SYMLINK, parent, name, mode, + uid, gid, NULL, alias, 0); +} + +/* yaffs_link_obj returns the object id of the equivalent object.*/ +struct yaffs_obj *yaffs_link_obj(struct yaffs_obj *parent, const YCHAR * name, + struct yaffs_obj *equiv_obj) +{ + /* Get the real object in case we were fed a hard link as an equivalent object */ + equiv_obj = yaffs_get_equivalent_obj(equiv_obj); + + if (yaffs_create_obj + (YAFFS_OBJECT_TYPE_HARDLINK, parent, name, 0, 0, 0, + equiv_obj, NULL, 0)) { + return equiv_obj; + } else { + return NULL; + } + +} + + + +/*------------------------- Block Management and Page Allocation ----------------*/ + +static int yaffs_init_blocks(struct yaffs_dev *dev) +{ + int n_blocks = dev->internal_end_block - dev->internal_start_block + 1; + + dev->block_info = NULL; + dev->chunk_bits = NULL; + + dev->alloc_block = -1; /* force it to get a new one */ + + /* If the first allocation strategy fails, thry the alternate one */ + dev->block_info = + kmalloc(n_blocks * sizeof(struct yaffs_block_info), GFP_NOFS); + if (!dev->block_info) { + dev->block_info = + vmalloc(n_blocks * sizeof(struct yaffs_block_info)); + dev->block_info_alt = 1; + } else { + dev->block_info_alt = 0; + } + + if (dev->block_info) { + /* Set up dynamic blockinfo stuff. Round up bytes. 
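+		 * chunk_bit_stride is the number of bytes of chunk-in-use bitmap
+		 * kept per block, e.g. 64 chunks per block -> (64 + 7) / 8 = 8 bytes.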
*/ + dev->chunk_bit_stride = (dev->param.chunks_per_block + 7) / 8; + dev->chunk_bits = + kmalloc(dev->chunk_bit_stride * n_blocks, GFP_NOFS); + if (!dev->chunk_bits) { + dev->chunk_bits = + vmalloc(dev->chunk_bit_stride * n_blocks); + dev->chunk_bits_alt = 1; + } else { + dev->chunk_bits_alt = 0; + } + } + + if (dev->block_info && dev->chunk_bits) { + memset(dev->block_info, 0, + n_blocks * sizeof(struct yaffs_block_info)); + memset(dev->chunk_bits, 0, dev->chunk_bit_stride * n_blocks); + return YAFFS_OK; + } + + return YAFFS_FAIL; +} + +static void yaffs_deinit_blocks(struct yaffs_dev *dev) +{ + if (dev->block_info_alt && dev->block_info) + vfree(dev->block_info); + else if (dev->block_info) + kfree(dev->block_info); + + dev->block_info_alt = 0; + + dev->block_info = NULL; + + if (dev->chunk_bits_alt && dev->chunk_bits) + vfree(dev->chunk_bits); + else if (dev->chunk_bits) + kfree(dev->chunk_bits); + dev->chunk_bits_alt = 0; + dev->chunk_bits = NULL; +} + +void yaffs_block_became_dirty(struct yaffs_dev *dev, int block_no) +{ + struct yaffs_block_info *bi = yaffs_get_block_info(dev, block_no); + + int erased_ok = 0; + + /* If the block is still healthy erase it and mark as clean. + * If the block has had a data failure, then retire it. + */ + + yaffs_trace(YAFFS_TRACE_GC | YAFFS_TRACE_ERASE, + "yaffs_block_became_dirty block %d state %d %s", + block_no, bi->block_state, + (bi->needs_retiring) ? "needs retiring" : ""); + + yaffs2_clear_oldest_dirty_seq(dev, bi); + + bi->block_state = YAFFS_BLOCK_STATE_DIRTY; + + /* If this is the block being garbage collected then stop gc'ing this block */ + if (block_no == dev->gc_block) + dev->gc_block = 0; + + /* If this block is currently the best candidate for gc then drop as a candidate */ + if (block_no == dev->gc_dirtiest) { + dev->gc_dirtiest = 0; + dev->gc_pages_in_use = 0; + } + + if (!bi->needs_retiring) { + yaffs2_checkpt_invalidate(dev); + erased_ok = yaffs_erase_block(dev, block_no); + if (!erased_ok) { + dev->n_erase_failures++; + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>> Erasure failed %d", block_no); + } + } + + if (erased_ok && + ((yaffs_trace_mask & YAFFS_TRACE_ERASE) + || !yaffs_skip_verification(dev))) { + int i; + for (i = 0; i < dev->param.chunks_per_block; i++) { + if (!yaffs_check_chunk_erased + (dev, block_no * dev->param.chunks_per_block + i)) { + yaffs_trace(YAFFS_TRACE_ERROR, + ">>Block %d erasure supposedly OK, but chunk %d not erased", + block_no, i); + } + } + } + + if (erased_ok) { + /* Clean it up... 
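+		 * The block goes back to EMPTY, its counters are reset and
+		 * skip_erased_check is set because a freshly erased block does not
+		 * need to be re-verified before its first write.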
*/ + bi->block_state = YAFFS_BLOCK_STATE_EMPTY; + bi->seq_number = 0; + dev->n_erased_blocks++; + bi->pages_in_use = 0; + bi->soft_del_pages = 0; + bi->has_shrink_hdr = 0; + bi->skip_erased_check = 1; /* Clean, so no need to check */ + bi->gc_prioritise = 0; + yaffs_clear_chunk_bits(dev, block_no); + + yaffs_trace(YAFFS_TRACE_ERASE, + "Erased block %d", block_no); + } else { + /* We lost a block of free space */ + dev->n_free_chunks -= dev->param.chunks_per_block; + yaffs_retire_block(dev, block_no); + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>> Block %d retired", block_no); + } +} + + + +static int yaffs_gc_block(struct yaffs_dev *dev, int block, int whole_block) +{ + int old_chunk; + int new_chunk; + int mark_flash; + int ret_val = YAFFS_OK; + int i; + int is_checkpt_block; + int matching_chunk; + int max_copies; + + int chunks_before = yaffs_get_erased_chunks(dev); + int chunks_after; + + struct yaffs_ext_tags tags; + + struct yaffs_block_info *bi = yaffs_get_block_info(dev, block); + + struct yaffs_obj *object; + + is_checkpt_block = (bi->block_state == YAFFS_BLOCK_STATE_CHECKPOINT); + + yaffs_trace(YAFFS_TRACE_TRACING, + "Collecting block %d, in use %d, shrink %d, whole_block %d", + block, bi->pages_in_use, bi->has_shrink_hdr, + whole_block); + + /*yaffs_verify_free_chunks(dev); */ + + if (bi->block_state == YAFFS_BLOCK_STATE_FULL) + bi->block_state = YAFFS_BLOCK_STATE_COLLECTING; + + bi->has_shrink_hdr = 0; /* clear the flag so that the block can erase */ + + dev->gc_disable = 1; + + if (is_checkpt_block || !yaffs_still_some_chunks(dev, block)) { + yaffs_trace(YAFFS_TRACE_TRACING, + "Collecting block %d that has no chunks in use", + block); + yaffs_block_became_dirty(dev, block); + } else { + + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + + yaffs_verify_blk(dev, bi, block); + + max_copies = (whole_block) ? dev->param.chunks_per_block : 5; + old_chunk = block * dev->param.chunks_per_block + dev->gc_chunk; + + for ( /* init already done */ ; + ret_val == YAFFS_OK && + dev->gc_chunk < dev->param.chunks_per_block && + (bi->block_state == YAFFS_BLOCK_STATE_COLLECTING) && + max_copies > 0; dev->gc_chunk++, old_chunk++) { + if (yaffs_check_chunk_bit(dev, block, dev->gc_chunk)) { + + /* This page is in use and might need to be copied off */ + + max_copies--; + + mark_flash = 1; + + yaffs_init_tags(&tags); + + yaffs_rd_chunk_tags_nand(dev, old_chunk, + buffer, &tags); + + object = yaffs_find_by_number(dev, tags.obj_id); + + yaffs_trace(YAFFS_TRACE_GC_DETAIL, + "Collecting chunk in block %d, %d %d %d ", + dev->gc_chunk, tags.obj_id, + tags.chunk_id, tags.n_bytes); + + if (object && !yaffs_skip_verification(dev)) { + if (tags.chunk_id == 0) + matching_chunk = + object->hdr_chunk; + else if (object->soft_del) + matching_chunk = old_chunk; /* Defeat the test */ + else + matching_chunk = + yaffs_find_chunk_in_file + (object, tags.chunk_id, + NULL); + + if (old_chunk != matching_chunk) + yaffs_trace(YAFFS_TRACE_ERROR, + "gc: page in gc mismatch: %d %d %d %d", + old_chunk, + matching_chunk, + tags.obj_id, + tags.chunk_id); + + } + + if (!object) { + yaffs_trace(YAFFS_TRACE_ERROR, + "page %d in gc has no object: %d %d %d ", + old_chunk, + tags.obj_id, tags.chunk_id, + tags.n_bytes); + } + + if (object && + object->deleted && + object->soft_del && tags.chunk_id != 0) { + /* Data chunk in a soft deleted file, throw it away + * It's a soft deleted data chunk, + * No need to copy this, just forget about it and + * fix up the object. 
+ */ + + /* Free chunks already includes softdeleted chunks. + * How ever this chunk is going to soon be really deleted + * which will increment free chunks. + * We have to decrement free chunks so this works out properly. + */ + dev->n_free_chunks--; + bi->soft_del_pages--; + + object->n_data_chunks--; + + if (object->n_data_chunks <= 0) { + /* remeber to clean up the object */ + dev->gc_cleanup_list[dev-> + n_clean_ups] + = tags.obj_id; + dev->n_clean_ups++; + } + mark_flash = 0; + } else if (0) { + /* Todo object && object->deleted && object->n_data_chunks == 0 */ + /* Deleted object header with no data chunks. + * Can be discarded and the file deleted. + */ + object->hdr_chunk = 0; + yaffs_free_tnode(object->my_dev, + object-> + variant.file_variant. + top); + object->variant.file_variant.top = NULL; + yaffs_generic_obj_del(object); + + } else if (object) { + /* It's either a data chunk in a live file or + * an ObjectHeader, so we're interested in it. + * NB Need to keep the ObjectHeaders of deleted files + * until the whole file has been deleted off + */ + tags.serial_number++; + + dev->n_gc_copies++; + + if (tags.chunk_id == 0) { + /* It is an object Id, + * We need to nuke the shrinkheader flags first + * Also need to clean up shadowing. + * We no longer want the shrink_header flag since its work is done + * and if it is left in place it will mess up scanning. + */ + + struct yaffs_obj_hdr *oh; + oh = (struct yaffs_obj_hdr *) + buffer; + + oh->is_shrink = 0; + tags.extra_is_shrink = 0; + + oh->shadows_obj = 0; + oh->inband_shadowed_obj_id = 0; + tags.extra_shadows = 0; + + /* Update file size */ + if (object->variant_type == + YAFFS_OBJECT_TYPE_FILE) { + oh->file_size = + object->variant. + file_variant. + file_size; + tags.extra_length = + oh->file_size; + } + + yaffs_verify_oh(object, oh, + &tags, 1); + new_chunk = + yaffs_write_new_chunk(dev, + (u8 *) + oh, + &tags, + 1); + } else { + new_chunk = + yaffs_write_new_chunk(dev, + buffer, + &tags, + 1); + } + + if (new_chunk < 0) { + ret_val = YAFFS_FAIL; + } else { + + /* Ok, now fix up the Tnodes etc. */ + + if (tags.chunk_id == 0) { + /* It's a header */ + object->hdr_chunk = + new_chunk; + object->serial = + tags.serial_number; + } else { + /* It's a data chunk */ + int ok; + ok = yaffs_put_chunk_in_file(object, tags.chunk_id, new_chunk, 0); + } + } + } + + if (ret_val == YAFFS_OK) + yaffs_chunk_del(dev, old_chunk, + mark_flash, __LINE__); + + } + } + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + + } + + yaffs_verify_collected_blk(dev, bi, block); + + if (bi->block_state == YAFFS_BLOCK_STATE_COLLECTING) { + /* + * The gc did not complete. Set block state back to FULL + * because checkpointing does not restore gc. + */ + bi->block_state = YAFFS_BLOCK_STATE_FULL; + } else { + /* The gc completed. */ + /* Do any required cleanups */ + for (i = 0; i < dev->n_clean_ups; i++) { + /* Time to delete the file too */ + object = + yaffs_find_by_number(dev, dev->gc_cleanup_list[i]); + if (object) { + yaffs_free_tnode(dev, + object->variant. 
+ file_variant.top); + object->variant.file_variant.top = NULL; + yaffs_trace(YAFFS_TRACE_GC, + "yaffs: About to finally delete object %d", + object->obj_id); + yaffs_generic_obj_del(object); + object->my_dev->n_deleted_files--; + } + + } + + chunks_after = yaffs_get_erased_chunks(dev); + if (chunks_before >= chunks_after) + yaffs_trace(YAFFS_TRACE_GC, + "gc did not increase free chunks before %d after %d", + chunks_before, chunks_after); + dev->gc_block = 0; + dev->gc_chunk = 0; + dev->n_clean_ups = 0; + } + + dev->gc_disable = 0; + + return ret_val; +} + +/* + * FindBlockForgarbageCollection is used to select the dirtiest block (or close enough) + * for garbage collection. + */ + +static unsigned yaffs_find_gc_block(struct yaffs_dev *dev, + int aggressive, int background) +{ + int i; + int iterations; + unsigned selected = 0; + int prioritised = 0; + int prioritised_exist = 0; + struct yaffs_block_info *bi; + int threshold; + + /* First let's see if we need to grab a prioritised block */ + if (dev->has_pending_prioritised_gc && !aggressive) { + dev->gc_dirtiest = 0; + bi = dev->block_info; + for (i = dev->internal_start_block; + i <= dev->internal_end_block && !selected; i++) { + + if (bi->gc_prioritise) { + prioritised_exist = 1; + if (bi->block_state == YAFFS_BLOCK_STATE_FULL && + yaffs_block_ok_for_gc(dev, bi)) { + selected = i; + prioritised = 1; + } + } + bi++; + } + + /* + * If there is a prioritised block and none was selected then + * this happened because there is at least one old dirty block gumming + * up the works. Let's gc the oldest dirty block. + */ + + if (prioritised_exist && + !selected && dev->oldest_dirty_block > 0) + selected = dev->oldest_dirty_block; + + if (!prioritised_exist) /* None found, so we can clear this */ + dev->has_pending_prioritised_gc = 0; + } + + /* If we're doing aggressive GC then we are happy to take a less-dirty block, and + * search harder. + * else (we're doing a leasurely gc), then we only bother to do this if the + * block has only a few pages in use. + */ + + if (!selected) { + int pages_used; + int n_blocks = + dev->internal_end_block - dev->internal_start_block + 1; + if (aggressive) { + threshold = dev->param.chunks_per_block; + iterations = n_blocks; + } else { + int max_threshold; + + if (background) + max_threshold = dev->param.chunks_per_block / 2; + else + max_threshold = dev->param.chunks_per_block / 8; + + if (max_threshold < YAFFS_GC_PASSIVE_THRESHOLD) + max_threshold = YAFFS_GC_PASSIVE_THRESHOLD; + + threshold = background ? 
(dev->gc_not_done + 2) * 2 : 0; + if (threshold < YAFFS_GC_PASSIVE_THRESHOLD) + threshold = YAFFS_GC_PASSIVE_THRESHOLD; + if (threshold > max_threshold) + threshold = max_threshold; + + iterations = n_blocks / 16 + 1; + if (iterations > 100) + iterations = 100; + } + + for (i = 0; + i < iterations && + (dev->gc_dirtiest < 1 || + dev->gc_pages_in_use > YAFFS_GC_GOOD_ENOUGH); i++) { + dev->gc_block_finder++; + if (dev->gc_block_finder < dev->internal_start_block || + dev->gc_block_finder > dev->internal_end_block) + dev->gc_block_finder = + dev->internal_start_block; + + bi = yaffs_get_block_info(dev, dev->gc_block_finder); + + pages_used = bi->pages_in_use - bi->soft_del_pages; + + if (bi->block_state == YAFFS_BLOCK_STATE_FULL && + pages_used < dev->param.chunks_per_block && + (dev->gc_dirtiest < 1 + || pages_used < dev->gc_pages_in_use) + && yaffs_block_ok_for_gc(dev, bi)) { + dev->gc_dirtiest = dev->gc_block_finder; + dev->gc_pages_in_use = pages_used; + } + } + + if (dev->gc_dirtiest > 0 && dev->gc_pages_in_use <= threshold) + selected = dev->gc_dirtiest; + } + + /* + * If nothing has been selected for a while, try selecting the oldest dirty + * because that's gumming up the works. + */ + + if (!selected && dev->param.is_yaffs2 && + dev->gc_not_done >= (background ? 10 : 20)) { + yaffs2_find_oldest_dirty_seq(dev); + if (dev->oldest_dirty_block > 0) { + selected = dev->oldest_dirty_block; + dev->gc_dirtiest = selected; + dev->oldest_dirty_gc_count++; + bi = yaffs_get_block_info(dev, selected); + dev->gc_pages_in_use = + bi->pages_in_use - bi->soft_del_pages; + } else { + dev->gc_not_done = 0; + } + } + + if (selected) { + yaffs_trace(YAFFS_TRACE_GC, + "GC Selected block %d with %d free, prioritised:%d", + selected, + dev->param.chunks_per_block - dev->gc_pages_in_use, + prioritised); + + dev->n_gc_blocks++; + if (background) + dev->bg_gcs++; + + dev->gc_dirtiest = 0; + dev->gc_pages_in_use = 0; + dev->gc_not_done = 0; + if (dev->refresh_skip > 0) + dev->refresh_skip--; + } else { + dev->gc_not_done++; + yaffs_trace(YAFFS_TRACE_GC, + "GC none: finder %d skip %d threshold %d dirtiest %d using %d oldest %d%s", + dev->gc_block_finder, dev->gc_not_done, threshold, + dev->gc_dirtiest, dev->gc_pages_in_use, + dev->oldest_dirty_block, background ? " bg" : ""); + } + + return selected; +} + +/* New garbage collector + * If we're very low on erased blocks then we do aggressive garbage collection + * otherwise we do "leasurely" garbage collection. + * Aggressive gc looks further (whole array) and will accept less dirty blocks. + * Passive gc only inspects smaller areas and will only accept more dirty blocks. + * + * The idea is to help clear out space in a more spread-out manner. + * Dunno if it really does anything useful. + */ +static int yaffs_check_gc(struct yaffs_dev *dev, int background) +{ + int aggressive = 0; + int gc_ok = YAFFS_OK; + int max_tries = 0; + int min_erased; + int erased_chunks; + int checkpt_block_adjust; + + if (dev->param.gc_control && (dev->param.gc_control(dev) & 1) == 0) + return YAFFS_OK; + + if (dev->gc_disable) { + /* Bail out so we don't get recursive gc */ + return YAFFS_OK; + } + + /* This loop should pass the first time. + * We'll only see looping here if the collection does not increase space. 
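 * For example, with n_reserved_blocks = 5 and two blocks currently needed
 * for checkpoint data, min_erased works out to 5 + 2 + 1 = 8, so the pass
 * turns aggressive as soon as fewer than 8 erased blocks remain.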
+ */ + + do { + max_tries++; + + checkpt_block_adjust = yaffs_calc_checkpt_blocks_required(dev); + + min_erased = + dev->param.n_reserved_blocks + checkpt_block_adjust + 1; + erased_chunks = + dev->n_erased_blocks * dev->param.chunks_per_block; + + /* If we need a block soon then do aggressive gc. */ + if (dev->n_erased_blocks < min_erased) + aggressive = 1; + else { + if (!background + && erased_chunks > (dev->n_free_chunks / 4)) + break; + + if (dev->gc_skip > 20) + dev->gc_skip = 20; + if (erased_chunks < dev->n_free_chunks / 2 || + dev->gc_skip < 1 || background) + aggressive = 0; + else { + dev->gc_skip--; + break; + } + } + + dev->gc_skip = 5; + + /* If we don't already have a block being gc'd then see if we should start another */ + + if (dev->gc_block < 1 && !aggressive) { + dev->gc_block = yaffs2_find_refresh_block(dev); + dev->gc_chunk = 0; + dev->n_clean_ups = 0; + } + if (dev->gc_block < 1) { + dev->gc_block = + yaffs_find_gc_block(dev, aggressive, background); + dev->gc_chunk = 0; + dev->n_clean_ups = 0; + } + + if (dev->gc_block > 0) { + dev->all_gcs++; + if (!aggressive) + dev->passive_gc_count++; + + yaffs_trace(YAFFS_TRACE_GC, + "yaffs: GC n_erased_blocks %d aggressive %d", + dev->n_erased_blocks, aggressive); + + gc_ok = yaffs_gc_block(dev, dev->gc_block, aggressive); + } + + if (dev->n_erased_blocks < (dev->param.n_reserved_blocks) + && dev->gc_block > 0) { + yaffs_trace(YAFFS_TRACE_GC, + "yaffs: GC !!!no reclaim!!! n_erased_blocks %d after try %d block %d", + dev->n_erased_blocks, max_tries, + dev->gc_block); + } + } while ((dev->n_erased_blocks < dev->param.n_reserved_blocks) && + (dev->gc_block > 0) && (max_tries < 2)); + + return aggressive ? gc_ok : YAFFS_OK; +} + +/* + * yaffs_bg_gc() + * Garbage collects. Intended to be called from a background thread. + * Returns non-zero if at least half the free chunks are erased. 
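 * For example, on a device with 64 chunks per block, 10 erased blocks and
 * 1000 free chunks: 10 * 64 = 640 erased chunks, which is more than
 * 1000 / 2, so the call returns non-zero and the caller can back off.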
+ */ +int yaffs_bg_gc(struct yaffs_dev *dev, unsigned urgency) +{ + int erased_chunks = dev->n_erased_blocks * dev->param.chunks_per_block; + + yaffs_trace(YAFFS_TRACE_BACKGROUND, "Background gc %u", urgency); + + yaffs_check_gc(dev, 1); + return erased_chunks > dev->n_free_chunks / 2; +} + +/*-------------------- Data file manipulation -----------------*/ + +static int yaffs_rd_data_obj(struct yaffs_obj *in, int inode_chunk, u8 * buffer) +{ + int nand_chunk = yaffs_find_chunk_in_file(in, inode_chunk, NULL); + + if (nand_chunk >= 0) + return yaffs_rd_chunk_tags_nand(in->my_dev, nand_chunk, + buffer, NULL); + else { + yaffs_trace(YAFFS_TRACE_NANDACCESS, + "Chunk %d not found zero instead", + nand_chunk); + /* get sane (zero) data if you read a hole */ + memset(buffer, 0, in->my_dev->data_bytes_per_chunk); + return 0; + } + +} + +void yaffs_chunk_del(struct yaffs_dev *dev, int chunk_id, int mark_flash, + int lyn) +{ + int block; + int page; + struct yaffs_ext_tags tags; + struct yaffs_block_info *bi; + + if (chunk_id <= 0) + return; + + dev->n_deletions++; + block = chunk_id / dev->param.chunks_per_block; + page = chunk_id % dev->param.chunks_per_block; + + if (!yaffs_check_chunk_bit(dev, block, page)) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Deleting invalid chunk %d", chunk_id); + + bi = yaffs_get_block_info(dev, block); + + yaffs2_update_oldest_dirty_seq(dev, block, bi); + + yaffs_trace(YAFFS_TRACE_DELETION, + "line %d delete of chunk %d", + lyn, chunk_id); + + if (!dev->param.is_yaffs2 && mark_flash && + bi->block_state != YAFFS_BLOCK_STATE_COLLECTING) { + + yaffs_init_tags(&tags); + + tags.is_deleted = 1; + + yaffs_wr_chunk_tags_nand(dev, chunk_id, NULL, &tags); + yaffs_handle_chunk_update(dev, chunk_id, &tags); + } else { + dev->n_unmarked_deletions++; + } + + /* Pull out of the management area. + * If the whole block became dirty, this will kick off an erasure. + */ + if (bi->block_state == YAFFS_BLOCK_STATE_ALLOCATING || + bi->block_state == YAFFS_BLOCK_STATE_FULL || + bi->block_state == YAFFS_BLOCK_STATE_NEEDS_SCANNING || + bi->block_state == YAFFS_BLOCK_STATE_COLLECTING) { + dev->n_free_chunks++; + + yaffs_clear_chunk_bit(dev, block, page); + + bi->pages_in_use--; + + if (bi->pages_in_use == 0 && + !bi->has_shrink_hdr && + bi->block_state != YAFFS_BLOCK_STATE_ALLOCATING && + bi->block_state != YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + yaffs_block_became_dirty(dev, block); + } + + } + +} + +static int yaffs_wr_data_obj(struct yaffs_obj *in, int inode_chunk, + const u8 * buffer, int n_bytes, int use_reserve) +{ + /* Find old chunk Need to do this to get serial number + * Write new one and patch into tree. + * Invalidate old tags. + */ + + int prev_chunk_id; + struct yaffs_ext_tags prev_tags; + + int new_chunk_id; + struct yaffs_ext_tags new_tags; + + struct yaffs_dev *dev = in->my_dev; + + yaffs_check_gc(dev, 0); + + /* Get the previous chunk at this location in the file if it exists. + * If it does not exist then put a zero into the tree. This creates + * the tnode now, rather than later when it is harder to clean up. + */ + prev_chunk_id = yaffs_find_chunk_in_file(in, inode_chunk, &prev_tags); + if (prev_chunk_id < 1 && + !yaffs_put_chunk_in_file(in, inode_chunk, 0, 0)) + return 0; + + /* Set up new tags */ + yaffs_init_tags(&new_tags); + + new_tags.chunk_id = inode_chunk; + new_tags.obj_id = in->obj_id; + new_tags.serial_number = + (prev_chunk_id > 0) ? 
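/*
 * Illustrative sketch (not driver code): yaffs_chunk_del() above maps a flat
 * NAND chunk number onto a (block, page) pair with a plain divide/modulo.
 * The same arithmetic, as a standalone round trip:
 *
 *   struct chunk_addr { int block; int page; };
 *
 *   static struct chunk_addr chunk_to_addr(int chunk_id, int chunks_per_block)
 *   {
 *           struct chunk_addr a;
 *           a.block = chunk_id / chunks_per_block;
 *           a.page  = chunk_id % chunks_per_block;
 *           return a;
 *   }
 *
 *   static int addr_to_chunk(struct chunk_addr a, int chunks_per_block)
 *   {
 *           return a.block * chunks_per_block + a.page;
 *   }
 *
 * With 64 chunks per block, chunk 1000 is block 15, page 40.
 */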
prev_tags.serial_number + 1 : 1; + new_tags.n_bytes = n_bytes; + + if (n_bytes < 1 || n_bytes > dev->param.total_bytes_per_chunk) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Writing %d bytes to chunk!!!!!!!!!", + n_bytes); + YBUG(); + } + + new_chunk_id = + yaffs_write_new_chunk(dev, buffer, &new_tags, use_reserve); + + if (new_chunk_id > 0) { + yaffs_put_chunk_in_file(in, inode_chunk, new_chunk_id, 0); + + if (prev_chunk_id > 0) + yaffs_chunk_del(dev, prev_chunk_id, 1, __LINE__); + + yaffs_verify_file_sane(in); + } + return new_chunk_id; + +} + + + +static int yaffs_do_xattrib_mod(struct yaffs_obj *obj, int set, + const YCHAR * name, const void *value, int size, + int flags) +{ + struct yaffs_xattr_mod xmod; + + int result; + + xmod.set = set; + xmod.name = name; + xmod.data = value; + xmod.size = size; + xmod.flags = flags; + xmod.result = -ENOSPC; + + result = yaffs_update_oh(obj, NULL, 0, 0, 0, &xmod); + + if (result > 0) + return xmod.result; + else + return -ENOSPC; +} + +static int yaffs_apply_xattrib_mod(struct yaffs_obj *obj, char *buffer, + struct yaffs_xattr_mod *xmod) +{ + int retval = 0; + int x_offs = sizeof(struct yaffs_obj_hdr); + struct yaffs_dev *dev = obj->my_dev; + int x_size = dev->data_bytes_per_chunk - sizeof(struct yaffs_obj_hdr); + + char *x_buffer = buffer + x_offs; + + if (xmod->set) + retval = + nval_set(x_buffer, x_size, xmod->name, xmod->data, + xmod->size, xmod->flags); + else + retval = nval_del(x_buffer, x_size, xmod->name); + + obj->has_xattr = nval_hasvalues(x_buffer, x_size); + obj->xattr_known = 1; + + xmod->result = retval; + + return retval; +} + +static int yaffs_do_xattrib_fetch(struct yaffs_obj *obj, const YCHAR * name, + void *value, int size) +{ + char *buffer = NULL; + int result; + struct yaffs_ext_tags tags; + struct yaffs_dev *dev = obj->my_dev; + int x_offs = sizeof(struct yaffs_obj_hdr); + int x_size = dev->data_bytes_per_chunk - sizeof(struct yaffs_obj_hdr); + + char *x_buffer; + + int retval = 0; + + if (obj->hdr_chunk < 1) + return -ENODATA; + + /* If we know that the object has no xattribs then don't do all the + * reading and parsing. 
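 * The attributes themselves live in the header chunk, packed as name/value
 * pairs in the space left after the object header: x_offs is
 * sizeof(struct yaffs_obj_hdr) and x_size is data_bytes_per_chunk - x_offs,
 * exactly as computed above. Caching has_xattr/xattr_known therefore saves
 * a NAND read of the header chunk on every lookup against an attribute-free
 * object.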
+ */ + if (obj->xattr_known && !obj->has_xattr) { + if (name) + return -ENODATA; + else + return 0; + } + + buffer = (char *)yaffs_get_temp_buffer(dev, __LINE__); + if (!buffer) + return -ENOMEM; + + result = + yaffs_rd_chunk_tags_nand(dev, obj->hdr_chunk, (u8 *) buffer, &tags); + + if (result != YAFFS_OK) + retval = -ENOENT; + else { + x_buffer = buffer + x_offs; + + if (!obj->xattr_known) { + obj->has_xattr = nval_hasvalues(x_buffer, x_size); + obj->xattr_known = 1; + } + + if (name) + retval = nval_get(x_buffer, x_size, name, value, size); + else + retval = nval_list(x_buffer, x_size, value, size); + } + yaffs_release_temp_buffer(dev, (u8 *) buffer, __LINE__); + return retval; +} + +int yaffs_set_xattrib(struct yaffs_obj *obj, const YCHAR * name, + const void *value, int size, int flags) +{ + return yaffs_do_xattrib_mod(obj, 1, name, value, size, flags); +} + +int yaffs_remove_xattrib(struct yaffs_obj *obj, const YCHAR * name) +{ + return yaffs_do_xattrib_mod(obj, 0, name, NULL, 0, 0); +} + +int yaffs_get_xattrib(struct yaffs_obj *obj, const YCHAR * name, void *value, + int size) +{ + return yaffs_do_xattrib_fetch(obj, name, value, size); +} + +int yaffs_list_xattrib(struct yaffs_obj *obj, char *buffer, int size) +{ + return yaffs_do_xattrib_fetch(obj, NULL, buffer, size); +} + +static void yaffs_check_obj_details_loaded(struct yaffs_obj *in) +{ + u8 *chunk_data; + struct yaffs_obj_hdr *oh; + struct yaffs_dev *dev; + struct yaffs_ext_tags tags; + int result; + int alloc_failed = 0; + + if (!in) + return; + + dev = in->my_dev; + + if (in->lazy_loaded && in->hdr_chunk > 0) { + in->lazy_loaded = 0; + chunk_data = yaffs_get_temp_buffer(dev, __LINE__); + + result = + yaffs_rd_chunk_tags_nand(dev, in->hdr_chunk, chunk_data, + &tags); + oh = (struct yaffs_obj_hdr *)chunk_data; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + yaffs_set_obj_name_from_oh(in, oh); + + if (in->variant_type == YAFFS_OBJECT_TYPE_SYMLINK) { + in->variant.symlink_variant.alias = + yaffs_clone_str(oh->alias); + if (!in->variant.symlink_variant.alias) + alloc_failed = 1; /* Not returned to caller */ + } + + yaffs_release_temp_buffer(dev, chunk_data, __LINE__); + } +} + +static void yaffs_load_name_from_oh(struct yaffs_dev *dev, YCHAR * name, + const YCHAR * oh_name, int buff_size) +{ +#ifdef CONFIG_YAFFS_AUTO_UNICODE + if (dev->param.auto_unicode) { + if (*oh_name) { + /* It is an ASCII name, do an ASCII to + * unicode conversion */ + const char *ascii_oh_name = (const char *)oh_name; + int n = buff_size - 1; + while (n > 0 && *ascii_oh_name) { + *name = *ascii_oh_name; + name++; + ascii_oh_name++; + n--; + } + } else { + strncpy(name, oh_name + 1, buff_size - 1); + } + } else { +#else + { +#endif + strncpy(name, oh_name, buff_size - 1); + } +} + +static void yaffs_load_oh_from_name(struct yaffs_dev *dev, YCHAR * oh_name, + const YCHAR * name) +{ +#ifdef CONFIG_YAFFS_AUTO_UNICODE + + int is_ascii; + YCHAR *w; + + if (dev->param.auto_unicode) { + + is_ascii = 1; + w = name; + + /* Figure out if the name will fit in ascii character set */ + while (is_ascii && *w) { + if ((*w) & 0xff00) + is_ascii = 0; + w++; + } + + if (is_ascii) { + /* It is an ASCII name, so do a unicode to ascii conversion */ + char *ascii_oh_name = (char *)oh_name; + int n = YAFFS_MAX_NAME_LENGTH - 1; + while (n > 0 && *name) { + *ascii_oh_name = *name; + name++; + ascii_oh_name++; + n--; + } + } else { + /* It is a unicode name, so save starting at the second YCHAR */ + *oh_name = 0; + strncpy(oh_name + 1, name, + YAFFS_MAX_NAME_LENGTH 
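/*
 * Illustrative sketch (not driver code): the "does this name fit in ASCII"
 * test used just above for auto-unicode storage, as a standalone helper.
 * YCHAR is assumed to be a 16-bit type here; any character with bits set
 * above 0xff forces the name to be stored as unicode from oh_name[1],
 * otherwise it is narrowed to ASCII starting at byte zero.
 *
 *   static int name_fits_ascii(const unsigned short *name)
 *   {
 *           while (*name) {
 *                   if (*name & 0xff00)
 *                           return 0;
 *                   name++;
 *           }
 *           return 1;
 *   }
 */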
- 2); + } + } else { +#else + { +#endif + strncpy(oh_name, name, YAFFS_MAX_NAME_LENGTH - 1); + } + +} + +/* UpdateObjectHeader updates the header on NAND for an object. + * If name is not NULL, then that new name is used. + */ +int yaffs_update_oh(struct yaffs_obj *in, const YCHAR * name, int force, + int is_shrink, int shadows, struct yaffs_xattr_mod *xmod) +{ + + struct yaffs_block_info *bi; + + struct yaffs_dev *dev = in->my_dev; + + int prev_chunk_id; + int ret_val = 0; + int result = 0; + + int new_chunk_id; + struct yaffs_ext_tags new_tags; + struct yaffs_ext_tags old_tags; + const YCHAR *alias = NULL; + + u8 *buffer = NULL; + YCHAR old_name[YAFFS_MAX_NAME_LENGTH + 1]; + + struct yaffs_obj_hdr *oh = NULL; + + strcpy(old_name, _Y("silly old name")); + + if (!in->fake || in == dev->root_dir || + force || xmod) { + + yaffs_check_gc(dev, 0); + yaffs_check_obj_details_loaded(in); + + buffer = yaffs_get_temp_buffer(in->my_dev, __LINE__); + oh = (struct yaffs_obj_hdr *)buffer; + + prev_chunk_id = in->hdr_chunk; + + if (prev_chunk_id > 0) { + result = yaffs_rd_chunk_tags_nand(dev, prev_chunk_id, + buffer, &old_tags); + + yaffs_verify_oh(in, oh, &old_tags, 0); + + memcpy(old_name, oh->name, sizeof(oh->name)); + memset(buffer, 0xFF, sizeof(struct yaffs_obj_hdr)); + } else { + memset(buffer, 0xFF, dev->data_bytes_per_chunk); + } + + oh->type = in->variant_type; + oh->yst_mode = in->yst_mode; + oh->shadows_obj = oh->inband_shadowed_obj_id = shadows; + + yaffs_load_attribs_oh(oh, in); + + if (in->parent) + oh->parent_obj_id = in->parent->obj_id; + else + oh->parent_obj_id = 0; + + if (name && *name) { + memset(oh->name, 0, sizeof(oh->name)); + yaffs_load_oh_from_name(dev, oh->name, name); + } else if (prev_chunk_id > 0) { + memcpy(oh->name, old_name, sizeof(oh->name)); + } else { + memset(oh->name, 0, sizeof(oh->name)); + } + + oh->is_shrink = is_shrink; + + switch (in->variant_type) { + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* Should not happen */ + break; + case YAFFS_OBJECT_TYPE_FILE: + oh->file_size = + (oh->parent_obj_id == YAFFS_OBJECTID_DELETED + || oh->parent_obj_id == + YAFFS_OBJECTID_UNLINKED) ? 0 : in-> + variant.file_variant.file_size; + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + oh->equiv_id = in->variant.hardlink_variant.equiv_id; + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + alias = in->variant.symlink_variant.alias; + if (!alias) + alias = _Y("no alias"); + strncpy(oh->alias, alias, YAFFS_MAX_ALIAS_LENGTH); + oh->alias[YAFFS_MAX_ALIAS_LENGTH] = 0; + break; + } + + /* process any xattrib modifications */ + if (xmod) + yaffs_apply_xattrib_mod(in, (char *)buffer, xmod); + + /* Tags */ + yaffs_init_tags(&new_tags); + in->serial++; + new_tags.chunk_id = 0; + new_tags.obj_id = in->obj_id; + new_tags.serial_number = in->serial; + + /* Add extra info for file header */ + + new_tags.extra_available = 1; + new_tags.extra_parent_id = oh->parent_obj_id; + new_tags.extra_length = oh->file_size; + new_tags.extra_is_shrink = oh->is_shrink; + new_tags.extra_equiv_id = oh->equiv_id; + new_tags.extra_shadows = (oh->shadows_obj > 0) ? 1 : 0; + new_tags.extra_obj_type = in->variant_type; + + yaffs_verify_oh(in, oh, &new_tags, 1); + + /* Create new chunk in NAND */ + new_chunk_id = + yaffs_write_new_chunk(dev, buffer, &new_tags, + (prev_chunk_id > 0) ? 
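/*
 * Illustrative sketch (not driver code) of the rewrite sequence this call
 * belongs to. The NAND reserve is only dipped into when an old header
 * already exists, because deleting that old chunk straight afterwards hands
 * a chunk back:
 *
 *   use_reserve = (prev_chunk_id > 0);
 *   new_chunk = write the rebuilt header plus its tags to a fresh chunk;
 *   if (new_chunk >= 0) {
 *           record new_chunk as the object's header chunk;
 *           if (prev_chunk_id > 0)
 *                   delete the old header chunk;
 *   }
 */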
1 : 0); + + if (new_chunk_id >= 0) { + + in->hdr_chunk = new_chunk_id; + + if (prev_chunk_id > 0) { + yaffs_chunk_del(dev, prev_chunk_id, 1, + __LINE__); + } + + if (!yaffs_obj_cache_dirty(in)) + in->dirty = 0; + + /* If this was a shrink, then mark the block that the chunk lives on */ + if (is_shrink) { + bi = yaffs_get_block_info(in->my_dev, + new_chunk_id / + in->my_dev->param. + chunks_per_block); + bi->has_shrink_hdr = 1; + } + + } + + ret_val = new_chunk_id; + + } + + if (buffer) + yaffs_release_temp_buffer(dev, buffer, __LINE__); + + return ret_val; +} + +/*--------------------- File read/write ------------------------ + * Read and write have very similar structures. + * In general the read/write has three parts to it + * An incomplete chunk to start with (if the read/write is not chunk-aligned) + * Some complete chunks + * An incomplete chunk to end off with + * + * Curve-balls: the first chunk might also be the last chunk. + */ + +int yaffs_file_rd(struct yaffs_obj *in, u8 * buffer, loff_t offset, int n_bytes) +{ + + int chunk; + u32 start; + int n_copy; + int n = n_bytes; + int n_done = 0; + struct yaffs_cache *cache; + + struct yaffs_dev *dev; + + dev = in->my_dev; + + while (n > 0) { + /* chunk = offset / dev->data_bytes_per_chunk + 1; */ + /* start = offset % dev->data_bytes_per_chunk; */ + yaffs_addr_to_chunk(dev, offset, &chunk, &start); + chunk++; + + /* OK now check for the curveball where the start and end are in + * the same chunk. + */ + if ((start + n) < dev->data_bytes_per_chunk) + n_copy = n; + else + n_copy = dev->data_bytes_per_chunk - start; + + cache = yaffs_find_chunk_cache(in, chunk); + + /* If the chunk is already in the cache or it is less than a whole chunk + * or we're using inband tags then use the cache (if there is caching) + * else bypass the cache. + */ + if (cache || n_copy != dev->data_bytes_per_chunk + || dev->param.inband_tags) { + if (dev->param.n_caches > 0) { + + /* If we can't find the data in the cache, then load it up. */ + + if (!cache) { + cache = + yaffs_grab_chunk_cache(in->my_dev); + cache->object = in; + cache->chunk_id = chunk; + cache->dirty = 0; + cache->locked = 0; + yaffs_rd_data_obj(in, chunk, + cache->data); + cache->n_bytes = 0; + } + + yaffs_use_cache(dev, cache, 0); + + cache->locked = 1; + + memcpy(buffer, &cache->data[start], n_copy); + + cache->locked = 0; + } else { + /* Read into the local buffer then copy.. */ + + u8 *local_buffer = + yaffs_get_temp_buffer(dev, __LINE__); + yaffs_rd_data_obj(in, chunk, local_buffer); + + memcpy(buffer, &local_buffer[start], n_copy); + + yaffs_release_temp_buffer(dev, local_buffer, + __LINE__); + } + + } else { + + /* A full chunk. Read directly into the supplied buffer. 
*/ + yaffs_rd_data_obj(in, chunk, buffer); + + } + + n -= n_copy; + offset += n_copy; + buffer += n_copy; + n_done += n_copy; + + } + + return n_done; +} + +int yaffs_do_file_wr(struct yaffs_obj *in, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough) +{ + + int chunk; + u32 start; + int n_copy; + int n = n_bytes; + int n_done = 0; + int n_writeback; + int start_write = offset; + int chunk_written = 0; + u32 n_bytes_read; + u32 chunk_start; + + struct yaffs_dev *dev; + + dev = in->my_dev; + + while (n > 0 && chunk_written >= 0) { + yaffs_addr_to_chunk(dev, offset, &chunk, &start); + + if (chunk * dev->data_bytes_per_chunk + start != offset || + start >= dev->data_bytes_per_chunk) { + yaffs_trace(YAFFS_TRACE_ERROR, + "AddrToChunk of offset %d gives chunk %d start %d", + (int)offset, chunk, start); + } + chunk++; /* File pos to chunk in file offset */ + + /* OK now check for the curveball where the start and end are in + * the same chunk. + */ + + if ((start + n) < dev->data_bytes_per_chunk) { + n_copy = n; + + /* Now folks, to calculate how many bytes to write back.... + * If we're overwriting and not writing to then end of file then + * we need to write back as much as was there before. + */ + + chunk_start = ((chunk - 1) * dev->data_bytes_per_chunk); + + if (chunk_start > in->variant.file_variant.file_size) + n_bytes_read = 0; /* Past end of file */ + else + n_bytes_read = + in->variant.file_variant.file_size - + chunk_start; + + if (n_bytes_read > dev->data_bytes_per_chunk) + n_bytes_read = dev->data_bytes_per_chunk; + + n_writeback = + (n_bytes_read > + (start + n)) ? n_bytes_read : (start + n); + + if (n_writeback < 0 + || n_writeback > dev->data_bytes_per_chunk) + YBUG(); + + } else { + n_copy = dev->data_bytes_per_chunk - start; + n_writeback = dev->data_bytes_per_chunk; + } + + if (n_copy != dev->data_bytes_per_chunk + || dev->param.inband_tags) { + /* An incomplete start or end chunk (or maybe both start and end chunk), + * or we're using inband tags, so we want to use the cache buffers. + */ + if (dev->param.n_caches > 0) { + struct yaffs_cache *cache; + /* If we can't find the data in the cache, then load the cache */ + cache = yaffs_find_chunk_cache(in, chunk); + + if (!cache + && yaffs_check_alloc_available(dev, 1)) { + cache = yaffs_grab_chunk_cache(dev); + cache->object = in; + cache->chunk_id = chunk; + cache->dirty = 0; + cache->locked = 0; + yaffs_rd_data_obj(in, chunk, + cache->data); + } else if (cache && + !cache->dirty && + !yaffs_check_alloc_available(dev, + 1)) { + /* Drop the cache if it was a read cache item and + * no space check has been made for it. + */ + cache = NULL; + } + + if (cache) { + yaffs_use_cache(dev, cache, 1); + cache->locked = 1; + + memcpy(&cache->data[start], buffer, + n_copy); + + cache->locked = 0; + cache->n_bytes = n_writeback; + + if (write_trhrough) { + chunk_written = + yaffs_wr_data_obj + (cache->object, + cache->chunk_id, + cache->data, + cache->n_bytes, 1); + cache->dirty = 0; + } + + } else { + chunk_written = -1; /* fail the write */ + } + } else { + /* An incomplete start or end chunk (or maybe both start and end chunk) + * Read into the local buffer then copy, then copy over and write back. 
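 * For example, with 2048-byte chunks, overwriting 50 bytes at offset 100
 * within a chunk that already holds 500 bytes of file data gives
 * n_bytes_read = 500 and start + n = 150, so the write-back size worked out
 * earlier (n_writeback) is 500 and the untouched tail of the old data is
 * written back along with the new bytes.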
+ */ + + u8 *local_buffer = + yaffs_get_temp_buffer(dev, __LINE__); + + yaffs_rd_data_obj(in, chunk, local_buffer); + + memcpy(&local_buffer[start], buffer, n_copy); + + chunk_written = + yaffs_wr_data_obj(in, chunk, + local_buffer, + n_writeback, 0); + + yaffs_release_temp_buffer(dev, local_buffer, + __LINE__); + + } + + } else { + /* A full chunk. Write directly from the supplied buffer. */ + + chunk_written = + yaffs_wr_data_obj(in, chunk, buffer, + dev->data_bytes_per_chunk, 0); + + /* Since we've overwritten the cached data, we better invalidate it. */ + yaffs_invalidate_chunk_cache(in, chunk); + } + + if (chunk_written >= 0) { + n -= n_copy; + offset += n_copy; + buffer += n_copy; + n_done += n_copy; + } + + } + + /* Update file object */ + + if ((start_write + n_done) > in->variant.file_variant.file_size) + in->variant.file_variant.file_size = (start_write + n_done); + + in->dirty = 1; + + return n_done; +} + +int yaffs_wr_file(struct yaffs_obj *in, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough) +{ + yaffs2_handle_hole(in, offset); + return yaffs_do_file_wr(in, buffer, offset, n_bytes, write_trhrough); +} + +/* ---------------------- File resizing stuff ------------------ */ + +static void yaffs_prune_chunks(struct yaffs_obj *in, int new_size) +{ + + struct yaffs_dev *dev = in->my_dev; + int old_size = in->variant.file_variant.file_size; + + int last_del = 1 + (old_size - 1) / dev->data_bytes_per_chunk; + + int start_del = 1 + (new_size + dev->data_bytes_per_chunk - 1) / + dev->data_bytes_per_chunk; + int i; + int chunk_id; + + /* Delete backwards so that we don't end up with holes if + * power is lost part-way through the operation. + */ + for (i = last_del; i >= start_del; i--) { + /* NB this could be optimised somewhat, + * eg. 
could retrieve the tags and write them without + * using yaffs_chunk_del + */ + + chunk_id = yaffs_find_del_file_chunk(in, i, NULL); + if (chunk_id > 0) { + if (chunk_id < + (dev->internal_start_block * + dev->param.chunks_per_block) + || chunk_id >= + ((dev->internal_end_block + + 1) * dev->param.chunks_per_block)) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Found daft chunk_id %d for %d", + chunk_id, i); + } else { + in->n_data_chunks--; + yaffs_chunk_del(dev, chunk_id, 1, __LINE__); + } + } + } + +} + +void yaffs_resize_file_down(struct yaffs_obj *obj, loff_t new_size) +{ + int new_full; + u32 new_partial; + struct yaffs_dev *dev = obj->my_dev; + + yaffs_addr_to_chunk(dev, new_size, &new_full, &new_partial); + + yaffs_prune_chunks(obj, new_size); + + if (new_partial != 0) { + int last_chunk = 1 + new_full; + u8 *local_buffer = yaffs_get_temp_buffer(dev, __LINE__); + + /* Rewrite the last chunk with its new size and zero pad */ + yaffs_rd_data_obj(obj, last_chunk, local_buffer); + memset(local_buffer + new_partial, 0, + dev->data_bytes_per_chunk - new_partial); + + yaffs_wr_data_obj(obj, last_chunk, local_buffer, + new_partial, 1); + + yaffs_release_temp_buffer(dev, local_buffer, __LINE__); + } + + obj->variant.file_variant.file_size = new_size; + + yaffs_prune_tree(dev, &obj->variant.file_variant); +} + +int yaffs_resize_file(struct yaffs_obj *in, loff_t new_size) +{ + struct yaffs_dev *dev = in->my_dev; + int old_size = in->variant.file_variant.file_size; + + yaffs_flush_file_cache(in); + yaffs_invalidate_whole_cache(in); + + yaffs_check_gc(dev, 0); + + if (in->variant_type != YAFFS_OBJECT_TYPE_FILE) + return YAFFS_FAIL; + + if (new_size == old_size) + return YAFFS_OK; + + if (new_size > old_size) { + yaffs2_handle_hole(in, new_size); + in->variant.file_variant.file_size = new_size; + } else { + /* new_size < old_size */ + yaffs_resize_file_down(in, new_size); + } + + /* Write a new object header to reflect the resize. + * show we've shrunk the file, if need be + * Do this only if the file is not in the deleted directories + * and is not shadowed. + */ + if (in->parent && + !in->is_shadowed && + in->parent->obj_id != YAFFS_OBJECTID_UNLINKED && + in->parent->obj_id != YAFFS_OBJECTID_DELETED) + yaffs_update_oh(in, NULL, 0, 0, 0, NULL); + + return YAFFS_OK; +} + +int yaffs_flush_file(struct yaffs_obj *in, int update_time, int data_sync) +{ + int ret_val; + if (in->dirty) { + yaffs_flush_file_cache(in); + if (data_sync) /* Only sync data */ + ret_val = YAFFS_OK; + else { + if (update_time) + yaffs_load_current_time(in, 0, 0); + + ret_val = (yaffs_update_oh(in, NULL, 0, 0, 0, NULL) >= + 0) ? YAFFS_OK : YAFFS_FAIL; + } + } else { + ret_val = YAFFS_OK; + } + + return ret_val; + +} + + +/* yaffs_del_file deletes the whole file data + * and the inode associated with the file. + * It does not delete the links associated with the file. 
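 * A rough sketch of the decision made below (not driver code):
 *
 *   if the file has no data chunks left   -> free its tnode tree and delete
 *                                            the object immediately
 *   else if it is not yet unlinked        -> rename it into the deleted or
 *                                            unlinked pseudo-directory first
 *   else                                  -> mark it deleted and soft-delete
 *                                            its chunks, leaving gc to
 *                                            reclaim the space lazily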
+ */ +static int yaffs_unlink_file_if_needed(struct yaffs_obj *in) +{ + + int ret_val; + int del_now = 0; + struct yaffs_dev *dev = in->my_dev; + + if (!in->my_inode) + del_now = 1; + + if (del_now) { + ret_val = + yaffs_change_obj_name(in, in->my_dev->del_dir, + _Y("deleted"), 0, 0); + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: immediate deletion of file %d", + in->obj_id); + in->deleted = 1; + in->my_dev->n_deleted_files++; + if (dev->param.disable_soft_del || dev->param.is_yaffs2) + yaffs_resize_file(in, 0); + yaffs_soft_del_file(in); + } else { + ret_val = + yaffs_change_obj_name(in, in->my_dev->unlinked_dir, + _Y("unlinked"), 0, 0); + } + + return ret_val; +} + +int yaffs_del_file(struct yaffs_obj *in) +{ + int ret_val = YAFFS_OK; + int deleted; /* Need to cache value on stack if in is freed */ + struct yaffs_dev *dev = in->my_dev; + + if (dev->param.disable_soft_del || dev->param.is_yaffs2) + yaffs_resize_file(in, 0); + + if (in->n_data_chunks > 0) { + /* Use soft deletion if there is data in the file. + * That won't be the case if it has been resized to zero. + */ + if (!in->unlinked) + ret_val = yaffs_unlink_file_if_needed(in); + + deleted = in->deleted; + + if (ret_val == YAFFS_OK && in->unlinked && !in->deleted) { + in->deleted = 1; + deleted = 1; + in->my_dev->n_deleted_files++; + yaffs_soft_del_file(in); + } + return deleted ? YAFFS_OK : YAFFS_FAIL; + } else { + /* The file has no data chunks so we toss it immediately */ + yaffs_free_tnode(in->my_dev, in->variant.file_variant.top); + in->variant.file_variant.top = NULL; + yaffs_generic_obj_del(in); + + return YAFFS_OK; + } +} + +int yaffs_is_non_empty_dir(struct yaffs_obj *obj) +{ + return (obj && + obj->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY) && + !(list_empty(&obj->variant.dir_variant.children)); +} + +static int yaffs_del_dir(struct yaffs_obj *obj) +{ + /* First check that the directory is empty. */ + if (yaffs_is_non_empty_dir(obj)) + return YAFFS_FAIL; + + return yaffs_generic_obj_del(obj); +} + +static int yaffs_del_symlink(struct yaffs_obj *in) +{ + if (in->variant.symlink_variant.alias) + kfree(in->variant.symlink_variant.alias); + in->variant.symlink_variant.alias = NULL; + + return yaffs_generic_obj_del(in); +} + +static int yaffs_del_link(struct yaffs_obj *in) +{ + /* remove this hardlink from the list assocaited with the equivalent + * object + */ + list_del_init(&in->hard_links); + return yaffs_generic_obj_del(in); +} + +int yaffs_del_obj(struct yaffs_obj *obj) +{ + int ret_val = -1; + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + ret_val = yaffs_del_file(obj); + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + if (!list_empty(&obj->variant.dir_variant.dirty)) { + yaffs_trace(YAFFS_TRACE_BACKGROUND, + "Remove object %d from dirty directories", + obj->obj_id); + list_del_init(&obj->variant.dir_variant.dirty); + } + return yaffs_del_dir(obj); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + ret_val = yaffs_del_symlink(obj); + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + ret_val = yaffs_del_link(obj); + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + ret_val = yaffs_generic_obj_del(obj); + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + ret_val = 0; + break; /* should not happen. 
*/ + } + + return ret_val; +} + +static int yaffs_unlink_worker(struct yaffs_obj *obj) +{ + + int del_now = 0; + + if (!obj->my_inode) + del_now = 1; + + if (obj) + yaffs_update_parent(obj->parent); + + if (obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) { + return yaffs_del_link(obj); + } else if (!list_empty(&obj->hard_links)) { + /* Curve ball: We're unlinking an object that has a hardlink. + * + * This problem arises because we are not strictly following + * The Linux link/inode model. + * + * We can't really delete the object. + * Instead, we do the following: + * - Select a hardlink. + * - Unhook it from the hard links + * - Move it from its parent directory (so that the rename can work) + * - Rename the object to the hardlink's name. + * - Delete the hardlink + */ + + struct yaffs_obj *hl; + struct yaffs_obj *parent; + int ret_val; + YCHAR name[YAFFS_MAX_NAME_LENGTH + 1]; + + hl = list_entry(obj->hard_links.next, struct yaffs_obj, + hard_links); + + yaffs_get_obj_name(hl, name, YAFFS_MAX_NAME_LENGTH + 1); + parent = hl->parent; + + list_del_init(&hl->hard_links); + + yaffs_add_obj_to_dir(obj->my_dev->unlinked_dir, hl); + + ret_val = yaffs_change_obj_name(obj, parent, name, 0, 0); + + if (ret_val == YAFFS_OK) + ret_val = yaffs_generic_obj_del(hl); + + return ret_val; + + } else if (del_now) { + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + return yaffs_del_file(obj); + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + list_del_init(&obj->variant.dir_variant.dirty); + return yaffs_del_dir(obj); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + return yaffs_del_symlink(obj); + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + return yaffs_generic_obj_del(obj); + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + case YAFFS_OBJECT_TYPE_UNKNOWN: + default: + return YAFFS_FAIL; + } + } else if (yaffs_is_non_empty_dir(obj)) { + return YAFFS_FAIL; + } else { + return yaffs_change_obj_name(obj, obj->my_dev->unlinked_dir, + _Y("unlinked"), 0, 0); + } +} + +static int yaffs_unlink_obj(struct yaffs_obj *obj) +{ + + if (obj && obj->unlink_allowed) + return yaffs_unlink_worker(obj); + + return YAFFS_FAIL; + +} + +int yaffs_unlinker(struct yaffs_obj *dir, const YCHAR * name) +{ + struct yaffs_obj *obj; + + obj = yaffs_find_by_name(dir, name); + return yaffs_unlink_obj(obj); +} + +/* Note: + * If old_name is NULL then we take old_dir as the object to be renamed. + */ +int yaffs_rename_obj(struct yaffs_obj *old_dir, const YCHAR * old_name, + struct yaffs_obj *new_dir, const YCHAR * new_name) +{ + struct yaffs_obj *obj = NULL; + struct yaffs_obj *existing_target = NULL; + int force = 0; + int result; + struct yaffs_dev *dev; + + if (!old_dir || old_dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) + YBUG(); + if (!new_dir || new_dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) + YBUG(); + + dev = old_dir->my_dev; + +#ifdef CONFIG_YAFFS_CASE_INSENSITIVE + /* Special case for case insemsitive systems. + * While look-up is case insensitive, the name isn't. 
+ * Therefore we might want to change x.txt to X.txt + */ + if (old_dir == new_dir && + old_name && new_name && + strcmp(old_name, new_name) == 0) + force = 1; +#endif + + if (strnlen(new_name, YAFFS_MAX_NAME_LENGTH + 1) > + YAFFS_MAX_NAME_LENGTH) + /* ENAMETOOLONG */ + return YAFFS_FAIL; + + if(old_name) + obj = yaffs_find_by_name(old_dir, old_name); + else{ + obj = old_dir; + old_dir = obj->parent; + } + + + if (obj && obj->rename_allowed) { + + /* Now do the handling for an existing target, if there is one */ + + existing_target = yaffs_find_by_name(new_dir, new_name); + if (yaffs_is_non_empty_dir(existing_target)){ + return YAFFS_FAIL; /* ENOTEMPTY */ + } else if (existing_target && existing_target != obj) { + /* Nuke the target first, using shadowing, + * but only if it isn't the same object. + * + * Note we must disable gc otherwise it can mess up the shadowing. + * + */ + dev->gc_disable = 1; + yaffs_change_obj_name(obj, new_dir, new_name, force, + existing_target->obj_id); + existing_target->is_shadowed = 1; + yaffs_unlink_obj(existing_target); + dev->gc_disable = 0; + } + + result = yaffs_change_obj_name(obj, new_dir, new_name, 1, 0); + + yaffs_update_parent(old_dir); + if (new_dir != old_dir) + yaffs_update_parent(new_dir); + + return result; + } + return YAFFS_FAIL; +} + +/*----------------------- Initialisation Scanning ---------------------- */ + +void yaffs_handle_shadowed_obj(struct yaffs_dev *dev, int obj_id, + int backward_scanning) +{ + struct yaffs_obj *obj; + + if (!backward_scanning) { + /* Handle YAFFS1 forward scanning case + * For YAFFS1 we always do the deletion + */ + + } else { + /* Handle YAFFS2 case (backward scanning) + * If the shadowed object exists then ignore. + */ + obj = yaffs_find_by_number(dev, obj_id); + if (obj) + return; + } + + /* Let's create it (if it does not exist) assuming it is a file so that it can do shrinking etc. + * We put it in unlinked dir to be cleaned up after the scanning + */ + obj = + yaffs_find_or_create_by_number(dev, obj_id, YAFFS_OBJECT_TYPE_FILE); + if (!obj) + return; + obj->is_shadowed = 1; + yaffs_add_obj_to_dir(dev->unlinked_dir, obj); + obj->variant.file_variant.shrink_size = 0; + obj->valid = 1; /* So that we don't read any other info for this file */ + +} + +void yaffs_link_fixup(struct yaffs_dev *dev, struct yaffs_obj *hard_list) +{ + struct yaffs_obj *hl; + struct yaffs_obj *in; + + while (hard_list) { + hl = hard_list; + hard_list = (struct yaffs_obj *)(hard_list->hard_links.next); + + in = yaffs_find_by_number(dev, + hl->variant. + hardlink_variant.equiv_id); + + if (in) { + /* Add the hardlink pointers */ + hl->variant.hardlink_variant.equiv_obj = in; + list_add(&hl->hard_links, &in->hard_links); + } else { + /* Todo Need to report/handle this better. + * Got a problem... hardlink to a non-existant object + */ + hl->variant.hardlink_variant.equiv_obj = NULL; + INIT_LIST_HEAD(&hl->hard_links); + + } + } +} + +static void yaffs_strip_deleted_objs(struct yaffs_dev *dev) +{ + /* + * Sort out state of unlinked and deleted objects after scanning. 
+ */ + struct list_head *i; + struct list_head *n; + struct yaffs_obj *l; + + if (dev->read_only) + return; + + /* Soft delete all the unlinked files */ + list_for_each_safe(i, n, + &dev->unlinked_dir->variant.dir_variant.children) { + if (i) { + l = list_entry(i, struct yaffs_obj, siblings); + yaffs_del_obj(l); + } + } + + list_for_each_safe(i, n, &dev->del_dir->variant.dir_variant.children) { + if (i) { + l = list_entry(i, struct yaffs_obj, siblings); + yaffs_del_obj(l); + } + } + +} + +/* + * This code iterates through all the objects making sure that they are rooted. + * Any unrooted objects are re-rooted in lost+found. + * An object needs to be in one of: + * - Directly under deleted, unlinked + * - Directly or indirectly under root. + * + * Note: + * This code assumes that we don't ever change the current relationships between + * directories: + * root_dir->parent == unlinked_dir->parent == del_dir->parent == NULL + * lost-n-found->parent == root_dir + * + * This fixes the problem where directories might have inadvertently been deleted + * leaving the object "hanging" without being rooted in the directory tree. + */ + +static int yaffs_has_null_parent(struct yaffs_dev *dev, struct yaffs_obj *obj) +{ + return (obj == dev->del_dir || + obj == dev->unlinked_dir || obj == dev->root_dir); +} + +static void yaffs_fix_hanging_objs(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + struct yaffs_obj *parent; + int i; + struct list_head *lh; + struct list_head *n; + int depth_limit; + int hanging; + + if (dev->read_only) + return; + + /* Iterate through the objects in each hash entry, + * looking at each object. + * Make sure it is rooted. + */ + + for (i = 0; i < YAFFS_NOBJECT_BUCKETS; i++) { + list_for_each_safe(lh, n, &dev->obj_bucket[i].list) { + if (lh) { + obj = + list_entry(lh, struct yaffs_obj, hash_link); + parent = obj->parent; + + if (yaffs_has_null_parent(dev, obj)) { + /* These directories are not hanging */ + hanging = 0; + } else if (!parent + || parent->variant_type != + YAFFS_OBJECT_TYPE_DIRECTORY) { + hanging = 1; + } else if (yaffs_has_null_parent(dev, parent)) { + hanging = 0; + } else { + /* + * Need to follow the parent chain to see if it is hanging. + */ + hanging = 0; + depth_limit = 100; + + while (parent != dev->root_dir && + parent->parent && + parent->parent->variant_type == + YAFFS_OBJECT_TYPE_DIRECTORY + && depth_limit > 0) { + parent = parent->parent; + depth_limit--; + } + if (parent != dev->root_dir) + hanging = 1; + } + if (hanging) { + yaffs_trace(YAFFS_TRACE_SCAN, + "Hanging object %d moved to lost and found", + obj->obj_id); + yaffs_add_obj_to_dir(dev->lost_n_found, + obj); + } + } + } + } +} + +/* + * Delete directory contents for cleaning up lost and found. + */ +static void yaffs_del_dir_contents(struct yaffs_obj *dir) +{ + struct yaffs_obj *obj; + struct list_head *lh; + struct list_head *n; + + if (dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) + YBUG(); + + list_for_each_safe(lh, n, &dir->variant.dir_variant.children) { + if (lh) { + obj = list_entry(lh, struct yaffs_obj, siblings); + if (obj->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY) + yaffs_del_dir_contents(obj); + + yaffs_trace(YAFFS_TRACE_SCAN, + "Deleting lost_found object %d", + obj->obj_id); + + /* Need to use UnlinkObject since Delete would not handle + * hardlinked objects correctly. 
+ */ + yaffs_unlink_obj(obj); + } + } + +} + +static void yaffs_empty_l_n_f(struct yaffs_dev *dev) +{ + yaffs_del_dir_contents(dev->lost_n_found); +} + + +struct yaffs_obj *yaffs_find_by_name(struct yaffs_obj *directory, + const YCHAR * name) +{ + int sum; + + struct list_head *i; + YCHAR buffer[YAFFS_MAX_NAME_LENGTH + 1]; + + struct yaffs_obj *l; + + if (!name) + return NULL; + + if (!directory) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: yaffs_find_by_name: null pointer directory" + ); + YBUG(); + return NULL; + } + if (directory->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: yaffs_find_by_name: non-directory" + ); + YBUG(); + } + + sum = yaffs_calc_name_sum(name); + + list_for_each(i, &directory->variant.dir_variant.children) { + if (i) { + l = list_entry(i, struct yaffs_obj, siblings); + + if (l->parent != directory) + YBUG(); + + yaffs_check_obj_details_loaded(l); + + /* Special case for lost-n-found */ + if (l->obj_id == YAFFS_OBJECTID_LOSTNFOUND) { + if (!strcmp(name, YAFFS_LOSTNFOUND_NAME)) + return l; + } else if (l->sum == sum + || l->hdr_chunk <= 0) { + /* LostnFound chunk called Objxxx + * Do a real check + */ + yaffs_get_obj_name(l, buffer, + YAFFS_MAX_NAME_LENGTH + 1); + if (strncmp + (name, buffer, YAFFS_MAX_NAME_LENGTH) == 0) + return l; + } + } + } + + return NULL; +} + +/* GetEquivalentObject dereferences any hard links to get to the + * actual object. + */ + +struct yaffs_obj *yaffs_get_equivalent_obj(struct yaffs_obj *obj) +{ + if (obj && obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) { + /* We want the object id of the equivalent object, not this one */ + obj = obj->variant.hardlink_variant.equiv_obj; + yaffs_check_obj_details_loaded(obj); + } + return obj; +} + +/* + * A note or two on object names. + * * If the object name is missing, we then make one up in the form objnnn + * + * * ASCII names are stored in the object header's name field from byte zero + * * Unicode names are historically stored starting from byte zero. + * + * Then there are automatic Unicode names... + * The purpose of these is to save names in a way that can be read as + * ASCII or Unicode names as appropriate, thus allowing a Unicode and ASCII + * system to share files. + * + * These automatic unicode are stored slightly differently... + * - If the name can fit in the ASCII character space then they are saved as + * ascii names as per above. + * - If the name needs Unicode then the name is saved in Unicode + * starting at oh->name[1]. + + */ +static void yaffs_fix_null_name(struct yaffs_obj *obj, YCHAR * name, + int buffer_size) +{ + /* Create an object name if we could not find one. 
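 * For example, object id 427 has its digits built least-significant first
 * into num_string, giving "427", and with the objnnn-style prefix the
 * fallback name becomes "obj427".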
*/ + if (strnlen(name, YAFFS_MAX_NAME_LENGTH) == 0) { + YCHAR local_name[20]; + YCHAR num_string[20]; + YCHAR *x = &num_string[19]; + unsigned v = obj->obj_id; + num_string[19] = 0; + while (v > 0) { + x--; + *x = '0' + (v % 10); + v /= 10; + } + /* make up a name */ + strcpy(local_name, YAFFS_LOSTNFOUND_PREFIX); + strcat(local_name, x); + strncpy(name, local_name, buffer_size - 1); + } +} + +int yaffs_get_obj_name(struct yaffs_obj *obj, YCHAR * name, int buffer_size) +{ + memset(name, 0, buffer_size * sizeof(YCHAR)); + + yaffs_check_obj_details_loaded(obj); + + if (obj->obj_id == YAFFS_OBJECTID_LOSTNFOUND) { + strncpy(name, YAFFS_LOSTNFOUND_NAME, buffer_size - 1); + } +#ifndef CONFIG_YAFFS_NO_SHORT_NAMES + else if (obj->short_name[0]) { + strcpy(name, obj->short_name); + } +#endif + else if (obj->hdr_chunk > 0) { + int result; + u8 *buffer = yaffs_get_temp_buffer(obj->my_dev, __LINE__); + + struct yaffs_obj_hdr *oh = (struct yaffs_obj_hdr *)buffer; + + memset(buffer, 0, obj->my_dev->data_bytes_per_chunk); + + if (obj->hdr_chunk > 0) { + result = yaffs_rd_chunk_tags_nand(obj->my_dev, + obj->hdr_chunk, + buffer, NULL); + } + yaffs_load_name_from_oh(obj->my_dev, name, oh->name, + buffer_size); + + yaffs_release_temp_buffer(obj->my_dev, buffer, __LINE__); + } + + yaffs_fix_null_name(obj, name, buffer_size); + + return strnlen(name, YAFFS_MAX_NAME_LENGTH); +} + +int yaffs_get_obj_length(struct yaffs_obj *obj) +{ + /* Dereference any hard linking */ + obj = yaffs_get_equivalent_obj(obj); + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) + return obj->variant.file_variant.file_size; + if (obj->variant_type == YAFFS_OBJECT_TYPE_SYMLINK) { + if (!obj->variant.symlink_variant.alias) + return 0; + return strnlen(obj->variant.symlink_variant.alias, + YAFFS_MAX_ALIAS_LENGTH); + } else { + /* Only a directory should drop through to here */ + return obj->my_dev->data_bytes_per_chunk; + } +} + +int yaffs_get_obj_link_count(struct yaffs_obj *obj) +{ + int count = 0; + struct list_head *i; + + if (!obj->unlinked) + count++; /* the object itself */ + + list_for_each(i, &obj->hard_links) + count++; /* add the hard links; */ + + return count; +} + +int yaffs_get_obj_inode(struct yaffs_obj *obj) +{ + obj = yaffs_get_equivalent_obj(obj); + + return obj->obj_id; +} + +unsigned yaffs_get_obj_type(struct yaffs_obj *obj) +{ + obj = yaffs_get_equivalent_obj(obj); + + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + return DT_REG; + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + return DT_DIR; + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + return DT_LNK; + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + return DT_REG; + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + if (S_ISFIFO(obj->yst_mode)) + return DT_FIFO; + if (S_ISCHR(obj->yst_mode)) + return DT_CHR; + if (S_ISBLK(obj->yst_mode)) + return DT_BLK; + if (S_ISSOCK(obj->yst_mode)) + return DT_SOCK; + default: + return DT_REG; + break; + } +} + +YCHAR *yaffs_get_symlink_alias(struct yaffs_obj *obj) +{ + obj = yaffs_get_equivalent_obj(obj); + if (obj->variant_type == YAFFS_OBJECT_TYPE_SYMLINK) + return yaffs_clone_str(obj->variant.symlink_variant.alias); + else + return yaffs_clone_str(_Y("")); +} + +/*--------------------------- Initialisation code -------------------------- */ + +static int yaffs_check_dev_fns(const struct yaffs_dev *dev) +{ + + /* Common functions, gotta have */ + if (!dev->param.erase_fn || !dev->param.initialise_flash_fn) + return 0; + +#ifdef CONFIG_YAFFS_YAFFS2 + + /* Can use the "with tags" style interface for yaffs1 or yaffs2 */ + if 
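/*
 * Illustrative note (not driver code): the check below accepts exactly one
 * of two driver interface styles. Field names are the ones used in the test
 * itself.
 *
 *   tags style (yaffs1 or yaffs2):
 *           write_chunk_tags_fn, read_chunk_tags_fn, bad_block_fn and
 *           query_block_fn are all provided, and the raw write_chunk_fn /
 *           read_chunk_fn hooks are left NULL.
 *
 *   spare style (yaffs1 only):
 *           write_chunk_fn and read_chunk_fn are provided, and none of the
 *           tags or bad-block hooks are set.
 *
 * Anything in between is rejected so a device is never half-driven.
 */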
(dev->param.write_chunk_tags_fn && + dev->param.read_chunk_tags_fn && + !dev->param.write_chunk_fn && + !dev->param.read_chunk_fn && + dev->param.bad_block_fn && dev->param.query_block_fn) + return 1; +#endif + + /* Can use the "spare" style interface for yaffs1 */ + if (!dev->param.is_yaffs2 && + !dev->param.write_chunk_tags_fn && + !dev->param.read_chunk_tags_fn && + dev->param.write_chunk_fn && + dev->param.read_chunk_fn && + !dev->param.bad_block_fn && !dev->param.query_block_fn) + return 1; + + return 0; /* bad */ +} + +static int yaffs_create_initial_dir(struct yaffs_dev *dev) +{ + /* Initialise the unlinked, deleted, root and lost and found directories */ + + dev->lost_n_found = dev->root_dir = NULL; + dev->unlinked_dir = dev->del_dir = NULL; + + dev->unlinked_dir = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_UNLINKED, S_IFDIR); + + dev->del_dir = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_DELETED, S_IFDIR); + + dev->root_dir = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_ROOT, + YAFFS_ROOT_MODE | S_IFDIR); + dev->lost_n_found = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_LOSTNFOUND, + YAFFS_LOSTNFOUND_MODE | S_IFDIR); + + if (dev->lost_n_found && dev->root_dir && dev->unlinked_dir + && dev->del_dir) { + yaffs_add_obj_to_dir(dev->root_dir, dev->lost_n_found); + return YAFFS_OK; + } + + return YAFFS_FAIL; +} + +int yaffs_guts_initialise(struct yaffs_dev *dev) +{ + int init_failed = 0; + unsigned x; + int bits; + + yaffs_trace(YAFFS_TRACE_TRACING, "yaffs: yaffs_guts_initialise()" ); + + /* Check stuff that must be set */ + + if (!dev) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: Need a device" + ); + return YAFFS_FAIL; + } + + dev->internal_start_block = dev->param.start_block; + dev->internal_end_block = dev->param.end_block; + dev->block_offset = 0; + dev->chunk_offset = 0; + dev->n_free_chunks = 0; + + dev->gc_block = 0; + + if (dev->param.start_block == 0) { + dev->internal_start_block = dev->param.start_block + 1; + dev->internal_end_block = dev->param.end_block + 1; + dev->block_offset = 1; + dev->chunk_offset = dev->param.chunks_per_block; + } + + /* Check geometry parameters. */ + + if ((!dev->param.inband_tags && dev->param.is_yaffs2 && + dev->param.total_bytes_per_chunk < 1024) || + (!dev->param.is_yaffs2 && + dev->param.total_bytes_per_chunk < 512) || + (dev->param.inband_tags && !dev->param.is_yaffs2) || + dev->param.chunks_per_block < 2 || + dev->param.n_reserved_blocks < 2 || + dev->internal_start_block <= 0 || + dev->internal_end_block <= 0 || + dev->internal_end_block <= + (dev->internal_start_block + dev->param.n_reserved_blocks + 2) + ) { + /* otherwise it is too small */ + yaffs_trace(YAFFS_TRACE_ALWAYS, + "NAND geometry problems: chunk size %d, type is yaffs%s, inband_tags %d ", + dev->param.total_bytes_per_chunk, + dev->param.is_yaffs2 ? "2" : "", + dev->param.inband_tags); + return YAFFS_FAIL; + } + + if (yaffs_init_nand(dev) != YAFFS_OK) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "InitialiseNAND failed"); + return YAFFS_FAIL; + } + + /* Sort out space for inband tags, if required */ + if (dev->param.inband_tags) + dev->data_bytes_per_chunk = + dev->param.total_bytes_per_chunk - + sizeof(struct yaffs_packed_tags2_tags_only); + else + dev->data_bytes_per_chunk = dev->param.total_bytes_per_chunk; + + /* Got the right mix of functions? 
*/ + if (!yaffs_check_dev_fns(dev)) { + /* Function missing */ + yaffs_trace(YAFFS_TRACE_ALWAYS, + "device function(s) missing or wrong"); + + return YAFFS_FAIL; + } + + if (dev->is_mounted) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "device already mounted"); + return YAFFS_FAIL; + } + + /* Finished with most checks. One or two more checks happen later on too. */ + + dev->is_mounted = 1; + + /* OK now calculate a few things for the device */ + + /* + * Calculate all the chunk size manipulation numbers: + */ + x = dev->data_bytes_per_chunk; + /* We always use dev->chunk_shift and dev->chunk_div */ + dev->chunk_shift = calc_shifts(x); + x >>= dev->chunk_shift; + dev->chunk_div = x; + /* We only use chunk mask if chunk_div is 1 */ + dev->chunk_mask = (1 << dev->chunk_shift) - 1; + + /* + * Calculate chunk_grp_bits. + * We need to find the next power of 2 > than internal_end_block + */ + + x = dev->param.chunks_per_block * (dev->internal_end_block + 1); + + bits = calc_shifts_ceiling(x); + + /* Set up tnode width if wide tnodes are enabled. */ + if (!dev->param.wide_tnodes_disabled) { + /* bits must be even so that we end up with 32-bit words */ + if (bits & 1) + bits++; + if (bits < 16) + dev->tnode_width = 16; + else + dev->tnode_width = bits; + } else { + dev->tnode_width = 16; + } + + dev->tnode_mask = (1 << dev->tnode_width) - 1; + + /* Level0 Tnodes are 16 bits or wider (if wide tnodes are enabled), + * so if the bitwidth of the + * chunk range we're using is greater than 16 we need + * to figure out chunk shift and chunk_grp_size + */ + + if (bits <= dev->tnode_width) + dev->chunk_grp_bits = 0; + else + dev->chunk_grp_bits = bits - dev->tnode_width; + + dev->tnode_size = (dev->tnode_width * YAFFS_NTNODES_LEVEL0) / 8; + if (dev->tnode_size < sizeof(struct yaffs_tnode)) + dev->tnode_size = sizeof(struct yaffs_tnode); + + dev->chunk_grp_size = 1 << dev->chunk_grp_bits; + + if (dev->param.chunks_per_block < dev->chunk_grp_size) { + /* We have a problem because the soft delete won't work if + * the chunk group size > chunks per block. + * This can be remedied by using larger "virtual blocks". + */ + yaffs_trace(YAFFS_TRACE_ALWAYS, "chunk group too large"); + + return YAFFS_FAIL; + } + + /* OK, we've finished verifying the device, lets continue with initialisation */ + + /* More device initialisation */ + dev->all_gcs = 0; + dev->passive_gc_count = 0; + dev->oldest_dirty_gc_count = 0; + dev->bg_gcs = 0; + dev->gc_block_finder = 0; + dev->buffered_block = -1; + dev->doing_buffered_block_rewrite = 0; + dev->n_deleted_files = 0; + dev->n_bg_deletions = 0; + dev->n_unlinked_files = 0; + dev->n_ecc_fixed = 0; + dev->n_ecc_unfixed = 0; + dev->n_tags_ecc_fixed = 0; + dev->n_tags_ecc_unfixed = 0; + dev->n_erase_failures = 0; + dev->n_erased_blocks = 0; + dev->gc_disable = 0; + dev->has_pending_prioritised_gc = 1; /* Assume the worst for now, will get fixed on first GC */ + INIT_LIST_HEAD(&dev->dirty_dirs); + dev->oldest_dirty_seq = 0; + dev->oldest_dirty_block = 0; + + /* Initialise temporary buffers and caches. 
*/ + if (!yaffs_init_tmp_buffers(dev)) + init_failed = 1; + + dev->cache = NULL; + dev->gc_cleanup_list = NULL; + + if (!init_failed && dev->param.n_caches > 0) { + int i; + void *buf; + int cache_bytes = + dev->param.n_caches * sizeof(struct yaffs_cache); + + if (dev->param.n_caches > YAFFS_MAX_SHORT_OP_CACHES) + dev->param.n_caches = YAFFS_MAX_SHORT_OP_CACHES; + + dev->cache = kmalloc(cache_bytes, GFP_NOFS); + + buf = (u8 *) dev->cache; + + if (dev->cache) + memset(dev->cache, 0, cache_bytes); + + for (i = 0; i < dev->param.n_caches && buf; i++) { + dev->cache[i].object = NULL; + dev->cache[i].last_use = 0; + dev->cache[i].dirty = 0; + dev->cache[i].data = buf = + kmalloc(dev->param.total_bytes_per_chunk, GFP_NOFS); + } + if (!buf) + init_failed = 1; + + dev->cache_last_use = 0; + } + + dev->cache_hits = 0; + + if (!init_failed) { + dev->gc_cleanup_list = + kmalloc(dev->param.chunks_per_block * sizeof(u32), + GFP_NOFS); + if (!dev->gc_cleanup_list) + init_failed = 1; + } + + if (dev->param.is_yaffs2) + dev->param.use_header_file_size = 1; + + if (!init_failed && !yaffs_init_blocks(dev)) + init_failed = 1; + + yaffs_init_tnodes_and_objs(dev); + + if (!init_failed && !yaffs_create_initial_dir(dev)) + init_failed = 1; + + if (!init_failed) { + /* Now scan the flash. */ + if (dev->param.is_yaffs2) { + if (yaffs2_checkpt_restore(dev)) { + yaffs_check_obj_details_loaded(dev->root_dir); + yaffs_trace(YAFFS_TRACE_CHECKPOINT | YAFFS_TRACE_MOUNT, + "yaffs: restored from checkpoint" + ); + } else { + + /* Clean up the mess caused by an aborted checkpoint load + * and scan backwards. + */ + yaffs_deinit_blocks(dev); + + yaffs_deinit_tnodes_and_objs(dev); + + dev->n_erased_blocks = 0; + dev->n_free_chunks = 0; + dev->alloc_block = -1; + dev->alloc_page = -1; + dev->n_deleted_files = 0; + dev->n_unlinked_files = 0; + dev->n_bg_deletions = 0; + + if (!init_failed && !yaffs_init_blocks(dev)) + init_failed = 1; + + yaffs_init_tnodes_and_objs(dev); + + if (!init_failed + && !yaffs_create_initial_dir(dev)) + init_failed = 1; + + if (!init_failed && !yaffs2_scan_backwards(dev)) + init_failed = 1; + } + } else if (!yaffs1_scan(dev)) { + init_failed = 1; + } + + yaffs_strip_deleted_objs(dev); + yaffs_fix_hanging_objs(dev); + if (dev->param.empty_lost_n_found) + yaffs_empty_l_n_f(dev); + } + + if (init_failed) { + /* Clean up the mess */ + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: yaffs_guts_initialise() aborted."); + + yaffs_deinitialise(dev); + return YAFFS_FAIL; + } + + /* Zero out stats */ + dev->n_page_reads = 0; + dev->n_page_writes = 0; + dev->n_erasures = 0; + dev->n_gc_copies = 0; + dev->n_retired_writes = 0; + + dev->n_retired_blocks = 0; + + yaffs_verify_free_chunks(dev); + yaffs_verify_blocks(dev); + + /* Clean up any aborted checkpoint data */ + if (!dev->is_checkpointed && dev->blocks_in_checkpt > 0) + yaffs2_checkpt_invalidate(dev); + + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: yaffs_guts_initialise() done."); + return YAFFS_OK; + +} + +void yaffs_deinitialise(struct yaffs_dev *dev) +{ + if (dev->is_mounted) { + int i; + + yaffs_deinit_blocks(dev); + yaffs_deinit_tnodes_and_objs(dev); + if (dev->param.n_caches > 0 && dev->cache) { + + for (i = 0; i < dev->param.n_caches; i++) { + if (dev->cache[i].data) + kfree(dev->cache[i].data); + dev->cache[i].data = NULL; + } + + kfree(dev->cache); + dev->cache = NULL; + } + + kfree(dev->gc_cleanup_list); + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) + kfree(dev->temp_buffer[i].buffer); + + dev->is_mounted = 0; + + if (dev->param.deinitialise_flash_fn) + 
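/*
 * Illustrative note (not driver code): the short-op cache set up in
 * yaffs_guts_initialise() above, and freed again here in
 * yaffs_deinitialise(), costs roughly
 *
 *   n_caches * sizeof(struct yaffs_cache)     for the descriptors, plus
 *   n_caches * total_bytes_per_chunk          for the per-cache data buffers,
 *
 * with n_caches clamped to YAFFS_MAX_SHORT_OP_CACHES. For example, ten
 * caches on a device with 2048-byte chunks reserve 10 * 2048 = 20 KiB of
 * buffer space.
 */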
dev->param.deinitialise_flash_fn(dev); + } +} + +int yaffs_count_free_chunks(struct yaffs_dev *dev) +{ + int n_free = 0; + int b; + + struct yaffs_block_info *blk; + + blk = dev->block_info; + for (b = dev->internal_start_block; b <= dev->internal_end_block; b++) { + switch (blk->block_state) { + case YAFFS_BLOCK_STATE_EMPTY: + case YAFFS_BLOCK_STATE_ALLOCATING: + case YAFFS_BLOCK_STATE_COLLECTING: + case YAFFS_BLOCK_STATE_FULL: + n_free += + (dev->param.chunks_per_block - blk->pages_in_use + + blk->soft_del_pages); + break; + default: + break; + } + blk++; + } + + return n_free; +} + +int yaffs_get_n_free_chunks(struct yaffs_dev *dev) +{ + /* This is what we report to the outside world */ + + int n_free; + int n_dirty_caches; + int blocks_for_checkpt; + int i; + + n_free = dev->n_free_chunks; + n_free += dev->n_deleted_files; + + /* Now count the number of dirty chunks in the cache and subtract those */ + + for (n_dirty_caches = 0, i = 0; i < dev->param.n_caches; i++) { + if (dev->cache[i].dirty) + n_dirty_caches++; + } + + n_free -= n_dirty_caches; + + n_free -= + ((dev->param.n_reserved_blocks + 1) * dev->param.chunks_per_block); + + /* Now we figure out how much to reserve for the checkpoint and report that... */ + blocks_for_checkpt = yaffs_calc_checkpt_blocks_required(dev); + + n_free -= (blocks_for_checkpt * dev->param.chunks_per_block); + + if (n_free < 0) + n_free = 0; + + return n_free; + +} diff --git a/fs/yaffs2/yaffs_guts.h b/fs/yaffs2/yaffs_guts.h new file mode 100644 index 00000000..307eba28 --- /dev/null +++ b/fs/yaffs2/yaffs_guts.h @@ -0,0 +1,915 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_GUTS_H__ +#define __YAFFS_GUTS_H__ + +#include "yportenv.h" + +#define YAFFS_OK 1 +#define YAFFS_FAIL 0 + +/* Give us a Y=0x59, + * Give us an A=0x41, + * Give us an FF=0xFF + * Give us an S=0x53 + * And what have we got... 
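To make the free-space accounting in yaffs_get_n_free_chunks() above concrete, here is a worked example with hypothetical numbers (the geometry and counters are illustrative, not taken from this patch): chunks_per_block = 64, n_reserved_blocks = 5, n_free_chunks = 10000, n_deleted_files = 20, three dirty short-op cache entries, and a checkpoint estimated at 4 blocks.

      10000   n_free_chunks
    +    20   chunks that come back when deleted files finish cleanup
    -     3   dirty cache entries still to be flushed
    -   384   (5 + 1) reserved blocks x 64 chunks per block
    -   256   4 checkpoint blocks x 64 chunks per block
    =  9377   chunks reported to the outside world (clamped at 0 if negative)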
+ */ +#define YAFFS_MAGIC 0x5941FF53 + +#define YAFFS_NTNODES_LEVEL0 16 +#define YAFFS_TNODES_LEVEL0_BITS 4 +#define YAFFS_TNODES_LEVEL0_MASK 0xf + +#define YAFFS_NTNODES_INTERNAL (YAFFS_NTNODES_LEVEL0 / 2) +#define YAFFS_TNODES_INTERNAL_BITS (YAFFS_TNODES_LEVEL0_BITS - 1) +#define YAFFS_TNODES_INTERNAL_MASK 0x7 +#define YAFFS_TNODES_MAX_LEVEL 6 + +#ifndef CONFIG_YAFFS_NO_YAFFS1 +#define YAFFS_BYTES_PER_SPARE 16 +#define YAFFS_BYTES_PER_CHUNK 512 +#define YAFFS_CHUNK_SIZE_SHIFT 9 +#define YAFFS_CHUNKS_PER_BLOCK 32 +#define YAFFS_BYTES_PER_BLOCK (YAFFS_CHUNKS_PER_BLOCK*YAFFS_BYTES_PER_CHUNK) +#endif + +#define YAFFS_MIN_YAFFS2_CHUNK_SIZE 1024 +#define YAFFS_MIN_YAFFS2_SPARE_SIZE 32 + +#define YAFFS_MAX_CHUNK_ID 0x000FFFFF + +#define YAFFS_ALLOCATION_NOBJECTS 100 +#define YAFFS_ALLOCATION_NTNODES 100 +#define YAFFS_ALLOCATION_NLINKS 100 + +#define YAFFS_NOBJECT_BUCKETS 256 + +#define YAFFS_OBJECT_SPACE 0x40000 +#define YAFFS_MAX_OBJECT_ID (YAFFS_OBJECT_SPACE -1) + +#define YAFFS_CHECKPOINT_VERSION 4 + +#ifdef CONFIG_YAFFS_UNICODE +#define YAFFS_MAX_NAME_LENGTH 127 +#define YAFFS_MAX_ALIAS_LENGTH 79 +#else +#define YAFFS_MAX_NAME_LENGTH 255 +#define YAFFS_MAX_ALIAS_LENGTH 159 +#endif + +#define YAFFS_SHORT_NAME_LENGTH 15 + +/* Some special object ids for pseudo objects */ +#define YAFFS_OBJECTID_ROOT 1 +#define YAFFS_OBJECTID_LOSTNFOUND 2 +#define YAFFS_OBJECTID_UNLINKED 3 +#define YAFFS_OBJECTID_DELETED 4 + +/* Pseudo object ids for checkpointing */ +#define YAFFS_OBJECTID_SB_HEADER 0x10 +#define YAFFS_OBJECTID_CHECKPOINT_DATA 0x20 +#define YAFFS_SEQUENCE_CHECKPOINT_DATA 0x21 + +#define YAFFS_MAX_SHORT_OP_CACHES 20 + +#define YAFFS_N_TEMP_BUFFERS 6 + +/* We limit the number attempts at sucessfully saving a chunk of data. + * Small-page devices have 32 pages per block; large-page devices have 64. + * Default to something in the order of 5 to 10 blocks worth of chunks. + */ +#define YAFFS_WR_ATTEMPTS (5*64) + +/* Sequence numbers are used in YAFFS2 to determine block allocation order. + * The range is limited slightly to help distinguish bad numbers from good. + * This also allows us to perhaps in the future use special numbers for + * special purposes. + * EFFFFF00 allows the allocation of 8 blocks per second (~1Mbytes) for 15 years, + * and is a larger number than the lifetime of a 2GB device. + */ +#define YAFFS_LOWEST_SEQUENCE_NUMBER 0x00001000 +#define YAFFS_HIGHEST_SEQUENCE_NUMBER 0xEFFFFF00 + +/* Special sequence number for bad block that failed to be marked bad */ +#define YAFFS_SEQUENCE_BAD_BLOCK 0xFFFF0000 + +/* ChunkCache is used for short read/write operations.*/ +struct yaffs_cache { + struct yaffs_obj *object; + int chunk_id; + int last_use; + int dirty; + int n_bytes; /* Only valid if the cache is dirty */ + int locked; /* Can't push out or flush while locked. */ + u8 *data; +}; + +/* Tags structures in RAM + * NB This uses bitfield. Bitfields should not straddle a u32 boundary otherwise + * the structure size will get blown out. 
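The tnode constants above fix how a chunk index within a file is resolved: the low YAFFS_TNODES_LEVEL0_BITS (4) bits pick a slot in a level-0 tnode and every internal level above it consumes YAFFS_TNODES_INTERNAL_BITS (3) more bits, up to YAFFS_TNODES_MAX_LEVEL. The user-space sketch below only illustrates that bit decomposition; it is not the core's lookup code (yaffs_find_tnode_0(), declared later in this header, is the real entry point).

#include <stdio.h>

/* Illustrative only: split a chunk index into per-level tnode slots,
 * mirroring the level-0 and internal bit widths defined above. */
#define LEVEL0_BITS   4
#define LEVEL0_MASK   0xf
#define INTERNAL_BITS 3
#define INTERNAL_MASK 0x7

int main(void)
{
	unsigned chunk = 0x1234;	/* hypothetical chunk index within a file */
	unsigned rest = chunk >> LEVEL0_BITS;
	int level = 1;

	printf("level 0 slot: %u\n", chunk & LEVEL0_MASK);
	while (rest) {
		printf("level %d slot: %u\n", level++, rest & INTERNAL_MASK);
		rest >>= INTERNAL_BITS;
	}
	return 0;
}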
+ */ + +#ifndef CONFIG_YAFFS_NO_YAFFS1 +struct yaffs_tags { + unsigned chunk_id:20; + unsigned serial_number:2; + unsigned n_bytes_lsb:10; + unsigned obj_id:18; + unsigned ecc:12; + unsigned n_bytes_msb:2; +}; + +union yaffs_tags_union { + struct yaffs_tags as_tags; + u8 as_bytes[8]; +}; + +#endif + +/* Stuff used for extended tags in YAFFS2 */ + +enum yaffs_ecc_result { + YAFFS_ECC_RESULT_UNKNOWN, + YAFFS_ECC_RESULT_NO_ERROR, + YAFFS_ECC_RESULT_FIXED, + YAFFS_ECC_RESULT_UNFIXED +}; + +enum yaffs_obj_type { + YAFFS_OBJECT_TYPE_UNKNOWN, + YAFFS_OBJECT_TYPE_FILE, + YAFFS_OBJECT_TYPE_SYMLINK, + YAFFS_OBJECT_TYPE_DIRECTORY, + YAFFS_OBJECT_TYPE_HARDLINK, + YAFFS_OBJECT_TYPE_SPECIAL +}; + +#define YAFFS_OBJECT_TYPE_MAX YAFFS_OBJECT_TYPE_SPECIAL + +struct yaffs_ext_tags { + + unsigned validity0; + unsigned chunk_used; /* Status of the chunk: used or unused */ + unsigned obj_id; /* If 0 then this is not part of an object (unused) */ + unsigned chunk_id; /* If 0 then this is a header, else a data chunk */ + unsigned n_bytes; /* Only valid for data chunks */ + + /* The following stuff only has meaning when we read */ + enum yaffs_ecc_result ecc_result; + unsigned block_bad; + + /* YAFFS 1 stuff */ + unsigned is_deleted; /* The chunk is marked deleted */ + unsigned serial_number; /* Yaffs1 2-bit serial number */ + + /* YAFFS2 stuff */ + unsigned seq_number; /* The sequence number of this block */ + + /* Extra info if this is an object header (YAFFS2 only) */ + + unsigned extra_available; /* There is extra info available if this is not zero */ + unsigned extra_parent_id; /* The parent object */ + unsigned extra_is_shrink; /* Is it a shrink header? */ + unsigned extra_shadows; /* Does this shadow another object? */ + + enum yaffs_obj_type extra_obj_type; /* What object type? */ + + unsigned extra_length; /* Length if it is a file */ + unsigned extra_equiv_id; /* Equivalent object Id if it is a hard link */ + + unsigned validity1; + +}; + +/* Spare structure for YAFFS1 */ +struct yaffs_spare { + u8 tb0; + u8 tb1; + u8 tb2; + u8 tb3; + u8 page_status; /* set to 0 to delete the chunk */ + u8 block_status; + u8 tb4; + u8 tb5; + u8 ecc1[3]; + u8 tb6; + u8 tb7; + u8 ecc2[3]; +}; + +/*Special structure for passing through to mtd */ +struct yaffs_nand_spare { + struct yaffs_spare spare; + int eccres1; + int eccres2; +}; + +/* Block data in RAM */ + +enum yaffs_block_state { + YAFFS_BLOCK_STATE_UNKNOWN = 0, + + YAFFS_BLOCK_STATE_SCANNING, + /* Being scanned */ + + YAFFS_BLOCK_STATE_NEEDS_SCANNING, + /* The block might have something on it (ie it is allocating or full, perhaps empty) + * but it needs to be scanned to determine its true state. + * This state is only valid during scanning. + * NB We tolerate empty because the pre-scanner might be incapable of deciding + * However, if this state is returned on a YAFFS2 device, then we expect a sequence number + */ + + YAFFS_BLOCK_STATE_EMPTY, + /* This block is empty */ + + YAFFS_BLOCK_STATE_ALLOCATING, + /* This block is partially allocated. + * At least one page holds valid data. + * This is the one currently being used for page + * allocation. Should never be more than one of these. + * If a block is only partially allocated at mount it is treated as full. + */ + + YAFFS_BLOCK_STATE_FULL, + /* All the pages in this block have been allocated. + * If a block was only partially allocated when mounted we treat + * it as fully allocated. + */ + + YAFFS_BLOCK_STATE_DIRTY, + /* The block was full and now all chunks have been deleted. + * Erase me, reuse me. 
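The bitfield warning above is what keeps struct yaffs_tags at exactly 8 bytes, matching union yaffs_tags_union's as_bytes[8]; yaffs_mtdif1.c later in this patch enforces the same invariant with compile_time_assertion(sizeof(struct yaffs_tags) == 8). A small host-side sketch (a local copy of the same bitfield layout, purely illustrative) demonstrates the packing:

#include <assert.h>

/* Local copy of the yaffs1 tag bitfields: 20 + 2 + 10 = 32 bits in the
 * first word and 18 + 12 + 2 = 32 bits in the second, so nothing
 * straddles a u32 boundary and the struct stays at two words. */
struct tags_copy {
	unsigned chunk_id:20;
	unsigned serial_number:2;
	unsigned n_bytes_lsb:10;
	unsigned obj_id:18;
	unsigned ecc:12;
	unsigned n_bytes_msb:2;
};

int main(void)
{
	assert(sizeof(struct tags_copy) == 8);
	return 0;
}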
+ */ + + YAFFS_BLOCK_STATE_CHECKPOINT, + /* This block is assigned to holding checkpoint data. */ + + YAFFS_BLOCK_STATE_COLLECTING, + /* This block is being garbage collected */ + + YAFFS_BLOCK_STATE_DEAD + /* This block has failed and is not in use */ +}; + +#define YAFFS_NUMBER_OF_BLOCK_STATES (YAFFS_BLOCK_STATE_DEAD + 1) + +struct yaffs_block_info { + + int soft_del_pages:10; /* number of soft deleted pages */ + int pages_in_use:10; /* number of pages in use */ + unsigned block_state:4; /* One of the above block states. NB use unsigned because enum is sometimes an int */ + u32 needs_retiring:1; /* Data has failed on this block, need to get valid data off */ + /* and retire the block. */ + u32 skip_erased_check:1; /* If this is set we can skip the erased check on this block */ + u32 gc_prioritise:1; /* An ECC check or blank check has failed on this block. + It should be prioritised for GC */ + u32 chunk_error_strikes:3; /* How many times we've had ecc etc failures on this block and tried to reuse it */ + +#ifdef CONFIG_YAFFS_YAFFS2 + u32 has_shrink_hdr:1; /* This block has at least one shrink object header */ + u32 seq_number; /* block sequence number for yaffs2 */ +#endif + +}; + +/* -------------------------- Object structure -------------------------------*/ +/* This is the object structure as stored on NAND */ + +struct yaffs_obj_hdr { + enum yaffs_obj_type type; + + /* Apply to everything */ + int parent_obj_id; + u16 sum_no_longer_used; /* checksum of name. No longer used */ + YCHAR name[YAFFS_MAX_NAME_LENGTH + 1]; + + /* The following apply to directories, files, symlinks - not hard links */ + u32 yst_mode; /* protection */ + + u32 yst_uid; + u32 yst_gid; + u32 yst_atime; + u32 yst_mtime; + u32 yst_ctime; + + /* File size applies to files only */ + int file_size; + + /* Equivalent object id applies to hard links only. */ + int equiv_id; + + /* Alias is for symlinks only. */ + YCHAR alias[YAFFS_MAX_ALIAS_LENGTH + 1]; + + u32 yst_rdev; /* device stuff for block and char devices (major/min) */ + + u32 win_ctime[2]; + u32 win_atime[2]; + u32 win_mtime[2]; + + u32 inband_shadowed_obj_id; + u32 inband_is_shrink; + + u32 reserved[2]; + int shadows_obj; /* This object header shadows the specified object if > 0 */ + + /* is_shrink applies to object headers written when we shrink the file (ie resize) */ + u32 is_shrink; + +}; + +/*--------------------------- Tnode -------------------------- */ + +struct yaffs_tnode { + struct yaffs_tnode *internal[YAFFS_NTNODES_INTERNAL]; +}; + +/*------------------------ Object -----------------------------*/ +/* An object can be one of: + * - a directory (no data, has children links + * - a regular file (data.... not prunes :->). + * - a symlink [symbolic link] (the alias). + * - a hard link + */ + +struct yaffs_file_var { + u32 file_size; + u32 scanned_size; + u32 shrink_size; + int top_level; + struct yaffs_tnode *top; +}; + +struct yaffs_dir_var { + struct list_head children; /* list of child links */ + struct list_head dirty; /* Entry for list of dirty directories */ +}; + +struct yaffs_symlink_var { + YCHAR *alias; +}; + +struct yaffs_hardlink_var { + struct yaffs_obj *equiv_obj; + u32 equiv_id; +}; + +union yaffs_obj_var { + struct yaffs_file_var file_variant; + struct yaffs_dir_var dir_variant; + struct yaffs_symlink_var symlink_variant; + struct yaffs_hardlink_var hardlink_variant; +}; + +struct yaffs_obj { + u8 deleted:1; /* This should only apply to unlinked files. 
*/ + u8 soft_del:1; /* it has also been soft deleted */ + u8 unlinked:1; /* An unlinked file. The file should be in the unlinked directory. */ + u8 fake:1; /* A fake object has no presence on NAND. */ + u8 rename_allowed:1; /* Some objects are not allowed to be renamed. */ + u8 unlink_allowed:1; + u8 dirty:1; /* the object needs to be written to flash */ + u8 valid:1; /* When the file system is being loaded up, this + * object might be created before the data + * is available (ie. file data records appear before the header). + */ + u8 lazy_loaded:1; /* This object has been lazy loaded and is missing some detail */ + + u8 defered_free:1; /* For Linux kernel. Object is removed from NAND, but is + * still in the inode cache. Free of object is defered. + * until the inode is released. + */ + u8 being_created:1; /* This object is still being created so skip some checks. */ + u8 is_shadowed:1; /* This object is shadowed on the way to being renamed. */ + + u8 xattr_known:1; /* We know if this has object has xattribs or not. */ + u8 has_xattr:1; /* This object has xattribs. Valid if xattr_known. */ + + u8 serial; /* serial number of chunk in NAND. Cached here */ + u16 sum; /* sum of the name to speed searching */ + + struct yaffs_dev *my_dev; /* The device I'm on */ + + struct list_head hash_link; /* list of objects in this hash bucket */ + + struct list_head hard_links; /* all the equivalent hard linked objects */ + + /* directory structure stuff */ + /* also used for linking up the free list */ + struct yaffs_obj *parent; + struct list_head siblings; + + /* Where's my object header in NAND? */ + int hdr_chunk; + + int n_data_chunks; /* Number of data chunks attached to the file. */ + + u32 obj_id; /* the object id value */ + + u32 yst_mode; + +#ifndef CONFIG_YAFFS_NO_SHORT_NAMES + YCHAR short_name[YAFFS_SHORT_NAME_LENGTH + 1]; +#endif + +#ifdef CONFIG_YAFFS_WINCE + u32 win_ctime[2]; + u32 win_mtime[2]; + u32 win_atime[2]; +#else + u32 yst_uid; + u32 yst_gid; + u32 yst_atime; + u32 yst_mtime; + u32 yst_ctime; +#endif + + u32 yst_rdev; + + void *my_inode; + + enum yaffs_obj_type variant_type; + + union yaffs_obj_var variant; + +}; + +struct yaffs_obj_bucket { + struct list_head list; + int count; +}; + +/* yaffs_checkpt_obj holds the definition of an object as dumped + * by checkpointing. + */ + +struct yaffs_checkpt_obj { + int struct_type; + u32 obj_id; + u32 parent_id; + int hdr_chunk; + enum yaffs_obj_type variant_type:3; + u8 deleted:1; + u8 soft_del:1; + u8 unlinked:1; + u8 fake:1; + u8 rename_allowed:1; + u8 unlink_allowed:1; + u8 serial; + int n_data_chunks; + u32 size_or_equiv_obj; +}; + +/*--------------------- Temporary buffers ---------------- + * + * These are chunk-sized working buffers. Each device has a few + */ + +struct yaffs_buffer { + u8 *buffer; + int line; /* track from whence this buffer was allocated */ + int max_line; +}; + +/*----------------- Device ---------------------------------*/ + +struct yaffs_param { + const YCHAR *name; + + /* + * Entry parameters set up way early. Yaffs sets up the rest. + * The structure should be zeroed out before use so that unused + * and defualt values are zero. 
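struct yaffs_obj above carries hash_link precisely so that every live object can sit in one of YAFFS_NOBJECT_BUCKETS buckets of struct yaffs_obj_bucket (the per-device obj_bucket array appears further down in struct yaffs_dev). The sketch below shows an object-id lookup over those buckets; the simple modulo hash is an assumption about how yaffs_find_by_number() behaves, not something stated in this header.

#include <linux/list.h>
#include "yaffs_guts.h"

/* Illustrative lookup of an object by id via the hash buckets. */
static struct yaffs_obj *find_by_number_sketch(struct yaffs_dev *dev, u32 number)
{
	struct yaffs_obj_bucket *bucket =
	    &dev->obj_bucket[number % YAFFS_NOBJECT_BUCKETS];
	struct list_head *i;

	list_for_each(i, &bucket->list) {
		struct yaffs_obj *obj =
		    list_entry(i, struct yaffs_obj, hash_link);
		if (obj->obj_id == number)
			return obj;
	}
	return NULL;
}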
+ */ + + int inband_tags; /* Use unband tags */ + u32 total_bytes_per_chunk; /* Should be >= 512, does not need to be a power of 2 */ + int chunks_per_block; /* does not need to be a power of 2 */ + int spare_bytes_per_chunk; /* spare area size */ + int start_block; /* Start block we're allowed to use */ + int end_block; /* End block we're allowed to use */ + int n_reserved_blocks; /* We want this tuneable so that we can reduce */ + /* reserved blocks on NOR and RAM. */ + + int n_caches; /* If <= 0, then short op caching is disabled, else + * the number of short op caches (don't use too many). + * 10 to 20 is a good bet. + */ + int use_nand_ecc; /* Flag to decide whether or not to use NANDECC on data (yaffs1) */ + int no_tags_ecc; /* Flag to decide whether or not to do ECC on packed tags (yaffs2) */ + + int is_yaffs2; /* Use yaffs2 mode on this device */ + + int empty_lost_n_found; /* Auto-empty lost+found directory on mount */ + + int refresh_period; /* How often we should check to do a block refresh */ + + /* Checkpoint control. Can be set before or after initialisation */ + u8 skip_checkpt_rd; + u8 skip_checkpt_wr; + + int enable_xattr; /* Enable xattribs */ + + /* NAND access functions (Must be set before calling YAFFS) */ + + int (*write_chunk_fn) (struct yaffs_dev * dev, + int nand_chunk, const u8 * data, + const struct yaffs_spare * spare); + int (*read_chunk_fn) (struct yaffs_dev * dev, + int nand_chunk, u8 * data, + struct yaffs_spare * spare); + int (*erase_fn) (struct yaffs_dev * dev, int flash_block); + int (*initialise_flash_fn) (struct yaffs_dev * dev); + int (*deinitialise_flash_fn) (struct yaffs_dev * dev); + +#ifdef CONFIG_YAFFS_YAFFS2 + int (*write_chunk_tags_fn) (struct yaffs_dev * dev, + int nand_chunk, const u8 * data, + const struct yaffs_ext_tags * tags); + int (*read_chunk_tags_fn) (struct yaffs_dev * dev, + int nand_chunk, u8 * data, + struct yaffs_ext_tags * tags); + int (*bad_block_fn) (struct yaffs_dev * dev, int block_no); + int (*query_block_fn) (struct yaffs_dev * dev, int block_no, + enum yaffs_block_state * state, + u32 * seq_number); +#endif + + /* The remove_obj_fn function must be supplied by OS flavours that + * need it. + * yaffs direct uses it to implement the faster readdir. + * Linux uses it to protect the directory during unlocking. + */ + void (*remove_obj_fn) (struct yaffs_obj * obj); + + /* Callback to mark the superblock dirty */ + void (*sb_dirty_fn) (struct yaffs_dev * dev); + + /* Callback to control garbage collection. */ + unsigned (*gc_control) (struct yaffs_dev * dev); + + /* Debug control flags. Don't use unless you know what you're doing */ + int use_header_file_size; /* Flag to determine if we should use file sizes from the header */ + int disable_lazy_load; /* Disable lazy loading on this device */ + int wide_tnodes_disabled; /* Set to disable wide tnodes */ + int disable_soft_del; /* yaffs 1 only: Set to disable the use of softdeletion. */ + + int defered_dir_update; /* Set to defer directory updates */ + +#ifdef CONFIG_YAFFS_AUTO_UNICODE + int auto_unicode; +#endif + int always_check_erased; /* Force chunk erased check always on */ +}; + +struct yaffs_dev { + struct yaffs_param param; + + /* Context storage. Holds extra OS specific data for this device */ + + void *os_context; + void *driver_context; + + struct list_head dev_list; + + /* Runtime parameters. Set up by YAFFS. 
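struct yaffs_param above is the whole contract between the yaffs core and a NAND driver: geometry, policy flags and the flash access callbacks. The sketch below shows how an MTD-backed yaffs2 device might be wired up before yaffs_guts_initialise() is called, using the nandmtd2_*/nandmtd_* functions added later in this patch; the geometry numbers are invented for the example and the real setup lives in the yaffs VFS glue, so treat this as illustrative only.

#include <linux/mtd/mtd.h>
#include "yaffs_guts.h"
#include "yaffs_mtdif.h"
#include "yaffs_mtdif2.h"

/* Illustrative wiring of a yaffs2 device over MTD (assumes
 * CONFIG_YAFFS_YAFFS2 so the tags-based callbacks exist). */
static void setup_dev_sketch(struct yaffs_dev *dev, struct mtd_info *mtd)
{
	dev->driver_context = mtd;	/* recovered later via yaffs_dev_to_mtd() */

	dev->param.total_bytes_per_chunk = 2048;	/* example 2KiB-page NAND */
	dev->param.chunks_per_block = 64;
	dev->param.n_reserved_blocks = 5;
	dev->param.start_block = 0;
	dev->param.end_block = 1023;
	dev->param.is_yaffs2 = 1;
	dev->param.n_caches = 10;

	dev->param.write_chunk_tags_fn = nandmtd2_write_chunk_tags;
	dev->param.read_chunk_tags_fn = nandmtd2_read_chunk_tags;
	dev->param.bad_block_fn = nandmtd2_mark_block_bad;
	dev->param.query_block_fn = nandmtd2_query_block;
	dev->param.erase_fn = nandmtd_erase_block;
	dev->param.initialise_flash_fn = nandmtd_initialise;
}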
*/ + int data_bytes_per_chunk; + + /* Non-wide tnode stuff */ + u16 chunk_grp_bits; /* Number of bits that need to be resolved if + * the tnodes are not wide enough. + */ + u16 chunk_grp_size; /* == 2^^chunk_grp_bits */ + + /* Stuff to support wide tnodes */ + u32 tnode_width; + u32 tnode_mask; + u32 tnode_size; + + /* Stuff for figuring out file offset to chunk conversions */ + u32 chunk_shift; /* Shift value */ + u32 chunk_div; /* Divisor after shifting: 1 for power-of-2 sizes */ + u32 chunk_mask; /* Mask to use for power-of-2 case */ + + int is_mounted; + int read_only; + int is_checkpointed; + + /* Stuff to support block offsetting to support start block zero */ + int internal_start_block; + int internal_end_block; + int block_offset; + int chunk_offset; + + /* Runtime checkpointing stuff */ + int checkpt_page_seq; /* running sequence number of checkpoint pages */ + int checkpt_byte_count; + int checkpt_byte_offs; + u8 *checkpt_buffer; + int checkpt_open_write; + int blocks_in_checkpt; + int checkpt_cur_chunk; + int checkpt_cur_block; + int checkpt_next_block; + int *checkpt_block_list; + int checkpt_max_blocks; + u32 checkpt_sum; + u32 checkpt_xor; + + int checkpoint_blocks_required; /* Number of blocks needed to store current checkpoint set */ + + /* Block Info */ + struct yaffs_block_info *block_info; + u8 *chunk_bits; /* bitmap of chunks in use */ + unsigned block_info_alt:1; /* was allocated using alternative strategy */ + unsigned chunk_bits_alt:1; /* was allocated using alternative strategy */ + int chunk_bit_stride; /* Number of bytes of chunk_bits per block. + * Must be consistent with chunks_per_block. + */ + + int n_erased_blocks; + int alloc_block; /* Current block being allocated off */ + u32 alloc_page; + int alloc_block_finder; /* Used to search for next allocation block */ + + /* Object and Tnode memory management */ + void *allocator; + int n_obj; + int n_tnodes; + + int n_hardlinks; + + struct yaffs_obj_bucket obj_bucket[YAFFS_NOBJECT_BUCKETS]; + u32 bucket_finder; + + int n_free_chunks; + + /* Garbage collection control */ + u32 *gc_cleanup_list; /* objects to delete at the end of a GC. */ + u32 n_clean_ups; + + unsigned has_pending_prioritised_gc; /* We think this device might have pending prioritised gcs */ + unsigned gc_disable; + unsigned gc_block_finder; + unsigned gc_dirtiest; + unsigned gc_pages_in_use; + unsigned gc_not_done; + unsigned gc_block; + unsigned gc_chunk; + unsigned gc_skip; + + /* Special directories */ + struct yaffs_obj *root_dir; + struct yaffs_obj *lost_n_found; + + /* Buffer areas for storing data to recover from write failures TODO + * u8 buffered_data[YAFFS_CHUNKS_PER_BLOCK][YAFFS_BYTES_PER_CHUNK]; + * struct yaffs_spare buffered_spare[YAFFS_CHUNKS_PER_BLOCK]; + */ + + int buffered_block; /* Which block is buffered here? */ + int doing_buffered_block_rewrite; + + struct yaffs_cache *cache; + int cache_last_use; + + /* Stuff for background deletion and unlinked files. */ + struct yaffs_obj *unlinked_dir; /* Directory where unlinked and deleted files live. */ + struct yaffs_obj *del_dir; /* Directory where deleted objects are sent to disappear. */ + struct yaffs_obj *unlinked_deletion; /* Current file being background deleted. */ + int n_deleted_files; /* Count of files awaiting deletion; */ + int n_unlinked_files; /* Count of unlinked files. */ + int n_bg_deletions; /* Count of background deletions. 
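The chunk_shift/chunk_div/chunk_mask fields above exist so the core can turn a file offset into a chunk index plus an offset inside that chunk. The helper below is reconstructed from the field comments rather than copied from the patch: a pure shift-and-mask when the chunk size is a power of 2, otherwise a divide by chunk_div followed by a remainder computed from data_bytes_per_chunk.

#include "yaffs_guts.h"

/* Illustrative offset-to-chunk conversion; assumes
 * data_bytes_per_chunk == chunk_div << chunk_shift. */
static void addr_to_chunk_sketch(struct yaffs_dev *dev, loff_t addr,
				 int *chunk_out, u32 *offset_out)
{
	int chunk = (int)(addr >> dev->chunk_shift);

	if (dev->chunk_div == 1) {
		/* Power-of-2 chunk size: the mask gives the in-chunk offset. */
		*offset_out = (u32)(addr & dev->chunk_mask);
	} else {
		/* General case: finish the division, then take the remainder. */
		chunk /= dev->chunk_div;
		*offset_out = (u32)(addr -
				    (loff_t)chunk * dev->data_bytes_per_chunk);
	}
	*chunk_out = chunk;
}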
*/ + + /* Temporary buffer management */ + struct yaffs_buffer temp_buffer[YAFFS_N_TEMP_BUFFERS]; + int max_temp; + int temp_in_use; + int unmanaged_buffer_allocs; + int unmanaged_buffer_deallocs; + + /* yaffs2 runtime stuff */ + unsigned seq_number; /* Sequence number of currently allocating block */ + unsigned oldest_dirty_seq; + unsigned oldest_dirty_block; + + /* Block refreshing */ + int refresh_skip; /* A skip down counter. Refresh happens when this gets to zero. */ + + /* Dirty directory handling */ + struct list_head dirty_dirs; /* List of dirty directories */ + + /* Statistcs */ + u32 n_page_writes; + u32 n_page_reads; + u32 n_erasures; + u32 n_erase_failures; + u32 n_gc_copies; + u32 all_gcs; + u32 passive_gc_count; + u32 oldest_dirty_gc_count; + u32 n_gc_blocks; + u32 bg_gcs; + u32 n_retired_writes; + u32 n_retired_blocks; + u32 n_ecc_fixed; + u32 n_ecc_unfixed; + u32 n_tags_ecc_fixed; + u32 n_tags_ecc_unfixed; + u32 n_deletions; + u32 n_unmarked_deletions; + u32 refresh_count; + u32 cache_hits; + +}; + +/* The CheckpointDevice structure holds the device information that changes at runtime and + * must be preserved over unmount/mount cycles. + */ +struct yaffs_checkpt_dev { + int struct_type; + int n_erased_blocks; + int alloc_block; /* Current block being allocated off */ + u32 alloc_page; + int n_free_chunks; + + int n_deleted_files; /* Count of files awaiting deletion; */ + int n_unlinked_files; /* Count of unlinked files. */ + int n_bg_deletions; /* Count of background deletions. */ + + /* yaffs2 runtime stuff */ + unsigned seq_number; /* Sequence number of currently allocating block */ + +}; + +struct yaffs_checkpt_validity { + int struct_type; + u32 magic; + u32 version; + u32 head; +}; + +struct yaffs_shadow_fixer { + int obj_id; + int shadowed_id; + struct yaffs_shadow_fixer *next; +}; + +/* Structure for doing xattr modifications */ +struct yaffs_xattr_mod { + int set; /* If 0 then this is a deletion */ + const YCHAR *name; + const void *data; + int size; + int flags; + int result; +}; + +/*----------------------- YAFFS Functions -----------------------*/ + +int yaffs_guts_initialise(struct yaffs_dev *dev); +void yaffs_deinitialise(struct yaffs_dev *dev); + +int yaffs_get_n_free_chunks(struct yaffs_dev *dev); + +int yaffs_rename_obj(struct yaffs_obj *old_dir, const YCHAR * old_name, + struct yaffs_obj *new_dir, const YCHAR * new_name); + +int yaffs_unlinker(struct yaffs_obj *dir, const YCHAR * name); +int yaffs_del_obj(struct yaffs_obj *obj); + +int yaffs_get_obj_name(struct yaffs_obj *obj, YCHAR * name, int buffer_size); +int yaffs_get_obj_length(struct yaffs_obj *obj); +int yaffs_get_obj_inode(struct yaffs_obj *obj); +unsigned yaffs_get_obj_type(struct yaffs_obj *obj); +int yaffs_get_obj_link_count(struct yaffs_obj *obj); + +/* File operations */ +int yaffs_file_rd(struct yaffs_obj *obj, u8 * buffer, loff_t offset, + int n_bytes); +int yaffs_wr_file(struct yaffs_obj *obj, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough); +int yaffs_resize_file(struct yaffs_obj *obj, loff_t new_size); + +struct yaffs_obj *yaffs_create_file(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid); + +int yaffs_flush_file(struct yaffs_obj *obj, int update_time, int data_sync); + +/* Flushing and checkpointing */ +void yaffs_flush_whole_cache(struct yaffs_dev *dev); + +int yaffs_checkpoint_save(struct yaffs_dev *dev); +int yaffs_checkpoint_restore(struct yaffs_dev *dev); + +/* Directory operations */ +struct yaffs_obj *yaffs_create_dir(struct 
yaffs_obj *parent, const YCHAR * name, + u32 mode, u32 uid, u32 gid); +struct yaffs_obj *yaffs_find_by_name(struct yaffs_obj *the_dir, + const YCHAR * name); +struct yaffs_obj *yaffs_find_by_number(struct yaffs_dev *dev, u32 number); + +/* Link operations */ +struct yaffs_obj *yaffs_link_obj(struct yaffs_obj *parent, const YCHAR * name, + struct yaffs_obj *equiv_obj); + +struct yaffs_obj *yaffs_get_equivalent_obj(struct yaffs_obj *obj); + +/* Symlink operations */ +struct yaffs_obj *yaffs_create_symlink(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, const YCHAR * alias); +YCHAR *yaffs_get_symlink_alias(struct yaffs_obj *obj); + +/* Special inodes (fifos, sockets and devices) */ +struct yaffs_obj *yaffs_create_special(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, u32 rdev); + +int yaffs_set_xattrib(struct yaffs_obj *obj, const YCHAR * name, + const void *value, int size, int flags); +int yaffs_get_xattrib(struct yaffs_obj *obj, const YCHAR * name, void *value, + int size); +int yaffs_list_xattrib(struct yaffs_obj *obj, char *buffer, int size); +int yaffs_remove_xattrib(struct yaffs_obj *obj, const YCHAR * name); + +/* Special directories */ +struct yaffs_obj *yaffs_root(struct yaffs_dev *dev); +struct yaffs_obj *yaffs_lost_n_found(struct yaffs_dev *dev); + +void yaffs_handle_defered_free(struct yaffs_obj *obj); + +void yaffs_update_dirty_dirs(struct yaffs_dev *dev); + +int yaffs_bg_gc(struct yaffs_dev *dev, unsigned urgency); + +/* Debug dump */ +int yaffs_dump_obj(struct yaffs_obj *obj); + +void yaffs_guts_test(struct yaffs_dev *dev); + +/* A few useful functions to be used within the core files*/ +void yaffs_chunk_del(struct yaffs_dev *dev, int chunk_id, int mark_flash, + int lyn); +int yaffs_check_ff(u8 * buffer, int n_bytes); +void yaffs_handle_chunk_error(struct yaffs_dev *dev, + struct yaffs_block_info *bi); + +u8 *yaffs_get_temp_buffer(struct yaffs_dev *dev, int line_no); +void yaffs_release_temp_buffer(struct yaffs_dev *dev, u8 * buffer, int line_no); + +struct yaffs_obj *yaffs_find_or_create_by_number(struct yaffs_dev *dev, + int number, + enum yaffs_obj_type type); +int yaffs_put_chunk_in_file(struct yaffs_obj *in, int inode_chunk, + int nand_chunk, int in_scan); +void yaffs_set_obj_name(struct yaffs_obj *obj, const YCHAR * name); +void yaffs_set_obj_name_from_oh(struct yaffs_obj *obj, + const struct yaffs_obj_hdr *oh); +void yaffs_add_obj_to_dir(struct yaffs_obj *directory, struct yaffs_obj *obj); +YCHAR *yaffs_clone_str(const YCHAR * str); +void yaffs_link_fixup(struct yaffs_dev *dev, struct yaffs_obj *hard_list); +void yaffs_block_became_dirty(struct yaffs_dev *dev, int block_no); +int yaffs_update_oh(struct yaffs_obj *in, const YCHAR * name, + int force, int is_shrink, int shadows, + struct yaffs_xattr_mod *xop); +void yaffs_handle_shadowed_obj(struct yaffs_dev *dev, int obj_id, + int backward_scanning); +int yaffs_check_alloc_available(struct yaffs_dev *dev, int n_chunks); +struct yaffs_tnode *yaffs_get_tnode(struct yaffs_dev *dev); +struct yaffs_tnode *yaffs_add_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id, + struct yaffs_tnode *passed_tn); + +int yaffs_do_file_wr(struct yaffs_obj *in, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough); +void yaffs_resize_file_down(struct yaffs_obj *obj, loff_t new_size); +void yaffs_skip_rest_of_block(struct yaffs_dev *dev); + +int yaffs_count_free_chunks(struct yaffs_dev *dev); + +struct yaffs_tnode 
*yaffs_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id); + +u32 yaffs_get_group_base(struct yaffs_dev *dev, struct yaffs_tnode *tn, + unsigned pos); + +int yaffs_is_non_empty_dir(struct yaffs_obj *obj); +#endif diff --git a/fs/yaffs2/yaffs_linux.h b/fs/yaffs2/yaffs_linux.h new file mode 100644 index 00000000..3b508cbc --- /dev/null +++ b/fs/yaffs2/yaffs_linux.h @@ -0,0 +1,41 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_LINUX_H__ +#define __YAFFS_LINUX_H__ + +#include "yportenv.h" + +struct yaffs_linux_context { + struct list_head context_list; /* List of these we have mounted */ + struct yaffs_dev *dev; + struct super_block *super; + struct task_struct *bg_thread; /* Background thread for this device */ + int bg_running; + struct mutex gross_lock; /* Gross locking mutex*/ + u8 *spare_buffer; /* For mtdif2 use. Don't know the size of the buffer + * at compile time so we have to allocate it. + */ + struct list_head search_contexts; + void (*put_super_fn) (struct super_block * sb); + + struct task_struct *readdir_process; + unsigned mount_id; +}; + +#define yaffs_dev_to_lc(dev) ((struct yaffs_linux_context *)((dev)->os_context)) +#define yaffs_dev_to_mtd(dev) ((struct mtd_info *)((dev)->driver_context)) + +#endif diff --git a/fs/yaffs2/yaffs_mtdif.c b/fs/yaffs2/yaffs_mtdif.c new file mode 100644 index 00000000..7cf53b3d --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif.c @@ -0,0 +1,54 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yportenv.h" + +#include "yaffs_mtdif.h" + +#include "linux/mtd/mtd.h" +#include "linux/types.h" +#include "linux/time.h" +#include "linux/mtd/nand.h" + +#include "yaffs_linux.h" + +int nandmtd_erase_block(struct yaffs_dev *dev, int block_no) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + u32 addr = + ((loff_t) block_no) * dev->param.total_bytes_per_chunk + * dev->param.chunks_per_block; + struct erase_info ei; + + int retval = 0; + + ei.mtd = mtd; + ei.addr = addr; + ei.len = dev->param.total_bytes_per_chunk * dev->param.chunks_per_block; + ei.time = 1000; + ei.retries = 2; + ei.callback = NULL; + ei.priv = (u_long) dev; + + retval = mtd->erase(mtd, &ei); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + +int nandmtd_initialise(struct yaffs_dev *dev) +{ + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_mtdif.h b/fs/yaffs2/yaffs_mtdif.h new file mode 100644 index 00000000..66650741 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif.h @@ -0,0 +1,23 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
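struct yaffs_linux_context above gathers the Linux-side state for one mount and is reached through the two opaque pointers in struct yaffs_dev via the yaffs_dev_to_lc()/yaffs_dev_to_mtd() macros. A sketch of attaching such a context follows; the allocation and the fields touched are illustrative, and the real VFS glue also sets up the list heads, background thread and the rest.

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/mtd/mtd.h>
#include "yaffs_guts.h"
#include "yaffs_linux.h"

/* Illustrative only: hang a Linux context and the backing MTD off a
 * yaffs_dev so the accessor macros above can recover them. */
static int attach_context_sketch(struct yaffs_dev *dev,
				 struct super_block *sb, struct mtd_info *mtd)
{
	struct yaffs_linux_context *lc = kzalloc(sizeof(*lc), GFP_KERNEL);

	if (!lc)
		return -ENOMEM;

	lc->dev = dev;
	lc->super = sb;
	mutex_init(&lc->gross_lock);

	dev->os_context = lc;		/* yaffs_dev_to_lc(dev) now returns lc */
	dev->driver_context = mtd;	/* yaffs_dev_to_mtd(dev) now returns mtd */
	return 0;
}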
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_MTDIF_H__ +#define __YAFFS_MTDIF_H__ + +#include "yaffs_guts.h" + +int nandmtd_erase_block(struct yaffs_dev *dev, int block_no); +int nandmtd_initialise(struct yaffs_dev *dev); +#endif diff --git a/fs/yaffs2/yaffs_mtdif1.c b/fs/yaffs2/yaffs_mtdif1.c new file mode 100644 index 00000000..51083695 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif1.c @@ -0,0 +1,330 @@ +/* + * YAFFS: Yet another FFS. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This module provides the interface between yaffs_nand.c and the + * MTD API. This version is used when the MTD interface supports the + * 'mtd_oob_ops' style calls to read_oob and write_oob, circa 2.6.17, + * and we have small-page NAND device. + * + * These functions are invoked via function pointers in yaffs_nand.c. + * This replaces functionality provided by functions in yaffs_mtdif.c + * and the yaffs_tags compatability functions in yaffs_tagscompat.c that are + * called in yaffs_mtdif.c when the function pointers are NULL. + * We assume the MTD layer is performing ECC (use_nand_ecc is true). + */ + +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_guts.h" +#include "yaffs_packedtags1.h" +#include "yaffs_tagscompat.h" /* for yaffs_calc_tags_ecc */ +#include "yaffs_linux.h" + +#include "linux/kernel.h" +#include "linux/version.h" +#include "linux/types.h" +#include "linux/mtd/mtd.h" + +#ifndef CONFIG_YAFFS_9BYTE_TAGS +# define YTAG1_SIZE 8 +#else +# define YTAG1_SIZE 9 +#endif + +/* Write a chunk (page) of data to NAND. + * + * Caller always provides ExtendedTags data which are converted to a more + * compact (packed) form for storage in NAND. A mini-ECC runs over the + * contents of the tags meta-data; used to valid the tags when read. + * + * - Pack ExtendedTags to packed_tags1 form + * - Compute mini-ECC for packed_tags1 + * - Write data and packed tags to NAND. + * + * Note: Due to the use of the packed_tags1 meta-data which does not include + * a full sequence number (as found in the larger packed_tags2 form) it is + * necessary for Yaffs to re-write a chunk/page (just once) to mark it as + * discarded and dirty. This is not ideal: newer NAND parts are supposed + * to be written just once. When Yaffs performs this operation, this + * function is called with a NULL data pointer -- calling MTD write_oob + * without data is valid usage (2.6.17). + * + * Any underlying MTD error results in YAFFS_FAIL. + * Returns YAFFS_OK or YAFFS_FAIL. 
+ */ +int nandmtd1_write_chunk_tags(struct yaffs_dev *dev, + int nand_chunk, const u8 * data, + const struct yaffs_ext_tags *etags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int chunk_bytes = dev->data_bytes_per_chunk; + loff_t addr = ((loff_t) nand_chunk) * chunk_bytes; + struct mtd_oob_ops ops; + struct yaffs_packed_tags1 pt1; + int retval; + + /* we assume that packed_tags1 and struct yaffs_tags are compatible */ + compile_time_assertion(sizeof(struct yaffs_packed_tags1) == 12); + compile_time_assertion(sizeof(struct yaffs_tags) == 8); + + yaffs_pack_tags1(&pt1, etags); + yaffs_calc_tags_ecc((struct yaffs_tags *)&pt1); + + /* When deleting a chunk, the upper layer provides only skeletal + * etags, one with is_deleted set. However, we need to update the + * tags, not erase them completely. So we use the NAND write property + * that only zeroed-bits stick and set tag bytes to all-ones and + * zero just the (not) deleted bit. + */ +#ifndef CONFIG_YAFFS_9BYTE_TAGS + if (etags->is_deleted) { + memset(&pt1, 0xff, 8); + /* clear delete status bit to indicate deleted */ + pt1.deleted = 0; + } +#else + ((u8 *) & pt1)[8] = 0xff; + if (etags->is_deleted) { + memset(&pt1, 0xff, 8); + /* zero page_status byte to indicate deleted */ + ((u8 *) & pt1)[8] = 0; + } +#endif + + memset(&ops, 0, sizeof(ops)); + ops.mode = MTD_OOB_AUTO; + ops.len = (data) ? chunk_bytes : 0; + ops.ooblen = YTAG1_SIZE; + ops.datbuf = (u8 *) data; + ops.oobbuf = (u8 *) & pt1; + + retval = mtd->write_oob(mtd, addr, &ops); + if (retval) { + yaffs_trace(YAFFS_TRACE_MTD, + "write_oob failed, chunk %d, mtd error %d", + nand_chunk, retval); + } + return retval ? YAFFS_FAIL : YAFFS_OK; +} + +/* Return with empty ExtendedTags but add ecc_result. + */ +static int rettags(struct yaffs_ext_tags *etags, int ecc_result, int retval) +{ + if (etags) { + memset(etags, 0, sizeof(*etags)); + etags->ecc_result = ecc_result; + } + return retval; +} + +/* Read a chunk (page) from NAND. + * + * Caller expects ExtendedTags data to be usable even on error; that is, + * all members except ecc_result and block_bad are zeroed. + * + * - Check ECC results for data (if applicable) + * - Check for blank/erased block (return empty ExtendedTags if blank) + * - Check the packed_tags1 mini-ECC (correct if necessary/possible) + * - Convert packed_tags1 to ExtendedTags + * - Update ecc_result and block_bad members to refect state. + * + * Returns YAFFS_OK or YAFFS_FAIL. + */ +int nandmtd1_read_chunk_tags(struct yaffs_dev *dev, + int nand_chunk, u8 * data, + struct yaffs_ext_tags *etags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int chunk_bytes = dev->data_bytes_per_chunk; + loff_t addr = ((loff_t) nand_chunk) * chunk_bytes; + int eccres = YAFFS_ECC_RESULT_NO_ERROR; + struct mtd_oob_ops ops; + struct yaffs_packed_tags1 pt1; + int retval; + int deleted; + + memset(&ops, 0, sizeof(ops)); + ops.mode = MTD_OOB_AUTO; + ops.len = (data) ? chunk_bytes : 0; + ops.ooblen = YTAG1_SIZE; + ops.datbuf = data; + ops.oobbuf = (u8 *) & pt1; + + /* Read page and oob using MTD. + * Check status and determine ECC result. + */ + retval = mtd->read_oob(mtd, addr, &ops); + if (retval) { + yaffs_trace(YAFFS_TRACE_MTD, + "read_oob failed, chunk %d, mtd error %d", + nand_chunk, retval); + } + + switch (retval) { + case 0: + /* no error */ + break; + + case -EUCLEAN: + /* MTD's ECC fixed the data */ + eccres = YAFFS_ECC_RESULT_FIXED; + dev->n_ecc_fixed++; + break; + + case -EBADMSG: + /* MTD's ECC could not fix the data */ + dev->n_ecc_unfixed++; + /* fall into... 
*/ + default: + rettags(etags, YAFFS_ECC_RESULT_UNFIXED, 0); + etags->block_bad = (mtd->block_isbad) (mtd, addr); + return YAFFS_FAIL; + } + + /* Check for a blank/erased chunk. + */ + if (yaffs_check_ff((u8 *) & pt1, 8)) { + /* when blank, upper layers want ecc_result to be <= NO_ERROR */ + return rettags(etags, YAFFS_ECC_RESULT_NO_ERROR, YAFFS_OK); + } +#ifndef CONFIG_YAFFS_9BYTE_TAGS + /* Read deleted status (bit) then return it to it's non-deleted + * state before performing tags mini-ECC check. pt1.deleted is + * inverted. + */ + deleted = !pt1.deleted; + pt1.deleted = 1; +#else + deleted = (yaffs_count_bits(((u8 *) & pt1)[8]) < 7); +#endif + + /* Check the packed tags mini-ECC and correct if necessary/possible. + */ + retval = yaffs_check_tags_ecc((struct yaffs_tags *)&pt1); + switch (retval) { + case 0: + /* no tags error, use MTD result */ + break; + case 1: + /* recovered tags-ECC error */ + dev->n_tags_ecc_fixed++; + if (eccres == YAFFS_ECC_RESULT_NO_ERROR) + eccres = YAFFS_ECC_RESULT_FIXED; + break; + default: + /* unrecovered tags-ECC error */ + dev->n_tags_ecc_unfixed++; + return rettags(etags, YAFFS_ECC_RESULT_UNFIXED, YAFFS_FAIL); + } + + /* Unpack the tags to extended form and set ECC result. + * [set should_be_ff just to keep yaffs_unpack_tags1 happy] + */ + pt1.should_be_ff = 0xFFFFFFFF; + yaffs_unpack_tags1(etags, &pt1); + etags->ecc_result = eccres; + + /* Set deleted state */ + etags->is_deleted = deleted; + return YAFFS_OK; +} + +/* Mark a block bad. + * + * This is a persistant state. + * Use of this function should be rare. + * + * Returns YAFFS_OK or YAFFS_FAIL. + */ +int nandmtd1_mark_block_bad(struct yaffs_dev *dev, int block_no) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int blocksize = dev->param.chunks_per_block * dev->data_bytes_per_chunk; + int retval; + + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "marking block %d bad", block_no); + + retval = mtd->block_markbad(mtd, (loff_t) blocksize * block_no); + return (retval) ? YAFFS_FAIL : YAFFS_OK; +} + +/* Check any MTD prerequists. + * + * Returns YAFFS_OK or YAFFS_FAIL. + */ +static int nandmtd1_test_prerequists(struct mtd_info *mtd) +{ + /* 2.6.18 has mtd->ecclayout->oobavail */ + /* 2.6.21 has mtd->ecclayout->oobavail and mtd->oobavail */ + int oobavail = mtd->ecclayout->oobavail; + + if (oobavail < YTAG1_SIZE) { + yaffs_trace(YAFFS_TRACE_ERROR, + "mtd device has only %d bytes for tags, need %d", + oobavail, YTAG1_SIZE); + return YAFFS_FAIL; + } + return YAFFS_OK; +} + +/* Query for the current state of a specific block. + * + * Examine the tags of the first chunk of the block and return the state: + * - YAFFS_BLOCK_STATE_DEAD, the block is marked bad + * - YAFFS_BLOCK_STATE_NEEDS_SCANNING, the block is in use + * - YAFFS_BLOCK_STATE_EMPTY, the block is clean + * + * Always returns YAFFS_OK. + */ +int nandmtd1_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state_ptr, u32 * seq_ptr) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int chunk_num = block_no * dev->param.chunks_per_block; + loff_t addr = (loff_t) chunk_num * dev->data_bytes_per_chunk; + struct yaffs_ext_tags etags; + int state = YAFFS_BLOCK_STATE_DEAD; + int seqnum = 0; + int retval; + + /* We don't yet have a good place to test for MTD config prerequists. + * Do it here as we are called during the initial scan. 
+ */ + if (nandmtd1_test_prerequists(mtd) != YAFFS_OK) + return YAFFS_FAIL; + + retval = nandmtd1_read_chunk_tags(dev, chunk_num, NULL, &etags); + etags.block_bad = (mtd->block_isbad) (mtd, addr); + if (etags.block_bad) { + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "block %d is marked bad", block_no); + state = YAFFS_BLOCK_STATE_DEAD; + } else if (etags.ecc_result != YAFFS_ECC_RESULT_NO_ERROR) { + /* bad tags, need to look more closely */ + state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + } else if (etags.chunk_used) { + state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + seqnum = etags.seq_number; + } else { + state = YAFFS_BLOCK_STATE_EMPTY; + } + + *state_ptr = state; + *seq_ptr = seqnum; + + /* query always succeeds */ + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_mtdif1.h b/fs/yaffs2/yaffs_mtdif1.h new file mode 100644 index 00000000..07ce4524 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif1.h @@ -0,0 +1,29 @@ +/* + * YAFFS: Yet another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_MTDIF1_H__ +#define __YAFFS_MTDIF1_H__ + +int nandmtd1_write_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags); + +int nandmtd1_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags); + +int nandmtd1_mark_block_bad(struct yaffs_dev *dev, int block_no); + +int nandmtd1_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state, u32 * seq_number); + +#endif diff --git a/fs/yaffs2/yaffs_mtdif2.c b/fs/yaffs2/yaffs_mtdif2.c new file mode 100644 index 00000000..d1643df2 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif2.c @@ -0,0 +1,225 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* mtd interface for YAFFS2 */ + +#include "yportenv.h" +#include "yaffs_trace.h" + +#include "yaffs_mtdif2.h" + +#include "linux/mtd/mtd.h" +#include "linux/types.h" +#include "linux/time.h" + +#include "yaffs_packedtags2.h" + +#include "yaffs_linux.h" + +/* NB For use with inband tags.... + * We assume that the data buffer is of size total_bytes_per_chunk so that we can also + * use it to load the tags. + */ +int nandmtd2_write_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + struct mtd_oob_ops ops; + int retval = 0; + + loff_t addr; + + struct yaffs_packed_tags2 pt; + + int packed_tags_size = + dev->param.no_tags_ecc ? sizeof(pt.t) : sizeof(pt); + void *packed_tags_ptr = + dev->param.no_tags_ecc ? 
(void *)&pt.t : (void *)&pt; + + yaffs_trace(YAFFS_TRACE_MTD, + "nandmtd2_write_chunk_tags chunk %d data %p tags %p", + nand_chunk, data, tags); + + addr = ((loff_t) nand_chunk) * dev->param.total_bytes_per_chunk; + + /* For yaffs2 writing there must be both data and tags. + * If we're using inband tags, then the tags are stuffed into + * the end of the data buffer. + */ + if (!data || !tags) + BUG(); + else if (dev->param.inband_tags) { + struct yaffs_packed_tags2_tags_only *pt2tp; + pt2tp = + (struct yaffs_packed_tags2_tags_only *)(data + + dev-> + data_bytes_per_chunk); + yaffs_pack_tags2_tags_only(pt2tp, tags); + } else { + yaffs_pack_tags2(&pt, tags, !dev->param.no_tags_ecc); + } + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = (dev->param.inband_tags) ? 0 : packed_tags_size; + ops.len = dev->param.total_bytes_per_chunk; + ops.ooboffs = 0; + ops.datbuf = (u8 *) data; + ops.oobbuf = (dev->param.inband_tags) ? NULL : packed_tags_ptr; + retval = mtd->write_oob(mtd, addr, &ops); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + +int nandmtd2_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + struct mtd_oob_ops ops; + + size_t dummy; + int retval = 0; + int local_data = 0; + + loff_t addr = ((loff_t) nand_chunk) * dev->param.total_bytes_per_chunk; + + struct yaffs_packed_tags2 pt; + + int packed_tags_size = + dev->param.no_tags_ecc ? sizeof(pt.t) : sizeof(pt); + void *packed_tags_ptr = + dev->param.no_tags_ecc ? (void *)&pt.t : (void *)&pt; + + yaffs_trace(YAFFS_TRACE_MTD, + "nandmtd2_read_chunk_tags chunk %d data %p tags %p", + nand_chunk, data, tags); + + if (dev->param.inband_tags) { + + if (!data) { + local_data = 1; + data = yaffs_get_temp_buffer(dev, __LINE__); + } + + } + + if (dev->param.inband_tags || (data && !tags)) + retval = mtd->read(mtd, addr, dev->param.total_bytes_per_chunk, + &dummy, data); + else if (tags) { + ops.mode = MTD_OOB_AUTO; + ops.ooblen = packed_tags_size; + ops.len = data ? 
dev->data_bytes_per_chunk : packed_tags_size; + ops.ooboffs = 0; + ops.datbuf = data; + ops.oobbuf = yaffs_dev_to_lc(dev)->spare_buffer; + retval = mtd->read_oob(mtd, addr, &ops); + } + + if (dev->param.inband_tags) { + if (tags) { + struct yaffs_packed_tags2_tags_only *pt2tp; + pt2tp = + (struct yaffs_packed_tags2_tags_only *)&data[dev-> + data_bytes_per_chunk]; + yaffs_unpack_tags2_tags_only(tags, pt2tp); + } + } else { + if (tags) { + memcpy(packed_tags_ptr, + yaffs_dev_to_lc(dev)->spare_buffer, + packed_tags_size); + yaffs_unpack_tags2(tags, &pt, !dev->param.no_tags_ecc); + } + } + + if (local_data) + yaffs_release_temp_buffer(dev, data, __LINE__); + + if (tags && retval == -EBADMSG + && tags->ecc_result == YAFFS_ECC_RESULT_NO_ERROR) { + tags->ecc_result = YAFFS_ECC_RESULT_UNFIXED; + dev->n_ecc_unfixed++; + } + if (tags && retval == -EUCLEAN + && tags->ecc_result == YAFFS_ECC_RESULT_NO_ERROR) { + tags->ecc_result = YAFFS_ECC_RESULT_FIXED; + dev->n_ecc_fixed++; + } + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + +int nandmtd2_mark_block_bad(struct yaffs_dev *dev, int block_no) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int retval; + yaffs_trace(YAFFS_TRACE_MTD, + "nandmtd2_mark_block_bad %d", block_no); + + retval = + mtd->block_markbad(mtd, + block_no * dev->param.chunks_per_block * + dev->param.total_bytes_per_chunk); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; + +} + +int nandmtd2_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state, u32 * seq_number) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int retval; + + yaffs_trace(YAFFS_TRACE_MTD, "nandmtd2_query_block %d", block_no); + retval = + mtd->block_isbad(mtd, + block_no * dev->param.chunks_per_block * + dev->param.total_bytes_per_chunk); + + if (retval) { + yaffs_trace(YAFFS_TRACE_MTD, "block is bad"); + + *state = YAFFS_BLOCK_STATE_DEAD; + *seq_number = 0; + } else { + struct yaffs_ext_tags t; + nandmtd2_read_chunk_tags(dev, block_no * + dev->param.chunks_per_block, NULL, &t); + + if (t.chunk_used) { + *seq_number = t.seq_number; + *state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + } else { + *seq_number = 0; + *state = YAFFS_BLOCK_STATE_EMPTY; + } + } + yaffs_trace(YAFFS_TRACE_MTD, + "block is bad seq %d state %d", *seq_number, *state); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + diff --git a/fs/yaffs2/yaffs_mtdif2.h b/fs/yaffs2/yaffs_mtdif2.h new file mode 100644 index 00000000..d8211261 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif2.h @@ -0,0 +1,29 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. 
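When dev->param.inband_tags is set, nandmtd2_write_chunk_tags() and nandmtd2_read_chunk_tags() above keep the packed tags at the tail of the data buffer rather than in the OOB area, which is why those paths require a buffer of total_bytes_per_chunk bytes. The helper below (its name is invented) simply restates the pt2tp pointer arithmetic from the code above:

#include "yaffs_guts.h"
#include "yaffs_packedtags2.h"

/* Illustrative inband-tags layout: data_bytes_per_chunk bytes of file
 * data followed by a yaffs_packed_tags2_tags_only record, all within one
 * total_bytes_per_chunk sized buffer. */
static void place_inband_tags_sketch(struct yaffs_dev *dev, u8 *chunk_buf,
				     const struct yaffs_ext_tags *tags)
{
	struct yaffs_packed_tags2_tags_only *pt2tp =
	    (struct yaffs_packed_tags2_tags_only *)(chunk_buf +
						    dev->data_bytes_per_chunk);

	yaffs_pack_tags2_tags_only(pt2tp, tags);
}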
+ */ + +#ifndef __YAFFS_MTDIF2_H__ +#define __YAFFS_MTDIF2_H__ + +#include "yaffs_guts.h" +int nandmtd2_write_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags); +int nandmtd2_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags); +int nandmtd2_mark_block_bad(struct yaffs_dev *dev, int block_no); +int nandmtd2_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state, u32 * seq_number); + +#endif diff --git a/fs/yaffs2/yaffs_nameval.c b/fs/yaffs2/yaffs_nameval.c new file mode 100644 index 00000000..daa36f98 --- /dev/null +++ b/fs/yaffs2/yaffs_nameval.c @@ -0,0 +1,201 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This simple implementation of a name-value store assumes a small number of values and fits + * into a small finite buffer. + * + * Each attribute is stored as a record: + * sizeof(int) bytes record size. + * strnlen+1 bytes name null terminated. + * nbytes value. + * ---------- + * total size stored in record size + * + * This code has not been tested with unicode yet. + */ + +#include "yaffs_nameval.h" + +#include "yportenv.h" + +static int nval_find(const char *xb, int xb_size, const YCHAR * name, + int *exist_size) +{ + int pos = 0; + int size; + + memcpy(&size, xb, sizeof(int)); + while (size > 0 && (size < xb_size) && (pos + size < xb_size)) { + if (strncmp + ((YCHAR *) (xb + pos + sizeof(int)), name, size) == 0) { + if (exist_size) + *exist_size = size; + return pos; + } + pos += size; + if (pos < xb_size - sizeof(int)) + memcpy(&size, xb + pos, sizeof(int)); + else + size = 0; + } + if (exist_size) + *exist_size = 0; + return -1; +} + +static int nval_used(const char *xb, int xb_size) +{ + int pos = 0; + int size; + + memcpy(&size, xb + pos, sizeof(int)); + while (size > 0 && (size < xb_size) && (pos + size < xb_size)) { + pos += size; + if (pos < xb_size - sizeof(int)) + memcpy(&size, xb + pos, sizeof(int)); + else + size = 0; + } + return pos; +} + +int nval_del(char *xb, int xb_size, const YCHAR * name) +{ + int pos = nval_find(xb, xb_size, name, NULL); + int size; + + if (pos >= 0 && pos < xb_size) { + /* Find size, shift rest over this record, then zero out the rest of buffer */ + memcpy(&size, xb + pos, sizeof(int)); + memcpy(xb + pos, xb + pos + size, xb_size - (pos + size)); + memset(xb + (xb_size - size), 0, size); + return 0; + } else { + return -ENODATA; + } +} + +int nval_set(char *xb, int xb_size, const YCHAR * name, const char *buf, + int bsize, int flags) +{ + int pos; + int namelen = strnlen(name, xb_size); + int reclen; + int size_exist = 0; + int space; + int start; + + pos = nval_find(xb, xb_size, name, &size_exist); + + if (flags & XATTR_CREATE && pos >= 0) + return -EEXIST; + if (flags & XATTR_REPLACE && pos < 0) + return -ENODATA; + + start = nval_used(xb, xb_size); + space = xb_size - start + size_exist; + + reclen = (sizeof(int) + namelen + 1 + bsize); + + if (reclen > space) + return -ENOSPC; + + if (pos >= 0) { + nval_del(xb, xb_size, name); + start = nval_used(xb, xb_size); + } + + pos = start; + + memcpy(xb + pos, &reclen, sizeof(int)); + pos += 
sizeof(int); + strncpy((YCHAR *) (xb + pos), name, reclen); + pos += (namelen + 1); + memcpy(xb + pos, buf, bsize); + return 0; +} + +int nval_get(const char *xb, int xb_size, const YCHAR * name, char *buf, + int bsize) +{ + int pos = nval_find(xb, xb_size, name, NULL); + int size; + + if (pos >= 0 && pos < xb_size) { + + memcpy(&size, xb + pos, sizeof(int)); + pos += sizeof(int); /* advance past record length */ + size -= sizeof(int); + + /* Advance over name string */ + while (xb[pos] && size > 0 && pos < xb_size) { + pos++; + size--; + } + /*Advance over NUL */ + pos++; + size--; + + if (size <= bsize) { + memcpy(buf, xb + pos, size); + return size; + } + + } + if (pos >= 0) + return -ERANGE; + else + return -ENODATA; +} + +int nval_list(const char *xb, int xb_size, char *buf, int bsize) +{ + int pos = 0; + int size; + int name_len; + int ncopied = 0; + int filled = 0; + + memcpy(&size, xb + pos, sizeof(int)); + while (size > sizeof(int) && size <= xb_size && (pos + size) < xb_size + && !filled) { + pos += sizeof(int); + size -= sizeof(int); + name_len = strnlen((YCHAR *) (xb + pos), size); + if (ncopied + name_len + 1 < bsize) { + memcpy(buf, xb + pos, name_len * sizeof(YCHAR)); + buf += name_len; + *buf = '\0'; + buf++; + if (sizeof(YCHAR) > 1) { + *buf = '\0'; + buf++; + } + ncopied += (name_len + 1); + } else { + filled = 1; + } + pos += size; + if (pos < xb_size - sizeof(int)) + memcpy(&size, xb + pos, sizeof(int)); + else + size = 0; + } + return ncopied; +} + +int nval_hasvalues(const char *xb, int xb_size) +{ + return nval_used(xb, xb_size) > 0; +} diff --git a/fs/yaffs2/yaffs_nameval.h b/fs/yaffs2/yaffs_nameval.h new file mode 100644 index 00000000..2bb02b62 --- /dev/null +++ b/fs/yaffs2/yaffs_nameval.h @@ -0,0 +1,28 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __NAMEVAL_H__ +#define __NAMEVAL_H__ + +#include "yportenv.h" + +int nval_del(char *xb, int xb_size, const YCHAR * name); +int nval_set(char *xb, int xb_size, const YCHAR * name, const char *buf, + int bsize, int flags); +int nval_get(const char *xb, int xb_size, const YCHAR * name, char *buf, + int bsize); +int nval_list(const char *xb, int xb_size, char *buf, int bsize); +int nval_hasvalues(const char *xb, int xb_size); +#endif diff --git a/fs/yaffs2/yaffs_nand.c b/fs/yaffs2/yaffs_nand.c new file mode 100644 index 00000000..e816cabf --- /dev/null +++ b/fs/yaffs2/yaffs_nand.c @@ -0,0 +1,127 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
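Given the record format described at the top of yaffs_nameval.c above (a leading int record size, the NUL-terminated name, then the value bytes), each attribute costs sizeof(int) + strlen(name) + 1 + value_size bytes. A worked example with hypothetical content: storing the name "user.note" (9 characters) with a 4-byte value on a 32-bit little-endian build takes 4 + 10 + 4 = 18 bytes and lays out as:

    offset  0: 12 00 00 00                                record size (18 == 0x12)
    offset  4: 'u' 's' 'e' 'r' '.' 'n' 'o' 't' 'e' 00     name plus terminating NUL
    offset 14: four value bytes
    offset 18: size field of the next record, or zero when the rest of the buffer is unused

nval_used() and nval_find() walk the buffer by repeatedly reading that leading size field until it is zero or runs off the end.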
+ */ + +#include "yaffs_nand.h" +#include "yaffs_tagscompat.h" +#include "yaffs_tagsvalidity.h" + +#include "yaffs_getblockinfo.h" + +int yaffs_rd_chunk_tags_nand(struct yaffs_dev *dev, int nand_chunk, + u8 * buffer, struct yaffs_ext_tags *tags) +{ + int result; + struct yaffs_ext_tags local_tags; + + int realigned_chunk = nand_chunk - dev->chunk_offset; + + dev->n_page_reads++; + + /* If there are no tags provided, use local tags to get prioritised gc working */ + if (!tags) + tags = &local_tags; + + if (dev->param.read_chunk_tags_fn) + result = + dev->param.read_chunk_tags_fn(dev, realigned_chunk, buffer, + tags); + else + result = yaffs_tags_compat_rd(dev, + realigned_chunk, buffer, tags); + if (tags && tags->ecc_result > YAFFS_ECC_RESULT_NO_ERROR) { + + struct yaffs_block_info *bi; + bi = yaffs_get_block_info(dev, + nand_chunk / + dev->param.chunks_per_block); + yaffs_handle_chunk_error(dev, bi); + } + + return result; +} + +int yaffs_wr_chunk_tags_nand(struct yaffs_dev *dev, + int nand_chunk, + const u8 * buffer, struct yaffs_ext_tags *tags) +{ + + dev->n_page_writes++; + + nand_chunk -= dev->chunk_offset; + + if (tags) { + tags->seq_number = dev->seq_number; + tags->chunk_used = 1; + if (!yaffs_validate_tags(tags)) { + yaffs_trace(YAFFS_TRACE_ERROR, "Writing uninitialised tags"); + YBUG(); + } + yaffs_trace(YAFFS_TRACE_WRITE, + "Writing chunk %d tags %d %d", + nand_chunk, tags->obj_id, tags->chunk_id); + } else { + yaffs_trace(YAFFS_TRACE_ERROR, "Writing with no tags"); + YBUG(); + } + + if (dev->param.write_chunk_tags_fn) + return dev->param.write_chunk_tags_fn(dev, nand_chunk, buffer, + tags); + else + return yaffs_tags_compat_wr(dev, nand_chunk, buffer, tags); +} + +int yaffs_mark_bad(struct yaffs_dev *dev, int block_no) +{ + block_no -= dev->block_offset; + + if (dev->param.bad_block_fn) + return dev->param.bad_block_fn(dev, block_no); + else + return yaffs_tags_compat_mark_bad(dev, block_no); +} + +int yaffs_query_init_block_state(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + u32 * seq_number) +{ + block_no -= dev->block_offset; + + if (dev->param.query_block_fn) + return dev->param.query_block_fn(dev, block_no, state, + seq_number); + else + return yaffs_tags_compat_query_block(dev, block_no, + state, seq_number); +} + +int yaffs_erase_block(struct yaffs_dev *dev, int flash_block) +{ + int result; + + flash_block -= dev->block_offset; + + dev->n_erasures++; + + result = dev->param.erase_fn(dev, flash_block); + + return result; +} + +int yaffs_init_nand(struct yaffs_dev *dev) +{ + if (dev->param.initialise_flash_fn) + return dev->param.initialise_flash_fn(dev); + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_nand.h b/fs/yaffs2/yaffs_nand.h new file mode 100644 index 00000000..543f1987 --- /dev/null +++ b/fs/yaffs2/yaffs_nand.h @@ -0,0 +1,38 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. 
+ */ + +#ifndef __YAFFS_NAND_H__ +#define __YAFFS_NAND_H__ +#include "yaffs_guts.h" + +int yaffs_rd_chunk_tags_nand(struct yaffs_dev *dev, int nand_chunk, + u8 * buffer, struct yaffs_ext_tags *tags); + +int yaffs_wr_chunk_tags_nand(struct yaffs_dev *dev, + int nand_chunk, + const u8 * buffer, struct yaffs_ext_tags *tags); + +int yaffs_mark_bad(struct yaffs_dev *dev, int block_no); + +int yaffs_query_init_block_state(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + unsigned *seq_number); + +int yaffs_erase_block(struct yaffs_dev *dev, int flash_block); + +int yaffs_init_nand(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_packedtags1.c b/fs/yaffs2/yaffs_packedtags1.c new file mode 100644 index 00000000..a77f0954 --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags1.c @@ -0,0 +1,53 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_packedtags1.h" +#include "yportenv.h" + +void yaffs_pack_tags1(struct yaffs_packed_tags1 *pt, + const struct yaffs_ext_tags *t) +{ + pt->chunk_id = t->chunk_id; + pt->serial_number = t->serial_number; + pt->n_bytes = t->n_bytes; + pt->obj_id = t->obj_id; + pt->ecc = 0; + pt->deleted = (t->is_deleted) ? 0 : 1; + pt->unused_stuff = 0; + pt->should_be_ff = 0xFFFFFFFF; + +} + +void yaffs_unpack_tags1(struct yaffs_ext_tags *t, + const struct yaffs_packed_tags1 *pt) +{ + static const u8 all_ff[] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff + }; + + if (memcmp(all_ff, pt, sizeof(struct yaffs_packed_tags1))) { + t->block_bad = 0; + if (pt->should_be_ff != 0xFFFFFFFF) + t->block_bad = 1; + t->chunk_used = 1; + t->obj_id = pt->obj_id; + t->chunk_id = pt->chunk_id; + t->n_bytes = pt->n_bytes; + t->ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + t->is_deleted = (pt->deleted) ? 0 : 1; + t->serial_number = pt->serial_number; + } else { + memset(t, 0, sizeof(struct yaffs_ext_tags)); + } +} diff --git a/fs/yaffs2/yaffs_packedtags1.h b/fs/yaffs2/yaffs_packedtags1.h new file mode 100644 index 00000000..d6861ff5 --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags1.h @@ -0,0 +1,39 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* This is used to pack YAFFS1 tags, not YAFFS2 tags. 
*/ + +#ifndef __YAFFS_PACKEDTAGS1_H__ +#define __YAFFS_PACKEDTAGS1_H__ + +#include "yaffs_guts.h" + +struct yaffs_packed_tags1 { + unsigned chunk_id:20; + unsigned serial_number:2; + unsigned n_bytes:10; + unsigned obj_id:18; + unsigned ecc:12; + unsigned deleted:1; + unsigned unused_stuff:1; + unsigned should_be_ff; + +}; + +void yaffs_pack_tags1(struct yaffs_packed_tags1 *pt, + const struct yaffs_ext_tags *t); +void yaffs_unpack_tags1(struct yaffs_ext_tags *t, + const struct yaffs_packed_tags1 *pt); +#endif diff --git a/fs/yaffs2/yaffs_packedtags2.c b/fs/yaffs2/yaffs_packedtags2.c new file mode 100644 index 00000000..8e7fea3d --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags2.c @@ -0,0 +1,196 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_packedtags2.h" +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_tagsvalidity.h" + +/* This code packs a set of extended tags into a binary structure for + * NAND storage. + */ + +/* Some of the information is "extra" stuff which can be packed in to + * speed up scanning. + * This is defined by having the EXTRA_HEADER_INFO_FLAG set. + */ + +/* Extra flags applied to chunk_id */ + +#define EXTRA_HEADER_INFO_FLAG 0x80000000 +#define EXTRA_SHRINK_FLAG 0x40000000 +#define EXTRA_SHADOWS_FLAG 0x20000000 +#define EXTRA_SPARE_FLAGS 0x10000000 + +#define ALL_EXTRA_FLAGS 0xF0000000 + +/* Also, the top 4 bits of the object Id are set to the object type.
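+ * + * For example (illustrative values, not taken from the patch): a file header + * chunk for object 0x101 whose parent is object 0x002 would be packed with + * chunk_id = EXTRA_HEADER_INFO_FLAG | 0x002 and + * obj_id = 0x101 | (YAFFS_OBJECT_TYPE_FILE << EXTRA_OBJECT_TYPE_SHIFT), + * while n_bytes carries the file length; see yaffs_pack_tags2_tags_only().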
*/ +#define EXTRA_OBJECT_TYPE_SHIFT (28) +#define EXTRA_OBJECT_TYPE_MASK ((0x0F) << EXTRA_OBJECT_TYPE_SHIFT) + +static void yaffs_dump_packed_tags2_tags_only(const struct + yaffs_packed_tags2_tags_only *ptt) +{ + yaffs_trace(YAFFS_TRACE_MTD, + "packed tags obj %d chunk %d byte %d seq %d", + ptt->obj_id, ptt->chunk_id, ptt->n_bytes, ptt->seq_number); +} + +static void yaffs_dump_packed_tags2(const struct yaffs_packed_tags2 *pt) +{ + yaffs_dump_packed_tags2_tags_only(&pt->t); +} + +static void yaffs_dump_tags2(const struct yaffs_ext_tags *t) +{ + yaffs_trace(YAFFS_TRACE_MTD, + "ext.tags eccres %d blkbad %d chused %d obj %d chunk%d byte %d del %d ser %d seq %d", + t->ecc_result, t->block_bad, t->chunk_used, t->obj_id, + t->chunk_id, t->n_bytes, t->is_deleted, t->serial_number, + t->seq_number); + +} + +void yaffs_pack_tags2_tags_only(struct yaffs_packed_tags2_tags_only *ptt, + const struct yaffs_ext_tags *t) +{ + ptt->chunk_id = t->chunk_id; + ptt->seq_number = t->seq_number; + ptt->n_bytes = t->n_bytes; + ptt->obj_id = t->obj_id; + + if (t->chunk_id == 0 && t->extra_available) { + /* Store the extra header info instead */ + /* We save the parent object in the chunk_id */ + ptt->chunk_id = EXTRA_HEADER_INFO_FLAG | t->extra_parent_id; + if (t->extra_is_shrink) + ptt->chunk_id |= EXTRA_SHRINK_FLAG; + if (t->extra_shadows) + ptt->chunk_id |= EXTRA_SHADOWS_FLAG; + + ptt->obj_id &= ~EXTRA_OBJECT_TYPE_MASK; + ptt->obj_id |= (t->extra_obj_type << EXTRA_OBJECT_TYPE_SHIFT); + + if (t->extra_obj_type == YAFFS_OBJECT_TYPE_HARDLINK) + ptt->n_bytes = t->extra_equiv_id; + else if (t->extra_obj_type == YAFFS_OBJECT_TYPE_FILE) + ptt->n_bytes = t->extra_length; + else + ptt->n_bytes = 0; + } + + yaffs_dump_packed_tags2_tags_only(ptt); + yaffs_dump_tags2(t); +} + +void yaffs_pack_tags2(struct yaffs_packed_tags2 *pt, + const struct yaffs_ext_tags *t, int tags_ecc) +{ + yaffs_pack_tags2_tags_only(&pt->t, t); + + if (tags_ecc) + yaffs_ecc_calc_other((unsigned char *)&pt->t, + sizeof(struct + yaffs_packed_tags2_tags_only), + &pt->ecc); +} + +void yaffs_unpack_tags2_tags_only(struct yaffs_ext_tags *t, + struct yaffs_packed_tags2_tags_only *ptt) +{ + + memset(t, 0, sizeof(struct yaffs_ext_tags)); + + yaffs_init_tags(t); + + if (ptt->seq_number != 0xFFFFFFFF) { + t->block_bad = 0; + t->chunk_used = 1; + t->obj_id = ptt->obj_id; + t->chunk_id = ptt->chunk_id; + t->n_bytes = ptt->n_bytes; + t->is_deleted = 0; + t->serial_number = 0; + t->seq_number = ptt->seq_number; + + /* Do extra header info stuff */ + + if (ptt->chunk_id & EXTRA_HEADER_INFO_FLAG) { + t->chunk_id = 0; + t->n_bytes = 0; + + t->extra_available = 1; + t->extra_parent_id = + ptt->chunk_id & (~(ALL_EXTRA_FLAGS)); + t->extra_is_shrink = + (ptt->chunk_id & EXTRA_SHRINK_FLAG) ? 1 : 0; + t->extra_shadows = + (ptt->chunk_id & EXTRA_SHADOWS_FLAG) ? 
1 : 0; + t->extra_obj_type = + ptt->obj_id >> EXTRA_OBJECT_TYPE_SHIFT; + t->obj_id &= ~EXTRA_OBJECT_TYPE_MASK; + + if (t->extra_obj_type == YAFFS_OBJECT_TYPE_HARDLINK) + t->extra_equiv_id = ptt->n_bytes; + else + t->extra_length = ptt->n_bytes; + } + } + + yaffs_dump_packed_tags2_tags_only(ptt); + yaffs_dump_tags2(t); + +} + +void yaffs_unpack_tags2(struct yaffs_ext_tags *t, struct yaffs_packed_tags2 *pt, + int tags_ecc) +{ + + enum yaffs_ecc_result ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + + if (pt->t.seq_number != 0xFFFFFFFF && tags_ecc) { + /* Chunk is in use and we need to do ECC */ + + struct yaffs_ecc_other ecc; + int result; + yaffs_ecc_calc_other((unsigned char *)&pt->t, + sizeof(struct + yaffs_packed_tags2_tags_only), + &ecc); + result = + yaffs_ecc_correct_other((unsigned char *)&pt->t, + sizeof(struct + yaffs_packed_tags2_tags_only), + &pt->ecc, &ecc); + switch (result) { + case 0: + ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + break; + case 1: + ecc_result = YAFFS_ECC_RESULT_FIXED; + break; + case -1: + ecc_result = YAFFS_ECC_RESULT_UNFIXED; + break; + default: + ecc_result = YAFFS_ECC_RESULT_UNKNOWN; + } + } + + yaffs_unpack_tags2_tags_only(t, &pt->t); + + t->ecc_result = ecc_result; + + yaffs_dump_packed_tags2(pt); + yaffs_dump_tags2(t); +} diff --git a/fs/yaffs2/yaffs_packedtags2.h b/fs/yaffs2/yaffs_packedtags2.h new file mode 100644 index 00000000..f3296697 --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags2.h @@ -0,0 +1,47 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* This is used to pack YAFFS2 tags, not YAFFS1tags. */ + +#ifndef __YAFFS_PACKEDTAGS2_H__ +#define __YAFFS_PACKEDTAGS2_H__ + +#include "yaffs_guts.h" +#include "yaffs_ecc.h" + +struct yaffs_packed_tags2_tags_only { + unsigned seq_number; + unsigned obj_id; + unsigned chunk_id; + unsigned n_bytes; +}; + +struct yaffs_packed_tags2 { + struct yaffs_packed_tags2_tags_only t; + struct yaffs_ecc_other ecc; +}; + +/* Full packed tags with ECC, used for oob tags */ +void yaffs_pack_tags2(struct yaffs_packed_tags2 *pt, + const struct yaffs_ext_tags *t, int tags_ecc); +void yaffs_unpack_tags2(struct yaffs_ext_tags *t, struct yaffs_packed_tags2 *pt, + int tags_ecc); + +/* Only the tags part (no ECC for use with inband tags */ +void yaffs_pack_tags2_tags_only(struct yaffs_packed_tags2_tags_only *pt, + const struct yaffs_ext_tags *t); +void yaffs_unpack_tags2_tags_only(struct yaffs_ext_tags *t, + struct yaffs_packed_tags2_tags_only *pt); +#endif diff --git a/fs/yaffs2/yaffs_tagscompat.c b/fs/yaffs2/yaffs_tagscompat.c new file mode 100644 index 00000000..7578075d --- /dev/null +++ b/fs/yaffs2/yaffs_tagscompat.c @@ -0,0 +1,422 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_guts.h" +#include "yaffs_tagscompat.h" +#include "yaffs_ecc.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_trace.h" + +static void yaffs_handle_rd_data_error(struct yaffs_dev *dev, int nand_chunk); + + +/********** Tags ECC calculations *********/ + +void yaffs_calc_ecc(const u8 * data, struct yaffs_spare *spare) +{ + yaffs_ecc_cacl(data, spare->ecc1); + yaffs_ecc_cacl(&data[256], spare->ecc2); +} + +void yaffs_calc_tags_ecc(struct yaffs_tags *tags) +{ + /* Calculate an ecc */ + + unsigned char *b = ((union yaffs_tags_union *)tags)->as_bytes; + unsigned i, j; + unsigned ecc = 0; + unsigned bit = 0; + + tags->ecc = 0; + + for (i = 0; i < 8; i++) { + for (j = 1; j & 0xff; j <<= 1) { + bit++; + if (b[i] & j) + ecc ^= bit; + } + } + + tags->ecc = ecc; + +} + +int yaffs_check_tags_ecc(struct yaffs_tags *tags) +{ + unsigned ecc = tags->ecc; + + yaffs_calc_tags_ecc(tags); + + ecc ^= tags->ecc; + + if (ecc && ecc <= 64) { + /* TODO: Handle the failure better. Retire? */ + unsigned char *b = ((union yaffs_tags_union *)tags)->as_bytes; + + ecc--; + + b[ecc / 8] ^= (1 << (ecc & 7)); + + /* Now recalc the ecc */ + yaffs_calc_tags_ecc(tags); + + return 1; /* recovered error */ + } else if (ecc) { + /* Weird ecc failure value */ + /* TODO: Need to do something here */ + return -1; /* unrecovered error */ + } + + return 0; +} + +/********** Tags **********/ + +static void yaffs_load_tags_to_spare(struct yaffs_spare *spare_ptr, + struct yaffs_tags *tags_ptr) +{ + union yaffs_tags_union *tu = (union yaffs_tags_union *)tags_ptr; + + yaffs_calc_tags_ecc(tags_ptr); + + spare_ptr->tb0 = tu->as_bytes[0]; + spare_ptr->tb1 = tu->as_bytes[1]; + spare_ptr->tb2 = tu->as_bytes[2]; + spare_ptr->tb3 = tu->as_bytes[3]; + spare_ptr->tb4 = tu->as_bytes[4]; + spare_ptr->tb5 = tu->as_bytes[5]; + spare_ptr->tb6 = tu->as_bytes[6]; + spare_ptr->tb7 = tu->as_bytes[7]; +} + +static void yaffs_get_tags_from_spare(struct yaffs_dev *dev, + struct yaffs_spare *spare_ptr, + struct yaffs_tags *tags_ptr) +{ + union yaffs_tags_union *tu = (union yaffs_tags_union *)tags_ptr; + int result; + + tu->as_bytes[0] = spare_ptr->tb0; + tu->as_bytes[1] = spare_ptr->tb1; + tu->as_bytes[2] = spare_ptr->tb2; + tu->as_bytes[3] = spare_ptr->tb3; + tu->as_bytes[4] = spare_ptr->tb4; + tu->as_bytes[5] = spare_ptr->tb5; + tu->as_bytes[6] = spare_ptr->tb6; + tu->as_bytes[7] = spare_ptr->tb7; + + result = yaffs_check_tags_ecc(tags_ptr); + if (result > 0) + dev->n_tags_ecc_fixed++; + else if (result < 0) + dev->n_tags_ecc_unfixed++; +} + +static void yaffs_spare_init(struct yaffs_spare *spare) +{ + memset(spare, 0xFF, sizeof(struct yaffs_spare)); +} + +static int yaffs_wr_nand(struct yaffs_dev *dev, + int nand_chunk, const u8 * data, + struct yaffs_spare *spare) +{ + if (nand_chunk < dev->param.start_block * dev->param.chunks_per_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs chunk %d is not valid", + nand_chunk); + return YAFFS_FAIL; + } + + return dev->param.write_chunk_fn(dev, nand_chunk, data, spare); +} + +static int yaffs_rd_chunk_nand(struct yaffs_dev *dev, + int nand_chunk, + u8 * data, + struct yaffs_spare *spare, + enum yaffs_ecc_result *ecc_result, + int correct_errors) +{ + int ret_val; + struct yaffs_spare local_spare; + + if (!spare && data) { + /* If we don't have a real spare, then we use a local one.
*/ + /* Need this for the calculation of the ecc */ + spare = &local_spare; + } + + if (!dev->param.use_nand_ecc) { + ret_val = + dev->param.read_chunk_fn(dev, nand_chunk, data, spare); + if (data && correct_errors) { + /* Do ECC correction */ + /* Todo handle any errors */ + int ecc_result1, ecc_result2; + u8 calc_ecc[3]; + + yaffs_ecc_cacl(data, calc_ecc); + ecc_result1 = + yaffs_ecc_correct(data, spare->ecc1, calc_ecc); + yaffs_ecc_cacl(&data[256], calc_ecc); + ecc_result2 = + yaffs_ecc_correct(&data[256], spare->ecc2, + calc_ecc); + + if (ecc_result1 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error fix performed on chunk %d:0", + nand_chunk); + dev->n_ecc_fixed++; + } else if (ecc_result1 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error unfixed on chunk %d:0", + nand_chunk); + dev->n_ecc_unfixed++; + } + + if (ecc_result2 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error fix performed on chunk %d:1", + nand_chunk); + dev->n_ecc_fixed++; + } else if (ecc_result2 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error unfixed on chunk %d:1", + nand_chunk); + dev->n_ecc_unfixed++; + } + + if (ecc_result1 || ecc_result2) { + /* We had a data problem on this page */ + yaffs_handle_rd_data_error(dev, nand_chunk); + } + + if (ecc_result1 < 0 || ecc_result2 < 0) + *ecc_result = YAFFS_ECC_RESULT_UNFIXED; + else if (ecc_result1 > 0 || ecc_result2 > 0) + *ecc_result = YAFFS_ECC_RESULT_FIXED; + else + *ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + } + } else { + /* Must allocate enough memory for spare+2*sizeof(int) */ + /* for ecc results from device. */ + struct yaffs_nand_spare nspare; + + memset(&nspare, 0, sizeof(nspare)); + + ret_val = dev->param.read_chunk_fn(dev, nand_chunk, data, + (struct yaffs_spare *) + &nspare); + memcpy(spare, &nspare, sizeof(struct yaffs_spare)); + if (data && correct_errors) { + if (nspare.eccres1 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error fix performed on chunk %d:0", + nand_chunk); + } else if (nspare.eccres1 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error unfixed on chunk %d:0", + nand_chunk); + } + + if (nspare.eccres2 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error fix performed on chunk %d:1", + nand_chunk); + } else if (nspare.eccres2 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error unfixed on chunk %d:1", + nand_chunk); + } + + if (nspare.eccres1 || nspare.eccres2) { + /* We had a data problem on this page */ + yaffs_handle_rd_data_error(dev, nand_chunk); + } + + if (nspare.eccres1 < 0 || nspare.eccres2 < 0) + *ecc_result = YAFFS_ECC_RESULT_UNFIXED; + else if (nspare.eccres1 > 0 || nspare.eccres2 > 0) + *ecc_result = YAFFS_ECC_RESULT_FIXED; + else + *ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + + } + } + return ret_val; +} + +/* + * Functions for robustisizing + */ + +static void yaffs_handle_rd_data_error(struct yaffs_dev *dev, int nand_chunk) +{ + int flash_block = nand_chunk / dev->param.chunks_per_block; + + /* Mark the block for retirement */ + yaffs_get_block_info(dev, + flash_block + dev->block_offset)->needs_retiring = + 1; + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>>Block %d marked for retirement", + flash_block); + + /* TODO: + * Just do a garbage collection on the affected block + * then retire the block + * NB recursion + */ +} + +int yaffs_tags_compat_wr(struct yaffs_dev *dev, + int nand_chunk, + const u8 * data, const struct yaffs_ext_tags *ext_tags) +{ + struct yaffs_spare spare; + struct yaffs_tags tags; + + 
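/* Build a YAFFS1-style spare for this chunk: either mark the page as + * deleted, or pack obj_id, chunk_id, the split byte count and serial + * number into the tag bytes, adding a software ECC over the data when + * the MTD is not doing ECC itself, then write the chunk out. + */ +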
yaffs_spare_init(&spare); + + if (ext_tags->is_deleted) + spare.page_status = 0; + else { + tags.obj_id = ext_tags->obj_id; + tags.chunk_id = ext_tags->chunk_id; + + tags.n_bytes_lsb = ext_tags->n_bytes & 0x3ff; + + if (dev->data_bytes_per_chunk >= 1024) + tags.n_bytes_msb = (ext_tags->n_bytes >> 10) & 3; + else + tags.n_bytes_msb = 3; + + tags.serial_number = ext_tags->serial_number; + + if (!dev->param.use_nand_ecc && data) + yaffs_calc_ecc(data, &spare); + + yaffs_load_tags_to_spare(&spare, &tags); + + } + + return yaffs_wr_nand(dev, nand_chunk, data, &spare); +} + +int yaffs_tags_compat_rd(struct yaffs_dev *dev, + int nand_chunk, + u8 * data, struct yaffs_ext_tags *ext_tags) +{ + + struct yaffs_spare spare; + struct yaffs_tags tags; + enum yaffs_ecc_result ecc_result = YAFFS_ECC_RESULT_UNKNOWN; + + static struct yaffs_spare spare_ff; + static int init; + + if (!init) { + memset(&spare_ff, 0xFF, sizeof(spare_ff)); + init = 1; + } + + if (yaffs_rd_chunk_nand(dev, nand_chunk, data, &spare, &ecc_result, 1)) { + /* ext_tags may be NULL */ + if (ext_tags) { + + int deleted = + (hweight8(spare.page_status) < 7) ? 1 : 0; + + ext_tags->is_deleted = deleted; + ext_tags->ecc_result = ecc_result; + ext_tags->block_bad = 0; /* We're reading it */ + /* therefore it is not a bad block */ + ext_tags->chunk_used = + (memcmp(&spare_ff, &spare, sizeof(spare_ff)) != + 0) ? 1 : 0; + + if (ext_tags->chunk_used) { + yaffs_get_tags_from_spare(dev, &spare, &tags); + + ext_tags->obj_id = tags.obj_id; + ext_tags->chunk_id = tags.chunk_id; + ext_tags->n_bytes = tags.n_bytes_lsb; + + if (dev->data_bytes_per_chunk >= 1024) + ext_tags->n_bytes |= + (((unsigned)tags. + n_bytes_msb) << 10); + + ext_tags->serial_number = tags.serial_number; + } + } + + return YAFFS_OK; + } else { + return YAFFS_FAIL; + } +} + +int yaffs_tags_compat_mark_bad(struct yaffs_dev *dev, int flash_block) +{ + + struct yaffs_spare spare; + + memset(&spare, 0xff, sizeof(struct yaffs_spare)); + + spare.block_status = 'Y'; + + yaffs_wr_nand(dev, flash_block * dev->param.chunks_per_block, NULL, + &spare); + yaffs_wr_nand(dev, flash_block * dev->param.chunks_per_block + 1, + NULL, &spare); + + return YAFFS_OK; + +} + +int yaffs_tags_compat_query_block(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + u32 * seq_number) +{ + + struct yaffs_spare spare0, spare1; + static struct yaffs_spare spare_ff; + static int init; + enum yaffs_ecc_result dummy; + + if (!init) { + memset(&spare_ff, 0xFF, sizeof(spare_ff)); + init = 1; + } + + *seq_number = 0; + + yaffs_rd_chunk_nand(dev, block_no * dev->param.chunks_per_block, NULL, + &spare0, &dummy, 1); + yaffs_rd_chunk_nand(dev, block_no * dev->param.chunks_per_block + 1, + NULL, &spare1, &dummy, 1); + + if (hweight8(spare0.block_status & spare1.block_status) < 7) + *state = YAFFS_BLOCK_STATE_DEAD; + else if (memcmp(&spare_ff, &spare0, sizeof(spare_ff)) == 0) + *state = YAFFS_BLOCK_STATE_EMPTY; + else + *state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_tagscompat.h b/fs/yaffs2/yaffs_tagscompat.h new file mode 100644 index 00000000..8cd35dcd --- /dev/null +++ b/fs/yaffs2/yaffs_tagscompat.h @@ -0,0 +1,36 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_TAGSCOMPAT_H__ +#define __YAFFS_TAGSCOMPAT_H__ + +#include "yaffs_guts.h" +int yaffs_tags_compat_wr(struct yaffs_dev *dev, + int nand_chunk, + const u8 * data, const struct yaffs_ext_tags *tags); +int yaffs_tags_compat_rd(struct yaffs_dev *dev, + int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags); +int yaffs_tags_compat_mark_bad(struct yaffs_dev *dev, int block_no); +int yaffs_tags_compat_query_block(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + u32 * seq_number); + +void yaffs_calc_tags_ecc(struct yaffs_tags *tags); +int yaffs_check_tags_ecc(struct yaffs_tags *tags); +int yaffs_count_bits(u8 byte); + +#endif diff --git a/fs/yaffs2/yaffs_tagsvalidity.c b/fs/yaffs2/yaffs_tagsvalidity.c new file mode 100644 index 00000000..4358d79d --- /dev/null +++ b/fs/yaffs2/yaffs_tagsvalidity.c @@ -0,0 +1,27 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_tagsvalidity.h" + +void yaffs_init_tags(struct yaffs_ext_tags *tags) +{ + memset(tags, 0, sizeof(struct yaffs_ext_tags)); + tags->validity0 = 0xAAAAAAAA; + tags->validity1 = 0x55555555; +} + +int yaffs_validate_tags(struct yaffs_ext_tags *tags) +{ + return (tags->validity0 == 0xAAAAAAAA && tags->validity1 == 0x55555555); + +} diff --git a/fs/yaffs2/yaffs_tagsvalidity.h b/fs/yaffs2/yaffs_tagsvalidity.h new file mode 100644 index 00000000..36a021fc --- /dev/null +++ b/fs/yaffs2/yaffs_tagsvalidity.h @@ -0,0 +1,23 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_TAGS_VALIDITY_H__ +#define __YAFFS_TAGS_VALIDITY_H__ + +#include "yaffs_guts.h" + +void yaffs_init_tags(struct yaffs_ext_tags *tags); +int yaffs_validate_tags(struct yaffs_ext_tags *tags); +#endif diff --git a/fs/yaffs2/yaffs_trace.h b/fs/yaffs2/yaffs_trace.h new file mode 100644 index 00000000..6273dbf9 --- /dev/null +++ b/fs/yaffs2/yaffs_trace.h @@ -0,0 +1,57 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. 
+ * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YTRACE_H__ +#define __YTRACE_H__ + +extern unsigned int yaffs_trace_mask; +extern unsigned int yaffs_wr_attempts; + +/* + * Tracing flags. + * The flags masked in YAFFS_TRACE_ALWAYS are always traced. + */ + +#define YAFFS_TRACE_OS 0x00000002 +#define YAFFS_TRACE_ALLOCATE 0x00000004 +#define YAFFS_TRACE_SCAN 0x00000008 +#define YAFFS_TRACE_BAD_BLOCKS 0x00000010 +#define YAFFS_TRACE_ERASE 0x00000020 +#define YAFFS_TRACE_GC 0x00000040 +#define YAFFS_TRACE_WRITE 0x00000080 +#define YAFFS_TRACE_TRACING 0x00000100 +#define YAFFS_TRACE_DELETION 0x00000200 +#define YAFFS_TRACE_BUFFERS 0x00000400 +#define YAFFS_TRACE_NANDACCESS 0x00000800 +#define YAFFS_TRACE_GC_DETAIL 0x00001000 +#define YAFFS_TRACE_SCAN_DEBUG 0x00002000 +#define YAFFS_TRACE_MTD 0x00004000 +#define YAFFS_TRACE_CHECKPOINT 0x00008000 + +#define YAFFS_TRACE_VERIFY 0x00010000 +#define YAFFS_TRACE_VERIFY_NAND 0x00020000 +#define YAFFS_TRACE_VERIFY_FULL 0x00040000 +#define YAFFS_TRACE_VERIFY_ALL 0x000F0000 + +#define YAFFS_TRACE_SYNC 0x00100000 +#define YAFFS_TRACE_BACKGROUND 0x00200000 +#define YAFFS_TRACE_LOCK 0x00400000 +#define YAFFS_TRACE_MOUNT 0x00800000 + +#define YAFFS_TRACE_ERROR 0x40000000 +#define YAFFS_TRACE_BUG 0x80000000 +#define YAFFS_TRACE_ALWAYS 0xF0000000 + +#endif diff --git a/fs/yaffs2/yaffs_verify.c b/fs/yaffs2/yaffs_verify.c new file mode 100644 index 00000000..738c7f69 --- /dev/null +++ b/fs/yaffs2/yaffs_verify.c @@ -0,0 +1,535 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_verify.h" +#include "yaffs_trace.h" +#include "yaffs_bitmap.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_nand.h" + +int yaffs_skip_verification(struct yaffs_dev *dev) +{ + dev = dev; + return !(yaffs_trace_mask & + (YAFFS_TRACE_VERIFY | YAFFS_TRACE_VERIFY_FULL)); +} + +static int yaffs_skip_full_verification(struct yaffs_dev *dev) +{ + dev = dev; + return !(yaffs_trace_mask & (YAFFS_TRACE_VERIFY_FULL)); +} + +static int yaffs_skip_nand_verification(struct yaffs_dev *dev) +{ + dev = dev; + return !(yaffs_trace_mask & (YAFFS_TRACE_VERIFY_NAND)); +} + +static const char *block_state_name[] = { + "Unknown", + "Needs scanning", + "Scanning", + "Empty", + "Allocating", + "Full", + "Dirty", + "Checkpoint", + "Collecting", + "Dead" +}; + +void yaffs_verify_blk(struct yaffs_dev *dev, struct yaffs_block_info *bi, int n) +{ + int actually_used; + int in_use; + + if (yaffs_skip_verification(dev)) + return; + + /* Report illegal runtime states */ + if (bi->block_state >= YAFFS_NUMBER_OF_BLOCK_STATES) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has undefined state %d", + n, bi->block_state); + + switch (bi->block_state) { + case YAFFS_BLOCK_STATE_UNKNOWN: + case YAFFS_BLOCK_STATE_SCANNING: + case YAFFS_BLOCK_STATE_NEEDS_SCANNING: + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has bad run-state %s", + n, block_state_name[bi->block_state]); + } + + /* Check pages in use and soft deletions are legal */ + + actually_used = bi->pages_in_use - bi->soft_del_pages; + + if (bi->pages_in_use < 0 + || bi->pages_in_use > dev->param.chunks_per_block + || bi->soft_del_pages < 0 + || bi->soft_del_pages > dev->param.chunks_per_block + || actually_used < 0 || actually_used > dev->param.chunks_per_block) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has illegal values pages_in_used %d soft_del_pages %d", + n, bi->pages_in_use, bi->soft_del_pages); + + /* Check chunk bitmap legal */ + in_use = yaffs_count_chunk_bits(dev, n); + if (in_use != bi->pages_in_use) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has inconsistent values pages_in_use %d counted chunk bits %d", + n, bi->pages_in_use, in_use); + +} + +void yaffs_verify_collected_blk(struct yaffs_dev *dev, + struct yaffs_block_info *bi, int n) +{ + yaffs_verify_blk(dev, bi, n); + + /* After collection the block should be in the erased state */ + + if (bi->block_state != YAFFS_BLOCK_STATE_COLLECTING && + bi->block_state != YAFFS_BLOCK_STATE_EMPTY) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Block %d is in state %d after gc, should be erased", + n, bi->block_state); + } +} + +void yaffs_verify_blocks(struct yaffs_dev *dev) +{ + int i; + int state_count[YAFFS_NUMBER_OF_BLOCK_STATES]; + int illegal_states = 0; + + if (yaffs_skip_verification(dev)) + return; + + memset(state_count, 0, sizeof(state_count)); + + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + struct yaffs_block_info *bi = yaffs_get_block_info(dev, i); + yaffs_verify_blk(dev, bi, i); + + if (bi->block_state < YAFFS_NUMBER_OF_BLOCK_STATES) + state_count[bi->block_state]++; + else + illegal_states++; + } + + yaffs_trace(YAFFS_TRACE_VERIFY, "Block summary"); + + yaffs_trace(YAFFS_TRACE_VERIFY, + "%d blocks have illegal states", + illegal_states); + if (state_count[YAFFS_BLOCK_STATE_ALLOCATING] > 1) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Too many allocating blocks"); + + for (i = 0; i < YAFFS_NUMBER_OF_BLOCK_STATES; i++) + yaffs_trace(YAFFS_TRACE_VERIFY, + "%s %d blocks", + block_state_name[i], state_count[i]); + + if (dev->blocks_in_checkpt != 
state_count[YAFFS_BLOCK_STATE_CHECKPOINT]) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Checkpoint block count wrong dev %d count %d", + dev->blocks_in_checkpt, + state_count[YAFFS_BLOCK_STATE_CHECKPOINT]); + + if (dev->n_erased_blocks != state_count[YAFFS_BLOCK_STATE_EMPTY]) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Erased block count wrong dev %d count %d", + dev->n_erased_blocks, + state_count[YAFFS_BLOCK_STATE_EMPTY]); + + if (state_count[YAFFS_BLOCK_STATE_COLLECTING] > 1) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Too many collecting blocks %d (max is 1)", + state_count[YAFFS_BLOCK_STATE_COLLECTING]); +} + +/* + * Verify the object header. oh must be valid, but obj and tags may be NULL in which + * case those tests will not be performed. + */ +void yaffs_verify_oh(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh, + struct yaffs_ext_tags *tags, int parent_check) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; + + if (!(tags && obj && oh)) { + yaffs_trace(YAFFS_TRACE_VERIFY, + "Verifying object header tags %p obj %p oh %p", + tags, obj, oh); + return; + } + + if (oh->type <= YAFFS_OBJECT_TYPE_UNKNOWN || + oh->type > YAFFS_OBJECT_TYPE_MAX) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header type is illegal value 0x%x", + tags->obj_id, oh->type); + + if (tags->obj_id != obj->obj_id) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header mismatch obj_id %d", + tags->obj_id, obj->obj_id); + + /* + * Check that the object's parent ids match if parent_check requested. + * + * Tests do not apply to the root object. + */ + + if (parent_check && tags->obj_id > 1 && !obj->parent) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header mismatch parent_id %d obj->parent is NULL", + tags->obj_id, oh->parent_obj_id); + + if (parent_check && obj->parent && + oh->parent_obj_id != obj->parent->obj_id && + (oh->parent_obj_id != YAFFS_OBJECTID_UNLINKED || + obj->parent->obj_id != YAFFS_OBJECTID_DELETED)) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header mismatch parent_id %d parent_obj_id %d", + tags->obj_id, oh->parent_obj_id, + obj->parent->obj_id); + + if (tags->obj_id > 1 && oh->name[0] == 0) /* Null name */ + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header name is NULL", + obj->obj_id); + + if (tags->obj_id > 1 && ((u8) (oh->name[0])) == 0xff) /* Trashed name */ + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header name is 0xFF", + obj->obj_id); +} + +void yaffs_verify_file(struct yaffs_obj *obj) +{ + int required_depth; + int actual_depth; + u32 last_chunk; + u32 x; + u32 i; + struct yaffs_dev *dev; + struct yaffs_ext_tags tags; + struct yaffs_tnode *tn; + u32 obj_id; + + if (!obj) + return; + + if (yaffs_skip_verification(obj->my_dev)) + return; + + dev = obj->my_dev; + obj_id = obj->obj_id; + + /* Check file size is consistent with tnode depth */ + last_chunk = + obj->variant.file_variant.file_size / dev->data_bytes_per_chunk + 1; + x = last_chunk >> YAFFS_TNODES_LEVEL0_BITS; + required_depth = 0; + while (x > 0) { + x >>= YAFFS_TNODES_INTERNAL_BITS; + required_depth++; + } + + actual_depth = obj->variant.file_variant.top_level; + + /* Check that the chunks in the tnode tree are all correct. + * We do this by scanning through the tnode tree and + * checking the tags for every chunk match. 
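+ * + * That is: for every chunk index i from 1 to last_chunk that the tnode + * tree maps to a NAND chunk, the tags read back from NAND must carry + * obj_id equal to this object's id and chunk_id == i; any mismatch is + * reported via YAFFS_TRACE_VERIFY.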
+ */ + + if (yaffs_skip_nand_verification(dev)) + return; + + for (i = 1; i <= last_chunk; i++) { + tn = yaffs_find_tnode_0(dev, &obj->variant.file_variant, i); + + if (tn) { + u32 the_chunk = yaffs_get_group_base(dev, tn, i); + if (the_chunk > 0) { + yaffs_rd_chunk_tags_nand(dev, the_chunk, NULL, + &tags); + if (tags.obj_id != obj_id || tags.chunk_id != i) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Object %d chunk_id %d NAND mismatch chunk %d tags (%d:%d)", + obj_id, i, the_chunk, + tags.obj_id, tags.chunk_id); + } + } + } +} + +void yaffs_verify_link(struct yaffs_obj *obj) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; + + /* Verify sane equivalent object */ +} + +void yaffs_verify_symlink(struct yaffs_obj *obj) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; + + /* Verify symlink string */ +} + +void yaffs_verify_special(struct yaffs_obj *obj) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; +} + +void yaffs_verify_obj(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev; + + u32 chunk_min; + u32 chunk_max; + + u32 chunk_id_ok; + u32 chunk_in_range; + u32 chunk_wrongly_deleted; + u32 chunk_valid; + + if (!obj) + return; + + if (obj->being_created) + return; + + dev = obj->my_dev; + + if (yaffs_skip_verification(dev)) + return; + + /* Check sane object header chunk */ + + chunk_min = dev->internal_start_block * dev->param.chunks_per_block; + chunk_max = + (dev->internal_end_block + 1) * dev->param.chunks_per_block - 1; + + chunk_in_range = (((unsigned)(obj->hdr_chunk)) >= chunk_min && + ((unsigned)(obj->hdr_chunk)) <= chunk_max); + chunk_id_ok = chunk_in_range || (obj->hdr_chunk == 0); + chunk_valid = chunk_in_range && + yaffs_check_chunk_bit(dev, + obj->hdr_chunk / dev->param.chunks_per_block, + obj->hdr_chunk % dev->param.chunks_per_block); + chunk_wrongly_deleted = chunk_in_range && !chunk_valid; + + if (!obj->fake && (!chunk_id_ok || chunk_wrongly_deleted)) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d has chunk_id %d %s %s", + obj->obj_id, obj->hdr_chunk, + chunk_id_ok ? "" : ",out of range", + chunk_wrongly_deleted ? 
",marked as deleted" : ""); + + if (chunk_valid && !yaffs_skip_nand_verification(dev)) { + struct yaffs_ext_tags tags; + struct yaffs_obj_hdr *oh; + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + + oh = (struct yaffs_obj_hdr *)buffer; + + yaffs_rd_chunk_tags_nand(dev, obj->hdr_chunk, buffer, &tags); + + yaffs_verify_oh(obj, oh, &tags, 1); + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + } + + /* Verify it has a parent */ + if (obj && !obj->fake && (!obj->parent || obj->parent->my_dev != dev)) { + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d has parent pointer %p which does not look like an object", + obj->obj_id, obj->parent); + } + + /* Verify parent is a directory */ + if (obj->parent + && obj->parent->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d's parent is not a directory (type %d)", + obj->obj_id, obj->parent->variant_type); + } + + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + yaffs_verify_file(obj); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + yaffs_verify_symlink(obj); + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + yaffs_verify_dir(obj); + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + yaffs_verify_link(obj); + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + yaffs_verify_special(obj); + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + default: + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d has illegaltype %d", + obj->obj_id, obj->variant_type); + break; + } +} + +void yaffs_verify_objects(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + int i; + struct list_head *lh; + + if (yaffs_skip_verification(dev)) + return; + + /* Iterate through the objects in each hash entry */ + + for (i = 0; i < YAFFS_NOBJECT_BUCKETS; i++) { + list_for_each(lh, &dev->obj_bucket[i].list) { + if (lh) { + obj = + list_entry(lh, struct yaffs_obj, hash_link); + yaffs_verify_obj(obj); + } + } + } +} + +void yaffs_verify_obj_in_dir(struct yaffs_obj *obj) +{ + struct list_head *lh; + struct yaffs_obj *list_obj; + + int count = 0; + + if (!obj) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "No object to verify"); + YBUG(); + return; + } + + if (yaffs_skip_verification(obj->my_dev)) + return; + + if (!obj->parent) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "Object does not have parent" ); + YBUG(); + return; + } + + if (obj->parent->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "Parent is not directory"); + YBUG(); + } + + /* Iterate through the objects in each hash entry */ + + list_for_each(lh, &obj->parent->variant.dir_variant.children) { + if (lh) { + list_obj = list_entry(lh, struct yaffs_obj, siblings); + yaffs_verify_obj(list_obj); + if (obj == list_obj) + count++; + } + } + + if (count != 1) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Object in directory %d times", + count); + YBUG(); + } +} + +void yaffs_verify_dir(struct yaffs_obj *directory) +{ + struct list_head *lh; + struct yaffs_obj *list_obj; + + if (!directory) { + YBUG(); + return; + } + + if (yaffs_skip_full_verification(directory->my_dev)) + return; + + if (directory->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Directory has wrong type: %d", + directory->variant_type); + YBUG(); + } + + /* Iterate through the objects in each hash entry */ + + list_for_each(lh, &directory->variant.dir_variant.children) { + if (lh) { + list_obj = list_entry(lh, struct yaffs_obj, siblings); + if (list_obj->parent != directory) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Object in directory list has wrong parent %p", + list_obj->parent); + YBUG(); + } + 
yaffs_verify_obj_in_dir(list_obj); + } + } +} + +static int yaffs_free_verification_failures; + +void yaffs_verify_free_chunks(struct yaffs_dev *dev) +{ + int counted; + int difference; + + if (yaffs_skip_verification(dev)) + return; + + counted = yaffs_count_free_chunks(dev); + + difference = dev->n_free_chunks - counted; + + if (difference) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Freechunks verification failure %d %d %d", + dev->n_free_chunks, counted, difference); + yaffs_free_verification_failures++; + } +} + +int yaffs_verify_file_sane(struct yaffs_obj *in) +{ + in = in; + return YAFFS_OK; +} + diff --git a/fs/yaffs2/yaffs_verify.h b/fs/yaffs2/yaffs_verify.h new file mode 100644 index 00000000..cc6f8899 --- /dev/null +++ b/fs/yaffs2/yaffs_verify.h @@ -0,0 +1,43 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_VERIFY_H__ +#define __YAFFS_VERIFY_H__ + +#include "yaffs_guts.h" + +void yaffs_verify_blk(struct yaffs_dev *dev, struct yaffs_block_info *bi, + int n); +void yaffs_verify_collected_blk(struct yaffs_dev *dev, + struct yaffs_block_info *bi, int n); +void yaffs_verify_blocks(struct yaffs_dev *dev); + +void yaffs_verify_oh(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh, + struct yaffs_ext_tags *tags, int parent_check); +void yaffs_verify_file(struct yaffs_obj *obj); +void yaffs_verify_link(struct yaffs_obj *obj); +void yaffs_verify_symlink(struct yaffs_obj *obj); +void yaffs_verify_special(struct yaffs_obj *obj); +void yaffs_verify_obj(struct yaffs_obj *obj); +void yaffs_verify_objects(struct yaffs_dev *dev); +void yaffs_verify_obj_in_dir(struct yaffs_obj *obj); +void yaffs_verify_dir(struct yaffs_obj *directory); +void yaffs_verify_free_chunks(struct yaffs_dev *dev); + +int yaffs_verify_file_sane(struct yaffs_obj *obj); + +int yaffs_skip_verification(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_vfs.c b/fs/yaffs2/yaffs_vfs.c new file mode 100644 index 00000000..d5b87531 --- /dev/null +++ b/fs/yaffs2/yaffs_vfs.c @@ -0,0 +1,2792 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * Acknowledgements: + * Luc van OostenRyck for numerous patches. + * Nick Bane for numerous patches. + * Nick Bane for 2.5/2.6 integration. + * Andras Toth for mknod rdev issue. + * Michael Fischer for finding the problem with inode inconsistency. + * Some code bodily lifted from JFFS + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * + * This is the file system front-end to YAFFS that hooks it up to + * the VFS. + * + * Special notes: + * >> 2.4: sb->u.generic_sbp points to the struct yaffs_dev associated with + * this superblock + * >> 2.6: sb->s_fs_info points to the struct yaffs_dev associated with this + * superblock + * >> inode->u.generic_ip points to the associated struct yaffs_obj. 
+ */ + +/* + * NB There are two variants of Linux VFS glue code. This variant supports + * a single version and should not include any multi-version code. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#define UnlockPage(p) unlock_page(p) +#define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) + +#define yaffs_devname(sb, buf) bdevname(sb->s_bdev, buf) + +#define YPROC_ROOT NULL + +#define Y_INIT_TIMER(a) init_timer_on_stack(a) + +#define WRITE_SIZE_STR "writesize" +#define WRITE_SIZE(mtd) ((mtd)->writesize) + +static uint32_t YCALCBLOCKS(uint64_t partition_size, uint32_t block_size) +{ + uint64_t result = partition_size; + do_div(result, block_size); + return (uint32_t) result; +} + +#include +#include + +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_guts.h" +#include "yaffs_attribs.h" + +#include "yaffs_linux.h" + +#include "yaffs_mtdif.h" +#include "yaffs_mtdif1.h" +#include "yaffs_mtdif2.h" + +unsigned int yaffs_trace_mask = YAFFS_TRACE_BAD_BLOCKS | YAFFS_TRACE_ALWAYS; +unsigned int yaffs_wr_attempts = YAFFS_WR_ATTEMPTS; +unsigned int yaffs_auto_checkpoint = 1; +unsigned int yaffs_gc_control = 1; +unsigned int yaffs_bg_enable = 1; + +/* Module Parameters */ +module_param(yaffs_trace_mask, uint, 0644); +module_param(yaffs_wr_attempts, uint, 0644); +module_param(yaffs_auto_checkpoint, uint, 0644); +module_param(yaffs_gc_control, uint, 0644); +module_param(yaffs_bg_enable, uint, 0644); + + +#define yaffs_inode_to_obj_lv(iptr) ((iptr)->i_private) +#define yaffs_inode_to_obj(iptr) ((struct yaffs_obj *)(yaffs_inode_to_obj_lv(iptr))) +#define yaffs_dentry_to_obj(dptr) yaffs_inode_to_obj((dptr)->d_inode) +#define yaffs_super_to_dev(sb) ((struct yaffs_dev *)sb->s_fs_info) + +#define update_dir_time(dir) do {\ + (dir)->i_ctime = (dir)->i_mtime = CURRENT_TIME; \ + } while(0) + + +static unsigned yaffs_gc_control_callback(struct yaffs_dev *dev) +{ + return yaffs_gc_control; +} + +static void yaffs_gross_lock(struct yaffs_dev *dev) +{ + yaffs_trace(YAFFS_TRACE_LOCK, "yaffs locking %p", current); + mutex_lock(&(yaffs_dev_to_lc(dev)->gross_lock)); + yaffs_trace(YAFFS_TRACE_LOCK, "yaffs locked %p", current); +} + +static void yaffs_gross_unlock(struct yaffs_dev *dev) +{ + yaffs_trace(YAFFS_TRACE_LOCK, "yaffs unlocking %p", current); + mutex_unlock(&(yaffs_dev_to_lc(dev)->gross_lock)); +} + +static void yaffs_fill_inode_from_obj(struct inode *inode, + struct yaffs_obj *obj); + +static struct inode *yaffs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + struct yaffs_obj *obj; + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_iget for %lu", ino); + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + /* NB This is called as a side effect of other functions, but + * we had to release the lock to prevent deadlocks, so + * need to lock again. 
+ */ + + yaffs_gross_lock(dev); + + obj = yaffs_find_by_number(dev, inode->i_ino); + + yaffs_fill_inode_from_obj(inode, obj); + + yaffs_gross_unlock(dev); + + unlock_new_inode(inode); + return inode; +} + +struct inode *yaffs_get_inode(struct super_block *sb, int mode, int dev, + struct yaffs_obj *obj) +{ + struct inode *inode; + + if (!sb) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_get_inode for NULL super_block!!"); + return NULL; + + } + + if (!obj) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_get_inode for NULL object!!"); + return NULL; + + } + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_get_inode for object %d", + obj->obj_id); + + inode = yaffs_iget(sb, obj->obj_id); + if (IS_ERR(inode)) + return NULL; + + /* NB Side effect: iget calls back to yaffs_read_inode(). */ + /* iget also increments the inode's i_count */ + /* NB You can't be holding gross_lock or deadlock will happen! */ + + return inode; +} + +static int yaffs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + struct yaffs_obj *obj = NULL; + struct yaffs_dev *dev; + + struct yaffs_obj *parent = yaffs_inode_to_obj(dir); + + int error = -ENOSPC; + uid_t uid = current->cred->fsuid; + gid_t gid = + (dir->i_mode & S_ISGID) ? dir->i_gid : current->cred->fsgid; + + if ((dir->i_mode & S_ISGID) && S_ISDIR(mode)) + mode |= S_ISGID; + + if (parent) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod: parent object %d type %d", + parent->obj_id, parent->variant_type); + } else { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod: could not get parent object"); + return -EPERM; + } + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod: making oject for %s, mode %x dev %x", + dentry->d_name.name, mode, rdev); + + dev = parent->my_dev; + + yaffs_gross_lock(dev); + + switch (mode & S_IFMT) { + default: + /* Special (socket, fifo, device...) */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making special"); + obj = + yaffs_create_special(parent, dentry->d_name.name, mode, uid, + gid, old_encode_dev(rdev)); + break; + case S_IFREG: /* file */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making file"); + obj = yaffs_create_file(parent, dentry->d_name.name, mode, uid, + gid); + break; + case S_IFDIR: /* directory */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making directory"); + obj = yaffs_create_dir(parent, dentry->d_name.name, mode, + uid, gid); + break; + case S_IFLNK: /* symlink */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making symlink"); + obj = NULL; /* Do we ever get here? 
*/ + break; + } + + /* Can not call yaffs_get_inode() with gross lock held */ + yaffs_gross_unlock(dev); + + if (obj) { + inode = yaffs_get_inode(dir->i_sb, mode, rdev, obj); + d_instantiate(dentry, inode); + update_dir_time(dir); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod created object %d count = %d", + obj->obj_id, atomic_read(&inode->i_count)); + error = 0; + yaffs_fill_inode_from_obj(dir, parent); + } else { + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod failed making object"); + error = -ENOMEM; + } + + return error; +} + +static int yaffs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return yaffs_mknod(dir, dentry, mode | S_IFDIR, 0); +} + +static int yaffs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *n) +{ + return yaffs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int yaffs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct yaffs_obj *obj = NULL; + struct yaffs_obj *link = NULL; + struct yaffs_dev *dev; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_link"); + + obj = yaffs_inode_to_obj(inode); + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + if (!S_ISDIR(inode->i_mode)) /* Don't link directories */ + link = + yaffs_link_obj(yaffs_inode_to_obj(dir), dentry->d_name.name, + obj); + + if (link) { + old_dentry->d_inode->i_nlink = yaffs_get_obj_link_count(obj); + d_instantiate(dentry, old_dentry->d_inode); + atomic_inc(&old_dentry->d_inode->i_count); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_link link count %d i_count %d", + old_dentry->d_inode->i_nlink, + atomic_read(&old_dentry->d_inode->i_count)); + } + + yaffs_gross_unlock(dev); + + if (link) { + update_dir_time(dir); + return 0; + } + + return -EPERM; +} + +static int yaffs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + uid_t uid = current->cred->fsuid; + gid_t gid = + (dir->i_mode & S_ISGID) ? 
dir->i_gid : current->cred->fsgid; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_symlink"); + + dev = yaffs_inode_to_obj(dir)->my_dev; + yaffs_gross_lock(dev); + obj = yaffs_create_symlink(yaffs_inode_to_obj(dir), dentry->d_name.name, + S_IFLNK | S_IRWXUGO, uid, gid, symname); + yaffs_gross_unlock(dev); + + if (obj) { + struct inode *inode; + + inode = yaffs_get_inode(dir->i_sb, obj->yst_mode, 0, obj); + d_instantiate(dentry, inode); + update_dir_time(dir); + yaffs_trace(YAFFS_TRACE_OS, "symlink created OK"); + return 0; + } else { + yaffs_trace(YAFFS_TRACE_OS, "symlink not created"); + } + + return -ENOMEM; +} + +static struct dentry *yaffs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *n) +{ + struct yaffs_obj *obj; + struct inode *inode = NULL; + + struct yaffs_dev *dev = yaffs_inode_to_obj(dir)->my_dev; + + if (current != yaffs_dev_to_lc(dev)->readdir_process) + yaffs_gross_lock(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_lookup for %d:%s", + yaffs_inode_to_obj(dir)->obj_id, dentry->d_name.name); + + obj = yaffs_find_by_name(yaffs_inode_to_obj(dir), dentry->d_name.name); + + obj = yaffs_get_equivalent_obj(obj); /* in case it was a hardlink */ + + /* Can't hold gross lock when calling yaffs_get_inode() */ + if (current != yaffs_dev_to_lc(dev)->readdir_process) + yaffs_gross_unlock(dev); + + if (obj) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_lookup found %d", obj->obj_id); + + inode = yaffs_get_inode(dir->i_sb, obj->yst_mode, 0, obj); + + if (inode) { + yaffs_trace(YAFFS_TRACE_OS, "yaffs_loookup dentry"); + d_add(dentry, inode); + /* return dentry; */ + return NULL; + } + + } else { + yaffs_trace(YAFFS_TRACE_OS, "yaffs_lookup not found"); + + } + + d_add(dentry, inode); + + return NULL; +} + +static int yaffs_unlink(struct inode *dir, struct dentry *dentry) +{ + int ret_val; + + struct yaffs_dev *dev; + struct yaffs_obj *obj; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_unlink %d:%s", + (int)(dir->i_ino), dentry->d_name.name); + obj = yaffs_inode_to_obj(dir); + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + ret_val = yaffs_unlinker(obj, dentry->d_name.name); + + if (ret_val == YAFFS_OK) { + dentry->d_inode->i_nlink--; + dir->i_version++; + yaffs_gross_unlock(dev); + mark_inode_dirty(dentry->d_inode); + update_dir_time(dir); + return 0; + } + yaffs_gross_unlock(dev); + return -ENOTEMPTY; +} + +static int yaffs_sync_object(struct file *file, int datasync) +{ + + struct yaffs_obj *obj; + struct yaffs_dev *dev; + struct dentry *dentry = file->f_path.dentry; + + obj = yaffs_dentry_to_obj(dentry); + + dev = obj->my_dev; + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC, "yaffs_sync_object"); + yaffs_gross_lock(dev); + yaffs_flush_file(obj, 1, datasync); + yaffs_gross_unlock(dev); + return 0; +} +/* + * The VFS layer already does all the dentry stuff for rename. + * + * NB: POSIX says you can rename an object over an old object of the same name + */ +static int yaffs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct yaffs_dev *dev; + int ret_val = YAFFS_FAIL; + struct yaffs_obj *target; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_rename"); + dev = yaffs_inode_to_obj(old_dir)->my_dev; + + yaffs_gross_lock(dev); + + /* Check if the target is an existing directory that is not empty. 
*/ + target = yaffs_find_by_name(yaffs_inode_to_obj(new_dir), + new_dentry->d_name.name); + + if (target && target->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY && + !list_empty(&target->variant.dir_variant.children)) { + + yaffs_trace(YAFFS_TRACE_OS, "target is non-empty dir"); + + ret_val = YAFFS_FAIL; + } else { + /* Now does unlinking internally using shadowing mechanism */ + yaffs_trace(YAFFS_TRACE_OS, "calling yaffs_rename_obj"); + + ret_val = yaffs_rename_obj(yaffs_inode_to_obj(old_dir), + old_dentry->d_name.name, + yaffs_inode_to_obj(new_dir), + new_dentry->d_name.name); + } + yaffs_gross_unlock(dev); + + if (ret_val == YAFFS_OK) { + if (target) { + new_dentry->d_inode->i_nlink--; + mark_inode_dirty(new_dentry->d_inode); + } + + update_dir_time(old_dir); + if (old_dir != new_dir) + update_dir_time(new_dir); + return 0; + } else { + return -ENOTEMPTY; + } +} + +static int yaffs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_setattr of object %d", + yaffs_inode_to_obj(inode)->obj_id); + + /* Fail if a requested resize >= 2GB */ + if (attr->ia_valid & ATTR_SIZE && (attr->ia_size >> 31)) + error = -EINVAL; + + if (error == 0) + error = inode_change_ok(inode, attr); + if (error == 0) { + int result; + if (!error) { + setattr_copy(inode, attr); + yaffs_trace(YAFFS_TRACE_OS, "inode_setattr called"); + if (attr->ia_valid & ATTR_SIZE) { + truncate_setsize(inode, attr->ia_size); + inode->i_blocks = (inode->i_size + 511) >> 9; + } + } + dev = yaffs_inode_to_obj(inode)->my_dev; + if (attr->ia_valid & ATTR_SIZE) { + yaffs_trace(YAFFS_TRACE_OS, "resize to %d(%x)", + (int)(attr->ia_size), + (int)(attr->ia_size)); + } + yaffs_gross_lock(dev); + result = yaffs_set_attribs(yaffs_inode_to_obj(inode), attr); + if (result == YAFFS_OK) { + error = 0; + } else { + error = -EPERM; + } + yaffs_gross_unlock(dev); + + } + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_setattr done returning %d", error); + + return error; +} + +#ifdef CONFIG_YAFFS_XATTR +static int yaffs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_setxattr of object %d", obj->obj_id); + + if (error == 0) { + int result; + dev = obj->my_dev; + yaffs_gross_lock(dev); + result = yaffs_set_xattrib(obj, name, value, size, flags); + if (result == YAFFS_OK) + error = 0; + else if (result < 0) + error = result; + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, "yaffs_setxattr done returning %d", error); + + return error; +} + +static ssize_t yaffs_getxattr(struct dentry * dentry, const char *name, void *buff, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_getxattr \"%s\" from object %d", + name, obj->obj_id); + + if (error == 0) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + error = yaffs_get_xattrib(obj, name, buff, size); + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, "yaffs_getxattr done returning %d", error); + + return error; +} + +static int yaffs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = 
yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_removexattr of object %d", obj->obj_id); + + if (error == 0) { + int result; + dev = obj->my_dev; + yaffs_gross_lock(dev); + result = yaffs_remove_xattrib(obj, name); + if (result == YAFFS_OK) + error = 0; + else if (result < 0) + error = result; + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_removexattr done returning %d", error); + + return error; +} + +static ssize_t yaffs_listxattr(struct dentry * dentry, char *buff, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_listxattr of object %d", obj->obj_id); + + if (error == 0) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + error = yaffs_list_xattrib(obj, buff, size); + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_listxattr done returning %d", error); + + return error; +} + +#endif + +static const struct inode_operations yaffs_dir_inode_operations = { + .create = yaffs_create, + .lookup = yaffs_lookup, + .link = yaffs_link, + .unlink = yaffs_unlink, + .symlink = yaffs_symlink, + .mkdir = yaffs_mkdir, + .rmdir = yaffs_unlink, + .mknod = yaffs_mknod, + .rename = yaffs_rename, + .setattr = yaffs_setattr, +#ifdef CONFIG_YAFFS_XATTR + .setxattr = yaffs_setxattr, + .getxattr = yaffs_getxattr, + .listxattr = yaffs_listxattr, + .removexattr = yaffs_removexattr, +#endif +}; +/*-----------------------------------------------------------------*/ +/* Directory search context allows us to unlock access to yaffs during + * filldir without causing problems with the directory being modified. + * This is similar to the tried and tested mechanism used in yaffs direct. + * + * A search context iterates along a doubly linked list of siblings in the + * directory. If the iterating object is deleted then this would corrupt + * the list iteration, likely causing a crash. The search context avoids + * this by using the remove_obj_fn to move the search context to the + * next object before the object is deleted. + * + * Many readdirs (and thus seach conexts) may be alive simulateously so + * each struct yaffs_dev has a list of these. + * + * A seach context lives for the duration of a readdir. + * + * All these functions must be called while yaffs is locked. + */ + +struct yaffs_search_context { + struct yaffs_dev *dev; + struct yaffs_obj *dir_obj; + struct yaffs_obj *next_return; + struct list_head others; +}; + +/* + * yaffs_new_search() creates a new search context, initialises it and + * adds it to the device's search context list. + * + * Called at start of readdir. + */ +static struct yaffs_search_context *yaffs_new_search(struct yaffs_obj *dir) +{ + struct yaffs_dev *dev = dir->my_dev; + struct yaffs_search_context *sc = + kmalloc(sizeof(struct yaffs_search_context), GFP_NOFS); + if (sc) { + sc->dir_obj = dir; + sc->dev = dev; + if (list_empty(&sc->dir_obj->variant.dir_variant.children)) + sc->next_return = NULL; + else + sc->next_return = + list_entry(dir->variant.dir_variant.children.next, + struct yaffs_obj, siblings); + INIT_LIST_HEAD(&sc->others); + list_add(&sc->others, &(yaffs_dev_to_lc(dev)->search_contexts)); + } + return sc; +} + +/* + * yaffs_search_end() disposes of a search context and cleans up. 
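The pattern described above (a cursor that the object-removal hook advances before the object goes away) reduced to a self-contained sketch on an ordinary circular doubly linked list; every name here is illustrative and none of it is yaffs API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	struct node *prev, *next;   /* circular list; "head" is a sentinel */
	char name[16];
};

struct cursor {
	struct node *head;          /* list being walked */
	struct node *next_return;   /* next node to hand out, or NULL */
};

static void cursor_advance(struct cursor *c)
{
	if (c->next_return)
		c->next_return = (c->next_return->next == c->head) ?
				NULL : c->next_return->next;
}

/* Removal advances any cursor parked on the victim first, which is the
 * same job yaffs_remove_obj_callback() does for search contexts. */
static void node_remove(struct node *victim, struct cursor *c)
{
	if (c->next_return == victim)
		cursor_advance(c);
	victim->prev->next = victim->next;
	victim->next->prev = victim->prev;
	free(victim);
}

int main(void)
{
	struct node head = { &head, &head, "" };
	struct cursor c = { &head, NULL };
	const char *names[] = { "a", "b", "c" };

	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		strcpy(n->name, names[i]);
		n->prev = head.prev;
		n->next = &head;
		head.prev->next = n;
		head.prev = n;
	}

	c.next_return = head.next;            /* walk starts at "a" */
	printf("%s\n", c.next_return->name);  /* hand out "a" */
	cursor_advance(&c);                   /* cursor now rests on "b" */

	/* "b" is deleted while the cursor is parked on it, as can happen
	 * while yaffs drops its lock around filldir(): the cursor moves on
	 * to "c" instead of pointing at freed memory. */
	node_remove(c.next_return, &c);

	while (c.next_return) {
		printf("%s\n", c.next_return->name);  /* prints "c" */
		cursor_advance(&c);
	}
	return 0;
}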
+ */ +static void yaffs_search_end(struct yaffs_search_context *sc) +{ + if (sc) { + list_del(&sc->others); + kfree(sc); + } +} + +/* + * yaffs_search_advance() moves a search context to the next object. + * Called when the search iterates or when an object removal causes + * the search context to be moved to the next object. + */ +static void yaffs_search_advance(struct yaffs_search_context *sc) +{ + if (!sc) + return; + + if (sc->next_return == NULL || + list_empty(&sc->dir_obj->variant.dir_variant.children)) + sc->next_return = NULL; + else { + struct list_head *next = sc->next_return->siblings.next; + + if (next == &sc->dir_obj->variant.dir_variant.children) + sc->next_return = NULL; /* end of list */ + else + sc->next_return = + list_entry(next, struct yaffs_obj, siblings); + } +} + +/* + * yaffs_remove_obj_callback() is called when an object is unlinked. + * We check open search contexts and advance any which are currently + * on the object being iterated. + */ +static void yaffs_remove_obj_callback(struct yaffs_obj *obj) +{ + + struct list_head *i; + struct yaffs_search_context *sc; + struct list_head *search_contexts = + &(yaffs_dev_to_lc(obj->my_dev)->search_contexts); + + /* Iterate through the directory search contexts. + * If any are currently on the object being removed, then advance + * the search context to the next object to prevent a hanging pointer. + */ + list_for_each(i, search_contexts) { + if (i) { + sc = list_entry(i, struct yaffs_search_context, others); + if (sc->next_return == obj) + yaffs_search_advance(sc); + } + } + +} + +static int yaffs_readdir(struct file *f, void *dirent, filldir_t filldir) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + struct yaffs_search_context *sc; + struct inode *inode = f->f_dentry->d_inode; + unsigned long offset, curoffs; + struct yaffs_obj *l; + int ret_val = 0; + + char name[YAFFS_MAX_NAME_LENGTH + 1]; + + obj = yaffs_dentry_to_obj(f->f_dentry); + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + yaffs_dev_to_lc(dev)->readdir_process = current; + + offset = f->f_pos; + + sc = yaffs_new_search(obj); + if (!sc) { + ret_val = -ENOMEM; + goto out; + } + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: starting at %d", (int)offset); + + if (offset == 0) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: entry . ino %d", + (int)inode->i_ino); + yaffs_gross_unlock(dev); + if (filldir(dirent, ".", 1, offset, inode->i_ino, DT_DIR) < 0) { + yaffs_gross_lock(dev); + goto out; + } + yaffs_gross_lock(dev); + offset++; + f->f_pos++; + } + if (offset == 1) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: entry .. ino %d", + (int)f->f_dentry->d_parent->d_inode->i_ino); + yaffs_gross_unlock(dev); + if (filldir(dirent, "..", 2, offset, + f->f_dentry->d_parent->d_inode->i_ino, + DT_DIR) < 0) { + yaffs_gross_lock(dev); + goto out; + } + yaffs_gross_lock(dev); + offset++; + f->f_pos++; + } + + curoffs = 1; + + /* If the directory has changed since the open or last call to + readdir, rewind to after the 2 canned entries. 
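Positions 0 and 1 are the synthetic "." and ".." entries emitted above, so real entries begin at offset 2; yaffs_unlink() bumps dir->i_version on every removal, and a mismatch with the f->f_version saved on the previous pass means the sibling list has changed under this directory stream, so the walk restarts at the first real entry rather than trusting a stale position.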
*/ + if (f->f_version != inode->i_version) { + offset = 2; + f->f_pos = offset; + f->f_version = inode->i_version; + } + + while (sc->next_return) { + curoffs++; + l = sc->next_return; + if (curoffs >= offset) { + int this_inode = yaffs_get_obj_inode(l); + int this_type = yaffs_get_obj_type(l); + + yaffs_get_obj_name(l, name, YAFFS_MAX_NAME_LENGTH + 1); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: %s inode %d", + name, yaffs_get_obj_inode(l)); + + yaffs_gross_unlock(dev); + + if (filldir(dirent, + name, + strlen(name), + offset, this_inode, this_type) < 0) { + yaffs_gross_lock(dev); + goto out; + } + + yaffs_gross_lock(dev); + + offset++; + f->f_pos++; + } + yaffs_search_advance(sc); + } + +out: + yaffs_search_end(sc); + yaffs_dev_to_lc(dev)->readdir_process = NULL; + yaffs_gross_unlock(dev); + + return ret_val; +} + +static const struct file_operations yaffs_dir_operations = { + .read = generic_read_dir, + .readdir = yaffs_readdir, + .fsync = yaffs_sync_object, + .llseek = generic_file_llseek, +}; + + + +static int yaffs_file_flush(struct file *file, fl_owner_t id) +{ + struct yaffs_obj *obj = yaffs_dentry_to_obj(file->f_dentry); + + struct yaffs_dev *dev = obj->my_dev; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_flush object %d (%s)", + obj->obj_id, obj->dirty ? "dirty" : "clean"); + + yaffs_gross_lock(dev); + + yaffs_flush_file(obj, 1, 0); + + yaffs_gross_unlock(dev); + + return 0; +} + +static const struct file_operations yaffs_file_operations = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .flush = yaffs_file_flush, + .fsync = yaffs_sync_object, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + + +/* ExportFS support */ +static struct inode *yaffs2_nfs_get_inode(struct super_block *sb, uint64_t ino, + uint32_t generation) +{ + return yaffs_iget(sb, ino); +} + +static struct dentry *yaffs2_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + yaffs2_nfs_get_inode); +} + +static struct dentry *yaffs2_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + yaffs2_nfs_get_inode); +} + +struct dentry *yaffs2_get_parent(struct dentry *dentry) +{ + + struct super_block *sb = dentry->d_inode->i_sb; + struct dentry *parent = ERR_PTR(-ENOENT); + struct inode *inode; + unsigned long parent_ino; + struct yaffs_obj *d_obj; + struct yaffs_obj *parent_obj; + + d_obj = yaffs_inode_to_obj(dentry->d_inode); + + if (d_obj) { + parent_obj = d_obj->parent; + if (parent_obj) { + parent_ino = yaffs_get_obj_inode(parent_obj); + inode = yaffs_iget(sb, parent_ino); + + if (IS_ERR(inode)) { + parent = ERR_CAST(inode); + } else { + parent = d_obtain_alias(inode); + if (!IS_ERR(parent)) { + parent = ERR_PTR(-ENOMEM); + iput(inode); + } + } + } + } + + return parent; +} + +/* Just declare a zero structure as a NULL value implies + * using the default functions of exportfs. 
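Leaving the encode side NULL picks up the default ino+generation handle format, so knfsd and the open-by-handle syscalls can hand a stable handle back to yaffs later; the decode path runs through generic_fh_to_dentry()/generic_fh_to_parent() below, which call yaffs2_nfs_get_inode() and hence yaffs_iget(). A user-space sketch of that round trip, assuming a kernel and glibc new enough to expose name_to_handle_at()/open_by_handle_at() and a hypothetical /mnt/flash mount:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#ifndef MAX_HANDLE_SZ
#define MAX_HANDLE_SZ 128
#endif

int main(void)
{
	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id, mnt_fd, fd;

	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Encode a handle for a (hypothetical) file on the yaffs mount. */
	if (name_to_handle_at(AT_FDCWD, "/mnt/flash/log.txt",
			      fh, &mount_id, 0) == -1) {
		perror("name_to_handle_at");
		return 1;
	}

	/* Much later, or from another process: reopen purely from the
	 * handle. In the kernel this is the path that reaches
	 * yaffs2_fh_to_dentry() -> yaffs2_nfs_get_inode() -> yaffs_iget().
	 * Requires CAP_DAC_READ_SEARCH. */
	mnt_fd = open("/mnt/flash", O_RDONLY | O_DIRECTORY);
	fd = open_by_handle_at(mnt_fd, fh, O_RDONLY);
	if (fd == -1)
		perror("open_by_handle_at");
	else
		close(fd);

	free(fh);
	return 0;
}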
+ */ + +static struct export_operations yaffs_export_ops = { + .fh_to_dentry = yaffs2_fh_to_dentry, + .fh_to_parent = yaffs2_fh_to_parent, + .get_parent = yaffs2_get_parent, +}; + + +/*-----------------------------------------------------------------*/ + +static int yaffs_readlink(struct dentry *dentry, char __user * buffer, + int buflen) +{ + unsigned char *alias; + int ret; + + struct yaffs_dev *dev = yaffs_dentry_to_obj(dentry)->my_dev; + + yaffs_gross_lock(dev); + + alias = yaffs_get_symlink_alias(yaffs_dentry_to_obj(dentry)); + + yaffs_gross_unlock(dev); + + if (!alias) + return -ENOMEM; + + ret = vfs_readlink(dentry, buffer, buflen, alias); + kfree(alias); + return ret; +} + +static void *yaffs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + unsigned char *alias; + void *ret; + struct yaffs_dev *dev = yaffs_dentry_to_obj(dentry)->my_dev; + + yaffs_gross_lock(dev); + + alias = yaffs_get_symlink_alias(yaffs_dentry_to_obj(dentry)); + yaffs_gross_unlock(dev); + + if (!alias) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + nd_set_link(nd, alias); + ret = (void *)alias; +out: + return ret; +} + +void yaffs_put_link(struct dentry *dentry, struct nameidata *nd, void *alias) +{ + kfree(alias); +} + + +static void yaffs_unstitch_obj(struct inode *inode, struct yaffs_obj *obj) +{ + /* Clear the association between the inode and + * the struct yaffs_obj. + */ + obj->my_inode = NULL; + yaffs_inode_to_obj_lv(inode) = NULL; + + /* If the object freeing was deferred, then the real + * free happens now. + * This should fix the inode inconsistency problem. + */ + yaffs_handle_defered_free(obj); +} + +/* yaffs_evict_inode combines into one operation what was previously done in + * yaffs_clear_inode() and yaffs_delete_inode() + * + */ +static void yaffs_evict_inode(struct inode *inode) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + int deleteme = 0; + + obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_evict_inode: ino %d, count %d %s", + (int)inode->i_ino, + atomic_read(&inode->i_count), + obj ? "object exists" : "null object"); + + if (!inode->i_nlink && !is_bad_inode(inode)) + deleteme = 1; + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + if (deleteme && obj) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + yaffs_del_obj(obj); + yaffs_gross_unlock(dev); + } + if (obj) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + yaffs_unstitch_obj(inode, obj); + yaffs_gross_unlock(dev); + } + +} + +static void yaffs_touch_super(struct yaffs_dev *dev) +{ + struct super_block *sb = yaffs_dev_to_lc(dev)->super; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_touch_super() sb = %p", sb); + if (sb) + sb->s_dirt = 1; +} + +static int yaffs_readpage_nolock(struct file *f, struct page *pg) +{ + /* Lifted from jffs2 */ + + struct yaffs_obj *obj; + unsigned char *pg_buf; + int ret; + + struct yaffs_dev *dev; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readpage_nolock at %08x, size %08x", + (unsigned)(pg->index << PAGE_CACHE_SHIFT), + (unsigned)PAGE_CACHE_SIZE); + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + BUG_ON(!PageLocked(pg)); + + pg_buf = kmap(pg); + /* FIXME: Can kmap fail? 
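It cannot: in mainline kernels of this vintage kmap() never returns NULL. For a lowmem page it simply returns the existing direct mapping, and for a highmem page it may sleep waiting for a free pkmap slot but still comes back with a valid address, so the missing error check is harmless; kmap_atomic() would be the variant to reach for if this path ever had to run in atomic context.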
*/ + + yaffs_gross_lock(dev); + + ret = yaffs_file_rd(obj, pg_buf, + pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); + + yaffs_gross_unlock(dev); + + if (ret >= 0) + ret = 0; + + if (ret) { + ClearPageUptodate(pg); + SetPageError(pg); + } else { + SetPageUptodate(pg); + ClearPageError(pg); + } + + flush_dcache_page(pg); + kunmap(pg); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage_nolock done"); + return ret; +} + +static int yaffs_readpage_unlock(struct file *f, struct page *pg) +{ + int ret = yaffs_readpage_nolock(f, pg); + UnlockPage(pg); + return ret; +} + +static int yaffs_readpage(struct file *f, struct page *pg) +{ + int ret; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage"); + ret = yaffs_readpage_unlock(f, pg); + yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage done"); + return ret; +} + +/* writepage inspired by/stolen from smbfs */ + +static int yaffs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct yaffs_dev *dev; + struct address_space *mapping = page->mapping; + struct inode *inode; + unsigned long end_index; + char *buffer; + struct yaffs_obj *obj; + int n_written = 0; + unsigned n_bytes; + loff_t i_size; + + if (!mapping) + BUG(); + inode = mapping->host; + if (!inode) + BUG(); + i_size = i_size_read(inode); + + end_index = i_size >> PAGE_CACHE_SHIFT; + + if (page->index < end_index) + n_bytes = PAGE_CACHE_SIZE; + else { + n_bytes = i_size & (PAGE_CACHE_SIZE - 1); + + if (page->index > end_index || !n_bytes) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_writepage at %08x, inode size = %08x!!!", + (unsigned)(page->index << PAGE_CACHE_SHIFT), + (unsigned)inode->i_size); + yaffs_trace(YAFFS_TRACE_OS, + " -> don't care!!"); + + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + return 0; + } + } + + if (n_bytes != PAGE_CACHE_SIZE) + zero_user_segment(page, n_bytes, PAGE_CACHE_SIZE); + + get_page(page); + + buffer = kmap(page); + + obj = yaffs_inode_to_obj(inode); + dev = obj->my_dev; + yaffs_gross_lock(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_writepage at %08x, size %08x", + (unsigned)(page->index << PAGE_CACHE_SHIFT), n_bytes); + yaffs_trace(YAFFS_TRACE_OS, + "writepag0: obj = %05x, ino = %05x", + (int)obj->variant.file_variant.file_size, (int)inode->i_size); + + n_written = yaffs_wr_file(obj, buffer, + page->index << PAGE_CACHE_SHIFT, n_bytes, 0); + + yaffs_touch_super(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "writepag1: obj = %05x, ino = %05x", + (int)obj->variant.file_variant.file_size, (int)inode->i_size); + + yaffs_gross_unlock(dev); + + kunmap(page); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + put_page(page); + + return (n_written == n_bytes) ? 0 : -ENOSPC; +} + +/* Space holding and freeing is done to ensure we have space available for + * write_begin/end. + * For now we just assume few parallel writes and check against a small + * number. + * Todo: need to do this with a counter to handle parallel reads better. + */ + +static ssize_t yaffs_hold_space(struct file *f) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + + int n_free_chunks; + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + n_free_chunks = yaffs_get_n_free_chunks(dev); + + yaffs_gross_unlock(dev); + + return (n_free_chunks > 20) ? 
1 : 0; +} + +static void yaffs_release_space(struct file *f) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + yaffs_gross_unlock(dev); +} + +static int yaffs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *pg = NULL; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + int ret = 0; + int space_held = 0; + + /* Get a page */ + pg = grab_cache_page_write_begin(mapping, index, flags); + + *pagep = pg; + if (!pg) { + ret = -ENOMEM; + goto out; + } + yaffs_trace(YAFFS_TRACE_OS, + "start yaffs_write_begin index %d(%x) uptodate %d", + (int)index, (int)index, Page_Uptodate(pg) ? 1 : 0); + + /* Get fs space */ + space_held = yaffs_hold_space(filp); + + if (!space_held) { + ret = -ENOSPC; + goto out; + } + + /* Update page if required */ + + if (!Page_Uptodate(pg)) + ret = yaffs_readpage_nolock(filp, pg); + + if (ret) + goto out; + + /* Happy path return */ + yaffs_trace(YAFFS_TRACE_OS, "end yaffs_write_begin - ok"); + + return 0; + +out: + yaffs_trace(YAFFS_TRACE_OS, + "end yaffs_write_begin fail returning %d", ret); + if (space_held) + yaffs_release_space(filp); + if (pg) { + unlock_page(pg); + page_cache_release(pg); + } + return ret; +} + +static ssize_t yaffs_file_write(struct file *f, const char *buf, size_t n, + loff_t * pos) +{ + struct yaffs_obj *obj; + int n_written, ipos; + struct inode *inode; + struct yaffs_dev *dev; + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + inode = f->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode) && f->f_flags & O_APPEND) + ipos = inode->i_size; + else + ipos = *pos; + + if (!obj) + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write: hey obj is null!"); + else + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write about to write writing %u(%x) bytes to object %d at %d(%x)", + (unsigned)n, (unsigned)n, obj->obj_id, ipos, ipos); + + n_written = yaffs_wr_file(obj, buf, ipos, n, 0); + + yaffs_touch_super(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write: %d(%x) bytes written", + (unsigned)n, (unsigned)n); + + if (n_written > 0) { + ipos += n_written; + *pos = ipos; + if (ipos > inode->i_size) { + inode->i_size = ipos; + inode->i_blocks = (ipos + 511) >> 9; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write size updated to %d bytes, %d blocks", + ipos, (int)(inode->i_blocks)); + } + + } + yaffs_gross_unlock(dev); + return (n_written == 0) && (n > 0) ? 
-ENOSPC : n_written; +} + +static int yaffs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdadata) +{ + int ret = 0; + void *addr, *kva; + uint32_t offset_into_page = pos & (PAGE_CACHE_SIZE - 1); + + kva = kmap(pg); + addr = kva + offset_into_page; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_write_end addr %p pos %x n_bytes %d", + addr, (unsigned)pos, copied); + + ret = yaffs_file_write(filp, addr, copied, &pos); + + if (ret != copied) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_write_end not same size ret %d copied %d", + ret, copied); + SetPageError(pg); + } + + kunmap(pg); + + yaffs_release_space(filp); + unlock_page(pg); + page_cache_release(pg); + return ret; +} + +static int yaffs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct yaffs_dev *dev = yaffs_dentry_to_obj(dentry)->my_dev; + struct super_block *sb = dentry->d_sb; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_statfs"); + + yaffs_gross_lock(dev); + + buf->f_type = YAFFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_namelen = 255; + + if (dev->data_bytes_per_chunk & (dev->data_bytes_per_chunk - 1)) { + /* Do this if chunk size is not a power of 2 */ + + uint64_t bytes_in_dev; + uint64_t bytes_free; + + bytes_in_dev = + ((uint64_t) + ((dev->param.end_block - dev->param.start_block + + 1))) * ((uint64_t) (dev->param.chunks_per_block * + dev->data_bytes_per_chunk)); + + do_div(bytes_in_dev, sb->s_blocksize); /* bytes_in_dev becomes the number of blocks */ + buf->f_blocks = bytes_in_dev; + + bytes_free = ((uint64_t) (yaffs_get_n_free_chunks(dev))) * + ((uint64_t) (dev->data_bytes_per_chunk)); + + do_div(bytes_free, sb->s_blocksize); + + buf->f_bfree = bytes_free; + + } else if (sb->s_blocksize > dev->data_bytes_per_chunk) { + + buf->f_blocks = + (dev->param.end_block - dev->param.start_block + 1) * + dev->param.chunks_per_block / + (sb->s_blocksize / dev->data_bytes_per_chunk); + buf->f_bfree = + yaffs_get_n_free_chunks(dev) / + (sb->s_blocksize / dev->data_bytes_per_chunk); + } else { + buf->f_blocks = + (dev->param.end_block - dev->param.start_block + 1) * + dev->param.chunks_per_block * + (dev->data_bytes_per_chunk / sb->s_blocksize); + + buf->f_bfree = + yaffs_get_n_free_chunks(dev) * + (dev->data_bytes_per_chunk / sb->s_blocksize); + } + + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_bavail = buf->f_bfree; + + yaffs_gross_unlock(dev); + return 0; +} + +static void yaffs_flush_inodes(struct super_block *sb) +{ + struct inode *iptr; + struct yaffs_obj *obj; + + list_for_each_entry(iptr, &sb->s_inodes, i_sb_list) { + obj = yaffs_inode_to_obj(iptr); + if (obj) { + yaffs_trace(YAFFS_TRACE_OS, + "flushing obj %d", obj->obj_id); + yaffs_flush_file(obj, 1, 0); + } + } +} + +static void yaffs_flush_super(struct super_block *sb, int do_checkpoint) +{ + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + if (!dev) + return; + + yaffs_flush_inodes(sb); + yaffs_update_dirty_dirs(dev); + yaffs_flush_whole_cache(dev); + if (do_checkpoint) + yaffs_checkpoint_save(dev); +} + +static unsigned yaffs_bg_gc_urgency(struct yaffs_dev *dev) +{ + unsigned erased_chunks = + dev->n_erased_blocks * dev->param.chunks_per_block; + struct yaffs_linux_context *context = yaffs_dev_to_lc(dev); + unsigned scattered = 0; /* Free chunks not in an erased block */ + + if (erased_chunks < dev->n_free_chunks) + scattered = (dev->n_free_chunks - erased_chunks); + + if (!context->bg_running) + return 0; + else if (scattered < (dev->param.chunks_per_block * 2)) + return 0; + else 
if (erased_chunks > dev->n_free_chunks / 2) + return 0; + else if (erased_chunks > dev->n_free_chunks / 4) + return 1; + else + return 2; +} + +static int yaffs_do_sync_fs(struct super_block *sb, int request_checkpoint) +{ + + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + unsigned int oneshot_checkpoint = (yaffs_auto_checkpoint & 4); + unsigned gc_urgent = yaffs_bg_gc_urgency(dev); + int do_checkpoint; + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC | YAFFS_TRACE_BACKGROUND, + "yaffs_do_sync_fs: gc-urgency %d %s %s%s", + gc_urgent, + sb->s_dirt ? "dirty" : "clean", + request_checkpoint ? "checkpoint requested" : "no checkpoint", + oneshot_checkpoint ? " one-shot" : ""); + + yaffs_gross_lock(dev); + do_checkpoint = ((request_checkpoint && !gc_urgent) || + oneshot_checkpoint) && !dev->is_checkpointed; + + if (sb->s_dirt || do_checkpoint) { + yaffs_flush_super(sb, !dev->is_checkpointed && do_checkpoint); + sb->s_dirt = 0; + if (oneshot_checkpoint) + yaffs_auto_checkpoint &= ~4; + } + yaffs_gross_unlock(dev); + + return 0; +} + +/* + * yaffs background thread functions . + * yaffs_bg_thread_fn() the thread function + * yaffs_bg_start() launches the background thread. + * yaffs_bg_stop() cleans up the background thread. + * + * NB: + * The thread should only run after the yaffs is initialised + * The thread should be stopped before yaffs is unmounted. + * The thread should not do any writing while the fs is in read only. + */ + +void yaffs_background_waker(unsigned long data) +{ + wake_up_process((struct task_struct *)data); +} + +static int yaffs_bg_thread_fn(void *data) +{ + struct yaffs_dev *dev = (struct yaffs_dev *)data; + struct yaffs_linux_context *context = yaffs_dev_to_lc(dev); + unsigned long now = jiffies; + unsigned long next_dir_update = now; + unsigned long next_gc = now; + unsigned long expires; + unsigned int urgency; + + int gc_result; + struct timer_list timer; + + yaffs_trace(YAFFS_TRACE_BACKGROUND, + "yaffs_background starting for dev %p", (void *)dev); + + set_freezable(); + while (context->bg_running) { + yaffs_trace(YAFFS_TRACE_BACKGROUND, "yaffs_background"); + + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + yaffs_gross_lock(dev); + + now = jiffies; + + if (time_after(now, next_dir_update) && yaffs_bg_enable) { + yaffs_update_dirty_dirs(dev); + next_dir_update = now + HZ; + } + + if (time_after(now, next_gc) && yaffs_bg_enable) { + if (!dev->is_checkpointed) { + urgency = yaffs_bg_gc_urgency(dev); + gc_result = yaffs_bg_gc(dev, urgency); + if (urgency > 1) + next_gc = now + HZ / 20 + 1; + else if (urgency > 0) + next_gc = now + HZ / 10 + 1; + else + next_gc = now + HZ * 2; + } else { + /* + * gc not running so set to next_dir_update + * to cut down on wake ups + */ + next_gc = next_dir_update; + } + } + yaffs_gross_unlock(dev); + expires = next_dir_update; + if (time_before(next_gc, expires)) + expires = next_gc; + if (time_before(expires, now)) + expires = now + HZ; + + Y_INIT_TIMER(&timer); + timer.expires = expires + 1; + timer.data = (unsigned long)current; + timer.function = yaffs_background_waker; + + set_current_state(TASK_INTERRUPTIBLE); + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + } + + return 0; +} + +static int yaffs_bg_start(struct yaffs_dev *dev) +{ + int retval = 0; + struct yaffs_linux_context *context = yaffs_dev_to_lc(dev); + + if (dev->read_only) + return -1; + + context->bg_running = 1; + + context->bg_thread = kthread_run(yaffs_bg_thread_fn, + (void *)dev, "yaffs-bg-%d", + context->mount_id); + + 
if (IS_ERR(context->bg_thread)) { + retval = PTR_ERR(context->bg_thread); + context->bg_thread = NULL; + context->bg_running = 0; + } + return retval; +} + +static void yaffs_bg_stop(struct yaffs_dev *dev) +{ + struct yaffs_linux_context *ctxt = yaffs_dev_to_lc(dev); + + ctxt->bg_running = 0; + + if (ctxt->bg_thread) { + kthread_stop(ctxt->bg_thread); + ctxt->bg_thread = NULL; + } +} + +static void yaffs_write_super(struct super_block *sb) +{ + unsigned request_checkpoint = (yaffs_auto_checkpoint >= 2); + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC | YAFFS_TRACE_BACKGROUND, + "yaffs_write_super%s", + request_checkpoint ? " checkpt" : ""); + + yaffs_do_sync_fs(sb, request_checkpoint); + +} + +static int yaffs_sync_fs(struct super_block *sb, int wait) +{ + unsigned request_checkpoint = (yaffs_auto_checkpoint >= 1); + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC, + "yaffs_sync_fs%s", request_checkpoint ? " checkpt" : ""); + + yaffs_do_sync_fs(sb, request_checkpoint); + + return 0; +} + + +static LIST_HEAD(yaffs_context_list); +struct mutex yaffs_context_lock; + + + +struct yaffs_options { + int inband_tags; + int skip_checkpoint_read; + int skip_checkpoint_write; + int no_cache; + int tags_ecc_on; + int tags_ecc_overridden; + int lazy_loading_enabled; + int lazy_loading_overridden; + int empty_lost_and_found; + int empty_lost_and_found_overridden; +}; + +#define MAX_OPT_LEN 30 +static int yaffs_parse_options(struct yaffs_options *options, + const char *options_str) +{ + char cur_opt[MAX_OPT_LEN + 1]; + int p; + int error = 0; + + /* Parse through the options which is a comma seperated list */ + + while (options_str && *options_str && !error) { + memset(cur_opt, 0, MAX_OPT_LEN + 1); + p = 0; + + while (*options_str == ',') + options_str++; + + while (*options_str && *options_str != ',') { + if (p < MAX_OPT_LEN) { + cur_opt[p] = *options_str; + p++; + } + options_str++; + } + + if (!strcmp(cur_opt, "inband-tags")) { + options->inband_tags = 1; + } else if (!strcmp(cur_opt, "tags-ecc-off")) { + options->tags_ecc_on = 0; + options->tags_ecc_overridden = 1; + } else if (!strcmp(cur_opt, "tags-ecc-on")) { + options->tags_ecc_on = 1; + options->tags_ecc_overridden = 1; + } else if (!strcmp(cur_opt, "lazy-loading-off")) { + options->lazy_loading_enabled = 0; + options->lazy_loading_overridden = 1; + } else if (!strcmp(cur_opt, "lazy-loading-on")) { + options->lazy_loading_enabled = 1; + options->lazy_loading_overridden = 1; + } else if (!strcmp(cur_opt, "empty-lost-and-found-off")) { + options->empty_lost_and_found = 0; + options->empty_lost_and_found_overridden = 1; + } else if (!strcmp(cur_opt, "empty-lost-and-found-on")) { + options->empty_lost_and_found = 1; + options->empty_lost_and_found_overridden = 1; + } else if (!strcmp(cur_opt, "no-cache")) { + options->no_cache = 1; + } else if (!strcmp(cur_opt, "no-checkpoint-read")) { + options->skip_checkpoint_read = 1; + } else if (!strcmp(cur_opt, "no-checkpoint-write")) { + options->skip_checkpoint_write = 1; + } else if (!strcmp(cur_opt, "no-checkpoint")) { + options->skip_checkpoint_read = 1; + options->skip_checkpoint_write = 1; + } else { + printk(KERN_INFO "yaffs: Bad mount option \"%s\"\n", + cur_opt); + error = 1; + } + } + + return error; +} + +static struct address_space_operations yaffs_file_address_operations = { + .readpage = yaffs_readpage, + .writepage = yaffs_writepage, + .write_begin = yaffs_write_begin, + .write_end = yaffs_write_end, +}; + + + +static const struct inode_operations yaffs_file_inode_operations = { + .setattr = 
yaffs_setattr, +#ifdef CONFIG_YAFFS_XATTR + .setxattr = yaffs_setxattr, + .getxattr = yaffs_getxattr, + .listxattr = yaffs_listxattr, + .removexattr = yaffs_removexattr, +#endif +}; + +static const struct inode_operations yaffs_symlink_inode_operations = { + .readlink = yaffs_readlink, + .follow_link = yaffs_follow_link, + .put_link = yaffs_put_link, + .setattr = yaffs_setattr, +#ifdef CONFIG_YAFFS_XATTR + .setxattr = yaffs_setxattr, + .getxattr = yaffs_getxattr, + .listxattr = yaffs_listxattr, + .removexattr = yaffs_removexattr, +#endif +}; + +static void yaffs_fill_inode_from_obj(struct inode *inode, + struct yaffs_obj *obj) +{ + if (inode && obj) { + + /* Check mode against the variant type and attempt to repair if broken. */ + u32 mode = obj->yst_mode; + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + if (!S_ISREG(mode)) { + obj->yst_mode &= ~S_IFMT; + obj->yst_mode |= S_IFREG; + } + + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + if (!S_ISLNK(mode)) { + obj->yst_mode &= ~S_IFMT; + obj->yst_mode |= S_IFLNK; + } + + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + if (!S_ISDIR(mode)) { + obj->yst_mode &= ~S_IFMT; + obj->yst_mode |= S_IFDIR; + } + + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + case YAFFS_OBJECT_TYPE_HARDLINK: + case YAFFS_OBJECT_TYPE_SPECIAL: + default: + /* TODO? */ + break; + } + + inode->i_flags |= S_NOATIME; + + inode->i_ino = obj->obj_id; + inode->i_mode = obj->yst_mode; + inode->i_uid = obj->yst_uid; + inode->i_gid = obj->yst_gid; + + inode->i_rdev = old_decode_dev(obj->yst_rdev); + + inode->i_atime.tv_sec = (time_t) (obj->yst_atime); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_sec = (time_t) obj->yst_mtime; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_sec = (time_t) obj->yst_ctime; + inode->i_ctime.tv_nsec = 0; + inode->i_size = yaffs_get_obj_length(obj); + inode->i_blocks = (inode->i_size + 511) >> 9; + + inode->i_nlink = yaffs_get_obj_link_count(obj); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_fill_inode mode %x uid %d gid %d size %d count %d", + inode->i_mode, inode->i_uid, inode->i_gid, + (int)inode->i_size, atomic_read(&inode->i_count)); + + switch (obj->yst_mode & S_IFMT) { + default: /* fifo, device or socket */ + init_special_inode(inode, obj->yst_mode, + old_decode_dev(obj->yst_rdev)); + break; + case S_IFREG: /* file */ + inode->i_op = &yaffs_file_inode_operations; + inode->i_fop = &yaffs_file_operations; + inode->i_mapping->a_ops = + &yaffs_file_address_operations; + break; + case S_IFDIR: /* directory */ + inode->i_op = &yaffs_dir_inode_operations; + inode->i_fop = &yaffs_dir_operations; + break; + case S_IFLNK: /* symlink */ + inode->i_op = &yaffs_symlink_inode_operations; + break; + } + + yaffs_inode_to_obj_lv(inode) = obj; + + obj->my_inode = inode; + + } else { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_fill_inode invalid parameters"); + } +} + +static void yaffs_put_super(struct super_block *sb) +{ + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_put_super"); + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_BACKGROUND, + "Shutting down yaffs background thread"); + yaffs_bg_stop(dev); + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_BACKGROUND, + "yaffs background thread shut down"); + + yaffs_gross_lock(dev); + + yaffs_flush_super(sb, 1); + + if (yaffs_dev_to_lc(dev)->put_super_fn) + yaffs_dev_to_lc(dev)->put_super_fn(sb); + + yaffs_deinitialise(dev); + + yaffs_gross_unlock(dev); + mutex_lock(&yaffs_context_lock); + list_del_init(&(yaffs_dev_to_lc(dev)->context_list)); + mutex_unlock(&yaffs_context_lock); + 
+ if (yaffs_dev_to_lc(dev)->spare_buffer) { + kfree(yaffs_dev_to_lc(dev)->spare_buffer); + yaffs_dev_to_lc(dev)->spare_buffer = NULL; + } + + kfree(dev); +} + +static void yaffs_mtd_put_super(struct super_block *sb) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(yaffs_super_to_dev(sb)); + + if (mtd->sync) + mtd->sync(mtd); + + put_mtd_device(mtd); +} + +static const struct super_operations yaffs_super_ops = { + .statfs = yaffs_statfs, + .put_super = yaffs_put_super, + .evict_inode = yaffs_evict_inode, + .sync_fs = yaffs_sync_fs, + .write_super = yaffs_write_super, +}; + +static struct super_block *yaffs_internal_read_super(int yaffs_version, + struct super_block *sb, + void *data, int silent) +{ + int n_blocks; + struct inode *inode = NULL; + struct dentry *root; + struct yaffs_dev *dev = 0; + char devname_buf[BDEVNAME_SIZE + 1]; + struct mtd_info *mtd; + int err; + char *data_str = (char *)data; + struct yaffs_linux_context *context = NULL; + struct yaffs_param *param; + + int read_only = 0; + + struct yaffs_options options; + + unsigned mount_id; + int found; + struct yaffs_linux_context *context_iterator; + struct list_head *l; + + sb->s_magic = YAFFS_MAGIC; + sb->s_op = &yaffs_super_ops; + sb->s_flags |= MS_NOATIME; + + read_only = ((sb->s_flags & MS_RDONLY) != 0); + + sb->s_export_op = &yaffs_export_ops; + + if (!sb) + printk(KERN_INFO "yaffs: sb is NULL\n"); + else if (!sb->s_dev) + printk(KERN_INFO "yaffs: sb->s_dev is NULL\n"); + else if (!yaffs_devname(sb, devname_buf)) + printk(KERN_INFO "yaffs: devname is NULL\n"); + else + printk(KERN_INFO "yaffs: dev is %d name is \"%s\" %s\n", + sb->s_dev, + yaffs_devname(sb, devname_buf), read_only ? "ro" : "rw"); + + if (!data_str) + data_str = ""; + + printk(KERN_INFO "yaffs: passed flags \"%s\"\n", data_str); + + memset(&options, 0, sizeof(options)); + + if (yaffs_parse_options(&options, data_str)) { + /* Option parsing failed */ + return NULL; + } + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_read_super: Using yaffs%d", yaffs_version); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_read_super: block size %d", (int)(sb->s_blocksize)); + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Attempting MTD mount of %u.%u,\"%s\"", + MAJOR(sb->s_dev), MINOR(sb->s_dev), + yaffs_devname(sb, devname_buf)); + + /* Check it's an mtd device..... 
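A yaffs mount is expected to sit on an mtdblock device (e.g. /dev/mtdblock3 for MTD partition 3), so the superblock's device major must be MTD_BLOCK_MAJOR and the minor doubles as the MTD index handed to get_mtd_device() just below.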
*/ + if (MAJOR(sb->s_dev) != MTD_BLOCK_MAJOR) + return NULL; /* This isn't an mtd device */ + + /* Get the device */ + mtd = get_mtd_device(NULL, MINOR(sb->s_dev)); + if (!mtd) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device #%u doesn't appear to exist", + MINOR(sb->s_dev)); + return NULL; + } + /* Check it's NAND */ + if (mtd->type != MTD_NANDFLASH) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device is not NAND it's type %d", + mtd->type); + return NULL; + } + + yaffs_trace(YAFFS_TRACE_OS, " erase %p", mtd->erase); + yaffs_trace(YAFFS_TRACE_OS, " read %p", mtd->read); + yaffs_trace(YAFFS_TRACE_OS, " write %p", mtd->write); + yaffs_trace(YAFFS_TRACE_OS, " readoob %p", mtd->read_oob); + yaffs_trace(YAFFS_TRACE_OS, " writeoob %p", mtd->write_oob); + yaffs_trace(YAFFS_TRACE_OS, " block_isbad %p", mtd->block_isbad); + yaffs_trace(YAFFS_TRACE_OS, " block_markbad %p", mtd->block_markbad); + yaffs_trace(YAFFS_TRACE_OS, " %s %d", WRITE_SIZE_STR, WRITE_SIZE(mtd)); + yaffs_trace(YAFFS_TRACE_OS, " oobsize %d", mtd->oobsize); + yaffs_trace(YAFFS_TRACE_OS, " erasesize %d", mtd->erasesize); + yaffs_trace(YAFFS_TRACE_OS, " size %lld", mtd->size); + +#ifdef CONFIG_YAFFS_AUTO_YAFFS2 + + if (yaffs_version == 1 && WRITE_SIZE(mtd) >= 2048) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "auto selecting yaffs2"); + yaffs_version = 2; + } + + /* Added NCB 26/5/2006 for completeness */ + if (yaffs_version == 2 && !options.inband_tags + && WRITE_SIZE(mtd) == 512) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "auto selecting yaffs1"); + yaffs_version = 1; + } +#endif + + if (yaffs_version == 2) { + /* Check for version 2 style functions */ + if (!mtd->erase || + !mtd->block_isbad || + !mtd->block_markbad || + !mtd->read || + !mtd->write || !mtd->read_oob || !mtd->write_oob) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not support required functions"); + return NULL; + } + + if ((WRITE_SIZE(mtd) < YAFFS_MIN_YAFFS2_CHUNK_SIZE || + mtd->oobsize < YAFFS_MIN_YAFFS2_SPARE_SIZE) && + !options.inband_tags) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not have the right page sizes"); + return NULL; + } + } else { + /* Check for V1 style functions */ + if (!mtd->erase || + !mtd->read || + !mtd->write || !mtd->read_oob || !mtd->write_oob) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not support required functions"); + return NULL; + } + + if (WRITE_SIZE(mtd) < YAFFS_BYTES_PER_CHUNK || + mtd->oobsize != YAFFS_BYTES_PER_SPARE) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not support have the right page sizes"); + return NULL; + } + } + + /* OK, so if we got here, we have an MTD that's NAND and looks + * like it has the right capabilities + * Set the struct yaffs_dev up for mtd + */ + + if (!read_only && !(mtd->flags & MTD_WRITEABLE)) { + read_only = 1; + printk(KERN_INFO + "yaffs: mtd is read only, setting superblock read only"); + sb->s_flags |= MS_RDONLY; + } + + dev = kmalloc(sizeof(struct yaffs_dev), GFP_KERNEL); + context = kmalloc(sizeof(struct yaffs_linux_context), GFP_KERNEL); + + if (!dev || !context) { + if (dev) + kfree(dev); + if (context) + kfree(context); + dev = NULL; + context = NULL; + } + + if (!dev) { + /* Deep shit could not allocate device structure */ + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs_read_super failed trying to allocate yaffs_dev"); + return NULL; + } + memset(dev, 0, sizeof(struct yaffs_dev)); + param = &(dev->param); + + memset(context, 0, sizeof(struct yaffs_linux_context)); + dev->os_context = context; + INIT_LIST_HEAD(&(context->context_list)); + context->dev = dev; + 
context->super = sb; + + dev->read_only = read_only; + + sb->s_fs_info = dev; + + dev->driver_context = mtd; + param->name = mtd->name; + + /* Set up the memory size parameters.... */ + + n_blocks = + YCALCBLOCKS(mtd->size, + (YAFFS_CHUNKS_PER_BLOCK * YAFFS_BYTES_PER_CHUNK)); + + param->start_block = 0; + param->end_block = n_blocks - 1; + param->chunks_per_block = YAFFS_CHUNKS_PER_BLOCK; + param->total_bytes_per_chunk = YAFFS_BYTES_PER_CHUNK; + param->n_reserved_blocks = 5; + param->n_caches = (options.no_cache) ? 0 : 10; + param->inband_tags = options.inband_tags; + +#ifdef CONFIG_YAFFS_DISABLE_LAZY_LOAD + param->disable_lazy_load = 1; +#endif +#ifdef CONFIG_YAFFS_XATTR + param->enable_xattr = 1; +#endif + if (options.lazy_loading_overridden) + param->disable_lazy_load = !options.lazy_loading_enabled; + +#ifdef CONFIG_YAFFS_DISABLE_TAGS_ECC + param->no_tags_ecc = 1; +#endif + +#ifdef CONFIG_YAFFS_DISABLE_BACKGROUND +#else + param->defered_dir_update = 1; +#endif + + if (options.tags_ecc_overridden) + param->no_tags_ecc = !options.tags_ecc_on; + +#ifdef CONFIG_YAFFS_EMPTY_LOST_AND_FOUND + param->empty_lost_n_found = 1; +#endif + +#ifdef CONFIG_YAFFS_DISABLE_BLOCK_REFRESHING + param->refresh_period = 0; +#else + param->refresh_period = 500; +#endif + +#ifdef CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED + param->always_check_erased = 1; +#endif + + if (options.empty_lost_and_found_overridden) + param->empty_lost_n_found = options.empty_lost_and_found; + + /* ... and the functions. */ + if (yaffs_version == 2) { + param->write_chunk_tags_fn = nandmtd2_write_chunk_tags; + param->read_chunk_tags_fn = nandmtd2_read_chunk_tags; + param->bad_block_fn = nandmtd2_mark_block_bad; + param->query_block_fn = nandmtd2_query_block; + yaffs_dev_to_lc(dev)->spare_buffer = + kmalloc(mtd->oobsize, GFP_NOFS); + param->is_yaffs2 = 1; + param->total_bytes_per_chunk = mtd->writesize; + param->chunks_per_block = mtd->erasesize / mtd->writesize; + n_blocks = YCALCBLOCKS(mtd->size, mtd->erasesize); + + param->start_block = 0; + param->end_block = n_blocks - 1; + } else { + /* use the MTD interface in yaffs_mtdif1.c */ + param->write_chunk_tags_fn = nandmtd1_write_chunk_tags; + param->read_chunk_tags_fn = nandmtd1_read_chunk_tags; + param->bad_block_fn = nandmtd1_mark_block_bad; + param->query_block_fn = nandmtd1_query_block; + param->is_yaffs2 = 0; + } + /* ... and common functions */ + param->erase_fn = nandmtd_erase_block; + param->initialise_flash_fn = nandmtd_initialise; + + yaffs_dev_to_lc(dev)->put_super_fn = yaffs_mtd_put_super; + + param->sb_dirty_fn = yaffs_touch_super; + param->gc_control = yaffs_gc_control_callback; + + yaffs_dev_to_lc(dev)->super = sb; + +#ifndef CONFIG_YAFFS_DOES_ECC + param->use_nand_ecc = 1; +#endif + + param->skip_checkpt_rd = options.skip_checkpoint_read; + param->skip_checkpt_wr = options.skip_checkpoint_write; + + mutex_lock(&yaffs_context_lock); + /* Get a mount id */ + found = 0; + for (mount_id = 0; !found; mount_id++) { + found = 1; + list_for_each(l, &yaffs_context_list) { + context_iterator = + list_entry(l, struct yaffs_linux_context, + context_list); + if (context_iterator->mount_id == mount_id) + found = 0; + } + } + context->mount_id = mount_id; + + list_add_tail(&(yaffs_dev_to_lc(dev)->context_list), + &yaffs_context_list); + mutex_unlock(&yaffs_context_lock); + + /* Directory search handling... 
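This seeds the per-mount list of live readdir cursors and registers yaffs_remove_obj_callback() with the core, so that deleting an object advances any search context currently parked on it (the mechanism described above yaffs_readdir()).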
*/ + INIT_LIST_HEAD(&(yaffs_dev_to_lc(dev)->search_contexts)); + param->remove_obj_fn = yaffs_remove_obj_callback; + + mutex_init(&(yaffs_dev_to_lc(dev)->gross_lock)); + + yaffs_gross_lock(dev); + + err = yaffs_guts_initialise(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_read_super: guts initialised %s", + (err == YAFFS_OK) ? "OK" : "FAILED"); + + if (err == YAFFS_OK) + yaffs_bg_start(dev); + + if (!context->bg_thread) + param->defered_dir_update = 0; + + /* Release lock before yaffs_get_inode() */ + yaffs_gross_unlock(dev); + + /* Create root inode */ + if (err == YAFFS_OK) + inode = yaffs_get_inode(sb, S_IFDIR | 0755, 0, yaffs_root(dev)); + + if (!inode) + return NULL; + + inode->i_op = &yaffs_dir_inode_operations; + inode->i_fop = &yaffs_dir_operations; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: got root inode"); + + root = d_alloc_root(inode); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: d_alloc_root done"); + + if (!root) { + iput(inode); + return NULL; + } + sb->s_root = root; + sb->s_dirt = !dev->is_checkpointed; + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs_read_super: is_checkpointed %d", + dev->is_checkpointed); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: done"); + return sb; +} + +static int yaffs_internal_read_super_mtd(struct super_block *sb, void *data, + int silent) +{ + return yaffs_internal_read_super(1, sb, data, silent) ? 0 : -EINVAL; +} + +static int yaffs_read_super(struct file_system_type *fs, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + + return get_sb_bdev(fs, flags, dev_name, data, + yaffs_internal_read_super_mtd, mnt); +} + +static struct file_system_type yaffs_fs_type = { + .owner = THIS_MODULE, + .name = "yaffs", + .get_sb = yaffs_read_super, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +#ifdef CONFIG_YAFFS_YAFFS2 + +static int yaffs2_internal_read_super_mtd(struct super_block *sb, void *data, + int silent) +{ + return yaffs_internal_read_super(2, sb, data, silent) ? 0 : -EINVAL; +} + +static int yaffs2_read_super(struct file_system_type *fs, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) +{ + return get_sb_bdev(fs, flags, dev_name, data, + yaffs2_internal_read_super_mtd, mnt); +} + +static struct file_system_type yaffs2_fs_type = { + .owner = THIS_MODULE, + .name = "yaffs2", + .get_sb = yaffs2_read_super, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#endif /* CONFIG_YAFFS_YAFFS2 */ + +static struct proc_dir_entry *my_proc_entry; + +static char *yaffs_dump_dev_part0(char *buf, struct yaffs_dev *dev) +{ + struct yaffs_param *param = &dev->param; + buf += sprintf(buf, "start_block........... %d\n", param->start_block); + buf += sprintf(buf, "end_block............. %d\n", param->end_block); + buf += sprintf(buf, "total_bytes_per_chunk. %d\n", + param->total_bytes_per_chunk); + buf += sprintf(buf, "use_nand_ecc.......... %d\n", + param->use_nand_ecc); + buf += sprintf(buf, "no_tags_ecc........... %d\n", param->no_tags_ecc); + buf += sprintf(buf, "is_yaffs2............. %d\n", param->is_yaffs2); + buf += sprintf(buf, "inband_tags........... %d\n", param->inband_tags); + buf += sprintf(buf, "empty_lost_n_found.... %d\n", + param->empty_lost_n_found); + buf += sprintf(buf, "disable_lazy_load..... %d\n", + param->disable_lazy_load); + buf += sprintf(buf, "refresh_period........ %d\n", + param->refresh_period); + buf += sprintf(buf, "n_caches.............. %d\n", param->n_caches); + buf += sprintf(buf, "n_reserved_blocks..... 
%d\n", + param->n_reserved_blocks); + buf += sprintf(buf, "always_check_erased... %d\n", + param->always_check_erased); + + return buf; +} + +static char *yaffs_dump_dev_part1(char *buf, struct yaffs_dev *dev) +{ + buf += + sprintf(buf, "data_bytes_per_chunk.. %d\n", + dev->data_bytes_per_chunk); + buf += sprintf(buf, "chunk_grp_bits........ %d\n", dev->chunk_grp_bits); + buf += sprintf(buf, "chunk_grp_size........ %d\n", dev->chunk_grp_size); + buf += + sprintf(buf, "n_erased_blocks....... %d\n", dev->n_erased_blocks); + buf += + sprintf(buf, "blocks_in_checkpt..... %d\n", dev->blocks_in_checkpt); + buf += sprintf(buf, "\n"); + buf += sprintf(buf, "n_tnodes.............. %d\n", dev->n_tnodes); + buf += sprintf(buf, "n_obj................. %d\n", dev->n_obj); + buf += sprintf(buf, "n_free_chunks......... %d\n", dev->n_free_chunks); + buf += sprintf(buf, "\n"); + buf += sprintf(buf, "n_page_writes......... %u\n", dev->n_page_writes); + buf += sprintf(buf, "n_page_reads.......... %u\n", dev->n_page_reads); + buf += sprintf(buf, "n_erasures............ %u\n", dev->n_erasures); + buf += sprintf(buf, "n_gc_copies........... %u\n", dev->n_gc_copies); + buf += sprintf(buf, "all_gcs............... %u\n", dev->all_gcs); + buf += + sprintf(buf, "passive_gc_count...... %u\n", dev->passive_gc_count); + buf += + sprintf(buf, "oldest_dirty_gc_count. %u\n", + dev->oldest_dirty_gc_count); + buf += sprintf(buf, "n_gc_blocks........... %u\n", dev->n_gc_blocks); + buf += sprintf(buf, "bg_gcs................ %u\n", dev->bg_gcs); + buf += + sprintf(buf, "n_retired_writes...... %u\n", dev->n_retired_writes); + buf += + sprintf(buf, "n_retired_blocks...... %u\n", dev->n_retired_blocks); + buf += sprintf(buf, "n_ecc_fixed........... %u\n", dev->n_ecc_fixed); + buf += sprintf(buf, "n_ecc_unfixed......... %u\n", dev->n_ecc_unfixed); + buf += + sprintf(buf, "n_tags_ecc_fixed...... %u\n", dev->n_tags_ecc_fixed); + buf += + sprintf(buf, "n_tags_ecc_unfixed.... %u\n", + dev->n_tags_ecc_unfixed); + buf += sprintf(buf, "cache_hits............ %u\n", dev->cache_hits); + buf += + sprintf(buf, "n_deleted_files....... %u\n", dev->n_deleted_files); + buf += + sprintf(buf, "n_unlinked_files...... %u\n", dev->n_unlinked_files); + buf += sprintf(buf, "refresh_count......... %u\n", dev->refresh_count); + buf += sprintf(buf, "n_bg_deletions........ %u\n", dev->n_bg_deletions); + + return buf; +} + +static int yaffs_proc_read(char *page, + char **start, + off_t offset, int count, int *eof, void *data) +{ + struct list_head *item; + char *buf = page; + int step = offset; + int n = 0; + + /* Get proc_file_read() to step 'offset' by one on each sucessive call. + * We use 'offset' (*ppos) to indicate where we are in dev_list. + * This also assumes the user has posted a read buffer large + * enough to hold the complete output; but that's life in /proc. + */ + + *(int *)start = 1; + + /* Print header first */ + if (step == 0) + buf += sprintf(buf, "YAFFS built:" __DATE__ " " __TIME__ "\n"); + else if (step == 1) + buf += sprintf(buf, "\n"); + else { + step -= 2; + + mutex_lock(&yaffs_context_lock); + + /* Locate and print the Nth entry. Order N-squared but N is small. 
*/ + list_for_each(item, &yaffs_context_list) { + struct yaffs_linux_context *dc = + list_entry(item, struct yaffs_linux_context, + context_list); + struct yaffs_dev *dev = dc->dev; + + if (n < (step & ~1)) { + n += 2; + continue; + } + if ((step & 1) == 0) { + buf += + sprintf(buf, "\nDevice %d \"%s\"\n", n, + dev->param.name); + buf = yaffs_dump_dev_part0(buf, dev); + } else { + buf = yaffs_dump_dev_part1(buf, dev); + } + + break; + } + mutex_unlock(&yaffs_context_lock); + } + + return buf - page < count ? buf - page : count; +} + + +/** + * Set the verbosity of the warnings and error messages. + * + * Note that the names can only be a..z or _ with the current code. + */ + +static struct { + char *mask_name; + unsigned mask_bitfield; +} mask_flags[] = { + {"allocate", YAFFS_TRACE_ALLOCATE}, + {"always", YAFFS_TRACE_ALWAYS}, + {"background", YAFFS_TRACE_BACKGROUND}, + {"bad_blocks", YAFFS_TRACE_BAD_BLOCKS}, + {"buffers", YAFFS_TRACE_BUFFERS}, + {"bug", YAFFS_TRACE_BUG}, + {"checkpt", YAFFS_TRACE_CHECKPOINT}, + {"deletion", YAFFS_TRACE_DELETION}, + {"erase", YAFFS_TRACE_ERASE}, + {"error", YAFFS_TRACE_ERROR}, + {"gc_detail", YAFFS_TRACE_GC_DETAIL}, + {"gc", YAFFS_TRACE_GC}, + {"lock", YAFFS_TRACE_LOCK}, + {"mtd", YAFFS_TRACE_MTD}, + {"nandaccess", YAFFS_TRACE_NANDACCESS}, + {"os", YAFFS_TRACE_OS}, + {"scan_debug", YAFFS_TRACE_SCAN_DEBUG}, + {"scan", YAFFS_TRACE_SCAN}, + {"mount", YAFFS_TRACE_MOUNT}, + {"tracing", YAFFS_TRACE_TRACING}, + {"sync", YAFFS_TRACE_SYNC}, + {"write", YAFFS_TRACE_WRITE}, + {"verify", YAFFS_TRACE_VERIFY}, + {"verify_nand", YAFFS_TRACE_VERIFY_NAND}, + {"verify_full", YAFFS_TRACE_VERIFY_FULL}, + {"verify_all", YAFFS_TRACE_VERIFY_ALL}, + {"all", 0xffffffff}, + {"none", 0}, + {NULL, 0}, +}; + +#define MAX_MASK_NAME_LENGTH 40 +static int yaffs_proc_write_trace_options(struct file *file, const char *buf, + unsigned long count, void *data) +{ + unsigned rg = 0, mask_bitfield; + char *end; + char *mask_name; + const char *x; + char substring[MAX_MASK_NAME_LENGTH + 1]; + int i; + int done = 0; + int add, len = 0; + int pos = 0; + + rg = yaffs_trace_mask; + + while (!done && (pos < count)) { + done = 1; + while ((pos < count) && isspace(buf[pos])) + pos++; + + switch (buf[pos]) { + case '+': + case '-': + case '=': + add = buf[pos]; + pos++; + break; + + default: + add = ' '; + break; + } + mask_name = NULL; + + mask_bitfield = simple_strtoul(buf + pos, &end, 0); + + if (end > buf + pos) { + mask_name = "numeral"; + len = end - (buf + pos); + pos += len; + done = 0; + } else { + for (x = buf + pos, i = 0; + (*x == '_' || (*x >= 'a' && *x <= 'z')) && + i < MAX_MASK_NAME_LENGTH; x++, i++, pos++) + substring[i] = *x; + substring[i] = '\0'; + + for (i = 0; mask_flags[i].mask_name != NULL; i++) { + if (strcmp(substring, mask_flags[i].mask_name) + == 0) { + mask_name = mask_flags[i].mask_name; + mask_bitfield = + mask_flags[i].mask_bitfield; + done = 0; + break; + } + } + } + + if (mask_name != NULL) { + done = 0; + switch (add) { + case '-': + rg &= ~mask_bitfield; + break; + case '+': + rg |= mask_bitfield; + break; + case '=': + rg = mask_bitfield; + break; + default: + rg |= mask_bitfield; + break; + } + } + } + + yaffs_trace_mask = rg | YAFFS_TRACE_ALWAYS; + + printk(KERN_DEBUG "new trace = 0x%08X\n", yaffs_trace_mask); + + if (rg & YAFFS_TRACE_ALWAYS) { + for (i = 0; mask_flags[i].mask_name != NULL; i++) { + char flag; + flag = ((rg & mask_flags[i].mask_bitfield) == + mask_flags[i].mask_bitfield) ? 
'+' : '-'; + printk(KERN_DEBUG "%c%s\n", flag, + mask_flags[i].mask_name); + } + } + + return count; +} + +static int yaffs_proc_write(struct file *file, const char *buf, + unsigned long count, void *data) +{ + return yaffs_proc_write_trace_options(file, buf, count, data); +} + +/* Stuff to handle installation of file systems */ +struct file_system_to_install { + struct file_system_type *fst; + int installed; +}; + +static struct file_system_to_install fs_to_install[] = { + {&yaffs_fs_type, 0}, + {&yaffs2_fs_type, 0}, + {NULL, 0} +}; + +static int __init init_yaffs_fs(void) +{ + int error = 0; + struct file_system_to_install *fsinst; + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs built " __DATE__ " " __TIME__ " Installing."); + +#ifdef CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED + yaffs_trace(YAFFS_TRACE_ALWAYS, + "\n\nYAFFS-WARNING CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED selected.\n\n\n"); +#endif + + mutex_init(&yaffs_context_lock); + + /* Install the proc_fs entries */ + my_proc_entry = create_proc_entry("yaffs", + S_IRUGO | S_IFREG, YPROC_ROOT); + + if (my_proc_entry) { + my_proc_entry->write_proc = yaffs_proc_write; + my_proc_entry->read_proc = yaffs_proc_read; + my_proc_entry->data = NULL; + } else { + return -ENOMEM; + } + + + /* Now add the file system entries */ + + fsinst = fs_to_install; + + while (fsinst->fst && !error) { + error = register_filesystem(fsinst->fst); + if (!error) + fsinst->installed = 1; + fsinst++; + } + + /* Any errors? uninstall */ + if (error) { + fsinst = fs_to_install; + + while (fsinst->fst) { + if (fsinst->installed) { + unregister_filesystem(fsinst->fst); + fsinst->installed = 0; + } + fsinst++; + } + } + + return error; +} + +static void __exit exit_yaffs_fs(void) +{ + + struct file_system_to_install *fsinst; + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs built " __DATE__ " " __TIME__ " removing."); + + remove_proc_entry("yaffs", YPROC_ROOT); + + fsinst = fs_to_install; + + while (fsinst->fst) { + if (fsinst->installed) { + unregister_filesystem(fsinst->fst); + fsinst->installed = 0; + } + fsinst++; + } +} + +module_init(init_yaffs_fs) + module_exit(exit_yaffs_fs) + + MODULE_DESCRIPTION("YAFFS2 - a NAND specific flash file system"); +MODULE_AUTHOR("Charles Manning, Aleph One Ltd., 2002-2010"); +MODULE_LICENSE("GPL"); diff --git a/fs/yaffs2/yaffs_yaffs1.c b/fs/yaffs2/yaffs_yaffs1.c new file mode 100644 index 00000000..9eb60308 --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs1.c @@ -0,0 +1,433 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_yaffs1.h" +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_bitmap.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_nand.h" +#include "yaffs_attribs.h" + +int yaffs1_scan(struct yaffs_dev *dev) +{ + struct yaffs_ext_tags tags; + int blk; + int result; + + int chunk; + int c; + int deleted; + enum yaffs_block_state state; + struct yaffs_obj *hard_list = NULL; + struct yaffs_block_info *bi; + u32 seq_number; + struct yaffs_obj_hdr *oh; + struct yaffs_obj *in; + struct yaffs_obj *parent; + + int alloc_failed = 0; + + struct yaffs_shadow_fixer *shadow_fixers = NULL; + + u8 *chunk_data; + + yaffs_trace(YAFFS_TRACE_SCAN, + "yaffs1_scan starts intstartblk %d intendblk %d...", + dev->internal_start_block, dev->internal_end_block); + + chunk_data = yaffs_get_temp_buffer(dev, __LINE__); + + dev->seq_number = YAFFS_LOWEST_SEQUENCE_NUMBER; + + /* Scan all the blocks to determine their state */ + bi = dev->block_info; + for (blk = dev->internal_start_block; blk <= dev->internal_end_block; + blk++) { + yaffs_clear_chunk_bits(dev, blk); + bi->pages_in_use = 0; + bi->soft_del_pages = 0; + + yaffs_query_init_block_state(dev, blk, &state, &seq_number); + + bi->block_state = state; + bi->seq_number = seq_number; + + if (bi->seq_number == YAFFS_SEQUENCE_BAD_BLOCK) + bi->block_state = state = YAFFS_BLOCK_STATE_DEAD; + + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, + "Block scanning block %d state %d seq %d", + blk, state, seq_number); + + if (state == YAFFS_BLOCK_STATE_DEAD) { + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "block %d is bad", blk); + } else if (state == YAFFS_BLOCK_STATE_EMPTY) { + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, "Block empty "); + dev->n_erased_blocks++; + dev->n_free_chunks += dev->param.chunks_per_block; + } + bi++; + } + + /* For each block.... */ + for (blk = dev->internal_start_block; + !alloc_failed && blk <= dev->internal_end_block; blk++) { + + cond_resched(); + + bi = yaffs_get_block_info(dev, blk); + state = bi->block_state; + + deleted = 0; + + /* For each chunk in each block that needs scanning.... */ + for (c = 0; !alloc_failed && c < dev->param.chunks_per_block && + state == YAFFS_BLOCK_STATE_NEEDS_SCANNING; c++) { + /* Read the tags and decide what to do */ + chunk = blk * dev->param.chunks_per_block + c; + + result = yaffs_rd_chunk_tags_nand(dev, chunk, NULL, + &tags); + + /* Let's have a good look at this chunk... */ + + if (tags.ecc_result == YAFFS_ECC_RESULT_UNFIXED + || tags.is_deleted) { + /* YAFFS1 only... + * A deleted chunk + */ + deleted++; + dev->n_free_chunks++; + /*T((" %d %d deleted\n",blk,c)); */ + } else if (!tags.chunk_used) { + /* An unassigned chunk in the block + * This means that either the block is empty or + * this is the one being allocated from + */ + + if (c == 0) { + /* We're looking at the first chunk in the block so the block is unused */ + state = YAFFS_BLOCK_STATE_EMPTY; + dev->n_erased_blocks++; + } else { + /* this is the block being allocated from */ + yaffs_trace(YAFFS_TRACE_SCAN, + " Allocating from %d %d", + blk, c); + state = YAFFS_BLOCK_STATE_ALLOCATING; + dev->alloc_block = blk; + dev->alloc_page = c; + dev->alloc_block_finder = blk; + /* Set block finder here to encourage the allocator to go forth from here. */ + + } + + dev->n_free_chunks += + (dev->param.chunks_per_block - c); + } else if (tags.chunk_id > 0) { + /* chunk_id > 0 so it is a data chunk... 
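+				 * (For illustration, assuming 2048-byte data chunks: a chunk tagged
+				 * chunk_id = 3 carrying n_bytes = 100 ends at
+				 * endpos = (3 - 1) * 2048 + 100 = 4196, so scanned_size, and
+				 * file_size when use_header_file_size is not set, is advanced to
+				 * at least 4196 below.)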
*/ + unsigned int endpos; + + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + in = yaffs_find_or_create_by_number(dev, + tags.obj_id, + YAFFS_OBJECT_TYPE_FILE); + /* PutChunkIntoFile checks for a clash (two data chunks with + * the same chunk_id). + */ + + if (!in) + alloc_failed = 1; + + if (in) { + if (!yaffs_put_chunk_in_file + (in, tags.chunk_id, chunk, 1)) + alloc_failed = 1; + } + + endpos = + (tags.chunk_id - + 1) * dev->data_bytes_per_chunk + + tags.n_bytes; + if (in + && in->variant_type == + YAFFS_OBJECT_TYPE_FILE + && in->variant.file_variant.scanned_size < + endpos) { + in->variant.file_variant.scanned_size = + endpos; + if (!dev->param.use_header_file_size) { + in->variant. + file_variant.file_size = + in->variant. + file_variant.scanned_size; + } + + } + /* T((" %d %d data %d %d\n",blk,c,tags.obj_id,tags.chunk_id)); */ + } else { + /* chunk_id == 0, so it is an ObjectHeader. + * Thus, we read in the object header and make the object + */ + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + result = yaffs_rd_chunk_tags_nand(dev, chunk, + chunk_data, + NULL); + + oh = (struct yaffs_obj_hdr *)chunk_data; + + in = yaffs_find_by_number(dev, tags.obj_id); + if (in && in->variant_type != oh->type) { + /* This should not happen, but somehow + * Wev'e ended up with an obj_id that has been reused but not yet + * deleted, and worse still it has changed type. Delete the old object. + */ + + yaffs_del_obj(in); + + in = 0; + } + + in = yaffs_find_or_create_by_number(dev, + tags.obj_id, + oh->type); + + if (!in) + alloc_failed = 1; + + if (in && oh->shadows_obj > 0) { + + struct yaffs_shadow_fixer *fixer; + fixer = + kmalloc(sizeof + (struct yaffs_shadow_fixer), + GFP_NOFS); + if (fixer) { + fixer->next = shadow_fixers; + shadow_fixers = fixer; + fixer->obj_id = tags.obj_id; + fixer->shadowed_id = + oh->shadows_obj; + yaffs_trace(YAFFS_TRACE_SCAN, + " Shadow fixer: %d shadows %d", + fixer->obj_id, + fixer->shadowed_id); + + } + + } + + if (in && in->valid) { + /* We have already filled this one. We have a duplicate and need to resolve it. */ + + unsigned existing_serial = in->serial; + unsigned new_serial = + tags.serial_number; + + if (((existing_serial + 1) & 3) == + new_serial) { + /* Use new one - destroy the exisiting one */ + yaffs_chunk_del(dev, + in->hdr_chunk, + 1, __LINE__); + in->valid = 0; + } else { + /* Use existing - destroy this one. */ + yaffs_chunk_del(dev, chunk, 1, + __LINE__); + } + } + + if (in && !in->valid && + (tags.obj_id == YAFFS_OBJECTID_ROOT || + tags.obj_id == + YAFFS_OBJECTID_LOSTNFOUND)) { + /* We only load some info, don't fiddle with directory structure */ + in->valid = 1; + in->variant_type = oh->type; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + in->hdr_chunk = chunk; + in->serial = tags.serial_number; + + } else if (in && !in->valid) { + /* we need to load this info */ + + in->valid = 1; + in->variant_type = oh->type; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + in->hdr_chunk = chunk; + in->serial = tags.serial_number; + + yaffs_set_obj_name_from_oh(in, oh); + in->dirty = 0; + + /* directory stuff... 
+ * hook up to parent + */ + + parent = + yaffs_find_or_create_by_number + (dev, oh->parent_obj_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + if (!parent) + alloc_failed = 1; + if (parent && parent->variant_type == + YAFFS_OBJECT_TYPE_UNKNOWN) { + /* Set up as a directory */ + parent->variant_type = + YAFFS_OBJECT_TYPE_DIRECTORY; + INIT_LIST_HEAD(&parent-> + variant.dir_variant.children); + } else if (!parent + || parent->variant_type != + YAFFS_OBJECT_TYPE_DIRECTORY) { + /* Hoosterman, another problem.... + * We're trying to use a non-directory as a directory + */ + + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: attempting to use non-directory as a directory in scan. Put in lost+found." + ); + parent = dev->lost_n_found; + } + + yaffs_add_obj_to_dir(parent, in); + + if (0 && (parent == dev->del_dir || + parent == + dev->unlinked_dir)) { + in->deleted = 1; /* If it is unlinked at start up then it wants deleting */ + dev->n_deleted_files++; + } + /* Note re hardlinks. + * Since we might scan a hardlink before its equivalent object is scanned + * we put them all in a list. + * After scanning is complete, we should have all the objects, so we run through this + * list and fix up all the chains. + */ + + switch (in->variant_type) { + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* Todo got a problem */ + break; + case YAFFS_OBJECT_TYPE_FILE: + if (dev->param. + use_header_file_size) + + in->variant. + file_variant.file_size + = oh->file_size; + + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + in->variant. + hardlink_variant.equiv_id = + oh->equiv_id; + in->hard_links.next = + (struct list_head *) + hard_list; + hard_list = in; + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + in->variant.symlink_variant. + alias = + yaffs_clone_str(oh->alias); + if (!in->variant. + symlink_variant.alias) + alloc_failed = 1; + break; + } + + } + } + } + + if (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + /* If we got this far while scanning, then the block is fully allocated. */ + state = YAFFS_BLOCK_STATE_FULL; + } + + if (state == YAFFS_BLOCK_STATE_ALLOCATING) { + /* If the block was partially allocated then treat it as fully allocated. */ + state = YAFFS_BLOCK_STATE_FULL; + dev->alloc_block = -1; + } + + bi->block_state = state; + + /* Now let's see if it was dirty */ + if (bi->pages_in_use == 0 && + !bi->has_shrink_hdr && + bi->block_state == YAFFS_BLOCK_STATE_FULL) { + yaffs_block_became_dirty(dev, blk); + } + + } + + /* Ok, we've done all the scanning. + * Fix up the hard link chains. + * We should now have scanned all the objects, now it's time to add these + * hardlinks. + */ + + yaffs_link_fixup(dev, hard_list); + + /* Fix up any shadowed objects */ + { + struct yaffs_shadow_fixer *fixer; + struct yaffs_obj *obj; + + while (shadow_fixers) { + fixer = shadow_fixers; + shadow_fixers = fixer->next; + /* Complete the rename transaction by deleting the shadowed object + * then setting the object header to unshadowed. 
+ */ + obj = yaffs_find_by_number(dev, fixer->shadowed_id); + if (obj) + yaffs_del_obj(obj); + + obj = yaffs_find_by_number(dev, fixer->obj_id); + + if (obj) + yaffs_update_oh(obj, NULL, 1, 0, 0, NULL); + + kfree(fixer); + } + } + + yaffs_release_temp_buffer(dev, chunk_data, __LINE__); + + if (alloc_failed) + return YAFFS_FAIL; + + yaffs_trace(YAFFS_TRACE_SCAN, "yaffs1_scan ends"); + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_yaffs1.h b/fs/yaffs2/yaffs_yaffs1.h new file mode 100644 index 00000000..db23e049 --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs1.h @@ -0,0 +1,22 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_YAFFS1_H__ +#define __YAFFS_YAFFS1_H__ + +#include "yaffs_guts.h" +int yaffs1_scan(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_yaffs2.c b/fs/yaffs2/yaffs_yaffs2.c new file mode 100644 index 00000000..33397af7 --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs2.c @@ -0,0 +1,1598 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_guts.h" +#include "yaffs_trace.h" +#include "yaffs_yaffs2.h" +#include "yaffs_checkptrw.h" +#include "yaffs_bitmap.h" +#include "yaffs_nand.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_verify.h" +#include "yaffs_attribs.h" + +/* + * Checkpoints are really no benefit on very small partitions. + * + * To save space on small partitions don't bother with checkpoints unless + * the partition is at least this big. + */ +#define YAFFS_CHECKPOINT_MIN_BLOCKS 60 + +#define YAFFS_SMALL_HOLE_THRESHOLD 4 + +/* + * Oldest Dirty Sequence Number handling. + */ + +/* yaffs_calc_oldest_dirty_seq() + * yaffs2_find_oldest_dirty_seq() + * Calculate the oldest dirty sequence number if we don't know it. + */ +void yaffs_calc_oldest_dirty_seq(struct yaffs_dev *dev) +{ + int i; + unsigned seq; + unsigned block_no = 0; + struct yaffs_block_info *b; + + if (!dev->param.is_yaffs2) + return; + + /* Find the oldest dirty sequence number. */ + seq = dev->seq_number + 1; + b = dev->block_info; + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + if (b->block_state == YAFFS_BLOCK_STATE_FULL && + (b->pages_in_use - b->soft_del_pages) < + dev->param.chunks_per_block && b->seq_number < seq) { + seq = b->seq_number; + block_no = i; + } + b++; + } + + if (block_no) { + dev->oldest_dirty_seq = seq; + dev->oldest_dirty_block = block_no; + } + +} + +void yaffs2_find_oldest_dirty_seq(struct yaffs_dev *dev) +{ + if (!dev->param.is_yaffs2) + return; + + if (!dev->oldest_dirty_seq) + yaffs_calc_oldest_dirty_seq(dev); +} + +/* + * yaffs_clear_oldest_dirty_seq() + * Called when a block is erased or marked bad. (ie. when its seq_number + * becomes invalid). 
If the value matches the oldest then we clear + * dev->oldest_dirty_seq to force its recomputation. + */ +void yaffs2_clear_oldest_dirty_seq(struct yaffs_dev *dev, + struct yaffs_block_info *bi) +{ + + if (!dev->param.is_yaffs2) + return; + + if (!bi || bi->seq_number == dev->oldest_dirty_seq) { + dev->oldest_dirty_seq = 0; + dev->oldest_dirty_block = 0; + } +} + +/* + * yaffs2_update_oldest_dirty_seq() + * Update the oldest dirty sequence number whenever we dirty a block. + * Only do this if the oldest_dirty_seq is actually being tracked. + */ +void yaffs2_update_oldest_dirty_seq(struct yaffs_dev *dev, unsigned block_no, + struct yaffs_block_info *bi) +{ + if (!dev->param.is_yaffs2) + return; + + if (dev->oldest_dirty_seq) { + if (dev->oldest_dirty_seq > bi->seq_number) { + dev->oldest_dirty_seq = bi->seq_number; + dev->oldest_dirty_block = block_no; + } + } +} + +int yaffs_block_ok_for_gc(struct yaffs_dev *dev, struct yaffs_block_info *bi) +{ + + if (!dev->param.is_yaffs2) + return 1; /* disqualification only applies to yaffs2. */ + + if (!bi->has_shrink_hdr) + return 1; /* can gc */ + + yaffs2_find_oldest_dirty_seq(dev); + + /* Can't do gc of this block if there are any blocks older than this one that have + * discarded pages. + */ + return (bi->seq_number <= dev->oldest_dirty_seq); +} + +/* + * yaffs2_find_refresh_block() + * periodically finds the oldest full block by sequence number for refreshing. + * Only for yaffs2. + */ +u32 yaffs2_find_refresh_block(struct yaffs_dev * dev) +{ + u32 b; + + u32 oldest = 0; + u32 oldest_seq = 0; + + struct yaffs_block_info *bi; + + if (!dev->param.is_yaffs2) + return oldest; + + /* + * If refresh period < 10 then refreshing is disabled. + */ + if (dev->param.refresh_period < 10) + return oldest; + + /* + * Fix broken values. + */ + if (dev->refresh_skip > dev->param.refresh_period) + dev->refresh_skip = dev->param.refresh_period; + + if (dev->refresh_skip > 0) + return oldest; + + /* + * Refresh skip is now zero. + * We'll do a refresh this time around.... + * Update the refresh skip and find the oldest block. 
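+ * (Concretely: each time the refresh_skip countdown reaches zero it is
+ * reloaded from param.refresh_period and the FULL block with the lowest
+ * seq_number is picked for refreshing; a refresh_period below 10 disables
+ * refreshing altogether.)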
+ */ + dev->refresh_skip = dev->param.refresh_period; + dev->refresh_count++; + bi = dev->block_info; + for (b = dev->internal_start_block; b <= dev->internal_end_block; b++) { + + if (bi->block_state == YAFFS_BLOCK_STATE_FULL) { + + if (oldest < 1 || bi->seq_number < oldest_seq) { + oldest = b; + oldest_seq = bi->seq_number; + } + } + bi++; + } + + if (oldest > 0) { + yaffs_trace(YAFFS_TRACE_GC, + "GC refresh count %d selected block %d with seq_number %d", + dev->refresh_count, oldest, oldest_seq); + } + + return oldest; +} + +int yaffs2_checkpt_required(struct yaffs_dev *dev) +{ + int nblocks; + + if (!dev->param.is_yaffs2) + return 0; + + nblocks = dev->internal_end_block - dev->internal_start_block + 1; + + return !dev->param.skip_checkpt_wr && + !dev->read_only && (nblocks >= YAFFS_CHECKPOINT_MIN_BLOCKS); +} + +int yaffs_calc_checkpt_blocks_required(struct yaffs_dev *dev) +{ + int retval; + + if (!dev->param.is_yaffs2) + return 0; + + if (!dev->checkpoint_blocks_required && yaffs2_checkpt_required(dev)) { + /* Not a valid value so recalculate */ + int n_bytes = 0; + int n_blocks; + int dev_blocks = + (dev->param.end_block - dev->param.start_block + 1); + + n_bytes += sizeof(struct yaffs_checkpt_validity); + n_bytes += sizeof(struct yaffs_checkpt_dev); + n_bytes += dev_blocks * sizeof(struct yaffs_block_info); + n_bytes += dev_blocks * dev->chunk_bit_stride; + n_bytes += + (sizeof(struct yaffs_checkpt_obj) + + sizeof(u32)) * (dev->n_obj); + n_bytes += (dev->tnode_size + sizeof(u32)) * (dev->n_tnodes); + n_bytes += sizeof(struct yaffs_checkpt_validity); + n_bytes += sizeof(u32); /* checksum */ + + /* Round up and add 2 blocks to allow for some bad blocks, so add 3 */ + + n_blocks = + (n_bytes / + (dev->data_bytes_per_chunk * + dev->param.chunks_per_block)) + 3; + + dev->checkpoint_blocks_required = n_blocks; + } + + retval = dev->checkpoint_blocks_required - dev->blocks_in_checkpt; + if (retval < 0) + retval = 0; + return retval; +} + +/*--------------------- Checkpointing --------------------*/ + +static int yaffs2_wr_checkpt_validity_marker(struct yaffs_dev *dev, int head) +{ + struct yaffs_checkpt_validity cp; + + memset(&cp, 0, sizeof(cp)); + + cp.struct_type = sizeof(cp); + cp.magic = YAFFS_MAGIC; + cp.version = YAFFS_CHECKPOINT_VERSION; + cp.head = (head) ? 1 : 0; + + return (yaffs2_checkpt_wr(dev, &cp, sizeof(cp)) == sizeof(cp)) ? 1 : 0; +} + +static int yaffs2_rd_checkpt_validity_marker(struct yaffs_dev *dev, int head) +{ + struct yaffs_checkpt_validity cp; + int ok; + + ok = (yaffs2_checkpt_rd(dev, &cp, sizeof(cp)) == sizeof(cp)); + + if (ok) + ok = (cp.struct_type == sizeof(cp)) && + (cp.magic == YAFFS_MAGIC) && + (cp.version == YAFFS_CHECKPOINT_VERSION) && + (cp.head == ((head) ? 1 : 0)); + return ok ? 
1 : 0; +} + +static void yaffs2_dev_to_checkpt_dev(struct yaffs_checkpt_dev *cp, + struct yaffs_dev *dev) +{ + cp->n_erased_blocks = dev->n_erased_blocks; + cp->alloc_block = dev->alloc_block; + cp->alloc_page = dev->alloc_page; + cp->n_free_chunks = dev->n_free_chunks; + + cp->n_deleted_files = dev->n_deleted_files; + cp->n_unlinked_files = dev->n_unlinked_files; + cp->n_bg_deletions = dev->n_bg_deletions; + cp->seq_number = dev->seq_number; + +} + +static void yaffs_checkpt_dev_to_dev(struct yaffs_dev *dev, + struct yaffs_checkpt_dev *cp) +{ + dev->n_erased_blocks = cp->n_erased_blocks; + dev->alloc_block = cp->alloc_block; + dev->alloc_page = cp->alloc_page; + dev->n_free_chunks = cp->n_free_chunks; + + dev->n_deleted_files = cp->n_deleted_files; + dev->n_unlinked_files = cp->n_unlinked_files; + dev->n_bg_deletions = cp->n_bg_deletions; + dev->seq_number = cp->seq_number; +} + +static int yaffs2_wr_checkpt_dev(struct yaffs_dev *dev) +{ + struct yaffs_checkpt_dev cp; + u32 n_bytes; + u32 n_blocks = + (dev->internal_end_block - dev->internal_start_block + 1); + + int ok; + + /* Write device runtime values */ + yaffs2_dev_to_checkpt_dev(&cp, dev); + cp.struct_type = sizeof(cp); + + ok = (yaffs2_checkpt_wr(dev, &cp, sizeof(cp)) == sizeof(cp)); + + /* Write block info */ + if (ok) { + n_bytes = n_blocks * sizeof(struct yaffs_block_info); + ok = (yaffs2_checkpt_wr(dev, dev->block_info, n_bytes) == + n_bytes); + } + + /* Write chunk bits */ + if (ok) { + n_bytes = n_blocks * dev->chunk_bit_stride; + ok = (yaffs2_checkpt_wr(dev, dev->chunk_bits, n_bytes) == + n_bytes); + } + return ok ? 1 : 0; + +} + +static int yaffs2_rd_checkpt_dev(struct yaffs_dev *dev) +{ + struct yaffs_checkpt_dev cp; + u32 n_bytes; + u32 n_blocks = + (dev->internal_end_block - dev->internal_start_block + 1); + + int ok; + + ok = (yaffs2_checkpt_rd(dev, &cp, sizeof(cp)) == sizeof(cp)); + if (!ok) + return 0; + + if (cp.struct_type != sizeof(cp)) + return 0; + + yaffs_checkpt_dev_to_dev(dev, &cp); + + n_bytes = n_blocks * sizeof(struct yaffs_block_info); + + ok = (yaffs2_checkpt_rd(dev, dev->block_info, n_bytes) == n_bytes); + + if (!ok) + return 0; + n_bytes = n_blocks * dev->chunk_bit_stride; + + ok = (yaffs2_checkpt_rd(dev, dev->chunk_bits, n_bytes) == n_bytes); + + return ok ? 1 : 0; +} + +static void yaffs2_obj_checkpt_obj(struct yaffs_checkpt_obj *cp, + struct yaffs_obj *obj) +{ + + cp->obj_id = obj->obj_id; + cp->parent_id = (obj->parent) ? 
obj->parent->obj_id : 0; + cp->hdr_chunk = obj->hdr_chunk; + cp->variant_type = obj->variant_type; + cp->deleted = obj->deleted; + cp->soft_del = obj->soft_del; + cp->unlinked = obj->unlinked; + cp->fake = obj->fake; + cp->rename_allowed = obj->rename_allowed; + cp->unlink_allowed = obj->unlink_allowed; + cp->serial = obj->serial; + cp->n_data_chunks = obj->n_data_chunks; + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) + cp->size_or_equiv_obj = obj->variant.file_variant.file_size; + else if (obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) + cp->size_or_equiv_obj = obj->variant.hardlink_variant.equiv_id; +} + +static int taffs2_checkpt_obj_to_obj(struct yaffs_obj *obj, + struct yaffs_checkpt_obj *cp) +{ + + struct yaffs_obj *parent; + + if (obj->variant_type != cp->variant_type) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Checkpoint read object %d type %d chunk %d does not match existing object type %d", + cp->obj_id, cp->variant_type, cp->hdr_chunk, + obj->variant_type); + return 0; + } + + obj->obj_id = cp->obj_id; + + if (cp->parent_id) + parent = yaffs_find_or_create_by_number(obj->my_dev, + cp->parent_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + else + parent = NULL; + + if (parent) { + if (parent->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Checkpoint read object %d parent %d type %d chunk %d Parent type, %d, not directory", + cp->obj_id, cp->parent_id, + cp->variant_type, cp->hdr_chunk, + parent->variant_type); + return 0; + } + yaffs_add_obj_to_dir(parent, obj); + } + + obj->hdr_chunk = cp->hdr_chunk; + obj->variant_type = cp->variant_type; + obj->deleted = cp->deleted; + obj->soft_del = cp->soft_del; + obj->unlinked = cp->unlinked; + obj->fake = cp->fake; + obj->rename_allowed = cp->rename_allowed; + obj->unlink_allowed = cp->unlink_allowed; + obj->serial = cp->serial; + obj->n_data_chunks = cp->n_data_chunks; + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) + obj->variant.file_variant.file_size = cp->size_or_equiv_obj; + else if (obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) + obj->variant.hardlink_variant.equiv_id = cp->size_or_equiv_obj; + + if (obj->hdr_chunk > 0) + obj->lazy_loaded = 1; + return 1; +} + +static int yaffs2_checkpt_tnode_worker(struct yaffs_obj *in, + struct yaffs_tnode *tn, u32 level, + int chunk_offset) +{ + int i; + struct yaffs_dev *dev = in->my_dev; + int ok = 1; + + if (tn) { + if (level > 0) { + + for (i = 0; i < YAFFS_NTNODES_INTERNAL && ok; i++) { + if (tn->internal[i]) { + ok = yaffs2_checkpt_tnode_worker(in, + tn-> + internal + [i], + level - + 1, + (chunk_offset + << + YAFFS_TNODES_INTERNAL_BITS) + + i); + } + } + } else if (level == 0) { + u32 base_offset = + chunk_offset << YAFFS_TNODES_LEVEL0_BITS; + ok = (yaffs2_checkpt_wr + (dev, &base_offset, + sizeof(base_offset)) == sizeof(base_offset)); + if (ok) + ok = (yaffs2_checkpt_wr + (dev, tn, + dev->tnode_size) == dev->tnode_size); + } + } + + return ok; + +} + +static int yaffs2_wr_checkpt_tnodes(struct yaffs_obj *obj) +{ + u32 end_marker = ~0; + int ok = 1; + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) { + ok = yaffs2_checkpt_tnode_worker(obj, + obj->variant.file_variant.top, + obj->variant.file_variant. + top_level, 0); + if (ok) + ok = (yaffs2_checkpt_wr + (obj->my_dev, &end_marker, + sizeof(end_marker)) == sizeof(end_marker)); + } + + return ok ? 
1 : 0; +} + +static int yaffs2_rd_checkpt_tnodes(struct yaffs_obj *obj) +{ + u32 base_chunk; + int ok = 1; + struct yaffs_dev *dev = obj->my_dev; + struct yaffs_file_var *file_stuct_ptr = &obj->variant.file_variant; + struct yaffs_tnode *tn; + int nread = 0; + + ok = (yaffs2_checkpt_rd(dev, &base_chunk, sizeof(base_chunk)) == + sizeof(base_chunk)); + + while (ok && (~base_chunk)) { + nread++; + /* Read level 0 tnode */ + + tn = yaffs_get_tnode(dev); + if (tn) { + ok = (yaffs2_checkpt_rd(dev, tn, dev->tnode_size) == + dev->tnode_size); + } else { + ok = 0; + } + + if (tn && ok) + ok = yaffs_add_find_tnode_0(dev, + file_stuct_ptr, + base_chunk, tn) ? 1 : 0; + + if (ok) + ok = (yaffs2_checkpt_rd + (dev, &base_chunk, + sizeof(base_chunk)) == sizeof(base_chunk)); + + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "Checkpoint read tnodes %d records, last %d. ok %d", + nread, base_chunk, ok); + + return ok ? 1 : 0; +} + +static int yaffs2_wr_checkpt_objs(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + struct yaffs_checkpt_obj cp; + int i; + int ok = 1; + struct list_head *lh; + + /* Iterate through the objects in each hash entry, + * dumping them to the checkpointing stream. + */ + + for (i = 0; ok && i < YAFFS_NOBJECT_BUCKETS; i++) { + list_for_each(lh, &dev->obj_bucket[i].list) { + if (lh) { + obj = + list_entry(lh, struct yaffs_obj, hash_link); + if (!obj->defered_free) { + yaffs2_obj_checkpt_obj(&cp, obj); + cp.struct_type = sizeof(cp); + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "Checkpoint write object %d parent %d type %d chunk %d obj addr %p", + cp.obj_id, cp.parent_id, + cp.variant_type, cp.hdr_chunk, obj); + + ok = (yaffs2_checkpt_wr + (dev, &cp, + sizeof(cp)) == sizeof(cp)); + + if (ok + && obj->variant_type == + YAFFS_OBJECT_TYPE_FILE) + ok = yaffs2_wr_checkpt_tnodes + (obj); + } + } + } + } + + /* Dump end of list */ + memset(&cp, 0xFF, sizeof(struct yaffs_checkpt_obj)); + cp.struct_type = sizeof(cp); + + if (ok) + ok = (yaffs2_checkpt_wr(dev, &cp, sizeof(cp)) == sizeof(cp)); + + return ok ? 1 : 0; +} + +static int yaffs2_rd_checkpt_objs(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + struct yaffs_checkpt_obj cp; + int ok = 1; + int done = 0; + struct yaffs_obj *hard_list = NULL; + + while (ok && !done) { + ok = (yaffs2_checkpt_rd(dev, &cp, sizeof(cp)) == sizeof(cp)); + if (cp.struct_type != sizeof(cp)) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "struct size %d instead of %d ok %d", + cp.struct_type, (int)sizeof(cp), ok); + ok = 0; + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "Checkpoint read object %d parent %d type %d chunk %d ", + cp.obj_id, cp.parent_id, cp.variant_type, + cp.hdr_chunk); + + if (ok && cp.obj_id == ~0) { + done = 1; + } else if (ok) { + obj = + yaffs_find_or_create_by_number(dev, cp.obj_id, + cp.variant_type); + if (obj) { + ok = taffs2_checkpt_obj_to_obj(obj, &cp); + if (!ok) + break; + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) { + ok = yaffs2_rd_checkpt_tnodes(obj); + } else if (obj->variant_type == + YAFFS_OBJECT_TYPE_HARDLINK) { + obj->hard_links.next = + (struct list_head *)hard_list; + hard_list = obj; + } + } else { + ok = 0; + } + } + } + + if (ok) + yaffs_link_fixup(dev, hard_list); + + return ok ? 
1 : 0; +} + +static int yaffs2_wr_checkpt_sum(struct yaffs_dev *dev) +{ + u32 checkpt_sum; + int ok; + + yaffs2_get_checkpt_sum(dev, &checkpt_sum); + + ok = (yaffs2_checkpt_wr(dev, &checkpt_sum, sizeof(checkpt_sum)) == + sizeof(checkpt_sum)); + + if (!ok) + return 0; + + return 1; +} + +static int yaffs2_rd_checkpt_sum(struct yaffs_dev *dev) +{ + u32 checkpt_sum0; + u32 checkpt_sum1; + int ok; + + yaffs2_get_checkpt_sum(dev, &checkpt_sum0); + + ok = (yaffs2_checkpt_rd(dev, &checkpt_sum1, sizeof(checkpt_sum1)) == + sizeof(checkpt_sum1)); + + if (!ok) + return 0; + + if (checkpt_sum0 != checkpt_sum1) + return 0; + + return 1; +} + +static int yaffs2_wr_checkpt_data(struct yaffs_dev *dev) +{ + int ok = 1; + + if (!yaffs2_checkpt_required(dev)) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "skipping checkpoint write"); + ok = 0; + } + + if (ok) + ok = yaffs2_checkpt_open(dev, 1); + + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint validity"); + ok = yaffs2_wr_checkpt_validity_marker(dev, 1); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint device"); + ok = yaffs2_wr_checkpt_dev(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint objects"); + ok = yaffs2_wr_checkpt_objs(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint validity"); + ok = yaffs2_wr_checkpt_validity_marker(dev, 0); + } + + if (ok) + ok = yaffs2_wr_checkpt_sum(dev); + + if (!yaffs_checkpt_close(dev)) + ok = 0; + + if (ok) + dev->is_checkpointed = 1; + else + dev->is_checkpointed = 0; + + return dev->is_checkpointed; +} + +static int yaffs2_rd_checkpt_data(struct yaffs_dev *dev) +{ + int ok = 1; + + if (!dev->param.is_yaffs2) + ok = 0; + + if (ok && dev->param.skip_checkpt_rd) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "skipping checkpoint read"); + ok = 0; + } + + if (ok) + ok = yaffs2_checkpt_open(dev, 0); /* open for read */ + + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint validity"); + ok = yaffs2_rd_checkpt_validity_marker(dev, 1); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint device"); + ok = yaffs2_rd_checkpt_dev(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint objects"); + ok = yaffs2_rd_checkpt_objs(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint validity"); + ok = yaffs2_rd_checkpt_validity_marker(dev, 0); + } + + if (ok) { + ok = yaffs2_rd_checkpt_sum(dev); + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint checksum %d", ok); + } + + if (!yaffs_checkpt_close(dev)) + ok = 0; + + if (ok) + dev->is_checkpointed = 1; + else + dev->is_checkpointed = 0; + + return ok ? 
1 : 0; + +} + +void yaffs2_checkpt_invalidate(struct yaffs_dev *dev) +{ + if (dev->is_checkpointed || dev->blocks_in_checkpt > 0) { + dev->is_checkpointed = 0; + yaffs2_checkpt_invalidate_stream(dev); + } + if (dev->param.sb_dirty_fn) + dev->param.sb_dirty_fn(dev); +} + +int yaffs_checkpoint_save(struct yaffs_dev *dev) +{ + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "save entry: is_checkpointed %d", + dev->is_checkpointed); + + yaffs_verify_objects(dev); + yaffs_verify_blocks(dev); + yaffs_verify_free_chunks(dev); + + if (!dev->is_checkpointed) { + yaffs2_checkpt_invalidate(dev); + yaffs2_wr_checkpt_data(dev); + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT | YAFFS_TRACE_MOUNT, + "save exit: is_checkpointed %d", + dev->is_checkpointed); + + return dev->is_checkpointed; +} + +int yaffs2_checkpt_restore(struct yaffs_dev *dev) +{ + int retval; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "restore entry: is_checkpointed %d", + dev->is_checkpointed); + + retval = yaffs2_rd_checkpt_data(dev); + + if (dev->is_checkpointed) { + yaffs_verify_objects(dev); + yaffs_verify_blocks(dev); + yaffs_verify_free_chunks(dev); + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "restore exit: is_checkpointed %d", + dev->is_checkpointed); + + return retval; +} + +int yaffs2_handle_hole(struct yaffs_obj *obj, loff_t new_size) +{ + /* if new_size > old_file_size. + * We're going to be writing a hole. + * If the hole is small then write zeros otherwise write a start of hole marker. + */ + + loff_t old_file_size; + int increase; + int small_hole; + int result = YAFFS_OK; + struct yaffs_dev *dev = NULL; + + u8 *local_buffer = NULL; + + int small_increase_ok = 0; + + if (!obj) + return YAFFS_FAIL; + + if (obj->variant_type != YAFFS_OBJECT_TYPE_FILE) + return YAFFS_FAIL; + + dev = obj->my_dev; + + /* Bail out if not yaffs2 mode */ + if (!dev->param.is_yaffs2) + return YAFFS_OK; + + old_file_size = obj->variant.file_variant.file_size; + + if (new_size <= old_file_size) + return YAFFS_OK; + + increase = new_size - old_file_size; + + if (increase < YAFFS_SMALL_HOLE_THRESHOLD * dev->data_bytes_per_chunk && + yaffs_check_alloc_available(dev, YAFFS_SMALL_HOLE_THRESHOLD + 1)) + small_hole = 1; + else + small_hole = 0; + + if (small_hole) + local_buffer = yaffs_get_temp_buffer(dev, __LINE__); + + if (local_buffer) { + /* fill hole with zero bytes */ + int pos = old_file_size; + int this_write; + int written; + memset(local_buffer, 0, dev->data_bytes_per_chunk); + small_increase_ok = 1; + + while (increase > 0 && small_increase_ok) { + this_write = increase; + if (this_write > dev->data_bytes_per_chunk) + this_write = dev->data_bytes_per_chunk; + written = + yaffs_do_file_wr(obj, local_buffer, pos, this_write, + 0); + if (written == this_write) { + pos += this_write; + increase -= this_write; + } else { + small_increase_ok = 0; + } + } + + yaffs_release_temp_buffer(dev, local_buffer, __LINE__); + + /* If we were out of space then reverse any chunks we've added */ + if (!small_increase_ok) + yaffs_resize_file_down(obj, old_file_size); + } + + if (!small_increase_ok && + obj->parent && + obj->parent->obj_id != YAFFS_OBJECTID_UNLINKED && + obj->parent->obj_id != YAFFS_OBJECTID_DELETED) { + /* Write a hole start header with the old file size */ + yaffs_update_oh(obj, NULL, 0, 1, 0, NULL); + } + + return result; + +} + +struct yaffs_block_index { + int seq; + int block; +}; + +static int yaffs2_ybicmp(const void *a, const void *b) +{ + int aseq = ((struct yaffs_block_index *)a)->seq; + int bseq = ((struct yaffs_block_index *)b)->seq; + int ablock = 
((struct yaffs_block_index *)a)->block; + int bblock = ((struct yaffs_block_index *)b)->block; + if (aseq == bseq) + return ablock - bblock; + else + return aseq - bseq; +} + +int yaffs2_scan_backwards(struct yaffs_dev *dev) +{ + struct yaffs_ext_tags tags; + int blk; + int block_iter; + int start_iter; + int end_iter; + int n_to_scan = 0; + + int chunk; + int result; + int c; + int deleted; + enum yaffs_block_state state; + struct yaffs_obj *hard_list = NULL; + struct yaffs_block_info *bi; + u32 seq_number; + struct yaffs_obj_hdr *oh; + struct yaffs_obj *in; + struct yaffs_obj *parent; + int n_blocks = dev->internal_end_block - dev->internal_start_block + 1; + int is_unlinked; + u8 *chunk_data; + + int file_size; + int is_shrink; + int found_chunks; + int equiv_id; + int alloc_failed = 0; + + struct yaffs_block_index *block_index = NULL; + int alt_block_index = 0; + + yaffs_trace(YAFFS_TRACE_SCAN, + "yaffs2_scan_backwards starts intstartblk %d intendblk %d...", + dev->internal_start_block, dev->internal_end_block); + + dev->seq_number = YAFFS_LOWEST_SEQUENCE_NUMBER; + + block_index = kmalloc(n_blocks * sizeof(struct yaffs_block_index), + GFP_NOFS); + + if (!block_index) { + block_index = + vmalloc(n_blocks * sizeof(struct yaffs_block_index)); + alt_block_index = 1; + } + + if (!block_index) { + yaffs_trace(YAFFS_TRACE_SCAN, + "yaffs2_scan_backwards() could not allocate block index!" + ); + return YAFFS_FAIL; + } + + dev->blocks_in_checkpt = 0; + + chunk_data = yaffs_get_temp_buffer(dev, __LINE__); + + /* Scan all the blocks to determine their state */ + bi = dev->block_info; + for (blk = dev->internal_start_block; blk <= dev->internal_end_block; + blk++) { + yaffs_clear_chunk_bits(dev, blk); + bi->pages_in_use = 0; + bi->soft_del_pages = 0; + + yaffs_query_init_block_state(dev, blk, &state, &seq_number); + + bi->block_state = state; + bi->seq_number = seq_number; + + if (bi->seq_number == YAFFS_SEQUENCE_CHECKPOINT_DATA) + bi->block_state = state = YAFFS_BLOCK_STATE_CHECKPOINT; + if (bi->seq_number == YAFFS_SEQUENCE_BAD_BLOCK) + bi->block_state = state = YAFFS_BLOCK_STATE_DEAD; + + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, + "Block scanning block %d state %d seq %d", + blk, state, seq_number); + + if (state == YAFFS_BLOCK_STATE_CHECKPOINT) { + dev->blocks_in_checkpt++; + + } else if (state == YAFFS_BLOCK_STATE_DEAD) { + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "block %d is bad", blk); + } else if (state == YAFFS_BLOCK_STATE_EMPTY) { + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, "Block empty "); + dev->n_erased_blocks++; + dev->n_free_chunks += dev->param.chunks_per_block; + } else if (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + + /* Determine the highest sequence number */ + if (seq_number >= YAFFS_LOWEST_SEQUENCE_NUMBER && + seq_number < YAFFS_HIGHEST_SEQUENCE_NUMBER) { + + block_index[n_to_scan].seq = seq_number; + block_index[n_to_scan].block = blk; + + n_to_scan++; + + if (seq_number >= dev->seq_number) + dev->seq_number = seq_number; + } else { + /* TODO: Nasty sequence number! */ + yaffs_trace(YAFFS_TRACE_SCAN, + "Block scanning block %d has bad sequence number %d", + blk, seq_number); + + } + } + bi++; + } + + yaffs_trace(YAFFS_TRACE_SCAN, "%d blocks to be sorted...", n_to_scan); + + cond_resched(); + + /* Sort the blocks by sequence number */ + sort(block_index, n_to_scan, sizeof(struct yaffs_block_index), + yaffs2_ybicmp, NULL); + + cond_resched(); + + yaffs_trace(YAFFS_TRACE_SCAN, "...done"); + + /* Now scan the blocks looking at the data. 
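+	 * (The sorted index is walked from the highest sequence number down, and
+	 * the chunks inside each block are read from the last chunk backwards, so
+	 * the newest copy of an object header or data chunk is always met first
+	 * and any stale copy met later can simply be discarded.)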
*/ + start_iter = 0; + end_iter = n_to_scan - 1; + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, "%d blocks to scan", n_to_scan); + + /* For each block.... backwards */ + for (block_iter = end_iter; !alloc_failed && block_iter >= start_iter; + block_iter--) { + /* Cooperative multitasking! This loop can run for so + long that watchdog timers expire. */ + cond_resched(); + + /* get the block to scan in the correct order */ + blk = block_index[block_iter].block; + + bi = yaffs_get_block_info(dev, blk); + + state = bi->block_state; + + deleted = 0; + + /* For each chunk in each block that needs scanning.... */ + found_chunks = 0; + for (c = dev->param.chunks_per_block - 1; + !alloc_failed && c >= 0 && + (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING || + state == YAFFS_BLOCK_STATE_ALLOCATING); c--) { + /* Scan backwards... + * Read the tags and decide what to do + */ + + chunk = blk * dev->param.chunks_per_block + c; + + result = yaffs_rd_chunk_tags_nand(dev, chunk, NULL, + &tags); + + /* Let's have a good look at this chunk... */ + + if (!tags.chunk_used) { + /* An unassigned chunk in the block. + * If there are used chunks after this one, then + * it is a chunk that was skipped due to failing the erased + * check. Just skip it so that it can be deleted. + * But, more typically, We get here when this is an unallocated + * chunk and his means that either the block is empty or + * this is the one being allocated from + */ + + if (found_chunks) { + /* This is a chunk that was skipped due to failing the erased check */ + } else if (c == 0) { + /* We're looking at the first chunk in the block so the block is unused */ + state = YAFFS_BLOCK_STATE_EMPTY; + dev->n_erased_blocks++; + } else { + if (state == + YAFFS_BLOCK_STATE_NEEDS_SCANNING + || state == + YAFFS_BLOCK_STATE_ALLOCATING) { + if (dev->seq_number == + bi->seq_number) { + /* this is the block being allocated from */ + + yaffs_trace(YAFFS_TRACE_SCAN, + " Allocating from %d %d", + blk, c); + + state = + YAFFS_BLOCK_STATE_ALLOCATING; + dev->alloc_block = blk; + dev->alloc_page = c; + dev-> + alloc_block_finder = + blk; + } else { + /* This is a partially written block that is not + * the current allocation block. + */ + + yaffs_trace(YAFFS_TRACE_SCAN, + "Partially written block %d detected", + blk); + } + } + } + + dev->n_free_chunks++; + + } else if (tags.ecc_result == YAFFS_ECC_RESULT_UNFIXED) { + yaffs_trace(YAFFS_TRACE_SCAN, + " Unfixed ECC in chunk(%d:%d), chunk ignored", + blk, c); + + dev->n_free_chunks++; + + } else if (tags.obj_id > YAFFS_MAX_OBJECT_ID || + tags.chunk_id > YAFFS_MAX_CHUNK_ID || + (tags.chunk_id > 0 + && tags.n_bytes > dev->data_bytes_per_chunk) + || tags.seq_number != bi->seq_number) { + yaffs_trace(YAFFS_TRACE_SCAN, + "Chunk (%d:%d) with bad tags:obj = %d, chunk_id = %d, n_bytes = %d, ignored", + blk, c, tags.obj_id, + tags.chunk_id, tags.n_bytes); + + dev->n_free_chunks++; + + } else if (tags.chunk_id > 0) { + /* chunk_id > 0 so it is a data chunk... 
*/ + unsigned int endpos; + u32 chunk_base = + (tags.chunk_id - + 1) * dev->data_bytes_per_chunk; + + found_chunks = 1; + + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + in = yaffs_find_or_create_by_number(dev, + tags.obj_id, + YAFFS_OBJECT_TYPE_FILE); + if (!in) { + /* Out of memory */ + alloc_failed = 1; + } + + if (in && + in->variant_type == YAFFS_OBJECT_TYPE_FILE + && chunk_base < + in->variant.file_variant.shrink_size) { + /* This has not been invalidated by a resize */ + if (!yaffs_put_chunk_in_file + (in, tags.chunk_id, chunk, -1)) { + alloc_failed = 1; + } + + /* File size is calculated by looking at the data chunks if we have not + * seen an object header yet. Stop this practice once we find an object header. + */ + endpos = chunk_base + tags.n_bytes; + + if (!in->valid && /* have not got an object header yet */ + in->variant.file_variant. + scanned_size < endpos) { + in->variant.file_variant. + scanned_size = endpos; + in->variant.file_variant. + file_size = endpos; + } + + } else if (in) { + /* This chunk has been invalidated by a resize, or a past file deletion + * so delete the chunk*/ + yaffs_chunk_del(dev, chunk, 1, + __LINE__); + + } + } else { + /* chunk_id == 0, so it is an ObjectHeader. + * Thus, we read in the object header and make the object + */ + found_chunks = 1; + + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + oh = NULL; + in = NULL; + + if (tags.extra_available) { + in = yaffs_find_or_create_by_number(dev, + tags. + obj_id, + tags. + extra_obj_type); + if (!in) + alloc_failed = 1; + } + + if (!in || + (!in->valid && dev->param.disable_lazy_load) + || tags.extra_shadows || (!in->valid + && (tags.obj_id == + YAFFS_OBJECTID_ROOT + || tags. + obj_id == + YAFFS_OBJECTID_LOSTNFOUND))) + { + + /* If we don't have valid info then we need to read the chunk + * TODO In future we can probably defer reading the chunk and + * living with invalid data until needed. + */ + + result = yaffs_rd_chunk_tags_nand(dev, + chunk, + chunk_data, + NULL); + + oh = (struct yaffs_obj_hdr *)chunk_data; + + if (dev->param.inband_tags) { + /* Fix up the header if they got corrupted by inband tags */ + oh->shadows_obj = + oh->inband_shadowed_obj_id; + oh->is_shrink = + oh->inband_is_shrink; + } + + if (!in) { + in = yaffs_find_or_create_by_number(dev, tags.obj_id, oh->type); + if (!in) + alloc_failed = 1; + } + + } + + if (!in) { + /* TODO Hoosterman we have a problem! */ + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: Could not make object for object %d at chunk %d during scan", + tags.obj_id, chunk); + continue; + } + + if (in->valid) { + /* We have already filled this one. + * We have a duplicate that will be discarded, but + * we first have to suck out resize info if it is a file. + */ + + if ((in->variant_type == + YAFFS_OBJECT_TYPE_FILE) && ((oh + && + oh-> + type + == + YAFFS_OBJECT_TYPE_FILE) + || + (tags. + extra_available + && + tags. + extra_obj_type + == + YAFFS_OBJECT_TYPE_FILE))) + { + u32 this_size = + (oh) ? oh-> + file_size : + tags.extra_length; + u32 parent_obj_id = + (oh) ? oh->parent_obj_id : + tags.extra_parent_id; + + is_shrink = + (oh) ? oh-> + is_shrink : + tags.extra_is_shrink; + + /* If it is deleted (unlinked at start also means deleted) + * we treat the file size as being zeroed at this point. + */ + if (parent_obj_id == + YAFFS_OBJECTID_DELETED + || parent_obj_id == + YAFFS_OBJECTID_UNLINKED) { + this_size = 0; + is_shrink = 1; + } + + if (is_shrink + && in->variant.file_variant. + shrink_size > this_size) + in->variant. 
+ file_variant. + shrink_size = + this_size; + + if (is_shrink) + bi->has_shrink_hdr = 1; + + } + /* Use existing - destroy this one. */ + yaffs_chunk_del(dev, chunk, 1, + __LINE__); + + } + + if (!in->valid && in->variant_type != + (oh ? oh->type : tags.extra_obj_type)) + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: Bad object type, %d != %d, for object %d at chunk %d during scan", + oh ? + oh->type : tags.extra_obj_type, + in->variant_type, tags.obj_id, + chunk); + + if (!in->valid && + (tags.obj_id == YAFFS_OBJECTID_ROOT || + tags.obj_id == + YAFFS_OBJECTID_LOSTNFOUND)) { + /* We only load some info, don't fiddle with directory structure */ + in->valid = 1; + + if (oh) { + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + in->lazy_loaded = 0; + } else { + in->lazy_loaded = 1; + } + in->hdr_chunk = chunk; + + } else if (!in->valid) { + /* we need to load this info */ + + in->valid = 1; + in->hdr_chunk = chunk; + + if (oh) { + in->variant_type = oh->type; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + + if (oh->shadows_obj > 0) + yaffs_handle_shadowed_obj + (dev, + oh->shadows_obj, + 1); + + yaffs_set_obj_name_from_oh(in, + oh); + parent = + yaffs_find_or_create_by_number + (dev, oh->parent_obj_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + + file_size = oh->file_size; + is_shrink = oh->is_shrink; + equiv_id = oh->equiv_id; + + } else { + in->variant_type = + tags.extra_obj_type; + parent = + yaffs_find_or_create_by_number + (dev, tags.extra_parent_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + file_size = tags.extra_length; + is_shrink = + tags.extra_is_shrink; + equiv_id = tags.extra_equiv_id; + in->lazy_loaded = 1; + + } + in->dirty = 0; + + if (!parent) + alloc_failed = 1; + + /* directory stuff... + * hook up to parent + */ + + if (parent && parent->variant_type == + YAFFS_OBJECT_TYPE_UNKNOWN) { + /* Set up as a directory */ + parent->variant_type = + YAFFS_OBJECT_TYPE_DIRECTORY; + INIT_LIST_HEAD(&parent-> + variant.dir_variant.children); + } else if (!parent + || parent->variant_type != + YAFFS_OBJECT_TYPE_DIRECTORY) { + /* Hoosterman, another problem.... + * We're trying to use a non-directory as a directory + */ + + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: attempting to use non-directory as a directory in scan. Put in lost+found." + ); + parent = dev->lost_n_found; + } + + yaffs_add_obj_to_dir(parent, in); + + is_unlinked = (parent == dev->del_dir) + || (parent == dev->unlinked_dir); + + if (is_shrink) { + /* Mark the block as having a shrink header */ + bi->has_shrink_hdr = 1; + } + + /* Note re hardlinks. + * Since we might scan a hardlink before its equivalent object is scanned + * we put them all in a list. + * After scanning is complete, we should have all the objects, so we run + * through this list and fix up all the chains. + */ + + switch (in->variant_type) { + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* Todo got a problem */ + break; + case YAFFS_OBJECT_TYPE_FILE: + + if (in->variant. + file_variant.scanned_size < + file_size) { + /* This covers the case where the file size is greater + * than where the data is + * This will happen if the file is resized to be larger + * than its current data extents. + */ + in->variant. + file_variant. + file_size = + file_size; + in->variant. + file_variant. + scanned_size = + file_size; + } + + if (in->variant.file_variant. + shrink_size > file_size) + in->variant. + file_variant. + shrink_size = + file_size; + + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + if (!is_unlinked) { + in->variant. + hardlink_variant. 
+ equiv_id = equiv_id; + in->hard_links.next = + (struct list_head *) + hard_list; + hard_list = in; + } + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + if (oh) { + in->variant. + symlink_variant. + alias = + yaffs_clone_str(oh-> + alias); + if (!in->variant. + symlink_variant. + alias) + alloc_failed = + 1; + } + break; + } + + } + + } + + } /* End of scanning for each chunk */ + + if (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + /* If we got this far while scanning, then the block is fully allocated. */ + state = YAFFS_BLOCK_STATE_FULL; + } + + bi->block_state = state; + + /* Now let's see if it was dirty */ + if (bi->pages_in_use == 0 && + !bi->has_shrink_hdr && + bi->block_state == YAFFS_BLOCK_STATE_FULL) { + yaffs_block_became_dirty(dev, blk); + } + + } + + yaffs_skip_rest_of_block(dev); + + if (alt_block_index) + vfree(block_index); + else + kfree(block_index); + + /* Ok, we've done all the scanning. + * Fix up the hard link chains. + * We should now have scanned all the objects, now it's time to add these + * hardlinks. + */ + yaffs_link_fixup(dev, hard_list); + + yaffs_release_temp_buffer(dev, chunk_data, __LINE__); + + if (alloc_failed) + return YAFFS_FAIL; + + yaffs_trace(YAFFS_TRACE_SCAN, "yaffs2_scan_backwards ends"); + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_yaffs2.h b/fs/yaffs2/yaffs_yaffs2.h new file mode 100644 index 00000000..e1a9287f --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs2.h @@ -0,0 +1,39 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_YAFFS2_H__ +#define __YAFFS_YAFFS2_H__ + +#include "yaffs_guts.h" + +void yaffs_calc_oldest_dirty_seq(struct yaffs_dev *dev); +void yaffs2_find_oldest_dirty_seq(struct yaffs_dev *dev); +void yaffs2_clear_oldest_dirty_seq(struct yaffs_dev *dev, + struct yaffs_block_info *bi); +void yaffs2_update_oldest_dirty_seq(struct yaffs_dev *dev, unsigned block_no, + struct yaffs_block_info *bi); +int yaffs_block_ok_for_gc(struct yaffs_dev *dev, struct yaffs_block_info *bi); +u32 yaffs2_find_refresh_block(struct yaffs_dev *dev); +int yaffs2_checkpt_required(struct yaffs_dev *dev); +int yaffs_calc_checkpt_blocks_required(struct yaffs_dev *dev); + +void yaffs2_checkpt_invalidate(struct yaffs_dev *dev); +int yaffs2_checkpt_save(struct yaffs_dev *dev); +int yaffs2_checkpt_restore(struct yaffs_dev *dev); + +int yaffs2_handle_hole(struct yaffs_obj *obj, loff_t new_size); +int yaffs2_scan_backwards(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yportenv.h b/fs/yaffs2/yportenv.h new file mode 100644 index 00000000..81834254 --- /dev/null +++ b/fs/yaffs2/yportenv.h @@ -0,0 +1,70 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YPORTENV_LINUX_H__ +#define __YPORTENV_LINUX_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define YCHAR char +#define YUCHAR unsigned char +#define _Y(x) x + +#define YAFFS_LOSTNFOUND_NAME "lost+found" +#define YAFFS_LOSTNFOUND_PREFIX "obj" + + +#define YAFFS_ROOT_MODE 0755 +#define YAFFS_LOSTNFOUND_MODE 0700 + +#define Y_CURRENT_TIME CURRENT_TIME.tv_sec +#define Y_TIME_CONVERT(x) (x).tv_sec + +#define compile_time_assertion(assertion) \ + ({ int x = __builtin_choose_expr(assertion, 0, (void)0); (void) x; }) + + +#ifndef Y_DUMP_STACK +#define Y_DUMP_STACK() dump_stack() +#endif + +#define yaffs_trace(msk, fmt, ...) do { \ + if(yaffs_trace_mask & (msk)) \ + printk(KERN_DEBUG "yaffs: " fmt "\n", ##__VA_ARGS__); \ +} while(0) + +#ifndef YBUG +#define YBUG() do {\ + yaffs_trace(YAFFS_TRACE_BUG,\ + "bug " __FILE__ " %d",\ + __LINE__);\ + Y_DUMP_STACK();\ +} while (0) +#endif + +#endif -- cgit v1.2.3
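
As a stand-alone illustration (a sketch, not part of the patch above): the /proc/yaffs write handler yaffs_proc_write_trace_options accepts whitespace-separated commands of the form "+name", "-name", "=name", or a number with the same prefixes, where "+" ORs the named bits into yaffs_trace_mask, "-" clears them, "=" replaces the whole mask, and YAFFS_TRACE_ALWAYS is forced back on afterwards. The user-space C sketch below models that grammar; its flag names and bit values are assumptions made up for the example, not the kernel's YAFFS_TRACE_* constants.

/* trace_mask_demo.c - user-space model of the yaffs trace-mask grammar.
 * Sketch only: the mask names and bit values below are made up for the demo.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRACE_ALWAYS 0x40000000u	/* stand-in for YAFFS_TRACE_ALWAYS */

static const struct {
	const char *name;
	unsigned bits;
} flags[] = {
	{"scan", 0x00000010u},	/* stand-in for YAFFS_TRACE_SCAN */
	{"gc", 0x00000100u},	/* stand-in for YAFFS_TRACE_GC   */
	{"all", 0xffffffffu},
	{"none", 0x00000000u},
	{NULL, 0}
};

static unsigned apply_bits(unsigned mask, char op, unsigned bits)
{
	switch (op) {
	case '-':
		return mask & ~bits;
	case '=':
		return bits;
	default:		/* '+' or no prefix: OR the bits in */
		return mask | bits;
	}
}

/* Apply one whitespace-separated command string to the current mask. */
static unsigned apply_trace_cmds(unsigned mask, const char *buf)
{
	while (*buf) {
		char op = ' ';
		char name[41];
		int i, n = 0;

		while (isspace((unsigned char)*buf))
			buf++;
		if (*buf == '+' || *buf == '-' || *buf == '=')
			op = *buf++;

		if (isdigit((unsigned char)*buf)) {
			char *end;
			unsigned bits = (unsigned)strtoul(buf, &end, 0);

			buf = end;
			mask = apply_bits(mask, op, bits);
			continue;
		}

		while ((*buf == '_' || (*buf >= 'a' && *buf <= 'z')) && n < 40)
			name[n++] = *buf++;
		name[n] = '\0';
		if (n == 0)
			break;	/* nothing recognisable left */

		for (i = 0; flags[i].name; i++)
			if (strcmp(name, flags[i].name) == 0) {
				mask = apply_bits(mask, op, flags[i].bits);
				break;
			}
	}
	return mask | TRACE_ALWAYS;	/* "always" can never be switched off */
}

int main(void)
{
	unsigned mask = TRACE_ALWAYS;

	/* e.g. echo "+scan +gc" > /proc/yaffs, later followed by "-gc" */
	mask = apply_trace_cmds(mask, "+scan +gc");
	mask = apply_trace_cmds(mask, "-gc");
	printf("trace mask = 0x%08x\n", mask);
	return 0;
}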